From 6eeb3ec6b62853dab6134c9447c50ef2f8694f10 Mon Sep 17 00:00:00 2001 From: Damian Miralles Date: Wed, 20 Dec 2017 21:01:52 -0700 Subject: [PATCH] kernels: Add AVX support to `32f_x2_divide_32f`,`32f_x2_dot_prod_16i` Adds protokernels for AVX support. Modest speed improvements in some of the kernels, however, it seems to be related to the host architecture being used Gbp-Pq: Name 0017-kernels-Add-AVX-support-to-32f_x2_divide_32f-32f_x2_.patch --- kernels/volk/volk_32f_x2_divide_32f.h | 80 +++++++++++++ kernels/volk/volk_32f_x2_dot_prod_16i.h | 148 ++++++++++++++++++++++++ 2 files changed, 228 insertions(+) diff --git a/kernels/volk/volk_32f_x2_divide_32f.h b/kernels/volk/volk_32f_x2_divide_32f.h index d724173..7cc34ca 100644 --- a/kernels/volk/volk_32f_x2_divide_32f.h +++ b/kernels/volk/volk_32f_x2_divide_32f.h @@ -110,6 +110,42 @@ volk_32f_x2_divide_32f_a_sse(float* cVector, const float* aVector, #endif /* LV_HAVE_SSE */ +#ifdef LV_HAVE_AVX +#include + +static inline void +volk_32f_x2_divide_32f_a_avx(float* cVector, const float* aVector, + const float* bVector, unsigned int num_points) +{ + unsigned int number = 0; + const unsigned int eigthPoints = num_points / 8; + + float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr= bVector; + + __m256 aVal, bVal, cVal; + for(;number < eigthPoints; number++){ + aVal = _mm256_load_ps(aPtr); + bVal = _mm256_load_ps(bPtr); + + cVal = _mm256_div_ps(aVal, bVal); + + _mm256_store_ps(cPtr,cVal); // Store the results back into the C container + + aPtr += 8; + bPtr += 8; + cPtr += 8; + } + + number = eigthPoints * 8; + for(;number < num_points; number++){ + *cPtr++ = (*aPtr++) / (*bPtr++); + } +} +#endif /* LV_HAVE_AVX */ + + #ifdef LV_HAVE_GENERIC static inline void @@ -145,3 +181,47 @@ volk_32f_x2_divide_32f_u_orc(float* cVector, const float* aVector, #endif /* INCLUDED_volk_32f_x2_divide_32f_a_H */ + + +#ifndef INCLUDED_volk_32f_x2_divide_32f_u_H +#define INCLUDED_volk_32f_x2_divide_32f_u_H + +#include +#include + +#ifdef LV_HAVE_AVX +#include + +static inline void +volk_32f_x2_divide_32f_u_avx(float* cVector, const float* aVector, + const float* bVector, unsigned int num_points) +{ + unsigned int number = 0; + const unsigned int eigthPoints = num_points / 8; + + float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr= bVector; + + __m256 aVal, bVal, cVal; + for(;number < eigthPoints; number++){ + aVal = _mm256_loadu_ps(aPtr); + bVal = _mm256_loadu_ps(bPtr); + + cVal = _mm256_div_ps(aVal, bVal); + + _mm256_storeu_ps(cPtr,cVal); // Store the results back into the C container + + aPtr += 8; + bPtr += 8; + cPtr += 8; + } + + number = eigthPoints * 8; + for(;number < num_points; number++){ + *cPtr++ = (*aPtr++) / (*bPtr++); + } +} +#endif /* LV_HAVE_AVX */ + +#endif /* INCLUDED_volk_32f_x2_divide_32f_u_H */ diff --git a/kernels/volk/volk_32f_x2_dot_prod_16i.h b/kernels/volk/volk_32f_x2_dot_prod_16i.h index 15f01b7..a1279cf 100644 --- a/kernels/volk/volk_32f_x2_dot_prod_16i.h +++ b/kernels/volk/volk_32f_x2_dot_prod_16i.h @@ -82,6 +82,154 @@ static inline void volk_32f_x2_dot_prod_16i_generic(int16_t* result, const float #endif /*LV_HAVE_GENERIC*/ +#ifdef LV_HAVE_AVX + +static inline void volk_32f_x2_dot_prod_16i_a_avx(int16_t* result, const float* input, const float* taps, unsigned int num_points) { + + unsigned int number = 0; + const unsigned int thirtySecondPoints = num_points / 32; + + float dotProduct = 0; + const float* aPtr = input; + const float* bPtr = taps; + + __m256 a0Val, a1Val, a2Val, a3Val; + __m256 b0Val, b1Val, b2Val, b3Val; + __m256 c0Val, c1Val, c2Val, c3Val; + + __m256 dotProdVal0 = _mm256_setzero_ps(); + __m256 dotProdVal1 = _mm256_setzero_ps(); + __m256 dotProdVal2 = _mm256_setzero_ps(); + __m256 dotProdVal3 = _mm256_setzero_ps(); + + for(;number < thirtySecondPoints; number++){ + + a0Val = _mm256_load_ps(aPtr); + a1Val = _mm256_load_ps(aPtr+8); + a2Val = _mm256_load_ps(aPtr+16); + a3Val = _mm256_load_ps(aPtr+24); + + b0Val = _mm256_load_ps(bPtr); + b1Val = _mm256_load_ps(bPtr+8); + b2Val = _mm256_load_ps(bPtr+16); + b3Val = _mm256_load_ps(bPtr+24); + + c0Val = _mm256_mul_ps(a0Val, b0Val); + c1Val = _mm256_mul_ps(a1Val, b1Val); + c2Val = _mm256_mul_ps(a2Val, b2Val); + c3Val = _mm256_mul_ps(a3Val, b3Val); + + dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0); + dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1); + dotProdVal2 = _mm256_add_ps(c2Val, dotProdVal2); + dotProdVal3 = _mm256_add_ps(c3Val, dotProdVal3); + + aPtr += 32; + bPtr += 32; + } + + dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1); + dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2); + dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3); + + __VOLK_ATTR_ALIGNED(32) float dotProductVector[8]; + + _mm256_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector + + dotProduct = dotProductVector[0]; + dotProduct += dotProductVector[1]; + dotProduct += dotProductVector[2]; + dotProduct += dotProductVector[3]; + dotProduct += dotProductVector[4]; + dotProduct += dotProductVector[5]; + dotProduct += dotProductVector[6]; + dotProduct += dotProductVector[7]; + + number = thirtySecondPoints*32; + for(;number < num_points; number++){ + dotProduct += ((*aPtr++) * (*bPtr++)); + } + + *result = (short)dotProduct; +} + +#endif /*LV_HAVE_AVX*/ + + +#ifdef LV_HAVE_AVX + +static inline void volk_32f_x2_dot_prod_16i_u_avx(int16_t* result, const float* input, const float* taps, unsigned int num_points) { + + unsigned int number = 0; + const unsigned int thirtySecondPoints = num_points / 32; + + float dotProduct = 0; + const float* aPtr = input; + const float* bPtr = taps; + + __m256 a0Val, a1Val, a2Val, a3Val; + __m256 b0Val, b1Val, b2Val, b3Val; + __m256 c0Val, c1Val, c2Val, c3Val; + + __m256 dotProdVal0 = _mm256_setzero_ps(); + __m256 dotProdVal1 = _mm256_setzero_ps(); + __m256 dotProdVal2 = _mm256_setzero_ps(); + __m256 dotProdVal3 = _mm256_setzero_ps(); + + for(;number < thirtySecondPoints; number++){ + + a0Val = _mm256_loadu_ps(aPtr); + a1Val = _mm256_loadu_ps(aPtr+8); + a2Val = _mm256_loadu_ps(aPtr+16); + a3Val = _mm256_loadu_ps(aPtr+24); + + b0Val = _mm256_loadu_ps(bPtr); + b1Val = _mm256_loadu_ps(bPtr+8); + b2Val = _mm256_loadu_ps(bPtr+16); + b3Val = _mm256_loadu_ps(bPtr+24); + + c0Val = _mm256_mul_ps(a0Val, b0Val); + c1Val = _mm256_mul_ps(a1Val, b1Val); + c2Val = _mm256_mul_ps(a2Val, b2Val); + c3Val = _mm256_mul_ps(a3Val, b3Val); + + dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0); + dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1); + dotProdVal2 = _mm256_add_ps(c2Val, dotProdVal2); + dotProdVal3 = _mm256_add_ps(c3Val, dotProdVal3); + + aPtr += 32; + bPtr += 32; + } + + dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1); + dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2); + dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3); + + __VOLK_ATTR_ALIGNED(32) float dotProductVector[8]; + + _mm256_storeu_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector + + dotProduct = dotProductVector[0]; + dotProduct += dotProductVector[1]; + dotProduct += dotProductVector[2]; + dotProduct += dotProductVector[3]; + dotProduct += dotProductVector[4]; + dotProduct += dotProductVector[5]; + dotProduct += dotProductVector[6]; + dotProduct += dotProductVector[7]; + + number = thirtySecondPoints*32; + for(;number < num_points; number++){ + dotProduct += ((*aPtr++) * (*bPtr++)); + } + + *result = (short)dotProduct; +} + +#endif /*LV_HAVE_AVX*/ + + #ifdef LV_HAVE_SSE static inline void volk_32f_x2_dot_prod_16i_a_sse(int16_t* result, const float* input, const float* taps, unsigned int num_points) { -- 2.30.2