From: Damian Miralles Date: Sat, 16 Dec 2017 06:05:58 +0000 (-0700) Subject: kernels: Adds AVX support to `volk_32f_*` kernels X-Git-Tag: archive/raspbian/1.3-3+rpi1^2~9 X-Git-Url: https://dgit.raspbian.org/?a=commitdiff_plain;h=a5c0e130a5b95333a11f346b9bcfd0d8852d1287;p=volk.git kernels: Adds AVX support to `volk_32f_*` kernels Adds AVX support to `volk_32f_s32f_normalize`,`volk_32f_s32f_stddev_32f`, `volk_32f_sqrt_32f`, `volk_32f_x2_max_32f` and `volk_32f_x2_min_32f`. Some speed improvements can be seen with the new protokernel addition. Gbp-Pq: Name 0016-kernels-Adds-AVX-support-to-volk_32f_-kernels.patch --- diff --git a/kernels/volk/volk_32f_s32f_normalize.h b/kernels/volk/volk_32f_s32f_normalize.h index 52bf006..17d9da9 100644 --- a/kernels/volk/volk_32f_s32f_normalize.h +++ b/kernels/volk/volk_32f_s32f_normalize.h @@ -105,6 +105,39 @@ static inline void volk_32f_s32f_normalize_a_sse(float* vecBuffer, const float s } #endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_AVX +#include + +static inline void volk_32f_s32f_normalize_a_avx(float* vecBuffer, const float scalar, unsigned int num_points){ + unsigned int number = 0; + float* inputPtr = vecBuffer; + + const float invScalar = 1.0 / scalar; + __m256 vecScalar = _mm256_set1_ps(invScalar); + __m256 input1; + + const uint64_t eigthPoints = num_points / 8; + for(;number < eigthPoints; number++){ + + input1 = _mm256_load_ps(inputPtr); + + input1 = _mm256_mul_ps(input1, vecScalar); + + _mm256_store_ps(inputPtr, input1); + + inputPtr += 8; + } + + number = eigthPoints*8; + for(; number < num_points; number++){ + *inputPtr *= invScalar; + inputPtr++; + } +} +#endif /* LV_HAVE_AVX */ + + #ifdef LV_HAVE_GENERIC static inline void volk_32f_s32f_normalize_generic(float* vecBuffer, const float scalar, unsigned int num_points){ @@ -128,6 +161,45 @@ static inline void volk_32f_s32f_normalize_u_orc(float* vecBuffer, const float s #endif /* LV_HAVE_GENERIC */ +#endif /* INCLUDED_volk_32f_s32f_normalize_a_H */ -#endif /* INCLUDED_volk_32f_s32f_normalize_a_H */ +#ifndef INCLUDED_volk_32f_s32f_normalize_u_H +#define INCLUDED_volk_32f_s32f_normalize_u_H + +#include +#include + +#ifdef LV_HAVE_AVX +#include + +static inline void volk_32f_s32f_normalize_u_avx(float* vecBuffer, const float scalar, unsigned int num_points){ + unsigned int number = 0; + float* inputPtr = vecBuffer; + + const float invScalar = 1.0 / scalar; + __m256 vecScalar = _mm256_set1_ps(invScalar); + __m256 input1; + + const uint64_t eigthPoints = num_points / 8; + for(;number < eigthPoints; number++){ + + input1 = _mm256_loadu_ps(inputPtr); + + input1 = _mm256_mul_ps(input1, vecScalar); + + _mm256_storeu_ps(inputPtr, input1); + + inputPtr += 8; + } + + number = eigthPoints*8; + for(; number < num_points; number++){ + *inputPtr *= invScalar; + inputPtr++; + } +} +#endif /* LV_HAVE_AVX */ + + +#endif /* INCLUDED_volk_32f_s32f_normalize_u_H */ diff --git a/kernels/volk/volk_32f_s32f_stddev_32f.h b/kernels/volk/volk_32f_s32f_stddev_32f.h index 30f0ed6..f97a783 100644 --- a/kernels/volk/volk_32f_s32f_stddev_32f.h +++ b/kernels/volk/volk_32f_s32f_stddev_32f.h @@ -132,6 +132,65 @@ volk_32f_s32f_stddev_32f_a_sse4_1(float* stddev, const float* inputBuffer, #endif /* LV_HAVE_SSE4_1 */ +#ifdef LV_HAVE_AVX +#include + +static inline void +volk_32f_s32f_stddev_32f_a_avx(float* stddev, const float* inputBuffer, + const float mean, unsigned int num_points) +{ + float returnValue = 0; + if(num_points > 0){ + unsigned int number = 0; + const unsigned int thirtySecondPoints = num_points / 32; + + const float* aPtr = inputBuffer; + + __VOLK_ATTR_ALIGNED(32) float squareBuffer[8]; + + __m256 squareAccumulator = _mm256_setzero_ps(); + __m256 aVal1, aVal2, aVal3, aVal4; + __m256 cVal1, cVal2, cVal3, cVal4; + for(;number < thirtySecondPoints; number++) { + aVal1 = _mm256_load_ps(aPtr); aPtr += 8; + cVal1 = _mm256_dp_ps(aVal1, aVal1, 0xF1); + + aVal2 = _mm256_load_ps(aPtr); aPtr += 8; + cVal2 = _mm256_dp_ps(aVal2, aVal2, 0xF2); + + aVal3 = _mm256_load_ps(aPtr); aPtr += 8; + cVal3 = _mm256_dp_ps(aVal3, aVal3, 0xF4); + + aVal4 = _mm256_load_ps(aPtr); aPtr += 8; + cVal4 = _mm256_dp_ps(aVal4, aVal4, 0xF8); + + cVal1 = _mm256_or_ps(cVal1, cVal2); + cVal3 = _mm256_or_ps(cVal3, cVal4); + cVal1 = _mm256_or_ps(cVal1, cVal3); + + squareAccumulator = _mm256_add_ps(squareAccumulator, cVal1); // squareAccumulator += x^2 + } + _mm256_store_ps(squareBuffer,squareAccumulator); // Store the results back into the C container + returnValue = squareBuffer[0]; returnValue += squareBuffer[1]; + returnValue += squareBuffer[2]; returnValue += squareBuffer[3]; + returnValue += squareBuffer[4]; returnValue += squareBuffer[5]; + returnValue += squareBuffer[6]; returnValue += squareBuffer[7]; + + number = thirtySecondPoints * 32; + for(;number < num_points; number++){ + returnValue += (*aPtr) * (*aPtr); + aPtr++; + } + returnValue /= num_points; + returnValue -= (mean * mean); + returnValue = sqrtf(returnValue); + } + *stddev = returnValue; +} + +#endif /* LV_HAVE_AVX */ + + #ifdef LV_HAVE_SSE #include diff --git a/kernels/volk/volk_32f_sqrt_32f.h b/kernels/volk/volk_32f_sqrt_32f.h index a5851a0..174f8e3 100644 --- a/kernels/volk/volk_32f_sqrt_32f.h +++ b/kernels/volk/volk_32f_sqrt_32f.h @@ -102,6 +102,39 @@ volk_32f_sqrt_32f_a_sse(float* cVector, const float* aVector, unsigned int num_p #endif /* LV_HAVE_SSE */ +#ifdef LV_HAVE_AVX +#include + +static inline void +volk_32f_sqrt_32f_a_avx(float* cVector, const float* aVector, unsigned int num_points) +{ + unsigned int number = 0; + const unsigned int eigthPoints = num_points / 8; + + float* cPtr = cVector; + const float* aPtr = aVector; + + __m256 aVal, cVal; + for(;number < eigthPoints; number++) { + aVal = _mm256_load_ps(aPtr); + + cVal = _mm256_sqrt_ps(aVal); + + _mm256_store_ps(cPtr,cVal); // Store the results back into the C container + + aPtr += 8; + cPtr += 8; + } + + number = eigthPoints * 8; + for(;number < num_points; number++) { + *cPtr++ = sqrtf(*aPtr++); + } +} + +#endif /* LV_HAVE_AVX */ + + #ifdef LV_HAVE_NEON #include diff --git a/kernels/volk/volk_32f_x2_max_32f.h b/kernels/volk/volk_32f_x2_max_32f.h index 14747c2..1dc0f7d 100644 --- a/kernels/volk/volk_32f_x2_max_32f.h +++ b/kernels/volk/volk_32f_x2_max_32f.h @@ -112,6 +112,44 @@ volk_32f_x2_max_32f_a_sse(float* cVector, const float* aVector, #endif /* LV_HAVE_SSE */ +#ifdef LV_HAVE_AVX +#include + +static inline void +volk_32f_x2_max_32f_a_avx(float* cVector, const float* aVector, + const float* bVector, unsigned int num_points) +{ + unsigned int number = 0; + const unsigned int eigthPoints = num_points / 8; + + float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr= bVector; + + __m256 aVal, bVal, cVal; + for(;number < eigthPoints; number++){ + aVal = _mm256_load_ps(aPtr); + bVal = _mm256_load_ps(bPtr); + + cVal = _mm256_max_ps(aVal, bVal); + + _mm256_store_ps(cPtr,cVal); // Store the results back into the C container + + aPtr += 8; + bPtr += 8; + cPtr += 8; + } + + number = eigthPoints * 8; + for(;number < num_points; number++){ + const float a = *aPtr++; + const float b = *bPtr++; + *cPtr++ = ( a > b ? a : b); + } +} +#endif /* LV_HAVE_AVX */ + + #ifdef LV_HAVE_NEON #include @@ -180,3 +218,49 @@ volk_32f_x2_max_32f_u_orc(float* cVector, const float* aVector, #endif /* INCLUDED_volk_32f_x2_max_32f_a_H */ + + +#ifndef INCLUDED_volk_32f_x2_max_32f_u_H +#define INCLUDED_volk_32f_x2_max_32f_u_H + +#include +#include + +#ifdef LV_HAVE_AVX +#include + +static inline void +volk_32f_x2_max_32f_u_avx(float* cVector, const float* aVector, + const float* bVector, unsigned int num_points) +{ + unsigned int number = 0; + const unsigned int eigthPoints = num_points / 8; + + float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr= bVector; + + __m256 aVal, bVal, cVal; + for(;number < eigthPoints; number++){ + aVal = _mm256_loadu_ps(aPtr); + bVal = _mm256_loadu_ps(bPtr); + + cVal = _mm256_max_ps(aVal, bVal); + + _mm256_storeu_ps(cPtr,cVal); // Store the results back into the C container + + aPtr += 8; + bPtr += 8; + cPtr += 8; + } + + number = eigthPoints * 8; + for(;number < num_points; number++){ + const float a = *aPtr++; + const float b = *bPtr++; + *cPtr++ = ( a > b ? a : b); + } +} +#endif /* LV_HAVE_AVX */ + +#endif /* INCLUDED_volk_32f_x2_max_32f_u_H */ diff --git a/kernels/volk/volk_32f_x2_min_32f.h b/kernels/volk/volk_32f_x2_min_32f.h index f3cbae1..3beb5fa 100644 --- a/kernels/volk/volk_32f_x2_min_32f.h +++ b/kernels/volk/volk_32f_x2_min_32f.h @@ -112,6 +112,44 @@ volk_32f_x2_min_32f_a_sse(float* cVector, const float* aVector, #endif /* LV_HAVE_SSE */ +#ifdef LV_HAVE_AVX +#include + +static inline void +volk_32f_x2_min_32f_a_avx(float* cVector, const float* aVector, + const float* bVector, unsigned int num_points) +{ + unsigned int number = 0; + const unsigned int eigthPoints = num_points / 8; + + float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr= bVector; + + __m256 aVal, bVal, cVal; + for(;number < eigthPoints; number++){ + aVal = _mm256_load_ps(aPtr); + bVal = _mm256_load_ps(bPtr); + + cVal = _mm256_min_ps(aVal, bVal); + + _mm256_store_ps(cPtr,cVal); // Store the results back into the C container + + aPtr += 8; + bPtr += 8; + cPtr += 8; + } + + number = eigthPoints * 8; + for(;number < num_points; number++){ + const float a = *aPtr++; + const float b = *bPtr++; + *cPtr++ = ( a < b ? a : b); + } +} +#endif /* LV_HAVE_AVX */ + + #ifdef LV_HAVE_NEON #include @@ -183,3 +221,49 @@ volk_32f_x2_min_32f_u_orc(float* cVector, const float* aVector, #endif /* INCLUDED_volk_32f_x2_min_32f_a_H */ + + +#ifndef INCLUDED_volk_32f_x2_min_32f_u_H +#define INCLUDED_volk_32f_x2_min_32f_u_H + +#include +#include + +#ifdef LV_HAVE_AVX +#include + +static inline void +volk_32f_x2_min_32f_u_avx(float* cVector, const float* aVector, + const float* bVector, unsigned int num_points) +{ + unsigned int number = 0; + const unsigned int eigthPoints = num_points / 8; + + float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr= bVector; + + __m256 aVal, bVal, cVal; + for(;number < eigthPoints; number++){ + aVal = _mm256_loadu_ps(aPtr); + bVal = _mm256_loadu_ps(bPtr); + + cVal = _mm256_min_ps(aVal, bVal); + + _mm256_storeu_ps(cPtr,cVal); // Store the results back into the C container + + aPtr += 8; + bPtr += 8; + cPtr += 8; + } + + number = eigthPoints * 8; + for(;number < num_points; number++){ + const float a = *aPtr++; + const float b = *bPtr++; + *cPtr++ = ( a < b ? a : b); + } +} +#endif /* LV_HAVE_AVX */ + +#endif /* INCLUDED_volk_32f_x2_min_32f_u_H */