From 05cb46b93a74d0236a4fca4a75ff35dd061df718 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Daniel=20Est=C3=A9vez?= Date: Sun, 22 Jan 2023 16:10:27 +0100 Subject: [PATCH] [PATCH 5/5] volk_32f_s32f_convert_8i: code style MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit Apply code style suggestions from #617. Signed-off-by: Daniel Estévez Gbp-Pq: Name 0005-volk_32f_s32f_convert_8i-code-style.patch --- kernels/volk/volk_32f_s32f_convert_8i.h | 283 +++++++++--------------- 1 file changed, 110 insertions(+), 173 deletions(-) diff --git a/kernels/volk/volk_32f_s32f_convert_8i.h b/kernels/volk/volk_32f_s32f_convert_8i.h index 4d7c5ca..d47f95a 100644 --- a/kernels/volk/volk_32f_s32f_convert_8i.h +++ b/kernels/volk/volk_32f_s32f_convert_8i.h @@ -30,12 +30,12 @@ * \li outputVector: The output vector. * * \b Example - * Convert floats from [-1,1] to 16-bit integers with a scale of 5 to maintain smallest + * Convert floats from [-1,1] to 8-bit integers with a scale of 5 to maintain smallest delta * int N = 10; * unsigned int alignment = volk_get_alignment(); * float* increasing = (float*)volk_malloc(sizeof(float)*N, alignment); - * int16_t* out = (int16_t*)volk_malloc(sizeof(int16_t)*N, alignment); + * int8_t* out = (int8_t*)volk_malloc(sizeof(int8_t)*N, alignment); * * for(unsigned int ii = 0; ii < N; ++ii){ * increasing[ii] = 2.f * ((float)ii / (float)N) - 1.f; @@ -46,7 +46,7 @@ * float scale = 5.1f; * - * volk_32f_s32f_convert_32i(out, increasing, scale, N); + * volk_32f_s32f_convert_8i(out, increasing, scale, N); * * for(unsigned int ii = 0; ii < N; ++ii){ * printf("out[%u] = %i\n", ii, out[ii]); @@ -61,12 +61,11 @@ #define INCLUDED_volk_32f_s32f_convert_8i_u_H #include -#include static inline void volk_32f_s32f_convert_8i_single(int8_t* out, const float in) { - float min_val = INT8_MIN; - float max_val = INT8_MAX; + const float min_val = INT8_MIN; + const float max_val = INT8_MAX; if (in > max_val) { *out = (int8_t)(max_val); } else if (in < min_val) { @@ -76,6 +75,24 @@ static inline void volk_32f_s32f_convert_8i_single(int8_t* out, const float in) } } +#ifdef LV_HAVE_GENERIC + +static inline void volk_32f_s32f_convert_8i_generic(int8_t* outputVector, + const float* inputVector, + const float scalar, + unsigned int num_points) +{ + const float* inputVectorPtr = inputVector; + + for (unsigned int number = 0; number < num_points; number++) { + const float r = *inputVectorPtr++ * scalar; + volk_32f_s32f_convert_8i_single(&outputVector[number], r); + } +} + +#endif /* LV_HAVE_GENERIC */ + + #ifdef LV_HAVE_AVX2 #include @@ -84,32 +101,26 @@ static inline void volk_32f_s32f_convert_8i_u_avx2(int8_t* outputVector, const float scalar, unsigned int num_points) { - unsigned int number = 0; - const unsigned int thirtysecondPoints = num_points / 32; const float* inputVectorPtr = (const float*)inputVector; int8_t* outputVectorPtr = outputVector; - float min_val = INT8_MIN; - float max_val = INT8_MAX; - float r; + const float min_val = INT8_MIN; + const float max_val = INT8_MAX; + const __m256 vmin_val = _mm256_set1_ps(min_val); + const __m256 vmax_val = _mm256_set1_ps(max_val); - __m256 vScalar = _mm256_set1_ps(scalar); - __m256 inputVal1, inputVal2, inputVal3, inputVal4; - __m256i intInputVal1, intInputVal2, intInputVal3, intInputVal4; - __m256 vmin_val = _mm256_set1_ps(min_val); - __m256 vmax_val = _mm256_set1_ps(max_val); - __m256i intInputVal; + const __m256 vScalar = _mm256_set1_ps(scalar); - for (; number < thirtysecondPoints; number++) { - inputVal1 = _mm256_loadu_ps(inputVectorPtr); + for (unsigned int number = 0; number < thirtysecondPoints; number++) { + __m256 inputVal1 = _mm256_loadu_ps(inputVectorPtr); inputVectorPtr += 8; - inputVal2 = _mm256_loadu_ps(inputVectorPtr); + __m256 inputVal2 = _mm256_loadu_ps(inputVectorPtr); inputVectorPtr += 8; - inputVal3 = _mm256_loadu_ps(inputVectorPtr); + __m256 inputVal3 = _mm256_loadu_ps(inputVectorPtr); inputVectorPtr += 8; - inputVal4 = _mm256_loadu_ps(inputVectorPtr); + __m256 inputVal4 = _mm256_loadu_ps(inputVectorPtr); inputVectorPtr += 8; inputVal1 = _mm256_max_ps( @@ -121,10 +132,10 @@ static inline void volk_32f_s32f_convert_8i_u_avx2(int8_t* outputVector, inputVal4 = _mm256_max_ps( _mm256_min_ps(_mm256_mul_ps(inputVal4, vScalar), vmax_val), vmin_val); - intInputVal1 = _mm256_cvtps_epi32(inputVal1); - intInputVal2 = _mm256_cvtps_epi32(inputVal2); - intInputVal3 = _mm256_cvtps_epi32(inputVal3); - intInputVal4 = _mm256_cvtps_epi32(inputVal4); + __m256i intInputVal1 = _mm256_cvtps_epi32(inputVal1); + __m256i intInputVal2 = _mm256_cvtps_epi32(inputVal2); + __m256i intInputVal3 = _mm256_cvtps_epi32(inputVal3); + __m256i intInputVal4 = _mm256_cvtps_epi32(inputVal4); intInputVal1 = _mm256_packs_epi32(intInputVal1, intInputVal2); intInputVal1 = _mm256_permute4x64_epi64(intInputVal1, 0b11011000); @@ -132,15 +143,14 @@ static inline void volk_32f_s32f_convert_8i_u_avx2(int8_t* outputVector, intInputVal3 = _mm256_permute4x64_epi64(intInputVal3, 0b11011000); intInputVal1 = _mm256_packs_epi16(intInputVal1, intInputVal3); - intInputVal = _mm256_permute4x64_epi64(intInputVal1, 0b11011000); + const __m256i intInputVal = _mm256_permute4x64_epi64(intInputVal1, 0b11011000); _mm256_storeu_si256((__m256i*)outputVectorPtr, intInputVal); outputVectorPtr += 32; } - number = thirtysecondPoints * 32; - for (; number < num_points; number++) { - r = inputVector[number] * scalar; + for (unsigned int number = thirtysecondPoints * 32; number < num_points; number++) { + float r = inputVector[number] * scalar; volk_32f_s32f_convert_8i_single(&outputVector[number], r); } } @@ -156,31 +166,26 @@ static inline void volk_32f_s32f_convert_8i_u_sse2(int8_t* outputVector, const float scalar, unsigned int num_points) { - unsigned int number = 0; - const unsigned int sixteenthPoints = num_points / 16; const float* inputVectorPtr = (const float*)inputVector; int8_t* outputVectorPtr = outputVector; - float min_val = INT8_MIN; - float max_val = INT8_MAX; - float r; + const float min_val = INT8_MIN; + const float max_val = INT8_MAX; + const __m128 vmin_val = _mm_set_ps1(min_val); + const __m128 vmax_val = _mm_set_ps1(max_val); - __m128 vScalar = _mm_set_ps1(scalar); - __m128 inputVal1, inputVal2, inputVal3, inputVal4; - __m128i intInputVal1, intInputVal2, intInputVal3, intInputVal4; - __m128 vmin_val = _mm_set_ps1(min_val); - __m128 vmax_val = _mm_set_ps1(max_val); + const __m128 vScalar = _mm_set_ps1(scalar); - for (; number < sixteenthPoints; number++) { - inputVal1 = _mm_loadu_ps(inputVectorPtr); + for (unsigned int number = 0; number < sixteenthPoints; number++) { + __m128 inputVal1 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4; - inputVal2 = _mm_loadu_ps(inputVectorPtr); + __m128 inputVal2 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4; - inputVal3 = _mm_loadu_ps(inputVectorPtr); + __m128 inputVal3 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4; - inputVal4 = _mm_loadu_ps(inputVectorPtr); + __m128 inputVal4 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4; inputVal1 = @@ -192,10 +197,10 @@ static inline void volk_32f_s32f_convert_8i_u_sse2(int8_t* outputVector, inputVal4 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal4, vScalar), vmax_val), vmin_val); - intInputVal1 = _mm_cvtps_epi32(inputVal1); - intInputVal2 = _mm_cvtps_epi32(inputVal2); - intInputVal3 = _mm_cvtps_epi32(inputVal3); - intInputVal4 = _mm_cvtps_epi32(inputVal4); + __m128i intInputVal1 = _mm_cvtps_epi32(inputVal1); + __m128i intInputVal2 = _mm_cvtps_epi32(inputVal2); + __m128i intInputVal3 = _mm_cvtps_epi32(inputVal3); + __m128i intInputVal4 = _mm_cvtps_epi32(inputVal4); intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); intInputVal3 = _mm_packs_epi32(intInputVal3, intInputVal4); @@ -206,9 +211,8 @@ static inline void volk_32f_s32f_convert_8i_u_sse2(int8_t* outputVector, outputVectorPtr += 16; } - number = sixteenthPoints * 16; - for (; number < num_points; number++) { - r = inputVector[number] * scalar; + for (unsigned int number = sixteenthPoints * 16; number < num_points; number++) { + const float r = inputVector[number] * scalar; volk_32f_s32f_convert_8i_single(&outputVector[number], r); } } @@ -224,40 +228,34 @@ static inline void volk_32f_s32f_convert_8i_u_sse(int8_t* outputVector, const float scalar, unsigned int num_points) { - unsigned int number = 0; - size_t inner_loop; - const unsigned int quarterPoints = num_points / 4; const float* inputVectorPtr = (const float*)inputVector; int8_t* outputVectorPtr = outputVector; - float min_val = INT8_MIN; - float max_val = INT8_MAX; - float r; + const float min_val = INT8_MIN; + const float max_val = INT8_MAX; + const __m128 vmin_val = _mm_set_ps1(min_val); + const __m128 vmax_val = _mm_set_ps1(max_val); - __m128 vScalar = _mm_set_ps1(scalar); - __m128 ret; - __m128 vmin_val = _mm_set_ps1(min_val); - __m128 vmax_val = _mm_set_ps1(max_val); + const __m128 vScalar = _mm_set_ps1(scalar); __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4]; - for (; number < quarterPoints; number++) { - ret = _mm_loadu_ps(inputVectorPtr); + for (unsigned int number = 0; number < quarterPoints; number++) { + __m128 ret = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4; ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val); _mm_store_ps(outputFloatBuffer, ret); - for (inner_loop = 0; inner_loop < 4; inner_loop++) { + for (size_t inner_loop = 0; inner_loop < 4; inner_loop++) { *outputVectorPtr++ = (int8_t)(rintf(outputFloatBuffer[inner_loop])); } } - number = quarterPoints * 4; - for (; number < num_points; number++) { - r = inputVector[number] * scalar; + for (unsigned int number = quarterPoints * 4; number < num_points; number++) { + const float r = inputVector[number] * scalar; volk_32f_s32f_convert_8i_single(&outputVector[number], r); } } @@ -265,33 +263,11 @@ static inline void volk_32f_s32f_convert_8i_u_sse(int8_t* outputVector, #endif /* LV_HAVE_SSE */ -#ifdef LV_HAVE_GENERIC - -static inline void volk_32f_s32f_convert_8i_generic(int8_t* outputVector, - const float* inputVector, - const float scalar, - unsigned int num_points) -{ - const float* inputVectorPtr = inputVector; - unsigned int number = 0; - float r; - - for (number = 0; number < num_points; number++) { - r = *inputVectorPtr++ * scalar; - volk_32f_s32f_convert_8i_single(&outputVector[number], r); - } -} - -#endif /* LV_HAVE_GENERIC */ - - #endif /* INCLUDED_volk_32f_s32f_convert_8i_u_H */ #ifndef INCLUDED_volk_32f_s32f_convert_8i_a_H #define INCLUDED_volk_32f_s32f_convert_8i_a_H #include -#include -#include #ifdef LV_HAVE_AVX2 #include @@ -301,32 +277,26 @@ static inline void volk_32f_s32f_convert_8i_a_avx2(int8_t* outputVector, const float scalar, unsigned int num_points) { - unsigned int number = 0; - const unsigned int thirtysecondPoints = num_points / 32; const float* inputVectorPtr = (const float*)inputVector; int8_t* outputVectorPtr = outputVector; - float min_val = INT8_MIN; - float max_val = INT8_MAX; - float r; + const float min_val = INT8_MIN; + const float max_val = INT8_MAX; + const __m256 vmin_val = _mm256_set1_ps(min_val); + const __m256 vmax_val = _mm256_set1_ps(max_val); - __m256 vScalar = _mm256_set1_ps(scalar); - __m256 inputVal1, inputVal2, inputVal3, inputVal4; - __m256i intInputVal1, intInputVal2, intInputVal3, intInputVal4; - __m256 vmin_val = _mm256_set1_ps(min_val); - __m256 vmax_val = _mm256_set1_ps(max_val); - __m256i intInputVal; + const __m256 vScalar = _mm256_set1_ps(scalar); - for (; number < thirtysecondPoints; number++) { - inputVal1 = _mm256_load_ps(inputVectorPtr); + for (unsigned int number = 0; number < thirtysecondPoints; number++) { + __m256 inputVal1 = _mm256_load_ps(inputVectorPtr); inputVectorPtr += 8; - inputVal2 = _mm256_load_ps(inputVectorPtr); + __m256 inputVal2 = _mm256_load_ps(inputVectorPtr); inputVectorPtr += 8; - inputVal3 = _mm256_load_ps(inputVectorPtr); + __m256 inputVal3 = _mm256_load_ps(inputVectorPtr); inputVectorPtr += 8; - inputVal4 = _mm256_load_ps(inputVectorPtr); + __m256 inputVal4 = _mm256_load_ps(inputVectorPtr); inputVectorPtr += 8; inputVal1 = _mm256_max_ps( @@ -338,10 +308,10 @@ static inline void volk_32f_s32f_convert_8i_a_avx2(int8_t* outputVector, inputVal4 = _mm256_max_ps( _mm256_min_ps(_mm256_mul_ps(inputVal4, vScalar), vmax_val), vmin_val); - intInputVal1 = _mm256_cvtps_epi32(inputVal1); - intInputVal2 = _mm256_cvtps_epi32(inputVal2); - intInputVal3 = _mm256_cvtps_epi32(inputVal3); - intInputVal4 = _mm256_cvtps_epi32(inputVal4); + __m256i intInputVal1 = _mm256_cvtps_epi32(inputVal1); + __m256i intInputVal2 = _mm256_cvtps_epi32(inputVal2); + __m256i intInputVal3 = _mm256_cvtps_epi32(inputVal3); + __m256i intInputVal4 = _mm256_cvtps_epi32(inputVal4); intInputVal1 = _mm256_packs_epi32(intInputVal1, intInputVal2); intInputVal1 = _mm256_permute4x64_epi64(intInputVal1, 0b11011000); @@ -349,15 +319,14 @@ static inline void volk_32f_s32f_convert_8i_a_avx2(int8_t* outputVector, intInputVal3 = _mm256_permute4x64_epi64(intInputVal3, 0b11011000); intInputVal1 = _mm256_packs_epi16(intInputVal1, intInputVal3); - intInputVal = _mm256_permute4x64_epi64(intInputVal1, 0b11011000); + __m256i intInputVal = _mm256_permute4x64_epi64(intInputVal1, 0b11011000); _mm256_store_si256((__m256i*)outputVectorPtr, intInputVal); outputVectorPtr += 32; } - number = thirtysecondPoints * 32; - for (; number < num_points; number++) { - r = inputVector[number] * scalar; + for (unsigned int number = thirtysecondPoints * 32; number < num_points; number++) { + const float r = inputVector[number] * scalar; volk_32f_s32f_convert_8i_single(&outputVector[number], r); } } @@ -373,31 +342,26 @@ static inline void volk_32f_s32f_convert_8i_a_sse2(int8_t* outputVector, const float scalar, unsigned int num_points) { - unsigned int number = 0; - const unsigned int sixteenthPoints = num_points / 16; const float* inputVectorPtr = (const float*)inputVector; int8_t* outputVectorPtr = outputVector; - float min_val = INT8_MIN; - float max_val = INT8_MAX; - float r; + const float min_val = INT8_MIN; + const float max_val = INT8_MAX; + const __m128 vmin_val = _mm_set_ps1(min_val); + const __m128 vmax_val = _mm_set_ps1(max_val); - __m128 vScalar = _mm_set_ps1(scalar); - __m128 inputVal1, inputVal2, inputVal3, inputVal4; - __m128i intInputVal1, intInputVal2, intInputVal3, intInputVal4; - __m128 vmin_val = _mm_set_ps1(min_val); - __m128 vmax_val = _mm_set_ps1(max_val); + const __m128 vScalar = _mm_set_ps1(scalar); - for (; number < sixteenthPoints; number++) { - inputVal1 = _mm_load_ps(inputVectorPtr); + for (unsigned int number = 0; number < sixteenthPoints; number++) { + __m128 inputVal1 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; - inputVal2 = _mm_load_ps(inputVectorPtr); + __m128 inputVal2 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; - inputVal3 = _mm_load_ps(inputVectorPtr); + __m128 inputVal3 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; - inputVal4 = _mm_load_ps(inputVectorPtr); + __m128 inputVal4 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; inputVal1 = @@ -409,10 +373,10 @@ static inline void volk_32f_s32f_convert_8i_a_sse2(int8_t* outputVector, inputVal4 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal4, vScalar), vmax_val), vmin_val); - intInputVal1 = _mm_cvtps_epi32(inputVal1); - intInputVal2 = _mm_cvtps_epi32(inputVal2); - intInputVal3 = _mm_cvtps_epi32(inputVal3); - intInputVal4 = _mm_cvtps_epi32(inputVal4); + __m128i intInputVal1 = _mm_cvtps_epi32(inputVal1); + __m128i intInputVal2 = _mm_cvtps_epi32(inputVal2); + __m128i intInputVal3 = _mm_cvtps_epi32(inputVal3); + __m128i intInputVal4 = _mm_cvtps_epi32(inputVal4); intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); intInputVal3 = _mm_packs_epi32(intInputVal3, intInputVal4); @@ -423,9 +387,8 @@ static inline void volk_32f_s32f_convert_8i_a_sse2(int8_t* outputVector, outputVectorPtr += 16; } - number = sixteenthPoints * 16; - for (; number < num_points; number++) { - r = inputVector[number] * scalar; + for (unsigned int number = sixteenthPoints * 16; number < num_points; number++) { + const float r = inputVector[number] * scalar; volk_32f_s32f_convert_8i_single(&outputVector[number], r); } } @@ -440,40 +403,34 @@ static inline void volk_32f_s32f_convert_8i_a_sse(int8_t* outputVector, const float scalar, unsigned int num_points) { - unsigned int number = 0; - size_t inner_loop; - const unsigned int quarterPoints = num_points / 4; const float* inputVectorPtr = (const float*)inputVector; + int8_t* outputVectorPtr = outputVector; - float min_val = INT8_MIN; - float max_val = INT8_MAX; - float r; + const float min_val = INT8_MIN; + const float max_val = INT8_MAX; + const __m128 vmin_val = _mm_set_ps1(min_val); + const __m128 vmax_val = _mm_set_ps1(max_val); - int8_t* outputVectorPtr = outputVector; - __m128 vScalar = _mm_set_ps1(scalar); - __m128 ret; - __m128 vmin_val = _mm_set_ps1(min_val); - __m128 vmax_val = _mm_set_ps1(max_val); + const __m128 vScalar = _mm_set_ps1(scalar); __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4]; - for (; number < quarterPoints; number++) { - ret = _mm_load_ps(inputVectorPtr); + for (unsigned int number = 0; number < quarterPoints; number++) { + __m128 ret = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val); _mm_store_ps(outputFloatBuffer, ret); - for (inner_loop = 0; inner_loop < 4; inner_loop++) { + for (size_t inner_loop = 0; inner_loop < 4; inner_loop++) { *outputVectorPtr++ = (int8_t)(rintf(outputFloatBuffer[inner_loop])); } } - number = quarterPoints * 4; - for (; number < num_points; number++) { - r = inputVector[number] * scalar; + for (unsigned int number = quarterPoints * 4; number < num_points; number++) { + const float r = inputVector[number] * scalar; volk_32f_s32f_convert_8i_single(&outputVector[number], r); } } @@ -481,24 +438,4 @@ static inline void volk_32f_s32f_convert_8i_a_sse(int8_t* outputVector, #endif /* LV_HAVE_SSE */ -#ifdef LV_HAVE_GENERIC - -static inline void volk_32f_s32f_convert_8i_a_generic(int8_t* outputVector, - const float* inputVector, - const float scalar, - unsigned int num_points) -{ - const float* inputVectorPtr = inputVector; - unsigned int number = 0; - float r; - - for (number = 0; number < num_points; number++) { - r = *inputVectorPtr++ * scalar; - volk_32f_s32f_convert_8i_single(&outputVector[number], r); - } -} - -#endif /* LV_HAVE_GENERIC */ - - #endif /* INCLUDED_volk_32f_s32f_convert_8i_a_H */ -- 2.30.2