From 9e16c3d128cbda09636f49da5830cbcbcfc39ef9 Mon Sep 17 00:00:00 2001 From: Zlika Date: Wed, 16 Jun 2021 15:11:25 +0200 Subject: [PATCH] [PATCH 04/73] Code cleanup Signed-off-by: Zlika Gbp-Pq: Name 0004-Code-cleanup.patch --- kernels/volk/volk_32f_index_min_16u.h | 92 +++++++--------- kernels/volk/volk_32f_index_min_32u.h | 142 +++++++++++-------------- kernels/volk/volk_32fc_index_min_16u.h | 82 +++++++------- kernels/volk/volk_32fc_index_min_32u.h | 102 +++++++++--------- 4 files changed, 190 insertions(+), 228 deletions(-) diff --git a/kernels/volk/volk_32f_index_min_16u.h b/kernels/volk/volk_32f_index_min_16u.h index 848b75c..d8ffcc7 100644 --- a/kernels/volk/volk_32f_index_min_16u.h +++ b/kernels/volk/volk_32f_index_min_16u.h @@ -36,11 +36,11 @@ * * Dispatcher Prototype * \code - * void volk_32f_index_min_16u(uint16_t* target, const float* src0, uint32_t num_points) + * void volk_32f_index_min_16u(uint16_t* target, const float* source, uint32_t num_points) * \endcode * * \b Inputs - * \li src0: The input vector of floats. + * \li source: The input vector of floats. * \li num_points: The number of data points. * * \b Outputs @@ -80,19 +80,17 @@ #include static inline void -volk_32f_index_min_16u_a_avx(uint16_t* target, const float* src0, uint32_t num_points) +volk_32f_index_min_16u_a_avx(uint16_t* target, const float* source, uint32_t num_points) { num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points; - - uint32_t number = 0; const uint32_t eighthPoints = num_points / 8; - float* inputPtr = (float*)src0; + float* inputPtr = (float*)source; __m256 indexIncrementValues = _mm256_set1_ps(8); __m256 currentIndexes = _mm256_set_ps(-1, -2, -3, -4, -5, -6, -7, -8); - float min = src0[0]; + float min = source[0]; float index = 0; __m256 minValues = _mm256_set1_ps(min); __m256 minValuesIndex = _mm256_setzero_ps(); @@ -102,7 +100,7 @@ volk_32f_index_min_16u_a_avx(uint16_t* target, const float* src0, uint32_t num_p __VOLK_ATTR_ALIGNED(32) float minValuesBuffer[8]; __VOLK_ATTR_ALIGNED(32) float minIndexesBuffer[8]; - for (; number < eighthPoints; number++) { + for (uint32_t number = 0; number < eighthPoints; number++) { currentValues = _mm256_load_ps(inputPtr); inputPtr += 8; @@ -118,7 +116,7 @@ volk_32f_index_min_16u_a_avx(uint16_t* target, const float* src0, uint32_t num_p _mm256_store_ps(minValuesBuffer, minValues); _mm256_store_ps(minIndexesBuffer, minValuesIndex); - for (number = 0; number < 8; number++) { + for (uint32_t number = 0; number < 8; number++) { if (minValuesBuffer[number] < min) { index = minIndexesBuffer[number]; min = minValuesBuffer[number]; @@ -128,11 +126,10 @@ volk_32f_index_min_16u_a_avx(uint16_t* target, const float* src0, uint32_t num_p } } - number = eighthPoints * 8; - for (; number < num_points; number++) { - if (src0[number] < min) { + for (uint32_t number = eighthPoints * 8; number < num_points; number++) { + if (source[number] < min) { index = number; - min = src0[number]; + min = source[number]; } } target[0] = (uint16_t)index; @@ -144,19 +141,17 @@ volk_32f_index_min_16u_a_avx(uint16_t* target, const float* src0, uint32_t num_p #include static inline void -volk_32f_index_min_16u_a_sse4_1(uint16_t* target, const float* src0, uint32_t num_points) +volk_32f_index_min_16u_a_sse4_1(uint16_t* target, const float* source, uint32_t num_points) { num_points = (num_points > USHRT_MAX) ? 
USHRT_MAX : num_points; - - uint32_t number = 0; const uint32_t quarterPoints = num_points / 4; - float* inputPtr = (float*)src0; + float* inputPtr = (float*)source; __m128 indexIncrementValues = _mm_set1_ps(4); __m128 currentIndexes = _mm_set_ps(-1, -2, -3, -4); - float min = src0[0]; + float min = source[0]; float index = 0; __m128 minValues = _mm_set1_ps(min); __m128 minValuesIndex = _mm_setzero_ps(); @@ -166,7 +161,7 @@ volk_32f_index_min_16u_a_sse4_1(uint16_t* target, const float* src0, uint32_t nu __VOLK_ATTR_ALIGNED(16) float minValuesBuffer[4]; __VOLK_ATTR_ALIGNED(16) float minIndexesBuffer[4]; - for (; number < quarterPoints; number++) { + for (uint32_t number = 0; number < quarterPoints; number++) { currentValues = _mm_load_ps(inputPtr); inputPtr += 4; @@ -182,7 +177,7 @@ volk_32f_index_min_16u_a_sse4_1(uint16_t* target, const float* src0, uint32_t nu _mm_store_ps(minValuesBuffer, minValues); _mm_store_ps(minIndexesBuffer, minValuesIndex); - for (number = 0; number < 4; number++) { + for (uint32_t number = 0; number < 4; number++) { if (minValuesBuffer[number] < min) { index = minIndexesBuffer[number]; min = minValuesBuffer[number]; @@ -192,11 +187,10 @@ volk_32f_index_min_16u_a_sse4_1(uint16_t* target, const float* src0, uint32_t nu } } - number = quarterPoints * 4; - for (; number < num_points; number++) { - if (src0[number] < min) { + for (uint32_t number = quarterPoints * 4; number < num_points; number++) { + if (source[number] < min) { index = number; - min = src0[number]; + min = source[number]; } } target[0] = (uint16_t)index; @@ -210,19 +204,17 @@ volk_32f_index_min_16u_a_sse4_1(uint16_t* target, const float* src0, uint32_t nu #include static inline void -volk_32f_index_min_16u_a_sse(uint16_t* target, const float* src0, uint32_t num_points) +volk_32f_index_min_16u_a_sse(uint16_t* target, const float* source, uint32_t num_points) { num_points = (num_points > USHRT_MAX) ? 
USHRT_MAX : num_points; - - uint32_t number = 0; const uint32_t quarterPoints = num_points / 4; - float* inputPtr = (float*)src0; + float* inputPtr = (float*)source; __m128 indexIncrementValues = _mm_set1_ps(4); __m128 currentIndexes = _mm_set_ps(-1, -2, -3, -4); - float min = src0[0]; + float min = source[0]; float index = 0; __m128 minValues = _mm_set1_ps(min); __m128 minValuesIndex = _mm_setzero_ps(); @@ -232,7 +224,7 @@ volk_32f_index_min_16u_a_sse(uint16_t* target, const float* src0, uint32_t num_p __VOLK_ATTR_ALIGNED(16) float minValuesBuffer[4]; __VOLK_ATTR_ALIGNED(16) float minIndexesBuffer[4]; - for (; number < quarterPoints; number++) { + for (uint32_t number = 0; number < quarterPoints; number++) { currentValues = _mm_load_ps(inputPtr); inputPtr += 4; @@ -250,7 +242,7 @@ volk_32f_index_min_16u_a_sse(uint16_t* target, const float* src0, uint32_t num_p _mm_store_ps(minValuesBuffer, minValues); _mm_store_ps(minIndexesBuffer, minValuesIndex); - for (number = 0; number < 4; number++) { + for (uint32_t number = 0; number < 4; number++) { if (minValuesBuffer[number] < min) { index = minIndexesBuffer[number]; min = minValuesBuffer[number]; @@ -260,11 +252,10 @@ volk_32f_index_min_16u_a_sse(uint16_t* target, const float* src0, uint32_t num_p } } - number = quarterPoints * 4; - for (; number < num_points; number++) { - if (src0[number] < min) { + for (uint32_t number = quarterPoints * 4; number < num_points; number++) { + if (source[number] < min) { index = number; - min = src0[number]; + min = source[number]; } } target[0] = (uint16_t)index; @@ -276,19 +267,17 @@ volk_32f_index_min_16u_a_sse(uint16_t* target, const float* src0, uint32_t num_p #ifdef LV_HAVE_GENERIC static inline void -volk_32f_index_min_16u_generic(uint16_t* target, const float* src0, uint32_t num_points) +volk_32f_index_min_16u_generic(uint16_t* target, const float* source, uint32_t num_points) { num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points; - float min = src0[0]; + float min = source[0]; uint16_t index = 0; - uint32_t i = 1; - - for (; i < num_points; ++i) { - if (src0[i] < min) { + for (uint32_t i = 1; i < num_points; ++i) { + if (source[i] < min) { index = i; - min = src0[i]; + min = source[i]; } } target[0] = index; @@ -312,19 +301,17 @@ volk_32f_index_min_16u_generic(uint16_t* target, const float* src0, uint32_t num #include static inline void -volk_32f_index_min_16u_u_avx(uint16_t* target, const float* src0, uint32_t num_points) +volk_32f_index_min_16u_u_avx(uint16_t* target, const float* source, uint32_t num_points) { num_points = (num_points > USHRT_MAX) ? 
USHRT_MAX : num_points; - - uint32_t number = 0; const uint32_t eighthPoints = num_points / 8; - float* inputPtr = (float*)src0; + float* inputPtr = (float*)source; __m256 indexIncrementValues = _mm256_set1_ps(8); __m256 currentIndexes = _mm256_set_ps(-1, -2, -3, -4, -5, -6, -7, -8); - float min = src0[0]; + float min = source[0]; float index = 0; __m256 minValues = _mm256_set1_ps(min); __m256 minValuesIndex = _mm256_setzero_ps(); @@ -334,7 +321,7 @@ volk_32f_index_min_16u_u_avx(uint16_t* target, const float* src0, uint32_t num_p __VOLK_ATTR_ALIGNED(32) float minValuesBuffer[8]; __VOLK_ATTR_ALIGNED(32) float minIndexesBuffer[8]; - for (; number < eighthPoints; number++) { + for (uint32_t number = 0; number < eighthPoints; number++) { currentValues = _mm256_loadu_ps(inputPtr); inputPtr += 8; @@ -350,7 +337,7 @@ volk_32f_index_min_16u_u_avx(uint16_t* target, const float* src0, uint32_t num_p _mm256_storeu_ps(minValuesBuffer, minValues); _mm256_storeu_ps(minIndexesBuffer, minValuesIndex); - for (number = 0; number < 8; number++) { + for (uint32_t number = 0; number < 8; number++) { if (minValuesBuffer[number] < min) { index = minIndexesBuffer[number]; min = minValuesBuffer[number]; @@ -360,11 +347,10 @@ volk_32f_index_min_16u_u_avx(uint16_t* target, const float* src0, uint32_t num_p } } - number = eighthPoints * 8; - for (; number < num_points; number++) { - if (src0[number] < min) { + for (uint32_t number = eighthPoints * 8; number < num_points; number++) { + if (source[number] < min) { index = number; - min = src0[number]; + min = source[number]; } } target[0] = (uint16_t)index; diff --git a/kernels/volk/volk_32f_index_min_32u.h b/kernels/volk/volk_32f_index_min_32u.h index 67ee426..23c2d17 100644 --- a/kernels/volk/volk_32f_index_min_32u.h +++ b/kernels/volk/volk_32f_index_min_32u.h @@ -30,11 +30,11 @@ * * Dispatcher Prototype * \code - * void volk_32f_index_min_32u(uint32_t* target, const float* src0, uint32_t num_points) + * void volk_32f_index_min_32u(uint32_t* target, const float* source, uint32_t num_points) * \endcode * * \b Inputs - * \li src0: The input vector of floats. + * \li source: The input vector of floats. * \li num_points: The number of data points. 
* * \b Outputs @@ -73,18 +73,17 @@ #include static inline void -volk_32f_index_min_32u_a_sse4_1(uint32_t* target, const float* src0, uint32_t num_points) +volk_32f_index_min_32u_a_sse4_1(uint32_t* target, const float* source, uint32_t num_points) { if (num_points > 0) { - uint32_t number = 0; const uint32_t quarterPoints = num_points / 4; - float* inputPtr = (float*)src0; + float* inputPtr = (float*)source; __m128 indexIncrementValues = _mm_set1_ps(4); __m128 currentIndexes = _mm_set_ps(-1, -2, -3, -4); - float min = src0[0]; + float min = source[0]; float index = 0; __m128 minValues = _mm_set1_ps(min); __m128 minValuesIndex = _mm_setzero_ps(); @@ -94,7 +93,7 @@ volk_32f_index_min_32u_a_sse4_1(uint32_t* target, const float* src0, uint32_t nu __VOLK_ATTR_ALIGNED(16) float minValuesBuffer[4]; __VOLK_ATTR_ALIGNED(16) float minIndexesBuffer[4]; - for (; number < quarterPoints; number++) { + for (uint32_t number = 0; number < quarterPoints; number++) { currentValues = _mm_load_ps(inputPtr); inputPtr += 4; @@ -111,7 +110,7 @@ volk_32f_index_min_32u_a_sse4_1(uint32_t* target, const float* src0, uint32_t nu _mm_store_ps(minValuesBuffer, minValues); _mm_store_ps(minIndexesBuffer, minValuesIndex); - for (number = 0; number < 4; number++) { + for (uint32_t number = 0; number < 4; number++) { if (minValuesBuffer[number] < min) { index = minIndexesBuffer[number]; min = minValuesBuffer[number]; @@ -121,11 +120,10 @@ volk_32f_index_min_32u_a_sse4_1(uint32_t* target, const float* src0, uint32_t nu } } - number = quarterPoints * 4; - for (; number < num_points; number++) { - if (src0[number] < min) { + for (uint32_t number = quarterPoints * 4; number < num_points; number++) { + if (source[number] < min) { index = number; - min = src0[number]; + min = source[number]; } } target[0] = (uint32_t)index; @@ -140,18 +138,17 @@ volk_32f_index_min_32u_a_sse4_1(uint32_t* target, const float* src0, uint32_t nu #include static inline void -volk_32f_index_min_32u_a_sse(uint32_t* target, const float* src0, uint32_t num_points) +volk_32f_index_min_32u_a_sse(uint32_t* target, const float* source, uint32_t num_points) { if (num_points > 0) { - uint32_t number = 0; const uint32_t quarterPoints = num_points / 4; - float* inputPtr = (float*)src0; + float* inputPtr = (float*)source; __m128 indexIncrementValues = _mm_set1_ps(4); __m128 currentIndexes = _mm_set_ps(-1, -2, -3, -4); - float min = src0[0]; + float min = source[0]; float index = 0; __m128 minValues = _mm_set1_ps(min); __m128 minValuesIndex = _mm_setzero_ps(); @@ -161,7 +158,7 @@ volk_32f_index_min_32u_a_sse(uint32_t* target, const float* src0, uint32_t num_p __VOLK_ATTR_ALIGNED(16) float minValuesBuffer[4]; __VOLK_ATTR_ALIGNED(16) float minIndexesBuffer[4]; - for (; number < quarterPoints; number++) { + for (uint32_t number = 0; number < quarterPoints; number++) { currentValues = _mm_load_ps(inputPtr); inputPtr += 4; @@ -180,7 +177,7 @@ volk_32f_index_min_32u_a_sse(uint32_t* target, const float* src0, uint32_t num_p _mm_store_ps(minValuesBuffer, minValues); _mm_store_ps(minIndexesBuffer, minValuesIndex); - for (number = 0; number < 4; number++) { + for (uint32_t number = 0; number < 4; number++) { if (minValuesBuffer[number] < min) { index = minIndexesBuffer[number]; min = minValuesBuffer[number]; @@ -190,11 +187,10 @@ volk_32f_index_min_32u_a_sse(uint32_t* target, const float* src0, uint32_t num_p } } - number = quarterPoints * 4; - for (; number < num_points; number++) { - if (src0[number] < min) { + for (uint32_t number = quarterPoints * 4; number < num_points; 
number++) { + if (source[number] < min) { index = number; - min = src0[number]; + min = source[number]; } } target[0] = (uint32_t)index; @@ -208,18 +204,17 @@ volk_32f_index_min_32u_a_sse(uint32_t* target, const float* src0, uint32_t num_p #include static inline void -volk_32f_index_min_32u_a_avx(uint32_t* target, const float* src0, uint32_t num_points) +volk_32f_index_min_32u_a_avx(uint32_t* target, const float* source, uint32_t num_points) { if (num_points > 0) { - uint32_t number = 0; const uint32_t quarterPoints = num_points / 8; - float* inputPtr = (float*)src0; + float* inputPtr = (float*)source; __m256 indexIncrementValues = _mm256_set1_ps(8); __m256 currentIndexes = _mm256_set_ps(-1, -2, -3, -4, -5, -6, -7, -8); - float min = src0[0]; + float min = source[0]; float index = 0; __m256 minValues = _mm256_set1_ps(min); __m256 minValuesIndex = _mm256_setzero_ps(); @@ -229,7 +224,7 @@ volk_32f_index_min_32u_a_avx(uint32_t* target, const float* src0, uint32_t num_p __VOLK_ATTR_ALIGNED(32) float minValuesBuffer[8]; __VOLK_ATTR_ALIGNED(32) float minIndexesBuffer[8]; - for (; number < quarterPoints; number++) { + for (uint32_t number = 0; number < quarterPoints; number++) { currentValues = _mm256_load_ps(inputPtr); inputPtr += 8; currentIndexes = _mm256_add_ps(currentIndexes, indexIncrementValues); @@ -243,7 +238,7 @@ volk_32f_index_min_32u_a_avx(uint32_t* target, const float* src0, uint32_t num_p _mm256_store_ps(minValuesBuffer, minValues); _mm256_store_ps(minIndexesBuffer, minValuesIndex); - for (number = 0; number < 8; number++) { + for (uint32_t number = 0; number < 8; number++) { if (minValuesBuffer[number] < min) { index = minIndexesBuffer[number]; min = minValuesBuffer[number]; @@ -253,11 +248,10 @@ volk_32f_index_min_32u_a_avx(uint32_t* target, const float* src0, uint32_t num_p } } - number = quarterPoints * 8; - for (; number < num_points; number++) { - if (src0[number] < min) { + for (uint32_t number = quarterPoints * 8; number < num_points; number++) { + if (source[number] < min) { index = number; - min = src0[number]; + min = source[number]; } } target[0] = (uint32_t)index; @@ -271,19 +265,18 @@ volk_32f_index_min_32u_a_avx(uint32_t* target, const float* src0, uint32_t num_p #include static inline void -volk_32f_index_min_32u_neon(uint32_t* target, const float* src0, uint32_t num_points) +volk_32f_index_min_32u_neon(uint32_t* target, const float* source, uint32_t num_points) { if (num_points > 0) { - uint32_t number = 0; const uint32_t quarterPoints = num_points / 4; - float* inputPtr = (float*)src0; + float* inputPtr = (float*)source; float32x4_t indexIncrementValues = vdupq_n_f32(4); __VOLK_ATTR_ALIGNED(16) float currentIndexes_float[4] = { -4.0f, -3.0f, -2.0f, -1.0f }; float32x4_t currentIndexes = vld1q_f32(currentIndexes_float); - float min = src0[0]; + float min = source[0]; float index = 0; float32x4_t minValues = vdupq_n_f32(min); uint32x4_t minValuesIndex = vmovq_n_u32(0); @@ -294,7 +287,7 @@ volk_32f_index_min_32u_neon(uint32_t* target, const float* src0, uint32_t num_po __VOLK_ATTR_ALIGNED(16) float minValuesBuffer[4]; __VOLK_ATTR_ALIGNED(16) float minIndexesBuffer[4]; - for (; number < quarterPoints; number++) { + for (uint32_t number = 0; number < quarterPoints; number++) { currentValues = vld1q_f32(inputPtr); inputPtr += 4; currentIndexes = vaddq_f32(currentIndexes, indexIncrementValues); @@ -308,7 +301,7 @@ volk_32f_index_min_32u_neon(uint32_t* target, const float* src0, uint32_t num_po // Calculate the smallest value from the remaining 4 points 
vst1q_f32(minValuesBuffer, minValues); vst1q_f32(minIndexesBuffer, vcvtq_f32_u32(minValuesIndex)); - for (number = 0; number < 4; number++) { + for (uint32_t number = 0; number < 4; number++) { if (minValuesBuffer[number] < min) { index = minIndexesBuffer[number]; min = minValuesBuffer[number]; @@ -318,11 +311,10 @@ volk_32f_index_min_32u_neon(uint32_t* target, const float* src0, uint32_t num_po } } - number = quarterPoints * 4; - for (; number < num_points; number++) { - if (src0[number] < min) { + for (uint32_t number = quarterPoints * 4; number < num_points; number++) { + if (source[number] < min) { index = number; - min = src0[number]; + min = source[number]; } } target[0] = (uint32_t)index; @@ -335,18 +327,16 @@ volk_32f_index_min_32u_neon(uint32_t* target, const float* src0, uint32_t num_po #ifdef LV_HAVE_GENERIC static inline void -volk_32f_index_min_32u_generic(uint32_t* target, const float* src0, uint32_t num_points) +volk_32f_index_min_32u_generic(uint32_t* target, const float* source, uint32_t num_points) { if (num_points > 0) { - float min = src0[0]; + float min = source[0]; uint32_t index = 0; - uint32_t i = 1; - - for (; i < num_points; ++i) { - if (src0[i] < min) { + for (uint32_t i = 1; i < num_points; ++i) { + if (source[i] < min) { index = i; - min = src0[i]; + min = source[i]; } } target[0] = index; @@ -371,18 +361,17 @@ volk_32f_index_min_32u_generic(uint32_t* target, const float* src0, uint32_t num #include static inline void -volk_32f_index_min_32u_u_avx(uint32_t* target, const float* src0, uint32_t num_points) +volk_32f_index_min_32u_u_avx(uint32_t* target, const float* source, uint32_t num_points) { if (num_points > 0) { - uint32_t number = 0; const uint32_t quarterPoints = num_points / 8; - float* inputPtr = (float*)src0; + float* inputPtr = (float*)source; __m256 indexIncrementValues = _mm256_set1_ps(8); __m256 currentIndexes = _mm256_set_ps(-1, -2, -3, -4, -5, -6, -7, -8); - float min = src0[0]; + float min = source[0]; float index = 0; __m256 minValues = _mm256_set1_ps(min); __m256 minValuesIndex = _mm256_setzero_ps(); @@ -392,7 +381,7 @@ volk_32f_index_min_32u_u_avx(uint32_t* target, const float* src0, uint32_t num_p __VOLK_ATTR_ALIGNED(32) float minValuesBuffer[8]; __VOLK_ATTR_ALIGNED(32) float minIndexesBuffer[8]; - for (; number < quarterPoints; number++) { + for (uint32_t number = 0; number < quarterPoints; number++) { currentValues = _mm256_loadu_ps(inputPtr); inputPtr += 8; currentIndexes = _mm256_add_ps(currentIndexes, indexIncrementValues); @@ -406,7 +395,7 @@ volk_32f_index_min_32u_u_avx(uint32_t* target, const float* src0, uint32_t num_p _mm256_store_ps(minValuesBuffer, minValues); _mm256_store_ps(minIndexesBuffer, minValuesIndex); - for (number = 0; number < 8; number++) { + for (uint32_t number = 0; number < 8; number++) { if (minValuesBuffer[number] < min) { index = minIndexesBuffer[number]; min = minValuesBuffer[number]; @@ -416,11 +405,10 @@ volk_32f_index_min_32u_u_avx(uint32_t* target, const float* src0, uint32_t num_p } } - number = quarterPoints * 8; - for (; number < num_points; number++) { - if (src0[number] < min) { + for (uint32_t number = quarterPoints * 8; number < num_points; number++) { + if (source[number] < min) { index = number; - min = src0[number]; + min = source[number]; } } target[0] = (uint32_t)index; @@ -434,18 +422,17 @@ volk_32f_index_min_32u_u_avx(uint32_t* target, const float* src0, uint32_t num_p #include static inline void -volk_32f_index_min_32u_u_sse4_1(uint32_t* target, const float* src0, uint32_t num_points) 
+volk_32f_index_min_32u_u_sse4_1(uint32_t* target, const float* source, uint32_t num_points) { if (num_points > 0) { - uint32_t number = 0; const uint32_t quarterPoints = num_points / 4; - float* inputPtr = (float*)src0; + float* inputPtr = (float*)source; __m128 indexIncrementValues = _mm_set1_ps(4); __m128 currentIndexes = _mm_set_ps(-1, -2, -3, -4); - float min = src0[0]; + float min = source[0]; float index = 0; __m128 minValues = _mm_set1_ps(min); __m128 minValuesIndex = _mm_setzero_ps(); @@ -455,7 +442,7 @@ volk_32f_index_min_32u_u_sse4_1(uint32_t* target, const float* src0, uint32_t nu __VOLK_ATTR_ALIGNED(16) float minValuesBuffer[4]; __VOLK_ATTR_ALIGNED(16) float minIndexesBuffer[4]; - for (; number < quarterPoints; number++) { + for (uint32_t number = 0; number < quarterPoints; number++) { currentValues = _mm_loadu_ps(inputPtr); inputPtr += 4; currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues); @@ -469,7 +456,7 @@ volk_32f_index_min_32u_u_sse4_1(uint32_t* target, const float* src0, uint32_t nu _mm_store_ps(minValuesBuffer, minValues); _mm_store_ps(minIndexesBuffer, minValuesIndex); - for (number = 0; number < 4; number++) { + for (uint32_t number = 0; number < 4; number++) { if (minValuesBuffer[number] < min) { index = minIndexesBuffer[number]; min = minValuesBuffer[number]; @@ -479,11 +466,10 @@ volk_32f_index_min_32u_u_sse4_1(uint32_t* target, const float* src0, uint32_t nu } } - number = quarterPoints * 4; - for (; number < num_points; number++) { - if (src0[number] < min) { + for (uint32_t number = quarterPoints * 4; number < num_points; number++) { + if (source[number] < min) { index = number; - min = src0[number]; + min = source[number]; } } target[0] = (uint32_t)index; @@ -496,18 +482,17 @@ volk_32f_index_min_32u_u_sse4_1(uint32_t* target, const float* src0, uint32_t nu #include static inline void -volk_32f_index_min_32u_u_sse(uint32_t* target, const float* src0, uint32_t num_points) +volk_32f_index_min_32u_u_sse(uint32_t* target, const float* source, uint32_t num_points) { if (num_points > 0) { - uint32_t number = 0; const uint32_t quarterPoints = num_points / 4; - float* inputPtr = (float*)src0; + float* inputPtr = (float*)source; __m128 indexIncrementValues = _mm_set1_ps(4); __m128 currentIndexes = _mm_set_ps(-1, -2, -3, -4); - float min = src0[0]; + float min = source[0]; float index = 0; __m128 minValues = _mm_set1_ps(min); __m128 minValuesIndex = _mm_setzero_ps(); @@ -517,7 +502,7 @@ volk_32f_index_min_32u_u_sse(uint32_t* target, const float* src0, uint32_t num_p __VOLK_ATTR_ALIGNED(16) float minValuesBuffer[4]; __VOLK_ATTR_ALIGNED(16) float minIndexesBuffer[4]; - for (; number < quarterPoints; number++) { + for (uint32_t number = 0; number < quarterPoints; number++) { currentValues = _mm_loadu_ps(inputPtr); inputPtr += 4; currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues); @@ -532,7 +517,7 @@ volk_32f_index_min_32u_u_sse(uint32_t* target, const float* src0, uint32_t num_p _mm_store_ps(minValuesBuffer, minValues); _mm_store_ps(minIndexesBuffer, minValuesIndex); - for (number = 0; number < 4; number++) { + for (uint32_t number = 0; number < 4; number++) { if (minValuesBuffer[number] < min) { index = minIndexesBuffer[number]; min = minValuesBuffer[number]; @@ -542,11 +527,10 @@ volk_32f_index_min_32u_u_sse(uint32_t* target, const float* src0, uint32_t num_p } } - number = quarterPoints * 4; - for (; number < num_points; number++) { - if (src0[number] < min) { + for (uint32_t number = quarterPoints * 4; number < num_points; number++) { + if 
(source[number] < min) { index = number; - min = src0[number]; + min = source[number]; } } target[0] = (uint32_t)index; diff --git a/kernels/volk/volk_32fc_index_min_16u.h b/kernels/volk/volk_32fc_index_min_16u.h index 5539ebf..bf7f6e3 100644 --- a/kernels/volk/volk_32fc_index_min_16u.h +++ b/kernels/volk/volk_32fc_index_min_16u.h @@ -36,11 +36,11 @@ * * Dispatcher Prototype * \code - * void volk_32fc_index_min_16u(uint16_t* target, lv_32fc_t* src0, uint32_t + * void volk_32fc_index_min_16u(uint16_t* target, lv_32fc_t* source, uint32_t * num_points) \endcode * * \b Inputs - * \li src0: The complex input vector. + * \li source: The complex input vector. * \li num_points: The number of samples. * * \b Outputs @@ -87,7 +87,7 @@ #include static inline void volk_32fc_index_min_16u_a_avx2_variant_0(uint16_t* target, - lv_32fc_t* src0, + lv_32fc_t* source, uint32_t num_points) { num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points; @@ -104,11 +104,11 @@ static inline void volk_32fc_index_min_16u_a_avx2_variant_0(uint16_t* target, __m256i min_indices = _mm256_setzero_si256(); for (unsigned i = 0; i < num_points / 8u; ++i) { - __m256 in0 = _mm256_load_ps((float*)src0); - __m256 in1 = _mm256_load_ps((float*)(src0 + 4)); + __m256 in0 = _mm256_load_ps((float*)source); + __m256 in1 = _mm256_load_ps((float*)(source + 4)); vector_32fc_index_min_variant0( in0, in1, &min_values, &min_indices, ¤t_indices, indices_increment); - src0 += 8; + source += 8; } // determine minimum value and index in the result of the vectorized loop @@ -129,12 +129,12 @@ static inline void volk_32fc_index_min_16u_a_avx2_variant_0(uint16_t* target, // handle tail not processed by the vectorized loop for (unsigned i = num_points & (~7u); i < num_points; ++i) { const float abs_squared = - lv_creal(*src0) * lv_creal(*src0) + lv_cimag(*src0) * lv_cimag(*src0); + lv_creal(*source) * lv_creal(*source) + lv_cimag(*source) * lv_cimag(*source); if (abs_squared < min) { min = abs_squared; index = i; } - ++src0; + ++source; } *target = index; @@ -147,7 +147,7 @@ static inline void volk_32fc_index_min_16u_a_avx2_variant_0(uint16_t* target, #include static inline void volk_32fc_index_min_16u_a_avx2_variant_1(uint16_t* target, - lv_32fc_t* src0, + lv_32fc_t* source, uint32_t num_points) { num_points = (num_points > USHRT_MAX) ? 
USHRT_MAX : num_points; @@ -164,11 +164,11 @@ static inline void volk_32fc_index_min_16u_a_avx2_variant_1(uint16_t* target, __m256i min_indices = _mm256_setzero_si256(); for (unsigned i = 0; i < num_points / 8u; ++i) { - __m256 in0 = _mm256_load_ps((float*)src0); - __m256 in1 = _mm256_load_ps((float*)(src0 + 4)); + __m256 in0 = _mm256_load_ps((float*)source); + __m256 in1 = _mm256_load_ps((float*)(source + 4)); vector_32fc_index_min_variant1( in0, in1, &min_values, &min_indices, ¤t_indices, indices_increment); - src0 += 8; + source += 8; } // determine minimum value and index in the result of the vectorized loop @@ -189,12 +189,12 @@ static inline void volk_32fc_index_min_16u_a_avx2_variant_1(uint16_t* target, // handle tail not processed by the vectorized loop for (unsigned i = num_points & (~7u); i < num_points; ++i) { const float abs_squared = - lv_creal(*src0) * lv_creal(*src0) + lv_cimag(*src0) * lv_cimag(*src0); + lv_creal(*source) * lv_creal(*source) + lv_cimag(*source) * lv_cimag(*source); if (abs_squared < min) { min = abs_squared; index = i; } - ++src0; + ++source; } *target = index; @@ -207,7 +207,7 @@ static inline void volk_32fc_index_min_16u_a_avx2_variant_1(uint16_t* target, #include static inline void -volk_32fc_index_min_16u_a_sse3(uint16_t* target, lv_32fc_t* src0, uint32_t num_points) +volk_32fc_index_min_16u_a_sse3(uint16_t* target, lv_32fc_t* source, uint32_t num_points) { num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points; const uint32_t num_bytes = num_points * 8; @@ -225,19 +225,18 @@ volk_32fc_index_min_16u_a_sse3(uint16_t* target, lv_32fc_t* src0, uint32_t num_p holderf.int_vec = _mm_setzero_si128(); holderi.int_vec = _mm_setzero_si128(); - int bound = num_bytes >> 5; - int i = 0; - xmm8 = _mm_setr_epi32(0, 1, 2, 3); xmm9 = _mm_setzero_si128(); xmm10 = _mm_setr_epi32(4, 4, 4, 4); xmm3 = _mm_set_ps1(FLT_MAX); - for (; i < bound; ++i) { - xmm1 = _mm_load_ps((float*)src0); - xmm2 = _mm_load_ps((float*)&src0[2]); + int bound = num_bytes >> 5; + + for (int i = 0; i < bound; ++i) { + xmm1 = _mm_load_ps((float*)source); + xmm2 = _mm_load_ps((float*)&source[2]); - src0 += 4; + source += 4; xmm1 = _mm_mul_ps(xmm1, xmm1); xmm2 = _mm_mul_ps(xmm2, xmm2); @@ -258,14 +257,14 @@ volk_32fc_index_min_16u_a_sse3(uint16_t* target, lv_32fc_t* src0, uint32_t num_p } if (num_bytes >> 4 & 1) { - xmm2 = _mm_load_ps((float*)src0); + xmm2 = _mm_load_ps((float*)source); xmm1 = _mm_movelh_ps(bit128_p(&xmm8)->float_vec, bit128_p(&xmm8)->float_vec); xmm8 = bit128_p(&xmm1)->int_vec; xmm2 = _mm_mul_ps(xmm2, xmm2); - src0 += 2; + source += 2; xmm1 = _mm_hadd_ps(xmm2, xmm2); @@ -286,7 +285,7 @@ volk_32fc_index_min_16u_a_sse3(uint16_t* target, lv_32fc_t* src0, uint32_t num_p if (num_bytes >> 3 & 1) { sq_dist = - lv_creal(src0[0]) * lv_creal(src0[0]) + lv_cimag(src0[0]) * lv_cimag(src0[0]); + lv_creal(source[0]) * lv_creal(source[0]) + lv_cimag(source[0]) * lv_cimag(source[0]); xmm2 = _mm_load1_ps(&sq_dist); @@ -322,21 +321,18 @@ volk_32fc_index_min_16u_a_sse3(uint16_t* target, lv_32fc_t* src0, uint32_t num_p #ifdef LV_HAVE_GENERIC static inline void -volk_32fc_index_min_16u_generic(uint16_t* target, lv_32fc_t* src0, uint32_t num_points) +volk_32fc_index_min_16u_generic(uint16_t* target, lv_32fc_t* source, uint32_t num_points) { num_points = (num_points > USHRT_MAX) ? 
USHRT_MAX : num_points; - const uint32_t num_bytes = num_points * 8; float sq_dist = 0.0; float min = FLT_MAX; uint16_t index = 0; - uint32_t i = 0; - - for (; i> 3; ++i) { + for (uint32_t i = 0; i> 3; ++i) { sq_dist = - lv_creal(src0[i]) * lv_creal(src0[i]) + lv_cimag(src0[i]) * lv_cimag(src0[i]); + lv_creal(source[i]) * lv_creal(source[i]) + lv_cimag(source[i]) * lv_cimag(source[i]); if (sq_dist < min) { index = i; @@ -364,7 +360,7 @@ volk_32fc_index_min_16u_generic(uint16_t* target, lv_32fc_t* src0, uint32_t num_ #include static inline void volk_32fc_index_min_16u_u_avx2_variant_0(uint16_t* target, - lv_32fc_t* src0, + lv_32fc_t* source, uint32_t num_points) { num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points; @@ -381,11 +377,11 @@ static inline void volk_32fc_index_min_16u_u_avx2_variant_0(uint16_t* target, __m256i min_indices = _mm256_setzero_si256(); for (unsigned i = 0; i < num_points / 8u; ++i) { - __m256 in0 = _mm256_loadu_ps((float*)src0); - __m256 in1 = _mm256_loadu_ps((float*)(src0 + 4)); + __m256 in0 = _mm256_loadu_ps((float*)source); + __m256 in1 = _mm256_loadu_ps((float*)(source + 4)); vector_32fc_index_min_variant0( in0, in1, &min_values, &min_indices, ¤t_indices, indices_increment); - src0 += 8; + source += 8; } // determine minimum value and index in the result of the vectorized loop @@ -406,12 +402,12 @@ static inline void volk_32fc_index_min_16u_u_avx2_variant_0(uint16_t* target, // handle tail not processed by the vectorized loop for (unsigned i = num_points & (~7u); i < num_points; ++i) { const float abs_squared = - lv_creal(*src0) * lv_creal(*src0) + lv_cimag(*src0) * lv_cimag(*src0); + lv_creal(*source) * lv_creal(*source) + lv_cimag(*source) * lv_cimag(*source); if (abs_squared < min) { min = abs_squared; index = i; } - ++src0; + ++source; } *target = index; @@ -424,7 +420,7 @@ static inline void volk_32fc_index_min_16u_u_avx2_variant_0(uint16_t* target, #include static inline void volk_32fc_index_min_16u_u_avx2_variant_1(uint16_t* target, - lv_32fc_t* src0, + lv_32fc_t* source, uint32_t num_points) { num_points = (num_points > USHRT_MAX) ? 
USHRT_MAX : num_points; @@ -441,11 +437,11 @@ static inline void volk_32fc_index_min_16u_u_avx2_variant_1(uint16_t* target, __m256i min_indices = _mm256_setzero_si256(); for (unsigned i = 0; i < num_points / 8u; ++i) { - __m256 in0 = _mm256_loadu_ps((float*)src0); - __m256 in1 = _mm256_loadu_ps((float*)(src0 + 4)); + __m256 in0 = _mm256_loadu_ps((float*)source); + __m256 in1 = _mm256_loadu_ps((float*)(source + 4)); vector_32fc_index_min_variant1( in0, in1, &min_values, &min_indices, ¤t_indices, indices_increment); - src0 += 8; + source += 8; } // determine minimum value and index in the result of the vectorized loop @@ -466,12 +462,12 @@ static inline void volk_32fc_index_min_16u_u_avx2_variant_1(uint16_t* target, // handle tail not processed by the vectorized loop for (unsigned i = num_points & (~7u); i < num_points; ++i) { const float abs_squared = - lv_creal(*src0) * lv_creal(*src0) + lv_cimag(*src0) * lv_cimag(*src0); + lv_creal(*source) * lv_creal(*source) + lv_cimag(*source) * lv_cimag(*source); if (abs_squared < min) { min = abs_squared; index = i; } - ++src0; + ++source; } *target = index; diff --git a/kernels/volk/volk_32fc_index_min_32u.h b/kernels/volk/volk_32fc_index_min_32u.h index 545f9bf..0539dd5 100644 --- a/kernels/volk/volk_32fc_index_min_32u.h +++ b/kernels/volk/volk_32fc_index_min_32u.h @@ -30,11 +30,11 @@ * * Dispatcher Prototype * \code - * void volk_32fc_index_min_32u(uint32_t* target, lv_32fc_t* src0, uint32_t + * void volk_32fc_index_min_32u(uint32_t* target, lv_32fc_t* source, uint32_t * num_points) \endcode * * \b Inputs - * \li src0: The complex input vector. + * \li source: The complex input vector. * \li num_points: The number of samples. * * \b Outputs @@ -80,7 +80,7 @@ #include static inline void volk_32fc_index_min_32u_a_avx2_variant_0(uint32_t* target, - lv_32fc_t* src0, + lv_32fc_t* source, uint32_t num_points) { const __m256i indices_increment = _mm256_set1_epi32(8); @@ -95,11 +95,11 @@ static inline void volk_32fc_index_min_32u_a_avx2_variant_0(uint32_t* target, __m256i min_indices = _mm256_setzero_si256(); for (unsigned i = 0; i < num_points / 8u; ++i) { - __m256 in0 = _mm256_load_ps((float*)src0); - __m256 in1 = _mm256_load_ps((float*)(src0 + 4)); + __m256 in0 = _mm256_load_ps((float*)source); + __m256 in1 = _mm256_load_ps((float*)(source + 4)); vector_32fc_index_min_variant0( in0, in1, &min_values, &min_indices, ¤t_indices, indices_increment); - src0 += 8; + source += 8; } // determine minimum value and index in the result of the vectorized loop @@ -120,12 +120,12 @@ static inline void volk_32fc_index_min_32u_a_avx2_variant_0(uint32_t* target, // handle tail not processed by the vectorized loop for (unsigned i = num_points & (~7u); i < num_points; ++i) { const float abs_squared = - lv_creal(*src0) * lv_creal(*src0) + lv_cimag(*src0) * lv_cimag(*src0); + lv_creal(*source) * lv_creal(*source) + lv_cimag(*source) * lv_cimag(*source); if (abs_squared < min) { min = abs_squared; index = i; } - ++src0; + ++source; } *target = index; @@ -138,7 +138,7 @@ static inline void volk_32fc_index_min_32u_a_avx2_variant_0(uint32_t* target, #include static inline void volk_32fc_index_min_32u_a_avx2_variant_1(uint32_t* target, - lv_32fc_t* src0, + lv_32fc_t* source, uint32_t num_points) { const __m256i indices_increment = _mm256_set1_epi32(8); @@ -153,11 +153,11 @@ static inline void volk_32fc_index_min_32u_a_avx2_variant_1(uint32_t* target, __m256i min_indices = _mm256_setzero_si256(); for (unsigned i = 0; i < num_points / 8u; ++i) { - __m256 in0 = 
_mm256_load_ps((float*)src0); - __m256 in1 = _mm256_load_ps((float*)(src0 + 4)); + __m256 in0 = _mm256_load_ps((float*)source); + __m256 in1 = _mm256_load_ps((float*)(source + 4)); vector_32fc_index_min_variant1( in0, in1, &min_values, &min_indices, ¤t_indices, indices_increment); - src0 += 8; + source += 8; } // determine minimum value and index in the result of the vectorized loop @@ -178,12 +178,12 @@ static inline void volk_32fc_index_min_32u_a_avx2_variant_1(uint32_t* target, // handle tail not processed by the vectorized loop for (unsigned i = num_points & (~7u); i < num_points; ++i) { const float abs_squared = - lv_creal(*src0) * lv_creal(*src0) + lv_cimag(*src0) * lv_cimag(*src0); + lv_creal(*source) * lv_creal(*source) + lv_cimag(*source) * lv_cimag(*source); if (abs_squared < min) { min = abs_squared; index = i; } - ++src0; + ++source; } *target = index; @@ -196,7 +196,7 @@ static inline void volk_32fc_index_min_32u_a_avx2_variant_1(uint32_t* target, #include static inline void -volk_32fc_index_min_32u_a_sse3(uint32_t* target, lv_32fc_t* src0, uint32_t num_points) +volk_32fc_index_min_32u_a_sse3(uint32_t* target, lv_32fc_t* source, uint32_t num_points) { const uint32_t num_bytes = num_points * 8; @@ -213,19 +213,18 @@ volk_32fc_index_min_32u_a_sse3(uint32_t* target, lv_32fc_t* src0, uint32_t num_p holderf.int_vec = _mm_setzero_si128(); holderi.int_vec = _mm_setzero_si128(); - int bound = num_bytes >> 5; - int i = 0; - xmm8 = _mm_setr_epi32(0, 1, 2, 3); xmm9 = _mm_setzero_si128(); xmm10 = _mm_setr_epi32(4, 4, 4, 4); xmm3 = _mm_set_ps1(FLT_MAX); - for (; i < bound; ++i) { - xmm1 = _mm_load_ps((float*)src0); - xmm2 = _mm_load_ps((float*)&src0[2]); + int bound = num_bytes >> 5; - src0 += 4; + for (int i = 0; i < bound; ++i) { + xmm1 = _mm_load_ps((float*)source); + xmm2 = _mm_load_ps((float*)&source[2]); + + source += 4; xmm1 = _mm_mul_ps(xmm1, xmm1); xmm2 = _mm_mul_ps(xmm2, xmm2); @@ -246,14 +245,14 @@ volk_32fc_index_min_32u_a_sse3(uint32_t* target, lv_32fc_t* src0, uint32_t num_p } if (num_bytes >> 4 & 1) { - xmm2 = _mm_load_ps((float*)src0); + xmm2 = _mm_load_ps((float*)source); xmm1 = _mm_movelh_ps(bit128_p(&xmm8)->float_vec, bit128_p(&xmm8)->float_vec); xmm8 = bit128_p(&xmm1)->int_vec; xmm2 = _mm_mul_ps(xmm2, xmm2); - src0 += 2; + source += 2; xmm1 = _mm_hadd_ps(xmm2, xmm2); @@ -274,7 +273,7 @@ volk_32fc_index_min_32u_a_sse3(uint32_t* target, lv_32fc_t* src0, uint32_t num_p if (num_bytes >> 3 & 1) { sq_dist = - lv_creal(src0[0]) * lv_creal(src0[0]) + lv_cimag(src0[0]) * lv_cimag(src0[0]); + lv_creal(source[0]) * lv_creal(source[0]) + lv_cimag(source[0]) * lv_cimag(source[0]); xmm2 = _mm_load1_ps(&sq_dist); @@ -310,7 +309,7 @@ volk_32fc_index_min_32u_a_sse3(uint32_t* target, lv_32fc_t* src0, uint32_t num_p #ifdef LV_HAVE_GENERIC static inline void -volk_32fc_index_min_32u_generic(uint32_t* target, lv_32fc_t* src0, uint32_t num_points) +volk_32fc_index_min_32u_generic(uint32_t* target, lv_32fc_t* source, uint32_t num_points) { const uint32_t num_bytes = num_points * 8; @@ -318,11 +317,9 @@ volk_32fc_index_min_32u_generic(uint32_t* target, lv_32fc_t* src0, uint32_t num_ float min = FLT_MAX; uint32_t index = 0; - uint32_t i = 0; - - for (; i> 3; ++i) { + for (uint32_t i = 0; i> 3; ++i) { sq_dist = - lv_creal(src0[i]) * lv_creal(src0[i]) + lv_cimag(src0[i]) * lv_cimag(src0[i]); + lv_creal(source[i]) * lv_creal(source[i]) + lv_cimag(source[i]) * lv_cimag(source[i]); if (sq_dist < min) { index = i; @@ -349,7 +346,7 @@ volk_32fc_index_min_32u_generic(uint32_t* target, lv_32fc_t* src0, 
uint32_t num_ #include static inline void volk_32fc_index_min_32u_u_avx2_variant_0(uint32_t* target, - lv_32fc_t* src0, + lv_32fc_t* source, uint32_t num_points) { const __m256i indices_increment = _mm256_set1_epi32(8); @@ -364,11 +361,11 @@ static inline void volk_32fc_index_min_32u_u_avx2_variant_0(uint32_t* target, __m256i min_indices = _mm256_setzero_si256(); for (unsigned i = 0; i < num_points / 8u; ++i) { - __m256 in0 = _mm256_loadu_ps((float*)src0); - __m256 in1 = _mm256_loadu_ps((float*)(src0 + 4)); + __m256 in0 = _mm256_loadu_ps((float*)source); + __m256 in1 = _mm256_loadu_ps((float*)(source + 4)); vector_32fc_index_min_variant0( in0, in1, &min_values, &min_indices, ¤t_indices, indices_increment); - src0 += 8; + source += 8; } // determine minimum value and index in the result of the vectorized loop @@ -389,12 +386,12 @@ static inline void volk_32fc_index_min_32u_u_avx2_variant_0(uint32_t* target, // handle tail not processed by the vectorized loop for (unsigned i = num_points & (~7u); i < num_points; ++i) { const float abs_squared = - lv_creal(*src0) * lv_creal(*src0) + lv_cimag(*src0) * lv_cimag(*src0); + lv_creal(*source) * lv_creal(*source) + lv_cimag(*source) * lv_cimag(*source); if (abs_squared < min) { min = abs_squared; index = i; } - ++src0; + ++source; } *target = index; @@ -407,7 +404,7 @@ static inline void volk_32fc_index_min_32u_u_avx2_variant_0(uint32_t* target, #include static inline void volk_32fc_index_min_32u_u_avx2_variant_1(uint32_t* target, - lv_32fc_t* src0, + lv_32fc_t* source, uint32_t num_points) { const __m256i indices_increment = _mm256_set1_epi32(8); @@ -422,11 +419,11 @@ static inline void volk_32fc_index_min_32u_u_avx2_variant_1(uint32_t* target, __m256i min_indices = _mm256_setzero_si256(); for (unsigned i = 0; i < num_points / 8u; ++i) { - __m256 in0 = _mm256_loadu_ps((float*)src0); - __m256 in1 = _mm256_loadu_ps((float*)(src0 + 4)); + __m256 in0 = _mm256_loadu_ps((float*)source); + __m256 in1 = _mm256_loadu_ps((float*)(source + 4)); vector_32fc_index_min_variant1( in0, in1, &min_values, &min_indices, ¤t_indices, indices_increment); - src0 += 8; + source += 8; } // determine minimum value and index in the result of the vectorized loop @@ -447,12 +444,12 @@ static inline void volk_32fc_index_min_32u_u_avx2_variant_1(uint32_t* target, // handle tail not processed by the vectorized loop for (unsigned i = num_points & (~7u); i < num_points; ++i) { const float abs_squared = - lv_creal(*src0) * lv_creal(*src0) + lv_cimag(*src0) * lv_cimag(*src0); + lv_creal(*source) * lv_creal(*source) + lv_cimag(*source) * lv_cimag(*source); if (abs_squared < min) { min = abs_squared; index = i; } - ++src0; + ++source; } *target = index; @@ -465,11 +462,10 @@ static inline void volk_32fc_index_min_32u_u_avx2_variant_1(uint32_t* target, #include static inline void -volk_32fc_index_min_32u_neon(uint32_t* target, lv_32fc_t* src0, uint32_t num_points) +volk_32fc_index_min_32u_neon(uint32_t* target, lv_32fc_t* source, uint32_t num_points) { - unsigned int number = 0; const uint32_t quarter_points = num_points / 4; - const lv_32fc_t* src0Ptr = src0; + const lv_32fc_t* sourcePtr = source; uint32_t indices[4] = { 0, 1, 2, 3 }; const uint32x4_t vec_indices_incr = vdupq_n_u32(4); @@ -482,11 +478,11 @@ volk_32fc_index_min_32u_neon(uint32_t* target, lv_32fc_t* src0, uint32_t num_poi float32x4_t vec_min = vdupq_n_f32(FLT_MAX); - for (; number < quarter_points; number++) { + for (uint32_t number = 0; number < quarter_points; number++) { // Load complex and compute magnitude squared 
         const float32x4_t vec_mag2 =
-            _vmagnitudesquaredq_f32(vld2q_f32((float*)src0Ptr));
-        __VOLK_PREFETCH(src0Ptr += 4);
+            _vmagnitudesquaredq_f32(vld2q_f32((float*)sourcePtr));
+        __VOLK_PREFETCH(sourcePtr += 4);
         // a < b?
         const uint32x4_t lt_mask = vcltq_f32(vec_mag2, vec_min);
         vec_min = vbslq_f32(lt_mask, vec_mag2, vec_min);
@@ -506,14 +502,14 @@ volk_32fc_index_min_32u_neon(uint32_t* target, lv_32fc_t* src0, uint32_t num_poi
     }

     // Deal with the rest
-    for (number = quarter_points * 4; number < num_points; number++) {
-        const float re = lv_creal(*src0Ptr);
-        const float im = lv_cimag(*src0Ptr);
+    for (uint32_t number = quarter_points * 4; number < num_points; number++) {
+        const float re = lv_creal(*sourcePtr);
+        const float im = lv_cimag(*sourcePtr);
         if ((re * re + im * im) < min) {
-            min = *src0Ptr;
+            min = re * re + im * im; // store the squared magnitude, not the complex sample
             index = number;
         }
-        src0Ptr++;
+        sourcePtr++;
     }
     *target = index;
 }
-- 
2.30.2
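
For context, a minimal caller sketch for one of the kernels this patch renames. `volk.h`, `volk_malloc`, `volk_get_alignment`, and `volk_free` are the public VOLK API, and `volk_32f_index_min_32u` matches the dispatcher prototype documented in the header above; the test data and the planted minimum are illustrative only, not part of the patch.

```c
#include <inttypes.h>
#include <stdio.h>
#include <volk/volk.h>

int main(void)
{
    const uint32_t num_points = 1024;
    // volk_malloc returns memory aligned for the _a_ (aligned) kernel variants.
    float* source = (float*)volk_malloc(num_points * sizeof(float), volk_get_alignment());
    uint32_t index = 0;

    for (uint32_t i = 0; i < num_points; i++) {
        source[i] = (float)(i % 251); // arbitrary non-negative test data
    }
    source[417] = -1.0f; // plant a known minimum

    // The dispatcher selects the best implementation available at runtime
    // (AVX, SSE4.1, SSE, NEON, or the generic fallback touched by this patch).
    volk_32f_index_min_32u(&index, source, num_points);
    printf("minimum at index %" PRIu32 ": %f\n", index, source[index]);

    volk_free(source);
    return 0;
}
```

Because `volk_malloc` honors `volk_get_alignment()`, both the aligned (`_a_`) and unaligned (`_u_`) code paths shown in the patch are safe for this buffer.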