From d5a0c2ce7b42e1bb22b7f380b4af24a7e5bd62e0 Mon Sep 17 00:00:00 2001
From: Zlika
Date: Wed, 9 Jun 2021 22:47:04 +0200
Subject: [PATCH] [PATCH 01/73] Add volk_32f(c)_index_min_16/32u

Signed-off-by: Zlika

Gbp-Pq: Name 0001-Add-volk_32f-c-_index_min_16-32u.patch
---
 docs/kernels.dox                       |   4 +
 include/volk/volk_avx2_intrinsics.h    | 114 ++++-
 kernels/volk/volk_32f_index_min_16u.h  | 375 +++++++++++++++++
 kernels/volk/volk_32f_index_min_32u.h  | 558 +++++++++++++++++++++++++
 kernels/volk/volk_32fc_index_min_16u.h | 482 +++++++++++++++++++++
 kernels/volk/volk_32fc_index_min_32u.h | 524 +++++++++++++++++++++++
 lib/kernel_tests.h                     |   4 +
 7 files changed, 2060 insertions(+), 1 deletion(-)
 create mode 100644 kernels/volk/volk_32f_index_min_16u.h
 create mode 100644 kernels/volk/volk_32f_index_min_32u.h
 create mode 100644 kernels/volk/volk_32fc_index_min_16u.h
 create mode 100644 kernels/volk/volk_32fc_index_min_32u.h

diff --git a/docs/kernels.dox b/docs/kernels.dox
index e9898f1..55e567b 100644
--- a/docs/kernels.dox
+++ b/docs/kernels.dox
@@ -48,6 +48,8 @@
 \li \subpage volk_32fc_deinterleave_real_64f
 \li \subpage volk_32fc_index_max_16u
 \li \subpage volk_32fc_index_max_32u
+\li \subpage volk_32fc_index_min_16u
+\li \subpage volk_32fc_index_min_32u
 \li \subpage volk_32fc_magnitude_32f
 \li \subpage volk_32fc_magnitude_squared_32f
 \li \subpage volk_32f_cos_32f
@@ -63,6 +65,8 @@
 \li \subpage volk_32f_expfast_32f
 \li \subpage volk_32f_index_max_16u
 \li \subpage volk_32f_index_max_32u
+\li \subpage volk_32f_index_min_16u
+\li \subpage volk_32f_index_min_32u
 \li \subpage volk_32f_invsqrt_32f
 \li \subpage volk_32f_log2_32f
 \li \subpage volk_32f_s32f_calc_spectral_noise_floor_32f
diff --git a/include/volk/volk_avx2_intrinsics.h b/include/volk/volk_avx2_intrinsics.h
index 2c397d9..21060d6 100644
--- a/include/volk/volk_avx2_intrinsics.h
+++ b/include/volk/volk_avx2_intrinsics.h
@@ -130,7 +130,7 @@ static inline __m256 _mm256_scaled_norm_dist_ps_avx2(const __m256 symbols0,
  * float abs_squared = real(src0) * real(src0) + imag(src0) * imag(src1)
  * bool compare = abs_squared > max_values[j];
  * max_values[j] = compare ? abs_squared : max_values[j];
- * max_indices[j] = compare ? current_indices[j] > max_indices[j]
+ * max_indices[j] = compare ? current_indices[j] : max_indices[j]
  * current_indices[j] += 8; // update for next outer loop iteration
  * ++src0;
  * }
@@ -231,4 +231,116 @@ static inline void vector_32fc_index_max_variant1(__m256 in0,
     *current_indices = _mm256_add_epi32(*current_indices, indices_increment);
 }
 
+/*
+ * The function below vectorizes the inner loop of the following code:
+ *
+ * float min_values[8] = {FLT_MAX};
+ * unsigned min_indices[8] = {0};
+ * unsigned current_indices[8] = {0, 1, 2, 3, 4, 5, 6, 7};
+ * for (unsigned i = 0; i < num_points / 8; ++i) {
+ *     for (unsigned j = 0; j < 8; ++j) {
+ *         float abs_squared = real(src0) * real(src0) + imag(src0) * imag(src0);
+ *         bool compare = abs_squared < min_values[j];
+ *         min_values[j] = compare ? abs_squared : min_values[j];
+ *         min_indices[j] = compare ? current_indices[j] : min_indices[j];
+ *         current_indices[j] += 8; // update for next outer loop iteration
+ *         ++src0;
+ *     }
+ * }
+ */
+static inline void vector_32fc_index_min_variant0(__m256 in0,
+                                                  __m256 in1,
+                                                  __m256* min_values,
+                                                  __m256i* min_indices,
+                                                  __m256i* current_indices,
+                                                  __m256i indices_increment)
+{
+    in0 = _mm256_mul_ps(in0, in0);
+    in1 = _mm256_mul_ps(in1, in1);
+
+    /*
+     * Given the vectors a = (a_7, a_6, …, a_1, a_0) and b = (b_7, b_6, …, b_1, b_0)
+     * hadd_ps(a, b) computes
+     * (b_7 + b_6,
+     *  b_5 + b_4,
+     *  ---------
+     *  a_7 + a_6,
+     *  a_5 + a_4,
+     *  ---------
+     *  b_3 + b_2,
+     *  b_1 + b_0,
+     *  ---------
+     *  a_3 + a_2,
+     *  a_1 + a_0).
+     * The result is the squared absolute value of the complex numbers at index
+     * offsets (7, 6, 3, 2, 5, 4, 1, 0). This must be the initial value of
+     * current_indices!
+     */
+    __m256 abs_squared = _mm256_hadd_ps(in0, in1);
+
+    /*
+     * Compare the newly computed squared absolute values with the
+     * previously determined minimum values. cmp_ps(a, b) computes
+     * a < b ? 0xFFFFFFFF : 0 for each element in the vectors =>
+     * compare_mask = abs_squared < min_values ? 0xFFFFFFFF : 0
+     *
+     * If either operand is NaN, 0 is returned as an “ordered” comparison is
+     * used => the blend operation will select the value from *min_values.
+     */
+    __m256 compare_mask = _mm256_cmp_ps(abs_squared, *min_values, _CMP_LT_OS);
+
+    /* Select the minimum by blending. This is the only line which differs from variant1 */
+    *min_values = _mm256_blendv_ps(*min_values, abs_squared, compare_mask);
+
+    /*
+     * Updates indices: blendv_ps(a, b, mask) determines mask ? b : a for
+     * each element in the vectors =>
+     * min_indices = compare_mask ? current_indices : min_indices
+     *
+     * Note: The casting of data types is required to make the compiler happy
+     * and does not change values.
+     */
+    *min_indices =
+        _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(*min_indices),
+                                             _mm256_castsi256_ps(*current_indices),
+                                             compare_mask));
+
+    /* compute indices of complex numbers which will be loaded in the next iteration */
+    *current_indices = _mm256_add_epi32(*current_indices, indices_increment);
+}
+
+/* See _variant0 for details */
+static inline void vector_32fc_index_min_variant1(__m256 in0,
+                                                  __m256 in1,
+                                                  __m256* min_values,
+                                                  __m256i* min_indices,
+                                                  __m256i* current_indices,
+                                                  __m256i indices_increment)
+{
+    in0 = _mm256_mul_ps(in0, in0);
+    in1 = _mm256_mul_ps(in1, in1);
+
+    __m256 abs_squared = _mm256_hadd_ps(in0, in1);
+    __m256 compare_mask = _mm256_cmp_ps(abs_squared, *min_values, _CMP_LT_OS);
+
+    /*
+     * This is the only line which differs from variant0. Using minps instead of
+     * blendvps is faster on Intel CPUs (on the ones tested).
+     *
+     * Note: The order of arguments matters if a NaN is encountered, in which
+     * case the value of the second argument is selected. This is consistent
+     * with the “ordered” comparison and the blend operation: the comparison
+     * returns false if a NaN is encountered and the blend operation
+     * consequently selects the value from min_indices.
+     */
+    *min_values = _mm256_min_ps(abs_squared, *min_values);
+
+    *min_indices =
+        _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(*min_indices),
+                                             _mm256_castsi256_ps(*current_indices),
+                                             compare_mask));
+
+    *current_indices = _mm256_add_epi32(*current_indices, indices_increment);
+}
+
 #endif /* INCLUDE_VOLK_VOLK_AVX2_INTRINSICS_H_ */
diff --git a/kernels/volk/volk_32f_index_min_16u.h b/kernels/volk/volk_32f_index_min_16u.h
new file mode 100644
index 0000000..848b75c
--- /dev/null
+++ b/kernels/volk/volk_32f_index_min_16u.h
@@ -0,0 +1,375 @@
+/* -*- c++ -*- */
+/*
+ * Copyright 2021 Free Software Foundation, Inc.
+ *
+ * This file is part of GNU Radio
+ *
+ * GNU Radio is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ *
+ * GNU Radio is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU Radio; see the file COPYING. If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street,
+ * Boston, MA 02110-1301, USA.
+ */
+
+/*!
+ * \page volk_32f_index_min_16u
+ *
+ * \b Overview
+ *
+ * Returns Argmin_i x[i]. Finds and returns the index which contains
+ * the first minimum value in the given vector.
+ *
+ * Note that num_points is a uint32_t, but the return value is
+ * uint16_t. Providing a vector larger than the max of a uint16_t
+ * (65535) would miss anything outside of this boundary. The kernel
+ * will check the length of num_points and cap it to this max value
+ * anyway.
+ *
+ * Dispatcher Prototype
+ * \code
+ * void volk_32f_index_min_16u(uint16_t* target, const float* src0, uint32_t num_points)
+ * \endcode
+ *
+ * \b Inputs
+ * \li src0: The input vector of floats.
+ * \li num_points: The number of data points.
+ *
+ * \b Outputs
+ * \li target: The index of the first minimum value in the input buffer.
+ *
+ * \b Example
+ * \code
+ *   int N = 10;
+ *   uint32_t alignment = volk_get_alignment();
+ *   float* in = (float*)volk_malloc(sizeof(float)*N, alignment);
+ *   uint16_t* out = (uint16_t*)volk_malloc(sizeof(uint16_t), alignment);
+ *
+ *   for(uint32_t ii = 0; ii < N; ++ii){
+ *       float x = (float)ii;
+ *       // a parabola with a minimum at x=4
+ *       in[ii] = (x-4) * (x-4) - 5;
+ *   }
+ *
+ *   volk_32f_index_min_16u(out, in, N);
+ *
+ *   printf("minimum is %1.2f at index %u\n", in[*out], *out);
+ *
+ *   volk_free(in);
+ *   volk_free(out);
+ * \endcode
+ */
+
+#ifndef INCLUDED_volk_32f_index_min_16u_a_H
+#define INCLUDED_volk_32f_index_min_16u_a_H
+
+#include <inttypes.h>
+#include <limits.h>
+#include <stdio.h>
+#include <volk/volk_common.h>
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+
+static inline void
+volk_32f_index_min_16u_a_avx(uint16_t* target, const float* src0, uint32_t num_points)
+{
+    num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
+
+    uint32_t number = 0;
+    const uint32_t eighthPoints = num_points / 8;
+
+    float* inputPtr = (float*)src0;
+
+    __m256 indexIncrementValues = _mm256_set1_ps(8);
+    __m256 currentIndexes = _mm256_set_ps(-1, -2, -3, -4, -5, -6, -7, -8);
+
+    float min = src0[0];
+    float index = 0;
+    __m256 minValues = _mm256_set1_ps(min);
+    __m256 minValuesIndex = _mm256_setzero_ps();
+    __m256 compareResults;
+    __m256 currentValues;
+
+    __VOLK_ATTR_ALIGNED(32) float minValuesBuffer[8];
+    __VOLK_ATTR_ALIGNED(32) float minIndexesBuffer[8];
+
+    for (; number < eighthPoints; number++) {
+
+        currentValues = _mm256_load_ps(inputPtr);
+        inputPtr += 8;
+        currentIndexes = _mm256_add_ps(currentIndexes, indexIncrementValues);
+
+        compareResults = _mm256_cmp_ps(currentValues, minValues, _CMP_LT_OS);
+
+        minValuesIndex = _mm256_blendv_ps(minValuesIndex, currentIndexes, compareResults);
+        minValues = _mm256_blendv_ps(minValues, currentValues, compareResults);
+    }
+
+    // Calculate the smallest value from the remaining 8 points
+    _mm256_store_ps(minValuesBuffer, minValues);
+    _mm256_store_ps(minIndexesBuffer, minValuesIndex);
+
+    for (number = 0; number < 8; number++) {
+        if (minValuesBuffer[number] < min) {
+            index = minIndexesBuffer[number];
+            min = minValuesBuffer[number];
+        } else if (minValuesBuffer[number] == min) {
+            if (index > minIndexesBuffer[number])
+                index = minIndexesBuffer[number];
+        }
+    }
+
+    number = eighthPoints * 8;
+    for (; number < num_points; number++) {
+        if (src0[number] < min) {
+            index = number;
+            min = src0[number];
+        }
+    }
+    target[0] = (uint16_t)index;
+}
+
+#endif /*LV_HAVE_AVX*/
+
+#ifdef LV_HAVE_SSE4_1
+#include <smmintrin.h>
+
+static inline void
+volk_32f_index_min_16u_a_sse4_1(uint16_t* target, const float* src0, uint32_t num_points)
+{
+    num_points = (num_points > USHRT_MAX) ?
USHRT_MAX : num_points; + + uint32_t number = 0; + const uint32_t quarterPoints = num_points / 4; + + float* inputPtr = (float*)src0; + + __m128 indexIncrementValues = _mm_set1_ps(4); + __m128 currentIndexes = _mm_set_ps(-1, -2, -3, -4); + + float min = src0[0]; + float index = 0; + __m128 minValues = _mm_set1_ps(min); + __m128 minValuesIndex = _mm_setzero_ps(); + __m128 compareResults; + __m128 currentValues; + + __VOLK_ATTR_ALIGNED(16) float minValuesBuffer[4]; + __VOLK_ATTR_ALIGNED(16) float minIndexesBuffer[4]; + + for (; number < quarterPoints; number++) { + + currentValues = _mm_load_ps(inputPtr); + inputPtr += 4; + currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues); + + compareResults = _mm_cmplt_ps(currentValues, minValues); + + minValuesIndex = _mm_blendv_ps(minValuesIndex, currentIndexes, compareResults); + minValues = _mm_blendv_ps(minValues, currentValues, compareResults); + } + + // Calculate the smallest value from the remaining 4 points + _mm_store_ps(minValuesBuffer, minValues); + _mm_store_ps(minIndexesBuffer, minValuesIndex); + + for (number = 0; number < 4; number++) { + if (minValuesBuffer[number] < min) { + index = minIndexesBuffer[number]; + min = minValuesBuffer[number]; + } else if (minValuesBuffer[number] == min) { + if (index > minIndexesBuffer[number]) + index = minIndexesBuffer[number]; + } + } + + number = quarterPoints * 4; + for (; number < num_points; number++) { + if (src0[number] < min) { + index = number; + min = src0[number]; + } + } + target[0] = (uint16_t)index; +} + +#endif /*LV_HAVE_SSE4_1*/ + + +#ifdef LV_HAVE_SSE + +#include + +static inline void +volk_32f_index_min_16u_a_sse(uint16_t* target, const float* src0, uint32_t num_points) +{ + num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points; + + uint32_t number = 0; + const uint32_t quarterPoints = num_points / 4; + + float* inputPtr = (float*)src0; + + __m128 indexIncrementValues = _mm_set1_ps(4); + __m128 currentIndexes = _mm_set_ps(-1, -2, -3, -4); + + float min = src0[0]; + float index = 0; + __m128 minValues = _mm_set1_ps(min); + __m128 minValuesIndex = _mm_setzero_ps(); + __m128 compareResults; + __m128 currentValues; + + __VOLK_ATTR_ALIGNED(16) float minValuesBuffer[4]; + __VOLK_ATTR_ALIGNED(16) float minIndexesBuffer[4]; + + for (; number < quarterPoints; number++) { + + currentValues = _mm_load_ps(inputPtr); + inputPtr += 4; + currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues); + + compareResults = _mm_cmplt_ps(currentValues, minValues); + + minValuesIndex = _mm_or_ps(_mm_and_ps(compareResults, currentIndexes), + _mm_andnot_ps(compareResults, minValuesIndex)); + minValues = _mm_or_ps(_mm_and_ps(compareResults, currentValues), + _mm_andnot_ps(compareResults, minValues)); + } + + // Calculate the smallest value from the remaining 4 points + _mm_store_ps(minValuesBuffer, minValues); + _mm_store_ps(minIndexesBuffer, minValuesIndex); + + for (number = 0; number < 4; number++) { + if (minValuesBuffer[number] < min) { + index = minIndexesBuffer[number]; + min = minValuesBuffer[number]; + } else if (minValuesBuffer[number] == min) { + if (index > minIndexesBuffer[number]) + index = minIndexesBuffer[number]; + } + } + + number = quarterPoints * 4; + for (; number < num_points; number++) { + if (src0[number] < min) { + index = number; + min = src0[number]; + } + } + target[0] = (uint16_t)index; +} + +#endif /*LV_HAVE_SSE*/ + + +#ifdef LV_HAVE_GENERIC + +static inline void +volk_32f_index_min_16u_generic(uint16_t* target, const float* src0, uint32_t num_points) 
+{
+    num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
+
+    float min = src0[0];
+    uint16_t index = 0;
+
+    uint32_t i = 1;
+
+    for (; i < num_points; ++i) {
+        if (src0[i] < min) {
+            index = i;
+            min = src0[i];
+        }
+    }
+    target[0] = index;
+}
+
+#endif /*LV_HAVE_GENERIC*/
+
+
+#endif /*INCLUDED_volk_32f_index_min_16u_a_H*/
+
+
+#ifndef INCLUDED_volk_32f_index_min_16u_u_H
+#define INCLUDED_volk_32f_index_min_16u_u_H
+
+#include <inttypes.h>
+#include <limits.h>
+#include <stdio.h>
+#include <volk/volk_common.h>
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+
+static inline void
+volk_32f_index_min_16u_u_avx(uint16_t* target, const float* src0, uint32_t num_points)
+{
+    num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
+
+    uint32_t number = 0;
+    const uint32_t eighthPoints = num_points / 8;
+
+    float* inputPtr = (float*)src0;
+
+    __m256 indexIncrementValues = _mm256_set1_ps(8);
+    __m256 currentIndexes = _mm256_set_ps(-1, -2, -3, -4, -5, -6, -7, -8);
+
+    float min = src0[0];
+    float index = 0;
+    __m256 minValues = _mm256_set1_ps(min);
+    __m256 minValuesIndex = _mm256_setzero_ps();
+    __m256 compareResults;
+    __m256 currentValues;
+
+    __VOLK_ATTR_ALIGNED(32) float minValuesBuffer[8];
+    __VOLK_ATTR_ALIGNED(32) float minIndexesBuffer[8];
+
+    for (; number < eighthPoints; number++) {
+
+        currentValues = _mm256_loadu_ps(inputPtr);
+        inputPtr += 8;
+        currentIndexes = _mm256_add_ps(currentIndexes, indexIncrementValues);
+
+        compareResults = _mm256_cmp_ps(currentValues, minValues, _CMP_LT_OS);
+
+        minValuesIndex = _mm256_blendv_ps(minValuesIndex, currentIndexes, compareResults);
+        minValues = _mm256_blendv_ps(minValues, currentValues, compareResults);
+    }
+
+    // Calculate the smallest value from the remaining 8 points
+    _mm256_storeu_ps(minValuesBuffer, minValues);
+    _mm256_storeu_ps(minIndexesBuffer, minValuesIndex);
+
+    for (number = 0; number < 8; number++) {
+        if (minValuesBuffer[number] < min) {
+            index = minIndexesBuffer[number];
+            min = minValuesBuffer[number];
+        } else if (minValuesBuffer[number] == min) {
+            if (index > minIndexesBuffer[number])
+                index = minIndexesBuffer[number];
+        }
+    }
+
+    number = eighthPoints * 8;
+    for (; number < num_points; number++) {
+        if (src0[number] < min) {
+            index = number;
+            min = src0[number];
+        }
+    }
+    target[0] = (uint16_t)index;
+}
+
+#endif /*LV_HAVE_AVX*/
+
+#endif /*INCLUDED_volk_32f_index_min_16u_u_H*/
diff --git a/kernels/volk/volk_32f_index_min_32u.h b/kernels/volk/volk_32f_index_min_32u.h
new file mode 100644
index 0000000..67ee426
--- /dev/null
+++ b/kernels/volk/volk_32f_index_min_32u.h
@@ -0,0 +1,558 @@
+/* -*- c++ -*- */
+/*
+ * Copyright 2021 Free Software Foundation, Inc.
+ *
+ * This file is part of GNU Radio
+ *
+ * GNU Radio is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ *
+ * GNU Radio is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU Radio; see the file COPYING. If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street,
+ * Boston, MA 02110-1301, USA.
+ */
+
+/*!
+ * \page volk_32f_index_min_32u
+ *
+ * \b Overview
+ *
+ * Returns Argmin_i x[i].
Finds and returns the index which contains the first minimum + * value in the given vector. + * + * Dispatcher Prototype + * \code + * void volk_32f_index_min_32u(uint32_t* target, const float* src0, uint32_t num_points) + * \endcode + * + * \b Inputs + * \li src0: The input vector of floats. + * \li num_points: The number of data points. + * + * \b Outputs + * \li target: The index of the first minimum value in the input buffer. + * + * \b Example + * \code + * int N = 10; + * uint32_t alignment = volk_get_alignment(); + * float* in = (float*)volk_malloc(sizeof(float)*N, alignment); + * uint32_t* out = (uint32_t*)volk_malloc(sizeof(uint32_t), alignment); + * + * for(uint32_t ii = 0; ii < N; ++ii){ + * float x = (float)ii; + * // a parabola with a minimum at x=4 + * in[ii] = (x-4) * (x-4) - 5; + * } + * + * volk_32f_index_min_32u(out, in, N); + * + * printf("minimum is %1.2f at index %u\n", in[*out], *out); + * + * volk_free(in); + * volk_free(out); + * \endcode + */ + +#ifndef INCLUDED_volk_32f_index_min_32u_a_H +#define INCLUDED_volk_32f_index_min_32u_a_H + +#include +#include +#include + +#ifdef LV_HAVE_SSE4_1 +#include + +static inline void +volk_32f_index_min_32u_a_sse4_1(uint32_t* target, const float* src0, uint32_t num_points) +{ + if (num_points > 0) { + uint32_t number = 0; + const uint32_t quarterPoints = num_points / 4; + + float* inputPtr = (float*)src0; + + __m128 indexIncrementValues = _mm_set1_ps(4); + __m128 currentIndexes = _mm_set_ps(-1, -2, -3, -4); + + float min = src0[0]; + float index = 0; + __m128 minValues = _mm_set1_ps(min); + __m128 minValuesIndex = _mm_setzero_ps(); + __m128 compareResults; + __m128 currentValues; + + __VOLK_ATTR_ALIGNED(16) float minValuesBuffer[4]; + __VOLK_ATTR_ALIGNED(16) float minIndexesBuffer[4]; + + for (; number < quarterPoints; number++) { + + currentValues = _mm_load_ps(inputPtr); + inputPtr += 4; + currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues); + + compareResults = _mm_cmplt_ps(currentValues, minValues); + + minValuesIndex = + _mm_blendv_ps(minValuesIndex, currentIndexes, compareResults); + minValues = _mm_blendv_ps(minValues, currentValues, compareResults); + } + + // Calculate the smallest value from the remaining 4 points + _mm_store_ps(minValuesBuffer, minValues); + _mm_store_ps(minIndexesBuffer, minValuesIndex); + + for (number = 0; number < 4; number++) { + if (minValuesBuffer[number] < min) { + index = minIndexesBuffer[number]; + min = minValuesBuffer[number]; + } else if (minValuesBuffer[number] == min) { + if (index > minIndexesBuffer[number]) + index = minIndexesBuffer[number]; + } + } + + number = quarterPoints * 4; + for (; number < num_points; number++) { + if (src0[number] < min) { + index = number; + min = src0[number]; + } + } + target[0] = (uint32_t)index; + } +} + +#endif /*LV_HAVE_SSE4_1*/ + + +#ifdef LV_HAVE_SSE + +#include + +static inline void +volk_32f_index_min_32u_a_sse(uint32_t* target, const float* src0, uint32_t num_points) +{ + if (num_points > 0) { + uint32_t number = 0; + const uint32_t quarterPoints = num_points / 4; + + float* inputPtr = (float*)src0; + + __m128 indexIncrementValues = _mm_set1_ps(4); + __m128 currentIndexes = _mm_set_ps(-1, -2, -3, -4); + + float min = src0[0]; + float index = 0; + __m128 minValues = _mm_set1_ps(min); + __m128 minValuesIndex = _mm_setzero_ps(); + __m128 compareResults; + __m128 currentValues; + + __VOLK_ATTR_ALIGNED(16) float minValuesBuffer[4]; + __VOLK_ATTR_ALIGNED(16) float minIndexesBuffer[4]; + + for (; number < quarterPoints; number++) { + + 
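+            // Plain SSE has no blend instruction, so the new per-lane minima and
+            // their indices are selected below with an and/andnot/or bit-mask idiom.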
currentValues = _mm_load_ps(inputPtr); + inputPtr += 4; + currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues); + + compareResults = _mm_cmplt_ps(currentValues, minValues); + + minValuesIndex = _mm_or_ps(_mm_and_ps(compareResults, currentIndexes), + _mm_andnot_ps(compareResults, minValuesIndex)); + + minValues = _mm_or_ps(_mm_and_ps(compareResults, currentValues), + _mm_andnot_ps(compareResults, minValues)); + } + + // Calculate the smallest value from the remaining 4 points + _mm_store_ps(minValuesBuffer, minValues); + _mm_store_ps(minIndexesBuffer, minValuesIndex); + + for (number = 0; number < 4; number++) { + if (minValuesBuffer[number] < min) { + index = minIndexesBuffer[number]; + min = minValuesBuffer[number]; + } else if (minValuesBuffer[number] == min) { + if (index > minIndexesBuffer[number]) + index = minIndexesBuffer[number]; + } + } + + number = quarterPoints * 4; + for (; number < num_points; number++) { + if (src0[number] < min) { + index = number; + min = src0[number]; + } + } + target[0] = (uint32_t)index; + } +} + +#endif /*LV_HAVE_SSE*/ + + +#ifdef LV_HAVE_AVX +#include + +static inline void +volk_32f_index_min_32u_a_avx(uint32_t* target, const float* src0, uint32_t num_points) +{ + if (num_points > 0) { + uint32_t number = 0; + const uint32_t quarterPoints = num_points / 8; + + float* inputPtr = (float*)src0; + + __m256 indexIncrementValues = _mm256_set1_ps(8); + __m256 currentIndexes = _mm256_set_ps(-1, -2, -3, -4, -5, -6, -7, -8); + + float min = src0[0]; + float index = 0; + __m256 minValues = _mm256_set1_ps(min); + __m256 minValuesIndex = _mm256_setzero_ps(); + __m256 compareResults; + __m256 currentValues; + + __VOLK_ATTR_ALIGNED(32) float minValuesBuffer[8]; + __VOLK_ATTR_ALIGNED(32) float minIndexesBuffer[8]; + + for (; number < quarterPoints; number++) { + currentValues = _mm256_load_ps(inputPtr); + inputPtr += 8; + currentIndexes = _mm256_add_ps(currentIndexes, indexIncrementValues); + compareResults = _mm256_cmp_ps(currentValues, minValues, _CMP_LT_OS); + minValuesIndex = + _mm256_blendv_ps(minValuesIndex, currentIndexes, compareResults); + minValues = _mm256_blendv_ps(minValues, currentValues, compareResults); + } + + // Calculate the smallest value from the remaining 8 points + _mm256_store_ps(minValuesBuffer, minValues); + _mm256_store_ps(minIndexesBuffer, minValuesIndex); + + for (number = 0; number < 8; number++) { + if (minValuesBuffer[number] < min) { + index = minIndexesBuffer[number]; + min = minValuesBuffer[number]; + } else if (minValuesBuffer[number] == min) { + if (index > minIndexesBuffer[number]) + index = minIndexesBuffer[number]; + } + } + + number = quarterPoints * 8; + for (; number < num_points; number++) { + if (src0[number] < min) { + index = number; + min = src0[number]; + } + } + target[0] = (uint32_t)index; + } +} + +#endif /*LV_HAVE_AVX*/ + + +#ifdef LV_HAVE_NEON +#include + +static inline void +volk_32f_index_min_32u_neon(uint32_t* target, const float* src0, uint32_t num_points) +{ + if (num_points > 0) { + uint32_t number = 0; + const uint32_t quarterPoints = num_points / 4; + + float* inputPtr = (float*)src0; + float32x4_t indexIncrementValues = vdupq_n_f32(4); + __VOLK_ATTR_ALIGNED(16) + float currentIndexes_float[4] = { -4.0f, -3.0f, -2.0f, -1.0f }; + float32x4_t currentIndexes = vld1q_f32(currentIndexes_float); + + float min = src0[0]; + float index = 0; + float32x4_t minValues = vdupq_n_f32(min); + uint32x4_t minValuesIndex = vmovq_n_u32(0); + uint32x4_t compareResults; + uint32x4_t currentIndexes_u; + float32x4_t 
currentValues;
+
+        __VOLK_ATTR_ALIGNED(16) float minValuesBuffer[4];
+        __VOLK_ATTR_ALIGNED(16) float minIndexesBuffer[4];
+
+        for (; number < quarterPoints; number++) {
+            currentValues = vld1q_f32(inputPtr);
+            inputPtr += 4;
+            currentIndexes = vaddq_f32(currentIndexes, indexIncrementValues);
+            currentIndexes_u = vcvtq_u32_f32(currentIndexes);
+            compareResults = vcgeq_f32(currentValues, minValues);
+            minValuesIndex = vorrq_u32(vandq_u32(compareResults, minValuesIndex),
+                                       vbicq_u32(currentIndexes_u, compareResults));
+            minValues = vminq_f32(currentValues, minValues);
+        }
+
+        // Calculate the smallest value from the remaining 4 points
+        vst1q_f32(minValuesBuffer, minValues);
+        vst1q_f32(minIndexesBuffer, vcvtq_f32_u32(minValuesIndex));
+        for (number = 0; number < 4; number++) {
+            if (minValuesBuffer[number] < min) {
+                index = minIndexesBuffer[number];
+                min = minValuesBuffer[number];
+            } else if (minValuesBuffer[number] == min) {
+                if (index > minIndexesBuffer[number])
+                    index = minIndexesBuffer[number];
+            }
+        }
+
+        number = quarterPoints * 4;
+        for (; number < num_points; number++) {
+            if (src0[number] < min) {
+                index = number;
+                min = src0[number];
+            }
+        }
+        target[0] = (uint32_t)index;
+    }
+}
+
+#endif /*LV_HAVE_NEON*/
+
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void
+volk_32f_index_min_32u_generic(uint32_t* target, const float* src0, uint32_t num_points)
+{
+    if (num_points > 0) {
+        float min = src0[0];
+        uint32_t index = 0;
+
+        uint32_t i = 1;
+
+        for (; i < num_points; ++i) {
+            if (src0[i] < min) {
+                index = i;
+                min = src0[i];
+            }
+        }
+        target[0] = index;
+    }
+}
+
+#endif /*LV_HAVE_GENERIC*/
+
+
+#endif /*INCLUDED_volk_32f_index_min_32u_a_H*/
+
+
+#ifndef INCLUDED_volk_32f_index_min_32u_u_H
+#define INCLUDED_volk_32f_index_min_32u_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <volk/volk_common.h>
+
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+
+static inline void
+volk_32f_index_min_32u_u_avx(uint32_t* target, const float* src0, uint32_t num_points)
+{
+    if (num_points > 0) {
+        uint32_t number = 0;
+        const uint32_t quarterPoints = num_points / 8;
+
+        float* inputPtr = (float*)src0;
+
+        __m256 indexIncrementValues = _mm256_set1_ps(8);
+        __m256 currentIndexes = _mm256_set_ps(-1, -2, -3, -4, -5, -6, -7, -8);
+
+        float min = src0[0];
+        float index = 0;
+        __m256 minValues = _mm256_set1_ps(min);
+        __m256 minValuesIndex = _mm256_setzero_ps();
+        __m256 compareResults;
+        __m256 currentValues;
+
+        __VOLK_ATTR_ALIGNED(32) float minValuesBuffer[8];
+        __VOLK_ATTR_ALIGNED(32) float minIndexesBuffer[8];
+
+        for (; number < quarterPoints; number++) {
+            currentValues = _mm256_loadu_ps(inputPtr);
+            inputPtr += 8;
+            currentIndexes = _mm256_add_ps(currentIndexes, indexIncrementValues);
+            compareResults = _mm256_cmp_ps(currentValues, minValues, _CMP_LT_OS);
+            minValuesIndex =
+                _mm256_blendv_ps(minValuesIndex, currentIndexes, compareResults);
+            minValues = _mm256_blendv_ps(minValues, currentValues, compareResults);
+        }
+
+        // Calculate the smallest value from the remaining 8 points
+        _mm256_store_ps(minValuesBuffer, minValues);
+        _mm256_store_ps(minIndexesBuffer, minValuesIndex);
+
+        for (number = 0; number < 8; number++) {
+            if (minValuesBuffer[number] < min) {
+                index = minIndexesBuffer[number];
+                min = minValuesBuffer[number];
+            } else if (minValuesBuffer[number] == min) {
+                if (index > minIndexesBuffer[number])
+                    index = minIndexesBuffer[number];
+            }
+        }
+
+        number = quarterPoints * 8;
+        for (; number < num_points; number++) {
+            if (src0[number] < min) {
+                index = number;
+                min = src0[number];
+            }
+        }
+        target[0] = (uint32_t)index;
+    }
+}
+
+#endif
/*LV_HAVE_AVX*/ + + +#ifdef LV_HAVE_SSE4_1 +#include + +static inline void +volk_32f_index_min_32u_u_sse4_1(uint32_t* target, const float* src0, uint32_t num_points) +{ + if (num_points > 0) { + uint32_t number = 0; + const uint32_t quarterPoints = num_points / 4; + + float* inputPtr = (float*)src0; + + __m128 indexIncrementValues = _mm_set1_ps(4); + __m128 currentIndexes = _mm_set_ps(-1, -2, -3, -4); + + float min = src0[0]; + float index = 0; + __m128 minValues = _mm_set1_ps(min); + __m128 minValuesIndex = _mm_setzero_ps(); + __m128 compareResults; + __m128 currentValues; + + __VOLK_ATTR_ALIGNED(16) float minValuesBuffer[4]; + __VOLK_ATTR_ALIGNED(16) float minIndexesBuffer[4]; + + for (; number < quarterPoints; number++) { + currentValues = _mm_loadu_ps(inputPtr); + inputPtr += 4; + currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues); + compareResults = _mm_cmplt_ps(currentValues, minValues); + minValuesIndex = + _mm_blendv_ps(minValuesIndex, currentIndexes, compareResults); + minValues = _mm_blendv_ps(minValues, currentValues, compareResults); + } + + // Calculate the smallest value from the remaining 4 points + _mm_store_ps(minValuesBuffer, minValues); + _mm_store_ps(minIndexesBuffer, minValuesIndex); + + for (number = 0; number < 4; number++) { + if (minValuesBuffer[number] < min) { + index = minIndexesBuffer[number]; + min = minValuesBuffer[number]; + } else if (minValuesBuffer[number] == min) { + if (index > minIndexesBuffer[number]) + index = minIndexesBuffer[number]; + } + } + + number = quarterPoints * 4; + for (; number < num_points; number++) { + if (src0[number] < min) { + index = number; + min = src0[number]; + } + } + target[0] = (uint32_t)index; + } +} + +#endif /*LV_HAVE_SSE4_1*/ + +#ifdef LV_HAVE_SSE +#include + +static inline void +volk_32f_index_min_32u_u_sse(uint32_t* target, const float* src0, uint32_t num_points) +{ + if (num_points > 0) { + uint32_t number = 0; + const uint32_t quarterPoints = num_points / 4; + + float* inputPtr = (float*)src0; + + __m128 indexIncrementValues = _mm_set1_ps(4); + __m128 currentIndexes = _mm_set_ps(-1, -2, -3, -4); + + float min = src0[0]; + float index = 0; + __m128 minValues = _mm_set1_ps(min); + __m128 minValuesIndex = _mm_setzero_ps(); + __m128 compareResults; + __m128 currentValues; + + __VOLK_ATTR_ALIGNED(16) float minValuesBuffer[4]; + __VOLK_ATTR_ALIGNED(16) float minIndexesBuffer[4]; + + for (; number < quarterPoints; number++) { + currentValues = _mm_loadu_ps(inputPtr); + inputPtr += 4; + currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues); + compareResults = _mm_cmplt_ps(currentValues, minValues); + minValuesIndex = _mm_or_ps(_mm_and_ps(compareResults, currentIndexes), + _mm_andnot_ps(compareResults, minValuesIndex)); + minValues = _mm_or_ps(_mm_and_ps(compareResults, currentValues), + _mm_andnot_ps(compareResults, minValues)); + } + + // Calculate the smallest value from the remaining 4 points + _mm_store_ps(minValuesBuffer, minValues); + _mm_store_ps(minIndexesBuffer, minValuesIndex); + + for (number = 0; number < 4; number++) { + if (minValuesBuffer[number] < min) { + index = minIndexesBuffer[number]; + min = minValuesBuffer[number]; + } else if (minValuesBuffer[number] == min) { + if (index > minIndexesBuffer[number]) + index = minIndexesBuffer[number]; + } + } + + number = quarterPoints * 4; + for (; number < num_points; number++) { + if (src0[number] < min) { + index = number; + min = src0[number]; + } + } + target[0] = (uint32_t)index; + } +} + +#endif /*LV_HAVE_SSE*/ + +#endif 
/*INCLUDED_volk_32f_index_min_32u_u_H*/
diff --git a/kernels/volk/volk_32fc_index_min_16u.h b/kernels/volk/volk_32fc_index_min_16u.h
new file mode 100644
index 0000000..5539ebf
--- /dev/null
+++ b/kernels/volk/volk_32fc_index_min_16u.h
@@ -0,0 +1,482 @@
+/* -*- c++ -*- */
+/*
+ * Copyright 2021 Free Software Foundation, Inc.
+ *
+ * This file is part of GNU Radio
+ *
+ * GNU Radio is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ *
+ * GNU Radio is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU Radio; see the file COPYING. If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street,
+ * Boston, MA 02110-1301, USA.
+ */
+
+/*!
+ * \page volk_32fc_index_min_16u
+ *
+ * \b Overview
+ *
+ * Returns Argmin_i mag(x[i]). Finds and returns the index which contains the
+ * minimum magnitude for complex points in the given vector.
+ *
+ * Note that num_points is a uint32_t, but the return value is
+ * uint16_t. Providing a vector larger than the max of a uint16_t
+ * (65535) would miss anything outside of this boundary. The kernel
+ * will check the length of num_points and cap it to this max value
+ * anyway.
+ *
+ * Dispatcher Prototype
+ * \code
+ * void volk_32fc_index_min_16u(uint16_t* target, lv_32fc_t* src0, uint32_t
+ * num_points) \endcode
+ *
+ * \b Inputs
+ * \li src0: The complex input vector.
+ * \li num_points: The number of samples.
+ *
+ * \b Outputs
+ * \li target: The index of the point with minimum magnitude.
+ *
+ * \b Example
+ * Calculate the index of the minimum value of \f$x^2 + x\f$ for points around
+ * the unit circle.
+ * \code
+ *   int N = 10;
+ *   uint32_t alignment = volk_get_alignment();
+ *   lv_32fc_t* in = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t)*N, alignment);
+ *   uint16_t* min = (uint16_t*)volk_malloc(sizeof(uint16_t), alignment);
+ *
+ *   for(uint32_t ii = 0; ii < N/2; ++ii){
+ *       float real = 2.f * ((float)ii / (float)N) - 1.f;
+ *       float imag = std::sqrt(1.f - real * real);
+ *       in[ii] = lv_cmake(real, imag);
+ *       in[ii] = in[ii] * in[ii] + in[ii];
+ *       in[N-ii-1] = lv_cmake(real, imag);
+ *       in[N-ii-1] = in[N-ii-1] * in[N-ii-1] + in[N-ii-1];
+ *   }
+ *
+ *   volk_32fc_index_min_16u(min, in, N);
+ *
+ *   printf("index of min value = %u\n", *min);
+ *
+ *   volk_free(in);
+ *   volk_free(min);
+ * \endcode
+ */
+
+#ifndef INCLUDED_volk_32fc_index_min_16u_a_H
+#define INCLUDED_volk_32fc_index_min_16u_a_H
+
+#include <float.h>
+#include <inttypes.h>
+#include <limits.h>
+#include <stdio.h>
+#include <volk/volk_common.h>
+
+#ifdef LV_HAVE_AVX2
+#include <immintrin.h>
+#include <volk/volk_avx2_intrinsics.h>
+
+static inline void volk_32fc_index_min_16u_a_avx2_variant_0(uint16_t* target,
+                                                            lv_32fc_t* src0,
+                                                            uint32_t num_points)
+{
+    num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
+
+    const __m256i indices_increment = _mm256_set1_epi32(8);
+    /*
+     * At the start of each loop iteration current_indices holds the indices of
+     * the complex numbers loaded from memory. Explanation for odd order is given
+     * in implementation of vector_32fc_index_min_variant0().
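+     * In short: _mm256_hadd_ps() interleaves the pairwise sums of its two
+     * input registers, so the lanes accumulated here correspond to the complex
+     * numbers 0, 1, 4, 5, 2, 3, 6, 7 of each block of eight.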
+ */ + __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0); + + __m256 min_values = _mm256_set1_ps(FLT_MAX); + __m256i min_indices = _mm256_setzero_si256(); + + for (unsigned i = 0; i < num_points / 8u; ++i) { + __m256 in0 = _mm256_load_ps((float*)src0); + __m256 in1 = _mm256_load_ps((float*)(src0 + 4)); + vector_32fc_index_min_variant0( + in0, in1, &min_values, &min_indices, ¤t_indices, indices_increment); + src0 += 8; + } + + // determine minimum value and index in the result of the vectorized loop + __VOLK_ATTR_ALIGNED(32) float min_values_buffer[8]; + __VOLK_ATTR_ALIGNED(32) uint32_t min_indices_buffer[8]; + _mm256_store_ps(min_values_buffer, min_values); + _mm256_store_si256((__m256i*)min_indices_buffer, min_indices); + + float min = FLT_MAX; + uint32_t index = 0; + for (unsigned i = 0; i < 8; i++) { + if (min_values_buffer[i] < min) { + min = min_values_buffer[i]; + index = min_indices_buffer[i]; + } + } + + // handle tail not processed by the vectorized loop + for (unsigned i = num_points & (~7u); i < num_points; ++i) { + const float abs_squared = + lv_creal(*src0) * lv_creal(*src0) + lv_cimag(*src0) * lv_cimag(*src0); + if (abs_squared < min) { + min = abs_squared; + index = i; + } + ++src0; + } + + *target = index; +} + +#endif /*LV_HAVE_AVX2*/ + +#ifdef LV_HAVE_AVX2 +#include +#include + +static inline void volk_32fc_index_min_16u_a_avx2_variant_1(uint16_t* target, + lv_32fc_t* src0, + uint32_t num_points) +{ + num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points; + + const __m256i indices_increment = _mm256_set1_epi32(8); + /* + * At the start of each loop iteration current_indices holds the indices of + * the complex numbers loaded from memory. Explanation for odd order is given + * in implementation of vector_32fc_index_min_variant0(). + */ + __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0); + + __m256 min_values = _mm256_set1_ps(FLT_MAX); + __m256i min_indices = _mm256_setzero_si256(); + + for (unsigned i = 0; i < num_points / 8u; ++i) { + __m256 in0 = _mm256_load_ps((float*)src0); + __m256 in1 = _mm256_load_ps((float*)(src0 + 4)); + vector_32fc_index_min_variant1( + in0, in1, &min_values, &min_indices, ¤t_indices, indices_increment); + src0 += 8; + } + + // determine minimum value and index in the result of the vectorized loop + __VOLK_ATTR_ALIGNED(32) float min_values_buffer[8]; + __VOLK_ATTR_ALIGNED(32) uint32_t min_indices_buffer[8]; + _mm256_store_ps(min_values_buffer, min_values); + _mm256_store_si256((__m256i*)min_indices_buffer, min_indices); + + float min = FLT_MAX; + uint32_t index = 0; + for (unsigned i = 0; i < 8; i++) { + if (min_values_buffer[i] < min) { + min = min_values_buffer[i]; + index = min_indices_buffer[i]; + } + } + + // handle tail not processed by the vectorized loop + for (unsigned i = num_points & (~7u); i < num_points; ++i) { + const float abs_squared = + lv_creal(*src0) * lv_creal(*src0) + lv_cimag(*src0) * lv_cimag(*src0); + if (abs_squared < min) { + min = abs_squared; + index = i; + } + ++src0; + } + + *target = index; +} + +#endif /*LV_HAVE_AVX2*/ + +#ifdef LV_HAVE_SSE3 +#include +#include + +static inline void +volk_32fc_index_min_16u_a_sse3(uint16_t* target, lv_32fc_t* src0, uint32_t num_points) +{ + num_points = (num_points > USHRT_MAX) ? 
USHRT_MAX : num_points; + const uint32_t num_bytes = num_points * 8; + + union bit128 holderf; + union bit128 holderi; + float sq_dist = 0.0; + + union bit128 xmm5, xmm4; + __m128 xmm1, xmm2, xmm3; + __m128i xmm8, xmm11, xmm12, xmm9, xmm10; + + xmm5.int_vec = _mm_setzero_si128(); + xmm4.int_vec = _mm_setzero_si128(); + holderf.int_vec = _mm_setzero_si128(); + holderi.int_vec = _mm_setzero_si128(); + + int bound = num_bytes >> 5; + int i = 0; + + xmm8 = _mm_setr_epi32(0, 1, 2, 3); + xmm9 = _mm_setzero_si128(); + xmm10 = _mm_setr_epi32(4, 4, 4, 4); + xmm3 = _mm_set_ps1(FLT_MAX); + + for (; i < bound; ++i) { + xmm1 = _mm_load_ps((float*)src0); + xmm2 = _mm_load_ps((float*)&src0[2]); + + src0 += 4; + + xmm1 = _mm_mul_ps(xmm1, xmm1); + xmm2 = _mm_mul_ps(xmm2, xmm2); + + xmm1 = _mm_hadd_ps(xmm1, xmm2); + + xmm3 = _mm_min_ps(xmm1, xmm3); + + xmm4.float_vec = _mm_cmpgt_ps(xmm1, xmm3); + xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3); + + xmm11 = _mm_and_si128(xmm8, xmm5.int_vec); + xmm12 = _mm_and_si128(xmm9, xmm4.int_vec); + + xmm9 = _mm_add_epi32(xmm11, xmm12); + + xmm8 = _mm_add_epi32(xmm8, xmm10); + } + + if (num_bytes >> 4 & 1) { + xmm2 = _mm_load_ps((float*)src0); + + xmm1 = _mm_movelh_ps(bit128_p(&xmm8)->float_vec, bit128_p(&xmm8)->float_vec); + xmm8 = bit128_p(&xmm1)->int_vec; + + xmm2 = _mm_mul_ps(xmm2, xmm2); + + src0 += 2; + + xmm1 = _mm_hadd_ps(xmm2, xmm2); + + xmm3 = _mm_min_ps(xmm1, xmm3); + + xmm10 = _mm_setr_epi32(2, 2, 2, 2); + + xmm4.float_vec = _mm_cmpgt_ps(xmm1, xmm3); + xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3); + + xmm11 = _mm_and_si128(xmm8, xmm5.int_vec); + xmm12 = _mm_and_si128(xmm9, xmm4.int_vec); + + xmm9 = _mm_add_epi32(xmm11, xmm12); + + xmm8 = _mm_add_epi32(xmm8, xmm10); + } + + if (num_bytes >> 3 & 1) { + sq_dist = + lv_creal(src0[0]) * lv_creal(src0[0]) + lv_cimag(src0[0]) * lv_cimag(src0[0]); + + xmm2 = _mm_load1_ps(&sq_dist); + + xmm1 = xmm3; + + xmm3 = _mm_min_ss(xmm3, xmm2); + + xmm4.float_vec = _mm_cmpgt_ps(xmm1, xmm3); + xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3); + + xmm8 = _mm_shuffle_epi32(xmm8, 0x00); + + xmm11 = _mm_and_si128(xmm8, xmm4.int_vec); + xmm12 = _mm_and_si128(xmm9, xmm5.int_vec); + + xmm9 = _mm_add_epi32(xmm11, xmm12); + } + + _mm_store_ps((float*)&(holderf.f), xmm3); + _mm_store_si128(&(holderi.int_vec), xmm9); + + target[0] = holderi.i[0]; + sq_dist = holderf.f[0]; + target[0] = (holderf.f[1] < sq_dist) ? holderi.i[1] : target[0]; + sq_dist = (holderf.f[1] < sq_dist) ? holderf.f[1] : sq_dist; + target[0] = (holderf.f[2] < sq_dist) ? holderi.i[2] : target[0]; + sq_dist = (holderf.f[2] < sq_dist) ? holderf.f[2] : sq_dist; + target[0] = (holderf.f[3] < sq_dist) ? holderi.i[3] : target[0]; + sq_dist = (holderf.f[3] < sq_dist) ? holderf.f[3] : sq_dist; +} + +#endif /*LV_HAVE_SSE3*/ + +#ifdef LV_HAVE_GENERIC +static inline void +volk_32fc_index_min_16u_generic(uint16_t* target, lv_32fc_t* src0, uint32_t num_points) +{ + num_points = (num_points > USHRT_MAX) ? 
USHRT_MAX : num_points;
+
+    const uint32_t num_bytes = num_points * 8;
+
+    float sq_dist = 0.0;
+    float min = FLT_MAX;
+    uint16_t index = 0;
+
+    uint32_t i = 0;
+
+    for (; i < num_bytes >> 3; ++i) {
+        sq_dist =
+            lv_creal(src0[i]) * lv_creal(src0[i]) + lv_cimag(src0[i]) * lv_cimag(src0[i]);
+
+        if (sq_dist < min) {
+            index = i;
+            min = sq_dist;
+        }
+    }
+    target[0] = index;
+}
+
+#endif /*LV_HAVE_GENERIC*/
+
+#endif /*INCLUDED_volk_32fc_index_min_16u_a_H*/
+
+#ifndef INCLUDED_volk_32fc_index_min_16u_u_H
+#define INCLUDED_volk_32fc_index_min_16u_u_H
+
+#include <float.h>
+#include <inttypes.h>
+#include <limits.h>
+#include <stdio.h>
+#include <volk/volk_common.h>
+
+#ifdef LV_HAVE_AVX2
+#include <immintrin.h>
+#include <volk/volk_avx2_intrinsics.h>
+
+static inline void volk_32fc_index_min_16u_u_avx2_variant_0(uint16_t* target,
+                                                            lv_32fc_t* src0,
+                                                            uint32_t num_points)
+{
+    num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
+
+    const __m256i indices_increment = _mm256_set1_epi32(8);
+    /*
+     * At the start of each loop iteration current_indices holds the indices of
+     * the complex numbers loaded from memory. Explanation for odd order is given
+     * in implementation of vector_32fc_index_min_variant0().
+     */
+    __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
+
+    __m256 min_values = _mm256_set1_ps(FLT_MAX);
+    __m256i min_indices = _mm256_setzero_si256();
+
+    for (unsigned i = 0; i < num_points / 8u; ++i) {
+        __m256 in0 = _mm256_loadu_ps((float*)src0);
+        __m256 in1 = _mm256_loadu_ps((float*)(src0 + 4));
+        vector_32fc_index_min_variant0(
+            in0, in1, &min_values, &min_indices, &current_indices, indices_increment);
+        src0 += 8;
+    }
+
+    // determine minimum value and index in the result of the vectorized loop
+    __VOLK_ATTR_ALIGNED(32) float min_values_buffer[8];
+    __VOLK_ATTR_ALIGNED(32) uint32_t min_indices_buffer[8];
+    _mm256_store_ps(min_values_buffer, min_values);
+    _mm256_store_si256((__m256i*)min_indices_buffer, min_indices);
+
+    float min = FLT_MAX;
+    uint32_t index = 0;
+    for (unsigned i = 0; i < 8; i++) {
+        if (min_values_buffer[i] < min) {
+            min = min_values_buffer[i];
+            index = min_indices_buffer[i];
+        }
+    }
+
+    // handle tail not processed by the vectorized loop
+    for (unsigned i = num_points & (~7u); i < num_points; ++i) {
+        const float abs_squared =
+            lv_creal(*src0) * lv_creal(*src0) + lv_cimag(*src0) * lv_cimag(*src0);
+        if (abs_squared < min) {
+            min = abs_squared;
+            index = i;
+        }
+        ++src0;
+    }
+
+    *target = index;
+}
+
+#endif /*LV_HAVE_AVX2*/
+
+#ifdef LV_HAVE_AVX2
+#include <immintrin.h>
+#include <volk/volk_avx2_intrinsics.h>
+
+static inline void volk_32fc_index_min_16u_u_avx2_variant_1(uint16_t* target,
+                                                            lv_32fc_t* src0,
+                                                            uint32_t num_points)
+{
+    num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
+
+    const __m256i indices_increment = _mm256_set1_epi32(8);
+    /*
+     * At the start of each loop iteration current_indices holds the indices of
+     * the complex numbers loaded from memory. Explanation for odd order is given
+     * in implementation of vector_32fc_index_min_variant0().
+ */ + __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0); + + __m256 min_values = _mm256_set1_ps(FLT_MAX); + __m256i min_indices = _mm256_setzero_si256(); + + for (unsigned i = 0; i < num_points / 8u; ++i) { + __m256 in0 = _mm256_loadu_ps((float*)src0); + __m256 in1 = _mm256_loadu_ps((float*)(src0 + 4)); + vector_32fc_index_min_variant1( + in0, in1, &min_values, &min_indices, ¤t_indices, indices_increment); + src0 += 8; + } + + // determine minimum value and index in the result of the vectorized loop + __VOLK_ATTR_ALIGNED(32) float min_values_buffer[8]; + __VOLK_ATTR_ALIGNED(32) uint32_t min_indices_buffer[8]; + _mm256_store_ps(min_values_buffer, min_values); + _mm256_store_si256((__m256i*)min_indices_buffer, min_indices); + + float min = FLT_MAX; + uint32_t index = 0; + for (unsigned i = 0; i < 8; i++) { + if (min_values_buffer[i] < min) { + min = min_values_buffer[i]; + index = min_indices_buffer[i]; + } + } + + // handle tail not processed by the vectorized loop + for (unsigned i = num_points & (~7u); i < num_points; ++i) { + const float abs_squared = + lv_creal(*src0) * lv_creal(*src0) + lv_cimag(*src0) * lv_cimag(*src0); + if (abs_squared < min) { + min = abs_squared; + index = i; + } + ++src0; + } + + *target = index; +} + +#endif /*LV_HAVE_AVX2*/ + +#endif /*INCLUDED_volk_32fc_index_min_16u_u_H*/ diff --git a/kernels/volk/volk_32fc_index_min_32u.h b/kernels/volk/volk_32fc_index_min_32u.h new file mode 100644 index 0000000..290b754 --- /dev/null +++ b/kernels/volk/volk_32fc_index_min_32u.h @@ -0,0 +1,524 @@ +/* -*- c++ -*- */ +/* + * Copyright 2021 Free Software Foundation, Inc. + * + * This file is part of GNU Radio + * + * GNU Radio is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 3, or (at your option) + * any later version. + * + * GNU Radio is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNU Radio; see the file COPYING. If not, write to + * the Free Software Foundation, Inc., 51 Franklin Street, + * Boston, MA 02110-1301, USA. + */ + +/*! + * \page volk_32fc_index_min_32u + * + * \b Overview + * + * Returns Argmin_i mag(x[i]). Finds and returns the index which contains the + * minimum magnitude for complex points in the given vector. + * + * Dispatcher Prototype + * \code + * void volk_32fc_index_min_32u(uint32_t* target, lv_32fc_t* src0, uint32_t + * num_points) \endcode + * + * \b Inputs + * \li src0: The complex input vector. + * \li num_points: The number of samples. + * + * \b Outputs + * \li target: The index of the point with minimum magnitude. + * + * \b Example + * Calculate the index of the minimum value of \f$x^2 + x\f$ for points around + * the unit circle. 
+ * \code
+ *   int N = 10;
+ *   uint32_t alignment = volk_get_alignment();
+ *   lv_32fc_t* in = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t)*N, alignment);
+ *   uint32_t* min = (uint32_t*)volk_malloc(sizeof(uint32_t), alignment);
+ *
+ *   for(uint32_t ii = 0; ii < N/2; ++ii){
+ *       float real = 2.f * ((float)ii / (float)N) - 1.f;
+ *       float imag = std::sqrt(1.f - real * real);
+ *       in[ii] = lv_cmake(real, imag);
+ *       in[ii] = in[ii] * in[ii] + in[ii];
+ *       in[N-ii-1] = lv_cmake(real, imag);
+ *       in[N-ii-1] = in[N-ii-1] * in[N-ii-1] + in[N-ii-1];
+ *   }
+ *
+ *   volk_32fc_index_min_32u(min, in, N);
+ *
+ *   printf("index of min value = %u\n", *min);
+ *
+ *   volk_free(in);
+ *   volk_free(min);
+ * \endcode
+ */
+
+#ifndef INCLUDED_volk_32fc_index_min_32u_a_H
+#define INCLUDED_volk_32fc_index_min_32u_a_H
+
+#include <float.h>
+#include <inttypes.h>
+#include <stdio.h>
+#include <volk/volk_common.h>
+
+#ifdef LV_HAVE_AVX2
+#include <immintrin.h>
+#include <volk/volk_avx2_intrinsics.h>
+
+static inline void volk_32fc_index_min_32u_a_avx2_variant_0(uint32_t* target,
+                                                            lv_32fc_t* src0,
+                                                            uint32_t num_points)
+{
+    const __m256i indices_increment = _mm256_set1_epi32(8);
+    /*
+     * At the start of each loop iteration current_indices holds the indices of
+     * the complex numbers loaded from memory. Explanation for odd order is given
+     * in implementation of vector_32fc_index_min_variant0().
+     */
+    __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
+
+    __m256 min_values = _mm256_set1_ps(FLT_MAX);
+    __m256i min_indices = _mm256_setzero_si256();
+
+    for (unsigned i = 0; i < num_points / 8u; ++i) {
+        __m256 in0 = _mm256_load_ps((float*)src0);
+        __m256 in1 = _mm256_load_ps((float*)(src0 + 4));
+        vector_32fc_index_min_variant0(
+            in0, in1, &min_values, &min_indices, &current_indices, indices_increment);
+        src0 += 8;
+    }
+
+    // determine minimum value and index in the result of the vectorized loop
+    __VOLK_ATTR_ALIGNED(32) float min_values_buffer[8];
+    __VOLK_ATTR_ALIGNED(32) uint32_t min_indices_buffer[8];
+    _mm256_store_ps(min_values_buffer, min_values);
+    _mm256_store_si256((__m256i*)min_indices_buffer, min_indices);
+
+    float min = FLT_MAX;
+    uint32_t index = 0;
+    for (unsigned i = 0; i < 8; i++) {
+        if (min_values_buffer[i] < min) {
+            min = min_values_buffer[i];
+            index = min_indices_buffer[i];
+        }
+    }
+
+    // handle tail not processed by the vectorized loop
+    for (unsigned i = num_points & (~7u); i < num_points; ++i) {
+        const float abs_squared =
+            lv_creal(*src0) * lv_creal(*src0) + lv_cimag(*src0) * lv_cimag(*src0);
+        if (abs_squared < min) {
+            min = abs_squared;
+            index = i;
+        }
+        ++src0;
+    }
+
+    *target = index;
+}
+
+#endif /*LV_HAVE_AVX2*/
+
+#ifdef LV_HAVE_AVX2
+#include <immintrin.h>
+#include <volk/volk_avx2_intrinsics.h>
+
+static inline void volk_32fc_index_min_32u_a_avx2_variant_1(uint32_t* target,
+                                                            lv_32fc_t* src0,
+                                                            uint32_t num_points)
+{
+    const __m256i indices_increment = _mm256_set1_epi32(8);
+    /*
+     * At the start of each loop iteration current_indices holds the indices of
+     * the complex numbers loaded from memory. Explanation for odd order is given
+     * in implementation of vector_32fc_index_min_variant0().
+ */ + __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0); + + __m256 min_values = _mm256_set1_ps(FLT_MAX); + __m256i min_indices = _mm256_setzero_si256(); + + for (unsigned i = 0; i < num_points / 8u; ++i) { + __m256 in0 = _mm256_load_ps((float*)src0); + __m256 in1 = _mm256_load_ps((float*)(src0 + 4)); + vector_32fc_index_min_variant1( + in0, in1, &min_values, &min_indices, ¤t_indices, indices_increment); + src0 += 8; + } + + // determine minimum value and index in the result of the vectorized loop + __VOLK_ATTR_ALIGNED(32) float min_values_buffer[8]; + __VOLK_ATTR_ALIGNED(32) uint32_t min_indices_buffer[8]; + _mm256_store_ps(min_values_buffer, min_values); + _mm256_store_si256((__m256i*)min_indices_buffer, min_indices); + + float min = FLT_MAX; + uint32_t index = 0; + for (unsigned i = 0; i < 8; i++) { + if (min_values_buffer[i] < min) { + min = min_values_buffer[i]; + index = min_indices_buffer[i]; + } + } + + // handle tail not processed by the vectorized loop + for (unsigned i = num_points & (~7u); i < num_points; ++i) { + const float abs_squared = + lv_creal(*src0) * lv_creal(*src0) + lv_cimag(*src0) * lv_cimag(*src0); + if (abs_squared < min) { + min = abs_squared; + index = i; + } + ++src0; + } + + *target = index; +} + +#endif /*LV_HAVE_AVX2*/ + +#ifdef LV_HAVE_SSE3 +#include +#include + +static inline void +volk_32fc_index_min_32u_a_sse3(uint32_t* target, lv_32fc_t* src0, uint32_t num_points) +{ + const uint32_t num_bytes = num_points * 8; + + union bit128 holderf; + union bit128 holderi; + float sq_dist = 0.0; + + union bit128 xmm5, xmm4; + __m128 xmm1, xmm2, xmm3; + __m128i xmm8, xmm11, xmm12, xmm9, xmm10; + + xmm5.int_vec = _mm_setzero_si128(); + xmm4.int_vec = _mm_setzero_si128(); + holderf.int_vec = _mm_setzero_si128(); + holderi.int_vec = _mm_setzero_si128(); + + int bound = num_bytes >> 5; + int i = 0; + + xmm8 = _mm_setr_epi32(0, 1, 2, 3); + xmm9 = _mm_setzero_si128(); + xmm10 = _mm_setr_epi32(4, 4, 4, 4); + xmm3 = _mm_set_ps1(FLT_MAX); + + for (; i < bound; ++i) { + xmm1 = _mm_load_ps((float*)src0); + xmm2 = _mm_load_ps((float*)&src0[2]); + + src0 += 4; + + xmm1 = _mm_mul_ps(xmm1, xmm1); + xmm2 = _mm_mul_ps(xmm2, xmm2); + + xmm1 = _mm_hadd_ps(xmm1, xmm2); + + xmm3 = _mm_min_ps(xmm1, xmm3); + + xmm4.float_vec = _mm_cmpgt_ps(xmm1, xmm3); + xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3); + + xmm11 = _mm_and_si128(xmm8, xmm5.int_vec); + xmm12 = _mm_and_si128(xmm9, xmm4.int_vec); + + xmm9 = _mm_add_epi32(xmm11, xmm12); + + xmm8 = _mm_add_epi32(xmm8, xmm10); + } + + if (num_bytes >> 4 & 1) { + xmm2 = _mm_load_ps((float*)src0); + + xmm1 = _mm_movelh_ps(bit128_p(&xmm8)->float_vec, bit128_p(&xmm8)->float_vec); + xmm8 = bit128_p(&xmm1)->int_vec; + + xmm2 = _mm_mul_ps(xmm2, xmm2); + + src0 += 2; + + xmm1 = _mm_hadd_ps(xmm2, xmm2); + + xmm3 = _mm_min_ps(xmm1, xmm3); + + xmm10 = _mm_setr_epi32(2, 2, 2, 2); + + xmm4.float_vec = _mm_cmpgt_ps(xmm1, xmm3); + xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3); + + xmm11 = _mm_and_si128(xmm8, xmm5.int_vec); + xmm12 = _mm_and_si128(xmm9, xmm4.int_vec); + + xmm9 = _mm_add_epi32(xmm11, xmm12); + + xmm8 = _mm_add_epi32(xmm8, xmm10); + } + + if (num_bytes >> 3 & 1) { + sq_dist = + lv_creal(src0[0]) * lv_creal(src0[0]) + lv_cimag(src0[0]) * lv_cimag(src0[0]); + + xmm2 = _mm_load1_ps(&sq_dist); + + xmm1 = xmm3; + + xmm3 = _mm_min_ss(xmm3, xmm2); + + xmm4.float_vec = _mm_cmpgt_ps(xmm1, xmm3); + xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3); + + xmm8 = _mm_shuffle_epi32(xmm8, 0x00); + + xmm11 = _mm_and_si128(xmm8, xmm4.int_vec); + xmm12 = 
_mm_and_si128(xmm9, xmm5.int_vec);
+
+        xmm9 = _mm_add_epi32(xmm11, xmm12);
+    }
+
+    _mm_store_ps((float*)&(holderf.f), xmm3);
+    _mm_store_si128(&(holderi.int_vec), xmm9);
+
+    target[0] = holderi.i[0];
+    sq_dist = holderf.f[0];
+    target[0] = (holderf.f[1] < sq_dist) ? holderi.i[1] : target[0];
+    sq_dist = (holderf.f[1] < sq_dist) ? holderf.f[1] : sq_dist;
+    target[0] = (holderf.f[2] < sq_dist) ? holderi.i[2] : target[0];
+    sq_dist = (holderf.f[2] < sq_dist) ? holderf.f[2] : sq_dist;
+    target[0] = (holderf.f[3] < sq_dist) ? holderi.i[3] : target[0];
+    sq_dist = (holderf.f[3] < sq_dist) ? holderf.f[3] : sq_dist;
+}
+
+#endif /*LV_HAVE_SSE3*/
+
+#ifdef LV_HAVE_GENERIC
+static inline void
+volk_32fc_index_min_32u_generic(uint32_t* target, lv_32fc_t* src0, uint32_t num_points)
+{
+    const uint32_t num_bytes = num_points * 8;
+
+    float sq_dist = 0.0;
+    float min = FLT_MAX;
+    uint32_t index = 0;
+
+    uint32_t i = 0;
+
+    for (; i < num_bytes >> 3; ++i) {
+        sq_dist =
+            lv_creal(src0[i]) * lv_creal(src0[i]) + lv_cimag(src0[i]) * lv_cimag(src0[i]);
+
+        if (sq_dist < min) {
+            index = i;
+            min = sq_dist;
+        }
+    }
+    target[0] = index;
+}
+
+#endif /*LV_HAVE_GENERIC*/
+
+#endif /*INCLUDED_volk_32fc_index_min_32u_a_H*/
+
+#ifndef INCLUDED_volk_32fc_index_min_32u_u_H
+#define INCLUDED_volk_32fc_index_min_32u_u_H
+
+#include <float.h>
+#include <inttypes.h>
+#include <stdio.h>
+#include <volk/volk_common.h>
+
+#ifdef LV_HAVE_AVX2
+#include <immintrin.h>
+#include <volk/volk_avx2_intrinsics.h>
+
+static inline void volk_32fc_index_min_32u_u_avx2_variant_0(uint32_t* target,
+                                                            lv_32fc_t* src0,
+                                                            uint32_t num_points)
+{
+    const __m256i indices_increment = _mm256_set1_epi32(8);
+    /*
+     * At the start of each loop iteration current_indices holds the indices of
+     * the complex numbers loaded from memory. Explanation for odd order is given
+     * in implementation of vector_32fc_index_min_variant0().
+     */
+    __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
+
+    __m256 min_values = _mm256_set1_ps(FLT_MAX);
+    __m256i min_indices = _mm256_setzero_si256();
+
+    for (unsigned i = 0; i < num_points / 8u; ++i) {
+        __m256 in0 = _mm256_loadu_ps((float*)src0);
+        __m256 in1 = _mm256_loadu_ps((float*)(src0 + 4));
+        vector_32fc_index_min_variant0(
+            in0, in1, &min_values, &min_indices, &current_indices, indices_increment);
+        src0 += 8;
+    }
+
+    // determine minimum value and index in the result of the vectorized loop
+    __VOLK_ATTR_ALIGNED(32) float min_values_buffer[8];
+    __VOLK_ATTR_ALIGNED(32) uint32_t min_indices_buffer[8];
+    _mm256_store_ps(min_values_buffer, min_values);
+    _mm256_store_si256((__m256i*)min_indices_buffer, min_indices);
+
+    float min = FLT_MAX;
+    uint32_t index = 0;
+    for (unsigned i = 0; i < 8; i++) {
+        if (min_values_buffer[i] < min) {
+            min = min_values_buffer[i];
+            index = min_indices_buffer[i];
+        }
+    }
+
+    // handle tail not processed by the vectorized loop
+    for (unsigned i = num_points & (~7u); i < num_points; ++i) {
+        const float abs_squared =
+            lv_creal(*src0) * lv_creal(*src0) + lv_cimag(*src0) * lv_cimag(*src0);
+        if (abs_squared < min) {
+            min = abs_squared;
+            index = i;
+        }
+        ++src0;
+    }
+
+    *target = index;
+}
+
+#endif /*LV_HAVE_AVX2*/
+
+#ifdef LV_HAVE_AVX2
+#include <immintrin.h>
+#include <volk/volk_avx2_intrinsics.h>
+
+static inline void volk_32fc_index_min_32u_u_avx2_variant_1(uint32_t* target,
+                                                            lv_32fc_t* src0,
+                                                            uint32_t num_points)
+{
+    const __m256i indices_increment = _mm256_set1_epi32(8);
+    /*
+     * At the start of each loop iteration current_indices holds the indices of
+     * the complex numbers loaded from memory. Explanation for odd order is given
+     * in implementation of vector_32fc_index_min_variant0().
+     */
+    __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
+
+    __m256 min_values = _mm256_set1_ps(FLT_MAX);
+    __m256i min_indices = _mm256_setzero_si256();
+
+    for (unsigned i = 0; i < num_points / 8u; ++i) {
+        __m256 in0 = _mm256_loadu_ps((float*)src0);
+        __m256 in1 = _mm256_loadu_ps((float*)(src0 + 4));
+        vector_32fc_index_min_variant1(
+            in0, in1, &min_values, &min_indices, &current_indices, indices_increment);
+        src0 += 8;
+    }
+
+    // determine minimum value and index in the result of the vectorized loop
+    __VOLK_ATTR_ALIGNED(32) float min_values_buffer[8];
+    __VOLK_ATTR_ALIGNED(32) uint32_t min_indices_buffer[8];
+    _mm256_store_ps(min_values_buffer, min_values);
+    _mm256_store_si256((__m256i*)min_indices_buffer, min_indices);
+
+    float min = FLT_MAX;
+    uint32_t index = 0;
+    for (unsigned i = 0; i < 8; i++) {
+        if (min_values_buffer[i] < min) {
+            min = min_values_buffer[i];
+            index = min_indices_buffer[i];
+        }
+    }
+
+    // handle tail not processed by the vectorized loop
+    for (unsigned i = num_points & (~7u); i < num_points; ++i) {
+        const float abs_squared =
+            lv_creal(*src0) * lv_creal(*src0) + lv_cimag(*src0) * lv_cimag(*src0);
+        if (abs_squared < min) {
+            min = abs_squared;
+            index = i;
+        }
+        ++src0;
+    }
+
+    *target = index;
+}
+
+#endif /*LV_HAVE_AVX2*/
+
+#ifdef LV_HAVE_NEON
+#include <arm_neon.h>
+#include <volk/volk_neon_intrinsics.h>
+
+static inline void
+volk_32fc_index_min_32u_neon(uint32_t* target, lv_32fc_t* src0, uint32_t num_points)
+{
+    unsigned int number = 0;
+    const uint32_t quarter_points = num_points / 4;
+    const lv_32fc_t* src0Ptr = src0;
+
+    uint32_t indices[4] = { 0, 1, 2, 3 };
+    const uint32x4_t vec_indices_incr = vdupq_n_u32(4);
+    uint32x4_t vec_indices = vld1q_u32(indices);
+    uint32x4_t vec_min_indices = vec_indices;
+
+    if (num_points) {
+        float min = FLT_MAX;
+        uint32_t index = 0;
+
+        float32x4_t vec_min = vdupq_n_f32(FLT_MAX);
+
+        for (; number < quarter_points; number++) {
+            // Load complex and compute magnitude squared
+            const float32x4_t vec_mag2 =
+                _vmagnitudesquaredq_f32(vld2q_f32((float*)src0Ptr));
+            __VOLK_PREFETCH(src0Ptr += 4);
+            // a < b?
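+            // vbslq_* (bitwise select) takes the lane from its second operand
+            // where the mask is set and from its third operand otherwise, so
+            // the minima and their indices are updated in lock-step below.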
+            const uint32x4_t lt_mask = vcltq_f32(vec_mag2, vec_min);
+            vec_min = vbslq_f32(lt_mask, vec_mag2, vec_min);
+            vec_min_indices = vbslq_u32(lt_mask, vec_indices, vec_min_indices);
+            vec_indices = vaddq_u32(vec_indices, vec_indices_incr);
+        }
+        uint32_t tmp_min_indices[4];
+        float tmp_min[4];
+        vst1q_u32(tmp_min_indices, vec_min_indices);
+        vst1q_f32(tmp_min, vec_min);
+
+        for (int i = 0; i < 4; i++) {
+            if (tmp_min[i] < min) {
+                min = tmp_min[i];
+                index = tmp_min_indices[i];
+            }
+        }
+
+        // Deal with the rest
+        for (number = quarter_points * 4; number < num_points; number++) {
+            const float re = lv_creal(*src0Ptr);
+            const float im = lv_cimag(*src0Ptr);
+            if ((re * re + im * im) < min) {
+                min = re * re + im * im;
+                index = number;
+            }
+            src0Ptr++;
+        }
+        *target = index;
+    }
+}
+
+#endif /*LV_HAVE_NEON*/
+
+#endif /*INCLUDED_volk_32fc_index_min_32u_u_H*/
diff --git a/lib/kernel_tests.h b/lib/kernel_tests.h
index 6df83ab..9f947cf 100644
--- a/lib/kernel_tests.h
+++ b/lib/kernel_tests.h
@@ -68,6 +68,8 @@ std::vector<volk_test_case_t> init_test_list(volk_test_params_t test_params)
         QA(VOLK_INIT_TEST(volk_32f_x2_add_32f, test_params))
         QA(VOLK_INIT_TEST(volk_32f_index_max_16u, test_params))
         QA(VOLK_INIT_TEST(volk_32f_index_max_32u, test_params))
+        QA(VOLK_INIT_TEST(volk_32f_index_min_16u, test_params))
+        QA(VOLK_INIT_TEST(volk_32f_index_min_32u, test_params))
         QA(VOLK_INIT_TEST(volk_32fc_32f_multiply_32fc, test_params))
         QA(VOLK_INIT_TEST(volk_32fc_32f_add_32fc, test_params))
         QA(VOLK_INIT_TEST(volk_32f_log2_32f, test_params.make_absolute(1e-5)))
@@ -94,6 +96,8 @@ std::vector<volk_test_case_t> init_test_list(volk_test_params_t test_params)
         QA(VOLK_INIT_TEST(volk_32fc_32f_dot_prod_32fc, test_params_inacc))
         QA(VOLK_INIT_TEST(volk_32fc_index_max_16u, test_params))
         QA(VOLK_INIT_TEST(volk_32fc_index_max_32u, test_params))
+        QA(VOLK_INIT_TEST(volk_32fc_index_min_16u, test_params))
+        QA(VOLK_INIT_TEST(volk_32fc_index_min_32u, test_params))
         QA(VOLK_INIT_TEST(volk_32fc_s32f_magnitude_16i, test_params))
         QA(VOLK_INIT_TEST(volk_32fc_magnitude_32f, test_params_inacc_tenth))
         QA(VOLK_INIT_TEST(volk_32fc_magnitude_squared_32f, test_params))
-- 
2.30.2
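
A minimal caller for the new kernels (a sketch, assuming a VOLK build that
includes this patch and links with -lvolk; volk.h provides the dispatcher,
which picks the best aligned/unaligned implementation at runtime):

#include <stdint.h>
#include <stdio.h>
#include <volk/volk.h>

int main(void)
{
    const uint32_t N = 32;
    uint32_t alignment = volk_get_alignment();
    float* in = (float*)volk_malloc(sizeof(float) * N, alignment);
    uint32_t* index = (uint32_t*)volk_malloc(sizeof(uint32_t), alignment);

    for (uint32_t i = 0; i < N; i++) {
        float x = (float)i;
        in[i] = (x - 20.f) * (x - 20.f); // minimum at index 20
    }

    volk_32f_index_min_32u(index, in, N); // dispatcher added by this patch

    printf("minimum %.2f at index %u\n", in[*index], *index);

    volk_free(in);
    volk_free(index);
    return 0;
}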