\li \subpage volk_32fc_deinterleave_real_64f
\li \subpage volk_32fc_index_max_16u
\li \subpage volk_32fc_index_max_32u
+\li \subpage volk_32fc_index_min_16u
+\li \subpage volk_32fc_index_min_32u
\li \subpage volk_32fc_magnitude_32f
\li \subpage volk_32fc_magnitude_squared_32f
\li \subpage volk_32f_cos_32f
\li \subpage volk_32f_expfast_32f
\li \subpage volk_32f_index_max_16u
\li \subpage volk_32f_index_max_32u
+\li \subpage volk_32f_index_min_16u
+\li \subpage volk_32f_index_min_32u
\li \subpage volk_32f_invsqrt_32f
\li \subpage volk_32f_log2_32f
\li \subpage volk_32f_s32f_calc_spectral_noise_floor_32f
* float abs_squared = real(src0) * real(src0) + imag(src0) * imag(src0);
* bool compare = abs_squared > max_values[j];
* max_values[j] = compare ? abs_squared : max_values[j];
- * max_indices[j] = compare ? current_indices[j] > max_indices[j]
+ * max_indices[j] = compare ? current_indices[j] : max_indices[j];
* current_indices[j] += 8; // update for next outer loop iteration
* ++src0;
* }
*current_indices = _mm256_add_epi32(*current_indices, indices_increment);
}
+/*
+ * The function below vectorizes the inner loop of the following code:
+ *
+ * float min_values[8] = {FLT_MAX};
+ * unsigned min_indices[8] = {0};
+ * unsigned current_indices[8] = {0, 1, 2, 3, 4, 5, 6, 7};
+ * for (unsigned i = 0; i < num_points / 8; ++i) {
+ * for (unsigned j = 0; j < 8; ++j) {
+ * float abs_squared = real(src0) * real(src0) + imag(src0) * imag(src0);
+ * bool compare = abs_squared < min_values[j];
+ * min_values[j] = compare ? abs_squared : min_values[j];
+ * min_indices[j] = compare ? current_indices[j] : min_indices[j];
+ * current_indices[j] += 8; // update for next outer loop iteration
+ * ++src0;
+ * }
+ * }
+ */
+static inline void vector_32fc_index_min_variant0(__m256 in0,
+ __m256 in1,
+ __m256* min_values,
+ __m256i* min_indices,
+ __m256i* current_indices,
+ __m256i indices_increment)
+{
+ in0 = _mm256_mul_ps(in0, in0);
+ in1 = _mm256_mul_ps(in1, in1);
+
+ /*
+ * Given the vectors a = (a_7, a_6, …, a_1, a_0) and b = (b_7, b_6, …, b_1, b_0)
+ * hadd_ps(a, b) computes
+ * (b_7 + b_6,
+ * b_5 + b_4,
+ * ---------
+ * a_7 + a_6,
+ * a_5 + a_4,
+ * ---------
+ * b_3 + b_2,
+ * b_1 + b_0,
+ * ---------
+ * a_3 + a_2,
+ * a_1 + a_0).
+ * The result is the squared absolute value of complex numbers at index
+ * offsets (7, 6, 3, 2, 5, 4, 1, 0). This must be the initial value of
+ * current_indices!
+ */
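+ /*
+ * Concrete illustration (values only for exposition): if src0 held the
+ * complex numbers c0 … c7, then after the squaring above in0 holds
+ * (im3², re3², …, im0², re0²) and in1 holds (im7², re7², …, im4², re4²),
+ * so lane k of hadd_ps(in0, in1), counting from the low lane, holds the
+ * squared magnitude of the complex number at index offset
+ * (0, 1, 4, 5, 2, 3, 6, 7)[k], which is exactly the initial current_indices.
+ */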
+ __m256 abs_squared = _mm256_hadd_ps(in0, in1);
+
+ /*
+ * Compare the recently computed squared absolute values with the
+ * previously determined minimum values. cmp_ps(a, b) determines
+ * a < b ? 0xFFFFFFFF : 0 for each element in the vectors =>
+ * compare_mask = abs_squared < min_values ? 0xFFFFFFFF : 0
+ *
+ * If either operand is NaN, 0 is returned as an “ordered” comparison is
+ * used => the blend operation will select the value from *min_values.
+ */
+ __m256 compare_mask = _mm256_cmp_ps(abs_squared, *min_values, _CMP_LT_OS);
+
+ /* Select minimum by blending. This is the only line which differs from variant1 */
+ *min_values = _mm256_blendv_ps(*min_values, abs_squared, compare_mask);
+
+ /*
+ * Updates indices: blendv_ps(a, b, mask) determines mask ? b : a for
+ * each element in the vectors =>
+ * min_indices = compare_mask ? current_indices : min_indices
+ *
+ * Note: The casting of data types is required to make the compiler happy
+ * and does not change values.
+ */
+ *min_indices =
+ _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(*min_indices),
+ _mm256_castsi256_ps(*current_indices),
+ compare_mask));
+
+ /* compute indices of complex numbers which will be loaded in the next iteration */
+ *current_indices = _mm256_add_epi32(*current_indices, indices_increment);
+}
+
+/* See _variant0 for details */
+static inline void vector_32fc_index_min_variant1(__m256 in0,
+ __m256 in1,
+ __m256* min_values,
+ __m256i* min_indices,
+ __m256i* current_indices,
+ __m256i indices_increment)
+{
+ in0 = _mm256_mul_ps(in0, in0);
+ in1 = _mm256_mul_ps(in1, in1);
+
+ __m256 abs_squared = _mm256_hadd_ps(in0, in1);
+ __m256 compare_mask = _mm256_cmp_ps(abs_squared, *min_values, _CMP_LT_OS);
+
+ /*
+ * This is the only line which differs from variant0. Using minps instead of
+ * blendvps is faster on Intel CPUs (at least on the ones tested).
+ *
+ * Note: The order of arguments matters if a NaN is encountered in which
+ * case the value of the second argument is selected. This is consistent
+ * with the “ordered” comparison and the blend operation: The comparison
+ * returns false if a NaN is encountered and the blend operation
+ * consequently selects the value from min_indices.
+ */
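+ /* e.g. per lane: min_ps(NaN, x) yields x, while min_ps(x, NaN) yields NaN */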
+ *min_values = _mm256_min_ps(abs_squared, *min_values);
+
+ *min_indices =
+ _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(*min_indices),
+ _mm256_castsi256_ps(*current_indices),
+ compare_mask));
+
+ *current_indices = _mm256_add_epi32(*current_indices, indices_increment);
+}
+
#endif /* INCLUDE_VOLK_VOLK_AVX2_INTRINSICS_H_ */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2021 Free Software Foundation, Inc.
+ *
+ * This file is part of GNU Radio
+ *
+ * GNU Radio is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ *
+ * GNU Radio is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU Radio; see the file COPYING. If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street,
+ * Boston, MA 02110-1301, USA.
+ */
+
+/*!
+ * \page volk_32f_index_min_16u
+ *
+ * \b Overview
+ *
+ * Returns Argmin_i x[i]. Finds and returns the index which contains
+ * the first minimum value in the given vector.
+ *
+ * Note that num_points is a uint32_t, but the return value is
+ * uint16_t. Providing a vector larger than the max of a uint16_t
+ * (65535) would miss anything beyond that boundary. The kernel
+ * therefore caps num_points at this maximum value.
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_32f_index_min_16u(uint16_t* target, const float* src0, uint32_t num_points)
+ * \endcode
+ *
+ * \b Inputs
+ * \li src0: The input vector of floats.
+ * \li num_points: The number of data points.
+ *
+ * \b Outputs
+ * \li target: The index of the first minimum value in the input buffer.
+ *
+ * \b Example
+ * \code
+ * int N = 10;
+ * uint32_t alignment = volk_get_alignment();
+ * float* in = (float*)volk_malloc(sizeof(float)*N, alignment);
+ * uint16_t* out = (uint16_t*)volk_malloc(sizeof(uint16_t), alignment);
+ *
+ * for(uint32_t ii = 0; ii < N; ++ii){
+ * float x = (float)ii;
+ * // a parabola with a minimum at x=4
+ * in[ii] = (x-4) * (x-4) - 5;
+ * }
+ *
+ * volk_32f_index_min_16u(out, in, N);
+ *
+ * printf("minimum is %1.2f at index %u\n", in[*out], *out);
+ *
+ * volk_free(in);
+ * volk_free(out);
+ * \endcode
+ */
+
+#ifndef INCLUDED_volk_32f_index_min_16u_a_H
+#define INCLUDED_volk_32f_index_min_16u_a_H
+
+#include <inttypes.h>
+#include <limits.h>
+#include <stdio.h>
+#include <volk/volk_common.h>
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+
+static inline void
+volk_32f_index_min_16u_a_avx(uint16_t* target, const float* src0, uint32_t num_points)
+{
+ num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
+
+ uint32_t number = 0;
+ const uint32_t eighthPoints = num_points / 8;
+
+ float* inputPtr = (float*)src0;
+
+ __m256 indexIncrementValues = _mm256_set1_ps(8);
+ __m256 currentIndexes = _mm256_set_ps(-1, -2, -3, -4, -5, -6, -7, -8);
+
+ float min = src0[0];
+ float index = 0;
+ __m256 minValues = _mm256_set1_ps(min);
+ __m256 minValuesIndex = _mm256_setzero_ps();
+ __m256 compareResults;
+ __m256 currentValues;
+
+ __VOLK_ATTR_ALIGNED(32) float minValuesBuffer[8];
+ __VOLK_ATTR_ALIGNED(32) float minIndexesBuffer[8];
+
+ for (; number < eighthPoints; number++) {
+
+ currentValues = _mm256_load_ps(inputPtr);
+ inputPtr += 8;
+ currentIndexes = _mm256_add_ps(currentIndexes, indexIncrementValues);
+
+ compareResults = _mm256_cmp_ps(currentValues, minValues, _CMP_LT_OS);
+
+ minValuesIndex = _mm256_blendv_ps(minValuesIndex, currentIndexes, compareResults);
+ minValues = _mm256_blendv_ps(minValues, currentValues, compareResults);
+ }
+
+ // Calculate the smallest value from the remaining 8 points
+ _mm256_store_ps(minValuesBuffer, minValues);
+ _mm256_store_ps(minIndexesBuffer, minValuesIndex);
+
+ for (number = 0; number < 8; number++) {
+ if (minValuesBuffer[number] < min) {
+ index = minIndexesBuffer[number];
+ min = minValuesBuffer[number];
+ } else if (minValuesBuffer[number] == min) {
+ if (index > minIndexesBuffer[number])
+ index = minIndexesBuffer[number];
+ }
+ }
+
+ number = eighthPoints * 8;
+ for (; number < num_points; number++) {
+ if (src0[number] < min) {
+ index = number;
+ min = src0[number];
+ }
+ }
+ target[0] = (uint16_t)index;
+}
+
+#endif /*LV_HAVE_AVX*/
+
+#ifdef LV_HAVE_SSE4_1
+#include <smmintrin.h>
+
+static inline void
+volk_32f_index_min_16u_a_sse4_1(uint16_t* target, const float* src0, uint32_t num_points)
+{
+ num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
+
+ uint32_t number = 0;
+ const uint32_t quarterPoints = num_points / 4;
+
+ float* inputPtr = (float*)src0;
+
+ __m128 indexIncrementValues = _mm_set1_ps(4);
+ __m128 currentIndexes = _mm_set_ps(-1, -2, -3, -4);
+
+ float min = src0[0];
+ float index = 0;
+ __m128 minValues = _mm_set1_ps(min);
+ __m128 minValuesIndex = _mm_setzero_ps();
+ __m128 compareResults;
+ __m128 currentValues;
+
+ __VOLK_ATTR_ALIGNED(16) float minValuesBuffer[4];
+ __VOLK_ATTR_ALIGNED(16) float minIndexesBuffer[4];
+
+ for (; number < quarterPoints; number++) {
+
+ currentValues = _mm_load_ps(inputPtr);
+ inputPtr += 4;
+ currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
+
+ compareResults = _mm_cmplt_ps(currentValues, minValues);
+
+ minValuesIndex = _mm_blendv_ps(minValuesIndex, currentIndexes, compareResults);
+ minValues = _mm_blendv_ps(minValues, currentValues, compareResults);
+ }
+
+ // Calculate the smallest value from the remaining 4 points
+ _mm_store_ps(minValuesBuffer, minValues);
+ _mm_store_ps(minIndexesBuffer, minValuesIndex);
+
+ for (number = 0; number < 4; number++) {
+ if (minValuesBuffer[number] < min) {
+ index = minIndexesBuffer[number];
+ min = minValuesBuffer[number];
+ } else if (minValuesBuffer[number] == min) {
+ if (index > minIndexesBuffer[number])
+ index = minIndexesBuffer[number];
+ }
+ }
+
+ number = quarterPoints * 4;
+ for (; number < num_points; number++) {
+ if (src0[number] < min) {
+ index = number;
+ min = src0[number];
+ }
+ }
+ target[0] = (uint16_t)index;
+}
+
+#endif /*LV_HAVE_SSE4_1*/
+
+
+#ifdef LV_HAVE_SSE
+
+#include <xmmintrin.h>
+
+static inline void
+volk_32f_index_min_16u_a_sse(uint16_t* target, const float* src0, uint32_t num_points)
+{
+ num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
+
+ uint32_t number = 0;
+ const uint32_t quarterPoints = num_points / 4;
+
+ float* inputPtr = (float*)src0;
+
+ __m128 indexIncrementValues = _mm_set1_ps(4);
+ __m128 currentIndexes = _mm_set_ps(-1, -2, -3, -4);
+
+ float min = src0[0];
+ float index = 0;
+ __m128 minValues = _mm_set1_ps(min);
+ __m128 minValuesIndex = _mm_setzero_ps();
+ __m128 compareResults;
+ __m128 currentValues;
+
+ __VOLK_ATTR_ALIGNED(16) float minValuesBuffer[4];
+ __VOLK_ATTR_ALIGNED(16) float minIndexesBuffer[4];
+
+ for (; number < quarterPoints; number++) {
+
+ currentValues = _mm_load_ps(inputPtr);
+ inputPtr += 4;
+ currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
+
+ compareResults = _mm_cmplt_ps(currentValues, minValues);
+
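+ /* Branchless select: (mask & new) | (~mask & old) emulates blendv on plain SSE */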
+ minValuesIndex = _mm_or_ps(_mm_and_ps(compareResults, currentIndexes),
+ _mm_andnot_ps(compareResults, minValuesIndex));
+ minValues = _mm_or_ps(_mm_and_ps(compareResults, currentValues),
+ _mm_andnot_ps(compareResults, minValues));
+ }
+
+ // Calculate the smallest value from the remaining 4 points
+ _mm_store_ps(minValuesBuffer, minValues);
+ _mm_store_ps(minIndexesBuffer, minValuesIndex);
+
+ for (number = 0; number < 4; number++) {
+ if (minValuesBuffer[number] < min) {
+ index = minIndexesBuffer[number];
+ min = minValuesBuffer[number];
+ } else if (minValuesBuffer[number] == min) {
+ if (index > minIndexesBuffer[number])
+ index = minIndexesBuffer[number];
+ }
+ }
+
+ number = quarterPoints * 4;
+ for (; number < num_points; number++) {
+ if (src0[number] < min) {
+ index = number;
+ min = src0[number];
+ }
+ }
+ target[0] = (uint16_t)index;
+}
+
+#endif /*LV_HAVE_SSE*/
+
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void
+volk_32f_index_min_16u_generic(uint16_t* target, const float* src0, uint32_t num_points)
+{
+ num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
+
+ float min = src0[0];
+ uint16_t index = 0;
+
+ uint32_t i = 1;
+
+ for (; i < num_points; ++i) {
+ if (src0[i] < min) {
+ index = i;
+ min = src0[i];
+ }
+ }
+ target[0] = index;
+}
+
+#endif /*LV_HAVE_GENERIC*/
+
+
+#endif /*INCLUDED_volk_32f_index_min_16u_a_H*/
+
+
+#ifndef INCLUDED_volk_32f_index_min_16u_u_H
+#define INCLUDED_volk_32f_index_min_16u_u_H
+
+#include <inttypes.h>
+#include <limits.h>
+#include <stdio.h>
+#include <volk/volk_common.h>
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+
+static inline void
+volk_32f_index_min_16u_u_avx(uint16_t* target, const float* src0, uint32_t num_points)
+{
+ num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
+
+ uint32_t number = 0;
+ const uint32_t eighthPoints = num_points / 8;
+
+ float* inputPtr = (float*)src0;
+
+ __m256 indexIncrementValues = _mm256_set1_ps(8);
+ __m256 currentIndexes = _mm256_set_ps(-1, -2, -3, -4, -5, -6, -7, -8);
+
+ float min = src0[0];
+ float index = 0;
+ __m256 minValues = _mm256_set1_ps(min);
+ __m256 minValuesIndex = _mm256_setzero_ps();
+ __m256 compareResults;
+ __m256 currentValues;
+
+ __VOLK_ATTR_ALIGNED(32) float minValuesBuffer[8];
+ __VOLK_ATTR_ALIGNED(32) float minIndexesBuffer[8];
+
+ for (; number < eighthPoints; number++) {
+
+ currentValues = _mm256_loadu_ps(inputPtr);
+ inputPtr += 8;
+ currentIndexes = _mm256_add_ps(currentIndexes, indexIncrementValues);
+
+ compareResults = _mm256_cmp_ps(currentValues, minValues, _CMP_LT_OS);
+
+ minValuesIndex = _mm256_blendv_ps(minValuesIndex, currentIndexes, compareResults);
+ minValues = _mm256_blendv_ps(minValues, currentValues, compareResults);
+ }
+
+ // Calculate the smallest value from the remaining 8 points
+ _mm256_storeu_ps(minValuesBuffer, minValues);
+ _mm256_storeu_ps(minIndexesBuffer, minValuesIndex);
+
+ for (number = 0; number < 8; number++) {
+ if (minValuesBuffer[number] < min) {
+ index = minIndexesBuffer[number];
+ min = minValuesBuffer[number];
+ } else if (minValuesBuffer[number] == min) {
+ if (index > minIndexesBuffer[number])
+ index = minIndexesBuffer[number];
+ }
+ }
+
+ number = eighthPoints * 8;
+ for (; number < num_points; number++) {
+ if (src0[number] < min) {
+ index = number;
+ min = src0[number];
+ }
+ }
+ target[0] = (uint16_t)index;
+}
+
+#endif /*LV_HAVE_AVX*/
+
+#endif /*INCLUDED_volk_32f_index_min_16u_u_H*/
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2021 Free Software Foundation, Inc.
+ *
+ * This file is part of GNU Radio
+ *
+ * GNU Radio is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ *
+ * GNU Radio is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU Radio; see the file COPYING. If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street,
+ * Boston, MA 02110-1301, USA.
+ */
+
+/*!
+ * \page volk_32f_index_min_32u
+ *
+ * \b Overview
+ *
+ * Returns Argmin_i x[i]. Finds and returns the index which contains the first minimum
+ * value in the given vector.
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_32f_index_min_32u(uint32_t* target, const float* src0, uint32_t num_points)
+ * \endcode
+ *
+ * \b Inputs
+ * \li src0: The input vector of floats.
+ * \li num_points: The number of data points.
+ *
+ * \b Outputs
+ * \li target: The index of the first minimum value in the input buffer.
+ *
+ * \b Example
+ * \code
+ * int N = 10;
+ * uint32_t alignment = volk_get_alignment();
+ * float* in = (float*)volk_malloc(sizeof(float)*N, alignment);
+ * uint32_t* out = (uint32_t*)volk_malloc(sizeof(uint32_t), alignment);
+ *
+ * for(uint32_t ii = 0; ii < N; ++ii){
+ * float x = (float)ii;
+ * // a parabola with a minimum at x=4
+ * in[ii] = (x-4) * (x-4) - 5;
+ * }
+ *
+ * volk_32f_index_min_32u(out, in, N);
+ *
+ * printf("minimum is %1.2f at index %u\n", in[*out], *out);
+ *
+ * volk_free(in);
+ * volk_free(out);
+ * \endcode
+ */
+
+#ifndef INCLUDED_volk_32f_index_min_32u_a_H
+#define INCLUDED_volk_32f_index_min_32u_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <volk/volk_common.h>
+
+#ifdef LV_HAVE_SSE4_1
+#include <smmintrin.h>
+
+static inline void
+volk_32f_index_min_32u_a_sse4_1(uint32_t* target, const float* src0, uint32_t num_points)
+{
+ if (num_points > 0) {
+ uint32_t number = 0;
+ const uint32_t quarterPoints = num_points / 4;
+
+ float* inputPtr = (float*)src0;
+
+ __m128 indexIncrementValues = _mm_set1_ps(4);
+ __m128 currentIndexes = _mm_set_ps(-1, -2, -3, -4);
+
+ float min = src0[0];
+ float index = 0;
+ __m128 minValues = _mm_set1_ps(min);
+ __m128 minValuesIndex = _mm_setzero_ps();
+ __m128 compareResults;
+ __m128 currentValues;
+
+ __VOLK_ATTR_ALIGNED(16) float minValuesBuffer[4];
+ __VOLK_ATTR_ALIGNED(16) float minIndexesBuffer[4];
+
+ for (; number < quarterPoints; number++) {
+
+ currentValues = _mm_load_ps(inputPtr);
+ inputPtr += 4;
+ currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
+
+ compareResults = _mm_cmplt_ps(currentValues, minValues);
+
+ minValuesIndex =
+ _mm_blendv_ps(minValuesIndex, currentIndexes, compareResults);
+ minValues = _mm_blendv_ps(minValues, currentValues, compareResults);
+ }
+
+ // Calculate the smallest value from the remaining 4 points
+ _mm_store_ps(minValuesBuffer, minValues);
+ _mm_store_ps(minIndexesBuffer, minValuesIndex);
+
+ for (number = 0; number < 4; number++) {
+ if (minValuesBuffer[number] < min) {
+ index = minIndexesBuffer[number];
+ min = minValuesBuffer[number];
+ } else if (minValuesBuffer[number] == min) {
+ if (index > minIndexesBuffer[number])
+ index = minIndexesBuffer[number];
+ }
+ }
+
+ number = quarterPoints * 4;
+ for (; number < num_points; number++) {
+ if (src0[number] < min) {
+ index = number;
+ min = src0[number];
+ }
+ }
+ target[0] = (uint32_t)index;
+ }
+}
+
+#endif /*LV_HAVE_SSE4_1*/
+
+
+#ifdef LV_HAVE_SSE
+
+#include <xmmintrin.h>
+
+static inline void
+volk_32f_index_min_32u_a_sse(uint32_t* target, const float* src0, uint32_t num_points)
+{
+ if (num_points > 0) {
+ uint32_t number = 0;
+ const uint32_t quarterPoints = num_points / 4;
+
+ float* inputPtr = (float*)src0;
+
+ __m128 indexIncrementValues = _mm_set1_ps(4);
+ __m128 currentIndexes = _mm_set_ps(-1, -2, -3, -4);
+
+ float min = src0[0];
+ float index = 0;
+ __m128 minValues = _mm_set1_ps(min);
+ __m128 minValuesIndex = _mm_setzero_ps();
+ __m128 compareResults;
+ __m128 currentValues;
+
+ __VOLK_ATTR_ALIGNED(16) float minValuesBuffer[4];
+ __VOLK_ATTR_ALIGNED(16) float minIndexesBuffer[4];
+
+ for (; number < quarterPoints; number++) {
+
+ currentValues = _mm_load_ps(inputPtr);
+ inputPtr += 4;
+ currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
+
+ compareResults = _mm_cmplt_ps(currentValues, minValues);
+
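+ /* Branchless select: (mask & new) | (~mask & old) emulates blendv on plain SSE */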
+ minValuesIndex = _mm_or_ps(_mm_and_ps(compareResults, currentIndexes),
+ _mm_andnot_ps(compareResults, minValuesIndex));
+
+ minValues = _mm_or_ps(_mm_and_ps(compareResults, currentValues),
+ _mm_andnot_ps(compareResults, minValues));
+ }
+
+ // Calculate the smallest value from the remaining 4 points
+ _mm_store_ps(minValuesBuffer, minValues);
+ _mm_store_ps(minIndexesBuffer, minValuesIndex);
+
+ for (number = 0; number < 4; number++) {
+ if (minValuesBuffer[number] < min) {
+ index = minIndexesBuffer[number];
+ min = minValuesBuffer[number];
+ } else if (minValuesBuffer[number] == min) {
+ if (index > minIndexesBuffer[number])
+ index = minIndexesBuffer[number];
+ }
+ }
+
+ number = quarterPoints * 4;
+ for (; number < num_points; number++) {
+ if (src0[number] < min) {
+ index = number;
+ min = src0[number];
+ }
+ }
+ target[0] = (uint32_t)index;
+ }
+}
+
+#endif /*LV_HAVE_SSE*/
+
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+
+static inline void
+volk_32f_index_min_32u_a_avx(uint32_t* target, const float* src0, uint32_t num_points)
+{
+ if (num_points > 0) {
+ uint32_t number = 0;
+ const uint32_t eighthPoints = num_points / 8;
+
+ float* inputPtr = (float*)src0;
+
+ __m256 indexIncrementValues = _mm256_set1_ps(8);
+ __m256 currentIndexes = _mm256_set_ps(-1, -2, -3, -4, -5, -6, -7, -8);
+
+ float min = src0[0];
+ float index = 0;
+ __m256 minValues = _mm256_set1_ps(min);
+ __m256 minValuesIndex = _mm256_setzero_ps();
+ __m256 compareResults;
+ __m256 currentValues;
+
+ __VOLK_ATTR_ALIGNED(32) float minValuesBuffer[8];
+ __VOLK_ATTR_ALIGNED(32) float minIndexesBuffer[8];
+
+ for (; number < eighthPoints; number++) {
+ currentValues = _mm256_load_ps(inputPtr);
+ inputPtr += 8;
+ currentIndexes = _mm256_add_ps(currentIndexes, indexIncrementValues);
+ compareResults = _mm256_cmp_ps(currentValues, minValues, _CMP_LT_OS);
+ minValuesIndex =
+ _mm256_blendv_ps(minValuesIndex, currentIndexes, compareResults);
+ minValues = _mm256_blendv_ps(minValues, currentValues, compareResults);
+ }
+
+ // Calculate the smallest value from the remaining 8 points
+ _mm256_store_ps(minValuesBuffer, minValues);
+ _mm256_store_ps(minIndexesBuffer, minValuesIndex);
+
+ for (number = 0; number < 8; number++) {
+ if (minValuesBuffer[number] < min) {
+ index = minIndexesBuffer[number];
+ min = minValuesBuffer[number];
+ } else if (minValuesBuffer[number] == min) {
+ if (index > minIndexesBuffer[number])
+ index = minIndexesBuffer[number];
+ }
+ }
+
+ number = eighthPoints * 8;
+ for (; number < num_points; number++) {
+ if (src0[number] < min) {
+ index = number;
+ min = src0[number];
+ }
+ }
+ target[0] = (uint32_t)index;
+ }
+}
+
+#endif /*LV_HAVE_AVX*/
+
+
+#ifdef LV_HAVE_NEON
+#include <arm_neon.h>
+
+static inline void
+volk_32f_index_min_32u_neon(uint32_t* target, const float* src0, uint32_t num_points)
+{
+ if (num_points > 0) {
+ uint32_t number = 0;
+ const uint32_t quarterPoints = num_points / 4;
+
+ float* inputPtr = (float*)src0;
+ float32x4_t indexIncrementValues = vdupq_n_f32(4);
+ __VOLK_ATTR_ALIGNED(16)
+ float currentIndexes_float[4] = { -4.0f, -3.0f, -2.0f, -1.0f };
+ float32x4_t currentIndexes = vld1q_f32(currentIndexes_float);
+
+ float min = src0[0];
+ float index = 0;
+ float32x4_t minValues = vdupq_n_f32(min);
+ uint32x4_t minValuesIndex = vmovq_n_u32(0);
+ uint32x4_t compareResults;
+ uint32x4_t currentIndexes_u;
+ float32x4_t currentValues;
+
+ __VOLK_ATTR_ALIGNED(16) float minValuesBuffer[4];
+ __VOLK_ATTR_ALIGNED(16) float minIndexesBuffer[4];
+
+ for (; number < quarterPoints; number++) {
+ currentValues = vld1q_f32(inputPtr);
+ inputPtr += 4;
+ currentIndexes = vaddq_f32(currentIndexes, indexIncrementValues);
+ currentIndexes_u = vcvtq_u32_f32(currentIndexes);
+ compareResults = vcgeq_f32(currentValues, minValues);
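+ /* current >= min keeps the old index (vand); otherwise take the new one (vbic) */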
+ minValuesIndex = vorrq_u32(vandq_u32(compareResults, minValuesIndex),
+ vbicq_u32(currentIndexes_u, compareResults));
+ minValues = vminq_f32(currentValues, minValues);
+ }
+
+ // Calculate the smallest value from the remaining 4 points
+ vst1q_f32(minValuesBuffer, minValues);
+ vst1q_f32(minIndexesBuffer, vcvtq_f32_u32(minValuesIndex));
+ for (number = 0; number < 4; number++) {
+ if (minValuesBuffer[number] < min) {
+ index = minIndexesBuffer[number];
+ min = minValuesBuffer[number];
+ } else if (minValuesBuffer[number] == min) {
+ if (index > minIndexesBuffer[number])
+ index = minIndexesBuffer[number];
+ }
+ }
+
+ number = quarterPoints * 4;
+ for (; number < num_points; number++) {
+ if (src0[number] < min) {
+ index = number;
+ min = src0[number];
+ }
+ }
+ target[0] = (uint32_t)index;
+ }
+}
+
+#endif /*LV_HAVE_NEON*/
+
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void
+volk_32f_index_min_32u_generic(uint32_t* target, const float* src0, uint32_t num_points)
+{
+ if (num_points > 0) {
+ float min = src0[0];
+ uint32_t index = 0;
+
+ uint32_t i = 1;
+
+ for (; i < num_points; ++i) {
+ if (src0[i] < min) {
+ index = i;
+ min = src0[i];
+ }
+ }
+ target[0] = index;
+ }
+}
+
+#endif /*LV_HAVE_GENERIC*/
+
+
+#endif /*INCLUDED_volk_32f_index_min_32u_a_H*/
+
+
+#ifndef INCLUDED_volk_32f_index_min_32u_u_H
+#define INCLUDED_volk_32f_index_min_32u_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <volk/volk_common.h>
+
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+
+static inline void
+volk_32f_index_min_32u_u_avx(uint32_t* target, const float* src0, uint32_t num_points)
+{
+ if (num_points > 0) {
+ uint32_t number = 0;
+ const uint32_t eighthPoints = num_points / 8;
+
+ float* inputPtr = (float*)src0;
+
+ __m256 indexIncrementValues = _mm256_set1_ps(8);
+ __m256 currentIndexes = _mm256_set_ps(-1, -2, -3, -4, -5, -6, -7, -8);
+
+ float min = src0[0];
+ float index = 0;
+ __m256 minValues = _mm256_set1_ps(min);
+ __m256 minValuesIndex = _mm256_setzero_ps();
+ __m256 compareResults;
+ __m256 currentValues;
+
+ __VOLK_ATTR_ALIGNED(32) float minValuesBuffer[8];
+ __VOLK_ATTR_ALIGNED(32) float minIndexesBuffer[8];
+
+ for (; number < eighthPoints; number++) {
+ currentValues = _mm256_loadu_ps(inputPtr);
+ inputPtr += 8;
+ currentIndexes = _mm256_add_ps(currentIndexes, indexIncrementValues);
+ compareResults = _mm256_cmp_ps(currentValues, minValues, _CMP_LT_OS);
+ minValuesIndex =
+ _mm256_blendv_ps(minValuesIndex, currentIndexes, compareResults);
+ minValues = _mm256_blendv_ps(minValues, currentValues, compareResults);
+ }
+
+ // Calculate the smallest value from the remaining 8 points
+ _mm256_store_ps(minValuesBuffer, minValues);
+ _mm256_store_ps(minIndexesBuffer, minValuesIndex);
+
+ for (number = 0; number < 8; number++) {
+ if (minValuesBuffer[number] < min) {
+ index = minIndexesBuffer[number];
+ min = minValuesBuffer[number];
+ } else if (minValuesBuffer[number] == min) {
+ if (index > minIndexesBuffer[number])
+ index = minIndexesBuffer[number];
+ }
+ }
+
+ number = eighthPoints * 8;
+ for (; number < num_points; number++) {
+ if (src0[number] < min) {
+ index = number;
+ min = src0[number];
+ }
+ }
+ target[0] = (uint32_t)index;
+ }
+}
+
+#endif /*LV_HAVE_AVX*/
+
+
+#ifdef LV_HAVE_SSE4_1
+#include <smmintrin.h>
+
+static inline void
+volk_32f_index_min_32u_u_sse4_1(uint32_t* target, const float* src0, uint32_t num_points)
+{
+ if (num_points > 0) {
+ uint32_t number = 0;
+ const uint32_t quarterPoints = num_points / 4;
+
+ float* inputPtr = (float*)src0;
+
+ __m128 indexIncrementValues = _mm_set1_ps(4);
+ __m128 currentIndexes = _mm_set_ps(-1, -2, -3, -4);
+
+ float min = src0[0];
+ float index = 0;
+ __m128 minValues = _mm_set1_ps(min);
+ __m128 minValuesIndex = _mm_setzero_ps();
+ __m128 compareResults;
+ __m128 currentValues;
+
+ __VOLK_ATTR_ALIGNED(16) float minValuesBuffer[4];
+ __VOLK_ATTR_ALIGNED(16) float minIndexesBuffer[4];
+
+ for (; number < quarterPoints; number++) {
+ currentValues = _mm_loadu_ps(inputPtr);
+ inputPtr += 4;
+ currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
+ compareResults = _mm_cmplt_ps(currentValues, minValues);
+ minValuesIndex =
+ _mm_blendv_ps(minValuesIndex, currentIndexes, compareResults);
+ minValues = _mm_blendv_ps(minValues, currentValues, compareResults);
+ }
+
+ // Calculate the smallest value from the remaining 4 points
+ _mm_store_ps(minValuesBuffer, minValues);
+ _mm_store_ps(minIndexesBuffer, minValuesIndex);
+
+ for (number = 0; number < 4; number++) {
+ if (minValuesBuffer[number] < min) {
+ index = minIndexesBuffer[number];
+ min = minValuesBuffer[number];
+ } else if (minValuesBuffer[number] == min) {
+ if (index > minIndexesBuffer[number])
+ index = minIndexesBuffer[number];
+ }
+ }
+
+ number = quarterPoints * 4;
+ for (; number < num_points; number++) {
+ if (src0[number] < min) {
+ index = number;
+ min = src0[number];
+ }
+ }
+ target[0] = (uint32_t)index;
+ }
+}
+
+#endif /*LV_HAVE_SSE4_1*/
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+
+static inline void
+volk_32f_index_min_32u_u_sse(uint32_t* target, const float* src0, uint32_t num_points)
+{
+ if (num_points > 0) {
+ uint32_t number = 0;
+ const uint32_t quarterPoints = num_points / 4;
+
+ float* inputPtr = (float*)src0;
+
+ __m128 indexIncrementValues = _mm_set1_ps(4);
+ __m128 currentIndexes = _mm_set_ps(-1, -2, -3, -4);
+
+ float min = src0[0];
+ float index = 0;
+ __m128 minValues = _mm_set1_ps(min);
+ __m128 minValuesIndex = _mm_setzero_ps();
+ __m128 compareResults;
+ __m128 currentValues;
+
+ __VOLK_ATTR_ALIGNED(16) float minValuesBuffer[4];
+ __VOLK_ATTR_ALIGNED(16) float minIndexesBuffer[4];
+
+ for (; number < quarterPoints; number++) {
+ currentValues = _mm_loadu_ps(inputPtr);
+ inputPtr += 4;
+ currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
+ compareResults = _mm_cmplt_ps(currentValues, minValues);
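+ /* Branchless select: (mask & new) | (~mask & old) emulates blendv on plain SSE */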
+ minValuesIndex = _mm_or_ps(_mm_and_ps(compareResults, currentIndexes),
+ _mm_andnot_ps(compareResults, minValuesIndex));
+ minValues = _mm_or_ps(_mm_and_ps(compareResults, currentValues),
+ _mm_andnot_ps(compareResults, minValues));
+ }
+
+ // Calculate the smallest value from the remaining 4 points
+ _mm_store_ps(minValuesBuffer, minValues);
+ _mm_store_ps(minIndexesBuffer, minValuesIndex);
+
+ for (number = 0; number < 4; number++) {
+ if (minValuesBuffer[number] < min) {
+ index = minIndexesBuffer[number];
+ min = minValuesBuffer[number];
+ } else if (minValuesBuffer[number] == min) {
+ if (index > minIndexesBuffer[number])
+ index = minIndexesBuffer[number];
+ }
+ }
+
+ number = quarterPoints * 4;
+ for (; number < num_points; number++) {
+ if (src0[number] < min) {
+ index = number;
+ min = src0[number];
+ }
+ }
+ target[0] = (uint32_t)index;
+ }
+}
+
+#endif /*LV_HAVE_SSE*/
+
+#endif /*INCLUDED_volk_32f_index_min_32u_u_H*/
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2021 Free Software Foundation, Inc.
+ *
+ * This file is part of GNU Radio
+ *
+ * GNU Radio is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ *
+ * GNU Radio is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU Radio; see the file COPYING. If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street,
+ * Boston, MA 02110-1301, USA.
+ */
+
+/*!
+ * \page volk_32fc_index_min_16u
+ *
+ * \b Overview
+ *
+ * Returns Argmin_i mag(x[i]). Finds and returns the index which contains the
+ * minimum magnitude for complex points in the given vector.
+ *
+ * Note that num_points is a uint32_t, but the return value is
+ * uint16_t. Providing a vector larger than the max of a uint16_t
+ * (65535) would miss anything beyond that boundary. The kernel
+ * therefore caps num_points at this maximum value.
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_32fc_index_min_16u(uint16_t* target, lv_32fc_t* src0, uint32_t num_points)
+ * \endcode
+ *
+ * \b Inputs
+ * \li src0: The complex input vector.
+ * \li num_points: The number of samples.
+ *
+ * \b Outputs
+ * \li target: The index of the point with minimum magnitude.
+ *
+ * \b Example
+ * Calculate the index of the minimum value of \f$x^2 + x\f$ for points around
+ * the unit circle.
+ * \code
+ * int N = 10;
+ * uint32_t alignment = volk_get_alignment();
+ * lv_32fc_t* in = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t)*N, alignment);
+ * uint16_t* min = (uint16_t*)volk_malloc(sizeof(uint16_t), alignment);
+ *
+ * for(uint32_t ii = 0; ii < N/2; ++ii){
+ * float real = 2.f * ((float)ii / (float)N) - 1.f;
+ * float imag = std::sqrt(1.f - real * real);
+ * in[ii] = lv_cmake(real, imag);
+ * in[ii] = in[ii] * in[ii] + in[ii];
+ * in[N-ii-1] = lv_cmake(real, imag);
+ * in[N-ii-1] = in[N-ii-1] * in[N-ii-1] + in[N-ii-1];
+ * }
+ *
+ * volk_32fc_index_min_16u(min, in, N);
+ *
+ * printf("index of min value = %u\n", *min);
+ *
+ * volk_free(in);
+ * volk_free(min);
+ * \endcode
+ */
+
+#ifndef INCLUDED_volk_32fc_index_min_16u_a_H
+#define INCLUDED_volk_32fc_index_min_16u_a_H
+
+#include <float.h>
+#include <inttypes.h>
+#include <limits.h>
+#include <stdio.h>
+#include <volk/volk_common.h>
+#include <volk/volk_complex.h>
+
+#ifdef LV_HAVE_AVX2
+#include <immintrin.h>
+#include <volk/volk_avx2_intrinsics.h>
+
+static inline void volk_32fc_index_min_16u_a_avx2_variant_0(uint16_t* target,
+ lv_32fc_t* src0,
+ uint32_t num_points)
+{
+ num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
+
+ const __m256i indices_increment = _mm256_set1_epi32(8);
+ /*
+ * At the start of each loop iteration current_indices holds the indices of
+ * the complex numbers loaded from memory. Explanation for odd order is given
+ * in implementation of vector_32fc_index_min_variant0().
+ */
+ __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
+
+ __m256 min_values = _mm256_set1_ps(FLT_MAX);
+ __m256i min_indices = _mm256_setzero_si256();
+
+ for (unsigned i = 0; i < num_points / 8u; ++i) {
+ __m256 in0 = _mm256_load_ps((float*)src0);
+ __m256 in1 = _mm256_load_ps((float*)(src0 + 4));
+ vector_32fc_index_min_variant0(
+ in0, in1, &min_values, &min_indices, &current_indices, indices_increment);
+ src0 += 8;
+ }
+
+ // determine minimum value and index in the result of the vectorized loop
+ __VOLK_ATTR_ALIGNED(32) float min_values_buffer[8];
+ __VOLK_ATTR_ALIGNED(32) uint32_t min_indices_buffer[8];
+ _mm256_store_ps(min_values_buffer, min_values);
+ _mm256_store_si256((__m256i*)min_indices_buffer, min_indices);
+
+ float min = FLT_MAX;
+ uint32_t index = 0;
+ for (unsigned i = 0; i < 8; i++) {
+ if (min_values_buffer[i] < min) {
+ min = min_values_buffer[i];
+ index = min_indices_buffer[i];
+ }
+ }
+
+ // handle tail not processed by the vectorized loop
+ for (unsigned i = num_points & (~7u); i < num_points; ++i) {
+ const float abs_squared =
+ lv_creal(*src0) * lv_creal(*src0) + lv_cimag(*src0) * lv_cimag(*src0);
+ if (abs_squared < min) {
+ min = abs_squared;
+ index = i;
+ }
+ ++src0;
+ }
+
+ *target = index;
+}
+
+#endif /*LV_HAVE_AVX2*/
+
+#ifdef LV_HAVE_AVX2
+#include <immintrin.h>
+#include <volk/volk_avx2_intrinsics.h>
+
+static inline void volk_32fc_index_min_16u_a_avx2_variant_1(uint16_t* target,
+ lv_32fc_t* src0,
+ uint32_t num_points)
+{
+ num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
+
+ const __m256i indices_increment = _mm256_set1_epi32(8);
+ /*
+ * At the start of each loop iteration current_indices holds the indices of
+ * the complex numbers loaded from memory. Explanation for odd order is given
+ * in implementation of vector_32fc_index_min_variant0().
+ */
+ __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
+
+ __m256 min_values = _mm256_set1_ps(FLT_MAX);
+ __m256i min_indices = _mm256_setzero_si256();
+
+ for (unsigned i = 0; i < num_points / 8u; ++i) {
+ __m256 in0 = _mm256_load_ps((float*)src0);
+ __m256 in1 = _mm256_load_ps((float*)(src0 + 4));
+ vector_32fc_index_min_variant1(
+ in0, in1, &min_values, &min_indices, &current_indices, indices_increment);
+ src0 += 8;
+ }
+
+ // determine minimum value and index in the result of the vectorized loop
+ __VOLK_ATTR_ALIGNED(32) float min_values_buffer[8];
+ __VOLK_ATTR_ALIGNED(32) uint32_t min_indices_buffer[8];
+ _mm256_store_ps(min_values_buffer, min_values);
+ _mm256_store_si256((__m256i*)min_indices_buffer, min_indices);
+
+ float min = FLT_MAX;
+ uint32_t index = 0;
+ for (unsigned i = 0; i < 8; i++) {
+ if (min_values_buffer[i] < min) {
+ min = min_values_buffer[i];
+ index = min_indices_buffer[i];
+ }
+ }
+
+ // handle tail not processed by the vectorized loop
+ for (unsigned i = num_points & (~7u); i < num_points; ++i) {
+ const float abs_squared =
+ lv_creal(*src0) * lv_creal(*src0) + lv_cimag(*src0) * lv_cimag(*src0);
+ if (abs_squared < min) {
+ min = abs_squared;
+ index = i;
+ }
+ ++src0;
+ }
+
+ *target = index;
+}
+
+#endif /*LV_HAVE_AVX2*/
+
+#ifdef LV_HAVE_SSE3
+#include <pmmintrin.h>
+#include <xmmintrin.h>
+
+static inline void
+volk_32fc_index_min_16u_a_sse3(uint16_t* target, lv_32fc_t* src0, uint32_t num_points)
+{
+ num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
+ const uint32_t num_bytes = num_points * 8;
+
+ union bit128 holderf;
+ union bit128 holderi;
+ float sq_dist = 0.0;
+
+ union bit128 xmm5, xmm4;
+ __m128 xmm1, xmm2, xmm3;
+ __m128i xmm8, xmm11, xmm12, xmm9, xmm10;
+
+ xmm5.int_vec = _mm_setzero_si128();
+ xmm4.int_vec = _mm_setzero_si128();
+ holderf.int_vec = _mm_setzero_si128();
+ holderi.int_vec = _mm_setzero_si128();
+
+ int bound = num_bytes >> 5;
+ int i = 0;
+
+ xmm8 = _mm_setr_epi32(0, 1, 2, 3);
+ xmm9 = _mm_setzero_si128();
+ xmm10 = _mm_setr_epi32(4, 4, 4, 4);
+ xmm3 = _mm_set_ps1(FLT_MAX);
+
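+ /*
+ * Running state: xmm3 holds the four smallest squared magnitudes seen so
+ * far, xmm9 their indices, and xmm8 the indices of the four magnitudes
+ * computed in the current iteration; the cmpeq/cmpgt masks below merge
+ * new and old indices into xmm9.
+ */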
+ for (; i < bound; ++i) {
+ xmm1 = _mm_load_ps((float*)src0);
+ xmm2 = _mm_load_ps((float*)&src0[2]);
+
+ src0 += 4;
+
+ xmm1 = _mm_mul_ps(xmm1, xmm1);
+ xmm2 = _mm_mul_ps(xmm2, xmm2);
+
+ xmm1 = _mm_hadd_ps(xmm1, xmm2);
+
+ xmm3 = _mm_min_ps(xmm1, xmm3);
+
+ xmm4.float_vec = _mm_cmpgt_ps(xmm1, xmm3);
+ xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);
+
+ xmm11 = _mm_and_si128(xmm8, xmm5.int_vec);
+ xmm12 = _mm_and_si128(xmm9, xmm4.int_vec);
+
+ xmm9 = _mm_add_epi32(xmm11, xmm12);
+
+ xmm8 = _mm_add_epi32(xmm8, xmm10);
+ }
+
+ if (num_bytes >> 4 & 1) {
+ xmm2 = _mm_load_ps((float*)src0);
+
+ xmm1 = _mm_movelh_ps(bit128_p(&xmm8)->float_vec, bit128_p(&xmm8)->float_vec);
+ xmm8 = bit128_p(&xmm1)->int_vec;
+
+ xmm2 = _mm_mul_ps(xmm2, xmm2);
+
+ src0 += 2;
+
+ xmm1 = _mm_hadd_ps(xmm2, xmm2);
+
+ xmm3 = _mm_min_ps(xmm1, xmm3);
+
+ xmm10 = _mm_setr_epi32(2, 2, 2, 2);
+
+ xmm4.float_vec = _mm_cmpgt_ps(xmm1, xmm3);
+ xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);
+
+ xmm11 = _mm_and_si128(xmm8, xmm5.int_vec);
+ xmm12 = _mm_and_si128(xmm9, xmm4.int_vec);
+
+ xmm9 = _mm_add_epi32(xmm11, xmm12);
+
+ xmm8 = _mm_add_epi32(xmm8, xmm10);
+ }
+
+ if (num_bytes >> 3 & 1) {
+ sq_dist =
+ lv_creal(src0[0]) * lv_creal(src0[0]) + lv_cimag(src0[0]) * lv_cimag(src0[0]);
+
+ xmm2 = _mm_load1_ps(&sq_dist);
+
+ xmm1 = xmm3;
+
+ xmm3 = _mm_min_ss(xmm3, xmm2);
+
+ xmm4.float_vec = _mm_cmpgt_ps(xmm1, xmm3);
+ xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);
+
+ xmm8 = _mm_shuffle_epi32(xmm8, 0x00);
+
+ xmm11 = _mm_and_si128(xmm8, xmm4.int_vec);
+ xmm12 = _mm_and_si128(xmm9, xmm5.int_vec);
+
+ xmm9 = _mm_add_epi32(xmm11, xmm12);
+ }
+
+ _mm_store_ps((float*)&(holderf.f), xmm3);
+ _mm_store_si128(&(holderi.int_vec), xmm9);
+
+ target[0] = holderi.i[0];
+ sq_dist = holderf.f[0];
+ target[0] = (holderf.f[1] < sq_dist) ? holderi.i[1] : target[0];
+ sq_dist = (holderf.f[1] < sq_dist) ? holderf.f[1] : sq_dist;
+ target[0] = (holderf.f[2] < sq_dist) ? holderi.i[2] : target[0];
+ sq_dist = (holderf.f[2] < sq_dist) ? holderf.f[2] : sq_dist;
+ target[0] = (holderf.f[3] < sq_dist) ? holderi.i[3] : target[0];
+ sq_dist = (holderf.f[3] < sq_dist) ? holderf.f[3] : sq_dist;
+}
+
+#endif /*LV_HAVE_SSE3*/
+
+#ifdef LV_HAVE_GENERIC
+static inline void
+volk_32fc_index_min_16u_generic(uint16_t* target, lv_32fc_t* src0, uint32_t num_points)
+{
+ num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
+
+ const uint32_t num_bytes = num_points * 8;
+
+ float sq_dist = 0.0;
+ float min = FLT_MAX;
+ uint16_t index = 0;
+
+ uint32_t i = 0;
+
+ for (; i < num_bytes >> 3; ++i) {
+ sq_dist =
+ lv_creal(src0[i]) * lv_creal(src0[i]) + lv_cimag(src0[i]) * lv_cimag(src0[i]);
+
+ if (sq_dist < min) {
+ index = i;
+ min = sq_dist;
+ }
+ }
+ target[0] = index;
+}
+
+#endif /*LV_HAVE_GENERIC*/
+
+#endif /*INCLUDED_volk_32fc_index_min_16u_a_H*/
+
+#ifndef INCLUDED_volk_32fc_index_min_16u_u_H
+#define INCLUDED_volk_32fc_index_min_16u_u_H
+
+#include <float.h>
+#include <inttypes.h>
+#include <limits.h>
+#include <stdio.h>
+#include <volk/volk_common.h>
+#include <volk/volk_complex.h>
+
+#ifdef LV_HAVE_AVX2
+#include <immintrin.h>
+#include <volk/volk_avx2_intrinsics.h>
+
+static inline void volk_32fc_index_min_16u_u_avx2_variant_0(uint16_t* target,
+ lv_32fc_t* src0,
+ uint32_t num_points)
+{
+ num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
+
+ const __m256i indices_increment = _mm256_set1_epi32(8);
+ /*
+ * At the start of each loop iteration current_indices holds the indices of
+ * the complex numbers loaded from memory. Explanation for odd order is given
+ * in implementation of vector_32fc_index_min_variant0().
+ */
+ __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
+
+ __m256 min_values = _mm256_set1_ps(FLT_MAX);
+ __m256i min_indices = _mm256_setzero_si256();
+
+ for (unsigned i = 0; i < num_points / 8u; ++i) {
+ __m256 in0 = _mm256_loadu_ps((float*)src0);
+ __m256 in1 = _mm256_loadu_ps((float*)(src0 + 4));
+ vector_32fc_index_min_variant0(
+ in0, in1, &min_values, &min_indices, &current_indices, indices_increment);
+ src0 += 8;
+ }
+
+ // determine minimum value and index in the result of the vectorized loop
+ __VOLK_ATTR_ALIGNED(32) float min_values_buffer[8];
+ __VOLK_ATTR_ALIGNED(32) uint32_t min_indices_buffer[8];
+ _mm256_store_ps(min_values_buffer, min_values);
+ _mm256_store_si256((__m256i*)min_indices_buffer, min_indices);
+
+ float min = FLT_MAX;
+ uint32_t index = 0;
+ for (unsigned i = 0; i < 8; i++) {
+ if (min_values_buffer[i] < min) {
+ min = min_values_buffer[i];
+ index = min_indices_buffer[i];
+ }
+ }
+
+ // handle tail not processed by the vectorized loop
+ for (unsigned i = num_points & (~7u); i < num_points; ++i) {
+ const float abs_squared =
+ lv_creal(*src0) * lv_creal(*src0) + lv_cimag(*src0) * lv_cimag(*src0);
+ if (abs_squared < min) {
+ min = abs_squared;
+ index = i;
+ }
+ ++src0;
+ }
+
+ *target = index;
+}
+
+#endif /*LV_HAVE_AVX2*/
+
+#ifdef LV_HAVE_AVX2
+#include <immintrin.h>
+#include <volk/volk_avx2_intrinsics.h>
+
+static inline void volk_32fc_index_min_16u_u_avx2_variant_1(uint16_t* target,
+ lv_32fc_t* src0,
+ uint32_t num_points)
+{
+ num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
+
+ const __m256i indices_increment = _mm256_set1_epi32(8);
+ /*
+ * At the start of each loop iteration current_indices holds the indices of
+ * the complex numbers loaded from memory. Explanation for odd order is given
+ * in implementation of vector_32fc_index_min_variant0().
+ */
+ __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
+
+ __m256 min_values = _mm256_set1_ps(FLT_MAX);
+ __m256i min_indices = _mm256_setzero_si256();
+
+ for (unsigned i = 0; i < num_points / 8u; ++i) {
+ __m256 in0 = _mm256_loadu_ps((float*)src0);
+ __m256 in1 = _mm256_loadu_ps((float*)(src0 + 4));
+ vector_32fc_index_min_variant1(
+ in0, in1, &min_values, &min_indices, &current_indices, indices_increment);
+ src0 += 8;
+ }
+
+ // determine minimum value and index in the result of the vectorized loop
+ __VOLK_ATTR_ALIGNED(32) float min_values_buffer[8];
+ __VOLK_ATTR_ALIGNED(32) uint32_t min_indices_buffer[8];
+ _mm256_store_ps(min_values_buffer, min_values);
+ _mm256_store_si256((__m256i*)min_indices_buffer, min_indices);
+
+ float min = FLT_MAX;
+ uint32_t index = 0;
+ for (unsigned i = 0; i < 8; i++) {
+ if (min_values_buffer[i] < min) {
+ min = min_values_buffer[i];
+ index = min_indices_buffer[i];
+ }
+ }
+
+ // handle tail not processed by the vectorized loop
+ for (unsigned i = num_points & (~7u); i < num_points; ++i) {
+ const float abs_squared =
+ lv_creal(*src0) * lv_creal(*src0) + lv_cimag(*src0) * lv_cimag(*src0);
+ if (abs_squared < min) {
+ min = abs_squared;
+ index = i;
+ }
+ ++src0;
+ }
+
+ *target = index;
+}
+
+#endif /*LV_HAVE_AVX2*/
+
+#endif /*INCLUDED_volk_32fc_index_min_16u_u_H*/
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2021 Free Software Foundation, Inc.
+ *
+ * This file is part of GNU Radio
+ *
+ * GNU Radio is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ *
+ * GNU Radio is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU Radio; see the file COPYING. If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street,
+ * Boston, MA 02110-1301, USA.
+ */
+
+/*!
+ * \page volk_32fc_index_min_32u
+ *
+ * \b Overview
+ *
+ * Returns Argmin_i mag(x[i]). Finds and returns the index which contains the
+ * minimum magnitude for complex points in the given vector.
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_32fc_index_min_32u(uint32_t* target, lv_32fc_t* src0, uint32_t num_points)
+ * \endcode
+ *
+ * \b Inputs
+ * \li src0: The complex input vector.
+ * \li num_points: The number of samples.
+ *
+ * \b Outputs
+ * \li target: The index of the point with minimum magnitude.
+ *
+ * \b Example
+ * Calculate the index of the minimum value of \f$x^2 + x\f$ for points around
+ * the unit circle.
+ * \code
+ * int N = 10;
+ * uint32_t alignment = volk_get_alignment();
+ * lv_32fc_t* in = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t)*N, alignment);
+ * uint32_t* min = (uint32_t*)volk_malloc(sizeof(uint32_t), alignment);
+ *
+ * for(uint32_t ii = 0; ii < N/2; ++ii){
+ * float real = 2.f * ((float)ii / (float)N) - 1.f;
+ * float imag = std::sqrt(1.f - real * real);
+ * in[ii] = lv_cmake(real, imag);
+ * in[ii] = in[ii] * in[ii] + in[ii];
+ * in[N-ii-1] = lv_cmake(real, imag);
+ * in[N-ii-1] = in[N-ii-1] * in[N-ii-1] + in[N-ii-1];
+ * }
+ *
+ * volk_32fc_index_min_32u(min, in, N);
+ *
+ * printf("index of min value = %u\n", *min);
+ *
+ * volk_free(in);
+ * volk_free(min);
+ * \endcode
+ */
+
+#ifndef INCLUDED_volk_32fc_index_min_32u_a_H
+#define INCLUDED_volk_32fc_index_min_32u_a_H
+
+#include <float.h>
+#include <inttypes.h>
+#include <stdio.h>
+#include <volk/volk_common.h>
+#include <volk/volk_complex.h>
+
+#ifdef LV_HAVE_AVX2
+#include <immintrin.h>
+#include <volk/volk_avx2_intrinsics.h>
+
+static inline void volk_32fc_index_min_32u_a_avx2_variant_0(uint32_t* target,
+ lv_32fc_t* src0,
+ uint32_t num_points)
+{
+ const __m256i indices_increment = _mm256_set1_epi32(8);
+ /*
+ * At the start of each loop iteration current_indices holds the indices of
+ * the complex numbers loaded from memory. Explanation for odd order is given
+ * in implementation of vector_32fc_index_min_variant0().
+ */
+ __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
+
+ __m256 min_values = _mm256_set1_ps(FLT_MAX);
+ __m256i min_indices = _mm256_setzero_si256();
+
+ for (unsigned i = 0; i < num_points / 8u; ++i) {
+ __m256 in0 = _mm256_load_ps((float*)src0);
+ __m256 in1 = _mm256_load_ps((float*)(src0 + 4));
+ vector_32fc_index_min_variant0(
+ in0, in1, &min_values, &min_indices, &current_indices, indices_increment);
+ src0 += 8;
+ }
+
+ // determine minimum value and index in the result of the vectorized loop
+ __VOLK_ATTR_ALIGNED(32) float min_values_buffer[8];
+ __VOLK_ATTR_ALIGNED(32) uint32_t min_indices_buffer[8];
+ _mm256_store_ps(min_values_buffer, min_values);
+ _mm256_store_si256((__m256i*)min_indices_buffer, min_indices);
+
+ float min = FLT_MAX;
+ uint32_t index = 0;
+ for (unsigned i = 0; i < 8; i++) {
+ if (min_values_buffer[i] < min) {
+ min = min_values_buffer[i];
+ index = min_indices_buffer[i];
+ }
+ }
+
+ // handle tail not processed by the vectorized loop
+ for (unsigned i = num_points & (~7u); i < num_points; ++i) {
+ const float abs_squared =
+ lv_creal(*src0) * lv_creal(*src0) + lv_cimag(*src0) * lv_cimag(*src0);
+ if (abs_squared < min) {
+ min = abs_squared;
+ index = i;
+ }
+ ++src0;
+ }
+
+ *target = index;
+}
+
+#endif /*LV_HAVE_AVX2*/
+
+#ifdef LV_HAVE_AVX2
+#include <immintrin.h>
+#include <volk/volk_avx2_intrinsics.h>
+
+static inline void volk_32fc_index_min_32u_a_avx2_variant_1(uint32_t* target,
+ lv_32fc_t* src0,
+ uint32_t num_points)
+{
+ const __m256i indices_increment = _mm256_set1_epi32(8);
+ /*
+ * At the start of each loop iteration current_indices holds the indices of
+ * the complex numbers loaded from memory. Explanation for odd order is given
+ * in implementation of vector_32fc_index_min_variant0().
+ */
+ __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
+
+ __m256 min_values = _mm256_set1_ps(FLT_MAX);
+ __m256i min_indices = _mm256_setzero_si256();
+
+ for (unsigned i = 0; i < num_points / 8u; ++i) {
+ __m256 in0 = _mm256_load_ps((float*)src0);
+ __m256 in1 = _mm256_load_ps((float*)(src0 + 4));
+ vector_32fc_index_min_variant1(
+ in0, in1, &min_values, &min_indices, &current_indices, indices_increment);
+ src0 += 8;
+ }
+
+ // determine minimum value and index in the result of the vectorized loop
+ __VOLK_ATTR_ALIGNED(32) float min_values_buffer[8];
+ __VOLK_ATTR_ALIGNED(32) uint32_t min_indices_buffer[8];
+ _mm256_store_ps(min_values_buffer, min_values);
+ _mm256_store_si256((__m256i*)min_indices_buffer, min_indices);
+
+ float min = FLT_MAX;
+ uint32_t index = 0;
+ for (unsigned i = 0; i < 8; i++) {
+ if (min_values_buffer[i] < min) {
+ min = min_values_buffer[i];
+ index = min_indices_buffer[i];
+ }
+ }
+
+ // handle tail not processed by the vectorized loop
+ for (unsigned i = num_points & (~7u); i < num_points; ++i) {
+ const float abs_squared =
+ lv_creal(*src0) * lv_creal(*src0) + lv_cimag(*src0) * lv_cimag(*src0);
+ if (abs_squared < min) {
+ min = abs_squared;
+ index = i;
+ }
+ ++src0;
+ }
+
+ *target = index;
+}
+
+#endif /*LV_HAVE_AVX2*/
+
+#ifdef LV_HAVE_SSE3
+#include <pmmintrin.h>
+#include <xmmintrin.h>
+
+static inline void
+volk_32fc_index_min_32u_a_sse3(uint32_t* target, lv_32fc_t* src0, uint32_t num_points)
+{
+ const uint32_t num_bytes = num_points * 8;
+
+ union bit128 holderf;
+ union bit128 holderi;
+ float sq_dist = 0.0;
+
+ union bit128 xmm5, xmm4;
+ __m128 xmm1, xmm2, xmm3;
+ __m128i xmm8, xmm11, xmm12, xmm9, xmm10;
+
+ xmm5.int_vec = _mm_setzero_si128();
+ xmm4.int_vec = _mm_setzero_si128();
+ holderf.int_vec = _mm_setzero_si128();
+ holderi.int_vec = _mm_setzero_si128();
+
+ int bound = num_bytes >> 5;
+ int i = 0;
+
+ xmm8 = _mm_setr_epi32(0, 1, 2, 3);
+ xmm9 = _mm_setzero_si128();
+ xmm10 = _mm_setr_epi32(4, 4, 4, 4);
+ xmm3 = _mm_set_ps1(FLT_MAX);
+
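+ /*
+ * Running state: xmm3 holds the four smallest squared magnitudes seen so
+ * far, xmm9 their indices, and xmm8 the indices of the four magnitudes
+ * computed in the current iteration; the cmpeq/cmpgt masks below merge
+ * new and old indices into xmm9.
+ */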
+ for (; i < bound; ++i) {
+ xmm1 = _mm_load_ps((float*)src0);
+ xmm2 = _mm_load_ps((float*)&src0[2]);
+
+ src0 += 4;
+
+ xmm1 = _mm_mul_ps(xmm1, xmm1);
+ xmm2 = _mm_mul_ps(xmm2, xmm2);
+
+ xmm1 = _mm_hadd_ps(xmm1, xmm2);
+
+ xmm3 = _mm_min_ps(xmm1, xmm3);
+
+ xmm4.float_vec = _mm_cmpgt_ps(xmm1, xmm3);
+ xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);
+
+ xmm11 = _mm_and_si128(xmm8, xmm5.int_vec);
+ xmm12 = _mm_and_si128(xmm9, xmm4.int_vec);
+
+ xmm9 = _mm_add_epi32(xmm11, xmm12);
+
+ xmm8 = _mm_add_epi32(xmm8, xmm10);
+ }
+
+ if (num_bytes >> 4 & 1) {
+ xmm2 = _mm_load_ps((float*)src0);
+
+ xmm1 = _mm_movelh_ps(bit128_p(&xmm8)->float_vec, bit128_p(&xmm8)->float_vec);
+ xmm8 = bit128_p(&xmm1)->int_vec;
+
+ xmm2 = _mm_mul_ps(xmm2, xmm2);
+
+ src0 += 2;
+
+ xmm1 = _mm_hadd_ps(xmm2, xmm2);
+
+ xmm3 = _mm_min_ps(xmm1, xmm3);
+
+ xmm10 = _mm_setr_epi32(2, 2, 2, 2);
+
+ xmm4.float_vec = _mm_cmpgt_ps(xmm1, xmm3);
+ xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);
+
+ xmm11 = _mm_and_si128(xmm8, xmm5.int_vec);
+ xmm12 = _mm_and_si128(xmm9, xmm4.int_vec);
+
+ xmm9 = _mm_add_epi32(xmm11, xmm12);
+
+ xmm8 = _mm_add_epi32(xmm8, xmm10);
+ }
+
+ if (num_bytes >> 3 & 1) {
+ sq_dist =
+ lv_creal(src0[0]) * lv_creal(src0[0]) + lv_cimag(src0[0]) * lv_cimag(src0[0]);
+
+ xmm2 = _mm_load1_ps(&sq_dist);
+
+ xmm1 = xmm3;
+
+ xmm3 = _mm_min_ss(xmm3, xmm2);
+
+ xmm4.float_vec = _mm_cmpgt_ps(xmm1, xmm3);
+ xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);
+
+ xmm8 = _mm_shuffle_epi32(xmm8, 0x00);
+
+ xmm11 = _mm_and_si128(xmm8, xmm4.int_vec);
+ xmm12 = _mm_and_si128(xmm9, xmm5.int_vec);
+
+ xmm9 = _mm_add_epi32(xmm11, xmm12);
+ }
+
+ _mm_store_ps((float*)&(holderf.f), xmm3);
+ _mm_store_si128(&(holderi.int_vec), xmm9);
+
+ target[0] = holderi.i[0];
+ sq_dist = holderf.f[0];
+ target[0] = (holderf.f[1] < sq_dist) ? holderi.i[1] : target[0];
+ sq_dist = (holderf.f[1] < sq_dist) ? holderf.f[1] : sq_dist;
+ target[0] = (holderf.f[2] < sq_dist) ? holderi.i[2] : target[0];
+ sq_dist = (holderf.f[2] < sq_dist) ? holderf.f[2] : sq_dist;
+ target[0] = (holderf.f[3] < sq_dist) ? holderi.i[3] : target[0];
+ sq_dist = (holderf.f[3] < sq_dist) ? holderf.f[3] : sq_dist;
+}
+
+#endif /*LV_HAVE_SSE3*/
+
+#ifdef LV_HAVE_GENERIC
+static inline void
+volk_32fc_index_min_32u_generic(uint32_t* target, lv_32fc_t* src0, uint32_t num_points)
+{
+ const uint32_t num_bytes = num_points * 8;
+
+ float sq_dist = 0.0;
+ float min = FLT_MAX;
+ uint32_t index = 0;
+
+ uint32_t i = 0;
+
+ for (; i < num_bytes >> 3; ++i) {
+ sq_dist =
+ lv_creal(src0[i]) * lv_creal(src0[i]) + lv_cimag(src0[i]) * lv_cimag(src0[i]);
+
+ if (sq_dist < min) {
+ index = i;
+ min = sq_dist;
+ }
+ }
+ target[0] = index;
+}
+
+#endif /*LV_HAVE_GENERIC*/
+
+#endif /*INCLUDED_volk_32fc_index_min_32u_a_H*/
+
+#ifndef INCLUDED_volk_32fc_index_min_32u_u_H
+#define INCLUDED_volk_32fc_index_min_32u_u_H
+
+#include <float.h>
+#include <inttypes.h>
+#include <stdio.h>
+#include <volk/volk_common.h>
+#include <volk/volk_complex.h>
+
+#ifdef LV_HAVE_AVX2
+#include <immintrin.h>
+#include <volk/volk_avx2_intrinsics.h>
+
+static inline void volk_32fc_index_min_32u_u_avx2_variant_0(uint32_t* target,
+ lv_32fc_t* src0,
+ uint32_t num_points)
+{
+ const __m256i indices_increment = _mm256_set1_epi32(8);
+ /*
+ * At the start of each loop iteration current_indices holds the indices of
+ * the complex numbers loaded from memory. Explanation for odd order is given
+ * in implementation of vector_32fc_index_min_variant0().
+ */
+ __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
+
+ __m256 min_values = _mm256_set1_ps(FLT_MAX);
+ __m256i min_indices = _mm256_setzero_si256();
+
+ for (unsigned i = 0; i < num_points / 8u; ++i) {
+ __m256 in0 = _mm256_loadu_ps((float*)src0);
+ __m256 in1 = _mm256_loadu_ps((float*)(src0 + 4));
+ vector_32fc_index_min_variant0(
+ in0, in1, &min_values, &min_indices, &current_indices, indices_increment);
+ src0 += 8;
+ }
+
+ // determine minimum value and index in the result of the vectorized loop
+ __VOLK_ATTR_ALIGNED(32) float min_values_buffer[8];
+ __VOLK_ATTR_ALIGNED(32) uint32_t min_indices_buffer[8];
+ _mm256_store_ps(min_values_buffer, min_values);
+ _mm256_store_si256((__m256i*)min_indices_buffer, min_indices);
+
+ float min = FLT_MAX;
+ uint32_t index = 0;
+ for (unsigned i = 0; i < 8; i++) {
+ if (min_values_buffer[i] < min) {
+ min = min_values_buffer[i];
+ index = min_indices_buffer[i];
+ }
+ }
+
+ // handle tail not processed by the vectorized loop
+ for (unsigned i = num_points & (~7u); i < num_points; ++i) {
+ const float abs_squared =
+ lv_creal(*src0) * lv_creal(*src0) + lv_cimag(*src0) * lv_cimag(*src0);
+ if (abs_squared < min) {
+ min = abs_squared;
+ index = i;
+ }
+ ++src0;
+ }
+
+ *target = index;
+}
+
+#endif /*LV_HAVE_AVX2*/
+
+#ifdef LV_HAVE_AVX2
+#include <immintrin.h>
+#include <volk/volk_avx2_intrinsics.h>
+
+static inline void volk_32fc_index_min_32u_u_avx2_variant_1(uint32_t* target,
+ lv_32fc_t* src0,
+ uint32_t num_points)
+{
+ const __m256i indices_increment = _mm256_set1_epi32(8);
+ /*
+ * At the start of each loop iteration current_indices holds the indices of
+ * the complex numbers loaded from memory. Explanation for odd order is given
+ * in implementation of vector_32fc_index_min_variant0().
+ */
+ __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
+
+ __m256 min_values = _mm256_set1_ps(FLT_MAX);
+ __m256i min_indices = _mm256_setzero_si256();
+
+ for (unsigned i = 0; i < num_points / 8u; ++i) {
+ __m256 in0 = _mm256_loadu_ps((float*)src0);
+ __m256 in1 = _mm256_loadu_ps((float*)(src0 + 4));
+ vector_32fc_index_min_variant1(
+ in0, in1, &min_values, &min_indices, &current_indices, indices_increment);
+ src0 += 8;
+ }
+
+ // determine minimum value and index in the result of the vectorized loop
+ __VOLK_ATTR_ALIGNED(32) float min_values_buffer[8];
+ __VOLK_ATTR_ALIGNED(32) uint32_t min_indices_buffer[8];
+ _mm256_store_ps(min_values_buffer, min_values);
+ _mm256_store_si256((__m256i*)min_indices_buffer, min_indices);
+
+ float min = FLT_MAX;
+ uint32_t index = 0;
+ for (unsigned i = 0; i < 8; i++) {
+ if (min_values_buffer[i] < min) {
+ min = min_values_buffer[i];
+ index = min_indices_buffer[i];
+ }
+ }
+
+ // handle tail not processed by the vectorized loop
+ for (unsigned i = num_points & (~7u); i < num_points; ++i) {
+ const float abs_squared =
+ lv_creal(*src0) * lv_creal(*src0) + lv_cimag(*src0) * lv_cimag(*src0);
+ if (abs_squared < min) {
+ min = abs_squared;
+ index = i;
+ }
+ ++src0;
+ }
+
+ *target = index;
+}
+
+#endif /*LV_HAVE_AVX2*/
+
+#ifdef LV_HAVE_NEON
+#include <arm_neon.h>
+#include <volk/volk_neon_intrinsics.h>
+
+static inline void
+volk_32fc_index_min_32u_neon(uint32_t* target, lv_32fc_t* src0, uint32_t num_points)
+{
+ unsigned int number = 0;
+ const uint32_t quarter_points = num_points / 4;
+ const lv_32fc_t* src0Ptr = src0;
+
+ uint32_t indices[4] = { 0, 1, 2, 3 };
+ const uint32x4_t vec_indices_incr = vdupq_n_u32(4);
+ uint32x4_t vec_indices = vld1q_u32(indices);
+ uint32x4_t vec_min_indices = vec_indices;
+
+ if (num_points) {
+ /* seed the scalar minimum with the squared magnitude of the first point */
+ float min =
+ lv_creal(src0[0]) * lv_creal(src0[0]) + lv_cimag(src0[0]) * lv_cimag(src0[0]);
+ uint32_t index = 0;
+
+ float32x4_t vec_min = vdupq_n_f32(min);
+
+ for (; number < quarter_points; number++) {
+ // Load complex and compute magnitude squared
+ const float32x4_t vec_mag2 =
+ _vmagnitudesquaredq_f32(vld2q_f32((float*)src0Ptr));
+ __VOLK_PREFETCH(src0Ptr += 4);
+ // a < b?
+ const uint32x4_t lt_mask = vcltq_f32(vec_mag2, vec_min);
+ vec_min = vbslq_f32(lt_mask, vec_mag2, vec_min);
+ vec_min_indices = vbslq_u32(lt_mask, vec_indices, vec_min_indices);
+ vec_indices = vaddq_u32(vec_indices, vec_indices_incr);
+ }
+ uint32_t tmp_min_indices[4];
+ float tmp_min[4];
+ vst1q_u32(tmp_min_indices, vec_min_indices);
+ vst1q_f32(tmp_min, vec_min);
+
+ for (int i = 0; i < 4; i++) {
+ if (tmp_min[i] < min) {
+ min = tmp_min[i];
+ index = tmp_min_indices[i];
+ }
+ }
+
+ // Deal with the rest
+ for (number = quarter_points * 4; number < num_points; number++) {
+ const float re = lv_creal(*src0Ptr);
+ const float im = lv_cimag(*src0Ptr);
+ if ((re * re + im * im) < min) {
+ min = re * re + im * im;
+ index = number;
+ }
+ src0Ptr++;
+ }
+ *target = index;
+ }
+}
+
+#endif /*LV_HAVE_NEON*/
+
+#endif /*INCLUDED_volk_32fc_index_min_32u_u_H*/
QA(VOLK_INIT_TEST(volk_32f_x2_add_32f, test_params))
QA(VOLK_INIT_TEST(volk_32f_index_max_16u, test_params))
QA(VOLK_INIT_TEST(volk_32f_index_max_32u, test_params))
+ QA(VOLK_INIT_TEST(volk_32f_index_min_16u, test_params))
+ QA(VOLK_INIT_TEST(volk_32f_index_min_32u, test_params))
QA(VOLK_INIT_TEST(volk_32fc_32f_multiply_32fc, test_params))
QA(VOLK_INIT_TEST(volk_32fc_32f_add_32fc, test_params))
QA(VOLK_INIT_TEST(volk_32f_log2_32f, test_params.make_absolute(1e-5)))
QA(VOLK_INIT_TEST(volk_32fc_32f_dot_prod_32fc, test_params_inacc))
QA(VOLK_INIT_TEST(volk_32fc_index_max_16u, test_params))
QA(VOLK_INIT_TEST(volk_32fc_index_max_32u, test_params))
+ QA(VOLK_INIT_TEST(volk_32fc_index_min_16u, test_params))
+ QA(VOLK_INIT_TEST(volk_32fc_index_min_32u, test_params))
QA(VOLK_INIT_TEST(volk_32fc_s32f_magnitude_16i, test_params))
QA(VOLK_INIT_TEST(volk_32fc_magnitude_32f, test_params_inacc_tenth))
QA(VOLK_INIT_TEST(volk_32fc_magnitude_squared_32f, test_params))