*
* <b>Dispatcher Prototype</b>
* \code
- * void volk_32f_index_min_16u(uint16_t* target, const float* src0, uint32_t num_points)
+ * void volk_32f_index_min_16u(uint16_t* target, const float* source, uint32_t num_points)
* \endcode
*
* \b Inputs
- * \li src0: The input vector of floats.
+ * \li source: The input vector of floats.
* \li num_points: The number of data points.
*
* \b Outputs
#include <immintrin.h>
static inline void
-volk_32f_index_min_16u_a_avx(uint16_t* target, const float* src0, uint32_t num_points)
+volk_32f_index_min_16u_a_avx(uint16_t* target, const float* source, uint32_t num_points)
{
num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
-
- uint32_t number = 0;
const uint32_t eighthPoints = num_points / 8;
- float* inputPtr = (float*)src0;
+ float* inputPtr = (float*)source;
__m256 indexIncrementValues = _mm256_set1_ps(8);
__m256 currentIndexes = _mm256_set_ps(-1, -2, -3, -4, -5, -6, -7, -8);
- float min = src0[0];
+ float min = source[0];
float index = 0;
__m256 minValues = _mm256_set1_ps(min);
__m256 minValuesIndex = _mm256_setzero_ps();
__VOLK_ATTR_ALIGNED(32) float minValuesBuffer[8];
__VOLK_ATTR_ALIGNED(32) float minIndexesBuffer[8];
- for (; number < eighthPoints; number++) {
+ for (uint32_t number = 0; number < eighthPoints; number++) {
currentValues = _mm256_load_ps(inputPtr);
inputPtr += 8;
_mm256_store_ps(minValuesBuffer, minValues);
_mm256_store_ps(minIndexesBuffer, minValuesIndex);
- for (number = 0; number < 8; number++) {
+ for (uint32_t number = 0; number < 8; number++) {
if (minValuesBuffer[number] < min) {
index = minIndexesBuffer[number];
min = minValuesBuffer[number];
}
}
- number = eighthPoints * 8;
- for (; number < num_points; number++) {
- if (src0[number] < min) {
+ for (uint32_t number = eighthPoints * 8; number < num_points; number++) {
+ if (source[number] < min) {
index = number;
- min = src0[number];
+ min = source[number];
}
}
target[0] = (uint16_t)index;
#include <smmintrin.h>
static inline void
-volk_32f_index_min_16u_a_sse4_1(uint16_t* target, const float* src0, uint32_t num_points)
+volk_32f_index_min_16u_a_sse4_1(uint16_t* target, const float* source, uint32_t num_points)
{
num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
-
- uint32_t number = 0;
const uint32_t quarterPoints = num_points / 4;
- float* inputPtr = (float*)src0;
+ float* inputPtr = (float*)source;
__m128 indexIncrementValues = _mm_set1_ps(4);
__m128 currentIndexes = _mm_set_ps(-1, -2, -3, -4);
- float min = src0[0];
+ float min = source[0];
float index = 0;
__m128 minValues = _mm_set1_ps(min);
__m128 minValuesIndex = _mm_setzero_ps();
__VOLK_ATTR_ALIGNED(16) float minValuesBuffer[4];
__VOLK_ATTR_ALIGNED(16) float minIndexesBuffer[4];
- for (; number < quarterPoints; number++) {
+ for (uint32_t number = 0; number < quarterPoints; number++) {
currentValues = _mm_load_ps(inputPtr);
inputPtr += 4;
_mm_store_ps(minValuesBuffer, minValues);
_mm_store_ps(minIndexesBuffer, minValuesIndex);
- for (number = 0; number < 4; number++) {
+ for (uint32_t number = 0; number < 4; number++) {
if (minValuesBuffer[number] < min) {
index = minIndexesBuffer[number];
min = minValuesBuffer[number];
}
}
- number = quarterPoints * 4;
- for (; number < num_points; number++) {
- if (src0[number] < min) {
+ for (uint32_t number = quarterPoints * 4; number < num_points; number++) {
+ if (source[number] < min) {
index = number;
- min = src0[number];
+ min = source[number];
}
}
target[0] = (uint16_t)index;
#include <xmmintrin.h>
static inline void
-volk_32f_index_min_16u_a_sse(uint16_t* target, const float* src0, uint32_t num_points)
+volk_32f_index_min_16u_a_sse(uint16_t* target, const float* source, uint32_t num_points)
{
num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
-
- uint32_t number = 0;
const uint32_t quarterPoints = num_points / 4;
- float* inputPtr = (float*)src0;
+ float* inputPtr = (float*)source;
__m128 indexIncrementValues = _mm_set1_ps(4);
__m128 currentIndexes = _mm_set_ps(-1, -2, -3, -4);
- float min = src0[0];
+ float min = source[0];
float index = 0;
__m128 minValues = _mm_set1_ps(min);
__m128 minValuesIndex = _mm_setzero_ps();
__VOLK_ATTR_ALIGNED(16) float minValuesBuffer[4];
__VOLK_ATTR_ALIGNED(16) float minIndexesBuffer[4];
- for (; number < quarterPoints; number++) {
+ for (uint32_t number = 0; number < quarterPoints; number++) {
currentValues = _mm_load_ps(inputPtr);
inputPtr += 4;
_mm_store_ps(minValuesBuffer, minValues);
_mm_store_ps(minIndexesBuffer, minValuesIndex);
- for (number = 0; number < 4; number++) {
+ for (uint32_t number = 0; number < 4; number++) {
if (minValuesBuffer[number] < min) {
index = minIndexesBuffer[number];
min = minValuesBuffer[number];
}
}
- number = quarterPoints * 4;
- for (; number < num_points; number++) {
- if (src0[number] < min) {
+ for (uint32_t number = quarterPoints * 4; number < num_points; number++) {
+ if (source[number] < min) {
index = number;
- min = src0[number];
+ min = source[number];
}
}
target[0] = (uint16_t)index;
#ifdef LV_HAVE_GENERIC
static inline void
-volk_32f_index_min_16u_generic(uint16_t* target, const float* src0, uint32_t num_points)
+volk_32f_index_min_16u_generic(uint16_t* target, const float* source, uint32_t num_points)
{
num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
- float min = src0[0];
+ float min = source[0];
uint16_t index = 0;
- uint32_t i = 1;
-
- for (; i < num_points; ++i) {
- if (src0[i] < min) {
+ for (uint32_t i = 1; i < num_points; ++i) {
+ if (source[i] < min) {
index = i;
- min = src0[i];
+ min = source[i];
}
}
target[0] = index;
#include <immintrin.h>
static inline void
-volk_32f_index_min_16u_u_avx(uint16_t* target, const float* src0, uint32_t num_points)
+volk_32f_index_min_16u_u_avx(uint16_t* target, const float* source, uint32_t num_points)
{
num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
-
- uint32_t number = 0;
const uint32_t eighthPoints = num_points / 8;
- float* inputPtr = (float*)src0;
+ float* inputPtr = (float*)source;
__m256 indexIncrementValues = _mm256_set1_ps(8);
__m256 currentIndexes = _mm256_set_ps(-1, -2, -3, -4, -5, -6, -7, -8);
- float min = src0[0];
+ float min = source[0];
float index = 0;
__m256 minValues = _mm256_set1_ps(min);
__m256 minValuesIndex = _mm256_setzero_ps();
__VOLK_ATTR_ALIGNED(32) float minValuesBuffer[8];
__VOLK_ATTR_ALIGNED(32) float minIndexesBuffer[8];
- for (; number < eighthPoints; number++) {
+ for (uint32_t number = 0; number < eighthPoints; number++) {
currentValues = _mm256_loadu_ps(inputPtr);
inputPtr += 8;
_mm256_storeu_ps(minValuesBuffer, minValues);
_mm256_storeu_ps(minIndexesBuffer, minValuesIndex);
- for (number = 0; number < 8; number++) {
+ for (uint32_t number = 0; number < 8; number++) {
if (minValuesBuffer[number] < min) {
index = minIndexesBuffer[number];
min = minValuesBuffer[number];
}
}
- number = eighthPoints * 8;
- for (; number < num_points; number++) {
- if (src0[number] < min) {
+ for (uint32_t number = eighthPoints * 8; number < num_points; number++) {
+ if (source[number] < min) {
index = number;
- min = src0[number];
+ min = source[number];
}
}
target[0] = (uint16_t)index;
*
* <b>Dispatcher Prototype</b>
* \code
- * void volk_32f_index_min_32u(uint32_t* target, const float* src0, uint32_t num_points)
+ * void volk_32f_index_min_32u(uint32_t* target, const float* source, uint32_t num_points)
* \endcode
*
* \b Inputs
- * \li src0: The input vector of floats.
+ * \li source: The input vector of floats.
* \li num_points: The number of data points.
*
* \b Outputs
#include <smmintrin.h>
static inline void
-volk_32f_index_min_32u_a_sse4_1(uint32_t* target, const float* src0, uint32_t num_points)
+volk_32f_index_min_32u_a_sse4_1(uint32_t* target, const float* source, uint32_t num_points)
{
if (num_points > 0) {
- uint32_t number = 0;
const uint32_t quarterPoints = num_points / 4;
- float* inputPtr = (float*)src0;
+ float* inputPtr = (float*)source;
__m128 indexIncrementValues = _mm_set1_ps(4);
__m128 currentIndexes = _mm_set_ps(-1, -2, -3, -4);
- float min = src0[0];
+ float min = source[0];
float index = 0;
__m128 minValues = _mm_set1_ps(min);
__m128 minValuesIndex = _mm_setzero_ps();
__VOLK_ATTR_ALIGNED(16) float minValuesBuffer[4];
__VOLK_ATTR_ALIGNED(16) float minIndexesBuffer[4];
- for (; number < quarterPoints; number++) {
+ for (uint32_t number = 0; number < quarterPoints; number++) {
currentValues = _mm_load_ps(inputPtr);
inputPtr += 4;
_mm_store_ps(minValuesBuffer, minValues);
_mm_store_ps(minIndexesBuffer, minValuesIndex);
- for (number = 0; number < 4; number++) {
+ for (uint32_t number = 0; number < 4; number++) {
if (minValuesBuffer[number] < min) {
index = minIndexesBuffer[number];
min = minValuesBuffer[number];
}
}
- number = quarterPoints * 4;
- for (; number < num_points; number++) {
- if (src0[number] < min) {
+ for (uint32_t number = quarterPoints * 4; number < num_points; number++) {
+ if (source[number] < min) {
index = number;
- min = src0[number];
+ min = source[number];
}
}
target[0] = (uint32_t)index;
#include <xmmintrin.h>
static inline void
-volk_32f_index_min_32u_a_sse(uint32_t* target, const float* src0, uint32_t num_points)
+volk_32f_index_min_32u_a_sse(uint32_t* target, const float* source, uint32_t num_points)
{
if (num_points > 0) {
- uint32_t number = 0;
const uint32_t quarterPoints = num_points / 4;
- float* inputPtr = (float*)src0;
+ float* inputPtr = (float*)source;
__m128 indexIncrementValues = _mm_set1_ps(4);
__m128 currentIndexes = _mm_set_ps(-1, -2, -3, -4);
- float min = src0[0];
+ float min = source[0];
float index = 0;
__m128 minValues = _mm_set1_ps(min);
__m128 minValuesIndex = _mm_setzero_ps();
__VOLK_ATTR_ALIGNED(16) float minValuesBuffer[4];
__VOLK_ATTR_ALIGNED(16) float minIndexesBuffer[4];
- for (; number < quarterPoints; number++) {
+ for (uint32_t number = 0; number < quarterPoints; number++) {
currentValues = _mm_load_ps(inputPtr);
inputPtr += 4;
_mm_store_ps(minValuesBuffer, minValues);
_mm_store_ps(minIndexesBuffer, minValuesIndex);
- for (number = 0; number < 4; number++) {
+ for (uint32_t number = 0; number < 4; number++) {
if (minValuesBuffer[number] < min) {
index = minIndexesBuffer[number];
min = minValuesBuffer[number];
}
}
- number = quarterPoints * 4;
- for (; number < num_points; number++) {
- if (src0[number] < min) {
+ for (uint32_t number = quarterPoints * 4; number < num_points; number++) {
+ if (source[number] < min) {
index = number;
- min = src0[number];
+ min = source[number];
}
}
target[0] = (uint32_t)index;
#include <immintrin.h>
static inline void
-volk_32f_index_min_32u_a_avx(uint32_t* target, const float* src0, uint32_t num_points)
+volk_32f_index_min_32u_a_avx(uint32_t* target, const float* source, uint32_t num_points)
{
if (num_points > 0) {
- uint32_t number = 0;
const uint32_t quarterPoints = num_points / 8;
- float* inputPtr = (float*)src0;
+ float* inputPtr = (float*)source;
__m256 indexIncrementValues = _mm256_set1_ps(8);
__m256 currentIndexes = _mm256_set_ps(-1, -2, -3, -4, -5, -6, -7, -8);
- float min = src0[0];
+ float min = source[0];
float index = 0;
__m256 minValues = _mm256_set1_ps(min);
__m256 minValuesIndex = _mm256_setzero_ps();
__VOLK_ATTR_ALIGNED(32) float minValuesBuffer[8];
__VOLK_ATTR_ALIGNED(32) float minIndexesBuffer[8];
- for (; number < quarterPoints; number++) {
+ for (uint32_t number = 0; number < quarterPoints; number++) {
currentValues = _mm256_load_ps(inputPtr);
inputPtr += 8;
currentIndexes = _mm256_add_ps(currentIndexes, indexIncrementValues);
_mm256_store_ps(minValuesBuffer, minValues);
_mm256_store_ps(minIndexesBuffer, minValuesIndex);
- for (number = 0; number < 8; number++) {
+ for (uint32_t number = 0; number < 8; number++) {
if (minValuesBuffer[number] < min) {
index = minIndexesBuffer[number];
min = minValuesBuffer[number];
}
}
- number = quarterPoints * 8;
- for (; number < num_points; number++) {
- if (src0[number] < min) {
+ for (uint32_t number = quarterPoints * 8; number < num_points; number++) {
+ if (source[number] < min) {
index = number;
- min = src0[number];
+ min = source[number];
}
}
target[0] = (uint32_t)index;
#include <arm_neon.h>
static inline void
-volk_32f_index_min_32u_neon(uint32_t* target, const float* src0, uint32_t num_points)
+volk_32f_index_min_32u_neon(uint32_t* target, const float* source, uint32_t num_points)
{
if (num_points > 0) {
- uint32_t number = 0;
const uint32_t quarterPoints = num_points / 4;
- float* inputPtr = (float*)src0;
+ float* inputPtr = (float*)source;
float32x4_t indexIncrementValues = vdupq_n_f32(4);
__VOLK_ATTR_ALIGNED(16)
float currentIndexes_float[4] = { -4.0f, -3.0f, -2.0f, -1.0f };
float32x4_t currentIndexes = vld1q_f32(currentIndexes_float);
- float min = src0[0];
+ float min = source[0];
float index = 0;
float32x4_t minValues = vdupq_n_f32(min);
uint32x4_t minValuesIndex = vmovq_n_u32(0);
__VOLK_ATTR_ALIGNED(16) float minValuesBuffer[4];
__VOLK_ATTR_ALIGNED(16) float minIndexesBuffer[4];
- for (; number < quarterPoints; number++) {
+ for (uint32_t number = 0; number < quarterPoints; number++) {
currentValues = vld1q_f32(inputPtr);
inputPtr += 4;
currentIndexes = vaddq_f32(currentIndexes, indexIncrementValues);
// Calculate the smallest value from the remaining 4 points
vst1q_f32(minValuesBuffer, minValues);
vst1q_f32(minIndexesBuffer, vcvtq_f32_u32(minValuesIndex));
- for (number = 0; number < 4; number++) {
+ for (uint32_t number = 0; number < 4; number++) {
if (minValuesBuffer[number] < min) {
index = minIndexesBuffer[number];
min = minValuesBuffer[number];
}
}
- number = quarterPoints * 4;
- for (; number < num_points; number++) {
- if (src0[number] < min) {
+ for (uint32_t number = quarterPoints * 4; number < num_points; number++) {
+ if (source[number] < min) {
index = number;
- min = src0[number];
+ min = source[number];
}
}
target[0] = (uint32_t)index;
#ifdef LV_HAVE_GENERIC
static inline void
-volk_32f_index_min_32u_generic(uint32_t* target, const float* src0, uint32_t num_points)
+volk_32f_index_min_32u_generic(uint32_t* target, const float* source, uint32_t num_points)
{
if (num_points > 0) {
- float min = src0[0];
+ float min = source[0];
uint32_t index = 0;
- uint32_t i = 1;
-
- for (; i < num_points; ++i) {
- if (src0[i] < min) {
+ for (uint32_t i = 1; i < num_points; ++i) {
+ if (source[i] < min) {
index = i;
- min = src0[i];
+ min = source[i];
}
}
target[0] = index;
#include <immintrin.h>
static inline void
-volk_32f_index_min_32u_u_avx(uint32_t* target, const float* src0, uint32_t num_points)
+volk_32f_index_min_32u_u_avx(uint32_t* target, const float* source, uint32_t num_points)
{
if (num_points > 0) {
- uint32_t number = 0;
const uint32_t quarterPoints = num_points / 8;
- float* inputPtr = (float*)src0;
+ float* inputPtr = (float*)source;
__m256 indexIncrementValues = _mm256_set1_ps(8);
__m256 currentIndexes = _mm256_set_ps(-1, -2, -3, -4, -5, -6, -7, -8);
- float min = src0[0];
+ float min = source[0];
float index = 0;
__m256 minValues = _mm256_set1_ps(min);
__m256 minValuesIndex = _mm256_setzero_ps();
__VOLK_ATTR_ALIGNED(32) float minValuesBuffer[8];
__VOLK_ATTR_ALIGNED(32) float minIndexesBuffer[8];
- for (; number < quarterPoints; number++) {
+ for (uint32_t number = 0; number < quarterPoints; number++) {
currentValues = _mm256_loadu_ps(inputPtr);
inputPtr += 8;
currentIndexes = _mm256_add_ps(currentIndexes, indexIncrementValues);
_mm256_store_ps(minValuesBuffer, minValues);
_mm256_store_ps(minIndexesBuffer, minValuesIndex);
- for (number = 0; number < 8; number++) {
+ for (uint32_t number = 0; number < 8; number++) {
if (minValuesBuffer[number] < min) {
index = minIndexesBuffer[number];
min = minValuesBuffer[number];
}
}
- number = quarterPoints * 8;
- for (; number < num_points; number++) {
- if (src0[number] < min) {
+ for (uint32_t number = quarterPoints * 8; number < num_points; number++) {
+ if (source[number] < min) {
index = number;
- min = src0[number];
+ min = source[number];
}
}
target[0] = (uint32_t)index;
#include <smmintrin.h>
static inline void
-volk_32f_index_min_32u_u_sse4_1(uint32_t* target, const float* src0, uint32_t num_points)
+volk_32f_index_min_32u_u_sse4_1(uint32_t* target, const float* source, uint32_t num_points)
{
if (num_points > 0) {
- uint32_t number = 0;
const uint32_t quarterPoints = num_points / 4;
- float* inputPtr = (float*)src0;
+ float* inputPtr = (float*)source;
__m128 indexIncrementValues = _mm_set1_ps(4);
__m128 currentIndexes = _mm_set_ps(-1, -2, -3, -4);
- float min = src0[0];
+ float min = source[0];
float index = 0;
__m128 minValues = _mm_set1_ps(min);
__m128 minValuesIndex = _mm_setzero_ps();
__VOLK_ATTR_ALIGNED(16) float minValuesBuffer[4];
__VOLK_ATTR_ALIGNED(16) float minIndexesBuffer[4];
- for (; number < quarterPoints; number++) {
+ for (uint32_t number = 0; number < quarterPoints; number++) {
currentValues = _mm_loadu_ps(inputPtr);
inputPtr += 4;
currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
_mm_store_ps(minValuesBuffer, minValues);
_mm_store_ps(minIndexesBuffer, minValuesIndex);
- for (number = 0; number < 4; number++) {
+ for (uint32_t number = 0; number < 4; number++) {
if (minValuesBuffer[number] < min) {
index = minIndexesBuffer[number];
min = minValuesBuffer[number];
}
}
- number = quarterPoints * 4;
- for (; number < num_points; number++) {
- if (src0[number] < min) {
+ for (uint32_t number = quarterPoints * 4; number < num_points; number++) {
+ if (source[number] < min) {
index = number;
- min = src0[number];
+ min = source[number];
}
}
target[0] = (uint32_t)index;
#include <xmmintrin.h>
static inline void
-volk_32f_index_min_32u_u_sse(uint32_t* target, const float* src0, uint32_t num_points)
+volk_32f_index_min_32u_u_sse(uint32_t* target, const float* source, uint32_t num_points)
{
if (num_points > 0) {
- uint32_t number = 0;
const uint32_t quarterPoints = num_points / 4;
- float* inputPtr = (float*)src0;
+ float* inputPtr = (float*)source;
__m128 indexIncrementValues = _mm_set1_ps(4);
__m128 currentIndexes = _mm_set_ps(-1, -2, -3, -4);
- float min = src0[0];
+ float min = source[0];
float index = 0;
__m128 minValues = _mm_set1_ps(min);
__m128 minValuesIndex = _mm_setzero_ps();
__VOLK_ATTR_ALIGNED(16) float minValuesBuffer[4];
__VOLK_ATTR_ALIGNED(16) float minIndexesBuffer[4];
- for (; number < quarterPoints; number++) {
+ for (uint32_t number = 0; number < quarterPoints; number++) {
currentValues = _mm_loadu_ps(inputPtr);
inputPtr += 4;
currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
_mm_store_ps(minValuesBuffer, minValues);
_mm_store_ps(minIndexesBuffer, minValuesIndex);
- for (number = 0; number < 4; number++) {
+ for (uint32_t number = 0; number < 4; number++) {
if (minValuesBuffer[number] < min) {
index = minIndexesBuffer[number];
min = minValuesBuffer[number];
}
}
- number = quarterPoints * 4;
- for (; number < num_points; number++) {
- if (src0[number] < min) {
+ for (uint32_t number = quarterPoints * 4; number < num_points; number++) {
+ if (source[number] < min) {
index = number;
- min = src0[number];
+ min = source[number];
}
}
target[0] = (uint32_t)index;
*
* <b>Dispatcher Prototype</b>
* \code
- * void volk_32fc_index_min_16u(uint16_t* target, lv_32fc_t* src0, uint32_t
+ * void volk_32fc_index_min_16u(uint16_t* target, lv_32fc_t* source, uint32_t
* num_points) \endcode
*
* \b Inputs
- * \li src0: The complex input vector.
+ * \li source: The complex input vector.
* \li num_points: The number of samples.
*
* \b Outputs
#include <volk/volk_avx2_intrinsics.h>
static inline void volk_32fc_index_min_16u_a_avx2_variant_0(uint16_t* target,
- lv_32fc_t* src0,
+ lv_32fc_t* source,
uint32_t num_points)
{
num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
__m256i min_indices = _mm256_setzero_si256();
for (unsigned i = 0; i < num_points / 8u; ++i) {
- __m256 in0 = _mm256_load_ps((float*)src0);
- __m256 in1 = _mm256_load_ps((float*)(src0 + 4));
+ __m256 in0 = _mm256_load_ps((float*)source);
+ __m256 in1 = _mm256_load_ps((float*)(source + 4));
vector_32fc_index_min_variant0(
in0, in1, &min_values, &min_indices, &current_indices, indices_increment);
- src0 += 8;
+ source += 8;
}
// determine minimum value and index in the result of the vectorized loop
// handle tail not processed by the vectorized loop
for (unsigned i = num_points & (~7u); i < num_points; ++i) {
const float abs_squared =
- lv_creal(*src0) * lv_creal(*src0) + lv_cimag(*src0) * lv_cimag(*src0);
+ lv_creal(*source) * lv_creal(*source) + lv_cimag(*source) * lv_cimag(*source);
if (abs_squared < min) {
min = abs_squared;
index = i;
}
- ++src0;
+ ++source;
}
*target = index;
#include <volk/volk_avx2_intrinsics.h>
static inline void volk_32fc_index_min_16u_a_avx2_variant_1(uint16_t* target,
- lv_32fc_t* src0,
+ lv_32fc_t* source,
uint32_t num_points)
{
num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
__m256i min_indices = _mm256_setzero_si256();
for (unsigned i = 0; i < num_points / 8u; ++i) {
- __m256 in0 = _mm256_load_ps((float*)src0);
- __m256 in1 = _mm256_load_ps((float*)(src0 + 4));
+ __m256 in0 = _mm256_load_ps((float*)source);
+ __m256 in1 = _mm256_load_ps((float*)(source + 4));
vector_32fc_index_min_variant1(
in0, in1, &min_values, &min_indices, &current_indices, indices_increment);
- src0 += 8;
+ source += 8;
}
// determine minimum value and index in the result of the vectorized loop
// handle tail not processed by the vectorized loop
for (unsigned i = num_points & (~7u); i < num_points; ++i) {
const float abs_squared =
- lv_creal(*src0) * lv_creal(*src0) + lv_cimag(*src0) * lv_cimag(*src0);
+ lv_creal(*source) * lv_creal(*source) + lv_cimag(*source) * lv_cimag(*source);
if (abs_squared < min) {
min = abs_squared;
index = i;
}
- ++src0;
+ ++source;
}
*target = index;
#include <xmmintrin.h>
static inline void
-volk_32fc_index_min_16u_a_sse3(uint16_t* target, lv_32fc_t* src0, uint32_t num_points)
+volk_32fc_index_min_16u_a_sse3(uint16_t* target, lv_32fc_t* source, uint32_t num_points)
{
num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
const uint32_t num_bytes = num_points * 8;
holderf.int_vec = _mm_setzero_si128();
holderi.int_vec = _mm_setzero_si128();
- int bound = num_bytes >> 5;
- int i = 0;
-
xmm8 = _mm_setr_epi32(0, 1, 2, 3);
xmm9 = _mm_setzero_si128();
xmm10 = _mm_setr_epi32(4, 4, 4, 4);
xmm3 = _mm_set_ps1(FLT_MAX);
- for (; i < bound; ++i) {
- xmm1 = _mm_load_ps((float*)src0);
- xmm2 = _mm_load_ps((float*)&src0[2]);
+ int bound = num_bytes >> 5;
+
+ for (int i = 0; i < bound; ++i) {
+ xmm1 = _mm_load_ps((float*)source);
+ xmm2 = _mm_load_ps((float*)&source[2]);
- src0 += 4;
+ source += 4;
xmm1 = _mm_mul_ps(xmm1, xmm1);
xmm2 = _mm_mul_ps(xmm2, xmm2);
}
if (num_bytes >> 4 & 1) {
- xmm2 = _mm_load_ps((float*)src0);
+ xmm2 = _mm_load_ps((float*)source);
xmm1 = _mm_movelh_ps(bit128_p(&xmm8)->float_vec, bit128_p(&xmm8)->float_vec);
xmm8 = bit128_p(&xmm1)->int_vec;
xmm2 = _mm_mul_ps(xmm2, xmm2);
- src0 += 2;
+ source += 2;
xmm1 = _mm_hadd_ps(xmm2, xmm2);
if (num_bytes >> 3 & 1) {
sq_dist =
- lv_creal(src0[0]) * lv_creal(src0[0]) + lv_cimag(src0[0]) * lv_cimag(src0[0]);
+ lv_creal(source[0]) * lv_creal(source[0]) + lv_cimag(source[0]) * lv_cimag(source[0]);
xmm2 = _mm_load1_ps(&sq_dist);
#ifdef LV_HAVE_GENERIC
static inline void
-volk_32fc_index_min_16u_generic(uint16_t* target, lv_32fc_t* src0, uint32_t num_points)
+volk_32fc_index_min_16u_generic(uint16_t* target, lv_32fc_t* source, uint32_t num_points)
{
num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
-
const uint32_t num_bytes = num_points * 8;
float sq_dist = 0.0;
float min = FLT_MAX;
uint16_t index = 0;
- uint32_t i = 0;
-
- for (; i<num_bytes>> 3; ++i) {
+ for (uint32_t i = 0; i<num_bytes>> 3; ++i) {
sq_dist =
- lv_creal(src0[i]) * lv_creal(src0[i]) + lv_cimag(src0[i]) * lv_cimag(src0[i]);
+ lv_creal(source[i]) * lv_creal(source[i]) + lv_cimag(source[i]) * lv_cimag(source[i]);
if (sq_dist < min) {
index = i;
#include <volk/volk_avx2_intrinsics.h>
static inline void volk_32fc_index_min_16u_u_avx2_variant_0(uint16_t* target,
- lv_32fc_t* src0,
+ lv_32fc_t* source,
uint32_t num_points)
{
num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
__m256i min_indices = _mm256_setzero_si256();
for (unsigned i = 0; i < num_points / 8u; ++i) {
- __m256 in0 = _mm256_loadu_ps((float*)src0);
- __m256 in1 = _mm256_loadu_ps((float*)(src0 + 4));
+ __m256 in0 = _mm256_loadu_ps((float*)source);
+ __m256 in1 = _mm256_loadu_ps((float*)(source + 4));
vector_32fc_index_min_variant0(
in0, in1, &min_values, &min_indices, &current_indices, indices_increment);
- src0 += 8;
+ source += 8;
}
// determine minimum value and index in the result of the vectorized loop
// handle tail not processed by the vectorized loop
for (unsigned i = num_points & (~7u); i < num_points; ++i) {
const float abs_squared =
- lv_creal(*src0) * lv_creal(*src0) + lv_cimag(*src0) * lv_cimag(*src0);
+ lv_creal(*source) * lv_creal(*source) + lv_cimag(*source) * lv_cimag(*source);
if (abs_squared < min) {
min = abs_squared;
index = i;
}
- ++src0;
+ ++source;
}
*target = index;
#include <volk/volk_avx2_intrinsics.h>
static inline void volk_32fc_index_min_16u_u_avx2_variant_1(uint16_t* target,
- lv_32fc_t* src0,
+ lv_32fc_t* source,
uint32_t num_points)
{
num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
__m256i min_indices = _mm256_setzero_si256();
for (unsigned i = 0; i < num_points / 8u; ++i) {
- __m256 in0 = _mm256_loadu_ps((float*)src0);
- __m256 in1 = _mm256_loadu_ps((float*)(src0 + 4));
+ __m256 in0 = _mm256_loadu_ps((float*)source);
+ __m256 in1 = _mm256_loadu_ps((float*)(source + 4));
vector_32fc_index_min_variant1(
in0, in1, &min_values, &min_indices, &current_indices, indices_increment);
- src0 += 8;
+ source += 8;
}
// determine minimum value and index in the result of the vectorized loop
// handle tail not processed by the vectorized loop
for (unsigned i = num_points & (~7u); i < num_points; ++i) {
const float abs_squared =
- lv_creal(*src0) * lv_creal(*src0) + lv_cimag(*src0) * lv_cimag(*src0);
+ lv_creal(*source) * lv_creal(*source) + lv_cimag(*source) * lv_cimag(*source);
if (abs_squared < min) {
min = abs_squared;
index = i;
}
- ++src0;
+ ++source;
}
*target = index;
*
* <b>Dispatcher Prototype</b>
* \code
- * void volk_32fc_index_min_32u(uint32_t* target, lv_32fc_t* src0, uint32_t
+ * void volk_32fc_index_min_32u(uint32_t* target, lv_32fc_t* source, uint32_t
* num_points) \endcode
*
* \b Inputs
- * \li src0: The complex input vector.
+ * \li source: The complex input vector.
* \li num_points: The number of samples.
*
* \b Outputs
#include <volk/volk_avx2_intrinsics.h>
static inline void volk_32fc_index_min_32u_a_avx2_variant_0(uint32_t* target,
- lv_32fc_t* src0,
+ lv_32fc_t* source,
uint32_t num_points)
{
const __m256i indices_increment = _mm256_set1_epi32(8);
__m256i min_indices = _mm256_setzero_si256();
for (unsigned i = 0; i < num_points / 8u; ++i) {
- __m256 in0 = _mm256_load_ps((float*)src0);
- __m256 in1 = _mm256_load_ps((float*)(src0 + 4));
+ __m256 in0 = _mm256_load_ps((float*)source);
+ __m256 in1 = _mm256_load_ps((float*)(source + 4));
vector_32fc_index_min_variant0(
in0, in1, &min_values, &min_indices, &current_indices, indices_increment);
- src0 += 8;
+ source += 8;
}
// determine minimum value and index in the result of the vectorized loop
// handle tail not processed by the vectorized loop
for (unsigned i = num_points & (~7u); i < num_points; ++i) {
const float abs_squared =
- lv_creal(*src0) * lv_creal(*src0) + lv_cimag(*src0) * lv_cimag(*src0);
+ lv_creal(*source) * lv_creal(*source) + lv_cimag(*source) * lv_cimag(*source);
if (abs_squared < min) {
min = abs_squared;
index = i;
}
- ++src0;
+ ++source;
}
*target = index;
#include <volk/volk_avx2_intrinsics.h>
static inline void volk_32fc_index_min_32u_a_avx2_variant_1(uint32_t* target,
- lv_32fc_t* src0,
+ lv_32fc_t* source,
uint32_t num_points)
{
const __m256i indices_increment = _mm256_set1_epi32(8);
__m256i min_indices = _mm256_setzero_si256();
for (unsigned i = 0; i < num_points / 8u; ++i) {
- __m256 in0 = _mm256_load_ps((float*)src0);
- __m256 in1 = _mm256_load_ps((float*)(src0 + 4));
+ __m256 in0 = _mm256_load_ps((float*)source);
+ __m256 in1 = _mm256_load_ps((float*)(source + 4));
vector_32fc_index_min_variant1(
in0, in1, &min_values, &min_indices, &current_indices, indices_increment);
- src0 += 8;
+ source += 8;
}
// determine minimum value and index in the result of the vectorized loop
// handle tail not processed by the vectorized loop
for (unsigned i = num_points & (~7u); i < num_points; ++i) {
const float abs_squared =
- lv_creal(*src0) * lv_creal(*src0) + lv_cimag(*src0) * lv_cimag(*src0);
+ lv_creal(*source) * lv_creal(*source) + lv_cimag(*source) * lv_cimag(*source);
if (abs_squared < min) {
min = abs_squared;
index = i;
}
- ++src0;
+ ++source;
}
*target = index;
#include <xmmintrin.h>
static inline void
-volk_32fc_index_min_32u_a_sse3(uint32_t* target, lv_32fc_t* src0, uint32_t num_points)
+volk_32fc_index_min_32u_a_sse3(uint32_t* target, lv_32fc_t* source, uint32_t num_points)
{
const uint32_t num_bytes = num_points * 8;
holderf.int_vec = _mm_setzero_si128();
holderi.int_vec = _mm_setzero_si128();
- int bound = num_bytes >> 5;
- int i = 0;
-
xmm8 = _mm_setr_epi32(0, 1, 2, 3);
xmm9 = _mm_setzero_si128();
xmm10 = _mm_setr_epi32(4, 4, 4, 4);
xmm3 = _mm_set_ps1(FLT_MAX);
- for (; i < bound; ++i) {
- xmm1 = _mm_load_ps((float*)src0);
- xmm2 = _mm_load_ps((float*)&src0[2]);
+ int bound = num_bytes >> 5;
- src0 += 4;
+ for (int i = 0; i < bound; ++i) {
+ xmm1 = _mm_load_ps((float*)source);
+ xmm2 = _mm_load_ps((float*)&source[2]);
+
+ source += 4;
xmm1 = _mm_mul_ps(xmm1, xmm1);
xmm2 = _mm_mul_ps(xmm2, xmm2);
}
if (num_bytes >> 4 & 1) {
- xmm2 = _mm_load_ps((float*)src0);
+ xmm2 = _mm_load_ps((float*)source);
xmm1 = _mm_movelh_ps(bit128_p(&xmm8)->float_vec, bit128_p(&xmm8)->float_vec);
xmm8 = bit128_p(&xmm1)->int_vec;
xmm2 = _mm_mul_ps(xmm2, xmm2);
- src0 += 2;
+ source += 2;
xmm1 = _mm_hadd_ps(xmm2, xmm2);
if (num_bytes >> 3 & 1) {
sq_dist =
- lv_creal(src0[0]) * lv_creal(src0[0]) + lv_cimag(src0[0]) * lv_cimag(src0[0]);
+ lv_creal(source[0]) * lv_creal(source[0]) + lv_cimag(source[0]) * lv_cimag(source[0]);
xmm2 = _mm_load1_ps(&sq_dist);
#ifdef LV_HAVE_GENERIC
static inline void
-volk_32fc_index_min_32u_generic(uint32_t* target, lv_32fc_t* src0, uint32_t num_points)
+volk_32fc_index_min_32u_generic(uint32_t* target, lv_32fc_t* source, uint32_t num_points)
{
const uint32_t num_bytes = num_points * 8;
float min = FLT_MAX;
uint32_t index = 0;
- uint32_t i = 0;
-
- for (; i<num_bytes>> 3; ++i) {
+ for (uint32_t i = 0; i<num_bytes>> 3; ++i) {
sq_dist =
- lv_creal(src0[i]) * lv_creal(src0[i]) + lv_cimag(src0[i]) * lv_cimag(src0[i]);
+ lv_creal(source[i]) * lv_creal(source[i]) + lv_cimag(source[i]) * lv_cimag(source[i]);
if (sq_dist < min) {
index = i;
#include <volk/volk_avx2_intrinsics.h>
static inline void volk_32fc_index_min_32u_u_avx2_variant_0(uint32_t* target,
- lv_32fc_t* src0,
+ lv_32fc_t* source,
uint32_t num_points)
{
const __m256i indices_increment = _mm256_set1_epi32(8);
__m256i min_indices = _mm256_setzero_si256();
for (unsigned i = 0; i < num_points / 8u; ++i) {
- __m256 in0 = _mm256_loadu_ps((float*)src0);
- __m256 in1 = _mm256_loadu_ps((float*)(src0 + 4));
+ __m256 in0 = _mm256_loadu_ps((float*)source);
+ __m256 in1 = _mm256_loadu_ps((float*)(source + 4));
vector_32fc_index_min_variant0(
in0, in1, &min_values, &min_indices, &current_indices, indices_increment);
- src0 += 8;
+ source += 8;
}
// determine minimum value and index in the result of the vectorized loop
// handle tail not processed by the vectorized loop
for (unsigned i = num_points & (~7u); i < num_points; ++i) {
const float abs_squared =
- lv_creal(*src0) * lv_creal(*src0) + lv_cimag(*src0) * lv_cimag(*src0);
+ lv_creal(*source) * lv_creal(*source) + lv_cimag(*source) * lv_cimag(*source);
if (abs_squared < min) {
min = abs_squared;
index = i;
}
- ++src0;
+ ++source;
}
*target = index;
#include <volk/volk_avx2_intrinsics.h>
static inline void volk_32fc_index_min_32u_u_avx2_variant_1(uint32_t* target,
- lv_32fc_t* src0,
+ lv_32fc_t* source,
uint32_t num_points)
{
const __m256i indices_increment = _mm256_set1_epi32(8);
__m256i min_indices = _mm256_setzero_si256();
for (unsigned i = 0; i < num_points / 8u; ++i) {
- __m256 in0 = _mm256_loadu_ps((float*)src0);
- __m256 in1 = _mm256_loadu_ps((float*)(src0 + 4));
+ __m256 in0 = _mm256_loadu_ps((float*)source);
+ __m256 in1 = _mm256_loadu_ps((float*)(source + 4));
vector_32fc_index_min_variant1(
in0, in1, &min_values, &min_indices, &current_indices, indices_increment);
- src0 += 8;
+ source += 8;
}
// determine minimum value and index in the result of the vectorized loop
// handle tail not processed by the vectorized loop
for (unsigned i = num_points & (~7u); i < num_points; ++i) {
const float abs_squared =
- lv_creal(*src0) * lv_creal(*src0) + lv_cimag(*src0) * lv_cimag(*src0);
+ lv_creal(*source) * lv_creal(*source) + lv_cimag(*source) * lv_cimag(*source);
if (abs_squared < min) {
min = abs_squared;
index = i;
}
- ++src0;
+ ++source;
}
*target = index;
#include <volk/volk_neon_intrinsics.h>
static inline void
-volk_32fc_index_min_32u_neon(uint32_t* target, lv_32fc_t* src0, uint32_t num_points)
+volk_32fc_index_min_32u_neon(uint32_t* target, lv_32fc_t* source, uint32_t num_points)
{
- unsigned int number = 0;
const uint32_t quarter_points = num_points / 4;
- const lv_32fc_t* src0Ptr = src0;
+ const lv_32fc_t* sourcePtr = source;
uint32_t indices[4] = { 0, 1, 2, 3 };
const uint32x4_t vec_indices_incr = vdupq_n_u32(4);
float32x4_t vec_min = vdupq_n_f32(FLT_MAX);
- for (; number < quarter_points; number++) {
+ for (uint32_t number = 0; number < quarter_points; number++) {
// Load complex and compute magnitude squared
const float32x4_t vec_mag2 =
- _vmagnitudesquaredq_f32(vld2q_f32((float*)src0Ptr));
- __VOLK_PREFETCH(src0Ptr += 4);
+ _vmagnitudesquaredq_f32(vld2q_f32((float*)sourcePtr));
+ __VOLK_PREFETCH(sourcePtr += 4);
// a < b?
const uint32x4_t lt_mask = vcltq_f32(vec_mag2, vec_min);
vec_min = vbslq_f32(lt_mask, vec_mag2, vec_min);
}
// Deal with the rest
- for (number = quarter_points * 4; number < num_points; number++) {
- const float re = lv_creal(*src0Ptr);
- const float im = lv_cimag(*src0Ptr);
+ for (uint32_t number = quarter_points * 4; number < num_points; number++) {
+ const float re = lv_creal(*sourcePtr);
+ const float im = lv_cimag(*sourcePtr);
if ((re * re + im * im) < min) {
- min = *src0Ptr;
+ min = *sourcePtr;
index = number;
}
- src0Ptr++;
+ sourcePtr++;
}
*target = index;
}