/* -*- c++ -*- */
/*
- * Copyright 2012, 2014 Free Software Foundation, Inc.
+ * Copyright 2012, 2014-2016, 2018-2020 Free Software Foundation, Inc.
*
* This file is part of GNU Radio
*
*
* <b>Dispatcher Prototype</b>
* \code
- * void volk_32fc_index_max_16u(uint16_t* target, lv_32fc_t* src0, uint32_t num_points)
- * \endcode
+ * void volk_32fc_index_max_16u(uint16_t* target, lv_32fc_t* src0, uint32_t
+ * num_points) \endcode
*
* \b Inputs
* \li src0: The complex input vector.
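 *
 * \b Example
 * A minimal usage sketch, assuming <volk/volk.h> (for volk_malloc, volk_free,
 * volk_get_alignment, lv_cmake and the dispatcher) and <stdio.h> are included:
 * \code
 *   uint32_t N = 64;
 *   unsigned int alignment = volk_get_alignment();
 *   lv_32fc_t* in = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t) * N, alignment);
 *   uint16_t* max_index = (uint16_t*)volk_malloc(sizeof(uint16_t), alignment);
 *
 *   // Fill the vector; the kernel returns the index of the sample with the
 *   // largest squared magnitude |z|^2.
 *   for (uint32_t n = 0; n < N; ++n) {
 *       in[n] = lv_cmake((float)n, (float)(N - n));
 *   }
 *
 *   volk_32fc_index_max_16u(max_index, in, N);
 *   printf("maximum magnitude at index %u\n", (unsigned)*max_index);
 *
 *   volk_free(in);
 *   volk_free(max_index);
 * \endcode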
volk_32fc_index_max_16u_a_avx2(uint16_t* target, lv_32fc_t* src0, uint32_t num_points)
{
num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
- // Branchless version, if we think it'll make a difference
- // num_points = USHRT_MAX ^ ((num_points ^ USHRT_MAX) & -(num_points < USHRT_MAX));
-
const uint32_t num_bytes = num_points * 8;
union bit256 holderf;
union bit256 holderi;
float sq_dist = 0.0;
+ float max = 0.0;
+ uint16_t index = 0;
union bit256 xmm5, xmm4;
__m256 xmm1, xmm2, xmm3;
- __m256i xmm8, xmm11, xmm12, xmmfive, xmmfour, xmm9, holder0, holder1, xmm10;
+ __m256i xmm8, xmm11, xmm12, xmm9, xmm10;
- xmm5.int_vec = xmmfive = _mm256_setzero_si256();
- xmm4.int_vec = xmmfour = _mm256_setzero_si256();
- holderf.int_vec = holder0 = _mm256_setzero_si256();
- holderi.int_vec = holder1 = _mm256_setzero_si256();
+ xmm5.int_vec = _mm256_setzero_si256();
+ xmm4.int_vec = _mm256_setzero_si256();
+ holderf.int_vec = _mm256_setzero_si256();
+ holderi.int_vec = _mm256_setzero_si256();
int bound = num_bytes >> 6;
int i = 0;
- xmm8 = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
- xmm9 = _mm256_setzero_si256(); //=xmm8
+ xmm8 = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
+ xmm9 = _mm256_setzero_si256();
xmm10 = _mm256_set1_epi32(8);
xmm3 = _mm256_setzero_ps();
- __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
+ __m256i idx = _mm256_setr_epi32(0, 1, 4, 5, 2, 3, 6, 7);
for (; i < bound; ++i) {
xmm1 = _mm256_load_ps((float*)src0);
xmm2 = _mm256_load_ps((float*)&src0[4]);
xmm8 = _mm256_add_epi32(xmm8, xmm10);
}
- xmm10 = _mm256_set1_epi32(4);
- if (num_bytes >> 5 & 1) {
- xmm1 = _mm256_load_ps((float*)src0);
-
- src0 += 4;
-
- xmm1 = _mm256_mul_ps(xmm1, xmm1);
- xmm1 = _mm256_hadd_ps(xmm1, xmm1);
- xmm1 = _mm256_permutevar8x32_ps(xmm1, idx);
-
- xmm3 = _mm256_max_ps(xmm1, xmm3);
-
- xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS);
- xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ);
-
- xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec);
- xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec);
-
- xmm9 = _mm256_add_epi32(xmm11, xmm12);
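+ // Spill the running per-lane maxima (xmm3) and their candidate indices (xmm9),
+ // then reduce the eight lanes to a single scalar max/index pair.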
+ _mm256_store_ps((float*)&(holderf.f), xmm3);
+ _mm256_store_si256(&(holderi.int_vec), xmm9);
- xmm8 = _mm256_add_epi32(xmm8, xmm10);
+ for (i = 0; i < 8; i++) {
+ if (holderf.f[i] > max) {
+ index = holderi.i[i];
+ max = holderf.f[i];
+ }
}
- idx = _mm256_set_epi32(1, 0, 1, 0, 1, 0, 1, 0);
- xmm10 = _mm256_set1_epi32(2);
- if (num_bytes >> 4 & 1) {
- xmm2 = _mm256_load_ps((float*)src0);
-
- xmm1 = _mm256_permutevar8x32_ps(bit256_p(&xmm8)->float_vec, idx);
- xmm8 = bit256_p(&xmm1)->int_vec;
-
- xmm2 = _mm256_mul_ps(xmm2, xmm2);
-
- src0 += 2;
-
- xmm1 = _mm256_hadd_ps(xmm2, xmm2);
-
- xmm3 = _mm256_max_ps(xmm1, xmm3);
-
- xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS);
- xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ);
-
- xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec);
- xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec);
-
- xmm9 = _mm256_add_epi32(xmm11, xmm12);
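+ // Finish the remaining (num_points % 8) points with a plain scalar loop.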
+ for (i = bound * 8; i < num_points; i++, src0++) {
+ sq_dist =
+ lv_creal(src0[0]) * lv_creal(src0[0]) + lv_cimag(src0[0]) * lv_cimag(src0[0]);
- xmm8 = _mm256_add_epi32(xmm8, xmm10);
+ if (sq_dist > max) {
+ index = i;
+ max = sq_dist;
+ }
}
-
- /*
- idx = _mm256_setzero_si256();
- for(i = 0; i < leftovers2; ++i) {
- //printf("%u, %u, %u, %u\n", ((uint32_t*)&xmm9)[0], ((uint32_t*)&xmm9)[1],
- ((uint32_t*)&xmm9)[2], ((uint32_t*)&xmm9)[3]);
-
- sq_dist = lv_creal(src0[0]) * lv_creal(src0[0]) + lv_cimag(src0[0]) *
- lv_cimag(src0[0]);
-
- //xmm = _mm_load1_ps(&sq_dist);//insert?
- xmm2 = _mm256_set1_ps(sq_dist);
- //xmm2 = _mm256_insertf128_ps(xmm2, xmm, 0);
-
- xmm1 = xmm3;
-
- xmm3 = _mm256_max_ps(xmm3, xmm2);//only lowest 32bit value
- xmm3 = _mm256_permutevar8x32_ps(xmm3, idx);
-
- xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS);
- xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ);
-
- xmm8 = _mm256_permutevar8x32_epi32(xmm8, idx);
-
- xmm11 = _mm256_and_si256(xmm8, xmm4.int_vec);
- xmm12 = _mm256_and_si256(xmm9, xmm5.int_vec);
-
- xmm9 = _mm256_add_epi32(xmm11, xmm12);
- }*/
-
- _mm256_store_ps((float*)&(holderf.f), xmm3);
- _mm256_store_si256(&(holderi.int_vec), xmm9);
-
- target[0] = holderi.i[0];
- sq_dist = holderf.f[0];
- target[0] = (holderf.f[1] > sq_dist) ? holderi.i[1] : target[0];
- sq_dist = (holderf.f[1] > sq_dist) ? holderf.f[1] : sq_dist;
- target[0] = (holderf.f[2] > sq_dist) ? holderi.i[2] : target[0];
- sq_dist = (holderf.f[2] > sq_dist) ? holderf.f[2] : sq_dist;
- target[0] = (holderf.f[3] > sq_dist) ? holderi.i[3] : target[0];
- sq_dist = (holderf.f[3] > sq_dist) ? holderf.f[3] : sq_dist;
- target[0] = (holderf.f[4] > sq_dist) ? holderi.i[4] : target[0];
- sq_dist = (holderf.f[4] > sq_dist) ? holderf.f[4] : sq_dist;
- target[0] = (holderf.f[5] > sq_dist) ? holderi.i[5] : target[0];
- sq_dist = (holderf.f[5] > sq_dist) ? holderf.f[5] : sq_dist;
- target[0] = (holderf.f[6] > sq_dist) ? holderi.i[6] : target[0];
- sq_dist = (holderf.f[6] > sq_dist) ? holderf.f[6] : sq_dist;
- target[0] = (holderf.f[7] > sq_dist) ? holderi.i[7] : target[0];
- sq_dist = (holderf.f[7] > sq_dist) ? holderf.f[7] : sq_dist;
+ target[0] = index;
}
#endif /*LV_HAVE_AVX2*/
volk_32fc_index_max_16u_a_sse3(uint16_t* target, lv_32fc_t* src0, uint32_t num_points)
{
num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
- // Branchless version, if we think it'll make a difference
- // num_points = USHRT_MAX ^ ((num_points ^ USHRT_MAX) & -(num_points < USHRT_MAX));
-
const uint32_t num_bytes = num_points * 8;
union bit128 holderf;
union bit128 xmm5, xmm4;
__m128 xmm1, xmm2, xmm3;
- __m128i xmm8, xmm11, xmm12, xmmfive, xmmfour, xmm9, holder0, holder1, xmm10;
+ __m128i xmm8, xmm11, xmm12, xmm9, xmm10;
- xmm5.int_vec = xmmfive = _mm_setzero_si128();
- xmm4.int_vec = xmmfour = _mm_setzero_si128();
- holderf.int_vec = holder0 = _mm_setzero_si128();
- holderi.int_vec = holder1 = _mm_setzero_si128();
+ xmm5.int_vec = _mm_setzero_si128();
+ xmm4.int_vec = _mm_setzero_si128();
+ holderf.int_vec = _mm_setzero_si128();
+ holderi.int_vec = _mm_setzero_si128();
int bound = num_bytes >> 5;
int i = 0;
- xmm8 = _mm_set_epi32(3, 2, 1, 0); // remember the crazy reverse order!
+ xmm8 = _mm_setr_epi32(0, 1, 2, 3);
xmm9 = _mm_setzero_si128();
- xmm10 = _mm_set_epi32(4, 4, 4, 4);
+ xmm10 = _mm_set1_epi32(4);
xmm3 = _mm_setzero_ps();
- // printf("%f, %f, %f, %f\n", ((float*)&xmm10)[0], ((float*)&xmm10)[1],
- // ((float*)&xmm10)[2], ((float*)&xmm10)[3]);
for (; i < bound; ++i) {
xmm1 = _mm_load_ps((float*)src0);
xmm9 = _mm_add_epi32(xmm11, xmm12);
xmm8 = _mm_add_epi32(xmm8, xmm10);
-
- // printf("%f, %f, %f, %f\n", ((float*)&xmm3)[0], ((float*)&xmm3)[1],
- // ((float*)&xmm3)[2], ((float*)&xmm3)[3]); printf("%u, %u, %u, %u\n",
- // ((uint32_t*)&xmm10)[0], ((uint32_t*)&xmm10)[1], ((uint32_t*)&xmm10)[2],
- // ((uint32_t*)&xmm10)[3]);
}
-
if (num_bytes >> 4 & 1) {
xmm2 = _mm_load_ps((float*)src0);
xmm3 = _mm_max_ps(xmm1, xmm3);
- xmm10 = _mm_set_epi32(2, 2, 2, 2); // load1_ps((float*)&init[2]);
+ xmm10 = _mm_set1_epi32(2);
xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3);
xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);
xmm9 = _mm_add_epi32(xmm11, xmm12);
xmm8 = _mm_add_epi32(xmm8, xmm10);
- // printf("egads%u, %u, %u, %u\n", ((uint32_t*)&xmm9)[0], ((uint32_t*)&xmm9)[1],
- // ((uint32_t*)&xmm9)[2], ((uint32_t*)&xmm9)[3]);
}
if (num_bytes >> 3 & 1) {
- // printf("%u, %u, %u, %u\n", ((uint32_t*)&xmm9)[0], ((uint32_t*)&xmm9)[1],
- // ((uint32_t*)&xmm9)[2], ((uint32_t*)&xmm9)[3]);
-
sq_dist =
lv_creal(src0[0]) * lv_creal(src0[0]) + lv_cimag(src0[0]) * lv_cimag(src0[0]);
xmm9 = _mm_add_epi32(xmm11, xmm12);
}
- // printf("%f, %f, %f, %f\n", ((float*)&xmm3)[0], ((float*)&xmm3)[1],
- // ((float*)&xmm3)[2], ((float*)&xmm3)[3]); printf("%u, %u, %u, %u\n",
- // ((uint32_t*)&xmm9)[0], ((uint32_t*)&xmm9)[1], ((uint32_t*)&xmm9)[2],
- // ((uint32_t*)&xmm9)[3]);
-
_mm_store_ps((float*)&(holderf.f), xmm3);
_mm_store_si128(&(holderi.int_vec), xmm9);
sq_dist = (holderf.f[2] > sq_dist) ? holderf.f[2] : sq_dist;
target[0] = (holderf.f[3] > sq_dist) ? holderi.i[3] : target[0];
sq_dist = (holderf.f[3] > sq_dist) ? holderf.f[3] : sq_dist;
-
- /*
- float placeholder = 0.0;
- uint32_t temp0, temp1;
- uint32_t g0 = (((float*)&xmm3)[0] > ((float*)&xmm3)[1]);
- uint32_t l0 = g0 ^ 1;
-
- uint32_t g1 = (((float*)&xmm3)[1] > ((float*)&xmm3)[2]);
- uint32_t l1 = g1 ^ 1;
-
- temp0 = g0 * ((uint32_t*)&xmm9)[0] + l0 * ((uint32_t*)&xmm9)[1];
- temp1 = g0 * ((uint32_t*)&xmm9)[2] + l0 * ((uint32_t*)&xmm9)[3];
- sq_dist = g0 * ((float*)&xmm3)[0] + l0 * ((float*)&xmm3)[1];
- placeholder = g0 * ((float*)&xmm3)[2] + l0 * ((float*)&xmm3)[3];
-
- g0 = (sq_dist > placeholder);
- l0 = g0 ^ 1;
- target[0] = g0 * temp0 + l0 * temp1;
- */
}
#endif /*LV_HAVE_SSE3*/
sq_dist =
lv_creal(src0[i]) * lv_creal(src0[i]) + lv_cimag(src0[i]) * lv_cimag(src0[i]);
- index = sq_dist > max ? i : index;
- max = sq_dist > max ? sq_dist : max;
+ if (sq_dist > max) {
+ index = i;
+ max = sq_dist;
+ }
}
target[0] = index;
}
#endif /*LV_HAVE_GENERIC*/
-
#endif /*INCLUDED_volk_32fc_index_max_16u_a_H*/
-
#ifndef INCLUDED_volk_32fc_index_max_16u_u_H
#define INCLUDED_volk_32fc_index_max_16u_u_H
volk_32fc_index_max_16u_u_avx2(uint16_t* target, lv_32fc_t* src0, uint32_t num_points)
{
num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
- // Branchless version, if we think it'll make a difference
- // num_points = USHRT_MAX ^ ((num_points ^ USHRT_MAX) & -(num_points < USHRT_MAX));
-
const uint32_t num_bytes = num_points * 8;
union bit256 holderf;
union bit256 holderi;
float sq_dist = 0.0;
+ float max = 0.0;
+ uint16_t index = 0;
union bit256 xmm5, xmm4;
__m256 xmm1, xmm2, xmm3;
- __m256i xmm8, xmm11, xmm12, xmmfive, xmmfour, xmm9, holder0, holder1, xmm10;
+ __m256i xmm8, xmm11, xmm12, xmm9, xmm10;
- xmm5.int_vec = xmmfive = _mm256_setzero_si256();
- xmm4.int_vec = xmmfour = _mm256_setzero_si256();
- holderf.int_vec = holder0 = _mm256_setzero_si256();
- holderi.int_vec = holder1 = _mm256_setzero_si256();
+ xmm5.int_vec = _mm256_setzero_si256();
+ xmm4.int_vec = _mm256_setzero_si256();
+ holderf.int_vec = _mm256_setzero_si256();
+ holderi.int_vec = _mm256_setzero_si256();
int bound = num_bytes >> 6;
int i = 0;
- xmm8 = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
- xmm9 = _mm256_setzero_si256(); //=xmm8
+ xmm8 = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
+ xmm9 = _mm256_setzero_si256();
xmm10 = _mm256_set1_epi32(8);
xmm3 = _mm256_setzero_ps();
- __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
+ __m256i idx = _mm256_setr_epi32(0, 1, 4, 5, 2, 3, 6, 7);
for (; i < bound; ++i) {
xmm1 = _mm256_loadu_ps((float*)src0);
xmm2 = _mm256_loadu_ps((float*)&src0[4]);
xmm8 = _mm256_add_epi32(xmm8, xmm10);
}
- xmm10 = _mm256_set1_epi32(4);
- if (num_bytes >> 5 & 1) {
- xmm1 = _mm256_loadu_ps((float*)src0);
-
- src0 += 4;
- xmm1 = _mm256_mul_ps(xmm1, xmm1);
-
- xmm1 = _mm256_hadd_ps(xmm1, xmm1);
- xmm1 = _mm256_permutevar8x32_ps(xmm1, idx);
-
- xmm3 = _mm256_max_ps(xmm1, xmm3);
-
- xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS);
- xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ);
-
- xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec);
- xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec);
-
- xmm9 = _mm256_add_epi32(xmm11, xmm12);
+ _mm256_storeu_ps((float*)&(holderf.f), xmm3);
+ _mm256_storeu_si256(&(holderi.int_vec), xmm9);
- xmm8 = _mm256_add_epi32(xmm8, xmm10);
+ for (i = 0; i < 8; i++) {
+ if (holderf.f[i] > max) {
+ index = holderi.i[i];
+ max = holderf.f[i];
+ }
}
- idx = _mm256_set_epi32(1, 0, 1, 0, 1, 0, 1, 0);
- xmm10 = _mm256_set1_epi32(2);
- if (num_bytes >> 4 & 1) {
- xmm2 = _mm256_loadu_ps((float*)src0);
-
- xmm1 = _mm256_permutevar8x32_ps(bit256_p(&xmm8)->float_vec, idx);
- xmm8 = bit256_p(&xmm1)->int_vec;
-
- xmm2 = _mm256_mul_ps(xmm2, xmm2);
-
- src0 += 2;
-
- xmm1 = _mm256_hadd_ps(xmm2, xmm2);
-
- xmm3 = _mm256_max_ps(xmm1, xmm3);
-
- xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS);
- xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ);
-
- xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec);
- xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec);
-
- xmm9 = _mm256_add_epi32(xmm11, xmm12);
+ for (i = bound * 8; i < num_points; i++, src0++) {
+ sq_dist =
+ lv_creal(src0[0]) * lv_creal(src0[0]) + lv_cimag(src0[0]) * lv_cimag(src0[0]);
- xmm8 = _mm256_add_epi32(xmm8, xmm10);
+ if (sq_dist > max) {
+ index = i;
+ max = sq_dist;
+ }
}
-
- _mm256_storeu_ps((float*)&(holderf.f), xmm3);
- _mm256_storeu_si256(&(holderi.int_vec), xmm9);
-
- target[0] = holderi.i[0];
- sq_dist = holderf.f[0];
- target[0] = (holderf.f[1] > sq_dist) ? holderi.i[1] : target[0];
- sq_dist = (holderf.f[1] > sq_dist) ? holderf.f[1] : sq_dist;
- target[0] = (holderf.f[2] > sq_dist) ? holderi.i[2] : target[0];
- sq_dist = (holderf.f[2] > sq_dist) ? holderf.f[2] : sq_dist;
- target[0] = (holderf.f[3] > sq_dist) ? holderi.i[3] : target[0];
- sq_dist = (holderf.f[3] > sq_dist) ? holderf.f[3] : sq_dist;
- target[0] = (holderf.f[4] > sq_dist) ? holderi.i[4] : target[0];
- sq_dist = (holderf.f[4] > sq_dist) ? holderf.f[4] : sq_dist;
- target[0] = (holderf.f[5] > sq_dist) ? holderi.i[5] : target[0];
- sq_dist = (holderf.f[5] > sq_dist) ? holderf.f[5] : sq_dist;
- target[0] = (holderf.f[6] > sq_dist) ? holderi.i[6] : target[0];
- sq_dist = (holderf.f[6] > sq_dist) ? holderf.f[6] : sq_dist;
- target[0] = (holderf.f[7] > sq_dist) ? holderi.i[7] : target[0];
- sq_dist = (holderf.f[7] > sq_dist) ? holderf.f[7] : sq_dist;
+ target[0] = index;
}
#endif /*LV_HAVE_AVX2*/
/* -*- c++ -*- */
/*
- * Copyright 2016 Free Software Foundation, Inc.
+ * Copyright 2016, 2018-2020 Free Software Foundation, Inc.
*
* This file is part of GNU Radio
*
*
* <b>Dispatcher Prototype</b>
* \code
- * void volk_32fc_index_max_32u(uint32_t* target, lv_32fc_t* src0, uint32_t num_points)
- * \endcode
+ * void volk_32fc_index_max_32u(uint32_t* target, lv_32fc_t* src0, uint32_t
+ * num_points) \endcode
*
* \b Inputs
* \li src0: The complex input vector.
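 *
 * \b Example
 * Usage mirrors volk_32fc_index_max_16u, except that the result index is a
 * uint32_t, so inputs longer than USHRT_MAX points are supported. A minimal
 * sketch, assuming a populated lv_32fc_t vector in of num_points samples and
 * <volk/volk.h> included:
 * \code
 *   uint32_t* max_index = (uint32_t*)volk_malloc(sizeof(uint32_t), volk_get_alignment());
 *   volk_32fc_index_max_32u(max_index, in, num_points);
 *   printf("maximum magnitude at index %u\n", (unsigned)*max_index);
 *   volk_free(max_index);
 * \endcode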
union bit256 holderf;
union bit256 holderi;
float sq_dist = 0.0;
+ float max = 0.0;
+ uint32_t index = 0;
union bit256 xmm5, xmm4;
__m256 xmm1, xmm2, xmm3;
- __m256i xmm8, xmm11, xmm12, xmmfive, xmmfour, xmm9, holder0, holder1, xmm10;
+ __m256i xmm8, xmm11, xmm12, xmm9, xmm10;
- xmm5.int_vec = xmmfive = _mm256_setzero_si256();
- xmm4.int_vec = xmmfour = _mm256_setzero_si256();
- holderf.int_vec = holder0 = _mm256_setzero_si256();
- holderi.int_vec = holder1 = _mm256_setzero_si256();
+ xmm5.int_vec = _mm256_setzero_si256();
+ xmm4.int_vec = _mm256_setzero_si256();
+ holderf.int_vec = _mm256_setzero_si256();
+ holderi.int_vec = _mm256_setzero_si256();
int bound = num_bytes >> 6;
int i = 0;
- xmm8 = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
+ xmm8 = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
xmm9 = _mm256_setzero_si256();
xmm10 = _mm256_set1_epi32(8);
xmm3 = _mm256_setzero_ps();
- __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
+ __m256i idx = _mm256_setr_epi32(0, 1, 4, 5, 2, 3, 6, 7);
for (; i < bound; ++i) {
xmm1 = _mm256_load_ps((float*)src0);
xmm8 = _mm256_add_epi32(xmm8, xmm10);
}
- xmm10 = _mm256_set1_epi32(4);
- if (num_bytes >> 4 & 1) {
- xmm1 = _mm256_load_ps((float*)src0);
-
- xmm1 = _mm256_mul_ps(xmm1, xmm1);
-
- src0 += 4;
-
- xmm1 = _mm256_hadd_ps(xmm1, xmm1);
-
- xmm3 = _mm256_max_ps(xmm1, xmm3);
-
- xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS);
- xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ);
-
- xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec);
- xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec);
-
- xmm9 = _mm256_add_epi32(xmm11, xmm12);
+ _mm256_store_ps((float*)&(holderf.f), xmm3);
+ _mm256_store_si256(&(holderi.int_vec), xmm9);
- xmm8 = _mm256_add_epi32(xmm8, xmm10);
+ for (i = 0; i < 8; i++) {
+ if (holderf.f[i] > max) {
+ index = holderi.i[i];
+ max = holderf.f[i];
+ }
}
- idx = _mm256_set_epi32(1, 0, 1, 0, 1, 0, 1, 0);
- xmm10 = _mm256_set1_epi32(2);
- if (num_bytes >> 4 & 1) {
- xmm2 = _mm256_load_ps((float*)src0);
-
- xmm1 = _mm256_permutevar8x32_ps(bit256_p(&xmm8)->float_vec, idx);
- xmm8 = bit256_p(&xmm1)->int_vec;
-
- xmm2 = _mm256_mul_ps(xmm2, xmm2);
-
- src0 += 2;
-
- xmm1 = _mm256_hadd_ps(xmm2, xmm2);
-
- xmm3 = _mm256_max_ps(xmm1, xmm3);
-
- xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS);
- xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ);
-
- xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec);
- xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec);
-
- xmm9 = _mm256_add_epi32(xmm11, xmm12);
+ for (i = bound * 8; i < num_points; i++, src0++) {
+ sq_dist =
+ lv_creal(src0[0]) * lv_creal(src0[0]) + lv_cimag(src0[0]) * lv_cimag(src0[0]);
- xmm8 = _mm256_add_epi32(xmm8, xmm10);
+ if (sq_dist > max) {
+ index = i;
+ max = sq_dist;
+ }
}
-
- _mm256_store_ps((float*)&(holderf.f), xmm3);
- _mm256_store_si256(&(holderi.int_vec), xmm9);
-
- target[0] = holderi.i[0];
- sq_dist = holderf.f[0];
- target[0] = (holderf.f[1] > sq_dist) ? holderi.i[1] : target[0];
- sq_dist = (holderf.f[1] > sq_dist) ? holderf.f[1] : sq_dist;
- target[0] = (holderf.f[2] > sq_dist) ? holderi.i[2] : target[0];
- sq_dist = (holderf.f[2] > sq_dist) ? holderf.f[2] : sq_dist;
- target[0] = (holderf.f[3] > sq_dist) ? holderi.i[3] : target[0];
- sq_dist = (holderf.f[3] > sq_dist) ? holderf.f[3] : sq_dist;
- target[0] = (holderf.f[4] > sq_dist) ? holderi.i[4] : target[0];
- sq_dist = (holderf.f[4] > sq_dist) ? holderf.f[4] : sq_dist;
- target[0] = (holderf.f[5] > sq_dist) ? holderi.i[5] : target[0];
- sq_dist = (holderf.f[5] > sq_dist) ? holderf.f[5] : sq_dist;
- target[0] = (holderf.f[6] > sq_dist) ? holderi.i[6] : target[0];
- sq_dist = (holderf.f[6] > sq_dist) ? holderf.f[6] : sq_dist;
- target[0] = (holderf.f[7] > sq_dist) ? holderi.i[7] : target[0];
- sq_dist = (holderf.f[7] > sq_dist) ? holderf.f[7] : sq_dist;
+ target[0] = index;
}
#endif /*LV_HAVE_AVX2*/
union bit128 xmm5, xmm4;
__m128 xmm1, xmm2, xmm3;
- __m128i xmm8, xmm11, xmm12, xmmfive, xmmfour, xmm9, holder0, holder1, xmm10;
+ __m128i xmm8, xmm11, xmm12, xmm9, xmm10;
- xmm5.int_vec = xmmfive = _mm_setzero_si128();
- xmm4.int_vec = xmmfour = _mm_setzero_si128();
- holderf.int_vec = holder0 = _mm_setzero_si128();
- holderi.int_vec = holder1 = _mm_setzero_si128();
+ xmm5.int_vec = _mm_setzero_si128();
+ xmm4.int_vec = _mm_setzero_si128();
+ holderf.int_vec = _mm_setzero_si128();
+ holderi.int_vec = _mm_setzero_si128();
int bound = num_bytes >> 5;
int i = 0;
- xmm8 = _mm_set_epi32(3, 2, 1, 0); // remember the crazy reverse order!
+ xmm8 = _mm_setr_epi32(0, 1, 2, 3);
xmm9 = _mm_setzero_si128();
- xmm10 = _mm_set_epi32(4, 4, 4, 4);
+ xmm10 = _mm_set1_epi32(4);
xmm3 = _mm_setzero_ps();
- // printf("%f, %f, %f, %f\n", ((float*)&xmm10)[0], ((float*)&xmm10)[1],
- // ((float*)&xmm10)[2], ((float*)&xmm10)[3]);
-
for (; i < bound; ++i) {
xmm1 = _mm_load_ps((float*)src0);
xmm2 = _mm_load_ps((float*)&src0[2]);
xmm9 = _mm_add_epi32(xmm11, xmm12);
xmm8 = _mm_add_epi32(xmm8, xmm10);
-
- // printf("%f, %f, %f, %f\n", ((float*)&xmm3)[0], ((float*)&xmm3)[1],
- // ((float*)&xmm3)[2], ((float*)&xmm3)[3]); printf("%u, %u, %u, %u\n",
- // ((uint32_t*)&xmm10)[0], ((uint32_t*)&xmm10)[1], ((uint32_t*)&xmm10)[2],
- // ((uint32_t*)&xmm10)[3]);
}
-
if (num_bytes >> 4 & 1) {
xmm2 = _mm_load_ps((float*)src0);
xmm3 = _mm_max_ps(xmm1, xmm3);
- xmm10 = _mm_set_epi32(2, 2, 2, 2); // load1_ps((float*)&init[2]);
+ xmm10 = _mm_set1_epi32(2);
xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3);
xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);
xmm9 = _mm_add_epi32(xmm11, xmm12);
xmm8 = _mm_add_epi32(xmm8, xmm10);
- // printf("egads%u, %u, %u, %u\n", ((uint32_t*)&xmm9)[0], ((uint32_t*)&xmm9)[1],
- // ((uint32_t*)&xmm9)[2], ((uint32_t*)&xmm9)[3]);
}
if (num_bytes >> 3 & 1) {
- // printf("%u, %u, %u, %u\n", ((uint32_t*)&xmm9)[0], ((uint32_t*)&xmm9)[1],
- // ((uint32_t*)&xmm9)[2], ((uint32_t*)&xmm9)[3]);
-
sq_dist =
lv_creal(src0[0]) * lv_creal(src0[0]) + lv_cimag(src0[0]) * lv_cimag(src0[0]);
xmm9 = _mm_add_epi32(xmm11, xmm12);
}
- // printf("%f, %f, %f, %f\n", ((float*)&xmm3)[0], ((float*)&xmm3)[1],
- // ((float*)&xmm3)[2], ((float*)&xmm3)[3]); printf("%u, %u, %u, %u\n",
- // ((uint32_t*)&xmm9)[0], ((uint32_t*)&xmm9)[1], ((uint32_t*)&xmm9)[2],
- // ((uint32_t*)&xmm9)[3]);
-
_mm_store_ps((float*)&(holderf.f), xmm3);
_mm_store_si128(&(holderi.int_vec), xmm9);
sq_dist = (holderf.f[2] > sq_dist) ? holderf.f[2] : sq_dist;
target[0] = (holderf.f[3] > sq_dist) ? holderi.i[3] : target[0];
sq_dist = (holderf.f[3] > sq_dist) ? holderf.f[3] : sq_dist;
-
- /*
- float placeholder = 0.0;
- uint32_t temp0, temp1;
- uint32_t g0 = (((float*)&xmm3)[0] > ((float*)&xmm3)[1]);
- uint32_t l0 = g0 ^ 1;
-
- uint32_t g1 = (((float*)&xmm3)[1] > ((float*)&xmm3)[2]);
- uint32_t l1 = g1 ^ 1;
-
- temp0 = g0 * ((uint32_t*)&xmm9)[0] + l0 * ((uint32_t*)&xmm9)[1];
- temp1 = g0 * ((uint32_t*)&xmm9)[2] + l0 * ((uint32_t*)&xmm9)[3];
- sq_dist = g0 * ((float*)&xmm3)[0] + l0 * ((float*)&xmm3)[1];
- placeholder = g0 * ((float*)&xmm3)[2] + l0 * ((float*)&xmm3)[3];
-
- g0 = (sq_dist > placeholder);
- l0 = g0 ^ 1;
- target[0] = g0 * temp0 + l0 * temp1;
- */
}
#endif /*LV_HAVE_SSE3*/
sq_dist =
lv_creal(src0[i]) * lv_creal(src0[i]) + lv_cimag(src0[i]) * lv_cimag(src0[i]);
- index = sq_dist > max ? i : index;
- max = sq_dist > max ? sq_dist : max;
+ if (sq_dist > max) {
+ index = i;
+ max = sq_dist;
+ }
}
target[0] = index;
}
#endif /*LV_HAVE_GENERIC*/
-
#endif /*INCLUDED_volk_32fc_index_max_32u_a_H*/
-
#ifndef INCLUDED_volk_32fc_index_max_32u_u_H
#define INCLUDED_volk_32fc_index_max_32u_u_H
union bit256 holderf;
union bit256 holderi;
float sq_dist = 0.0;
+ float max = 0.0;
+ uint32_t index = 0;
union bit256 xmm5, xmm4;
__m256 xmm1, xmm2, xmm3;
- __m256i xmm8, xmm11, xmm12, xmmfive, xmmfour, xmm9, holder0, holder1, xmm10;
+ __m256i xmm8, xmm11, xmm12, xmm9, xmm10;
- xmm5.int_vec = xmmfive = _mm256_setzero_si256();
- xmm4.int_vec = xmmfour = _mm256_setzero_si256();
- holderf.int_vec = holder0 = _mm256_setzero_si256();
- holderi.int_vec = holder1 = _mm256_setzero_si256();
+ xmm5.int_vec = _mm256_setzero_si256();
+ xmm4.int_vec = _mm256_setzero_si256();
+ holderf.int_vec = _mm256_setzero_si256();
+ holderi.int_vec = _mm256_setzero_si256();
int bound = num_bytes >> 6;
int i = 0;
- xmm8 = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
+ xmm8 = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
xmm9 = _mm256_setzero_si256();
xmm10 = _mm256_set1_epi32(8);
xmm3 = _mm256_setzero_ps();
- __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
+ __m256i idx = _mm256_setr_epi32(0, 1, 4, 5, 2, 3, 6, 7);
for (; i < bound; ++i) {
xmm1 = _mm256_loadu_ps((float*)src0);
xmm8 = _mm256_add_epi32(xmm8, xmm10);
}
- xmm10 = _mm256_set1_epi32(4);
- if (num_bytes >> 4 & 1) {
- xmm1 = _mm256_loadu_ps((float*)src0);
-
- xmm1 = _mm256_mul_ps(xmm1, xmm1);
-
- src0 += 4;
-
- xmm1 = _mm256_hadd_ps(xmm1, xmm1);
-
- xmm3 = _mm256_max_ps(xmm1, xmm3);
-
- xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS);
- xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ);
-
- xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec);
- xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec);
-
- xmm9 = _mm256_add_epi32(xmm11, xmm12);
+ _mm256_storeu_ps((float*)&(holderf.f), xmm3);
+ _mm256_storeu_si256(&(holderi.int_vec), xmm9);
- xmm8 = _mm256_add_epi32(xmm8, xmm10);
+ for (i = 0; i < 8; i++) {
+ if (holderf.f[i] > max) {
+ index = holderi.i[i];
+ max = holderf.f[i];
+ }
}
- idx = _mm256_set_epi32(1, 0, 1, 0, 1, 0, 1, 0);
- xmm10 = _mm256_set1_epi32(2);
- if (num_bytes >> 4 & 1) {
- xmm2 = _mm256_loadu_ps((float*)src0);
-
- xmm1 = _mm256_permutevar8x32_ps(bit256_p(&xmm8)->float_vec, idx);
- xmm8 = bit256_p(&xmm1)->int_vec;
-
- xmm2 = _mm256_mul_ps(xmm2, xmm2);
-
- src0 += 2;
-
- xmm1 = _mm256_hadd_ps(xmm2, xmm2);
-
- xmm3 = _mm256_max_ps(xmm1, xmm3);
-
- xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS);
- xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ);
-
- xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec);
- xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec);
-
- xmm9 = _mm256_add_epi32(xmm11, xmm12);
+ for (i = bound * 8; i < num_points; i++, src0++) {
+ sq_dist =
+ lv_creal(src0[0]) * lv_creal(src0[0]) + lv_cimag(src0[0]) * lv_cimag(src0[0]);
- xmm8 = _mm256_add_epi32(xmm8, xmm10);
+ if (sq_dist > max) {
+ index = i;
+ max = sq_dist;
+ }
}
-
- _mm256_storeu_ps((float*)&(holderf.f), xmm3);
- _mm256_storeu_si256(&(holderi.int_vec), xmm9);
-
- target[0] = holderi.i[0];
- sq_dist = holderf.f[0];
- target[0] = (holderf.f[1] > sq_dist) ? holderi.i[1] : target[0];
- sq_dist = (holderf.f[1] > sq_dist) ? holderf.f[1] : sq_dist;
- target[0] = (holderf.f[2] > sq_dist) ? holderi.i[2] : target[0];
- sq_dist = (holderf.f[2] > sq_dist) ? holderf.f[2] : sq_dist;
- target[0] = (holderf.f[3] > sq_dist) ? holderi.i[3] : target[0];
- sq_dist = (holderf.f[3] > sq_dist) ? holderf.f[3] : sq_dist;
- target[0] = (holderf.f[4] > sq_dist) ? holderi.i[4] : target[0];
- sq_dist = (holderf.f[4] > sq_dist) ? holderf.f[4] : sq_dist;
- target[0] = (holderf.f[5] > sq_dist) ? holderi.i[5] : target[0];
- sq_dist = (holderf.f[5] > sq_dist) ? holderf.f[5] : sq_dist;
- target[0] = (holderf.f[6] > sq_dist) ? holderi.i[6] : target[0];
- sq_dist = (holderf.f[6] > sq_dist) ? holderf.f[6] : sq_dist;
- target[0] = (holderf.f[7] > sq_dist) ? holderi.i[7] : target[0];
- sq_dist = (holderf.f[7] > sq_dist) ? holderf.f[7] : sq_dist;
+ target[0] = index;
}
#endif /*LV_HAVE_AVX2*/