From ce08bc47f0c1abf26c21da70c677f7264c4f272d Mon Sep 17 00:00:00 2001
From: Clayton Smith
Date: Sat, 21 Mar 2020 14:59:24 -0400
Subject: [PATCH 6/7] Fix the broken index max kernels

Gbp-Pq: Name 0006-Fix-the-broken-index-max-kernels.patch
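
The AVX2 kernels initialized their per-lane index counters with
_mm256_set_epi32, which takes its arguments high lane first, and handled
non-multiple-of-8 remainders with partial-vector tail blocks that could
report the wrong index (the 32u variant even tested num_bytes >> 4 & 1
for two different block sizes). The fix switches the counters to
_mm256_setr_epi32 (memory order), drops the broken tail blocks, and
finishes with a plain scalar loop over the leftover points, reducing the
eight lanes with a simple scalar scan as well.

For illustration only (not part of the patch; the buffer length and data
are arbitrary), a minimal caller of the dispatcher being fixed:

    #include <volk/volk.h>
    #include <stdio.h>

    int main(void)
    {
        const uint32_t N = 37; /* deliberately not a multiple of 8 */
        lv_32fc_t* in =
            (lv_32fc_t*)volk_malloc(N * sizeof(lv_32fc_t), volk_get_alignment());
        for (uint32_t k = 0; k < N; k++)
            in[k] = lv_cmake((float)k, (float)-k); /* |z|^2 grows with k */
        uint16_t where = 0;
        volk_32fc_index_max_16u(&where, in, N); /* index of largest |z|^2 */
        printf("max at %u (expect %u)\n", (unsigned)where, (unsigned)(N - 1));
        volk_free(in);
        return 0;
    }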
---
 kernels/volk/volk_32fc_index_max_16u.h | 299 ++++++-------------------
 kernels/volk/volk_32fc_index_max_32u.h | 258 ++++++---------------
 2 files changed, 128 insertions(+), 429 deletions(-)

diff --git a/kernels/volk/volk_32fc_index_max_16u.h b/kernels/volk/volk_32fc_index_max_16u.h
index b9f9cfd..16e76cd 100644
--- a/kernels/volk/volk_32fc_index_max_16u.h
+++ b/kernels/volk/volk_32fc_index_max_16u.h
@@ -1,6 +1,6 @@
 /* -*- c++ -*- */
 /*
- * Copyright 2012, 2014 Free Software Foundation, Inc.
+ * Copyright 2012, 2014-2016, 2018-2020 Free Software Foundation, Inc.
  *
  * This file is part of GNU Radio
  *
@@ -36,8 +36,8 @@
  *
  * Dispatcher Prototype
  * \code
- * void volk_32fc_index_max_16u(uint16_t* target, lv_32fc_t* src0, uint32_t num_points)
- * \endcode
+ * void volk_32fc_index_max_16u(uint16_t* target, lv_32fc_t* src0, uint32_t
+ * num_points) \endcode
  *
  * \b Inputs
  * \li src0: The complex input vector.
@@ -89,33 +89,32 @@ static inline void
 volk_32fc_index_max_16u_a_avx2(uint16_t* target, lv_32fc_t* src0, uint32_t num_points)
 {
     num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
-    // Branchless version, if we think it'll make a difference
-    // num_points = USHRT_MAX ^ ((num_points ^ USHRT_MAX) & -(num_points < USHRT_MAX));
-
     const uint32_t num_bytes = num_points * 8;
 
     union bit256 holderf;
     union bit256 holderi;
     float sq_dist = 0.0;
+    float max = 0.0;
+    uint16_t index = 0;
     union bit256 xmm5, xmm4;
     __m256 xmm1, xmm2, xmm3;
-    __m256i xmm8, xmm11, xmm12, xmmfive, xmmfour, xmm9, holder0, holder1, xmm10;
+    __m256i xmm8, xmm11, xmm12, xmm9, xmm10;
 
-    xmm5.int_vec = xmmfive = _mm256_setzero_si256();
-    xmm4.int_vec = xmmfour = _mm256_setzero_si256();
-    holderf.int_vec = holder0 = _mm256_setzero_si256();
-    holderi.int_vec = holder1 = _mm256_setzero_si256();
+    xmm5.int_vec = _mm256_setzero_si256();
+    xmm4.int_vec = _mm256_setzero_si256();
+    holderf.int_vec = _mm256_setzero_si256();
+    holderi.int_vec = _mm256_setzero_si256();
 
     int bound = num_bytes >> 6;
     int i = 0;
 
-    xmm8 = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
-    xmm9 = _mm256_setzero_si256(); //=xmm8
+    xmm8 = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
+    xmm9 = _mm256_setzero_si256();
     xmm10 = _mm256_set1_epi32(8);
     xmm3 = _mm256_setzero_ps();
 
-    __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
+    __m256i idx = _mm256_setr_epi32(0, 1, 4, 5, 2, 3, 6, 7);
     for (; i < bound; ++i) {
         xmm1 = _mm256_load_ps((float*)src0);
         xmm2 = _mm256_load_ps((float*)&src0[4]);
@@ -140,105 +139,27 @@ volk_32fc_index_max_16u_a_avx2(uint16_t* target, lv_32fc_t* src0, uint32_t num_p
         xmm8 = _mm256_add_epi32(xmm8, xmm10);
     }
 
-    xmm10 = _mm256_set1_epi32(4);
-    if (num_bytes >> 5 & 1) {
-        xmm1 = _mm256_load_ps((float*)src0);
-
-        src0 += 4;
-
-        xmm1 = _mm256_mul_ps(xmm1, xmm1);
-        xmm1 = _mm256_hadd_ps(xmm1, xmm1);
-        xmm1 = _mm256_permutevar8x32_ps(xmm1, idx);
-
-        xmm3 = _mm256_max_ps(xmm1, xmm3);
-
-        xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS);
-        xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ);
-
-        xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec);
-        xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec);
-
-        xmm9 = _mm256_add_epi32(xmm11, xmm12);
+    _mm256_store_ps((float*)&(holderf.f), xmm3);
+    _mm256_store_si256(&(holderi.int_vec), xmm9);
 
-        xmm8 = _mm256_add_epi32(xmm8, xmm10);
+    for (i = 0; i < 8; i++) {
+        if (holderf.f[i] > max) {
+            index = holderi.i[i];
+            max = holderf.f[i];
+        }
     }
 
-    idx = _mm256_set_epi32(1, 0, 1, 0, 1, 0, 1, 0);
-    xmm10 = _mm256_set1_epi32(2);
-    if (num_bytes >> 4 & 1) {
-        xmm2 = _mm256_load_ps((float*)src0);
-
-        xmm1 = _mm256_permutevar8x32_ps(bit256_p(&xmm8)->float_vec, idx);
-        xmm8 = bit256_p(&xmm1)->int_vec;
-
-        xmm2 = _mm256_mul_ps(xmm2, xmm2);
-
-        src0 += 2;
-
-        xmm1 = _mm256_hadd_ps(xmm2, xmm2);
-
-        xmm3 = _mm256_max_ps(xmm1, xmm3);
-
-        xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS);
-        xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ);
-
-        xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec);
-        xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec);
-
-        xmm9 = _mm256_add_epi32(xmm11, xmm12);
+    for (i = bound * 8; i < num_points; i++, src0++) {
+        sq_dist =
+            lv_creal(src0[0]) * lv_creal(src0[0]) + lv_cimag(src0[0]) * lv_cimag(src0[0]);
 
-        xmm8 = _mm256_add_epi32(xmm8, xmm10);
+        if (sq_dist > max) {
+            index = i;
+            max = sq_dist;
+        }
     }
-
-    /*
-    idx = _mm256_setzero_si256();
-    for(i = 0; i < leftovers2; ++i) {
-        //printf("%u, %u, %u, %u\n", ((uint32_t*)&xmm9)[0], ((uint32_t*)&xmm9)[1],
-    ((uint32_t*)&xmm9)[2], ((uint32_t*)&xmm9)[3]);
-
-        sq_dist = lv_creal(src0[0]) * lv_creal(src0[0]) + lv_cimag(src0[0]) *
-    lv_cimag(src0[0]);
-
-        //xmm = _mm_load1_ps(&sq_dist);//insert?
-        xmm2 = _mm256_set1_ps(sq_dist);
-        //xmm2 = _mm256_insertf128_ps(xmm2, xmm, 0);
-
-        xmm1 = xmm3;
-
-        xmm3 = _mm256_max_ps(xmm3, xmm2);//only lowest 32bit value
-        xmm3 = _mm256_permutevar8x32_ps(xmm3, idx);
-
-        xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS);
-        xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ);
-
-        xmm8 = _mm256_permutevar8x32_epi32(xmm8, idx);
-
-        xmm11 = _mm256_and_si256(xmm8, xmm4.int_vec);
-        xmm12 = _mm256_and_si256(xmm9, xmm5.int_vec);
-
-        xmm9 = _mm256_add_epi32(xmm11, xmm12);
-    }*/
-
-    _mm256_store_ps((float*)&(holderf.f), xmm3);
-    _mm256_store_si256(&(holderi.int_vec), xmm9);
-
-    target[0] = holderi.i[0];
-    sq_dist = holderf.f[0];
-    target[0] = (holderf.f[1] > sq_dist) ? holderi.i[1] : target[0];
-    sq_dist = (holderf.f[1] > sq_dist) ? holderf.f[1] : sq_dist;
-    target[0] = (holderf.f[2] > sq_dist) ? holderi.i[2] : target[0];
-    sq_dist = (holderf.f[2] > sq_dist) ? holderf.f[2] : sq_dist;
-    target[0] = (holderf.f[3] > sq_dist) ? holderi.i[3] : target[0];
-    sq_dist = (holderf.f[3] > sq_dist) ? holderf.f[3] : sq_dist;
-    target[0] = (holderf.f[4] > sq_dist) ? holderi.i[4] : target[0];
-    sq_dist = (holderf.f[4] > sq_dist) ? holderf.f[4] : sq_dist;
-    target[0] = (holderf.f[5] > sq_dist) ? holderi.i[5] : target[0];
-    sq_dist = (holderf.f[5] > sq_dist) ? holderf.f[5] : sq_dist;
-    target[0] = (holderf.f[6] > sq_dist) ? holderi.i[6] : target[0];
-    sq_dist = (holderf.f[6] > sq_dist) ? holderf.f[6] : sq_dist;
-    target[0] = (holderf.f[7] > sq_dist) ? holderi.i[7] : target[0];
-    sq_dist = (holderf.f[7] > sq_dist) ? holderf.f[7] : sq_dist;
+    target[0] = index;
 }
 
 #endif /*LV_HAVE_AVX2*/
@@ -251,9 +172,6 @@ static inline void
 volk_32fc_index_max_16u_a_sse3(uint16_t* target, lv_32fc_t* src0, uint32_t num_points)
 {
     num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
-    // Branchless version, if we think it'll make a difference
-    // num_points = USHRT_MAX ^ ((num_points ^ USHRT_MAX) & -(num_points < USHRT_MAX));
-
     const uint32_t num_bytes = num_points * 8;
 
     union bit128 holderf;
@@ -262,22 +180,20 @@ volk_32fc_index_max_16u_a_sse3(uint16_t* target, lv_32fc_t* src0, uint32_t num_p
     union bit128 xmm5, xmm4;
     __m128 xmm1, xmm2, xmm3;
-    __m128i xmm8, xmm11, xmm12, xmmfive, xmmfour, xmm9, holder0, holder1, xmm10;
+    __m128i xmm8, xmm11, xmm12, xmm9, xmm10;
 
-    xmm5.int_vec = xmmfive = _mm_setzero_si128();
-    xmm4.int_vec = xmmfour = _mm_setzero_si128();
-    holderf.int_vec = holder0 = _mm_setzero_si128();
-    holderi.int_vec = holder1 = _mm_setzero_si128();
+    xmm5.int_vec = _mm_setzero_si128();
+    xmm4.int_vec = _mm_setzero_si128();
+    holderf.int_vec = _mm_setzero_si128();
+    holderi.int_vec = _mm_setzero_si128();
 
     int bound = num_bytes >> 5;
     int i = 0;
 
-    xmm8 = _mm_set_epi32(3, 2, 1, 0); // remember the crazy reverse order!
+    xmm8 = _mm_setr_epi32(0, 1, 2, 3);
     xmm9 = _mm_setzero_si128();
-    xmm10 = _mm_set_epi32(4, 4, 4, 4);
+    xmm10 = _mm_setr_epi32(4, 4, 4, 4);
     xmm3 = _mm_setzero_ps();
-    // printf("%f, %f, %f, %f\n", ((float*)&xmm10)[0], ((float*)&xmm10)[1],
-    // ((float*)&xmm10)[2], ((float*)&xmm10)[3]);
 
     for (; i < bound; ++i) {
         xmm1 = _mm_load_ps((float*)src0);
@@ -301,14 +217,8 @@ volk_32fc_index_max_16u_a_sse3(uint16_t* target, lv_32fc_t* src0, uint32_t num_p
         xmm9 = _mm_add_epi32(xmm11, xmm12);
 
         xmm8 = _mm_add_epi32(xmm8, xmm10);
-
-        // printf("%f, %f, %f, %f\n", ((float*)&xmm3)[0], ((float*)&xmm3)[1],
-        // ((float*)&xmm3)[2], ((float*)&xmm3)[3]); printf("%u, %u, %u, %u\n",
-        // ((uint32_t*)&xmm10)[0], ((uint32_t*)&xmm10)[1], ((uint32_t*)&xmm10)[2],
-        // ((uint32_t*)&xmm10)[3]);
     }
 
-
     if (num_bytes >> 4 & 1) {
         xmm2 = _mm_load_ps((float*)src0);
 
@@ -323,7 +233,7 @@ volk_32fc_index_max_16u_a_sse3(uint16_t* target, lv_32fc_t* src0, uint32_t num_p
 
         xmm3 = _mm_max_ps(xmm1, xmm3);
 
-        xmm10 = _mm_set_epi32(2, 2, 2, 2); // load1_ps((float*)&init[2]);
+        xmm10 = _mm_setr_epi32(2, 2, 2, 2);
 
         xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3);
         xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);
@@ -334,14 +244,9 @@ volk_32fc_index_max_16u_a_sse3(uint16_t* target, lv_32fc_t* src0, uint32_t num_p
         xmm9 = _mm_add_epi32(xmm11, xmm12);
 
         xmm8 = _mm_add_epi32(xmm8, xmm10);
-        // printf("egads%u, %u, %u, %u\n", ((uint32_t*)&xmm9)[0], ((uint32_t*)&xmm9)[1],
-        // ((uint32_t*)&xmm9)[2], ((uint32_t*)&xmm9)[3]);
    }
 
     if (num_bytes >> 3 & 1) {
-        // printf("%u, %u, %u, %u\n", ((uint32_t*)&xmm9)[0], ((uint32_t*)&xmm9)[1],
-        // ((uint32_t*)&xmm9)[2], ((uint32_t*)&xmm9)[3]);
-
         sq_dist =
             lv_creal(src0[0]) * lv_creal(src0[0]) + lv_cimag(src0[0]) * lv_cimag(src0[0]);
 
@@ -362,11 +267,6 @@ volk_32fc_index_max_16u_a_sse3(uint16_t* target, lv_32fc_t* src0, uint32_t num_p
         xmm9 = _mm_add_epi32(xmm11, xmm12);
     }
 
-    // printf("%f, %f, %f, %f\n", ((float*)&xmm3)[0], ((float*)&xmm3)[1],
-    // ((float*)&xmm3)[2], ((float*)&xmm3)[3]); printf("%u, %u, %u, %u\n",
-    // ((uint32_t*)&xmm9)[0], ((uint32_t*)&xmm9)[1], ((uint32_t*)&xmm9)[2],
-    // ((uint32_t*)&xmm9)[3]);
-
     _mm_store_ps((float*)&(holderf.f), xmm3);
     _mm_store_si128(&(holderi.int_vec), xmm9);
 
@@ -378,25 +278,6 @@ volk_32fc_index_max_16u_a_sse3(uint16_t* target, lv_32fc_t* src0, uint32_t num_p
     sq_dist = (holderf.f[2] > sq_dist) ? holderf.f[2] : sq_dist;
     target[0] = (holderf.f[3] > sq_dist) ? holderi.i[3] : target[0];
     sq_dist = (holderf.f[3] > sq_dist) ? holderf.f[3] : sq_dist;
-
-    /*
-    float placeholder = 0.0;
-    uint32_t temp0, temp1;
-    uint32_t g0 = (((float*)&xmm3)[0] > ((float*)&xmm3)[1]);
-    uint32_t l0 = g0 ^ 1;
-
-    uint32_t g1 = (((float*)&xmm3)[1] > ((float*)&xmm3)[2]);
-    uint32_t l1 = g1 ^ 1;
-
-    temp0 = g0 * ((uint32_t*)&xmm9)[0] + l0 * ((uint32_t*)&xmm9)[1];
-    temp1 = g0 * ((uint32_t*)&xmm9)[2] + l0 * ((uint32_t*)&xmm9)[3];
-    sq_dist = g0 * ((float*)&xmm3)[0] + l0 * ((float*)&xmm3)[1];
-    placeholder = g0 * ((float*)&xmm3)[2] + l0 * ((float*)&xmm3)[3];
-
-    g0 = (sq_dist > placeholder);
-    l0 = g0 ^ 1;
-    target[0] = g0 * temp0 + l0 * temp1;
-    */
 }
 
 #endif /*LV_HAVE_SSE3*/
@@ -419,18 +300,18 @@ volk_32fc_index_max_16u_generic(uint16_t* target, lv_32fc_t* src0, uint32_t num_
         sq_dist =
             lv_creal(src0[i]) * lv_creal(src0[i]) + lv_cimag(src0[i]) * lv_cimag(src0[i]);
 
-        index = sq_dist > max ? i : index;
-        max = sq_dist > max ? sq_dist : max;
+        if (sq_dist > max) {
+            index = i;
+            max = sq_dist;
+        }
     }
 
     target[0] = index;
 }
 
 #endif /*LV_HAVE_GENERIC*/
 
-
 #endif /*INCLUDED_volk_32fc_index_max_16u_a_H*/
 
-
 #ifndef INCLUDED_volk_32fc_index_max_16u_u_H
 #define INCLUDED_volk_32fc_index_max_16u_u_H
@@ -447,33 +328,32 @@ static inline void
 volk_32fc_index_max_16u_u_avx2(uint16_t* target, lv_32fc_t* src0, uint32_t num_points)
 {
     num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
-    // Branchless version, if we think it'll make a difference
-    // num_points = USHRT_MAX ^ ((num_points ^ USHRT_MAX) & -(num_points < USHRT_MAX));
-
     const uint32_t num_bytes = num_points * 8;
 
     union bit256 holderf;
     union bit256 holderi;
     float sq_dist = 0.0;
+    float max = 0.0;
+    uint16_t index = 0;
     union bit256 xmm5, xmm4;
     __m256 xmm1, xmm2, xmm3;
-    __m256i xmm8, xmm11, xmm12, xmmfive, xmmfour, xmm9, holder0, holder1, xmm10;
+    __m256i xmm8, xmm11, xmm12, xmm9, xmm10;
 
-    xmm5.int_vec = xmmfive = _mm256_setzero_si256();
-    xmm4.int_vec = xmmfour = _mm256_setzero_si256();
-    holderf.int_vec = holder0 = _mm256_setzero_si256();
-    holderi.int_vec = holder1 = _mm256_setzero_si256();
+    xmm5.int_vec = _mm256_setzero_si256();
+    xmm4.int_vec = _mm256_setzero_si256();
+    holderf.int_vec = _mm256_setzero_si256();
+    holderi.int_vec = _mm256_setzero_si256();
 
     int bound = num_bytes >> 6;
     int i = 0;
 
-    xmm8 = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
-    xmm9 = _mm256_setzero_si256(); //=xmm8
+    xmm8 = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
+    xmm9 = _mm256_setzero_si256();
     xmm10 = _mm256_set1_epi32(8);
     xmm3 = _mm256_setzero_ps();
 
-    __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
+    __m256i idx = _mm256_setr_epi32(0, 1, 4, 5, 2, 3, 6, 7);
     for (; i < bound; ++i) {
         xmm1 = _mm256_loadu_ps((float*)src0);
         xmm2 = _mm256_loadu_ps((float*)&src0[4]);
@@ -498,76 +378,27 @@ volk_32fc_index_max_16u_u_avx2(uint16_t* target, lv_32fc_t* src0, uint32_t num_p
         xmm8 = _mm256_add_epi32(xmm8, xmm10);
     }
 
-    xmm10 = _mm256_set1_epi32(4);
-    if (num_bytes >> 5 & 1) {
-        xmm1 = _mm256_loadu_ps((float*)src0);
-
-        src0 += 4;
-        xmm1 = _mm256_mul_ps(xmm1, xmm1);
-
-        xmm1 = _mm256_hadd_ps(xmm1, xmm1);
-        xmm1 = _mm256_permutevar8x32_ps(xmm1, idx);
-
-        xmm3 = _mm256_max_ps(xmm1, xmm3);
-
-        xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS);
-        xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ);
-
-        xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec);
-        xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec);
-
-        xmm9 = _mm256_add_epi32(xmm11, xmm12);
+    _mm256_storeu_ps((float*)&(holderf.f), xmm3);
+    _mm256_storeu_si256(&(holderi.int_vec), xmm9);
 
-        xmm8 = _mm256_add_epi32(xmm8, xmm10);
+    for (i = 0; i < 8; i++) {
+        if (holderf.f[i] > max) {
+            index = holderi.i[i];
+            max = holderf.f[i];
+        }
     }
 
-    idx = _mm256_set_epi32(1, 0, 1, 0, 1, 0, 1, 0);
-    xmm10 = _mm256_set1_epi32(2);
-    if (num_bytes >> 4 & 1) {
-        xmm2 = _mm256_loadu_ps((float*)src0);
-
-        xmm1 = _mm256_permutevar8x32_ps(bit256_p(&xmm8)->float_vec, idx);
-        xmm8 = bit256_p(&xmm1)->int_vec;
-
-        xmm2 = _mm256_mul_ps(xmm2, xmm2);
-
-        src0 += 2;
-
-        xmm1 = _mm256_hadd_ps(xmm2, xmm2);
-
-        xmm3 = _mm256_max_ps(xmm1, xmm3);
-
-        xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS);
-        xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ);
-
-        xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec);
-        xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec);
-
-        xmm9 = _mm256_add_epi32(xmm11, xmm12);
+    for (i = bound * 8; i < num_points; i++, src0++) {
+        sq_dist =
+            lv_creal(src0[0]) * lv_creal(src0[0]) + lv_cimag(src0[0]) * lv_cimag(src0[0]);
 
-        xmm8 = _mm256_add_epi32(xmm8, xmm10);
+        if (sq_dist > max) {
+            index = i;
+            max = sq_dist;
+        }
     }
-
-    _mm256_storeu_ps((float*)&(holderf.f), xmm3);
-    _mm256_storeu_si256(&(holderi.int_vec), xmm9);
-
-    target[0] = holderi.i[0];
-    sq_dist = holderf.f[0];
-    target[0] = (holderf.f[1] > sq_dist) ? holderi.i[1] : target[0];
-    sq_dist = (holderf.f[1] > sq_dist) ? holderf.f[1] : sq_dist;
-    target[0] = (holderf.f[2] > sq_dist) ? holderi.i[2] : target[0];
-    sq_dist = (holderf.f[2] > sq_dist) ? holderf.f[2] : sq_dist;
-    target[0] = (holderf.f[3] > sq_dist) ? holderi.i[3] : target[0];
-    sq_dist = (holderf.f[3] > sq_dist) ? holderf.f[3] : sq_dist;
-    target[0] = (holderf.f[4] > sq_dist) ? holderi.i[4] : target[0];
-    sq_dist = (holderf.f[4] > sq_dist) ? holderf.f[4] : sq_dist;
-    target[0] = (holderf.f[5] > sq_dist) ? holderi.i[5] : target[0];
-    sq_dist = (holderf.f[5] > sq_dist) ? holderf.f[5] : sq_dist;
-    target[0] = (holderf.f[6] > sq_dist) ? holderi.i[6] : target[0];
-    sq_dist = (holderf.f[6] > sq_dist) ? holderf.f[6] : sq_dist;
-    target[0] = (holderf.f[7] > sq_dist) ? holderi.i[7] : target[0];
-    sq_dist = (holderf.f[7] > sq_dist) ? holderf.f[7] : sq_dist;
+    target[0] = index;
 }
 
 #endif /*LV_HAVE_AVX2*/
diff --git a/kernels/volk/volk_32fc_index_max_32u.h b/kernels/volk/volk_32fc_index_max_32u.h
index 7756fc6..556b5fc 100644
--- a/kernels/volk/volk_32fc_index_max_32u.h
+++ b/kernels/volk/volk_32fc_index_max_32u.h
@@ -1,6 +1,6 @@
 /* -*- c++ -*- */
 /*
- * Copyright 2016 Free Software Foundation, Inc.
+ * Copyright 2016, 2018-2020 Free Software Foundation, Inc.
  *
  * This file is part of GNU Radio
  *
@@ -30,8 +30,8 @@
  *
  * Dispatcher Prototype
  * \code
- * void volk_32fc_index_max_32u(uint32_t* target, lv_32fc_t* src0, uint32_t num_points)
- * \endcode
+ * void volk_32fc_index_max_32u(uint32_t* target, lv_32fc_t* src0, uint32_t
+ * num_points) \endcode
  *
  * \b Inputs
  * \li src0: The complex input vector.
@@ -86,24 +86,26 @@ volk_32fc_index_max_32u_a_avx2(uint32_t* target, lv_32fc_t* src0, uint32_t num_p
     union bit256 holderf;
     union bit256 holderi;
     float sq_dist = 0.0;
+    float max = 0.0;
+    uint32_t index = 0;
     union bit256 xmm5, xmm4;
     __m256 xmm1, xmm2, xmm3;
-    __m256i xmm8, xmm11, xmm12, xmmfive, xmmfour, xmm9, holder0, holder1, xmm10;
+    __m256i xmm8, xmm11, xmm12, xmm9, xmm10;
 
-    xmm5.int_vec = xmmfive = _mm256_setzero_si256();
-    xmm4.int_vec = xmmfour = _mm256_setzero_si256();
-    holderf.int_vec = holder0 = _mm256_setzero_si256();
-    holderi.int_vec = holder1 = _mm256_setzero_si256();
+    xmm5.int_vec = _mm256_setzero_si256();
+    xmm4.int_vec = _mm256_setzero_si256();
+    holderf.int_vec = _mm256_setzero_si256();
+    holderi.int_vec = _mm256_setzero_si256();
 
     int bound = num_bytes >> 6;
     int i = 0;
 
-    xmm8 = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
+    xmm8 = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
     xmm9 = _mm256_setzero_si256();
     xmm10 = _mm256_set1_epi32(8);
     xmm3 = _mm256_setzero_ps();
 
-    __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
+    __m256i idx = _mm256_setr_epi32(0, 1, 4, 5, 2, 3, 6, 7);
 
     for (; i < bound; ++i) {
         xmm1 = _mm256_load_ps((float*)src0);
@@ -130,75 +132,26 @@ volk_32fc_index_max_32u_a_avx2(uint32_t* target, lv_32fc_t* src0, uint32_t num_p
         xmm8 = _mm256_add_epi32(xmm8, xmm10);
     }
 
-    xmm10 = _mm256_set1_epi32(4);
-    if (num_bytes >> 4 & 1) {
-        xmm1 = _mm256_load_ps((float*)src0);
-
-        xmm1 = _mm256_mul_ps(xmm1, xmm1);
-
-        src0 += 4;
-
-        xmm1 = _mm256_hadd_ps(xmm1, xmm1);
-
-        xmm3 = _mm256_max_ps(xmm1, xmm3);
-
-        xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS);
-        xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ);
-
-        xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec);
-        xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec);
-
-        xmm9 = _mm256_add_epi32(xmm11, xmm12);
+    _mm256_store_ps((float*)&(holderf.f), xmm3);
+    _mm256_store_si256(&(holderi.int_vec), xmm9);
 
-        xmm8 = _mm256_add_epi32(xmm8, xmm10);
+    for (i = 0; i < 8; i++) {
+        if (holderf.f[i] > max) {
+            index = holderi.i[i];
+            max = holderf.f[i];
+        }
     }
 
-    idx = _mm256_set_epi32(1, 0, 1, 0, 1, 0, 1, 0);
-    xmm10 = _mm256_set1_epi32(2);
-    if (num_bytes >> 4 & 1) {
-        xmm2 = _mm256_load_ps((float*)src0);
-
-        xmm1 = _mm256_permutevar8x32_ps(bit256_p(&xmm8)->float_vec, idx);
-        xmm8 = bit256_p(&xmm1)->int_vec;
-
-        xmm2 = _mm256_mul_ps(xmm2, xmm2);
-
-        src0 += 2;
-
-        xmm1 = _mm256_hadd_ps(xmm2, xmm2);
-
-        xmm3 = _mm256_max_ps(xmm1, xmm3);
-
-        xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS);
-        xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ);
-
-        xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec);
-        xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec);
-
-        xmm9 = _mm256_add_epi32(xmm11, xmm12);
+    for (i = bound * 8; i < num_points; i++, src0++) {
+        sq_dist =
+            lv_creal(src0[0]) * lv_creal(src0[0]) + lv_cimag(src0[0]) * lv_cimag(src0[0]);
 
-        xmm8 = _mm256_add_epi32(xmm8, xmm10);
+        if (sq_dist > max) {
+            index = i;
+            max = sq_dist;
+        }
     }
-
-    _mm256_store_ps((float*)&(holderf.f), xmm3);
-    _mm256_store_si256(&(holderi.int_vec), xmm9);
-
-    target[0] = holderi.i[0];
-    sq_dist = holderf.f[0];
-    target[0] = (holderf.f[1] > sq_dist) ? holderi.i[1] : target[0];
-    sq_dist = (holderf.f[1] > sq_dist) ? holderf.f[1] : sq_dist;
-    target[0] = (holderf.f[2] > sq_dist) ? holderi.i[2] : target[0];
-    sq_dist = (holderf.f[2] > sq_dist) ? holderf.f[2] : sq_dist;
-    target[0] = (holderf.f[3] > sq_dist) ? holderi.i[3] : target[0];
-    sq_dist = (holderf.f[3] > sq_dist) ? holderf.f[3] : sq_dist;
-    target[0] = (holderf.f[4] > sq_dist) ? holderi.i[4] : target[0];
-    sq_dist = (holderf.f[4] > sq_dist) ? holderf.f[4] : sq_dist;
-    target[0] = (holderf.f[5] > sq_dist) ? holderi.i[5] : target[0];
-    sq_dist = (holderf.f[5] > sq_dist) ? holderf.f[5] : sq_dist;
-    target[0] = (holderf.f[6] > sq_dist) ? holderi.i[6] : target[0];
-    sq_dist = (holderf.f[6] > sq_dist) ? holderf.f[6] : sq_dist;
-    target[0] = (holderf.f[7] > sq_dist) ? holderi.i[7] : target[0];
-    sq_dist = (holderf.f[7] > sq_dist) ? holderf.f[7] : sq_dist;
+    target[0] = index;
 }
 
 #endif /*LV_HAVE_AVX2*/
@@ -218,24 +171,21 @@ volk_32fc_index_max_32u_a_sse3(uint32_t* target, lv_32fc_t* src0, uint32_t num_p
     union bit128 xmm5, xmm4;
     __m128 xmm1, xmm2, xmm3;
-    __m128i xmm8, xmm11, xmm12, xmmfive, xmmfour, xmm9, holder0, holder1, xmm10;
+    __m128i xmm8, xmm11, xmm12, xmm9, xmm10;
 
-    xmm5.int_vec = xmmfive = _mm_setzero_si128();
-    xmm4.int_vec = xmmfour = _mm_setzero_si128();
-    holderf.int_vec = holder0 = _mm_setzero_si128();
-    holderi.int_vec = holder1 = _mm_setzero_si128();
+    xmm5.int_vec = _mm_setzero_si128();
+    xmm4.int_vec = _mm_setzero_si128();
+    holderf.int_vec = _mm_setzero_si128();
+    holderi.int_vec = _mm_setzero_si128();
 
     int bound = num_bytes >> 5;
     int i = 0;
 
-    xmm8 = _mm_set_epi32(3, 2, 1, 0); // remember the crazy reverse order!
+    xmm8 = _mm_setr_epi32(0, 1, 2, 3);
     xmm9 = _mm_setzero_si128();
-    xmm10 = _mm_set_epi32(4, 4, 4, 4);
+    xmm10 = _mm_setr_epi32(4, 4, 4, 4);
     xmm3 = _mm_setzero_ps();
-    // printf("%f, %f, %f, %f\n", ((float*)&xmm10)[0], ((float*)&xmm10)[1],
-    // ((float*)&xmm10)[2], ((float*)&xmm10)[3]);
-
     for (; i < bound; ++i) {
         xmm1 = _mm_load_ps((float*)src0);
         xmm2 = _mm_load_ps((float*)&src0[2]);
@@ -258,14 +208,8 @@ volk_32fc_index_max_32u_a_sse3(uint32_t* target, lv_32fc_t* src0, uint32_t num_p
         xmm9 = _mm_add_epi32(xmm11, xmm12);
 
         xmm8 = _mm_add_epi32(xmm8, xmm10);
-
-        // printf("%f, %f, %f, %f\n", ((float*)&xmm3)[0], ((float*)&xmm3)[1],
-        // ((float*)&xmm3)[2], ((float*)&xmm3)[3]); printf("%u, %u, %u, %u\n",
-        // ((uint32_t*)&xmm10)[0], ((uint32_t*)&xmm10)[1], ((uint32_t*)&xmm10)[2],
-        // ((uint32_t*)&xmm10)[3]);
     }
 
-
     if (num_bytes >> 4 & 1) {
         xmm2 = _mm_load_ps((float*)src0);
 
@@ -280,7 +224,7 @@ volk_32fc_index_max_32u_a_sse3(uint32_t* target, lv_32fc_t* src0, uint32_t num_p
 
         xmm3 = _mm_max_ps(xmm1, xmm3);
 
-        xmm10 = _mm_set_epi32(2, 2, 2, 2); // load1_ps((float*)&init[2]);
+        xmm10 = _mm_setr_epi32(2, 2, 2, 2);
 
         xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3);
         xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);
@@ -291,14 +235,9 @@ volk_32fc_index_max_32u_a_sse3(uint32_t* target, lv_32fc_t* src0, uint32_t num_p
         xmm9 = _mm_add_epi32(xmm11, xmm12);
 
         xmm8 = _mm_add_epi32(xmm8, xmm10);
-        // printf("egads%u, %u, %u, %u\n", ((uint32_t*)&xmm9)[0], ((uint32_t*)&xmm9)[1],
-        // ((uint32_t*)&xmm9)[2], ((uint32_t*)&xmm9)[3]);
     }
 
     if (num_bytes >> 3 & 1) {
-        // printf("%u, %u, %u, %u\n", ((uint32_t*)&xmm9)[0], ((uint32_t*)&xmm9)[1],
-        // ((uint32_t*)&xmm9)[2], ((uint32_t*)&xmm9)[3]);
-
         sq_dist =
             lv_creal(src0[0]) * lv_creal(src0[0]) + lv_cimag(src0[0]) * lv_cimag(src0[0]);
 
@@ -319,11 +258,6 @@ volk_32fc_index_max_32u_a_sse3(uint32_t* target, lv_32fc_t* src0, uint32_t num_p
         xmm9 = _mm_add_epi32(xmm11, xmm12);
     }
 
-    // printf("%f, %f, %f, %f\n", ((float*)&xmm3)[0], ((float*)&xmm3)[1],
-    // ((float*)&xmm3)[2], ((float*)&xmm3)[3]); printf("%u, %u, %u, %u\n",
-    // ((uint32_t*)&xmm9)[0], ((uint32_t*)&xmm9)[1], ((uint32_t*)&xmm9)[2],
-    // ((uint32_t*)&xmm9)[3]);
-
     _mm_store_ps((float*)&(holderf.f), xmm3);
     _mm_store_si128(&(holderi.int_vec), xmm9);
 
@@ -335,25 +269,6 @@ volk_32fc_index_max_32u_a_sse3(uint32_t* target, lv_32fc_t* src0, uint32_t num_p
     sq_dist = (holderf.f[2] > sq_dist) ? holderf.f[2] : sq_dist;
     target[0] = (holderf.f[3] > sq_dist) ? holderi.i[3] : target[0];
     sq_dist = (holderf.f[3] > sq_dist) ? holderf.f[3] : sq_dist;
-
-    /*
-    float placeholder = 0.0;
-    uint32_t temp0, temp1;
-    uint32_t g0 = (((float*)&xmm3)[0] > ((float*)&xmm3)[1]);
-    uint32_t l0 = g0 ^ 1;
-
-    uint32_t g1 = (((float*)&xmm3)[1] > ((float*)&xmm3)[2]);
-    uint32_t l1 = g1 ^ 1;
-
-    temp0 = g0 * ((uint32_t*)&xmm9)[0] + l0 * ((uint32_t*)&xmm9)[1];
-    temp1 = g0 * ((uint32_t*)&xmm9)[2] + l0 * ((uint32_t*)&xmm9)[3];
-    sq_dist = g0 * ((float*)&xmm3)[0] + l0 * ((float*)&xmm3)[1];
-    placeholder = g0 * ((float*)&xmm3)[2] + l0 * ((float*)&xmm3)[3];
-
-    g0 = (sq_dist > placeholder);
-    l0 = g0 ^ 1;
-    target[0] = g0 * temp0 + l0 * temp1;
-    */
 }
 
 #endif /*LV_HAVE_SSE3*/
@@ -374,18 +289,18 @@ volk_32fc_index_max_32u_generic(uint32_t* target, lv_32fc_t* src0, uint32_t num_
         sq_dist =
             lv_creal(src0[i]) * lv_creal(src0[i]) + lv_cimag(src0[i]) * lv_cimag(src0[i]);
 
-        index = sq_dist > max ? i : index;
-        max = sq_dist > max ? sq_dist : max;
+        if (sq_dist > max) {
+            index = i;
+            max = sq_dist;
+        }
     }
 
     target[0] = index;
 }
 
 #endif /*LV_HAVE_GENERIC*/
 
-
 #endif /*INCLUDED_volk_32fc_index_max_32u_a_H*/
 
-
 #ifndef INCLUDED_volk_32fc_index_max_32u_u_H
 #define INCLUDED_volk_32fc_index_max_32u_u_H
@@ -405,24 +320,26 @@ volk_32fc_index_max_32u_u_avx2(uint32_t* target, lv_32fc_t* src0, uint32_t num_p
     union bit256 holderf;
     union bit256 holderi;
     float sq_dist = 0.0;
+    float max = 0.0;
+    uint32_t index = 0;
     union bit256 xmm5, xmm4;
     __m256 xmm1, xmm2, xmm3;
-    __m256i xmm8, xmm11, xmm12, xmmfive, xmmfour, xmm9, holder0, holder1, xmm10;
+    __m256i xmm8, xmm11, xmm12, xmm9, xmm10;
 
-    xmm5.int_vec = xmmfive = _mm256_setzero_si256();
-    xmm4.int_vec = xmmfour = _mm256_setzero_si256();
-    holderf.int_vec = holder0 = _mm256_setzero_si256();
-    holderi.int_vec = holder1 = _mm256_setzero_si256();
+    xmm5.int_vec = _mm256_setzero_si256();
+    xmm4.int_vec = _mm256_setzero_si256();
+    holderf.int_vec = _mm256_setzero_si256();
+    holderi.int_vec = _mm256_setzero_si256();
 
     int bound = num_bytes >> 6;
     int i = 0;
 
-    xmm8 = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
+    xmm8 = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
     xmm9 = _mm256_setzero_si256();
     xmm10 = _mm256_set1_epi32(8);
     xmm3 = _mm256_setzero_ps();
 
-    __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
+    __m256i idx = _mm256_setr_epi32(0, 1, 4, 5, 2, 3, 6, 7);
 
     for (; i < bound; ++i) {
         xmm1 = _mm256_loadu_ps((float*)src0);
@@ -449,75 +366,26 @@ volk_32fc_index_max_32u_u_avx2(uint32_t* target, lv_32fc_t* src0, uint32_t num_p
         xmm8 = _mm256_add_epi32(xmm8, xmm10);
     }
 
-    xmm10 = _mm256_set1_epi32(4);
-    if (num_bytes >> 4 & 1) {
-        xmm1 = _mm256_loadu_ps((float*)src0);
-
-        xmm1 = _mm256_mul_ps(xmm1, xmm1);
-
-        src0 += 4;
-
-        xmm1 = _mm256_hadd_ps(xmm1, xmm1);
-
-        xmm3 = _mm256_max_ps(xmm1, xmm3);
-
-        xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS);
-        xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ);
-
-        xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec);
-        xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec);
-
-        xmm9 = _mm256_add_epi32(xmm11, xmm12);
+    _mm256_storeu_ps((float*)&(holderf.f), xmm3);
+    _mm256_storeu_si256(&(holderi.int_vec), xmm9);
 
-        xmm8 = _mm256_add_epi32(xmm8, xmm10);
+    for (i = 0; i < 8; i++) {
+        if (holderf.f[i] > max) {
+            index = holderi.i[i];
+            max = holderf.f[i];
+        }
     }
 
-    idx = _mm256_set_epi32(1, 0, 1, 0, 1, 0, 1, 0);
-    xmm10 = _mm256_set1_epi32(2);
-    if (num_bytes >> 4 & 1) {
-        xmm2 = _mm256_loadu_ps((float*)src0);
-
-        xmm1 = _mm256_permutevar8x32_ps(bit256_p(&xmm8)->float_vec, idx);
-        xmm8 = bit256_p(&xmm1)->int_vec;
-
-        xmm2 = _mm256_mul_ps(xmm2, xmm2);
-
-        src0 += 2;
-
-        xmm1 = _mm256_hadd_ps(xmm2, xmm2);
-
-        xmm3 = _mm256_max_ps(xmm1, xmm3);
-
-        xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS);
-        xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ);
-
-        xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec);
-        xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec);
-
-        xmm9 = _mm256_add_epi32(xmm11, xmm12);
+    for (i = bound * 8; i < num_points; i++, src0++) {
+        sq_dist =
+            lv_creal(src0[0]) * lv_creal(src0[0]) + lv_cimag(src0[0]) * lv_cimag(src0[0]);
 
-        xmm8 = _mm256_add_epi32(xmm8, xmm10);
+        if (sq_dist > max) {
+            index = i;
+            max = sq_dist;
+        }
     }
-
-    _mm256_storeu_ps((float*)&(holderf.f), xmm3);
-    _mm256_storeu_si256(&(holderi.int_vec), xmm9);
-
-    target[0] = holderi.i[0];
-    sq_dist = holderf.f[0];
-    target[0] = (holderf.f[1] > sq_dist) ? holderi.i[1] : target[0];
-    sq_dist = (holderf.f[1] > sq_dist) ? holderf.f[1] : sq_dist;
-    target[0] = (holderf.f[2] > sq_dist) ? holderi.i[2] : target[0];
-    sq_dist = (holderf.f[2] > sq_dist) ? holderf.f[2] : sq_dist;
-    target[0] = (holderf.f[3] > sq_dist) ? holderi.i[3] : target[0];
-    sq_dist = (holderf.f[3] > sq_dist) ? holderf.f[3] : sq_dist;
-    target[0] = (holderf.f[4] > sq_dist) ? holderi.i[4] : target[0];
-    sq_dist = (holderf.f[4] > sq_dist) ? holderf.f[4] : sq_dist;
-    target[0] = (holderf.f[5] > sq_dist) ? holderi.i[5] : target[0];
-    sq_dist = (holderf.f[5] > sq_dist) ? holderf.f[5] : sq_dist;
-    target[0] = (holderf.f[6] > sq_dist) ? holderi.i[6] : target[0];
-    sq_dist = (holderf.f[6] > sq_dist) ? holderf.f[6] : sq_dist;
-    target[0] = (holderf.f[7] > sq_dist) ? holderi.i[7] : target[0];
-    sq_dist = (holderf.f[7] > sq_dist) ? holderf.f[7] : sq_dist;
+    target[0] = index;
 }
 
 #endif /*LV_HAVE_AVX2*/
-- 
2.30.2
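
As a quick way to exercise the fixed kernels, a self-contained check
(assuming only the public VOLK API; sizes and data are arbitrary, and the
odd lengths exercise the new scalar tail loop) that compares the
dispatcher's answer with a direct scalar argmax over |z|^2:

    #include <volk/volk.h>
    #include <stdio.h>
    #include <stdlib.h>

    int main(void)
    {
        const uint32_t sizes[] = { 1, 7, 8, 37, 1000 };
        for (size_t t = 0; t < sizeof(sizes) / sizeof(sizes[0]); t++) {
            const uint32_t n = sizes[t];
            lv_32fc_t* in =
                (lv_32fc_t*)volk_malloc(n * sizeof(lv_32fc_t), volk_get_alignment());
            for (uint32_t k = 0; k < n; k++)
                in[k] = lv_cmake((float)rand() / RAND_MAX, (float)rand() / RAND_MAX);

            /* scalar reference: first index with the largest |z|^2 */
            uint32_t ref = 0;
            float best = 0.0f;
            for (uint32_t k = 0; k < n; k++) {
                float m = lv_creal(in[k]) * lv_creal(in[k]) +
                          lv_cimag(in[k]) * lv_cimag(in[k]);
                if (m > best) {
                    best = m;
                    ref = k;
                }
            }

            uint32_t got = 0;
            volk_32fc_index_max_32u(&got, in, n);
            printf("n=%4u: dispatcher=%u reference=%u %s\n",
                   (unsigned)n, (unsigned)got, (unsigned)ref,
                   got == ref ? "OK" : "MISMATCH");
            volk_free(in);
        }
        return 0;
    }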