From 36d5480c06f26bb737a44c4c6d8d11aa2e731685 Mon Sep 17 00:00:00 2001 From: "Brandon P. Enochs" Date: Tue, 3 Apr 2018 16:22:16 -0400 Subject: [PATCH] Added an AVX protokernel for volk_32fc_x2_32f_square_dist_scalar_mult_32f. Gbp-Pq: Name 0002-Added-an-AVX-protokernel-for-volk_32fc_x2_32f_square.patch --- ...32fc_x2_s32f_square_dist_scalar_mult_32f.h | 63 ++++++++++++++++++- 1 file changed, 60 insertions(+), 3 deletions(-) diff --git a/kernels/volk/volk_32fc_x2_s32f_square_dist_scalar_mult_32f.h b/kernels/volk/volk_32fc_x2_s32f_square_dist_scalar_mult_32f.h index e983578..6180337 100644 --- a/kernels/volk/volk_32fc_x2_s32f_square_dist_scalar_mult_32f.h +++ b/kernels/volk/volk_32fc_x2_s32f_square_dist_scalar_mult_32f.h @@ -84,6 +84,65 @@ #include #include +#ifdef LV_HAVE_AVX +#include + +static inline void +volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_avx( + float *target, lv_32fc_t *src0, lv_32fc_t *points, + float scalar, unsigned int num_points) { + static const unsigned int work_size = 8; + unsigned int avx_work_size = num_points / work_size * work_size; + int i = 0; + + for (; i < avx_work_size; i += work_size) { + lv_32fc_t src = *src0; + __m256 source = _mm256_setr_ps( + lv_creal(src), lv_cimag(src), + lv_creal(src), lv_cimag(src), + lv_creal(src), lv_cimag(src), + lv_creal(src), lv_cimag(src) + ); + __m256 points_low = _mm256_load_ps((const float *) points); + __m256 points_high = _mm256_load_ps((const float *) (points + work_size / 2)); + __m256 difference_low = _mm256_sub_ps(source, points_low); + __m256 difference_high = _mm256_sub_ps(source, points_high); + + difference_low = _mm256_mul_ps(difference_low, difference_low); + difference_high = _mm256_mul_ps(difference_high, difference_high); + + __m256 magnitudes_squared = _mm256_hadd_ps(difference_low, difference_high); + __m128 lower_magnitudes_squared_bottom = _mm256_extractf128_ps(magnitudes_squared, 0); + __m128 upper_magnitudes_squared_top = _mm256_extractf128_ps(magnitudes_squared, 1); + __m256 lower_magnitudes_squared = _mm256_castps128_ps256(lower_magnitudes_squared_bottom); + + lower_magnitudes_squared = _mm256_insertf128_ps( + lower_magnitudes_squared, _mm_permute_ps(lower_magnitudes_squared_bottom, 0x4E), 1 + ); + + __m256 upper_magnitudes_squared = _mm256_castps128_ps256(upper_magnitudes_squared_top); + + upper_magnitudes_squared = _mm256_insertf128_ps(upper_magnitudes_squared, upper_magnitudes_squared_top, 1); + upper_magnitudes_squared_top = _mm_permute_ps(upper_magnitudes_squared_top, 0x4E); + upper_magnitudes_squared = _mm256_insertf128_ps(upper_magnitudes_squared, upper_magnitudes_squared_top, 0); + + __m256 ordered_magnitudes_squared = _mm256_blend_ps(lower_magnitudes_squared, upper_magnitudes_squared, 0xCC); + __m256 scalars = _mm256_set1_ps(scalar); + __m256 output = _mm256_mul_ps(ordered_magnitudes_squared, scalars); + + _mm256_store_ps(target, output); + target += work_size; + points += work_size; + } + for (; i < num_points; ++i) { + lv_32fc_t diff = src0[0] - points[i]; + + target[i] = scalar * (lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff)); + } +} + +#endif /* LV_HAVE_AVX */ + #ifdef LV_HAVE_SSE3 #include #include @@ -183,13 +242,11 @@ static inline void volk_32fc_x2_s32f_square_dist_scalar_mult_32f_generic(float* target, lv_32fc_t* src0, lv_32fc_t* points, float scalar, unsigned int num_points) { - const unsigned int num_bytes = num_points*8; - lv_32fc_t diff; float sq_dist; unsigned int i = 0; - for(; i < num_bytes >> 3; ++i) { + for(; i < num_points; ++i) { diff = src0[0] - points[i]; sq_dist = scalar * (lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff)); -- 2.30.2