Added an AVX protokernel for volk_32fc_x2_32f_square_dist_scalar_mult_32f.
authorBrandon P. Enochs <brandon.enochs@nrl.navy.mil>
Tue, 3 Apr 2018 20:22:16 +0000 (16:22 -0400)
committerA. Maitland Bottoms <bottoms@debian.org>
Sat, 12 May 2018 19:25:04 +0000 (20:25 +0100)
Gbp-Pq: Name 0002-Added-an-AVX-protokernel-for-volk_32fc_x2_32f_square.patch

kernels/volk/volk_32fc_x2_s32f_square_dist_scalar_mult_32f.h

index e9835782069f075be68d055d29f29e2bdfd7069a..6180337d25c52a38183cbe04f35e047a946c8d9a 100644 (file)
 #include<volk/volk_complex.h>
 #include <string.h>
 
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+
+static inline void
+volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_avx(
+        float *target, lv_32fc_t *src0, lv_32fc_t *points,
+        float scalar, unsigned int num_points) {
+  static const unsigned int work_size = 8;
+  unsigned int avx_work_size = num_points / work_size * work_size;
+  int i = 0;
+
+  for (; i < avx_work_size; i += work_size) {
+    lv_32fc_t src = *src0;
+    __m256 source = _mm256_setr_ps(
+            lv_creal(src), lv_cimag(src),
+            lv_creal(src), lv_cimag(src),
+            lv_creal(src), lv_cimag(src),
+            lv_creal(src), lv_cimag(src)
+    );
+    __m256 points_low = _mm256_load_ps((const float *) points);
+    __m256 points_high = _mm256_load_ps((const float *) (points + work_size / 2));
+    __m256 difference_low = _mm256_sub_ps(source, points_low);
+    __m256 difference_high = _mm256_sub_ps(source, points_high);
+
+    difference_low = _mm256_mul_ps(difference_low, difference_low);
+    difference_high = _mm256_mul_ps(difference_high, difference_high);
+
+    __m256 magnitudes_squared = _mm256_hadd_ps(difference_low, difference_high);
+    __m128 lower_magnitudes_squared_bottom = _mm256_extractf128_ps(magnitudes_squared, 0);
+    __m128 upper_magnitudes_squared_top = _mm256_extractf128_ps(magnitudes_squared, 1);
+    __m256 lower_magnitudes_squared = _mm256_castps128_ps256(lower_magnitudes_squared_bottom);
+
+    lower_magnitudes_squared = _mm256_insertf128_ps(
+            lower_magnitudes_squared, _mm_permute_ps(lower_magnitudes_squared_bottom, 0x4E), 1
+    );
+
+    __m256 upper_magnitudes_squared = _mm256_castps128_ps256(upper_magnitudes_squared_top);
+
+    upper_magnitudes_squared = _mm256_insertf128_ps(upper_magnitudes_squared, upper_magnitudes_squared_top, 1);
+    upper_magnitudes_squared_top = _mm_permute_ps(upper_magnitudes_squared_top, 0x4E);
+    upper_magnitudes_squared = _mm256_insertf128_ps(upper_magnitudes_squared, upper_magnitudes_squared_top, 0);
+
+    __m256 ordered_magnitudes_squared = _mm256_blend_ps(lower_magnitudes_squared, upper_magnitudes_squared, 0xCC);
+    __m256 scalars = _mm256_set1_ps(scalar);
+    __m256 output = _mm256_mul_ps(ordered_magnitudes_squared, scalars);
+
+    _mm256_store_ps(target, output);
+    target += work_size;
+    points += work_size;
+  }
+  for (; i < num_points; ++i) {
+    lv_32fc_t diff = src0[0] - points[i];
+
+    target[i] = scalar * (lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff));
+  }
+}
+
+#endif /* LV_HAVE_AVX */
+
 #ifdef LV_HAVE_SSE3
 #include<xmmintrin.h>
 #include<pmmintrin.h>
@@ -183,13 +242,11 @@ static inline void
 volk_32fc_x2_s32f_square_dist_scalar_mult_32f_generic(float* target, lv_32fc_t* src0, lv_32fc_t* points,
                                                       float scalar, unsigned int num_points)
 {
-  const unsigned int num_bytes = num_points*8;
-
   lv_32fc_t diff;
   float sq_dist;
   unsigned int i = 0;
 
-  for(; i < num_bytes >> 3; ++i) {
+  for(; i < num_points; ++i) {
     diff = src0[0] - points[i];
 
     sq_dist = scalar * (lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff));