# define __VOLK_ATTR_EXPORT
# define __VOLK_ATTR_IMPORT
# endif
+# define __VOLK_PREFETCH(addr) __builtin_prefetch(addr) // GCC/Clang: software prefetch hint
#elif _MSC_VER
# define __VOLK_ATTR_ALIGNED(x) __declspec(align(x))
# define __VOLK_ATTR_UNUSED
# define __VOLK_ATTR_DEPRECATED __declspec(deprecated)
# define __VOLK_ATTR_EXPORT __declspec(dllexport)
# define __VOLK_ATTR_IMPORT __declspec(dllimport)
+# define __VOLK_PREFETCH(addr) // MSVC has no __builtin_prefetch; expands to nothing
#else
# define __VOLK_ATTR_ALIGNED(x)
# define __VOLK_ATTR_UNUSED
# define __VOLK_ATTR_DEPRECATED
# define __VOLK_ATTR_EXPORT
# define __VOLK_ATTR_IMPORT
+# define __VOLK_PREFETCH(addr) // unknown compiler: no prefetch hint
#endif
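/* The hunk above introduces __VOLK_PREFETCH as a portable wrapper: on
   GCC/Clang it forwards to __builtin_prefetch, and on MSVC or unknown
   compilers it expands to nothing, so the call sites below compile
   everywhere without per-compiler #ifdefs. A minimal sketch of the
   pattern, not from the patch (add_f32 is a hypothetical kernel name): */

#include <stddef.h>

#if defined(__GNUC__)
# define __VOLK_PREFETCH(addr) __builtin_prefetch(addr)
#else
# define __VOLK_PREFETCH(addr) /* no-op where no builtin exists */
#endif

static void add_f32(float* out, const float* a, const float* b, size_t n)
{
    size_t i;
    for (i = 0; i < n; i++) {
        __VOLK_PREFETCH(a + i + 16); /* hint the cache line 64 bytes ahead */
        __VOLK_PREFETCH(b + i + 16);
        out[i] = a[i] + b[i];
    }
}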
////////////////////////////////////////////////////////////////////////
for(number=0; number < eighth_points; ++number) {
input_vec = vld1q_s16(src0);
- __builtin_prefetch(src0+16);
+ __VOLK_PREFETCH(src0+16);
diff = vsubq_s16(candidate_vec, input_vec);
comp1 = vcgeq_s16(diff, zeros);
comp2 = vcltq_s16(diff, zeros);
zeros = veorq_s16(zeros, zeros);
for(number=0; number < eighth_points; ++number) {
input_vec = vld2q_s16(src0);
- //__builtin_prefetch(src0+16);
+ //__VOLK_PREFETCH(src0+16);
diff = vsubq_s16(input_vec.val[0], input_vec.val[1]);
comp1 = vcgeq_s16(diff, zeros);
comp2 = vcltq_s16(diff, zeros);
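/* In the two hunks above, the complementary masks from vcgeq_s16 and
   vcltq_s16 drive a branch-free select. An illustrative scalar analogue,
   not from the patch (select_max is a hypothetical name): */

#include <stdint.h>

static int16_t select_max(int16_t a, int16_t b)
{
    int16_t mask = (int16_t)-(a >= b);          /* all ones if a >= b, else 0 */
    return (int16_t)((a & mask) | (b & ~mask)); /* picks a or b without a branch */
}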
for(number = 0; number < sse_iters; number++)
{
a16x4 = vld1_s16((const int16_t*)_in);
- __builtin_prefetch(_in + 4);
+ __VOLK_PREFETCH(_in + 4);
a32x4 = vmovl_s16(a16x4);
f32x4 = vcvtq_f32_s32(a32x4);
vst1q_f32((float32_t*)_out, f32x4);
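/* Note that the prefetch distance is in elements of the pointed-to type:
   _in + 4 above addresses the element four positions ahead, and the
   builtin fetches the cache line containing it. GCC/Clang's builtin also
   accepts optional read/write and temporal-locality hints; a hypothetical
   variant macro, not part of this patch, could expose them: */

#if defined(__GNUC__)
# define __VOLK_PREFETCH_LOCALITY(addr, rw, locality) \
      __builtin_prefetch((addr), (rw), (locality))
#else
# define __VOLK_PREFETCH_LOCALITY(addr, rw, locality)
#endif

/* e.g. a read-only stream whose data is not reused after this pass: */
/* __VOLK_PREFETCH_LOCALITY(_in + 4, 0, 0); */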
{
// a[127:0]=[a3.i,a3.r,a2.i,a2.r,a1.i,a1.r,a0.i,a0.r]
a = _mm_load_si128((__m128i*)_in_a); // load (2-byte imag, 2-byte real) x 4 into a 128-bit register
- __builtin_prefetch(_in_a + 8);
+ __VOLK_PREFETCH(_in_a + 8);
b = _mm_load_si128((__m128i*)_in_b);
- __builtin_prefetch(_in_b + 8);
+ __VOLK_PREFETCH(_in_b + 8);
c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, ....
c_sr = _mm_srli_si128(c, 2); // shift c right by 2 bytes, shifting in zeros, so each imag*imag product lines up with its real*real product
{
// a[127:0]=[a3.i,a3.r,a2.i,a2.r,a1.i,a1.r,a0.i,a0.r]
a = _mm_loadu_si128((__m128i*)_in_a); // load (2-byte imag, 2-byte real) x 4 into a 128-bit register
- __builtin_prefetch(_in_a + 8);
+ __VOLK_PREFETCH(_in_a + 8);
b = _mm_loadu_si128((__m128i*)_in_b);
- __builtin_prefetch(_in_b + 8);
+ __VOLK_PREFETCH(_in_b + 8);
c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, ....
c_sr = _mm_srli_si128(c, 2); // shift c right by 2 bytes, shifting in zeros, so each imag*imag product lines up with its real*real product
for(number = 0; number < avx_iters; number++)
{
a = _mm256_loadu_si256((__m256i*)_in_a);
- __builtin_prefetch(_in_a + 16);
+ __VOLK_PREFETCH(_in_a + 16);
b = _mm256_loadu_si256((__m256i*)_in_b);
- __builtin_prefetch(_in_b + 16);
+ __VOLK_PREFETCH(_in_b + 16);
c = _mm256_mullo_epi16(a, b);
c_sr = _mm256_srli_si256(c, 2); // shift each 128-bit lane of c right by 2 bytes, shifting in zeros
for(number = 0; number < avx_iters; number++)
{
a = _mm256_load_si256((__m256i*)_in_a);
- __builtin_prefetch(_in_a + 16);
+ __VOLK_PREFETCH(_in_a + 16);
b = _mm256_load_si256((__m256i*)_in_b);
- __builtin_prefetch(_in_b + 16);
+ __VOLK_PREFETCH(_in_b + 16);
c = _mm256_mullo_epi16(a, b);
c_sr = _mm256_srli_si256(c, 2); // shift each 128-bit lane of c right by 2 bytes, shifting in zeros
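/* Scalar view of what the four mullo/srli hunks above compute, for
   reference only (cplx16 and cmul16 are hypothetical names): each partial
   product keeps only its low 16 bits, matching _mm_mullo_epi16 and
   _mm256_mullo_epi16, and the 2-byte shift aligns the imag*imag lanes
   with the real*real lanes before they are combined. */

#include <stdint.h>

typedef struct { int16_t r, i; } cplx16;

static cplx16 cmul16(cplx16 a, cplx16 b)
{
    cplx16 c;
    c.r = (int16_t)(a.r * b.r - a.i * b.i); /* wraps to 16 bits like mullo */
    c.i = (int16_t)(a.r * b.i + a.i * b.r);
    return c;
}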
{
a_val = vld2_s16((int16_t*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
b_val = vld2_s16((int16_t*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
- __builtin_prefetch(a_ptr + 8);
- __builtin_prefetch(b_ptr + 8);
+ __VOLK_PREFETCH(a_ptr + 8);
+ __VOLK_PREFETCH(b_ptr + 8);
// multiply the real*real and imag*imag to get real result
// a0r*b0r|a1r*b1r|a2r*b2r|a3r*b3r
{
a_val = vld2_s16((int16_t*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
b_val = vld2_s16((int16_t*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
- __builtin_prefetch(a_ptr + 8);
- __builtin_prefetch(b_ptr + 8);
+ __VOLK_PREFETCH(a_ptr + 8);
+ __VOLK_PREFETCH(b_ptr + 8);
tmp.val[0] = vmul_s16(a_val.val[0], b_val.val[0]);
tmp.val[1] = vmul_s16(a_val.val[1], b_val.val[0]);
{
a_val = vld2_s16((int16_t*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
b_val = vld2_s16((int16_t*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
- __builtin_prefetch(a_ptr + 8);
- __builtin_prefetch(b_ptr + 8);
+ __VOLK_PREFETCH(a_ptr + 8);
+ __VOLK_PREFETCH(b_ptr + 8);
// use 2 accumulators to remove inter-instruction data dependencies
accumulator1.val[0] = vmla_s16(accumulator1.val[0], a_val.val[0], b_val.val[0]);
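/* The comment above states the technique: two independent accumulators
   break the loop-carried dependency, so one multiply-accumulate can issue
   while the other waits out its latency. An illustrative scalar analogue,
   not from the patch (dot_two_acc is a hypothetical name): */

#include <stddef.h>

static float dot_two_acc(const float* a, const float* b, size_t n)
{
    float acc0 = 0.0f, acc1 = 0.0f;
    size_t i;
    for (i = 0; i + 2 <= n; i += 2) {
        acc0 += a[i] * b[i];         /* dependency chain 1 */
        acc1 += a[i + 1] * b[i + 1]; /* dependency chain 2 */
    }
    if (i < n)
        acc0 += a[i] * b[i];         /* odd tail element */
    return acc0 + acc1;              /* fold the partial sums */
}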
{
a_val = vld2_s16((int16_t*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
b_val = vld2_s16((int16_t*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
- __builtin_prefetch(a_ptr + 4);
- __builtin_prefetch(b_ptr + 4);
+ __VOLK_PREFETCH(a_ptr + 4);
+ __VOLK_PREFETCH(b_ptr + 4);
// multiply the real*real and imag*imag to get real result
// a0r*b0r|a1r*b1r|a2r*b2r|a3r*b3r
// Load into NEON registers
aVal = vld1q_f32(aPtr);
bVal = vld1q_f32(bPtr);
- __builtin_prefetch(aPtr+4);
- __builtin_prefetch(bPtr+4);
+ __VOLK_PREFETCH(aPtr+4);
+ __VOLK_PREFETCH(bPtr+4);
// vector add
cVal = vaddq_f32(aVal, bVal);
const lv_32fc_t* a = aVector;
for(number=0; number < quarterPoints; number++){
- __builtin_prefetch(a+4);
+ __VOLK_PREFETCH(a+4);
x = vld2q_f32((float*)a); // Load the complex data as ar,br,cr,dr; ai,bi,ci,di
// xor the imaginary lane
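/* The conjugate kernel above flips the sign of the imaginary lanes by
   XORing their IEEE-754 sign bits. An illustrative scalar equivalent,
   not from the patch (negate_sign_bit is a hypothetical name): */

#include <stdint.h>
#include <string.h>

static float negate_sign_bit(float x)
{
    uint32_t bits;
    memcpy(&bits, &x, sizeof bits); /* bit-cast without aliasing UB */
    bits ^= 0x80000000u;            /* flip the sign bit only */
    memcpy(&x, &bits, sizeof x);
    return x;
}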
{
inputVal1 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4;
inputVal2 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4;
- __builtin_prefetch(inputVectorPtr + 8);
+ __VOLK_PREFETCH(inputVectorPtr + 8);
// Clip
ret1 = _mm_max_ps(_mm_min_ps(inputVal1, vmax_val), vmin_val);
{
inputVal1 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4;
inputVal2 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4;
- __builtin_prefetch(inputVectorPtr + 8);
+ __VOLK_PREFETCH(inputVectorPtr + 8);
// Clip
ret1 = _mm_max_ps(_mm_min_ps(inputVal1, vmax_val), vmin_val);
{
a = vld1q_f32((const float32_t*)(inputVectorPtr)); inputVectorPtr += 4;
b = vld1q_f32((const float32_t*)(inputVectorPtr)); inputVectorPtr += 4;
- __builtin_prefetch(inputVectorPtr + 8);
+ __VOLK_PREFETCH(inputVectorPtr + 8);
ret1 = vmaxq_f32(vminq_f32(a, max_val), min_val);
ret2 = vmaxq_f32(vminq_f32(b, max_val), min_val);
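/* The three convert hunks above clamp to [vmin, vmax] before the
   float-to-integer conversion so out-of-range inputs saturate instead of
   wrapping. A scalar equivalent of the clip step, for illustration only
   (clip_to_s16 is a hypothetical name): */

static short clip_to_s16(float x, float vmin, float vmax)
{
    if (x > vmax) x = vmax; /* clamp high */
    if (x < vmin) x = vmin; /* clamp low */
    return (short)x;        /* truncation is now in range */
}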
for(number = 0; number < quarter_points; ++number) {
a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
- __builtin_prefetch(a_ptr+8);
- __builtin_prefetch(b_ptr+8);
+ __VOLK_PREFETCH(a_ptr+8);
+ __VOLK_PREFETCH(b_ptr+8);
// do the first multiply
tmp_imag.val[1] = vmulq_f32(a_val.val[1], b_val.val[0]);
for(number = 0; number < quarter_points; ++number) {
a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
- __builtin_prefetch(a_ptr+8);
- __builtin_prefetch(b_ptr+8);
+ __VOLK_PREFETCH(a_ptr+8);
+ __VOLK_PREFETCH(b_ptr+8);
// multiply the real*real and imag*imag to get real result
// a0r*b0r|a1r*b1r|a2r*b2r|a3r*b3r
for(number = 0; number < quarter_points; ++number) {
a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
- __builtin_prefetch(a_ptr+8);
- __builtin_prefetch(b_ptr+8);
+ __VOLK_PREFETCH(a_ptr+8);
+ __VOLK_PREFETCH(b_ptr+8);
// do the first multiply
tmp_imag.val[1] = vmulq_f32(a_val.val[1], b_val.val[0]);
for(number = 0; number < quarter_points; ++number) {
a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
- __builtin_prefetch(a_ptr+8);
- __builtin_prefetch(b_ptr+8);
+ __VOLK_PREFETCH(a_ptr+8);
+ __VOLK_PREFETCH(b_ptr+8);
// use 2 accumulators to remove inter-instruction data dependencies
accumulator1.val[0] = vmlaq_f32(accumulator1.val[0], a_val.val[0], b_val.val[0]);
for(number = 0; number < quarter_points; ++number) {
a_val = vld4q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
b_val = vld4q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
- __builtin_prefetch(a_ptr+8);
- __builtin_prefetch(b_ptr+8);
+ __VOLK_PREFETCH(a_ptr+8);
+ __VOLK_PREFETCH(b_ptr+8);
// use 2 accumulators to remove inter-instruction data dependencies
accumulator1.val[0] = vmlaq_f32(accumulator1.val[0], a_val.val[0], b_val.val[0]);
for(number = 0; number < quarter_points; ++number) {
a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
- __builtin_prefetch(a_ptr+4);
- __builtin_prefetch(b_ptr+4);
+ __VOLK_PREFETCH(a_ptr+4);
+ __VOLK_PREFETCH(b_ptr+4);
// multiply the real*real and imag*imag to get real result
// a0r*b0r|a1r*b1r|a2r*b2r|a3r*b3r
for(number = 0; number < quarter_points; ++number) {
a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
- __builtin_prefetch(a_ptr+4);
- __builtin_prefetch(b_ptr+4);
+ __VOLK_PREFETCH(a_ptr+4);
+ __VOLK_PREFETCH(b_ptr+4);
// do the first multiply
tmp_imag.val[1] = vmulq_f32(a_val.val[1], b_val.val[0]);
a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
b_val.val[1] = vnegq_f32(b_val.val[1]);
- __builtin_prefetch(a_ptr+4);
- __builtin_prefetch(b_ptr+4);
+ __VOLK_PREFETCH(a_ptr+4);
+ __VOLK_PREFETCH(b_ptr+4);
// multiply the real*real and imag*imag to get real result
// a0r*b0r|a1r*b1r|a2r*b2r|a3r*b3r
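/* The final hunk multiplies by a conjugate: vnegq_f32 negates b's
   imaginary lanes, after which an ordinary complex multiply yields
   a * conj(b). An illustrative scalar analogue, not from the patch
   (cplx32 and cmul_conj are hypothetical names): */

typedef struct { float r, i; } cplx32;

static cplx32 cmul_conj(cplx32 a, cplx32 b)
{
    cplx32 c;
    b.i = -b.i;                  /* conj(b), as vnegq_f32 does per lane */
    c.r = a.r * b.r - a.i * b.i;
    c.i = a.r * b.i + a.i * b.r;
    return c;
}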