added __VOLK_PREFETCH() compatibility macro
author Josh Blum <josh@joshknows.com>
Fri, 20 Jan 2017 18:03:49 +0000 (10:03 -0800)
committer A. Maitland Bottoms <bottoms@debian.org>
Sun, 4 Feb 2018 18:12:21 +0000 (18:12 +0000)
__VOLK_PREFETCH() expands to __builtin_prefetch() on GCC-compatible
compilers and is a no-op elsewhere. Direct uses of __builtin_prefetch
were replaced with __VOLK_PREFETCH() to make the kernels portable.

Gbp-Pq: Name 0007-added-__VOLK_PREFETCH-compatibility-macro.patch

13 files changed:
include/volk/volk_common.h
kernels/volk/volk_16i_max_star_16i.h
kernels/volk/volk_16i_max_star_horizontal_16i.h
kernels/volk/volk_16ic_convert_32fc.h
kernels/volk/volk_16ic_x2_dot_prod_16ic.h
kernels/volk/volk_16ic_x2_multiply_16ic.h
kernels/volk/volk_32f_x2_add_32f.h
kernels/volk/volk_32fc_conjugate_32fc.h
kernels/volk/volk_32fc_convert_16ic.h
kernels/volk/volk_32fc_x2_conjugate_dot_prod_32fc.h
kernels/volk/volk_32fc_x2_dot_prod_32fc.h
kernels/volk/volk_32fc_x2_multiply_32fc.h
kernels/volk/volk_32fc_x2_multiply_conjugate_32fc.h

index 4d35f5ca31fef98e2861d4bf0d6fc8e9ae23552f..a53b13904508d4394b91461e301e4b692f47fa77 100644 (file)
@@ -16,6 +16,7 @@
 #    define __VOLK_ATTR_EXPORT
 #    define __VOLK_ATTR_IMPORT
 #  endif
+#  define __VOLK_PREFETCH(addr)  __builtin_prefetch(addr)
 #elif _MSC_VER
 #  define __VOLK_ATTR_ALIGNED(x) __declspec(align(x))
 #  define __VOLK_ATTR_UNUSED
@@ -23,6 +24,7 @@
 #  define __VOLK_ATTR_DEPRECATED __declspec(deprecated)
 #  define __VOLK_ATTR_EXPORT     __declspec(dllexport)
 #  define __VOLK_ATTR_IMPORT     __declspec(dllimport)
+#  define __VOLK_PREFETCH(addr)
 #else
 #  define __VOLK_ATTR_ALIGNED(x)
 #  define __VOLK_ATTR_UNUSED
@@ -30,6 +32,7 @@
 #  define __VOLK_ATTR_DEPRECATED
 #  define __VOLK_ATTR_EXPORT
 #  define __VOLK_ATTR_IMPORT
+#  define __VOLK_PREFETCH(addr)
 #endif
 
 ////////////////////////////////////////////////////////////////////////
index e4706422e5a497d05303bfef5072c9ce354cd8ae..531a8b5bd1d0502e69e8ec9810daa3c01efb51a2 100644 (file)
@@ -139,7 +139,7 @@ volk_16i_max_star_16i_neon(short* target, short* src0, unsigned int num_points)
 
   for(number=0; number < eighth_points; ++number) {
     input_vec = vld1q_s16(src0);
-    __builtin_prefetch(src0+16);
+    __VOLK_PREFETCH(src0+16);
     diff = vsubq_s16(candidate_vec, input_vec);
     comp1 = vcgeq_s16(diff, zeros);
     comp2 = vcltq_s16(diff, zeros);
index 1da83569675e4fc007cd41632736e3c02a4d50d9..964587c4207f7671e2481c7159a2bd6b3d79ce66 100644 (file)
@@ -169,7 +169,7 @@ volk_16i_max_star_horizontal_16i_neon(int16_t* target, int16_t* src0, unsigned i
   zeros = veorq_s16(zeros, zeros);
   for(number=0; number < eighth_points; ++number) {
     input_vec = vld2q_s16(src0);
-    //__builtin_prefetch(src0+16);
+    //__VOLK_PREFETCH(src0+16);
     diff = vsubq_s16(input_vec.val[0], input_vec.val[1]);
     comp1 = vcgeq_s16(diff, zeros);
     comp2 = vcltq_s16(diff, zeros);
index 88e079d920909adf8cb3821c84c92810a9927926..9779b0f0a4b8baaff60e0369565e0b569cbf585c 100644 (file)
@@ -198,7 +198,7 @@ static inline void volk_16ic_convert_32fc_neon(lv_32fc_t* outputVector, const lv
     for(number = 0; number < sse_iters; number++)
         {
             a16x4 = vld1_s16((const int16_t*)_in);
-            __builtin_prefetch(_in + 4);
+            __VOLK_PREFETCH(_in + 4);
             a32x4 = vmovl_s16(a16x4);
             f32x4 = vcvtq_f32_s32(a32x4);
             vst1q_f32((float32_t*)_out, f32x4);
index 9d4c882bad37aee2201964b027786c34b8f0c146..8e6de4c83c21276248660c4c114fb0beb7b1c393 100644 (file)
@@ -96,9 +96,9 @@ static inline void volk_16ic_x2_dot_prod_16ic_a_sse2(lv_16sc_t* out, const lv_16
                 {
                     // a[127:0]=[a3.i,a3.r,a2.i,a2.r,a1.i,a1.r,a0.i,a0.r]
                     a = _mm_load_si128((__m128i*)_in_a); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg
-                    __builtin_prefetch(_in_a + 8);
+                    __VOLK_PREFETCH(_in_a + 8);
                     b = _mm_load_si128((__m128i*)_in_b);
-                    __builtin_prefetch(_in_b + 8);
+                    __VOLK_PREFETCH(_in_b + 8);
                     c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, ....
 
                     c_sr = _mm_srli_si128(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst.
@@ -173,9 +173,9 @@ static inline void volk_16ic_x2_dot_prod_16ic_u_sse2(lv_16sc_t* out, const lv_16
                 {
                     // a[127:0]=[a3.i,a3.r,a2.i,a2.r,a1.i,a1.r,a0.i,a0.r]
                     a = _mm_loadu_si128((__m128i*)_in_a); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg
-                    __builtin_prefetch(_in_a + 8);
+                    __VOLK_PREFETCH(_in_a + 8);
                     b = _mm_loadu_si128((__m128i*)_in_b);
-                    __builtin_prefetch(_in_b + 8);
+                    __VOLK_PREFETCH(_in_b + 8);
                     c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, ....
 
                     c_sr = _mm_srli_si128(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst.
@@ -248,9 +248,9 @@ static inline void volk_16ic_x2_dot_prod_16ic_u_axv2(lv_16sc_t* out, const lv_16
             for(number = 0; number < avx_iters; number++)
                 {
                     a = _mm256_loadu_si256((__m256i*)_in_a);
-                    __builtin_prefetch(_in_a + 16);
+                    __VOLK_PREFETCH(_in_a + 16);
                     b = _mm256_loadu_si256((__m256i*)_in_b);
-                    __builtin_prefetch(_in_b + 16);
+                    __VOLK_PREFETCH(_in_b + 16);
                     c = _mm256_mullo_epi16(a, b);
 
                     c_sr = _mm256_srli_si256(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst.
@@ -324,9 +324,9 @@ static inline void volk_16ic_x2_dot_prod_16ic_a_axv2(lv_16sc_t* out, const lv_16
             for(number = 0; number < avx_iters; number++)
                 {
                     a = _mm256_load_si256((__m256i*)_in_a);
-                    __builtin_prefetch(_in_a + 16);
+                    __VOLK_PREFETCH(_in_a + 16);
                     b = _mm256_load_si256((__m256i*)_in_b);
-                    __builtin_prefetch(_in_b + 16);
+                    __VOLK_PREFETCH(_in_b + 16);
                     c = _mm256_mullo_epi16(a, b);
 
                     c_sr = _mm256_srli_si256(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst.
@@ -399,8 +399,8 @@ static inline void volk_16ic_x2_dot_prod_16ic_neon(lv_16sc_t* out, const lv_16sc
                 {
                     a_val = vld2_s16((int16_t*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
                     b_val = vld2_s16((int16_t*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
-                    __builtin_prefetch(a_ptr + 8);
-                    __builtin_prefetch(b_ptr + 8);
+                    __VOLK_PREFETCH(a_ptr + 8);
+                    __VOLK_PREFETCH(b_ptr + 8);
 
                     // multiply the real*real and imag*imag to get real result
                     // a0r*b0r|a1r*b1r|a2r*b2r|a3r*b3r
@@ -465,8 +465,8 @@ static inline void volk_16ic_x2_dot_prod_16ic_neon_vma(lv_16sc_t* out, const lv_
         {
             a_val = vld2_s16((int16_t*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
             b_val = vld2_s16((int16_t*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
-            __builtin_prefetch(a_ptr + 8);
-            __builtin_prefetch(b_ptr + 8);
+            __VOLK_PREFETCH(a_ptr + 8);
+            __VOLK_PREFETCH(b_ptr + 8);
 
             tmp.val[0] = vmul_s16(a_val.val[0], b_val.val[0]);
             tmp.val[1] = vmul_s16(a_val.val[1], b_val.val[0]);
@@ -519,8 +519,8 @@ static inline void volk_16ic_x2_dot_prod_16ic_neon_optvma(lv_16sc_t* out, const
         {
             a_val = vld2_s16((int16_t*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
             b_val = vld2_s16((int16_t*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
-            __builtin_prefetch(a_ptr + 8);
-            __builtin_prefetch(b_ptr + 8);
+            __VOLK_PREFETCH(a_ptr + 8);
+            __VOLK_PREFETCH(b_ptr + 8);
 
             // use 2 accumulators to remove inter-instruction data dependencies
             accumulator1.val[0] = vmla_s16(accumulator1.val[0], a_val.val[0], b_val.val[0]);
index 17033aecd26f6dd52fbd71531c3809e91930023c..9dcf06f333845433ef403f6470d3eead50c7d45f 100644 (file)
@@ -291,8 +291,8 @@ static inline void volk_16ic_x2_multiply_16ic_neon(lv_16sc_t* out, const lv_16sc
         {
             a_val = vld2_s16((int16_t*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
             b_val = vld2_s16((int16_t*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
-            __builtin_prefetch(a_ptr + 4);
-            __builtin_prefetch(b_ptr + 4);
+            __VOLK_PREFETCH(a_ptr + 4);
+            __VOLK_PREFETCH(b_ptr + 4);
 
             // multiply the real*real and imag*imag to get real result
             // a0r*b0r|a1r*b1r|a2r*b2r|a3r*b3r
index fc9cf5b42ad0c7da47d44d37b095fedd1474c12d..28cf73df88c8e83ec52056282606dd1555f50ed5 100644 (file)
@@ -191,8 +191,8 @@ volk_32f_x2_add_32f_u_neon(float* cVector, const float* aVector,
     // Load in to NEON registers
     aVal = vld1q_f32(aPtr);
     bVal = vld1q_f32(bPtr);
-    __builtin_prefetch(aPtr+4);
-    __builtin_prefetch(bPtr+4);
+    __VOLK_PREFETCH(aPtr+4);
+    __VOLK_PREFETCH(bPtr+4);
 
     // vector add
     cVal = vaddq_f32(aVal, bVal);
index 1fdb6c23d664dab08b407648bb8a1e0619c403e4..6994d0ef5183b84056077f064122a05c9b4eb8fe 100644 (file)
@@ -248,7 +248,7 @@ volk_32fc_conjugate_32fc_a_neon(lv_32fc_t* cVector, const lv_32fc_t* aVector, un
   const lv_32fc_t* a = aVector;
 
   for(number=0; number < quarterPoints; number++){
-    __builtin_prefetch(a+4);
+    __VOLK_PREFETCH(a+4);
     x = vld2q_f32((float*)a); // Load the complex data as ar,br,cr,dr; ai,bi,ci,di
 
     // xor the imaginary lane
index 4f6e6a5869d7fe8b58f39fbfdb2f900e7aa3cc8b..307ab36f3e4876a07eb4baa70a26a25cd1d9e114 100644 (file)
@@ -75,7 +75,7 @@ static inline void volk_32fc_convert_16ic_u_sse2(lv_16sc_t* outputVector, const
         {
             inputVal1 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4;
             inputVal2 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4;
-            __builtin_prefetch(inputVectorPtr + 8);
+            __VOLK_PREFETCH(inputVectorPtr + 8);
 
             // Clip
             ret1 = _mm_max_ps(_mm_min_ps(inputVal1, vmax_val), vmin_val);
@@ -128,7 +128,7 @@ static inline void volk_32fc_convert_16ic_a_sse2(lv_16sc_t* outputVector, const
         {
             inputVal1 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4;
             inputVal2 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4;
-            __builtin_prefetch(inputVectorPtr + 8);
+            __VOLK_PREFETCH(inputVectorPtr + 8);
 
             // Clip
             ret1 = _mm_max_ps(_mm_min_ps(inputVal1, vmax_val), vmin_val);
@@ -184,7 +184,7 @@ static inline void volk_32fc_convert_16ic_neon(lv_16sc_t* outputVector, const lv
         {
             a = vld1q_f32((const float32_t*)(inputVectorPtr)); inputVectorPtr += 4;
             b = vld1q_f32((const float32_t*)(inputVectorPtr)); inputVectorPtr += 4;
-            __builtin_prefetch(inputVectorPtr + 8);
+            __VOLK_PREFETCH(inputVectorPtr + 8);
 
             ret1 = vmaxq_f32(vminq_f32(a, max_val), min_val);
             ret2 = vmaxq_f32(vminq_f32(b, max_val), min_val);
index 981899c193d8bd1efda62021d521cc6389067759..4addf80e8d06a56cb8bfa73364d42c51d56d03a7 100644 (file)
@@ -219,8 +219,8 @@ static inline void volk_32fc_x2_conjugate_dot_prod_32fc_neon(lv_32fc_t* result,
     for(number = 0; number < quarter_points; ++number) {
         a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
         b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
-        __builtin_prefetch(a_ptr+8);
-        __builtin_prefetch(b_ptr+8);
+        __VOLK_PREFETCH(a_ptr+8);
+        __VOLK_PREFETCH(b_ptr+8);
 
         // do the first multiply
         tmp_imag.val[1] = vmulq_f32(a_val.val[1], b_val.val[0]);
index 39d0c789d73868d1ac376d599037e1ecc851929b..0c3271ce21a632efe205efe202af75ad276c4f11 100644 (file)
@@ -894,8 +894,8 @@ static inline void volk_32fc_x2_dot_prod_32fc_neon(lv_32fc_t* result, const lv_3
     for(number = 0; number < quarter_points; ++number) {
         a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
         b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
-        __builtin_prefetch(a_ptr+8);
-        __builtin_prefetch(b_ptr+8);
+        __VOLK_PREFETCH(a_ptr+8);
+        __VOLK_PREFETCH(b_ptr+8);
 
         // multiply the real*real and imag*imag to get real result
         // a0r*b0r|a1r*b1r|a2r*b2r|a3r*b3r
@@ -949,8 +949,8 @@ static inline void volk_32fc_x2_dot_prod_32fc_neon_opttests(lv_32fc_t* result, c
     for(number = 0; number < quarter_points; ++number) {
         a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
         b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
-        __builtin_prefetch(a_ptr+8);
-        __builtin_prefetch(b_ptr+8);
+        __VOLK_PREFETCH(a_ptr+8);
+        __VOLK_PREFETCH(b_ptr+8);
 
         // do the first multiply
         tmp_imag.val[1] = vmulq_f32(a_val.val[1], b_val.val[0]);
@@ -998,8 +998,8 @@ static inline void volk_32fc_x2_dot_prod_32fc_neon_optfma(lv_32fc_t* result, con
     for(number = 0; number < quarter_points; ++number) {
         a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
         b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
-        __builtin_prefetch(a_ptr+8);
-        __builtin_prefetch(b_ptr+8);
+        __VOLK_PREFETCH(a_ptr+8);
+        __VOLK_PREFETCH(b_ptr+8);
 
         // use 2 accumulators to remove inter-instruction data dependencies
         accumulator1.val[0] = vmlaq_f32(accumulator1.val[0], a_val.val[0], b_val.val[0]);
@@ -1050,8 +1050,8 @@ static inline void volk_32fc_x2_dot_prod_32fc_neon_optfmaunroll(lv_32fc_t* resul
     for(number = 0; number < quarter_points; ++number) {
         a_val = vld4q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
         b_val = vld4q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
-        __builtin_prefetch(a_ptr+8);
-        __builtin_prefetch(b_ptr+8);
+        __VOLK_PREFETCH(a_ptr+8);
+        __VOLK_PREFETCH(b_ptr+8);
 
         // use 2 accumulators to remove inter-instruction data dependencies
         accumulator1.val[0] = vmlaq_f32(accumulator1.val[0], a_val.val[0], b_val.val[0]);
index 17091406e188b2ac85327fada90ed02a7b6af00e..0b9d3fec6b9921c46249ef418c8ded435b0c477a 100644 (file)
@@ -372,8 +372,8 @@ volk_32fc_x2_multiply_32fc_neon(lv_32fc_t* cVector, const lv_32fc_t* aVector,
   for(number = 0; number < quarter_points; ++number) {
     a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
     b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
-    __builtin_prefetch(a_ptr+4);
-    __builtin_prefetch(b_ptr+4);
+    __VOLK_PREFETCH(a_ptr+4);
+    __VOLK_PREFETCH(b_ptr+4);
 
     // multiply the real*real and imag*imag to get real result
     // a0r*b0r|a1r*b1r|a2r*b2r|a3r*b3r
@@ -420,8 +420,8 @@ volk_32fc_x2_multiply_32fc_neon_opttests(lv_32fc_t* cVector, const lv_32fc_t* aV
   for(number = 0; number < quarter_points; ++number) {
     a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
     b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
-    __builtin_prefetch(a_ptr+4);
-    __builtin_prefetch(b_ptr+4);
+    __VOLK_PREFETCH(a_ptr+4);
+    __VOLK_PREFETCH(b_ptr+4);
 
     // do the first multiply
     tmp_imag.val[1] = vmulq_f32(a_val.val[1], b_val.val[0]);
index 703c78d27396c96080a66a1e193a6aa50dc58caa..c13a32e6e3ba662ffbacbecf94431c16bdfed8d7 100644 (file)
@@ -262,8 +262,8 @@ volk_32fc_x2_multiply_conjugate_32fc_neon(lv_32fc_t* cVector, const lv_32fc_t* a
     a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
     b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
     b_val.val[1] = vnegq_f32(b_val.val[1]);
-    __builtin_prefetch(a_ptr+4);
-    __builtin_prefetch(b_ptr+4);
+    __VOLK_PREFETCH(a_ptr+4);
+    __VOLK_PREFETCH(b_ptr+4);
 
     // multiply the real*real and imag*imag to get real result
     // a0r*b0r|a1r*b1r|a2r*b2r|a3r*b3r