From e1bb840dde2fe64abd0ab370ad05d126411d6a29 Mon Sep 17 00:00:00 2001
From: Josh Blum <josh@joshknows.com>
Date: Fri, 20 Jan 2017 10:03:49 -0800
Subject: [PATCH] added __VOLK_PREFETCH() compatibility macro

__VOLK_PREFETCH() performs __builtin_prefetch() on GCC compilers
and is otherwise a NOP for other systems. The use of __builtin_prefetch
was replaced with __VOLK_PREFETCH() to make the kernels portable.

Gbp-Pq: Name 0007-added-__VOLK_PREFETCH-compatibility-macro.patch
---
 include/volk/volk_common.h                    |  3 ++
 kernels/volk/volk_16i_max_star_16i.h          |  2 +-
 .../volk/volk_16i_max_star_horizontal_16i.h   |  2 +-
 kernels/volk/volk_16ic_convert_32fc.h         |  2 +-
 kernels/volk/volk_16ic_x2_dot_prod_16ic.h     | 28 +++++++++----------
 kernels/volk/volk_16ic_x2_multiply_16ic.h     |  4 +--
 kernels/volk/volk_32f_x2_add_32f.h            |  4 +--
 kernels/volk/volk_32fc_conjugate_32fc.h       |  2 +-
 kernels/volk/volk_32fc_convert_16ic.h         |  6 ++--
 .../volk_32fc_x2_conjugate_dot_prod_32fc.h    |  4 +--
 kernels/volk/volk_32fc_x2_dot_prod_32fc.h     | 16 +++++------
 kernels/volk/volk_32fc_x2_multiply_32fc.h     |  8 +++---
 .../volk_32fc_x2_multiply_conjugate_32fc.h    |  4 +--
 13 files changed, 44 insertions(+), 41 deletions(-)

diff --git a/include/volk/volk_common.h b/include/volk/volk_common.h
index 4d35f5c..a53b139 100644
--- a/include/volk/volk_common.h
+++ b/include/volk/volk_common.h
@@ -16,6 +16,7 @@
 #    define __VOLK_ATTR_EXPORT
 #    define __VOLK_ATTR_IMPORT
 #  endif
+#  define __VOLK_PREFETCH(addr)  __builtin_prefetch(addr)
 #elif _MSC_VER
 #  define __VOLK_ATTR_ALIGNED(x) __declspec(align(x))
 #  define __VOLK_ATTR_UNUSED
@@ -23,6 +24,7 @@
 #  define __VOLK_ATTR_DEPRECATED __declspec(deprecated)
 #  define __VOLK_ATTR_EXPORT     __declspec(dllexport)
 #  define __VOLK_ATTR_IMPORT     __declspec(dllimport)
+#  define __VOLK_PREFETCH(addr)
 #else
 #  define __VOLK_ATTR_ALIGNED(x)
 #  define __VOLK_ATTR_UNUSED
@@ -30,6 +32,7 @@
 #  define __VOLK_ATTR_DEPRECATED
 #  define __VOLK_ATTR_EXPORT
 #  define __VOLK_ATTR_IMPORT
+#  define __VOLK_PREFETCH(addr)
 #endif
 
 ////////////////////////////////////////////////////////////////////////
diff --git a/kernels/volk/volk_16i_max_star_16i.h b/kernels/volk/volk_16i_max_star_16i.h
index e470642..531a8b5 100644
--- a/kernels/volk/volk_16i_max_star_16i.h
+++ b/kernels/volk/volk_16i_max_star_16i.h
@@ -139,7 +139,7 @@ volk_16i_max_star_16i_neon(short* target, short* src0, unsigned int num_points)
 
   for(number=0; number < eighth_points; ++number) {
     input_vec = vld1q_s16(src0);
-    __builtin_prefetch(src0+16);
+    __VOLK_PREFETCH(src0+16);
     diff = vsubq_s16(candidate_vec, input_vec);
     comp1 = vcgeq_s16(diff, zeros);
     comp2 = vcltq_s16(diff, zeros);
diff --git a/kernels/volk/volk_16i_max_star_horizontal_16i.h b/kernels/volk/volk_16i_max_star_horizontal_16i.h
index 1da8356..964587c 100644
--- a/kernels/volk/volk_16i_max_star_horizontal_16i.h
+++ b/kernels/volk/volk_16i_max_star_horizontal_16i.h
@@ -169,7 +169,7 @@ volk_16i_max_star_horizontal_16i_neon(int16_t* target, int16_t* src0, unsigned i
   zeros = veorq_s16(zeros, zeros);
   for(number=0; number < eighth_points; ++number) {
     input_vec = vld2q_s16(src0);
-    //__builtin_prefetch(src0+16);
+    //__VOLK_PREFETCH(src0+16);
     diff = vsubq_s16(input_vec.val[0], input_vec.val[1]);
     comp1 = vcgeq_s16(diff, zeros);
     comp2 = vcltq_s16(diff, zeros);
diff --git a/kernels/volk/volk_16ic_convert_32fc.h b/kernels/volk/volk_16ic_convert_32fc.h
index 88e079d..9779b0f 100644
--- a/kernels/volk/volk_16ic_convert_32fc.h
+++ b/kernels/volk/volk_16ic_convert_32fc.h
@@ -198,7 +198,7 @@ static inline void volk_16ic_convert_32fc_neon(lv_32fc_t* outputVector, const lv
     for(number = 0; number < sse_iters; number++)
         {
             a16x4 = vld1_s16((const int16_t*)_in);
-            __builtin_prefetch(_in + 4);
+            __VOLK_PREFETCH(_in + 4);
             a32x4 = vmovl_s16(a16x4);
             f32x4 = vcvtq_f32_s32(a32x4);
             vst1q_f32((float32_t*)_out, f32x4);
diff --git a/kernels/volk/volk_16ic_x2_dot_prod_16ic.h b/kernels/volk/volk_16ic_x2_dot_prod_16ic.h
index 9d4c882..8e6de4c 100644
--- a/kernels/volk/volk_16ic_x2_dot_prod_16ic.h
+++ b/kernels/volk/volk_16ic_x2_dot_prod_16ic.h
@@ -96,9 +96,9 @@ static inline void volk_16ic_x2_dot_prod_16ic_a_sse2(lv_16sc_t* out, const lv_16
                 {
                     // a[127:0]=[a3.i,a3.r,a2.i,a2.r,a1.i,a1.r,a0.i,a0.r]
                     a = _mm_load_si128((__m128i*)_in_a); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg
-                    __builtin_prefetch(_in_a + 8);
+                    __VOLK_PREFETCH(_in_a + 8);
                     b = _mm_load_si128((__m128i*)_in_b);
-                    __builtin_prefetch(_in_b + 8);
+                    __VOLK_PREFETCH(_in_b + 8);
                     c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, ....
 
                     c_sr = _mm_srli_si128(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst.
@@ -173,9 +173,9 @@ static inline void volk_16ic_x2_dot_prod_16ic_u_sse2(lv_16sc_t* out, const lv_16
                 {
                     // a[127:0]=[a3.i,a3.r,a2.i,a2.r,a1.i,a1.r,a0.i,a0.r]
                     a = _mm_loadu_si128((__m128i*)_in_a); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg
-                    __builtin_prefetch(_in_a + 8);
+                    __VOLK_PREFETCH(_in_a + 8);
                     b = _mm_loadu_si128((__m128i*)_in_b);
-                    __builtin_prefetch(_in_b + 8);
+                    __VOLK_PREFETCH(_in_b + 8);
                     c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, ....
 
                     c_sr = _mm_srli_si128(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst.
@@ -248,9 +248,9 @@ static inline void volk_16ic_x2_dot_prod_16ic_u_axv2(lv_16sc_t* out, const lv_16
             for(number = 0; number < avx_iters; number++)
                 {
                     a = _mm256_loadu_si256((__m256i*)_in_a);
-                    __builtin_prefetch(_in_a + 16);
+                    __VOLK_PREFETCH(_in_a + 16);
                     b = _mm256_loadu_si256((__m256i*)_in_b);
-                    __builtin_prefetch(_in_b + 16);
+                    __VOLK_PREFETCH(_in_b + 16);
                     c = _mm256_mullo_epi16(a, b);
 
                     c_sr = _mm256_srli_si256(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst.
@@ -324,9 +324,9 @@ static inline void volk_16ic_x2_dot_prod_16ic_a_axv2(lv_16sc_t* out, const lv_16
             for(number = 0; number < avx_iters; number++)
                 {
                     a = _mm256_load_si256((__m256i*)_in_a);
-                    __builtin_prefetch(_in_a + 16);
+                    __VOLK_PREFETCH(_in_a + 16);
                     b = _mm256_load_si256((__m256i*)_in_b);
-                    __builtin_prefetch(_in_b + 16);
+                    __VOLK_PREFETCH(_in_b + 16);
                     c = _mm256_mullo_epi16(a, b);
 
                     c_sr = _mm256_srli_si256(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst.
@@ -399,8 +399,8 @@ static inline void volk_16ic_x2_dot_prod_16ic_neon(lv_16sc_t* out, const lv_16sc
                 {
                     a_val = vld2_s16((int16_t*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
                     b_val = vld2_s16((int16_t*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
-                    __builtin_prefetch(a_ptr + 8);
-                    __builtin_prefetch(b_ptr + 8);
+                    __VOLK_PREFETCH(a_ptr + 8);
+                    __VOLK_PREFETCH(b_ptr + 8);
 
                     // multiply the real*real and imag*imag to get real result
                     // a0r*b0r|a1r*b1r|a2r*b2r|a3r*b3r
@@ -465,8 +465,8 @@ static inline void volk_16ic_x2_dot_prod_16ic_neon_vma(lv_16sc_t* out, const lv_
         {
             a_val = vld2_s16((int16_t*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
             b_val = vld2_s16((int16_t*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
-            __builtin_prefetch(a_ptr + 8);
-            __builtin_prefetch(b_ptr + 8);
+            __VOLK_PREFETCH(a_ptr + 8);
+            __VOLK_PREFETCH(b_ptr + 8);
 
             tmp.val[0] = vmul_s16(a_val.val[0], b_val.val[0]);
             tmp.val[1] = vmul_s16(a_val.val[1], b_val.val[0]);
@@ -519,8 +519,8 @@ static inline void volk_16ic_x2_dot_prod_16ic_neon_optvma(lv_16sc_t* out, const
         {
             a_val = vld2_s16((int16_t*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
             b_val = vld2_s16((int16_t*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
-            __builtin_prefetch(a_ptr + 8);
-            __builtin_prefetch(b_ptr + 8);
+            __VOLK_PREFETCH(a_ptr + 8);
+            __VOLK_PREFETCH(b_ptr + 8);
 
             // use 2 accumulators to remove inter-instruction data dependencies
             accumulator1.val[0] = vmla_s16(accumulator1.val[0], a_val.val[0], b_val.val[0]);
diff --git a/kernels/volk/volk_16ic_x2_multiply_16ic.h b/kernels/volk/volk_16ic_x2_multiply_16ic.h
index 17033ae..9dcf06f 100644
--- a/kernels/volk/volk_16ic_x2_multiply_16ic.h
+++ b/kernels/volk/volk_16ic_x2_multiply_16ic.h
@@ -291,8 +291,8 @@ static inline void volk_16ic_x2_multiply_16ic_neon(lv_16sc_t* out, const lv_16sc
         {
             a_val = vld2_s16((int16_t*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
             b_val = vld2_s16((int16_t*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
-            __builtin_prefetch(a_ptr + 4);
-            __builtin_prefetch(b_ptr + 4);
+            __VOLK_PREFETCH(a_ptr + 4);
+            __VOLK_PREFETCH(b_ptr + 4);
 
             // multiply the real*real and imag*imag to get real result
             // a0r*b0r|a1r*b1r|a2r*b2r|a3r*b3r
diff --git a/kernels/volk/volk_32f_x2_add_32f.h b/kernels/volk/volk_32f_x2_add_32f.h
index fc9cf5b..28cf73d 100644
--- a/kernels/volk/volk_32f_x2_add_32f.h
+++ b/kernels/volk/volk_32f_x2_add_32f.h
@@ -191,8 +191,8 @@ volk_32f_x2_add_32f_u_neon(float* cVector, const float* aVector,
     // Load in to NEON registers
     aVal = vld1q_f32(aPtr);
     bVal = vld1q_f32(bPtr);
-    __builtin_prefetch(aPtr+4);
-    __builtin_prefetch(bPtr+4);
+    __VOLK_PREFETCH(aPtr+4);
+    __VOLK_PREFETCH(bPtr+4);
 
     // vector add
     cVal = vaddq_f32(aVal, bVal);
diff --git a/kernels/volk/volk_32fc_conjugate_32fc.h b/kernels/volk/volk_32fc_conjugate_32fc.h
index 1fdb6c2..6994d0e 100644
--- a/kernels/volk/volk_32fc_conjugate_32fc.h
+++ b/kernels/volk/volk_32fc_conjugate_32fc.h
@@ -248,7 +248,7 @@ volk_32fc_conjugate_32fc_a_neon(lv_32fc_t* cVector, const lv_32fc_t* aVector, un
   const lv_32fc_t* a = aVector;
 
   for(number=0; number < quarterPoints; number++){
-    __builtin_prefetch(a+4);
+    __VOLK_PREFETCH(a+4);
     x = vld2q_f32((float*)a); // Load the complex data as ar,br,cr,dr; ai,bi,ci,di
 
     // xor the imaginary lane
diff --git a/kernels/volk/volk_32fc_convert_16ic.h b/kernels/volk/volk_32fc_convert_16ic.h
index 4f6e6a5..307ab36 100644
--- a/kernels/volk/volk_32fc_convert_16ic.h
+++ b/kernels/volk/volk_32fc_convert_16ic.h
@@ -75,7 +75,7 @@ static inline void volk_32fc_convert_16ic_u_sse2(lv_16sc_t* outputVector, const
         {
             inputVal1 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4;
             inputVal2 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4;
-            __builtin_prefetch(inputVectorPtr + 8);
+            __VOLK_PREFETCH(inputVectorPtr + 8);
 
             // Clip
             ret1 = _mm_max_ps(_mm_min_ps(inputVal1, vmax_val), vmin_val);
@@ -128,7 +128,7 @@ static inline void volk_32fc_convert_16ic_a_sse2(lv_16sc_t* outputVector, const
         {
             inputVal1 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4;
             inputVal2 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4;
-            __builtin_prefetch(inputVectorPtr + 8);
+            __VOLK_PREFETCH(inputVectorPtr + 8);
 
             // Clip
             ret1 = _mm_max_ps(_mm_min_ps(inputVal1, vmax_val), vmin_val);
@@ -184,7 +184,7 @@ static inline void volk_32fc_convert_16ic_neon(lv_16sc_t* outputVector, const lv
         {
             a = vld1q_f32((const float32_t*)(inputVectorPtr)); inputVectorPtr += 4;
             b = vld1q_f32((const float32_t*)(inputVectorPtr)); inputVectorPtr += 4;
-            __builtin_prefetch(inputVectorPtr + 8);
+            __VOLK_PREFETCH(inputVectorPtr + 8);
 
             ret1 = vmaxq_f32(vminq_f32(a, max_val), min_val);
             ret2 = vmaxq_f32(vminq_f32(b, max_val), min_val);
diff --git a/kernels/volk/volk_32fc_x2_conjugate_dot_prod_32fc.h b/kernels/volk/volk_32fc_x2_conjugate_dot_prod_32fc.h
index 981899c..4addf80 100644
--- a/kernels/volk/volk_32fc_x2_conjugate_dot_prod_32fc.h
+++ b/kernels/volk/volk_32fc_x2_conjugate_dot_prod_32fc.h
@@ -219,8 +219,8 @@ static inline void volk_32fc_x2_conjugate_dot_prod_32fc_neon(lv_32fc_t* result,
     for(number = 0; number < quarter_points; ++number) {
         a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
         b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
-        __builtin_prefetch(a_ptr+8);
-        __builtin_prefetch(b_ptr+8);
+        __VOLK_PREFETCH(a_ptr+8);
+        __VOLK_PREFETCH(b_ptr+8);
 
         // do the first multiply
         tmp_imag.val[1] = vmulq_f32(a_val.val[1], b_val.val[0]);
diff --git a/kernels/volk/volk_32fc_x2_dot_prod_32fc.h b/kernels/volk/volk_32fc_x2_dot_prod_32fc.h
index 39d0c78..0c3271c 100644
--- a/kernels/volk/volk_32fc_x2_dot_prod_32fc.h
+++ b/kernels/volk/volk_32fc_x2_dot_prod_32fc.h
@@ -894,8 +894,8 @@ static inline void volk_32fc_x2_dot_prod_32fc_neon(lv_32fc_t* result, const lv_3
     for(number = 0; number < quarter_points; ++number) {
         a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
         b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
-        __builtin_prefetch(a_ptr+8);
-        __builtin_prefetch(b_ptr+8);
+        __VOLK_PREFETCH(a_ptr+8);
+        __VOLK_PREFETCH(b_ptr+8);
 
         // multiply the real*real and imag*imag to get real result
         // a0r*b0r|a1r*b1r|a2r*b2r|a3r*b3r
@@ -949,8 +949,8 @@ static inline void volk_32fc_x2_dot_prod_32fc_neon_opttests(lv_32fc_t* result, c
     for(number = 0; number < quarter_points; ++number) {
         a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
         b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
-        __builtin_prefetch(a_ptr+8);
-        __builtin_prefetch(b_ptr+8);
+        __VOLK_PREFETCH(a_ptr+8);
+        __VOLK_PREFETCH(b_ptr+8);
 
         // do the first multiply
         tmp_imag.val[1] = vmulq_f32(a_val.val[1], b_val.val[0]);
@@ -998,8 +998,8 @@ static inline void volk_32fc_x2_dot_prod_32fc_neon_optfma(lv_32fc_t* result, con
     for(number = 0; number < quarter_points; ++number) {
         a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
         b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
-        __builtin_prefetch(a_ptr+8);
-        __builtin_prefetch(b_ptr+8);
+        __VOLK_PREFETCH(a_ptr+8);
+        __VOLK_PREFETCH(b_ptr+8);
 
         // use 2 accumulators to remove inter-instruction data dependencies
         accumulator1.val[0] = vmlaq_f32(accumulator1.val[0], a_val.val[0], b_val.val[0]);
@@ -1050,8 +1050,8 @@ static inline void volk_32fc_x2_dot_prod_32fc_neon_optfmaunroll(lv_32fc_t* resul
     for(number = 0; number < quarter_points; ++number) {
         a_val = vld4q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
         b_val = vld4q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
-        __builtin_prefetch(a_ptr+8);
-        __builtin_prefetch(b_ptr+8);
+        __VOLK_PREFETCH(a_ptr+8);
+        __VOLK_PREFETCH(b_ptr+8);
 
         // use 2 accumulators to remove inter-instruction data dependencies
         accumulator1.val[0] = vmlaq_f32(accumulator1.val[0], a_val.val[0], b_val.val[0]);
diff --git a/kernels/volk/volk_32fc_x2_multiply_32fc.h b/kernels/volk/volk_32fc_x2_multiply_32fc.h
index 1709140..0b9d3fe 100644
--- a/kernels/volk/volk_32fc_x2_multiply_32fc.h
+++ b/kernels/volk/volk_32fc_x2_multiply_32fc.h
@@ -372,8 +372,8 @@ volk_32fc_x2_multiply_32fc_neon(lv_32fc_t* cVector, const lv_32fc_t* aVector,
   for(number = 0; number < quarter_points; ++number) {
     a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
     b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
-    __builtin_prefetch(a_ptr+4);
-    __builtin_prefetch(b_ptr+4);
+    __VOLK_PREFETCH(a_ptr+4);
+    __VOLK_PREFETCH(b_ptr+4);
 
     // multiply the real*real and imag*imag to get real result
     // a0r*b0r|a1r*b1r|a2r*b2r|a3r*b3r
@@ -420,8 +420,8 @@ volk_32fc_x2_multiply_32fc_neon_opttests(lv_32fc_t* cVector, const lv_32fc_t* aV
   for(number = 0; number < quarter_points; ++number) {
     a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
     b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
-    __builtin_prefetch(a_ptr+4);
-    __builtin_prefetch(b_ptr+4);
+    __VOLK_PREFETCH(a_ptr+4);
+    __VOLK_PREFETCH(b_ptr+4);
 
     // do the first multiply
     tmp_imag.val[1] = vmulq_f32(a_val.val[1], b_val.val[0]);
diff --git a/kernels/volk/volk_32fc_x2_multiply_conjugate_32fc.h b/kernels/volk/volk_32fc_x2_multiply_conjugate_32fc.h
index 703c78d..c13a32e 100644
--- a/kernels/volk/volk_32fc_x2_multiply_conjugate_32fc.h
+++ b/kernels/volk/volk_32fc_x2_multiply_conjugate_32fc.h
@@ -262,8 +262,8 @@ volk_32fc_x2_multiply_conjugate_32fc_neon(lv_32fc_t* cVector, const lv_32fc_t* a
     a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
     b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
     b_val.val[1] = vnegq_f32(b_val.val[1]);
-    __builtin_prefetch(a_ptr+4);
-    __builtin_prefetch(b_ptr+4);
+    __VOLK_PREFETCH(a_ptr+4);
+    __VOLK_PREFETCH(b_ptr+4);
 
     // multiply the real*real and imag*imag to get real result
     // a0r*b0r|a1r*b1r|a2r*b2r|a3r*b3r
-- 
2.30.2