From e1bb840dde2fe64abd0ab370ad05d126411d6a29 Mon Sep 17 00:00:00 2001 From: Josh Blum Date: Fri, 20 Jan 2017 10:03:49 -0800 Subject: [PATCH] added __VOLK_PREFETCH() compatibility macro __VOLK_PREFETCH() performs __builtin_prefetch() on GCC compilers and is otherwise a NOP for other systems. The use of __builtin_prefetch was replaced with __VOLK_PREFETCH() to make the kernels portable. Gbp-Pq: Name 0007-added-__VOLK_PREFETCH-compatibility-macro.patch --- include/volk/volk_common.h | 3 ++ kernels/volk/volk_16i_max_star_16i.h | 2 +- .../volk/volk_16i_max_star_horizontal_16i.h | 2 +- kernels/volk/volk_16ic_convert_32fc.h | 2 +- kernels/volk/volk_16ic_x2_dot_prod_16ic.h | 28 +++++++++---------- kernels/volk/volk_16ic_x2_multiply_16ic.h | 4 +-- kernels/volk/volk_32f_x2_add_32f.h | 4 +-- kernels/volk/volk_32fc_conjugate_32fc.h | 2 +- kernels/volk/volk_32fc_convert_16ic.h | 6 ++-- .../volk_32fc_x2_conjugate_dot_prod_32fc.h | 4 +-- kernels/volk/volk_32fc_x2_dot_prod_32fc.h | 16 +++++------ kernels/volk/volk_32fc_x2_multiply_32fc.h | 8 +++--- .../volk_32fc_x2_multiply_conjugate_32fc.h | 4 +-- 13 files changed, 44 insertions(+), 41 deletions(-) diff --git a/include/volk/volk_common.h b/include/volk/volk_common.h index 4d35f5c..a53b139 100644 --- a/include/volk/volk_common.h +++ b/include/volk/volk_common.h @@ -16,6 +16,7 @@ # define __VOLK_ATTR_EXPORT # define __VOLK_ATTR_IMPORT # endif +# define __VOLK_PREFETCH(addr) __builtin_prefetch(addr) #elif _MSC_VER # define __VOLK_ATTR_ALIGNED(x) __declspec(align(x)) # define __VOLK_ATTR_UNUSED @@ -23,6 +24,7 @@ # define __VOLK_ATTR_DEPRECATED __declspec(deprecated) # define __VOLK_ATTR_EXPORT __declspec(dllexport) # define __VOLK_ATTR_IMPORT __declspec(dllimport) +# define __VOLK_PREFETCH(addr) #else # define __VOLK_ATTR_ALIGNED(x) # define __VOLK_ATTR_UNUSED @@ -30,6 +32,7 @@ # define __VOLK_ATTR_DEPRECATED # define __VOLK_ATTR_EXPORT # define __VOLK_ATTR_IMPORT +# define __VOLK_PREFETCH(addr) #endif //////////////////////////////////////////////////////////////////////// diff --git a/kernels/volk/volk_16i_max_star_16i.h b/kernels/volk/volk_16i_max_star_16i.h index e470642..531a8b5 100644 --- a/kernels/volk/volk_16i_max_star_16i.h +++ b/kernels/volk/volk_16i_max_star_16i.h @@ -139,7 +139,7 @@ volk_16i_max_star_16i_neon(short* target, short* src0, unsigned int num_points) for(number=0; number < eighth_points; ++number) { input_vec = vld1q_s16(src0); - __builtin_prefetch(src0+16); + __VOLK_PREFETCH(src0+16); diff = vsubq_s16(candidate_vec, input_vec); comp1 = vcgeq_s16(diff, zeros); comp2 = vcltq_s16(diff, zeros); diff --git a/kernels/volk/volk_16i_max_star_horizontal_16i.h b/kernels/volk/volk_16i_max_star_horizontal_16i.h index 1da8356..964587c 100644 --- a/kernels/volk/volk_16i_max_star_horizontal_16i.h +++ b/kernels/volk/volk_16i_max_star_horizontal_16i.h @@ -169,7 +169,7 @@ volk_16i_max_star_horizontal_16i_neon(int16_t* target, int16_t* src0, unsigned i zeros = veorq_s16(zeros, zeros); for(number=0; number < eighth_points; ++number) { input_vec = vld2q_s16(src0); - //__builtin_prefetch(src0+16); + //__VOLK_PREFETCH(src0+16); diff = vsubq_s16(input_vec.val[0], input_vec.val[1]); comp1 = vcgeq_s16(diff, zeros); comp2 = vcltq_s16(diff, zeros); diff --git a/kernels/volk/volk_16ic_convert_32fc.h b/kernels/volk/volk_16ic_convert_32fc.h index 88e079d..9779b0f 100644 --- a/kernels/volk/volk_16ic_convert_32fc.h +++ b/kernels/volk/volk_16ic_convert_32fc.h @@ -198,7 +198,7 @@ static inline void volk_16ic_convert_32fc_neon(lv_32fc_t* outputVector, const lv for(number = 0; number < sse_iters; number++) { a16x4 = vld1_s16((const int16_t*)_in); - __builtin_prefetch(_in + 4); + __VOLK_PREFETCH(_in + 4); a32x4 = vmovl_s16(a16x4); f32x4 = vcvtq_f32_s32(a32x4); vst1q_f32((float32_t*)_out, f32x4); diff --git a/kernels/volk/volk_16ic_x2_dot_prod_16ic.h b/kernels/volk/volk_16ic_x2_dot_prod_16ic.h index 9d4c882..8e6de4c 100644 --- a/kernels/volk/volk_16ic_x2_dot_prod_16ic.h +++ b/kernels/volk/volk_16ic_x2_dot_prod_16ic.h @@ -96,9 +96,9 @@ static inline void volk_16ic_x2_dot_prod_16ic_a_sse2(lv_16sc_t* out, const lv_16 { // a[127:0]=[a3.i,a3.r,a2.i,a2.r,a1.i,a1.r,a0.i,a0.r] a = _mm_load_si128((__m128i*)_in_a); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg - __builtin_prefetch(_in_a + 8); + __VOLK_PREFETCH(_in_a + 8); b = _mm_load_si128((__m128i*)_in_b); - __builtin_prefetch(_in_b + 8); + __VOLK_PREFETCH(_in_b + 8); c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, .... c_sr = _mm_srli_si128(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst. @@ -173,9 +173,9 @@ static inline void volk_16ic_x2_dot_prod_16ic_u_sse2(lv_16sc_t* out, const lv_16 { // a[127:0]=[a3.i,a3.r,a2.i,a2.r,a1.i,a1.r,a0.i,a0.r] a = _mm_loadu_si128((__m128i*)_in_a); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg - __builtin_prefetch(_in_a + 8); + __VOLK_PREFETCH(_in_a + 8); b = _mm_loadu_si128((__m128i*)_in_b); - __builtin_prefetch(_in_b + 8); + __VOLK_PREFETCH(_in_b + 8); c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, .... c_sr = _mm_srli_si128(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst. @@ -248,9 +248,9 @@ static inline void volk_16ic_x2_dot_prod_16ic_u_axv2(lv_16sc_t* out, const lv_16 for(number = 0; number < avx_iters; number++) { a = _mm256_loadu_si256((__m256i*)_in_a); - __builtin_prefetch(_in_a + 16); + __VOLK_PREFETCH(_in_a + 16); b = _mm256_loadu_si256((__m256i*)_in_b); - __builtin_prefetch(_in_b + 16); + __VOLK_PREFETCH(_in_b + 16); c = _mm256_mullo_epi16(a, b); c_sr = _mm256_srli_si256(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst. @@ -324,9 +324,9 @@ static inline void volk_16ic_x2_dot_prod_16ic_a_axv2(lv_16sc_t* out, const lv_16 for(number = 0; number < avx_iters; number++) { a = _mm256_load_si256((__m256i*)_in_a); - __builtin_prefetch(_in_a + 16); + __VOLK_PREFETCH(_in_a + 16); b = _mm256_load_si256((__m256i*)_in_b); - __builtin_prefetch(_in_b + 16); + __VOLK_PREFETCH(_in_b + 16); c = _mm256_mullo_epi16(a, b); c_sr = _mm256_srli_si256(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst. @@ -399,8 +399,8 @@ static inline void volk_16ic_x2_dot_prod_16ic_neon(lv_16sc_t* out, const lv_16sc { a_val = vld2_s16((int16_t*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i b_val = vld2_s16((int16_t*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i - __builtin_prefetch(a_ptr + 8); - __builtin_prefetch(b_ptr + 8); + __VOLK_PREFETCH(a_ptr + 8); + __VOLK_PREFETCH(b_ptr + 8); // multiply the real*real and imag*imag to get real result // a0r*b0r|a1r*b1r|a2r*b2r|a3r*b3r @@ -465,8 +465,8 @@ static inline void volk_16ic_x2_dot_prod_16ic_neon_vma(lv_16sc_t* out, const lv_ { a_val = vld2_s16((int16_t*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i b_val = vld2_s16((int16_t*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i - __builtin_prefetch(a_ptr + 8); - __builtin_prefetch(b_ptr + 8); + __VOLK_PREFETCH(a_ptr + 8); + __VOLK_PREFETCH(b_ptr + 8); tmp.val[0] = vmul_s16(a_val.val[0], b_val.val[0]); tmp.val[1] = vmul_s16(a_val.val[1], b_val.val[0]); @@ -519,8 +519,8 @@ static inline void volk_16ic_x2_dot_prod_16ic_neon_optvma(lv_16sc_t* out, const { a_val = vld2_s16((int16_t*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i b_val = vld2_s16((int16_t*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i - __builtin_prefetch(a_ptr + 8); - __builtin_prefetch(b_ptr + 8); + __VOLK_PREFETCH(a_ptr + 8); + __VOLK_PREFETCH(b_ptr + 8); // use 2 accumulators to remove inter-instruction data dependencies accumulator1.val[0] = vmla_s16(accumulator1.val[0], a_val.val[0], b_val.val[0]); diff --git a/kernels/volk/volk_16ic_x2_multiply_16ic.h b/kernels/volk/volk_16ic_x2_multiply_16ic.h index 17033ae..9dcf06f 100644 --- a/kernels/volk/volk_16ic_x2_multiply_16ic.h +++ b/kernels/volk/volk_16ic_x2_multiply_16ic.h @@ -291,8 +291,8 @@ static inline void volk_16ic_x2_multiply_16ic_neon(lv_16sc_t* out, const lv_16sc { a_val = vld2_s16((int16_t*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i b_val = vld2_s16((int16_t*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i - __builtin_prefetch(a_ptr + 4); - __builtin_prefetch(b_ptr + 4); + __VOLK_PREFETCH(a_ptr + 4); + __VOLK_PREFETCH(b_ptr + 4); // multiply the real*real and imag*imag to get real result // a0r*b0r|a1r*b1r|a2r*b2r|a3r*b3r diff --git a/kernels/volk/volk_32f_x2_add_32f.h b/kernels/volk/volk_32f_x2_add_32f.h index fc9cf5b..28cf73d 100644 --- a/kernels/volk/volk_32f_x2_add_32f.h +++ b/kernels/volk/volk_32f_x2_add_32f.h @@ -191,8 +191,8 @@ volk_32f_x2_add_32f_u_neon(float* cVector, const float* aVector, // Load in to NEON registers aVal = vld1q_f32(aPtr); bVal = vld1q_f32(bPtr); - __builtin_prefetch(aPtr+4); - __builtin_prefetch(bPtr+4); + __VOLK_PREFETCH(aPtr+4); + __VOLK_PREFETCH(bPtr+4); // vector add cVal = vaddq_f32(aVal, bVal); diff --git a/kernels/volk/volk_32fc_conjugate_32fc.h b/kernels/volk/volk_32fc_conjugate_32fc.h index 1fdb6c2..6994d0e 100644 --- a/kernels/volk/volk_32fc_conjugate_32fc.h +++ b/kernels/volk/volk_32fc_conjugate_32fc.h @@ -248,7 +248,7 @@ volk_32fc_conjugate_32fc_a_neon(lv_32fc_t* cVector, const lv_32fc_t* aVector, un const lv_32fc_t* a = aVector; for(number=0; number < quarterPoints; number++){ - __builtin_prefetch(a+4); + __VOLK_PREFETCH(a+4); x = vld2q_f32((float*)a); // Load the complex data as ar,br,cr,dr; ai,bi,ci,di // xor the imaginary lane diff --git a/kernels/volk/volk_32fc_convert_16ic.h b/kernels/volk/volk_32fc_convert_16ic.h index 4f6e6a5..307ab36 100644 --- a/kernels/volk/volk_32fc_convert_16ic.h +++ b/kernels/volk/volk_32fc_convert_16ic.h @@ -75,7 +75,7 @@ static inline void volk_32fc_convert_16ic_u_sse2(lv_16sc_t* outputVector, const { inputVal1 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4; inputVal2 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4; - __builtin_prefetch(inputVectorPtr + 8); + __VOLK_PREFETCH(inputVectorPtr + 8); // Clip ret1 = _mm_max_ps(_mm_min_ps(inputVal1, vmax_val), vmin_val); @@ -128,7 +128,7 @@ static inline void volk_32fc_convert_16ic_a_sse2(lv_16sc_t* outputVector, const { inputVal1 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4; inputVal2 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4; - __builtin_prefetch(inputVectorPtr + 8); + __VOLK_PREFETCH(inputVectorPtr + 8); // Clip ret1 = _mm_max_ps(_mm_min_ps(inputVal1, vmax_val), vmin_val); @@ -184,7 +184,7 @@ static inline void volk_32fc_convert_16ic_neon(lv_16sc_t* outputVector, const lv { a = vld1q_f32((const float32_t*)(inputVectorPtr)); inputVectorPtr += 4; b = vld1q_f32((const float32_t*)(inputVectorPtr)); inputVectorPtr += 4; - __builtin_prefetch(inputVectorPtr + 8); + __VOLK_PREFETCH(inputVectorPtr + 8); ret1 = vmaxq_f32(vminq_f32(a, max_val), min_val); ret2 = vmaxq_f32(vminq_f32(b, max_val), min_val); diff --git a/kernels/volk/volk_32fc_x2_conjugate_dot_prod_32fc.h b/kernels/volk/volk_32fc_x2_conjugate_dot_prod_32fc.h index 981899c..4addf80 100644 --- a/kernels/volk/volk_32fc_x2_conjugate_dot_prod_32fc.h +++ b/kernels/volk/volk_32fc_x2_conjugate_dot_prod_32fc.h @@ -219,8 +219,8 @@ static inline void volk_32fc_x2_conjugate_dot_prod_32fc_neon(lv_32fc_t* result, for(number = 0; number < quarter_points; ++number) { a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i - __builtin_prefetch(a_ptr+8); - __builtin_prefetch(b_ptr+8); + __VOLK_PREFETCH(a_ptr+8); + __VOLK_PREFETCH(b_ptr+8); // do the first multiply tmp_imag.val[1] = vmulq_f32(a_val.val[1], b_val.val[0]); diff --git a/kernels/volk/volk_32fc_x2_dot_prod_32fc.h b/kernels/volk/volk_32fc_x2_dot_prod_32fc.h index 39d0c78..0c3271c 100644 --- a/kernels/volk/volk_32fc_x2_dot_prod_32fc.h +++ b/kernels/volk/volk_32fc_x2_dot_prod_32fc.h @@ -894,8 +894,8 @@ static inline void volk_32fc_x2_dot_prod_32fc_neon(lv_32fc_t* result, const lv_3 for(number = 0; number < quarter_points; ++number) { a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i - __builtin_prefetch(a_ptr+8); - __builtin_prefetch(b_ptr+8); + __VOLK_PREFETCH(a_ptr+8); + __VOLK_PREFETCH(b_ptr+8); // multiply the real*real and imag*imag to get real result // a0r*b0r|a1r*b1r|a2r*b2r|a3r*b3r @@ -949,8 +949,8 @@ static inline void volk_32fc_x2_dot_prod_32fc_neon_opttests(lv_32fc_t* result, c for(number = 0; number < quarter_points; ++number) { a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i - __builtin_prefetch(a_ptr+8); - __builtin_prefetch(b_ptr+8); + __VOLK_PREFETCH(a_ptr+8); + __VOLK_PREFETCH(b_ptr+8); // do the first multiply tmp_imag.val[1] = vmulq_f32(a_val.val[1], b_val.val[0]); @@ -998,8 +998,8 @@ static inline void volk_32fc_x2_dot_prod_32fc_neon_optfma(lv_32fc_t* result, con for(number = 0; number < quarter_points; ++number) { a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i - __builtin_prefetch(a_ptr+8); - __builtin_prefetch(b_ptr+8); + __VOLK_PREFETCH(a_ptr+8); + __VOLK_PREFETCH(b_ptr+8); // use 2 accumulators to remove inter-instruction data dependencies accumulator1.val[0] = vmlaq_f32(accumulator1.val[0], a_val.val[0], b_val.val[0]); @@ -1050,8 +1050,8 @@ static inline void volk_32fc_x2_dot_prod_32fc_neon_optfmaunroll(lv_32fc_t* resul for(number = 0; number < quarter_points; ++number) { a_val = vld4q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i b_val = vld4q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i - __builtin_prefetch(a_ptr+8); - __builtin_prefetch(b_ptr+8); + __VOLK_PREFETCH(a_ptr+8); + __VOLK_PREFETCH(b_ptr+8); // use 2 accumulators to remove inter-instruction data dependencies accumulator1.val[0] = vmlaq_f32(accumulator1.val[0], a_val.val[0], b_val.val[0]); diff --git a/kernels/volk/volk_32fc_x2_multiply_32fc.h b/kernels/volk/volk_32fc_x2_multiply_32fc.h index 1709140..0b9d3fe 100644 --- a/kernels/volk/volk_32fc_x2_multiply_32fc.h +++ b/kernels/volk/volk_32fc_x2_multiply_32fc.h @@ -372,8 +372,8 @@ volk_32fc_x2_multiply_32fc_neon(lv_32fc_t* cVector, const lv_32fc_t* aVector, for(number = 0; number < quarter_points; ++number) { a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i - __builtin_prefetch(a_ptr+4); - __builtin_prefetch(b_ptr+4); + __VOLK_PREFETCH(a_ptr+4); + __VOLK_PREFETCH(b_ptr+4); // multiply the real*real and imag*imag to get real result // a0r*b0r|a1r*b1r|a2r*b2r|a3r*b3r @@ -420,8 +420,8 @@ volk_32fc_x2_multiply_32fc_neon_opttests(lv_32fc_t* cVector, const lv_32fc_t* aV for(number = 0; number < quarter_points; ++number) { a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i - __builtin_prefetch(a_ptr+4); - __builtin_prefetch(b_ptr+4); + __VOLK_PREFETCH(a_ptr+4); + __VOLK_PREFETCH(b_ptr+4); // do the first multiply tmp_imag.val[1] = vmulq_f32(a_val.val[1], b_val.val[0]); diff --git a/kernels/volk/volk_32fc_x2_multiply_conjugate_32fc.h b/kernels/volk/volk_32fc_x2_multiply_conjugate_32fc.h index 703c78d..c13a32e 100644 --- a/kernels/volk/volk_32fc_x2_multiply_conjugate_32fc.h +++ b/kernels/volk/volk_32fc_x2_multiply_conjugate_32fc.h @@ -262,8 +262,8 @@ volk_32fc_x2_multiply_conjugate_32fc_neon(lv_32fc_t* cVector, const lv_32fc_t* a a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i b_val.val[1] = vnegq_f32(b_val.val[1]); - __builtin_prefetch(a_ptr+4); - __builtin_prefetch(b_ptr+4); + __VOLK_PREFETCH(a_ptr+4); + __VOLK_PREFETCH(b_ptr+4); // multiply the real*real and imag*imag to get real result // a0r*b0r|a1r*b1r|a2r*b2r|a3r*b3r -- 2.30.2