# define __VOLK_ATTR_EXPORT
# define __VOLK_ATTR_IMPORT
# endif
+# define __VOLK_PREFETCH(addr) __builtin_prefetch(addr) // GCC/Clang: software prefetch hint
#elif _MSC_VER
# define __VOLK_ATTR_ALIGNED(x) __declspec(align(x))
# define __VOLK_ATTR_UNUSED
# define __VOLK_ATTR_DEPRECATED __declspec(deprecated)
# define __VOLK_ATTR_EXPORT __declspec(dllexport)
# define __VOLK_ATTR_IMPORT __declspec(dllimport)
+# define __VOLK_PREFETCH(addr) // MSVC has no __builtin_prefetch; expands to nothing
#else
# define __VOLK_ATTR_ALIGNED(x)
# define __VOLK_ATTR_UNUSED
# define __VOLK_ATTR_DEPRECATED
# define __VOLK_ATTR_EXPORT
# define __VOLK_ATTR_IMPORT
+# define __VOLK_PREFETCH(addr) // unknown compiler: no prefetch hint
#endif
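/* The hunk above introduces __VOLK_PREFETCH as a portable wrapper: on
   GCC/Clang it forwards to __builtin_prefetch, and on MSVC or unknown
   compilers it expands to nothing, so the call sites below compile
   everywhere without per-compiler #ifdefs. A minimal sketch of the
   pattern, not from the patch (add_f32 is a hypothetical kernel name): */

#include <stddef.h>

#if defined(__GNUC__)
# define __VOLK_PREFETCH(addr) __builtin_prefetch(addr)
#else
# define __VOLK_PREFETCH(addr) /* no-op where no builtin exists */
#endif

static void add_f32(float* out, const float* a, const float* b, size_t n)
{
    size_t i;
    for (i = 0; i < n; i++) {
        __VOLK_PREFETCH(a + i + 16); /* hint the cache line 64 bytes ahead */
        __VOLK_PREFETCH(b + i + 16);
        out[i] = a[i] + b[i];
    }
}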
////////////////////////////////////////////////////////////////////////
for(number=0; number < eighth_points; ++number) {
input_vec = vld1q_s16(src0);
- __builtin_prefetch(src0+16);
+ __VOLK_PREFETCH(src0+16);
diff = vsubq_s16(candidate_vec, input_vec);
comp1 = vcgeq_s16(diff, zeros);
comp2 = vcltq_s16(diff, zeros);
zeros = veorq_s16(zeros, zeros);
for(number=0; number < eighth_points; ++number) {
input_vec = vld2q_s16(src0);
- //__builtin_prefetch(src0+16);
+ //__VOLK_PREFETCH(src0+16);
diff = vsubq_s16(input_vec.val[0], input_vec.val[1]);
comp1 = vcgeq_s16(diff, zeros);
comp2 = vcltq_s16(diff, zeros);
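/* In the two hunks above, the complementary masks from vcgeq_s16 and
   vcltq_s16 drive a branch-free select. An illustrative scalar analogue,
   not from the patch (select_max is a hypothetical name): */

#include <stdint.h>

static int16_t select_max(int16_t a, int16_t b)
{
    int16_t mask = (int16_t)-(a >= b);          /* all ones if a >= b, else 0 */
    return (int16_t)((a & mask) | (b & ~mask)); /* picks a or b without a branch */
}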
for(number = 0; number < sse_iters; number++)
{
a16x4 = vld1_s16((const int16_t*)_in);
- __builtin_prefetch(_in + 4);
+ __VOLK_PREFETCH(_in + 4);
a32x4 = vmovl_s16(a16x4);
f32x4 = vcvtq_f32_s32(a32x4);
vst1q_f32((float32_t*)_out, f32x4);
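/* Note that the prefetch distance is in elements of the pointed-to type:
   _in + 4 above addresses the element four positions ahead, and the
   builtin fetches the cache line containing it. GCC/Clang's builtin also
   accepts optional read/write and temporal-locality hints; a hypothetical
   variant macro, not part of this patch, could expose them: */

#if defined(__GNUC__)
# define __VOLK_PREFETCH_LOCALITY(addr, rw, locality) \
      __builtin_prefetch((addr), (rw), (locality))
#else
# define __VOLK_PREFETCH_LOCALITY(addr, rw, locality)
#endif

/* e.g. a read-only stream whose data is not reused after this pass: */
/* __VOLK_PREFETCH_LOCALITY(_in + 4, 0, 0); */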
{
// a[127:0]=[a3.i,a3.r,a2.i,a2.r,a1.i,a1.r,a0.i,a0.r]
a = _mm_load_si128((__m128i*)_in_a); // load (2-byte imag, 2-byte real) x 4 into a 128-bit register
- __builtin_prefetch(_in_a + 8);
+ __VOLK_PREFETCH(_in_a + 8);
b = _mm_load_si128((__m128i*)_in_b);
- __builtin_prefetch(_in_b + 8);
+ __VOLK_PREFETCH(_in_b + 8);
c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, ....
c_sr = _mm_srli_si128(c, 2); // shift c right by 2 bytes, shifting in zeros, so each imag*imag product lines up with its real*real product
{
// a[127:0]=[a3.i,a3.r,a2.i,a2.r,a1.i,a1.r,a0.i,a0.r]
a = _mm_loadu_si128((__m128i*)_in_a); // load (2-byte imag, 2-byte real) x 4 into a 128-bit register
- __builtin_prefetch(_in_a + 8);
+ __VOLK_PREFETCH(_in_a + 8);
b = _mm_loadu_si128((__m128i*)_in_b);
- __builtin_prefetch(_in_b + 8);
+ __VOLK_PREFETCH(_in_b + 8);
c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, ....
c_sr = _mm_srli_si128(c, 2); // shift c right by 2 bytes, shifting in zeros, so each imag*imag product lines up with its real*real product
for(number = 0; number < avx_iters; number++)
{
a = _mm256_loadu_si256((__m256i*)_in_a);
- __builtin_prefetch(_in_a + 16);
+ __VOLK_PREFETCH(_in_a + 16);
b = _mm256_loadu_si256((__m256i*)_in_b);
- __builtin_prefetch(_in_b + 16);
+ __VOLK_PREFETCH(_in_b + 16);
c = _mm256_mullo_epi16(a, b);
c_sr = _mm256_srli_si256(c, 2); // shift each 128-bit lane of c right by 2 bytes, shifting in zeros
for(number = 0; number < avx_iters; number++)
{
a = _mm256_load_si256((__m256i*)_in_a);
- __builtin_prefetch(_in_a + 16);
+ __VOLK_PREFETCH(_in_a + 16);
b = _mm256_load_si256((__m256i*)_in_b);
- __builtin_prefetch(_in_b + 16);
+ __VOLK_PREFETCH(_in_b + 16);
c = _mm256_mullo_epi16(a, b);
c_sr = _mm256_srli_si256(c, 2); // shift each 128-bit lane of c right by 2 bytes, shifting in zeros
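/* Scalar view of what the four mullo/srli hunks above compute, for
   reference only (cplx16 and cmul16 are hypothetical names): each partial
   product keeps only its low 16 bits, matching _mm_mullo_epi16 and
   _mm256_mullo_epi16, and the 2-byte shift aligns the imag*imag lanes
   with the real*real lanes before they are combined. */

#include <stdint.h>

typedef struct { int16_t r, i; } cplx16;

static cplx16 cmul16(cplx16 a, cplx16 b)
{
    cplx16 c;
    c.r = (int16_t)(a.r * b.r - a.i * b.i); /* wraps to 16 bits like mullo */
    c.i = (int16_t)(a.r * b.i + a.i * b.r);
    return c;
}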
{
a_val = vld2_s16((int16_t*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
b_val = vld2_s16((int16_t*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
- __builtin_prefetch(a_ptr + 8);
- __builtin_prefetch(b_ptr + 8);
+ __VOLK_PREFETCH(a_ptr + 8);
+ __VOLK_PREFETCH(b_ptr + 8);
// multiply the real*real and imag*imag to get real result
// a0r*b0r|a1r*b1r|a2r*b2r|a3r*b3r
{
a_val = vld2_s16((int16_t*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
b_val = vld2_s16((int16_t*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
- __builtin_prefetch(a_ptr + 8);
- __builtin_prefetch(b_ptr + 8);
+ __VOLK_PREFETCH(a_ptr + 8);
+ __VOLK_PREFETCH(b_ptr + 8);
tmp.val[0] = vmul_s16(a_val.val[0], b_val.val[0]);
tmp.val[1] = vmul_s16(a_val.val[1], b_val.val[0]);
{
a_val = vld2_s16((int16_t*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
b_val = vld2_s16((int16_t*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
- __builtin_prefetch(a_ptr + 8);
- __builtin_prefetch(b_ptr + 8);
+ __VOLK_PREFETCH(a_ptr + 8);
+ __VOLK_PREFETCH(b_ptr + 8);
// use 2 accumulators to remove inter-instruction data dependencies
accumulator1.val[0] = vmla_s16(accumulator1.val[0], a_val.val[0], b_val.val[0]);
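/* The comment above states the technique: two independent accumulators
   break the loop-carried dependency, so one multiply-accumulate can issue
   while the other waits out its latency. An illustrative scalar analogue,
   not from the patch (dot_two_acc is a hypothetical name): */

#include <stddef.h>

static float dot_two_acc(const float* a, const float* b, size_t n)
{
    float acc0 = 0.0f, acc1 = 0.0f;
    size_t i;
    for (i = 0; i + 2 <= n; i += 2) {
        acc0 += a[i] * b[i];         /* dependency chain 1 */
        acc1 += a[i + 1] * b[i + 1]; /* dependency chain 2 */
    }
    if (i < n)
        acc0 += a[i] * b[i];         /* odd tail element */
    return acc0 + acc1;              /* fold the partial sums */
}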
{
a_val = vld2_s16((int16_t*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
b_val = vld2_s16((int16_t*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
- __builtin_prefetch(a_ptr + 4);
- __builtin_prefetch(b_ptr + 4);
+ __VOLK_PREFETCH(a_ptr + 4);
+ __VOLK_PREFETCH(b_ptr + 4);
// multiply the real*real and imag*imag to get real result
// a0r*b0r|a1r*b1r|a2r*b2r|a3r*b3r
// Load into NEON registers
aVal = vld1q_f32(aPtr);
bVal = vld1q_f32(bPtr);
- __builtin_prefetch(aPtr+4);
- __builtin_prefetch(bPtr+4);
+ __VOLK_PREFETCH(aPtr+4);
+ __VOLK_PREFETCH(bPtr+4);
// vector add
cVal = vaddq_f32(aVal, bVal);
const lv_32fc_t* a = aVector;
for(number=0; number < quarterPoints; number++){
- __builtin_prefetch(a+4);
+ __VOLK_PREFETCH(a+4);
x = vld2q_f32((float*)a); // Load the complex data as ar,br,cr,dr; ai,bi,ci,di
// xor the imaginary lane
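/* The conjugate kernel above flips the sign of the imaginary lanes by
   XORing their IEEE-754 sign bits. An illustrative scalar equivalent,
   not from the patch (negate_sign_bit is a hypothetical name): */

#include <stdint.h>
#include <string.h>

static float negate_sign_bit(float x)
{
    uint32_t bits;
    memcpy(&bits, &x, sizeof bits); /* bit-cast without aliasing UB */
    bits ^= 0x80000000u;            /* flip the sign bit only */
    memcpy(&x, &bits, sizeof x);
    return x;
}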
{
inputVal1 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4;
inputVal2 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4;
- __builtin_prefetch(inputVectorPtr + 8);
+ __VOLK_PREFETCH(inputVectorPtr + 8);
// Clip
ret1 = _mm_max_ps(_mm_min_ps(inputVal1, vmax_val), vmin_val);
{
inputVal1 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4;
inputVal2 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4;
- __builtin_prefetch(inputVectorPtr + 8);
+ __VOLK_PREFETCH(inputVectorPtr + 8);
// Clip
ret1 = _mm_max_ps(_mm_min_ps(inputVal1, vmax_val), vmin_val);
{
a = vld1q_f32((const float32_t*)(inputVectorPtr)); inputVectorPtr += 4;
b = vld1q_f32((const float32_t*)(inputVectorPtr)); inputVectorPtr += 4;
- __builtin_prefetch(inputVectorPtr + 8);
+ __VOLK_PREFETCH(inputVectorPtr + 8);
ret1 = vmaxq_f32(vminq_f32(a, max_val), min_val);
ret2 = vmaxq_f32(vminq_f32(b, max_val), min_val);
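/* The three convert hunks above clamp to [vmin, vmax] before the
   float-to-integer conversion so out-of-range inputs saturate instead of
   wrapping. A scalar equivalent of the clip step, for illustration only
   (clip_to_s16 is a hypothetical name): */

static short clip_to_s16(float x, float vmin, float vmax)
{
    if (x > vmax) x = vmax; /* clamp high */
    if (x < vmin) x = vmin; /* clamp low */
    return (short)x;        /* truncation is now in range */
}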
for(number = 0; number < quarter_points; ++number) {
a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
- __builtin_prefetch(a_ptr+8);
- __builtin_prefetch(b_ptr+8);
+ __VOLK_PREFETCH(a_ptr+8);
+ __VOLK_PREFETCH(b_ptr+8);
// do the first multiply
tmp_imag.val[1] = vmulq_f32(a_val.val[1], b_val.val[0]);
for(number = 0; number < quarter_points; ++number) {
a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
- __builtin_prefetch(a_ptr+8);
- __builtin_prefetch(b_ptr+8);
+ __VOLK_PREFETCH(a_ptr+8);
+ __VOLK_PREFETCH(b_ptr+8);
// multiply the real*real and imag*imag to get real result
// a0r*b0r|a1r*b1r|a2r*b2r|a3r*b3r
for(number = 0; number < quarter_points; ++number) {
a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
- __builtin_prefetch(a_ptr+8);
- __builtin_prefetch(b_ptr+8);
+ __VOLK_PREFETCH(a_ptr+8);
+ __VOLK_PREFETCH(b_ptr+8);
// do the first multiply
tmp_imag.val[1] = vmulq_f32(a_val.val[1], b_val.val[0]);
for(number = 0; number < quarter_points; ++number) {
a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
- __builtin_prefetch(a_ptr+8);
- __builtin_prefetch(b_ptr+8);
+ __VOLK_PREFETCH(a_ptr+8);
+ __VOLK_PREFETCH(b_ptr+8);
// use 2 accumulators to remove inter-instruction data dependencies
accumulator1.val[0] = vmlaq_f32(accumulator1.val[0], a_val.val[0], b_val.val[0]);
for(number = 0; number < quarter_points; ++number) {
a_val = vld4q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
b_val = vld4q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
- __builtin_prefetch(a_ptr+8);
- __builtin_prefetch(b_ptr+8);
+ __VOLK_PREFETCH(a_ptr+8);
+ __VOLK_PREFETCH(b_ptr+8);
// use 2 accumulators to remove inter-instruction data dependencies
accumulator1.val[0] = vmlaq_f32(accumulator1.val[0], a_val.val[0], b_val.val[0]);
for(number = 0; number < quarter_points; ++number) {
a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
- __builtin_prefetch(a_ptr+4);
- __builtin_prefetch(b_ptr+4);
+ __VOLK_PREFETCH(a_ptr+4);
+ __VOLK_PREFETCH(b_ptr+4);
// multiply the real*real and imag*imag to get real result
// a0r*b0r|a1r*b1r|a2r*b2r|a3r*b3r
for(number = 0; number < quarter_points; ++number) {
a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
- __builtin_prefetch(a_ptr+4);
- __builtin_prefetch(b_ptr+4);
+ __VOLK_PREFETCH(a_ptr+4);
+ __VOLK_PREFETCH(b_ptr+4);
// do the first multiply
tmp_imag.val[1] = vmulq_f32(a_val.val[1], b_val.val[0]);
a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
b_val.val[1] = vnegq_f32(b_val.val[1]);
- __builtin_prefetch(a_ptr+4);
- __builtin_prefetch(b_ptr+4);
+ __VOLK_PREFETCH(a_ptr+4);
+ __VOLK_PREFETCH(b_ptr+4);
// multiply the real*real and imag*imag to get real result
// a0r*b0r|a1r*b1r|a2r*b2r|a3r*b3r
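/* The final hunk multiplies by a conjugate: vnegq_f32 negates b's
   imaginary lanes, after which an ordinary complex multiply yields
   a * conj(b). An illustrative scalar analogue, not from the patch
   (cplx32 and cmul_conj are hypothetical names): */

typedef struct { float r, i; } cplx32;

static cplx32 cmul_conj(cplx32 a, cplx32 b)
{
    cplx32 c;
    b.i = -b.i;                  /* conj(b), as vnegq_f32 does per lane */
    c.r = a.r * b.r - a.i * b.i;
    c.i = a.r * b.i + a.i * b.r;
    return c;
}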