}
#endif
-#if LV_HAVE_SSE && LV_HAVE_32
-static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a_sse_32(lv_32fc_t* result,
- const lv_32fc_t* input,
- const lv_32fc_t* taps,
- unsigned int num_points)
-{
-/*
- * Conjugate dot product for 32-bit x86 using SSE inline assembly: stores in
- * *result the sum over i in [0, num_points) of input[i] * lv_conj(taps[i]).
- *
- * result     - output; overwritten (not accumulated into), and also used as
- *              32-bit scratch storage by the assembly before the final store.
- * input      - first operand vector, read 16 bytes at a time with movaps.
- * taps       - second operand vector, read with movaps and conjugated.
- * num_points - number of complex elements in input and taps.
- *
- * NOTE(review): assumes lv_32fc_t is an interleaved (real, imag) pair of
- * 32-bit floats, i.e. 8 bytes per element; TODO confirm.
- * NOTE(review): movaps requires 16-byte-aligned addresses, so input and taps
- * are presumably expected to be 16-byte aligned; confirm against callers.
- * NOTE(review): the assembly loads 16 bytes from input and taps before any
- * length test and preloads the next block inside the loop, so it can read
- * beyond the last element (also when num_points is 0); confirm that the
- * buffers are padded accordingly.
- */
- const unsigned int num_bytes = num_points * 8; /* total input size in bytes, 8 per element */
-/* Sign-bit mask for lanes 1 and 3; XORed into each taps vector to negate those lanes. */
- __VOLK_ATTR_ALIGNED(16)
- static const uint32_t conjugator[4] = {
- 0x00000000, 0x80000000, 0x00000000, 0x80000000
- };
-
- int bound = num_bytes >> 4; /* number of whole 16-byte blocks; used below to index the tail element */
- int leftovers = num_bytes % 16; /* 0 or 8: bytes not covered by whole 16-byte blocks */
-/*
- * Operand names mirror the registers of the commented-out stack-based
- * prologue: %[eax] = input, %[edx] = taps, %[ecx] = byte count,
- * %[out] = result. xmm6 accumulates input x taps products and xmm7
- * accumulates lane-swapped-input x taps products; the two are merged into a
- * single complex value after the .Leven label.
- * NOTE(review): the asm modifies %[eax], %[edx] and %[ecx] although they are
- * declared as input-only operands, and it declares no outputs or clobbers
- * even though it writes xmm0-xmm7, the flags and memory at result; verify
- * that this is safe with the compilers in use.
- */
- __VOLK_ASM __VOLK_VOLATILE(
- " #pushl %%ebp\n\t"
- " #movl %%esp, %%ebp\n\t"
- " #movl 12(%%ebp), %%eax # input\n\t"
- " #movl 16(%%ebp), %%edx # taps\n\t"
- " #movl 20(%%ebp), %%ecx # n_bytes\n\t"
- " movaps 0(%[conjugator]), %%xmm1\n\t" /* xmm1 = sign mask for lanes 1 and 3 */
- " xorps %%xmm6, %%xmm6 # zero accumulators\n\t"
- " movaps 0(%[eax]), %%xmm0\n\t" /* preload first 16 bytes of input */
- " xorps %%xmm7, %%xmm7 # zero accumulators\n\t"
- " movaps 0(%[edx]), %%xmm2\n\t" /* preload first 16 bytes of taps */
- " movl %[ecx], (%[out])\n\t" /* stash the byte count in *result for the odd-block test after the loop */
- " shrl $5, %[ecx] # ecx = n_2_ccomplex_blocks / 2\n\t" /* loop count = byte count / 32 */
-
- " xorps %%xmm1, %%xmm2\n\t" /* negate lanes 1 and 3 of the preloaded taps */
- " jmp .%=L1_test\n\t"
- " # 4 taps / loop\n\t"
- " # something like ?? cycles / loop\n\t"
- ".%=Loop1: \n\t"
- "# complex prod: C += A * B, w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t"
- "# movaps (%[eax]), %%xmmA\n\t"
- "# movaps (%[edx]), %%xmmB\n\t"
- "# movaps %%xmmA, %%xmmZ\n\t"
- "# shufps $0xb1, %%xmmZ, %%xmmZ # swap internals\n\t"
- "# mulps %%xmmB, %%xmmA\n\t"
- "# mulps %%xmmZ, %%xmmB\n\t"
- "# # SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t"
- "# xorps %%xmmPN, %%xmmA\n\t"
- "# movaps %%xmmA, %%xmmZ\n\t"
- "# unpcklps %%xmmB, %%xmmA\n\t"
- "# unpckhps %%xmmB, %%xmmZ\n\t"
- "# movaps %%xmmZ, %%xmmY\n\t"
- "# shufps $0x44, %%xmmA, %%xmmZ # b01000100\n\t"
- "# shufps $0xee, %%xmmY, %%xmmA # b11101110\n\t"
- "# addps %%xmmZ, %%xmmA\n\t"
- "# addps %%xmmA, %%xmmC\n\t"
- "# A=xmm0, B=xmm2, Z=xmm4\n\t"
- "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t"
- " movaps 16(%[edx]), %%xmm3\n\t" /* second 16-byte block of taps */
- " movaps %%xmm0, %%xmm4\n\t"
- " xorps %%xmm1, %%xmm3\n\t" /* negate lanes 1 and 3 while xmm1 still holds the mask */
- " mulps %%xmm2, %%xmm0\n\t"
- " movaps 16(%[eax]), %%xmm1\n\t" /* xmm1 is reused for input data from here on */
- " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t"
- " movaps %%xmm1, %%xmm5\n\t"
- " addps %%xmm0, %%xmm6\n\t"
- " mulps %%xmm3, %%xmm1\n\t"
- " shufps $0xb1, %%xmm5, %%xmm5 # swap internals\n\t"
- " addps %%xmm1, %%xmm6\n\t"
- " movaps 0(%[conjugator]), %%xmm1\n\t" /* restore the sign mask overwritten above */
- " mulps %%xmm4, %%xmm2\n\t"
- " movaps 32(%[eax]), %%xmm0\n\t" /* preload input for the next iteration or the odd block */
- " addps %%xmm2, %%xmm7\n\t"
- " mulps %%xmm5, %%xmm3\n\t"
- " addl $32, %[eax]\n\t" /* advance input by 32 bytes */
- " movaps 32(%[edx]), %%xmm2\n\t" /* preload taps; offset 32 because %[edx] is advanced only below */
- " addps %%xmm3, %%xmm7\n\t"
- " xorps %%xmm1, %%xmm2\n\t" /* negate lanes 1 and 3 of the preloaded taps */
- " addl $32, %[edx]\n\t" /* advance taps by 32 bytes */
- ".%=L1_test:\n\t"
- " decl %[ecx]\n\t"
- " jge .%=Loop1\n\t"
- " # We've handled the bulk of multiplies up to here.\n\t"
- " # Let's sse if original n_2_ccomplex_blocks was odd.\n\t"
- " # If so, we've got 2 more taps to do.\n\t"
- " movl 0(%[out]), %[ecx] # n_2_ccomplex_blocks\n\t" /* reload the byte count stashed in *result */
- " shrl $4, %[ecx]\n\t"
- " andl $1, %[ecx]\n\t" /* nonzero when the number of 16-byte blocks is odd */
- " je .%=Leven\n\t"
- " # The count was odd, do 2 more taps.\n\t"
- " # Note that we've already got mm0/mm2 preloaded\n\t"
- " # from the main loop.\n\t"
- " movaps %%xmm0, %%xmm4\n\t"
- " mulps %%xmm2, %%xmm0\n\t"
- " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t"
- " addps %%xmm0, %%xmm6\n\t"
- " mulps %%xmm4, %%xmm2\n\t"
- " addps %%xmm2, %%xmm7\n\t"
- ".%=Leven:\n\t"
- " # neg inversor\n\t"
- " #movl 8(%%ebp), %[eax] \n\t"
- " xorps %%xmm1, %%xmm1\n\t"
- " movl $0x80000000, (%[out])\n\t" /* build the sign mask through *result, used as scratch */
- " movss (%[out]), %%xmm1\n\t"
- " shufps $0x11, %%xmm1, %%xmm1 # b00010001 # 0 -0 0 -0\n\t"
- " # pfpnacc\n\t"
- " xorps %%xmm1, %%xmm6\n\t" /* flip the sign of lanes 1 and 3 of the xmm6 accumulator */
- " movaps %%xmm6, %%xmm2\n\t"
- " unpcklps %%xmm7, %%xmm6\n\t"
- " unpckhps %%xmm7, %%xmm2\n\t"
- " movaps %%xmm2, %%xmm3\n\t"
- " shufps $0x44, %%xmm6, %%xmm2 # b01000100\n\t"
- " shufps $0xee, %%xmm3, %%xmm6 # b11101110\n\t"
- " addps %%xmm2, %%xmm6\n\t"
- " # xmm6 = r1 i2 r3 i4\n\t"
- " #movl 8(%%ebp), %[eax] # @result\n\t"
- " movhlps %%xmm6, %%xmm4 # xmm4 = r3 i4 ?? ??\n\t"
- " addps %%xmm4, %%xmm6 # xmm6 = r1+r3 i2+i4 ?? ??\n\t"
- " movlps %%xmm6, (%[out]) # store low 2x32 bits (complex) "
- "to memory\n\t"
- " #popl %%ebp\n\t"
- : /* no output operands are declared */
- : [eax] "r"(input),
- [edx] "r"(taps),
- [ecx] "r"(num_bytes),
- [out] "r"(result),
- [conjugator] "r"(conjugator));
-/*
- * Tail: leftovers is 8 exactly when num_points is odd. In that case
- * bound << 1 is the index of the last element, which the assembly above did
- * not include, so the loop body runs at most once.
- */
- for (; leftovers > 0; leftovers -= 8) {
- *result += (input[(bound << 1)] * lv_conj(taps[(bound << 1)]));
- }
-}
-#endif /*LV_HAVE_SSE && LV_HAVE_32*/
-
#endif /*INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_a_H*/
#endif
-#if LV_HAVE_SSE && LV_HAVE_32
-
-static inline void volk_32fc_x2_dot_prod_32fc_a_sse_32(lv_32fc_t* result,
- const lv_32fc_t* input,
- const lv_32fc_t* taps,
- unsigned int num_points)
-{
-/*
- * 32-bit SSE entry point for the complex dot product. The only active
- * statement forwards all four arguments unchanged to
- * volk_32fc_x2_dot_prod_32fc_generic(); the hand-written assembly further
- * down is compiled out with #if 0.
- *
- * result     - output location, passed through to the generic kernel.
- * input      - first operand vector, passed through.
- * taps       - second operand vector, passed through.
- * num_points - element count, passed through.
- */
- volk_32fc_x2_dot_prod_32fc_generic(result, input, taps, num_points);
-/*
- * Disabled implementation, kept for reference. It fetches its arguments
- * through fixed %ebp stack offsets while the frame-setup instructions are
- * commented out, and it declares no asm operands.
- * NOTE(review): presumably disabled for that reason; confirm before
- * re-enabling.
- */
-#if 0
- const unsigned int num_bytes = num_points*8;
- unsigned int isodd = num_points & 1;
-
- __VOLK_ASM __VOLK_VOLATILE
- (
- " #pushl %%ebp\n\t"
- " #movl %%esp, %%ebp\n\t"
- " movl 12(%%ebp), %%eax # input\n\t"
- " movl 16(%%ebp), %%edx # taps\n\t"
- " movl 20(%%ebp), %%ecx # n_bytes\n\t"
- " xorps %%xmm6, %%xmm6 # zero accumulators\n\t"
- " movaps 0(%%eax), %%xmm0\n\t"
- " xorps %%xmm7, %%xmm7 # zero accumulators\n\t"
- " movaps 0(%%edx), %%xmm2\n\t"
- " shrl $5, %%ecx # ecx = n_2_ccomplex_blocks / 2\n\t"
- " jmp .%=L1_test\n\t"
- " # 4 taps / loop\n\t"
- " # something like ?? cycles / loop\n\t"
- ".%=Loop1: \n\t"
- "# complex prod: C += A * B, w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t"
- "# movaps (%%eax), %%xmmA\n\t"
- "# movaps (%%edx), %%xmmB\n\t"
- "# movaps %%xmmA, %%xmmZ\n\t"
- "# shufps $0xb1, %%xmmZ, %%xmmZ # swap internals\n\t"
- "# mulps %%xmmB, %%xmmA\n\t"
- "# mulps %%xmmZ, %%xmmB\n\t"
- "# # SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t"
- "# xorps %%xmmPN, %%xmmA\n\t"
- "# movaps %%xmmA, %%xmmZ\n\t"
- "# unpcklps %%xmmB, %%xmmA\n\t"
- "# unpckhps %%xmmB, %%xmmZ\n\t"
- "# movaps %%xmmZ, %%xmmY\n\t"
- "# shufps $0x44, %%xmmA, %%xmmZ # b01000100\n\t"
- "# shufps $0xee, %%xmmY, %%xmmA # b11101110\n\t"
- "# addps %%xmmZ, %%xmmA\n\t"
- "# addps %%xmmA, %%xmmC\n\t"
- "# A=xmm0, B=xmm2, Z=xmm4\n\t"
- "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t"
- " movaps 16(%%eax), %%xmm1\n\t"
- " movaps %%xmm0, %%xmm4\n\t"
- " mulps %%xmm2, %%xmm0\n\t"
- " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t"
- " movaps 16(%%edx), %%xmm3\n\t"
- " movaps %%xmm1, %%xmm5\n\t"
- " addps %%xmm0, %%xmm6\n\t"
- " mulps %%xmm3, %%xmm1\n\t"
- " shufps $0xb1, %%xmm5, %%xmm5 # swap internals\n\t"
- " addps %%xmm1, %%xmm6\n\t"
- " mulps %%xmm4, %%xmm2\n\t"
- " movaps 32(%%eax), %%xmm0\n\t"
- " addps %%xmm2, %%xmm7\n\t"
- " mulps %%xmm5, %%xmm3\n\t"
- " addl $32, %%eax\n\t"
- " movaps 32(%%edx), %%xmm2\n\t"
- " addps %%xmm3, %%xmm7\n\t"
- " addl $32, %%edx\n\t"
- ".%=L1_test:\n\t"
- " decl %%ecx\n\t"
- " jge .%=Loop1\n\t"
- " # We've handled the bulk of multiplies up to here.\n\t"
- " # Let's sse if original n_2_ccomplex_blocks was odd.\n\t"
- " # If so, we've got 2 more taps to do.\n\t"
- " movl 20(%%ebp), %%ecx # n_2_ccomplex_blocks\n\t"
- " shrl $4, %%ecx\n\t"
- " andl $1, %%ecx\n\t"
- " je .%=Leven\n\t"
- " # The count was odd, do 2 more taps.\n\t"
- " # Note that we've already got mm0/mm2 preloaded\n\t"
- " # from the main loop.\n\t"
- " movaps %%xmm0, %%xmm4\n\t"
- " mulps %%xmm2, %%xmm0\n\t"
- " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t"
- " addps %%xmm0, %%xmm6\n\t"
- " mulps %%xmm4, %%xmm2\n\t"
- " addps %%xmm2, %%xmm7\n\t"
- ".%=Leven:\n\t"
- " # neg inversor\n\t"
- " movl 8(%%ebp), %%eax \n\t"
- " xorps %%xmm1, %%xmm1\n\t"
- " movl $0x80000000, (%%eax)\n\t"
- " movss (%%eax), %%xmm1\n\t"
- " shufps $0x11, %%xmm1, %%xmm1 # b00010001 # 0 -0 0 -0\n\t"
- " # pfpnacc\n\t"
- " xorps %%xmm1, %%xmm6\n\t"
- " movaps %%xmm6, %%xmm2\n\t"
- " unpcklps %%xmm7, %%xmm6\n\t"
- " unpckhps %%xmm7, %%xmm2\n\t"
- " movaps %%xmm2, %%xmm3\n\t"
- " shufps $0x44, %%xmm6, %%xmm2 # b01000100\n\t"
- " shufps $0xee, %%xmm3, %%xmm6 # b11101110\n\t"
- " addps %%xmm2, %%xmm6\n\t"
- " # xmm6 = r1 i2 r3 i4\n\t"
- " #movl 8(%%ebp), %%eax # @result\n\t"
- " movhlps %%xmm6, %%xmm4 # xmm4 = r3 i4 ?? ??\n\t"
- " addps %%xmm4, %%xmm6 # xmm6 = r1+r3 i2+i4 ?? ??\n\t"
- " movlps %%xmm6, (%%eax) # store low 2x32 bits (complex) to memory\n\t"
- " #popl %%ebp\n\t"
- : /* no output operands */
- : /* no input operands; arguments are read from the stack via %ebp */
- : "eax", "ecx", "edx"
- );
-
-
- int getem = num_bytes % 16; /* not referenced again in this function */
-
- if(isodd) { /* odd element count: add the last product, without conjugation */
- *result += (input[num_points - 1] * taps[num_points - 1]);
- }
-
- return;
-#endif
-}
-
-#endif /*LV_HAVE_SSE && LV_HAVE_32*/
-
#ifdef LV_HAVE_SSE3
#include <pmmintrin.h>