From d42bf5fd295facb029b8178a198d66a12ea1e0f6 Mon Sep 17 00:00:00 2001 From: Clayton Smith Date: Fri, 8 Dec 2023 16:53:53 -0500 Subject: [PATCH] [PATCH 3/6] Remove broken sse_32 kernels Signed-off-by: Clayton Smith Gbp-Pq: Name 0003-Remove-broken-sse_32-kernels.patch --- .../volk_32fc_x2_conjugate_dot_prod_32fc.h | 131 ------------------ kernels/volk/volk_32fc_x2_dot_prod_32fc.h | 126 ----------------- 2 files changed, 257 deletions(-) diff --git a/kernels/volk/volk_32fc_x2_conjugate_dot_prod_32fc.h b/kernels/volk/volk_32fc_x2_conjugate_dot_prod_32fc.h index b1c1938..c71c7a3 100644 --- a/kernels/volk/volk_32fc_x2_conjugate_dot_prod_32fc.h +++ b/kernels/volk/volk_32fc_x2_conjugate_dot_prod_32fc.h @@ -567,136 +567,5 @@ static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a_sse(lv_32fc_t* result, } #endif -#if LV_HAVE_SSE && LV_HAVE_32 -static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a_sse_32(lv_32fc_t* result, - const lv_32fc_t* input, - const lv_32fc_t* taps, - unsigned int num_points) -{ - - const unsigned int num_bytes = num_points * 8; - - __VOLK_ATTR_ALIGNED(16) - static const uint32_t conjugator[4] = { - 0x00000000, 0x80000000, 0x00000000, 0x80000000 - }; - - int bound = num_bytes >> 4; - int leftovers = num_bytes % 16; - - __VOLK_ASM __VOLK_VOLATILE( - " #pushl %%ebp\n\t" - " #movl %%esp, %%ebp\n\t" - " #movl 12(%%ebp), %%eax # input\n\t" - " #movl 16(%%ebp), %%edx # taps\n\t" - " #movl 20(%%ebp), %%ecx # n_bytes\n\t" - " movaps 0(%[conjugator]), %%xmm1\n\t" - " xorps %%xmm6, %%xmm6 # zero accumulators\n\t" - " movaps 0(%[eax]), %%xmm0\n\t" - " xorps %%xmm7, %%xmm7 # zero accumulators\n\t" - " movaps 0(%[edx]), %%xmm2\n\t" - " movl %[ecx], (%[out])\n\t" - " shrl $5, %[ecx] # ecx = n_2_ccomplex_blocks / 2\n\t" - - " xorps %%xmm1, %%xmm2\n\t" - " jmp .%=L1_test\n\t" - " # 4 taps / loop\n\t" - " # something like ?? cycles / loop\n\t" - ".%=Loop1: \n\t" - "# complex prod: C += A * B, w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t" - "# movaps (%[eax]), %%xmmA\n\t" - "# movaps (%[edx]), %%xmmB\n\t" - "# movaps %%xmmA, %%xmmZ\n\t" - "# shufps $0xb1, %%xmmZ, %%xmmZ # swap internals\n\t" - "# mulps %%xmmB, %%xmmA\n\t" - "# mulps %%xmmZ, %%xmmB\n\t" - "# # SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t" - "# xorps %%xmmPN, %%xmmA\n\t" - "# movaps %%xmmA, %%xmmZ\n\t" - "# unpcklps %%xmmB, %%xmmA\n\t" - "# unpckhps %%xmmB, %%xmmZ\n\t" - "# movaps %%xmmZ, %%xmmY\n\t" - "# shufps $0x44, %%xmmA, %%xmmZ # b01000100\n\t" - "# shufps $0xee, %%xmmY, %%xmmA # b11101110\n\t" - "# addps %%xmmZ, %%xmmA\n\t" - "# addps %%xmmA, %%xmmC\n\t" - "# A=xmm0, B=xmm2, Z=xmm4\n\t" - "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t" - " movaps 16(%[edx]), %%xmm3\n\t" - " movaps %%xmm0, %%xmm4\n\t" - " xorps %%xmm1, %%xmm3\n\t" - " mulps %%xmm2, %%xmm0\n\t" - " movaps 16(%[eax]), %%xmm1\n\t" - " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" - " movaps %%xmm1, %%xmm5\n\t" - " addps %%xmm0, %%xmm6\n\t" - " mulps %%xmm3, %%xmm1\n\t" - " shufps $0xb1, %%xmm5, %%xmm5 # swap internals\n\t" - " addps %%xmm1, %%xmm6\n\t" - " movaps 0(%[conjugator]), %%xmm1\n\t" - " mulps %%xmm4, %%xmm2\n\t" - " movaps 32(%[eax]), %%xmm0\n\t" - " addps %%xmm2, %%xmm7\n\t" - " mulps %%xmm5, %%xmm3\n\t" - " addl $32, %[eax]\n\t" - " movaps 32(%[edx]), %%xmm2\n\t" - " addps %%xmm3, %%xmm7\n\t" - " xorps %%xmm1, %%xmm2\n\t" - " addl $32, %[edx]\n\t" - ".%=L1_test:\n\t" - " decl %[ecx]\n\t" - " jge .%=Loop1\n\t" - " # We've handled the bulk of multiplies up to here.\n\t" - " # Let's sse if original n_2_ccomplex_blocks was odd.\n\t" - " # If so, we've got 2 more taps to do.\n\t" - " movl 0(%[out]), %[ecx] # n_2_ccomplex_blocks\n\t" - " shrl $4, %[ecx]\n\t" - " andl $1, %[ecx]\n\t" - " je .%=Leven\n\t" - " # The count was odd, do 2 more taps.\n\t" - " # Note that we've already got mm0/mm2 preloaded\n\t" - " # from the main loop.\n\t" - " movaps %%xmm0, %%xmm4\n\t" - " mulps %%xmm2, %%xmm0\n\t" - " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" - " addps %%xmm0, %%xmm6\n\t" - " mulps %%xmm4, %%xmm2\n\t" - " addps %%xmm2, %%xmm7\n\t" - ".%=Leven:\n\t" - " # neg inversor\n\t" - " #movl 8(%%ebp), %[eax] \n\t" - " xorps %%xmm1, %%xmm1\n\t" - " movl $0x80000000, (%[out])\n\t" - " movss (%[out]), %%xmm1\n\t" - " shufps $0x11, %%xmm1, %%xmm1 # b00010001 # 0 -0 0 -0\n\t" - " # pfpnacc\n\t" - " xorps %%xmm1, %%xmm6\n\t" - " movaps %%xmm6, %%xmm2\n\t" - " unpcklps %%xmm7, %%xmm6\n\t" - " unpckhps %%xmm7, %%xmm2\n\t" - " movaps %%xmm2, %%xmm3\n\t" - " shufps $0x44, %%xmm6, %%xmm2 # b01000100\n\t" - " shufps $0xee, %%xmm3, %%xmm6 # b11101110\n\t" - " addps %%xmm2, %%xmm6\n\t" - " # xmm6 = r1 i2 r3 i4\n\t" - " #movl 8(%%ebp), %[eax] # @result\n\t" - " movhlps %%xmm6, %%xmm4 # xmm4 = r3 i4 ?? ??\n\t" - " addps %%xmm4, %%xmm6 # xmm6 = r1+r3 i2+i4 ?? ??\n\t" - " movlps %%xmm6, (%[out]) # store low 2x32 bits (complex) " - "to memory\n\t" - " #popl %%ebp\n\t" - : - : [eax] "r"(input), - [edx] "r"(taps), - [ecx] "r"(num_bytes), - [out] "r"(result), - [conjugator] "r"(conjugator)); - - for (; leftovers > 0; leftovers -= 8) { - *result += (input[(bound << 1)] * lv_conj(taps[(bound << 1)])); - } -} -#endif /*LV_HAVE_SSE*/ - #endif /*INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_a_H*/ diff --git a/kernels/volk/volk_32fc_x2_dot_prod_32fc.h b/kernels/volk/volk_32fc_x2_dot_prod_32fc.h index 16851f8..7d9d4d4 100644 --- a/kernels/volk/volk_32fc_x2_dot_prod_32fc.h +++ b/kernels/volk/volk_32fc_x2_dot_prod_32fc.h @@ -651,132 +651,6 @@ static inline void volk_32fc_x2_dot_prod_32fc_a_sse_64(lv_32fc_t* result, #endif -#if LV_HAVE_SSE && LV_HAVE_32 - -static inline void volk_32fc_x2_dot_prod_32fc_a_sse_32(lv_32fc_t* result, - const lv_32fc_t* input, - const lv_32fc_t* taps, - unsigned int num_points) -{ - - volk_32fc_x2_dot_prod_32fc_generic(result, input, taps, num_points); - -#if 0 - const unsigned int num_bytes = num_points*8; - unsigned int isodd = num_points & 1; - - __VOLK_ASM __VOLK_VOLATILE - ( - " #pushl %%ebp\n\t" - " #movl %%esp, %%ebp\n\t" - " movl 12(%%ebp), %%eax # input\n\t" - " movl 16(%%ebp), %%edx # taps\n\t" - " movl 20(%%ebp), %%ecx # n_bytes\n\t" - " xorps %%xmm6, %%xmm6 # zero accumulators\n\t" - " movaps 0(%%eax), %%xmm0\n\t" - " xorps %%xmm7, %%xmm7 # zero accumulators\n\t" - " movaps 0(%%edx), %%xmm2\n\t" - " shrl $5, %%ecx # ecx = n_2_ccomplex_blocks / 2\n\t" - " jmp .%=L1_test\n\t" - " # 4 taps / loop\n\t" - " # something like ?? cycles / loop\n\t" - ".%=Loop1: \n\t" - "# complex prod: C += A * B, w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t" - "# movaps (%%eax), %%xmmA\n\t" - "# movaps (%%edx), %%xmmB\n\t" - "# movaps %%xmmA, %%xmmZ\n\t" - "# shufps $0xb1, %%xmmZ, %%xmmZ # swap internals\n\t" - "# mulps %%xmmB, %%xmmA\n\t" - "# mulps %%xmmZ, %%xmmB\n\t" - "# # SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t" - "# xorps %%xmmPN, %%xmmA\n\t" - "# movaps %%xmmA, %%xmmZ\n\t" - "# unpcklps %%xmmB, %%xmmA\n\t" - "# unpckhps %%xmmB, %%xmmZ\n\t" - "# movaps %%xmmZ, %%xmmY\n\t" - "# shufps $0x44, %%xmmA, %%xmmZ # b01000100\n\t" - "# shufps $0xee, %%xmmY, %%xmmA # b11101110\n\t" - "# addps %%xmmZ, %%xmmA\n\t" - "# addps %%xmmA, %%xmmC\n\t" - "# A=xmm0, B=xmm2, Z=xmm4\n\t" - "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t" - " movaps 16(%%eax), %%xmm1\n\t" - " movaps %%xmm0, %%xmm4\n\t" - " mulps %%xmm2, %%xmm0\n\t" - " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" - " movaps 16(%%edx), %%xmm3\n\t" - " movaps %%xmm1, %%xmm5\n\t" - " addps %%xmm0, %%xmm6\n\t" - " mulps %%xmm3, %%xmm1\n\t" - " shufps $0xb1, %%xmm5, %%xmm5 # swap internals\n\t" - " addps %%xmm1, %%xmm6\n\t" - " mulps %%xmm4, %%xmm2\n\t" - " movaps 32(%%eax), %%xmm0\n\t" - " addps %%xmm2, %%xmm7\n\t" - " mulps %%xmm5, %%xmm3\n\t" - " addl $32, %%eax\n\t" - " movaps 32(%%edx), %%xmm2\n\t" - " addps %%xmm3, %%xmm7\n\t" - " addl $32, %%edx\n\t" - ".%=L1_test:\n\t" - " decl %%ecx\n\t" - " jge .%=Loop1\n\t" - " # We've handled the bulk of multiplies up to here.\n\t" - " # Let's sse if original n_2_ccomplex_blocks was odd.\n\t" - " # If so, we've got 2 more taps to do.\n\t" - " movl 20(%%ebp), %%ecx # n_2_ccomplex_blocks\n\t" - " shrl $4, %%ecx\n\t" - " andl $1, %%ecx\n\t" - " je .%=Leven\n\t" - " # The count was odd, do 2 more taps.\n\t" - " # Note that we've already got mm0/mm2 preloaded\n\t" - " # from the main loop.\n\t" - " movaps %%xmm0, %%xmm4\n\t" - " mulps %%xmm2, %%xmm0\n\t" - " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" - " addps %%xmm0, %%xmm6\n\t" - " mulps %%xmm4, %%xmm2\n\t" - " addps %%xmm2, %%xmm7\n\t" - ".%=Leven:\n\t" - " # neg inversor\n\t" - " movl 8(%%ebp), %%eax \n\t" - " xorps %%xmm1, %%xmm1\n\t" - " movl $0x80000000, (%%eax)\n\t" - " movss (%%eax), %%xmm1\n\t" - " shufps $0x11, %%xmm1, %%xmm1 # b00010001 # 0 -0 0 -0\n\t" - " # pfpnacc\n\t" - " xorps %%xmm1, %%xmm6\n\t" - " movaps %%xmm6, %%xmm2\n\t" - " unpcklps %%xmm7, %%xmm6\n\t" - " unpckhps %%xmm7, %%xmm2\n\t" - " movaps %%xmm2, %%xmm3\n\t" - " shufps $0x44, %%xmm6, %%xmm2 # b01000100\n\t" - " shufps $0xee, %%xmm3, %%xmm6 # b11101110\n\t" - " addps %%xmm2, %%xmm6\n\t" - " # xmm6 = r1 i2 r3 i4\n\t" - " #movl 8(%%ebp), %%eax # @result\n\t" - " movhlps %%xmm6, %%xmm4 # xmm4 = r3 i4 ?? ??\n\t" - " addps %%xmm4, %%xmm6 # xmm6 = r1+r3 i2+i4 ?? ??\n\t" - " movlps %%xmm6, (%%eax) # store low 2x32 bits (complex) to memory\n\t" - " #popl %%ebp\n\t" - : - : - : "eax", "ecx", "edx" - ); - - - int getem = num_bytes % 16; - - if(isodd) { - *result += (input[num_points - 1] * taps[num_points - 1]); - } - - return; -#endif -} - -#endif /*LV_HAVE_SSE*/ - #ifdef LV_HAVE_SSE3 #include -- 2.30.2