From: A. Maitland Bottoms
Date: Sat, 28 Mar 2020 01:48:10 +0000 (+0000)
Subject: volk (2.2.1-2) unstable; urgency=medium
X-Git-Tag: archive/raspbian/2.2.1-2+rpi1^2~12
X-Git-Url: https://dgit.raspbian.org/?a=commitdiff_plain;h=1f718f06776ad35df030ae539e73a1c6d41adfaa;p=volk.git

volk (2.2.1-2) unstable; urgency=medium

  * update to v2.2.1-11-gfaf230e
  * cmake: Remove the ORC from the VOLK public link interface
  * Fix the broken index max kernels

[dgit import unpatched volk 2.2.1-2]
---

1f718f06776ad35df030ae539e73a1c6d41adfaa

diff --cc debian/1.3_to_1.4_compat_report.html
index 0000000,0000000..f9614d6
new file mode 100644
--- /dev/null
+++ b/debian/1.3_to_1.4_compat_report.html
@@@ -1,0 -1,0 +1,1069 @@@
++libvolk1-dev: 1.3-3 to 1.4-1 compatibility report

API compatibility report for the libvolk1-dev library between 1.3-3 and 1.4-1 versions on x86_64

++ Binary Compatibility | Source Compatibility

Test Info


Library Name: libvolk1-dev
Version #1: 1.3-3
Version #2: 1.4-1
Arch: x86_64
GCC Version: 7
Subject: Binary Compatibility

Test Results


Total Header Files: 135
Total Libraries: 1
Total Symbols / Types: 614 / 233
Compatibility: 99.8%

Problem Summary


Added Symbols: 45
Removed Symbols (High): 0
Problems with Data Types: High 0, Medium 0, Low 1
Problems with Symbols: High 1, Medium 0, Low 0
Problems with Constants: Low 1

Added Symbols  45 


++volk.h, libvolk.so.1.4
++volk_32f_64f_add_64f [data]
++volk_32f_64f_add_64f_a [data]
++volk_32f_64f_add_64f_get_func_desc ( )
++volk_32f_64f_add_64f_manual ( double* cVector, float const* aVector, double const* bVector, unsigned int num_points, char const* impl_name )
++volk_32f_64f_add_64f_u [data]
++volk_32f_64f_multiply_64f [data]
++volk_32f_64f_multiply_64f_a [data]
++volk_32f_64f_multiply_64f_get_func_desc ( )
++volk_32f_64f_multiply_64f_manual ( double* cVector, float const* aVector, double const* bVector, unsigned int num_points, char const* impl_name )
++volk_32f_64f_multiply_64f_u [data]
++volk_32f_s32f_mod_rangepuppet_32f [data]
++volk_32f_s32f_mod_rangepuppet_32f_a [data]
++volk_32f_s32f_mod_rangepuppet_32f_get_func_desc ( )
++volk_32f_s32f_mod_rangepuppet_32f_manual ( float* output, float const* input, float bound, unsigned int num_points, char const* impl_name )
++volk_32f_s32f_mod_rangepuppet_32f_u [data]
++volk_32f_s32f_s32f_mod_range_32f [data]
++volk_32f_s32f_s32f_mod_range_32f_a [data]
++volk_32f_s32f_s32f_mod_range_32f_get_func_desc ( )
++volk_32f_s32f_s32f_mod_range_32f_manual ( float* outputVector, float const* inputVector, float const lower_bound, float const upper_bound, unsigned int num_points, char const* impl_name )
++volk_32f_s32f_s32f_mod_range_32f_u [data]
++volk_32fc_32f_add_32fc [data]
++volk_32fc_32f_add_32fc_a [data]
++volk_32fc_32f_add_32fc_get_func_desc ( )
++volk_32fc_32f_add_32fc_manual ( lv_32fc_t* cVector, lv_32fc_t const* aVector, float const* bVector, unsigned int num_points, char const* impl_name )
++volk_32fc_32f_add_32fc_u [data]
++volk_32fc_x2_add_32fc [data]
++volk_32fc_x2_add_32fc_a [data]
++volk_32fc_x2_add_32fc_get_func_desc ( )
++volk_32fc_x2_add_32fc_manual ( lv_32fc_t* cVector, lv_32fc_t const* aVector, lv_32fc_t const* bVector, unsigned int num_points, char const* impl_name )
++volk_32fc_x2_add_32fc_u [data]
++volk_32u_reverse_32u [data]
++volk_32u_reverse_32u_a [data]
++volk_32u_reverse_32u_get_func_desc ( )
++volk_32u_reverse_32u_manual ( uint32_t* out, uint32_t const* in, unsigned int num_points, char const* impl_name )
++volk_32u_reverse_32u_u [data]
++volk_64f_x2_add_64f [data]
++volk_64f_x2_add_64f_a [data]
++volk_64f_x2_add_64f_get_func_desc ( )
++volk_64f_x2_add_64f_manual ( double* cVector, double const* aVector, double const* bVector, unsigned int num_points, char const* impl_name )
++volk_64f_x2_add_64f_u [data]
++volk_64f_x2_multiply_64f [data]
++volk_64f_x2_multiply_64f_a [data]
++volk_64f_x2_multiply_64f_get_func_desc ( )
++volk_64f_x2_multiply_64f_manual ( double* cVector, double const* aVector, double const* bVector, unsigned int num_points, char const* impl_name )
++volk_64f_x2_multiply_64f_u [data]
++
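For orientation (an added note, not part of the generated report): each kernel above is exported as a dispatcher symbol plus aligned (_a), unaligned (_u), _manual and _get_func_desc variants. A minimal C sketch of how one of the newly added kernels, volk_32fc_x2_add_32fc, is typically called; the buffer size and the "generic" implementation name are illustrative assumptions:

    #include <volk/volk.h>
    #include <stdio.h>

    int main(void)
    {
        unsigned int num_points = 1024;        /* illustrative size */
        size_t alignment = volk_get_alignment();

        /* volk_malloc returns SIMD-aligned buffers, so the dispatcher can
           pick an aligned (_a) protokernel at run time. */
        lv_32fc_t* a = (lv_32fc_t*)volk_malloc(num_points * sizeof(lv_32fc_t), alignment);
        lv_32fc_t* b = (lv_32fc_t*)volk_malloc(num_points * sizeof(lv_32fc_t), alignment);
        lv_32fc_t* c = (lv_32fc_t*)volk_malloc(num_points * sizeof(lv_32fc_t), alignment);

        for (unsigned int i = 0; i < num_points; i++) {
            a[i] = lv_cmake(1.0f, 0.0f);
            b[i] = lv_cmake(0.0f, 1.0f);
        }

        /* dispatcher symbol: VOLK selects the best implementation */
        volk_32fc_x2_add_32fc(c, a, b, num_points);

        /* _manual variant: force a named implementation, e.g. "generic" */
        volk_32fc_x2_add_32fc_manual(c, a, b, num_points, "generic");

        printf("c[0] = %f%+fi\n", lv_creal(c[0]), lv_cimag(c[0]));

        volk_free(a);
        volk_free(b);
        volk_free(c);
        return 0;
    }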
++ ++

Problems with Symbols, High Severity  1 


++volk.h, libvolk.so.1.3
++ ++[+] volk_32f_8u_polarbutterfly_32f_manual ( float* llrs, unsigned char* u, int const frame_size, int const frame_exp, int const stage, int const u_num, int const row, char const* impl_name )  1  ++
++ ++
++ ++

Problems with Data Types, Low Severity  1 


++volk_typedefs.h
++ ++[+] typedef p_32f_8u_polarbutterfly_32f  1  ++
++ ++ ++
++ ++

Problems with Constants, Low Severity  1 


++volk_32f_log2_32f.h
++ ++[+] LOG_POLY_DEGREE ++
++ ++ ++
++

Header Files  135 


++
++constants.h
++saturation_arithmetic.h
++volk.h
++volk_16i_32fc_dot_prod_32fc.h
++volk_16i_branch_4_state_8.h
++volk_16i_convert_8i.h
++volk_16i_max_star_16i.h
++volk_16i_max_star_horizontal_16i.h
++volk_16i_permute_and_scalar_add.h
++volk_16i_s32f_convert_32f.h
++volk_16i_x4_quad_max_star_16i.h
++volk_16i_x5_add_quad_16i_x4.h
++volk_16ic_convert_32fc.h
++volk_16ic_deinterleave_16i_x2.h
++volk_16ic_deinterleave_real_16i.h
++volk_16ic_deinterleave_real_8i.h
++volk_16ic_magnitude_16i.h
++volk_16ic_s32f_deinterleave_32f_x2.h
++volk_16ic_s32f_deinterleave_real_32f.h
++volk_16ic_s32f_magnitude_32f.h
++volk_16ic_x2_dot_prod_16ic.h
++volk_16ic_x2_multiply_16ic.h
++volk_16u_byteswap.h
++volk_16u_byteswappuppet_16u.h
++volk_32f_8u_polarbutterfly_32f.h
++volk_32f_8u_polarbutterflypuppet_32f.h
++volk_32f_accumulator_s32f.h
++volk_32f_acos_32f.h
++volk_32f_asin_32f.h
++volk_32f_atan_32f.h
++volk_32f_binary_slicer_32i.h
++volk_32f_binary_slicer_8i.h
++volk_32f_convert_64f.h
++volk_32f_cos_32f.h
++volk_32f_expfast_32f.h
++volk_32f_index_max_16u.h
++volk_32f_index_max_32u.h
++volk_32f_invsqrt_32f.h
++volk_32f_log2_32f.h
++volk_32f_null_32f.h
++volk_32f_s32f_32f_fm_detect_32f.h
++volk_32f_s32f_calc_spectral_noise_floor_32f.h
++volk_32f_s32f_convert_16i.h
++volk_32f_s32f_convert_32i.h
++volk_32f_s32f_convert_8i.h
++volk_32f_s32f_multiply_32f.h
++volk_32f_s32f_normalize.h
++volk_32f_s32f_power_32f.h
++volk_32f_s32f_stddev_32f.h
++volk_32f_sin_32f.h
++volk_32f_sqrt_32f.h
++volk_32f_stddev_and_mean_32f_x2.h
++volk_32f_tan_32f.h
++volk_32f_tanh_32f.h
++volk_32f_x2_add_32f.h
++volk_32f_x2_divide_32f.h
++volk_32f_x2_dot_prod_16i.h
++volk_32f_x2_dot_prod_32f.h
++volk_32f_x2_fm_detectpuppet_32f.h
++volk_32f_x2_interleave_32fc.h
++volk_32f_x2_max_32f.h
++volk_32f_x2_min_32f.h
++volk_32f_x2_multiply_32f.h
++volk_32f_x2_pow_32f.h
++volk_32f_x2_s32f_interleave_16ic.h
++volk_32f_x2_subtract_32f.h
++volk_32f_x3_sum_of_poly_32f.h
++volk_32fc_32f_dot_prod_32fc.h
++volk_32fc_32f_multiply_32fc.h
++volk_32fc_conjugate_32fc.h
++volk_32fc_convert_16ic.h
++volk_32fc_deinterleave_32f_x2.h
++volk_32fc_deinterleave_64f_x2.h
++volk_32fc_deinterleave_imag_32f.h
++volk_32fc_deinterleave_real_32f.h
++volk_32fc_deinterleave_real_64f.h
++volk_32fc_index_max_16u.h
++volk_32fc_index_max_32u.h
++volk_32fc_magnitude_32f.h
++volk_32fc_magnitude_squared_32f.h
++volk_32fc_s32f_atan2_32f.h
++volk_32fc_s32f_deinterleave_real_16i.h
++volk_32fc_s32f_magnitude_16i.h
++volk_32fc_s32f_power_32fc.h
++volk_32fc_s32f_power_spectrum_32f.h
++volk_32fc_s32f_x2_power_spectral_density_32f.h
++volk_32fc_s32fc_multiply_32fc.h
++volk_32fc_s32fc_rotatorpuppet_32fc.h
++volk_32fc_s32fc_x2_rotator_32fc.h
++volk_32fc_x2_conjugate_dot_prod_32fc.h
++volk_32fc_x2_divide_32fc.h
++volk_32fc_x2_dot_prod_32fc.h
++volk_32fc_x2_multiply_32fc.h
++volk_32fc_x2_multiply_conjugate_32fc.h
++volk_32fc_x2_s32f_square_dist_scalar_mult_32f.h
++volk_32fc_x2_square_dist_32f.h
++volk_32i_s32f_convert_32f.h
++volk_32i_x2_and_32i.h
++volk_32i_x2_or_32i.h
++volk_32u_byteswap.h
++volk_32u_byteswappuppet_32u.h
++volk_32u_popcnt.h
++volk_32u_popcntpuppet_32u.h
++volk_64f_convert_32f.h
++volk_64f_x2_max_64f.h
++volk_64f_x2_min_64f.h
++volk_64u_byteswap.h
++volk_64u_byteswappuppet_64u.h
++volk_64u_popcnt.h
++volk_64u_popcntpuppet_64u.h
++volk_8i_convert_16i.h
++volk_8i_s32f_convert_32f.h
++volk_8ic_deinterleave_16i_x2.h
++volk_8ic_deinterleave_real_16i.h
++volk_8ic_deinterleave_real_8i.h
++volk_8ic_s32f_deinterleave_32f_x2.h
++volk_8ic_s32f_deinterleave_real_32f.h
++volk_8ic_x2_multiply_conjugate_16ic.h
++volk_8ic_x2_s32f_multiply_conjugate_32fc.h
++volk_8u_conv_k7_r2puppet_8u.h
++volk_8u_x2_encodeframepolar_8u.h
++volk_8u_x3_encodepolar_8u_x2.h
++volk_8u_x3_encodepolarpuppet_8u.h
++volk_8u_x4_conv_k7_r2_8u.h
++volk_avx_intrinsics.h
++volk_common.h
++volk_complex.h
++volk_config_fixed.h
++volk_cpu.h
++volk_malloc.h
++volk_neon_intrinsics.h
++volk_prefs.h
++volk_sse3_intrinsics.h
++volk_sse_intrinsics.h
++volk_typedefs.h
++
++
++

Libraries  1 


++
++libvolk.so.1.3
++
++
++


++

Test Info


Library Name: libvolk1-dev
Version #1: 1.3-3
Version #2: 1.4-1
Arch: x86_64
Subject: Source Compatibility

Test Results


Total Header Files: 135
Total Libraries: 1
Total Symbols / Types: 660 / 235
Compatibility: 99.1%

Problem Summary


Added Symbols: 46
Removed Symbols (High): 5
Problems with Data Types: High 0, Medium 0, Low 1
Problems with Symbols: High 1, Medium 0, Low 0
Problems with Constants: Low 1
Other Changes in Constants: 2

Added Symbols  46 


++volk.h
++volk_32f_64f_add_64f [data]
++volk_32f_64f_add_64f_a [data]
++volk_32f_64f_add_64f_get_func_desc ( )
++volk_32f_64f_add_64f_manual ( double* cVector, float const* aVector, double const* bVector, unsigned int num_points, char const* impl_name )
++volk_32f_64f_add_64f_u [data]
++volk_32f_64f_multiply_64f [data]
++volk_32f_64f_multiply_64f_a [data]
++volk_32f_64f_multiply_64f_get_func_desc ( )
++volk_32f_64f_multiply_64f_manual ( double* cVector, float const* aVector, double const* bVector, unsigned int num_points, char const* impl_name )
++volk_32f_64f_multiply_64f_u [data]
++volk_32f_s32f_mod_rangepuppet_32f [data]
++volk_32f_s32f_mod_rangepuppet_32f_a [data]
++volk_32f_s32f_mod_rangepuppet_32f_get_func_desc ( )
++volk_32f_s32f_mod_rangepuppet_32f_manual ( float* output, float const* input, float bound, unsigned int num_points, char const* impl_name )
++volk_32f_s32f_mod_rangepuppet_32f_u [data]
++volk_32f_s32f_s32f_mod_range_32f [data]
++volk_32f_s32f_s32f_mod_range_32f_a [data]
++volk_32f_s32f_s32f_mod_range_32f_get_func_desc ( )
++volk_32f_s32f_s32f_mod_range_32f_manual ( float* outputVector, float const* inputVector, float const lower_bound, float const upper_bound, unsigned int num_points, char const* impl_name )
++volk_32f_s32f_s32f_mod_range_32f_u [data]
++volk_32fc_32f_add_32fc [data]
++volk_32fc_32f_add_32fc_a [data]
++volk_32fc_32f_add_32fc_get_func_desc ( )
++volk_32fc_32f_add_32fc_manual ( lv_32fc_t* cVector, lv_32fc_t const* aVector, float const* bVector, unsigned int num_points, char const* impl_name )
++volk_32fc_32f_add_32fc_u [data]
++volk_32fc_x2_add_32fc [data]
++volk_32fc_x2_add_32fc_a [data]
++volk_32fc_x2_add_32fc_get_func_desc ( )
++volk_32fc_x2_add_32fc_manual ( lv_32fc_t* cVector, lv_32fc_t const* aVector, lv_32fc_t const* bVector, unsigned int num_points, char const* impl_name )
++volk_32fc_x2_add_32fc_u [data]
++volk_32u_reverse_32u [data]
++volk_32u_reverse_32u_a [data]
++volk_32u_reverse_32u_get_func_desc ( )
++volk_32u_reverse_32u_manual ( uint32_t* out, uint32_t const* in, unsigned int num_points, char const* impl_name )
++volk_32u_reverse_32u_u [data]
++volk_64f_x2_add_64f [data]
++volk_64f_x2_add_64f_a [data]
++volk_64f_x2_add_64f_get_func_desc ( )
++volk_64f_x2_add_64f_manual ( double* cVector, double const* aVector, double const* bVector, unsigned int num_points, char const* impl_name )
++volk_64f_x2_add_64f_u [data]
++volk_64f_x2_multiply_64f [data]
++volk_64f_x2_multiply_64f_a [data]
++volk_64f_x2_multiply_64f_get_func_desc ( )
++volk_64f_x2_multiply_64f_manual ( double* cVector, double const* aVector, double const* bVector, unsigned int num_points, char const* impl_name )
++volk_64f_x2_multiply_64f_u [data]
++
++volk_32u_reverse_32u.h
++ ++BitReverseTable256 [data] ++
++ ++ ++
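BitReverseTable256 is a 256-entry byte-reversal lookup table added alongside the new volk_32u_reverse_32u kernel. The helper below is a hypothetical illustration (not code from VOLK) of how such a table reverses the bits of one 32-bit word, which is the per-element operation the kernel performs:

    #include <stdint.h>

    /* Hypothetical helper: reverse all 32 bits of v using a 256-entry
       byte-reversal table such as BitReverseTable256. */
    static uint32_t reverse_one_word(const unsigned char table[256], uint32_t v)
    {
        return ((uint32_t)table[v & 0xff] << 24) |
               ((uint32_t)table[(v >> 8) & 0xff] << 16) |
               ((uint32_t)table[(v >> 16) & 0xff] << 8) |
               ((uint32_t)table[(v >> 24) & 0xff]);
    }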
++

Removed Symbols  5 


++constants.h
++volk_available_machines ( )
++volk_c_compiler ( )
++volk_compiler_flags ( )
++volk_prefix ( )
++volk_version ( )
++
++ ++

Problems with Symbols, High Severity  1 


++volk.h
++ ++[+] volk_32f_8u_polarbutterfly_32f_manual ( float* llrs, unsigned char* u, int const frame_size, int const frame_exp, int const stage, int const u_num, int const row, char const* impl_name )  1  ++
++ ++
++ ++

Problems with Data Types, Low Severity  1 


++volk_typedefs.h
++ ++[+] typedef p_32f_8u_polarbutterfly_32f  1  ++
++ ++ ++
++ ++

Problems with Constants, Low Severity  1 


++volk_32f_log2_32f.h
++ ++[+] LOG_POLY_DEGREE ++
++ ++ ++
++ ++

Other Changes in Constants  2 


++volk_common.h
++ ++[+] __VOLK_ASM ++
++ ++ ++ ++[+] __VOLK_VOLATILE ++
++ ++ ++
++

Header Files  135 


++
++constants.h
++saturation_arithmetic.h
++volk.h
++volk_16i_32fc_dot_prod_32fc.h
++volk_16i_branch_4_state_8.h
++volk_16i_convert_8i.h
++volk_16i_max_star_16i.h
++volk_16i_max_star_horizontal_16i.h
++volk_16i_permute_and_scalar_add.h
++volk_16i_s32f_convert_32f.h
++volk_16i_x4_quad_max_star_16i.h
++volk_16i_x5_add_quad_16i_x4.h
++volk_16ic_convert_32fc.h
++volk_16ic_deinterleave_16i_x2.h
++volk_16ic_deinterleave_real_16i.h
++volk_16ic_deinterleave_real_8i.h
++volk_16ic_magnitude_16i.h
++volk_16ic_s32f_deinterleave_32f_x2.h
++volk_16ic_s32f_deinterleave_real_32f.h
++volk_16ic_s32f_magnitude_32f.h
++volk_16ic_x2_dot_prod_16ic.h
++volk_16ic_x2_multiply_16ic.h
++volk_16u_byteswap.h
++volk_16u_byteswappuppet_16u.h
++volk_32f_8u_polarbutterfly_32f.h
++volk_32f_8u_polarbutterflypuppet_32f.h
++volk_32f_accumulator_s32f.h
++volk_32f_acos_32f.h
++volk_32f_asin_32f.h
++volk_32f_atan_32f.h
++volk_32f_binary_slicer_32i.h
++volk_32f_binary_slicer_8i.h
++volk_32f_convert_64f.h
++volk_32f_cos_32f.h
++volk_32f_expfast_32f.h
++volk_32f_index_max_16u.h
++volk_32f_index_max_32u.h
++volk_32f_invsqrt_32f.h
++volk_32f_log2_32f.h
++volk_32f_null_32f.h
++volk_32f_s32f_32f_fm_detect_32f.h
++volk_32f_s32f_calc_spectral_noise_floor_32f.h
++volk_32f_s32f_convert_16i.h
++volk_32f_s32f_convert_32i.h
++volk_32f_s32f_convert_8i.h
++volk_32f_s32f_multiply_32f.h
++volk_32f_s32f_normalize.h
++volk_32f_s32f_power_32f.h
++volk_32f_s32f_stddev_32f.h
++volk_32f_sin_32f.h
++volk_32f_sqrt_32f.h
++volk_32f_stddev_and_mean_32f_x2.h
++volk_32f_tan_32f.h
++volk_32f_tanh_32f.h
++volk_32f_x2_add_32f.h
++volk_32f_x2_divide_32f.h
++volk_32f_x2_dot_prod_16i.h
++volk_32f_x2_dot_prod_32f.h
++volk_32f_x2_fm_detectpuppet_32f.h
++volk_32f_x2_interleave_32fc.h
++volk_32f_x2_max_32f.h
++volk_32f_x2_min_32f.h
++volk_32f_x2_multiply_32f.h
++volk_32f_x2_pow_32f.h
++volk_32f_x2_s32f_interleave_16ic.h
++volk_32f_x2_subtract_32f.h
++volk_32f_x3_sum_of_poly_32f.h
++volk_32fc_32f_dot_prod_32fc.h
++volk_32fc_32f_multiply_32fc.h
++volk_32fc_conjugate_32fc.h
++volk_32fc_convert_16ic.h
++volk_32fc_deinterleave_32f_x2.h
++volk_32fc_deinterleave_64f_x2.h
++volk_32fc_deinterleave_imag_32f.h
++volk_32fc_deinterleave_real_32f.h
++volk_32fc_deinterleave_real_64f.h
++volk_32fc_index_max_16u.h
++volk_32fc_index_max_32u.h
++volk_32fc_magnitude_32f.h
++volk_32fc_magnitude_squared_32f.h
++volk_32fc_s32f_atan2_32f.h
++volk_32fc_s32f_deinterleave_real_16i.h
++volk_32fc_s32f_magnitude_16i.h
++volk_32fc_s32f_power_32fc.h
++volk_32fc_s32f_power_spectrum_32f.h
++volk_32fc_s32f_x2_power_spectral_density_32f.h
++volk_32fc_s32fc_multiply_32fc.h
++volk_32fc_s32fc_rotatorpuppet_32fc.h
++volk_32fc_s32fc_x2_rotator_32fc.h
++volk_32fc_x2_conjugate_dot_prod_32fc.h
++volk_32fc_x2_divide_32fc.h
++volk_32fc_x2_dot_prod_32fc.h
++volk_32fc_x2_multiply_32fc.h
++volk_32fc_x2_multiply_conjugate_32fc.h
++volk_32fc_x2_s32f_square_dist_scalar_mult_32f.h
++volk_32fc_x2_square_dist_32f.h
++volk_32i_s32f_convert_32f.h
++volk_32i_x2_and_32i.h
++volk_32i_x2_or_32i.h
++volk_32u_byteswap.h
++volk_32u_byteswappuppet_32u.h
++volk_32u_popcnt.h
++volk_32u_popcntpuppet_32u.h
++volk_64f_convert_32f.h
++volk_64f_x2_max_64f.h
++volk_64f_x2_min_64f.h
++volk_64u_byteswap.h
++volk_64u_byteswappuppet_64u.h
++volk_64u_popcnt.h
++volk_64u_popcntpuppet_64u.h
++volk_8i_convert_16i.h
++volk_8i_s32f_convert_32f.h
++volk_8ic_deinterleave_16i_x2.h
++volk_8ic_deinterleave_real_16i.h
++volk_8ic_deinterleave_real_8i.h
++volk_8ic_s32f_deinterleave_32f_x2.h
++volk_8ic_s32f_deinterleave_real_32f.h
++volk_8ic_x2_multiply_conjugate_16ic.h
++volk_8ic_x2_s32f_multiply_conjugate_32fc.h
++volk_8u_conv_k7_r2puppet_8u.h
++volk_8u_x2_encodeframepolar_8u.h
++volk_8u_x3_encodepolar_8u_x2.h
++volk_8u_x3_encodepolarpuppet_8u.h
++volk_8u_x4_conv_k7_r2_8u.h
++volk_avx_intrinsics.h
++volk_common.h
++volk_complex.h
++volk_config_fixed.h
++volk_cpu.h
++volk_malloc.h
++volk_neon_intrinsics.h
++volk_prefs.h
++volk_sse3_intrinsics.h
++volk_sse_intrinsics.h
++volk_typedefs.h
++
++
++

Libraries  1 


++
++libvolk.so.1.3
++
++
++



++ ++
diff --cc debian/1.4_to_2.0_compat_report.html
index 0000000,0000000..0bb6275
new file mode 100644
--- /dev/null
+++ b/debian/1.4_to_2.0_compat_report.html
@@@ -1,0 -1,0 +1,1855 @@@
++volk: 1.4 to 2.0 compatibility report

API compatibility report for the volk library between 1.4 and 2.0 versions on x86_64

++ Binary Compatibility | Source Compatibility

Test Info


Library Name: volk
Version #1: 1.4
Version #2: 2.0
Arch: x86_64
GCC Version: 8
Subject: Binary Compatibility

Test Results


Total Header Files: 143
Total Libraries: 1
Total Symbols / Types: 660 / 244
Compatibility: 99.8%

Problem Summary


Added Symbols: 0
Removed Symbols (High): 0
Problems with Data Types: High 0, Medium 2, Low 3
Problems with Symbols: High 0, Medium 2, Low 0
Problems with Constants: Low 18
Other Changes in Constants: 5

Problems with Data Types, Medium Severity  2 


++volk_cpu.h
++ ++[+] struct VOLK_CPU  2  ++
++ ++ ++
++ ++

Problems with Symbols, Medium Severity  2 


++volk_cpu.h
++ ++[+] volk_cpu [data]  1  ++
++ ++
++volk_prefs.h, libvolk.so.1.4
++ ++[+] volk_get_config_path ( char* p1 )  1  ++
++ ++
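For context (an added note, not from the report): in the 1.4 API shown above, volk_get_config_path fills a caller-supplied buffer with the location of the volk_config preferences file, and the flagged problem indicates 2.0 changes this symbol, so code built against the old declaration needs a rebuild. A minimal sketch against the 1.4 signature, with an assumed buffer size:

    #include <volk/volk_prefs.h>
    #include <stdio.h>

    int main(void)
    {
        char path[512];             /* assumed large enough for the path */
        volk_get_config_path(path); /* VOLK 1.4 signature: void (char*) */
        printf("volk_config expected at: %s\n", path);
        return 0;
    }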
++ ++

Problems with Data Types, Low Severity  3 


++volk_cpu.h
++ ++[+] struct VOLK_CPU  3  ++
++ ++ ++
++ ++

Problems with Constants, Low Severity  18 


++volk_config_fixed.h
++ ++[+] LV_32 ++
++ ++ ++ ++[+] LV_3DNOW ++
++ ++ ++ ++[+] LV_64 ++
++ ++ ++ ++[+] LV_ABM ++
++ ++ ++ ++[+] LV_AVX ++
++ ++ ++ ++[+] LV_AVX2 ++
++ ++ ++ ++[+] LV_FMA ++
++ ++ ++ ++[+] LV_MMX ++
++ ++ ++ ++[+] LV_NORC ++
++ ++ ++ ++[+] LV_ORC ++
++ ++ ++ ++[+] LV_POPCOUNT ++
++ ++ ++ ++[+] LV_SSE ++
++ ++ ++ ++[+] LV_SSE2 ++
++ ++ ++ ++[+] LV_SSE3 ++
++ ++ ++ ++[+] LV_SSE4_1 ++
++ ++ ++ ++[+] LV_SSE4_2 ++
++ ++ ++ ++[+] LV_SSE4_A ++
++ ++ ++ ++[+] LV_SSSE3 ++
++ ++ ++
++ ++

Other Changes in Constants  5 


++volk_32f_log2_32f.h
++ ++[+] LOG_POLY_DEGREE ++
++ ++ ++
++volk_config_fixed.h
++ ++[+] LV_AVX512CD ++
++ ++ ++ ++[+] LV_AVX512F ++
++ ++ ++ ++[+] LV_NEONV7 ++
++ ++ ++ ++[+] LV_NEONV8 ++
++ ++ ++
++

Header Files  143 


++
++saturation_arithmetic.h
++volk.h
++volk_16i_32fc_dot_prod_32fc.h
++volk_16i_branch_4_state_8.h
++volk_16i_convert_8i.h
++volk_16i_max_star_16i.h
++volk_16i_max_star_horizontal_16i.h
++volk_16i_permute_and_scalar_add.h
++volk_16i_s32f_convert_32f.h
++volk_16i_x4_quad_max_star_16i.h
++volk_16i_x5_add_quad_16i_x4.h
++volk_16ic_convert_32fc.h
++volk_16ic_deinterleave_16i_x2.h
++volk_16ic_deinterleave_real_16i.h
++volk_16ic_deinterleave_real_8i.h
++volk_16ic_magnitude_16i.h
++volk_16ic_s32f_deinterleave_32f_x2.h
++volk_16ic_s32f_deinterleave_real_32f.h
++volk_16ic_s32f_magnitude_32f.h
++volk_16ic_x2_dot_prod_16ic.h
++volk_16ic_x2_multiply_16ic.h
++volk_16u_byteswap.h
++volk_16u_byteswappuppet_16u.h
++volk_32f_64f_add_64f.h
++volk_32f_64f_multiply_64f.h
++volk_32f_8u_polarbutterfly_32f.h
++volk_32f_8u_polarbutterflypuppet_32f.h
++volk_32f_accumulator_s32f.h
++volk_32f_acos_32f.h
++volk_32f_asin_32f.h
++volk_32f_atan_32f.h
++volk_32f_binary_slicer_32i.h
++volk_32f_binary_slicer_8i.h
++volk_32f_convert_64f.h
++volk_32f_cos_32f.h
++volk_32f_expfast_32f.h
++volk_32f_index_max_16u.h
++volk_32f_index_max_32u.h
++volk_32f_invsqrt_32f.h
++volk_32f_log2_32f.h
++volk_32f_null_32f.h
++volk_32f_s32f_32f_fm_detect_32f.h
++volk_32f_s32f_calc_spectral_noise_floor_32f.h
++volk_32f_s32f_convert_16i.h
++volk_32f_s32f_convert_32i.h
++volk_32f_s32f_convert_8i.h
++volk_32f_s32f_mod_rangepuppet_32f.h
++volk_32f_s32f_multiply_32f.h
++volk_32f_s32f_normalize.h
++volk_32f_s32f_power_32f.h
++volk_32f_s32f_s32f_mod_range_32f.h
++volk_32f_s32f_stddev_32f.h
++volk_32f_sin_32f.h
++volk_32f_sqrt_32f.h
++volk_32f_stddev_and_mean_32f_x2.h
++volk_32f_tan_32f.h
++volk_32f_tanh_32f.h
++volk_32f_x2_add_32f.h
++volk_32f_x2_divide_32f.h
++volk_32f_x2_dot_prod_16i.h
++volk_32f_x2_dot_prod_32f.h
++volk_32f_x2_fm_detectpuppet_32f.h
++volk_32f_x2_interleave_32fc.h
++volk_32f_x2_max_32f.h
++volk_32f_x2_min_32f.h
++volk_32f_x2_multiply_32f.h
++volk_32f_x2_pow_32f.h
++volk_32f_x2_s32f_interleave_16ic.h
++volk_32f_x2_subtract_32f.h
++volk_32f_x3_sum_of_poly_32f.h
++volk_32fc_32f_add_32fc.h
++volk_32fc_32f_dot_prod_32fc.h
++volk_32fc_32f_multiply_32fc.h
++volk_32fc_conjugate_32fc.h
++volk_32fc_convert_16ic.h
++volk_32fc_deinterleave_32f_x2.h
++volk_32fc_deinterleave_64f_x2.h
++volk_32fc_deinterleave_imag_32f.h
++volk_32fc_deinterleave_real_32f.h
++volk_32fc_deinterleave_real_64f.h
++volk_32fc_index_max_16u.h
++volk_32fc_index_max_32u.h
++volk_32fc_magnitude_32f.h
++volk_32fc_magnitude_squared_32f.h
++volk_32fc_s32f_atan2_32f.h
++volk_32fc_s32f_deinterleave_real_16i.h
++volk_32fc_s32f_magnitude_16i.h
++volk_32fc_s32f_power_32fc.h
++volk_32fc_s32f_power_spectrum_32f.h
++volk_32fc_s32f_x2_power_spectral_density_32f.h
++volk_32fc_s32fc_multiply_32fc.h
++volk_32fc_s32fc_rotatorpuppet_32fc.h
++volk_32fc_s32fc_x2_rotator_32fc.h
++volk_32fc_x2_add_32fc.h
++volk_32fc_x2_conjugate_dot_prod_32fc.h
++volk_32fc_x2_divide_32fc.h
++volk_32fc_x2_dot_prod_32fc.h
++volk_32fc_x2_multiply_32fc.h
++volk_32fc_x2_multiply_conjugate_32fc.h
++volk_32fc_x2_s32f_square_dist_scalar_mult_32f.h
++volk_32fc_x2_square_dist_32f.h
++volk_32i_s32f_convert_32f.h
++volk_32i_x2_and_32i.h
++volk_32i_x2_or_32i.h
++volk_32u_byteswap.h
++volk_32u_byteswappuppet_32u.h
++volk_32u_popcnt.h
++volk_32u_popcntpuppet_32u.h
++volk_32u_reverse_32u.h
++volk_64f_convert_32f.h
++volk_64f_x2_add_64f.h
++volk_64f_x2_max_64f.h
++volk_64f_x2_min_64f.h
++volk_64f_x2_multiply_64f.h
++volk_64u_byteswap.h
++volk_64u_byteswappuppet_64u.h
++volk_64u_popcnt.h
++volk_64u_popcntpuppet_64u.h
++volk_8i_convert_16i.h
++volk_8i_s32f_convert_32f.h
++volk_8ic_deinterleave_16i_x2.h
++volk_8ic_deinterleave_real_16i.h
++volk_8ic_deinterleave_real_8i.h
++volk_8ic_s32f_deinterleave_32f_x2.h
++volk_8ic_s32f_deinterleave_real_32f.h
++volk_8ic_x2_multiply_conjugate_16ic.h
++volk_8ic_x2_s32f_multiply_conjugate_32fc.h
++volk_8u_conv_k7_r2puppet_8u.h
++volk_8u_x2_encodeframepolar_8u.h
++volk_8u_x3_encodepolar_8u_x2.h
++volk_8u_x3_encodepolarpuppet_8u.h
++volk_8u_x4_conv_k7_r2_8u.h
++volk_avx_intrinsics.h
++volk_common.h
++volk_complex.h
++volk_config_fixed.h
++volk_cpu.h
++volk_malloc.h
++volk_neon_intrinsics.h
++volk_prefs.h
++volk_sse3_intrinsics.h
++volk_sse_intrinsics.h
++volk_typedefs.h
++
++
++

Libraries  1 


++
++libvolk.so.1.4
++
++
++


++

Test Info


Library Name: volk
Version #1: 1.4
Version #2: 2.0
Arch: x86_64
Subject: Source Compatibility

Test Results


Total Header Files: 143
Total Libraries: 1
Total Symbols / Types: 705 / 246
Compatibility: 99.9%

Problem Summary


Added Symbols: 5
Removed Symbols (High): 0
Problems with Data Types: High 0, Medium 0, Low 4
Problems with Symbols: High 0, Medium 1, Low 0
Problems with Constants: Low 18
Other Changes in Constants: 5

Added Symbols  5 


++constants.h
++volk_available_machines ( )
++volk_c_compiler ( )
++volk_compiler_flags ( )
++volk_prefix ( )
++volk_version ( )
++
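These constants.h helpers report how the installed VOLK was built, and the 1.4-to-2.0 source view lists them as newly visible symbols. A small usage sketch (added for illustration, not part of the report):

    #include <volk/constants.h>
    #include <stdio.h>

    int main(void)
    {
        printf("VOLK version:       %s\n", volk_version());
        printf("C compiler:         %s\n", volk_c_compiler());
        printf("Compiler flags:     %s\n", volk_compiler_flags());
        printf("Install prefix:     %s\n", volk_prefix());
        printf("Available machines: %s\n", volk_available_machines());
        return 0;
    }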
++ ++

Problems with Symbols, Medium Severity  1 


++volk_prefs.h
++ ++[+] volk_get_config_path ( char* p1 )  1  ++
++ ++
++ ++

Problems with Data Types, Low Severity  4 


++volk_cpu.h
++ ++[+] struct VOLK_CPU  4  ++
++ ++ ++
++ ++

Problems with Constants, Low Severity  18 


++volk_config_fixed.h
++ ++[+] LV_32 ++
++ ++ ++ ++[+] LV_3DNOW ++
++ ++ ++ ++[+] LV_64 ++
++ ++ ++ ++[+] LV_ABM ++
++ ++ ++ ++[+] LV_AVX ++
++ ++ ++ ++[+] LV_AVX2 ++
++ ++ ++ ++[+] LV_FMA ++
++ ++ ++ ++[+] LV_MMX ++
++ ++ ++ ++[+] LV_NORC ++
++ ++ ++ ++[+] LV_ORC ++
++ ++ ++ ++[+] LV_POPCOUNT ++
++ ++ ++ ++[+] LV_SSE ++
++ ++ ++ ++[+] LV_SSE2 ++
++ ++ ++ ++[+] LV_SSE3 ++
++ ++ ++ ++[+] LV_SSE4_1 ++
++ ++ ++ ++[+] LV_SSE4_2 ++
++ ++ ++ ++[+] LV_SSE4_A ++
++ ++ ++ ++[+] LV_SSSE3 ++
++ ++ ++
++ ++

Other Changes in Constants  5 


++volk_32f_log2_32f.h
++ ++[+] LOG_POLY_DEGREE ++
++ ++ ++
++volk_config_fixed.h
++ ++[+] LV_AVX512CD ++
++ ++ ++ ++[+] LV_AVX512F ++
++ ++ ++ ++[+] LV_NEONV7 ++
++ ++ ++ ++[+] LV_NEONV8 ++
++ ++ ++
++

Header Files  143 


++
++saturation_arithmetic.h
++volk.h
++volk_16i_32fc_dot_prod_32fc.h
++volk_16i_branch_4_state_8.h
++volk_16i_convert_8i.h
++volk_16i_max_star_16i.h
++volk_16i_max_star_horizontal_16i.h
++volk_16i_permute_and_scalar_add.h
++volk_16i_s32f_convert_32f.h
++volk_16i_x4_quad_max_star_16i.h
++volk_16i_x5_add_quad_16i_x4.h
++volk_16ic_convert_32fc.h
++volk_16ic_deinterleave_16i_x2.h
++volk_16ic_deinterleave_real_16i.h
++volk_16ic_deinterleave_real_8i.h
++volk_16ic_magnitude_16i.h
++volk_16ic_s32f_deinterleave_32f_x2.h
++volk_16ic_s32f_deinterleave_real_32f.h
++volk_16ic_s32f_magnitude_32f.h
++volk_16ic_x2_dot_prod_16ic.h
++volk_16ic_x2_multiply_16ic.h
++volk_16u_byteswap.h
++volk_16u_byteswappuppet_16u.h
++volk_32f_64f_add_64f.h
++volk_32f_64f_multiply_64f.h
++volk_32f_8u_polarbutterfly_32f.h
++volk_32f_8u_polarbutterflypuppet_32f.h
++volk_32f_accumulator_s32f.h
++volk_32f_acos_32f.h
++volk_32f_asin_32f.h
++volk_32f_atan_32f.h
++volk_32f_binary_slicer_32i.h
++volk_32f_binary_slicer_8i.h
++volk_32f_convert_64f.h
++volk_32f_cos_32f.h
++volk_32f_expfast_32f.h
++volk_32f_index_max_16u.h
++volk_32f_index_max_32u.h
++volk_32f_invsqrt_32f.h
++volk_32f_log2_32f.h
++volk_32f_null_32f.h
++volk_32f_s32f_32f_fm_detect_32f.h
++volk_32f_s32f_calc_spectral_noise_floor_32f.h
++volk_32f_s32f_convert_16i.h
++volk_32f_s32f_convert_32i.h
++volk_32f_s32f_convert_8i.h
++volk_32f_s32f_mod_rangepuppet_32f.h
++volk_32f_s32f_multiply_32f.h
++volk_32f_s32f_normalize.h
++volk_32f_s32f_power_32f.h
++volk_32f_s32f_s32f_mod_range_32f.h
++volk_32f_s32f_stddev_32f.h
++volk_32f_sin_32f.h
++volk_32f_sqrt_32f.h
++volk_32f_stddev_and_mean_32f_x2.h
++volk_32f_tan_32f.h
++volk_32f_tanh_32f.h
++volk_32f_x2_add_32f.h
++volk_32f_x2_divide_32f.h
++volk_32f_x2_dot_prod_16i.h
++volk_32f_x2_dot_prod_32f.h
++volk_32f_x2_fm_detectpuppet_32f.h
++volk_32f_x2_interleave_32fc.h
++volk_32f_x2_max_32f.h
++volk_32f_x2_min_32f.h
++volk_32f_x2_multiply_32f.h
++volk_32f_x2_pow_32f.h
++volk_32f_x2_s32f_interleave_16ic.h
++volk_32f_x2_subtract_32f.h
++volk_32f_x3_sum_of_poly_32f.h
++volk_32fc_32f_add_32fc.h
++volk_32fc_32f_dot_prod_32fc.h
++volk_32fc_32f_multiply_32fc.h
++volk_32fc_conjugate_32fc.h
++volk_32fc_convert_16ic.h
++volk_32fc_deinterleave_32f_x2.h
++volk_32fc_deinterleave_64f_x2.h
++volk_32fc_deinterleave_imag_32f.h
++volk_32fc_deinterleave_real_32f.h
++volk_32fc_deinterleave_real_64f.h
++volk_32fc_index_max_16u.h
++volk_32fc_index_max_32u.h
++volk_32fc_magnitude_32f.h
++volk_32fc_magnitude_squared_32f.h
++volk_32fc_s32f_atan2_32f.h
++volk_32fc_s32f_deinterleave_real_16i.h
++volk_32fc_s32f_magnitude_16i.h
++volk_32fc_s32f_power_32fc.h
++volk_32fc_s32f_power_spectrum_32f.h
++volk_32fc_s32f_x2_power_spectral_density_32f.h
++volk_32fc_s32fc_multiply_32fc.h
++volk_32fc_s32fc_rotatorpuppet_32fc.h
++volk_32fc_s32fc_x2_rotator_32fc.h
++volk_32fc_x2_add_32fc.h
++volk_32fc_x2_conjugate_dot_prod_32fc.h
++volk_32fc_x2_divide_32fc.h
++volk_32fc_x2_dot_prod_32fc.h
++volk_32fc_x2_multiply_32fc.h
++volk_32fc_x2_multiply_conjugate_32fc.h
++volk_32fc_x2_s32f_square_dist_scalar_mult_32f.h
++volk_32fc_x2_square_dist_32f.h
++volk_32i_s32f_convert_32f.h
++volk_32i_x2_and_32i.h
++volk_32i_x2_or_32i.h
++volk_32u_byteswap.h
++volk_32u_byteswappuppet_32u.h
++volk_32u_popcnt.h
++volk_32u_popcntpuppet_32u.h
++volk_32u_reverse_32u.h
++volk_64f_convert_32f.h
++volk_64f_x2_add_64f.h
++volk_64f_x2_max_64f.h
++volk_64f_x2_min_64f.h
++volk_64f_x2_multiply_64f.h
++volk_64u_byteswap.h
++volk_64u_byteswappuppet_64u.h
++volk_64u_popcnt.h
++volk_64u_popcntpuppet_64u.h
++volk_8i_convert_16i.h
++volk_8i_s32f_convert_32f.h
++volk_8ic_deinterleave_16i_x2.h
++volk_8ic_deinterleave_real_16i.h
++volk_8ic_deinterleave_real_8i.h
++volk_8ic_s32f_deinterleave_32f_x2.h
++volk_8ic_s32f_deinterleave_real_32f.h
++volk_8ic_x2_multiply_conjugate_16ic.h
++volk_8ic_x2_s32f_multiply_conjugate_32fc.h
++volk_8u_conv_k7_r2puppet_8u.h
++volk_8u_x2_encodeframepolar_8u.h
++volk_8u_x3_encodepolar_8u_x2.h
++volk_8u_x3_encodepolarpuppet_8u.h
++volk_8u_x4_conv_k7_r2_8u.h
++volk_avx_intrinsics.h
++volk_common.h
++volk_complex.h
++volk_config_fixed.h
++volk_cpu.h
++volk_malloc.h
++volk_neon_intrinsics.h
++volk_prefs.h
++volk_sse3_intrinsics.h
++volk_sse_intrinsics.h
++volk_typedefs.h
++
++
++

Libraries  1 


++
++libvolk.so.1.4
++
++
++



++ ++
diff --cc debian/2.2.0_to_2.2.1_compat_report.html
index 0000000,0000000..3c8e2f0
new file mode 100644
--- /dev/null
+++ b/debian/2.2.0_to_2.2.1_compat_report.html
@@@ -1,0 -1,0 +1,769 @@@
++volk: 2.2.0-3 to 2.2.1-1 compatibility report

API compatibility report for the volk library between 2.2.0-3 and 2.2.1-1 versions on x86_64

++ Binary Compatibility | Source Compatibility

Test Info


Library Name: volk
Version #1: 2.2.0-3
Version #2: 2.2.1-1
Arch: x86_64
GCC Version: 8
Subject: Binary Compatibility

Test Results


Total Header Files: 148
Total Libraries: 1
Total Symbols / Types: 670 / 246
Compatibility: 100%

Problem Summary


Added Symbols: 0
Removed Symbols (High): 0
Problems with Data Types: High 0, Medium 0, Low 0
Problems with Symbols: High 0, Medium 0, Low 0
Problems with Constants: Low 0

Header Files  148 


++
++constants.h
++saturation_arithmetic.h
++volk.h
++volk_16i_32fc_dot_prod_32fc.h
++volk_16i_branch_4_state_8.h
++volk_16i_convert_8i.h
++volk_16i_max_star_16i.h
++volk_16i_max_star_horizontal_16i.h
++volk_16i_permute_and_scalar_add.h
++volk_16i_s32f_convert_32f.h
++volk_16i_x4_quad_max_star_16i.h
++volk_16i_x5_add_quad_16i_x4.h
++volk_16ic_convert_32fc.h
++volk_16ic_deinterleave_16i_x2.h
++volk_16ic_deinterleave_real_16i.h
++volk_16ic_deinterleave_real_8i.h
++volk_16ic_magnitude_16i.h
++volk_16ic_s32f_deinterleave_32f_x2.h
++volk_16ic_s32f_deinterleave_real_32f.h
++volk_16ic_s32f_magnitude_32f.h
++volk_16ic_x2_dot_prod_16ic.h
++volk_16ic_x2_multiply_16ic.h
++volk_16u_byteswap.h
++volk_16u_byteswappuppet_16u.h
++volk_32f_64f_add_64f.h
++volk_32f_64f_multiply_64f.h
++volk_32f_8u_polarbutterfly_32f.h
++volk_32f_8u_polarbutterflypuppet_32f.h
++volk_32f_accumulator_s32f.h
++volk_32f_acos_32f.h
++volk_32f_asin_32f.h
++volk_32f_atan_32f.h
++volk_32f_binary_slicer_32i.h
++volk_32f_binary_slicer_8i.h
++volk_32f_convert_64f.h
++volk_32f_cos_32f.h
++volk_32f_expfast_32f.h
++volk_32f_index_max_16u.h
++volk_32f_index_max_32u.h
++volk_32f_invsqrt_32f.h
++volk_32f_log2_32f.h
++volk_32f_null_32f.h
++volk_32f_s32f_32f_fm_detect_32f.h
++volk_32f_s32f_calc_spectral_noise_floor_32f.h
++volk_32f_s32f_convert_16i.h
++volk_32f_s32f_convert_32i.h
++volk_32f_s32f_convert_8i.h
++volk_32f_s32f_mod_rangepuppet_32f.h
++volk_32f_s32f_multiply_32f.h
++volk_32f_s32f_normalize.h
++volk_32f_s32f_power_32f.h
++volk_32f_s32f_s32f_mod_range_32f.h
++volk_32f_s32f_stddev_32f.h
++volk_32f_sin_32f.h
++volk_32f_sqrt_32f.h
++volk_32f_stddev_and_mean_32f_x2.h
++volk_32f_tan_32f.h
++volk_32f_tanh_32f.h
++volk_32f_x2_add_32f.h
++volk_32f_x2_divide_32f.h
++volk_32f_x2_dot_prod_16i.h
++volk_32f_x2_dot_prod_32f.h
++volk_32f_x2_fm_detectpuppet_32f.h
++volk_32f_x2_interleave_32fc.h
++volk_32f_x2_max_32f.h
++volk_32f_x2_min_32f.h
++volk_32f_x2_multiply_32f.h
++volk_32f_x2_pow_32f.h
++volk_32f_x2_s32f_interleave_16ic.h
++volk_32f_x2_subtract_32f.h
++volk_32f_x3_sum_of_poly_32f.h
++volk_32fc_32f_add_32fc.h
++volk_32fc_32f_dot_prod_32fc.h
++volk_32fc_32f_multiply_32fc.h
++volk_32fc_conjugate_32fc.h
++volk_32fc_convert_16ic.h
++volk_32fc_deinterleave_32f_x2.h
++volk_32fc_deinterleave_64f_x2.h
++volk_32fc_deinterleave_imag_32f.h
++volk_32fc_deinterleave_real_32f.h
++volk_32fc_deinterleave_real_64f.h
++volk_32fc_index_max_16u.h
++volk_32fc_index_max_32u.h
++volk_32fc_magnitude_32f.h
++volk_32fc_magnitude_squared_32f.h
++volk_32fc_s32f_atan2_32f.h
++volk_32fc_s32f_deinterleave_real_16i.h
++volk_32fc_s32f_magnitude_16i.h
++volk_32fc_s32f_power_32fc.h
++volk_32fc_s32f_power_spectrum_32f.h
++volk_32fc_s32f_x2_power_spectral_density_32f.h
++volk_32fc_s32fc_multiply_32fc.h
++volk_32fc_s32fc_rotatorpuppet_32fc.h
++volk_32fc_s32fc_x2_rotator_32fc.h
++volk_32fc_x2_add_32fc.h
++volk_32fc_x2_conjugate_dot_prod_32fc.h
++volk_32fc_x2_divide_32fc.h
++volk_32fc_x2_dot_prod_32fc.h
++volk_32fc_x2_multiply_32fc.h
++volk_32fc_x2_multiply_conjugate_32fc.h
++volk_32fc_x2_s32f_square_dist_scalar_mult_32f.h
++volk_32fc_x2_s32fc_multiply_conjugate_add_32fc.h
++volk_32fc_x2_square_dist_32f.h
++volk_32i_s32f_convert_32f.h
++volk_32i_x2_and_32i.h
++volk_32i_x2_or_32i.h
++volk_32u_byteswap.h
++volk_32u_byteswappuppet_32u.h
++volk_32u_popcnt.h
++volk_32u_popcntpuppet_32u.h
++volk_32u_reverse_32u.h
++volk_64f_convert_32f.h
++volk_64f_x2_add_64f.h
++volk_64f_x2_max_64f.h
++volk_64f_x2_min_64f.h
++volk_64f_x2_multiply_64f.h
++volk_64u_byteswap.h
++volk_64u_byteswappuppet_64u.h
++volk_64u_popcnt.h
++volk_64u_popcntpuppet_64u.h
++volk_8i_convert_16i.h
++volk_8i_s32f_convert_32f.h
++volk_8ic_deinterleave_16i_x2.h
++volk_8ic_deinterleave_real_16i.h
++volk_8ic_deinterleave_real_8i.h
++volk_8ic_s32f_deinterleave_32f_x2.h
++volk_8ic_s32f_deinterleave_real_32f.h
++volk_8ic_x2_multiply_conjugate_16ic.h
++volk_8ic_x2_s32f_multiply_conjugate_32fc.h
++volk_8u_conv_k7_r2puppet_8u.h
++volk_8u_x2_encodeframepolar_8u.h
++volk_8u_x3_encodepolar_8u_x2.h
++volk_8u_x3_encodepolarpuppet_8u.h
++volk_8u_x4_conv_k7_r2_8u.h
++volk_alloc.hh
++volk_avx2_intrinsics.h
++volk_avx_intrinsics.h
++volk_common.h
++volk_complex.h
++volk_config_fixed.h
++volk_cpu.h
++volk_malloc.h
++volk_neon_intrinsics.h
++volk_prefs.h
++volk_sse3_intrinsics.h
++volk_sse_intrinsics.h
++volk_typedefs.h
++volk_version.h
++
++
++

Libraries  1 


++
++libvolk.so.2.2
++
++
++


++

Test Info


Library Name: volk
Version #1: 2.2.0-3
Version #2: 2.2.1-1
Arch: x86_64
Subject: Source Compatibility

Test Results


Total Header Files: 148
Total Libraries: 1
Total Symbols / Types: 730 / 249
Compatibility: 100%

Problem Summary


Added Symbols: 0
Removed Symbols (High): 0
Problems with Data Types: High 0, Medium 0, Low 0
Problems with Symbols: High 0, Medium 0, Low 0
Problems with Constants: Low 0

Header Files  148 


++
++constants.h
++saturation_arithmetic.h
++volk.h
++volk_16i_32fc_dot_prod_32fc.h
++volk_16i_branch_4_state_8.h
++volk_16i_convert_8i.h
++volk_16i_max_star_16i.h
++volk_16i_max_star_horizontal_16i.h
++volk_16i_permute_and_scalar_add.h
++volk_16i_s32f_convert_32f.h
++volk_16i_x4_quad_max_star_16i.h
++volk_16i_x5_add_quad_16i_x4.h
++volk_16ic_convert_32fc.h
++volk_16ic_deinterleave_16i_x2.h
++volk_16ic_deinterleave_real_16i.h
++volk_16ic_deinterleave_real_8i.h
++volk_16ic_magnitude_16i.h
++volk_16ic_s32f_deinterleave_32f_x2.h
++volk_16ic_s32f_deinterleave_real_32f.h
++volk_16ic_s32f_magnitude_32f.h
++volk_16ic_x2_dot_prod_16ic.h
++volk_16ic_x2_multiply_16ic.h
++volk_16u_byteswap.h
++volk_16u_byteswappuppet_16u.h
++volk_32f_64f_add_64f.h
++volk_32f_64f_multiply_64f.h
++volk_32f_8u_polarbutterfly_32f.h
++volk_32f_8u_polarbutterflypuppet_32f.h
++volk_32f_accumulator_s32f.h
++volk_32f_acos_32f.h
++volk_32f_asin_32f.h
++volk_32f_atan_32f.h
++volk_32f_binary_slicer_32i.h
++volk_32f_binary_slicer_8i.h
++volk_32f_convert_64f.h
++volk_32f_cos_32f.h
++volk_32f_expfast_32f.h
++volk_32f_index_max_16u.h
++volk_32f_index_max_32u.h
++volk_32f_invsqrt_32f.h
++volk_32f_log2_32f.h
++volk_32f_null_32f.h
++volk_32f_s32f_32f_fm_detect_32f.h
++volk_32f_s32f_calc_spectral_noise_floor_32f.h
++volk_32f_s32f_convert_16i.h
++volk_32f_s32f_convert_32i.h
++volk_32f_s32f_convert_8i.h
++volk_32f_s32f_mod_rangepuppet_32f.h
++volk_32f_s32f_multiply_32f.h
++volk_32f_s32f_normalize.h
++volk_32f_s32f_power_32f.h
++volk_32f_s32f_s32f_mod_range_32f.h
++volk_32f_s32f_stddev_32f.h
++volk_32f_sin_32f.h
++volk_32f_sqrt_32f.h
++volk_32f_stddev_and_mean_32f_x2.h
++volk_32f_tan_32f.h
++volk_32f_tanh_32f.h
++volk_32f_x2_add_32f.h
++volk_32f_x2_divide_32f.h
++volk_32f_x2_dot_prod_16i.h
++volk_32f_x2_dot_prod_32f.h
++volk_32f_x2_fm_detectpuppet_32f.h
++volk_32f_x2_interleave_32fc.h
++volk_32f_x2_max_32f.h
++volk_32f_x2_min_32f.h
++volk_32f_x2_multiply_32f.h
++volk_32f_x2_pow_32f.h
++volk_32f_x2_s32f_interleave_16ic.h
++volk_32f_x2_subtract_32f.h
++volk_32f_x3_sum_of_poly_32f.h
++volk_32fc_32f_add_32fc.h
++volk_32fc_32f_dot_prod_32fc.h
++volk_32fc_32f_multiply_32fc.h
++volk_32fc_conjugate_32fc.h
++volk_32fc_convert_16ic.h
++volk_32fc_deinterleave_32f_x2.h
++volk_32fc_deinterleave_64f_x2.h
++volk_32fc_deinterleave_imag_32f.h
++volk_32fc_deinterleave_real_32f.h
++volk_32fc_deinterleave_real_64f.h
++volk_32fc_index_max_16u.h
++volk_32fc_index_max_32u.h
++volk_32fc_magnitude_32f.h
++volk_32fc_magnitude_squared_32f.h
++volk_32fc_s32f_atan2_32f.h
++volk_32fc_s32f_deinterleave_real_16i.h
++volk_32fc_s32f_magnitude_16i.h
++volk_32fc_s32f_power_32fc.h
++volk_32fc_s32f_power_spectrum_32f.h
++volk_32fc_s32f_x2_power_spectral_density_32f.h
++volk_32fc_s32fc_multiply_32fc.h
++volk_32fc_s32fc_rotatorpuppet_32fc.h
++volk_32fc_s32fc_x2_rotator_32fc.h
++volk_32fc_x2_add_32fc.h
++volk_32fc_x2_conjugate_dot_prod_32fc.h
++volk_32fc_x2_divide_32fc.h
++volk_32fc_x2_dot_prod_32fc.h
++volk_32fc_x2_multiply_32fc.h
++volk_32fc_x2_multiply_conjugate_32fc.h
++volk_32fc_x2_s32f_square_dist_scalar_mult_32f.h
++volk_32fc_x2_s32fc_multiply_conjugate_add_32fc.h
++volk_32fc_x2_square_dist_32f.h
++volk_32i_s32f_convert_32f.h
++volk_32i_x2_and_32i.h
++volk_32i_x2_or_32i.h
++volk_32u_byteswap.h
++volk_32u_byteswappuppet_32u.h
++volk_32u_popcnt.h
++volk_32u_popcntpuppet_32u.h
++volk_32u_reverse_32u.h
++volk_64f_convert_32f.h
++volk_64f_x2_add_64f.h
++volk_64f_x2_max_64f.h
++volk_64f_x2_min_64f.h
++volk_64f_x2_multiply_64f.h
++volk_64u_byteswap.h
++volk_64u_byteswappuppet_64u.h
++volk_64u_popcnt.h
++volk_64u_popcntpuppet_64u.h
++volk_8i_convert_16i.h
++volk_8i_s32f_convert_32f.h
++volk_8ic_deinterleave_16i_x2.h
++volk_8ic_deinterleave_real_16i.h
++volk_8ic_deinterleave_real_8i.h
++volk_8ic_s32f_deinterleave_32f_x2.h
++volk_8ic_s32f_deinterleave_real_32f.h
++volk_8ic_x2_multiply_conjugate_16ic.h
++volk_8ic_x2_s32f_multiply_conjugate_32fc.h
++volk_8u_conv_k7_r2puppet_8u.h
++volk_8u_x2_encodeframepolar_8u.h
++volk_8u_x3_encodepolar_8u_x2.h
++volk_8u_x3_encodepolarpuppet_8u.h
++volk_8u_x4_conv_k7_r2_8u.h
++volk_alloc.hh
++volk_avx2_intrinsics.h
++volk_avx_intrinsics.h
++volk_common.h
++volk_complex.h
++volk_config_fixed.h
++volk_cpu.h
++volk_malloc.h
++volk_neon_intrinsics.h
++volk_prefs.h
++volk_sse3_intrinsics.h
++volk_sse_intrinsics.h
++volk_typedefs.h
++volk_version.h
++
++
++

Libraries  1 


++
++libvolk.so.2.2
++
++
++



++ ++
++ ++ diff --cc debian/changelog index 0000000,0000000..2553b1e new file mode 100644 --- /dev/null +++ b/debian/changelog @@@ -1,0 -1,0 +1,434 @@@ ++volk (2.2.1-2) unstable; urgency=medium ++ ++ * update to v2.2.1-11-gfaf230e ++ * cmake: Remove the ORC from the VOLK public link interface ++ * Fix the broken index max kernels ++ ++ -- A. Maitland Bottoms Fri, 27 Mar 2020 21:48:10 -0400 ++ ++volk (2.2.1-1) unstable; urgency=high ++ ++ * New upstream bugfix release ++ reason for high urgency: ++ - Fix loop bound in AVX rotator (only one fixed in 2.2.0-3) ++ - Fix out-of-bounds read in AVX2 square dist kernel ++ - Fix length checks in AVX2 index max kernels ++ ++ -- A. Maitland Bottoms Mon, 24 Feb 2020 18:08:05 -0500 ++ ++volk (2.2.0-3) unstable; urgency=high ++ ++ * Update to v2.2.0-6-g5701f8f ++ reason for high urgency: ++ - Fix loop bound in AVX rotator ++ ++ -- A. Maitland Bottoms Sun, 23 Feb 2020 23:49:18 -0500 ++ ++volk (2.2.0-2) unstable; urgency=medium ++ ++ * Upload to unstable ++ ++ -- A. Maitland Bottoms Tue, 18 Feb 2020 17:56:58 -0500 ++ ++volk (2.2.0-1) experimental; urgency=medium ++ ++ * New upstream release ++ - Remove build dependency on python six ++ - Fixup VolkConfigVersion ++ - add volk_version.h ++ ++ -- A. Maitland Bottoms Sun, 16 Feb 2020 18:25:20 -0500 ++ ++volk (2.1.0-2) unstable; urgency=medium ++ ++ * Upload to unstable ++ ++ -- A. Maitland Bottoms Sun, 05 Jan 2020 23:17:57 -0500 ++ ++volk (2.1.0-1) experimental; urgency=medium ++ ++ * New upstream release ++ - The AVX FMA rotator bug is fixed ++ - VOLK offers `volk::vector<>` for C++ to follow RAII ++ - Use C++17 `std::filesystem` ++ - This enables VOLK to be built without Boost if available! ++ - lots of bugfixes ++ - more optimized kernels, especially more NEON versions ++ * Upload to experimental for new ABI library package libvolk2.1 ++ ++ -- A. Maitland Bottoms Sun, 22 Dec 2019 10:27:36 -0500 ++ ++volk (2.0.0-3) unstable; urgency=medium ++ ++ * update to v2.0.0-4-gf04a46f ++ ++ -- A. Maitland Bottoms Thu, 14 Nov 2019 22:47:23 -0500 ++ ++volk (2.0.0-2) unstable; urgency=medium ++ ++ * Upload to unstable ++ ++ -- A. Maitland Bottoms Mon, 12 Aug 2019 22:49:11 -0400 ++ ++volk (2.0.0-1) experimental; urgency=medium ++ ++ * New upstream release ++ ++ -- A. Maitland Bottoms Wed, 07 Aug 2019 23:31:20 -0400 ++ ++volk (1.4-4) unstable; urgency=medium ++ ++ * working volk_modtool with Python 3 ++ * build and install libvolk.a ++ ++ -- A. Maitland Bottoms Mon, 29 Oct 2018 01:32:05 -0400 ++ ++volk (1.4-3) unstable; urgency=medium ++ ++ * update to v1.4-9-g297fefd ++ Added an AVX protokernel for volk_32fc_x2_32f_square_dist_scalar_mult_32f ++ fixed a buffer over-read and over-write in ++ volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_avx ++ Fix 32u_reverse_32u for ARM ++ ++ -- A. Maitland Bottoms Sat, 12 May 2018 15:25:04 -0400 ++ ++volk (1.4-2) unstable; urgency=medium ++ ++ * Upload to unstable, needed by gnuradio (>= 3.7.12.0) ++ ++ -- A. Maitland Bottoms Tue, 03 Apr 2018 01:03:19 -0400 ++ ++volk (1.4-1) experimental; urgency=medium ++ ++ * New upstream release ++ upstream changelog http://libvolk.org/release-v14.html ++ ++ -- A. Maitland Bottoms Tue, 27 Mar 2018 22:57:42 -0400 ++ ++volk (1.3.1-1) unstable; urgency=medium ++ ++ * New upstream bugfix release ++ * Refresh all debian patches for use with git am ++ ++ -- A. 
Maitland Bottoms Tue, 27 Mar 2018 21:54:29 -0400 ++ ++volk (1.3-3) unstable; urgency=medium ++ ++ * update to v1.3-23-g0109b2e ++ * update debian/libvolk1-dev.abi.tar.gz.amd64 ++ * Add breaks/replaces gnuradio (<=3.7.2.1) (LP: #1614235) ++ ++ -- A. Maitland Bottoms Sun, 04 Feb 2018 13:12:21 -0500 ++ ++volk (1.3-2) unstable; urgency=medium ++ ++ * update to v1.3-16-g28b03a9 ++ apps: fix profile update reading end of lines ++ qa: lower tolerance for 32fc_mag to fix issue #96 ++ * include upstream master patch to sort input files ++ ++ -- A. Maitland Bottoms Sun, 27 Aug 2017 13:44:55 -0400 ++ ++volk (1.3-1) unstable; urgency=medium ++ ++ * New upstream release ++ * The index_max kernels were named with the wrong output datatype. To ++ fix this there are new kernels that return a 32u (int32_t) and the ++ existing kernels had their signatures changed to return 16u (int16_t). ++ * The output to stdout and stderr has been shuffled around. There is no ++ longer a message that prints what VOLK machine is being used and the ++ warning messages go to stderr rather than stdout. ++ * The 32fc_index_max kernels previously were only accurate to the SSE ++ register width (4 points). This was a pretty serious and long-lived ++ bug that's been fixed and the QA updated appropriately. ++ ++ -- A. Maitland Bottoms Sat, 02 Jul 2016 16:30:47 -0400 ++ ++volk (1.2.2-2) unstable; urgency=medium ++ ++ * update to v1.2.2-11-g78c8bc4 (to follow gnuradio maint branch) ++ ++ -- A. Maitland Bottoms Sun, 19 Jun 2016 14:44:15 -0400 ++ ++volk (1.2.2-1) unstable; urgency=medium ++ ++ * New upstream release ++ ++ -- A. Maitland Bottoms Fri, 08 Apr 2016 00:12:10 -0400 ++ ++volk (1.2.1-2) unstable; urgency=medium ++ ++ * Upstream patches: ++ Fix some CMake complaints ++ The fix for compilation with cmake 3.5 ++ ++ -- A. Maitland Bottoms Wed, 23 Mar 2016 17:47:54 -0400 ++ ++volk (1.2.1-1) unstable; urgency=medium ++ ++ * New upstream release ++ ++ -- A. Maitland Bottoms Sun, 07 Feb 2016 19:38:32 -0500 ++ ++volk (1.2-1) unstable; urgency=medium ++ ++ * New upstream release ++ ++ -- A. Maitland Bottoms Thu, 24 Dec 2015 20:28:13 -0500 ++ ++volk (1.1.1-5) experimental; urgency=medium ++ ++ * update to v1.1.1-22-gef53547 to support gnuradio 3.7.9 ++ ++ -- A. Maitland Bottoms Fri, 11 Dec 2015 13:12:55 -0500 ++ ++volk (1.1.1-4) unstable; urgency=medium ++ ++ * more lintian fixes ++ ++ -- A. Maitland Bottoms Wed, 25 Nov 2015 21:49:58 -0500 ++ ++volk (1.1.1-3) unstable; urgency=medium ++ ++ * Lintian fixes Pre-Depends ++ ++ -- A. Maitland Bottoms Thu, 19 Nov 2015 21:24:27 -0500 ++ ++volk (1.1.1-2) unstable; urgency=medium ++ ++ * Note that libvolk1-dev replaces files in gnuradio-dev versions <<3.7.8 ++ (Closes: #802646) again. Thanks Andreas Beckmann. ++ ++ -- A. Maitland Bottoms Fri, 13 Nov 2015 18:45:49 -0500 ++ ++volk (1.1.1-1) unstable; urgency=medium ++ ++ * New upstream release ++ * New architectures exist for the AVX2 and FMA ISAs. ++ * The profiler now generates buffers that are vlen + a tiny amount and ++ generates random data to fill buffers. This is intended to catch bugs ++ in protokernels that write beyond num_points. ++ * Note that libvolk1-dev replaces files in earlier gnuradio-dev versions ++ (Closes: #802646) ++ ++ -- A. Maitland Bottoms Sun, 01 Nov 2015 18:45:43 -0500 ++ ++volk (1.1-4) unstable; urgency=medium ++ ++ * update to v1.1-12-g264addc ++ ++ -- A. Maitland Bottoms Tue, 29 Sep 2015 23:41:50 -0400 ++ ++volk (1.1-3) unstable; urgency=low ++ ++ * drop dh_acc to get reproducible builds ++ ++ -- A. 
Maitland Bottoms Fri, 11 Sep 2015 22:57:06 -0400 ++ ++volk (1.1-2) unstable; urgency=low ++ ++ * use dh-acc ++ ++ -- A. Maitland Bottoms Mon, 07 Sep 2015 15:45:20 -0400 ++ ++volk (1.1-1) unstable; urgency=medium ++ ++ * re-organize package naming convention ++ * New upstream release tag v1.1 ++ New architectures exist for the AVX2 and FMA ISAs. Along ++ with the build-system support the following kernels have ++ no proto-kernels taking advantage of these architectures: ++ ++ * 32f_x2_dot_prod_32f ++ * 32fc_x2_multiply_32fc ++ * 64_byteswap ++ * 32f_binary_slicer_8i ++ * 16u_byteswap ++ * 32u_byteswap ++ ++ QA/profiler ++ ----------- ++ ++ The profiler now generates buffers that are vlen + a tiny ++ amount and generates random data to fill buffers. This is ++ intended to catch bugs in protokernels that write beyond ++ num_points. ++ ++ -- A. Maitland Bottoms Wed, 26 Aug 2015 09:22:48 -0400 ++ ++volk (1.0.2-2) unstable; urgency=low ++ ++ * Use SOURCE_DATE_EPOCH from the environment, if defined, ++ rather than current date and time to implement volk_build_date() ++ (embedding build date in a library does not help reproducible builds) ++ * add watch file ++ ++ -- A. Maitland Bottoms Sat, 15 Aug 2015 17:43:15 -0400 ++ ++volk (1.0.2-1) unstable; urgency=medium ++ ++ * Maintenance release 24 Jul 2015 by Nathan West ++ * The major change is the CMake logic to add ASM protokernels. Rather ++ than depending on CFLAGS and ASMFLAGS we use the results of VOLK's ++ built in has_ARCH tests. All configurations should work the same as ++ before, but manually specifying CFLAGS and ASMFLAGS on the cmake call ++ for ARM native builds should no longer be necessary. ++ * The 32fc_s32fc_x2_rotator_32fc generic protokernel now includes a ++ previously implied header. ++ * Finally, there is a fix to return the "best" protokernel to the ++ dispatcher when no volk_config exists. Thanks to Alexandre Raymond for ++ pointing this out. ++ * with maint branch patch: ++ kernels-add-missing-include-arm_neon.h ++ * removed unused build-dependency on liboil0.3-dev (closes: #793626) ++ ++ -- A. Maitland Bottoms Wed, 05 Aug 2015 00:43:40 -0400 ++ ++volk (1.0.1-1) unstable; urgency=low ++ ++ * Maintenance Release v1.0.1 08 Jul 2015 by Nathan West ++ This is a maintenance release with bug fixes since the initial release of ++ v1.0 in April. ++ ++ * Contributors ++ ++ The following authors have contributed code to this release: ++ ++ Doug Geiger doug.geiger@bioradiation.net ++ Elliot Briggs elliot.briggs@gmail.com ++ Marcus Mueller marcus@hostalia.de ++ Nathan West nathan.west@okstate.edu ++ Tom Rondeau tom@trondeau.com ++ ++ * Kernels ++ ++ Several bug fixes in different kernels. The NEON implementations of the ++ following kernels have been fixed: ++ ++ 32f_x2_add_32f ++ 32f_x2_dot_prod_32f ++ 32fc_s32fc_multiply_32fc ++ 32fc_x2_multiply_32fc ++ ++ Additionally the NEON asm based 32f_x2_add_32f protokernels were not being ++ used and are now included and available for use via the dispatcher. ++ ++ The 32f_s32f_x2_fm_detect_32f kernel now has a puppet. This solves QA seg ++ faults on 32-bit machines and provide a better test for this kernel. ++ ++ The 32fc_s32fc_x2_rotator_32fc generic protokernel replaced cabsf with ++ hypotf for better Android support. ++ ++ * Building ++ ++ Static builds now trigger the applications (volk_profile and ++ volk-config-info) to be statically linked. ++ ++ The file gcc_x86_cpuid.h has been removed since it was no longer being ++ used. 
Previously it provided cpuid functionality for ancient compilers ++ that we do not support. ++ ++ All build types now use -Wall. ++ ++ * QA and Testing ++ ++ The documentation around the --update option to volk_profile now makes it ++ clear that the option will only profile kernels without entries in ++ volk_profile. The signature of run_volk_tests with expanded args changed ++ signed types to unsigned types to reflect the actual input. ++ ++ The remaining changes are all non-functional changes to address issues ++ from Coverity. ++ ++ -- A. Maitland Bottoms Fri, 10 Jul 2015 17:57:42 -0400 ++ ++volk (1.0-5) unstable; urgency=medium ++ ++ * native-armv7-build-support skips neon on Debian armel (Closes: #789972) ++ ++ -- A. Maitland Bottoms Sat, 04 Jul 2015 12:36:36 -0400 ++ ++volk (1.0-4) unstable; urgency=low ++ ++ * update native-armv7-build-support patch from gnuradio volk package ++ ++ -- A. Maitland Bottoms Thu, 25 Jun 2015 16:38:49 -0400 ++ ++volk (1.0-3) unstable; urgency=medium ++ ++ * Add Breaks/Replaces (Closes: #789893, #789894) ++ * Allow failing tests ++ ++ -- A. Maitland Bottoms Thu, 25 Jun 2015 12:46:06 -0400 ++ ++volk (1.0-2) unstable; urgency=medium ++ ++ * kernels-add-missing-math.h-include-to-rotator ++ ++ -- A. Maitland Bottoms Wed, 24 Jun 2015 21:09:32 -0400 ++ ++volk (1.0-1) unstable; urgency=low ++ ++ * Initial package (Closes: #782417) ++ Initial Release 11 Apr 2015 by Nathan West ++ ++ VOLK 1.0 is available. This is the first release of VOLK as an independently ++ tracked sub-project of GNU Radio. ++ ++ * Contributors ++ ++ VOLK has been tracked separately from GNU Radio since 2014 Dec 23. ++ Contributors between the split and the initial release are ++ ++ Albert Holguin aholguin_77@yahoo.com ++ Doug Geiger doug.geiger@bioradiation.net ++ Elliot Briggs elliot.briggs@gmail.com ++ Julien Olivain julien.olivain@lsv.ens-cachan.fr ++ Michael Dickens michael.dickens@ettus.com ++ Nathan West nathan.west@okstate.edu ++ Tom Rondeau tom@trondeau.com ++ ++ * QA ++ ++ The test and profiler have significantly changed. The profiler supports ++ run-time changes to vlen and iters to help kernel development and provide ++ more flexibility on embedded systems. Additionally there is a new option ++ to update an existing volk_profile results file with only new kernels which ++ will save time when updating to newer versions of VOLK ++ ++ The QA system creates a static list of kernels and test cases. The QA ++ testing and profiler iterate over this static list rather than each source ++ file keeping its own list. The QA also emits XML results to ++ lib/.unittest/kernels.xml which is formatted similarly to JUnit results. ++ ++ * Modtool ++ ++ Modtool was updated to support the QA and profiler changes. ++ ++ * Kernels ++ ++ New proto-kernels: ++ ++ 16ic_deinterleave_real_8i_neon ++ 16ic_s32f_deinterleave_32f_neon ++ fix preprocessor errors for some compilers on byteswap and popcount puppets ++ ++ ORC was moved to the asm kernels directory. ++ volk_malloc ++ ++ The posix_memalign implementation of Volk_malloc now falls back to a standard ++ malloc if alignment is 1. ++ ++ * Miscellaneous ++ ++ Several build system and cmake changes have made it possible to build VOLK ++ both independently with proper soname versions and in-tree for projects ++ such as GNU Radio. ++ ++ The static builds take advantage of cmake object libraries to speed up builds. ++ ++ Finally, there are a number of changes to satisfy compiler warnings and make ++ QA work on multiple machines. ++ ++ -- A. 
Maitland Bottoms Sun, 12 Apr 2015 23:20:41 -0400 diff --cc debian/compat index 0000000,0000000..48082f7 new file mode 100644 --- /dev/null +++ b/debian/compat @@@ -1,0 -1,0 +1,1 @@@ ++12 diff --cc debian/control index 0000000,0000000..d53a4a2 new file mode 100644 --- /dev/null +++ b/debian/control @@@ -1,0 -1,0 +1,80 @@@ ++ ++Source: volk ++Section: libdevel ++Priority: optional ++Maintainer: A. Maitland Bottoms ++Build-Depends: cmake, ++ debhelper (>= 12~), ++ dh-python, ++ liborc-0.4-dev, ++ python3-dev, ++ python3-mako ++Build-Depends-Indep: doxygen ++Standards-Version: 4.5.0 ++Homepage: http://libvolk.org ++Vcs-Browser: https://salsa.debian.org/bottoms/pkg-volk ++Vcs-Git: https://salsa.debian.org/bottoms/pkg-volk.git ++ ++Package: libvolk2.2 ++Section: libs ++Architecture: any ++Pre-Depends: ${misc:Pre-Depends} ++Depends: ${misc:Depends}, ${shlibs:Depends} ++Multi-Arch: same ++Recommends: libvolk2-bin ++Suggests: libvolk2-dev ++Description: vector optimized functions ++ Vector-Optimized Library of Kernels is designed to help ++ applications work with the processor's SIMD instruction sets. These are ++ very powerful vector operations that can give signal processing a ++ huge boost in performance. ++ ++Package: libvolk2-dev ++Architecture: any ++Pre-Depends: ${misc:Pre-Depends} ++Depends: libvolk2.2 (=${binary:Version}), ${misc:Depends} ++Breaks: gnuradio-dev (<<3.7.8), libvolk-dev, libvolk1.0-dev, libvolk1-dev ++Replaces: gnuradio-dev (<<3.7.8), libvolk-dev, libvolk1.0-dev, libvolk1-dev ++Suggests: libvolk2-doc ++Multi-Arch: same ++Description: vector optimized function headers ++ Vector-Optimized Library of Kernels is designed to help ++ applications work with the processor's SIMD instruction sets. These are ++ very powerful vector operations that can give signal processing a ++ huge boost in performance. ++ . ++ This package contains the header files. ++ For documentation, see libvolk-doc. ++ ++Package: libvolk2-bin ++Section: libs ++Architecture: any ++Pre-Depends: ${misc:Pre-Depends} ++Depends: libvolk2.2 (=${binary:Version}), ++ ${misc:Depends}, ++ ${python3:Depends}, ++ ${shlibs:Depends} ++Breaks: libvolk1-bin, libvolk-bin, libvolk1.0-bin, gnuradio (<=3.7.2.1) ++Replaces: libvolk1-bin, libvolk-bin, libvolk1.0-bin, gnuradio (<=3.7.2.1) ++Description: vector optimized runtime tools ++ Vector-Optimized Library of Kernels is designed to help ++ applications work with the processor's SIMD instruction sets. These are ++ very powerful vector operations that can give signal processing a ++ huge boost in performance. ++ . ++ This package includes the volk_profile tool. ++ ++Package: libvolk2-doc ++Section: doc ++Architecture: all ++Multi-Arch: foreign ++Depends: ${misc:Depends} ++Recommends: lynx | www-browser ++Description: vector optimized library documentation ++ Vector-Optimized Library of Kernels is designed to help ++ applications work with the processor's SIMD instruction sets. These are ++ very powerful vector operations that can give signal processing a ++ huge boost in performance. ++ . ++ This package includes the Doxygen generated documentation in ++ /usr/share/doc/libvolk2-dev/html/index.html diff --cc debian/copyright index 0000000,0000000..0dc7d72 new file mode 100644 --- /dev/null +++ b/debian/copyright @@@ -1,0 -1,0 +1,187 @@@ ++Format: https://www.debian.org/doc/packaging-manuals/copyright-format/1.0/ ++Upstream-Name: volk ++Upstream-Contact: http://libvolk.org/ ++Source: ++ https://github.com/gnuradio/volk ++Comment: ++ Debian packages by A. 
Maitland Bottoms ++ git archive --format=tar --prefix=volk-2.1.0/ v2.1.0 | xz > ../volk_2.1.0.orig.tar.xz ++ . ++ Upstream Maintainers: ++ Johannes Demel ++ Michael Dickens ++Copyright: 2014-2019 Free Software Foundation, Inc. ++License: GPL-3+ ++ ++Files: * ++Copyright: 2006, 2009-2020, Free Software Foundation, Inc. ++License: GPL-3+ ++ ++Files: Doxyfile.in ++ DoxygenLayout.xml ++ volk.pc.in ++Copyright: 2014-2020 Free Software Foundation, Inc. ++License: GPL-3+ ++ ++Files: apps/volk_profile.h ++Copyright: 2014-2020 Free Software Foundation, Inc. ++License: GPL-3+ ++ ++Files: appveyor.yml ++Copyright: 2016 Paul Cercueil ++License: GPL-3+ ++ ++Files: cmake/* ++Copyright: 2014-2020 Free Software Foundation, Inc. ++License: GPL-3+ ++ ++Files: cmake/Modules/* ++Copyright: 2006, 2009-2020, Free Software Foundation, Inc. ++License: GPL-3+ ++ ++Files: cmake/Modules/CMakeParseArgumentsCopy.cmake ++Copyright: 2010 Alexander Neundorf ++License: Kitware-BSD ++ All rights reserved. ++ . ++ Redistribution and use in source and binary forms, with or without ++ modification, are permitted provided that the following conditions ++ are met: ++ . ++ * Redistributions of source code must retain the above copyright ++ notice, this list of conditions and the following disclaimer. ++ . ++ * Redistributions in binary form must reproduce the above copyright ++ notice, this list of conditions and the following disclaimer in the ++ documentation and/or other materials provided with the distribution. ++ . ++ * Neither the names of Kitware, Inc., the Insight Software Consortium, ++ nor the names of their contributors may be used to endorse or promote ++ products derived from this software without specific prior written ++ permission. ++ . ++ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ++ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT ++ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR ++ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT ++ HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, ++ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT ++ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, ++ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY ++ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ++ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ++ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ ++Files: cmake/Modules/FindORC.cmake ++ cmake/Modules/VolkConfig.cmake.in ++Copyright: 2014-2015 Free Software Foundation, Inc. ++License: GPL-3+ ++ ++Files: cmake/msvc/* ++Copyright: 2006-2008, Alexander Chemeris ++License: BSD-2-clause ++ Redistribution and use in source and binary forms, with or without ++ modification, are permitted provided that the following conditions are met: ++ . ++ 1. Redistributions of source code must retain the above copyright notice, ++ this list of conditions and the following disclaimer. ++ . ++ 2. Redistributions in binary form must reproduce the above copyright ++ notice, this list of conditions and the following disclaimer in the ++ documentation and/or other materials provided with the distribution. ++ . ++ 3. The name of the author may be used to endorse or promote products ++ derived from this software without specific prior written permission. ++ . 
++ THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED ++ WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF ++ MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO ++ EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, ++ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, ++ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; ++ OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, ++ WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR ++ OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ++ ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ ++Files: cmake/msvc/config.h ++Copyright: 2005, 2006 Apple Computer, Inc. ++License: LGPL-2+ ++ ++Files: cmake/msvc/stdbool.h ++Copyright: 2005, 2006, Apple Computer, Inc. ++License: LGPL-2+ ++ ++Files: debian/* ++Copyright: 2015-2020 Free Software Foundation, Inc ++License: GPL-3+ ++Comment: assigned by A. Maitland Bottoms ++ ++Files: debian/libvolk2-dev.abi.tar.gz.amd64 ++Copyright: 2019 Free Software Foundation, Inc ++License: GPL-3+ ++ ++Files: docs/* ++Copyright: 2014-2015 Free Software Foundation, Inc. ++License: GPL-3+ ++ ++Files: gen/archs.xml ++ gen/machines.xml ++Copyright: 2014-2015 Free Software Foundation, Inc. ++License: GPL-3+ ++ ++Files: include/volk/volk_common.h ++ include/volk/volk_complex.h ++ include/volk/volk_prefs.h ++Copyright: 2014-2015 Free Software Foundation, Inc. ++License: GPL-3+ ++ ++Files: kernels/volk/asm/* ++Copyright: 2014-2015 Free Software Foundation, Inc. ++License: GPL-3+ ++ ++Files: kernels/volk/volk_16u_byteswappuppet_16u.h ++ kernels/volk/volk_32u_byteswappuppet_32u.h ++ kernels/volk/volk_64u_byteswappuppet_64u.h ++Copyright: 2014-2015 Free Software Foundation, Inc. ++License: GPL-3+ ++ ++Files: lib/kernel_tests.h ++ lib/qa_utils.cc ++ lib/qa_utils.h ++ lib/volk_prefs.c ++Copyright: 2014-2015 Free Software Foundation, Inc. ++License: GPL-3+ ++ ++License: LGPL-2+ ++ This library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Library General Public ++ License as published by the Free Software Foundation; either ++ version 2 of the License, or (at your option) any later version. ++ . ++ This library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Library General Public License for more details. ++ . ++ You should have received a copy of the GNU Library General Public License ++ along with this library; see the file COPYING.LIB. If not, write to ++ the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, ++ Boston, MA 02110-1301, USA. ++ ++License: GPL-3+ ++ This program is free software: you can redistribute it and/or modify ++ it under the terms of the GNU General Public License as published by ++ the Free Software Foundation; either version 3 of the License, or ++ (at your option) any later version. ++ . ++ This program is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ GNU General Public License for more details. ++ . ++ You should have received a copy of the GNU General Public License ++ along with this program. If not, see . ++ . 
++ On Debian systems, the complete text of the GNU General ++ Public License version 3 can be found in "/usr/share/common-licenses/GPL-3". diff --cc debian/libvolk2-bin.install index 0000000,0000000..7221b71 new file mode 100644 --- /dev/null +++ b/debian/libvolk2-bin.install @@@ -1,0 -1,0 +1,2 @@@ ++usr/bin/volk* ++usr/lib/python3/dist-packages diff --cc debian/libvolk2-bin.manpages index 0000000,0000000..95bae9e new file mode 100644 --- /dev/null +++ b/debian/libvolk2-bin.manpages @@@ -1,0 -1,0 +1,3 @@@ ++debian/volk-config-info.1 ++debian/volk_modtool.1 ++debian/volk_profile.1 diff --cc debian/libvolk2-dev.abi.tar.gz.amd64 index 0000000,0000000..ff8acb1 new file mode 100644 Binary files differ diff --cc debian/libvolk2-dev.acc index 0000000,0000000..37f5a79 new file mode 100644 --- /dev/null +++ b/debian/libvolk2-dev.acc @@@ -1,0 -1,0 +1,50 @@@ ++ ++ ++ ++ ++ -DHAVE_CPUID_H ++ -DHAVE_DLFCN_H ++ -DHAVE_FENV_H ++ -DHAVE_POSIX_MEMALIGN ++ -DHAVE_XGETBV ++ -D_GLIBCXX_USE_CXX11_ABI=1 ++ -I/usr/include/orc-0.4 ++ -DNDEBUG ++ -std=gnu11 ++ -m64 ++ -mmmx ++ -msse ++ -msse2 ++ -msse3 ++ -mssse3 ++ -msse4.1 ++ -msse4.2 ++ -mpopcnt ++ -mavx ++ -mfma ++ -mavx2 ++ -mavx512f ++ -mavx512cd ++ -fPIC ++ -g ++ -O2 ++ -fstack-protector-strong ++ -Wformat ++ -Werror=format-security ++ -Wdate-time ++ -D_FORTIFY_SOURCE=2 ++ -fvisibility=hidden ++ -Wsign-compare ++ -Wall ++ -Wno-uninitialized ++ ++ ++ ++debian/libvolk2-dev/usr/include/volk/ ++ ++ ++ ++debian/libvolk2.0/usr/lib/ ++ ++ ++ diff --cc debian/libvolk2-dev.docs index 0000000,0000000..47699cc new file mode 100644 --- /dev/null +++ b/debian/libvolk2-dev.docs @@@ -1,0 -1,0 +1,3 @@@ ++debian/1.3_to_1.4_compat_report.html ++debian/1.4_to_2.0_compat_report.html ++debian/2.2.0_to_2.2.1_compat_report.html diff --cc debian/libvolk2-dev.install index 0000000,0000000..8b14c56 new file mode 100644 --- /dev/null +++ b/debian/libvolk2-dev.install @@@ -1,0 -1,0 +1,5 @@@ ++usr/include/* ++usr/lib/*/*volk.a ++usr/lib/*/*volk*so ++usr/lib/*/cmake/volk ++usr/lib/*/pkgconfig/*volk* diff --cc debian/libvolk2-doc.doc-base index 0000000,0000000..3d5fdc8 new file mode 100644 --- /dev/null +++ b/debian/libvolk2-doc.doc-base @@@ -1,0 -1,0 +1,19 @@@ ++Document: libvolk2-doc ++Title: Vector-Optimized Library of Kernels Reference Manual ++Author: GNU Radio Developers ++Abstract: VOLK is the Vector-Optimized Library of Kernels. ++ It is a library that contains kernels of hand-written SIMD code for ++ different mathematical operations. Since each SIMD architecture can ++ be very different and no compiler has yet come along to handle ++ vectorization properly or highly efficiently, VOLK approaches the ++ problem differently. For each architecture or platform that a ++ developer wishes to vectorize for, a new proto-kernel is added to ++ VOLK. At runtime, VOLK will select the correct proto-kernel. In this ++ way, the users of VOLK call a kernel for performing the operation ++ that is platform/architecture agnostic. This allows us to write ++ portable SIMD code. 
++Section: Programming/C++ ++ ++Format: HTML ++Index: /usr/share/doc/libvolk2-dev/html/index.html ++Files: /usr/share/doc/libvolk2-dev/html/*.html diff --cc debian/libvolk2-doc.docs index 0000000,0000000..87dd314 new file mode 100644 --- /dev/null +++ b/debian/libvolk2-doc.docs @@@ -1,0 -1,0 +1,1 @@@ ++obj-*/html diff --cc debian/libvolk2.2.install index 0000000,0000000..e4252f4 new file mode 100644 --- /dev/null +++ b/debian/libvolk2.2.install @@@ -1,0 -1,0 +1,1 @@@ ++usr/lib/*/libvolk.so.* diff --cc debian/patches/0001-volk-accurate-exp-kernel.patch index 0000000,0000000..53df58a new file mode 100644 --- /dev/null +++ b/debian/patches/0001-volk-accurate-exp-kernel.patch @@@ -1,0 -1,0 +1,333 @@@ ++From 9b5abaa62ce3b5d5379899d30afe1964eb63d86d Mon Sep 17 00:00:00 2001 ++From: Tom Rondeau ++Date: Tue, 7 Apr 2015 14:37:28 -0400 ++Subject: [PATCH 1/7] volk: accurate exp kernel. ++ ++A more accurate exp VOLK kernel than volk_32f_expfast_32f.Taken from ++code licensed with zlib. ++--- ++ kernels/volk/volk_32f_exp_32f.h | 298 ++++++++++++++++++++++++++++++++ ++ lib/kernel_tests.h | 2 + ++ 2 files changed, 300 insertions(+) ++ create mode 100644 kernels/volk/volk_32f_exp_32f.h ++ ++diff --git a/kernels/volk/volk_32f_exp_32f.h b/kernels/volk/volk_32f_exp_32f.h ++new file mode 100644 ++index 0000000..19c3d9d ++--- /dev/null +++++ b/kernels/volk/volk_32f_exp_32f.h ++@@ -0,0 +1,298 @@ +++/* -*- c++ -*- */ +++/* +++ * Copyright 2015-2020 Free Software Foundation, Inc. +++ * +++ * This file is part of GNU Radio +++ * +++ * GNU Radio is free software; you can redistribute it and/or modify +++ * it under the terms of the GNU General Public License as published by +++ * the Free Software Foundation; either version 3, or (at your option) +++ * any later version. +++ * +++ * GNU Radio is distributed in the hope that it will be useful, +++ * but WITHOUT ANY WARRANTY; without even the implied warranty of +++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +++ * GNU General Public License for more details. +++ * +++ * You should have received a copy of the GNU General Public License +++ * along with GNU Radio; see the file COPYING. If not, write to +++ * the Free Software Foundation, Inc., 51 Franklin Street, +++ * Boston, MA 02110-1301, USA. +++ */ +++ +++/* SIMD (SSE4) implementation of exp +++ Inspired by Intel Approximate Math library, and based on the +++ corresponding algorithms of the cephes math library +++*/ +++ +++/* Copyright (C) 2007 Julien Pommier +++ +++ This software is provided 'as-is', without any express or implied +++ warranty. In no event will the authors be held liable for any damages +++ arising from the use of this software. +++ +++ Permission is granted to anyone to use this software for any purpose, +++ including commercial applications, and to alter it and redistribute it +++ freely, subject to the following restrictions: +++ +++ 1. The origin of this software must not be misrepresented; you must not +++ claim that you wrote the original software. If you use this software +++ in a product, an acknowledgment in the product documentation would be +++ appreciated but is not required. +++ 2. Altered source versions must be plainly marked as such, and must not be +++ misrepresented as being the original software. +++ 3. This notice may not be removed or altered from any source distribution. +++ +++ (this is the zlib license) +++*/ +++ +++/*! 
+++ * \page volk_32f_exp_32f +++ * +++ * \b Overview +++ * +++ * Computes exponential of input vector and stores results in output vector. +++ * +++ * Dispatcher Prototype +++ * \code +++ * void volk_32f_exp_32f(float* bVector, const float* aVector, unsigned int num_points) +++ * \endcode +++ * +++ * \b Inputs +++ * \li aVector: The input vector of floats. +++ * \li num_points: The number of data points. +++ * +++ * \b Outputs +++ * \li bVector: The vector where results will be stored. +++ * +++ * \b Example +++ * \code +++ * int N = 10; +++ * unsigned int alignment = volk_get_alignment(); +++ * float* in = (float*)volk_malloc(sizeof(float)*N, alignment); +++ * float* out = (float*)volk_malloc(sizeof(float)*N, alignment); +++ * +++ * in[0] = 0; +++ * in[1] = 0.5; +++ * in[2] = std::sqrt(2.f)/2.f; +++ * in[3] = std::sqrt(3.f)/2.f; +++ * in[4] = in[5] = 1; +++ * for(unsigned int ii = 6; ii < N; ++ii){ +++ * in[ii] = - in[N-ii-1]; +++ * } +++ * +++ * volk_32f_exp_32f(out, in, N); +++ * +++ * for(unsigned int ii = 0; ii < N; ++ii){ +++ * printf("exp(%1.3f) = %1.3f\n", in[ii], out[ii]); +++ * } +++ * +++ * volk_free(in); +++ * volk_free(out); +++ * \endcode +++ */ +++ +++#include +++#include +++#include +++ +++#ifndef INCLUDED_volk_32f_exp_32f_a_H +++#define INCLUDED_volk_32f_exp_32f_a_H +++ +++#ifdef LV_HAVE_SSE4_1 +++#include +++ +++static inline void +++volk_32f_exp_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int num_points) +++{ +++ float* bPtr = bVector; +++ const float* aPtr = aVector; +++ +++ unsigned int number = 0; +++ unsigned int quarterPoints = num_points / 4; +++ +++ // Declare variables and constants +++ __m128 aVal, bVal, tmp, fx, mask, pow2n, z, y; +++ __m128 one, exp_hi, exp_lo, log2EF, half, exp_C1, exp_C2; +++ __m128 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5; +++ __m128i emm0, pi32_0x7f; +++ +++ one = _mm_set1_ps(1.0); +++ exp_hi = _mm_set1_ps(88.3762626647949); +++ exp_lo = _mm_set1_ps(-88.3762626647949); +++ log2EF = _mm_set1_ps(1.44269504088896341); +++ half = _mm_set1_ps(0.5); +++ exp_C1 = _mm_set1_ps(0.693359375); +++ exp_C2 = _mm_set1_ps(-2.12194440e-4); +++ pi32_0x7f = _mm_set1_epi32(0x7f); +++ +++ exp_p0 = _mm_set1_ps(1.9875691500e-4); +++ exp_p1 = _mm_set1_ps(1.3981999507e-3); +++ exp_p2 = _mm_set1_ps(8.3334519073e-3); +++ exp_p3 = _mm_set1_ps(4.1665795894e-2); +++ exp_p4 = _mm_set1_ps(1.6666665459e-1); +++ exp_p5 = _mm_set1_ps(5.0000001201e-1); +++ +++ for(;number < quarterPoints; number++) { +++ aVal = _mm_load_ps(aPtr); +++ tmp = _mm_setzero_ps(); +++ +++ aVal = _mm_max_ps(_mm_min_ps(aVal, exp_hi), exp_lo); +++ +++ /* express exp(x) as exp(g + n*log(2)) */ +++ fx = _mm_add_ps(_mm_mul_ps(aVal, log2EF), half); +++ +++ emm0 = _mm_cvttps_epi32(fx); +++ tmp = _mm_cvtepi32_ps(emm0); +++ +++ mask = _mm_and_ps(_mm_cmpgt_ps(tmp, fx), one); +++ fx = _mm_sub_ps(tmp, mask); +++ +++ tmp = _mm_mul_ps(fx, exp_C1); +++ z = _mm_mul_ps(fx, exp_C2); +++ aVal = _mm_sub_ps(_mm_sub_ps(aVal, tmp), z); +++ z = _mm_mul_ps(aVal, aVal); +++ +++ y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(exp_p0, aVal), exp_p1), aVal); +++ y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p2), aVal), exp_p3); +++ y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(y, aVal), exp_p4), aVal); +++ y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p5), z), aVal); +++ y = _mm_add_ps(y, one); +++ +++ emm0 = _mm_slli_epi32(_mm_add_epi32(_mm_cvttps_epi32(fx), pi32_0x7f), 23); +++ +++ pow2n = _mm_castsi128_ps(emm0); +++ bVal = _mm_mul_ps(y, pow2n); +++ +++ _mm_store_ps(bPtr, bVal); +++ aPtr += 4; +++ bPtr += 4; +++ } +++ +++ 
number = quarterPoints * 4; +++ for(;number < num_points; number++) { +++ *bPtr++ = expf(*aPtr++); +++ } +++} +++ +++#endif /* LV_HAVE_SSE4_1 for aligned */ +++ +++ +++#ifdef LV_HAVE_GENERIC +++ +++static inline void +++volk_32f_exp_32f_a_generic(float* bVector, const float* aVector, unsigned int num_points) +++{ +++ float* bPtr = bVector; +++ const float* aPtr = aVector; +++ unsigned int number = 0; +++ +++ for(number = 0; number < num_points; number++) { +++ *bPtr++ = expf(*aPtr++); +++ } +++} +++ +++#endif /* LV_HAVE_GENERIC */ +++ +++#endif /* INCLUDED_volk_32f_exp_32f_a_H */ +++ +++#ifndef INCLUDED_volk_32f_exp_32f_u_H +++#define INCLUDED_volk_32f_exp_32f_u_H +++ +++#ifdef LV_HAVE_SSE4_1 +++#include +++ +++static inline void +++volk_32f_exp_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int num_points) +++{ +++ float* bPtr = bVector; +++ const float* aPtr = aVector; +++ +++ unsigned int number = 0; +++ unsigned int quarterPoints = num_points / 4; +++ +++ // Declare variables and constants +++ __m128 aVal, bVal, tmp, fx, mask, pow2n, z, y; +++ __m128 one, exp_hi, exp_lo, log2EF, half, exp_C1, exp_C2; +++ __m128 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5; +++ __m128i emm0, pi32_0x7f; +++ +++ one = _mm_set1_ps(1.0); +++ exp_hi = _mm_set1_ps(88.3762626647949); +++ exp_lo = _mm_set1_ps(-88.3762626647949); +++ log2EF = _mm_set1_ps(1.44269504088896341); +++ half = _mm_set1_ps(0.5); +++ exp_C1 = _mm_set1_ps(0.693359375); +++ exp_C2 = _mm_set1_ps(-2.12194440e-4); +++ pi32_0x7f = _mm_set1_epi32(0x7f); +++ +++ exp_p0 = _mm_set1_ps(1.9875691500e-4); +++ exp_p1 = _mm_set1_ps(1.3981999507e-3); +++ exp_p2 = _mm_set1_ps(8.3334519073e-3); +++ exp_p3 = _mm_set1_ps(4.1665795894e-2); +++ exp_p4 = _mm_set1_ps(1.6666665459e-1); +++ exp_p5 = _mm_set1_ps(5.0000001201e-1); +++ +++ +++ for(;number < quarterPoints; number++) { +++ aVal = _mm_loadu_ps(aPtr); +++ tmp = _mm_setzero_ps(); +++ +++ aVal = _mm_max_ps(_mm_min_ps(aVal, exp_hi), exp_lo); +++ +++ /* express exp(x) as exp(g + n*log(2)) */ +++ fx = _mm_add_ps(_mm_mul_ps(aVal, log2EF), half); +++ +++ emm0 = _mm_cvttps_epi32(fx); +++ tmp = _mm_cvtepi32_ps(emm0); +++ +++ mask = _mm_and_ps(_mm_cmpgt_ps(tmp, fx), one); +++ fx = _mm_sub_ps(tmp, mask); +++ +++ tmp = _mm_mul_ps(fx, exp_C1); +++ z = _mm_mul_ps(fx, exp_C2); +++ aVal = _mm_sub_ps(_mm_sub_ps(aVal, tmp), z); +++ z = _mm_mul_ps(aVal, aVal); +++ +++ y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(exp_p0, aVal), exp_p1), aVal); +++ y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p2), aVal), exp_p3); +++ y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(y, aVal), exp_p4), aVal); +++ y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p5), z), aVal); +++ y = _mm_add_ps(y, one); +++ +++ emm0 = _mm_slli_epi32(_mm_add_epi32(_mm_cvttps_epi32(fx), pi32_0x7f), 23); +++ +++ pow2n = _mm_castsi128_ps(emm0); +++ bVal = _mm_mul_ps(y, pow2n); +++ +++ _mm_storeu_ps(bPtr, bVal); +++ aPtr += 4; +++ bPtr += 4; +++ } +++ +++ number = quarterPoints * 4; +++ for(;number < num_points; number++){ +++ *bPtr++ = expf(*aPtr++); +++ } +++} +++ +++#endif /* LV_HAVE_SSE4_1 for unaligned */ +++ +++ +++#ifdef LV_HAVE_GENERIC +++ +++static inline void +++volk_32f_exp_32f_u_generic(float* bVector, const float* aVector, unsigned int num_points) +++{ +++ float* bPtr = bVector; +++ const float* aPtr = aVector; +++ unsigned int number = 0; +++ +++ for(number = 0; number < num_points; number++){ +++ *bPtr++ = expf(*aPtr++); +++ } +++} +++ +++#endif /* LV_HAVE_GENERIC */ +++ +++#endif /* INCLUDED_volk_32f_exp_32f_u_H */ ++diff --git a/lib/kernel_tests.h 
b/lib/kernel_tests.h ++index c009c3f..8552488 100644 ++--- a/lib/kernel_tests.h +++++ b/lib/kernel_tests.h ++@@ -144,6 +144,8 @@ std::vector init_test_list(volk_test_params_t test_params) ++ QA(VOLK_INIT_TEST(volk_32fc_x2_s32fc_multiply_conjugate_add_32fc, test_params)) ++ QA(VOLK_INIT_PUPP(volk_8u_x3_encodepolarpuppet_8u, volk_8u_x3_encodepolar_8u_x2, test_params)) ++ QA(VOLK_INIT_PUPP(volk_32f_8u_polarbutterflypuppet_32f, volk_32f_8u_polarbutterfly_32f, test_params)) +++ QA(VOLK_INIT_TEST(volk_32f_exp_32f, test_params)) +++ ++ // no one uses these, so don't test them ++ //VOLK_PROFILE(volk_16i_x5_add_quad_16i_x4, 1e-4, 2046, 10000, &results, benchmark_mode, kernel_regex); ++ //VOLK_PROFILE(volk_16i_branch_4_state_8, 1e-4, 2046, 10000, &results, benchmark_mode, kernel_regex); ++-- ++2.20.1 ++ diff --cc debian/patches/0002-exp-Rename-SSE4.1-to-SSE2-kernel.patch index 0000000,0000000..94d3281 new file mode 100644 --- /dev/null +++ b/debian/patches/0002-exp-Rename-SSE4.1-to-SSE2-kernel.patch @@@ -1,0 -1,0 +1,66 @@@ ++From 52bfb2f049b534aca5b6d3e7475c9b2dd97c55a3 Mon Sep 17 00:00:00 2001 ++From: Johannes Demel ++Date: Tue, 17 Mar 2020 21:20:51 +0100 ++Subject: [PATCH 2/7] exp: Rename SSE4.1 to SSE2 kernel ++ ++The SSE kernel only requires SSE2 instructions. Thus, we can just use ++this instruction level. ++--- ++ kernels/volk/volk_32f_exp_32f.h | 16 ++++++++-------- ++ 1 file changed, 8 insertions(+), 8 deletions(-) ++ ++diff --git a/kernels/volk/volk_32f_exp_32f.h b/kernels/volk/volk_32f_exp_32f.h ++index 19c3d9d..26fdf02 100644 ++--- a/kernels/volk/volk_32f_exp_32f.h +++++ b/kernels/volk/volk_32f_exp_32f.h ++@@ -99,11 +99,11 @@ ++ #ifndef INCLUDED_volk_32f_exp_32f_a_H ++ #define INCLUDED_volk_32f_exp_32f_a_H ++ ++-#ifdef LV_HAVE_SSE4_1 ++-#include +++#ifdef LV_HAVE_SSE2 +++#include ++ ++ static inline void ++-volk_32f_exp_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int num_points) +++volk_32f_exp_32f_a_sse2(float* bVector, const float* aVector, unsigned int num_points) ++ { ++ float* bPtr = bVector; ++ const float* aPtr = aVector; ++@@ -175,7 +175,7 @@ volk_32f_exp_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int num ++ } ++ } ++ ++-#endif /* LV_HAVE_SSE4_1 for aligned */ +++#endif /* LV_HAVE_SSE2 for aligned */ ++ ++ ++ #ifdef LV_HAVE_GENERIC ++@@ -199,11 +199,11 @@ volk_32f_exp_32f_a_generic(float* bVector, const float* aVector, unsigned int nu ++ #ifndef INCLUDED_volk_32f_exp_32f_u_H ++ #define INCLUDED_volk_32f_exp_32f_u_H ++ ++-#ifdef LV_HAVE_SSE4_1 ++-#include +++#ifdef LV_HAVE_SSE2 +++#include ++ ++ static inline void ++-volk_32f_exp_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int num_points) +++volk_32f_exp_32f_u_sse2(float* bVector, const float* aVector, unsigned int num_points) ++ { ++ float* bPtr = bVector; ++ const float* aPtr = aVector; ++@@ -276,7 +276,7 @@ volk_32f_exp_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int num ++ } ++ } ++ ++-#endif /* LV_HAVE_SSE4_1 for unaligned */ +++#endif /* LV_HAVE_SSE2 for unaligned */ ++ ++ ++ #ifdef LV_HAVE_GENERIC ++-- ++2.20.1 ++ diff --cc debian/patches/0003-clang-format-Apply-clang-format.patch index 0000000,0000000..1873202 new file mode 100644 --- /dev/null +++ b/debian/patches/0003-clang-format-Apply-clang-format.patch @@@ -1,0 -1,0 +1,74061 @@@ ++From 092a59997a1e1d5f421a0a5f87ee655ad173b93f Mon Sep 17 00:00:00 2001 ++From: Johannes Demel ++Date: Sun, 23 Feb 2020 15:03:47 +0100 ++Subject: [PATCH 3/7] clang-format: Apply clang-format ++ ++This commit adds `.clang-format` 
from GNU Radio and apply clang-format. ++ ++Run: ++`find . -regex '.*\.\(c\|cc\|cpp\|cxx\|h\|hh\)' -exec clang-format \ ++-style=file -i {} \;` ++in `.`. ++--- ++ .clang-format | 106 ++ ++ apps/volk-config-info.cc | 77 +- ++ apps/volk_option_helpers.cc | 268 +-- ++ apps/volk_option_helpers.h | 84 +- ++ apps/volk_profile.cc | 205 ++- ++ apps/volk_profile.h | 20 +- ++ cmake/msvc/config.h | 27 +- ++ cmake/msvc/sys/time.h | 77 +- ++ include/volk/saturation_arithmetic.h | 16 +- ++ include/volk/volk_alloc.hh | 42 +- ++ include/volk/volk_avx2_intrinsics.h | 114 +- ++ include/volk/volk_avx_intrinsics.h | 193 +- ++ include/volk/volk_common.h | 148 +- ++ include/volk/volk_complex.h | 41 +- ++ include/volk/volk_malloc.h | 12 +- ++ include/volk/volk_neon_intrinsics.h | 115 +- ++ include/volk/volk_prefs.h | 17 +- ++ include/volk/volk_sse3_intrinsics.h | 79 +- ++ include/volk/volk_sse_intrinsics.h | 53 +- ++ kernels/volk/volk_16i_32fc_dot_prod_32fc.h | 1118 ++++++------ ++ kernels/volk/volk_16i_branch_4_state_8.h | 219 ++- ++ kernels/volk/volk_16i_convert_8i.h | 301 ++-- ++ kernels/volk/volk_16i_max_star_16i.h | 158 +- ++ .../volk/volk_16i_max_star_horizontal_16i.h | 214 +-- ++ .../volk/volk_16i_permute_and_scalar_add.h | 187 +- ++ kernels/volk/volk_16i_s32f_convert_32f.h | 609 +++---- ++ kernels/volk/volk_16i_x4_quad_max_star_16i.h | 357 ++-- ++ kernels/volk/volk_16i_x5_add_quad_16i_x4.h | 336 ++-- ++ kernels/volk/volk_16ic_convert_32fc.h | 241 +-- ++ kernels/volk/volk_16ic_deinterleave_16i_x2.h | 431 +++-- ++ .../volk/volk_16ic_deinterleave_real_16i.h | 397 +++-- ++ kernels/volk/volk_16ic_deinterleave_real_8i.h | 469 +++-- ++ kernels/volk/volk_16ic_magnitude_16i.h | 506 +++--- ++ .../volk/volk_16ic_s32f_deinterleave_32f_x2.h | 418 ++--- ++ .../volk_16ic_s32f_deinterleave_real_32f.h | 372 ++-- ++ kernels/volk/volk_16ic_s32f_magnitude_32f.h | 381 ++-- ++ kernels/volk/volk_16ic_x2_dot_prod_16ic.h | 750 ++++---- ++ kernels/volk/volk_16ic_x2_multiply_16ic.h | 504 ++++-- ++ kernels/volk/volk_16u_byteswap.h | 378 ++-- ++ kernels/volk/volk_16u_byteswappuppet_16u.h | 44 +- ++ kernels/volk/volk_32f_64f_add_64f.h | 270 +-- ++ kernels/volk/volk_32f_64f_multiply_64f.h | 154 +- ++ kernels/volk/volk_32f_8u_polarbutterfly_32f.h | 478 ++--- ++ .../volk_32f_8u_polarbutterflypuppet_32f.h | 155 +- ++ kernels/volk/volk_32f_accumulator_s32f.h | 287 +-- ++ kernels/volk/volk_32f_acos_32f.h | 700 ++++---- ++ kernels/volk/volk_32f_asin_32f.h | 647 +++---- ++ kernels/volk/volk_32f_atan_32f.h | 625 +++---- ++ kernels/volk/volk_32f_binary_slicer_32i.h | 259 +-- ++ kernels/volk/volk_32f_binary_slicer_8i.h | 706 ++++---- ++ kernels/volk/volk_32f_convert_64f.h | 214 ++- ++ kernels/volk/volk_32f_cos_32f.h | 1159 ++++++------ ++ kernels/volk/volk_32f_expfast_32f.h | 347 ++-- ++ kernels/volk/volk_32f_index_max_16u.h | 370 ++-- ++ kernels/volk/volk_32f_index_max_32u.h | 770 ++++---- ++ kernels/volk/volk_32f_invsqrt_32f.h | 189 +- ++ kernels/volk/volk_32f_log2_32f.h | 719 +++++--- ++ kernels/volk/volk_32f_null_32f.h | 16 +- ++ .../volk/volk_32f_s32f_32f_fm_detect_32f.h | 457 ++--- ++ ...k_32f_s32f_calc_spectral_noise_floor_32f.h | 683 +++---- ++ kernels/volk/volk_32f_s32f_convert_16i.h | 815 ++++----- ++ kernels/volk/volk_32f_s32f_convert_32i.h | 579 +++--- ++ kernels/volk/volk_32f_s32f_convert_8i.h | 642 +++---- ++ .../volk/volk_32f_s32f_mod_rangepuppet_32f.h | 63 +- ++ kernels/volk/volk_32f_s32f_multiply_32f.h | 271 +-- ++ kernels/volk/volk_32f_s32f_normalize.h | 150 +- ++ kernels/volk/volk_32f_s32f_power_32f.h | 166 +- ++ 
.../volk/volk_32f_s32f_s32f_mod_range_32f.h | 718 ++++---- ++ kernels/volk/volk_32f_s32f_stddev_32f.h | 449 ++--- ++ kernels/volk/volk_32f_sin_32f.h | 945 +++++----- ++ kernels/volk/volk_32f_sqrt_32f.h | 153 +- ++ .../volk/volk_32f_stddev_and_mean_32f_x2.h | 583 +++--- ++ kernels/volk/volk_32f_tan_32f.h | 1023 ++++++----- ++ kernels/volk/volk_32f_tanh_32f.h | 631 ++++--- ++ kernels/volk/volk_32f_x2_add_32f.h | 412 +++-- ++ kernels/volk/volk_32f_x2_divide_32f.h | 364 ++-- ++ kernels/volk/volk_32f_x2_dot_prod_16i.h | 1092 ++++++------ ++ kernels/volk/volk_32f_x2_dot_prod_32f.h | 1186 +++++++------ ++ .../volk/volk_32f_x2_fm_detectpuppet_32f.h | 40 +- ++ kernels/volk/volk_32f_x2_interleave_32fc.h | 292 +-- ++ kernels/volk/volk_32f_x2_max_32f.h | 345 ++-- ++ kernels/volk/volk_32f_x2_min_32f.h | 347 ++-- ++ kernels/volk/volk_32f_x2_multiply_32f.h | 375 ++-- ++ kernels/volk/volk_32f_x2_pow_32f.h | 1175 ++++++------ ++ .../volk/volk_32f_x2_s32f_interleave_16ic.h | 324 ++-- ++ kernels/volk/volk_32f_x2_subtract_32f.h | 319 ++-- ++ kernels/volk/volk_32f_x3_sum_of_poly_32f.h | 1026 +++++------ ++ kernels/volk/volk_32fc_32f_add_32fc.h | 281 +-- ++ kernels/volk/volk_32fc_32f_dot_prod_32fc.h | 1205 +++++++------ ++ kernels/volk/volk_32fc_32f_multiply_32fc.h | 226 +-- ++ kernels/volk/volk_32fc_conjugate_32fc.h | 233 +-- ++ kernels/volk/volk_32fc_convert_16ic.h | 439 ++--- ++ kernels/volk/volk_32fc_deinterleave_32f_x2.h | 297 ++-- ++ kernels/volk/volk_32fc_deinterleave_64f_x2.h | 439 ++--- ++ .../volk/volk_32fc_deinterleave_imag_32f.h | 210 +-- ++ .../volk/volk_32fc_deinterleave_real_32f.h | 214 +-- ++ .../volk/volk_32fc_deinterleave_real_64f.h | 262 +-- ++ kernels/volk/volk_32fc_index_max_16u.h | 639 +++---- ++ kernels/volk/volk_32fc_index_max_32u.h | 630 +++---- ++ kernels/volk/volk_32fc_magnitude_32f.h | 556 +++--- ++ .../volk/volk_32fc_magnitude_squared_32f.h | 443 ++--- ++ kernels/volk/volk_32fc_s32f_atan2_32f.h | 208 +-- ++ .../volk_32fc_s32f_deinterleave_real_16i.h | 226 +-- ++ kernels/volk/volk_32fc_s32f_magnitude_16i.h | 297 ++-- ++ kernels/volk/volk_32fc_s32f_power_32fc.h | 121 +- ++ .../volk/volk_32fc_s32f_power_spectrum_32f.h | 176 +- ++ ..._32fc_s32f_x2_power_spectral_density_32f.h | 297 ++-- ++ kernels/volk/volk_32fc_s32fc_multiply_32fc.h | 250 +-- ++ .../volk/volk_32fc_s32fc_rotatorpuppet_32fc.h | 118 +- ++ .../volk/volk_32fc_s32fc_x2_rotator_32fc.h | 260 +-- ++ kernels/volk/volk_32fc_x2_add_32fc.h | 274 +-- ++ .../volk_32fc_x2_conjugate_dot_prod_32fc.h | 1017 ++++++----- ++ kernels/volk/volk_32fc_x2_divide_32fc.h | 372 ++-- ++ kernels/volk/volk_32fc_x2_dot_prod_32fc.h | 1334 +++++++------- ++ kernels/volk/volk_32fc_x2_multiply_32fc.h | 575 +++--- ++ .../volk_32fc_x2_multiply_conjugate_32fc.h | 347 ++-- ++ ...32fc_x2_s32f_square_dist_scalar_mult_32f.h | 657 +++---- ++ ...2fc_x2_s32fc_multiply_conjugate_add_32fc.h | 98 +- ++ kernels/volk/volk_32fc_x2_square_dist_32f.h | 426 ++--- ++ kernels/volk/volk_32i_s32f_convert_32f.h | 347 ++-- ++ kernels/volk/volk_32i_x2_and_32i.h | 320 ++-- ++ kernels/volk/volk_32i_x2_or_32i.h | 321 ++-- ++ kernels/volk/volk_32u_byteswap.h | 433 ++--- ++ kernels/volk/volk_32u_byteswappuppet_32u.h | 44 +- ++ kernels/volk/volk_32u_popcnt.h | 26 +- ++ kernels/volk/volk_32u_popcntpuppet_32u.h | 18 +- ++ kernels/volk/volk_32u_reverse_32u.h | 598 ++++--- ++ kernels/volk/volk_64f_convert_32f.h | 324 ++-- ++ kernels/volk/volk_64f_x2_add_64f.h | 207 +-- ++ kernels/volk/volk_64f_x2_max_64f.h | 276 +-- ++ kernels/volk/volk_64f_x2_min_64f.h | 275 +-- ++ 
kernels/volk/volk_64f_x2_multiply_64f.h | 207 +-- ++ kernels/volk/volk_64u_byteswap.h | 599 ++++--- ++ kernels/volk/volk_64u_byteswappuppet_64u.h | 56 +- ++ kernels/volk/volk_64u_popcnt.h | 79 +- ++ kernels/volk/volk_64u_popcntpuppet_64u.h | 29 +- ++ kernels/volk/volk_8i_convert_16i.h | 315 ++-- ++ kernels/volk/volk_8i_s32f_convert_32f.h | 528 +++--- ++ kernels/volk/volk_8ic_deinterleave_16i_x2.h | 493 ++++-- ++ kernels/volk/volk_8ic_deinterleave_real_16i.h | 346 ++-- ++ kernels/volk/volk_8ic_deinterleave_real_8i.h | 482 +++-- ++ .../volk/volk_8ic_s32f_deinterleave_32f_x2.h | 571 +++--- ++ .../volk_8ic_s32f_deinterleave_real_32f.h | 395 +++-- ++ .../volk_8ic_x2_multiply_conjugate_16ic.h | 413 +++-- ++ ...volk_8ic_x2_s32f_multiply_conjugate_32fc.h | 496 +++--- ++ kernels/volk/volk_8u_conv_k7_r2puppet_8u.h | 494 +++--- ++ kernels/volk/volk_8u_x2_encodeframepolar_8u.h | 1569 +++++++++++------ ++ kernels/volk/volk_8u_x3_encodepolar_8u_x2.h | 110 +- ++ .../volk/volk_8u_x3_encodepolarpuppet_8u.h | 137 +- ++ kernels/volk/volk_8u_x4_conv_k7_r2_8u.h | 1067 +++++------ ++ lib/kernel_tests.h | 257 +-- ++ lib/qa_utils.cc | 751 +++++--- ++ lib/qa_utils.h | 288 +-- ++ lib/testqa.cc | 96 +- ++ lib/volk_malloc.c | 55 +- ++ lib/volk_prefs.c | 74 +- ++ lib/volk_rank_archs.c | 73 +- ++ lib/volk_rank_archs.h | 22 +- ++ 158 files changed, 32509 insertions(+), 27583 deletions(-) ++ create mode 100644 .clang-format ++ ++diff --git a/.clang-format b/.clang-format ++new file mode 100644 ++index 0000000..285b68d ++--- /dev/null +++++ b/.clang-format ++@@ -0,0 +1,106 @@ +++--- +++Language: Cpp +++# BasedOnStyle: LLVM +++AccessModifierOffset: -4 +++AlignAfterOpenBracket: Align +++AlignConsecutiveAssignments: false +++AlignConsecutiveDeclarations: false +++AlignEscapedNewlinesLeft: true +++AlignOperands: true +++AlignTrailingComments: true +++AllowAllParametersOfDeclarationOnNextLine: true +++AllowShortBlocksOnASingleLine: false +++AllowShortCaseLabelsOnASingleLine: false +++AllowShortFunctionsOnASingleLine: All +++AllowShortIfStatementsOnASingleLine: false +++AllowShortLoopsOnASingleLine: false +++AlwaysBreakAfterDefinitionReturnType: None +++AlwaysBreakAfterReturnType: None +++AlwaysBreakBeforeMultilineStrings: false +++AlwaysBreakTemplateDeclarations: true +++BinPackArguments: false +++BinPackParameters: false +++BreakBeforeBraces: Custom +++BraceWrapping: +++ AfterClass: true +++ AfterControlStatement: false +++ AfterEnum: false +++ AfterFunction: true +++ AfterNamespace: false +++ AfterObjCDeclaration: false +++ AfterStruct: false +++ AfterUnion: false +++ BeforeCatch: false +++ BeforeElse: false +++ IndentBraces: false +++BreakBeforeBinaryOperators: None +++BreakBeforeTernaryOperators: true +++BreakConstructorInitializersBeforeComma: false +++BreakAfterJavaFieldAnnotations: false +++BreakStringLiterals: true +++ColumnLimit: 90 +++CommentPragmas: '^ IWYU pragma:' +++ConstructorInitializerAllOnOneLineOrOnePerLine: true +++ConstructorInitializerIndentWidth: 4 +++ContinuationIndentWidth: 4 +++Cpp11BracedListStyle: false +++DerivePointerAlignment: false +++DisableFormat: false +++ExperimentalAutoDetectBinPacking: false +++ForEachMacros: +++ - foreach +++ - Q_FOREACH +++ - BOOST_FOREACH +++IncludeCategories: +++ - Regex: '^"(gnuradio)/' +++ Priority: 1 +++ - Regex: '^<(gnuradio)/' +++ Priority: 2 +++ - Regex: '^<(boost)/' +++ Priority: 98 +++ - Regex: '^<[a-z]*>$' +++ Priority: 99 +++ - Regex: '^".*"$' +++ Priority: 0 +++ - Regex: '.*' +++ Priority: 10 +++ +++IncludeIsMainRegex: '(Test)?$' +++IndentCaseLabels: false 
+++IndentWidth: 4 +++IndentWrappedFunctionNames: false +++JavaScriptQuotes: Leave +++JavaScriptWrapImports: true +++KeepEmptyLinesAtTheStartOfBlocks: true +++MacroBlockBegin: '' +++MacroBlockEnd: '' +++MaxEmptyLinesToKeep: 2 +++NamespaceIndentation: None +++ObjCBlockIndentWidth: 2 +++ObjCSpaceAfterProperty: false +++ObjCSpaceBeforeProtocolList: true +++PenaltyBreakBeforeFirstCallParameter: 19 +++PenaltyBreakComment: 300 +++PenaltyBreakFirstLessLess: 120 +++PenaltyBreakString: 1000 +++PenaltyExcessCharacter: 1000000 +++PenaltyReturnTypeOnItsOwnLine: 60 +++PointerAlignment: Left +++ReflowComments: true +++SortIncludes: true +++SpaceAfterCStyleCast: false +++SpaceAfterTemplateKeyword: true +++SpaceBeforeAssignmentOperators: true +++SpaceBeforeParens: ControlStatements +++SpaceInEmptyParentheses: false +++SpacesBeforeTrailingComments: 1 +++SpacesInAngles: false +++SpacesInContainerLiterals: true +++SpacesInCStyleCastParentheses: false +++SpacesInParentheses: false +++SpacesInSquareBrackets: false +++Standard: Cpp11 +++TabWidth: 8 +++UseTab: Never +++ +++ ++diff --git a/apps/volk-config-info.cc b/apps/volk-config-info.cc ++index 4eedcb7..2521993 100644 ++--- a/apps/volk-config-info.cc +++++ b/apps/volk-config-info.cc ++@@ -24,52 +24,63 @@ ++ #include ++ #endif ++ ++-#include // for volk_available_machines, volk_c_com... ++-#include // for operator<<, endl, cout, ostream ++-#include // for string +++#include // for volk_available_machines, volk_c_com... +++#include // for operator<<, endl, cout, ostream +++#include // for string ++ ++-#include "volk/volk.h" // for volk_get_alignment, volk_get_machine ++-#include "volk_option_helpers.h" // for option_list, option_t +++#include "volk/volk.h" // for volk_get_alignment, volk_get_machine +++#include "volk_option_helpers.h" // for option_list, option_t ++ ++ void print_alignment() ++ { ++- std::cout << "Alignment in bytes: " << volk_get_alignment() << std::endl; +++ std::cout << "Alignment in bytes: " << volk_get_alignment() << std::endl; ++ } ++ ++ void print_malloc() ++ { ++- // You don't want to change the volk_malloc code, so just copy the if/else ++- // structure from there and give an explanation for the implementations ++- std::cout << "Used malloc implementation: "; ++- #if HAVE_POSIX_MEMALIGN ++- std::cout << "posix_memalign" << std::endl; ++- #elif defined(_MSC_VER) ++- std::cout << "_aligned_malloc" << std::endl; ++- #else ++- std::cout << "C11 aligned_alloc" << std::endl; ++- #endif +++ // You don't want to change the volk_malloc code, so just copy the if/else +++ // structure from there and give an explanation for the implementations +++ std::cout << "Used malloc implementation: "; +++#if HAVE_POSIX_MEMALIGN +++ std::cout << "posix_memalign" << std::endl; +++#elif defined(_MSC_VER) +++ std::cout << "_aligned_malloc" << std::endl; +++#else +++ std::cout << "C11 aligned_alloc" << std::endl; +++#endif ++ } ++ ++ ++-int ++-main(int argc, char **argv) +++int main(int argc, char** argv) ++ { ++ ++- option_list our_options("volk-config-info"); ++- our_options.add(option_t("prefix", "", "print the VOLK installation prefix", volk_prefix())); ++- our_options.add(option_t("cc", "", "print the VOLK C compiler version", volk_c_compiler())); ++- our_options.add(option_t("cflags", "", "print the VOLK CFLAGS", volk_compiler_flags())); ++- our_options.add(option_t("all-machines", "", "print VOLK machines built", volk_available_machines())); ++- our_options.add(option_t("avail-machines", "", "print VOLK machines on the current " ++- "platform", 
volk_list_machines)); ++- our_options.add(option_t("machine", "", "print the current VOLK machine that will be used", ++- volk_get_machine())); ++- our_options.add(option_t("alignment", "", "print the memory alignment", print_alignment)); ++- our_options.add(option_t("malloc", "", "print the malloc implementation used in volk_malloc", ++- print_malloc)); ++- our_options.add(option_t("version", "v", "print the VOLK version", volk_version())); +++ option_list our_options("volk-config-info"); +++ our_options.add( +++ option_t("prefix", "", "print the VOLK installation prefix", volk_prefix())); +++ our_options.add( +++ option_t("cc", "", "print the VOLK C compiler version", volk_c_compiler())); +++ our_options.add( +++ option_t("cflags", "", "print the VOLK CFLAGS", volk_compiler_flags())); +++ our_options.add(option_t( +++ "all-machines", "", "print VOLK machines built", volk_available_machines())); +++ our_options.add(option_t("avail-machines", +++ "", +++ "print VOLK machines on the current " +++ "platform", +++ volk_list_machines)); +++ our_options.add(option_t("machine", +++ "", +++ "print the current VOLK machine that will be used", +++ volk_get_machine())); +++ our_options.add( +++ option_t("alignment", "", "print the memory alignment", print_alignment)); +++ our_options.add(option_t("malloc", +++ "", +++ "print the malloc implementation used in volk_malloc", +++ print_malloc)); +++ our_options.add(option_t("version", "v", "print the VOLK version", volk_version())); ++ ++- our_options.parse(argc, argv); +++ our_options.parse(argc, argv); ++ ++- return 0; +++ return 0; ++ } ++diff --git a/apps/volk_option_helpers.cc b/apps/volk_option_helpers.cc ++index 4299709..73d51da 100644 ++--- a/apps/volk_option_helpers.cc +++++ b/apps/volk_option_helpers.cc ++@@ -4,66 +4,97 @@ ++ ++ #include "volk_option_helpers.h" ++ ++-#include // for exception ++-#include // for operator<<, endl, basic_ostream, cout, ostream ++-#include // for pair ++-#include // IWYU pragma: keep ++-#include // IWYU pragma: keep ++-#include // IWYU pragma: keep +++#include // IWYU pragma: keep +++#include // IWYU pragma: keep +++#include // IWYU pragma: keep +++#include // for exception +++#include // for operator<<, endl, basic_ostream, cout, ostream +++#include // for pair ++ ++ /* ++ * Option type ++ */ ++-option_t::option_t(std::string longform, std::string shortform, std::string msg, void (*callback)()) ++- : longform("--" + longform), ++- shortform("-" + shortform), ++- msg(msg), ++- callback(callback) { option_type = VOID_CALLBACK; } ++- ++-option_t::option_t(std::string longform, std::string shortform, std::string msg, void (*callback)(int)) ++- : longform("--" + longform), ++- shortform("-" + shortform), ++- msg(msg), ++- callback((void (*)()) callback) { option_type = INT_CALLBACK; } ++- ++-option_t::option_t(std::string longform, std::string shortform, std::string msg, void (*callback)(float)) ++- : longform("--" + longform), ++- shortform("-" + shortform), ++- msg(msg), ++- callback((void (*)()) callback) { option_type = FLOAT_CALLBACK; } ++- ++-option_t::option_t(std::string longform, std::string shortform, std::string msg, void (*callback)(bool)) ++- : longform("--" + longform), ++- shortform("-" + shortform), ++- msg(msg), ++- callback((void (*)()) callback) { option_type = BOOL_CALLBACK; } ++- ++-option_t::option_t(std::string longform, std::string shortform, std::string msg, void (*callback)(std::string)) ++- : longform("--" + longform), ++- shortform("-" + shortform), ++- msg(msg), ++- callback((void (*)()) 
callback) { option_type = STRING_CALLBACK; } ++- ++-option_t::option_t(std::string longform, std::string shortform, std::string msg, std::string printval) ++- : longform("--" + longform), ++- shortform("-" + shortform), ++- msg(msg), ++- printval(printval) { option_type = STRING; } +++option_t::option_t(std::string longform, +++ std::string shortform, +++ std::string msg, +++ void (*callback)()) +++ : longform("--" + longform), shortform("-" + shortform), msg(msg), callback(callback) +++{ +++ option_type = VOID_CALLBACK; +++} +++ +++option_t::option_t(std::string longform, +++ std::string shortform, +++ std::string msg, +++ void (*callback)(int)) +++ : longform("--" + longform), +++ shortform("-" + shortform), +++ msg(msg), +++ callback((void (*)())callback) +++{ +++ option_type = INT_CALLBACK; +++} +++ +++option_t::option_t(std::string longform, +++ std::string shortform, +++ std::string msg, +++ void (*callback)(float)) +++ : longform("--" + longform), +++ shortform("-" + shortform), +++ msg(msg), +++ callback((void (*)())callback) +++{ +++ option_type = FLOAT_CALLBACK; +++} +++ +++option_t::option_t(std::string longform, +++ std::string shortform, +++ std::string msg, +++ void (*callback)(bool)) +++ : longform("--" + longform), +++ shortform("-" + shortform), +++ msg(msg), +++ callback((void (*)())callback) +++{ +++ option_type = BOOL_CALLBACK; +++} +++ +++option_t::option_t(std::string longform, +++ std::string shortform, +++ std::string msg, +++ void (*callback)(std::string)) +++ : longform("--" + longform), +++ shortform("-" + shortform), +++ msg(msg), +++ callback((void (*)())callback) +++{ +++ option_type = STRING_CALLBACK; +++} +++ +++option_t::option_t(std::string longform, +++ std::string shortform, +++ std::string msg, +++ std::string printval) +++ : longform("--" + longform), shortform("-" + shortform), msg(msg), printval(printval) +++{ +++ option_type = STRING; +++} ++ ++ ++ /* ++ * Option List ++ */ ++ ++-option_list::option_list(std::string program_name) : ++- program_name(program_name) { +++option_list::option_list(std::string program_name) : program_name(program_name) +++{ ++ internal_list = std::vector(); ++ } ++ ++ ++ void option_list::add(option_t opt) { internal_list.push_back(opt); } ++ ++-void option_list::parse(int argc, char **argv) { +++void option_list::parse(int argc, char** argv) +++{ ++ for (int arg_number = 0; arg_number < argc; ++arg_number) { ++ for (std::vector::iterator this_option = internal_list.begin(); ++ this_option != internal_list.end(); ++@@ -73,74 +104,83 @@ void option_list::parse(int argc, char **argv) { ++ this_option->shortform == std::string(argv[arg_number])) { ++ ++ if (present_options.count(this_option->longform) == 0) { ++- present_options.insert(std::pair(this_option->longform, 1)); +++ present_options.insert( +++ std::pair(this_option->longform, 1)); ++ } else { ++ present_options[this_option->longform] += 1; ++ } ++ switch (this_option->option_type) { ++- case VOID_CALLBACK: ++- this_option->callback(); ++- break; ++- case INT_CALLBACK: ++- try { ++- int_val = atoi(argv[++arg_number]); ++- ((void (*)(int)) this_option->callback)(int_val); ++- } catch (std::exception &exc) { ++- std::cout << "An int option can only receive a number" << std::endl; ++- throw std::exception(); ++- }; ++- break; ++- case FLOAT_CALLBACK: ++- try { ++- double double_val = atof(argv[++arg_number]); ++- ((void (*)(float)) this_option->callback)(double_val); ++- } catch (std::exception &exc) { ++- std::cout << "A float option can only receive a number" << 
std::endl; ++- throw std::exception(); ++- }; ++- break; ++- case BOOL_CALLBACK: ++- try { ++- if (arg_number == (argc - 1)) { // this is the last arg +++ case VOID_CALLBACK: +++ this_option->callback(); +++ break; +++ case INT_CALLBACK: +++ try { +++ int_val = atoi(argv[++arg_number]); +++ ((void (*)(int))this_option->callback)(int_val); +++ } catch (std::exception& exc) { +++ std::cout << "An int option can only receive a number" +++ << std::endl; +++ throw std::exception(); +++ }; +++ break; +++ case FLOAT_CALLBACK: +++ try { +++ double double_val = atof(argv[++arg_number]); +++ ((void (*)(float))this_option->callback)(double_val); +++ } catch (std::exception& exc) { +++ std::cout << "A float option can only receive a number" +++ << std::endl; +++ throw std::exception(); +++ }; +++ break; +++ case BOOL_CALLBACK: +++ try { +++ if (arg_number == (argc - 1)) { // this is the last arg +++ int_val = 1; +++ } else { // sneak a look at the next arg since it's present +++ char* next_arg = argv[arg_number + 1]; +++ if ((strncmp(next_arg, "-", 1) == 0) || +++ (strncmp(next_arg, "--", 2) == 0)) { +++ // the next arg is actually an arg, the bool is just +++ // present, set to true +++ int_val = 1; +++ } else if (strncmp(next_arg, "true", 4) == 0) { ++ int_val = 1; ++- } else { // sneak a look at the next arg since it's present ++- char *next_arg = argv[arg_number + 1]; ++- if ((strncmp(next_arg, "-", 1) == 0) || (strncmp(next_arg, "--", 2) == 0)) { ++- // the next arg is actually an arg, the bool is just present, set to true ++- int_val = 1; ++- } else if (strncmp(next_arg, "true", 4) == 0) { ++- int_val = 1; ++- } else if (strncmp(next_arg, "false", 5) == 0) { ++- int_val = 0; ++- } else { ++- // we got a number or a string. ++- // convert it to a number and depend on the catch to report an error condition ++- int_val = (bool) atoi(argv[++arg_number]); ++- } +++ } else if (strncmp(next_arg, "false", 5) == 0) { +++ int_val = 0; +++ } else { +++ // we got a number or a string. +++ // convert it to a number and depend on the catch to +++ // report an error condition +++ int_val = (bool)atoi(argv[++arg_number]); ++ } ++- } catch (std::exception &e) { ++- int_val = INT_MIN; ++- }; ++- if (int_val == INT_MIN) { ++- std::cout << "option: '" << argv[arg_number - 1] << "' -> received an unknown value. Boolean " ++- "options should receive one of '0', '1', 'true', 'false'." << std::endl; ++- throw std::exception(); ++- } else if (int_val) { ++- ((void (*)(bool)) this_option->callback)(int_val); ++ } ++- break; ++- case STRING_CALLBACK: ++- try { ++- ((void (*)(std::string)) this_option->callback)(argv[++arg_number]); ++- } catch (std::exception &exc) { ++- throw std::exception(); ++- }; ++- case STRING: ++- std::cout << this_option->printval << std::endl; ++- break; +++ } catch (std::exception& e) { +++ int_val = INT_MIN; +++ }; +++ if (int_val == INT_MIN) { +++ std::cout +++ << "option: '" << argv[arg_number - 1] +++ << "' -> received an unknown value. Boolean " +++ "options should receive one of '0', '1', 'true', 'false'." 
+++ << std::endl; +++ throw std::exception(); +++ } else if (int_val) { +++ ((void (*)(bool))this_option->callback)(int_val); +++ } +++ break; +++ case STRING_CALLBACK: +++ try { +++ ((void (*)(std::string))this_option->callback)( +++ argv[++arg_number]); +++ } catch (std::exception& exc) { +++ throw std::exception(); +++ }; +++ case STRING: +++ std::cout << this_option->printval << std::endl; +++ break; ++ } ++ } ++- ++ } ++ if (std::string("--help") == std::string(argv[arg_number]) || ++ std::string("-h") == std::string(argv[arg_number])) { ++@@ -150,7 +190,8 @@ void option_list::parse(int argc, char **argv) { ++ } ++ } ++ ++-bool option_list::present(std::string option_name) { +++bool option_list::present(std::string option_name) +++{ ++ if (present_options.count("--" + option_name)) { ++ return true; ++ } else { ++@@ -158,7 +199,8 @@ bool option_list::present(std::string option_name) { ++ } ++ } ++ ++-void option_list::help() { +++void option_list::help() +++{ ++ std::cout << program_name << std::endl; ++ std::cout << " -h [ --help ] \t\tdisplay this help message" << std::endl; ++ for (std::vector::iterator this_option = internal_list.begin(); ++@@ -172,14 +214,14 @@ void option_list::help() { ++ } ++ ++ switch (help_line.size() / 8) { ++- case 0: ++- help_line += "\t"; ++- case 1: ++- help_line += "\t"; ++- case 2: ++- help_line += "\t"; ++- case 3: ++- help_line += "\t"; +++ case 0: +++ help_line += "\t"; +++ case 1: +++ help_line += "\t"; +++ case 2: +++ help_line += "\t"; +++ case 3: +++ help_line += "\t"; ++ } ++ help_line += this_option->msg; ++ std::cout << help_line << std::endl; ++diff --git a/apps/volk_option_helpers.h b/apps/volk_option_helpers.h ++index 8a71547..0756caf 100644 ++--- a/apps/volk_option_helpers.h +++++ b/apps/volk_option_helpers.h ++@@ -5,56 +5,74 @@ ++ #ifndef VOLK_VOLK_OPTION_HELPERS_H ++ #define VOLK_VOLK_OPTION_HELPERS_H ++ ++-#include ++-#include ++ #include ++-#include +++#include ++ #include +++#include +++#include ++ ++-typedef enum ++-{ ++- VOID_CALLBACK, +++typedef enum { +++ VOID_CALLBACK, ++ INT_CALLBACK, ++ BOOL_CALLBACK, ++ STRING_CALLBACK, ++ FLOAT_CALLBACK, ++- STRING, +++ STRING, ++ } VOLK_OPTYPE; ++ ++-class option_t { ++- public: ++- option_t(std::string longform, std::string shortform, std::string msg, void (*callback)()); ++- option_t(std::string longform, std::string shortform, std::string msg, void (*callback)(int)); ++- option_t(std::string longform, std::string shortform, std::string msg, void (*callback)(float)); ++- option_t(std::string longform, std::string shortform, std::string msg, void (*callback)(bool)); ++- option_t(std::string longform, std::string shortform, std::string msg, void (*callback)(std::string)); ++- option_t(std::string longform, std::string shortform, std::string msg, std::string printval); ++- ++- std::string longform; ++- std::string shortform; ++- std::string msg; ++- VOLK_OPTYPE option_type; ++- std::string printval; ++- void (*callback)(); +++class option_t +++{ +++public: +++ option_t(std::string longform, +++ std::string shortform, +++ std::string msg, +++ void (*callback)()); +++ option_t(std::string longform, +++ std::string shortform, +++ std::string msg, +++ void (*callback)(int)); +++ option_t(std::string longform, +++ std::string shortform, +++ std::string msg, +++ void (*callback)(float)); +++ option_t(std::string longform, +++ std::string shortform, +++ std::string msg, +++ void (*callback)(bool)); +++ option_t(std::string longform, +++ std::string shortform, +++ std::string msg, +++ void 
(*callback)(std::string)); +++ option_t(std::string longform, +++ std::string shortform, +++ std::string msg, +++ std::string printval); ++ +++ std::string longform; +++ std::string shortform; +++ std::string msg; +++ VOLK_OPTYPE option_type; +++ std::string printval; +++ void (*callback)(); ++ }; ++ ++ class option_list ++ { ++- public: ++- option_list(std::string program_name); ++- bool present(std::string option_name); +++public: +++ option_list(std::string program_name); +++ bool present(std::string option_name); +++ +++ void add(option_t opt); ++ ++- void add(option_t opt); +++ void parse(int argc, char** argv); ++ ++- void parse(int argc, char **argv); +++ void help(); ++ ++- void help(); ++- private: ++- std::string program_name; ++- std::vector internal_list; ++- std::map present_options; +++private: +++ std::string program_name; +++ std::vector internal_list; +++ std::map present_options; ++ }; ++ ++ ++-#endif //VOLK_VOLK_OPTION_HELPERS_H +++#endif // VOLK_VOLK_OPTION_HELPERS_H ++diff --git a/apps/volk_profile.cc b/apps/volk_profile.cc ++index 4ef5aeb..3c2e324 100644 ++--- a/apps/volk_profile.cc +++++ b/apps/volk_profile.cc ++@@ -27,23 +27,23 @@ ++ #include ++ #endif ++ #else ++-#include // for create_directories, exists ++-#include // for path, operator<< ++-#include // for filesystem +++#include // for create_directories, exists +++#include // for path, operator<< +++#include // for filesystem ++ #endif ++-#include // for size_t ++-#include // for stat ++-#include // for volk_get_config_path ++-#include // for operator<<, basic_ostream ++-#include // IWYU pragma: keep ++-#include // for map, map<>::iterator ++-#include // for pair ++-#include // for vector, vector<>::const_... ++- ++-#include "kernel_tests.h" // for init_test_list ++-#include "qa_utils.h" // for volk_test_results_t, vol... ++-#include "volk/volk_complex.h" // for lv_32fc_t ++-#include "volk_option_helpers.h" // for option_list, option_t +++#include // for size_t +++#include // for stat +++#include // for volk_get_config_path +++#include // IWYU pragma: keep +++#include // for operator<<, basic_ostream +++#include // for map, map<>::iterator +++#include // for pair +++#include // for vector, vector<>::const_... +++ +++#include "kernel_tests.h" // for init_test_list +++#include "qa_utils.h" // for volk_test_results_t, vol... +++#include "volk/volk_complex.h" // for lv_32fc_t +++#include "volk_option_helpers.h" // for option_list, option_t ++ #include "volk_profile.h" ++ ++ #if HAS_STD_FILESYSTEM ++@@ -72,45 +72,61 @@ void set_json(std::string val) { json_filename = val; } ++ std::string volk_config_path(""); ++ void set_volk_config(std::string val) { volk_config_path = val; } ++ ++-int main(int argc, char *argv[]) { +++int main(int argc, char* argv[]) +++{ ++ ++ option_list profile_options("volk_profile"); ++- profile_options.add(option_t("benchmark", "b", "Run all kernels (benchmark mode)", set_benchmark)); ++- profile_options.add(option_t("tol", "t", "Set the default tolerance for all tests", set_tolerance)); ++- profile_options.add(option_t("vlen", "v", "Set the default vector length for tests", set_vlen)); ++- profile_options.add((option_t("iter", "i", "Set the default number of test iterations per kernel", set_iter))); ++- profile_options.add((option_t("tests-substr", "R", "Run tests matching substring", set_substr))); ++- profile_options.add((option_t("update", "u", "Run only kernels missing from config", set_update))); ++- profile_options.add((option_t("dry-run", "n", "Dry run. 
Respect other options, but don't write to file", set_dryrun))); ++- profile_options.add((option_t("json", "j", "Write results to JSON file named as argument value", set_json))); ++- profile_options.add((option_t("path", "p", "Specify the volk_config path", set_volk_config))); +++ profile_options.add( +++ option_t("benchmark", "b", "Run all kernels (benchmark mode)", set_benchmark)); +++ profile_options.add( +++ option_t("tol", "t", "Set the default tolerance for all tests", set_tolerance)); +++ profile_options.add( +++ option_t("vlen", "v", "Set the default vector length for tests", set_vlen)); +++ profile_options.add((option_t( +++ "iter", "i", "Set the default number of test iterations per kernel", set_iter))); +++ profile_options.add( +++ (option_t("tests-substr", "R", "Run tests matching substring", set_substr))); +++ profile_options.add( +++ (option_t("update", "u", "Run only kernels missing from config", set_update))); +++ profile_options.add( +++ (option_t("dry-run", +++ "n", +++ "Dry run. Respect other options, but don't write to file", +++ set_dryrun))); +++ profile_options.add((option_t( +++ "json", "j", "Write results to JSON file named as argument value", set_json))); +++ profile_options.add( +++ (option_t("path", "p", "Specify the volk_config path", set_volk_config))); ++ profile_options.parse(argc, argv); ++ ++ if (profile_options.present("help")) { ++ return 0; ++ } ++ ++- if(dry_run) { ++- std::cout << "Warning: this IS a dry-run. Config will not be written!" << std::endl; +++ if (dry_run) { +++ std::cout << "Warning: this IS a dry-run. Config will not be written!" +++ << std::endl; ++ } ++ ++ // Adding program options ++ std::ofstream json_file; ++ std::string config_file; ++ ++- if ( json_filename != "" ) { ++- json_file.open( json_filename.c_str() ); +++ if (json_filename != "") { +++ json_file.open(json_filename.c_str()); ++ } ++ ++- if ( volk_config_path != "" ) { +++ if (volk_config_path != "") { ++ config_file = volk_config_path + "/volk_config"; ++ } ++ ++ // Run tests ++ std::vector results; ++- if(update_mode) { ++- if( config_file != "" ) read_results(&results, config_file); ++- else read_results(&results); +++ if (update_mode) { +++ if (config_file != "") +++ read_results(&results, config_file); +++ else +++ read_results(&results); ++ } ++ ++ // Initialize the list of tests ++@@ -118,22 +134,22 @@ int main(int argc, char *argv[]) { ++ ++ // Iterate through list of tests running each one ++ std::string substr_to_match(test_params.kernel_regex()); ++- for(unsigned int ii = 0; ii < test_cases.size(); ++ii) { +++ for (unsigned int ii = 0; ii < test_cases.size(); ++ii) { ++ bool regex_match = true; ++ ++ volk_test_case_t test_case = test_cases[ii]; ++ // if the kernel name matches regex then do the test ++ std::string test_case_name = test_case.name(); ++- if(test_case_name.find(substr_to_match) == std::string::npos) { +++ if (test_case_name.find(substr_to_match) == std::string::npos) { ++ regex_match = false; ++ } ++ ++ // if we are in update mode check if we've already got results ++ // if we have any, then no need to test that kernel ++ bool update = true; ++- if(update_mode) { ++- for(unsigned int jj=0; jj < results.size(); ++jj) { ++- if(results[jj].name == test_case.name() || +++ if (update_mode) { +++ for (unsigned int jj = 0; jj < results.size(); ++jj) { +++ if (results[jj].name == test_case.name() || ++ results[jj].name == test_case.puppet_master_name()) { ++ update = false; ++ break; ++@@ -141,39 +157,44 @@ int main(int argc, char *argv[]) { ++ } ++ } ++ 
++- if( regex_match && update ) { +++ if (regex_match && update) { ++ try { ++- run_volk_tests(test_case.desc(), test_case.kernel_ptr(), test_case.name(), ++- test_case.test_parameters(), &results, test_case.puppet_master_name()); ++- } ++- catch (std::string &error) { ++- std::cerr << "Caught Exception in 'run_volk_tests': " << error << std::endl; +++ run_volk_tests(test_case.desc(), +++ test_case.kernel_ptr(), +++ test_case.name(), +++ test_case.test_parameters(), +++ &results, +++ test_case.puppet_master_name()); +++ } catch (std::string& error) { +++ std::cerr << "Caught Exception in 'run_volk_tests': " << error +++ << std::endl; ++ } ++ } ++ } ++ ++ ++ // Output results according to provided options ++- if(json_filename != "") { +++ if (json_filename != "") { ++ write_json(json_file, results); ++ json_file.close(); ++ } ++ ++- if(!dry_run) { ++- if(config_file != "") write_results(&results, false, config_file); ++- else write_results(&results, false); ++- } ++- else { +++ if (!dry_run) { +++ if (config_file != "") +++ write_results(&results, false, config_file); +++ else +++ write_results(&results, false); +++ } else { ++ std::cout << "Warning: this was a dry-run. Config not generated" << std::endl; ++ } ++ return 0; ++ } ++ ++-void read_results(std::vector *results) +++void read_results(std::vector* results) ++ { ++ char path[1024]; ++ volk_get_config_path(path, true); ++- if(path[0] == 0){ +++ if (path[0] == 0) { ++ std::cout << "No prior test results found ..." << std::endl; ++ return; ++ } ++@@ -181,16 +202,16 @@ void read_results(std::vector *results) ++ read_results(results, std::string(path)); ++ } ++ ++-void read_results(std::vector *results, std::string path) +++void read_results(std::vector* results, std::string path) ++ { ++ struct stat buffer; ++- bool config_status = (stat (path.c_str(), &buffer) == 0); +++ bool config_status = (stat(path.c_str(), &buffer) == 0); ++ ++- if( config_status ) { +++ if (config_status) { ++ // a config exists and we are reading results from it ++ std::ifstream config(path.c_str()); ++ char config_line[256]; ++- while(config.getline(config_line, 255)) { +++ while (config.getline(config_line, 255)) { ++ // tokenize the input line by kernel_name unaligned aligned ++ // then push back in the results vector with fields filled in ++ ++@@ -198,26 +219,26 @@ void read_results(std::vector *results, std::string path) ++ std::string config_str(config_line); ++ std::size_t str_size = config_str.size(); ++ std::size_t found = config_str.find(' '); ++- +++ ++ // Split line by spaces ++- while(found && found < str_size) { +++ while (found && found < str_size) { ++ found = config_str.find(' '); ++ // kernel names MUST be less than 128 chars, which is ++ // a length restricted by volk/volk_prefs.c ++ // on the last token in the parsed string we won't find a space ++ // so make sure we copy at most 128 chars. 
++- if(found > 127) { +++ if (found > 127) { ++ found = 127; ++ } ++ str_size = config_str.size(); ++- char buffer[128] = {'\0'}; +++ char buffer[128] = { '\0' }; ++ config_str.copy(buffer, found + 1, 0); ++ buffer[found] = '\0'; ++ single_kernel_result.push_back(std::string(buffer)); ++- config_str.erase(0, found+1); +++ config_str.erase(0, found + 1); ++ } ++ ++- if(single_kernel_result.size() == 3) { +++ if (single_kernel_result.size() == 3) { ++ volk_test_results_t kernel_result; ++ kernel_result.name = std::string(single_kernel_result[0]); ++ kernel_result.config_name = std::string(single_kernel_result[0]); ++@@ -229,45 +250,47 @@ void read_results(std::vector *results, std::string path) ++ } ++ } ++ ++-void write_results(const std::vector *results, bool update_result) +++void write_results(const std::vector* results, bool update_result) ++ { ++ char path[1024]; ++ volk_get_config_path(path, false); ++- if(path[0] == 0){ +++ if (path[0] == 0) { ++ std::cout << "Aborting 'No config save path found' ..." << std::endl; ++ return; ++ } ++ ++- write_results( results, update_result, std::string(path)); +++ write_results(results, update_result, std::string(path)); ++ } ++ ++-void write_results(const std::vector *results, bool update_result, const std::string path) +++void write_results(const std::vector* results, +++ bool update_result, +++ const std::string path) ++ { ++-// struct stat buffer; ++-// bool config_status = (stat (path.c_str(), &buffer) == 0); +++ // struct stat buffer; +++ // bool config_status = (stat (path.c_str(), &buffer) == 0); ++ ++ /* ++ * These ++ */ ++ const fs::path config_path(path); ++- if (! fs::exists(config_path.parent_path())) ++- { +++ if (!fs::exists(config_path.parent_path())) { ++ std::cout << "Creating " << config_path.parent_path() << "..." << std::endl; ++ fs::create_directories(config_path.parent_path()); ++ } ++ ++ std::ofstream config; ++- if(update_result) { +++ if (update_result) { ++ std::cout << "Updating " << path << "..." << std::endl; ++ config.open(path.c_str(), std::ofstream::app); ++- if (!config.is_open()) { //either we don't have write access or we don't have the dir yet +++ if (!config.is_open()) { // either we don't have write access or we don't have the +++ // dir yet ++ std::cout << "Error opening file " << path << std::endl; ++ } ++- } ++- else { +++ } else { ++ std::cout << "Writing " << path << "..." 
<< std::endl; ++ config.open(path.c_str()); ++- if (!config.is_open()) { //either we don't have write access or we don't have the dir yet +++ if (!config.is_open()) { // either we don't have write access or we don't have the +++ // dir yet ++ std::cout << "Error opening file " << path << std::endl; ++ } ++ ++@@ -278,43 +301,45 @@ void write_results(const std::vector *results, bool update_ ++ } ++ ++ std::vector::const_iterator profile_results; ++- for(profile_results = results->begin(); profile_results != results->end(); ++profile_results) { ++- config << profile_results->config_name << " " ++- << profile_results->best_arch_a << " " ++- << profile_results->best_arch_u << std::endl; +++ for (profile_results = results->begin(); profile_results != results->end(); +++ ++profile_results) { +++ config << profile_results->config_name << " " << profile_results->best_arch_a +++ << " " << profile_results->best_arch_u << std::endl; ++ } ++ config.close(); ++ } ++ ++-void write_json(std::ofstream &json_file, std::vector results) +++void write_json(std::ofstream& json_file, std::vector results) ++ { ++ json_file << "{" << std::endl; ++ json_file << " \"volk_tests\": [" << std::endl; ++ size_t len = results.size(); ++ size_t i = 0; ++ std::vector::iterator result; ++- for(result = results.begin(); result != results.end(); ++result) { +++ for (result = results.begin(); result != results.end(); ++result) { ++ json_file << " {" << std::endl; ++ json_file << " \"name\": \"" << result->name << "\"," << std::endl; ++ json_file << " \"vlen\": " << (int)(result->vlen) << "," << std::endl; ++ json_file << " \"iter\": " << result->iter << "," << std::endl; ++- json_file << " \"best_arch_a\": \"" << result->best_arch_a ++- << "\"," << std::endl; ++- json_file << " \"best_arch_u\": \"" << result->best_arch_u ++- << "\"," << std::endl; +++ json_file << " \"best_arch_a\": \"" << result->best_arch_a << "\"," +++ << std::endl; +++ json_file << " \"best_arch_u\": \"" << result->best_arch_u << "\"," +++ << std::endl; ++ json_file << " \"results\": {" << std::endl; ++ size_t results_len = result->results.size(); ++ size_t ri = 0; ++ ++ std::map::iterator kernel_time_pair; ++- for(kernel_time_pair = result->results.begin(); kernel_time_pair != result->results.end(); ++kernel_time_pair) { +++ for (kernel_time_pair = result->results.begin(); +++ kernel_time_pair != result->results.end(); +++ ++kernel_time_pair) { ++ volk_test_time_t time = kernel_time_pair->second; ++ json_file << " \"" << time.name << "\": {" << std::endl; ++ json_file << " \"name\": \"" << time.name << "\"," << std::endl; ++ json_file << " \"time\": " << time.time << "," << std::endl; ++ json_file << " \"units\": \"" << time.units << "\"" << std::endl; ++- json_file << " }" ; ++- if(ri+1 != results_len) { +++ json_file << " }"; +++ if (ri + 1 != results_len) { ++ json_file << ","; ++ } ++ json_file << std::endl; ++@@ -322,7 +347,7 @@ void write_json(std::ofstream &json_file, std::vector resul ++ } ++ json_file << " }" << std::endl; ++ json_file << " }"; ++- if(i+1 != len) { +++ if (i + 1 != len) { ++ json_file << ","; ++ } ++ json_file << std::endl; ++diff --git a/apps/volk_profile.h b/apps/volk_profile.h ++index 51629ab..ae3b474 100644 ++--- a/apps/volk_profile.h +++++ b/apps/volk_profile.h ++@@ -1,14 +1,16 @@ ++ ++ ++-#include // for bool ++-#include // for ofstream ++-#include // for string ++-#include // for vector +++#include // for bool +++#include // for ofstream +++#include // for string +++#include // for vector ++ ++ class volk_test_results_t; ++ 
++-void read_results(std::vector *results); ++-void read_results(std::vector *results, std::string path); ++-void write_results(const std::vector *results, bool update_result); ++-void write_results(const std::vector *results, bool update_result, const std::string path); ++-void write_json(std::ofstream &json_file, std::vector results); +++void read_results(std::vector* results); +++void read_results(std::vector* results, std::string path); +++void write_results(const std::vector* results, bool update_result); +++void write_results(const std::vector* results, +++ bool update_result, +++ const std::string path); +++void write_json(std::ofstream& json_file, std::vector results); ++diff --git a/cmake/msvc/config.h b/cmake/msvc/config.h ++index 8b12c2a..68f716e 100644 ++--- a/cmake/msvc/config.h +++++ b/cmake/msvc/config.h ++@@ -9,7 +9,7 @@ ++ // enable inline functions for C code ++ //////////////////////////////////////////////////////////////////////// ++ #ifndef __cplusplus ++-# define inline __inline +++#define inline __inline ++ #endif ++ ++ //////////////////////////////////////////////////////////////////////// ++@@ -23,12 +23,21 @@ typedef ptrdiff_t ssize_t; ++ //////////////////////////////////////////////////////////////////////// ++ #if _MSC_VER < 1800 ++ #include ++-static inline long lrint(double x){return (long)(x > 0.0 ? x + 0.5 : x - 0.5);} ++-static inline long lrintf(float x){return (long)(x > 0.0f ? x + 0.5f : x - 0.5f);} ++-static inline long long llrint(double x){return (long long)(x > 0.0 ? x + 0.5 : x - 0.5);} ++-static inline long long llrintf(float x){return (long long)(x > 0.0f ? x + 0.5f : x - 0.5f);} ++-static inline double rint(double x){return (x > 0.0)? floor(x + 0.5) : ceil(x - 0.5);} ++-static inline float rintf(float x){return (x > 0.0f)? floorf(x + 0.5f) : ceilf(x - 0.5f);} +++static inline long lrint(double x) { return (long)(x > 0.0 ? x + 0.5 : x - 0.5); } +++static inline long lrintf(float x) { return (long)(x > 0.0f ? x + 0.5f : x - 0.5f); } +++static inline long long llrint(double x) +++{ +++ return (long long)(x > 0.0 ? x + 0.5 : x - 0.5); +++} +++static inline long long llrintf(float x) +++{ +++ return (long long)(x > 0.0f ? x + 0.5f : x - 0.5f); +++} +++static inline double rint(double x) { return (x > 0.0) ? floor(x + 0.5) : ceil(x - 0.5); } +++static inline float rintf(float x) +++{ +++ return (x > 0.0f) ? floorf(x + 0.5f) : ceilf(x - 0.5f); +++} ++ #endif ++ ++ //////////////////////////////////////////////////////////////////////// ++@@ -43,7 +52,7 @@ static inline float rintf(float x){return (x > 0.0f)? floorf(x + 0.5f) : ceilf(x ++ // random and srandom ++ //////////////////////////////////////////////////////////////////////// ++ #include ++-static inline long int random (void) { return rand(); } ++-static inline void srandom (unsigned int seed) { srand(seed); } +++static inline long int random(void) { return rand(); } +++static inline void srandom(unsigned int seed) { srand(seed); } ++ ++ #endif // _MSC_CONFIG_H_ ] ++diff --git a/cmake/msvc/sys/time.h b/cmake/msvc/sys/time.h ++index aa0f5dc..4bda1ba 100644 ++--- a/cmake/msvc/sys/time.h +++++ b/cmake/msvc/sys/time.h ++@@ -10,67 +10,62 @@ ++ #define NOMINMAX ++ #endif ++ ++-//http://social.msdn.microsoft.com/Forums/en/vcgeneral/thread/430449b3-f6dd-4e18-84de-eebd26a8d668 +++// http://social.msdn.microsoft.com/Forums/en/vcgeneral/thread/430449b3-f6dd-4e18-84de-eebd26a8d668 ++ #include < time.h > ++ #include //I've omitted this line. 
++ #if defined(_MSC_VER) || defined(_MSC_EXTENSIONS) ++- #define DELTA_EPOCH_IN_MICROSECS 11644473600000000Ui64 +++#define DELTA_EPOCH_IN_MICROSECS 11644473600000000Ui64 ++ #else ++- #define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL +++#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL ++ #endif ++ ++ #if _MSC_VER < 1900 ++ struct timespec { ++ ++-time_t tv_sec; /* Seconds since 00:00:00 GMT, */ +++ time_t tv_sec; /* Seconds since 00:00:00 GMT, */ ++ ++-/* 1 January 1970 */ +++ /* 1 January 1970 */ ++ ++-long tv_nsec; /* Additional nanoseconds since */ ++- ++-/* tv_sec */ +++ long tv_nsec; /* Additional nanoseconds since */ ++ +++ /* tv_sec */ ++ }; ++ #endif ++ ++-struct timezone ++-{ ++- int tz_minuteswest; /* minutes W of Greenwich */ ++- int tz_dsttime; /* type of dst correction */ +++struct timezone { +++ int tz_minuteswest; /* minutes W of Greenwich */ +++ int tz_dsttime; /* type of dst correction */ ++ }; ++ ++-static inline int gettimeofday(struct timeval *tv, struct timezone *tz) +++static inline int gettimeofday(struct timeval* tv, struct timezone* tz) ++ { ++- FILETIME ft; ++- unsigned __int64 tmpres = 0; ++- static int tzflag; ++- ++- if (NULL != tv) ++- { ++- GetSystemTimeAsFileTime(&ft); ++- ++- tmpres |= ft.dwHighDateTime; ++- tmpres <<= 32; ++- tmpres |= ft.dwLowDateTime; ++- ++- /*converting file time to unix epoch*/ ++- tmpres -= DELTA_EPOCH_IN_MICROSECS; ++- tv->tv_sec = (long)(tmpres / 1000000UL); ++- tv->tv_usec = (long)(tmpres % 1000000UL); ++- } ++- ++- if (NULL != tz) ++- { ++- if (!tzflag) ++- { ++- _tzset(); ++- tzflag++; +++ FILETIME ft; +++ unsigned __int64 tmpres = 0; +++ static int tzflag; +++ +++ if (NULL != tv) { +++ GetSystemTimeAsFileTime(&ft); +++ +++ tmpres |= ft.dwHighDateTime; +++ tmpres <<= 32; +++ tmpres |= ft.dwLowDateTime; +++ +++ /*converting file time to unix epoch*/ +++ tmpres -= DELTA_EPOCH_IN_MICROSECS; +++ tv->tv_sec = (long)(tmpres / 1000000UL); +++ tv->tv_usec = (long)(tmpres % 1000000UL); +++ } +++ +++ if (NULL != tz) { +++ if (!tzflag) { +++ _tzset(); +++ tzflag++; +++ } +++ tz->tz_minuteswest = _timezone / 60; +++ tz->tz_dsttime = _daylight; ++ } ++- tz->tz_minuteswest = _timezone / 60; ++- tz->tz_dsttime = _daylight; ++- } ++ ++- return 0; +++ return 0; ++ } ++ ++ #endif //_MSC_SYS_TIME_H_ ++diff --git a/include/volk/saturation_arithmetic.h b/include/volk/saturation_arithmetic.h ++index 0886844..7b95ba2 100644 ++--- a/include/volk/saturation_arithmetic.h +++++ b/include/volk/saturation_arithmetic.h ++@@ -28,20 +28,24 @@ ++ ++ static inline int16_t sat_adds16i(int16_t x, int16_t y) ++ { ++- int32_t res = (int32_t) x + (int32_t) y; +++ int32_t res = (int32_t)x + (int32_t)y; ++ ++- if (res < SHRT_MIN) res = SHRT_MIN; ++- if (res > SHRT_MAX) res = SHRT_MAX; +++ if (res < SHRT_MIN) +++ res = SHRT_MIN; +++ if (res > SHRT_MAX) +++ res = SHRT_MAX; ++ ++ return res; ++ } ++ ++ static inline int16_t sat_muls16i(int16_t x, int16_t y) ++ { ++- int32_t res = (int32_t) x * (int32_t) y; +++ int32_t res = (int32_t)x * (int32_t)y; ++ ++- if (res < SHRT_MIN) res = SHRT_MIN; ++- if (res > SHRT_MAX) res = SHRT_MAX; +++ if (res < SHRT_MIN) +++ res = SHRT_MIN; +++ if (res > SHRT_MAX) +++ res = SHRT_MAX; ++ ++ return res; ++ } ++diff --git a/include/volk/volk_alloc.hh b/include/volk/volk_alloc.hh ++index a2975da..44bcfaf 100644 ++--- a/include/volk/volk_alloc.hh +++++ b/include/volk/volk_alloc.hh ++@@ -40,30 +40,40 @@ namespace volk { ++ */ ++ template ++ struct alloc { ++- typedef T value_type; +++ typedef T value_type; ++ ++- alloc() = default; +++ 
alloc() = default; ++ ++- template constexpr alloc(alloc const&) noexcept {} +++ template +++ constexpr alloc(alloc const&) noexcept +++ { +++ } ++ ++- T* allocate(std::size_t n) { ++- if (n > std::numeric_limits::max() / sizeof(T)) throw std::bad_alloc(); +++ T* allocate(std::size_t n) +++ { +++ if (n > std::numeric_limits::max() / sizeof(T)) +++ throw std::bad_alloc(); ++ ++- if (auto p = static_cast(volk_malloc(n*sizeof(T), volk_get_alignment()))) ++- return p; +++ if (auto p = static_cast(volk_malloc(n * sizeof(T), volk_get_alignment()))) +++ return p; ++ ++- throw std::bad_alloc(); ++- } +++ throw std::bad_alloc(); +++ } ++ ++- void deallocate(T* p, std::size_t) noexcept { volk_free(p); } ++- ++-} ; +++ void deallocate(T* p, std::size_t) noexcept { volk_free(p); } +++}; ++ ++ template ++-bool operator==(alloc const&, alloc const&) { return true; } +++bool operator==(alloc const&, alloc const&) +++{ +++ return true; +++} ++ ++ template ++-bool operator!=(alloc const&, alloc const&) { return false; } +++bool operator!=(alloc const&, alloc const&) +++{ +++ return false; +++} ++ ++ ++ /*! ++@@ -73,8 +83,8 @@ bool operator!=(alloc const&, alloc const&) { return false; } ++ * example code: ++ * volk::vector v(100); // vector using volk_malloc, volk_free ++ */ ++-template ++-using vector = std::vector >; +++template +++using vector = std::vector>; ++ ++ } // namespace volk ++ #endif // INCLUDED_VOLK_ALLOC_H ++diff --git a/include/volk/volk_avx2_intrinsics.h b/include/volk/volk_avx2_intrinsics.h ++index 17badc4..00f3b52 100644 ++--- a/include/volk/volk_avx2_intrinsics.h +++++ b/include/volk/volk_avx2_intrinsics.h ++@@ -1,19 +1,19 @@ ++ /* -*- c++ -*- */ ++-/* +++/* ++ * Copyright 2015 Free Software Foundation, Inc. ++- * +++ * ++ * This file is part of GNU Radio ++- * +++ * ++ * GNU Radio is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 3, or (at your option) ++ * any later version. ++- * +++ * ++ * GNU Radio is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++- * +++ * ++ * You should have received a copy of the GNU General Public License ++ * along with GNU Radio; see the file COPYING. 
If not, write to ++ * the Free Software Foundation, Inc., 51 Franklin Street, ++@@ -27,28 +27,59 @@ ++ ++ #ifndef INCLUDE_VOLK_VOLK_AVX2_INTRINSICS_H_ ++ #define INCLUDE_VOLK_VOLK_AVX2_INTRINSICS_H_ ++-#include ++ #include "volk/volk_avx_intrinsics.h" +++#include ++ ++-static inline __m256 ++-_mm256_polar_sign_mask_avx2(__m128i fbits){ ++- const __m128i zeros = _mm_set1_epi8(0x00); ++- const __m128i sign_extract = _mm_set1_epi8(0x80); ++- const __m256i shuffle_mask = _mm256_setr_epi8(0xff, 0xff, 0xff, 0x00, 0xff, 0xff, 0xff, 0x01, 0xff, 0xff, 0xff, 0x02, 0xff, 0xff, 0xff, 0x03, ++- 0xff, 0xff, 0xff, 0x04, 0xff, 0xff, 0xff, 0x05, 0xff, 0xff, 0xff, 0x06, 0xff, 0xff, 0xff, 0x07); ++- __m256i sign_bits = _mm256_setzero_si256(); ++- ++- fbits = _mm_cmpgt_epi8(fbits, zeros); ++- fbits = _mm_and_si128(fbits, sign_extract); ++- sign_bits = _mm256_insertf128_si256(sign_bits,fbits,0); ++- sign_bits = _mm256_insertf128_si256(sign_bits,fbits,1); ++- sign_bits = _mm256_shuffle_epi8(sign_bits, shuffle_mask); +++static inline __m256 _mm256_polar_sign_mask_avx2(__m128i fbits) +++{ +++ const __m128i zeros = _mm_set1_epi8(0x00); +++ const __m128i sign_extract = _mm_set1_epi8(0x80); +++ const __m256i shuffle_mask = _mm256_setr_epi8(0xff, +++ 0xff, +++ 0xff, +++ 0x00, +++ 0xff, +++ 0xff, +++ 0xff, +++ 0x01, +++ 0xff, +++ 0xff, +++ 0xff, +++ 0x02, +++ 0xff, +++ 0xff, +++ 0xff, +++ 0x03, +++ 0xff, +++ 0xff, +++ 0xff, +++ 0x04, +++ 0xff, +++ 0xff, +++ 0xff, +++ 0x05, +++ 0xff, +++ 0xff, +++ 0xff, +++ 0x06, +++ 0xff, +++ 0xff, +++ 0xff, +++ 0x07); +++ __m256i sign_bits = _mm256_setzero_si256(); ++ ++- return _mm256_castsi256_ps(sign_bits); +++ fbits = _mm_cmpgt_epi8(fbits, zeros); +++ fbits = _mm_and_si128(fbits, sign_extract); +++ sign_bits = _mm256_insertf128_si256(sign_bits, fbits, 0); +++ sign_bits = _mm256_insertf128_si256(sign_bits, fbits, 1); +++ sign_bits = _mm256_shuffle_epi8(sign_bits, shuffle_mask); +++ +++ return _mm256_castsi256_ps(sign_bits); ++ } ++ ++ static inline __m256 ++-_mm256_polar_fsign_add_llrs_avx2(__m256 src0, __m256 src1, __m128i fbits){ +++_mm256_polar_fsign_add_llrs_avx2(__m256 src0, __m256 src1, __m128i fbits) +++{ ++ // prepare sign mask for correct +- ++ __m256 sign_mask = _mm256_polar_sign_mask_avx2(fbits); ++ ++@@ -61,26 +92,31 @@ _mm256_polar_fsign_add_llrs_avx2(__m256 src0, __m256 src1, __m128i fbits){ ++ return dst; ++ } ++ ++-static inline __m256 ++-_mm256_magnitudesquared_ps_avx2(const __m256 cplxValue0, const __m256 cplxValue1){ ++- const __m256i idx = _mm256_set_epi32(7,6,3,2,5,4,1,0); ++- const __m256 squared0 = _mm256_mul_ps(cplxValue0, cplxValue0); // Square the values ++- const __m256 squared1 = _mm256_mul_ps(cplxValue1, cplxValue1); // Square the Values ++- const __m256 complex_result = _mm256_hadd_ps(squared0, squared1); ++- return _mm256_permutevar8x32_ps(complex_result, idx); +++static inline __m256 _mm256_magnitudesquared_ps_avx2(const __m256 cplxValue0, +++ const __m256 cplxValue1) +++{ +++ const __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0); +++ const __m256 squared0 = _mm256_mul_ps(cplxValue0, cplxValue0); // Square the values +++ const __m256 squared1 = _mm256_mul_ps(cplxValue1, cplxValue1); // Square the Values +++ const __m256 complex_result = _mm256_hadd_ps(squared0, squared1); +++ return _mm256_permutevar8x32_ps(complex_result, idx); ++ } ++ ++-static inline __m256 ++-_mm256_scaled_norm_dist_ps_avx2(const __m256 symbols0, const __m256 symbols1, const __m256 points0, const __m256 points1, const __m256 scalar){ ++- /* ++- * Calculate: |y - x|^2 * 
SNR_lin ++- * Consider 'symbolsX' and 'pointsX' to be complex float ++- * 'symbolsX' are 'y' and 'pointsX' are 'x' ++- */ ++- const __m256 diff0 = _mm256_sub_ps(symbols0, points0); ++- const __m256 diff1 = _mm256_sub_ps(symbols1, points1); ++- const __m256 norms = _mm256_magnitudesquared_ps_avx2(diff0, diff1); ++- return _mm256_mul_ps(norms, scalar); +++static inline __m256 _mm256_scaled_norm_dist_ps_avx2(const __m256 symbols0, +++ const __m256 symbols1, +++ const __m256 points0, +++ const __m256 points1, +++ const __m256 scalar) +++{ +++ /* +++ * Calculate: |y - x|^2 * SNR_lin +++ * Consider 'symbolsX' and 'pointsX' to be complex float +++ * 'symbolsX' are 'y' and 'pointsX' are 'x' +++ */ +++ const __m256 diff0 = _mm256_sub_ps(symbols0, points0); +++ const __m256 diff1 = _mm256_sub_ps(symbols1, points1); +++ const __m256 norms = _mm256_magnitudesquared_ps_avx2(diff0, diff1); +++ return _mm256_mul_ps(norms, scalar); ++ } ++ ++ #endif /* INCLUDE_VOLK_VOLK_AVX2_INTRINSICS_H_ */ ++diff --git a/include/volk/volk_avx_intrinsics.h b/include/volk/volk_avx_intrinsics.h ++index 808799f..bec846d 100644 ++--- a/include/volk/volk_avx_intrinsics.h +++++ b/include/volk/volk_avx_intrinsics.h ++@@ -1,19 +1,19 @@ ++ /* -*- c++ -*- */ ++-/* +++/* ++ * Copyright 2015 Free Software Foundation, Inc. ++- * +++ * ++ * This file is part of GNU Radio ++- * +++ * ++ * GNU Radio is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 3, or (at your option) ++ * any later version. ++- * +++ * ++ * GNU Radio is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++- * +++ * ++ * You should have received a copy of the GNU General Public License ++ * along with GNU Radio; see the file COPYING. If not, write to ++ * the Free Software Foundation, Inc., 51 Franklin Street, ++@@ -29,90 +29,126 @@ ++ #define INCLUDE_VOLK_VOLK_AVX_INTRINSICS_H_ ++ #include ++ ++-static inline __m256 ++-_mm256_complexmul_ps(__m256 x, __m256 y) +++static inline __m256 _mm256_complexmul_ps(__m256 x, __m256 y) ++ { ++- __m256 yl, yh, tmp1, tmp2; ++- yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr ... ++- yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di ... ++- tmp1 = _mm256_mul_ps(x, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr ... ++- x = _mm256_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br ... ++- tmp2 = _mm256_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di ++- return _mm256_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di +++ __m256 yl, yh, tmp1, tmp2; +++ yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr ... +++ yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di ... +++ tmp1 = _mm256_mul_ps(x, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr ... +++ x = _mm256_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br ... 
+++ tmp2 = _mm256_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di +++ return _mm256_addsub_ps(tmp1, +++ tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di ++ } ++ ++-static inline __m256 ++-_mm256_conjugate_ps(__m256 x){ ++- const __m256 conjugator = _mm256_setr_ps(0, -0.f, 0, -0.f, 0, -0.f, 0, -0.f); ++- return _mm256_xor_ps(x, conjugator); // conjugate y +++static inline __m256 _mm256_conjugate_ps(__m256 x) +++{ +++ const __m256 conjugator = _mm256_setr_ps(0, -0.f, 0, -0.f, 0, -0.f, 0, -0.f); +++ return _mm256_xor_ps(x, conjugator); // conjugate y ++ } ++ ++-static inline __m256 ++-_mm256_complexconjugatemul_ps(__m256 x, __m256 y){ ++- y = _mm256_conjugate_ps(y); ++- return _mm256_complexmul_ps(x, y); +++static inline __m256 _mm256_complexconjugatemul_ps(__m256 x, __m256 y) +++{ +++ y = _mm256_conjugate_ps(y); +++ return _mm256_complexmul_ps(x, y); ++ } ++ ++-static inline __m256 ++-_mm256_normalize_ps(__m256 val) +++static inline __m256 _mm256_normalize_ps(__m256 val) ++ { ++- __m256 tmp1 = _mm256_mul_ps(val, val); ++- tmp1 = _mm256_hadd_ps(tmp1, tmp1); ++- tmp1 = _mm256_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(3, 1, 2, 0)); // equals 0xD8 ++- tmp1 = _mm256_sqrt_ps(tmp1); ++- return _mm256_div_ps(val, tmp1); +++ __m256 tmp1 = _mm256_mul_ps(val, val); +++ tmp1 = _mm256_hadd_ps(tmp1, tmp1); +++ tmp1 = _mm256_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(3, 1, 2, 0)); // equals 0xD8 +++ tmp1 = _mm256_sqrt_ps(tmp1); +++ return _mm256_div_ps(val, tmp1); ++ } ++ ++-static inline __m256 ++-_mm256_magnitudesquared_ps(__m256 cplxValue1, __m256 cplxValue2){ ++- __m256 complex1, complex2; ++- cplxValue1 = _mm256_mul_ps(cplxValue1, cplxValue1); // Square the values ++- cplxValue2 = _mm256_mul_ps(cplxValue2, cplxValue2); // Square the Values ++- complex1 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x20); ++- complex2 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x31); ++- return _mm256_hadd_ps(complex1, complex2); // Add the I2 and Q2 values +++static inline __m256 _mm256_magnitudesquared_ps(__m256 cplxValue1, __m256 cplxValue2) +++{ +++ __m256 complex1, complex2; +++ cplxValue1 = _mm256_mul_ps(cplxValue1, cplxValue1); // Square the values +++ cplxValue2 = _mm256_mul_ps(cplxValue2, cplxValue2); // Square the Values +++ complex1 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x20); +++ complex2 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x31); +++ return _mm256_hadd_ps(complex1, complex2); // Add the I2 and Q2 values ++ } ++ ++-static inline __m256 ++-_mm256_magnitude_ps(__m256 cplxValue1, __m256 cplxValue2){ ++- return _mm256_sqrt_ps(_mm256_magnitudesquared_ps(cplxValue1, cplxValue2)); +++static inline __m256 _mm256_magnitude_ps(__m256 cplxValue1, __m256 cplxValue2) +++{ +++ return _mm256_sqrt_ps(_mm256_magnitudesquared_ps(cplxValue1, cplxValue2)); ++ } ++ ++-static inline __m256 ++-_mm256_scaled_norm_dist_ps(const __m256 symbols0, const __m256 symbols1, const __m256 points0, const __m256 points1, const __m256 scalar){ ++- /* ++- * Calculate: |y - x|^2 * SNR_lin ++- * Consider 'symbolsX' and 'pointsX' to be complex float ++- * 'symbolsX' are 'y' and 'pointsX' are 'x' ++- */ ++- const __m256 diff0 = _mm256_sub_ps(symbols0, points0); ++- const __m256 diff1 = _mm256_sub_ps(symbols1, points1); ++- const __m256 norms = _mm256_magnitudesquared_ps(diff0, diff1); ++- return _mm256_mul_ps(norms, scalar); +++static inline __m256 _mm256_scaled_norm_dist_ps(const __m256 symbols0, +++ const __m256 symbols1, +++ const __m256 points0, +++ const __m256 points1, +++ const __m256 scalar) +++{ +++ /* +++ * 
Calculate: |y - x|^2 * SNR_lin +++ * Consider 'symbolsX' and 'pointsX' to be complex float +++ * 'symbolsX' are 'y' and 'pointsX' are 'x' +++ */ +++ const __m256 diff0 = _mm256_sub_ps(symbols0, points0); +++ const __m256 diff1 = _mm256_sub_ps(symbols1, points1); +++ const __m256 norms = _mm256_magnitudesquared_ps(diff0, diff1); +++ return _mm256_mul_ps(norms, scalar); ++ } ++ ++-static inline __m256 ++-_mm256_polar_sign_mask(__m128i fbits){ ++- __m256 sign_mask_dummy = _mm256_setzero_ps(); ++- const __m128i zeros = _mm_set1_epi8(0x00); ++- const __m128i sign_extract = _mm_set1_epi8(0x80); ++- const __m128i shuffle_mask0 = _mm_setr_epi8(0xff, 0xff, 0xff, 0x00, 0xff, 0xff, 0xff, 0x01, 0xff, 0xff, 0xff, 0x02, 0xff, 0xff, 0xff, 0x03); ++- const __m128i shuffle_mask1 = _mm_setr_epi8(0xff, 0xff, 0xff, 0x04, 0xff, 0xff, 0xff, 0x05, 0xff, 0xff, 0xff, 0x06, 0xff, 0xff, 0xff, 0x07); ++- ++- fbits = _mm_cmpgt_epi8(fbits, zeros); ++- fbits = _mm_and_si128(fbits, sign_extract); ++- __m128i sign_bits0 = _mm_shuffle_epi8(fbits, shuffle_mask0); ++- __m128i sign_bits1 = _mm_shuffle_epi8(fbits, shuffle_mask1); ++- ++- __m256 sign_mask = _mm256_insertf128_ps(sign_mask_dummy, _mm_castsi128_ps(sign_bits0), 0x0); ++- return _mm256_insertf128_ps(sign_mask, _mm_castsi128_ps(sign_bits1), 0x1); ++-// // This is the desired function call. Though it seems to be missing in GCC. ++-// // Compare: https://software.intel.com/sites/landingpage/IntrinsicsGuide/# ++-// return _mm256_set_m128(_mm_castsi128_ps(sign_bits1), _mm_castsi128_ps(sign_bits0)); +++static inline __m256 _mm256_polar_sign_mask(__m128i fbits) +++{ +++ __m256 sign_mask_dummy = _mm256_setzero_ps(); +++ const __m128i zeros = _mm_set1_epi8(0x00); +++ const __m128i sign_extract = _mm_set1_epi8(0x80); +++ const __m128i shuffle_mask0 = _mm_setr_epi8(0xff, +++ 0xff, +++ 0xff, +++ 0x00, +++ 0xff, +++ 0xff, +++ 0xff, +++ 0x01, +++ 0xff, +++ 0xff, +++ 0xff, +++ 0x02, +++ 0xff, +++ 0xff, +++ 0xff, +++ 0x03); +++ const __m128i shuffle_mask1 = _mm_setr_epi8(0xff, +++ 0xff, +++ 0xff, +++ 0x04, +++ 0xff, +++ 0xff, +++ 0xff, +++ 0x05, +++ 0xff, +++ 0xff, +++ 0xff, +++ 0x06, +++ 0xff, +++ 0xff, +++ 0xff, +++ 0x07); +++ +++ fbits = _mm_cmpgt_epi8(fbits, zeros); +++ fbits = _mm_and_si128(fbits, sign_extract); +++ __m128i sign_bits0 = _mm_shuffle_epi8(fbits, shuffle_mask0); +++ __m128i sign_bits1 = _mm_shuffle_epi8(fbits, shuffle_mask1); +++ +++ __m256 sign_mask = +++ _mm256_insertf128_ps(sign_mask_dummy, _mm_castsi128_ps(sign_bits0), 0x0); +++ return _mm256_insertf128_ps(sign_mask, _mm_castsi128_ps(sign_bits1), 0x1); +++ // // This is the desired function call. Though it seems to be missing in GCC. 
+++ // // Compare: https://software.intel.com/sites/landingpage/IntrinsicsGuide/# +++ // return _mm256_set_m128(_mm_castsi128_ps(sign_bits1), +++ // _mm_castsi128_ps(sign_bits0)); ++ } ++ ++ static inline void ++-_mm256_polar_deinterleave(__m256 *llr0, __m256 *llr1, __m256 src0, __m256 src1){ +++_mm256_polar_deinterleave(__m256* llr0, __m256* llr1, __m256 src0, __m256 src1) +++{ ++ // deinterleave values ++ __m256 part0 = _mm256_permute2f128_ps(src0, src1, 0x20); ++ __m256 part1 = _mm256_permute2f128_ps(src0, src1, 0x31); ++@@ -120,22 +156,25 @@ _mm256_polar_deinterleave(__m256 *llr0, __m256 *llr1, __m256 src0, __m256 src1){ ++ *llr1 = _mm256_shuffle_ps(part0, part1, 0xdd); ++ } ++ ++-static inline __m256 ++-_mm256_polar_minsum_llrs(__m256 src0, __m256 src1){ +++static inline __m256 _mm256_polar_minsum_llrs(__m256 src0, __m256 src1) +++{ ++ const __m256 sign_mask = _mm256_set1_ps(-0.0f); ++- const __m256 abs_mask = _mm256_andnot_ps(sign_mask, _mm256_castsi256_ps(_mm256_set1_epi8(0xff))); +++ const __m256 abs_mask = +++ _mm256_andnot_ps(sign_mask, _mm256_castsi256_ps(_mm256_set1_epi8(0xff))); ++ ++ __m256 llr0, llr1; ++ _mm256_polar_deinterleave(&llr0, &llr1, src0, src1); ++ ++ // calculate result ++- __m256 sign = _mm256_xor_ps(_mm256_and_ps(llr0, sign_mask), _mm256_and_ps(llr1, sign_mask)); ++- __m256 dst = _mm256_min_ps(_mm256_and_ps(llr0, abs_mask), _mm256_and_ps(llr1, abs_mask)); +++ __m256 sign = +++ _mm256_xor_ps(_mm256_and_ps(llr0, sign_mask), _mm256_and_ps(llr1, sign_mask)); +++ __m256 dst = +++ _mm256_min_ps(_mm256_and_ps(llr0, abs_mask), _mm256_and_ps(llr1, abs_mask)); ++ return _mm256_or_ps(dst, sign); ++ } ++ ++-static inline __m256 ++-_mm256_polar_fsign_add_llrs(__m256 src0, __m256 src1, __m128i fbits){ +++static inline __m256 _mm256_polar_fsign_add_llrs(__m256 src0, __m256 src1, __m128i fbits) +++{ ++ // prepare sign mask for correct +- ++ __m256 sign_mask = _mm256_polar_sign_mask(fbits); ++ ++diff --git a/include/volk/volk_common.h b/include/volk/volk_common.h ++index 50ea07b..8167d23 100644 ++--- a/include/volk/volk_common.h +++++ b/include/volk/volk_common.h ++@@ -18,61 +18,71 @@ ++ // AppleClang also defines __GNUC__, so do this check first. These ++ // will probably be the same as for __GNUC__, but let's keep them ++ // separate just to be safe. 
++-# define __VOLK_ATTR_ALIGNED(x) __attribute__((aligned(x))) ++-# define __VOLK_ATTR_UNUSED __attribute__((unused)) ++-# define __VOLK_ATTR_INLINE __attribute__((always_inline)) ++-# define __VOLK_ATTR_DEPRECATED __attribute__((deprecated)) ++-# define __VOLK_ASM __asm__ ++-# define __VOLK_VOLATILE __volatile__ ++-# define __VOLK_ATTR_EXPORT __attribute__((visibility("default"))) ++-# define __VOLK_ATTR_IMPORT __attribute__((visibility("default"))) ++-# define __VOLK_PREFETCH(addr) __builtin_prefetch(addr) ++-#elif defined(__GNUC__) ++-# define __VOLK_ATTR_ALIGNED(x) __attribute__((aligned(x))) ++-# define __VOLK_ATTR_UNUSED __attribute__((unused)) ++-# define __VOLK_ATTR_INLINE __attribute__((always_inline)) ++-# define __VOLK_ATTR_DEPRECATED __attribute__((deprecated)) ++-# define __VOLK_ASM __asm__ ++-# define __VOLK_VOLATILE __volatile__ ++-# if __GNUC__ >= 4 ++-# define __VOLK_ATTR_EXPORT __attribute__((visibility("default"))) ++-# define __VOLK_ATTR_IMPORT __attribute__((visibility("default"))) ++-# else ++-# define __VOLK_ATTR_EXPORT ++-# define __VOLK_ATTR_IMPORT ++-# endif ++-# define __VOLK_PREFETCH(addr) __builtin_prefetch(addr) +++#define __VOLK_ATTR_ALIGNED(x) __attribute__((aligned(x))) +++#define __VOLK_ATTR_UNUSED __attribute__((unused)) +++#define __VOLK_ATTR_INLINE __attribute__((always_inline)) +++#define __VOLK_ATTR_DEPRECATED __attribute__((deprecated)) +++#define __VOLK_ASM __asm__ +++#define __VOLK_VOLATILE __volatile__ +++#define __VOLK_ATTR_EXPORT __attribute__((visibility("default"))) +++#define __VOLK_ATTR_IMPORT __attribute__((visibility("default"))) +++#define __VOLK_PREFETCH(addr) __builtin_prefetch(addr) +++#elif defined __GNUC__ +++#define __VOLK_ATTR_ALIGNED(x) __attribute__((aligned(x))) +++#define __VOLK_ATTR_UNUSED __attribute__((unused)) +++#define __VOLK_ATTR_INLINE __attribute__((always_inline)) +++#define __VOLK_ATTR_DEPRECATED __attribute__((deprecated)) +++#define __VOLK_ASM __asm__ +++#define __VOLK_VOLATILE __volatile__ +++#if __GNUC__ >= 4 +++#define __VOLK_ATTR_EXPORT __attribute__((visibility("default"))) +++#define __VOLK_ATTR_IMPORT __attribute__((visibility("default"))) ++ #else ++-# warning "Unknown compiler. Using default VOLK macros, which may or not work." 
++-# define __VOLK_ATTR_ALIGNED(x) ++-# define __VOLK_ATTR_UNUSED ++-# define __VOLK_ATTR_INLINE ++-# define __VOLK_ATTR_DEPRECATED ++-# define __VOLK_ATTR_EXPORT ++-# define __VOLK_ATTR_IMPORT ++-# define __VOLK_PREFETCH(addr) ++-# define __VOLK_ASM __asm__ ++-# define __VOLK_VOLATILE __volatile__ +++#define __VOLK_ATTR_EXPORT +++#define __VOLK_ATTR_IMPORT +++#endif +++#define __VOLK_PREFETCH(addr) __builtin_prefetch(addr) +++#elif _MSC_VER +++#define __VOLK_ATTR_ALIGNED(x) __declspec(align(x)) +++#define __VOLK_ATTR_UNUSED +++#define __VOLK_ATTR_INLINE __forceinline +++#define __VOLK_ATTR_DEPRECATED __declspec(deprecated) +++#define __VOLK_ATTR_EXPORT __declspec(dllexport) +++#define __VOLK_ATTR_IMPORT __declspec(dllimport) +++#define __VOLK_PREFETCH(addr) +++#define __VOLK_ASM __asm +++#define __VOLK_VOLATILE +++#else +++#define __VOLK_ATTR_ALIGNED(x) +++#define __VOLK_ATTR_UNUSED +++#define __VOLK_ATTR_INLINE +++#define __VOLK_ATTR_DEPRECATED +++#define __VOLK_ATTR_EXPORT +++#define __VOLK_ATTR_IMPORT +++#define __VOLK_PREFETCH(addr) +++#define __VOLK_ASM __asm__ +++#define __VOLK_VOLATILE __volatile__ ++ #endif ++ ++ //////////////////////////////////////////////////////////////////////// ++ // Ignore annoying warnings in MSVC ++ //////////////////////////////////////////////////////////////////////// ++ #if defined(_MSC_VER) ++-# pragma warning(disable: 4244) //'conversion' conversion from 'type1' to 'type2', possible loss of data ++-# pragma warning(disable: 4305) //'identifier' : truncation from 'type1' to 'type2' +++#pragma warning(disable : 4244) //'conversion' conversion from 'type1' to 'type2', +++ //possible loss of data +++#pragma warning(disable : 4305) //'identifier' : truncation from 'type1' to 'type2' ++ #endif ++ ++ //////////////////////////////////////////////////////////////////////// ++ // C-linkage declaration macros ++ // FIXME: due to the usage of complex.h, require gcc for c-linkage ++ //////////////////////////////////////////////////////////////////////// ++-#if defined(__cplusplus) && (defined(__GNUC__) || defined(__clang__)) ++-# define __VOLK_DECL_BEGIN extern "C" { ++-# define __VOLK_DECL_END } +++#if defined(__cplusplus) && (__GNUC__) +++#define __VOLK_DECL_BEGIN extern "C" { +++#define __VOLK_DECL_END } ++ #else ++-# define __VOLK_DECL_BEGIN ++-# define __VOLK_DECL_END +++#define __VOLK_DECL_BEGIN +++#define __VOLK_DECL_END ++ #endif ++ ++ //////////////////////////////////////////////////////////////////////// ++@@ -80,9 +90,9 @@ ++ // http://gcc.gnu.org/wiki/Visibility ++ //////////////////////////////////////////////////////////////////////// ++ #ifdef volk_EXPORTS ++-# define VOLK_API __VOLK_ATTR_EXPORT +++#define VOLK_API __VOLK_ATTR_EXPORT ++ #else ++-# define VOLK_API __VOLK_ATTR_IMPORT +++#define VOLK_API __VOLK_ATTR_IMPORT ++ #endif ++ ++ //////////////////////////////////////////////////////////////////////// ++@@ -98,38 +108,38 @@ ++ #endif ++ #endif ++ ++-union bit128{ ++- uint8_t i8[16]; ++- uint16_t i16[8]; ++- uint32_t i[4]; ++- float f[4]; ++- double d[2]; +++union bit128 { +++ uint8_t i8[16]; +++ uint16_t i16[8]; +++ uint32_t i[4]; +++ float f[4]; +++ double d[2]; ++ ++- #ifdef LV_HAVE_SSE ++- __m128 float_vec; ++- #endif +++#ifdef LV_HAVE_SSE +++ __m128 float_vec; +++#endif ++ ++- #ifdef LV_HAVE_SSE2 ++- __m128i int_vec; ++- __m128d double_vec; ++- #endif +++#ifdef LV_HAVE_SSE2 +++ __m128i int_vec; +++ __m128d double_vec; +++#endif ++ }; ++ ++-union bit256{ ++- uint8_t i8[32]; ++- uint16_t i16[16]; ++- uint32_t i[8]; ++- float f[8]; ++- 
double d[4]; +++union bit256 { +++ uint8_t i8[32]; +++ uint16_t i16[16]; +++ uint32_t i[8]; +++ float f[8]; +++ double d[4]; ++ ++- #ifdef LV_HAVE_AVX ++- __m256 float_vec; ++- __m256i int_vec; ++- __m256d double_vec; ++- #endif +++#ifdef LV_HAVE_AVX +++ __m256 float_vec; +++ __m256i int_vec; +++ __m256d double_vec; +++#endif ++ }; ++ ++-#define bit128_p(x) ((union bit128 *)(x)) ++-#define bit256_p(x) ((union bit256 *)(x)) +++#define bit128_p(x) ((union bit128*)(x)) +++#define bit256_p(x) ((union bit256*)(x)) ++ ++ #endif /*INCLUDED_LIBVOLK_COMMON_H*/ ++diff --git a/include/volk/volk_complex.h b/include/volk/volk_complex.h ++index 1d61d78..ae78873 100644 ++--- a/include/volk/volk_complex.h +++++ b/include/volk/volk_complex.h ++@@ -19,49 +19,58 @@ ++ ++ #ifdef __cplusplus ++ ++-#include ++ #include +++#include ++ ++-typedef std::complex lv_8sc_t; +++typedef std::complex lv_8sc_t; ++ typedef std::complex lv_16sc_t; ++ typedef std::complex lv_32sc_t; ++ typedef std::complex lv_64sc_t; ++-typedef std::complex lv_32fc_t; ++-typedef std::complex lv_64fc_t; +++typedef std::complex lv_32fc_t; +++typedef std::complex lv_64fc_t; ++ ++-template inline std::complex lv_cmake(const T &r, const T &i){ +++template +++inline std::complex lv_cmake(const T& r, const T& i) +++{ ++ return std::complex(r, i); ++ } ++ ++-template inline typename T::value_type lv_creal(const T &x){ +++template +++inline typename T::value_type lv_creal(const T& x) +++{ ++ return x.real(); ++ } ++ ++-template inline typename T::value_type lv_cimag(const T &x){ +++template +++inline typename T::value_type lv_cimag(const T& x) +++{ ++ return x.imag(); ++ } ++ ++-template inline T lv_conj(const T &x){ +++template +++inline T lv_conj(const T& x) +++{ ++ return std::conj(x); ++ } ++ ++ #else /* __cplusplus */ ++ ++ #if __STDC_VERSION__ >= 199901L /* C99 check */ ++-/* this allows us to conj in lv_conj without the double detour for single-precision floats */ +++/* this allows us to conj in lv_conj without the double detour for single-precision floats +++ */ ++ #include ++ #endif /* C99 check */ ++ ++ #include ++ ++-typedef char complex lv_8sc_t; ++-typedef short complex lv_16sc_t; ++-typedef long complex lv_32sc_t; ++-typedef long long complex lv_64sc_t; ++-typedef float complex lv_32fc_t; ++-typedef double complex lv_64fc_t; +++typedef char complex lv_8sc_t; +++typedef short complex lv_16sc_t; +++typedef long complex lv_32sc_t; +++typedef long long complex lv_64sc_t; +++typedef float complex lv_32fc_t; +++typedef double complex lv_64fc_t; ++ ++-#define lv_cmake(r, i) ((r) + _Complex_I*(i)) +++#define lv_cmake(r, i) ((r) + _Complex_I * (i)) ++ ++ // When GNUC is available, use the complex extensions. ++ // The extensions always return the correct value type. ++diff --git a/include/volk/volk_malloc.h b/include/volk/volk_malloc.h ++index 3477b27..42ca2b0 100644 ++--- a/include/volk/volk_malloc.h +++++ b/include/volk/volk_malloc.h ++@@ -23,8 +23,8 @@ ++ #ifndef INCLUDED_VOLK_MALLOC_H ++ #define INCLUDED_VOLK_MALLOC_H ++ ++-#include ++ #include +++#include ++ ++ __VOLK_DECL_BEGIN ++ ++@@ -40,7 +40,8 @@ __VOLK_DECL_BEGIN ++ * For Apple Clang, we fall back to `posix_memalign`. ++ * see: https://linux.die.net/man/3/aligned_alloc ++ * For MSVC, we fall back to `_aligned_malloc`. 
++- * see: https://docs.microsoft.com/en-us/cpp/c-runtime-library/reference/aligned-malloc?view=vs-2019 +++ * see: +++ * https://docs.microsoft.com/en-us/cpp/c-runtime-library/reference/aligned-malloc?view=vs-2019 ++ * ++ * Because of the ways in which volk_malloc may allocate memory, it is ++ * important to always free volk_malloc pointers using volk_free. ++@@ -51,7 +52,7 @@ __VOLK_DECL_BEGIN ++ * \param alignment The byte alignment of the allocated memory. ++ * \return pointer to aligned memory. ++ */ ++-VOLK_API void *volk_malloc(size_t size, size_t alignment); +++VOLK_API void* volk_malloc(size_t size, size_t alignment); ++ ++ /*! ++ * \brief Free's memory allocated by volk_malloc. ++@@ -62,11 +63,12 @@ VOLK_API void *volk_malloc(size_t size, size_t alignment); ++ * Thus, in this case `volk_free` inherits the same behavior `free` exhibits. ++ * see: https://en.cppreference.com/w/c/memory/free ++ * In case `_aligned_malloc` was used, we call `_aligned_free`. ++- * see: https://docs.microsoft.com/en-us/cpp/c-runtime-library/reference/aligned-free?view=vs-2019 +++ * see: +++ * https://docs.microsoft.com/en-us/cpp/c-runtime-library/reference/aligned-free?view=vs-2019 ++ * ++ * \param aptr The aligned pointer allocated by volk_malloc. ++ */ ++-VOLK_API void volk_free(void *aptr); +++VOLK_API void volk_free(void* aptr); ++ ++ __VOLK_DECL_END ++ ++diff --git a/include/volk/volk_neon_intrinsics.h b/include/volk/volk_neon_intrinsics.h ++index 90e7b54..302bd30 100644 ++--- a/include/volk/volk_neon_intrinsics.h +++++ b/include/volk/volk_neon_intrinsics.h ++@@ -67,9 +67,9 @@ ++ 3. This notice may not be removed or altered from any source distribution. ++ ++ (this is the zlib license) ++- +++ ++ _vsincosq_f32 ++- +++ ++ */ ++ ++ /* ++@@ -83,13 +83,12 @@ ++ ++ ++ /* Magnitude squared for float32x4x2_t */ ++-static inline float32x4_t ++-_vmagnitudesquaredq_f32(float32x4x2_t cmplxValue) +++static inline float32x4_t _vmagnitudesquaredq_f32(float32x4x2_t cmplxValue) ++ { ++ float32x4_t iValue, qValue, result; ++ iValue = vmulq_f32(cmplxValue.val[0], cmplxValue.val[0]); // Square the values ++ qValue = vmulq_f32(cmplxValue.val[1], cmplxValue.val[1]); // Square the values ++- result = vaddq_f32(iValue, qValue); // Add the I2 and Q2 values +++ result = vaddq_f32(iValue, qValue); // Add the I2 and Q2 values ++ return result; ++ } ++ ++@@ -97,9 +96,11 @@ _vmagnitudesquaredq_f32(float32x4x2_t cmplxValue) ++ static inline float32x4_t _vinvsqrtq_f32(float32x4_t x) ++ { ++ float32x4_t sqrt_reciprocal = vrsqrteq_f32(x); ++- sqrt_reciprocal = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal); ++- sqrt_reciprocal = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal); ++- +++ sqrt_reciprocal = vmulq_f32( +++ vrsqrtsq_f32(vmulq_f32(x, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal); +++ sqrt_reciprocal = vmulq_f32( +++ vrsqrtsq_f32(vmulq_f32(x, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal); +++ ++ return sqrt_reciprocal; ++ } ++ ++@@ -108,19 +109,19 @@ static inline float32x4_t _vinvq_f32(float32x4_t x) ++ { ++ // Newton's method ++ float32x4_t recip = vrecpeq_f32(x); ++- recip = vmulq_f32(vrecpsq_f32(x, recip), recip); ++- recip = vmulq_f32(vrecpsq_f32(x, recip), recip); +++ recip = vmulq_f32(vrecpsq_f32(x, recip), recip); +++ recip = vmulq_f32(vrecpsq_f32(x, recip), recip); ++ return recip; ++ } ++ ++ /* Complex multiplication for float32x4x2_t */ ++-static inline float32x4x2_t ++-_vmultiply_complexq_f32(float32x4x2_t a_val, 
float32x4x2_t b_val) +++static inline float32x4x2_t _vmultiply_complexq_f32(float32x4x2_t a_val, +++ float32x4x2_t b_val) ++ { ++ float32x4x2_t tmp_real; ++ float32x4x2_t tmp_imag; ++ float32x4x2_t c_val; ++- +++ ++ // multiply the real*real and imag*imag to get real result ++ // a0r*b0r|a1r*b1r|a2r*b2r|a3r*b3r ++ tmp_real.val[0] = vmulq_f32(a_val.val[0], b_val.val[0]); ++@@ -140,12 +141,12 @@ _vmultiply_complexq_f32(float32x4x2_t a_val, float32x4x2_t b_val) ++ /* From ARM Compute Library, MIT license */ ++ static inline float32x4_t _vtaylor_polyq_f32(float32x4_t x, const float32x4_t coeffs[8]) ++ { ++- float32x4_t cA = vmlaq_f32(coeffs[0], coeffs[4], x); ++- float32x4_t cB = vmlaq_f32(coeffs[2], coeffs[6], x); ++- float32x4_t cC = vmlaq_f32(coeffs[1], coeffs[5], x); ++- float32x4_t cD = vmlaq_f32(coeffs[3], coeffs[7], x); ++- float32x4_t x2 = vmulq_f32(x, x); ++- float32x4_t x4 = vmulq_f32(x2, x2); +++ float32x4_t cA = vmlaq_f32(coeffs[0], coeffs[4], x); +++ float32x4_t cB = vmlaq_f32(coeffs[2], coeffs[6], x); +++ float32x4_t cC = vmlaq_f32(coeffs[1], coeffs[5], x); +++ float32x4_t cD = vmlaq_f32(coeffs[3], coeffs[7], x); +++ float32x4_t x2 = vmulq_f32(x, x); +++ float32x4_t x4 = vmulq_f32(x2, x2); ++ float32x4_t res = vmlaq_f32(vmlaq_f32(cA, cB, x2), vmlaq_f32(cC, cD, x2), x4); ++ return res; ++ } ++@@ -155,121 +156,123 @@ static inline float32x4_t _vtaylor_polyq_f32(float32x4_t x, const float32x4_t co ++ static inline float32x4_t _vlogq_f32(float32x4_t x) ++ { ++ const float32x4_t log_tab[8] = { ++- vdupq_n_f32(-2.29561495781f), ++- vdupq_n_f32(-2.47071170807f), ++- vdupq_n_f32(-5.68692588806f), ++- vdupq_n_f32(-0.165253549814f), ++- vdupq_n_f32(5.17591238022f), ++- vdupq_n_f32(0.844007015228f), ++- vdupq_n_f32(4.58445882797f), ++- vdupq_n_f32(0.0141278216615f), +++ vdupq_n_f32(-2.29561495781f), vdupq_n_f32(-2.47071170807f), +++ vdupq_n_f32(-5.68692588806f), vdupq_n_f32(-0.165253549814f), +++ vdupq_n_f32(5.17591238022f), vdupq_n_f32(0.844007015228f), +++ vdupq_n_f32(4.58445882797f), vdupq_n_f32(0.0141278216615f), ++ }; ++- ++- const int32x4_t CONST_127 = vdupq_n_s32(127); // 127 +++ +++ const int32x4_t CONST_127 = vdupq_n_s32(127); // 127 ++ const float32x4_t CONST_LN2 = vdupq_n_f32(0.6931471805f); // ln(2) ++- +++ ++ // Extract exponent ++- int32x4_t m = vsubq_s32(vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_f32(x), 23)), CONST_127); ++- float32x4_t val = vreinterpretq_f32_s32(vsubq_s32(vreinterpretq_s32_f32(x), vshlq_n_s32(m, 23))); ++- +++ int32x4_t m = vsubq_s32( +++ vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_f32(x), 23)), CONST_127); +++ float32x4_t val = +++ vreinterpretq_f32_s32(vsubq_s32(vreinterpretq_s32_f32(x), vshlq_n_s32(m, 23))); +++ ++ // Polynomial Approximation ++ float32x4_t poly = _vtaylor_polyq_f32(val, log_tab); ++- +++ ++ // Reconstruct ++ poly = vmlaq_f32(poly, vcvtq_f32_s32(m), CONST_LN2); ++- +++ ++ return poly; ++ } ++ ++ /* Evaluation of 4 sines & cosines at once. 
++ * Optimized from here (zlib license) ++ * http://gruntthepeon.free.fr/ssemath/ */ ++-static inline float32x4x2_t _vsincosq_f32(float32x4_t x) { +++static inline float32x4x2_t _vsincosq_f32(float32x4_t x) +++{ ++ const float32x4_t c_minus_cephes_DP1 = vdupq_n_f32(-0.78515625); ++ const float32x4_t c_minus_cephes_DP2 = vdupq_n_f32(-2.4187564849853515625e-4); ++ const float32x4_t c_minus_cephes_DP3 = vdupq_n_f32(-3.77489497744594108e-8); ++ const float32x4_t c_sincof_p0 = vdupq_n_f32(-1.9515295891e-4); ++- const float32x4_t c_sincof_p1 = vdupq_n_f32(8.3321608736e-3); +++ const float32x4_t c_sincof_p1 = vdupq_n_f32(8.3321608736e-3); ++ const float32x4_t c_sincof_p2 = vdupq_n_f32(-1.6666654611e-1); ++ const float32x4_t c_coscof_p0 = vdupq_n_f32(2.443315711809948e-005); ++ const float32x4_t c_coscof_p1 = vdupq_n_f32(-1.388731625493765e-003); ++ const float32x4_t c_coscof_p2 = vdupq_n_f32(4.166664568298827e-002); ++ const float32x4_t c_cephes_FOPI = vdupq_n_f32(1.27323954473516); // 4 / M_PI ++- +++ ++ const float32x4_t CONST_1 = vdupq_n_f32(1.f); ++ const float32x4_t CONST_1_2 = vdupq_n_f32(0.5f); ++ const float32x4_t CONST_0 = vdupq_n_f32(0.f); ++- const uint32x4_t CONST_2 = vdupq_n_u32(2); ++- const uint32x4_t CONST_4 = vdupq_n_u32(4); ++- +++ const uint32x4_t CONST_2 = vdupq_n_u32(2); +++ const uint32x4_t CONST_4 = vdupq_n_u32(4); +++ ++ uint32x4_t emm2; ++- +++ ++ uint32x4_t sign_mask_sin, sign_mask_cos; ++ sign_mask_sin = vcltq_f32(x, CONST_0); ++ x = vabsq_f32(x); ++ // scale by 4/pi ++ float32x4_t y = vmulq_f32(x, c_cephes_FOPI); ++- +++ ++ // store the integer part of y in mm0 ++ emm2 = vcvtq_u32_f32(y); ++ /* j=(j+1) & (~1) (see the cephes sources) */ ++ emm2 = vaddq_u32(emm2, vdupq_n_u32(1)); ++ emm2 = vandq_u32(emm2, vdupq_n_u32(~1)); ++ y = vcvtq_f32_u32(emm2); ++- +++ ++ /* get the polynom selection mask ++ there is one polynom for 0 <= x <= Pi/4 ++ and another one for Pi/4 ++ #include ++ #include +++#include ++ ++ __VOLK_DECL_BEGIN ++ ++-typedef struct volk_arch_pref ++-{ ++- char name[128]; //name of the kernel ++- char impl_a[128]; //best aligned impl ++- char impl_u[128]; //best unaligned impl +++typedef struct volk_arch_pref { +++ char name[128]; // name of the kernel +++ char impl_a[128]; // best aligned impl +++ char impl_u[128]; // best unaligned impl ++ } volk_arch_pref_t; ++ ++ //////////////////////////////////////////////////////////////////////// ++@@ -19,13 +18,13 @@ typedef struct volk_arch_pref ++ // if config file should be tested on existence for reading. ++ // returns \0 in the argument on failure. ++ //////////////////////////////////////////////////////////////////////// ++-VOLK_API void volk_get_config_path(char *, bool); +++VOLK_API void volk_get_config_path(char*, bool); ++ ++ //////////////////////////////////////////////////////////////////////// ++ // load prefs into global prefs struct ++ //////////////////////////////////////////////////////////////////////// ++-VOLK_API size_t volk_load_preferences(volk_arch_pref_t **); +++VOLK_API size_t volk_load_preferences(volk_arch_pref_t**); ++ ++ __VOLK_DECL_END ++ ++-#endif //INCLUDED_VOLK_PREFS_H +++#endif // INCLUDED_VOLK_PREFS_H ++diff --git a/include/volk/volk_sse3_intrinsics.h b/include/volk/volk_sse3_intrinsics.h ++index 6b53a2a..6bdc8d8 100644 ++--- a/include/volk/volk_sse3_intrinsics.h +++++ b/include/volk/volk_sse3_intrinsics.h ++@@ -1,19 +1,19 @@ ++ /* -*- c++ -*- */ ++-/* +++/* ++ * Copyright 2015 Free Software Foundation, Inc. 
++- * +++ * ++ * This file is part of GNU Radio ++- * +++ * ++ * GNU Radio is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 3, or (at your option) ++ * any later version. ++- * +++ * ++ * GNU Radio is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++- * +++ * ++ * You should have received a copy of the GNU General Public License ++ * along with GNU Radio; see the file COPYING. If not, write to ++ * the Free Software Foundation, Inc., 51 Franklin Street, ++@@ -29,49 +29,52 @@ ++ #define INCLUDE_VOLK_VOLK_SSE3_INTRINSICS_H_ ++ #include ++ ++-static inline __m128 ++-_mm_complexmul_ps(__m128 x, __m128 y) +++static inline __m128 _mm_complexmul_ps(__m128 x, __m128 y) ++ { ++- __m128 yl, yh, tmp1, tmp2; ++- yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr ++- yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di ++- tmp1 = _mm_mul_ps(x, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr ++- x = _mm_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br ++- tmp2 = _mm_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di ++- return _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di +++ __m128 yl, yh, tmp1, tmp2; +++ yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr +++ yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di +++ tmp1 = _mm_mul_ps(x, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr +++ x = _mm_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br +++ tmp2 = _mm_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di +++ return _mm_addsub_ps(tmp1, +++ tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di ++ } ++ ++-static inline __m128 ++-_mm_complexconjugatemul_ps(__m128 x, __m128 y) +++static inline __m128 _mm_complexconjugatemul_ps(__m128 x, __m128 y) ++ { ++- const __m128 conjugator = _mm_setr_ps(0, -0.f, 0, -0.f); ++- y = _mm_xor_ps(y, conjugator); // conjugate y ++- return _mm_complexmul_ps(x, y); +++ const __m128 conjugator = _mm_setr_ps(0, -0.f, 0, -0.f); +++ y = _mm_xor_ps(y, conjugator); // conjugate y +++ return _mm_complexmul_ps(x, y); ++ } ++ ++-static inline __m128 ++-_mm_magnitudesquared_ps_sse3(__m128 cplxValue1, __m128 cplxValue2){ ++- cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values ++- cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values ++- return _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values +++static inline __m128 _mm_magnitudesquared_ps_sse3(__m128 cplxValue1, __m128 cplxValue2) +++{ +++ cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values +++ cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values +++ return _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values ++ } ++ ++-static inline __m128 ++-_mm_magnitude_ps_sse3(__m128 cplxValue1, __m128 cplxValue2){ ++- return _mm_sqrt_ps(_mm_magnitudesquared_ps_sse3(cplxValue1, cplxValue2)); +++static inline __m128 _mm_magnitude_ps_sse3(__m128 cplxValue1, __m128 cplxValue2) +++{ +++ return _mm_sqrt_ps(_mm_magnitudesquared_ps_sse3(cplxValue1, cplxValue2)); ++ } ++ ++-static inline __m128 ++-_mm_scaled_norm_dist_ps_sse3(const __m128 symbols0, const __m128 symbols1, const __m128 points0, const __m128 points1, const __m128 scalar){ ++- /* ++- * Calculate: |y - x|^2 * SNR_lin ++- * Consider 'symbolsX' and 
'pointsX' to be complex float ++- * 'symbolsX' are 'y' and 'pointsX' are 'x' ++- */ ++- const __m128 diff0 = _mm_sub_ps(symbols0, points0); ++- const __m128 diff1 = _mm_sub_ps(symbols1, points1); ++- const __m128 norms = _mm_magnitudesquared_ps_sse3(diff0, diff1); ++- return _mm_mul_ps(norms, scalar); +++static inline __m128 _mm_scaled_norm_dist_ps_sse3(const __m128 symbols0, +++ const __m128 symbols1, +++ const __m128 points0, +++ const __m128 points1, +++ const __m128 scalar) +++{ +++ /* +++ * Calculate: |y - x|^2 * SNR_lin +++ * Consider 'symbolsX' and 'pointsX' to be complex float +++ * 'symbolsX' are 'y' and 'pointsX' are 'x' +++ */ +++ const __m128 diff0 = _mm_sub_ps(symbols0, points0); +++ const __m128 diff1 = _mm_sub_ps(symbols1, points1); +++ const __m128 norms = _mm_magnitudesquared_ps_sse3(diff0, diff1); +++ return _mm_mul_ps(norms, scalar); ++ } ++ ++ #endif /* INCLUDE_VOLK_VOLK_SSE3_INTRINSICS_H_ */ ++diff --git a/include/volk/volk_sse_intrinsics.h b/include/volk/volk_sse_intrinsics.h ++index 57318e2..24fe7c1 100644 ++--- a/include/volk/volk_sse_intrinsics.h +++++ b/include/volk/volk_sse_intrinsics.h ++@@ -1,19 +1,19 @@ ++ /* -*- c++ -*- */ ++-/* +++/* ++ * Copyright 2015 Free Software Foundation, Inc. ++- * +++ * ++ * This file is part of GNU Radio ++- * +++ * ++ * GNU Radio is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 3, or (at your option) ++ * any later version. ++- * +++ * ++ * GNU Radio is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++- * +++ * ++ * You should have received a copy of the GNU General Public License ++ * along with GNU Radio; see the file COPYING. 
If not, write to ++ * the Free Software Foundation, Inc., 51 Franklin Street, ++@@ -29,31 +29,34 @@ ++ #define INCLUDE_VOLK_VOLK_SSE_INTRINSICS_H_ ++ #include ++ ++-static inline __m128 ++-_mm_magnitudesquared_ps(__m128 cplxValue1, __m128 cplxValue2){ ++- __m128 iValue, qValue; ++- // Arrange in i1i2i3i4 format ++- iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); ++- // Arrange in q1q2q3q4 format ++- qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1)); ++- iValue = _mm_mul_ps(iValue, iValue); // Square the I values ++- qValue = _mm_mul_ps(qValue, qValue); // Square the Q Values ++- return _mm_add_ps(iValue, qValue); // Add the I2 and Q2 values +++static inline __m128 _mm_magnitudesquared_ps(__m128 cplxValue1, __m128 cplxValue2) +++{ +++ __m128 iValue, qValue; +++ // Arrange in i1i2i3i4 format +++ iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2, 0, 2, 0)); +++ // Arrange in q1q2q3q4 format +++ qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3, 1, 3, 1)); +++ iValue = _mm_mul_ps(iValue, iValue); // Square the I values +++ qValue = _mm_mul_ps(qValue, qValue); // Square the Q Values +++ return _mm_add_ps(iValue, qValue); // Add the I2 and Q2 values ++ } ++ ++-static inline __m128 ++-_mm_magnitude_ps(__m128 cplxValue1, __m128 cplxValue2){ ++- return _mm_sqrt_ps(_mm_magnitudesquared_ps(cplxValue1, cplxValue2)); +++static inline __m128 _mm_magnitude_ps(__m128 cplxValue1, __m128 cplxValue2) +++{ +++ return _mm_sqrt_ps(_mm_magnitudesquared_ps(cplxValue1, cplxValue2)); ++ } ++ ++-static inline __m128 ++-_mm_scaled_norm_dist_ps_sse(const __m128 symbols0, const __m128 symbols1, const __m128 points0, const __m128 points1, const __m128 scalar) +++static inline __m128 _mm_scaled_norm_dist_ps_sse(const __m128 symbols0, +++ const __m128 symbols1, +++ const __m128 points0, +++ const __m128 points1, +++ const __m128 scalar) ++ { ++- // calculate scalar * |x - y|^2 ++- const __m128 diff0 = _mm_sub_ps(symbols0, points0); ++- const __m128 diff1 = _mm_sub_ps(symbols1, points1); ++- const __m128 norms = _mm_magnitudesquared_ps(diff0, diff1); ++- return _mm_mul_ps(norms, scalar); +++ // calculate scalar * |x - y|^2 +++ const __m128 diff0 = _mm_sub_ps(symbols0, points0); +++ const __m128 diff1 = _mm_sub_ps(symbols1, points1); +++ const __m128 norms = _mm_magnitudesquared_ps(diff0, diff1); +++ return _mm_mul_ps(norms, scalar); ++ } ++ ++ #endif /* INCLUDE_VOLK_VOLK_SSE_INTRINSICS_H_ */ ++diff --git a/kernels/volk/volk_16i_32fc_dot_prod_32fc.h b/kernels/volk/volk_16i_32fc_dot_prod_32fc.h ++index f250340..2635649 100644 ++--- a/kernels/volk/volk_16i_32fc_dot_prod_32fc.h +++++ b/kernels/volk/volk_16i_32fc_dot_prod_32fc.h ++@@ -33,8 +33,8 @@ ++ * ++ * Dispatcher Prototype ++ * \code ++- * void volk_16i_32fc_dot_prod_32fc(lv_32fc_t* result, const short* input, const lv_32fc_t * taps, unsigned int num_points) ++- * \endcode +++ * void volk_16i_32fc_dot_prod_32fc(lv_32fc_t* result, const short* input, const lv_32fc_t +++ * * taps, unsigned int num_points) \endcode ++ * ++ * \b Inputs ++ * \li input: vector of shorts. 
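
For context, a minimal usage sketch of the volk_16i_32fc_dot_prod_32fc dispatcher documented in the hunk above. The volk_malloc()/volk_get_alignment() buffer handling and the test data are illustrative assumptions, not part of this patch; only the dispatcher prototype itself comes from the kernel header.

    /* Usage sketch: call the 16i/32fc dot-product dispatcher on aligned buffers.
     * The dispatcher selects the fastest available kernel at run time. */
    #include <volk/volk.h>
    #include <stdio.h>

    int main(void)
    {
        unsigned int num_points = 1024;
        size_t alignment = volk_get_alignment();

        short* input = (short*)volk_malloc(sizeof(short) * num_points, alignment);
        lv_32fc_t* taps = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t) * num_points, alignment);
        lv_32fc_t result;

        for (unsigned int i = 0; i < num_points; i++) {
            input[i] = (short)(i & 0xff);   /* arbitrary test data */
            taps[i] = lv_cmake(1.0f, 0.0f); /* unit taps: result is the sum of inputs */
        }

        volk_16i_32fc_dot_prod_32fc(&result, input, taps, num_points);

        printf("dot product = %f + %fi\n", lv_creal(result), lv_cimag(result));

        volk_free(input);
        volk_free(taps);
        return 0;
    }
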
++@@ -58,165 +58,178 @@ ++ #ifndef INCLUDED_volk_16i_32fc_dot_prod_32fc_H ++ #define INCLUDED_volk_16i_32fc_dot_prod_32fc_H ++ ++-#include ++ #include +++#include ++ ++ ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void volk_16i_32fc_dot_prod_32fc_generic(lv_32fc_t* result, const short* input, const lv_32fc_t * taps, unsigned int num_points) { +++static inline void volk_16i_32fc_dot_prod_32fc_generic(lv_32fc_t* result, +++ const short* input, +++ const lv_32fc_t* taps, +++ unsigned int num_points) +++{ ++ ++- static const int N_UNROLL = 4; +++ static const int N_UNROLL = 4; ++ ++- lv_32fc_t acc0 = 0; ++- lv_32fc_t acc1 = 0; ++- lv_32fc_t acc2 = 0; ++- lv_32fc_t acc3 = 0; +++ lv_32fc_t acc0 = 0; +++ lv_32fc_t acc1 = 0; +++ lv_32fc_t acc2 = 0; +++ lv_32fc_t acc3 = 0; ++ ++- unsigned i = 0; ++- unsigned n = (num_points / N_UNROLL) * N_UNROLL; +++ unsigned i = 0; +++ unsigned n = (num_points / N_UNROLL) * N_UNROLL; ++ ++- for(i = 0; i < n; i += N_UNROLL) { ++- acc0 += taps[i + 0] * (float)input[i + 0]; ++- acc1 += taps[i + 1] * (float)input[i + 1]; ++- acc2 += taps[i + 2] * (float)input[i + 2]; ++- acc3 += taps[i + 3] * (float)input[i + 3]; ++- } +++ for (i = 0; i < n; i += N_UNROLL) { +++ acc0 += taps[i + 0] * (float)input[i + 0]; +++ acc1 += taps[i + 1] * (float)input[i + 1]; +++ acc2 += taps[i + 2] * (float)input[i + 2]; +++ acc3 += taps[i + 3] * (float)input[i + 3]; +++ } ++ ++- for(; i < num_points; i++) { ++- acc0 += taps[i] * (float)input[i]; ++- } +++ for (; i < num_points; i++) { +++ acc0 += taps[i] * (float)input[i]; +++ } ++ ++- *result = acc0 + acc1 + acc2 + acc3; +++ *result = acc0 + acc1 + acc2 + acc3; ++ } ++ ++ #endif /*LV_HAVE_GENERIC*/ ++ ++ #ifdef LV_HAVE_NEON ++ #include ++-static inline void volk_16i_32fc_dot_prod_32fc_neon(lv_32fc_t* result, const short* input, const lv_32fc_t * taps, unsigned int num_points) { ++- ++- unsigned ii; ++- unsigned quarter_points = num_points / 4; ++- lv_32fc_t* tapsPtr = (lv_32fc_t*) taps; ++- short* inputPtr = (short*) input; ++- lv_32fc_t accumulator_vec[4]; ++- ++- float32x4x2_t tapsVal, accumulator_val; ++- int16x4_t input16; ++- int32x4_t input32; ++- float32x4_t input_float, prod_re, prod_im; ++- ++- accumulator_val.val[0] = vdupq_n_f32(0.0); ++- accumulator_val.val[1] = vdupq_n_f32(0.0); ++- ++- for(ii = 0; ii < quarter_points; ++ii) { ++- tapsVal = vld2q_f32((float*)tapsPtr); ++- input16 = vld1_s16(inputPtr); ++- // widen 16-bit int to 32-bit int ++- input32 = vmovl_s16(input16); ++- // convert 32-bit int to float with scale ++- input_float = vcvtq_f32_s32(input32); ++- ++- prod_re = vmulq_f32(input_float, tapsVal.val[0]); ++- prod_im = vmulq_f32(input_float, tapsVal.val[1]); ++- ++- accumulator_val.val[0] = vaddq_f32(prod_re, accumulator_val.val[0]); ++- accumulator_val.val[1] = vaddq_f32(prod_im, accumulator_val.val[1]); ++- ++- tapsPtr += 4; ++- inputPtr += 4; ++- } ++- vst2q_f32((float*)accumulator_vec, accumulator_val); ++- accumulator_vec[0] += accumulator_vec[1]; ++- accumulator_vec[2] += accumulator_vec[3]; ++- accumulator_vec[0] += accumulator_vec[2]; ++- ++- for(ii = quarter_points * 4; ii < num_points; ++ii) { ++- accumulator_vec[0] += *(tapsPtr++) * (float)(*(inputPtr++)); ++- } ++- ++- *result = accumulator_vec[0]; +++static inline void volk_16i_32fc_dot_prod_32fc_neon(lv_32fc_t* result, +++ const short* input, +++ const lv_32fc_t* taps, +++ unsigned int num_points) +++{ +++ +++ unsigned ii; +++ unsigned quarter_points = num_points / 4; +++ lv_32fc_t* tapsPtr = (lv_32fc_t*)taps; +++ short* inputPtr = (short*)input; +++ 
lv_32fc_t accumulator_vec[4]; +++ +++ float32x4x2_t tapsVal, accumulator_val; +++ int16x4_t input16; +++ int32x4_t input32; +++ float32x4_t input_float, prod_re, prod_im; +++ +++ accumulator_val.val[0] = vdupq_n_f32(0.0); +++ accumulator_val.val[1] = vdupq_n_f32(0.0); +++ +++ for (ii = 0; ii < quarter_points; ++ii) { +++ tapsVal = vld2q_f32((float*)tapsPtr); +++ input16 = vld1_s16(inputPtr); +++ // widen 16-bit int to 32-bit int +++ input32 = vmovl_s16(input16); +++ // convert 32-bit int to float with scale +++ input_float = vcvtq_f32_s32(input32); +++ +++ prod_re = vmulq_f32(input_float, tapsVal.val[0]); +++ prod_im = vmulq_f32(input_float, tapsVal.val[1]); +++ +++ accumulator_val.val[0] = vaddq_f32(prod_re, accumulator_val.val[0]); +++ accumulator_val.val[1] = vaddq_f32(prod_im, accumulator_val.val[1]); +++ +++ tapsPtr += 4; +++ inputPtr += 4; +++ } +++ vst2q_f32((float*)accumulator_vec, accumulator_val); +++ accumulator_vec[0] += accumulator_vec[1]; +++ accumulator_vec[2] += accumulator_vec[3]; +++ accumulator_vec[0] += accumulator_vec[2]; +++ +++ for (ii = quarter_points * 4; ii < num_points; ++ii) { +++ accumulator_vec[0] += *(tapsPtr++) * (float)(*(inputPtr++)); +++ } +++ +++ *result = accumulator_vec[0]; ++ } ++ ++ #endif /*LV_HAVE_NEON*/ ++ ++ #if LV_HAVE_SSE && LV_HAVE_MMX ++ ++-static inline void volk_16i_32fc_dot_prod_32fc_u_sse( lv_32fc_t* result, const short* input, const lv_32fc_t* taps, unsigned int num_points) { ++- ++- unsigned int number = 0; ++- const unsigned int sixteenthPoints = num_points / 8; ++- ++- float res[2]; ++- float *realpt = &res[0], *imagpt = &res[1]; ++- const short* aPtr = input; ++- const float* bPtr = (float*)taps; ++- ++- __m64 m0, m1; ++- __m128 f0, f1, f2, f3; ++- __m128 a0Val, a1Val, a2Val, a3Val; ++- __m128 b0Val, b1Val, b2Val, b3Val; ++- __m128 c0Val, c1Val, c2Val, c3Val; ++- ++- __m128 dotProdVal0 = _mm_setzero_ps(); ++- __m128 dotProdVal1 = _mm_setzero_ps(); ++- __m128 dotProdVal2 = _mm_setzero_ps(); ++- __m128 dotProdVal3 = _mm_setzero_ps(); ++- ++- for(;number < sixteenthPoints; number++){ ++- ++- m0 = _mm_set_pi16(*(aPtr+3), *(aPtr+2), *(aPtr+1), *(aPtr+0)); ++- m1 = _mm_set_pi16(*(aPtr+7), *(aPtr+6), *(aPtr+5), *(aPtr+4)); ++- f0 = _mm_cvtpi16_ps(m0); ++- f1 = _mm_cvtpi16_ps(m0); ++- f2 = _mm_cvtpi16_ps(m1); ++- f3 = _mm_cvtpi16_ps(m1); ++- ++- a0Val = _mm_unpacklo_ps(f0, f1); ++- a1Val = _mm_unpackhi_ps(f0, f1); ++- a2Val = _mm_unpacklo_ps(f2, f3); ++- a3Val = _mm_unpackhi_ps(f2, f3); ++- ++- b0Val = _mm_loadu_ps(bPtr); ++- b1Val = _mm_loadu_ps(bPtr+4); ++- b2Val = _mm_loadu_ps(bPtr+8); ++- b3Val = _mm_loadu_ps(bPtr+12); ++- ++- c0Val = _mm_mul_ps(a0Val, b0Val); ++- c1Val = _mm_mul_ps(a1Val, b1Val); ++- c2Val = _mm_mul_ps(a2Val, b2Val); ++- c3Val = _mm_mul_ps(a3Val, b3Val); ++- ++- dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0); ++- dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1); ++- dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2); ++- dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3); ++- ++- aPtr += 8; ++- bPtr += 16; ++- } ++- ++- dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1); ++- dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2); ++- dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3); ++- ++- __VOLK_ATTR_ALIGNED(16) float dotProductVector[4]; ++- ++- _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector ++- ++- *realpt = dotProductVector[0]; ++- *imagpt = dotProductVector[1]; ++- *realpt += dotProductVector[2]; ++- *imagpt += dotProductVector[3]; ++- ++- number = sixteenthPoints*8; ++- for(;number 
< num_points; number++){ ++- *realpt += ((*aPtr) * (*bPtr++)); ++- *imagpt += ((*aPtr++) * (*bPtr++)); ++- } ++- ++- *result = *(lv_32fc_t*)(&res[0]); +++static inline void volk_16i_32fc_dot_prod_32fc_u_sse(lv_32fc_t* result, +++ const short* input, +++ const lv_32fc_t* taps, +++ unsigned int num_points) +++{ +++ +++ unsigned int number = 0; +++ const unsigned int sixteenthPoints = num_points / 8; +++ +++ float res[2]; +++ float *realpt = &res[0], *imagpt = &res[1]; +++ const short* aPtr = input; +++ const float* bPtr = (float*)taps; +++ +++ __m64 m0, m1; +++ __m128 f0, f1, f2, f3; +++ __m128 a0Val, a1Val, a2Val, a3Val; +++ __m128 b0Val, b1Val, b2Val, b3Val; +++ __m128 c0Val, c1Val, c2Val, c3Val; +++ +++ __m128 dotProdVal0 = _mm_setzero_ps(); +++ __m128 dotProdVal1 = _mm_setzero_ps(); +++ __m128 dotProdVal2 = _mm_setzero_ps(); +++ __m128 dotProdVal3 = _mm_setzero_ps(); +++ +++ for (; number < sixteenthPoints; number++) { +++ +++ m0 = _mm_set_pi16(*(aPtr + 3), *(aPtr + 2), *(aPtr + 1), *(aPtr + 0)); +++ m1 = _mm_set_pi16(*(aPtr + 7), *(aPtr + 6), *(aPtr + 5), *(aPtr + 4)); +++ f0 = _mm_cvtpi16_ps(m0); +++ f1 = _mm_cvtpi16_ps(m0); +++ f2 = _mm_cvtpi16_ps(m1); +++ f3 = _mm_cvtpi16_ps(m1); +++ +++ a0Val = _mm_unpacklo_ps(f0, f1); +++ a1Val = _mm_unpackhi_ps(f0, f1); +++ a2Val = _mm_unpacklo_ps(f2, f3); +++ a3Val = _mm_unpackhi_ps(f2, f3); +++ +++ b0Val = _mm_loadu_ps(bPtr); +++ b1Val = _mm_loadu_ps(bPtr + 4); +++ b2Val = _mm_loadu_ps(bPtr + 8); +++ b3Val = _mm_loadu_ps(bPtr + 12); +++ +++ c0Val = _mm_mul_ps(a0Val, b0Val); +++ c1Val = _mm_mul_ps(a1Val, b1Val); +++ c2Val = _mm_mul_ps(a2Val, b2Val); +++ c3Val = _mm_mul_ps(a3Val, b3Val); +++ +++ dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0); +++ dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1); +++ dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2); +++ dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3); +++ +++ aPtr += 8; +++ bPtr += 16; +++ } +++ +++ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1); +++ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2); +++ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3); +++ +++ __VOLK_ATTR_ALIGNED(16) float dotProductVector[4]; +++ +++ _mm_store_ps(dotProductVector, +++ dotProdVal0); // Store the results back into the dot product vector +++ +++ *realpt = dotProductVector[0]; +++ *imagpt = dotProductVector[1]; +++ *realpt += dotProductVector[2]; +++ *imagpt += dotProductVector[3]; +++ +++ number = sixteenthPoints * 8; +++ for (; number < num_points; number++) { +++ *realpt += ((*aPtr) * (*bPtr++)); +++ *imagpt += ((*aPtr++) * (*bPtr++)); +++ } +++ +++ *result = *(lv_32fc_t*)(&res[0]); ++ } ++ ++ #endif /*LV_HAVE_SSE && LV_HAVE_MMX*/ ++@@ -224,85 +237,90 @@ static inline void volk_16i_32fc_dot_prod_32fc_u_sse( lv_32fc_t* result, const ++ ++ #if LV_HAVE_AVX2 && LV_HAVE_FMA ++ ++-static inline void volk_16i_32fc_dot_prod_32fc_u_avx2_fma( lv_32fc_t* result, const short* input, const lv_32fc_t* taps, unsigned int num_points) { ++- ++- unsigned int number = 0; ++- const unsigned int sixteenthPoints = num_points / 16; ++- ++- float res[2]; ++- float *realpt = &res[0], *imagpt = &res[1]; ++- const short* aPtr = input; ++- const float* bPtr = (float*)taps; ++- ++- __m128i m0, m1; ++- __m256i f0, f1; ++- __m256 g0, g1, h0, h1, h2, h3; ++- __m256 a0Val, a1Val, a2Val, a3Val; ++- __m256 b0Val, b1Val, b2Val, b3Val; ++- ++- __m256 dotProdVal0 = _mm256_setzero_ps(); ++- __m256 dotProdVal1 = _mm256_setzero_ps(); ++- __m256 dotProdVal2 = _mm256_setzero_ps(); ++- __m256 dotProdVal3 = _mm256_setzero_ps(); ++- ++- for(;number < 
sixteenthPoints; number++){ ++- ++- m0 = _mm_loadu_si128((__m128i const*) aPtr); ++- m1 = _mm_loadu_si128((__m128i const*)(aPtr+8)); ++- ++- f0 = _mm256_cvtepi16_epi32(m0); ++- g0 = _mm256_cvtepi32_ps(f0); ++- f1 = _mm256_cvtepi16_epi32(m1); ++- g1 = _mm256_cvtepi32_ps(f1); ++- ++- h0 = _mm256_unpacklo_ps(g0, g0); ++- h1 = _mm256_unpackhi_ps(g0, g0); ++- h2 = _mm256_unpacklo_ps(g1, g1); ++- h3 = _mm256_unpackhi_ps(g1, g1); ++- ++- a0Val = _mm256_permute2f128_ps(h0, h1, 0x20); ++- a1Val = _mm256_permute2f128_ps(h0, h1, 0x31); ++- a2Val = _mm256_permute2f128_ps(h2, h3, 0x20); ++- a3Val = _mm256_permute2f128_ps(h2, h3, 0x31); ++- ++- b0Val = _mm256_loadu_ps(bPtr); ++- b1Val = _mm256_loadu_ps(bPtr+8); ++- b2Val = _mm256_loadu_ps(bPtr+16); ++- b3Val = _mm256_loadu_ps(bPtr+24); ++- ++- dotProdVal0 = _mm256_fmadd_ps(a0Val,b0Val,dotProdVal0); ++- dotProdVal1 = _mm256_fmadd_ps(a1Val,b1Val,dotProdVal1); ++- dotProdVal2 = _mm256_fmadd_ps(a2Val,b2Val,dotProdVal2); ++- dotProdVal3 = _mm256_fmadd_ps(a3Val,b3Val,dotProdVal3); ++- ++- aPtr += 16; ++- bPtr += 32; ++- } ++- ++- dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1); ++- dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2); ++- dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3); ++- ++- __VOLK_ATTR_ALIGNED(32) float dotProductVector[8]; ++- ++- _mm256_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector ++- ++- *realpt = dotProductVector[0]; ++- *imagpt = dotProductVector[1]; ++- *realpt += dotProductVector[2]; ++- *imagpt += dotProductVector[3]; ++- *realpt += dotProductVector[4]; ++- *imagpt += dotProductVector[5]; ++- *realpt += dotProductVector[6]; ++- *imagpt += dotProductVector[7]; ++- ++- number = sixteenthPoints*16; ++- for(;number < num_points; number++){ ++- *realpt += ((*aPtr) * (*bPtr++)); ++- *imagpt += ((*aPtr++) * (*bPtr++)); ++- } ++- ++- *result = *(lv_32fc_t*)(&res[0]); +++static inline void volk_16i_32fc_dot_prod_32fc_u_avx2_fma(lv_32fc_t* result, +++ const short* input, +++ const lv_32fc_t* taps, +++ unsigned int num_points) +++{ +++ +++ unsigned int number = 0; +++ const unsigned int sixteenthPoints = num_points / 16; +++ +++ float res[2]; +++ float *realpt = &res[0], *imagpt = &res[1]; +++ const short* aPtr = input; +++ const float* bPtr = (float*)taps; +++ +++ __m128i m0, m1; +++ __m256i f0, f1; +++ __m256 g0, g1, h0, h1, h2, h3; +++ __m256 a0Val, a1Val, a2Val, a3Val; +++ __m256 b0Val, b1Val, b2Val, b3Val; +++ +++ __m256 dotProdVal0 = _mm256_setzero_ps(); +++ __m256 dotProdVal1 = _mm256_setzero_ps(); +++ __m256 dotProdVal2 = _mm256_setzero_ps(); +++ __m256 dotProdVal3 = _mm256_setzero_ps(); +++ +++ for (; number < sixteenthPoints; number++) { +++ +++ m0 = _mm_loadu_si128((__m128i const*)aPtr); +++ m1 = _mm_loadu_si128((__m128i const*)(aPtr + 8)); +++ +++ f0 = _mm256_cvtepi16_epi32(m0); +++ g0 = _mm256_cvtepi32_ps(f0); +++ f1 = _mm256_cvtepi16_epi32(m1); +++ g1 = _mm256_cvtepi32_ps(f1); +++ +++ h0 = _mm256_unpacklo_ps(g0, g0); +++ h1 = _mm256_unpackhi_ps(g0, g0); +++ h2 = _mm256_unpacklo_ps(g1, g1); +++ h3 = _mm256_unpackhi_ps(g1, g1); +++ +++ a0Val = _mm256_permute2f128_ps(h0, h1, 0x20); +++ a1Val = _mm256_permute2f128_ps(h0, h1, 0x31); +++ a2Val = _mm256_permute2f128_ps(h2, h3, 0x20); +++ a3Val = _mm256_permute2f128_ps(h2, h3, 0x31); +++ +++ b0Val = _mm256_loadu_ps(bPtr); +++ b1Val = _mm256_loadu_ps(bPtr + 8); +++ b2Val = _mm256_loadu_ps(bPtr + 16); +++ b3Val = _mm256_loadu_ps(bPtr + 24); +++ +++ dotProdVal0 = _mm256_fmadd_ps(a0Val, b0Val, dotProdVal0); +++ dotProdVal1 = 
_mm256_fmadd_ps(a1Val, b1Val, dotProdVal1); +++ dotProdVal2 = _mm256_fmadd_ps(a2Val, b2Val, dotProdVal2); +++ dotProdVal3 = _mm256_fmadd_ps(a3Val, b3Val, dotProdVal3); +++ +++ aPtr += 16; +++ bPtr += 32; +++ } +++ +++ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1); +++ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2); +++ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3); +++ +++ __VOLK_ATTR_ALIGNED(32) float dotProductVector[8]; +++ +++ _mm256_store_ps(dotProductVector, +++ dotProdVal0); // Store the results back into the dot product vector +++ +++ *realpt = dotProductVector[0]; +++ *imagpt = dotProductVector[1]; +++ *realpt += dotProductVector[2]; +++ *imagpt += dotProductVector[3]; +++ *realpt += dotProductVector[4]; +++ *imagpt += dotProductVector[5]; +++ *realpt += dotProductVector[6]; +++ *imagpt += dotProductVector[7]; +++ +++ number = sixteenthPoints * 16; +++ for (; number < num_points; number++) { +++ *realpt += ((*aPtr) * (*bPtr++)); +++ *imagpt += ((*aPtr++) * (*bPtr++)); +++ } +++ +++ *result = *(lv_32fc_t*)(&res[0]); ++ } ++ ++ #endif /*LV_HAVE_AVX2 && lV_HAVE_FMA*/ ++@@ -310,91 +328,96 @@ static inline void volk_16i_32fc_dot_prod_32fc_u_avx2_fma( lv_32fc_t* result, co ++ ++ #ifdef LV_HAVE_AVX2 ++ ++-static inline void volk_16i_32fc_dot_prod_32fc_u_avx2( lv_32fc_t* result, const short* input, const lv_32fc_t* taps, unsigned int num_points) { ++- ++- unsigned int number = 0; ++- const unsigned int sixteenthPoints = num_points / 16; ++- ++- float res[2]; ++- float *realpt = &res[0], *imagpt = &res[1]; ++- const short* aPtr = input; ++- const float* bPtr = (float*)taps; ++- ++- __m128i m0, m1; ++- __m256i f0, f1; ++- __m256 g0, g1, h0, h1, h2, h3; ++- __m256 a0Val, a1Val, a2Val, a3Val; ++- __m256 b0Val, b1Val, b2Val, b3Val; ++- __m256 c0Val, c1Val, c2Val, c3Val; ++- ++- __m256 dotProdVal0 = _mm256_setzero_ps(); ++- __m256 dotProdVal1 = _mm256_setzero_ps(); ++- __m256 dotProdVal2 = _mm256_setzero_ps(); ++- __m256 dotProdVal3 = _mm256_setzero_ps(); ++- ++- for(;number < sixteenthPoints; number++){ ++- ++- m0 = _mm_loadu_si128((__m128i const*) aPtr); ++- m1 = _mm_loadu_si128((__m128i const*)(aPtr+8)); ++- ++- f0 = _mm256_cvtepi16_epi32(m0); ++- g0 = _mm256_cvtepi32_ps(f0); ++- f1 = _mm256_cvtepi16_epi32(m1); ++- g1 = _mm256_cvtepi32_ps(f1); ++- ++- h0 = _mm256_unpacklo_ps(g0, g0); ++- h1 = _mm256_unpackhi_ps(g0, g0); ++- h2 = _mm256_unpacklo_ps(g1, g1); ++- h3 = _mm256_unpackhi_ps(g1, g1); ++- ++- a0Val = _mm256_permute2f128_ps(h0, h1, 0x20); ++- a1Val = _mm256_permute2f128_ps(h0, h1, 0x31); ++- a2Val = _mm256_permute2f128_ps(h2, h3, 0x20); ++- a3Val = _mm256_permute2f128_ps(h2, h3, 0x31); ++- ++- b0Val = _mm256_loadu_ps(bPtr); ++- b1Val = _mm256_loadu_ps(bPtr+8); ++- b2Val = _mm256_loadu_ps(bPtr+16); ++- b3Val = _mm256_loadu_ps(bPtr+24); ++- ++- c0Val = _mm256_mul_ps(a0Val, b0Val); ++- c1Val = _mm256_mul_ps(a1Val, b1Val); ++- c2Val = _mm256_mul_ps(a2Val, b2Val); ++- c3Val = _mm256_mul_ps(a3Val, b3Val); ++- ++- dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0); ++- dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1); ++- dotProdVal2 = _mm256_add_ps(c2Val, dotProdVal2); ++- dotProdVal3 = _mm256_add_ps(c3Val, dotProdVal3); ++- ++- aPtr += 16; ++- bPtr += 32; ++- } ++- ++- dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1); ++- dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2); ++- dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3); ++- ++- __VOLK_ATTR_ALIGNED(32) float dotProductVector[8]; ++- ++- _mm256_store_ps(dotProductVector,dotProdVal0); // Store the results 
back into the dot product vector ++- ++- *realpt = dotProductVector[0]; ++- *imagpt = dotProductVector[1]; ++- *realpt += dotProductVector[2]; ++- *imagpt += dotProductVector[3]; ++- *realpt += dotProductVector[4]; ++- *imagpt += dotProductVector[5]; ++- *realpt += dotProductVector[6]; ++- *imagpt += dotProductVector[7]; ++- ++- number = sixteenthPoints*16; ++- for(;number < num_points; number++){ ++- *realpt += ((*aPtr) * (*bPtr++)); ++- *imagpt += ((*aPtr++) * (*bPtr++)); ++- } ++- ++- *result = *(lv_32fc_t*)(&res[0]); +++static inline void volk_16i_32fc_dot_prod_32fc_u_avx2(lv_32fc_t* result, +++ const short* input, +++ const lv_32fc_t* taps, +++ unsigned int num_points) +++{ +++ +++ unsigned int number = 0; +++ const unsigned int sixteenthPoints = num_points / 16; +++ +++ float res[2]; +++ float *realpt = &res[0], *imagpt = &res[1]; +++ const short* aPtr = input; +++ const float* bPtr = (float*)taps; +++ +++ __m128i m0, m1; +++ __m256i f0, f1; +++ __m256 g0, g1, h0, h1, h2, h3; +++ __m256 a0Val, a1Val, a2Val, a3Val; +++ __m256 b0Val, b1Val, b2Val, b3Val; +++ __m256 c0Val, c1Val, c2Val, c3Val; +++ +++ __m256 dotProdVal0 = _mm256_setzero_ps(); +++ __m256 dotProdVal1 = _mm256_setzero_ps(); +++ __m256 dotProdVal2 = _mm256_setzero_ps(); +++ __m256 dotProdVal3 = _mm256_setzero_ps(); +++ +++ for (; number < sixteenthPoints; number++) { +++ +++ m0 = _mm_loadu_si128((__m128i const*)aPtr); +++ m1 = _mm_loadu_si128((__m128i const*)(aPtr + 8)); +++ +++ f0 = _mm256_cvtepi16_epi32(m0); +++ g0 = _mm256_cvtepi32_ps(f0); +++ f1 = _mm256_cvtepi16_epi32(m1); +++ g1 = _mm256_cvtepi32_ps(f1); +++ +++ h0 = _mm256_unpacklo_ps(g0, g0); +++ h1 = _mm256_unpackhi_ps(g0, g0); +++ h2 = _mm256_unpacklo_ps(g1, g1); +++ h3 = _mm256_unpackhi_ps(g1, g1); +++ +++ a0Val = _mm256_permute2f128_ps(h0, h1, 0x20); +++ a1Val = _mm256_permute2f128_ps(h0, h1, 0x31); +++ a2Val = _mm256_permute2f128_ps(h2, h3, 0x20); +++ a3Val = _mm256_permute2f128_ps(h2, h3, 0x31); +++ +++ b0Val = _mm256_loadu_ps(bPtr); +++ b1Val = _mm256_loadu_ps(bPtr + 8); +++ b2Val = _mm256_loadu_ps(bPtr + 16); +++ b3Val = _mm256_loadu_ps(bPtr + 24); +++ +++ c0Val = _mm256_mul_ps(a0Val, b0Val); +++ c1Val = _mm256_mul_ps(a1Val, b1Val); +++ c2Val = _mm256_mul_ps(a2Val, b2Val); +++ c3Val = _mm256_mul_ps(a3Val, b3Val); +++ +++ dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0); +++ dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1); +++ dotProdVal2 = _mm256_add_ps(c2Val, dotProdVal2); +++ dotProdVal3 = _mm256_add_ps(c3Val, dotProdVal3); +++ +++ aPtr += 16; +++ bPtr += 32; +++ } +++ +++ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1); +++ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2); +++ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3); +++ +++ __VOLK_ATTR_ALIGNED(32) float dotProductVector[8]; +++ +++ _mm256_store_ps(dotProductVector, +++ dotProdVal0); // Store the results back into the dot product vector +++ +++ *realpt = dotProductVector[0]; +++ *imagpt = dotProductVector[1]; +++ *realpt += dotProductVector[2]; +++ *imagpt += dotProductVector[3]; +++ *realpt += dotProductVector[4]; +++ *imagpt += dotProductVector[5]; +++ *realpt += dotProductVector[6]; +++ *imagpt += dotProductVector[7]; +++ +++ number = sixteenthPoints * 16; +++ for (; number < num_points; number++) { +++ *realpt += ((*aPtr) * (*bPtr++)); +++ *imagpt += ((*aPtr++) * (*bPtr++)); +++ } +++ +++ *result = *(lv_32fc_t*)(&res[0]); ++ } ++ ++ #endif /*LV_HAVE_AVX2*/ ++@@ -403,171 +426,181 @@ static inline void volk_16i_32fc_dot_prod_32fc_u_avx2( lv_32fc_t* result, const ++ #if 
LV_HAVE_SSE && LV_HAVE_MMX ++ ++ ++-static inline void volk_16i_32fc_dot_prod_32fc_a_sse( lv_32fc_t* result, const short* input, const lv_32fc_t* taps, unsigned int num_points) { ++- ++- unsigned int number = 0; ++- const unsigned int sixteenthPoints = num_points / 8; ++- ++- float res[2]; ++- float *realpt = &res[0], *imagpt = &res[1]; ++- const short* aPtr = input; ++- const float* bPtr = (float*)taps; ++- ++- __m64 m0, m1; ++- __m128 f0, f1, f2, f3; ++- __m128 a0Val, a1Val, a2Val, a3Val; ++- __m128 b0Val, b1Val, b2Val, b3Val; ++- __m128 c0Val, c1Val, c2Val, c3Val; ++- ++- __m128 dotProdVal0 = _mm_setzero_ps(); ++- __m128 dotProdVal1 = _mm_setzero_ps(); ++- __m128 dotProdVal2 = _mm_setzero_ps(); ++- __m128 dotProdVal3 = _mm_setzero_ps(); ++- ++- for(;number < sixteenthPoints; number++){ ++- ++- m0 = _mm_set_pi16(*(aPtr+3), *(aPtr+2), *(aPtr+1), *(aPtr+0)); ++- m1 = _mm_set_pi16(*(aPtr+7), *(aPtr+6), *(aPtr+5), *(aPtr+4)); ++- f0 = _mm_cvtpi16_ps(m0); ++- f1 = _mm_cvtpi16_ps(m0); ++- f2 = _mm_cvtpi16_ps(m1); ++- f3 = _mm_cvtpi16_ps(m1); ++- ++- a0Val = _mm_unpacklo_ps(f0, f1); ++- a1Val = _mm_unpackhi_ps(f0, f1); ++- a2Val = _mm_unpacklo_ps(f2, f3); ++- a3Val = _mm_unpackhi_ps(f2, f3); ++- ++- b0Val = _mm_load_ps(bPtr); ++- b1Val = _mm_load_ps(bPtr+4); ++- b2Val = _mm_load_ps(bPtr+8); ++- b3Val = _mm_load_ps(bPtr+12); ++- ++- c0Val = _mm_mul_ps(a0Val, b0Val); ++- c1Val = _mm_mul_ps(a1Val, b1Val); ++- c2Val = _mm_mul_ps(a2Val, b2Val); ++- c3Val = _mm_mul_ps(a3Val, b3Val); ++- ++- dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0); ++- dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1); ++- dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2); ++- dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3); ++- ++- aPtr += 8; ++- bPtr += 16; ++- } ++- ++- dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1); ++- dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2); ++- dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3); ++- ++- __VOLK_ATTR_ALIGNED(16) float dotProductVector[4]; ++- ++- _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector ++- ++- *realpt = dotProductVector[0]; ++- *imagpt = dotProductVector[1]; ++- *realpt += dotProductVector[2]; ++- *imagpt += dotProductVector[3]; ++- ++- number = sixteenthPoints*8; ++- for(;number < num_points; number++){ ++- *realpt += ((*aPtr) * (*bPtr++)); ++- *imagpt += ((*aPtr++) * (*bPtr++)); ++- } ++- ++- *result = *(lv_32fc_t*)(&res[0]); +++static inline void volk_16i_32fc_dot_prod_32fc_a_sse(lv_32fc_t* result, +++ const short* input, +++ const lv_32fc_t* taps, +++ unsigned int num_points) +++{ +++ +++ unsigned int number = 0; +++ const unsigned int sixteenthPoints = num_points / 8; +++ +++ float res[2]; +++ float *realpt = &res[0], *imagpt = &res[1]; +++ const short* aPtr = input; +++ const float* bPtr = (float*)taps; +++ +++ __m64 m0, m1; +++ __m128 f0, f1, f2, f3; +++ __m128 a0Val, a1Val, a2Val, a3Val; +++ __m128 b0Val, b1Val, b2Val, b3Val; +++ __m128 c0Val, c1Val, c2Val, c3Val; +++ +++ __m128 dotProdVal0 = _mm_setzero_ps(); +++ __m128 dotProdVal1 = _mm_setzero_ps(); +++ __m128 dotProdVal2 = _mm_setzero_ps(); +++ __m128 dotProdVal3 = _mm_setzero_ps(); +++ +++ for (; number < sixteenthPoints; number++) { +++ +++ m0 = _mm_set_pi16(*(aPtr + 3), *(aPtr + 2), *(aPtr + 1), *(aPtr + 0)); +++ m1 = _mm_set_pi16(*(aPtr + 7), *(aPtr + 6), *(aPtr + 5), *(aPtr + 4)); +++ f0 = _mm_cvtpi16_ps(m0); +++ f1 = _mm_cvtpi16_ps(m0); +++ f2 = _mm_cvtpi16_ps(m1); +++ f3 = _mm_cvtpi16_ps(m1); +++ +++ a0Val = _mm_unpacklo_ps(f0, f1); +++ a1Val = _mm_unpackhi_ps(f0, 
f1); +++ a2Val = _mm_unpacklo_ps(f2, f3); +++ a3Val = _mm_unpackhi_ps(f2, f3); +++ +++ b0Val = _mm_load_ps(bPtr); +++ b1Val = _mm_load_ps(bPtr + 4); +++ b2Val = _mm_load_ps(bPtr + 8); +++ b3Val = _mm_load_ps(bPtr + 12); +++ +++ c0Val = _mm_mul_ps(a0Val, b0Val); +++ c1Val = _mm_mul_ps(a1Val, b1Val); +++ c2Val = _mm_mul_ps(a2Val, b2Val); +++ c3Val = _mm_mul_ps(a3Val, b3Val); +++ +++ dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0); +++ dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1); +++ dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2); +++ dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3); +++ +++ aPtr += 8; +++ bPtr += 16; +++ } +++ +++ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1); +++ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2); +++ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3); +++ +++ __VOLK_ATTR_ALIGNED(16) float dotProductVector[4]; +++ +++ _mm_store_ps(dotProductVector, +++ dotProdVal0); // Store the results back into the dot product vector +++ +++ *realpt = dotProductVector[0]; +++ *imagpt = dotProductVector[1]; +++ *realpt += dotProductVector[2]; +++ *imagpt += dotProductVector[3]; +++ +++ number = sixteenthPoints * 8; +++ for (; number < num_points; number++) { +++ *realpt += ((*aPtr) * (*bPtr++)); +++ *imagpt += ((*aPtr++) * (*bPtr++)); +++ } +++ +++ *result = *(lv_32fc_t*)(&res[0]); ++ } ++ ++ #endif /*LV_HAVE_SSE && LV_HAVE_MMX*/ ++ ++ #ifdef LV_HAVE_AVX2 ++ ++-static inline void volk_16i_32fc_dot_prod_32fc_a_avx2( lv_32fc_t* result, const short* input, const lv_32fc_t* taps, unsigned int num_points) { ++- ++- unsigned int number = 0; ++- const unsigned int sixteenthPoints = num_points / 16; ++- ++- float res[2]; ++- float *realpt = &res[0], *imagpt = &res[1]; ++- const short* aPtr = input; ++- const float* bPtr = (float*)taps; ++- ++- __m128i m0, m1; ++- __m256i f0, f1; ++- __m256 g0, g1, h0, h1, h2, h3; ++- __m256 a0Val, a1Val, a2Val, a3Val; ++- __m256 b0Val, b1Val, b2Val, b3Val; ++- __m256 c0Val, c1Val, c2Val, c3Val; ++- ++- __m256 dotProdVal0 = _mm256_setzero_ps(); ++- __m256 dotProdVal1 = _mm256_setzero_ps(); ++- __m256 dotProdVal2 = _mm256_setzero_ps(); ++- __m256 dotProdVal3 = _mm256_setzero_ps(); ++- ++- for(;number < sixteenthPoints; number++){ ++- ++- m0 = _mm_load_si128((__m128i const*) aPtr); ++- m1 = _mm_load_si128((__m128i const*)(aPtr+8)); ++- ++- f0 = _mm256_cvtepi16_epi32(m0); ++- g0 = _mm256_cvtepi32_ps(f0); ++- f1 = _mm256_cvtepi16_epi32(m1); ++- g1 = _mm256_cvtepi32_ps(f1); ++- ++- h0 = _mm256_unpacklo_ps(g0, g0); ++- h1 = _mm256_unpackhi_ps(g0, g0); ++- h2 = _mm256_unpacklo_ps(g1, g1); ++- h3 = _mm256_unpackhi_ps(g1, g1); ++- ++- a0Val = _mm256_permute2f128_ps(h0, h1, 0x20); ++- a1Val = _mm256_permute2f128_ps(h0, h1, 0x31); ++- a2Val = _mm256_permute2f128_ps(h2, h3, 0x20); ++- a3Val = _mm256_permute2f128_ps(h2, h3, 0x31); ++- ++- b0Val = _mm256_load_ps(bPtr); ++- b1Val = _mm256_load_ps(bPtr+8); ++- b2Val = _mm256_load_ps(bPtr+16); ++- b3Val = _mm256_load_ps(bPtr+24); ++- ++- c0Val = _mm256_mul_ps(a0Val, b0Val); ++- c1Val = _mm256_mul_ps(a1Val, b1Val); ++- c2Val = _mm256_mul_ps(a2Val, b2Val); ++- c3Val = _mm256_mul_ps(a3Val, b3Val); ++- ++- dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0); ++- dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1); ++- dotProdVal2 = _mm256_add_ps(c2Val, dotProdVal2); ++- dotProdVal3 = _mm256_add_ps(c3Val, dotProdVal3); ++- ++- aPtr += 16; ++- bPtr += 32; ++- } ++- ++- dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1); ++- dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2); ++- dotProdVal0 = _mm256_add_ps(dotProdVal0, 
dotProdVal3); ++- ++- __VOLK_ATTR_ALIGNED(32) float dotProductVector[8]; ++- ++- _mm256_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector ++- ++- *realpt = dotProductVector[0]; ++- *imagpt = dotProductVector[1]; ++- *realpt += dotProductVector[2]; ++- *imagpt += dotProductVector[3]; ++- *realpt += dotProductVector[4]; ++- *imagpt += dotProductVector[5]; ++- *realpt += dotProductVector[6]; ++- *imagpt += dotProductVector[7]; ++- ++- number = sixteenthPoints*16; ++- for(;number < num_points; number++){ ++- *realpt += ((*aPtr) * (*bPtr++)); ++- *imagpt += ((*aPtr++) * (*bPtr++)); ++- } ++- ++- *result = *(lv_32fc_t*)(&res[0]); +++static inline void volk_16i_32fc_dot_prod_32fc_a_avx2(lv_32fc_t* result, +++ const short* input, +++ const lv_32fc_t* taps, +++ unsigned int num_points) +++{ +++ +++ unsigned int number = 0; +++ const unsigned int sixteenthPoints = num_points / 16; +++ +++ float res[2]; +++ float *realpt = &res[0], *imagpt = &res[1]; +++ const short* aPtr = input; +++ const float* bPtr = (float*)taps; +++ +++ __m128i m0, m1; +++ __m256i f0, f1; +++ __m256 g0, g1, h0, h1, h2, h3; +++ __m256 a0Val, a1Val, a2Val, a3Val; +++ __m256 b0Val, b1Val, b2Val, b3Val; +++ __m256 c0Val, c1Val, c2Val, c3Val; +++ +++ __m256 dotProdVal0 = _mm256_setzero_ps(); +++ __m256 dotProdVal1 = _mm256_setzero_ps(); +++ __m256 dotProdVal2 = _mm256_setzero_ps(); +++ __m256 dotProdVal3 = _mm256_setzero_ps(); +++ +++ for (; number < sixteenthPoints; number++) { +++ +++ m0 = _mm_load_si128((__m128i const*)aPtr); +++ m1 = _mm_load_si128((__m128i const*)(aPtr + 8)); +++ +++ f0 = _mm256_cvtepi16_epi32(m0); +++ g0 = _mm256_cvtepi32_ps(f0); +++ f1 = _mm256_cvtepi16_epi32(m1); +++ g1 = _mm256_cvtepi32_ps(f1); +++ +++ h0 = _mm256_unpacklo_ps(g0, g0); +++ h1 = _mm256_unpackhi_ps(g0, g0); +++ h2 = _mm256_unpacklo_ps(g1, g1); +++ h3 = _mm256_unpackhi_ps(g1, g1); +++ +++ a0Val = _mm256_permute2f128_ps(h0, h1, 0x20); +++ a1Val = _mm256_permute2f128_ps(h0, h1, 0x31); +++ a2Val = _mm256_permute2f128_ps(h2, h3, 0x20); +++ a3Val = _mm256_permute2f128_ps(h2, h3, 0x31); +++ +++ b0Val = _mm256_load_ps(bPtr); +++ b1Val = _mm256_load_ps(bPtr + 8); +++ b2Val = _mm256_load_ps(bPtr + 16); +++ b3Val = _mm256_load_ps(bPtr + 24); +++ +++ c0Val = _mm256_mul_ps(a0Val, b0Val); +++ c1Val = _mm256_mul_ps(a1Val, b1Val); +++ c2Val = _mm256_mul_ps(a2Val, b2Val); +++ c3Val = _mm256_mul_ps(a3Val, b3Val); +++ +++ dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0); +++ dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1); +++ dotProdVal2 = _mm256_add_ps(c2Val, dotProdVal2); +++ dotProdVal3 = _mm256_add_ps(c3Val, dotProdVal3); +++ +++ aPtr += 16; +++ bPtr += 32; +++ } +++ +++ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1); +++ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2); +++ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3); +++ +++ __VOLK_ATTR_ALIGNED(32) float dotProductVector[8]; +++ +++ _mm256_store_ps(dotProductVector, +++ dotProdVal0); // Store the results back into the dot product vector +++ +++ *realpt = dotProductVector[0]; +++ *imagpt = dotProductVector[1]; +++ *realpt += dotProductVector[2]; +++ *imagpt += dotProductVector[3]; +++ *realpt += dotProductVector[4]; +++ *imagpt += dotProductVector[5]; +++ *realpt += dotProductVector[6]; +++ *imagpt += dotProductVector[7]; +++ +++ number = sixteenthPoints * 16; +++ for (; number < num_points; number++) { +++ *realpt += ((*aPtr) * (*bPtr++)); +++ *imagpt += ((*aPtr++) * (*bPtr++)); +++ } +++ +++ *result = *(lv_32fc_t*)(&res[0]); ++ } ++ ++ ++@@ 
-575,85 +608,90 @@ static inline void volk_16i_32fc_dot_prod_32fc_a_avx2( lv_32fc_t* result, const ++ ++ #if LV_HAVE_AVX2 && LV_HAVE_FMA ++ ++-static inline void volk_16i_32fc_dot_prod_32fc_a_avx2_fma( lv_32fc_t* result, const short* input, const lv_32fc_t* taps, unsigned int num_points) { ++- ++- unsigned int number = 0; ++- const unsigned int sixteenthPoints = num_points / 16; ++- ++- float res[2]; ++- float *realpt = &res[0], *imagpt = &res[1]; ++- const short* aPtr = input; ++- const float* bPtr = (float*)taps; ++- ++- __m128i m0, m1; ++- __m256i f0, f1; ++- __m256 g0, g1, h0, h1, h2, h3; ++- __m256 a0Val, a1Val, a2Val, a3Val; ++- __m256 b0Val, b1Val, b2Val, b3Val; ++- ++- __m256 dotProdVal0 = _mm256_setzero_ps(); ++- __m256 dotProdVal1 = _mm256_setzero_ps(); ++- __m256 dotProdVal2 = _mm256_setzero_ps(); ++- __m256 dotProdVal3 = _mm256_setzero_ps(); ++- ++- for(;number < sixteenthPoints; number++){ ++- ++- m0 = _mm_load_si128((__m128i const*) aPtr); ++- m1 = _mm_load_si128((__m128i const*)(aPtr+8)); ++- ++- f0 = _mm256_cvtepi16_epi32(m0); ++- g0 = _mm256_cvtepi32_ps(f0); ++- f1 = _mm256_cvtepi16_epi32(m1); ++- g1 = _mm256_cvtepi32_ps(f1); ++- ++- h0 = _mm256_unpacklo_ps(g0, g0); ++- h1 = _mm256_unpackhi_ps(g0, g0); ++- h2 = _mm256_unpacklo_ps(g1, g1); ++- h3 = _mm256_unpackhi_ps(g1, g1); ++- ++- a0Val = _mm256_permute2f128_ps(h0, h1, 0x20); ++- a1Val = _mm256_permute2f128_ps(h0, h1, 0x31); ++- a2Val = _mm256_permute2f128_ps(h2, h3, 0x20); ++- a3Val = _mm256_permute2f128_ps(h2, h3, 0x31); ++- ++- b0Val = _mm256_load_ps(bPtr); ++- b1Val = _mm256_load_ps(bPtr+8); ++- b2Val = _mm256_load_ps(bPtr+16); ++- b3Val = _mm256_load_ps(bPtr+24); ++- ++- dotProdVal0 = _mm256_fmadd_ps(a0Val,b0Val,dotProdVal0); ++- dotProdVal1 = _mm256_fmadd_ps(a1Val,b1Val,dotProdVal1); ++- dotProdVal2 = _mm256_fmadd_ps(a2Val,b2Val,dotProdVal2); ++- dotProdVal3 = _mm256_fmadd_ps(a3Val,b3Val,dotProdVal3); ++- ++- aPtr += 16; ++- bPtr += 32; ++- } ++- ++- dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1); ++- dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2); ++- dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3); ++- ++- __VOLK_ATTR_ALIGNED(32) float dotProductVector[8]; ++- ++- _mm256_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector ++- ++- *realpt = dotProductVector[0]; ++- *imagpt = dotProductVector[1]; ++- *realpt += dotProductVector[2]; ++- *imagpt += dotProductVector[3]; ++- *realpt += dotProductVector[4]; ++- *imagpt += dotProductVector[5]; ++- *realpt += dotProductVector[6]; ++- *imagpt += dotProductVector[7]; ++- ++- number = sixteenthPoints*16; ++- for(;number < num_points; number++){ ++- *realpt += ((*aPtr) * (*bPtr++)); ++- *imagpt += ((*aPtr++) * (*bPtr++)); ++- } ++- ++- *result = *(lv_32fc_t*)(&res[0]); +++static inline void volk_16i_32fc_dot_prod_32fc_a_avx2_fma(lv_32fc_t* result, +++ const short* input, +++ const lv_32fc_t* taps, +++ unsigned int num_points) +++{ +++ +++ unsigned int number = 0; +++ const unsigned int sixteenthPoints = num_points / 16; +++ +++ float res[2]; +++ float *realpt = &res[0], *imagpt = &res[1]; +++ const short* aPtr = input; +++ const float* bPtr = (float*)taps; +++ +++ __m128i m0, m1; +++ __m256i f0, f1; +++ __m256 g0, g1, h0, h1, h2, h3; +++ __m256 a0Val, a1Val, a2Val, a3Val; +++ __m256 b0Val, b1Val, b2Val, b3Val; +++ +++ __m256 dotProdVal0 = _mm256_setzero_ps(); +++ __m256 dotProdVal1 = _mm256_setzero_ps(); +++ __m256 dotProdVal2 = _mm256_setzero_ps(); +++ __m256 dotProdVal3 = _mm256_setzero_ps(); +++ +++ for (; 
number < sixteenthPoints; number++) { +++ +++ m0 = _mm_load_si128((__m128i const*)aPtr); +++ m1 = _mm_load_si128((__m128i const*)(aPtr + 8)); +++ +++ f0 = _mm256_cvtepi16_epi32(m0); +++ g0 = _mm256_cvtepi32_ps(f0); +++ f1 = _mm256_cvtepi16_epi32(m1); +++ g1 = _mm256_cvtepi32_ps(f1); +++ +++ h0 = _mm256_unpacklo_ps(g0, g0); +++ h1 = _mm256_unpackhi_ps(g0, g0); +++ h2 = _mm256_unpacklo_ps(g1, g1); +++ h3 = _mm256_unpackhi_ps(g1, g1); +++ +++ a0Val = _mm256_permute2f128_ps(h0, h1, 0x20); +++ a1Val = _mm256_permute2f128_ps(h0, h1, 0x31); +++ a2Val = _mm256_permute2f128_ps(h2, h3, 0x20); +++ a3Val = _mm256_permute2f128_ps(h2, h3, 0x31); +++ +++ b0Val = _mm256_load_ps(bPtr); +++ b1Val = _mm256_load_ps(bPtr + 8); +++ b2Val = _mm256_load_ps(bPtr + 16); +++ b3Val = _mm256_load_ps(bPtr + 24); +++ +++ dotProdVal0 = _mm256_fmadd_ps(a0Val, b0Val, dotProdVal0); +++ dotProdVal1 = _mm256_fmadd_ps(a1Val, b1Val, dotProdVal1); +++ dotProdVal2 = _mm256_fmadd_ps(a2Val, b2Val, dotProdVal2); +++ dotProdVal3 = _mm256_fmadd_ps(a3Val, b3Val, dotProdVal3); +++ +++ aPtr += 16; +++ bPtr += 32; +++ } +++ +++ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1); +++ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2); +++ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3); +++ +++ __VOLK_ATTR_ALIGNED(32) float dotProductVector[8]; +++ +++ _mm256_store_ps(dotProductVector, +++ dotProdVal0); // Store the results back into the dot product vector +++ +++ *realpt = dotProductVector[0]; +++ *imagpt = dotProductVector[1]; +++ *realpt += dotProductVector[2]; +++ *imagpt += dotProductVector[3]; +++ *realpt += dotProductVector[4]; +++ *imagpt += dotProductVector[5]; +++ *realpt += dotProductVector[6]; +++ *imagpt += dotProductVector[7]; +++ +++ number = sixteenthPoints * 16; +++ for (; number < num_points; number++) { +++ *realpt += ((*aPtr) * (*bPtr++)); +++ *imagpt += ((*aPtr++) * (*bPtr++)); +++ } +++ +++ *result = *(lv_32fc_t*)(&res[0]); ++ } ++ ++ ++diff --git a/kernels/volk/volk_16i_branch_4_state_8.h b/kernels/volk/volk_16i_branch_4_state_8.h ++index 31b66cc..4d00b6b 100644 ++--- a/kernels/volk/volk_16i_branch_4_state_8.h +++++ b/kernels/volk/volk_16i_branch_4_state_8.h ++@@ -29,8 +29,8 @@ ++ * ++ * Dispatcher Prototype ++ * \code ++- * void volk_16i_branch_4_state_8(short* target, short* src0, char** permuters, short* cntl2, short* cntl3, short* scalars) ++- * \endcode +++ * void volk_16i_branch_4_state_8(short* target, short* src0, char** permuters, short* +++ * cntl2, short* cntl3, short* scalars) \endcode ++ * ++ * \b Inputs ++ * \li src0: ++@@ -61,155 +61,154 @@ ++ ++ #ifdef LV_HAVE_SSSE3 ++ ++-#include ++ #include ++ #include +++#include ++ ++-static inline void ++-volk_16i_branch_4_state_8_a_ssse3(short* target, short* src0, char** permuters, short* cntl2, short* cntl3, short* scalars) +++static inline void volk_16i_branch_4_state_8_a_ssse3(short* target, +++ short* src0, +++ char** permuters, +++ short* cntl2, +++ short* cntl3, +++ short* scalars) ++ { ++- __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11; ++- __m128i *p_target, *p_src0, *p_cntl2, *p_cntl3, *p_scalars; +++ __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11; +++ __m128i *p_target, *p_src0, *p_cntl2, *p_cntl3, *p_scalars; ++ ++- p_target = (__m128i*)target; ++- p_src0 = (__m128i*)src0; ++- p_cntl2 = (__m128i*)cntl2; ++- p_cntl3 = (__m128i*)cntl3; ++- p_scalars = (__m128i*)scalars; +++ p_target = (__m128i*)target; +++ p_src0 = (__m128i*)src0; +++ p_cntl2 = (__m128i*)cntl2; +++ p_cntl3 
= (__m128i*)cntl3; +++ p_scalars = (__m128i*)scalars; ++ ++- xmm0 = _mm_load_si128(p_scalars); +++ xmm0 = _mm_load_si128(p_scalars); ++ ++- xmm1 = _mm_shufflelo_epi16(xmm0, 0); ++- xmm2 = _mm_shufflelo_epi16(xmm0, 0x55); ++- xmm3 = _mm_shufflelo_epi16(xmm0, 0xaa); ++- xmm4 = _mm_shufflelo_epi16(xmm0, 0xff); +++ xmm1 = _mm_shufflelo_epi16(xmm0, 0); +++ xmm2 = _mm_shufflelo_epi16(xmm0, 0x55); +++ xmm3 = _mm_shufflelo_epi16(xmm0, 0xaa); +++ xmm4 = _mm_shufflelo_epi16(xmm0, 0xff); ++ ++- xmm1 = _mm_shuffle_epi32(xmm1, 0x00); ++- xmm2 = _mm_shuffle_epi32(xmm2, 0x00); ++- xmm3 = _mm_shuffle_epi32(xmm3, 0x00); ++- xmm4 = _mm_shuffle_epi32(xmm4, 0x00); +++ xmm1 = _mm_shuffle_epi32(xmm1, 0x00); +++ xmm2 = _mm_shuffle_epi32(xmm2, 0x00); +++ xmm3 = _mm_shuffle_epi32(xmm3, 0x00); +++ xmm4 = _mm_shuffle_epi32(xmm4, 0x00); ++ ++- xmm0 = _mm_load_si128((__m128i*)permuters[0]); ++- xmm6 = _mm_load_si128((__m128i*)permuters[1]); ++- xmm8 = _mm_load_si128((__m128i*)permuters[2]); ++- xmm10 = _mm_load_si128((__m128i*)permuters[3]); +++ xmm0 = _mm_load_si128((__m128i*)permuters[0]); +++ xmm6 = _mm_load_si128((__m128i*)permuters[1]); +++ xmm8 = _mm_load_si128((__m128i*)permuters[2]); +++ xmm10 = _mm_load_si128((__m128i*)permuters[3]); ++ ++- xmm5 = _mm_load_si128(p_src0); ++- xmm0 = _mm_shuffle_epi8(xmm5, xmm0); ++- xmm6 = _mm_shuffle_epi8(xmm5, xmm6); ++- xmm8 = _mm_shuffle_epi8(xmm5, xmm8); ++- xmm10 = _mm_shuffle_epi8(xmm5, xmm10); +++ xmm5 = _mm_load_si128(p_src0); +++ xmm0 = _mm_shuffle_epi8(xmm5, xmm0); +++ xmm6 = _mm_shuffle_epi8(xmm5, xmm6); +++ xmm8 = _mm_shuffle_epi8(xmm5, xmm8); +++ xmm10 = _mm_shuffle_epi8(xmm5, xmm10); ++ ++- xmm5 = _mm_add_epi16(xmm1, xmm2); +++ xmm5 = _mm_add_epi16(xmm1, xmm2); ++ ++- xmm6 = _mm_add_epi16(xmm2, xmm6); ++- xmm8 = _mm_add_epi16(xmm1, xmm8); +++ xmm6 = _mm_add_epi16(xmm2, xmm6); +++ xmm8 = _mm_add_epi16(xmm1, xmm8); ++ ++- xmm7 = _mm_load_si128(p_cntl2); ++- xmm9 = _mm_load_si128(p_cntl3); +++ xmm7 = _mm_load_si128(p_cntl2); +++ xmm9 = _mm_load_si128(p_cntl3); ++ ++- xmm0 = _mm_add_epi16(xmm5, xmm0); +++ xmm0 = _mm_add_epi16(xmm5, xmm0); ++ ++- xmm7 = _mm_and_si128(xmm7, xmm3); ++- xmm9 = _mm_and_si128(xmm9, xmm4); +++ xmm7 = _mm_and_si128(xmm7, xmm3); +++ xmm9 = _mm_and_si128(xmm9, xmm4); ++ ++- xmm5 = _mm_load_si128(&p_cntl2[1]); ++- xmm11 = _mm_load_si128(&p_cntl3[1]); +++ xmm5 = _mm_load_si128(&p_cntl2[1]); +++ xmm11 = _mm_load_si128(&p_cntl3[1]); ++ ++- xmm7 = _mm_add_epi16(xmm7, xmm9); +++ xmm7 = _mm_add_epi16(xmm7, xmm9); ++ ++- xmm5 = _mm_and_si128(xmm5, xmm3); ++- xmm11 = _mm_and_si128(xmm11, xmm4); +++ xmm5 = _mm_and_si128(xmm5, xmm3); +++ xmm11 = _mm_and_si128(xmm11, xmm4); ++ ++- xmm0 = _mm_add_epi16(xmm0, xmm7); +++ xmm0 = _mm_add_epi16(xmm0, xmm7); ++ ++ ++- xmm7 = _mm_load_si128(&p_cntl2[2]); ++- xmm9 = _mm_load_si128(&p_cntl3[2]); +++ xmm7 = _mm_load_si128(&p_cntl2[2]); +++ xmm9 = _mm_load_si128(&p_cntl3[2]); ++ ++- xmm5 = _mm_add_epi16(xmm5, xmm11); +++ xmm5 = _mm_add_epi16(xmm5, xmm11); ++ ++- xmm7 = _mm_and_si128(xmm7, xmm3); ++- xmm9 = _mm_and_si128(xmm9, xmm4); +++ xmm7 = _mm_and_si128(xmm7, xmm3); +++ xmm9 = _mm_and_si128(xmm9, xmm4); ++ ++- xmm6 = _mm_add_epi16(xmm6, xmm5); +++ xmm6 = _mm_add_epi16(xmm6, xmm5); ++ ++ ++- xmm5 = _mm_load_si128(&p_cntl2[3]); ++- xmm11 = _mm_load_si128(&p_cntl3[3]); +++ xmm5 = _mm_load_si128(&p_cntl2[3]); +++ xmm11 = _mm_load_si128(&p_cntl3[3]); ++ ++- xmm7 = _mm_add_epi16(xmm7, xmm9); +++ xmm7 = _mm_add_epi16(xmm7, xmm9); ++ ++- xmm5 = _mm_and_si128(xmm5, xmm3); ++- xmm11 = _mm_and_si128(xmm11, xmm4); +++ xmm5 
= _mm_and_si128(xmm5, xmm3); +++ xmm11 = _mm_and_si128(xmm11, xmm4); ++ ++- xmm8 = _mm_add_epi16(xmm8, xmm7); +++ xmm8 = _mm_add_epi16(xmm8, xmm7); ++ ++- xmm5 = _mm_add_epi16(xmm5, xmm11); +++ xmm5 = _mm_add_epi16(xmm5, xmm11); ++ ++- _mm_store_si128(p_target, xmm0); ++- _mm_store_si128(&p_target[1], xmm6); +++ _mm_store_si128(p_target, xmm0); +++ _mm_store_si128(&p_target[1], xmm6); ++ ++- xmm10 = _mm_add_epi16(xmm5, xmm10); +++ xmm10 = _mm_add_epi16(xmm5, xmm10); ++ ++- _mm_store_si128(&p_target[2], xmm8); +++ _mm_store_si128(&p_target[2], xmm8); ++ ++- _mm_store_si128(&p_target[3], xmm10); +++ _mm_store_si128(&p_target[3], xmm10); ++ } ++ ++ ++ #endif /*LV_HAVE_SSEs*/ ++ ++ #ifdef LV_HAVE_GENERIC ++-static inline void ++-volk_16i_branch_4_state_8_generic(short* target, short* src0, char** permuters, short* cntl2, short* cntl3, short* scalars) +++static inline void volk_16i_branch_4_state_8_generic(short* target, +++ short* src0, +++ char** permuters, +++ short* cntl2, +++ short* cntl3, +++ short* scalars) ++ { ++- int i = 0; ++- ++- int bound = 4; ++- ++- for(; i < bound; ++i) { ++- target[i* 8] = src0[((char)permuters[i][0])/2] ++- + ((i + 1)%2 * scalars[0]) ++- + (((i >> 1)^1) * scalars[1]) ++- + (cntl2[i * 8] & scalars[2]) ++- + (cntl3[i * 8] & scalars[3]); ++- target[i* 8 + 1] = src0[((char)permuters[i][1 * 2])/2] ++- + ((i + 1)%2 * scalars[0]) ++- + (((i >> 1)^1) * scalars[1]) ++- + (cntl2[i * 8 + 1] & scalars[2]) ++- + (cntl3[i * 8 + 1] & scalars[3]); ++- target[i* 8 + 2] = src0[((char)permuters[i][2 * 2])/2] ++- + ((i + 1)%2 * scalars[0]) ++- + (((i >> 1)^1) * scalars[1]) ++- + (cntl2[i * 8 + 2] & scalars[2]) ++- + (cntl3[i * 8 + 2] & scalars[3]); ++- target[i* 8 + 3] = src0[((char)permuters[i][3 * 2])/2] ++- + ((i + 1)%2 * scalars[0]) ++- + (((i >> 1)^1) * scalars[1]) ++- + (cntl2[i * 8 + 3] & scalars[2]) ++- + (cntl3[i * 8 + 3] & scalars[3]); ++- target[i* 8 + 4] = src0[((char)permuters[i][4 * 2])/2] ++- + ((i + 1)%2 * scalars[0]) ++- + (((i >> 1)^1) * scalars[1]) ++- + (cntl2[i * 8 + 4] & scalars[2]) ++- + (cntl3[i * 8 + 4] & scalars[3]); ++- target[i* 8 + 5] = src0[((char)permuters[i][5 * 2])/2] ++- + ((i + 1)%2 * scalars[0]) ++- + (((i >> 1)^1) * scalars[1]) ++- + (cntl2[i * 8 + 5] & scalars[2]) ++- + (cntl3[i * 8 + 5] & scalars[3]); ++- target[i* 8 + 6] = src0[((char)permuters[i][6 * 2])/2] ++- + ((i + 1)%2 * scalars[0]) ++- + (((i >> 1)^1) * scalars[1]) ++- + (cntl2[i * 8 + 6] & scalars[2]) ++- + (cntl3[i * 8 + 6] & scalars[3]); ++- target[i* 8 + 7] = src0[((char)permuters[i][7 * 2])/2] ++- + ((i + 1)%2 * scalars[0]) ++- + (((i >> 1)^1) * scalars[1]) ++- + (cntl2[i * 8 + 7] & scalars[2]) ++- + (cntl3[i * 8 + 7] & scalars[3]); ++- } +++ int i = 0; +++ +++ int bound = 4; +++ +++ for (; i < bound; ++i) { +++ target[i * 8] = src0[((char)permuters[i][0]) / 2] + ((i + 1) % 2 * scalars[0]) + +++ (((i >> 1) ^ 1) * scalars[1]) + (cntl2[i * 8] & scalars[2]) + +++ (cntl3[i * 8] & scalars[3]); +++ target[i * 8 + 1] = src0[((char)permuters[i][1 * 2]) / 2] + +++ ((i + 1) % 2 * scalars[0]) + (((i >> 1) ^ 1) * scalars[1]) + +++ (cntl2[i * 8 + 1] & scalars[2]) + +++ (cntl3[i * 8 + 1] & scalars[3]); +++ target[i * 8 + 2] = src0[((char)permuters[i][2 * 2]) / 2] + +++ ((i + 1) % 2 * scalars[0]) + (((i >> 1) ^ 1) * scalars[1]) + +++ (cntl2[i * 8 + 2] & scalars[2]) + +++ (cntl3[i * 8 + 2] & scalars[3]); +++ target[i * 8 + 3] = src0[((char)permuters[i][3 * 2]) / 2] + +++ ((i + 1) % 2 * scalars[0]) + (((i >> 1) ^ 1) * scalars[1]) + +++ (cntl2[i * 8 + 3] & scalars[2]) + +++ (cntl3[i * 8 + 3] & 
scalars[3]); +++ target[i * 8 + 4] = src0[((char)permuters[i][4 * 2]) / 2] + +++ ((i + 1) % 2 * scalars[0]) + (((i >> 1) ^ 1) * scalars[1]) + +++ (cntl2[i * 8 + 4] & scalars[2]) + +++ (cntl3[i * 8 + 4] & scalars[3]); +++ target[i * 8 + 5] = src0[((char)permuters[i][5 * 2]) / 2] + +++ ((i + 1) % 2 * scalars[0]) + (((i >> 1) ^ 1) * scalars[1]) + +++ (cntl2[i * 8 + 5] & scalars[2]) + +++ (cntl3[i * 8 + 5] & scalars[3]); +++ target[i * 8 + 6] = src0[((char)permuters[i][6 * 2]) / 2] + +++ ((i + 1) % 2 * scalars[0]) + (((i >> 1) ^ 1) * scalars[1]) + +++ (cntl2[i * 8 + 6] & scalars[2]) + +++ (cntl3[i * 8 + 6] & scalars[3]); +++ target[i * 8 + 7] = src0[((char)permuters[i][7 * 2]) / 2] + +++ ((i + 1) % 2 * scalars[0]) + (((i >> 1) ^ 1) * scalars[1]) + +++ (cntl2[i * 8 + 7] & scalars[2]) + +++ (cntl3[i * 8 + 7] & scalars[3]); +++ } ++ } ++ ++ #endif /*LV_HAVE_GENERIC*/ ++diff --git a/kernels/volk/volk_16i_convert_8i.h b/kernels/volk/volk_16i_convert_8i.h ++index e2f953b..f09515d 100644 ++--- a/kernels/volk/volk_16i_convert_8i.h +++++ b/kernels/volk/volk_16i_convert_8i.h ++@@ -29,8 +29,8 @@ ++ * ++ * Dispatcher Prototype ++ * \code ++- * void volk_16i_convert_8i(int8_t* outputVector, const int16_t* inputVector, unsigned int num_points) ++- * \endcode +++ * void volk_16i_convert_8i(int8_t* outputVector, const int16_t* inputVector, unsigned int +++ * num_points) \endcode ++ * ++ * \b Inputs ++ * \li inputVector: The input vector of 16-bit shorts. ++@@ -59,39 +59,42 @@ ++ #ifdef LV_HAVE_AVX2 ++ #include ++ ++-static inline void ++-volk_16i_convert_8i_u_avx2(int8_t* outputVector, const int16_t* inputVector, unsigned int num_points) +++static inline void volk_16i_convert_8i_u_avx2(int8_t* outputVector, +++ const int16_t* inputVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int thirtysecondPoints = num_points / 32; +++ unsigned int number = 0; +++ const unsigned int thirtysecondPoints = num_points / 32; ++ ++- int8_t* outputVectorPtr = outputVector; ++- int16_t* inputPtr = (int16_t*)inputVector; ++- __m256i inputVal1; ++- __m256i inputVal2; ++- __m256i ret; +++ int8_t* outputVectorPtr = outputVector; +++ int16_t* inputPtr = (int16_t*)inputVector; +++ __m256i inputVal1; +++ __m256i inputVal2; +++ __m256i ret; ++ ++- for(;number < thirtysecondPoints; number++){ +++ for (; number < thirtysecondPoints; number++) { ++ ++- // Load the 16 values ++- inputVal1 = _mm256_loadu_si256((__m256i*)inputPtr); inputPtr += 16; ++- inputVal2 = _mm256_loadu_si256((__m256i*)inputPtr); inputPtr += 16; +++ // Load the 16 values +++ inputVal1 = _mm256_loadu_si256((__m256i*)inputPtr); +++ inputPtr += 16; +++ inputVal2 = _mm256_loadu_si256((__m256i*)inputPtr); +++ inputPtr += 16; ++ ++- inputVal1 = _mm256_srai_epi16(inputVal1, 8); ++- inputVal2 = _mm256_srai_epi16(inputVal2, 8); +++ inputVal1 = _mm256_srai_epi16(inputVal1, 8); +++ inputVal2 = _mm256_srai_epi16(inputVal2, 8); ++ ++- ret = _mm256_packs_epi16(inputVal1, inputVal2); ++- ret = _mm256_permute4x64_epi64(ret, 0b11011000); +++ ret = _mm256_packs_epi16(inputVal1, inputVal2); +++ ret = _mm256_permute4x64_epi64(ret, 0b11011000); ++ ++- _mm256_storeu_si256((__m256i*)outputVectorPtr, ret); +++ _mm256_storeu_si256((__m256i*)outputVectorPtr, ret); ++ ++- outputVectorPtr += 32; ++- } +++ outputVectorPtr += 32; +++ } ++ ++- number = thirtysecondPoints * 32; ++- for(; number < num_points; number++){ ++- outputVector[number] =(int8_t)(inputVector[number] >> 8); ++- } +++ number = thirtysecondPoints * 32; +++ for (; number < num_points; 
number++) { +++ outputVector[number] = (int8_t)(inputVector[number] >> 8); +++ } ++ } ++ #endif /* LV_HAVE_AVX2 */ ++ ++@@ -99,60 +102,62 @@ volk_16i_convert_8i_u_avx2(int8_t* outputVector, const int16_t* inputVector, uns ++ #ifdef LV_HAVE_SSE2 ++ #include ++ ++-static inline void ++-volk_16i_convert_8i_u_sse2(int8_t* outputVector, const int16_t* inputVector, unsigned int num_points) +++static inline void volk_16i_convert_8i_u_sse2(int8_t* outputVector, +++ const int16_t* inputVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int sixteenthPoints = num_points / 16; +++ unsigned int number = 0; +++ const unsigned int sixteenthPoints = num_points / 16; ++ ++- int8_t* outputVectorPtr = outputVector; ++- int16_t* inputPtr = (int16_t*)inputVector; ++- __m128i inputVal1; ++- __m128i inputVal2; ++- __m128i ret; +++ int8_t* outputVectorPtr = outputVector; +++ int16_t* inputPtr = (int16_t*)inputVector; +++ __m128i inputVal1; +++ __m128i inputVal2; +++ __m128i ret; ++ ++- for(;number < sixteenthPoints; number++){ +++ for (; number < sixteenthPoints; number++) { ++ ++- // Load the 16 values ++- inputVal1 = _mm_loadu_si128((__m128i*)inputPtr); inputPtr += 8; ++- inputVal2 = _mm_loadu_si128((__m128i*)inputPtr); inputPtr += 8; +++ // Load the 16 values +++ inputVal1 = _mm_loadu_si128((__m128i*)inputPtr); +++ inputPtr += 8; +++ inputVal2 = _mm_loadu_si128((__m128i*)inputPtr); +++ inputPtr += 8; ++ ++- inputVal1 = _mm_srai_epi16(inputVal1, 8); ++- inputVal2 = _mm_srai_epi16(inputVal2, 8); +++ inputVal1 = _mm_srai_epi16(inputVal1, 8); +++ inputVal2 = _mm_srai_epi16(inputVal2, 8); ++ ++- ret = _mm_packs_epi16(inputVal1, inputVal2); +++ ret = _mm_packs_epi16(inputVal1, inputVal2); ++ ++- _mm_storeu_si128((__m128i*)outputVectorPtr, ret); +++ _mm_storeu_si128((__m128i*)outputVectorPtr, ret); ++ ++- outputVectorPtr += 16; ++- } +++ outputVectorPtr += 16; +++ } ++ ++- number = sixteenthPoints * 16; ++- for(; number < num_points; number++){ ++- outputVector[number] =(int8_t)(inputVector[number] >> 8); ++- } +++ number = sixteenthPoints * 16; +++ for (; number < num_points; number++) { +++ outputVector[number] = (int8_t)(inputVector[number] >> 8); +++ } ++ } ++ #endif /* LV_HAVE_SSE2 */ ++ ++ ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void ++-volk_16i_convert_8i_generic(int8_t* outputVector, const int16_t* inputVector, unsigned int num_points) +++static inline void volk_16i_convert_8i_generic(int8_t* outputVector, +++ const int16_t* inputVector, +++ unsigned int num_points) ++ { ++- int8_t* outputVectorPtr = outputVector; ++- const int16_t* inputVectorPtr = inputVector; ++- unsigned int number = 0; +++ int8_t* outputVectorPtr = outputVector; +++ const int16_t* inputVectorPtr = inputVector; +++ unsigned int number = 0; ++ ++- for(number = 0; number < num_points; number++){ ++- *outputVectorPtr++ = ((int8_t)(*inputVectorPtr++ >> 8)); ++- } +++ for (number = 0; number < num_points; number++) { +++ *outputVectorPtr++ = ((int8_t)(*inputVectorPtr++ >> 8)); +++ } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++ ++- ++- ++ #endif /* INCLUDED_volk_16i_convert_8i_u_H */ ++ #ifndef INCLUDED_volk_16i_convert_8i_a_H ++ #define INCLUDED_volk_16i_convert_8i_a_H ++@@ -163,39 +168,42 @@ volk_16i_convert_8i_generic(int8_t* outputVector, const int16_t* inputVector, un ++ #ifdef LV_HAVE_AVX2 ++ #include ++ ++-static inline void ++-volk_16i_convert_8i_a_avx2(int8_t* outputVector, const int16_t* inputVector, unsigned int num_points) +++static inline void volk_16i_convert_8i_a_avx2(int8_t* 
outputVector, +++ const int16_t* inputVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int thirtysecondPoints = num_points / 32; +++ unsigned int number = 0; +++ const unsigned int thirtysecondPoints = num_points / 32; ++ ++- int8_t* outputVectorPtr = outputVector; ++- int16_t* inputPtr = (int16_t*)inputVector; ++- __m256i inputVal1; ++- __m256i inputVal2; ++- __m256i ret; +++ int8_t* outputVectorPtr = outputVector; +++ int16_t* inputPtr = (int16_t*)inputVector; +++ __m256i inputVal1; +++ __m256i inputVal2; +++ __m256i ret; ++ ++- for(;number < thirtysecondPoints; number++){ +++ for (; number < thirtysecondPoints; number++) { ++ ++- // Load the 16 values ++- inputVal1 = _mm256_load_si256((__m256i*)inputPtr); inputPtr += 16; ++- inputVal2 = _mm256_load_si256((__m256i*)inputPtr); inputPtr += 16; +++ // Load the 16 values +++ inputVal1 = _mm256_load_si256((__m256i*)inputPtr); +++ inputPtr += 16; +++ inputVal2 = _mm256_load_si256((__m256i*)inputPtr); +++ inputPtr += 16; ++ ++- inputVal1 = _mm256_srai_epi16(inputVal1, 8); ++- inputVal2 = _mm256_srai_epi16(inputVal2, 8); +++ inputVal1 = _mm256_srai_epi16(inputVal1, 8); +++ inputVal2 = _mm256_srai_epi16(inputVal2, 8); ++ ++- ret = _mm256_packs_epi16(inputVal1, inputVal2); ++- ret = _mm256_permute4x64_epi64(ret, 0b11011000); +++ ret = _mm256_packs_epi16(inputVal1, inputVal2); +++ ret = _mm256_permute4x64_epi64(ret, 0b11011000); ++ ++- _mm256_store_si256((__m256i*)outputVectorPtr, ret); +++ _mm256_store_si256((__m256i*)outputVectorPtr, ret); ++ ++- outputVectorPtr += 32; ++- } +++ outputVectorPtr += 32; +++ } ++ ++- number = thirtysecondPoints * 32; ++- for(; number < num_points; number++){ ++- outputVector[number] =(int8_t)(inputVector[number] >> 8); ++- } +++ number = thirtysecondPoints * 32; +++ for (; number < num_points; number++) { +++ outputVector[number] = (int8_t)(inputVector[number] >> 8); +++ } ++ } ++ #endif /* LV_HAVE_AVX2 */ ++ ++@@ -203,38 +211,41 @@ volk_16i_convert_8i_a_avx2(int8_t* outputVector, const int16_t* inputVector, uns ++ #ifdef LV_HAVE_SSE2 ++ #include ++ ++-static inline void ++-volk_16i_convert_8i_a_sse2(int8_t* outputVector, const int16_t* inputVector, unsigned int num_points) +++static inline void volk_16i_convert_8i_a_sse2(int8_t* outputVector, +++ const int16_t* inputVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int sixteenthPoints = num_points / 16; +++ unsigned int number = 0; +++ const unsigned int sixteenthPoints = num_points / 16; ++ ++- int8_t* outputVectorPtr = outputVector; ++- int16_t* inputPtr = (int16_t*)inputVector; ++- __m128i inputVal1; ++- __m128i inputVal2; ++- __m128i ret; +++ int8_t* outputVectorPtr = outputVector; +++ int16_t* inputPtr = (int16_t*)inputVector; +++ __m128i inputVal1; +++ __m128i inputVal2; +++ __m128i ret; ++ ++- for(;number < sixteenthPoints; number++){ +++ for (; number < sixteenthPoints; number++) { ++ ++- // Load the 16 values ++- inputVal1 = _mm_load_si128((__m128i*)inputPtr); inputPtr += 8; ++- inputVal2 = _mm_load_si128((__m128i*)inputPtr); inputPtr += 8; +++ // Load the 16 values +++ inputVal1 = _mm_load_si128((__m128i*)inputPtr); +++ inputPtr += 8; +++ inputVal2 = _mm_load_si128((__m128i*)inputPtr); +++ inputPtr += 8; ++ ++- inputVal1 = _mm_srai_epi16(inputVal1, 8); ++- inputVal2 = _mm_srai_epi16(inputVal2, 8); +++ inputVal1 = _mm_srai_epi16(inputVal1, 8); +++ inputVal2 = _mm_srai_epi16(inputVal2, 8); ++ ++- ret = _mm_packs_epi16(inputVal1, inputVal2); +++ ret = _mm_packs_epi16(inputVal1, 
inputVal2); ++ ++- _mm_store_si128((__m128i*)outputVectorPtr, ret); +++ _mm_store_si128((__m128i*)outputVectorPtr, ret); ++ ++- outputVectorPtr += 16; ++- } +++ outputVectorPtr += 16; +++ } ++ ++- number = sixteenthPoints * 16; ++- for(; number < num_points; number++){ ++- outputVector[number] =(int8_t)(inputVector[number] >> 8); ++- } +++ number = sixteenthPoints * 16; +++ for (; number < num_points; number++) { +++ outputVector[number] = (int8_t)(inputVector[number] >> 8); +++ } ++ } ++ #endif /* LV_HAVE_SSE2 */ ++ ++@@ -242,53 +253,55 @@ volk_16i_convert_8i_a_sse2(int8_t* outputVector, const int16_t* inputVector, uns ++ #ifdef LV_HAVE_NEON ++ #include ++ ++-static inline void ++-volk_16i_convert_8i_neon(int8_t* outputVector, const int16_t* inputVector, unsigned int num_points) +++static inline void volk_16i_convert_8i_neon(int8_t* outputVector, +++ const int16_t* inputVector, +++ unsigned int num_points) ++ { ++- int8_t* outputVectorPtr = outputVector; ++- const int16_t* inputVectorPtr = inputVector; ++- unsigned int number = 0; ++- unsigned int sixteenth_points = num_points / 16; ++- ++- int16x8_t inputVal0; ++- int16x8_t inputVal1; ++- int8x8_t outputVal0; ++- int8x8_t outputVal1; ++- int8x16_t outputVal; ++- ++- for(number = 0; number < sixteenth_points; number++){ ++- // load two input vectors ++- inputVal0 = vld1q_s16(inputVectorPtr); ++- inputVal1 = vld1q_s16(inputVectorPtr+8); ++- // shift right ++- outputVal0 = vshrn_n_s16(inputVal0, 8); ++- outputVal1 = vshrn_n_s16(inputVal1, 8); ++- // squash two vectors and write output ++- outputVal = vcombine_s8(outputVal0, outputVal1); ++- vst1q_s8(outputVectorPtr, outputVal); ++- inputVectorPtr += 16; ++- outputVectorPtr += 16; ++- } ++- ++- for(number = sixteenth_points * 16; number < num_points; number++){ ++- *outputVectorPtr++ = ((int8_t)(*inputVectorPtr++ >> 8)); ++- } +++ int8_t* outputVectorPtr = outputVector; +++ const int16_t* inputVectorPtr = inputVector; +++ unsigned int number = 0; +++ unsigned int sixteenth_points = num_points / 16; +++ +++ int16x8_t inputVal0; +++ int16x8_t inputVal1; +++ int8x8_t outputVal0; +++ int8x8_t outputVal1; +++ int8x16_t outputVal; +++ +++ for (number = 0; number < sixteenth_points; number++) { +++ // load two input vectors +++ inputVal0 = vld1q_s16(inputVectorPtr); +++ inputVal1 = vld1q_s16(inputVectorPtr + 8); +++ // shift right +++ outputVal0 = vshrn_n_s16(inputVal0, 8); +++ outputVal1 = vshrn_n_s16(inputVal1, 8); +++ // squash two vectors and write output +++ outputVal = vcombine_s8(outputVal0, outputVal1); +++ vst1q_s8(outputVectorPtr, outputVal); +++ inputVectorPtr += 16; +++ outputVectorPtr += 16; +++ } +++ +++ for (number = sixteenth_points * 16; number < num_points; number++) { +++ *outputVectorPtr++ = ((int8_t)(*inputVectorPtr++ >> 8)); +++ } ++ } ++ #endif /* LV_HAVE_NEON */ ++ ++ ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void ++-volk_16i_convert_8i_a_generic(int8_t* outputVector, const int16_t* inputVector, unsigned int num_points) +++static inline void volk_16i_convert_8i_a_generic(int8_t* outputVector, +++ const int16_t* inputVector, +++ unsigned int num_points) ++ { ++- int8_t* outputVectorPtr = outputVector; ++- const int16_t* inputVectorPtr = inputVector; ++- unsigned int number = 0; +++ int8_t* outputVectorPtr = outputVector; +++ const int16_t* inputVectorPtr = inputVector; +++ unsigned int number = 0; ++ ++- for(number = 0; number < num_points; number++){ ++- *outputVectorPtr++ = ((int8_t)(*inputVectorPtr++ >> 8)); ++- } +++ for (number = 0; number < num_points; number++) { 
+++ *outputVectorPtr++ = ((int8_t)(*inputVectorPtr++ >> 8)); +++ } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++diff --git a/kernels/volk/volk_16i_max_star_16i.h b/kernels/volk/volk_16i_max_star_16i.h ++index 78fd911..d5dad18 100644 ++--- a/kernels/volk/volk_16i_max_star_16i.h +++++ b/kernels/volk/volk_16i_max_star_16i.h ++@@ -53,67 +53,69 @@ ++ #ifndef INCLUDED_volk_16i_max_star_16i_a_H ++ #define INCLUDED_volk_16i_max_star_16i_a_H ++ ++-#include ++-#include +++#include +++#include ++ ++ #ifdef LV_HAVE_SSSE3 ++ ++-#include ++-#include ++-#include +++#include +++#include +++#include ++ ++ static inline void ++ volk_16i_max_star_16i_a_ssse3(short* target, short* src0, unsigned int num_points) ++ { ++- const unsigned int num_bytes = num_points*2; +++ const unsigned int num_bytes = num_points * 2; ++ ++- short candidate = src0[0]; ++- short cands[8]; ++- __m128i xmm0, xmm1, xmm3, xmm4, xmm5, xmm6; +++ short candidate = src0[0]; +++ short cands[8]; +++ __m128i xmm0, xmm1, xmm3, xmm4, xmm5, xmm6; ++ ++- __m128i *p_src0; +++ __m128i* p_src0; ++ ++- p_src0 = (__m128i*)src0; +++ p_src0 = (__m128i*)src0; ++ ++- int bound = num_bytes >> 4; ++- int leftovers = (num_bytes >> 1) & 7; +++ int bound = num_bytes >> 4; +++ int leftovers = (num_bytes >> 1) & 7; ++ ++- int i = 0; +++ int i = 0; ++ ++- xmm1 = _mm_setzero_si128(); ++- xmm0 = _mm_setzero_si128(); ++- //_mm_insert_epi16(xmm0, candidate, 0); +++ xmm1 = _mm_setzero_si128(); +++ xmm0 = _mm_setzero_si128(); +++ //_mm_insert_epi16(xmm0, candidate, 0); ++ ++- xmm0 = _mm_shuffle_epi8(xmm0, xmm1); +++ xmm0 = _mm_shuffle_epi8(xmm0, xmm1); ++ ++- for(i = 0; i < bound; ++i) { ++- xmm1 = _mm_load_si128(p_src0); ++- p_src0 += 1; ++- //xmm2 = _mm_sub_epi16(xmm1, xmm0); +++ for (i = 0; i < bound; ++i) { +++ xmm1 = _mm_load_si128(p_src0); +++ p_src0 += 1; +++ // xmm2 = _mm_sub_epi16(xmm1, xmm0); ++ ++- xmm3 = _mm_cmpgt_epi16(xmm0, xmm1); ++- xmm4 = _mm_cmpeq_epi16(xmm0, xmm1); ++- xmm5 = _mm_cmpgt_epi16(xmm1, xmm0); +++ xmm3 = _mm_cmpgt_epi16(xmm0, xmm1); +++ xmm4 = _mm_cmpeq_epi16(xmm0, xmm1); +++ xmm5 = _mm_cmpgt_epi16(xmm1, xmm0); ++ ++- xmm6 = _mm_xor_si128(xmm4, xmm5); +++ xmm6 = _mm_xor_si128(xmm4, xmm5); ++ ++- xmm3 = _mm_and_si128(xmm3, xmm0); ++- xmm4 = _mm_and_si128(xmm6, xmm1); +++ xmm3 = _mm_and_si128(xmm3, xmm0); +++ xmm4 = _mm_and_si128(xmm6, xmm1); ++ ++- xmm0 = _mm_add_epi16(xmm3, xmm4); ++- } +++ xmm0 = _mm_add_epi16(xmm3, xmm4); +++ } ++ ++- _mm_store_si128((__m128i*)cands, xmm0); +++ _mm_store_si128((__m128i*)cands, xmm0); ++ ++- for(i = 0; i < 8; ++i) { ++- candidate = ((short)(candidate - cands[i]) > 0) ? candidate : cands[i]; ++- } +++ for (i = 0; i < 8; ++i) { +++ candidate = ((short)(candidate - cands[i]) > 0) ? candidate : cands[i]; +++ } ++ ++- for(i = 0; i < leftovers; ++i) { ++- candidate = ((short)(candidate - src0[(bound << 3) + i]) > 0) ? candidate : src0[(bound << 3) + i]; ++- } +++ for (i = 0; i < leftovers; ++i) { +++ candidate = ((short)(candidate - src0[(bound << 3) + i]) > 0) +++ ? 
candidate +++ : src0[(bound << 3) + i]; +++ } ++ ++- target[0] = candidate; +++ target[0] = candidate; ++ } ++ ++ #endif /*LV_HAVE_SSSE3*/ ++@@ -124,38 +126,38 @@ volk_16i_max_star_16i_a_ssse3(short* target, short* src0, unsigned int num_point ++ static inline void ++ volk_16i_max_star_16i_neon(short* target, short* src0, unsigned int num_points) ++ { ++- const unsigned int eighth_points = num_points / 8; ++- unsigned number; ++- int16x8_t input_vec; ++- int16x8_t diff, zeros; ++- uint16x8_t comp1, comp2; ++- zeros = vdupq_n_s16(0); ++- ++- int16x8x2_t tmpvec; ++- ++- int16x8_t candidate_vec = vld1q_dup_s16(src0 ); ++- short candidate; ++- ++src0; ++- ++- for(number=0; number < eighth_points; ++number) { ++- input_vec = vld1q_s16(src0); ++- __VOLK_PREFETCH(src0+16); ++- diff = vsubq_s16(candidate_vec, input_vec); ++- comp1 = vcgeq_s16(diff, zeros); ++- comp2 = vcltq_s16(diff, zeros); ++- ++- tmpvec.val[0] = vandq_s16(candidate_vec, (int16x8_t)comp1); ++- tmpvec.val[1] = vandq_s16(input_vec, (int16x8_t)comp2); ++- ++- candidate_vec = vaddq_s16(tmpvec.val[0], tmpvec.val[1]); ++- src0 += 8; ++- } ++- vst1q_s16(&candidate, candidate_vec); ++- ++- for(number=0; number < num_points%8; number++) { ++- candidate = ((int16_t)(candidate - src0[number]) > 0) ? candidate : src0[number]; ++- } ++- target[0] = candidate; +++ const unsigned int eighth_points = num_points / 8; +++ unsigned number; +++ int16x8_t input_vec; +++ int16x8_t diff, zeros; +++ uint16x8_t comp1, comp2; +++ zeros = vdupq_n_s16(0); +++ +++ int16x8x2_t tmpvec; +++ +++ int16x8_t candidate_vec = vld1q_dup_s16(src0); +++ short candidate; +++ ++src0; +++ +++ for (number = 0; number < eighth_points; ++number) { +++ input_vec = vld1q_s16(src0); +++ __VOLK_PREFETCH(src0 + 16); +++ diff = vsubq_s16(candidate_vec, input_vec); +++ comp1 = vcgeq_s16(diff, zeros); +++ comp2 = vcltq_s16(diff, zeros); +++ +++ tmpvec.val[0] = vandq_s16(candidate_vec, (int16x8_t)comp1); +++ tmpvec.val[1] = vandq_s16(input_vec, (int16x8_t)comp2); +++ +++ candidate_vec = vaddq_s16(tmpvec.val[0], tmpvec.val[1]); +++ src0 += 8; +++ } +++ vst1q_s16(&candidate, candidate_vec); +++ +++ for (number = 0; number < num_points % 8; number++) { +++ candidate = ((int16_t)(candidate - src0[number]) > 0) ? candidate : src0[number]; +++ } +++ target[0] = candidate; ++ } ++ #endif /*LV_HAVE_NEON*/ ++ ++@@ -164,17 +166,17 @@ volk_16i_max_star_16i_neon(short* target, short* src0, unsigned int num_points) ++ static inline void ++ volk_16i_max_star_16i_generic(short* target, short* src0, unsigned int num_points) ++ { ++- const unsigned int num_bytes = num_points*2; +++ const unsigned int num_bytes = num_points * 2; ++ ++- int i = 0; +++ int i = 0; ++ ++- int bound = num_bytes >> 1; +++ int bound = num_bytes >> 1; ++ ++- short candidate = src0[0]; ++- for(i = 1; i < bound; ++i) { ++- candidate = ((short)(candidate - src0[i]) > 0) ? candidate : src0[i]; ++- } ++- target[0] = candidate; +++ short candidate = src0[0]; +++ for (i = 1; i < bound; ++i) { +++ candidate = ((short)(candidate - src0[i]) > 0) ? 
candidate : src0[i]; +++ } +++ target[0] = candidate; ++ } ++ ++ #endif /*LV_HAVE_GENERIC*/ ++diff --git a/kernels/volk/volk_16i_max_star_horizontal_16i.h b/kernels/volk/volk_16i_max_star_horizontal_16i.h ++index 4ffe264..2e1f52b 100644 ++--- a/kernels/volk/volk_16i_max_star_horizontal_16i.h +++++ b/kernels/volk/volk_16i_max_star_horizontal_16i.h ++@@ -29,8 +29,8 @@ ++ * ++ * Dispatcher Prototype ++ * \code ++- * void volk_16i_max_star_horizontal_16i(short* target, short* src0, unsigned int num_points); ++- * \endcode +++ * void volk_16i_max_star_horizontal_16i(short* target, short* src0, unsigned int +++ * num_points); \endcode ++ * ++ * \b Inputs ++ * \li src0: The input vector. ++@@ -55,102 +55,113 @@ ++ ++ #include ++ ++-#include ++-#include +++#include +++#include ++ ++ ++ #ifdef LV_HAVE_SSSE3 ++ ++-#include ++-#include ++-#include +++#include +++#include +++#include ++ ++-static inline void ++-volk_16i_max_star_horizontal_16i_a_ssse3(int16_t* target, int16_t* src0, unsigned int num_points) +++static inline void volk_16i_max_star_horizontal_16i_a_ssse3(int16_t* target, +++ int16_t* src0, +++ unsigned int num_points) ++ { ++- const unsigned int num_bytes = num_points*2; +++ const unsigned int num_bytes = num_points * 2; ++ ++- static const uint8_t shufmask0[16] = {0x00, 0x01, 0x04, 0x05, 0x08, 0x09, 0x0c, 0x0d, 0xff, ++- 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}; ++- static const uint8_t shufmask1[16] = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, ++- 0x01, 0x04, 0x05, 0x08, 0x09, 0x0c, 0x0d}; ++- static const uint8_t andmask0[16] = {0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,0x02, 0x00, ++- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}; ++- static const uint8_t andmask1[16] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, ++- 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02}; +++ static const uint8_t shufmask0[16] = { +++ 0x00, 0x01, 0x04, 0x05, 0x08, 0x09, 0x0c, 0x0d, +++ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff +++ }; +++ static const uint8_t shufmask1[16] = { +++ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, +++ 0x00, 0x01, 0x04, 0x05, 0x08, 0x09, 0x0c, 0x0d +++ }; +++ static const uint8_t andmask0[16] = { +++ 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, +++ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 +++ }; +++ static const uint8_t andmask1[16] = { +++ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, +++ 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02 +++ }; ++ ++- __m128i xmm0, xmm1, xmm2, xmm3, xmm4; ++- __m128i xmm5, xmm6, xmm7, xmm8; +++ __m128i xmm0, xmm1, xmm2, xmm3, xmm4; +++ __m128i xmm5, xmm6, xmm7, xmm8; ++ ++- xmm4 = _mm_load_si128((__m128i*)shufmask0); ++- xmm5 = _mm_load_si128((__m128i*)shufmask1); ++- xmm6 = _mm_load_si128((__m128i*)andmask0); ++- xmm7 = _mm_load_si128((__m128i*)andmask1); +++ xmm4 = _mm_load_si128((__m128i*)shufmask0); +++ xmm5 = _mm_load_si128((__m128i*)shufmask1); +++ xmm6 = _mm_load_si128((__m128i*)andmask0); +++ xmm7 = _mm_load_si128((__m128i*)andmask1); ++ ++- __m128i *p_target, *p_src0; +++ __m128i *p_target, *p_src0; ++ ++- p_target = (__m128i*)target; ++- p_src0 = (__m128i*)src0; +++ p_target = (__m128i*)target; +++ p_src0 = (__m128i*)src0; ++ ++- int bound = num_bytes >> 5; ++- int intermediate = (num_bytes >> 4) & 1; ++- int leftovers = (num_bytes >> 1) & 7; +++ int bound = num_bytes >> 5; +++ int intermediate = (num_bytes >> 4) & 1; +++ int leftovers = (num_bytes >> 1) & 7; ++ ++- int i = 0; +++ int i = 0; ++ ++- for(i = 0; i < bound; ++i) { ++- xmm0 = _mm_load_si128(p_src0); ++- xmm1 = _mm_load_si128(&p_src0[1]); +++ for 
(i = 0; i < bound; ++i) { +++ xmm0 = _mm_load_si128(p_src0); +++ xmm1 = _mm_load_si128(&p_src0[1]); ++ ++- xmm2 = _mm_xor_si128(xmm2, xmm2); ++- p_src0 += 2; +++ xmm2 = _mm_xor_si128(xmm2, xmm2); +++ p_src0 += 2; ++ ++- xmm3 = _mm_hsub_epi16(xmm0, xmm1); +++ xmm3 = _mm_hsub_epi16(xmm0, xmm1); ++ ++- xmm2 = _mm_cmpgt_epi16(xmm2, xmm3); +++ xmm2 = _mm_cmpgt_epi16(xmm2, xmm3); ++ ++- xmm8 = _mm_and_si128(xmm2, xmm6); ++- xmm3 = _mm_and_si128(xmm2, xmm7); +++ xmm8 = _mm_and_si128(xmm2, xmm6); +++ xmm3 = _mm_and_si128(xmm2, xmm7); ++ ++ ++- xmm8 = _mm_add_epi8(xmm8, xmm4); ++- xmm3 = _mm_add_epi8(xmm3, xmm5); +++ xmm8 = _mm_add_epi8(xmm8, xmm4); +++ xmm3 = _mm_add_epi8(xmm3, xmm5); ++ ++- xmm0 = _mm_shuffle_epi8(xmm0, xmm8); ++- xmm1 = _mm_shuffle_epi8(xmm1, xmm3); +++ xmm0 = _mm_shuffle_epi8(xmm0, xmm8); +++ xmm1 = _mm_shuffle_epi8(xmm1, xmm3); ++ ++ ++- xmm3 = _mm_add_epi16(xmm0, xmm1); +++ xmm3 = _mm_add_epi16(xmm0, xmm1); ++ ++ ++- _mm_store_si128(p_target, xmm3); +++ _mm_store_si128(p_target, xmm3); ++ ++- p_target += 1; ++- } +++ p_target += 1; +++ } ++ ++- if (intermediate) { ++- xmm0 = _mm_load_si128(p_src0); +++ if (intermediate) { +++ xmm0 = _mm_load_si128(p_src0); ++ ++- xmm2 = _mm_xor_si128(xmm2, xmm2); ++- p_src0 += 1; +++ xmm2 = _mm_xor_si128(xmm2, xmm2); +++ p_src0 += 1; ++ ++- xmm3 = _mm_hsub_epi16(xmm0, xmm1); ++- xmm2 = _mm_cmpgt_epi16(xmm2, xmm3); +++ xmm3 = _mm_hsub_epi16(xmm0, xmm1); +++ xmm2 = _mm_cmpgt_epi16(xmm2, xmm3); ++ ++- xmm8 = _mm_and_si128(xmm2, xmm6); +++ xmm8 = _mm_and_si128(xmm2, xmm6); ++ ++- xmm3 = _mm_add_epi8(xmm8, xmm4); +++ xmm3 = _mm_add_epi8(xmm8, xmm4); ++ ++- xmm0 = _mm_shuffle_epi8(xmm0, xmm3); +++ xmm0 = _mm_shuffle_epi8(xmm0, xmm3); ++ ++- _mm_storel_pd((double*)p_target, bit128_p(&xmm0)->double_vec); +++ _mm_storel_pd((double*)p_target, bit128_p(&xmm0)->double_vec); ++ ++- p_target = (__m128i*)((int8_t*)p_target + 8); ++- } +++ p_target = (__m128i*)((int8_t*)p_target + 8); +++ } ++ ++- for(i = (bound << 4) + (intermediate << 3); i < (bound << 4) + (intermediate << 3) + leftovers ; i += 2) { ++- target[i>>1] = ((int16_t)(src0[i] - src0[i + 1]) > 0) ? src0[i] : src0[i + 1]; ++- } +++ for (i = (bound << 4) + (intermediate << 3); +++ i < (bound << 4) + (intermediate << 3) + leftovers; +++ i += 2) { +++ target[i >> 1] = ((int16_t)(src0[i] - src0[i + 1]) > 0) ? 
src0[i] : src0[i + 1]; +++ } ++ } ++ ++ #endif /*LV_HAVE_SSSE3*/ ++@@ -158,54 +169,59 @@ volk_16i_max_star_horizontal_16i_a_ssse3(int16_t* target, int16_t* src0, unsigne ++ #ifdef LV_HAVE_NEON ++ ++ #include ++-static inline void ++-volk_16i_max_star_horizontal_16i_neon(int16_t* target, int16_t* src0, unsigned int num_points) +++static inline void volk_16i_max_star_horizontal_16i_neon(int16_t* target, +++ int16_t* src0, +++ unsigned int num_points) ++ { ++- const unsigned int eighth_points = num_points / 16; ++- unsigned number; ++- int16x8x2_t input_vec; ++- int16x8_t diff, max_vec, zeros; ++- uint16x8_t comp1, comp2; ++- zeros = vdupq_n_s16(0); ++- for(number=0; number < eighth_points; ++number) { ++- input_vec = vld2q_s16(src0); ++- //__VOLK_PREFETCH(src0+16); ++- diff = vsubq_s16(input_vec.val[0], input_vec.val[1]); ++- comp1 = vcgeq_s16(diff, zeros); ++- comp2 = vcltq_s16(diff, zeros); ++- ++- input_vec.val[0] = vandq_s16(input_vec.val[0], (int16x8_t)comp1); ++- input_vec.val[1] = vandq_s16(input_vec.val[1], (int16x8_t)comp2); ++- ++- max_vec = vaddq_s16(input_vec.val[0], input_vec.val[1]); ++- vst1q_s16(target, max_vec); ++- src0 += 16; ++- target += 8; ++- } ++- for(number=0; number < num_points%16; number+=2) { ++- target[number >> 1] = ((int16_t)(src0[number] - src0[number + 1]) > 0) ? src0[number] : src0[number+1]; ++- } ++- +++ const unsigned int eighth_points = num_points / 16; +++ unsigned number; +++ int16x8x2_t input_vec; +++ int16x8_t diff, max_vec, zeros; +++ uint16x8_t comp1, comp2; +++ zeros = vdupq_n_s16(0); +++ for (number = 0; number < eighth_points; ++number) { +++ input_vec = vld2q_s16(src0); +++ //__VOLK_PREFETCH(src0+16); +++ diff = vsubq_s16(input_vec.val[0], input_vec.val[1]); +++ comp1 = vcgeq_s16(diff, zeros); +++ comp2 = vcltq_s16(diff, zeros); +++ +++ input_vec.val[0] = vandq_s16(input_vec.val[0], (int16x8_t)comp1); +++ input_vec.val[1] = vandq_s16(input_vec.val[1], (int16x8_t)comp2); +++ +++ max_vec = vaddq_s16(input_vec.val[0], input_vec.val[1]); +++ vst1q_s16(target, max_vec); +++ src0 += 16; +++ target += 8; +++ } +++ for (number = 0; number < num_points % 16; number += 2) { +++ target[number >> 1] = ((int16_t)(src0[number] - src0[number + 1]) > 0) +++ ? src0[number] +++ : src0[number + 1]; +++ } ++ } ++ #endif /* LV_HAVE_NEON */ ++ ++ #ifdef LV_HAVE_NEONV7 ++-extern void volk_16i_max_star_horizontal_16i_a_neonasm(int16_t* target, int16_t* src0, unsigned int num_points); +++extern void volk_16i_max_star_horizontal_16i_a_neonasm(int16_t* target, +++ int16_t* src0, +++ unsigned int num_points); ++ #endif /* LV_HAVE_NEONV7 */ ++ ++ #ifdef LV_HAVE_GENERIC ++-static inline void ++-volk_16i_max_star_horizontal_16i_generic(int16_t* target, int16_t* src0, unsigned int num_points) +++static inline void volk_16i_max_star_horizontal_16i_generic(int16_t* target, +++ int16_t* src0, +++ unsigned int num_points) ++ { ++- const unsigned int num_bytes = num_points*2; +++ const unsigned int num_bytes = num_points * 2; ++ ++- int i = 0; +++ int i = 0; ++ ++- int bound = num_bytes >> 1; +++ int bound = num_bytes >> 1; ++ ++- for(i = 0; i < bound; i += 2) { ++- target[i >> 1] = ((int16_t) (src0[i] - src0[i + 1]) > 0) ? src0[i] : src0[i+1]; ++- } +++ for (i = 0; i < bound; i += 2) { +++ target[i >> 1] = ((int16_t)(src0[i] - src0[i + 1]) > 0) ? 
src0[i] : src0[i + 1]; +++ } ++ } ++ ++ #endif /*LV_HAVE_GENERIC*/ ++diff --git a/kernels/volk/volk_16i_permute_and_scalar_add.h b/kernels/volk/volk_16i_permute_and_scalar_add.h ++index 7fcdad3..0563f07 100644 ++--- a/kernels/volk/volk_16i_permute_and_scalar_add.h +++++ b/kernels/volk/volk_16i_permute_and_scalar_add.h ++@@ -29,8 +29,9 @@ ++ * ++ * Dispatcher Prototype ++ * \code ++- * void volk_16i_permute_and_scalar_add(short* target, short* src0, short* permute_indexes, short* cntl0, short* cntl1, short* cntl2, short* cntl3, short* scalars, unsigned int num_points) ++- * \endcode +++ * void volk_16i_permute_and_scalar_add(short* target, short* src0, short* +++ * permute_indexes, short* cntl0, short* cntl1, short* cntl2, short* cntl3, short* +++ * scalars, unsigned int num_points) \endcode ++ * ++ * \b Inputs ++ * \li src0: The input vector. ++@@ -58,137 +59,143 @@ ++ #ifndef INCLUDED_volk_16i_permute_and_scalar_add_a_H ++ #define INCLUDED_volk_16i_permute_and_scalar_add_a_H ++ ++-#include ++-#include +++#include +++#include ++ ++ #ifdef LV_HAVE_SSE2 ++ ++-#include ++-#include ++- ++-static inline void ++-volk_16i_permute_and_scalar_add_a_sse2(short* target, short* src0, short* permute_indexes, ++- short* cntl0, short* cntl1, short* cntl2, short* cntl3, ++- short* scalars, unsigned int num_points) +++#include +++#include +++ +++static inline void volk_16i_permute_and_scalar_add_a_sse2(short* target, +++ short* src0, +++ short* permute_indexes, +++ short* cntl0, +++ short* cntl1, +++ short* cntl2, +++ short* cntl3, +++ short* scalars, +++ unsigned int num_points) ++ { ++ ++- const unsigned int num_bytes = num_points*2; +++ const unsigned int num_bytes = num_points * 2; ++ ++- __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; +++ __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; ++ ++- __m128i *p_target, *p_cntl0, *p_cntl1, *p_cntl2, *p_cntl3, *p_scalars; +++ __m128i *p_target, *p_cntl0, *p_cntl1, *p_cntl2, *p_cntl3, *p_scalars; ++ ++- short* p_permute_indexes = permute_indexes; +++ short* p_permute_indexes = permute_indexes; ++ ++- p_target = (__m128i*)target; ++- p_cntl0 = (__m128i*)cntl0; ++- p_cntl1 = (__m128i*)cntl1; ++- p_cntl2 = (__m128i*)cntl2; ++- p_cntl3 = (__m128i*)cntl3; ++- p_scalars = (__m128i*)scalars; +++ p_target = (__m128i*)target; +++ p_cntl0 = (__m128i*)cntl0; +++ p_cntl1 = (__m128i*)cntl1; +++ p_cntl2 = (__m128i*)cntl2; +++ p_cntl3 = (__m128i*)cntl3; +++ p_scalars = (__m128i*)scalars; ++ ++- int i = 0; +++ int i = 0; ++ ++- int bound = (num_bytes >> 4); ++- int leftovers = (num_bytes >> 1) & 7; +++ int bound = (num_bytes >> 4); +++ int leftovers = (num_bytes >> 1) & 7; ++ ++- xmm0 = _mm_load_si128(p_scalars); +++ xmm0 = _mm_load_si128(p_scalars); ++ ++- xmm1 = _mm_shufflelo_epi16(xmm0, 0); ++- xmm2 = _mm_shufflelo_epi16(xmm0, 0x55); ++- xmm3 = _mm_shufflelo_epi16(xmm0, 0xaa); ++- xmm4 = _mm_shufflelo_epi16(xmm0, 0xff); +++ xmm1 = _mm_shufflelo_epi16(xmm0, 0); +++ xmm2 = _mm_shufflelo_epi16(xmm0, 0x55); +++ xmm3 = _mm_shufflelo_epi16(xmm0, 0xaa); +++ xmm4 = _mm_shufflelo_epi16(xmm0, 0xff); ++ ++- xmm1 = _mm_shuffle_epi32(xmm1, 0x00); ++- xmm2 = _mm_shuffle_epi32(xmm2, 0x00); ++- xmm3 = _mm_shuffle_epi32(xmm3, 0x00); ++- xmm4 = _mm_shuffle_epi32(xmm4, 0x00); +++ xmm1 = _mm_shuffle_epi32(xmm1, 0x00); +++ xmm2 = _mm_shuffle_epi32(xmm2, 0x00); +++ xmm3 = _mm_shuffle_epi32(xmm3, 0x00); +++ xmm4 = _mm_shuffle_epi32(xmm4, 0x00); ++ ++ ++- for(; i < bound; ++i) { ++- xmm0 = _mm_setzero_si128(); ++- xmm5 = _mm_setzero_si128(); ++- xmm6 = _mm_setzero_si128(); ++- xmm7 
= _mm_setzero_si128(); +++ for (; i < bound; ++i) { +++ xmm0 = _mm_setzero_si128(); +++ xmm5 = _mm_setzero_si128(); +++ xmm6 = _mm_setzero_si128(); +++ xmm7 = _mm_setzero_si128(); ++ ++- xmm0 = _mm_insert_epi16(xmm0, src0[p_permute_indexes[0]], 0); ++- xmm5 = _mm_insert_epi16(xmm5, src0[p_permute_indexes[1]], 1); ++- xmm6 = _mm_insert_epi16(xmm6, src0[p_permute_indexes[2]], 2); ++- xmm7 = _mm_insert_epi16(xmm7, src0[p_permute_indexes[3]], 3); ++- xmm0 = _mm_insert_epi16(xmm0, src0[p_permute_indexes[4]], 4); ++- xmm5 = _mm_insert_epi16(xmm5, src0[p_permute_indexes[5]], 5); ++- xmm6 = _mm_insert_epi16(xmm6, src0[p_permute_indexes[6]], 6); ++- xmm7 = _mm_insert_epi16(xmm7, src0[p_permute_indexes[7]], 7); +++ xmm0 = _mm_insert_epi16(xmm0, src0[p_permute_indexes[0]], 0); +++ xmm5 = _mm_insert_epi16(xmm5, src0[p_permute_indexes[1]], 1); +++ xmm6 = _mm_insert_epi16(xmm6, src0[p_permute_indexes[2]], 2); +++ xmm7 = _mm_insert_epi16(xmm7, src0[p_permute_indexes[3]], 3); +++ xmm0 = _mm_insert_epi16(xmm0, src0[p_permute_indexes[4]], 4); +++ xmm5 = _mm_insert_epi16(xmm5, src0[p_permute_indexes[5]], 5); +++ xmm6 = _mm_insert_epi16(xmm6, src0[p_permute_indexes[6]], 6); +++ xmm7 = _mm_insert_epi16(xmm7, src0[p_permute_indexes[7]], 7); ++ ++- xmm0 = _mm_add_epi16(xmm0, xmm5); ++- xmm6 = _mm_add_epi16(xmm6, xmm7); +++ xmm0 = _mm_add_epi16(xmm0, xmm5); +++ xmm6 = _mm_add_epi16(xmm6, xmm7); ++ ++- p_permute_indexes += 8; +++ p_permute_indexes += 8; ++ ++- xmm0 = _mm_add_epi16(xmm0, xmm6); +++ xmm0 = _mm_add_epi16(xmm0, xmm6); ++ ++- xmm5 = _mm_load_si128(p_cntl0); ++- xmm6 = _mm_load_si128(p_cntl1); ++- xmm7 = _mm_load_si128(p_cntl2); +++ xmm5 = _mm_load_si128(p_cntl0); +++ xmm6 = _mm_load_si128(p_cntl1); +++ xmm7 = _mm_load_si128(p_cntl2); ++ ++- xmm5 = _mm_and_si128(xmm5, xmm1); ++- xmm6 = _mm_and_si128(xmm6, xmm2); ++- xmm7 = _mm_and_si128(xmm7, xmm3); +++ xmm5 = _mm_and_si128(xmm5, xmm1); +++ xmm6 = _mm_and_si128(xmm6, xmm2); +++ xmm7 = _mm_and_si128(xmm7, xmm3); ++ ++- xmm0 = _mm_add_epi16(xmm0, xmm5); +++ xmm0 = _mm_add_epi16(xmm0, xmm5); ++ ++- xmm5 = _mm_load_si128(p_cntl3); +++ xmm5 = _mm_load_si128(p_cntl3); ++ ++- xmm6 = _mm_add_epi16(xmm6, xmm7); +++ xmm6 = _mm_add_epi16(xmm6, xmm7); ++ ++- p_cntl0 += 1; +++ p_cntl0 += 1; ++ ++- xmm5 = _mm_and_si128(xmm5, xmm4); +++ xmm5 = _mm_and_si128(xmm5, xmm4); ++ ++- xmm0 = _mm_add_epi16(xmm0, xmm6); +++ xmm0 = _mm_add_epi16(xmm0, xmm6); ++ ++- p_cntl1 += 1; ++- p_cntl2 += 1; +++ p_cntl1 += 1; +++ p_cntl2 += 1; ++ ++- xmm0 = _mm_add_epi16(xmm0, xmm5); +++ xmm0 = _mm_add_epi16(xmm0, xmm5); ++ ++- p_cntl3 += 1; +++ p_cntl3 += 1; ++ ++- _mm_store_si128(p_target, xmm0); +++ _mm_store_si128(p_target, xmm0); ++ ++- p_target += 1; ++- } +++ p_target += 1; +++ } ++ ++- for(i = bound * 8; i < (bound * 8) + leftovers; ++i) { ++- target[i] = src0[permute_indexes[i]] ++- + (cntl0[i] & scalars[0]) ++- + (cntl1[i] & scalars[1]) ++- + (cntl2[i] & scalars[2]) ++- + (cntl3[i] & scalars[3]); ++- } +++ for (i = bound * 8; i < (bound * 8) + leftovers; ++i) { +++ target[i] = src0[permute_indexes[i]] + (cntl0[i] & scalars[0]) + +++ (cntl1[i] & scalars[1]) + (cntl2[i] & scalars[2]) + +++ (cntl3[i] & scalars[3]); +++ } ++ } ++ #endif /*LV_HAVE_SSE*/ ++ ++ ++ #ifdef LV_HAVE_GENERIC ++-static inline void ++-volk_16i_permute_and_scalar_add_generic(short* target, short* src0, short* permute_indexes, ++- short* cntl0, short* cntl1, short* cntl2, short* cntl3, ++- short* scalars, unsigned int num_points) +++static inline void volk_16i_permute_and_scalar_add_generic(short* target, +++ 
short* src0, +++ short* permute_indexes, +++ short* cntl0, +++ short* cntl1, +++ short* cntl2, +++ short* cntl3, +++ short* scalars, +++ unsigned int num_points) ++ { ++- const unsigned int num_bytes = num_points*2; +++ const unsigned int num_bytes = num_points * 2; ++ ++- int i = 0; +++ int i = 0; ++ ++- int bound = num_bytes >> 1; +++ int bound = num_bytes >> 1; ++ ++- for(i = 0; i < bound; ++i) { ++- target[i] = src0[permute_indexes[i]] ++- + (cntl0[i] & scalars[0]) ++- + (cntl1[i] & scalars[1]) ++- + (cntl2[i] & scalars[2]) ++- + (cntl3[i] & scalars[3]); ++- } +++ for (i = 0; i < bound; ++i) { +++ target[i] = src0[permute_indexes[i]] + (cntl0[i] & scalars[0]) + +++ (cntl1[i] & scalars[1]) + (cntl2[i] & scalars[2]) + +++ (cntl3[i] & scalars[3]); +++ } ++ } ++ ++ #endif /*LV_HAVE_GENERIC*/ ++diff --git a/kernels/volk/volk_16i_s32f_convert_32f.h b/kernels/volk/volk_16i_s32f_convert_32f.h ++index 38ea6f5..3fd3a77 100644 ++--- a/kernels/volk/volk_16i_s32f_convert_32f.h +++++ b/kernels/volk/volk_16i_s32f_convert_32f.h ++@@ -29,8 +29,8 @@ ++ * ++ * Dispatcher Prototype ++ * \code ++- * void volk_16i_s32f_convert_32f(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points); ++- * \endcode +++ * void volk_16i_s32f_convert_32f(float* outputVector, const int16_t* inputVector, const +++ * float scalar, unsigned int num_points); \endcode ++ * ++ * \b Inputs ++ * \li inputVector: The input vector of 16-bit shorts. ++@@ -60,238 +60,247 @@ ++ #ifdef LV_HAVE_AVX2 ++ #include ++ ++-static inline void ++-volk_16i_s32f_convert_32f_u_avx2(float* outputVector, const int16_t* inputVector, ++- const float scalar, unsigned int num_points) +++static inline void volk_16i_s32f_convert_32f_u_avx2(float* outputVector, +++ const int16_t* inputVector, +++ const float scalar, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int eighthPoints = num_points / 8; +++ unsigned int number = 0; +++ const unsigned int eighthPoints = num_points / 8; ++ ++- float* outputVectorPtr = outputVector; ++- __m256 invScalar = _mm256_set1_ps(1.0/scalar); ++- int16_t* inputPtr = (int16_t*)inputVector; ++- __m128i inputVal; ++- __m256i inputVal2; ++- __m256 ret; +++ float* outputVectorPtr = outputVector; +++ __m256 invScalar = _mm256_set1_ps(1.0 / scalar); +++ int16_t* inputPtr = (int16_t*)inputVector; +++ __m128i inputVal; +++ __m256i inputVal2; +++ __m256 ret; ++ ++- for(;number < eighthPoints; number++){ +++ for (; number < eighthPoints; number++) { ++ ++- // Load the 8 values ++- inputVal = _mm_loadu_si128((__m128i*)inputPtr); +++ // Load the 8 values +++ inputVal = _mm_loadu_si128((__m128i*)inputPtr); ++ ++- // Convert ++- inputVal2 = _mm256_cvtepi16_epi32(inputVal); +++ // Convert +++ inputVal2 = _mm256_cvtepi16_epi32(inputVal); ++ ++- ret = _mm256_cvtepi32_ps(inputVal2); ++- ret = _mm256_mul_ps(ret, invScalar); +++ ret = _mm256_cvtepi32_ps(inputVal2); +++ ret = _mm256_mul_ps(ret, invScalar); ++ ++- _mm256_storeu_ps(outputVectorPtr, ret); +++ _mm256_storeu_ps(outputVectorPtr, ret); ++ ++- outputVectorPtr += 8; +++ outputVectorPtr += 8; ++ ++- inputPtr += 8; ++- } +++ inputPtr += 8; +++ } ++ ++- number = eighthPoints * 8; ++- for(; number < num_points; number++){ ++- outputVector[number] =((float)(inputVector[number])) / scalar; ++- } +++ number = eighthPoints * 8; +++ for (; number < num_points; number++) { +++ outputVector[number] = ((float)(inputVector[number])) / scalar; +++ } ++ } ++ #endif /* LV_HAVE_AVX2 */ ++ ++ #ifdef LV_HAVE_AVX ++ #include ++ ++-static 
inline void ++-volk_16i_s32f_convert_32f_u_avx(float* outputVector, const int16_t* inputVector, ++- const float scalar, unsigned int num_points) +++static inline void volk_16i_s32f_convert_32f_u_avx(float* outputVector, +++ const int16_t* inputVector, +++ const float scalar, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int eighthPoints = num_points / 8; +++ unsigned int number = 0; +++ const unsigned int eighthPoints = num_points / 8; ++ ++- float* outputVectorPtr = outputVector; ++- __m128 invScalar = _mm_set_ps1(1.0/scalar); ++- int16_t* inputPtr = (int16_t*)inputVector; ++- __m128i inputVal, inputVal2; ++- __m128 ret; ++- __m256 output; ++- __m256 dummy = _mm256_setzero_ps(); +++ float* outputVectorPtr = outputVector; +++ __m128 invScalar = _mm_set_ps1(1.0 / scalar); +++ int16_t* inputPtr = (int16_t*)inputVector; +++ __m128i inputVal, inputVal2; +++ __m128 ret; +++ __m256 output; +++ __m256 dummy = _mm256_setzero_ps(); ++ ++- for(;number < eighthPoints; number++){ +++ for (; number < eighthPoints; number++) { ++ ++- // Load the 8 values ++- //inputVal = _mm_loadu_si128((__m128i*)inputPtr); ++- inputVal = _mm_loadu_si128((__m128i*)inputPtr); +++ // Load the 8 values +++ // inputVal = _mm_loadu_si128((__m128i*)inputPtr); +++ inputVal = _mm_loadu_si128((__m128i*)inputPtr); ++ ++- // Shift the input data to the right by 64 bits ( 8 bytes ) ++- inputVal2 = _mm_srli_si128(inputVal, 8); +++ // Shift the input data to the right by 64 bits ( 8 bytes ) +++ inputVal2 = _mm_srli_si128(inputVal, 8); ++ ++- // Convert the lower 4 values into 32 bit words ++- inputVal = _mm_cvtepi16_epi32(inputVal); ++- inputVal2 = _mm_cvtepi16_epi32(inputVal2); +++ // Convert the lower 4 values into 32 bit words +++ inputVal = _mm_cvtepi16_epi32(inputVal); +++ inputVal2 = _mm_cvtepi16_epi32(inputVal2); ++ ++- ret = _mm_cvtepi32_ps(inputVal); ++- ret = _mm_mul_ps(ret, invScalar); ++- output = _mm256_insertf128_ps(dummy, ret, 0); +++ ret = _mm_cvtepi32_ps(inputVal); +++ ret = _mm_mul_ps(ret, invScalar); +++ output = _mm256_insertf128_ps(dummy, ret, 0); ++ ++- ret = _mm_cvtepi32_ps(inputVal2); ++- ret = _mm_mul_ps(ret, invScalar); ++- output = _mm256_insertf128_ps(output, ret, 1); +++ ret = _mm_cvtepi32_ps(inputVal2); +++ ret = _mm_mul_ps(ret, invScalar); +++ output = _mm256_insertf128_ps(output, ret, 1); ++ ++- _mm256_storeu_ps(outputVectorPtr, output); +++ _mm256_storeu_ps(outputVectorPtr, output); ++ ++- outputVectorPtr += 8; +++ outputVectorPtr += 8; ++ ++- inputPtr += 8; ++- } +++ inputPtr += 8; +++ } ++ ++- number = eighthPoints * 8; ++- for(; number < num_points; number++){ ++- outputVector[number] =((float)(inputVector[number])) / scalar; ++- } +++ number = eighthPoints * 8; +++ for (; number < num_points; number++) { +++ outputVector[number] = ((float)(inputVector[number])) / scalar; +++ } ++ } ++ #endif /* LV_HAVE_AVX */ ++ ++ #ifdef LV_HAVE_SSE4_1 ++ #include ++ ++-static inline void ++-volk_16i_s32f_convert_32f_u_sse4_1(float* outputVector, const int16_t* inputVector, ++- const float scalar, unsigned int num_points) +++static inline void volk_16i_s32f_convert_32f_u_sse4_1(float* outputVector, +++ const int16_t* inputVector, +++ const float scalar, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int eighthPoints = num_points / 8; +++ unsigned int number = 0; +++ const unsigned int eighthPoints = num_points / 8; ++ ++- float* outputVectorPtr = outputVector; ++- __m128 invScalar = _mm_set_ps1(1.0/scalar); ++- int16_t* inputPtr = 
(int16_t*)inputVector; ++- __m128i inputVal; ++- __m128i inputVal2; ++- __m128 ret; +++ float* outputVectorPtr = outputVector; +++ __m128 invScalar = _mm_set_ps1(1.0 / scalar); +++ int16_t* inputPtr = (int16_t*)inputVector; +++ __m128i inputVal; +++ __m128i inputVal2; +++ __m128 ret; ++ ++- for(;number < eighthPoints; number++){ +++ for (; number < eighthPoints; number++) { ++ ++- // Load the 8 values ++- inputVal = _mm_loadu_si128((__m128i*)inputPtr); +++ // Load the 8 values +++ inputVal = _mm_loadu_si128((__m128i*)inputPtr); ++ ++- // Shift the input data to the right by 64 bits ( 8 bytes ) ++- inputVal2 = _mm_srli_si128(inputVal, 8); +++ // Shift the input data to the right by 64 bits ( 8 bytes ) +++ inputVal2 = _mm_srli_si128(inputVal, 8); ++ ++- // Convert the lower 4 values into 32 bit words ++- inputVal = _mm_cvtepi16_epi32(inputVal); ++- inputVal2 = _mm_cvtepi16_epi32(inputVal2); +++ // Convert the lower 4 values into 32 bit words +++ inputVal = _mm_cvtepi16_epi32(inputVal); +++ inputVal2 = _mm_cvtepi16_epi32(inputVal2); ++ ++- ret = _mm_cvtepi32_ps(inputVal); ++- ret = _mm_mul_ps(ret, invScalar); ++- _mm_storeu_ps(outputVectorPtr, ret); ++- outputVectorPtr += 4; +++ ret = _mm_cvtepi32_ps(inputVal); +++ ret = _mm_mul_ps(ret, invScalar); +++ _mm_storeu_ps(outputVectorPtr, ret); +++ outputVectorPtr += 4; ++ ++- ret = _mm_cvtepi32_ps(inputVal2); ++- ret = _mm_mul_ps(ret, invScalar); ++- _mm_storeu_ps(outputVectorPtr, ret); +++ ret = _mm_cvtepi32_ps(inputVal2); +++ ret = _mm_mul_ps(ret, invScalar); +++ _mm_storeu_ps(outputVectorPtr, ret); ++ ++- outputVectorPtr += 4; +++ outputVectorPtr += 4; ++ ++- inputPtr += 8; ++- } +++ inputPtr += 8; +++ } ++ ++- number = eighthPoints * 8; ++- for(; number < num_points; number++){ ++- outputVector[number] =((float)(inputVector[number])) / scalar; ++- } +++ number = eighthPoints * 8; +++ for (; number < num_points; number++) { +++ outputVector[number] = ((float)(inputVector[number])) / scalar; +++ } ++ } ++ #endif /* LV_HAVE_SSE4_1 */ ++ ++ #ifdef LV_HAVE_SSE ++ #include ++ ++-static inline void ++-volk_16i_s32f_convert_32f_u_sse(float* outputVector, const int16_t* inputVector, ++- const float scalar, unsigned int num_points) +++static inline void volk_16i_s32f_convert_32f_u_sse(float* outputVector, +++ const int16_t* inputVector, +++ const float scalar, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int quarterPoints = num_points / 4; ++- ++- float* outputVectorPtr = outputVector; ++- __m128 invScalar = _mm_set_ps1(1.0/scalar); ++- int16_t* inputPtr = (int16_t*)inputVector; ++- __m128 ret; ++- ++- for(;number < quarterPoints; number++){ ++- ret = _mm_set_ps((float)(inputPtr[3]), (float)(inputPtr[2]), (float)(inputPtr[1]), (float)(inputPtr[0])); ++- ++- ret = _mm_mul_ps(ret, invScalar); ++- _mm_storeu_ps(outputVectorPtr, ret); ++- ++- inputPtr += 4; ++- outputVectorPtr += 4; ++- } ++- ++- number = quarterPoints * 4; ++- for(; number < num_points; number++){ ++- outputVector[number] = (float)(inputVector[number]) / scalar; ++- } +++ unsigned int number = 0; +++ const unsigned int quarterPoints = num_points / 4; +++ +++ float* outputVectorPtr = outputVector; +++ __m128 invScalar = _mm_set_ps1(1.0 / scalar); +++ int16_t* inputPtr = (int16_t*)inputVector; +++ __m128 ret; +++ +++ for (; number < quarterPoints; number++) { +++ ret = _mm_set_ps((float)(inputPtr[3]), +++ (float)(inputPtr[2]), +++ (float)(inputPtr[1]), +++ (float)(inputPtr[0])); +++ +++ ret = _mm_mul_ps(ret, invScalar); +++ _mm_storeu_ps(outputVectorPtr, 
ret); +++ +++ inputPtr += 4; +++ outputVectorPtr += 4; +++ } +++ +++ number = quarterPoints * 4; +++ for (; number < num_points; number++) { +++ outputVector[number] = (float)(inputVector[number]) / scalar; +++ } ++ } ++ #endif /* LV_HAVE_SSE */ ++ ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void ++-volk_16i_s32f_convert_32f_generic(float* outputVector, const int16_t* inputVector, ++- const float scalar, unsigned int num_points) +++static inline void volk_16i_s32f_convert_32f_generic(float* outputVector, +++ const int16_t* inputVector, +++ const float scalar, +++ unsigned int num_points) ++ { ++- float* outputVectorPtr = outputVector; ++- const int16_t* inputVectorPtr = inputVector; ++- unsigned int number = 0; +++ float* outputVectorPtr = outputVector; +++ const int16_t* inputVectorPtr = inputVector; +++ unsigned int number = 0; ++ ++- for(number = 0; number < num_points; number++){ ++- *outputVectorPtr++ = ((float)(*inputVectorPtr++)) / scalar; ++- } +++ for (number = 0; number < num_points; number++) { +++ *outputVectorPtr++ = ((float)(*inputVectorPtr++)) / scalar; +++ } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++ #ifdef LV_HAVE_NEON ++ #include ++ ++-static inline void ++-volk_16i_s32f_convert_32f_neon(float* outputVector, const int16_t* inputVector, ++- const float scalar, unsigned int num_points) +++static inline void volk_16i_s32f_convert_32f_neon(float* outputVector, +++ const int16_t* inputVector, +++ const float scalar, +++ unsigned int num_points) ++ { ++- float* outputPtr = outputVector; ++- const int16_t* inputPtr = inputVector; ++- unsigned int number = 0; ++- unsigned int eighth_points = num_points / 8; ++- ++- int16x4x2_t input16; ++- int32x4_t input32_0, input32_1; ++- float32x4_t input_float_0, input_float_1; ++- float32x4x2_t output_float; ++- float32x4_t inv_scale; ++- ++- inv_scale = vdupq_n_f32(1.0/scalar); ++- ++- // the generic disassembles to a 128-bit load ++- // and duplicates every instruction to operate on 64-bits ++- // at a time. This is only possible with lanes, which is faster ++- // than just doing a vld1_s16, but still slower. ++- for(number = 0; number < eighth_points; number++){ ++- input16 = vld2_s16(inputPtr); ++- // widen 16-bit int to 32-bit int ++- input32_0 = vmovl_s16(input16.val[0]); ++- input32_1 = vmovl_s16(input16.val[1]); ++- // convert 32-bit int to float with scale ++- input_float_0 = vcvtq_f32_s32(input32_0); ++- input_float_1 = vcvtq_f32_s32(input32_1); ++- output_float.val[0] = vmulq_f32(input_float_0, inv_scale); ++- output_float.val[1] = vmulq_f32(input_float_1, inv_scale); ++- vst2q_f32(outputPtr, output_float); ++- inputPtr += 8; ++- outputPtr += 8; ++- } ++- ++- for(number = eighth_points*8; number < num_points; number++){ ++- *outputPtr++ = ((float)(*inputPtr++)) / scalar; ++- } +++ float* outputPtr = outputVector; +++ const int16_t* inputPtr = inputVector; +++ unsigned int number = 0; +++ unsigned int eighth_points = num_points / 8; +++ +++ int16x4x2_t input16; +++ int32x4_t input32_0, input32_1; +++ float32x4_t input_float_0, input_float_1; +++ float32x4x2_t output_float; +++ float32x4_t inv_scale; +++ +++ inv_scale = vdupq_n_f32(1.0 / scalar); +++ +++ // the generic disassembles to a 128-bit load +++ // and duplicates every instruction to operate on 64-bits +++ // at a time. This is only possible with lanes, which is faster +++ // than just doing a vld1_s16, but still slower. 
+++ for (number = 0; number < eighth_points; number++) { +++ input16 = vld2_s16(inputPtr); +++ // widen 16-bit int to 32-bit int +++ input32_0 = vmovl_s16(input16.val[0]); +++ input32_1 = vmovl_s16(input16.val[1]); +++ // convert 32-bit int to float with scale +++ input_float_0 = vcvtq_f32_s32(input32_0); +++ input_float_1 = vcvtq_f32_s32(input32_1); +++ output_float.val[0] = vmulq_f32(input_float_0, inv_scale); +++ output_float.val[1] = vmulq_f32(input_float_1, inv_scale); +++ vst2q_f32(outputPtr, output_float); +++ inputPtr += 8; +++ outputPtr += 8; +++ } +++ +++ for (number = eighth_points * 8; number < num_points; number++) { +++ *outputPtr++ = ((float)(*inputPtr++)) / scalar; +++ } ++ } ++ #endif /* LV_HAVE_NEON */ ++ ++@@ -306,193 +315,201 @@ volk_16i_s32f_convert_32f_neon(float* outputVector, const int16_t* inputVector, ++ #ifdef LV_HAVE_AVX2 ++ #include ++ ++-static inline void ++-volk_16i_s32f_convert_32f_a_avx2(float* outputVector, const int16_t* inputVector, ++- const float scalar, unsigned int num_points) +++static inline void volk_16i_s32f_convert_32f_a_avx2(float* outputVector, +++ const int16_t* inputVector, +++ const float scalar, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int eighthPoints = num_points / 8; +++ unsigned int number = 0; +++ const unsigned int eighthPoints = num_points / 8; ++ ++- float* outputVectorPtr = outputVector; ++- __m256 invScalar = _mm256_set1_ps(1.0/scalar); ++- int16_t* inputPtr = (int16_t*)inputVector; ++- __m128i inputVal; ++- __m256i inputVal2; ++- __m256 ret; +++ float* outputVectorPtr = outputVector; +++ __m256 invScalar = _mm256_set1_ps(1.0 / scalar); +++ int16_t* inputPtr = (int16_t*)inputVector; +++ __m128i inputVal; +++ __m256i inputVal2; +++ __m256 ret; ++ ++- for(;number < eighthPoints; number++){ +++ for (; number < eighthPoints; number++) { ++ ++- // Load the 8 values ++- inputVal = _mm_load_si128((__m128i*)inputPtr); +++ // Load the 8 values +++ inputVal = _mm_load_si128((__m128i*)inputPtr); ++ ++- // Convert ++- inputVal2 = _mm256_cvtepi16_epi32(inputVal); +++ // Convert +++ inputVal2 = _mm256_cvtepi16_epi32(inputVal); ++ ++- ret = _mm256_cvtepi32_ps(inputVal2); ++- ret = _mm256_mul_ps(ret, invScalar); +++ ret = _mm256_cvtepi32_ps(inputVal2); +++ ret = _mm256_mul_ps(ret, invScalar); ++ ++- _mm256_store_ps(outputVectorPtr, ret); +++ _mm256_store_ps(outputVectorPtr, ret); ++ ++- outputVectorPtr += 8; +++ outputVectorPtr += 8; ++ ++- inputPtr += 8; ++- } +++ inputPtr += 8; +++ } ++ ++- number = eighthPoints * 8; ++- for(; number < num_points; number++){ ++- outputVector[number] =((float)(inputVector[number])) / scalar; ++- } +++ number = eighthPoints * 8; +++ for (; number < num_points; number++) { +++ outputVector[number] = ((float)(inputVector[number])) / scalar; +++ } ++ } ++ #endif /* LV_HAVE_AVX2 */ ++ ++ #ifdef LV_HAVE_AVX ++ #include ++ ++-static inline void ++-volk_16i_s32f_convert_32f_a_avx(float* outputVector, const int16_t* inputVector, ++- const float scalar, unsigned int num_points) +++static inline void volk_16i_s32f_convert_32f_a_avx(float* outputVector, +++ const int16_t* inputVector, +++ const float scalar, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int eighthPoints = num_points / 8; +++ unsigned int number = 0; +++ const unsigned int eighthPoints = num_points / 8; ++ ++- float* outputVectorPtr = outputVector; ++- __m128 invScalar = _mm_set_ps1(1.0/scalar); ++- int16_t* inputPtr = (int16_t*)inputVector; ++- __m128i inputVal, inputVal2; ++- 
__m128 ret; ++- __m256 output; ++- __m256 dummy = _mm256_setzero_ps(); +++ float* outputVectorPtr = outputVector; +++ __m128 invScalar = _mm_set_ps1(1.0 / scalar); +++ int16_t* inputPtr = (int16_t*)inputVector; +++ __m128i inputVal, inputVal2; +++ __m128 ret; +++ __m256 output; +++ __m256 dummy = _mm256_setzero_ps(); ++ ++- for(;number < eighthPoints; number++){ +++ for (; number < eighthPoints; number++) { ++ ++- // Load the 8 values ++- //inputVal = _mm_loadu_si128((__m128i*)inputPtr); ++- inputVal = _mm_load_si128((__m128i*)inputPtr); +++ // Load the 8 values +++ // inputVal = _mm_loadu_si128((__m128i*)inputPtr); +++ inputVal = _mm_load_si128((__m128i*)inputPtr); ++ ++- // Shift the input data to the right by 64 bits ( 8 bytes ) ++- inputVal2 = _mm_srli_si128(inputVal, 8); +++ // Shift the input data to the right by 64 bits ( 8 bytes ) +++ inputVal2 = _mm_srli_si128(inputVal, 8); ++ ++- // Convert the lower 4 values into 32 bit words ++- inputVal = _mm_cvtepi16_epi32(inputVal); ++- inputVal2 = _mm_cvtepi16_epi32(inputVal2); +++ // Convert the lower 4 values into 32 bit words +++ inputVal = _mm_cvtepi16_epi32(inputVal); +++ inputVal2 = _mm_cvtepi16_epi32(inputVal2); ++ ++- ret = _mm_cvtepi32_ps(inputVal); ++- ret = _mm_mul_ps(ret, invScalar); ++- output = _mm256_insertf128_ps(dummy, ret, 0); +++ ret = _mm_cvtepi32_ps(inputVal); +++ ret = _mm_mul_ps(ret, invScalar); +++ output = _mm256_insertf128_ps(dummy, ret, 0); ++ ++- ret = _mm_cvtepi32_ps(inputVal2); ++- ret = _mm_mul_ps(ret, invScalar); ++- output = _mm256_insertf128_ps(output, ret, 1); +++ ret = _mm_cvtepi32_ps(inputVal2); +++ ret = _mm_mul_ps(ret, invScalar); +++ output = _mm256_insertf128_ps(output, ret, 1); ++ ++- _mm256_store_ps(outputVectorPtr, output); +++ _mm256_store_ps(outputVectorPtr, output); ++ ++- outputVectorPtr += 8; +++ outputVectorPtr += 8; ++ ++- inputPtr += 8; ++- } +++ inputPtr += 8; +++ } ++ ++- number = eighthPoints * 8; ++- for(; number < num_points; number++){ ++- outputVector[number] =((float)(inputVector[number])) / scalar; ++- } +++ number = eighthPoints * 8; +++ for (; number < num_points; number++) { +++ outputVector[number] = ((float)(inputVector[number])) / scalar; +++ } ++ } ++ #endif /* LV_HAVE_AVX */ ++ ++ #ifdef LV_HAVE_SSE4_1 ++ #include ++ ++-static inline void ++-volk_16i_s32f_convert_32f_a_sse4_1(float* outputVector, const int16_t* inputVector, ++- const float scalar, unsigned int num_points) +++static inline void volk_16i_s32f_convert_32f_a_sse4_1(float* outputVector, +++ const int16_t* inputVector, +++ const float scalar, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int eighthPoints = num_points / 8; +++ unsigned int number = 0; +++ const unsigned int eighthPoints = num_points / 8; ++ ++- float* outputVectorPtr = outputVector; ++- __m128 invScalar = _mm_set_ps1(1.0/scalar); ++- int16_t* inputPtr = (int16_t*)inputVector; ++- __m128i inputVal; ++- __m128i inputVal2; ++- __m128 ret; +++ float* outputVectorPtr = outputVector; +++ __m128 invScalar = _mm_set_ps1(1.0 / scalar); +++ int16_t* inputPtr = (int16_t*)inputVector; +++ __m128i inputVal; +++ __m128i inputVal2; +++ __m128 ret; ++ ++- for(;number < eighthPoints; number++){ +++ for (; number < eighthPoints; number++) { ++ ++- // Load the 8 values ++- inputVal = _mm_loadu_si128((__m128i*)inputPtr); +++ // Load the 8 values +++ inputVal = _mm_loadu_si128((__m128i*)inputPtr); ++ ++- // Shift the input data to the right by 64 bits ( 8 bytes ) ++- inputVal2 = _mm_srli_si128(inputVal, 8); +++ // Shift the input 
data to the right by 64 bits ( 8 bytes ) +++ inputVal2 = _mm_srli_si128(inputVal, 8); ++ ++- // Convert the lower 4 values into 32 bit words ++- inputVal = _mm_cvtepi16_epi32(inputVal); ++- inputVal2 = _mm_cvtepi16_epi32(inputVal2); +++ // Convert the lower 4 values into 32 bit words +++ inputVal = _mm_cvtepi16_epi32(inputVal); +++ inputVal2 = _mm_cvtepi16_epi32(inputVal2); ++ ++- ret = _mm_cvtepi32_ps(inputVal); ++- ret = _mm_mul_ps(ret, invScalar); ++- _mm_storeu_ps(outputVectorPtr, ret); ++- outputVectorPtr += 4; +++ ret = _mm_cvtepi32_ps(inputVal); +++ ret = _mm_mul_ps(ret, invScalar); +++ _mm_storeu_ps(outputVectorPtr, ret); +++ outputVectorPtr += 4; ++ ++- ret = _mm_cvtepi32_ps(inputVal2); ++- ret = _mm_mul_ps(ret, invScalar); ++- _mm_storeu_ps(outputVectorPtr, ret); +++ ret = _mm_cvtepi32_ps(inputVal2); +++ ret = _mm_mul_ps(ret, invScalar); +++ _mm_storeu_ps(outputVectorPtr, ret); ++ ++- outputVectorPtr += 4; +++ outputVectorPtr += 4; ++ ++- inputPtr += 8; ++- } +++ inputPtr += 8; +++ } ++ ++- number = eighthPoints * 8; ++- for(; number < num_points; number++){ ++- outputVector[number] =((float)(inputVector[number])) / scalar; ++- } +++ number = eighthPoints * 8; +++ for (; number < num_points; number++) { +++ outputVector[number] = ((float)(inputVector[number])) / scalar; +++ } ++ } ++ #endif /* LV_HAVE_SSE4_1 */ ++ ++ #ifdef LV_HAVE_SSE ++ #include ++ ++-static inline void ++-volk_16i_s32f_convert_32f_a_sse(float* outputVector, const int16_t* inputVector, ++- const float scalar, unsigned int num_points) +++static inline void volk_16i_s32f_convert_32f_a_sse(float* outputVector, +++ const int16_t* inputVector, +++ const float scalar, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int quarterPoints = num_points / 4; ++- ++- float* outputVectorPtr = outputVector; ++- __m128 invScalar = _mm_set_ps1(1.0/scalar); ++- int16_t* inputPtr = (int16_t*)inputVector; ++- __m128 ret; ++- ++- for(;number < quarterPoints; number++){ ++- ret = _mm_set_ps((float)(inputPtr[3]), (float)(inputPtr[2]), (float)(inputPtr[1]), (float)(inputPtr[0])); ++- ++- ret = _mm_mul_ps(ret, invScalar); ++- _mm_storeu_ps(outputVectorPtr, ret); ++- ++- inputPtr += 4; ++- outputVectorPtr += 4; ++- } ++- ++- number = quarterPoints * 4; ++- for(; number < num_points; number++){ ++- outputVector[number] = (float)(inputVector[number]) / scalar; ++- } +++ unsigned int number = 0; +++ const unsigned int quarterPoints = num_points / 4; +++ +++ float* outputVectorPtr = outputVector; +++ __m128 invScalar = _mm_set_ps1(1.0 / scalar); +++ int16_t* inputPtr = (int16_t*)inputVector; +++ __m128 ret; +++ +++ for (; number < quarterPoints; number++) { +++ ret = _mm_set_ps((float)(inputPtr[3]), +++ (float)(inputPtr[2]), +++ (float)(inputPtr[1]), +++ (float)(inputPtr[0])); +++ +++ ret = _mm_mul_ps(ret, invScalar); +++ _mm_storeu_ps(outputVectorPtr, ret); +++ +++ inputPtr += 4; +++ outputVectorPtr += 4; +++ } +++ +++ number = quarterPoints * 4; +++ for (; number < num_points; number++) { +++ outputVector[number] = (float)(inputVector[number]) / scalar; +++ } ++ } ++ #endif /* LV_HAVE_SSE */ ++ ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void ++-volk_16i_s32f_convert_32f_a_generic(float* outputVector, const int16_t* inputVector, ++- const float scalar, unsigned int num_points) +++static inline void volk_16i_s32f_convert_32f_a_generic(float* outputVector, +++ const int16_t* inputVector, +++ const float scalar, +++ unsigned int num_points) ++ { ++- float* outputVectorPtr = outputVector; ++- const int16_t* 
inputVectorPtr = inputVector; ++- unsigned int number = 0; +++ float* outputVectorPtr = outputVector; +++ const int16_t* inputVectorPtr = inputVector; +++ unsigned int number = 0; ++ ++- for(number = 0; number < num_points; number++){ ++- *outputVectorPtr++ = ((float)(*inputVectorPtr++)) / scalar; ++- } +++ for (number = 0; number < num_points; number++) { +++ *outputVectorPtr++ = ((float)(*inputVectorPtr++)) / scalar; +++ } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++diff --git a/kernels/volk/volk_16i_x4_quad_max_star_16i.h b/kernels/volk/volk_16i_x4_quad_max_star_16i.h ++index 6aa74c7..619cc90 100644 ++--- a/kernels/volk/volk_16i_x4_quad_max_star_16i.h +++++ b/kernels/volk/volk_16i_x4_quad_max_star_16i.h ++@@ -29,8 +29,8 @@ ++ * ++ * Dispatcher Prototype ++ * \code ++- * void volk_16i_x4_quad_max_star_16i(short* target, short* src0, short* src1, short* src2, short* src3, unsigned int num_points) ++- * \endcode +++ * void volk_16i_x4_quad_max_star_16i(short* target, short* src0, short* src1, short* +++ * src2, short* src3, unsigned int num_points) \endcode ++ * ++ * \b Inputs ++ * \li src0: The input vector 0. ++@@ -55,149 +55,152 @@ ++ #ifndef INCLUDED_volk_16i_x4_quad_max_star_16i_a_H ++ #define INCLUDED_volk_16i_x4_quad_max_star_16i_a_H ++ ++-#include ++-#include +++#include +++#include ++ ++ #ifdef LV_HAVE_SSE2 ++ ++-#include +++#include ++ ++-static inline void ++-volk_16i_x4_quad_max_star_16i_a_sse2(short* target, short* src0, short* src1, ++- short* src2, short* src3, unsigned int num_points) +++static inline void volk_16i_x4_quad_max_star_16i_a_sse2(short* target, +++ short* src0, +++ short* src1, +++ short* src2, +++ short* src3, +++ unsigned int num_points) ++ { ++- const unsigned int num_bytes = num_points*2; ++- ++- int i = 0; +++ const unsigned int num_bytes = num_points * 2; ++ ++- int bound = (num_bytes >> 4); ++- int bound_copy = bound; ++- int leftovers = (num_bytes >> 1) & 7; +++ int i = 0; ++ ++- __m128i *p_target, *p_src0, *p_src1, *p_src2, *p_src3; ++- p_target = (__m128i*) target; ++- p_src0 = (__m128i*)src0; ++- p_src1 = (__m128i*)src1; ++- p_src2 = (__m128i*)src2; ++- p_src3 = (__m128i*)src3; +++ int bound = (num_bytes >> 4); +++ int bound_copy = bound; +++ int leftovers = (num_bytes >> 1) & 7; ++ ++- __m128i xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8; +++ __m128i *p_target, *p_src0, *p_src1, *p_src2, *p_src3; +++ p_target = (__m128i*)target; +++ p_src0 = (__m128i*)src0; +++ p_src1 = (__m128i*)src1; +++ p_src2 = (__m128i*)src2; +++ p_src3 = (__m128i*)src3; ++ ++- while(bound_copy > 0) { ++- xmm1 = _mm_load_si128(p_src0); ++- xmm2 = _mm_load_si128(p_src1); ++- xmm3 = _mm_load_si128(p_src2); ++- xmm4 = _mm_load_si128(p_src3); +++ __m128i xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8; ++ ++- xmm5 = _mm_setzero_si128(); ++- xmm6 = _mm_setzero_si128(); ++- xmm7 = xmm1; ++- xmm8 = xmm3; +++ while (bound_copy > 0) { +++ xmm1 = _mm_load_si128(p_src0); +++ xmm2 = _mm_load_si128(p_src1); +++ xmm3 = _mm_load_si128(p_src2); +++ xmm4 = _mm_load_si128(p_src3); ++ ++- xmm1 = _mm_sub_epi16(xmm2, xmm1); +++ xmm5 = _mm_setzero_si128(); +++ xmm6 = _mm_setzero_si128(); +++ xmm7 = xmm1; +++ xmm8 = xmm3; ++ ++- xmm3 = _mm_sub_epi16(xmm4, xmm3); +++ xmm1 = _mm_sub_epi16(xmm2, xmm1); ++ ++- xmm5 = _mm_cmpgt_epi16(xmm1, xmm5); ++- xmm6 = _mm_cmpgt_epi16(xmm3, xmm6); +++ xmm3 = _mm_sub_epi16(xmm4, xmm3); ++ ++- xmm2 = _mm_and_si128(xmm5, xmm2); ++- xmm4 = _mm_and_si128(xmm6, xmm4); ++- xmm5 = _mm_andnot_si128(xmm5, xmm7); ++- xmm6 = _mm_andnot_si128(xmm6, xmm8); +++ xmm5 = 
_mm_cmpgt_epi16(xmm1, xmm5); +++ xmm6 = _mm_cmpgt_epi16(xmm3, xmm6); ++ ++- xmm5 = _mm_add_epi16(xmm2, xmm5); ++- xmm6 = _mm_add_epi16(xmm4, xmm6); +++ xmm2 = _mm_and_si128(xmm5, xmm2); +++ xmm4 = _mm_and_si128(xmm6, xmm4); +++ xmm5 = _mm_andnot_si128(xmm5, xmm7); +++ xmm6 = _mm_andnot_si128(xmm6, xmm8); ++ ++- xmm1 = _mm_xor_si128(xmm1, xmm1); ++- xmm2 = xmm5; ++- xmm5 = _mm_sub_epi16(xmm6, xmm5); ++- p_src0 += 1; ++- bound_copy -= 1; +++ xmm5 = _mm_add_epi16(xmm2, xmm5); +++ xmm6 = _mm_add_epi16(xmm4, xmm6); ++ ++- xmm1 = _mm_cmpgt_epi16(xmm5, xmm1); ++- p_src1 += 1; +++ xmm1 = _mm_xor_si128(xmm1, xmm1); +++ xmm2 = xmm5; +++ xmm5 = _mm_sub_epi16(xmm6, xmm5); +++ p_src0 += 1; +++ bound_copy -= 1; ++ ++- xmm6 = _mm_and_si128(xmm1, xmm6); +++ xmm1 = _mm_cmpgt_epi16(xmm5, xmm1); +++ p_src1 += 1; ++ ++- xmm1 = _mm_andnot_si128(xmm1, xmm2); ++- p_src2 += 1; +++ xmm6 = _mm_and_si128(xmm1, xmm6); ++ ++- xmm1 = _mm_add_epi16(xmm6, xmm1); ++- p_src3 += 1; +++ xmm1 = _mm_andnot_si128(xmm1, xmm2); +++ p_src2 += 1; ++ ++- _mm_store_si128(p_target, xmm1); ++- p_target += 1; +++ xmm1 = _mm_add_epi16(xmm6, xmm1); +++ p_src3 += 1; ++ ++- } +++ _mm_store_si128(p_target, xmm1); +++ p_target += 1; +++ } ++ ++ ++- /*__VOLK_ASM __VOLK_VOLATILE ++- ( ++- "volk_16i_x4_quad_max_star_16i_a_sse2_L1:\n\t" ++- "cmp $0, %[bound]\n\t" ++- "je volk_16i_x4_quad_max_star_16i_a_sse2_END\n\t" +++ /*__VOLK_ASM __VOLK_VOLATILE +++ ( +++ "volk_16i_x4_quad_max_star_16i_a_sse2_L1:\n\t" +++ "cmp $0, %[bound]\n\t" +++ "je volk_16i_x4_quad_max_star_16i_a_sse2_END\n\t" ++ ++- "movaps (%[src0]), %%xmm1\n\t" ++- "movaps (%[src1]), %%xmm2\n\t" ++- "movaps (%[src2]), %%xmm3\n\t" ++- "movaps (%[src3]), %%xmm4\n\t" +++ "movaps (%[src0]), %%xmm1\n\t" +++ "movaps (%[src1]), %%xmm2\n\t" +++ "movaps (%[src2]), %%xmm3\n\t" +++ "movaps (%[src3]), %%xmm4\n\t" ++ ++- "pxor %%xmm5, %%xmm5\n\t" ++- "pxor %%xmm6, %%xmm6\n\t" ++- "movaps %%xmm1, %%xmm7\n\t" ++- "movaps %%xmm3, %%xmm8\n\t" ++- "psubw %%xmm2, %%xmm1\n\t" ++- "psubw %%xmm4, %%xmm3\n\t" +++ "pxor %%xmm5, %%xmm5\n\t" +++ "pxor %%xmm6, %%xmm6\n\t" +++ "movaps %%xmm1, %%xmm7\n\t" +++ "movaps %%xmm3, %%xmm8\n\t" +++ "psubw %%xmm2, %%xmm1\n\t" +++ "psubw %%xmm4, %%xmm3\n\t" ++ ++- "pcmpgtw %%xmm1, %%xmm5\n\t" ++- "pcmpgtw %%xmm3, %%xmm6\n\t" +++ "pcmpgtw %%xmm1, %%xmm5\n\t" +++ "pcmpgtw %%xmm3, %%xmm6\n\t" ++ ++- "pand %%xmm5, %%xmm2\n\t" ++- "pand %%xmm6, %%xmm4\n\t" ++- "pandn %%xmm7, %%xmm5\n\t" ++- "pandn %%xmm8, %%xmm6\n\t" +++ "pand %%xmm5, %%xmm2\n\t" +++ "pand %%xmm6, %%xmm4\n\t" +++ "pandn %%xmm7, %%xmm5\n\t" +++ "pandn %%xmm8, %%xmm6\n\t" ++ ++- "paddw %%xmm2, %%xmm5\n\t" ++- "paddw %%xmm4, %%xmm6\n\t" +++ "paddw %%xmm2, %%xmm5\n\t" +++ "paddw %%xmm4, %%xmm6\n\t" ++ ++- "pxor %%xmm1, %%xmm1\n\t" ++- "movaps %%xmm5, %%xmm2\n\t" +++ "pxor %%xmm1, %%xmm1\n\t" +++ "movaps %%xmm5, %%xmm2\n\t" ++ ++- "psubw %%xmm6, %%xmm5\n\t" ++- "add $16, %[src0]\n\t" ++- "add $-1, %[bound]\n\t" +++ "psubw %%xmm6, %%xmm5\n\t" +++ "add $16, %[src0]\n\t" +++ "add $-1, %[bound]\n\t" ++ ++- "pcmpgtw %%xmm5, %%xmm1\n\t" ++- "add $16, %[src1]\n\t" +++ "pcmpgtw %%xmm5, %%xmm1\n\t" +++ "add $16, %[src1]\n\t" ++ ++- "pand %%xmm1, %%xmm6\n\t" +++ "pand %%xmm1, %%xmm6\n\t" ++ ++- "pandn %%xmm2, %%xmm1\n\t" ++- "add $16, %[src2]\n\t" +++ "pandn %%xmm2, %%xmm1\n\t" +++ "add $16, %[src2]\n\t" ++ ++- "paddw %%xmm6, %%xmm1\n\t" ++- "add $16, %[src3]\n\t" +++ "paddw %%xmm6, %%xmm1\n\t" +++ "add $16, %[src3]\n\t" ++ ++- "movaps %%xmm1, (%[target])\n\t" ++- "addw $16, %[target]\n\t" ++- "jmp 
volk_16i_x4_quad_max_star_16i_a_sse2_L1\n\t" +++ "movaps %%xmm1, (%[target])\n\t" +++ "addw $16, %[target]\n\t" +++ "jmp volk_16i_x4_quad_max_star_16i_a_sse2_L1\n\t" ++ ++- "volk_16i_x4_quad_max_star_16i_a_sse2_END:\n\t" ++- : ++- :[bound]"r"(bound), [src0]"r"(src0), [src1]"r"(src1), [src2]"r"(src2), [src3]"r"(src3), [target]"r"(target) ++- : ++- ); ++- */ +++ "volk_16i_x4_quad_max_star_16i_a_sse2_END:\n\t" +++ : +++ :[bound]"r"(bound), [src0]"r"(src0), [src1]"r"(src1), [src2]"r"(src2), +++ [src3]"r"(src3), [target]"r"(target) +++ : +++ ); +++ */ ++ ++- short temp0 = 0; ++- short temp1 = 0; ++- for(i = bound * 8; i < (bound * 8) + leftovers; ++i) { ++- temp0 = ((short)(src0[i] - src1[i]) > 0) ? src0[i] : src1[i]; ++- temp1 = ((short)(src2[i] - src3[i])>0) ? src2[i] : src3[i]; ++- target[i] = ((short)(temp0 - temp1)>0) ? temp0 : temp1; ++- } ++- return; +++ short temp0 = 0; +++ short temp1 = 0; +++ for (i = bound * 8; i < (bound * 8) + leftovers; ++i) { +++ temp0 = ((short)(src0[i] - src1[i]) > 0) ? src0[i] : src1[i]; +++ temp1 = ((short)(src2[i] - src3[i]) > 0) ? src2[i] : src3[i]; +++ target[i] = ((short)(temp0 - temp1) > 0) ? temp0 : temp1; +++ } +++ return; ++ } ++ ++ #endif /*LV_HAVE_SSE2*/ ++@@ -206,85 +209,91 @@ volk_16i_x4_quad_max_star_16i_a_sse2(short* target, short* src0, short* src1, ++ ++ #include ++ ++-static inline void ++-volk_16i_x4_quad_max_star_16i_neon(short* target, short* src0, short* src1, ++- short* src2, short* src3, unsigned int num_points) +++static inline void volk_16i_x4_quad_max_star_16i_neon(short* target, +++ short* src0, +++ short* src1, +++ short* src2, +++ short* src3, +++ unsigned int num_points) ++ { ++- const unsigned int eighth_points = num_points / 8; ++- unsigned i; ++- ++- int16x8_t src0_vec, src1_vec, src2_vec, src3_vec; ++- int16x8_t diff12, diff34; ++- int16x8_t comp0, comp1, comp2, comp3; ++- int16x8_t result1_vec, result2_vec; ++- int16x8_t zeros; ++- zeros = vdupq_n_s16(0); ++- for(i=0; i < eighth_points; ++i) { ++- src0_vec = vld1q_s16(src0); ++- src1_vec = vld1q_s16(src1); ++- src2_vec = vld1q_s16(src2); ++- src3_vec = vld1q_s16(src3); ++- diff12 = vsubq_s16(src0_vec, src1_vec); ++- diff34 = vsubq_s16(src2_vec, src3_vec); ++- comp0 = (int16x8_t)vcgeq_s16(diff12, zeros); ++- comp1 = (int16x8_t)vcltq_s16(diff12, zeros); ++- comp2 = (int16x8_t)vcgeq_s16(diff34, zeros); ++- comp3 = (int16x8_t)vcltq_s16(diff34, zeros); ++- comp0 = vandq_s16(src0_vec, comp0); ++- comp1 = vandq_s16(src1_vec, comp1); ++- comp2 = vandq_s16(src2_vec, comp2); ++- comp3 = vandq_s16(src3_vec, comp3); ++- ++- result1_vec = vaddq_s16(comp0, comp1); ++- result2_vec = vaddq_s16(comp2, comp3); ++- ++- diff12 = vsubq_s16(result1_vec, result2_vec); ++- comp0 = (int16x8_t)vcgeq_s16(diff12, zeros); ++- comp1 = (int16x8_t)vcltq_s16(diff12, zeros); ++- comp0 = vandq_s16(result1_vec, comp0); ++- comp1 = vandq_s16(result2_vec, comp1); ++- result1_vec = vaddq_s16(comp0, comp1); ++- vst1q_s16(target, result1_vec); ++- src0 += 8; ++- src1 += 8; ++- src2 += 8; ++- src3 += 8; ++- target += 8; +++ const unsigned int eighth_points = num_points / 8; +++ unsigned i; +++ +++ int16x8_t src0_vec, src1_vec, src2_vec, src3_vec; +++ int16x8_t diff12, diff34; +++ int16x8_t comp0, comp1, comp2, comp3; +++ int16x8_t result1_vec, result2_vec; +++ int16x8_t zeros; +++ zeros = vdupq_n_s16(0); +++ for (i = 0; i < eighth_points; ++i) { +++ src0_vec = vld1q_s16(src0); +++ src1_vec = vld1q_s16(src1); +++ src2_vec = vld1q_s16(src2); +++ src3_vec = vld1q_s16(src3); +++ diff12 = vsubq_s16(src0_vec, src1_vec); 
+++ diff34 = vsubq_s16(src2_vec, src3_vec); +++ comp0 = (int16x8_t)vcgeq_s16(diff12, zeros); +++ comp1 = (int16x8_t)vcltq_s16(diff12, zeros); +++ comp2 = (int16x8_t)vcgeq_s16(diff34, zeros); +++ comp3 = (int16x8_t)vcltq_s16(diff34, zeros); +++ comp0 = vandq_s16(src0_vec, comp0); +++ comp1 = vandq_s16(src1_vec, comp1); +++ comp2 = vandq_s16(src2_vec, comp2); +++ comp3 = vandq_s16(src3_vec, comp3); +++ +++ result1_vec = vaddq_s16(comp0, comp1); +++ result2_vec = vaddq_s16(comp2, comp3); +++ +++ diff12 = vsubq_s16(result1_vec, result2_vec); +++ comp0 = (int16x8_t)vcgeq_s16(diff12, zeros); +++ comp1 = (int16x8_t)vcltq_s16(diff12, zeros); +++ comp0 = vandq_s16(result1_vec, comp0); +++ comp1 = vandq_s16(result2_vec, comp1); +++ result1_vec = vaddq_s16(comp0, comp1); +++ vst1q_s16(target, result1_vec); +++ src0 += 8; +++ src1 += 8; +++ src2 += 8; +++ src3 += 8; +++ target += 8; ++ } ++ ++- short temp0 = 0; ++- short temp1 = 0; ++- for(i=eighth_points*8; i < num_points; ++i) { ++- temp0 = ((short)(*src0 - *src1) > 0) ? *src0 : *src1; ++- temp1 = ((short)(*src2 - *src3) > 0) ? *src2 : *src3; ++- *target++ = ((short)(temp0 - temp1)>0) ? temp0 : temp1; ++- src0++; ++- src1++; ++- src2++; ++- src3++; ++- } +++ short temp0 = 0; +++ short temp1 = 0; +++ for (i = eighth_points * 8; i < num_points; ++i) { +++ temp0 = ((short)(*src0 - *src1) > 0) ? *src0 : *src1; +++ temp1 = ((short)(*src2 - *src3) > 0) ? *src2 : *src3; +++ *target++ = ((short)(temp0 - temp1) > 0) ? temp0 : temp1; +++ src0++; +++ src1++; +++ src2++; +++ src3++; +++ } ++ } ++ #endif /* LV_HAVE_NEON */ ++ ++ ++ #ifdef LV_HAVE_GENERIC ++-static inline void ++-volk_16i_x4_quad_max_star_16i_generic(short* target, short* src0, short* src1, ++- short* src2, short* src3, unsigned int num_points) +++static inline void volk_16i_x4_quad_max_star_16i_generic(short* target, +++ short* src0, +++ short* src1, +++ short* src2, +++ short* src3, +++ unsigned int num_points) ++ { ++- const unsigned int num_bytes = num_points*2; +++ const unsigned int num_bytes = num_points * 2; ++ ++- int i = 0; +++ int i = 0; ++ ++- int bound = num_bytes >> 1; +++ int bound = num_bytes >> 1; ++ ++- short temp0 = 0; ++- short temp1 = 0; ++- for(i = 0; i < bound; ++i) { ++- temp0 = ((short)(src0[i] - src1[i]) > 0) ? src0[i] : src1[i]; ++- temp1 = ((short)(src2[i] - src3[i])>0) ? src2[i] : src3[i]; ++- target[i] = ((short)(temp0 - temp1)>0) ? temp0 : temp1; ++- } +++ short temp0 = 0; +++ short temp1 = 0; +++ for (i = 0; i < bound; ++i) { +++ temp0 = ((short)(src0[i] - src1[i]) > 0) ? src0[i] : src1[i]; +++ temp1 = ((short)(src2[i] - src3[i]) > 0) ? src2[i] : src3[i]; +++ target[i] = ((short)(temp0 - temp1) > 0) ? temp0 : temp1; +++ } ++ } ++ ++ #endif /*LV_HAVE_GENERIC*/ ++diff --git a/kernels/volk/volk_16i_x5_add_quad_16i_x4.h b/kernels/volk/volk_16i_x5_add_quad_16i_x4.h ++index 30417de..f735f11 100644 ++--- a/kernels/volk/volk_16i_x5_add_quad_16i_x4.h +++++ b/kernels/volk/volk_16i_x5_add_quad_16i_x4.h ++@@ -29,8 +29,9 @@ ++ * ++ * Dispatcher Prototype ++ * \code ++- * void volk_16i_x5_add_quad_16i_x4(short* target0, short* target1, short* target2, short* target3, short* src0, short* src1, short* src2, short* src3, short* src4, unsigned int num_points); ++- * \endcode +++ * void volk_16i_x5_add_quad_16i_x4(short* target0, short* target1, short* target2, short* +++ * target3, short* src0, short* src1, short* src2, short* src3, short* src4, unsigned int +++ * num_points); \endcode ++ * ++ * \b Inputs ++ * \li src0: The input vector 0. 
++@@ -59,182 +60,203 @@ ++ #ifndef INCLUDED_volk_16i_x5_add_quad_16i_x4_a_H ++ #define INCLUDED_volk_16i_x5_add_quad_16i_x4_a_H ++ ++-#include ++-#include +++#include +++#include ++ ++ #ifdef LV_HAVE_SSE2 ++-#include ++-#include +++#include +++#include ++ ++-static inline void ++-volk_16i_x5_add_quad_16i_x4_a_sse2(short* target0, short* target1, short* target2, short* target3, ++- short* src0, short* src1, short* src2, short* src3, short* src4, ++- unsigned int num_points) +++static inline void volk_16i_x5_add_quad_16i_x4_a_sse2(short* target0, +++ short* target1, +++ short* target2, +++ short* target3, +++ short* src0, +++ short* src1, +++ short* src2, +++ short* src3, +++ short* src4, +++ unsigned int num_points) ++ { ++- const unsigned int num_bytes = num_points*2; ++- ++- __m128i xmm0, xmm1, xmm2, xmm3, xmm4; ++- __m128i *p_target0, *p_target1, *p_target2, *p_target3, *p_src0, *p_src1, *p_src2, *p_src3, *p_src4; ++- p_target0 = (__m128i*)target0; ++- p_target1 = (__m128i*)target1; ++- p_target2 = (__m128i*)target2; ++- p_target3 = (__m128i*)target3; ++- ++- p_src0 = (__m128i*)src0; ++- p_src1 = (__m128i*)src1; ++- p_src2 = (__m128i*)src2; ++- p_src3 = (__m128i*)src3; ++- p_src4 = (__m128i*)src4; ++- ++- int i = 0; ++- ++- int bound = (num_bytes >> 4); ++- int leftovers = (num_bytes >> 1) & 7; ++- ++- for(; i < bound; ++i) { ++- xmm0 = _mm_load_si128(p_src0); ++- xmm1 = _mm_load_si128(p_src1); ++- xmm2 = _mm_load_si128(p_src2); ++- xmm3 = _mm_load_si128(p_src3); ++- xmm4 = _mm_load_si128(p_src4); ++- ++- p_src0 += 1; ++- p_src1 += 1; ++- ++- xmm1 = _mm_add_epi16(xmm0, xmm1); ++- xmm2 = _mm_add_epi16(xmm0, xmm2); ++- xmm3 = _mm_add_epi16(xmm0, xmm3); ++- xmm4 = _mm_add_epi16(xmm0, xmm4); ++- ++- ++- p_src2 += 1; ++- p_src3 += 1; ++- p_src4 += 1; ++- ++- _mm_store_si128(p_target0, xmm1); ++- _mm_store_si128(p_target1, xmm2); ++- _mm_store_si128(p_target2, xmm3); ++- _mm_store_si128(p_target3, xmm4); ++- ++- p_target0 += 1; ++- p_target1 += 1; ++- p_target2 += 1; ++- p_target3 += 1; ++- } ++- /*__VOLK_ASM __VOLK_VOLATILE ++- ( ++- ".%=volk_16i_x5_add_quad_16i_x4_a_sse2_L1:\n\t" ++- "cmp $0, %[bound]\n\t" ++- "je .%=volk_16i_x5_add_quad_16i_x4_a_sse2_END\n\t" ++- "movaps (%[src0]), %%xmm1\n\t" ++- "movaps (%[src1]), %%xmm2\n\t" ++- "movaps (%[src2]), %%xmm3\n\t" ++- "movaps (%[src3]), %%xmm4\n\t" ++- "movaps (%[src4]), %%xmm5\n\t" ++- "add $16, %[src0]\n\t" ++- "add $16, %[src1]\n\t" ++- "add $16, %[src2]\n\t" ++- "add $16, %[src3]\n\t" ++- "add $16, %[src4]\n\t" ++- "paddw %%xmm1, %%xmm2\n\t" ++- "paddw %%xmm1, %%xmm3\n\t" ++- "paddw %%xmm1, %%xmm4\n\t" ++- "paddw %%xmm1, %%xmm5\n\t" ++- "add $-1, %[bound]\n\t" ++- "movaps %%xmm2, (%[target0])\n\t" ++- "movaps %%xmm3, (%[target1])\n\t" ++- "movaps %%xmm4, (%[target2])\n\t" ++- "movaps %%xmm5, (%[target3])\n\t" ++- "add $16, %[target0]\n\t" ++- "add $16, %[target1]\n\t" ++- "add $16, %[target2]\n\t" ++- "add $16, %[target3]\n\t" ++- "jmp .%=volk_16i_x5_add_quad_16i_x4_a_sse2_L1\n\t" ++- ".%=volk_16i_x5_add_quad_16i_x4_a_sse2_END:\n\t" ++- : ++- :[bound]"r"(bound), [src0]"r"(src0), [src1]"r"(src1), [src2]"r"(src2), [src3]"r"(src3), [src4]"r"(src4), [target0]"r"(target0), [target1]"r"(target1), [target2]"r"(target2), [target3]"r"(target3) ++- :"xmm1", "xmm2", "xmm3", "xmm4", "xmm5" ++- ); ++- */ ++- ++- for(i = bound * 8; i < (bound * 8) + leftovers; ++i) { ++- target0[i] = src0[i] + src1[i]; ++- target1[i] = src0[i] + src2[i]; ++- target2[i] = src0[i] + src3[i]; ++- target3[i] = src0[i] + src4[i]; ++- } +++ const unsigned int num_bytes = 
num_points * 2; +++ +++ __m128i xmm0, xmm1, xmm2, xmm3, xmm4; +++ __m128i *p_target0, *p_target1, *p_target2, *p_target3, *p_src0, *p_src1, *p_src2, +++ *p_src3, *p_src4; +++ p_target0 = (__m128i*)target0; +++ p_target1 = (__m128i*)target1; +++ p_target2 = (__m128i*)target2; +++ p_target3 = (__m128i*)target3; +++ +++ p_src0 = (__m128i*)src0; +++ p_src1 = (__m128i*)src1; +++ p_src2 = (__m128i*)src2; +++ p_src3 = (__m128i*)src3; +++ p_src4 = (__m128i*)src4; +++ +++ int i = 0; +++ +++ int bound = (num_bytes >> 4); +++ int leftovers = (num_bytes >> 1) & 7; +++ +++ for (; i < bound; ++i) { +++ xmm0 = _mm_load_si128(p_src0); +++ xmm1 = _mm_load_si128(p_src1); +++ xmm2 = _mm_load_si128(p_src2); +++ xmm3 = _mm_load_si128(p_src3); +++ xmm4 = _mm_load_si128(p_src4); +++ +++ p_src0 += 1; +++ p_src1 += 1; +++ +++ xmm1 = _mm_add_epi16(xmm0, xmm1); +++ xmm2 = _mm_add_epi16(xmm0, xmm2); +++ xmm3 = _mm_add_epi16(xmm0, xmm3); +++ xmm4 = _mm_add_epi16(xmm0, xmm4); +++ +++ +++ p_src2 += 1; +++ p_src3 += 1; +++ p_src4 += 1; +++ +++ _mm_store_si128(p_target0, xmm1); +++ _mm_store_si128(p_target1, xmm2); +++ _mm_store_si128(p_target2, xmm3); +++ _mm_store_si128(p_target3, xmm4); +++ +++ p_target0 += 1; +++ p_target1 += 1; +++ p_target2 += 1; +++ p_target3 += 1; +++ } +++ /*__VOLK_ASM __VOLK_VOLATILE +++ ( +++ ".%=volk_16i_x5_add_quad_16i_x4_a_sse2_L1:\n\t" +++ "cmp $0, %[bound]\n\t" +++ "je .%=volk_16i_x5_add_quad_16i_x4_a_sse2_END\n\t" +++ "movaps (%[src0]), %%xmm1\n\t" +++ "movaps (%[src1]), %%xmm2\n\t" +++ "movaps (%[src2]), %%xmm3\n\t" +++ "movaps (%[src3]), %%xmm4\n\t" +++ "movaps (%[src4]), %%xmm5\n\t" +++ "add $16, %[src0]\n\t" +++ "add $16, %[src1]\n\t" +++ "add $16, %[src2]\n\t" +++ "add $16, %[src3]\n\t" +++ "add $16, %[src4]\n\t" +++ "paddw %%xmm1, %%xmm2\n\t" +++ "paddw %%xmm1, %%xmm3\n\t" +++ "paddw %%xmm1, %%xmm4\n\t" +++ "paddw %%xmm1, %%xmm5\n\t" +++ "add $-1, %[bound]\n\t" +++ "movaps %%xmm2, (%[target0])\n\t" +++ "movaps %%xmm3, (%[target1])\n\t" +++ "movaps %%xmm4, (%[target2])\n\t" +++ "movaps %%xmm5, (%[target3])\n\t" +++ "add $16, %[target0]\n\t" +++ "add $16, %[target1]\n\t" +++ "add $16, %[target2]\n\t" +++ "add $16, %[target3]\n\t" +++ "jmp .%=volk_16i_x5_add_quad_16i_x4_a_sse2_L1\n\t" +++ ".%=volk_16i_x5_add_quad_16i_x4_a_sse2_END:\n\t" +++ : +++ :[bound]"r"(bound), [src0]"r"(src0), [src1]"r"(src1), [src2]"r"(src2), +++ [src3]"r"(src3), [src4]"r"(src4), [target0]"r"(target0), [target1]"r"(target1), +++ [target2]"r"(target2), [target3]"r"(target3) +++ :"xmm1", "xmm2", "xmm3", "xmm4", "xmm5" +++ ); +++ */ +++ +++ for (i = bound * 8; i < (bound * 8) + leftovers; ++i) { +++ target0[i] = src0[i] + src1[i]; +++ target1[i] = src0[i] + src2[i]; +++ target2[i] = src0[i] + src3[i]; +++ target3[i] = src0[i] + src4[i]; +++ } ++ } ++ #endif /*LV_HAVE_SSE2*/ ++ ++ #ifdef LV_HAVE_NEON ++ #include ++ ++-static inline void ++-volk_16i_x5_add_quad_16i_x4_neon(short* target0, short* target1, short* target2, short* target3, ++- short* src0, short* src1, short* src2, short* src3, short* src4, ++- unsigned int num_points) +++static inline void volk_16i_x5_add_quad_16i_x4_neon(short* target0, +++ short* target1, +++ short* target2, +++ short* target3, +++ short* src0, +++ short* src1, +++ short* src2, +++ short* src3, +++ short* src4, +++ unsigned int num_points) ++ { ++- const unsigned int eighth_points = num_points / 8; ++- unsigned int number = 0; ++- ++- int16x8_t src0_vec, src1_vec, src2_vec, src3_vec, src4_vec; ++- int16x8_t target0_vec, target1_vec, target2_vec, target3_vec; ++- for(number = 0; number 
< eighth_points; ++number) { ++- src0_vec = vld1q_s16(src0); ++- src1_vec = vld1q_s16(src1); ++- src2_vec = vld1q_s16(src2); ++- src3_vec = vld1q_s16(src3); ++- src4_vec = vld1q_s16(src4); ++- ++- target0_vec = vaddq_s16(src0_vec , src1_vec); ++- target1_vec = vaddq_s16(src0_vec , src2_vec); ++- target2_vec = vaddq_s16(src0_vec , src3_vec); ++- target3_vec = vaddq_s16(src0_vec , src4_vec); ++- ++- vst1q_s16(target0, target0_vec); ++- vst1q_s16(target1, target1_vec); ++- vst1q_s16(target2, target2_vec); ++- vst1q_s16(target3, target3_vec); ++- src0 += 8; ++- src1 += 8; ++- src2 += 8; ++- src3 += 8; ++- src4 += 8; ++- target0 += 8; ++- target1 += 8; ++- target2 += 8; ++- target3 += 8; ++- } ++- ++- for(number = eighth_points * 8; number < num_points; ++number) { ++- *target0++ = *src0 + *src1++; ++- *target1++ = *src0 + *src2++; ++- *target2++ = *src0 + *src3++; ++- *target3++ = *src0++ + *src4++; ++- } +++ const unsigned int eighth_points = num_points / 8; +++ unsigned int number = 0; +++ +++ int16x8_t src0_vec, src1_vec, src2_vec, src3_vec, src4_vec; +++ int16x8_t target0_vec, target1_vec, target2_vec, target3_vec; +++ for (number = 0; number < eighth_points; ++number) { +++ src0_vec = vld1q_s16(src0); +++ src1_vec = vld1q_s16(src1); +++ src2_vec = vld1q_s16(src2); +++ src3_vec = vld1q_s16(src3); +++ src4_vec = vld1q_s16(src4); +++ +++ target0_vec = vaddq_s16(src0_vec, src1_vec); +++ target1_vec = vaddq_s16(src0_vec, src2_vec); +++ target2_vec = vaddq_s16(src0_vec, src3_vec); +++ target3_vec = vaddq_s16(src0_vec, src4_vec); +++ +++ vst1q_s16(target0, target0_vec); +++ vst1q_s16(target1, target1_vec); +++ vst1q_s16(target2, target2_vec); +++ vst1q_s16(target3, target3_vec); +++ src0 += 8; +++ src1 += 8; +++ src2 += 8; +++ src3 += 8; +++ src4 += 8; +++ target0 += 8; +++ target1 += 8; +++ target2 += 8; +++ target3 += 8; +++ } +++ +++ for (number = eighth_points * 8; number < num_points; ++number) { +++ *target0++ = *src0 + *src1++; +++ *target1++ = *src0 + *src2++; +++ *target2++ = *src0 + *src3++; +++ *target3++ = *src0++ + *src4++; +++ } ++ } ++ ++ #endif /* LV_HAVE_NEON */ ++ ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void ++-volk_16i_x5_add_quad_16i_x4_generic(short* target0, short* target1, short* target2, short* target3, ++- short* src0, short* src1, short* src2, short* src3, short* src4, ++- unsigned int num_points) +++static inline void volk_16i_x5_add_quad_16i_x4_generic(short* target0, +++ short* target1, +++ short* target2, +++ short* target3, +++ short* src0, +++ short* src1, +++ short* src2, +++ short* src3, +++ short* src4, +++ unsigned int num_points) ++ { ++- const unsigned int num_bytes = num_points*2; +++ const unsigned int num_bytes = num_points * 2; ++ ++- int i = 0; +++ int i = 0; ++ ++- int bound = num_bytes >> 1; +++ int bound = num_bytes >> 1; ++ ++- for(i = 0; i < bound; ++i) { ++- target0[i] = src0[i] + src1[i]; ++- target1[i] = src0[i] + src2[i]; ++- target2[i] = src0[i] + src3[i]; ++- target3[i] = src0[i] + src4[i]; ++- } +++ for (i = 0; i < bound; ++i) { +++ target0[i] = src0[i] + src1[i]; +++ target1[i] = src0[i] + src2[i]; +++ target2[i] = src0[i] + src3[i]; +++ target3[i] = src0[i] + src4[i]; +++ } ++ } ++ ++ #endif /* LV_HAVE_GENERIC */ ++diff --git a/kernels/volk/volk_16ic_convert_32fc.h b/kernels/volk/volk_16ic_convert_32fc.h ++index 84f067c..1453724 100644 ++--- a/kernels/volk/volk_16ic_convert_32fc.h +++++ b/kernels/volk/volk_16ic_convert_32fc.h ++@@ -30,8 +30,8 @@ ++ * ++ * Dispatcher Prototype ++ * \code ++- * void volk_16ic_convert_32fc(lv_32fc_t* 
outputVector, const lv_16sc_t* inputVector, unsigned int num_points) ++- * \endcode +++ * void volk_16ic_convert_32fc(lv_32fc_t* outputVector, const lv_16sc_t* inputVector, +++ * unsigned int num_points) \endcode ++ * ++ * \b Inputs ++ * \li inputVector: The complex 16-bit integer input data buffer. ++@@ -51,7 +51,9 @@ ++ #ifdef LV_HAVE_AVX2 ++ #include ++ ++-static inline void volk_16ic_convert_32fc_a_avx2(lv_32fc_t* outputVector, const lv_16sc_t* inputVector, unsigned int num_points) +++static inline void volk_16ic_convert_32fc_a_avx2(lv_32fc_t* outputVector, +++ const lv_16sc_t* inputVector, +++ unsigned int num_points) ++ { ++ const unsigned int avx_iters = num_points / 8; ++ unsigned int number = 0; ++@@ -61,36 +63,36 @@ static inline void volk_16ic_convert_32fc_a_avx2(lv_32fc_t* outputVector, const ++ __m256i outValInt; ++ __m128i cplxValue; ++ ++- for(number = 0; number < avx_iters; number++) ++- { ++- cplxValue = _mm_load_si128((__m128i*)complexVectorPtr); ++- complexVectorPtr += 8; ++- ++- outValInt = _mm256_cvtepi16_epi32(cplxValue); ++- outVal = _mm256_cvtepi32_ps(outValInt); ++- _mm256_store_ps((float*)outputVectorPtr, outVal); +++ for (number = 0; number < avx_iters; number++) { +++ cplxValue = _mm_load_si128((__m128i*)complexVectorPtr); +++ complexVectorPtr += 8; ++ ++- outputVectorPtr += 8; ++- } +++ outValInt = _mm256_cvtepi16_epi32(cplxValue); +++ outVal = _mm256_cvtepi32_ps(outValInt); +++ _mm256_store_ps((float*)outputVectorPtr, outVal); +++ +++ outputVectorPtr += 8; +++ } ++ ++ number = avx_iters * 8; ++- for(; number < num_points*2; number++) ++- { ++- *outputVectorPtr++ = (float)*complexVectorPtr++; ++- } +++ for (; number < num_points * 2; number++) { +++ *outputVectorPtr++ = (float)*complexVectorPtr++; +++ } ++ } ++ ++ #endif /* LV_HAVE_AVX2 */ ++ ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void volk_16ic_convert_32fc_generic(lv_32fc_t* outputVector, const lv_16sc_t* inputVector, unsigned int num_points) +++static inline void volk_16ic_convert_32fc_generic(lv_32fc_t* outputVector, +++ const lv_16sc_t* inputVector, +++ unsigned int num_points) ++ { ++ unsigned int i; ++- for(i = 0; i < num_points; i++) ++- { ++- outputVector[i] = lv_cmake((float)lv_creal(inputVector[i]), (float)lv_cimag(inputVector[i])); ++- } +++ for (i = 0; i < num_points; i++) { +++ outputVector[i] = +++ lv_cmake((float)lv_creal(inputVector[i]), (float)lv_cimag(inputVector[i])); +++ } ++ } ++ ++ #endif /* LV_HAVE_GENERIC */ ++@@ -99,7 +101,9 @@ static inline void volk_16ic_convert_32fc_generic(lv_32fc_t* outputVector, const ++ #ifdef LV_HAVE_SSE2 ++ #include ++ ++-static inline void volk_16ic_convert_32fc_a_sse2(lv_32fc_t* outputVector, const lv_16sc_t* inputVector, unsigned int num_points) +++static inline void volk_16ic_convert_32fc_a_sse2(lv_32fc_t* outputVector, +++ const lv_16sc_t* inputVector, +++ unsigned int num_points) ++ { ++ const unsigned int sse_iters = num_points / 2; ++ ++@@ -108,18 +112,21 @@ static inline void volk_16ic_convert_32fc_a_sse2(lv_32fc_t* outputVector, const ++ __m128 a; ++ unsigned int number; ++ ++- for(number = 0; number < sse_iters; number++) ++- { ++- a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg ++- _mm_store_ps((float*)_out, a); ++- _in += 2; ++- _out += 2; ++- } ++- if (num_points & 1) ++- { ++- *_out++ = lv_cmake((float)lv_creal(*_in), (float)lv_cimag(*_in)); ++- _in++; ++- } +++ for (number = 0; number < sse_iters; 
number++) { +++ a = _mm_set_ps( +++ (float)(lv_cimag(_in[1])), +++ (float)(lv_creal(_in[1])), +++ (float)(lv_cimag(_in[0])), +++ (float)(lv_creal( +++ _in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg +++ _mm_store_ps((float*)_out, a); +++ _in += 2; +++ _out += 2; +++ } +++ if (num_points & 1) { +++ *_out++ = lv_cmake((float)lv_creal(*_in), (float)lv_cimag(*_in)); +++ _in++; +++ } ++ } ++ ++ #endif /* LV_HAVE_SSE2 */ ++@@ -127,7 +134,9 @@ static inline void volk_16ic_convert_32fc_a_sse2(lv_32fc_t* outputVector, const ++ #ifdef LV_HAVE_AVX ++ #include ++ ++-static inline void volk_16ic_convert_32fc_a_avx(lv_32fc_t* outputVector, const lv_16sc_t* inputVector, unsigned int num_points) +++static inline void volk_16ic_convert_32fc_a_avx(lv_32fc_t* outputVector, +++ const lv_16sc_t* inputVector, +++ unsigned int num_points) ++ { ++ const unsigned int sse_iters = num_points / 4; ++ ++@@ -136,19 +145,26 @@ static inline void volk_16ic_convert_32fc_a_avx(lv_32fc_t* outputVector, const l ++ __m256 a; ++ unsigned int i, number; ++ ++- for(number = 0; number < sse_iters; number++) ++- { ++- a = _mm256_set_ps((float)(lv_cimag(_in[3])), (float)(lv_creal(_in[3])), (float)(lv_cimag(_in[2])), (float)(lv_creal(_in[2])), (float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg ++- _mm256_store_ps((float*)_out, a); ++- _in += 4; ++- _out += 4; ++- } +++ for (number = 0; number < sse_iters; number++) { +++ a = _mm256_set_ps( +++ (float)(lv_cimag(_in[3])), +++ (float)(lv_creal(_in[3])), +++ (float)(lv_cimag(_in[2])), +++ (float)(lv_creal(_in[2])), +++ (float)(lv_cimag(_in[1])), +++ (float)(lv_creal(_in[1])), +++ (float)(lv_cimag(_in[0])), +++ (float)(lv_creal( +++ _in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg +++ _mm256_store_ps((float*)_out, a); +++ _in += 4; +++ _out += 4; +++ } ++ _mm256_zeroupper(); ++- for (i = 0; i < (num_points % 4); ++i) ++- { ++- *_out++ = lv_cmake((float)lv_creal(*_in), (float)lv_cimag(*_in)); ++- _in++; ++- } +++ for (i = 0; i < (num_points % 4); ++i) { +++ *_out++ = lv_cmake((float)lv_creal(*_in), (float)lv_cimag(*_in)); +++ _in++; +++ } ++ } ++ ++ #endif /* LV_HAVE_AVX */ ++@@ -157,7 +173,9 @@ static inline void volk_16ic_convert_32fc_a_avx(lv_32fc_t* outputVector, const l ++ #ifdef LV_HAVE_NEON ++ #include ++ ++-static inline void volk_16ic_convert_32fc_neon(lv_32fc_t* outputVector, const lv_16sc_t* inputVector, unsigned int num_points) +++static inline void volk_16ic_convert_32fc_neon(lv_32fc_t* outputVector, +++ const lv_16sc_t* inputVector, +++ unsigned int num_points) ++ { ++ const unsigned int sse_iters = num_points / 2; ++ ++@@ -169,21 +187,19 @@ static inline void volk_16ic_convert_32fc_neon(lv_32fc_t* outputVector, const lv ++ float32x4_t f32x4; ++ unsigned int i, number; ++ ++- for(number = 0; number < sse_iters; number++) ++- { ++- a16x4 = vld1_s16((const int16_t*)_in); ++- __VOLK_PREFETCH(_in + 4); ++- a32x4 = vmovl_s16(a16x4); ++- f32x4 = vcvtq_f32_s32(a32x4); ++- vst1q_f32((float32_t*)_out, f32x4); ++- _in += 2; ++- _out += 2; ++- } ++- for (i = 0; i < (num_points % 2); ++i) ++- { ++- *_out++ = lv_cmake((float)lv_creal(*_in), (float)lv_cimag(*_in)); ++- _in++; ++- } +++ for (number = 0; number < sse_iters; number++) { +++ a16x4 = vld1_s16((const int16_t*)_in); +++ __VOLK_PREFETCH(_in + 4); +++ a32x4 = vmovl_s16(a16x4); +++ f32x4 = vcvtq_f32_s32(a32x4); +++ vst1q_f32((float32_t*)_out, f32x4); +++ _in += 2; +++ _out 
+= 2; +++ } +++ for (i = 0; i < (num_points % 2); ++i) { +++ *_out++ = lv_cmake((float)lv_creal(*_in), (float)lv_cimag(*_in)); +++ _in++; +++ } ++ } ++ #endif /* LV_HAVE_NEON */ ++ ++@@ -198,7 +214,9 @@ static inline void volk_16ic_convert_32fc_neon(lv_32fc_t* outputVector, const lv ++ #ifdef LV_HAVE_AVX2 ++ #include ++ ++-static inline void volk_16ic_convert_32fc_u_avx2(lv_32fc_t* outputVector, const lv_16sc_t* inputVector, unsigned int num_points) +++static inline void volk_16ic_convert_32fc_u_avx2(lv_32fc_t* outputVector, +++ const lv_16sc_t* inputVector, +++ unsigned int num_points) ++ { ++ const unsigned int avx_iters = num_points / 8; ++ unsigned int number = 0; ++@@ -208,23 +226,21 @@ static inline void volk_16ic_convert_32fc_u_avx2(lv_32fc_t* outputVector, const ++ __m256i outValInt; ++ __m128i cplxValue; ++ ++- for(number = 0; number < avx_iters; number++) ++- { ++- cplxValue = _mm_loadu_si128((__m128i*)complexVectorPtr); ++- complexVectorPtr += 8; ++- ++- outValInt = _mm256_cvtepi16_epi32(cplxValue); ++- outVal = _mm256_cvtepi32_ps(outValInt); ++- _mm256_storeu_ps((float*)outputVectorPtr, outVal); +++ for (number = 0; number < avx_iters; number++) { +++ cplxValue = _mm_loadu_si128((__m128i*)complexVectorPtr); +++ complexVectorPtr += 8; +++ +++ outValInt = _mm256_cvtepi16_epi32(cplxValue); +++ outVal = _mm256_cvtepi32_ps(outValInt); +++ _mm256_storeu_ps((float*)outputVectorPtr, outVal); ++ ++- outputVectorPtr += 8; ++- } +++ outputVectorPtr += 8; +++ } ++ ++ number = avx_iters * 8; ++- for(; number < num_points*2; number++) ++- { ++- *outputVectorPtr++ = (float)*complexVectorPtr++; ++- } +++ for (; number < num_points * 2; number++) { +++ *outputVectorPtr++ = (float)*complexVectorPtr++; +++ } ++ } ++ ++ #endif /* LV_HAVE_AVX2 */ ++@@ -232,7 +248,9 @@ static inline void volk_16ic_convert_32fc_u_avx2(lv_32fc_t* outputVector, const ++ #ifdef LV_HAVE_SSE2 ++ #include ++ ++-static inline void volk_16ic_convert_32fc_u_sse2(lv_32fc_t* outputVector, const lv_16sc_t* inputVector, unsigned int num_points) +++static inline void volk_16ic_convert_32fc_u_sse2(lv_32fc_t* outputVector, +++ const lv_16sc_t* inputVector, +++ unsigned int num_points) ++ { ++ const unsigned int sse_iters = num_points / 2; ++ ++@@ -241,18 +259,21 @@ static inline void volk_16ic_convert_32fc_u_sse2(lv_32fc_t* outputVector, const ++ __m128 a; ++ unsigned int number; ++ ++- for(number = 0; number < sse_iters; number++) ++- { ++- a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg ++- _mm_storeu_ps((float*)_out, a); ++- _in += 2; ++- _out += 2; ++- } ++- if (num_points & 1) ++- { ++- *_out++ = lv_cmake((float)lv_creal(*_in), (float)lv_cimag(*_in)); ++- _in++; ++- } +++ for (number = 0; number < sse_iters; number++) { +++ a = _mm_set_ps( +++ (float)(lv_cimag(_in[1])), +++ (float)(lv_creal(_in[1])), +++ (float)(lv_cimag(_in[0])), +++ (float)(lv_creal( +++ _in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg +++ _mm_storeu_ps((float*)_out, a); +++ _in += 2; +++ _out += 2; +++ } +++ if (num_points & 1) { +++ *_out++ = lv_cmake((float)lv_creal(*_in), (float)lv_cimag(*_in)); +++ _in++; +++ } ++ } ++ ++ #endif /* LV_HAVE_SSE2 */ ++@@ -261,7 +282,9 @@ static inline void volk_16ic_convert_32fc_u_sse2(lv_32fc_t* outputVector, const ++ #ifdef LV_HAVE_AVX ++ #include ++ ++-static inline void volk_16ic_convert_32fc_u_avx(lv_32fc_t* outputVector, const lv_16sc_t* inputVector, unsigned int 
num_points) +++static inline void volk_16ic_convert_32fc_u_avx(lv_32fc_t* outputVector, +++ const lv_16sc_t* inputVector, +++ unsigned int num_points) ++ { ++ const unsigned int sse_iters = num_points / 4; ++ ++@@ -270,21 +293,27 @@ static inline void volk_16ic_convert_32fc_u_avx(lv_32fc_t* outputVector, const l ++ __m256 a; ++ unsigned int i, number; ++ ++- for(number = 0; number < sse_iters; number++) ++- { ++- a = _mm256_set_ps((float)(lv_cimag(_in[3])), (float)(lv_creal(_in[3])), (float)(lv_cimag(_in[2])), (float)(lv_creal(_in[2])), (float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg ++- _mm256_storeu_ps((float*)_out, a); ++- _in += 4; ++- _out += 4; ++- } +++ for (number = 0; number < sse_iters; number++) { +++ a = _mm256_set_ps( +++ (float)(lv_cimag(_in[3])), +++ (float)(lv_creal(_in[3])), +++ (float)(lv_cimag(_in[2])), +++ (float)(lv_creal(_in[2])), +++ (float)(lv_cimag(_in[1])), +++ (float)(lv_creal(_in[1])), +++ (float)(lv_cimag(_in[0])), +++ (float)(lv_creal( +++ _in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg +++ _mm256_storeu_ps((float*)_out, a); +++ _in += 4; +++ _out += 4; +++ } ++ _mm256_zeroupper(); ++- for (i = 0; i < (num_points % 4); ++i) ++- { ++- *_out++ = lv_cmake((float)lv_creal(*_in), (float)lv_cimag(*_in)); ++- _in++; ++- } +++ for (i = 0; i < (num_points % 4); ++i) { +++ *_out++ = lv_cmake((float)lv_creal(*_in), (float)lv_cimag(*_in)); +++ _in++; +++ } ++ } ++ ++ #endif /* LV_HAVE_AVX */ ++ #endif /* INCLUDED_volk_32fc_convert_16ic_u_H */ ++- ++diff --git a/kernels/volk/volk_16ic_deinterleave_16i_x2.h b/kernels/volk/volk_16ic_deinterleave_16i_x2.h ++index 40d10b4..9e784a6 100644 ++--- a/kernels/volk/volk_16ic_deinterleave_16i_x2.h +++++ b/kernels/volk/volk_16ic_deinterleave_16i_x2.h ++@@ -29,8 +29,8 @@ ++ * ++ * Dispatcher Prototype ++ * \code ++- * void volk_16ic_deinterleave_16i_x2(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points) ++- * \endcode +++ * void volk_16ic_deinterleave_16i_x2(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* +++ * complexVector, unsigned int num_points) \endcode ++ * ++ * \b Inputs ++ * \li complexVector: The complex input vector. 
++@@ -59,179 +59,241 @@ ++ #ifdef LV_HAVE_AVX2 ++ #include ++ ++-static inline void ++-volk_16ic_deinterleave_16i_x2_a_avx2(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points) +++static inline void volk_16ic_deinterleave_16i_x2_a_avx2(int16_t* iBuffer, +++ int16_t* qBuffer, +++ const lv_16sc_t* complexVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const int8_t* complexVectorPtr = (int8_t*)complexVector; ++- int16_t* iBufferPtr = iBuffer; ++- int16_t* qBufferPtr = qBuffer; ++- ++- __m256i MoveMask = _mm256_set_epi8(15,14,11,10,7,6,3,2,13,12,9,8,5,4,1,0, 15,14,11,10,7,6,3,2,13,12,9,8,5,4,1,0); ++- ++- __m256i iMove2, iMove1; ++- __m256i complexVal1, complexVal2, iOutputVal, qOutputVal; ++- ++- unsigned int sixteenthPoints = num_points / 16; ++- ++- for(number = 0; number < sixteenthPoints; number++){ ++- complexVal1 = _mm256_load_si256((__m256i*)complexVectorPtr); complexVectorPtr += 32; ++- complexVal2 = _mm256_load_si256((__m256i*)complexVectorPtr); complexVectorPtr += 32; ++- ++- iMove2 = _mm256_shuffle_epi8(complexVal2, MoveMask); ++- iMove1 = _mm256_shuffle_epi8(complexVal1, MoveMask); ++- ++- iOutputVal = _mm256_permute2x128_si256(_mm256_permute4x64_epi64(iMove1,0x08),_mm256_permute4x64_epi64(iMove2,0x80),0x30); ++- qOutputVal = _mm256_permute2x128_si256(_mm256_permute4x64_epi64(iMove1,0x0d),_mm256_permute4x64_epi64(iMove2,0xd0),0x30); ++- ++- _mm256_store_si256((__m256i*)iBufferPtr, iOutputVal); ++- _mm256_store_si256((__m256i*)qBufferPtr, qOutputVal); ++- ++- iBufferPtr += 16; ++- qBufferPtr += 16; ++- } ++- ++- number = sixteenthPoints * 16; ++- int16_t* int16ComplexVectorPtr = (int16_t*)complexVectorPtr; ++- for(; number < num_points; number++){ ++- *iBufferPtr++ = *int16ComplexVectorPtr++; ++- *qBufferPtr++ = *int16ComplexVectorPtr++; ++- } +++ unsigned int number = 0; +++ const int8_t* complexVectorPtr = (int8_t*)complexVector; +++ int16_t* iBufferPtr = iBuffer; +++ int16_t* qBufferPtr = qBuffer; +++ +++ __m256i MoveMask = _mm256_set_epi8(15, +++ 14, +++ 11, +++ 10, +++ 7, +++ 6, +++ 3, +++ 2, +++ 13, +++ 12, +++ 9, +++ 8, +++ 5, +++ 4, +++ 1, +++ 0, +++ 15, +++ 14, +++ 11, +++ 10, +++ 7, +++ 6, +++ 3, +++ 2, +++ 13, +++ 12, +++ 9, +++ 8, +++ 5, +++ 4, +++ 1, +++ 0); +++ +++ __m256i iMove2, iMove1; +++ __m256i complexVal1, complexVal2, iOutputVal, qOutputVal; +++ +++ unsigned int sixteenthPoints = num_points / 16; +++ +++ for (number = 0; number < sixteenthPoints; number++) { +++ complexVal1 = _mm256_load_si256((__m256i*)complexVectorPtr); +++ complexVectorPtr += 32; +++ complexVal2 = _mm256_load_si256((__m256i*)complexVectorPtr); +++ complexVectorPtr += 32; +++ +++ iMove2 = _mm256_shuffle_epi8(complexVal2, MoveMask); +++ iMove1 = _mm256_shuffle_epi8(complexVal1, MoveMask); +++ +++ iOutputVal = _mm256_permute2x128_si256(_mm256_permute4x64_epi64(iMove1, 0x08), +++ _mm256_permute4x64_epi64(iMove2, 0x80), +++ 0x30); +++ qOutputVal = _mm256_permute2x128_si256(_mm256_permute4x64_epi64(iMove1, 0x0d), +++ _mm256_permute4x64_epi64(iMove2, 0xd0), +++ 0x30); +++ +++ _mm256_store_si256((__m256i*)iBufferPtr, iOutputVal); +++ _mm256_store_si256((__m256i*)qBufferPtr, qOutputVal); +++ +++ iBufferPtr += 16; +++ qBufferPtr += 16; +++ } +++ +++ number = sixteenthPoints * 16; +++ int16_t* int16ComplexVectorPtr = (int16_t*)complexVectorPtr; +++ for (; number < num_points; number++) { +++ *iBufferPtr++ = *int16ComplexVectorPtr++; +++ *qBufferPtr++ = *int16ComplexVectorPtr++; +++ } ++ } ++ #endif /* LV_HAVE_AVX2 */ ++ ++ #ifdef 
LV_HAVE_SSSE3 ++ #include ++ ++-static inline void ++-volk_16ic_deinterleave_16i_x2_a_ssse3(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points) +++static inline void volk_16ic_deinterleave_16i_x2_a_ssse3(int16_t* iBuffer, +++ int16_t* qBuffer, +++ const lv_16sc_t* complexVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const int8_t* complexVectorPtr = (int8_t*)complexVector; ++- int16_t* iBufferPtr = iBuffer; ++- int16_t* qBufferPtr = qBuffer; ++- ++- __m128i iMoveMask1 = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0); ++- __m128i iMoveMask2 = _mm_set_epi8(13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80); ++- ++- __m128i qMoveMask1 = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 15, 14, 11, 10, 7, 6, 3, 2); ++- __m128i qMoveMask2 = _mm_set_epi8(15, 14, 11, 10, 7, 6, 3, 2, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80); ++- ++- __m128i complexVal1, complexVal2, iOutputVal, qOutputVal; ++- ++- unsigned int eighthPoints = num_points / 8; ++- ++- for(number = 0; number < eighthPoints; number++){ ++- complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; ++- complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; ++- ++- iOutputVal = _mm_or_si128( _mm_shuffle_epi8(complexVal1, iMoveMask1) , _mm_shuffle_epi8(complexVal2, iMoveMask2)); ++- qOutputVal = _mm_or_si128( _mm_shuffle_epi8(complexVal1, qMoveMask1) , _mm_shuffle_epi8(complexVal2, qMoveMask2)); ++- ++- _mm_store_si128((__m128i*)iBufferPtr, iOutputVal); ++- _mm_store_si128((__m128i*)qBufferPtr, qOutputVal); ++- ++- iBufferPtr += 8; ++- qBufferPtr += 8; ++- } ++- ++- number = eighthPoints * 8; ++- int16_t* int16ComplexVectorPtr = (int16_t*)complexVectorPtr; ++- for(; number < num_points; number++){ ++- *iBufferPtr++ = *int16ComplexVectorPtr++; ++- *qBufferPtr++ = *int16ComplexVectorPtr++; ++- } +++ unsigned int number = 0; +++ const int8_t* complexVectorPtr = (int8_t*)complexVector; +++ int16_t* iBufferPtr = iBuffer; +++ int16_t* qBufferPtr = qBuffer; +++ +++ __m128i iMoveMask1 = _mm_set_epi8( +++ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0); +++ __m128i iMoveMask2 = _mm_set_epi8( +++ 13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80); +++ +++ __m128i qMoveMask1 = _mm_set_epi8( +++ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 15, 14, 11, 10, 7, 6, 3, 2); +++ __m128i qMoveMask2 = _mm_set_epi8( +++ 15, 14, 11, 10, 7, 6, 3, 2, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80); +++ +++ __m128i complexVal1, complexVal2, iOutputVal, qOutputVal; +++ +++ unsigned int eighthPoints = num_points / 8; +++ +++ for (number = 0; number < eighthPoints; number++) { +++ complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr); +++ complexVectorPtr += 16; +++ complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr); +++ complexVectorPtr += 16; +++ +++ iOutputVal = _mm_or_si128(_mm_shuffle_epi8(complexVal1, iMoveMask1), +++ _mm_shuffle_epi8(complexVal2, iMoveMask2)); +++ qOutputVal = _mm_or_si128(_mm_shuffle_epi8(complexVal1, qMoveMask1), +++ _mm_shuffle_epi8(complexVal2, qMoveMask2)); +++ +++ _mm_store_si128((__m128i*)iBufferPtr, iOutputVal); +++ _mm_store_si128((__m128i*)qBufferPtr, qOutputVal); +++ +++ iBufferPtr += 8; +++ qBufferPtr += 8; +++ } +++ +++ number = eighthPoints * 8; +++ int16_t* int16ComplexVectorPtr = (int16_t*)complexVectorPtr; +++ for (; number < num_points; number++) { +++ *iBufferPtr++ = 
*int16ComplexVectorPtr++; +++ *qBufferPtr++ = *int16ComplexVectorPtr++; +++ } ++ } ++ #endif /* LV_HAVE_SSSE3 */ ++ ++ #ifdef LV_HAVE_SSE2 ++ #include ++ ++-static inline void ++-volk_16ic_deinterleave_16i_x2_a_sse2(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points) +++static inline void volk_16ic_deinterleave_16i_x2_a_sse2(int16_t* iBuffer, +++ int16_t* qBuffer, +++ const lv_16sc_t* complexVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const int16_t* complexVectorPtr = (int16_t*)complexVector; ++- int16_t* iBufferPtr = iBuffer; ++- int16_t* qBufferPtr = qBuffer; ++- __m128i complexVal1, complexVal2, iComplexVal1, iComplexVal2, qComplexVal1, qComplexVal2, iOutputVal, qOutputVal; ++- __m128i lowMask = _mm_set_epi32(0x0, 0x0, 0xFFFFFFFF, 0xFFFFFFFF); ++- __m128i highMask = _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0x0, 0x0); +++ unsigned int number = 0; +++ const int16_t* complexVectorPtr = (int16_t*)complexVector; +++ int16_t* iBufferPtr = iBuffer; +++ int16_t* qBufferPtr = qBuffer; +++ __m128i complexVal1, complexVal2, iComplexVal1, iComplexVal2, qComplexVal1, +++ qComplexVal2, iOutputVal, qOutputVal; +++ __m128i lowMask = _mm_set_epi32(0x0, 0x0, 0xFFFFFFFF, 0xFFFFFFFF); +++ __m128i highMask = _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0x0, 0x0); ++ ++- unsigned int eighthPoints = num_points / 8; +++ unsigned int eighthPoints = num_points / 8; ++ ++- for(number = 0; number < eighthPoints; number++){ ++- complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 8; ++- complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 8; +++ for (number = 0; number < eighthPoints; number++) { +++ complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr); +++ complexVectorPtr += 8; +++ complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr); +++ complexVectorPtr += 8; ++ ++- iComplexVal1 = _mm_shufflelo_epi16(complexVal1, _MM_SHUFFLE(3,1,2,0)); +++ iComplexVal1 = _mm_shufflelo_epi16(complexVal1, _MM_SHUFFLE(3, 1, 2, 0)); ++ ++- iComplexVal1 = _mm_shufflehi_epi16(iComplexVal1, _MM_SHUFFLE(3,1,2,0)); +++ iComplexVal1 = _mm_shufflehi_epi16(iComplexVal1, _MM_SHUFFLE(3, 1, 2, 0)); ++ ++- iComplexVal1 = _mm_shuffle_epi32(iComplexVal1, _MM_SHUFFLE(3,1,2,0)); +++ iComplexVal1 = _mm_shuffle_epi32(iComplexVal1, _MM_SHUFFLE(3, 1, 2, 0)); ++ ++- iComplexVal2 = _mm_shufflelo_epi16(complexVal2, _MM_SHUFFLE(3,1,2,0)); +++ iComplexVal2 = _mm_shufflelo_epi16(complexVal2, _MM_SHUFFLE(3, 1, 2, 0)); ++ ++- iComplexVal2 = _mm_shufflehi_epi16(iComplexVal2, _MM_SHUFFLE(3,1,2,0)); +++ iComplexVal2 = _mm_shufflehi_epi16(iComplexVal2, _MM_SHUFFLE(3, 1, 2, 0)); ++ ++- iComplexVal2 = _mm_shuffle_epi32(iComplexVal2, _MM_SHUFFLE(2,0,3,1)); +++ iComplexVal2 = _mm_shuffle_epi32(iComplexVal2, _MM_SHUFFLE(2, 0, 3, 1)); ++ ++- iOutputVal = _mm_or_si128(_mm_and_si128(iComplexVal1, lowMask), _mm_and_si128(iComplexVal2, highMask)); +++ iOutputVal = _mm_or_si128(_mm_and_si128(iComplexVal1, lowMask), +++ _mm_and_si128(iComplexVal2, highMask)); ++ ++- _mm_store_si128((__m128i*)iBufferPtr, iOutputVal); +++ _mm_store_si128((__m128i*)iBufferPtr, iOutputVal); ++ ++- qComplexVal1 = _mm_shufflelo_epi16(complexVal1, _MM_SHUFFLE(2,0,3,1)); +++ qComplexVal1 = _mm_shufflelo_epi16(complexVal1, _MM_SHUFFLE(2, 0, 3, 1)); ++ ++- qComplexVal1 = _mm_shufflehi_epi16(qComplexVal1, _MM_SHUFFLE(2,0,3,1)); +++ qComplexVal1 = _mm_shufflehi_epi16(qComplexVal1, _MM_SHUFFLE(2, 0, 3, 1)); ++ ++- qComplexVal1 = _mm_shuffle_epi32(qComplexVal1, _MM_SHUFFLE(3,1,2,0)); +++ 
qComplexVal1 = _mm_shuffle_epi32(qComplexVal1, _MM_SHUFFLE(3, 1, 2, 0)); ++ ++- qComplexVal2 = _mm_shufflelo_epi16(complexVal2, _MM_SHUFFLE(2,0,3,1)); +++ qComplexVal2 = _mm_shufflelo_epi16(complexVal2, _MM_SHUFFLE(2, 0, 3, 1)); ++ ++- qComplexVal2 = _mm_shufflehi_epi16(qComplexVal2, _MM_SHUFFLE(2,0,3,1)); +++ qComplexVal2 = _mm_shufflehi_epi16(qComplexVal2, _MM_SHUFFLE(2, 0, 3, 1)); ++ ++- qComplexVal2 = _mm_shuffle_epi32(qComplexVal2, _MM_SHUFFLE(2,0,3,1)); +++ qComplexVal2 = _mm_shuffle_epi32(qComplexVal2, _MM_SHUFFLE(2, 0, 3, 1)); ++ ++- qOutputVal = _mm_or_si128(_mm_and_si128(qComplexVal1, lowMask), _mm_and_si128(qComplexVal2, highMask)); +++ qOutputVal = _mm_or_si128(_mm_and_si128(qComplexVal1, lowMask), +++ _mm_and_si128(qComplexVal2, highMask)); ++ ++- _mm_store_si128((__m128i*)qBufferPtr, qOutputVal); +++ _mm_store_si128((__m128i*)qBufferPtr, qOutputVal); ++ ++- iBufferPtr += 8; ++- qBufferPtr += 8; ++- } +++ iBufferPtr += 8; +++ qBufferPtr += 8; +++ } ++ ++- number = eighthPoints * 8; ++- for(; number < num_points; number++){ ++- *iBufferPtr++ = *complexVectorPtr++; ++- *qBufferPtr++ = *complexVectorPtr++; ++- } +++ number = eighthPoints * 8; +++ for (; number < num_points; number++) { +++ *iBufferPtr++ = *complexVectorPtr++; +++ *qBufferPtr++ = *complexVectorPtr++; +++ } ++ } ++ #endif /* LV_HAVE_SSE2 */ ++ ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void ++-volk_16ic_deinterleave_16i_x2_generic(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points) +++static inline void volk_16ic_deinterleave_16i_x2_generic(int16_t* iBuffer, +++ int16_t* qBuffer, +++ const lv_16sc_t* complexVector, +++ unsigned int num_points) ++ { ++- const int16_t* complexVectorPtr = (const int16_t*)complexVector; ++- int16_t* iBufferPtr = iBuffer; ++- int16_t* qBufferPtr = qBuffer; ++- unsigned int number; ++- for(number = 0; number < num_points; number++){ ++- *iBufferPtr++ = *complexVectorPtr++; ++- *qBufferPtr++ = *complexVectorPtr++; ++- } +++ const int16_t* complexVectorPtr = (const int16_t*)complexVector; +++ int16_t* iBufferPtr = iBuffer; +++ int16_t* qBufferPtr = qBuffer; +++ unsigned int number; +++ for (number = 0; number < num_points; number++) { +++ *iBufferPtr++ = *complexVectorPtr++; +++ *qBufferPtr++ = *complexVectorPtr++; +++ } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++ #ifdef LV_HAVE_ORC ++ ++-extern void ++-volk_16ic_deinterleave_16i_x2_a_orc_impl(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points); ++-static inline void ++-volk_16ic_deinterleave_16i_x2_u_orc(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points) +++extern void volk_16ic_deinterleave_16i_x2_a_orc_impl(int16_t* iBuffer, +++ int16_t* qBuffer, +++ const lv_16sc_t* complexVector, +++ unsigned int num_points); +++static inline void volk_16ic_deinterleave_16i_x2_u_orc(int16_t* iBuffer, +++ int16_t* qBuffer, +++ const lv_16sc_t* complexVector, +++ unsigned int num_points) ++ { ++- volk_16ic_deinterleave_16i_x2_a_orc_impl(iBuffer, qBuffer, complexVector, num_points); +++ volk_16ic_deinterleave_16i_x2_a_orc_impl(iBuffer, qBuffer, complexVector, num_points); ++ } ++ #endif /* LV_HAVE_ORC */ ++ ++@@ -246,44 +308,83 @@ volk_16ic_deinterleave_16i_x2_u_orc(int16_t* iBuffer, int16_t* qBuffer, const lv ++ #ifdef LV_HAVE_AVX2 ++ #include ++ ++-static inline void ++-volk_16ic_deinterleave_16i_x2_u_avx2(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points) +++static inline void 
volk_16ic_deinterleave_16i_x2_u_avx2(int16_t* iBuffer, +++ int16_t* qBuffer, +++ const lv_16sc_t* complexVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const int8_t* complexVectorPtr = (int8_t*)complexVector; ++- int16_t* iBufferPtr = iBuffer; ++- int16_t* qBufferPtr = qBuffer; ++- ++- __m256i MoveMask = _mm256_set_epi8(15,14,11,10,7,6,3,2,13,12,9,8,5,4,1,0, 15,14,11,10,7,6,3,2,13,12,9,8,5,4,1,0); ++- ++- __m256i iMove2, iMove1; ++- __m256i complexVal1, complexVal2, iOutputVal, qOutputVal; ++- ++- unsigned int sixteenthPoints = num_points / 16; ++- ++- for(number = 0; number < sixteenthPoints; number++){ ++- complexVal1 = _mm256_loadu_si256((__m256i*)complexVectorPtr); complexVectorPtr += 32; ++- complexVal2 = _mm256_loadu_si256((__m256i*)complexVectorPtr); complexVectorPtr += 32; ++- ++- iMove2 = _mm256_shuffle_epi8(complexVal2, MoveMask); ++- iMove1 = _mm256_shuffle_epi8(complexVal1, MoveMask); ++- ++- iOutputVal = _mm256_permute2x128_si256(_mm256_permute4x64_epi64(iMove1,0x08),_mm256_permute4x64_epi64(iMove2,0x80),0x30); ++- qOutputVal = _mm256_permute2x128_si256(_mm256_permute4x64_epi64(iMove1,0x0d),_mm256_permute4x64_epi64(iMove2,0xd0),0x30); ++- ++- _mm256_storeu_si256((__m256i*)iBufferPtr, iOutputVal); ++- _mm256_storeu_si256((__m256i*)qBufferPtr, qOutputVal); ++- ++- iBufferPtr += 16; ++- qBufferPtr += 16; ++- } ++- ++- number = sixteenthPoints * 16; ++- int16_t* int16ComplexVectorPtr = (int16_t*)complexVectorPtr; ++- for(; number < num_points; number++){ ++- *iBufferPtr++ = *int16ComplexVectorPtr++; ++- *qBufferPtr++ = *int16ComplexVectorPtr++; ++- } +++ unsigned int number = 0; +++ const int8_t* complexVectorPtr = (int8_t*)complexVector; +++ int16_t* iBufferPtr = iBuffer; +++ int16_t* qBufferPtr = qBuffer; +++ +++ __m256i MoveMask = _mm256_set_epi8(15, +++ 14, +++ 11, +++ 10, +++ 7, +++ 6, +++ 3, +++ 2, +++ 13, +++ 12, +++ 9, +++ 8, +++ 5, +++ 4, +++ 1, +++ 0, +++ 15, +++ 14, +++ 11, +++ 10, +++ 7, +++ 6, +++ 3, +++ 2, +++ 13, +++ 12, +++ 9, +++ 8, +++ 5, +++ 4, +++ 1, +++ 0); +++ +++ __m256i iMove2, iMove1; +++ __m256i complexVal1, complexVal2, iOutputVal, qOutputVal; +++ +++ unsigned int sixteenthPoints = num_points / 16; +++ +++ for (number = 0; number < sixteenthPoints; number++) { +++ complexVal1 = _mm256_loadu_si256((__m256i*)complexVectorPtr); +++ complexVectorPtr += 32; +++ complexVal2 = _mm256_loadu_si256((__m256i*)complexVectorPtr); +++ complexVectorPtr += 32; +++ +++ iMove2 = _mm256_shuffle_epi8(complexVal2, MoveMask); +++ iMove1 = _mm256_shuffle_epi8(complexVal1, MoveMask); +++ +++ iOutputVal = _mm256_permute2x128_si256(_mm256_permute4x64_epi64(iMove1, 0x08), +++ _mm256_permute4x64_epi64(iMove2, 0x80), +++ 0x30); +++ qOutputVal = _mm256_permute2x128_si256(_mm256_permute4x64_epi64(iMove1, 0x0d), +++ _mm256_permute4x64_epi64(iMove2, 0xd0), +++ 0x30); +++ +++ _mm256_storeu_si256((__m256i*)iBufferPtr, iOutputVal); +++ _mm256_storeu_si256((__m256i*)qBufferPtr, qOutputVal); +++ +++ iBufferPtr += 16; +++ qBufferPtr += 16; +++ } +++ +++ number = sixteenthPoints * 16; +++ int16_t* int16ComplexVectorPtr = (int16_t*)complexVectorPtr; +++ for (; number < num_points; number++) { +++ *iBufferPtr++ = *int16ComplexVectorPtr++; +++ *qBufferPtr++ = *int16ComplexVectorPtr++; +++ } ++ } ++ #endif /* LV_HAVE_AVX2 */ ++ ++diff --git a/kernels/volk/volk_16ic_deinterleave_real_16i.h b/kernels/volk/volk_16ic_deinterleave_real_16i.h ++index c1de553..45fcd99 100644 ++--- a/kernels/volk/volk_16ic_deinterleave_real_16i.h +++++ 
b/kernels/volk/volk_16ic_deinterleave_real_16i.h ++@@ -25,12 +25,13 @@ ++ * ++ * \b Overview ++ * ++- * Deinterleaves the complex 16 bit vector and returns the real (inphase) part of the signal. +++ * Deinterleaves the complex 16 bit vector and returns the real (inphase) part of the +++ * signal. ++ * ++ * Dispatcher Prototype ++ * \code ++- * void volk_16ic_deinterleave_real_16i(int16_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points) ++- * \endcode +++ * void volk_16ic_deinterleave_real_16i(int16_t* iBuffer, const lv_16sc_t* complexVector, +++ * unsigned int num_points) \endcode ++ * ++ * \b Inputs ++ * \li complexVector: The complex input vector. ++@@ -60,79 +61,149 @@ ++ #ifdef LV_HAVE_AVX2 ++ #include ++ ++-static inline void ++-volk_16ic_deinterleave_real_16i_a_avx2(int16_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points) +++static inline void volk_16ic_deinterleave_real_16i_a_avx2(int16_t* iBuffer, +++ const lv_16sc_t* complexVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const int16_t* complexVectorPtr = (int16_t*)complexVector; ++- int16_t* iBufferPtr = iBuffer; ++- ++- __m256i iMoveMask1 = _mm256_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0); ++- __m256i iMoveMask2 = _mm256_set_epi8(13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80); ++- ++- __m256i complexVal1, complexVal2, iOutputVal; ++- ++- unsigned int sixteenthPoints = num_points / 16; ++- ++- for(number = 0; number < sixteenthPoints; number++){ ++- complexVal1 = _mm256_load_si256((__m256i*)complexVectorPtr); complexVectorPtr += 16; ++- complexVal2 = _mm256_load_si256((__m256i*)complexVectorPtr); complexVectorPtr += 16; ++- ++- complexVal1 = _mm256_shuffle_epi8(complexVal1, iMoveMask1); ++- complexVal2 = _mm256_shuffle_epi8(complexVal2, iMoveMask2); ++- ++- iOutputVal = _mm256_or_si256(complexVal1, complexVal2); ++- iOutputVal = _mm256_permute4x64_epi64(iOutputVal, 0xd8); ++- ++- _mm256_store_si256((__m256i*)iBufferPtr, iOutputVal); ++- ++- iBufferPtr += 16; ++- } ++- ++- number = sixteenthPoints * 16; ++- for(; number < num_points; number++){ ++- *iBufferPtr++ = *complexVectorPtr++; ++- complexVectorPtr++; ++- } +++ unsigned int number = 0; +++ const int16_t* complexVectorPtr = (int16_t*)complexVector; +++ int16_t* iBufferPtr = iBuffer; +++ +++ __m256i iMoveMask1 = _mm256_set_epi8(0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 13, +++ 12, +++ 9, +++ 8, +++ 5, +++ 4, +++ 1, +++ 0, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 13, +++ 12, +++ 9, +++ 8, +++ 5, +++ 4, +++ 1, +++ 0); +++ __m256i iMoveMask2 = _mm256_set_epi8(13, +++ 12, +++ 9, +++ 8, +++ 5, +++ 4, +++ 1, +++ 0, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 13, +++ 12, +++ 9, +++ 8, +++ 5, +++ 4, +++ 1, +++ 0, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80); +++ +++ __m256i complexVal1, complexVal2, iOutputVal; +++ +++ unsigned int sixteenthPoints = num_points / 16; +++ +++ for (number = 0; number < sixteenthPoints; number++) { +++ complexVal1 = _mm256_load_si256((__m256i*)complexVectorPtr); +++ complexVectorPtr += 16; +++ complexVal2 = _mm256_load_si256((__m256i*)complexVectorPtr); +++ complexVectorPtr += 16; +++ +++ complexVal1 = 
_mm256_shuffle_epi8(complexVal1, iMoveMask1); +++ complexVal2 = _mm256_shuffle_epi8(complexVal2, iMoveMask2); +++ +++ iOutputVal = _mm256_or_si256(complexVal1, complexVal2); +++ iOutputVal = _mm256_permute4x64_epi64(iOutputVal, 0xd8); +++ +++ _mm256_store_si256((__m256i*)iBufferPtr, iOutputVal); +++ +++ iBufferPtr += 16; +++ } +++ +++ number = sixteenthPoints * 16; +++ for (; number < num_points; number++) { +++ *iBufferPtr++ = *complexVectorPtr++; +++ complexVectorPtr++; +++ } ++ } ++ #endif /* LV_HAVE_AVX2 */ ++ ++ #ifdef LV_HAVE_SSSE3 ++ #include ++ ++-static inline void ++-volk_16ic_deinterleave_real_16i_a_ssse3(int16_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points) +++static inline void volk_16ic_deinterleave_real_16i_a_ssse3(int16_t* iBuffer, +++ const lv_16sc_t* complexVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const int16_t* complexVectorPtr = (int16_t*)complexVector; ++- int16_t* iBufferPtr = iBuffer; +++ unsigned int number = 0; +++ const int16_t* complexVectorPtr = (int16_t*)complexVector; +++ int16_t* iBufferPtr = iBuffer; ++ ++- __m128i iMoveMask1 = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0); ++- __m128i iMoveMask2 = _mm_set_epi8(13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80); +++ __m128i iMoveMask1 = _mm_set_epi8( +++ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0); +++ __m128i iMoveMask2 = _mm_set_epi8( +++ 13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80); ++ ++- __m128i complexVal1, complexVal2, iOutputVal; +++ __m128i complexVal1, complexVal2, iOutputVal; ++ ++- unsigned int eighthPoints = num_points / 8; +++ unsigned int eighthPoints = num_points / 8; ++ ++- for(number = 0; number < eighthPoints; number++){ ++- complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 8; ++- complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 8; +++ for (number = 0; number < eighthPoints; number++) { +++ complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr); +++ complexVectorPtr += 8; +++ complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr); +++ complexVectorPtr += 8; ++ ++- complexVal1 = _mm_shuffle_epi8(complexVal1, iMoveMask1); ++- complexVal2 = _mm_shuffle_epi8(complexVal2, iMoveMask2); +++ complexVal1 = _mm_shuffle_epi8(complexVal1, iMoveMask1); +++ complexVal2 = _mm_shuffle_epi8(complexVal2, iMoveMask2); ++ ++- iOutputVal = _mm_or_si128(complexVal1, complexVal2); +++ iOutputVal = _mm_or_si128(complexVal1, complexVal2); ++ ++- _mm_store_si128((__m128i*)iBufferPtr, iOutputVal); +++ _mm_store_si128((__m128i*)iBufferPtr, iOutputVal); ++ ++- iBufferPtr += 8; ++- } +++ iBufferPtr += 8; +++ } ++ ++- number = eighthPoints * 8; ++- for(; number < num_points; number++){ ++- *iBufferPtr++ = *complexVectorPtr++; ++- complexVectorPtr++; ++- } +++ number = eighthPoints * 8; +++ for (; number < num_points; number++) { +++ *iBufferPtr++ = *complexVectorPtr++; +++ complexVectorPtr++; +++ } ++ } ++ #endif /* LV_HAVE_SSSE3 */ ++ ++@@ -140,61 +211,66 @@ volk_16ic_deinterleave_real_16i_a_ssse3(int16_t* iBuffer, const lv_16sc_t* compl ++ #ifdef LV_HAVE_SSE2 ++ #include ++ ++-static inline void ++-volk_16ic_deinterleave_real_16i_a_sse2(int16_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points) +++static inline void volk_16ic_deinterleave_real_16i_a_sse2(int16_t* iBuffer, +++ const lv_16sc_t* complexVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 
0; ++- const int16_t* complexVectorPtr = (int16_t*)complexVector; ++- int16_t* iBufferPtr = iBuffer; ++- __m128i complexVal1, complexVal2, iOutputVal; ++- __m128i lowMask = _mm_set_epi32(0x0, 0x0, 0xFFFFFFFF, 0xFFFFFFFF); ++- __m128i highMask = _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0x0, 0x0); +++ unsigned int number = 0; +++ const int16_t* complexVectorPtr = (int16_t*)complexVector; +++ int16_t* iBufferPtr = iBuffer; +++ __m128i complexVal1, complexVal2, iOutputVal; +++ __m128i lowMask = _mm_set_epi32(0x0, 0x0, 0xFFFFFFFF, 0xFFFFFFFF); +++ __m128i highMask = _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0x0, 0x0); ++ ++- unsigned int eighthPoints = num_points / 8; +++ unsigned int eighthPoints = num_points / 8; ++ ++- for(number = 0; number < eighthPoints; number++){ ++- complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 8; ++- complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 8; +++ for (number = 0; number < eighthPoints; number++) { +++ complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr); +++ complexVectorPtr += 8; +++ complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr); +++ complexVectorPtr += 8; ++ ++- complexVal1 = _mm_shufflelo_epi16(complexVal1, _MM_SHUFFLE(3,1,2,0)); +++ complexVal1 = _mm_shufflelo_epi16(complexVal1, _MM_SHUFFLE(3, 1, 2, 0)); ++ ++- complexVal1 = _mm_shufflehi_epi16(complexVal1, _MM_SHUFFLE(3,1,2,0)); +++ complexVal1 = _mm_shufflehi_epi16(complexVal1, _MM_SHUFFLE(3, 1, 2, 0)); ++ ++- complexVal1 = _mm_shuffle_epi32(complexVal1, _MM_SHUFFLE(3,1,2,0)); +++ complexVal1 = _mm_shuffle_epi32(complexVal1, _MM_SHUFFLE(3, 1, 2, 0)); ++ ++- complexVal2 = _mm_shufflelo_epi16(complexVal2, _MM_SHUFFLE(3,1,2,0)); +++ complexVal2 = _mm_shufflelo_epi16(complexVal2, _MM_SHUFFLE(3, 1, 2, 0)); ++ ++- complexVal2 = _mm_shufflehi_epi16(complexVal2, _MM_SHUFFLE(3,1,2,0)); +++ complexVal2 = _mm_shufflehi_epi16(complexVal2, _MM_SHUFFLE(3, 1, 2, 0)); ++ ++- complexVal2 = _mm_shuffle_epi32(complexVal2, _MM_SHUFFLE(2,0,3,1)); +++ complexVal2 = _mm_shuffle_epi32(complexVal2, _MM_SHUFFLE(2, 0, 3, 1)); ++ ++- iOutputVal = _mm_or_si128(_mm_and_si128(complexVal1, lowMask), _mm_and_si128(complexVal2, highMask)); +++ iOutputVal = _mm_or_si128(_mm_and_si128(complexVal1, lowMask), +++ _mm_and_si128(complexVal2, highMask)); ++ ++- _mm_store_si128((__m128i*)iBufferPtr, iOutputVal); +++ _mm_store_si128((__m128i*)iBufferPtr, iOutputVal); ++ ++- iBufferPtr += 8; ++- } +++ iBufferPtr += 8; +++ } ++ ++- number = eighthPoints * 8; ++- for(; number < num_points; number++){ ++- *iBufferPtr++ = *complexVectorPtr++; ++- complexVectorPtr++; ++- } +++ number = eighthPoints * 8; +++ for (; number < num_points; number++) { +++ *iBufferPtr++ = *complexVectorPtr++; +++ complexVectorPtr++; +++ } ++ } ++ #endif /* LV_HAVE_SSE2 */ ++ ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void ++-volk_16ic_deinterleave_real_16i_generic(int16_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points) +++static inline void volk_16ic_deinterleave_real_16i_generic(int16_t* iBuffer, +++ const lv_16sc_t* complexVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const int16_t* complexVectorPtr = (int16_t*)complexVector; ++- int16_t* iBufferPtr = iBuffer; ++- for(number = 0; number < num_points; number++){ ++- *iBufferPtr++ = *complexVectorPtr++; ++- complexVectorPtr++; ++- } +++ unsigned int number = 0; +++ const int16_t* complexVectorPtr = (int16_t*)complexVector; +++ int16_t* iBufferPtr = iBuffer; +++ for (number = 0; number < num_points; 
number++) { +++ *iBufferPtr++ = *complexVectorPtr++; +++ complexVectorPtr++; +++ } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++@@ -212,40 +288,105 @@ volk_16ic_deinterleave_real_16i_generic(int16_t* iBuffer, const lv_16sc_t* compl ++ #ifdef LV_HAVE_AVX2 ++ #include ++ ++-static inline void ++-volk_16ic_deinterleave_real_16i_u_avx2(int16_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points) +++static inline void volk_16ic_deinterleave_real_16i_u_avx2(int16_t* iBuffer, +++ const lv_16sc_t* complexVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const int16_t* complexVectorPtr = (int16_t*)complexVector; ++- int16_t* iBufferPtr = iBuffer; ++- ++- __m256i iMoveMask1 = _mm256_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0); ++- __m256i iMoveMask2 = _mm256_set_epi8(13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80); ++- ++- __m256i complexVal1, complexVal2, iOutputVal; ++- ++- unsigned int sixteenthPoints = num_points / 16; ++- ++- for(number = 0; number < sixteenthPoints; number++){ ++- complexVal1 = _mm256_loadu_si256((__m256i*)complexVectorPtr); complexVectorPtr += 16; ++- complexVal2 = _mm256_loadu_si256((__m256i*)complexVectorPtr); complexVectorPtr += 16; ++- ++- complexVal1 = _mm256_shuffle_epi8(complexVal1, iMoveMask1); ++- complexVal2 = _mm256_shuffle_epi8(complexVal2, iMoveMask2); ++- ++- iOutputVal = _mm256_or_si256(complexVal1, complexVal2); ++- iOutputVal = _mm256_permute4x64_epi64(iOutputVal, 0xd8); ++- ++- _mm256_storeu_si256((__m256i*)iBufferPtr, iOutputVal); ++- ++- iBufferPtr += 16; ++- } ++- ++- number = sixteenthPoints * 16; ++- for(; number < num_points; number++){ ++- *iBufferPtr++ = *complexVectorPtr++; ++- complexVectorPtr++; ++- } +++ unsigned int number = 0; +++ const int16_t* complexVectorPtr = (int16_t*)complexVector; +++ int16_t* iBufferPtr = iBuffer; +++ +++ __m256i iMoveMask1 = _mm256_set_epi8(0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 13, +++ 12, +++ 9, +++ 8, +++ 5, +++ 4, +++ 1, +++ 0, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 13, +++ 12, +++ 9, +++ 8, +++ 5, +++ 4, +++ 1, +++ 0); +++ __m256i iMoveMask2 = _mm256_set_epi8(13, +++ 12, +++ 9, +++ 8, +++ 5, +++ 4, +++ 1, +++ 0, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 13, +++ 12, +++ 9, +++ 8, +++ 5, +++ 4, +++ 1, +++ 0, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80); +++ +++ __m256i complexVal1, complexVal2, iOutputVal; +++ +++ unsigned int sixteenthPoints = num_points / 16; +++ +++ for (number = 0; number < sixteenthPoints; number++) { +++ complexVal1 = _mm256_loadu_si256((__m256i*)complexVectorPtr); +++ complexVectorPtr += 16; +++ complexVal2 = _mm256_loadu_si256((__m256i*)complexVectorPtr); +++ complexVectorPtr += 16; +++ +++ complexVal1 = _mm256_shuffle_epi8(complexVal1, iMoveMask1); +++ complexVal2 = _mm256_shuffle_epi8(complexVal2, iMoveMask2); +++ +++ iOutputVal = _mm256_or_si256(complexVal1, complexVal2); +++ iOutputVal = _mm256_permute4x64_epi64(iOutputVal, 0xd8); +++ +++ _mm256_storeu_si256((__m256i*)iBufferPtr, iOutputVal); +++ +++ iBufferPtr += 16; +++ } +++ +++ number = sixteenthPoints * 16; +++ for (; number < num_points; number++) { +++ *iBufferPtr++ = *complexVectorPtr++; +++ complexVectorPtr++; +++ } ++ } ++ 
#endif /* LV_HAVE_AVX2 */ ++ ++diff --git a/kernels/volk/volk_16ic_deinterleave_real_8i.h b/kernels/volk/volk_16ic_deinterleave_real_8i.h ++index 1022688..3d8e4ea 100644 ++--- a/kernels/volk/volk_16ic_deinterleave_real_8i.h +++++ b/kernels/volk/volk_16ic_deinterleave_real_8i.h ++@@ -30,8 +30,8 @@ ++ * ++ * Dispatcher Prototype ++ * \code ++- * void volk_16ic_deinterleave_real_8i(int8_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points) ++- * \endcode +++ * void volk_16ic_deinterleave_real_8i(int8_t* iBuffer, const lv_16sc_t* complexVector, +++ * unsigned int num_points) \endcode ++ * ++ * \b Inputs ++ * \li complexVector: The complex input vector. ++@@ -61,54 +61,121 @@ ++ #ifdef LV_HAVE_AVX2 ++ #include ++ ++-static inline void ++-volk_16ic_deinterleave_real_8i_a_avx2(int8_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points) +++static inline void volk_16ic_deinterleave_real_8i_a_avx2(int8_t* iBuffer, +++ const lv_16sc_t* complexVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const int8_t* complexVectorPtr = (int8_t*)complexVector; ++- int8_t* iBufferPtr = iBuffer; ++- __m256i iMoveMask1 = _mm256_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0); ++- __m256i iMoveMask2 = _mm256_set_epi8(13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80); ++- __m256i complexVal1, complexVal2, complexVal3, complexVal4, iOutputVal; ++- ++- unsigned int thirtysecondPoints = num_points / 32; ++- ++- for(number = 0; number < thirtysecondPoints; number++){ ++- complexVal1 = _mm256_load_si256((__m256i*)complexVectorPtr); complexVectorPtr += 32; ++- complexVal2 = _mm256_load_si256((__m256i*)complexVectorPtr); complexVectorPtr += 32; ++- ++- complexVal3 = _mm256_load_si256((__m256i*)complexVectorPtr); complexVectorPtr += 32; ++- complexVal4 = _mm256_load_si256((__m256i*)complexVectorPtr); complexVectorPtr += 32; ++- ++- complexVal1 = _mm256_shuffle_epi8(complexVal1, iMoveMask1); ++- complexVal2 = _mm256_shuffle_epi8(complexVal2, iMoveMask2); ++- ++- complexVal1 = _mm256_or_si256(complexVal1, complexVal2); ++- complexVal1 = _mm256_permute4x64_epi64(complexVal1, 0xd8); ++- ++- complexVal3 = _mm256_shuffle_epi8(complexVal3, iMoveMask1); ++- complexVal4 = _mm256_shuffle_epi8(complexVal4, iMoveMask2); ++- ++- complexVal3 = _mm256_or_si256(complexVal3, complexVal4); ++- complexVal3 = _mm256_permute4x64_epi64(complexVal3, 0xd8); ++- ++- complexVal1 = _mm256_srai_epi16(complexVal1, 8); ++- complexVal3 = _mm256_srai_epi16(complexVal3, 8); ++- ++- iOutputVal = _mm256_packs_epi16(complexVal1, complexVal3); ++- iOutputVal = _mm256_permute4x64_epi64(iOutputVal, 0xd8); ++- ++- _mm256_store_si256((__m256i*)iBufferPtr, iOutputVal); ++- ++- iBufferPtr += 32; ++- } ++- ++- number = thirtysecondPoints * 32; ++- int16_t* int16ComplexVectorPtr = (int16_t*)complexVectorPtr; ++- for(; number < num_points; number++){ ++- *iBufferPtr++ = ((int8_t)(*int16ComplexVectorPtr++ >> 8)); ++- int16ComplexVectorPtr++; ++- } +++ unsigned int number = 0; +++ const int8_t* complexVectorPtr = (int8_t*)complexVector; +++ int8_t* iBufferPtr = iBuffer; +++ __m256i iMoveMask1 = _mm256_set_epi8(0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 13, +++ 12, +++ 9, +++ 8, +++ 5, +++ 4, +++ 1, +++ 0, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 
0x80, +++ 13, +++ 12, +++ 9, +++ 8, +++ 5, +++ 4, +++ 1, +++ 0); +++ __m256i iMoveMask2 = _mm256_set_epi8(13, +++ 12, +++ 9, +++ 8, +++ 5, +++ 4, +++ 1, +++ 0, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 13, +++ 12, +++ 9, +++ 8, +++ 5, +++ 4, +++ 1, +++ 0, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80); +++ __m256i complexVal1, complexVal2, complexVal3, complexVal4, iOutputVal; +++ +++ unsigned int thirtysecondPoints = num_points / 32; +++ +++ for (number = 0; number < thirtysecondPoints; number++) { +++ complexVal1 = _mm256_load_si256((__m256i*)complexVectorPtr); +++ complexVectorPtr += 32; +++ complexVal2 = _mm256_load_si256((__m256i*)complexVectorPtr); +++ complexVectorPtr += 32; +++ +++ complexVal3 = _mm256_load_si256((__m256i*)complexVectorPtr); +++ complexVectorPtr += 32; +++ complexVal4 = _mm256_load_si256((__m256i*)complexVectorPtr); +++ complexVectorPtr += 32; +++ +++ complexVal1 = _mm256_shuffle_epi8(complexVal1, iMoveMask1); +++ complexVal2 = _mm256_shuffle_epi8(complexVal2, iMoveMask2); +++ +++ complexVal1 = _mm256_or_si256(complexVal1, complexVal2); +++ complexVal1 = _mm256_permute4x64_epi64(complexVal1, 0xd8); +++ +++ complexVal3 = _mm256_shuffle_epi8(complexVal3, iMoveMask1); +++ complexVal4 = _mm256_shuffle_epi8(complexVal4, iMoveMask2); +++ +++ complexVal3 = _mm256_or_si256(complexVal3, complexVal4); +++ complexVal3 = _mm256_permute4x64_epi64(complexVal3, 0xd8); +++ +++ complexVal1 = _mm256_srai_epi16(complexVal1, 8); +++ complexVal3 = _mm256_srai_epi16(complexVal3, 8); +++ +++ iOutputVal = _mm256_packs_epi16(complexVal1, complexVal3); +++ iOutputVal = _mm256_permute4x64_epi64(iOutputVal, 0xd8); +++ +++ _mm256_store_si256((__m256i*)iBufferPtr, iOutputVal); +++ +++ iBufferPtr += 32; +++ } +++ +++ number = thirtysecondPoints * 32; +++ int16_t* int16ComplexVectorPtr = (int16_t*)complexVectorPtr; +++ for (; number < num_points; number++) { +++ *iBufferPtr++ = ((int8_t)(*int16ComplexVectorPtr++ >> 8)); +++ int16ComplexVectorPtr++; +++ } ++ } ++ #endif /* LV_HAVE_AVX2 */ ++ ++@@ -116,105 +183,116 @@ volk_16ic_deinterleave_real_8i_a_avx2(int8_t* iBuffer, const lv_16sc_t* complexV ++ #ifdef LV_HAVE_SSSE3 ++ #include ++ ++-static inline void ++-volk_16ic_deinterleave_real_8i_a_ssse3(int8_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points) +++static inline void volk_16ic_deinterleave_real_8i_a_ssse3(int8_t* iBuffer, +++ const lv_16sc_t* complexVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const int8_t* complexVectorPtr = (int8_t*)complexVector; ++- int8_t* iBufferPtr = iBuffer; ++- __m128i iMoveMask1 = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0); ++- __m128i iMoveMask2 = _mm_set_epi8(13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80); ++- __m128i complexVal1, complexVal2, complexVal3, complexVal4, iOutputVal; +++ unsigned int number = 0; +++ const int8_t* complexVectorPtr = (int8_t*)complexVector; +++ int8_t* iBufferPtr = iBuffer; +++ __m128i iMoveMask1 = _mm_set_epi8( +++ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0); +++ __m128i iMoveMask2 = _mm_set_epi8( +++ 13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80); +++ __m128i complexVal1, complexVal2, complexVal3, complexVal4, iOutputVal; ++ ++- unsigned int sixteenthPoints = num_points / 16; +++ unsigned int sixteenthPoints = num_points / 16; ++ ++- for(number = 0; number < sixteenthPoints; number++){ 
++- complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; ++- complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; +++ for (number = 0; number < sixteenthPoints; number++) { +++ complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr); +++ complexVectorPtr += 16; +++ complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr); +++ complexVectorPtr += 16; ++ ++- complexVal3 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; ++- complexVal4 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; +++ complexVal3 = _mm_load_si128((__m128i*)complexVectorPtr); +++ complexVectorPtr += 16; +++ complexVal4 = _mm_load_si128((__m128i*)complexVectorPtr); +++ complexVectorPtr += 16; ++ ++- complexVal1 = _mm_shuffle_epi8(complexVal1, iMoveMask1); ++- complexVal2 = _mm_shuffle_epi8(complexVal2, iMoveMask2); +++ complexVal1 = _mm_shuffle_epi8(complexVal1, iMoveMask1); +++ complexVal2 = _mm_shuffle_epi8(complexVal2, iMoveMask2); ++ ++- complexVal1 = _mm_or_si128(complexVal1, complexVal2); +++ complexVal1 = _mm_or_si128(complexVal1, complexVal2); ++ ++- complexVal3 = _mm_shuffle_epi8(complexVal3, iMoveMask1); ++- complexVal4 = _mm_shuffle_epi8(complexVal4, iMoveMask2); +++ complexVal3 = _mm_shuffle_epi8(complexVal3, iMoveMask1); +++ complexVal4 = _mm_shuffle_epi8(complexVal4, iMoveMask2); ++ ++- complexVal3 = _mm_or_si128(complexVal3, complexVal4); +++ complexVal3 = _mm_or_si128(complexVal3, complexVal4); ++ ++ ++- complexVal1 = _mm_srai_epi16(complexVal1, 8); ++- complexVal3 = _mm_srai_epi16(complexVal3, 8); +++ complexVal1 = _mm_srai_epi16(complexVal1, 8); +++ complexVal3 = _mm_srai_epi16(complexVal3, 8); ++ ++- iOutputVal = _mm_packs_epi16(complexVal1, complexVal3); +++ iOutputVal = _mm_packs_epi16(complexVal1, complexVal3); ++ ++- _mm_store_si128((__m128i*)iBufferPtr, iOutputVal); +++ _mm_store_si128((__m128i*)iBufferPtr, iOutputVal); ++ ++- iBufferPtr += 16; ++- } +++ iBufferPtr += 16; +++ } ++ ++- number = sixteenthPoints * 16; ++- int16_t* int16ComplexVectorPtr = (int16_t*)complexVectorPtr; ++- for(; number < num_points; number++){ ++- *iBufferPtr++ = ((int8_t)(*int16ComplexVectorPtr++ >> 8)); ++- int16ComplexVectorPtr++; ++- } +++ number = sixteenthPoints * 16; +++ int16_t* int16ComplexVectorPtr = (int16_t*)complexVectorPtr; +++ for (; number < num_points; number++) { +++ *iBufferPtr++ = ((int8_t)(*int16ComplexVectorPtr++ >> 8)); +++ int16ComplexVectorPtr++; +++ } ++ } ++ #endif /* LV_HAVE_SSSE3 */ ++ ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void ++-volk_16ic_deinterleave_real_8i_generic(int8_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points) +++static inline void volk_16ic_deinterleave_real_8i_generic(int8_t* iBuffer, +++ const lv_16sc_t* complexVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- int16_t* complexVectorPtr = (int16_t*)complexVector; ++- int8_t* iBufferPtr = iBuffer; ++- for(number = 0; number < num_points; number++){ ++- *iBufferPtr++ = ((int8_t)(*complexVectorPtr++ >> 8)); ++- complexVectorPtr++; ++- } +++ unsigned int number = 0; +++ int16_t* complexVectorPtr = (int16_t*)complexVector; +++ int8_t* iBufferPtr = iBuffer; +++ for (number = 0; number < num_points; number++) { +++ *iBufferPtr++ = ((int8_t)(*complexVectorPtr++ >> 8)); +++ complexVectorPtr++; +++ } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++ #ifdef LV_HAVE_NEON ++ #include ++ ++-static inline void ++-volk_16ic_deinterleave_real_8i_neon(int8_t* iBuffer, const lv_16sc_t* complexVector, 
unsigned int num_points) +++static inline void volk_16ic_deinterleave_real_8i_neon(int8_t* iBuffer, +++ const lv_16sc_t* complexVector, +++ unsigned int num_points) ++ { ++- const int16_t* complexVectorPtr = (const int16_t*)complexVector; ++- int8_t* iBufferPtr = iBuffer; ++- unsigned int eighth_points = num_points / 8; ++- unsigned int number; ++- ++- int16x8x2_t complexInput; ++- int8x8_t realOutput; ++- for(number = 0; number < eighth_points; number++){ ++- complexInput = vld2q_s16(complexVectorPtr); ++- realOutput = vshrn_n_s16(complexInput.val[0], 8); ++- vst1_s8(iBufferPtr, realOutput); ++- complexVectorPtr += 16; ++- iBufferPtr += 8; ++- } ++- ++- for(number = eighth_points*8; number < num_points; number++){ ++- *iBufferPtr++ = ((int8_t)(*complexVectorPtr++ >> 8)); ++- complexVectorPtr++; ++- } +++ const int16_t* complexVectorPtr = (const int16_t*)complexVector; +++ int8_t* iBufferPtr = iBuffer; +++ unsigned int eighth_points = num_points / 8; +++ unsigned int number; +++ +++ int16x8x2_t complexInput; +++ int8x8_t realOutput; +++ for (number = 0; number < eighth_points; number++) { +++ complexInput = vld2q_s16(complexVectorPtr); +++ realOutput = vshrn_n_s16(complexInput.val[0], 8); +++ vst1_s8(iBufferPtr, realOutput); +++ complexVectorPtr += 16; +++ iBufferPtr += 8; +++ } +++ +++ for (number = eighth_points * 8; number < num_points; number++) { +++ *iBufferPtr++ = ((int8_t)(*complexVectorPtr++ >> 8)); +++ complexVectorPtr++; +++ } ++ } ++ #endif ++ ++ #ifdef LV_HAVE_ORC ++ ++-extern void ++-volk_16ic_deinterleave_real_8i_a_orc_impl(int8_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points); +++extern void volk_16ic_deinterleave_real_8i_a_orc_impl(int8_t* iBuffer, +++ const lv_16sc_t* complexVector, +++ unsigned int num_points); ++ ++-static inline void ++-volk_16ic_deinterleave_real_8i_u_orc(int8_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points) +++static inline void volk_16ic_deinterleave_real_8i_u_orc(int8_t* iBuffer, +++ const lv_16sc_t* complexVector, +++ unsigned int num_points) ++ { ++ volk_16ic_deinterleave_real_8i_a_orc_impl(iBuffer, complexVector, num_points); ++ } ++@@ -233,54 +311,121 @@ volk_16ic_deinterleave_real_8i_u_orc(int8_t* iBuffer, const lv_16sc_t* complexVe ++ #ifdef LV_HAVE_AVX2 ++ #include ++ ++-static inline void ++-volk_16ic_deinterleave_real_8i_u_avx2(int8_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points) +++static inline void volk_16ic_deinterleave_real_8i_u_avx2(int8_t* iBuffer, +++ const lv_16sc_t* complexVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const int8_t* complexVectorPtr = (int8_t*)complexVector; ++- int8_t* iBufferPtr = iBuffer; ++- __m256i iMoveMask1 = _mm256_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0); ++- __m256i iMoveMask2 = _mm256_set_epi8(13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80); ++- __m256i complexVal1, complexVal2, complexVal3, complexVal4, iOutputVal; ++- ++- unsigned int thirtysecondPoints = num_points / 32; ++- ++- for(number = 0; number < thirtysecondPoints; number++){ ++- complexVal1 = _mm256_loadu_si256((__m256i*)complexVectorPtr); complexVectorPtr += 32; ++- complexVal2 = _mm256_loadu_si256((__m256i*)complexVectorPtr); complexVectorPtr += 32; ++- ++- complexVal3 = _mm256_loadu_si256((__m256i*)complexVectorPtr); complexVectorPtr += 
32; ++- complexVal4 = _mm256_loadu_si256((__m256i*)complexVectorPtr); complexVectorPtr += 32; ++- ++- complexVal1 = _mm256_shuffle_epi8(complexVal1, iMoveMask1); ++- complexVal2 = _mm256_shuffle_epi8(complexVal2, iMoveMask2); ++- ++- complexVal1 = _mm256_or_si256(complexVal1, complexVal2); ++- complexVal1 = _mm256_permute4x64_epi64(complexVal1, 0xd8); ++- ++- complexVal3 = _mm256_shuffle_epi8(complexVal3, iMoveMask1); ++- complexVal4 = _mm256_shuffle_epi8(complexVal4, iMoveMask2); ++- ++- complexVal3 = _mm256_or_si256(complexVal3, complexVal4); ++- complexVal3 = _mm256_permute4x64_epi64(complexVal3, 0xd8); ++- ++- complexVal1 = _mm256_srai_epi16(complexVal1, 8); ++- complexVal3 = _mm256_srai_epi16(complexVal3, 8); ++- ++- iOutputVal = _mm256_packs_epi16(complexVal1, complexVal3); ++- iOutputVal = _mm256_permute4x64_epi64(iOutputVal, 0xd8); ++- ++- _mm256_storeu_si256((__m256i*)iBufferPtr, iOutputVal); ++- ++- iBufferPtr += 32; ++- } ++- ++- number = thirtysecondPoints * 32; ++- int16_t* int16ComplexVectorPtr = (int16_t*)complexVectorPtr; ++- for(; number < num_points; number++){ ++- *iBufferPtr++ = ((int8_t)(*int16ComplexVectorPtr++ >> 8)); ++- int16ComplexVectorPtr++; ++- } +++ unsigned int number = 0; +++ const int8_t* complexVectorPtr = (int8_t*)complexVector; +++ int8_t* iBufferPtr = iBuffer; +++ __m256i iMoveMask1 = _mm256_set_epi8(0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 13, +++ 12, +++ 9, +++ 8, +++ 5, +++ 4, +++ 1, +++ 0, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 13, +++ 12, +++ 9, +++ 8, +++ 5, +++ 4, +++ 1, +++ 0); +++ __m256i iMoveMask2 = _mm256_set_epi8(13, +++ 12, +++ 9, +++ 8, +++ 5, +++ 4, +++ 1, +++ 0, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 13, +++ 12, +++ 9, +++ 8, +++ 5, +++ 4, +++ 1, +++ 0, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80); +++ __m256i complexVal1, complexVal2, complexVal3, complexVal4, iOutputVal; +++ +++ unsigned int thirtysecondPoints = num_points / 32; +++ +++ for (number = 0; number < thirtysecondPoints; number++) { +++ complexVal1 = _mm256_loadu_si256((__m256i*)complexVectorPtr); +++ complexVectorPtr += 32; +++ complexVal2 = _mm256_loadu_si256((__m256i*)complexVectorPtr); +++ complexVectorPtr += 32; +++ +++ complexVal3 = _mm256_loadu_si256((__m256i*)complexVectorPtr); +++ complexVectorPtr += 32; +++ complexVal4 = _mm256_loadu_si256((__m256i*)complexVectorPtr); +++ complexVectorPtr += 32; +++ +++ complexVal1 = _mm256_shuffle_epi8(complexVal1, iMoveMask1); +++ complexVal2 = _mm256_shuffle_epi8(complexVal2, iMoveMask2); +++ +++ complexVal1 = _mm256_or_si256(complexVal1, complexVal2); +++ complexVal1 = _mm256_permute4x64_epi64(complexVal1, 0xd8); +++ +++ complexVal3 = _mm256_shuffle_epi8(complexVal3, iMoveMask1); +++ complexVal4 = _mm256_shuffle_epi8(complexVal4, iMoveMask2); +++ +++ complexVal3 = _mm256_or_si256(complexVal3, complexVal4); +++ complexVal3 = _mm256_permute4x64_epi64(complexVal3, 0xd8); +++ +++ complexVal1 = _mm256_srai_epi16(complexVal1, 8); +++ complexVal3 = _mm256_srai_epi16(complexVal3, 8); +++ +++ iOutputVal = _mm256_packs_epi16(complexVal1, complexVal3); +++ iOutputVal = _mm256_permute4x64_epi64(iOutputVal, 0xd8); +++ +++ _mm256_storeu_si256((__m256i*)iBufferPtr, iOutputVal); +++ +++ iBufferPtr += 32; +++ } +++ +++ number = thirtysecondPoints * 32; +++ int16_t* int16ComplexVectorPtr = (int16_t*)complexVectorPtr; +++ for (; number < num_points; number++) { +++ *iBufferPtr++ = 
((int8_t)(*int16ComplexVectorPtr++ >> 8)); +++ int16ComplexVectorPtr++; +++ } ++ } ++ #endif /* LV_HAVE_AVX2 */ ++ #endif /* INCLUDED_volk_16ic_deinterleave_real_8i_u_H */ ++diff --git a/kernels/volk/volk_16ic_magnitude_16i.h b/kernels/volk/volk_16ic_magnitude_16i.h ++index bbe72a8..35b40cb 100644 ++--- a/kernels/volk/volk_16ic_magnitude_16i.h +++++ b/kernels/volk/volk_16ic_magnitude_16i.h ++@@ -30,8 +30,8 @@ ++ * ++ * Dispatcher Prototype ++ * \code ++- * void volk_16ic_magnitude_16i(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points) ++- * \endcode +++ * void volk_16ic_magnitude_16i(int16_t* magnitudeVector, const lv_16sc_t* complexVector, +++ * unsigned int num_points) \endcode ++ * ++ * \b Inputs ++ * \li complexVector: The complex input vector. ++@@ -54,242 +54,255 @@ ++ #ifndef INCLUDED_volk_16ic_magnitude_16i_a_H ++ #define INCLUDED_volk_16ic_magnitude_16i_a_H ++ ++-#include ++ #include ++-#include ++-#include ++ #include +++#include +++#include +++#include ++ ++ #ifdef LV_HAVE_AVX2 ++ #include ++ ++-static inline void ++-volk_16ic_magnitude_16i_a_avx2(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points) +++static inline void volk_16ic_magnitude_16i_a_avx2(int16_t* magnitudeVector, +++ const lv_16sc_t* complexVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int eighthPoints = num_points / 8; ++- ++- const int16_t* complexVectorPtr = (const int16_t*)complexVector; ++- int16_t* magnitudeVectorPtr = magnitudeVector; ++- ++- __m256 vScalar = _mm256_set1_ps(SHRT_MAX); ++- __m256 invScalar = _mm256_set1_ps(1.0f/SHRT_MAX); ++- __m256i int1, int2; ++- __m128i short1, short2; ++- __m256 cplxValue1, cplxValue2, result; ++- __m256i idx = _mm256_set_epi32(0,0,0,0,5,1,4,0); ++- ++- for(;number < eighthPoints; number++){ ++- ++- int1 = _mm256_load_si256((__m256i*)complexVectorPtr); ++- complexVectorPtr += 16; ++- short1 = _mm256_extracti128_si256(int1,0); ++- short2 = _mm256_extracti128_si256(int1,1); ++- ++- int1 = _mm256_cvtepi16_epi32(short1); ++- int2 = _mm256_cvtepi16_epi32(short2); ++- cplxValue1 = _mm256_cvtepi32_ps(int1); ++- cplxValue2 = _mm256_cvtepi32_ps(int2); ++- ++- cplxValue1 = _mm256_mul_ps(cplxValue1, invScalar); ++- cplxValue2 = _mm256_mul_ps(cplxValue2, invScalar); ++- ++- cplxValue1 = _mm256_mul_ps(cplxValue1, cplxValue1); // Square the values ++- cplxValue2 = _mm256_mul_ps(cplxValue2, cplxValue2); // Square the Values ++- ++- result = _mm256_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values ++- ++- result = _mm256_sqrt_ps(result); // Square root the values ++- ++- result = _mm256_mul_ps(result, vScalar); // Scale the results ++- ++- int1 = _mm256_cvtps_epi32(result); ++- int1 = _mm256_packs_epi32(int1, int1); ++- int1 = _mm256_permutevar8x32_epi32(int1, idx); //permute to compensate for shuffling in hadd and packs ++- short1 = _mm256_extracti128_si256(int1, 0); ++- _mm_store_si128((__m128i*)magnitudeVectorPtr,short1); ++- magnitudeVectorPtr += 8; ++- } ++- ++- number = eighthPoints * 8; ++- magnitudeVectorPtr = &magnitudeVector[number]; ++- complexVectorPtr = (const int16_t*)&complexVector[number]; ++- for(; number < num_points; number++){ ++- const float val1Real = (float)(*complexVectorPtr++) / SHRT_MAX; ++- const float val1Imag = (float)(*complexVectorPtr++) / SHRT_MAX; ++- const float val1Result = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)) * SHRT_MAX; ++- *magnitudeVectorPtr++ = (int16_t)rintf(val1Result); ++- } +++ unsigned int number = 0; +++ const 
unsigned int eighthPoints = num_points / 8; +++ +++ const int16_t* complexVectorPtr = (const int16_t*)complexVector; +++ int16_t* magnitudeVectorPtr = magnitudeVector; +++ +++ __m256 vScalar = _mm256_set1_ps(SHRT_MAX); +++ __m256 invScalar = _mm256_set1_ps(1.0f / SHRT_MAX); +++ __m256i int1, int2; +++ __m128i short1, short2; +++ __m256 cplxValue1, cplxValue2, result; +++ __m256i idx = _mm256_set_epi32(0, 0, 0, 0, 5, 1, 4, 0); +++ +++ for (; number < eighthPoints; number++) { +++ +++ int1 = _mm256_load_si256((__m256i*)complexVectorPtr); +++ complexVectorPtr += 16; +++ short1 = _mm256_extracti128_si256(int1, 0); +++ short2 = _mm256_extracti128_si256(int1, 1); +++ +++ int1 = _mm256_cvtepi16_epi32(short1); +++ int2 = _mm256_cvtepi16_epi32(short2); +++ cplxValue1 = _mm256_cvtepi32_ps(int1); +++ cplxValue2 = _mm256_cvtepi32_ps(int2); +++ +++ cplxValue1 = _mm256_mul_ps(cplxValue1, invScalar); +++ cplxValue2 = _mm256_mul_ps(cplxValue2, invScalar); +++ +++ cplxValue1 = _mm256_mul_ps(cplxValue1, cplxValue1); // Square the values +++ cplxValue2 = _mm256_mul_ps(cplxValue2, cplxValue2); // Square the Values +++ +++ result = _mm256_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values +++ +++ result = _mm256_sqrt_ps(result); // Square root the values +++ +++ result = _mm256_mul_ps(result, vScalar); // Scale the results +++ +++ int1 = _mm256_cvtps_epi32(result); +++ int1 = _mm256_packs_epi32(int1, int1); +++ int1 = _mm256_permutevar8x32_epi32( +++ int1, idx); // permute to compensate for shuffling in hadd and packs +++ short1 = _mm256_extracti128_si256(int1, 0); +++ _mm_store_si128((__m128i*)magnitudeVectorPtr, short1); +++ magnitudeVectorPtr += 8; +++ } +++ +++ number = eighthPoints * 8; +++ magnitudeVectorPtr = &magnitudeVector[number]; +++ complexVectorPtr = (const int16_t*)&complexVector[number]; +++ for (; number < num_points; number++) { +++ const float val1Real = (float)(*complexVectorPtr++) / SHRT_MAX; +++ const float val1Imag = (float)(*complexVectorPtr++) / SHRT_MAX; +++ const float val1Result = +++ sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)) * SHRT_MAX; +++ *magnitudeVectorPtr++ = (int16_t)rintf(val1Result); +++ } ++ } ++ #endif /* LV_HAVE_AVX2 */ ++ ++ #ifdef LV_HAVE_SSE3 ++ #include ++ ++-static inline void ++-volk_16ic_magnitude_16i_a_sse3(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points) +++static inline void volk_16ic_magnitude_16i_a_sse3(int16_t* magnitudeVector, +++ const lv_16sc_t* complexVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int quarterPoints = num_points / 4; +++ unsigned int number = 0; +++ const unsigned int quarterPoints = num_points / 4; ++ ++- const int16_t* complexVectorPtr = (const int16_t*)complexVector; ++- int16_t* magnitudeVectorPtr = magnitudeVector; +++ const int16_t* complexVectorPtr = (const int16_t*)complexVector; +++ int16_t* magnitudeVectorPtr = magnitudeVector; ++ ++- __m128 vScalar = _mm_set_ps1(SHRT_MAX); ++- __m128 invScalar = _mm_set_ps1(1.0f/SHRT_MAX); +++ __m128 vScalar = _mm_set_ps1(SHRT_MAX); +++ __m128 invScalar = _mm_set_ps1(1.0f / SHRT_MAX); ++ ++- __m128 cplxValue1, cplxValue2, result; +++ __m128 cplxValue1, cplxValue2, result; ++ ++- __VOLK_ATTR_ALIGNED(16) float inputFloatBuffer[8]; ++- __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4]; +++ __VOLK_ATTR_ALIGNED(16) float inputFloatBuffer[8]; +++ __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4]; ++ ++- for(;number < quarterPoints; number++){ +++ for (; number < quarterPoints; number++) { ++ ++- 
inputFloatBuffer[0] = (float)(complexVectorPtr[0]); ++- inputFloatBuffer[1] = (float)(complexVectorPtr[1]); ++- inputFloatBuffer[2] = (float)(complexVectorPtr[2]); ++- inputFloatBuffer[3] = (float)(complexVectorPtr[3]); +++ inputFloatBuffer[0] = (float)(complexVectorPtr[0]); +++ inputFloatBuffer[1] = (float)(complexVectorPtr[1]); +++ inputFloatBuffer[2] = (float)(complexVectorPtr[2]); +++ inputFloatBuffer[3] = (float)(complexVectorPtr[3]); ++ ++- inputFloatBuffer[4] = (float)(complexVectorPtr[4]); ++- inputFloatBuffer[5] = (float)(complexVectorPtr[5]); ++- inputFloatBuffer[6] = (float)(complexVectorPtr[6]); ++- inputFloatBuffer[7] = (float)(complexVectorPtr[7]); +++ inputFloatBuffer[4] = (float)(complexVectorPtr[4]); +++ inputFloatBuffer[5] = (float)(complexVectorPtr[5]); +++ inputFloatBuffer[6] = (float)(complexVectorPtr[6]); +++ inputFloatBuffer[7] = (float)(complexVectorPtr[7]); ++ ++- cplxValue1 = _mm_load_ps(&inputFloatBuffer[0]); ++- cplxValue2 = _mm_load_ps(&inputFloatBuffer[4]); +++ cplxValue1 = _mm_load_ps(&inputFloatBuffer[0]); +++ cplxValue2 = _mm_load_ps(&inputFloatBuffer[4]); ++ ++- complexVectorPtr += 8; +++ complexVectorPtr += 8; ++ ++- cplxValue1 = _mm_mul_ps(cplxValue1, invScalar); ++- cplxValue2 = _mm_mul_ps(cplxValue2, invScalar); +++ cplxValue1 = _mm_mul_ps(cplxValue1, invScalar); +++ cplxValue2 = _mm_mul_ps(cplxValue2, invScalar); ++ ++- cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values ++- cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values +++ cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values +++ cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values ++ ++- result = _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values +++ result = _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values ++ ++- result = _mm_sqrt_ps(result); // Square root the values +++ result = _mm_sqrt_ps(result); // Square root the values ++ ++- result = _mm_mul_ps(result, vScalar); // Scale the results +++ result = _mm_mul_ps(result, vScalar); // Scale the results ++ ++- _mm_store_ps(outputFloatBuffer, result); ++- *magnitudeVectorPtr++ = (int16_t)rintf(outputFloatBuffer[0]); ++- *magnitudeVectorPtr++ = (int16_t)rintf(outputFloatBuffer[1]); ++- *magnitudeVectorPtr++ = (int16_t)rintf(outputFloatBuffer[2]); ++- *magnitudeVectorPtr++ = (int16_t)rintf(outputFloatBuffer[3]); ++- } +++ _mm_store_ps(outputFloatBuffer, result); +++ *magnitudeVectorPtr++ = (int16_t)rintf(outputFloatBuffer[0]); +++ *magnitudeVectorPtr++ = (int16_t)rintf(outputFloatBuffer[1]); +++ *magnitudeVectorPtr++ = (int16_t)rintf(outputFloatBuffer[2]); +++ *magnitudeVectorPtr++ = (int16_t)rintf(outputFloatBuffer[3]); +++ } ++ ++- number = quarterPoints * 4; ++- magnitudeVectorPtr = &magnitudeVector[number]; ++- complexVectorPtr = (const int16_t*)&complexVector[number]; ++- for(; number < num_points; number++){ ++- const float val1Real = (float)(*complexVectorPtr++) / SHRT_MAX; ++- const float val1Imag = (float)(*complexVectorPtr++) / SHRT_MAX; ++- const float val1Result = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)) * SHRT_MAX; ++- *magnitudeVectorPtr++ = (int16_t)rintf(val1Result); ++- } +++ number = quarterPoints * 4; +++ magnitudeVectorPtr = &magnitudeVector[number]; +++ complexVectorPtr = (const int16_t*)&complexVector[number]; +++ for (; number < num_points; number++) { +++ const float val1Real = (float)(*complexVectorPtr++) / SHRT_MAX; +++ const float val1Imag = (float)(*complexVectorPtr++) / SHRT_MAX; +++ const float val1Result = 
+++ sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)) * SHRT_MAX; +++ *magnitudeVectorPtr++ = (int16_t)rintf(val1Result); +++ } ++ } ++ #endif /* LV_HAVE_SSE3 */ ++ ++ #ifdef LV_HAVE_SSE ++ #include ++ ++-static inline void ++-volk_16ic_magnitude_16i_a_sse(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points) +++static inline void volk_16ic_magnitude_16i_a_sse(int16_t* magnitudeVector, +++ const lv_16sc_t* complexVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int quarterPoints = num_points / 4; +++ unsigned int number = 0; +++ const unsigned int quarterPoints = num_points / 4; ++ ++- const int16_t* complexVectorPtr = (const int16_t*)complexVector; ++- int16_t* magnitudeVectorPtr = magnitudeVector; +++ const int16_t* complexVectorPtr = (const int16_t*)complexVector; +++ int16_t* magnitudeVectorPtr = magnitudeVector; ++ ++- __m128 vScalar = _mm_set_ps1(SHRT_MAX); ++- __m128 invScalar = _mm_set_ps1(1.0f/SHRT_MAX); +++ __m128 vScalar = _mm_set_ps1(SHRT_MAX); +++ __m128 invScalar = _mm_set_ps1(1.0f / SHRT_MAX); ++ ++- __m128 cplxValue1, cplxValue2, iValue, qValue, result; +++ __m128 cplxValue1, cplxValue2, iValue, qValue, result; ++ ++- __VOLK_ATTR_ALIGNED(16) float inputFloatBuffer[4]; ++- __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4]; +++ __VOLK_ATTR_ALIGNED(16) float inputFloatBuffer[4]; +++ __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4]; ++ ++- for(;number < quarterPoints; number++){ +++ for (; number < quarterPoints; number++) { ++ ++- inputFloatBuffer[0] = (float)(complexVectorPtr[0]); ++- inputFloatBuffer[1] = (float)(complexVectorPtr[1]); ++- inputFloatBuffer[2] = (float)(complexVectorPtr[2]); ++- inputFloatBuffer[3] = (float)(complexVectorPtr[3]); +++ inputFloatBuffer[0] = (float)(complexVectorPtr[0]); +++ inputFloatBuffer[1] = (float)(complexVectorPtr[1]); +++ inputFloatBuffer[2] = (float)(complexVectorPtr[2]); +++ inputFloatBuffer[3] = (float)(complexVectorPtr[3]); ++ ++- cplxValue1 = _mm_load_ps(inputFloatBuffer); ++- complexVectorPtr += 4; +++ cplxValue1 = _mm_load_ps(inputFloatBuffer); +++ complexVectorPtr += 4; ++ ++- inputFloatBuffer[0] = (float)(complexVectorPtr[0]); ++- inputFloatBuffer[1] = (float)(complexVectorPtr[1]); ++- inputFloatBuffer[2] = (float)(complexVectorPtr[2]); ++- inputFloatBuffer[3] = (float)(complexVectorPtr[3]); +++ inputFloatBuffer[0] = (float)(complexVectorPtr[0]); +++ inputFloatBuffer[1] = (float)(complexVectorPtr[1]); +++ inputFloatBuffer[2] = (float)(complexVectorPtr[2]); +++ inputFloatBuffer[3] = (float)(complexVectorPtr[3]); ++ ++- cplxValue2 = _mm_load_ps(inputFloatBuffer); ++- complexVectorPtr += 4; +++ cplxValue2 = _mm_load_ps(inputFloatBuffer); +++ complexVectorPtr += 4; ++ ++- cplxValue1 = _mm_mul_ps(cplxValue1, invScalar); ++- cplxValue2 = _mm_mul_ps(cplxValue2, invScalar); +++ cplxValue1 = _mm_mul_ps(cplxValue1, invScalar); +++ cplxValue2 = _mm_mul_ps(cplxValue2, invScalar); ++ ++- // Arrange in i1i2i3i4 format ++- iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); ++- // Arrange in q1q2q3q4 format ++- qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1)); +++ // Arrange in i1i2i3i4 format +++ iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2, 0, 2, 0)); +++ // Arrange in q1q2q3q4 format +++ qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3, 1, 3, 1)); ++ ++- iValue = _mm_mul_ps(iValue, iValue); // Square the I values ++- qValue = _mm_mul_ps(qValue, qValue); // Square the Q Values +++ iValue = 
_mm_mul_ps(iValue, iValue); // Square the I values +++ qValue = _mm_mul_ps(qValue, qValue); // Square the Q Values ++ ++- result = _mm_add_ps(iValue, qValue); // Add the I2 and Q2 values +++ result = _mm_add_ps(iValue, qValue); // Add the I2 and Q2 values ++ ++- result = _mm_sqrt_ps(result); // Square root the values +++ result = _mm_sqrt_ps(result); // Square root the values ++ ++- result = _mm_mul_ps(result, vScalar); // Scale the results +++ result = _mm_mul_ps(result, vScalar); // Scale the results ++ ++- _mm_store_ps(outputFloatBuffer, result); ++- *magnitudeVectorPtr++ = (int16_t)rintf(outputFloatBuffer[0]); ++- *magnitudeVectorPtr++ = (int16_t)rintf(outputFloatBuffer[1]); ++- *magnitudeVectorPtr++ = (int16_t)rintf(outputFloatBuffer[2]); ++- *magnitudeVectorPtr++ = (int16_t)rintf(outputFloatBuffer[3]); ++- } +++ _mm_store_ps(outputFloatBuffer, result); +++ *magnitudeVectorPtr++ = (int16_t)rintf(outputFloatBuffer[0]); +++ *magnitudeVectorPtr++ = (int16_t)rintf(outputFloatBuffer[1]); +++ *magnitudeVectorPtr++ = (int16_t)rintf(outputFloatBuffer[2]); +++ *magnitudeVectorPtr++ = (int16_t)rintf(outputFloatBuffer[3]); +++ } ++ ++- number = quarterPoints * 4; ++- magnitudeVectorPtr = &magnitudeVector[number]; ++- complexVectorPtr = (const int16_t*)&complexVector[number]; ++- for(; number < num_points; number++){ ++- const float val1Real = (float)(*complexVectorPtr++) / SHRT_MAX; ++- const float val1Imag = (float)(*complexVectorPtr++) / SHRT_MAX; ++- const float val1Result = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)) * SHRT_MAX; ++- *magnitudeVectorPtr++ = (int16_t)rintf(val1Result); ++- } +++ number = quarterPoints * 4; +++ magnitudeVectorPtr = &magnitudeVector[number]; +++ complexVectorPtr = (const int16_t*)&complexVector[number]; +++ for (; number < num_points; number++) { +++ const float val1Real = (float)(*complexVectorPtr++) / SHRT_MAX; +++ const float val1Imag = (float)(*complexVectorPtr++) / SHRT_MAX; +++ const float val1Result = +++ sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)) * SHRT_MAX; +++ *magnitudeVectorPtr++ = (int16_t)rintf(val1Result); +++ } ++ } ++ #endif /* LV_HAVE_SSE */ ++ ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void ++-volk_16ic_magnitude_16i_generic(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points) +++static inline void volk_16ic_magnitude_16i_generic(int16_t* magnitudeVector, +++ const lv_16sc_t* complexVector, +++ unsigned int num_points) ++ { ++- const int16_t* complexVectorPtr = (const int16_t*)complexVector; ++- int16_t* magnitudeVectorPtr = magnitudeVector; ++- unsigned int number = 0; ++- const float scalar = SHRT_MAX; ++- for(number = 0; number < num_points; number++){ ++- float real = ((float)(*complexVectorPtr++)) / scalar; ++- float imag = ((float)(*complexVectorPtr++)) / scalar; ++- *magnitudeVectorPtr++ = (int16_t)rintf(sqrtf((real*real) + (imag*imag)) * scalar); ++- } +++ const int16_t* complexVectorPtr = (const int16_t*)complexVector; +++ int16_t* magnitudeVectorPtr = magnitudeVector; +++ unsigned int number = 0; +++ const float scalar = SHRT_MAX; +++ for (number = 0; number < num_points; number++) { +++ float real = ((float)(*complexVectorPtr++)) / scalar; +++ float imag = ((float)(*complexVectorPtr++)) / scalar; +++ *magnitudeVectorPtr++ = +++ (int16_t)rintf(sqrtf((real * real) + (imag * imag)) * scalar); +++ } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++ #ifdef LV_HAVE_ORC_DISABLED ++-extern void ++-volk_16ic_magnitude_16i_a_orc_impl(int16_t* magnitudeVector, const lv_16sc_t* complexVector, float 
scalar, unsigned int num_points); ++- ++-static inline void ++-volk_16ic_magnitude_16i_u_orc(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points) +++extern void volk_16ic_magnitude_16i_a_orc_impl(int16_t* magnitudeVector, +++ const lv_16sc_t* complexVector, +++ float scalar, +++ unsigned int num_points); +++ +++static inline void volk_16ic_magnitude_16i_u_orc(int16_t* magnitudeVector, +++ const lv_16sc_t* complexVector, +++ unsigned int num_points) ++ { ++- volk_16ic_magnitude_16i_a_orc_impl(magnitudeVector, complexVector, SHRT_MAX, num_points); +++ volk_16ic_magnitude_16i_a_orc_impl( +++ magnitudeVector, complexVector, SHRT_MAX, num_points); ++ } ++ #endif /* LV_HAVE_ORC */ ++ ++@@ -300,71 +313,74 @@ volk_16ic_magnitude_16i_u_orc(int16_t* magnitudeVector, const lv_16sc_t* complex ++ #ifndef INCLUDED_volk_16ic_magnitude_16i_u_H ++ #define INCLUDED_volk_16ic_magnitude_16i_u_H ++ ++-#include ++ #include ++-#include ++ #include +++#include +++#include ++ ++ #ifdef LV_HAVE_AVX2 ++ #include ++ ++-static inline void ++-volk_16ic_magnitude_16i_u_avx2(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points) +++static inline void volk_16ic_magnitude_16i_u_avx2(int16_t* magnitudeVector, +++ const lv_16sc_t* complexVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int eighthPoints = num_points / 8; ++- ++- const int16_t* complexVectorPtr = (const int16_t*)complexVector; ++- int16_t* magnitudeVectorPtr = magnitudeVector; ++- ++- __m256 vScalar = _mm256_set1_ps(SHRT_MAX); ++- __m256 invScalar = _mm256_set1_ps(1.0f/SHRT_MAX); ++- __m256i int1, int2; ++- __m128i short1, short2; ++- __m256 cplxValue1, cplxValue2, result; ++- __m256i idx = _mm256_set_epi32(0,0,0,0,5,1,4,0); ++- ++- for(;number < eighthPoints; number++){ ++- ++- int1 = _mm256_loadu_si256((__m256i*)complexVectorPtr); ++- complexVectorPtr += 16; ++- short1 = _mm256_extracti128_si256(int1,0); ++- short2 = _mm256_extracti128_si256(int1,1); ++- ++- int1 = _mm256_cvtepi16_epi32(short1); ++- int2 = _mm256_cvtepi16_epi32(short2); ++- cplxValue1 = _mm256_cvtepi32_ps(int1); ++- cplxValue2 = _mm256_cvtepi32_ps(int2); ++- ++- cplxValue1 = _mm256_mul_ps(cplxValue1, invScalar); ++- cplxValue2 = _mm256_mul_ps(cplxValue2, invScalar); ++- ++- cplxValue1 = _mm256_mul_ps(cplxValue1, cplxValue1); // Square the values ++- cplxValue2 = _mm256_mul_ps(cplxValue2, cplxValue2); // Square the Values ++- ++- result = _mm256_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values ++- ++- result = _mm256_sqrt_ps(result); // Square root the values ++- ++- result = _mm256_mul_ps(result, vScalar); // Scale the results ++- ++- int1 = _mm256_cvtps_epi32(result); ++- int1 = _mm256_packs_epi32(int1, int1); ++- int1 = _mm256_permutevar8x32_epi32(int1, idx); //permute to compensate for shuffling in hadd and packs ++- short1 = _mm256_extracti128_si256(int1, 0); ++- _mm_storeu_si128((__m128i*)magnitudeVectorPtr,short1); ++- magnitudeVectorPtr += 8; ++- } ++- ++- number = eighthPoints * 8; ++- magnitudeVectorPtr = &magnitudeVector[number]; ++- complexVectorPtr = (const int16_t*)&complexVector[number]; ++- for(; number < num_points; number++){ ++- const float val1Real = (float)(*complexVectorPtr++) / SHRT_MAX; ++- const float val1Imag = (float)(*complexVectorPtr++) / SHRT_MAX; ++- const float val1Result = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)) * SHRT_MAX; ++- *magnitudeVectorPtr++ = (int16_t)rintf(val1Result); ++- } +++ unsigned int number = 0; +++ const unsigned int 
eighthPoints = num_points / 8; +++ +++ const int16_t* complexVectorPtr = (const int16_t*)complexVector; +++ int16_t* magnitudeVectorPtr = magnitudeVector; +++ +++ __m256 vScalar = _mm256_set1_ps(SHRT_MAX); +++ __m256 invScalar = _mm256_set1_ps(1.0f / SHRT_MAX); +++ __m256i int1, int2; +++ __m128i short1, short2; +++ __m256 cplxValue1, cplxValue2, result; +++ __m256i idx = _mm256_set_epi32(0, 0, 0, 0, 5, 1, 4, 0); +++ +++ for (; number < eighthPoints; number++) { +++ +++ int1 = _mm256_loadu_si256((__m256i*)complexVectorPtr); +++ complexVectorPtr += 16; +++ short1 = _mm256_extracti128_si256(int1, 0); +++ short2 = _mm256_extracti128_si256(int1, 1); +++ +++ int1 = _mm256_cvtepi16_epi32(short1); +++ int2 = _mm256_cvtepi16_epi32(short2); +++ cplxValue1 = _mm256_cvtepi32_ps(int1); +++ cplxValue2 = _mm256_cvtepi32_ps(int2); +++ +++ cplxValue1 = _mm256_mul_ps(cplxValue1, invScalar); +++ cplxValue2 = _mm256_mul_ps(cplxValue2, invScalar); +++ +++ cplxValue1 = _mm256_mul_ps(cplxValue1, cplxValue1); // Square the values +++ cplxValue2 = _mm256_mul_ps(cplxValue2, cplxValue2); // Square the Values +++ +++ result = _mm256_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values +++ +++ result = _mm256_sqrt_ps(result); // Square root the values +++ +++ result = _mm256_mul_ps(result, vScalar); // Scale the results +++ +++ int1 = _mm256_cvtps_epi32(result); +++ int1 = _mm256_packs_epi32(int1, int1); +++ int1 = _mm256_permutevar8x32_epi32( +++ int1, idx); // permute to compensate for shuffling in hadd and packs +++ short1 = _mm256_extracti128_si256(int1, 0); +++ _mm_storeu_si128((__m128i*)magnitudeVectorPtr, short1); +++ magnitudeVectorPtr += 8; +++ } +++ +++ number = eighthPoints * 8; +++ magnitudeVectorPtr = &magnitudeVector[number]; +++ complexVectorPtr = (const int16_t*)&complexVector[number]; +++ for (; number < num_points; number++) { +++ const float val1Real = (float)(*complexVectorPtr++) / SHRT_MAX; +++ const float val1Imag = (float)(*complexVectorPtr++) / SHRT_MAX; +++ const float val1Result = +++ sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)) * SHRT_MAX; +++ *magnitudeVectorPtr++ = (int16_t)rintf(val1Result); +++ } ++ } ++ #endif /* LV_HAVE_AVX2 */ ++ ++@@ -372,24 +388,25 @@ volk_16ic_magnitude_16i_u_avx2(int16_t* magnitudeVector, const lv_16sc_t* comple ++ #include ++ #include ++ ++-static inline void ++-volk_16ic_magnitude_16i_neonv7(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points) +++static inline void volk_16ic_magnitude_16i_neonv7(int16_t* magnitudeVector, +++ const lv_16sc_t* complexVector, +++ unsigned int num_points) ++ { ++ unsigned int number = 0; ++ unsigned int quarter_points = num_points / 4; ++- +++ ++ const float scalar = SHRT_MAX; ++ const float inv_scalar = 1.0f / scalar; ++- +++ ++ int16_t* magnitudeVectorPtr = magnitudeVector; ++ const lv_16sc_t* complexVectorPtr = complexVector; ++- +++ ++ float32x4_t mag_vec; ++ float32x4x2_t c_vec; ++- ++- for(number = 0; number < quarter_points; number++) { +++ +++ for (number = 0; number < quarter_points; number++) { ++ const int16x4x2_t c16_vec = vld2_s16((int16_t*)complexVectorPtr); ++- __VOLK_PREFETCH(complexVectorPtr+4); +++ __VOLK_PREFETCH(complexVectorPtr + 4); ++ c_vec.val[0] = vcvtq_f32_s32(vmovl_s16(c16_vec.val[0])); ++ c_vec.val[1] = vcvtq_f32_s32(vmovl_s16(c16_vec.val[1])); ++ // Scale to close to 0-1 ++@@ -406,15 +423,16 @@ volk_16ic_magnitude_16i_neonv7(int16_t* magnitudeVector, const lv_16sc_t* comple ++ const int16x4_t mag16_vec = vmovn_s32(vcvtq_s32_f32(mag_vec)); ++ 
vst1_s16(magnitudeVectorPtr, mag16_vec); ++ // Advance pointers ++- magnitudeVectorPtr+=4; ++- complexVectorPtr+=4; +++ magnitudeVectorPtr += 4; +++ complexVectorPtr += 4; ++ } ++- +++ ++ // Deal with the rest ++- for(number = quarter_points * 4; number < num_points; number++) { +++ for (number = quarter_points * 4; number < num_points; number++) { ++ const float real = lv_creal(*complexVectorPtr) * inv_scalar; ++ const float imag = lv_cimag(*complexVectorPtr) * inv_scalar; ++- *magnitudeVectorPtr = (int16_t)rintf(sqrtf((real*real) + (imag*imag)) * scalar); +++ *magnitudeVectorPtr = +++ (int16_t)rintf(sqrtf((real * real) + (imag * imag)) * scalar); ++ complexVectorPtr++; ++ magnitudeVectorPtr++; ++ } ++diff --git a/kernels/volk/volk_16ic_s32f_deinterleave_32f_x2.h b/kernels/volk/volk_16ic_s32f_deinterleave_32f_x2.h ++index 50d9341..7425ec6 100644 ++--- a/kernels/volk/volk_16ic_s32f_deinterleave_32f_x2.h +++++ b/kernels/volk/volk_16ic_s32f_deinterleave_32f_x2.h ++@@ -30,8 +30,8 @@ ++ * ++ * Dispatcher Prototype ++ * \code ++- * void volk_16ic_s32f_deinterleave_32f_x2(float* iBuffer, float* qBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ ++- * \endcode +++ * void volk_16ic_s32f_deinterleave_32f_x2(float* iBuffer, float* qBuffer, const +++ * lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ \endcode ++ * ++ * \b Inputs ++ * \li complexVector: The complex input vector of 16-bit shorts. ++@@ -56,197 +56,214 @@ ++ #ifndef INCLUDED_volk_16ic_s32f_deinterleave_32f_x2_a_H ++ #define INCLUDED_volk_16ic_s32f_deinterleave_32f_x2_a_H ++ ++-#include ++ #include ++ #include +++#include ++ ++ #ifdef LV_HAVE_AVX2 ++ #include ++ ++-static inline ++-void volk_16ic_s32f_deinterleave_32f_x2_a_avx2(float* iBuffer, float* qBuffer, const lv_16sc_t* complexVector, ++- const float scalar, unsigned int num_points) +++static inline void +++volk_16ic_s32f_deinterleave_32f_x2_a_avx2(float* iBuffer, +++ float* qBuffer, +++ const lv_16sc_t* complexVector, +++ const float scalar, +++ unsigned int num_points) ++ { ++- float* iBufferPtr = iBuffer; ++- float* qBufferPtr = qBuffer; ++- ++- uint64_t number = 0; ++- const uint64_t eighthPoints = num_points / 8; ++- __m256 cplxValue1, cplxValue2, iValue, qValue; ++- __m256i cplxValueA, cplxValueB; ++- __m128i cplxValue128; ++- ++- __m256 invScalar = _mm256_set1_ps(1.0/scalar); ++- int16_t* complexVectorPtr = (int16_t*)complexVector; ++- __m256i idx = _mm256_set_epi32(7,6,3,2,5,4,1,0); ++- ++- for(;number < eighthPoints; number++){ ++- ++- cplxValueA = _mm256_load_si256((__m256i*) complexVectorPtr); ++- complexVectorPtr += 16; ++- ++- //cvt ++- cplxValue128 = _mm256_extracti128_si256(cplxValueA, 0); ++- cplxValueB = _mm256_cvtepi16_epi32(cplxValue128); ++- cplxValue1 = _mm256_cvtepi32_ps(cplxValueB); ++- cplxValue128 = _mm256_extracti128_si256(cplxValueA, 1); ++- cplxValueB = _mm256_cvtepi16_epi32(cplxValue128); ++- cplxValue2 = _mm256_cvtepi32_ps(cplxValueB); ++- ++- cplxValue1 = _mm256_mul_ps(cplxValue1, invScalar); ++- cplxValue2 = _mm256_mul_ps(cplxValue2, invScalar); ++- ++- // Arrange in i1i2i3i4 format ++- iValue = _mm256_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); ++- iValue = _mm256_permutevar8x32_ps(iValue,idx); ++- // Arrange in q1q2q3q4 format ++- qValue = _mm256_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1)); ++- qValue = _mm256_permutevar8x32_ps(qValue,idx); ++- ++- _mm256_store_ps(iBufferPtr, iValue); ++- _mm256_store_ps(qBufferPtr, qValue); ++- ++- iBufferPtr += 8; ++- qBufferPtr 
+= 8; ++- } ++- ++- number = eighthPoints * 8; ++- complexVectorPtr = (int16_t*)&complexVector[number]; ++- for(; number < num_points; number++){ ++- *iBufferPtr++ = (float)(*complexVectorPtr++) / scalar; ++- *qBufferPtr++ = (float)(*complexVectorPtr++) / scalar; ++- } +++ float* iBufferPtr = iBuffer; +++ float* qBufferPtr = qBuffer; +++ +++ uint64_t number = 0; +++ const uint64_t eighthPoints = num_points / 8; +++ __m256 cplxValue1, cplxValue2, iValue, qValue; +++ __m256i cplxValueA, cplxValueB; +++ __m128i cplxValue128; +++ +++ __m256 invScalar = _mm256_set1_ps(1.0 / scalar); +++ int16_t* complexVectorPtr = (int16_t*)complexVector; +++ __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0); +++ +++ for (; number < eighthPoints; number++) { +++ +++ cplxValueA = _mm256_load_si256((__m256i*)complexVectorPtr); +++ complexVectorPtr += 16; +++ +++ // cvt +++ cplxValue128 = _mm256_extracti128_si256(cplxValueA, 0); +++ cplxValueB = _mm256_cvtepi16_epi32(cplxValue128); +++ cplxValue1 = _mm256_cvtepi32_ps(cplxValueB); +++ cplxValue128 = _mm256_extracti128_si256(cplxValueA, 1); +++ cplxValueB = _mm256_cvtepi16_epi32(cplxValue128); +++ cplxValue2 = _mm256_cvtepi32_ps(cplxValueB); +++ +++ cplxValue1 = _mm256_mul_ps(cplxValue1, invScalar); +++ cplxValue2 = _mm256_mul_ps(cplxValue2, invScalar); +++ +++ // Arrange in i1i2i3i4 format +++ iValue = _mm256_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2, 0, 2, 0)); +++ iValue = _mm256_permutevar8x32_ps(iValue, idx); +++ // Arrange in q1q2q3q4 format +++ qValue = _mm256_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3, 1, 3, 1)); +++ qValue = _mm256_permutevar8x32_ps(qValue, idx); +++ +++ _mm256_store_ps(iBufferPtr, iValue); +++ _mm256_store_ps(qBufferPtr, qValue); +++ +++ iBufferPtr += 8; +++ qBufferPtr += 8; +++ } +++ +++ number = eighthPoints * 8; +++ complexVectorPtr = (int16_t*)&complexVector[number]; +++ for (; number < num_points; number++) { +++ *iBufferPtr++ = (float)(*complexVectorPtr++) / scalar; +++ *qBufferPtr++ = (float)(*complexVectorPtr++) / scalar; +++ } ++ } ++ #endif /* LV_HAVE_AVX2 */ ++ ++ #ifdef LV_HAVE_SSE ++ #include ++ ++-static inline ++-void volk_16ic_s32f_deinterleave_32f_x2_a_sse(float* iBuffer, float* qBuffer, const lv_16sc_t* complexVector, ++- const float scalar, unsigned int num_points) +++static inline void +++volk_16ic_s32f_deinterleave_32f_x2_a_sse(float* iBuffer, +++ float* qBuffer, +++ const lv_16sc_t* complexVector, +++ const float scalar, +++ unsigned int num_points) ++ { ++- float* iBufferPtr = iBuffer; ++- float* qBufferPtr = qBuffer; +++ float* iBufferPtr = iBuffer; +++ float* qBufferPtr = qBuffer; ++ ++- uint64_t number = 0; ++- const uint64_t quarterPoints = num_points / 4; ++- __m128 cplxValue1, cplxValue2, iValue, qValue; +++ uint64_t number = 0; +++ const uint64_t quarterPoints = num_points / 4; +++ __m128 cplxValue1, cplxValue2, iValue, qValue; ++ ++- __m128 invScalar = _mm_set_ps1(1.0/scalar); ++- int16_t* complexVectorPtr = (int16_t*)complexVector; +++ __m128 invScalar = _mm_set_ps1(1.0 / scalar); +++ int16_t* complexVectorPtr = (int16_t*)complexVector; ++ ++- __VOLK_ATTR_ALIGNED(16) float floatBuffer[8]; +++ __VOLK_ATTR_ALIGNED(16) float floatBuffer[8]; ++ ++- for(;number < quarterPoints; number++){ +++ for (; number < quarterPoints; number++) { ++ ++- floatBuffer[0] = (float)(complexVectorPtr[0]); ++- floatBuffer[1] = (float)(complexVectorPtr[1]); ++- floatBuffer[2] = (float)(complexVectorPtr[2]); ++- floatBuffer[3] = (float)(complexVectorPtr[3]); +++ floatBuffer[0] = (float)(complexVectorPtr[0]); +++ 
floatBuffer[1] = (float)(complexVectorPtr[1]); +++ floatBuffer[2] = (float)(complexVectorPtr[2]); +++ floatBuffer[3] = (float)(complexVectorPtr[3]); ++ ++- floatBuffer[4] = (float)(complexVectorPtr[4]); ++- floatBuffer[5] = (float)(complexVectorPtr[5]); ++- floatBuffer[6] = (float)(complexVectorPtr[6]); ++- floatBuffer[7] = (float)(complexVectorPtr[7]); +++ floatBuffer[4] = (float)(complexVectorPtr[4]); +++ floatBuffer[5] = (float)(complexVectorPtr[5]); +++ floatBuffer[6] = (float)(complexVectorPtr[6]); +++ floatBuffer[7] = (float)(complexVectorPtr[7]); ++ ++- cplxValue1 = _mm_load_ps(&floatBuffer[0]); ++- cplxValue2 = _mm_load_ps(&floatBuffer[4]); +++ cplxValue1 = _mm_load_ps(&floatBuffer[0]); +++ cplxValue2 = _mm_load_ps(&floatBuffer[4]); ++ ++- complexVectorPtr += 8; +++ complexVectorPtr += 8; ++ ++- cplxValue1 = _mm_mul_ps(cplxValue1, invScalar); ++- cplxValue2 = _mm_mul_ps(cplxValue2, invScalar); +++ cplxValue1 = _mm_mul_ps(cplxValue1, invScalar); +++ cplxValue2 = _mm_mul_ps(cplxValue2, invScalar); ++ ++- // Arrange in i1i2i3i4 format ++- iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); ++- // Arrange in q1q2q3q4 format ++- qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1)); +++ // Arrange in i1i2i3i4 format +++ iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2, 0, 2, 0)); +++ // Arrange in q1q2q3q4 format +++ qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3, 1, 3, 1)); ++ ++- _mm_store_ps(iBufferPtr, iValue); ++- _mm_store_ps(qBufferPtr, qValue); +++ _mm_store_ps(iBufferPtr, iValue); +++ _mm_store_ps(qBufferPtr, qValue); ++ ++- iBufferPtr += 4; ++- qBufferPtr += 4; ++- } +++ iBufferPtr += 4; +++ qBufferPtr += 4; +++ } ++ ++- number = quarterPoints * 4; ++- complexVectorPtr = (int16_t*)&complexVector[number]; ++- for(; number < num_points; number++){ ++- *iBufferPtr++ = (float)(*complexVectorPtr++) / scalar; ++- *qBufferPtr++ = (float)(*complexVectorPtr++) / scalar; ++- } +++ number = quarterPoints * 4; +++ complexVectorPtr = (int16_t*)&complexVector[number]; +++ for (; number < num_points; number++) { +++ *iBufferPtr++ = (float)(*complexVectorPtr++) / scalar; +++ *qBufferPtr++ = (float)(*complexVectorPtr++) / scalar; +++ } ++ } ++ #endif /* LV_HAVE_SSE */ ++ ++ #ifdef LV_HAVE_GENERIC ++ ++ static inline void ++-volk_16ic_s32f_deinterleave_32f_x2_generic(float* iBuffer, float* qBuffer, const lv_16sc_t* complexVector, ++- const float scalar, unsigned int num_points) +++volk_16ic_s32f_deinterleave_32f_x2_generic(float* iBuffer, +++ float* qBuffer, +++ const lv_16sc_t* complexVector, +++ const float scalar, +++ unsigned int num_points) ++ { ++- const int16_t* complexVectorPtr = (const int16_t*)complexVector; ++- float* iBufferPtr = iBuffer; ++- float* qBufferPtr = qBuffer; ++- unsigned int number; ++- for(number = 0; number < num_points; number++){ ++- *iBufferPtr++ = (float)(*complexVectorPtr++) / scalar; ++- *qBufferPtr++ = (float)(*complexVectorPtr++) / scalar; ++- } +++ const int16_t* complexVectorPtr = (const int16_t*)complexVector; +++ float* iBufferPtr = iBuffer; +++ float* qBufferPtr = qBuffer; +++ unsigned int number; +++ for (number = 0; number < num_points; number++) { +++ *iBufferPtr++ = (float)(*complexVectorPtr++) / scalar; +++ *qBufferPtr++ = (float)(*complexVectorPtr++) / scalar; +++ } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++ #ifdef LV_HAVE_NEON ++ #include ++-static inline void ++-volk_16ic_s32f_deinterleave_32f_x2_neon(float* iBuffer, float* qBuffer, const lv_16sc_t* complexVector, ++- const float 
scalar, unsigned int num_points) +++static inline void volk_16ic_s32f_deinterleave_32f_x2_neon(float* iBuffer, +++ float* qBuffer, +++ const lv_16sc_t* complexVector, +++ const float scalar, +++ unsigned int num_points) ++ { ++- const int16_t* complexVectorPtr = (const int16_t*)complexVector; ++- float* iBufferPtr = iBuffer; ++- float* qBufferPtr = qBuffer; ++- unsigned int eighth_points = num_points / 4; ++- unsigned int number; ++- float iScalar = 1.f/scalar; ++- float32x4_t invScalar; ++- invScalar = vld1q_dup_f32(&iScalar); ++- ++- int16x4x2_t complexInput_s16; ++- int32x4x2_t complexInput_s32; ++- float32x4x2_t complexFloat; ++- ++- for(number = 0; number < eighth_points; number++){ ++- complexInput_s16 = vld2_s16(complexVectorPtr); ++- complexInput_s32.val[0] = vmovl_s16(complexInput_s16.val[0]); ++- complexInput_s32.val[1] = vmovl_s16(complexInput_s16.val[1]); ++- complexFloat.val[0] = vcvtq_f32_s32(complexInput_s32.val[0]); ++- complexFloat.val[1] = vcvtq_f32_s32(complexInput_s32.val[1]); ++- complexFloat.val[0] = vmulq_f32(complexFloat.val[0], invScalar); ++- complexFloat.val[1] = vmulq_f32(complexFloat.val[1], invScalar); ++- vst1q_f32(iBufferPtr, complexFloat.val[0]); ++- vst1q_f32(qBufferPtr, complexFloat.val[1]); ++- complexVectorPtr += 8; ++- iBufferPtr += 4; ++- qBufferPtr += 4; ++- } ++- ++- for(number = eighth_points*4; number < num_points; number++){ ++- *iBufferPtr++ = (float)(*complexVectorPtr++) / scalar; ++- *qBufferPtr++ = (float)(*complexVectorPtr++) / scalar; ++- } +++ const int16_t* complexVectorPtr = (const int16_t*)complexVector; +++ float* iBufferPtr = iBuffer; +++ float* qBufferPtr = qBuffer; +++ unsigned int eighth_points = num_points / 4; +++ unsigned int number; +++ float iScalar = 1.f / scalar; +++ float32x4_t invScalar; +++ invScalar = vld1q_dup_f32(&iScalar); +++ +++ int16x4x2_t complexInput_s16; +++ int32x4x2_t complexInput_s32; +++ float32x4x2_t complexFloat; +++ +++ for (number = 0; number < eighth_points; number++) { +++ complexInput_s16 = vld2_s16(complexVectorPtr); +++ complexInput_s32.val[0] = vmovl_s16(complexInput_s16.val[0]); +++ complexInput_s32.val[1] = vmovl_s16(complexInput_s16.val[1]); +++ complexFloat.val[0] = vcvtq_f32_s32(complexInput_s32.val[0]); +++ complexFloat.val[1] = vcvtq_f32_s32(complexInput_s32.val[1]); +++ complexFloat.val[0] = vmulq_f32(complexFloat.val[0], invScalar); +++ complexFloat.val[1] = vmulq_f32(complexFloat.val[1], invScalar); +++ vst1q_f32(iBufferPtr, complexFloat.val[0]); +++ vst1q_f32(qBufferPtr, complexFloat.val[1]); +++ complexVectorPtr += 8; +++ iBufferPtr += 4; +++ qBufferPtr += 4; +++ } +++ +++ for (number = eighth_points * 4; number < num_points; number++) { +++ *iBufferPtr++ = (float)(*complexVectorPtr++) / scalar; +++ *qBufferPtr++ = (float)(*complexVectorPtr++) / scalar; +++ } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++ #ifdef LV_HAVE_ORC ++-extern void ++-volk_16ic_s32f_deinterleave_32f_x2_a_orc_impl(float* iBuffer, float* qBuffer, const lv_16sc_t* complexVector, ++- const float scalar, unsigned int num_points); +++extern void volk_16ic_s32f_deinterleave_32f_x2_a_orc_impl(float* iBuffer, +++ float* qBuffer, +++ const lv_16sc_t* complexVector, +++ const float scalar, +++ unsigned int num_points); ++ ++ static inline void ++-volk_16ic_s32f_deinterleave_32f_x2_u_orc(float* iBuffer, float* qBuffer, const lv_16sc_t* complexVector, ++- const float scalar, unsigned int num_points) +++volk_16ic_s32f_deinterleave_32f_x2_u_orc(float* iBuffer, +++ float* qBuffer, +++ const lv_16sc_t* complexVector, +++ const float 
scalar, +++ unsigned int num_points) ++ { ++- volk_16ic_s32f_deinterleave_32f_x2_a_orc_impl(iBuffer, qBuffer, complexVector, scalar, num_points); +++ volk_16ic_s32f_deinterleave_32f_x2_a_orc_impl( +++ iBuffer, qBuffer, complexVector, scalar, num_points); ++ } ++ #endif /* LV_HAVE_ORC */ ++ ++@@ -257,66 +274,69 @@ volk_16ic_s32f_deinterleave_32f_x2_u_orc(float* iBuffer, float* qBuffer, const l ++ #ifndef INCLUDED_volk_16ic_s32f_deinterleave_32f_x2_u_H ++ #define INCLUDED_volk_16ic_s32f_deinterleave_32f_x2_u_H ++ ++-#include ++ #include ++ #include +++#include ++ ++ #ifdef LV_HAVE_AVX2 ++ #include ++ ++-static inline ++-void volk_16ic_s32f_deinterleave_32f_x2_u_avx2(float* iBuffer, float* qBuffer, const lv_16sc_t* complexVector, ++- const float scalar, unsigned int num_points) +++static inline void +++volk_16ic_s32f_deinterleave_32f_x2_u_avx2(float* iBuffer, +++ float* qBuffer, +++ const lv_16sc_t* complexVector, +++ const float scalar, +++ unsigned int num_points) ++ { ++- float* iBufferPtr = iBuffer; ++- float* qBufferPtr = qBuffer; ++- ++- uint64_t number = 0; ++- const uint64_t eighthPoints = num_points / 8; ++- __m256 cplxValue1, cplxValue2, iValue, qValue; ++- __m256i cplxValueA, cplxValueB; ++- __m128i cplxValue128; ++- ++- __m256 invScalar = _mm256_set1_ps(1.0/scalar); ++- int16_t* complexVectorPtr = (int16_t*)complexVector; ++- __m256i idx = _mm256_set_epi32(7,6,3,2,5,4,1,0); ++- ++- for(;number < eighthPoints; number++){ ++- ++- cplxValueA = _mm256_loadu_si256((__m256i*) complexVectorPtr); ++- complexVectorPtr += 16; ++- ++- //cvt ++- cplxValue128 = _mm256_extracti128_si256(cplxValueA, 0); ++- cplxValueB = _mm256_cvtepi16_epi32(cplxValue128); ++- cplxValue1 = _mm256_cvtepi32_ps(cplxValueB); ++- cplxValue128 = _mm256_extracti128_si256(cplxValueA, 1); ++- cplxValueB = _mm256_cvtepi16_epi32(cplxValue128); ++- cplxValue2 = _mm256_cvtepi32_ps(cplxValueB); ++- ++- cplxValue1 = _mm256_mul_ps(cplxValue1, invScalar); ++- cplxValue2 = _mm256_mul_ps(cplxValue2, invScalar); ++- ++- // Arrange in i1i2i3i4 format ++- iValue = _mm256_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); ++- iValue = _mm256_permutevar8x32_ps(iValue,idx); ++- // Arrange in q1q2q3q4 format ++- qValue = _mm256_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1)); ++- qValue = _mm256_permutevar8x32_ps(qValue,idx); ++- ++- _mm256_storeu_ps(iBufferPtr, iValue); ++- _mm256_storeu_ps(qBufferPtr, qValue); ++- ++- iBufferPtr += 8; ++- qBufferPtr += 8; ++- } ++- ++- number = eighthPoints * 8; ++- complexVectorPtr = (int16_t*)&complexVector[number]; ++- for(; number < num_points; number++){ ++- *iBufferPtr++ = (float)(*complexVectorPtr++) / scalar; ++- *qBufferPtr++ = (float)(*complexVectorPtr++) / scalar; ++- } +++ float* iBufferPtr = iBuffer; +++ float* qBufferPtr = qBuffer; +++ +++ uint64_t number = 0; +++ const uint64_t eighthPoints = num_points / 8; +++ __m256 cplxValue1, cplxValue2, iValue, qValue; +++ __m256i cplxValueA, cplxValueB; +++ __m128i cplxValue128; +++ +++ __m256 invScalar = _mm256_set1_ps(1.0 / scalar); +++ int16_t* complexVectorPtr = (int16_t*)complexVector; +++ __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0); +++ +++ for (; number < eighthPoints; number++) { +++ +++ cplxValueA = _mm256_loadu_si256((__m256i*)complexVectorPtr); +++ complexVectorPtr += 16; +++ +++ // cvt +++ cplxValue128 = _mm256_extracti128_si256(cplxValueA, 0); +++ cplxValueB = _mm256_cvtepi16_epi32(cplxValue128); +++ cplxValue1 = _mm256_cvtepi32_ps(cplxValueB); +++ cplxValue128 = _mm256_extracti128_si256(cplxValueA, 1); +++ 
cplxValueB = _mm256_cvtepi16_epi32(cplxValue128); +++ cplxValue2 = _mm256_cvtepi32_ps(cplxValueB); +++ +++ cplxValue1 = _mm256_mul_ps(cplxValue1, invScalar); +++ cplxValue2 = _mm256_mul_ps(cplxValue2, invScalar); +++ +++ // Arrange in i1i2i3i4 format +++ iValue = _mm256_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2, 0, 2, 0)); +++ iValue = _mm256_permutevar8x32_ps(iValue, idx); +++ // Arrange in q1q2q3q4 format +++ qValue = _mm256_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3, 1, 3, 1)); +++ qValue = _mm256_permutevar8x32_ps(qValue, idx); +++ +++ _mm256_storeu_ps(iBufferPtr, iValue); +++ _mm256_storeu_ps(qBufferPtr, qValue); +++ +++ iBufferPtr += 8; +++ qBufferPtr += 8; +++ } +++ +++ number = eighthPoints * 8; +++ complexVectorPtr = (int16_t*)&complexVector[number]; +++ for (; number < num_points; number++) { +++ *iBufferPtr++ = (float)(*complexVectorPtr++) / scalar; +++ *qBufferPtr++ = (float)(*complexVectorPtr++) / scalar; +++ } ++ } ++ #endif /* LV_HAVE_AVX2 */ ++ ++diff --git a/kernels/volk/volk_16ic_s32f_deinterleave_real_32f.h b/kernels/volk/volk_16ic_s32f_deinterleave_real_32f.h ++index 713e6a1..8b72d1c 100644 ++--- a/kernels/volk/volk_16ic_s32f_deinterleave_real_32f.h +++++ b/kernels/volk/volk_16ic_s32f_deinterleave_real_32f.h ++@@ -31,8 +31,8 @@ ++ * ++ * Dispatcher Prototype ++ * \code ++- * void volk_16ic_s32f_deinterleave_real_32f(float* iBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ ++- * \endcode +++ * void volk_16ic_s32f_deinterleave_real_32f(float* iBuffer, const lv_16sc_t* +++ * complexVector, const float scalar, unsigned int num_points){ \endcode ++ * ++ * \b Inputs ++ * \li complexVector: The complex input vector of 16-bit shorts. ++@@ -56,55 +56,88 @@ ++ #ifndef INCLUDED_volk_16ic_s32f_deinterleave_real_32f_a_H ++ #define INCLUDED_volk_16ic_s32f_deinterleave_real_32f_a_H ++ ++-#include ++ #include ++ #include +++#include ++ ++ #ifdef LV_HAVE_AVX2 ++ #include ++ ++ static inline void ++-volk_16ic_s32f_deinterleave_real_32f_a_avx2(float* iBuffer, const lv_16sc_t* complexVector, ++- const float scalar, unsigned int num_points) +++volk_16ic_s32f_deinterleave_real_32f_a_avx2(float* iBuffer, +++ const lv_16sc_t* complexVector, +++ const float scalar, +++ unsigned int num_points) ++ { ++- float* iBufferPtr = iBuffer; ++- ++- unsigned int number = 0; ++- const unsigned int eighthPoints = num_points / 8; ++- ++- __m256 iFloatValue; ++- ++- const float iScalar= 1.0 / scalar; ++- __m256 invScalar = _mm256_set1_ps(iScalar); ++- __m256i complexVal, iIntVal; ++- __m128i complexVal128; ++- int8_t* complexVectorPtr = (int8_t*)complexVector; ++- ++- __m256i moveMask = _mm256_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0); ++- ++- for(;number < eighthPoints; number++){ ++- complexVal = _mm256_load_si256((__m256i*)complexVectorPtr); complexVectorPtr += 32; ++- complexVal = _mm256_shuffle_epi8(complexVal, moveMask); ++- complexVal = _mm256_permute4x64_epi64(complexVal, 0xd8); ++- complexVal128 = _mm256_extracti128_si256(complexVal, 0); ++- ++- iIntVal = _mm256_cvtepi16_epi32(complexVal128); ++- iFloatValue = _mm256_cvtepi32_ps(iIntVal); ++- ++- iFloatValue = _mm256_mul_ps(iFloatValue, invScalar); ++- ++- _mm256_store_ps(iBufferPtr, iFloatValue); ++- ++- iBufferPtr += 8; ++- } ++- ++- number = eighthPoints * 8; ++- int16_t* sixteenTComplexVectorPtr = (int16_t*)&complexVector[number]; ++- for(; number < num_points; number++){ ++- 
*iBufferPtr++ = ((float)(*sixteenTComplexVectorPtr++)) * iScalar; ++- sixteenTComplexVectorPtr++; ++- } ++- +++ float* iBufferPtr = iBuffer; +++ +++ unsigned int number = 0; +++ const unsigned int eighthPoints = num_points / 8; +++ +++ __m256 iFloatValue; +++ +++ const float iScalar = 1.0 / scalar; +++ __m256 invScalar = _mm256_set1_ps(iScalar); +++ __m256i complexVal, iIntVal; +++ __m128i complexVal128; +++ int8_t* complexVectorPtr = (int8_t*)complexVector; +++ +++ __m256i moveMask = _mm256_set_epi8(0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 13, +++ 12, +++ 9, +++ 8, +++ 5, +++ 4, +++ 1, +++ 0, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 13, +++ 12, +++ 9, +++ 8, +++ 5, +++ 4, +++ 1, +++ 0); +++ +++ for (; number < eighthPoints; number++) { +++ complexVal = _mm256_load_si256((__m256i*)complexVectorPtr); +++ complexVectorPtr += 32; +++ complexVal = _mm256_shuffle_epi8(complexVal, moveMask); +++ complexVal = _mm256_permute4x64_epi64(complexVal, 0xd8); +++ complexVal128 = _mm256_extracti128_si256(complexVal, 0); +++ +++ iIntVal = _mm256_cvtepi16_epi32(complexVal128); +++ iFloatValue = _mm256_cvtepi32_ps(iIntVal); +++ +++ iFloatValue = _mm256_mul_ps(iFloatValue, invScalar); +++ +++ _mm256_store_ps(iBufferPtr, iFloatValue); +++ +++ iBufferPtr += 8; +++ } +++ +++ number = eighthPoints * 8; +++ int16_t* sixteenTComplexVectorPtr = (int16_t*)&complexVector[number]; +++ for (; number < num_points; number++) { +++ *iBufferPtr++ = ((float)(*sixteenTComplexVectorPtr++)) * iScalar; +++ sixteenTComplexVectorPtr++; +++ } ++ } ++ #endif /* LV_HAVE_AVX2 */ ++ ++@@ -112,44 +145,47 @@ volk_16ic_s32f_deinterleave_real_32f_a_avx2(float* iBuffer, const lv_16sc_t* com ++ #include ++ ++ static inline void ++-volk_16ic_s32f_deinterleave_real_32f_a_sse4_1(float* iBuffer, const lv_16sc_t* complexVector, ++- const float scalar, unsigned int num_points) +++volk_16ic_s32f_deinterleave_real_32f_a_sse4_1(float* iBuffer, +++ const lv_16sc_t* complexVector, +++ const float scalar, +++ unsigned int num_points) ++ { ++- float* iBufferPtr = iBuffer; ++- ++- unsigned int number = 0; ++- const unsigned int quarterPoints = num_points / 4; +++ float* iBufferPtr = iBuffer; ++ ++- __m128 iFloatValue; +++ unsigned int number = 0; +++ const unsigned int quarterPoints = num_points / 4; ++ ++- const float iScalar= 1.0 / scalar; ++- __m128 invScalar = _mm_set_ps1(iScalar); ++- __m128i complexVal, iIntVal; ++- int8_t* complexVectorPtr = (int8_t*)complexVector; +++ __m128 iFloatValue; ++ ++- __m128i moveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0); +++ const float iScalar = 1.0 / scalar; +++ __m128 invScalar = _mm_set_ps1(iScalar); +++ __m128i complexVal, iIntVal; +++ int8_t* complexVectorPtr = (int8_t*)complexVector; ++ ++- for(;number < quarterPoints; number++){ ++- complexVal = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; ++- complexVal = _mm_shuffle_epi8(complexVal, moveMask); +++ __m128i moveMask = _mm_set_epi8( +++ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0); ++ ++- iIntVal = _mm_cvtepi16_epi32(complexVal); ++- iFloatValue = _mm_cvtepi32_ps(iIntVal); +++ for (; number < quarterPoints; number++) { +++ complexVal = _mm_load_si128((__m128i*)complexVectorPtr); +++ complexVectorPtr += 16; +++ complexVal = _mm_shuffle_epi8(complexVal, moveMask); ++ ++- iFloatValue = _mm_mul_ps(iFloatValue, invScalar); +++ iIntVal = _mm_cvtepi16_epi32(complexVal); +++ iFloatValue 
= _mm_cvtepi32_ps(iIntVal); ++ ++- _mm_store_ps(iBufferPtr, iFloatValue); +++ iFloatValue = _mm_mul_ps(iFloatValue, invScalar); ++ ++- iBufferPtr += 4; ++- } +++ _mm_store_ps(iBufferPtr, iFloatValue); ++ ++- number = quarterPoints * 4; ++- int16_t* sixteenTComplexVectorPtr = (int16_t*)&complexVector[number]; ++- for(; number < num_points; number++){ ++- *iBufferPtr++ = ((float)(*sixteenTComplexVectorPtr++)) * iScalar; ++- sixteenTComplexVectorPtr++; ++- } +++ iBufferPtr += 4; +++ } ++ +++ number = quarterPoints * 4; +++ int16_t* sixteenTComplexVectorPtr = (int16_t*)&complexVector[number]; +++ for (; number < num_points; number++) { +++ *iBufferPtr++ = ((float)(*sixteenTComplexVectorPtr++)) * iScalar; +++ sixteenTComplexVectorPtr++; +++ } ++ } ++ #endif /* LV_HAVE_SSE4_1 */ ++ ++@@ -157,59 +193,66 @@ volk_16ic_s32f_deinterleave_real_32f_a_sse4_1(float* iBuffer, const lv_16sc_t* c ++ #include ++ ++ static inline void ++-volk_16ic_s32f_deinterleave_real_32f_a_sse(float* iBuffer, const lv_16sc_t* complexVector, ++- const float scalar, unsigned int num_points) +++volk_16ic_s32f_deinterleave_real_32f_a_sse(float* iBuffer, +++ const lv_16sc_t* complexVector, +++ const float scalar, +++ unsigned int num_points) ++ { ++- float* iBufferPtr = iBuffer; +++ float* iBufferPtr = iBuffer; ++ ++- unsigned int number = 0; ++- const unsigned int quarterPoints = num_points / 4; ++- __m128 iValue; +++ unsigned int number = 0; +++ const unsigned int quarterPoints = num_points / 4; +++ __m128 iValue; ++ ++- const float iScalar = 1.0/scalar; ++- __m128 invScalar = _mm_set_ps1(iScalar); ++- int16_t* complexVectorPtr = (int16_t*)complexVector; +++ const float iScalar = 1.0 / scalar; +++ __m128 invScalar = _mm_set_ps1(iScalar); +++ int16_t* complexVectorPtr = (int16_t*)complexVector; ++ ++- __VOLK_ATTR_ALIGNED(16) float floatBuffer[4]; +++ __VOLK_ATTR_ALIGNED(16) float floatBuffer[4]; ++ ++- for(;number < quarterPoints; number++){ ++- floatBuffer[0] = (float)(*complexVectorPtr); complexVectorPtr += 2; ++- floatBuffer[1] = (float)(*complexVectorPtr); complexVectorPtr += 2; ++- floatBuffer[2] = (float)(*complexVectorPtr); complexVectorPtr += 2; ++- floatBuffer[3] = (float)(*complexVectorPtr); complexVectorPtr += 2; +++ for (; number < quarterPoints; number++) { +++ floatBuffer[0] = (float)(*complexVectorPtr); +++ complexVectorPtr += 2; +++ floatBuffer[1] = (float)(*complexVectorPtr); +++ complexVectorPtr += 2; +++ floatBuffer[2] = (float)(*complexVectorPtr); +++ complexVectorPtr += 2; +++ floatBuffer[3] = (float)(*complexVectorPtr); +++ complexVectorPtr += 2; ++ ++- iValue = _mm_load_ps(floatBuffer); +++ iValue = _mm_load_ps(floatBuffer); ++ ++- iValue = _mm_mul_ps(iValue, invScalar); +++ iValue = _mm_mul_ps(iValue, invScalar); ++ ++- _mm_store_ps(iBufferPtr, iValue); +++ _mm_store_ps(iBufferPtr, iValue); ++ ++- iBufferPtr += 4; ++- } ++- ++- number = quarterPoints * 4; ++- complexVectorPtr = (int16_t*)&complexVector[number]; ++- for(; number < num_points; number++){ ++- *iBufferPtr++ = ((float)(*complexVectorPtr++)) * iScalar; ++- complexVectorPtr++; ++- } +++ iBufferPtr += 4; +++ } ++ +++ number = quarterPoints * 4; +++ complexVectorPtr = (int16_t*)&complexVector[number]; +++ for (; number < num_points; number++) { +++ *iBufferPtr++ = ((float)(*complexVectorPtr++)) * iScalar; +++ complexVectorPtr++; +++ } ++ } ++ #endif /* LV_HAVE_SSE */ ++ ++ #ifdef LV_HAVE_GENERIC ++ static inline void ++-volk_16ic_s32f_deinterleave_real_32f_generic(float* iBuffer, const lv_16sc_t* complexVector, ++- const float scalar, unsigned 
int num_points) +++volk_16ic_s32f_deinterleave_real_32f_generic(float* iBuffer, +++ const lv_16sc_t* complexVector, +++ const float scalar, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const int16_t* complexVectorPtr = (const int16_t*)complexVector; ++- float* iBufferPtr = iBuffer; ++- const float invScalar = 1.0 / scalar; ++- for(number = 0; number < num_points; number++){ ++- *iBufferPtr++ = ((float)(*complexVectorPtr++)) * invScalar; ++- complexVectorPtr++; ++- } +++ unsigned int number = 0; +++ const int16_t* complexVectorPtr = (const int16_t*)complexVector; +++ float* iBufferPtr = iBuffer; +++ const float invScalar = 1.0 / scalar; +++ for (number = 0; number < num_points; number++) { +++ *iBufferPtr++ = ((float)(*complexVectorPtr++)) * invScalar; +++ complexVectorPtr++; +++ } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++@@ -219,55 +262,88 @@ volk_16ic_s32f_deinterleave_real_32f_generic(float* iBuffer, const lv_16sc_t* co ++ #ifndef INCLUDED_volk_16ic_s32f_deinterleave_real_32f_u_H ++ #define INCLUDED_volk_16ic_s32f_deinterleave_real_32f_u_H ++ ++-#include ++ #include ++ #include +++#include ++ ++ #ifdef LV_HAVE_AVX2 ++ #include ++ ++ static inline void ++-volk_16ic_s32f_deinterleave_real_32f_u_avx2(float* iBuffer, const lv_16sc_t* complexVector, ++- const float scalar, unsigned int num_points) +++volk_16ic_s32f_deinterleave_real_32f_u_avx2(float* iBuffer, +++ const lv_16sc_t* complexVector, +++ const float scalar, +++ unsigned int num_points) ++ { ++- float* iBufferPtr = iBuffer; ++- ++- unsigned int number = 0; ++- const unsigned int eighthPoints = num_points / 8; ++- ++- __m256 iFloatValue; ++- ++- const float iScalar= 1.0 / scalar; ++- __m256 invScalar = _mm256_set1_ps(iScalar); ++- __m256i complexVal, iIntVal; ++- __m128i complexVal128; ++- int8_t* complexVectorPtr = (int8_t*)complexVector; ++- ++- __m256i moveMask = _mm256_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0); ++- ++- for(;number < eighthPoints; number++){ ++- complexVal = _mm256_loadu_si256((__m256i*)complexVectorPtr); complexVectorPtr += 32; ++- complexVal = _mm256_shuffle_epi8(complexVal, moveMask); ++- complexVal = _mm256_permute4x64_epi64(complexVal, 0xd8); ++- complexVal128 = _mm256_extracti128_si256(complexVal, 0); ++- ++- iIntVal = _mm256_cvtepi16_epi32(complexVal128); ++- iFloatValue = _mm256_cvtepi32_ps(iIntVal); ++- ++- iFloatValue = _mm256_mul_ps(iFloatValue, invScalar); ++- ++- _mm256_storeu_ps(iBufferPtr, iFloatValue); ++- ++- iBufferPtr += 8; ++- } ++- ++- number = eighthPoints * 8; ++- int16_t* sixteenTComplexVectorPtr = (int16_t*)&complexVector[number]; ++- for(; number < num_points; number++){ ++- *iBufferPtr++ = ((float)(*sixteenTComplexVectorPtr++)) * iScalar; ++- sixteenTComplexVectorPtr++; ++- } ++- +++ float* iBufferPtr = iBuffer; +++ +++ unsigned int number = 0; +++ const unsigned int eighthPoints = num_points / 8; +++ +++ __m256 iFloatValue; +++ +++ const float iScalar = 1.0 / scalar; +++ __m256 invScalar = _mm256_set1_ps(iScalar); +++ __m256i complexVal, iIntVal; +++ __m128i complexVal128; +++ int8_t* complexVectorPtr = (int8_t*)complexVector; +++ +++ __m256i moveMask = _mm256_set_epi8(0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 13, +++ 12, +++ 9, +++ 8, +++ 5, +++ 4, +++ 1, +++ 0, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 13, +++ 12, +++ 9, +++ 8, +++ 5, +++ 4, +++ 1, +++ 0); +++ +++ for (; 
number < eighthPoints; number++) { +++ complexVal = _mm256_loadu_si256((__m256i*)complexVectorPtr); +++ complexVectorPtr += 32; +++ complexVal = _mm256_shuffle_epi8(complexVal, moveMask); +++ complexVal = _mm256_permute4x64_epi64(complexVal, 0xd8); +++ complexVal128 = _mm256_extracti128_si256(complexVal, 0); +++ +++ iIntVal = _mm256_cvtepi16_epi32(complexVal128); +++ iFloatValue = _mm256_cvtepi32_ps(iIntVal); +++ +++ iFloatValue = _mm256_mul_ps(iFloatValue, invScalar); +++ +++ _mm256_storeu_ps(iBufferPtr, iFloatValue); +++ +++ iBufferPtr += 8; +++ } +++ +++ number = eighthPoints * 8; +++ int16_t* sixteenTComplexVectorPtr = (int16_t*)&complexVector[number]; +++ for (; number < num_points; number++) { +++ *iBufferPtr++ = ((float)(*sixteenTComplexVectorPtr++)) * iScalar; +++ sixteenTComplexVectorPtr++; +++ } ++ } ++ #endif /* LV_HAVE_AVX2 */ ++ ++diff --git a/kernels/volk/volk_16ic_s32f_magnitude_32f.h b/kernels/volk/volk_16ic_s32f_magnitude_32f.h ++index bb0459c..c3e3605 100644 ++--- a/kernels/volk/volk_16ic_s32f_magnitude_32f.h +++++ b/kernels/volk/volk_16ic_s32f_magnitude_32f.h ++@@ -30,8 +30,8 @@ ++ * ++ * Dispatcher Prototype ++ * \code ++- * void volk_16ic_s32f_magnitude_32f(float* magnitudeVector, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points) ++- * \endcode +++ * void volk_16ic_s32f_magnitude_32f(float* magnitudeVector, const lv_16sc_t* +++ * complexVector, const float scalar, unsigned int num_points) \endcode ++ * ++ * \b Inputs ++ * \li complexVector: The complex input vector of complex 16-bit shorts. ++@@ -55,67 +55,68 @@ ++ #ifndef INCLUDED_volk_16ic_s32f_magnitude_32f_a_H ++ #define INCLUDED_volk_16ic_s32f_magnitude_32f_a_H ++ ++-#include ++ #include ++-#include ++ #include +++#include +++#include ++ ++ #ifdef LV_HAVE_AVX2 ++ #include ++ ++-static inline void ++-volk_16ic_s32f_magnitude_32f_a_avx2(float* magnitudeVector, const lv_16sc_t* complexVector, ++- const float scalar, unsigned int num_points) +++static inline void volk_16ic_s32f_magnitude_32f_a_avx2(float* magnitudeVector, +++ const lv_16sc_t* complexVector, +++ const float scalar, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int eighthPoints = num_points / 8; +++ unsigned int number = 0; +++ const unsigned int eighthPoints = num_points / 8; +++ +++ const int16_t* complexVectorPtr = (const int16_t*)complexVector; +++ float* magnitudeVectorPtr = magnitudeVector; ++ ++- const int16_t* complexVectorPtr = (const int16_t*)complexVector; ++- float* magnitudeVectorPtr = magnitudeVector; +++ __m256 invScalar = _mm256_set1_ps(1.0 / scalar); ++ ++- __m256 invScalar = _mm256_set1_ps(1.0/scalar); +++ __m256 cplxValue1, cplxValue2, result; +++ __m256i int1, int2; +++ __m128i short1, short2; +++ __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0); ++ ++- __m256 cplxValue1, cplxValue2, result; ++- __m256i int1, int2; ++- __m128i short1, short2; ++- __m256i idx = _mm256_set_epi32(7,6,3,2,5,4,1,0); +++ for (; number < eighthPoints; number++) { ++ ++- for(;number < eighthPoints; number++){ ++- ++- int1 = _mm256_loadu_si256((__m256i*)complexVectorPtr); ++- complexVectorPtr += 16; ++- short1 = _mm256_extracti128_si256(int1,0); ++- short2 = _mm256_extracti128_si256(int1,1); +++ int1 = _mm256_loadu_si256((__m256i*)complexVectorPtr); +++ complexVectorPtr += 16; +++ short1 = _mm256_extracti128_si256(int1, 0); +++ short2 = _mm256_extracti128_si256(int1, 1); ++ ++- int1 = _mm256_cvtepi16_epi32(short1); ++- int2 = _mm256_cvtepi16_epi32(short2); ++- cplxValue1 = 
_mm256_cvtepi32_ps(int1); ++- cplxValue2 = _mm256_cvtepi32_ps(int2); +++ int1 = _mm256_cvtepi16_epi32(short1); +++ int2 = _mm256_cvtepi16_epi32(short2); +++ cplxValue1 = _mm256_cvtepi32_ps(int1); +++ cplxValue2 = _mm256_cvtepi32_ps(int2); ++ ++- cplxValue1 = _mm256_mul_ps(cplxValue1, invScalar); ++- cplxValue2 = _mm256_mul_ps(cplxValue2, invScalar); +++ cplxValue1 = _mm256_mul_ps(cplxValue1, invScalar); +++ cplxValue2 = _mm256_mul_ps(cplxValue2, invScalar); ++ ++- cplxValue1 = _mm256_mul_ps(cplxValue1, cplxValue1); // Square the values ++- cplxValue2 = _mm256_mul_ps(cplxValue2, cplxValue2); // Square the Values +++ cplxValue1 = _mm256_mul_ps(cplxValue1, cplxValue1); // Square the values +++ cplxValue2 = _mm256_mul_ps(cplxValue2, cplxValue2); // Square the Values ++ ++- result = _mm256_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values ++- result = _mm256_permutevar8x32_ps(result, idx); +++ result = _mm256_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values +++ result = _mm256_permutevar8x32_ps(result, idx); ++ ++- result = _mm256_sqrt_ps(result); // Square root the values +++ result = _mm256_sqrt_ps(result); // Square root the values ++ ++- _mm256_store_ps(magnitudeVectorPtr, result); +++ _mm256_store_ps(magnitudeVectorPtr, result); ++ ++- magnitudeVectorPtr += 8; ++- } +++ magnitudeVectorPtr += 8; +++ } ++ ++- number = eighthPoints * 8; ++- magnitudeVectorPtr = &magnitudeVector[number]; ++- complexVectorPtr = (const int16_t*)&complexVector[number]; ++- for(; number < num_points; number++){ ++- float val1Real = (float)(*complexVectorPtr++) / scalar; ++- float val1Imag = (float)(*complexVectorPtr++) / scalar; ++- *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)); ++- } +++ number = eighthPoints * 8; +++ magnitudeVectorPtr = &magnitudeVector[number]; +++ complexVectorPtr = (const int16_t*)&complexVector[number]; +++ for (; number < num_points; number++) { +++ float val1Real = (float)(*complexVectorPtr++) / scalar; +++ float val1Imag = (float)(*complexVectorPtr++) / scalar; +++ *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)); +++ } ++ } ++ #endif /* LV_HAVE_AVX2 */ ++ ++@@ -123,127 +124,129 @@ volk_16ic_s32f_magnitude_32f_a_avx2(float* magnitudeVector, const lv_16sc_t* com ++ #ifdef LV_HAVE_SSE3 ++ #include ++ ++-static inline void ++-volk_16ic_s32f_magnitude_32f_a_sse3(float* magnitudeVector, const lv_16sc_t* complexVector, ++- const float scalar, unsigned int num_points) +++static inline void volk_16ic_s32f_magnitude_32f_a_sse3(float* magnitudeVector, +++ const lv_16sc_t* complexVector, +++ const float scalar, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int quarterPoints = num_points / 4; +++ unsigned int number = 0; +++ const unsigned int quarterPoints = num_points / 4; ++ ++- const int16_t* complexVectorPtr = (const int16_t*)complexVector; ++- float* magnitudeVectorPtr = magnitudeVector; +++ const int16_t* complexVectorPtr = (const int16_t*)complexVector; +++ float* magnitudeVectorPtr = magnitudeVector; ++ ++- __m128 invScalar = _mm_set_ps1(1.0/scalar); +++ __m128 invScalar = _mm_set_ps1(1.0 / scalar); ++ ++- __m128 cplxValue1, cplxValue2, result; +++ __m128 cplxValue1, cplxValue2, result; ++ ++- __VOLK_ATTR_ALIGNED(16) float inputFloatBuffer[8]; +++ __VOLK_ATTR_ALIGNED(16) float inputFloatBuffer[8]; ++ ++- for(;number < quarterPoints; number++){ +++ for (; number < quarterPoints; number++) { ++ ++- inputFloatBuffer[0] = (float)(complexVectorPtr[0]); ++- inputFloatBuffer[1] = 
(float)(complexVectorPtr[1]); ++- inputFloatBuffer[2] = (float)(complexVectorPtr[2]); ++- inputFloatBuffer[3] = (float)(complexVectorPtr[3]); +++ inputFloatBuffer[0] = (float)(complexVectorPtr[0]); +++ inputFloatBuffer[1] = (float)(complexVectorPtr[1]); +++ inputFloatBuffer[2] = (float)(complexVectorPtr[2]); +++ inputFloatBuffer[3] = (float)(complexVectorPtr[3]); ++ ++- inputFloatBuffer[4] = (float)(complexVectorPtr[4]); ++- inputFloatBuffer[5] = (float)(complexVectorPtr[5]); ++- inputFloatBuffer[6] = (float)(complexVectorPtr[6]); ++- inputFloatBuffer[7] = (float)(complexVectorPtr[7]); +++ inputFloatBuffer[4] = (float)(complexVectorPtr[4]); +++ inputFloatBuffer[5] = (float)(complexVectorPtr[5]); +++ inputFloatBuffer[6] = (float)(complexVectorPtr[6]); +++ inputFloatBuffer[7] = (float)(complexVectorPtr[7]); ++ ++- cplxValue1 = _mm_load_ps(&inputFloatBuffer[0]); ++- cplxValue2 = _mm_load_ps(&inputFloatBuffer[4]); +++ cplxValue1 = _mm_load_ps(&inputFloatBuffer[0]); +++ cplxValue2 = _mm_load_ps(&inputFloatBuffer[4]); ++ ++- complexVectorPtr += 8; +++ complexVectorPtr += 8; ++ ++- cplxValue1 = _mm_mul_ps(cplxValue1, invScalar); ++- cplxValue2 = _mm_mul_ps(cplxValue2, invScalar); +++ cplxValue1 = _mm_mul_ps(cplxValue1, invScalar); +++ cplxValue2 = _mm_mul_ps(cplxValue2, invScalar); ++ ++- cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values ++- cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values +++ cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values +++ cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values ++ ++- result = _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values +++ result = _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values ++ ++- result = _mm_sqrt_ps(result); // Square root the values +++ result = _mm_sqrt_ps(result); // Square root the values ++ ++- _mm_store_ps(magnitudeVectorPtr, result); +++ _mm_store_ps(magnitudeVectorPtr, result); ++ ++- magnitudeVectorPtr += 4; ++- } +++ magnitudeVectorPtr += 4; +++ } ++ ++- number = quarterPoints * 4; ++- magnitudeVectorPtr = &magnitudeVector[number]; ++- complexVectorPtr = (const int16_t*)&complexVector[number]; ++- for(; number < num_points; number++){ ++- float val1Real = (float)(*complexVectorPtr++) / scalar; ++- float val1Imag = (float)(*complexVectorPtr++) / scalar; ++- *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)); ++- } +++ number = quarterPoints * 4; +++ magnitudeVectorPtr = &magnitudeVector[number]; +++ complexVectorPtr = (const int16_t*)&complexVector[number]; +++ for (; number < num_points; number++) { +++ float val1Real = (float)(*complexVectorPtr++) / scalar; +++ float val1Imag = (float)(*complexVectorPtr++) / scalar; +++ *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)); +++ } ++ } ++ #endif /* LV_HAVE_SSE3 */ ++ ++ #ifdef LV_HAVE_SSE ++ #include ++ ++-static inline void ++-volk_16ic_s32f_magnitude_32f_a_sse(float* magnitudeVector, const lv_16sc_t* complexVector, ++- const float scalar, unsigned int num_points) +++static inline void volk_16ic_s32f_magnitude_32f_a_sse(float* magnitudeVector, +++ const lv_16sc_t* complexVector, +++ const float scalar, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int quarterPoints = num_points / 4; +++ unsigned int number = 0; +++ const unsigned int quarterPoints = num_points / 4; ++ ++- const int16_t* complexVectorPtr = (const int16_t*)complexVector; ++- float* magnitudeVectorPtr = magnitudeVector; +++ const 
int16_t* complexVectorPtr = (const int16_t*)complexVector; +++ float* magnitudeVectorPtr = magnitudeVector; ++ ++- const float iScalar = 1.0 / scalar; ++- __m128 invScalar = _mm_set_ps1(iScalar); +++ const float iScalar = 1.0 / scalar; +++ __m128 invScalar = _mm_set_ps1(iScalar); ++ ++- __m128 cplxValue1, cplxValue2, result, re, im; +++ __m128 cplxValue1, cplxValue2, result, re, im; ++ ++- __VOLK_ATTR_ALIGNED(16) float inputFloatBuffer[8]; +++ __VOLK_ATTR_ALIGNED(16) float inputFloatBuffer[8]; ++ ++- for(;number < quarterPoints; number++){ ++- inputFloatBuffer[0] = (float)(complexVectorPtr[0]); ++- inputFloatBuffer[1] = (float)(complexVectorPtr[1]); ++- inputFloatBuffer[2] = (float)(complexVectorPtr[2]); ++- inputFloatBuffer[3] = (float)(complexVectorPtr[3]); +++ for (; number < quarterPoints; number++) { +++ inputFloatBuffer[0] = (float)(complexVectorPtr[0]); +++ inputFloatBuffer[1] = (float)(complexVectorPtr[1]); +++ inputFloatBuffer[2] = (float)(complexVectorPtr[2]); +++ inputFloatBuffer[3] = (float)(complexVectorPtr[3]); ++ ++- inputFloatBuffer[4] = (float)(complexVectorPtr[4]); ++- inputFloatBuffer[5] = (float)(complexVectorPtr[5]); ++- inputFloatBuffer[6] = (float)(complexVectorPtr[6]); ++- inputFloatBuffer[7] = (float)(complexVectorPtr[7]); +++ inputFloatBuffer[4] = (float)(complexVectorPtr[4]); +++ inputFloatBuffer[5] = (float)(complexVectorPtr[5]); +++ inputFloatBuffer[6] = (float)(complexVectorPtr[6]); +++ inputFloatBuffer[7] = (float)(complexVectorPtr[7]); ++ ++- cplxValue1 = _mm_load_ps(&inputFloatBuffer[0]); ++- cplxValue2 = _mm_load_ps(&inputFloatBuffer[4]); +++ cplxValue1 = _mm_load_ps(&inputFloatBuffer[0]); +++ cplxValue2 = _mm_load_ps(&inputFloatBuffer[4]); ++ ++- re = _mm_shuffle_ps(cplxValue1, cplxValue2, 0x88); ++- im = _mm_shuffle_ps(cplxValue1, cplxValue2, 0xdd); +++ re = _mm_shuffle_ps(cplxValue1, cplxValue2, 0x88); +++ im = _mm_shuffle_ps(cplxValue1, cplxValue2, 0xdd); ++ ++- complexVectorPtr += 8; +++ complexVectorPtr += 8; ++ ++- cplxValue1 = _mm_mul_ps(re, invScalar); ++- cplxValue2 = _mm_mul_ps(im, invScalar); +++ cplxValue1 = _mm_mul_ps(re, invScalar); +++ cplxValue2 = _mm_mul_ps(im, invScalar); ++ ++- cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values ++- cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values +++ cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values +++ cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values ++ ++- result = _mm_add_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values +++ result = _mm_add_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values ++ ++- result = _mm_sqrt_ps(result); // Square root the values +++ result = _mm_sqrt_ps(result); // Square root the values ++ ++- _mm_store_ps(magnitudeVectorPtr, result); +++ _mm_store_ps(magnitudeVectorPtr, result); ++ ++- magnitudeVectorPtr += 4; ++- } +++ magnitudeVectorPtr += 4; +++ } ++ ++- number = quarterPoints * 4; ++- magnitudeVectorPtr = &magnitudeVector[number]; ++- complexVectorPtr = (const int16_t*)&complexVector[number]; ++- for(; number < num_points; number++){ ++- float val1Real = (float)(*complexVectorPtr++) * iScalar; ++- float val1Imag = (float)(*complexVectorPtr++) * iScalar; ++- *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)); ++- } +++ number = quarterPoints * 4; +++ magnitudeVectorPtr = &magnitudeVector[number]; +++ complexVectorPtr = (const int16_t*)&complexVector[number]; +++ for (; number < num_points; number++) { +++ float val1Real = (float)(*complexVectorPtr++) * 
iScalar; +++ float val1Imag = (float)(*complexVectorPtr++) * iScalar; +++ *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)); +++ } ++ } ++ ++ ++@@ -251,33 +254,37 @@ volk_16ic_s32f_magnitude_32f_a_sse(float* magnitudeVector, const lv_16sc_t* comp ++ ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void ++-volk_16ic_s32f_magnitude_32f_generic(float* magnitudeVector, const lv_16sc_t* complexVector, ++- const float scalar, unsigned int num_points) +++static inline void volk_16ic_s32f_magnitude_32f_generic(float* magnitudeVector, +++ const lv_16sc_t* complexVector, +++ const float scalar, +++ unsigned int num_points) ++ { ++- const int16_t* complexVectorPtr = (const int16_t*)complexVector; ++- float* magnitudeVectorPtr = magnitudeVector; ++- unsigned int number = 0; ++- const float invScalar = 1.0 / scalar; ++- for(number = 0; number < num_points; number++){ ++- float real = ( (float) (*complexVectorPtr++)) * invScalar; ++- float imag = ( (float) (*complexVectorPtr++)) * invScalar; ++- *magnitudeVectorPtr++ = sqrtf((real*real) + (imag*imag)); ++- } +++ const int16_t* complexVectorPtr = (const int16_t*)complexVector; +++ float* magnitudeVectorPtr = magnitudeVector; +++ unsigned int number = 0; +++ const float invScalar = 1.0 / scalar; +++ for (number = 0; number < num_points; number++) { +++ float real = ((float)(*complexVectorPtr++)) * invScalar; +++ float imag = ((float)(*complexVectorPtr++)) * invScalar; +++ *magnitudeVectorPtr++ = sqrtf((real * real) + (imag * imag)); +++ } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++ #ifdef LV_HAVE_ORC_DISABLED ++ ++-extern void ++-volk_16ic_s32f_magnitude_32f_a_orc_impl(float* magnitudeVector, const lv_16sc_t* complexVector, ++- const float scalar, unsigned int num_points); +++extern void volk_16ic_s32f_magnitude_32f_a_orc_impl(float* magnitudeVector, +++ const lv_16sc_t* complexVector, +++ const float scalar, +++ unsigned int num_points); ++ ++-static inline void ++-volk_16ic_s32f_magnitude_32f_u_orc(float* magnitudeVector, const lv_16sc_t* complexVector, ++- const float scalar, unsigned int num_points) +++static inline void volk_16ic_s32f_magnitude_32f_u_orc(float* magnitudeVector, +++ const lv_16sc_t* complexVector, +++ const float scalar, +++ unsigned int num_points) ++ { ++- volk_16ic_s32f_magnitude_32f_a_orc_impl(magnitudeVector, complexVector, scalar, num_points); +++ volk_16ic_s32f_magnitude_32f_a_orc_impl( +++ magnitudeVector, complexVector, scalar, num_points); ++ } ++ #endif /* LV_HAVE_ORC */ ++ ++@@ -287,69 +294,69 @@ volk_16ic_s32f_magnitude_32f_u_orc(float* magnitudeVector, const lv_16sc_t* comp ++ #ifndef INCLUDED_volk_16ic_s32f_magnitude_32f_u_H ++ #define INCLUDED_volk_16ic_s32f_magnitude_32f_u_H ++ ++-#include ++ #include ++-#include ++ #include +++#include +++#include ++ ++ #ifdef LV_HAVE_AVX2 ++ #include ++ ++-static inline void ++-volk_16ic_s32f_magnitude_32f_u_avx2(float* magnitudeVector, const lv_16sc_t* complexVector, ++- const float scalar, unsigned int num_points) +++static inline void volk_16ic_s32f_magnitude_32f_u_avx2(float* magnitudeVector, +++ const lv_16sc_t* complexVector, +++ const float scalar, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int eighthPoints = num_points / 8; +++ unsigned int number = 0; +++ const unsigned int eighthPoints = num_points / 8; +++ +++ const int16_t* complexVectorPtr = (const int16_t*)complexVector; +++ float* magnitudeVectorPtr = magnitudeVector; ++ ++- const int16_t* complexVectorPtr = (const int16_t*)complexVector; ++- float* 
magnitudeVectorPtr = magnitudeVector; +++ __m256 invScalar = _mm256_set1_ps(1.0 / scalar); ++ ++- __m256 invScalar = _mm256_set1_ps(1.0/scalar); +++ __m256 cplxValue1, cplxValue2, result; +++ __m256i int1, int2; +++ __m128i short1, short2; +++ __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0); ++ ++- __m256 cplxValue1, cplxValue2, result; ++- __m256i int1, int2; ++- __m128i short1, short2; ++- __m256i idx = _mm256_set_epi32(7,6,3,2,5,4,1,0); +++ for (; number < eighthPoints; number++) { ++ ++- for(;number < eighthPoints; number++){ ++- ++- int1 = _mm256_loadu_si256((__m256i*)complexVectorPtr); ++- complexVectorPtr += 16; ++- short1 = _mm256_extracti128_si256(int1,0); ++- short2 = _mm256_extracti128_si256(int1,1); +++ int1 = _mm256_loadu_si256((__m256i*)complexVectorPtr); +++ complexVectorPtr += 16; +++ short1 = _mm256_extracti128_si256(int1, 0); +++ short2 = _mm256_extracti128_si256(int1, 1); ++ ++- int1 = _mm256_cvtepi16_epi32(short1); ++- int2 = _mm256_cvtepi16_epi32(short2); ++- cplxValue1 = _mm256_cvtepi32_ps(int1); ++- cplxValue2 = _mm256_cvtepi32_ps(int2); +++ int1 = _mm256_cvtepi16_epi32(short1); +++ int2 = _mm256_cvtepi16_epi32(short2); +++ cplxValue1 = _mm256_cvtepi32_ps(int1); +++ cplxValue2 = _mm256_cvtepi32_ps(int2); ++ ++- cplxValue1 = _mm256_mul_ps(cplxValue1, invScalar); ++- cplxValue2 = _mm256_mul_ps(cplxValue2, invScalar); +++ cplxValue1 = _mm256_mul_ps(cplxValue1, invScalar); +++ cplxValue2 = _mm256_mul_ps(cplxValue2, invScalar); ++ ++- cplxValue1 = _mm256_mul_ps(cplxValue1, cplxValue1); // Square the values ++- cplxValue2 = _mm256_mul_ps(cplxValue2, cplxValue2); // Square the Values +++ cplxValue1 = _mm256_mul_ps(cplxValue1, cplxValue1); // Square the values +++ cplxValue2 = _mm256_mul_ps(cplxValue2, cplxValue2); // Square the Values ++ ++- result = _mm256_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values ++- result = _mm256_permutevar8x32_ps(result, idx); +++ result = _mm256_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values +++ result = _mm256_permutevar8x32_ps(result, idx); ++ ++- result = _mm256_sqrt_ps(result); // Square root the values +++ result = _mm256_sqrt_ps(result); // Square root the values ++ ++- _mm256_storeu_ps(magnitudeVectorPtr, result); +++ _mm256_storeu_ps(magnitudeVectorPtr, result); ++ ++- magnitudeVectorPtr += 8; ++- } +++ magnitudeVectorPtr += 8; +++ } ++ ++- number = eighthPoints * 8; ++- magnitudeVectorPtr = &magnitudeVector[number]; ++- complexVectorPtr = (const int16_t*)&complexVector[number]; ++- for(; number < num_points; number++){ ++- float val1Real = (float)(*complexVectorPtr++) / scalar; ++- float val1Imag = (float)(*complexVectorPtr++) / scalar; ++- *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)); ++- } +++ number = eighthPoints * 8; +++ magnitudeVectorPtr = &magnitudeVector[number]; +++ complexVectorPtr = (const int16_t*)&complexVector[number]; +++ for (; number < num_points; number++) { +++ float val1Real = (float)(*complexVectorPtr++) / scalar; +++ float val1Imag = (float)(*complexVectorPtr++) / scalar; +++ *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)); +++ } ++ } ++ #endif /* LV_HAVE_AVX2 */ ++ ++ #endif /* INCLUDED_volk_16ic_s32f_magnitude_32f_u_H */ ++- ++diff --git a/kernels/volk/volk_16ic_x2_dot_prod_16ic.h b/kernels/volk/volk_16ic_x2_dot_prod_16ic.h ++index ae10cff..a1a0e8c 100644 ++--- a/kernels/volk/volk_16ic_x2_dot_prod_16ic.h +++++ b/kernels/volk/volk_16ic_x2_dot_prod_16ic.h ++@@ -25,18 +25,20 @@ ++ * ++ * \b Overview ++ * ++- * Multiplies 
two input complex vectors (16-bit integer each component) and accumulates them, ++- * storing the result. Results are saturated so never go beyond the limits of the data type. +++ * Multiplies two input complex vectors (16-bit integer each component) and accumulates +++ * them, storing the result. Results are saturated so never go beyond the limits of the +++ * data type. ++ * ++ * Dispatcher Prototype ++ * \code ++- * void volk_16ic_x2_dot_prod_16ic(lv_16sc_t* result, const lv_16sc_t* in_a, const lv_16sc_t* in_b, unsigned int num_points); ++- * \endcode +++ * void volk_16ic_x2_dot_prod_16ic(lv_16sc_t* result, const lv_16sc_t* in_a, const +++ * lv_16sc_t* in_b, unsigned int num_points); \endcode ++ * ++ * \b Inputs ++ * \li in_a: One of the vectors to be multiplied and accumulated. ++ * \li in_b: The other vector to be multiplied and accumulated. ++- * \li num_points: Number of complex values to be multiplied together, accumulated and stored into \p result +++ * \li num_points: Number of complex values to be multiplied together, accumulated and +++ * stored into \p result ++ * ++ * \b Outputs ++ * \li result: Value of the accumulated result. ++@@ -46,22 +48,25 @@ ++ #ifndef INCLUDED_volk_16ic_x2_dot_prod_16ic_H ++ #define INCLUDED_volk_16ic_x2_dot_prod_16ic_H ++ +++#include ++ #include ++ #include ++-#include ++ ++ ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void volk_16ic_x2_dot_prod_16ic_generic(lv_16sc_t* result, const lv_16sc_t* in_a, const lv_16sc_t* in_b, unsigned int num_points) +++static inline void volk_16ic_x2_dot_prod_16ic_generic(lv_16sc_t* result, +++ const lv_16sc_t* in_a, +++ const lv_16sc_t* in_b, +++ unsigned int num_points) ++ { ++ result[0] = lv_cmake((int16_t)0, (int16_t)0); ++ unsigned int n; ++- for (n = 0; n < num_points; n++) ++- { ++- lv_16sc_t tmp = in_a[n] * in_b[n]; ++- result[0] = lv_cmake(sat_adds16i(lv_creal(result[0]), lv_creal(tmp)), sat_adds16i(lv_cimag(result[0]), lv_cimag(tmp) )); ++- } +++ for (n = 0; n < num_points; n++) { +++ lv_16sc_t tmp = in_a[n] * in_b[n]; +++ result[0] = lv_cmake(sat_adds16i(lv_creal(result[0]), lv_creal(tmp)), +++ sat_adds16i(lv_cimag(result[0]), lv_cimag(tmp))); +++ } ++ } ++ ++ #endif /*LV_HAVE_GENERIC*/ ++@@ -70,7 +75,10 @@ static inline void volk_16ic_x2_dot_prod_16ic_generic(lv_16sc_t* result, const l ++ #ifdef LV_HAVE_SSE2 ++ #include ++ ++-static inline void volk_16ic_x2_dot_prod_16ic_a_sse2(lv_16sc_t* out, const lv_16sc_t* in_a, const lv_16sc_t* in_b, unsigned int num_points) +++static inline void volk_16ic_x2_dot_prod_16ic_a_sse2(lv_16sc_t* out, +++ const lv_16sc_t* in_a, +++ const lv_16sc_t* in_b, +++ unsigned int num_points) ++ { ++ lv_16sc_t dotProduct = lv_cmake((int16_t)0, (int16_t)0); ++ ++@@ -81,62 +89,67 @@ static inline void volk_16ic_x2_dot_prod_16ic_a_sse2(lv_16sc_t* out, const lv_16 ++ const lv_16sc_t* _in_b = in_b; ++ lv_16sc_t* _out = out; ++ ++- if (sse_iters > 0) ++- { ++- __m128i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl, a_sl, realcacc, imagcacc; ++- __VOLK_ATTR_ALIGNED(16) lv_16sc_t dotProductVector[4]; +++ if (sse_iters > 0) { +++ __m128i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl, a_sl, +++ realcacc, imagcacc; +++ __VOLK_ATTR_ALIGNED(16) lv_16sc_t dotProductVector[4]; ++ ++- realcacc = _mm_setzero_si128(); ++- imagcacc = _mm_setzero_si128(); +++ realcacc = _mm_setzero_si128(); +++ imagcacc = _mm_setzero_si128(); ++ ++- mask_imag = _mm_set_epi8(0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0); ++- mask_real = _mm_set_epi8(0, 
0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF); +++ mask_imag = _mm_set_epi8( +++ 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0); +++ mask_real = _mm_set_epi8( +++ 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF); ++ ++- for(number = 0; number < sse_iters; number++) ++- { ++- // a[127:0]=[a3.i,a3.r,a2.i,a2.r,a1.i,a1.r,a0.i,a0.r] ++- a = _mm_load_si128((__m128i*)_in_a); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg ++- __VOLK_PREFETCH(_in_a + 8); ++- b = _mm_load_si128((__m128i*)_in_b); ++- __VOLK_PREFETCH(_in_b + 8); ++- c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, .... +++ for (number = 0; number < sse_iters; number++) { +++ // a[127:0]=[a3.i,a3.r,a2.i,a2.r,a1.i,a1.r,a0.i,a0.r] +++ a = _mm_load_si128( +++ (__m128i*)_in_a); // load (2 byte imag, 2 byte real) x 4 into 128 bits reg +++ __VOLK_PREFETCH(_in_a + 8); +++ b = _mm_load_si128((__m128i*)_in_b); +++ __VOLK_PREFETCH(_in_b + 8); +++ c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, .... ++ ++- c_sr = _mm_srli_si128(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst. ++- real = _mm_subs_epi16(c, c_sr); +++ c_sr = _mm_srli_si128(c, 2); // Shift a right by imm8 bytes while shifting in +++ // zeros, and store the results in dst. +++ real = _mm_subs_epi16(c, c_sr); ++ ++- b_sl = _mm_slli_si128(b, 2); // b3.r, b2.i .... ++- a_sl = _mm_slli_si128(a, 2); // a3.r, a2.i .... +++ b_sl = _mm_slli_si128(b, 2); // b3.r, b2.i .... +++ a_sl = _mm_slli_si128(a, 2); // a3.r, a2.i .... ++ ++- imag1 = _mm_mullo_epi16(a, b_sl); // a3.i*b3.r, .... ++- imag2 = _mm_mullo_epi16(b, a_sl); // b3.i*a3.r, .... +++ imag1 = _mm_mullo_epi16(a, b_sl); // a3.i*b3.r, .... +++ imag2 = _mm_mullo_epi16(b, a_sl); // b3.i*a3.r, .... ++ ++- imag = _mm_adds_epi16(imag1, imag2); //with saturation arithmetic! +++ imag = _mm_adds_epi16(imag1, imag2); // with saturation arithmetic! 
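/*
 * Editorial aside (not part of the upstream patch): the SSE2 loop above is a
 * vectorized form of a saturating complex multiply-accumulate over interleaved
 * 16-bit (re, im) samples.  A minimal scalar sketch of the same arithmetic
 * follows; `sat_add16` is a hypothetical stand-in for VOLK's sat_adds16i(),
 * and raw interleaved int16_t pairs stand in for lv_16sc_t.
 */
#include <stdint.h>

/* Hypothetical saturating 16-bit add (stand-in for VOLK's sat_adds16i()). */
static int16_t sat_add16(int16_t x, int16_t y)
{
    int32_t s = (int32_t)x + (int32_t)y;
    if (s > INT16_MAX)
        return INT16_MAX;
    if (s < INT16_MIN)
        return INT16_MIN;
    return (int16_t)s;
}

/* Scalar reference: accumulate a[n] * b[n] over interleaved (re, im) pairs.
 * The intermediate products wrap (as in the generic kernel); only the running
 * sums saturate, which is the role of the saturating accumulations above. */
static void dot_prod_16ic_scalar(int16_t* result_re,
                                 int16_t* result_im,
                                 const int16_t* a,
                                 const int16_t* b,
                                 unsigned int num_points)
{
    int16_t acc_re = 0, acc_im = 0;
    unsigned int n;
    for (n = 0; n < num_points; n++) {
        const int16_t ar = a[2 * n], ai = a[2 * n + 1];
        const int16_t br = b[2 * n], bi = b[2 * n + 1];
        acc_re = sat_add16(acc_re, (int16_t)(ar * br - ai * bi));
        acc_im = sat_add16(acc_im, (int16_t)(ar * bi + ai * br));
    }
    *result_re = acc_re;
    *result_im = acc_im;
}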
++ ++- realcacc = _mm_adds_epi16(realcacc, real); ++- imagcacc = _mm_adds_epi16(imagcacc, imag); +++ realcacc = _mm_adds_epi16(realcacc, real); +++ imagcacc = _mm_adds_epi16(imagcacc, imag); ++ ++- _in_a += 4; ++- _in_b += 4; ++- } +++ _in_a += 4; +++ _in_b += 4; +++ } ++ ++- realcacc = _mm_and_si128(realcacc, mask_real); ++- imagcacc = _mm_and_si128(imagcacc, mask_imag); +++ realcacc = _mm_and_si128(realcacc, mask_real); +++ imagcacc = _mm_and_si128(imagcacc, mask_imag); ++ ++- a = _mm_or_si128(realcacc, imagcacc); +++ a = _mm_or_si128(realcacc, imagcacc); ++ ++- _mm_store_si128((__m128i*)dotProductVector, a); // Store the results back into the dot product vector +++ _mm_store_si128((__m128i*)dotProductVector, +++ a); // Store the results back into the dot product vector ++ ++- for (number = 0; number < 4; ++number) ++- { ++- dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[number])), sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[number]))); ++- } +++ for (number = 0; number < 4; ++number) { +++ dotProduct = lv_cmake( +++ sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[number])), +++ sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[number]))); ++ } +++ } ++ ++- for (number = 0; number < (num_points % 4); ++number) ++- { ++- lv_16sc_t tmp = (*_in_a++) * (*_in_b++); ++- dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(tmp)), sat_adds16i(lv_cimag(dotProduct), lv_cimag(tmp))); ++- } +++ for (number = 0; number < (num_points % 4); ++number) { +++ lv_16sc_t tmp = (*_in_a++) * (*_in_b++); +++ dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(tmp)), +++ sat_adds16i(lv_cimag(dotProduct), lv_cimag(tmp))); +++ } ++ ++ *_out = dotProduct; ++ } ++@@ -147,7 +160,10 @@ static inline void volk_16ic_x2_dot_prod_16ic_a_sse2(lv_16sc_t* out, const lv_16 ++ #ifdef LV_HAVE_SSE2 ++ #include ++ ++-static inline void volk_16ic_x2_dot_prod_16ic_u_sse2(lv_16sc_t* out, const lv_16sc_t* in_a, const lv_16sc_t* in_b, unsigned int num_points) +++static inline void volk_16ic_x2_dot_prod_16ic_u_sse2(lv_16sc_t* out, +++ const lv_16sc_t* in_a, +++ const lv_16sc_t* in_b, +++ unsigned int num_points) ++ { ++ lv_16sc_t dotProduct = lv_cmake((int16_t)0, (int16_t)0); ++ ++@@ -158,62 +174,67 @@ static inline void volk_16ic_x2_dot_prod_16ic_u_sse2(lv_16sc_t* out, const lv_16 ++ lv_16sc_t* _out = out; ++ unsigned int number; ++ ++- if (sse_iters > 0) ++- { ++- __m128i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl, a_sl, realcacc, imagcacc, result; ++- __VOLK_ATTR_ALIGNED(16) lv_16sc_t dotProductVector[4]; +++ if (sse_iters > 0) { +++ __m128i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl, a_sl, +++ realcacc, imagcacc, result; +++ __VOLK_ATTR_ALIGNED(16) lv_16sc_t dotProductVector[4]; ++ ++- realcacc = _mm_setzero_si128(); ++- imagcacc = _mm_setzero_si128(); +++ realcacc = _mm_setzero_si128(); +++ imagcacc = _mm_setzero_si128(); ++ ++- mask_imag = _mm_set_epi8(0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0); ++- mask_real = _mm_set_epi8(0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF); +++ mask_imag = _mm_set_epi8( +++ 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0); +++ mask_real = _mm_set_epi8( +++ 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF); ++ ++- for(number = 0; number < sse_iters; number++) ++- { ++- // a[127:0]=[a3.i,a3.r,a2.i,a2.r,a1.i,a1.r,a0.i,a0.r] ++- a = 
_mm_loadu_si128((__m128i*)_in_a); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg ++- __VOLK_PREFETCH(_in_a + 8); ++- b = _mm_loadu_si128((__m128i*)_in_b); ++- __VOLK_PREFETCH(_in_b + 8); ++- c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, .... +++ for (number = 0; number < sse_iters; number++) { +++ // a[127:0]=[a3.i,a3.r,a2.i,a2.r,a1.i,a1.r,a0.i,a0.r] +++ a = _mm_loadu_si128( +++ (__m128i*)_in_a); // load (2 byte imag, 2 byte real) x 4 into 128 bits reg +++ __VOLK_PREFETCH(_in_a + 8); +++ b = _mm_loadu_si128((__m128i*)_in_b); +++ __VOLK_PREFETCH(_in_b + 8); +++ c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, .... ++ ++- c_sr = _mm_srli_si128(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst. ++- real = _mm_subs_epi16(c, c_sr); +++ c_sr = _mm_srli_si128(c, 2); // Shift a right by imm8 bytes while shifting in +++ // zeros, and store the results in dst. +++ real = _mm_subs_epi16(c, c_sr); ++ ++- b_sl = _mm_slli_si128(b, 2); // b3.r, b2.i .... ++- a_sl = _mm_slli_si128(a, 2); // a3.r, a2.i .... +++ b_sl = _mm_slli_si128(b, 2); // b3.r, b2.i .... +++ a_sl = _mm_slli_si128(a, 2); // a3.r, a2.i .... ++ ++- imag1 = _mm_mullo_epi16(a, b_sl); // a3.i*b3.r, .... ++- imag2 = _mm_mullo_epi16(b, a_sl); // b3.i*a3.r, .... +++ imag1 = _mm_mullo_epi16(a, b_sl); // a3.i*b3.r, .... +++ imag2 = _mm_mullo_epi16(b, a_sl); // b3.i*a3.r, .... ++ ++- imag = _mm_adds_epi16(imag1, imag2); //with saturation arithmetic! +++ imag = _mm_adds_epi16(imag1, imag2); // with saturation arithmetic! ++ ++- realcacc = _mm_adds_epi16(realcacc, real); ++- imagcacc = _mm_adds_epi16(imagcacc, imag); +++ realcacc = _mm_adds_epi16(realcacc, real); +++ imagcacc = _mm_adds_epi16(imagcacc, imag); ++ ++- _in_a += 4; ++- _in_b += 4; ++- } +++ _in_a += 4; +++ _in_b += 4; +++ } ++ ++- realcacc = _mm_and_si128(realcacc, mask_real); ++- imagcacc = _mm_and_si128(imagcacc, mask_imag); +++ realcacc = _mm_and_si128(realcacc, mask_real); +++ imagcacc = _mm_and_si128(imagcacc, mask_imag); ++ ++- result = _mm_or_si128(realcacc, imagcacc); +++ result = _mm_or_si128(realcacc, imagcacc); ++ ++- _mm_storeu_si128((__m128i*)dotProductVector, result); // Store the results back into the dot product vector +++ _mm_storeu_si128((__m128i*)dotProductVector, +++ result); // Store the results back into the dot product vector ++ ++- for (number = 0; number < 4; ++number) ++- { ++- dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[number])), sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[number]))); ++- } +++ for (number = 0; number < 4; ++number) { +++ dotProduct = lv_cmake( +++ sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[number])), +++ sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[number]))); ++ } +++ } ++ ++- for (number = 0; number < (num_points % 4); ++number) ++- { ++- lv_16sc_t tmp = (*_in_a++) * (*_in_b++); ++- dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(tmp)), sat_adds16i(lv_cimag(dotProduct), lv_cimag(tmp))); ++- } +++ for (number = 0; number < (num_points % 4); ++number) { +++ lv_16sc_t tmp = (*_in_a++) * (*_in_b++); +++ dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(tmp)), +++ sat_adds16i(lv_cimag(dotProduct), lv_cimag(tmp))); +++ } ++ ++ *_out = dotProduct; ++ } ++@@ -223,7 +244,10 @@ static inline void volk_16ic_x2_dot_prod_16ic_u_sse2(lv_16sc_t* out, const lv_16 ++ #ifdef LV_HAVE_AVX2 ++ #include ++ ++-static inline void volk_16ic_x2_dot_prod_16ic_u_axv2(lv_16sc_t* out, const 
lv_16sc_t* in_a, const lv_16sc_t* in_b, unsigned int num_points) +++static inline void volk_16ic_x2_dot_prod_16ic_u_axv2(lv_16sc_t* out, +++ const lv_16sc_t* in_a, +++ const lv_16sc_t* in_b, +++ unsigned int num_points) ++ { ++ lv_16sc_t dotProduct = lv_cmake((int16_t)0, (int16_t)0); ++ ++@@ -234,62 +258,126 @@ static inline void volk_16ic_x2_dot_prod_16ic_u_axv2(lv_16sc_t* out, const lv_16 ++ lv_16sc_t* _out = out; ++ unsigned int number; ++ ++- if (avx_iters > 0) ++- { ++- __m256i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl, a_sl, realcacc, imagcacc, result; ++- __VOLK_ATTR_ALIGNED(32) lv_16sc_t dotProductVector[8]; ++- ++- realcacc = _mm256_setzero_si256(); ++- imagcacc = _mm256_setzero_si256(); ++- ++- mask_imag = _mm256_set_epi8(0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0); ++- mask_real = _mm256_set_epi8(0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF); ++- ++- for(number = 0; number < avx_iters; number++) ++- { ++- a = _mm256_loadu_si256((__m256i*)_in_a); ++- __VOLK_PREFETCH(_in_a + 16); ++- b = _mm256_loadu_si256((__m256i*)_in_b); ++- __VOLK_PREFETCH(_in_b + 16); ++- c = _mm256_mullo_epi16(a, b); ++- ++- c_sr = _mm256_srli_si256(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst. ++- real = _mm256_subs_epi16(c, c_sr); ++- ++- b_sl = _mm256_slli_si256(b, 2); ++- a_sl = _mm256_slli_si256(a, 2); ++- ++- imag1 = _mm256_mullo_epi16(a, b_sl); ++- imag2 = _mm256_mullo_epi16(b, a_sl); ++- ++- imag = _mm256_adds_epi16(imag1, imag2); //with saturation arithmetic! ++- ++- realcacc = _mm256_adds_epi16(realcacc, real); ++- imagcacc = _mm256_adds_epi16(imagcacc, imag); ++- ++- _in_a += 8; ++- _in_b += 8; ++- } +++ if (avx_iters > 0) { +++ __m256i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl, a_sl, +++ realcacc, imagcacc, result; +++ __VOLK_ATTR_ALIGNED(32) lv_16sc_t dotProductVector[8]; +++ +++ realcacc = _mm256_setzero_si256(); +++ imagcacc = _mm256_setzero_si256(); +++ +++ mask_imag = _mm256_set_epi8(0xFF, +++ 0xFF, +++ 0, +++ 0, +++ 0xFF, +++ 0xFF, +++ 0, +++ 0, +++ 0xFF, +++ 0xFF, +++ 0, +++ 0, +++ 0xFF, +++ 0xFF, +++ 0, +++ 0, +++ 0xFF, +++ 0xFF, +++ 0, +++ 0, +++ 0xFF, +++ 0xFF, +++ 0, +++ 0, +++ 0xFF, +++ 0xFF, +++ 0, +++ 0, +++ 0xFF, +++ 0xFF, +++ 0, +++ 0); +++ mask_real = _mm256_set_epi8(0, +++ 0, +++ 0xFF, +++ 0xFF, +++ 0, +++ 0, +++ 0xFF, +++ 0xFF, +++ 0, +++ 0, +++ 0xFF, +++ 0xFF, +++ 0, +++ 0, +++ 0xFF, +++ 0xFF, +++ 0, +++ 0, +++ 0xFF, +++ 0xFF, +++ 0, +++ 0, +++ 0xFF, +++ 0xFF, +++ 0, +++ 0, +++ 0xFF, +++ 0xFF, +++ 0, +++ 0, +++ 0xFF, +++ 0xFF); +++ +++ for (number = 0; number < avx_iters; number++) { +++ a = _mm256_loadu_si256((__m256i*)_in_a); +++ __VOLK_PREFETCH(_in_a + 16); +++ b = _mm256_loadu_si256((__m256i*)_in_b); +++ __VOLK_PREFETCH(_in_b + 16); +++ c = _mm256_mullo_epi16(a, b); +++ +++ c_sr = _mm256_srli_si256(c, 2); // Shift a right by imm8 bytes while shifting +++ // in zeros, and store the results in dst. +++ real = _mm256_subs_epi16(c, c_sr); +++ +++ b_sl = _mm256_slli_si256(b, 2); +++ a_sl = _mm256_slli_si256(a, 2); +++ +++ imag1 = _mm256_mullo_epi16(a, b_sl); +++ imag2 = _mm256_mullo_epi16(b, a_sl); +++ +++ imag = _mm256_adds_epi16(imag1, imag2); // with saturation arithmetic! 
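/*
 * Editorial aside (not part of the upstream patch): the doxygen block earlier
 * in this file gives the dispatcher prototype
 *   void volk_16ic_x2_dot_prod_16ic(lv_16sc_t* result, const lv_16sc_t* in_a,
 *                                   const lv_16sc_t* in_b, unsigned int num_points);
 * A minimal usage sketch follows.  It assumes <volk/volk.h> declares that
 * dispatcher together with lv_16sc_t and the lv_cmake()/lv_creal()/lv_cimag()
 * helpers used throughout these kernels; the dispatcher selects the generic,
 * SSE2, AVX2 or NEON implementation at run time.
 */
#include <stdio.h>
#include <volk/volk.h>

int main(void)
{
    enum { N = 4 };
    lv_16sc_t a[N], b[N], result;
    unsigned int n;

    /* Interleaved 16-bit complex inputs: a[n] = (n+1) + 1i, b[n] = 2 + 0i. */
    for (n = 0; n < N; n++) {
        a[n] = lv_cmake((int16_t)(n + 1), (int16_t)1);
        b[n] = lv_cmake((int16_t)2, (int16_t)0);
    }

    volk_16ic_x2_dot_prod_16ic(&result, a, b, N);

    /* Expected: sum of 2*(n+1) + 2i for n = 0..3, i.e. 20 + 8i. */
    printf("dot product = %d%+di\n", (int)lv_creal(result), (int)lv_cimag(result));
    return 0;
}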
+++ +++ realcacc = _mm256_adds_epi16(realcacc, real); +++ imagcacc = _mm256_adds_epi16(imagcacc, imag); +++ +++ _in_a += 8; +++ _in_b += 8; +++ } ++ ++- realcacc = _mm256_and_si256(realcacc, mask_real); ++- imagcacc = _mm256_and_si256(imagcacc, mask_imag); +++ realcacc = _mm256_and_si256(realcacc, mask_real); +++ imagcacc = _mm256_and_si256(imagcacc, mask_imag); ++ ++- result = _mm256_or_si256(realcacc, imagcacc); +++ result = _mm256_or_si256(realcacc, imagcacc); ++ ++- _mm256_storeu_si256((__m256i*)dotProductVector, result); // Store the results back into the dot product vector ++- _mm256_zeroupper(); +++ _mm256_storeu_si256((__m256i*)dotProductVector, +++ result); // Store the results back into the dot product vector +++ _mm256_zeroupper(); ++ ++- for (number = 0; number < 8; ++number) ++- { ++- dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[number])), sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[number]))); ++- } +++ for (number = 0; number < 8; ++number) { +++ dotProduct = lv_cmake( +++ sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[number])), +++ sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[number]))); ++ } +++ } ++ ++- for (number = 0; number < (num_points % 8); ++number) ++- { ++- lv_16sc_t tmp = (*_in_a++) * (*_in_b++); ++- dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(tmp)), sat_adds16i(lv_cimag(dotProduct), lv_cimag(tmp))); ++- } +++ for (number = 0; number < (num_points % 8); ++number) { +++ lv_16sc_t tmp = (*_in_a++) * (*_in_b++); +++ dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(tmp)), +++ sat_adds16i(lv_cimag(dotProduct), lv_cimag(tmp))); +++ } ++ ++ *_out = dotProduct; ++ } ++@@ -299,7 +387,10 @@ static inline void volk_16ic_x2_dot_prod_16ic_u_axv2(lv_16sc_t* out, const lv_16 ++ #ifdef LV_HAVE_AVX2 ++ #include ++ ++-static inline void volk_16ic_x2_dot_prod_16ic_a_axv2(lv_16sc_t* out, const lv_16sc_t* in_a, const lv_16sc_t* in_b, unsigned int num_points) +++static inline void volk_16ic_x2_dot_prod_16ic_a_axv2(lv_16sc_t* out, +++ const lv_16sc_t* in_a, +++ const lv_16sc_t* in_b, +++ unsigned int num_points) ++ { ++ lv_16sc_t dotProduct = lv_cmake((int16_t)0, (int16_t)0); ++ ++@@ -310,62 +401,126 @@ static inline void volk_16ic_x2_dot_prod_16ic_a_axv2(lv_16sc_t* out, const lv_16 ++ lv_16sc_t* _out = out; ++ unsigned int number; ++ ++- if (avx_iters > 0) ++- { ++- __m256i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl, a_sl, realcacc, imagcacc, result; ++- __VOLK_ATTR_ALIGNED(32) lv_16sc_t dotProductVector[8]; ++- ++- realcacc = _mm256_setzero_si256(); ++- imagcacc = _mm256_setzero_si256(); ++- ++- mask_imag = _mm256_set_epi8(0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0); ++- mask_real = _mm256_set_epi8(0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF); ++- ++- for(number = 0; number < avx_iters; number++) ++- { ++- a = _mm256_load_si256((__m256i*)_in_a); ++- __VOLK_PREFETCH(_in_a + 16); ++- b = _mm256_load_si256((__m256i*)_in_b); ++- __VOLK_PREFETCH(_in_b + 16); ++- c = _mm256_mullo_epi16(a, b); ++- ++- c_sr = _mm256_srli_si256(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst. 
++- real = _mm256_subs_epi16(c, c_sr); ++- ++- b_sl = _mm256_slli_si256(b, 2); ++- a_sl = _mm256_slli_si256(a, 2); ++- ++- imag1 = _mm256_mullo_epi16(a, b_sl); ++- imag2 = _mm256_mullo_epi16(b, a_sl); ++- ++- imag = _mm256_adds_epi16(imag1, imag2); //with saturation arithmetic! ++- ++- realcacc = _mm256_adds_epi16(realcacc, real); ++- imagcacc = _mm256_adds_epi16(imagcacc, imag); ++- ++- _in_a += 8; ++- _in_b += 8; ++- } +++ if (avx_iters > 0) { +++ __m256i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl, a_sl, +++ realcacc, imagcacc, result; +++ __VOLK_ATTR_ALIGNED(32) lv_16sc_t dotProductVector[8]; +++ +++ realcacc = _mm256_setzero_si256(); +++ imagcacc = _mm256_setzero_si256(); +++ +++ mask_imag = _mm256_set_epi8(0xFF, +++ 0xFF, +++ 0, +++ 0, +++ 0xFF, +++ 0xFF, +++ 0, +++ 0, +++ 0xFF, +++ 0xFF, +++ 0, +++ 0, +++ 0xFF, +++ 0xFF, +++ 0, +++ 0, +++ 0xFF, +++ 0xFF, +++ 0, +++ 0, +++ 0xFF, +++ 0xFF, +++ 0, +++ 0, +++ 0xFF, +++ 0xFF, +++ 0, +++ 0, +++ 0xFF, +++ 0xFF, +++ 0, +++ 0); +++ mask_real = _mm256_set_epi8(0, +++ 0, +++ 0xFF, +++ 0xFF, +++ 0, +++ 0, +++ 0xFF, +++ 0xFF, +++ 0, +++ 0, +++ 0xFF, +++ 0xFF, +++ 0, +++ 0, +++ 0xFF, +++ 0xFF, +++ 0, +++ 0, +++ 0xFF, +++ 0xFF, +++ 0, +++ 0, +++ 0xFF, +++ 0xFF, +++ 0, +++ 0, +++ 0xFF, +++ 0xFF, +++ 0, +++ 0, +++ 0xFF, +++ 0xFF); +++ +++ for (number = 0; number < avx_iters; number++) { +++ a = _mm256_load_si256((__m256i*)_in_a); +++ __VOLK_PREFETCH(_in_a + 16); +++ b = _mm256_load_si256((__m256i*)_in_b); +++ __VOLK_PREFETCH(_in_b + 16); +++ c = _mm256_mullo_epi16(a, b); +++ +++ c_sr = _mm256_srli_si256(c, 2); // Shift a right by imm8 bytes while shifting +++ // in zeros, and store the results in dst. +++ real = _mm256_subs_epi16(c, c_sr); +++ +++ b_sl = _mm256_slli_si256(b, 2); +++ a_sl = _mm256_slli_si256(a, 2); +++ +++ imag1 = _mm256_mullo_epi16(a, b_sl); +++ imag2 = _mm256_mullo_epi16(b, a_sl); +++ +++ imag = _mm256_adds_epi16(imag1, imag2); // with saturation arithmetic! 
+++ +++ realcacc = _mm256_adds_epi16(realcacc, real); +++ imagcacc = _mm256_adds_epi16(imagcacc, imag); +++ +++ _in_a += 8; +++ _in_b += 8; +++ } ++ ++- realcacc = _mm256_and_si256(realcacc, mask_real); ++- imagcacc = _mm256_and_si256(imagcacc, mask_imag); +++ realcacc = _mm256_and_si256(realcacc, mask_real); +++ imagcacc = _mm256_and_si256(imagcacc, mask_imag); ++ ++- result = _mm256_or_si256(realcacc, imagcacc); +++ result = _mm256_or_si256(realcacc, imagcacc); ++ ++- _mm256_store_si256((__m256i*)dotProductVector, result); // Store the results back into the dot product vector ++- _mm256_zeroupper(); +++ _mm256_store_si256((__m256i*)dotProductVector, +++ result); // Store the results back into the dot product vector +++ _mm256_zeroupper(); ++ ++- for (number = 0; number < 8; ++number) ++- { ++- dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[number])), sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[number]))); ++- } +++ for (number = 0; number < 8; ++number) { +++ dotProduct = lv_cmake( +++ sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[number])), +++ sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[number]))); ++ } +++ } ++ ++- for (number = 0; number < (num_points % 8); ++number) ++- { ++- lv_16sc_t tmp = (*_in_a++) * (*_in_b++); ++- dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(tmp)), sat_adds16i(lv_cimag(dotProduct), lv_cimag(tmp))); ++- } +++ for (number = 0; number < (num_points % 8); ++number) { +++ lv_16sc_t tmp = (*_in_a++) * (*_in_b++); +++ dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(tmp)), +++ sat_adds16i(lv_cimag(dotProduct), lv_cimag(tmp))); +++ } ++ ++ *_out = dotProduct; ++ } ++@@ -375,69 +530,70 @@ static inline void volk_16ic_x2_dot_prod_16ic_a_axv2(lv_16sc_t* out, const lv_16 ++ #ifdef LV_HAVE_NEON ++ #include ++ ++-static inline void volk_16ic_x2_dot_prod_16ic_neon(lv_16sc_t* out, const lv_16sc_t* in_a, const lv_16sc_t* in_b, unsigned int num_points) +++static inline void volk_16ic_x2_dot_prod_16ic_neon(lv_16sc_t* out, +++ const lv_16sc_t* in_a, +++ const lv_16sc_t* in_b, +++ unsigned int num_points) ++ { ++ unsigned int quarter_points = num_points / 4; ++ unsigned int number; ++ ++- lv_16sc_t* a_ptr = (lv_16sc_t*) in_a; ++- lv_16sc_t* b_ptr = (lv_16sc_t*) in_b; +++ lv_16sc_t* a_ptr = (lv_16sc_t*)in_a; +++ lv_16sc_t* b_ptr = (lv_16sc_t*)in_b; ++ *out = lv_cmake((int16_t)0, (int16_t)0); ++ ++- if (quarter_points > 0) ++- { ++- // for 2-lane vectors, 1st lane holds the real part, ++- // 2nd lane holds the imaginary part ++- int16x4x2_t a_val, b_val, c_val, accumulator; ++- int16x4x2_t tmp_real, tmp_imag; ++- __VOLK_ATTR_ALIGNED(16) lv_16sc_t accum_result[4]; ++- accumulator.val[0] = vdup_n_s16(0); ++- accumulator.val[1] = vdup_n_s16(0); ++- lv_16sc_t dotProduct = lv_cmake((int16_t)0, (int16_t)0); ++- ++- for(number = 0; number < quarter_points; ++number) ++- { ++- a_val = vld2_s16((int16_t*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i ++- b_val = vld2_s16((int16_t*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i ++- __VOLK_PREFETCH(a_ptr + 8); ++- __VOLK_PREFETCH(b_ptr + 8); ++- ++- // multiply the real*real and imag*imag to get real result ++- // a0r*b0r|a1r*b1r|a2r*b2r|a3r*b3r ++- tmp_real.val[0] = vmul_s16(a_val.val[0], b_val.val[0]); ++- // a0i*b0i|a1i*b1i|a2i*b2i|a3i*b3i ++- tmp_real.val[1] = vmul_s16(a_val.val[1], b_val.val[1]); ++- ++- // Multiply cross terms to get the imaginary result ++- // a0r*b0i|a1r*b1i|a2r*b2i|a3r*b3i ++- tmp_imag.val[0] = 
vmul_s16(a_val.val[0], b_val.val[1]); ++- // a0i*b0r|a1i*b1r|a2i*b2r|a3i*b3r ++- tmp_imag.val[1] = vmul_s16(a_val.val[1], b_val.val[0]); ++- ++- c_val.val[0] = vqsub_s16(tmp_real.val[0], tmp_real.val[1]); ++- c_val.val[1] = vqadd_s16(tmp_imag.val[0], tmp_imag.val[1]); ++- ++- accumulator.val[0] = vqadd_s16(accumulator.val[0], c_val.val[0]); ++- accumulator.val[1] = vqadd_s16(accumulator.val[1], c_val.val[1]); ++- ++- a_ptr += 4; ++- b_ptr += 4; ++- } ++- ++- vst2_s16((int16_t*)accum_result, accumulator); ++- for (number = 0; number < 4; ++number) ++- { ++- dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(accum_result[number])), sat_adds16i(lv_cimag(dotProduct), lv_cimag(accum_result[number]))); ++- } ++- ++- *out = dotProduct; +++ if (quarter_points > 0) { +++ // for 2-lane vectors, 1st lane holds the real part, +++ // 2nd lane holds the imaginary part +++ int16x4x2_t a_val, b_val, c_val, accumulator; +++ int16x4x2_t tmp_real, tmp_imag; +++ __VOLK_ATTR_ALIGNED(16) lv_16sc_t accum_result[4]; +++ accumulator.val[0] = vdup_n_s16(0); +++ accumulator.val[1] = vdup_n_s16(0); +++ lv_16sc_t dotProduct = lv_cmake((int16_t)0, (int16_t)0); +++ +++ for (number = 0; number < quarter_points; ++number) { +++ a_val = vld2_s16((int16_t*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i +++ b_val = vld2_s16((int16_t*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i +++ __VOLK_PREFETCH(a_ptr + 8); +++ __VOLK_PREFETCH(b_ptr + 8); +++ +++ // multiply the real*real and imag*imag to get real result +++ // a0r*b0r|a1r*b1r|a2r*b2r|a3r*b3r +++ tmp_real.val[0] = vmul_s16(a_val.val[0], b_val.val[0]); +++ // a0i*b0i|a1i*b1i|a2i*b2i|a3i*b3i +++ tmp_real.val[1] = vmul_s16(a_val.val[1], b_val.val[1]); +++ +++ // Multiply cross terms to get the imaginary result +++ // a0r*b0i|a1r*b1i|a2r*b2i|a3r*b3i +++ tmp_imag.val[0] = vmul_s16(a_val.val[0], b_val.val[1]); +++ // a0i*b0r|a1i*b1r|a2i*b2r|a3i*b3r +++ tmp_imag.val[1] = vmul_s16(a_val.val[1], b_val.val[0]); +++ +++ c_val.val[0] = vqsub_s16(tmp_real.val[0], tmp_real.val[1]); +++ c_val.val[1] = vqadd_s16(tmp_imag.val[0], tmp_imag.val[1]); +++ +++ accumulator.val[0] = vqadd_s16(accumulator.val[0], c_val.val[0]); +++ accumulator.val[1] = vqadd_s16(accumulator.val[1], c_val.val[1]); +++ +++ a_ptr += 4; +++ b_ptr += 4; ++ } ++ ++- // tail case ++- for(number = quarter_points * 4; number < num_points; ++number) ++- { ++- *out += (*a_ptr++) * (*b_ptr++); +++ vst2_s16((int16_t*)accum_result, accumulator); +++ for (number = 0; number < 4; ++number) { +++ dotProduct = lv_cmake( +++ sat_adds16i(lv_creal(dotProduct), lv_creal(accum_result[number])), +++ sat_adds16i(lv_cimag(dotProduct), lv_cimag(accum_result[number]))); ++ } +++ +++ *out = dotProduct; +++ } +++ +++ // tail case +++ for (number = quarter_points * 4; number < num_points; ++number) { +++ *out += (*a_ptr++) * (*b_ptr++); +++ } ++ } ++ ++ #endif /* LV_HAVE_NEON */ ++@@ -446,13 +602,16 @@ static inline void volk_16ic_x2_dot_prod_16ic_neon(lv_16sc_t* out, const lv_16sc ++ #ifdef LV_HAVE_NEON ++ #include ++ ++-static inline void volk_16ic_x2_dot_prod_16ic_neon_vma(lv_16sc_t* out, const lv_16sc_t* in_a, const lv_16sc_t* in_b, unsigned int num_points) +++static inline void volk_16ic_x2_dot_prod_16ic_neon_vma(lv_16sc_t* out, +++ const lv_16sc_t* in_a, +++ const lv_16sc_t* in_b, +++ unsigned int num_points) ++ { ++ unsigned int quarter_points = num_points / 4; ++ unsigned int number; ++ ++- lv_16sc_t* a_ptr = (lv_16sc_t*) in_a; ++- lv_16sc_t* b_ptr = (lv_16sc_t*) in_b; +++ lv_16sc_t* a_ptr = (lv_16sc_t*)in_a; +++ 
lv_16sc_t* b_ptr = (lv_16sc_t*)in_b; ++ // for 2-lane vectors, 1st lane holds the real part, ++ // 2nd lane holds the imaginary part ++ int16x4x2_t a_val, b_val, accumulator; ++@@ -461,35 +620,33 @@ static inline void volk_16ic_x2_dot_prod_16ic_neon_vma(lv_16sc_t* out, const lv_ ++ accumulator.val[0] = vdup_n_s16(0); ++ accumulator.val[1] = vdup_n_s16(0); ++ ++- for(number = 0; number < quarter_points; ++number) ++- { ++- a_val = vld2_s16((int16_t*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i ++- b_val = vld2_s16((int16_t*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i ++- __VOLK_PREFETCH(a_ptr + 8); ++- __VOLK_PREFETCH(b_ptr + 8); +++ for (number = 0; number < quarter_points; ++number) { +++ a_val = vld2_s16((int16_t*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i +++ b_val = vld2_s16((int16_t*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i +++ __VOLK_PREFETCH(a_ptr + 8); +++ __VOLK_PREFETCH(b_ptr + 8); ++ ++- tmp.val[0] = vmul_s16(a_val.val[0], b_val.val[0]); ++- tmp.val[1] = vmul_s16(a_val.val[1], b_val.val[0]); +++ tmp.val[0] = vmul_s16(a_val.val[0], b_val.val[0]); +++ tmp.val[1] = vmul_s16(a_val.val[1], b_val.val[0]); ++ ++- // use multiply accumulate/subtract to get result ++- tmp.val[0] = vmls_s16(tmp.val[0], a_val.val[1], b_val.val[1]); ++- tmp.val[1] = vmla_s16(tmp.val[1], a_val.val[0], b_val.val[1]); +++ // use multiply accumulate/subtract to get result +++ tmp.val[0] = vmls_s16(tmp.val[0], a_val.val[1], b_val.val[1]); +++ tmp.val[1] = vmla_s16(tmp.val[1], a_val.val[0], b_val.val[1]); ++ ++- accumulator.val[0] = vqadd_s16(accumulator.val[0], tmp.val[0]); ++- accumulator.val[1] = vqadd_s16(accumulator.val[1], tmp.val[1]); +++ accumulator.val[0] = vqadd_s16(accumulator.val[0], tmp.val[0]); +++ accumulator.val[1] = vqadd_s16(accumulator.val[1], tmp.val[1]); ++ ++- a_ptr += 4; ++- b_ptr += 4; ++- } +++ a_ptr += 4; +++ b_ptr += 4; +++ } ++ ++ vst2_s16((int16_t*)accum_result, accumulator); ++ *out = accum_result[0] + accum_result[1] + accum_result[2] + accum_result[3]; ++ ++ // tail case ++- for(number = quarter_points * 4; number < num_points; ++number) ++- { ++- *out += (*a_ptr++) * (*b_ptr++); ++- } +++ for (number = quarter_points * 4; number < num_points; ++number) { +++ *out += (*a_ptr++) * (*b_ptr++); +++ } ++ } ++ ++ #endif /* LV_HAVE_NEON */ ++@@ -498,13 +655,16 @@ static inline void volk_16ic_x2_dot_prod_16ic_neon_vma(lv_16sc_t* out, const lv_ ++ #ifdef LV_HAVE_NEON ++ #include ++ ++-static inline void volk_16ic_x2_dot_prod_16ic_neon_optvma(lv_16sc_t* out, const lv_16sc_t* in_a, const lv_16sc_t* in_b, unsigned int num_points) +++static inline void volk_16ic_x2_dot_prod_16ic_neon_optvma(lv_16sc_t* out, +++ const lv_16sc_t* in_a, +++ const lv_16sc_t* in_b, +++ unsigned int num_points) ++ { ++ unsigned int quarter_points = num_points / 4; ++ unsigned int number; ++ ++- lv_16sc_t* a_ptr = (lv_16sc_t*) in_a; ++- lv_16sc_t* b_ptr = (lv_16sc_t*) in_b; +++ lv_16sc_t* a_ptr = (lv_16sc_t*)in_a; +++ lv_16sc_t* b_ptr = (lv_16sc_t*)in_b; ++ // for 2-lane vectors, 1st lane holds the real part, ++ // 2nd lane holds the imaginary part ++ int16x4x2_t a_val, b_val, accumulator1, accumulator2; ++@@ -515,22 +675,21 @@ static inline void volk_16ic_x2_dot_prod_16ic_neon_optvma(lv_16sc_t* out, const ++ accumulator2.val[0] = vdup_n_s16(0); ++ accumulator2.val[1] = vdup_n_s16(0); ++ ++- for(number = 0; number < quarter_points; ++number) ++- { ++- a_val = vld2_s16((int16_t*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i ++- b_val = vld2_s16((int16_t*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i ++- 
__VOLK_PREFETCH(a_ptr + 8); ++- __VOLK_PREFETCH(b_ptr + 8); +++ for (number = 0; number < quarter_points; ++number) { +++ a_val = vld2_s16((int16_t*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i +++ b_val = vld2_s16((int16_t*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i +++ __VOLK_PREFETCH(a_ptr + 8); +++ __VOLK_PREFETCH(b_ptr + 8); ++ ++- // use 2 accumulators to remove inter-instruction data dependencies ++- accumulator1.val[0] = vmla_s16(accumulator1.val[0], a_val.val[0], b_val.val[0]); ++- accumulator2.val[0] = vmls_s16(accumulator2.val[0], a_val.val[1], b_val.val[1]); ++- accumulator1.val[1] = vmla_s16(accumulator1.val[1], a_val.val[0], b_val.val[1]); ++- accumulator2.val[1] = vmla_s16(accumulator2.val[1], a_val.val[1], b_val.val[0]); +++ // use 2 accumulators to remove inter-instruction data dependencies +++ accumulator1.val[0] = vmla_s16(accumulator1.val[0], a_val.val[0], b_val.val[0]); +++ accumulator2.val[0] = vmls_s16(accumulator2.val[0], a_val.val[1], b_val.val[1]); +++ accumulator1.val[1] = vmla_s16(accumulator1.val[1], a_val.val[0], b_val.val[1]); +++ accumulator2.val[1] = vmla_s16(accumulator2.val[1], a_val.val[1], b_val.val[0]); ++ ++- a_ptr += 4; ++- b_ptr += 4; ++- } +++ a_ptr += 4; +++ b_ptr += 4; +++ } ++ ++ accumulator1.val[0] = vqadd_s16(accumulator1.val[0], accumulator2.val[0]); ++ accumulator1.val[1] = vqadd_s16(accumulator1.val[1], accumulator2.val[1]); ++@@ -539,10 +698,9 @@ static inline void volk_16ic_x2_dot_prod_16ic_neon_optvma(lv_16sc_t* out, const ++ *out = accum_result[0] + accum_result[1] + accum_result[2] + accum_result[3]; ++ ++ // tail case ++- for(number = quarter_points * 4; number < num_points; ++number) ++- { ++- *out += (*a_ptr++) * (*b_ptr++); ++- } +++ for (number = quarter_points * 4; number < num_points; ++number) { +++ *out += (*a_ptr++) * (*b_ptr++); +++ } ++ } ++ ++ #endif /* LV_HAVE_NEON */ ++diff --git a/kernels/volk/volk_16ic_x2_multiply_16ic.h b/kernels/volk/volk_16ic_x2_multiply_16ic.h ++index 20d6a7f..2bf835d 100644 ++--- a/kernels/volk/volk_16ic_x2_multiply_16ic.h +++++ b/kernels/volk/volk_16ic_x2_multiply_16ic.h ++@@ -25,18 +25,19 @@ ++ * ++ * \b Overview ++ * ++- * Multiplies two input complex vectors, point-by-point, storing the result in the third vector. ++- * WARNING: Saturation is not checked. +++ * Multiplies two input complex vectors, point-by-point, storing the result in the third +++ * vector. WARNING: Saturation is not checked. ++ * ++ * Dispatcher Prototype ++ * \code ++- * void volk_16ic_x2_multiply_16ic(lv_16sc_t* result, const lv_16sc_t* in_a, const lv_16sc_t* in_b, unsigned int num_points); ++- * \endcode +++ * void volk_16ic_x2_multiply_16ic(lv_16sc_t* result, const lv_16sc_t* in_a, const +++ * lv_16sc_t* in_b, unsigned int num_points); \endcode ++ * ++ * \b Inputs ++ * \li in_a: One of the vectors to be multiplied. ++ * \li in_b: The other vector to be multiplied. ++- * \li num_points: The number of complex data points to be multiplied from both input vectors. +++ * \li num_points: The number of complex data points to be multiplied from both input +++ * vectors. ++ * ++ * \b Outputs ++ * \li result: The vector where the results will be stored. 
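As a point of reference for the documentation above, a minimal caller of the volk_16ic_x2_multiply_16ic dispatcher could look like the sketch below. It assumes only the public volk.h entry points (volk_get_alignment, volk_malloc, volk_free) and the lv_16sc_t / lv_cmake / lv_creal / lv_cimag helpers; the dispatcher prototype is the one given in the header comment, and the dispatcher selects the best available implementation (generic, SSE2, AVX2 or NEON) at run time.

    #include <stdio.h>
    #include <volk/volk.h>

    int main(void)
    {
        const unsigned int num_points = 16;
        const size_t alignment = volk_get_alignment();

        /* volk_malloc returns buffers aligned for the _a (aligned) kernels. */
        lv_16sc_t* in_a = (lv_16sc_t*)volk_malloc(num_points * sizeof(lv_16sc_t), alignment);
        lv_16sc_t* in_b = (lv_16sc_t*)volk_malloc(num_points * sizeof(lv_16sc_t), alignment);
        lv_16sc_t* result = (lv_16sc_t*)volk_malloc(num_points * sizeof(lv_16sc_t), alignment);

        for (unsigned int n = 0; n < num_points; n++) {
            in_a[n] = lv_cmake((int16_t)(n + 1), (int16_t)1);
            in_b[n] = lv_cmake((int16_t)2, (int16_t)-1);
        }

        /* Point-by-point complex multiply; saturation is not checked. */
        volk_16ic_x2_multiply_16ic(result, in_a, in_b, num_points);

        printf("result[0] = %d%+di\n", lv_creal(result[0]), lv_cimag(result[0]));

        volk_free(in_a);
        volk_free(in_b);
        volk_free(result);
        return 0;
    }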
++@@ -51,13 +52,15 @@ ++ ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void volk_16ic_x2_multiply_16ic_generic(lv_16sc_t* result, const lv_16sc_t* in_a, const lv_16sc_t* in_b, unsigned int num_points) +++static inline void volk_16ic_x2_multiply_16ic_generic(lv_16sc_t* result, +++ const lv_16sc_t* in_a, +++ const lv_16sc_t* in_b, +++ unsigned int num_points) ++ { ++ unsigned int n; ++- for (n = 0; n < num_points; n++) ++- { ++- result[n] = in_a[n] * in_b[n]; ++- } +++ for (n = 0; n < num_points; n++) { +++ result[n] = in_a[n] * in_b[n]; +++ } ++ } ++ ++ #endif /*LV_HAVE_GENERIC*/ ++@@ -66,51 +69,58 @@ static inline void volk_16ic_x2_multiply_16ic_generic(lv_16sc_t* result, const l ++ #ifdef LV_HAVE_SSE2 ++ #include ++ ++-static inline void volk_16ic_x2_multiply_16ic_a_sse2(lv_16sc_t* out, const lv_16sc_t* in_a, const lv_16sc_t* in_b, unsigned int num_points) +++static inline void volk_16ic_x2_multiply_16ic_a_sse2(lv_16sc_t* out, +++ const lv_16sc_t* in_a, +++ const lv_16sc_t* in_b, +++ unsigned int num_points) ++ { ++ const unsigned int sse_iters = num_points / 4; ++- __m128i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl, a_sl, result; +++ __m128i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl, a_sl, +++ result; ++ ++- mask_imag = _mm_set_epi8(0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0); ++- mask_real = _mm_set_epi8(0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF); +++ mask_imag = _mm_set_epi8( +++ 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0); +++ mask_real = _mm_set_epi8( +++ 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF); ++ ++ const lv_16sc_t* _in_a = in_a; ++ const lv_16sc_t* _in_b = in_b; ++ lv_16sc_t* _out = out; ++ unsigned int number; ++ ++- for(number = 0; number < sse_iters; number++) ++- { ++- a = _mm_load_si128((__m128i*)_in_a); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg ++- b = _mm_load_si128((__m128i*)_in_b); ++- c = _mm_mullo_epi16 (a, b); // a3.i*b3.i, a3.r*b3.r, .... +++ for (number = 0; number < sse_iters; number++) { +++ a = _mm_load_si128( +++ (__m128i*)_in_a); // load (2 byte imag, 2 byte real) x 4 into 128 bits reg +++ b = _mm_load_si128((__m128i*)_in_b); +++ c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, .... ++ ++- c_sr = _mm_srli_si128 (c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst. ++- real = _mm_subs_epi16 (c, c_sr); ++- real = _mm_and_si128 (real, mask_real); // a3.r*b3.r-a3.i*b3.i , 0, a3.r*b3.r- a3.i*b3.i +++ c_sr = _mm_srli_si128(c, 2); // Shift a right by imm8 bytes while shifting in +++ // zeros, and store the results in dst. +++ real = _mm_subs_epi16(c, c_sr); +++ real = _mm_and_si128(real, +++ mask_real); // a3.r*b3.r-a3.i*b3.i , 0, a3.r*b3.r- a3.i*b3.i ++ ++- b_sl = _mm_slli_si128(b, 2); // b3.r, b2.i .... ++- a_sl = _mm_slli_si128(a, 2); // a3.r, a2.i .... +++ b_sl = _mm_slli_si128(b, 2); // b3.r, b2.i .... +++ a_sl = _mm_slli_si128(a, 2); // a3.r, a2.i .... ++ ++- imag1 = _mm_mullo_epi16(a, b_sl); // a3.i*b3.r, .... ++- imag2 = _mm_mullo_epi16(b, a_sl); // b3.i*a3.r, .... +++ imag1 = _mm_mullo_epi16(a, b_sl); // a3.i*b3.r, .... +++ imag2 = _mm_mullo_epi16(b, a_sl); // b3.i*a3.r, .... ++ ++- imag = _mm_adds_epi16(imag1, imag2); ++- imag = _mm_and_si128 (imag, mask_imag); // a3.i*b3.r+b3.i*a3.r, 0, ... +++ imag = _mm_adds_epi16(imag1, imag2); +++ imag = _mm_and_si128(imag, mask_imag); // a3.i*b3.r+b3.i*a3.r, 0, ... 
++ ++- result = _mm_or_si128 (real, imag); +++ result = _mm_or_si128(real, imag); ++ ++- _mm_store_si128((__m128i*)_out, result); +++ _mm_store_si128((__m128i*)_out, result); ++ ++- _in_a += 4; ++- _in_b += 4; ++- _out += 4; ++- } +++ _in_a += 4; +++ _in_b += 4; +++ _out += 4; +++ } ++ ++- for (number = sse_iters * 4; number < num_points; ++number) ++- { ++- *_out++ = (*_in_a++) * (*_in_b++); ++- } +++ for (number = sse_iters * 4; number < num_points; ++number) { +++ *_out++ = (*_in_a++) * (*_in_b++); +++ } ++ } ++ #endif /* LV_HAVE_SSE2 */ ++ ++@@ -118,51 +128,58 @@ static inline void volk_16ic_x2_multiply_16ic_a_sse2(lv_16sc_t* out, const lv_16 ++ #ifdef LV_HAVE_SSE2 ++ #include ++ ++-static inline void volk_16ic_x2_multiply_16ic_u_sse2(lv_16sc_t* out, const lv_16sc_t* in_a, const lv_16sc_t* in_b, unsigned int num_points) +++static inline void volk_16ic_x2_multiply_16ic_u_sse2(lv_16sc_t* out, +++ const lv_16sc_t* in_a, +++ const lv_16sc_t* in_b, +++ unsigned int num_points) ++ { ++ const unsigned int sse_iters = num_points / 4; ++- __m128i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1,imag2, b_sl, a_sl, result; +++ __m128i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl, a_sl, +++ result; ++ ++- mask_imag = _mm_set_epi8(0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0); ++- mask_real = _mm_set_epi8(0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF); +++ mask_imag = _mm_set_epi8( +++ 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0); +++ mask_real = _mm_set_epi8( +++ 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF); ++ ++ const lv_16sc_t* _in_a = in_a; ++ const lv_16sc_t* _in_b = in_b; ++ lv_16sc_t* _out = out; ++ unsigned int number; ++ ++- for(number = 0; number < sse_iters; number++) ++- { ++- a = _mm_loadu_si128((__m128i*)_in_a); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg ++- b = _mm_loadu_si128((__m128i*)_in_b); ++- c = _mm_mullo_epi16 (a, b); // a3.i*b3.i, a3.r*b3.r, .... +++ for (number = 0; number < sse_iters; number++) { +++ a = _mm_loadu_si128( +++ (__m128i*)_in_a); // load (2 byte imag, 2 byte real) x 4 into 128 bits reg +++ b = _mm_loadu_si128((__m128i*)_in_b); +++ c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, .... ++ ++- c_sr = _mm_srli_si128 (c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst. ++- real = _mm_subs_epi16 (c, c_sr); ++- real = _mm_and_si128 (real, mask_real); // a3.r*b3.r-a3.i*b3.i , 0, a3.r*b3.r- a3.i*b3.i +++ c_sr = _mm_srli_si128(c, 2); // Shift a right by imm8 bytes while shifting in +++ // zeros, and store the results in dst. +++ real = _mm_subs_epi16(c, c_sr); +++ real = _mm_and_si128(real, +++ mask_real); // a3.r*b3.r-a3.i*b3.i , 0, a3.r*b3.r- a3.i*b3.i ++ ++- b_sl = _mm_slli_si128(b, 2); // b3.r, b2.i .... ++- a_sl = _mm_slli_si128(a, 2); // a3.r, a2.i .... +++ b_sl = _mm_slli_si128(b, 2); // b3.r, b2.i .... +++ a_sl = _mm_slli_si128(a, 2); // a3.r, a2.i .... ++ ++- imag1 = _mm_mullo_epi16(a, b_sl); // a3.i*b3.r, .... ++- imag2 = _mm_mullo_epi16(b, a_sl); // b3.i*a3.r, .... +++ imag1 = _mm_mullo_epi16(a, b_sl); // a3.i*b3.r, .... +++ imag2 = _mm_mullo_epi16(b, a_sl); // b3.i*a3.r, .... ++ ++- imag = _mm_adds_epi16(imag1, imag2); ++- imag = _mm_and_si128 (imag, mask_imag); // a3.i*b3.r+b3.i*a3.r, 0, ... +++ imag = _mm_adds_epi16(imag1, imag2); +++ imag = _mm_and_si128(imag, mask_imag); // a3.i*b3.r+b3.i*a3.r, 0, ... 
++ ++- result = _mm_or_si128 (real, imag); +++ result = _mm_or_si128(real, imag); ++ ++- _mm_storeu_si128((__m128i*)_out, result); +++ _mm_storeu_si128((__m128i*)_out, result); ++ ++- _in_a += 4; ++- _in_b += 4; ++- _out += 4; ++- } +++ _in_a += 4; +++ _in_b += 4; +++ _out += 4; +++ } ++ ++- for (number = sse_iters * 4; number < num_points; ++number) ++- { ++- *_out++ = (*_in_a++) * (*_in_b++); ++- } +++ for (number = sse_iters * 4; number < num_points; ++number) { +++ *_out++ = (*_in_a++) * (*_in_b++); +++ } ++ } ++ #endif /* LV_HAVE_SSE2 */ ++ ++@@ -170,7 +187,10 @@ static inline void volk_16ic_x2_multiply_16ic_u_sse2(lv_16sc_t* out, const lv_16 ++ #ifdef LV_HAVE_AVX2 ++ #include ++ ++-static inline void volk_16ic_x2_multiply_16ic_u_avx2(lv_16sc_t* out, const lv_16sc_t* in_a, const lv_16sc_t* in_b, unsigned int num_points) +++static inline void volk_16ic_x2_multiply_16ic_u_avx2(lv_16sc_t* out, +++ const lv_16sc_t* in_a, +++ const lv_16sc_t* in_b, +++ unsigned int num_points) ++ { ++ unsigned int number = 0; ++ const unsigned int avx2_points = num_points / 8; ++@@ -179,44 +199,108 @@ static inline void volk_16ic_x2_multiply_16ic_u_avx2(lv_16sc_t* out, const lv_16 ++ const lv_16sc_t* _in_b = in_b; ++ lv_16sc_t* _out = out; ++ ++- __m256i a, b, c, c_sr, real, imag, imag1, imag2, b_sl, a_sl, result; ++- ++- const __m256i mask_imag = _mm256_set_epi8(0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0); ++- const __m256i mask_real = _mm256_set_epi8(0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF); ++- ++- for(;number < avx2_points; number++) ++- { ++- a = _mm256_loadu_si256((__m256i*)_in_a); // Load the ar + ai, br + bi as ar,ai,br,bi ++- b = _mm256_loadu_si256((__m256i*)_in_b); // Load the cr + ci, dr + di as cr,ci,dr,di ++- c = _mm256_mullo_epi16(a, b); ++- ++- c_sr = _mm256_srli_si256(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst. ++- real = _mm256_subs_epi16(c, c_sr); ++- real = _mm256_and_si256(real, mask_real); // a3.r*b3.r-a3.i*b3.i , 0, a3.r*b3.r- a3.i*b3.i ++- ++- b_sl = _mm256_slli_si256(b, 2); // b3.r, b2.i .... ++- a_sl = _mm256_slli_si256(a, 2); // a3.r, a2.i .... ++- ++- imag1 = _mm256_mullo_epi16(a, b_sl); // a3.i*b3.r, .... ++- imag2 = _mm256_mullo_epi16(b, a_sl); // b3.i*a3.r, .... ++- ++- imag = _mm256_adds_epi16(imag1, imag2); ++- imag = _mm256_and_si256(imag, mask_imag); // a3.i*b3.r+b3.i*a3.r, 0, ... 
++- ++- result = _mm256_or_si256(real, imag); ++- ++- _mm256_storeu_si256((__m256i*)_out, result); ++- ++- _in_a += 8; ++- _in_b += 8; ++- _out += 8; ++- } +++ __m256i a, b, c, c_sr, real, imag, imag1, imag2, b_sl, a_sl, result; +++ +++ const __m256i mask_imag = _mm256_set_epi8(0xFF, +++ 0xFF, +++ 0, +++ 0, +++ 0xFF, +++ 0xFF, +++ 0, +++ 0, +++ 0xFF, +++ 0xFF, +++ 0, +++ 0, +++ 0xFF, +++ 0xFF, +++ 0, +++ 0, +++ 0xFF, +++ 0xFF, +++ 0, +++ 0, +++ 0xFF, +++ 0xFF, +++ 0, +++ 0, +++ 0xFF, +++ 0xFF, +++ 0, +++ 0, +++ 0xFF, +++ 0xFF, +++ 0, +++ 0); +++ const __m256i mask_real = _mm256_set_epi8(0, +++ 0, +++ 0xFF, +++ 0xFF, +++ 0, +++ 0, +++ 0xFF, +++ 0xFF, +++ 0, +++ 0, +++ 0xFF, +++ 0xFF, +++ 0, +++ 0, +++ 0xFF, +++ 0xFF, +++ 0, +++ 0, +++ 0xFF, +++ 0xFF, +++ 0, +++ 0, +++ 0xFF, +++ 0xFF, +++ 0, +++ 0, +++ 0xFF, +++ 0xFF, +++ 0, +++ 0, +++ 0xFF, +++ 0xFF); +++ +++ for (; number < avx2_points; number++) { +++ a = _mm256_loadu_si256( +++ (__m256i*)_in_a); // Load the ar + ai, br + bi as ar,ai,br,bi +++ b = _mm256_loadu_si256( +++ (__m256i*)_in_b); // Load the cr + ci, dr + di as cr,ci,dr,di +++ c = _mm256_mullo_epi16(a, b); +++ +++ c_sr = _mm256_srli_si256(c, 2); // Shift a right by imm8 bytes while shifting in +++ // zeros, and store the results in dst. +++ real = _mm256_subs_epi16(c, c_sr); +++ real = _mm256_and_si256( +++ real, mask_real); // a3.r*b3.r-a3.i*b3.i , 0, a3.r*b3.r- a3.i*b3.i +++ +++ b_sl = _mm256_slli_si256(b, 2); // b3.r, b2.i .... +++ a_sl = _mm256_slli_si256(a, 2); // a3.r, a2.i .... +++ +++ imag1 = _mm256_mullo_epi16(a, b_sl); // a3.i*b3.r, .... +++ imag2 = _mm256_mullo_epi16(b, a_sl); // b3.i*a3.r, .... +++ +++ imag = _mm256_adds_epi16(imag1, imag2); +++ imag = _mm256_and_si256(imag, mask_imag); // a3.i*b3.r+b3.i*a3.r, 0, ... +++ +++ result = _mm256_or_si256(real, imag); +++ +++ _mm256_storeu_si256((__m256i*)_out, result); +++ +++ _in_a += 8; +++ _in_b += 8; +++ _out += 8; +++ } ++ _mm256_zeroupper(); ++ number = avx2_points * 8; ++- for(;number < num_points; number++) ++- { ++- *_out++ = (*_in_a++) * (*_in_b++); ++- } +++ for (; number < num_points; number++) { +++ *_out++ = (*_in_a++) * (*_in_b++); +++ } ++ } ++ #endif /* LV_HAVE_AVX2 */ ++ ++@@ -224,7 +308,10 @@ static inline void volk_16ic_x2_multiply_16ic_u_avx2(lv_16sc_t* out, const lv_16 ++ #ifdef LV_HAVE_AVX2 ++ #include ++ ++-static inline void volk_16ic_x2_multiply_16ic_a_avx2(lv_16sc_t* out, const lv_16sc_t* in_a, const lv_16sc_t* in_b, unsigned int num_points) +++static inline void volk_16ic_x2_multiply_16ic_a_avx2(lv_16sc_t* out, +++ const lv_16sc_t* in_a, +++ const lv_16sc_t* in_b, +++ unsigned int num_points) ++ { ++ unsigned int number = 0; ++ const unsigned int avx2_points = num_points / 8; ++@@ -233,44 +320,108 @@ static inline void volk_16ic_x2_multiply_16ic_a_avx2(lv_16sc_t* out, const lv_16 ++ const lv_16sc_t* _in_b = in_b; ++ lv_16sc_t* _out = out; ++ ++- __m256i a, b, c, c_sr, real, imag, imag1, imag2, b_sl, a_sl, result; ++- ++- const __m256i mask_imag = _mm256_set_epi8(0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0); ++- const __m256i mask_real = _mm256_set_epi8(0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF); ++- ++- for(;number < avx2_points; number++) ++- { ++- a = _mm256_load_si256((__m256i*)_in_a); // Load the ar + ai, br + bi as ar,ai,br,bi ++- b = _mm256_load_si256((__m256i*)_in_b); // Load the cr + ci, dr 
+ di as cr,ci,dr,di ++- c = _mm256_mullo_epi16(a, b); ++- ++- c_sr = _mm256_srli_si256(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst. ++- real = _mm256_subs_epi16(c, c_sr); ++- real = _mm256_and_si256(real, mask_real); // a3.r*b3.r-a3.i*b3.i , 0, a3.r*b3.r- a3.i*b3.i ++- ++- b_sl = _mm256_slli_si256(b, 2); // b3.r, b2.i .... ++- a_sl = _mm256_slli_si256(a, 2); // a3.r, a2.i .... ++- ++- imag1 = _mm256_mullo_epi16(a, b_sl); // a3.i*b3.r, .... ++- imag2 = _mm256_mullo_epi16(b, a_sl); // b3.i*a3.r, .... ++- ++- imag = _mm256_adds_epi16(imag1, imag2); ++- imag = _mm256_and_si256(imag, mask_imag); // a3.i*b3.r+b3.i*a3.r, 0, ... ++- ++- result = _mm256_or_si256(real, imag); ++- ++- _mm256_store_si256((__m256i*)_out, result); ++- ++- _in_a += 8; ++- _in_b += 8; ++- _out += 8; ++- } +++ __m256i a, b, c, c_sr, real, imag, imag1, imag2, b_sl, a_sl, result; +++ +++ const __m256i mask_imag = _mm256_set_epi8(0xFF, +++ 0xFF, +++ 0, +++ 0, +++ 0xFF, +++ 0xFF, +++ 0, +++ 0, +++ 0xFF, +++ 0xFF, +++ 0, +++ 0, +++ 0xFF, +++ 0xFF, +++ 0, +++ 0, +++ 0xFF, +++ 0xFF, +++ 0, +++ 0, +++ 0xFF, +++ 0xFF, +++ 0, +++ 0, +++ 0xFF, +++ 0xFF, +++ 0, +++ 0, +++ 0xFF, +++ 0xFF, +++ 0, +++ 0); +++ const __m256i mask_real = _mm256_set_epi8(0, +++ 0, +++ 0xFF, +++ 0xFF, +++ 0, +++ 0, +++ 0xFF, +++ 0xFF, +++ 0, +++ 0, +++ 0xFF, +++ 0xFF, +++ 0, +++ 0, +++ 0xFF, +++ 0xFF, +++ 0, +++ 0, +++ 0xFF, +++ 0xFF, +++ 0, +++ 0, +++ 0xFF, +++ 0xFF, +++ 0, +++ 0, +++ 0xFF, +++ 0xFF, +++ 0, +++ 0, +++ 0xFF, +++ 0xFF); +++ +++ for (; number < avx2_points; number++) { +++ a = _mm256_load_si256( +++ (__m256i*)_in_a); // Load the ar + ai, br + bi as ar,ai,br,bi +++ b = _mm256_load_si256( +++ (__m256i*)_in_b); // Load the cr + ci, dr + di as cr,ci,dr,di +++ c = _mm256_mullo_epi16(a, b); +++ +++ c_sr = _mm256_srli_si256(c, 2); // Shift a right by imm8 bytes while shifting in +++ // zeros, and store the results in dst. +++ real = _mm256_subs_epi16(c, c_sr); +++ real = _mm256_and_si256( +++ real, mask_real); // a3.r*b3.r-a3.i*b3.i , 0, a3.r*b3.r- a3.i*b3.i +++ +++ b_sl = _mm256_slli_si256(b, 2); // b3.r, b2.i .... +++ a_sl = _mm256_slli_si256(a, 2); // a3.r, a2.i .... +++ +++ imag1 = _mm256_mullo_epi16(a, b_sl); // a3.i*b3.r, .... +++ imag2 = _mm256_mullo_epi16(b, a_sl); // b3.i*a3.r, .... +++ +++ imag = _mm256_adds_epi16(imag1, imag2); +++ imag = _mm256_and_si256(imag, mask_imag); // a3.i*b3.r+b3.i*a3.r, 0, ... 
+++ +++ result = _mm256_or_si256(real, imag); +++ +++ _mm256_store_si256((__m256i*)_out, result); +++ +++ _in_a += 8; +++ _in_b += 8; +++ _out += 8; +++ } ++ _mm256_zeroupper(); ++ number = avx2_points * 8; ++- for(;number < num_points; number++) ++- { ++- *_out++ = (*_in_a++) * (*_in_b++); ++- } +++ for (; number < num_points; number++) { +++ *_out++ = (*_in_a++) * (*_in_b++); +++ } ++ } ++ #endif /* LV_HAVE_AVX2 */ ++ ++@@ -278,48 +429,49 @@ static inline void volk_16ic_x2_multiply_16ic_a_avx2(lv_16sc_t* out, const lv_16 ++ #ifdef LV_HAVE_NEON ++ #include ++ ++-static inline void volk_16ic_x2_multiply_16ic_neon(lv_16sc_t* out, const lv_16sc_t* in_a, const lv_16sc_t* in_b, unsigned int num_points) +++static inline void volk_16ic_x2_multiply_16ic_neon(lv_16sc_t* out, +++ const lv_16sc_t* in_a, +++ const lv_16sc_t* in_b, +++ unsigned int num_points) ++ { ++- lv_16sc_t *a_ptr = (lv_16sc_t*) in_a; ++- lv_16sc_t *b_ptr = (lv_16sc_t*) in_b; +++ lv_16sc_t* a_ptr = (lv_16sc_t*)in_a; +++ lv_16sc_t* b_ptr = (lv_16sc_t*)in_b; ++ unsigned int quarter_points = num_points / 4; ++ int16x4x2_t a_val, b_val, c_val; ++ int16x4x2_t tmp_real, tmp_imag; ++ unsigned int number = 0; ++ ++- for(number = 0; number < quarter_points; ++number) ++- { ++- a_val = vld2_s16((int16_t*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i ++- b_val = vld2_s16((int16_t*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i ++- __VOLK_PREFETCH(a_ptr + 4); ++- __VOLK_PREFETCH(b_ptr + 4); ++- ++- // multiply the real*real and imag*imag to get real result ++- // a0r*b0r|a1r*b1r|a2r*b2r|a3r*b3r ++- tmp_real.val[0] = vmul_s16(a_val.val[0], b_val.val[0]); ++- // a0i*b0i|a1i*b1i|a2i*b2i|a3i*b3i ++- tmp_real.val[1] = vmul_s16(a_val.val[1], b_val.val[1]); ++- ++- // Multiply cross terms to get the imaginary result ++- // a0r*b0i|a1r*b1i|a2r*b2i|a3r*b3i ++- tmp_imag.val[0] = vmul_s16(a_val.val[0], b_val.val[1]); ++- // a0i*b0r|a1i*b1r|a2i*b2r|a3i*b3r ++- tmp_imag.val[1] = vmul_s16(a_val.val[1], b_val.val[0]); ++- ++- // store the results ++- c_val.val[0] = vsub_s16(tmp_real.val[0], tmp_real.val[1]); ++- c_val.val[1] = vadd_s16(tmp_imag.val[0], tmp_imag.val[1]); ++- vst2_s16((int16_t*)out, c_val); ++- ++- a_ptr += 4; ++- b_ptr += 4; ++- out += 4; ++- } ++- ++- for(number = quarter_points * 4; number < num_points; number++) ++- { ++- *out++ = (*a_ptr++) * (*b_ptr++); ++- } +++ for (number = 0; number < quarter_points; ++number) { +++ a_val = vld2_s16((int16_t*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i +++ b_val = vld2_s16((int16_t*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i +++ __VOLK_PREFETCH(a_ptr + 4); +++ __VOLK_PREFETCH(b_ptr + 4); +++ +++ // multiply the real*real and imag*imag to get real result +++ // a0r*b0r|a1r*b1r|a2r*b2r|a3r*b3r +++ tmp_real.val[0] = vmul_s16(a_val.val[0], b_val.val[0]); +++ // a0i*b0i|a1i*b1i|a2i*b2i|a3i*b3i +++ tmp_real.val[1] = vmul_s16(a_val.val[1], b_val.val[1]); +++ +++ // Multiply cross terms to get the imaginary result +++ // a0r*b0i|a1r*b1i|a2r*b2i|a3r*b3i +++ tmp_imag.val[0] = vmul_s16(a_val.val[0], b_val.val[1]); +++ // a0i*b0r|a1i*b1r|a2i*b2r|a3i*b3r +++ tmp_imag.val[1] = vmul_s16(a_val.val[1], b_val.val[0]); +++ +++ // store the results +++ c_val.val[0] = vsub_s16(tmp_real.val[0], tmp_real.val[1]); +++ c_val.val[1] = vadd_s16(tmp_imag.val[0], tmp_imag.val[1]); +++ vst2_s16((int16_t*)out, c_val); +++ +++ a_ptr += 4; +++ b_ptr += 4; +++ out += 4; +++ } +++ +++ for (number = quarter_points * 4; number < num_points; number++) { +++ *out++ = (*a_ptr++) * (*b_ptr++); +++ } ++ } ++ #endif /* LV_HAVE_NEON */ 
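Every SIMD path in the two kernels above (the dot product and the point-wise multiply) relies on the same decomposition of a 16-bit complex product into its real and imaginary lanes; the byte shifts and the mask_real / mask_imag constants only serve to line those terms up inside one packed register. Spelled out per point, the arithmetic is the following illustrative scalar sketch (not part of the library; the generic kernels simply evaluate in_a[n] * in_b[n] on lv_16sc_t):

    #include <stdint.h>

    /* (ar + i*ai) * (br + i*bi) for one 16-bit complex point.          */
    /* real lane:  ar*br - ai*bi  -> what survives the mask_real step   */
    /* imag lane:  ar*bi + ai*br  -> what survives the mask_imag step   */
    static inline void complex16_multiply_scalar(int16_t* out_re, int16_t* out_im,
                                                 int16_t ar, int16_t ai,
                                                 int16_t br, int16_t bi)
    {
        *out_re = (int16_t)(ar * br - ai * bi);
        *out_im = (int16_t)(ar * bi + ai * br);
    }

Only the dot-product kernels saturate while accumulating these terms across points (sat_adds16i, _mm256_adds_epi16, vqadd_s16); the multiply kernels store each product directly, and their header comment explicitly warns that saturation is not checked.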
++ ++diff --git a/kernels/volk/volk_16u_byteswap.h b/kernels/volk/volk_16u_byteswap.h ++index eaa972f..221dcdb 100644 ++--- a/kernels/volk/volk_16u_byteswap.h +++++ b/kernels/volk/volk_16u_byteswap.h ++@@ -58,74 +58,80 @@ ++ ++ #if LV_HAVE_AVX2 ++ #include ++-static inline void volk_16u_byteswap_a_avx2(uint16_t* intsToSwap, unsigned int num_points){ ++- unsigned int number; +++static inline void volk_16u_byteswap_a_avx2(uint16_t* intsToSwap, unsigned int num_points) +++{ +++ unsigned int number; ++ ++- const unsigned int nPerSet = 16; ++- const uint64_t nSets = num_points / nPerSet; +++ const unsigned int nPerSet = 16; +++ const uint64_t nSets = num_points / nPerSet; ++ ++- uint16_t* inputPtr = (uint16_t*) intsToSwap; +++ uint16_t* inputPtr = (uint16_t*)intsToSwap; ++ ++- const uint8_t shuffleVector[32] = { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 17, 16, 19, 18, 21, 20, 23, 22, 25, 24, 27, 26, 29, 28, 31, 30}; +++ const uint8_t shuffleVector[32] = { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, +++ 10, 13, 12, 15, 14, 17, 16, 19, 18, 21, 20, +++ 23, 22, 25, 24, 27, 26, 29, 28, 31, 30 }; ++ ++- const __m256i myShuffle = _mm256_loadu_si256((__m256i*) &shuffleVector[0]); +++ const __m256i myShuffle = _mm256_loadu_si256((__m256i*)&shuffleVector[0]); ++ ++- for(number = 0; number < nSets; number++) { ++- // Load the 32t values, increment inputPtr later since we're doing it in-place. ++- const __m256i input = _mm256_load_si256((__m256i*)inputPtr); ++- const __m256i output = _mm256_shuffle_epi8(input, myShuffle); +++ for (number = 0; number < nSets; number++) { +++ // Load the 32t values, increment inputPtr later since we're doing it in-place. +++ const __m256i input = _mm256_load_si256((__m256i*)inputPtr); +++ const __m256i output = _mm256_shuffle_epi8(input, myShuffle); ++ ++- // Store the results ++- _mm256_store_si256((__m256i*)inputPtr, output); ++- inputPtr += nPerSet; ++- } +++ // Store the results +++ _mm256_store_si256((__m256i*)inputPtr, output); +++ inputPtr += nPerSet; +++ } ++ ++- _mm256_zeroupper(); +++ _mm256_zeroupper(); ++ ++- // Byteswap any remaining points: ++- for(number = nPerSet * nSets; number < num_points; number++) { ++- uint16_t outputVal = *inputPtr; ++- outputVal = (((outputVal >> 8) & 0xff) | ((outputVal << 8) & 0xff00)); ++- *inputPtr = outputVal; ++- inputPtr++; ++- } +++ // Byteswap any remaining points: +++ for (number = nPerSet * nSets; number < num_points; number++) { +++ uint16_t outputVal = *inputPtr; +++ outputVal = (((outputVal >> 8) & 0xff) | ((outputVal << 8) & 0xff00)); +++ *inputPtr = outputVal; +++ inputPtr++; +++ } ++ } ++ #endif /* LV_HAVE_AVX2 */ ++ ++ ++ #if LV_HAVE_AVX2 ++ #include ++-static inline void volk_16u_byteswap_u_avx2(uint16_t* intsToSwap, unsigned int num_points){ ++- unsigned int number; +++static inline void volk_16u_byteswap_u_avx2(uint16_t* intsToSwap, unsigned int num_points) +++{ +++ unsigned int number; ++ ++- const unsigned int nPerSet = 16; ++- const uint64_t nSets = num_points / nPerSet; +++ const unsigned int nPerSet = 16; +++ const uint64_t nSets = num_points / nPerSet; ++ ++- uint16_t* inputPtr = (uint16_t*) intsToSwap; +++ uint16_t* inputPtr = (uint16_t*)intsToSwap; ++ ++- const uint8_t shuffleVector[32] = { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 17, 16, 19, 18, 21, 20, 23, 22, 25, 24, 27, 26, 29, 28, 31, 30}; +++ const uint8_t shuffleVector[32] = { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, +++ 10, 13, 12, 15, 14, 17, 16, 19, 18, 21, 20, +++ 23, 22, 25, 24, 27, 26, 29, 28, 31, 30 }; ++ ++- const __m256i myShuffle 
= _mm256_loadu_si256((__m256i*) &shuffleVector[0]); +++ const __m256i myShuffle = _mm256_loadu_si256((__m256i*)&shuffleVector[0]); ++ ++- for (number = 0; number < nSets; number++) { ++- // Load the 32t values, increment inputPtr later since we're doing it in-place. ++- const __m256i input = _mm256_loadu_si256((__m256i*)inputPtr); ++- const __m256i output = _mm256_shuffle_epi8(input,myShuffle); +++ for (number = 0; number < nSets; number++) { +++ // Load the 32t values, increment inputPtr later since we're doing it in-place. +++ const __m256i input = _mm256_loadu_si256((__m256i*)inputPtr); +++ const __m256i output = _mm256_shuffle_epi8(input, myShuffle); ++ ++- // Store the results ++- _mm256_storeu_si256((__m256i*)inputPtr, output); ++- inputPtr += nPerSet; ++- } +++ // Store the results +++ _mm256_storeu_si256((__m256i*)inputPtr, output); +++ inputPtr += nPerSet; +++ } ++ ++- _mm256_zeroupper(); +++ _mm256_zeroupper(); ++ ++- // Byteswap any remaining points: ++- for(number = nPerSet * nSets; number < num_points; number++) { ++- uint16_t outputVal = *inputPtr; ++- outputVal = (((outputVal >> 8) & 0xff) | ((outputVal << 8) & 0xff00)); ++- *inputPtr = outputVal; ++- inputPtr++; ++- } +++ // Byteswap any remaining points: +++ for (number = nPerSet * nSets; number < num_points; number++) { +++ uint16_t outputVal = *inputPtr; +++ outputVal = (((outputVal >> 8) & 0xff) | ((outputVal << 8) & 0xff00)); +++ *inputPtr = outputVal; +++ inputPtr++; +++ } ++ } ++ #endif /* LV_HAVE_AVX2 */ ++ ++@@ -133,47 +139,50 @@ static inline void volk_16u_byteswap_u_avx2(uint16_t* intsToSwap, unsigned int n ++ #ifdef LV_HAVE_SSE2 ++ #include ++ ++-static inline void volk_16u_byteswap_u_sse2(uint16_t* intsToSwap, unsigned int num_points){ ++- unsigned int number = 0; ++- uint16_t* inputPtr = intsToSwap; ++- __m128i input, left, right, output; ++- ++- const unsigned int eighthPoints = num_points / 8; ++- for(;number < eighthPoints; number++){ ++- // Load the 16t values, increment inputPtr later since we're doing it in-place. ++- input = _mm_loadu_si128((__m128i*)inputPtr); ++- // Do the two shifts ++- left = _mm_slli_epi16(input, 8); ++- right = _mm_srli_epi16(input, 8); ++- // Or the left and right halves together ++- output = _mm_or_si128(left, right); ++- // Store the results ++- _mm_storeu_si128((__m128i*)inputPtr, output); ++- inputPtr += 8; ++- } ++- ++- // Byteswap any remaining points: ++- number = eighthPoints*8; ++- for(; number < num_points; number++){ ++- uint16_t outputVal = *inputPtr; ++- outputVal = (((outputVal >> 8) & 0xff) | ((outputVal << 8) & 0xff00)); ++- *inputPtr = outputVal; ++- inputPtr++; ++- } +++static inline void volk_16u_byteswap_u_sse2(uint16_t* intsToSwap, unsigned int num_points) +++{ +++ unsigned int number = 0; +++ uint16_t* inputPtr = intsToSwap; +++ __m128i input, left, right, output; +++ +++ const unsigned int eighthPoints = num_points / 8; +++ for (; number < eighthPoints; number++) { +++ // Load the 16t values, increment inputPtr later since we're doing it in-place. 
+++ input = _mm_loadu_si128((__m128i*)inputPtr); +++ // Do the two shifts +++ left = _mm_slli_epi16(input, 8); +++ right = _mm_srli_epi16(input, 8); +++ // Or the left and right halves together +++ output = _mm_or_si128(left, right); +++ // Store the results +++ _mm_storeu_si128((__m128i*)inputPtr, output); +++ inputPtr += 8; +++ } +++ +++ // Byteswap any remaining points: +++ number = eighthPoints * 8; +++ for (; number < num_points; number++) { +++ uint16_t outputVal = *inputPtr; +++ outputVal = (((outputVal >> 8) & 0xff) | ((outputVal << 8) & 0xff00)); +++ *inputPtr = outputVal; +++ inputPtr++; +++ } ++ } ++ #endif /* LV_HAVE_SSE2 */ ++ ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void volk_16u_byteswap_generic(uint16_t* intsToSwap, unsigned int num_points){ ++- unsigned int point; ++- uint16_t* inputPtr = intsToSwap; ++- for(point = 0; point < num_points; point++){ ++- uint16_t output = *inputPtr; ++- output = (((output >> 8) & 0xff) | ((output << 8) & 0xff00)); ++- *inputPtr = output; ++- inputPtr++; ++- } +++static inline void volk_16u_byteswap_generic(uint16_t* intsToSwap, +++ unsigned int num_points) +++{ +++ unsigned int point; +++ uint16_t* inputPtr = intsToSwap; +++ for (point = 0; point < num_points; point++) { +++ uint16_t output = *inputPtr; +++ output = (((output >> 8) & 0xff) | ((output << 8) & 0xff00)); +++ *inputPtr = output; +++ inputPtr++; +++ } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++@@ -187,129 +196,136 @@ static inline void volk_16u_byteswap_generic(uint16_t* intsToSwap, unsigned int ++ #ifdef LV_HAVE_SSE2 ++ #include ++ ++-static inline void volk_16u_byteswap_a_sse2(uint16_t* intsToSwap, unsigned int num_points){ ++- unsigned int number = 0; ++- uint16_t* inputPtr = intsToSwap; ++- __m128i input, left, right, output; ++- ++- const unsigned int eighthPoints = num_points / 8; ++- for(;number < eighthPoints; number++){ ++- // Load the 16t values, increment inputPtr later since we're doing it in-place. ++- input = _mm_load_si128((__m128i*)inputPtr); ++- // Do the two shifts ++- left = _mm_slli_epi16(input, 8); ++- right = _mm_srli_epi16(input, 8); ++- // Or the left and right halves together ++- output = _mm_or_si128(left, right); ++- // Store the results ++- _mm_store_si128((__m128i*)inputPtr, output); ++- inputPtr += 8; ++- } ++- ++- ++- // Byteswap any remaining points: ++- number = eighthPoints*8; ++- for(; number < num_points; number++){ ++- uint16_t outputVal = *inputPtr; ++- outputVal = (((outputVal >> 8) & 0xff) | ((outputVal << 8) & 0xff00)); ++- *inputPtr = outputVal; ++- inputPtr++; ++- } +++static inline void volk_16u_byteswap_a_sse2(uint16_t* intsToSwap, unsigned int num_points) +++{ +++ unsigned int number = 0; +++ uint16_t* inputPtr = intsToSwap; +++ __m128i input, left, right, output; +++ +++ const unsigned int eighthPoints = num_points / 8; +++ for (; number < eighthPoints; number++) { +++ // Load the 16t values, increment inputPtr later since we're doing it in-place. 
+++ input = _mm_load_si128((__m128i*)inputPtr); +++ // Do the two shifts +++ left = _mm_slli_epi16(input, 8); +++ right = _mm_srli_epi16(input, 8); +++ // Or the left and right halves together +++ output = _mm_or_si128(left, right); +++ // Store the results +++ _mm_store_si128((__m128i*)inputPtr, output); +++ inputPtr += 8; +++ } +++ +++ +++ // Byteswap any remaining points: +++ number = eighthPoints * 8; +++ for (; number < num_points; number++) { +++ uint16_t outputVal = *inputPtr; +++ outputVal = (((outputVal >> 8) & 0xff) | ((outputVal << 8) & 0xff00)); +++ *inputPtr = outputVal; +++ inputPtr++; +++ } ++ } ++ #endif /* LV_HAVE_SSE2 */ ++ ++ #ifdef LV_HAVE_NEON ++ #include ++ ++-static inline void volk_16u_byteswap_neon(uint16_t* intsToSwap, unsigned int num_points){ ++- unsigned int number; ++- unsigned int eighth_points = num_points / 8; ++- uint16x8_t input, output; ++- uint16_t* inputPtr = intsToSwap; ++- ++- for(number = 0; number < eighth_points; number++) { ++- input = vld1q_u16(inputPtr); ++- output = vsriq_n_u16(output, input, 8); ++- output = vsliq_n_u16(output, input, 8); ++- vst1q_u16(inputPtr, output); ++- inputPtr += 8; ++- } ++- ++- for(number = eighth_points * 8; number < num_points; number++){ ++- uint16_t output = *inputPtr; ++- output = (((output >> 8) & 0xff) | ((output << 8) & 0xff00)); ++- *inputPtr = output; ++- inputPtr++; ++- } +++static inline void volk_16u_byteswap_neon(uint16_t* intsToSwap, unsigned int num_points) +++{ +++ unsigned int number; +++ unsigned int eighth_points = num_points / 8; +++ uint16x8_t input, output; +++ uint16_t* inputPtr = intsToSwap; +++ +++ for (number = 0; number < eighth_points; number++) { +++ input = vld1q_u16(inputPtr); +++ output = vsriq_n_u16(output, input, 8); +++ output = vsliq_n_u16(output, input, 8); +++ vst1q_u16(inputPtr, output); +++ inputPtr += 8; +++ } +++ +++ for (number = eighth_points * 8; number < num_points; number++) { +++ uint16_t output = *inputPtr; +++ output = (((output >> 8) & 0xff) | ((output << 8) & 0xff00)); +++ *inputPtr = output; +++ inputPtr++; +++ } ++ } ++ #endif /* LV_HAVE_NEON */ ++ ++ #ifdef LV_HAVE_NEON ++ #include ++ ++-static inline void volk_16u_byteswap_neon_table(uint16_t* intsToSwap, unsigned int num_points){ ++- uint16_t* inputPtr = intsToSwap; ++- unsigned int number = 0; ++- unsigned int n16points = num_points / 16; ++- ++- uint8x8x4_t input_table; ++- uint8x8_t int_lookup01, int_lookup23, int_lookup45, int_lookup67; ++- uint8x8_t swapped_int01, swapped_int23, swapped_int45, swapped_int67; ++- ++- /* these magic numbers are used as byte-indices in the LUT. ++- they are pre-computed to save time. A simple C program ++- can calculate them; for example for lookup01: ++- uint8_t chars[8] = {24, 16, 8, 0, 25, 17, 9, 1}; ++- for(ii=0; ii < 8; ++ii) { ++- index += ((uint64_t)(*(chars+ii))) << (ii*8); +++static inline void volk_16u_byteswap_neon_table(uint16_t* intsToSwap, +++ unsigned int num_points) +++{ +++ uint16_t* inputPtr = intsToSwap; +++ unsigned int number = 0; +++ unsigned int n16points = num_points / 16; +++ +++ uint8x8x4_t input_table; +++ uint8x8_t int_lookup01, int_lookup23, int_lookup45, int_lookup67; +++ uint8x8_t swapped_int01, swapped_int23, swapped_int45, swapped_int67; +++ +++ /* these magic numbers are used as byte-indices in the LUT. +++ they are pre-computed to save time. 
A simple C program +++ can calculate them; for example for lookup01: +++ uint8_t chars[8] = {24, 16, 8, 0, 25, 17, 9, 1}; +++ for(ii=0; ii < 8; ++ii) { +++ index += ((uint64_t)(*(chars+ii))) << (ii*8); +++ } +++ */ +++ int_lookup01 = vcreate_u8(1232017111498883080); +++ int_lookup23 = vcreate_u8(1376697457175036426); +++ int_lookup45 = vcreate_u8(1521377802851189772); +++ int_lookup67 = vcreate_u8(1666058148527343118); +++ +++ for (number = 0; number < n16points; ++number) { +++ input_table = vld4_u8((uint8_t*)inputPtr); +++ swapped_int01 = vtbl4_u8(input_table, int_lookup01); +++ swapped_int23 = vtbl4_u8(input_table, int_lookup23); +++ swapped_int45 = vtbl4_u8(input_table, int_lookup45); +++ swapped_int67 = vtbl4_u8(input_table, int_lookup67); +++ vst1_u8((uint8_t*)inputPtr, swapped_int01); +++ vst1_u8((uint8_t*)(inputPtr + 4), swapped_int23); +++ vst1_u8((uint8_t*)(inputPtr + 8), swapped_int45); +++ vst1_u8((uint8_t*)(inputPtr + 12), swapped_int67); +++ +++ inputPtr += 16; +++ } +++ +++ for (number = n16points * 16; number < num_points; ++number) { +++ uint16_t output = *inputPtr; +++ output = (((output >> 8) & 0xff) | ((output << 8) & 0xff00)); +++ *inputPtr = output; +++ inputPtr++; ++ } ++- */ ++- int_lookup01 = vcreate_u8(1232017111498883080); ++- int_lookup23 = vcreate_u8(1376697457175036426); ++- int_lookup45 = vcreate_u8(1521377802851189772); ++- int_lookup67 = vcreate_u8(1666058148527343118); ++- ++- for(number = 0; number < n16points; ++number){ ++- input_table = vld4_u8((uint8_t*) inputPtr); ++- swapped_int01 = vtbl4_u8(input_table, int_lookup01); ++- swapped_int23 = vtbl4_u8(input_table, int_lookup23); ++- swapped_int45 = vtbl4_u8(input_table, int_lookup45); ++- swapped_int67 = vtbl4_u8(input_table, int_lookup67); ++- vst1_u8((uint8_t*)inputPtr, swapped_int01); ++- vst1_u8((uint8_t*)(inputPtr+4), swapped_int23); ++- vst1_u8((uint8_t*)(inputPtr+8), swapped_int45); ++- vst1_u8((uint8_t*)(inputPtr+12), swapped_int67); ++- ++- inputPtr += 16; ++- } ++- ++- for(number = n16points * 16; number < num_points; ++number){ ++- uint16_t output = *inputPtr; ++- output = (((output >> 8) & 0xff) | ((output << 8) & 0xff00)); ++- *inputPtr = output; ++- inputPtr++; ++- } ++ } ++ #endif /* LV_HAVE_NEON */ ++ ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void volk_16u_byteswap_a_generic(uint16_t* intsToSwap, unsigned int num_points){ ++- unsigned int point; ++- uint16_t* inputPtr = intsToSwap; ++- for(point = 0; point < num_points; point++){ ++- uint16_t output = *inputPtr; ++- output = (((output >> 8) & 0xff) | ((output << 8) & 0xff00)); ++- *inputPtr = output; ++- inputPtr++; ++- } +++static inline void volk_16u_byteswap_a_generic(uint16_t* intsToSwap, +++ unsigned int num_points) +++{ +++ unsigned int point; +++ uint16_t* inputPtr = intsToSwap; +++ for (point = 0; point < num_points; point++) { +++ uint16_t output = *inputPtr; +++ output = (((output >> 8) & 0xff) | ((output << 8) & 0xff00)); +++ *inputPtr = output; +++ inputPtr++; +++ } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++ #ifdef LV_HAVE_ORC ++ ++ extern void volk_16u_byteswap_a_orc_impl(uint16_t* intsToSwap, unsigned int num_points); ++-static inline void volk_16u_byteswap_u_orc(uint16_t* intsToSwap, unsigned int num_points){ +++static inline void volk_16u_byteswap_u_orc(uint16_t* intsToSwap, unsigned int num_points) +++{ ++ volk_16u_byteswap_a_orc_impl(intsToSwap, num_points); ++ } ++ #endif /* LV_HAVE_ORC */ ++diff --git a/kernels/volk/volk_16u_byteswappuppet_16u.h b/kernels/volk/volk_16u_byteswappuppet_16u.h ++index d3c8c5d..8cb1318 
100644 ++--- a/kernels/volk/volk_16u_byteswappuppet_16u.h +++++ b/kernels/volk/volk_16u_byteswappuppet_16u.h ++@@ -3,69 +3,83 @@ ++ ++ ++ #include ++-#include ++ #include +++#include ++ ++ #ifdef LV_HAVE_GENERIC ++-static inline void volk_16u_byteswappuppet_16u_generic(uint16_t*output, uint16_t* intsToSwap, unsigned int num_points){ +++static inline void volk_16u_byteswappuppet_16u_generic(uint16_t* output, +++ uint16_t* intsToSwap, +++ unsigned int num_points) +++{ ++ ++ volk_16u_byteswap_generic((uint16_t*)intsToSwap, num_points); ++ memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint16_t)); ++- ++ } ++ #endif ++ ++ #ifdef LV_HAVE_NEON ++-static inline void volk_16u_byteswappuppet_16u_neon(uint16_t*output, uint16_t* intsToSwap, unsigned int num_points){ +++static inline void volk_16u_byteswappuppet_16u_neon(uint16_t* output, +++ uint16_t* intsToSwap, +++ unsigned int num_points) +++{ ++ ++ volk_16u_byteswap_neon((uint16_t*)intsToSwap, num_points); ++ memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint16_t)); ++- ++ } ++ #endif ++ ++ #ifdef LV_HAVE_NEON ++-static inline void volk_16u_byteswappuppet_16u_neon_table(uint16_t*output, uint16_t* intsToSwap, unsigned int num_points){ +++static inline void volk_16u_byteswappuppet_16u_neon_table(uint16_t* output, +++ uint16_t* intsToSwap, +++ unsigned int num_points) +++{ ++ ++ volk_16u_byteswap_neon_table((uint16_t*)intsToSwap, num_points); ++ memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint16_t)); ++- ++ } ++ #endif ++ ++ #ifdef LV_HAVE_SSE2 ++-static inline void volk_16u_byteswappuppet_16u_u_sse2(uint16_t *output, uint16_t* intsToSwap, unsigned int num_points){ +++static inline void volk_16u_byteswappuppet_16u_u_sse2(uint16_t* output, +++ uint16_t* intsToSwap, +++ unsigned int num_points) +++{ ++ ++ volk_16u_byteswap_u_sse2((uint16_t*)intsToSwap, num_points); ++ memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint16_t)); ++- ++ } ++ #endif ++ ++ #ifdef LV_HAVE_SSE2 ++-static inline void volk_16u_byteswappuppet_16u_a_sse2(uint16_t *output, uint16_t* intsToSwap, unsigned int num_points){ +++static inline void volk_16u_byteswappuppet_16u_a_sse2(uint16_t* output, +++ uint16_t* intsToSwap, +++ unsigned int num_points) +++{ ++ ++ volk_16u_byteswap_a_sse2((uint16_t*)intsToSwap, num_points); ++ memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint16_t)); ++- ++ } ++ #endif ++ ++ #ifdef LV_HAVE_AVX2 ++-static inline void volk_16u_byteswappuppet_16u_u_avx2(uint16_t *output, uint16_t* intsToSwap, unsigned int num_points){ +++static inline void volk_16u_byteswappuppet_16u_u_avx2(uint16_t* output, +++ uint16_t* intsToSwap, +++ unsigned int num_points) +++{ ++ ++ volk_16u_byteswap_u_avx2((uint16_t*)intsToSwap, num_points); ++ memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint16_t)); ++- ++ } ++ #endif ++ ++ #ifdef LV_HAVE_AVX2 ++-static inline void volk_16u_byteswappuppet_16u_a_avx2(uint16_t *output, uint16_t* intsToSwap, unsigned int num_points){ +++static inline void volk_16u_byteswappuppet_16u_a_avx2(uint16_t* output, +++ uint16_t* intsToSwap, +++ unsigned int num_points) +++{ ++ ++ volk_16u_byteswap_a_avx2((uint16_t*)intsToSwap, num_points); ++ memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint16_t)); ++- ++ } ++ #endif ++ ++diff --git a/kernels/volk/volk_32f_64f_add_64f.h b/kernels/volk/volk_32f_64f_add_64f.h ++index 770c27e..d00ada5 100644 ++--- a/kernels/volk/volk_32f_64f_add_64f.h +++++ b/kernels/volk/volk_32f_64f_add_64f.h ++@@ -77,18 +77,19 @@ ++ ++ 
#ifdef LV_HAVE_GENERIC ++ ++-static inline void volk_32f_64f_add_64f_generic(double *cVector, ++- const float *aVector, ++- const double *bVector, ++- unsigned int num_points) { ++- double *cPtr = cVector; ++- const float *aPtr = aVector; ++- const double *bPtr = bVector; ++- unsigned int number = 0; ++- ++- for (number = 0; number < num_points; number++) { ++- *cPtr++ = ((double)(*aPtr++)) + (*bPtr++); ++- } +++static inline void volk_32f_64f_add_64f_generic(double* cVector, +++ const float* aVector, +++ const double* bVector, +++ unsigned int num_points) +++{ +++ double* cPtr = cVector; +++ const float* aPtr = aVector; +++ const double* bPtr = bVector; +++ unsigned int number = 0; +++ +++ for (number = 0; number < num_points; number++) { +++ *cPtr++ = ((double)(*aPtr++)) + (*bPtr++); +++ } ++ } ++ ++ #endif /* LV_HAVE_GENERIC */ ++@@ -96,42 +97,43 @@ static inline void volk_32f_64f_add_64f_generic(double *cVector, ++ #ifdef LV_HAVE_NEONV8 ++ #include ++ ++-static inline void volk_32f_64f_add_64f_neon(double *cVector, ++- const float *aVector, ++- const double *bVector, ++- unsigned int num_points) { ++- unsigned int number = 0; ++- const unsigned int half_points = num_points / 2; ++- ++- double *cPtr = cVector; ++- const float *aPtr = aVector; ++- const double *bPtr = bVector; ++- ++- float64x2_t aVal, bVal, cVal; ++- float32x2_t aVal1; ++- for (number = 0; number < half_points; number++) { ++- // Load in to NEON registers ++- aVal1 = vld1_f32(aPtr); ++- bVal = vld1q_f64(bPtr); ++- __VOLK_PREFETCH(aPtr + 2); ++- __VOLK_PREFETCH(bPtr + 2); ++- aPtr += 2; // q uses quadwords, 4 floats per vadd ++- bPtr += 2; ++- ++- // Vector conversion ++- aVal = vcvt_f64_f32(aVal1); ++- // vector add ++- cVal = vaddq_f64(aVal, bVal); ++- // Store the results back into the C container ++- vst1q_f64(cPtr, cVal); ++- ++- cPtr += 2; ++- } ++- ++- number = half_points * 2; // should be = num_points ++- for (; number < num_points; number++) { ++- *cPtr++ = ((double)(*aPtr++)) + (*bPtr++); ++- } +++static inline void volk_32f_64f_add_64f_neon(double* cVector, +++ const float* aVector, +++ const double* bVector, +++ unsigned int num_points) +++{ +++ unsigned int number = 0; +++ const unsigned int half_points = num_points / 2; +++ +++ double* cPtr = cVector; +++ const float* aPtr = aVector; +++ const double* bPtr = bVector; +++ +++ float64x2_t aVal, bVal, cVal; +++ float32x2_t aVal1; +++ for (number = 0; number < half_points; number++) { +++ // Load in to NEON registers +++ aVal1 = vld1_f32(aPtr); +++ bVal = vld1q_f64(bPtr); +++ __VOLK_PREFETCH(aPtr + 2); +++ __VOLK_PREFETCH(bPtr + 2); +++ aPtr += 2; // q uses quadwords, 4 floats per vadd +++ bPtr += 2; +++ +++ // Vector conversion +++ aVal = vcvt_f64_f32(aVal1); +++ // vector add +++ cVal = vaddq_f64(aVal, bVal); +++ // Store the results back into the C container +++ vst1q_f64(cPtr, cVal); +++ +++ cPtr += 2; +++ } +++ +++ number = half_points * 2; // should be = num_points +++ for (; number < num_points; number++) { +++ *cPtr++ = ((double)(*aPtr++)) + (*bPtr++); +++ } ++ } ++ ++ #endif /* LV_HAVE_NEONV8 */ ++@@ -141,49 +143,50 @@ static inline void volk_32f_64f_add_64f_neon(double *cVector, ++ #include ++ #include ++ ++-static inline void volk_32f_64f_add_64f_u_avx(double *cVector, ++- const float *aVector, ++- const double *bVector, ++- unsigned int num_points) { ++- unsigned int number = 0; ++- const unsigned int eighth_points = num_points / 8; ++- ++- double *cPtr = cVector; ++- const float *aPtr = aVector; ++- const double *bPtr = bVector; ++- ++- __m256 
aVal; ++- __m128 aVal1, aVal2; ++- __m256d aDbl1, aDbl2, bVal1, bVal2, cVal1, cVal2; ++- for (; number < eighth_points; number++) { ++- ++- aVal = _mm256_loadu_ps(aPtr); ++- bVal1 = _mm256_loadu_pd(bPtr); ++- bVal2 = _mm256_loadu_pd(bPtr + 4); ++- ++- aVal1 = _mm256_extractf128_ps(aVal, 0); ++- aVal2 = _mm256_extractf128_ps(aVal, 1); ++- ++- aDbl1 = _mm256_cvtps_pd(aVal1); ++- aDbl2 = _mm256_cvtps_pd(aVal2); ++- ++- cVal1 = _mm256_add_pd(aDbl1, bVal1); ++- cVal2 = _mm256_add_pd(aDbl2, bVal2); ++- ++- _mm256_storeu_pd(cPtr, ++- cVal1); // Store the results back into the C container ++- _mm256_storeu_pd(cPtr + 4, ++- cVal2); // Store the results back into the C container ++- ++- aPtr += 8; ++- bPtr += 8; ++- cPtr += 8; ++- } ++- ++- number = eighth_points * 8; ++- for (; number < num_points; number++) { ++- *cPtr++ = ((double)(*aPtr++)) + (*bPtr++); ++- } +++static inline void volk_32f_64f_add_64f_u_avx(double* cVector, +++ const float* aVector, +++ const double* bVector, +++ unsigned int num_points) +++{ +++ unsigned int number = 0; +++ const unsigned int eighth_points = num_points / 8; +++ +++ double* cPtr = cVector; +++ const float* aPtr = aVector; +++ const double* bPtr = bVector; +++ +++ __m256 aVal; +++ __m128 aVal1, aVal2; +++ __m256d aDbl1, aDbl2, bVal1, bVal2, cVal1, cVal2; +++ for (; number < eighth_points; number++) { +++ +++ aVal = _mm256_loadu_ps(aPtr); +++ bVal1 = _mm256_loadu_pd(bPtr); +++ bVal2 = _mm256_loadu_pd(bPtr + 4); +++ +++ aVal1 = _mm256_extractf128_ps(aVal, 0); +++ aVal2 = _mm256_extractf128_ps(aVal, 1); +++ +++ aDbl1 = _mm256_cvtps_pd(aVal1); +++ aDbl2 = _mm256_cvtps_pd(aVal2); +++ +++ cVal1 = _mm256_add_pd(aDbl1, bVal1); +++ cVal2 = _mm256_add_pd(aDbl2, bVal2); +++ +++ _mm256_storeu_pd(cPtr, +++ cVal1); // Store the results back into the C container +++ _mm256_storeu_pd(cPtr + 4, +++ cVal2); // Store the results back into the C container +++ +++ aPtr += 8; +++ bPtr += 8; +++ cPtr += 8; +++ } +++ +++ number = eighth_points * 8; +++ for (; number < num_points; number++) { +++ *cPtr++ = ((double)(*aPtr++)) + (*bPtr++); +++ } ++ } ++ ++ #endif /* LV_HAVE_AVX */ ++@@ -193,48 +196,49 @@ static inline void volk_32f_64f_add_64f_u_avx(double *cVector, ++ #include ++ #include ++ ++-static inline void volk_32f_64f_add_64f_a_avx(double *cVector, ++- const float *aVector, ++- const double *bVector, ++- unsigned int num_points) { ++- unsigned int number = 0; ++- const unsigned int eighth_points = num_points / 8; ++- ++- double *cPtr = cVector; ++- const float *aPtr = aVector; ++- const double *bPtr = bVector; ++- ++- __m256 aVal; ++- __m128 aVal1, aVal2; ++- __m256d aDbl1, aDbl2, bVal1, bVal2, cVal1, cVal2; ++- for (; number < eighth_points; number++) { ++- ++- aVal = _mm256_load_ps(aPtr); ++- bVal1 = _mm256_load_pd(bPtr); ++- bVal2 = _mm256_load_pd(bPtr + 4); ++- ++- aVal1 = _mm256_extractf128_ps(aVal, 0); ++- aVal2 = _mm256_extractf128_ps(aVal, 1); ++- ++- aDbl1 = _mm256_cvtps_pd(aVal1); ++- aDbl2 = _mm256_cvtps_pd(aVal2); ++- ++- cVal1 = _mm256_add_pd(aDbl1, bVal1); ++- cVal2 = _mm256_add_pd(aDbl2, bVal2); ++- ++- _mm256_store_pd(cPtr, cVal1); // Store the results back into the C container ++- _mm256_store_pd(cPtr + 4, ++- cVal2); // Store the results back into the C container ++- ++- aPtr += 8; ++- bPtr += 8; ++- cPtr += 8; ++- } ++- ++- number = eighth_points * 8; ++- for (; number < num_points; number++) { ++- *cPtr++ = ((double)(*aPtr++)) + (*bPtr++); ++- } +++static inline void volk_32f_64f_add_64f_a_avx(double* cVector, +++ const float* aVector, +++ const double* 
bVector, +++ unsigned int num_points) +++{ +++ unsigned int number = 0; +++ const unsigned int eighth_points = num_points / 8; +++ +++ double* cPtr = cVector; +++ const float* aPtr = aVector; +++ const double* bPtr = bVector; +++ +++ __m256 aVal; +++ __m128 aVal1, aVal2; +++ __m256d aDbl1, aDbl2, bVal1, bVal2, cVal1, cVal2; +++ for (; number < eighth_points; number++) { +++ +++ aVal = _mm256_load_ps(aPtr); +++ bVal1 = _mm256_load_pd(bPtr); +++ bVal2 = _mm256_load_pd(bPtr + 4); +++ +++ aVal1 = _mm256_extractf128_ps(aVal, 0); +++ aVal2 = _mm256_extractf128_ps(aVal, 1); +++ +++ aDbl1 = _mm256_cvtps_pd(aVal1); +++ aDbl2 = _mm256_cvtps_pd(aVal2); +++ +++ cVal1 = _mm256_add_pd(aDbl1, bVal1); +++ cVal2 = _mm256_add_pd(aDbl2, bVal2); +++ +++ _mm256_store_pd(cPtr, cVal1); // Store the results back into the C container +++ _mm256_store_pd(cPtr + 4, +++ cVal2); // Store the results back into the C container +++ +++ aPtr += 8; +++ bPtr += 8; +++ cPtr += 8; +++ } +++ +++ number = eighth_points * 8; +++ for (; number < num_points; number++) { +++ *cPtr++ = ((double)(*aPtr++)) + (*bPtr++); +++ } ++ } ++ ++ #endif /* LV_HAVE_AVX */ ++diff --git a/kernels/volk/volk_32f_64f_multiply_64f.h b/kernels/volk/volk_32f_64f_multiply_64f.h ++index 50f08a1..1039850 100644 ++--- a/kernels/volk/volk_32f_64f_multiply_64f.h +++++ b/kernels/volk/volk_32f_64f_multiply_64f.h ++@@ -31,8 +31,8 @@ ++ * ++ * Dispatcher Prototype ++ * \code ++- * void volk_32f_64f_multiply_64f(double* cVector, const double* aVector, const double* bVector, unsigned int num_points) ++- * \endcode +++ * void volk_32f_64f_multiply_64f(double* cVector, const double* aVector, const double* +++ * bVector, unsigned int num_points) \endcode ++ * ++ * \b Inputs ++ * \li aVector: First input vector. ++@@ -76,18 +76,19 @@ ++ ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void ++-volk_32f_64f_multiply_64f_generic(double *cVector, const float *aVector, ++- const double *bVector, unsigned int num_points) +++static inline void volk_32f_64f_multiply_64f_generic(double* cVector, +++ const float* aVector, +++ const double* bVector, +++ unsigned int num_points) ++ { ++- double *cPtr = cVector; ++- const float *aPtr = aVector; ++- const double *bPtr = bVector; ++- unsigned int number = 0; ++- ++- for (number = 0; number < num_points; number++) { ++- *cPtr++ = ((double)(*aPtr++)) * (*bPtr++); ++- } +++ double* cPtr = cVector; +++ const float* aPtr = aVector; +++ const double* bPtr = bVector; +++ unsigned int number = 0; +++ +++ for (number = 0; number < num_points; number++) { +++ *cPtr++ = ((double)(*aPtr++)) * (*bPtr++); +++ } ++ } ++ ++ #endif /* LV_HAVE_GENERIC */ ++@@ -102,47 +103,48 @@ volk_32f_64f_multiply_64f_generic(double *cVector, const float *aVector, ++ #include ++ #include ++ ++-static inline void ++-volk_32f_64f_multiply_64f_u_avx(double *cVector, const float *aVector, ++- const double *bVector, unsigned int num_points) +++static inline void volk_32f_64f_multiply_64f_u_avx(double* cVector, +++ const float* aVector, +++ const double* bVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int eighth_points = num_points / 8; +++ unsigned int number = 0; +++ const unsigned int eighth_points = num_points / 8; ++ ++- double *cPtr = cVector; ++- const float *aPtr = aVector; ++- const double *bPtr = bVector; +++ double* cPtr = cVector; +++ const float* aPtr = aVector; +++ const double* bPtr = bVector; ++ ++- __m256 aVal; ++- __m128 aVal1, aVal2; ++- __m256d aDbl1, aDbl2, bVal1, bVal2, cVal1, cVal2; ++- for (; number < 
eighth_points; number++) { +++ __m256 aVal; +++ __m128 aVal1, aVal2; +++ __m256d aDbl1, aDbl2, bVal1, bVal2, cVal1, cVal2; +++ for (; number < eighth_points; number++) { ++ ++- aVal = _mm256_loadu_ps(aPtr); ++- bVal1 = _mm256_loadu_pd(bPtr); ++- bVal2 = _mm256_loadu_pd(bPtr+4); +++ aVal = _mm256_loadu_ps(aPtr); +++ bVal1 = _mm256_loadu_pd(bPtr); +++ bVal2 = _mm256_loadu_pd(bPtr + 4); ++ ++- aVal1 = _mm256_extractf128_ps(aVal, 0); ++- aVal2 = _mm256_extractf128_ps(aVal, 1); +++ aVal1 = _mm256_extractf128_ps(aVal, 0); +++ aVal2 = _mm256_extractf128_ps(aVal, 1); ++ ++- aDbl1 = _mm256_cvtps_pd(aVal1); ++- aDbl2 = _mm256_cvtps_pd(aVal2); +++ aDbl1 = _mm256_cvtps_pd(aVal1); +++ aDbl2 = _mm256_cvtps_pd(aVal2); ++ ++- cVal1 = _mm256_mul_pd(aDbl1, bVal1); ++- cVal2 = _mm256_mul_pd(aDbl2, bVal2); +++ cVal1 = _mm256_mul_pd(aDbl1, bVal1); +++ cVal2 = _mm256_mul_pd(aDbl2, bVal2); ++ ++- _mm256_storeu_pd(cPtr, cVal1); // Store the results back into the C container ++- _mm256_storeu_pd(cPtr+4, cVal2); // Store the results back into the C container +++ _mm256_storeu_pd(cPtr, cVal1); // Store the results back into the C container +++ _mm256_storeu_pd(cPtr + 4, cVal2); // Store the results back into the C container ++ ++- aPtr += 8; ++- bPtr += 8; ++- cPtr += 8; ++- } +++ aPtr += 8; +++ bPtr += 8; +++ cPtr += 8; +++ } ++ ++- number = eighth_points * 8; ++- for (; number < num_points; number++) { ++- *cPtr++ = ((double)(*aPtr++)) * (*bPtr++); ++- } +++ number = eighth_points * 8; +++ for (; number < num_points; number++) { +++ *cPtr++ = ((double)(*aPtr++)) * (*bPtr++); +++ } ++ } ++ ++ #endif /* LV_HAVE_AVX */ ++@@ -153,51 +155,51 @@ volk_32f_64f_multiply_64f_u_avx(double *cVector, const float *aVector, ++ #include ++ #include ++ ++-static inline void ++-volk_32f_64f_multiply_64f_a_avx(double *cVector, const float *aVector, ++- const double *bVector, unsigned int num_points) +++static inline void volk_32f_64f_multiply_64f_a_avx(double* cVector, +++ const float* aVector, +++ const double* bVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int eighth_points = num_points / 8; +++ unsigned int number = 0; +++ const unsigned int eighth_points = num_points / 8; ++ ++- double *cPtr = cVector; ++- const float *aPtr = aVector; ++- const double *bPtr = bVector; +++ double* cPtr = cVector; +++ const float* aPtr = aVector; +++ const double* bPtr = bVector; ++ ++- __m256 aVal; ++- __m128 aVal1, aVal2; ++- __m256d aDbl1, aDbl2, bVal1, bVal2, cVal1, cVal2; ++- for (; number < eighth_points; number++) { +++ __m256 aVal; +++ __m128 aVal1, aVal2; +++ __m256d aDbl1, aDbl2, bVal1, bVal2, cVal1, cVal2; +++ for (; number < eighth_points; number++) { ++ ++- aVal = _mm256_load_ps(aPtr); ++- bVal1 = _mm256_load_pd(bPtr); ++- bVal2 = _mm256_load_pd(bPtr+4); +++ aVal = _mm256_load_ps(aPtr); +++ bVal1 = _mm256_load_pd(bPtr); +++ bVal2 = _mm256_load_pd(bPtr + 4); ++ ++- aVal1 = _mm256_extractf128_ps(aVal, 0); ++- aVal2 = _mm256_extractf128_ps(aVal, 1); +++ aVal1 = _mm256_extractf128_ps(aVal, 0); +++ aVal2 = _mm256_extractf128_ps(aVal, 1); ++ ++- aDbl1 = _mm256_cvtps_pd(aVal1); ++- aDbl2 = _mm256_cvtps_pd(aVal2); +++ aDbl1 = _mm256_cvtps_pd(aVal1); +++ aDbl2 = _mm256_cvtps_pd(aVal2); ++ ++- cVal1 = _mm256_mul_pd(aDbl1, bVal1); ++- cVal2 = _mm256_mul_pd(aDbl2, bVal2); +++ cVal1 = _mm256_mul_pd(aDbl1, bVal1); +++ cVal2 = _mm256_mul_pd(aDbl2, bVal2); ++ ++- _mm256_store_pd(cPtr, cVal1); // Store the results back into the C container ++- _mm256_store_pd(cPtr+4, cVal2); // Store the results back into 
the C container +++ _mm256_store_pd(cPtr, cVal1); // Store the results back into the C container +++ _mm256_store_pd(cPtr + 4, cVal2); // Store the results back into the C container ++ ++- aPtr += 8; ++- bPtr += 8; ++- cPtr += 8; ++- } +++ aPtr += 8; +++ bPtr += 8; +++ cPtr += 8; +++ } ++ ++- number = eighth_points * 8; ++- for (; number < num_points; number++) { ++- *cPtr++ = ((double)(*aPtr++)) * (*bPtr++); ++- } +++ number = eighth_points * 8; +++ for (; number < num_points; number++) { +++ *cPtr++ = ((double)(*aPtr++)) * (*bPtr++); +++ } ++ } ++ ++ #endif /* LV_HAVE_AVX */ ++ ++ ++- ++ #endif /* INCLUDED_volk_32f_64f_multiply_64f_u_H */ ++diff --git a/kernels/volk/volk_32f_8u_polarbutterfly_32f.h b/kernels/volk/volk_32f_8u_polarbutterfly_32f.h ++index 4aba6c4..2198b33 100644 ++--- a/kernels/volk/volk_32f_8u_polarbutterfly_32f.h +++++ b/kernels/volk/volk_32f_8u_polarbutterfly_32f.h ++@@ -51,14 +51,17 @@ ++ * int frame_exp = 10; ++ * int frame_size = 0x01 << frame_exp; ++ * ++- * float* llrs = (float*) volk_malloc(sizeof(float) * frame_size * (frame_exp + 1), volk_get_alignment()); ++- * unsigned char* u = (unsigned char) volk_malloc(sizeof(unsigned char) * frame_size * (frame_exp + 1), volk_get_alignment()); +++ * float* llrs = (float*) volk_malloc(sizeof(float) * frame_size * (frame_exp + 1), +++ * volk_get_alignment()); unsigned char* u = (unsigned char) volk_malloc(sizeof(unsigned +++ * char) * frame_size * (frame_exp + 1), volk_get_alignment()); ++ * ++- * {some_function_to_write_encoded_bits_to_float_llrs(llrs + frame_size * frame_exp, data)}; +++ * {some_function_to_write_encoded_bits_to_float_llrs(llrs + frame_size * frame_exp, +++ * data)}; ++ * ++ * unsigned int u_num; ++ * for(u_num = 0; u_num < frame_size; u_num++){ ++- * volk_32f_8u_polarbutterfly_32f_u_avx(llrs, u, frame_size, frame_exp, 0, u_num, u_num); +++ * volk_32f_8u_polarbutterfly_32f_u_avx(llrs, u, frame_size, frame_exp, 0, u_num, +++ * u_num); ++ * // next line could first search for frozen bit value and then do bit decision. ++ * u[u_num] = llrs[u_num] > 0 ? 0 : 1; ++ * } ++@@ -73,130 +76,131 @@ ++ #include ++ #include ++ ++-static inline float ++-llr_odd(const float la, const float lb) +++static inline float llr_odd(const float la, const float lb) ++ { ++- const float ala = fabsf(la); ++- const float alb = fabsf(lb); ++- return copysignf(1.0f, la) * copysignf(1.0f, lb) * (ala > alb ? alb : ala); +++ const float ala = fabsf(la); +++ const float alb = fabsf(lb); +++ return copysignf(1.0f, la) * copysignf(1.0f, lb) * (ala > alb ? 
alb : ala); ++ } ++ ++-static inline void ++-llr_odd_stages(float* llrs, int min_stage, const int depth, const int frame_size, const int row) +++static inline void llr_odd_stages( +++ float* llrs, int min_stage, const int depth, const int frame_size, const int row) ++ { ++- int loop_stage = depth - 1; ++- float* dst_llr_ptr; ++- float* src_llr_ptr; ++- int stage_size = 0x01 << loop_stage; ++- ++- int el; ++- while(min_stage <= loop_stage){ ++- dst_llr_ptr = llrs + loop_stage * frame_size + row; ++- src_llr_ptr = dst_llr_ptr + frame_size; ++- for(el = 0; el < stage_size; el++){ ++- *dst_llr_ptr++ = llr_odd(*src_llr_ptr, *(src_llr_ptr + 1)); ++- src_llr_ptr += 2; +++ int loop_stage = depth - 1; +++ float* dst_llr_ptr; +++ float* src_llr_ptr; +++ int stage_size = 0x01 << loop_stage; +++ +++ int el; +++ while (min_stage <= loop_stage) { +++ dst_llr_ptr = llrs + loop_stage * frame_size + row; +++ src_llr_ptr = dst_llr_ptr + frame_size; +++ for (el = 0; el < stage_size; el++) { +++ *dst_llr_ptr++ = llr_odd(*src_llr_ptr, *(src_llr_ptr + 1)); +++ src_llr_ptr += 2; +++ } +++ +++ --loop_stage; +++ stage_size >>= 1; ++ } ++- ++- --loop_stage; ++- stage_size >>= 1; ++- } ++ } ++ ++-static inline float ++-llr_even(const float la, const float lb, const unsigned char f) +++static inline float llr_even(const float la, const float lb, const unsigned char f) ++ { ++- switch(f){ +++ switch (f) { ++ case 0: ++- return lb + la; +++ return lb + la; ++ default: ++- return lb - la; ++- } +++ return lb - la; +++ } ++ } ++ ++ static inline void ++ even_u_values(unsigned char* u_even, const unsigned char* u, const int u_num) ++ { ++- u++; ++- int i; ++- for(i = 1; i < u_num; i += 2){ ++- *u_even++ = *u; ++- u += 2; ++- } +++ u++; +++ int i; +++ for (i = 1; i < u_num; i += 2) { +++ *u_even++ = *u; +++ u += 2; +++ } ++ } ++ ++ static inline void ++ odd_xor_even_values(unsigned char* u_xor, const unsigned char* u, const int u_num) ++ { ++- int i; ++- for(i = 1; i < u_num; i += 2){ ++- *u_xor++ = *u ^ *(u + 1); ++- u += 2; ++- } +++ int i; +++ for (i = 1; i < u_num; i += 2) { +++ *u_xor++ = *u ^ *(u + 1); +++ u += 2; +++ } ++ } ++ ++-static inline int ++-calculate_max_stage_depth_for_row(const int frame_exp, const int row) +++static inline int calculate_max_stage_depth_for_row(const int frame_exp, const int row) ++ { ++- int max_stage_depth = 0; ++- int half_stage_size = 0x01; ++- int stage_size = half_stage_size << 1; ++- while(max_stage_depth < (frame_exp - 1)){ // last stage holds received values. ++- if(!(row % stage_size < half_stage_size)){ ++- break; +++ int max_stage_depth = 0; +++ int half_stage_size = 0x01; +++ int stage_size = half_stage_size << 1; +++ while (max_stage_depth < (frame_exp - 1)) { // last stage holds received values. 
+++ if (!(row % stage_size < half_stage_size)) { +++ break; +++ } +++ half_stage_size <<= 1; +++ stage_size <<= 1; +++ max_stage_depth++; ++ } ++- half_stage_size <<= 1; ++- stage_size <<= 1; ++- max_stage_depth++; ++- } ++- return max_stage_depth; +++ return max_stage_depth; ++ } ++ ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void ++-volk_32f_8u_polarbutterfly_32f_generic(float* llrs, unsigned char* u, ++- const int frame_exp, ++- const int stage, const int u_num, const int row) +++static inline void volk_32f_8u_polarbutterfly_32f_generic(float* llrs, +++ unsigned char* u, +++ const int frame_exp, +++ const int stage, +++ const int u_num, +++ const int row) ++ { ++- const int frame_size = 0x01 << frame_exp; ++- const int next_stage = stage + 1; +++ const int frame_size = 0x01 << frame_exp; +++ const int next_stage = stage + 1; ++ ++- const int half_stage_size = 0x01 << stage; ++- const int stage_size = half_stage_size << 1; +++ const int half_stage_size = 0x01 << stage; +++ const int stage_size = half_stage_size << 1; ++ ++- const bool is_upper_stage_half = row % stage_size < half_stage_size; +++ const bool is_upper_stage_half = row % stage_size < half_stage_size; ++ ++-// // this is a natural bit order impl ++- float* next_llrs = llrs + frame_size;// LLRs are stored in a consecutive array. ++- float* call_row_llr = llrs + row; +++ // // this is a natural bit order impl +++ float* next_llrs = llrs + frame_size; // LLRs are stored in a consecutive array. +++ float* call_row_llr = llrs + row; ++ ++- const int section = row - (row % stage_size); ++- const int jump_size = ((row % half_stage_size) << 1) % stage_size; +++ const int section = row - (row % stage_size); +++ const int jump_size = ((row % half_stage_size) << 1) % stage_size; ++ ++- const int next_upper_row = section + jump_size; ++- const int next_lower_row = next_upper_row + 1; +++ const int next_upper_row = section + jump_size; +++ const int next_lower_row = next_upper_row + 1; ++ ++- const float* upper_right_llr_ptr = next_llrs + next_upper_row; ++- const float* lower_right_llr_ptr = next_llrs + next_lower_row; +++ const float* upper_right_llr_ptr = next_llrs + next_upper_row; +++ const float* lower_right_llr_ptr = next_llrs + next_lower_row; ++ ++- if(!is_upper_stage_half){ ++- const int u_pos = u_num >> stage; ++- const unsigned char f = u[u_pos - 1]; ++- *call_row_llr = llr_even(*upper_right_llr_ptr, *lower_right_llr_ptr, f); ++- return; ++- } +++ if (!is_upper_stage_half) { +++ const int u_pos = u_num >> stage; +++ const unsigned char f = u[u_pos - 1]; +++ *call_row_llr = llr_even(*upper_right_llr_ptr, *lower_right_llr_ptr, f); +++ return; +++ } ++ ++- if(frame_exp > next_stage){ ++- unsigned char* u_half = u + frame_size; ++- odd_xor_even_values(u_half, u, u_num); ++- volk_32f_8u_polarbutterfly_32f_generic(next_llrs, u_half, frame_exp, next_stage, u_num, next_upper_row); +++ if (frame_exp > next_stage) { +++ unsigned char* u_half = u + frame_size; +++ odd_xor_even_values(u_half, u, u_num); +++ volk_32f_8u_polarbutterfly_32f_generic( +++ next_llrs, u_half, frame_exp, next_stage, u_num, next_upper_row); ++ ++- even_u_values(u_half, u, u_num); ++- volk_32f_8u_polarbutterfly_32f_generic(next_llrs, u_half, frame_exp, next_stage, u_num, next_lower_row); ++- } +++ even_u_values(u_half, u, u_num); +++ volk_32f_8u_polarbutterfly_32f_generic( +++ next_llrs, u_half, frame_exp, next_stage, u_num, next_lower_row); +++ } ++ ++- *call_row_llr = llr_odd(*upper_right_llr_ptr, *lower_right_llr_ptr); +++ *call_row_llr = 
llr_odd(*upper_right_llr_ptr, *lower_right_llr_ptr); ++ } ++ ++ #endif /* LV_HAVE_GENERIC */ ++@@ -206,99 +210,99 @@ volk_32f_8u_polarbutterfly_32f_generic(float* llrs, unsigned char* u, ++ #include ++ #include ++ ++-static inline void ++-volk_32f_8u_polarbutterfly_32f_u_avx(float* llrs, unsigned char* u, ++- const int frame_exp, ++- const int stage, const int u_num, const int row) +++static inline void volk_32f_8u_polarbutterfly_32f_u_avx(float* llrs, +++ unsigned char* u, +++ const int frame_exp, +++ const int stage, +++ const int u_num, +++ const int row) ++ { ++- const int frame_size = 0x01 << frame_exp; ++- if(row % 2){ // for odd rows just do the only necessary calculation and return. ++- const float* next_llrs = llrs + frame_size + row; ++- *(llrs + row) = llr_even(*(next_llrs - 1), *next_llrs, u[u_num - 1]); ++- return; ++- } ++- ++- const int max_stage_depth = calculate_max_stage_depth_for_row(frame_exp, row); ++- if(max_stage_depth < 3){ // vectorized version needs larger vectors. ++- volk_32f_8u_polarbutterfly_32f_generic(llrs, u, frame_exp, stage, u_num, row); ++- return; ++- } ++- ++- int loop_stage = max_stage_depth; ++- int stage_size = 0x01 << loop_stage; ++- ++- float* src_llr_ptr; ++- float* dst_llr_ptr; ++- ++- __m256 src0, src1, dst; ++- ++- if(row){ // not necessary for ZERO row. == first bit to be decoded. ++- // first do bit combination for all stages ++- // effectively encode some decoded bits again. ++- unsigned char* u_target = u + frame_size; ++- unsigned char* u_temp = u + 2* frame_size; ++- memcpy(u_temp, u + u_num - stage_size, sizeof(unsigned char) * stage_size); ++- ++- if(stage_size > 15){ ++- _mm256_zeroupper(); ++- volk_8u_x2_encodeframepolar_8u_u_ssse3(u_target, u_temp, stage_size); +++ const int frame_size = 0x01 << frame_exp; +++ if (row % 2) { // for odd rows just do the only necessary calculation and return. +++ const float* next_llrs = llrs + frame_size + row; +++ *(llrs + row) = llr_even(*(next_llrs - 1), *next_llrs, u[u_num - 1]); +++ return; ++ } ++- else{ ++- volk_8u_x2_encodeframepolar_8u_generic(u_target, u_temp, stage_size); +++ +++ const int max_stage_depth = calculate_max_stage_depth_for_row(frame_exp, row); +++ if (max_stage_depth < 3) { // vectorized version needs larger vectors. +++ volk_32f_8u_polarbutterfly_32f_generic(llrs, u, frame_exp, stage, u_num, row); +++ return; ++ } ++ ++- src_llr_ptr = llrs + (max_stage_depth + 1) * frame_size + row - stage_size; ++- dst_llr_ptr = llrs + max_stage_depth * frame_size + row; +++ int loop_stage = max_stage_depth; +++ int stage_size = 0x01 << loop_stage; ++ ++- __m128i fbits; +++ float* src_llr_ptr; +++ float* dst_llr_ptr; ++ ++- int p; ++- for(p = 0; p < stage_size; p += 8){ ++- _mm256_zeroupper(); ++- fbits = _mm_loadu_si128((__m128i*) u_target); ++- u_target += 8; +++ __m256 src0, src1, dst; ++ ++- src0 = _mm256_loadu_ps(src_llr_ptr); ++- src1 = _mm256_loadu_ps(src_llr_ptr + 8); ++- src_llr_ptr += 16; +++ if (row) { // not necessary for ZERO row. == first bit to be decoded. +++ // first do bit combination for all stages +++ // effectively encode some decoded bits again. 
+++ unsigned char* u_target = u + frame_size; +++ unsigned char* u_temp = u + 2 * frame_size; +++ memcpy(u_temp, u + u_num - stage_size, sizeof(unsigned char) * stage_size); ++ ++- dst = _mm256_polar_fsign_add_llrs(src0, src1, fbits); +++ if (stage_size > 15) { +++ _mm256_zeroupper(); +++ volk_8u_x2_encodeframepolar_8u_u_ssse3(u_target, u_temp, stage_size); +++ } else { +++ volk_8u_x2_encodeframepolar_8u_generic(u_target, u_temp, stage_size); +++ } ++ ++- _mm256_storeu_ps(dst_llr_ptr, dst); ++- dst_llr_ptr += 8; ++- } +++ src_llr_ptr = llrs + (max_stage_depth + 1) * frame_size + row - stage_size; +++ dst_llr_ptr = llrs + max_stage_depth * frame_size + row; ++ ++- --loop_stage; ++- stage_size >>= 1; ++- } +++ __m128i fbits; ++ ++- const int min_stage = stage > 2 ? stage : 2; +++ int p; +++ for (p = 0; p < stage_size; p += 8) { +++ _mm256_zeroupper(); +++ fbits = _mm_loadu_si128((__m128i*)u_target); +++ u_target += 8; ++ ++- _mm256_zeroall(); // Important to clear cache! +++ src0 = _mm256_loadu_ps(src_llr_ptr); +++ src1 = _mm256_loadu_ps(src_llr_ptr + 8); +++ src_llr_ptr += 16; ++ ++- int el; ++- while(min_stage < loop_stage){ ++- dst_llr_ptr = llrs + loop_stage * frame_size + row; ++- src_llr_ptr = dst_llr_ptr + frame_size; ++- for(el = 0; el < stage_size; el += 8){ ++- src0 = _mm256_loadu_ps(src_llr_ptr); ++- src_llr_ptr += 8; ++- src1 = _mm256_loadu_ps(src_llr_ptr); ++- src_llr_ptr += 8; +++ dst = _mm256_polar_fsign_add_llrs(src0, src1, fbits); ++ ++- dst = _mm256_polar_minsum_llrs(src0, src1); +++ _mm256_storeu_ps(dst_llr_ptr, dst); +++ dst_llr_ptr += 8; +++ } ++ ++- _mm256_storeu_ps(dst_llr_ptr, dst); ++- dst_llr_ptr += 8; +++ --loop_stage; +++ stage_size >>= 1; ++ } ++ ++- --loop_stage; ++- stage_size >>= 1; +++ const int min_stage = stage > 2 ? stage : 2; +++ +++ _mm256_zeroall(); // Important to clear cache! ++ ++- } +++ int el; +++ while (min_stage < loop_stage) { +++ dst_llr_ptr = llrs + loop_stage * frame_size + row; +++ src_llr_ptr = dst_llr_ptr + frame_size; +++ for (el = 0; el < stage_size; el += 8) { +++ src0 = _mm256_loadu_ps(src_llr_ptr); +++ src_llr_ptr += 8; +++ src1 = _mm256_loadu_ps(src_llr_ptr); +++ src_llr_ptr += 8; ++ ++- // for stages < 3 vectors are too small!. ++- llr_odd_stages(llrs, stage, loop_stage + 1,frame_size, row); +++ dst = _mm256_polar_minsum_llrs(src0, src1); +++ +++ _mm256_storeu_ps(dst_llr_ptr, dst); +++ dst_llr_ptr += 8; +++ } +++ +++ --loop_stage; +++ stage_size >>= 1; +++ } +++ +++ // for stages < 3 vectors are too small!. +++ llr_odd_stages(llrs, stage, loop_stage + 1, frame_size, row); ++ } ++ ++ #endif /* LV_HAVE_AVX */ ++@@ -307,99 +311,99 @@ volk_32f_8u_polarbutterfly_32f_u_avx(float* llrs, unsigned char* u, ++ #include ++ #include ++ ++-static inline void ++-volk_32f_8u_polarbutterfly_32f_u_avx2(float* llrs, unsigned char* u, ++- const int frame_exp, ++- const int stage, const int u_num, const int row) +++static inline void volk_32f_8u_polarbutterfly_32f_u_avx2(float* llrs, +++ unsigned char* u, +++ const int frame_exp, +++ const int stage, +++ const int u_num, +++ const int row) ++ { ++- const int frame_size = 0x01 << frame_exp; ++- if(row % 2){ // for odd rows just do the only necessary calculation and return. ++- const float* next_llrs = llrs + frame_size + row; ++- *(llrs + row) = llr_even(*(next_llrs - 1), *next_llrs, u[u_num - 1]); ++- return; ++- } ++- ++- const int max_stage_depth = calculate_max_stage_depth_for_row(frame_exp, row); ++- if(max_stage_depth < 3){ // vectorized version needs larger vectors. 
++- volk_32f_8u_polarbutterfly_32f_generic(llrs, u, frame_exp, stage, u_num, row); ++- return; ++- } ++- ++- int loop_stage = max_stage_depth; ++- int stage_size = 0x01 << loop_stage; ++- ++- float* src_llr_ptr; ++- float* dst_llr_ptr; ++- ++- __m256 src0, src1, dst; ++- ++- if(row){ // not necessary for ZERO row. == first bit to be decoded. ++- // first do bit combination for all stages ++- // effectively encode some decoded bits again. ++- unsigned char* u_target = u + frame_size; ++- unsigned char* u_temp = u + 2* frame_size; ++- memcpy(u_temp, u + u_num - stage_size, sizeof(unsigned char) * stage_size); ++- ++- if(stage_size > 15){ ++- _mm256_zeroupper(); ++- volk_8u_x2_encodeframepolar_8u_u_ssse3(u_target, u_temp, stage_size); +++ const int frame_size = 0x01 << frame_exp; +++ if (row % 2) { // for odd rows just do the only necessary calculation and return. +++ const float* next_llrs = llrs + frame_size + row; +++ *(llrs + row) = llr_even(*(next_llrs - 1), *next_llrs, u[u_num - 1]); +++ return; ++ } ++- else{ ++- volk_8u_x2_encodeframepolar_8u_generic(u_target, u_temp, stage_size); +++ +++ const int max_stage_depth = calculate_max_stage_depth_for_row(frame_exp, row); +++ if (max_stage_depth < 3) { // vectorized version needs larger vectors. +++ volk_32f_8u_polarbutterfly_32f_generic(llrs, u, frame_exp, stage, u_num, row); +++ return; ++ } ++ ++- src_llr_ptr = llrs + (max_stage_depth + 1) * frame_size + row - stage_size; ++- dst_llr_ptr = llrs + max_stage_depth * frame_size + row; +++ int loop_stage = max_stage_depth; +++ int stage_size = 0x01 << loop_stage; ++ ++- __m128i fbits; +++ float* src_llr_ptr; +++ float* dst_llr_ptr; ++ ++- int p; ++- for(p = 0; p < stage_size; p += 8){ ++- _mm256_zeroupper(); ++- fbits = _mm_loadu_si128((__m128i*) u_target); ++- u_target += 8; +++ __m256 src0, src1, dst; ++ ++- src0 = _mm256_loadu_ps(src_llr_ptr); ++- src1 = _mm256_loadu_ps(src_llr_ptr + 8); ++- src_llr_ptr += 16; +++ if (row) { // not necessary for ZERO row. == first bit to be decoded. +++ // first do bit combination for all stages +++ // effectively encode some decoded bits again. +++ unsigned char* u_target = u + frame_size; +++ unsigned char* u_temp = u + 2 * frame_size; +++ memcpy(u_temp, u + u_num - stage_size, sizeof(unsigned char) * stage_size); ++ ++- dst = _mm256_polar_fsign_add_llrs_avx2(src0, src1, fbits); +++ if (stage_size > 15) { +++ _mm256_zeroupper(); +++ volk_8u_x2_encodeframepolar_8u_u_ssse3(u_target, u_temp, stage_size); +++ } else { +++ volk_8u_x2_encodeframepolar_8u_generic(u_target, u_temp, stage_size); +++ } ++ ++- _mm256_storeu_ps(dst_llr_ptr, dst); ++- dst_llr_ptr += 8; ++- } +++ src_llr_ptr = llrs + (max_stage_depth + 1) * frame_size + row - stage_size; +++ dst_llr_ptr = llrs + max_stage_depth * frame_size + row; ++ ++- --loop_stage; ++- stage_size >>= 1; ++- } +++ __m128i fbits; ++ ++- const int min_stage = stage > 2 ? stage : 2; +++ int p; +++ for (p = 0; p < stage_size; p += 8) { +++ _mm256_zeroupper(); +++ fbits = _mm_loadu_si128((__m128i*)u_target); +++ u_target += 8; ++ ++- _mm256_zeroall(); // Important to clear cache! 
+++ src0 = _mm256_loadu_ps(src_llr_ptr); +++ src1 = _mm256_loadu_ps(src_llr_ptr + 8); +++ src_llr_ptr += 16; ++ ++- int el; ++- while(min_stage < loop_stage){ ++- dst_llr_ptr = llrs + loop_stage * frame_size + row; ++- src_llr_ptr = dst_llr_ptr + frame_size; ++- for(el = 0; el < stage_size; el += 8){ ++- src0 = _mm256_loadu_ps(src_llr_ptr); ++- src_llr_ptr += 8; ++- src1 = _mm256_loadu_ps(src_llr_ptr); ++- src_llr_ptr += 8; +++ dst = _mm256_polar_fsign_add_llrs_avx2(src0, src1, fbits); ++ ++- dst = _mm256_polar_minsum_llrs(src0, src1); +++ _mm256_storeu_ps(dst_llr_ptr, dst); +++ dst_llr_ptr += 8; +++ } ++ ++- _mm256_storeu_ps(dst_llr_ptr, dst); ++- dst_llr_ptr += 8; +++ --loop_stage; +++ stage_size >>= 1; ++ } ++ ++- --loop_stage; ++- stage_size >>= 1; +++ const int min_stage = stage > 2 ? stage : 2; +++ +++ _mm256_zeroall(); // Important to clear cache! +++ +++ int el; +++ while (min_stage < loop_stage) { +++ dst_llr_ptr = llrs + loop_stage * frame_size + row; +++ src_llr_ptr = dst_llr_ptr + frame_size; +++ for (el = 0; el < stage_size; el += 8) { +++ src0 = _mm256_loadu_ps(src_llr_ptr); +++ src_llr_ptr += 8; +++ src1 = _mm256_loadu_ps(src_llr_ptr); +++ src_llr_ptr += 8; ++ ++- } +++ dst = _mm256_polar_minsum_llrs(src0, src1); +++ +++ _mm256_storeu_ps(dst_llr_ptr, dst); +++ dst_llr_ptr += 8; +++ } +++ +++ --loop_stage; +++ stage_size >>= 1; +++ } ++ ++- // for stages < 3 vectors are too small!. ++- llr_odd_stages(llrs, stage, loop_stage + 1,frame_size, row); +++ // for stages < 3 vectors are too small!. +++ llr_odd_stages(llrs, stage, loop_stage + 1, frame_size, row); ++ } ++ ++ #endif /* LV_HAVE_AVX2 */ ++diff --git a/kernels/volk/volk_32f_8u_polarbutterflypuppet_32f.h b/kernels/volk/volk_32f_8u_polarbutterflypuppet_32f.h ++index fa40a86..6f97dd1 100644 ++--- a/kernels/volk/volk_32f_8u_polarbutterflypuppet_32f.h +++++ b/kernels/volk/volk_32f_8u_polarbutterflypuppet_32f.h ++@@ -33,124 +33,129 @@ ++ #include ++ ++ ++-static inline void ++-sanitize_bytes(unsigned char* u, const int elements) +++static inline void sanitize_bytes(unsigned char* u, const int elements) ++ { ++- int i; ++- unsigned char* u_ptr = u; ++- for(i = 0; i < elements; i++){ ++- *u_ptr = (*u_ptr & 0x01); ++- u_ptr++; ++- } +++ int i; +++ unsigned char* u_ptr = u; +++ for (i = 0; i < elements; i++) { +++ *u_ptr = (*u_ptr & 0x01); +++ u_ptr++; +++ } ++ } ++ ++-static inline void ++-clean_up_intermediate_values(float* llrs, unsigned char* u, const int frame_size, const int elements) +++static inline void clean_up_intermediate_values(float* llrs, +++ unsigned char* u, +++ const int frame_size, +++ const int elements) ++ { ++- memset(u + frame_size, 0, sizeof(unsigned char) * (elements - frame_size)); ++- memset(llrs + frame_size, 0, sizeof(float) * (elements - frame_size)); +++ memset(u + frame_size, 0, sizeof(unsigned char) * (elements - frame_size)); +++ memset(llrs + frame_size, 0, sizeof(float) * (elements - frame_size)); ++ } ++ ++ static inline void ++ generate_error_free_input_vector(float* llrs, unsigned char* u, const int frame_size) ++ { ++- memset(u, 0, frame_size); ++- unsigned char* target = u + frame_size; ++- volk_8u_x2_encodeframepolar_8u_generic(target, u + 2 * frame_size, frame_size); ++- float* ft = llrs; ++- int i; ++- for(i = 0; i < frame_size; i++){ ++- *ft = (-2 * ((float) *target++)) + 1.0f; ++- ft++; ++- } +++ memset(u, 0, frame_size); +++ unsigned char* target = u + frame_size; +++ volk_8u_x2_encodeframepolar_8u_generic(target, u + 2 * frame_size, frame_size); +++ float* ft = llrs; +++ int i; +++ 
for (i = 0; i < frame_size; i++) { +++ *ft = (-2 * ((float)*target++)) + 1.0f; +++ ft++; +++ } ++ } ++ ++ static inline void ++ print_llr_tree(const float* llrs, const int frame_size, const int frame_exp) ++ { ++- int s, e; ++- for(s = 0; s < frame_size; s++){ ++- for(e = 0; e < frame_exp + 1; e++){ ++- printf("%+4.2f ", llrs[e * frame_size + s]); ++- } ++- printf("\n"); ++- if((s + 1) % 8 == 0){ ++- printf("\n"); +++ int s, e; +++ for (s = 0; s < frame_size; s++) { +++ for (e = 0; e < frame_exp + 1; e++) { +++ printf("%+4.2f ", llrs[e * frame_size + s]); +++ } +++ printf("\n"); +++ if ((s + 1) % 8 == 0) { +++ printf("\n"); +++ } ++ } ++- } ++ } ++ ++-static inline int ++-maximum_frame_size(const int elements) +++static inline int maximum_frame_size(const int elements) ++ { ++- unsigned int frame_size = next_lower_power_of_two(elements); ++- unsigned int frame_exp = log2_of_power_of_2(frame_size); ++- return next_lower_power_of_two(frame_size / frame_exp); +++ unsigned int frame_size = next_lower_power_of_two(elements); +++ unsigned int frame_exp = log2_of_power_of_2(frame_size); +++ return next_lower_power_of_two(frame_size / frame_exp); ++ } ++ ++ #ifdef LV_HAVE_GENERIC ++-static inline void ++-volk_32f_8u_polarbutterflypuppet_32f_generic(float* llrs, const float* input, unsigned char* u, const int elements) +++static inline void volk_32f_8u_polarbutterflypuppet_32f_generic(float* llrs, +++ const float* input, +++ unsigned char* u, +++ const int elements) ++ { ++- unsigned int frame_size = maximum_frame_size(elements); ++- unsigned int frame_exp = log2_of_power_of_2(frame_size); +++ unsigned int frame_size = maximum_frame_size(elements); +++ unsigned int frame_exp = log2_of_power_of_2(frame_size); ++ ++- sanitize_bytes(u, elements); ++- clean_up_intermediate_values(llrs, u, frame_size, elements); ++- generate_error_free_input_vector(llrs + frame_exp * frame_size, u, frame_size); +++ sanitize_bytes(u, elements); +++ clean_up_intermediate_values(llrs, u, frame_size, elements); +++ generate_error_free_input_vector(llrs + frame_exp * frame_size, u, frame_size); ++ ++- unsigned int u_num = 0; ++- for(; u_num < frame_size; u_num++){ ++- volk_32f_8u_polarbutterfly_32f_generic(llrs, u, frame_exp, 0, u_num, u_num); ++- u[u_num] = llrs[u_num] > 0 ? 0 : 1; ++- } +++ unsigned int u_num = 0; +++ for (; u_num < frame_size; u_num++) { +++ volk_32f_8u_polarbutterfly_32f_generic(llrs, u, frame_exp, 0, u_num, u_num); +++ u[u_num] = llrs[u_num] > 0 ? 
0 : 1; +++ } ++ ++- clean_up_intermediate_values(llrs, u, frame_size, elements); +++ clean_up_intermediate_values(llrs, u, frame_size, elements); ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++ #ifdef LV_HAVE_AVX ++-static inline void ++-volk_32f_8u_polarbutterflypuppet_32f_u_avx(float* llrs, const float* input, unsigned char* u, const int elements) +++static inline void volk_32f_8u_polarbutterflypuppet_32f_u_avx(float* llrs, +++ const float* input, +++ unsigned char* u, +++ const int elements) ++ { ++- unsigned int frame_size = maximum_frame_size(elements); ++- unsigned int frame_exp = log2_of_power_of_2(frame_size); +++ unsigned int frame_size = maximum_frame_size(elements); +++ unsigned int frame_exp = log2_of_power_of_2(frame_size); ++ ++- sanitize_bytes(u, elements); ++- clean_up_intermediate_values(llrs, u, frame_size, elements); ++- generate_error_free_input_vector(llrs + frame_exp * frame_size, u, frame_size); +++ sanitize_bytes(u, elements); +++ clean_up_intermediate_values(llrs, u, frame_size, elements); +++ generate_error_free_input_vector(llrs + frame_exp * frame_size, u, frame_size); ++ ++- unsigned int u_num = 0; ++- for(; u_num < frame_size; u_num++){ ++- volk_32f_8u_polarbutterfly_32f_u_avx(llrs, u, frame_exp, 0, u_num, u_num); ++- u[u_num] = llrs[u_num] > 0 ? 0 : 1; ++- } +++ unsigned int u_num = 0; +++ for (; u_num < frame_size; u_num++) { +++ volk_32f_8u_polarbutterfly_32f_u_avx(llrs, u, frame_exp, 0, u_num, u_num); +++ u[u_num] = llrs[u_num] > 0 ? 0 : 1; +++ } ++ ++- clean_up_intermediate_values(llrs, u, frame_size, elements); +++ clean_up_intermediate_values(llrs, u, frame_size, elements); ++ } ++ #endif /* LV_HAVE_AVX */ ++ ++ #ifdef LV_HAVE_AVX2 ++-static inline void ++-volk_32f_8u_polarbutterflypuppet_32f_u_avx2(float* llrs, const float* input, unsigned char* u, const int elements) +++static inline void volk_32f_8u_polarbutterflypuppet_32f_u_avx2(float* llrs, +++ const float* input, +++ unsigned char* u, +++ const int elements) ++ { ++- unsigned int frame_size = maximum_frame_size(elements); ++- unsigned int frame_exp = log2_of_power_of_2(frame_size); +++ unsigned int frame_size = maximum_frame_size(elements); +++ unsigned int frame_exp = log2_of_power_of_2(frame_size); ++ ++- sanitize_bytes(u, elements); ++- clean_up_intermediate_values(llrs, u, frame_size, elements); ++- generate_error_free_input_vector(llrs + frame_exp * frame_size, u, frame_size); +++ sanitize_bytes(u, elements); +++ clean_up_intermediate_values(llrs, u, frame_size, elements); +++ generate_error_free_input_vector(llrs + frame_exp * frame_size, u, frame_size); ++ ++- unsigned int u_num = 0; ++- for(; u_num < frame_size; u_num++){ ++- volk_32f_8u_polarbutterfly_32f_u_avx2(llrs, u, frame_exp, 0, u_num, u_num); ++- u[u_num] = llrs[u_num] > 0 ? 0 : 1; ++- } +++ unsigned int u_num = 0; +++ for (; u_num < frame_size; u_num++) { +++ volk_32f_8u_polarbutterfly_32f_u_avx2(llrs, u, frame_exp, 0, u_num, u_num); +++ u[u_num] = llrs[u_num] > 0 ? 
0 : 1; +++ } ++ ++- clean_up_intermediate_values(llrs, u, frame_size, elements); +++ clean_up_intermediate_values(llrs, u, frame_size, elements); ++ } ++ #endif /* LV_HAVE_AVX2 */ ++ ++ ++- ++ #endif /* VOLK_KERNELS_VOLK_VOLK_32F_8U_POLARBUTTERFLYPUPPET_32F_H_ */ ++diff --git a/kernels/volk/volk_32f_accumulator_s32f.h b/kernels/volk/volk_32f_accumulator_s32f.h ++index f6219c8..9a78f58 100644 ++--- a/kernels/volk/volk_32f_accumulator_s32f.h +++++ b/kernels/volk/volk_32f_accumulator_s32f.h ++@@ -29,8 +29,8 @@ ++ * ++ * Dispatcher Prototype ++ * \code ++- * void volk_32f_accumulator_s32f(float* result, const float* inputBuffer, unsigned int num_points) ++- * \endcode +++ * void volk_32f_accumulator_s32f(float* result, const float* inputBuffer, unsigned int +++ * num_points) \endcode ++ * ++ * \b Inputs ++ * \li inputBuffer The buffer of data to be accumulated ++@@ -63,47 +63,48 @@ ++ #ifndef INCLUDED_volk_32f_accumulator_s32f_a_H ++ #define INCLUDED_volk_32f_accumulator_s32f_a_H ++ ++-#include ++ #include +++#include ++ ++ #ifdef LV_HAVE_AVX ++ #include ++ ++-static inline void ++-volk_32f_accumulator_s32f_a_avx(float* result, const float* inputBuffer, unsigned int num_points) +++static inline void volk_32f_accumulator_s32f_a_avx(float* result, +++ const float* inputBuffer, +++ unsigned int num_points) ++ { ++- float returnValue = 0; ++- unsigned int number = 0; ++- const unsigned int eighthPoints = num_points / 8; ++- ++- const float* aPtr = inputBuffer; ++- __VOLK_ATTR_ALIGNED(32) float tempBuffer[8]; ++- ++- __m256 accumulator = _mm256_setzero_ps(); ++- __m256 aVal = _mm256_setzero_ps(); ++- ++- for(;number < eighthPoints; number++){ ++- aVal = _mm256_load_ps(aPtr); ++- accumulator = _mm256_add_ps(accumulator, aVal); ++- aPtr += 8; ++- } ++- ++- _mm256_store_ps(tempBuffer, accumulator); ++- ++- returnValue = tempBuffer[0]; ++- returnValue += tempBuffer[1]; ++- returnValue += tempBuffer[2]; ++- returnValue += tempBuffer[3]; ++- returnValue += tempBuffer[4]; ++- returnValue += tempBuffer[5]; ++- returnValue += tempBuffer[6]; ++- returnValue += tempBuffer[7]; ++- ++- number = eighthPoints * 8; ++- for(;number < num_points; number++){ ++- returnValue += (*aPtr++); ++- } ++- *result = returnValue; +++ float returnValue = 0; +++ unsigned int number = 0; +++ const unsigned int eighthPoints = num_points / 8; +++ +++ const float* aPtr = inputBuffer; +++ __VOLK_ATTR_ALIGNED(32) float tempBuffer[8]; +++ +++ __m256 accumulator = _mm256_setzero_ps(); +++ __m256 aVal = _mm256_setzero_ps(); +++ +++ for (; number < eighthPoints; number++) { +++ aVal = _mm256_load_ps(aPtr); +++ accumulator = _mm256_add_ps(accumulator, aVal); +++ aPtr += 8; +++ } +++ +++ _mm256_store_ps(tempBuffer, accumulator); +++ +++ returnValue = tempBuffer[0]; +++ returnValue += tempBuffer[1]; +++ returnValue += tempBuffer[2]; +++ returnValue += tempBuffer[3]; +++ returnValue += tempBuffer[4]; +++ returnValue += tempBuffer[5]; +++ returnValue += tempBuffer[6]; +++ returnValue += tempBuffer[7]; +++ +++ number = eighthPoints * 8; +++ for (; number < num_points; number++) { +++ returnValue += (*aPtr++); +++ } +++ *result = returnValue; ++ } ++ #endif /* LV_HAVE_AVX */ ++ ++@@ -111,41 +112,42 @@ volk_32f_accumulator_s32f_a_avx(float* result, const float* inputBuffer, unsigne ++ #ifdef LV_HAVE_AVX ++ #include ++ ++-static inline void ++-volk_32f_accumulator_s32f_u_avx(float* result, const float* inputBuffer, unsigned int num_points) +++static inline void volk_32f_accumulator_s32f_u_avx(float* result, +++ const float* inputBuffer, +++ 
unsigned int num_points) ++ { ++- float returnValue = 0; ++- unsigned int number = 0; ++- const unsigned int eighthPoints = num_points / 8; ++- ++- const float* aPtr = inputBuffer; ++- __VOLK_ATTR_ALIGNED(32) float tempBuffer[8]; ++- ++- __m256 accumulator = _mm256_setzero_ps(); ++- __m256 aVal = _mm256_setzero_ps(); ++- ++- for(;number < eighthPoints; number++){ ++- aVal = _mm256_loadu_ps(aPtr); ++- accumulator = _mm256_add_ps(accumulator, aVal); ++- aPtr += 8; ++- } ++- ++- _mm256_store_ps(tempBuffer, accumulator); ++- ++- returnValue = tempBuffer[0]; ++- returnValue += tempBuffer[1]; ++- returnValue += tempBuffer[2]; ++- returnValue += tempBuffer[3]; ++- returnValue += tempBuffer[4]; ++- returnValue += tempBuffer[5]; ++- returnValue += tempBuffer[6]; ++- returnValue += tempBuffer[7]; ++- ++- number = eighthPoints * 8; ++- for(;number < num_points; number++){ ++- returnValue += (*aPtr++); ++- } ++- *result = returnValue; +++ float returnValue = 0; +++ unsigned int number = 0; +++ const unsigned int eighthPoints = num_points / 8; +++ +++ const float* aPtr = inputBuffer; +++ __VOLK_ATTR_ALIGNED(32) float tempBuffer[8]; +++ +++ __m256 accumulator = _mm256_setzero_ps(); +++ __m256 aVal = _mm256_setzero_ps(); +++ +++ for (; number < eighthPoints; number++) { +++ aVal = _mm256_loadu_ps(aPtr); +++ accumulator = _mm256_add_ps(accumulator, aVal); +++ aPtr += 8; +++ } +++ +++ _mm256_store_ps(tempBuffer, accumulator); +++ +++ returnValue = tempBuffer[0]; +++ returnValue += tempBuffer[1]; +++ returnValue += tempBuffer[2]; +++ returnValue += tempBuffer[3]; +++ returnValue += tempBuffer[4]; +++ returnValue += tempBuffer[5]; +++ returnValue += tempBuffer[6]; +++ returnValue += tempBuffer[7]; +++ +++ number = eighthPoints * 8; +++ for (; number < num_points; number++) { +++ returnValue += (*aPtr++); +++ } +++ *result = returnValue; ++ } ++ #endif /* LV_HAVE_AVX */ ++ ++@@ -153,37 +155,38 @@ volk_32f_accumulator_s32f_u_avx(float* result, const float* inputBuffer, unsigne ++ #ifdef LV_HAVE_SSE ++ #include ++ ++-static inline void ++-volk_32f_accumulator_s32f_a_sse(float* result, const float* inputBuffer, unsigned int num_points) +++static inline void volk_32f_accumulator_s32f_a_sse(float* result, +++ const float* inputBuffer, +++ unsigned int num_points) ++ { ++- float returnValue = 0; ++- unsigned int number = 0; ++- const unsigned int quarterPoints = num_points / 4; ++- ++- const float* aPtr = inputBuffer; ++- __VOLK_ATTR_ALIGNED(16) float tempBuffer[4]; ++- ++- __m128 accumulator = _mm_setzero_ps(); ++- __m128 aVal = _mm_setzero_ps(); ++- ++- for(;number < quarterPoints; number++){ ++- aVal = _mm_load_ps(aPtr); ++- accumulator = _mm_add_ps(accumulator, aVal); ++- aPtr += 4; ++- } ++- ++- _mm_store_ps(tempBuffer,accumulator); ++- ++- returnValue = tempBuffer[0]; ++- returnValue += tempBuffer[1]; ++- returnValue += tempBuffer[2]; ++- returnValue += tempBuffer[3]; ++- ++- number = quarterPoints * 4; ++- for(;number < num_points; number++){ ++- returnValue += (*aPtr++); ++- } ++- *result = returnValue; +++ float returnValue = 0; +++ unsigned int number = 0; +++ const unsigned int quarterPoints = num_points / 4; +++ +++ const float* aPtr = inputBuffer; +++ __VOLK_ATTR_ALIGNED(16) float tempBuffer[4]; +++ +++ __m128 accumulator = _mm_setzero_ps(); +++ __m128 aVal = _mm_setzero_ps(); +++ +++ for (; number < quarterPoints; number++) { +++ aVal = _mm_load_ps(aPtr); +++ accumulator = _mm_add_ps(accumulator, aVal); +++ aPtr += 4; +++ } +++ +++ _mm_store_ps(tempBuffer, accumulator); +++ +++ returnValue = 
tempBuffer[0]; +++ returnValue += tempBuffer[1]; +++ returnValue += tempBuffer[2]; +++ returnValue += tempBuffer[3]; +++ +++ number = quarterPoints * 4; +++ for (; number < num_points; number++) { +++ returnValue += (*aPtr++); +++ } +++ *result = returnValue; ++ } ++ #endif /* LV_HAVE_SSE */ ++ ++@@ -191,52 +194,54 @@ volk_32f_accumulator_s32f_a_sse(float* result, const float* inputBuffer, unsigne ++ #ifdef LV_HAVE_SSE ++ #include ++ ++-static inline void ++-volk_32f_accumulator_s32f_u_sse(float* result, const float* inputBuffer, unsigned int num_points) +++static inline void volk_32f_accumulator_s32f_u_sse(float* result, +++ const float* inputBuffer, +++ unsigned int num_points) ++ { ++- float returnValue = 0; ++- unsigned int number = 0; ++- const unsigned int quarterPoints = num_points / 4; ++- ++- const float* aPtr = inputBuffer; ++- __VOLK_ATTR_ALIGNED(16) float tempBuffer[4]; ++- ++- __m128 accumulator = _mm_setzero_ps(); ++- __m128 aVal = _mm_setzero_ps(); ++- ++- for(;number < quarterPoints; number++){ ++- aVal = _mm_load_ps(aPtr); ++- accumulator = _mm_add_ps(accumulator, aVal); ++- aPtr += 4; ++- } ++- ++- _mm_store_ps(tempBuffer,accumulator); ++- ++- returnValue = tempBuffer[0]; ++- returnValue += tempBuffer[1]; ++- returnValue += tempBuffer[2]; ++- returnValue += tempBuffer[3]; ++- ++- number = quarterPoints * 4; ++- for(;number < num_points; number++){ ++- returnValue += (*aPtr++); ++- } ++- *result = returnValue; +++ float returnValue = 0; +++ unsigned int number = 0; +++ const unsigned int quarterPoints = num_points / 4; +++ +++ const float* aPtr = inputBuffer; +++ __VOLK_ATTR_ALIGNED(16) float tempBuffer[4]; +++ +++ __m128 accumulator = _mm_setzero_ps(); +++ __m128 aVal = _mm_setzero_ps(); +++ +++ for (; number < quarterPoints; number++) { +++ aVal = _mm_load_ps(aPtr); +++ accumulator = _mm_add_ps(accumulator, aVal); +++ aPtr += 4; +++ } +++ +++ _mm_store_ps(tempBuffer, accumulator); +++ +++ returnValue = tempBuffer[0]; +++ returnValue += tempBuffer[1]; +++ returnValue += tempBuffer[2]; +++ returnValue += tempBuffer[3]; +++ +++ number = quarterPoints * 4; +++ for (; number < num_points; number++) { +++ returnValue += (*aPtr++); +++ } +++ *result = returnValue; ++ } ++ #endif /* LV_HAVE_SSE */ ++ ++ #ifdef LV_HAVE_GENERIC ++-static inline void ++-volk_32f_accumulator_s32f_generic(float* result, const float* inputBuffer, unsigned int num_points) +++static inline void volk_32f_accumulator_s32f_generic(float* result, +++ const float* inputBuffer, +++ unsigned int num_points) ++ { ++- const float* aPtr = inputBuffer; ++- unsigned int number = 0; ++- float returnValue = 0; ++- ++- for(;number < num_points; number++){ ++- returnValue += (*aPtr++); ++- } ++- *result = returnValue; +++ const float* aPtr = inputBuffer; +++ unsigned int number = 0; +++ float returnValue = 0; +++ +++ for (; number < num_points; number++) { +++ returnValue += (*aPtr++); +++ } +++ *result = returnValue; ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++diff --git a/kernels/volk/volk_32f_acos_32f.h b/kernels/volk/volk_32f_acos_32f.h ++index 5c14c2f..92918ca 100644 ++--- a/kernels/volk/volk_32f_acos_32f.h +++++ b/kernels/volk/volk_32f_acos_32f.h ++@@ -67,11 +67,12 @@ ++ * \endcode ++ */ ++ ++-#include ++-#include ++ #include +++#include +++#include ++ ++-/* This is the number of terms of Taylor series to evaluate, increase this for more accuracy*/ +++/* This is the number of terms of Taylor series to evaluate, increase this for more +++ * accuracy*/ ++ #define ACOS_TERMS 2 ++ ++ #ifndef 
INCLUDED_volk_32f_acos_32f_a_H ++@@ -80,62 +81,68 @@ ++ #if LV_HAVE_AVX2 && LV_HAVE_FMA ++ #include ++ ++-static inline void ++-volk_32f_acos_32f_a_avx2_fma(float* bVector, const float* aVector, unsigned int num_points) +++static inline void volk_32f_acos_32f_a_avx2_fma(float* bVector, +++ const float* aVector, +++ unsigned int num_points) ++ { ++- float* bPtr = bVector; ++- const float* aPtr = aVector; ++- ++- unsigned int number = 0; ++- unsigned int eighthPoints = num_points / 8; ++- int i, j; ++- ++- __m256 aVal, d, pi, pio2, x, y, z, arccosine; ++- __m256 fzeroes, fones, ftwos, ffours, condition; ++- ++- pi = _mm256_set1_ps(3.14159265358979323846); ++- pio2 = _mm256_set1_ps(3.14159265358979323846/2); ++- fzeroes = _mm256_setzero_ps(); ++- fones = _mm256_set1_ps(1.0); ++- ftwos = _mm256_set1_ps(2.0); ++- ffours = _mm256_set1_ps(4.0); ++- ++- for(;number < eighthPoints; number++){ ++- aVal = _mm256_load_ps(aPtr); ++- d = aVal; ++- aVal = _mm256_div_ps(_mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal), _mm256_sub_ps(fones, aVal))), aVal); ++- z = aVal; ++- condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS); ++- z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition)); ++- condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS); ++- x = _mm256_add_ps(z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition)); ++- ++- for(i = 0; i < 2; i++) ++- x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_fmadd_ps(x, x,fones))); ++- x = _mm256_div_ps(fones, x); ++- y = fzeroes; ++- for(j = ACOS_TERMS - 1; j >=0 ; j--) ++- y = _mm256_fmadd_ps(y, _mm256_mul_ps(x, x), _mm256_set1_ps(pow(-1,j)/(2*j+1))); ++- ++- y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours)); ++- condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS); ++- ++- y = _mm256_add_ps(y, _mm256_and_ps(_mm256_fnmadd_ps(y, ftwos, pio2), condition)); ++- arccosine = y; ++- condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS); ++- arccosine = _mm256_sub_ps(arccosine, _mm256_and_ps(_mm256_mul_ps(arccosine, ftwos), condition)); ++- condition = _mm256_cmp_ps(d, fzeroes, _CMP_LT_OS); ++- arccosine = _mm256_add_ps(arccosine, _mm256_and_ps(pi, condition)); ++- ++- _mm256_store_ps(bPtr, arccosine); ++- aPtr += 8; ++- bPtr += 8; ++- } ++- ++- number = eighthPoints * 8; ++- for(;number < num_points; number++){ ++- *bPtr++ = acos(*aPtr++); ++- } +++ float* bPtr = bVector; +++ const float* aPtr = aVector; +++ +++ unsigned int number = 0; +++ unsigned int eighthPoints = num_points / 8; +++ int i, j; +++ +++ __m256 aVal, d, pi, pio2, x, y, z, arccosine; +++ __m256 fzeroes, fones, ftwos, ffours, condition; +++ +++ pi = _mm256_set1_ps(3.14159265358979323846); +++ pio2 = _mm256_set1_ps(3.14159265358979323846 / 2); +++ fzeroes = _mm256_setzero_ps(); +++ fones = _mm256_set1_ps(1.0); +++ ftwos = _mm256_set1_ps(2.0); +++ ffours = _mm256_set1_ps(4.0); +++ +++ for (; number < eighthPoints; number++) { +++ aVal = _mm256_load_ps(aPtr); +++ d = aVal; +++ aVal = _mm256_div_ps(_mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal), +++ _mm256_sub_ps(fones, aVal))), +++ aVal); +++ z = aVal; +++ condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS); +++ z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition)); +++ condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS); +++ x = _mm256_add_ps( +++ z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition)); +++ +++ for (i = 0; i < 2; i++) +++ x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_fmadd_ps(x, x, fones))); +++ x = _mm256_div_ps(fones, x); +++ y = fzeroes; +++ for (j = ACOS_TERMS - 1; j >= 0; j--) +++ 
y = _mm256_fmadd_ps( +++ y, _mm256_mul_ps(x, x), _mm256_set1_ps(pow(-1, j) / (2 * j + 1))); +++ +++ y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours)); +++ condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS); +++ +++ y = _mm256_add_ps(y, _mm256_and_ps(_mm256_fnmadd_ps(y, ftwos, pio2), condition)); +++ arccosine = y; +++ condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS); +++ arccosine = _mm256_sub_ps( +++ arccosine, _mm256_and_ps(_mm256_mul_ps(arccosine, ftwos), condition)); +++ condition = _mm256_cmp_ps(d, fzeroes, _CMP_LT_OS); +++ arccosine = _mm256_add_ps(arccosine, _mm256_and_ps(pi, condition)); +++ +++ _mm256_store_ps(bPtr, arccosine); +++ aPtr += 8; +++ bPtr += 8; +++ } +++ +++ number = eighthPoints * 8; +++ for (; number < num_points; number++) { +++ *bPtr++ = acos(*aPtr++); +++ } ++ } ++ ++ #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for aligned */ ++@@ -147,59 +154,66 @@ volk_32f_acos_32f_a_avx2_fma(float* bVector, const float* aVector, unsigned int ++ static inline void ++ volk_32f_acos_32f_a_avx(float* bVector, const float* aVector, unsigned int num_points) ++ { ++- float* bPtr = bVector; ++- const float* aPtr = aVector; ++- ++- unsigned int number = 0; ++- unsigned int eighthPoints = num_points / 8; ++- int i, j; ++- ++- __m256 aVal, d, pi, pio2, x, y, z, arccosine; ++- __m256 fzeroes, fones, ftwos, ffours, condition; ++- ++- pi = _mm256_set1_ps(3.14159265358979323846); ++- pio2 = _mm256_set1_ps(3.14159265358979323846/2); ++- fzeroes = _mm256_setzero_ps(); ++- fones = _mm256_set1_ps(1.0); ++- ftwos = _mm256_set1_ps(2.0); ++- ffours = _mm256_set1_ps(4.0); ++- ++- for(;number < eighthPoints; number++){ ++- aVal = _mm256_load_ps(aPtr); ++- d = aVal; ++- aVal = _mm256_div_ps(_mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal), _mm256_sub_ps(fones, aVal))), aVal); ++- z = aVal; ++- condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS); ++- z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition)); ++- condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS); ++- x = _mm256_add_ps(z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition)); ++- ++- for(i = 0; i < 2; i++) ++- x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_add_ps(fones, _mm256_mul_ps(x, x)))); ++- x = _mm256_div_ps(fones, x); ++- y = fzeroes; ++- for(j = ACOS_TERMS - 1; j >=0 ; j--) ++- y = _mm256_add_ps(_mm256_mul_ps(y, _mm256_mul_ps(x, x)), _mm256_set1_ps(pow(-1,j)/(2*j+1))); ++- ++- y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours)); ++- condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS); ++- ++- y = _mm256_add_ps(y, _mm256_and_ps(_mm256_sub_ps(pio2, _mm256_mul_ps(y, ftwos)), condition)); ++- arccosine = y; ++- condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS); ++- arccosine = _mm256_sub_ps(arccosine, _mm256_and_ps(_mm256_mul_ps(arccosine, ftwos), condition)); ++- condition = _mm256_cmp_ps(d, fzeroes, _CMP_LT_OS); ++- arccosine = _mm256_add_ps(arccosine, _mm256_and_ps(pi, condition)); ++- ++- _mm256_store_ps(bPtr, arccosine); ++- aPtr += 8; ++- bPtr += 8; ++- } ++- ++- number = eighthPoints * 8; ++- for(;number < num_points; number++){ ++- *bPtr++ = acos(*aPtr++); ++- } +++ float* bPtr = bVector; +++ const float* aPtr = aVector; +++ +++ unsigned int number = 0; +++ unsigned int eighthPoints = num_points / 8; +++ int i, j; +++ +++ __m256 aVal, d, pi, pio2, x, y, z, arccosine; +++ __m256 fzeroes, fones, ftwos, ffours, condition; +++ +++ pi = _mm256_set1_ps(3.14159265358979323846); +++ pio2 = _mm256_set1_ps(3.14159265358979323846 / 2); +++ fzeroes = _mm256_setzero_ps(); +++ fones = _mm256_set1_ps(1.0); +++ ftwos = 
_mm256_set1_ps(2.0); +++ ffours = _mm256_set1_ps(4.0); +++ +++ for (; number < eighthPoints; number++) { +++ aVal = _mm256_load_ps(aPtr); +++ d = aVal; +++ aVal = _mm256_div_ps(_mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal), +++ _mm256_sub_ps(fones, aVal))), +++ aVal); +++ z = aVal; +++ condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS); +++ z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition)); +++ condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS); +++ x = _mm256_add_ps( +++ z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition)); +++ +++ for (i = 0; i < 2; i++) +++ x = _mm256_add_ps(x, +++ _mm256_sqrt_ps(_mm256_add_ps(fones, _mm256_mul_ps(x, x)))); +++ x = _mm256_div_ps(fones, x); +++ y = fzeroes; +++ for (j = ACOS_TERMS - 1; j >= 0; j--) +++ y = _mm256_add_ps(_mm256_mul_ps(y, _mm256_mul_ps(x, x)), +++ _mm256_set1_ps(pow(-1, j) / (2 * j + 1))); +++ +++ y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours)); +++ condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS); +++ +++ y = _mm256_add_ps( +++ y, _mm256_and_ps(_mm256_sub_ps(pio2, _mm256_mul_ps(y, ftwos)), condition)); +++ arccosine = y; +++ condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS); +++ arccosine = _mm256_sub_ps( +++ arccosine, _mm256_and_ps(_mm256_mul_ps(arccosine, ftwos), condition)); +++ condition = _mm256_cmp_ps(d, fzeroes, _CMP_LT_OS); +++ arccosine = _mm256_add_ps(arccosine, _mm256_and_ps(pi, condition)); +++ +++ _mm256_store_ps(bPtr, arccosine); +++ aPtr += 8; +++ bPtr += 8; +++ } +++ +++ number = eighthPoints * 8; +++ for (; number < num_points; number++) { +++ *bPtr++ = acos(*aPtr++); +++ } ++ } ++ ++ #endif /* LV_HAVE_AVX2 for aligned */ ++@@ -210,59 +224,63 @@ volk_32f_acos_32f_a_avx(float* bVector, const float* aVector, unsigned int num_p ++ static inline void ++ volk_32f_acos_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int num_points) ++ { ++- float* bPtr = bVector; ++- const float* aPtr = aVector; ++- ++- unsigned int number = 0; ++- unsigned int quarterPoints = num_points / 4; ++- int i, j; ++- ++- __m128 aVal, d, pi, pio2, x, y, z, arccosine; ++- __m128 fzeroes, fones, ftwos, ffours, condition; ++- ++- pi = _mm_set1_ps(3.14159265358979323846); ++- pio2 = _mm_set1_ps(3.14159265358979323846/2); ++- fzeroes = _mm_setzero_ps(); ++- fones = _mm_set1_ps(1.0); ++- ftwos = _mm_set1_ps(2.0); ++- ffours = _mm_set1_ps(4.0); ++- ++- for(;number < quarterPoints; number++){ ++- aVal = _mm_load_ps(aPtr); ++- d = aVal; ++- aVal = _mm_div_ps(_mm_sqrt_ps(_mm_mul_ps(_mm_add_ps(fones, aVal), _mm_sub_ps(fones, aVal))), aVal); ++- z = aVal; ++- condition = _mm_cmplt_ps(z, fzeroes); ++- z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition)); ++- condition = _mm_cmplt_ps(z, fones); ++- x = _mm_add_ps(z, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z), condition)); ++- ++- for(i = 0; i < 2; i++) ++- x = _mm_add_ps(x, _mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x)))); ++- x = _mm_div_ps(fones, x); ++- y = fzeroes; ++- for(j = ACOS_TERMS - 1; j >=0 ; j--) ++- y = _mm_add_ps(_mm_mul_ps(y, _mm_mul_ps(x, x)), _mm_set1_ps(pow(-1,j)/(2*j+1))); ++- ++- y = _mm_mul_ps(y, _mm_mul_ps(x, ffours)); ++- condition = _mm_cmpgt_ps(z, fones); ++- ++- y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, ftwos)), condition)); ++- arccosine = y; ++- condition = _mm_cmplt_ps(aVal, fzeroes); ++- arccosine = _mm_sub_ps(arccosine, _mm_and_ps(_mm_mul_ps(arccosine, ftwos), condition)); ++- condition = _mm_cmplt_ps(d, fzeroes); ++- arccosine = _mm_add_ps(arccosine, _mm_and_ps(pi, condition)); ++- ++- 
_mm_store_ps(bPtr, arccosine); ++- aPtr += 4; ++- bPtr += 4; ++- } ++- ++- number = quarterPoints * 4; ++- for(;number < num_points; number++){ ++- *bPtr++ = acosf(*aPtr++); ++- } +++ float* bPtr = bVector; +++ const float* aPtr = aVector; +++ +++ unsigned int number = 0; +++ unsigned int quarterPoints = num_points / 4; +++ int i, j; +++ +++ __m128 aVal, d, pi, pio2, x, y, z, arccosine; +++ __m128 fzeroes, fones, ftwos, ffours, condition; +++ +++ pi = _mm_set1_ps(3.14159265358979323846); +++ pio2 = _mm_set1_ps(3.14159265358979323846 / 2); +++ fzeroes = _mm_setzero_ps(); +++ fones = _mm_set1_ps(1.0); +++ ftwos = _mm_set1_ps(2.0); +++ ffours = _mm_set1_ps(4.0); +++ +++ for (; number < quarterPoints; number++) { +++ aVal = _mm_load_ps(aPtr); +++ d = aVal; +++ aVal = _mm_div_ps( +++ _mm_sqrt_ps(_mm_mul_ps(_mm_add_ps(fones, aVal), _mm_sub_ps(fones, aVal))), +++ aVal); +++ z = aVal; +++ condition = _mm_cmplt_ps(z, fzeroes); +++ z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition)); +++ condition = _mm_cmplt_ps(z, fones); +++ x = _mm_add_ps(z, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z), condition)); +++ +++ for (i = 0; i < 2; i++) +++ x = _mm_add_ps(x, _mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x)))); +++ x = _mm_div_ps(fones, x); +++ y = fzeroes; +++ for (j = ACOS_TERMS - 1; j >= 0; j--) +++ y = _mm_add_ps(_mm_mul_ps(y, _mm_mul_ps(x, x)), +++ _mm_set1_ps(pow(-1, j) / (2 * j + 1))); +++ +++ y = _mm_mul_ps(y, _mm_mul_ps(x, ffours)); +++ condition = _mm_cmpgt_ps(z, fones); +++ +++ y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, ftwos)), condition)); +++ arccosine = y; +++ condition = _mm_cmplt_ps(aVal, fzeroes); +++ arccosine = +++ _mm_sub_ps(arccosine, _mm_and_ps(_mm_mul_ps(arccosine, ftwos), condition)); +++ condition = _mm_cmplt_ps(d, fzeroes); +++ arccosine = _mm_add_ps(arccosine, _mm_and_ps(pi, condition)); +++ +++ _mm_store_ps(bPtr, arccosine); +++ aPtr += 4; +++ bPtr += 4; +++ } +++ +++ number = quarterPoints * 4; +++ for (; number < num_points; number++) { +++ *bPtr++ = acosf(*aPtr++); +++ } ++ } ++ ++ #endif /* LV_HAVE_SSE4_1 for aligned */ ++@@ -276,62 +294,68 @@ volk_32f_acos_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int nu ++ #if LV_HAVE_AVX2 && LV_HAVE_FMA ++ #include ++ ++-static inline void ++-volk_32f_acos_32f_u_avx2_fma(float* bVector, const float* aVector, unsigned int num_points) +++static inline void volk_32f_acos_32f_u_avx2_fma(float* bVector, +++ const float* aVector, +++ unsigned int num_points) ++ { ++- float* bPtr = bVector; ++- const float* aPtr = aVector; ++- ++- unsigned int number = 0; ++- unsigned int eighthPoints = num_points / 8; ++- int i, j; ++- ++- __m256 aVal, d, pi, pio2, x, y, z, arccosine; ++- __m256 fzeroes, fones, ftwos, ffours, condition; ++- ++- pi = _mm256_set1_ps(3.14159265358979323846); ++- pio2 = _mm256_set1_ps(3.14159265358979323846/2); ++- fzeroes = _mm256_setzero_ps(); ++- fones = _mm256_set1_ps(1.0); ++- ftwos = _mm256_set1_ps(2.0); ++- ffours = _mm256_set1_ps(4.0); ++- ++- for(;number < eighthPoints; number++){ ++- aVal = _mm256_loadu_ps(aPtr); ++- d = aVal; ++- aVal = _mm256_div_ps(_mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal), _mm256_sub_ps(fones, aVal))), aVal); ++- z = aVal; ++- condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS); ++- z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition)); ++- condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS); ++- x = _mm256_add_ps(z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition)); ++- ++- for(i = 0; i < 2; i++) ++- x = 
_mm256_add_ps(x, _mm256_sqrt_ps(_mm256_fmadd_ps(x, x,fones))); ++- x = _mm256_div_ps(fones, x); ++- y = fzeroes; ++- for(j = ACOS_TERMS - 1; j >=0 ; j--) ++- y = _mm256_fmadd_ps(y, _mm256_mul_ps(x, x), _mm256_set1_ps(pow(-1,j)/(2*j+1))); ++- ++- y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours)); ++- condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS); ++- ++- y = _mm256_add_ps(y, _mm256_and_ps(_mm256_fnmadd_ps(y, ftwos, pio2), condition)); ++- arccosine = y; ++- condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS); ++- arccosine = _mm256_sub_ps(arccosine, _mm256_and_ps(_mm256_mul_ps(arccosine, ftwos), condition)); ++- condition = _mm256_cmp_ps(d, fzeroes, _CMP_LT_OS); ++- arccosine = _mm256_add_ps(arccosine, _mm256_and_ps(pi, condition)); ++- ++- _mm256_storeu_ps(bPtr, arccosine); ++- aPtr += 8; ++- bPtr += 8; ++- } ++- ++- number = eighthPoints * 8; ++- for(;number < num_points; number++){ ++- *bPtr++ = acos(*aPtr++); ++- } +++ float* bPtr = bVector; +++ const float* aPtr = aVector; +++ +++ unsigned int number = 0; +++ unsigned int eighthPoints = num_points / 8; +++ int i, j; +++ +++ __m256 aVal, d, pi, pio2, x, y, z, arccosine; +++ __m256 fzeroes, fones, ftwos, ffours, condition; +++ +++ pi = _mm256_set1_ps(3.14159265358979323846); +++ pio2 = _mm256_set1_ps(3.14159265358979323846 / 2); +++ fzeroes = _mm256_setzero_ps(); +++ fones = _mm256_set1_ps(1.0); +++ ftwos = _mm256_set1_ps(2.0); +++ ffours = _mm256_set1_ps(4.0); +++ +++ for (; number < eighthPoints; number++) { +++ aVal = _mm256_loadu_ps(aPtr); +++ d = aVal; +++ aVal = _mm256_div_ps(_mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal), +++ _mm256_sub_ps(fones, aVal))), +++ aVal); +++ z = aVal; +++ condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS); +++ z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition)); +++ condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS); +++ x = _mm256_add_ps( +++ z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition)); +++ +++ for (i = 0; i < 2; i++) +++ x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_fmadd_ps(x, x, fones))); +++ x = _mm256_div_ps(fones, x); +++ y = fzeroes; +++ for (j = ACOS_TERMS - 1; j >= 0; j--) +++ y = _mm256_fmadd_ps( +++ y, _mm256_mul_ps(x, x), _mm256_set1_ps(pow(-1, j) / (2 * j + 1))); +++ +++ y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours)); +++ condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS); +++ +++ y = _mm256_add_ps(y, _mm256_and_ps(_mm256_fnmadd_ps(y, ftwos, pio2), condition)); +++ arccosine = y; +++ condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS); +++ arccosine = _mm256_sub_ps( +++ arccosine, _mm256_and_ps(_mm256_mul_ps(arccosine, ftwos), condition)); +++ condition = _mm256_cmp_ps(d, fzeroes, _CMP_LT_OS); +++ arccosine = _mm256_add_ps(arccosine, _mm256_and_ps(pi, condition)); +++ +++ _mm256_storeu_ps(bPtr, arccosine); +++ aPtr += 8; +++ bPtr += 8; +++ } +++ +++ number = eighthPoints * 8; +++ for (; number < num_points; number++) { +++ *bPtr++ = acos(*aPtr++); +++ } ++ } ++ ++ #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for unaligned */ ++@@ -343,59 +367,66 @@ volk_32f_acos_32f_u_avx2_fma(float* bVector, const float* aVector, unsigned int ++ static inline void ++ volk_32f_acos_32f_u_avx(float* bVector, const float* aVector, unsigned int num_points) ++ { ++- float* bPtr = bVector; ++- const float* aPtr = aVector; ++- ++- unsigned int number = 0; ++- unsigned int eighthPoints = num_points / 8; ++- int i, j; ++- ++- __m256 aVal, d, pi, pio2, x, y, z, arccosine; ++- __m256 fzeroes, fones, ftwos, ffours, condition; ++- ++- pi = 
_mm256_set1_ps(3.14159265358979323846); ++- pio2 = _mm256_set1_ps(3.14159265358979323846/2); ++- fzeroes = _mm256_setzero_ps(); ++- fones = _mm256_set1_ps(1.0); ++- ftwos = _mm256_set1_ps(2.0); ++- ffours = _mm256_set1_ps(4.0); ++- ++- for(;number < eighthPoints; number++){ ++- aVal = _mm256_loadu_ps(aPtr); ++- d = aVal; ++- aVal = _mm256_div_ps(_mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal), _mm256_sub_ps(fones, aVal))), aVal); ++- z = aVal; ++- condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS); ++- z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition)); ++- condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS); ++- x = _mm256_add_ps(z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition)); ++- ++- for(i = 0; i < 2; i++) ++- x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_add_ps(fones, _mm256_mul_ps(x, x)))); ++- x = _mm256_div_ps(fones, x); ++- y = fzeroes; ++- for(j = ACOS_TERMS - 1; j >=0 ; j--) ++- y = _mm256_add_ps(_mm256_mul_ps(y, _mm256_mul_ps(x, x)), _mm256_set1_ps(pow(-1,j)/(2*j+1))); ++- ++- y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours)); ++- condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS); ++- ++- y = _mm256_add_ps(y, _mm256_and_ps(_mm256_sub_ps(pio2, _mm256_mul_ps(y, ftwos)), condition)); ++- arccosine = y; ++- condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS); ++- arccosine = _mm256_sub_ps(arccosine, _mm256_and_ps(_mm256_mul_ps(arccosine, ftwos), condition)); ++- condition = _mm256_cmp_ps(d, fzeroes, _CMP_LT_OS); ++- arccosine = _mm256_add_ps(arccosine, _mm256_and_ps(pi, condition)); ++- ++- _mm256_storeu_ps(bPtr, arccosine); ++- aPtr += 8; ++- bPtr += 8; ++- } ++- ++- number = eighthPoints * 8; ++- for(;number < num_points; number++){ ++- *bPtr++ = acos(*aPtr++); ++- } +++ float* bPtr = bVector; +++ const float* aPtr = aVector; +++ +++ unsigned int number = 0; +++ unsigned int eighthPoints = num_points / 8; +++ int i, j; +++ +++ __m256 aVal, d, pi, pio2, x, y, z, arccosine; +++ __m256 fzeroes, fones, ftwos, ffours, condition; +++ +++ pi = _mm256_set1_ps(3.14159265358979323846); +++ pio2 = _mm256_set1_ps(3.14159265358979323846 / 2); +++ fzeroes = _mm256_setzero_ps(); +++ fones = _mm256_set1_ps(1.0); +++ ftwos = _mm256_set1_ps(2.0); +++ ffours = _mm256_set1_ps(4.0); +++ +++ for (; number < eighthPoints; number++) { +++ aVal = _mm256_loadu_ps(aPtr); +++ d = aVal; +++ aVal = _mm256_div_ps(_mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal), +++ _mm256_sub_ps(fones, aVal))), +++ aVal); +++ z = aVal; +++ condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS); +++ z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition)); +++ condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS); +++ x = _mm256_add_ps( +++ z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition)); +++ +++ for (i = 0; i < 2; i++) +++ x = _mm256_add_ps(x, +++ _mm256_sqrt_ps(_mm256_add_ps(fones, _mm256_mul_ps(x, x)))); +++ x = _mm256_div_ps(fones, x); +++ y = fzeroes; +++ for (j = ACOS_TERMS - 1; j >= 0; j--) +++ y = _mm256_add_ps(_mm256_mul_ps(y, _mm256_mul_ps(x, x)), +++ _mm256_set1_ps(pow(-1, j) / (2 * j + 1))); +++ +++ y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours)); +++ condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS); +++ +++ y = _mm256_add_ps( +++ y, _mm256_and_ps(_mm256_sub_ps(pio2, _mm256_mul_ps(y, ftwos)), condition)); +++ arccosine = y; +++ condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS); +++ arccosine = _mm256_sub_ps( +++ arccosine, _mm256_and_ps(_mm256_mul_ps(arccosine, ftwos), condition)); +++ condition = _mm256_cmp_ps(d, fzeroes, _CMP_LT_OS); +++ 
arccosine = _mm256_add_ps(arccosine, _mm256_and_ps(pi, condition)); +++ +++ _mm256_storeu_ps(bPtr, arccosine); +++ aPtr += 8; +++ bPtr += 8; +++ } +++ +++ number = eighthPoints * 8; +++ for (; number < num_points; number++) { +++ *bPtr++ = acos(*aPtr++); +++ } ++ } ++ ++ #endif /* LV_HAVE_AVX2 for unaligned */ ++@@ -406,60 +437,64 @@ volk_32f_acos_32f_u_avx(float* bVector, const float* aVector, unsigned int num_p ++ static inline void ++ volk_32f_acos_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int num_points) ++ { ++- float* bPtr = bVector; ++- const float* aPtr = aVector; ++- ++- unsigned int number = 0; ++- unsigned int quarterPoints = num_points / 4; ++- int i, j; ++- ++- __m128 aVal, d, pi, pio2, x, y, z, arccosine; ++- __m128 fzeroes, fones, ftwos, ffours, condition; ++- ++- pi = _mm_set1_ps(3.14159265358979323846); ++- pio2 = _mm_set1_ps(3.14159265358979323846/2); ++- fzeroes = _mm_setzero_ps(); ++- fones = _mm_set1_ps(1.0); ++- ftwos = _mm_set1_ps(2.0); ++- ffours = _mm_set1_ps(4.0); ++- ++- for(;number < quarterPoints; number++){ ++- aVal = _mm_loadu_ps(aPtr); ++- d = aVal; ++- aVal = _mm_div_ps(_mm_sqrt_ps(_mm_mul_ps(_mm_add_ps(fones, aVal), _mm_sub_ps(fones, aVal))), aVal); ++- z = aVal; ++- condition = _mm_cmplt_ps(z, fzeroes); ++- z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition)); ++- condition = _mm_cmplt_ps(z, fones); ++- x = _mm_add_ps(z, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z), condition)); ++- ++- for(i = 0; i < 2; i++) ++- x = _mm_add_ps(x, _mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x)))); ++- x = _mm_div_ps(fones, x); ++- y = fzeroes; ++- ++- for(j = ACOS_TERMS - 1; j >=0 ; j--) ++- y = _mm_add_ps(_mm_mul_ps(y, _mm_mul_ps(x, x)), _mm_set1_ps(pow(-1,j)/(2*j+1))); ++- ++- y = _mm_mul_ps(y, _mm_mul_ps(x, ffours)); ++- condition = _mm_cmpgt_ps(z, fones); ++- ++- y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, ftwos)), condition)); ++- arccosine = y; ++- condition = _mm_cmplt_ps(aVal, fzeroes); ++- arccosine = _mm_sub_ps(arccosine, _mm_and_ps(_mm_mul_ps(arccosine, ftwos), condition)); ++- condition = _mm_cmplt_ps(d, fzeroes); ++- arccosine = _mm_add_ps(arccosine, _mm_and_ps(pi, condition)); ++- ++- _mm_storeu_ps(bPtr, arccosine); ++- aPtr += 4; ++- bPtr += 4; ++- } ++- ++- number = quarterPoints * 4; ++- for(;number < num_points; number++){ ++- *bPtr++ = acosf(*aPtr++); ++- } +++ float* bPtr = bVector; +++ const float* aPtr = aVector; +++ +++ unsigned int number = 0; +++ unsigned int quarterPoints = num_points / 4; +++ int i, j; +++ +++ __m128 aVal, d, pi, pio2, x, y, z, arccosine; +++ __m128 fzeroes, fones, ftwos, ffours, condition; +++ +++ pi = _mm_set1_ps(3.14159265358979323846); +++ pio2 = _mm_set1_ps(3.14159265358979323846 / 2); +++ fzeroes = _mm_setzero_ps(); +++ fones = _mm_set1_ps(1.0); +++ ftwos = _mm_set1_ps(2.0); +++ ffours = _mm_set1_ps(4.0); +++ +++ for (; number < quarterPoints; number++) { +++ aVal = _mm_loadu_ps(aPtr); +++ d = aVal; +++ aVal = _mm_div_ps( +++ _mm_sqrt_ps(_mm_mul_ps(_mm_add_ps(fones, aVal), _mm_sub_ps(fones, aVal))), +++ aVal); +++ z = aVal; +++ condition = _mm_cmplt_ps(z, fzeroes); +++ z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition)); +++ condition = _mm_cmplt_ps(z, fones); +++ x = _mm_add_ps(z, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z), condition)); +++ +++ for (i = 0; i < 2; i++) +++ x = _mm_add_ps(x, _mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x)))); +++ x = _mm_div_ps(fones, x); +++ y = fzeroes; +++ +++ for (j = ACOS_TERMS - 1; j >= 0; j--) +++ y = 
_mm_add_ps(_mm_mul_ps(y, _mm_mul_ps(x, x)), +++ _mm_set1_ps(pow(-1, j) / (2 * j + 1))); +++ +++ y = _mm_mul_ps(y, _mm_mul_ps(x, ffours)); +++ condition = _mm_cmpgt_ps(z, fones); +++ +++ y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, ftwos)), condition)); +++ arccosine = y; +++ condition = _mm_cmplt_ps(aVal, fzeroes); +++ arccosine = +++ _mm_sub_ps(arccosine, _mm_and_ps(_mm_mul_ps(arccosine, ftwos), condition)); +++ condition = _mm_cmplt_ps(d, fzeroes); +++ arccosine = _mm_add_ps(arccosine, _mm_and_ps(pi, condition)); +++ +++ _mm_storeu_ps(bPtr, arccosine); +++ aPtr += 4; +++ bPtr += 4; +++ } +++ +++ number = quarterPoints * 4; +++ for (; number < num_points; number++) { +++ *bPtr++ = acosf(*aPtr++); +++ } ++ } ++ ++ #endif /* LV_HAVE_SSE4_1 for aligned */ ++@@ -469,14 +504,13 @@ volk_32f_acos_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int nu ++ static inline void ++ volk_32f_acos_32f_generic(float* bVector, const float* aVector, unsigned int num_points) ++ { ++- float* bPtr = bVector; ++- const float* aPtr = aVector; ++- unsigned int number = 0; ++- ++- for(number = 0; number < num_points; number++){ ++- *bPtr++ = acosf(*aPtr++); ++- } +++ float* bPtr = bVector; +++ const float* aPtr = aVector; +++ unsigned int number = 0; ++ +++ for (number = 0; number < num_points; number++) { +++ *bPtr++ = acosf(*aPtr++); +++ } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++diff --git a/kernels/volk/volk_32f_asin_32f.h b/kernels/volk/volk_32f_asin_32f.h ++index 864cfcf..946d382 100644 ++--- a/kernels/volk/volk_32f_asin_32f.h +++++ b/kernels/volk/volk_32f_asin_32f.h ++@@ -67,11 +67,12 @@ ++ * \endcode ++ */ ++ ++-#include ++-#include ++ #include +++#include +++#include ++ ++-/* This is the number of terms of Taylor series to evaluate, increase this for more accuracy*/ +++/* This is the number of terms of Taylor series to evaluate, increase this for more +++ * accuracy*/ ++ #define ASIN_TERMS 2 ++ ++ #ifndef INCLUDED_volk_32f_asin_32f_a_H ++@@ -80,60 +81,66 @@ ++ #if LV_HAVE_AVX2 && LV_HAVE_FMA ++ #include ++ ++-static inline void ++-volk_32f_asin_32f_a_avx2_fma(float* bVector, const float* aVector, unsigned int num_points) +++static inline void volk_32f_asin_32f_a_avx2_fma(float* bVector, +++ const float* aVector, +++ unsigned int num_points) ++ { ++- float* bPtr = bVector; ++- const float* aPtr = aVector; ++- ++- unsigned int number = 0; ++- unsigned int eighthPoints = num_points / 8; ++- int i, j; ++- ++- __m256 aVal, pio2, x, y, z, arcsine; ++- __m256 fzeroes, fones, ftwos, ffours, condition; ++- ++- pio2 = _mm256_set1_ps(3.14159265358979323846/2); ++- fzeroes = _mm256_setzero_ps(); ++- fones = _mm256_set1_ps(1.0); ++- ftwos = _mm256_set1_ps(2.0); ++- ffours = _mm256_set1_ps(4.0); ++- ++- for(;number < eighthPoints; number++){ ++- aVal = _mm256_load_ps(aPtr); ++- aVal = _mm256_div_ps(aVal, _mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal), _mm256_sub_ps(fones, aVal)))); ++- z = aVal; ++- condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS); ++- z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition)); ++- condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS); ++- x = _mm256_add_ps(z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition)); ++- ++- for(i = 0; i < 2; i++){ ++- x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_fmadd_ps(x,x,fones))); +++ float* bPtr = bVector; +++ const float* aPtr = aVector; +++ +++ unsigned int number = 0; +++ unsigned int eighthPoints = num_points / 8; +++ int i, j; +++ +++ __m256 aVal, pio2, x, y, z, arcsine; +++ __m256 
fzeroes, fones, ftwos, ffours, condition; +++ +++ pio2 = _mm256_set1_ps(3.14159265358979323846 / 2); +++ fzeroes = _mm256_setzero_ps(); +++ fones = _mm256_set1_ps(1.0); +++ ftwos = _mm256_set1_ps(2.0); +++ ffours = _mm256_set1_ps(4.0); +++ +++ for (; number < eighthPoints; number++) { +++ aVal = _mm256_load_ps(aPtr); +++ aVal = _mm256_div_ps(aVal, +++ _mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal), +++ _mm256_sub_ps(fones, aVal)))); +++ z = aVal; +++ condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS); +++ z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition)); +++ condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS); +++ x = _mm256_add_ps( +++ z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition)); +++ +++ for (i = 0; i < 2; i++) { +++ x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_fmadd_ps(x, x, fones))); +++ } +++ x = _mm256_div_ps(fones, x); +++ y = fzeroes; +++ for (j = ASIN_TERMS - 1; j >= 0; j--) { +++ y = _mm256_fmadd_ps( +++ y, _mm256_mul_ps(x, x), _mm256_set1_ps(pow(-1, j) / (2 * j + 1))); +++ } +++ +++ y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours)); +++ condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS); +++ +++ y = _mm256_add_ps(y, _mm256_and_ps(_mm256_fnmadd_ps(y, ftwos, pio2), condition)); +++ arcsine = y; +++ condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS); +++ arcsine = _mm256_sub_ps(arcsine, +++ _mm256_and_ps(_mm256_mul_ps(arcsine, ftwos), condition)); +++ +++ _mm256_store_ps(bPtr, arcsine); +++ aPtr += 8; +++ bPtr += 8; ++ } ++- x = _mm256_div_ps(fones, x); ++- y = fzeroes; ++- for(j = ASIN_TERMS - 1; j >=0 ; j--){ ++- y = _mm256_fmadd_ps(y, _mm256_mul_ps(x, x), _mm256_set1_ps(pow(-1,j)/(2*j+1))); ++- } ++- ++- y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours)); ++- condition = _mm256_cmp_ps(z, fones,_CMP_GT_OS); ++- ++- y = _mm256_add_ps(y, _mm256_and_ps(_mm256_fnmadd_ps(y,ftwos,pio2), condition)); ++- arcsine = y; ++- condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS); ++- arcsine = _mm256_sub_ps(arcsine, _mm256_and_ps(_mm256_mul_ps(arcsine, ftwos), condition)); ++ ++- _mm256_store_ps(bPtr, arcsine); ++- aPtr += 8; ++- bPtr += 8; ++- } ++- ++- number = eighthPoints * 8; ++- for(;number < num_points; number++){ ++- *bPtr++ = asin(*aPtr++); ++- } +++ number = eighthPoints * 8; +++ for (; number < num_points; number++) { +++ *bPtr++ = asin(*aPtr++); +++ } ++ } ++ ++ #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for aligned */ ++@@ -145,57 +152,64 @@ volk_32f_asin_32f_a_avx2_fma(float* bVector, const float* aVector, unsigned int ++ static inline void ++ volk_32f_asin_32f_a_avx(float* bVector, const float* aVector, unsigned int num_points) ++ { ++- float* bPtr = bVector; ++- const float* aPtr = aVector; ++- ++- unsigned int number = 0; ++- unsigned int eighthPoints = num_points / 8; ++- int i, j; ++- ++- __m256 aVal, pio2, x, y, z, arcsine; ++- __m256 fzeroes, fones, ftwos, ffours, condition; ++- ++- pio2 = _mm256_set1_ps(3.14159265358979323846/2); ++- fzeroes = _mm256_setzero_ps(); ++- fones = _mm256_set1_ps(1.0); ++- ftwos = _mm256_set1_ps(2.0); ++- ffours = _mm256_set1_ps(4.0); ++- ++- for(;number < eighthPoints; number++){ ++- aVal = _mm256_load_ps(aPtr); ++- aVal = _mm256_div_ps(aVal, _mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal), _mm256_sub_ps(fones, aVal)))); ++- z = aVal; ++- condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS); ++- z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition)); ++- condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS); ++- x = _mm256_add_ps(z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), 
condition)); ++- ++- for(i = 0; i < 2; i++){ ++- x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_add_ps(fones, _mm256_mul_ps(x, x)))); +++ float* bPtr = bVector; +++ const float* aPtr = aVector; +++ +++ unsigned int number = 0; +++ unsigned int eighthPoints = num_points / 8; +++ int i, j; +++ +++ __m256 aVal, pio2, x, y, z, arcsine; +++ __m256 fzeroes, fones, ftwos, ffours, condition; +++ +++ pio2 = _mm256_set1_ps(3.14159265358979323846 / 2); +++ fzeroes = _mm256_setzero_ps(); +++ fones = _mm256_set1_ps(1.0); +++ ftwos = _mm256_set1_ps(2.0); +++ ffours = _mm256_set1_ps(4.0); +++ +++ for (; number < eighthPoints; number++) { +++ aVal = _mm256_load_ps(aPtr); +++ aVal = _mm256_div_ps(aVal, +++ _mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal), +++ _mm256_sub_ps(fones, aVal)))); +++ z = aVal; +++ condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS); +++ z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition)); +++ condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS); +++ x = _mm256_add_ps( +++ z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition)); +++ +++ for (i = 0; i < 2; i++) { +++ x = _mm256_add_ps(x, +++ _mm256_sqrt_ps(_mm256_add_ps(fones, _mm256_mul_ps(x, x)))); +++ } +++ x = _mm256_div_ps(fones, x); +++ y = fzeroes; +++ for (j = ASIN_TERMS - 1; j >= 0; j--) { +++ y = _mm256_add_ps(_mm256_mul_ps(y, _mm256_mul_ps(x, x)), +++ _mm256_set1_ps(pow(-1, j) / (2 * j + 1))); +++ } +++ +++ y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours)); +++ condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS); +++ +++ y = _mm256_add_ps( +++ y, _mm256_and_ps(_mm256_sub_ps(pio2, _mm256_mul_ps(y, ftwos)), condition)); +++ arcsine = y; +++ condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS); +++ arcsine = _mm256_sub_ps(arcsine, +++ _mm256_and_ps(_mm256_mul_ps(arcsine, ftwos), condition)); +++ +++ _mm256_store_ps(bPtr, arcsine); +++ aPtr += 8; +++ bPtr += 8; ++ } ++- x = _mm256_div_ps(fones, x); ++- y = fzeroes; ++- for(j = ASIN_TERMS - 1; j >=0 ; j--){ ++- y = _mm256_add_ps(_mm256_mul_ps(y, _mm256_mul_ps(x, x)), _mm256_set1_ps(pow(-1,j)/(2*j+1))); ++- } ++- ++- y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours)); ++- condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS); ++- ++- y = _mm256_add_ps(y, _mm256_and_ps(_mm256_sub_ps(pio2, _mm256_mul_ps(y, ftwos)), condition)); ++- arcsine = y; ++- condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS); ++- arcsine = _mm256_sub_ps(arcsine, _mm256_and_ps(_mm256_mul_ps(arcsine, ftwos), condition)); ++ ++- _mm256_store_ps(bPtr, arcsine); ++- aPtr += 8; ++- bPtr += 8; ++- } ++- ++- number = eighthPoints * 8; ++- for(;number < num_points; number++){ ++- *bPtr++ = asin(*aPtr++); ++- } +++ number = eighthPoints * 8; +++ for (; number < num_points; number++) { +++ *bPtr++ = asin(*aPtr++); +++ } ++ } ++ ++ #endif /* LV_HAVE_AVX for aligned */ ++@@ -206,57 +220,60 @@ volk_32f_asin_32f_a_avx(float* bVector, const float* aVector, unsigned int num_p ++ static inline void ++ volk_32f_asin_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int num_points) ++ { ++- float* bPtr = bVector; ++- const float* aPtr = aVector; ++- ++- unsigned int number = 0; ++- unsigned int quarterPoints = num_points / 4; ++- int i, j; ++- ++- __m128 aVal, pio2, x, y, z, arcsine; ++- __m128 fzeroes, fones, ftwos, ffours, condition; ++- ++- pio2 = _mm_set1_ps(3.14159265358979323846/2); ++- fzeroes = _mm_setzero_ps(); ++- fones = _mm_set1_ps(1.0); ++- ftwos = _mm_set1_ps(2.0); ++- ffours = _mm_set1_ps(4.0); ++- ++- for(;number < quarterPoints; number++){ ++- aVal = _mm_load_ps(aPtr); ++- aVal = 
_mm_div_ps(aVal, _mm_sqrt_ps(_mm_mul_ps(_mm_add_ps(fones, aVal), _mm_sub_ps(fones, aVal)))); ++- z = aVal; ++- condition = _mm_cmplt_ps(z, fzeroes); ++- z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition)); ++- condition = _mm_cmplt_ps(z, fones); ++- x = _mm_add_ps(z, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z), condition)); ++- ++- for(i = 0; i < 2; i++){ ++- x = _mm_add_ps(x, _mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x)))); +++ float* bPtr = bVector; +++ const float* aPtr = aVector; +++ +++ unsigned int number = 0; +++ unsigned int quarterPoints = num_points / 4; +++ int i, j; +++ +++ __m128 aVal, pio2, x, y, z, arcsine; +++ __m128 fzeroes, fones, ftwos, ffours, condition; +++ +++ pio2 = _mm_set1_ps(3.14159265358979323846 / 2); +++ fzeroes = _mm_setzero_ps(); +++ fones = _mm_set1_ps(1.0); +++ ftwos = _mm_set1_ps(2.0); +++ ffours = _mm_set1_ps(4.0); +++ +++ for (; number < quarterPoints; number++) { +++ aVal = _mm_load_ps(aPtr); +++ aVal = _mm_div_ps( +++ aVal, +++ _mm_sqrt_ps(_mm_mul_ps(_mm_add_ps(fones, aVal), _mm_sub_ps(fones, aVal)))); +++ z = aVal; +++ condition = _mm_cmplt_ps(z, fzeroes); +++ z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition)); +++ condition = _mm_cmplt_ps(z, fones); +++ x = _mm_add_ps(z, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z), condition)); +++ +++ for (i = 0; i < 2; i++) { +++ x = _mm_add_ps(x, _mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x)))); +++ } +++ x = _mm_div_ps(fones, x); +++ y = fzeroes; +++ for (j = ASIN_TERMS - 1; j >= 0; j--) { +++ y = _mm_add_ps(_mm_mul_ps(y, _mm_mul_ps(x, x)), +++ _mm_set1_ps(pow(-1, j) / (2 * j + 1))); +++ } +++ +++ y = _mm_mul_ps(y, _mm_mul_ps(x, ffours)); +++ condition = _mm_cmpgt_ps(z, fones); +++ +++ y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, ftwos)), condition)); +++ arcsine = y; +++ condition = _mm_cmplt_ps(aVal, fzeroes); +++ arcsine = _mm_sub_ps(arcsine, _mm_and_ps(_mm_mul_ps(arcsine, ftwos), condition)); +++ +++ _mm_store_ps(bPtr, arcsine); +++ aPtr += 4; +++ bPtr += 4; ++ } ++- x = _mm_div_ps(fones, x); ++- y = fzeroes; ++- for(j = ASIN_TERMS - 1; j >=0 ; j--){ ++- y = _mm_add_ps(_mm_mul_ps(y, _mm_mul_ps(x, x)), _mm_set1_ps(pow(-1,j)/(2*j+1))); ++- } ++- ++- y = _mm_mul_ps(y, _mm_mul_ps(x, ffours)); ++- condition = _mm_cmpgt_ps(z, fones); ++- ++- y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, ftwos)), condition)); ++- arcsine = y; ++- condition = _mm_cmplt_ps(aVal, fzeroes); ++- arcsine = _mm_sub_ps(arcsine, _mm_and_ps(_mm_mul_ps(arcsine, ftwos), condition)); ++- ++- _mm_store_ps(bPtr, arcsine); ++- aPtr += 4; ++- bPtr += 4; ++- } ++ ++- number = quarterPoints * 4; ++- for(;number < num_points; number++){ ++- *bPtr++ = asinf(*aPtr++); ++- } +++ number = quarterPoints * 4; +++ for (; number < num_points; number++) { +++ *bPtr++ = asinf(*aPtr++); +++ } ++ } ++ ++ #endif /* LV_HAVE_SSE4_1 for aligned */ ++@@ -269,60 +286,66 @@ volk_32f_asin_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int nu ++ #if LV_HAVE_AVX2 && LV_HAVE_FMA ++ #include ++ ++-static inline void ++-volk_32f_asin_32f_u_avx2_fma(float* bVector, const float* aVector, unsigned int num_points) +++static inline void volk_32f_asin_32f_u_avx2_fma(float* bVector, +++ const float* aVector, +++ unsigned int num_points) ++ { ++- float* bPtr = bVector; ++- const float* aPtr = aVector; ++- ++- unsigned int number = 0; ++- unsigned int eighthPoints = num_points / 8; ++- int i, j; ++- ++- __m256 aVal, pio2, x, y, z, arcsine; ++- __m256 fzeroes, fones, ftwos, ffours, condition; ++- ++- pio2 = 
_mm256_set1_ps(3.14159265358979323846/2); ++- fzeroes = _mm256_setzero_ps(); ++- fones = _mm256_set1_ps(1.0); ++- ftwos = _mm256_set1_ps(2.0); ++- ffours = _mm256_set1_ps(4.0); ++- ++- for(;number < eighthPoints; number++){ ++- aVal = _mm256_loadu_ps(aPtr); ++- aVal = _mm256_div_ps(aVal, _mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal), _mm256_sub_ps(fones, aVal)))); ++- z = aVal; ++- condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS); ++- z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition)); ++- condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS); ++- x = _mm256_add_ps(z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition)); ++- ++- for(i = 0; i < 2; i++){ ++- x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_fmadd_ps(x,x,fones))); ++- } ++- x = _mm256_div_ps(fones, x); ++- y = fzeroes; ++- for(j = ASIN_TERMS - 1; j >=0 ; j--){ ++- y = _mm256_fmadd_ps(y, _mm256_mul_ps(x, x), _mm256_set1_ps(pow(-1,j)/(2*j+1))); +++ float* bPtr = bVector; +++ const float* aPtr = aVector; +++ +++ unsigned int number = 0; +++ unsigned int eighthPoints = num_points / 8; +++ int i, j; +++ +++ __m256 aVal, pio2, x, y, z, arcsine; +++ __m256 fzeroes, fones, ftwos, ffours, condition; +++ +++ pio2 = _mm256_set1_ps(3.14159265358979323846 / 2); +++ fzeroes = _mm256_setzero_ps(); +++ fones = _mm256_set1_ps(1.0); +++ ftwos = _mm256_set1_ps(2.0); +++ ffours = _mm256_set1_ps(4.0); +++ +++ for (; number < eighthPoints; number++) { +++ aVal = _mm256_loadu_ps(aPtr); +++ aVal = _mm256_div_ps(aVal, +++ _mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal), +++ _mm256_sub_ps(fones, aVal)))); +++ z = aVal; +++ condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS); +++ z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition)); +++ condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS); +++ x = _mm256_add_ps( +++ z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition)); +++ +++ for (i = 0; i < 2; i++) { +++ x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_fmadd_ps(x, x, fones))); +++ } +++ x = _mm256_div_ps(fones, x); +++ y = fzeroes; +++ for (j = ASIN_TERMS - 1; j >= 0; j--) { +++ y = _mm256_fmadd_ps( +++ y, _mm256_mul_ps(x, x), _mm256_set1_ps(pow(-1, j) / (2 * j + 1))); +++ } +++ +++ y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours)); +++ condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS); +++ +++ y = _mm256_add_ps(y, _mm256_and_ps(_mm256_fnmadd_ps(y, ftwos, pio2), condition)); +++ arcsine = y; +++ condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS); +++ arcsine = _mm256_sub_ps(arcsine, +++ _mm256_and_ps(_mm256_mul_ps(arcsine, ftwos), condition)); +++ +++ _mm256_storeu_ps(bPtr, arcsine); +++ aPtr += 8; +++ bPtr += 8; ++ } ++ ++- y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours)); ++- condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS); ++- ++- y = _mm256_add_ps(y, _mm256_and_ps(_mm256_fnmadd_ps(y,ftwos,pio2), condition)); ++- arcsine = y; ++- condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS); ++- arcsine = _mm256_sub_ps(arcsine, _mm256_and_ps(_mm256_mul_ps(arcsine, ftwos), condition)); ++- ++- _mm256_storeu_ps(bPtr, arcsine); ++- aPtr += 8; ++- bPtr += 8; ++- } ++- ++- number = eighthPoints * 8; ++- for(;number < num_points; number++){ ++- *bPtr++ = asin(*aPtr++); ++- } +++ number = eighthPoints * 8; +++ for (; number < num_points; number++) { +++ *bPtr++ = asin(*aPtr++); +++ } ++ } ++ ++ #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for unaligned */ ++@@ -334,57 +357,64 @@ volk_32f_asin_32f_u_avx2_fma(float* bVector, const float* aVector, unsigned int ++ static inline void ++ 
volk_32f_asin_32f_u_avx(float* bVector, const float* aVector, unsigned int num_points) ++ { ++- float* bPtr = bVector; ++- const float* aPtr = aVector; ++- ++- unsigned int number = 0; ++- unsigned int eighthPoints = num_points / 8; ++- int i, j; ++- ++- __m256 aVal, pio2, x, y, z, arcsine; ++- __m256 fzeroes, fones, ftwos, ffours, condition; ++- ++- pio2 = _mm256_set1_ps(3.14159265358979323846/2); ++- fzeroes = _mm256_setzero_ps(); ++- fones = _mm256_set1_ps(1.0); ++- ftwos = _mm256_set1_ps(2.0); ++- ffours = _mm256_set1_ps(4.0); ++- ++- for(;number < eighthPoints; number++){ ++- aVal = _mm256_loadu_ps(aPtr); ++- aVal = _mm256_div_ps(aVal, _mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal), _mm256_sub_ps(fones, aVal)))); ++- z = aVal; ++- condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS); ++- z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition)); ++- condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS); ++- x = _mm256_add_ps(z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition)); ++- ++- for(i = 0; i < 2; i++){ ++- x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_add_ps(fones, _mm256_mul_ps(x, x)))); +++ float* bPtr = bVector; +++ const float* aPtr = aVector; +++ +++ unsigned int number = 0; +++ unsigned int eighthPoints = num_points / 8; +++ int i, j; +++ +++ __m256 aVal, pio2, x, y, z, arcsine; +++ __m256 fzeroes, fones, ftwos, ffours, condition; +++ +++ pio2 = _mm256_set1_ps(3.14159265358979323846 / 2); +++ fzeroes = _mm256_setzero_ps(); +++ fones = _mm256_set1_ps(1.0); +++ ftwos = _mm256_set1_ps(2.0); +++ ffours = _mm256_set1_ps(4.0); +++ +++ for (; number < eighthPoints; number++) { +++ aVal = _mm256_loadu_ps(aPtr); +++ aVal = _mm256_div_ps(aVal, +++ _mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal), +++ _mm256_sub_ps(fones, aVal)))); +++ z = aVal; +++ condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS); +++ z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition)); +++ condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS); +++ x = _mm256_add_ps( +++ z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition)); +++ +++ for (i = 0; i < 2; i++) { +++ x = _mm256_add_ps(x, +++ _mm256_sqrt_ps(_mm256_add_ps(fones, _mm256_mul_ps(x, x)))); +++ } +++ x = _mm256_div_ps(fones, x); +++ y = fzeroes; +++ for (j = ASIN_TERMS - 1; j >= 0; j--) { +++ y = _mm256_add_ps(_mm256_mul_ps(y, _mm256_mul_ps(x, x)), +++ _mm256_set1_ps(pow(-1, j) / (2 * j + 1))); +++ } +++ +++ y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours)); +++ condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS); +++ +++ y = _mm256_add_ps( +++ y, _mm256_and_ps(_mm256_sub_ps(pio2, _mm256_mul_ps(y, ftwos)), condition)); +++ arcsine = y; +++ condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS); +++ arcsine = _mm256_sub_ps(arcsine, +++ _mm256_and_ps(_mm256_mul_ps(arcsine, ftwos), condition)); +++ +++ _mm256_storeu_ps(bPtr, arcsine); +++ aPtr += 8; +++ bPtr += 8; ++ } ++- x = _mm256_div_ps(fones, x); ++- y = fzeroes; ++- for(j = ASIN_TERMS - 1; j >=0 ; j--){ ++- y = _mm256_add_ps(_mm256_mul_ps(y, _mm256_mul_ps(x, x)), _mm256_set1_ps(pow(-1,j)/(2*j+1))); ++- } ++- ++- y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours)); ++- condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS); ++ ++- y = _mm256_add_ps(y, _mm256_and_ps(_mm256_sub_ps(pio2, _mm256_mul_ps(y, ftwos)), condition)); ++- arcsine = y; ++- condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS); ++- arcsine = _mm256_sub_ps(arcsine, _mm256_and_ps(_mm256_mul_ps(arcsine, ftwos), condition)); ++- ++- _mm256_storeu_ps(bPtr, arcsine); ++- aPtr += 8; ++- bPtr += 8; 
++- } ++- ++- number = eighthPoints * 8; ++- for(;number < num_points; number++){ ++- *bPtr++ = asin(*aPtr++); ++- } +++ number = eighthPoints * 8; +++ for (; number < num_points; number++) { +++ *bPtr++ = asin(*aPtr++); +++ } ++ } ++ ++ #endif /* LV_HAVE_AVX for unaligned */ ++@@ -396,57 +426,60 @@ volk_32f_asin_32f_u_avx(float* bVector, const float* aVector, unsigned int num_p ++ static inline void ++ volk_32f_asin_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int num_points) ++ { ++- float* bPtr = bVector; ++- const float* aPtr = aVector; ++- ++- unsigned int number = 0; ++- unsigned int quarterPoints = num_points / 4; ++- int i, j; ++- ++- __m128 aVal, pio2, x, y, z, arcsine; ++- __m128 fzeroes, fones, ftwos, ffours, condition; ++- ++- pio2 = _mm_set1_ps(3.14159265358979323846/2); ++- fzeroes = _mm_setzero_ps(); ++- fones = _mm_set1_ps(1.0); ++- ftwos = _mm_set1_ps(2.0); ++- ffours = _mm_set1_ps(4.0); ++- ++- for(;number < quarterPoints; number++){ ++- aVal = _mm_loadu_ps(aPtr); ++- aVal = _mm_div_ps(aVal, _mm_sqrt_ps(_mm_mul_ps(_mm_add_ps(fones, aVal), _mm_sub_ps(fones, aVal)))); ++- z = aVal; ++- condition = _mm_cmplt_ps(z, fzeroes); ++- z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition)); ++- condition = _mm_cmplt_ps(z, fones); ++- x = _mm_add_ps(z, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z), condition)); ++- ++- for(i = 0; i < 2; i++){ ++- x = _mm_add_ps(x, _mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x)))); +++ float* bPtr = bVector; +++ const float* aPtr = aVector; +++ +++ unsigned int number = 0; +++ unsigned int quarterPoints = num_points / 4; +++ int i, j; +++ +++ __m128 aVal, pio2, x, y, z, arcsine; +++ __m128 fzeroes, fones, ftwos, ffours, condition; +++ +++ pio2 = _mm_set1_ps(3.14159265358979323846 / 2); +++ fzeroes = _mm_setzero_ps(); +++ fones = _mm_set1_ps(1.0); +++ ftwos = _mm_set1_ps(2.0); +++ ffours = _mm_set1_ps(4.0); +++ +++ for (; number < quarterPoints; number++) { +++ aVal = _mm_loadu_ps(aPtr); +++ aVal = _mm_div_ps( +++ aVal, +++ _mm_sqrt_ps(_mm_mul_ps(_mm_add_ps(fones, aVal), _mm_sub_ps(fones, aVal)))); +++ z = aVal; +++ condition = _mm_cmplt_ps(z, fzeroes); +++ z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition)); +++ condition = _mm_cmplt_ps(z, fones); +++ x = _mm_add_ps(z, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z), condition)); +++ +++ for (i = 0; i < 2; i++) { +++ x = _mm_add_ps(x, _mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x)))); +++ } +++ x = _mm_div_ps(fones, x); +++ y = fzeroes; +++ for (j = ASIN_TERMS - 1; j >= 0; j--) { +++ y = _mm_add_ps(_mm_mul_ps(y, _mm_mul_ps(x, x)), +++ _mm_set1_ps(pow(-1, j) / (2 * j + 1))); +++ } +++ +++ y = _mm_mul_ps(y, _mm_mul_ps(x, ffours)); +++ condition = _mm_cmpgt_ps(z, fones); +++ +++ y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, ftwos)), condition)); +++ arcsine = y; +++ condition = _mm_cmplt_ps(aVal, fzeroes); +++ arcsine = _mm_sub_ps(arcsine, _mm_and_ps(_mm_mul_ps(arcsine, ftwos), condition)); +++ +++ _mm_storeu_ps(bPtr, arcsine); +++ aPtr += 4; +++ bPtr += 4; ++ } ++- x = _mm_div_ps(fones, x); ++- y = fzeroes; ++- for(j = ASIN_TERMS - 1; j >=0 ; j--){ ++- y = _mm_add_ps(_mm_mul_ps(y, _mm_mul_ps(x, x)), _mm_set1_ps(pow(-1,j)/(2*j+1))); ++- } ++- ++- y = _mm_mul_ps(y, _mm_mul_ps(x, ffours)); ++- condition = _mm_cmpgt_ps(z, fones); ++ ++- y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, ftwos)), condition)); ++- arcsine = y; ++- condition = _mm_cmplt_ps(aVal, fzeroes); ++- arcsine = _mm_sub_ps(arcsine, _mm_and_ps(_mm_mul_ps(arcsine, ftwos), 
condition)); ++- ++- _mm_storeu_ps(bPtr, arcsine); ++- aPtr += 4; ++- bPtr += 4; ++- } ++- ++- number = quarterPoints * 4; ++- for(;number < num_points; number++){ ++- *bPtr++ = asinf(*aPtr++); ++- } +++ number = quarterPoints * 4; +++ for (; number < num_points; number++) { +++ *bPtr++ = asinf(*aPtr++); +++ } ++ } ++ ++ #endif /* LV_HAVE_SSE4_1 for unaligned */ ++@@ -456,13 +489,13 @@ volk_32f_asin_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int nu ++ static inline void ++ volk_32f_asin_32f_u_generic(float* bVector, const float* aVector, unsigned int num_points) ++ { ++- float* bPtr = bVector; ++- const float* aPtr = aVector; ++- unsigned int number = 0; +++ float* bPtr = bVector; +++ const float* aPtr = aVector; +++ unsigned int number = 0; ++ ++- for(number = 0; number < num_points; number++){ ++- *bPtr++ = asinf(*aPtr++); ++- } +++ for (number = 0; number < num_points; number++) { +++ *bPtr++ = asinf(*aPtr++); +++ } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++diff --git a/kernels/volk/volk_32f_atan_32f.h b/kernels/volk/volk_32f_atan_32f.h ++index 3496f0e..6652ee8 100644 ++--- a/kernels/volk/volk_32f_atan_32f.h +++++ b/kernels/volk/volk_32f_atan_32f.h ++@@ -67,11 +67,12 @@ ++ * \endcode ++ */ ++ ++-#include ++-#include ++ #include +++#include +++#include ++ ++-/* This is the number of terms of Taylor series to evaluate, increase this for more accuracy*/ +++/* This is the number of terms of Taylor series to evaluate, increase this for more +++ * accuracy*/ ++ #define TERMS 2 ++ ++ #ifndef INCLUDED_volk_32f_atan_32f_a_H ++@@ -80,59 +81,63 @@ ++ #if LV_HAVE_AVX2 && LV_HAVE_FMA ++ #include ++ ++-static inline void ++-volk_32f_atan_32f_a_avx2_fma(float* bVector, const float* aVector, unsigned int num_points) +++static inline void volk_32f_atan_32f_a_avx2_fma(float* bVector, +++ const float* aVector, +++ unsigned int num_points) ++ { ++- float* bPtr = bVector; ++- const float* aPtr = aVector; ++- ++- unsigned int number = 0; ++- unsigned int eighthPoints = num_points / 8; ++- int i, j; ++- ++- __m256 aVal, pio2, x, y, z, arctangent; ++- __m256 fzeroes, fones, ftwos, ffours, condition; ++- ++- pio2 = _mm256_set1_ps(3.14159265358979323846/2); ++- fzeroes = _mm256_setzero_ps(); ++- fones = _mm256_set1_ps(1.0); ++- ftwos = _mm256_set1_ps(2.0); ++- ffours = _mm256_set1_ps(4.0); ++- ++- for(;number < eighthPoints; number++){ ++- aVal = _mm256_load_ps(aPtr); ++- z = aVal; ++- condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS); ++- z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition)); ++- condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS); ++- x = _mm256_add_ps(z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition)); ++- ++- for(i = 0; i < 2; i++){ ++- x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_fmadd_ps(x,x,fones))); +++ float* bPtr = bVector; +++ const float* aPtr = aVector; +++ +++ unsigned int number = 0; +++ unsigned int eighthPoints = num_points / 8; +++ int i, j; +++ +++ __m256 aVal, pio2, x, y, z, arctangent; +++ __m256 fzeroes, fones, ftwos, ffours, condition; +++ +++ pio2 = _mm256_set1_ps(3.14159265358979323846 / 2); +++ fzeroes = _mm256_setzero_ps(); +++ fones = _mm256_set1_ps(1.0); +++ ftwos = _mm256_set1_ps(2.0); +++ ffours = _mm256_set1_ps(4.0); +++ +++ for (; number < eighthPoints; number++) { +++ aVal = _mm256_load_ps(aPtr); +++ z = aVal; +++ condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS); +++ z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition)); +++ condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS); +++ x = 
_mm256_add_ps( +++ z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition)); +++ +++ for (i = 0; i < 2; i++) { +++ x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_fmadd_ps(x, x, fones))); +++ } +++ x = _mm256_div_ps(fones, x); +++ y = fzeroes; +++ for (j = TERMS - 1; j >= 0; j--) { +++ y = _mm256_fmadd_ps( +++ y, _mm256_mul_ps(x, x), _mm256_set1_ps(pow(-1, j) / (2 * j + 1))); +++ } +++ +++ y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours)); +++ condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS); +++ +++ y = _mm256_add_ps(y, _mm256_and_ps(_mm256_fnmadd_ps(y, ftwos, pio2), condition)); +++ arctangent = y; +++ condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS); +++ arctangent = _mm256_sub_ps( +++ arctangent, _mm256_and_ps(_mm256_mul_ps(arctangent, ftwos), condition)); +++ +++ _mm256_store_ps(bPtr, arctangent); +++ aPtr += 8; +++ bPtr += 8; ++ } ++- x = _mm256_div_ps(fones, x); ++- y = fzeroes; ++- for(j = TERMS - 1; j >=0 ; j--){ ++- y = _mm256_fmadd_ps(y, _mm256_mul_ps(x, x), _mm256_set1_ps(pow(-1,j)/(2*j+1))); ++- } ++- ++- y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours)); ++- condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS); ++- ++- y = _mm256_add_ps(y, _mm256_and_ps(_mm256_fnmadd_ps(y,ftwos,pio2), condition)); ++- arctangent = y; ++- condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS); ++- arctangent = _mm256_sub_ps(arctangent, _mm256_and_ps(_mm256_mul_ps(arctangent, ftwos), condition)); ++- ++- _mm256_store_ps(bPtr, arctangent); ++- aPtr += 8; ++- bPtr += 8; ++- } ++ ++- number = eighthPoints * 8; ++- for(;number < num_points; number++){ ++- *bPtr++ = atan(*aPtr++); ++- } +++ number = eighthPoints * 8; +++ for (; number < num_points; number++) { +++ *bPtr++ = atan(*aPtr++); +++ } ++ } ++ ++ #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for aligned */ ++@@ -144,56 +149,61 @@ volk_32f_atan_32f_a_avx2_fma(float* bVector, const float* aVector, unsigned int ++ static inline void ++ volk_32f_atan_32f_a_avx(float* bVector, const float* aVector, unsigned int num_points) ++ { ++- float* bPtr = bVector; ++- const float* aPtr = aVector; ++- ++- unsigned int number = 0; ++- unsigned int eighthPoints = num_points / 8; ++- int i, j; ++- ++- __m256 aVal, pio2, x, y, z, arctangent; ++- __m256 fzeroes, fones, ftwos, ffours, condition; ++- ++- pio2 = _mm256_set1_ps(3.14159265358979323846/2); ++- fzeroes = _mm256_setzero_ps(); ++- fones = _mm256_set1_ps(1.0); ++- ftwos = _mm256_set1_ps(2.0); ++- ffours = _mm256_set1_ps(4.0); ++- ++- for(;number < eighthPoints; number++){ ++- aVal = _mm256_load_ps(aPtr); ++- z = aVal; ++- condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS); ++- z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition)); ++- condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS); ++- x = _mm256_add_ps(z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition)); ++- ++- for(i = 0; i < 2; i++){ ++- x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_add_ps(fones, _mm256_mul_ps(x, x)))); ++- } ++- x = _mm256_div_ps(fones, x); ++- y = fzeroes; ++- for(j = TERMS - 1; j >=0 ; j--){ ++- y = _mm256_add_ps(_mm256_mul_ps(y, _mm256_mul_ps(x, x)), _mm256_set1_ps(pow(-1,j)/(2*j+1))); +++ float* bPtr = bVector; +++ const float* aPtr = aVector; +++ +++ unsigned int number = 0; +++ unsigned int eighthPoints = num_points / 8; +++ int i, j; +++ +++ __m256 aVal, pio2, x, y, z, arctangent; +++ __m256 fzeroes, fones, ftwos, ffours, condition; +++ +++ pio2 = _mm256_set1_ps(3.14159265358979323846 / 2); +++ fzeroes = _mm256_setzero_ps(); +++ fones = _mm256_set1_ps(1.0); +++ ftwos = _mm256_set1_ps(2.0); +++ ffours 
= _mm256_set1_ps(4.0); +++ +++ for (; number < eighthPoints; number++) { +++ aVal = _mm256_load_ps(aPtr); +++ z = aVal; +++ condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS); +++ z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition)); +++ condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS); +++ x = _mm256_add_ps( +++ z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition)); +++ +++ for (i = 0; i < 2; i++) { +++ x = _mm256_add_ps(x, +++ _mm256_sqrt_ps(_mm256_add_ps(fones, _mm256_mul_ps(x, x)))); +++ } +++ x = _mm256_div_ps(fones, x); +++ y = fzeroes; +++ for (j = TERMS - 1; j >= 0; j--) { +++ y = _mm256_add_ps(_mm256_mul_ps(y, _mm256_mul_ps(x, x)), +++ _mm256_set1_ps(pow(-1, j) / (2 * j + 1))); +++ } +++ +++ y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours)); +++ condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS); +++ +++ y = _mm256_add_ps( +++ y, _mm256_and_ps(_mm256_sub_ps(pio2, _mm256_mul_ps(y, ftwos)), condition)); +++ arctangent = y; +++ condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS); +++ arctangent = _mm256_sub_ps( +++ arctangent, _mm256_and_ps(_mm256_mul_ps(arctangent, ftwos), condition)); +++ +++ _mm256_store_ps(bPtr, arctangent); +++ aPtr += 8; +++ bPtr += 8; ++ } ++ ++- y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours)); ++- condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS); ++- ++- y = _mm256_add_ps(y, _mm256_and_ps(_mm256_sub_ps(pio2, _mm256_mul_ps(y, ftwos)), condition)); ++- arctangent = y; ++- condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS); ++- arctangent = _mm256_sub_ps(arctangent, _mm256_and_ps(_mm256_mul_ps(arctangent, ftwos), condition)); ++- ++- _mm256_store_ps(bPtr, arctangent); ++- aPtr += 8; ++- bPtr += 8; ++- } ++- ++- number = eighthPoints * 8; ++- for(;number < num_points; number++){ ++- *bPtr++ = atan(*aPtr++); ++- } +++ number = eighthPoints * 8; +++ for (; number < num_points; number++) { +++ *bPtr++ = atan(*aPtr++); +++ } ++ } ++ ++ #endif /* LV_HAVE_AVX for aligned */ ++@@ -204,56 +214,58 @@ volk_32f_atan_32f_a_avx(float* bVector, const float* aVector, unsigned int num_p ++ static inline void ++ volk_32f_atan_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int num_points) ++ { ++- float* bPtr = bVector; ++- const float* aPtr = aVector; ++- ++- unsigned int number = 0; ++- unsigned int quarterPoints = num_points / 4; ++- int i, j; ++- ++- __m128 aVal, pio2, x, y, z, arctangent; ++- __m128 fzeroes, fones, ftwos, ffours, condition; ++- ++- pio2 = _mm_set1_ps(3.14159265358979323846/2); ++- fzeroes = _mm_setzero_ps(); ++- fones = _mm_set1_ps(1.0); ++- ftwos = _mm_set1_ps(2.0); ++- ffours = _mm_set1_ps(4.0); ++- ++- for(;number < quarterPoints; number++){ ++- aVal = _mm_load_ps(aPtr); ++- z = aVal; ++- condition = _mm_cmplt_ps(z, fzeroes); ++- z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition)); ++- condition = _mm_cmplt_ps(z, fones); ++- x = _mm_add_ps(z, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z), condition)); ++- ++- for(i = 0; i < 2; i++){ ++- x = _mm_add_ps(x, _mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x)))); ++- } ++- x = _mm_div_ps(fones, x); ++- y = fzeroes; ++- for(j = TERMS - 1; j >=0 ; j--){ ++- y = _mm_add_ps(_mm_mul_ps(y, _mm_mul_ps(x, x)), _mm_set1_ps(pow(-1,j)/(2*j+1))); +++ float* bPtr = bVector; +++ const float* aPtr = aVector; +++ +++ unsigned int number = 0; +++ unsigned int quarterPoints = num_points / 4; +++ int i, j; +++ +++ __m128 aVal, pio2, x, y, z, arctangent; +++ __m128 fzeroes, fones, ftwos, ffours, condition; +++ +++ pio2 = _mm_set1_ps(3.14159265358979323846 / 2); +++ fzeroes = 
_mm_setzero_ps(); +++ fones = _mm_set1_ps(1.0); +++ ftwos = _mm_set1_ps(2.0); +++ ffours = _mm_set1_ps(4.0); +++ +++ for (; number < quarterPoints; number++) { +++ aVal = _mm_load_ps(aPtr); +++ z = aVal; +++ condition = _mm_cmplt_ps(z, fzeroes); +++ z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition)); +++ condition = _mm_cmplt_ps(z, fones); +++ x = _mm_add_ps(z, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z), condition)); +++ +++ for (i = 0; i < 2; i++) { +++ x = _mm_add_ps(x, _mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x)))); +++ } +++ x = _mm_div_ps(fones, x); +++ y = fzeroes; +++ for (j = TERMS - 1; j >= 0; j--) { +++ y = _mm_add_ps(_mm_mul_ps(y, _mm_mul_ps(x, x)), +++ _mm_set1_ps(pow(-1, j) / (2 * j + 1))); +++ } +++ +++ y = _mm_mul_ps(y, _mm_mul_ps(x, ffours)); +++ condition = _mm_cmpgt_ps(z, fones); +++ +++ y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, ftwos)), condition)); +++ arctangent = y; +++ condition = _mm_cmplt_ps(aVal, fzeroes); +++ arctangent = +++ _mm_sub_ps(arctangent, _mm_and_ps(_mm_mul_ps(arctangent, ftwos), condition)); +++ +++ _mm_store_ps(bPtr, arctangent); +++ aPtr += 4; +++ bPtr += 4; ++ } ++ ++- y = _mm_mul_ps(y, _mm_mul_ps(x, ffours)); ++- condition = _mm_cmpgt_ps(z, fones); ++- ++- y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, ftwos)), condition)); ++- arctangent = y; ++- condition = _mm_cmplt_ps(aVal, fzeroes); ++- arctangent = _mm_sub_ps(arctangent, _mm_and_ps(_mm_mul_ps(arctangent, ftwos), condition)); ++- ++- _mm_store_ps(bPtr, arctangent); ++- aPtr += 4; ++- bPtr += 4; ++- } ++- ++- number = quarterPoints * 4; ++- for(;number < num_points; number++){ ++- *bPtr++ = atanf(*aPtr++); ++- } +++ number = quarterPoints * 4; +++ for (; number < num_points; number++) { +++ *bPtr++ = atanf(*aPtr++); +++ } ++ } ++ ++ #endif /* LV_HAVE_SSE4_1 for aligned */ ++@@ -266,59 +278,63 @@ volk_32f_atan_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int nu ++ #if LV_HAVE_AVX2 && LV_HAVE_FMA ++ #include ++ ++-static inline void ++-volk_32f_atan_32f_u_avx2_fma(float* bVector, const float* aVector, unsigned int num_points) +++static inline void volk_32f_atan_32f_u_avx2_fma(float* bVector, +++ const float* aVector, +++ unsigned int num_points) ++ { ++- float* bPtr = bVector; ++- const float* aPtr = aVector; ++- ++- unsigned int number = 0; ++- unsigned int eighthPoints = num_points / 8; ++- int i, j; ++- ++- __m256 aVal, pio2, x, y, z, arctangent; ++- __m256 fzeroes, fones, ftwos, ffours, condition; ++- ++- pio2 = _mm256_set1_ps(3.14159265358979323846/2); ++- fzeroes = _mm256_setzero_ps(); ++- fones = _mm256_set1_ps(1.0); ++- ftwos = _mm256_set1_ps(2.0); ++- ffours = _mm256_set1_ps(4.0); ++- ++- for(;number < eighthPoints; number++){ ++- aVal = _mm256_loadu_ps(aPtr); ++- z = aVal; ++- condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS); ++- z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition)); ++- condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS); ++- x = _mm256_add_ps(z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition)); ++- ++- for(i = 0; i < 2; i++){ ++- x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_fmadd_ps(x,x,fones))); +++ float* bPtr = bVector; +++ const float* aPtr = aVector; +++ +++ unsigned int number = 0; +++ unsigned int eighthPoints = num_points / 8; +++ int i, j; +++ +++ __m256 aVal, pio2, x, y, z, arctangent; +++ __m256 fzeroes, fones, ftwos, ffours, condition; +++ +++ pio2 = _mm256_set1_ps(3.14159265358979323846 / 2); +++ fzeroes = _mm256_setzero_ps(); +++ fones = _mm256_set1_ps(1.0); 
+++ ftwos = _mm256_set1_ps(2.0); +++ ffours = _mm256_set1_ps(4.0); +++ +++ for (; number < eighthPoints; number++) { +++ aVal = _mm256_loadu_ps(aPtr); +++ z = aVal; +++ condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS); +++ z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition)); +++ condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS); +++ x = _mm256_add_ps( +++ z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition)); +++ +++ for (i = 0; i < 2; i++) { +++ x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_fmadd_ps(x, x, fones))); +++ } +++ x = _mm256_div_ps(fones, x); +++ y = fzeroes; +++ for (j = TERMS - 1; j >= 0; j--) { +++ y = _mm256_fmadd_ps( +++ y, _mm256_mul_ps(x, x), _mm256_set1_ps(pow(-1, j) / (2 * j + 1))); +++ } +++ +++ y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours)); +++ condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS); +++ +++ y = _mm256_add_ps(y, _mm256_and_ps(_mm256_fnmadd_ps(y, ftwos, pio2), condition)); +++ arctangent = y; +++ condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS); +++ arctangent = _mm256_sub_ps( +++ arctangent, _mm256_and_ps(_mm256_mul_ps(arctangent, ftwos), condition)); +++ +++ _mm256_storeu_ps(bPtr, arctangent); +++ aPtr += 8; +++ bPtr += 8; ++ } ++- x = _mm256_div_ps(fones, x); ++- y = fzeroes; ++- for(j = TERMS - 1; j >=0 ; j--){ ++- y = _mm256_fmadd_ps(y, _mm256_mul_ps(x, x), _mm256_set1_ps(pow(-1,j)/(2*j+1))); ++- } ++- ++- y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours)); ++- condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS); ++ ++- y = _mm256_add_ps(y, _mm256_and_ps(_mm256_fnmadd_ps(y,ftwos,pio2), condition)); ++- arctangent = y; ++- condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS); ++- arctangent = _mm256_sub_ps(arctangent, _mm256_and_ps(_mm256_mul_ps(arctangent, ftwos), condition)); ++- ++- _mm256_storeu_ps(bPtr, arctangent); ++- aPtr += 8; ++- bPtr += 8; ++- } ++- ++- number = eighthPoints * 8; ++- for(;number < num_points; number++){ ++- *bPtr++ = atan(*aPtr++); ++- } +++ number = eighthPoints * 8; +++ for (; number < num_points; number++) { +++ *bPtr++ = atan(*aPtr++); +++ } ++ } ++ ++ #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for unaligned */ ++@@ -330,56 +346,61 @@ volk_32f_atan_32f_u_avx2_fma(float* bVector, const float* aVector, unsigned int ++ static inline void ++ volk_32f_atan_32f_u_avx(float* bVector, const float* aVector, unsigned int num_points) ++ { ++- float* bPtr = bVector; ++- const float* aPtr = aVector; ++- ++- unsigned int number = 0; ++- unsigned int eighthPoints = num_points / 8; ++- int i, j; ++- ++- __m256 aVal, pio2, x, y, z, arctangent; ++- __m256 fzeroes, fones, ftwos, ffours, condition; ++- ++- pio2 = _mm256_set1_ps(3.14159265358979323846/2); ++- fzeroes = _mm256_setzero_ps(); ++- fones = _mm256_set1_ps(1.0); ++- ftwos = _mm256_set1_ps(2.0); ++- ffours = _mm256_set1_ps(4.0); ++- ++- for(;number < eighthPoints; number++){ ++- aVal = _mm256_loadu_ps(aPtr); ++- z = aVal; ++- condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS); ++- z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition)); ++- condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS); ++- x = _mm256_add_ps(z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition)); ++- ++- for(i = 0; i < 2; i++){ ++- x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_add_ps(fones, _mm256_mul_ps(x, x)))); ++- } ++- x = _mm256_div_ps(fones, x); ++- y = fzeroes; ++- for(j = TERMS - 1; j >=0 ; j--){ ++- y = _mm256_add_ps(_mm256_mul_ps(y, _mm256_mul_ps(x, x)), _mm256_set1_ps(pow(-1,j)/(2*j+1))); +++ float* bPtr = bVector; +++ const float* aPtr = aVector; +++ 
+++ unsigned int number = 0; +++ unsigned int eighthPoints = num_points / 8; +++ int i, j; +++ +++ __m256 aVal, pio2, x, y, z, arctangent; +++ __m256 fzeroes, fones, ftwos, ffours, condition; +++ +++ pio2 = _mm256_set1_ps(3.14159265358979323846 / 2); +++ fzeroes = _mm256_setzero_ps(); +++ fones = _mm256_set1_ps(1.0); +++ ftwos = _mm256_set1_ps(2.0); +++ ffours = _mm256_set1_ps(4.0); +++ +++ for (; number < eighthPoints; number++) { +++ aVal = _mm256_loadu_ps(aPtr); +++ z = aVal; +++ condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS); +++ z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition)); +++ condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS); +++ x = _mm256_add_ps( +++ z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition)); +++ +++ for (i = 0; i < 2; i++) { +++ x = _mm256_add_ps(x, +++ _mm256_sqrt_ps(_mm256_add_ps(fones, _mm256_mul_ps(x, x)))); +++ } +++ x = _mm256_div_ps(fones, x); +++ y = fzeroes; +++ for (j = TERMS - 1; j >= 0; j--) { +++ y = _mm256_add_ps(_mm256_mul_ps(y, _mm256_mul_ps(x, x)), +++ _mm256_set1_ps(pow(-1, j) / (2 * j + 1))); +++ } +++ +++ y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours)); +++ condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS); +++ +++ y = _mm256_add_ps( +++ y, _mm256_and_ps(_mm256_sub_ps(pio2, _mm256_mul_ps(y, ftwos)), condition)); +++ arctangent = y; +++ condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS); +++ arctangent = _mm256_sub_ps( +++ arctangent, _mm256_and_ps(_mm256_mul_ps(arctangent, ftwos), condition)); +++ +++ _mm256_storeu_ps(bPtr, arctangent); +++ aPtr += 8; +++ bPtr += 8; ++ } ++ ++- y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours)); ++- condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS); ++- ++- y = _mm256_add_ps(y, _mm256_and_ps(_mm256_sub_ps(pio2, _mm256_mul_ps(y, ftwos)), condition)); ++- arctangent = y; ++- condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS); ++- arctangent = _mm256_sub_ps(arctangent, _mm256_and_ps(_mm256_mul_ps(arctangent, ftwos), condition)); ++- ++- _mm256_storeu_ps(bPtr, arctangent); ++- aPtr += 8; ++- bPtr += 8; ++- } ++- ++- number = eighthPoints * 8; ++- for(;number < num_points; number++){ ++- *bPtr++ = atan(*aPtr++); ++- } +++ number = eighthPoints * 8; +++ for (; number < num_points; number++) { +++ *bPtr++ = atan(*aPtr++); +++ } ++ } ++ ++ #endif /* LV_HAVE_AVX for unaligned */ ++@@ -390,54 +411,56 @@ volk_32f_atan_32f_u_avx(float* bVector, const float* aVector, unsigned int num_p ++ static inline void ++ volk_32f_atan_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int num_points) ++ { ++- float* bPtr = bVector; ++- const float* aPtr = aVector; ++- ++- unsigned int number = 0; ++- unsigned int quarterPoints = num_points / 4; ++- int i, j; ++- ++- __m128 aVal, pio2, x, y, z, arctangent; ++- __m128 fzeroes, fones, ftwos, ffours, condition; ++- ++- pio2 = _mm_set1_ps(3.14159265358979323846/2); ++- fzeroes = _mm_setzero_ps(); ++- fones = _mm_set1_ps(1.0); ++- ftwos = _mm_set1_ps(2.0); ++- ffours = _mm_set1_ps(4.0); ++- ++- for(;number < quarterPoints; number++){ ++- aVal = _mm_loadu_ps(aPtr); ++- z = aVal; ++- condition = _mm_cmplt_ps(z, fzeroes); ++- z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition)); ++- condition = _mm_cmplt_ps(z, fones); ++- x = _mm_add_ps(z, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z), condition)); ++- ++- for(i = 0; i < 2; i++) ++- x = _mm_add_ps(x, _mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x)))); ++- x = _mm_div_ps(fones, x); ++- y = fzeroes; ++- for(j = TERMS - 1; j >= 0; j--) ++- y = _mm_add_ps(_mm_mul_ps(y, _mm_mul_ps(x, x)), 
_mm_set1_ps(pow(-1,j)/(2*j+1))); ++- ++- y = _mm_mul_ps(y, _mm_mul_ps(x, ffours)); ++- condition = _mm_cmpgt_ps(z, fones); ++- ++- y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, ftwos)), condition)); ++- arctangent = y; ++- condition = _mm_cmplt_ps(aVal, fzeroes); ++- arctangent = _mm_sub_ps(arctangent, _mm_and_ps(_mm_mul_ps(arctangent, ftwos), condition)); ++- ++- _mm_storeu_ps(bPtr, arctangent); ++- aPtr += 4; ++- bPtr += 4; ++- } ++- ++- number = quarterPoints * 4; ++- for(;number < num_points; number++){ ++- *bPtr++ = atanf(*aPtr++); ++- } +++ float* bPtr = bVector; +++ const float* aPtr = aVector; +++ +++ unsigned int number = 0; +++ unsigned int quarterPoints = num_points / 4; +++ int i, j; +++ +++ __m128 aVal, pio2, x, y, z, arctangent; +++ __m128 fzeroes, fones, ftwos, ffours, condition; +++ +++ pio2 = _mm_set1_ps(3.14159265358979323846 / 2); +++ fzeroes = _mm_setzero_ps(); +++ fones = _mm_set1_ps(1.0); +++ ftwos = _mm_set1_ps(2.0); +++ ffours = _mm_set1_ps(4.0); +++ +++ for (; number < quarterPoints; number++) { +++ aVal = _mm_loadu_ps(aPtr); +++ z = aVal; +++ condition = _mm_cmplt_ps(z, fzeroes); +++ z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition)); +++ condition = _mm_cmplt_ps(z, fones); +++ x = _mm_add_ps(z, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z), condition)); +++ +++ for (i = 0; i < 2; i++) +++ x = _mm_add_ps(x, _mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x)))); +++ x = _mm_div_ps(fones, x); +++ y = fzeroes; +++ for (j = TERMS - 1; j >= 0; j--) +++ y = _mm_add_ps(_mm_mul_ps(y, _mm_mul_ps(x, x)), +++ _mm_set1_ps(pow(-1, j) / (2 * j + 1))); +++ +++ y = _mm_mul_ps(y, _mm_mul_ps(x, ffours)); +++ condition = _mm_cmpgt_ps(z, fones); +++ +++ y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, ftwos)), condition)); +++ arctangent = y; +++ condition = _mm_cmplt_ps(aVal, fzeroes); +++ arctangent = +++ _mm_sub_ps(arctangent, _mm_and_ps(_mm_mul_ps(arctangent, ftwos), condition)); +++ +++ _mm_storeu_ps(bPtr, arctangent); +++ aPtr += 4; +++ bPtr += 4; +++ } +++ +++ number = quarterPoints * 4; +++ for (; number < num_points; number++) { +++ *bPtr++ = atanf(*aPtr++); +++ } ++ } ++ ++ #endif /* LV_HAVE_SSE4_1 for unaligned */ ++@@ -447,13 +470,13 @@ volk_32f_atan_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int nu ++ static inline void ++ volk_32f_atan_32f_generic(float* bVector, const float* aVector, unsigned int num_points) ++ { ++- float* bPtr = bVector; ++- const float* aPtr = aVector; ++- unsigned int number = 0; +++ float* bPtr = bVector; +++ const float* aPtr = aVector; +++ unsigned int number = 0; ++ ++- for(number = 0; number < num_points; number++){ ++- *bPtr++ = atanf(*aPtr++); ++- } +++ for (number = 0; number < num_points; number++) { +++ *bPtr++ = atanf(*aPtr++); +++ } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++diff --git a/kernels/volk/volk_32f_binary_slicer_32i.h b/kernels/volk/volk_32f_binary_slicer_32i.h ++index c56ff8f..635d0c3 100644 ++--- a/kernels/volk/volk_32f_binary_slicer_32i.h +++++ b/kernels/volk/volk_32f_binary_slicer_32i.h ++@@ -30,8 +30,8 @@ ++ * ++ * Dispatcher Prototype ++ * \code ++- * void volk_32f_binary_slicer_32i(int* cVector, const float* aVector, unsigned int num_points) ++- * \endcode +++ * void volk_32f_binary_slicer_32i(int* cVector, const float* aVector, unsigned int +++ * num_points) \endcode ++ * ++ * \b Inputs ++ * \li aVector: The input vector of floats. 
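
Illustrative sketch (not part of the patch): the hunks in volk_32f_binary_slicer_32i.h that follow only re-wrap the code, so the dispatcher prototype quoted in the Doxygen block keeps its behaviour. Assuming the usual VOLK allocation helpers from volk/volk.h (volk_get_alignment, volk_malloc, volk_free), a caller would exercise the kernel roughly like this:

    #include <volk/volk.h>
    #include <stdio.h>

    int main(void)
    {
        unsigned int num_points = 8;
        size_t alignment = volk_get_alignment();
        float* in = (float*)volk_malloc(sizeof(float) * num_points, alignment);
        int* out = (int*)volk_malloc(sizeof(int) * num_points, alignment);

        for (unsigned int i = 0; i < num_points; i++)
            in[i] = (float)i - 3.5f; /* mix of negative and non-negative inputs */

        /* out[i] becomes 1 where in[i] >= 0, else 0 (see the generic kernel below). */
        volk_32f_binary_slicer_32i(out, in, num_points);

        for (unsigned int i = 0; i < num_points; i++)
            printf("%d ", out[i]);
        printf("\n");

        volk_free(in);
        volk_free(out);
        return 0;
    }
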
++@@ -73,37 +73,38 @@ ++ ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void ++-volk_32f_binary_slicer_32i_generic(int* cVector, const float* aVector, unsigned int num_points) +++static inline void volk_32f_binary_slicer_32i_generic(int* cVector, +++ const float* aVector, +++ unsigned int num_points) ++ { ++- int* cPtr = cVector; ++- const float* aPtr = aVector; ++- unsigned int number = 0; ++- ++- for(number = 0; number < num_points; number++){ ++- if( *aPtr++ >= 0) { ++- *cPtr++ = 1; ++- } ++- else { ++- *cPtr++ = 0; +++ int* cPtr = cVector; +++ const float* aPtr = aVector; +++ unsigned int number = 0; +++ +++ for (number = 0; number < num_points; number++) { +++ if (*aPtr++ >= 0) { +++ *cPtr++ = 1; +++ } else { +++ *cPtr++ = 0; +++ } ++ } ++- } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++ ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void ++-volk_32f_binary_slicer_32i_generic_branchless(int* cVector, const float* aVector, unsigned int num_points) +++static inline void volk_32f_binary_slicer_32i_generic_branchless(int* cVector, +++ const float* aVector, +++ unsigned int num_points) ++ { ++- int* cPtr = cVector; ++- const float* aPtr = aVector; ++- unsigned int number = 0; +++ int* cPtr = cVector; +++ const float* aPtr = aVector; +++ unsigned int number = 0; ++ ++- for(number = 0; number < num_points; number++){ ++- *cPtr++ = (*aPtr++ >= 0); ++- } +++ for (number = 0; number < num_points; number++) { +++ *cPtr++ = (*aPtr++ >= 0); +++ } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++@@ -111,40 +112,40 @@ volk_32f_binary_slicer_32i_generic_branchless(int* cVector, const float* aVector ++ #ifdef LV_HAVE_SSE2 ++ #include ++ ++-static inline void ++-volk_32f_binary_slicer_32i_a_sse2(int* cVector, const float* aVector, unsigned int num_points) +++static inline void volk_32f_binary_slicer_32i_a_sse2(int* cVector, +++ const float* aVector, +++ unsigned int num_points) ++ { ++- int* cPtr = cVector; ++- const float* aPtr = aVector; ++- unsigned int number = 0; ++- ++- unsigned int quarter_points = num_points / 4; ++- __m128 a_val, res_f; ++- __m128i res_i, binary_i; ++- __m128 zero_val; ++- zero_val = _mm_set1_ps (0.0f); +++ int* cPtr = cVector; +++ const float* aPtr = aVector; +++ unsigned int number = 0; ++ ++- for(number = 0; number < quarter_points; number++){ ++- a_val = _mm_load_ps(aPtr); +++ unsigned int quarter_points = num_points / 4; +++ __m128 a_val, res_f; +++ __m128i res_i, binary_i; +++ __m128 zero_val; +++ zero_val = _mm_set1_ps(0.0f); ++ ++- res_f = _mm_cmpge_ps (a_val, zero_val); ++- res_i = _mm_cvtps_epi32 (res_f); ++- binary_i = _mm_srli_epi32 (res_i, 31); +++ for (number = 0; number < quarter_points; number++) { +++ a_val = _mm_load_ps(aPtr); ++ ++- _mm_store_si128((__m128i*)cPtr, binary_i); +++ res_f = _mm_cmpge_ps(a_val, zero_val); +++ res_i = _mm_cvtps_epi32(res_f); +++ binary_i = _mm_srli_epi32(res_i, 31); ++ ++- cPtr += 4; ++- aPtr += 4; ++- } +++ _mm_store_si128((__m128i*)cPtr, binary_i); ++ ++- for(number = quarter_points * 4; number < num_points; number++){ ++- if( *aPtr++ >= 0) { ++- *cPtr++ = 1; +++ cPtr += 4; +++ aPtr += 4; ++ } ++- else { ++- *cPtr++ = 0; +++ +++ for (number = quarter_points * 4; number < num_points; number++) { +++ if (*aPtr++ >= 0) { +++ *cPtr++ = 1; +++ } else { +++ *cPtr++ = 0; +++ } ++ } ++- } ++ } ++ #endif /* LV_HAVE_SSE2 */ ++ ++@@ -152,41 +153,41 @@ volk_32f_binary_slicer_32i_a_sse2(int* cVector, const float* aVector, unsigned i ++ #ifdef LV_HAVE_AVX ++ #include ++ ++-static inline void ++-volk_32f_binary_slicer_32i_a_avx(int* cVector, const float* 
aVector, unsigned int num_points) +++static inline void volk_32f_binary_slicer_32i_a_avx(int* cVector, +++ const float* aVector, +++ unsigned int num_points) ++ { ++- int* cPtr = cVector; ++- const float* aPtr = aVector; ++- unsigned int number = 0; +++ int* cPtr = cVector; +++ const float* aPtr = aVector; +++ unsigned int number = 0; ++ ++- unsigned int quarter_points = num_points / 8; ++- __m256 a_val, res_f, binary_f; ++- __m256i binary_i; ++- __m256 zero_val, one_val; ++- zero_val = _mm256_set1_ps (0.0f); ++- one_val = _mm256_set1_ps (1.0f); +++ unsigned int quarter_points = num_points / 8; +++ __m256 a_val, res_f, binary_f; +++ __m256i binary_i; +++ __m256 zero_val, one_val; +++ zero_val = _mm256_set1_ps(0.0f); +++ one_val = _mm256_set1_ps(1.0f); ++ ++- for(number = 0; number < quarter_points; number++){ ++- a_val = _mm256_load_ps(aPtr); +++ for (number = 0; number < quarter_points; number++) { +++ a_val = _mm256_load_ps(aPtr); ++ ++- res_f = _mm256_cmp_ps (a_val, zero_val, _CMP_GE_OS); ++- binary_f = _mm256_and_ps (res_f, one_val); ++- binary_i = _mm256_cvtps_epi32(binary_f); +++ res_f = _mm256_cmp_ps(a_val, zero_val, _CMP_GE_OS); +++ binary_f = _mm256_and_ps(res_f, one_val); +++ binary_i = _mm256_cvtps_epi32(binary_f); ++ ++- _mm256_store_si256((__m256i *)cPtr, binary_i); +++ _mm256_store_si256((__m256i*)cPtr, binary_i); ++ ++- cPtr += 8; ++- aPtr += 8; ++- } ++- ++- for(number = quarter_points * 8; number < num_points; number++){ ++- if( *aPtr++ >= 0) { ++- *cPtr++ = 1; +++ cPtr += 8; +++ aPtr += 8; ++ } ++- else { ++- *cPtr++ = 0; +++ +++ for (number = quarter_points * 8; number < num_points; number++) { +++ if (*aPtr++ >= 0) { +++ *cPtr++ = 1; +++ } else { +++ *cPtr++ = 0; +++ } ++ } ++- } ++ } ++ #endif /* LV_HAVE_AVX */ ++ ++@@ -194,40 +195,40 @@ volk_32f_binary_slicer_32i_a_avx(int* cVector, const float* aVector, unsigned in ++ #ifdef LV_HAVE_SSE2 ++ #include ++ ++-static inline void ++-volk_32f_binary_slicer_32i_u_sse2(int* cVector, const float* aVector, unsigned int num_points) +++static inline void volk_32f_binary_slicer_32i_u_sse2(int* cVector, +++ const float* aVector, +++ unsigned int num_points) ++ { ++- int* cPtr = cVector; ++- const float* aPtr = aVector; ++- unsigned int number = 0; ++- ++- unsigned int quarter_points = num_points / 4; ++- __m128 a_val, res_f; ++- __m128i res_i, binary_i; ++- __m128 zero_val; ++- zero_val = _mm_set1_ps (0.0f); +++ int* cPtr = cVector; +++ const float* aPtr = aVector; +++ unsigned int number = 0; ++ ++- for(number = 0; number < quarter_points; number++){ ++- a_val = _mm_loadu_ps(aPtr); +++ unsigned int quarter_points = num_points / 4; +++ __m128 a_val, res_f; +++ __m128i res_i, binary_i; +++ __m128 zero_val; +++ zero_val = _mm_set1_ps(0.0f); ++ ++- res_f = _mm_cmpge_ps (a_val, zero_val); ++- res_i = _mm_cvtps_epi32 (res_f); ++- binary_i = _mm_srli_epi32 (res_i, 31); +++ for (number = 0; number < quarter_points; number++) { +++ a_val = _mm_loadu_ps(aPtr); ++ ++- _mm_storeu_si128((__m128i*)cPtr, binary_i); +++ res_f = _mm_cmpge_ps(a_val, zero_val); +++ res_i = _mm_cvtps_epi32(res_f); +++ binary_i = _mm_srli_epi32(res_i, 31); ++ ++- cPtr += 4; ++- aPtr += 4; ++- } +++ _mm_storeu_si128((__m128i*)cPtr, binary_i); ++ ++- for(number = quarter_points * 4; number < num_points; number++){ ++- if( *aPtr++ >= 0) { ++- *cPtr++ = 1; +++ cPtr += 4; +++ aPtr += 4; ++ } ++- else { ++- *cPtr++ = 0; +++ +++ for (number = quarter_points * 4; number < num_points; number++) { +++ if (*aPtr++ >= 0) { +++ *cPtr++ = 1; +++ } else { +++ *cPtr++ = 0; +++ } ++ } 
++- } ++ } ++ #endif /* LV_HAVE_SSE2 */ ++ ++@@ -235,41 +236,41 @@ volk_32f_binary_slicer_32i_u_sse2(int* cVector, const float* aVector, unsigned i ++ #ifdef LV_HAVE_AVX ++ #include ++ ++-static inline void ++-volk_32f_binary_slicer_32i_u_avx(int* cVector, const float* aVector, unsigned int num_points) +++static inline void volk_32f_binary_slicer_32i_u_avx(int* cVector, +++ const float* aVector, +++ unsigned int num_points) ++ { ++- int* cPtr = cVector; ++- const float* aPtr = aVector; ++- unsigned int number = 0; ++- ++- unsigned int quarter_points = num_points / 8; ++- __m256 a_val, res_f, binary_f; ++- __m256i binary_i; ++- __m256 zero_val, one_val; ++- zero_val = _mm256_set1_ps (0.0f); ++- one_val = _mm256_set1_ps (1.0f); +++ int* cPtr = cVector; +++ const float* aPtr = aVector; +++ unsigned int number = 0; ++ ++- for(number = 0; number < quarter_points; number++){ ++- a_val = _mm256_loadu_ps(aPtr); +++ unsigned int quarter_points = num_points / 8; +++ __m256 a_val, res_f, binary_f; +++ __m256i binary_i; +++ __m256 zero_val, one_val; +++ zero_val = _mm256_set1_ps(0.0f); +++ one_val = _mm256_set1_ps(1.0f); ++ ++- res_f = _mm256_cmp_ps (a_val, zero_val, _CMP_GE_OS); ++- binary_f = _mm256_and_ps (res_f, one_val); ++- binary_i = _mm256_cvtps_epi32(binary_f); +++ for (number = 0; number < quarter_points; number++) { +++ a_val = _mm256_loadu_ps(aPtr); ++ ++- _mm256_storeu_si256((__m256i*)cPtr, binary_i); +++ res_f = _mm256_cmp_ps(a_val, zero_val, _CMP_GE_OS); +++ binary_f = _mm256_and_ps(res_f, one_val); +++ binary_i = _mm256_cvtps_epi32(binary_f); ++ ++- cPtr += 8; ++- aPtr += 8; ++- } +++ _mm256_storeu_si256((__m256i*)cPtr, binary_i); ++ ++- for(number = quarter_points * 8; number < num_points; number++){ ++- if( *aPtr++ >= 0) { ++- *cPtr++ = 1; +++ cPtr += 8; +++ aPtr += 8; ++ } ++- else { ++- *cPtr++ = 0; +++ +++ for (number = quarter_points * 8; number < num_points; number++) { +++ if (*aPtr++ >= 0) { +++ *cPtr++ = 1; +++ } else { +++ *cPtr++ = 0; +++ } ++ } ++- } ++ } ++ #endif /* LV_HAVE_AVX */ ++ ++diff --git a/kernels/volk/volk_32f_binary_slicer_8i.h b/kernels/volk/volk_32f_binary_slicer_8i.h ++index 5920621..3eddb5c 100644 ++--- a/kernels/volk/volk_32f_binary_slicer_8i.h +++++ b/kernels/volk/volk_32f_binary_slicer_8i.h ++@@ -30,7 +30,8 @@ ++ * ++ * Dispatcher Prototype ++ * \code ++- * void volk_32f_binary_slicer_8i(int8_t* cVector, const float* aVector, unsigned int num_points) +++ * void volk_32f_binary_slicer_8i(int8_t* cVector, const float* aVector, unsigned int +++ num_points) ++ * \endcode ++ * ++ * \b Inputs ++@@ -74,39 +75,38 @@ ++ ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void ++-volk_32f_binary_slicer_8i_generic(int8_t* cVector, const float* aVector, ++- unsigned int num_points) +++static inline void volk_32f_binary_slicer_8i_generic(int8_t* cVector, +++ const float* aVector, +++ unsigned int num_points) ++ { ++- int8_t* cPtr = cVector; ++- const float* aPtr = aVector; ++- unsigned int number = 0; ++- ++- for(number = 0; number < num_points; number++) { ++- if(*aPtr++ >= 0) { ++- *cPtr++ = 1; ++- } ++- else { ++- *cPtr++ = 0; +++ int8_t* cPtr = cVector; +++ const float* aPtr = aVector; +++ unsigned int number = 0; +++ +++ for (number = 0; number < num_points; number++) { +++ if (*aPtr++ >= 0) { +++ *cPtr++ = 1; +++ } else { +++ *cPtr++ = 0; +++ } ++ } ++- } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++ ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void ++-volk_32f_binary_slicer_8i_generic_branchless(int8_t* cVector, const float* aVector, ++- unsigned int num_points) 
+++static inline void volk_32f_binary_slicer_8i_generic_branchless(int8_t* cVector, +++ const float* aVector, +++ unsigned int num_points) ++ { ++- int8_t* cPtr = cVector; ++- const float* aPtr = aVector; ++- unsigned int number = 0; +++ int8_t* cPtr = cVector; +++ const float* aPtr = aVector; +++ unsigned int number = 0; ++ ++- for(number = 0; number < num_points; number++){ ++- *cPtr++ = (*aPtr++ >= 0); ++- } +++ for (number = 0; number < num_points; number++) { +++ *cPtr++ = (*aPtr++ >= 0); +++ } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++@@ -114,279 +114,329 @@ volk_32f_binary_slicer_8i_generic_branchless(int8_t* cVector, const float* aVect ++ #ifdef LV_HAVE_AVX2 ++ #include ++ ++-static inline void ++-volk_32f_binary_slicer_8i_a_avx2(int8_t* cVector, const float* aVector, ++- unsigned int num_points) +++static inline void volk_32f_binary_slicer_8i_a_avx2(int8_t* cVector, +++ const float* aVector, +++ unsigned int num_points) ++ { ++- int8_t* cPtr = cVector; ++- const float* aPtr = aVector; ++- unsigned int number = 0; ++- unsigned int n32points = num_points / 32; ++- ++- const __m256 zero_val = _mm256_set1_ps(0.0f); ++- __m256 a0_val, a1_val, a2_val, a3_val; ++- __m256 res0_f, res1_f, res2_f, res3_f; ++- __m256i res0_i, res1_i, res2_i, res3_i; ++- __m256i byte_shuffle = _mm256_set_epi8( 15, 14, 13, 12, 7, 6, 5, 4, ++- 11, 10, 9, 8, 3, 2, 1, 0, ++- 15, 14, 13, 12, 7, 6, 5, 4, ++- 11, 10, 9, 8, 3, 2, 1, 0); ++- ++- for(number = 0; number < n32points; number++) { ++- a0_val = _mm256_load_ps(aPtr); ++- a1_val = _mm256_load_ps(aPtr+8); ++- a2_val = _mm256_load_ps(aPtr+16); ++- a3_val = _mm256_load_ps(aPtr+24); ++- ++- // compare >= 0; return float ++- res0_f = _mm256_cmp_ps(a0_val, zero_val, _CMP_GE_OS); ++- res1_f = _mm256_cmp_ps(a1_val, zero_val, _CMP_GE_OS); ++- res2_f = _mm256_cmp_ps(a2_val, zero_val, _CMP_GE_OS); ++- res3_f = _mm256_cmp_ps(a3_val, zero_val, _CMP_GE_OS); ++- ++- // convert to 32i and >> 31 ++- res0_i = _mm256_srli_epi32(_mm256_cvtps_epi32(res0_f), 31); ++- res1_i = _mm256_srli_epi32(_mm256_cvtps_epi32(res1_f), 31); ++- res2_i = _mm256_srli_epi32(_mm256_cvtps_epi32(res2_f), 31); ++- res3_i = _mm256_srli_epi32(_mm256_cvtps_epi32(res3_f), 31); ++- ++- // pack in to 16-bit results ++- res0_i = _mm256_packs_epi32(res0_i, res1_i); ++- res2_i = _mm256_packs_epi32(res2_i, res3_i); ++- // pack in to 8-bit results ++- // res0: (after packs_epi32) ++- // a0, a1, a2, a3, b0, b1, b2, b3, a4, a5, a6, a7, b4, b5, b6, b7 ++- // res2: ++- // c0, c1, c2, c3, d0, d1, d2, d3, c4, c5, c6, c7, d4, d5, d6, d7 ++- res0_i = _mm256_packs_epi16(res0_i, res2_i); ++- // shuffle the lanes ++- // res0: (after packs_epi16) ++- // a0, a1, a2, a3, b0, b1, b2, b3, c0, c1, c2, c3, d0, d1, d2, d3 ++- // a4, a5, a6, a7, b4, b5, b6, b7, c4, c5, c6, c7, d4, d5, d6, d7 ++- // 0, 2, 1, 3 -> 11 01 10 00 (0xd8) ++- res0_i = _mm256_permute4x64_epi64(res0_i, 0xd8); ++- ++- // shuffle bytes within lanes ++- // res0: (after shuffle_epi8) ++- // a0, a1, a2, a3, b0, b1, b2, b3, a4, a5, a6, a7, b4, b5, b6, b7 ++- // c0, c1, c2, c3, d0, d1, d2, d3, c4, c5, c6, c7, d4, d5, d6, d7 ++- res0_i = _mm256_shuffle_epi8(res0_i, byte_shuffle); ++- ++- _mm256_store_si256((__m256i*)cPtr, res0_i); ++- aPtr += 32; ++- cPtr += 32; ++- } ++- ++- for(number = n32points * 32; number < num_points; number++) { ++- if( *aPtr++ >= 0) { ++- *cPtr++ = 1; +++ int8_t* cPtr = cVector; +++ const float* aPtr = aVector; +++ unsigned int number = 0; +++ unsigned int n32points = num_points / 32; +++ +++ const __m256 zero_val = _mm256_set1_ps(0.0f); +++ 
__m256 a0_val, a1_val, a2_val, a3_val; +++ __m256 res0_f, res1_f, res2_f, res3_f; +++ __m256i res0_i, res1_i, res2_i, res3_i; +++ __m256i byte_shuffle = _mm256_set_epi8(15, +++ 14, +++ 13, +++ 12, +++ 7, +++ 6, +++ 5, +++ 4, +++ 11, +++ 10, +++ 9, +++ 8, +++ 3, +++ 2, +++ 1, +++ 0, +++ 15, +++ 14, +++ 13, +++ 12, +++ 7, +++ 6, +++ 5, +++ 4, +++ 11, +++ 10, +++ 9, +++ 8, +++ 3, +++ 2, +++ 1, +++ 0); +++ +++ for (number = 0; number < n32points; number++) { +++ a0_val = _mm256_load_ps(aPtr); +++ a1_val = _mm256_load_ps(aPtr + 8); +++ a2_val = _mm256_load_ps(aPtr + 16); +++ a3_val = _mm256_load_ps(aPtr + 24); +++ +++ // compare >= 0; return float +++ res0_f = _mm256_cmp_ps(a0_val, zero_val, _CMP_GE_OS); +++ res1_f = _mm256_cmp_ps(a1_val, zero_val, _CMP_GE_OS); +++ res2_f = _mm256_cmp_ps(a2_val, zero_val, _CMP_GE_OS); +++ res3_f = _mm256_cmp_ps(a3_val, zero_val, _CMP_GE_OS); +++ +++ // convert to 32i and >> 31 +++ res0_i = _mm256_srli_epi32(_mm256_cvtps_epi32(res0_f), 31); +++ res1_i = _mm256_srli_epi32(_mm256_cvtps_epi32(res1_f), 31); +++ res2_i = _mm256_srli_epi32(_mm256_cvtps_epi32(res2_f), 31); +++ res3_i = _mm256_srli_epi32(_mm256_cvtps_epi32(res3_f), 31); +++ +++ // pack in to 16-bit results +++ res0_i = _mm256_packs_epi32(res0_i, res1_i); +++ res2_i = _mm256_packs_epi32(res2_i, res3_i); +++ // pack in to 8-bit results +++ // res0: (after packs_epi32) +++ // a0, a1, a2, a3, b0, b1, b2, b3, a4, a5, a6, a7, b4, b5, b6, b7 +++ // res2: +++ // c0, c1, c2, c3, d0, d1, d2, d3, c4, c5, c6, c7, d4, d5, d6, d7 +++ res0_i = _mm256_packs_epi16(res0_i, res2_i); +++ // shuffle the lanes +++ // res0: (after packs_epi16) +++ // a0, a1, a2, a3, b0, b1, b2, b3, c0, c1, c2, c3, d0, d1, d2, d3 +++ // a4, a5, a6, a7, b4, b5, b6, b7, c4, c5, c6, c7, d4, d5, d6, d7 +++ // 0, 2, 1, 3 -> 11 01 10 00 (0xd8) +++ res0_i = _mm256_permute4x64_epi64(res0_i, 0xd8); +++ +++ // shuffle bytes within lanes +++ // res0: (after shuffle_epi8) +++ // a0, a1, a2, a3, b0, b1, b2, b3, a4, a5, a6, a7, b4, b5, b6, b7 +++ // c0, c1, c2, c3, d0, d1, d2, d3, c4, c5, c6, c7, d4, d5, d6, d7 +++ res0_i = _mm256_shuffle_epi8(res0_i, byte_shuffle); +++ +++ _mm256_store_si256((__m256i*)cPtr, res0_i); +++ aPtr += 32; +++ cPtr += 32; ++ } ++- else { ++- *cPtr++ = 0; +++ +++ for (number = n32points * 32; number < num_points; number++) { +++ if (*aPtr++ >= 0) { +++ *cPtr++ = 1; +++ } else { +++ *cPtr++ = 0; +++ } ++ } ++- } ++ } ++ #endif ++ ++ #ifdef LV_HAVE_AVX2 ++ #include ++ ++-static inline void ++-volk_32f_binary_slicer_8i_u_avx2(int8_t* cVector, const float* aVector, ++- unsigned int num_points) +++static inline void volk_32f_binary_slicer_8i_u_avx2(int8_t* cVector, +++ const float* aVector, +++ unsigned int num_points) ++ { ++- int8_t* cPtr = cVector; ++- const float* aPtr = aVector; ++- unsigned int number = 0; ++- unsigned int n32points = num_points / 32; ++- ++- const __m256 zero_val = _mm256_set1_ps(0.0f); ++- __m256 a0_val, a1_val, a2_val, a3_val; ++- __m256 res0_f, res1_f, res2_f, res3_f; ++- __m256i res0_i, res1_i, res2_i, res3_i; ++- __m256i byte_shuffle = _mm256_set_epi8( 15, 14, 13, 12, 7, 6, 5, 4, ++- 11, 10, 9, 8, 3, 2, 1, 0, ++- 15, 14, 13, 12, 7, 6, 5, 4, ++- 11, 10, 9, 8, 3, 2, 1, 0); ++- ++- for(number = 0; number < n32points; number++) { ++- a0_val = _mm256_loadu_ps(aPtr); ++- a1_val = _mm256_loadu_ps(aPtr+8); ++- a2_val = _mm256_loadu_ps(aPtr+16); ++- a3_val = _mm256_loadu_ps(aPtr+24); ++- ++- // compare >= 0; return float ++- res0_f = _mm256_cmp_ps(a0_val, zero_val, _CMP_GE_OS); ++- res1_f = _mm256_cmp_ps(a1_val, 
zero_val, _CMP_GE_OS); ++- res2_f = _mm256_cmp_ps(a2_val, zero_val, _CMP_GE_OS); ++- res3_f = _mm256_cmp_ps(a3_val, zero_val, _CMP_GE_OS); ++- ++- // convert to 32i and >> 31 ++- res0_i = _mm256_srli_epi32(_mm256_cvtps_epi32(res0_f), 31); ++- res1_i = _mm256_srli_epi32(_mm256_cvtps_epi32(res1_f), 31); ++- res2_i = _mm256_srli_epi32(_mm256_cvtps_epi32(res2_f), 31); ++- res3_i = _mm256_srli_epi32(_mm256_cvtps_epi32(res3_f), 31); ++- ++- // pack in to 16-bit results ++- res0_i = _mm256_packs_epi32(res0_i, res1_i); ++- res2_i = _mm256_packs_epi32(res2_i, res3_i); ++- // pack in to 8-bit results ++- // res0: (after packs_epi32) ++- // a0, a1, a2, a3, b0, b1, b2, b3, a4, a5, a6, a7, b4, b5, b6, b7 ++- // res2: ++- // c0, c1, c2, c3, d0, d1, d2, d3, c4, c5, c6, c7, d4, d5, d6, d7 ++- res0_i = _mm256_packs_epi16(res0_i, res2_i); ++- // shuffle the lanes ++- // res0: (after packs_epi16) ++- // a0, a1, a2, a3, b0, b1, b2, b3, c0, c1, c2, c3, d0, d1, d2, d3 ++- // a4, a5, a6, a7, b4, b5, b6, b7, c4, c5, c6, c7, d4, d5, d6, d7 ++- // 0, 2, 1, 3 -> 11 01 10 00 (0xd8) ++- res0_i = _mm256_permute4x64_epi64(res0_i, 0xd8); ++- ++- // shuffle bytes within lanes ++- // res0: (after shuffle_epi8) ++- // a0, a1, a2, a3, b0, b1, b2, b3, a4, a5, a6, a7, b4, b5, b6, b7 ++- // c0, c1, c2, c3, d0, d1, d2, d3, c4, c5, c6, c7, d4, d5, d6, d7 ++- res0_i = _mm256_shuffle_epi8(res0_i, byte_shuffle); ++- ++- _mm256_storeu_si256((__m256i*)cPtr, res0_i); ++- aPtr += 32; ++- cPtr += 32; ++- } ++- ++- for(number = n32points * 32; number < num_points; number++) { ++- if( *aPtr++ >= 0) { ++- *cPtr++ = 1; +++ int8_t* cPtr = cVector; +++ const float* aPtr = aVector; +++ unsigned int number = 0; +++ unsigned int n32points = num_points / 32; +++ +++ const __m256 zero_val = _mm256_set1_ps(0.0f); +++ __m256 a0_val, a1_val, a2_val, a3_val; +++ __m256 res0_f, res1_f, res2_f, res3_f; +++ __m256i res0_i, res1_i, res2_i, res3_i; +++ __m256i byte_shuffle = _mm256_set_epi8(15, +++ 14, +++ 13, +++ 12, +++ 7, +++ 6, +++ 5, +++ 4, +++ 11, +++ 10, +++ 9, +++ 8, +++ 3, +++ 2, +++ 1, +++ 0, +++ 15, +++ 14, +++ 13, +++ 12, +++ 7, +++ 6, +++ 5, +++ 4, +++ 11, +++ 10, +++ 9, +++ 8, +++ 3, +++ 2, +++ 1, +++ 0); +++ +++ for (number = 0; number < n32points; number++) { +++ a0_val = _mm256_loadu_ps(aPtr); +++ a1_val = _mm256_loadu_ps(aPtr + 8); +++ a2_val = _mm256_loadu_ps(aPtr + 16); +++ a3_val = _mm256_loadu_ps(aPtr + 24); +++ +++ // compare >= 0; return float +++ res0_f = _mm256_cmp_ps(a0_val, zero_val, _CMP_GE_OS); +++ res1_f = _mm256_cmp_ps(a1_val, zero_val, _CMP_GE_OS); +++ res2_f = _mm256_cmp_ps(a2_val, zero_val, _CMP_GE_OS); +++ res3_f = _mm256_cmp_ps(a3_val, zero_val, _CMP_GE_OS); +++ +++ // convert to 32i and >> 31 +++ res0_i = _mm256_srli_epi32(_mm256_cvtps_epi32(res0_f), 31); +++ res1_i = _mm256_srli_epi32(_mm256_cvtps_epi32(res1_f), 31); +++ res2_i = _mm256_srli_epi32(_mm256_cvtps_epi32(res2_f), 31); +++ res3_i = _mm256_srli_epi32(_mm256_cvtps_epi32(res3_f), 31); +++ +++ // pack in to 16-bit results +++ res0_i = _mm256_packs_epi32(res0_i, res1_i); +++ res2_i = _mm256_packs_epi32(res2_i, res3_i); +++ // pack in to 8-bit results +++ // res0: (after packs_epi32) +++ // a0, a1, a2, a3, b0, b1, b2, b3, a4, a5, a6, a7, b4, b5, b6, b7 +++ // res2: +++ // c0, c1, c2, c3, d0, d1, d2, d3, c4, c5, c6, c7, d4, d5, d6, d7 +++ res0_i = _mm256_packs_epi16(res0_i, res2_i); +++ // shuffle the lanes +++ // res0: (after packs_epi16) +++ // a0, a1, a2, a3, b0, b1, b2, b3, c0, c1, c2, c3, d0, d1, d2, d3 +++ // a4, a5, a6, a7, b4, b5, b6, b7, c4, c5, c6, c7, 
d4, d5, d6, d7 +++ // 0, 2, 1, 3 -> 11 01 10 00 (0xd8) +++ res0_i = _mm256_permute4x64_epi64(res0_i, 0xd8); +++ +++ // shuffle bytes within lanes +++ // res0: (after shuffle_epi8) +++ // a0, a1, a2, a3, b0, b1, b2, b3, a4, a5, a6, a7, b4, b5, b6, b7 +++ // c0, c1, c2, c3, d0, d1, d2, d3, c4, c5, c6, c7, d4, d5, d6, d7 +++ res0_i = _mm256_shuffle_epi8(res0_i, byte_shuffle); +++ +++ _mm256_storeu_si256((__m256i*)cPtr, res0_i); +++ aPtr += 32; +++ cPtr += 32; ++ } ++- else { ++- *cPtr++ = 0; +++ +++ for (number = n32points * 32; number < num_points; number++) { +++ if (*aPtr++ >= 0) { +++ *cPtr++ = 1; +++ } else { +++ *cPtr++ = 0; +++ } ++ } ++- } ++ } ++ #endif ++ ++ ++- ++ #ifdef LV_HAVE_SSE2 ++ ++ #include ++ ++-static inline void ++-volk_32f_binary_slicer_8i_a_sse2(int8_t* cVector, const float* aVector, ++- unsigned int num_points) +++static inline void volk_32f_binary_slicer_8i_a_sse2(int8_t* cVector, +++ const float* aVector, +++ unsigned int num_points) ++ { ++- int8_t* cPtr = cVector; ++- const float* aPtr = aVector; ++- unsigned int number = 0; ++- ++- unsigned int n16points = num_points / 16; ++- __m128 a0_val, a1_val, a2_val, a3_val; ++- __m128 res0_f, res1_f, res2_f, res3_f; ++- __m128i res0_i, res1_i, res2_i, res3_i; ++- __m128 zero_val; ++- zero_val = _mm_set1_ps(0.0f); ++- ++- for(number = 0; number < n16points; number++) { ++- a0_val = _mm_load_ps(aPtr); ++- a1_val = _mm_load_ps(aPtr+4); ++- a2_val = _mm_load_ps(aPtr+8); ++- a3_val = _mm_load_ps(aPtr+12); ++- ++- // compare >= 0; return float ++- res0_f = _mm_cmpge_ps(a0_val, zero_val); ++- res1_f = _mm_cmpge_ps(a1_val, zero_val); ++- res2_f = _mm_cmpge_ps(a2_val, zero_val); ++- res3_f = _mm_cmpge_ps(a3_val, zero_val); ++- ++- // convert to 32i and >> 31 ++- res0_i = _mm_srli_epi32(_mm_cvtps_epi32(res0_f), 31); ++- res1_i = _mm_srli_epi32(_mm_cvtps_epi32(res1_f), 31); ++- res2_i = _mm_srli_epi32(_mm_cvtps_epi32(res2_f), 31); ++- res3_i = _mm_srli_epi32(_mm_cvtps_epi32(res3_f), 31); ++- ++- // pack into 16-bit results ++- res0_i = _mm_packs_epi32(res0_i, res1_i); ++- res2_i = _mm_packs_epi32(res2_i, res3_i); ++- ++- // pack into 8-bit results ++- res0_i = _mm_packs_epi16(res0_i, res2_i); ++- ++- _mm_store_si128((__m128i*)cPtr, res0_i); ++- ++- cPtr += 16; ++- aPtr += 16; ++- } ++- ++- for(number = n16points * 16; number < num_points; number++) { ++- if( *aPtr++ >= 0) { ++- *cPtr++ = 1; +++ int8_t* cPtr = cVector; +++ const float* aPtr = aVector; +++ unsigned int number = 0; +++ +++ unsigned int n16points = num_points / 16; +++ __m128 a0_val, a1_val, a2_val, a3_val; +++ __m128 res0_f, res1_f, res2_f, res3_f; +++ __m128i res0_i, res1_i, res2_i, res3_i; +++ __m128 zero_val; +++ zero_val = _mm_set1_ps(0.0f); +++ +++ for (number = 0; number < n16points; number++) { +++ a0_val = _mm_load_ps(aPtr); +++ a1_val = _mm_load_ps(aPtr + 4); +++ a2_val = _mm_load_ps(aPtr + 8); +++ a3_val = _mm_load_ps(aPtr + 12); +++ +++ // compare >= 0; return float +++ res0_f = _mm_cmpge_ps(a0_val, zero_val); +++ res1_f = _mm_cmpge_ps(a1_val, zero_val); +++ res2_f = _mm_cmpge_ps(a2_val, zero_val); +++ res3_f = _mm_cmpge_ps(a3_val, zero_val); +++ +++ // convert to 32i and >> 31 +++ res0_i = _mm_srli_epi32(_mm_cvtps_epi32(res0_f), 31); +++ res1_i = _mm_srli_epi32(_mm_cvtps_epi32(res1_f), 31); +++ res2_i = _mm_srli_epi32(_mm_cvtps_epi32(res2_f), 31); +++ res3_i = _mm_srli_epi32(_mm_cvtps_epi32(res3_f), 31); +++ +++ // pack into 16-bit results +++ res0_i = _mm_packs_epi32(res0_i, res1_i); +++ res2_i = _mm_packs_epi32(res2_i, res3_i); +++ +++ // pack into 8-bit 
results +++ res0_i = _mm_packs_epi16(res0_i, res2_i); +++ +++ _mm_store_si128((__m128i*)cPtr, res0_i); +++ +++ cPtr += 16; +++ aPtr += 16; ++ } ++- else { ++- *cPtr++ = 0; +++ +++ for (number = n16points * 16; number < num_points; number++) { +++ if (*aPtr++ >= 0) { +++ *cPtr++ = 1; +++ } else { +++ *cPtr++ = 0; +++ } ++ } ++- } ++ } ++ #endif /* LV_HAVE_SSE2 */ ++ ++ ++- ++ #ifdef LV_HAVE_SSE2 ++ #include ++ ++-static inline void ++-volk_32f_binary_slicer_8i_u_sse2(int8_t* cVector, const float* aVector, ++- unsigned int num_points) +++static inline void volk_32f_binary_slicer_8i_u_sse2(int8_t* cVector, +++ const float* aVector, +++ unsigned int num_points) ++ { ++- int8_t* cPtr = cVector; ++- const float* aPtr = aVector; ++- unsigned int number = 0; ++- ++- unsigned int n16points = num_points / 16; ++- __m128 a0_val, a1_val, a2_val, a3_val; ++- __m128 res0_f, res1_f, res2_f, res3_f; ++- __m128i res0_i, res1_i, res2_i, res3_i; ++- __m128 zero_val; ++- zero_val = _mm_set1_ps (0.0f); ++- ++- for(number = 0; number < n16points; number++) { ++- a0_val = _mm_loadu_ps(aPtr); ++- a1_val = _mm_loadu_ps(aPtr+4); ++- a2_val = _mm_loadu_ps(aPtr+8); ++- a3_val = _mm_loadu_ps(aPtr+12); ++- ++- // compare >= 0; return float ++- res0_f = _mm_cmpge_ps(a0_val, zero_val); ++- res1_f = _mm_cmpge_ps(a1_val, zero_val); ++- res2_f = _mm_cmpge_ps(a2_val, zero_val); ++- res3_f = _mm_cmpge_ps(a3_val, zero_val); ++- ++- // convert to 32i and >> 31 ++- res0_i = _mm_srli_epi32(_mm_cvtps_epi32(res0_f), 31); ++- res1_i = _mm_srli_epi32(_mm_cvtps_epi32(res1_f), 31); ++- res2_i = _mm_srli_epi32(_mm_cvtps_epi32(res2_f), 31); ++- res3_i = _mm_srli_epi32(_mm_cvtps_epi32(res3_f), 31); ++- ++- // pack into 16-bit results ++- res0_i = _mm_packs_epi32(res0_i, res1_i); ++- res2_i = _mm_packs_epi32(res2_i, res3_i); ++- ++- // pack into 8-bit results ++- res0_i = _mm_packs_epi16(res0_i, res2_i); ++- ++- _mm_storeu_si128((__m128i*)cPtr, res0_i); ++- ++- cPtr += 16; ++- aPtr += 16; ++- } ++- ++- for(number = n16points * 16; number < num_points; number++) { ++- if( *aPtr++ >= 0) { ++- *cPtr++ = 1; +++ int8_t* cPtr = cVector; +++ const float* aPtr = aVector; +++ unsigned int number = 0; +++ +++ unsigned int n16points = num_points / 16; +++ __m128 a0_val, a1_val, a2_val, a3_val; +++ __m128 res0_f, res1_f, res2_f, res3_f; +++ __m128i res0_i, res1_i, res2_i, res3_i; +++ __m128 zero_val; +++ zero_val = _mm_set1_ps(0.0f); +++ +++ for (number = 0; number < n16points; number++) { +++ a0_val = _mm_loadu_ps(aPtr); +++ a1_val = _mm_loadu_ps(aPtr + 4); +++ a2_val = _mm_loadu_ps(aPtr + 8); +++ a3_val = _mm_loadu_ps(aPtr + 12); +++ +++ // compare >= 0; return float +++ res0_f = _mm_cmpge_ps(a0_val, zero_val); +++ res1_f = _mm_cmpge_ps(a1_val, zero_val); +++ res2_f = _mm_cmpge_ps(a2_val, zero_val); +++ res3_f = _mm_cmpge_ps(a3_val, zero_val); +++ +++ // convert to 32i and >> 31 +++ res0_i = _mm_srli_epi32(_mm_cvtps_epi32(res0_f), 31); +++ res1_i = _mm_srli_epi32(_mm_cvtps_epi32(res1_f), 31); +++ res2_i = _mm_srli_epi32(_mm_cvtps_epi32(res2_f), 31); +++ res3_i = _mm_srli_epi32(_mm_cvtps_epi32(res3_f), 31); +++ +++ // pack into 16-bit results +++ res0_i = _mm_packs_epi32(res0_i, res1_i); +++ res2_i = _mm_packs_epi32(res2_i, res3_i); +++ +++ // pack into 8-bit results +++ res0_i = _mm_packs_epi16(res0_i, res2_i); +++ +++ _mm_storeu_si128((__m128i*)cPtr, res0_i); +++ +++ cPtr += 16; +++ aPtr += 16; ++ } ++- else { ++- *cPtr++ = 0; +++ +++ for (number = n16points * 16; number < num_points; number++) { +++ if (*aPtr++ >= 0) { +++ *cPtr++ = 1; +++ } else 
{ +++ *cPtr++ = 0; +++ } ++ } ++- } ++ } ++ #endif /* LV_HAVE_SSE2 */ ++ ++@@ -394,74 +444,72 @@ volk_32f_binary_slicer_8i_u_sse2(int8_t* cVector, const float* aVector, ++ #ifdef LV_HAVE_NEON ++ #include ++ ++-static inline void ++-volk_32f_binary_slicer_8i_neon(int8_t* cVector, const float* aVector, ++- unsigned int num_points) +++static inline void volk_32f_binary_slicer_8i_neon(int8_t* cVector, +++ const float* aVector, +++ unsigned int num_points) ++ { ++- int8_t* cPtr = cVector; ++- const float* aPtr = aVector; ++- unsigned int number = 0; ++- unsigned int n16points = num_points / 16; ++- ++- float32x4x2_t input_val0, input_val1; ++- float32x4_t zero_val; ++- uint32x4x2_t res0_u32, res1_u32; ++- uint16x4x2_t res0_u16x4, res1_u16x4; ++- uint16x8x2_t res_u16x8; ++- uint8x8x2_t res_u8; ++- uint8x8_t one; ++- ++- zero_val = vdupq_n_f32(0.0); ++- one = vdup_n_u8(0x01); ++- ++- // TODO: this is a good candidate for asm because the vcombines ++- // can be eliminated simply by picking dst registers that are ++- // adjacent. ++- for(number = 0; number < n16points; number++) { ++- input_val0 = vld2q_f32(aPtr); ++- input_val1 = vld2q_f32(aPtr+8); ++- ++- // test against 0; return uint32 ++- res0_u32.val[0] = vcgeq_f32(input_val0.val[0], zero_val); ++- res0_u32.val[1] = vcgeq_f32(input_val0.val[1], zero_val); ++- res1_u32.val[0] = vcgeq_f32(input_val1.val[0], zero_val); ++- res1_u32.val[1] = vcgeq_f32(input_val1.val[1], zero_val); ++- ++- // narrow uint32 -> uint16 followed by combine to 8-element vectors ++- res0_u16x4.val[0] = vmovn_u32(res0_u32.val[0]); ++- res0_u16x4.val[1] = vmovn_u32(res0_u32.val[1]); ++- res1_u16x4.val[0] = vmovn_u32(res1_u32.val[0]); ++- res1_u16x4.val[1] = vmovn_u32(res1_u32.val[1]); ++- ++- res_u16x8.val[0] = vcombine_u16(res0_u16x4.val[0], res1_u16x4.val[0]); ++- res_u16x8.val[1] = vcombine_u16(res0_u16x4.val[1], res1_u16x4.val[1]); ++- ++- // narrow uint16x8 -> uint8x8 ++- res_u8.val[0] = vmovn_u16(res_u16x8.val[0]); ++- res_u8.val[1] = vmovn_u16(res_u16x8.val[1]); ++- // we *could* load twice as much data and do another vcombine here ++- // to get a uint8x16x2 vector, still only do 2 vandqs and a single store ++- // but that turns out to be ~16% slower than this version on zc702 ++- // it's possible register contention in GCC scheduler slows it down ++- // and a hand-written asm with quad-word u8 registers is much faster. ++- ++- res_u8.val[0] = vand_u8(one, res_u8.val[0]); ++- res_u8.val[1] = vand_u8(one, res_u8.val[1]); ++- ++- vst2_u8((unsigned char*)cPtr, res_u8); ++- cPtr += 16; ++- aPtr += 16; ++- ++- } ++- ++- for(number = n16points * 16; number < num_points; number++) { ++- if(*aPtr++ >= 0) { ++- *cPtr++ = 1; +++ int8_t* cPtr = cVector; +++ const float* aPtr = aVector; +++ unsigned int number = 0; +++ unsigned int n16points = num_points / 16; +++ +++ float32x4x2_t input_val0, input_val1; +++ float32x4_t zero_val; +++ uint32x4x2_t res0_u32, res1_u32; +++ uint16x4x2_t res0_u16x4, res1_u16x4; +++ uint16x8x2_t res_u16x8; +++ uint8x8x2_t res_u8; +++ uint8x8_t one; +++ +++ zero_val = vdupq_n_f32(0.0); +++ one = vdup_n_u8(0x01); +++ +++ // TODO: this is a good candidate for asm because the vcombines +++ // can be eliminated simply by picking dst registers that are +++ // adjacent. 
+++ for (number = 0; number < n16points; number++) { +++ input_val0 = vld2q_f32(aPtr); +++ input_val1 = vld2q_f32(aPtr + 8); +++ +++ // test against 0; return uint32 +++ res0_u32.val[0] = vcgeq_f32(input_val0.val[0], zero_val); +++ res0_u32.val[1] = vcgeq_f32(input_val0.val[1], zero_val); +++ res1_u32.val[0] = vcgeq_f32(input_val1.val[0], zero_val); +++ res1_u32.val[1] = vcgeq_f32(input_val1.val[1], zero_val); +++ +++ // narrow uint32 -> uint16 followed by combine to 8-element vectors +++ res0_u16x4.val[0] = vmovn_u32(res0_u32.val[0]); +++ res0_u16x4.val[1] = vmovn_u32(res0_u32.val[1]); +++ res1_u16x4.val[0] = vmovn_u32(res1_u32.val[0]); +++ res1_u16x4.val[1] = vmovn_u32(res1_u32.val[1]); +++ +++ res_u16x8.val[0] = vcombine_u16(res0_u16x4.val[0], res1_u16x4.val[0]); +++ res_u16x8.val[1] = vcombine_u16(res0_u16x4.val[1], res1_u16x4.val[1]); +++ +++ // narrow uint16x8 -> uint8x8 +++ res_u8.val[0] = vmovn_u16(res_u16x8.val[0]); +++ res_u8.val[1] = vmovn_u16(res_u16x8.val[1]); +++ // we *could* load twice as much data and do another vcombine here +++ // to get a uint8x16x2 vector, still only do 2 vandqs and a single store +++ // but that turns out to be ~16% slower than this version on zc702 +++ // it's possible register contention in GCC scheduler slows it down +++ // and a hand-written asm with quad-word u8 registers is much faster. +++ +++ res_u8.val[0] = vand_u8(one, res_u8.val[0]); +++ res_u8.val[1] = vand_u8(one, res_u8.val[1]); +++ +++ vst2_u8((unsigned char*)cPtr, res_u8); +++ cPtr += 16; +++ aPtr += 16; ++ } ++- else { ++- *cPtr++ = 0; +++ +++ for (number = n16points * 16; number < num_points; number++) { +++ if (*aPtr++ >= 0) { +++ *cPtr++ = 1; +++ } else { +++ *cPtr++ = 0; +++ } ++ } ++- } ++ } ++ #endif /* LV_HAVE_NEON */ ++ ++diff --git a/kernels/volk/volk_32f_convert_64f.h b/kernels/volk/volk_32f_convert_64f.h ++index bf57e3a..d2e3f8a 100644 ++--- a/kernels/volk/volk_32f_convert_64f.h +++++ b/kernels/volk/volk_32f_convert_64f.h ++@@ -29,8 +29,8 @@ ++ * ++ * Dispatcher Prototype ++ * \code ++- * void volk_32f_convert_64f(double* outputVector, const float* inputVector, unsigned int num_points) ++- * \endcode +++ * void volk_32f_convert_64f(double* outputVector, const float* inputVector, unsigned int +++ * num_points) \endcode ++ * ++ * \b Inputs ++ * \li inputVector: The vector of floats to convert to doubles. 
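
Illustrative sketch (not part of the patch): the volk_32f_convert_64f hunks below are likewise formatting-only. For orientation, this minimal scalar reference states the behaviour every branch (generic, SSE2, AVX) reproduces; it assumes nothing beyond the dispatcher prototype quoted above.

    /* Reference semantics of volk_32f_convert_64f: widen each float to a double. */
    static void convert_32f_to_64f_reference(double* out, const float* in,
                                             unsigned int num_points)
    {
        unsigned int n;
        for (n = 0; n < num_points; n++) {
            out[n] = (double)in[n]; /* same result the SIMD branches store 4 at a time */
        }
    }
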
++@@ -72,29 +72,33 @@ ++ #ifdef LV_HAVE_AVX ++ #include ++ ++-static inline void volk_32f_convert_64f_u_avx(double* outputVector, const float* inputVector, unsigned int num_points){ ++- unsigned int number = 0; +++static inline void volk_32f_convert_64f_u_avx(double* outputVector, +++ const float* inputVector, +++ unsigned int num_points) +++{ +++ unsigned int number = 0; ++ ++- const unsigned int quarterPoints = num_points / 4; +++ const unsigned int quarterPoints = num_points / 4; ++ ++- const float* inputVectorPtr = (const float*)inputVector; ++- double* outputVectorPtr = outputVector; ++- __m256d ret; ++- __m128 inputVal; +++ const float* inputVectorPtr = (const float*)inputVector; +++ double* outputVectorPtr = outputVector; +++ __m256d ret; +++ __m128 inputVal; ++ ++- for(;number < quarterPoints; number++){ ++- inputVal = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4; +++ for (; number < quarterPoints; number++) { +++ inputVal = _mm_loadu_ps(inputVectorPtr); +++ inputVectorPtr += 4; ++ ++- ret = _mm256_cvtps_pd(inputVal); ++- _mm256_storeu_pd(outputVectorPtr, ret); +++ ret = _mm256_cvtps_pd(inputVal); +++ _mm256_storeu_pd(outputVectorPtr, ret); ++ ++- outputVectorPtr += 4; ++- } +++ outputVectorPtr += 4; +++ } ++ ++- number = quarterPoints * 4; ++- for(; number < num_points; number++){ ++- outputVector[number] = (double)(inputVector[number]); ++- } +++ number = quarterPoints * 4; +++ for (; number < num_points; number++) { +++ outputVector[number] = (double)(inputVector[number]); +++ } ++ } ++ ++ #endif /* LV_HAVE_AVX */ ++@@ -102,56 +106,61 @@ static inline void volk_32f_convert_64f_u_avx(double* outputVector, const float* ++ #ifdef LV_HAVE_SSE2 ++ #include ++ ++-static inline void volk_32f_convert_64f_u_sse2(double* outputVector, const float* inputVector, unsigned int num_points){ ++- unsigned int number = 0; +++static inline void volk_32f_convert_64f_u_sse2(double* outputVector, +++ const float* inputVector, +++ unsigned int num_points) +++{ +++ unsigned int number = 0; ++ ++- const unsigned int quarterPoints = num_points / 4; +++ const unsigned int quarterPoints = num_points / 4; ++ ++- const float* inputVectorPtr = (const float*)inputVector; ++- double* outputVectorPtr = outputVector; ++- __m128d ret; ++- __m128 inputVal; +++ const float* inputVectorPtr = (const float*)inputVector; +++ double* outputVectorPtr = outputVector; +++ __m128d ret; +++ __m128 inputVal; ++ ++- for(;number < quarterPoints; number++){ ++- inputVal = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4; +++ for (; number < quarterPoints; number++) { +++ inputVal = _mm_loadu_ps(inputVectorPtr); +++ inputVectorPtr += 4; ++ ++- ret = _mm_cvtps_pd(inputVal); +++ ret = _mm_cvtps_pd(inputVal); ++ ++- _mm_storeu_pd(outputVectorPtr, ret); ++- outputVectorPtr += 2; +++ _mm_storeu_pd(outputVectorPtr, ret); +++ outputVectorPtr += 2; ++ ++- inputVal = _mm_movehl_ps(inputVal, inputVal); +++ inputVal = _mm_movehl_ps(inputVal, inputVal); ++ ++- ret = _mm_cvtps_pd(inputVal); +++ ret = _mm_cvtps_pd(inputVal); ++ ++- _mm_storeu_pd(outputVectorPtr, ret); ++- outputVectorPtr += 2; ++- } +++ _mm_storeu_pd(outputVectorPtr, ret); +++ outputVectorPtr += 2; +++ } ++ ++- number = quarterPoints * 4; ++- for(; number < num_points; number++){ ++- outputVector[number] = (double)(inputVector[number]); ++- } +++ number = quarterPoints * 4; +++ for (; number < num_points; number++) { +++ outputVector[number] = (double)(inputVector[number]); +++ } ++ } ++ #endif /* LV_HAVE_SSE2 */ ++ ++ ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void 
volk_32f_convert_64f_generic(double* outputVector, const float* inputVector, unsigned int num_points){ ++- double* outputVectorPtr = outputVector; ++- const float* inputVectorPtr = inputVector; ++- unsigned int number = 0; ++- ++- for(number = 0; number < num_points; number++){ ++- *outputVectorPtr++ = ((double)(*inputVectorPtr++)); ++- } +++static inline void volk_32f_convert_64f_generic(double* outputVector, +++ const float* inputVector, +++ unsigned int num_points) +++{ +++ double* outputVectorPtr = outputVector; +++ const float* inputVectorPtr = inputVector; +++ unsigned int number = 0; +++ +++ for (number = 0; number < num_points; number++) { +++ *outputVectorPtr++ = ((double)(*inputVectorPtr++)); +++ } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++ ++- ++- ++ #endif /* INCLUDED_volk_32f_convert_64f_u_H */ ++ ++ ++@@ -164,83 +173,92 @@ static inline void volk_32f_convert_64f_generic(double* outputVector, const floa ++ #ifdef LV_HAVE_AVX ++ #include ++ ++-static inline void volk_32f_convert_64f_a_avx(double* outputVector, const float* inputVector, unsigned int num_points){ ++- unsigned int number = 0; +++static inline void volk_32f_convert_64f_a_avx(double* outputVector, +++ const float* inputVector, +++ unsigned int num_points) +++{ +++ unsigned int number = 0; ++ ++- const unsigned int quarterPoints = num_points / 4; +++ const unsigned int quarterPoints = num_points / 4; ++ ++- const float* inputVectorPtr = (const float*)inputVector; ++- double* outputVectorPtr = outputVector; ++- __m256d ret; ++- __m128 inputVal; +++ const float* inputVectorPtr = (const float*)inputVector; +++ double* outputVectorPtr = outputVector; +++ __m256d ret; +++ __m128 inputVal; ++ ++- for(;number < quarterPoints; number++){ ++- inputVal = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; +++ for (; number < quarterPoints; number++) { +++ inputVal = _mm_load_ps(inputVectorPtr); +++ inputVectorPtr += 4; ++ ++- ret = _mm256_cvtps_pd(inputVal); ++- _mm256_store_pd(outputVectorPtr, ret); +++ ret = _mm256_cvtps_pd(inputVal); +++ _mm256_store_pd(outputVectorPtr, ret); ++ ++- outputVectorPtr += 4; ++- } +++ outputVectorPtr += 4; +++ } ++ ++- number = quarterPoints * 4; ++- for(; number < num_points; number++){ ++- outputVector[number] = (double)(inputVector[number]); ++- } +++ number = quarterPoints * 4; +++ for (; number < num_points; number++) { +++ outputVector[number] = (double)(inputVector[number]); +++ } ++ } ++ #endif /* LV_HAVE_AVX */ ++ ++ #ifdef LV_HAVE_SSE2 ++ #include ++ ++-static inline void volk_32f_convert_64f_a_sse2(double* outputVector, const float* inputVector, unsigned int num_points){ ++- unsigned int number = 0; +++static inline void volk_32f_convert_64f_a_sse2(double* outputVector, +++ const float* inputVector, +++ unsigned int num_points) +++{ +++ unsigned int number = 0; ++ ++- const unsigned int quarterPoints = num_points / 4; +++ const unsigned int quarterPoints = num_points / 4; ++ ++- const float* inputVectorPtr = (const float*)inputVector; ++- double* outputVectorPtr = outputVector; ++- __m128d ret; ++- __m128 inputVal; +++ const float* inputVectorPtr = (const float*)inputVector; +++ double* outputVectorPtr = outputVector; +++ __m128d ret; +++ __m128 inputVal; ++ ++- for(;number < quarterPoints; number++){ ++- inputVal = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; +++ for (; number < quarterPoints; number++) { +++ inputVal = _mm_load_ps(inputVectorPtr); +++ inputVectorPtr += 4; ++ ++- ret = _mm_cvtps_pd(inputVal); +++ ret = _mm_cvtps_pd(inputVal); ++ ++- _mm_store_pd(outputVectorPtr, 
ret); ++- outputVectorPtr += 2; +++ _mm_store_pd(outputVectorPtr, ret); +++ outputVectorPtr += 2; ++ ++- inputVal = _mm_movehl_ps(inputVal, inputVal); +++ inputVal = _mm_movehl_ps(inputVal, inputVal); ++ ++- ret = _mm_cvtps_pd(inputVal); +++ ret = _mm_cvtps_pd(inputVal); ++ ++- _mm_store_pd(outputVectorPtr, ret); ++- outputVectorPtr += 2; ++- } +++ _mm_store_pd(outputVectorPtr, ret); +++ outputVectorPtr += 2; +++ } ++ ++- number = quarterPoints * 4; ++- for(; number < num_points; number++){ ++- outputVector[number] = (double)(inputVector[number]); ++- } +++ number = quarterPoints * 4; +++ for (; number < num_points; number++) { +++ outputVector[number] = (double)(inputVector[number]); +++ } ++ } ++ #endif /* LV_HAVE_SSE2 */ ++ ++ ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void volk_32f_convert_64f_a_generic(double* outputVector, const float* inputVector, unsigned int num_points){ ++- double* outputVectorPtr = outputVector; ++- const float* inputVectorPtr = inputVector; ++- unsigned int number = 0; ++- ++- for(number = 0; number < num_points; number++){ ++- *outputVectorPtr++ = ((double)(*inputVectorPtr++)); ++- } +++static inline void volk_32f_convert_64f_a_generic(double* outputVector, +++ const float* inputVector, +++ unsigned int num_points) +++{ +++ double* outputVectorPtr = outputVector; +++ const float* inputVectorPtr = inputVector; +++ unsigned int number = 0; +++ +++ for (number = 0; number < num_points; number++) { +++ *outputVectorPtr++ = ((double)(*inputVectorPtr++)); +++ } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++ ++- ++- ++ #endif /* INCLUDED_volk_32f_convert_64f_a_H */ ++diff --git a/kernels/volk/volk_32f_cos_32f.h b/kernels/volk/volk_32f_cos_32f.h ++index 39c2008..b493764 100644 ++--- a/kernels/volk/volk_32f_cos_32f.h +++++ b/kernels/volk/volk_32f_cos_32f.h ++@@ -69,9 +69,9 @@ ++ * \endcode ++ */ ++ ++-#include ++-#include ++ #include +++#include +++#include ++ ++ #ifndef INCLUDED_volk_32f_cos_32f_a_H ++ #define INCLUDED_volk_32f_cos_32f_a_H ++@@ -80,86 +80,102 @@ ++ #include ++ ++ static inline void ++- volk_32f_cos_32f_a_avx2_fma(float* bVector, const float* aVector, unsigned int num_points) +++volk_32f_cos_32f_a_avx2_fma(float* bVector, const float* aVector, unsigned int num_points) ++ { ++- float* bPtr = bVector; ++- const float* aPtr = aVector; ++- ++- unsigned int number = 0; ++- unsigned int eighthPoints = num_points / 8; ++- unsigned int i = 0; ++- ++- __m256 aVal, s, r, m4pi, pio4A, pio4B, pio4C, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, fzeroes; ++- __m256 sine, cosine; ++- __m256i q, ones, twos, fours; ++- ++- m4pi = _mm256_set1_ps(1.273239544735162542821171882678754627704620361328125); ++- pio4A = _mm256_set1_ps(0.7853981554508209228515625); ++- pio4B = _mm256_set1_ps(0.794662735614792836713604629039764404296875e-8); ++- pio4C = _mm256_set1_ps(0.306161699786838294306516483068750264552437361480769e-16); ++- ffours = _mm256_set1_ps(4.0); ++- ftwos = _mm256_set1_ps(2.0); ++- fones = _mm256_set1_ps(1.0); ++- fzeroes = _mm256_setzero_ps(); ++- __m256i zeroes = _mm256_set1_epi32(0); ++- ones = _mm256_set1_epi32(1); ++- __m256i allones = _mm256_set1_epi32(0xffffffff); ++- twos = _mm256_set1_epi32(2); ++- fours = _mm256_set1_epi32(4); ++- ++- cp1 = _mm256_set1_ps(1.0); ++- cp2 = _mm256_set1_ps(0.08333333333333333); ++- cp3 = _mm256_set1_ps(0.002777777777777778); ++- cp4 = _mm256_set1_ps(4.96031746031746e-05); ++- cp5 = _mm256_set1_ps(5.511463844797178e-07); ++- union bit256 condition1; ++- union bit256 condition3; ++- ++- for(;number < eighthPoints; number++){ 
++- ++- aVal = _mm256_load_ps(aPtr); ++- // s = fabs(aVal) ++- s = _mm256_sub_ps(aVal, _mm256_and_ps(_mm256_mul_ps(aVal, ftwos), _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS))); ++- // q = (int) (s * (4/pi)), floor(aVal / (pi/4)) ++- q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi))); ++- // r = q + q&1, q indicates quadrant, r gives ++- r = _mm256_cvtepi32_ps(_mm256_add_epi32(q, _mm256_and_si256(q, ones))); ++- ++- s = _mm256_fnmadd_ps(r,pio4A,s); ++- s = _mm256_fnmadd_ps(r,pio4B,s); ++- s = _mm256_fnmadd_ps(r,pio4C,s); ++- ++- s = _mm256_div_ps(s, _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction ++- s = _mm256_mul_ps(s, s); ++- // Evaluate Taylor series ++- s = _mm256_mul_ps(_mm256_fmadd_ps(_mm256_fmsub_ps(_mm256_fmadd_ps(_mm256_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2), s, cp1), s); ++- ++- for(i = 0; i < 3; i++) ++- s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s)); ++- s = _mm256_div_ps(s, ftwos); ++- ++- sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s)); ++- cosine = _mm256_sub_ps(fones, s); ++- ++- // if(((q+1)&2) != 0) { cosine=sine;} ++- condition1.int_vec = _mm256_cmpeq_epi32(_mm256_and_si256(_mm256_add_epi32(q, ones), twos), zeroes); ++- condition1.int_vec = _mm256_xor_si256(allones, condition1.int_vec); ++- ++- // if(((q+2)&4) != 0) { cosine = -cosine;} ++- condition3.int_vec = _mm256_cmpeq_epi32(_mm256_and_si256(_mm256_add_epi32(q, twos), fours), zeroes); ++- condition3.int_vec = _mm256_xor_si256(allones, condition3.int_vec); ++- ++- cosine = _mm256_add_ps(cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1.float_vec)); ++- cosine = _mm256_sub_ps(cosine, _mm256_and_ps(_mm256_mul_ps(cosine, _mm256_set1_ps(2.0f)), condition3.float_vec)); ++- _mm256_store_ps(bPtr, cosine); ++- aPtr += 8; ++- bPtr += 8; ++- } ++- ++- number = eighthPoints * 8; ++- for(;number < num_points; number++){ ++- *bPtr++ = cos(*aPtr++); ++- } +++ float* bPtr = bVector; +++ const float* aPtr = aVector; +++ +++ unsigned int number = 0; +++ unsigned int eighthPoints = num_points / 8; +++ unsigned int i = 0; +++ +++ __m256 aVal, s, r, m4pi, pio4A, pio4B, pio4C, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, +++ fones, fzeroes; +++ __m256 sine, cosine; +++ __m256i q, ones, twos, fours; +++ +++ m4pi = _mm256_set1_ps(1.273239544735162542821171882678754627704620361328125); +++ pio4A = _mm256_set1_ps(0.7853981554508209228515625); +++ pio4B = _mm256_set1_ps(0.794662735614792836713604629039764404296875e-8); +++ pio4C = _mm256_set1_ps(0.306161699786838294306516483068750264552437361480769e-16); +++ ffours = _mm256_set1_ps(4.0); +++ ftwos = _mm256_set1_ps(2.0); +++ fones = _mm256_set1_ps(1.0); +++ fzeroes = _mm256_setzero_ps(); +++ __m256i zeroes = _mm256_set1_epi32(0); +++ ones = _mm256_set1_epi32(1); +++ __m256i allones = _mm256_set1_epi32(0xffffffff); +++ twos = _mm256_set1_epi32(2); +++ fours = _mm256_set1_epi32(4); +++ +++ cp1 = _mm256_set1_ps(1.0); +++ cp2 = _mm256_set1_ps(0.08333333333333333); +++ cp3 = _mm256_set1_ps(0.002777777777777778); +++ cp4 = _mm256_set1_ps(4.96031746031746e-05); +++ cp5 = _mm256_set1_ps(5.511463844797178e-07); +++ union bit256 condition1; +++ union bit256 condition3; +++ +++ for (; number < eighthPoints; number++) { +++ +++ aVal = _mm256_load_ps(aPtr); +++ // s = fabs(aVal) +++ s = _mm256_sub_ps(aVal, +++ _mm256_and_ps(_mm256_mul_ps(aVal, ftwos), +++ _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS))); +++ // q = (int) (s * (4/pi)), floor(aVal / (pi/4)) +++ q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi))); +++ // r = q + q&1, q 
indicates quadrant, r gives +++ r = _mm256_cvtepi32_ps(_mm256_add_epi32(q, _mm256_and_si256(q, ones))); +++ +++ s = _mm256_fnmadd_ps(r, pio4A, s); +++ s = _mm256_fnmadd_ps(r, pio4B, s); +++ s = _mm256_fnmadd_ps(r, pio4C, s); +++ +++ s = _mm256_div_ps( +++ s, +++ _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction +++ s = _mm256_mul_ps(s, s); +++ // Evaluate Taylor series +++ s = _mm256_mul_ps( +++ _mm256_fmadd_ps( +++ _mm256_fmsub_ps( +++ _mm256_fmadd_ps(_mm256_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2), +++ s, +++ cp1), +++ s); +++ +++ for (i = 0; i < 3; i++) +++ s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s)); +++ s = _mm256_div_ps(s, ftwos); +++ +++ sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s)); +++ cosine = _mm256_sub_ps(fones, s); +++ +++ // if(((q+1)&2) != 0) { cosine=sine;} +++ condition1.int_vec = +++ _mm256_cmpeq_epi32(_mm256_and_si256(_mm256_add_epi32(q, ones), twos), zeroes); +++ condition1.int_vec = _mm256_xor_si256(allones, condition1.int_vec); +++ +++ // if(((q+2)&4) != 0) { cosine = -cosine;} +++ condition3.int_vec = _mm256_cmpeq_epi32( +++ _mm256_and_si256(_mm256_add_epi32(q, twos), fours), zeroes); +++ condition3.int_vec = _mm256_xor_si256(allones, condition3.int_vec); +++ +++ cosine = _mm256_add_ps( +++ cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1.float_vec)); +++ cosine = _mm256_sub_ps(cosine, +++ _mm256_and_ps(_mm256_mul_ps(cosine, _mm256_set1_ps(2.0f)), +++ condition3.float_vec)); +++ _mm256_store_ps(bPtr, cosine); +++ aPtr += 8; +++ bPtr += 8; +++ } +++ +++ number = eighthPoints * 8; +++ for (; number < num_points; number++) { +++ *bPtr++ = cos(*aPtr++); +++ } ++ } ++ ++ #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for aligned */ ++@@ -168,86 +184,109 @@ static inline void ++ #include ++ ++ static inline void ++- volk_32f_cos_32f_a_avx2(float* bVector, const float* aVector, unsigned int num_points) +++volk_32f_cos_32f_a_avx2(float* bVector, const float* aVector, unsigned int num_points) ++ { ++- float* bPtr = bVector; ++- const float* aPtr = aVector; ++- ++- unsigned int number = 0; ++- unsigned int eighthPoints = num_points / 8; ++- unsigned int i = 0; ++- ++- __m256 aVal, s, r, m4pi, pio4A, pio4B, pio4C, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, fzeroes; ++- __m256 sine, cosine; ++- __m256i q, ones, twos, fours; ++- ++- m4pi = _mm256_set1_ps(1.273239544735162542821171882678754627704620361328125); ++- pio4A = _mm256_set1_ps(0.7853981554508209228515625); ++- pio4B = _mm256_set1_ps(0.794662735614792836713604629039764404296875e-8); ++- pio4C = _mm256_set1_ps(0.306161699786838294306516483068750264552437361480769e-16); ++- ffours = _mm256_set1_ps(4.0); ++- ftwos = _mm256_set1_ps(2.0); ++- fones = _mm256_set1_ps(1.0); ++- fzeroes = _mm256_setzero_ps(); ++- __m256i zeroes = _mm256_set1_epi32(0); ++- ones = _mm256_set1_epi32(1); ++- __m256i allones = _mm256_set1_epi32(0xffffffff); ++- twos = _mm256_set1_epi32(2); ++- fours = _mm256_set1_epi32(4); ++- ++- cp1 = _mm256_set1_ps(1.0); ++- cp2 = _mm256_set1_ps(0.08333333333333333); ++- cp3 = _mm256_set1_ps(0.002777777777777778); ++- cp4 = _mm256_set1_ps(4.96031746031746e-05); ++- cp5 = _mm256_set1_ps(5.511463844797178e-07); ++- union bit256 condition1; ++- union bit256 condition3; ++- ++- for(;number < eighthPoints; number++){ ++- ++- aVal = _mm256_load_ps(aPtr); ++- // s = fabs(aVal) ++- s = _mm256_sub_ps(aVal, _mm256_and_ps(_mm256_mul_ps(aVal, ftwos), _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS))); ++- // q = (int) (s * (4/pi)), floor(aVal / (pi/4)) ++- q = 
_mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi))); ++- // r = q + q&1, q indicates quadrant, r gives ++- r = _mm256_cvtepi32_ps(_mm256_add_epi32(q, _mm256_and_si256(q, ones))); ++- ++- s = _mm256_sub_ps(s, _mm256_mul_ps(r,pio4A)); ++- s = _mm256_sub_ps(s, _mm256_mul_ps(r,pio4B)); ++- s = _mm256_sub_ps(s, _mm256_mul_ps(r,pio4C)); ++- ++- s = _mm256_div_ps(s, _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction ++- s = _mm256_mul_ps(s, s); ++- // Evaluate Taylor series ++- s = _mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(s, cp5), cp4), s), cp3), s), cp2), s), cp1), s); ++- ++- for(i = 0; i < 3; i++) ++- s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s)); ++- s = _mm256_div_ps(s, ftwos); ++- ++- sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s)); ++- cosine = _mm256_sub_ps(fones, s); ++- ++- // if(((q+1)&2) != 0) { cosine=sine;} ++- condition1.int_vec = _mm256_cmpeq_epi32(_mm256_and_si256(_mm256_add_epi32(q, ones), twos), zeroes); ++- condition1.int_vec = _mm256_xor_si256(allones, condition1.int_vec); ++- ++- // if(((q+2)&4) != 0) { cosine = -cosine;} ++- condition3.int_vec = _mm256_cmpeq_epi32(_mm256_and_si256(_mm256_add_epi32(q, twos), fours), zeroes); ++- condition3.int_vec = _mm256_xor_si256(allones, condition3.int_vec); ++- ++- cosine = _mm256_add_ps(cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1.float_vec)); ++- cosine = _mm256_sub_ps(cosine, _mm256_and_ps(_mm256_mul_ps(cosine, _mm256_set1_ps(2.0f)), condition3.float_vec)); ++- _mm256_store_ps(bPtr, cosine); ++- aPtr += 8; ++- bPtr += 8; ++- } ++- ++- number = eighthPoints * 8; ++- for(;number < num_points; number++){ ++- *bPtr++ = cos(*aPtr++); ++- } +++ float* bPtr = bVector; +++ const float* aPtr = aVector; +++ +++ unsigned int number = 0; +++ unsigned int eighthPoints = num_points / 8; +++ unsigned int i = 0; +++ +++ __m256 aVal, s, r, m4pi, pio4A, pio4B, pio4C, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, +++ fones, fzeroes; +++ __m256 sine, cosine; +++ __m256i q, ones, twos, fours; +++ +++ m4pi = _mm256_set1_ps(1.273239544735162542821171882678754627704620361328125); +++ pio4A = _mm256_set1_ps(0.7853981554508209228515625); +++ pio4B = _mm256_set1_ps(0.794662735614792836713604629039764404296875e-8); +++ pio4C = _mm256_set1_ps(0.306161699786838294306516483068750264552437361480769e-16); +++ ffours = _mm256_set1_ps(4.0); +++ ftwos = _mm256_set1_ps(2.0); +++ fones = _mm256_set1_ps(1.0); +++ fzeroes = _mm256_setzero_ps(); +++ __m256i zeroes = _mm256_set1_epi32(0); +++ ones = _mm256_set1_epi32(1); +++ __m256i allones = _mm256_set1_epi32(0xffffffff); +++ twos = _mm256_set1_epi32(2); +++ fours = _mm256_set1_epi32(4); +++ +++ cp1 = _mm256_set1_ps(1.0); +++ cp2 = _mm256_set1_ps(0.08333333333333333); +++ cp3 = _mm256_set1_ps(0.002777777777777778); +++ cp4 = _mm256_set1_ps(4.96031746031746e-05); +++ cp5 = _mm256_set1_ps(5.511463844797178e-07); +++ union bit256 condition1; +++ union bit256 condition3; +++ +++ for (; number < eighthPoints; number++) { +++ +++ aVal = _mm256_load_ps(aPtr); +++ // s = fabs(aVal) +++ s = _mm256_sub_ps(aVal, +++ _mm256_and_ps(_mm256_mul_ps(aVal, ftwos), +++ _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS))); +++ // q = (int) (s * (4/pi)), floor(aVal / (pi/4)) +++ q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi))); +++ // r = q + q&1, q indicates quadrant, r gives +++ r = _mm256_cvtepi32_ps(_mm256_add_epi32(q, _mm256_and_si256(q, ones))); +++ +++ s = _mm256_sub_ps(s, 
_mm256_mul_ps(r, pio4A)); +++ s = _mm256_sub_ps(s, _mm256_mul_ps(r, pio4B)); +++ s = _mm256_sub_ps(s, _mm256_mul_ps(r, pio4C)); +++ +++ s = _mm256_div_ps( +++ s, +++ _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction +++ s = _mm256_mul_ps(s, s); +++ // Evaluate Taylor series +++ s = _mm256_mul_ps( +++ _mm256_add_ps( +++ _mm256_mul_ps( +++ _mm256_sub_ps( +++ _mm256_mul_ps( +++ _mm256_add_ps( +++ _mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(s, cp5), cp4), +++ s), +++ cp3), +++ s), +++ cp2), +++ s), +++ cp1), +++ s); +++ +++ for (i = 0; i < 3; i++) +++ s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s)); +++ s = _mm256_div_ps(s, ftwos); +++ +++ sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s)); +++ cosine = _mm256_sub_ps(fones, s); +++ +++ // if(((q+1)&2) != 0) { cosine=sine;} +++ condition1.int_vec = +++ _mm256_cmpeq_epi32(_mm256_and_si256(_mm256_add_epi32(q, ones), twos), zeroes); +++ condition1.int_vec = _mm256_xor_si256(allones, condition1.int_vec); +++ +++ // if(((q+2)&4) != 0) { cosine = -cosine;} +++ condition3.int_vec = _mm256_cmpeq_epi32( +++ _mm256_and_si256(_mm256_add_epi32(q, twos), fours), zeroes); +++ condition3.int_vec = _mm256_xor_si256(allones, condition3.int_vec); +++ +++ cosine = _mm256_add_ps( +++ cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1.float_vec)); +++ cosine = _mm256_sub_ps(cosine, +++ _mm256_and_ps(_mm256_mul_ps(cosine, _mm256_set1_ps(2.0f)), +++ condition3.float_vec)); +++ _mm256_store_ps(bPtr, cosine); +++ aPtr += 8; +++ bPtr += 8; +++ } +++ +++ number = eighthPoints * 8; +++ for (; number < num_points; number++) { +++ *bPtr++ = cos(*aPtr++); +++ } ++ } ++ ++ #endif /* LV_HAVE_AVX2 for aligned */ ++@@ -256,86 +295,105 @@ static inline void ++ #include ++ ++ static inline void ++- volk_32f_cos_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int num_points) +++volk_32f_cos_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int num_points) ++ { ++- float* bPtr = bVector; ++- const float* aPtr = aVector; ++- ++- unsigned int number = 0; ++- unsigned int quarterPoints = num_points / 4; ++- unsigned int i = 0; ++- ++- __m128 aVal, s, r, m4pi, pio4A, pio4B, pio4C, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, fzeroes; ++- __m128 sine, cosine; ++- __m128i q, ones, twos, fours; ++- ++- m4pi = _mm_set1_ps(1.273239544735162542821171882678754627704620361328125); ++- pio4A = _mm_set1_ps(0.7853981554508209228515625); ++- pio4B = _mm_set1_ps(0.794662735614792836713604629039764404296875e-8); ++- pio4C = _mm_set1_ps(0.306161699786838294306516483068750264552437361480769e-16); ++- ffours = _mm_set1_ps(4.0); ++- ftwos = _mm_set1_ps(2.0); ++- fones = _mm_set1_ps(1.0); ++- fzeroes = _mm_setzero_ps(); ++- __m128i zeroes = _mm_set1_epi32(0); ++- ones = _mm_set1_epi32(1); ++- __m128i allones = _mm_set1_epi32(0xffffffff); ++- twos = _mm_set1_epi32(2); ++- fours = _mm_set1_epi32(4); ++- ++- cp1 = _mm_set1_ps(1.0); ++- cp2 = _mm_set1_ps(0.08333333333333333); ++- cp3 = _mm_set1_ps(0.002777777777777778); ++- cp4 = _mm_set1_ps(4.96031746031746e-05); ++- cp5 = _mm_set1_ps(5.511463844797178e-07); ++- union bit128 condition1; ++- union bit128 condition3; ++- ++- for(;number < quarterPoints; number++){ ++- ++- aVal = _mm_load_ps(aPtr); ++- // s = fabs(aVal) ++- s = _mm_sub_ps(aVal, _mm_and_ps(_mm_mul_ps(aVal, ftwos), _mm_cmplt_ps(aVal, fzeroes))); ++- // q = (int) (s * (4/pi)), floor(aVal / (pi/4)) ++- q = _mm_cvtps_epi32(_mm_floor_ps(_mm_mul_ps(s, m4pi))); ++- // r = q + q&1, q indicates quadrant, r gives ++- r = 
_mm_cvtepi32_ps(_mm_add_epi32(q, _mm_and_si128(q, ones))); ++- ++- s = _mm_sub_ps(s, _mm_mul_ps(r, pio4A)); ++- s = _mm_sub_ps(s, _mm_mul_ps(r, pio4B)); ++- s = _mm_sub_ps(s, _mm_mul_ps(r, pio4C)); ++- ++- s = _mm_div_ps(s, _mm_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction ++- s = _mm_mul_ps(s, s); ++- // Evaluate Taylor series ++- s = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s, cp5), cp4), s), cp3), s), cp2), s), cp1), s); ++- ++- for(i = 0; i < 3; i++) ++- s = _mm_mul_ps(s, _mm_sub_ps(ffours, s)); ++- s = _mm_div_ps(s, ftwos); ++- ++- sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s)); ++- cosine = _mm_sub_ps(fones, s); ++- ++- // if(((q+1)&2) != 0) { cosine=sine;} ++- condition1.int_vec = _mm_cmpeq_epi32(_mm_and_si128(_mm_add_epi32(q, ones), twos), zeroes); ++- condition1.int_vec = _mm_xor_si128(allones, condition1.int_vec); ++- ++- // if(((q+2)&4) != 0) { cosine = -cosine;} ++- condition3.int_vec = _mm_cmpeq_epi32(_mm_and_si128(_mm_add_epi32(q, twos), fours), zeroes); ++- condition3.int_vec = _mm_xor_si128(allones, condition3.int_vec); ++- ++- cosine = _mm_add_ps(cosine, _mm_and_ps(_mm_sub_ps(sine, cosine), condition1.float_vec)); ++- cosine = _mm_sub_ps(cosine, _mm_and_ps(_mm_mul_ps(cosine, _mm_set1_ps(2.0f)), condition3.float_vec)); ++- _mm_store_ps(bPtr, cosine); ++- aPtr += 4; ++- bPtr += 4; ++- } ++- ++- number = quarterPoints * 4; ++- for(;number < num_points; number++){ ++- *bPtr++ = cosf(*aPtr++); ++- } +++ float* bPtr = bVector; +++ const float* aPtr = aVector; +++ +++ unsigned int number = 0; +++ unsigned int quarterPoints = num_points / 4; +++ unsigned int i = 0; +++ +++ __m128 aVal, s, r, m4pi, pio4A, pio4B, pio4C, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, +++ fones, fzeroes; +++ __m128 sine, cosine; +++ __m128i q, ones, twos, fours; +++ +++ m4pi = _mm_set1_ps(1.273239544735162542821171882678754627704620361328125); +++ pio4A = _mm_set1_ps(0.7853981554508209228515625); +++ pio4B = _mm_set1_ps(0.794662735614792836713604629039764404296875e-8); +++ pio4C = _mm_set1_ps(0.306161699786838294306516483068750264552437361480769e-16); +++ ffours = _mm_set1_ps(4.0); +++ ftwos = _mm_set1_ps(2.0); +++ fones = _mm_set1_ps(1.0); +++ fzeroes = _mm_setzero_ps(); +++ __m128i zeroes = _mm_set1_epi32(0); +++ ones = _mm_set1_epi32(1); +++ __m128i allones = _mm_set1_epi32(0xffffffff); +++ twos = _mm_set1_epi32(2); +++ fours = _mm_set1_epi32(4); +++ +++ cp1 = _mm_set1_ps(1.0); +++ cp2 = _mm_set1_ps(0.08333333333333333); +++ cp3 = _mm_set1_ps(0.002777777777777778); +++ cp4 = _mm_set1_ps(4.96031746031746e-05); +++ cp5 = _mm_set1_ps(5.511463844797178e-07); +++ union bit128 condition1; +++ union bit128 condition3; +++ +++ for (; number < quarterPoints; number++) { +++ +++ aVal = _mm_load_ps(aPtr); +++ // s = fabs(aVal) +++ s = _mm_sub_ps(aVal, +++ _mm_and_ps(_mm_mul_ps(aVal, ftwos), _mm_cmplt_ps(aVal, fzeroes))); +++ // q = (int) (s * (4/pi)), floor(aVal / (pi/4)) +++ q = _mm_cvtps_epi32(_mm_floor_ps(_mm_mul_ps(s, m4pi))); +++ // r = q + q&1, q indicates quadrant, r gives +++ r = _mm_cvtepi32_ps(_mm_add_epi32(q, _mm_and_si128(q, ones))); +++ +++ s = _mm_sub_ps(s, _mm_mul_ps(r, pio4A)); +++ s = _mm_sub_ps(s, _mm_mul_ps(r, pio4B)); +++ s = _mm_sub_ps(s, _mm_mul_ps(r, pio4C)); +++ +++ s = _mm_div_ps( +++ s, _mm_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction +++ s = _mm_mul_ps(s, s); +++ // Evaluate Taylor series +++ s = _mm_mul_ps( +++ _mm_add_ps( +++ _mm_mul_ps( +++ _mm_sub_ps( +++ _mm_mul_ps( +++ 
_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s, cp5), cp4), s), +++ cp3), +++ s), +++ cp2), +++ s), +++ cp1), +++ s); +++ +++ for (i = 0; i < 3; i++) +++ s = _mm_mul_ps(s, _mm_sub_ps(ffours, s)); +++ s = _mm_div_ps(s, ftwos); +++ +++ sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s)); +++ cosine = _mm_sub_ps(fones, s); +++ +++ // if(((q+1)&2) != 0) { cosine=sine;} +++ condition1.int_vec = +++ _mm_cmpeq_epi32(_mm_and_si128(_mm_add_epi32(q, ones), twos), zeroes); +++ condition1.int_vec = _mm_xor_si128(allones, condition1.int_vec); +++ +++ // if(((q+2)&4) != 0) { cosine = -cosine;} +++ condition3.int_vec = +++ _mm_cmpeq_epi32(_mm_and_si128(_mm_add_epi32(q, twos), fours), zeroes); +++ condition3.int_vec = _mm_xor_si128(allones, condition3.int_vec); +++ +++ cosine = _mm_add_ps(cosine, +++ _mm_and_ps(_mm_sub_ps(sine, cosine), condition1.float_vec)); +++ cosine = _mm_sub_ps( +++ cosine, +++ _mm_and_ps(_mm_mul_ps(cosine, _mm_set1_ps(2.0f)), condition3.float_vec)); +++ _mm_store_ps(bPtr, cosine); +++ aPtr += 4; +++ bPtr += 4; +++ } +++ +++ number = quarterPoints * 4; +++ for (; number < num_points; number++) { +++ *bPtr++ = cosf(*aPtr++); +++ } ++ } ++ ++ #endif /* LV_HAVE_SSE4_1 for aligned */ ++@@ -343,7 +401,6 @@ static inline void ++ #endif /* INCLUDED_volk_32f_cos_32f_a_H */ ++ ++ ++- ++ #ifndef INCLUDED_volk_32f_cos_32f_u_H ++ #define INCLUDED_volk_32f_cos_32f_u_H ++ ++@@ -351,86 +408,102 @@ static inline void ++ #include ++ ++ static inline void ++- volk_32f_cos_32f_u_avx2_fma(float* bVector, const float* aVector, unsigned int num_points) +++volk_32f_cos_32f_u_avx2_fma(float* bVector, const float* aVector, unsigned int num_points) ++ { ++- float* bPtr = bVector; ++- const float* aPtr = aVector; ++- ++- unsigned int number = 0; ++- unsigned int eighthPoints = num_points / 8; ++- unsigned int i = 0; ++- ++- __m256 aVal, s, r, m4pi, pio4A, pio4B, pio4C, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, fzeroes; ++- __m256 sine, cosine; ++- __m256i q, ones, twos, fours; ++- ++- m4pi = _mm256_set1_ps(1.273239544735162542821171882678754627704620361328125); ++- pio4A = _mm256_set1_ps(0.7853981554508209228515625); ++- pio4B = _mm256_set1_ps(0.794662735614792836713604629039764404296875e-8); ++- pio4C = _mm256_set1_ps(0.306161699786838294306516483068750264552437361480769e-16); ++- ffours = _mm256_set1_ps(4.0); ++- ftwos = _mm256_set1_ps(2.0); ++- fones = _mm256_set1_ps(1.0); ++- fzeroes = _mm256_setzero_ps(); ++- __m256i zeroes = _mm256_set1_epi32(0); ++- ones = _mm256_set1_epi32(1); ++- __m256i allones = _mm256_set1_epi32(0xffffffff); ++- twos = _mm256_set1_epi32(2); ++- fours = _mm256_set1_epi32(4); ++- ++- cp1 = _mm256_set1_ps(1.0); ++- cp2 = _mm256_set1_ps(0.08333333333333333); ++- cp3 = _mm256_set1_ps(0.002777777777777778); ++- cp4 = _mm256_set1_ps(4.96031746031746e-05); ++- cp5 = _mm256_set1_ps(5.511463844797178e-07); ++- union bit256 condition1; ++- union bit256 condition3; ++- ++- for(;number < eighthPoints; number++){ ++- ++- aVal = _mm256_loadu_ps(aPtr); ++- // s = fabs(aVal) ++- s = _mm256_sub_ps(aVal, _mm256_and_ps(_mm256_mul_ps(aVal, ftwos), _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS))); ++- // q = (int) (s * (4/pi)), floor(aVal / (pi/4)) ++- q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi))); ++- // r = q + q&1, q indicates quadrant, r gives ++- r = _mm256_cvtepi32_ps(_mm256_add_epi32(q, _mm256_and_si256(q, ones))); ++- ++- s = _mm256_fnmadd_ps(r,pio4A,s); ++- s = _mm256_fnmadd_ps(r,pio4B,s); ++- s = _mm256_fnmadd_ps(r,pio4C,s); ++- ++- s = _mm256_div_ps(s, 
_mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction ++- s = _mm256_mul_ps(s, s); ++- // Evaluate Taylor series ++- s = _mm256_mul_ps(_mm256_fmadd_ps(_mm256_fmsub_ps(_mm256_fmadd_ps(_mm256_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2), s, cp1), s); ++- ++- for(i = 0; i < 3; i++) ++- s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s)); ++- s = _mm256_div_ps(s, ftwos); ++- ++- sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s)); ++- cosine = _mm256_sub_ps(fones, s); ++- ++- // if(((q+1)&2) != 0) { cosine=sine;} ++- condition1.int_vec = _mm256_cmpeq_epi32(_mm256_and_si256(_mm256_add_epi32(q, ones), twos), zeroes); ++- condition1.int_vec = _mm256_xor_si256(allones, condition1.int_vec); ++- ++- // if(((q+2)&4) != 0) { cosine = -cosine;} ++- condition3.int_vec = _mm256_cmpeq_epi32(_mm256_and_si256(_mm256_add_epi32(q, twos), fours), zeroes); ++- condition3.int_vec = _mm256_xor_si256(allones, condition3.int_vec); ++- ++- cosine = _mm256_add_ps(cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1.float_vec)); ++- cosine = _mm256_sub_ps(cosine, _mm256_and_ps(_mm256_mul_ps(cosine, _mm256_set1_ps(2.0f)), condition3.float_vec)); ++- _mm256_storeu_ps(bPtr, cosine); ++- aPtr += 8; ++- bPtr += 8; ++- } ++- ++- number = eighthPoints * 8; ++- for(;number < num_points; number++){ ++- *bPtr++ = cos(*aPtr++); ++- } +++ float* bPtr = bVector; +++ const float* aPtr = aVector; +++ +++ unsigned int number = 0; +++ unsigned int eighthPoints = num_points / 8; +++ unsigned int i = 0; +++ +++ __m256 aVal, s, r, m4pi, pio4A, pio4B, pio4C, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, +++ fones, fzeroes; +++ __m256 sine, cosine; +++ __m256i q, ones, twos, fours; +++ +++ m4pi = _mm256_set1_ps(1.273239544735162542821171882678754627704620361328125); +++ pio4A = _mm256_set1_ps(0.7853981554508209228515625); +++ pio4B = _mm256_set1_ps(0.794662735614792836713604629039764404296875e-8); +++ pio4C = _mm256_set1_ps(0.306161699786838294306516483068750264552437361480769e-16); +++ ffours = _mm256_set1_ps(4.0); +++ ftwos = _mm256_set1_ps(2.0); +++ fones = _mm256_set1_ps(1.0); +++ fzeroes = _mm256_setzero_ps(); +++ __m256i zeroes = _mm256_set1_epi32(0); +++ ones = _mm256_set1_epi32(1); +++ __m256i allones = _mm256_set1_epi32(0xffffffff); +++ twos = _mm256_set1_epi32(2); +++ fours = _mm256_set1_epi32(4); +++ +++ cp1 = _mm256_set1_ps(1.0); +++ cp2 = _mm256_set1_ps(0.08333333333333333); +++ cp3 = _mm256_set1_ps(0.002777777777777778); +++ cp4 = _mm256_set1_ps(4.96031746031746e-05); +++ cp5 = _mm256_set1_ps(5.511463844797178e-07); +++ union bit256 condition1; +++ union bit256 condition3; +++ +++ for (; number < eighthPoints; number++) { +++ +++ aVal = _mm256_loadu_ps(aPtr); +++ // s = fabs(aVal) +++ s = _mm256_sub_ps(aVal, +++ _mm256_and_ps(_mm256_mul_ps(aVal, ftwos), +++ _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS))); +++ // q = (int) (s * (4/pi)), floor(aVal / (pi/4)) +++ q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi))); +++ // r = q + q&1, q indicates quadrant, r gives +++ r = _mm256_cvtepi32_ps(_mm256_add_epi32(q, _mm256_and_si256(q, ones))); +++ +++ s = _mm256_fnmadd_ps(r, pio4A, s); +++ s = _mm256_fnmadd_ps(r, pio4B, s); +++ s = _mm256_fnmadd_ps(r, pio4C, s); +++ +++ s = _mm256_div_ps( +++ s, +++ _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction +++ s = _mm256_mul_ps(s, s); +++ // Evaluate Taylor series +++ s = _mm256_mul_ps( +++ _mm256_fmadd_ps( +++ _mm256_fmsub_ps( +++ _mm256_fmadd_ps(_mm256_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2), +++ s, +++ cp1), +++ s); +++ +++ 
for (i = 0; i < 3; i++) +++ s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s)); +++ s = _mm256_div_ps(s, ftwos); +++ +++ sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s)); +++ cosine = _mm256_sub_ps(fones, s); +++ +++ // if(((q+1)&2) != 0) { cosine=sine;} +++ condition1.int_vec = +++ _mm256_cmpeq_epi32(_mm256_and_si256(_mm256_add_epi32(q, ones), twos), zeroes); +++ condition1.int_vec = _mm256_xor_si256(allones, condition1.int_vec); +++ +++ // if(((q+2)&4) != 0) { cosine = -cosine;} +++ condition3.int_vec = _mm256_cmpeq_epi32( +++ _mm256_and_si256(_mm256_add_epi32(q, twos), fours), zeroes); +++ condition3.int_vec = _mm256_xor_si256(allones, condition3.int_vec); +++ +++ cosine = _mm256_add_ps( +++ cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1.float_vec)); +++ cosine = _mm256_sub_ps(cosine, +++ _mm256_and_ps(_mm256_mul_ps(cosine, _mm256_set1_ps(2.0f)), +++ condition3.float_vec)); +++ _mm256_storeu_ps(bPtr, cosine); +++ aPtr += 8; +++ bPtr += 8; +++ } +++ +++ number = eighthPoints * 8; +++ for (; number < num_points; number++) { +++ *bPtr++ = cos(*aPtr++); +++ } ++ } ++ ++ #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for unaligned */ ++@@ -439,86 +512,109 @@ static inline void ++ #include ++ ++ static inline void ++- volk_32f_cos_32f_u_avx2(float* bVector, const float* aVector, unsigned int num_points) +++volk_32f_cos_32f_u_avx2(float* bVector, const float* aVector, unsigned int num_points) ++ { ++- float* bPtr = bVector; ++- const float* aPtr = aVector; ++- ++- unsigned int number = 0; ++- unsigned int eighthPoints = num_points / 8; ++- unsigned int i = 0; ++- ++- __m256 aVal, s, r, m4pi, pio4A, pio4B, pio4C, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, fzeroes; ++- __m256 sine, cosine; ++- __m256i q, ones, twos, fours; ++- ++- m4pi = _mm256_set1_ps(1.273239544735162542821171882678754627704620361328125); ++- pio4A = _mm256_set1_ps(0.7853981554508209228515625); ++- pio4B = _mm256_set1_ps(0.794662735614792836713604629039764404296875e-8); ++- pio4C = _mm256_set1_ps(0.306161699786838294306516483068750264552437361480769e-16); ++- ffours = _mm256_set1_ps(4.0); ++- ftwos = _mm256_set1_ps(2.0); ++- fones = _mm256_set1_ps(1.0); ++- fzeroes = _mm256_setzero_ps(); ++- __m256i zeroes = _mm256_set1_epi32(0); ++- ones = _mm256_set1_epi32(1); ++- __m256i allones = _mm256_set1_epi32(0xffffffff); ++- twos = _mm256_set1_epi32(2); ++- fours = _mm256_set1_epi32(4); ++- ++- cp1 = _mm256_set1_ps(1.0); ++- cp2 = _mm256_set1_ps(0.08333333333333333); ++- cp3 = _mm256_set1_ps(0.002777777777777778); ++- cp4 = _mm256_set1_ps(4.96031746031746e-05); ++- cp5 = _mm256_set1_ps(5.511463844797178e-07); ++- union bit256 condition1; ++- union bit256 condition3; ++- ++- for(;number < eighthPoints; number++){ ++- ++- aVal = _mm256_loadu_ps(aPtr); ++- // s = fabs(aVal) ++- s = _mm256_sub_ps(aVal, _mm256_and_ps(_mm256_mul_ps(aVal, ftwos), _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS))); ++- // q = (int) (s * (4/pi)), floor(aVal / (pi/4)) ++- q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi))); ++- // r = q + q&1, q indicates quadrant, r gives ++- r = _mm256_cvtepi32_ps(_mm256_add_epi32(q, _mm256_and_si256(q, ones))); ++- ++- s = _mm256_sub_ps(s, _mm256_mul_ps(r,pio4A)); ++- s = _mm256_sub_ps(s, _mm256_mul_ps(r,pio4B)); ++- s = _mm256_sub_ps(s, _mm256_mul_ps(r,pio4C)); ++- ++- s = _mm256_div_ps(s, _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction ++- s = _mm256_mul_ps(s, s); ++- // Evaluate Taylor series ++- s = 
_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(s, cp5), cp4), s), cp3), s), cp2), s), cp1), s); ++- ++- for(i = 0; i < 3; i++) ++- s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s)); ++- s = _mm256_div_ps(s, ftwos); ++- ++- sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s)); ++- cosine = _mm256_sub_ps(fones, s); ++- ++- // if(((q+1)&2) != 0) { cosine=sine;} ++- condition1.int_vec = _mm256_cmpeq_epi32(_mm256_and_si256(_mm256_add_epi32(q, ones), twos), zeroes); ++- condition1.int_vec = _mm256_xor_si256(allones, condition1.int_vec); ++- ++- // if(((q+2)&4) != 0) { cosine = -cosine;} ++- condition3.int_vec = _mm256_cmpeq_epi32(_mm256_and_si256(_mm256_add_epi32(q, twos), fours), zeroes); ++- condition3.int_vec = _mm256_xor_si256(allones, condition3.int_vec); ++- ++- cosine = _mm256_add_ps(cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1.float_vec)); ++- cosine = _mm256_sub_ps(cosine, _mm256_and_ps(_mm256_mul_ps(cosine, _mm256_set1_ps(2.0f)), condition3.float_vec)); ++- _mm256_storeu_ps(bPtr, cosine); ++- aPtr += 8; ++- bPtr += 8; ++- } ++- ++- number = eighthPoints * 8; ++- for(;number < num_points; number++){ ++- *bPtr++ = cos(*aPtr++); ++- } +++ float* bPtr = bVector; +++ const float* aPtr = aVector; +++ +++ unsigned int number = 0; +++ unsigned int eighthPoints = num_points / 8; +++ unsigned int i = 0; +++ +++ __m256 aVal, s, r, m4pi, pio4A, pio4B, pio4C, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, +++ fones, fzeroes; +++ __m256 sine, cosine; +++ __m256i q, ones, twos, fours; +++ +++ m4pi = _mm256_set1_ps(1.273239544735162542821171882678754627704620361328125); +++ pio4A = _mm256_set1_ps(0.7853981554508209228515625); +++ pio4B = _mm256_set1_ps(0.794662735614792836713604629039764404296875e-8); +++ pio4C = _mm256_set1_ps(0.306161699786838294306516483068750264552437361480769e-16); +++ ffours = _mm256_set1_ps(4.0); +++ ftwos = _mm256_set1_ps(2.0); +++ fones = _mm256_set1_ps(1.0); +++ fzeroes = _mm256_setzero_ps(); +++ __m256i zeroes = _mm256_set1_epi32(0); +++ ones = _mm256_set1_epi32(1); +++ __m256i allones = _mm256_set1_epi32(0xffffffff); +++ twos = _mm256_set1_epi32(2); +++ fours = _mm256_set1_epi32(4); +++ +++ cp1 = _mm256_set1_ps(1.0); +++ cp2 = _mm256_set1_ps(0.08333333333333333); +++ cp3 = _mm256_set1_ps(0.002777777777777778); +++ cp4 = _mm256_set1_ps(4.96031746031746e-05); +++ cp5 = _mm256_set1_ps(5.511463844797178e-07); +++ union bit256 condition1; +++ union bit256 condition3; +++ +++ for (; number < eighthPoints; number++) { +++ +++ aVal = _mm256_loadu_ps(aPtr); +++ // s = fabs(aVal) +++ s = _mm256_sub_ps(aVal, +++ _mm256_and_ps(_mm256_mul_ps(aVal, ftwos), +++ _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS))); +++ // q = (int) (s * (4/pi)), floor(aVal / (pi/4)) +++ q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi))); +++ // r = q + q&1, q indicates quadrant, r gives +++ r = _mm256_cvtepi32_ps(_mm256_add_epi32(q, _mm256_and_si256(q, ones))); +++ +++ s = _mm256_sub_ps(s, _mm256_mul_ps(r, pio4A)); +++ s = _mm256_sub_ps(s, _mm256_mul_ps(r, pio4B)); +++ s = _mm256_sub_ps(s, _mm256_mul_ps(r, pio4C)); +++ +++ s = _mm256_div_ps( +++ s, +++ _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction +++ s = _mm256_mul_ps(s, s); +++ // Evaluate Taylor series +++ s = _mm256_mul_ps( +++ _mm256_add_ps( +++ _mm256_mul_ps( +++ _mm256_sub_ps( +++ _mm256_mul_ps( +++ _mm256_add_ps( +++ _mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(s, cp5), cp4), +++ s), +++ cp3), +++ s), +++ cp2), +++ s), 
+++ cp1), +++ s); +++ +++ for (i = 0; i < 3; i++) +++ s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s)); +++ s = _mm256_div_ps(s, ftwos); +++ +++ sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s)); +++ cosine = _mm256_sub_ps(fones, s); +++ +++ // if(((q+1)&2) != 0) { cosine=sine;} +++ condition1.int_vec = +++ _mm256_cmpeq_epi32(_mm256_and_si256(_mm256_add_epi32(q, ones), twos), zeroes); +++ condition1.int_vec = _mm256_xor_si256(allones, condition1.int_vec); +++ +++ // if(((q+2)&4) != 0) { cosine = -cosine;} +++ condition3.int_vec = _mm256_cmpeq_epi32( +++ _mm256_and_si256(_mm256_add_epi32(q, twos), fours), zeroes); +++ condition3.int_vec = _mm256_xor_si256(allones, condition3.int_vec); +++ +++ cosine = _mm256_add_ps( +++ cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1.float_vec)); +++ cosine = _mm256_sub_ps(cosine, +++ _mm256_and_ps(_mm256_mul_ps(cosine, _mm256_set1_ps(2.0f)), +++ condition3.float_vec)); +++ _mm256_storeu_ps(bPtr, cosine); +++ aPtr += 8; +++ bPtr += 8; +++ } +++ +++ number = eighthPoints * 8; +++ for (; number < num_points; number++) { +++ *bPtr++ = cos(*aPtr++); +++ } ++ } ++ ++ #endif /* LV_HAVE_AVX2 for unaligned */ ++@@ -529,71 +625,88 @@ static inline void ++ static inline void ++ volk_32f_cos_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int num_points) ++ { ++- float* bPtr = bVector; ++- const float* aPtr = aVector; ++- ++- unsigned int number = 0; ++- unsigned int quarterPoints = num_points / 4; ++- unsigned int i = 0; ++- ++- __m128 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, fzeroes; ++- __m128 sine, cosine, condition1, condition3; ++- __m128i q, r, ones, twos, fours; ++- ++- m4pi = _mm_set1_ps(1.273239545); ++- pio4A = _mm_set1_ps(0.78515625); ++- pio4B = _mm_set1_ps(0.241876e-3); ++- ffours = _mm_set1_ps(4.0); ++- ftwos = _mm_set1_ps(2.0); ++- fones = _mm_set1_ps(1.0); ++- fzeroes = _mm_setzero_ps(); ++- ones = _mm_set1_epi32(1); ++- twos = _mm_set1_epi32(2); ++- fours = _mm_set1_epi32(4); ++- ++- cp1 = _mm_set1_ps(1.0); ++- cp2 = _mm_set1_ps(0.83333333e-1); ++- cp3 = _mm_set1_ps(0.2777778e-2); ++- cp4 = _mm_set1_ps(0.49603e-4); ++- cp5 = _mm_set1_ps(0.551e-6); ++- ++- for(;number < quarterPoints; number++){ ++- aVal = _mm_loadu_ps(aPtr); ++- s = _mm_sub_ps(aVal, _mm_and_ps(_mm_mul_ps(aVal, ftwos), _mm_cmplt_ps(aVal, fzeroes))); ++- q = _mm_cvtps_epi32(_mm_floor_ps(_mm_mul_ps(s, m4pi))); ++- r = _mm_add_epi32(q, _mm_and_si128(q, ones)); ++- ++- s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4A)); ++- s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4B)); ++- ++- s = _mm_div_ps(s, _mm_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction ++- s = _mm_mul_ps(s, s); ++- // Evaluate Taylor series ++- s = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s, cp5), cp4), s), cp3), s), cp2), s), cp1), s); ++- ++- for(i = 0; i < 3; i++){ ++- s = _mm_mul_ps(s, _mm_sub_ps(ffours, s)); ++- } ++- s = _mm_div_ps(s, ftwos); ++- ++- sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s)); ++- cosine = _mm_sub_ps(fones, s); ++- ++- condition1 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, ones), twos)), fzeroes); +++ float* bPtr = bVector; +++ const float* aPtr = aVector; ++ ++- condition3 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, twos), fours)), fzeroes); ++- ++- cosine = _mm_add_ps(cosine, _mm_and_ps(_mm_sub_ps(sine, cosine), condition1)); ++- cosine = _mm_sub_ps(cosine, 
_mm_and_ps(_mm_mul_ps(cosine, _mm_set1_ps(2.0f)), condition3)); ++- _mm_storeu_ps(bPtr, cosine); ++- aPtr += 4; ++- bPtr += 4; ++- } +++ unsigned int number = 0; +++ unsigned int quarterPoints = num_points / 4; +++ unsigned int i = 0; +++ +++ __m128 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, +++ fzeroes; +++ __m128 sine, cosine, condition1, condition3; +++ __m128i q, r, ones, twos, fours; +++ +++ m4pi = _mm_set1_ps(1.273239545); +++ pio4A = _mm_set1_ps(0.78515625); +++ pio4B = _mm_set1_ps(0.241876e-3); +++ ffours = _mm_set1_ps(4.0); +++ ftwos = _mm_set1_ps(2.0); +++ fones = _mm_set1_ps(1.0); +++ fzeroes = _mm_setzero_ps(); +++ ones = _mm_set1_epi32(1); +++ twos = _mm_set1_epi32(2); +++ fours = _mm_set1_epi32(4); +++ +++ cp1 = _mm_set1_ps(1.0); +++ cp2 = _mm_set1_ps(0.83333333e-1); +++ cp3 = _mm_set1_ps(0.2777778e-2); +++ cp4 = _mm_set1_ps(0.49603e-4); +++ cp5 = _mm_set1_ps(0.551e-6); +++ +++ for (; number < quarterPoints; number++) { +++ aVal = _mm_loadu_ps(aPtr); +++ s = _mm_sub_ps(aVal, +++ _mm_and_ps(_mm_mul_ps(aVal, ftwos), _mm_cmplt_ps(aVal, fzeroes))); +++ q = _mm_cvtps_epi32(_mm_floor_ps(_mm_mul_ps(s, m4pi))); +++ r = _mm_add_epi32(q, _mm_and_si128(q, ones)); +++ +++ s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4A)); +++ s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4B)); +++ +++ s = _mm_div_ps( +++ s, _mm_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction +++ s = _mm_mul_ps(s, s); +++ // Evaluate Taylor series +++ s = _mm_mul_ps( +++ _mm_add_ps( +++ _mm_mul_ps( +++ _mm_sub_ps( +++ _mm_mul_ps( +++ _mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s, cp5), cp4), s), +++ cp3), +++ s), +++ cp2), +++ s), +++ cp1), +++ s); +++ +++ for (i = 0; i < 3; i++) { +++ s = _mm_mul_ps(s, _mm_sub_ps(ffours, s)); +++ } +++ s = _mm_div_ps(s, ftwos); +++ +++ sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s)); +++ cosine = _mm_sub_ps(fones, s); +++ +++ condition1 = _mm_cmpneq_ps( +++ _mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, ones), twos)), fzeroes); +++ +++ condition3 = _mm_cmpneq_ps( +++ _mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, twos), fours)), fzeroes); +++ +++ cosine = _mm_add_ps(cosine, _mm_and_ps(_mm_sub_ps(sine, cosine), condition1)); +++ cosine = _mm_sub_ps( +++ cosine, _mm_and_ps(_mm_mul_ps(cosine, _mm_set1_ps(2.0f)), condition3)); +++ _mm_storeu_ps(bPtr, cosine); +++ aPtr += 4; +++ bPtr += 4; +++ } ++ ++- number = quarterPoints * 4; ++- for(;number < num_points; number++){ ++- *bPtr++ = cosf(*aPtr++); ++- } +++ number = quarterPoints * 4; +++ for (; number < num_points; number++) { +++ *bPtr++ = cosf(*aPtr++); +++ } ++ } ++ ++ #endif /* LV_HAVE_SSE4_1 for unaligned */ ++@@ -606,52 +719,55 @@ volk_32f_cos_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int num ++ * Shibata, Naoki, "Efficient evaluation methods of elementary functions ++ * suitable for SIMD computation," in Springer-Verlag 2010 ++ */ ++-static inline void ++-volk_32f_cos_32f_generic_fast(float* bVector, const float* aVector, unsigned int num_points) +++static inline void volk_32f_cos_32f_generic_fast(float* bVector, +++ const float* aVector, +++ unsigned int num_points) ++ { ++- float* bPtr = bVector; ++- const float* aPtr = aVector; ++- ++- float m4pi = 1.273239544735162542821171882678754627704620361328125; ++- float pio4A = 0.7853981554508209228515625; ++- float pio4B = 0.794662735614792836713604629039764404296875e-8; ++- float pio4C = 0.306161699786838294306516483068750264552437361480769e-16; ++- int N = 3; // order of argument reduction ++- 
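Note for readers of this hunk: the old and new bodies above, and the generic_fast kernel that follows, all implement the same scheme — Cody-Waite style reduction of the argument against pi/4 split into three floats (pio4A/B/C), a short Taylor series on the reduced angle scaled down by 2^N (N = 3, hence the divide by 8), then N doubling steps. The identities involved, reconstructed from the code in this hunk (the notation is mine, not upstream's):

\[
\begin{aligned}
s &= |x|, \qquad q = \lfloor 4s/\pi \rfloor, \qquad r = q + (q \,\&\, 1), \qquad \theta = s - r\,\tfrac{\pi}{4},\\
u_0 &= 2\bigl(1 - \cos(\theta/2^{N})\bigr) \approx t - \tfrac{t^{2}}{12} + \tfrac{t^{3}}{360} - \tfrac{t^{4}}{20160} + \tfrac{t^{5}}{1814400}, \qquad t = (\theta/2^{N})^{2},\\
u_{k+1} &= (4 - u_{k})\,u_{k} \qquad \text{because } 2(1-\cos 2\varphi) = \bigl(4 - 2(1-\cos\varphi)\bigr)\,2(1-\cos\varphi),\\
\cos\theta &= 1 - \tfrac{u_{N}}{2}, \qquad |\sin\theta| = \sqrt{\bigl(2 - \tfrac{u_{N}}{2}\bigr)\,\tfrac{u_{N}}{2}},
\end{aligned}
\]

after which the low bits of q select the quadrant: ((q+1) & 2) swaps sine and cosine, and ((q+2) & 4) flips the sign, exactly as the code comments in these kernels state.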
++- unsigned int number; ++- for(number = 0; number < num_points; number++){ ++- float s = fabs(*aPtr); ++- int q = (int)(s * m4pi); ++- int r = q + (q&1); ++- s -= r * pio4A; ++- s -= r * pio4B; ++- s -= r * pio4C; ++- ++- s = s * 0.125; // 2^-N (<--3) ++- s = s*s; ++- s = ((((s/1814400. - 1.0/20160.0)*s + 1.0/360.0)*s - 1.0/12.0)*s + 1.0)*s; ++- ++- int i; ++- for(i=0; i < N; ++i) { ++- s = (4.0-s)*s; ++- } ++- s = s/2.0; ++- ++- float sine = sqrt((2.0-s)*s); ++- float cosine = 1-s; ++- ++- if (((q+1) & 2) != 0) { ++- s = cosine; ++- cosine = sine; ++- sine = s; ++- } ++- if (((q+2) & 4) != 0) { ++- cosine = -cosine; ++- } ++- *bPtr = cosine; ++- bPtr++; ++- aPtr++; ++- } +++ float* bPtr = bVector; +++ const float* aPtr = aVector; +++ +++ float m4pi = 1.273239544735162542821171882678754627704620361328125; +++ float pio4A = 0.7853981554508209228515625; +++ float pio4B = 0.794662735614792836713604629039764404296875e-8; +++ float pio4C = 0.306161699786838294306516483068750264552437361480769e-16; +++ int N = 3; // order of argument reduction +++ +++ unsigned int number; +++ for (number = 0; number < num_points; number++) { +++ float s = fabs(*aPtr); +++ int q = (int)(s * m4pi); +++ int r = q + (q & 1); +++ s -= r * pio4A; +++ s -= r * pio4B; +++ s -= r * pio4C; +++ +++ s = s * 0.125; // 2^-N (<--3) +++ s = s * s; +++ s = ((((s / 1814400. - 1.0 / 20160.0) * s + 1.0 / 360.0) * s - 1.0 / 12.0) * s + +++ 1.0) * +++ s; +++ +++ int i; +++ for (i = 0; i < N; ++i) { +++ s = (4.0 - s) * s; +++ } +++ s = s / 2.0; +++ +++ float sine = sqrt((2.0 - s) * s); +++ float cosine = 1 - s; +++ +++ if (((q + 1) & 2) != 0) { +++ s = cosine; +++ cosine = sine; +++ sine = s; +++ } +++ if (((q + 2) & 4) != 0) { +++ cosine = -cosine; +++ } +++ *bPtr = cosine; +++ bPtr++; +++ aPtr++; +++ } ++ } ++ ++ #endif /* LV_HAVE_GENERIC */ ++@@ -662,13 +778,13 @@ volk_32f_cos_32f_generic_fast(float* bVector, const float* aVector, unsigned int ++ static inline void ++ volk_32f_cos_32f_generic(float* bVector, const float* aVector, unsigned int num_points) ++ { ++- float* bPtr = bVector; ++- const float* aPtr = aVector; ++- unsigned int number = 0; +++ float* bPtr = bVector; +++ const float* aPtr = aVector; +++ unsigned int number = 0; ++ ++- for(; number < num_points; number++){ ++- *bPtr++ = cosf(*aPtr++); ++- } +++ for (; number < num_points; number++) { +++ *bPtr++ = cosf(*aPtr++); +++ } ++ } ++ ++ #endif /* LV_HAVE_GENERIC */ ++@@ -679,30 +795,29 @@ volk_32f_cos_32f_generic(float* bVector, const float* aVector, unsigned int num_ ++ #include ++ ++ static inline void ++-volk_32f_cos_32f_neon(float* bVector, const float* aVector, ++- unsigned int num_points) +++volk_32f_cos_32f_neon(float* bVector, const float* aVector, unsigned int num_points) ++ { ++ unsigned int number = 0; ++ unsigned int quarter_points = num_points / 4; ++ float* bVectorPtr = bVector; ++ const float* aVectorPtr = aVector; ++- +++ ++ float32x4_t b_vec; ++ float32x4_t a_vec; ++- ++- for(number = 0; number < quarter_points; number++) { +++ +++ for (number = 0; number < quarter_points; number++) { ++ a_vec = vld1q_f32(aVectorPtr); ++ // Prefetch next one, speeds things up ++- __VOLK_PREFETCH(aVectorPtr+4); +++ __VOLK_PREFETCH(aVectorPtr + 4); ++ b_vec = _vcosq_f32(a_vec); ++ vst1q_f32(bVectorPtr, b_vec); ++ // move pointers ahead ++- bVectorPtr+=4; ++- aVectorPtr+=4; +++ bVectorPtr += 4; +++ aVectorPtr += 4; ++ } ++- +++ ++ // Deal with the rest ++- for(number = quarter_points * 4; number < num_points; number++) { +++ for (number = quarter_points * 4; number < 
num_points; number++) { ++ *bVectorPtr++ = cosf(*aVectorPtr++); ++ } ++ } ++diff --git a/kernels/volk/volk_32f_expfast_32f.h b/kernels/volk/volk_32f_expfast_32f.h ++index ecb4914..45de3f9 100644 ++--- a/kernels/volk/volk_32f_expfast_32f.h +++++ b/kernels/volk/volk_32f_expfast_32f.h ++@@ -30,8 +30,8 @@ ++ * ++ * Dispatcher Prototype ++ * \code ++- * void volk_32f_expfast_32f(float* bVector, const float* aVector, unsigned int num_points) ++- * \endcode +++ * void volk_32f_expfast_32f(float* bVector, const float* aVector, unsigned int +++ * num_points) \endcode ++ * ++ * \b Inputs ++ * \li aVector: Input vector of floats. ++@@ -62,9 +62,9 @@ ++ * \endcode ++ */ ++ ++-#include ++-#include ++ #include +++#include +++#include ++ ++ #define Mln2 0.6931471805f ++ #define A 8388608.0f ++@@ -79,34 +79,35 @@ ++ ++ #include ++ ++-static inline void ++- volk_32f_expfast_32f_a_avx_fma(float* bVector, const float* aVector, unsigned int num_points) +++static inline void volk_32f_expfast_32f_a_avx_fma(float* bVector, +++ const float* aVector, +++ unsigned int num_points) ++ { ++- float* bPtr = bVector; ++- const float* aPtr = aVector; ++- ++- unsigned int number = 0; ++- const unsigned int eighthPoints = num_points / 8; ++- ++- __m256 aVal, bVal, a, b; ++- __m256i exp; ++- a = _mm256_set1_ps(A/Mln2); ++- b = _mm256_set1_ps(B-C); ++- ++- for(;number < eighthPoints; number++){ ++- aVal = _mm256_load_ps(aPtr); ++- exp = _mm256_cvtps_epi32(_mm256_fmadd_ps(a,aVal, b)); ++- bVal = _mm256_castsi256_ps(exp); ++- ++- _mm256_store_ps(bPtr, bVal); ++- aPtr += 8; ++- bPtr += 8; ++- } ++- ++- number = eighthPoints * 8; ++- for(;number < num_points; number++){ ++- *bPtr++ = expf(*aPtr++); ++- } +++ float* bPtr = bVector; +++ const float* aPtr = aVector; +++ +++ unsigned int number = 0; +++ const unsigned int eighthPoints = num_points / 8; +++ +++ __m256 aVal, bVal, a, b; +++ __m256i exp; +++ a = _mm256_set1_ps(A / Mln2); +++ b = _mm256_set1_ps(B - C); +++ +++ for (; number < eighthPoints; number++) { +++ aVal = _mm256_load_ps(aPtr); +++ exp = _mm256_cvtps_epi32(_mm256_fmadd_ps(a, aVal, b)); +++ bVal = _mm256_castsi256_ps(exp); +++ +++ _mm256_store_ps(bPtr, bVal); +++ aPtr += 8; +++ bPtr += 8; +++ } +++ +++ number = eighthPoints * 8; +++ for (; number < num_points; number++) { +++ *bPtr++ = expf(*aPtr++); +++ } ++ } ++ ++ #endif /* LV_HAVE_AVX && LV_HAVE_FMA for aligned */ ++@@ -116,33 +117,33 @@ static inline void ++ #include ++ ++ static inline void ++- volk_32f_expfast_32f_a_avx(float* bVector, const float* aVector, unsigned int num_points) +++volk_32f_expfast_32f_a_avx(float* bVector, const float* aVector, unsigned int num_points) ++ { ++- float* bPtr = bVector; ++- const float* aPtr = aVector; ++- ++- unsigned int number = 0; ++- const unsigned int eighthPoints = num_points / 8; ++- ++- __m256 aVal, bVal, a, b; ++- __m256i exp; ++- a = _mm256_set1_ps(A/Mln2); ++- b = _mm256_set1_ps(B-C); ++- ++- for(;number < eighthPoints; number++){ ++- aVal = _mm256_load_ps(aPtr); ++- exp = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_mul_ps(a,aVal), b)); ++- bVal = _mm256_castsi256_ps(exp); ++- ++- _mm256_store_ps(bPtr, bVal); ++- aPtr += 8; ++- bPtr += 8; ++- } ++- ++- number = eighthPoints * 8; ++- for(;number < num_points; number++){ ++- *bPtr++ = expf(*aPtr++); ++- } +++ float* bPtr = bVector; +++ const float* aPtr = aVector; +++ +++ unsigned int number = 0; +++ const unsigned int eighthPoints = num_points / 8; +++ +++ __m256 aVal, bVal, a, b; +++ __m256i exp; +++ a = _mm256_set1_ps(A / Mln2); +++ b = _mm256_set1_ps(B - C); +++ 
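Note for readers: every expfast variant in this file avoids expf() by synthesizing the IEEE-754 bit pattern of the result. Since exp(x) = 2^(x / ln 2) and a float 2^w is stored with a bit pattern of roughly (w + 127) * 2^23, computing (int)(x * A / Mln2 + (B - C)) and reinterpreting the integer as a float approximates exp(x) (a Schraudolph-style construction). A minimal scalar sketch, assuming only the A and Mln2 values #defined earlier in this header and folding the exponent bias into one constant — it omits the tuned correction C, so it is rougher than these kernels and is an illustration, not upstream code:

#include <stdint.h>
#include <string.h>

static inline float expfast_scalar_sketch(float x)
{
    const float scale = 8388608.0f / 0.6931471805f; /* A / Mln2 = 2^23 / ln 2 */
    const float bias  = 127.0f * 8388608.0f;        /* IEEE-754 exponent bias, shifted by 23 bits */
    /* Build the integer whose float interpretation is approximately exp(x). */
    int32_t i = (int32_t)(scale * x + bias);
    float y;
    memcpy(&y, &i, sizeof(y)); /* type-pun the bits without breaking aliasing rules */
    return y;                  /* usable roughly for |x| < 87; larger inputs overflow the exponent */
}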
+++ for (; number < eighthPoints; number++) { +++ aVal = _mm256_load_ps(aPtr); +++ exp = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_mul_ps(a, aVal), b)); +++ bVal = _mm256_castsi256_ps(exp); +++ +++ _mm256_store_ps(bPtr, bVal); +++ aPtr += 8; +++ bPtr += 8; +++ } +++ +++ number = eighthPoints * 8; +++ for (; number < num_points; number++) { +++ *bPtr++ = expf(*aPtr++); +++ } ++ } ++ ++ #endif /* LV_HAVE_AVX for aligned */ ++@@ -150,34 +151,35 @@ static inline void ++ #ifdef LV_HAVE_SSE4_1 ++ #include ++ ++-static inline void ++-volk_32f_expfast_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int num_points) +++static inline void volk_32f_expfast_32f_a_sse4_1(float* bVector, +++ const float* aVector, +++ unsigned int num_points) ++ { ++- float* bPtr = bVector; ++- const float* aPtr = aVector; ++- ++- unsigned int number = 0; ++- const unsigned int quarterPoints = num_points / 4; ++- ++- __m128 aVal, bVal, a, b; ++- __m128i exp; ++- a = _mm_set1_ps(A/Mln2); ++- b = _mm_set1_ps(B-C); ++- ++- for(;number < quarterPoints; number++){ ++- aVal = _mm_load_ps(aPtr); ++- exp = _mm_cvtps_epi32(_mm_add_ps(_mm_mul_ps(a,aVal), b)); ++- bVal = _mm_castsi128_ps(exp); ++- ++- _mm_store_ps(bPtr, bVal); ++- aPtr += 4; ++- bPtr += 4; ++- } ++- ++- number = quarterPoints * 4; ++- for(;number < num_points; number++){ ++- *bPtr++ = expf(*aPtr++); ++- } +++ float* bPtr = bVector; +++ const float* aPtr = aVector; +++ +++ unsigned int number = 0; +++ const unsigned int quarterPoints = num_points / 4; +++ +++ __m128 aVal, bVal, a, b; +++ __m128i exp; +++ a = _mm_set1_ps(A / Mln2); +++ b = _mm_set1_ps(B - C); +++ +++ for (; number < quarterPoints; number++) { +++ aVal = _mm_load_ps(aPtr); +++ exp = _mm_cvtps_epi32(_mm_add_ps(_mm_mul_ps(a, aVal), b)); +++ bVal = _mm_castsi128_ps(exp); +++ +++ _mm_store_ps(bPtr, bVal); +++ aPtr += 4; +++ bPtr += 4; +++ } +++ +++ number = quarterPoints * 4; +++ for (; number < num_points; number++) { +++ *bPtr++ = expf(*aPtr++); +++ } ++ } ++ ++ #endif /* LV_HAVE_SSE4_1 for aligned */ ++@@ -190,34 +192,35 @@ volk_32f_expfast_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int ++ #if LV_HAVE_AVX && LV_HAVE_FMA ++ #include ++ ++-static inline void ++-volk_32f_expfast_32f_u_avx_fma(float* bVector, const float* aVector, unsigned int num_points) +++static inline void volk_32f_expfast_32f_u_avx_fma(float* bVector, +++ const float* aVector, +++ unsigned int num_points) ++ { ++- float* bPtr = bVector; ++- const float* aPtr = aVector; ++- ++- unsigned int number = 0; ++- const unsigned int eighthPoints = num_points / 8; ++- ++- __m256 aVal, bVal, a, b; ++- __m256i exp; ++- a = _mm256_set1_ps(A/Mln2); ++- b = _mm256_set1_ps(B-C); ++- ++- for(;number < eighthPoints; number++){ ++- aVal = _mm256_loadu_ps(aPtr); ++- exp = _mm256_cvtps_epi32(_mm256_fmadd_ps(a,aVal, b)); ++- bVal = _mm256_castsi256_ps(exp); ++- ++- _mm256_storeu_ps(bPtr, bVal); ++- aPtr += 8; ++- bPtr += 8; ++- } ++- ++- number = eighthPoints * 8; ++- for(;number < num_points; number++){ ++- *bPtr++ = expf(*aPtr++); ++- } +++ float* bPtr = bVector; +++ const float* aPtr = aVector; +++ +++ unsigned int number = 0; +++ const unsigned int eighthPoints = num_points / 8; +++ +++ __m256 aVal, bVal, a, b; +++ __m256i exp; +++ a = _mm256_set1_ps(A / Mln2); +++ b = _mm256_set1_ps(B - C); +++ +++ for (; number < eighthPoints; number++) { +++ aVal = _mm256_loadu_ps(aPtr); +++ exp = _mm256_cvtps_epi32(_mm256_fmadd_ps(a, aVal, b)); +++ bVal = _mm256_castsi256_ps(exp); +++ +++ _mm256_storeu_ps(bPtr, bVal); +++ aPtr += 8; +++ 
bPtr += 8; +++ } +++ +++ number = eighthPoints * 8; +++ for (; number < num_points; number++) { +++ *bPtr++ = expf(*aPtr++); +++ } ++ } ++ ++ #endif /* LV_HAVE_AVX && LV_HAVE_FMA for unaligned */ ++@@ -228,31 +231,31 @@ volk_32f_expfast_32f_u_avx_fma(float* bVector, const float* aVector, unsigned in ++ static inline void ++ volk_32f_expfast_32f_u_avx(float* bVector, const float* aVector, unsigned int num_points) ++ { ++- float* bPtr = bVector; ++- const float* aPtr = aVector; ++- ++- unsigned int number = 0; ++- const unsigned int eighthPoints = num_points / 8; ++- ++- __m256 aVal, bVal, a, b; ++- __m256i exp; ++- a = _mm256_set1_ps(A/Mln2); ++- b = _mm256_set1_ps(B-C); ++- ++- for(;number < eighthPoints; number++){ ++- aVal = _mm256_loadu_ps(aPtr); ++- exp = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_mul_ps(a,aVal), b)); ++- bVal = _mm256_castsi256_ps(exp); ++- ++- _mm256_storeu_ps(bPtr, bVal); ++- aPtr += 8; ++- bPtr += 8; ++- } ++- ++- number = eighthPoints * 8; ++- for(;number < num_points; number++){ ++- *bPtr++ = expf(*aPtr++); ++- } +++ float* bPtr = bVector; +++ const float* aPtr = aVector; +++ +++ unsigned int number = 0; +++ const unsigned int eighthPoints = num_points / 8; +++ +++ __m256 aVal, bVal, a, b; +++ __m256i exp; +++ a = _mm256_set1_ps(A / Mln2); +++ b = _mm256_set1_ps(B - C); +++ +++ for (; number < eighthPoints; number++) { +++ aVal = _mm256_loadu_ps(aPtr); +++ exp = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_mul_ps(a, aVal), b)); +++ bVal = _mm256_castsi256_ps(exp); +++ +++ _mm256_storeu_ps(bPtr, bVal); +++ aPtr += 8; +++ bPtr += 8; +++ } +++ +++ number = eighthPoints * 8; +++ for (; number < num_points; number++) { +++ *bPtr++ = expf(*aPtr++); +++ } ++ } ++ ++ #endif /* LV_HAVE_AVX for unaligned */ ++@@ -261,34 +264,35 @@ volk_32f_expfast_32f_u_avx(float* bVector, const float* aVector, unsigned int nu ++ #ifdef LV_HAVE_SSE4_1 ++ #include ++ ++-static inline void ++-volk_32f_expfast_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int num_points) +++static inline void volk_32f_expfast_32f_u_sse4_1(float* bVector, +++ const float* aVector, +++ unsigned int num_points) ++ { ++- float* bPtr = bVector; ++- const float* aPtr = aVector; ++- ++- unsigned int number = 0; ++- const unsigned int quarterPoints = num_points / 4; ++- ++- __m128 aVal, bVal, a, b; ++- __m128i exp; ++- a = _mm_set1_ps(A/Mln2); ++- b = _mm_set1_ps(B-C); ++- ++- for(;number < quarterPoints; number++){ ++- aVal = _mm_loadu_ps(aPtr); ++- exp = _mm_cvtps_epi32(_mm_add_ps(_mm_mul_ps(a,aVal), b)); ++- bVal = _mm_castsi128_ps(exp); ++- ++- _mm_storeu_ps(bPtr, bVal); ++- aPtr += 4; ++- bPtr += 4; ++- } ++- ++- number = quarterPoints * 4; ++- for(;number < num_points; number++){ ++- *bPtr++ = expf(*aPtr++); ++- } +++ float* bPtr = bVector; +++ const float* aPtr = aVector; +++ +++ unsigned int number = 0; +++ const unsigned int quarterPoints = num_points / 4; +++ +++ __m128 aVal, bVal, a, b; +++ __m128i exp; +++ a = _mm_set1_ps(A / Mln2); +++ b = _mm_set1_ps(B - C); +++ +++ for (; number < quarterPoints; number++) { +++ aVal = _mm_loadu_ps(aPtr); +++ exp = _mm_cvtps_epi32(_mm_add_ps(_mm_mul_ps(a, aVal), b)); +++ bVal = _mm_castsi128_ps(exp); +++ +++ _mm_storeu_ps(bPtr, bVal); +++ aPtr += 4; +++ bPtr += 4; +++ } +++ +++ number = quarterPoints * 4; +++ for (; number < num_points; number++) { +++ *bPtr++ = expf(*aPtr++); +++ } ++ } ++ ++ #endif /* LV_HAVE_SSE4_1 for unaligned */ ++@@ -296,16 +300,17 @@ volk_32f_expfast_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int ++ ++ #ifdef 
LV_HAVE_GENERIC ++ ++-static inline void ++-volk_32f_expfast_32f_generic(float* bVector, const float* aVector, unsigned int num_points) +++static inline void volk_32f_expfast_32f_generic(float* bVector, +++ const float* aVector, +++ unsigned int num_points) ++ { ++- float* bPtr = bVector; ++- const float* aPtr = aVector; ++- unsigned int number = 0; +++ float* bPtr = bVector; +++ const float* aPtr = aVector; +++ unsigned int number = 0; ++ ++- for(number = 0; number < num_points; number++){ ++- *bPtr++ = expf(*aPtr++); ++- } +++ for (number = 0; number < num_points; number++) { +++ *bPtr++ = expf(*aPtr++); +++ } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++diff --git a/kernels/volk/volk_32f_index_max_16u.h b/kernels/volk/volk_32f_index_max_16u.h ++index 7ca6928..3ee10f4 100644 ++--- a/kernels/volk/volk_32f_index_max_16u.h +++++ b/kernels/volk/volk_32f_index_max_16u.h ++@@ -71,72 +71,71 @@ ++ #ifndef INCLUDED_volk_32f_index_max_16u_a_H ++ #define INCLUDED_volk_32f_index_max_16u_a_H ++ ++-#include ++-#include ++ #include ++ #include ++ #include +++#include ++ ++ #ifdef LV_HAVE_AVX ++ #include ++ ++ static inline void ++-volk_32f_index_max_16u_a_avx(uint16_t* target, const float* src0, ++- uint32_t num_points) +++volk_32f_index_max_16u_a_avx(uint16_t* target, const float* src0, uint32_t num_points) ++ { ++- num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points; ++- ++- uint32_t number = 0; ++- const uint32_t eighthPoints = num_points / 8; +++ num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points; ++ ++- float* inputPtr = (float*)src0; +++ uint32_t number = 0; +++ const uint32_t eighthPoints = num_points / 8; ++ ++- __m256 indexIncrementValues = _mm256_set1_ps(8); ++- __m256 currentIndexes = _mm256_set_ps(-1,-2,-3,-4,-5,-6,-7,-8); +++ float* inputPtr = (float*)src0; ++ ++- float max = src0[0]; ++- float index = 0; ++- __m256 maxValues = _mm256_set1_ps(max); ++- __m256 maxValuesIndex = _mm256_setzero_ps(); ++- __m256 compareResults; ++- __m256 currentValues; +++ __m256 indexIncrementValues = _mm256_set1_ps(8); +++ __m256 currentIndexes = _mm256_set_ps(-1, -2, -3, -4, -5, -6, -7, -8); ++ ++- __VOLK_ATTR_ALIGNED(32) float maxValuesBuffer[8]; ++- __VOLK_ATTR_ALIGNED(32) float maxIndexesBuffer[8]; +++ float max = src0[0]; +++ float index = 0; +++ __m256 maxValues = _mm256_set1_ps(max); +++ __m256 maxValuesIndex = _mm256_setzero_ps(); +++ __m256 compareResults; +++ __m256 currentValues; ++ ++- for(;number < eighthPoints; number++){ +++ __VOLK_ATTR_ALIGNED(32) float maxValuesBuffer[8]; +++ __VOLK_ATTR_ALIGNED(32) float maxIndexesBuffer[8]; ++ ++- currentValues = _mm256_load_ps(inputPtr); inputPtr += 8; ++- currentIndexes = _mm256_add_ps(currentIndexes, indexIncrementValues); +++ for (; number < eighthPoints; number++) { ++ ++- compareResults = _mm256_cmp_ps(currentValues, maxValues, _CMP_GT_OS); +++ currentValues = _mm256_load_ps(inputPtr); +++ inputPtr += 8; +++ currentIndexes = _mm256_add_ps(currentIndexes, indexIncrementValues); ++ ++- maxValuesIndex = _mm256_blendv_ps(maxValuesIndex, currentIndexes, compareResults); ++- maxValues = _mm256_blendv_ps(maxValues, currentValues, compareResults); ++- } +++ compareResults = _mm256_cmp_ps(currentValues, maxValues, _CMP_GT_OS); ++ ++- // Calculate the largest value from the remaining 4 points ++- _mm256_store_ps(maxValuesBuffer, maxValues); ++- _mm256_store_ps(maxIndexesBuffer, maxValuesIndex); +++ maxValuesIndex = _mm256_blendv_ps(maxValuesIndex, currentIndexes, compareResults); +++ maxValues = _mm256_blendv_ps(maxValues, currentValues, 
compareResults); +++ } ++ ++- for(number = 0; number < 8; number++){ ++- if(maxValuesBuffer[number] > max){ ++- index = maxIndexesBuffer[number]; ++- max = maxValuesBuffer[number]; ++- } else if(maxValuesBuffer[number] == max){ ++- if (index > maxIndexesBuffer[number]) ++- index = maxIndexesBuffer[number]; +++ // Calculate the largest value from the remaining 4 points +++ _mm256_store_ps(maxValuesBuffer, maxValues); +++ _mm256_store_ps(maxIndexesBuffer, maxValuesIndex); +++ +++ for (number = 0; number < 8; number++) { +++ if (maxValuesBuffer[number] > max) { +++ index = maxIndexesBuffer[number]; +++ max = maxValuesBuffer[number]; +++ } else if (maxValuesBuffer[number] == max) { +++ if (index > maxIndexesBuffer[number]) +++ index = maxIndexesBuffer[number]; +++ } ++ } ++- } ++ ++- number = eighthPoints * 8; ++- for(;number < num_points; number++){ ++- if(src0[number] > max){ ++- index = number; ++- max = src0[number]; +++ number = eighthPoints * 8; +++ for (; number < num_points; number++) { +++ if (src0[number] > max) { +++ index = number; +++ max = src0[number]; +++ } ++ } ++- } ++- target[0] = (uint16_t)index; +++ target[0] = (uint16_t)index; ++ } ++ ++ #endif /*LV_HAVE_AVX*/ ++@@ -145,62 +144,62 @@ volk_32f_index_max_16u_a_avx(uint16_t* target, const float* src0, ++ #include ++ ++ static inline void ++-volk_32f_index_max_16u_a_sse4_1(uint16_t* target, const float* src0, ++- uint32_t num_points) +++volk_32f_index_max_16u_a_sse4_1(uint16_t* target, const float* src0, uint32_t num_points) ++ { ++- num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points; ++- ++- uint32_t number = 0; ++- const uint32_t quarterPoints = num_points / 4; +++ num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points; ++ ++- float* inputPtr = (float*)src0; +++ uint32_t number = 0; +++ const uint32_t quarterPoints = num_points / 4; ++ ++- __m128 indexIncrementValues = _mm_set1_ps(4); ++- __m128 currentIndexes = _mm_set_ps(-1,-2,-3,-4); +++ float* inputPtr = (float*)src0; ++ ++- float max = src0[0]; ++- float index = 0; ++- __m128 maxValues = _mm_set1_ps(max); ++- __m128 maxValuesIndex = _mm_setzero_ps(); ++- __m128 compareResults; ++- __m128 currentValues; +++ __m128 indexIncrementValues = _mm_set1_ps(4); +++ __m128 currentIndexes = _mm_set_ps(-1, -2, -3, -4); ++ ++- __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4]; ++- __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4]; +++ float max = src0[0]; +++ float index = 0; +++ __m128 maxValues = _mm_set1_ps(max); +++ __m128 maxValuesIndex = _mm_setzero_ps(); +++ __m128 compareResults; +++ __m128 currentValues; ++ ++- for(;number < quarterPoints; number++){ +++ __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4]; +++ __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4]; ++ ++- currentValues = _mm_load_ps(inputPtr); inputPtr += 4; ++- currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues); +++ for (; number < quarterPoints; number++) { ++ ++- compareResults = _mm_cmpgt_ps(currentValues, maxValues); +++ currentValues = _mm_load_ps(inputPtr); +++ inputPtr += 4; +++ currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues); ++ ++- maxValuesIndex = _mm_blendv_ps(maxValuesIndex, currentIndexes, compareResults); ++- maxValues = _mm_blendv_ps(maxValues, currentValues, compareResults); ++- } +++ compareResults = _mm_cmpgt_ps(currentValues, maxValues); ++ ++- // Calculate the largest value from the remaining 4 points ++- _mm_store_ps(maxValuesBuffer, maxValues); ++- _mm_store_ps(maxIndexesBuffer, maxValuesIndex); +++ maxValuesIndex = 
_mm_blendv_ps(maxValuesIndex, currentIndexes, compareResults); +++ maxValues = _mm_blendv_ps(maxValues, currentValues, compareResults); +++ } ++ ++- for(number = 0; number < 4; number++){ ++- if(maxValuesBuffer[number] > max){ ++- index = maxIndexesBuffer[number]; ++- max = maxValuesBuffer[number]; ++- } else if(maxValuesBuffer[number] == max){ ++- if (index > maxIndexesBuffer[number]) ++- index = maxIndexesBuffer[number]; +++ // Calculate the largest value from the remaining 4 points +++ _mm_store_ps(maxValuesBuffer, maxValues); +++ _mm_store_ps(maxIndexesBuffer, maxValuesIndex); +++ +++ for (number = 0; number < 4; number++) { +++ if (maxValuesBuffer[number] > max) { +++ index = maxIndexesBuffer[number]; +++ max = maxValuesBuffer[number]; +++ } else if (maxValuesBuffer[number] == max) { +++ if (index > maxIndexesBuffer[number]) +++ index = maxIndexesBuffer[number]; +++ } ++ } ++- } ++ ++- number = quarterPoints * 4; ++- for(;number < num_points; number++){ ++- if(src0[number] > max){ ++- index = number; ++- max = src0[number]; +++ number = quarterPoints * 4; +++ for (; number < num_points; number++) { +++ if (src0[number] > max) { +++ index = number; +++ max = src0[number]; +++ } ++ } ++- } ++- target[0] = (uint16_t)index; +++ target[0] = (uint16_t)index; ++ } ++ ++ #endif /*LV_HAVE_SSE4_1*/ ++@@ -211,64 +210,64 @@ volk_32f_index_max_16u_a_sse4_1(uint16_t* target, const float* src0, ++ #include ++ ++ static inline void ++-volk_32f_index_max_16u_a_sse(uint16_t* target, const float* src0, ++- uint32_t num_points) +++volk_32f_index_max_16u_a_sse(uint16_t* target, const float* src0, uint32_t num_points) ++ { ++- num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points; ++- ++- uint32_t number = 0; ++- const uint32_t quarterPoints = num_points / 4; +++ num_points = (num_points > USHRT_MAX) ? 
USHRT_MAX : num_points; ++ ++- float* inputPtr = (float*)src0; +++ uint32_t number = 0; +++ const uint32_t quarterPoints = num_points / 4; ++ ++- __m128 indexIncrementValues = _mm_set1_ps(4); ++- __m128 currentIndexes = _mm_set_ps(-1,-2,-3,-4); +++ float* inputPtr = (float*)src0; ++ ++- float max = src0[0]; ++- float index = 0; ++- __m128 maxValues = _mm_set1_ps(max); ++- __m128 maxValuesIndex = _mm_setzero_ps(); ++- __m128 compareResults; ++- __m128 currentValues; +++ __m128 indexIncrementValues = _mm_set1_ps(4); +++ __m128 currentIndexes = _mm_set_ps(-1, -2, -3, -4); ++ ++- __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4]; ++- __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4]; +++ float max = src0[0]; +++ float index = 0; +++ __m128 maxValues = _mm_set1_ps(max); +++ __m128 maxValuesIndex = _mm_setzero_ps(); +++ __m128 compareResults; +++ __m128 currentValues; ++ ++- for(;number < quarterPoints; number++){ +++ __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4]; +++ __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4]; ++ ++- currentValues = _mm_load_ps(inputPtr); inputPtr += 4; ++- currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues); +++ for (; number < quarterPoints; number++) { ++ ++- compareResults = _mm_cmpgt_ps(currentValues, maxValues); +++ currentValues = _mm_load_ps(inputPtr); +++ inputPtr += 4; +++ currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues); ++ ++- maxValuesIndex = _mm_or_ps(_mm_and_ps(compareResults, currentIndexes), ++- _mm_andnot_ps(compareResults, maxValuesIndex)); ++- maxValues = _mm_or_ps(_mm_and_ps(compareResults, currentValues), ++- _mm_andnot_ps(compareResults, maxValues)); ++- } +++ compareResults = _mm_cmpgt_ps(currentValues, maxValues); ++ ++- // Calculate the largest value from the remaining 4 points ++- _mm_store_ps(maxValuesBuffer, maxValues); ++- _mm_store_ps(maxIndexesBuffer, maxValuesIndex); +++ maxValuesIndex = _mm_or_ps(_mm_and_ps(compareResults, currentIndexes), +++ _mm_andnot_ps(compareResults, maxValuesIndex)); +++ maxValues = _mm_or_ps(_mm_and_ps(compareResults, currentValues), +++ _mm_andnot_ps(compareResults, maxValues)); +++ } ++ ++- for(number = 0; number < 4; number++){ ++- if(maxValuesBuffer[number] > max){ ++- index = maxIndexesBuffer[number]; ++- max = maxValuesBuffer[number]; ++- } else if(maxValuesBuffer[number] == max){ ++- if (index > maxIndexesBuffer[number]) ++- index = maxIndexesBuffer[number]; +++ // Calculate the largest value from the remaining 4 points +++ _mm_store_ps(maxValuesBuffer, maxValues); +++ _mm_store_ps(maxIndexesBuffer, maxValuesIndex); +++ +++ for (number = 0; number < 4; number++) { +++ if (maxValuesBuffer[number] > max) { +++ index = maxIndexesBuffer[number]; +++ max = maxValuesBuffer[number]; +++ } else if (maxValuesBuffer[number] == max) { +++ if (index > maxIndexesBuffer[number]) +++ index = maxIndexesBuffer[number]; +++ } ++ } ++- } ++ ++- number = quarterPoints * 4; ++- for(;number < num_points; number++){ ++- if(src0[number] > max){ ++- index = number; ++- max = src0[number]; +++ number = quarterPoints * 4; +++ for (; number < num_points; number++) { +++ if (src0[number] > max) { +++ index = number; +++ max = src0[number]; +++ } ++ } ++- } ++- target[0] = (uint16_t)index; +++ target[0] = (uint16_t)index; ++ } ++ ++ #endif /*LV_HAVE_SSE*/ ++@@ -277,23 +276,22 @@ volk_32f_index_max_16u_a_sse(uint16_t* target, const float* src0, ++ #ifdef LV_HAVE_GENERIC ++ ++ static inline void ++-volk_32f_index_max_16u_generic(uint16_t* target, const float* src0, ++- uint32_t num_points) 
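Two details of these index-max kernels worth spelling out: the 16u variants clamp num_points to USHRT_MAX up front because the winning index is returned through a uint16_t, and the horizontal pass after the vector loop breaks ties toward the smaller index, which keeps the result consistent with the generic kernel's first-maximum behaviour. A scalar sketch of that horizontal step (the function name and signature are illustrative, not upstream API):

static inline void reduce_max_lanes_sketch(const float* lane_vals,
                                           const float* lane_idx,
                                           int width,
                                           float* max,
                                           float* index)
{
    int lane;
    for (lane = 0; lane < width; lane++) {
        if (lane_vals[lane] > *max) {
            *max = lane_vals[lane];   /* strictly greater: later duplicates never win */
            *index = lane_idx[lane];
        } else if (lane_vals[lane] == *max && lane_idx[lane] < *index) {
            *index = lane_idx[lane];  /* tie: keep the earliest position */
        }
    }
}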
+++volk_32f_index_max_16u_generic(uint16_t* target, const float* src0, uint32_t num_points) ++ { ++- num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points; +++ num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points; ++ ++- float max = src0[0]; ++- uint16_t index = 0; +++ float max = src0[0]; +++ uint16_t index = 0; ++ ++- uint32_t i = 1; +++ uint32_t i = 1; ++ ++- for(; i < num_points; ++i) { ++- if(src0[i] > max) { ++- index = i; ++- max = src0[i]; +++ for (; i < num_points; ++i) { +++ if (src0[i] > max) { +++ index = i; +++ max = src0[i]; +++ } ++ } ++- } ++- target[0] = index; +++ target[0] = index; ++ } ++ ++ #endif /*LV_HAVE_GENERIC*/ ++@@ -302,76 +300,74 @@ volk_32f_index_max_16u_generic(uint16_t* target, const float* src0, ++ #endif /*INCLUDED_volk_32f_index_max_16u_a_H*/ ++ ++ ++- ++ #ifndef INCLUDED_volk_32f_index_max_16u_u_H ++ #define INCLUDED_volk_32f_index_max_16u_u_H ++ ++-#include ++-#include ++ #include ++ #include ++ #include +++#include ++ ++ #ifdef LV_HAVE_AVX ++ #include ++ ++ static inline void ++-volk_32f_index_max_16u_u_avx(uint16_t* target, const float* src0, ++- uint32_t num_points) +++volk_32f_index_max_16u_u_avx(uint16_t* target, const float* src0, uint32_t num_points) ++ { ++- num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points; ++- ++- uint32_t number = 0; ++- const uint32_t eighthPoints = num_points / 8; +++ num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points; ++ ++- float* inputPtr = (float*)src0; +++ uint32_t number = 0; +++ const uint32_t eighthPoints = num_points / 8; ++ ++- __m256 indexIncrementValues = _mm256_set1_ps(8); ++- __m256 currentIndexes = _mm256_set_ps(-1,-2,-3,-4,-5,-6,-7,-8); +++ float* inputPtr = (float*)src0; ++ ++- float max = src0[0]; ++- float index = 0; ++- __m256 maxValues = _mm256_set1_ps(max); ++- __m256 maxValuesIndex = _mm256_setzero_ps(); ++- __m256 compareResults; ++- __m256 currentValues; +++ __m256 indexIncrementValues = _mm256_set1_ps(8); +++ __m256 currentIndexes = _mm256_set_ps(-1, -2, -3, -4, -5, -6, -7, -8); ++ ++- __VOLK_ATTR_ALIGNED(32) float maxValuesBuffer[8]; ++- __VOLK_ATTR_ALIGNED(32) float maxIndexesBuffer[8]; +++ float max = src0[0]; +++ float index = 0; +++ __m256 maxValues = _mm256_set1_ps(max); +++ __m256 maxValuesIndex = _mm256_setzero_ps(); +++ __m256 compareResults; +++ __m256 currentValues; ++ ++- for(;number < eighthPoints; number++){ +++ __VOLK_ATTR_ALIGNED(32) float maxValuesBuffer[8]; +++ __VOLK_ATTR_ALIGNED(32) float maxIndexesBuffer[8]; ++ ++- currentValues = _mm256_loadu_ps(inputPtr); inputPtr += 8; ++- currentIndexes = _mm256_add_ps(currentIndexes, indexIncrementValues); +++ for (; number < eighthPoints; number++) { ++ ++- compareResults = _mm256_cmp_ps(currentValues, maxValues, _CMP_GT_OS); +++ currentValues = _mm256_loadu_ps(inputPtr); +++ inputPtr += 8; +++ currentIndexes = _mm256_add_ps(currentIndexes, indexIncrementValues); ++ ++- maxValuesIndex = _mm256_blendv_ps(maxValuesIndex, currentIndexes, compareResults); ++- maxValues = _mm256_blendv_ps(maxValues, currentValues, compareResults); ++- } +++ compareResults = _mm256_cmp_ps(currentValues, maxValues, _CMP_GT_OS); ++ ++- // Calculate the largest value from the remaining 4 points ++- _mm256_storeu_ps(maxValuesBuffer, maxValues); ++- _mm256_storeu_ps(maxIndexesBuffer, maxValuesIndex); +++ maxValuesIndex = _mm256_blendv_ps(maxValuesIndex, currentIndexes, compareResults); +++ maxValues = _mm256_blendv_ps(maxValues, currentValues, compareResults); +++ } ++ ++- for(number = 0; number < 8; number++){ ++- 
if(maxValuesBuffer[number] > max){ ++- index = maxIndexesBuffer[number]; ++- max = maxValuesBuffer[number]; ++- } else if(maxValuesBuffer[number] == max){ ++- if (index > maxIndexesBuffer[number]) ++- index = maxIndexesBuffer[number]; +++ // Calculate the largest value from the remaining 4 points +++ _mm256_storeu_ps(maxValuesBuffer, maxValues); +++ _mm256_storeu_ps(maxIndexesBuffer, maxValuesIndex); +++ +++ for (number = 0; number < 8; number++) { +++ if (maxValuesBuffer[number] > max) { +++ index = maxIndexesBuffer[number]; +++ max = maxValuesBuffer[number]; +++ } else if (maxValuesBuffer[number] == max) { +++ if (index > maxIndexesBuffer[number]) +++ index = maxIndexesBuffer[number]; +++ } ++ } ++- } ++ ++- number = eighthPoints * 8; ++- for(;number < num_points; number++){ ++- if(src0[number] > max){ ++- index = number; ++- max = src0[number]; +++ number = eighthPoints * 8; +++ for (; number < num_points; number++) { +++ if (src0[number] > max) { +++ index = number; +++ max = src0[number]; +++ } ++ } ++- } ++- target[0] = (uint16_t)index; +++ target[0] = (uint16_t)index; ++ } ++ ++ #endif /*LV_HAVE_AVX*/ ++diff --git a/kernels/volk/volk_32f_index_max_32u.h b/kernels/volk/volk_32f_index_max_32u.h ++index 318c8e4..315531d 100644 ++--- a/kernels/volk/volk_32f_index_max_32u.h +++++ b/kernels/volk/volk_32f_index_max_32u.h ++@@ -25,7 +25,8 @@ ++ * ++ * \b Overview ++ * ++- * Returns Argmax_i x[i]. Finds and returns the index which contains the first maximum value in the given vector. +++ * Returns Argmax_i x[i]. Finds and returns the index which contains the first maximum +++ * value in the given vector. ++ * ++ * Dispatcher Prototype ++ * \code ++@@ -64,70 +65,71 @@ ++ #ifndef INCLUDED_volk_32f_index_max_32u_a_H ++ #define INCLUDED_volk_32f_index_max_32u_a_H ++ ++-#include ++-#include ++ #include ++ #include +++#include ++ ++ #ifdef LV_HAVE_SSE4_1 ++-#include +++#include ++ ++ static inline void ++ volk_32f_index_max_32u_a_sse4_1(uint32_t* target, const float* src0, uint32_t num_points) ++ { ++- if(num_points > 0){ ++- uint32_t number = 0; ++- const uint32_t quarterPoints = num_points / 4; +++ if (num_points > 0) { +++ uint32_t number = 0; +++ const uint32_t quarterPoints = num_points / 4; ++ ++- float* inputPtr = (float*)src0; +++ float* inputPtr = (float*)src0; ++ ++- __m128 indexIncrementValues = _mm_set1_ps(4); ++- __m128 currentIndexes = _mm_set_ps(-1,-2,-3,-4); +++ __m128 indexIncrementValues = _mm_set1_ps(4); +++ __m128 currentIndexes = _mm_set_ps(-1, -2, -3, -4); ++ ++- float max = src0[0]; ++- float index = 0; ++- __m128 maxValues = _mm_set1_ps(max); ++- __m128 maxValuesIndex = _mm_setzero_ps(); ++- __m128 compareResults; ++- __m128 currentValues; +++ float max = src0[0]; +++ float index = 0; +++ __m128 maxValues = _mm_set1_ps(max); +++ __m128 maxValuesIndex = _mm_setzero_ps(); +++ __m128 compareResults; +++ __m128 currentValues; ++ ++- __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4]; ++- __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4]; +++ __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4]; +++ __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4]; ++ ++- for(;number < quarterPoints; number++){ +++ for (; number < quarterPoints; number++) { ++ ++- currentValues = _mm_load_ps(inputPtr); inputPtr += 4; ++- currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues); +++ currentValues = _mm_load_ps(inputPtr); +++ inputPtr += 4; +++ currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues); ++ ++- compareResults = _mm_cmpgt_ps(currentValues, maxValues); +++ compareResults = 
_mm_cmpgt_ps(currentValues, maxValues); ++ ++- maxValuesIndex = _mm_blendv_ps(maxValuesIndex, currentIndexes, compareResults); ++- maxValues = _mm_blendv_ps(maxValues, currentValues, compareResults); ++- } +++ maxValuesIndex = +++ _mm_blendv_ps(maxValuesIndex, currentIndexes, compareResults); +++ maxValues = _mm_blendv_ps(maxValues, currentValues, compareResults); +++ } ++ ++- // Calculate the largest value from the remaining 4 points ++- _mm_store_ps(maxValuesBuffer, maxValues); ++- _mm_store_ps(maxIndexesBuffer, maxValuesIndex); ++- ++- for(number = 0; number < 4; number++){ ++- if(maxValuesBuffer[number] > max){ ++- index = maxIndexesBuffer[number]; ++- max = maxValuesBuffer[number]; ++- } else if(maxValuesBuffer[number] == max){ ++- if (index > maxIndexesBuffer[number]) ++- index = maxIndexesBuffer[number]; ++- } ++- } +++ // Calculate the largest value from the remaining 4 points +++ _mm_store_ps(maxValuesBuffer, maxValues); +++ _mm_store_ps(maxIndexesBuffer, maxValuesIndex); +++ +++ for (number = 0; number < 4; number++) { +++ if (maxValuesBuffer[number] > max) { +++ index = maxIndexesBuffer[number]; +++ max = maxValuesBuffer[number]; +++ } else if (maxValuesBuffer[number] == max) { +++ if (index > maxIndexesBuffer[number]) +++ index = maxIndexesBuffer[number]; +++ } +++ } ++ ++- number = quarterPoints * 4; ++- for(;number < num_points; number++){ ++- if(src0[number] > max){ ++- index = number; ++- max = src0[number]; ++- } +++ number = quarterPoints * 4; +++ for (; number < num_points; number++) { +++ if (src0[number] > max) { +++ index = number; +++ max = src0[number]; +++ } +++ } +++ target[0] = (uint32_t)index; ++ } ++- target[0] = (uint32_t)index; ++- } ++ } ++ ++ #endif /*LV_HAVE_SSE4_1*/ ++@@ -135,67 +137,68 @@ volk_32f_index_max_32u_a_sse4_1(uint32_t* target, const float* src0, uint32_t nu ++ ++ #ifdef LV_HAVE_SSE ++ ++-#include +++#include ++ ++ static inline void ++ volk_32f_index_max_32u_a_sse(uint32_t* target, const float* src0, uint32_t num_points) ++ { ++- if(num_points > 0){ ++- uint32_t number = 0; ++- const uint32_t quarterPoints = num_points / 4; +++ if (num_points > 0) { +++ uint32_t number = 0; +++ const uint32_t quarterPoints = num_points / 4; ++ ++- float* inputPtr = (float*)src0; +++ float* inputPtr = (float*)src0; ++ ++- __m128 indexIncrementValues = _mm_set1_ps(4); ++- __m128 currentIndexes = _mm_set_ps(-1,-2,-3,-4); +++ __m128 indexIncrementValues = _mm_set1_ps(4); +++ __m128 currentIndexes = _mm_set_ps(-1, -2, -3, -4); ++ ++- float max = src0[0]; ++- float index = 0; ++- __m128 maxValues = _mm_set1_ps(max); ++- __m128 maxValuesIndex = _mm_setzero_ps(); ++- __m128 compareResults; ++- __m128 currentValues; +++ float max = src0[0]; +++ float index = 0; +++ __m128 maxValues = _mm_set1_ps(max); +++ __m128 maxValuesIndex = _mm_setzero_ps(); +++ __m128 compareResults; +++ __m128 currentValues; ++ ++- __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4]; ++- __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4]; +++ __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4]; +++ __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4]; ++ ++- for(;number < quarterPoints; number++){ +++ for (; number < quarterPoints; number++) { ++ ++- currentValues = _mm_load_ps(inputPtr); inputPtr += 4; ++- currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues); +++ currentValues = _mm_load_ps(inputPtr); +++ inputPtr += 4; +++ currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues); ++ ++- compareResults = _mm_cmpgt_ps(currentValues, maxValues); +++ compareResults = 
_mm_cmpgt_ps(currentValues, maxValues); ++ ++- maxValuesIndex = _mm_or_ps(_mm_and_ps(compareResults, currentIndexes), ++- _mm_andnot_ps(compareResults, maxValuesIndex)); +++ maxValuesIndex = _mm_or_ps(_mm_and_ps(compareResults, currentIndexes), +++ _mm_andnot_ps(compareResults, maxValuesIndex)); ++ ++- maxValues = _mm_or_ps(_mm_and_ps(compareResults, currentValues), ++- _mm_andnot_ps(compareResults, maxValues)); ++- } +++ maxValues = _mm_or_ps(_mm_and_ps(compareResults, currentValues), +++ _mm_andnot_ps(compareResults, maxValues)); +++ } ++ ++- // Calculate the largest value from the remaining 4 points ++- _mm_store_ps(maxValuesBuffer, maxValues); ++- _mm_store_ps(maxIndexesBuffer, maxValuesIndex); ++- ++- for(number = 0; number < 4; number++){ ++- if(maxValuesBuffer[number] > max){ ++- index = maxIndexesBuffer[number]; ++- max = maxValuesBuffer[number]; ++- } else if(maxValuesBuffer[number] == max){ ++- if (index > maxIndexesBuffer[number]) ++- index = maxIndexesBuffer[number]; ++- } ++- } +++ // Calculate the largest value from the remaining 4 points +++ _mm_store_ps(maxValuesBuffer, maxValues); +++ _mm_store_ps(maxIndexesBuffer, maxValuesIndex); +++ +++ for (number = 0; number < 4; number++) { +++ if (maxValuesBuffer[number] > max) { +++ index = maxIndexesBuffer[number]; +++ max = maxValuesBuffer[number]; +++ } else if (maxValuesBuffer[number] == max) { +++ if (index > maxIndexesBuffer[number]) +++ index = maxIndexesBuffer[number]; +++ } +++ } ++ ++- number = quarterPoints * 4; ++- for(;number < num_points; number++){ ++- if(src0[number] > max){ ++- index = number; ++- max = src0[number]; ++- } +++ number = quarterPoints * 4; +++ for (; number < num_points; number++) { +++ if (src0[number] > max) { +++ index = number; +++ max = src0[number]; +++ } +++ } +++ target[0] = (uint32_t)index; ++ } ++- target[0] = (uint32_t)index; ++- } ++ } ++ ++ #endif /*LV_HAVE_SSE*/ ++@@ -204,65 +207,61 @@ volk_32f_index_max_32u_a_sse(uint32_t* target, const float* src0, uint32_t num_p ++ #ifdef LV_HAVE_AVX ++ #include ++ ++-static inline void volk_32f_index_max_32u_a_avx(uint32_t* target, const float* src0, uint32_t num_points) +++static inline void +++volk_32f_index_max_32u_a_avx(uint32_t* target, const float* src0, uint32_t num_points) ++ { ++- if(num_points > 0) ++- { ++- uint32_t number = 0; ++- const uint32_t quarterPoints = num_points / 8; ++- ++- float* inputPtr = (float*)src0; ++- ++- __m256 indexIncrementValues = _mm256_set1_ps(8); ++- __m256 currentIndexes = _mm256_set_ps(-1,-2,-3,-4,-5,-6,-7,-8); ++- ++- float max = src0[0]; ++- float index = 0; ++- __m256 maxValues = _mm256_set1_ps(max); ++- __m256 maxValuesIndex = _mm256_setzero_ps(); ++- __m256 compareResults; ++- __m256 currentValues; ++- ++- __VOLK_ATTR_ALIGNED(32) float maxValuesBuffer[8]; ++- __VOLK_ATTR_ALIGNED(32) float maxIndexesBuffer[8]; ++- ++- for(;number < quarterPoints; number++) ++- { ++- currentValues = _mm256_load_ps(inputPtr); inputPtr += 8; ++- currentIndexes = _mm256_add_ps(currentIndexes, indexIncrementValues); ++- compareResults = _mm256_cmp_ps(currentValues, maxValues, _CMP_GT_OS); ++- maxValuesIndex = _mm256_blendv_ps(maxValuesIndex, currentIndexes, compareResults); ++- maxValues = _mm256_blendv_ps(maxValues, currentValues, compareResults); ++- } ++- ++- // Calculate the largest value from the remaining 8 points ++- _mm256_store_ps(maxValuesBuffer, maxValues); ++- _mm256_store_ps(maxIndexesBuffer, maxValuesIndex); ++- ++- for(number = 0; number < 8; number++) ++- { ++- if(maxValuesBuffer[number] > max) ++- { ++- index = 
maxIndexesBuffer[number]; ++- max = maxValuesBuffer[number]; ++- } ++- else if(maxValuesBuffer[number] == max){ ++- if (index > maxIndexesBuffer[number]) ++- index = maxIndexesBuffer[number]; ++- } ++- } ++- ++- number = quarterPoints * 8; ++- for(;number < num_points; number++) ++- { ++- if(src0[number] > max) ++- { ++- index = number; ++- max = src0[number]; ++- } ++- } ++- target[0] = (uint32_t)index; +++ if (num_points > 0) { +++ uint32_t number = 0; +++ const uint32_t quarterPoints = num_points / 8; +++ +++ float* inputPtr = (float*)src0; +++ +++ __m256 indexIncrementValues = _mm256_set1_ps(8); +++ __m256 currentIndexes = _mm256_set_ps(-1, -2, -3, -4, -5, -6, -7, -8); +++ +++ float max = src0[0]; +++ float index = 0; +++ __m256 maxValues = _mm256_set1_ps(max); +++ __m256 maxValuesIndex = _mm256_setzero_ps(); +++ __m256 compareResults; +++ __m256 currentValues; +++ +++ __VOLK_ATTR_ALIGNED(32) float maxValuesBuffer[8]; +++ __VOLK_ATTR_ALIGNED(32) float maxIndexesBuffer[8]; +++ +++ for (; number < quarterPoints; number++) { +++ currentValues = _mm256_load_ps(inputPtr); +++ inputPtr += 8; +++ currentIndexes = _mm256_add_ps(currentIndexes, indexIncrementValues); +++ compareResults = _mm256_cmp_ps(currentValues, maxValues, _CMP_GT_OS); +++ maxValuesIndex = +++ _mm256_blendv_ps(maxValuesIndex, currentIndexes, compareResults); +++ maxValues = _mm256_blendv_ps(maxValues, currentValues, compareResults); +++ } +++ +++ // Calculate the largest value from the remaining 8 points +++ _mm256_store_ps(maxValuesBuffer, maxValues); +++ _mm256_store_ps(maxIndexesBuffer, maxValuesIndex); +++ +++ for (number = 0; number < 8; number++) { +++ if (maxValuesBuffer[number] > max) { +++ index = maxIndexesBuffer[number]; +++ max = maxValuesBuffer[number]; +++ } else if (maxValuesBuffer[number] == max) { +++ if (index > maxIndexesBuffer[number]) +++ index = maxIndexesBuffer[number]; +++ } +++ } +++ +++ number = quarterPoints * 8; +++ for (; number < num_points; number++) { +++ if (src0[number] > max) { +++ index = number; +++ max = src0[number]; +++ } ++ } +++ target[0] = (uint32_t)index; +++ } ++ } ++ ++ #endif /*LV_HAVE_AVX*/ ++@@ -271,66 +270,63 @@ static inline void volk_32f_index_max_32u_a_avx(uint32_t* target, const float* s ++ #ifdef LV_HAVE_NEON ++ #include ++ ++-static inline void volk_32f_index_max_32u_neon(uint32_t* target, const float* src0, uint32_t num_points) +++static inline void +++volk_32f_index_max_32u_neon(uint32_t* target, const float* src0, uint32_t num_points) ++ { ++- if(num_points > 0) ++- { ++- uint32_t number = 0; ++- const uint32_t quarterPoints = num_points / 4; ++- ++- float* inputPtr = (float*)src0; ++- float32x4_t indexIncrementValues = vdupq_n_f32(4); ++- __VOLK_ATTR_ALIGNED(16) float currentIndexes_float[4] = { -4.0f, -3.0f, -2.0f, -1.0f }; ++- float32x4_t currentIndexes = vld1q_f32(currentIndexes_float); ++- ++- float max = src0[0]; ++- float index = 0; ++- float32x4_t maxValues = vdupq_n_f32(max); ++- uint32x4_t maxValuesIndex = vmovq_n_u32(0); ++- uint32x4_t compareResults; ++- uint32x4_t currentIndexes_u; ++- float32x4_t currentValues; ++- ++- __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4]; ++- __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4]; ++- ++- for(;number < quarterPoints; number++) ++- { ++- currentValues = vld1q_f32(inputPtr); inputPtr += 4; ++- currentIndexes = vaddq_f32(currentIndexes, indexIncrementValues); ++- currentIndexes_u = vcvtq_u32_f32(currentIndexes); ++- compareResults = vcleq_f32(currentValues, maxValues); ++- maxValuesIndex = vorrq_u32( vandq_u32( 
compareResults, maxValuesIndex ), vbicq_u32(currentIndexes_u, compareResults) ); ++- maxValues = vmaxq_f32(currentValues, maxValues); ++- } ++- ++- // Calculate the largest value from the remaining 4 points ++- vst1q_f32(maxValuesBuffer, maxValues); ++- vst1q_f32(maxIndexesBuffer, vcvtq_f32_u32(maxValuesIndex)); ++- for(number = 0; number < 4; number++) ++- { ++- if(maxValuesBuffer[number] > max) ++- { ++- index = maxIndexesBuffer[number]; ++- max = maxValuesBuffer[number]; ++- } ++- else if(maxValues[number] == max){ ++- if (index > maxIndexesBuffer[number]) ++- index = maxIndexesBuffer[number]; ++- } ++- } ++- ++- number = quarterPoints * 4; ++- for(;number < num_points; number++) ++- { ++- if(src0[number] > max) ++- { ++- index = number; ++- max = src0[number]; ++- } ++- } ++- target[0] = (uint32_t)index; +++ if (num_points > 0) { +++ uint32_t number = 0; +++ const uint32_t quarterPoints = num_points / 4; +++ +++ float* inputPtr = (float*)src0; +++ float32x4_t indexIncrementValues = vdupq_n_f32(4); +++ __VOLK_ATTR_ALIGNED(16) +++ float currentIndexes_float[4] = { -4.0f, -3.0f, -2.0f, -1.0f }; +++ float32x4_t currentIndexes = vld1q_f32(currentIndexes_float); +++ +++ float max = src0[0]; +++ float index = 0; +++ float32x4_t maxValues = vdupq_n_f32(max); +++ uint32x4_t maxValuesIndex = vmovq_n_u32(0); +++ uint32x4_t compareResults; +++ uint32x4_t currentIndexes_u; +++ float32x4_t currentValues; +++ +++ __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4]; +++ __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4]; +++ +++ for (; number < quarterPoints; number++) { +++ currentValues = vld1q_f32(inputPtr); +++ inputPtr += 4; +++ currentIndexes = vaddq_f32(currentIndexes, indexIncrementValues); +++ currentIndexes_u = vcvtq_u32_f32(currentIndexes); +++ compareResults = vcleq_f32(currentValues, maxValues); +++ maxValuesIndex = vorrq_u32(vandq_u32(compareResults, maxValuesIndex), +++ vbicq_u32(currentIndexes_u, compareResults)); +++ maxValues = vmaxq_f32(currentValues, maxValues); +++ } +++ +++ // Calculate the largest value from the remaining 4 points +++ vst1q_f32(maxValuesBuffer, maxValues); +++ vst1q_f32(maxIndexesBuffer, vcvtq_f32_u32(maxValuesIndex)); +++ for (number = 0; number < 4; number++) { +++ if (maxValuesBuffer[number] > max) { +++ index = maxIndexesBuffer[number]; +++ max = maxValuesBuffer[number]; +++ } else if (maxValues[number] == max) { +++ if (index > maxIndexesBuffer[number]) +++ index = maxIndexesBuffer[number]; +++ } +++ } +++ +++ number = quarterPoints * 4; +++ for (; number < num_points; number++) { +++ if (src0[number] > max) { +++ index = number; +++ max = src0[number]; +++ } ++ } +++ target[0] = (uint32_t)index; +++ } ++ } ++ ++ #endif /*LV_HAVE_NEON*/ ++@@ -341,20 +337,20 @@ static inline void volk_32f_index_max_32u_neon(uint32_t* target, const float* sr ++ static inline void ++ volk_32f_index_max_32u_generic(uint32_t* target, const float* src0, uint32_t num_points) ++ { ++- if(num_points > 0){ ++- float max = src0[0]; ++- uint32_t index = 0; +++ if (num_points > 0) { +++ float max = src0[0]; +++ uint32_t index = 0; ++ ++- uint32_t i = 1; +++ uint32_t i = 1; ++ ++- for(; i < num_points; ++i) { ++- if(src0[i] > max){ ++- index = i; ++- max = src0[i]; ++- } +++ for (; i < num_points; ++i) { +++ if (src0[i] > max) { +++ index = i; +++ max = src0[i]; +++ } +++ } +++ target[0] = index; ++ } ++- target[0] = index; ++- } ++ } ++ ++ #endif /*LV_HAVE_GENERIC*/ ++@@ -366,209 +362,195 @@ volk_32f_index_max_32u_generic(uint32_t* target, const float* src0, uint32_t num ++ #ifndef 
INCLUDED_volk_32f_index_max_32u_u_H ++ #define INCLUDED_volk_32f_index_max_32u_u_H ++ ++-#include ++-#include ++ #include ++ #include +++#include ++ ++ ++ #ifdef LV_HAVE_AVX ++ #include ++ ++-static inline void volk_32f_index_max_32u_u_avx(uint32_t* target, const float* src0, uint32_t num_points) +++static inline void +++volk_32f_index_max_32u_u_avx(uint32_t* target, const float* src0, uint32_t num_points) ++ { ++- if(num_points > 0) ++- { ++- uint32_t number = 0; ++- const uint32_t quarterPoints = num_points / 8; ++- ++- float* inputPtr = (float*)src0; ++- ++- __m256 indexIncrementValues = _mm256_set1_ps(8); ++- __m256 currentIndexes = _mm256_set_ps(-1,-2,-3,-4,-5,-6,-7,-8); ++- ++- float max = src0[0]; ++- float index = 0; ++- __m256 maxValues = _mm256_set1_ps(max); ++- __m256 maxValuesIndex = _mm256_setzero_ps(); ++- __m256 compareResults; ++- __m256 currentValues; ++- ++- __VOLK_ATTR_ALIGNED(32) float maxValuesBuffer[8]; ++- __VOLK_ATTR_ALIGNED(32) float maxIndexesBuffer[8]; ++- ++- for(;number < quarterPoints; number++) ++- { ++- currentValues = _mm256_loadu_ps(inputPtr); inputPtr += 8; ++- currentIndexes = _mm256_add_ps(currentIndexes, indexIncrementValues); ++- compareResults = _mm256_cmp_ps(currentValues, maxValues, _CMP_GT_OS); ++- maxValuesIndex = _mm256_blendv_ps(maxValuesIndex, currentIndexes, compareResults); ++- maxValues = _mm256_blendv_ps(maxValues, currentValues, compareResults); ++- } ++- ++- // Calculate the largest value from the remaining 8 points ++- _mm256_store_ps(maxValuesBuffer, maxValues); ++- _mm256_store_ps(maxIndexesBuffer, maxValuesIndex); ++- ++- for(number = 0; number < 8; number++) ++- { ++- if(maxValuesBuffer[number] > max) ++- { ++- index = maxIndexesBuffer[number]; ++- max = maxValuesBuffer[number]; ++- } ++- else if(maxValuesBuffer[number] == max){ ++- if (index > maxIndexesBuffer[number]) ++- index = maxIndexesBuffer[number]; ++- } ++- } ++- ++- number = quarterPoints * 8; ++- for(;number < num_points; number++) ++- { ++- if(src0[number] > max) ++- { ++- index = number; ++- max = src0[number]; ++- } ++- } ++- target[0] = (uint32_t)index; +++ if (num_points > 0) { +++ uint32_t number = 0; +++ const uint32_t quarterPoints = num_points / 8; +++ +++ float* inputPtr = (float*)src0; +++ +++ __m256 indexIncrementValues = _mm256_set1_ps(8); +++ __m256 currentIndexes = _mm256_set_ps(-1, -2, -3, -4, -5, -6, -7, -8); +++ +++ float max = src0[0]; +++ float index = 0; +++ __m256 maxValues = _mm256_set1_ps(max); +++ __m256 maxValuesIndex = _mm256_setzero_ps(); +++ __m256 compareResults; +++ __m256 currentValues; +++ +++ __VOLK_ATTR_ALIGNED(32) float maxValuesBuffer[8]; +++ __VOLK_ATTR_ALIGNED(32) float maxIndexesBuffer[8]; +++ +++ for (; number < quarterPoints; number++) { +++ currentValues = _mm256_loadu_ps(inputPtr); +++ inputPtr += 8; +++ currentIndexes = _mm256_add_ps(currentIndexes, indexIncrementValues); +++ compareResults = _mm256_cmp_ps(currentValues, maxValues, _CMP_GT_OS); +++ maxValuesIndex = +++ _mm256_blendv_ps(maxValuesIndex, currentIndexes, compareResults); +++ maxValues = _mm256_blendv_ps(maxValues, currentValues, compareResults); ++ } +++ +++ // Calculate the largest value from the remaining 8 points +++ _mm256_store_ps(maxValuesBuffer, maxValues); +++ _mm256_store_ps(maxIndexesBuffer, maxValuesIndex); +++ +++ for (number = 0; number < 8; number++) { +++ if (maxValuesBuffer[number] > max) { +++ index = maxIndexesBuffer[number]; +++ max = maxValuesBuffer[number]; +++ } else if (maxValuesBuffer[number] == max) { +++ if (index > 
maxIndexesBuffer[number]) +++ index = maxIndexesBuffer[number]; +++ } +++ } +++ +++ number = quarterPoints * 8; +++ for (; number < num_points; number++) { +++ if (src0[number] > max) { +++ index = number; +++ max = src0[number]; +++ } +++ } +++ target[0] = (uint32_t)index; +++ } ++ } ++ ++ #endif /*LV_HAVE_AVX*/ ++ ++ ++ #ifdef LV_HAVE_SSE4_1 ++-#include +++#include ++ ++-static inline void volk_32f_index_max_32u_u_sse4_1(uint32_t* target, const float* src0, uint32_t num_points) +++static inline void +++volk_32f_index_max_32u_u_sse4_1(uint32_t* target, const float* src0, uint32_t num_points) ++ { ++- if(num_points > 0) ++- { ++- uint32_t number = 0; ++- const uint32_t quarterPoints = num_points / 4; ++- ++- float* inputPtr = (float*)src0; ++- ++- __m128 indexIncrementValues = _mm_set1_ps(4); ++- __m128 currentIndexes = _mm_set_ps(-1,-2,-3,-4); ++- ++- float max = src0[0]; ++- float index = 0; ++- __m128 maxValues = _mm_set1_ps(max); ++- __m128 maxValuesIndex = _mm_setzero_ps(); ++- __m128 compareResults; ++- __m128 currentValues; ++- ++- __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4]; ++- __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4]; ++- ++- for(;number < quarterPoints; number++) ++- { ++- currentValues = _mm_loadu_ps(inputPtr); inputPtr += 4; ++- currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues); ++- compareResults = _mm_cmpgt_ps(currentValues, maxValues); ++- maxValuesIndex = _mm_blendv_ps(maxValuesIndex, currentIndexes, compareResults); ++- maxValues = _mm_blendv_ps(maxValues, currentValues, compareResults); ++- } ++- ++- // Calculate the largest value from the remaining 4 points ++- _mm_store_ps(maxValuesBuffer, maxValues); ++- _mm_store_ps(maxIndexesBuffer, maxValuesIndex); ++- ++- for(number = 0; number < 4; number++) ++- { ++- if(maxValuesBuffer[number] > max) ++- { ++- index = maxIndexesBuffer[number]; ++- max = maxValuesBuffer[number]; ++- } ++- else if(maxValuesBuffer[number] == max){ ++- if (index > maxIndexesBuffer[number]) ++- index = maxIndexesBuffer[number]; ++- } ++- } ++- ++- number = quarterPoints * 4; ++- for(;number < num_points; number++) ++- { ++- if(src0[number] > max) ++- { ++- index = number; ++- max = src0[number]; ++- } ++- } ++- target[0] = (uint32_t)index; +++ if (num_points > 0) { +++ uint32_t number = 0; +++ const uint32_t quarterPoints = num_points / 4; +++ +++ float* inputPtr = (float*)src0; +++ +++ __m128 indexIncrementValues = _mm_set1_ps(4); +++ __m128 currentIndexes = _mm_set_ps(-1, -2, -3, -4); +++ +++ float max = src0[0]; +++ float index = 0; +++ __m128 maxValues = _mm_set1_ps(max); +++ __m128 maxValuesIndex = _mm_setzero_ps(); +++ __m128 compareResults; +++ __m128 currentValues; +++ +++ __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4]; +++ __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4]; +++ +++ for (; number < quarterPoints; number++) { +++ currentValues = _mm_loadu_ps(inputPtr); +++ inputPtr += 4; +++ currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues); +++ compareResults = _mm_cmpgt_ps(currentValues, maxValues); +++ maxValuesIndex = +++ _mm_blendv_ps(maxValuesIndex, currentIndexes, compareResults); +++ maxValues = _mm_blendv_ps(maxValues, currentValues, compareResults); ++ } +++ +++ // Calculate the largest value from the remaining 4 points +++ _mm_store_ps(maxValuesBuffer, maxValues); +++ _mm_store_ps(maxIndexesBuffer, maxValuesIndex); +++ +++ for (number = 0; number < 4; number++) { +++ if (maxValuesBuffer[number] > max) { +++ index = maxIndexesBuffer[number]; +++ max = maxValuesBuffer[number]; +++ } else 
if (maxValuesBuffer[number] == max) { +++ if (index > maxIndexesBuffer[number]) +++ index = maxIndexesBuffer[number]; +++ } +++ } +++ +++ number = quarterPoints * 4; +++ for (; number < num_points; number++) { +++ if (src0[number] > max) { +++ index = number; +++ max = src0[number]; +++ } +++ } +++ target[0] = (uint32_t)index; +++ } ++ } ++ ++ #endif /*LV_HAVE_SSE4_1*/ ++ ++ #ifdef LV_HAVE_SSE ++-#include +++#include ++ ++-static inline void volk_32f_index_max_32u_u_sse(uint32_t* target, const float* src0, uint32_t num_points) +++static inline void +++volk_32f_index_max_32u_u_sse(uint32_t* target, const float* src0, uint32_t num_points) ++ { ++- if(num_points > 0) ++- { ++- uint32_t number = 0; ++- const uint32_t quarterPoints = num_points / 4; ++- ++- float* inputPtr = (float*)src0; ++- ++- __m128 indexIncrementValues = _mm_set1_ps(4); ++- __m128 currentIndexes = _mm_set_ps(-1,-2,-3,-4); ++- ++- float max = src0[0]; ++- float index = 0; ++- __m128 maxValues = _mm_set1_ps(max); ++- __m128 maxValuesIndex = _mm_setzero_ps(); ++- __m128 compareResults; ++- __m128 currentValues; ++- ++- __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4]; ++- __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4]; ++- ++- for(;number < quarterPoints; number++) ++- { ++- currentValues = _mm_loadu_ps(inputPtr); inputPtr += 4; ++- currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues); ++- compareResults = _mm_cmpgt_ps(currentValues, maxValues); ++- maxValuesIndex = _mm_or_ps(_mm_and_ps(compareResults, currentIndexes), ++- _mm_andnot_ps(compareResults, maxValuesIndex)); ++- maxValues = _mm_or_ps(_mm_and_ps(compareResults, currentValues), ++- _mm_andnot_ps(compareResults, maxValues)); ++- } ++- ++- // Calculate the largest value from the remaining 4 points ++- _mm_store_ps(maxValuesBuffer, maxValues); ++- _mm_store_ps(maxIndexesBuffer, maxValuesIndex); ++- ++- for(number = 0; number < 4; number++) ++- { ++- if(maxValuesBuffer[number] > max) ++- { ++- index = maxIndexesBuffer[number]; ++- max = maxValuesBuffer[number]; ++- } ++- else if(maxValuesBuffer[number] == max){ ++- if (index > maxIndexesBuffer[number]) ++- index = maxIndexesBuffer[number]; ++- } ++- } ++- ++- number = quarterPoints * 4; ++- for(;number < num_points; number++) ++- { ++- if(src0[number] > max) ++- { ++- index = number; ++- max = src0[number]; ++- } ++- } ++- target[0] = (uint32_t)index; +++ if (num_points > 0) { +++ uint32_t number = 0; +++ const uint32_t quarterPoints = num_points / 4; +++ +++ float* inputPtr = (float*)src0; +++ +++ __m128 indexIncrementValues = _mm_set1_ps(4); +++ __m128 currentIndexes = _mm_set_ps(-1, -2, -3, -4); +++ +++ float max = src0[0]; +++ float index = 0; +++ __m128 maxValues = _mm_set1_ps(max); +++ __m128 maxValuesIndex = _mm_setzero_ps(); +++ __m128 compareResults; +++ __m128 currentValues; +++ +++ __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4]; +++ __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4]; +++ +++ for (; number < quarterPoints; number++) { +++ currentValues = _mm_loadu_ps(inputPtr); +++ inputPtr += 4; +++ currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues); +++ compareResults = _mm_cmpgt_ps(currentValues, maxValues); +++ maxValuesIndex = _mm_or_ps(_mm_and_ps(compareResults, currentIndexes), +++ _mm_andnot_ps(compareResults, maxValuesIndex)); +++ maxValues = _mm_or_ps(_mm_and_ps(compareResults, currentValues), +++ _mm_andnot_ps(compareResults, maxValues)); ++ } +++ +++ // Calculate the largest value from the remaining 4 points +++ _mm_store_ps(maxValuesBuffer, maxValues); +++ 
_mm_store_ps(maxIndexesBuffer, maxValuesIndex); +++ +++ for (number = 0; number < 4; number++) { +++ if (maxValuesBuffer[number] > max) { +++ index = maxIndexesBuffer[number]; +++ max = maxValuesBuffer[number]; +++ } else if (maxValuesBuffer[number] == max) { +++ if (index > maxIndexesBuffer[number]) +++ index = maxIndexesBuffer[number]; +++ } +++ } +++ +++ number = quarterPoints * 4; +++ for (; number < num_points; number++) { +++ if (src0[number] > max) { +++ index = number; +++ max = src0[number]; +++ } +++ } +++ target[0] = (uint32_t)index; +++ } ++ } ++ ++ #endif /*LV_HAVE_SSE*/ ++diff --git a/kernels/volk/volk_32f_invsqrt_32f.h b/kernels/volk/volk_32f_invsqrt_32f.h ++index e416321..e545515 100644 ++--- a/kernels/volk/volk_32f_invsqrt_32f.h +++++ b/kernels/volk/volk_32f_invsqrt_32f.h ++@@ -30,8 +30,8 @@ ++ * ++ * Dispatcher Prototype ++ * \code ++- * void volk_32f_invsqrt_32f(float* cVector, const float* aVector, unsigned int num_points) ++- * \endcode +++ * void volk_32f_invsqrt_32f(float* cVector, const float* aVector, unsigned int +++ * num_points) \endcode ++ * ++ * \b Inputs ++ * \li aVector: the input vector of floats. ++@@ -66,27 +66,27 @@ ++ #define INCLUDED_volk_32f_invsqrt_32f_a_H ++ ++ #include ++-#include ++ #include +++#include ++ #include ++ ++-static inline float ++-Q_rsqrt(float number) +++static inline float Q_rsqrt(float number) ++ { ++- float x2; ++- const float threehalfs = 1.5F; ++- union f32_to_i32 { ++- int32_t i; ++- float f; ++- } u; ++- ++- x2 = number * 0.5F; ++- u.f = number; ++- u.i = 0x5f3759df - ( u.i >> 1 ); // what the fuck? ++- u.f = u.f * ( threehalfs - ( x2 * u.f * u.f ) ); // 1st iteration ++- //u.f = u.f * ( threehalfs - ( x2 * u.f * u.f ) ); // 2nd iteration, this can be removed ++- ++- return u.f; +++ float x2; +++ const float threehalfs = 1.5F; +++ union f32_to_i32 { +++ int32_t i; +++ float f; +++ } u; +++ +++ x2 = number * 0.5F; +++ u.f = number; +++ u.i = 0x5f3759df - (u.i >> 1); // what the fuck? 
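/* Aside (illustrative, not from the patch): Q_rsqrt above is the classic Quake III
 * fast inverse square root.  Reinterpreting the float bits as an integer makes the
 * bit pattern roughly proportional to log2(x), so 0x5f3759df - (i >> 1) gives an
 * approximation of the bits of x^(-1/2); the multiply on the following line is one
 * Newton-Raphson refinement, y' = y * (3/2 - x/2 * y * y).  A quick accuracy check
 * against libm (assuming Q_rsqrt is in scope) could look like: */
#include <math.h>
#include <stdio.h>

static void rsqrt_check(float x)
{
    float approx = Q_rsqrt(x);    /* bit hack plus one Newton step */
    float exact = 1.0f / sqrtf(x);
    printf("x=%g approx=%g exact=%g rel err=%g\n",
           x, approx, exact, fabsf(approx - exact) / exact);
}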
+++ u.f = u.f * (threehalfs - (x2 * u.f * u.f)); // 1st iteration +++ // u.f = u.f * ( threehalfs - ( x2 * u.f * u.f ) ); // 2nd iteration, this can be +++ // removed +++ +++ return u.f; ++ } ++ ++ #ifdef LV_HAVE_AVX ++@@ -95,24 +95,23 @@ Q_rsqrt(float number) ++ static inline void ++ volk_32f_invsqrt_32f_a_avx(float* cVector, const float* aVector, unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int eighthPoints = num_points / 8; ++- ++- float* cPtr = cVector; ++- const float* aPtr = aVector; ++- __m256 aVal, cVal; ++- for (; number < eighthPoints; number++) { ++- aVal = _mm256_load_ps(aPtr); ++- cVal = _mm256_rsqrt_ps(aVal); ++- _mm256_store_ps(cPtr, cVal); ++- aPtr += 8; ++- cPtr += 8; ++- } ++- ++- number = eighthPoints * 8; ++- for(;number < num_points; number++) ++- *cPtr++ = Q_rsqrt(*aPtr++); ++- +++ unsigned int number = 0; +++ const unsigned int eighthPoints = num_points / 8; +++ +++ float* cPtr = cVector; +++ const float* aPtr = aVector; +++ __m256 aVal, cVal; +++ for (; number < eighthPoints; number++) { +++ aVal = _mm256_load_ps(aPtr); +++ cVal = _mm256_rsqrt_ps(aVal); +++ _mm256_store_ps(cPtr, cVal); +++ aPtr += 8; +++ cPtr += 8; +++ } +++ +++ number = eighthPoints * 8; +++ for (; number < num_points; number++) +++ *cPtr++ = Q_rsqrt(*aPtr++); ++ } ++ #endif /* LV_HAVE_AVX */ ++ ++@@ -123,29 +122,29 @@ volk_32f_invsqrt_32f_a_avx(float* cVector, const float* aVector, unsigned int nu ++ static inline void ++ volk_32f_invsqrt_32f_a_sse(float* cVector, const float* aVector, unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int quarterPoints = num_points / 4; +++ unsigned int number = 0; +++ const unsigned int quarterPoints = num_points / 4; ++ ++- float* cPtr = cVector; ++- const float* aPtr = aVector; +++ float* cPtr = cVector; +++ const float* aPtr = aVector; ++ ++- __m128 aVal, cVal; ++- for(;number < quarterPoints; number++){ +++ __m128 aVal, cVal; +++ for (; number < quarterPoints; number++) { ++ ++- aVal = _mm_load_ps(aPtr); +++ aVal = _mm_load_ps(aPtr); ++ ++- cVal = _mm_rsqrt_ps(aVal); +++ cVal = _mm_rsqrt_ps(aVal); ++ ++- _mm_store_ps(cPtr,cVal); // Store the results back into the C container +++ _mm_store_ps(cPtr, cVal); // Store the results back into the C container ++ ++- aPtr += 4; ++- cPtr += 4; ++- } +++ aPtr += 4; +++ cPtr += 4; +++ } ++ ++- number = quarterPoints * 4; ++- for(;number < num_points; number++) { ++- *cPtr++ = Q_rsqrt(*aPtr++); ++- } +++ number = quarterPoints * 4; +++ for (; number < num_points; number++) { +++ *cPtr++ = Q_rsqrt(*aPtr++); +++ } ++ } ++ #endif /* LV_HAVE_SSE */ ++ ++@@ -156,37 +155,38 @@ volk_32f_invsqrt_32f_a_sse(float* cVector, const float* aVector, unsigned int nu ++ static inline void ++ volk_32f_invsqrt_32f_neon(float* cVector, const float* aVector, unsigned int num_points) ++ { ++- unsigned int number; ++- const unsigned int quarter_points = num_points / 4; ++- ++- float* cPtr = cVector; ++- const float* aPtr = aVector; ++- float32x4_t a_val, c_val; ++- for (number = 0; number < quarter_points; ++number) { ++- a_val = vld1q_f32(aPtr); ++- c_val = vrsqrteq_f32(a_val); ++- vst1q_f32(cPtr, c_val); ++- aPtr += 4; ++- cPtr += 4; ++- } ++- ++- for(number=quarter_points * 4;number < num_points; number++) ++- *cPtr++ = Q_rsqrt(*aPtr++); +++ unsigned int number; +++ const unsigned int quarter_points = num_points / 4; +++ +++ float* cPtr = cVector; +++ const float* aPtr = aVector; +++ float32x4_t a_val, c_val; +++ for (number = 0; number < quarter_points; ++number) { +++ a_val = 
vld1q_f32(aPtr); +++ c_val = vrsqrteq_f32(a_val); +++ vst1q_f32(cPtr, c_val); +++ aPtr += 4; +++ cPtr += 4; +++ } +++ +++ for (number = quarter_points * 4; number < num_points; number++) +++ *cPtr++ = Q_rsqrt(*aPtr++); ++ } ++ #endif /* LV_HAVE_NEON */ ++ ++ ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void ++-volk_32f_invsqrt_32f_generic(float* cVector, const float* aVector, unsigned int num_points) +++static inline void volk_32f_invsqrt_32f_generic(float* cVector, +++ const float* aVector, +++ unsigned int num_points) ++ { ++- float* cPtr = cVector; ++- const float* aPtr = aVector; ++- unsigned int number = 0; ++- for(number = 0; number < num_points; number++) { ++- *cPtr++ = Q_rsqrt(*aPtr++); ++- } +++ float* cPtr = cVector; +++ const float* aPtr = aVector; +++ unsigned int number = 0; +++ for (number = 0; number < num_points; number++) { +++ *cPtr++ = Q_rsqrt(*aPtr++); +++ } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++@@ -196,24 +196,23 @@ volk_32f_invsqrt_32f_generic(float* cVector, const float* aVector, unsigned int ++ static inline void ++ volk_32f_invsqrt_32f_u_avx(float* cVector, const float* aVector, unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int eighthPoints = num_points / 8; ++- ++- float* cPtr = cVector; ++- const float* aPtr = aVector; ++- __m256 aVal, cVal; ++- for (; number < eighthPoints; number++) { ++- aVal = _mm256_loadu_ps(aPtr); ++- cVal = _mm256_rsqrt_ps(aVal); ++- _mm256_storeu_ps(cPtr, cVal); ++- aPtr += 8; ++- cPtr += 8; ++- } ++- ++- number = eighthPoints * 8; ++- for(;number < num_points; number++) ++- *cPtr++ = Q_rsqrt(*aPtr++); ++- +++ unsigned int number = 0; +++ const unsigned int eighthPoints = num_points / 8; +++ +++ float* cPtr = cVector; +++ const float* aPtr = aVector; +++ __m256 aVal, cVal; +++ for (; number < eighthPoints; number++) { +++ aVal = _mm256_loadu_ps(aPtr); +++ cVal = _mm256_rsqrt_ps(aVal); +++ _mm256_storeu_ps(cPtr, cVal); +++ aPtr += 8; +++ cPtr += 8; +++ } +++ +++ number = eighthPoints * 8; +++ for (; number < num_points; number++) +++ *cPtr++ = Q_rsqrt(*aPtr++); ++ } ++ #endif /* LV_HAVE_AVX */ ++ ++diff --git a/kernels/volk/volk_32f_log2_32f.h b/kernels/volk/volk_32f_log2_32f.h ++index 740f89d..47276d4 100644 ++--- a/kernels/volk/volk_32f_log2_32f.h +++++ b/kernels/volk/volk_32f_log2_32f.h ++@@ -92,17 +92,18 @@ ++ #ifndef INCLUDED_volk_32f_log2_32f_a_H ++ #define INCLUDED_volk_32f_log2_32f_a_H ++ ++-#include ++-#include ++ #include ++ #include +++#include +++#include ++ ++ #define LOG_POLY_DEGREE 6 ++ ++ // +-Inf -> +-127.0f in order to match the behaviour of the SIMD kernels ++-static inline float log2f_non_ieee(float f) { ++- float const result = log2f(f); ++- return isinf(result) ? copysignf(127.0f, result) : result; +++static inline float log2f_non_ieee(float f) +++{ +++ float const result = log2f(f); +++ return isinf(result) ? 
copysignf(127.0f, result) : result; ++ } ++ ++ #ifdef LV_HAVE_GENERIC ++@@ -110,12 +111,12 @@ static inline float log2f_non_ieee(float f) { ++ static inline void ++ volk_32f_log2_32f_generic(float* bVector, const float* aVector, unsigned int num_points) ++ { ++- float* bPtr = bVector; ++- const float* aPtr = aVector; ++- unsigned int number = 0; +++ float* bPtr = bVector; +++ const float* aPtr = aVector; +++ unsigned int number = 0; ++ ++- for(number = 0; number < num_points; number++) ++- *bPtr++ = log2f_non_ieee(*aPtr++); +++ for (number = 0; number < num_points; number++) +++ *bPtr++ = log2f_non_ieee(*aPtr++); ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++@@ -123,56 +124,86 @@ volk_32f_log2_32f_generic(float* bVector, const float* aVector, unsigned int num ++ #include ++ ++ #define POLY0_FMAAVX2(x, c0) _mm256_set1_ps(c0) ++-#define POLY1_FMAAVX2(x, c0, c1) _mm256_fmadd_ps(POLY0_FMAAVX2(x, c1), x, _mm256_set1_ps(c0)) ++-#define POLY2_FMAAVX2(x, c0, c1, c2) _mm256_fmadd_ps(POLY1_FMAAVX2(x, c1, c2), x, _mm256_set1_ps(c0)) ++-#define POLY3_FMAAVX2(x, c0, c1, c2, c3) _mm256_fmadd_ps(POLY2_FMAAVX2(x, c1, c2, c3), x, _mm256_set1_ps(c0)) ++-#define POLY4_FMAAVX2(x, c0, c1, c2, c3, c4) _mm256_fmadd_ps(POLY3_FMAAVX2(x, c1, c2, c3, c4), x, _mm256_set1_ps(c0)) ++-#define POLY5_FMAAVX2(x, c0, c1, c2, c3, c4, c5) _mm256_fmadd_ps(POLY4_FMAAVX2(x, c1, c2, c3, c4, c5), x, _mm256_set1_ps(c0)) ++- ++-static inline void ++-volk_32f_log2_32f_a_avx2_fma(float* bVector, const float* aVector, unsigned int num_points) +++#define POLY1_FMAAVX2(x, c0, c1) \ +++ _mm256_fmadd_ps(POLY0_FMAAVX2(x, c1), x, _mm256_set1_ps(c0)) +++#define POLY2_FMAAVX2(x, c0, c1, c2) \ +++ _mm256_fmadd_ps(POLY1_FMAAVX2(x, c1, c2), x, _mm256_set1_ps(c0)) +++#define POLY3_FMAAVX2(x, c0, c1, c2, c3) \ +++ _mm256_fmadd_ps(POLY2_FMAAVX2(x, c1, c2, c3), x, _mm256_set1_ps(c0)) +++#define POLY4_FMAAVX2(x, c0, c1, c2, c3, c4) \ +++ _mm256_fmadd_ps(POLY3_FMAAVX2(x, c1, c2, c3, c4), x, _mm256_set1_ps(c0)) +++#define POLY5_FMAAVX2(x, c0, c1, c2, c3, c4, c5) \ +++ _mm256_fmadd_ps(POLY4_FMAAVX2(x, c1, c2, c3, c4, c5), x, _mm256_set1_ps(c0)) +++ +++static inline void volk_32f_log2_32f_a_avx2_fma(float* bVector, +++ const float* aVector, +++ unsigned int num_points) ++ { ++- float* bPtr = bVector; ++- const float* aPtr = aVector; +++ float* bPtr = bVector; +++ const float* aPtr = aVector; ++ ++- unsigned int number = 0; ++- const unsigned int eighthPoints = num_points / 8; +++ unsigned int number = 0; +++ const unsigned int eighthPoints = num_points / 8; ++ ++- __m256 aVal, bVal, mantissa, frac, leadingOne; ++- __m256i bias, exp; +++ __m256 aVal, bVal, mantissa, frac, leadingOne; +++ __m256i bias, exp; ++ ++- for(;number < eighthPoints; number++){ +++ for (; number < eighthPoints; number++) { ++ ++- aVal = _mm256_load_ps(aPtr); ++- bias = _mm256_set1_epi32(127); ++- leadingOne = _mm256_set1_ps(1.0f); ++- exp = _mm256_sub_epi32(_mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal), _mm256_set1_epi32(0x7f800000)), 23), bias); ++- bVal = _mm256_cvtepi32_ps(exp); +++ aVal = _mm256_load_ps(aPtr); +++ bias = _mm256_set1_epi32(127); +++ leadingOne = _mm256_set1_ps(1.0f); +++ exp = _mm256_sub_epi32( +++ _mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal), +++ _mm256_set1_epi32(0x7f800000)), +++ 23), +++ bias); +++ bVal = _mm256_cvtepi32_ps(exp); ++ ++- // Now to extract mantissa ++- frac = _mm256_or_ps(leadingOne, _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff)))); +++ // Now to extract mantissa +++ frac = _mm256_or_ps( +++ 
leadingOne, +++ _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff)))); ++ ++ #if LOG_POLY_DEGREE == 6 ++- mantissa = POLY5_FMAAVX2( frac, 3.1157899f, -3.3241990f, 2.5988452f, -1.2315303f, 3.1821337e-1f, -3.4436006e-2f); +++ mantissa = POLY5_FMAAVX2(frac, +++ 3.1157899f, +++ -3.3241990f, +++ 2.5988452f, +++ -1.2315303f, +++ 3.1821337e-1f, +++ -3.4436006e-2f); ++ #elif LOG_POLY_DEGREE == 5 ++- mantissa = POLY4_FMAAVX2( frac, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f); +++ mantissa = POLY4_FMAAVX2(frac, +++ 2.8882704548164776201f, +++ -2.52074962577807006663f, +++ 1.48116647521213171641f, +++ -0.465725644288844778798f, +++ 0.0596515482674574969533f); ++ #elif LOG_POLY_DEGREE == 4 ++- mantissa = POLY3_FMAAVX2( frac, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f); +++ mantissa = POLY3_FMAAVX2(frac, +++ 2.61761038894603480148f, +++ -1.75647175389045657003f, +++ 0.688243882994381274313f, +++ -0.107254423828329604454f); ++ #elif LOG_POLY_DEGREE == 3 ++- mantissa = POLY2_FMAAVX2( frac, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f); +++ mantissa = POLY2_FMAAVX2(frac, +++ 2.28330284476918490682f, +++ -1.04913055217340124191f, +++ 0.204446009836232697516f); ++ #else ++ #error ++ #endif ++ ++- bVal = _mm256_fmadd_ps(mantissa, _mm256_sub_ps(frac, leadingOne), bVal); ++- _mm256_store_ps(bPtr, bVal); +++ bVal = _mm256_fmadd_ps(mantissa, _mm256_sub_ps(frac, leadingOne), bVal); +++ _mm256_store_ps(bPtr, bVal); ++ ++- aPtr += 8; ++- bPtr += 8; ++- } +++ aPtr += 8; +++ bPtr += 8; +++ } ++ ++- number = eighthPoints * 8; ++- volk_32f_log2_32f_generic(bPtr, aPtr, num_points-number); +++ number = eighthPoints * 8; +++ volk_32f_log2_32f_generic(bPtr, aPtr, num_points - number); ++ } ++ ++ #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for aligned */ ++@@ -181,56 +212,86 @@ volk_32f_log2_32f_a_avx2_fma(float* bVector, const float* aVector, unsigned int ++ #include ++ ++ #define POLY0_AVX2(x, c0) _mm256_set1_ps(c0) ++-#define POLY1_AVX2(x, c0, c1) _mm256_add_ps(_mm256_mul_ps(POLY0_AVX2(x, c1), x), _mm256_set1_ps(c0)) ++-#define POLY2_AVX2(x, c0, c1, c2) _mm256_add_ps(_mm256_mul_ps(POLY1_AVX2(x, c1, c2), x), _mm256_set1_ps(c0)) ++-#define POLY3_AVX2(x, c0, c1, c2, c3) _mm256_add_ps(_mm256_mul_ps(POLY2_AVX2(x, c1, c2, c3), x), _mm256_set1_ps(c0)) ++-#define POLY4_AVX2(x, c0, c1, c2, c3, c4) _mm256_add_ps(_mm256_mul_ps(POLY3_AVX2(x, c1, c2, c3, c4), x), _mm256_set1_ps(c0)) ++-#define POLY5_AVX2(x, c0, c1, c2, c3, c4, c5) _mm256_add_ps(_mm256_mul_ps(POLY4_AVX2(x, c1, c2, c3, c4, c5), x), _mm256_set1_ps(c0)) +++#define POLY1_AVX2(x, c0, c1) \ +++ _mm256_add_ps(_mm256_mul_ps(POLY0_AVX2(x, c1), x), _mm256_set1_ps(c0)) +++#define POLY2_AVX2(x, c0, c1, c2) \ +++ _mm256_add_ps(_mm256_mul_ps(POLY1_AVX2(x, c1, c2), x), _mm256_set1_ps(c0)) +++#define POLY3_AVX2(x, c0, c1, c2, c3) \ +++ _mm256_add_ps(_mm256_mul_ps(POLY2_AVX2(x, c1, c2, c3), x), _mm256_set1_ps(c0)) +++#define POLY4_AVX2(x, c0, c1, c2, c3, c4) \ +++ _mm256_add_ps(_mm256_mul_ps(POLY3_AVX2(x, c1, c2, c3, c4), x), _mm256_set1_ps(c0)) +++#define POLY5_AVX2(x, c0, c1, c2, c3, c4, c5) \ +++ _mm256_add_ps(_mm256_mul_ps(POLY4_AVX2(x, c1, c2, c3, c4, c5), x), _mm256_set1_ps(c0)) ++ ++ static inline void ++ volk_32f_log2_32f_a_avx2(float* bVector, const float* aVector, unsigned int num_points) ++ { ++- float* bPtr = bVector; ++- const float* aPtr = aVector; +++ float* bPtr = bVector; +++ const 
float* aPtr = aVector; ++ ++- unsigned int number = 0; ++- const unsigned int eighthPoints = num_points / 8; +++ unsigned int number = 0; +++ const unsigned int eighthPoints = num_points / 8; ++ ++- __m256 aVal, bVal, mantissa, frac, leadingOne; ++- __m256i bias, exp; +++ __m256 aVal, bVal, mantissa, frac, leadingOne; +++ __m256i bias, exp; ++ ++- for(;number < eighthPoints; number++){ +++ for (; number < eighthPoints; number++) { ++ ++- aVal = _mm256_load_ps(aPtr); ++- bias = _mm256_set1_epi32(127); ++- leadingOne = _mm256_set1_ps(1.0f); ++- exp = _mm256_sub_epi32(_mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal), _mm256_set1_epi32(0x7f800000)), 23), bias); ++- bVal = _mm256_cvtepi32_ps(exp); +++ aVal = _mm256_load_ps(aPtr); +++ bias = _mm256_set1_epi32(127); +++ leadingOne = _mm256_set1_ps(1.0f); +++ exp = _mm256_sub_epi32( +++ _mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal), +++ _mm256_set1_epi32(0x7f800000)), +++ 23), +++ bias); +++ bVal = _mm256_cvtepi32_ps(exp); ++ ++- // Now to extract mantissa ++- frac = _mm256_or_ps(leadingOne, _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff)))); +++ // Now to extract mantissa +++ frac = _mm256_or_ps( +++ leadingOne, +++ _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff)))); ++ ++ #if LOG_POLY_DEGREE == 6 ++- mantissa = POLY5_AVX2( frac, 3.1157899f, -3.3241990f, 2.5988452f, -1.2315303f, 3.1821337e-1f, -3.4436006e-2f); +++ mantissa = POLY5_AVX2(frac, +++ 3.1157899f, +++ -3.3241990f, +++ 2.5988452f, +++ -1.2315303f, +++ 3.1821337e-1f, +++ -3.4436006e-2f); ++ #elif LOG_POLY_DEGREE == 5 ++- mantissa = POLY4_AVX2( frac, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f); +++ mantissa = POLY4_AVX2(frac, +++ 2.8882704548164776201f, +++ -2.52074962577807006663f, +++ 1.48116647521213171641f, +++ -0.465725644288844778798f, +++ 0.0596515482674574969533f); ++ #elif LOG_POLY_DEGREE == 4 ++- mantissa = POLY3_AVX2( frac, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f); +++ mantissa = POLY3_AVX2(frac, +++ 2.61761038894603480148f, +++ -1.75647175389045657003f, +++ 0.688243882994381274313f, +++ -0.107254423828329604454f); ++ #elif LOG_POLY_DEGREE == 3 ++- mantissa = POLY2_AVX2( frac, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f); +++ mantissa = POLY2_AVX2(frac, +++ 2.28330284476918490682f, +++ -1.04913055217340124191f, +++ 0.204446009836232697516f); ++ #else ++ #error ++ #endif ++ ++- bVal = _mm256_add_ps(_mm256_mul_ps(mantissa, _mm256_sub_ps(frac, leadingOne)), bVal); ++- _mm256_store_ps(bPtr, bVal); +++ bVal = +++ _mm256_add_ps(_mm256_mul_ps(mantissa, _mm256_sub_ps(frac, leadingOne)), bVal); +++ _mm256_store_ps(bPtr, bVal); ++ ++- aPtr += 8; ++- bPtr += 8; ++- } +++ aPtr += 8; +++ bPtr += 8; +++ } ++ ++- number = eighthPoints * 8; ++- volk_32f_log2_32f_generic(bPtr, aPtr, num_points-number); +++ number = eighthPoints * 8; +++ volk_32f_log2_32f_generic(bPtr, aPtr, num_points - number); ++ } ++ ++ #endif /* LV_HAVE_AVX2 for aligned */ ++@@ -241,54 +302,79 @@ volk_32f_log2_32f_a_avx2(float* bVector, const float* aVector, unsigned int num_ ++ #define POLY0(x, c0) _mm_set1_ps(c0) ++ #define POLY1(x, c0, c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), _mm_set1_ps(c0)) ++ #define POLY2(x, c0, c1, c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), _mm_set1_ps(c0)) ++-#define POLY3(x, c0, c1, c2, c3) _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), 
_mm_set1_ps(c0)) ++-#define POLY4(x, c0, c1, c2, c3, c4) _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0)) ++-#define POLY5(x, c0, c1, c2, c3, c4, c5) _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0)) +++#define POLY3(x, c0, c1, c2, c3) \ +++ _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0)) +++#define POLY4(x, c0, c1, c2, c3, c4) \ +++ _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0)) +++#define POLY5(x, c0, c1, c2, c3, c4, c5) \ +++ _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0)) ++ ++ static inline void ++ volk_32f_log2_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int num_points) ++ { ++- float* bPtr = bVector; ++- const float* aPtr = aVector; +++ float* bPtr = bVector; +++ const float* aPtr = aVector; ++ ++- unsigned int number = 0; ++- const unsigned int quarterPoints = num_points / 4; +++ unsigned int number = 0; +++ const unsigned int quarterPoints = num_points / 4; ++ ++- __m128 aVal, bVal, mantissa, frac, leadingOne; ++- __m128i bias, exp; +++ __m128 aVal, bVal, mantissa, frac, leadingOne; +++ __m128i bias, exp; ++ ++- for(;number < quarterPoints; number++){ +++ for (; number < quarterPoints; number++) { ++ ++- aVal = _mm_load_ps(aPtr); ++- bias = _mm_set1_epi32(127); ++- leadingOne = _mm_set1_ps(1.0f); ++- exp = _mm_sub_epi32(_mm_srli_epi32(_mm_and_si128(_mm_castps_si128(aVal), _mm_set1_epi32(0x7f800000)), 23), bias); ++- bVal = _mm_cvtepi32_ps(exp); +++ aVal = _mm_load_ps(aPtr); +++ bias = _mm_set1_epi32(127); +++ leadingOne = _mm_set1_ps(1.0f); +++ exp = _mm_sub_epi32( +++ _mm_srli_epi32( +++ _mm_and_si128(_mm_castps_si128(aVal), _mm_set1_epi32(0x7f800000)), 23), +++ bias); +++ bVal = _mm_cvtepi32_ps(exp); ++ ++- // Now to extract mantissa ++- frac = _mm_or_ps(leadingOne, _mm_and_ps(aVal, _mm_castsi128_ps(_mm_set1_epi32(0x7fffff)))); +++ // Now to extract mantissa +++ frac = _mm_or_ps(leadingOne, +++ _mm_and_ps(aVal, _mm_castsi128_ps(_mm_set1_epi32(0x7fffff)))); ++ ++ #if LOG_POLY_DEGREE == 6 ++- mantissa = POLY5( frac, 3.1157899f, -3.3241990f, 2.5988452f, -1.2315303f, 3.1821337e-1f, -3.4436006e-2f); +++ mantissa = POLY5(frac, +++ 3.1157899f, +++ -3.3241990f, +++ 2.5988452f, +++ -1.2315303f, +++ 3.1821337e-1f, +++ -3.4436006e-2f); ++ #elif LOG_POLY_DEGREE == 5 ++- mantissa = POLY4( frac, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f); +++ mantissa = POLY4(frac, +++ 2.8882704548164776201f, +++ -2.52074962577807006663f, +++ 1.48116647521213171641f, +++ -0.465725644288844778798f, +++ 0.0596515482674574969533f); ++ #elif LOG_POLY_DEGREE == 4 ++- mantissa = POLY3( frac, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f); +++ mantissa = POLY3(frac, +++ 2.61761038894603480148f, +++ -1.75647175389045657003f, +++ 0.688243882994381274313f, +++ -0.107254423828329604454f); ++ #elif LOG_POLY_DEGREE == 3 ++- mantissa = POLY2( frac, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f); +++ mantissa = POLY2(frac, +++ 2.28330284476918490682f, +++ -1.04913055217340124191f, +++ 0.204446009836232697516f); ++ #else ++ #error ++ #endif ++ ++- bVal = _mm_add_ps(bVal, _mm_mul_ps(mantissa, _mm_sub_ps(frac, leadingOne))); ++- _mm_store_ps(bPtr, bVal); +++ bVal = _mm_add_ps(bVal, _mm_mul_ps(mantissa, _mm_sub_ps(frac, leadingOne))); +++ _mm_store_ps(bPtr, bVal); ++ ++- aPtr += 4; ++- bPtr += 4; ++- } +++ aPtr += 4; +++ bPtr += 4; +++ } 
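/* Aside (illustrative scalar sketch, not from the patch): per element, the SIMD
 * loop above splits the IEEE-754 bits into the unbiased exponent and a mantissa in
 * [1,2), then adds a polynomial approximation of log2(mantissa); the POLY* macros
 * are Horner's rule written with intrinsics.  Using the LOG_POLY_DEGREE == 3
 * coefficients from this kernel: */
#include <stdint.h>
#include <string.h>

static float log2_scalar_sketch(float x) /* assumes x is positive and finite */
{
    uint32_t bits;
    memcpy(&bits, &x, sizeof bits);
    float exponent = (float)((int)((bits >> 23) & 0xff) - 127);
    uint32_t mbits = (bits & 0x007fffffu) | 0x3f800000u; /* mantissa in [1,2) */
    float m;
    memcpy(&m, &mbits, sizeof m);
    float poly = 2.28330284476918490682f +
                 m * (-1.04913055217340124191f + m * 0.204446009836232697516f);
    return exponent + poly * (m - 1.0f);
}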
++ ++- number = quarterPoints * 4; ++- volk_32f_log2_32f_generic(bPtr, aPtr, num_points-number); +++ number = quarterPoints * 4; +++ volk_32f_log2_32f_generic(bPtr, aPtr, num_points - number); ++ } ++ ++ #endif /* LV_HAVE_SSE4_1 for aligned */ ++@@ -297,91 +383,91 @@ volk_32f_log2_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int nu ++ #include ++ ++ /* these macros allow us to embed logs in other kernels */ ++-#define VLOG2Q_NEON_PREAMBLE() \ ++- int32x4_t one = vdupq_n_s32(0x000800000); \ ++- /* minimax polynomial */ \ ++- float32x4_t p0 = vdupq_n_f32(-3.0400402727048585); \ ++- float32x4_t p1 = vdupq_n_f32(6.1129631282966113); \ ++- float32x4_t p2 = vdupq_n_f32(-5.3419892024633207); \ ++- float32x4_t p3 = vdupq_n_f32(3.2865287703753912); \ ++- float32x4_t p4 = vdupq_n_f32(-1.2669182593441635); \ ++- float32x4_t p5 = vdupq_n_f32(0.2751487703421256); \ ++- float32x4_t p6 = vdupq_n_f32(-0.0256910888150985); \ ++- int32x4_t exp_mask = vdupq_n_s32(0x7f800000); \ ++- int32x4_t sig_mask = vdupq_n_s32(0x007fffff); \ ++- int32x4_t exp_bias = vdupq_n_s32(127); ++- ++- ++-#define VLOG2Q_NEON_F32(log2_approx, aval) \ ++- int32x4_t exponent_i = vandq_s32(aval, exp_mask); \ ++- int32x4_t significand_i = vandq_s32(aval, sig_mask); \ ++- exponent_i = vshrq_n_s32(exponent_i, 23); \ ++- \ ++- /* extract the exponent and significand \ ++- we can treat this as fixed point to save ~9% on the \ ++- conversion + float add */ \ ++- significand_i = vorrq_s32(one, significand_i); \ ++- float32x4_t significand_f = vcvtq_n_f32_s32(significand_i,23); \ ++- /* debias the exponent and convert to float */ \ ++- exponent_i = vsubq_s32(exponent_i, exp_bias); \ ++- float32x4_t exponent_f = vcvtq_f32_s32(exponent_i); \ ++- \ ++- /* put the significand through a polynomial fit of log2(x) [1,2] \ ++- add the result to the exponent */ \ ++- log2_approx = vaddq_f32(exponent_f, p0); /* p0 */ \ ++- float32x4_t tmp1 = vmulq_f32(significand_f, p1); /* p1 * x */ \ ++- log2_approx = vaddq_f32(log2_approx, tmp1); \ ++- float32x4_t sig_2 = vmulq_f32(significand_f, significand_f); /* x^2 */ \ ++- tmp1 = vmulq_f32(sig_2, p2); /* p2 * x^2 */ \ ++- log2_approx = vaddq_f32(log2_approx, tmp1); \ ++- \ ++- float32x4_t sig_3 = vmulq_f32(sig_2, significand_f); /* x^3 */ \ ++- tmp1 = vmulq_f32(sig_3, p3); /* p3 * x^3 */ \ ++- log2_approx = vaddq_f32(log2_approx, tmp1); \ ++- float32x4_t sig_4 = vmulq_f32(sig_2, sig_2); /* x^4 */ \ ++- tmp1 = vmulq_f32(sig_4, p4); /* p4 * x^4 */ \ ++- log2_approx = vaddq_f32(log2_approx, tmp1); \ ++- float32x4_t sig_5 = vmulq_f32(sig_3, sig_2); /* x^5 */ \ ++- tmp1 = vmulq_f32(sig_5, p5); /* p5 * x^5 */ \ ++- log2_approx = vaddq_f32(log2_approx, tmp1); \ ++- float32x4_t sig_6 = vmulq_f32(sig_3, sig_3); /* x^6 */ \ ++- tmp1 = vmulq_f32(sig_6, p6); /* p6 * x^6 */ \ ++- log2_approx = vaddq_f32(log2_approx, tmp1); +++#define VLOG2Q_NEON_PREAMBLE() \ +++ int32x4_t one = vdupq_n_s32(0x000800000); \ +++ /* minimax polynomial */ \ +++ float32x4_t p0 = vdupq_n_f32(-3.0400402727048585); \ +++ float32x4_t p1 = vdupq_n_f32(6.1129631282966113); \ +++ float32x4_t p2 = vdupq_n_f32(-5.3419892024633207); \ +++ float32x4_t p3 = vdupq_n_f32(3.2865287703753912); \ +++ float32x4_t p4 = vdupq_n_f32(-1.2669182593441635); \ +++ float32x4_t p5 = vdupq_n_f32(0.2751487703421256); \ +++ float32x4_t p6 = vdupq_n_f32(-0.0256910888150985); \ +++ int32x4_t exp_mask = vdupq_n_s32(0x7f800000); \ +++ int32x4_t sig_mask = vdupq_n_s32(0x007fffff); \ +++ int32x4_t exp_bias = vdupq_n_s32(127); +++ +++ +++#define 
VLOG2Q_NEON_F32(log2_approx, aval) \ +++ int32x4_t exponent_i = vandq_s32(aval, exp_mask); \ +++ int32x4_t significand_i = vandq_s32(aval, sig_mask); \ +++ exponent_i = vshrq_n_s32(exponent_i, 23); \ +++ \ +++ /* extract the exponent and significand \ +++ we can treat this as fixed point to save ~9% on the \ +++ conversion + float add */ \ +++ significand_i = vorrq_s32(one, significand_i); \ +++ float32x4_t significand_f = vcvtq_n_f32_s32(significand_i, 23); \ +++ /* debias the exponent and convert to float */ \ +++ exponent_i = vsubq_s32(exponent_i, exp_bias); \ +++ float32x4_t exponent_f = vcvtq_f32_s32(exponent_i); \ +++ \ +++ /* put the significand through a polynomial fit of log2(x) [1,2] \ +++ add the result to the exponent */ \ +++ log2_approx = vaddq_f32(exponent_f, p0); /* p0 */ \ +++ float32x4_t tmp1 = vmulq_f32(significand_f, p1); /* p1 * x */ \ +++ log2_approx = vaddq_f32(log2_approx, tmp1); \ +++ float32x4_t sig_2 = vmulq_f32(significand_f, significand_f); /* x^2 */ \ +++ tmp1 = vmulq_f32(sig_2, p2); /* p2 * x^2 */ \ +++ log2_approx = vaddq_f32(log2_approx, tmp1); \ +++ \ +++ float32x4_t sig_3 = vmulq_f32(sig_2, significand_f); /* x^3 */ \ +++ tmp1 = vmulq_f32(sig_3, p3); /* p3 * x^3 */ \ +++ log2_approx = vaddq_f32(log2_approx, tmp1); \ +++ float32x4_t sig_4 = vmulq_f32(sig_2, sig_2); /* x^4 */ \ +++ tmp1 = vmulq_f32(sig_4, p4); /* p4 * x^4 */ \ +++ log2_approx = vaddq_f32(log2_approx, tmp1); \ +++ float32x4_t sig_5 = vmulq_f32(sig_3, sig_2); /* x^5 */ \ +++ tmp1 = vmulq_f32(sig_5, p5); /* p5 * x^5 */ \ +++ log2_approx = vaddq_f32(log2_approx, tmp1); \ +++ float32x4_t sig_6 = vmulq_f32(sig_3, sig_3); /* x^6 */ \ +++ tmp1 = vmulq_f32(sig_6, p6); /* p6 * x^6 */ \ +++ log2_approx = vaddq_f32(log2_approx, tmp1); ++ ++ static inline void ++ volk_32f_log2_32f_neon(float* bVector, const float* aVector, unsigned int num_points) ++ { ++- float* bPtr = bVector; ++- const float* aPtr = aVector; ++- unsigned int number; ++- const unsigned int quarterPoints = num_points / 4; +++ float* bPtr = bVector; +++ const float* aPtr = aVector; +++ unsigned int number; +++ const unsigned int quarterPoints = num_points / 4; ++ ++- int32x4_t aval; ++- float32x4_t log2_approx; +++ int32x4_t aval; +++ float32x4_t log2_approx; ++ ++- VLOG2Q_NEON_PREAMBLE() ++- // lms ++- //p0 = vdupq_n_f32(-1.649132280361871); ++- //p1 = vdupq_n_f32(1.995047138579499); ++- //p2 = vdupq_n_f32(-0.336914839219728); +++ VLOG2Q_NEON_PREAMBLE() +++ // lms +++ // p0 = vdupq_n_f32(-1.649132280361871); +++ // p1 = vdupq_n_f32(1.995047138579499); +++ // p2 = vdupq_n_f32(-0.336914839219728); ++ ++- // keep in mind a single precision float is represented as ++- // (-1)^sign * 2^exp * 1.significand, so the log2 is ++- // log2(2^exp * sig) = exponent + log2(1 + significand/(1<<23) ++- for(number = 0; number < quarterPoints; ++number){ ++- // load float in to an int register without conversion ++- aval = vld1q_s32((int*)aPtr); +++ // keep in mind a single precision float is represented as +++ // (-1)^sign * 2^exp * 1.significand, so the log2 is +++ // log2(2^exp * sig) = exponent + log2(1 + significand/(1<<23) +++ for (number = 0; number < quarterPoints; ++number) { +++ // load float in to an int register without conversion +++ aval = vld1q_s32((int*)aPtr); ++ ++- VLOG2Q_NEON_F32(log2_approx, aval) +++ VLOG2Q_NEON_F32(log2_approx, aval) ++ ++- vst1q_f32(bPtr, log2_approx); +++ vst1q_f32(bPtr, log2_approx); ++ ++- aPtr += 4; ++- bPtr += 4; ++- } +++ aPtr += 4; +++ bPtr += 4; +++ } ++ ++- number = quarterPoints * 4; ++- 
volk_32f_log2_32f_generic(bPtr, aPtr, num_points-number); +++ number = quarterPoints * 4; +++ volk_32f_log2_32f_generic(bPtr, aPtr, num_points - number); ++ } ++ ++ #endif /* LV_HAVE_NEON */ ++@@ -398,14 +484,14 @@ volk_32f_log2_32f_neon(float* bVector, const float* aVector, unsigned int num_po ++ static inline void ++ volk_32f_log2_32f_u_generic(float* bVector, const float* aVector, unsigned int num_points) ++ { ++- float* bPtr = bVector; ++- const float* aPtr = aVector; ++- unsigned int number = 0; ++- ++- for(number = 0; number < num_points; number++){ ++- float const result = log2f(*aPtr++); ++- *bPtr++ = isinf(result) ? -127.0f : result; ++- } +++ float* bPtr = bVector; +++ const float* aPtr = aVector; +++ unsigned int number = 0; +++ +++ for (number = 0; number < num_points; number++) { +++ float const result = log2f(*aPtr++); +++ *bPtr++ = isinf(result) ? -127.0f : result; +++ } ++ } ++ ++ #endif /* LV_HAVE_GENERIC */ ++@@ -417,54 +503,79 @@ volk_32f_log2_32f_u_generic(float* bVector, const float* aVector, unsigned int n ++ #define POLY0(x, c0) _mm_set1_ps(c0) ++ #define POLY1(x, c0, c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), _mm_set1_ps(c0)) ++ #define POLY2(x, c0, c1, c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), _mm_set1_ps(c0)) ++-#define POLY3(x, c0, c1, c2, c3) _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0)) ++-#define POLY4(x, c0, c1, c2, c3, c4) _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0)) ++-#define POLY5(x, c0, c1, c2, c3, c4, c5) _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0)) +++#define POLY3(x, c0, c1, c2, c3) \ +++ _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0)) +++#define POLY4(x, c0, c1, c2, c3, c4) \ +++ _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0)) +++#define POLY5(x, c0, c1, c2, c3, c4, c5) \ +++ _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0)) ++ ++ static inline void ++ volk_32f_log2_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int num_points) ++ { ++- float* bPtr = bVector; ++- const float* aPtr = aVector; +++ float* bPtr = bVector; +++ const float* aPtr = aVector; ++ ++- unsigned int number = 0; ++- const unsigned int quarterPoints = num_points / 4; +++ unsigned int number = 0; +++ const unsigned int quarterPoints = num_points / 4; ++ ++- __m128 aVal, bVal, mantissa, frac, leadingOne; ++- __m128i bias, exp; +++ __m128 aVal, bVal, mantissa, frac, leadingOne; +++ __m128i bias, exp; ++ ++- for(;number < quarterPoints; number++){ +++ for (; number < quarterPoints; number++) { ++ ++- aVal = _mm_loadu_ps(aPtr); ++- bias = _mm_set1_epi32(127); ++- leadingOne = _mm_set1_ps(1.0f); ++- exp = _mm_sub_epi32(_mm_srli_epi32(_mm_and_si128(_mm_castps_si128(aVal), _mm_set1_epi32(0x7f800000)), 23), bias); ++- bVal = _mm_cvtepi32_ps(exp); +++ aVal = _mm_loadu_ps(aPtr); +++ bias = _mm_set1_epi32(127); +++ leadingOne = _mm_set1_ps(1.0f); +++ exp = _mm_sub_epi32( +++ _mm_srli_epi32( +++ _mm_and_si128(_mm_castps_si128(aVal), _mm_set1_epi32(0x7f800000)), 23), +++ bias); +++ bVal = _mm_cvtepi32_ps(exp); ++ ++- // Now to extract mantissa ++- frac = _mm_or_ps(leadingOne, _mm_and_ps(aVal, _mm_castsi128_ps(_mm_set1_epi32(0x7fffff)))); +++ // Now to extract mantissa +++ frac = _mm_or_ps(leadingOne, +++ _mm_and_ps(aVal, _mm_castsi128_ps(_mm_set1_epi32(0x7fffff)))); ++ ++ #if LOG_POLY_DEGREE == 6 ++- mantissa = POLY5( frac, 3.1157899f, -3.3241990f, 2.5988452f, -1.2315303f, 3.1821337e-1f, -3.4436006e-2f); +++ mantissa = POLY5(frac, +++ 3.1157899f, 
+++ -3.3241990f, +++ 2.5988452f, +++ -1.2315303f, +++ 3.1821337e-1f, +++ -3.4436006e-2f); ++ #elif LOG_POLY_DEGREE == 5 ++- mantissa = POLY4( frac, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f); +++ mantissa = POLY4(frac, +++ 2.8882704548164776201f, +++ -2.52074962577807006663f, +++ 1.48116647521213171641f, +++ -0.465725644288844778798f, +++ 0.0596515482674574969533f); ++ #elif LOG_POLY_DEGREE == 4 ++- mantissa = POLY3( frac, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f); +++ mantissa = POLY3(frac, +++ 2.61761038894603480148f, +++ -1.75647175389045657003f, +++ 0.688243882994381274313f, +++ -0.107254423828329604454f); ++ #elif LOG_POLY_DEGREE == 3 ++- mantissa = POLY2( frac, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f); +++ mantissa = POLY2(frac, +++ 2.28330284476918490682f, +++ -1.04913055217340124191f, +++ 0.204446009836232697516f); ++ #else ++ #error ++ #endif ++ ++- bVal = _mm_add_ps(bVal, _mm_mul_ps(mantissa, _mm_sub_ps(frac, leadingOne))); ++- _mm_storeu_ps(bPtr, bVal); +++ bVal = _mm_add_ps(bVal, _mm_mul_ps(mantissa, _mm_sub_ps(frac, leadingOne))); +++ _mm_storeu_ps(bPtr, bVal); ++ ++- aPtr += 4; ++- bPtr += 4; ++- } +++ aPtr += 4; +++ bPtr += 4; +++ } ++ ++- number = quarterPoints * 4; ++- volk_32f_log2_32f_u_generic(bPtr, aPtr, num_points-number); +++ number = quarterPoints * 4; +++ volk_32f_log2_32f_u_generic(bPtr, aPtr, num_points - number); ++ } ++ ++ #endif /* LV_HAVE_SSE4_1 for unaligned */ ++@@ -473,56 +584,86 @@ volk_32f_log2_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int nu ++ #include ++ ++ #define POLY0_FMAAVX2(x, c0) _mm256_set1_ps(c0) ++-#define POLY1_FMAAVX2(x, c0, c1) _mm256_fmadd_ps(POLY0_FMAAVX2(x, c1), x, _mm256_set1_ps(c0)) ++-#define POLY2_FMAAVX2(x, c0, c1, c2) _mm256_fmadd_ps(POLY1_FMAAVX2(x, c1, c2), x, _mm256_set1_ps(c0)) ++-#define POLY3_FMAAVX2(x, c0, c1, c2, c3) _mm256_fmadd_ps(POLY2_FMAAVX2(x, c1, c2, c3), x, _mm256_set1_ps(c0)) ++-#define POLY4_FMAAVX2(x, c0, c1, c2, c3, c4) _mm256_fmadd_ps(POLY3_FMAAVX2(x, c1, c2, c3, c4), x, _mm256_set1_ps(c0)) ++-#define POLY5_FMAAVX2(x, c0, c1, c2, c3, c4, c5) _mm256_fmadd_ps(POLY4_FMAAVX2(x, c1, c2, c3, c4, c5), x, _mm256_set1_ps(c0)) ++- ++-static inline void ++-volk_32f_log2_32f_u_avx2_fma(float* bVector, const float* aVector, unsigned int num_points) +++#define POLY1_FMAAVX2(x, c0, c1) \ +++ _mm256_fmadd_ps(POLY0_FMAAVX2(x, c1), x, _mm256_set1_ps(c0)) +++#define POLY2_FMAAVX2(x, c0, c1, c2) \ +++ _mm256_fmadd_ps(POLY1_FMAAVX2(x, c1, c2), x, _mm256_set1_ps(c0)) +++#define POLY3_FMAAVX2(x, c0, c1, c2, c3) \ +++ _mm256_fmadd_ps(POLY2_FMAAVX2(x, c1, c2, c3), x, _mm256_set1_ps(c0)) +++#define POLY4_FMAAVX2(x, c0, c1, c2, c3, c4) \ +++ _mm256_fmadd_ps(POLY3_FMAAVX2(x, c1, c2, c3, c4), x, _mm256_set1_ps(c0)) +++#define POLY5_FMAAVX2(x, c0, c1, c2, c3, c4, c5) \ +++ _mm256_fmadd_ps(POLY4_FMAAVX2(x, c1, c2, c3, c4, c5), x, _mm256_set1_ps(c0)) +++ +++static inline void volk_32f_log2_32f_u_avx2_fma(float* bVector, +++ const float* aVector, +++ unsigned int num_points) ++ { ++- float* bPtr = bVector; ++- const float* aPtr = aVector; +++ float* bPtr = bVector; +++ const float* aPtr = aVector; ++ ++- unsigned int number = 0; ++- const unsigned int eighthPoints = num_points / 8; +++ unsigned int number = 0; +++ const unsigned int eighthPoints = num_points / 8; ++ ++- __m256 aVal, bVal, mantissa, frac, leadingOne; ++- __m256i bias, exp; +++ 
__m256 aVal, bVal, mantissa, frac, leadingOne; +++ __m256i bias, exp; ++ ++- for(;number < eighthPoints; number++){ +++ for (; number < eighthPoints; number++) { ++ ++- aVal = _mm256_loadu_ps(aPtr); ++- bias = _mm256_set1_epi32(127); ++- leadingOne = _mm256_set1_ps(1.0f); ++- exp = _mm256_sub_epi32(_mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal), _mm256_set1_epi32(0x7f800000)), 23), bias); ++- bVal = _mm256_cvtepi32_ps(exp); +++ aVal = _mm256_loadu_ps(aPtr); +++ bias = _mm256_set1_epi32(127); +++ leadingOne = _mm256_set1_ps(1.0f); +++ exp = _mm256_sub_epi32( +++ _mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal), +++ _mm256_set1_epi32(0x7f800000)), +++ 23), +++ bias); +++ bVal = _mm256_cvtepi32_ps(exp); ++ ++- // Now to extract mantissa ++- frac = _mm256_or_ps(leadingOne, _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff)))); +++ // Now to extract mantissa +++ frac = _mm256_or_ps( +++ leadingOne, +++ _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff)))); ++ ++ #if LOG_POLY_DEGREE == 6 ++- mantissa = POLY5_FMAAVX2( frac, 3.1157899f, -3.3241990f, 2.5988452f, -1.2315303f, 3.1821337e-1f, -3.4436006e-2f); +++ mantissa = POLY5_FMAAVX2(frac, +++ 3.1157899f, +++ -3.3241990f, +++ 2.5988452f, +++ -1.2315303f, +++ 3.1821337e-1f, +++ -3.4436006e-2f); ++ #elif LOG_POLY_DEGREE == 5 ++- mantissa = POLY4_FMAAVX2( frac, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f); +++ mantissa = POLY4_FMAAVX2(frac, +++ 2.8882704548164776201f, +++ -2.52074962577807006663f, +++ 1.48116647521213171641f, +++ -0.465725644288844778798f, +++ 0.0596515482674574969533f); ++ #elif LOG_POLY_DEGREE == 4 ++- mantissa = POLY3_FMAAVX2( frac, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f); +++ mantissa = POLY3_FMAAVX2(frac, +++ 2.61761038894603480148f, +++ -1.75647175389045657003f, +++ 0.688243882994381274313f, +++ -0.107254423828329604454f); ++ #elif LOG_POLY_DEGREE == 3 ++- mantissa = POLY2_FMAAVX2( frac, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f); +++ mantissa = POLY2_FMAAVX2(frac, +++ 2.28330284476918490682f, +++ -1.04913055217340124191f, +++ 0.204446009836232697516f); ++ #else ++ #error ++ #endif ++ ++- bVal = _mm256_fmadd_ps(mantissa, _mm256_sub_ps(frac, leadingOne), bVal); ++- _mm256_storeu_ps(bPtr, bVal); +++ bVal = _mm256_fmadd_ps(mantissa, _mm256_sub_ps(frac, leadingOne), bVal); +++ _mm256_storeu_ps(bPtr, bVal); ++ ++- aPtr += 8; ++- bPtr += 8; ++- } +++ aPtr += 8; +++ bPtr += 8; +++ } ++ ++- number = eighthPoints * 8; ++- volk_32f_log2_32f_u_generic(bPtr, aPtr, num_points-number); +++ number = eighthPoints * 8; +++ volk_32f_log2_32f_u_generic(bPtr, aPtr, num_points - number); ++ } ++ ++ #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for unaligned */ ++@@ -531,56 +672,86 @@ volk_32f_log2_32f_u_avx2_fma(float* bVector, const float* aVector, unsigned int ++ #include ++ ++ #define POLY0_AVX2(x, c0) _mm256_set1_ps(c0) ++-#define POLY1_AVX2(x, c0, c1) _mm256_add_ps(_mm256_mul_ps(POLY0_AVX2(x, c1), x), _mm256_set1_ps(c0)) ++-#define POLY2_AVX2(x, c0, c1, c2) _mm256_add_ps(_mm256_mul_ps(POLY1_AVX2(x, c1, c2), x), _mm256_set1_ps(c0)) ++-#define POLY3_AVX2(x, c0, c1, c2, c3) _mm256_add_ps(_mm256_mul_ps(POLY2_AVX2(x, c1, c2, c3), x), _mm256_set1_ps(c0)) ++-#define POLY4_AVX2(x, c0, c1, c2, c3, c4) _mm256_add_ps(_mm256_mul_ps(POLY3_AVX2(x, c1, c2, c3, c4), x), _mm256_set1_ps(c0)) ++-#define POLY5_AVX2(x, c0, c1, c2, c3, 
c4, c5) _mm256_add_ps(_mm256_mul_ps(POLY4_AVX2(x, c1, c2, c3, c4, c5), x), _mm256_set1_ps(c0)) +++#define POLY1_AVX2(x, c0, c1) \ +++ _mm256_add_ps(_mm256_mul_ps(POLY0_AVX2(x, c1), x), _mm256_set1_ps(c0)) +++#define POLY2_AVX2(x, c0, c1, c2) \ +++ _mm256_add_ps(_mm256_mul_ps(POLY1_AVX2(x, c1, c2), x), _mm256_set1_ps(c0)) +++#define POLY3_AVX2(x, c0, c1, c2, c3) \ +++ _mm256_add_ps(_mm256_mul_ps(POLY2_AVX2(x, c1, c2, c3), x), _mm256_set1_ps(c0)) +++#define POLY4_AVX2(x, c0, c1, c2, c3, c4) \ +++ _mm256_add_ps(_mm256_mul_ps(POLY3_AVX2(x, c1, c2, c3, c4), x), _mm256_set1_ps(c0)) +++#define POLY5_AVX2(x, c0, c1, c2, c3, c4, c5) \ +++ _mm256_add_ps(_mm256_mul_ps(POLY4_AVX2(x, c1, c2, c3, c4, c5), x), _mm256_set1_ps(c0)) ++ ++ static inline void ++ volk_32f_log2_32f_u_avx2(float* bVector, const float* aVector, unsigned int num_points) ++ { ++- float* bPtr = bVector; ++- const float* aPtr = aVector; +++ float* bPtr = bVector; +++ const float* aPtr = aVector; ++ ++- unsigned int number = 0; ++- const unsigned int eighthPoints = num_points / 8; +++ unsigned int number = 0; +++ const unsigned int eighthPoints = num_points / 8; ++ ++- __m256 aVal, bVal, mantissa, frac, leadingOne; ++- __m256i bias, exp; +++ __m256 aVal, bVal, mantissa, frac, leadingOne; +++ __m256i bias, exp; ++ ++- for(;number < eighthPoints; number++){ +++ for (; number < eighthPoints; number++) { ++ ++- aVal = _mm256_loadu_ps(aPtr); ++- bias = _mm256_set1_epi32(127); ++- leadingOne = _mm256_set1_ps(1.0f); ++- exp = _mm256_sub_epi32(_mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal), _mm256_set1_epi32(0x7f800000)), 23), bias); ++- bVal = _mm256_cvtepi32_ps(exp); +++ aVal = _mm256_loadu_ps(aPtr); +++ bias = _mm256_set1_epi32(127); +++ leadingOne = _mm256_set1_ps(1.0f); +++ exp = _mm256_sub_epi32( +++ _mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal), +++ _mm256_set1_epi32(0x7f800000)), +++ 23), +++ bias); +++ bVal = _mm256_cvtepi32_ps(exp); ++ ++- // Now to extract mantissa ++- frac = _mm256_or_ps(leadingOne, _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff)))); +++ // Now to extract mantissa +++ frac = _mm256_or_ps( +++ leadingOne, +++ _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff)))); ++ ++ #if LOG_POLY_DEGREE == 6 ++- mantissa = POLY5_AVX2( frac, 3.1157899f, -3.3241990f, 2.5988452f, -1.2315303f, 3.1821337e-1f, -3.4436006e-2f); +++ mantissa = POLY5_AVX2(frac, +++ 3.1157899f, +++ -3.3241990f, +++ 2.5988452f, +++ -1.2315303f, +++ 3.1821337e-1f, +++ -3.4436006e-2f); ++ #elif LOG_POLY_DEGREE == 5 ++- mantissa = POLY4_AVX2( frac, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f); +++ mantissa = POLY4_AVX2(frac, +++ 2.8882704548164776201f, +++ -2.52074962577807006663f, +++ 1.48116647521213171641f, +++ -0.465725644288844778798f, +++ 0.0596515482674574969533f); ++ #elif LOG_POLY_DEGREE == 4 ++- mantissa = POLY3_AVX2( frac, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f); +++ mantissa = POLY3_AVX2(frac, +++ 2.61761038894603480148f, +++ -1.75647175389045657003f, +++ 0.688243882994381274313f, +++ -0.107254423828329604454f); ++ #elif LOG_POLY_DEGREE == 3 ++- mantissa = POLY2_AVX2( frac, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f); +++ mantissa = POLY2_AVX2(frac, +++ 2.28330284476918490682f, +++ -1.04913055217340124191f, +++ 0.204446009836232697516f); ++ #else ++ #error ++ #endif ++ ++- bVal = 
_mm256_add_ps(_mm256_mul_ps(mantissa, _mm256_sub_ps(frac, leadingOne)), bVal); ++- _mm256_storeu_ps(bPtr, bVal); +++ bVal = +++ _mm256_add_ps(_mm256_mul_ps(mantissa, _mm256_sub_ps(frac, leadingOne)), bVal); +++ _mm256_storeu_ps(bPtr, bVal); ++ ++- aPtr += 8; ++- bPtr += 8; ++- } +++ aPtr += 8; +++ bPtr += 8; +++ } ++ ++- number = eighthPoints * 8; ++- volk_32f_log2_32f_u_generic(bPtr, aPtr, num_points-number); +++ number = eighthPoints * 8; +++ volk_32f_log2_32f_u_generic(bPtr, aPtr, num_points - number); ++ } ++ ++ #endif /* LV_HAVE_AVX2 for unaligned */ ++diff --git a/kernels/volk/volk_32f_null_32f.h b/kernels/volk/volk_32f_null_32f.h ++index 95e8d1a..cbed229 100644 ++--- a/kernels/volk/volk_32f_null_32f.h +++++ b/kernels/volk/volk_32f_null_32f.h ++@@ -20,9 +20,9 @@ ++ * Boston, MA 02110-1301, USA. ++ */ ++ ++-#include ++-#include ++ #include +++#include +++#include ++ ++ #ifndef INCLUDED_volk_32f_null_32f_a_H ++ #define INCLUDED_volk_32f_null_32f_a_H ++@@ -32,13 +32,13 @@ ++ static inline void ++ volk_32f_null_32f_generic(float* bVector, const float* aVector, unsigned int num_points) ++ { ++- float* bPtr = bVector; ++- const float* aPtr = aVector; ++- unsigned int number; +++ float* bPtr = bVector; +++ const float* aPtr = aVector; +++ unsigned int number; ++ ++- for(number = 0; number < num_points; number++){ ++- *bPtr++ = *aPtr++; ++- } +++ for (number = 0; number < num_points; number++) { +++ *bPtr++ = *aPtr++; +++ } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++diff --git a/kernels/volk/volk_32f_s32f_32f_fm_detect_32f.h b/kernels/volk/volk_32f_s32f_32f_fm_detect_32f.h ++index 9879959..3bf7aea 100644 ++--- a/kernels/volk/volk_32f_s32f_32f_fm_detect_32f.h +++++ b/kernels/volk/volk_32f_s32f_32f_fm_detect_32f.h ++@@ -30,14 +30,15 @@ ++ * ++ * Dispatcher Prototype ++ * \code ++- * void volk_32f_s32f_32f_fm_detect_32f(float* outputVector, const float* inputVector, const float bound, float* saveValue, unsigned int num_points) ++- * \endcode +++ * void volk_32f_s32f_32f_fm_detect_32f(float* outputVector, const float* inputVector, +++ * const float bound, float* saveValue, unsigned int num_points) \endcode ++ * ++ * \b Inputs ++- * \li inputVector: The input vector containing phase data (must be on the interval (-bound, bound]). ++- * \li bound: The interval that the input phase data is in, which is used to modulo the differentiation. ++- * \li saveValue: A pointer to a float which contains the phase value of the sample before the first input sample. ++- * \li num_points The number of data points. +++ * \li inputVector: The input vector containing phase data (must be on the interval +++ * (-bound, bound]). \li bound: The interval that the input phase data is in, which is +++ * used to modulo the differentiation. \li saveValue: A pointer to a float which contains +++ * the phase value of the sample before the first input sample. \li num_points The number +++ * of data points. ++ * ++ * \b Outputs ++ * \li outputVector: The vector where the results will be stored. ++@@ -62,67 +63,79 @@ ++ #ifdef LV_HAVE_AVX ++ #include ++ ++-static inline void volk_32f_s32f_32f_fm_detect_32f_a_avx(float* outputVector, const float* inputVector, const float bound, float* saveValue, unsigned int num_points){ ++- if (num_points < 1) { ++- return; ++- } ++- unsigned int number = 1; ++- unsigned int j = 0; ++- // num_points-1 keeps Fedora 7's gcc from crashing... ++- // num_points won't work. 
:( ++- const unsigned int eighthPoints = (num_points-1) / 8; ++- ++- float* outPtr = outputVector; ++- const float* inPtr = inputVector; ++- __m256 upperBound = _mm256_set1_ps(bound); ++- __m256 lowerBound = _mm256_set1_ps(-bound); ++- __m256 next3old1; ++- __m256 next4; ++- __m256 boundAdjust; ++- __m256 posBoundAdjust = _mm256_set1_ps(-2*bound); // Subtract when we're above. ++- __m256 negBoundAdjust = _mm256_set1_ps(2*bound); // Add when we're below. ++- // Do the first 8 by hand since we're going in from the saveValue: ++- *outPtr = *inPtr - *saveValue; ++- if (*outPtr > bound) *outPtr -= 2*bound; ++- if (*outPtr < -bound) *outPtr += 2*bound; ++- inPtr++; ++- outPtr++; ++- for (j = 1; j < ( (8 < num_points) ? 8 : num_points); j++) { ++- *outPtr = *(inPtr) - *(inPtr-1); ++- if (*outPtr > bound) *outPtr -= 2*bound; ++- if (*outPtr < -bound) *outPtr += 2*bound; ++- inPtr++; ++- outPtr++; ++- } ++- ++- for (; number < eighthPoints; number++) { ++- // Load data ++- next3old1 = _mm256_loadu_ps((float*) (inPtr-1)); ++- next4 = _mm256_load_ps(inPtr); ++- inPtr += 8; ++- // Subtract and store: ++- next3old1 = _mm256_sub_ps(next4, next3old1); ++- // Bound: ++- boundAdjust = _mm256_cmp_ps(next3old1, upperBound, _CMP_GT_OS); ++- boundAdjust = _mm256_and_ps(boundAdjust, posBoundAdjust); ++- next4 = _mm256_cmp_ps(next3old1, lowerBound, _CMP_LT_OS); ++- next4 = _mm256_and_ps(next4, negBoundAdjust); ++- boundAdjust = _mm256_or_ps(next4, boundAdjust); ++- // Make sure we're in the bounding interval: ++- next3old1 = _mm256_add_ps(next3old1, boundAdjust); ++- _mm256_store_ps(outPtr,next3old1); // Store the results back into the output ++- outPtr += 8; ++- } ++- ++- for (number = (8 > (eighthPoints*8) ? 8 : (8 * eighthPoints)); number < num_points; number++) { ++- *outPtr = *(inPtr) - *(inPtr-1); ++- if (*outPtr > bound) *outPtr -= 2*bound; ++- if (*outPtr < -bound) *outPtr += 2*bound; +++static inline void volk_32f_s32f_32f_fm_detect_32f_a_avx(float* outputVector, +++ const float* inputVector, +++ const float bound, +++ float* saveValue, +++ unsigned int num_points) +++{ +++ if (num_points < 1) { +++ return; +++ } +++ unsigned int number = 1; +++ unsigned int j = 0; +++ // num_points-1 keeps Fedora 7's gcc from crashing... +++ // num_points won't work. :( +++ const unsigned int eighthPoints = (num_points - 1) / 8; +++ +++ float* outPtr = outputVector; +++ const float* inPtr = inputVector; +++ __m256 upperBound = _mm256_set1_ps(bound); +++ __m256 lowerBound = _mm256_set1_ps(-bound); +++ __m256 next3old1; +++ __m256 next4; +++ __m256 boundAdjust; +++ __m256 posBoundAdjust = _mm256_set1_ps(-2 * bound); // Subtract when we're above. +++ __m256 negBoundAdjust = _mm256_set1_ps(2 * bound); // Add when we're below. +++ // Do the first 8 by hand since we're going in from the saveValue: +++ *outPtr = *inPtr - *saveValue; +++ if (*outPtr > bound) +++ *outPtr -= 2 * bound; +++ if (*outPtr < -bound) +++ *outPtr += 2 * bound; ++ inPtr++; ++ outPtr++; ++- } ++- ++- *saveValue = inputVector[num_points-1]; +++ for (j = 1; j < ((8 < num_points) ? 
8 : num_points); j++) { +++ *outPtr = *(inPtr) - *(inPtr - 1); +++ if (*outPtr > bound) +++ *outPtr -= 2 * bound; +++ if (*outPtr < -bound) +++ *outPtr += 2 * bound; +++ inPtr++; +++ outPtr++; +++ } +++ +++ for (; number < eighthPoints; number++) { +++ // Load data +++ next3old1 = _mm256_loadu_ps((float*)(inPtr - 1)); +++ next4 = _mm256_load_ps(inPtr); +++ inPtr += 8; +++ // Subtract and store: +++ next3old1 = _mm256_sub_ps(next4, next3old1); +++ // Bound: +++ boundAdjust = _mm256_cmp_ps(next3old1, upperBound, _CMP_GT_OS); +++ boundAdjust = _mm256_and_ps(boundAdjust, posBoundAdjust); +++ next4 = _mm256_cmp_ps(next3old1, lowerBound, _CMP_LT_OS); +++ next4 = _mm256_and_ps(next4, negBoundAdjust); +++ boundAdjust = _mm256_or_ps(next4, boundAdjust); +++ // Make sure we're in the bounding interval: +++ next3old1 = _mm256_add_ps(next3old1, boundAdjust); +++ _mm256_store_ps(outPtr, next3old1); // Store the results back into the output +++ outPtr += 8; +++ } +++ +++ for (number = (8 > (eighthPoints * 8) ? 8 : (8 * eighthPoints)); number < num_points; +++ number++) { +++ *outPtr = *(inPtr) - *(inPtr - 1); +++ if (*outPtr > bound) +++ *outPtr -= 2 * bound; +++ if (*outPtr < -bound) +++ *outPtr += 2 * bound; +++ inPtr++; +++ outPtr++; +++ } +++ +++ *saveValue = inputVector[num_points - 1]; ++ } ++ #endif /* LV_HAVE_AVX */ ++ ++@@ -130,102 +143,122 @@ static inline void volk_32f_s32f_32f_fm_detect_32f_a_avx(float* outputVector, co ++ #ifdef LV_HAVE_SSE ++ #include ++ ++-static inline void volk_32f_s32f_32f_fm_detect_32f_a_sse(float* outputVector, const float* inputVector, const float bound, float* saveValue, unsigned int num_points){ ++- if (num_points < 1) { ++- return; ++- } ++- unsigned int number = 1; ++- unsigned int j = 0; ++- // num_points-1 keeps Fedora 7's gcc from crashing... ++- // num_points won't work. :( ++- const unsigned int quarterPoints = (num_points-1) / 4; ++- ++- float* outPtr = outputVector; ++- const float* inPtr = inputVector; ++- __m128 upperBound = _mm_set_ps1(bound); ++- __m128 lowerBound = _mm_set_ps1(-bound); ++- __m128 next3old1; ++- __m128 next4; ++- __m128 boundAdjust; ++- __m128 posBoundAdjust = _mm_set_ps1(-2*bound); // Subtract when we're above. ++- __m128 negBoundAdjust = _mm_set_ps1(2*bound); // Add when we're below. ++- // Do the first 4 by hand since we're going in from the saveValue: ++- *outPtr = *inPtr - *saveValue; ++- if (*outPtr > bound) *outPtr -= 2*bound; ++- if (*outPtr < -bound) *outPtr += 2*bound; ++- inPtr++; ++- outPtr++; ++- for (j = 1; j < ( (4 < num_points) ? 4 : num_points); j++) { ++- *outPtr = *(inPtr) - *(inPtr-1); ++- if (*outPtr > bound) *outPtr -= 2*bound; ++- if (*outPtr < -bound) *outPtr += 2*bound; ++- inPtr++; ++- outPtr++; ++- } ++- ++- for (; number < quarterPoints; number++) { ++- // Load data ++- next3old1 = _mm_loadu_ps((float*) (inPtr-1)); ++- next4 = _mm_load_ps(inPtr); ++- inPtr += 4; ++- // Subtract and store: ++- next3old1 = _mm_sub_ps(next4, next3old1); ++- // Bound: ++- boundAdjust = _mm_cmpgt_ps(next3old1, upperBound); ++- boundAdjust = _mm_and_ps(boundAdjust, posBoundAdjust); ++- next4 = _mm_cmplt_ps(next3old1, lowerBound); ++- next4 = _mm_and_ps(next4, negBoundAdjust); ++- boundAdjust = _mm_or_ps(next4, boundAdjust); ++- // Make sure we're in the bounding interval: ++- next3old1 = _mm_add_ps(next3old1, boundAdjust); ++- _mm_store_ps(outPtr,next3old1); // Store the results back into the output ++- outPtr += 4; ++- } ++- ++- for (number = (4 > (quarterPoints*4) ? 
4 : (4 * quarterPoints)); number < num_points; number++) { ++- *outPtr = *(inPtr) - *(inPtr-1); ++- if (*outPtr > bound) *outPtr -= 2*bound; ++- if (*outPtr < -bound) *outPtr += 2*bound; +++static inline void volk_32f_s32f_32f_fm_detect_32f_a_sse(float* outputVector, +++ const float* inputVector, +++ const float bound, +++ float* saveValue, +++ unsigned int num_points) +++{ +++ if (num_points < 1) { +++ return; +++ } +++ unsigned int number = 1; +++ unsigned int j = 0; +++ // num_points-1 keeps Fedora 7's gcc from crashing... +++ // num_points won't work. :( +++ const unsigned int quarterPoints = (num_points - 1) / 4; +++ +++ float* outPtr = outputVector; +++ const float* inPtr = inputVector; +++ __m128 upperBound = _mm_set_ps1(bound); +++ __m128 lowerBound = _mm_set_ps1(-bound); +++ __m128 next3old1; +++ __m128 next4; +++ __m128 boundAdjust; +++ __m128 posBoundAdjust = _mm_set_ps1(-2 * bound); // Subtract when we're above. +++ __m128 negBoundAdjust = _mm_set_ps1(2 * bound); // Add when we're below. +++ // Do the first 4 by hand since we're going in from the saveValue: +++ *outPtr = *inPtr - *saveValue; +++ if (*outPtr > bound) +++ *outPtr -= 2 * bound; +++ if (*outPtr < -bound) +++ *outPtr += 2 * bound; ++ inPtr++; ++ outPtr++; ++- } ++- ++- *saveValue = inputVector[num_points-1]; +++ for (j = 1; j < ((4 < num_points) ? 4 : num_points); j++) { +++ *outPtr = *(inPtr) - *(inPtr - 1); +++ if (*outPtr > bound) +++ *outPtr -= 2 * bound; +++ if (*outPtr < -bound) +++ *outPtr += 2 * bound; +++ inPtr++; +++ outPtr++; +++ } +++ +++ for (; number < quarterPoints; number++) { +++ // Load data +++ next3old1 = _mm_loadu_ps((float*)(inPtr - 1)); +++ next4 = _mm_load_ps(inPtr); +++ inPtr += 4; +++ // Subtract and store: +++ next3old1 = _mm_sub_ps(next4, next3old1); +++ // Bound: +++ boundAdjust = _mm_cmpgt_ps(next3old1, upperBound); +++ boundAdjust = _mm_and_ps(boundAdjust, posBoundAdjust); +++ next4 = _mm_cmplt_ps(next3old1, lowerBound); +++ next4 = _mm_and_ps(next4, negBoundAdjust); +++ boundAdjust = _mm_or_ps(next4, boundAdjust); +++ // Make sure we're in the bounding interval: +++ next3old1 = _mm_add_ps(next3old1, boundAdjust); +++ _mm_store_ps(outPtr, next3old1); // Store the results back into the output +++ outPtr += 4; +++ } +++ +++ for (number = (4 > (quarterPoints * 4) ? 
4 : (4 * quarterPoints)); +++ number < num_points; +++ number++) { +++ *outPtr = *(inPtr) - *(inPtr - 1); +++ if (*outPtr > bound) +++ *outPtr -= 2 * bound; +++ if (*outPtr < -bound) +++ *outPtr += 2 * bound; +++ inPtr++; +++ outPtr++; +++ } +++ +++ *saveValue = inputVector[num_points - 1]; ++ } ++ #endif /* LV_HAVE_SSE */ ++ ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void volk_32f_s32f_32f_fm_detect_32f_generic(float* outputVector, const float* inputVector, const float bound, float* saveValue, unsigned int num_points){ ++- if (num_points < 1) { ++- return; ++- } ++- unsigned int number = 0; ++- float* outPtr = outputVector; ++- const float* inPtr = inputVector; ++- ++- // Do the first 1 by hand since we're going in from the saveValue: ++- *outPtr = *inPtr - *saveValue; ++- if (*outPtr > bound) *outPtr -= 2*bound; ++- if (*outPtr < -bound) *outPtr += 2*bound; ++- inPtr++; ++- outPtr++; ++- ++- for (number = 1; number < num_points; number++) { ++- *outPtr = *(inPtr) - *(inPtr-1); ++- if (*outPtr > bound) *outPtr -= 2*bound; ++- if (*outPtr < -bound) *outPtr += 2*bound; +++static inline void volk_32f_s32f_32f_fm_detect_32f_generic(float* outputVector, +++ const float* inputVector, +++ const float bound, +++ float* saveValue, +++ unsigned int num_points) +++{ +++ if (num_points < 1) { +++ return; +++ } +++ unsigned int number = 0; +++ float* outPtr = outputVector; +++ const float* inPtr = inputVector; +++ +++ // Do the first 1 by hand since we're going in from the saveValue: +++ *outPtr = *inPtr - *saveValue; +++ if (*outPtr > bound) +++ *outPtr -= 2 * bound; +++ if (*outPtr < -bound) +++ *outPtr += 2 * bound; ++ inPtr++; ++ outPtr++; ++- } ++ ++- *saveValue = inputVector[num_points-1]; +++ for (number = 1; number < num_points; number++) { +++ *outPtr = *(inPtr) - *(inPtr - 1); +++ if (*outPtr > bound) +++ *outPtr -= 2 * bound; +++ if (*outPtr < -bound) +++ *outPtr += 2 * bound; +++ inPtr++; +++ outPtr++; +++ } +++ +++ *saveValue = inputVector[num_points - 1]; ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++ ++- ++- ++ #endif /* INCLUDED_volk_32f_s32f_32f_fm_detect_32f_a_H */ ++ ++ ++@@ -238,67 +271,79 @@ static inline void volk_32f_s32f_32f_fm_detect_32f_generic(float* outputVector, ++ #ifdef LV_HAVE_AVX ++ #include ++ ++-static inline void volk_32f_s32f_32f_fm_detect_32f_u_avx(float* outputVector, const float* inputVector, const float bound, float* saveValue, unsigned int num_points){ ++- if (num_points < 1) { ++- return; ++- } ++- unsigned int number = 1; ++- unsigned int j = 0; ++- // num_points-1 keeps Fedora 7's gcc from crashing... ++- // num_points won't work. :( ++- const unsigned int eighthPoints = (num_points-1) / 8; ++- ++- float* outPtr = outputVector; ++- const float* inPtr = inputVector; ++- __m256 upperBound = _mm256_set1_ps(bound); ++- __m256 lowerBound = _mm256_set1_ps(-bound); ++- __m256 next3old1; ++- __m256 next4; ++- __m256 boundAdjust; ++- __m256 posBoundAdjust = _mm256_set1_ps(-2*bound); // Subtract when we're above. ++- __m256 negBoundAdjust = _mm256_set1_ps(2*bound); // Add when we're below. ++- // Do the first 8 by hand since we're going in from the saveValue: ++- *outPtr = *inPtr - *saveValue; ++- if (*outPtr > bound) *outPtr -= 2*bound; ++- if (*outPtr < -bound) *outPtr += 2*bound; ++- inPtr++; ++- outPtr++; ++- for (j = 1; j < ( (8 < num_points) ? 
8 : num_points); j++) { ++- *outPtr = *(inPtr) - *(inPtr-1); ++- if (*outPtr > bound) *outPtr -= 2*bound; ++- if (*outPtr < -bound) *outPtr += 2*bound; +++static inline void volk_32f_s32f_32f_fm_detect_32f_u_avx(float* outputVector, +++ const float* inputVector, +++ const float bound, +++ float* saveValue, +++ unsigned int num_points) +++{ +++ if (num_points < 1) { +++ return; +++ } +++ unsigned int number = 1; +++ unsigned int j = 0; +++ // num_points-1 keeps Fedora 7's gcc from crashing... +++ // num_points won't work. :( +++ const unsigned int eighthPoints = (num_points - 1) / 8; +++ +++ float* outPtr = outputVector; +++ const float* inPtr = inputVector; +++ __m256 upperBound = _mm256_set1_ps(bound); +++ __m256 lowerBound = _mm256_set1_ps(-bound); +++ __m256 next3old1; +++ __m256 next4; +++ __m256 boundAdjust; +++ __m256 posBoundAdjust = _mm256_set1_ps(-2 * bound); // Subtract when we're above. +++ __m256 negBoundAdjust = _mm256_set1_ps(2 * bound); // Add when we're below. +++ // Do the first 8 by hand since we're going in from the saveValue: +++ *outPtr = *inPtr - *saveValue; +++ if (*outPtr > bound) +++ *outPtr -= 2 * bound; +++ if (*outPtr < -bound) +++ *outPtr += 2 * bound; ++ inPtr++; ++ outPtr++; ++- } ++- ++- for (; number < eighthPoints; number++) { ++- // Load data ++- next3old1 = _mm256_loadu_ps((float*) (inPtr-1)); ++- next4 = _mm256_loadu_ps(inPtr); ++- inPtr += 8; ++- // Subtract and store: ++- next3old1 = _mm256_sub_ps(next4, next3old1); ++- // Bound: ++- boundAdjust = _mm256_cmp_ps(next3old1, upperBound, _CMP_GT_OS); ++- boundAdjust = _mm256_and_ps(boundAdjust, posBoundAdjust); ++- next4 = _mm256_cmp_ps(next3old1, lowerBound, _CMP_LT_OS); ++- next4 = _mm256_and_ps(next4, negBoundAdjust); ++- boundAdjust = _mm256_or_ps(next4, boundAdjust); ++- // Make sure we're in the bounding interval: ++- next3old1 = _mm256_add_ps(next3old1, boundAdjust); ++- _mm256_storeu_ps(outPtr,next3old1); // Store the results back into the output ++- outPtr += 8; ++- } ++- ++- for (number = (8 > (eighthPoints*8) ? 8 : (8 * eighthPoints)); number < num_points; number++) { ++- *outPtr = *(inPtr) - *(inPtr-1); ++- if (*outPtr > bound) *outPtr -= 2*bound; ++- if (*outPtr < -bound) *outPtr += 2*bound; ++- inPtr++; ++- outPtr++; ++- } ++- ++- *saveValue = inputVector[num_points-1]; +++ for (j = 1; j < ((8 < num_points) ? 8 : num_points); j++) { +++ *outPtr = *(inPtr) - *(inPtr - 1); +++ if (*outPtr > bound) +++ *outPtr -= 2 * bound; +++ if (*outPtr < -bound) +++ *outPtr += 2 * bound; +++ inPtr++; +++ outPtr++; +++ } +++ +++ for (; number < eighthPoints; number++) { +++ // Load data +++ next3old1 = _mm256_loadu_ps((float*)(inPtr - 1)); +++ next4 = _mm256_loadu_ps(inPtr); +++ inPtr += 8; +++ // Subtract and store: +++ next3old1 = _mm256_sub_ps(next4, next3old1); +++ // Bound: +++ boundAdjust = _mm256_cmp_ps(next3old1, upperBound, _CMP_GT_OS); +++ boundAdjust = _mm256_and_ps(boundAdjust, posBoundAdjust); +++ next4 = _mm256_cmp_ps(next3old1, lowerBound, _CMP_LT_OS); +++ next4 = _mm256_and_ps(next4, negBoundAdjust); +++ boundAdjust = _mm256_or_ps(next4, boundAdjust); +++ // Make sure we're in the bounding interval: +++ next3old1 = _mm256_add_ps(next3old1, boundAdjust); +++ _mm256_storeu_ps(outPtr, next3old1); // Store the results back into the output +++ outPtr += 8; +++ } +++ +++ for (number = (8 > (eighthPoints * 8) ? 
8 : (8 * eighthPoints)); number < num_points; +++ number++) { +++ *outPtr = *(inPtr) - *(inPtr - 1); +++ if (*outPtr > bound) +++ *outPtr -= 2 * bound; +++ if (*outPtr < -bound) +++ *outPtr += 2 * bound; +++ inPtr++; +++ outPtr++; +++ } +++ +++ *saveValue = inputVector[num_points - 1]; ++ } ++ #endif /* LV_HAVE_AVX */ ++ ++diff --git a/kernels/volk/volk_32f_s32f_calc_spectral_noise_floor_32f.h b/kernels/volk/volk_32f_s32f_calc_spectral_noise_floor_32f.h ++index ae371a2..e7e581f 100644 ++--- a/kernels/volk/volk_32f_s32f_calc_spectral_noise_floor_32f.h +++++ b/kernels/volk/volk_32f_s32f_calc_spectral_noise_floor_32f.h ++@@ -35,13 +35,15 @@ ++ * ++ * Dispatcher Prototype ++ * \code ++- * void volk_32f_s32f_calc_spectral_noise_floor_32f(float* noiseFloorAmplitude, const float* realDataPoints, const float spectralExclusionValue, const unsigned int num_points) ++- * \endcode +++ * void volk_32f_s32f_calc_spectral_noise_floor_32f(float* noiseFloorAmplitude, const +++ * float* realDataPoints, const float spectralExclusionValue, const unsigned int +++ * num_points) \endcode ++ * ++ * \b Inputs ++ * \li realDataPoints: The input power spectrum. ++- * \li spectralExclusionValue: The number of dB above the noise floor that a data point must be to be excluded from the noise floor calculation - default value is 20. ++- * \li num_points: The number of data points. +++ * \li spectralExclusionValue: The number of dB above the noise floor that a data point +++ * must be to be excluded from the noise floor calculation - default value is 20. \li +++ * num_points: The number of data points. ++ * ++ * \b Outputs ++ * \li noiseFloorAmplitude: The noise floor of the input spectrum, in dB. ++@@ -59,9 +61,9 @@ ++ #ifndef INCLUDED_volk_32f_s32f_calc_spectral_noise_floor_32f_a_H ++ #define INCLUDED_volk_32f_s32f_calc_spectral_noise_floor_32f_a_H ++ ++-#include ++ #include ++ #include +++#include ++ ++ #ifdef LV_HAVE_AVX ++ #include ++@@ -72,114 +74,117 @@ volk_32f_s32f_calc_spectral_noise_floor_32f_a_avx(float* noiseFloorAmplitude, ++ const float spectralExclusionValue, ++ const unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int eighthPoints = num_points / 8; ++- ++- const float* dataPointsPtr = realDataPoints; ++- __VOLK_ATTR_ALIGNED(32) float avgPointsVector[8]; ++- ++- __m256 dataPointsVal; ++- __m256 avgPointsVal = _mm256_setzero_ps(); ++- // Calculate the sum (for mean) for all points ++- for(; number < eighthPoints; number++){ ++- ++- dataPointsVal = _mm256_load_ps(dataPointsPtr); ++- ++- dataPointsPtr += 8; ++- ++- avgPointsVal = _mm256_add_ps(avgPointsVal, dataPointsVal); ++- } ++- ++- _mm256_store_ps(avgPointsVector, avgPointsVal); ++- ++- float sumMean = 0.0; ++- sumMean += avgPointsVector[0]; ++- sumMean += avgPointsVector[1]; ++- sumMean += avgPointsVector[2]; ++- sumMean += avgPointsVector[3]; ++- sumMean += avgPointsVector[4]; ++- sumMean += avgPointsVector[5]; ++- sumMean += avgPointsVector[6]; ++- sumMean += avgPointsVector[7]; ++- ++- number = eighthPoints * 8; ++- for(;number < num_points; number++){ ++- sumMean += realDataPoints[number]; ++- } ++- ++- // calculate the spectral mean ++- // +20 because for the comparison below we only want to throw out bins ++- // that are significantly higher (and would, thus, affect the mean more ++- const float meanAmplitude = (sumMean / ((float)num_points)) + spectralExclusionValue; ++- ++- dataPointsPtr = realDataPoints; // Reset the dataPointsPtr ++- __m256 vMeanAmplitudeVector = _mm256_set1_ps(meanAmplitude); ++- __m256 
vOnesVector = _mm256_set1_ps(1.0); ++- __m256 vValidBinCount = _mm256_setzero_ps(); ++- avgPointsVal = _mm256_setzero_ps(); ++- __m256 compareMask; ++- number = 0; ++- // Calculate the sum (for mean) for any points which do NOT exceed the mean amplitude ++- for(; number < eighthPoints; number++){ ++- ++- dataPointsVal = _mm256_load_ps(dataPointsPtr); ++- ++- dataPointsPtr += 8; ++- ++- // Identify which items do not exceed the mean amplitude ++- compareMask = _mm256_cmp_ps(dataPointsVal, vMeanAmplitudeVector, _CMP_LE_OQ); ++- ++- // Mask off the items that exceed the mean amplitude and add the avg Points that do not exceed the mean amplitude ++- avgPointsVal = _mm256_add_ps(avgPointsVal, _mm256_and_ps(compareMask, dataPointsVal)); ++- ++- // Count the number of bins which do not exceed the mean amplitude ++- vValidBinCount = _mm256_add_ps(vValidBinCount, _mm256_and_ps(compareMask, vOnesVector)); ++- } ++- ++- // Calculate the mean from the remaining data points ++- _mm256_store_ps(avgPointsVector, avgPointsVal); ++- ++- sumMean = 0.0; ++- sumMean += avgPointsVector[0]; ++- sumMean += avgPointsVector[1]; ++- sumMean += avgPointsVector[2]; ++- sumMean += avgPointsVector[3]; ++- sumMean += avgPointsVector[4]; ++- sumMean += avgPointsVector[5]; ++- sumMean += avgPointsVector[6]; ++- sumMean += avgPointsVector[7]; ++- ++- // Calculate the number of valid bins from the remaining count ++- __VOLK_ATTR_ALIGNED(32) float validBinCountVector[8]; ++- _mm256_store_ps(validBinCountVector, vValidBinCount); ++- ++- float validBinCount = 0; ++- validBinCount += validBinCountVector[0]; ++- validBinCount += validBinCountVector[1]; ++- validBinCount += validBinCountVector[2]; ++- validBinCount += validBinCountVector[3]; ++- validBinCount += validBinCountVector[4]; ++- validBinCount += validBinCountVector[5]; ++- validBinCount += validBinCountVector[6]; ++- validBinCount += validBinCountVector[7]; ++- ++- number = eighthPoints * 8; ++- for(;number < num_points; number++){ ++- if(realDataPoints[number] <= meanAmplitude){ ++- sumMean += realDataPoints[number]; ++- validBinCount += 1.0; +++ unsigned int number = 0; +++ const unsigned int eighthPoints = num_points / 8; +++ +++ const float* dataPointsPtr = realDataPoints; +++ __VOLK_ATTR_ALIGNED(32) float avgPointsVector[8]; +++ +++ __m256 dataPointsVal; +++ __m256 avgPointsVal = _mm256_setzero_ps(); +++ // Calculate the sum (for mean) for all points +++ for (; number < eighthPoints; number++) { +++ +++ dataPointsVal = _mm256_load_ps(dataPointsPtr); +++ +++ dataPointsPtr += 8; +++ +++ avgPointsVal = _mm256_add_ps(avgPointsVal, dataPointsVal); ++ } ++- } ++ ++- float localNoiseFloorAmplitude = 0; ++- if(validBinCount > 0.0){ ++- localNoiseFloorAmplitude = sumMean / validBinCount; ++- } ++- else{ ++- localNoiseFloorAmplitude = meanAmplitude; // For the odd case that all the amplitudes are equal... 
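(Editorial aside, not part of the patch: the aligned and unaligned AVX/SSE variants in this hunk all vectorize the same estimate as the generic kernel further down — mean of all bins, then the mean of only those bins that do not exceed mean + exclusion. A minimal plain-C sketch of that estimate follows; the function name is hypothetical.)

#include <stddef.h>

/* Editor's sketch (not part of the patch): the noise-floor estimate that the
 * AVX/SSE kernels in this file vectorize. The name noise_floor_sketch is
 * hypothetical. */
static float noise_floor_sketch(const float* bins, size_t n, float exclusion_db)
{
    if (n == 0)
        return 0.0f;

    float sum = 0.0f;
    for (size_t i = 0; i < n; i++)
        sum += bins[i]; /* sum for the mean over all bins */
    const float threshold = sum / (float)n + exclusion_db;

    float kept_sum = 0.0f;
    size_t kept = 0;
    for (size_t i = 0; i < n; i++) { /* keep only bins at or below the threshold */
        if (bins[i] <= threshold) {
            kept_sum += bins[i];
            kept++;
        }
    }
    /* Fall back to the threshold if nothing was kept (all amplitudes equal). */
    return kept ? kept_sum / (float)kept : threshold;
}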
++- } +++ _mm256_store_ps(avgPointsVector, avgPointsVal); +++ +++ float sumMean = 0.0; +++ sumMean += avgPointsVector[0]; +++ sumMean += avgPointsVector[1]; +++ sumMean += avgPointsVector[2]; +++ sumMean += avgPointsVector[3]; +++ sumMean += avgPointsVector[4]; +++ sumMean += avgPointsVector[5]; +++ sumMean += avgPointsVector[6]; +++ sumMean += avgPointsVector[7]; +++ +++ number = eighthPoints * 8; +++ for (; number < num_points; number++) { +++ sumMean += realDataPoints[number]; +++ } +++ +++ // calculate the spectral mean +++ // +20 because for the comparison below we only want to throw out bins +++ // that are significantly higher (and would, thus, affect the mean more +++ const float meanAmplitude = (sumMean / ((float)num_points)) + spectralExclusionValue; +++ +++ dataPointsPtr = realDataPoints; // Reset the dataPointsPtr +++ __m256 vMeanAmplitudeVector = _mm256_set1_ps(meanAmplitude); +++ __m256 vOnesVector = _mm256_set1_ps(1.0); +++ __m256 vValidBinCount = _mm256_setzero_ps(); +++ avgPointsVal = _mm256_setzero_ps(); +++ __m256 compareMask; +++ number = 0; +++ // Calculate the sum (for mean) for any points which do NOT exceed the mean amplitude +++ for (; number < eighthPoints; number++) { +++ +++ dataPointsVal = _mm256_load_ps(dataPointsPtr); +++ +++ dataPointsPtr += 8; +++ +++ // Identify which items do not exceed the mean amplitude +++ compareMask = _mm256_cmp_ps(dataPointsVal, vMeanAmplitudeVector, _CMP_LE_OQ); +++ +++ // Mask off the items that exceed the mean amplitude and add the avg Points that +++ // do not exceed the mean amplitude +++ avgPointsVal = +++ _mm256_add_ps(avgPointsVal, _mm256_and_ps(compareMask, dataPointsVal)); +++ +++ // Count the number of bins which do not exceed the mean amplitude +++ vValidBinCount = +++ _mm256_add_ps(vValidBinCount, _mm256_and_ps(compareMask, vOnesVector)); +++ } ++ ++- *noiseFloorAmplitude = localNoiseFloorAmplitude; +++ // Calculate the mean from the remaining data points +++ _mm256_store_ps(avgPointsVector, avgPointsVal); +++ +++ sumMean = 0.0; +++ sumMean += avgPointsVector[0]; +++ sumMean += avgPointsVector[1]; +++ sumMean += avgPointsVector[2]; +++ sumMean += avgPointsVector[3]; +++ sumMean += avgPointsVector[4]; +++ sumMean += avgPointsVector[5]; +++ sumMean += avgPointsVector[6]; +++ sumMean += avgPointsVector[7]; +++ +++ // Calculate the number of valid bins from the remaining count +++ __VOLK_ATTR_ALIGNED(32) float validBinCountVector[8]; +++ _mm256_store_ps(validBinCountVector, vValidBinCount); +++ +++ float validBinCount = 0; +++ validBinCount += validBinCountVector[0]; +++ validBinCount += validBinCountVector[1]; +++ validBinCount += validBinCountVector[2]; +++ validBinCount += validBinCountVector[3]; +++ validBinCount += validBinCountVector[4]; +++ validBinCount += validBinCountVector[5]; +++ validBinCount += validBinCountVector[6]; +++ validBinCount += validBinCountVector[7]; +++ +++ number = eighthPoints * 8; +++ for (; number < num_points; number++) { +++ if (realDataPoints[number] <= meanAmplitude) { +++ sumMean += realDataPoints[number]; +++ validBinCount += 1.0; +++ } +++ } +++ +++ float localNoiseFloorAmplitude = 0; +++ if (validBinCount > 0.0) { +++ localNoiseFloorAmplitude = sumMean / validBinCount; +++ } else { +++ localNoiseFloorAmplitude = +++ meanAmplitude; // For the odd case that all the amplitudes are equal... 
+++ } +++ +++ *noiseFloorAmplitude = localNoiseFloorAmplitude; ++ } ++ #endif /* LV_HAVE_AVX */ ++ ++@@ -192,102 +197,103 @@ volk_32f_s32f_calc_spectral_noise_floor_32f_a_sse(float* noiseFloorAmplitude, ++ const float spectralExclusionValue, ++ const unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int quarterPoints = num_points / 4; ++- ++- const float* dataPointsPtr = realDataPoints; ++- __VOLK_ATTR_ALIGNED(16) float avgPointsVector[4]; ++- ++- __m128 dataPointsVal; ++- __m128 avgPointsVal = _mm_setzero_ps(); ++- // Calculate the sum (for mean) for all points ++- for(; number < quarterPoints; number++){ ++- ++- dataPointsVal = _mm_load_ps(dataPointsPtr); ++- ++- dataPointsPtr += 4; ++- ++- avgPointsVal = _mm_add_ps(avgPointsVal, dataPointsVal); ++- } ++- ++- _mm_store_ps(avgPointsVector, avgPointsVal); ++- ++- float sumMean = 0.0; ++- sumMean += avgPointsVector[0]; ++- sumMean += avgPointsVector[1]; ++- sumMean += avgPointsVector[2]; ++- sumMean += avgPointsVector[3]; ++- ++- number = quarterPoints * 4; ++- for(;number < num_points; number++){ ++- sumMean += realDataPoints[number]; ++- } ++- ++- // calculate the spectral mean ++- // +20 because for the comparison below we only want to throw out bins ++- // that are significantly higher (and would, thus, affect the mean more ++- const float meanAmplitude = (sumMean / ((float)num_points)) + spectralExclusionValue; ++- ++- dataPointsPtr = realDataPoints; // Reset the dataPointsPtr ++- __m128 vMeanAmplitudeVector = _mm_set_ps1(meanAmplitude); ++- __m128 vOnesVector = _mm_set_ps1(1.0); ++- __m128 vValidBinCount = _mm_setzero_ps(); ++- avgPointsVal = _mm_setzero_ps(); ++- __m128 compareMask; ++- number = 0; ++- // Calculate the sum (for mean) for any points which do NOT exceed the mean amplitude ++- for(; number < quarterPoints; number++){ ++- ++- dataPointsVal = _mm_load_ps(dataPointsPtr); ++- ++- dataPointsPtr += 4; ++- ++- // Identify which items do not exceed the mean amplitude ++- compareMask = _mm_cmple_ps(dataPointsVal, vMeanAmplitudeVector); ++- ++- // Mask off the items that exceed the mean amplitude and add the avg Points that do not exceed the mean amplitude ++- avgPointsVal = _mm_add_ps(avgPointsVal, _mm_and_ps(compareMask, dataPointsVal)); ++- ++- // Count the number of bins which do not exceed the mean amplitude ++- vValidBinCount = _mm_add_ps(vValidBinCount, _mm_and_ps(compareMask, vOnesVector)); ++- } ++- ++- // Calculate the mean from the remaining data points ++- _mm_store_ps(avgPointsVector, avgPointsVal); ++- ++- sumMean = 0.0; ++- sumMean += avgPointsVector[0]; ++- sumMean += avgPointsVector[1]; ++- sumMean += avgPointsVector[2]; ++- sumMean += avgPointsVector[3]; ++- ++- // Calculate the number of valid bins from the remaining count ++- __VOLK_ATTR_ALIGNED(16) float validBinCountVector[4]; ++- _mm_store_ps(validBinCountVector, vValidBinCount); ++- ++- float validBinCount = 0; ++- validBinCount += validBinCountVector[0]; ++- validBinCount += validBinCountVector[1]; ++- validBinCount += validBinCountVector[2]; ++- validBinCount += validBinCountVector[3]; ++- ++- number = quarterPoints * 4; ++- for(;number < num_points; number++){ ++- if(realDataPoints[number] <= meanAmplitude){ ++- sumMean += realDataPoints[number]; ++- validBinCount += 1.0; +++ unsigned int number = 0; +++ const unsigned int quarterPoints = num_points / 4; +++ +++ const float* dataPointsPtr = realDataPoints; +++ __VOLK_ATTR_ALIGNED(16) float avgPointsVector[4]; +++ +++ __m128 dataPointsVal; +++ __m128 avgPointsVal = 
_mm_setzero_ps(); +++ // Calculate the sum (for mean) for all points +++ for (; number < quarterPoints; number++) { +++ +++ dataPointsVal = _mm_load_ps(dataPointsPtr); +++ +++ dataPointsPtr += 4; +++ +++ avgPointsVal = _mm_add_ps(avgPointsVal, dataPointsVal); +++ } +++ +++ _mm_store_ps(avgPointsVector, avgPointsVal); +++ +++ float sumMean = 0.0; +++ sumMean += avgPointsVector[0]; +++ sumMean += avgPointsVector[1]; +++ sumMean += avgPointsVector[2]; +++ sumMean += avgPointsVector[3]; +++ +++ number = quarterPoints * 4; +++ for (; number < num_points; number++) { +++ sumMean += realDataPoints[number]; +++ } +++ +++ // calculate the spectral mean +++ // +20 because for the comparison below we only want to throw out bins +++ // that are significantly higher (and would, thus, affect the mean more +++ const float meanAmplitude = (sumMean / ((float)num_points)) + spectralExclusionValue; +++ +++ dataPointsPtr = realDataPoints; // Reset the dataPointsPtr +++ __m128 vMeanAmplitudeVector = _mm_set_ps1(meanAmplitude); +++ __m128 vOnesVector = _mm_set_ps1(1.0); +++ __m128 vValidBinCount = _mm_setzero_ps(); +++ avgPointsVal = _mm_setzero_ps(); +++ __m128 compareMask; +++ number = 0; +++ // Calculate the sum (for mean) for any points which do NOT exceed the mean amplitude +++ for (; number < quarterPoints; number++) { +++ +++ dataPointsVal = _mm_load_ps(dataPointsPtr); +++ +++ dataPointsPtr += 4; +++ +++ // Identify which items do not exceed the mean amplitude +++ compareMask = _mm_cmple_ps(dataPointsVal, vMeanAmplitudeVector); +++ +++ // Mask off the items that exceed the mean amplitude and add the avg Points that +++ // do not exceed the mean amplitude +++ avgPointsVal = _mm_add_ps(avgPointsVal, _mm_and_ps(compareMask, dataPointsVal)); +++ +++ // Count the number of bins which do not exceed the mean amplitude +++ vValidBinCount = _mm_add_ps(vValidBinCount, _mm_and_ps(compareMask, vOnesVector)); ++ } ++- } ++ ++- float localNoiseFloorAmplitude = 0; ++- if(validBinCount > 0.0){ ++- localNoiseFloorAmplitude = sumMean / validBinCount; ++- } ++- else{ ++- localNoiseFloorAmplitude = meanAmplitude; // For the odd case that all the amplitudes are equal... ++- } +++ // Calculate the mean from the remaining data points +++ _mm_store_ps(avgPointsVector, avgPointsVal); +++ +++ sumMean = 0.0; +++ sumMean += avgPointsVector[0]; +++ sumMean += avgPointsVector[1]; +++ sumMean += avgPointsVector[2]; +++ sumMean += avgPointsVector[3]; +++ +++ // Calculate the number of valid bins from the remaining count +++ __VOLK_ATTR_ALIGNED(16) float validBinCountVector[4]; +++ _mm_store_ps(validBinCountVector, vValidBinCount); +++ +++ float validBinCount = 0; +++ validBinCount += validBinCountVector[0]; +++ validBinCount += validBinCountVector[1]; +++ validBinCount += validBinCountVector[2]; +++ validBinCount += validBinCountVector[3]; +++ +++ number = quarterPoints * 4; +++ for (; number < num_points; number++) { +++ if (realDataPoints[number] <= meanAmplitude) { +++ sumMean += realDataPoints[number]; +++ validBinCount += 1.0; +++ } +++ } +++ +++ float localNoiseFloorAmplitude = 0; +++ if (validBinCount > 0.0) { +++ localNoiseFloorAmplitude = sumMean / validBinCount; +++ } else { +++ localNoiseFloorAmplitude = +++ meanAmplitude; // For the odd case that all the amplitudes are equal... 
+++ } ++ ++- *noiseFloorAmplitude = localNoiseFloorAmplitude; +++ *noiseFloorAmplitude = localNoiseFloorAmplitude; ++ } ++ #endif /* LV_HAVE_SSE */ ++ ++@@ -300,36 +306,36 @@ volk_32f_s32f_calc_spectral_noise_floor_32f_generic(float* noiseFloorAmplitude, ++ const float spectralExclusionValue, ++ const unsigned int num_points) ++ { ++- float sumMean = 0.0; ++- unsigned int number; ++- // find the sum (for mean), etc ++- for(number = 0; number < num_points; number++){ ++- // sum (for mean) ++- sumMean += realDataPoints[number]; ++- } ++- ++- // calculate the spectral mean ++- // +20 because for the comparison below we only want to throw out bins ++- // that are significantly higher (and would, thus, affect the mean more) ++- const float meanAmplitude = (sumMean / num_points) + spectralExclusionValue; ++- ++- // now throw out any bins higher than the mean ++- sumMean = 0.0; ++- unsigned int newNumDataPoints = num_points; ++- for(number = 0; number < num_points; number++){ ++- if (realDataPoints[number] <= meanAmplitude) ++- sumMean += realDataPoints[number]; ++- else ++- newNumDataPoints--; ++- } +++ float sumMean = 0.0; +++ unsigned int number; +++ // find the sum (for mean), etc +++ for (number = 0; number < num_points; number++) { +++ // sum (for mean) +++ sumMean += realDataPoints[number]; +++ } +++ +++ // calculate the spectral mean +++ // +20 because for the comparison below we only want to throw out bins +++ // that are significantly higher (and would, thus, affect the mean more) +++ const float meanAmplitude = (sumMean / num_points) + spectralExclusionValue; +++ +++ // now throw out any bins higher than the mean +++ sumMean = 0.0; +++ unsigned int newNumDataPoints = num_points; +++ for (number = 0; number < num_points; number++) { +++ if (realDataPoints[number] <= meanAmplitude) +++ sumMean += realDataPoints[number]; +++ else +++ newNumDataPoints--; +++ } ++ ++- float localNoiseFloorAmplitude = 0.0; ++- if (newNumDataPoints == 0) // in the odd case that all ++- localNoiseFloorAmplitude = meanAmplitude; // amplitudes are equal! ++- else ++- localNoiseFloorAmplitude = sumMean / ((float)newNumDataPoints); +++ float localNoiseFloorAmplitude = 0.0; +++ if (newNumDataPoints == 0) // in the odd case that all +++ localNoiseFloorAmplitude = meanAmplitude; // amplitudes are equal! 
+++ else +++ localNoiseFloorAmplitude = sumMean / ((float)newNumDataPoints); ++ ++- *noiseFloorAmplitude = localNoiseFloorAmplitude; +++ *noiseFloorAmplitude = localNoiseFloorAmplitude; ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++@@ -339,9 +345,9 @@ volk_32f_s32f_calc_spectral_noise_floor_32f_generic(float* noiseFloorAmplitude, ++ #ifndef INCLUDED_volk_32f_s32f_calc_spectral_noise_floor_32f_u_H ++ #define INCLUDED_volk_32f_s32f_calc_spectral_noise_floor_32f_u_H ++ ++-#include ++ #include ++ #include +++#include ++ ++ #ifdef LV_HAVE_AVX ++ #include ++@@ -352,114 +358,117 @@ volk_32f_s32f_calc_spectral_noise_floor_32f_u_avx(float* noiseFloorAmplitude, ++ const float spectralExclusionValue, ++ const unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int eighthPoints = num_points / 8; ++- ++- const float* dataPointsPtr = realDataPoints; ++- __VOLK_ATTR_ALIGNED(16) float avgPointsVector[8]; ++- ++- __m256 dataPointsVal; ++- __m256 avgPointsVal = _mm256_setzero_ps(); ++- // Calculate the sum (for mean) for all points ++- for(; number < eighthPoints; number++){ ++- ++- dataPointsVal = _mm256_loadu_ps(dataPointsPtr); ++- ++- dataPointsPtr += 8; ++- ++- avgPointsVal = _mm256_add_ps(avgPointsVal, dataPointsVal); ++- } ++- ++- _mm256_storeu_ps(avgPointsVector, avgPointsVal); ++- ++- float sumMean = 0.0; ++- sumMean += avgPointsVector[0]; ++- sumMean += avgPointsVector[1]; ++- sumMean += avgPointsVector[2]; ++- sumMean += avgPointsVector[3]; ++- sumMean += avgPointsVector[4]; ++- sumMean += avgPointsVector[5]; ++- sumMean += avgPointsVector[6]; ++- sumMean += avgPointsVector[7]; ++- ++- number = eighthPoints * 8; ++- for(;number < num_points; number++){ ++- sumMean += realDataPoints[number]; ++- } ++- ++- // calculate the spectral mean ++- // +20 because for the comparison below we only want to throw out bins ++- // that are significantly higher (and would, thus, affect the mean more ++- const float meanAmplitude = (sumMean / ((float)num_points)) + spectralExclusionValue; ++- ++- dataPointsPtr = realDataPoints; // Reset the dataPointsPtr ++- __m256 vMeanAmplitudeVector = _mm256_set1_ps(meanAmplitude); ++- __m256 vOnesVector = _mm256_set1_ps(1.0); ++- __m256 vValidBinCount = _mm256_setzero_ps(); ++- avgPointsVal = _mm256_setzero_ps(); ++- __m256 compareMask; ++- number = 0; ++- // Calculate the sum (for mean) for any points which do NOT exceed the mean amplitude ++- for(; number < eighthPoints; number++){ ++- ++- dataPointsVal = _mm256_loadu_ps(dataPointsPtr); ++- ++- dataPointsPtr += 8; ++- ++- // Identify which items do not exceed the mean amplitude ++- compareMask = _mm256_cmp_ps(dataPointsVal, vMeanAmplitudeVector, _CMP_LE_OQ); ++- ++- // Mask off the items that exceed the mean amplitude and add the avg Points that do not exceed the mean amplitude ++- avgPointsVal = _mm256_add_ps(avgPointsVal, _mm256_and_ps(compareMask, dataPointsVal)); ++- ++- // Count the number of bins which do not exceed the mean amplitude ++- vValidBinCount = _mm256_add_ps(vValidBinCount, _mm256_and_ps(compareMask, vOnesVector)); ++- } ++- ++- // Calculate the mean from the remaining data points ++- _mm256_storeu_ps(avgPointsVector, avgPointsVal); ++- ++- sumMean = 0.0; ++- sumMean += avgPointsVector[0]; ++- sumMean += avgPointsVector[1]; ++- sumMean += avgPointsVector[2]; ++- sumMean += avgPointsVector[3]; ++- sumMean += avgPointsVector[4]; ++- sumMean += avgPointsVector[5]; ++- sumMean += avgPointsVector[6]; ++- sumMean += avgPointsVector[7]; ++- ++- // Calculate the number of valid bins from the 
remaining count ++- __VOLK_ATTR_ALIGNED(16) float validBinCountVector[8]; ++- _mm256_storeu_ps(validBinCountVector, vValidBinCount); ++- ++- float validBinCount = 0; ++- validBinCount += validBinCountVector[0]; ++- validBinCount += validBinCountVector[1]; ++- validBinCount += validBinCountVector[2]; ++- validBinCount += validBinCountVector[3]; ++- validBinCount += validBinCountVector[4]; ++- validBinCount += validBinCountVector[5]; ++- validBinCount += validBinCountVector[6]; ++- validBinCount += validBinCountVector[7]; ++- ++- number = eighthPoints * 8; ++- for(;number < num_points; number++){ ++- if(realDataPoints[number] <= meanAmplitude){ ++- sumMean += realDataPoints[number]; ++- validBinCount += 1.0; +++ unsigned int number = 0; +++ const unsigned int eighthPoints = num_points / 8; +++ +++ const float* dataPointsPtr = realDataPoints; +++ __VOLK_ATTR_ALIGNED(16) float avgPointsVector[8]; +++ +++ __m256 dataPointsVal; +++ __m256 avgPointsVal = _mm256_setzero_ps(); +++ // Calculate the sum (for mean) for all points +++ for (; number < eighthPoints; number++) { +++ +++ dataPointsVal = _mm256_loadu_ps(dataPointsPtr); +++ +++ dataPointsPtr += 8; +++ +++ avgPointsVal = _mm256_add_ps(avgPointsVal, dataPointsVal); +++ } +++ +++ _mm256_storeu_ps(avgPointsVector, avgPointsVal); +++ +++ float sumMean = 0.0; +++ sumMean += avgPointsVector[0]; +++ sumMean += avgPointsVector[1]; +++ sumMean += avgPointsVector[2]; +++ sumMean += avgPointsVector[3]; +++ sumMean += avgPointsVector[4]; +++ sumMean += avgPointsVector[5]; +++ sumMean += avgPointsVector[6]; +++ sumMean += avgPointsVector[7]; +++ +++ number = eighthPoints * 8; +++ for (; number < num_points; number++) { +++ sumMean += realDataPoints[number]; +++ } +++ +++ // calculate the spectral mean +++ // +20 because for the comparison below we only want to throw out bins +++ // that are significantly higher (and would, thus, affect the mean more +++ const float meanAmplitude = (sumMean / ((float)num_points)) + spectralExclusionValue; +++ +++ dataPointsPtr = realDataPoints; // Reset the dataPointsPtr +++ __m256 vMeanAmplitudeVector = _mm256_set1_ps(meanAmplitude); +++ __m256 vOnesVector = _mm256_set1_ps(1.0); +++ __m256 vValidBinCount = _mm256_setzero_ps(); +++ avgPointsVal = _mm256_setzero_ps(); +++ __m256 compareMask; +++ number = 0; +++ // Calculate the sum (for mean) for any points which do NOT exceed the mean amplitude +++ for (; number < eighthPoints; number++) { +++ +++ dataPointsVal = _mm256_loadu_ps(dataPointsPtr); +++ +++ dataPointsPtr += 8; +++ +++ // Identify which items do not exceed the mean amplitude +++ compareMask = _mm256_cmp_ps(dataPointsVal, vMeanAmplitudeVector, _CMP_LE_OQ); +++ +++ // Mask off the items that exceed the mean amplitude and add the avg Points that +++ // do not exceed the mean amplitude +++ avgPointsVal = +++ _mm256_add_ps(avgPointsVal, _mm256_and_ps(compareMask, dataPointsVal)); +++ +++ // Count the number of bins which do not exceed the mean amplitude +++ vValidBinCount = +++ _mm256_add_ps(vValidBinCount, _mm256_and_ps(compareMask, vOnesVector)); +++ } +++ +++ // Calculate the mean from the remaining data points +++ _mm256_storeu_ps(avgPointsVector, avgPointsVal); +++ +++ sumMean = 0.0; +++ sumMean += avgPointsVector[0]; +++ sumMean += avgPointsVector[1]; +++ sumMean += avgPointsVector[2]; +++ sumMean += avgPointsVector[3]; +++ sumMean += avgPointsVector[4]; +++ sumMean += avgPointsVector[5]; +++ sumMean += avgPointsVector[6]; +++ sumMean += avgPointsVector[7]; +++ +++ // Calculate the number of valid bins from the 
remaining count +++ __VOLK_ATTR_ALIGNED(16) float validBinCountVector[8]; +++ _mm256_storeu_ps(validBinCountVector, vValidBinCount); +++ +++ float validBinCount = 0; +++ validBinCount += validBinCountVector[0]; +++ validBinCount += validBinCountVector[1]; +++ validBinCount += validBinCountVector[2]; +++ validBinCount += validBinCountVector[3]; +++ validBinCount += validBinCountVector[4]; +++ validBinCount += validBinCountVector[5]; +++ validBinCount += validBinCountVector[6]; +++ validBinCount += validBinCountVector[7]; +++ +++ number = eighthPoints * 8; +++ for (; number < num_points; number++) { +++ if (realDataPoints[number] <= meanAmplitude) { +++ sumMean += realDataPoints[number]; +++ validBinCount += 1.0; +++ } ++ } ++- } ++ ++- float localNoiseFloorAmplitude = 0; ++- if(validBinCount > 0.0){ ++- localNoiseFloorAmplitude = sumMean / validBinCount; ++- } ++- else{ ++- localNoiseFloorAmplitude = meanAmplitude; // For the odd case that all the amplitudes are equal... ++- } +++ float localNoiseFloorAmplitude = 0; +++ if (validBinCount > 0.0) { +++ localNoiseFloorAmplitude = sumMean / validBinCount; +++ } else { +++ localNoiseFloorAmplitude = +++ meanAmplitude; // For the odd case that all the amplitudes are equal... +++ } ++ ++- *noiseFloorAmplitude = localNoiseFloorAmplitude; +++ *noiseFloorAmplitude = localNoiseFloorAmplitude; ++ } ++ #endif /* LV_HAVE_AVX */ ++ #endif /* INCLUDED_volk_32f_s32f_calc_spectral_noise_floor_32f_u_H */ ++diff --git a/kernels/volk/volk_32f_s32f_convert_16i.h b/kernels/volk/volk_32f_s32f_convert_16i.h ++index 27ef4d9..c9469b7 100644 ++--- a/kernels/volk/volk_32f_s32f_convert_16i.h +++++ b/kernels/volk/volk_32f_s32f_convert_16i.h ++@@ -30,8 +30,8 @@ ++ * ++ * Dispatcher Prototype ++ * \code ++- * void volk_32f_s32f_convert_16i(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points) ++- * \endcode +++ * void volk_32f_s32f_convert_16i(int16_t* outputVector, const float* inputVector, const +++ * float scalar, unsigned int num_points) \endcode ++ * ++ * \b Inputs ++ * \li inputVector: the input vector of floats. ++@@ -42,11 +42,10 @@ ++ * \li outputVector: The output vector. 
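[Editorial note, not part of the upstream diff] The volk_32f_s32f_calc_spectral_noise_floor_32f AVX hunk above implements a two-pass estimate: sum every bin to get the spectral mean, add the exclusion offset, then average only the bins that do not exceed that threshold, falling back to the threshold itself when no bin qualifies. A minimal scalar sketch of the same computation, using hypothetical names:

static float noise_floor_sketch(const float* bins, float exclusion, unsigned int n)
{
    float sum = 0.f;
    for (unsigned int i = 0; i < n; i++)
        sum += bins[i];
    /* Bins far above the mean would drag the estimate up, so exclude them. */
    const float threshold = sum / (float)n + exclusion;

    float kept_sum = 0.f;
    float kept_count = 0.f;
    for (unsigned int i = 0; i < n; i++) {
        if (bins[i] <= threshold) {
            kept_sum += bins[i];
            kept_count += 1.f;
        }
    }
    /* Degenerate case: no bin passed the test, so fall back to the threshold. */
    return (kept_count > 0.f) ? kept_sum / kept_count : threshold;
}
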
++ * ++ * \b Example ++- * Convert floats from [-1,1] to 16-bit integers with a scale of 5 to maintain smallest delta ++- * int N = 10; ++- * unsigned int alignment = volk_get_alignment(); ++- * float* increasing = (float*)volk_malloc(sizeof(float)*N, alignment); ++- * int16_t* out = (int16_t*)volk_malloc(sizeof(int16_t)*N, alignment); +++ * Convert floats from [-1,1] to 16-bit integers with a scale of 5 to maintain smallest +++ * delta int N = 10; unsigned int alignment = volk_get_alignment(); float* increasing = +++ * (float*)volk_malloc(sizeof(float)*N, alignment); int16_t* out = +++ * (int16_t*)volk_malloc(sizeof(int16_t)*N, alignment); ++ * ++ * for(unsigned int ii = 0; ii < N; ++ii){ ++ * increasing[ii] = 2.f * ((float)ii / (float)N) - 1.f; ++@@ -76,55 +75,60 @@ ++ #ifdef LV_HAVE_AVX2 ++ #include ++ ++-static inline void ++-volk_32f_s32f_convert_16i_u_avx2(int16_t* outputVector, const float* inputVector, ++- const float scalar, unsigned int num_points) +++static inline void volk_32f_s32f_convert_16i_u_avx2(int16_t* outputVector, +++ const float* inputVector, +++ const float scalar, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- ++- const unsigned int sixteenthPoints = num_points / 16; ++- ++- const float* inputVectorPtr = (const float*)inputVector; ++- int16_t* outputVectorPtr = outputVector; ++- ++- float min_val = SHRT_MIN; ++- float max_val = SHRT_MAX; ++- float r; ++- ++- __m256 vScalar = _mm256_set1_ps(scalar); ++- __m256 inputVal1, inputVal2; ++- __m256i intInputVal1, intInputVal2; ++- __m256 ret1, ret2; ++- __m256 vmin_val = _mm256_set1_ps(min_val); ++- __m256 vmax_val = _mm256_set1_ps(max_val); ++- ++- for(;number < sixteenthPoints; number++){ ++- inputVal1 = _mm256_loadu_ps(inputVectorPtr); inputVectorPtr += 8; ++- inputVal2 = _mm256_loadu_ps(inputVectorPtr); inputVectorPtr += 8; ++- ++- // Scale and clip ++- ret1 = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal1, vScalar), vmax_val), vmin_val); ++- ret2 = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal2, vScalar), vmax_val), vmin_val); ++- ++- intInputVal1 = _mm256_cvtps_epi32(ret1); ++- intInputVal2 = _mm256_cvtps_epi32(ret2); ++- ++- intInputVal1 = _mm256_packs_epi32(intInputVal1, intInputVal2); ++- intInputVal1 = _mm256_permute4x64_epi64(intInputVal1, 0b11011000); ++- ++- _mm256_storeu_si256((__m256i*)outputVectorPtr, intInputVal1); ++- outputVectorPtr += 16; ++- } ++- ++- number = sixteenthPoints * 16; ++- for(; number < num_points; number++){ ++- r = inputVector[number] * scalar; ++- if(r > max_val) ++- r = max_val; ++- else if(r < min_val) ++- r = min_val; ++- outputVector[number] = (int16_t)rintf(r); ++- } +++ unsigned int number = 0; +++ +++ const unsigned int sixteenthPoints = num_points / 16; +++ +++ const float* inputVectorPtr = (const float*)inputVector; +++ int16_t* outputVectorPtr = outputVector; +++ +++ float min_val = SHRT_MIN; +++ float max_val = SHRT_MAX; +++ float r; +++ +++ __m256 vScalar = _mm256_set1_ps(scalar); +++ __m256 inputVal1, inputVal2; +++ __m256i intInputVal1, intInputVal2; +++ __m256 ret1, ret2; +++ __m256 vmin_val = _mm256_set1_ps(min_val); +++ __m256 vmax_val = _mm256_set1_ps(max_val); +++ +++ for (; number < sixteenthPoints; number++) { +++ inputVal1 = _mm256_loadu_ps(inputVectorPtr); +++ inputVectorPtr += 8; +++ inputVal2 = _mm256_loadu_ps(inputVectorPtr); +++ inputVectorPtr += 8; +++ +++ // Scale and clip +++ ret1 = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal1, vScalar), vmax_val), +++ vmin_val); +++ ret2 = 
_mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal2, vScalar), vmax_val), +++ vmin_val); +++ +++ intInputVal1 = _mm256_cvtps_epi32(ret1); +++ intInputVal2 = _mm256_cvtps_epi32(ret2); +++ +++ intInputVal1 = _mm256_packs_epi32(intInputVal1, intInputVal2); +++ intInputVal1 = _mm256_permute4x64_epi64(intInputVal1, 0b11011000); +++ +++ _mm256_storeu_si256((__m256i*)outputVectorPtr, intInputVal1); +++ outputVectorPtr += 16; +++ } +++ +++ number = sixteenthPoints * 16; +++ for (; number < num_points; number++) { +++ r = inputVector[number] * scalar; +++ if (r > max_val) +++ r = max_val; +++ else if (r < min_val) +++ r = min_val; +++ outputVector[number] = (int16_t)rintf(r); +++ } ++ } ++ #endif /* LV_HAVE_AVX2 */ ++ ++@@ -132,54 +136,57 @@ volk_32f_s32f_convert_16i_u_avx2(int16_t* outputVector, const float* inputVector ++ #ifdef LV_HAVE_AVX ++ #include ++ ++-static inline void ++-volk_32f_s32f_convert_16i_u_avx(int16_t* outputVector, const float* inputVector, ++- const float scalar, unsigned int num_points) +++static inline void volk_32f_s32f_convert_16i_u_avx(int16_t* outputVector, +++ const float* inputVector, +++ const float scalar, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; +++ unsigned int number = 0; ++ ++- const unsigned int eighthPoints = num_points / 8; +++ const unsigned int eighthPoints = num_points / 8; ++ ++- const float* inputVectorPtr = (const float*)inputVector; ++- int16_t* outputVectorPtr = outputVector; +++ const float* inputVectorPtr = (const float*)inputVector; +++ int16_t* outputVectorPtr = outputVector; ++ ++- float min_val = SHRT_MIN; ++- float max_val = SHRT_MAX; ++- float r; +++ float min_val = SHRT_MIN; +++ float max_val = SHRT_MAX; +++ float r; ++ ++- __m256 vScalar = _mm256_set1_ps(scalar); ++- __m256 inputVal, ret; ++- __m256i intInputVal; ++- __m128i intInputVal1, intInputVal2; ++- __m256 vmin_val = _mm256_set1_ps(min_val); ++- __m256 vmax_val = _mm256_set1_ps(max_val); +++ __m256 vScalar = _mm256_set1_ps(scalar); +++ __m256 inputVal, ret; +++ __m256i intInputVal; +++ __m128i intInputVal1, intInputVal2; +++ __m256 vmin_val = _mm256_set1_ps(min_val); +++ __m256 vmax_val = _mm256_set1_ps(max_val); ++ ++- for(;number < eighthPoints; number++){ ++- inputVal = _mm256_loadu_ps(inputVectorPtr); inputVectorPtr += 8; +++ for (; number < eighthPoints; number++) { +++ inputVal = _mm256_loadu_ps(inputVectorPtr); +++ inputVectorPtr += 8; ++ ++- // Scale and clip ++- ret = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal, vScalar), vmax_val), vmin_val); +++ // Scale and clip +++ ret = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal, vScalar), vmax_val), +++ vmin_val); ++ ++- intInputVal = _mm256_cvtps_epi32(ret); +++ intInputVal = _mm256_cvtps_epi32(ret); ++ ++- intInputVal1 = _mm256_extractf128_si256(intInputVal, 0); ++- intInputVal2 = _mm256_extractf128_si256(intInputVal, 1); +++ intInputVal1 = _mm256_extractf128_si256(intInputVal, 0); +++ intInputVal2 = _mm256_extractf128_si256(intInputVal, 1); ++ ++- intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); +++ intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); ++ ++- _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1); ++- outputVectorPtr += 8; ++- } +++ _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1); +++ outputVectorPtr += 8; +++ } ++ ++- number = eighthPoints * 8; ++- for(; number < num_points; number++){ ++- r = inputVector[number] * scalar; ++- if(r > max_val) ++- r = max_val; ++- else if(r < min_val) ++- r = min_val; ++- outputVector[number] = (int16_t)rintf(r); ++- } 
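[Editorial note, not part of the upstream diff] The AVX2 variants above follow _mm256_packs_epi32 with _mm256_permute4x64_epi64(x, 0b11011000) because the pack operates independently per 128-bit lane, so the sixteen int16 results come out lane-interleaved. A small self-contained check of that reordering (illustrative only; compile with -mavx2):

#include <immintrin.h>
#include <stdio.h>

int main(void)
{
    __m256i lo = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
    __m256i hi = _mm256_setr_epi32(8, 9, 10, 11, 12, 13, 14, 15);

    __m256i packed = _mm256_packs_epi32(lo, hi); /* packs within each 128-bit lane */
    __m256i fixed = _mm256_permute4x64_epi64(packed, 0b11011000);

    short a[16], b[16];
    _mm256_storeu_si256((__m256i*)a, packed);
    _mm256_storeu_si256((__m256i*)b, fixed);

    for (int i = 0; i < 16; i++)
        printf("%d ", a[i]); /* prints 0..3 8..11 4..7 12..15 (lane-interleaved) */
    printf("\n");
    for (int i = 0; i < 16; i++)
        printf("%d ", b[i]); /* prints 0..15 in order */
    printf("\n");
    return 0;
}
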
+++ number = eighthPoints * 8; +++ for (; number < num_points; number++) { +++ r = inputVector[number] * scalar; +++ if (r > max_val) +++ r = max_val; +++ else if (r < min_val) +++ r = min_val; +++ outputVector[number] = (int16_t)rintf(r); +++ } ++ } ++ #endif /* LV_HAVE_AVX */ ++ ++@@ -187,54 +194,57 @@ volk_32f_s32f_convert_16i_u_avx(int16_t* outputVector, const float* inputVector, ++ #ifdef LV_HAVE_SSE2 ++ #include ++ ++-static inline void ++-volk_32f_s32f_convert_16i_u_sse2(int16_t* outputVector, const float* inputVector, ++- const float scalar, unsigned int num_points) +++static inline void volk_32f_s32f_convert_16i_u_sse2(int16_t* outputVector, +++ const float* inputVector, +++ const float scalar, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- ++- const unsigned int eighthPoints = num_points / 8; ++- ++- const float* inputVectorPtr = (const float*)inputVector; ++- int16_t* outputVectorPtr = outputVector; ++- ++- float min_val = SHRT_MIN; ++- float max_val = SHRT_MAX; ++- float r; ++- ++- __m128 vScalar = _mm_set_ps1(scalar); ++- __m128 inputVal1, inputVal2; ++- __m128i intInputVal1, intInputVal2; ++- __m128 ret1, ret2; ++- __m128 vmin_val = _mm_set_ps1(min_val); ++- __m128 vmax_val = _mm_set_ps1(max_val); ++- ++- for(;number < eighthPoints; number++){ ++- inputVal1 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4; ++- inputVal2 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4; ++- ++- // Scale and clip ++- ret1 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val); ++- ret2 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal2, vScalar), vmax_val), vmin_val); ++- ++- intInputVal1 = _mm_cvtps_epi32(ret1); ++- intInputVal2 = _mm_cvtps_epi32(ret2); ++- ++- intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); ++- ++- _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1); ++- outputVectorPtr += 8; ++- } ++- ++- number = eighthPoints * 8; ++- for(; number < num_points; number++){ ++- r = inputVector[number] * scalar; ++- if(r > max_val) ++- r = max_val; ++- else if(r < min_val) ++- r = min_val; ++- outputVector[number] = (int16_t)rintf(r); ++- } +++ unsigned int number = 0; +++ +++ const unsigned int eighthPoints = num_points / 8; +++ +++ const float* inputVectorPtr = (const float*)inputVector; +++ int16_t* outputVectorPtr = outputVector; +++ +++ float min_val = SHRT_MIN; +++ float max_val = SHRT_MAX; +++ float r; +++ +++ __m128 vScalar = _mm_set_ps1(scalar); +++ __m128 inputVal1, inputVal2; +++ __m128i intInputVal1, intInputVal2; +++ __m128 ret1, ret2; +++ __m128 vmin_val = _mm_set_ps1(min_val); +++ __m128 vmax_val = _mm_set_ps1(max_val); +++ +++ for (; number < eighthPoints; number++) { +++ inputVal1 = _mm_loadu_ps(inputVectorPtr); +++ inputVectorPtr += 4; +++ inputVal2 = _mm_loadu_ps(inputVectorPtr); +++ inputVectorPtr += 4; +++ +++ // Scale and clip +++ ret1 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val); +++ ret2 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal2, vScalar), vmax_val), vmin_val); +++ +++ intInputVal1 = _mm_cvtps_epi32(ret1); +++ intInputVal2 = _mm_cvtps_epi32(ret2); +++ +++ intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); +++ +++ _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1); +++ outputVectorPtr += 8; +++ } +++ +++ number = eighthPoints * 8; +++ for (; number < num_points; number++) { +++ r = inputVector[number] * scalar; +++ if (r > max_val) +++ r = max_val; +++ else if (r < min_val) +++ r = min_val; +++ outputVector[number] = (int16_t)rintf(r); +++ } ++ } ++ #endif /* 
LV_HAVE_SSE2 */ ++ ++@@ -242,76 +252,78 @@ volk_32f_s32f_convert_16i_u_sse2(int16_t* outputVector, const float* inputVector ++ #ifdef LV_HAVE_SSE ++ #include ++ ++-static inline void ++-volk_32f_s32f_convert_16i_u_sse(int16_t* outputVector, const float* inputVector, ++- const float scalar, unsigned int num_points) +++static inline void volk_32f_s32f_convert_16i_u_sse(int16_t* outputVector, +++ const float* inputVector, +++ const float scalar, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- ++- const unsigned int quarterPoints = num_points / 4; ++- ++- const float* inputVectorPtr = (const float*)inputVector; ++- int16_t* outputVectorPtr = outputVector; ++- ++- float min_val = SHRT_MIN; ++- float max_val = SHRT_MAX; ++- float r; ++- ++- __m128 vScalar = _mm_set_ps1(scalar); ++- __m128 ret; ++- __m128 vmin_val = _mm_set_ps1(min_val); ++- __m128 vmax_val = _mm_set_ps1(max_val); ++- ++- __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4]; ++- ++- for(;number < quarterPoints; number++){ ++- ret = _mm_loadu_ps(inputVectorPtr); ++- inputVectorPtr += 4; ++- ++- // Scale and clip ++- ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val); ++- ++- _mm_store_ps(outputFloatBuffer, ret); ++- *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[0]); ++- *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[1]); ++- *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[2]); ++- *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[3]); ++- } ++- ++- number = quarterPoints * 4; ++- for(; number < num_points; number++){ ++- r = inputVector[number] * scalar; ++- if(r > max_val) ++- r = max_val; ++- else if(r < min_val) ++- r = min_val; ++- outputVector[number] = (int16_t)rintf(r); ++- } +++ unsigned int number = 0; +++ +++ const unsigned int quarterPoints = num_points / 4; +++ +++ const float* inputVectorPtr = (const float*)inputVector; +++ int16_t* outputVectorPtr = outputVector; +++ +++ float min_val = SHRT_MIN; +++ float max_val = SHRT_MAX; +++ float r; +++ +++ __m128 vScalar = _mm_set_ps1(scalar); +++ __m128 ret; +++ __m128 vmin_val = _mm_set_ps1(min_val); +++ __m128 vmax_val = _mm_set_ps1(max_val); +++ +++ __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4]; +++ +++ for (; number < quarterPoints; number++) { +++ ret = _mm_loadu_ps(inputVectorPtr); +++ inputVectorPtr += 4; +++ +++ // Scale and clip +++ ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val); +++ +++ _mm_store_ps(outputFloatBuffer, ret); +++ *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[0]); +++ *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[1]); +++ *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[2]); +++ *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[3]); +++ } +++ +++ number = quarterPoints * 4; +++ for (; number < num_points; number++) { +++ r = inputVector[number] * scalar; +++ if (r > max_val) +++ r = max_val; +++ else if (r < min_val) +++ r = min_val; +++ outputVector[number] = (int16_t)rintf(r); +++ } ++ } ++ #endif /* LV_HAVE_SSE */ ++ ++ ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void ++-volk_32f_s32f_convert_16i_generic(int16_t* outputVector, const float* inputVector, ++- const float scalar, unsigned int num_points) +++static inline void volk_32f_s32f_convert_16i_generic(int16_t* outputVector, +++ const float* inputVector, +++ const float scalar, +++ unsigned int num_points) ++ { ++- int16_t* outputVectorPtr = outputVector; ++- const float* inputVectorPtr = inputVector; ++- unsigned int number = 0; ++- float min_val = SHRT_MIN; ++- float 
max_val = SHRT_MAX; ++- float r; ++- ++- for(number = 0; number < num_points; number++){ ++- r = *inputVectorPtr++ * scalar; ++- if(r > max_val) ++- r = max_val; ++- else if(r < min_val) ++- r = min_val; ++- *outputVectorPtr++ = (int16_t)rintf(r); ++- } +++ int16_t* outputVectorPtr = outputVector; +++ const float* inputVectorPtr = inputVector; +++ unsigned int number = 0; +++ float min_val = SHRT_MIN; +++ float max_val = SHRT_MAX; +++ float r; +++ +++ for (number = 0; number < num_points; number++) { +++ r = *inputVectorPtr++ * scalar; +++ if (r > max_val) +++ r = max_val; +++ else if (r < min_val) +++ r = min_val; +++ *outputVectorPtr++ = (int16_t)rintf(r); +++ } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++@@ -320,63 +332,68 @@ volk_32f_s32f_convert_16i_generic(int16_t* outputVector, const float* inputVecto ++ #ifndef INCLUDED_volk_32f_s32f_convert_16i_a_H ++ #define INCLUDED_volk_32f_s32f_convert_16i_a_H ++ ++-#include ++ #include ++-#include ++ #include +++#include +++#include ++ ++ #ifdef LV_HAVE_AVX2 ++ #include ++ ++-static inline void ++-volk_32f_s32f_convert_16i_a_avx2(int16_t* outputVector, const float* inputVector, ++- const float scalar, unsigned int num_points) +++static inline void volk_32f_s32f_convert_16i_a_avx2(int16_t* outputVector, +++ const float* inputVector, +++ const float scalar, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- ++- const unsigned int sixteenthPoints = num_points / 16; ++- ++- const float* inputVectorPtr = (const float*)inputVector; ++- int16_t* outputVectorPtr = outputVector; ++- ++- float min_val = SHRT_MIN; ++- float max_val = SHRT_MAX; ++- float r; ++- ++- __m256 vScalar = _mm256_set1_ps(scalar); ++- __m256 inputVal1, inputVal2; ++- __m256i intInputVal1, intInputVal2; ++- __m256 ret1, ret2; ++- __m256 vmin_val = _mm256_set1_ps(min_val); ++- __m256 vmax_val = _mm256_set1_ps(max_val); ++- ++- for(;number < sixteenthPoints; number++){ ++- inputVal1 = _mm256_load_ps(inputVectorPtr); inputVectorPtr += 8; ++- inputVal2 = _mm256_load_ps(inputVectorPtr); inputVectorPtr += 8; ++- ++- // Scale and clip ++- ret1 = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal1, vScalar), vmax_val), vmin_val); ++- ret2 = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal2, vScalar), vmax_val), vmin_val); ++- ++- intInputVal1 = _mm256_cvtps_epi32(ret1); ++- intInputVal2 = _mm256_cvtps_epi32(ret2); ++- ++- intInputVal1 = _mm256_packs_epi32(intInputVal1, intInputVal2); ++- intInputVal1 = _mm256_permute4x64_epi64(intInputVal1, 0b11011000); ++- ++- _mm256_store_si256((__m256i*)outputVectorPtr, intInputVal1); ++- outputVectorPtr += 16; ++- } ++- ++- number = sixteenthPoints * 16; ++- for(; number < num_points; number++){ ++- r = inputVector[number] * scalar; ++- if(r > max_val) ++- r = max_val; ++- else if(r < min_val) ++- r = min_val; ++- outputVector[number] = (int16_t)rintf(r); ++- } +++ unsigned int number = 0; +++ +++ const unsigned int sixteenthPoints = num_points / 16; +++ +++ const float* inputVectorPtr = (const float*)inputVector; +++ int16_t* outputVectorPtr = outputVector; +++ +++ float min_val = SHRT_MIN; +++ float max_val = SHRT_MAX; +++ float r; +++ +++ __m256 vScalar = _mm256_set1_ps(scalar); +++ __m256 inputVal1, inputVal2; +++ __m256i intInputVal1, intInputVal2; +++ __m256 ret1, ret2; +++ __m256 vmin_val = _mm256_set1_ps(min_val); +++ __m256 vmax_val = _mm256_set1_ps(max_val); +++ +++ for (; number < sixteenthPoints; number++) { +++ inputVal1 = _mm256_load_ps(inputVectorPtr); +++ inputVectorPtr += 8; +++ inputVal2 = 
_mm256_load_ps(inputVectorPtr); +++ inputVectorPtr += 8; +++ +++ // Scale and clip +++ ret1 = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal1, vScalar), vmax_val), +++ vmin_val); +++ ret2 = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal2, vScalar), vmax_val), +++ vmin_val); +++ +++ intInputVal1 = _mm256_cvtps_epi32(ret1); +++ intInputVal2 = _mm256_cvtps_epi32(ret2); +++ +++ intInputVal1 = _mm256_packs_epi32(intInputVal1, intInputVal2); +++ intInputVal1 = _mm256_permute4x64_epi64(intInputVal1, 0b11011000); +++ +++ _mm256_store_si256((__m256i*)outputVectorPtr, intInputVal1); +++ outputVectorPtr += 16; +++ } +++ +++ number = sixteenthPoints * 16; +++ for (; number < num_points; number++) { +++ r = inputVector[number] * scalar; +++ if (r > max_val) +++ r = max_val; +++ else if (r < min_val) +++ r = min_val; +++ outputVector[number] = (int16_t)rintf(r); +++ } ++ } ++ #endif /* LV_HAVE_AVX2 */ ++ ++@@ -384,108 +401,114 @@ volk_32f_s32f_convert_16i_a_avx2(int16_t* outputVector, const float* inputVector ++ #ifdef LV_HAVE_AVX ++ #include ++ ++-static inline void ++-volk_32f_s32f_convert_16i_a_avx(int16_t* outputVector, const float* inputVector, ++- const float scalar, unsigned int num_points) +++static inline void volk_32f_s32f_convert_16i_a_avx(int16_t* outputVector, +++ const float* inputVector, +++ const float scalar, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; +++ unsigned int number = 0; ++ ++- const unsigned int eighthPoints = num_points / 8; +++ const unsigned int eighthPoints = num_points / 8; ++ ++- const float* inputVectorPtr = (const float*)inputVector; ++- int16_t* outputVectorPtr = outputVector; +++ const float* inputVectorPtr = (const float*)inputVector; +++ int16_t* outputVectorPtr = outputVector; ++ ++- float min_val = SHRT_MIN; ++- float max_val = SHRT_MAX; ++- float r; +++ float min_val = SHRT_MIN; +++ float max_val = SHRT_MAX; +++ float r; ++ ++- __m256 vScalar = _mm256_set1_ps(scalar); ++- __m256 inputVal, ret; ++- __m256i intInputVal; ++- __m128i intInputVal1, intInputVal2; ++- __m256 vmin_val = _mm256_set1_ps(min_val); ++- __m256 vmax_val = _mm256_set1_ps(max_val); +++ __m256 vScalar = _mm256_set1_ps(scalar); +++ __m256 inputVal, ret; +++ __m256i intInputVal; +++ __m128i intInputVal1, intInputVal2; +++ __m256 vmin_val = _mm256_set1_ps(min_val); +++ __m256 vmax_val = _mm256_set1_ps(max_val); ++ ++- for(;number < eighthPoints; number++){ ++- inputVal = _mm256_load_ps(inputVectorPtr); inputVectorPtr += 8; +++ for (; number < eighthPoints; number++) { +++ inputVal = _mm256_load_ps(inputVectorPtr); +++ inputVectorPtr += 8; ++ ++- // Scale and clip ++- ret = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal, vScalar), vmax_val), vmin_val); +++ // Scale and clip +++ ret = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal, vScalar), vmax_val), +++ vmin_val); ++ ++- intInputVal = _mm256_cvtps_epi32(ret); +++ intInputVal = _mm256_cvtps_epi32(ret); ++ ++- intInputVal1 = _mm256_extractf128_si256(intInputVal, 0); ++- intInputVal2 = _mm256_extractf128_si256(intInputVal, 1); +++ intInputVal1 = _mm256_extractf128_si256(intInputVal, 0); +++ intInputVal2 = _mm256_extractf128_si256(intInputVal, 1); ++ ++- intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); +++ intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); ++ ++- _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1); ++- outputVectorPtr += 8; ++- } +++ _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1); +++ outputVectorPtr += 8; +++ } ++ ++- number = eighthPoints * 8; ++- for(; number < 
num_points; number++){ ++- r = inputVector[number] * scalar; ++- if(r > max_val) ++- r = max_val; ++- else if(r < min_val) ++- r = min_val; ++- outputVector[number] = (int16_t)rintf(r); ++- } +++ number = eighthPoints * 8; +++ for (; number < num_points; number++) { +++ r = inputVector[number] * scalar; +++ if (r > max_val) +++ r = max_val; +++ else if (r < min_val) +++ r = min_val; +++ outputVector[number] = (int16_t)rintf(r); +++ } ++ } ++ #endif /* LV_HAVE_AVX */ ++ ++ #ifdef LV_HAVE_SSE2 ++ #include ++ ++-static inline void ++-volk_32f_s32f_convert_16i_a_sse2(int16_t* outputVector, const float* inputVector, ++- const float scalar, unsigned int num_points) +++static inline void volk_32f_s32f_convert_16i_a_sse2(int16_t* outputVector, +++ const float* inputVector, +++ const float scalar, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- ++- const unsigned int eighthPoints = num_points / 8; ++- ++- const float* inputVectorPtr = (const float*)inputVector; ++- int16_t* outputVectorPtr = outputVector; ++- ++- float min_val = SHRT_MIN; ++- float max_val = SHRT_MAX; ++- float r; ++- ++- __m128 vScalar = _mm_set_ps1(scalar); ++- __m128 inputVal1, inputVal2; ++- __m128i intInputVal1, intInputVal2; ++- __m128 ret1, ret2; ++- __m128 vmin_val = _mm_set_ps1(min_val); ++- __m128 vmax_val = _mm_set_ps1(max_val); ++- ++- for(;number < eighthPoints; number++){ ++- inputVal1 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; ++- inputVal2 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; ++- ++- // Scale and clip ++- ret1 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val); ++- ret2 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal2, vScalar), vmax_val), vmin_val); ++- ++- intInputVal1 = _mm_cvtps_epi32(ret1); ++- intInputVal2 = _mm_cvtps_epi32(ret2); ++- ++- intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); ++- ++- _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1); ++- outputVectorPtr += 8; ++- } ++- ++- number = eighthPoints * 8; ++- for(; number < num_points; number++){ ++- r = inputVector[number] * scalar; ++- if(r > max_val) ++- r = max_val; ++- else if(r < min_val) ++- r = min_val; ++- outputVector[number] = (int16_t)rintf(r); ++- } +++ unsigned int number = 0; +++ +++ const unsigned int eighthPoints = num_points / 8; +++ +++ const float* inputVectorPtr = (const float*)inputVector; +++ int16_t* outputVectorPtr = outputVector; +++ +++ float min_val = SHRT_MIN; +++ float max_val = SHRT_MAX; +++ float r; +++ +++ __m128 vScalar = _mm_set_ps1(scalar); +++ __m128 inputVal1, inputVal2; +++ __m128i intInputVal1, intInputVal2; +++ __m128 ret1, ret2; +++ __m128 vmin_val = _mm_set_ps1(min_val); +++ __m128 vmax_val = _mm_set_ps1(max_val); +++ +++ for (; number < eighthPoints; number++) { +++ inputVal1 = _mm_load_ps(inputVectorPtr); +++ inputVectorPtr += 4; +++ inputVal2 = _mm_load_ps(inputVectorPtr); +++ inputVectorPtr += 4; +++ +++ // Scale and clip +++ ret1 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val); +++ ret2 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal2, vScalar), vmax_val), vmin_val); +++ +++ intInputVal1 = _mm_cvtps_epi32(ret1); +++ intInputVal2 = _mm_cvtps_epi32(ret2); +++ +++ intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); +++ +++ _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1); +++ outputVectorPtr += 8; +++ } +++ +++ number = eighthPoints * 8; +++ for (; number < num_points; number++) { +++ r = inputVector[number] * scalar; +++ if (r > max_val) +++ r = max_val; +++ else if (r < min_val) +++ r = 
min_val; +++ outputVector[number] = (int16_t)rintf(r); +++ } ++ } ++ #endif /* LV_HAVE_SSE2 */ ++ ++@@ -493,76 +516,78 @@ volk_32f_s32f_convert_16i_a_sse2(int16_t* outputVector, const float* inputVector ++ #ifdef LV_HAVE_SSE ++ #include ++ ++-static inline void ++-volk_32f_s32f_convert_16i_a_sse(int16_t* outputVector, const float* inputVector, ++- const float scalar, unsigned int num_points) +++static inline void volk_32f_s32f_convert_16i_a_sse(int16_t* outputVector, +++ const float* inputVector, +++ const float scalar, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- ++- const unsigned int quarterPoints = num_points / 4; ++- ++- const float* inputVectorPtr = (const float*)inputVector; ++- int16_t* outputVectorPtr = outputVector; ++- ++- float min_val = SHRT_MIN; ++- float max_val = SHRT_MAX; ++- float r; ++- ++- __m128 vScalar = _mm_set_ps1(scalar); ++- __m128 ret; ++- __m128 vmin_val = _mm_set_ps1(min_val); ++- __m128 vmax_val = _mm_set_ps1(max_val); ++- ++- __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4]; ++- ++- for(;number < quarterPoints; number++){ ++- ret = _mm_load_ps(inputVectorPtr); ++- inputVectorPtr += 4; ++- ++- // Scale and clip ++- ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val); ++- ++- _mm_store_ps(outputFloatBuffer, ret); ++- *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[0]); ++- *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[1]); ++- *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[2]); ++- *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[3]); ++- } ++- ++- number = quarterPoints * 4; ++- for(; number < num_points; number++){ ++- r = inputVector[number] * scalar; ++- if(r > max_val) ++- r = max_val; ++- else if(r < min_val) ++- r = min_val; ++- outputVector[number] = (int16_t)rintf(r); ++- } +++ unsigned int number = 0; +++ +++ const unsigned int quarterPoints = num_points / 4; +++ +++ const float* inputVectorPtr = (const float*)inputVector; +++ int16_t* outputVectorPtr = outputVector; +++ +++ float min_val = SHRT_MIN; +++ float max_val = SHRT_MAX; +++ float r; +++ +++ __m128 vScalar = _mm_set_ps1(scalar); +++ __m128 ret; +++ __m128 vmin_val = _mm_set_ps1(min_val); +++ __m128 vmax_val = _mm_set_ps1(max_val); +++ +++ __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4]; +++ +++ for (; number < quarterPoints; number++) { +++ ret = _mm_load_ps(inputVectorPtr); +++ inputVectorPtr += 4; +++ +++ // Scale and clip +++ ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val); +++ +++ _mm_store_ps(outputFloatBuffer, ret); +++ *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[0]); +++ *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[1]); +++ *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[2]); +++ *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[3]); +++ } +++ +++ number = quarterPoints * 4; +++ for (; number < num_points; number++) { +++ r = inputVector[number] * scalar; +++ if (r > max_val) +++ r = max_val; +++ else if (r < min_val) +++ r = min_val; +++ outputVector[number] = (int16_t)rintf(r); +++ } ++ } ++ #endif /* LV_HAVE_SSE */ ++ ++ ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void ++-volk_32f_s32f_convert_16i_a_generic(int16_t* outputVector, const float* inputVector, ++- const float scalar, unsigned int num_points) +++static inline void volk_32f_s32f_convert_16i_a_generic(int16_t* outputVector, +++ const float* inputVector, +++ const float scalar, +++ unsigned int num_points) ++ { ++- int16_t* outputVectorPtr = outputVector; ++- const float* inputVectorPtr = 
inputVector; ++- unsigned int number = 0; ++- float min_val = SHRT_MIN; ++- float max_val = SHRT_MAX; ++- float r; ++- ++- for(number = 0; number < num_points; number++){ ++- r = *inputVectorPtr++ * scalar; ++- if(r < min_val) ++- r = min_val; ++- else if(r > max_val) ++- r = max_val; ++- *outputVectorPtr++ = (int16_t)rintf(r); ++- } +++ int16_t* outputVectorPtr = outputVector; +++ const float* inputVectorPtr = inputVector; +++ unsigned int number = 0; +++ float min_val = SHRT_MIN; +++ float max_val = SHRT_MAX; +++ float r; +++ +++ for (number = 0; number < num_points; number++) { +++ r = *inputVectorPtr++ * scalar; +++ if (r < min_val) +++ r = min_val; +++ else if (r > max_val) +++ r = max_val; +++ *outputVectorPtr++ = (int16_t)rintf(r); +++ } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++diff --git a/kernels/volk/volk_32f_s32f_convert_32i.h b/kernels/volk/volk_32f_s32f_convert_32i.h ++index d2a65a0..d5f7cd4 100644 ++--- a/kernels/volk/volk_32f_s32f_convert_32i.h +++++ b/kernels/volk/volk_32f_s32f_convert_32i.h ++@@ -30,8 +30,8 @@ ++ * ++ * Dispatcher Prototype ++ * \code ++- * void volk_32f_s32f_convert_32i(int32_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points) ++- * \endcode +++ * void volk_32f_s32f_convert_32i(int32_t* outputVector, const float* inputVector, const +++ * float scalar, unsigned int num_points) \endcode ++ * ++ * \b Inputs ++ * \li inputVector: the input vector of floats. ++@@ -77,46 +77,49 @@ ++ #ifdef LV_HAVE_AVX ++ #include ++ ++-static inline void ++-volk_32f_s32f_convert_32i_u_avx(int32_t* outputVector, const float* inputVector, ++- const float scalar, unsigned int num_points) +++static inline void volk_32f_s32f_convert_32i_u_avx(int32_t* outputVector, +++ const float* inputVector, +++ const float scalar, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- ++- const unsigned int eighthPoints = num_points / 8; ++- ++- const float* inputVectorPtr = (const float*)inputVector; ++- int32_t* outputVectorPtr = outputVector; ++- ++- float min_val = INT_MIN; ++- float max_val = INT_MAX; ++- float r; ++- ++- __m256 vScalar = _mm256_set1_ps(scalar); ++- __m256 inputVal1; ++- __m256i intInputVal1; ++- __m256 vmin_val = _mm256_set1_ps(min_val); ++- __m256 vmax_val = _mm256_set1_ps(max_val); ++- ++- for(;number < eighthPoints; number++){ ++- inputVal1 = _mm256_loadu_ps(inputVectorPtr); inputVectorPtr += 8; ++- ++- inputVal1 = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal1, vScalar), vmax_val), vmin_val); ++- intInputVal1 = _mm256_cvtps_epi32(inputVal1); ++- ++- _mm256_storeu_si256((__m256i*)outputVectorPtr, intInputVal1); ++- outputVectorPtr += 8; ++- } ++- ++- number = eighthPoints * 8; ++- for(; number < num_points; number++){ ++- r = inputVector[number] * scalar; ++- if(r > max_val) ++- r = max_val; ++- else if(r < min_val) ++- r = min_val; ++- outputVector[number] = (int32_t)rintf(r); ++- } +++ unsigned int number = 0; +++ +++ const unsigned int eighthPoints = num_points / 8; +++ +++ const float* inputVectorPtr = (const float*)inputVector; +++ int32_t* outputVectorPtr = outputVector; +++ +++ float min_val = INT_MIN; +++ float max_val = INT_MAX; +++ float r; +++ +++ __m256 vScalar = _mm256_set1_ps(scalar); +++ __m256 inputVal1; +++ __m256i intInputVal1; +++ __m256 vmin_val = _mm256_set1_ps(min_val); +++ __m256 vmax_val = _mm256_set1_ps(max_val); +++ +++ for (; number < eighthPoints; number++) { +++ inputVal1 = _mm256_loadu_ps(inputVectorPtr); +++ inputVectorPtr += 8; +++ +++ inputVal1 = _mm256_max_ps( +++ 
_mm256_min_ps(_mm256_mul_ps(inputVal1, vScalar), vmax_val), vmin_val); +++ intInputVal1 = _mm256_cvtps_epi32(inputVal1); +++ +++ _mm256_storeu_si256((__m256i*)outputVectorPtr, intInputVal1); +++ outputVectorPtr += 8; +++ } +++ +++ number = eighthPoints * 8; +++ for (; number < num_points; number++) { +++ r = inputVector[number] * scalar; +++ if (r > max_val) +++ r = max_val; +++ else if (r < min_val) +++ r = min_val; +++ outputVector[number] = (int32_t)rintf(r); +++ } ++ } ++ ++ #endif /* LV_HAVE_AVX */ ++@@ -124,46 +127,49 @@ volk_32f_s32f_convert_32i_u_avx(int32_t* outputVector, const float* inputVector, ++ #ifdef LV_HAVE_SSE2 ++ #include ++ ++-static inline void ++-volk_32f_s32f_convert_32i_u_sse2(int32_t* outputVector, const float* inputVector, ++- const float scalar, unsigned int num_points) +++static inline void volk_32f_s32f_convert_32i_u_sse2(int32_t* outputVector, +++ const float* inputVector, +++ const float scalar, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- ++- const unsigned int quarterPoints = num_points / 4; ++- ++- const float* inputVectorPtr = (const float*)inputVector; ++- int32_t* outputVectorPtr = outputVector; ++- ++- float min_val = INT_MIN; ++- float max_val = INT_MAX; ++- float r; ++- ++- __m128 vScalar = _mm_set_ps1(scalar); ++- __m128 inputVal1; ++- __m128i intInputVal1; ++- __m128 vmin_val = _mm_set_ps1(min_val); ++- __m128 vmax_val = _mm_set_ps1(max_val); ++- ++- for(;number < quarterPoints; number++){ ++- inputVal1 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4; ++- ++- inputVal1 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val); ++- intInputVal1 = _mm_cvtps_epi32(inputVal1); ++- ++- _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1); ++- outputVectorPtr += 4; ++- } ++- ++- number = quarterPoints * 4; ++- for(; number < num_points; number++){ ++- r = inputVector[number] * scalar; ++- if(r > max_val) ++- r = max_val; ++- else if(r < min_val) ++- r = min_val; ++- outputVector[number] = (int32_t)rintf(r); ++- } +++ unsigned int number = 0; +++ +++ const unsigned int quarterPoints = num_points / 4; +++ +++ const float* inputVectorPtr = (const float*)inputVector; +++ int32_t* outputVectorPtr = outputVector; +++ +++ float min_val = INT_MIN; +++ float max_val = INT_MAX; +++ float r; +++ +++ __m128 vScalar = _mm_set_ps1(scalar); +++ __m128 inputVal1; +++ __m128i intInputVal1; +++ __m128 vmin_val = _mm_set_ps1(min_val); +++ __m128 vmax_val = _mm_set_ps1(max_val); +++ +++ for (; number < quarterPoints; number++) { +++ inputVal1 = _mm_loadu_ps(inputVectorPtr); +++ inputVectorPtr += 4; +++ +++ inputVal1 = +++ _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val); +++ intInputVal1 = _mm_cvtps_epi32(inputVal1); +++ +++ _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1); +++ outputVectorPtr += 4; +++ } +++ +++ number = quarterPoints * 4; +++ for (; number < num_points; number++) { +++ r = inputVector[number] * scalar; +++ if (r > max_val) +++ r = max_val; +++ else if (r < min_val) +++ r = min_val; +++ outputVector[number] = (int32_t)rintf(r); +++ } ++ } ++ ++ #endif /* LV_HAVE_SSE2 */ ++@@ -172,50 +178,51 @@ volk_32f_s32f_convert_32i_u_sse2(int32_t* outputVector, const float* inputVector ++ #ifdef LV_HAVE_SSE ++ #include ++ ++-static inline void ++-volk_32f_s32f_convert_32i_u_sse(int32_t* outputVector, const float* inputVector, ++- const float scalar, unsigned int num_points) +++static inline void volk_32f_s32f_convert_32i_u_sse(int32_t* outputVector, +++ const float* inputVector, +++ 
const float scalar, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- ++- const unsigned int quarterPoints = num_points / 4; ++- ++- const float* inputVectorPtr = (const float*)inputVector; ++- int32_t* outputVectorPtr = outputVector; ++- ++- float min_val = INT_MIN; ++- float max_val = INT_MAX; ++- float r; ++- ++- __m128 vScalar = _mm_set_ps1(scalar); ++- __m128 ret; ++- __m128 vmin_val = _mm_set_ps1(min_val); ++- __m128 vmax_val = _mm_set_ps1(max_val); ++- ++- __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4]; ++- ++- for(;number < quarterPoints; number++){ ++- ret = _mm_loadu_ps(inputVectorPtr); ++- inputVectorPtr += 4; ++- ++- ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val); ++- ++- _mm_store_ps(outputFloatBuffer, ret); ++- *outputVectorPtr++ = (int32_t)rintf(outputFloatBuffer[0]); ++- *outputVectorPtr++ = (int32_t)rintf(outputFloatBuffer[1]); ++- *outputVectorPtr++ = (int32_t)rintf(outputFloatBuffer[2]); ++- *outputVectorPtr++ = (int32_t)rintf(outputFloatBuffer[3]); ++- } ++- ++- number = quarterPoints * 4; ++- for(; number < num_points; number++){ ++- r = inputVector[number] * scalar; ++- if(r > max_val) ++- r = max_val; ++- else if(r < min_val) ++- r = min_val; ++- outputVector[number] = (int32_t)rintf(r); ++- } +++ unsigned int number = 0; +++ +++ const unsigned int quarterPoints = num_points / 4; +++ +++ const float* inputVectorPtr = (const float*)inputVector; +++ int32_t* outputVectorPtr = outputVector; +++ +++ float min_val = INT_MIN; +++ float max_val = INT_MAX; +++ float r; +++ +++ __m128 vScalar = _mm_set_ps1(scalar); +++ __m128 ret; +++ __m128 vmin_val = _mm_set_ps1(min_val); +++ __m128 vmax_val = _mm_set_ps1(max_val); +++ +++ __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4]; +++ +++ for (; number < quarterPoints; number++) { +++ ret = _mm_loadu_ps(inputVectorPtr); +++ inputVectorPtr += 4; +++ +++ ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val); +++ +++ _mm_store_ps(outputFloatBuffer, ret); +++ *outputVectorPtr++ = (int32_t)rintf(outputFloatBuffer[0]); +++ *outputVectorPtr++ = (int32_t)rintf(outputFloatBuffer[1]); +++ *outputVectorPtr++ = (int32_t)rintf(outputFloatBuffer[2]); +++ *outputVectorPtr++ = (int32_t)rintf(outputFloatBuffer[3]); +++ } +++ +++ number = quarterPoints * 4; +++ for (; number < num_points; number++) { +++ r = inputVector[number] * scalar; +++ if (r > max_val) +++ r = max_val; +++ else if (r < min_val) +++ r = min_val; +++ outputVector[number] = (int32_t)rintf(r); +++ } ++ } ++ ++ #endif /* LV_HAVE_SSE */ ++@@ -223,82 +230,85 @@ volk_32f_s32f_convert_32i_u_sse(int32_t* outputVector, const float* inputVector, ++ ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void ++-volk_32f_s32f_convert_32i_generic(int32_t* outputVector, const float* inputVector, ++- const float scalar, unsigned int num_points) +++static inline void volk_32f_s32f_convert_32i_generic(int32_t* outputVector, +++ const float* inputVector, +++ const float scalar, +++ unsigned int num_points) ++ { ++- int32_t* outputVectorPtr = outputVector; ++- const float* inputVectorPtr = inputVector; ++- unsigned int number = 0; ++- float min_val = INT_MIN; ++- float max_val = INT_MAX; ++- float r; ++- ++- for(number = 0; number < num_points; number++){ ++- r = *inputVectorPtr++ * scalar; ++- if(r > max_val) ++- r = max_val; ++- else if(r < min_val) ++- r = min_val; ++- *outputVectorPtr++ = (int32_t)rintf(r); ++- } +++ int32_t* outputVectorPtr = outputVector; +++ const float* inputVectorPtr = inputVector; +++ unsigned int number = 0; 
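[Editorial note, not part of the upstream diff] For reference, a hypothetical usage sketch of the volk_32f_s32f_convert_32i dispatcher whose kernels this file reformats, following the volk_get_alignment() / volk_malloc() pattern already used by the doc-comment examples:

#include <volk/volk.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
    const unsigned int N = 8;
    unsigned int alignment = volk_get_alignment();
    float* in = (float*)volk_malloc(sizeof(float) * N, alignment);
    int32_t* out = (int32_t*)volk_malloc(sizeof(int32_t) * N, alignment);

    for (unsigned int i = 0; i < N; i++)
        in[i] = 2.f * ((float)i / (float)N) - 1.f; /* ramp from -1 toward 1 */

    /* Scale by 2^31 - 1 so full scale maps near the int32 range limits. */
    volk_32f_s32f_convert_32i(out, in, 2147483647.f, N);

    for (unsigned int i = 0; i < N; i++)
        printf("%+f -> %d\n", in[i], out[i]);

    volk_free(in);
    volk_free(out);
    return 0;
}
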
+++ float min_val = INT_MIN; +++ float max_val = INT_MAX; +++ float r; +++ +++ for (number = 0; number < num_points; number++) { +++ r = *inputVectorPtr++ * scalar; +++ if (r > max_val) +++ r = max_val; +++ else if (r < min_val) +++ r = min_val; +++ *outputVectorPtr++ = (int32_t)rintf(r); +++ } ++ } ++ ++ #endif /* LV_HAVE_GENERIC */ ++ ++ ++- ++ #endif /* INCLUDED_volk_32f_s32f_convert_32i_u_H */ ++ #ifndef INCLUDED_volk_32f_s32f_convert_32i_a_H ++ #define INCLUDED_volk_32f_s32f_convert_32i_a_H ++ ++-#include ++ #include ++ #include +++#include ++ ++ #ifdef LV_HAVE_AVX ++ #include ++ ++-static inline void ++-volk_32f_s32f_convert_32i_a_avx(int32_t* outputVector, const float* inputVector, ++- const float scalar, unsigned int num_points) +++static inline void volk_32f_s32f_convert_32i_a_avx(int32_t* outputVector, +++ const float* inputVector, +++ const float scalar, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- ++- const unsigned int eighthPoints = num_points / 8; ++- ++- const float* inputVectorPtr = (const float*)inputVector; ++- int32_t* outputVectorPtr = outputVector; ++- ++- float min_val = INT_MIN; ++- float max_val = INT_MAX; ++- float r; ++- ++- __m256 vScalar = _mm256_set1_ps(scalar); ++- __m256 inputVal1; ++- __m256i intInputVal1; ++- __m256 vmin_val = _mm256_set1_ps(min_val); ++- __m256 vmax_val = _mm256_set1_ps(max_val); ++- ++- for(;number < eighthPoints; number++){ ++- inputVal1 = _mm256_load_ps(inputVectorPtr); inputVectorPtr += 8; ++- ++- inputVal1 = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal1, vScalar), vmax_val), vmin_val); ++- intInputVal1 = _mm256_cvtps_epi32(inputVal1); ++- ++- _mm256_store_si256((__m256i*)outputVectorPtr, intInputVal1); ++- outputVectorPtr += 8; ++- } ++- ++- number = eighthPoints * 8; ++- for(; number < num_points; number++){ ++- r = inputVector[number] * scalar; ++- if(r > max_val) ++- r = max_val; ++- else if(r < min_val) ++- r = min_val; ++- outputVector[number] = (int32_t)rintf(r); ++- } +++ unsigned int number = 0; +++ +++ const unsigned int eighthPoints = num_points / 8; +++ +++ const float* inputVectorPtr = (const float*)inputVector; +++ int32_t* outputVectorPtr = outputVector; +++ +++ float min_val = INT_MIN; +++ float max_val = INT_MAX; +++ float r; +++ +++ __m256 vScalar = _mm256_set1_ps(scalar); +++ __m256 inputVal1; +++ __m256i intInputVal1; +++ __m256 vmin_val = _mm256_set1_ps(min_val); +++ __m256 vmax_val = _mm256_set1_ps(max_val); +++ +++ for (; number < eighthPoints; number++) { +++ inputVal1 = _mm256_load_ps(inputVectorPtr); +++ inputVectorPtr += 8; +++ +++ inputVal1 = _mm256_max_ps( +++ _mm256_min_ps(_mm256_mul_ps(inputVal1, vScalar), vmax_val), vmin_val); +++ intInputVal1 = _mm256_cvtps_epi32(inputVal1); +++ +++ _mm256_store_si256((__m256i*)outputVectorPtr, intInputVal1); +++ outputVectorPtr += 8; +++ } +++ +++ number = eighthPoints * 8; +++ for (; number < num_points; number++) { +++ r = inputVector[number] * scalar; +++ if (r > max_val) +++ r = max_val; +++ else if (r < min_val) +++ r = min_val; +++ outputVector[number] = (int32_t)rintf(r); +++ } ++ } ++ ++ #endif /* LV_HAVE_AVX */ ++@@ -307,46 +317,49 @@ volk_32f_s32f_convert_32i_a_avx(int32_t* outputVector, const float* inputVector, ++ #ifdef LV_HAVE_SSE2 ++ #include ++ ++-static inline void ++-volk_32f_s32f_convert_32i_a_sse2(int32_t* outputVector, const float* inputVector, ++- const float scalar, unsigned int num_points) +++static inline void volk_32f_s32f_convert_32i_a_sse2(int32_t* outputVector, +++ const float* inputVector, +++ const float 
scalar, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- ++- const unsigned int quarterPoints = num_points / 4; ++- ++- const float* inputVectorPtr = (const float*)inputVector; ++- int32_t* outputVectorPtr = outputVector; ++- ++- float min_val = INT_MIN; ++- float max_val = INT_MAX; ++- float r; ++- ++- __m128 vScalar = _mm_set_ps1(scalar); ++- __m128 inputVal1; ++- __m128i intInputVal1; ++- __m128 vmin_val = _mm_set_ps1(min_val); ++- __m128 vmax_val = _mm_set_ps1(max_val); ++- ++- for(;number < quarterPoints; number++){ ++- inputVal1 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; ++- ++- inputVal1 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val); ++- intInputVal1 = _mm_cvtps_epi32(inputVal1); ++- ++- _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1); ++- outputVectorPtr += 4; ++- } ++- ++- number = quarterPoints * 4; ++- for(; number < num_points; number++){ ++- r = inputVector[number] * scalar; ++- if(r > max_val) ++- r = max_val; ++- else if(r < min_val) ++- r = min_val; ++- outputVector[number] = (int32_t)rintf(r); ++- } +++ unsigned int number = 0; +++ +++ const unsigned int quarterPoints = num_points / 4; +++ +++ const float* inputVectorPtr = (const float*)inputVector; +++ int32_t* outputVectorPtr = outputVector; +++ +++ float min_val = INT_MIN; +++ float max_val = INT_MAX; +++ float r; +++ +++ __m128 vScalar = _mm_set_ps1(scalar); +++ __m128 inputVal1; +++ __m128i intInputVal1; +++ __m128 vmin_val = _mm_set_ps1(min_val); +++ __m128 vmax_val = _mm_set_ps1(max_val); +++ +++ for (; number < quarterPoints; number++) { +++ inputVal1 = _mm_load_ps(inputVectorPtr); +++ inputVectorPtr += 4; +++ +++ inputVal1 = +++ _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val); +++ intInputVal1 = _mm_cvtps_epi32(inputVal1); +++ +++ _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1); +++ outputVectorPtr += 4; +++ } +++ +++ number = quarterPoints * 4; +++ for (; number < num_points; number++) { +++ r = inputVector[number] * scalar; +++ if (r > max_val) +++ r = max_val; +++ else if (r < min_val) +++ r = min_val; +++ outputVector[number] = (int32_t)rintf(r); +++ } ++ } ++ ++ #endif /* LV_HAVE_SSE2 */ ++@@ -355,50 +368,51 @@ volk_32f_s32f_convert_32i_a_sse2(int32_t* outputVector, const float* inputVector ++ #ifdef LV_HAVE_SSE ++ #include ++ ++-static inline void ++-volk_32f_s32f_convert_32i_a_sse(int32_t* outputVector, const float* inputVector, ++- const float scalar, unsigned int num_points) +++static inline void volk_32f_s32f_convert_32i_a_sse(int32_t* outputVector, +++ const float* inputVector, +++ const float scalar, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- ++- const unsigned int quarterPoints = num_points / 4; ++- ++- const float* inputVectorPtr = (const float*)inputVector; ++- int32_t* outputVectorPtr = outputVector; ++- ++- float min_val = INT_MIN; ++- float max_val = INT_MAX; ++- float r; ++- ++- __m128 vScalar = _mm_set_ps1(scalar); ++- __m128 ret; ++- __m128 vmin_val = _mm_set_ps1(min_val); ++- __m128 vmax_val = _mm_set_ps1(max_val); ++- ++- __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4]; ++- ++- for(;number < quarterPoints; number++){ ++- ret = _mm_load_ps(inputVectorPtr); ++- inputVectorPtr += 4; ++- ++- ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val); ++- ++- _mm_store_ps(outputFloatBuffer, ret); ++- *outputVectorPtr++ = (int32_t)rintf(outputFloatBuffer[0]); ++- *outputVectorPtr++ = (int32_t)rintf(outputFloatBuffer[1]); ++- *outputVectorPtr++ = 
(int32_t)rintf(outputFloatBuffer[2]); ++- *outputVectorPtr++ = (int32_t)rintf(outputFloatBuffer[3]); ++- } ++- ++- number = quarterPoints * 4; ++- for(; number < num_points; number++){ ++- r = inputVector[number] * scalar; ++- if(r > max_val) ++- r = max_val; ++- else if(r < min_val) ++- r = min_val; ++- outputVector[number] = (int32_t)rintf(r); ++- } +++ unsigned int number = 0; +++ +++ const unsigned int quarterPoints = num_points / 4; +++ +++ const float* inputVectorPtr = (const float*)inputVector; +++ int32_t* outputVectorPtr = outputVector; +++ +++ float min_val = INT_MIN; +++ float max_val = INT_MAX; +++ float r; +++ +++ __m128 vScalar = _mm_set_ps1(scalar); +++ __m128 ret; +++ __m128 vmin_val = _mm_set_ps1(min_val); +++ __m128 vmax_val = _mm_set_ps1(max_val); +++ +++ __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4]; +++ +++ for (; number < quarterPoints; number++) { +++ ret = _mm_load_ps(inputVectorPtr); +++ inputVectorPtr += 4; +++ +++ ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val); +++ +++ _mm_store_ps(outputFloatBuffer, ret); +++ *outputVectorPtr++ = (int32_t)rintf(outputFloatBuffer[0]); +++ *outputVectorPtr++ = (int32_t)rintf(outputFloatBuffer[1]); +++ *outputVectorPtr++ = (int32_t)rintf(outputFloatBuffer[2]); +++ *outputVectorPtr++ = (int32_t)rintf(outputFloatBuffer[3]); +++ } +++ +++ number = quarterPoints * 4; +++ for (; number < num_points; number++) { +++ r = inputVector[number] * scalar; +++ if (r > max_val) +++ r = max_val; +++ else if (r < min_val) +++ r = min_val; +++ outputVector[number] = (int32_t)rintf(r); +++ } ++ } ++ ++ #endif /* LV_HAVE_SSE */ ++@@ -406,25 +420,26 @@ volk_32f_s32f_convert_32i_a_sse(int32_t* outputVector, const float* inputVector, ++ ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void ++-volk_32f_s32f_convert_32i_a_generic(int32_t* outputVector, const float* inputVector, ++- const float scalar, unsigned int num_points) +++static inline void volk_32f_s32f_convert_32i_a_generic(int32_t* outputVector, +++ const float* inputVector, +++ const float scalar, +++ unsigned int num_points) ++ { ++- int32_t* outputVectorPtr = outputVector; ++- const float* inputVectorPtr = inputVector; ++- unsigned int number = 0; ++- float min_val = INT_MIN; ++- float max_val = INT_MAX; ++- float r; ++- ++- for(number = 0; number < num_points; number++){ ++- r = *inputVectorPtr++ * scalar; ++- if(r > max_val) ++- r = max_val; ++- else if(r < min_val) ++- r = min_val; ++- *outputVectorPtr++ = (int32_t)rintf(r); ++- } +++ int32_t* outputVectorPtr = outputVector; +++ const float* inputVectorPtr = inputVector; +++ unsigned int number = 0; +++ float min_val = INT_MIN; +++ float max_val = INT_MAX; +++ float r; +++ +++ for (number = 0; number < num_points; number++) { +++ r = *inputVectorPtr++ * scalar; +++ if (r > max_val) +++ r = max_val; +++ else if (r < min_val) +++ r = min_val; +++ *outputVectorPtr++ = (int32_t)rintf(r); +++ } ++ } ++ ++ #endif /* LV_HAVE_GENERIC */ ++diff --git a/kernels/volk/volk_32f_s32f_convert_8i.h b/kernels/volk/volk_32f_s32f_convert_8i.h ++index 2a1669c..242c3bd 100644 ++--- a/kernels/volk/volk_32f_s32f_convert_8i.h +++++ b/kernels/volk/volk_32f_s32f_convert_8i.h ++@@ -30,7 +30,8 @@ ++ * ++ * Dispatcher Prototype ++ * \code ++- * void volk_32f_s32f_convert_8i(int8_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points) +++ * void volk_32f_s32f_convert_8i(int8_t* outputVector, const float* inputVector, const +++ float scalar, unsigned int num_points) ++ * \endcode ++ * ++ * \b Inputs ++@@ 
-42,7 +43,8 @@ ++ * \li outputVector: The output vector. ++ * ++ * \b Example ++- * Convert floats from [-1,1] to 16-bit integers with a scale of 5 to maintain smallest delta +++ * Convert floats from [-1,1] to 16-bit integers with a scale of 5 to maintain smallest +++ delta ++ * int N = 10; ++ * unsigned int alignment = volk_get_alignment(); ++ * float* increasing = (float*)volk_malloc(sizeof(float)*N, alignment); ++@@ -74,77 +76,86 @@ ++ #include ++ #include ++ ++-static inline void ++-volk_32f_s32f_convert_8i_single(int8_t* out, const float in){ ++- float min_val = CHAR_MIN; ++- float max_val = CHAR_MAX; ++- if(in > max_val){ ++- *out = (int8_t)(max_val); ++- }else if(in < min_val){ ++- *out = (int8_t)(min_val); ++- }else{ ++- *out = (int8_t)(rintf(in)); ++- } +++static inline void volk_32f_s32f_convert_8i_single(int8_t* out, const float in) +++{ +++ float min_val = CHAR_MIN; +++ float max_val = CHAR_MAX; +++ if (in > max_val) { +++ *out = (int8_t)(max_val); +++ } else if (in < min_val) { +++ *out = (int8_t)(min_val); +++ } else { +++ *out = (int8_t)(rintf(in)); +++ } ++ } ++ ++ #ifdef LV_HAVE_AVX2 ++ #include ++ ++-static inline void ++-volk_32f_s32f_convert_8i_u_avx2(int8_t* outputVector, const float* inputVector, ++- const float scalar, unsigned int num_points) +++static inline void volk_32f_s32f_convert_8i_u_avx2(int8_t* outputVector, +++ const float* inputVector, +++ const float scalar, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- ++- const unsigned int thirtysecondPoints = num_points / 32; ++- ++- const float* inputVectorPtr = (const float*)inputVector; ++- int8_t* outputVectorPtr = outputVector; ++- ++- float min_val = CHAR_MIN; ++- float max_val = CHAR_MAX; ++- float r; ++- ++- __m256 vScalar = _mm256_set1_ps(scalar); ++- __m256 inputVal1, inputVal2, inputVal3, inputVal4; ++- __m256i intInputVal1, intInputVal2, intInputVal3, intInputVal4; ++- __m256 vmin_val = _mm256_set1_ps(min_val); ++- __m256 vmax_val = _mm256_set1_ps(max_val); ++- __m256i intInputVal; ++- ++- for(;number < thirtysecondPoints; number++){ ++- inputVal1 = _mm256_loadu_ps(inputVectorPtr); inputVectorPtr += 8; ++- inputVal2 = _mm256_loadu_ps(inputVectorPtr); inputVectorPtr += 8; ++- inputVal3 = _mm256_loadu_ps(inputVectorPtr); inputVectorPtr += 8; ++- inputVal4 = _mm256_loadu_ps(inputVectorPtr); inputVectorPtr += 8; ++- ++- inputVal1 = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal1, vScalar), vmax_val), vmin_val); ++- inputVal2 = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal2, vScalar), vmax_val), vmin_val); ++- inputVal3 = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal3, vScalar), vmax_val), vmin_val); ++- inputVal4 = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal4, vScalar), vmax_val), vmin_val); ++- ++- intInputVal1 = _mm256_cvtps_epi32(inputVal1); ++- intInputVal2 = _mm256_cvtps_epi32(inputVal2); ++- intInputVal3 = _mm256_cvtps_epi32(inputVal3); ++- intInputVal4 = _mm256_cvtps_epi32(inputVal4); ++- ++- intInputVal1 = _mm256_packs_epi32(intInputVal1, intInputVal2); ++- intInputVal1 = _mm256_permute4x64_epi64(intInputVal1, 0b11011000); ++- intInputVal3 = _mm256_packs_epi32(intInputVal3, intInputVal4); ++- intInputVal3 = _mm256_permute4x64_epi64(intInputVal3, 0b11011000); ++- ++- intInputVal1 = _mm256_packs_epi16(intInputVal1, intInputVal3); ++- intInputVal = _mm256_permute4x64_epi64(intInputVal1, 0b11011000); ++- ++- _mm256_storeu_si256((__m256i*)outputVectorPtr, intInputVal); ++- outputVectorPtr += 32; ++- } ++- ++- number = thirtysecondPoints * 32; ++- for(; number < 
num_points; number++){ ++- r = inputVector[number] * scalar; ++- volk_32f_s32f_convert_8i_single(&outputVector[number], r); ++- } +++ unsigned int number = 0; +++ +++ const unsigned int thirtysecondPoints = num_points / 32; +++ +++ const float* inputVectorPtr = (const float*)inputVector; +++ int8_t* outputVectorPtr = outputVector; +++ +++ float min_val = CHAR_MIN; +++ float max_val = CHAR_MAX; +++ float r; +++ +++ __m256 vScalar = _mm256_set1_ps(scalar); +++ __m256 inputVal1, inputVal2, inputVal3, inputVal4; +++ __m256i intInputVal1, intInputVal2, intInputVal3, intInputVal4; +++ __m256 vmin_val = _mm256_set1_ps(min_val); +++ __m256 vmax_val = _mm256_set1_ps(max_val); +++ __m256i intInputVal; +++ +++ for (; number < thirtysecondPoints; number++) { +++ inputVal1 = _mm256_loadu_ps(inputVectorPtr); +++ inputVectorPtr += 8; +++ inputVal2 = _mm256_loadu_ps(inputVectorPtr); +++ inputVectorPtr += 8; +++ inputVal3 = _mm256_loadu_ps(inputVectorPtr); +++ inputVectorPtr += 8; +++ inputVal4 = _mm256_loadu_ps(inputVectorPtr); +++ inputVectorPtr += 8; +++ +++ inputVal1 = _mm256_max_ps( +++ _mm256_min_ps(_mm256_mul_ps(inputVal1, vScalar), vmax_val), vmin_val); +++ inputVal2 = _mm256_max_ps( +++ _mm256_min_ps(_mm256_mul_ps(inputVal2, vScalar), vmax_val), vmin_val); +++ inputVal3 = _mm256_max_ps( +++ _mm256_min_ps(_mm256_mul_ps(inputVal3, vScalar), vmax_val), vmin_val); +++ inputVal4 = _mm256_max_ps( +++ _mm256_min_ps(_mm256_mul_ps(inputVal4, vScalar), vmax_val), vmin_val); +++ +++ intInputVal1 = _mm256_cvtps_epi32(inputVal1); +++ intInputVal2 = _mm256_cvtps_epi32(inputVal2); +++ intInputVal3 = _mm256_cvtps_epi32(inputVal3); +++ intInputVal4 = _mm256_cvtps_epi32(inputVal4); +++ +++ intInputVal1 = _mm256_packs_epi32(intInputVal1, intInputVal2); +++ intInputVal1 = _mm256_permute4x64_epi64(intInputVal1, 0b11011000); +++ intInputVal3 = _mm256_packs_epi32(intInputVal3, intInputVal4); +++ intInputVal3 = _mm256_permute4x64_epi64(intInputVal3, 0b11011000); +++ +++ intInputVal1 = _mm256_packs_epi16(intInputVal1, intInputVal3); +++ intInputVal = _mm256_permute4x64_epi64(intInputVal1, 0b11011000); +++ +++ _mm256_storeu_si256((__m256i*)outputVectorPtr, intInputVal); +++ outputVectorPtr += 32; +++ } +++ +++ number = thirtysecondPoints * 32; +++ for (; number < num_points; number++) { +++ r = inputVector[number] * scalar; +++ volk_32f_s32f_convert_8i_single(&outputVector[number], r); +++ } ++ } ++ ++ #endif /* LV_HAVE_AVX2 */ ++@@ -153,57 +164,66 @@ volk_32f_s32f_convert_8i_u_avx2(int8_t* outputVector, const float* inputVector, ++ #ifdef LV_HAVE_SSE2 ++ #include ++ ++-static inline void ++-volk_32f_s32f_convert_8i_u_sse2(int8_t* outputVector, const float* inputVector, ++- const float scalar, unsigned int num_points) +++static inline void volk_32f_s32f_convert_8i_u_sse2(int8_t* outputVector, +++ const float* inputVector, +++ const float scalar, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- ++- const unsigned int sixteenthPoints = num_points / 16; ++- ++- const float* inputVectorPtr = (const float*)inputVector; ++- int8_t* outputVectorPtr = outputVector; ++- ++- float min_val = CHAR_MIN; ++- float max_val = CHAR_MAX; ++- float r; ++- ++- __m128 vScalar = _mm_set_ps1(scalar); ++- __m128 inputVal1, inputVal2, inputVal3, inputVal4; ++- __m128i intInputVal1, intInputVal2, intInputVal3, intInputVal4; ++- __m128 vmin_val = _mm_set_ps1(min_val); ++- __m128 vmax_val = _mm_set_ps1(max_val); ++- ++- for(;number < sixteenthPoints; number++){ ++- inputVal1 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4; ++- 
inputVal2 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4; ++- inputVal3 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4; ++- inputVal4 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4; ++- ++- inputVal1 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val); ++- inputVal2 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal2, vScalar), vmax_val), vmin_val); ++- inputVal3 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal3, vScalar), vmax_val), vmin_val); ++- inputVal4 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal4, vScalar), vmax_val), vmin_val); ++- ++- intInputVal1 = _mm_cvtps_epi32(inputVal1); ++- intInputVal2 = _mm_cvtps_epi32(inputVal2); ++- intInputVal3 = _mm_cvtps_epi32(inputVal3); ++- intInputVal4 = _mm_cvtps_epi32(inputVal4); ++- ++- intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); ++- intInputVal3 = _mm_packs_epi32(intInputVal3, intInputVal4); ++- ++- intInputVal1 = _mm_packs_epi16(intInputVal1, intInputVal3); ++- ++- _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1); ++- outputVectorPtr += 16; ++- } +++ unsigned int number = 0; +++ +++ const unsigned int sixteenthPoints = num_points / 16; +++ +++ const float* inputVectorPtr = (const float*)inputVector; +++ int8_t* outputVectorPtr = outputVector; +++ +++ float min_val = CHAR_MIN; +++ float max_val = CHAR_MAX; +++ float r; +++ +++ __m128 vScalar = _mm_set_ps1(scalar); +++ __m128 inputVal1, inputVal2, inputVal3, inputVal4; +++ __m128i intInputVal1, intInputVal2, intInputVal3, intInputVal4; +++ __m128 vmin_val = _mm_set_ps1(min_val); +++ __m128 vmax_val = _mm_set_ps1(max_val); +++ +++ for (; number < sixteenthPoints; number++) { +++ inputVal1 = _mm_loadu_ps(inputVectorPtr); +++ inputVectorPtr += 4; +++ inputVal2 = _mm_loadu_ps(inputVectorPtr); +++ inputVectorPtr += 4; +++ inputVal3 = _mm_loadu_ps(inputVectorPtr); +++ inputVectorPtr += 4; +++ inputVal4 = _mm_loadu_ps(inputVectorPtr); +++ inputVectorPtr += 4; +++ +++ inputVal1 = +++ _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val); +++ inputVal2 = +++ _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal2, vScalar), vmax_val), vmin_val); +++ inputVal3 = +++ _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal3, vScalar), vmax_val), vmin_val); +++ inputVal4 = +++ _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal4, vScalar), vmax_val), vmin_val); +++ +++ intInputVal1 = _mm_cvtps_epi32(inputVal1); +++ intInputVal2 = _mm_cvtps_epi32(inputVal2); +++ intInputVal3 = _mm_cvtps_epi32(inputVal3); +++ intInputVal4 = _mm_cvtps_epi32(inputVal4); +++ +++ intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); +++ intInputVal3 = _mm_packs_epi32(intInputVal3, intInputVal4); +++ +++ intInputVal1 = _mm_packs_epi16(intInputVal1, intInputVal3); +++ +++ _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1); +++ outputVectorPtr += 16; +++ } ++ ++- number = sixteenthPoints * 16; ++- for(; number < num_points; number++){ ++- r = inputVector[number] * scalar; ++- volk_32f_s32f_convert_8i_single(&outputVector[number], r); ++- } +++ number = sixteenthPoints * 16; +++ for (; number < num_points; number++) { +++ r = inputVector[number] * scalar; +++ volk_32f_s32f_convert_8i_single(&outputVector[number], r); +++ } ++ } ++ ++ #endif /* LV_HAVE_SSE2 */ ++@@ -212,46 +232,47 @@ volk_32f_s32f_convert_8i_u_sse2(int8_t* outputVector, const float* inputVector, ++ #ifdef LV_HAVE_SSE ++ #include ++ ++-static inline void ++-volk_32f_s32f_convert_8i_u_sse(int8_t* outputVector, const float* inputVector, ++- const float scalar, unsigned int num_points) +++static inline void 
volk_32f_s32f_convert_8i_u_sse(int8_t* outputVector, +++ const float* inputVector, +++ const float scalar, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- size_t inner_loop; +++ unsigned int number = 0; +++ size_t inner_loop; ++ ++- const unsigned int quarterPoints = num_points / 4; +++ const unsigned int quarterPoints = num_points / 4; ++ ++- const float* inputVectorPtr = (const float*)inputVector; ++- int8_t* outputVectorPtr = outputVector; +++ const float* inputVectorPtr = (const float*)inputVector; +++ int8_t* outputVectorPtr = outputVector; ++ ++- float min_val = CHAR_MIN; ++- float max_val = CHAR_MAX; ++- float r; +++ float min_val = CHAR_MIN; +++ float max_val = CHAR_MAX; +++ float r; ++ ++- __m128 vScalar = _mm_set_ps1(scalar); ++- __m128 ret; ++- __m128 vmin_val = _mm_set_ps1(min_val); ++- __m128 vmax_val = _mm_set_ps1(max_val); +++ __m128 vScalar = _mm_set_ps1(scalar); +++ __m128 ret; +++ __m128 vmin_val = _mm_set_ps1(min_val); +++ __m128 vmax_val = _mm_set_ps1(max_val); ++ ++- __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4]; +++ __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4]; ++ ++- for(;number < quarterPoints; number++){ ++- ret = _mm_loadu_ps(inputVectorPtr); ++- inputVectorPtr += 4; +++ for (; number < quarterPoints; number++) { +++ ret = _mm_loadu_ps(inputVectorPtr); +++ inputVectorPtr += 4; ++ ++- ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val); +++ ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val); ++ ++- _mm_store_ps(outputFloatBuffer, ret); ++- for (inner_loop = 0; inner_loop < 4; inner_loop++){ ++- *outputVectorPtr++ = (int8_t)(rintf(outputFloatBuffer[inner_loop])); +++ _mm_store_ps(outputFloatBuffer, ret); +++ for (inner_loop = 0; inner_loop < 4; inner_loop++) { +++ *outputVectorPtr++ = (int8_t)(rintf(outputFloatBuffer[inner_loop])); +++ } ++ } ++- } ++ ++- number = quarterPoints * 4; ++- for(; number < num_points; number++){ ++- r = inputVector[number] * scalar; ++- volk_32f_s32f_convert_8i_single(&outputVector[number], r); ++- } +++ number = quarterPoints * 4; +++ for (; number < num_points; number++) { +++ r = inputVector[number] * scalar; +++ volk_32f_s32f_convert_8i_single(&outputVector[number], r); +++ } ++ } ++ ++ #endif /* LV_HAVE_SSE */ ++@@ -259,18 +280,19 @@ volk_32f_s32f_convert_8i_u_sse(int8_t* outputVector, const float* inputVector, ++ ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void ++-volk_32f_s32f_convert_8i_generic(int8_t* outputVector, const float* inputVector, ++- const float scalar, unsigned int num_points) +++static inline void volk_32f_s32f_convert_8i_generic(int8_t* outputVector, +++ const float* inputVector, +++ const float scalar, +++ unsigned int num_points) ++ { ++- const float* inputVectorPtr = inputVector; ++- unsigned int number = 0; ++- float r; ++- ++- for(number = 0; number < num_points; number++){ ++- r = *inputVectorPtr++ * scalar; ++- volk_32f_s32f_convert_8i_single(&outputVector[number], r); ++- } +++ const float* inputVectorPtr = inputVector; +++ unsigned int number = 0; +++ float r; +++ +++ for (number = 0; number < num_points; number++) { +++ r = *inputVectorPtr++ * scalar; +++ volk_32f_s32f_convert_8i_single(&outputVector[number], r); +++ } ++ } ++ ++ #endif /* LV_HAVE_GENERIC */ ++@@ -280,68 +302,77 @@ volk_32f_s32f_convert_8i_generic(int8_t* outputVector, const float* inputVector, ++ #ifndef INCLUDED_volk_32f_s32f_convert_8i_a_H ++ #define INCLUDED_volk_32f_s32f_convert_8i_a_H ++ ++-#include ++ #include ++ #include +++#include ++ ++ #ifdef 
LV_HAVE_AVX2 ++ #include ++ ++-static inline void ++-volk_32f_s32f_convert_8i_a_avx2(int8_t* outputVector, const float* inputVector, ++- const float scalar, unsigned int num_points) +++static inline void volk_32f_s32f_convert_8i_a_avx2(int8_t* outputVector, +++ const float* inputVector, +++ const float scalar, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- ++- const unsigned int thirtysecondPoints = num_points / 32; ++- ++- const float* inputVectorPtr = (const float*)inputVector; ++- int8_t* outputVectorPtr = outputVector; ++- ++- float min_val = CHAR_MIN; ++- float max_val = CHAR_MAX; ++- float r; ++- ++- __m256 vScalar = _mm256_set1_ps(scalar); ++- __m256 inputVal1, inputVal2, inputVal3, inputVal4; ++- __m256i intInputVal1, intInputVal2, intInputVal3, intInputVal4; ++- __m256 vmin_val = _mm256_set1_ps(min_val); ++- __m256 vmax_val = _mm256_set1_ps(max_val); ++- __m256i intInputVal; ++- ++- for(;number < thirtysecondPoints; number++){ ++- inputVal1 = _mm256_load_ps(inputVectorPtr); inputVectorPtr += 8; ++- inputVal2 = _mm256_load_ps(inputVectorPtr); inputVectorPtr += 8; ++- inputVal3 = _mm256_load_ps(inputVectorPtr); inputVectorPtr += 8; ++- inputVal4 = _mm256_load_ps(inputVectorPtr); inputVectorPtr += 8; ++- ++- inputVal1 = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal1, vScalar), vmax_val), vmin_val); ++- inputVal2 = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal2, vScalar), vmax_val), vmin_val); ++- inputVal3 = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal3, vScalar), vmax_val), vmin_val); ++- inputVal4 = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal4, vScalar), vmax_val), vmin_val); ++- ++- intInputVal1 = _mm256_cvtps_epi32(inputVal1); ++- intInputVal2 = _mm256_cvtps_epi32(inputVal2); ++- intInputVal3 = _mm256_cvtps_epi32(inputVal3); ++- intInputVal4 = _mm256_cvtps_epi32(inputVal4); ++- ++- intInputVal1 = _mm256_packs_epi32(intInputVal1, intInputVal2); ++- intInputVal1 = _mm256_permute4x64_epi64(intInputVal1, 0b11011000); ++- intInputVal3 = _mm256_packs_epi32(intInputVal3, intInputVal4); ++- intInputVal3 = _mm256_permute4x64_epi64(intInputVal3, 0b11011000); ++- ++- intInputVal1 = _mm256_packs_epi16(intInputVal1, intInputVal3); ++- intInputVal = _mm256_permute4x64_epi64(intInputVal1, 0b11011000); ++- ++- _mm256_store_si256((__m256i*)outputVectorPtr, intInputVal); ++- outputVectorPtr += 32; ++- } ++- ++- number = thirtysecondPoints * 32; ++- for(; number < num_points; number++){ ++- r = inputVector[number] * scalar; ++- volk_32f_s32f_convert_8i_single(&outputVector[number], r); ++- } +++ unsigned int number = 0; +++ +++ const unsigned int thirtysecondPoints = num_points / 32; +++ +++ const float* inputVectorPtr = (const float*)inputVector; +++ int8_t* outputVectorPtr = outputVector; +++ +++ float min_val = CHAR_MIN; +++ float max_val = CHAR_MAX; +++ float r; +++ +++ __m256 vScalar = _mm256_set1_ps(scalar); +++ __m256 inputVal1, inputVal2, inputVal3, inputVal4; +++ __m256i intInputVal1, intInputVal2, intInputVal3, intInputVal4; +++ __m256 vmin_val = _mm256_set1_ps(min_val); +++ __m256 vmax_val = _mm256_set1_ps(max_val); +++ __m256i intInputVal; +++ +++ for (; number < thirtysecondPoints; number++) { +++ inputVal1 = _mm256_load_ps(inputVectorPtr); +++ inputVectorPtr += 8; +++ inputVal2 = _mm256_load_ps(inputVectorPtr); +++ inputVectorPtr += 8; +++ inputVal3 = _mm256_load_ps(inputVectorPtr); +++ inputVectorPtr += 8; +++ inputVal4 = _mm256_load_ps(inputVectorPtr); +++ inputVectorPtr += 8; +++ +++ inputVal1 = _mm256_max_ps( +++ 
_mm256_min_ps(_mm256_mul_ps(inputVal1, vScalar), vmax_val), vmin_val); +++ inputVal2 = _mm256_max_ps( +++ _mm256_min_ps(_mm256_mul_ps(inputVal2, vScalar), vmax_val), vmin_val); +++ inputVal3 = _mm256_max_ps( +++ _mm256_min_ps(_mm256_mul_ps(inputVal3, vScalar), vmax_val), vmin_val); +++ inputVal4 = _mm256_max_ps( +++ _mm256_min_ps(_mm256_mul_ps(inputVal4, vScalar), vmax_val), vmin_val); +++ +++ intInputVal1 = _mm256_cvtps_epi32(inputVal1); +++ intInputVal2 = _mm256_cvtps_epi32(inputVal2); +++ intInputVal3 = _mm256_cvtps_epi32(inputVal3); +++ intInputVal4 = _mm256_cvtps_epi32(inputVal4); +++ +++ intInputVal1 = _mm256_packs_epi32(intInputVal1, intInputVal2); +++ intInputVal1 = _mm256_permute4x64_epi64(intInputVal1, 0b11011000); +++ intInputVal3 = _mm256_packs_epi32(intInputVal3, intInputVal4); +++ intInputVal3 = _mm256_permute4x64_epi64(intInputVal3, 0b11011000); +++ +++ intInputVal1 = _mm256_packs_epi16(intInputVal1, intInputVal3); +++ intInputVal = _mm256_permute4x64_epi64(intInputVal1, 0b11011000); +++ +++ _mm256_store_si256((__m256i*)outputVectorPtr, intInputVal); +++ outputVectorPtr += 32; +++ } +++ +++ number = thirtysecondPoints * 32; +++ for (; number < num_points; number++) { +++ r = inputVector[number] * scalar; +++ volk_32f_s32f_convert_8i_single(&outputVector[number], r); +++ } ++ } ++ ++ #endif /* LV_HAVE_AVX2 */ ++@@ -350,57 +381,66 @@ volk_32f_s32f_convert_8i_a_avx2(int8_t* outputVector, const float* inputVector, ++ #ifdef LV_HAVE_SSE2 ++ #include ++ ++-static inline void ++-volk_32f_s32f_convert_8i_a_sse2(int8_t* outputVector, const float* inputVector, ++- const float scalar, unsigned int num_points) +++static inline void volk_32f_s32f_convert_8i_a_sse2(int8_t* outputVector, +++ const float* inputVector, +++ const float scalar, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- ++- const unsigned int sixteenthPoints = num_points / 16; ++- ++- const float* inputVectorPtr = (const float*)inputVector; ++- int8_t* outputVectorPtr = outputVector; ++- ++- float min_val = CHAR_MIN; ++- float max_val = CHAR_MAX; ++- float r; ++- ++- __m128 vScalar = _mm_set_ps1(scalar); ++- __m128 inputVal1, inputVal2, inputVal3, inputVal4; ++- __m128i intInputVal1, intInputVal2, intInputVal3, intInputVal4; ++- __m128 vmin_val = _mm_set_ps1(min_val); ++- __m128 vmax_val = _mm_set_ps1(max_val); ++- ++- for(;number < sixteenthPoints; number++){ ++- inputVal1 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; ++- inputVal2 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; ++- inputVal3 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; ++- inputVal4 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; ++- ++- inputVal1 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val); ++- inputVal2 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal2, vScalar), vmax_val), vmin_val); ++- inputVal3 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal3, vScalar), vmax_val), vmin_val); ++- inputVal4 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal4, vScalar), vmax_val), vmin_val); ++- ++- intInputVal1 = _mm_cvtps_epi32(inputVal1); ++- intInputVal2 = _mm_cvtps_epi32(inputVal2); ++- intInputVal3 = _mm_cvtps_epi32(inputVal3); ++- intInputVal4 = _mm_cvtps_epi32(inputVal4); ++- ++- intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); ++- intInputVal3 = _mm_packs_epi32(intInputVal3, intInputVal4); ++- ++- intInputVal1 = _mm_packs_epi16(intInputVal1, intInputVal3); ++- ++- _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1); ++- outputVectorPtr += 16; ++- } +++ unsigned int number = 0; +++ +++ 
const unsigned int sixteenthPoints = num_points / 16; +++ +++ const float* inputVectorPtr = (const float*)inputVector; +++ int8_t* outputVectorPtr = outputVector; +++ +++ float min_val = CHAR_MIN; +++ float max_val = CHAR_MAX; +++ float r; +++ +++ __m128 vScalar = _mm_set_ps1(scalar); +++ __m128 inputVal1, inputVal2, inputVal3, inputVal4; +++ __m128i intInputVal1, intInputVal2, intInputVal3, intInputVal4; +++ __m128 vmin_val = _mm_set_ps1(min_val); +++ __m128 vmax_val = _mm_set_ps1(max_val); +++ +++ for (; number < sixteenthPoints; number++) { +++ inputVal1 = _mm_load_ps(inputVectorPtr); +++ inputVectorPtr += 4; +++ inputVal2 = _mm_load_ps(inputVectorPtr); +++ inputVectorPtr += 4; +++ inputVal3 = _mm_load_ps(inputVectorPtr); +++ inputVectorPtr += 4; +++ inputVal4 = _mm_load_ps(inputVectorPtr); +++ inputVectorPtr += 4; +++ +++ inputVal1 = +++ _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val); +++ inputVal2 = +++ _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal2, vScalar), vmax_val), vmin_val); +++ inputVal3 = +++ _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal3, vScalar), vmax_val), vmin_val); +++ inputVal4 = +++ _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal4, vScalar), vmax_val), vmin_val); +++ +++ intInputVal1 = _mm_cvtps_epi32(inputVal1); +++ intInputVal2 = _mm_cvtps_epi32(inputVal2); +++ intInputVal3 = _mm_cvtps_epi32(inputVal3); +++ intInputVal4 = _mm_cvtps_epi32(inputVal4); +++ +++ intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); +++ intInputVal3 = _mm_packs_epi32(intInputVal3, intInputVal4); +++ +++ intInputVal1 = _mm_packs_epi16(intInputVal1, intInputVal3); +++ +++ _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1); +++ outputVectorPtr += 16; +++ } ++ ++- number = sixteenthPoints * 16; ++- for(; number < num_points; number++){ ++- r = inputVector[number] * scalar; ++- volk_32f_s32f_convert_8i_single(&outputVector[number], r); ++- } +++ number = sixteenthPoints * 16; +++ for (; number < num_points; number++) { +++ r = inputVector[number] * scalar; +++ volk_32f_s32f_convert_8i_single(&outputVector[number], r); +++ } ++ } ++ #endif /* LV_HAVE_SSE2 */ ++ ++@@ -408,46 +448,47 @@ volk_32f_s32f_convert_8i_a_sse2(int8_t* outputVector, const float* inputVector, ++ #ifdef LV_HAVE_SSE ++ #include ++ ++-static inline void ++-volk_32f_s32f_convert_8i_a_sse(int8_t* outputVector, const float* inputVector, ++- const float scalar, unsigned int num_points) +++static inline void volk_32f_s32f_convert_8i_a_sse(int8_t* outputVector, +++ const float* inputVector, +++ const float scalar, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- size_t inner_loop; +++ unsigned int number = 0; +++ size_t inner_loop; ++ ++- const unsigned int quarterPoints = num_points / 4; +++ const unsigned int quarterPoints = num_points / 4; ++ ++- const float* inputVectorPtr = (const float*)inputVector; +++ const float* inputVectorPtr = (const float*)inputVector; ++ ++- float min_val = CHAR_MIN; ++- float max_val = CHAR_MAX; ++- float r; +++ float min_val = CHAR_MIN; +++ float max_val = CHAR_MAX; +++ float r; ++ ++- int8_t* outputVectorPtr = outputVector; ++- __m128 vScalar = _mm_set_ps1(scalar); ++- __m128 ret; ++- __m128 vmin_val = _mm_set_ps1(min_val); ++- __m128 vmax_val = _mm_set_ps1(max_val); +++ int8_t* outputVectorPtr = outputVector; +++ __m128 vScalar = _mm_set_ps1(scalar); +++ __m128 ret; +++ __m128 vmin_val = _mm_set_ps1(min_val); +++ __m128 vmax_val = _mm_set_ps1(max_val); ++ ++- __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4]; +++ __VOLK_ATTR_ALIGNED(16) float 
outputFloatBuffer[4]; ++ ++- for(;number < quarterPoints; number++){ ++- ret = _mm_load_ps(inputVectorPtr); ++- inputVectorPtr += 4; +++ for (; number < quarterPoints; number++) { +++ ret = _mm_load_ps(inputVectorPtr); +++ inputVectorPtr += 4; ++ ++- ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val); +++ ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val); ++ ++- _mm_store_ps(outputFloatBuffer, ret); ++- for (inner_loop = 0; inner_loop < 4; inner_loop++){ ++- *outputVectorPtr++ = (int8_t)(rintf(outputFloatBuffer[inner_loop])); +++ _mm_store_ps(outputFloatBuffer, ret); +++ for (inner_loop = 0; inner_loop < 4; inner_loop++) { +++ *outputVectorPtr++ = (int8_t)(rintf(outputFloatBuffer[inner_loop])); +++ } ++ } ++- } ++ ++- number = quarterPoints * 4; ++- for(; number < num_points; number++){ ++- r = inputVector[number] * scalar; ++- volk_32f_s32f_convert_8i_single(&outputVector[number], r); ++- } +++ number = quarterPoints * 4; +++ for (; number < num_points; number++) { +++ r = inputVector[number] * scalar; +++ volk_32f_s32f_convert_8i_single(&outputVector[number], r); +++ } ++ } ++ ++ #endif /* LV_HAVE_SSE */ ++@@ -455,18 +496,19 @@ volk_32f_s32f_convert_8i_a_sse(int8_t* outputVector, const float* inputVector, ++ ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void ++-volk_32f_s32f_convert_8i_a_generic(int8_t* outputVector, const float* inputVector, ++- const float scalar, unsigned int num_points) +++static inline void volk_32f_s32f_convert_8i_a_generic(int8_t* outputVector, +++ const float* inputVector, +++ const float scalar, +++ unsigned int num_points) ++ { ++- const float* inputVectorPtr = inputVector; ++- unsigned int number = 0; ++- float r; ++- ++- for(number = 0; number < num_points; number++){ ++- r = *inputVectorPtr++ * scalar; ++- volk_32f_s32f_convert_8i_single(&outputVector[number], r); ++- } +++ const float* inputVectorPtr = inputVector; +++ unsigned int number = 0; +++ float r; +++ +++ for (number = 0; number < num_points; number++) { +++ r = *inputVectorPtr++ * scalar; +++ volk_32f_s32f_convert_8i_single(&outputVector[number], r); +++ } ++ } ++ ++ #endif /* LV_HAVE_GENERIC */ ++diff --git a/kernels/volk/volk_32f_s32f_mod_rangepuppet_32f.h b/kernels/volk/volk_32f_s32f_mod_rangepuppet_32f.h ++index 6ace77b..28d7ab5 100644 ++--- a/kernels/volk/volk_32f_s32f_mod_rangepuppet_32f.h +++++ b/kernels/volk/volk_32f_s32f_mod_rangepuppet_32f.h ++@@ -4,42 +4,77 @@ ++ #include ++ ++ #ifdef LV_HAVE_GENERIC ++-static inline void volk_32f_s32f_mod_rangepuppet_32f_generic(float *output, const float *input, float bound, unsigned int num_points){ ++- volk_32f_s32f_s32f_mod_range_32f_generic(output, input, bound-3.141f, bound, num_points); +++static inline void volk_32f_s32f_mod_rangepuppet_32f_generic(float* output, +++ const float* input, +++ float bound, +++ unsigned int num_points) +++{ +++ volk_32f_s32f_s32f_mod_range_32f_generic( +++ output, input, bound - 3.141f, bound, num_points); ++ } ++ #endif ++ ++ ++ #ifdef LV_HAVE_SSE ++-static inline void volk_32f_s32f_mod_rangepuppet_32f_u_sse(float *output, const float *input, float bound, unsigned int num_points){ ++- volk_32f_s32f_s32f_mod_range_32f_u_sse(output, input, bound-3.141f, bound, num_points); +++static inline void volk_32f_s32f_mod_rangepuppet_32f_u_sse(float* output, +++ const float* input, +++ float bound, +++ unsigned int num_points) +++{ +++ volk_32f_s32f_s32f_mod_range_32f_u_sse( +++ output, input, bound - 3.141f, bound, num_points); ++ } ++ #endif ++ #ifdef LV_HAVE_SSE ++-static 
inline void volk_32f_s32f_mod_rangepuppet_32f_a_sse(float *output, const float *input, float bound, unsigned int num_points){ ++- volk_32f_s32f_s32f_mod_range_32f_a_sse(output, input, bound-3.141f, bound, num_points); +++static inline void volk_32f_s32f_mod_rangepuppet_32f_a_sse(float* output, +++ const float* input, +++ float bound, +++ unsigned int num_points) +++{ +++ volk_32f_s32f_s32f_mod_range_32f_a_sse( +++ output, input, bound - 3.141f, bound, num_points); ++ } ++ #endif ++ ++ #ifdef LV_HAVE_SSE2 ++-static inline void volk_32f_s32f_mod_rangepuppet_32f_u_sse2(float *output, const float *input, float bound, unsigned int num_points){ ++- volk_32f_s32f_s32f_mod_range_32f_u_sse2(output, input, bound-3.141f, bound, num_points); +++static inline void volk_32f_s32f_mod_rangepuppet_32f_u_sse2(float* output, +++ const float* input, +++ float bound, +++ unsigned int num_points) +++{ +++ volk_32f_s32f_s32f_mod_range_32f_u_sse2( +++ output, input, bound - 3.141f, bound, num_points); ++ } ++ #endif ++ #ifdef LV_HAVE_SSE2 ++-static inline void volk_32f_s32f_mod_rangepuppet_32f_a_sse2(float *output, const float *input, float bound, unsigned int num_points){ ++- volk_32f_s32f_s32f_mod_range_32f_a_sse2(output, input, bound-3.141f, bound, num_points); +++static inline void volk_32f_s32f_mod_rangepuppet_32f_a_sse2(float* output, +++ const float* input, +++ float bound, +++ unsigned int num_points) +++{ +++ volk_32f_s32f_s32f_mod_range_32f_a_sse2( +++ output, input, bound - 3.141f, bound, num_points); ++ } ++ #endif ++ ++ #ifdef LV_HAVE_AVX ++-static inline void volk_32f_s32f_mod_rangepuppet_32f_u_avx(float *output, const float *input, float bound, unsigned int num_points){ ++- volk_32f_s32f_s32f_mod_range_32f_u_avx(output, input, bound-3.141f, bound, num_points); +++static inline void volk_32f_s32f_mod_rangepuppet_32f_u_avx(float* output, +++ const float* input, +++ float bound, +++ unsigned int num_points) +++{ +++ volk_32f_s32f_s32f_mod_range_32f_u_avx( +++ output, input, bound - 3.141f, bound, num_points); ++ } ++ #endif ++ #ifdef LV_HAVE_AVX ++-static inline void volk_32f_s32f_mod_rangepuppet_32f_a_avx(float *output, const float *input, float bound, unsigned int num_points){ ++- volk_32f_s32f_s32f_mod_range_32f_a_avx(output, input, bound-3.141f, bound, num_points); +++static inline void volk_32f_s32f_mod_rangepuppet_32f_a_avx(float* output, +++ const float* input, +++ float bound, +++ unsigned int num_points) +++{ +++ volk_32f_s32f_s32f_mod_range_32f_a_avx( +++ output, input, bound - 3.141f, bound, num_points); ++ } ++ #endif ++ #endif ++diff --git a/kernels/volk/volk_32f_s32f_multiply_32f.h b/kernels/volk/volk_32f_s32f_multiply_32f.h ++index 97c7f69..dcc9c6b 100644 ++--- a/kernels/volk/volk_32f_s32f_multiply_32f.h +++++ b/kernels/volk/volk_32f_s32f_multiply_32f.h ++@@ -29,8 +29,8 @@ ++ * ++ * Dispatcher Prototype ++ * \code ++- * void volk_32f_s32f_multiply_32f(float* cVector, const float* aVector, const float scalar, unsigned int num_points) ++- * \endcode +++ * void volk_32f_s32f_multiply_32f(float* cVector, const float* aVector, const float +++ * scalar, unsigned int num_points) \endcode ++ * ++ * \b Inputs ++ * \li aVector: The input vector of floats. 
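A minimal caller for the reflowed volk_32f_s32f_multiply_32f dispatcher prototype above, as a sketch only: it assumes nothing beyond the public volk.h entry points (volk_get_alignment, volk_malloc, volk_free) and an arbitrary example length and scalar.

#include <volk/volk.h>
#include <stdio.h>

int main(void)
{
    unsigned int N = 16; /* arbitrary example length */
    unsigned int alignment = volk_get_alignment();
    float* in = (float*)volk_malloc(sizeof(float) * N, alignment);
    float* out = (float*)volk_malloc(sizeof(float) * N, alignment);

    for (unsigned int i = 0; i < N; i++)
        in[i] = (float)i;

    /* the dispatcher picks the best implementation available on the
     * current CPU (generic/SSE/AVX/NEON/ORC) */
    volk_32f_s32f_multiply_32f(out, in, 2.5f, N);

    for (unsigned int i = 0; i < N; i++)
        printf("%f\n", out[i]);

    volk_free(in);
    volk_free(out);
    return 0;
}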
++@@ -75,84 +75,87 @@ ++ #ifdef LV_HAVE_SSE ++ #include ++ ++-static inline void ++-volk_32f_s32f_multiply_32f_u_sse(float* cVector, const float* aVector, ++- const float scalar, unsigned int num_points) +++static inline void volk_32f_s32f_multiply_32f_u_sse(float* cVector, +++ const float* aVector, +++ const float scalar, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int quarterPoints = num_points / 4; +++ unsigned int number = 0; +++ const unsigned int quarterPoints = num_points / 4; ++ ++- float* cPtr = cVector; ++- const float* aPtr = aVector; +++ float* cPtr = cVector; +++ const float* aPtr = aVector; ++ ++- __m128 aVal, bVal, cVal; ++- bVal = _mm_set_ps1(scalar); ++- for(;number < quarterPoints; number++){ ++- aVal = _mm_loadu_ps(aPtr); +++ __m128 aVal, bVal, cVal; +++ bVal = _mm_set_ps1(scalar); +++ for (; number < quarterPoints; number++) { +++ aVal = _mm_loadu_ps(aPtr); ++ ++- cVal = _mm_mul_ps(aVal, bVal); +++ cVal = _mm_mul_ps(aVal, bVal); ++ ++- _mm_storeu_ps(cPtr,cVal); // Store the results back into the C container +++ _mm_storeu_ps(cPtr, cVal); // Store the results back into the C container ++ ++- aPtr += 4; ++- cPtr += 4; ++- } +++ aPtr += 4; +++ cPtr += 4; +++ } ++ ++- number = quarterPoints * 4; ++- for(;number < num_points; number++){ ++- *cPtr++ = (*aPtr++) * scalar; ++- } +++ number = quarterPoints * 4; +++ for (; number < num_points; number++) { +++ *cPtr++ = (*aPtr++) * scalar; +++ } ++ } ++ #endif /* LV_HAVE_SSE */ ++ ++ #ifdef LV_HAVE_AVX ++ #include ++ ++-static inline void ++-volk_32f_s32f_multiply_32f_u_avx(float* cVector, const float* aVector, ++- const float scalar, unsigned int num_points) +++static inline void volk_32f_s32f_multiply_32f_u_avx(float* cVector, +++ const float* aVector, +++ const float scalar, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int eighthPoints = num_points / 8; +++ unsigned int number = 0; +++ const unsigned int eighthPoints = num_points / 8; ++ ++- float* cPtr = cVector; ++- const float* aPtr = aVector; +++ float* cPtr = cVector; +++ const float* aPtr = aVector; ++ ++- __m256 aVal, bVal, cVal; ++- bVal = _mm256_set1_ps(scalar); ++- for(;number < eighthPoints; number++){ +++ __m256 aVal, bVal, cVal; +++ bVal = _mm256_set1_ps(scalar); +++ for (; number < eighthPoints; number++) { ++ ++- aVal = _mm256_loadu_ps(aPtr); +++ aVal = _mm256_loadu_ps(aPtr); ++ ++- cVal = _mm256_mul_ps(aVal, bVal); +++ cVal = _mm256_mul_ps(aVal, bVal); ++ ++- _mm256_storeu_ps(cPtr,cVal); // Store the results back into the C container +++ _mm256_storeu_ps(cPtr, cVal); // Store the results back into the C container ++ ++- aPtr += 8; ++- cPtr += 8; ++- } +++ aPtr += 8; +++ cPtr += 8; +++ } ++ ++- number = eighthPoints * 8; ++- for(;number < num_points; number++){ ++- *cPtr++ = (*aPtr++) * scalar; ++- } +++ number = eighthPoints * 8; +++ for (; number < num_points; number++) { +++ *cPtr++ = (*aPtr++) * scalar; +++ } ++ } ++ #endif /* LV_HAVE_AVX */ ++ ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void ++-volk_32f_s32f_multiply_32f_generic(float* cVector, const float* aVector, ++- const float scalar, unsigned int num_points) +++static inline void volk_32f_s32f_multiply_32f_generic(float* cVector, +++ const float* aVector, +++ const float scalar, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const float* inputPtr = aVector; ++- float* outputPtr = cVector; ++- for(number = 0; number < num_points; number++){ ++- *outputPtr = (*inputPtr) * scalar; ++- inputPtr++; ++- 
outputPtr++; ++- } +++ unsigned int number = 0; +++ const float* inputPtr = aVector; +++ float* outputPtr = cVector; +++ for (number = 0; number < num_points; number++) { +++ *outputPtr = (*inputPtr) * scalar; +++ inputPtr++; +++ outputPtr++; +++ } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++@@ -168,126 +171,132 @@ volk_32f_s32f_multiply_32f_generic(float* cVector, const float* aVector, ++ #ifdef LV_HAVE_SSE ++ #include ++ ++-static inline void ++-volk_32f_s32f_multiply_32f_a_sse(float* cVector, const float* aVector, ++- const float scalar, unsigned int num_points) +++static inline void volk_32f_s32f_multiply_32f_a_sse(float* cVector, +++ const float* aVector, +++ const float scalar, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int quarterPoints = num_points / 4; +++ unsigned int number = 0; +++ const unsigned int quarterPoints = num_points / 4; ++ ++- float* cPtr = cVector; ++- const float* aPtr = aVector; +++ float* cPtr = cVector; +++ const float* aPtr = aVector; ++ ++- __m128 aVal, bVal, cVal; ++- bVal = _mm_set_ps1(scalar); ++- for(;number < quarterPoints; number++){ ++- aVal = _mm_load_ps(aPtr); +++ __m128 aVal, bVal, cVal; +++ bVal = _mm_set_ps1(scalar); +++ for (; number < quarterPoints; number++) { +++ aVal = _mm_load_ps(aPtr); ++ ++- cVal = _mm_mul_ps(aVal, bVal); +++ cVal = _mm_mul_ps(aVal, bVal); ++ ++- _mm_store_ps(cPtr,cVal); // Store the results back into the C container +++ _mm_store_ps(cPtr, cVal); // Store the results back into the C container ++ ++- aPtr += 4; ++- cPtr += 4; ++- } +++ aPtr += 4; +++ cPtr += 4; +++ } ++ ++- number = quarterPoints * 4; ++- for(;number < num_points; number++){ ++- *cPtr++ = (*aPtr++) * scalar; ++- } +++ number = quarterPoints * 4; +++ for (; number < num_points; number++) { +++ *cPtr++ = (*aPtr++) * scalar; +++ } ++ } ++ #endif /* LV_HAVE_SSE */ ++ ++ #ifdef LV_HAVE_AVX ++ #include ++ ++-static inline void ++-volk_32f_s32f_multiply_32f_a_avx(float* cVector, const float* aVector, ++- const float scalar, unsigned int num_points) +++static inline void volk_32f_s32f_multiply_32f_a_avx(float* cVector, +++ const float* aVector, +++ const float scalar, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int eighthPoints = num_points / 8; +++ unsigned int number = 0; +++ const unsigned int eighthPoints = num_points / 8; ++ ++- float* cPtr = cVector; ++- const float* aPtr = aVector; +++ float* cPtr = cVector; +++ const float* aPtr = aVector; ++ ++- __m256 aVal, bVal, cVal; ++- bVal = _mm256_set1_ps(scalar); ++- for(;number < eighthPoints; number++){ ++- aVal = _mm256_load_ps(aPtr); +++ __m256 aVal, bVal, cVal; +++ bVal = _mm256_set1_ps(scalar); +++ for (; number < eighthPoints; number++) { +++ aVal = _mm256_load_ps(aPtr); ++ ++- cVal = _mm256_mul_ps(aVal, bVal); +++ cVal = _mm256_mul_ps(aVal, bVal); ++ ++- _mm256_store_ps(cPtr,cVal); // Store the results back into the C container +++ _mm256_store_ps(cPtr, cVal); // Store the results back into the C container ++ ++- aPtr += 8; ++- cPtr += 8; ++- } +++ aPtr += 8; +++ cPtr += 8; +++ } ++ ++- number = eighthPoints * 8; ++- for(;number < num_points; number++){ ++- *cPtr++ = (*aPtr++) * scalar; ++- } +++ number = eighthPoints * 8; +++ for (; number < num_points; number++) { +++ *cPtr++ = (*aPtr++) * scalar; +++ } ++ } ++ #endif /* LV_HAVE_AVX */ ++ ++ #ifdef LV_HAVE_NEON ++ #include ++ ++-static inline void ++-volk_32f_s32f_multiply_32f_u_neon(float* cVector, const float* aVector, ++- const float scalar, unsigned int num_points) +++static 
inline void volk_32f_s32f_multiply_32f_u_neon(float* cVector, +++ const float* aVector, +++ const float scalar, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const float* inputPtr = aVector; ++- float* outputPtr = cVector; ++- const unsigned int quarterPoints = num_points / 4; ++- ++- float32x4_t aVal, cVal; ++- ++- for(number = 0; number < quarterPoints; number++){ ++- aVal = vld1q_f32(inputPtr); // Load into NEON regs ++- cVal = vmulq_n_f32 (aVal, scalar); // Do the multiply ++- vst1q_f32(outputPtr, cVal); // Store results back to output ++- inputPtr += 4; ++- outputPtr += 4; ++- } ++- for(number = quarterPoints * 4; number < num_points; number++){ ++- *outputPtr++ = (*inputPtr++) * scalar; ++- } +++ unsigned int number = 0; +++ const float* inputPtr = aVector; +++ float* outputPtr = cVector; +++ const unsigned int quarterPoints = num_points / 4; +++ +++ float32x4_t aVal, cVal; +++ +++ for (number = 0; number < quarterPoints; number++) { +++ aVal = vld1q_f32(inputPtr); // Load into NEON regs +++ cVal = vmulq_n_f32(aVal, scalar); // Do the multiply +++ vst1q_f32(outputPtr, cVal); // Store results back to output +++ inputPtr += 4; +++ outputPtr += 4; +++ } +++ for (number = quarterPoints * 4; number < num_points; number++) { +++ *outputPtr++ = (*inputPtr++) * scalar; +++ } ++ } ++ #endif /* LV_HAVE_NEON */ ++ ++ ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void ++-volk_32f_s32f_multiply_32f_a_generic(float* cVector, const float* aVector, ++- const float scalar, unsigned int num_points) +++static inline void volk_32f_s32f_multiply_32f_a_generic(float* cVector, +++ const float* aVector, +++ const float scalar, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const float* inputPtr = aVector; ++- float* outputPtr = cVector; ++- for(number = 0; number < num_points; number++){ ++- *outputPtr = (*inputPtr) * scalar; ++- inputPtr++; ++- outputPtr++; ++- } +++ unsigned int number = 0; +++ const float* inputPtr = aVector; +++ float* outputPtr = cVector; +++ for (number = 0; number < num_points; number++) { +++ *outputPtr = (*inputPtr) * scalar; +++ inputPtr++; +++ outputPtr++; +++ } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++ ++ #ifdef LV_HAVE_ORC ++ ++-extern void ++-volk_32f_s32f_multiply_32f_a_orc_impl(float* dst, const float* src, ++- const float scalar, unsigned int num_points); +++extern void volk_32f_s32f_multiply_32f_a_orc_impl(float* dst, +++ const float* src, +++ const float scalar, +++ unsigned int num_points); ++ ++-static inline void ++-volk_32f_s32f_multiply_32f_u_orc(float* cVector, const float* aVector, ++- const float scalar, unsigned int num_points) +++static inline void volk_32f_s32f_multiply_32f_u_orc(float* cVector, +++ const float* aVector, +++ const float scalar, +++ unsigned int num_points) ++ { ++- volk_32f_s32f_multiply_32f_a_orc_impl(cVector, aVector, scalar, num_points); +++ volk_32f_s32f_multiply_32f_a_orc_impl(cVector, aVector, scalar, num_points); ++ } ++ ++ #endif /* LV_HAVE_GENERIC */ ++diff --git a/kernels/volk/volk_32f_s32f_normalize.h b/kernels/volk/volk_32f_s32f_normalize.h ++index 404d534..0a05492 100644 ++--- a/kernels/volk/volk_32f_s32f_normalize.h +++++ b/kernels/volk/volk_32f_s32f_normalize.h ++@@ -30,8 +30,8 @@ ++ * ++ * Dispatcher Prototype ++ * \code ++- * void volk_32f_s32f_normalize(float* vecBuffer, const float scalar, unsigned int num_points) ++- * \endcode +++ * void volk_32f_s32f_normalize(float* vecBuffer, const float scalar, unsigned int +++ * num_points) \endcode ++ * ++ * \b Inputs ++ * \li vecBuffer: The 
buffer of values to be vectorized. ++@@ -76,84 +76,99 @@ ++ #ifdef LV_HAVE_AVX ++ #include ++ ++-static inline void volk_32f_s32f_normalize_a_avx(float* vecBuffer, const float scalar, unsigned int num_points){ ++- unsigned int number = 0; ++- float* inputPtr = vecBuffer; +++static inline void volk_32f_s32f_normalize_a_avx(float* vecBuffer, +++ const float scalar, +++ unsigned int num_points) +++{ +++ unsigned int number = 0; +++ float* inputPtr = vecBuffer; ++ ++- const float invScalar = 1.0 / scalar; ++- __m256 vecScalar = _mm256_set1_ps(invScalar); +++ const float invScalar = 1.0 / scalar; +++ __m256 vecScalar = _mm256_set1_ps(invScalar); ++ ++- __m256 input1; +++ __m256 input1; ++ ++- const uint64_t eighthPoints = num_points / 8; ++- for(;number < eighthPoints; number++){ +++ const uint64_t eighthPoints = num_points / 8; +++ for (; number < eighthPoints; number++) { ++ ++- input1 = _mm256_load_ps(inputPtr); +++ input1 = _mm256_load_ps(inputPtr); ++ ++- input1 = _mm256_mul_ps(input1, vecScalar); +++ input1 = _mm256_mul_ps(input1, vecScalar); ++ ++- _mm256_store_ps(inputPtr, input1); +++ _mm256_store_ps(inputPtr, input1); ++ ++- inputPtr += 8; ++- } +++ inputPtr += 8; +++ } ++ ++- number = eighthPoints*8; ++- for(; number < num_points; number++){ ++- *inputPtr *= invScalar; ++- inputPtr++; ++- } +++ number = eighthPoints * 8; +++ for (; number < num_points; number++) { +++ *inputPtr *= invScalar; +++ inputPtr++; +++ } ++ } ++ #endif /* LV_HAVE_AVX */ ++ ++ #ifdef LV_HAVE_SSE ++ #include ++ ++-static inline void volk_32f_s32f_normalize_a_sse(float* vecBuffer, const float scalar, unsigned int num_points){ ++- unsigned int number = 0; ++- float* inputPtr = vecBuffer; +++static inline void volk_32f_s32f_normalize_a_sse(float* vecBuffer, +++ const float scalar, +++ unsigned int num_points) +++{ +++ unsigned int number = 0; +++ float* inputPtr = vecBuffer; ++ ++- const float invScalar = 1.0 / scalar; ++- __m128 vecScalar = _mm_set_ps1(invScalar); +++ const float invScalar = 1.0 / scalar; +++ __m128 vecScalar = _mm_set_ps1(invScalar); ++ ++- __m128 input1; +++ __m128 input1; ++ ++- const uint64_t quarterPoints = num_points / 4; ++- for(;number < quarterPoints; number++){ +++ const uint64_t quarterPoints = num_points / 4; +++ for (; number < quarterPoints; number++) { ++ ++- input1 = _mm_load_ps(inputPtr); +++ input1 = _mm_load_ps(inputPtr); ++ ++- input1 = _mm_mul_ps(input1, vecScalar); +++ input1 = _mm_mul_ps(input1, vecScalar); ++ ++- _mm_store_ps(inputPtr, input1); +++ _mm_store_ps(inputPtr, input1); ++ ++- inputPtr += 4; ++- } +++ inputPtr += 4; +++ } ++ ++- number = quarterPoints*4; ++- for(; number < num_points; number++){ ++- *inputPtr *= invScalar; ++- inputPtr++; ++- } +++ number = quarterPoints * 4; +++ for (; number < num_points; number++) { +++ *inputPtr *= invScalar; +++ inputPtr++; +++ } ++ } ++ #endif /* LV_HAVE_SSE */ ++ ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void volk_32f_s32f_normalize_generic(float* vecBuffer, const float scalar, unsigned int num_points){ ++- unsigned int number = 0; ++- float* inputPtr = vecBuffer; ++- const float invScalar = 1.0 / scalar; ++- for(number = 0; number < num_points; number++){ ++- *inputPtr *= invScalar; ++- inputPtr++; ++- } +++static inline void volk_32f_s32f_normalize_generic(float* vecBuffer, +++ const float scalar, +++ unsigned int num_points) +++{ +++ unsigned int number = 0; +++ float* inputPtr = vecBuffer; +++ const float invScalar = 1.0 / scalar; +++ for (number = 0; number < num_points; number++) { +++ *inputPtr *= invScalar; 
+++ inputPtr++; +++ } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++ #ifdef LV_HAVE_ORC ++ ++-extern void volk_32f_s32f_normalize_a_orc_impl(float* dst, float* src, const float scalar, unsigned int num_points); ++-static inline void volk_32f_s32f_normalize_u_orc(float* vecBuffer, const float scalar, unsigned int num_points){ +++extern void volk_32f_s32f_normalize_a_orc_impl(float* dst, +++ float* src, +++ const float scalar, +++ unsigned int num_points); +++static inline void volk_32f_s32f_normalize_u_orc(float* vecBuffer, +++ const float scalar, +++ unsigned int num_points) +++{ ++ float invscalar = 1.0 / scalar; ++ volk_32f_s32f_normalize_a_orc_impl(vecBuffer, vecBuffer, invscalar, num_points); ++ } ++@@ -169,32 +184,35 @@ static inline void volk_32f_s32f_normalize_u_orc(float* vecBuffer, const float s ++ #ifdef LV_HAVE_AVX ++ #include ++ ++-static inline void volk_32f_s32f_normalize_u_avx(float* vecBuffer, const float scalar, unsigned int num_points){ ++- unsigned int number = 0; ++- float* inputPtr = vecBuffer; +++static inline void volk_32f_s32f_normalize_u_avx(float* vecBuffer, +++ const float scalar, +++ unsigned int num_points) +++{ +++ unsigned int number = 0; +++ float* inputPtr = vecBuffer; ++ ++- const float invScalar = 1.0 / scalar; ++- __m256 vecScalar = _mm256_set1_ps(invScalar); +++ const float invScalar = 1.0 / scalar; +++ __m256 vecScalar = _mm256_set1_ps(invScalar); ++ ++- __m256 input1; +++ __m256 input1; ++ ++- const uint64_t eighthPoints = num_points / 8; ++- for(;number < eighthPoints; number++){ +++ const uint64_t eighthPoints = num_points / 8; +++ for (; number < eighthPoints; number++) { ++ ++- input1 = _mm256_loadu_ps(inputPtr); +++ input1 = _mm256_loadu_ps(inputPtr); ++ ++- input1 = _mm256_mul_ps(input1, vecScalar); +++ input1 = _mm256_mul_ps(input1, vecScalar); ++ ++- _mm256_storeu_ps(inputPtr, input1); +++ _mm256_storeu_ps(inputPtr, input1); ++ ++- inputPtr += 8; ++- } +++ inputPtr += 8; +++ } ++ ++- number = eighthPoints*8; ++- for(; number < num_points; number++){ ++- *inputPtr *= invScalar; ++- inputPtr++; ++- } +++ number = eighthPoints * 8; +++ for (; number < num_points; number++) { +++ *inputPtr *= invScalar; +++ inputPtr++; +++ } ++ } ++ #endif /* LV_HAVE_AVX */ ++ ++diff --git a/kernels/volk/volk_32f_s32f_power_32f.h b/kernels/volk/volk_32f_s32f_power_32f.h ++index 070efdc..9b6fdf4 100644 ++--- a/kernels/volk/volk_32f_s32f_power_32f.h +++++ b/kernels/volk/volk_32f_s32f_power_32f.h ++@@ -30,8 +30,8 @@ ++ * ++ * Dispatcher Prototype ++ * \code ++- * void volk_32f_s32f_power_32f(float* cVector, const float* aVector, const float power, unsigned int num_points) ++- * \endcode +++ * void volk_32f_s32f_power_32f(float* cVector, const float* aVector, const float power, +++ * unsigned int num_points) \endcode ++ * ++ * \b Inputs ++ * \li aVector: The input vector of floats. 
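When libsimdmath is available, the SSE4.1 and SSE paths in the hunks below flip negative bases to positive before calling powf4() and fold the sign back in with a precomputed powf(-1, power). A scalar sketch of the identity they rely on (powf_signed_base is a hypothetical helper name, assuming standard <math.h> powf semantics):

#include <math.h>

/* Sketch of the sign handling used in the SIMD paths below:
 * for x < 0, x^p is evaluated as (-1)^p * (-x)^p.  For non-integer p
 * both powf(-1, p) and powf(x, p) are NaN, so the result matches powf. */
static float powf_signed_base(float x, float p)
{
    if (x < 0.0f)
        return powf(-1.0f, p) * powf(-x, p);
    return powf(x, p);
}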
++@@ -72,8 +72,8 @@ ++ #define INCLUDED_volk_32f_s32f_power_32f_a_H ++ ++ #include ++-#include ++ #include +++#include ++ ++ #ifdef LV_HAVE_SSE4_1 ++ #include ++@@ -82,49 +82,51 @@ ++ #include ++ #endif /* LV_HAVE_LIB_SIMDMATH */ ++ ++-static inline void ++-volk_32f_s32f_power_32f_a_sse4_1(float* cVector, const float* aVector, ++- const float power, unsigned int num_points) +++static inline void volk_32f_s32f_power_32f_a_sse4_1(float* cVector, +++ const float* aVector, +++ const float power, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; +++ unsigned int number = 0; ++ ++- float* cPtr = cVector; ++- const float* aPtr = aVector; +++ float* cPtr = cVector; +++ const float* aPtr = aVector; ++ ++ #ifdef LV_HAVE_LIB_SIMDMATH ++- const unsigned int quarterPoints = num_points / 4; ++- __m128 vPower = _mm_set_ps1(power); ++- __m128 zeroValue = _mm_setzero_ps(); ++- __m128 signMask; ++- __m128 negatedValues; ++- __m128 negativeOneToPower = _mm_set_ps1(powf(-1, power)); ++- __m128 onesMask = _mm_set_ps1(1); +++ const unsigned int quarterPoints = num_points / 4; +++ __m128 vPower = _mm_set_ps1(power); +++ __m128 zeroValue = _mm_setzero_ps(); +++ __m128 signMask; +++ __m128 negatedValues; +++ __m128 negativeOneToPower = _mm_set_ps1(powf(-1, power)); +++ __m128 onesMask = _mm_set_ps1(1); ++ ++- __m128 aVal, cVal; ++- for(;number < quarterPoints; number++){ +++ __m128 aVal, cVal; +++ for (; number < quarterPoints; number++) { ++ ++- aVal = _mm_load_ps(aPtr); ++- signMask = _mm_cmplt_ps(aVal, zeroValue); ++- negatedValues = _mm_sub_ps(zeroValue, aVal); ++- aVal = _mm_blendv_ps(aVal, negatedValues, signMask); +++ aVal = _mm_load_ps(aPtr); +++ signMask = _mm_cmplt_ps(aVal, zeroValue); +++ negatedValues = _mm_sub_ps(zeroValue, aVal); +++ aVal = _mm_blendv_ps(aVal, negatedValues, signMask); ++ ++- // powf4 doesn't support negative values in the base, so we mask them off and then apply the negative after ++- cVal = powf4(aVal, vPower); // Takes each input value to the specified power +++ // powf4 doesn't support negative values in the base, so we mask them off and then +++ // apply the negative after +++ cVal = powf4(aVal, vPower); // Takes each input value to the specified power ++ ++- cVal = _mm_mul_ps( _mm_blendv_ps(onesMask, negativeOneToPower, signMask), cVal); +++ cVal = _mm_mul_ps(_mm_blendv_ps(onesMask, negativeOneToPower, signMask), cVal); ++ ++- _mm_store_ps(cPtr,cVal); // Store the results back into the C container +++ _mm_store_ps(cPtr, cVal); // Store the results back into the C container ++ ++- aPtr += 4; ++- cPtr += 4; ++- } +++ aPtr += 4; +++ cPtr += 4; +++ } ++ ++- number = quarterPoints * 4; +++ number = quarterPoints * 4; ++ #endif /* LV_HAVE_LIB_SIMDMATH */ ++ ++- for(;number < num_points; number++){ ++- *cPtr++ = powf((*aPtr++), power); ++- } +++ for (; number < num_points; number++) { +++ *cPtr++ = powf((*aPtr++), power); +++ } ++ } ++ ++ #endif /* LV_HAVE_SSE4_1 */ ++@@ -137,49 +139,54 @@ volk_32f_s32f_power_32f_a_sse4_1(float* cVector, const float* aVector, ++ #include ++ #endif /* LV_HAVE_LIB_SIMDMATH */ ++ ++-static inline void ++-volk_32f_s32f_power_32f_a_sse(float* cVector, const float* aVector, ++- const float power, unsigned int num_points) +++static inline void volk_32f_s32f_power_32f_a_sse(float* cVector, +++ const float* aVector, +++ const float power, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; +++ unsigned int number = 0; ++ ++- float* cPtr = cVector; ++- const float* aPtr = aVector; +++ float* cPtr = cVector; +++ const float* aPtr = 
aVector; ++ ++ #ifdef LV_HAVE_LIB_SIMDMATH ++- const unsigned int quarterPoints = num_points / 4; ++- __m128 vPower = _mm_set_ps1(power); ++- __m128 zeroValue = _mm_setzero_ps(); ++- __m128 signMask; ++- __m128 negatedValues; ++- __m128 negativeOneToPower = _mm_set_ps1(powf(-1, power)); ++- __m128 onesMask = _mm_set_ps1(1); ++- ++- __m128 aVal, cVal; ++- for(;number < quarterPoints; number++){ ++- ++- aVal = _mm_load_ps(aPtr); ++- signMask = _mm_cmplt_ps(aVal, zeroValue); ++- negatedValues = _mm_sub_ps(zeroValue, aVal); ++- aVal = _mm_or_ps(_mm_andnot_ps(signMask, aVal), _mm_and_ps(signMask, negatedValues) ); ++- ++- // powf4 doesn't support negative values in the base, so we mask them off and then apply the negative after ++- cVal = powf4(aVal, vPower); // Takes each input value to the specified power ++- ++- cVal = _mm_mul_ps( _mm_or_ps( _mm_andnot_ps(signMask, onesMask), _mm_and_ps(signMask, negativeOneToPower) ), cVal); ++- ++- _mm_store_ps(cPtr,cVal); // Store the results back into the C container ++- ++- aPtr += 4; ++- cPtr += 4; ++- } ++- ++- number = quarterPoints * 4; +++ const unsigned int quarterPoints = num_points / 4; +++ __m128 vPower = _mm_set_ps1(power); +++ __m128 zeroValue = _mm_setzero_ps(); +++ __m128 signMask; +++ __m128 negatedValues; +++ __m128 negativeOneToPower = _mm_set_ps1(powf(-1, power)); +++ __m128 onesMask = _mm_set_ps1(1); +++ +++ __m128 aVal, cVal; +++ for (; number < quarterPoints; number++) { +++ +++ aVal = _mm_load_ps(aPtr); +++ signMask = _mm_cmplt_ps(aVal, zeroValue); +++ negatedValues = _mm_sub_ps(zeroValue, aVal); +++ aVal = +++ _mm_or_ps(_mm_andnot_ps(signMask, aVal), _mm_and_ps(signMask, negatedValues)); +++ +++ // powf4 doesn't support negative values in the base, so we mask them off and then +++ // apply the negative after +++ cVal = powf4(aVal, vPower); // Takes each input value to the specified power +++ +++ cVal = _mm_mul_ps(_mm_or_ps(_mm_andnot_ps(signMask, onesMask), +++ _mm_and_ps(signMask, negativeOneToPower)), +++ cVal); +++ +++ _mm_store_ps(cPtr, cVal); // Store the results back into the C container +++ +++ aPtr += 4; +++ cPtr += 4; +++ } +++ +++ number = quarterPoints * 4; ++ #endif /* LV_HAVE_LIB_SIMDMATH */ ++ ++- for(;number < num_points; number++){ ++- *cPtr++ = powf((*aPtr++), power); ++- } +++ for (; number < num_points; number++) { +++ *cPtr++ = powf((*aPtr++), power); +++ } ++ } ++ ++ #endif /* LV_HAVE_SSE */ ++@@ -187,17 +194,18 @@ volk_32f_s32f_power_32f_a_sse(float* cVector, const float* aVector, ++ ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void ++-volk_32f_s32f_power_32f_generic(float* cVector, const float* aVector, ++- const float power, unsigned int num_points) +++static inline void volk_32f_s32f_power_32f_generic(float* cVector, +++ const float* aVector, +++ const float power, +++ unsigned int num_points) ++ { ++- float* cPtr = cVector; ++- const float* aPtr = aVector; ++- unsigned int number = 0; +++ float* cPtr = cVector; +++ const float* aPtr = aVector; +++ unsigned int number = 0; ++ ++- for(number = 0; number < num_points; number++){ ++- *cPtr++ = powf((*aPtr++), power); ++- } +++ for (number = 0; number < num_points; number++) { +++ *cPtr++ = powf((*aPtr++), power); +++ } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++diff --git a/kernels/volk/volk_32f_s32f_s32f_mod_range_32f.h b/kernels/volk/volk_32f_s32f_s32f_mod_range_32f.h ++index 53b4937..d7f23fe 100644 ++--- a/kernels/volk/volk_32f_s32f_s32f_mod_range_32f.h +++++ b/kernels/volk/volk_32f_s32f_s32f_mod_range_32f.h ++@@ -25,8 +25,8 @@ ++ * ++ * Dispatcher 
Prototype ++ * \code ++- * void volk_32f_s32f_s32f_mod_range_32f(float* outputVector, const float* inputVector, const float lower_bound, const float upper_bound, unsigned int num_points) ++- * \endcode +++ * void volk_32f_s32f_s32f_mod_range_32f(float* outputVector, const float* inputVector, +++ * const float lower_bound, const float upper_bound, unsigned int num_points) \endcode ++ * ++ * \b Inputs ++ * \li inputVector: The input vector ++@@ -46,117 +46,129 @@ ++ #ifdef LV_HAVE_AVX ++ #include ++ ++-static inline void volk_32f_s32f_s32f_mod_range_32f_u_avx(float* outputVector, const float* inputVector, const float lower_bound, const float upper_bound, unsigned int num_points){ ++- __m256 lower = _mm256_set1_ps(lower_bound); ++- __m256 upper = _mm256_set1_ps(upper_bound); ++- __m256 distance = _mm256_sub_ps(upper,lower); ++- float dist = upper_bound - lower_bound; ++- __m256 input, output; ++- __m256 is_smaller, is_bigger; ++- __m256 excess, adj; ++- ++- const float *inPtr = inputVector; ++- float *outPtr = outputVector; ++- size_t eight_points = num_points / 8; ++- size_t counter; ++- for(counter = 0; counter < eight_points; counter++) { ++- input = _mm256_loadu_ps(inPtr); ++- // calculate mask: input < lower, input > upper ++- is_smaller = _mm256_cmp_ps(input, lower, _CMP_LT_OQ); //0x11: Less than, ordered, non-signalling ++- is_bigger = _mm256_cmp_ps(input, upper, _CMP_GT_OQ); //0x1e: greater than, ordered, non-signalling ++- // find out how far we are out-of-bound – positive values! ++- excess = _mm256_and_ps(_mm256_sub_ps(lower, input), is_smaller); ++- excess = _mm256_or_ps(_mm256_and_ps(_mm256_sub_ps(input, upper), is_bigger), excess); ++- // how many do we have to add? (int(excess/distance+1)*distance) ++- excess = _mm256_div_ps(excess, distance); ++- // round down ++- excess = _mm256_cvtepi32_ps(_mm256_cvttps_epi32(excess)); ++- // plus 1 ++- adj = _mm256_set1_ps(1.0f); ++- excess = _mm256_add_ps(excess, adj); ++- // get the sign right, adj is still {1.0f,1.0f,1.0f,1.0f} ++- adj = _mm256_and_ps(adj, is_smaller); ++- adj = _mm256_or_ps(_mm256_and_ps(_mm256_set1_ps(-1.0f), is_bigger), adj); ++- // scale by distance, sign ++- excess = _mm256_mul_ps(_mm256_mul_ps(excess, adj), distance); ++- output = _mm256_add_ps(input, excess); ++- _mm256_storeu_ps(outPtr, output); ++- inPtr += 8; ++- outPtr += 8; ++- } ++- ++- size_t cnt; ++- for(cnt = eight_points * 8; cnt < num_points; cnt++){ ++- float val = inputVector[cnt]; ++- if(val < lower_bound){ ++- float excess = lower_bound - val; ++- signed int count = (int)(excess/dist); ++- outputVector[cnt] = val + (count+1)*dist; +++static inline void volk_32f_s32f_s32f_mod_range_32f_u_avx(float* outputVector, +++ const float* inputVector, +++ const float lower_bound, +++ const float upper_bound, +++ unsigned int num_points) +++{ +++ __m256 lower = _mm256_set1_ps(lower_bound); +++ __m256 upper = _mm256_set1_ps(upper_bound); +++ __m256 distance = _mm256_sub_ps(upper, lower); +++ float dist = upper_bound - lower_bound; +++ __m256 input, output; +++ __m256 is_smaller, is_bigger; +++ __m256 excess, adj; +++ +++ const float* inPtr = inputVector; +++ float* outPtr = outputVector; +++ size_t eight_points = num_points / 8; +++ size_t counter; +++ for (counter = 0; counter < eight_points; counter++) { +++ input = _mm256_loadu_ps(inPtr); +++ // calculate mask: input < lower, input > upper +++ is_smaller = _mm256_cmp_ps( +++ input, lower, _CMP_LT_OQ); // 0x11: Less than, ordered, non-signalling +++ is_bigger = _mm256_cmp_ps( +++ input, upper, _CMP_GT_OQ); // 
0x1e: greater than, ordered, non-signalling +++ // find out how far we are out-of-bound – positive values! +++ excess = _mm256_and_ps(_mm256_sub_ps(lower, input), is_smaller); +++ excess = +++ _mm256_or_ps(_mm256_and_ps(_mm256_sub_ps(input, upper), is_bigger), excess); +++ // how many do we have to add? (int(excess/distance+1)*distance) +++ excess = _mm256_div_ps(excess, distance); +++ // round down +++ excess = _mm256_cvtepi32_ps(_mm256_cvttps_epi32(excess)); +++ // plus 1 +++ adj = _mm256_set1_ps(1.0f); +++ excess = _mm256_add_ps(excess, adj); +++ // get the sign right, adj is still {1.0f,1.0f,1.0f,1.0f} +++ adj = _mm256_and_ps(adj, is_smaller); +++ adj = _mm256_or_ps(_mm256_and_ps(_mm256_set1_ps(-1.0f), is_bigger), adj); +++ // scale by distance, sign +++ excess = _mm256_mul_ps(_mm256_mul_ps(excess, adj), distance); +++ output = _mm256_add_ps(input, excess); +++ _mm256_storeu_ps(outPtr, output); +++ inPtr += 8; +++ outPtr += 8; ++ } ++- else if(val > upper_bound){ ++- float excess = val - upper_bound; ++- signed int count = (int)(excess/dist); ++- outputVector[cnt] = val - (count+1)*dist; +++ +++ size_t cnt; +++ for (cnt = eight_points * 8; cnt < num_points; cnt++) { +++ float val = inputVector[cnt]; +++ if (val < lower_bound) { +++ float excess = lower_bound - val; +++ signed int count = (int)(excess / dist); +++ outputVector[cnt] = val + (count + 1) * dist; +++ } else if (val > upper_bound) { +++ float excess = val - upper_bound; +++ signed int count = (int)(excess / dist); +++ outputVector[cnt] = val - (count + 1) * dist; +++ } else +++ outputVector[cnt] = val; ++ } ++- else ++- outputVector[cnt] = val; ++- } ++ } ++-static inline void volk_32f_s32f_s32f_mod_range_32f_a_avx(float* outputVector, const float* inputVector, const float lower_bound, const float upper_bound, unsigned int num_points){ ++- __m256 lower = _mm256_set1_ps(lower_bound); ++- __m256 upper = _mm256_set1_ps(upper_bound); ++- __m256 distance = _mm256_sub_ps(upper,lower); ++- float dist = upper_bound - lower_bound; ++- __m256 input, output; ++- __m256 is_smaller, is_bigger; ++- __m256 excess, adj; ++- ++- const float *inPtr = inputVector; ++- float *outPtr = outputVector; ++- size_t eight_points = num_points / 8; ++- size_t counter; ++- for(counter = 0; counter < eight_points; counter++) { ++- input = _mm256_load_ps(inPtr); ++- // calculate mask: input < lower, input > upper ++- is_smaller = _mm256_cmp_ps(input, lower, _CMP_LT_OQ); //0x11: Less than, ordered, non-signalling ++- is_bigger = _mm256_cmp_ps(input, upper, _CMP_GT_OQ); //0x1e: greater than, ordered, non-signalling ++- // find out how far we are out-of-bound – positive values! ++- excess = _mm256_and_ps(_mm256_sub_ps(lower, input), is_smaller); ++- excess = _mm256_or_ps(_mm256_and_ps(_mm256_sub_ps(input, upper), is_bigger), excess); ++- // how many do we have to add? 
(int(excess/distance+1)*distance) ++- excess = _mm256_div_ps(excess, distance); ++- // round down ++- excess = _mm256_cvtepi32_ps(_mm256_cvttps_epi32(excess)); ++- // plus 1 ++- adj = _mm256_set1_ps(1.0f); ++- excess = _mm256_add_ps(excess, adj); ++- // get the sign right, adj is still {1.0f,1.0f,1.0f,1.0f} ++- adj = _mm256_and_ps(adj, is_smaller); ++- adj = _mm256_or_ps(_mm256_and_ps(_mm256_set1_ps(-1.0f), is_bigger), adj); ++- // scale by distance, sign ++- excess = _mm256_mul_ps(_mm256_mul_ps(excess, adj), distance); ++- output = _mm256_add_ps(input, excess); ++- _mm256_store_ps(outPtr, output); ++- inPtr += 8; ++- outPtr += 8; ++- } ++- ++- size_t cnt; ++- for(cnt = eight_points * 8; cnt < num_points; cnt++){ ++- float val = inputVector[cnt]; ++- if(val < lower_bound){ ++- float excess = lower_bound - val; ++- signed int count = (int)(excess/dist); ++- outputVector[cnt] = val + (count+1)*dist; +++static inline void volk_32f_s32f_s32f_mod_range_32f_a_avx(float* outputVector, +++ const float* inputVector, +++ const float lower_bound, +++ const float upper_bound, +++ unsigned int num_points) +++{ +++ __m256 lower = _mm256_set1_ps(lower_bound); +++ __m256 upper = _mm256_set1_ps(upper_bound); +++ __m256 distance = _mm256_sub_ps(upper, lower); +++ float dist = upper_bound - lower_bound; +++ __m256 input, output; +++ __m256 is_smaller, is_bigger; +++ __m256 excess, adj; +++ +++ const float* inPtr = inputVector; +++ float* outPtr = outputVector; +++ size_t eight_points = num_points / 8; +++ size_t counter; +++ for (counter = 0; counter < eight_points; counter++) { +++ input = _mm256_load_ps(inPtr); +++ // calculate mask: input < lower, input > upper +++ is_smaller = _mm256_cmp_ps( +++ input, lower, _CMP_LT_OQ); // 0x11: Less than, ordered, non-signalling +++ is_bigger = _mm256_cmp_ps( +++ input, upper, _CMP_GT_OQ); // 0x1e: greater than, ordered, non-signalling +++ // find out how far we are out-of-bound – positive values! +++ excess = _mm256_and_ps(_mm256_sub_ps(lower, input), is_smaller); +++ excess = +++ _mm256_or_ps(_mm256_and_ps(_mm256_sub_ps(input, upper), is_bigger), excess); +++ // how many do we have to add? 
(int(excess/distance+1)*distance) +++ excess = _mm256_div_ps(excess, distance); +++ // round down +++ excess = _mm256_cvtepi32_ps(_mm256_cvttps_epi32(excess)); +++ // plus 1 +++ adj = _mm256_set1_ps(1.0f); +++ excess = _mm256_add_ps(excess, adj); +++ // get the sign right, adj is still {1.0f,1.0f,1.0f,1.0f} +++ adj = _mm256_and_ps(adj, is_smaller); +++ adj = _mm256_or_ps(_mm256_and_ps(_mm256_set1_ps(-1.0f), is_bigger), adj); +++ // scale by distance, sign +++ excess = _mm256_mul_ps(_mm256_mul_ps(excess, adj), distance); +++ output = _mm256_add_ps(input, excess); +++ _mm256_store_ps(outPtr, output); +++ inPtr += 8; +++ outPtr += 8; ++ } ++- else if(val > upper_bound){ ++- float excess = val - upper_bound; ++- signed int count = (int)(excess/dist); ++- outputVector[cnt] = val - (count+1)*dist; +++ +++ size_t cnt; +++ for (cnt = eight_points * 8; cnt < num_points; cnt++) { +++ float val = inputVector[cnt]; +++ if (val < lower_bound) { +++ float excess = lower_bound - val; +++ signed int count = (int)(excess / dist); +++ outputVector[cnt] = val + (count + 1) * dist; +++ } else if (val > upper_bound) { +++ float excess = val - upper_bound; +++ signed int count = (int)(excess / dist); +++ outputVector[cnt] = val - (count + 1) * dist; +++ } else +++ outputVector[cnt] = val; ++ } ++- else ++- outputVector[cnt] = val; ++- } ++ } ++ #endif /* LV_HAVE_AVX */ ++ ++@@ -164,268 +176,282 @@ static inline void volk_32f_s32f_s32f_mod_range_32f_a_avx(float* outputVector, c ++ #ifdef LV_HAVE_SSE2 ++ #include ++ ++-static inline void volk_32f_s32f_s32f_mod_range_32f_u_sse2(float* outputVector, const float* inputVector, const float lower_bound, const float upper_bound, unsigned int num_points){ ++- __m128 lower = _mm_set_ps1(lower_bound); ++- __m128 upper = _mm_set_ps1(upper_bound); ++- __m128 distance = _mm_sub_ps(upper,lower); ++- float dist = upper_bound - lower_bound; ++- __m128 input, output; ++- __m128 is_smaller, is_bigger; ++- __m128 excess, adj; ++- ++- const float *inPtr = inputVector; ++- float *outPtr = outputVector; ++- size_t quarter_points = num_points / 4; ++- size_t counter; ++- for(counter = 0; counter < quarter_points; counter++) { ++- input = _mm_load_ps(inPtr); ++- // calculate mask: input < lower, input > upper ++- is_smaller = _mm_cmplt_ps(input, lower); ++- is_bigger = _mm_cmpgt_ps(input, upper); ++- // find out how far we are out-of-bound – positive values! ++- excess = _mm_and_ps(_mm_sub_ps(lower, input), is_smaller); ++- excess = _mm_or_ps(_mm_and_ps(_mm_sub_ps(input, upper), is_bigger), excess); ++- // how many do we have to add? 
(int(excess/distance+1)*distance) ++- excess = _mm_div_ps(excess, distance); ++- // round down ++- excess = _mm_cvtepi32_ps(_mm_cvttps_epi32(excess)); ++- // plus 1 ++- adj = _mm_set_ps1(1.0f); ++- excess = _mm_add_ps(excess, adj); ++- // get the sign right, adj is still {1.0f,1.0f,1.0f,1.0f} ++- adj = _mm_and_ps(adj, is_smaller); ++- adj = _mm_or_ps(_mm_and_ps(_mm_set_ps1(-1.0f), is_bigger), adj); ++- // scale by distance, sign ++- excess = _mm_mul_ps(_mm_mul_ps(excess, adj), distance); ++- output = _mm_add_ps(input, excess); ++- _mm_store_ps(outPtr, output); ++- inPtr += 4; ++- outPtr += 4; ++- } ++- ++- size_t cnt; ++- for(cnt = quarter_points * 4; cnt < num_points; cnt++){ ++- float val = inputVector[cnt]; ++- if(val < lower_bound){ ++- float excess = lower_bound - val; ++- signed int count = (int)(excess/dist); ++- outputVector[cnt] = val + (count+1)*dist; +++static inline void volk_32f_s32f_s32f_mod_range_32f_u_sse2(float* outputVector, +++ const float* inputVector, +++ const float lower_bound, +++ const float upper_bound, +++ unsigned int num_points) +++{ +++ __m128 lower = _mm_set_ps1(lower_bound); +++ __m128 upper = _mm_set_ps1(upper_bound); +++ __m128 distance = _mm_sub_ps(upper, lower); +++ float dist = upper_bound - lower_bound; +++ __m128 input, output; +++ __m128 is_smaller, is_bigger; +++ __m128 excess, adj; +++ +++ const float* inPtr = inputVector; +++ float* outPtr = outputVector; +++ size_t quarter_points = num_points / 4; +++ size_t counter; +++ for (counter = 0; counter < quarter_points; counter++) { +++ input = _mm_load_ps(inPtr); +++ // calculate mask: input < lower, input > upper +++ is_smaller = _mm_cmplt_ps(input, lower); +++ is_bigger = _mm_cmpgt_ps(input, upper); +++ // find out how far we are out-of-bound – positive values! +++ excess = _mm_and_ps(_mm_sub_ps(lower, input), is_smaller); +++ excess = _mm_or_ps(_mm_and_ps(_mm_sub_ps(input, upper), is_bigger), excess); +++ // how many do we have to add? 
(int(excess/distance+1)*distance) +++ excess = _mm_div_ps(excess, distance); +++ // round down +++ excess = _mm_cvtepi32_ps(_mm_cvttps_epi32(excess)); +++ // plus 1 +++ adj = _mm_set_ps1(1.0f); +++ excess = _mm_add_ps(excess, adj); +++ // get the sign right, adj is still {1.0f,1.0f,1.0f,1.0f} +++ adj = _mm_and_ps(adj, is_smaller); +++ adj = _mm_or_ps(_mm_and_ps(_mm_set_ps1(-1.0f), is_bigger), adj); +++ // scale by distance, sign +++ excess = _mm_mul_ps(_mm_mul_ps(excess, adj), distance); +++ output = _mm_add_ps(input, excess); +++ _mm_store_ps(outPtr, output); +++ inPtr += 4; +++ outPtr += 4; ++ } ++- else if(val > upper_bound){ ++- float excess = val - upper_bound; ++- signed int count = (int)(excess/dist); ++- outputVector[cnt] = val - (count+1)*dist; +++ +++ size_t cnt; +++ for (cnt = quarter_points * 4; cnt < num_points; cnt++) { +++ float val = inputVector[cnt]; +++ if (val < lower_bound) { +++ float excess = lower_bound - val; +++ signed int count = (int)(excess / dist); +++ outputVector[cnt] = val + (count + 1) * dist; +++ } else if (val > upper_bound) { +++ float excess = val - upper_bound; +++ signed int count = (int)(excess / dist); +++ outputVector[cnt] = val - (count + 1) * dist; +++ } else +++ outputVector[cnt] = val; ++ } ++- else ++- outputVector[cnt] = val; ++- } ++ } ++-static inline void volk_32f_s32f_s32f_mod_range_32f_a_sse2(float* outputVector, const float* inputVector, const float lower_bound, const float upper_bound, unsigned int num_points){ ++- __m128 lower = _mm_set_ps1(lower_bound); ++- __m128 upper = _mm_set_ps1(upper_bound); ++- __m128 distance = _mm_sub_ps(upper,lower); ++- __m128 input, output; ++- __m128 is_smaller, is_bigger; ++- __m128 excess, adj; ++- ++- const float *inPtr = inputVector; ++- float *outPtr = outputVector; ++- size_t quarter_points = num_points / 4; ++- size_t counter; ++- for(counter = 0; counter < quarter_points; counter++) { ++- input = _mm_load_ps(inPtr); ++- // calculate mask: input < lower, input > upper ++- is_smaller = _mm_cmplt_ps(input, lower); ++- is_bigger = _mm_cmpgt_ps(input, upper); ++- // find out how far we are out-of-bound – positive values! ++- excess = _mm_and_ps(_mm_sub_ps(lower, input), is_smaller); ++- excess = _mm_or_ps(_mm_and_ps(_mm_sub_ps(input, upper), is_bigger), excess); ++- // how many do we have to add? (int(excess/distance+1)*distance) ++- excess = _mm_div_ps(excess, distance); ++- // round down – for some reason, SSE doesn't come with a 4x float -> 4x int32 conversion. 
++- excess = _mm_cvtepi32_ps(_mm_cvttps_epi32(excess)); ++- // plus 1 ++- adj = _mm_set_ps1(1.0f); ++- excess = _mm_add_ps(excess, adj); ++- // get the sign right, adj is still {1.0f,1.0f,1.0f,1.0f} ++- adj = _mm_and_ps(adj, is_smaller); ++- adj = _mm_or_ps(_mm_and_ps(_mm_set_ps1(-1.0f), is_bigger), adj); ++- // scale by distance, sign ++- excess = _mm_mul_ps(_mm_mul_ps(excess, adj), distance); ++- output = _mm_add_ps(input, excess); ++- _mm_store_ps(outPtr, output); ++- inPtr += 4; ++- outPtr += 4; ++- } ++- ++- float dist = upper_bound - lower_bound; ++- size_t cnt; ++- for(cnt = quarter_points * 4; cnt < num_points; cnt++){ ++- float val = inputVector[cnt]; ++- if(val < lower_bound){ ++- float excess = lower_bound - val; ++- signed int count = (int)(excess/dist); ++- outputVector[cnt] = val + (count+1)*dist; +++static inline void volk_32f_s32f_s32f_mod_range_32f_a_sse2(float* outputVector, +++ const float* inputVector, +++ const float lower_bound, +++ const float upper_bound, +++ unsigned int num_points) +++{ +++ __m128 lower = _mm_set_ps1(lower_bound); +++ __m128 upper = _mm_set_ps1(upper_bound); +++ __m128 distance = _mm_sub_ps(upper, lower); +++ __m128 input, output; +++ __m128 is_smaller, is_bigger; +++ __m128 excess, adj; +++ +++ const float* inPtr = inputVector; +++ float* outPtr = outputVector; +++ size_t quarter_points = num_points / 4; +++ size_t counter; +++ for (counter = 0; counter < quarter_points; counter++) { +++ input = _mm_load_ps(inPtr); +++ // calculate mask: input < lower, input > upper +++ is_smaller = _mm_cmplt_ps(input, lower); +++ is_bigger = _mm_cmpgt_ps(input, upper); +++ // find out how far we are out-of-bound – positive values! +++ excess = _mm_and_ps(_mm_sub_ps(lower, input), is_smaller); +++ excess = _mm_or_ps(_mm_and_ps(_mm_sub_ps(input, upper), is_bigger), excess); +++ // how many do we have to add? (int(excess/distance+1)*distance) +++ excess = _mm_div_ps(excess, distance); +++ // round down – for some reason, SSE doesn't come with a 4x float -> 4x int32 +++ // conversion. 
+++ excess = _mm_cvtepi32_ps(_mm_cvttps_epi32(excess)); +++ // plus 1 +++ adj = _mm_set_ps1(1.0f); +++ excess = _mm_add_ps(excess, adj); +++ // get the sign right, adj is still {1.0f,1.0f,1.0f,1.0f} +++ adj = _mm_and_ps(adj, is_smaller); +++ adj = _mm_or_ps(_mm_and_ps(_mm_set_ps1(-1.0f), is_bigger), adj); +++ // scale by distance, sign +++ excess = _mm_mul_ps(_mm_mul_ps(excess, adj), distance); +++ output = _mm_add_ps(input, excess); +++ _mm_store_ps(outPtr, output); +++ inPtr += 4; +++ outPtr += 4; ++ } ++- else if(val > upper_bound){ ++- float excess = val - upper_bound; ++- signed int count = (int)(excess/dist); ++- outputVector[cnt] = val - (count+1)*dist; +++ +++ float dist = upper_bound - lower_bound; +++ size_t cnt; +++ for (cnt = quarter_points * 4; cnt < num_points; cnt++) { +++ float val = inputVector[cnt]; +++ if (val < lower_bound) { +++ float excess = lower_bound - val; +++ signed int count = (int)(excess / dist); +++ outputVector[cnt] = val + (count + 1) * dist; +++ } else if (val > upper_bound) { +++ float excess = val - upper_bound; +++ signed int count = (int)(excess / dist); +++ outputVector[cnt] = val - (count + 1) * dist; +++ } else +++ outputVector[cnt] = val; ++ } ++- else ++- outputVector[cnt] = val; ++- } ++ } ++ #endif /* LV_HAVE_SSE2 */ ++ ++ #ifdef LV_HAVE_SSE ++ #include ++ ++-static inline void volk_32f_s32f_s32f_mod_range_32f_u_sse(float* outputVector, const float* inputVector, const float lower_bound, const float upper_bound, unsigned int num_points){ ++- __m128 lower = _mm_set_ps1(lower_bound); ++- __m128 upper = _mm_set_ps1(upper_bound); ++- __m128 distance = _mm_sub_ps(upper,lower); ++- float dist = upper_bound - lower_bound; ++- __m128 input, output; ++- __m128 is_smaller, is_bigger; ++- __m128 excess, adj; ++- __m128i rounddown; ++- ++- const float *inPtr = inputVector; ++- float *outPtr = outputVector; ++- size_t quarter_points = num_points / 4; ++- size_t counter; ++- for(counter = 0; counter < quarter_points; counter++) { ++- input = _mm_load_ps(inPtr); ++- // calculate mask: input < lower, input > upper ++- is_smaller = _mm_cmplt_ps(input, lower); ++- is_bigger = _mm_cmpgt_ps(input, upper); ++- // find out how far we are out-of-bound – positive values! ++- excess = _mm_and_ps(_mm_sub_ps(lower, input), is_smaller); ++- excess = _mm_or_ps(_mm_and_ps(_mm_sub_ps(input, upper), is_bigger), excess); ++- // how many do we have to add? 
(int(excess/distance+1)*distance) ++- excess = _mm_div_ps(excess, distance); ++- // round down – for some reason ++- rounddown = _mm_cvttps_epi32(excess); ++- excess = _mm_cvtepi32_ps(rounddown); ++- // plus 1 ++- adj = _mm_set_ps1(1.0f); ++- excess = _mm_add_ps(excess, adj); ++- // get the sign right, adj is still {1.0f,1.0f,1.0f,1.0f} ++- adj = _mm_and_ps(adj, is_smaller); ++- adj = _mm_or_ps(_mm_and_ps(_mm_set_ps1(-1.0f), is_bigger), adj); ++- // scale by distance, sign ++- excess = _mm_mul_ps(_mm_mul_ps(excess, adj), distance); ++- output = _mm_add_ps(input, excess); ++- _mm_store_ps(outPtr, output); ++- inPtr += 4; ++- outPtr += 4; ++- } ++- ++- size_t cnt; ++- for(cnt = quarter_points * 4; cnt < num_points; cnt++){ ++- float val = inputVector[cnt]; ++- if(val < lower_bound){ ++- float excess = lower_bound - val; ++- signed int count = (int)(excess/dist); ++- outputVector[cnt] = val + (count+1)*dist; +++static inline void volk_32f_s32f_s32f_mod_range_32f_u_sse(float* outputVector, +++ const float* inputVector, +++ const float lower_bound, +++ const float upper_bound, +++ unsigned int num_points) +++{ +++ __m128 lower = _mm_set_ps1(lower_bound); +++ __m128 upper = _mm_set_ps1(upper_bound); +++ __m128 distance = _mm_sub_ps(upper, lower); +++ float dist = upper_bound - lower_bound; +++ __m128 input, output; +++ __m128 is_smaller, is_bigger; +++ __m128 excess, adj; +++ __m128i rounddown; +++ +++ const float* inPtr = inputVector; +++ float* outPtr = outputVector; +++ size_t quarter_points = num_points / 4; +++ size_t counter; +++ for (counter = 0; counter < quarter_points; counter++) { +++ input = _mm_load_ps(inPtr); +++ // calculate mask: input < lower, input > upper +++ is_smaller = _mm_cmplt_ps(input, lower); +++ is_bigger = _mm_cmpgt_ps(input, upper); +++ // find out how far we are out-of-bound – positive values! +++ excess = _mm_and_ps(_mm_sub_ps(lower, input), is_smaller); +++ excess = _mm_or_ps(_mm_and_ps(_mm_sub_ps(input, upper), is_bigger), excess); +++ // how many do we have to add? 
(int(excess/distance+1)*distance) +++ excess = _mm_div_ps(excess, distance); +++ // round down – for some reason +++ rounddown = _mm_cvttps_epi32(excess); +++ excess = _mm_cvtepi32_ps(rounddown); +++ // plus 1 +++ adj = _mm_set_ps1(1.0f); +++ excess = _mm_add_ps(excess, adj); +++ // get the sign right, adj is still {1.0f,1.0f,1.0f,1.0f} +++ adj = _mm_and_ps(adj, is_smaller); +++ adj = _mm_or_ps(_mm_and_ps(_mm_set_ps1(-1.0f), is_bigger), adj); +++ // scale by distance, sign +++ excess = _mm_mul_ps(_mm_mul_ps(excess, adj), distance); +++ output = _mm_add_ps(input, excess); +++ _mm_store_ps(outPtr, output); +++ inPtr += 4; +++ outPtr += 4; ++ } ++- else if(val > upper_bound){ ++- float excess = val - upper_bound; ++- signed int count = (int)(excess/dist); ++- outputVector[cnt] = val - (count+1)*dist; +++ +++ size_t cnt; +++ for (cnt = quarter_points * 4; cnt < num_points; cnt++) { +++ float val = inputVector[cnt]; +++ if (val < lower_bound) { +++ float excess = lower_bound - val; +++ signed int count = (int)(excess / dist); +++ outputVector[cnt] = val + (count + 1) * dist; +++ } else if (val > upper_bound) { +++ float excess = val - upper_bound; +++ signed int count = (int)(excess / dist); +++ outputVector[cnt] = val - (count + 1) * dist; +++ } else +++ outputVector[cnt] = val; ++ } ++- else ++- outputVector[cnt] = val; ++- } ++ } ++-static inline void volk_32f_s32f_s32f_mod_range_32f_a_sse(float* outputVector, const float* inputVector, const float lower_bound, const float upper_bound, unsigned int num_points){ ++- __m128 lower = _mm_set_ps1(lower_bound); ++- __m128 upper = _mm_set_ps1(upper_bound); ++- __m128 distance = _mm_sub_ps(upper,lower); ++- __m128 input, output; ++- __m128 is_smaller, is_bigger; ++- __m128 excess, adj; ++- __m128i rounddown; ++- ++- const float *inPtr = inputVector; ++- float *outPtr = outputVector; ++- size_t quarter_points = num_points / 4; ++- size_t counter; ++- for(counter = 0; counter < quarter_points; counter++) { ++- input = _mm_load_ps(inPtr); ++- // calculate mask: input < lower, input > upper ++- is_smaller = _mm_cmplt_ps(input, lower); ++- is_bigger = _mm_cmpgt_ps(input, upper); ++- // find out how far we are out-of-bound – positive values! ++- excess = _mm_and_ps(_mm_sub_ps(lower, input), is_smaller); ++- excess = _mm_or_ps(_mm_and_ps(_mm_sub_ps(input, upper), is_bigger), excess); ++- // how many do we have to add? 
(int(excess/distance+1)*distance) ++- excess = _mm_div_ps(excess, distance); ++- // round down ++- rounddown = _mm_cvttps_epi32(excess); ++- excess = _mm_cvtepi32_ps(rounddown); ++- // plus 1 ++- adj = _mm_set_ps1(1.0f); ++- excess = _mm_add_ps(excess, adj); ++- // get the sign right, adj is still {1.0f,1.0f,1.0f,1.0f} ++- adj = _mm_and_ps(adj, is_smaller); ++- adj = _mm_or_ps(_mm_and_ps(_mm_set_ps1(-1.0f), is_bigger), adj); ++- // scale by distance, sign ++- excess = _mm_mul_ps(_mm_mul_ps(excess, adj), distance); ++- output = _mm_add_ps(input, excess); ++- _mm_store_ps(outPtr, output); ++- inPtr += 4; ++- outPtr += 4; ++- } ++- ++- float dist = upper_bound - lower_bound; ++- size_t cnt; ++- for(cnt = quarter_points * 4; cnt < num_points; cnt++){ ++- float val = inputVector[cnt]; ++- if(val < lower_bound){ ++- float excess = lower_bound - val; ++- signed int count = (int)(excess/dist); ++- outputVector[cnt] = val + (count+1)*dist; +++static inline void volk_32f_s32f_s32f_mod_range_32f_a_sse(float* outputVector, +++ const float* inputVector, +++ const float lower_bound, +++ const float upper_bound, +++ unsigned int num_points) +++{ +++ __m128 lower = _mm_set_ps1(lower_bound); +++ __m128 upper = _mm_set_ps1(upper_bound); +++ __m128 distance = _mm_sub_ps(upper, lower); +++ __m128 input, output; +++ __m128 is_smaller, is_bigger; +++ __m128 excess, adj; +++ __m128i rounddown; +++ +++ const float* inPtr = inputVector; +++ float* outPtr = outputVector; +++ size_t quarter_points = num_points / 4; +++ size_t counter; +++ for (counter = 0; counter < quarter_points; counter++) { +++ input = _mm_load_ps(inPtr); +++ // calculate mask: input < lower, input > upper +++ is_smaller = _mm_cmplt_ps(input, lower); +++ is_bigger = _mm_cmpgt_ps(input, upper); +++ // find out how far we are out-of-bound – positive values! +++ excess = _mm_and_ps(_mm_sub_ps(lower, input), is_smaller); +++ excess = _mm_or_ps(_mm_and_ps(_mm_sub_ps(input, upper), is_bigger), excess); +++ // how many do we have to add? 
(int(excess/distance+1)*distance) +++ excess = _mm_div_ps(excess, distance); +++ // round down +++ rounddown = _mm_cvttps_epi32(excess); +++ excess = _mm_cvtepi32_ps(rounddown); +++ // plus 1 +++ adj = _mm_set_ps1(1.0f); +++ excess = _mm_add_ps(excess, adj); +++ // get the sign right, adj is still {1.0f,1.0f,1.0f,1.0f} +++ adj = _mm_and_ps(adj, is_smaller); +++ adj = _mm_or_ps(_mm_and_ps(_mm_set_ps1(-1.0f), is_bigger), adj); +++ // scale by distance, sign +++ excess = _mm_mul_ps(_mm_mul_ps(excess, adj), distance); +++ output = _mm_add_ps(input, excess); +++ _mm_store_ps(outPtr, output); +++ inPtr += 4; +++ outPtr += 4; ++ } ++- else if(val > upper_bound){ ++- float excess = val - upper_bound; ++- signed int count = (int)(excess/dist); ++- outputVector[cnt] = val - (count+1)*dist; +++ +++ float dist = upper_bound - lower_bound; +++ size_t cnt; +++ for (cnt = quarter_points * 4; cnt < num_points; cnt++) { +++ float val = inputVector[cnt]; +++ if (val < lower_bound) { +++ float excess = lower_bound - val; +++ signed int count = (int)(excess / dist); +++ outputVector[cnt] = val + (count + 1) * dist; +++ } else if (val > upper_bound) { +++ float excess = val - upper_bound; +++ signed int count = (int)(excess / dist); +++ outputVector[cnt] = val - (count + 1) * dist; +++ } else +++ outputVector[cnt] = val; ++ } ++- else ++- outputVector[cnt] = val; ++- } ++ } ++ #endif /* LV_HAVE_SSE */ ++ ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void volk_32f_s32f_s32f_mod_range_32f_generic(float* outputVector, const float* inputVector, const float lower_bound, const float upper_bound, unsigned int num_points){ ++- float* outPtr = outputVector; ++- const float *inPtr; ++- float distance = upper_bound - lower_bound; ++- ++- for(inPtr = inputVector; inPtr < inputVector + num_points; inPtr++){ ++- float val = *inPtr; ++- if(val < lower_bound){ ++- float excess = lower_bound - val; ++- signed int count = (int)(excess/distance); ++- *outPtr = val + (count+1)*distance; ++- } ++- else if(val > upper_bound){ ++- float excess = val - upper_bound; ++- signed int count = (int)(excess/distance); ++- *outPtr = val - (count+1)*distance; +++static inline void volk_32f_s32f_s32f_mod_range_32f_generic(float* outputVector, +++ const float* inputVector, +++ const float lower_bound, +++ const float upper_bound, +++ unsigned int num_points) +++{ +++ float* outPtr = outputVector; +++ const float* inPtr; +++ float distance = upper_bound - lower_bound; +++ +++ for (inPtr = inputVector; inPtr < inputVector + num_points; inPtr++) { +++ float val = *inPtr; +++ if (val < lower_bound) { +++ float excess = lower_bound - val; +++ signed int count = (int)(excess / distance); +++ *outPtr = val + (count + 1) * distance; +++ } else if (val > upper_bound) { +++ float excess = val - upper_bound; +++ signed int count = (int)(excess / distance); +++ *outPtr = val - (count + 1) * distance; +++ } else +++ *outPtr = val; +++ outPtr++; ++ } ++- else ++- *outPtr = val; ++- outPtr++; ++- } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++ ++- ++- ++ #endif /* INCLUDED_VOLK_32F_S32F_S32F_MOD_RANGE_32F_A_H */ ++diff --git a/kernels/volk/volk_32f_s32f_stddev_32f.h b/kernels/volk/volk_32f_s32f_stddev_32f.h ++index 4f3dc1c..0a1c32b 100644 ++--- a/kernels/volk/volk_32f_s32f_stddev_32f.h +++++ b/kernels/volk/volk_32f_s32f_stddev_32f.h ++@@ -29,8 +29,8 @@ ++ * ++ * Dispatcher Prototype ++ * \code ++- * void volk_32f_s32f_stddev_32f(float* stddev, const float* inputBuffer, const float mean, unsigned int num_points) ++- * \endcode +++ * void 
volk_32f_s32f_stddev_32f(float* stddev, const float* inputBuffer, const float +++ * mean, unsigned int num_points) \endcode ++ * ++ * \b Inputs ++ * \li inputBuffer: The input vector of floats. ++@@ -68,65 +68,72 @@ ++ #ifndef INCLUDED_volk_32f_s32f_stddev_32f_a_H ++ #define INCLUDED_volk_32f_s32f_stddev_32f_a_H ++ ++-#include ++ #include ++-#include ++ #include +++#include +++#include ++ ++ #ifdef LV_HAVE_SSE4_1 ++ #include ++ ++-static inline void ++-volk_32f_s32f_stddev_32f_a_sse4_1(float* stddev, const float* inputBuffer, ++- const float mean, unsigned int num_points) +++static inline void volk_32f_s32f_stddev_32f_a_sse4_1(float* stddev, +++ const float* inputBuffer, +++ const float mean, +++ unsigned int num_points) ++ { ++- float returnValue = 0; ++- if(num_points > 0){ ++- unsigned int number = 0; ++- const unsigned int sixteenthPoints = num_points / 16; ++- ++- const float* aPtr = inputBuffer; ++- ++- __VOLK_ATTR_ALIGNED(16) float squareBuffer[4]; ++- ++- __m128 squareAccumulator = _mm_setzero_ps(); ++- __m128 aVal1, aVal2, aVal3, aVal4; ++- __m128 cVal1, cVal2, cVal3, cVal4; ++- for(;number < sixteenthPoints; number++) { ++- aVal1 = _mm_load_ps(aPtr); aPtr += 4; ++- cVal1 = _mm_dp_ps(aVal1, aVal1, 0xF1); ++- ++- aVal2 = _mm_load_ps(aPtr); aPtr += 4; ++- cVal2 = _mm_dp_ps(aVal2, aVal2, 0xF2); ++- ++- aVal3 = _mm_load_ps(aPtr); aPtr += 4; ++- cVal3 = _mm_dp_ps(aVal3, aVal3, 0xF4); ++- ++- aVal4 = _mm_load_ps(aPtr); aPtr += 4; ++- cVal4 = _mm_dp_ps(aVal4, aVal4, 0xF8); ++- ++- cVal1 = _mm_or_ps(cVal1, cVal2); ++- cVal3 = _mm_or_ps(cVal3, cVal4); ++- cVal1 = _mm_or_ps(cVal1, cVal3); ++- ++- squareAccumulator = _mm_add_ps(squareAccumulator, cVal1); // squareAccumulator += x^2 +++ float returnValue = 0; +++ if (num_points > 0) { +++ unsigned int number = 0; +++ const unsigned int sixteenthPoints = num_points / 16; +++ +++ const float* aPtr = inputBuffer; +++ +++ __VOLK_ATTR_ALIGNED(16) float squareBuffer[4]; +++ +++ __m128 squareAccumulator = _mm_setzero_ps(); +++ __m128 aVal1, aVal2, aVal3, aVal4; +++ __m128 cVal1, cVal2, cVal3, cVal4; +++ for (; number < sixteenthPoints; number++) { +++ aVal1 = _mm_load_ps(aPtr); +++ aPtr += 4; +++ cVal1 = _mm_dp_ps(aVal1, aVal1, 0xF1); +++ +++ aVal2 = _mm_load_ps(aPtr); +++ aPtr += 4; +++ cVal2 = _mm_dp_ps(aVal2, aVal2, 0xF2); +++ +++ aVal3 = _mm_load_ps(aPtr); +++ aPtr += 4; +++ cVal3 = _mm_dp_ps(aVal3, aVal3, 0xF4); +++ +++ aVal4 = _mm_load_ps(aPtr); +++ aPtr += 4; +++ cVal4 = _mm_dp_ps(aVal4, aVal4, 0xF8); +++ +++ cVal1 = _mm_or_ps(cVal1, cVal2); +++ cVal3 = _mm_or_ps(cVal3, cVal4); +++ cVal1 = _mm_or_ps(cVal1, cVal3); +++ +++ squareAccumulator = +++ _mm_add_ps(squareAccumulator, cVal1); // squareAccumulator += x^2 +++ } +++ _mm_store_ps(squareBuffer, +++ squareAccumulator); // Store the results back into the C container +++ returnValue = squareBuffer[0]; +++ returnValue += squareBuffer[1]; +++ returnValue += squareBuffer[2]; +++ returnValue += squareBuffer[3]; +++ +++ number = sixteenthPoints * 16; +++ for (; number < num_points; number++) { +++ returnValue += (*aPtr) * (*aPtr); +++ aPtr++; +++ } +++ returnValue /= num_points; +++ returnValue -= (mean * mean); +++ returnValue = sqrtf(returnValue); ++ } ++- _mm_store_ps(squareBuffer,squareAccumulator); // Store the results back into the C container ++- returnValue = squareBuffer[0]; ++- returnValue += squareBuffer[1]; ++- returnValue += squareBuffer[2]; ++- returnValue += squareBuffer[3]; ++- ++- number = sixteenthPoints * 16; ++- for(;number < num_points; number++){ ++- returnValue += (*aPtr) * 
(*aPtr); ++- aPtr++; ++- } ++- returnValue /= num_points; ++- returnValue -= (mean * mean); ++- returnValue = sqrtf(returnValue); ++- } ++- *stddev = returnValue; +++ *stddev = returnValue; ++ } ++ ++ #endif /* LV_HAVE_SSE4_1 */ ++@@ -134,43 +141,45 @@ volk_32f_s32f_stddev_32f_a_sse4_1(float* stddev, const float* inputBuffer, ++ #ifdef LV_HAVE_SSE ++ #include ++ ++-static inline void ++-volk_32f_s32f_stddev_32f_a_sse(float* stddev, const float* inputBuffer, ++- const float mean, unsigned int num_points) +++static inline void volk_32f_s32f_stddev_32f_a_sse(float* stddev, +++ const float* inputBuffer, +++ const float mean, +++ unsigned int num_points) ++ { ++- float returnValue = 0; ++- if(num_points > 0){ ++- unsigned int number = 0; ++- const unsigned int quarterPoints = num_points / 4; ++- ++- const float* aPtr = inputBuffer; ++- ++- __VOLK_ATTR_ALIGNED(16) float squareBuffer[4]; ++- ++- __m128 squareAccumulator = _mm_setzero_ps(); ++- __m128 aVal = _mm_setzero_ps(); ++- for(;number < quarterPoints; number++) { ++- aVal = _mm_load_ps(aPtr); // aVal = x ++- aVal = _mm_mul_ps(aVal, aVal); // squareAccumulator += x^2 ++- squareAccumulator = _mm_add_ps(squareAccumulator, aVal); ++- aPtr += 4; ++- } ++- _mm_store_ps(squareBuffer,squareAccumulator); // Store the results back into the C container ++- returnValue = squareBuffer[0]; ++- returnValue += squareBuffer[1]; ++- returnValue += squareBuffer[2]; ++- returnValue += squareBuffer[3]; ++- ++- number = quarterPoints * 4; ++- for(;number < num_points; number++){ ++- returnValue += (*aPtr) * (*aPtr); ++- aPtr++; +++ float returnValue = 0; +++ if (num_points > 0) { +++ unsigned int number = 0; +++ const unsigned int quarterPoints = num_points / 4; +++ +++ const float* aPtr = inputBuffer; +++ +++ __VOLK_ATTR_ALIGNED(16) float squareBuffer[4]; +++ +++ __m128 squareAccumulator = _mm_setzero_ps(); +++ __m128 aVal = _mm_setzero_ps(); +++ for (; number < quarterPoints; number++) { +++ aVal = _mm_load_ps(aPtr); // aVal = x +++ aVal = _mm_mul_ps(aVal, aVal); // squareAccumulator += x^2 +++ squareAccumulator = _mm_add_ps(squareAccumulator, aVal); +++ aPtr += 4; +++ } +++ _mm_store_ps(squareBuffer, +++ squareAccumulator); // Store the results back into the C container +++ returnValue = squareBuffer[0]; +++ returnValue += squareBuffer[1]; +++ returnValue += squareBuffer[2]; +++ returnValue += squareBuffer[3]; +++ +++ number = quarterPoints * 4; +++ for (; number < num_points; number++) { +++ returnValue += (*aPtr) * (*aPtr); +++ aPtr++; +++ } +++ returnValue /= num_points; +++ returnValue -= (mean * mean); +++ returnValue = sqrtf(returnValue); ++ } ++- returnValue /= num_points; ++- returnValue -= (mean * mean); ++- returnValue = sqrtf(returnValue); ++- } ++- *stddev = returnValue; +++ *stddev = returnValue; ++ } ++ #endif /* LV_HAVE_SSE */ ++ ++@@ -178,86 +187,93 @@ volk_32f_s32f_stddev_32f_a_sse(float* stddev, const float* inputBuffer, ++ #ifdef LV_HAVE_AVX ++ #include ++ ++-static inline void ++-volk_32f_s32f_stddev_32f_a_avx(float* stddev, const float* inputBuffer, ++- const float mean, unsigned int num_points) +++static inline void volk_32f_s32f_stddev_32f_a_avx(float* stddev, +++ const float* inputBuffer, +++ const float mean, +++ unsigned int num_points) ++ { ++- float stdDev = 0; ++- if(num_points > 0){ ++- unsigned int number = 0; ++- const unsigned int thirtySecondthPoints = num_points / 32; ++- ++- const float* aPtr = inputBuffer; ++- __VOLK_ATTR_ALIGNED(32) float squareBuffer[8]; ++- ++- __m256 squareAccumulator = _mm256_setzero_ps(); ++- __m256 
aVal1, aVal2, aVal3, aVal4; ++- __m256 cVal1, cVal2, cVal3, cVal4; ++- for(;number < thirtySecondthPoints; number++) { ++- aVal1 = _mm256_load_ps(aPtr); aPtr += 8; ++- cVal1 = _mm256_dp_ps(aVal1, aVal1, 0xF1); ++- ++- aVal2 = _mm256_load_ps(aPtr); aPtr += 8; ++- cVal2 = _mm256_dp_ps(aVal2, aVal2, 0xF2); ++- ++- aVal3 = _mm256_load_ps(aPtr); aPtr += 8; ++- cVal3 = _mm256_dp_ps(aVal3, aVal3, 0xF4); ++- ++- aVal4 = _mm256_load_ps(aPtr); aPtr += 8; ++- cVal4 = _mm256_dp_ps(aVal4, aVal4, 0xF8); ++- ++- cVal1 = _mm256_or_ps(cVal1, cVal2); ++- cVal3 = _mm256_or_ps(cVal3, cVal4); ++- cVal1 = _mm256_or_ps(cVal1, cVal3); ++- ++- squareAccumulator = _mm256_add_ps(squareAccumulator, cVal1); // squareAccumulator += x^2 +++ float stdDev = 0; +++ if (num_points > 0) { +++ unsigned int number = 0; +++ const unsigned int thirtySecondthPoints = num_points / 32; +++ +++ const float* aPtr = inputBuffer; +++ __VOLK_ATTR_ALIGNED(32) float squareBuffer[8]; +++ +++ __m256 squareAccumulator = _mm256_setzero_ps(); +++ __m256 aVal1, aVal2, aVal3, aVal4; +++ __m256 cVal1, cVal2, cVal3, cVal4; +++ for (; number < thirtySecondthPoints; number++) { +++ aVal1 = _mm256_load_ps(aPtr); +++ aPtr += 8; +++ cVal1 = _mm256_dp_ps(aVal1, aVal1, 0xF1); +++ +++ aVal2 = _mm256_load_ps(aPtr); +++ aPtr += 8; +++ cVal2 = _mm256_dp_ps(aVal2, aVal2, 0xF2); +++ +++ aVal3 = _mm256_load_ps(aPtr); +++ aPtr += 8; +++ cVal3 = _mm256_dp_ps(aVal3, aVal3, 0xF4); +++ +++ aVal4 = _mm256_load_ps(aPtr); +++ aPtr += 8; +++ cVal4 = _mm256_dp_ps(aVal4, aVal4, 0xF8); +++ +++ cVal1 = _mm256_or_ps(cVal1, cVal2); +++ cVal3 = _mm256_or_ps(cVal3, cVal4); +++ cVal1 = _mm256_or_ps(cVal1, cVal3); +++ +++ squareAccumulator = +++ _mm256_add_ps(squareAccumulator, cVal1); // squareAccumulator += x^2 +++ } +++ _mm256_store_ps(squareBuffer, +++ squareAccumulator); // Store the results back into the C container +++ stdDev = squareBuffer[0]; +++ stdDev += squareBuffer[1]; +++ stdDev += squareBuffer[2]; +++ stdDev += squareBuffer[3]; +++ stdDev += squareBuffer[4]; +++ stdDev += squareBuffer[5]; +++ stdDev += squareBuffer[6]; +++ stdDev += squareBuffer[7]; +++ +++ number = thirtySecondthPoints * 32; +++ for (; number < num_points; number++) { +++ stdDev += (*aPtr) * (*aPtr); +++ aPtr++; +++ } +++ stdDev /= num_points; +++ stdDev -= (mean * mean); +++ stdDev = sqrtf(stdDev); ++ } ++- _mm256_store_ps(squareBuffer,squareAccumulator); // Store the results back into the C container ++- stdDev = squareBuffer[0]; ++- stdDev += squareBuffer[1]; ++- stdDev += squareBuffer[2]; ++- stdDev += squareBuffer[3]; ++- stdDev += squareBuffer[4]; ++- stdDev += squareBuffer[5]; ++- stdDev += squareBuffer[6]; ++- stdDev += squareBuffer[7]; ++- ++- number = thirtySecondthPoints * 32; ++- for(;number < num_points; number++){ ++- stdDev += (*aPtr) * (*aPtr); ++- aPtr++; ++- } ++- stdDev /= num_points; ++- stdDev -= (mean * mean); ++- stdDev = sqrtf(stdDev); ++- } ++- *stddev = stdDev; ++- +++ *stddev = stdDev; ++ } ++ #endif /* LV_HAVE_AVX */ ++ ++ ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void ++-volk_32f_s32f_stddev_32f_generic(float* stddev, const float* inputBuffer, ++- const float mean, unsigned int num_points) +++static inline void volk_32f_s32f_stddev_32f_generic(float* stddev, +++ const float* inputBuffer, +++ const float mean, +++ unsigned int num_points) ++ { ++- float returnValue = 0; ++- if(num_points > 0){ ++- const float* aPtr = inputBuffer; ++- unsigned int number = 0; ++- ++- for(number = 0; number < num_points; number++){ ++- returnValue += (*aPtr) * (*aPtr); ++- aPtr++; +++ 
float returnValue = 0; +++ if (num_points > 0) { +++ const float* aPtr = inputBuffer; +++ unsigned int number = 0; +++ +++ for (number = 0; number < num_points; number++) { +++ returnValue += (*aPtr) * (*aPtr); +++ aPtr++; +++ } +++ +++ returnValue /= num_points; +++ returnValue -= (mean * mean); +++ returnValue = sqrtf(returnValue); ++ } ++- ++- returnValue /= num_points; ++- returnValue -= (mean * mean); ++- returnValue = sqrtf(returnValue); ++- } ++- *stddev = returnValue; +++ *stddev = returnValue; ++ } ++ ++ #endif /* LV_HAVE_GENERIC */ ++@@ -268,69 +284,76 @@ volk_32f_s32f_stddev_32f_generic(float* stddev, const float* inputBuffer, ++ #ifndef INCLUDED_volk_32f_s32f_stddev_32f_u_H ++ #define INCLUDED_volk_32f_s32f_stddev_32f_u_H ++ ++-#include ++ #include ++-#include ++ #include +++#include +++#include ++ ++ #ifdef LV_HAVE_AVX ++ #include ++ ++-static inline void ++-volk_32f_s32f_stddev_32f_u_avx(float* stddev, const float* inputBuffer, ++- const float mean, unsigned int num_points) +++static inline void volk_32f_s32f_stddev_32f_u_avx(float* stddev, +++ const float* inputBuffer, +++ const float mean, +++ unsigned int num_points) ++ { ++- float stdDev = 0; ++- if(num_points > 0){ ++- unsigned int number = 0; ++- const unsigned int thirtySecondthPoints = num_points / 32; ++- ++- const float* aPtr = inputBuffer; ++- __VOLK_ATTR_ALIGNED(32) float squareBuffer[8]; ++- ++- __m256 squareAccumulator = _mm256_setzero_ps(); ++- __m256 aVal1, aVal2, aVal3, aVal4; ++- __m256 cVal1, cVal2, cVal3, cVal4; ++- for(;number < thirtySecondthPoints; number++) { ++- aVal1 = _mm256_loadu_ps(aPtr); aPtr += 8; ++- cVal1 = _mm256_dp_ps(aVal1, aVal1, 0xF1); ++- ++- aVal2 = _mm256_loadu_ps(aPtr); aPtr += 8; ++- cVal2 = _mm256_dp_ps(aVal2, aVal2, 0xF2); ++- ++- aVal3 = _mm256_loadu_ps(aPtr); aPtr += 8; ++- cVal3 = _mm256_dp_ps(aVal3, aVal3, 0xF4); ++- ++- aVal4 = _mm256_loadu_ps(aPtr); aPtr += 8; ++- cVal4 = _mm256_dp_ps(aVal4, aVal4, 0xF8); ++- ++- cVal1 = _mm256_or_ps(cVal1, cVal2); ++- cVal3 = _mm256_or_ps(cVal3, cVal4); ++- cVal1 = _mm256_or_ps(cVal1, cVal3); ++- ++- squareAccumulator = _mm256_add_ps(squareAccumulator, cVal1); // squareAccumulator += x^2 +++ float stdDev = 0; +++ if (num_points > 0) { +++ unsigned int number = 0; +++ const unsigned int thirtySecondthPoints = num_points / 32; +++ +++ const float* aPtr = inputBuffer; +++ __VOLK_ATTR_ALIGNED(32) float squareBuffer[8]; +++ +++ __m256 squareAccumulator = _mm256_setzero_ps(); +++ __m256 aVal1, aVal2, aVal3, aVal4; +++ __m256 cVal1, cVal2, cVal3, cVal4; +++ for (; number < thirtySecondthPoints; number++) { +++ aVal1 = _mm256_loadu_ps(aPtr); +++ aPtr += 8; +++ cVal1 = _mm256_dp_ps(aVal1, aVal1, 0xF1); +++ +++ aVal2 = _mm256_loadu_ps(aPtr); +++ aPtr += 8; +++ cVal2 = _mm256_dp_ps(aVal2, aVal2, 0xF2); +++ +++ aVal3 = _mm256_loadu_ps(aPtr); +++ aPtr += 8; +++ cVal3 = _mm256_dp_ps(aVal3, aVal3, 0xF4); +++ +++ aVal4 = _mm256_loadu_ps(aPtr); +++ aPtr += 8; +++ cVal4 = _mm256_dp_ps(aVal4, aVal4, 0xF8); +++ +++ cVal1 = _mm256_or_ps(cVal1, cVal2); +++ cVal3 = _mm256_or_ps(cVal3, cVal4); +++ cVal1 = _mm256_or_ps(cVal1, cVal3); +++ +++ squareAccumulator = +++ _mm256_add_ps(squareAccumulator, cVal1); // squareAccumulator += x^2 +++ } +++ _mm256_storeu_ps( +++ squareBuffer, +++ squareAccumulator); // Store the results back into the C container +++ stdDev = squareBuffer[0]; +++ stdDev += squareBuffer[1]; +++ stdDev += squareBuffer[2]; +++ stdDev += squareBuffer[3]; +++ stdDev += squareBuffer[4]; +++ stdDev += squareBuffer[5]; +++ stdDev += squareBuffer[6]; +++ 
stdDev += squareBuffer[7]; +++ +++ number = thirtySecondthPoints * 32; +++ for (; number < num_points; number++) { +++ stdDev += (*aPtr) * (*aPtr); +++ aPtr++; +++ } +++ stdDev /= num_points; +++ stdDev -= (mean * mean); +++ stdDev = sqrtf(stdDev); ++ } ++- _mm256_storeu_ps(squareBuffer,squareAccumulator); // Store the results back into the C container ++- stdDev = squareBuffer[0]; ++- stdDev += squareBuffer[1]; ++- stdDev += squareBuffer[2]; ++- stdDev += squareBuffer[3]; ++- stdDev += squareBuffer[4]; ++- stdDev += squareBuffer[5]; ++- stdDev += squareBuffer[6]; ++- stdDev += squareBuffer[7]; ++- ++- number = thirtySecondthPoints * 32; ++- for(;number < num_points; number++){ ++- stdDev += (*aPtr) * (*aPtr); ++- aPtr++; ++- } ++- stdDev /= num_points; ++- stdDev -= (mean * mean); ++- stdDev = sqrtf(stdDev); ++- } ++- *stddev = stdDev; ++- +++ *stddev = stdDev; ++ } ++ #endif /* LV_HAVE_AVX */ ++ ++diff --git a/kernels/volk/volk_32f_sin_32f.h b/kernels/volk/volk_32f_sin_32f.h ++index 3780086..e65f25a 100644 ++--- a/kernels/volk/volk_32f_sin_32f.h +++++ b/kernels/volk/volk_32f_sin_32f.h ++@@ -69,9 +69,9 @@ ++ * \endcode ++ */ ++ ++-#include ++-#include ++ #include +++#include +++#include ++ ++ #ifndef INCLUDED_volk_32f_sin_32f_a_H ++ #define INCLUDED_volk_32f_sin_32f_a_H ++@@ -83,72 +83,93 @@ ++ static inline void ++ volk_32f_sin_32f_a_avx2_fma(float* bVector, const float* aVector, unsigned int num_points) ++ { ++- float* bPtr = bVector; ++- const float* aPtr = aVector; ++- ++- unsigned int number = 0; ++- unsigned int eighthPoints = num_points / 8; ++- unsigned int i = 0; ++- ++- __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, fzeroes; ++- __m256 sine, cosine, condition1, condition2; ++- __m256i q, r, ones, twos, fours; ++- ++- m4pi = _mm256_set1_ps(1.273239545); ++- pio4A = _mm256_set1_ps(0.78515625); ++- pio4B = _mm256_set1_ps(0.241876e-3); ++- ffours = _mm256_set1_ps(4.0); ++- ftwos = _mm256_set1_ps(2.0); ++- fones = _mm256_set1_ps(1.0); ++- fzeroes = _mm256_setzero_ps(); ++- ones = _mm256_set1_epi32(1); ++- twos = _mm256_set1_epi32(2); ++- fours = _mm256_set1_epi32(4); ++- ++- cp1 = _mm256_set1_ps(1.0); ++- cp2 = _mm256_set1_ps(0.83333333e-1); ++- cp3 = _mm256_set1_ps(0.2777778e-2); ++- cp4 = _mm256_set1_ps(0.49603e-4); ++- cp5 = _mm256_set1_ps(0.551e-6); ++- ++- for(;number < eighthPoints; number++) { ++- aVal = _mm256_load_ps(aPtr); ++- s = _mm256_sub_ps(aVal, _mm256_and_ps(_mm256_mul_ps(aVal, ftwos), _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS))); ++- q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi))); ++- r = _mm256_add_epi32(q, _mm256_and_si256(q, ones)); ++- ++- s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4A, s); ++- s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4B, s); ++- ++- s = _mm256_div_ps(s, _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction ++- s = _mm256_mul_ps(s, s); ++- // Evaluate Taylor series ++- s = _mm256_mul_ps(_mm256_fmadd_ps(_mm256_fmsub_ps(_mm256_fmadd_ps(_mm256_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2), s, cp1), s); ++- ++- for(i = 0; i < 3; i++) { ++- s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s)); +++ float* bPtr = bVector; +++ const float* aPtr = aVector; +++ +++ unsigned int number = 0; +++ unsigned int eighthPoints = num_points / 8; +++ unsigned int i = 0; +++ +++ __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, +++ fzeroes; +++ __m256 sine, cosine, condition1, condition2; +++ __m256i q, r, ones, twos, fours; +++ +++ m4pi = 
_mm256_set1_ps(1.273239545); +++ pio4A = _mm256_set1_ps(0.78515625); +++ pio4B = _mm256_set1_ps(0.241876e-3); +++ ffours = _mm256_set1_ps(4.0); +++ ftwos = _mm256_set1_ps(2.0); +++ fones = _mm256_set1_ps(1.0); +++ fzeroes = _mm256_setzero_ps(); +++ ones = _mm256_set1_epi32(1); +++ twos = _mm256_set1_epi32(2); +++ fours = _mm256_set1_epi32(4); +++ +++ cp1 = _mm256_set1_ps(1.0); +++ cp2 = _mm256_set1_ps(0.83333333e-1); +++ cp3 = _mm256_set1_ps(0.2777778e-2); +++ cp4 = _mm256_set1_ps(0.49603e-4); +++ cp5 = _mm256_set1_ps(0.551e-6); +++ +++ for (; number < eighthPoints; number++) { +++ aVal = _mm256_load_ps(aPtr); +++ s = _mm256_sub_ps(aVal, +++ _mm256_and_ps(_mm256_mul_ps(aVal, ftwos), +++ _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS))); +++ q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi))); +++ r = _mm256_add_epi32(q, _mm256_and_si256(q, ones)); +++ +++ s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4A, s); +++ s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4B, s); +++ +++ s = _mm256_div_ps( +++ s, +++ _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction +++ s = _mm256_mul_ps(s, s); +++ // Evaluate Taylor series +++ s = _mm256_mul_ps( +++ _mm256_fmadd_ps( +++ _mm256_fmsub_ps( +++ _mm256_fmadd_ps(_mm256_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2), +++ s, +++ cp1), +++ s); +++ +++ for (i = 0; i < 3; i++) { +++ s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s)); +++ } +++ s = _mm256_div_ps(s, ftwos); +++ +++ sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s)); +++ cosine = _mm256_sub_ps(fones, s); +++ +++ condition1 = _mm256_cmp_ps( +++ _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)), +++ fzeroes, +++ _CMP_NEQ_UQ); +++ condition2 = _mm256_cmp_ps( +++ _mm256_cmp_ps( +++ _mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes, _CMP_NEQ_UQ), +++ _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS), +++ _CMP_NEQ_UQ); +++ // Need this condition only for cos +++ // condition3 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, +++ // twos), fours)), fzeroes); +++ +++ sine = +++ _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(cosine, sine), condition1)); +++ sine = _mm256_sub_ps( +++ sine, _mm256_and_ps(_mm256_mul_ps(sine, _mm256_set1_ps(2.0f)), condition2)); +++ _mm256_store_ps(bPtr, sine); +++ aPtr += 8; +++ bPtr += 8; +++ } +++ +++ number = eighthPoints * 8; +++ for (; number < num_points; number++) { +++ *bPtr++ = sin(*aPtr++); ++ } ++- s = _mm256_div_ps(s, ftwos); ++- ++- sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s)); ++- cosine = _mm256_sub_ps(fones, s); ++- ++- condition1 = _mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)), fzeroes, _CMP_NEQ_UQ); ++- condition2 = _mm256_cmp_ps(_mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes, _CMP_NEQ_UQ), _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS), _CMP_NEQ_UQ); ++- // Need this condition only for cos ++- //condition3 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, twos), fours)), fzeroes); ++- ++- sine = _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(cosine, sine), condition1)); ++- sine = _mm256_sub_ps(sine, _mm256_and_ps(_mm256_mul_ps(sine, _mm256_set1_ps(2.0f)), condition2)); ++- _mm256_store_ps(bPtr, sine); ++- aPtr += 8; ++- bPtr += 8; ++- } ++- ++- number = eighthPoints * 8; ++- for(;number < num_points; number++) { ++- *bPtr++ = sin(*aPtr++); ++- } ++ } ++ ++ #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for aligned */ ++@@ -159,72 +180,100 @@ volk_32f_sin_32f_a_avx2_fma(float* bVector, const float* aVector, 
unsigned int n ++ static inline void ++ volk_32f_sin_32f_a_avx2(float* bVector, const float* aVector, unsigned int num_points) ++ { ++- float* bPtr = bVector; ++- const float* aPtr = aVector; ++- ++- unsigned int number = 0; ++- unsigned int eighthPoints = num_points / 8; ++- unsigned int i = 0; ++- ++- __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, fzeroes; ++- __m256 sine, cosine, condition1, condition2; ++- __m256i q, r, ones, twos, fours; ++- ++- m4pi = _mm256_set1_ps(1.273239545); ++- pio4A = _mm256_set1_ps(0.78515625); ++- pio4B = _mm256_set1_ps(0.241876e-3); ++- ffours = _mm256_set1_ps(4.0); ++- ftwos = _mm256_set1_ps(2.0); ++- fones = _mm256_set1_ps(1.0); ++- fzeroes = _mm256_setzero_ps(); ++- ones = _mm256_set1_epi32(1); ++- twos = _mm256_set1_epi32(2); ++- fours = _mm256_set1_epi32(4); ++- ++- cp1 = _mm256_set1_ps(1.0); ++- cp2 = _mm256_set1_ps(0.83333333e-1); ++- cp3 = _mm256_set1_ps(0.2777778e-2); ++- cp4 = _mm256_set1_ps(0.49603e-4); ++- cp5 = _mm256_set1_ps(0.551e-6); ++- ++- for(;number < eighthPoints; number++) { ++- aVal = _mm256_load_ps(aPtr); ++- s = _mm256_sub_ps(aVal, _mm256_and_ps(_mm256_mul_ps(aVal, ftwos), _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS))); ++- q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi))); ++- r = _mm256_add_epi32(q, _mm256_and_si256(q, ones)); ++- ++- s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4A)); ++- s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4B)); ++- ++- s = _mm256_div_ps(s, _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction ++- s = _mm256_mul_ps(s, s); ++- // Evaluate Taylor series ++- s = _mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(s, cp5), cp4), s), cp3), s), cp2), s), cp1), s); ++- ++- for(i = 0; i < 3; i++) { ++- s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s)); +++ float* bPtr = bVector; +++ const float* aPtr = aVector; +++ +++ unsigned int number = 0; +++ unsigned int eighthPoints = num_points / 8; +++ unsigned int i = 0; +++ +++ __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, +++ fzeroes; +++ __m256 sine, cosine, condition1, condition2; +++ __m256i q, r, ones, twos, fours; +++ +++ m4pi = _mm256_set1_ps(1.273239545); +++ pio4A = _mm256_set1_ps(0.78515625); +++ pio4B = _mm256_set1_ps(0.241876e-3); +++ ffours = _mm256_set1_ps(4.0); +++ ftwos = _mm256_set1_ps(2.0); +++ fones = _mm256_set1_ps(1.0); +++ fzeroes = _mm256_setzero_ps(); +++ ones = _mm256_set1_epi32(1); +++ twos = _mm256_set1_epi32(2); +++ fours = _mm256_set1_epi32(4); +++ +++ cp1 = _mm256_set1_ps(1.0); +++ cp2 = _mm256_set1_ps(0.83333333e-1); +++ cp3 = _mm256_set1_ps(0.2777778e-2); +++ cp4 = _mm256_set1_ps(0.49603e-4); +++ cp5 = _mm256_set1_ps(0.551e-6); +++ +++ for (; number < eighthPoints; number++) { +++ aVal = _mm256_load_ps(aPtr); +++ s = _mm256_sub_ps(aVal, +++ _mm256_and_ps(_mm256_mul_ps(aVal, ftwos), +++ _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS))); +++ q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi))); +++ r = _mm256_add_epi32(q, _mm256_and_si256(q, ones)); +++ +++ s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4A)); +++ s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4B)); +++ +++ s = _mm256_div_ps( +++ s, +++ _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction +++ s = _mm256_mul_ps(s, s); +++ // Evaluate Taylor series +++ s = _mm256_mul_ps( +++ _mm256_add_ps( +++ _mm256_mul_ps( +++ 
_mm256_sub_ps( +++ _mm256_mul_ps( +++ _mm256_add_ps( +++ _mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(s, cp5), cp4), +++ s), +++ cp3), +++ s), +++ cp2), +++ s), +++ cp1), +++ s); +++ +++ for (i = 0; i < 3; i++) { +++ s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s)); +++ } +++ s = _mm256_div_ps(s, ftwos); +++ +++ sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s)); +++ cosine = _mm256_sub_ps(fones, s); +++ +++ condition1 = _mm256_cmp_ps( +++ _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)), +++ fzeroes, +++ _CMP_NEQ_UQ); +++ condition2 = _mm256_cmp_ps( +++ _mm256_cmp_ps( +++ _mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes, _CMP_NEQ_UQ), +++ _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS), +++ _CMP_NEQ_UQ); +++ // Need this condition only for cos +++ // condition3 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, +++ // twos), fours)), fzeroes); +++ +++ sine = +++ _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(cosine, sine), condition1)); +++ sine = _mm256_sub_ps( +++ sine, _mm256_and_ps(_mm256_mul_ps(sine, _mm256_set1_ps(2.0f)), condition2)); +++ _mm256_store_ps(bPtr, sine); +++ aPtr += 8; +++ bPtr += 8; +++ } +++ +++ number = eighthPoints * 8; +++ for (; number < num_points; number++) { +++ *bPtr++ = sin(*aPtr++); ++ } ++- s = _mm256_div_ps(s, ftwos); ++- ++- sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s)); ++- cosine = _mm256_sub_ps(fones, s); ++- ++- condition1 = _mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)), fzeroes, _CMP_NEQ_UQ); ++- condition2 = _mm256_cmp_ps(_mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes, _CMP_NEQ_UQ), _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS), _CMP_NEQ_UQ); ++- // Need this condition only for cos ++- //condition3 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, twos), fours)), fzeroes); ++- ++- sine = _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(cosine, sine), condition1)); ++- sine = _mm256_sub_ps(sine, _mm256_and_ps(_mm256_mul_ps(sine, _mm256_set1_ps(2.0f)), condition2)); ++- _mm256_store_ps(bPtr, sine); ++- aPtr += 8; ++- bPtr += 8; ++- } ++- ++- number = eighthPoints * 8; ++- for(;number < num_points; number++) { ++- *bPtr++ = sin(*aPtr++); ++- } ++ } ++ ++ #endif /* LV_HAVE_AVX2 for aligned */ ++@@ -235,72 +284,91 @@ volk_32f_sin_32f_a_avx2(float* bVector, const float* aVector, unsigned int num_p ++ static inline void ++ volk_32f_sin_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int num_points) ++ { ++- float* bPtr = bVector; ++- const float* aPtr = aVector; ++- ++- unsigned int number = 0; ++- unsigned int quarterPoints = num_points / 4; ++- unsigned int i = 0; ++- ++- __m128 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, fzeroes; ++- __m128 sine, cosine, condition1, condition2; ++- __m128i q, r, ones, twos, fours; ++- ++- m4pi = _mm_set1_ps(1.273239545); ++- pio4A = _mm_set1_ps(0.78515625); ++- pio4B = _mm_set1_ps(0.241876e-3); ++- ffours = _mm_set1_ps(4.0); ++- ftwos = _mm_set1_ps(2.0); ++- fones = _mm_set1_ps(1.0); ++- fzeroes = _mm_setzero_ps(); ++- ones = _mm_set1_epi32(1); ++- twos = _mm_set1_epi32(2); ++- fours = _mm_set1_epi32(4); ++- ++- cp1 = _mm_set1_ps(1.0); ++- cp2 = _mm_set1_ps(0.83333333e-1); ++- cp3 = _mm_set1_ps(0.2777778e-2); ++- cp4 = _mm_set1_ps(0.49603e-4); ++- cp5 = _mm_set1_ps(0.551e-6); ++- ++- for(;number < quarterPoints; number++) { ++- aVal = _mm_load_ps(aPtr); ++- s = _mm_sub_ps(aVal, _mm_and_ps(_mm_mul_ps(aVal, ftwos), _mm_cmplt_ps(aVal, fzeroes))); ++- 
q = _mm_cvtps_epi32(_mm_floor_ps(_mm_mul_ps(s, m4pi))); ++- r = _mm_add_epi32(q, _mm_and_si128(q, ones)); ++- ++- s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4A)); ++- s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4B)); ++- ++- s = _mm_div_ps(s, _mm_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction ++- s = _mm_mul_ps(s, s); ++- // Evaluate Taylor series ++- s = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s, cp5), cp4), s), cp3), s), cp2), s), cp1), s); ++- ++- for(i = 0; i < 3; i++) { ++- s = _mm_mul_ps(s, _mm_sub_ps(ffours, s)); +++ float* bPtr = bVector; +++ const float* aPtr = aVector; +++ +++ unsigned int number = 0; +++ unsigned int quarterPoints = num_points / 4; +++ unsigned int i = 0; +++ +++ __m128 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, +++ fzeroes; +++ __m128 sine, cosine, condition1, condition2; +++ __m128i q, r, ones, twos, fours; +++ +++ m4pi = _mm_set1_ps(1.273239545); +++ pio4A = _mm_set1_ps(0.78515625); +++ pio4B = _mm_set1_ps(0.241876e-3); +++ ffours = _mm_set1_ps(4.0); +++ ftwos = _mm_set1_ps(2.0); +++ fones = _mm_set1_ps(1.0); +++ fzeroes = _mm_setzero_ps(); +++ ones = _mm_set1_epi32(1); +++ twos = _mm_set1_epi32(2); +++ fours = _mm_set1_epi32(4); +++ +++ cp1 = _mm_set1_ps(1.0); +++ cp2 = _mm_set1_ps(0.83333333e-1); +++ cp3 = _mm_set1_ps(0.2777778e-2); +++ cp4 = _mm_set1_ps(0.49603e-4); +++ cp5 = _mm_set1_ps(0.551e-6); +++ +++ for (; number < quarterPoints; number++) { +++ aVal = _mm_load_ps(aPtr); +++ s = _mm_sub_ps(aVal, +++ _mm_and_ps(_mm_mul_ps(aVal, ftwos), _mm_cmplt_ps(aVal, fzeroes))); +++ q = _mm_cvtps_epi32(_mm_floor_ps(_mm_mul_ps(s, m4pi))); +++ r = _mm_add_epi32(q, _mm_and_si128(q, ones)); +++ +++ s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4A)); +++ s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4B)); +++ +++ s = _mm_div_ps( +++ s, _mm_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction +++ s = _mm_mul_ps(s, s); +++ // Evaluate Taylor series +++ s = _mm_mul_ps( +++ _mm_add_ps( +++ _mm_mul_ps( +++ _mm_sub_ps( +++ _mm_mul_ps( +++ _mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s, cp5), cp4), s), +++ cp3), +++ s), +++ cp2), +++ s), +++ cp1), +++ s); +++ +++ for (i = 0; i < 3; i++) { +++ s = _mm_mul_ps(s, _mm_sub_ps(ffours, s)); +++ } +++ s = _mm_div_ps(s, ftwos); +++ +++ sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s)); +++ cosine = _mm_sub_ps(fones, s); +++ +++ condition1 = _mm_cmpneq_ps( +++ _mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, ones), twos)), fzeroes); +++ condition2 = _mm_cmpneq_ps( +++ _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(q, fours)), fzeroes), +++ _mm_cmplt_ps(aVal, fzeroes)); +++ // Need this condition only for cos +++ // condition3 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, +++ // twos), fours)), fzeroes); +++ +++ sine = _mm_add_ps(sine, _mm_and_ps(_mm_sub_ps(cosine, sine), condition1)); +++ sine = +++ _mm_sub_ps(sine, _mm_and_ps(_mm_mul_ps(sine, _mm_set1_ps(2.0f)), condition2)); +++ _mm_store_ps(bPtr, sine); +++ aPtr += 4; +++ bPtr += 4; +++ } +++ +++ number = quarterPoints * 4; +++ for (; number < num_points; number++) { +++ *bPtr++ = sinf(*aPtr++); ++ } ++- s = _mm_div_ps(s, ftwos); ++- ++- sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s)); ++- cosine = _mm_sub_ps(fones, s); ++- ++- condition1 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, ones), twos)), fzeroes); ++- condition2 = 
_mm_cmpneq_ps(_mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(q, fours)), fzeroes), _mm_cmplt_ps(aVal, fzeroes)); ++- // Need this condition only for cos ++- //condition3 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, twos), fours)), fzeroes); ++- ++- sine = _mm_add_ps(sine, _mm_and_ps(_mm_sub_ps(cosine, sine), condition1)); ++- sine = _mm_sub_ps(sine, _mm_and_ps(_mm_mul_ps(sine, _mm_set1_ps(2.0f)), condition2)); ++- _mm_store_ps(bPtr, sine); ++- aPtr += 4; ++- bPtr += 4; ++- } ++- ++- number = quarterPoints * 4; ++- for(;number < num_points; number++) { ++- *bPtr++ = sinf(*aPtr++); ++- } ++ } ++ ++ #endif /* LV_HAVE_SSE4_1 for aligned */ ++@@ -317,72 +385,93 @@ volk_32f_sin_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int num ++ static inline void ++ volk_32f_sin_32f_u_avx2_fma(float* bVector, const float* aVector, unsigned int num_points) ++ { ++- float* bPtr = bVector; ++- const float* aPtr = aVector; ++- ++- unsigned int number = 0; ++- unsigned int eighthPoints = num_points / 8; ++- unsigned int i = 0; ++- ++- __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, fzeroes; ++- __m256 sine, cosine, condition1, condition2; ++- __m256i q, r, ones, twos, fours; ++- ++- m4pi = _mm256_set1_ps(1.273239545); ++- pio4A = _mm256_set1_ps(0.78515625); ++- pio4B = _mm256_set1_ps(0.241876e-3); ++- ffours = _mm256_set1_ps(4.0); ++- ftwos = _mm256_set1_ps(2.0); ++- fones = _mm256_set1_ps(1.0); ++- fzeroes = _mm256_setzero_ps(); ++- ones = _mm256_set1_epi32(1); ++- twos = _mm256_set1_epi32(2); ++- fours = _mm256_set1_epi32(4); ++- ++- cp1 = _mm256_set1_ps(1.0); ++- cp2 = _mm256_set1_ps(0.83333333e-1); ++- cp3 = _mm256_set1_ps(0.2777778e-2); ++- cp4 = _mm256_set1_ps(0.49603e-4); ++- cp5 = _mm256_set1_ps(0.551e-6); ++- ++- for(;number < eighthPoints; number++) { ++- aVal = _mm256_loadu_ps(aPtr); ++- s = _mm256_sub_ps(aVal, _mm256_and_ps(_mm256_mul_ps(aVal, ftwos), _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS))); ++- q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi))); ++- r = _mm256_add_epi32(q, _mm256_and_si256(q, ones)); ++- ++- s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4A, s); ++- s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4B, s); ++- ++- s = _mm256_div_ps(s, _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction ++- s = _mm256_mul_ps(s, s); ++- // Evaluate Taylor series ++- s = _mm256_mul_ps(_mm256_fmadd_ps(_mm256_fmsub_ps(_mm256_fmadd_ps(_mm256_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2), s, cp1), s); ++- ++- for(i = 0; i < 3; i++) { ++- s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s)); +++ float* bPtr = bVector; +++ const float* aPtr = aVector; +++ +++ unsigned int number = 0; +++ unsigned int eighthPoints = num_points / 8; +++ unsigned int i = 0; +++ +++ __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, +++ fzeroes; +++ __m256 sine, cosine, condition1, condition2; +++ __m256i q, r, ones, twos, fours; +++ +++ m4pi = _mm256_set1_ps(1.273239545); +++ pio4A = _mm256_set1_ps(0.78515625); +++ pio4B = _mm256_set1_ps(0.241876e-3); +++ ffours = _mm256_set1_ps(4.0); +++ ftwos = _mm256_set1_ps(2.0); +++ fones = _mm256_set1_ps(1.0); +++ fzeroes = _mm256_setzero_ps(); +++ ones = _mm256_set1_epi32(1); +++ twos = _mm256_set1_epi32(2); +++ fours = _mm256_set1_epi32(4); +++ +++ cp1 = _mm256_set1_ps(1.0); +++ cp2 = _mm256_set1_ps(0.83333333e-1); +++ cp3 = _mm256_set1_ps(0.2777778e-2); +++ cp4 = _mm256_set1_ps(0.49603e-4); +++ cp5 = _mm256_set1_ps(0.551e-6); +++ +++ for (; number < eighthPoints; 
number++) { +++ aVal = _mm256_loadu_ps(aPtr); +++ s = _mm256_sub_ps(aVal, +++ _mm256_and_ps(_mm256_mul_ps(aVal, ftwos), +++ _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS))); +++ q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi))); +++ r = _mm256_add_epi32(q, _mm256_and_si256(q, ones)); +++ +++ s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4A, s); +++ s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4B, s); +++ +++ s = _mm256_div_ps( +++ s, +++ _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction +++ s = _mm256_mul_ps(s, s); +++ // Evaluate Taylor series +++ s = _mm256_mul_ps( +++ _mm256_fmadd_ps( +++ _mm256_fmsub_ps( +++ _mm256_fmadd_ps(_mm256_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2), +++ s, +++ cp1), +++ s); +++ +++ for (i = 0; i < 3; i++) { +++ s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s)); +++ } +++ s = _mm256_div_ps(s, ftwos); +++ +++ sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s)); +++ cosine = _mm256_sub_ps(fones, s); +++ +++ condition1 = _mm256_cmp_ps( +++ _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)), +++ fzeroes, +++ _CMP_NEQ_UQ); +++ condition2 = _mm256_cmp_ps( +++ _mm256_cmp_ps( +++ _mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes, _CMP_NEQ_UQ), +++ _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS), +++ _CMP_NEQ_UQ); +++ // Need this condition only for cos +++ // condition3 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, +++ // twos), fours)), fzeroes); +++ +++ sine = +++ _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(cosine, sine), condition1)); +++ sine = _mm256_sub_ps( +++ sine, _mm256_and_ps(_mm256_mul_ps(sine, _mm256_set1_ps(2.0f)), condition2)); +++ _mm256_storeu_ps(bPtr, sine); +++ aPtr += 8; +++ bPtr += 8; +++ } +++ +++ number = eighthPoints * 8; +++ for (; number < num_points; number++) { +++ *bPtr++ = sin(*aPtr++); ++ } ++- s = _mm256_div_ps(s, ftwos); ++- ++- sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s)); ++- cosine = _mm256_sub_ps(fones, s); ++- ++- condition1 = _mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)), fzeroes, _CMP_NEQ_UQ); ++- condition2 = _mm256_cmp_ps(_mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes, _CMP_NEQ_UQ), _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS), _CMP_NEQ_UQ); ++- // Need this condition only for cos ++- //condition3 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, twos), fours)), fzeroes); ++- ++- sine = _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(cosine, sine), condition1)); ++- sine = _mm256_sub_ps(sine, _mm256_and_ps(_mm256_mul_ps(sine, _mm256_set1_ps(2.0f)), condition2)); ++- _mm256_storeu_ps(bPtr, sine); ++- aPtr += 8; ++- bPtr += 8; ++- } ++- ++- number = eighthPoints * 8; ++- for(;number < num_points; number++) { ++- *bPtr++ = sin(*aPtr++); ++- } ++ } ++ ++ #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for unaligned */ ++@@ -393,72 +482,100 @@ volk_32f_sin_32f_u_avx2_fma(float* bVector, const float* aVector, unsigned int n ++ static inline void ++ volk_32f_sin_32f_u_avx2(float* bVector, const float* aVector, unsigned int num_points) ++ { ++- float* bPtr = bVector; ++- const float* aPtr = aVector; ++- ++- unsigned int number = 0; ++- unsigned int eighthPoints = num_points / 8; ++- unsigned int i = 0; ++- ++- __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, fzeroes; ++- __m256 sine, cosine, condition1, condition2; ++- __m256i q, r, ones, twos, fours; ++- ++- m4pi = _mm256_set1_ps(1.273239545); ++- pio4A = _mm256_set1_ps(0.78515625); ++- 
pio4B = _mm256_set1_ps(0.241876e-3); ++- ffours = _mm256_set1_ps(4.0); ++- ftwos = _mm256_set1_ps(2.0); ++- fones = _mm256_set1_ps(1.0); ++- fzeroes = _mm256_setzero_ps(); ++- ones = _mm256_set1_epi32(1); ++- twos = _mm256_set1_epi32(2); ++- fours = _mm256_set1_epi32(4); ++- ++- cp1 = _mm256_set1_ps(1.0); ++- cp2 = _mm256_set1_ps(0.83333333e-1); ++- cp3 = _mm256_set1_ps(0.2777778e-2); ++- cp4 = _mm256_set1_ps(0.49603e-4); ++- cp5 = _mm256_set1_ps(0.551e-6); ++- ++- for(;number < eighthPoints; number++) { ++- aVal = _mm256_loadu_ps(aPtr); ++- s = _mm256_sub_ps(aVal, _mm256_and_ps(_mm256_mul_ps(aVal, ftwos), _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS))); ++- q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi))); ++- r = _mm256_add_epi32(q, _mm256_and_si256(q, ones)); ++- ++- s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4A)); ++- s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4B)); ++- ++- s = _mm256_div_ps(s, _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction ++- s = _mm256_mul_ps(s, s); ++- // Evaluate Taylor series ++- s = _mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(s, cp5), cp4), s), cp3), s), cp2), s), cp1), s); ++- ++- for(i = 0; i < 3; i++) { ++- s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s)); +++ float* bPtr = bVector; +++ const float* aPtr = aVector; +++ +++ unsigned int number = 0; +++ unsigned int eighthPoints = num_points / 8; +++ unsigned int i = 0; +++ +++ __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, +++ fzeroes; +++ __m256 sine, cosine, condition1, condition2; +++ __m256i q, r, ones, twos, fours; +++ +++ m4pi = _mm256_set1_ps(1.273239545); +++ pio4A = _mm256_set1_ps(0.78515625); +++ pio4B = _mm256_set1_ps(0.241876e-3); +++ ffours = _mm256_set1_ps(4.0); +++ ftwos = _mm256_set1_ps(2.0); +++ fones = _mm256_set1_ps(1.0); +++ fzeroes = _mm256_setzero_ps(); +++ ones = _mm256_set1_epi32(1); +++ twos = _mm256_set1_epi32(2); +++ fours = _mm256_set1_epi32(4); +++ +++ cp1 = _mm256_set1_ps(1.0); +++ cp2 = _mm256_set1_ps(0.83333333e-1); +++ cp3 = _mm256_set1_ps(0.2777778e-2); +++ cp4 = _mm256_set1_ps(0.49603e-4); +++ cp5 = _mm256_set1_ps(0.551e-6); +++ +++ for (; number < eighthPoints; number++) { +++ aVal = _mm256_loadu_ps(aPtr); +++ s = _mm256_sub_ps(aVal, +++ _mm256_and_ps(_mm256_mul_ps(aVal, ftwos), +++ _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS))); +++ q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi))); +++ r = _mm256_add_epi32(q, _mm256_and_si256(q, ones)); +++ +++ s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4A)); +++ s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4B)); +++ +++ s = _mm256_div_ps( +++ s, +++ _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction +++ s = _mm256_mul_ps(s, s); +++ // Evaluate Taylor series +++ s = _mm256_mul_ps( +++ _mm256_add_ps( +++ _mm256_mul_ps( +++ _mm256_sub_ps( +++ _mm256_mul_ps( +++ _mm256_add_ps( +++ _mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(s, cp5), cp4), +++ s), +++ cp3), +++ s), +++ cp2), +++ s), +++ cp1), +++ s); +++ +++ for (i = 0; i < 3; i++) { +++ s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s)); +++ } +++ s = _mm256_div_ps(s, ftwos); +++ +++ sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s)); +++ cosine = _mm256_sub_ps(fones, s); +++ +++ condition1 = _mm256_cmp_ps( +++ _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)), +++ fzeroes, +++ _CMP_NEQ_UQ); +++ condition2 = 
_mm256_cmp_ps( +++ _mm256_cmp_ps( +++ _mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes, _CMP_NEQ_UQ), +++ _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS), +++ _CMP_NEQ_UQ); +++ // Need this condition only for cos +++ // condition3 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, +++ // twos), fours)), fzeroes); +++ +++ sine = +++ _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(cosine, sine), condition1)); +++ sine = _mm256_sub_ps( +++ sine, _mm256_and_ps(_mm256_mul_ps(sine, _mm256_set1_ps(2.0f)), condition2)); +++ _mm256_storeu_ps(bPtr, sine); +++ aPtr += 8; +++ bPtr += 8; +++ } +++ +++ number = eighthPoints * 8; +++ for (; number < num_points; number++) { +++ *bPtr++ = sin(*aPtr++); ++ } ++- s = _mm256_div_ps(s, ftwos); ++- ++- sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s)); ++- cosine = _mm256_sub_ps(fones, s); ++- ++- condition1 = _mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)), fzeroes, _CMP_NEQ_UQ); ++- condition2 = _mm256_cmp_ps(_mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes, _CMP_NEQ_UQ), _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS), _CMP_NEQ_UQ); ++- // Need this condition only for cos ++- //condition3 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, twos), fours)), fzeroes); ++- ++- sine = _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(cosine, sine), condition1)); ++- sine = _mm256_sub_ps(sine, _mm256_and_ps(_mm256_mul_ps(sine, _mm256_set1_ps(2.0f)), condition2)); ++- _mm256_storeu_ps(bPtr, sine); ++- aPtr += 8; ++- bPtr += 8; ++- } ++- ++- number = eighthPoints * 8; ++- for(;number < num_points; number++) { ++- *bPtr++ = sin(*aPtr++); ++- } ++ } ++ ++ #endif /* LV_HAVE_AVX2 for unaligned */ ++@@ -470,70 +587,88 @@ volk_32f_sin_32f_u_avx2(float* bVector, const float* aVector, unsigned int num_p ++ static inline void ++ volk_32f_sin_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int num_points) ++ { ++- float* bPtr = bVector; ++- const float* aPtr = aVector; ++- ++- unsigned int number = 0; ++- unsigned int quarterPoints = num_points / 4; ++- unsigned int i = 0; ++- ++- __m128 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, fzeroes; ++- __m128 sine, cosine, condition1, condition2; ++- __m128i q, r, ones, twos, fours; ++- ++- m4pi = _mm_set1_ps(1.273239545); ++- pio4A = _mm_set1_ps(0.78515625); ++- pio4B = _mm_set1_ps(0.241876e-3); ++- ffours = _mm_set1_ps(4.0); ++- ftwos = _mm_set1_ps(2.0); ++- fones = _mm_set1_ps(1.0); ++- fzeroes = _mm_setzero_ps(); ++- ones = _mm_set1_epi32(1); ++- twos = _mm_set1_epi32(2); ++- fours = _mm_set1_epi32(4); ++- ++- cp1 = _mm_set1_ps(1.0); ++- cp2 = _mm_set1_ps(0.83333333e-1); ++- cp3 = _mm_set1_ps(0.2777778e-2); ++- cp4 = _mm_set1_ps(0.49603e-4); ++- cp5 = _mm_set1_ps(0.551e-6); ++- ++- for(;number < quarterPoints; number++) { ++- aVal = _mm_loadu_ps(aPtr); ++- s = _mm_sub_ps(aVal, _mm_and_ps(_mm_mul_ps(aVal, ftwos), _mm_cmplt_ps(aVal, fzeroes))); ++- q = _mm_cvtps_epi32(_mm_floor_ps(_mm_mul_ps(s, m4pi))); ++- r = _mm_add_epi32(q, _mm_and_si128(q, ones)); ++- ++- s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4A)); ++- s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4B)); ++- ++- s = _mm_div_ps(s, _mm_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction ++- s = _mm_mul_ps(s, s); ++- // Evaluate Taylor series ++- s = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s, cp5), cp4), s), cp3), s), cp2), s), cp1), s); ++- ++- for(i = 0; i 
< 3; i++) { ++- s = _mm_mul_ps(s, _mm_sub_ps(ffours, s)); ++- } ++- s = _mm_div_ps(s, ftwos); ++- ++- sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s)); ++- cosine = _mm_sub_ps(fones, s); ++- ++- condition1 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, ones), twos)), fzeroes); ++- condition2 = _mm_cmpneq_ps(_mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(q, fours)), fzeroes), _mm_cmplt_ps(aVal, fzeroes)); +++ float* bPtr = bVector; +++ const float* aPtr = aVector; ++ ++- sine = _mm_add_ps(sine, _mm_and_ps(_mm_sub_ps(cosine, sine), condition1)); ++- sine = _mm_sub_ps(sine, _mm_and_ps(_mm_mul_ps(sine, _mm_set1_ps(2.0f)), condition2)); ++- _mm_storeu_ps(bPtr, sine); ++- aPtr += 4; ++- bPtr += 4; ++- } +++ unsigned int number = 0; +++ unsigned int quarterPoints = num_points / 4; +++ unsigned int i = 0; +++ +++ __m128 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, +++ fzeroes; +++ __m128 sine, cosine, condition1, condition2; +++ __m128i q, r, ones, twos, fours; +++ +++ m4pi = _mm_set1_ps(1.273239545); +++ pio4A = _mm_set1_ps(0.78515625); +++ pio4B = _mm_set1_ps(0.241876e-3); +++ ffours = _mm_set1_ps(4.0); +++ ftwos = _mm_set1_ps(2.0); +++ fones = _mm_set1_ps(1.0); +++ fzeroes = _mm_setzero_ps(); +++ ones = _mm_set1_epi32(1); +++ twos = _mm_set1_epi32(2); +++ fours = _mm_set1_epi32(4); +++ +++ cp1 = _mm_set1_ps(1.0); +++ cp2 = _mm_set1_ps(0.83333333e-1); +++ cp3 = _mm_set1_ps(0.2777778e-2); +++ cp4 = _mm_set1_ps(0.49603e-4); +++ cp5 = _mm_set1_ps(0.551e-6); +++ +++ for (; number < quarterPoints; number++) { +++ aVal = _mm_loadu_ps(aPtr); +++ s = _mm_sub_ps(aVal, +++ _mm_and_ps(_mm_mul_ps(aVal, ftwos), _mm_cmplt_ps(aVal, fzeroes))); +++ q = _mm_cvtps_epi32(_mm_floor_ps(_mm_mul_ps(s, m4pi))); +++ r = _mm_add_epi32(q, _mm_and_si128(q, ones)); +++ +++ s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4A)); +++ s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4B)); +++ +++ s = _mm_div_ps( +++ s, _mm_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction +++ s = _mm_mul_ps(s, s); +++ // Evaluate Taylor series +++ s = _mm_mul_ps( +++ _mm_add_ps( +++ _mm_mul_ps( +++ _mm_sub_ps( +++ _mm_mul_ps( +++ _mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s, cp5), cp4), s), +++ cp3), +++ s), +++ cp2), +++ s), +++ cp1), +++ s); +++ +++ for (i = 0; i < 3; i++) { +++ s = _mm_mul_ps(s, _mm_sub_ps(ffours, s)); +++ } +++ s = _mm_div_ps(s, ftwos); +++ +++ sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s)); +++ cosine = _mm_sub_ps(fones, s); +++ +++ condition1 = _mm_cmpneq_ps( +++ _mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, ones), twos)), fzeroes); +++ condition2 = _mm_cmpneq_ps( +++ _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(q, fours)), fzeroes), +++ _mm_cmplt_ps(aVal, fzeroes)); +++ +++ sine = _mm_add_ps(sine, _mm_and_ps(_mm_sub_ps(cosine, sine), condition1)); +++ sine = +++ _mm_sub_ps(sine, _mm_and_ps(_mm_mul_ps(sine, _mm_set1_ps(2.0f)), condition2)); +++ _mm_storeu_ps(bPtr, sine); +++ aPtr += 4; +++ bPtr += 4; +++ } ++ ++- number = quarterPoints * 4; ++- for(;number < num_points; number++){ ++- *bPtr++ = sinf(*aPtr++); ++- } +++ number = quarterPoints * 4; +++ for (; number < num_points; number++) { +++ *bPtr++ = sinf(*aPtr++); +++ } ++ } ++ ++ #endif /* LV_HAVE_SSE4_1 for unaligned */ ++@@ -544,14 +679,13 @@ volk_32f_sin_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int num ++ static inline void ++ volk_32f_sin_32f_generic(float* bVector, const float* aVector, unsigned int num_points) ++ { ++- float* bPtr = bVector; ++- const 
float* aPtr = aVector; ++- unsigned int number = 0; ++- ++- for(number = 0; number < num_points; number++) { ++- *bPtr++ = sinf(*aPtr++); ++- } +++ float* bPtr = bVector; +++ const float* aPtr = aVector; +++ unsigned int number = 0; ++ +++ for (number = 0; number < num_points; number++) { +++ *bPtr++ = sinf(*aPtr++); +++ } ++ } ++ ++ #endif /* LV_HAVE_GENERIC */ ++@@ -562,30 +696,29 @@ volk_32f_sin_32f_generic(float* bVector, const float* aVector, unsigned int num_ ++ #include ++ ++ static inline void ++-volk_32f_sin_32f_neon(float* bVector, const float* aVector, ++- unsigned int num_points) +++volk_32f_sin_32f_neon(float* bVector, const float* aVector, unsigned int num_points) ++ { ++ unsigned int number = 0; ++ unsigned int quarter_points = num_points / 4; ++ float* bVectorPtr = bVector; ++ const float* aVectorPtr = aVector; ++- +++ ++ float32x4_t b_vec; ++ float32x4_t a_vec; ++- ++- for(number = 0; number < quarter_points; number++) { +++ +++ for (number = 0; number < quarter_points; number++) { ++ a_vec = vld1q_f32(aVectorPtr); ++ // Prefetch next one, speeds things up ++- __VOLK_PREFETCH(aVectorPtr+4); +++ __VOLK_PREFETCH(aVectorPtr + 4); ++ b_vec = _vsinq_f32(a_vec); ++ vst1q_f32(bVectorPtr, b_vec); ++ // move pointers ahead ++- bVectorPtr+=4; ++- aVectorPtr+=4; +++ bVectorPtr += 4; +++ aVectorPtr += 4; ++ } ++- +++ ++ // Deal with the rest ++- for(number = quarter_points * 4; number < num_points; number++) { +++ for (number = quarter_points * 4; number < num_points; number++) { ++ *bVectorPtr++ = sinf(*aVectorPtr++); ++ } ++ } ++diff --git a/kernels/volk/volk_32f_sqrt_32f.h b/kernels/volk/volk_32f_sqrt_32f.h ++index 84160af..667d356 100644 ++--- a/kernels/volk/volk_32f_sqrt_32f.h +++++ b/kernels/volk/volk_32f_sqrt_32f.h ++@@ -66,8 +66,8 @@ ++ #define INCLUDED_volk_32f_sqrt_32f_a_H ++ ++ #include ++-#include ++ #include +++#include ++ ++ #ifdef LV_HAVE_SSE ++ #include ++@@ -75,28 +75,28 @@ ++ static inline void ++ volk_32f_sqrt_32f_a_sse(float* cVector, const float* aVector, unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int quarterPoints = num_points / 4; +++ unsigned int number = 0; +++ const unsigned int quarterPoints = num_points / 4; ++ ++- float* cPtr = cVector; ++- const float* aPtr = aVector; +++ float* cPtr = cVector; +++ const float* aPtr = aVector; ++ ++- __m128 aVal, cVal; ++- for(;number < quarterPoints; number++) { ++- aVal = _mm_load_ps(aPtr); +++ __m128 aVal, cVal; +++ for (; number < quarterPoints; number++) { +++ aVal = _mm_load_ps(aPtr); ++ ++- cVal = _mm_sqrt_ps(aVal); +++ cVal = _mm_sqrt_ps(aVal); ++ ++- _mm_store_ps(cPtr,cVal); // Store the results back into the C container +++ _mm_store_ps(cPtr, cVal); // Store the results back into the C container ++ ++- aPtr += 4; ++- cPtr += 4; ++- } +++ aPtr += 4; +++ cPtr += 4; +++ } ++ ++- number = quarterPoints * 4; ++- for(;number < num_points; number++) { ++- *cPtr++ = sqrtf(*aPtr++); ++- } +++ number = quarterPoints * 4; +++ for (; number < num_points; number++) { +++ *cPtr++ = sqrtf(*aPtr++); +++ } ++ } ++ ++ #endif /* LV_HAVE_SSE */ ++@@ -107,28 +107,28 @@ volk_32f_sqrt_32f_a_sse(float* cVector, const float* aVector, unsigned int num_p ++ static inline void ++ volk_32f_sqrt_32f_a_avx(float* cVector, const float* aVector, unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int eighthPoints = num_points / 8; +++ unsigned int number = 0; +++ const unsigned int eighthPoints = num_points / 8; ++ ++- float* cPtr = cVector; ++- const float* aPtr = aVector; +++ float* 
cPtr = cVector; +++ const float* aPtr = aVector; ++ ++- __m256 aVal, cVal; ++- for(;number < eighthPoints; number++) { ++- aVal = _mm256_load_ps(aPtr); +++ __m256 aVal, cVal; +++ for (; number < eighthPoints; number++) { +++ aVal = _mm256_load_ps(aPtr); ++ ++- cVal = _mm256_sqrt_ps(aVal); +++ cVal = _mm256_sqrt_ps(aVal); ++ ++- _mm256_store_ps(cPtr,cVal); // Store the results back into the C container +++ _mm256_store_ps(cPtr, cVal); // Store the results back into the C container ++ ++- aPtr += 8; ++- cPtr += 8; ++- } +++ aPtr += 8; +++ cPtr += 8; +++ } ++ ++- number = eighthPoints * 8; ++- for(;number < num_points; number++) { ++- *cPtr++ = sqrtf(*aPtr++); ++- } +++ number = eighthPoints * 8; +++ for (; number < num_points; number++) { +++ *cPtr++ = sqrtf(*aPtr++); +++ } ++ } ++ ++ #endif /* LV_HAVE_AVX */ ++@@ -140,24 +140,24 @@ volk_32f_sqrt_32f_a_avx(float* cVector, const float* aVector, unsigned int num_p ++ static inline void ++ volk_32f_sqrt_32f_neon(float* cVector, const float* aVector, unsigned int num_points) ++ { ++- float* cPtr = cVector; ++- const float* aPtr = aVector; ++- unsigned int number = 0; ++- unsigned int quarter_points = num_points / 4; ++- float32x4_t in_vec, out_vec; ++- ++- for(number = 0; number < quarter_points; number++) { ++- in_vec = vld1q_f32(aPtr); ++- // note that armv8 has vsqrt_f32 which will be much better ++- out_vec = vrecpeq_f32(vrsqrteq_f32(in_vec) ); ++- vst1q_f32(cPtr, out_vec); ++- aPtr += 4; ++- cPtr += 4; ++- } ++- ++- for(number = quarter_points * 4; number < num_points; number++) { ++- *cPtr++ = sqrtf(*aPtr++); ++- } +++ float* cPtr = cVector; +++ const float* aPtr = aVector; +++ unsigned int number = 0; +++ unsigned int quarter_points = num_points / 4; +++ float32x4_t in_vec, out_vec; +++ +++ for (number = 0; number < quarter_points; number++) { +++ in_vec = vld1q_f32(aPtr); +++ // note that armv8 has vsqrt_f32 which will be much better +++ out_vec = vrecpeq_f32(vrsqrteq_f32(in_vec)); +++ vst1q_f32(cPtr, out_vec); +++ aPtr += 4; +++ cPtr += 4; +++ } +++ +++ for (number = quarter_points * 4; number < num_points; number++) { +++ *cPtr++ = sqrtf(*aPtr++); +++ } ++ } ++ ++ #endif /* LV_HAVE_NEON */ ++@@ -168,13 +168,13 @@ volk_32f_sqrt_32f_neon(float* cVector, const float* aVector, unsigned int num_po ++ static inline void ++ volk_32f_sqrt_32f_generic(float* cVector, const float* aVector, unsigned int num_points) ++ { ++- float* cPtr = cVector; ++- const float* aPtr = aVector; ++- unsigned int number = 0; +++ float* cPtr = cVector; +++ const float* aPtr = aVector; +++ unsigned int number = 0; ++ ++- for(number = 0; number < num_points; number++) { ++- *cPtr++ = sqrtf(*aPtr++); ++- } +++ for (number = 0; number < num_points; number++) { +++ *cPtr++ = sqrtf(*aPtr++); +++ } ++ } ++ ++ #endif /* LV_HAVE_GENERIC */ ++@@ -182,13 +182,12 @@ volk_32f_sqrt_32f_generic(float* cVector, const float* aVector, unsigned int num ++ ++ #ifdef LV_HAVE_ORC ++ ++-extern void ++-volk_32f_sqrt_32f_a_orc_impl(float *, const float*, unsigned int); +++extern void volk_32f_sqrt_32f_a_orc_impl(float*, const float*, unsigned int); ++ ++ static inline void ++ volk_32f_sqrt_32f_u_orc(float* cVector, const float* aVector, unsigned int num_points) ++ { ++- volk_32f_sqrt_32f_a_orc_impl(cVector, aVector, num_points); +++ volk_32f_sqrt_32f_a_orc_impl(cVector, aVector, num_points); ++ } ++ ++ #endif /* LV_HAVE_ORC */ ++@@ -199,36 +198,36 @@ volk_32f_sqrt_32f_u_orc(float* cVector, const float* aVector, unsigned int num_p ++ #define INCLUDED_volk_32f_sqrt_32f_u_H ++ ++ #include 
++-#include ++ #include +++#include ++ #ifdef LV_HAVE_AVX ++ #include ++ ++ static inline void ++ volk_32f_sqrt_32f_u_avx(float* cVector, const float* aVector, unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int eighthPoints = num_points / 8; +++ unsigned int number = 0; +++ const unsigned int eighthPoints = num_points / 8; ++ ++- float* cPtr = cVector; ++- const float* aPtr = aVector; +++ float* cPtr = cVector; +++ const float* aPtr = aVector; ++ ++- __m256 aVal, cVal; ++- for(;number < eighthPoints; number++) { ++- aVal = _mm256_loadu_ps(aPtr); +++ __m256 aVal, cVal; +++ for (; number < eighthPoints; number++) { +++ aVal = _mm256_loadu_ps(aPtr); ++ ++- cVal = _mm256_sqrt_ps(aVal); +++ cVal = _mm256_sqrt_ps(aVal); ++ ++- _mm256_storeu_ps(cPtr,cVal); // Store the results back into the C container +++ _mm256_storeu_ps(cPtr, cVal); // Store the results back into the C container ++ ++- aPtr += 8; ++- cPtr += 8; ++- } +++ aPtr += 8; +++ cPtr += 8; +++ } ++ ++- number = eighthPoints * 8; ++- for(;number < num_points; number++) { ++- *cPtr++ = sqrtf(*aPtr++); ++- } +++ number = eighthPoints * 8; +++ for (; number < num_points; number++) { +++ *cPtr++ = sqrtf(*aPtr++); +++ } ++ } ++ ++ #endif /* LV_HAVE_AVX */ ++diff --git a/kernels/volk/volk_32f_stddev_and_mean_32f_x2.h b/kernels/volk/volk_32f_stddev_and_mean_32f_x2.h ++index 8e996e2..6ad0f17 100644 ++--- a/kernels/volk/volk_32f_stddev_and_mean_32f_x2.h +++++ b/kernels/volk/volk_32f_stddev_and_mean_32f_x2.h ++@@ -29,8 +29,8 @@ ++ * ++ * Dispatcher Prototype ++ * \code ++- * void volk_32f_stddev_and_mean_32f_x2(float* stddev, float* mean, const float* inputBuffer, unsigned int num_points) ++- * \endcode +++ * void volk_32f_stddev_and_mean_32f_x2(float* stddev, float* mean, const float* +++ * inputBuffer, unsigned int num_points) \endcode ++ * ++ * \b Inputs ++ * \li inputBuffer: The buffer of points. ++@@ -41,10 +41,8 @@ ++ * \li mean: The mean of the input buffer. 
++ * ++ * \b Example ++- * Generate random numbers with c++11's normal distribution and estimate the mean and standard deviation ++- * \code ++- * int N = 1000; ++- * unsigned int alignment = volk_get_alignment(); +++ * Generate random numbers with c++11's normal distribution and estimate the mean and +++ * standard deviation \code int N = 1000; unsigned int alignment = volk_get_alignment(); ++ * float* rand_numbers = (float*)volk_malloc(sizeof(float)*N, alignment); ++ * float* mean = (float*)volk_malloc(sizeof(float), alignment); ++ * float* stddev = (float*)volk_malloc(sizeof(float), alignment); ++@@ -71,88 +69,94 @@ ++ #ifndef INCLUDED_volk_32f_stddev_and_mean_32f_x2_a_H ++ #define INCLUDED_volk_32f_stddev_and_mean_32f_x2_a_H ++ ++-#include ++ #include ++-#include ++ #include +++#include +++#include ++ ++ #ifdef LV_HAVE_AVX ++ #include ++ ++-static inline void ++-volk_32f_stddev_and_mean_32f_x2_a_avx(float* stddev, float* mean, ++- const float* inputBuffer, ++- unsigned int num_points) +++static inline void volk_32f_stddev_and_mean_32f_x2_a_avx(float* stddev, +++ float* mean, +++ const float* inputBuffer, +++ unsigned int num_points) ++ { ++- float stdDev = 0; ++- float newMean = 0; ++- if(num_points > 0){ ++- unsigned int number = 0; ++- const unsigned int thirtySecondthPoints = num_points / 32; ++- ++- const float* aPtr = inputBuffer; ++- __VOLK_ATTR_ALIGNED(32) float meanBuffer[8]; ++- __VOLK_ATTR_ALIGNED(32) float squareBuffer[8]; ++- ++- __m256 accumulator = _mm256_setzero_ps(); ++- __m256 squareAccumulator = _mm256_setzero_ps(); ++- __m256 aVal1, aVal2, aVal3, aVal4; ++- __m256 cVal1, cVal2, cVal3, cVal4; ++- for(;number < thirtySecondthPoints; number++) { ++- aVal1 = _mm256_load_ps(aPtr); aPtr += 8; ++- cVal1 = _mm256_dp_ps(aVal1, aVal1, 0xF1); ++- accumulator = _mm256_add_ps(accumulator, aVal1); // accumulator += x ++- ++- aVal2 = _mm256_load_ps(aPtr); aPtr += 8; ++- cVal2 = _mm256_dp_ps(aVal2, aVal2, 0xF2); ++- accumulator = _mm256_add_ps(accumulator, aVal2); // accumulator += x ++- ++- aVal3 = _mm256_load_ps(aPtr); aPtr += 8; ++- cVal3 = _mm256_dp_ps(aVal3, aVal3, 0xF4); ++- accumulator = _mm256_add_ps(accumulator, aVal3); // accumulator += x ++- ++- aVal4 = _mm256_load_ps(aPtr); aPtr += 8; ++- cVal4 = _mm256_dp_ps(aVal4, aVal4, 0xF8); ++- accumulator = _mm256_add_ps(accumulator, aVal4); // accumulator += x ++- ++- cVal1 = _mm256_or_ps(cVal1, cVal2); ++- cVal3 = _mm256_or_ps(cVal3, cVal4); ++- cVal1 = _mm256_or_ps(cVal1, cVal3); ++- ++- squareAccumulator = _mm256_add_ps(squareAccumulator, cVal1); // squareAccumulator += x^2 ++- } ++- _mm256_store_ps(meanBuffer,accumulator); // Store the results back into the C container ++- _mm256_store_ps(squareBuffer,squareAccumulator); // Store the results back into the C container ++- newMean = meanBuffer[0]; ++- newMean += meanBuffer[1]; ++- newMean += meanBuffer[2]; ++- newMean += meanBuffer[3]; ++- newMean += meanBuffer[4]; ++- newMean += meanBuffer[5]; ++- newMean += meanBuffer[6]; ++- newMean += meanBuffer[7]; ++- stdDev = squareBuffer[0]; ++- stdDev += squareBuffer[1]; ++- stdDev += squareBuffer[2]; ++- stdDev += squareBuffer[3]; ++- stdDev += squareBuffer[4]; ++- stdDev += squareBuffer[5]; ++- stdDev += squareBuffer[6]; ++- stdDev += squareBuffer[7]; ++- ++- number = thirtySecondthPoints * 32; ++- for(;number < num_points; number++){ ++- stdDev += (*aPtr) * (*aPtr); ++- newMean += *aPtr++; +++ float stdDev = 0; +++ float newMean = 0; +++ if (num_points > 0) { +++ unsigned int number = 0; +++ const unsigned int thirtySecondthPoints = 
num_points / 32; +++ +++ const float* aPtr = inputBuffer; +++ __VOLK_ATTR_ALIGNED(32) float meanBuffer[8]; +++ __VOLK_ATTR_ALIGNED(32) float squareBuffer[8]; +++ +++ __m256 accumulator = _mm256_setzero_ps(); +++ __m256 squareAccumulator = _mm256_setzero_ps(); +++ __m256 aVal1, aVal2, aVal3, aVal4; +++ __m256 cVal1, cVal2, cVal3, cVal4; +++ for (; number < thirtySecondthPoints; number++) { +++ aVal1 = _mm256_load_ps(aPtr); +++ aPtr += 8; +++ cVal1 = _mm256_dp_ps(aVal1, aVal1, 0xF1); +++ accumulator = _mm256_add_ps(accumulator, aVal1); // accumulator += x +++ +++ aVal2 = _mm256_load_ps(aPtr); +++ aPtr += 8; +++ cVal2 = _mm256_dp_ps(aVal2, aVal2, 0xF2); +++ accumulator = _mm256_add_ps(accumulator, aVal2); // accumulator += x +++ +++ aVal3 = _mm256_load_ps(aPtr); +++ aPtr += 8; +++ cVal3 = _mm256_dp_ps(aVal3, aVal3, 0xF4); +++ accumulator = _mm256_add_ps(accumulator, aVal3); // accumulator += x +++ +++ aVal4 = _mm256_load_ps(aPtr); +++ aPtr += 8; +++ cVal4 = _mm256_dp_ps(aVal4, aVal4, 0xF8); +++ accumulator = _mm256_add_ps(accumulator, aVal4); // accumulator += x +++ +++ cVal1 = _mm256_or_ps(cVal1, cVal2); +++ cVal3 = _mm256_or_ps(cVal3, cVal4); +++ cVal1 = _mm256_or_ps(cVal1, cVal3); +++ +++ squareAccumulator = +++ _mm256_add_ps(squareAccumulator, cVal1); // squareAccumulator += x^2 +++ } +++ _mm256_store_ps(meanBuffer, +++ accumulator); // Store the results back into the C container +++ _mm256_store_ps(squareBuffer, +++ squareAccumulator); // Store the results back into the C container +++ newMean = meanBuffer[0]; +++ newMean += meanBuffer[1]; +++ newMean += meanBuffer[2]; +++ newMean += meanBuffer[3]; +++ newMean += meanBuffer[4]; +++ newMean += meanBuffer[5]; +++ newMean += meanBuffer[6]; +++ newMean += meanBuffer[7]; +++ stdDev = squareBuffer[0]; +++ stdDev += squareBuffer[1]; +++ stdDev += squareBuffer[2]; +++ stdDev += squareBuffer[3]; +++ stdDev += squareBuffer[4]; +++ stdDev += squareBuffer[5]; +++ stdDev += squareBuffer[6]; +++ stdDev += squareBuffer[7]; +++ +++ number = thirtySecondthPoints * 32; +++ for (; number < num_points; number++) { +++ stdDev += (*aPtr) * (*aPtr); +++ newMean += *aPtr++; +++ } +++ newMean /= num_points; +++ stdDev /= num_points; +++ stdDev -= (newMean * newMean); +++ stdDev = sqrtf(stdDev); ++ } ++- newMean /= num_points; ++- stdDev /= num_points; ++- stdDev -= (newMean * newMean); ++- stdDev = sqrtf(stdDev); ++- } ++- *stddev = stdDev; ++- *mean = newMean; ++- +++ *stddev = stdDev; +++ *mean = newMean; ++ } ++ #endif /* LV_HAVE_AVX */ ++ ++@@ -160,151 +164,164 @@ volk_32f_stddev_and_mean_32f_x2_a_avx(float* stddev, float* mean, ++ #ifdef LV_HAVE_AVX ++ #include ++ ++-static inline void ++-volk_32f_stddev_and_mean_32f_x2_u_avx(float* stddev, float* mean, ++- const float* inputBuffer, ++- unsigned int num_points) +++static inline void volk_32f_stddev_and_mean_32f_x2_u_avx(float* stddev, +++ float* mean, +++ const float* inputBuffer, +++ unsigned int num_points) ++ { ++- float stdDev = 0; ++- float newMean = 0; ++- if(num_points > 0){ ++- unsigned int number = 0; ++- const unsigned int thirtySecondthPoints = num_points / 32; ++- ++- const float* aPtr = inputBuffer; ++- __VOLK_ATTR_ALIGNED(32) float meanBuffer[8]; ++- __VOLK_ATTR_ALIGNED(32) float squareBuffer[8]; ++- ++- __m256 accumulator = _mm256_setzero_ps(); ++- __m256 squareAccumulator = _mm256_setzero_ps(); ++- __m256 aVal1, aVal2, aVal3, aVal4; ++- __m256 cVal1, cVal2, cVal3, cVal4; ++- for(;number < thirtySecondthPoints; number++) { ++- aVal1 = _mm256_loadu_ps(aPtr); aPtr += 8; ++- cVal1 = 
_mm256_dp_ps(aVal1, aVal1, 0xF1); ++- accumulator = _mm256_add_ps(accumulator, aVal1); // accumulator += x ++- ++- aVal2 = _mm256_loadu_ps(aPtr); aPtr += 8; ++- cVal2 = _mm256_dp_ps(aVal2, aVal2, 0xF2); ++- accumulator = _mm256_add_ps(accumulator, aVal2); // accumulator += x ++- ++- aVal3 = _mm256_loadu_ps(aPtr); aPtr += 8; ++- cVal3 = _mm256_dp_ps(aVal3, aVal3, 0xF4); ++- accumulator = _mm256_add_ps(accumulator, aVal3); // accumulator += x ++- ++- aVal4 = _mm256_loadu_ps(aPtr); aPtr += 8; ++- cVal4 = _mm256_dp_ps(aVal4, aVal4, 0xF8); ++- accumulator = _mm256_add_ps(accumulator, aVal4); // accumulator += x ++- ++- cVal1 = _mm256_or_ps(cVal1, cVal2); ++- cVal3 = _mm256_or_ps(cVal3, cVal4); ++- cVal1 = _mm256_or_ps(cVal1, cVal3); ++- ++- squareAccumulator = _mm256_add_ps(squareAccumulator, cVal1); // squareAccumulator += x^2 ++- } ++- _mm256_store_ps(meanBuffer,accumulator); // Store the results back into the C container ++- _mm256_store_ps(squareBuffer,squareAccumulator); // Store the results back into the C container ++- newMean = meanBuffer[0]; ++- newMean += meanBuffer[1]; ++- newMean += meanBuffer[2]; ++- newMean += meanBuffer[3]; ++- newMean += meanBuffer[4]; ++- newMean += meanBuffer[5]; ++- newMean += meanBuffer[6]; ++- newMean += meanBuffer[7]; ++- stdDev = squareBuffer[0]; ++- stdDev += squareBuffer[1]; ++- stdDev += squareBuffer[2]; ++- stdDev += squareBuffer[3]; ++- stdDev += squareBuffer[4]; ++- stdDev += squareBuffer[5]; ++- stdDev += squareBuffer[6]; ++- stdDev += squareBuffer[7]; ++- ++- number = thirtySecondthPoints * 32; ++- for(;number < num_points; number++){ ++- stdDev += (*aPtr) * (*aPtr); ++- newMean += *aPtr++; +++ float stdDev = 0; +++ float newMean = 0; +++ if (num_points > 0) { +++ unsigned int number = 0; +++ const unsigned int thirtySecondthPoints = num_points / 32; +++ +++ const float* aPtr = inputBuffer; +++ __VOLK_ATTR_ALIGNED(32) float meanBuffer[8]; +++ __VOLK_ATTR_ALIGNED(32) float squareBuffer[8]; +++ +++ __m256 accumulator = _mm256_setzero_ps(); +++ __m256 squareAccumulator = _mm256_setzero_ps(); +++ __m256 aVal1, aVal2, aVal3, aVal4; +++ __m256 cVal1, cVal2, cVal3, cVal4; +++ for (; number < thirtySecondthPoints; number++) { +++ aVal1 = _mm256_loadu_ps(aPtr); +++ aPtr += 8; +++ cVal1 = _mm256_dp_ps(aVal1, aVal1, 0xF1); +++ accumulator = _mm256_add_ps(accumulator, aVal1); // accumulator += x +++ +++ aVal2 = _mm256_loadu_ps(aPtr); +++ aPtr += 8; +++ cVal2 = _mm256_dp_ps(aVal2, aVal2, 0xF2); +++ accumulator = _mm256_add_ps(accumulator, aVal2); // accumulator += x +++ +++ aVal3 = _mm256_loadu_ps(aPtr); +++ aPtr += 8; +++ cVal3 = _mm256_dp_ps(aVal3, aVal3, 0xF4); +++ accumulator = _mm256_add_ps(accumulator, aVal3); // accumulator += x +++ +++ aVal4 = _mm256_loadu_ps(aPtr); +++ aPtr += 8; +++ cVal4 = _mm256_dp_ps(aVal4, aVal4, 0xF8); +++ accumulator = _mm256_add_ps(accumulator, aVal4); // accumulator += x +++ +++ cVal1 = _mm256_or_ps(cVal1, cVal2); +++ cVal3 = _mm256_or_ps(cVal3, cVal4); +++ cVal1 = _mm256_or_ps(cVal1, cVal3); +++ +++ squareAccumulator = +++ _mm256_add_ps(squareAccumulator, cVal1); // squareAccumulator += x^2 +++ } +++ _mm256_store_ps(meanBuffer, +++ accumulator); // Store the results back into the C container +++ _mm256_store_ps(squareBuffer, +++ squareAccumulator); // Store the results back into the C container +++ newMean = meanBuffer[0]; +++ newMean += meanBuffer[1]; +++ newMean += meanBuffer[2]; +++ newMean += meanBuffer[3]; +++ newMean += meanBuffer[4]; +++ newMean += meanBuffer[5]; +++ newMean += meanBuffer[6]; +++ newMean += 
meanBuffer[7]; +++ stdDev = squareBuffer[0]; +++ stdDev += squareBuffer[1]; +++ stdDev += squareBuffer[2]; +++ stdDev += squareBuffer[3]; +++ stdDev += squareBuffer[4]; +++ stdDev += squareBuffer[5]; +++ stdDev += squareBuffer[6]; +++ stdDev += squareBuffer[7]; +++ +++ number = thirtySecondthPoints * 32; +++ for (; number < num_points; number++) { +++ stdDev += (*aPtr) * (*aPtr); +++ newMean += *aPtr++; +++ } +++ newMean /= num_points; +++ stdDev /= num_points; +++ stdDev -= (newMean * newMean); +++ stdDev = sqrtf(stdDev); ++ } ++- newMean /= num_points; ++- stdDev /= num_points; ++- stdDev -= (newMean * newMean); ++- stdDev = sqrtf(stdDev); ++- } ++- *stddev = stdDev; ++- *mean = newMean; ++- +++ *stddev = stdDev; +++ *mean = newMean; ++ } ++ #endif /* LV_HAVE_AVX */ ++ ++ ++ #ifdef LV_HAVE_SSE4_1 ++ #include ++-static inline void ++-volk_32f_stddev_and_mean_32f_x2_a_sse4_1(float* stddev, float* mean, ++- const float* inputBuffer, ++- unsigned int num_points) +++static inline void volk_32f_stddev_and_mean_32f_x2_a_sse4_1(float* stddev, +++ float* mean, +++ const float* inputBuffer, +++ unsigned int num_points) ++ { ++- float returnValue = 0; ++- float newMean = 0; ++- if(num_points > 0){ ++- unsigned int number = 0; ++- const unsigned int sixteenthPoints = num_points / 16; ++- ++- const float* aPtr = inputBuffer; ++- __VOLK_ATTR_ALIGNED(16) float meanBuffer[4]; ++- __VOLK_ATTR_ALIGNED(16) float squareBuffer[4]; ++- ++- __m128 accumulator = _mm_setzero_ps(); ++- __m128 squareAccumulator = _mm_setzero_ps(); ++- __m128 aVal1, aVal2, aVal3, aVal4; ++- __m128 cVal1, cVal2, cVal3, cVal4; ++- for(;number < sixteenthPoints; number++) { ++- aVal1 = _mm_load_ps(aPtr); aPtr += 4; ++- cVal1 = _mm_dp_ps(aVal1, aVal1, 0xF1); ++- accumulator = _mm_add_ps(accumulator, aVal1); // accumulator += x ++- ++- aVal2 = _mm_load_ps(aPtr); aPtr += 4; ++- cVal2 = _mm_dp_ps(aVal2, aVal2, 0xF2); ++- accumulator = _mm_add_ps(accumulator, aVal2); // accumulator += x ++- ++- aVal3 = _mm_load_ps(aPtr); aPtr += 4; ++- cVal3 = _mm_dp_ps(aVal3, aVal3, 0xF4); ++- accumulator = _mm_add_ps(accumulator, aVal3); // accumulator += x ++- ++- aVal4 = _mm_load_ps(aPtr); aPtr += 4; ++- cVal4 = _mm_dp_ps(aVal4, aVal4, 0xF8); ++- accumulator = _mm_add_ps(accumulator, aVal4); // accumulator += x ++- ++- cVal1 = _mm_or_ps(cVal1, cVal2); ++- cVal3 = _mm_or_ps(cVal3, cVal4); ++- cVal1 = _mm_or_ps(cVal1, cVal3); ++- ++- squareAccumulator = _mm_add_ps(squareAccumulator, cVal1); // squareAccumulator += x^2 ++- } ++- _mm_store_ps(meanBuffer,accumulator); // Store the results back into the C container ++- _mm_store_ps(squareBuffer,squareAccumulator); // Store the results back into the C container ++- newMean = meanBuffer[0]; ++- newMean += meanBuffer[1]; ++- newMean += meanBuffer[2]; ++- newMean += meanBuffer[3]; ++- returnValue = squareBuffer[0]; ++- returnValue += squareBuffer[1]; ++- returnValue += squareBuffer[2]; ++- returnValue += squareBuffer[3]; ++- ++- number = sixteenthPoints * 16; ++- for(;number < num_points; number++){ ++- returnValue += (*aPtr) * (*aPtr); ++- newMean += *aPtr++; +++ float returnValue = 0; +++ float newMean = 0; +++ if (num_points > 0) { +++ unsigned int number = 0; +++ const unsigned int sixteenthPoints = num_points / 16; +++ +++ const float* aPtr = inputBuffer; +++ __VOLK_ATTR_ALIGNED(16) float meanBuffer[4]; +++ __VOLK_ATTR_ALIGNED(16) float squareBuffer[4]; +++ +++ __m128 accumulator = _mm_setzero_ps(); +++ __m128 squareAccumulator = _mm_setzero_ps(); +++ __m128 aVal1, aVal2, aVal3, aVal4; +++ __m128 cVal1, 
cVal2, cVal3, cVal4; +++ for (; number < sixteenthPoints; number++) { +++ aVal1 = _mm_load_ps(aPtr); +++ aPtr += 4; +++ cVal1 = _mm_dp_ps(aVal1, aVal1, 0xF1); +++ accumulator = _mm_add_ps(accumulator, aVal1); // accumulator += x +++ +++ aVal2 = _mm_load_ps(aPtr); +++ aPtr += 4; +++ cVal2 = _mm_dp_ps(aVal2, aVal2, 0xF2); +++ accumulator = _mm_add_ps(accumulator, aVal2); // accumulator += x +++ +++ aVal3 = _mm_load_ps(aPtr); +++ aPtr += 4; +++ cVal3 = _mm_dp_ps(aVal3, aVal3, 0xF4); +++ accumulator = _mm_add_ps(accumulator, aVal3); // accumulator += x +++ +++ aVal4 = _mm_load_ps(aPtr); +++ aPtr += 4; +++ cVal4 = _mm_dp_ps(aVal4, aVal4, 0xF8); +++ accumulator = _mm_add_ps(accumulator, aVal4); // accumulator += x +++ +++ cVal1 = _mm_or_ps(cVal1, cVal2); +++ cVal3 = _mm_or_ps(cVal3, cVal4); +++ cVal1 = _mm_or_ps(cVal1, cVal3); +++ +++ squareAccumulator = +++ _mm_add_ps(squareAccumulator, cVal1); // squareAccumulator += x^2 +++ } +++ _mm_store_ps(meanBuffer, +++ accumulator); // Store the results back into the C container +++ _mm_store_ps(squareBuffer, +++ squareAccumulator); // Store the results back into the C container +++ newMean = meanBuffer[0]; +++ newMean += meanBuffer[1]; +++ newMean += meanBuffer[2]; +++ newMean += meanBuffer[3]; +++ returnValue = squareBuffer[0]; +++ returnValue += squareBuffer[1]; +++ returnValue += squareBuffer[2]; +++ returnValue += squareBuffer[3]; +++ +++ number = sixteenthPoints * 16; +++ for (; number < num_points; number++) { +++ returnValue += (*aPtr) * (*aPtr); +++ newMean += *aPtr++; +++ } +++ newMean /= num_points; +++ returnValue /= num_points; +++ returnValue -= (newMean * newMean); +++ returnValue = sqrtf(returnValue); ++ } ++- newMean /= num_points; ++- returnValue /= num_points; ++- returnValue -= (newMean * newMean); ++- returnValue = sqrtf(returnValue); ++- } ++- *stddev = returnValue; ++- *mean = newMean; +++ *stddev = returnValue; +++ *mean = newMean; ++ } ++ #endif /* LV_HAVE_SSE4_1 */ ++ ++@@ -312,86 +329,86 @@ volk_32f_stddev_and_mean_32f_x2_a_sse4_1(float* stddev, float* mean, ++ #ifdef LV_HAVE_SSE ++ #include ++ ++-static inline void ++-volk_32f_stddev_and_mean_32f_x2_a_sse(float* stddev, float* mean, ++- const float* inputBuffer, ++- unsigned int num_points) +++static inline void volk_32f_stddev_and_mean_32f_x2_a_sse(float* stddev, +++ float* mean, +++ const float* inputBuffer, +++ unsigned int num_points) ++ { ++- float returnValue = 0; ++- float newMean = 0; ++- if(num_points > 0){ ++- unsigned int number = 0; ++- const unsigned int quarterPoints = num_points / 4; ++- ++- const float* aPtr = inputBuffer; ++- __VOLK_ATTR_ALIGNED(16) float meanBuffer[4]; ++- __VOLK_ATTR_ALIGNED(16) float squareBuffer[4]; ++- ++- __m128 accumulator = _mm_setzero_ps(); ++- __m128 squareAccumulator = _mm_setzero_ps(); ++- __m128 aVal = _mm_setzero_ps(); ++- for(;number < quarterPoints; number++) { ++- aVal = _mm_load_ps(aPtr); // aVal = x ++- accumulator = _mm_add_ps(accumulator, aVal); // accumulator += x ++- aVal = _mm_mul_ps(aVal, aVal); // squareAccumulator += x^2 ++- squareAccumulator = _mm_add_ps(squareAccumulator, aVal); ++- aPtr += 4; +++ float returnValue = 0; +++ float newMean = 0; +++ if (num_points > 0) { +++ unsigned int number = 0; +++ const unsigned int quarterPoints = num_points / 4; +++ +++ const float* aPtr = inputBuffer; +++ __VOLK_ATTR_ALIGNED(16) float meanBuffer[4]; +++ __VOLK_ATTR_ALIGNED(16) float squareBuffer[4]; +++ +++ __m128 accumulator = _mm_setzero_ps(); +++ __m128 squareAccumulator = _mm_setzero_ps(); +++ __m128 aVal = 
_mm_setzero_ps(); +++ for (; number < quarterPoints; number++) { +++ aVal = _mm_load_ps(aPtr); // aVal = x +++ accumulator = _mm_add_ps(accumulator, aVal); // accumulator += x +++ aVal = _mm_mul_ps(aVal, aVal); // squareAccumulator += x^2 +++ squareAccumulator = _mm_add_ps(squareAccumulator, aVal); +++ aPtr += 4; +++ } +++ _mm_store_ps(meanBuffer, +++ accumulator); // Store the results back into the C container +++ _mm_store_ps(squareBuffer, +++ squareAccumulator); // Store the results back into the C container +++ newMean = meanBuffer[0]; +++ newMean += meanBuffer[1]; +++ newMean += meanBuffer[2]; +++ newMean += meanBuffer[3]; +++ returnValue = squareBuffer[0]; +++ returnValue += squareBuffer[1]; +++ returnValue += squareBuffer[2]; +++ returnValue += squareBuffer[3]; +++ +++ number = quarterPoints * 4; +++ for (; number < num_points; number++) { +++ returnValue += (*aPtr) * (*aPtr); +++ newMean += *aPtr++; +++ } +++ newMean /= num_points; +++ returnValue /= num_points; +++ returnValue -= (newMean * newMean); +++ returnValue = sqrtf(returnValue); ++ } ++- _mm_store_ps(meanBuffer,accumulator); // Store the results back into the C container ++- _mm_store_ps(squareBuffer,squareAccumulator); // Store the results back into the C container ++- newMean = meanBuffer[0]; ++- newMean += meanBuffer[1]; ++- newMean += meanBuffer[2]; ++- newMean += meanBuffer[3]; ++- returnValue = squareBuffer[0]; ++- returnValue += squareBuffer[1]; ++- returnValue += squareBuffer[2]; ++- returnValue += squareBuffer[3]; ++- ++- number = quarterPoints * 4; ++- for(;number < num_points; number++){ ++- returnValue += (*aPtr) * (*aPtr); ++- newMean += *aPtr++; ++- } ++- newMean /= num_points; ++- returnValue /= num_points; ++- returnValue -= (newMean * newMean); ++- returnValue = sqrtf(returnValue); ++- } ++- *stddev = returnValue; ++- *mean = newMean; +++ *stddev = returnValue; +++ *mean = newMean; ++ } ++ #endif /* LV_HAVE_SSE */ ++ ++ ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void ++-volk_32f_stddev_and_mean_32f_x2_generic(float* stddev, float* mean, ++- const float* inputBuffer, ++- unsigned int num_points) +++static inline void volk_32f_stddev_and_mean_32f_x2_generic(float* stddev, +++ float* mean, +++ const float* inputBuffer, +++ unsigned int num_points) ++ { ++- float returnValue = 0; ++- float newMean = 0; ++- if(num_points > 0){ ++- const float* aPtr = inputBuffer; ++- unsigned int number = 0; ++- ++- for(number = 0; number < num_points; number++){ ++- returnValue += (*aPtr) * (*aPtr); ++- newMean += *aPtr++; +++ float returnValue = 0; +++ float newMean = 0; +++ if (num_points > 0) { +++ const float* aPtr = inputBuffer; +++ unsigned int number = 0; +++ +++ for (number = 0; number < num_points; number++) { +++ returnValue += (*aPtr) * (*aPtr); +++ newMean += *aPtr++; +++ } +++ newMean /= num_points; +++ returnValue /= num_points; +++ returnValue -= (newMean * newMean); +++ returnValue = sqrtf(returnValue); ++ } ++- newMean /= num_points; ++- returnValue /= num_points; ++- returnValue -= (newMean * newMean); ++- returnValue = sqrtf(returnValue); ++- } ++- *stddev = returnValue; ++- *mean = newMean; +++ *stddev = returnValue; +++ *mean = newMean; ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++ ++- ++- ++ #endif /* INCLUDED_volk_32f_stddev_and_mean_32f_x2_a_H */ ++diff --git a/kernels/volk/volk_32f_tan_32f.h b/kernels/volk/volk_32f_tan_32f.h ++index 239b745..a623a66 100644 ++--- a/kernels/volk/volk_32f_tan_32f.h +++++ b/kernels/volk/volk_32f_tan_32f.h ++@@ -71,9 +71,9 @@ ++ * \endcode ++ */ ++ ++-#include ++-#include 
++ #include +++#include +++#include ++ ++ #ifndef INCLUDED_volk_32f_tan_32f_a_H ++ #define INCLUDED_volk_32f_tan_32f_a_H ++@@ -82,78 +82,102 @@ ++ #include ++ ++ static inline void ++-volk_32f_tan_32f_a_avx2_fma(float* bVector, const float* aVector, ++- unsigned int num_points) +++volk_32f_tan_32f_a_avx2_fma(float* bVector, const float* aVector, unsigned int num_points) ++ { ++- float* bPtr = bVector; ++- const float* aPtr = aVector; ++- ++- unsigned int number = 0; ++- unsigned int eighthPoints = num_points / 8; ++- unsigned int i = 0; ++- ++- __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, fzeroes; ++- __m256 sine, cosine, tangent, condition1, condition2, condition3; ++- __m256i q, r, ones, twos, fours; ++- ++- m4pi = _mm256_set1_ps(1.273239545); ++- pio4A = _mm256_set1_ps(0.78515625); ++- pio4B = _mm256_set1_ps(0.241876e-3); ++- ffours = _mm256_set1_ps(4.0); ++- ftwos = _mm256_set1_ps(2.0); ++- fones = _mm256_set1_ps(1.0); ++- fzeroes = _mm256_setzero_ps(); ++- ones = _mm256_set1_epi32(1); ++- twos = _mm256_set1_epi32(2); ++- fours = _mm256_set1_epi32(4); ++- ++- cp1 = _mm256_set1_ps(1.0); ++- cp2 = _mm256_set1_ps(0.83333333e-1); ++- cp3 = _mm256_set1_ps(0.2777778e-2); ++- cp4 = _mm256_set1_ps(0.49603e-4); ++- cp5 = _mm256_set1_ps(0.551e-6); ++- ++- for(;number < eighthPoints; number++){ ++- aVal = _mm256_load_ps(aPtr); ++- s = _mm256_sub_ps(aVal, _mm256_and_ps(_mm256_mul_ps(aVal, ftwos), _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS))); ++- q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi))); ++- r = _mm256_add_epi32(q, _mm256_and_si256(q, ones)); ++- ++- s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4A, s); ++- s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4B, s); ++- ++- s = _mm256_div_ps(s, _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction ++- s = _mm256_mul_ps(s, s); ++- // Evaluate Taylor series ++- s = _mm256_mul_ps(_mm256_fmadd_ps(_mm256_fmsub_ps(_mm256_fmadd_ps(_mm256_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2), s, cp1), s); ++- ++- for(i = 0; i < 3; i++){ ++- s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s)); +++ float* bPtr = bVector; +++ const float* aPtr = aVector; +++ +++ unsigned int number = 0; +++ unsigned int eighthPoints = num_points / 8; +++ unsigned int i = 0; +++ +++ __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, +++ fzeroes; +++ __m256 sine, cosine, tangent, condition1, condition2, condition3; +++ __m256i q, r, ones, twos, fours; +++ +++ m4pi = _mm256_set1_ps(1.273239545); +++ pio4A = _mm256_set1_ps(0.78515625); +++ pio4B = _mm256_set1_ps(0.241876e-3); +++ ffours = _mm256_set1_ps(4.0); +++ ftwos = _mm256_set1_ps(2.0); +++ fones = _mm256_set1_ps(1.0); +++ fzeroes = _mm256_setzero_ps(); +++ ones = _mm256_set1_epi32(1); +++ twos = _mm256_set1_epi32(2); +++ fours = _mm256_set1_epi32(4); +++ +++ cp1 = _mm256_set1_ps(1.0); +++ cp2 = _mm256_set1_ps(0.83333333e-1); +++ cp3 = _mm256_set1_ps(0.2777778e-2); +++ cp4 = _mm256_set1_ps(0.49603e-4); +++ cp5 = _mm256_set1_ps(0.551e-6); +++ +++ for (; number < eighthPoints; number++) { +++ aVal = _mm256_load_ps(aPtr); +++ s = _mm256_sub_ps(aVal, +++ _mm256_and_ps(_mm256_mul_ps(aVal, ftwos), +++ _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS))); +++ q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi))); +++ r = _mm256_add_epi32(q, _mm256_and_si256(q, ones)); +++ +++ s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4A, s); +++ s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4B, s); +++ +++ s = _mm256_div_ps( +++ s, +++ 
_mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction +++ s = _mm256_mul_ps(s, s); +++ // Evaluate Taylor series +++ s = _mm256_mul_ps( +++ _mm256_fmadd_ps( +++ _mm256_fmsub_ps( +++ _mm256_fmadd_ps(_mm256_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2), +++ s, +++ cp1), +++ s); +++ +++ for (i = 0; i < 3; i++) { +++ s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s)); +++ } +++ s = _mm256_div_ps(s, ftwos); +++ +++ sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s)); +++ cosine = _mm256_sub_ps(fones, s); +++ +++ condition1 = _mm256_cmp_ps( +++ _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)), +++ fzeroes, +++ _CMP_NEQ_UQ); +++ condition2 = _mm256_cmp_ps( +++ _mm256_cmp_ps( +++ _mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes, _CMP_NEQ_UQ), +++ _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS), +++ _CMP_NEQ_UQ); +++ condition3 = _mm256_cmp_ps( +++ _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, twos), fours)), +++ fzeroes, +++ _CMP_NEQ_UQ); +++ +++ __m256 temp = cosine; +++ cosine = +++ _mm256_add_ps(cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1)); +++ sine = _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(temp, sine), condition1)); +++ sine = _mm256_sub_ps( +++ sine, _mm256_and_ps(_mm256_mul_ps(sine, _mm256_set1_ps(2.0f)), condition2)); +++ cosine = _mm256_sub_ps( +++ cosine, +++ _mm256_and_ps(_mm256_mul_ps(cosine, _mm256_set1_ps(2.0f)), condition3)); +++ tangent = _mm256_div_ps(sine, cosine); +++ _mm256_store_ps(bPtr, tangent); +++ aPtr += 8; +++ bPtr += 8; +++ } +++ +++ number = eighthPoints * 8; +++ for (; number < num_points; number++) { +++ *bPtr++ = tan(*aPtr++); ++ } ++- s = _mm256_div_ps(s, ftwos); ++- ++- sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s)); ++- cosine = _mm256_sub_ps(fones, s); ++- ++- condition1 = _mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)), fzeroes, _CMP_NEQ_UQ); ++- condition2 = _mm256_cmp_ps(_mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes, _CMP_NEQ_UQ), _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS), _CMP_NEQ_UQ); ++- condition3 = _mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, twos), fours)), fzeroes, _CMP_NEQ_UQ); ++- ++- __m256 temp = cosine; ++- cosine = _mm256_add_ps(cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1)); ++- sine = _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(temp, sine), condition1)); ++- sine = _mm256_sub_ps(sine, _mm256_and_ps(_mm256_mul_ps(sine, _mm256_set1_ps(2.0f)), condition2)); ++- cosine = _mm256_sub_ps(cosine, _mm256_and_ps(_mm256_mul_ps(cosine, _mm256_set1_ps(2.0f)), condition3)); ++- tangent = _mm256_div_ps(sine, cosine); ++- _mm256_store_ps(bPtr, tangent); ++- aPtr += 8; ++- bPtr += 8; ++- } ++- ++- number = eighthPoints * 8; ++- for(;number < num_points; number++){ ++- *bPtr++ = tan(*aPtr++); ++- } ++ } ++ ++ #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for aligned */ ++@@ -162,78 +186,109 @@ volk_32f_tan_32f_a_avx2_fma(float* bVector, const float* aVector, ++ #include ++ ++ static inline void ++-volk_32f_tan_32f_a_avx2(float* bVector, const float* aVector, ++- unsigned int num_points) +++volk_32f_tan_32f_a_avx2(float* bVector, const float* aVector, unsigned int num_points) ++ { ++- float* bPtr = bVector; ++- const float* aPtr = aVector; ++- ++- unsigned int number = 0; ++- unsigned int eighthPoints = num_points / 8; ++- unsigned int i = 0; ++- ++- __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, fzeroes; ++- __m256 sine, cosine, 
tangent, condition1, condition2, condition3; ++- __m256i q, r, ones, twos, fours; ++- ++- m4pi = _mm256_set1_ps(1.273239545); ++- pio4A = _mm256_set1_ps(0.78515625); ++- pio4B = _mm256_set1_ps(0.241876e-3); ++- ffours = _mm256_set1_ps(4.0); ++- ftwos = _mm256_set1_ps(2.0); ++- fones = _mm256_set1_ps(1.0); ++- fzeroes = _mm256_setzero_ps(); ++- ones = _mm256_set1_epi32(1); ++- twos = _mm256_set1_epi32(2); ++- fours = _mm256_set1_epi32(4); ++- ++- cp1 = _mm256_set1_ps(1.0); ++- cp2 = _mm256_set1_ps(0.83333333e-1); ++- cp3 = _mm256_set1_ps(0.2777778e-2); ++- cp4 = _mm256_set1_ps(0.49603e-4); ++- cp5 = _mm256_set1_ps(0.551e-6); ++- ++- for(;number < eighthPoints; number++){ ++- aVal = _mm256_load_ps(aPtr); ++- s = _mm256_sub_ps(aVal, _mm256_and_ps(_mm256_mul_ps(aVal, ftwos), _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS))); ++- q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi))); ++- r = _mm256_add_epi32(q, _mm256_and_si256(q, ones)); ++- ++- s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4A)); ++- s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4B)); ++- ++- s = _mm256_div_ps(s, _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction ++- s = _mm256_mul_ps(s, s); ++- // Evaluate Taylor series ++- s = _mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(s, cp5), cp4), s), cp3), s), cp2), s), cp1), s); ++- ++- for(i = 0; i < 3; i++){ ++- s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s)); +++ float* bPtr = bVector; +++ const float* aPtr = aVector; +++ +++ unsigned int number = 0; +++ unsigned int eighthPoints = num_points / 8; +++ unsigned int i = 0; +++ +++ __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, +++ fzeroes; +++ __m256 sine, cosine, tangent, condition1, condition2, condition3; +++ __m256i q, r, ones, twos, fours; +++ +++ m4pi = _mm256_set1_ps(1.273239545); +++ pio4A = _mm256_set1_ps(0.78515625); +++ pio4B = _mm256_set1_ps(0.241876e-3); +++ ffours = _mm256_set1_ps(4.0); +++ ftwos = _mm256_set1_ps(2.0); +++ fones = _mm256_set1_ps(1.0); +++ fzeroes = _mm256_setzero_ps(); +++ ones = _mm256_set1_epi32(1); +++ twos = _mm256_set1_epi32(2); +++ fours = _mm256_set1_epi32(4); +++ +++ cp1 = _mm256_set1_ps(1.0); +++ cp2 = _mm256_set1_ps(0.83333333e-1); +++ cp3 = _mm256_set1_ps(0.2777778e-2); +++ cp4 = _mm256_set1_ps(0.49603e-4); +++ cp5 = _mm256_set1_ps(0.551e-6); +++ +++ for (; number < eighthPoints; number++) { +++ aVal = _mm256_load_ps(aPtr); +++ s = _mm256_sub_ps(aVal, +++ _mm256_and_ps(_mm256_mul_ps(aVal, ftwos), +++ _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS))); +++ q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi))); +++ r = _mm256_add_epi32(q, _mm256_and_si256(q, ones)); +++ +++ s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4A)); +++ s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4B)); +++ +++ s = _mm256_div_ps( +++ s, +++ _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction +++ s = _mm256_mul_ps(s, s); +++ // Evaluate Taylor series +++ s = _mm256_mul_ps( +++ _mm256_add_ps( +++ _mm256_mul_ps( +++ _mm256_sub_ps( +++ _mm256_mul_ps( +++ _mm256_add_ps( +++ _mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(s, cp5), cp4), +++ s), +++ cp3), +++ s), +++ cp2), +++ s), +++ cp1), +++ s); +++ +++ for (i = 0; i < 3; i++) { +++ s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s)); +++ } +++ s = _mm256_div_ps(s, ftwos); +++ +++ sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s)); +++ 
cosine = _mm256_sub_ps(fones, s); +++ +++ condition1 = _mm256_cmp_ps( +++ _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)), +++ fzeroes, +++ _CMP_NEQ_UQ); +++ condition2 = _mm256_cmp_ps( +++ _mm256_cmp_ps( +++ _mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes, _CMP_NEQ_UQ), +++ _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS), +++ _CMP_NEQ_UQ); +++ condition3 = _mm256_cmp_ps( +++ _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, twos), fours)), +++ fzeroes, +++ _CMP_NEQ_UQ); +++ +++ __m256 temp = cosine; +++ cosine = +++ _mm256_add_ps(cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1)); +++ sine = _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(temp, sine), condition1)); +++ sine = _mm256_sub_ps( +++ sine, _mm256_and_ps(_mm256_mul_ps(sine, _mm256_set1_ps(2.0f)), condition2)); +++ cosine = _mm256_sub_ps( +++ cosine, +++ _mm256_and_ps(_mm256_mul_ps(cosine, _mm256_set1_ps(2.0f)), condition3)); +++ tangent = _mm256_div_ps(sine, cosine); +++ _mm256_store_ps(bPtr, tangent); +++ aPtr += 8; +++ bPtr += 8; +++ } +++ +++ number = eighthPoints * 8; +++ for (; number < num_points; number++) { +++ *bPtr++ = tan(*aPtr++); ++ } ++- s = _mm256_div_ps(s, ftwos); ++- ++- sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s)); ++- cosine = _mm256_sub_ps(fones, s); ++- ++- condition1 = _mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)), fzeroes, _CMP_NEQ_UQ); ++- condition2 = _mm256_cmp_ps(_mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes, _CMP_NEQ_UQ), _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS), _CMP_NEQ_UQ); ++- condition3 = _mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, twos), fours)), fzeroes, _CMP_NEQ_UQ); ++- ++- __m256 temp = cosine; ++- cosine = _mm256_add_ps(cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1)); ++- sine = _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(temp, sine), condition1)); ++- sine = _mm256_sub_ps(sine, _mm256_and_ps(_mm256_mul_ps(sine, _mm256_set1_ps(2.0f)), condition2)); ++- cosine = _mm256_sub_ps(cosine, _mm256_and_ps(_mm256_mul_ps(cosine, _mm256_set1_ps(2.0f)), condition3)); ++- tangent = _mm256_div_ps(sine, cosine); ++- _mm256_store_ps(bPtr, tangent); ++- aPtr += 8; ++- bPtr += 8; ++- } ++- ++- number = eighthPoints * 8; ++- for(;number < num_points; number++){ ++- *bPtr++ = tan(*aPtr++); ++- } ++ } ++ ++ #endif /* LV_HAVE_AVX2 for aligned */ ++@@ -242,78 +297,97 @@ volk_32f_tan_32f_a_avx2(float* bVector, const float* aVector, ++ #include ++ ++ static inline void ++-volk_32f_tan_32f_a_sse4_1(float* bVector, const float* aVector, ++- unsigned int num_points) +++volk_32f_tan_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int num_points) ++ { ++- float* bPtr = bVector; ++- const float* aPtr = aVector; ++- ++- unsigned int number = 0; ++- unsigned int quarterPoints = num_points / 4; ++- unsigned int i = 0; ++- ++- __m128 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, fzeroes; ++- __m128 sine, cosine, tangent, condition1, condition2, condition3; ++- __m128i q, r, ones, twos, fours; ++- ++- m4pi = _mm_set1_ps(1.273239545); ++- pio4A = _mm_set1_ps(0.78515625); ++- pio4B = _mm_set1_ps(0.241876e-3); ++- ffours = _mm_set1_ps(4.0); ++- ftwos = _mm_set1_ps(2.0); ++- fones = _mm_set1_ps(1.0); ++- fzeroes = _mm_setzero_ps(); ++- ones = _mm_set1_epi32(1); ++- twos = _mm_set1_epi32(2); ++- fours = _mm_set1_epi32(4); ++- ++- cp1 = _mm_set1_ps(1.0); ++- cp2 = _mm_set1_ps(0.83333333e-1); ++- cp3 = 
_mm_set1_ps(0.2777778e-2); ++- cp4 = _mm_set1_ps(0.49603e-4); ++- cp5 = _mm_set1_ps(0.551e-6); ++- ++- for(;number < quarterPoints; number++){ ++- aVal = _mm_load_ps(aPtr); ++- s = _mm_sub_ps(aVal, _mm_and_ps(_mm_mul_ps(aVal, ftwos), _mm_cmplt_ps(aVal, fzeroes))); ++- q = _mm_cvtps_epi32(_mm_floor_ps(_mm_mul_ps(s, m4pi))); ++- r = _mm_add_epi32(q, _mm_and_si128(q, ones)); ++- ++- s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4A)); ++- s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4B)); ++- ++- s = _mm_div_ps(s, _mm_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction ++- s = _mm_mul_ps(s, s); ++- // Evaluate Taylor series ++- s = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s, cp5), cp4), s), cp3), s), cp2), s), cp1), s); ++- ++- for(i = 0; i < 3; i++){ ++- s = _mm_mul_ps(s, _mm_sub_ps(ffours, s)); +++ float* bPtr = bVector; +++ const float* aPtr = aVector; +++ +++ unsigned int number = 0; +++ unsigned int quarterPoints = num_points / 4; +++ unsigned int i = 0; +++ +++ __m128 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, +++ fzeroes; +++ __m128 sine, cosine, tangent, condition1, condition2, condition3; +++ __m128i q, r, ones, twos, fours; +++ +++ m4pi = _mm_set1_ps(1.273239545); +++ pio4A = _mm_set1_ps(0.78515625); +++ pio4B = _mm_set1_ps(0.241876e-3); +++ ffours = _mm_set1_ps(4.0); +++ ftwos = _mm_set1_ps(2.0); +++ fones = _mm_set1_ps(1.0); +++ fzeroes = _mm_setzero_ps(); +++ ones = _mm_set1_epi32(1); +++ twos = _mm_set1_epi32(2); +++ fours = _mm_set1_epi32(4); +++ +++ cp1 = _mm_set1_ps(1.0); +++ cp2 = _mm_set1_ps(0.83333333e-1); +++ cp3 = _mm_set1_ps(0.2777778e-2); +++ cp4 = _mm_set1_ps(0.49603e-4); +++ cp5 = _mm_set1_ps(0.551e-6); +++ +++ for (; number < quarterPoints; number++) { +++ aVal = _mm_load_ps(aPtr); +++ s = _mm_sub_ps(aVal, +++ _mm_and_ps(_mm_mul_ps(aVal, ftwos), _mm_cmplt_ps(aVal, fzeroes))); +++ q = _mm_cvtps_epi32(_mm_floor_ps(_mm_mul_ps(s, m4pi))); +++ r = _mm_add_epi32(q, _mm_and_si128(q, ones)); +++ +++ s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4A)); +++ s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4B)); +++ +++ s = _mm_div_ps( +++ s, _mm_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction +++ s = _mm_mul_ps(s, s); +++ // Evaluate Taylor series +++ s = _mm_mul_ps( +++ _mm_add_ps( +++ _mm_mul_ps( +++ _mm_sub_ps( +++ _mm_mul_ps( +++ _mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s, cp5), cp4), s), +++ cp3), +++ s), +++ cp2), +++ s), +++ cp1), +++ s); +++ +++ for (i = 0; i < 3; i++) { +++ s = _mm_mul_ps(s, _mm_sub_ps(ffours, s)); +++ } +++ s = _mm_div_ps(s, ftwos); +++ +++ sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s)); +++ cosine = _mm_sub_ps(fones, s); +++ +++ condition1 = _mm_cmpneq_ps( +++ _mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, ones), twos)), fzeroes); +++ condition2 = _mm_cmpneq_ps( +++ _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(q, fours)), fzeroes), +++ _mm_cmplt_ps(aVal, fzeroes)); +++ condition3 = _mm_cmpneq_ps( +++ _mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, twos), fours)), fzeroes); +++ +++ __m128 temp = cosine; +++ cosine = _mm_add_ps(cosine, _mm_and_ps(_mm_sub_ps(sine, cosine), condition1)); +++ sine = _mm_add_ps(sine, _mm_and_ps(_mm_sub_ps(temp, sine), condition1)); +++ sine = +++ _mm_sub_ps(sine, _mm_and_ps(_mm_mul_ps(sine, _mm_set1_ps(2.0f)), condition2)); +++ cosine = _mm_sub_ps( +++ cosine, _mm_and_ps(_mm_mul_ps(cosine, _mm_set1_ps(2.0f)), condition3)); +++ tangent = _mm_div_ps(sine, cosine); 
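/* Once the tangent for this block of four floats is stored and the pointers
 * advance, the vector loop exits after quarterPoints * 4 elements; the 0-3
 * leftover points are then computed with scalar tanf() in the tail loop below,
 * the same main-loop-plus-tail pattern the other SIMD kernels in this header
 * follow. */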
+++ _mm_store_ps(bPtr, tangent); +++ aPtr += 4; +++ bPtr += 4; +++ } +++ +++ number = quarterPoints * 4; +++ for (; number < num_points; number++) { +++ *bPtr++ = tanf(*aPtr++); ++ } ++- s = _mm_div_ps(s, ftwos); ++- ++- sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s)); ++- cosine = _mm_sub_ps(fones, s); ++- ++- condition1 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, ones), twos)), fzeroes); ++- condition2 = _mm_cmpneq_ps(_mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(q, fours)), fzeroes), _mm_cmplt_ps(aVal, fzeroes)); ++- condition3 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, twos), fours)), fzeroes); ++- ++- __m128 temp = cosine; ++- cosine = _mm_add_ps(cosine, _mm_and_ps(_mm_sub_ps(sine, cosine), condition1)); ++- sine = _mm_add_ps(sine, _mm_and_ps(_mm_sub_ps(temp, sine), condition1)); ++- sine = _mm_sub_ps(sine, _mm_and_ps(_mm_mul_ps(sine, _mm_set1_ps(2.0f)), condition2)); ++- cosine = _mm_sub_ps(cosine, _mm_and_ps(_mm_mul_ps(cosine, _mm_set1_ps(2.0f)), condition3)); ++- tangent = _mm_div_ps(sine, cosine); ++- _mm_store_ps(bPtr, tangent); ++- aPtr += 4; ++- bPtr += 4; ++- } ++- ++- number = quarterPoints * 4; ++- for(;number < num_points; number++){ ++- *bPtr++ = tanf(*aPtr++); ++- } ++ } ++ ++ #endif /* LV_HAVE_SSE4_1 for aligned */ ++@@ -328,78 +402,102 @@ volk_32f_tan_32f_a_sse4_1(float* bVector, const float* aVector, ++ #include ++ ++ static inline void ++-volk_32f_tan_32f_u_avx2_fma(float* bVector, const float* aVector, ++- unsigned int num_points) +++volk_32f_tan_32f_u_avx2_fma(float* bVector, const float* aVector, unsigned int num_points) ++ { ++- float* bPtr = bVector; ++- const float* aPtr = aVector; ++- ++- unsigned int number = 0; ++- unsigned int eighthPoints = num_points / 8; ++- unsigned int i = 0; ++- ++- __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, fzeroes; ++- __m256 sine, cosine, tangent, condition1, condition2, condition3; ++- __m256i q, r, ones, twos, fours; ++- ++- m4pi = _mm256_set1_ps(1.273239545); ++- pio4A = _mm256_set1_ps(0.78515625); ++- pio4B = _mm256_set1_ps(0.241876e-3); ++- ffours = _mm256_set1_ps(4.0); ++- ftwos = _mm256_set1_ps(2.0); ++- fones = _mm256_set1_ps(1.0); ++- fzeroes = _mm256_setzero_ps(); ++- ones = _mm256_set1_epi32(1); ++- twos = _mm256_set1_epi32(2); ++- fours = _mm256_set1_epi32(4); ++- ++- cp1 = _mm256_set1_ps(1.0); ++- cp2 = _mm256_set1_ps(0.83333333e-1); ++- cp3 = _mm256_set1_ps(0.2777778e-2); ++- cp4 = _mm256_set1_ps(0.49603e-4); ++- cp5 = _mm256_set1_ps(0.551e-6); ++- ++- for(;number < eighthPoints; number++){ ++- aVal = _mm256_loadu_ps(aPtr); ++- s = _mm256_sub_ps(aVal, _mm256_and_ps(_mm256_mul_ps(aVal, ftwos), _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS))); ++- q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi))); ++- r = _mm256_add_epi32(q, _mm256_and_si256(q, ones)); ++- ++- s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4A, s); ++- s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4B, s); ++- ++- s = _mm256_div_ps(s, _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction ++- s = _mm256_mul_ps(s, s); ++- // Evaluate Taylor series ++- s = _mm256_mul_ps(_mm256_fmadd_ps(_mm256_fmsub_ps(_mm256_fmadd_ps(_mm256_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2), s, cp1), s); ++- ++- for(i = 0; i < 3; i++){ ++- s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s)); +++ float* bPtr = bVector; +++ const float* aPtr = aVector; +++ +++ unsigned int number = 0; +++ unsigned int eighthPoints = num_points / 8; +++ unsigned int i = 0; +++ +++ __m256 aVal, s, 
m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, +++ fzeroes; +++ __m256 sine, cosine, tangent, condition1, condition2, condition3; +++ __m256i q, r, ones, twos, fours; +++ +++ m4pi = _mm256_set1_ps(1.273239545); +++ pio4A = _mm256_set1_ps(0.78515625); +++ pio4B = _mm256_set1_ps(0.241876e-3); +++ ffours = _mm256_set1_ps(4.0); +++ ftwos = _mm256_set1_ps(2.0); +++ fones = _mm256_set1_ps(1.0); +++ fzeroes = _mm256_setzero_ps(); +++ ones = _mm256_set1_epi32(1); +++ twos = _mm256_set1_epi32(2); +++ fours = _mm256_set1_epi32(4); +++ +++ cp1 = _mm256_set1_ps(1.0); +++ cp2 = _mm256_set1_ps(0.83333333e-1); +++ cp3 = _mm256_set1_ps(0.2777778e-2); +++ cp4 = _mm256_set1_ps(0.49603e-4); +++ cp5 = _mm256_set1_ps(0.551e-6); +++ +++ for (; number < eighthPoints; number++) { +++ aVal = _mm256_loadu_ps(aPtr); +++ s = _mm256_sub_ps(aVal, +++ _mm256_and_ps(_mm256_mul_ps(aVal, ftwos), +++ _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS))); +++ q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi))); +++ r = _mm256_add_epi32(q, _mm256_and_si256(q, ones)); +++ +++ s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4A, s); +++ s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4B, s); +++ +++ s = _mm256_div_ps( +++ s, +++ _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction +++ s = _mm256_mul_ps(s, s); +++ // Evaluate Taylor series +++ s = _mm256_mul_ps( +++ _mm256_fmadd_ps( +++ _mm256_fmsub_ps( +++ _mm256_fmadd_ps(_mm256_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2), +++ s, +++ cp1), +++ s); +++ +++ for (i = 0; i < 3; i++) { +++ s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s)); +++ } +++ s = _mm256_div_ps(s, ftwos); +++ +++ sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s)); +++ cosine = _mm256_sub_ps(fones, s); +++ +++ condition1 = _mm256_cmp_ps( +++ _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)), +++ fzeroes, +++ _CMP_NEQ_UQ); +++ condition2 = _mm256_cmp_ps( +++ _mm256_cmp_ps( +++ _mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes, _CMP_NEQ_UQ), +++ _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS), +++ _CMP_NEQ_UQ); +++ condition3 = _mm256_cmp_ps( +++ _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, twos), fours)), +++ fzeroes, +++ _CMP_NEQ_UQ); +++ +++ __m256 temp = cosine; +++ cosine = +++ _mm256_add_ps(cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1)); +++ sine = _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(temp, sine), condition1)); +++ sine = _mm256_sub_ps( +++ sine, _mm256_and_ps(_mm256_mul_ps(sine, _mm256_set1_ps(2.0f)), condition2)); +++ cosine = _mm256_sub_ps( +++ cosine, +++ _mm256_and_ps(_mm256_mul_ps(cosine, _mm256_set1_ps(2.0f)), condition3)); +++ tangent = _mm256_div_ps(sine, cosine); +++ _mm256_storeu_ps(bPtr, tangent); +++ aPtr += 8; +++ bPtr += 8; +++ } +++ +++ number = eighthPoints * 8; +++ for (; number < num_points; number++) { +++ *bPtr++ = tan(*aPtr++); ++ } ++- s = _mm256_div_ps(s, ftwos); ++- ++- sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s)); ++- cosine = _mm256_sub_ps(fones, s); ++- ++- condition1 = _mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)), fzeroes, _CMP_NEQ_UQ); ++- condition2 = _mm256_cmp_ps(_mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes, _CMP_NEQ_UQ), _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS), _CMP_NEQ_UQ); ++- condition3 = _mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, twos), fours)), fzeroes, _CMP_NEQ_UQ); ++- ++- __m256 temp = cosine; ++- cosine = _mm256_add_ps(cosine, 
_mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1)); ++- sine = _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(temp, sine), condition1)); ++- sine = _mm256_sub_ps(sine, _mm256_and_ps(_mm256_mul_ps(sine, _mm256_set1_ps(2.0f)), condition2)); ++- cosine = _mm256_sub_ps(cosine, _mm256_and_ps(_mm256_mul_ps(cosine, _mm256_set1_ps(2.0f)), condition3)); ++- tangent = _mm256_div_ps(sine, cosine); ++- _mm256_storeu_ps(bPtr, tangent); ++- aPtr += 8; ++- bPtr += 8; ++- } ++- ++- number = eighthPoints * 8; ++- for(;number < num_points; number++){ ++- *bPtr++ = tan(*aPtr++); ++- } ++ } ++ ++ #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for unaligned */ ++@@ -408,78 +506,109 @@ volk_32f_tan_32f_u_avx2_fma(float* bVector, const float* aVector, ++ #include ++ ++ static inline void ++-volk_32f_tan_32f_u_avx2(float* bVector, const float* aVector, ++- unsigned int num_points) +++volk_32f_tan_32f_u_avx2(float* bVector, const float* aVector, unsigned int num_points) ++ { ++- float* bPtr = bVector; ++- const float* aPtr = aVector; ++- ++- unsigned int number = 0; ++- unsigned int eighthPoints = num_points / 8; ++- unsigned int i = 0; ++- ++- __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, fzeroes; ++- __m256 sine, cosine, tangent, condition1, condition2, condition3; ++- __m256i q, r, ones, twos, fours; ++- ++- m4pi = _mm256_set1_ps(1.273239545); ++- pio4A = _mm256_set1_ps(0.78515625); ++- pio4B = _mm256_set1_ps(0.241876e-3); ++- ffours = _mm256_set1_ps(4.0); ++- ftwos = _mm256_set1_ps(2.0); ++- fones = _mm256_set1_ps(1.0); ++- fzeroes = _mm256_setzero_ps(); ++- ones = _mm256_set1_epi32(1); ++- twos = _mm256_set1_epi32(2); ++- fours = _mm256_set1_epi32(4); ++- ++- cp1 = _mm256_set1_ps(1.0); ++- cp2 = _mm256_set1_ps(0.83333333e-1); ++- cp3 = _mm256_set1_ps(0.2777778e-2); ++- cp4 = _mm256_set1_ps(0.49603e-4); ++- cp5 = _mm256_set1_ps(0.551e-6); ++- ++- for(;number < eighthPoints; number++){ ++- aVal = _mm256_loadu_ps(aPtr); ++- s = _mm256_sub_ps(aVal, _mm256_and_ps(_mm256_mul_ps(aVal, ftwos), _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS))); ++- q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi))); ++- r = _mm256_add_epi32(q, _mm256_and_si256(q, ones)); ++- ++- s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4A)); ++- s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4B)); ++- ++- s = _mm256_div_ps(s, _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction ++- s = _mm256_mul_ps(s, s); ++- // Evaluate Taylor series ++- s = _mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(s, cp5), cp4), s), cp3), s), cp2), s), cp1), s); ++- ++- for(i = 0; i < 3; i++){ ++- s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s)); +++ float* bPtr = bVector; +++ const float* aPtr = aVector; +++ +++ unsigned int number = 0; +++ unsigned int eighthPoints = num_points / 8; +++ unsigned int i = 0; +++ +++ __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, +++ fzeroes; +++ __m256 sine, cosine, tangent, condition1, condition2, condition3; +++ __m256i q, r, ones, twos, fours; +++ +++ m4pi = _mm256_set1_ps(1.273239545); +++ pio4A = _mm256_set1_ps(0.78515625); +++ pio4B = _mm256_set1_ps(0.241876e-3); +++ ffours = _mm256_set1_ps(4.0); +++ ftwos = _mm256_set1_ps(2.0); +++ fones = _mm256_set1_ps(1.0); +++ fzeroes = _mm256_setzero_ps(); +++ ones = _mm256_set1_epi32(1); +++ twos = _mm256_set1_epi32(2); +++ fours = _mm256_set1_epi32(4); +++ +++ cp1 = _mm256_set1_ps(1.0); 
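/* cp1..cp5 are the Taylor coefficients of 2 * (1 - cos(t)) in powers of t^2:
 * 1, 1/12, 1/360, 1/20160 and 1/1814400. The Horner chain below evaluates that
 * series at t = r/8 (the divide by 8.0 flagged as "2^N" in the loop below), and
 * the short for-loop then applies s = s * (4 - s) three times; each pass turns
 * 2 * (1 - cos(t)) into 2 * (1 - cos(2t)), undoing the /8 scaling, so that
 * s / 2 finally equals 1 - cos(r) for the reduced argument r. */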
+++ cp2 = _mm256_set1_ps(0.83333333e-1); +++ cp3 = _mm256_set1_ps(0.2777778e-2); +++ cp4 = _mm256_set1_ps(0.49603e-4); +++ cp5 = _mm256_set1_ps(0.551e-6); +++ +++ for (; number < eighthPoints; number++) { +++ aVal = _mm256_loadu_ps(aPtr); +++ s = _mm256_sub_ps(aVal, +++ _mm256_and_ps(_mm256_mul_ps(aVal, ftwos), +++ _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS))); +++ q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi))); +++ r = _mm256_add_epi32(q, _mm256_and_si256(q, ones)); +++ +++ s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4A)); +++ s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4B)); +++ +++ s = _mm256_div_ps( +++ s, +++ _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction +++ s = _mm256_mul_ps(s, s); +++ // Evaluate Taylor series +++ s = _mm256_mul_ps( +++ _mm256_add_ps( +++ _mm256_mul_ps( +++ _mm256_sub_ps( +++ _mm256_mul_ps( +++ _mm256_add_ps( +++ _mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(s, cp5), cp4), +++ s), +++ cp3), +++ s), +++ cp2), +++ s), +++ cp1), +++ s); +++ +++ for (i = 0; i < 3; i++) { +++ s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s)); +++ } +++ s = _mm256_div_ps(s, ftwos); +++ +++ sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s)); +++ cosine = _mm256_sub_ps(fones, s); +++ +++ condition1 = _mm256_cmp_ps( +++ _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)), +++ fzeroes, +++ _CMP_NEQ_UQ); +++ condition2 = _mm256_cmp_ps( +++ _mm256_cmp_ps( +++ _mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes, _CMP_NEQ_UQ), +++ _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS), +++ _CMP_NEQ_UQ); +++ condition3 = _mm256_cmp_ps( +++ _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, twos), fours)), +++ fzeroes, +++ _CMP_NEQ_UQ); +++ +++ __m256 temp = cosine; +++ cosine = +++ _mm256_add_ps(cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1)); +++ sine = _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(temp, sine), condition1)); +++ sine = _mm256_sub_ps( +++ sine, _mm256_and_ps(_mm256_mul_ps(sine, _mm256_set1_ps(2.0f)), condition2)); +++ cosine = _mm256_sub_ps( +++ cosine, +++ _mm256_and_ps(_mm256_mul_ps(cosine, _mm256_set1_ps(2.0f)), condition3)); +++ tangent = _mm256_div_ps(sine, cosine); +++ _mm256_storeu_ps(bPtr, tangent); +++ aPtr += 8; +++ bPtr += 8; +++ } +++ +++ number = eighthPoints * 8; +++ for (; number < num_points; number++) { +++ *bPtr++ = tan(*aPtr++); ++ } ++- s = _mm256_div_ps(s, ftwos); ++- ++- sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s)); ++- cosine = _mm256_sub_ps(fones, s); ++- ++- condition1 = _mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)), fzeroes, _CMP_NEQ_UQ); ++- condition2 = _mm256_cmp_ps(_mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes, _CMP_NEQ_UQ), _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS), _CMP_NEQ_UQ); ++- condition3 = _mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, twos), fours)), fzeroes, _CMP_NEQ_UQ); ++- ++- __m256 temp = cosine; ++- cosine = _mm256_add_ps(cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1)); ++- sine = _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(temp, sine), condition1)); ++- sine = _mm256_sub_ps(sine, _mm256_and_ps(_mm256_mul_ps(sine, _mm256_set1_ps(2.0f)), condition2)); ++- cosine = _mm256_sub_ps(cosine, _mm256_and_ps(_mm256_mul_ps(cosine, _mm256_set1_ps(2.0f)), condition3)); ++- tangent = _mm256_div_ps(sine, cosine); ++- _mm256_storeu_ps(bPtr, tangent); ++- aPtr += 8; ++- bPtr += 8; ++- } ++- ++- number = eighthPoints 
* 8; ++- for(;number < num_points; number++){ ++- *bPtr++ = tan(*aPtr++); ++- } ++ } ++ ++ #endif /* LV_HAVE_AVX2 for unaligned */ ++@@ -491,75 +620,95 @@ volk_32f_tan_32f_u_avx2(float* bVector, const float* aVector, ++ static inline void ++ volk_32f_tan_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int num_points) ++ { ++- float* bPtr = bVector; ++- const float* aPtr = aVector; ++- ++- unsigned int number = 0; ++- unsigned int quarterPoints = num_points / 4; ++- unsigned int i = 0; ++- ++- __m128 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, fzeroes; ++- __m128 sine, cosine, tangent, condition1, condition2, condition3; ++- __m128i q, r, ones, twos, fours; ++- ++- m4pi = _mm_set1_ps(1.273239545); ++- pio4A = _mm_set1_ps(0.78515625); ++- pio4B = _mm_set1_ps(0.241876e-3); ++- ffours = _mm_set1_ps(4.0); ++- ftwos = _mm_set1_ps(2.0); ++- fones = _mm_set1_ps(1.0); ++- fzeroes = _mm_setzero_ps(); ++- ones = _mm_set1_epi32(1); ++- twos = _mm_set1_epi32(2); ++- fours = _mm_set1_epi32(4); ++- ++- cp1 = _mm_set1_ps(1.0); ++- cp2 = _mm_set1_ps(0.83333333e-1); ++- cp3 = _mm_set1_ps(0.2777778e-2); ++- cp4 = _mm_set1_ps(0.49603e-4); ++- cp5 = _mm_set1_ps(0.551e-6); ++- ++- for(;number < quarterPoints; number++){ ++- aVal = _mm_loadu_ps(aPtr); ++- s = _mm_sub_ps(aVal, _mm_and_ps(_mm_mul_ps(aVal, ftwos), _mm_cmplt_ps(aVal, fzeroes))); ++- q = _mm_cvtps_epi32(_mm_floor_ps(_mm_mul_ps(s, m4pi))); ++- r = _mm_add_epi32(q, _mm_and_si128(q, ones)); ++- ++- s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4A)); ++- s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4B)); ++- ++- s = _mm_div_ps(s, _mm_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction ++- s = _mm_mul_ps(s, s); ++- // Evaluate Taylor series ++- s = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s, cp5), cp4), s), cp3), s), cp2), s), cp1), s); ++- ++- for(i = 0; i < 3; i++){ ++- s = _mm_mul_ps(s, _mm_sub_ps(ffours, s)); +++ float* bPtr = bVector; +++ const float* aPtr = aVector; +++ +++ unsigned int number = 0; +++ unsigned int quarterPoints = num_points / 4; +++ unsigned int i = 0; +++ +++ __m128 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, +++ fzeroes; +++ __m128 sine, cosine, tangent, condition1, condition2, condition3; +++ __m128i q, r, ones, twos, fours; +++ +++ m4pi = _mm_set1_ps(1.273239545); +++ pio4A = _mm_set1_ps(0.78515625); +++ pio4B = _mm_set1_ps(0.241876e-3); +++ ffours = _mm_set1_ps(4.0); +++ ftwos = _mm_set1_ps(2.0); +++ fones = _mm_set1_ps(1.0); +++ fzeroes = _mm_setzero_ps(); +++ ones = _mm_set1_epi32(1); +++ twos = _mm_set1_epi32(2); +++ fours = _mm_set1_epi32(4); +++ +++ cp1 = _mm_set1_ps(1.0); +++ cp2 = _mm_set1_ps(0.83333333e-1); +++ cp3 = _mm_set1_ps(0.2777778e-2); +++ cp4 = _mm_set1_ps(0.49603e-4); +++ cp5 = _mm_set1_ps(0.551e-6); +++ +++ for (; number < quarterPoints; number++) { +++ aVal = _mm_loadu_ps(aPtr); +++ s = _mm_sub_ps(aVal, +++ _mm_and_ps(_mm_mul_ps(aVal, ftwos), _mm_cmplt_ps(aVal, fzeroes))); +++ q = _mm_cvtps_epi32(_mm_floor_ps(_mm_mul_ps(s, m4pi))); +++ r = _mm_add_epi32(q, _mm_and_si128(q, ones)); +++ +++ s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4A)); +++ s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4B)); +++ +++ s = _mm_div_ps( +++ s, _mm_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction +++ s = _mm_mul_ps(s, s); +++ // Evaluate Taylor series +++ s = _mm_mul_ps( +++ _mm_add_ps( +++ _mm_mul_ps( +++ _mm_sub_ps( +++ 
_mm_mul_ps( +++ _mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s, cp5), cp4), s), +++ cp3), +++ s), +++ cp2), +++ s), +++ cp1), +++ s); +++ +++ for (i = 0; i < 3; i++) { +++ s = _mm_mul_ps(s, _mm_sub_ps(ffours, s)); +++ } +++ s = _mm_div_ps(s, ftwos); +++ +++ sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s)); +++ cosine = _mm_sub_ps(fones, s); +++ +++ condition1 = _mm_cmpneq_ps( +++ _mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, ones), twos)), fzeroes); +++ condition2 = _mm_cmpneq_ps( +++ _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(q, fours)), fzeroes), +++ _mm_cmplt_ps(aVal, fzeroes)); +++ condition3 = _mm_cmpneq_ps( +++ _mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, twos), fours)), fzeroes); +++ +++ __m128 temp = cosine; +++ cosine = _mm_add_ps(cosine, _mm_and_ps(_mm_sub_ps(sine, cosine), condition1)); +++ sine = _mm_add_ps(sine, _mm_and_ps(_mm_sub_ps(temp, sine), condition1)); +++ sine = +++ _mm_sub_ps(sine, _mm_and_ps(_mm_mul_ps(sine, _mm_set1_ps(2.0f)), condition2)); +++ cosine = _mm_sub_ps( +++ cosine, _mm_and_ps(_mm_mul_ps(cosine, _mm_set1_ps(2.0f)), condition3)); +++ tangent = _mm_div_ps(sine, cosine); +++ _mm_storeu_ps(bPtr, tangent); +++ aPtr += 4; +++ bPtr += 4; +++ } +++ +++ number = quarterPoints * 4; +++ for (; number < num_points; number++) { +++ *bPtr++ = tanf(*aPtr++); ++ } ++- s = _mm_div_ps(s, ftwos); ++- ++- sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s)); ++- cosine = _mm_sub_ps(fones, s); ++- ++- condition1 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, ones), twos)), fzeroes); ++- condition2 = _mm_cmpneq_ps(_mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(q, fours)), fzeroes), _mm_cmplt_ps(aVal, fzeroes)); ++- condition3 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, twos), fours)), fzeroes); ++- ++- __m128 temp = cosine; ++- cosine = _mm_add_ps(cosine, _mm_and_ps(_mm_sub_ps(sine, cosine), condition1)); ++- sine = _mm_add_ps(sine, _mm_and_ps(_mm_sub_ps(temp, sine), condition1)); ++- sine = _mm_sub_ps(sine, _mm_and_ps(_mm_mul_ps(sine, _mm_set1_ps(2.0f)), condition2)); ++- cosine = _mm_sub_ps(cosine, _mm_and_ps(_mm_mul_ps(cosine, _mm_set1_ps(2.0f)), condition3)); ++- tangent = _mm_div_ps(sine, cosine); ++- _mm_storeu_ps(bPtr, tangent); ++- aPtr += 4; ++- bPtr += 4; ++- } ++- ++- number = quarterPoints * 4; ++- for(;number < num_points; number++){ ++- *bPtr++ = tanf(*aPtr++); ++- } ++ } ++ ++ #endif /* LV_HAVE_SSE4_1 for unaligned */ ++@@ -568,16 +717,15 @@ volk_32f_tan_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int num ++ #ifdef LV_HAVE_GENERIC ++ ++ static inline void ++-volk_32f_tan_32f_generic(float* bVector, const float* aVector, ++- unsigned int num_points) +++volk_32f_tan_32f_generic(float* bVector, const float* aVector, unsigned int num_points) ++ { ++- float* bPtr = bVector; ++- const float* aPtr = aVector; ++- unsigned int number = 0; +++ float* bPtr = bVector; +++ const float* aPtr = aVector; +++ unsigned int number = 0; ++ ++- for(; number < num_points; number++){ ++- *bPtr++ = tanf(*aPtr++); ++- } +++ for (; number < num_points; number++) { +++ *bPtr++ = tanf(*aPtr++); +++ } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++@@ -587,30 +735,29 @@ volk_32f_tan_32f_generic(float* bVector, const float* aVector, ++ #include ++ ++ static inline void ++-volk_32f_tan_32f_neon(float* bVector, const float* aVector, ++- unsigned int num_points) +++volk_32f_tan_32f_neon(float* bVector, const float* aVector, unsigned int num_points) ++ { ++ unsigned int number = 0; ++ unsigned int quarter_points = num_points / 4; ++ float* 
bVectorPtr = bVector; ++ const float* aVectorPtr = aVector; ++- +++ ++ float32x4_t b_vec; ++ float32x4_t a_vec; ++- ++- for(number = 0; number < quarter_points; number++) { +++ +++ for (number = 0; number < quarter_points; number++) { ++ a_vec = vld1q_f32(aVectorPtr); ++ // Prefetch next one, speeds things up ++- __VOLK_PREFETCH(aVectorPtr+4); +++ __VOLK_PREFETCH(aVectorPtr + 4); ++ b_vec = _vtanq_f32(a_vec); ++ vst1q_f32(bVectorPtr, b_vec); ++ // move pointers ahead ++- bVectorPtr+=4; ++- aVectorPtr+=4; +++ bVectorPtr += 4; +++ aVectorPtr += 4; ++ } ++- +++ ++ // Deal with the rest ++- for(number = quarter_points * 4; number < num_points; number++) { +++ for (number = quarter_points * 4; number < num_points; number++) { ++ *bVectorPtr++ = tanf(*aVectorPtr++); ++ } ++ } ++diff --git a/kernels/volk/volk_32f_tanh_32f.h b/kernels/volk/volk_32f_tanh_32f.h ++index d49432d..f157d39 100644 ++--- a/kernels/volk/volk_32f_tanh_32f.h +++++ b/kernels/volk/volk_32f_tanh_32f.h ++@@ -69,22 +69,21 @@ ++ #define INCLUDED_volk_32f_tanh_32f_a_H ++ ++ #include ++-#include ++ #include +++#include ++ #include ++ ++ #ifdef LV_HAVE_GENERIC ++ ++ static inline void ++-volk_32f_tanh_32f_generic(float* cVector, const float* aVector, ++- unsigned int num_points) +++volk_32f_tanh_32f_generic(float* cVector, const float* aVector, unsigned int num_points) ++ { ++- unsigned int number = 0; ++- float* cPtr = cVector; ++- const float* aPtr = aVector; ++- for(; number < num_points; number++) { ++- *cPtr++ = tanhf(*aPtr++); ++- } +++ unsigned int number = 0; +++ float* cPtr = cVector; +++ const float* aPtr = aVector; +++ for (; number < num_points; number++) { +++ *cPtr++ = tanhf(*aPtr++); +++ } ++ } ++ ++ #endif /* LV_HAVE_GENERIC */ ++@@ -93,81 +92,88 @@ volk_32f_tanh_32f_generic(float* cVector, const float* aVector, ++ #ifdef LV_HAVE_GENERIC ++ ++ static inline void ++-volk_32f_tanh_32f_series(float* cVector, const float* aVector, ++- unsigned int num_points) +++volk_32f_tanh_32f_series(float* cVector, const float* aVector, unsigned int num_points) ++ { ++- unsigned int number = 0; ++- float* cPtr = cVector; ++- const float* aPtr = aVector; ++- for(; number < num_points; number++) { ++- if(*aPtr > 4.97) ++- *cPtr++ = 1; ++- else if(*aPtr <= -4.97) ++- *cPtr++ = -1; ++- else { ++- float x2 = (*aPtr) * (*aPtr); ++- float a = (*aPtr) * (135135.0f + x2 * (17325.0f + x2 * (378.0f + x2))); ++- float b = 135135.0f + x2 * (62370.0f + x2 * (3150.0f + x2 * 28.0f)); ++- *cPtr++ = a / b; ++- aPtr++; +++ unsigned int number = 0; +++ float* cPtr = cVector; +++ const float* aPtr = aVector; +++ for (; number < num_points; number++) { +++ if (*aPtr > 4.97) +++ *cPtr++ = 1; +++ else if (*aPtr <= -4.97) +++ *cPtr++ = -1; +++ else { +++ float x2 = (*aPtr) * (*aPtr); +++ float a = (*aPtr) * (135135.0f + x2 * (17325.0f + x2 * (378.0f + x2))); +++ float b = 135135.0f + x2 * (62370.0f + x2 * (3150.0f + x2 * 28.0f)); +++ *cPtr++ = a / b; +++ aPtr++; +++ } ++ } ++- } ++ } ++ ++ #endif /* LV_HAVE_GENERIC */ ++ ++ ++- ++ #ifdef LV_HAVE_SSE ++ #include ++ ++ static inline void ++-volk_32f_tanh_32f_a_sse(float* cVector, const float* aVector, ++- unsigned int num_points) +++volk_32f_tanh_32f_a_sse(float* cVector, const float* aVector, unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int quarterPoints = num_points / 4; ++- ++- float* cPtr = cVector; ++- const float* aPtr = aVector; ++- ++- __m128 aVal, cVal, x2, a, b; ++- __m128 const1, const2, const3, const4, const5, const6; ++- const1 = _mm_set_ps1(135135.0f); ++- const2 = 
_mm_set_ps1(17325.0f); ++- const3 = _mm_set_ps1(378.0f); ++- const4 = _mm_set_ps1(62370.0f); ++- const5 = _mm_set_ps1(3150.0f); ++- const6 = _mm_set_ps1(28.0f); ++- for(;number < quarterPoints; number++){ ++- ++- aVal = _mm_load_ps(aPtr); ++- x2 = _mm_mul_ps(aVal, aVal); ++- a = _mm_mul_ps(aVal, _mm_add_ps(const1, _mm_mul_ps(x2, _mm_add_ps(const2, _mm_mul_ps(x2, _mm_add_ps(const3, x2)))))); ++- b = _mm_add_ps(const1, _mm_mul_ps(x2, _mm_add_ps(const4, _mm_mul_ps(x2, _mm_add_ps(const5, _mm_mul_ps(x2, const6)))))); ++- ++- cVal = _mm_div_ps(a, b); ++- ++- _mm_store_ps(cPtr, cVal); // Store the results back into the C container ++- ++- aPtr += 4; ++- cPtr += 4; ++- } ++- ++- number = quarterPoints * 4; ++- for(;number < num_points; number++) { ++- if(*aPtr > 4.97) ++- *cPtr++ = 1; ++- else if(*aPtr <= -4.97) ++- *cPtr++ = -1; ++- else { ++- float x2 = (*aPtr) * (*aPtr); ++- float a = (*aPtr) * (135135.0f + x2 * (17325.0f + x2 * (378.0f + x2))); ++- float b = 135135.0f + x2 * (62370.0f + x2 * (3150.0f + x2 * 28.0f)); ++- *cPtr++ = a / b; ++- aPtr++; +++ unsigned int number = 0; +++ const unsigned int quarterPoints = num_points / 4; +++ +++ float* cPtr = cVector; +++ const float* aPtr = aVector; +++ +++ __m128 aVal, cVal, x2, a, b; +++ __m128 const1, const2, const3, const4, const5, const6; +++ const1 = _mm_set_ps1(135135.0f); +++ const2 = _mm_set_ps1(17325.0f); +++ const3 = _mm_set_ps1(378.0f); +++ const4 = _mm_set_ps1(62370.0f); +++ const5 = _mm_set_ps1(3150.0f); +++ const6 = _mm_set_ps1(28.0f); +++ for (; number < quarterPoints; number++) { +++ +++ aVal = _mm_load_ps(aPtr); +++ x2 = _mm_mul_ps(aVal, aVal); +++ a = _mm_mul_ps( +++ aVal, +++ _mm_add_ps( +++ const1, +++ _mm_mul_ps(x2, +++ _mm_add_ps(const2, _mm_mul_ps(x2, _mm_add_ps(const3, x2)))))); +++ b = _mm_add_ps( +++ const1, +++ _mm_mul_ps( +++ x2, +++ _mm_add_ps(const4, +++ _mm_mul_ps(x2, _mm_add_ps(const5, _mm_mul_ps(x2, const6)))))); +++ +++ cVal = _mm_div_ps(a, b); +++ +++ _mm_store_ps(cPtr, cVal); // Store the results back into the C container +++ +++ aPtr += 4; +++ cPtr += 4; +++ } +++ +++ number = quarterPoints * 4; +++ for (; number < num_points; number++) { +++ if (*aPtr > 4.97) +++ *cPtr++ = 1; +++ else if (*aPtr <= -4.97) +++ *cPtr++ = -1; +++ else { +++ float x2 = (*aPtr) * (*aPtr); +++ float a = (*aPtr) * (135135.0f + x2 * (17325.0f + x2 * (378.0f + x2))); +++ float b = 135135.0f + x2 * (62370.0f + x2 * (3150.0f + x2 * 28.0f)); +++ *cPtr++ = a / b; +++ aPtr++; +++ } ++ } ++- } ++ } ++ #endif /* LV_HAVE_SSE */ ++ ++@@ -176,52 +182,65 @@ volk_32f_tanh_32f_a_sse(float* cVector, const float* aVector, ++ #include ++ ++ static inline void ++-volk_32f_tanh_32f_a_avx(float* cVector, const float* aVector, ++- unsigned int num_points) +++volk_32f_tanh_32f_a_avx(float* cVector, const float* aVector, unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int eighthPoints = num_points / 8; ++- ++- float* cPtr = cVector; ++- const float* aPtr = aVector; ++- ++- __m256 aVal, cVal, x2, a, b; ++- __m256 const1, const2, const3, const4, const5, const6; ++- const1 = _mm256_set1_ps(135135.0f); ++- const2 = _mm256_set1_ps(17325.0f); ++- const3 = _mm256_set1_ps(378.0f); ++- const4 = _mm256_set1_ps(62370.0f); ++- const5 = _mm256_set1_ps(3150.0f); ++- const6 = _mm256_set1_ps(28.0f); ++- for(;number < eighthPoints; number++){ ++- ++- aVal = _mm256_load_ps(aPtr); ++- x2 = _mm256_mul_ps(aVal, aVal); ++- a = _mm256_mul_ps(aVal, _mm256_add_ps(const1, _mm256_mul_ps(x2, _mm256_add_ps(const2, _mm256_mul_ps(x2, _mm256_add_ps(const3, 
x2)))))); ++- b = _mm256_add_ps(const1, _mm256_mul_ps(x2, _mm256_add_ps(const4, _mm256_mul_ps(x2, _mm256_add_ps(const5, _mm256_mul_ps(x2, const6)))))); ++- ++- cVal = _mm256_div_ps(a, b); ++- ++- _mm256_store_ps(cPtr, cVal); // Store the results back into the C container ++- ++- aPtr += 8; ++- cPtr += 8; ++- } ++- ++- number = eighthPoints * 8; ++- for(;number < num_points; number++) { ++- if(*aPtr > 4.97) ++- *cPtr++ = 1; ++- else if(*aPtr <= -4.97) ++- *cPtr++ = -1; ++- else { ++- float x2 = (*aPtr) * (*aPtr); ++- float a = (*aPtr) * (135135.0f + x2 * (17325.0f + x2 * (378.0f + x2))); ++- float b = 135135.0f + x2 * (62370.0f + x2 * (3150.0f + x2 * 28.0f)); ++- *cPtr++ = a / b; ++- aPtr++; +++ unsigned int number = 0; +++ const unsigned int eighthPoints = num_points / 8; +++ +++ float* cPtr = cVector; +++ const float* aPtr = aVector; +++ +++ __m256 aVal, cVal, x2, a, b; +++ __m256 const1, const2, const3, const4, const5, const6; +++ const1 = _mm256_set1_ps(135135.0f); +++ const2 = _mm256_set1_ps(17325.0f); +++ const3 = _mm256_set1_ps(378.0f); +++ const4 = _mm256_set1_ps(62370.0f); +++ const5 = _mm256_set1_ps(3150.0f); +++ const6 = _mm256_set1_ps(28.0f); +++ for (; number < eighthPoints; number++) { +++ +++ aVal = _mm256_load_ps(aPtr); +++ x2 = _mm256_mul_ps(aVal, aVal); +++ a = _mm256_mul_ps( +++ aVal, +++ _mm256_add_ps( +++ const1, +++ _mm256_mul_ps( +++ x2, +++ _mm256_add_ps(const2, +++ _mm256_mul_ps(x2, _mm256_add_ps(const3, x2)))))); +++ b = _mm256_add_ps( +++ const1, +++ _mm256_mul_ps( +++ x2, +++ _mm256_add_ps( +++ const4, +++ _mm256_mul_ps(x2, +++ _mm256_add_ps(const5, _mm256_mul_ps(x2, const6)))))); +++ +++ cVal = _mm256_div_ps(a, b); +++ +++ _mm256_store_ps(cPtr, cVal); // Store the results back into the C container +++ +++ aPtr += 8; +++ cPtr += 8; +++ } +++ +++ number = eighthPoints * 8; +++ for (; number < num_points; number++) { +++ if (*aPtr > 4.97) +++ *cPtr++ = 1; +++ else if (*aPtr <= -4.97) +++ *cPtr++ = -1; +++ else { +++ float x2 = (*aPtr) * (*aPtr); +++ float a = (*aPtr) * (135135.0f + x2 * (17325.0f + x2 * (378.0f + x2))); +++ float b = 135135.0f + x2 * (62370.0f + x2 * (3150.0f + x2 * 28.0f)); +++ *cPtr++ = a / b; +++ aPtr++; +++ } ++ } ++- } ++ } ++ #endif /* LV_HAVE_AVX */ ++ ++@@ -229,52 +248,55 @@ volk_32f_tanh_32f_a_avx(float* cVector, const float* aVector, ++ #include ++ ++ static inline void ++-volk_32f_tanh_32f_a_avx_fma(float* cVector, const float* aVector, ++- unsigned int num_points) +++volk_32f_tanh_32f_a_avx_fma(float* cVector, const float* aVector, unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int eighthPoints = num_points / 8; ++- ++- float* cPtr = cVector; ++- const float* aPtr = aVector; ++- ++- __m256 aVal, cVal, x2, a, b; ++- __m256 const1, const2, const3, const4, const5, const6; ++- const1 = _mm256_set1_ps(135135.0f); ++- const2 = _mm256_set1_ps(17325.0f); ++- const3 = _mm256_set1_ps(378.0f); ++- const4 = _mm256_set1_ps(62370.0f); ++- const5 = _mm256_set1_ps(3150.0f); ++- const6 = _mm256_set1_ps(28.0f); ++- for(;number < eighthPoints; number++){ ++- ++- aVal = _mm256_load_ps(aPtr); ++- x2 = _mm256_mul_ps(aVal, aVal); ++- a = _mm256_mul_ps(aVal, _mm256_fmadd_ps(x2, _mm256_fmadd_ps(x2, _mm256_add_ps(const3, x2), const2),const1)); ++- b = _mm256_fmadd_ps(x2, _mm256_fmadd_ps(x2, _mm256_fmadd_ps(x2, const6, const5), const4), const1); ++- ++- cVal = _mm256_div_ps(a, b); ++- ++- _mm256_store_ps(cPtr, cVal); // Store the results back into the C container ++- ++- aPtr += 8; ++- cPtr += 8; ++- } ++- ++- number = eighthPoints 
* 8; ++- for(;number < num_points; number++) { ++- if(*aPtr > 4.97) ++- *cPtr++ = 1; ++- else if(*aPtr <= -4.97) ++- *cPtr++ = -1; ++- else { ++- float x2 = (*aPtr) * (*aPtr); ++- float a = (*aPtr) * (135135.0f + x2 * (17325.0f + x2 * (378.0f + x2))); ++- float b = 135135.0f + x2 * (62370.0f + x2 * (3150.0f + x2 * 28.0f)); ++- *cPtr++ = a / b; ++- aPtr++; +++ unsigned int number = 0; +++ const unsigned int eighthPoints = num_points / 8; +++ +++ float* cPtr = cVector; +++ const float* aPtr = aVector; +++ +++ __m256 aVal, cVal, x2, a, b; +++ __m256 const1, const2, const3, const4, const5, const6; +++ const1 = _mm256_set1_ps(135135.0f); +++ const2 = _mm256_set1_ps(17325.0f); +++ const3 = _mm256_set1_ps(378.0f); +++ const4 = _mm256_set1_ps(62370.0f); +++ const5 = _mm256_set1_ps(3150.0f); +++ const6 = _mm256_set1_ps(28.0f); +++ for (; number < eighthPoints; number++) { +++ +++ aVal = _mm256_load_ps(aPtr); +++ x2 = _mm256_mul_ps(aVal, aVal); +++ a = _mm256_mul_ps( +++ aVal, +++ _mm256_fmadd_ps( +++ x2, _mm256_fmadd_ps(x2, _mm256_add_ps(const3, x2), const2), const1)); +++ b = _mm256_fmadd_ps( +++ x2, _mm256_fmadd_ps(x2, _mm256_fmadd_ps(x2, const6, const5), const4), const1); +++ +++ cVal = _mm256_div_ps(a, b); +++ +++ _mm256_store_ps(cPtr, cVal); // Store the results back into the C container +++ +++ aPtr += 8; +++ cPtr += 8; +++ } +++ +++ number = eighthPoints * 8; +++ for (; number < num_points; number++) { +++ if (*aPtr > 4.97) +++ *cPtr++ = 1; +++ else if (*aPtr <= -4.97) +++ *cPtr++ = -1; +++ else { +++ float x2 = (*aPtr) * (*aPtr); +++ float a = (*aPtr) * (135135.0f + x2 * (17325.0f + x2 * (378.0f + x2))); +++ float b = 135135.0f + x2 * (62370.0f + x2 * (3150.0f + x2 * 28.0f)); +++ *cPtr++ = a / b; +++ aPtr++; +++ } ++ } ++- } ++ } ++ #endif /* LV_HAVE_AVX && LV_HAVE_FMA */ ++ ++@@ -285,8 +307,8 @@ volk_32f_tanh_32f_a_avx_fma(float* cVector, const float* aVector, ++ #define INCLUDED_volk_32f_tanh_32f_u_H ++ ++ #include ++-#include ++ #include +++#include ++ #include ++ ++ ++@@ -294,52 +316,61 @@ volk_32f_tanh_32f_a_avx_fma(float* cVector, const float* aVector, ++ #include ++ ++ static inline void ++-volk_32f_tanh_32f_u_sse(float* cVector, const float* aVector, ++- unsigned int num_points) +++volk_32f_tanh_32f_u_sse(float* cVector, const float* aVector, unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int quarterPoints = num_points / 4; ++- ++- float* cPtr = cVector; ++- const float* aPtr = aVector; ++- ++- __m128 aVal, cVal, x2, a, b; ++- __m128 const1, const2, const3, const4, const5, const6; ++- const1 = _mm_set_ps1(135135.0f); ++- const2 = _mm_set_ps1(17325.0f); ++- const3 = _mm_set_ps1(378.0f); ++- const4 = _mm_set_ps1(62370.0f); ++- const5 = _mm_set_ps1(3150.0f); ++- const6 = _mm_set_ps1(28.0f); ++- for(;number < quarterPoints; number++){ ++- ++- aVal = _mm_loadu_ps(aPtr); ++- x2 = _mm_mul_ps(aVal, aVal); ++- a = _mm_mul_ps(aVal, _mm_add_ps(const1, _mm_mul_ps(x2, _mm_add_ps(const2, _mm_mul_ps(x2, _mm_add_ps(const3, x2)))))); ++- b = _mm_add_ps(const1, _mm_mul_ps(x2, _mm_add_ps(const4, _mm_mul_ps(x2, _mm_add_ps(const5, _mm_mul_ps(x2, const6)))))); ++- ++- cVal = _mm_div_ps(a, b); ++- ++- _mm_storeu_ps(cPtr, cVal); // Store the results back into the C container ++- ++- aPtr += 4; ++- cPtr += 4; ++- } ++- ++- number = quarterPoints * 4; ++- for(;number < num_points; number++) { ++- if(*aPtr > 4.97) ++- *cPtr++ = 1; ++- else if(*aPtr <= -4.97) ++- *cPtr++ = -1; ++- else { ++- float x2 = (*aPtr) * (*aPtr); ++- float a = (*aPtr) * (135135.0f + x2 * (17325.0f + x2 * 
(378.0f + x2))); ++- float b = 135135.0f + x2 * (62370.0f + x2 * (3150.0f + x2 * 28.0f)); ++- *cPtr++ = a / b; ++- aPtr++; +++ unsigned int number = 0; +++ const unsigned int quarterPoints = num_points / 4; +++ +++ float* cPtr = cVector; +++ const float* aPtr = aVector; +++ +++ __m128 aVal, cVal, x2, a, b; +++ __m128 const1, const2, const3, const4, const5, const6; +++ const1 = _mm_set_ps1(135135.0f); +++ const2 = _mm_set_ps1(17325.0f); +++ const3 = _mm_set_ps1(378.0f); +++ const4 = _mm_set_ps1(62370.0f); +++ const5 = _mm_set_ps1(3150.0f); +++ const6 = _mm_set_ps1(28.0f); +++ for (; number < quarterPoints; number++) { +++ +++ aVal = _mm_loadu_ps(aPtr); +++ x2 = _mm_mul_ps(aVal, aVal); +++ a = _mm_mul_ps( +++ aVal, +++ _mm_add_ps( +++ const1, +++ _mm_mul_ps(x2, +++ _mm_add_ps(const2, _mm_mul_ps(x2, _mm_add_ps(const3, x2)))))); +++ b = _mm_add_ps( +++ const1, +++ _mm_mul_ps( +++ x2, +++ _mm_add_ps(const4, +++ _mm_mul_ps(x2, _mm_add_ps(const5, _mm_mul_ps(x2, const6)))))); +++ +++ cVal = _mm_div_ps(a, b); +++ +++ _mm_storeu_ps(cPtr, cVal); // Store the results back into the C container +++ +++ aPtr += 4; +++ cPtr += 4; +++ } +++ +++ number = quarterPoints * 4; +++ for (; number < num_points; number++) { +++ if (*aPtr > 4.97) +++ *cPtr++ = 1; +++ else if (*aPtr <= -4.97) +++ *cPtr++ = -1; +++ else { +++ float x2 = (*aPtr) * (*aPtr); +++ float a = (*aPtr) * (135135.0f + x2 * (17325.0f + x2 * (378.0f + x2))); +++ float b = 135135.0f + x2 * (62370.0f + x2 * (3150.0f + x2 * 28.0f)); +++ *cPtr++ = a / b; +++ aPtr++; +++ } ++ } ++- } ++ } ++ #endif /* LV_HAVE_SSE */ ++ ++@@ -348,52 +379,65 @@ volk_32f_tanh_32f_u_sse(float* cVector, const float* aVector, ++ #include ++ ++ static inline void ++-volk_32f_tanh_32f_u_avx(float* cVector, const float* aVector, ++- unsigned int num_points) +++volk_32f_tanh_32f_u_avx(float* cVector, const float* aVector, unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int eighthPoints = num_points / 8; ++- ++- float* cPtr = cVector; ++- const float* aPtr = aVector; ++- ++- __m256 aVal, cVal, x2, a, b; ++- __m256 const1, const2, const3, const4, const5, const6; ++- const1 = _mm256_set1_ps(135135.0f); ++- const2 = _mm256_set1_ps(17325.0f); ++- const3 = _mm256_set1_ps(378.0f); ++- const4 = _mm256_set1_ps(62370.0f); ++- const5 = _mm256_set1_ps(3150.0f); ++- const6 = _mm256_set1_ps(28.0f); ++- for(;number < eighthPoints; number++){ ++- ++- aVal = _mm256_loadu_ps(aPtr); ++- x2 = _mm256_mul_ps(aVal, aVal); ++- a = _mm256_mul_ps(aVal, _mm256_add_ps(const1, _mm256_mul_ps(x2, _mm256_add_ps(const2, _mm256_mul_ps(x2, _mm256_add_ps(const3, x2)))))); ++- b = _mm256_add_ps(const1, _mm256_mul_ps(x2, _mm256_add_ps(const4, _mm256_mul_ps(x2, _mm256_add_ps(const5, _mm256_mul_ps(x2, const6)))))); ++- ++- cVal = _mm256_div_ps(a, b); ++- ++- _mm256_storeu_ps(cPtr, cVal); // Store the results back into the C container ++- ++- aPtr += 8; ++- cPtr += 8; ++- } ++- ++- number = eighthPoints * 8; ++- for(;number < num_points; number++) { ++- if(*aPtr > 4.97) ++- *cPtr++ = 1; ++- else if(*aPtr <= -4.97) ++- *cPtr++ = -1; ++- else { ++- float x2 = (*aPtr) * (*aPtr); ++- float a = (*aPtr) * (135135.0f + x2 * (17325.0f + x2 * (378.0f + x2))); ++- float b = 135135.0f + x2 * (62370.0f + x2 * (3150.0f + x2 * 28.0f)); ++- *cPtr++ = a / b; ++- aPtr++; +++ unsigned int number = 0; +++ const unsigned int eighthPoints = num_points / 8; +++ +++ float* cPtr = cVector; +++ const float* aPtr = aVector; +++ +++ __m256 aVal, cVal, x2, a, b; +++ __m256 const1, const2, const3, const4, const5, 
const6; +++ const1 = _mm256_set1_ps(135135.0f); +++ const2 = _mm256_set1_ps(17325.0f); +++ const3 = _mm256_set1_ps(378.0f); +++ const4 = _mm256_set1_ps(62370.0f); +++ const5 = _mm256_set1_ps(3150.0f); +++ const6 = _mm256_set1_ps(28.0f); +++ for (; number < eighthPoints; number++) { +++ +++ aVal = _mm256_loadu_ps(aPtr); +++ x2 = _mm256_mul_ps(aVal, aVal); +++ a = _mm256_mul_ps( +++ aVal, +++ _mm256_add_ps( +++ const1, +++ _mm256_mul_ps( +++ x2, +++ _mm256_add_ps(const2, +++ _mm256_mul_ps(x2, _mm256_add_ps(const3, x2)))))); +++ b = _mm256_add_ps( +++ const1, +++ _mm256_mul_ps( +++ x2, +++ _mm256_add_ps( +++ const4, +++ _mm256_mul_ps(x2, +++ _mm256_add_ps(const5, _mm256_mul_ps(x2, const6)))))); +++ +++ cVal = _mm256_div_ps(a, b); +++ +++ _mm256_storeu_ps(cPtr, cVal); // Store the results back into the C container +++ +++ aPtr += 8; +++ cPtr += 8; +++ } +++ +++ number = eighthPoints * 8; +++ for (; number < num_points; number++) { +++ if (*aPtr > 4.97) +++ *cPtr++ = 1; +++ else if (*aPtr <= -4.97) +++ *cPtr++ = -1; +++ else { +++ float x2 = (*aPtr) * (*aPtr); +++ float a = (*aPtr) * (135135.0f + x2 * (17325.0f + x2 * (378.0f + x2))); +++ float b = 135135.0f + x2 * (62370.0f + x2 * (3150.0f + x2 * 28.0f)); +++ *cPtr++ = a / b; +++ aPtr++; +++ } ++ } ++- } ++ } ++ #endif /* LV_HAVE_AVX */ ++ ++@@ -401,52 +445,55 @@ volk_32f_tanh_32f_u_avx(float* cVector, const float* aVector, ++ #include ++ ++ static inline void ++-volk_32f_tanh_32f_u_avx_fma(float* cVector, const float* aVector, ++- unsigned int num_points) +++volk_32f_tanh_32f_u_avx_fma(float* cVector, const float* aVector, unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int eighthPoints = num_points / 8; ++- ++- float* cPtr = cVector; ++- const float* aPtr = aVector; ++- ++- __m256 aVal, cVal, x2, a, b; ++- __m256 const1, const2, const3, const4, const5, const6; ++- const1 = _mm256_set1_ps(135135.0f); ++- const2 = _mm256_set1_ps(17325.0f); ++- const3 = _mm256_set1_ps(378.0f); ++- const4 = _mm256_set1_ps(62370.0f); ++- const5 = _mm256_set1_ps(3150.0f); ++- const6 = _mm256_set1_ps(28.0f); ++- for(;number < eighthPoints; number++){ ++- ++- aVal = _mm256_loadu_ps(aPtr); ++- x2 = _mm256_mul_ps(aVal, aVal); ++- a = _mm256_mul_ps(aVal, _mm256_fmadd_ps(x2, _mm256_fmadd_ps(x2, _mm256_add_ps(const3, x2), const2),const1)); ++- b = _mm256_fmadd_ps(x2, _mm256_fmadd_ps(x2, _mm256_fmadd_ps(x2, const6, const5), const4), const1); ++- ++- cVal = _mm256_div_ps(a, b); ++- ++- _mm256_storeu_ps(cPtr, cVal); // Store the results back into the C container ++- ++- aPtr += 8; ++- cPtr += 8; ++- } ++- ++- number = eighthPoints * 8; ++- for(;number < num_points; number++) { ++- if(*aPtr > 4.97) ++- *cPtr++ = 1; ++- else if(*aPtr <= -4.97) ++- *cPtr++ = -1; ++- else { ++- float x2 = (*aPtr) * (*aPtr); ++- float a = (*aPtr) * (135135.0f + x2 * (17325.0f + x2 * (378.0f + x2))); ++- float b = 135135.0f + x2 * (62370.0f + x2 * (3150.0f + x2 * 28.0f)); ++- *cPtr++ = a / b; ++- aPtr++; +++ unsigned int number = 0; +++ const unsigned int eighthPoints = num_points / 8; +++ +++ float* cPtr = cVector; +++ const float* aPtr = aVector; +++ +++ __m256 aVal, cVal, x2, a, b; +++ __m256 const1, const2, const3, const4, const5, const6; +++ const1 = _mm256_set1_ps(135135.0f); +++ const2 = _mm256_set1_ps(17325.0f); +++ const3 = _mm256_set1_ps(378.0f); +++ const4 = _mm256_set1_ps(62370.0f); +++ const5 = _mm256_set1_ps(3150.0f); +++ const6 = _mm256_set1_ps(28.0f); +++ for (; number < eighthPoints; number++) { +++ +++ aVal = _mm256_loadu_ps(aPtr); +++ x2 = 
_mm256_mul_ps(aVal, aVal); +++ a = _mm256_mul_ps( +++ aVal, +++ _mm256_fmadd_ps( +++ x2, _mm256_fmadd_ps(x2, _mm256_add_ps(const3, x2), const2), const1)); +++ b = _mm256_fmadd_ps( +++ x2, _mm256_fmadd_ps(x2, _mm256_fmadd_ps(x2, const6, const5), const4), const1); +++ +++ cVal = _mm256_div_ps(a, b); +++ +++ _mm256_storeu_ps(cPtr, cVal); // Store the results back into the C container +++ +++ aPtr += 8; +++ cPtr += 8; +++ } +++ +++ number = eighthPoints * 8; +++ for (; number < num_points; number++) { +++ if (*aPtr > 4.97) +++ *cPtr++ = 1; +++ else if (*aPtr <= -4.97) +++ *cPtr++ = -1; +++ else { +++ float x2 = (*aPtr) * (*aPtr); +++ float a = (*aPtr) * (135135.0f + x2 * (17325.0f + x2 * (378.0f + x2))); +++ float b = 135135.0f + x2 * (62370.0f + x2 * (3150.0f + x2 * 28.0f)); +++ *cPtr++ = a / b; +++ aPtr++; +++ } ++ } ++- } ++ } ++ #endif /* LV_HAVE_AVX && LV_HAVE_FMA */ ++ ++diff --git a/kernels/volk/volk_32f_x2_add_32f.h b/kernels/volk/volk_32f_x2_add_32f.h ++index ce18092..e4b7e93 100644 ++--- a/kernels/volk/volk_32f_x2_add_32f.h +++++ b/kernels/volk/volk_32f_x2_add_32f.h ++@@ -31,8 +31,8 @@ ++ * ++ * Dispatcher Prototype ++ * \code ++- * void volk_32f_x2_add_32f(float* cVector, const float* aVector, const float* bVector, unsigned int num_points) ++- * \endcode +++ * void volk_32f_x2_add_32f(float* cVector, const float* aVector, const float* bVector, +++ * unsigned int num_points) \endcode ++ * ++ * \b Inputs ++ * \li aVector: First vector of input points. ++@@ -44,7 +44,8 @@ ++ * ++ * \b Example ++ * ++- * The follow example adds the increasing and decreasing vectors such that the result of every summation pair is 10 +++ * The follow example adds the increasing and decreasing vectors such that the result of +++ * every summation pair is 10 ++ * ++ * \code ++ * int N = 10; ++@@ -79,37 +80,38 @@ ++ #ifdef LV_HAVE_AVX512F ++ #include ++ ++-static inline void ++-volk_32f_x2_add_32f_u_avx512f(float* cVector, const float* aVector, ++- const float* bVector, unsigned int num_points) +++static inline void volk_32f_x2_add_32f_u_avx512f(float* cVector, +++ const float* aVector, +++ const float* bVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int sixteenthPoints = num_points / 16; +++ unsigned int number = 0; +++ const unsigned int sixteenthPoints = num_points / 16; ++ ++- float* cPtr = cVector; ++- const float* aPtr = aVector; ++- const float* bPtr= bVector; +++ float* cPtr = cVector; +++ const float* aPtr = aVector; +++ const float* bPtr = bVector; ++ ++- __m512 aVal, bVal, cVal; ++- for(;number < sixteenthPoints; number++){ +++ __m512 aVal, bVal, cVal; +++ for (; number < sixteenthPoints; number++) { ++ ++- aVal = _mm512_loadu_ps(aPtr); ++- bVal = _mm512_loadu_ps(bPtr); +++ aVal = _mm512_loadu_ps(aPtr); +++ bVal = _mm512_loadu_ps(bPtr); ++ ++- cVal = _mm512_add_ps(aVal, bVal); +++ cVal = _mm512_add_ps(aVal, bVal); ++ ++- _mm512_storeu_ps(cPtr,cVal); // Store the results back into the C container +++ _mm512_storeu_ps(cPtr, cVal); // Store the results back into the C container ++ ++- aPtr += 16; ++- bPtr += 16; ++- cPtr += 16; ++- } +++ aPtr += 16; +++ bPtr += 16; +++ cPtr += 16; +++ } ++ ++- number = sixteenthPoints * 16; +++ number = sixteenthPoints * 16; ++ ++- for(;number < num_points; number++){ ++- *cPtr++ = (*aPtr++) + (*bPtr++); ++- } +++ for (; number < num_points; number++) { +++ *cPtr++ = (*aPtr++) + (*bPtr++); +++ } ++ } ++ ++ #endif /* LV_HAVE_AVX512F */ ++@@ -118,35 +120,36 @@ volk_32f_x2_add_32f_u_avx512f(float* cVector, const float* aVector, 
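The \b Example paragraph in the volk_32f_x2_add_32f.h header above describes adding an increasing and a decreasing vector so that every summation pair equals 10. A minimal standalone sketch of that usage, assuming the usual volk_malloc / volk_get_alignment / volk_free helpers from <volk/volk.h> (buffer names are illustrative only), would look like this:

#include <stdio.h>
#include <volk/volk.h>

int main(void)
{
    unsigned int N = 10;
    size_t alignment = volk_get_alignment();
    float* increasing = (float*)volk_malloc(sizeof(float) * N, alignment);
    float* decreasing = (float*)volk_malloc(sizeof(float) * N, alignment);
    float* out = (float*)volk_malloc(sizeof(float) * N, alignment);

    for (unsigned int ii = 0; ii < N; ii++) {
        increasing[ii] = (float)ii;
        decreasing[ii] = 10.0f - (float)ii;
    }

    /* The dispatcher picks the fastest implementation (AVX512F, AVX, SSE,
     * NEON or generic) that the build and the running CPU support. */
    volk_32f_x2_add_32f(out, increasing, decreasing, N);

    for (unsigned int ii = 0; ii < N; ii++)
        printf("out(%u) = %1.2f\n", ii, out[ii]);

    volk_free(increasing);
    volk_free(decreasing);
    volk_free(out);
    return 0;
}

Because volk_malloc returns suitably aligned buffers, the dispatcher can select the _a_ (aligned) kernels; unaligned pointers fall back to the _u_ variants shown in this hunk.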
++ #ifdef LV_HAVE_AVX ++ #include ++ ++-static inline void ++-volk_32f_x2_add_32f_u_avx(float* cVector, const float* aVector, ++- const float* bVector, unsigned int num_points) +++static inline void volk_32f_x2_add_32f_u_avx(float* cVector, +++ const float* aVector, +++ const float* bVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int eighthPoints = num_points / 8; ++- float* cPtr = cVector; ++- const float* aPtr = aVector; ++- const float* bPtr= bVector; ++- __m256 aVal, bVal, cVal; ++- for(;number < eighthPoints; number++){ +++ unsigned int number = 0; +++ const unsigned int eighthPoints = num_points / 8; +++ float* cPtr = cVector; +++ const float* aPtr = aVector; +++ const float* bPtr = bVector; +++ __m256 aVal, bVal, cVal; +++ for (; number < eighthPoints; number++) { ++ ++- aVal = _mm256_loadu_ps(aPtr); ++- bVal = _mm256_loadu_ps(bPtr); +++ aVal = _mm256_loadu_ps(aPtr); +++ bVal = _mm256_loadu_ps(bPtr); ++ ++- cVal = _mm256_add_ps(aVal, bVal); +++ cVal = _mm256_add_ps(aVal, bVal); ++ ++- _mm256_storeu_ps(cPtr,cVal); // Store the results back into the C container +++ _mm256_storeu_ps(cPtr, cVal); // Store the results back into the C container ++ ++- aPtr += 8; ++- bPtr += 8; ++- cPtr += 8; ++- } +++ aPtr += 8; +++ bPtr += 8; +++ cPtr += 8; +++ } ++ ++- number = eighthPoints * 8; +++ number = eighthPoints * 8; ++ ++- for(;number < num_points; number++){ ++- *cPtr++ = (*aPtr++) + (*bPtr++); ++- } +++ for (; number < num_points; number++) { +++ *cPtr++ = (*aPtr++) + (*bPtr++); +++ } ++ } ++ #endif /* LV_HAVE_AVX */ ++ ++@@ -154,54 +157,56 @@ volk_32f_x2_add_32f_u_avx(float* cVector, const float* aVector, ++ #ifdef LV_HAVE_SSE ++ #include ++ ++-static inline void ++-volk_32f_x2_add_32f_u_sse(float* cVector, const float* aVector, ++- const float* bVector, unsigned int num_points) +++static inline void volk_32f_x2_add_32f_u_sse(float* cVector, +++ const float* aVector, +++ const float* bVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int quarterPoints = num_points / 4; +++ unsigned int number = 0; +++ const unsigned int quarterPoints = num_points / 4; ++ ++- float* cPtr = cVector; ++- const float* aPtr = aVector; ++- const float* bPtr= bVector; +++ float* cPtr = cVector; +++ const float* aPtr = aVector; +++ const float* bPtr = bVector; ++ ++- __m128 aVal, bVal, cVal; ++- for(;number < quarterPoints; number++){ +++ __m128 aVal, bVal, cVal; +++ for (; number < quarterPoints; number++) { ++ ++- aVal = _mm_loadu_ps(aPtr); ++- bVal = _mm_loadu_ps(bPtr); +++ aVal = _mm_loadu_ps(aPtr); +++ bVal = _mm_loadu_ps(bPtr); ++ ++- cVal = _mm_add_ps(aVal, bVal); +++ cVal = _mm_add_ps(aVal, bVal); ++ ++- _mm_storeu_ps(cPtr,cVal); // Store the results back into the C container +++ _mm_storeu_ps(cPtr, cVal); // Store the results back into the C container ++ ++- aPtr += 4; ++- bPtr += 4; ++- cPtr += 4; ++- } +++ aPtr += 4; +++ bPtr += 4; +++ cPtr += 4; +++ } ++ ++- number = quarterPoints * 4; ++- for(;number < num_points; number++){ ++- *cPtr++ = (*aPtr++) + (*bPtr++); ++- } +++ number = quarterPoints * 4; +++ for (; number < num_points; number++) { +++ *cPtr++ = (*aPtr++) + (*bPtr++); +++ } ++ } ++ #endif /* LV_HAVE_SSE */ ++ ++ ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void ++-volk_32f_x2_add_32f_generic(float* cVector, const float* aVector, ++- const float* bVector, unsigned int num_points) +++static inline void volk_32f_x2_add_32f_generic(float* cVector, +++ const float* aVector, +++ const float* bVector, +++ unsigned 
int num_points) ++ { ++- float* cPtr = cVector; ++- const float* aPtr = aVector; ++- const float* bPtr= bVector; ++- unsigned int number = 0; ++- ++- for(number = 0; number < num_points; number++){ ++- *cPtr++ = (*aPtr++) + (*bPtr++); ++- } +++ float* cPtr = cVector; +++ const float* aPtr = aVector; +++ const float* bPtr = bVector; +++ unsigned int number = 0; +++ +++ for (number = 0; number < num_points; number++) { +++ *cPtr++ = (*aPtr++) + (*bPtr++); +++ } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++@@ -216,37 +221,38 @@ volk_32f_x2_add_32f_generic(float* cVector, const float* aVector, ++ #ifdef LV_HAVE_AVX512F ++ #include ++ ++-static inline void ++-volk_32f_x2_add_32f_a_avx512f(float* cVector, const float* aVector, ++- const float* bVector, unsigned int num_points) +++static inline void volk_32f_x2_add_32f_a_avx512f(float* cVector, +++ const float* aVector, +++ const float* bVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int sixteenthPoints = num_points / 16; +++ unsigned int number = 0; +++ const unsigned int sixteenthPoints = num_points / 16; ++ ++- float* cPtr = cVector; ++- const float* aPtr = aVector; ++- const float* bPtr= bVector; +++ float* cPtr = cVector; +++ const float* aPtr = aVector; +++ const float* bPtr = bVector; ++ ++- __m512 aVal, bVal, cVal; ++- for(;number < sixteenthPoints; number++){ +++ __m512 aVal, bVal, cVal; +++ for (; number < sixteenthPoints; number++) { ++ ++- aVal = _mm512_load_ps(aPtr); ++- bVal = _mm512_load_ps(bPtr); +++ aVal = _mm512_load_ps(aPtr); +++ bVal = _mm512_load_ps(bPtr); ++ ++- cVal = _mm512_add_ps(aVal, bVal); +++ cVal = _mm512_add_ps(aVal, bVal); ++ ++- _mm512_store_ps(cPtr,cVal); // Store the results back into the C container +++ _mm512_store_ps(cPtr, cVal); // Store the results back into the C container ++ ++- aPtr += 16; ++- bPtr += 16; ++- cPtr += 16; ++- } +++ aPtr += 16; +++ bPtr += 16; +++ cPtr += 16; +++ } ++ ++- number = sixteenthPoints * 16; +++ number = sixteenthPoints * 16; ++ ++- for(;number < num_points; number++){ ++- *cPtr++ = (*aPtr++) + (*bPtr++); ++- } +++ for (; number < num_points; number++) { +++ *cPtr++ = (*aPtr++) + (*bPtr++); +++ } ++ } ++ ++ #endif /* LV_HAVE_AVX512F */ ++@@ -255,70 +261,73 @@ volk_32f_x2_add_32f_a_avx512f(float* cVector, const float* aVector, ++ #ifdef LV_HAVE_AVX ++ #include ++ ++-static inline void ++-volk_32f_x2_add_32f_a_avx(float* cVector, const float* aVector, ++- const float* bVector, unsigned int num_points) +++static inline void volk_32f_x2_add_32f_a_avx(float* cVector, +++ const float* aVector, +++ const float* bVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int eighthPoints = num_points / 8; +++ unsigned int number = 0; +++ const unsigned int eighthPoints = num_points / 8; ++ ++- float* cPtr = cVector; ++- const float* aPtr = aVector; ++- const float* bPtr= bVector; +++ float* cPtr = cVector; +++ const float* aPtr = aVector; +++ const float* bPtr = bVector; ++ ++- __m256 aVal, bVal, cVal; ++- for(;number < eighthPoints; number++){ +++ __m256 aVal, bVal, cVal; +++ for (; number < eighthPoints; number++) { ++ ++- aVal = _mm256_load_ps(aPtr); ++- bVal = _mm256_load_ps(bPtr); +++ aVal = _mm256_load_ps(aPtr); +++ bVal = _mm256_load_ps(bPtr); ++ ++- cVal = _mm256_add_ps(aVal, bVal); +++ cVal = _mm256_add_ps(aVal, bVal); ++ ++- _mm256_store_ps(cPtr,cVal); // Store the results back into the C container +++ _mm256_store_ps(cPtr, cVal); // Store the results back into the C container ++ ++- aPtr += 8; ++- bPtr += 
8; ++- cPtr += 8; ++- } +++ aPtr += 8; +++ bPtr += 8; +++ cPtr += 8; +++ } ++ ++- number = eighthPoints * 8; ++- for(;number < num_points; number++){ ++- *cPtr++ = (*aPtr++) + (*bPtr++); ++- } +++ number = eighthPoints * 8; +++ for (; number < num_points; number++) { +++ *cPtr++ = (*aPtr++) + (*bPtr++); +++ } ++ } ++ #endif /* LV_HAVE_AVX */ ++ ++ #ifdef LV_HAVE_SSE ++ #include ++ ++-static inline void ++-volk_32f_x2_add_32f_a_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points) +++static inline void volk_32f_x2_add_32f_a_sse(float* cVector, +++ const float* aVector, +++ const float* bVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int quarterPoints = num_points / 4; +++ unsigned int number = 0; +++ const unsigned int quarterPoints = num_points / 4; ++ ++- float* cPtr = cVector; ++- const float* aPtr = aVector; ++- const float* bPtr= bVector; +++ float* cPtr = cVector; +++ const float* aPtr = aVector; +++ const float* bPtr = bVector; ++ ++- __m128 aVal, bVal, cVal; ++- for(;number < quarterPoints; number++){ ++- aVal = _mm_load_ps(aPtr); ++- bVal = _mm_load_ps(bPtr); +++ __m128 aVal, bVal, cVal; +++ for (; number < quarterPoints; number++) { +++ aVal = _mm_load_ps(aPtr); +++ bVal = _mm_load_ps(bPtr); ++ ++- cVal = _mm_add_ps(aVal, bVal); +++ cVal = _mm_add_ps(aVal, bVal); ++ ++- _mm_store_ps(cPtr,cVal); // Store the results back into the C container +++ _mm_store_ps(cPtr, cVal); // Store the results back into the C container ++ ++- aPtr += 4; ++- bPtr += 4; ++- cPtr += 4; ++- } +++ aPtr += 4; +++ bPtr += 4; +++ cPtr += 4; +++ } ++ ++- number = quarterPoints * 4; ++- for(;number < num_points; number++){ ++- *cPtr++ = (*aPtr++) + (*bPtr++); ++- } +++ number = quarterPoints * 4; +++ for (; number < num_points; number++) { +++ *cPtr++ = (*aPtr++) + (*bPtr++); +++ } ++ } ++ #endif /* LV_HAVE_SSE */ ++ ++@@ -326,78 +335,89 @@ volk_32f_x2_add_32f_a_sse(float* cVector, const float* aVector, const float* bVe ++ #ifdef LV_HAVE_NEON ++ #include ++ ++-static inline void ++-volk_32f_x2_add_32f_u_neon(float* cVector, const float* aVector, ++- const float* bVector, unsigned int num_points) +++static inline void volk_32f_x2_add_32f_u_neon(float* cVector, +++ const float* aVector, +++ const float* bVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int quarterPoints = num_points / 4; ++- ++- float* cPtr = cVector; ++- const float* aPtr = aVector; ++- const float* bPtr= bVector; ++- float32x4_t aVal, bVal, cVal; ++- for(number=0; number < quarterPoints; number++){ ++- // Load in to NEON registers ++- aVal = vld1q_f32(aPtr); ++- bVal = vld1q_f32(bPtr); ++- __VOLK_PREFETCH(aPtr+4); ++- __VOLK_PREFETCH(bPtr+4); ++- ++- // vector add ++- cVal = vaddq_f32(aVal, bVal); ++- // Store the results back into the C container ++- vst1q_f32(cPtr,cVal); ++- ++- aPtr += 4; // q uses quadwords, 4 floats per vadd ++- bPtr += 4; ++- cPtr += 4; ++- } ++- ++- number = quarterPoints * 4; // should be = num_points ++- for(;number < num_points; number++){ ++- *cPtr++ = (*aPtr++) + (*bPtr++); ++- } +++ unsigned int number = 0; +++ const unsigned int quarterPoints = num_points / 4; +++ +++ float* cPtr = cVector; +++ const float* aPtr = aVector; +++ const float* bPtr = bVector; +++ float32x4_t aVal, bVal, cVal; +++ for (number = 0; number < quarterPoints; number++) { +++ // Load in to NEON registers +++ aVal = vld1q_f32(aPtr); +++ bVal = vld1q_f32(bPtr); +++ __VOLK_PREFETCH(aPtr + 4); +++ __VOLK_PREFETCH(bPtr + 4); +++ 
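/* __VOLK_PREFETCH normally expands to __builtin_prefetch on GCC/Clang (and to
 * nothing otherwise), hinting the next four floats of each input into cache
 * while the current quadword is being added. */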
+++ // vector add +++ cVal = vaddq_f32(aVal, bVal); +++ // Store the results back into the C container +++ vst1q_f32(cPtr, cVal); +++ +++ aPtr += 4; // q uses quadwords, 4 floats per vadd +++ bPtr += 4; +++ cPtr += 4; +++ } +++ +++ number = quarterPoints * 4; // should be = num_points +++ for (; number < num_points; number++) { +++ *cPtr++ = (*aPtr++) + (*bPtr++); +++ } ++ } ++ ++ #endif /* LV_HAVE_NEON */ ++ ++ #ifdef LV_HAVE_NEONV7 ++-extern void volk_32f_x2_add_32f_a_neonasm(float* cVector, const float* aVector, const float* bVector, unsigned int num_points); +++extern void volk_32f_x2_add_32f_a_neonasm(float* cVector, +++ const float* aVector, +++ const float* bVector, +++ unsigned int num_points); ++ #endif /* LV_HAVE_NEONV7 */ ++ ++ #ifdef LV_HAVE_NEONV7 ++-extern void volk_32f_x2_add_32f_a_neonpipeline(float* cVector, const float* aVector, const float* bVector, unsigned int num_points); +++extern void volk_32f_x2_add_32f_a_neonpipeline(float* cVector, +++ const float* aVector, +++ const float* bVector, +++ unsigned int num_points); ++ #endif /* LV_HAVE_NEONV7 */ ++ ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void ++-volk_32f_x2_add_32f_a_generic(float* cVector, const float* aVector, ++- const float* bVector, unsigned int num_points) +++static inline void volk_32f_x2_add_32f_a_generic(float* cVector, +++ const float* aVector, +++ const float* bVector, +++ unsigned int num_points) ++ { ++- float* cPtr = cVector; ++- const float* aPtr = aVector; ++- const float* bPtr= bVector; ++- unsigned int number = 0; ++- ++- for(number = 0; number < num_points; number++){ ++- *cPtr++ = (*aPtr++) + (*bPtr++); ++- } +++ float* cPtr = cVector; +++ const float* aPtr = aVector; +++ const float* bPtr = bVector; +++ unsigned int number = 0; +++ +++ for (number = 0; number < num_points; number++) { +++ *cPtr++ = (*aPtr++) + (*bPtr++); +++ } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++ ++ #ifdef LV_HAVE_ORC ++ ++-extern void ++-volk_32f_x2_add_32f_a_orc_impl(float* cVector, const float* aVector, ++- const float* bVector, unsigned int num_points); +++extern void volk_32f_x2_add_32f_a_orc_impl(float* cVector, +++ const float* aVector, +++ const float* bVector, +++ unsigned int num_points); ++ ++-static inline void ++-volk_32f_x2_add_32f_u_orc(float* cVector, const float* aVector, ++- const float* bVector, unsigned int num_points){ ++- volk_32f_x2_add_32f_a_orc_impl(cVector, aVector, bVector, num_points); +++static inline void volk_32f_x2_add_32f_u_orc(float* cVector, +++ const float* aVector, +++ const float* bVector, +++ unsigned int num_points) +++{ +++ volk_32f_x2_add_32f_a_orc_impl(cVector, aVector, bVector, num_points); ++ } ++ ++ #endif /* LV_HAVE_ORC */ ++diff --git a/kernels/volk/volk_32f_x2_divide_32f.h b/kernels/volk/volk_32f_x2_divide_32f.h ++index 130767f..8b80365 100644 ++--- a/kernels/volk/volk_32f_x2_divide_32f.h +++++ b/kernels/volk/volk_32f_x2_divide_32f.h ++@@ -31,8 +31,8 @@ ++ * ++ * Dispatcher Prototype ++ * \code ++- * void volk_32f_x2_divide_32f(float* cVector, const float* aVector, const float* bVector, unsigned int num_points) ++- * \endcode +++ * void volk_32f_x2_divide_32f(float* cVector, const float* aVector, const float* bVector, +++ * unsigned int num_points) \endcode ++ * ++ * \b Inputs ++ * \li aVector: First vector of input points. 
++@@ -77,35 +77,36 @@ ++ #ifdef LV_HAVE_AVX512F ++ #include ++ ++-static inline void ++-volk_32f_x2_divide_32f_a_avx512f(float* cVector, const float* aVector, ++- const float* bVector, unsigned int num_points) +++static inline void volk_32f_x2_divide_32f_a_avx512f(float* cVector, +++ const float* aVector, +++ const float* bVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int sixteenthPoints = num_points / 16; +++ unsigned int number = 0; +++ const unsigned int sixteenthPoints = num_points / 16; ++ ++- float* cPtr = cVector; ++- const float* aPtr = aVector; ++- const float* bPtr= bVector; +++ float* cPtr = cVector; +++ const float* aPtr = aVector; +++ const float* bPtr = bVector; ++ ++- __m512 aVal, bVal, cVal; ++- for(;number < sixteenthPoints; number++){ ++- aVal = _mm512_load_ps(aPtr); ++- bVal = _mm512_load_ps(bPtr); +++ __m512 aVal, bVal, cVal; +++ for (; number < sixteenthPoints; number++) { +++ aVal = _mm512_load_ps(aPtr); +++ bVal = _mm512_load_ps(bPtr); ++ ++- cVal = _mm512_div_ps(aVal, bVal); +++ cVal = _mm512_div_ps(aVal, bVal); ++ ++- _mm512_store_ps(cPtr,cVal); // Store the results back into the C container +++ _mm512_store_ps(cPtr, cVal); // Store the results back into the C container ++ ++- aPtr += 16; ++- bPtr += 16; ++- cPtr += 16; ++- } +++ aPtr += 16; +++ bPtr += 16; +++ cPtr += 16; +++ } ++ ++- number = sixteenthPoints * 16; ++- for(;number < num_points; number++){ ++- *cPtr++ = (*aPtr++) / (*bPtr++); ++- } +++ number = sixteenthPoints * 16; +++ for (; number < num_points; number++) { +++ *cPtr++ = (*aPtr++) / (*bPtr++); +++ } ++ } ++ #endif /* LV_HAVE_AVX512F */ ++ ++@@ -113,35 +114,36 @@ volk_32f_x2_divide_32f_a_avx512f(float* cVector, const float* aVector, ++ #ifdef LV_HAVE_AVX ++ #include ++ ++-static inline void ++-volk_32f_x2_divide_32f_a_avx(float* cVector, const float* aVector, ++- const float* bVector, unsigned int num_points) +++static inline void volk_32f_x2_divide_32f_a_avx(float* cVector, +++ const float* aVector, +++ const float* bVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int eighthPoints = num_points / 8; +++ unsigned int number = 0; +++ const unsigned int eighthPoints = num_points / 8; ++ ++- float* cPtr = cVector; ++- const float* aPtr = aVector; ++- const float* bPtr= bVector; +++ float* cPtr = cVector; +++ const float* aPtr = aVector; +++ const float* bPtr = bVector; ++ ++- __m256 aVal, bVal, cVal; ++- for(;number < eighthPoints; number++){ ++- aVal = _mm256_load_ps(aPtr); ++- bVal = _mm256_load_ps(bPtr); +++ __m256 aVal, bVal, cVal; +++ for (; number < eighthPoints; number++) { +++ aVal = _mm256_load_ps(aPtr); +++ bVal = _mm256_load_ps(bPtr); ++ ++- cVal = _mm256_div_ps(aVal, bVal); +++ cVal = _mm256_div_ps(aVal, bVal); ++ ++- _mm256_store_ps(cPtr,cVal); // Store the results back into the C container +++ _mm256_store_ps(cPtr, cVal); // Store the results back into the C container ++ ++- aPtr += 8; ++- bPtr += 8; ++- cPtr += 8; ++- } +++ aPtr += 8; +++ bPtr += 8; +++ cPtr += 8; +++ } ++ ++- number = eighthPoints * 8; ++- for(;number < num_points; number++){ ++- *cPtr++ = (*aPtr++) / (*bPtr++); ++- } +++ number = eighthPoints * 8; +++ for (; number < num_points; number++) { +++ *cPtr++ = (*aPtr++) / (*bPtr++); +++ } ++ } ++ #endif /* LV_HAVE_AVX */ ++ ++@@ -149,35 +151,36 @@ volk_32f_x2_divide_32f_a_avx(float* cVector, const float* aVector, ++ #ifdef LV_HAVE_SSE ++ #include ++ ++-static inline void ++-volk_32f_x2_divide_32f_a_sse(float* cVector, const float* aVector, 
++- const float* bVector, unsigned int num_points) +++static inline void volk_32f_x2_divide_32f_a_sse(float* cVector, +++ const float* aVector, +++ const float* bVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int quarterPoints = num_points / 4; +++ unsigned int number = 0; +++ const unsigned int quarterPoints = num_points / 4; ++ ++- float* cPtr = cVector; ++- const float* aPtr = aVector; ++- const float* bPtr= bVector; +++ float* cPtr = cVector; +++ const float* aPtr = aVector; +++ const float* bPtr = bVector; ++ ++- __m128 aVal, bVal, cVal; ++- for(;number < quarterPoints; number++){ ++- aVal = _mm_load_ps(aPtr); ++- bVal = _mm_load_ps(bPtr); +++ __m128 aVal, bVal, cVal; +++ for (; number < quarterPoints; number++) { +++ aVal = _mm_load_ps(aPtr); +++ bVal = _mm_load_ps(bPtr); ++ ++- cVal = _mm_div_ps(aVal, bVal); +++ cVal = _mm_div_ps(aVal, bVal); ++ ++- _mm_store_ps(cPtr,cVal); // Store the results back into the C container +++ _mm_store_ps(cPtr, cVal); // Store the results back into the C container ++ ++- aPtr += 4; ++- bPtr += 4; ++- cPtr += 4; ++- } +++ aPtr += 4; +++ bPtr += 4; +++ cPtr += 4; +++ } ++ ++- number = quarterPoints * 4; ++- for(;number < num_points; number++){ ++- *cPtr++ = (*aPtr++) / (*bPtr++); ++- } +++ number = quarterPoints * 4; +++ for (; number < num_points; number++) { +++ *cPtr++ = (*aPtr++) / (*bPtr++); +++ } ++ } ++ #endif /* LV_HAVE_SSE */ ++ ++@@ -185,54 +188,55 @@ volk_32f_x2_divide_32f_a_sse(float* cVector, const float* aVector, ++ #ifdef LV_HAVE_NEON ++ #include ++ ++-static inline void ++-volk_32f_x2_divide_32f_neon(float* cVector, const float* aVector, ++- const float* bVector, unsigned int num_points) +++static inline void volk_32f_x2_divide_32f_neon(float* cVector, +++ const float* aVector, +++ const float* bVector, +++ unsigned int num_points) ++ { ++- float* cPtr = cVector; ++- const float* aPtr = aVector; ++- const float* bPtr = bVector; ++- ++- float32x4x4_t aVal, bVal, bInv, cVal; ++- ++- const unsigned int eighthPoints = num_points / 16; ++- unsigned int number = 0; ++- for(; number < eighthPoints; number++){ ++- aVal = vld4q_f32(aPtr); ++- aPtr += 16; ++- bVal = vld4q_f32(bPtr); ++- bPtr += 16; ++- ++- __VOLK_PREFETCH(aPtr+16); ++- __VOLK_PREFETCH(bPtr+16); ++- ++- bInv.val[0] = vrecpeq_f32(bVal.val[0]); ++- bInv.val[0] = vmulq_f32(bInv.val[0], vrecpsq_f32(bInv.val[0], bVal.val[0])); ++- bInv.val[0] = vmulq_f32(bInv.val[0], vrecpsq_f32(bInv.val[0], bVal.val[0])); ++- cVal.val[0] = vmulq_f32(aVal.val[0], bInv.val[0]); ++- ++- bInv.val[1] = vrecpeq_f32(bVal.val[1]); ++- bInv.val[1] = vmulq_f32(bInv.val[1], vrecpsq_f32(bInv.val[1], bVal.val[1])); ++- bInv.val[1] = vmulq_f32(bInv.val[1], vrecpsq_f32(bInv.val[1], bVal.val[1])); ++- cVal.val[1] = vmulq_f32(aVal.val[1], bInv.val[1]); ++- ++- bInv.val[2] = vrecpeq_f32(bVal.val[2]); ++- bInv.val[2] = vmulq_f32(bInv.val[2], vrecpsq_f32(bInv.val[2], bVal.val[2])); ++- bInv.val[2] = vmulq_f32(bInv.val[2], vrecpsq_f32(bInv.val[2], bVal.val[2])); ++- cVal.val[2] = vmulq_f32(aVal.val[2], bInv.val[2]); ++- ++- bInv.val[3] = vrecpeq_f32(bVal.val[3]); ++- bInv.val[3] = vmulq_f32(bInv.val[3], vrecpsq_f32(bInv.val[3], bVal.val[3])); ++- bInv.val[3] = vmulq_f32(bInv.val[3], vrecpsq_f32(bInv.val[3], bVal.val[3])); ++- cVal.val[3] = vmulq_f32(aVal.val[3], bInv.val[3]); ++- ++- vst4q_f32(cPtr, cVal); ++- cPtr += 16; ++- } ++- ++- for(number = eighthPoints * 16; number < num_points; number++){ ++- *cPtr++ = (*aPtr++) / (*bPtr++); ++- } +++ float* cPtr = cVector; +++ const 
float* aPtr = aVector; +++ const float* bPtr = bVector; +++ +++ float32x4x4_t aVal, bVal, bInv, cVal; +++ +++ const unsigned int eighthPoints = num_points / 16; +++ unsigned int number = 0; +++ for (; number < eighthPoints; number++) { +++ aVal = vld4q_f32(aPtr); +++ aPtr += 16; +++ bVal = vld4q_f32(bPtr); +++ bPtr += 16; +++ +++ __VOLK_PREFETCH(aPtr + 16); +++ __VOLK_PREFETCH(bPtr + 16); +++ +++ bInv.val[0] = vrecpeq_f32(bVal.val[0]); +++ bInv.val[0] = vmulq_f32(bInv.val[0], vrecpsq_f32(bInv.val[0], bVal.val[0])); +++ bInv.val[0] = vmulq_f32(bInv.val[0], vrecpsq_f32(bInv.val[0], bVal.val[0])); +++ cVal.val[0] = vmulq_f32(aVal.val[0], bInv.val[0]); +++ +++ bInv.val[1] = vrecpeq_f32(bVal.val[1]); +++ bInv.val[1] = vmulq_f32(bInv.val[1], vrecpsq_f32(bInv.val[1], bVal.val[1])); +++ bInv.val[1] = vmulq_f32(bInv.val[1], vrecpsq_f32(bInv.val[1], bVal.val[1])); +++ cVal.val[1] = vmulq_f32(aVal.val[1], bInv.val[1]); +++ +++ bInv.val[2] = vrecpeq_f32(bVal.val[2]); +++ bInv.val[2] = vmulq_f32(bInv.val[2], vrecpsq_f32(bInv.val[2], bVal.val[2])); +++ bInv.val[2] = vmulq_f32(bInv.val[2], vrecpsq_f32(bInv.val[2], bVal.val[2])); +++ cVal.val[2] = vmulq_f32(aVal.val[2], bInv.val[2]); +++ +++ bInv.val[3] = vrecpeq_f32(bVal.val[3]); +++ bInv.val[3] = vmulq_f32(bInv.val[3], vrecpsq_f32(bInv.val[3], bVal.val[3])); +++ bInv.val[3] = vmulq_f32(bInv.val[3], vrecpsq_f32(bInv.val[3], bVal.val[3])); +++ cVal.val[3] = vmulq_f32(aVal.val[3], bInv.val[3]); +++ +++ vst4q_f32(cPtr, cVal); +++ cPtr += 16; +++ } +++ +++ for (number = eighthPoints * 16; number < num_points; number++) { +++ *cPtr++ = (*aPtr++) / (*bPtr++); +++ } ++ } ++ ++ #endif /* LV_HAVE_NEON */ ++@@ -240,38 +244,40 @@ volk_32f_x2_divide_32f_neon(float* cVector, const float* aVector, ++ ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void ++-volk_32f_x2_divide_32f_generic(float* cVector, const float* aVector, ++- const float* bVector, unsigned int num_points) +++static inline void volk_32f_x2_divide_32f_generic(float* cVector, +++ const float* aVector, +++ const float* bVector, +++ unsigned int num_points) ++ { ++- float* cPtr = cVector; ++- const float* aPtr = aVector; ++- const float* bPtr= bVector; ++- unsigned int number = 0; ++- ++- for(number = 0; number < num_points; number++){ ++- *cPtr++ = (*aPtr++) / (*bPtr++); ++- } +++ float* cPtr = cVector; +++ const float* aPtr = aVector; +++ const float* bPtr = bVector; +++ unsigned int number = 0; +++ +++ for (number = 0; number < num_points; number++) { +++ *cPtr++ = (*aPtr++) / (*bPtr++); +++ } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++ ++ #ifdef LV_HAVE_ORC ++ ++-extern void ++-volk_32f_x2_divide_32f_a_orc_impl(float* cVector, const float* aVector, ++- const float* bVector, unsigned int num_points); +++extern void volk_32f_x2_divide_32f_a_orc_impl(float* cVector, +++ const float* aVector, +++ const float* bVector, +++ unsigned int num_points); ++ ++-static inline void ++-volk_32f_x2_divide_32f_u_orc(float* cVector, const float* aVector, ++- const float* bVector, unsigned int num_points) +++static inline void volk_32f_x2_divide_32f_u_orc(float* cVector, +++ const float* aVector, +++ const float* bVector, +++ unsigned int num_points) ++ { ++- volk_32f_x2_divide_32f_a_orc_impl(cVector, aVector, bVector, num_points); +++ volk_32f_x2_divide_32f_a_orc_impl(cVector, aVector, bVector, num_points); ++ } ++ #endif /* LV_HAVE_ORC */ ++ ++ ++- ++ #endif /* INCLUDED_volk_32f_x2_divide_32f_a_H */ ++ ++ ++@@ -284,35 +290,36 @@ volk_32f_x2_divide_32f_u_orc(float* cVector, const float* aVector, ++ #ifdef LV_HAVE_AVX512F 
++ #include ++ ++-static inline void ++-volk_32f_x2_divide_32f_u_avx512f(float* cVector, const float* aVector, ++- const float* bVector, unsigned int num_points) +++static inline void volk_32f_x2_divide_32f_u_avx512f(float* cVector, +++ const float* aVector, +++ const float* bVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int sixteenthPoints = num_points / 16; +++ unsigned int number = 0; +++ const unsigned int sixteenthPoints = num_points / 16; ++ ++- float* cPtr = cVector; ++- const float* aPtr = aVector; ++- const float* bPtr= bVector; +++ float* cPtr = cVector; +++ const float* aPtr = aVector; +++ const float* bPtr = bVector; ++ ++- __m512 aVal, bVal, cVal; ++- for(;number < sixteenthPoints; number++){ ++- aVal = _mm512_loadu_ps(aPtr); ++- bVal = _mm512_loadu_ps(bPtr); +++ __m512 aVal, bVal, cVal; +++ for (; number < sixteenthPoints; number++) { +++ aVal = _mm512_loadu_ps(aPtr); +++ bVal = _mm512_loadu_ps(bPtr); ++ ++- cVal = _mm512_div_ps(aVal, bVal); +++ cVal = _mm512_div_ps(aVal, bVal); ++ ++- _mm512_storeu_ps(cPtr,cVal); // Store the results back into the C container +++ _mm512_storeu_ps(cPtr, cVal); // Store the results back into the C container ++ ++- aPtr += 16; ++- bPtr += 16; ++- cPtr += 16; ++- } +++ aPtr += 16; +++ bPtr += 16; +++ cPtr += 16; +++ } ++ ++- number = sixteenthPoints * 16; ++- for(;number < num_points; number++){ ++- *cPtr++ = (*aPtr++) / (*bPtr++); ++- } +++ number = sixteenthPoints * 16; +++ for (; number < num_points; number++) { +++ *cPtr++ = (*aPtr++) / (*bPtr++); +++ } ++ } ++ #endif /* LV_HAVE_AVX512F */ ++ ++@@ -320,35 +327,36 @@ volk_32f_x2_divide_32f_u_avx512f(float* cVector, const float* aVector, ++ #ifdef LV_HAVE_AVX ++ #include ++ ++-static inline void ++-volk_32f_x2_divide_32f_u_avx(float* cVector, const float* aVector, ++- const float* bVector, unsigned int num_points) +++static inline void volk_32f_x2_divide_32f_u_avx(float* cVector, +++ const float* aVector, +++ const float* bVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int eighthPoints = num_points / 8; +++ unsigned int number = 0; +++ const unsigned int eighthPoints = num_points / 8; ++ ++- float* cPtr = cVector; ++- const float* aPtr = aVector; ++- const float* bPtr= bVector; +++ float* cPtr = cVector; +++ const float* aPtr = aVector; +++ const float* bPtr = bVector; ++ ++- __m256 aVal, bVal, cVal; ++- for(;number < eighthPoints; number++){ ++- aVal = _mm256_loadu_ps(aPtr); ++- bVal = _mm256_loadu_ps(bPtr); +++ __m256 aVal, bVal, cVal; +++ for (; number < eighthPoints; number++) { +++ aVal = _mm256_loadu_ps(aPtr); +++ bVal = _mm256_loadu_ps(bPtr); ++ ++- cVal = _mm256_div_ps(aVal, bVal); +++ cVal = _mm256_div_ps(aVal, bVal); ++ ++- _mm256_storeu_ps(cPtr,cVal); // Store the results back into the C container +++ _mm256_storeu_ps(cPtr, cVal); // Store the results back into the C container ++ ++- aPtr += 8; ++- bPtr += 8; ++- cPtr += 8; ++- } +++ aPtr += 8; +++ bPtr += 8; +++ cPtr += 8; +++ } ++ ++- number = eighthPoints * 8; ++- for(;number < num_points; number++){ ++- *cPtr++ = (*aPtr++) / (*bPtr++); ++- } +++ number = eighthPoints * 8; +++ for (; number < num_points; number++) { +++ *cPtr++ = (*aPtr++) / (*bPtr++); +++ } ++ } ++ #endif /* LV_HAVE_AVX */ ++ ++diff --git a/kernels/volk/volk_32f_x2_dot_prod_16i.h b/kernels/volk/volk_32f_x2_dot_prod_16i.h ++index c1b5a82..4da7db6 100644 ++--- a/kernels/volk/volk_32f_x2_dot_prod_16i.h +++++ b/kernels/volk/volk_32f_x2_dot_prod_16i.h ++@@ -33,8 +33,8 @@ ++ * ++ * 
Dispatcher Prototype ++ * \code ++- * void volk_32f_x2_dot_prod_16i(int16_t* result, const float* input, const float* taps, unsigned int num_points) ++- * \endcode +++ * void volk_32f_x2_dot_prod_16i(int16_t* result, const float* input, const float* taps, +++ * unsigned int num_points) \endcode ++ * ++ * \b Inputs ++ * \li input: vector of floats. ++@@ -58,25 +58,29 @@ ++ #ifndef INCLUDED_volk_32f_x2_dot_prod_16i_H ++ #define INCLUDED_volk_32f_x2_dot_prod_16i_H ++ ++-#include ++ #include +++#include ++ ++ ++ #ifdef LV_HAVE_GENERIC ++ ++ ++-static inline void volk_32f_x2_dot_prod_16i_generic(int16_t* result, const float* input, const float* taps, unsigned int num_points) { +++static inline void volk_32f_x2_dot_prod_16i_generic(int16_t* result, +++ const float* input, +++ const float* taps, +++ unsigned int num_points) +++{ ++ ++- float dotProduct = 0; ++- const float* aPtr = input; ++- const float* bPtr= taps; ++- unsigned int number = 0; +++ float dotProduct = 0; +++ const float* aPtr = input; +++ const float* bPtr = taps; +++ unsigned int number = 0; ++ ++- for(number = 0; number < num_points; number++){ ++- dotProduct += ((*aPtr++) * (*bPtr++)); ++- } +++ for (number = 0; number < num_points; number++) { +++ dotProduct += ((*aPtr++) * (*bPtr++)); +++ } ++ ++- *result = (int16_t)dotProduct; +++ *result = (int16_t)dotProduct; ++ } ++ ++ #endif /*LV_HAVE_GENERIC*/ ++@@ -84,68 +88,73 @@ static inline void volk_32f_x2_dot_prod_16i_generic(int16_t* result, const float ++ ++ #ifdef LV_HAVE_SSE ++ ++-static inline void volk_32f_x2_dot_prod_16i_a_sse(int16_t* result, const float* input, const float* taps, unsigned int num_points) { ++- ++- unsigned int number = 0; ++- const unsigned int sixteenthPoints = num_points / 16; ++- ++- float dotProduct = 0; ++- const float* aPtr = input; ++- const float* bPtr = taps; ++- ++- __m128 a0Val, a1Val, a2Val, a3Val; ++- __m128 b0Val, b1Val, b2Val, b3Val; ++- __m128 c0Val, c1Val, c2Val, c3Val; ++- ++- __m128 dotProdVal0 = _mm_setzero_ps(); ++- __m128 dotProdVal1 = _mm_setzero_ps(); ++- __m128 dotProdVal2 = _mm_setzero_ps(); ++- __m128 dotProdVal3 = _mm_setzero_ps(); ++- ++- for(;number < sixteenthPoints; number++){ ++- ++- a0Val = _mm_load_ps(aPtr); ++- a1Val = _mm_load_ps(aPtr+4); ++- a2Val = _mm_load_ps(aPtr+8); ++- a3Val = _mm_load_ps(aPtr+12); ++- b0Val = _mm_load_ps(bPtr); ++- b1Val = _mm_load_ps(bPtr+4); ++- b2Val = _mm_load_ps(bPtr+8); ++- b3Val = _mm_load_ps(bPtr+12); ++- ++- c0Val = _mm_mul_ps(a0Val, b0Val); ++- c1Val = _mm_mul_ps(a1Val, b1Val); ++- c2Val = _mm_mul_ps(a2Val, b2Val); ++- c3Val = _mm_mul_ps(a3Val, b3Val); ++- ++- dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0); ++- dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1); ++- dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2); ++- dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3); ++- ++- aPtr += 16; ++- bPtr += 16; ++- } ++- ++- dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1); ++- dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2); ++- dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3); ++- ++- __VOLK_ATTR_ALIGNED(16) float dotProductVector[4]; ++- ++- _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector ++- ++- dotProduct = dotProductVector[0]; ++- dotProduct += dotProductVector[1]; ++- dotProduct += dotProductVector[2]; ++- dotProduct += dotProductVector[3]; ++- ++- number = sixteenthPoints*16; ++- for(;number < num_points; number++){ ++- dotProduct += ((*aPtr++) * (*bPtr++)); ++- } ++- ++- *result = (short)dotProduct; +++static inline void 
volk_32f_x2_dot_prod_16i_a_sse(int16_t* result, +++ const float* input, +++ const float* taps, +++ unsigned int num_points) +++{ +++ +++ unsigned int number = 0; +++ const unsigned int sixteenthPoints = num_points / 16; +++ +++ float dotProduct = 0; +++ const float* aPtr = input; +++ const float* bPtr = taps; +++ +++ __m128 a0Val, a1Val, a2Val, a3Val; +++ __m128 b0Val, b1Val, b2Val, b3Val; +++ __m128 c0Val, c1Val, c2Val, c3Val; +++ +++ __m128 dotProdVal0 = _mm_setzero_ps(); +++ __m128 dotProdVal1 = _mm_setzero_ps(); +++ __m128 dotProdVal2 = _mm_setzero_ps(); +++ __m128 dotProdVal3 = _mm_setzero_ps(); +++ +++ for (; number < sixteenthPoints; number++) { +++ +++ a0Val = _mm_load_ps(aPtr); +++ a1Val = _mm_load_ps(aPtr + 4); +++ a2Val = _mm_load_ps(aPtr + 8); +++ a3Val = _mm_load_ps(aPtr + 12); +++ b0Val = _mm_load_ps(bPtr); +++ b1Val = _mm_load_ps(bPtr + 4); +++ b2Val = _mm_load_ps(bPtr + 8); +++ b3Val = _mm_load_ps(bPtr + 12); +++ +++ c0Val = _mm_mul_ps(a0Val, b0Val); +++ c1Val = _mm_mul_ps(a1Val, b1Val); +++ c2Val = _mm_mul_ps(a2Val, b2Val); +++ c3Val = _mm_mul_ps(a3Val, b3Val); +++ +++ dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0); +++ dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1); +++ dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2); +++ dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3); +++ +++ aPtr += 16; +++ bPtr += 16; +++ } +++ +++ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1); +++ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2); +++ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3); +++ +++ __VOLK_ATTR_ALIGNED(16) float dotProductVector[4]; +++ +++ _mm_store_ps(dotProductVector, +++ dotProdVal0); // Store the results back into the dot product vector +++ +++ dotProduct = dotProductVector[0]; +++ dotProduct += dotProductVector[1]; +++ dotProduct += dotProductVector[2]; +++ dotProduct += dotProductVector[3]; +++ +++ number = sixteenthPoints * 16; +++ for (; number < num_points; number++) { +++ dotProduct += ((*aPtr++) * (*bPtr++)); +++ } +++ +++ *result = (short)dotProduct; ++ } ++ ++ #endif /*LV_HAVE_SSE*/ ++@@ -153,66 +162,71 @@ static inline void volk_32f_x2_dot_prod_16i_a_sse(int16_t* result, const float* ++ ++ #if LV_HAVE_AVX2 && LV_HAVE_FMA ++ ++-static inline void volk_32f_x2_dot_prod_16i_a_avx2_fma(int16_t* result, const float* input, const float* taps, unsigned int num_points) { ++- ++- unsigned int number = 0; ++- const unsigned int thirtysecondPoints = num_points / 32; ++- ++- float dotProduct = 0; ++- const float* aPtr = input; ++- const float* bPtr = taps; ++- ++- __m256 a0Val, a1Val, a2Val, a3Val; ++- __m256 b0Val, b1Val, b2Val, b3Val; ++- ++- __m256 dotProdVal0 = _mm256_setzero_ps(); ++- __m256 dotProdVal1 = _mm256_setzero_ps(); ++- __m256 dotProdVal2 = _mm256_setzero_ps(); ++- __m256 dotProdVal3 = _mm256_setzero_ps(); ++- ++- for(;number < thirtysecondPoints; number++){ ++- ++- a0Val = _mm256_load_ps(aPtr); ++- a1Val = _mm256_load_ps(aPtr+8); ++- a2Val = _mm256_load_ps(aPtr+16); ++- a3Val = _mm256_load_ps(aPtr+24); ++- b0Val = _mm256_load_ps(bPtr); ++- b1Val = _mm256_load_ps(bPtr+8); ++- b2Val = _mm256_load_ps(bPtr+16); ++- b3Val = _mm256_load_ps(bPtr+24); ++- ++- dotProdVal0 = _mm256_fmadd_ps(a0Val, b0Val, dotProdVal0); ++- dotProdVal1 = _mm256_fmadd_ps(a1Val, b1Val, dotProdVal1); ++- dotProdVal2 = _mm256_fmadd_ps(a2Val, b2Val, dotProdVal2); ++- dotProdVal3 = _mm256_fmadd_ps(a3Val, b3Val, dotProdVal3); ++- ++- aPtr += 32; ++- bPtr += 32; ++- } ++- ++- dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1); ++- dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2); ++- 
dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3); ++- ++- __VOLK_ATTR_ALIGNED(32) float dotProductVector[8]; ++- ++- _mm256_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector ++- ++- dotProduct = dotProductVector[0]; ++- dotProduct += dotProductVector[1]; ++- dotProduct += dotProductVector[2]; ++- dotProduct += dotProductVector[3]; ++- dotProduct += dotProductVector[4]; ++- dotProduct += dotProductVector[5]; ++- dotProduct += dotProductVector[6]; ++- dotProduct += dotProductVector[7]; ++- ++- number = thirtysecondPoints*32; ++- for(;number < num_points; number++){ ++- dotProduct += ((*aPtr++) * (*bPtr++)); ++- } ++- ++- *result = (short)dotProduct; +++static inline void volk_32f_x2_dot_prod_16i_a_avx2_fma(int16_t* result, +++ const float* input, +++ const float* taps, +++ unsigned int num_points) +++{ +++ +++ unsigned int number = 0; +++ const unsigned int thirtysecondPoints = num_points / 32; +++ +++ float dotProduct = 0; +++ const float* aPtr = input; +++ const float* bPtr = taps; +++ +++ __m256 a0Val, a1Val, a2Val, a3Val; +++ __m256 b0Val, b1Val, b2Val, b3Val; +++ +++ __m256 dotProdVal0 = _mm256_setzero_ps(); +++ __m256 dotProdVal1 = _mm256_setzero_ps(); +++ __m256 dotProdVal2 = _mm256_setzero_ps(); +++ __m256 dotProdVal3 = _mm256_setzero_ps(); +++ +++ for (; number < thirtysecondPoints; number++) { +++ +++ a0Val = _mm256_load_ps(aPtr); +++ a1Val = _mm256_load_ps(aPtr + 8); +++ a2Val = _mm256_load_ps(aPtr + 16); +++ a3Val = _mm256_load_ps(aPtr + 24); +++ b0Val = _mm256_load_ps(bPtr); +++ b1Val = _mm256_load_ps(bPtr + 8); +++ b2Val = _mm256_load_ps(bPtr + 16); +++ b3Val = _mm256_load_ps(bPtr + 24); +++ +++ dotProdVal0 = _mm256_fmadd_ps(a0Val, b0Val, dotProdVal0); +++ dotProdVal1 = _mm256_fmadd_ps(a1Val, b1Val, dotProdVal1); +++ dotProdVal2 = _mm256_fmadd_ps(a2Val, b2Val, dotProdVal2); +++ dotProdVal3 = _mm256_fmadd_ps(a3Val, b3Val, dotProdVal3); +++ +++ aPtr += 32; +++ bPtr += 32; +++ } +++ +++ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1); +++ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2); +++ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3); +++ +++ __VOLK_ATTR_ALIGNED(32) float dotProductVector[8]; +++ +++ _mm256_store_ps(dotProductVector, +++ dotProdVal0); // Store the results back into the dot product vector +++ +++ dotProduct = dotProductVector[0]; +++ dotProduct += dotProductVector[1]; +++ dotProduct += dotProductVector[2]; +++ dotProduct += dotProductVector[3]; +++ dotProduct += dotProductVector[4]; +++ dotProduct += dotProductVector[5]; +++ dotProduct += dotProductVector[6]; +++ dotProduct += dotProductVector[7]; +++ +++ number = thirtysecondPoints * 32; +++ for (; number < num_points; number++) { +++ dotProduct += ((*aPtr++) * (*bPtr++)); +++ } +++ +++ *result = (short)dotProduct; ++ } ++ ++ #endif /*LV_HAVE_AVX2 && LV_HAVE_FMA*/ ++@@ -220,146 +234,156 @@ static inline void volk_32f_x2_dot_prod_16i_a_avx2_fma(int16_t* result, const f ++ ++ #ifdef LV_HAVE_AVX ++ ++-static inline void volk_32f_x2_dot_prod_16i_a_avx(int16_t* result, const float* input, const float* taps, unsigned int num_points) { ++- ++- unsigned int number = 0; ++- const unsigned int thirtysecondPoints = num_points / 32; ++- ++- float dotProduct = 0; ++- const float* aPtr = input; ++- const float* bPtr = taps; ++- ++- __m256 a0Val, a1Val, a2Val, a3Val; ++- __m256 b0Val, b1Val, b2Val, b3Val; ++- __m256 c0Val, c1Val, c2Val, c3Val; ++- ++- __m256 dotProdVal0 = _mm256_setzero_ps(); ++- __m256 dotProdVal1 = _mm256_setzero_ps(); ++- __m256 
dotProdVal2 = _mm256_setzero_ps(); ++- __m256 dotProdVal3 = _mm256_setzero_ps(); ++- ++- for(;number < thirtysecondPoints; number++){ ++- ++- a0Val = _mm256_load_ps(aPtr); ++- a1Val = _mm256_load_ps(aPtr+8); ++- a2Val = _mm256_load_ps(aPtr+16); ++- a3Val = _mm256_load_ps(aPtr+24); ++- b0Val = _mm256_load_ps(bPtr); ++- b1Val = _mm256_load_ps(bPtr+8); ++- b2Val = _mm256_load_ps(bPtr+16); ++- b3Val = _mm256_load_ps(bPtr+24); ++- ++- c0Val = _mm256_mul_ps(a0Val, b0Val); ++- c1Val = _mm256_mul_ps(a1Val, b1Val); ++- c2Val = _mm256_mul_ps(a2Val, b2Val); ++- c3Val = _mm256_mul_ps(a3Val, b3Val); ++- ++- dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0); ++- dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1); ++- dotProdVal2 = _mm256_add_ps(c2Val, dotProdVal2); ++- dotProdVal3 = _mm256_add_ps(c3Val, dotProdVal3); ++- ++- aPtr += 32; ++- bPtr += 32; ++- } ++- ++- dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1); ++- dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2); ++- dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3); ++- ++- __VOLK_ATTR_ALIGNED(32) float dotProductVector[8]; ++- ++- _mm256_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector ++- ++- dotProduct = dotProductVector[0]; ++- dotProduct += dotProductVector[1]; ++- dotProduct += dotProductVector[2]; ++- dotProduct += dotProductVector[3]; ++- dotProduct += dotProductVector[4]; ++- dotProduct += dotProductVector[5]; ++- dotProduct += dotProductVector[6]; ++- dotProduct += dotProductVector[7]; ++- ++- number = thirtysecondPoints*32; ++- for(;number < num_points; number++){ ++- dotProduct += ((*aPtr++) * (*bPtr++)); ++- } ++- ++- *result = (short)dotProduct; +++static inline void volk_32f_x2_dot_prod_16i_a_avx(int16_t* result, +++ const float* input, +++ const float* taps, +++ unsigned int num_points) +++{ +++ +++ unsigned int number = 0; +++ const unsigned int thirtysecondPoints = num_points / 32; +++ +++ float dotProduct = 0; +++ const float* aPtr = input; +++ const float* bPtr = taps; +++ +++ __m256 a0Val, a1Val, a2Val, a3Val; +++ __m256 b0Val, b1Val, b2Val, b3Val; +++ __m256 c0Val, c1Val, c2Val, c3Val; +++ +++ __m256 dotProdVal0 = _mm256_setzero_ps(); +++ __m256 dotProdVal1 = _mm256_setzero_ps(); +++ __m256 dotProdVal2 = _mm256_setzero_ps(); +++ __m256 dotProdVal3 = _mm256_setzero_ps(); +++ +++ for (; number < thirtysecondPoints; number++) { +++ +++ a0Val = _mm256_load_ps(aPtr); +++ a1Val = _mm256_load_ps(aPtr + 8); +++ a2Val = _mm256_load_ps(aPtr + 16); +++ a3Val = _mm256_load_ps(aPtr + 24); +++ b0Val = _mm256_load_ps(bPtr); +++ b1Val = _mm256_load_ps(bPtr + 8); +++ b2Val = _mm256_load_ps(bPtr + 16); +++ b3Val = _mm256_load_ps(bPtr + 24); +++ +++ c0Val = _mm256_mul_ps(a0Val, b0Val); +++ c1Val = _mm256_mul_ps(a1Val, b1Val); +++ c2Val = _mm256_mul_ps(a2Val, b2Val); +++ c3Val = _mm256_mul_ps(a3Val, b3Val); +++ +++ dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0); +++ dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1); +++ dotProdVal2 = _mm256_add_ps(c2Val, dotProdVal2); +++ dotProdVal3 = _mm256_add_ps(c3Val, dotProdVal3); +++ +++ aPtr += 32; +++ bPtr += 32; +++ } +++ +++ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1); +++ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2); +++ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3); +++ +++ __VOLK_ATTR_ALIGNED(32) float dotProductVector[8]; +++ +++ _mm256_store_ps(dotProductVector, +++ dotProdVal0); // Store the results back into the dot product vector +++ +++ dotProduct = dotProductVector[0]; +++ dotProduct += dotProductVector[1]; +++ 
dotProduct += dotProductVector[2]; +++ dotProduct += dotProductVector[3]; +++ dotProduct += dotProductVector[4]; +++ dotProduct += dotProductVector[5]; +++ dotProduct += dotProductVector[6]; +++ dotProduct += dotProductVector[7]; +++ +++ number = thirtysecondPoints * 32; +++ for (; number < num_points; number++) { +++ dotProduct += ((*aPtr++) * (*bPtr++)); +++ } +++ +++ *result = (short)dotProduct; ++ } ++ ++ #endif /*LV_HAVE_AVX*/ ++ ++ #ifdef LV_HAVE_AVX512F ++ ++-static inline void volk_32f_x2_dot_prod_16i_a_avx512f(int16_t* result, const float* input, const float* taps, unsigned int num_points) { ++- ++- unsigned int number = 0; ++- const unsigned int sixtyfourthPoints = num_points / 64; ++- ++- float dotProduct = 0; ++- const float* aPtr = input; ++- const float* bPtr = taps; ++- ++- __m512 a0Val, a1Val, a2Val, a3Val; ++- __m512 b0Val, b1Val, b2Val, b3Val; ++- ++- __m512 dotProdVal0 = _mm512_setzero_ps(); ++- __m512 dotProdVal1 = _mm512_setzero_ps(); ++- __m512 dotProdVal2 = _mm512_setzero_ps(); ++- __m512 dotProdVal3 = _mm512_setzero_ps(); ++- ++- for(;number < sixtyfourthPoints; number++){ ++- ++- a0Val = _mm512_load_ps(aPtr); ++- a1Val = _mm512_load_ps(aPtr+16); ++- a2Val = _mm512_load_ps(aPtr+32); ++- a3Val = _mm512_load_ps(aPtr+48); ++- b0Val = _mm512_load_ps(bPtr); ++- b1Val = _mm512_load_ps(bPtr+16); ++- b2Val = _mm512_load_ps(bPtr+32); ++- b3Val = _mm512_load_ps(bPtr+48); ++- ++- dotProdVal0 = _mm512_fmadd_ps(a0Val, b0Val, dotProdVal0); ++- dotProdVal1 = _mm512_fmadd_ps(a1Val, b1Val, dotProdVal1); ++- dotProdVal2 = _mm512_fmadd_ps(a2Val, b2Val, dotProdVal2); ++- dotProdVal3 = _mm512_fmadd_ps(a3Val, b3Val, dotProdVal3); ++- ++- aPtr += 64; ++- bPtr += 64; ++- } ++- ++- dotProdVal0 = _mm512_add_ps(dotProdVal0, dotProdVal1); ++- dotProdVal0 = _mm512_add_ps(dotProdVal0, dotProdVal2); ++- dotProdVal0 = _mm512_add_ps(dotProdVal0, dotProdVal3); ++- ++- __VOLK_ATTR_ALIGNED(64) float dotProductVector[16]; ++- ++- _mm512_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector ++- ++- dotProduct = dotProductVector[0]; ++- dotProduct += dotProductVector[1]; ++- dotProduct += dotProductVector[2]; ++- dotProduct += dotProductVector[3]; ++- dotProduct += dotProductVector[4]; ++- dotProduct += dotProductVector[5]; ++- dotProduct += dotProductVector[6]; ++- dotProduct += dotProductVector[7]; ++- dotProduct += dotProductVector[8]; ++- dotProduct += dotProductVector[9]; ++- dotProduct += dotProductVector[10]; ++- dotProduct += dotProductVector[11]; ++- dotProduct += dotProductVector[12]; ++- dotProduct += dotProductVector[13]; ++- dotProduct += dotProductVector[14]; ++- dotProduct += dotProductVector[15]; ++- ++- number = sixtyfourthPoints*64; ++- for(;number < num_points; number++){ ++- dotProduct += ((*aPtr++) * (*bPtr++)); ++- } ++- ++- *result = (short)dotProduct; +++static inline void volk_32f_x2_dot_prod_16i_a_avx512f(int16_t* result, +++ const float* input, +++ const float* taps, +++ unsigned int num_points) +++{ +++ +++ unsigned int number = 0; +++ const unsigned int sixtyfourthPoints = num_points / 64; +++ +++ float dotProduct = 0; +++ const float* aPtr = input; +++ const float* bPtr = taps; +++ +++ __m512 a0Val, a1Val, a2Val, a3Val; +++ __m512 b0Val, b1Val, b2Val, b3Val; +++ +++ __m512 dotProdVal0 = _mm512_setzero_ps(); +++ __m512 dotProdVal1 = _mm512_setzero_ps(); +++ __m512 dotProdVal2 = _mm512_setzero_ps(); +++ __m512 dotProdVal3 = _mm512_setzero_ps(); +++ +++ for (; number < sixtyfourthPoints; number++) { +++ +++ a0Val = _mm512_load_ps(aPtr); 
+++ a1Val = _mm512_load_ps(aPtr + 16); +++ a2Val = _mm512_load_ps(aPtr + 32); +++ a3Val = _mm512_load_ps(aPtr + 48); +++ b0Val = _mm512_load_ps(bPtr); +++ b1Val = _mm512_load_ps(bPtr + 16); +++ b2Val = _mm512_load_ps(bPtr + 32); +++ b3Val = _mm512_load_ps(bPtr + 48); +++ +++ dotProdVal0 = _mm512_fmadd_ps(a0Val, b0Val, dotProdVal0); +++ dotProdVal1 = _mm512_fmadd_ps(a1Val, b1Val, dotProdVal1); +++ dotProdVal2 = _mm512_fmadd_ps(a2Val, b2Val, dotProdVal2); +++ dotProdVal3 = _mm512_fmadd_ps(a3Val, b3Val, dotProdVal3); +++ +++ aPtr += 64; +++ bPtr += 64; +++ } +++ +++ dotProdVal0 = _mm512_add_ps(dotProdVal0, dotProdVal1); +++ dotProdVal0 = _mm512_add_ps(dotProdVal0, dotProdVal2); +++ dotProdVal0 = _mm512_add_ps(dotProdVal0, dotProdVal3); +++ +++ __VOLK_ATTR_ALIGNED(64) float dotProductVector[16]; +++ +++ _mm512_store_ps(dotProductVector, +++ dotProdVal0); // Store the results back into the dot product vector +++ +++ dotProduct = dotProductVector[0]; +++ dotProduct += dotProductVector[1]; +++ dotProduct += dotProductVector[2]; +++ dotProduct += dotProductVector[3]; +++ dotProduct += dotProductVector[4]; +++ dotProduct += dotProductVector[5]; +++ dotProduct += dotProductVector[6]; +++ dotProduct += dotProductVector[7]; +++ dotProduct += dotProductVector[8]; +++ dotProduct += dotProductVector[9]; +++ dotProduct += dotProductVector[10]; +++ dotProduct += dotProductVector[11]; +++ dotProduct += dotProductVector[12]; +++ dotProduct += dotProductVector[13]; +++ dotProduct += dotProductVector[14]; +++ dotProduct += dotProductVector[15]; +++ +++ number = sixtyfourthPoints * 64; +++ for (; number < num_points; number++) { +++ dotProduct += ((*aPtr++) * (*bPtr++)); +++ } +++ +++ *result = (short)dotProduct; ++ } ++ ++ #endif /*LV_HAVE_AVX512F*/ ++@@ -367,68 +391,73 @@ static inline void volk_32f_x2_dot_prod_16i_a_avx512f(int16_t* result, const fl ++ ++ #ifdef LV_HAVE_SSE ++ ++-static inline void volk_32f_x2_dot_prod_16i_u_sse(int16_t* result, const float* input, const float* taps, unsigned int num_points) { ++- ++- unsigned int number = 0; ++- const unsigned int sixteenthPoints = num_points / 16; ++- ++- float dotProduct = 0; ++- const float* aPtr = input; ++- const float* bPtr = taps; ++- ++- __m128 a0Val, a1Val, a2Val, a3Val; ++- __m128 b0Val, b1Val, b2Val, b3Val; ++- __m128 c0Val, c1Val, c2Val, c3Val; ++- ++- __m128 dotProdVal0 = _mm_setzero_ps(); ++- __m128 dotProdVal1 = _mm_setzero_ps(); ++- __m128 dotProdVal2 = _mm_setzero_ps(); ++- __m128 dotProdVal3 = _mm_setzero_ps(); ++- ++- for(;number < sixteenthPoints; number++){ ++- ++- a0Val = _mm_loadu_ps(aPtr); ++- a1Val = _mm_loadu_ps(aPtr+4); ++- a2Val = _mm_loadu_ps(aPtr+8); ++- a3Val = _mm_loadu_ps(aPtr+12); ++- b0Val = _mm_loadu_ps(bPtr); ++- b1Val = _mm_loadu_ps(bPtr+4); ++- b2Val = _mm_loadu_ps(bPtr+8); ++- b3Val = _mm_loadu_ps(bPtr+12); ++- ++- c0Val = _mm_mul_ps(a0Val, b0Val); ++- c1Val = _mm_mul_ps(a1Val, b1Val); ++- c2Val = _mm_mul_ps(a2Val, b2Val); ++- c3Val = _mm_mul_ps(a3Val, b3Val); ++- ++- dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0); ++- dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1); ++- dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2); ++- dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3); ++- ++- aPtr += 16; ++- bPtr += 16; ++- } ++- ++- dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1); ++- dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2); ++- dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3); ++- ++- __VOLK_ATTR_ALIGNED(16) float dotProductVector[4]; ++- ++- _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot 
product vector ++- ++- dotProduct = dotProductVector[0]; ++- dotProduct += dotProductVector[1]; ++- dotProduct += dotProductVector[2]; ++- dotProduct += dotProductVector[3]; ++- ++- number = sixteenthPoints*16; ++- for(;number < num_points; number++){ ++- dotProduct += ((*aPtr++) * (*bPtr++)); ++- } ++- ++- *result = (short)dotProduct; +++static inline void volk_32f_x2_dot_prod_16i_u_sse(int16_t* result, +++ const float* input, +++ const float* taps, +++ unsigned int num_points) +++{ +++ +++ unsigned int number = 0; +++ const unsigned int sixteenthPoints = num_points / 16; +++ +++ float dotProduct = 0; +++ const float* aPtr = input; +++ const float* bPtr = taps; +++ +++ __m128 a0Val, a1Val, a2Val, a3Val; +++ __m128 b0Val, b1Val, b2Val, b3Val; +++ __m128 c0Val, c1Val, c2Val, c3Val; +++ +++ __m128 dotProdVal0 = _mm_setzero_ps(); +++ __m128 dotProdVal1 = _mm_setzero_ps(); +++ __m128 dotProdVal2 = _mm_setzero_ps(); +++ __m128 dotProdVal3 = _mm_setzero_ps(); +++ +++ for (; number < sixteenthPoints; number++) { +++ +++ a0Val = _mm_loadu_ps(aPtr); +++ a1Val = _mm_loadu_ps(aPtr + 4); +++ a2Val = _mm_loadu_ps(aPtr + 8); +++ a3Val = _mm_loadu_ps(aPtr + 12); +++ b0Val = _mm_loadu_ps(bPtr); +++ b1Val = _mm_loadu_ps(bPtr + 4); +++ b2Val = _mm_loadu_ps(bPtr + 8); +++ b3Val = _mm_loadu_ps(bPtr + 12); +++ +++ c0Val = _mm_mul_ps(a0Val, b0Val); +++ c1Val = _mm_mul_ps(a1Val, b1Val); +++ c2Val = _mm_mul_ps(a2Val, b2Val); +++ c3Val = _mm_mul_ps(a3Val, b3Val); +++ +++ dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0); +++ dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1); +++ dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2); +++ dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3); +++ +++ aPtr += 16; +++ bPtr += 16; +++ } +++ +++ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1); +++ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2); +++ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3); +++ +++ __VOLK_ATTR_ALIGNED(16) float dotProductVector[4]; +++ +++ _mm_store_ps(dotProductVector, +++ dotProdVal0); // Store the results back into the dot product vector +++ +++ dotProduct = dotProductVector[0]; +++ dotProduct += dotProductVector[1]; +++ dotProduct += dotProductVector[2]; +++ dotProduct += dotProductVector[3]; +++ +++ number = sixteenthPoints * 16; +++ for (; number < num_points; number++) { +++ dotProduct += ((*aPtr++) * (*bPtr++)); +++ } +++ +++ *result = (short)dotProduct; ++ } ++ ++ #endif /*LV_HAVE_SSE*/ ++@@ -436,66 +465,71 @@ static inline void volk_32f_x2_dot_prod_16i_u_sse(int16_t* result, const float* ++ ++ #if LV_HAVE_AVX2 && LV_HAVE_FMA ++ ++-static inline void volk_32f_x2_dot_prod_16i_u_avx2_fma(int16_t* result, const float* input, const float* taps, unsigned int num_points) { ++- ++- unsigned int number = 0; ++- const unsigned int thirtysecondPoints = num_points / 32; ++- ++- float dotProduct = 0; ++- const float* aPtr = input; ++- const float* bPtr = taps; ++- ++- __m256 a0Val, a1Val, a2Val, a3Val; ++- __m256 b0Val, b1Val, b2Val, b3Val; ++- ++- __m256 dotProdVal0 = _mm256_setzero_ps(); ++- __m256 dotProdVal1 = _mm256_setzero_ps(); ++- __m256 dotProdVal2 = _mm256_setzero_ps(); ++- __m256 dotProdVal3 = _mm256_setzero_ps(); ++- ++- for(;number < thirtysecondPoints; number++){ ++- ++- a0Val = _mm256_loadu_ps(aPtr); ++- a1Val = _mm256_loadu_ps(aPtr+8); ++- a2Val = _mm256_loadu_ps(aPtr+16); ++- a3Val = _mm256_loadu_ps(aPtr+24); ++- b0Val = _mm256_loadu_ps(bPtr); ++- b1Val = _mm256_loadu_ps(bPtr+8); ++- b2Val = _mm256_loadu_ps(bPtr+16); ++- b3Val = _mm256_loadu_ps(bPtr+24); ++- ++- dotProdVal0 = _mm256_fmadd_ps(a0Val, 
b0Val, dotProdVal0); ++- dotProdVal1 = _mm256_fmadd_ps(a1Val, b1Val, dotProdVal1); ++- dotProdVal2 = _mm256_fmadd_ps(a2Val, b2Val, dotProdVal2); ++- dotProdVal3 = _mm256_fmadd_ps(a3Val, b3Val, dotProdVal3); ++- ++- aPtr += 32; ++- bPtr += 32; ++- } ++- ++- dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1); ++- dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2); ++- dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3); ++- ++- __VOLK_ATTR_ALIGNED(32) float dotProductVector[8]; ++- ++- _mm256_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector ++- ++- dotProduct = dotProductVector[0]; ++- dotProduct += dotProductVector[1]; ++- dotProduct += dotProductVector[2]; ++- dotProduct += dotProductVector[3]; ++- dotProduct += dotProductVector[4]; ++- dotProduct += dotProductVector[5]; ++- dotProduct += dotProductVector[6]; ++- dotProduct += dotProductVector[7]; ++- ++- number = thirtysecondPoints*32; ++- for(;number < num_points; number++){ ++- dotProduct += ((*aPtr++) * (*bPtr++)); ++- } ++- ++- *result = (short)dotProduct; +++static inline void volk_32f_x2_dot_prod_16i_u_avx2_fma(int16_t* result, +++ const float* input, +++ const float* taps, +++ unsigned int num_points) +++{ +++ +++ unsigned int number = 0; +++ const unsigned int thirtysecondPoints = num_points / 32; +++ +++ float dotProduct = 0; +++ const float* aPtr = input; +++ const float* bPtr = taps; +++ +++ __m256 a0Val, a1Val, a2Val, a3Val; +++ __m256 b0Val, b1Val, b2Val, b3Val; +++ +++ __m256 dotProdVal0 = _mm256_setzero_ps(); +++ __m256 dotProdVal1 = _mm256_setzero_ps(); +++ __m256 dotProdVal2 = _mm256_setzero_ps(); +++ __m256 dotProdVal3 = _mm256_setzero_ps(); +++ +++ for (; number < thirtysecondPoints; number++) { +++ +++ a0Val = _mm256_loadu_ps(aPtr); +++ a1Val = _mm256_loadu_ps(aPtr + 8); +++ a2Val = _mm256_loadu_ps(aPtr + 16); +++ a3Val = _mm256_loadu_ps(aPtr + 24); +++ b0Val = _mm256_loadu_ps(bPtr); +++ b1Val = _mm256_loadu_ps(bPtr + 8); +++ b2Val = _mm256_loadu_ps(bPtr + 16); +++ b3Val = _mm256_loadu_ps(bPtr + 24); +++ +++ dotProdVal0 = _mm256_fmadd_ps(a0Val, b0Val, dotProdVal0); +++ dotProdVal1 = _mm256_fmadd_ps(a1Val, b1Val, dotProdVal1); +++ dotProdVal2 = _mm256_fmadd_ps(a2Val, b2Val, dotProdVal2); +++ dotProdVal3 = _mm256_fmadd_ps(a3Val, b3Val, dotProdVal3); +++ +++ aPtr += 32; +++ bPtr += 32; +++ } +++ +++ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1); +++ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2); +++ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3); +++ +++ __VOLK_ATTR_ALIGNED(32) float dotProductVector[8]; +++ +++ _mm256_store_ps(dotProductVector, +++ dotProdVal0); // Store the results back into the dot product vector +++ +++ dotProduct = dotProductVector[0]; +++ dotProduct += dotProductVector[1]; +++ dotProduct += dotProductVector[2]; +++ dotProduct += dotProductVector[3]; +++ dotProduct += dotProductVector[4]; +++ dotProduct += dotProductVector[5]; +++ dotProduct += dotProductVector[6]; +++ dotProduct += dotProductVector[7]; +++ +++ number = thirtysecondPoints * 32; +++ for (; number < num_points; number++) { +++ dotProduct += ((*aPtr++) * (*bPtr++)); +++ } +++ +++ *result = (short)dotProduct; ++ } ++ ++ #endif /*LV_HAVE_AVX2 && lV_HAVE_FMA*/ ++@@ -503,146 +537,156 @@ static inline void volk_32f_x2_dot_prod_16i_u_avx2_fma(int16_t* result, const f ++ ++ #ifdef LV_HAVE_AVX ++ ++-static inline void volk_32f_x2_dot_prod_16i_u_avx(int16_t* result, const float* input, const float* taps, unsigned int num_points) { ++- ++- unsigned int number = 0; ++- 
const unsigned int thirtysecondPoints = num_points / 32; ++- ++- float dotProduct = 0; ++- const float* aPtr = input; ++- const float* bPtr = taps; ++- ++- __m256 a0Val, a1Val, a2Val, a3Val; ++- __m256 b0Val, b1Val, b2Val, b3Val; ++- __m256 c0Val, c1Val, c2Val, c3Val; ++- ++- __m256 dotProdVal0 = _mm256_setzero_ps(); ++- __m256 dotProdVal1 = _mm256_setzero_ps(); ++- __m256 dotProdVal2 = _mm256_setzero_ps(); ++- __m256 dotProdVal3 = _mm256_setzero_ps(); ++- ++- for(;number < thirtysecondPoints; number++){ ++- ++- a0Val = _mm256_loadu_ps(aPtr); ++- a1Val = _mm256_loadu_ps(aPtr+8); ++- a2Val = _mm256_loadu_ps(aPtr+16); ++- a3Val = _mm256_loadu_ps(aPtr+24); ++- b0Val = _mm256_loadu_ps(bPtr); ++- b1Val = _mm256_loadu_ps(bPtr+8); ++- b2Val = _mm256_loadu_ps(bPtr+16); ++- b3Val = _mm256_loadu_ps(bPtr+24); ++- ++- c0Val = _mm256_mul_ps(a0Val, b0Val); ++- c1Val = _mm256_mul_ps(a1Val, b1Val); ++- c2Val = _mm256_mul_ps(a2Val, b2Val); ++- c3Val = _mm256_mul_ps(a3Val, b3Val); ++- ++- dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0); ++- dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1); ++- dotProdVal2 = _mm256_add_ps(c2Val, dotProdVal2); ++- dotProdVal3 = _mm256_add_ps(c3Val, dotProdVal3); ++- ++- aPtr += 32; ++- bPtr += 32; ++- } ++- ++- dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1); ++- dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2); ++- dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3); ++- ++- __VOLK_ATTR_ALIGNED(32) float dotProductVector[8]; ++- ++- _mm256_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector ++- ++- dotProduct = dotProductVector[0]; ++- dotProduct += dotProductVector[1]; ++- dotProduct += dotProductVector[2]; ++- dotProduct += dotProductVector[3]; ++- dotProduct += dotProductVector[4]; ++- dotProduct += dotProductVector[5]; ++- dotProduct += dotProductVector[6]; ++- dotProduct += dotProductVector[7]; ++- ++- number = thirtysecondPoints*32; ++- for(;number < num_points; number++){ ++- dotProduct += ((*aPtr++) * (*bPtr++)); ++- } ++- ++- *result = (short)dotProduct; +++static inline void volk_32f_x2_dot_prod_16i_u_avx(int16_t* result, +++ const float* input, +++ const float* taps, +++ unsigned int num_points) +++{ +++ +++ unsigned int number = 0; +++ const unsigned int thirtysecondPoints = num_points / 32; +++ +++ float dotProduct = 0; +++ const float* aPtr = input; +++ const float* bPtr = taps; +++ +++ __m256 a0Val, a1Val, a2Val, a3Val; +++ __m256 b0Val, b1Val, b2Val, b3Val; +++ __m256 c0Val, c1Val, c2Val, c3Val; +++ +++ __m256 dotProdVal0 = _mm256_setzero_ps(); +++ __m256 dotProdVal1 = _mm256_setzero_ps(); +++ __m256 dotProdVal2 = _mm256_setzero_ps(); +++ __m256 dotProdVal3 = _mm256_setzero_ps(); +++ +++ for (; number < thirtysecondPoints; number++) { +++ +++ a0Val = _mm256_loadu_ps(aPtr); +++ a1Val = _mm256_loadu_ps(aPtr + 8); +++ a2Val = _mm256_loadu_ps(aPtr + 16); +++ a3Val = _mm256_loadu_ps(aPtr + 24); +++ b0Val = _mm256_loadu_ps(bPtr); +++ b1Val = _mm256_loadu_ps(bPtr + 8); +++ b2Val = _mm256_loadu_ps(bPtr + 16); +++ b3Val = _mm256_loadu_ps(bPtr + 24); +++ +++ c0Val = _mm256_mul_ps(a0Val, b0Val); +++ c1Val = _mm256_mul_ps(a1Val, b1Val); +++ c2Val = _mm256_mul_ps(a2Val, b2Val); +++ c3Val = _mm256_mul_ps(a3Val, b3Val); +++ +++ dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0); +++ dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1); +++ dotProdVal2 = _mm256_add_ps(c2Val, dotProdVal2); +++ dotProdVal3 = _mm256_add_ps(c3Val, dotProdVal3); +++ +++ aPtr += 32; +++ bPtr += 32; +++ } +++ +++ dotProdVal0 = _mm256_add_ps(dotProdVal0, 
dotProdVal1); +++ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2); +++ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3); +++ +++ __VOLK_ATTR_ALIGNED(32) float dotProductVector[8]; +++ +++ _mm256_store_ps(dotProductVector, +++ dotProdVal0); // Store the results back into the dot product vector +++ +++ dotProduct = dotProductVector[0]; +++ dotProduct += dotProductVector[1]; +++ dotProduct += dotProductVector[2]; +++ dotProduct += dotProductVector[3]; +++ dotProduct += dotProductVector[4]; +++ dotProduct += dotProductVector[5]; +++ dotProduct += dotProductVector[6]; +++ dotProduct += dotProductVector[7]; +++ +++ number = thirtysecondPoints * 32; +++ for (; number < num_points; number++) { +++ dotProduct += ((*aPtr++) * (*bPtr++)); +++ } +++ +++ *result = (short)dotProduct; ++ } ++ ++ #endif /*LV_HAVE_AVX*/ ++ ++ #ifdef LV_HAVE_AVX512F ++ ++-static inline void volk_32f_x2_dot_prod_16i_u_avx512f(int16_t* result, const float* input, const float* taps, unsigned int num_points) { ++- ++- unsigned int number = 0; ++- const unsigned int sixtyfourthPoints = num_points / 64; ++- ++- float dotProduct = 0; ++- const float* aPtr = input; ++- const float* bPtr = taps; ++- ++- __m512 a0Val, a1Val, a2Val, a3Val; ++- __m512 b0Val, b1Val, b2Val, b3Val; ++- ++- __m512 dotProdVal0 = _mm512_setzero_ps(); ++- __m512 dotProdVal1 = _mm512_setzero_ps(); ++- __m512 dotProdVal2 = _mm512_setzero_ps(); ++- __m512 dotProdVal3 = _mm512_setzero_ps(); ++- ++- for(;number < sixtyfourthPoints; number++){ ++- ++- a0Val = _mm512_loadu_ps(aPtr); ++- a1Val = _mm512_loadu_ps(aPtr+16); ++- a2Val = _mm512_loadu_ps(aPtr+32); ++- a3Val = _mm512_loadu_ps(aPtr+48); ++- b0Val = _mm512_loadu_ps(bPtr); ++- b1Val = _mm512_loadu_ps(bPtr+16); ++- b2Val = _mm512_loadu_ps(bPtr+32); ++- b3Val = _mm512_loadu_ps(bPtr+48); ++- ++- dotProdVal0 = _mm512_fmadd_ps(a0Val, b0Val, dotProdVal0); ++- dotProdVal1 = _mm512_fmadd_ps(a1Val, b1Val, dotProdVal1); ++- dotProdVal2 = _mm512_fmadd_ps(a2Val, b2Val, dotProdVal2); ++- dotProdVal3 = _mm512_fmadd_ps(a3Val, b3Val, dotProdVal3); ++- ++- aPtr += 64; ++- bPtr += 64; ++- } ++- ++- dotProdVal0 = _mm512_add_ps(dotProdVal0, dotProdVal1); ++- dotProdVal0 = _mm512_add_ps(dotProdVal0, dotProdVal2); ++- dotProdVal0 = _mm512_add_ps(dotProdVal0, dotProdVal3); ++- ++- __VOLK_ATTR_ALIGNED(64) float dotProductVector[16]; ++- ++- _mm512_storeu_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector ++- ++- dotProduct = dotProductVector[0]; ++- dotProduct += dotProductVector[1]; ++- dotProduct += dotProductVector[2]; ++- dotProduct += dotProductVector[3]; ++- dotProduct += dotProductVector[4]; ++- dotProduct += dotProductVector[5]; ++- dotProduct += dotProductVector[6]; ++- dotProduct += dotProductVector[7]; ++- dotProduct += dotProductVector[8]; ++- dotProduct += dotProductVector[9]; ++- dotProduct += dotProductVector[10]; ++- dotProduct += dotProductVector[11]; ++- dotProduct += dotProductVector[12]; ++- dotProduct += dotProductVector[13]; ++- dotProduct += dotProductVector[14]; ++- dotProduct += dotProductVector[15]; ++- ++- number = sixtyfourthPoints*64; ++- for(;number < num_points; number++){ ++- dotProduct += ((*aPtr++) * (*bPtr++)); ++- } ++- ++- *result = (short)dotProduct; +++static inline void volk_32f_x2_dot_prod_16i_u_avx512f(int16_t* result, +++ const float* input, +++ const float* taps, +++ unsigned int num_points) +++{ +++ +++ unsigned int number = 0; +++ const unsigned int sixtyfourthPoints = num_points / 64; +++ +++ float dotProduct = 0; +++ const float* aPtr = 
input; +++ const float* bPtr = taps; +++ +++ __m512 a0Val, a1Val, a2Val, a3Val; +++ __m512 b0Val, b1Val, b2Val, b3Val; +++ +++ __m512 dotProdVal0 = _mm512_setzero_ps(); +++ __m512 dotProdVal1 = _mm512_setzero_ps(); +++ __m512 dotProdVal2 = _mm512_setzero_ps(); +++ __m512 dotProdVal3 = _mm512_setzero_ps(); +++ +++ for (; number < sixtyfourthPoints; number++) { +++ +++ a0Val = _mm512_loadu_ps(aPtr); +++ a1Val = _mm512_loadu_ps(aPtr + 16); +++ a2Val = _mm512_loadu_ps(aPtr + 32); +++ a3Val = _mm512_loadu_ps(aPtr + 48); +++ b0Val = _mm512_loadu_ps(bPtr); +++ b1Val = _mm512_loadu_ps(bPtr + 16); +++ b2Val = _mm512_loadu_ps(bPtr + 32); +++ b3Val = _mm512_loadu_ps(bPtr + 48); +++ +++ dotProdVal0 = _mm512_fmadd_ps(a0Val, b0Val, dotProdVal0); +++ dotProdVal1 = _mm512_fmadd_ps(a1Val, b1Val, dotProdVal1); +++ dotProdVal2 = _mm512_fmadd_ps(a2Val, b2Val, dotProdVal2); +++ dotProdVal3 = _mm512_fmadd_ps(a3Val, b3Val, dotProdVal3); +++ +++ aPtr += 64; +++ bPtr += 64; +++ } +++ +++ dotProdVal0 = _mm512_add_ps(dotProdVal0, dotProdVal1); +++ dotProdVal0 = _mm512_add_ps(dotProdVal0, dotProdVal2); +++ dotProdVal0 = _mm512_add_ps(dotProdVal0, dotProdVal3); +++ +++ __VOLK_ATTR_ALIGNED(64) float dotProductVector[16]; +++ +++ _mm512_storeu_ps(dotProductVector, +++ dotProdVal0); // Store the results back into the dot product vector +++ +++ dotProduct = dotProductVector[0]; +++ dotProduct += dotProductVector[1]; +++ dotProduct += dotProductVector[2]; +++ dotProduct += dotProductVector[3]; +++ dotProduct += dotProductVector[4]; +++ dotProduct += dotProductVector[5]; +++ dotProduct += dotProductVector[6]; +++ dotProduct += dotProductVector[7]; +++ dotProduct += dotProductVector[8]; +++ dotProduct += dotProductVector[9]; +++ dotProduct += dotProductVector[10]; +++ dotProduct += dotProductVector[11]; +++ dotProduct += dotProductVector[12]; +++ dotProduct += dotProductVector[13]; +++ dotProduct += dotProductVector[14]; +++ dotProduct += dotProductVector[15]; +++ +++ number = sixtyfourthPoints * 64; +++ for (; number < num_points; number++) { +++ dotProduct += ((*aPtr++) * (*bPtr++)); +++ } +++ +++ *result = (short)dotProduct; ++ } ++ ++ #endif /*LV_HAVE_AVX512F*/ ++diff --git a/kernels/volk/volk_32f_x2_dot_prod_32f.h b/kernels/volk/volk_32f_x2_dot_prod_32f.h ++index ea0f7ba..7854031 100644 ++--- a/kernels/volk/volk_32f_x2_dot_prod_32f.h +++++ b/kernels/volk/volk_32f_x2_dot_prod_32f.h ++@@ -33,8 +33,8 @@ ++ * ++ * Dispatcher Prototype ++ * \code ++- * void volk_32f_x2_dot_prod_32f(float* result, const float* input, const float* taps, unsigned int num_points) ++- * \endcode +++ * void volk_32f_x2_dot_prod_32f(float* result, const float* input, const float* taps, +++ * unsigned int num_points) \endcode ++ * ++ * \b Inputs ++ * \li input: vector of floats. ++@@ -45,10 +45,8 @@ ++ * \li result: pointer to a float value to hold the dot product result. ++ * ++ * \b Example ++- * Take the dot product of an increasing vector and a vector of ones. The result is the sum of integers (0,9). ++- * \code ++- * int N = 10; ++- * unsigned int alignment = volk_get_alignment(); +++ * Take the dot product of an increasing vector and a vector of ones. The result is the +++ * sum of integers (0,9). 
\code int N = 10; unsigned int alignment = volk_get_alignment(); ++ * float* increasing = (float*)volk_malloc(sizeof(float)*N, alignment); ++ * float* ones = (float*)volk_malloc(sizeof(float)*N, alignment); ++ * float* out = (float*)volk_malloc(sizeof(float)*1, alignment); ++@@ -73,25 +71,29 @@ ++ #ifndef INCLUDED_volk_32f_x2_dot_prod_32f_u_H ++ #define INCLUDED_volk_32f_x2_dot_prod_32f_u_H ++ +++#include ++ #include ++-#include ++ ++ ++ #ifdef LV_HAVE_GENERIC ++ ++ ++-static inline void volk_32f_x2_dot_prod_32f_generic(float * result, const float * input, const float * taps, unsigned int num_points) { +++static inline void volk_32f_x2_dot_prod_32f_generic(float* result, +++ const float* input, +++ const float* taps, +++ unsigned int num_points) +++{ ++ ++- float dotProduct = 0; ++- const float* aPtr = input; ++- const float* bPtr= taps; ++- unsigned int number = 0; +++ float dotProduct = 0; +++ const float* aPtr = input; +++ const float* bPtr = taps; +++ unsigned int number = 0; ++ ++- for(number = 0; number < num_points; number++){ ++- dotProduct += ((*aPtr++) * (*bPtr++)); ++- } +++ for (number = 0; number < num_points; number++) { +++ dotProduct += ((*aPtr++) * (*bPtr++)); +++ } ++ ++- *result = dotProduct; +++ *result = dotProduct; ++ } ++ ++ #endif /*LV_HAVE_GENERIC*/ ++@@ -100,69 +102,73 @@ static inline void volk_32f_x2_dot_prod_32f_generic(float * result, const float ++ #ifdef LV_HAVE_SSE ++ ++ ++-static inline void volk_32f_x2_dot_prod_32f_u_sse( float* result, const float* input, const float* taps, unsigned int num_points) { ++- ++- unsigned int number = 0; ++- const unsigned int sixteenthPoints = num_points / 16; ++- ++- float dotProduct = 0; ++- const float* aPtr = input; ++- const float* bPtr = taps; +++static inline void volk_32f_x2_dot_prod_32f_u_sse(float* result, +++ const float* input, +++ const float* taps, +++ unsigned int num_points) +++{ ++ ++- __m128 a0Val, a1Val, a2Val, a3Val; ++- __m128 b0Val, b1Val, b2Val, b3Val; ++- __m128 c0Val, c1Val, c2Val, c3Val; ++- ++- __m128 dotProdVal0 = _mm_setzero_ps(); ++- __m128 dotProdVal1 = _mm_setzero_ps(); ++- __m128 dotProdVal2 = _mm_setzero_ps(); ++- __m128 dotProdVal3 = _mm_setzero_ps(); ++- ++- for(;number < sixteenthPoints; number++){ ++- ++- a0Val = _mm_loadu_ps(aPtr); ++- a1Val = _mm_loadu_ps(aPtr+4); ++- a2Val = _mm_loadu_ps(aPtr+8); ++- a3Val = _mm_loadu_ps(aPtr+12); ++- b0Val = _mm_loadu_ps(bPtr); ++- b1Val = _mm_loadu_ps(bPtr+4); ++- b2Val = _mm_loadu_ps(bPtr+8); ++- b3Val = _mm_loadu_ps(bPtr+12); ++- ++- c0Val = _mm_mul_ps(a0Val, b0Val); ++- c1Val = _mm_mul_ps(a1Val, b1Val); ++- c2Val = _mm_mul_ps(a2Val, b2Val); ++- c3Val = _mm_mul_ps(a3Val, b3Val); ++- ++- dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0); ++- dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1); ++- dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2); ++- dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3); +++ unsigned int number = 0; +++ const unsigned int sixteenthPoints = num_points / 16; ++ ++- aPtr += 16; ++- bPtr += 16; ++- } +++ float dotProduct = 0; +++ const float* aPtr = input; +++ const float* bPtr = taps; +++ +++ __m128 a0Val, a1Val, a2Val, a3Val; +++ __m128 b0Val, b1Val, b2Val, b3Val; +++ __m128 c0Val, c1Val, c2Val, c3Val; +++ +++ __m128 dotProdVal0 = _mm_setzero_ps(); +++ __m128 dotProdVal1 = _mm_setzero_ps(); +++ __m128 dotProdVal2 = _mm_setzero_ps(); +++ __m128 dotProdVal3 = _mm_setzero_ps(); +++ +++ for (; number < sixteenthPoints; number++) { +++ +++ a0Val = _mm_loadu_ps(aPtr); +++ a1Val = _mm_loadu_ps(aPtr + 4); +++ a2Val = _mm_loadu_ps(aPtr + 8); +++ 
a3Val = _mm_loadu_ps(aPtr + 12); +++ b0Val = _mm_loadu_ps(bPtr); +++ b1Val = _mm_loadu_ps(bPtr + 4); +++ b2Val = _mm_loadu_ps(bPtr + 8); +++ b3Val = _mm_loadu_ps(bPtr + 12); +++ +++ c0Val = _mm_mul_ps(a0Val, b0Val); +++ c1Val = _mm_mul_ps(a1Val, b1Val); +++ c2Val = _mm_mul_ps(a2Val, b2Val); +++ c3Val = _mm_mul_ps(a3Val, b3Val); +++ +++ dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0); +++ dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1); +++ dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2); +++ dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3); ++ ++- dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1); ++- dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2); ++- dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3); +++ aPtr += 16; +++ bPtr += 16; +++ } ++ ++- __VOLK_ATTR_ALIGNED(16) float dotProductVector[4]; +++ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1); +++ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2); +++ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3); ++ ++- _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector +++ __VOLK_ATTR_ALIGNED(16) float dotProductVector[4]; ++ ++- dotProduct = dotProductVector[0]; ++- dotProduct += dotProductVector[1]; ++- dotProduct += dotProductVector[2]; ++- dotProduct += dotProductVector[3]; +++ _mm_store_ps(dotProductVector, +++ dotProdVal0); // Store the results back into the dot product vector ++ ++- number = sixteenthPoints*16; ++- for(;number < num_points; number++){ ++- dotProduct += ((*aPtr++) * (*bPtr++)); ++- } +++ dotProduct = dotProductVector[0]; +++ dotProduct += dotProductVector[1]; +++ dotProduct += dotProductVector[2]; +++ dotProduct += dotProductVector[3]; ++ ++- *result = dotProduct; +++ number = sixteenthPoints * 16; +++ for (; number < num_points; number++) { +++ dotProduct += ((*aPtr++) * (*bPtr++)); +++ } ++ +++ *result = dotProduct; ++ } ++ ++ #endif /*LV_HAVE_SSE*/ ++@@ -171,127 +177,145 @@ static inline void volk_32f_x2_dot_prod_32f_u_sse( float* result, const float* ++ ++ #include ++ ++-static inline void volk_32f_x2_dot_prod_32f_u_sse3(float * result, const float * input, const float * taps, unsigned int num_points) { ++- unsigned int number = 0; ++- const unsigned int sixteenthPoints = num_points / 16; ++- ++- float dotProduct = 0; ++- const float* aPtr = input; ++- const float* bPtr = taps; ++- ++- __m128 a0Val, a1Val, a2Val, a3Val; ++- __m128 b0Val, b1Val, b2Val, b3Val; ++- __m128 c0Val, c1Val, c2Val, c3Val; ++- ++- __m128 dotProdVal0 = _mm_setzero_ps(); ++- __m128 dotProdVal1 = _mm_setzero_ps(); ++- __m128 dotProdVal2 = _mm_setzero_ps(); ++- __m128 dotProdVal3 = _mm_setzero_ps(); ++- ++- for(;number < sixteenthPoints; number++){ ++- ++- a0Val = _mm_loadu_ps(aPtr); ++- a1Val = _mm_loadu_ps(aPtr+4); ++- a2Val = _mm_loadu_ps(aPtr+8); ++- a3Val = _mm_loadu_ps(aPtr+12); ++- b0Val = _mm_loadu_ps(bPtr); ++- b1Val = _mm_loadu_ps(bPtr+4); ++- b2Val = _mm_loadu_ps(bPtr+8); ++- b3Val = _mm_loadu_ps(bPtr+12); ++- ++- c0Val = _mm_mul_ps(a0Val, b0Val); ++- c1Val = _mm_mul_ps(a1Val, b1Val); ++- c2Val = _mm_mul_ps(a2Val, b2Val); ++- c3Val = _mm_mul_ps(a3Val, b3Val); ++- ++- dotProdVal0 = _mm_add_ps(dotProdVal0, c0Val); ++- dotProdVal1 = _mm_add_ps(dotProdVal1, c1Val); ++- dotProdVal2 = _mm_add_ps(dotProdVal2, c2Val); ++- dotProdVal3 = _mm_add_ps(dotProdVal3, c3Val); ++- ++- aPtr += 16; ++- bPtr += 16; ++- } ++- ++- dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1); ++- dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2); ++- dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3); ++- ++- 
__VOLK_ATTR_ALIGNED(16) float dotProductVector[4]; ++- _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector ++- ++- dotProduct = dotProductVector[0]; ++- dotProduct += dotProductVector[1]; ++- dotProduct += dotProductVector[2]; ++- dotProduct += dotProductVector[3]; ++- ++- number = sixteenthPoints*16; ++- for(;number < num_points; number++){ ++- dotProduct += ((*aPtr++) * (*bPtr++)); ++- } ++- ++- *result = dotProduct; ++-} ++- ++-#endif /*LV_HAVE_SSE3*/ +++static inline void volk_32f_x2_dot_prod_32f_u_sse3(float* result, +++ const float* input, +++ const float* taps, +++ unsigned int num_points) +++{ +++ unsigned int number = 0; +++ const unsigned int sixteenthPoints = num_points / 16; ++ ++-#ifdef LV_HAVE_SSE4_1 +++ float dotProduct = 0; +++ const float* aPtr = input; +++ const float* bPtr = taps; +++ +++ __m128 a0Val, a1Val, a2Val, a3Val; +++ __m128 b0Val, b1Val, b2Val, b3Val; +++ __m128 c0Val, c1Val, c2Val, c3Val; +++ +++ __m128 dotProdVal0 = _mm_setzero_ps(); +++ __m128 dotProdVal1 = _mm_setzero_ps(); +++ __m128 dotProdVal2 = _mm_setzero_ps(); +++ __m128 dotProdVal3 = _mm_setzero_ps(); +++ +++ for (; number < sixteenthPoints; number++) { +++ +++ a0Val = _mm_loadu_ps(aPtr); +++ a1Val = _mm_loadu_ps(aPtr + 4); +++ a2Val = _mm_loadu_ps(aPtr + 8); +++ a3Val = _mm_loadu_ps(aPtr + 12); +++ b0Val = _mm_loadu_ps(bPtr); +++ b1Val = _mm_loadu_ps(bPtr + 4); +++ b2Val = _mm_loadu_ps(bPtr + 8); +++ b3Val = _mm_loadu_ps(bPtr + 12); +++ +++ c0Val = _mm_mul_ps(a0Val, b0Val); +++ c1Val = _mm_mul_ps(a1Val, b1Val); +++ c2Val = _mm_mul_ps(a2Val, b2Val); +++ c3Val = _mm_mul_ps(a3Val, b3Val); +++ +++ dotProdVal0 = _mm_add_ps(dotProdVal0, c0Val); +++ dotProdVal1 = _mm_add_ps(dotProdVal1, c1Val); +++ dotProdVal2 = _mm_add_ps(dotProdVal2, c2Val); +++ dotProdVal3 = _mm_add_ps(dotProdVal3, c3Val); ++ ++-#include +++ aPtr += 16; +++ bPtr += 16; +++ } ++ ++-static inline void volk_32f_x2_dot_prod_32f_u_sse4_1(float * result, const float * input, const float* taps, unsigned int num_points) { ++- unsigned int number = 0; ++- const unsigned int sixteenthPoints = num_points / 16; +++ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1); +++ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2); +++ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3); ++ ++- float dotProduct = 0; ++- const float* aPtr = input; ++- const float* bPtr = taps; +++ __VOLK_ATTR_ALIGNED(16) float dotProductVector[4]; +++ _mm_store_ps(dotProductVector, +++ dotProdVal0); // Store the results back into the dot product vector ++ ++- __m128 aVal1, bVal1, cVal1; ++- __m128 aVal2, bVal2, cVal2; ++- __m128 aVal3, bVal3, cVal3; ++- __m128 aVal4, bVal4, cVal4; +++ dotProduct = dotProductVector[0]; +++ dotProduct += dotProductVector[1]; +++ dotProduct += dotProductVector[2]; +++ dotProduct += dotProductVector[3]; ++ ++- __m128 dotProdVal = _mm_setzero_ps(); +++ number = sixteenthPoints * 16; +++ for (; number < num_points; number++) { +++ dotProduct += ((*aPtr++) * (*bPtr++)); +++ } ++ ++- for(;number < sixteenthPoints; number++){ +++ *result = dotProduct; +++} ++ ++- aVal1 = _mm_loadu_ps(aPtr); aPtr += 4; ++- aVal2 = _mm_loadu_ps(aPtr); aPtr += 4; ++- aVal3 = _mm_loadu_ps(aPtr); aPtr += 4; ++- aVal4 = _mm_loadu_ps(aPtr); aPtr += 4; +++#endif /*LV_HAVE_SSE3*/ ++ ++- bVal1 = _mm_loadu_ps(bPtr); bPtr += 4; ++- bVal2 = _mm_loadu_ps(bPtr); bPtr += 4; ++- bVal3 = _mm_loadu_ps(bPtr); bPtr += 4; ++- bVal4 = _mm_loadu_ps(bPtr); bPtr += 4; +++#ifdef LV_HAVE_SSE4_1 ++ ++- cVal1 = _mm_dp_ps(aVal1, bVal1, 0xF1); ++- cVal2 = 
_mm_dp_ps(aVal2, bVal2, 0xF2); ++- cVal3 = _mm_dp_ps(aVal3, bVal3, 0xF4); ++- cVal4 = _mm_dp_ps(aVal4, bVal4, 0xF8); +++#include ++ ++- cVal1 = _mm_or_ps(cVal1, cVal2); ++- cVal3 = _mm_or_ps(cVal3, cVal4); ++- cVal1 = _mm_or_ps(cVal1, cVal3); +++static inline void volk_32f_x2_dot_prod_32f_u_sse4_1(float* result, +++ const float* input, +++ const float* taps, +++ unsigned int num_points) +++{ +++ unsigned int number = 0; +++ const unsigned int sixteenthPoints = num_points / 16; ++ ++- dotProdVal = _mm_add_ps(dotProdVal, cVal1); ++- } +++ float dotProduct = 0; +++ const float* aPtr = input; +++ const float* bPtr = taps; +++ +++ __m128 aVal1, bVal1, cVal1; +++ __m128 aVal2, bVal2, cVal2; +++ __m128 aVal3, bVal3, cVal3; +++ __m128 aVal4, bVal4, cVal4; +++ +++ __m128 dotProdVal = _mm_setzero_ps(); +++ +++ for (; number < sixteenthPoints; number++) { +++ +++ aVal1 = _mm_loadu_ps(aPtr); +++ aPtr += 4; +++ aVal2 = _mm_loadu_ps(aPtr); +++ aPtr += 4; +++ aVal3 = _mm_loadu_ps(aPtr); +++ aPtr += 4; +++ aVal4 = _mm_loadu_ps(aPtr); +++ aPtr += 4; +++ +++ bVal1 = _mm_loadu_ps(bPtr); +++ bPtr += 4; +++ bVal2 = _mm_loadu_ps(bPtr); +++ bPtr += 4; +++ bVal3 = _mm_loadu_ps(bPtr); +++ bPtr += 4; +++ bVal4 = _mm_loadu_ps(bPtr); +++ bPtr += 4; +++ +++ cVal1 = _mm_dp_ps(aVal1, bVal1, 0xF1); +++ cVal2 = _mm_dp_ps(aVal2, bVal2, 0xF2); +++ cVal3 = _mm_dp_ps(aVal3, bVal3, 0xF4); +++ cVal4 = _mm_dp_ps(aVal4, bVal4, 0xF8); +++ +++ cVal1 = _mm_or_ps(cVal1, cVal2); +++ cVal3 = _mm_or_ps(cVal3, cVal4); +++ cVal1 = _mm_or_ps(cVal1, cVal3); +++ +++ dotProdVal = _mm_add_ps(dotProdVal, cVal1); +++ } ++ ++- __VOLK_ATTR_ALIGNED(16) float dotProductVector[4]; ++- _mm_store_ps(dotProductVector, dotProdVal); // Store the results back into the dot product vector +++ __VOLK_ATTR_ALIGNED(16) float dotProductVector[4]; +++ _mm_store_ps(dotProductVector, +++ dotProdVal); // Store the results back into the dot product vector ++ ++- dotProduct = dotProductVector[0]; ++- dotProduct += dotProductVector[1]; ++- dotProduct += dotProductVector[2]; ++- dotProduct += dotProductVector[3]; +++ dotProduct = dotProductVector[0]; +++ dotProduct += dotProductVector[1]; +++ dotProduct += dotProductVector[2]; +++ dotProduct += dotProductVector[3]; ++ ++- number = sixteenthPoints * 16; ++- for(;number < num_points; number++){ ++- dotProduct += ((*aPtr++) * (*bPtr++)); ++- } +++ number = sixteenthPoints * 16; +++ for (; number < num_points; number++) { +++ dotProduct += ((*aPtr++) * (*bPtr++)); +++ } ++ ++- *result = dotProduct; +++ *result = dotProduct; ++ } ++ ++ #endif /*LV_HAVE_SSE4_1*/ ++@@ -300,147 +324,154 @@ static inline void volk_32f_x2_dot_prod_32f_u_sse4_1(float * result, const float ++ ++ #include ++ ++-static inline void volk_32f_x2_dot_prod_32f_u_avx( float* result, const float* input, const float* taps, unsigned int num_points) { +++static inline void volk_32f_x2_dot_prod_32f_u_avx(float* result, +++ const float* input, +++ const float* taps, +++ unsigned int num_points) +++{ ++ ++- unsigned int number = 0; ++- const unsigned int sixteenthPoints = num_points / 16; ++- ++- float dotProduct = 0; ++- const float* aPtr = input; ++- const float* bPtr = taps; +++ unsigned int number = 0; +++ const unsigned int sixteenthPoints = num_points / 16; ++ ++- __m256 a0Val, a1Val; ++- __m256 b0Val, b1Val; ++- __m256 c0Val, c1Val; +++ float dotProduct = 0; +++ const float* aPtr = input; +++ const float* bPtr = taps; ++ ++- __m256 dotProdVal0 = _mm256_setzero_ps(); ++- __m256 dotProdVal1 = _mm256_setzero_ps(); +++ __m256 a0Val, a1Val; +++ __m256 b0Val, 
b1Val; +++ __m256 c0Val, c1Val; ++ ++- for(;number < sixteenthPoints; number++){ +++ __m256 dotProdVal0 = _mm256_setzero_ps(); +++ __m256 dotProdVal1 = _mm256_setzero_ps(); ++ ++- a0Val = _mm256_loadu_ps(aPtr); ++- a1Val = _mm256_loadu_ps(aPtr+8); ++- b0Val = _mm256_loadu_ps(bPtr); ++- b1Val = _mm256_loadu_ps(bPtr+8); +++ for (; number < sixteenthPoints; number++) { ++ ++- c0Val = _mm256_mul_ps(a0Val, b0Val); ++- c1Val = _mm256_mul_ps(a1Val, b1Val); +++ a0Val = _mm256_loadu_ps(aPtr); +++ a1Val = _mm256_loadu_ps(aPtr + 8); +++ b0Val = _mm256_loadu_ps(bPtr); +++ b1Val = _mm256_loadu_ps(bPtr + 8); ++ ++- dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0); ++- dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1); +++ c0Val = _mm256_mul_ps(a0Val, b0Val); +++ c1Val = _mm256_mul_ps(a1Val, b1Val); ++ ++- aPtr += 16; ++- bPtr += 16; ++- } +++ dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0); +++ dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1); ++ ++- dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1); +++ aPtr += 16; +++ bPtr += 16; +++ } ++ ++- __VOLK_ATTR_ALIGNED(32) float dotProductVector[8]; +++ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1); ++ ++- _mm256_storeu_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector +++ __VOLK_ATTR_ALIGNED(32) float dotProductVector[8]; ++ ++- dotProduct = dotProductVector[0]; ++- dotProduct += dotProductVector[1]; ++- dotProduct += dotProductVector[2]; ++- dotProduct += dotProductVector[3]; ++- dotProduct += dotProductVector[4]; ++- dotProduct += dotProductVector[5]; ++- dotProduct += dotProductVector[6]; ++- dotProduct += dotProductVector[7]; +++ _mm256_storeu_ps(dotProductVector, +++ dotProdVal0); // Store the results back into the dot product vector ++ ++- number = sixteenthPoints*16; ++- for(;number < num_points; number++){ ++- dotProduct += ((*aPtr++) * (*bPtr++)); ++- } +++ dotProduct = dotProductVector[0]; +++ dotProduct += dotProductVector[1]; +++ dotProduct += dotProductVector[2]; +++ dotProduct += dotProductVector[3]; +++ dotProduct += dotProductVector[4]; +++ dotProduct += dotProductVector[5]; +++ dotProduct += dotProductVector[6]; +++ dotProduct += dotProductVector[7]; ++ ++- *result = dotProduct; +++ number = sixteenthPoints * 16; +++ for (; number < num_points; number++) { +++ dotProduct += ((*aPtr++) * (*bPtr++)); +++ } ++ +++ *result = dotProduct; ++ } ++ ++ #endif /*LV_HAVE_AVX*/ ++ ++ #if LV_HAVE_AVX2 && LV_HAVE_FMA ++ #include ++-static inline void volk_32f_x2_dot_prod_32f_u_avx2_fma(float * result, const float * input, const float* taps, unsigned int num_points){ ++- unsigned int number; ++- const unsigned int eighthPoints = num_points / 8; +++static inline void volk_32f_x2_dot_prod_32f_u_avx2_fma(float* result, +++ const float* input, +++ const float* taps, +++ unsigned int num_points) +++{ +++ unsigned int number; +++ const unsigned int eighthPoints = num_points / 8; ++ ++- const float* aPtr = input; ++- const float* bPtr = taps; ++- ++- __m256 dotProdVal = _mm256_setzero_ps(); ++- __m256 aVal1, bVal1; +++ const float* aPtr = input; +++ const float* bPtr = taps; ++ ++- for (number = 0; number < eighthPoints; number++ ) { +++ __m256 dotProdVal = _mm256_setzero_ps(); +++ __m256 aVal1, bVal1; ++ ++- aVal1 = _mm256_loadu_ps(aPtr); ++- bVal1 = _mm256_loadu_ps(bPtr); ++- aPtr += 8; ++- bPtr += 8; +++ for (number = 0; number < eighthPoints; number++) { ++ ++- dotProdVal = _mm256_fmadd_ps(aVal1, bVal1, dotProdVal); ++- } +++ aVal1 = _mm256_loadu_ps(aPtr); +++ bVal1 = _mm256_loadu_ps(bPtr); +++ aPtr += 8; +++ 
bPtr += 8; ++ ++- __VOLK_ATTR_ALIGNED(32) float dotProductVector[8]; ++- _mm256_storeu_ps(dotProductVector, dotProdVal); // Store the results back into the dot product vector ++- _mm256_zeroupper(); +++ dotProdVal = _mm256_fmadd_ps(aVal1, bVal1, dotProdVal); +++ } ++ ++- float dotProduct = ++- dotProductVector[0] + dotProductVector[1] + ++- dotProductVector[2] + dotProductVector[3] + ++- dotProductVector[4] + dotProductVector[5] + ++- dotProductVector[6] + dotProductVector[7]; +++ __VOLK_ATTR_ALIGNED(32) float dotProductVector[8]; +++ _mm256_storeu_ps(dotProductVector, +++ dotProdVal); // Store the results back into the dot product vector +++ _mm256_zeroupper(); ++ ++- for(number = eighthPoints * 8; number < num_points; number++){ ++- dotProduct += ((*aPtr++) * (*bPtr++)); ++- } +++ float dotProduct = dotProductVector[0] + dotProductVector[1] + dotProductVector[2] + +++ dotProductVector[3] + dotProductVector[4] + dotProductVector[5] + +++ dotProductVector[6] + dotProductVector[7]; ++ ++- *result = dotProduct; +++ for (number = eighthPoints * 8; number < num_points; number++) { +++ dotProduct += ((*aPtr++) * (*bPtr++)); +++ } ++ +++ *result = dotProduct; ++ } ++ #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA */ ++ ++ #if LV_HAVE_AVX512F ++ #include ++-static inline void volk_32f_x2_dot_prod_32f_u_avx512f(float * result, const float * input, const float* taps, unsigned int num_points){ ++- unsigned int number; ++- const unsigned int sixteenthPoints = num_points / 16; ++- ++- const float* aPtr = input; ++- const float* bPtr = taps; +++static inline void volk_32f_x2_dot_prod_32f_u_avx512f(float* result, +++ const float* input, +++ const float* taps, +++ unsigned int num_points) +++{ +++ unsigned int number; +++ const unsigned int sixteenthPoints = num_points / 16; ++ ++- __m512 dotProdVal = _mm512_setzero_ps(); ++- __m512 aVal1, bVal1; +++ const float* aPtr = input; +++ const float* bPtr = taps; ++ ++- for (number = 0; number < sixteenthPoints; number++ ) { +++ __m512 dotProdVal = _mm512_setzero_ps(); +++ __m512 aVal1, bVal1; ++ ++- aVal1 = _mm512_loadu_ps(aPtr); ++- bVal1 = _mm512_loadu_ps(bPtr); ++- aPtr += 16; ++- bPtr += 16; +++ for (number = 0; number < sixteenthPoints; number++) { ++ ++- dotProdVal = _mm512_fmadd_ps(aVal1, bVal1, dotProdVal); ++- } +++ aVal1 = _mm512_loadu_ps(aPtr); +++ bVal1 = _mm512_loadu_ps(bPtr); +++ aPtr += 16; +++ bPtr += 16; ++ ++- __VOLK_ATTR_ALIGNED(64) float dotProductVector[16]; ++- _mm512_storeu_ps(dotProductVector, dotProdVal); // Store the results back into the dot product vector +++ dotProdVal = _mm512_fmadd_ps(aVal1, bVal1, dotProdVal); +++ } ++ ++- float dotProduct = ++- dotProductVector[0] + dotProductVector[1] + ++- dotProductVector[2] + dotProductVector[3] + ++- dotProductVector[4] + dotProductVector[5] + ++- dotProductVector[6] + dotProductVector[7] + ++- dotProductVector[8] + dotProductVector[9] + ++- dotProductVector[10] + dotProductVector[11] + ++- dotProductVector[12] + dotProductVector[13] + ++- dotProductVector[14] + dotProductVector[15]; +++ __VOLK_ATTR_ALIGNED(64) float dotProductVector[16]; +++ _mm512_storeu_ps(dotProductVector, +++ dotProdVal); // Store the results back into the dot product vector ++ ++- for(number = sixteenthPoints * 16; number < num_points; number++){ ++- dotProduct += ((*aPtr++) * (*bPtr++)); ++- } +++ float dotProduct = dotProductVector[0] + dotProductVector[1] + dotProductVector[2] + +++ dotProductVector[3] + dotProductVector[4] + dotProductVector[5] + +++ dotProductVector[6] + dotProductVector[7] + dotProductVector[8] + +++ 
dotProductVector[9] + dotProductVector[10] + dotProductVector[11] + +++ dotProductVector[12] + dotProductVector[13] + +++ dotProductVector[14] + dotProductVector[15]; ++ ++- *result = dotProduct; +++ for (number = sixteenthPoints * 16; number < num_points; number++) { +++ dotProduct += ((*aPtr++) * (*bPtr++)); +++ } ++ +++ *result = dotProduct; ++ } ++ #endif /* LV_HAVE_AVX512F */ ++ ++@@ -449,25 +480,29 @@ static inline void volk_32f_x2_dot_prod_32f_u_avx512f(float * result, const floa ++ #ifndef INCLUDED_volk_32f_x2_dot_prod_32f_a_H ++ #define INCLUDED_volk_32f_x2_dot_prod_32f_a_H ++ +++#include ++ #include ++-#include ++ ++ ++ #ifdef LV_HAVE_GENERIC ++ ++ ++-static inline void volk_32f_x2_dot_prod_32f_a_generic(float * result, const float * input, const float * taps, unsigned int num_points) { +++static inline void volk_32f_x2_dot_prod_32f_a_generic(float* result, +++ const float* input, +++ const float* taps, +++ unsigned int num_points) +++{ ++ ++- float dotProduct = 0; ++- const float* aPtr = input; ++- const float* bPtr= taps; ++- unsigned int number = 0; +++ float dotProduct = 0; +++ const float* aPtr = input; +++ const float* bPtr = taps; +++ unsigned int number = 0; ++ ++- for(number = 0; number < num_points; number++){ ++- dotProduct += ((*aPtr++) * (*bPtr++)); ++- } +++ for (number = 0; number < num_points; number++) { +++ dotProduct += ((*aPtr++) * (*bPtr++)); +++ } ++ ++- *result = dotProduct; +++ *result = dotProduct; ++ } ++ ++ #endif /*LV_HAVE_GENERIC*/ ++@@ -476,69 +511,73 @@ static inline void volk_32f_x2_dot_prod_32f_a_generic(float * result, const floa ++ #ifdef LV_HAVE_SSE ++ ++ ++-static inline void volk_32f_x2_dot_prod_32f_a_sse( float* result, const float* input, const float* taps, unsigned int num_points) { ++- ++- unsigned int number = 0; ++- const unsigned int sixteenthPoints = num_points / 16; ++- ++- float dotProduct = 0; ++- const float* aPtr = input; ++- const float* bPtr = taps; ++- ++- __m128 a0Val, a1Val, a2Val, a3Val; ++- __m128 b0Val, b1Val, b2Val, b3Val; ++- __m128 c0Val, c1Val, c2Val, c3Val; ++- ++- __m128 dotProdVal0 = _mm_setzero_ps(); ++- __m128 dotProdVal1 = _mm_setzero_ps(); ++- __m128 dotProdVal2 = _mm_setzero_ps(); ++- __m128 dotProdVal3 = _mm_setzero_ps(); +++static inline void volk_32f_x2_dot_prod_32f_a_sse(float* result, +++ const float* input, +++ const float* taps, +++ unsigned int num_points) +++{ ++ ++- for(;number < sixteenthPoints; number++){ ++- ++- a0Val = _mm_load_ps(aPtr); ++- a1Val = _mm_load_ps(aPtr+4); ++- a2Val = _mm_load_ps(aPtr+8); ++- a3Val = _mm_load_ps(aPtr+12); ++- b0Val = _mm_load_ps(bPtr); ++- b1Val = _mm_load_ps(bPtr+4); ++- b2Val = _mm_load_ps(bPtr+8); ++- b3Val = _mm_load_ps(bPtr+12); ++- ++- c0Val = _mm_mul_ps(a0Val, b0Val); ++- c1Val = _mm_mul_ps(a1Val, b1Val); ++- c2Val = _mm_mul_ps(a2Val, b2Val); ++- c3Val = _mm_mul_ps(a3Val, b3Val); ++- ++- dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0); ++- dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1); ++- dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2); ++- dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3); +++ unsigned int number = 0; +++ const unsigned int sixteenthPoints = num_points / 16; ++ ++- aPtr += 16; ++- bPtr += 16; ++- } +++ float dotProduct = 0; +++ const float* aPtr = input; +++ const float* bPtr = taps; +++ +++ __m128 a0Val, a1Val, a2Val, a3Val; +++ __m128 b0Val, b1Val, b2Val, b3Val; +++ __m128 c0Val, c1Val, c2Val, c3Val; +++ +++ __m128 dotProdVal0 = _mm_setzero_ps(); +++ __m128 dotProdVal1 = _mm_setzero_ps(); +++ __m128 dotProdVal2 = _mm_setzero_ps(); +++ __m128 
dotProdVal3 = _mm_setzero_ps(); +++ +++ for (; number < sixteenthPoints; number++) { +++ +++ a0Val = _mm_load_ps(aPtr); +++ a1Val = _mm_load_ps(aPtr + 4); +++ a2Val = _mm_load_ps(aPtr + 8); +++ a3Val = _mm_load_ps(aPtr + 12); +++ b0Val = _mm_load_ps(bPtr); +++ b1Val = _mm_load_ps(bPtr + 4); +++ b2Val = _mm_load_ps(bPtr + 8); +++ b3Val = _mm_load_ps(bPtr + 12); +++ +++ c0Val = _mm_mul_ps(a0Val, b0Val); +++ c1Val = _mm_mul_ps(a1Val, b1Val); +++ c2Val = _mm_mul_ps(a2Val, b2Val); +++ c3Val = _mm_mul_ps(a3Val, b3Val); +++ +++ dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0); +++ dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1); +++ dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2); +++ dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3); ++ ++- dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1); ++- dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2); ++- dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3); +++ aPtr += 16; +++ bPtr += 16; +++ } ++ ++- __VOLK_ATTR_ALIGNED(16) float dotProductVector[4]; +++ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1); +++ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2); +++ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3); ++ ++- _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector +++ __VOLK_ATTR_ALIGNED(16) float dotProductVector[4]; ++ ++- dotProduct = dotProductVector[0]; ++- dotProduct += dotProductVector[1]; ++- dotProduct += dotProductVector[2]; ++- dotProduct += dotProductVector[3]; +++ _mm_store_ps(dotProductVector, +++ dotProdVal0); // Store the results back into the dot product vector ++ ++- number = sixteenthPoints*16; ++- for(;number < num_points; number++){ ++- dotProduct += ((*aPtr++) * (*bPtr++)); ++- } +++ dotProduct = dotProductVector[0]; +++ dotProduct += dotProductVector[1]; +++ dotProduct += dotProductVector[2]; +++ dotProduct += dotProductVector[3]; ++ ++- *result = dotProduct; +++ number = sixteenthPoints * 16; +++ for (; number < num_points; number++) { +++ dotProduct += ((*aPtr++) * (*bPtr++)); +++ } ++ +++ *result = dotProduct; ++ } ++ ++ #endif /*LV_HAVE_SSE*/ ++@@ -547,127 +586,145 @@ static inline void volk_32f_x2_dot_prod_32f_a_sse( float* result, const float* ++ ++ #include ++ ++-static inline void volk_32f_x2_dot_prod_32f_a_sse3(float * result, const float * input, const float * taps, unsigned int num_points) { ++- unsigned int number = 0; ++- const unsigned int sixteenthPoints = num_points / 16; ++- ++- float dotProduct = 0; ++- const float* aPtr = input; ++- const float* bPtr = taps; ++- ++- __m128 a0Val, a1Val, a2Val, a3Val; ++- __m128 b0Val, b1Val, b2Val, b3Val; ++- __m128 c0Val, c1Val, c2Val, c3Val; ++- ++- __m128 dotProdVal0 = _mm_setzero_ps(); ++- __m128 dotProdVal1 = _mm_setzero_ps(); ++- __m128 dotProdVal2 = _mm_setzero_ps(); ++- __m128 dotProdVal3 = _mm_setzero_ps(); ++- ++- for(;number < sixteenthPoints; number++){ ++- ++- a0Val = _mm_load_ps(aPtr); ++- a1Val = _mm_load_ps(aPtr+4); ++- a2Val = _mm_load_ps(aPtr+8); ++- a3Val = _mm_load_ps(aPtr+12); ++- b0Val = _mm_load_ps(bPtr); ++- b1Val = _mm_load_ps(bPtr+4); ++- b2Val = _mm_load_ps(bPtr+8); ++- b3Val = _mm_load_ps(bPtr+12); ++- ++- c0Val = _mm_mul_ps(a0Val, b0Val); ++- c1Val = _mm_mul_ps(a1Val, b1Val); ++- c2Val = _mm_mul_ps(a2Val, b2Val); ++- c3Val = _mm_mul_ps(a3Val, b3Val); ++- ++- dotProdVal0 = _mm_add_ps(dotProdVal0, c0Val); ++- dotProdVal1 = _mm_add_ps(dotProdVal1, c1Val); ++- dotProdVal2 = _mm_add_ps(dotProdVal2, c2Val); ++- dotProdVal3 = _mm_add_ps(dotProdVal3, c3Val); ++- ++- aPtr += 16; ++- bPtr += 16; ++- } ++- ++- 
dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1); ++- dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2); ++- dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3); ++- ++- __VOLK_ATTR_ALIGNED(16) float dotProductVector[4]; ++- _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector ++- ++- dotProduct = dotProductVector[0]; ++- dotProduct += dotProductVector[1]; ++- dotProduct += dotProductVector[2]; ++- dotProduct += dotProductVector[3]; ++- ++- number = sixteenthPoints*16; ++- for(;number < num_points; number++){ ++- dotProduct += ((*aPtr++) * (*bPtr++)); ++- } ++- ++- *result = dotProduct; ++-} ++- ++-#endif /*LV_HAVE_SSE3*/ +++static inline void volk_32f_x2_dot_prod_32f_a_sse3(float* result, +++ const float* input, +++ const float* taps, +++ unsigned int num_points) +++{ +++ unsigned int number = 0; +++ const unsigned int sixteenthPoints = num_points / 16; ++ ++-#ifdef LV_HAVE_SSE4_1 +++ float dotProduct = 0; +++ const float* aPtr = input; +++ const float* bPtr = taps; +++ +++ __m128 a0Val, a1Val, a2Val, a3Val; +++ __m128 b0Val, b1Val, b2Val, b3Val; +++ __m128 c0Val, c1Val, c2Val, c3Val; +++ +++ __m128 dotProdVal0 = _mm_setzero_ps(); +++ __m128 dotProdVal1 = _mm_setzero_ps(); +++ __m128 dotProdVal2 = _mm_setzero_ps(); +++ __m128 dotProdVal3 = _mm_setzero_ps(); +++ +++ for (; number < sixteenthPoints; number++) { +++ +++ a0Val = _mm_load_ps(aPtr); +++ a1Val = _mm_load_ps(aPtr + 4); +++ a2Val = _mm_load_ps(aPtr + 8); +++ a3Val = _mm_load_ps(aPtr + 12); +++ b0Val = _mm_load_ps(bPtr); +++ b1Val = _mm_load_ps(bPtr + 4); +++ b2Val = _mm_load_ps(bPtr + 8); +++ b3Val = _mm_load_ps(bPtr + 12); +++ +++ c0Val = _mm_mul_ps(a0Val, b0Val); +++ c1Val = _mm_mul_ps(a1Val, b1Val); +++ c2Val = _mm_mul_ps(a2Val, b2Val); +++ c3Val = _mm_mul_ps(a3Val, b3Val); +++ +++ dotProdVal0 = _mm_add_ps(dotProdVal0, c0Val); +++ dotProdVal1 = _mm_add_ps(dotProdVal1, c1Val); +++ dotProdVal2 = _mm_add_ps(dotProdVal2, c2Val); +++ dotProdVal3 = _mm_add_ps(dotProdVal3, c3Val); ++ ++-#include +++ aPtr += 16; +++ bPtr += 16; +++ } ++ ++-static inline void volk_32f_x2_dot_prod_32f_a_sse4_1(float * result, const float * input, const float* taps, unsigned int num_points) { ++- unsigned int number = 0; ++- const unsigned int sixteenthPoints = num_points / 16; +++ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1); +++ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2); +++ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3); ++ ++- float dotProduct = 0; ++- const float* aPtr = input; ++- const float* bPtr = taps; +++ __VOLK_ATTR_ALIGNED(16) float dotProductVector[4]; +++ _mm_store_ps(dotProductVector, +++ dotProdVal0); // Store the results back into the dot product vector ++ ++- __m128 aVal1, bVal1, cVal1; ++- __m128 aVal2, bVal2, cVal2; ++- __m128 aVal3, bVal3, cVal3; ++- __m128 aVal4, bVal4, cVal4; +++ dotProduct = dotProductVector[0]; +++ dotProduct += dotProductVector[1]; +++ dotProduct += dotProductVector[2]; +++ dotProduct += dotProductVector[3]; ++ ++- __m128 dotProdVal = _mm_setzero_ps(); +++ number = sixteenthPoints * 16; +++ for (; number < num_points; number++) { +++ dotProduct += ((*aPtr++) * (*bPtr++)); +++ } ++ ++- for(;number < sixteenthPoints; number++){ +++ *result = dotProduct; +++} ++ ++- aVal1 = _mm_load_ps(aPtr); aPtr += 4; ++- aVal2 = _mm_load_ps(aPtr); aPtr += 4; ++- aVal3 = _mm_load_ps(aPtr); aPtr += 4; ++- aVal4 = _mm_load_ps(aPtr); aPtr += 4; +++#endif /*LV_HAVE_SSE3*/ ++ ++- bVal1 = _mm_load_ps(bPtr); bPtr += 4; ++- bVal2 = _mm_load_ps(bPtr); bPtr += 4; ++- bVal3 = 
_mm_load_ps(bPtr); bPtr += 4; ++- bVal4 = _mm_load_ps(bPtr); bPtr += 4; +++#ifdef LV_HAVE_SSE4_1 ++ ++- cVal1 = _mm_dp_ps(aVal1, bVal1, 0xF1); ++- cVal2 = _mm_dp_ps(aVal2, bVal2, 0xF2); ++- cVal3 = _mm_dp_ps(aVal3, bVal3, 0xF4); ++- cVal4 = _mm_dp_ps(aVal4, bVal4, 0xF8); +++#include ++ ++- cVal1 = _mm_or_ps(cVal1, cVal2); ++- cVal3 = _mm_or_ps(cVal3, cVal4); ++- cVal1 = _mm_or_ps(cVal1, cVal3); +++static inline void volk_32f_x2_dot_prod_32f_a_sse4_1(float* result, +++ const float* input, +++ const float* taps, +++ unsigned int num_points) +++{ +++ unsigned int number = 0; +++ const unsigned int sixteenthPoints = num_points / 16; ++ ++- dotProdVal = _mm_add_ps(dotProdVal, cVal1); ++- } +++ float dotProduct = 0; +++ const float* aPtr = input; +++ const float* bPtr = taps; +++ +++ __m128 aVal1, bVal1, cVal1; +++ __m128 aVal2, bVal2, cVal2; +++ __m128 aVal3, bVal3, cVal3; +++ __m128 aVal4, bVal4, cVal4; +++ +++ __m128 dotProdVal = _mm_setzero_ps(); +++ +++ for (; number < sixteenthPoints; number++) { +++ +++ aVal1 = _mm_load_ps(aPtr); +++ aPtr += 4; +++ aVal2 = _mm_load_ps(aPtr); +++ aPtr += 4; +++ aVal3 = _mm_load_ps(aPtr); +++ aPtr += 4; +++ aVal4 = _mm_load_ps(aPtr); +++ aPtr += 4; +++ +++ bVal1 = _mm_load_ps(bPtr); +++ bPtr += 4; +++ bVal2 = _mm_load_ps(bPtr); +++ bPtr += 4; +++ bVal3 = _mm_load_ps(bPtr); +++ bPtr += 4; +++ bVal4 = _mm_load_ps(bPtr); +++ bPtr += 4; +++ +++ cVal1 = _mm_dp_ps(aVal1, bVal1, 0xF1); +++ cVal2 = _mm_dp_ps(aVal2, bVal2, 0xF2); +++ cVal3 = _mm_dp_ps(aVal3, bVal3, 0xF4); +++ cVal4 = _mm_dp_ps(aVal4, bVal4, 0xF8); +++ +++ cVal1 = _mm_or_ps(cVal1, cVal2); +++ cVal3 = _mm_or_ps(cVal3, cVal4); +++ cVal1 = _mm_or_ps(cVal1, cVal3); +++ +++ dotProdVal = _mm_add_ps(dotProdVal, cVal1); +++ } ++ ++- __VOLK_ATTR_ALIGNED(16) float dotProductVector[4]; ++- _mm_store_ps(dotProductVector, dotProdVal); // Store the results back into the dot product vector +++ __VOLK_ATTR_ALIGNED(16) float dotProductVector[4]; +++ _mm_store_ps(dotProductVector, +++ dotProdVal); // Store the results back into the dot product vector ++ ++- dotProduct = dotProductVector[0]; ++- dotProduct += dotProductVector[1]; ++- dotProduct += dotProductVector[2]; ++- dotProduct += dotProductVector[3]; +++ dotProduct = dotProductVector[0]; +++ dotProduct += dotProductVector[1]; +++ dotProduct += dotProductVector[2]; +++ dotProduct += dotProductVector[3]; ++ ++- number = sixteenthPoints * 16; ++- for(;number < num_points; number++){ ++- dotProduct += ((*aPtr++) * (*bPtr++)); ++- } +++ number = sixteenthPoints * 16; +++ for (; number < num_points; number++) { +++ dotProduct += ((*aPtr++) * (*bPtr++)); +++ } ++ ++- *result = dotProduct; +++ *result = dotProduct; ++ } ++ ++ #endif /*LV_HAVE_SSE4_1*/ ++@@ -676,159 +733,170 @@ static inline void volk_32f_x2_dot_prod_32f_a_sse4_1(float * result, const float ++ ++ #include ++ ++-static inline void volk_32f_x2_dot_prod_32f_a_avx( float* result, const float* input, const float* taps, unsigned int num_points) { +++static inline void volk_32f_x2_dot_prod_32f_a_avx(float* result, +++ const float* input, +++ const float* taps, +++ unsigned int num_points) +++{ ++ ++- unsigned int number = 0; ++- const unsigned int sixteenthPoints = num_points / 16; ++- ++- float dotProduct = 0; ++- const float* aPtr = input; ++- const float* bPtr = taps; +++ unsigned int number = 0; +++ const unsigned int sixteenthPoints = num_points / 16; ++ ++- __m256 a0Val, a1Val; ++- __m256 b0Val, b1Val; ++- __m256 c0Val, c1Val; +++ float dotProduct = 0; +++ const float* aPtr = input; +++ const float* bPtr = 
taps; ++ ++- __m256 dotProdVal0 = _mm256_setzero_ps(); ++- __m256 dotProdVal1 = _mm256_setzero_ps(); +++ __m256 a0Val, a1Val; +++ __m256 b0Val, b1Val; +++ __m256 c0Val, c1Val; ++ ++- for(;number < sixteenthPoints; number++){ +++ __m256 dotProdVal0 = _mm256_setzero_ps(); +++ __m256 dotProdVal1 = _mm256_setzero_ps(); ++ ++- a0Val = _mm256_load_ps(aPtr); ++- a1Val = _mm256_load_ps(aPtr+8); ++- b0Val = _mm256_load_ps(bPtr); ++- b1Val = _mm256_load_ps(bPtr+8); +++ for (; number < sixteenthPoints; number++) { ++ ++- c0Val = _mm256_mul_ps(a0Val, b0Val); ++- c1Val = _mm256_mul_ps(a1Val, b1Val); +++ a0Val = _mm256_load_ps(aPtr); +++ a1Val = _mm256_load_ps(aPtr + 8); +++ b0Val = _mm256_load_ps(bPtr); +++ b1Val = _mm256_load_ps(bPtr + 8); ++ ++- dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0); ++- dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1); +++ c0Val = _mm256_mul_ps(a0Val, b0Val); +++ c1Val = _mm256_mul_ps(a1Val, b1Val); ++ ++- aPtr += 16; ++- bPtr += 16; ++- } +++ dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0); +++ dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1); ++ ++- dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1); +++ aPtr += 16; +++ bPtr += 16; +++ } ++ ++- __VOLK_ATTR_ALIGNED(32) float dotProductVector[8]; +++ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1); ++ ++- _mm256_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector +++ __VOLK_ATTR_ALIGNED(32) float dotProductVector[8]; ++ ++- dotProduct = dotProductVector[0]; ++- dotProduct += dotProductVector[1]; ++- dotProduct += dotProductVector[2]; ++- dotProduct += dotProductVector[3]; ++- dotProduct += dotProductVector[4]; ++- dotProduct += dotProductVector[5]; ++- dotProduct += dotProductVector[6]; ++- dotProduct += dotProductVector[7]; +++ _mm256_store_ps(dotProductVector, +++ dotProdVal0); // Store the results back into the dot product vector ++ ++- number = sixteenthPoints*16; ++- for(;number < num_points; number++){ ++- dotProduct += ((*aPtr++) * (*bPtr++)); ++- } +++ dotProduct = dotProductVector[0]; +++ dotProduct += dotProductVector[1]; +++ dotProduct += dotProductVector[2]; +++ dotProduct += dotProductVector[3]; +++ dotProduct += dotProductVector[4]; +++ dotProduct += dotProductVector[5]; +++ dotProduct += dotProductVector[6]; +++ dotProduct += dotProductVector[7]; ++ ++- *result = dotProduct; +++ number = sixteenthPoints * 16; +++ for (; number < num_points; number++) { +++ dotProduct += ((*aPtr++) * (*bPtr++)); +++ } ++ +++ *result = dotProduct; ++ } ++ #endif /*LV_HAVE_AVX*/ ++ ++ ++ #if LV_HAVE_AVX2 && LV_HAVE_FMA ++ #include ++-static inline void volk_32f_x2_dot_prod_32f_a_avx2_fma(float * result, const float * input, const float* taps, unsigned int num_points){ ++- unsigned int number; ++- const unsigned int eighthPoints = num_points / 8; ++- ++- const float* aPtr = input; ++- const float* bPtr = taps; +++static inline void volk_32f_x2_dot_prod_32f_a_avx2_fma(float* result, +++ const float* input, +++ const float* taps, +++ unsigned int num_points) +++{ +++ unsigned int number; +++ const unsigned int eighthPoints = num_points / 8; ++ ++- __m256 dotProdVal = _mm256_setzero_ps(); ++- __m256 aVal1, bVal1; +++ const float* aPtr = input; +++ const float* bPtr = taps; ++ ++- for (number = 0; number < eighthPoints; number++ ) { +++ __m256 dotProdVal = _mm256_setzero_ps(); +++ __m256 aVal1, bVal1; ++ ++- aVal1 = _mm256_load_ps(aPtr); ++- bVal1 = _mm256_load_ps(bPtr); ++- aPtr += 8; ++- bPtr += 8; +++ for (number = 0; number < eighthPoints; number++) { ++ ++- dotProdVal = 
_mm256_fmadd_ps(aVal1, bVal1, dotProdVal); ++- } +++ aVal1 = _mm256_load_ps(aPtr); +++ bVal1 = _mm256_load_ps(bPtr); +++ aPtr += 8; +++ bPtr += 8; ++ ++- __VOLK_ATTR_ALIGNED(32) float dotProductVector[8]; ++- _mm256_store_ps(dotProductVector, dotProdVal); // Store the results back into the dot product vector ++- _mm256_zeroupper(); +++ dotProdVal = _mm256_fmadd_ps(aVal1, bVal1, dotProdVal); +++ } ++ ++- float dotProduct = ++- dotProductVector[0] + dotProductVector[1] + ++- dotProductVector[2] + dotProductVector[3] + ++- dotProductVector[4] + dotProductVector[5] + ++- dotProductVector[6] + dotProductVector[7]; +++ __VOLK_ATTR_ALIGNED(32) float dotProductVector[8]; +++ _mm256_store_ps(dotProductVector, +++ dotProdVal); // Store the results back into the dot product vector +++ _mm256_zeroupper(); ++ ++- for(number = eighthPoints * 8; number < num_points; number++){ ++- dotProduct += ((*aPtr++) * (*bPtr++)); ++- } +++ float dotProduct = dotProductVector[0] + dotProductVector[1] + dotProductVector[2] + +++ dotProductVector[3] + dotProductVector[4] + dotProductVector[5] + +++ dotProductVector[6] + dotProductVector[7]; ++ ++- *result = dotProduct; +++ for (number = eighthPoints * 8; number < num_points; number++) { +++ dotProduct += ((*aPtr++) * (*bPtr++)); +++ } ++ +++ *result = dotProduct; ++ } ++ #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA */ ++ ++ #if LV_HAVE_AVX512F ++ #include ++-static inline void volk_32f_x2_dot_prod_32f_a_avx512f(float * result, const float * input, const float* taps, unsigned int num_points){ ++- unsigned int number; ++- const unsigned int sixteenthPoints = num_points / 16; ++- ++- const float* aPtr = input; ++- const float* bPtr = taps; +++static inline void volk_32f_x2_dot_prod_32f_a_avx512f(float* result, +++ const float* input, +++ const float* taps, +++ unsigned int num_points) +++{ +++ unsigned int number; +++ const unsigned int sixteenthPoints = num_points / 16; ++ ++- __m512 dotProdVal = _mm512_setzero_ps(); ++- __m512 aVal1, bVal1; +++ const float* aPtr = input; +++ const float* bPtr = taps; ++ ++- for (number = 0; number < sixteenthPoints; number++ ) { +++ __m512 dotProdVal = _mm512_setzero_ps(); +++ __m512 aVal1, bVal1; ++ ++- aVal1 = _mm512_load_ps(aPtr); ++- bVal1 = _mm512_load_ps(bPtr); ++- aPtr += 16; ++- bPtr += 16; +++ for (number = 0; number < sixteenthPoints; number++) { ++ ++- dotProdVal = _mm512_fmadd_ps(aVal1, bVal1, dotProdVal); ++- } +++ aVal1 = _mm512_load_ps(aPtr); +++ bVal1 = _mm512_load_ps(bPtr); +++ aPtr += 16; +++ bPtr += 16; ++ ++- __VOLK_ATTR_ALIGNED(64) float dotProductVector[16]; ++- _mm512_store_ps(dotProductVector, dotProdVal); // Store the results back into the dot product vector +++ dotProdVal = _mm512_fmadd_ps(aVal1, bVal1, dotProdVal); +++ } ++ ++- float dotProduct = ++- dotProductVector[0] + dotProductVector[1] + ++- dotProductVector[2] + dotProductVector[3] + ++- dotProductVector[4] + dotProductVector[5] + ++- dotProductVector[6] + dotProductVector[7] + ++- dotProductVector[8] + dotProductVector[9] + ++- dotProductVector[10] + dotProductVector[11] + ++- dotProductVector[12] + dotProductVector[13] + ++- dotProductVector[14] + dotProductVector[15]; +++ __VOLK_ATTR_ALIGNED(64) float dotProductVector[16]; +++ _mm512_store_ps(dotProductVector, +++ dotProdVal); // Store the results back into the dot product vector ++ ++- for(number = sixteenthPoints * 16; number < num_points; number++){ ++- dotProduct += ((*aPtr++) * (*bPtr++)); ++- } +++ float dotProduct = dotProductVector[0] + dotProductVector[1] + dotProductVector[2] + +++ 
dotProductVector[3] + dotProductVector[4] + dotProductVector[5] + +++ dotProductVector[6] + dotProductVector[7] + dotProductVector[8] + +++ dotProductVector[9] + dotProductVector[10] + dotProductVector[11] + +++ dotProductVector[12] + dotProductVector[13] + +++ dotProductVector[14] + dotProductVector[15]; ++ ++- *result = dotProduct; +++ for (number = sixteenthPoints * 16; number < num_points; number++) { +++ dotProduct += ((*aPtr++) * (*bPtr++)); +++ } ++ +++ *result = dotProduct; ++ } ++ #endif /* LV_HAVE_AVX512F */ ++ ++ #ifdef LV_HAVE_NEON ++ #include ++ ++-static inline void volk_32f_x2_dot_prod_32f_neonopts(float * result, const float * input, const float * taps, unsigned int num_points) { +++static inline void volk_32f_x2_dot_prod_32f_neonopts(float* result, +++ const float* input, +++ const float* taps, +++ unsigned int num_points) +++{ ++ ++ unsigned int quarter_points = num_points / 16; ++ float dotProduct = 0; ++ const float* aPtr = input; ++- const float* bPtr= taps; +++ const float* bPtr = taps; ++ unsigned int number = 0; ++ ++ float32x4x4_t a_val, b_val, accumulator0; ++@@ -838,7 +906,7 @@ static inline void volk_32f_x2_dot_prod_32f_neonopts(float * result, const float ++ accumulator0.val[3] = vdupq_n_f32(0); ++ // factor of 4 loop unroll with independent accumulators ++ // uses 12 out of 16 neon q registers ++- for( number = 0; number < quarter_points; ++number) { +++ for (number = 0; number < quarter_points; ++number) { ++ a_val = vld4q_f32(aPtr); ++ b_val = vld4q_f32(bPtr); ++ accumulator0.val[0] = vmlaq_f32(accumulator0.val[0], a_val.val[0], b_val.val[0]); ++@@ -855,8 +923,8 @@ static inline void volk_32f_x2_dot_prod_32f_neonopts(float * result, const float ++ vst1q_f32(accumulator, accumulator0.val[0]); ++ dotProduct = accumulator[0] + accumulator[1] + accumulator[2] + accumulator[3]; ++ ++- for(number = quarter_points*16; number < num_points; number++){ ++- dotProduct += ((*aPtr++) * (*bPtr++)); +++ for (number = quarter_points * 16; number < num_points; number++) { +++ dotProduct += ((*aPtr++) * (*bPtr++)); ++ } ++ ++ *result = dotProduct; ++@@ -865,26 +933,30 @@ static inline void volk_32f_x2_dot_prod_32f_neonopts(float * result, const float ++ #endif ++ ++ ++- ++- ++ #ifdef LV_HAVE_NEON ++-static inline void volk_32f_x2_dot_prod_32f_neon(float * result, const float * input, const float * taps, unsigned int num_points) { +++static inline void volk_32f_x2_dot_prod_32f_neon(float* result, +++ const float* input, +++ const float* taps, +++ unsigned int num_points) +++{ ++ ++ unsigned int quarter_points = num_points / 8; ++ float dotProduct = 0; ++ const float* aPtr = input; ++- const float* bPtr= taps; +++ const float* bPtr = taps; ++ unsigned int number = 0; ++ ++ float32x4x2_t a_val, b_val, accumulator_val; ++ accumulator_val.val[0] = vdupq_n_f32(0); ++ accumulator_val.val[1] = vdupq_n_f32(0); ++ // factor of 2 loop unroll with independent accumulators ++- for( number = 0; number < quarter_points; ++number) { +++ for (number = 0; number < quarter_points; ++number) { ++ a_val = vld2q_f32(aPtr); ++ b_val = vld2q_f32(bPtr); ++- accumulator_val.val[0] = vmlaq_f32(accumulator_val.val[0], a_val.val[0], b_val.val[0]); ++- accumulator_val.val[1] = vmlaq_f32(accumulator_val.val[1], a_val.val[1], b_val.val[1]); +++ accumulator_val.val[0] = +++ vmlaq_f32(accumulator_val.val[0], a_val.val[0], b_val.val[0]); +++ accumulator_val.val[1] = +++ vmlaq_f32(accumulator_val.val[1], a_val.val[1], b_val.val[1]); ++ aPtr += 8; ++ bPtr += 8; ++ } ++@@ -893,8 +965,8 @@ static inline void 
volk_32f_x2_dot_prod_32f_neon(float * result, const float * i ++ vst1q_f32(accumulator, accumulator_val.val[0]); ++ dotProduct = accumulator[0] + accumulator[1] + accumulator[2] + accumulator[3]; ++ ++- for(number = quarter_points*8; number < num_points; number++){ ++- dotProduct += ((*aPtr++) * (*bPtr++)); +++ for (number = quarter_points * 8; number < num_points; number++) { +++ dotProduct += ((*aPtr++) * (*bPtr++)); ++ } ++ ++ *result = dotProduct; ++@@ -903,11 +975,17 @@ static inline void volk_32f_x2_dot_prod_32f_neon(float * result, const float * i ++ #endif /* LV_HAVE_NEON */ ++ ++ #ifdef LV_HAVE_NEONV7 ++-extern void volk_32f_x2_dot_prod_32f_a_neonasm(float* cVector, const float* aVector, const float* bVector, unsigned int num_points); +++extern void volk_32f_x2_dot_prod_32f_a_neonasm(float* cVector, +++ const float* aVector, +++ const float* bVector, +++ unsigned int num_points); ++ #endif /* LV_HAVE_NEONV7 */ ++ ++ #ifdef LV_HAVE_NEONV7 ++-extern void volk_32f_x2_dot_prod_32f_a_neonasm_opts(float* cVector, const float* aVector, const float* bVector, unsigned int num_points); +++extern void volk_32f_x2_dot_prod_32f_a_neonasm_opts(float* cVector, +++ const float* aVector, +++ const float* bVector, +++ unsigned int num_points); ++ #endif /* LV_HAVE_NEONV7 */ ++ ++ #endif /*INCLUDED_volk_32f_x2_dot_prod_32f_a_H*/ ++diff --git a/kernels/volk/volk_32f_x2_fm_detectpuppet_32f.h b/kernels/volk/volk_32f_x2_fm_detectpuppet_32f.h ++index e1da185..3a3caca 100644 ++--- a/kernels/volk/volk_32f_x2_fm_detectpuppet_32f.h +++++ b/kernels/volk/volk_32f_x2_fm_detectpuppet_32f.h ++@@ -28,32 +28,44 @@ ++ #ifdef LV_HAVE_AVX ++ #include ++ ++-static inline void volk_32f_x2_fm_detectpuppet_32f_a_avx(float* outputVector, const float* inputVector, float* saveValue, unsigned int num_points) +++static inline void volk_32f_x2_fm_detectpuppet_32f_a_avx(float* outputVector, +++ const float* inputVector, +++ float* saveValue, +++ unsigned int num_points) ++ { ++- const float bound = 1.0f; +++ const float bound = 1.0f; ++ ++- volk_32f_s32f_32f_fm_detect_32f_a_avx(outputVector, inputVector, bound, saveValue, num_points); +++ volk_32f_s32f_32f_fm_detect_32f_a_avx( +++ outputVector, inputVector, bound, saveValue, num_points); ++ } ++ #endif /* LV_HAVE_AVX */ ++ ++ #ifdef LV_HAVE_SSE ++ #include ++ ++-static inline void volk_32f_x2_fm_detectpuppet_32f_a_sse(float* outputVector, const float* inputVector, float* saveValue, unsigned int num_points) +++static inline void volk_32f_x2_fm_detectpuppet_32f_a_sse(float* outputVector, +++ const float* inputVector, +++ float* saveValue, +++ unsigned int num_points) ++ { ++- const float bound = 1.0f; +++ const float bound = 1.0f; ++ ++- volk_32f_s32f_32f_fm_detect_32f_a_sse(outputVector, inputVector, bound, saveValue, num_points); +++ volk_32f_s32f_32f_fm_detect_32f_a_sse( +++ outputVector, inputVector, bound, saveValue, num_points); ++ } ++ #endif /* LV_HAVE_SSE */ ++ ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void volk_32f_x2_fm_detectpuppet_32f_generic(float* outputVector, const float* inputVector, float* saveValue, unsigned int num_points) +++static inline void volk_32f_x2_fm_detectpuppet_32f_generic(float* outputVector, +++ const float* inputVector, +++ float* saveValue, +++ unsigned int num_points) ++ { ++- const float bound = 1.0f; +++ const float bound = 1.0f; ++ ++- volk_32f_s32f_32f_fm_detect_32f_generic(outputVector, inputVector, bound, saveValue, num_points); +++ volk_32f_s32f_32f_fm_detect_32f_generic( +++ outputVector, inputVector, bound, saveValue, num_points); 
++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++@@ -69,11 +81,15 @@ static inline void volk_32f_x2_fm_detectpuppet_32f_generic(float* outputVector, ++ #ifdef LV_HAVE_AVX ++ #include ++ ++-static inline void volk_32f_x2_fm_detectpuppet_32f_u_avx(float* outputVector, const float* inputVector, float* saveValue, unsigned int num_points) +++static inline void volk_32f_x2_fm_detectpuppet_32f_u_avx(float* outputVector, +++ const float* inputVector, +++ float* saveValue, +++ unsigned int num_points) ++ { ++- const float bound = 1.0f; +++ const float bound = 1.0f; ++ ++- volk_32f_s32f_32f_fm_detect_32f_u_avx(outputVector, inputVector, bound, saveValue, num_points); +++ volk_32f_s32f_32f_fm_detect_32f_u_avx( +++ outputVector, inputVector, bound, saveValue, num_points); ++ } ++ #endif /* LV_HAVE_AVX */ ++ #endif /* INCLUDED_volk_32f_x2_fm_detectpuppet_32f_u_H */ ++diff --git a/kernels/volk/volk_32f_x2_interleave_32fc.h b/kernels/volk/volk_32f_x2_interleave_32fc.h ++index ef8ada2..d0cc6dd 100644 ++--- a/kernels/volk/volk_32f_x2_interleave_32fc.h +++++ b/kernels/volk/volk_32f_x2_interleave_32fc.h ++@@ -33,8 +33,8 @@ ++ * ++ * Dispatcher Prototype ++ * \code ++- * void volk_32f_x2_interleave_32fc(lv_32fc_t* complexVector, const float* iBuffer, const float* qBuffer, unsigned int num_points) ++- * \endcode +++ * void volk_32f_x2_interleave_32fc(lv_32fc_t* complexVector, const float* iBuffer, const +++ * float* qBuffer, unsigned int num_points) \endcode ++ * ++ * \b Inputs ++ * \li iBuffer: Input vector of samples for the real part. ++@@ -79,44 +79,45 @@ ++ #ifdef LV_HAVE_AVX ++ #include ++ ++-static inline void ++-volk_32f_x2_interleave_32fc_a_avx(lv_32fc_t* complexVector, const float* iBuffer, ++- const float* qBuffer, unsigned int num_points) +++static inline void volk_32f_x2_interleave_32fc_a_avx(lv_32fc_t* complexVector, +++ const float* iBuffer, +++ const float* qBuffer, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- float* complexVectorPtr = (float*)complexVector; ++- const float* iBufferPtr = iBuffer; ++- const float* qBufferPtr = qBuffer; ++- ++- const uint64_t eighthPoints = num_points / 8; ++- ++- __m256 iValue, qValue, cplxValue1, cplxValue2, cplxValue; ++- for(;number < eighthPoints; number++){ ++- iValue = _mm256_load_ps(iBufferPtr); ++- qValue = _mm256_load_ps(qBufferPtr); ++- ++- // Interleaves the lower two values in the i and q variables into one buffer ++- cplxValue1 = _mm256_unpacklo_ps(iValue, qValue); ++- // Interleaves the upper two values in the i and q variables into one buffer ++- cplxValue2 = _mm256_unpackhi_ps(iValue, qValue); ++- ++- cplxValue = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x20); ++- _mm256_store_ps(complexVectorPtr, cplxValue); ++- complexVectorPtr += 8; ++- ++- cplxValue = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x31); ++- _mm256_store_ps(complexVectorPtr, cplxValue); ++- complexVectorPtr += 8; ++- ++- iBufferPtr += 8; ++- qBufferPtr += 8; ++- } ++- ++- number = eighthPoints * 8; ++- for(; number < num_points; number++){ ++- *complexVectorPtr++ = *iBufferPtr++; ++- *complexVectorPtr++ = *qBufferPtr++; ++- } +++ unsigned int number = 0; +++ float* complexVectorPtr = (float*)complexVector; +++ const float* iBufferPtr = iBuffer; +++ const float* qBufferPtr = qBuffer; +++ +++ const uint64_t eighthPoints = num_points / 8; +++ +++ __m256 iValue, qValue, cplxValue1, cplxValue2, cplxValue; +++ for (; number < eighthPoints; number++) { +++ iValue = _mm256_load_ps(iBufferPtr); +++ qValue = _mm256_load_ps(qBufferPtr); +++ +++ // Interleaves the 
lower two values in the i and q variables into one buffer +++ cplxValue1 = _mm256_unpacklo_ps(iValue, qValue); +++ // Interleaves the upper two values in the i and q variables into one buffer +++ cplxValue2 = _mm256_unpackhi_ps(iValue, qValue); +++ +++ cplxValue = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x20); +++ _mm256_store_ps(complexVectorPtr, cplxValue); +++ complexVectorPtr += 8; +++ +++ cplxValue = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x31); +++ _mm256_store_ps(complexVectorPtr, cplxValue); +++ complexVectorPtr += 8; +++ +++ iBufferPtr += 8; +++ qBufferPtr += 8; +++ } +++ +++ number = eighthPoints * 8; +++ for (; number < num_points; number++) { +++ *complexVectorPtr++ = *iBufferPtr++; +++ *complexVectorPtr++ = *qBufferPtr++; +++ } ++ } ++ ++ #endif /* LV_HAV_AVX */ ++@@ -124,41 +125,42 @@ volk_32f_x2_interleave_32fc_a_avx(lv_32fc_t* complexVector, const float* iBuffer ++ #ifdef LV_HAVE_SSE ++ #include ++ ++-static inline void ++-volk_32f_x2_interleave_32fc_a_sse(lv_32fc_t* complexVector, const float* iBuffer, ++- const float* qBuffer, unsigned int num_points) +++static inline void volk_32f_x2_interleave_32fc_a_sse(lv_32fc_t* complexVector, +++ const float* iBuffer, +++ const float* qBuffer, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- float* complexVectorPtr = (float*)complexVector; ++- const float* iBufferPtr = iBuffer; ++- const float* qBufferPtr = qBuffer; ++- ++- const uint64_t quarterPoints = num_points / 4; ++- ++- __m128 iValue, qValue, cplxValue; ++- for(;number < quarterPoints; number++){ ++- iValue = _mm_load_ps(iBufferPtr); ++- qValue = _mm_load_ps(qBufferPtr); ++- ++- // Interleaves the lower two values in the i and q variables into one buffer ++- cplxValue = _mm_unpacklo_ps(iValue, qValue); ++- _mm_store_ps(complexVectorPtr, cplxValue); ++- complexVectorPtr += 4; ++- ++- // Interleaves the upper two values in the i and q variables into one buffer ++- cplxValue = _mm_unpackhi_ps(iValue, qValue); ++- _mm_store_ps(complexVectorPtr, cplxValue); ++- complexVectorPtr += 4; ++- ++- iBufferPtr += 4; ++- qBufferPtr += 4; ++- } ++- ++- number = quarterPoints * 4; ++- for(; number < num_points; number++){ ++- *complexVectorPtr++ = *iBufferPtr++; ++- *complexVectorPtr++ = *qBufferPtr++; ++- } +++ unsigned int number = 0; +++ float* complexVectorPtr = (float*)complexVector; +++ const float* iBufferPtr = iBuffer; +++ const float* qBufferPtr = qBuffer; +++ +++ const uint64_t quarterPoints = num_points / 4; +++ +++ __m128 iValue, qValue, cplxValue; +++ for (; number < quarterPoints; number++) { +++ iValue = _mm_load_ps(iBufferPtr); +++ qValue = _mm_load_ps(qBufferPtr); +++ +++ // Interleaves the lower two values in the i and q variables into one buffer +++ cplxValue = _mm_unpacklo_ps(iValue, qValue); +++ _mm_store_ps(complexVectorPtr, cplxValue); +++ complexVectorPtr += 4; +++ +++ // Interleaves the upper two values in the i and q variables into one buffer +++ cplxValue = _mm_unpackhi_ps(iValue, qValue); +++ _mm_store_ps(complexVectorPtr, cplxValue); +++ complexVectorPtr += 4; +++ +++ iBufferPtr += 4; +++ qBufferPtr += 4; +++ } +++ +++ number = quarterPoints * 4; +++ for (; number < num_points; number++) { +++ *complexVectorPtr++ = *iBufferPtr++; +++ *complexVectorPtr++ = *qBufferPtr++; +++ } ++ } ++ #endif /* LV_HAVE_SSE */ ++ ++@@ -166,52 +168,53 @@ volk_32f_x2_interleave_32fc_a_sse(lv_32fc_t* complexVector, const float* iBuffer ++ #ifdef LV_HAVE_NEON ++ #include ++ ++-static inline void ++-volk_32f_x2_interleave_32fc_neon(lv_32fc_t* 
complexVector, const float* iBuffer, ++- const float* qBuffer, unsigned int num_points) +++static inline void volk_32f_x2_interleave_32fc_neon(lv_32fc_t* complexVector, +++ const float* iBuffer, +++ const float* qBuffer, +++ unsigned int num_points) ++ { ++- unsigned int quarter_points = num_points / 4; ++- unsigned int number; ++- float* complexVectorPtr = (float*) complexVector; ++- ++- float32x4x2_t complex_vec; ++- for(number=0; number < quarter_points; ++number) { ++- complex_vec.val[0] = vld1q_f32(iBuffer); ++- complex_vec.val[1] = vld1q_f32(qBuffer); ++- vst2q_f32(complexVectorPtr, complex_vec); ++- iBuffer += 4; ++- qBuffer += 4; ++- complexVectorPtr += 8; ++- } ++- ++- for(number=quarter_points * 4; number < num_points; ++number) { ++- *complexVectorPtr++ = *iBuffer++; ++- *complexVectorPtr++ = *qBuffer++; ++- } +++ unsigned int quarter_points = num_points / 4; +++ unsigned int number; +++ float* complexVectorPtr = (float*)complexVector; +++ +++ float32x4x2_t complex_vec; +++ for (number = 0; number < quarter_points; ++number) { +++ complex_vec.val[0] = vld1q_f32(iBuffer); +++ complex_vec.val[1] = vld1q_f32(qBuffer); +++ vst2q_f32(complexVectorPtr, complex_vec); +++ iBuffer += 4; +++ qBuffer += 4; +++ complexVectorPtr += 8; +++ } +++ +++ for (number = quarter_points * 4; number < num_points; ++number) { +++ *complexVectorPtr++ = *iBuffer++; +++ *complexVectorPtr++ = *qBuffer++; +++ } ++ } ++ #endif /* LV_HAVE_NEON */ ++ ++ ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void ++-volk_32f_x2_interleave_32fc_generic(lv_32fc_t* complexVector, const float* iBuffer, ++- const float* qBuffer, unsigned int num_points) +++static inline void volk_32f_x2_interleave_32fc_generic(lv_32fc_t* complexVector, +++ const float* iBuffer, +++ const float* qBuffer, +++ unsigned int num_points) ++ { ++- float* complexVectorPtr = (float*)complexVector; ++- const float* iBufferPtr = iBuffer; ++- const float* qBufferPtr = qBuffer; ++- unsigned int number; ++- ++- for(number = 0; number < num_points; number++){ ++- *complexVectorPtr++ = *iBufferPtr++; ++- *complexVectorPtr++ = *qBufferPtr++; ++- } +++ float* complexVectorPtr = (float*)complexVector; +++ const float* iBufferPtr = iBuffer; +++ const float* qBufferPtr = qBuffer; +++ unsigned int number; +++ +++ for (number = 0; number < num_points; number++) { +++ *complexVectorPtr++ = *iBufferPtr++; +++ *complexVectorPtr++ = *qBufferPtr++; +++ } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++ ++- ++ #endif /* INCLUDED_volk_32f_x2_interleave_32fc_a_H */ ++ ++ #ifndef INCLUDED_volk_32f_x2_interleave_32fc_u_H ++@@ -223,44 +226,45 @@ volk_32f_x2_interleave_32fc_generic(lv_32fc_t* complexVector, const float* iBuff ++ #ifdef LV_HAVE_AVX ++ #include ++ ++-static inline void ++-volk_32f_x2_interleave_32fc_u_avx(lv_32fc_t* complexVector, const float* iBuffer, ++- const float* qBuffer, unsigned int num_points) +++static inline void volk_32f_x2_interleave_32fc_u_avx(lv_32fc_t* complexVector, +++ const float* iBuffer, +++ const float* qBuffer, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- float* complexVectorPtr = (float*)complexVector; ++- const float* iBufferPtr = iBuffer; ++- const float* qBufferPtr = qBuffer; ++- ++- const uint64_t eighthPoints = num_points / 8; ++- ++- __m256 iValue, qValue, cplxValue1, cplxValue2, cplxValue; ++- for(;number < eighthPoints; number++){ ++- iValue = _mm256_loadu_ps(iBufferPtr); ++- qValue = _mm256_loadu_ps(qBufferPtr); ++- ++- // Interleaves the lower two values in the i and q variables into one buffer ++- cplxValue1 = 
_mm256_unpacklo_ps(iValue, qValue); ++- // Interleaves the upper two values in the i and q variables into one buffer ++- cplxValue2 = _mm256_unpackhi_ps(iValue, qValue); ++- ++- cplxValue = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x20); ++- _mm256_storeu_ps(complexVectorPtr, cplxValue); ++- complexVectorPtr += 8; ++- ++- cplxValue = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x31); ++- _mm256_storeu_ps(complexVectorPtr, cplxValue); ++- complexVectorPtr += 8; ++- ++- iBufferPtr += 8; ++- qBufferPtr += 8; ++- } ++- ++- number = eighthPoints * 8; ++- for(; number < num_points; number++){ ++- *complexVectorPtr++ = *iBufferPtr++; ++- *complexVectorPtr++ = *qBufferPtr++; ++- } +++ unsigned int number = 0; +++ float* complexVectorPtr = (float*)complexVector; +++ const float* iBufferPtr = iBuffer; +++ const float* qBufferPtr = qBuffer; +++ +++ const uint64_t eighthPoints = num_points / 8; +++ +++ __m256 iValue, qValue, cplxValue1, cplxValue2, cplxValue; +++ for (; number < eighthPoints; number++) { +++ iValue = _mm256_loadu_ps(iBufferPtr); +++ qValue = _mm256_loadu_ps(qBufferPtr); +++ +++ // Interleaves the lower two values in the i and q variables into one buffer +++ cplxValue1 = _mm256_unpacklo_ps(iValue, qValue); +++ // Interleaves the upper two values in the i and q variables into one buffer +++ cplxValue2 = _mm256_unpackhi_ps(iValue, qValue); +++ +++ cplxValue = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x20); +++ _mm256_storeu_ps(complexVectorPtr, cplxValue); +++ complexVectorPtr += 8; +++ +++ cplxValue = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x31); +++ _mm256_storeu_ps(complexVectorPtr, cplxValue); +++ complexVectorPtr += 8; +++ +++ iBufferPtr += 8; +++ qBufferPtr += 8; +++ } +++ +++ number = eighthPoints * 8; +++ for (; number < num_points; number++) { +++ *complexVectorPtr++ = *iBufferPtr++; +++ *complexVectorPtr++ = *qBufferPtr++; +++ } ++ } ++ #endif /* LV_HAVE_AVX */ ++ ++diff --git a/kernels/volk/volk_32f_x2_max_32f.h b/kernels/volk/volk_32f_x2_max_32f.h ++index 82086a6..c7eb67f 100644 ++--- a/kernels/volk/volk_32f_x2_max_32f.h +++++ b/kernels/volk/volk_32f_x2_max_32f.h ++@@ -32,8 +32,8 @@ ++ * ++ * Dispatcher Prototype ++ * \code ++- * void volk_32f_x2_max_32f(float* cVector, const float* aVector, const float* bVector, unsigned int num_points) ++- * \endcode +++ * void volk_32f_x2_max_32f(float* cVector, const float* aVector, const float* bVector, +++ * unsigned int num_points) \endcode ++ * ++ * \b Inputs ++ * \li aVector: First input vector. 
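The dispatcher prototype documented above is the symbol user code calls; the per-ISA kernels in the hunks that follow are selected at runtime. A minimal usage sketch, not taken from the diff itself: buffer size and fill values are purely illustrative, and it assumes VOLK's volk_get_alignment / volk_malloc / volk_free allocation helpers, the first two of which appear in the dot-product example earlier in this diff.

\code
#include <volk/volk.h>

unsigned int N = 8;
unsigned int i;
unsigned int alignment = volk_get_alignment();

/* allocate VOLK-aligned buffers so the aligned kernels can be dispatched */
float* a = (float*)volk_malloc(sizeof(float) * N, alignment);
float* b = (float*)volk_malloc(sizeof(float) * N, alignment);
float* c = (float*)volk_malloc(sizeof(float) * N, alignment);

for (i = 0; i < N; i++) {
    a[i] = (float)i;       /* 0, 1, 2, ... */
    b[i] = (float)(N - i); /* 8, 7, 6, ... */
}

/* element-wise maximum; the dispatcher picks the best kernel for the host CPU */
volk_32f_x2_max_32f(c, a, b, N);

volk_free(a);
volk_free(b);
volk_free(c);
\endcode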
++@@ -77,176 +77,183 @@ ++ #ifdef LV_HAVE_AVX512F ++ #include ++ ++-static inline void ++-volk_32f_x2_max_32f_a_avx512f(float* cVector, const float* aVector, ++- const float* bVector, unsigned int num_points) +++static inline void volk_32f_x2_max_32f_a_avx512f(float* cVector, +++ const float* aVector, +++ const float* bVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int sixteenthPoints = num_points / 16; +++ unsigned int number = 0; +++ const unsigned int sixteenthPoints = num_points / 16; ++ ++- float* cPtr = cVector; ++- const float* aPtr = aVector; ++- const float* bPtr= bVector; +++ float* cPtr = cVector; +++ const float* aPtr = aVector; +++ const float* bPtr = bVector; ++ ++- __m512 aVal, bVal, cVal; ++- for(;number < sixteenthPoints; number++){ ++- aVal = _mm512_load_ps(aPtr); ++- bVal = _mm512_load_ps(bPtr); +++ __m512 aVal, bVal, cVal; +++ for (; number < sixteenthPoints; number++) { +++ aVal = _mm512_load_ps(aPtr); +++ bVal = _mm512_load_ps(bPtr); ++ ++- cVal = _mm512_max_ps(aVal, bVal); +++ cVal = _mm512_max_ps(aVal, bVal); ++ ++- _mm512_store_ps(cPtr,cVal); // Store the results back into the C container +++ _mm512_store_ps(cPtr, cVal); // Store the results back into the C container ++ ++- aPtr += 16; ++- bPtr += 16; ++- cPtr += 16; ++- } +++ aPtr += 16; +++ bPtr += 16; +++ cPtr += 16; +++ } ++ ++- number = sixteenthPoints * 16; ++- for(;number < num_points; number++){ ++- const float a = *aPtr++; ++- const float b = *bPtr++; ++- *cPtr++ = ( a > b ? a : b); ++- } +++ number = sixteenthPoints * 16; +++ for (; number < num_points; number++) { +++ const float a = *aPtr++; +++ const float b = *bPtr++; +++ *cPtr++ = (a > b ? a : b); +++ } ++ } ++ #endif /* LV_HAVE_AVX512F */ ++ ++ #ifdef LV_HAVE_SSE ++ #include ++ ++-static inline void ++-volk_32f_x2_max_32f_a_sse(float* cVector, const float* aVector, ++- const float* bVector, unsigned int num_points) +++static inline void volk_32f_x2_max_32f_a_sse(float* cVector, +++ const float* aVector, +++ const float* bVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int quarterPoints = num_points / 4; +++ unsigned int number = 0; +++ const unsigned int quarterPoints = num_points / 4; ++ ++- float* cPtr = cVector; ++- const float* aPtr = aVector; ++- const float* bPtr= bVector; +++ float* cPtr = cVector; +++ const float* aPtr = aVector; +++ const float* bPtr = bVector; ++ ++- __m128 aVal, bVal, cVal; ++- for(;number < quarterPoints; number++){ ++- aVal = _mm_load_ps(aPtr); ++- bVal = _mm_load_ps(bPtr); +++ __m128 aVal, bVal, cVal; +++ for (; number < quarterPoints; number++) { +++ aVal = _mm_load_ps(aPtr); +++ bVal = _mm_load_ps(bPtr); ++ ++- cVal = _mm_max_ps(aVal, bVal); +++ cVal = _mm_max_ps(aVal, bVal); ++ ++- _mm_store_ps(cPtr,cVal); // Store the results back into the C container +++ _mm_store_ps(cPtr, cVal); // Store the results back into the C container ++ ++- aPtr += 4; ++- bPtr += 4; ++- cPtr += 4; ++- } +++ aPtr += 4; +++ bPtr += 4; +++ cPtr += 4; +++ } ++ ++- number = quarterPoints * 4; ++- for(;number < num_points; number++){ ++- const float a = *aPtr++; ++- const float b = *bPtr++; ++- *cPtr++ = ( a > b ? a : b); ++- } +++ number = quarterPoints * 4; +++ for (; number < num_points; number++) { +++ const float a = *aPtr++; +++ const float b = *bPtr++; +++ *cPtr++ = (a > b ? 
a : b); +++ } ++ } ++ #endif /* LV_HAVE_SSE */ ++ ++ #ifdef LV_HAVE_AVX ++ #include ++ ++-static inline void ++-volk_32f_x2_max_32f_a_avx(float* cVector, const float* aVector, ++- const float* bVector, unsigned int num_points) +++static inline void volk_32f_x2_max_32f_a_avx(float* cVector, +++ const float* aVector, +++ const float* bVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int eighthPoints = num_points / 8; +++ unsigned int number = 0; +++ const unsigned int eighthPoints = num_points / 8; ++ ++- float* cPtr = cVector; ++- const float* aPtr = aVector; ++- const float* bPtr= bVector; +++ float* cPtr = cVector; +++ const float* aPtr = aVector; +++ const float* bPtr = bVector; ++ ++- __m256 aVal, bVal, cVal; ++- for(;number < eighthPoints; number++){ ++- aVal = _mm256_load_ps(aPtr); ++- bVal = _mm256_load_ps(bPtr); +++ __m256 aVal, bVal, cVal; +++ for (; number < eighthPoints; number++) { +++ aVal = _mm256_load_ps(aPtr); +++ bVal = _mm256_load_ps(bPtr); ++ ++- cVal = _mm256_max_ps(aVal, bVal); +++ cVal = _mm256_max_ps(aVal, bVal); ++ ++- _mm256_store_ps(cPtr,cVal); // Store the results back into the C container +++ _mm256_store_ps(cPtr, cVal); // Store the results back into the C container ++ ++- aPtr += 8; ++- bPtr += 8; ++- cPtr += 8; ++- } +++ aPtr += 8; +++ bPtr += 8; +++ cPtr += 8; +++ } ++ ++- number = eighthPoints * 8; ++- for(;number < num_points; number++){ ++- const float a = *aPtr++; ++- const float b = *bPtr++; ++- *cPtr++ = ( a > b ? a : b); ++- } +++ number = eighthPoints * 8; +++ for (; number < num_points; number++) { +++ const float a = *aPtr++; +++ const float b = *bPtr++; +++ *cPtr++ = (a > b ? a : b); +++ } ++ } ++ #endif /* LV_HAVE_AVX */ ++ ++ #ifdef LV_HAVE_NEON ++ #include ++ ++-static inline void ++-volk_32f_x2_max_32f_neon(float* cVector, const float* aVector, ++- const float* bVector, unsigned int num_points) +++static inline void volk_32f_x2_max_32f_neon(float* cVector, +++ const float* aVector, +++ const float* bVector, +++ unsigned int num_points) ++ { ++- unsigned int quarter_points = num_points / 4; ++- float* cPtr = cVector; ++- const float* aPtr = aVector; ++- const float* bPtr= bVector; ++- unsigned int number = 0; ++- ++- float32x4_t a_vec, b_vec, c_vec; ++- for(number = 0; number < quarter_points; number++){ ++- a_vec = vld1q_f32(aPtr); ++- b_vec = vld1q_f32(bPtr); ++- c_vec = vmaxq_f32(a_vec, b_vec); ++- vst1q_f32(cPtr, c_vec); ++- aPtr += 4; ++- bPtr += 4; ++- cPtr += 4; ++- } ++- ++- for(number = quarter_points*4; number < num_points; number++){ ++- const float a = *aPtr++; ++- const float b = *bPtr++; ++- *cPtr++ = ( a > b ? a : b); ++- } +++ unsigned int quarter_points = num_points / 4; +++ float* cPtr = cVector; +++ const float* aPtr = aVector; +++ const float* bPtr = bVector; +++ unsigned int number = 0; +++ +++ float32x4_t a_vec, b_vec, c_vec; +++ for (number = 0; number < quarter_points; number++) { +++ a_vec = vld1q_f32(aPtr); +++ b_vec = vld1q_f32(bPtr); +++ c_vec = vmaxq_f32(a_vec, b_vec); +++ vst1q_f32(cPtr, c_vec); +++ aPtr += 4; +++ bPtr += 4; +++ cPtr += 4; +++ } +++ +++ for (number = quarter_points * 4; number < num_points; number++) { +++ const float a = *aPtr++; +++ const float b = *bPtr++; +++ *cPtr++ = (a > b ? 
a : b); +++ } ++ } ++ #endif /* LV_HAVE_NEON */ ++ ++ ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void ++-volk_32f_x2_max_32f_generic(float* cVector, const float* aVector, ++- const float* bVector, unsigned int num_points) +++static inline void volk_32f_x2_max_32f_generic(float* cVector, +++ const float* aVector, +++ const float* bVector, +++ unsigned int num_points) ++ { ++- float* cPtr = cVector; ++- const float* aPtr = aVector; ++- const float* bPtr= bVector; ++- unsigned int number = 0; ++- ++- for(number = 0; number < num_points; number++){ ++- const float a = *aPtr++; ++- const float b = *bPtr++; ++- *cPtr++ = ( a > b ? a : b); ++- } +++ float* cPtr = cVector; +++ const float* aPtr = aVector; +++ const float* bPtr = bVector; +++ unsigned int number = 0; +++ +++ for (number = 0; number < num_points; number++) { +++ const float a = *aPtr++; +++ const float b = *bPtr++; +++ *cPtr++ = (a > b ? a : b); +++ } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++ #ifdef LV_HAVE_ORC ++-extern void ++-volk_32f_x2_max_32f_a_orc_impl(float* cVector, const float* aVector, ++- const float* bVector, unsigned int num_points); ++- ++-static inline void ++-volk_32f_x2_max_32f_u_orc(float* cVector, const float* aVector, ++- const float* bVector, unsigned int num_points) +++extern void volk_32f_x2_max_32f_a_orc_impl(float* cVector, +++ const float* aVector, +++ const float* bVector, +++ unsigned int num_points); +++ +++static inline void volk_32f_x2_max_32f_u_orc(float* cVector, +++ const float* aVector, +++ const float* bVector, +++ unsigned int num_points) ++ { ++- volk_32f_x2_max_32f_a_orc_impl(cVector, aVector, bVector, num_points); +++ volk_32f_x2_max_32f_a_orc_impl(cVector, aVector, bVector, num_points); ++ } ++ #endif /* LV_HAVE_ORC */ ++ ++@@ -263,74 +270,76 @@ volk_32f_x2_max_32f_u_orc(float* cVector, const float* aVector, ++ #ifdef LV_HAVE_AVX512F ++ #include ++ ++-static inline void ++-volk_32f_x2_max_32f_u_avx512f(float* cVector, const float* aVector, ++- const float* bVector, unsigned int num_points) +++static inline void volk_32f_x2_max_32f_u_avx512f(float* cVector, +++ const float* aVector, +++ const float* bVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int sixteenthPoints = num_points / 16; +++ unsigned int number = 0; +++ const unsigned int sixteenthPoints = num_points / 16; ++ ++- float* cPtr = cVector; ++- const float* aPtr = aVector; ++- const float* bPtr= bVector; +++ float* cPtr = cVector; +++ const float* aPtr = aVector; +++ const float* bPtr = bVector; ++ ++- __m512 aVal, bVal, cVal; ++- for(;number < sixteenthPoints; number++){ ++- aVal = _mm512_loadu_ps(aPtr); ++- bVal = _mm512_loadu_ps(bPtr); +++ __m512 aVal, bVal, cVal; +++ for (; number < sixteenthPoints; number++) { +++ aVal = _mm512_loadu_ps(aPtr); +++ bVal = _mm512_loadu_ps(bPtr); ++ ++- cVal = _mm512_max_ps(aVal, bVal); +++ cVal = _mm512_max_ps(aVal, bVal); ++ ++- _mm512_storeu_ps(cPtr,cVal); // Store the results back into the C container +++ _mm512_storeu_ps(cPtr, cVal); // Store the results back into the C container ++ ++- aPtr += 16; ++- bPtr += 16; ++- cPtr += 16; ++- } +++ aPtr += 16; +++ bPtr += 16; +++ cPtr += 16; +++ } ++ ++- number = sixteenthPoints * 16; ++- for(;number < num_points; number++){ ++- const float a = *aPtr++; ++- const float b = *bPtr++; ++- *cPtr++ = ( a > b ? a : b); ++- } +++ number = sixteenthPoints * 16; +++ for (; number < num_points; number++) { +++ const float a = *aPtr++; +++ const float b = *bPtr++; +++ *cPtr++ = (a > b ? 
a : b); +++ } ++ } ++ #endif /* LV_HAVE_AVX512F */ ++ ++ #ifdef LV_HAVE_AVX ++ #include ++ ++-static inline void ++-volk_32f_x2_max_32f_u_avx(float* cVector, const float* aVector, ++- const float* bVector, unsigned int num_points) +++static inline void volk_32f_x2_max_32f_u_avx(float* cVector, +++ const float* aVector, +++ const float* bVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int eighthPoints = num_points / 8; +++ unsigned int number = 0; +++ const unsigned int eighthPoints = num_points / 8; ++ ++- float* cPtr = cVector; ++- const float* aPtr = aVector; ++- const float* bPtr= bVector; +++ float* cPtr = cVector; +++ const float* aPtr = aVector; +++ const float* bPtr = bVector; ++ ++- __m256 aVal, bVal, cVal; ++- for(;number < eighthPoints; number++){ ++- aVal = _mm256_loadu_ps(aPtr); ++- bVal = _mm256_loadu_ps(bPtr); +++ __m256 aVal, bVal, cVal; +++ for (; number < eighthPoints; number++) { +++ aVal = _mm256_loadu_ps(aPtr); +++ bVal = _mm256_loadu_ps(bPtr); ++ ++- cVal = _mm256_max_ps(aVal, bVal); +++ cVal = _mm256_max_ps(aVal, bVal); ++ ++- _mm256_storeu_ps(cPtr,cVal); // Store the results back into the C container +++ _mm256_storeu_ps(cPtr, cVal); // Store the results back into the C container ++ ++- aPtr += 8; ++- bPtr += 8; ++- cPtr += 8; ++- } +++ aPtr += 8; +++ bPtr += 8; +++ cPtr += 8; +++ } ++ ++- number = eighthPoints * 8; ++- for(;number < num_points; number++){ ++- const float a = *aPtr++; ++- const float b = *bPtr++; ++- *cPtr++ = ( a > b ? a : b); ++- } +++ number = eighthPoints * 8; +++ for (; number < num_points; number++) { +++ const float a = *aPtr++; +++ const float b = *bPtr++; +++ *cPtr++ = (a > b ? a : b); +++ } ++ } ++ #endif /* LV_HAVE_AVX */ ++ ++diff --git a/kernels/volk/volk_32f_x2_min_32f.h b/kernels/volk/volk_32f_x2_min_32f.h ++index 454eb76..aecd11a 100644 ++--- a/kernels/volk/volk_32f_x2_min_32f.h +++++ b/kernels/volk/volk_32f_x2_min_32f.h ++@@ -32,8 +32,8 @@ ++ * ++ * Dispatcher Prototype ++ * \code ++- * void volk_32f_x2_min_32f(float* cVector, const float* aVector, const float* bVector, unsigned int num_points) ++- * \endcode +++ * void volk_32f_x2_min_32f(float* cVector, const float* aVector, const float* bVector, +++ * unsigned int num_points) \endcode ++ * ++ * \b Inputs ++ * \li aVector: First input vector. 
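The min kernels that follow, like the max kernels above, share the same two-phase shape: a vectorized body over full blocks of W elements (W = 4 for SSE/NEON, 8 for AVX, 16 for AVX-512), then a scalar tail for the remaining num_points % W elements. A plain-C sketch of that structure, with W fixed at 4 purely for illustration:

static inline void min_blocked_sketch(float* cVector,
                                      const float* aVector,
                                      const float* bVector,
                                      unsigned int num_points)
{
    const unsigned int W = 4;   /* block width; ISA dependent in the real kernels */
    const unsigned int full_blocks = num_points / W;
    unsigned int n = 0;

    /* Vector body: in the real kernels this loop is one SIMD min per block. */
    for (; n < full_blocks * W; n++) {
        cVector[n] = (aVector[n] < bVector[n]) ? aVector[n] : bVector[n];
    }

    /* Scalar tail, exactly as in the generic fallback. */
    for (; n < num_points; n++) {
        cVector[n] = (aVector[n] < bVector[n]) ? aVector[n] : bVector[n];
    }
}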
++@@ -77,37 +77,38 @@ ++ #ifdef LV_HAVE_SSE ++ #include ++ ++-static inline void ++-volk_32f_x2_min_32f_a_sse(float* cVector, const float* aVector, ++- const float* bVector, unsigned int num_points) +++static inline void volk_32f_x2_min_32f_a_sse(float* cVector, +++ const float* aVector, +++ const float* bVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int quarterPoints = num_points / 4; +++ unsigned int number = 0; +++ const unsigned int quarterPoints = num_points / 4; ++ ++- float* cPtr = cVector; ++- const float* aPtr = aVector; ++- const float* bPtr= bVector; +++ float* cPtr = cVector; +++ const float* aPtr = aVector; +++ const float* bPtr = bVector; ++ ++- __m128 aVal, bVal, cVal; ++- for(;number < quarterPoints; number++){ ++- aVal = _mm_load_ps(aPtr); ++- bVal = _mm_load_ps(bPtr); +++ __m128 aVal, bVal, cVal; +++ for (; number < quarterPoints; number++) { +++ aVal = _mm_load_ps(aPtr); +++ bVal = _mm_load_ps(bPtr); ++ ++- cVal = _mm_min_ps(aVal, bVal); +++ cVal = _mm_min_ps(aVal, bVal); ++ ++- _mm_store_ps(cPtr,cVal); // Store the results back into the C container +++ _mm_store_ps(cPtr, cVal); // Store the results back into the C container ++ ++- aPtr += 4; ++- bPtr += 4; ++- cPtr += 4; ++- } +++ aPtr += 4; +++ bPtr += 4; +++ cPtr += 4; +++ } ++ ++- number = quarterPoints * 4; ++- for(;number < num_points; number++){ ++- const float a = *aPtr++; ++- const float b = *bPtr++; ++- *cPtr++ = ( a < b ? a : b); ++- } +++ number = quarterPoints * 4; +++ for (; number < num_points; number++) { +++ const float a = *aPtr++; +++ const float b = *bPtr++; +++ *cPtr++ = (a < b ? a : b); +++ } ++ } ++ #endif /* LV_HAVE_SSE */ ++ ++@@ -115,143 +116,149 @@ volk_32f_x2_min_32f_a_sse(float* cVector, const float* aVector, ++ #ifdef LV_HAVE_NEON ++ #include ++ ++-static inline void ++-volk_32f_x2_min_32f_neon(float* cVector, const float* aVector, ++- const float* bVector, unsigned int num_points) +++static inline void volk_32f_x2_min_32f_neon(float* cVector, +++ const float* aVector, +++ const float* bVector, +++ unsigned int num_points) ++ { ++- float* cPtr = cVector; ++- const float* aPtr = aVector; ++- const float* bPtr= bVector; ++- unsigned int number = 0; ++- unsigned int quarter_points = num_points / 4; ++- ++- float32x4_t a_vec, b_vec, c_vec; ++- for(number = 0; number < quarter_points; number++){ ++- a_vec = vld1q_f32(aPtr); ++- b_vec = vld1q_f32(bPtr); ++- ++- c_vec = vminq_f32(a_vec, b_vec); ++- ++- vst1q_f32(cPtr, c_vec); ++- aPtr += 4; ++- bPtr += 4; ++- cPtr += 4; ++- } ++- ++- for(number = quarter_points*4; number < num_points; number++){ ++- const float a = *aPtr++; ++- const float b = *bPtr++; ++- *cPtr++ = ( a < b ? a : b); ++- } +++ float* cPtr = cVector; +++ const float* aPtr = aVector; +++ const float* bPtr = bVector; +++ unsigned int number = 0; +++ unsigned int quarter_points = num_points / 4; +++ +++ float32x4_t a_vec, b_vec, c_vec; +++ for (number = 0; number < quarter_points; number++) { +++ a_vec = vld1q_f32(aPtr); +++ b_vec = vld1q_f32(bPtr); +++ +++ c_vec = vminq_f32(a_vec, b_vec); +++ +++ vst1q_f32(cPtr, c_vec); +++ aPtr += 4; +++ bPtr += 4; +++ cPtr += 4; +++ } +++ +++ for (number = quarter_points * 4; number < num_points; number++) { +++ const float a = *aPtr++; +++ const float b = *bPtr++; +++ *cPtr++ = (a < b ? 
a : b); +++ } ++ } ++ #endif /* LV_HAVE_NEON */ ++ ++ ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void ++-volk_32f_x2_min_32f_generic(float* cVector, const float* aVector, ++- const float* bVector, unsigned int num_points) +++static inline void volk_32f_x2_min_32f_generic(float* cVector, +++ const float* aVector, +++ const float* bVector, +++ unsigned int num_points) ++ { ++- float* cPtr = cVector; ++- const float* aPtr = aVector; ++- const float* bPtr= bVector; ++- unsigned int number = 0; ++- ++- for(number = 0; number < num_points; number++){ ++- const float a = *aPtr++; ++- const float b = *bPtr++; ++- *cPtr++ = ( a < b ? a : b); ++- } +++ float* cPtr = cVector; +++ const float* aPtr = aVector; +++ const float* bPtr = bVector; +++ unsigned int number = 0; +++ +++ for (number = 0; number < num_points; number++) { +++ const float a = *aPtr++; +++ const float b = *bPtr++; +++ *cPtr++ = (a < b ? a : b); +++ } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++ ++ #ifdef LV_HAVE_ORC ++ ++-extern void ++-volk_32f_x2_min_32f_a_orc_impl(float* cVector, const float* aVector, ++- const float* bVector, unsigned int num_points); +++extern void volk_32f_x2_min_32f_a_orc_impl(float* cVector, +++ const float* aVector, +++ const float* bVector, +++ unsigned int num_points); ++ ++-static inline void ++-volk_32f_x2_min_32f_u_orc(float* cVector, const float* aVector, ++- const float* bVector, unsigned int num_points) +++static inline void volk_32f_x2_min_32f_u_orc(float* cVector, +++ const float* aVector, +++ const float* bVector, +++ unsigned int num_points) ++ { ++- volk_32f_x2_min_32f_a_orc_impl(cVector, aVector, bVector, num_points); +++ volk_32f_x2_min_32f_a_orc_impl(cVector, aVector, bVector, num_points); ++ } ++ #endif /* LV_HAVE_ORC */ ++ ++ #ifdef LV_HAVE_AVX ++ #include ++ ++-static inline void ++-volk_32f_x2_min_32f_a_avx(float* cVector, const float* aVector, ++- const float* bVector, unsigned int num_points) +++static inline void volk_32f_x2_min_32f_a_avx(float* cVector, +++ const float* aVector, +++ const float* bVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int eighthPoints = num_points / 8; +++ unsigned int number = 0; +++ const unsigned int eighthPoints = num_points / 8; ++ ++- float* cPtr = cVector; ++- const float* aPtr = aVector; ++- const float* bPtr= bVector; +++ float* cPtr = cVector; +++ const float* aPtr = aVector; +++ const float* bPtr = bVector; ++ ++- __m256 aVal, bVal, cVal; ++- for(;number < eighthPoints; number++){ ++- aVal = _mm256_load_ps(aPtr); ++- bVal = _mm256_load_ps(bPtr); +++ __m256 aVal, bVal, cVal; +++ for (; number < eighthPoints; number++) { +++ aVal = _mm256_load_ps(aPtr); +++ bVal = _mm256_load_ps(bPtr); ++ ++- cVal = _mm256_min_ps(aVal, bVal); +++ cVal = _mm256_min_ps(aVal, bVal); ++ ++- _mm256_store_ps(cPtr,cVal); // Store the results back into the C container +++ _mm256_store_ps(cPtr, cVal); // Store the results back into the C container ++ ++- aPtr += 8; ++- bPtr += 8; ++- cPtr += 8; ++- } +++ aPtr += 8; +++ bPtr += 8; +++ cPtr += 8; +++ } ++ ++- number = eighthPoints * 8; ++- for(;number < num_points; number++){ ++- const float a = *aPtr++; ++- const float b = *bPtr++; ++- *cPtr++ = ( a < b ? a : b); ++- } +++ number = eighthPoints * 8; +++ for (; number < num_points; number++) { +++ const float a = *aPtr++; +++ const float b = *bPtr++; +++ *cPtr++ = (a < b ? 
a : b); +++ } ++ } ++ #endif /* LV_HAVE_AVX */ ++ ++ #ifdef LV_HAVE_AVX512F ++ #include ++ ++-static inline void ++-volk_32f_x2_min_32f_a_avx512f(float* cVector, const float* aVector, ++- const float* bVector, unsigned int num_points) +++static inline void volk_32f_x2_min_32f_a_avx512f(float* cVector, +++ const float* aVector, +++ const float* bVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int sixteenthPoints = num_points / 16; +++ unsigned int number = 0; +++ const unsigned int sixteenthPoints = num_points / 16; ++ ++- float* cPtr = cVector; ++- const float* aPtr = aVector; ++- const float* bPtr= bVector; +++ float* cPtr = cVector; +++ const float* aPtr = aVector; +++ const float* bPtr = bVector; ++ ++- __m512 aVal, bVal, cVal; ++- for(;number < sixteenthPoints; number++){ ++- aVal = _mm512_load_ps(aPtr); ++- bVal = _mm512_load_ps(bPtr); +++ __m512 aVal, bVal, cVal; +++ for (; number < sixteenthPoints; number++) { +++ aVal = _mm512_load_ps(aPtr); +++ bVal = _mm512_load_ps(bPtr); ++ ++- cVal = _mm512_min_ps(aVal, bVal); +++ cVal = _mm512_min_ps(aVal, bVal); ++ ++- _mm512_store_ps(cPtr,cVal); // Store the results back into the C container +++ _mm512_store_ps(cPtr, cVal); // Store the results back into the C container ++ ++- aPtr += 16; ++- bPtr += 16; ++- cPtr += 16; ++- } +++ aPtr += 16; +++ bPtr += 16; +++ cPtr += 16; +++ } ++ ++- number = sixteenthPoints * 16; ++- for(;number < num_points; number++){ ++- const float a = *aPtr++; ++- const float b = *bPtr++; ++- *cPtr++ = ( a < b ? a : b); ++- } +++ number = sixteenthPoints * 16; +++ for (; number < num_points; number++) { +++ const float a = *aPtr++; +++ const float b = *bPtr++; +++ *cPtr++ = (a < b ? a : b); +++ } ++ } ++ #endif /* LV_HAVE_AVX512F */ ++ ++@@ -267,74 +274,76 @@ volk_32f_x2_min_32f_a_avx512f(float* cVector, const float* aVector, ++ #ifdef LV_HAVE_AVX512F ++ #include ++ ++-static inline void ++-volk_32f_x2_min_32f_u_avx512f(float* cVector, const float* aVector, ++- const float* bVector, unsigned int num_points) +++static inline void volk_32f_x2_min_32f_u_avx512f(float* cVector, +++ const float* aVector, +++ const float* bVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int sixteenthPoints = num_points / 16; +++ unsigned int number = 0; +++ const unsigned int sixteenthPoints = num_points / 16; ++ ++- float* cPtr = cVector; ++- const float* aPtr = aVector; ++- const float* bPtr= bVector; +++ float* cPtr = cVector; +++ const float* aPtr = aVector; +++ const float* bPtr = bVector; ++ ++- __m512 aVal, bVal, cVal; ++- for(;number < sixteenthPoints; number++){ ++- aVal = _mm512_loadu_ps(aPtr); ++- bVal = _mm512_loadu_ps(bPtr); +++ __m512 aVal, bVal, cVal; +++ for (; number < sixteenthPoints; number++) { +++ aVal = _mm512_loadu_ps(aPtr); +++ bVal = _mm512_loadu_ps(bPtr); ++ ++- cVal = _mm512_min_ps(aVal, bVal); +++ cVal = _mm512_min_ps(aVal, bVal); ++ ++- _mm512_storeu_ps(cPtr,cVal); // Store the results back into the C container +++ _mm512_storeu_ps(cPtr, cVal); // Store the results back into the C container ++ ++- aPtr += 16; ++- bPtr += 16; ++- cPtr += 16; ++- } +++ aPtr += 16; +++ bPtr += 16; +++ cPtr += 16; +++ } ++ ++- number = sixteenthPoints * 16; ++- for(;number < num_points; number++){ ++- const float a = *aPtr++; ++- const float b = *bPtr++; ++- *cPtr++ = ( a < b ? a : b); ++- } +++ number = sixteenthPoints * 16; +++ for (; number < num_points; number++) { +++ const float a = *aPtr++; +++ const float b = *bPtr++; +++ *cPtr++ = (a < b ? 
a : b); +++ } ++ } ++ #endif /* LV_HAVE_AVX512F */ ++ ++ #ifdef LV_HAVE_AVX ++ #include ++ ++-static inline void ++-volk_32f_x2_min_32f_u_avx(float* cVector, const float* aVector, ++- const float* bVector, unsigned int num_points) +++static inline void volk_32f_x2_min_32f_u_avx(float* cVector, +++ const float* aVector, +++ const float* bVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int eighthPoints = num_points / 8; +++ unsigned int number = 0; +++ const unsigned int eighthPoints = num_points / 8; ++ ++- float* cPtr = cVector; ++- const float* aPtr = aVector; ++- const float* bPtr= bVector; +++ float* cPtr = cVector; +++ const float* aPtr = aVector; +++ const float* bPtr = bVector; ++ ++- __m256 aVal, bVal, cVal; ++- for(;number < eighthPoints; number++){ ++- aVal = _mm256_loadu_ps(aPtr); ++- bVal = _mm256_loadu_ps(bPtr); +++ __m256 aVal, bVal, cVal; +++ for (; number < eighthPoints; number++) { +++ aVal = _mm256_loadu_ps(aPtr); +++ bVal = _mm256_loadu_ps(bPtr); ++ ++- cVal = _mm256_min_ps(aVal, bVal); +++ cVal = _mm256_min_ps(aVal, bVal); ++ ++- _mm256_storeu_ps(cPtr,cVal); // Store the results back into the C container +++ _mm256_storeu_ps(cPtr, cVal); // Store the results back into the C container ++ ++- aPtr += 8; ++- bPtr += 8; ++- cPtr += 8; ++- } +++ aPtr += 8; +++ bPtr += 8; +++ cPtr += 8; +++ } ++ ++- number = eighthPoints * 8; ++- for(;number < num_points; number++){ ++- const float a = *aPtr++; ++- const float b = *bPtr++; ++- *cPtr++ = ( a < b ? a : b); ++- } +++ number = eighthPoints * 8; +++ for (; number < num_points; number++) { +++ const float a = *aPtr++; +++ const float b = *bPtr++; +++ *cPtr++ = (a < b ? a : b); +++ } ++ } ++ #endif /* LV_HAVE_AVX */ ++ ++diff --git a/kernels/volk/volk_32f_x2_multiply_32f.h b/kernels/volk/volk_32f_x2_multiply_32f.h ++index deb9ae3..eebba18 100644 ++--- a/kernels/volk/volk_32f_x2_multiply_32f.h +++++ b/kernels/volk/volk_32f_x2_multiply_32f.h ++@@ -31,8 +31,8 @@ ++ * ++ * Dispatcher Prototype ++ * \code ++- * void volk_32f_x2_multiply_32f(float* cVector, const float* aVector, const float* bVector, unsigned int num_points) ++- * \endcode +++ * void volk_32f_x2_multiply_32f(float* cVector, const float* aVector, const float* +++ * bVector, unsigned int num_points) \endcode ++ * ++ * \b Inputs ++ * \li aVector: First input vector. 
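For the multiply kernels below, the _a_sse variant uses aligned loads and stores (_mm_load_ps/_mm_store_ps) and therefore requires VOLK-aligned buffers, while the _u_sse variant uses _mm_loadu_ps/_mm_storeu_ps and accepts any pointers. The sketch below makes that choice explicit; it assumes the kernel header can be included directly and that the build defines LV_HAVE_SSE, and in normal use the public volk_32f_x2_multiply_32f dispatcher makes this decision for you. The helper name multiply_pick_sse is illustrative only.

#include <stdint.h>
#include <volk/volk.h>
#include <volk/volk_32f_x2_multiply_32f.h>

static inline void multiply_pick_sse(float* cVector,
                                     const float* aVector,
                                     const float* bVector,
                                     unsigned int num_points)
{
    const size_t align = volk_get_alignment();
    const int all_aligned = ((uintptr_t)aVector % align == 0) &&
                            ((uintptr_t)bVector % align == 0) &&
                            ((uintptr_t)cVector % align == 0);

    if (all_aligned)
        volk_32f_x2_multiply_32f_a_sse(cVector, aVector, bVector, num_points);
    else
        volk_32f_x2_multiply_32f_u_sse(cVector, aVector, bVector, num_points);
}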
++@@ -77,126 +77,130 @@ ++ #ifdef LV_HAVE_SSE ++ #include ++ ++-static inline void ++-volk_32f_x2_multiply_32f_u_sse(float* cVector, const float* aVector, ++- const float* bVector, unsigned int num_points) +++static inline void volk_32f_x2_multiply_32f_u_sse(float* cVector, +++ const float* aVector, +++ const float* bVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int quarterPoints = num_points / 4; +++ unsigned int number = 0; +++ const unsigned int quarterPoints = num_points / 4; ++ ++- float* cPtr = cVector; ++- const float* aPtr = aVector; ++- const float* bPtr= bVector; +++ float* cPtr = cVector; +++ const float* aPtr = aVector; +++ const float* bPtr = bVector; ++ ++- __m128 aVal, bVal, cVal; ++- for(;number < quarterPoints; number++){ +++ __m128 aVal, bVal, cVal; +++ for (; number < quarterPoints; number++) { ++ ++- aVal = _mm_loadu_ps(aPtr); ++- bVal = _mm_loadu_ps(bPtr); +++ aVal = _mm_loadu_ps(aPtr); +++ bVal = _mm_loadu_ps(bPtr); ++ ++- cVal = _mm_mul_ps(aVal, bVal); +++ cVal = _mm_mul_ps(aVal, bVal); ++ ++- _mm_storeu_ps(cPtr,cVal); // Store the results back into the C container +++ _mm_storeu_ps(cPtr, cVal); // Store the results back into the C container ++ ++- aPtr += 4; ++- bPtr += 4; ++- cPtr += 4; ++- } +++ aPtr += 4; +++ bPtr += 4; +++ cPtr += 4; +++ } ++ ++- number = quarterPoints * 4; ++- for(;number < num_points; number++){ ++- *cPtr++ = (*aPtr++) * (*bPtr++); ++- } +++ number = quarterPoints * 4; +++ for (; number < num_points; number++) { +++ *cPtr++ = (*aPtr++) * (*bPtr++); +++ } ++ } ++ #endif /* LV_HAVE_SSE */ ++ ++ #ifdef LV_HAVE_AVX512F ++ #include ++ ++-static inline void ++-volk_32f_x2_multiply_32f_u_avx512f(float* cVector, const float* aVector, ++- const float* bVector, unsigned int num_points) +++static inline void volk_32f_x2_multiply_32f_u_avx512f(float* cVector, +++ const float* aVector, +++ const float* bVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int sixteenthPoints = num_points / 16; +++ unsigned int number = 0; +++ const unsigned int sixteenthPoints = num_points / 16; ++ ++- float* cPtr = cVector; ++- const float* aPtr = aVector; ++- const float* bPtr= bVector; +++ float* cPtr = cVector; +++ const float* aPtr = aVector; +++ const float* bPtr = bVector; ++ ++- __m512 aVal, bVal, cVal; ++- for(;number < sixteenthPoints; number++){ +++ __m512 aVal, bVal, cVal; +++ for (; number < sixteenthPoints; number++) { ++ ++- aVal = _mm512_loadu_ps(aPtr); ++- bVal = _mm512_loadu_ps(bPtr); +++ aVal = _mm512_loadu_ps(aPtr); +++ bVal = _mm512_loadu_ps(bPtr); ++ ++- cVal = _mm512_mul_ps(aVal, bVal); +++ cVal = _mm512_mul_ps(aVal, bVal); ++ ++- _mm512_storeu_ps(cPtr,cVal); // Store the results back into the C container +++ _mm512_storeu_ps(cPtr, cVal); // Store the results back into the C container ++ ++- aPtr += 16; ++- bPtr += 16; ++- cPtr += 16; ++- } +++ aPtr += 16; +++ bPtr += 16; +++ cPtr += 16; +++ } ++ ++- number = sixteenthPoints * 16; ++- for(;number < num_points; number++){ ++- *cPtr++ = (*aPtr++) * (*bPtr++); ++- } +++ number = sixteenthPoints * 16; +++ for (; number < num_points; number++) { +++ *cPtr++ = (*aPtr++) * (*bPtr++); +++ } ++ } ++ #endif /* LV_HAVE_AVX512F */ ++ ++ #ifdef LV_HAVE_AVX ++ #include ++ ++-static inline void ++-volk_32f_x2_multiply_32f_u_avx(float* cVector, const float* aVector, ++- const float* bVector, unsigned int num_points) +++static inline void volk_32f_x2_multiply_32f_u_avx(float* cVector, +++ const float* aVector, +++ const float* bVector, 
+++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int eighthPoints = num_points / 8; +++ unsigned int number = 0; +++ const unsigned int eighthPoints = num_points / 8; ++ ++- float* cPtr = cVector; ++- const float* aPtr = aVector; ++- const float* bPtr= bVector; +++ float* cPtr = cVector; +++ const float* aPtr = aVector; +++ const float* bPtr = bVector; ++ ++- __m256 aVal, bVal, cVal; ++- for(;number < eighthPoints; number++){ +++ __m256 aVal, bVal, cVal; +++ for (; number < eighthPoints; number++) { ++ ++- aVal = _mm256_loadu_ps(aPtr); ++- bVal = _mm256_loadu_ps(bPtr); +++ aVal = _mm256_loadu_ps(aPtr); +++ bVal = _mm256_loadu_ps(bPtr); ++ ++- cVal = _mm256_mul_ps(aVal, bVal); +++ cVal = _mm256_mul_ps(aVal, bVal); ++ ++- _mm256_storeu_ps(cPtr,cVal); // Store the results back into the C container +++ _mm256_storeu_ps(cPtr, cVal); // Store the results back into the C container ++ ++- aPtr += 8; ++- bPtr += 8; ++- cPtr += 8; ++- } +++ aPtr += 8; +++ bPtr += 8; +++ cPtr += 8; +++ } ++ ++- number = eighthPoints * 8; ++- for(;number < num_points; number++){ ++- *cPtr++ = (*aPtr++) * (*bPtr++); ++- } +++ number = eighthPoints * 8; +++ for (; number < num_points; number++) { +++ *cPtr++ = (*aPtr++) * (*bPtr++); +++ } ++ } ++ #endif /* LV_HAVE_AVX */ ++ ++ ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void ++-volk_32f_x2_multiply_32f_generic(float* cVector, const float* aVector, ++- const float* bVector, unsigned int num_points) +++static inline void volk_32f_x2_multiply_32f_generic(float* cVector, +++ const float* aVector, +++ const float* bVector, +++ unsigned int num_points) ++ { ++- float* cPtr = cVector; ++- const float* aPtr = aVector; ++- const float* bPtr= bVector; ++- unsigned int number = 0; ++- ++- for(number = 0; number < num_points; number++){ ++- *cPtr++ = (*aPtr++) * (*bPtr++); ++- } +++ float* cPtr = cVector; +++ const float* aPtr = aVector; +++ const float* bPtr = bVector; +++ unsigned int number = 0; +++ +++ for (number = 0; number < num_points; number++) { +++ *cPtr++ = (*aPtr++) * (*bPtr++); +++ } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++@@ -213,72 +217,74 @@ volk_32f_x2_multiply_32f_generic(float* cVector, const float* aVector, ++ #ifdef LV_HAVE_SSE ++ #include ++ ++-static inline void ++-volk_32f_x2_multiply_32f_a_sse(float* cVector, const float* aVector, ++- const float* bVector, unsigned int num_points) +++static inline void volk_32f_x2_multiply_32f_a_sse(float* cVector, +++ const float* aVector, +++ const float* bVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int quarterPoints = num_points / 4; +++ unsigned int number = 0; +++ const unsigned int quarterPoints = num_points / 4; ++ ++- float* cPtr = cVector; ++- const float* aPtr = aVector; ++- const float* bPtr= bVector; +++ float* cPtr = cVector; +++ const float* aPtr = aVector; +++ const float* bPtr = bVector; ++ ++- __m128 aVal, bVal, cVal; ++- for(;number < quarterPoints; number++){ +++ __m128 aVal, bVal, cVal; +++ for (; number < quarterPoints; number++) { ++ ++- aVal = _mm_load_ps(aPtr); ++- bVal = _mm_load_ps(bPtr); +++ aVal = _mm_load_ps(aPtr); +++ bVal = _mm_load_ps(bPtr); ++ ++- cVal = _mm_mul_ps(aVal, bVal); +++ cVal = _mm_mul_ps(aVal, bVal); ++ ++- _mm_store_ps(cPtr,cVal); // Store the results back into the C container +++ _mm_store_ps(cPtr, cVal); // Store the results back into the C container ++ ++- aPtr += 4; ++- bPtr += 4; ++- cPtr += 4; ++- } +++ aPtr += 4; +++ bPtr += 4; +++ cPtr += 4; +++ } ++ ++- number = quarterPoints * 4; ++- 
for(;number < num_points; number++){ ++- *cPtr++ = (*aPtr++) * (*bPtr++); ++- } +++ number = quarterPoints * 4; +++ for (; number < num_points; number++) { +++ *cPtr++ = (*aPtr++) * (*bPtr++); +++ } ++ } ++ #endif /* LV_HAVE_SSE */ ++ ++ #ifdef LV_HAVE_AVX512F ++ #include ++ ++-static inline void ++-volk_32f_x2_multiply_32f_a_avx512f(float* cVector, const float* aVector, ++- const float* bVector, unsigned int num_points) +++static inline void volk_32f_x2_multiply_32f_a_avx512f(float* cVector, +++ const float* aVector, +++ const float* bVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int sixteenthPoints = num_points / 16; +++ unsigned int number = 0; +++ const unsigned int sixteenthPoints = num_points / 16; ++ ++- float* cPtr = cVector; ++- const float* aPtr = aVector; ++- const float* bPtr= bVector; +++ float* cPtr = cVector; +++ const float* aPtr = aVector; +++ const float* bPtr = bVector; ++ ++- __m512 aVal, bVal, cVal; ++- for(;number < sixteenthPoints; number++){ +++ __m512 aVal, bVal, cVal; +++ for (; number < sixteenthPoints; number++) { ++ ++- aVal = _mm512_load_ps(aPtr); ++- bVal = _mm512_load_ps(bPtr); +++ aVal = _mm512_load_ps(aPtr); +++ bVal = _mm512_load_ps(bPtr); ++ ++- cVal = _mm512_mul_ps(aVal, bVal); +++ cVal = _mm512_mul_ps(aVal, bVal); ++ ++- _mm512_store_ps(cPtr,cVal); // Store the results back into the C container +++ _mm512_store_ps(cPtr, cVal); // Store the results back into the C container ++ ++- aPtr += 16; ++- bPtr += 16; ++- cPtr += 16; ++- } +++ aPtr += 16; +++ bPtr += 16; +++ cPtr += 16; +++ } ++ ++- number = sixteenthPoints * 16; ++- for(;number < num_points; number++){ ++- *cPtr++ = (*aPtr++) * (*bPtr++); ++- } +++ number = sixteenthPoints * 16; +++ for (; number < num_points; number++) { +++ *cPtr++ = (*aPtr++) * (*bPtr++); +++ } ++ } ++ #endif /* LV_HAVE_AVX512F */ ++ ++@@ -286,36 +292,37 @@ volk_32f_x2_multiply_32f_a_avx512f(float* cVector, const float* aVector, ++ #ifdef LV_HAVE_AVX ++ #include ++ ++-static inline void ++-volk_32f_x2_multiply_32f_a_avx(float* cVector, const float* aVector, ++- const float* bVector, unsigned int num_points) +++static inline void volk_32f_x2_multiply_32f_a_avx(float* cVector, +++ const float* aVector, +++ const float* bVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int eighthPoints = num_points / 8; +++ unsigned int number = 0; +++ const unsigned int eighthPoints = num_points / 8; ++ ++- float* cPtr = cVector; ++- const float* aPtr = aVector; ++- const float* bPtr= bVector; +++ float* cPtr = cVector; +++ const float* aPtr = aVector; +++ const float* bPtr = bVector; ++ ++- __m256 aVal, bVal, cVal; ++- for(;number < eighthPoints; number++){ +++ __m256 aVal, bVal, cVal; +++ for (; number < eighthPoints; number++) { ++ ++- aVal = _mm256_load_ps(aPtr); ++- bVal = _mm256_load_ps(bPtr); +++ aVal = _mm256_load_ps(aPtr); +++ bVal = _mm256_load_ps(bPtr); ++ ++- cVal = _mm256_mul_ps(aVal, bVal); +++ cVal = _mm256_mul_ps(aVal, bVal); ++ ++- _mm256_store_ps(cPtr,cVal); // Store the results back into the C container +++ _mm256_store_ps(cPtr, cVal); // Store the results back into the C container ++ ++- aPtr += 8; ++- bPtr += 8; ++- cPtr += 8; ++- } +++ aPtr += 8; +++ bPtr += 8; +++ cPtr += 8; +++ } ++ ++- number = eighthPoints * 8; ++- for(;number < num_points; number++){ ++- *cPtr++ = (*aPtr++) * (*bPtr++); ++- } +++ number = eighthPoints * 8; +++ for (; number < num_points; number++) { +++ *cPtr++ = (*aPtr++) * (*bPtr++); +++ } ++ } ++ #endif /* 
LV_HAVE_AVX */ ++ ++@@ -323,57 +330,61 @@ volk_32f_x2_multiply_32f_a_avx(float* cVector, const float* aVector, ++ #ifdef LV_HAVE_NEON ++ #include ++ ++-static inline void ++-volk_32f_x2_multiply_32f_neon(float* cVector, const float* aVector, ++- const float* bVector, unsigned int num_points) +++static inline void volk_32f_x2_multiply_32f_neon(float* cVector, +++ const float* aVector, +++ const float* bVector, +++ unsigned int num_points) ++ { ++- const unsigned int quarter_points = num_points / 4; ++- unsigned int number; ++- float32x4_t avec, bvec, cvec; ++- for(number=0; number < quarter_points; ++number) { ++- avec = vld1q_f32(aVector); ++- bvec = vld1q_f32(bVector); ++- cvec = vmulq_f32(avec, bvec); ++- vst1q_f32(cVector, cvec); ++- aVector += 4; ++- bVector += 4; ++- cVector += 4; ++- } ++- for(number=quarter_points*4; number < num_points; ++number) { ++- *cVector++ = *aVector++ * *bVector++; ++- } +++ const unsigned int quarter_points = num_points / 4; +++ unsigned int number; +++ float32x4_t avec, bvec, cvec; +++ for (number = 0; number < quarter_points; ++number) { +++ avec = vld1q_f32(aVector); +++ bvec = vld1q_f32(bVector); +++ cvec = vmulq_f32(avec, bvec); +++ vst1q_f32(cVector, cvec); +++ aVector += 4; +++ bVector += 4; +++ cVector += 4; +++ } +++ for (number = quarter_points * 4; number < num_points; ++number) { +++ *cVector++ = *aVector++ * *bVector++; +++ } ++ } ++ #endif /* LV_HAVE_NEON */ ++ ++ ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void ++-volk_32f_x2_multiply_32f_a_generic(float* cVector, const float* aVector, ++- const float* bVector, unsigned int num_points) +++static inline void volk_32f_x2_multiply_32f_a_generic(float* cVector, +++ const float* aVector, +++ const float* bVector, +++ unsigned int num_points) ++ { ++- float* cPtr = cVector; ++- const float* aPtr = aVector; ++- const float* bPtr= bVector; ++- unsigned int number = 0; ++- ++- for(number = 0; number < num_points; number++){ ++- *cPtr++ = (*aPtr++) * (*bPtr++); ++- } +++ float* cPtr = cVector; +++ const float* aPtr = aVector; +++ const float* bPtr = bVector; +++ unsigned int number = 0; +++ +++ for (number = 0; number < num_points; number++) { +++ *cPtr++ = (*aPtr++) * (*bPtr++); +++ } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++ ++ #ifdef LV_HAVE_ORC ++-extern void ++-volk_32f_x2_multiply_32f_a_orc_impl(float* cVector, const float* aVector, ++- const float* bVector, unsigned int num_points); ++- ++-static inline void ++-volk_32f_x2_multiply_32f_u_orc(float* cVector, const float* aVector, ++- const float* bVector, unsigned int num_points) +++extern void volk_32f_x2_multiply_32f_a_orc_impl(float* cVector, +++ const float* aVector, +++ const float* bVector, +++ unsigned int num_points); +++ +++static inline void volk_32f_x2_multiply_32f_u_orc(float* cVector, +++ const float* aVector, +++ const float* bVector, +++ unsigned int num_points) ++ { ++- volk_32f_x2_multiply_32f_a_orc_impl(cVector, aVector, bVector, num_points); +++ volk_32f_x2_multiply_32f_a_orc_impl(cVector, aVector, bVector, num_points); ++ } ++ #endif /* LV_HAVE_ORC */ ++ ++diff --git a/kernels/volk/volk_32f_x2_pow_32f.h b/kernels/volk/volk_32f_x2_pow_32f.h ++index daa7f4e..106c57b 100644 ++--- a/kernels/volk/volk_32f_x2_pow_32f.h +++++ b/kernels/volk/volk_32f_x2_pow_32f.h ++@@ -31,8 +31,8 @@ ++ * ++ * Dispatcher Prototype ++ * \code ++- * void volk_32f_x2_pow_32f(float* cVector, const float* bVector, const float* aVector, unsigned int num_points) ++- * \endcode +++ * void volk_32f_x2_pow_32f(float* cVector, const float* bVector, const 
float* aVector, +++ * unsigned int num_points) \endcode ++ * ++ * \b Inputs ++ * \li bVector: The input vector of indices (power values). ++@@ -71,10 +71,10 @@ ++ #ifndef INCLUDED_volk_32f_x2_pow_32f_a_H ++ #define INCLUDED_volk_32f_x2_pow_32f_a_H ++ ++-#include ++-#include ++ #include ++ #include +++#include +++#include ++ ++ #define POW_POLY_DEGREE 3 ++ ++@@ -82,99 +82,130 @@ ++ #include ++ ++ #define POLY0_AVX2_FMA(x, c0) _mm256_set1_ps(c0) ++-#define POLY1_AVX2_FMA(x, c0, c1) _mm256_fmadd_ps(POLY0_AVX2_FMA(x, c1), x, _mm256_set1_ps(c0)) ++-#define POLY2_AVX2_FMA(x, c0, c1, c2) _mm256_fmadd_ps(POLY1_AVX2_FMA(x, c1, c2), x, _mm256_set1_ps(c0)) ++-#define POLY3_AVX2_FMA(x, c0, c1, c2, c3) _mm256_fmadd_ps(POLY2_AVX2_FMA(x, c1, c2, c3), x, _mm256_set1_ps(c0)) ++-#define POLY4_AVX2_FMA(x, c0, c1, c2, c3, c4) _mm256_fmadd_ps(POLY3_AVX2_FMA(x, c1, c2, c3, c4), x, _mm256_set1_ps(c0)) ++-#define POLY5_AVX2_FMA(x, c0, c1, c2, c3, c4, c5) _mm256_fmadd_ps(POLY4_AVX2_FMA(x, c1, c2, c3, c4, c5), x, _mm256_set1_ps(c0)) ++- ++-static inline void ++-volk_32f_x2_pow_32f_a_avx2_fma(float* cVector, const float* bVector, ++- const float* aVector, unsigned int num_points) +++#define POLY1_AVX2_FMA(x, c0, c1) \ +++ _mm256_fmadd_ps(POLY0_AVX2_FMA(x, c1), x, _mm256_set1_ps(c0)) +++#define POLY2_AVX2_FMA(x, c0, c1, c2) \ +++ _mm256_fmadd_ps(POLY1_AVX2_FMA(x, c1, c2), x, _mm256_set1_ps(c0)) +++#define POLY3_AVX2_FMA(x, c0, c1, c2, c3) \ +++ _mm256_fmadd_ps(POLY2_AVX2_FMA(x, c1, c2, c3), x, _mm256_set1_ps(c0)) +++#define POLY4_AVX2_FMA(x, c0, c1, c2, c3, c4) \ +++ _mm256_fmadd_ps(POLY3_AVX2_FMA(x, c1, c2, c3, c4), x, _mm256_set1_ps(c0)) +++#define POLY5_AVX2_FMA(x, c0, c1, c2, c3, c4, c5) \ +++ _mm256_fmadd_ps(POLY4_AVX2_FMA(x, c1, c2, c3, c4, c5), x, _mm256_set1_ps(c0)) +++ +++static inline void volk_32f_x2_pow_32f_a_avx2_fma(float* cVector, +++ const float* bVector, +++ const float* aVector, +++ unsigned int num_points) ++ { ++- float* cPtr = cVector; ++- const float* bPtr = bVector; ++- const float* aPtr = aVector; ++- ++- unsigned int number = 0; ++- const unsigned int eighthPoints = num_points / 8; ++- ++- __m256 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne; ++- __m256 tmp, fx, mask, pow2n, z, y; ++- __m256 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2; ++- __m256 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5; ++- __m256i bias, exp, emm0, pi32_0x7f; ++- ++- one = _mm256_set1_ps(1.0); ++- exp_hi = _mm256_set1_ps(88.3762626647949); ++- exp_lo = _mm256_set1_ps(-88.3762626647949); ++- ln2 = _mm256_set1_ps(0.6931471805); ++- log2EF = _mm256_set1_ps(1.44269504088896341); ++- half = _mm256_set1_ps(0.5); ++- exp_C1 = _mm256_set1_ps(0.693359375); ++- exp_C2 = _mm256_set1_ps(-2.12194440e-4); ++- pi32_0x7f = _mm256_set1_epi32(0x7f); ++- ++- exp_p0 = _mm256_set1_ps(1.9875691500e-4); ++- exp_p1 = _mm256_set1_ps(1.3981999507e-3); ++- exp_p2 = _mm256_set1_ps(8.3334519073e-3); ++- exp_p3 = _mm256_set1_ps(4.1665795894e-2); ++- exp_p4 = _mm256_set1_ps(1.6666665459e-1); ++- exp_p5 = _mm256_set1_ps(5.0000001201e-1); ++- ++- for(;number < eighthPoints; number++){ ++- // First compute the logarithm ++- aVal = _mm256_load_ps(aPtr); ++- bias = _mm256_set1_epi32(127); ++- leadingOne = _mm256_set1_ps(1.0f); ++- exp = _mm256_sub_epi32(_mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal), _mm256_set1_epi32(0x7f800000)), 23), bias); ++- logarithm = _mm256_cvtepi32_ps(exp); ++- ++- frac = _mm256_or_ps(leadingOne, _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff)))); +++ float* cPtr = cVector; 
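    /* Method used below (shared by the FMA, plain AVX2 and SSE4.1 variants in
     * this file): each output is pow(a, b) = exp(b * ln(a)). ln(a) is recovered
     * from the float's exponent bits plus a polynomial in the mantissa whose
     * degree is chosen by POW_POLY_DEGREE (the POLY* macros above), and exp()
     * is evaluated with the usual log2(e) range reduction followed by a short
     * polynomial before rescaling by a power of two; leftover elements fall
     * back to pow() from math.h. */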
+++ const float* bPtr = bVector; +++ const float* aPtr = aVector; +++ +++ unsigned int number = 0; +++ const unsigned int eighthPoints = num_points / 8; +++ +++ __m256 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne; +++ __m256 tmp, fx, mask, pow2n, z, y; +++ __m256 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2; +++ __m256 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5; +++ __m256i bias, exp, emm0, pi32_0x7f; +++ +++ one = _mm256_set1_ps(1.0); +++ exp_hi = _mm256_set1_ps(88.3762626647949); +++ exp_lo = _mm256_set1_ps(-88.3762626647949); +++ ln2 = _mm256_set1_ps(0.6931471805); +++ log2EF = _mm256_set1_ps(1.44269504088896341); +++ half = _mm256_set1_ps(0.5); +++ exp_C1 = _mm256_set1_ps(0.693359375); +++ exp_C2 = _mm256_set1_ps(-2.12194440e-4); +++ pi32_0x7f = _mm256_set1_epi32(0x7f); +++ +++ exp_p0 = _mm256_set1_ps(1.9875691500e-4); +++ exp_p1 = _mm256_set1_ps(1.3981999507e-3); +++ exp_p2 = _mm256_set1_ps(8.3334519073e-3); +++ exp_p3 = _mm256_set1_ps(4.1665795894e-2); +++ exp_p4 = _mm256_set1_ps(1.6666665459e-1); +++ exp_p5 = _mm256_set1_ps(5.0000001201e-1); +++ +++ for (; number < eighthPoints; number++) { +++ // First compute the logarithm +++ aVal = _mm256_load_ps(aPtr); +++ bias = _mm256_set1_epi32(127); +++ leadingOne = _mm256_set1_ps(1.0f); +++ exp = _mm256_sub_epi32( +++ _mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal), +++ _mm256_set1_epi32(0x7f800000)), +++ 23), +++ bias); +++ logarithm = _mm256_cvtepi32_ps(exp); +++ +++ frac = _mm256_or_ps( +++ leadingOne, +++ _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff)))); ++ ++ #if POW_POLY_DEGREE == 6 ++- mantissa = POLY5_AVX2_FMA( frac, 3.1157899f, -3.3241990f, 2.5988452f, -1.2315303f, 3.1821337e-1f, -3.4436006e-2f); +++ mantissa = POLY5_AVX2_FMA(frac, +++ 3.1157899f, +++ -3.3241990f, +++ 2.5988452f, +++ -1.2315303f, +++ 3.1821337e-1f, +++ -3.4436006e-2f); ++ #elif POW_POLY_DEGREE == 5 ++- mantissa = POLY4_AVX2_FMA( frac, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f); +++ mantissa = POLY4_AVX2_FMA(frac, +++ 2.8882704548164776201f, +++ -2.52074962577807006663f, +++ 1.48116647521213171641f, +++ -0.465725644288844778798f, +++ 0.0596515482674574969533f); ++ #elif POW_POLY_DEGREE == 4 ++- mantissa = POLY3_AVX2_FMA( frac, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f); +++ mantissa = POLY3_AVX2_FMA(frac, +++ 2.61761038894603480148f, +++ -1.75647175389045657003f, +++ 0.688243882994381274313f, +++ -0.107254423828329604454f); ++ #elif POW_POLY_DEGREE == 3 ++- mantissa = POLY2_AVX2_FMA( frac, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f); +++ mantissa = POLY2_AVX2_FMA(frac, +++ 2.28330284476918490682f, +++ -1.04913055217340124191f, +++ 0.204446009836232697516f); ++ #else ++ #error ++ #endif ++ ++- logarithm = _mm256_fmadd_ps(mantissa, _mm256_sub_ps(frac, leadingOne), logarithm); ++- logarithm = _mm256_mul_ps(logarithm, ln2); +++ logarithm = _mm256_fmadd_ps(mantissa, _mm256_sub_ps(frac, leadingOne), logarithm); +++ logarithm = _mm256_mul_ps(logarithm, ln2); ++ ++- // Now calculate b*lna ++- bVal = _mm256_load_ps(bPtr); ++- bVal = _mm256_mul_ps(bVal, logarithm); +++ // Now calculate b*lna +++ bVal = _mm256_load_ps(bPtr); +++ bVal = _mm256_mul_ps(bVal, logarithm); ++ ++- // Now compute exp(b*lna) ++- bVal = _mm256_max_ps(_mm256_min_ps(bVal, exp_hi), exp_lo); +++ // Now compute exp(b*lna) +++ bVal = _mm256_max_ps(_mm256_min_ps(bVal, 
exp_hi), exp_lo); ++ ++- fx = _mm256_fmadd_ps(bVal, log2EF, half); +++ fx = _mm256_fmadd_ps(bVal, log2EF, half); ++ ++- emm0 = _mm256_cvttps_epi32(fx); ++- tmp = _mm256_cvtepi32_ps(emm0); +++ emm0 = _mm256_cvttps_epi32(fx); +++ tmp = _mm256_cvtepi32_ps(emm0); ++ ++- mask = _mm256_and_ps(_mm256_cmp_ps(tmp, fx, _CMP_GT_OS), one); ++- fx = _mm256_sub_ps(tmp, mask); +++ mask = _mm256_and_ps(_mm256_cmp_ps(tmp, fx, _CMP_GT_OS), one); +++ fx = _mm256_sub_ps(tmp, mask); ++ ++- tmp = _mm256_fnmadd_ps(fx, exp_C1, bVal); ++- bVal = _mm256_fnmadd_ps(fx, exp_C2, tmp); ++- z = _mm256_mul_ps(bVal, bVal); +++ tmp = _mm256_fnmadd_ps(fx, exp_C1, bVal); +++ bVal = _mm256_fnmadd_ps(fx, exp_C2, tmp); +++ z = _mm256_mul_ps(bVal, bVal); ++ ++- y = _mm256_fmadd_ps(exp_p0, bVal, exp_p1); ++- y = _mm256_fmadd_ps(y, bVal, exp_p2); ++- y = _mm256_fmadd_ps(y, bVal, exp_p3); ++- y = _mm256_fmadd_ps(y, bVal, exp_p4); ++- y = _mm256_fmadd_ps(y, bVal, exp_p5); ++- y = _mm256_fmadd_ps(y, z, bVal); ++- y = _mm256_add_ps(y, one); +++ y = _mm256_fmadd_ps(exp_p0, bVal, exp_p1); +++ y = _mm256_fmadd_ps(y, bVal, exp_p2); +++ y = _mm256_fmadd_ps(y, bVal, exp_p3); +++ y = _mm256_fmadd_ps(y, bVal, exp_p4); +++ y = _mm256_fmadd_ps(y, bVal, exp_p5); +++ y = _mm256_fmadd_ps(y, z, bVal); +++ y = _mm256_add_ps(y, one); ++ ++- emm0 = _mm256_slli_epi32(_mm256_add_epi32(_mm256_cvttps_epi32(fx), pi32_0x7f), 23); +++ emm0 = +++ _mm256_slli_epi32(_mm256_add_epi32(_mm256_cvttps_epi32(fx), pi32_0x7f), 23); ++ ++ pow2n = _mm256_castsi256_ps(emm0); ++ cVal = _mm256_mul_ps(y, pow2n); ++@@ -184,12 +215,12 @@ volk_32f_x2_pow_32f_a_avx2_fma(float* cVector, const float* bVector, ++ aPtr += 8; ++ bPtr += 8; ++ cPtr += 8; ++- } +++ } ++ ++- number = eighthPoints * 8; ++- for(;number < num_points; number++){ ++- *cPtr++ = pow(*aPtr++, *bPtr++); ++- } +++ number = eighthPoints * 8; +++ for (; number < num_points; number++) { +++ *cPtr++ = pow(*aPtr++, *bPtr++); +++ } ++ } ++ ++ #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for aligned */ ++@@ -198,99 +229,131 @@ volk_32f_x2_pow_32f_a_avx2_fma(float* cVector, const float* bVector, ++ #include ++ ++ #define POLY0_AVX2(x, c0) _mm256_set1_ps(c0) ++-#define POLY1_AVX2(x, c0, c1) _mm256_add_ps(_mm256_mul_ps(POLY0_AVX2(x, c1), x), _mm256_set1_ps(c0)) ++-#define POLY2_AVX2(x, c0, c1, c2) _mm256_add_ps(_mm256_mul_ps(POLY1_AVX2(x, c1, c2), x), _mm256_set1_ps(c0)) ++-#define POLY3_AVX2(x, c0, c1, c2, c3) _mm256_add_ps(_mm256_mul_ps(POLY2_AVX2(x, c1, c2, c3), x), _mm256_set1_ps(c0)) ++-#define POLY4_AVX2(x, c0, c1, c2, c3, c4) _mm256_add_ps(_mm256_mul_ps(POLY3_AVX2(x, c1, c2, c3, c4), x), _mm256_set1_ps(c0)) ++-#define POLY5_AVX2(x, c0, c1, c2, c3, c4, c5) _mm256_add_ps(_mm256_mul_ps(POLY4_AVX2(x, c1, c2, c3, c4, c5), x), _mm256_set1_ps(c0)) ++- ++-static inline void ++-volk_32f_x2_pow_32f_a_avx2(float* cVector, const float* bVector, ++- const float* aVector, unsigned int num_points) +++#define POLY1_AVX2(x, c0, c1) \ +++ _mm256_add_ps(_mm256_mul_ps(POLY0_AVX2(x, c1), x), _mm256_set1_ps(c0)) +++#define POLY2_AVX2(x, c0, c1, c2) \ +++ _mm256_add_ps(_mm256_mul_ps(POLY1_AVX2(x, c1, c2), x), _mm256_set1_ps(c0)) +++#define POLY3_AVX2(x, c0, c1, c2, c3) \ +++ _mm256_add_ps(_mm256_mul_ps(POLY2_AVX2(x, c1, c2, c3), x), _mm256_set1_ps(c0)) +++#define POLY4_AVX2(x, c0, c1, c2, c3, c4) \ +++ _mm256_add_ps(_mm256_mul_ps(POLY3_AVX2(x, c1, c2, c3, c4), x), _mm256_set1_ps(c0)) +++#define POLY5_AVX2(x, c0, c1, c2, c3, c4, c5) \ +++ _mm256_add_ps(_mm256_mul_ps(POLY4_AVX2(x, c1, c2, c3, c4, c5), x), _mm256_set1_ps(c0)) +++ +++static inline 
void volk_32f_x2_pow_32f_a_avx2(float* cVector, +++ const float* bVector, +++ const float* aVector, +++ unsigned int num_points) ++ { ++- float* cPtr = cVector; ++- const float* bPtr = bVector; ++- const float* aPtr = aVector; ++- ++- unsigned int number = 0; ++- const unsigned int eighthPoints = num_points / 8; ++- ++- __m256 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne; ++- __m256 tmp, fx, mask, pow2n, z, y; ++- __m256 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2; ++- __m256 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5; ++- __m256i bias, exp, emm0, pi32_0x7f; ++- ++- one = _mm256_set1_ps(1.0); ++- exp_hi = _mm256_set1_ps(88.3762626647949); ++- exp_lo = _mm256_set1_ps(-88.3762626647949); ++- ln2 = _mm256_set1_ps(0.6931471805); ++- log2EF = _mm256_set1_ps(1.44269504088896341); ++- half = _mm256_set1_ps(0.5); ++- exp_C1 = _mm256_set1_ps(0.693359375); ++- exp_C2 = _mm256_set1_ps(-2.12194440e-4); ++- pi32_0x7f = _mm256_set1_epi32(0x7f); ++- ++- exp_p0 = _mm256_set1_ps(1.9875691500e-4); ++- exp_p1 = _mm256_set1_ps(1.3981999507e-3); ++- exp_p2 = _mm256_set1_ps(8.3334519073e-3); ++- exp_p3 = _mm256_set1_ps(4.1665795894e-2); ++- exp_p4 = _mm256_set1_ps(1.6666665459e-1); ++- exp_p5 = _mm256_set1_ps(5.0000001201e-1); ++- ++- for(;number < eighthPoints; number++){ ++- // First compute the logarithm ++- aVal = _mm256_load_ps(aPtr); ++- bias = _mm256_set1_epi32(127); ++- leadingOne = _mm256_set1_ps(1.0f); ++- exp = _mm256_sub_epi32(_mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal), _mm256_set1_epi32(0x7f800000)), 23), bias); ++- logarithm = _mm256_cvtepi32_ps(exp); ++- ++- frac = _mm256_or_ps(leadingOne, _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff)))); +++ float* cPtr = cVector; +++ const float* bPtr = bVector; +++ const float* aPtr = aVector; +++ +++ unsigned int number = 0; +++ const unsigned int eighthPoints = num_points / 8; +++ +++ __m256 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne; +++ __m256 tmp, fx, mask, pow2n, z, y; +++ __m256 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2; +++ __m256 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5; +++ __m256i bias, exp, emm0, pi32_0x7f; +++ +++ one = _mm256_set1_ps(1.0); +++ exp_hi = _mm256_set1_ps(88.3762626647949); +++ exp_lo = _mm256_set1_ps(-88.3762626647949); +++ ln2 = _mm256_set1_ps(0.6931471805); +++ log2EF = _mm256_set1_ps(1.44269504088896341); +++ half = _mm256_set1_ps(0.5); +++ exp_C1 = _mm256_set1_ps(0.693359375); +++ exp_C2 = _mm256_set1_ps(-2.12194440e-4); +++ pi32_0x7f = _mm256_set1_epi32(0x7f); +++ +++ exp_p0 = _mm256_set1_ps(1.9875691500e-4); +++ exp_p1 = _mm256_set1_ps(1.3981999507e-3); +++ exp_p2 = _mm256_set1_ps(8.3334519073e-3); +++ exp_p3 = _mm256_set1_ps(4.1665795894e-2); +++ exp_p4 = _mm256_set1_ps(1.6666665459e-1); +++ exp_p5 = _mm256_set1_ps(5.0000001201e-1); +++ +++ for (; number < eighthPoints; number++) { +++ // First compute the logarithm +++ aVal = _mm256_load_ps(aPtr); +++ bias = _mm256_set1_epi32(127); +++ leadingOne = _mm256_set1_ps(1.0f); +++ exp = _mm256_sub_epi32( +++ _mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal), +++ _mm256_set1_epi32(0x7f800000)), +++ 23), +++ bias); +++ logarithm = _mm256_cvtepi32_ps(exp); +++ +++ frac = _mm256_or_ps( +++ leadingOne, +++ _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff)))); ++ ++ #if POW_POLY_DEGREE == 6 ++- mantissa = POLY5_AVX2( frac, 3.1157899f, -3.3241990f, 2.5988452f, -1.2315303f, 3.1821337e-1f, -3.4436006e-2f); +++ mantissa = POLY5_AVX2(frac, +++ 3.1157899f, +++ 
-3.3241990f, +++ 2.5988452f, +++ -1.2315303f, +++ 3.1821337e-1f, +++ -3.4436006e-2f); ++ #elif POW_POLY_DEGREE == 5 ++- mantissa = POLY4_AVX2( frac, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f); +++ mantissa = POLY4_AVX2(frac, +++ 2.8882704548164776201f, +++ -2.52074962577807006663f, +++ 1.48116647521213171641f, +++ -0.465725644288844778798f, +++ 0.0596515482674574969533f); ++ #elif POW_POLY_DEGREE == 4 ++- mantissa = POLY3_AVX2( frac, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f); +++ mantissa = POLY3_AVX2(frac, +++ 2.61761038894603480148f, +++ -1.75647175389045657003f, +++ 0.688243882994381274313f, +++ -0.107254423828329604454f); ++ #elif POW_POLY_DEGREE == 3 ++- mantissa = POLY2_AVX2( frac, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f); +++ mantissa = POLY2_AVX2(frac, +++ 2.28330284476918490682f, +++ -1.04913055217340124191f, +++ 0.204446009836232697516f); ++ #else ++ #error ++ #endif ++ ++- logarithm = _mm256_add_ps(_mm256_mul_ps(mantissa, _mm256_sub_ps(frac, leadingOne)), logarithm); ++- logarithm = _mm256_mul_ps(logarithm, ln2); +++ logarithm = _mm256_add_ps( +++ _mm256_mul_ps(mantissa, _mm256_sub_ps(frac, leadingOne)), logarithm); +++ logarithm = _mm256_mul_ps(logarithm, ln2); ++ ++- // Now calculate b*lna ++- bVal = _mm256_load_ps(bPtr); ++- bVal = _mm256_mul_ps(bVal, logarithm); +++ // Now calculate b*lna +++ bVal = _mm256_load_ps(bPtr); +++ bVal = _mm256_mul_ps(bVal, logarithm); ++ ++- // Now compute exp(b*lna) ++- bVal = _mm256_max_ps(_mm256_min_ps(bVal, exp_hi), exp_lo); +++ // Now compute exp(b*lna) +++ bVal = _mm256_max_ps(_mm256_min_ps(bVal, exp_hi), exp_lo); ++ ++- fx = _mm256_add_ps(_mm256_mul_ps(bVal, log2EF), half); +++ fx = _mm256_add_ps(_mm256_mul_ps(bVal, log2EF), half); ++ ++- emm0 = _mm256_cvttps_epi32(fx); ++- tmp = _mm256_cvtepi32_ps(emm0); +++ emm0 = _mm256_cvttps_epi32(fx); +++ tmp = _mm256_cvtepi32_ps(emm0); ++ ++- mask = _mm256_and_ps(_mm256_cmp_ps(tmp, fx, _CMP_GT_OS), one); ++- fx = _mm256_sub_ps(tmp, mask); +++ mask = _mm256_and_ps(_mm256_cmp_ps(tmp, fx, _CMP_GT_OS), one); +++ fx = _mm256_sub_ps(tmp, mask); ++ ++- tmp = _mm256_sub_ps(bVal, _mm256_mul_ps(fx, exp_C1)); ++- bVal = _mm256_sub_ps(tmp, _mm256_mul_ps(fx, exp_C2)); ++- z = _mm256_mul_ps(bVal, bVal); +++ tmp = _mm256_sub_ps(bVal, _mm256_mul_ps(fx, exp_C1)); +++ bVal = _mm256_sub_ps(tmp, _mm256_mul_ps(fx, exp_C2)); +++ z = _mm256_mul_ps(bVal, bVal); ++ ++- y = _mm256_add_ps(_mm256_mul_ps(exp_p0, bVal), exp_p1); ++- y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p2); ++- y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p3); ++- y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p4); ++- y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p5); ++- y = _mm256_add_ps(_mm256_mul_ps(y, z), bVal); ++- y = _mm256_add_ps(y, one); +++ y = _mm256_add_ps(_mm256_mul_ps(exp_p0, bVal), exp_p1); +++ y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p2); +++ y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p3); +++ y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p4); +++ y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p5); +++ y = _mm256_add_ps(_mm256_mul_ps(y, z), bVal); +++ y = _mm256_add_ps(y, one); ++ ++- emm0 = _mm256_slli_epi32(_mm256_add_epi32(_mm256_cvttps_epi32(fx), pi32_0x7f), 23); +++ emm0 = +++ _mm256_slli_epi32(_mm256_add_epi32(_mm256_cvttps_epi32(fx), pi32_0x7f), 23); ++ ++ pow2n = _mm256_castsi256_ps(emm0); ++ cVal = _mm256_mul_ps(y, pow2n); ++@@ -300,12 
+363,12 @@ volk_32f_x2_pow_32f_a_avx2(float* cVector, const float* bVector, ++ aPtr += 8; ++ bPtr += 8; ++ cPtr += 8; ++- } +++ } ++ ++- number = eighthPoints * 8; ++- for(;number < num_points; number++){ ++- *cPtr++ = pow(*aPtr++, *bPtr++); ++- } +++ number = eighthPoints * 8; +++ for (; number < num_points; number++) { +++ *cPtr++ = pow(*aPtr++, *bPtr++); +++ } ++ } ++ ++ #endif /* LV_HAVE_AVX2 for aligned */ ++@@ -317,97 +380,124 @@ volk_32f_x2_pow_32f_a_avx2(float* cVector, const float* bVector, ++ #define POLY0(x, c0) _mm_set1_ps(c0) ++ #define POLY1(x, c0, c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), _mm_set1_ps(c0)) ++ #define POLY2(x, c0, c1, c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), _mm_set1_ps(c0)) ++-#define POLY3(x, c0, c1, c2, c3) _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0)) ++-#define POLY4(x, c0, c1, c2, c3, c4) _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0)) ++-#define POLY5(x, c0, c1, c2, c3, c4, c5) _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0)) ++- ++-static inline void ++-volk_32f_x2_pow_32f_a_sse4_1(float* cVector, const float* bVector, ++- const float* aVector, unsigned int num_points) +++#define POLY3(x, c0, c1, c2, c3) \ +++ _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0)) +++#define POLY4(x, c0, c1, c2, c3, c4) \ +++ _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0)) +++#define POLY5(x, c0, c1, c2, c3, c4, c5) \ +++ _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0)) +++ +++static inline void volk_32f_x2_pow_32f_a_sse4_1(float* cVector, +++ const float* bVector, +++ const float* aVector, +++ unsigned int num_points) ++ { ++- float* cPtr = cVector; ++- const float* bPtr = bVector; ++- const float* aPtr = aVector; ++- ++- unsigned int number = 0; ++- const unsigned int quarterPoints = num_points / 4; ++- ++- __m128 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne; ++- __m128 tmp, fx, mask, pow2n, z, y; ++- __m128 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2; ++- __m128 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5; ++- __m128i bias, exp, emm0, pi32_0x7f; ++- ++- one = _mm_set1_ps(1.0); ++- exp_hi = _mm_set1_ps(88.3762626647949); ++- exp_lo = _mm_set1_ps(-88.3762626647949); ++- ln2 = _mm_set1_ps(0.6931471805); ++- log2EF = _mm_set1_ps(1.44269504088896341); ++- half = _mm_set1_ps(0.5); ++- exp_C1 = _mm_set1_ps(0.693359375); ++- exp_C2 = _mm_set1_ps(-2.12194440e-4); ++- pi32_0x7f = _mm_set1_epi32(0x7f); ++- ++- exp_p0 = _mm_set1_ps(1.9875691500e-4); ++- exp_p1 = _mm_set1_ps(1.3981999507e-3); ++- exp_p2 = _mm_set1_ps(8.3334519073e-3); ++- exp_p3 = _mm_set1_ps(4.1665795894e-2); ++- exp_p4 = _mm_set1_ps(1.6666665459e-1); ++- exp_p5 = _mm_set1_ps(5.0000001201e-1); ++- ++- for(;number < quarterPoints; number++){ ++- // First compute the logarithm ++- aVal = _mm_load_ps(aPtr); ++- bias = _mm_set1_epi32(127); ++- leadingOne = _mm_set1_ps(1.0f); ++- exp = _mm_sub_epi32(_mm_srli_epi32(_mm_and_si128(_mm_castps_si128(aVal), _mm_set1_epi32(0x7f800000)), 23), bias); ++- logarithm = _mm_cvtepi32_ps(exp); ++- ++- frac = _mm_or_ps(leadingOne, _mm_and_ps(aVal, _mm_castsi128_ps(_mm_set1_epi32(0x7fffff)))); +++ float* cPtr = cVector; +++ const float* bPtr = bVector; +++ const float* aPtr = aVector; +++ +++ unsigned int number = 0; +++ const unsigned int quarterPoints = num_points / 4; +++ +++ __m128 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne; +++ __m128 tmp, fx, mask, pow2n, z, y; +++ __m128 one, exp_hi, exp_lo, ln2, log2EF, half, 
exp_C1, exp_C2; +++ __m128 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5; +++ __m128i bias, exp, emm0, pi32_0x7f; +++ +++ one = _mm_set1_ps(1.0); +++ exp_hi = _mm_set1_ps(88.3762626647949); +++ exp_lo = _mm_set1_ps(-88.3762626647949); +++ ln2 = _mm_set1_ps(0.6931471805); +++ log2EF = _mm_set1_ps(1.44269504088896341); +++ half = _mm_set1_ps(0.5); +++ exp_C1 = _mm_set1_ps(0.693359375); +++ exp_C2 = _mm_set1_ps(-2.12194440e-4); +++ pi32_0x7f = _mm_set1_epi32(0x7f); +++ +++ exp_p0 = _mm_set1_ps(1.9875691500e-4); +++ exp_p1 = _mm_set1_ps(1.3981999507e-3); +++ exp_p2 = _mm_set1_ps(8.3334519073e-3); +++ exp_p3 = _mm_set1_ps(4.1665795894e-2); +++ exp_p4 = _mm_set1_ps(1.6666665459e-1); +++ exp_p5 = _mm_set1_ps(5.0000001201e-1); +++ +++ for (; number < quarterPoints; number++) { +++ // First compute the logarithm +++ aVal = _mm_load_ps(aPtr); +++ bias = _mm_set1_epi32(127); +++ leadingOne = _mm_set1_ps(1.0f); +++ exp = _mm_sub_epi32( +++ _mm_srli_epi32( +++ _mm_and_si128(_mm_castps_si128(aVal), _mm_set1_epi32(0x7f800000)), 23), +++ bias); +++ logarithm = _mm_cvtepi32_ps(exp); +++ +++ frac = _mm_or_ps(leadingOne, +++ _mm_and_ps(aVal, _mm_castsi128_ps(_mm_set1_epi32(0x7fffff)))); ++ ++ #if POW_POLY_DEGREE == 6 ++- mantissa = POLY5( frac, 3.1157899f, -3.3241990f, 2.5988452f, -1.2315303f, 3.1821337e-1f, -3.4436006e-2f); +++ mantissa = POLY5(frac, +++ 3.1157899f, +++ -3.3241990f, +++ 2.5988452f, +++ -1.2315303f, +++ 3.1821337e-1f, +++ -3.4436006e-2f); ++ #elif POW_POLY_DEGREE == 5 ++- mantissa = POLY4( frac, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f); +++ mantissa = POLY4(frac, +++ 2.8882704548164776201f, +++ -2.52074962577807006663f, +++ 1.48116647521213171641f, +++ -0.465725644288844778798f, +++ 0.0596515482674574969533f); ++ #elif POW_POLY_DEGREE == 4 ++- mantissa = POLY3( frac, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f); +++ mantissa = POLY3(frac, +++ 2.61761038894603480148f, +++ -1.75647175389045657003f, +++ 0.688243882994381274313f, +++ -0.107254423828329604454f); ++ #elif POW_POLY_DEGREE == 3 ++- mantissa = POLY2( frac, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f); +++ mantissa = POLY2(frac, +++ 2.28330284476918490682f, +++ -1.04913055217340124191f, +++ 0.204446009836232697516f); ++ #else ++ #error ++ #endif ++ ++- logarithm = _mm_add_ps(logarithm, _mm_mul_ps(mantissa, _mm_sub_ps(frac, leadingOne))); ++- logarithm = _mm_mul_ps(logarithm, ln2); +++ logarithm = +++ _mm_add_ps(logarithm, _mm_mul_ps(mantissa, _mm_sub_ps(frac, leadingOne))); +++ logarithm = _mm_mul_ps(logarithm, ln2); ++ ++ ++- // Now calculate b*lna ++- bVal = _mm_load_ps(bPtr); ++- bVal = _mm_mul_ps(bVal, logarithm); +++ // Now calculate b*lna +++ bVal = _mm_load_ps(bPtr); +++ bVal = _mm_mul_ps(bVal, logarithm); ++ ++- // Now compute exp(b*lna) ++- bVal = _mm_max_ps(_mm_min_ps(bVal, exp_hi), exp_lo); +++ // Now compute exp(b*lna) +++ bVal = _mm_max_ps(_mm_min_ps(bVal, exp_hi), exp_lo); ++ ++- fx = _mm_add_ps(_mm_mul_ps(bVal, log2EF), half); +++ fx = _mm_add_ps(_mm_mul_ps(bVal, log2EF), half); ++ ++- emm0 = _mm_cvttps_epi32(fx); ++- tmp = _mm_cvtepi32_ps(emm0); +++ emm0 = _mm_cvttps_epi32(fx); +++ tmp = _mm_cvtepi32_ps(emm0); ++ ++- mask = _mm_and_ps(_mm_cmpgt_ps(tmp, fx), one); ++- fx = _mm_sub_ps(tmp, mask); +++ mask = _mm_and_ps(_mm_cmpgt_ps(tmp, fx), one); +++ fx = _mm_sub_ps(tmp, mask); ++ ++- tmp = _mm_mul_ps(fx, exp_C1); ++- z = _mm_mul_ps(fx, 
exp_C2); ++- bVal = _mm_sub_ps(_mm_sub_ps(bVal, tmp), z); ++- z = _mm_mul_ps(bVal, bVal); +++ tmp = _mm_mul_ps(fx, exp_C1); +++ z = _mm_mul_ps(fx, exp_C2); +++ bVal = _mm_sub_ps(_mm_sub_ps(bVal, tmp), z); +++ z = _mm_mul_ps(bVal, bVal); ++ ++- y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(exp_p0, bVal), exp_p1), bVal); ++- y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p2), bVal), exp_p3); ++- y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(y, bVal), exp_p4), bVal); ++- y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p5), z), bVal); ++- y = _mm_add_ps(y, one); +++ y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(exp_p0, bVal), exp_p1), bVal); +++ y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p2), bVal), exp_p3); +++ y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(y, bVal), exp_p4), bVal); +++ y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p5), z), bVal); +++ y = _mm_add_ps(y, one); ++ ++- emm0 = _mm_slli_epi32(_mm_add_epi32(_mm_cvttps_epi32(fx), pi32_0x7f), 23); +++ emm0 = _mm_slli_epi32(_mm_add_epi32(_mm_cvttps_epi32(fx), pi32_0x7f), 23); ++ ++ pow2n = _mm_castsi128_ps(emm0); ++ cVal = _mm_mul_ps(y, pow2n); ++@@ -417,12 +507,12 @@ volk_32f_x2_pow_32f_a_sse4_1(float* cVector, const float* bVector, ++ aPtr += 4; ++ bPtr += 4; ++ cPtr += 4; ++- } +++ } ++ ++- number = quarterPoints * 4; ++- for(;number < num_points; number++){ ++- *cPtr++ = powf(*aPtr++, *bPtr++); ++- } +++ number = quarterPoints * 4; +++ for (; number < num_points; number++) { +++ *cPtr++ = powf(*aPtr++, *bPtr++); +++ } ++ } ++ ++ #endif /* LV_HAVE_SSE4_1 for aligned */ ++@@ -432,27 +522,28 @@ volk_32f_x2_pow_32f_a_sse4_1(float* cVector, const float* bVector, ++ #ifndef INCLUDED_volk_32f_x2_pow_32f_u_H ++ #define INCLUDED_volk_32f_x2_pow_32f_u_H ++ ++-#include ++-#include ++ #include ++ #include +++#include +++#include ++ ++ #define POW_POLY_DEGREE 3 ++ ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void ++-volk_32f_x2_pow_32f_generic(float* cVector, const float* bVector, ++- const float* aVector, unsigned int num_points) +++static inline void volk_32f_x2_pow_32f_generic(float* cVector, +++ const float* bVector, +++ const float* aVector, +++ unsigned int num_points) ++ { ++- float* cPtr = cVector; ++- const float* bPtr = bVector; ++- const float* aPtr = aVector; ++- unsigned int number = 0; ++- ++- for(number = 0; number < num_points; number++){ ++- *cPtr++ = powf(*aPtr++, *bPtr++); ++- } +++ float* cPtr = cVector; +++ const float* bPtr = bVector; +++ const float* aPtr = aVector; +++ unsigned int number = 0; +++ +++ for (number = 0; number < num_points; number++) { +++ *cPtr++ = powf(*aPtr++, *bPtr++); +++ } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++@@ -463,112 +554,139 @@ volk_32f_x2_pow_32f_generic(float* cVector, const float* bVector, ++ #define POLY0(x, c0) _mm_set1_ps(c0) ++ #define POLY1(x, c0, c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), _mm_set1_ps(c0)) ++ #define POLY2(x, c0, c1, c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), _mm_set1_ps(c0)) ++-#define POLY3(x, c0, c1, c2, c3) _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0)) ++-#define POLY4(x, c0, c1, c2, c3, c4) _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0)) ++-#define POLY5(x, c0, c1, c2, c3, c4, c5) _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0)) ++- ++-static inline void ++-volk_32f_x2_pow_32f_u_sse4_1(float* cVector, const float* bVector, ++- const float* aVector, unsigned int num_points) +++#define POLY3(x, c0, c1, c2, c3) \ +++ _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0)) +++#define POLY4(x, c0, c1, c2, c3, c4) \ +++ 
_mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0)) +++#define POLY5(x, c0, c1, c2, c3, c4, c5) \ +++ _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0)) +++ +++static inline void volk_32f_x2_pow_32f_u_sse4_1(float* cVector, +++ const float* bVector, +++ const float* aVector, +++ unsigned int num_points) ++ { ++- float* cPtr = cVector; ++- const float* bPtr = bVector; ++- const float* aPtr = aVector; ++- ++- unsigned int number = 0; ++- const unsigned int quarterPoints = num_points / 4; ++- ++- __m128 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne; ++- __m128 tmp, fx, mask, pow2n, z, y; ++- __m128 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2; ++- __m128 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5; ++- __m128i bias, exp, emm0, pi32_0x7f; ++- ++- one = _mm_set1_ps(1.0); ++- exp_hi = _mm_set1_ps(88.3762626647949); ++- exp_lo = _mm_set1_ps(-88.3762626647949); ++- ln2 = _mm_set1_ps(0.6931471805); ++- log2EF = _mm_set1_ps(1.44269504088896341); ++- half = _mm_set1_ps(0.5); ++- exp_C1 = _mm_set1_ps(0.693359375); ++- exp_C2 = _mm_set1_ps(-2.12194440e-4); ++- pi32_0x7f = _mm_set1_epi32(0x7f); ++- ++- exp_p0 = _mm_set1_ps(1.9875691500e-4); ++- exp_p1 = _mm_set1_ps(1.3981999507e-3); ++- exp_p2 = _mm_set1_ps(8.3334519073e-3); ++- exp_p3 = _mm_set1_ps(4.1665795894e-2); ++- exp_p4 = _mm_set1_ps(1.6666665459e-1); ++- exp_p5 = _mm_set1_ps(5.0000001201e-1); ++- ++- for(;number < quarterPoints; number++){ ++- // First compute the logarithm ++- aVal = _mm_loadu_ps(aPtr); ++- bias = _mm_set1_epi32(127); ++- leadingOne = _mm_set1_ps(1.0f); ++- exp = _mm_sub_epi32(_mm_srli_epi32(_mm_and_si128(_mm_castps_si128(aVal), _mm_set1_epi32(0x7f800000)), 23), bias); ++- logarithm = _mm_cvtepi32_ps(exp); ++- ++- frac = _mm_or_ps(leadingOne, _mm_and_ps(aVal, _mm_castsi128_ps(_mm_set1_epi32(0x7fffff)))); +++ float* cPtr = cVector; +++ const float* bPtr = bVector; +++ const float* aPtr = aVector; +++ +++ unsigned int number = 0; +++ const unsigned int quarterPoints = num_points / 4; +++ +++ __m128 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne; +++ __m128 tmp, fx, mask, pow2n, z, y; +++ __m128 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2; +++ __m128 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5; +++ __m128i bias, exp, emm0, pi32_0x7f; +++ +++ one = _mm_set1_ps(1.0); +++ exp_hi = _mm_set1_ps(88.3762626647949); +++ exp_lo = _mm_set1_ps(-88.3762626647949); +++ ln2 = _mm_set1_ps(0.6931471805); +++ log2EF = _mm_set1_ps(1.44269504088896341); +++ half = _mm_set1_ps(0.5); +++ exp_C1 = _mm_set1_ps(0.693359375); +++ exp_C2 = _mm_set1_ps(-2.12194440e-4); +++ pi32_0x7f = _mm_set1_epi32(0x7f); +++ +++ exp_p0 = _mm_set1_ps(1.9875691500e-4); +++ exp_p1 = _mm_set1_ps(1.3981999507e-3); +++ exp_p2 = _mm_set1_ps(8.3334519073e-3); +++ exp_p3 = _mm_set1_ps(4.1665795894e-2); +++ exp_p4 = _mm_set1_ps(1.6666665459e-1); +++ exp_p5 = _mm_set1_ps(5.0000001201e-1); +++ +++ for (; number < quarterPoints; number++) { +++ // First compute the logarithm +++ aVal = _mm_loadu_ps(aPtr); +++ bias = _mm_set1_epi32(127); +++ leadingOne = _mm_set1_ps(1.0f); +++ exp = _mm_sub_epi32( +++ _mm_srli_epi32( +++ _mm_and_si128(_mm_castps_si128(aVal), _mm_set1_epi32(0x7f800000)), 23), +++ bias); +++ logarithm = _mm_cvtepi32_ps(exp); +++ +++ frac = _mm_or_ps(leadingOne, +++ _mm_and_ps(aVal, _mm_castsi128_ps(_mm_set1_epi32(0x7fffff)))); ++ ++ #if POW_POLY_DEGREE == 6 ++- mantissa = POLY5( frac, 3.1157899f, -3.3241990f, 2.5988452f, -1.2315303f, 3.1821337e-1f, -3.4436006e-2f); +++ mantissa = 
POLY5(frac, +++ 3.1157899f, +++ -3.3241990f, +++ 2.5988452f, +++ -1.2315303f, +++ 3.1821337e-1f, +++ -3.4436006e-2f); ++ #elif POW_POLY_DEGREE == 5 ++- mantissa = POLY4( frac, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f); +++ mantissa = POLY4(frac, +++ 2.8882704548164776201f, +++ -2.52074962577807006663f, +++ 1.48116647521213171641f, +++ -0.465725644288844778798f, +++ 0.0596515482674574969533f); ++ #elif POW_POLY_DEGREE == 4 ++- mantissa = POLY3( frac, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f); +++ mantissa = POLY3(frac, +++ 2.61761038894603480148f, +++ -1.75647175389045657003f, +++ 0.688243882994381274313f, +++ -0.107254423828329604454f); ++ #elif POW_POLY_DEGREE == 3 ++- mantissa = POLY2( frac, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f); +++ mantissa = POLY2(frac, +++ 2.28330284476918490682f, +++ -1.04913055217340124191f, +++ 0.204446009836232697516f); ++ #else ++ #error ++ #endif ++ ++- logarithm = _mm_add_ps(logarithm, _mm_mul_ps(mantissa, _mm_sub_ps(frac, leadingOne))); ++- logarithm = _mm_mul_ps(logarithm, ln2); +++ logarithm = +++ _mm_add_ps(logarithm, _mm_mul_ps(mantissa, _mm_sub_ps(frac, leadingOne))); +++ logarithm = _mm_mul_ps(logarithm, ln2); ++ ++ ++- // Now calculate b*lna ++- bVal = _mm_loadu_ps(bPtr); ++- bVal = _mm_mul_ps(bVal, logarithm); +++ // Now calculate b*lna +++ bVal = _mm_loadu_ps(bPtr); +++ bVal = _mm_mul_ps(bVal, logarithm); ++ ++- // Now compute exp(b*lna) ++- bVal = _mm_max_ps(_mm_min_ps(bVal, exp_hi), exp_lo); +++ // Now compute exp(b*lna) +++ bVal = _mm_max_ps(_mm_min_ps(bVal, exp_hi), exp_lo); ++ ++- fx = _mm_add_ps(_mm_mul_ps(bVal, log2EF), half); +++ fx = _mm_add_ps(_mm_mul_ps(bVal, log2EF), half); ++ ++- emm0 = _mm_cvttps_epi32(fx); ++- tmp = _mm_cvtepi32_ps(emm0); +++ emm0 = _mm_cvttps_epi32(fx); +++ tmp = _mm_cvtepi32_ps(emm0); ++ ++- mask = _mm_and_ps(_mm_cmpgt_ps(tmp, fx), one); ++- fx = _mm_sub_ps(tmp, mask); +++ mask = _mm_and_ps(_mm_cmpgt_ps(tmp, fx), one); +++ fx = _mm_sub_ps(tmp, mask); ++ ++- tmp = _mm_mul_ps(fx, exp_C1); ++- z = _mm_mul_ps(fx, exp_C2); ++- bVal = _mm_sub_ps(_mm_sub_ps(bVal, tmp), z); ++- z = _mm_mul_ps(bVal, bVal); +++ tmp = _mm_mul_ps(fx, exp_C1); +++ z = _mm_mul_ps(fx, exp_C2); +++ bVal = _mm_sub_ps(_mm_sub_ps(bVal, tmp), z); +++ z = _mm_mul_ps(bVal, bVal); ++ ++- y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(exp_p0, bVal), exp_p1), bVal); ++- y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p2), bVal), exp_p3); ++- y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(y, bVal), exp_p4), bVal); ++- y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p5), z), bVal); ++- y = _mm_add_ps(y, one); +++ y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(exp_p0, bVal), exp_p1), bVal); +++ y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p2), bVal), exp_p3); +++ y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(y, bVal), exp_p4), bVal); +++ y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p5), z), bVal); +++ y = _mm_add_ps(y, one); ++ ++- emm0 = _mm_slli_epi32(_mm_add_epi32(_mm_cvttps_epi32(fx), pi32_0x7f), 23); +++ emm0 = _mm_slli_epi32(_mm_add_epi32(_mm_cvttps_epi32(fx), pi32_0x7f), 23); ++ ++- pow2n = _mm_castsi128_ps(emm0); ++- cVal = _mm_mul_ps(y, pow2n); +++ pow2n = _mm_castsi128_ps(emm0); +++ cVal = _mm_mul_ps(y, pow2n); ++ ++- _mm_storeu_ps(cPtr, cVal); +++ _mm_storeu_ps(cPtr, cVal); ++ ++- aPtr += 4; ++- bPtr += 4; ++- cPtr += 4; ++- } +++ aPtr += 4; +++ bPtr += 4; +++ cPtr += 4; +++ } ++ ++- number = quarterPoints * 4; 
++- for(;number < num_points; number++){ ++- *cPtr++ = powf(*aPtr++, *bPtr++); ++- } +++ number = quarterPoints * 4; +++ for (; number < num_points; number++) { +++ *cPtr++ = powf(*aPtr++, *bPtr++); +++ } ++ } ++ ++ #endif /* LV_HAVE_SSE4_1 for unaligned */ ++@@ -577,100 +695,131 @@ volk_32f_x2_pow_32f_u_sse4_1(float* cVector, const float* bVector, ++ #include ++ ++ #define POLY0_AVX2_FMA(x, c0) _mm256_set1_ps(c0) ++-#define POLY1_AVX2_FMA(x, c0, c1) _mm256_fmadd_ps(POLY0_AVX2_FMA(x, c1), x, _mm256_set1_ps(c0)) ++-#define POLY2_AVX2_FMA(x, c0, c1, c2) _mm256_fmadd_ps(POLY1_AVX2_FMA(x, c1, c2), x, _mm256_set1_ps(c0)) ++-#define POLY3_AVX2_FMA(x, c0, c1, c2, c3) _mm256_fmadd_ps(POLY2_AVX2_FMA(x, c1, c2, c3), x, _mm256_set1_ps(c0)) ++-#define POLY4_AVX2_FMA(x, c0, c1, c2, c3, c4) _mm256_fmadd_ps(POLY3_AVX2_FMA(x, c1, c2, c3, c4), x, _mm256_set1_ps(c0)) ++-#define POLY5_AVX2_FMA(x, c0, c1, c2, c3, c4, c5) _mm256_fmadd_ps(POLY4_AVX2_FMA(x, c1, c2, c3, c4, c5), x, _mm256_set1_ps(c0)) ++- ++-static inline void ++-volk_32f_x2_pow_32f_u_avx2_fma(float* cVector, const float* bVector, ++- const float* aVector, unsigned int num_points) +++#define POLY1_AVX2_FMA(x, c0, c1) \ +++ _mm256_fmadd_ps(POLY0_AVX2_FMA(x, c1), x, _mm256_set1_ps(c0)) +++#define POLY2_AVX2_FMA(x, c0, c1, c2) \ +++ _mm256_fmadd_ps(POLY1_AVX2_FMA(x, c1, c2), x, _mm256_set1_ps(c0)) +++#define POLY3_AVX2_FMA(x, c0, c1, c2, c3) \ +++ _mm256_fmadd_ps(POLY2_AVX2_FMA(x, c1, c2, c3), x, _mm256_set1_ps(c0)) +++#define POLY4_AVX2_FMA(x, c0, c1, c2, c3, c4) \ +++ _mm256_fmadd_ps(POLY3_AVX2_FMA(x, c1, c2, c3, c4), x, _mm256_set1_ps(c0)) +++#define POLY5_AVX2_FMA(x, c0, c1, c2, c3, c4, c5) \ +++ _mm256_fmadd_ps(POLY4_AVX2_FMA(x, c1, c2, c3, c4, c5), x, _mm256_set1_ps(c0)) +++ +++static inline void volk_32f_x2_pow_32f_u_avx2_fma(float* cVector, +++ const float* bVector, +++ const float* aVector, +++ unsigned int num_points) ++ { ++- float* cPtr = cVector; ++- const float* bPtr = bVector; ++- const float* aPtr = aVector; ++- ++- unsigned int number = 0; ++- const unsigned int eighthPoints = num_points / 8; ++- ++- __m256 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne; ++- __m256 tmp, fx, mask, pow2n, z, y; ++- __m256 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2; ++- __m256 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5; ++- __m256i bias, exp, emm0, pi32_0x7f; ++- ++- one = _mm256_set1_ps(1.0); ++- exp_hi = _mm256_set1_ps(88.3762626647949); ++- exp_lo = _mm256_set1_ps(-88.3762626647949); ++- ln2 = _mm256_set1_ps(0.6931471805); ++- log2EF = _mm256_set1_ps(1.44269504088896341); ++- half = _mm256_set1_ps(0.5); ++- exp_C1 = _mm256_set1_ps(0.693359375); ++- exp_C2 = _mm256_set1_ps(-2.12194440e-4); ++- pi32_0x7f = _mm256_set1_epi32(0x7f); ++- ++- exp_p0 = _mm256_set1_ps(1.9875691500e-4); ++- exp_p1 = _mm256_set1_ps(1.3981999507e-3); ++- exp_p2 = _mm256_set1_ps(8.3334519073e-3); ++- exp_p3 = _mm256_set1_ps(4.1665795894e-2); ++- exp_p4 = _mm256_set1_ps(1.6666665459e-1); ++- exp_p5 = _mm256_set1_ps(5.0000001201e-1); ++- ++- for(;number < eighthPoints; number++){ ++- // First compute the logarithm ++- aVal = _mm256_loadu_ps(aPtr); ++- bias = _mm256_set1_epi32(127); ++- leadingOne = _mm256_set1_ps(1.0f); ++- exp = _mm256_sub_epi32(_mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal), _mm256_set1_epi32(0x7f800000)), 23), bias); ++- logarithm = _mm256_cvtepi32_ps(exp); ++- ++- frac = _mm256_or_ps(leadingOne, _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff)))); +++ float* cPtr = cVector; +++ const float* bPtr = 
bVector; +++ const float* aPtr = aVector; +++ +++ unsigned int number = 0; +++ const unsigned int eighthPoints = num_points / 8; +++ +++ __m256 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne; +++ __m256 tmp, fx, mask, pow2n, z, y; +++ __m256 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2; +++ __m256 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5; +++ __m256i bias, exp, emm0, pi32_0x7f; +++ +++ one = _mm256_set1_ps(1.0); +++ exp_hi = _mm256_set1_ps(88.3762626647949); +++ exp_lo = _mm256_set1_ps(-88.3762626647949); +++ ln2 = _mm256_set1_ps(0.6931471805); +++ log2EF = _mm256_set1_ps(1.44269504088896341); +++ half = _mm256_set1_ps(0.5); +++ exp_C1 = _mm256_set1_ps(0.693359375); +++ exp_C2 = _mm256_set1_ps(-2.12194440e-4); +++ pi32_0x7f = _mm256_set1_epi32(0x7f); +++ +++ exp_p0 = _mm256_set1_ps(1.9875691500e-4); +++ exp_p1 = _mm256_set1_ps(1.3981999507e-3); +++ exp_p2 = _mm256_set1_ps(8.3334519073e-3); +++ exp_p3 = _mm256_set1_ps(4.1665795894e-2); +++ exp_p4 = _mm256_set1_ps(1.6666665459e-1); +++ exp_p5 = _mm256_set1_ps(5.0000001201e-1); +++ +++ for (; number < eighthPoints; number++) { +++ // First compute the logarithm +++ aVal = _mm256_loadu_ps(aPtr); +++ bias = _mm256_set1_epi32(127); +++ leadingOne = _mm256_set1_ps(1.0f); +++ exp = _mm256_sub_epi32( +++ _mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal), +++ _mm256_set1_epi32(0x7f800000)), +++ 23), +++ bias); +++ logarithm = _mm256_cvtepi32_ps(exp); +++ +++ frac = _mm256_or_ps( +++ leadingOne, +++ _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff)))); ++ ++ #if POW_POLY_DEGREE == 6 ++- mantissa = POLY5_AVX2_FMA( frac, 3.1157899f, -3.3241990f, 2.5988452f, -1.2315303f, 3.1821337e-1f, -3.4436006e-2f); +++ mantissa = POLY5_AVX2_FMA(frac, +++ 3.1157899f, +++ -3.3241990f, +++ 2.5988452f, +++ -1.2315303f, +++ 3.1821337e-1f, +++ -3.4436006e-2f); ++ #elif POW_POLY_DEGREE == 5 ++- mantissa = POLY4_AVX2_FMA( frac, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f); +++ mantissa = POLY4_AVX2_FMA(frac, +++ 2.8882704548164776201f, +++ -2.52074962577807006663f, +++ 1.48116647521213171641f, +++ -0.465725644288844778798f, +++ 0.0596515482674574969533f); ++ #elif POW_POLY_DEGREE == 4 ++- mantissa = POLY3_AVX2_FMA( frac, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f); +++ mantissa = POLY3_AVX2_FMA(frac, +++ 2.61761038894603480148f, +++ -1.75647175389045657003f, +++ 0.688243882994381274313f, +++ -0.107254423828329604454f); ++ #elif POW_POLY_DEGREE == 3 ++- mantissa = POLY2_AVX2_FMA( frac, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f); +++ mantissa = POLY2_AVX2_FMA(frac, +++ 2.28330284476918490682f, +++ -1.04913055217340124191f, +++ 0.204446009836232697516f); ++ #else ++ #error ++ #endif ++ ++- logarithm = _mm256_fmadd_ps(mantissa, _mm256_sub_ps(frac, leadingOne), logarithm); ++- logarithm = _mm256_mul_ps(logarithm, ln2); +++ logarithm = _mm256_fmadd_ps(mantissa, _mm256_sub_ps(frac, leadingOne), logarithm); +++ logarithm = _mm256_mul_ps(logarithm, ln2); ++ ++ ++- // Now calculate b*lna ++- bVal = _mm256_loadu_ps(bPtr); ++- bVal = _mm256_mul_ps(bVal, logarithm); +++ // Now calculate b*lna +++ bVal = _mm256_loadu_ps(bPtr); +++ bVal = _mm256_mul_ps(bVal, logarithm); ++ ++- // Now compute exp(b*lna) ++- bVal = _mm256_max_ps(_mm256_min_ps(bVal, exp_hi), exp_lo); +++ // Now compute exp(b*lna) +++ bVal = _mm256_max_ps(_mm256_min_ps(bVal, exp_hi), exp_lo); ++ 
++- fx = _mm256_fmadd_ps(bVal, log2EF, half); +++ fx = _mm256_fmadd_ps(bVal, log2EF, half); ++ ++- emm0 = _mm256_cvttps_epi32(fx); ++- tmp = _mm256_cvtepi32_ps(emm0); +++ emm0 = _mm256_cvttps_epi32(fx); +++ tmp = _mm256_cvtepi32_ps(emm0); ++ ++- mask = _mm256_and_ps(_mm256_cmp_ps(tmp, fx, _CMP_GT_OS), one); ++- fx = _mm256_sub_ps(tmp, mask); +++ mask = _mm256_and_ps(_mm256_cmp_ps(tmp, fx, _CMP_GT_OS), one); +++ fx = _mm256_sub_ps(tmp, mask); ++ ++- tmp = _mm256_fnmadd_ps(fx, exp_C1, bVal); ++- bVal = _mm256_fnmadd_ps(fx, exp_C2, tmp); ++- z = _mm256_mul_ps(bVal, bVal); +++ tmp = _mm256_fnmadd_ps(fx, exp_C1, bVal); +++ bVal = _mm256_fnmadd_ps(fx, exp_C2, tmp); +++ z = _mm256_mul_ps(bVal, bVal); ++ ++- y = _mm256_fmadd_ps(exp_p0, bVal, exp_p1); ++- y = _mm256_fmadd_ps(y, bVal, exp_p2); ++- y = _mm256_fmadd_ps(y, bVal, exp_p3); ++- y = _mm256_fmadd_ps(y, bVal, exp_p4); ++- y = _mm256_fmadd_ps(y, bVal, exp_p5); ++- y = _mm256_fmadd_ps(y, z, bVal); ++- y = _mm256_add_ps(y, one); +++ y = _mm256_fmadd_ps(exp_p0, bVal, exp_p1); +++ y = _mm256_fmadd_ps(y, bVal, exp_p2); +++ y = _mm256_fmadd_ps(y, bVal, exp_p3); +++ y = _mm256_fmadd_ps(y, bVal, exp_p4); +++ y = _mm256_fmadd_ps(y, bVal, exp_p5); +++ y = _mm256_fmadd_ps(y, z, bVal); +++ y = _mm256_add_ps(y, one); ++ ++- emm0 = _mm256_slli_epi32(_mm256_add_epi32(_mm256_cvttps_epi32(fx), pi32_0x7f), 23); +++ emm0 = +++ _mm256_slli_epi32(_mm256_add_epi32(_mm256_cvttps_epi32(fx), pi32_0x7f), 23); ++ ++ pow2n = _mm256_castsi256_ps(emm0); ++ cVal = _mm256_mul_ps(y, pow2n); ++@@ -680,12 +829,12 @@ volk_32f_x2_pow_32f_u_avx2_fma(float* cVector, const float* bVector, ++ aPtr += 8; ++ bPtr += 8; ++ cPtr += 8; ++- } +++ } ++ ++- number = eighthPoints * 8; ++- for(;number < num_points; number++){ ++- *cPtr++ = pow(*aPtr++, *bPtr++); ++- } +++ number = eighthPoints * 8; +++ for (; number < num_points; number++) { +++ *cPtr++ = pow(*aPtr++, *bPtr++); +++ } ++ } ++ ++ #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for unaligned */ ++@@ -694,99 +843,131 @@ volk_32f_x2_pow_32f_u_avx2_fma(float* cVector, const float* bVector, ++ #include ++ ++ #define POLY0_AVX2(x, c0) _mm256_set1_ps(c0) ++-#define POLY1_AVX2(x, c0, c1) _mm256_add_ps(_mm256_mul_ps(POLY0_AVX2(x, c1), x), _mm256_set1_ps(c0)) ++-#define POLY2_AVX2(x, c0, c1, c2) _mm256_add_ps(_mm256_mul_ps(POLY1_AVX2(x, c1, c2), x), _mm256_set1_ps(c0)) ++-#define POLY3_AVX2(x, c0, c1, c2, c3) _mm256_add_ps(_mm256_mul_ps(POLY2_AVX2(x, c1, c2, c3), x), _mm256_set1_ps(c0)) ++-#define POLY4_AVX2(x, c0, c1, c2, c3, c4) _mm256_add_ps(_mm256_mul_ps(POLY3_AVX2(x, c1, c2, c3, c4), x), _mm256_set1_ps(c0)) ++-#define POLY5_AVX2(x, c0, c1, c2, c3, c4, c5) _mm256_add_ps(_mm256_mul_ps(POLY4_AVX2(x, c1, c2, c3, c4, c5), x), _mm256_set1_ps(c0)) ++- ++-static inline void ++-volk_32f_x2_pow_32f_u_avx2(float* cVector, const float* bVector, ++- const float* aVector, unsigned int num_points) +++#define POLY1_AVX2(x, c0, c1) \ +++ _mm256_add_ps(_mm256_mul_ps(POLY0_AVX2(x, c1), x), _mm256_set1_ps(c0)) +++#define POLY2_AVX2(x, c0, c1, c2) \ +++ _mm256_add_ps(_mm256_mul_ps(POLY1_AVX2(x, c1, c2), x), _mm256_set1_ps(c0)) +++#define POLY3_AVX2(x, c0, c1, c2, c3) \ +++ _mm256_add_ps(_mm256_mul_ps(POLY2_AVX2(x, c1, c2, c3), x), _mm256_set1_ps(c0)) +++#define POLY4_AVX2(x, c0, c1, c2, c3, c4) \ +++ _mm256_add_ps(_mm256_mul_ps(POLY3_AVX2(x, c1, c2, c3, c4), x), _mm256_set1_ps(c0)) +++#define POLY5_AVX2(x, c0, c1, c2, c3, c4, c5) \ +++ _mm256_add_ps(_mm256_mul_ps(POLY4_AVX2(x, c1, c2, c3, c4, c5), x), _mm256_set1_ps(c0)) +++ +++static inline void 
volk_32f_x2_pow_32f_u_avx2(float* cVector, +++ const float* bVector, +++ const float* aVector, +++ unsigned int num_points) ++ { ++- float* cPtr = cVector; ++- const float* bPtr = bVector; ++- const float* aPtr = aVector; ++- ++- unsigned int number = 0; ++- const unsigned int eighthPoints = num_points / 8; ++- ++- __m256 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne; ++- __m256 tmp, fx, mask, pow2n, z, y; ++- __m256 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2; ++- __m256 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5; ++- __m256i bias, exp, emm0, pi32_0x7f; ++- ++- one = _mm256_set1_ps(1.0); ++- exp_hi = _mm256_set1_ps(88.3762626647949); ++- exp_lo = _mm256_set1_ps(-88.3762626647949); ++- ln2 = _mm256_set1_ps(0.6931471805); ++- log2EF = _mm256_set1_ps(1.44269504088896341); ++- half = _mm256_set1_ps(0.5); ++- exp_C1 = _mm256_set1_ps(0.693359375); ++- exp_C2 = _mm256_set1_ps(-2.12194440e-4); ++- pi32_0x7f = _mm256_set1_epi32(0x7f); ++- ++- exp_p0 = _mm256_set1_ps(1.9875691500e-4); ++- exp_p1 = _mm256_set1_ps(1.3981999507e-3); ++- exp_p2 = _mm256_set1_ps(8.3334519073e-3); ++- exp_p3 = _mm256_set1_ps(4.1665795894e-2); ++- exp_p4 = _mm256_set1_ps(1.6666665459e-1); ++- exp_p5 = _mm256_set1_ps(5.0000001201e-1); ++- ++- for(;number < eighthPoints; number++){ ++- // First compute the logarithm ++- aVal = _mm256_loadu_ps(aPtr); ++- bias = _mm256_set1_epi32(127); ++- leadingOne = _mm256_set1_ps(1.0f); ++- exp = _mm256_sub_epi32(_mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal), _mm256_set1_epi32(0x7f800000)), 23), bias); ++- logarithm = _mm256_cvtepi32_ps(exp); ++- ++- frac = _mm256_or_ps(leadingOne, _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff)))); +++ float* cPtr = cVector; +++ const float* bPtr = bVector; +++ const float* aPtr = aVector; +++ +++ unsigned int number = 0; +++ const unsigned int eighthPoints = num_points / 8; +++ +++ __m256 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne; +++ __m256 tmp, fx, mask, pow2n, z, y; +++ __m256 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2; +++ __m256 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5; +++ __m256i bias, exp, emm0, pi32_0x7f; +++ +++ one = _mm256_set1_ps(1.0); +++ exp_hi = _mm256_set1_ps(88.3762626647949); +++ exp_lo = _mm256_set1_ps(-88.3762626647949); +++ ln2 = _mm256_set1_ps(0.6931471805); +++ log2EF = _mm256_set1_ps(1.44269504088896341); +++ half = _mm256_set1_ps(0.5); +++ exp_C1 = _mm256_set1_ps(0.693359375); +++ exp_C2 = _mm256_set1_ps(-2.12194440e-4); +++ pi32_0x7f = _mm256_set1_epi32(0x7f); +++ +++ exp_p0 = _mm256_set1_ps(1.9875691500e-4); +++ exp_p1 = _mm256_set1_ps(1.3981999507e-3); +++ exp_p2 = _mm256_set1_ps(8.3334519073e-3); +++ exp_p3 = _mm256_set1_ps(4.1665795894e-2); +++ exp_p4 = _mm256_set1_ps(1.6666665459e-1); +++ exp_p5 = _mm256_set1_ps(5.0000001201e-1); +++ +++ for (; number < eighthPoints; number++) { +++ // First compute the logarithm +++ aVal = _mm256_loadu_ps(aPtr); +++ bias = _mm256_set1_epi32(127); +++ leadingOne = _mm256_set1_ps(1.0f); +++ exp = _mm256_sub_epi32( +++ _mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal), +++ _mm256_set1_epi32(0x7f800000)), +++ 23), +++ bias); +++ logarithm = _mm256_cvtepi32_ps(exp); +++ +++ frac = _mm256_or_ps( +++ leadingOne, +++ _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff)))); ++ ++ #if POW_POLY_DEGREE == 6 ++- mantissa = POLY5_AVX2( frac, 3.1157899f, -3.3241990f, 2.5988452f, -1.2315303f, 3.1821337e-1f, -3.4436006e-2f); +++ mantissa = POLY5_AVX2(frac, +++ 3.1157899f, +++ -3.3241990f, 
+++ 2.5988452f, +++ -1.2315303f, +++ 3.1821337e-1f, +++ -3.4436006e-2f); ++ #elif POW_POLY_DEGREE == 5 ++- mantissa = POLY4_AVX2( frac, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f); +++ mantissa = POLY4_AVX2(frac, +++ 2.8882704548164776201f, +++ -2.52074962577807006663f, +++ 1.48116647521213171641f, +++ -0.465725644288844778798f, +++ 0.0596515482674574969533f); ++ #elif POW_POLY_DEGREE == 4 ++- mantissa = POLY3_AVX2( frac, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f); +++ mantissa = POLY3_AVX2(frac, +++ 2.61761038894603480148f, +++ -1.75647175389045657003f, +++ 0.688243882994381274313f, +++ -0.107254423828329604454f); ++ #elif POW_POLY_DEGREE == 3 ++- mantissa = POLY2_AVX2( frac, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f); +++ mantissa = POLY2_AVX2(frac, +++ 2.28330284476918490682f, +++ -1.04913055217340124191f, +++ 0.204446009836232697516f); ++ #else ++ #error ++ #endif ++ ++- logarithm = _mm256_add_ps(_mm256_mul_ps(mantissa, _mm256_sub_ps(frac, leadingOne)), logarithm); ++- logarithm = _mm256_mul_ps(logarithm, ln2); +++ logarithm = _mm256_add_ps( +++ _mm256_mul_ps(mantissa, _mm256_sub_ps(frac, leadingOne)), logarithm); +++ logarithm = _mm256_mul_ps(logarithm, ln2); ++ ++- // Now calculate b*lna ++- bVal = _mm256_loadu_ps(bPtr); ++- bVal = _mm256_mul_ps(bVal, logarithm); +++ // Now calculate b*lna +++ bVal = _mm256_loadu_ps(bPtr); +++ bVal = _mm256_mul_ps(bVal, logarithm); ++ ++- // Now compute exp(b*lna) ++- bVal = _mm256_max_ps(_mm256_min_ps(bVal, exp_hi), exp_lo); +++ // Now compute exp(b*lna) +++ bVal = _mm256_max_ps(_mm256_min_ps(bVal, exp_hi), exp_lo); ++ ++- fx = _mm256_add_ps(_mm256_mul_ps(bVal, log2EF), half); +++ fx = _mm256_add_ps(_mm256_mul_ps(bVal, log2EF), half); ++ ++- emm0 = _mm256_cvttps_epi32(fx); ++- tmp = _mm256_cvtepi32_ps(emm0); +++ emm0 = _mm256_cvttps_epi32(fx); +++ tmp = _mm256_cvtepi32_ps(emm0); ++ ++- mask = _mm256_and_ps(_mm256_cmp_ps(tmp, fx, _CMP_GT_OS), one); ++- fx = _mm256_sub_ps(tmp, mask); +++ mask = _mm256_and_ps(_mm256_cmp_ps(tmp, fx, _CMP_GT_OS), one); +++ fx = _mm256_sub_ps(tmp, mask); ++ ++- tmp = _mm256_sub_ps(bVal, _mm256_mul_ps(fx, exp_C1)); ++- bVal = _mm256_sub_ps(tmp, _mm256_mul_ps(fx, exp_C2)); ++- z = _mm256_mul_ps(bVal, bVal); +++ tmp = _mm256_sub_ps(bVal, _mm256_mul_ps(fx, exp_C1)); +++ bVal = _mm256_sub_ps(tmp, _mm256_mul_ps(fx, exp_C2)); +++ z = _mm256_mul_ps(bVal, bVal); ++ ++- y = _mm256_add_ps(_mm256_mul_ps(exp_p0, bVal), exp_p1); ++- y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p2); ++- y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p3); ++- y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p4); ++- y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p5); ++- y = _mm256_add_ps(_mm256_mul_ps(y, z), bVal); ++- y = _mm256_add_ps(y, one); +++ y = _mm256_add_ps(_mm256_mul_ps(exp_p0, bVal), exp_p1); +++ y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p2); +++ y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p3); +++ y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p4); +++ y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p5); +++ y = _mm256_add_ps(_mm256_mul_ps(y, z), bVal); +++ y = _mm256_add_ps(y, one); ++ ++- emm0 = _mm256_slli_epi32(_mm256_add_epi32(_mm256_cvttps_epi32(fx), pi32_0x7f), 23); +++ emm0 = +++ _mm256_slli_epi32(_mm256_add_epi32(_mm256_cvttps_epi32(fx), pi32_0x7f), 23); ++ ++ pow2n = _mm256_castsi256_ps(emm0); ++ cVal = _mm256_mul_ps(y, pow2n); ++@@ -796,12 +977,12 @@ 
volk_32f_x2_pow_32f_u_avx2(float* cVector, const float* bVector, ++ aPtr += 8; ++ bPtr += 8; ++ cPtr += 8; ++- } +++ } ++ ++- number = eighthPoints * 8; ++- for(;number < num_points; number++){ ++- *cPtr++ = pow(*aPtr++, *bPtr++); ++- } +++ number = eighthPoints * 8; +++ for (; number < num_points; number++) { +++ *cPtr++ = pow(*aPtr++, *bPtr++); +++ } ++ } ++ ++ #endif /* LV_HAVE_AVX2 for unaligned */ ++diff --git a/kernels/volk/volk_32f_x2_s32f_interleave_16ic.h b/kernels/volk/volk_32f_x2_s32f_interleave_16ic.h ++index 8021faf..04e5892 100644 ++--- a/kernels/volk/volk_32f_x2_s32f_interleave_16ic.h +++++ b/kernels/volk/volk_32f_x2_s32f_interleave_16ic.h ++@@ -32,8 +32,8 @@ ++ * ++ * Dispatcher Prototype ++ * \code ++- * void volk_32f_x2_s32f_interleave_16ic(lv_16sc_t* complexVector, const float* iBuffer, const float* qBuffer, const float scalar, unsigned int num_points) ++- * \endcode +++ * void volk_32f_x2_s32f_interleave_16ic(lv_16sc_t* complexVector, const float* iBuffer, +++ * const float* qBuffer, const float scalar, unsigned int num_points) \endcode ++ * ++ * \b Inputs ++ * \li iBuffer: Input vector of samples for the real part. ++@@ -75,60 +75,62 @@ ++ #ifndef INCLUDED_volk_32f_x2_s32f_interleave_16ic_a_H ++ #define INCLUDED_volk_32f_x2_s32f_interleave_16ic_a_H ++ ++-#include ++ #include ++ #include +++#include ++ ++ #ifdef LV_HAVE_AVX2 ++ #include ++ ++-static inline void ++-volk_32f_x2_s32f_interleave_16ic_a_avx2(lv_16sc_t* complexVector, const float* iBuffer, ++- const float* qBuffer, const float scalar, unsigned int num_points) +++static inline void volk_32f_x2_s32f_interleave_16ic_a_avx2(lv_16sc_t* complexVector, +++ const float* iBuffer, +++ const float* qBuffer, +++ const float scalar, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const float* iBufferPtr = iBuffer; ++- const float* qBufferPtr = qBuffer; +++ unsigned int number = 0; +++ const float* iBufferPtr = iBuffer; +++ const float* qBufferPtr = qBuffer; ++ ++- __m256 vScalar = _mm256_set1_ps(scalar); +++ __m256 vScalar = _mm256_set1_ps(scalar); ++ ++- const unsigned int eighthPoints = num_points / 8; +++ const unsigned int eighthPoints = num_points / 8; ++ ++- __m256 iValue, qValue, cplxValue1, cplxValue2; ++- __m256i intValue1, intValue2; +++ __m256 iValue, qValue, cplxValue1, cplxValue2; +++ __m256i intValue1, intValue2; ++ ++- int16_t* complexVectorPtr = (int16_t*)complexVector; +++ int16_t* complexVectorPtr = (int16_t*)complexVector; ++ ++- for(;number < eighthPoints; number++){ ++- iValue = _mm256_load_ps(iBufferPtr); ++- qValue = _mm256_load_ps(qBufferPtr); +++ for (; number < eighthPoints; number++) { +++ iValue = _mm256_load_ps(iBufferPtr); +++ qValue = _mm256_load_ps(qBufferPtr); ++ ++- // Interleaves the lower two values in the i and q variables into one buffer ++- cplxValue1 = _mm256_unpacklo_ps(iValue, qValue); ++- cplxValue1 = _mm256_mul_ps(cplxValue1, vScalar); +++ // Interleaves the lower two values in the i and q variables into one buffer +++ cplxValue1 = _mm256_unpacklo_ps(iValue, qValue); +++ cplxValue1 = _mm256_mul_ps(cplxValue1, vScalar); ++ ++- // Interleaves the upper two values in the i and q variables into one buffer ++- cplxValue2 = _mm256_unpackhi_ps(iValue, qValue); ++- cplxValue2 = _mm256_mul_ps(cplxValue2, vScalar); +++ // Interleaves the upper two values in the i and q variables into one buffer +++ cplxValue2 = _mm256_unpackhi_ps(iValue, qValue); +++ cplxValue2 = _mm256_mul_ps(cplxValue2, vScalar); ++ ++- intValue1 = _mm256_cvtps_epi32(cplxValue1); ++- intValue2 = 
_mm256_cvtps_epi32(cplxValue2); +++ intValue1 = _mm256_cvtps_epi32(cplxValue1); +++ intValue2 = _mm256_cvtps_epi32(cplxValue2); ++ ++- intValue1 = _mm256_packs_epi32(intValue1, intValue2); +++ intValue1 = _mm256_packs_epi32(intValue1, intValue2); ++ ++- _mm256_store_si256((__m256i*)complexVectorPtr, intValue1); ++- complexVectorPtr += 16; +++ _mm256_store_si256((__m256i*)complexVectorPtr, intValue1); +++ complexVectorPtr += 16; ++ ++- iBufferPtr += 8; ++- qBufferPtr += 8; ++- } +++ iBufferPtr += 8; +++ qBufferPtr += 8; +++ } ++ ++- number = eighthPoints * 8; ++- complexVectorPtr = (int16_t*)(&complexVector[number]); ++- for(; number < num_points; number++){ ++- *complexVectorPtr++ = (int16_t)rintf(*iBufferPtr++ * scalar); ++- *complexVectorPtr++ = (int16_t)rintf(*qBufferPtr++ * scalar); ++- } +++ number = eighthPoints * 8; +++ complexVectorPtr = (int16_t*)(&complexVector[number]); +++ for (; number < num_points; number++) { +++ *complexVectorPtr++ = (int16_t)rintf(*iBufferPtr++ * scalar); +++ *complexVectorPtr++ = (int16_t)rintf(*qBufferPtr++ * scalar); +++ } ++ } ++ #endif /* LV_HAVE_AVX2 */ ++ ++@@ -136,53 +138,55 @@ volk_32f_x2_s32f_interleave_16ic_a_avx2(lv_16sc_t* complexVector, const float* i ++ #ifdef LV_HAVE_SSE2 ++ #include ++ ++-static inline void ++-volk_32f_x2_s32f_interleave_16ic_a_sse2(lv_16sc_t* complexVector, const float* iBuffer, ++- const float* qBuffer, const float scalar, unsigned int num_points) +++static inline void volk_32f_x2_s32f_interleave_16ic_a_sse2(lv_16sc_t* complexVector, +++ const float* iBuffer, +++ const float* qBuffer, +++ const float scalar, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const float* iBufferPtr = iBuffer; ++- const float* qBufferPtr = qBuffer; +++ unsigned int number = 0; +++ const float* iBufferPtr = iBuffer; +++ const float* qBufferPtr = qBuffer; ++ ++- __m128 vScalar = _mm_set_ps1(scalar); +++ __m128 vScalar = _mm_set_ps1(scalar); ++ ++- const unsigned int quarterPoints = num_points / 4; +++ const unsigned int quarterPoints = num_points / 4; ++ ++- __m128 iValue, qValue, cplxValue1, cplxValue2; ++- __m128i intValue1, intValue2; +++ __m128 iValue, qValue, cplxValue1, cplxValue2; +++ __m128i intValue1, intValue2; ++ ++- int16_t* complexVectorPtr = (int16_t*)complexVector; +++ int16_t* complexVectorPtr = (int16_t*)complexVector; ++ ++- for(;number < quarterPoints; number++){ ++- iValue = _mm_load_ps(iBufferPtr); ++- qValue = _mm_load_ps(qBufferPtr); +++ for (; number < quarterPoints; number++) { +++ iValue = _mm_load_ps(iBufferPtr); +++ qValue = _mm_load_ps(qBufferPtr); ++ ++- // Interleaves the lower two values in the i and q variables into one buffer ++- cplxValue1 = _mm_unpacklo_ps(iValue, qValue); ++- cplxValue1 = _mm_mul_ps(cplxValue1, vScalar); +++ // Interleaves the lower two values in the i and q variables into one buffer +++ cplxValue1 = _mm_unpacklo_ps(iValue, qValue); +++ cplxValue1 = _mm_mul_ps(cplxValue1, vScalar); ++ ++- // Interleaves the upper two values in the i and q variables into one buffer ++- cplxValue2 = _mm_unpackhi_ps(iValue, qValue); ++- cplxValue2 = _mm_mul_ps(cplxValue2, vScalar); +++ // Interleaves the upper two values in the i and q variables into one buffer +++ cplxValue2 = _mm_unpackhi_ps(iValue, qValue); +++ cplxValue2 = _mm_mul_ps(cplxValue2, vScalar); ++ ++- intValue1 = _mm_cvtps_epi32(cplxValue1); ++- intValue2 = _mm_cvtps_epi32(cplxValue2); +++ intValue1 = _mm_cvtps_epi32(cplxValue1); +++ intValue2 = _mm_cvtps_epi32(cplxValue2); ++ ++- intValue1 = _mm_packs_epi32(intValue1, 
intValue2); +++ intValue1 = _mm_packs_epi32(intValue1, intValue2); ++ ++- _mm_store_si128((__m128i*)complexVectorPtr, intValue1); ++- complexVectorPtr += 8; +++ _mm_store_si128((__m128i*)complexVectorPtr, intValue1); +++ complexVectorPtr += 8; ++ ++- iBufferPtr += 4; ++- qBufferPtr += 4; ++- } +++ iBufferPtr += 4; +++ qBufferPtr += 4; +++ } ++ ++- number = quarterPoints * 4; ++- complexVectorPtr = (int16_t*)(&complexVector[number]); ++- for(; number < num_points; number++){ ++- *complexVectorPtr++ = (int16_t)rintf(*iBufferPtr++ * scalar); ++- *complexVectorPtr++ = (int16_t)rintf(*qBufferPtr++ * scalar); ++- } +++ number = quarterPoints * 4; +++ complexVectorPtr = (int16_t*)(&complexVector[number]); +++ for (; number < num_points; number++) { +++ *complexVectorPtr++ = (int16_t)rintf(*iBufferPtr++ * scalar); +++ *complexVectorPtr++ = (int16_t)rintf(*qBufferPtr++ * scalar); +++ } ++ } ++ #endif /* LV_HAVE_SSE2 */ ++ ++@@ -190,79 +194,83 @@ volk_32f_x2_s32f_interleave_16ic_a_sse2(lv_16sc_t* complexVector, const float* i ++ #ifdef LV_HAVE_SSE ++ #include ++ ++-static inline void ++-volk_32f_x2_s32f_interleave_16ic_a_sse(lv_16sc_t* complexVector, const float* iBuffer, ++- const float* qBuffer, const float scalar, unsigned int num_points) +++static inline void volk_32f_x2_s32f_interleave_16ic_a_sse(lv_16sc_t* complexVector, +++ const float* iBuffer, +++ const float* qBuffer, +++ const float scalar, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const float* iBufferPtr = iBuffer; ++- const float* qBufferPtr = qBuffer; +++ unsigned int number = 0; +++ const float* iBufferPtr = iBuffer; +++ const float* qBufferPtr = qBuffer; ++ ++- __m128 vScalar = _mm_set_ps1(scalar); +++ __m128 vScalar = _mm_set_ps1(scalar); ++ ++- const unsigned int quarterPoints = num_points / 4; +++ const unsigned int quarterPoints = num_points / 4; ++ ++- __m128 iValue, qValue, cplxValue; +++ __m128 iValue, qValue, cplxValue; ++ ++- int16_t* complexVectorPtr = (int16_t*)complexVector; +++ int16_t* complexVectorPtr = (int16_t*)complexVector; ++ ++- __VOLK_ATTR_ALIGNED(16) float floatBuffer[4]; +++ __VOLK_ATTR_ALIGNED(16) float floatBuffer[4]; ++ ++- for(;number < quarterPoints; number++){ ++- iValue = _mm_load_ps(iBufferPtr); ++- qValue = _mm_load_ps(qBufferPtr); +++ for (; number < quarterPoints; number++) { +++ iValue = _mm_load_ps(iBufferPtr); +++ qValue = _mm_load_ps(qBufferPtr); ++ ++- // Interleaves the lower two values in the i and q variables into one buffer ++- cplxValue = _mm_unpacklo_ps(iValue, qValue); ++- cplxValue = _mm_mul_ps(cplxValue, vScalar); +++ // Interleaves the lower two values in the i and q variables into one buffer +++ cplxValue = _mm_unpacklo_ps(iValue, qValue); +++ cplxValue = _mm_mul_ps(cplxValue, vScalar); ++ ++- _mm_store_ps(floatBuffer, cplxValue); +++ _mm_store_ps(floatBuffer, cplxValue); ++ ++- *complexVectorPtr++ = (int16_t)rintf(floatBuffer[0]); ++- *complexVectorPtr++ = (int16_t)rintf(floatBuffer[1]); ++- *complexVectorPtr++ = (int16_t)rintf(floatBuffer[2]); ++- *complexVectorPtr++ = (int16_t)rintf(floatBuffer[3]); +++ *complexVectorPtr++ = (int16_t)rintf(floatBuffer[0]); +++ *complexVectorPtr++ = (int16_t)rintf(floatBuffer[1]); +++ *complexVectorPtr++ = (int16_t)rintf(floatBuffer[2]); +++ *complexVectorPtr++ = (int16_t)rintf(floatBuffer[3]); ++ ++- // Interleaves the upper two values in the i and q variables into one buffer ++- cplxValue = _mm_unpackhi_ps(iValue, qValue); ++- cplxValue = _mm_mul_ps(cplxValue, vScalar); +++ // Interleaves the upper two values in the i and 
q variables into one buffer +++ cplxValue = _mm_unpackhi_ps(iValue, qValue); +++ cplxValue = _mm_mul_ps(cplxValue, vScalar); ++ ++- _mm_store_ps(floatBuffer, cplxValue); +++ _mm_store_ps(floatBuffer, cplxValue); ++ ++- *complexVectorPtr++ = (int16_t)rintf(floatBuffer[0]); ++- *complexVectorPtr++ = (int16_t)rintf(floatBuffer[1]); ++- *complexVectorPtr++ = (int16_t)rintf(floatBuffer[2]); ++- *complexVectorPtr++ = (int16_t)rintf(floatBuffer[3]); +++ *complexVectorPtr++ = (int16_t)rintf(floatBuffer[0]); +++ *complexVectorPtr++ = (int16_t)rintf(floatBuffer[1]); +++ *complexVectorPtr++ = (int16_t)rintf(floatBuffer[2]); +++ *complexVectorPtr++ = (int16_t)rintf(floatBuffer[3]); ++ ++- iBufferPtr += 4; ++- qBufferPtr += 4; ++- } +++ iBufferPtr += 4; +++ qBufferPtr += 4; +++ } ++ ++- number = quarterPoints * 4; ++- complexVectorPtr = (int16_t*)(&complexVector[number]); ++- for(; number < num_points; number++){ ++- *complexVectorPtr++ = (int16_t)rintf(*iBufferPtr++ * scalar); ++- *complexVectorPtr++ = (int16_t)rintf(*qBufferPtr++ * scalar); ++- } +++ number = quarterPoints * 4; +++ complexVectorPtr = (int16_t*)(&complexVector[number]); +++ for (; number < num_points; number++) { +++ *complexVectorPtr++ = (int16_t)rintf(*iBufferPtr++ * scalar); +++ *complexVectorPtr++ = (int16_t)rintf(*qBufferPtr++ * scalar); +++ } ++ } ++ #endif /* LV_HAVE_SSE */ ++ ++ ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void ++-volk_32f_x2_s32f_interleave_16ic_generic(lv_16sc_t* complexVector, const float* iBuffer, ++- const float* qBuffer, const float scalar, unsigned int num_points) +++static inline void volk_32f_x2_s32f_interleave_16ic_generic(lv_16sc_t* complexVector, +++ const float* iBuffer, +++ const float* qBuffer, +++ const float scalar, +++ unsigned int num_points) ++ { ++- int16_t* complexVectorPtr = (int16_t*)complexVector; ++- const float* iBufferPtr = iBuffer; ++- const float* qBufferPtr = qBuffer; ++- unsigned int number = 0; ++- ++- for(number = 0; number < num_points; number++){ ++- *complexVectorPtr++ = (int16_t)rintf(*iBufferPtr++ * scalar); ++- *complexVectorPtr++ = (int16_t)rintf(*qBufferPtr++ * scalar); ++- } +++ int16_t* complexVectorPtr = (int16_t*)complexVector; +++ const float* iBufferPtr = iBuffer; +++ const float* qBufferPtr = qBuffer; +++ unsigned int number = 0; +++ +++ for (number = 0; number < num_points; number++) { +++ *complexVectorPtr++ = (int16_t)rintf(*iBufferPtr++ * scalar); +++ *complexVectorPtr++ = (int16_t)rintf(*qBufferPtr++ * scalar); +++ } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++@@ -272,60 +280,62 @@ volk_32f_x2_s32f_interleave_16ic_generic(lv_16sc_t* complexVector, const float* ++ #ifndef INCLUDED_volk_32f_x2_s32f_interleave_16ic_u_H ++ #define INCLUDED_volk_32f_x2_s32f_interleave_16ic_u_H ++ ++-#include ++ #include ++ #include +++#include ++ ++ #ifdef LV_HAVE_AVX2 ++ #include ++ ++-static inline void ++-volk_32f_x2_s32f_interleave_16ic_u_avx2(lv_16sc_t* complexVector, const float* iBuffer, ++- const float* qBuffer, const float scalar, unsigned int num_points) +++static inline void volk_32f_x2_s32f_interleave_16ic_u_avx2(lv_16sc_t* complexVector, +++ const float* iBuffer, +++ const float* qBuffer, +++ const float scalar, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const float* iBufferPtr = iBuffer; ++- const float* qBufferPtr = qBuffer; +++ unsigned int number = 0; +++ const float* iBufferPtr = iBuffer; +++ const float* qBufferPtr = qBuffer; ++ ++- __m256 vScalar = _mm256_set1_ps(scalar); +++ __m256 vScalar = _mm256_set1_ps(scalar); ++ ++- const 
unsigned int eighthPoints = num_points / 8; +++ const unsigned int eighthPoints = num_points / 8; ++ ++- __m256 iValue, qValue, cplxValue1, cplxValue2; ++- __m256i intValue1, intValue2; +++ __m256 iValue, qValue, cplxValue1, cplxValue2; +++ __m256i intValue1, intValue2; ++ ++- int16_t* complexVectorPtr = (int16_t*)complexVector; +++ int16_t* complexVectorPtr = (int16_t*)complexVector; ++ ++- for(;number < eighthPoints; number++){ ++- iValue = _mm256_loadu_ps(iBufferPtr); ++- qValue = _mm256_loadu_ps(qBufferPtr); +++ for (; number < eighthPoints; number++) { +++ iValue = _mm256_loadu_ps(iBufferPtr); +++ qValue = _mm256_loadu_ps(qBufferPtr); ++ ++- // Interleaves the lower two values in the i and q variables into one buffer ++- cplxValue1 = _mm256_unpacklo_ps(iValue, qValue); ++- cplxValue1 = _mm256_mul_ps(cplxValue1, vScalar); +++ // Interleaves the lower two values in the i and q variables into one buffer +++ cplxValue1 = _mm256_unpacklo_ps(iValue, qValue); +++ cplxValue1 = _mm256_mul_ps(cplxValue1, vScalar); ++ ++- // Interleaves the upper two values in the i and q variables into one buffer ++- cplxValue2 = _mm256_unpackhi_ps(iValue, qValue); ++- cplxValue2 = _mm256_mul_ps(cplxValue2, vScalar); +++ // Interleaves the upper two values in the i and q variables into one buffer +++ cplxValue2 = _mm256_unpackhi_ps(iValue, qValue); +++ cplxValue2 = _mm256_mul_ps(cplxValue2, vScalar); ++ ++- intValue1 = _mm256_cvtps_epi32(cplxValue1); ++- intValue2 = _mm256_cvtps_epi32(cplxValue2); +++ intValue1 = _mm256_cvtps_epi32(cplxValue1); +++ intValue2 = _mm256_cvtps_epi32(cplxValue2); ++ ++- intValue1 = _mm256_packs_epi32(intValue1, intValue2); +++ intValue1 = _mm256_packs_epi32(intValue1, intValue2); ++ ++- _mm256_storeu_si256((__m256i*)complexVectorPtr, intValue1); ++- complexVectorPtr += 16; +++ _mm256_storeu_si256((__m256i*)complexVectorPtr, intValue1); +++ complexVectorPtr += 16; ++ ++- iBufferPtr += 8; ++- qBufferPtr += 8; ++- } +++ iBufferPtr += 8; +++ qBufferPtr += 8; +++ } ++ ++- number = eighthPoints * 8; ++- complexVectorPtr = (int16_t*)(&complexVector[number]); ++- for(; number < num_points; number++){ ++- *complexVectorPtr++ = (int16_t)rintf(*iBufferPtr++ * scalar); ++- *complexVectorPtr++ = (int16_t)rintf(*qBufferPtr++ * scalar); ++- } +++ number = eighthPoints * 8; +++ complexVectorPtr = (int16_t*)(&complexVector[number]); +++ for (; number < num_points; number++) { +++ *complexVectorPtr++ = (int16_t)rintf(*iBufferPtr++ * scalar); +++ *complexVectorPtr++ = (int16_t)rintf(*qBufferPtr++ * scalar); +++ } ++ } ++ #endif /* LV_HAVE_AVX2 */ ++ ++diff --git a/kernels/volk/volk_32f_x2_subtract_32f.h b/kernels/volk/volk_32f_x2_subtract_32f.h ++index bdfa0a1..359974c 100644 ++--- a/kernels/volk/volk_32f_x2_subtract_32f.h +++++ b/kernels/volk/volk_32f_x2_subtract_32f.h ++@@ -31,8 +31,8 @@ ++ * ++ * Dispatcher Prototype ++ * \code ++- * void volk_32f_x2_subtract_32f(float* cVector, const float* aVector, const float* bVector, unsigned int num_points) ++- * \endcode +++ * void volk_32f_x2_subtract_32f(float* cVector, const float* aVector, const float* +++ * bVector, unsigned int num_points) \endcode ++ * ++ * \b Inputs ++ * \li aVector: The initial vector. 
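For reference, a minimal usage sketch of the volk_32f_x2_subtract_32f dispatcher documented above follows. It assumes only the public prototype shown in this header plus the volk_malloc / volk_get_alignment / volk_free allocation helpers that other kernel headers in this diff already use in their \code examples; the buffer size and sample values are illustrative, not taken from upstream.

#include <volk/volk.h>
#include <stdio.h>

int main(void)
{
    const unsigned int num_points = 1024;

    /* volk_malloc returns memory aligned for the widest SIMD load on this
     * machine, so the aligned (_a) kernels are eligible for dispatch. */
    float* a = (float*)volk_malloc(sizeof(float) * num_points, volk_get_alignment());
    float* b = (float*)volk_malloc(sizeof(float) * num_points, volk_get_alignment());
    float* c = (float*)volk_malloc(sizeof(float) * num_points, volk_get_alignment());

    for (unsigned int i = 0; i < num_points; i++) {
        a[i] = (float)i;        /* minuend */
        b[i] = 0.5f * (float)i; /* subtrahend */
    }

    /* c[i] = a[i] - b[i]; the dispatcher selects the best available
     * implementation (AVX512F, AVX, SSE, NEON, ORC or generic). */
    volk_32f_x2_subtract_32f(c, a, b, num_points);

    printf("c[10] = %f\n", c[10]); /* expected 5.0 */

    volk_free(a);
    volk_free(b);
    volk_free(c);
    return 0;
}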
++@@ -77,126 +77,130 @@ ++ #ifdef LV_HAVE_AVX512F ++ #include ++ ++-static inline void ++-volk_32f_x2_subtract_32f_a_avx512f(float* cVector, const float* aVector, ++- const float* bVector, unsigned int num_points) +++static inline void volk_32f_x2_subtract_32f_a_avx512f(float* cVector, +++ const float* aVector, +++ const float* bVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int sixteenthPoints = num_points / 16; +++ unsigned int number = 0; +++ const unsigned int sixteenthPoints = num_points / 16; ++ ++- float* cPtr = cVector; ++- const float* aPtr = aVector; ++- const float* bPtr = bVector; +++ float* cPtr = cVector; +++ const float* aPtr = aVector; +++ const float* bPtr = bVector; ++ ++- __m512 aVal, bVal, cVal; ++- for(;number < sixteenthPoints; number++){ +++ __m512 aVal, bVal, cVal; +++ for (; number < sixteenthPoints; number++) { ++ ++- aVal = _mm512_load_ps(aPtr); ++- bVal = _mm512_load_ps(bPtr); +++ aVal = _mm512_load_ps(aPtr); +++ bVal = _mm512_load_ps(bPtr); ++ ++- cVal = _mm512_sub_ps(aVal, bVal); +++ cVal = _mm512_sub_ps(aVal, bVal); ++ ++- _mm512_store_ps(cPtr,cVal); // Store the results back into the C container +++ _mm512_store_ps(cPtr, cVal); // Store the results back into the C container ++ ++- aPtr += 16; ++- bPtr += 16; ++- cPtr += 16; ++- } +++ aPtr += 16; +++ bPtr += 16; +++ cPtr += 16; +++ } ++ ++- number = sixteenthPoints *16; ++- for(;number < num_points; number++){ ++- *cPtr++ = (*aPtr++) - (*bPtr++); ++- } +++ number = sixteenthPoints * 16; +++ for (; number < num_points; number++) { +++ *cPtr++ = (*aPtr++) - (*bPtr++); +++ } ++ } ++ #endif /* LV_HAVE_AVX512F */ ++ ++ #ifdef LV_HAVE_AVX ++ #include ++ ++-static inline void ++-volk_32f_x2_subtract_32f_a_avx(float* cVector, const float* aVector, ++- const float* bVector, unsigned int num_points) +++static inline void volk_32f_x2_subtract_32f_a_avx(float* cVector, +++ const float* aVector, +++ const float* bVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int eighthPoints = num_points / 8; +++ unsigned int number = 0; +++ const unsigned int eighthPoints = num_points / 8; ++ ++- float* cPtr = cVector; ++- const float* aPtr = aVector; ++- const float* bPtr = bVector; +++ float* cPtr = cVector; +++ const float* aPtr = aVector; +++ const float* bPtr = bVector; ++ ++- __m256 aVal, bVal, cVal; ++- for(;number < eighthPoints; number++){ +++ __m256 aVal, bVal, cVal; +++ for (; number < eighthPoints; number++) { ++ ++- aVal = _mm256_load_ps(aPtr); ++- bVal = _mm256_load_ps(bPtr); +++ aVal = _mm256_load_ps(aPtr); +++ bVal = _mm256_load_ps(bPtr); ++ ++- cVal = _mm256_sub_ps(aVal, bVal); +++ cVal = _mm256_sub_ps(aVal, bVal); ++ ++- _mm256_store_ps(cPtr,cVal); // Store the results back into the C container +++ _mm256_store_ps(cPtr, cVal); // Store the results back into the C container ++ ++- aPtr += 8; ++- bPtr += 8; ++- cPtr += 8; ++- } +++ aPtr += 8; +++ bPtr += 8; +++ cPtr += 8; +++ } ++ ++- number = eighthPoints * 8; ++- for(;number < num_points; number++){ ++- *cPtr++ = (*aPtr++) - (*bPtr++); ++- } +++ number = eighthPoints * 8; +++ for (; number < num_points; number++) { +++ *cPtr++ = (*aPtr++) - (*bPtr++); +++ } ++ } ++ #endif /* LV_HAVE_AVX */ ++ ++ #ifdef LV_HAVE_SSE ++ #include ++ ++-static inline void ++-volk_32f_x2_subtract_32f_a_sse(float* cVector, const float* aVector, ++- const float* bVector, unsigned int num_points) +++static inline void volk_32f_x2_subtract_32f_a_sse(float* cVector, +++ const float* aVector, +++ const float* 
bVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int quarterPoints = num_points / 4; +++ unsigned int number = 0; +++ const unsigned int quarterPoints = num_points / 4; ++ ++- float* cPtr = cVector; ++- const float* aPtr = aVector; ++- const float* bPtr = bVector; +++ float* cPtr = cVector; +++ const float* aPtr = aVector; +++ const float* bPtr = bVector; ++ ++- __m128 aVal, bVal, cVal; ++- for(;number < quarterPoints; number++){ +++ __m128 aVal, bVal, cVal; +++ for (; number < quarterPoints; number++) { ++ ++- aVal = _mm_load_ps(aPtr); ++- bVal = _mm_load_ps(bPtr); +++ aVal = _mm_load_ps(aPtr); +++ bVal = _mm_load_ps(bPtr); ++ ++- cVal = _mm_sub_ps(aVal, bVal); +++ cVal = _mm_sub_ps(aVal, bVal); ++ ++- _mm_store_ps(cPtr,cVal); // Store the results back into the C container +++ _mm_store_ps(cPtr, cVal); // Store the results back into the C container ++ ++- aPtr += 4; ++- bPtr += 4; ++- cPtr += 4; ++- } +++ aPtr += 4; +++ bPtr += 4; +++ cPtr += 4; +++ } ++ ++- number = quarterPoints * 4; ++- for(;number < num_points; number++){ ++- *cPtr++ = (*aPtr++) - (*bPtr++); ++- } +++ number = quarterPoints * 4; +++ for (; number < num_points; number++) { +++ *cPtr++ = (*aPtr++) - (*bPtr++); +++ } ++ } ++ #endif /* LV_HAVE_SSE */ ++ ++ ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void ++-volk_32f_x2_subtract_32f_generic(float* cVector, const float* aVector, ++- const float* bVector, unsigned int num_points) +++static inline void volk_32f_x2_subtract_32f_generic(float* cVector, +++ const float* aVector, +++ const float* bVector, +++ unsigned int num_points) ++ { ++- float* cPtr = cVector; ++- const float* aPtr = aVector; ++- const float* bPtr = bVector; ++- unsigned int number = 0; ++- ++- for(number = 0; number < num_points; number++){ ++- *cPtr++ = (*aPtr++) - (*bPtr++); ++- } +++ float* cPtr = cVector; +++ const float* aPtr = aVector; +++ const float* bPtr = bVector; +++ unsigned int number = 0; +++ +++ for (number = 0; number < num_points; number++) { +++ *cPtr++ = (*aPtr++) - (*bPtr++); +++ } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++@@ -204,45 +208,48 @@ volk_32f_x2_subtract_32f_generic(float* cVector, const float* aVector, ++ #ifdef LV_HAVE_NEON ++ #include ++ ++-static inline void ++-volk_32f_x2_subtract_32f_neon(float* cVector, const float* aVector, ++- const float* bVector, unsigned int num_points) +++static inline void volk_32f_x2_subtract_32f_neon(float* cVector, +++ const float* aVector, +++ const float* bVector, +++ unsigned int num_points) ++ { ++- float* cPtr = cVector; ++- const float* aPtr = aVector; ++- const float* bPtr = bVector; ++- unsigned int number = 0; ++- unsigned int quarter_points = num_points / 4; ++- ++- float32x4_t a_vec, b_vec, c_vec; ++- ++- for(number = 0; number < quarter_points; number++){ ++- a_vec = vld1q_f32(aPtr); ++- b_vec = vld1q_f32(bPtr); ++- c_vec = vsubq_f32(a_vec, b_vec); ++- vst1q_f32(cPtr, c_vec); ++- aPtr += 4; ++- bPtr += 4; ++- cPtr += 4; ++- } ++- ++- for(number = quarter_points * 4; number < num_points; number++){ ++- *cPtr++ = (*aPtr++) - (*bPtr++); ++- } +++ float* cPtr = cVector; +++ const float* aPtr = aVector; +++ const float* bPtr = bVector; +++ unsigned int number = 0; +++ unsigned int quarter_points = num_points / 4; +++ +++ float32x4_t a_vec, b_vec, c_vec; +++ +++ for (number = 0; number < quarter_points; number++) { +++ a_vec = vld1q_f32(aPtr); +++ b_vec = vld1q_f32(bPtr); +++ c_vec = vsubq_f32(a_vec, b_vec); +++ vst1q_f32(cPtr, c_vec); +++ aPtr += 4; +++ bPtr += 4; +++ cPtr += 4; +++ } +++ +++ 
for (number = quarter_points * 4; number < num_points; number++) { +++ *cPtr++ = (*aPtr++) - (*bPtr++); +++ } ++ } ++ #endif /* LV_HAVE_NEON */ ++ ++ ++ #ifdef LV_HAVE_ORC ++-extern void ++-volk_32f_x2_subtract_32f_a_orc_impl(float* cVector, const float* aVector, ++- const float* bVector, unsigned int num_points); ++- ++-static inline void ++-volk_32f_x2_subtract_32f_u_orc(float* cVector, const float* aVector, ++- const float* bVector, unsigned int num_points) +++extern void volk_32f_x2_subtract_32f_a_orc_impl(float* cVector, +++ const float* aVector, +++ const float* bVector, +++ unsigned int num_points); +++ +++static inline void volk_32f_x2_subtract_32f_u_orc(float* cVector, +++ const float* aVector, +++ const float* bVector, +++ unsigned int num_points) ++ { ++- volk_32f_x2_subtract_32f_a_orc_impl(cVector, aVector, bVector, num_points); +++ volk_32f_x2_subtract_32f_a_orc_impl(cVector, aVector, bVector, num_points); ++ } ++ #endif /* LV_HAVE_ORC */ ++ ++@@ -259,36 +266,37 @@ volk_32f_x2_subtract_32f_u_orc(float* cVector, const float* aVector, ++ #ifdef LV_HAVE_AVX512F ++ #include ++ ++-static inline void ++-volk_32f_x2_subtract_32f_u_avx512f(float* cVector, const float* aVector, ++- const float* bVector, unsigned int num_points) +++static inline void volk_32f_x2_subtract_32f_u_avx512f(float* cVector, +++ const float* aVector, +++ const float* bVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int sixteenthPoints = num_points / 16; +++ unsigned int number = 0; +++ const unsigned int sixteenthPoints = num_points / 16; ++ ++- float* cPtr = cVector; ++- const float* aPtr = aVector; ++- const float* bPtr = bVector; +++ float* cPtr = cVector; +++ const float* aPtr = aVector; +++ const float* bPtr = bVector; ++ ++- __m512 aVal, bVal, cVal; ++- for(;number < sixteenthPoints; number++){ +++ __m512 aVal, bVal, cVal; +++ for (; number < sixteenthPoints; number++) { ++ ++- aVal = _mm512_loadu_ps(aPtr); ++- bVal = _mm512_loadu_ps(bPtr); +++ aVal = _mm512_loadu_ps(aPtr); +++ bVal = _mm512_loadu_ps(bPtr); ++ ++- cVal = _mm512_sub_ps(aVal, bVal); +++ cVal = _mm512_sub_ps(aVal, bVal); ++ ++- _mm512_storeu_ps(cPtr,cVal); // Store the results back into the C container +++ _mm512_storeu_ps(cPtr, cVal); // Store the results back into the C container ++ ++- aPtr += 16; ++- bPtr += 16; ++- cPtr += 16; ++- } +++ aPtr += 16; +++ bPtr += 16; +++ cPtr += 16; +++ } ++ ++- number = sixteenthPoints *16; ++- for(;number < num_points; number++){ ++- *cPtr++ = (*aPtr++) - (*bPtr++); ++- } +++ number = sixteenthPoints * 16; +++ for (; number < num_points; number++) { +++ *cPtr++ = (*aPtr++) - (*bPtr++); +++ } ++ } ++ #endif /* LV_HAVE_AVX512F */ ++ ++@@ -296,36 +304,37 @@ volk_32f_x2_subtract_32f_u_avx512f(float* cVector, const float* aVector, ++ #ifdef LV_HAVE_AVX ++ #include ++ ++-static inline void ++-volk_32f_x2_subtract_32f_u_avx(float* cVector, const float* aVector, ++- const float* bVector, unsigned int num_points) +++static inline void volk_32f_x2_subtract_32f_u_avx(float* cVector, +++ const float* aVector, +++ const float* bVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int eighthPoints = num_points / 8; +++ unsigned int number = 0; +++ const unsigned int eighthPoints = num_points / 8; ++ ++- float* cPtr = cVector; ++- const float* aPtr = aVector; ++- const float* bPtr = bVector; +++ float* cPtr = cVector; +++ const float* aPtr = aVector; +++ const float* bPtr = bVector; ++ ++- __m256 aVal, bVal, cVal; ++- for(;number < 
eighthPoints; number++){ +++ __m256 aVal, bVal, cVal; +++ for (; number < eighthPoints; number++) { ++ ++- aVal = _mm256_loadu_ps(aPtr); ++- bVal = _mm256_loadu_ps(bPtr); +++ aVal = _mm256_loadu_ps(aPtr); +++ bVal = _mm256_loadu_ps(bPtr); ++ ++- cVal = _mm256_sub_ps(aVal, bVal); +++ cVal = _mm256_sub_ps(aVal, bVal); ++ ++- _mm256_storeu_ps(cPtr,cVal); // Store the results back into the C container +++ _mm256_storeu_ps(cPtr, cVal); // Store the results back into the C container ++ ++- aPtr += 8; ++- bPtr += 8; ++- cPtr += 8; ++- } +++ aPtr += 8; +++ bPtr += 8; +++ cPtr += 8; +++ } ++ ++- number = eighthPoints * 8; ++- for(;number < num_points; number++){ ++- *cPtr++ = (*aPtr++) - (*bPtr++); ++- } +++ number = eighthPoints * 8; +++ for (; number < num_points; number++) { +++ *cPtr++ = (*aPtr++) - (*bPtr++); +++ } ++ } ++ #endif /* LV_HAVE_AVX */ ++ ++diff --git a/kernels/volk/volk_32f_x3_sum_of_poly_32f.h b/kernels/volk/volk_32f_x3_sum_of_poly_32f.h ++index e74a385..b0b1466 100644 ++--- a/kernels/volk/volk_32f_x3_sum_of_poly_32f.h +++++ b/kernels/volk/volk_32f_x3_sum_of_poly_32f.h ++@@ -30,12 +30,13 @@ ++ * multiply by the rectangle/bin width. ++ * ++ * Expressed as a formula, this function calculates ++- * \f$ \sum f(x) = \sum (c_0 + c_1 \cdot x + c_2 \cdot x^2 + c_3 \cdot x^3 + c_4 \cdot x^4)\f$ +++ * \f$ \sum f(x) = \sum (c_0 + c_1 \cdot x + c_2 \cdot x^2 + c_3 \cdot x^3 + c_4 \cdot +++ * x^4)\f$ ++ * ++ * Dispatcher Prototype ++ * \code ++- * void volk_32f_x3_sum_of_poly_32f(float* target, float* src0, float* center_point_array, float* cutoff, unsigned int num_points) ++- * \endcode +++ * void volk_32f_x3_sum_of_poly_32f(float* target, float* src0, float* center_point_array, +++ * float* cutoff, unsigned int num_points) \endcode ++ * ++ * \b Inputs ++ * \li src0: x values ++@@ -53,9 +54,10 @@ ++ * \code ++ * int npoints = 4096; ++ * float* coefficients = (float*)volk_malloc(sizeof(float) * 5, volk_get_alignment()); ++- * float* input = (float*)volk_malloc(sizeof(float) * npoints, volk_get_alignment()); ++- * float* result = (float*)volk_malloc(sizeof(float), volk_get_alignment()); ++- * float* cutoff = (float*)volk_malloc(sizeof(float), volk_get_alignment()); +++ * float* input = (float*)volk_malloc(sizeof(float) * npoints, +++ * volk_get_alignment()); float* result = (float*)volk_malloc(sizeof(float), +++ * volk_get_alignment()); float* cutoff = (float*)volk_malloc(sizeof(float), +++ * volk_get_alignment()); ++ * // load precomputed Taylor series coefficients ++ * coefficients[0] = 4.48168907033806f; // c1 ++ * coefficients[1] = coefficients[0] * 0.5f; // c2 ++@@ -82,288 +84,291 @@ ++ #ifndef INCLUDED_volk_32f_x3_sum_of_poly_32f_a_H ++ #define INCLUDED_volk_32f_x3_sum_of_poly_32f_a_H ++ ++-#include ++-#include ++-#include +++#include +++#include +++#include ++ ++ #ifndef MAX ++-#define MAX(X,Y) ((X) > (Y)?(X):(Y)) +++#define MAX(X, Y) ((X) > (Y) ? 
(X) : (Y)) ++ #endif ++ ++ #ifdef LV_HAVE_SSE3 ++-#include ++-#include ++- ++-static inline void ++-volk_32f_x3_sum_of_poly_32f_a_sse3(float* target, float* src0, float* center_point_array, ++- float* cutoff, unsigned int num_points) +++#include +++#include +++ +++static inline void volk_32f_x3_sum_of_poly_32f_a_sse3(float* target, +++ float* src0, +++ float* center_point_array, +++ float* cutoff, +++ unsigned int num_points) ++ { ++- float result = 0.0f; ++- float fst = 0.0f; ++- float sq = 0.0f; ++- float thrd = 0.0f; ++- float frth = 0.0f; ++- ++- __m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10; ++- ++- xmm9 = _mm_setzero_ps(); ++- xmm1 = _mm_setzero_ps(); ++- xmm0 = _mm_load1_ps(¢er_point_array[0]); ++- xmm6 = _mm_load1_ps(¢er_point_array[1]); ++- xmm7 = _mm_load1_ps(¢er_point_array[2]); ++- xmm8 = _mm_load1_ps(¢er_point_array[3]); ++- xmm10 = _mm_load1_ps(cutoff); ++- ++- int bound = num_points/8; ++- int leftovers = num_points - 8*bound; ++- int i = 0; ++- for(; i < bound; ++i) { ++- // 1st ++- xmm2 = _mm_load_ps(src0); ++- xmm2 = _mm_max_ps(xmm10, xmm2); ++- xmm3 = _mm_mul_ps(xmm2, xmm2); ++- xmm4 = _mm_mul_ps(xmm2, xmm3); ++- xmm5 = _mm_mul_ps(xmm3, xmm3); ++- ++- xmm2 = _mm_mul_ps(xmm2, xmm0); ++- xmm3 = _mm_mul_ps(xmm3, xmm6); ++- xmm4 = _mm_mul_ps(xmm4, xmm7); ++- xmm5 = _mm_mul_ps(xmm5, xmm8); ++- ++- xmm2 = _mm_add_ps(xmm2, xmm3); ++- xmm3 = _mm_add_ps(xmm4, xmm5); ++- ++- src0 += 4; ++- ++- xmm9 = _mm_add_ps(xmm2, xmm9); ++- xmm9 = _mm_add_ps(xmm3, xmm9); ++- ++- // 2nd ++- xmm2 = _mm_load_ps(src0); ++- xmm2 = _mm_max_ps(xmm10, xmm2); ++- xmm3 = _mm_mul_ps(xmm2, xmm2); ++- xmm4 = _mm_mul_ps(xmm2, xmm3); ++- xmm5 = _mm_mul_ps(xmm3, xmm3); ++- ++- xmm2 = _mm_mul_ps(xmm2, xmm0); ++- xmm3 = _mm_mul_ps(xmm3, xmm6); ++- xmm4 = _mm_mul_ps(xmm4, xmm7); ++- xmm5 = _mm_mul_ps(xmm5, xmm8); ++- ++- xmm2 = _mm_add_ps(xmm2, xmm3); ++- xmm3 = _mm_add_ps(xmm4, xmm5); ++- ++- src0 += 4; ++- ++- xmm1 = _mm_add_ps(xmm2, xmm1); ++- xmm1 = _mm_add_ps(xmm3, xmm1); ++- } ++- xmm2 = _mm_hadd_ps(xmm9, xmm1); ++- xmm3 = _mm_hadd_ps(xmm2, xmm2); ++- xmm4 = _mm_hadd_ps(xmm3, xmm3); ++- _mm_store_ss(&result, xmm4); ++- ++- for(i = 0; i < leftovers; ++i) { ++- fst = *src0++; ++- fst = MAX(fst, *cutoff); ++- sq = fst * fst; ++- thrd = fst * sq; ++- frth = sq * sq; ++- result += (center_point_array[0] * fst + ++- center_point_array[1] * sq + ++- center_point_array[2] * thrd + ++- center_point_array[3] * frth); ++- } ++- ++- result += (float)(num_points) * center_point_array[4]; ++- *target = result; +++ float result = 0.0f; +++ float fst = 0.0f; +++ float sq = 0.0f; +++ float thrd = 0.0f; +++ float frth = 0.0f; +++ +++ __m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10; +++ +++ xmm9 = _mm_setzero_ps(); +++ xmm1 = _mm_setzero_ps(); +++ xmm0 = _mm_load1_ps(¢er_point_array[0]); +++ xmm6 = _mm_load1_ps(¢er_point_array[1]); +++ xmm7 = _mm_load1_ps(¢er_point_array[2]); +++ xmm8 = _mm_load1_ps(¢er_point_array[3]); +++ xmm10 = _mm_load1_ps(cutoff); +++ +++ int bound = num_points / 8; +++ int leftovers = num_points - 8 * bound; +++ int i = 0; +++ for (; i < bound; ++i) { +++ // 1st +++ xmm2 = _mm_load_ps(src0); +++ xmm2 = _mm_max_ps(xmm10, xmm2); +++ xmm3 = _mm_mul_ps(xmm2, xmm2); +++ xmm4 = _mm_mul_ps(xmm2, xmm3); +++ xmm5 = _mm_mul_ps(xmm3, xmm3); +++ +++ xmm2 = _mm_mul_ps(xmm2, xmm0); +++ xmm3 = _mm_mul_ps(xmm3, xmm6); +++ xmm4 = _mm_mul_ps(xmm4, xmm7); +++ xmm5 = _mm_mul_ps(xmm5, xmm8); +++ +++ xmm2 = _mm_add_ps(xmm2, xmm3); +++ xmm3 = _mm_add_ps(xmm4, xmm5); +++ 
+++ src0 += 4; +++ +++ xmm9 = _mm_add_ps(xmm2, xmm9); +++ xmm9 = _mm_add_ps(xmm3, xmm9); +++ +++ // 2nd +++ xmm2 = _mm_load_ps(src0); +++ xmm2 = _mm_max_ps(xmm10, xmm2); +++ xmm3 = _mm_mul_ps(xmm2, xmm2); +++ xmm4 = _mm_mul_ps(xmm2, xmm3); +++ xmm5 = _mm_mul_ps(xmm3, xmm3); +++ +++ xmm2 = _mm_mul_ps(xmm2, xmm0); +++ xmm3 = _mm_mul_ps(xmm3, xmm6); +++ xmm4 = _mm_mul_ps(xmm4, xmm7); +++ xmm5 = _mm_mul_ps(xmm5, xmm8); +++ +++ xmm2 = _mm_add_ps(xmm2, xmm3); +++ xmm3 = _mm_add_ps(xmm4, xmm5); +++ +++ src0 += 4; +++ +++ xmm1 = _mm_add_ps(xmm2, xmm1); +++ xmm1 = _mm_add_ps(xmm3, xmm1); +++ } +++ xmm2 = _mm_hadd_ps(xmm9, xmm1); +++ xmm3 = _mm_hadd_ps(xmm2, xmm2); +++ xmm4 = _mm_hadd_ps(xmm3, xmm3); +++ _mm_store_ss(&result, xmm4); +++ +++ for (i = 0; i < leftovers; ++i) { +++ fst = *src0++; +++ fst = MAX(fst, *cutoff); +++ sq = fst * fst; +++ thrd = fst * sq; +++ frth = sq * sq; +++ result += (center_point_array[0] * fst + center_point_array[1] * sq + +++ center_point_array[2] * thrd + center_point_array[3] * frth); +++ } +++ +++ result += (float)(num_points)*center_point_array[4]; +++ *target = result; ++ } ++ ++ ++ #endif /*LV_HAVE_SSE3*/ ++ ++ #if LV_HAVE_AVX && LV_HAVE_FMA ++-#include +++#include ++ ++-static inline void ++-volk_32f_x3_sum_of_poly_32f_a_avx2_fma(float* target, float* src0, float* center_point_array, ++- float* cutoff, unsigned int num_points) +++static inline void volk_32f_x3_sum_of_poly_32f_a_avx2_fma(float* target, +++ float* src0, +++ float* center_point_array, +++ float* cutoff, +++ unsigned int num_points) ++ { ++- const unsigned int eighth_points = num_points / 8; ++- float fst = 0.0; ++- float sq = 0.0; ++- float thrd = 0.0; ++- float frth = 0.0; ++- ++- __m256 cpa0, cpa1, cpa2, cpa3, cutoff_vec; ++- __m256 target_vec; ++- __m256 x_to_1, x_to_2, x_to_3, x_to_4; ++- ++- cpa0 = _mm256_set1_ps(center_point_array[0]); ++- cpa1 = _mm256_set1_ps(center_point_array[1]); ++- cpa2 = _mm256_set1_ps(center_point_array[2]); ++- cpa3 = _mm256_set1_ps(center_point_array[3]); ++- cutoff_vec = _mm256_set1_ps(*cutoff); ++- target_vec = _mm256_setzero_ps(); ++- ++- unsigned int i; ++- ++- for(i = 0; i < eighth_points; ++i) { ++- x_to_1 = _mm256_load_ps(src0); ++- x_to_1 = _mm256_max_ps(x_to_1, cutoff_vec); ++- x_to_2 = _mm256_mul_ps(x_to_1, x_to_1); // x^2 ++- x_to_3 = _mm256_mul_ps(x_to_1, x_to_2); // x^3 ++- // x^1 * x^3 is slightly faster than x^2 * x^2 ++- x_to_4 = _mm256_mul_ps(x_to_1, x_to_3); // x^4 ++- ++- x_to_2 = _mm256_mul_ps(x_to_2, cpa1); // cpa[1] * x^2 ++- x_to_4 = _mm256_mul_ps(x_to_4, cpa3); // cpa[3] * x^4 ++- ++- x_to_1 = _mm256_fmadd_ps(x_to_1, cpa0, x_to_2); ++- x_to_3 = _mm256_fmadd_ps(x_to_3, cpa2, x_to_4); ++- // this is slightly faster than result += (x_to_1 + x_to_3) ++- target_vec = _mm256_add_ps(x_to_1, target_vec); ++- target_vec = _mm256_add_ps(x_to_3, target_vec); ++- ++- src0 += 8; ++- } ++- ++- // the hadd for vector reduction has very very slight impact @ 50k iters ++- __VOLK_ATTR_ALIGNED(32) float temp_results[8]; ++- target_vec = _mm256_hadd_ps(target_vec, target_vec); // x0+x1 | x2+x3 | x0+x1 | x2+x3 || x4+x5 | x6+x7 | x4+x5 | x6+x7 ++- _mm256_store_ps(temp_results, target_vec); ++- *target = temp_results[0] + temp_results[1] + temp_results[4] + temp_results[5]; ++- ++- for(i = eighth_points*8; i < num_points; ++i) { ++- fst = *src0++; ++- fst = MAX(fst, *cutoff); ++- sq = fst * fst; ++- thrd = fst * sq; ++- frth = sq * sq; ++- *target += (center_point_array[0] * fst + ++- center_point_array[1] * sq + ++- center_point_array[2] * thrd + ++- 
center_point_array[3] * frth); ++- } ++- *target += (float)(num_points) * center_point_array[4]; +++ const unsigned int eighth_points = num_points / 8; +++ float fst = 0.0; +++ float sq = 0.0; +++ float thrd = 0.0; +++ float frth = 0.0; +++ +++ __m256 cpa0, cpa1, cpa2, cpa3, cutoff_vec; +++ __m256 target_vec; +++ __m256 x_to_1, x_to_2, x_to_3, x_to_4; +++ +++ cpa0 = _mm256_set1_ps(center_point_array[0]); +++ cpa1 = _mm256_set1_ps(center_point_array[1]); +++ cpa2 = _mm256_set1_ps(center_point_array[2]); +++ cpa3 = _mm256_set1_ps(center_point_array[3]); +++ cutoff_vec = _mm256_set1_ps(*cutoff); +++ target_vec = _mm256_setzero_ps(); +++ +++ unsigned int i; +++ +++ for (i = 0; i < eighth_points; ++i) { +++ x_to_1 = _mm256_load_ps(src0); +++ x_to_1 = _mm256_max_ps(x_to_1, cutoff_vec); +++ x_to_2 = _mm256_mul_ps(x_to_1, x_to_1); // x^2 +++ x_to_3 = _mm256_mul_ps(x_to_1, x_to_2); // x^3 +++ // x^1 * x^3 is slightly faster than x^2 * x^2 +++ x_to_4 = _mm256_mul_ps(x_to_1, x_to_3); // x^4 +++ +++ x_to_2 = _mm256_mul_ps(x_to_2, cpa1); // cpa[1] * x^2 +++ x_to_4 = _mm256_mul_ps(x_to_4, cpa3); // cpa[3] * x^4 +++ +++ x_to_1 = _mm256_fmadd_ps(x_to_1, cpa0, x_to_2); +++ x_to_3 = _mm256_fmadd_ps(x_to_3, cpa2, x_to_4); +++ // this is slightly faster than result += (x_to_1 + x_to_3) +++ target_vec = _mm256_add_ps(x_to_1, target_vec); +++ target_vec = _mm256_add_ps(x_to_3, target_vec); +++ +++ src0 += 8; +++ } +++ +++ // the hadd for vector reduction has very very slight impact @ 50k iters +++ __VOLK_ATTR_ALIGNED(32) float temp_results[8]; +++ target_vec = _mm256_hadd_ps( +++ target_vec, +++ target_vec); // x0+x1 | x2+x3 | x0+x1 | x2+x3 || x4+x5 | x6+x7 | x4+x5 | x6+x7 +++ _mm256_store_ps(temp_results, target_vec); +++ *target = temp_results[0] + temp_results[1] + temp_results[4] + temp_results[5]; +++ +++ for (i = eighth_points * 8; i < num_points; ++i) { +++ fst = *src0++; +++ fst = MAX(fst, *cutoff); +++ sq = fst * fst; +++ thrd = fst * sq; +++ frth = sq * sq; +++ *target += (center_point_array[0] * fst + center_point_array[1] * sq + +++ center_point_array[2] * thrd + center_point_array[3] * frth); +++ } +++ *target += (float)(num_points)*center_point_array[4]; ++ } ++ #endif // LV_HAVE_AVX && LV_HAVE_FMA ++ ++ #ifdef LV_HAVE_AVX ++-#include +++#include ++ ++-static inline void ++-volk_32f_x3_sum_of_poly_32f_a_avx(float* target, float* src0, float* center_point_array, ++- float* cutoff, unsigned int num_points) +++static inline void volk_32f_x3_sum_of_poly_32f_a_avx(float* target, +++ float* src0, +++ float* center_point_array, +++ float* cutoff, +++ unsigned int num_points) ++ { ++- const unsigned int eighth_points = num_points / 8; ++- float fst = 0.0; ++- float sq = 0.0; ++- float thrd = 0.0; ++- float frth = 0.0; ++- ++- __m256 cpa0, cpa1, cpa2, cpa3, cutoff_vec; ++- __m256 target_vec; ++- __m256 x_to_1, x_to_2, x_to_3, x_to_4; ++- ++- cpa0 = _mm256_set1_ps(center_point_array[0]); ++- cpa1 = _mm256_set1_ps(center_point_array[1]); ++- cpa2 = _mm256_set1_ps(center_point_array[2]); ++- cpa3 = _mm256_set1_ps(center_point_array[3]); ++- cutoff_vec = _mm256_set1_ps(*cutoff); ++- target_vec = _mm256_setzero_ps(); ++- ++- unsigned int i; ++- ++- for(i = 0; i < eighth_points; ++i) { ++- x_to_1 = _mm256_load_ps(src0); ++- x_to_1 = _mm256_max_ps(x_to_1, cutoff_vec); ++- x_to_2 = _mm256_mul_ps(x_to_1, x_to_1); // x^2 ++- x_to_3 = _mm256_mul_ps(x_to_1, x_to_2); // x^3 ++- // x^1 * x^3 is slightly faster than x^2 * x^2 ++- x_to_4 = _mm256_mul_ps(x_to_1, x_to_3); // x^4 ++- ++- x_to_1 = _mm256_mul_ps(x_to_1, cpa0); 
// cpa[0] * x^1 ++- x_to_2 = _mm256_mul_ps(x_to_2, cpa1); // cpa[1] * x^2 ++- x_to_3 = _mm256_mul_ps(x_to_3, cpa2); // cpa[2] * x^3 ++- x_to_4 = _mm256_mul_ps(x_to_4, cpa3); // cpa[3] * x^4 ++- ++- x_to_1 = _mm256_add_ps(x_to_1, x_to_2); ++- x_to_3 = _mm256_add_ps(x_to_3, x_to_4); ++- // this is slightly faster than result += (x_to_1 + x_to_3) ++- target_vec = _mm256_add_ps(x_to_1, target_vec); ++- target_vec = _mm256_add_ps(x_to_3, target_vec); ++- ++- src0 += 8; ++- } ++- ++- // the hadd for vector reduction has very very slight impact @ 50k iters ++- __VOLK_ATTR_ALIGNED(32) float temp_results[8]; ++- target_vec = _mm256_hadd_ps(target_vec, target_vec); // x0+x1 | x2+x3 | x0+x1 | x2+x3 || x4+x5 | x6+x7 | x4+x5 | x6+x7 ++- _mm256_store_ps(temp_results, target_vec); ++- *target = temp_results[0] + temp_results[1] + temp_results[4] + temp_results[5]; ++- ++- for(i = eighth_points*8; i < num_points; ++i) { ++- fst = *src0++; ++- fst = MAX(fst, *cutoff); ++- sq = fst * fst; ++- thrd = fst * sq; ++- frth = sq * sq; ++- *target += (center_point_array[0] * fst + ++- center_point_array[1] * sq + ++- center_point_array[2] * thrd + ++- center_point_array[3] * frth); ++- } ++- *target += (float)(num_points) * center_point_array[4]; +++ const unsigned int eighth_points = num_points / 8; +++ float fst = 0.0; +++ float sq = 0.0; +++ float thrd = 0.0; +++ float frth = 0.0; +++ +++ __m256 cpa0, cpa1, cpa2, cpa3, cutoff_vec; +++ __m256 target_vec; +++ __m256 x_to_1, x_to_2, x_to_3, x_to_4; +++ +++ cpa0 = _mm256_set1_ps(center_point_array[0]); +++ cpa1 = _mm256_set1_ps(center_point_array[1]); +++ cpa2 = _mm256_set1_ps(center_point_array[2]); +++ cpa3 = _mm256_set1_ps(center_point_array[3]); +++ cutoff_vec = _mm256_set1_ps(*cutoff); +++ target_vec = _mm256_setzero_ps(); +++ +++ unsigned int i; +++ +++ for (i = 0; i < eighth_points; ++i) { +++ x_to_1 = _mm256_load_ps(src0); +++ x_to_1 = _mm256_max_ps(x_to_1, cutoff_vec); +++ x_to_2 = _mm256_mul_ps(x_to_1, x_to_1); // x^2 +++ x_to_3 = _mm256_mul_ps(x_to_1, x_to_2); // x^3 +++ // x^1 * x^3 is slightly faster than x^2 * x^2 +++ x_to_4 = _mm256_mul_ps(x_to_1, x_to_3); // x^4 +++ +++ x_to_1 = _mm256_mul_ps(x_to_1, cpa0); // cpa[0] * x^1 +++ x_to_2 = _mm256_mul_ps(x_to_2, cpa1); // cpa[1] * x^2 +++ x_to_3 = _mm256_mul_ps(x_to_3, cpa2); // cpa[2] * x^3 +++ x_to_4 = _mm256_mul_ps(x_to_4, cpa3); // cpa[3] * x^4 +++ +++ x_to_1 = _mm256_add_ps(x_to_1, x_to_2); +++ x_to_3 = _mm256_add_ps(x_to_3, x_to_4); +++ // this is slightly faster than result += (x_to_1 + x_to_3) +++ target_vec = _mm256_add_ps(x_to_1, target_vec); +++ target_vec = _mm256_add_ps(x_to_3, target_vec); +++ +++ src0 += 8; +++ } +++ +++ // the hadd for vector reduction has very very slight impact @ 50k iters +++ __VOLK_ATTR_ALIGNED(32) float temp_results[8]; +++ target_vec = _mm256_hadd_ps( +++ target_vec, +++ target_vec); // x0+x1 | x2+x3 | x0+x1 | x2+x3 || x4+x5 | x6+x7 | x4+x5 | x6+x7 +++ _mm256_store_ps(temp_results, target_vec); +++ *target = temp_results[0] + temp_results[1] + temp_results[4] + temp_results[5]; +++ +++ for (i = eighth_points * 8; i < num_points; ++i) { +++ fst = *src0++; +++ fst = MAX(fst, *cutoff); +++ sq = fst * fst; +++ thrd = fst * sq; +++ frth = sq * sq; +++ *target += (center_point_array[0] * fst + center_point_array[1] * sq + +++ center_point_array[2] * thrd + center_point_array[3] * frth); +++ } +++ *target += (float)(num_points)*center_point_array[4]; ++ } ++ #endif // LV_HAVE_AVX ++ ++ ++- ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void 
++-volk_32f_x3_sum_of_poly_32f_generic(float* target, float* src0, float* center_point_array, ++- float* cutoff, unsigned int num_points) +++static inline void volk_32f_x3_sum_of_poly_32f_generic(float* target, +++ float* src0, +++ float* center_point_array, +++ float* cutoff, +++ unsigned int num_points) ++ { ++- const unsigned int eighth_points = num_points / 8; ++- ++- float result[8] = {0.0f,0.0f,0.0f,0.0f, 0.0f,0.0f,0.0f,0.0f}; ++- float fst = 0.0f; ++- float sq = 0.0f; ++- float thrd = 0.0f; ++- float frth = 0.0f; ++- ++- unsigned int i = 0; ++- unsigned int k = 0; ++- for(i = 0; i < eighth_points; ++i) { ++- for(k = 0; k < 8; ++k) { ++- fst = *src0++; ++- fst = MAX(fst, *cutoff); ++- sq = fst * fst; ++- thrd = fst * sq; ++- frth = fst * thrd; ++- result[k] += center_point_array[0] * fst + center_point_array[1] * sq; ++- result[k] += center_point_array[2] * thrd + center_point_array[3] * frth; +++ const unsigned int eighth_points = num_points / 8; +++ +++ float result[8] = { 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f }; +++ float fst = 0.0f; +++ float sq = 0.0f; +++ float thrd = 0.0f; +++ float frth = 0.0f; +++ +++ unsigned int i = 0; +++ unsigned int k = 0; +++ for (i = 0; i < eighth_points; ++i) { +++ for (k = 0; k < 8; ++k) { +++ fst = *src0++; +++ fst = MAX(fst, *cutoff); +++ sq = fst * fst; +++ thrd = fst * sq; +++ frth = fst * thrd; +++ result[k] += center_point_array[0] * fst + center_point_array[1] * sq; +++ result[k] += center_point_array[2] * thrd + center_point_array[3] * frth; +++ } ++ } ++- } ++- for(k = 0; k < 8; k+=2) ++- result[k] = result[k]+result[k+1]; ++- ++- *target = result[0] + result[2] + result[4] + result[6]; ++- ++- for(i = eighth_points*8; i < num_points; ++i) { ++- fst = *src0++; ++- fst = MAX(fst, *cutoff); ++- sq = fst * fst; ++- thrd = fst * sq; ++- frth = fst * thrd; ++- *target += (center_point_array[0] * fst + ++- center_point_array[1] * sq + ++- center_point_array[2] * thrd + ++- center_point_array[3] * frth); ++- } ++- *target += (float)(num_points) * center_point_array[4]; +++ for (k = 0; k < 8; k += 2) +++ result[k] = result[k] + result[k + 1]; +++ +++ *target = result[0] + result[2] + result[4] + result[6]; +++ +++ for (i = eighth_points * 8; i < num_points; ++i) { +++ fst = *src0++; +++ fst = MAX(fst, *cutoff); +++ sq = fst * fst; +++ thrd = fst * sq; +++ frth = fst * thrd; +++ *target += (center_point_array[0] * fst + center_point_array[1] * sq + +++ center_point_array[2] * thrd + center_point_array[3] * frth); +++ } +++ *target += (float)(num_points)*center_point_array[4]; ++ } ++ ++ #endif /*LV_HAVE_GENERIC*/ ++@@ -372,51 +377,52 @@ volk_32f_x3_sum_of_poly_32f_generic(float* target, float* src0, float* center_po ++ #include ++ ++ static inline void ++-volk_32f_x3_sum_of_poly_32f_a_neon(float* __restrict target, float* __restrict src0, +++volk_32f_x3_sum_of_poly_32f_a_neon(float* __restrict target, +++ float* __restrict src0, ++ float* __restrict center_point_array, ++- float* __restrict cutoff, unsigned int num_points) +++ float* __restrict cutoff, +++ unsigned int num_points) ++ { ++- unsigned int i; ++- float zero[4] = {0.0f, 0.0f, 0.0f, 0.0f }; ++- ++- float32x2_t x_to_1, x_to_2, x_to_3, x_to_4; ++- float32x2_t cutoff_vector; ++- float32x2x2_t x_low, x_high; ++- float32x4_t x_qvector, c_qvector, cpa_qvector; ++- float accumulator; ++- float res_accumulators[4]; ++- ++- c_qvector = vld1q_f32( zero ); ++- // load the cutoff in to a vector ++- cutoff_vector = vdup_n_f32( *cutoff ); ++- // ... 
center point array ++- cpa_qvector = vld1q_f32( center_point_array ); ++- ++- for(i=0; i < num_points; ++i) { ++- // load x (src0) ++- x_to_1 = vdup_n_f32( *src0++ ); ++- ++- // Get a vector of max(src0, cutoff) ++- x_to_1 = vmax_f32(x_to_1, cutoff_vector ); // x^1 ++- x_to_2 = vmul_f32(x_to_1, x_to_1); // x^2 ++- x_to_3 = vmul_f32(x_to_2, x_to_1); // x^3 ++- x_to_4 = vmul_f32(x_to_3, x_to_1); // x^4 ++- // zip up doubles to interleave ++- x_low = vzip_f32(x_to_1, x_to_2); // [x^2 | x^1 || x^2 | x^1] ++- x_high = vzip_f32(x_to_3, x_to_4); // [x^4 | x^3 || x^4 | x^3] ++- // float32x4_t vcombine_f32(float32x2_t low, float32x2_t high); // VMOV d0,d0 ++- x_qvector = vcombine_f32(x_low.val[0], x_high.val[0]); ++- // now we finally have [x^4 | x^3 | x^2 | x] ! ++- ++- c_qvector = vmlaq_f32(c_qvector, x_qvector, cpa_qvector); ++- ++- } ++- // there should be better vector reduction techniques ++- vst1q_f32(res_accumulators, c_qvector ); ++- accumulator = res_accumulators[0] + res_accumulators[1] + ++- res_accumulators[2] + res_accumulators[3]; ++- ++- *target = accumulator + (float)num_points * center_point_array[4]; +++ unsigned int i; +++ float zero[4] = { 0.0f, 0.0f, 0.0f, 0.0f }; +++ +++ float32x2_t x_to_1, x_to_2, x_to_3, x_to_4; +++ float32x2_t cutoff_vector; +++ float32x2x2_t x_low, x_high; +++ float32x4_t x_qvector, c_qvector, cpa_qvector; +++ float accumulator; +++ float res_accumulators[4]; +++ +++ c_qvector = vld1q_f32(zero); +++ // load the cutoff in to a vector +++ cutoff_vector = vdup_n_f32(*cutoff); +++ // ... center point array +++ cpa_qvector = vld1q_f32(center_point_array); +++ +++ for (i = 0; i < num_points; ++i) { +++ // load x (src0) +++ x_to_1 = vdup_n_f32(*src0++); +++ +++ // Get a vector of max(src0, cutoff) +++ x_to_1 = vmax_f32(x_to_1, cutoff_vector); // x^1 +++ x_to_2 = vmul_f32(x_to_1, x_to_1); // x^2 +++ x_to_3 = vmul_f32(x_to_2, x_to_1); // x^3 +++ x_to_4 = vmul_f32(x_to_3, x_to_1); // x^4 +++ // zip up doubles to interleave +++ x_low = vzip_f32(x_to_1, x_to_2); // [x^2 | x^1 || x^2 | x^1] +++ x_high = vzip_f32(x_to_3, x_to_4); // [x^4 | x^3 || x^4 | x^3] +++ // float32x4_t vcombine_f32(float32x2_t low, float32x2_t high); // VMOV d0,d0 +++ x_qvector = vcombine_f32(x_low.val[0], x_high.val[0]); +++ // now we finally have [x^4 | x^3 | x^2 | x] ! 
+++ +++ c_qvector = vmlaq_f32(c_qvector, x_qvector, cpa_qvector); +++ } +++ // there should be better vector reduction techniques +++ vst1q_f32(res_accumulators, c_qvector); +++ accumulator = res_accumulators[0] + res_accumulators[1] + res_accumulators[2] + +++ res_accumulators[3]; +++ +++ *target = accumulator + (float)num_points * center_point_array[4]; ++ } ++ ++ #endif /* LV_HAVE_NEON */ ++@@ -425,82 +431,82 @@ volk_32f_x3_sum_of_poly_32f_a_neon(float* __restrict target, float* __restrict s ++ #ifdef LV_HAVE_NEON ++ ++ static inline void ++-volk_32f_x3_sum_of_poly_32f_neonvert(float* __restrict target, float* __restrict src0, +++volk_32f_x3_sum_of_poly_32f_neonvert(float* __restrict target, +++ float* __restrict src0, ++ float* __restrict center_point_array, ++- float* __restrict cutoff, unsigned int num_points) +++ float* __restrict cutoff, +++ unsigned int num_points) ++ { ++- unsigned int i; ++- float zero[4] = {0.0f, 0.0f, 0.0f, 0.0f }; ++- ++- float accumulator; ++- ++- float32x4_t accumulator1_vec, accumulator2_vec, accumulator3_vec, accumulator4_vec; ++- accumulator1_vec = vld1q_f32(zero); ++- accumulator2_vec = vld1q_f32(zero); ++- accumulator3_vec = vld1q_f32(zero); ++- accumulator4_vec = vld1q_f32(zero); ++- float32x4_t x_to_1, x_to_2, x_to_3, x_to_4; ++- float32x4_t cutoff_vector, cpa_0, cpa_1, cpa_2, cpa_3; ++- ++- // load the cutoff in to a vector ++- cutoff_vector = vdupq_n_f32( *cutoff ); ++- // ... center point array ++- cpa_0 = vdupq_n_f32(center_point_array[0]); ++- cpa_1 = vdupq_n_f32(center_point_array[1]); ++- cpa_2 = vdupq_n_f32(center_point_array[2]); ++- cpa_3 = vdupq_n_f32(center_point_array[3]); ++- ++- // nathan is not sure why this is slower *and* wrong compared to neonvertfma ++- for(i=0; i < num_points/4; ++i) { ++- // load x ++- x_to_1 = vld1q_f32( src0 ); ++- ++- // Get a vector of max(src0, cutoff) ++- x_to_1 = vmaxq_f32(x_to_1, cutoff_vector ); // x^1 ++- x_to_2 = vmulq_f32(x_to_1, x_to_1); // x^2 ++- x_to_3 = vmulq_f32(x_to_2, x_to_1); // x^3 ++- x_to_4 = vmulq_f32(x_to_3, x_to_1); // x^4 ++- x_to_1 = vmulq_f32(x_to_1, cpa_0); ++- x_to_2 = vmulq_f32(x_to_2, cpa_1); ++- x_to_3 = vmulq_f32(x_to_3, cpa_2); ++- x_to_4 = vmulq_f32(x_to_4, cpa_3); ++- accumulator1_vec = vaddq_f32(accumulator1_vec, x_to_1); ++- accumulator2_vec = vaddq_f32(accumulator2_vec, x_to_2); ++- accumulator3_vec = vaddq_f32(accumulator3_vec, x_to_3); ++- accumulator4_vec = vaddq_f32(accumulator4_vec, x_to_4); ++- ++- src0 += 4; ++- } ++- accumulator1_vec = vaddq_f32(accumulator1_vec, accumulator2_vec); ++- accumulator3_vec = vaddq_f32(accumulator3_vec, accumulator4_vec); ++- accumulator1_vec = vaddq_f32(accumulator1_vec, accumulator3_vec); ++- ++- __VOLK_ATTR_ALIGNED(32) float res_accumulators[4]; ++- vst1q_f32(res_accumulators, accumulator1_vec ); ++- accumulator = res_accumulators[0] + res_accumulators[1] + ++- res_accumulators[2] + res_accumulators[3]; ++- ++- float fst = 0.0; ++- float sq = 0.0; ++- float thrd = 0.0; ++- float frth = 0.0; ++- ++- for(i = 4*num_points/4; i < num_points; ++i) { ++- fst = src0[i]; ++- fst = MAX(fst, *cutoff); ++- ++- sq = fst * fst; ++- thrd = fst * sq; ++- frth = sq * sq; ++- //fith = sq * thrd; ++- ++- accumulator += (center_point_array[0] * fst + ++- center_point_array[1] * sq + ++- center_point_array[2] * thrd + ++- center_point_array[3] * frth); //+ ++- } ++- ++- *target = accumulator + (float)num_points * center_point_array[4]; +++ unsigned int i; +++ float zero[4] = { 0.0f, 0.0f, 0.0f, 0.0f }; +++ +++ float accumulator; +++ +++ float32x4_t 
accumulator1_vec, accumulator2_vec, accumulator3_vec, accumulator4_vec; +++ accumulator1_vec = vld1q_f32(zero); +++ accumulator2_vec = vld1q_f32(zero); +++ accumulator3_vec = vld1q_f32(zero); +++ accumulator4_vec = vld1q_f32(zero); +++ float32x4_t x_to_1, x_to_2, x_to_3, x_to_4; +++ float32x4_t cutoff_vector, cpa_0, cpa_1, cpa_2, cpa_3; +++ +++ // load the cutoff in to a vector +++ cutoff_vector = vdupq_n_f32(*cutoff); +++ // ... center point array +++ cpa_0 = vdupq_n_f32(center_point_array[0]); +++ cpa_1 = vdupq_n_f32(center_point_array[1]); +++ cpa_2 = vdupq_n_f32(center_point_array[2]); +++ cpa_3 = vdupq_n_f32(center_point_array[3]); +++ +++ // nathan is not sure why this is slower *and* wrong compared to neonvertfma +++ for (i = 0; i < num_points / 4; ++i) { +++ // load x +++ x_to_1 = vld1q_f32(src0); +++ +++ // Get a vector of max(src0, cutoff) +++ x_to_1 = vmaxq_f32(x_to_1, cutoff_vector); // x^1 +++ x_to_2 = vmulq_f32(x_to_1, x_to_1); // x^2 +++ x_to_3 = vmulq_f32(x_to_2, x_to_1); // x^3 +++ x_to_4 = vmulq_f32(x_to_3, x_to_1); // x^4 +++ x_to_1 = vmulq_f32(x_to_1, cpa_0); +++ x_to_2 = vmulq_f32(x_to_2, cpa_1); +++ x_to_3 = vmulq_f32(x_to_3, cpa_2); +++ x_to_4 = vmulq_f32(x_to_4, cpa_3); +++ accumulator1_vec = vaddq_f32(accumulator1_vec, x_to_1); +++ accumulator2_vec = vaddq_f32(accumulator2_vec, x_to_2); +++ accumulator3_vec = vaddq_f32(accumulator3_vec, x_to_3); +++ accumulator4_vec = vaddq_f32(accumulator4_vec, x_to_4); +++ +++ src0 += 4; +++ } +++ accumulator1_vec = vaddq_f32(accumulator1_vec, accumulator2_vec); +++ accumulator3_vec = vaddq_f32(accumulator3_vec, accumulator4_vec); +++ accumulator1_vec = vaddq_f32(accumulator1_vec, accumulator3_vec); +++ +++ __VOLK_ATTR_ALIGNED(32) float res_accumulators[4]; +++ vst1q_f32(res_accumulators, accumulator1_vec); +++ accumulator = res_accumulators[0] + res_accumulators[1] + res_accumulators[2] + +++ res_accumulators[3]; +++ +++ float fst = 0.0; +++ float sq = 0.0; +++ float thrd = 0.0; +++ float frth = 0.0; +++ +++ for (i = 4 * num_points / 4; i < num_points; ++i) { +++ fst = src0[i]; +++ fst = MAX(fst, *cutoff); +++ +++ sq = fst * fst; +++ thrd = fst * sq; +++ frth = sq * sq; +++ // fith = sq * thrd; +++ +++ accumulator += (center_point_array[0] * fst + center_point_array[1] * sq + +++ center_point_array[2] * thrd + center_point_array[3] * frth); //+ +++ } +++ +++ *target = accumulator + (float)num_points * center_point_array[4]; ++ } ++ ++ #endif /* LV_HAVE_NEON */ ++@@ -510,150 +516,154 @@ volk_32f_x3_sum_of_poly_32f_neonvert(float* __restrict target, float* __restrict ++ #ifndef INCLUDED_volk_32f_x3_sum_of_poly_32f_u_H ++ #define INCLUDED_volk_32f_x3_sum_of_poly_32f_u_H ++ ++-#include ++-#include ++-#include +++#include +++#include +++#include ++ ++ #ifndef MAX ++-#define MAX(X,Y) ((X) > (Y)?(X):(Y)) +++#define MAX(X, Y) ((X) > (Y) ? 
(X) : (Y)) ++ #endif ++ ++ #if LV_HAVE_AVX && LV_HAVE_FMA ++-#include +++#include ++ ++-static inline void ++-volk_32f_x3_sum_of_poly_32f_u_avx_fma(float* target, float* src0, float* center_point_array, ++- float* cutoff, unsigned int num_points) +++static inline void volk_32f_x3_sum_of_poly_32f_u_avx_fma(float* target, +++ float* src0, +++ float* center_point_array, +++ float* cutoff, +++ unsigned int num_points) ++ { ++- const unsigned int eighth_points = num_points / 8; ++- float fst = 0.0; ++- float sq = 0.0; ++- float thrd = 0.0; ++- float frth = 0.0; ++- ++- __m256 cpa0, cpa1, cpa2, cpa3, cutoff_vec; ++- __m256 target_vec; ++- __m256 x_to_1, x_to_2, x_to_3, x_to_4; ++- ++- cpa0 = _mm256_set1_ps(center_point_array[0]); ++- cpa1 = _mm256_set1_ps(center_point_array[1]); ++- cpa2 = _mm256_set1_ps(center_point_array[2]); ++- cpa3 = _mm256_set1_ps(center_point_array[3]); ++- cutoff_vec = _mm256_set1_ps(*cutoff); ++- target_vec = _mm256_setzero_ps(); ++- ++- unsigned int i; ++- ++- for(i = 0; i < eighth_points; ++i) { ++- x_to_1 = _mm256_loadu_ps(src0); ++- x_to_1 = _mm256_max_ps(x_to_1, cutoff_vec); ++- x_to_2 = _mm256_mul_ps(x_to_1, x_to_1); // x^2 ++- x_to_3 = _mm256_mul_ps(x_to_1, x_to_2); // x^3 ++- // x^1 * x^3 is slightly faster than x^2 * x^2 ++- x_to_4 = _mm256_mul_ps(x_to_1, x_to_3); // x^4 ++- ++- x_to_2 = _mm256_mul_ps(x_to_2, cpa1); // cpa[1] * x^2 ++- x_to_4 = _mm256_mul_ps(x_to_4, cpa3); // cpa[3] * x^4 ++- ++- x_to_1 = _mm256_fmadd_ps(x_to_1, cpa0, x_to_2); ++- x_to_3 = _mm256_fmadd_ps(x_to_3, cpa2, x_to_4); ++- // this is slightly faster than result += (x_to_1 + x_to_3) ++- target_vec = _mm256_add_ps(x_to_1, target_vec); ++- target_vec = _mm256_add_ps(x_to_3, target_vec); ++- ++- src0 += 8; ++- } ++- ++- // the hadd for vector reduction has very very slight impact @ 50k iters ++- __VOLK_ATTR_ALIGNED(32) float temp_results[8]; ++- target_vec = _mm256_hadd_ps(target_vec, target_vec); // x0+x1 | x2+x3 | x0+x1 | x2+x3 || x4+x5 | x6+x7 | x4+x5 | x6+x7 ++- _mm256_storeu_ps(temp_results, target_vec); ++- *target = temp_results[0] + temp_results[1] + temp_results[4] + temp_results[5]; ++- ++- for(i = eighth_points*8; i < num_points; ++i) { ++- fst = *src0++; ++- fst = MAX(fst, *cutoff); ++- sq = fst * fst; ++- thrd = fst * sq; ++- frth = sq * sq; ++- *target += (center_point_array[0] * fst + ++- center_point_array[1] * sq + ++- center_point_array[2] * thrd + ++- center_point_array[3] * frth); ++- } ++- ++- *target += (float)(num_points) * center_point_array[4]; +++ const unsigned int eighth_points = num_points / 8; +++ float fst = 0.0; +++ float sq = 0.0; +++ float thrd = 0.0; +++ float frth = 0.0; +++ +++ __m256 cpa0, cpa1, cpa2, cpa3, cutoff_vec; +++ __m256 target_vec; +++ __m256 x_to_1, x_to_2, x_to_3, x_to_4; +++ +++ cpa0 = _mm256_set1_ps(center_point_array[0]); +++ cpa1 = _mm256_set1_ps(center_point_array[1]); +++ cpa2 = _mm256_set1_ps(center_point_array[2]); +++ cpa3 = _mm256_set1_ps(center_point_array[3]); +++ cutoff_vec = _mm256_set1_ps(*cutoff); +++ target_vec = _mm256_setzero_ps(); +++ +++ unsigned int i; +++ +++ for (i = 0; i < eighth_points; ++i) { +++ x_to_1 = _mm256_loadu_ps(src0); +++ x_to_1 = _mm256_max_ps(x_to_1, cutoff_vec); +++ x_to_2 = _mm256_mul_ps(x_to_1, x_to_1); // x^2 +++ x_to_3 = _mm256_mul_ps(x_to_1, x_to_2); // x^3 +++ // x^1 * x^3 is slightly faster than x^2 * x^2 +++ x_to_4 = _mm256_mul_ps(x_to_1, x_to_3); // x^4 +++ +++ x_to_2 = _mm256_mul_ps(x_to_2, cpa1); // cpa[1] * x^2 +++ x_to_4 = _mm256_mul_ps(x_to_4, cpa3); // cpa[3] * x^4 +++ +++ x_to_1 = 
_mm256_fmadd_ps(x_to_1, cpa0, x_to_2); +++ x_to_3 = _mm256_fmadd_ps(x_to_3, cpa2, x_to_4); +++ // this is slightly faster than result += (x_to_1 + x_to_3) +++ target_vec = _mm256_add_ps(x_to_1, target_vec); +++ target_vec = _mm256_add_ps(x_to_3, target_vec); +++ +++ src0 += 8; +++ } +++ +++ // the hadd for vector reduction has very very slight impact @ 50k iters +++ __VOLK_ATTR_ALIGNED(32) float temp_results[8]; +++ target_vec = _mm256_hadd_ps( +++ target_vec, +++ target_vec); // x0+x1 | x2+x3 | x0+x1 | x2+x3 || x4+x5 | x6+x7 | x4+x5 | x6+x7 +++ _mm256_storeu_ps(temp_results, target_vec); +++ *target = temp_results[0] + temp_results[1] + temp_results[4] + temp_results[5]; +++ +++ for (i = eighth_points * 8; i < num_points; ++i) { +++ fst = *src0++; +++ fst = MAX(fst, *cutoff); +++ sq = fst * fst; +++ thrd = fst * sq; +++ frth = sq * sq; +++ *target += (center_point_array[0] * fst + center_point_array[1] * sq + +++ center_point_array[2] * thrd + center_point_array[3] * frth); +++ } +++ +++ *target += (float)(num_points)*center_point_array[4]; ++ } ++ #endif // LV_HAVE_AVX && LV_HAVE_FMA ++ ++ #ifdef LV_HAVE_AVX ++-#include +++#include ++ ++-static inline void ++-volk_32f_x3_sum_of_poly_32f_u_avx(float* target, float* src0, float* center_point_array, ++- float* cutoff, unsigned int num_points) +++static inline void volk_32f_x3_sum_of_poly_32f_u_avx(float* target, +++ float* src0, +++ float* center_point_array, +++ float* cutoff, +++ unsigned int num_points) ++ { ++- const unsigned int eighth_points = num_points / 8; ++- float fst = 0.0; ++- float sq = 0.0; ++- float thrd = 0.0; ++- float frth = 0.0; ++- ++- __m256 cpa0, cpa1, cpa2, cpa3, cutoff_vec; ++- __m256 target_vec; ++- __m256 x_to_1, x_to_2, x_to_3, x_to_4; ++- ++- cpa0 = _mm256_set1_ps(center_point_array[0]); ++- cpa1 = _mm256_set1_ps(center_point_array[1]); ++- cpa2 = _mm256_set1_ps(center_point_array[2]); ++- cpa3 = _mm256_set1_ps(center_point_array[3]); ++- cutoff_vec = _mm256_set1_ps(*cutoff); ++- target_vec = _mm256_setzero_ps(); ++- ++- unsigned int i; ++- ++- for(i = 0; i < eighth_points; ++i) { ++- x_to_1 = _mm256_loadu_ps(src0); ++- x_to_1 = _mm256_max_ps(x_to_1, cutoff_vec); ++- x_to_2 = _mm256_mul_ps(x_to_1, x_to_1); // x^2 ++- x_to_3 = _mm256_mul_ps(x_to_1, x_to_2); // x^3 ++- // x^1 * x^3 is slightly faster than x^2 * x^2 ++- x_to_4 = _mm256_mul_ps(x_to_1, x_to_3); // x^4 ++- ++- x_to_1 = _mm256_mul_ps(x_to_1, cpa0); // cpa[0] * x^1 ++- x_to_2 = _mm256_mul_ps(x_to_2, cpa1); // cpa[1] * x^2 ++- x_to_3 = _mm256_mul_ps(x_to_3, cpa2); // cpa[2] * x^3 ++- x_to_4 = _mm256_mul_ps(x_to_4, cpa3); // cpa[3] * x^4 ++- ++- x_to_1 = _mm256_add_ps(x_to_1, x_to_2); ++- x_to_3 = _mm256_add_ps(x_to_3, x_to_4); ++- // this is slightly faster than result += (x_to_1 + x_to_3) ++- target_vec = _mm256_add_ps(x_to_1, target_vec); ++- target_vec = _mm256_add_ps(x_to_3, target_vec); ++- ++- src0 += 8; ++- } ++- ++- // the hadd for vector reduction has very very slight impact @ 50k iters ++- __VOLK_ATTR_ALIGNED(32) float temp_results[8]; ++- target_vec = _mm256_hadd_ps(target_vec, target_vec); // x0+x1 | x2+x3 | x0+x1 | x2+x3 || x4+x5 | x6+x7 | x4+x5 | x6+x7 ++- _mm256_storeu_ps(temp_results, target_vec); ++- *target = temp_results[0] + temp_results[1] + temp_results[4] + temp_results[5]; ++- ++- for(i = eighth_points*8; i < num_points; ++i) { ++- fst = *src0++; ++- fst = MAX(fst, *cutoff); ++- sq = fst * fst; ++- thrd = fst * sq; ++- frth = sq * sq; ++- ++- *target += (center_point_array[0] * fst + ++- center_point_array[1] * sq + ++- 
center_point_array[2] * thrd + ++- center_point_array[3] * frth); ++- } ++- ++- *target += (float)(num_points) * center_point_array[4]; +++ const unsigned int eighth_points = num_points / 8; +++ float fst = 0.0; +++ float sq = 0.0; +++ float thrd = 0.0; +++ float frth = 0.0; +++ +++ __m256 cpa0, cpa1, cpa2, cpa3, cutoff_vec; +++ __m256 target_vec; +++ __m256 x_to_1, x_to_2, x_to_3, x_to_4; +++ +++ cpa0 = _mm256_set1_ps(center_point_array[0]); +++ cpa1 = _mm256_set1_ps(center_point_array[1]); +++ cpa2 = _mm256_set1_ps(center_point_array[2]); +++ cpa3 = _mm256_set1_ps(center_point_array[3]); +++ cutoff_vec = _mm256_set1_ps(*cutoff); +++ target_vec = _mm256_setzero_ps(); +++ +++ unsigned int i; +++ +++ for (i = 0; i < eighth_points; ++i) { +++ x_to_1 = _mm256_loadu_ps(src0); +++ x_to_1 = _mm256_max_ps(x_to_1, cutoff_vec); +++ x_to_2 = _mm256_mul_ps(x_to_1, x_to_1); // x^2 +++ x_to_3 = _mm256_mul_ps(x_to_1, x_to_2); // x^3 +++ // x^1 * x^3 is slightly faster than x^2 * x^2 +++ x_to_4 = _mm256_mul_ps(x_to_1, x_to_3); // x^4 +++ +++ x_to_1 = _mm256_mul_ps(x_to_1, cpa0); // cpa[0] * x^1 +++ x_to_2 = _mm256_mul_ps(x_to_2, cpa1); // cpa[1] * x^2 +++ x_to_3 = _mm256_mul_ps(x_to_3, cpa2); // cpa[2] * x^3 +++ x_to_4 = _mm256_mul_ps(x_to_4, cpa3); // cpa[3] * x^4 +++ +++ x_to_1 = _mm256_add_ps(x_to_1, x_to_2); +++ x_to_3 = _mm256_add_ps(x_to_3, x_to_4); +++ // this is slightly faster than result += (x_to_1 + x_to_3) +++ target_vec = _mm256_add_ps(x_to_1, target_vec); +++ target_vec = _mm256_add_ps(x_to_3, target_vec); +++ +++ src0 += 8; +++ } +++ +++ // the hadd for vector reduction has very very slight impact @ 50k iters +++ __VOLK_ATTR_ALIGNED(32) float temp_results[8]; +++ target_vec = _mm256_hadd_ps( +++ target_vec, +++ target_vec); // x0+x1 | x2+x3 | x0+x1 | x2+x3 || x4+x5 | x6+x7 | x4+x5 | x6+x7 +++ _mm256_storeu_ps(temp_results, target_vec); +++ *target = temp_results[0] + temp_results[1] + temp_results[4] + temp_results[5]; +++ +++ for (i = eighth_points * 8; i < num_points; ++i) { +++ fst = *src0++; +++ fst = MAX(fst, *cutoff); +++ sq = fst * fst; +++ thrd = fst * sq; +++ frth = sq * sq; +++ +++ *target += (center_point_array[0] * fst + center_point_array[1] * sq + +++ center_point_array[2] * thrd + center_point_array[3] * frth); +++ } +++ +++ *target += (float)(num_points)*center_point_array[4]; ++ } ++ #endif // LV_HAVE_AVX ++ ++diff --git a/kernels/volk/volk_32fc_32f_add_32fc.h b/kernels/volk/volk_32fc_32f_add_32fc.h ++index 86a3818..b25ca6a 100644 ++--- a/kernels/volk/volk_32fc_32f_add_32fc.h +++++ b/kernels/volk/volk_32fc_32f_add_32fc.h ++@@ -31,8 +31,8 @@ ++ * ++ * Dispatcher Prototype ++ * \code ++- * void volk_32fc_32f_add_32fc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const float* bVector, unsigned int num_points) ++- * \endcode +++ * void volk_32fc_32f_add_32fc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const float* +++ * bVector, unsigned int num_points) \endcode ++ * ++ * \b Inputs ++ * \li aVector: First vector of input points. 
++@@ -44,7 +44,8 @@ ++ * ++ * \b Example ++ * ++- * The follow example adds the increasing and decreasing vectors such that the result of every summation pair is 10 +++ * The follow example adds the increasing and decreasing vectors such that the result of +++ * every summation pair is 10 ++ * ++ * \code ++ * int N = 10; ++@@ -75,18 +76,19 @@ ++ ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void ++-volk_32fc_32f_add_32fc_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, ++- const float* bVector, unsigned int num_points) +++static inline void volk_32fc_32f_add_32fc_generic(lv_32fc_t* cVector, +++ const lv_32fc_t* aVector, +++ const float* bVector, +++ unsigned int num_points) ++ { ++- lv_32fc_t* cPtr = cVector; ++- const lv_32fc_t* aPtr = aVector; ++- const float* bPtr= bVector; ++- unsigned int number = 0; ++- ++- for(number = 0; number < num_points; number++){ ++- *cPtr++ = (*aPtr++) + (*bPtr++); ++- } +++ lv_32fc_t* cPtr = cVector; +++ const lv_32fc_t* aPtr = aVector; +++ const float* bPtr = bVector; +++ unsigned int number = 0; +++ +++ for (number = 0; number < num_points; number++) { +++ *cPtr++ = (*aPtr++) + (*bPtr++); +++ } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++@@ -94,143 +96,150 @@ volk_32fc_32f_add_32fc_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, ++ #ifdef LV_HAVE_AVX ++ #include ++ ++-static inline void ++-volk_32fc_32f_add_32fc_u_avx(lv_32fc_t* cVector, const lv_32fc_t* aVector, ++- const float* bVector, unsigned int num_points) +++static inline void volk_32fc_32f_add_32fc_u_avx(lv_32fc_t* cVector, +++ const lv_32fc_t* aVector, +++ const float* bVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int eighthPoints = num_points / 8; ++- ++- lv_32fc_t* cPtr = cVector; ++- const lv_32fc_t* aPtr = aVector; ++- const float* bPtr= bVector; ++- ++- __m256 aVal1, aVal2, bVal, cVal1, cVal2; ++- __m256 cpx_b1, cpx_b2; ++- __m256 zero; ++- zero = _mm256_setzero_ps(); ++- __m256 tmp1, tmp2; ++- for(;number < eighthPoints; number++){ ++- ++- aVal1 = _mm256_loadu_ps((float *) aPtr); ++- aVal2 = _mm256_loadu_ps((float *) (aPtr+4)); ++- bVal = _mm256_loadu_ps(bPtr); ++- cpx_b1 = _mm256_unpacklo_ps(bVal, zero); // b0, 0, b1, 0, b4, 0, b5, 0 ++- cpx_b2 = _mm256_unpackhi_ps(bVal, zero); // b2, 0, b3, 0, b6, 0, b7, 0 ++- ++- tmp1 = _mm256_permute2f128_ps(cpx_b1, cpx_b2, 0x0+(0x2<<4)); ++- tmp2 = _mm256_permute2f128_ps(cpx_b1, cpx_b2, 0x1+(0x3<<4)); ++- ++- cVal1 = _mm256_add_ps(aVal1, tmp1); ++- cVal2 = _mm256_add_ps(aVal2, tmp2); ++- ++- _mm256_storeu_ps((float *) cPtr, cVal1); // Store the results back into the C container ++- _mm256_storeu_ps((float *) (cPtr+4), cVal2); // Store the results back into the C container ++- ++- aPtr += 8; ++- bPtr += 8; ++- cPtr += 8; ++- } ++- ++- number = eighthPoints * 8; ++- for(;number < num_points; number++){ ++- *cPtr++ = (*aPtr++) + (*bPtr++); ++- } +++ unsigned int number = 0; +++ const unsigned int eighthPoints = num_points / 8; +++ +++ lv_32fc_t* cPtr = cVector; +++ const lv_32fc_t* aPtr = aVector; +++ const float* bPtr = bVector; +++ +++ __m256 aVal1, aVal2, bVal, cVal1, cVal2; +++ __m256 cpx_b1, cpx_b2; +++ __m256 zero; +++ zero = _mm256_setzero_ps(); +++ __m256 tmp1, tmp2; +++ for (; number < eighthPoints; number++) { +++ +++ aVal1 = _mm256_loadu_ps((float*)aPtr); +++ aVal2 = _mm256_loadu_ps((float*)(aPtr + 4)); +++ bVal = _mm256_loadu_ps(bPtr); +++ cpx_b1 = _mm256_unpacklo_ps(bVal, zero); // b0, 0, b1, 0, b4, 0, b5, 0 +++ cpx_b2 = _mm256_unpackhi_ps(bVal, zero); // b2, 0, b3, 0, b6, 0, b7, 0 +++ 
+++ tmp1 = _mm256_permute2f128_ps(cpx_b1, cpx_b2, 0x0 + (0x2 << 4)); +++ tmp2 = _mm256_permute2f128_ps(cpx_b1, cpx_b2, 0x1 + (0x3 << 4)); +++ +++ cVal1 = _mm256_add_ps(aVal1, tmp1); +++ cVal2 = _mm256_add_ps(aVal2, tmp2); +++ +++ _mm256_storeu_ps((float*)cPtr, +++ cVal1); // Store the results back into the C container +++ _mm256_storeu_ps((float*)(cPtr + 4), +++ cVal2); // Store the results back into the C container +++ +++ aPtr += 8; +++ bPtr += 8; +++ cPtr += 8; +++ } +++ +++ number = eighthPoints * 8; +++ for (; number < num_points; number++) { +++ *cPtr++ = (*aPtr++) + (*bPtr++); +++ } ++ } ++ #endif /* LV_HAVE_AVX */ ++ ++ #ifdef LV_HAVE_AVX ++ #include ++ ++-static inline void ++-volk_32fc_32f_add_32fc_a_avx(lv_32fc_t* cVector, const lv_32fc_t* aVector, ++- const float* bVector, unsigned int num_points) +++static inline void volk_32fc_32f_add_32fc_a_avx(lv_32fc_t* cVector, +++ const lv_32fc_t* aVector, +++ const float* bVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int eighthPoints = num_points / 8; ++- ++- lv_32fc_t* cPtr = cVector; ++- const lv_32fc_t* aPtr = aVector; ++- const float* bPtr= bVector; ++- ++- __m256 aVal1, aVal2, bVal, cVal1, cVal2; ++- __m256 cpx_b1, cpx_b2; ++- __m256 zero; ++- zero = _mm256_setzero_ps(); ++- __m256 tmp1, tmp2; ++- for(;number < eighthPoints; number++){ ++- ++- aVal1 = _mm256_load_ps((float *) aPtr); ++- aVal2 = _mm256_load_ps((float *) (aPtr+4)); ++- bVal = _mm256_load_ps(bPtr); ++- cpx_b1 = _mm256_unpacklo_ps(bVal, zero); // b0, 0, b1, 0, b4, 0, b5, 0 ++- cpx_b2 = _mm256_unpackhi_ps(bVal, zero); // b2, 0, b3, 0, b6, 0, b7, 0 ++- ++- tmp1 = _mm256_permute2f128_ps(cpx_b1, cpx_b2, 0x0+(0x2<<4)); ++- tmp2 = _mm256_permute2f128_ps(cpx_b1, cpx_b2, 0x1+(0x3<<4)); ++- ++- cVal1 = _mm256_add_ps(aVal1, tmp1); ++- cVal2 = _mm256_add_ps(aVal2, tmp2); ++- ++- _mm256_store_ps((float *) cPtr, cVal1); // Store the results back into the C container ++- _mm256_store_ps((float *) (cPtr+4), cVal2); // Store the results back into the C container ++- ++- aPtr += 8; ++- bPtr += 8; ++- cPtr += 8; ++- } ++- ++- number = eighthPoints * 8; ++- for(;number < num_points; number++){ ++- *cPtr++ = (*aPtr++) + (*bPtr++); ++- } +++ unsigned int number = 0; +++ const unsigned int eighthPoints = num_points / 8; +++ +++ lv_32fc_t* cPtr = cVector; +++ const lv_32fc_t* aPtr = aVector; +++ const float* bPtr = bVector; +++ +++ __m256 aVal1, aVal2, bVal, cVal1, cVal2; +++ __m256 cpx_b1, cpx_b2; +++ __m256 zero; +++ zero = _mm256_setzero_ps(); +++ __m256 tmp1, tmp2; +++ for (; number < eighthPoints; number++) { +++ +++ aVal1 = _mm256_load_ps((float*)aPtr); +++ aVal2 = _mm256_load_ps((float*)(aPtr + 4)); +++ bVal = _mm256_load_ps(bPtr); +++ cpx_b1 = _mm256_unpacklo_ps(bVal, zero); // b0, 0, b1, 0, b4, 0, b5, 0 +++ cpx_b2 = _mm256_unpackhi_ps(bVal, zero); // b2, 0, b3, 0, b6, 0, b7, 0 +++ +++ tmp1 = _mm256_permute2f128_ps(cpx_b1, cpx_b2, 0x0 + (0x2 << 4)); +++ tmp2 = _mm256_permute2f128_ps(cpx_b1, cpx_b2, 0x1 + (0x3 << 4)); +++ +++ cVal1 = _mm256_add_ps(aVal1, tmp1); +++ cVal2 = _mm256_add_ps(aVal2, tmp2); +++ +++ _mm256_store_ps((float*)cPtr, +++ cVal1); // Store the results back into the C container +++ _mm256_store_ps((float*)(cPtr + 4), +++ cVal2); // Store the results back into the C container +++ +++ aPtr += 8; +++ bPtr += 8; +++ cPtr += 8; +++ } +++ +++ number = eighthPoints * 8; +++ for (; number < num_points; number++) { +++ *cPtr++ = (*aPtr++) + (*bPtr++); +++ } ++ } ++ #endif /* LV_HAVE_AVX */ ++ ++ #ifdef LV_HAVE_NEON ++ #include ++ 
++-static inline void ++-volk_32fc_32f_add_32fc_neon(lv_32fc_t* cVector, const lv_32fc_t* aVector, ++- const float* bVector, unsigned int num_points) +++static inline void volk_32fc_32f_add_32fc_neon(lv_32fc_t* cVector, +++ const lv_32fc_t* aVector, +++ const float* bVector, +++ unsigned int num_points) ++ { ++- lv_32fc_t* cPtr = cVector; ++- const lv_32fc_t* aPtr = aVector; ++- const float* bPtr = bVector; ++- ++- float32x4x4_t aVal0, aVal1; ++- float32x4x2_t bVal0, bVal1; ++- ++- const unsigned int sixteenthPoints = num_points / 16; ++- unsigned int number = 0; ++- for(; number < sixteenthPoints; number++){ ++- aVal0 = vld4q_f32((const float*)aPtr); ++- aPtr += 8; ++- aVal1 = vld4q_f32((const float*)aPtr); ++- aPtr += 8; ++- __VOLK_PREFETCH(aPtr+16); ++- ++- bVal0 = vld2q_f32((const float*)bPtr); ++- bPtr += 8; ++- bVal1 = vld2q_f32((const float*)bPtr); ++- bPtr += 8; ++- __VOLK_PREFETCH(bPtr+16); ++- ++- aVal0.val[0] = vaddq_f32(aVal0.val[0], bVal0.val[0]); ++- aVal0.val[2] = vaddq_f32(aVal0.val[2], bVal0.val[1]); ++- ++- aVal1.val[2] = vaddq_f32(aVal1.val[2], bVal1.val[1]); ++- aVal1.val[0] = vaddq_f32(aVal1.val[0], bVal1.val[0]); ++- ++- vst4q_f32((float*)(cPtr), aVal0); ++- cPtr += 8; ++- vst4q_f32((float*)(cPtr), aVal1); ++- cPtr += 8; ++- } ++- ++- for(number = sixteenthPoints * 16; number < num_points; number++){ ++- *cPtr++ = (*aPtr++) + (*bPtr++); ++- } +++ lv_32fc_t* cPtr = cVector; +++ const lv_32fc_t* aPtr = aVector; +++ const float* bPtr = bVector; +++ +++ float32x4x4_t aVal0, aVal1; +++ float32x4x2_t bVal0, bVal1; +++ +++ const unsigned int sixteenthPoints = num_points / 16; +++ unsigned int number = 0; +++ for (; number < sixteenthPoints; number++) { +++ aVal0 = vld4q_f32((const float*)aPtr); +++ aPtr += 8; +++ aVal1 = vld4q_f32((const float*)aPtr); +++ aPtr += 8; +++ __VOLK_PREFETCH(aPtr + 16); +++ +++ bVal0 = vld2q_f32((const float*)bPtr); +++ bPtr += 8; +++ bVal1 = vld2q_f32((const float*)bPtr); +++ bPtr += 8; +++ __VOLK_PREFETCH(bPtr + 16); +++ +++ aVal0.val[0] = vaddq_f32(aVal0.val[0], bVal0.val[0]); +++ aVal0.val[2] = vaddq_f32(aVal0.val[2], bVal0.val[1]); +++ +++ aVal1.val[2] = vaddq_f32(aVal1.val[2], bVal1.val[1]); +++ aVal1.val[0] = vaddq_f32(aVal1.val[0], bVal1.val[0]); +++ +++ vst4q_f32((float*)(cPtr), aVal0); +++ cPtr += 8; +++ vst4q_f32((float*)(cPtr), aVal1); +++ cPtr += 8; +++ } +++ +++ for (number = sixteenthPoints * 16; number < num_points; number++) { +++ *cPtr++ = (*aPtr++) + (*bPtr++); +++ } ++ } ++ #endif /* LV_HAVE_NEON */ ++ ++diff --git a/kernels/volk/volk_32fc_32f_dot_prod_32fc.h b/kernels/volk/volk_32fc_32f_dot_prod_32fc.h ++index 35f7077..d905870 100644 ++--- a/kernels/volk/volk_32fc_32f_dot_prod_32fc.h +++++ b/kernels/volk/volk_32fc_32f_dot_prod_32fc.h ++@@ -33,8 +33,8 @@ ++ * ++ * Dispatcher Prototype ++ * \code ++- * void volk_32fc_32f_dot_prod_32fc(lv_32fc_t* result, const lv_32fc_t* input, const float * taps, unsigned int num_points) ++- * \endcode +++ * void volk_32fc_32f_dot_prod_32fc(lv_32fc_t* result, const lv_32fc_t* input, const float +++ * * taps, unsigned int num_points) \endcode ++ * ++ * \b Inputs ++ * \li input: vector of complex samples ++@@ -63,28 +63,32 @@ ++ #ifndef INCLUDED_volk_32fc_32f_dot_prod_32fc_a_H ++ #define INCLUDED_volk_32fc_32f_dot_prod_32fc_a_H ++ ++-#include ++ #include +++#include ++ ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void volk_32fc_32f_dot_prod_32fc_generic(lv_32fc_t* result, const lv_32fc_t* input, const float * taps, unsigned int num_points) { +++static inline void 
volk_32fc_32f_dot_prod_32fc_generic(lv_32fc_t* result, +++ const lv_32fc_t* input, +++ const float* taps, +++ unsigned int num_points) +++{ ++ ++- float res[2]; ++- float *realpt = &res[0], *imagpt = &res[1]; ++- const float* aPtr = (float*)input; ++- const float* bPtr= taps; ++- unsigned int number = 0; +++ float res[2]; +++ float *realpt = &res[0], *imagpt = &res[1]; +++ const float* aPtr = (float*)input; +++ const float* bPtr = taps; +++ unsigned int number = 0; ++ ++- *realpt = 0; ++- *imagpt = 0; +++ *realpt = 0; +++ *imagpt = 0; ++ ++- for(number = 0; number < num_points; number++){ ++- *realpt += ((*aPtr++) * (*bPtr)); ++- *imagpt += ((*aPtr++) * (*bPtr++)); ++- } +++ for (number = 0; number < num_points; number++) { +++ *realpt += ((*aPtr++) * (*bPtr)); +++ *imagpt += ((*aPtr++) * (*bPtr++)); +++ } ++ ++- *result = *(lv_32fc_t*)(&res[0]); +++ *result = *(lv_32fc_t*)(&res[0]); ++ } ++ ++ #endif /*LV_HAVE_GENERIC*/ ++@@ -93,78 +97,83 @@ static inline void volk_32fc_32f_dot_prod_32fc_generic(lv_32fc_t* result, const ++ ++ #include ++ ++-static inline void volk_32fc_32f_dot_prod_32fc_a_avx2_fma( lv_32fc_t* result, const lv_32fc_t* input, const float* taps, unsigned int num_points) { ++- ++- unsigned int number = 0; ++- const unsigned int sixteenthPoints = num_points / 16; ++- ++- float res[2]; ++- float *realpt = &res[0], *imagpt = &res[1]; ++- const float* aPtr = (float*)input; ++- const float* bPtr = taps; ++- ++- __m256 a0Val, a1Val, a2Val, a3Val; ++- __m256 b0Val, b1Val, b2Val, b3Val; ++- __m256 x0Val, x1Val, x0loVal, x0hiVal, x1loVal, x1hiVal; ++- ++- __m256 dotProdVal0 = _mm256_setzero_ps(); ++- __m256 dotProdVal1 = _mm256_setzero_ps(); ++- __m256 dotProdVal2 = _mm256_setzero_ps(); ++- __m256 dotProdVal3 = _mm256_setzero_ps(); ++- ++- for(;number < sixteenthPoints; number++){ ++- ++- a0Val = _mm256_load_ps(aPtr); ++- a1Val = _mm256_load_ps(aPtr+8); ++- a2Val = _mm256_load_ps(aPtr+16); ++- a3Val = _mm256_load_ps(aPtr+24); ++- ++- x0Val = _mm256_load_ps(bPtr); // t0|t1|t2|t3|t4|t5|t6|t7 ++- x1Val = _mm256_load_ps(bPtr+8); ++- x0loVal = _mm256_unpacklo_ps(x0Val, x0Val); // t0|t0|t1|t1|t4|t4|t5|t5 ++- x0hiVal = _mm256_unpackhi_ps(x0Val, x0Val); // t2|t2|t3|t3|t6|t6|t7|t7 ++- x1loVal = _mm256_unpacklo_ps(x1Val, x1Val); ++- x1hiVal = _mm256_unpackhi_ps(x1Val, x1Val); ++- ++- // TODO: it may be possible to rearrange swizzling to better pipeline data ++- b0Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x20); // t0|t0|t1|t1|t2|t2|t3|t3 ++- b1Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x31); // t4|t4|t5|t5|t6|t6|t7|t7 ++- b2Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x20); ++- b3Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x31); ++- ++- dotProdVal0 = _mm256_fmadd_ps(a0Val, b0Val, dotProdVal0); ++- dotProdVal1 = _mm256_fmadd_ps(a1Val, b1Val, dotProdVal1); ++- dotProdVal2 = _mm256_fmadd_ps(a2Val, b2Val, dotProdVal2); ++- dotProdVal3 = _mm256_fmadd_ps(a3Val, b3Val, dotProdVal3); ++- ++- aPtr += 32; ++- bPtr += 16; ++- } ++- ++- dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1); ++- dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2); ++- dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3); ++- ++- __VOLK_ATTR_ALIGNED(32) float dotProductVector[8]; ++- ++- _mm256_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector ++- ++- *realpt = dotProductVector[0]; ++- *imagpt = dotProductVector[1]; ++- *realpt += dotProductVector[2]; ++- *imagpt += dotProductVector[3]; ++- *realpt += dotProductVector[4]; ++- *imagpt += dotProductVector[5]; ++- 
*realpt += dotProductVector[6]; ++- *imagpt += dotProductVector[7]; ++- ++- number = sixteenthPoints*16; ++- for(;number < num_points; number++){ ++- *realpt += ((*aPtr++) * (*bPtr)); ++- *imagpt += ((*aPtr++) * (*bPtr++)); ++- } ++- ++- *result = *(lv_32fc_t*)(&res[0]); +++static inline void volk_32fc_32f_dot_prod_32fc_a_avx2_fma(lv_32fc_t* result, +++ const lv_32fc_t* input, +++ const float* taps, +++ unsigned int num_points) +++{ +++ +++ unsigned int number = 0; +++ const unsigned int sixteenthPoints = num_points / 16; +++ +++ float res[2]; +++ float *realpt = &res[0], *imagpt = &res[1]; +++ const float* aPtr = (float*)input; +++ const float* bPtr = taps; +++ +++ __m256 a0Val, a1Val, a2Val, a3Val; +++ __m256 b0Val, b1Val, b2Val, b3Val; +++ __m256 x0Val, x1Val, x0loVal, x0hiVal, x1loVal, x1hiVal; +++ +++ __m256 dotProdVal0 = _mm256_setzero_ps(); +++ __m256 dotProdVal1 = _mm256_setzero_ps(); +++ __m256 dotProdVal2 = _mm256_setzero_ps(); +++ __m256 dotProdVal3 = _mm256_setzero_ps(); +++ +++ for (; number < sixteenthPoints; number++) { +++ +++ a0Val = _mm256_load_ps(aPtr); +++ a1Val = _mm256_load_ps(aPtr + 8); +++ a2Val = _mm256_load_ps(aPtr + 16); +++ a3Val = _mm256_load_ps(aPtr + 24); +++ +++ x0Val = _mm256_load_ps(bPtr); // t0|t1|t2|t3|t4|t5|t6|t7 +++ x1Val = _mm256_load_ps(bPtr + 8); +++ x0loVal = _mm256_unpacklo_ps(x0Val, x0Val); // t0|t0|t1|t1|t4|t4|t5|t5 +++ x0hiVal = _mm256_unpackhi_ps(x0Val, x0Val); // t2|t2|t3|t3|t6|t6|t7|t7 +++ x1loVal = _mm256_unpacklo_ps(x1Val, x1Val); +++ x1hiVal = _mm256_unpackhi_ps(x1Val, x1Val); +++ +++ // TODO: it may be possible to rearrange swizzling to better pipeline data +++ b0Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x20); // t0|t0|t1|t1|t2|t2|t3|t3 +++ b1Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x31); // t4|t4|t5|t5|t6|t6|t7|t7 +++ b2Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x20); +++ b3Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x31); +++ +++ dotProdVal0 = _mm256_fmadd_ps(a0Val, b0Val, dotProdVal0); +++ dotProdVal1 = _mm256_fmadd_ps(a1Val, b1Val, dotProdVal1); +++ dotProdVal2 = _mm256_fmadd_ps(a2Val, b2Val, dotProdVal2); +++ dotProdVal3 = _mm256_fmadd_ps(a3Val, b3Val, dotProdVal3); +++ +++ aPtr += 32; +++ bPtr += 16; +++ } +++ +++ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1); +++ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2); +++ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3); +++ +++ __VOLK_ATTR_ALIGNED(32) float dotProductVector[8]; +++ +++ _mm256_store_ps(dotProductVector, +++ dotProdVal0); // Store the results back into the dot product vector +++ +++ *realpt = dotProductVector[0]; +++ *imagpt = dotProductVector[1]; +++ *realpt += dotProductVector[2]; +++ *imagpt += dotProductVector[3]; +++ *realpt += dotProductVector[4]; +++ *imagpt += dotProductVector[5]; +++ *realpt += dotProductVector[6]; +++ *imagpt += dotProductVector[7]; +++ +++ number = sixteenthPoints * 16; +++ for (; number < num_points; number++) { +++ *realpt += ((*aPtr++) * (*bPtr)); +++ *imagpt += ((*aPtr++) * (*bPtr++)); +++ } +++ +++ *result = *(lv_32fc_t*)(&res[0]); ++ } ++ ++ #endif /*LV_HAVE_AVX2 && LV_HAVE_FMA*/ ++@@ -173,164 +182,172 @@ static inline void volk_32fc_32f_dot_prod_32fc_a_avx2_fma( lv_32fc_t* result, co ++ ++ #include ++ ++-static inline void volk_32fc_32f_dot_prod_32fc_a_avx( lv_32fc_t* result, const lv_32fc_t* input, const float* taps, unsigned int num_points) { ++- ++- unsigned int number = 0; ++- const unsigned int sixteenthPoints = num_points / 16; ++- ++- float res[2]; ++- float *realpt = &res[0], *imagpt = 
&res[1]; ++- const float* aPtr = (float*)input; ++- const float* bPtr = taps; ++- ++- __m256 a0Val, a1Val, a2Val, a3Val; ++- __m256 b0Val, b1Val, b2Val, b3Val; ++- __m256 x0Val, x1Val, x0loVal, x0hiVal, x1loVal, x1hiVal; ++- __m256 c0Val, c1Val, c2Val, c3Val; ++- ++- __m256 dotProdVal0 = _mm256_setzero_ps(); ++- __m256 dotProdVal1 = _mm256_setzero_ps(); ++- __m256 dotProdVal2 = _mm256_setzero_ps(); ++- __m256 dotProdVal3 = _mm256_setzero_ps(); ++- ++- for(;number < sixteenthPoints; number++){ ++- ++- a0Val = _mm256_load_ps(aPtr); ++- a1Val = _mm256_load_ps(aPtr+8); ++- a2Val = _mm256_load_ps(aPtr+16); ++- a3Val = _mm256_load_ps(aPtr+24); ++- ++- x0Val = _mm256_load_ps(bPtr); // t0|t1|t2|t3|t4|t5|t6|t7 ++- x1Val = _mm256_load_ps(bPtr+8); ++- x0loVal = _mm256_unpacklo_ps(x0Val, x0Val); // t0|t0|t1|t1|t4|t4|t5|t5 ++- x0hiVal = _mm256_unpackhi_ps(x0Val, x0Val); // t2|t2|t3|t3|t6|t6|t7|t7 ++- x1loVal = _mm256_unpacklo_ps(x1Val, x1Val); ++- x1hiVal = _mm256_unpackhi_ps(x1Val, x1Val); ++- ++- // TODO: it may be possible to rearrange swizzling to better pipeline data ++- b0Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x20); // t0|t0|t1|t1|t2|t2|t3|t3 ++- b1Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x31); // t4|t4|t5|t5|t6|t6|t7|t7 ++- b2Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x20); ++- b3Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x31); ++- ++- c0Val = _mm256_mul_ps(a0Val, b0Val); ++- c1Val = _mm256_mul_ps(a1Val, b1Val); ++- c2Val = _mm256_mul_ps(a2Val, b2Val); ++- c3Val = _mm256_mul_ps(a3Val, b3Val); ++- ++- dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0); ++- dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1); ++- dotProdVal2 = _mm256_add_ps(c2Val, dotProdVal2); ++- dotProdVal3 = _mm256_add_ps(c3Val, dotProdVal3); ++- ++- aPtr += 32; ++- bPtr += 16; ++- } ++- ++- dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1); ++- dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2); ++- dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3); ++- ++- __VOLK_ATTR_ALIGNED(32) float dotProductVector[8]; ++- ++- _mm256_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector ++- ++- *realpt = dotProductVector[0]; ++- *imagpt = dotProductVector[1]; ++- *realpt += dotProductVector[2]; ++- *imagpt += dotProductVector[3]; ++- *realpt += dotProductVector[4]; ++- *imagpt += dotProductVector[5]; ++- *realpt += dotProductVector[6]; ++- *imagpt += dotProductVector[7]; ++- ++- number = sixteenthPoints*16; ++- for(;number < num_points; number++){ ++- *realpt += ((*aPtr++) * (*bPtr)); ++- *imagpt += ((*aPtr++) * (*bPtr++)); ++- } ++- ++- *result = *(lv_32fc_t*)(&res[0]); +++static inline void volk_32fc_32f_dot_prod_32fc_a_avx(lv_32fc_t* result, +++ const lv_32fc_t* input, +++ const float* taps, +++ unsigned int num_points) +++{ +++ +++ unsigned int number = 0; +++ const unsigned int sixteenthPoints = num_points / 16; +++ +++ float res[2]; +++ float *realpt = &res[0], *imagpt = &res[1]; +++ const float* aPtr = (float*)input; +++ const float* bPtr = taps; +++ +++ __m256 a0Val, a1Val, a2Val, a3Val; +++ __m256 b0Val, b1Val, b2Val, b3Val; +++ __m256 x0Val, x1Val, x0loVal, x0hiVal, x1loVal, x1hiVal; +++ __m256 c0Val, c1Val, c2Val, c3Val; +++ +++ __m256 dotProdVal0 = _mm256_setzero_ps(); +++ __m256 dotProdVal1 = _mm256_setzero_ps(); +++ __m256 dotProdVal2 = _mm256_setzero_ps(); +++ __m256 dotProdVal3 = _mm256_setzero_ps(); +++ +++ for (; number < sixteenthPoints; number++) { +++ +++ a0Val = _mm256_load_ps(aPtr); +++ a1Val = _mm256_load_ps(aPtr + 8); +++ a2Val = 
_mm256_load_ps(aPtr + 16); +++ a3Val = _mm256_load_ps(aPtr + 24); +++ +++ x0Val = _mm256_load_ps(bPtr); // t0|t1|t2|t3|t4|t5|t6|t7 +++ x1Val = _mm256_load_ps(bPtr + 8); +++ x0loVal = _mm256_unpacklo_ps(x0Val, x0Val); // t0|t0|t1|t1|t4|t4|t5|t5 +++ x0hiVal = _mm256_unpackhi_ps(x0Val, x0Val); // t2|t2|t3|t3|t6|t6|t7|t7 +++ x1loVal = _mm256_unpacklo_ps(x1Val, x1Val); +++ x1hiVal = _mm256_unpackhi_ps(x1Val, x1Val); +++ +++ // TODO: it may be possible to rearrange swizzling to better pipeline data +++ b0Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x20); // t0|t0|t1|t1|t2|t2|t3|t3 +++ b1Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x31); // t4|t4|t5|t5|t6|t6|t7|t7 +++ b2Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x20); +++ b3Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x31); +++ +++ c0Val = _mm256_mul_ps(a0Val, b0Val); +++ c1Val = _mm256_mul_ps(a1Val, b1Val); +++ c2Val = _mm256_mul_ps(a2Val, b2Val); +++ c3Val = _mm256_mul_ps(a3Val, b3Val); +++ +++ dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0); +++ dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1); +++ dotProdVal2 = _mm256_add_ps(c2Val, dotProdVal2); +++ dotProdVal3 = _mm256_add_ps(c3Val, dotProdVal3); +++ +++ aPtr += 32; +++ bPtr += 16; +++ } +++ +++ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1); +++ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2); +++ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3); +++ +++ __VOLK_ATTR_ALIGNED(32) float dotProductVector[8]; +++ +++ _mm256_store_ps(dotProductVector, +++ dotProdVal0); // Store the results back into the dot product vector +++ +++ *realpt = dotProductVector[0]; +++ *imagpt = dotProductVector[1]; +++ *realpt += dotProductVector[2]; +++ *imagpt += dotProductVector[3]; +++ *realpt += dotProductVector[4]; +++ *imagpt += dotProductVector[5]; +++ *realpt += dotProductVector[6]; +++ *imagpt += dotProductVector[7]; +++ +++ number = sixteenthPoints * 16; +++ for (; number < num_points; number++) { +++ *realpt += ((*aPtr++) * (*bPtr)); +++ *imagpt += ((*aPtr++) * (*bPtr++)); +++ } +++ +++ *result = *(lv_32fc_t*)(&res[0]); ++ } ++ ++ #endif /*LV_HAVE_AVX*/ ++ ++ ++- ++- ++ #ifdef LV_HAVE_SSE ++ ++ ++-static inline void volk_32fc_32f_dot_prod_32fc_a_sse( lv_32fc_t* result, const lv_32fc_t* input, const float* taps, unsigned int num_points) { ++- ++- unsigned int number = 0; ++- const unsigned int sixteenthPoints = num_points / 8; ++- ++- float res[2]; ++- float *realpt = &res[0], *imagpt = &res[1]; ++- const float* aPtr = (float*)input; ++- const float* bPtr = taps; ++- ++- __m128 a0Val, a1Val, a2Val, a3Val; ++- __m128 b0Val, b1Val, b2Val, b3Val; ++- __m128 x0Val, x1Val, x2Val, x3Val; ++- __m128 c0Val, c1Val, c2Val, c3Val; ++- ++- __m128 dotProdVal0 = _mm_setzero_ps(); ++- __m128 dotProdVal1 = _mm_setzero_ps(); ++- __m128 dotProdVal2 = _mm_setzero_ps(); ++- __m128 dotProdVal3 = _mm_setzero_ps(); ++- ++- for(;number < sixteenthPoints; number++){ ++- ++- a0Val = _mm_load_ps(aPtr); ++- a1Val = _mm_load_ps(aPtr+4); ++- a2Val = _mm_load_ps(aPtr+8); ++- a3Val = _mm_load_ps(aPtr+12); ++- ++- x0Val = _mm_load_ps(bPtr); ++- x1Val = _mm_load_ps(bPtr); ++- x2Val = _mm_load_ps(bPtr+4); ++- x3Val = _mm_load_ps(bPtr+4); ++- b0Val = _mm_unpacklo_ps(x0Val, x1Val); ++- b1Val = _mm_unpackhi_ps(x0Val, x1Val); ++- b2Val = _mm_unpacklo_ps(x2Val, x3Val); ++- b3Val = _mm_unpackhi_ps(x2Val, x3Val); ++- ++- c0Val = _mm_mul_ps(a0Val, b0Val); ++- c1Val = _mm_mul_ps(a1Val, b1Val); ++- c2Val = _mm_mul_ps(a2Val, b2Val); ++- c3Val = _mm_mul_ps(a3Val, b3Val); ++- ++- dotProdVal0 = _mm_add_ps(c0Val, 
dotProdVal0); ++- dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1); ++- dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2); ++- dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3); ++- ++- aPtr += 16; ++- bPtr += 8; ++- } ++- ++- dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1); ++- dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2); ++- dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3); ++- ++- __VOLK_ATTR_ALIGNED(16) float dotProductVector[4]; ++- ++- _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector ++- ++- *realpt = dotProductVector[0]; ++- *imagpt = dotProductVector[1]; ++- *realpt += dotProductVector[2]; ++- *imagpt += dotProductVector[3]; ++- ++- number = sixteenthPoints*8; ++- for(;number < num_points; number++){ ++- *realpt += ((*aPtr++) * (*bPtr)); ++- *imagpt += ((*aPtr++) * (*bPtr++)); ++- } ++- ++- *result = *(lv_32fc_t*)(&res[0]); +++static inline void volk_32fc_32f_dot_prod_32fc_a_sse(lv_32fc_t* result, +++ const lv_32fc_t* input, +++ const float* taps, +++ unsigned int num_points) +++{ +++ +++ unsigned int number = 0; +++ const unsigned int sixteenthPoints = num_points / 8; +++ +++ float res[2]; +++ float *realpt = &res[0], *imagpt = &res[1]; +++ const float* aPtr = (float*)input; +++ const float* bPtr = taps; +++ +++ __m128 a0Val, a1Val, a2Val, a3Val; +++ __m128 b0Val, b1Val, b2Val, b3Val; +++ __m128 x0Val, x1Val, x2Val, x3Val; +++ __m128 c0Val, c1Val, c2Val, c3Val; +++ +++ __m128 dotProdVal0 = _mm_setzero_ps(); +++ __m128 dotProdVal1 = _mm_setzero_ps(); +++ __m128 dotProdVal2 = _mm_setzero_ps(); +++ __m128 dotProdVal3 = _mm_setzero_ps(); +++ +++ for (; number < sixteenthPoints; number++) { +++ +++ a0Val = _mm_load_ps(aPtr); +++ a1Val = _mm_load_ps(aPtr + 4); +++ a2Val = _mm_load_ps(aPtr + 8); +++ a3Val = _mm_load_ps(aPtr + 12); +++ +++ x0Val = _mm_load_ps(bPtr); +++ x1Val = _mm_load_ps(bPtr); +++ x2Val = _mm_load_ps(bPtr + 4); +++ x3Val = _mm_load_ps(bPtr + 4); +++ b0Val = _mm_unpacklo_ps(x0Val, x1Val); +++ b1Val = _mm_unpackhi_ps(x0Val, x1Val); +++ b2Val = _mm_unpacklo_ps(x2Val, x3Val); +++ b3Val = _mm_unpackhi_ps(x2Val, x3Val); +++ +++ c0Val = _mm_mul_ps(a0Val, b0Val); +++ c1Val = _mm_mul_ps(a1Val, b1Val); +++ c2Val = _mm_mul_ps(a2Val, b2Val); +++ c3Val = _mm_mul_ps(a3Val, b3Val); +++ +++ dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0); +++ dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1); +++ dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2); +++ dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3); +++ +++ aPtr += 16; +++ bPtr += 8; +++ } +++ +++ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1); +++ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2); +++ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3); +++ +++ __VOLK_ATTR_ALIGNED(16) float dotProductVector[4]; +++ +++ _mm_store_ps(dotProductVector, +++ dotProdVal0); // Store the results back into the dot product vector +++ +++ *realpt = dotProductVector[0]; +++ *imagpt = dotProductVector[1]; +++ *realpt += dotProductVector[2]; +++ *imagpt += dotProductVector[3]; +++ +++ number = sixteenthPoints * 8; +++ for (; number < num_points; number++) { +++ *realpt += ((*aPtr++) * (*bPtr)); +++ *imagpt += ((*aPtr++) * (*bPtr++)); +++ } +++ +++ *result = *(lv_32fc_t*)(&res[0]); ++ } ++ ++ #endif /*LV_HAVE_SSE*/ ++@@ -339,78 +356,83 @@ static inline void volk_32fc_32f_dot_prod_32fc_a_sse( lv_32fc_t* result, const ++ ++ #include ++ ++-static inline void volk_32fc_32f_dot_prod_32fc_u_avx2_fma( lv_32fc_t* result, const lv_32fc_t* input, const float* taps, unsigned int num_points) { ++- ++- unsigned int number = 0; 
++- const unsigned int sixteenthPoints = num_points / 16; ++- ++- float res[2]; ++- float *realpt = &res[0], *imagpt = &res[1]; ++- const float* aPtr = (float*)input; ++- const float* bPtr = taps; ++- ++- __m256 a0Val, a1Val, a2Val, a3Val; ++- __m256 b0Val, b1Val, b2Val, b3Val; ++- __m256 x0Val, x1Val, x0loVal, x0hiVal, x1loVal, x1hiVal; ++- ++- __m256 dotProdVal0 = _mm256_setzero_ps(); ++- __m256 dotProdVal1 = _mm256_setzero_ps(); ++- __m256 dotProdVal2 = _mm256_setzero_ps(); ++- __m256 dotProdVal3 = _mm256_setzero_ps(); ++- ++- for(;number < sixteenthPoints; number++){ ++- ++- a0Val = _mm256_loadu_ps(aPtr); ++- a1Val = _mm256_loadu_ps(aPtr+8); ++- a2Val = _mm256_loadu_ps(aPtr+16); ++- a3Val = _mm256_loadu_ps(aPtr+24); ++- ++- x0Val = _mm256_load_ps(bPtr); // t0|t1|t2|t3|t4|t5|t6|t7 ++- x1Val = _mm256_load_ps(bPtr+8); ++- x0loVal = _mm256_unpacklo_ps(x0Val, x0Val); // t0|t0|t1|t1|t4|t4|t5|t5 ++- x0hiVal = _mm256_unpackhi_ps(x0Val, x0Val); // t2|t2|t3|t3|t6|t6|t7|t7 ++- x1loVal = _mm256_unpacklo_ps(x1Val, x1Val); ++- x1hiVal = _mm256_unpackhi_ps(x1Val, x1Val); ++- ++- // TODO: it may be possible to rearrange swizzling to better pipeline data ++- b0Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x20); // t0|t0|t1|t1|t2|t2|t3|t3 ++- b1Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x31); // t4|t4|t5|t5|t6|t6|t7|t7 ++- b2Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x20); ++- b3Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x31); ++- ++- dotProdVal0 = _mm256_fmadd_ps(a0Val, b0Val, dotProdVal0); ++- dotProdVal1 = _mm256_fmadd_ps(a1Val, b1Val, dotProdVal1); ++- dotProdVal2 = _mm256_fmadd_ps(a2Val, b2Val, dotProdVal2); ++- dotProdVal3 = _mm256_fmadd_ps(a3Val, b3Val, dotProdVal3); ++- ++- aPtr += 32; ++- bPtr += 16; ++- } ++- ++- dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1); ++- dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2); ++- dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3); ++- ++- __VOLK_ATTR_ALIGNED(32) float dotProductVector[8]; ++- ++- _mm256_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector ++- ++- *realpt = dotProductVector[0]; ++- *imagpt = dotProductVector[1]; ++- *realpt += dotProductVector[2]; ++- *imagpt += dotProductVector[3]; ++- *realpt += dotProductVector[4]; ++- *imagpt += dotProductVector[5]; ++- *realpt += dotProductVector[6]; ++- *imagpt += dotProductVector[7]; ++- ++- number = sixteenthPoints*16; ++- for(;number < num_points; number++){ ++- *realpt += ((*aPtr++) * (*bPtr)); ++- *imagpt += ((*aPtr++) * (*bPtr++)); ++- } ++- ++- *result = *(lv_32fc_t*)(&res[0]); +++static inline void volk_32fc_32f_dot_prod_32fc_u_avx2_fma(lv_32fc_t* result, +++ const lv_32fc_t* input, +++ const float* taps, +++ unsigned int num_points) +++{ +++ +++ unsigned int number = 0; +++ const unsigned int sixteenthPoints = num_points / 16; +++ +++ float res[2]; +++ float *realpt = &res[0], *imagpt = &res[1]; +++ const float* aPtr = (float*)input; +++ const float* bPtr = taps; +++ +++ __m256 a0Val, a1Val, a2Val, a3Val; +++ __m256 b0Val, b1Val, b2Val, b3Val; +++ __m256 x0Val, x1Val, x0loVal, x0hiVal, x1loVal, x1hiVal; +++ +++ __m256 dotProdVal0 = _mm256_setzero_ps(); +++ __m256 dotProdVal1 = _mm256_setzero_ps(); +++ __m256 dotProdVal2 = _mm256_setzero_ps(); +++ __m256 dotProdVal3 = _mm256_setzero_ps(); +++ +++ for (; number < sixteenthPoints; number++) { +++ +++ a0Val = _mm256_loadu_ps(aPtr); +++ a1Val = _mm256_loadu_ps(aPtr + 8); +++ a2Val = _mm256_loadu_ps(aPtr + 16); +++ a3Val = _mm256_loadu_ps(aPtr + 24); +++ +++ x0Val = 
_mm256_load_ps(bPtr); // t0|t1|t2|t3|t4|t5|t6|t7 +++ x1Val = _mm256_load_ps(bPtr + 8); +++ x0loVal = _mm256_unpacklo_ps(x0Val, x0Val); // t0|t0|t1|t1|t4|t4|t5|t5 +++ x0hiVal = _mm256_unpackhi_ps(x0Val, x0Val); // t2|t2|t3|t3|t6|t6|t7|t7 +++ x1loVal = _mm256_unpacklo_ps(x1Val, x1Val); +++ x1hiVal = _mm256_unpackhi_ps(x1Val, x1Val); +++ +++ // TODO: it may be possible to rearrange swizzling to better pipeline data +++ b0Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x20); // t0|t0|t1|t1|t2|t2|t3|t3 +++ b1Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x31); // t4|t4|t5|t5|t6|t6|t7|t7 +++ b2Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x20); +++ b3Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x31); +++ +++ dotProdVal0 = _mm256_fmadd_ps(a0Val, b0Val, dotProdVal0); +++ dotProdVal1 = _mm256_fmadd_ps(a1Val, b1Val, dotProdVal1); +++ dotProdVal2 = _mm256_fmadd_ps(a2Val, b2Val, dotProdVal2); +++ dotProdVal3 = _mm256_fmadd_ps(a3Val, b3Val, dotProdVal3); +++ +++ aPtr += 32; +++ bPtr += 16; +++ } +++ +++ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1); +++ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2); +++ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3); +++ +++ __VOLK_ATTR_ALIGNED(32) float dotProductVector[8]; +++ +++ _mm256_store_ps(dotProductVector, +++ dotProdVal0); // Store the results back into the dot product vector +++ +++ *realpt = dotProductVector[0]; +++ *imagpt = dotProductVector[1]; +++ *realpt += dotProductVector[2]; +++ *imagpt += dotProductVector[3]; +++ *realpt += dotProductVector[4]; +++ *imagpt += dotProductVector[5]; +++ *realpt += dotProductVector[6]; +++ *imagpt += dotProductVector[7]; +++ +++ number = sixteenthPoints * 16; +++ for (; number < num_points; number++) { +++ *realpt += ((*aPtr++) * (*bPtr)); +++ *imagpt += ((*aPtr++) * (*bPtr++)); +++ } +++ +++ *result = *(lv_32fc_t*)(&res[0]); ++ } ++ ++ #endif /*LV_HAVE_AVX2 && LV_HAVE_FMA*/ ++@@ -419,162 +441,172 @@ static inline void volk_32fc_32f_dot_prod_32fc_u_avx2_fma( lv_32fc_t* result, co ++ ++ #include ++ ++-static inline void volk_32fc_32f_dot_prod_32fc_u_avx( lv_32fc_t* result, const lv_32fc_t* input, const float* taps, unsigned int num_points) { ++- ++- unsigned int number = 0; ++- const unsigned int sixteenthPoints = num_points / 16; ++- ++- float res[2]; ++- float *realpt = &res[0], *imagpt = &res[1]; ++- const float* aPtr = (float*)input; ++- const float* bPtr = taps; ++- ++- __m256 a0Val, a1Val, a2Val, a3Val; ++- __m256 b0Val, b1Val, b2Val, b3Val; ++- __m256 x0Val, x1Val, x0loVal, x0hiVal, x1loVal, x1hiVal; ++- __m256 c0Val, c1Val, c2Val, c3Val; ++- ++- __m256 dotProdVal0 = _mm256_setzero_ps(); ++- __m256 dotProdVal1 = _mm256_setzero_ps(); ++- __m256 dotProdVal2 = _mm256_setzero_ps(); ++- __m256 dotProdVal3 = _mm256_setzero_ps(); ++- ++- for(;number < sixteenthPoints; number++){ ++- ++- a0Val = _mm256_loadu_ps(aPtr); ++- a1Val = _mm256_loadu_ps(aPtr+8); ++- a2Val = _mm256_loadu_ps(aPtr+16); ++- a3Val = _mm256_loadu_ps(aPtr+24); ++- ++- x0Val = _mm256_loadu_ps(bPtr); // t0|t1|t2|t3|t4|t5|t6|t7 ++- x1Val = _mm256_loadu_ps(bPtr+8); ++- x0loVal = _mm256_unpacklo_ps(x0Val, x0Val); // t0|t0|t1|t1|t4|t4|t5|t5 ++- x0hiVal = _mm256_unpackhi_ps(x0Val, x0Val); // t2|t2|t3|t3|t6|t6|t7|t7 ++- x1loVal = _mm256_unpacklo_ps(x1Val, x1Val); ++- x1hiVal = _mm256_unpackhi_ps(x1Val, x1Val); ++- ++- // TODO: it may be possible to rearrange swizzling to better pipeline data ++- b0Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x20); // t0|t0|t1|t1|t2|t2|t3|t3 ++- b1Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 
0x31); // t4|t4|t5|t5|t6|t6|t7|t7 ++- b2Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x20); ++- b3Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x31); ++- ++- c0Val = _mm256_mul_ps(a0Val, b0Val); ++- c1Val = _mm256_mul_ps(a1Val, b1Val); ++- c2Val = _mm256_mul_ps(a2Val, b2Val); ++- c3Val = _mm256_mul_ps(a3Val, b3Val); ++- ++- dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0); ++- dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1); ++- dotProdVal2 = _mm256_add_ps(c2Val, dotProdVal2); ++- dotProdVal3 = _mm256_add_ps(c3Val, dotProdVal3); ++- ++- aPtr += 32; ++- bPtr += 16; ++- } ++- ++- dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1); ++- dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2); ++- dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3); ++- ++- __VOLK_ATTR_ALIGNED(32) float dotProductVector[8]; ++- ++- _mm256_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector ++- ++- *realpt = dotProductVector[0]; ++- *imagpt = dotProductVector[1]; ++- *realpt += dotProductVector[2]; ++- *imagpt += dotProductVector[3]; ++- *realpt += dotProductVector[4]; ++- *imagpt += dotProductVector[5]; ++- *realpt += dotProductVector[6]; ++- *imagpt += dotProductVector[7]; ++- ++- number = sixteenthPoints*16; ++- for(;number < num_points; number++){ ++- *realpt += ((*aPtr++) * (*bPtr)); ++- *imagpt += ((*aPtr++) * (*bPtr++)); ++- } ++- ++- *result = *(lv_32fc_t*)(&res[0]); +++static inline void volk_32fc_32f_dot_prod_32fc_u_avx(lv_32fc_t* result, +++ const lv_32fc_t* input, +++ const float* taps, +++ unsigned int num_points) +++{ +++ +++ unsigned int number = 0; +++ const unsigned int sixteenthPoints = num_points / 16; +++ +++ float res[2]; +++ float *realpt = &res[0], *imagpt = &res[1]; +++ const float* aPtr = (float*)input; +++ const float* bPtr = taps; +++ +++ __m256 a0Val, a1Val, a2Val, a3Val; +++ __m256 b0Val, b1Val, b2Val, b3Val; +++ __m256 x0Val, x1Val, x0loVal, x0hiVal, x1loVal, x1hiVal; +++ __m256 c0Val, c1Val, c2Val, c3Val; +++ +++ __m256 dotProdVal0 = _mm256_setzero_ps(); +++ __m256 dotProdVal1 = _mm256_setzero_ps(); +++ __m256 dotProdVal2 = _mm256_setzero_ps(); +++ __m256 dotProdVal3 = _mm256_setzero_ps(); +++ +++ for (; number < sixteenthPoints; number++) { +++ +++ a0Val = _mm256_loadu_ps(aPtr); +++ a1Val = _mm256_loadu_ps(aPtr + 8); +++ a2Val = _mm256_loadu_ps(aPtr + 16); +++ a3Val = _mm256_loadu_ps(aPtr + 24); +++ +++ x0Val = _mm256_loadu_ps(bPtr); // t0|t1|t2|t3|t4|t5|t6|t7 +++ x1Val = _mm256_loadu_ps(bPtr + 8); +++ x0loVal = _mm256_unpacklo_ps(x0Val, x0Val); // t0|t0|t1|t1|t4|t4|t5|t5 +++ x0hiVal = _mm256_unpackhi_ps(x0Val, x0Val); // t2|t2|t3|t3|t6|t6|t7|t7 +++ x1loVal = _mm256_unpacklo_ps(x1Val, x1Val); +++ x1hiVal = _mm256_unpackhi_ps(x1Val, x1Val); +++ +++ // TODO: it may be possible to rearrange swizzling to better pipeline data +++ b0Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x20); // t0|t0|t1|t1|t2|t2|t3|t3 +++ b1Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x31); // t4|t4|t5|t5|t6|t6|t7|t7 +++ b2Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x20); +++ b3Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x31); +++ +++ c0Val = _mm256_mul_ps(a0Val, b0Val); +++ c1Val = _mm256_mul_ps(a1Val, b1Val); +++ c2Val = _mm256_mul_ps(a2Val, b2Val); +++ c3Val = _mm256_mul_ps(a3Val, b3Val); +++ +++ dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0); +++ dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1); +++ dotProdVal2 = _mm256_add_ps(c2Val, dotProdVal2); +++ dotProdVal3 = _mm256_add_ps(c3Val, dotProdVal3); +++ +++ aPtr += 32; +++ bPtr += 16; +++ } +++ 
+++ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1); +++ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2); +++ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3); +++ +++ __VOLK_ATTR_ALIGNED(32) float dotProductVector[8]; +++ +++ _mm256_store_ps(dotProductVector, +++ dotProdVal0); // Store the results back into the dot product vector +++ +++ *realpt = dotProductVector[0]; +++ *imagpt = dotProductVector[1]; +++ *realpt += dotProductVector[2]; +++ *imagpt += dotProductVector[3]; +++ *realpt += dotProductVector[4]; +++ *imagpt += dotProductVector[5]; +++ *realpt += dotProductVector[6]; +++ *imagpt += dotProductVector[7]; +++ +++ number = sixteenthPoints * 16; +++ for (; number < num_points; number++) { +++ *realpt += ((*aPtr++) * (*bPtr)); +++ *imagpt += ((*aPtr++) * (*bPtr++)); +++ } +++ +++ *result = *(lv_32fc_t*)(&res[0]); ++ } ++ #endif /*LV_HAVE_AVX*/ ++ ++ #ifdef LV_HAVE_NEON ++ #include ++ ++-static inline void volk_32fc_32f_dot_prod_32fc_neon_unroll ( lv_32fc_t* __restrict result, const lv_32fc_t* __restrict input, const float* __restrict taps, unsigned int num_points) { ++- ++- unsigned int number; ++- const unsigned int quarterPoints = num_points / 8; ++- ++- float res[2]; ++- float *realpt = &res[0], *imagpt = &res[1]; ++- const float* inputPtr = (float*)input; ++- const float* tapsPtr = taps; ++- float zero[4] = {0.0f, 0.0f, 0.0f, 0.0f }; ++- float accVector_real[4]; ++- float accVector_imag[4]; ++- ++- float32x4x2_t inputVector0, inputVector1; ++- float32x4_t tapsVector0, tapsVector1; ++- float32x4_t tmp_real0, tmp_imag0; ++- float32x4_t tmp_real1, tmp_imag1; ++- float32x4_t real_accumulator0, imag_accumulator0; ++- float32x4_t real_accumulator1, imag_accumulator1; ++- ++- // zero out accumulators ++- // take a *float, return float32x4_t ++- real_accumulator0 = vld1q_f32( zero ); ++- imag_accumulator0 = vld1q_f32( zero ); ++- real_accumulator1 = vld1q_f32( zero ); ++- imag_accumulator1 = vld1q_f32( zero ); ++- ++- for(number=0 ;number < quarterPoints; number++){ ++- // load doublewords and duplicate in to second lane ++- tapsVector0 = vld1q_f32(tapsPtr ); ++- tapsVector1 = vld1q_f32(tapsPtr+4 ); ++- ++- // load quadword of complex numbers in to 2 lanes. 
1st lane is real, 2dn imag ++- inputVector0 = vld2q_f32(inputPtr ); ++- inputVector1 = vld2q_f32(inputPtr+8 ); ++- // inputVector is now a struct of two vectors, 0th is real, 1st is imag ++- ++- tmp_real0 = vmulq_f32(tapsVector0, inputVector0.val[0]); ++- tmp_imag0 = vmulq_f32(tapsVector0, inputVector0.val[1]); ++- ++- tmp_real1 = vmulq_f32(tapsVector1, inputVector1.val[0]); ++- tmp_imag1 = vmulq_f32(tapsVector1, inputVector1.val[1]); ++- ++- real_accumulator0 = vaddq_f32(real_accumulator0, tmp_real0); ++- imag_accumulator0 = vaddq_f32(imag_accumulator0, tmp_imag0); ++- ++- real_accumulator1 = vaddq_f32(real_accumulator1, tmp_real1); ++- imag_accumulator1 = vaddq_f32(imag_accumulator1, tmp_imag1); ++- ++- tapsPtr += 8; ++- inputPtr += 16; ++- } ++- ++- real_accumulator0 = vaddq_f32( real_accumulator0, real_accumulator1); ++- imag_accumulator0 = vaddq_f32( imag_accumulator0, imag_accumulator1); ++- // void vst1q_f32( float32_t * ptr, float32x4_t val); ++- // store results back to a complex (array of 2 floats) ++- vst1q_f32(accVector_real, real_accumulator0); ++- vst1q_f32(accVector_imag, imag_accumulator0); ++- *realpt = accVector_real[0] + accVector_real[1] + ++- accVector_real[2] + accVector_real[3] ; ++- ++- *imagpt = accVector_imag[0] + accVector_imag[1] + ++- accVector_imag[2] + accVector_imag[3] ; ++- ++- // clean up the remainder ++- for(number=quarterPoints*8; number < num_points; number++){ ++- *realpt += ((*inputPtr++) * (*tapsPtr)); ++- *imagpt += ((*inputPtr++) * (*tapsPtr++)); ++- } ++- ++- *result = *(lv_32fc_t*)(&res[0]); +++static inline void +++volk_32fc_32f_dot_prod_32fc_neon_unroll(lv_32fc_t* __restrict result, +++ const lv_32fc_t* __restrict input, +++ const float* __restrict taps, +++ unsigned int num_points) +++{ +++ +++ unsigned int number; +++ const unsigned int quarterPoints = num_points / 8; +++ +++ float res[2]; +++ float *realpt = &res[0], *imagpt = &res[1]; +++ const float* inputPtr = (float*)input; +++ const float* tapsPtr = taps; +++ float zero[4] = { 0.0f, 0.0f, 0.0f, 0.0f }; +++ float accVector_real[4]; +++ float accVector_imag[4]; +++ +++ float32x4x2_t inputVector0, inputVector1; +++ float32x4_t tapsVector0, tapsVector1; +++ float32x4_t tmp_real0, tmp_imag0; +++ float32x4_t tmp_real1, tmp_imag1; +++ float32x4_t real_accumulator0, imag_accumulator0; +++ float32x4_t real_accumulator1, imag_accumulator1; +++ +++ // zero out accumulators +++ // take a *float, return float32x4_t +++ real_accumulator0 = vld1q_f32(zero); +++ imag_accumulator0 = vld1q_f32(zero); +++ real_accumulator1 = vld1q_f32(zero); +++ imag_accumulator1 = vld1q_f32(zero); +++ +++ for (number = 0; number < quarterPoints; number++) { +++ // load doublewords and duplicate in to second lane +++ tapsVector0 = vld1q_f32(tapsPtr); +++ tapsVector1 = vld1q_f32(tapsPtr + 4); +++ +++ // load quadword of complex numbers in to 2 lanes. 
1st lane is real, 2dn imag +++ inputVector0 = vld2q_f32(inputPtr); +++ inputVector1 = vld2q_f32(inputPtr + 8); +++ // inputVector is now a struct of two vectors, 0th is real, 1st is imag +++ +++ tmp_real0 = vmulq_f32(tapsVector0, inputVector0.val[0]); +++ tmp_imag0 = vmulq_f32(tapsVector0, inputVector0.val[1]); +++ +++ tmp_real1 = vmulq_f32(tapsVector1, inputVector1.val[0]); +++ tmp_imag1 = vmulq_f32(tapsVector1, inputVector1.val[1]); +++ +++ real_accumulator0 = vaddq_f32(real_accumulator0, tmp_real0); +++ imag_accumulator0 = vaddq_f32(imag_accumulator0, tmp_imag0); +++ +++ real_accumulator1 = vaddq_f32(real_accumulator1, tmp_real1); +++ imag_accumulator1 = vaddq_f32(imag_accumulator1, tmp_imag1); +++ +++ tapsPtr += 8; +++ inputPtr += 16; +++ } +++ +++ real_accumulator0 = vaddq_f32(real_accumulator0, real_accumulator1); +++ imag_accumulator0 = vaddq_f32(imag_accumulator0, imag_accumulator1); +++ // void vst1q_f32( float32_t * ptr, float32x4_t val); +++ // store results back to a complex (array of 2 floats) +++ vst1q_f32(accVector_real, real_accumulator0); +++ vst1q_f32(accVector_imag, imag_accumulator0); +++ *realpt = +++ accVector_real[0] + accVector_real[1] + accVector_real[2] + accVector_real[3]; +++ +++ *imagpt = +++ accVector_imag[0] + accVector_imag[1] + accVector_imag[2] + accVector_imag[3]; +++ +++ // clean up the remainder +++ for (number = quarterPoints * 8; number < num_points; number++) { +++ *realpt += ((*inputPtr++) * (*tapsPtr)); +++ *imagpt += ((*inputPtr++) * (*tapsPtr++)); +++ } +++ +++ *result = *(lv_32fc_t*)(&res[0]); ++ } ++ ++ #endif /*LV_HAVE_NEON*/ ++@@ -582,154 +614,171 @@ static inline void volk_32fc_32f_dot_prod_32fc_neon_unroll ( lv_32fc_t* __restri ++ #ifdef LV_HAVE_NEON ++ #include ++ ++-static inline void volk_32fc_32f_dot_prod_32fc_a_neon ( lv_32fc_t* __restrict result, const lv_32fc_t* __restrict input, const float* __restrict taps, unsigned int num_points) { ++- ++- unsigned int number; ++- const unsigned int quarterPoints = num_points / 4; +++static inline void volk_32fc_32f_dot_prod_32fc_a_neon(lv_32fc_t* __restrict result, +++ const lv_32fc_t* __restrict input, +++ const float* __restrict taps, +++ unsigned int num_points) +++{ ++ ++- float res[2]; ++- float *realpt = &res[0], *imagpt = &res[1]; ++- const float* inputPtr = (float*)input; ++- const float* tapsPtr = taps; ++- float zero[4] = {0.0f, 0.0f, 0.0f, 0.0f }; ++- float accVector_real[4]; ++- float accVector_imag[4]; +++ unsigned int number; +++ const unsigned int quarterPoints = num_points / 4; ++ ++- float32x4x2_t inputVector; ++- float32x4_t tapsVector; ++- float32x4_t tmp_real, tmp_imag; ++- float32x4_t real_accumulator, imag_accumulator; +++ float res[2]; +++ float *realpt = &res[0], *imagpt = &res[1]; +++ const float* inputPtr = (float*)input; +++ const float* tapsPtr = taps; +++ float zero[4] = { 0.0f, 0.0f, 0.0f, 0.0f }; +++ float accVector_real[4]; +++ float accVector_imag[4]; ++ +++ float32x4x2_t inputVector; +++ float32x4_t tapsVector; +++ float32x4_t tmp_real, tmp_imag; +++ float32x4_t real_accumulator, imag_accumulator; ++ ++- // zero out accumulators ++- // take a *float, return float32x4_t ++- real_accumulator = vld1q_f32( zero ); ++- imag_accumulator = vld1q_f32( zero ); ++ ++- for(number=0 ;number < quarterPoints; number++){ ++- // load taps ( float32x2x2_t = vld1q_f32( float32_t const * ptr) ) ++- // load doublewords and duplicate in to second lane ++- tapsVector = vld1q_f32(tapsPtr ); +++ // zero out accumulators +++ // take a *float, return float32x4_t +++ real_accumulator = 
vld1q_f32(zero); +++ imag_accumulator = vld1q_f32(zero); ++ ++- // load quadword of complex numbers in to 2 lanes. 1st lane is real, 2dn imag ++- inputVector = vld2q_f32(inputPtr ); +++ for (number = 0; number < quarterPoints; number++) { +++ // load taps ( float32x2x2_t = vld1q_f32( float32_t const * ptr) ) +++ // load doublewords and duplicate in to second lane +++ tapsVector = vld1q_f32(tapsPtr); ++ ++- tmp_real = vmulq_f32(tapsVector, inputVector.val[0]); ++- tmp_imag = vmulq_f32(tapsVector, inputVector.val[1]); +++ // load quadword of complex numbers in to 2 lanes. 1st lane is real, 2dn imag +++ inputVector = vld2q_f32(inputPtr); ++ ++- real_accumulator = vaddq_f32(real_accumulator, tmp_real); ++- imag_accumulator = vaddq_f32(imag_accumulator, tmp_imag); +++ tmp_real = vmulq_f32(tapsVector, inputVector.val[0]); +++ tmp_imag = vmulq_f32(tapsVector, inputVector.val[1]); ++ +++ real_accumulator = vaddq_f32(real_accumulator, tmp_real); +++ imag_accumulator = vaddq_f32(imag_accumulator, tmp_imag); ++ ++- tapsPtr += 4; ++- inputPtr += 8; ++ ++- } +++ tapsPtr += 4; +++ inputPtr += 8; +++ } ++ ++- // store results back to a complex (array of 2 floats) ++- vst1q_f32(accVector_real, real_accumulator); ++- vst1q_f32(accVector_imag, imag_accumulator); ++- *realpt = accVector_real[0] + accVector_real[1] + ++- accVector_real[2] + accVector_real[3] ; +++ // store results back to a complex (array of 2 floats) +++ vst1q_f32(accVector_real, real_accumulator); +++ vst1q_f32(accVector_imag, imag_accumulator); +++ *realpt = +++ accVector_real[0] + accVector_real[1] + accVector_real[2] + accVector_real[3]; ++ ++- *imagpt = accVector_imag[0] + accVector_imag[1] + ++- accVector_imag[2] + accVector_imag[3] ; +++ *imagpt = +++ accVector_imag[0] + accVector_imag[1] + accVector_imag[2] + accVector_imag[3]; ++ ++- // clean up the remainder ++- for(number=quarterPoints*4; number < num_points; number++){ ++- *realpt += ((*inputPtr++) * (*tapsPtr)); ++- *imagpt += ((*inputPtr++) * (*tapsPtr++)); ++- } +++ // clean up the remainder +++ for (number = quarterPoints * 4; number < num_points; number++) { +++ *realpt += ((*inputPtr++) * (*tapsPtr)); +++ *imagpt += ((*inputPtr++) * (*tapsPtr++)); +++ } ++ ++- *result = *(lv_32fc_t*)(&res[0]); +++ *result = *(lv_32fc_t*)(&res[0]); ++ } ++ ++ #endif /*LV_HAVE_NEON*/ ++ ++ #ifdef LV_HAVE_NEONV7 ++-extern void volk_32fc_32f_dot_prod_32fc_a_neonasm ( lv_32fc_t* result, const lv_32fc_t* input, const float* taps, unsigned int num_points); +++extern void volk_32fc_32f_dot_prod_32fc_a_neonasm(lv_32fc_t* result, +++ const lv_32fc_t* input, +++ const float* taps, +++ unsigned int num_points); ++ #endif /*LV_HAVE_NEONV7*/ ++ ++ #ifdef LV_HAVE_NEONV7 ++-extern void volk_32fc_32f_dot_prod_32fc_a_neonasmvmla ( lv_32fc_t* result, const lv_32fc_t* input, const float* taps, unsigned int num_points); +++extern void volk_32fc_32f_dot_prod_32fc_a_neonasmvmla(lv_32fc_t* result, +++ const lv_32fc_t* input, +++ const float* taps, +++ unsigned int num_points); ++ #endif /*LV_HAVE_NEONV7*/ ++ ++ #ifdef LV_HAVE_NEONV7 ++-extern void volk_32fc_32f_dot_prod_32fc_a_neonpipeline ( lv_32fc_t* result, const lv_32fc_t* input, const float* taps, unsigned int num_points); +++extern void volk_32fc_32f_dot_prod_32fc_a_neonpipeline(lv_32fc_t* result, +++ const lv_32fc_t* input, +++ const float* taps, +++ unsigned int num_points); ++ #endif /*LV_HAVE_NEONV7*/ ++ ++ #ifdef LV_HAVE_SSE ++ ++-static inline void volk_32fc_32f_dot_prod_32fc_u_sse( lv_32fc_t* result, const lv_32fc_t* input, const float* taps, unsigned 
int num_points) { ++- ++- unsigned int number = 0; ++- const unsigned int sixteenthPoints = num_points / 8; ++- ++- float res[2]; ++- float *realpt = &res[0], *imagpt = &res[1]; ++- const float* aPtr = (float*)input; ++- const float* bPtr = taps; ++- ++- __m128 a0Val, a1Val, a2Val, a3Val; ++- __m128 b0Val, b1Val, b2Val, b3Val; ++- __m128 x0Val, x1Val, x2Val, x3Val; ++- __m128 c0Val, c1Val, c2Val, c3Val; ++- ++- __m128 dotProdVal0 = _mm_setzero_ps(); ++- __m128 dotProdVal1 = _mm_setzero_ps(); ++- __m128 dotProdVal2 = _mm_setzero_ps(); ++- __m128 dotProdVal3 = _mm_setzero_ps(); ++- ++- for(;number < sixteenthPoints; number++){ ++- ++- a0Val = _mm_loadu_ps(aPtr); ++- a1Val = _mm_loadu_ps(aPtr+4); ++- a2Val = _mm_loadu_ps(aPtr+8); ++- a3Val = _mm_loadu_ps(aPtr+12); ++- ++- x0Val = _mm_loadu_ps(bPtr); ++- x1Val = _mm_loadu_ps(bPtr); ++- x2Val = _mm_loadu_ps(bPtr+4); ++- x3Val = _mm_loadu_ps(bPtr+4); ++- b0Val = _mm_unpacklo_ps(x0Val, x1Val); ++- b1Val = _mm_unpackhi_ps(x0Val, x1Val); ++- b2Val = _mm_unpacklo_ps(x2Val, x3Val); ++- b3Val = _mm_unpackhi_ps(x2Val, x3Val); ++- ++- c0Val = _mm_mul_ps(a0Val, b0Val); ++- c1Val = _mm_mul_ps(a1Val, b1Val); ++- c2Val = _mm_mul_ps(a2Val, b2Val); ++- c3Val = _mm_mul_ps(a3Val, b3Val); ++- ++- dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0); ++- dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1); ++- dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2); ++- dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3); ++- ++- aPtr += 16; ++- bPtr += 8; ++- } ++- ++- dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1); ++- dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2); ++- dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3); ++- ++- __VOLK_ATTR_ALIGNED(16) float dotProductVector[4]; ++- ++- _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector ++- ++- *realpt = dotProductVector[0]; ++- *imagpt = dotProductVector[1]; ++- *realpt += dotProductVector[2]; ++- *imagpt += dotProductVector[3]; ++- ++- number = sixteenthPoints*8; ++- for(;number < num_points; number++){ ++- *realpt += ((*aPtr++) * (*bPtr)); ++- *imagpt += ((*aPtr++) * (*bPtr++)); ++- } ++- ++- *result = *(lv_32fc_t*)(&res[0]); +++static inline void volk_32fc_32f_dot_prod_32fc_u_sse(lv_32fc_t* result, +++ const lv_32fc_t* input, +++ const float* taps, +++ unsigned int num_points) +++{ +++ +++ unsigned int number = 0; +++ const unsigned int sixteenthPoints = num_points / 8; +++ +++ float res[2]; +++ float *realpt = &res[0], *imagpt = &res[1]; +++ const float* aPtr = (float*)input; +++ const float* bPtr = taps; +++ +++ __m128 a0Val, a1Val, a2Val, a3Val; +++ __m128 b0Val, b1Val, b2Val, b3Val; +++ __m128 x0Val, x1Val, x2Val, x3Val; +++ __m128 c0Val, c1Val, c2Val, c3Val; +++ +++ __m128 dotProdVal0 = _mm_setzero_ps(); +++ __m128 dotProdVal1 = _mm_setzero_ps(); +++ __m128 dotProdVal2 = _mm_setzero_ps(); +++ __m128 dotProdVal3 = _mm_setzero_ps(); +++ +++ for (; number < sixteenthPoints; number++) { +++ +++ a0Val = _mm_loadu_ps(aPtr); +++ a1Val = _mm_loadu_ps(aPtr + 4); +++ a2Val = _mm_loadu_ps(aPtr + 8); +++ a3Val = _mm_loadu_ps(aPtr + 12); +++ +++ x0Val = _mm_loadu_ps(bPtr); +++ x1Val = _mm_loadu_ps(bPtr); +++ x2Val = _mm_loadu_ps(bPtr + 4); +++ x3Val = _mm_loadu_ps(bPtr + 4); +++ b0Val = _mm_unpacklo_ps(x0Val, x1Val); +++ b1Val = _mm_unpackhi_ps(x0Val, x1Val); +++ b2Val = _mm_unpacklo_ps(x2Val, x3Val); +++ b3Val = _mm_unpackhi_ps(x2Val, x3Val); +++ +++ c0Val = _mm_mul_ps(a0Val, b0Val); +++ c1Val = _mm_mul_ps(a1Val, b1Val); +++ c2Val = _mm_mul_ps(a2Val, b2Val); +++ c3Val = _mm_mul_ps(a3Val, 
b3Val); +++ +++ dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0); +++ dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1); +++ dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2); +++ dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3); +++ +++ aPtr += 16; +++ bPtr += 8; +++ } +++ +++ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1); +++ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2); +++ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3); +++ +++ __VOLK_ATTR_ALIGNED(16) float dotProductVector[4]; +++ +++ _mm_store_ps(dotProductVector, +++ dotProdVal0); // Store the results back into the dot product vector +++ +++ *realpt = dotProductVector[0]; +++ *imagpt = dotProductVector[1]; +++ *realpt += dotProductVector[2]; +++ *imagpt += dotProductVector[3]; +++ +++ number = sixteenthPoints * 8; +++ for (; number < num_points; number++) { +++ *realpt += ((*aPtr++) * (*bPtr)); +++ *imagpt += ((*aPtr++) * (*bPtr++)); +++ } +++ +++ *result = *(lv_32fc_t*)(&res[0]); ++ } ++ ++ #endif /*LV_HAVE_SSE*/ ++diff --git a/kernels/volk/volk_32fc_32f_multiply_32fc.h b/kernels/volk/volk_32fc_32f_multiply_32fc.h ++index b47883f..196ba9a 100644 ++--- a/kernels/volk/volk_32fc_32f_multiply_32fc.h +++++ b/kernels/volk/volk_32fc_32f_multiply_32fc.h ++@@ -30,8 +30,8 @@ ++ * ++ * Dispatcher Prototype ++ * \code ++- * void volk_32fc_32f_multiply_32fc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const float* bVector, unsigned int num_points); ++- * \endcode +++ * void volk_32fc_32f_multiply_32fc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const +++ * float* bVector, unsigned int num_points); \endcode ++ * ++ * \b Inputs ++ * \li aVector: The input vector of complex floats. ++@@ -61,52 +61,55 @@ ++ #ifdef LV_HAVE_AVX ++ #include ++ ++-static inline void ++-volk_32fc_32f_multiply_32fc_a_avx(lv_32fc_t* cVector, const lv_32fc_t* aVector, ++- const float* bVector, unsigned int num_points) +++static inline void volk_32fc_32f_multiply_32fc_a_avx(lv_32fc_t* cVector, +++ const lv_32fc_t* aVector, +++ const float* bVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int eighthPoints = num_points / 8; +++ unsigned int number = 0; +++ const unsigned int eighthPoints = num_points / 8; ++ ++- lv_32fc_t* cPtr = cVector; ++- const lv_32fc_t* aPtr = aVector; ++- const float* bPtr= bVector; +++ lv_32fc_t* cPtr = cVector; +++ const lv_32fc_t* aPtr = aVector; +++ const float* bPtr = bVector; ++ ++- __m256 aVal1, aVal2, bVal, bVal1, bVal2, cVal1, cVal2; +++ __m256 aVal1, aVal2, bVal, bVal1, bVal2, cVal1, cVal2; ++ ++- __m256i permute_mask = _mm256_set_epi32(3, 3, 2, 2, 1, 1, 0, 0); +++ __m256i permute_mask = _mm256_set_epi32(3, 3, 2, 2, 1, 1, 0, 0); ++ ++- for(;number < eighthPoints; number++){ +++ for (; number < eighthPoints; number++) { ++ ++- aVal1 = _mm256_load_ps((float *)aPtr); ++- aPtr += 4; +++ aVal1 = _mm256_load_ps((float*)aPtr); +++ aPtr += 4; ++ ++- aVal2 = _mm256_load_ps((float *)aPtr); ++- aPtr += 4; +++ aVal2 = _mm256_load_ps((float*)aPtr); +++ aPtr += 4; ++ ++- bVal = _mm256_load_ps(bPtr); // b0|b1|b2|b3|b4|b5|b6|b7 ++- bPtr += 8; +++ bVal = _mm256_load_ps(bPtr); // b0|b1|b2|b3|b4|b5|b6|b7 +++ bPtr += 8; ++ ++- bVal1 = _mm256_permute2f128_ps(bVal, bVal, 0x00); // b0|b1|b2|b3|b0|b1|b2|b3 ++- bVal2 = _mm256_permute2f128_ps(bVal, bVal, 0x11); // b4|b5|b6|b7|b4|b5|b6|b7 +++ bVal1 = _mm256_permute2f128_ps(bVal, bVal, 0x00); // b0|b1|b2|b3|b0|b1|b2|b3 +++ bVal2 = _mm256_permute2f128_ps(bVal, bVal, 0x11); // b4|b5|b6|b7|b4|b5|b6|b7 ++ ++- bVal1 = _mm256_permutevar_ps(bVal1, permute_mask); // 
b0|b0|b1|b1|b2|b2|b3|b3 ++- bVal2 = _mm256_permutevar_ps(bVal2, permute_mask); // b4|b4|b5|b5|b6|b6|b7|b7 +++ bVal1 = _mm256_permutevar_ps(bVal1, permute_mask); // b0|b0|b1|b1|b2|b2|b3|b3 +++ bVal2 = _mm256_permutevar_ps(bVal2, permute_mask); // b4|b4|b5|b5|b6|b6|b7|b7 ++ ++- cVal1 = _mm256_mul_ps(aVal1, bVal1); ++- cVal2 = _mm256_mul_ps(aVal2, bVal2); +++ cVal1 = _mm256_mul_ps(aVal1, bVal1); +++ cVal2 = _mm256_mul_ps(aVal2, bVal2); ++ ++- _mm256_store_ps((float*)cPtr,cVal1); // Store the results back into the C container ++- cPtr += 4; +++ _mm256_store_ps((float*)cPtr, +++ cVal1); // Store the results back into the C container +++ cPtr += 4; ++ ++- _mm256_store_ps((float*)cPtr,cVal2); // Store the results back into the C container ++- cPtr += 4; ++- } +++ _mm256_store_ps((float*)cPtr, +++ cVal2); // Store the results back into the C container +++ cPtr += 4; +++ } ++ ++- number = eighthPoints * 8; ++- for(;number < num_points; ++number){ ++- *cPtr++ = (*aPtr++) * (*bPtr++); ++- } +++ number = eighthPoints * 8; +++ for (; number < num_points; ++number) { +++ *cPtr++ = (*aPtr++) * (*bPtr++); +++ } ++ } ++ #endif /* LV_HAVE_AVX */ ++ ++@@ -114,67 +117,69 @@ volk_32fc_32f_multiply_32fc_a_avx(lv_32fc_t* cVector, const lv_32fc_t* aVector, ++ #ifdef LV_HAVE_SSE ++ #include ++ ++-static inline void ++-volk_32fc_32f_multiply_32fc_a_sse(lv_32fc_t* cVector, const lv_32fc_t* aVector, ++- const float* bVector, unsigned int num_points) +++static inline void volk_32fc_32f_multiply_32fc_a_sse(lv_32fc_t* cVector, +++ const lv_32fc_t* aVector, +++ const float* bVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int quarterPoints = num_points / 4; +++ unsigned int number = 0; +++ const unsigned int quarterPoints = num_points / 4; ++ ++- lv_32fc_t* cPtr = cVector; ++- const lv_32fc_t* aPtr = aVector; ++- const float* bPtr= bVector; +++ lv_32fc_t* cPtr = cVector; +++ const lv_32fc_t* aPtr = aVector; +++ const float* bPtr = bVector; ++ ++- __m128 aVal1, aVal2, bVal, bVal1, bVal2, cVal; ++- for(;number < quarterPoints; number++){ +++ __m128 aVal1, aVal2, bVal, bVal1, bVal2, cVal; +++ for (; number < quarterPoints; number++) { ++ ++- aVal1 = _mm_load_ps((const float*)aPtr); ++- aPtr += 2; +++ aVal1 = _mm_load_ps((const float*)aPtr); +++ aPtr += 2; ++ ++- aVal2 = _mm_load_ps((const float*)aPtr); ++- aPtr += 2; +++ aVal2 = _mm_load_ps((const float*)aPtr); +++ aPtr += 2; ++ ++- bVal = _mm_load_ps(bPtr); ++- bPtr += 4; +++ bVal = _mm_load_ps(bPtr); +++ bPtr += 4; ++ ++- bVal1 = _mm_shuffle_ps(bVal, bVal, _MM_SHUFFLE(1,1,0,0)); ++- bVal2 = _mm_shuffle_ps(bVal, bVal, _MM_SHUFFLE(3,3,2,2)); +++ bVal1 = _mm_shuffle_ps(bVal, bVal, _MM_SHUFFLE(1, 1, 0, 0)); +++ bVal2 = _mm_shuffle_ps(bVal, bVal, _MM_SHUFFLE(3, 3, 2, 2)); ++ ++- cVal = _mm_mul_ps(aVal1, bVal1); +++ cVal = _mm_mul_ps(aVal1, bVal1); ++ ++- _mm_store_ps((float*)cPtr,cVal); // Store the results back into the C container ++- cPtr += 2; +++ _mm_store_ps((float*)cPtr, cVal); // Store the results back into the C container +++ cPtr += 2; ++ ++- cVal = _mm_mul_ps(aVal2, bVal2); +++ cVal = _mm_mul_ps(aVal2, bVal2); ++ ++- _mm_store_ps((float*)cPtr,cVal); // Store the results back into the C container +++ _mm_store_ps((float*)cPtr, cVal); // Store the results back into the C container ++ ++- cPtr += 2; ++- } +++ cPtr += 2; +++ } ++ ++- number = quarterPoints * 4; ++- for(;number < num_points; number++){ ++- *cPtr++ = (*aPtr++) * (*bPtr); ++- bPtr++; ++- } +++ number = quarterPoints * 4; +++ for (; number < num_points; number++) { 
+++ *cPtr++ = (*aPtr++) * (*bPtr); +++ bPtr++; +++ } ++ } ++ #endif /* LV_HAVE_SSE */ ++ ++ ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void ++-volk_32fc_32f_multiply_32fc_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, ++- const float* bVector, unsigned int num_points) +++static inline void volk_32fc_32f_multiply_32fc_generic(lv_32fc_t* cVector, +++ const lv_32fc_t* aVector, +++ const float* bVector, +++ unsigned int num_points) ++ { ++- lv_32fc_t* cPtr = cVector; ++- const lv_32fc_t* aPtr = aVector; ++- const float* bPtr= bVector; ++- unsigned int number = 0; ++- ++- for(number = 0; number < num_points; number++){ ++- *cPtr++ = (*aPtr++) * (*bPtr++); ++- } +++ lv_32fc_t* cPtr = cVector; +++ const lv_32fc_t* aPtr = aVector; +++ const float* bPtr = bVector; +++ unsigned int number = 0; +++ +++ for (number = 0; number < num_points; number++) { +++ *cPtr++ = (*aPtr++) * (*bPtr++); +++ } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++@@ -182,49 +187,52 @@ volk_32fc_32f_multiply_32fc_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector ++ #ifdef LV_HAVE_NEON ++ #include ++ ++-static inline void ++-volk_32fc_32f_multiply_32fc_neon(lv_32fc_t* cVector, const lv_32fc_t* aVector, ++- const float* bVector, unsigned int num_points) +++static inline void volk_32fc_32f_multiply_32fc_neon(lv_32fc_t* cVector, +++ const lv_32fc_t* aVector, +++ const float* bVector, +++ unsigned int num_points) ++ { ++- lv_32fc_t* cPtr = cVector; ++- const lv_32fc_t* aPtr = aVector; ++- const float* bPtr= bVector; ++- unsigned int number = 0; ++- unsigned int quarter_points = num_points / 4; ++- ++- float32x4x2_t inputVector, outputVector; ++- float32x4_t tapsVector; ++- for(number = 0; number < quarter_points; number++){ ++- inputVector = vld2q_f32((float*)aPtr); ++- tapsVector = vld1q_f32(bPtr); ++- ++- outputVector.val[0] = vmulq_f32(inputVector.val[0], tapsVector); ++- outputVector.val[1] = vmulq_f32(inputVector.val[1], tapsVector); ++- ++- vst2q_f32((float*)cPtr, outputVector); ++- aPtr += 4; ++- bPtr += 4; ++- cPtr += 4; ++- } ++- ++- for(number = quarter_points * 4; number < num_points; number++){ ++- *cPtr++ = (*aPtr++) * (*bPtr++); ++- } +++ lv_32fc_t* cPtr = cVector; +++ const lv_32fc_t* aPtr = aVector; +++ const float* bPtr = bVector; +++ unsigned int number = 0; +++ unsigned int quarter_points = num_points / 4; +++ +++ float32x4x2_t inputVector, outputVector; +++ float32x4_t tapsVector; +++ for (number = 0; number < quarter_points; number++) { +++ inputVector = vld2q_f32((float*)aPtr); +++ tapsVector = vld1q_f32(bPtr); +++ +++ outputVector.val[0] = vmulq_f32(inputVector.val[0], tapsVector); +++ outputVector.val[1] = vmulq_f32(inputVector.val[1], tapsVector); +++ +++ vst2q_f32((float*)cPtr, outputVector); +++ aPtr += 4; +++ bPtr += 4; +++ cPtr += 4; +++ } +++ +++ for (number = quarter_points * 4; number < num_points; number++) { +++ *cPtr++ = (*aPtr++) * (*bPtr++); +++ } ++ } ++ #endif /* LV_HAVE_NEON */ ++ ++ ++ #ifdef LV_HAVE_ORC ++ ++-extern void ++-volk_32fc_32f_multiply_32fc_a_orc_impl(lv_32fc_t* cVector, const lv_32fc_t* aVector, ++- const float* bVector, unsigned int num_points); +++extern void volk_32fc_32f_multiply_32fc_a_orc_impl(lv_32fc_t* cVector, +++ const lv_32fc_t* aVector, +++ const float* bVector, +++ unsigned int num_points); ++ ++-static inline void ++-volk_32fc_32f_multiply_32fc_u_orc(lv_32fc_t* cVector, const lv_32fc_t* aVector, ++- const float* bVector, unsigned int num_points) +++static inline void volk_32fc_32f_multiply_32fc_u_orc(lv_32fc_t* cVector, +++ const lv_32fc_t* aVector, +++ 
const float* bVector, +++ unsigned int num_points) ++ { ++- volk_32fc_32f_multiply_32fc_a_orc_impl(cVector, aVector, bVector, num_points); +++ volk_32fc_32f_multiply_32fc_a_orc_impl(cVector, aVector, bVector, num_points); ++ } ++ ++ #endif /* LV_HAVE_GENERIC */ ++diff --git a/kernels/volk/volk_32fc_conjugate_32fc.h b/kernels/volk/volk_32fc_conjugate_32fc.h ++index 6994d0e..9195e3a 100644 ++--- a/kernels/volk/volk_32fc_conjugate_32fc.h +++++ b/kernels/volk/volk_32fc_conjugate_32fc.h ++@@ -29,8 +29,8 @@ ++ * ++ * Dispatcher Prototype ++ * \code ++- * void volk_32fc_conjugate_32fc(lv_32fc_t* cVector, const lv_32fc_t* aVector, unsigned int num_points) ++- * \endcode +++ * void volk_32fc_conjugate_32fc(lv_32fc_t* cVector, const lv_32fc_t* aVector, unsigned +++ * int num_points) \endcode ++ * ++ * \b Inputs ++ * \li aVector: The input vector of complex floats. ++@@ -68,91 +68,94 @@ ++ #ifndef INCLUDED_volk_32fc_conjugate_32fc_u_H ++ #define INCLUDED_volk_32fc_conjugate_32fc_u_H ++ +++#include ++ #include ++ #include ++ #include ++-#include ++ ++ #ifdef LV_HAVE_AVX ++ #include ++ ++-static inline void ++-volk_32fc_conjugate_32fc_u_avx(lv_32fc_t* cVector, const lv_32fc_t* aVector, unsigned int num_points) +++static inline void volk_32fc_conjugate_32fc_u_avx(lv_32fc_t* cVector, +++ const lv_32fc_t* aVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int quarterPoints = num_points / 4; +++ unsigned int number = 0; +++ const unsigned int quarterPoints = num_points / 4; ++ ++- __m256 x; ++- lv_32fc_t* c = cVector; ++- const lv_32fc_t* a = aVector; +++ __m256 x; +++ lv_32fc_t* c = cVector; +++ const lv_32fc_t* a = aVector; ++ ++- __m256 conjugator = _mm256_setr_ps(0, -0.f, 0, -0.f, 0, -0.f, 0, -0.f); +++ __m256 conjugator = _mm256_setr_ps(0, -0.f, 0, -0.f, 0, -0.f, 0, -0.f); ++ ++- for(;number < quarterPoints; number++){ +++ for (; number < quarterPoints; number++) { ++ ++- x = _mm256_loadu_ps((float*)a); // Load the complex data as ar,ai,br,bi +++ x = _mm256_loadu_ps((float*)a); // Load the complex data as ar,ai,br,bi ++ ++- x = _mm256_xor_ps(x, conjugator); // conjugate register +++ x = _mm256_xor_ps(x, conjugator); // conjugate register ++ ++- _mm256_storeu_ps((float*)c,x); // Store the results back into the C container +++ _mm256_storeu_ps((float*)c, x); // Store the results back into the C container ++ ++- a += 4; ++- c += 4; ++- } +++ a += 4; +++ c += 4; +++ } ++ ++- number = quarterPoints * 4; +++ number = quarterPoints * 4; ++ ++- for(;number < num_points; number++) { ++- *c++ = lv_conj(*a++); ++- } +++ for (; number < num_points; number++) { +++ *c++ = lv_conj(*a++); +++ } ++ } ++ #endif /* LV_HAVE_AVX */ ++ ++ #ifdef LV_HAVE_SSE3 ++ #include ++ ++-static inline void ++-volk_32fc_conjugate_32fc_u_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, unsigned int num_points) +++static inline void volk_32fc_conjugate_32fc_u_sse3(lv_32fc_t* cVector, +++ const lv_32fc_t* aVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int halfPoints = num_points / 2; +++ unsigned int number = 0; +++ const unsigned int halfPoints = num_points / 2; ++ ++- __m128 x; ++- lv_32fc_t* c = cVector; ++- const lv_32fc_t* a = aVector; +++ __m128 x; +++ lv_32fc_t* c = cVector; +++ const lv_32fc_t* a = aVector; ++ ++- __m128 conjugator = _mm_setr_ps(0, -0.f, 0, -0.f); +++ __m128 conjugator = _mm_setr_ps(0, -0.f, 0, -0.f); ++ ++- for(;number < halfPoints; number++){ +++ for (; number < halfPoints; number++) { ++ ++- x = _mm_loadu_ps((float*)a); // 
Load the complex data as ar,ai,br,bi +++ x = _mm_loadu_ps((float*)a); // Load the complex data as ar,ai,br,bi ++ ++- x = _mm_xor_ps(x, conjugator); // conjugate register +++ x = _mm_xor_ps(x, conjugator); // conjugate register ++ ++- _mm_storeu_ps((float*)c,x); // Store the results back into the C container +++ _mm_storeu_ps((float*)c, x); // Store the results back into the C container ++ ++- a += 2; ++- c += 2; ++- } +++ a += 2; +++ c += 2; +++ } ++ ++- if((num_points % 2) != 0) { ++- *c = lv_conj(*a); ++- } +++ if ((num_points % 2) != 0) { +++ *c = lv_conj(*a); +++ } ++ } ++ #endif /* LV_HAVE_SSE3 */ ++ ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void ++-volk_32fc_conjugate_32fc_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, unsigned int num_points) +++static inline void volk_32fc_conjugate_32fc_generic(lv_32fc_t* cVector, +++ const lv_32fc_t* aVector, +++ unsigned int num_points) ++ { ++- lv_32fc_t* cPtr = cVector; ++- const lv_32fc_t* aPtr = aVector; ++- unsigned int number = 0; +++ lv_32fc_t* cPtr = cVector; +++ const lv_32fc_t* aPtr = aVector; +++ unsigned int number = 0; ++ ++- for(number = 0; number < num_points; number++){ ++- *cPtr++ = lv_conj(*aPtr++); ++- } +++ for (number = 0; number < num_points; number++) { +++ *cPtr++ = lv_conj(*aPtr++); +++ } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++@@ -161,124 +164,128 @@ volk_32fc_conjugate_32fc_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, u ++ #ifndef INCLUDED_volk_32fc_conjugate_32fc_a_H ++ #define INCLUDED_volk_32fc_conjugate_32fc_a_H ++ +++#include ++ #include ++ #include ++ #include ++-#include ++ ++ #ifdef LV_HAVE_AVX ++ #include ++ ++-static inline void ++-volk_32fc_conjugate_32fc_a_avx(lv_32fc_t* cVector, const lv_32fc_t* aVector, unsigned int num_points) +++static inline void volk_32fc_conjugate_32fc_a_avx(lv_32fc_t* cVector, +++ const lv_32fc_t* aVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int quarterPoints = num_points / 4; +++ unsigned int number = 0; +++ const unsigned int quarterPoints = num_points / 4; ++ ++- __m256 x; ++- lv_32fc_t* c = cVector; ++- const lv_32fc_t* a = aVector; +++ __m256 x; +++ lv_32fc_t* c = cVector; +++ const lv_32fc_t* a = aVector; ++ ++- __m256 conjugator = _mm256_setr_ps(0, -0.f, 0, -0.f, 0, -0.f, 0, -0.f); +++ __m256 conjugator = _mm256_setr_ps(0, -0.f, 0, -0.f, 0, -0.f, 0, -0.f); ++ ++- for(;number < quarterPoints; number++){ +++ for (; number < quarterPoints; number++) { ++ ++- x = _mm256_load_ps((float*)a); // Load the complex data as ar,ai,br,bi +++ x = _mm256_load_ps((float*)a); // Load the complex data as ar,ai,br,bi ++ ++- x = _mm256_xor_ps(x, conjugator); // conjugate register +++ x = _mm256_xor_ps(x, conjugator); // conjugate register ++ ++- _mm256_store_ps((float*)c,x); // Store the results back into the C container +++ _mm256_store_ps((float*)c, x); // Store the results back into the C container ++ ++- a += 4; ++- c += 4; ++- } +++ a += 4; +++ c += 4; +++ } ++ ++- number = quarterPoints * 4; +++ number = quarterPoints * 4; ++ ++- for(;number < num_points; number++) { ++- *c++ = lv_conj(*a++); ++- } +++ for (; number < num_points; number++) { +++ *c++ = lv_conj(*a++); +++ } ++ } ++ #endif /* LV_HAVE_AVX */ ++ ++ #ifdef LV_HAVE_SSE3 ++ #include ++ ++-static inline void ++-volk_32fc_conjugate_32fc_a_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, unsigned int num_points) +++static inline void volk_32fc_conjugate_32fc_a_sse3(lv_32fc_t* cVector, +++ const lv_32fc_t* aVector, +++ unsigned int num_points) ++ { ++- unsigned int 
number = 0; ++- const unsigned int halfPoints = num_points / 2; +++ unsigned int number = 0; +++ const unsigned int halfPoints = num_points / 2; ++ ++- __m128 x; ++- lv_32fc_t* c = cVector; ++- const lv_32fc_t* a = aVector; +++ __m128 x; +++ lv_32fc_t* c = cVector; +++ const lv_32fc_t* a = aVector; ++ ++- __m128 conjugator = _mm_setr_ps(0, -0.f, 0, -0.f); +++ __m128 conjugator = _mm_setr_ps(0, -0.f, 0, -0.f); ++ ++- for(;number < halfPoints; number++){ +++ for (; number < halfPoints; number++) { ++ ++- x = _mm_load_ps((float*)a); // Load the complex data as ar,ai,br,bi +++ x = _mm_load_ps((float*)a); // Load the complex data as ar,ai,br,bi ++ ++- x = _mm_xor_ps(x, conjugator); // conjugate register +++ x = _mm_xor_ps(x, conjugator); // conjugate register ++ ++- _mm_store_ps((float*)c,x); // Store the results back into the C container +++ _mm_store_ps((float*)c, x); // Store the results back into the C container ++ ++- a += 2; ++- c += 2; ++- } +++ a += 2; +++ c += 2; +++ } ++ ++- if((num_points % 2) != 0) { ++- *c = lv_conj(*a); ++- } +++ if ((num_points % 2) != 0) { +++ *c = lv_conj(*a); +++ } ++ } ++ #endif /* LV_HAVE_SSE3 */ ++ ++ #ifdef LV_HAVE_NEON ++ #include ++ ++-static inline void ++-volk_32fc_conjugate_32fc_a_neon(lv_32fc_t* cVector, const lv_32fc_t* aVector, unsigned int num_points) +++static inline void volk_32fc_conjugate_32fc_a_neon(lv_32fc_t* cVector, +++ const lv_32fc_t* aVector, +++ unsigned int num_points) ++ { ++- unsigned int number; ++- const unsigned int quarterPoints = num_points / 4; +++ unsigned int number; +++ const unsigned int quarterPoints = num_points / 4; ++ ++- float32x4x2_t x; ++- lv_32fc_t* c = cVector; ++- const lv_32fc_t* a = aVector; +++ float32x4x2_t x; +++ lv_32fc_t* c = cVector; +++ const lv_32fc_t* a = aVector; ++ ++- for(number=0; number < quarterPoints; number++){ ++- __VOLK_PREFETCH(a+4); ++- x = vld2q_f32((float*)a); // Load the complex data as ar,br,cr,dr; ai,bi,ci,di +++ for (number = 0; number < quarterPoints; number++) { +++ __VOLK_PREFETCH(a + 4); +++ x = vld2q_f32((float*)a); // Load the complex data as ar,br,cr,dr; ai,bi,ci,di ++ ++- // xor the imaginary lane ++- x.val[1] = vnegq_f32( x.val[1]); +++ // xor the imaginary lane +++ x.val[1] = vnegq_f32(x.val[1]); ++ ++- vst2q_f32((float*)c,x); // Store the results back into the C container +++ vst2q_f32((float*)c, x); // Store the results back into the C container ++ ++- a += 4; ++- c += 4; ++- } +++ a += 4; +++ c += 4; +++ } ++ ++- for(number=quarterPoints*4; number < num_points; number++){ ++- *c++ = lv_conj(*a++); ++- } +++ for (number = quarterPoints * 4; number < num_points; number++) { +++ *c++ = lv_conj(*a++); +++ } ++ } ++ #endif /* LV_HAVE_NEON */ ++ ++ ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void ++-volk_32fc_conjugate_32fc_a_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, unsigned int num_points) +++static inline void volk_32fc_conjugate_32fc_a_generic(lv_32fc_t* cVector, +++ const lv_32fc_t* aVector, +++ unsigned int num_points) ++ { ++- lv_32fc_t* cPtr = cVector; ++- const lv_32fc_t* aPtr = aVector; ++- unsigned int number = 0; +++ lv_32fc_t* cPtr = cVector; +++ const lv_32fc_t* aPtr = aVector; +++ unsigned int number = 0; ++ ++- for(number = 0; number < num_points; number++){ ++- *cPtr++ = lv_conj(*aPtr++); ++- } +++ for (number = 0; number < num_points; number++) { +++ *cPtr++ = lv_conj(*aPtr++); +++ } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++diff --git a/kernels/volk/volk_32fc_convert_16ic.h b/kernels/volk/volk_32fc_convert_16ic.h ++index 0ba2383..5788158 100644 
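Note: the volk_32fc_conjugate_32fc variants reformatted above all compute c[n] = conj(a[n]). The SSE3/AVX paths do this by XOR-ing the sign bit of every imaginary lane (the -0.f entries of the conjugator mask), while the NEON path negates the deinterleaved imaginary lane with vnegq_f32. A minimal scalar sketch of the same operation, using the standard C float complex type in place of volk's lv_32fc_t (illustrative only, not the kernel itself):

#include <complex.h>
#include <stdio.h>

/* Reference behaviour of the conjugate kernels: flip the sign of the
 * imaginary part of every element, leave the real part untouched. */
static void conjugate_ref(float complex* c, const float complex* a, unsigned int num_points)
{
    for (unsigned int n = 0; n < num_points; n++) {
        c[n] = conjf(a[n]);
    }
}

int main(void)
{
    const float complex in[2] = { 1.0f + 2.0f * I, -3.0f - 4.0f * I };
    float complex out[2];
    conjugate_ref(out, in, 2);
    printf("%g%+gi  %g%+gi\n", crealf(out[0]), cimagf(out[0]), crealf(out[1]), cimagf(out[1]));
    /* prints: 1-2i  -3+4i */
    return 0;
}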
++--- a/kernels/volk/volk_32fc_convert_16ic.h +++++ b/kernels/volk/volk_32fc_convert_16ic.h ++@@ -31,8 +31,8 @@ ++ * ++ * Dispatcher Prototype ++ * \code ++- * void volk_32fc_convert_16ic(lv_16sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points); ++- * \endcode +++ * void volk_32fc_convert_16ic(lv_16sc_t* outputVector, const lv_32fc_t* inputVector, +++ * unsigned int num_points); \endcode ++ * ++ * \b Inputs ++ * \li inputVector: The complex 32-bit float input data buffer. ++@@ -46,14 +46,16 @@ ++ #ifndef INCLUDED_volk_32fc_convert_16ic_a_H ++ #define INCLUDED_volk_32fc_convert_16ic_a_H ++ +++#include "volk/volk_complex.h" ++ #include ++ #include ++-#include "volk/volk_complex.h" ++ ++ #ifdef LV_HAVE_AVX2 ++ #include ++ ++-static inline void volk_32fc_convert_16ic_a_avx2(lv_16sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points) +++static inline void volk_32fc_convert_16ic_a_avx2(lv_16sc_t* outputVector, +++ const lv_32fc_t* inputVector, +++ unsigned int num_points) ++ { ++ const unsigned int avx_iters = num_points / 8; ++ ++@@ -71,44 +73,44 @@ static inline void volk_32fc_convert_16ic_a_avx2(lv_16sc_t* outputVector, const ++ const __m256 vmax_val = _mm256_set1_ps(max_val); ++ unsigned int i; ++ ++- for(i = 0; i < avx_iters; i++) ++- { ++- inputVal1 = _mm256_load_ps((float*)inputVectorPtr); ++- inputVectorPtr += 8; ++- inputVal2 = _mm256_load_ps((float*)inputVectorPtr); ++- inputVectorPtr += 8; ++- __VOLK_PREFETCH(inputVectorPtr + 16); ++- ++- // Clip ++- ret1 = _mm256_max_ps(_mm256_min_ps(inputVal1, vmax_val), vmin_val); ++- ret2 = _mm256_max_ps(_mm256_min_ps(inputVal2, vmax_val), vmin_val); ++- ++- intInputVal1 = _mm256_cvtps_epi32(ret1); ++- intInputVal2 = _mm256_cvtps_epi32(ret2); ++- ++- intInputVal1 = _mm256_packs_epi32(intInputVal1, intInputVal2); ++- intInputVal1 = _mm256_permute4x64_epi64(intInputVal1, 0xd8); ++- ++- _mm256_store_si256((__m256i*)outputVectorPtr, intInputVal1); ++- outputVectorPtr += 16; ++- } ++- ++- for(i = avx_iters * 16; i < num_points * 2; i++) ++- { ++- aux = *inputVectorPtr++; ++- if(aux > max_val) ++- aux = max_val; ++- else if(aux < min_val) ++- aux = min_val; ++- *outputVectorPtr++ = (int16_t)rintf(aux); ++- } +++ for (i = 0; i < avx_iters; i++) { +++ inputVal1 = _mm256_load_ps((float*)inputVectorPtr); +++ inputVectorPtr += 8; +++ inputVal2 = _mm256_load_ps((float*)inputVectorPtr); +++ inputVectorPtr += 8; +++ __VOLK_PREFETCH(inputVectorPtr + 16); +++ +++ // Clip +++ ret1 = _mm256_max_ps(_mm256_min_ps(inputVal1, vmax_val), vmin_val); +++ ret2 = _mm256_max_ps(_mm256_min_ps(inputVal2, vmax_val), vmin_val); +++ +++ intInputVal1 = _mm256_cvtps_epi32(ret1); +++ intInputVal2 = _mm256_cvtps_epi32(ret2); +++ +++ intInputVal1 = _mm256_packs_epi32(intInputVal1, intInputVal2); +++ intInputVal1 = _mm256_permute4x64_epi64(intInputVal1, 0xd8); +++ +++ _mm256_store_si256((__m256i*)outputVectorPtr, intInputVal1); +++ outputVectorPtr += 16; +++ } +++ +++ for (i = avx_iters * 16; i < num_points * 2; i++) { +++ aux = *inputVectorPtr++; +++ if (aux > max_val) +++ aux = max_val; +++ else if (aux < min_val) +++ aux = min_val; +++ *outputVectorPtr++ = (int16_t)rintf(aux); +++ } ++ } ++ #endif /* LV_HAVE_AVX2 */ ++ ++ #ifdef LV_HAVE_SSE2 ++ #include ++ ++-static inline void volk_32fc_convert_16ic_a_sse2(lv_16sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points) +++static inline void volk_32fc_convert_16ic_a_sse2(lv_16sc_t* outputVector, +++ const lv_32fc_t* inputVector, +++ unsigned int num_points) ++ { ++ const 
unsigned int sse_iters = num_points / 4; ++ ++@@ -126,34 +128,34 @@ static inline void volk_32fc_convert_16ic_a_sse2(lv_16sc_t* outputVector, const ++ const __m128 vmax_val = _mm_set_ps1(max_val); ++ unsigned int i; ++ ++- for(i = 0; i < sse_iters; i++) ++- { ++- inputVal1 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4; ++- inputVal2 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4; ++- __VOLK_PREFETCH(inputVectorPtr + 8); ++- ++- // Clip ++- ret1 = _mm_max_ps(_mm_min_ps(inputVal1, vmax_val), vmin_val); ++- ret2 = _mm_max_ps(_mm_min_ps(inputVal2, vmax_val), vmin_val); ++- ++- intInputVal1 = _mm_cvtps_epi32(ret1); ++- intInputVal2 = _mm_cvtps_epi32(ret2); ++- ++- intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); ++- ++- _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1); ++- outputVectorPtr += 8; ++- } ++- ++- for(i = sse_iters * 8; i < num_points * 2; i++) ++- { ++- aux = *inputVectorPtr++; ++- if(aux > max_val) ++- aux = max_val; ++- else if(aux < min_val) ++- aux = min_val; ++- *outputVectorPtr++ = (int16_t)rintf(aux); ++- } +++ for (i = 0; i < sse_iters; i++) { +++ inputVal1 = _mm_load_ps((float*)inputVectorPtr); +++ inputVectorPtr += 4; +++ inputVal2 = _mm_load_ps((float*)inputVectorPtr); +++ inputVectorPtr += 4; +++ __VOLK_PREFETCH(inputVectorPtr + 8); +++ +++ // Clip +++ ret1 = _mm_max_ps(_mm_min_ps(inputVal1, vmax_val), vmin_val); +++ ret2 = _mm_max_ps(_mm_min_ps(inputVal2, vmax_val), vmin_val); +++ +++ intInputVal1 = _mm_cvtps_epi32(ret1); +++ intInputVal2 = _mm_cvtps_epi32(ret2); +++ +++ intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); +++ +++ _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1); +++ outputVectorPtr += 8; +++ } +++ +++ for (i = sse_iters * 8; i < num_points * 2; i++) { +++ aux = *inputVectorPtr++; +++ if (aux > max_val) +++ aux = max_val; +++ else if (aux < min_val) +++ aux = min_val; +++ *outputVectorPtr++ = (int16_t)rintf(aux); +++ } ++ } ++ #endif /* LV_HAVE_SSE2 */ ++ ++@@ -161,13 +163,24 @@ static inline void volk_32fc_convert_16ic_a_sse2(lv_16sc_t* outputVector, const ++ #if LV_HAVE_NEONV7 ++ #include ++ ++-#define VCVTRQ_S32_F32(res,val) \ ++- __VOLK_ASM ("VCVTR.S32.F32 %[r0], %[v0]\n\t" : [r0]"=w"(res[0]) : [v0]"w"(val[0]) : ); \ ++- __VOLK_ASM ("VCVTR.S32.F32 %[r1], %[v1]\n\t" : [r1]"=w"(res[1]) : [v1]"w"(val[1]) : ); \ ++- __VOLK_ASM ("VCVTR.S32.F32 %[r2], %[v2]\n\t" : [r2]"=w"(res[2]) : [v2]"w"(val[2]) : ); \ ++- __VOLK_ASM ("VCVTR.S32.F32 %[r3], %[v3]\n\t" : [r3]"=w"(res[3]) : [v3]"w"(val[3]) : ); ++- ++-static inline void volk_32fc_convert_16ic_neon(lv_16sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points) +++#define VCVTRQ_S32_F32(res, val) \ +++ __VOLK_ASM("VCVTR.S32.F32 %[r0], %[v0]\n\t" \ +++ : [r0] "=w"(res[0]) \ +++ : [v0] "w"(val[0]) \ +++ :); \ +++ __VOLK_ASM("VCVTR.S32.F32 %[r1], %[v1]\n\t" \ +++ : [r1] "=w"(res[1]) \ +++ : [v1] "w"(val[1]) \ +++ :); \ +++ __VOLK_ASM("VCVTR.S32.F32 %[r2], %[v2]\n\t" \ +++ : [r2] "=w"(res[2]) \ +++ : [v2] "w"(val[2]) \ +++ :); \ +++ __VOLK_ASM("VCVTR.S32.F32 %[r3], %[v3]\n\t" : [r3] "=w"(res[3]) : [v3] "w"(val[3]) :); +++ +++static inline void volk_32fc_convert_16ic_neon(lv_16sc_t* outputVector, +++ const lv_32fc_t* inputVector, +++ unsigned int num_points) ++ { ++ ++ const unsigned int neon_iters = num_points / 4; ++@@ -184,43 +197,41 @@ static inline void volk_32fc_convert_16ic_neon(lv_16sc_t* outputVector, const lv ++ const float32x4_t max_val = vmovq_n_f32(max_val_f); ++ float32x4_t ret1, ret2, a, b; ++ ++- int32x4_t 
toint_a={0,0,0,0}; ++- int32x4_t toint_b={0,0,0,0}; +++ int32x4_t toint_a = { 0, 0, 0, 0 }; +++ int32x4_t toint_b = { 0, 0, 0, 0 }; ++ int16x4_t intInputVal1, intInputVal2; ++ int16x8_t res; ++ ++- for(i = 0; i < neon_iters; i++) ++- { ++- a = vld1q_f32((const float32_t*)(inputVectorPtr)); ++- inputVectorPtr += 4; ++- b = vld1q_f32((const float32_t*)(inputVectorPtr)); ++- inputVectorPtr += 4; ++- __VOLK_PREFETCH(inputVectorPtr + 8); ++- ++- ret1 = vmaxq_f32(vminq_f32(a, max_val), min_val); ++- ret2 = vmaxq_f32(vminq_f32(b, max_val), min_val); ++- ++- // vcvtr takes into account the current rounding mode (as does rintf) ++- VCVTRQ_S32_F32(toint_a, ret1); ++- VCVTRQ_S32_F32(toint_b, ret2); ++- ++- intInputVal1 = vqmovn_s32(toint_a); ++- intInputVal2 = vqmovn_s32(toint_b); ++- ++- res = vcombine_s16(intInputVal1, intInputVal2); ++- vst1q_s16((int16_t*)outputVectorPtr, res); ++- outputVectorPtr += 8; ++- } ++- ++- for(i = neon_iters * 8; i < num_points * 2; i++) ++- { ++- aux = *inputVectorPtr++; ++- if(aux > max_val_f) ++- aux = max_val_f; ++- else if(aux < min_val_f) ++- aux = min_val_f; ++- *outputVectorPtr++ = (int16_t)rintf(aux); ++- } +++ for (i = 0; i < neon_iters; i++) { +++ a = vld1q_f32((const float32_t*)(inputVectorPtr)); +++ inputVectorPtr += 4; +++ b = vld1q_f32((const float32_t*)(inputVectorPtr)); +++ inputVectorPtr += 4; +++ __VOLK_PREFETCH(inputVectorPtr + 8); +++ +++ ret1 = vmaxq_f32(vminq_f32(a, max_val), min_val); +++ ret2 = vmaxq_f32(vminq_f32(b, max_val), min_val); +++ +++ // vcvtr takes into account the current rounding mode (as does rintf) +++ VCVTRQ_S32_F32(toint_a, ret1); +++ VCVTRQ_S32_F32(toint_b, ret2); +++ +++ intInputVal1 = vqmovn_s32(toint_a); +++ intInputVal2 = vqmovn_s32(toint_b); +++ +++ res = vcombine_s16(intInputVal1, intInputVal2); +++ vst1q_s16((int16_t*)outputVectorPtr, res); +++ outputVectorPtr += 8; +++ } +++ +++ for (i = neon_iters * 8; i < num_points * 2; i++) { +++ aux = *inputVectorPtr++; +++ if (aux > max_val_f) +++ aux = max_val_f; +++ else if (aux < min_val_f) +++ aux = min_val_f; +++ *outputVectorPtr++ = (int16_t)rintf(aux); +++ } ++ } ++ ++ #undef VCVTRQ_S32_F32 ++@@ -229,7 +240,9 @@ static inline void volk_32fc_convert_16ic_neon(lv_16sc_t* outputVector, const lv ++ #if LV_HAVE_NEONV8 ++ #include ++ ++-static inline void volk_32fc_convert_16ic_neonv8(lv_16sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points) +++static inline void volk_32fc_convert_16ic_neonv8(lv_16sc_t* outputVector, +++ const lv_32fc_t* inputVector, +++ unsigned int num_points) ++ { ++ const unsigned int neon_iters = num_points / 4; ++ ++@@ -245,50 +258,49 @@ static inline void volk_32fc_convert_16ic_neonv8(lv_16sc_t* outputVector, const ++ const float32x4_t max_val = vmovq_n_f32(max_val_f); ++ float32x4_t ret1, ret2, a, b; ++ ++- int32x4_t toint_a={0,0,0,0}, toint_b={0,0,0,0}; +++ int32x4_t toint_a = { 0, 0, 0, 0 }, toint_b = { 0, 0, 0, 0 }; ++ int16x4_t intInputVal1, intInputVal2; ++ int16x8_t res; ++ ++- for(i = 0; i < neon_iters; i++) ++- { ++- a = vld1q_f32((const float32_t*)(inputVectorPtr)); ++- inputVectorPtr += 4; ++- b = vld1q_f32((const float32_t*)(inputVectorPtr)); ++- inputVectorPtr += 4; ++- __VOLK_PREFETCH(inputVectorPtr + 8); ++- ++- ret1 = vmaxq_f32(vminq_f32(a, max_val), min_val); ++- ret2 = vmaxq_f32(vminq_f32(b, max_val), min_val); ++- ++- // vrndiq takes into account the current rounding mode (as does rintf) ++- toint_a = vcvtq_s32_f32(vrndiq_f32(ret1)); ++- toint_b = vcvtq_s32_f32(vrndiq_f32(ret2)); ++- ++- intInputVal1 = 
vqmovn_s32(toint_a); ++- intInputVal2 = vqmovn_s32(toint_b); ++- ++- res = vcombine_s16(intInputVal1, intInputVal2); ++- vst1q_s16((int16_t*)outputVectorPtr, res); ++- outputVectorPtr += 8; ++- } ++- ++- for(i = neon_iters * 8; i < num_points * 2; i++) ++- { ++- aux = *inputVectorPtr++; ++- if(aux > max_val_f) ++- aux = max_val_f; ++- else if(aux < min_val_f) ++- aux = min_val_f; ++- *outputVectorPtr++ = (int16_t)rintf(aux); ++- } +++ for (i = 0; i < neon_iters; i++) { +++ a = vld1q_f32((const float32_t*)(inputVectorPtr)); +++ inputVectorPtr += 4; +++ b = vld1q_f32((const float32_t*)(inputVectorPtr)); +++ inputVectorPtr += 4; +++ __VOLK_PREFETCH(inputVectorPtr + 8); +++ +++ ret1 = vmaxq_f32(vminq_f32(a, max_val), min_val); +++ ret2 = vmaxq_f32(vminq_f32(b, max_val), min_val); +++ +++ // vrndiq takes into account the current rounding mode (as does rintf) +++ toint_a = vcvtq_s32_f32(vrndiq_f32(ret1)); +++ toint_b = vcvtq_s32_f32(vrndiq_f32(ret2)); +++ +++ intInputVal1 = vqmovn_s32(toint_a); +++ intInputVal2 = vqmovn_s32(toint_b); +++ +++ res = vcombine_s16(intInputVal1, intInputVal2); +++ vst1q_s16((int16_t*)outputVectorPtr, res); +++ outputVectorPtr += 8; +++ } +++ +++ for (i = neon_iters * 8; i < num_points * 2; i++) { +++ aux = *inputVectorPtr++; +++ if (aux > max_val_f) +++ aux = max_val_f; +++ else if (aux < min_val_f) +++ aux = min_val_f; +++ *outputVectorPtr++ = (int16_t)rintf(aux); +++ } ++ } ++ #endif /* LV_HAVE_NEONV8 */ ++ ++ ++- ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void volk_32fc_convert_16ic_generic(lv_16sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points) +++static inline void volk_32fc_convert_16ic_generic(lv_16sc_t* outputVector, +++ const lv_32fc_t* inputVector, +++ unsigned int num_points) ++ { ++ float* inputVectorPtr = (float*)inputVector; ++ int16_t* outputVectorPtr = (int16_t*)outputVector; ++@@ -296,15 +308,14 @@ static inline void volk_32fc_convert_16ic_generic(lv_16sc_t* outputVector, const ++ const float max_val = (float)SHRT_MAX; ++ float aux; ++ unsigned int i; ++- for(i = 0; i < num_points * 2; i++) ++- { ++- aux = *inputVectorPtr++; ++- if(aux > max_val) ++- aux = max_val; ++- else if(aux < min_val) ++- aux = min_val; ++- *outputVectorPtr++ = (int16_t)rintf(aux); ++- } +++ for (i = 0; i < num_points * 2; i++) { +++ aux = *inputVectorPtr++; +++ if (aux > max_val) +++ aux = max_val; +++ else if (aux < min_val) +++ aux = min_val; +++ *outputVectorPtr++ = (int16_t)rintf(aux); +++ } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++@@ -313,15 +324,17 @@ static inline void volk_32fc_convert_16ic_generic(lv_16sc_t* outputVector, const ++ #ifndef INCLUDED_volk_32fc_convert_16ic_u_H ++ #define INCLUDED_volk_32fc_convert_16ic_u_H ++ +++#include "volk/volk_complex.h" ++ #include ++ #include ++-#include "volk/volk_complex.h" ++ ++ ++ #ifdef LV_HAVE_AVX2 ++ #include ++ ++-static inline void volk_32fc_convert_16ic_u_avx2(lv_16sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points) +++static inline void volk_32fc_convert_16ic_u_avx2(lv_16sc_t* outputVector, +++ const lv_32fc_t* inputVector, +++ unsigned int num_points) ++ { ++ const unsigned int avx_iters = num_points / 8; ++ ++@@ -339,37 +352,35 @@ static inline void volk_32fc_convert_16ic_u_avx2(lv_16sc_t* outputVector, const ++ const __m256 vmax_val = _mm256_set1_ps(max_val); ++ unsigned int i; ++ ++- for(i = 0; i < avx_iters; i++) ++- { ++- inputVal1 = _mm256_loadu_ps((float*)inputVectorPtr); ++- inputVectorPtr += 8; ++- inputVal2 = _mm256_loadu_ps((float*)inputVectorPtr); ++- 
inputVectorPtr += 8; ++- __VOLK_PREFETCH(inputVectorPtr + 16); ++- ++- // Clip ++- ret1 = _mm256_max_ps(_mm256_min_ps(inputVal1, vmax_val), vmin_val); ++- ret2 = _mm256_max_ps(_mm256_min_ps(inputVal2, vmax_val), vmin_val); ++- ++- intInputVal1 = _mm256_cvtps_epi32(ret1); ++- intInputVal2 = _mm256_cvtps_epi32(ret2); ++- ++- intInputVal1 = _mm256_packs_epi32(intInputVal1, intInputVal2); ++- intInputVal1 = _mm256_permute4x64_epi64(intInputVal1, 0xd8); ++- ++- _mm256_storeu_si256((__m256i*)outputVectorPtr, intInputVal1); ++- outputVectorPtr += 16; ++- } ++- ++- for(i = avx_iters * 16; i < num_points * 2; i++) ++- { ++- aux = *inputVectorPtr++; ++- if(aux > max_val) ++- aux = max_val; ++- else if(aux < min_val) ++- aux = min_val; ++- *outputVectorPtr++ = (int16_t)rintf(aux); ++- } +++ for (i = 0; i < avx_iters; i++) { +++ inputVal1 = _mm256_loadu_ps((float*)inputVectorPtr); +++ inputVectorPtr += 8; +++ inputVal2 = _mm256_loadu_ps((float*)inputVectorPtr); +++ inputVectorPtr += 8; +++ __VOLK_PREFETCH(inputVectorPtr + 16); +++ +++ // Clip +++ ret1 = _mm256_max_ps(_mm256_min_ps(inputVal1, vmax_val), vmin_val); +++ ret2 = _mm256_max_ps(_mm256_min_ps(inputVal2, vmax_val), vmin_val); +++ +++ intInputVal1 = _mm256_cvtps_epi32(ret1); +++ intInputVal2 = _mm256_cvtps_epi32(ret2); +++ +++ intInputVal1 = _mm256_packs_epi32(intInputVal1, intInputVal2); +++ intInputVal1 = _mm256_permute4x64_epi64(intInputVal1, 0xd8); +++ +++ _mm256_storeu_si256((__m256i*)outputVectorPtr, intInputVal1); +++ outputVectorPtr += 16; +++ } +++ +++ for (i = avx_iters * 16; i < num_points * 2; i++) { +++ aux = *inputVectorPtr++; +++ if (aux > max_val) +++ aux = max_val; +++ else if (aux < min_val) +++ aux = min_val; +++ *outputVectorPtr++ = (int16_t)rintf(aux); +++ } ++ } ++ #endif /* LV_HAVE_AVX2 */ ++ ++@@ -377,7 +388,9 @@ static inline void volk_32fc_convert_16ic_u_avx2(lv_16sc_t* outputVector, const ++ #ifdef LV_HAVE_SSE2 ++ #include ++ ++-static inline void volk_32fc_convert_16ic_u_sse2(lv_16sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points) +++static inline void volk_32fc_convert_16ic_u_sse2(lv_16sc_t* outputVector, +++ const lv_32fc_t* inputVector, +++ unsigned int num_points) ++ { ++ const unsigned int sse_iters = num_points / 4; ++ ++@@ -395,36 +408,34 @@ static inline void volk_32fc_convert_16ic_u_sse2(lv_16sc_t* outputVector, const ++ const __m128 vmax_val = _mm_set_ps1(max_val); ++ ++ unsigned int i; ++- for(i = 0; i < sse_iters; i++) ++- { ++- inputVal1 = _mm_loadu_ps((float*)inputVectorPtr); ++- inputVectorPtr += 4; ++- inputVal2 = _mm_loadu_ps((float*)inputVectorPtr); ++- inputVectorPtr += 4; ++- __VOLK_PREFETCH(inputVectorPtr + 8); ++- ++- // Clip ++- ret1 = _mm_max_ps(_mm_min_ps(inputVal1, vmax_val), vmin_val); ++- ret2 = _mm_max_ps(_mm_min_ps(inputVal2, vmax_val), vmin_val); ++- ++- intInputVal1 = _mm_cvtps_epi32(ret1); ++- intInputVal2 = _mm_cvtps_epi32(ret2); ++- ++- intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); ++- ++- _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1); ++- outputVectorPtr += 8; ++- } ++- ++- for(i = sse_iters * 8; i < num_points * 2; i++) ++- { ++- aux = *inputVectorPtr++; ++- if(aux > max_val) ++- aux = max_val; ++- else if(aux < min_val) ++- aux = min_val; ++- *outputVectorPtr++ = (int16_t)rintf(aux); ++- } +++ for (i = 0; i < sse_iters; i++) { +++ inputVal1 = _mm_loadu_ps((float*)inputVectorPtr); +++ inputVectorPtr += 4; +++ inputVal2 = _mm_loadu_ps((float*)inputVectorPtr); +++ inputVectorPtr += 4; +++ __VOLK_PREFETCH(inputVectorPtr + 8); +++ +++ 
// Clip +++ ret1 = _mm_max_ps(_mm_min_ps(inputVal1, vmax_val), vmin_val); +++ ret2 = _mm_max_ps(_mm_min_ps(inputVal2, vmax_val), vmin_val); +++ +++ intInputVal1 = _mm_cvtps_epi32(ret1); +++ intInputVal2 = _mm_cvtps_epi32(ret2); +++ +++ intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); +++ +++ _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1); +++ outputVectorPtr += 8; +++ } +++ +++ for (i = sse_iters * 8; i < num_points * 2; i++) { +++ aux = *inputVectorPtr++; +++ if (aux > max_val) +++ aux = max_val; +++ else if (aux < min_val) +++ aux = min_val; +++ *outputVectorPtr++ = (int16_t)rintf(aux); +++ } ++ } ++ #endif /* LV_HAVE_SSE2 */ ++ #endif /* INCLUDED_volk_32fc_convert_16ic_u_H */ ++diff --git a/kernels/volk/volk_32fc_deinterleave_32f_x2.h b/kernels/volk/volk_32fc_deinterleave_32f_x2.h ++index 40cd664..1a06c48 100644 ++--- a/kernels/volk/volk_32fc_deinterleave_32f_x2.h +++++ b/kernels/volk/volk_32fc_deinterleave_32f_x2.h ++@@ -30,8 +30,8 @@ ++ * ++ * Dispatcher Prototype ++ * \code ++- * void volk_32fc_deinterleave_32f_x2(float* iBuffer, float* qBuffer, const lv_32fc_t* complexVector, unsigned int num_points) ++- * \endcode +++ * void volk_32fc_deinterleave_32f_x2(float* iBuffer, float* qBuffer, const lv_32fc_t* +++ * complexVector, unsigned int num_points) \endcode ++ * ++ * \b Inputs ++ * \li complexVector: The complex input vector. ++@@ -78,86 +78,88 @@ ++ ++ #ifdef LV_HAVE_AVX ++ #include ++-static inline void ++-volk_32fc_deinterleave_32f_x2_a_avx(float* iBuffer, float* qBuffer, const lv_32fc_t* complexVector, ++- unsigned int num_points) +++static inline void volk_32fc_deinterleave_32f_x2_a_avx(float* iBuffer, +++ float* qBuffer, +++ const lv_32fc_t* complexVector, +++ unsigned int num_points) ++ { ++- const float* complexVectorPtr = (float*)complexVector; ++- float* iBufferPtr = iBuffer; ++- float* qBufferPtr = qBuffer; ++- ++- unsigned int number = 0; ++- // Mask for real and imaginary parts ++- const unsigned int eighthPoints = num_points / 8; ++- __m256 cplxValue1, cplxValue2, complex1, complex2, iValue, qValue; ++- for(;number < eighthPoints; number++){ ++- cplxValue1 = _mm256_load_ps(complexVectorPtr); ++- complexVectorPtr += 8; ++- ++- cplxValue2 = _mm256_load_ps(complexVectorPtr); ++- complexVectorPtr += 8; ++- ++- complex1 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x20); ++- complex2 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x31); ++- ++- // Arrange in i1i2i3i4 format ++- iValue = _mm256_shuffle_ps(complex1, complex2, 0x88); ++- // Arrange in q1q2q3q4 format ++- qValue = _mm256_shuffle_ps(complex1, complex2, 0xdd); ++- ++- _mm256_store_ps(iBufferPtr, iValue); ++- _mm256_store_ps(qBufferPtr, qValue); ++- ++- iBufferPtr += 8; ++- qBufferPtr += 8; ++- } ++- ++- number = eighthPoints * 8; ++- for(; number < num_points; number++){ ++- *iBufferPtr++ = *complexVectorPtr++; ++- *qBufferPtr++ = *complexVectorPtr++; ++- } +++ const float* complexVectorPtr = (float*)complexVector; +++ float* iBufferPtr = iBuffer; +++ float* qBufferPtr = qBuffer; +++ +++ unsigned int number = 0; +++ // Mask for real and imaginary parts +++ const unsigned int eighthPoints = num_points / 8; +++ __m256 cplxValue1, cplxValue2, complex1, complex2, iValue, qValue; +++ for (; number < eighthPoints; number++) { +++ cplxValue1 = _mm256_load_ps(complexVectorPtr); +++ complexVectorPtr += 8; +++ +++ cplxValue2 = _mm256_load_ps(complexVectorPtr); +++ complexVectorPtr += 8; +++ +++ complex1 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x20); +++ complex2 = 
_mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x31); +++ +++ // Arrange in i1i2i3i4 format +++ iValue = _mm256_shuffle_ps(complex1, complex2, 0x88); +++ // Arrange in q1q2q3q4 format +++ qValue = _mm256_shuffle_ps(complex1, complex2, 0xdd); +++ +++ _mm256_store_ps(iBufferPtr, iValue); +++ _mm256_store_ps(qBufferPtr, qValue); +++ +++ iBufferPtr += 8; +++ qBufferPtr += 8; +++ } +++ +++ number = eighthPoints * 8; +++ for (; number < num_points; number++) { +++ *iBufferPtr++ = *complexVectorPtr++; +++ *qBufferPtr++ = *complexVectorPtr++; +++ } ++ } ++ #endif /* LV_HAVE_AVX */ ++ ++ #ifdef LV_HAVE_SSE ++ #include ++ ++-static inline void ++-volk_32fc_deinterleave_32f_x2_a_sse(float* iBuffer, float* qBuffer, const lv_32fc_t* complexVector, ++- unsigned int num_points) +++static inline void volk_32fc_deinterleave_32f_x2_a_sse(float* iBuffer, +++ float* qBuffer, +++ const lv_32fc_t* complexVector, +++ unsigned int num_points) ++ { ++- const float* complexVectorPtr = (float*)complexVector; ++- float* iBufferPtr = iBuffer; ++- float* qBufferPtr = qBuffer; ++- ++- unsigned int number = 0; ++- const unsigned int quarterPoints = num_points / 4; ++- __m128 cplxValue1, cplxValue2, iValue, qValue; ++- for(;number < quarterPoints; number++){ ++- cplxValue1 = _mm_load_ps(complexVectorPtr); ++- complexVectorPtr += 4; ++- ++- cplxValue2 = _mm_load_ps(complexVectorPtr); ++- complexVectorPtr += 4; ++- ++- // Arrange in i1i2i3i4 format ++- iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); ++- // Arrange in q1q2q3q4 format ++- qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1)); ++- ++- _mm_store_ps(iBufferPtr, iValue); ++- _mm_store_ps(qBufferPtr, qValue); ++- ++- iBufferPtr += 4; ++- qBufferPtr += 4; ++- } ++- ++- number = quarterPoints * 4; ++- for(; number < num_points; number++){ ++- *iBufferPtr++ = *complexVectorPtr++; ++- *qBufferPtr++ = *complexVectorPtr++; ++- } +++ const float* complexVectorPtr = (float*)complexVector; +++ float* iBufferPtr = iBuffer; +++ float* qBufferPtr = qBuffer; +++ +++ unsigned int number = 0; +++ const unsigned int quarterPoints = num_points / 4; +++ __m128 cplxValue1, cplxValue2, iValue, qValue; +++ for (; number < quarterPoints; number++) { +++ cplxValue1 = _mm_load_ps(complexVectorPtr); +++ complexVectorPtr += 4; +++ +++ cplxValue2 = _mm_load_ps(complexVectorPtr); +++ complexVectorPtr += 4; +++ +++ // Arrange in i1i2i3i4 format +++ iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2, 0, 2, 0)); +++ // Arrange in q1q2q3q4 format +++ qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3, 1, 3, 1)); +++ +++ _mm_store_ps(iBufferPtr, iValue); +++ _mm_store_ps(qBufferPtr, qValue); +++ +++ iBufferPtr += 4; +++ qBufferPtr += 4; +++ } +++ +++ number = quarterPoints * 4; +++ for (; number < num_points; number++) { +++ *iBufferPtr++ = *complexVectorPtr++; +++ *qBufferPtr++ = *complexVectorPtr++; +++ } ++ } ++ #endif /* LV_HAVE_SSE */ ++ ++@@ -165,48 +167,50 @@ volk_32fc_deinterleave_32f_x2_a_sse(float* iBuffer, float* qBuffer, const lv_32f ++ #ifdef LV_HAVE_NEON ++ #include ++ ++-static inline void ++-volk_32fc_deinterleave_32f_x2_neon(float* iBuffer, float* qBuffer, const lv_32fc_t* complexVector, ++- unsigned int num_points) +++static inline void volk_32fc_deinterleave_32f_x2_neon(float* iBuffer, +++ float* qBuffer, +++ const lv_32fc_t* complexVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- unsigned int quarter_points = num_points / 4; ++- const float* complexVectorPtr = (float*)complexVector; ++- 
float* iBufferPtr = iBuffer; ++- float* qBufferPtr = qBuffer; ++- float32x4x2_t complexInput; ++- ++- for(number = 0; number < quarter_points; number++){ ++- complexInput = vld2q_f32(complexVectorPtr); ++- vst1q_f32( iBufferPtr, complexInput.val[0] ); ++- vst1q_f32( qBufferPtr, complexInput.val[1] ); ++- complexVectorPtr += 8; ++- iBufferPtr += 4; ++- qBufferPtr += 4; ++- } ++- ++- for(number = quarter_points*4; number < num_points; number++){ ++- *iBufferPtr++ = *complexVectorPtr++; ++- *qBufferPtr++ = *complexVectorPtr++; ++- } +++ unsigned int number = 0; +++ unsigned int quarter_points = num_points / 4; +++ const float* complexVectorPtr = (float*)complexVector; +++ float* iBufferPtr = iBuffer; +++ float* qBufferPtr = qBuffer; +++ float32x4x2_t complexInput; +++ +++ for (number = 0; number < quarter_points; number++) { +++ complexInput = vld2q_f32(complexVectorPtr); +++ vst1q_f32(iBufferPtr, complexInput.val[0]); +++ vst1q_f32(qBufferPtr, complexInput.val[1]); +++ complexVectorPtr += 8; +++ iBufferPtr += 4; +++ qBufferPtr += 4; +++ } +++ +++ for (number = quarter_points * 4; number < num_points; number++) { +++ *iBufferPtr++ = *complexVectorPtr++; +++ *qBufferPtr++ = *complexVectorPtr++; +++ } ++ } ++ #endif /* LV_HAVE_NEON */ ++ ++ ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void ++-volk_32fc_deinterleave_32f_x2_generic(float* iBuffer, float* qBuffer, const lv_32fc_t* complexVector, ++- unsigned int num_points) +++static inline void volk_32fc_deinterleave_32f_x2_generic(float* iBuffer, +++ float* qBuffer, +++ const lv_32fc_t* complexVector, +++ unsigned int num_points) ++ { ++- const float* complexVectorPtr = (float*)complexVector; ++- float* iBufferPtr = iBuffer; ++- float* qBufferPtr = qBuffer; ++- unsigned int number; ++- for(number = 0; number < num_points; number++){ ++- *iBufferPtr++ = *complexVectorPtr++; ++- *qBufferPtr++ = *complexVectorPtr++; ++- } +++ const float* complexVectorPtr = (float*)complexVector; +++ float* iBufferPtr = iBuffer; +++ float* qBufferPtr = qBuffer; +++ unsigned int number; +++ for (number = 0; number < num_points; number++) { +++ *iBufferPtr++ = *complexVectorPtr++; +++ *qBufferPtr++ = *complexVectorPtr++; +++ } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++@@ -221,45 +225,46 @@ volk_32fc_deinterleave_32f_x2_generic(float* iBuffer, float* qBuffer, const lv_3 ++ ++ #ifdef LV_HAVE_AVX ++ #include ++-static inline void ++-volk_32fc_deinterleave_32f_x2_u_avx(float* iBuffer, float* qBuffer, const lv_32fc_t* complexVector, ++- unsigned int num_points) +++static inline void volk_32fc_deinterleave_32f_x2_u_avx(float* iBuffer, +++ float* qBuffer, +++ const lv_32fc_t* complexVector, +++ unsigned int num_points) ++ { ++- const float* complexVectorPtr = (float*)complexVector; ++- float* iBufferPtr = iBuffer; ++- float* qBufferPtr = qBuffer; ++- ++- unsigned int number = 0; ++- // Mask for real and imaginary parts ++- const unsigned int eighthPoints = num_points / 8; ++- __m256 cplxValue1, cplxValue2, complex1, complex2, iValue, qValue; ++- for(;number < eighthPoints; number++){ ++- cplxValue1 = _mm256_loadu_ps(complexVectorPtr); ++- complexVectorPtr += 8; ++- ++- cplxValue2 = _mm256_loadu_ps(complexVectorPtr); ++- complexVectorPtr += 8; ++- ++- complex1 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x20); ++- complex2 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x31); ++- ++- // Arrange in i1i2i3i4 format ++- iValue = _mm256_shuffle_ps(complex1, complex2, 0x88); ++- // Arrange in q1q2q3q4 format ++- qValue = _mm256_shuffle_ps(complex1, complex2, 0xdd); ++- 
++- _mm256_storeu_ps(iBufferPtr, iValue); ++- _mm256_storeu_ps(qBufferPtr, qValue); ++- ++- iBufferPtr += 8; ++- qBufferPtr += 8; ++- } ++- ++- number = eighthPoints * 8; ++- for(; number < num_points; number++){ ++- *iBufferPtr++ = *complexVectorPtr++; ++- *qBufferPtr++ = *complexVectorPtr++; ++- } +++ const float* complexVectorPtr = (float*)complexVector; +++ float* iBufferPtr = iBuffer; +++ float* qBufferPtr = qBuffer; +++ +++ unsigned int number = 0; +++ // Mask for real and imaginary parts +++ const unsigned int eighthPoints = num_points / 8; +++ __m256 cplxValue1, cplxValue2, complex1, complex2, iValue, qValue; +++ for (; number < eighthPoints; number++) { +++ cplxValue1 = _mm256_loadu_ps(complexVectorPtr); +++ complexVectorPtr += 8; +++ +++ cplxValue2 = _mm256_loadu_ps(complexVectorPtr); +++ complexVectorPtr += 8; +++ +++ complex1 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x20); +++ complex2 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x31); +++ +++ // Arrange in i1i2i3i4 format +++ iValue = _mm256_shuffle_ps(complex1, complex2, 0x88); +++ // Arrange in q1q2q3q4 format +++ qValue = _mm256_shuffle_ps(complex1, complex2, 0xdd); +++ +++ _mm256_storeu_ps(iBufferPtr, iValue); +++ _mm256_storeu_ps(qBufferPtr, qValue); +++ +++ iBufferPtr += 8; +++ qBufferPtr += 8; +++ } +++ +++ number = eighthPoints * 8; +++ for (; number < num_points; number++) { +++ *iBufferPtr++ = *complexVectorPtr++; +++ *qBufferPtr++ = *complexVectorPtr++; +++ } ++ } ++ #endif /* LV_HAVE_AVX */ ++ #endif /* INCLUDED_volk_32fc_deinterleave_32f_x2_u_H */ ++diff --git a/kernels/volk/volk_32fc_deinterleave_64f_x2.h b/kernels/volk/volk_32fc_deinterleave_64f_x2.h ++index 3e799cb..3b69c3c 100644 ++--- a/kernels/volk/volk_32fc_deinterleave_64f_x2.h +++++ b/kernels/volk/volk_32fc_deinterleave_64f_x2.h ++@@ -79,110 +79,113 @@ ++ #ifdef LV_HAVE_AVX ++ #include ++ ++-static inline void ++-volk_32fc_deinterleave_64f_x2_u_avx(double *iBuffer, double *qBuffer, ++- const lv_32fc_t *complexVector, ++- unsigned int num_points) { ++- unsigned int number = 0; ++- ++- const float *complexVectorPtr = (float *)complexVector; ++- double *iBufferPtr = iBuffer; ++- double *qBufferPtr = qBuffer; ++- ++- const unsigned int quarterPoints = num_points / 4; ++- __m256 cplxValue; ++- __m128 complexH, complexL, fVal; ++- __m256d dVal; ++- ++- for (; number < quarterPoints; number++) { ++- ++- cplxValue = _mm256_loadu_ps(complexVectorPtr); ++- complexVectorPtr += 8; ++- ++- complexH = _mm256_extractf128_ps(cplxValue, 1); ++- complexL = _mm256_extractf128_ps(cplxValue, 0); ++- ++- // Arrange in i1i2i1i2 format ++- fVal = _mm_shuffle_ps(complexL, complexH, _MM_SHUFFLE(2, 0, 2, 0)); ++- dVal = _mm256_cvtps_pd(fVal); ++- _mm256_storeu_pd(iBufferPtr, dVal); ++- ++- // Arrange in q1q2q1q2 format ++- fVal = _mm_shuffle_ps(complexL, complexH, _MM_SHUFFLE(3, 1, 3, 1)); ++- dVal = _mm256_cvtps_pd(fVal); ++- _mm256_storeu_pd(qBufferPtr, dVal); ++- ++- iBufferPtr += 4; ++- qBufferPtr += 4; ++- } ++- ++- number = quarterPoints * 4; ++- for (; number < num_points; number++) { ++- *iBufferPtr++ = *complexVectorPtr++; ++- *qBufferPtr++ = *complexVectorPtr++; ++- } +++static inline void volk_32fc_deinterleave_64f_x2_u_avx(double* iBuffer, +++ double* qBuffer, +++ const lv_32fc_t* complexVector, +++ unsigned int num_points) +++{ +++ unsigned int number = 0; +++ +++ const float* complexVectorPtr = (float*)complexVector; +++ double* iBufferPtr = iBuffer; +++ double* qBufferPtr = qBuffer; +++ +++ const unsigned int quarterPoints = num_points / 4; +++ __m256 
cplxValue; +++ __m128 complexH, complexL, fVal; +++ __m256d dVal; +++ +++ for (; number < quarterPoints; number++) { +++ +++ cplxValue = _mm256_loadu_ps(complexVectorPtr); +++ complexVectorPtr += 8; +++ +++ complexH = _mm256_extractf128_ps(cplxValue, 1); +++ complexL = _mm256_extractf128_ps(cplxValue, 0); +++ +++ // Arrange in i1i2i1i2 format +++ fVal = _mm_shuffle_ps(complexL, complexH, _MM_SHUFFLE(2, 0, 2, 0)); +++ dVal = _mm256_cvtps_pd(fVal); +++ _mm256_storeu_pd(iBufferPtr, dVal); +++ +++ // Arrange in q1q2q1q2 format +++ fVal = _mm_shuffle_ps(complexL, complexH, _MM_SHUFFLE(3, 1, 3, 1)); +++ dVal = _mm256_cvtps_pd(fVal); +++ _mm256_storeu_pd(qBufferPtr, dVal); +++ +++ iBufferPtr += 4; +++ qBufferPtr += 4; +++ } +++ +++ number = quarterPoints * 4; +++ for (; number < num_points; number++) { +++ *iBufferPtr++ = *complexVectorPtr++; +++ *qBufferPtr++ = *complexVectorPtr++; +++ } ++ } ++ #endif /* LV_HAVE_AVX */ ++ ++ #ifdef LV_HAVE_SSE2 ++ #include ++ ++-static inline void ++-volk_32fc_deinterleave_64f_x2_u_sse2(double *iBuffer, double *qBuffer, ++- const lv_32fc_t *complexVector, ++- unsigned int num_points) { ++- unsigned int number = 0; ++- ++- const float *complexVectorPtr = (float *)complexVector; ++- double *iBufferPtr = iBuffer; ++- double *qBufferPtr = qBuffer; ++- ++- const unsigned int halfPoints = num_points / 2; ++- __m128 cplxValue, fVal; ++- __m128d dVal; ++- ++- for (; number < halfPoints; number++) { ++- ++- cplxValue = _mm_loadu_ps(complexVectorPtr); ++- complexVectorPtr += 4; ++- ++- // Arrange in i1i2i1i2 format ++- fVal = _mm_shuffle_ps(cplxValue, cplxValue, _MM_SHUFFLE(2, 0, 2, 0)); ++- dVal = _mm_cvtps_pd(fVal); ++- _mm_storeu_pd(iBufferPtr, dVal); ++- ++- // Arrange in q1q2q1q2 format ++- fVal = _mm_shuffle_ps(cplxValue, cplxValue, _MM_SHUFFLE(3, 1, 3, 1)); ++- dVal = _mm_cvtps_pd(fVal); ++- _mm_storeu_pd(qBufferPtr, dVal); ++- ++- iBufferPtr += 2; ++- qBufferPtr += 2; ++- } ++- ++- number = halfPoints * 2; ++- for (; number < num_points; number++) { ++- *iBufferPtr++ = *complexVectorPtr++; ++- *qBufferPtr++ = *complexVectorPtr++; ++- } +++static inline void volk_32fc_deinterleave_64f_x2_u_sse2(double* iBuffer, +++ double* qBuffer, +++ const lv_32fc_t* complexVector, +++ unsigned int num_points) +++{ +++ unsigned int number = 0; +++ +++ const float* complexVectorPtr = (float*)complexVector; +++ double* iBufferPtr = iBuffer; +++ double* qBufferPtr = qBuffer; +++ +++ const unsigned int halfPoints = num_points / 2; +++ __m128 cplxValue, fVal; +++ __m128d dVal; +++ +++ for (; number < halfPoints; number++) { +++ +++ cplxValue = _mm_loadu_ps(complexVectorPtr); +++ complexVectorPtr += 4; +++ +++ // Arrange in i1i2i1i2 format +++ fVal = _mm_shuffle_ps(cplxValue, cplxValue, _MM_SHUFFLE(2, 0, 2, 0)); +++ dVal = _mm_cvtps_pd(fVal); +++ _mm_storeu_pd(iBufferPtr, dVal); +++ +++ // Arrange in q1q2q1q2 format +++ fVal = _mm_shuffle_ps(cplxValue, cplxValue, _MM_SHUFFLE(3, 1, 3, 1)); +++ dVal = _mm_cvtps_pd(fVal); +++ _mm_storeu_pd(qBufferPtr, dVal); +++ +++ iBufferPtr += 2; +++ qBufferPtr += 2; +++ } +++ +++ number = halfPoints * 2; +++ for (; number < num_points; number++) { +++ *iBufferPtr++ = *complexVectorPtr++; +++ *qBufferPtr++ = *complexVectorPtr++; +++ } ++ } ++ #endif /* LV_HAVE_SSE */ ++ ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void ++-volk_32fc_deinterleave_64f_x2_generic(double *iBuffer, double *qBuffer, ++- const lv_32fc_t *complexVector, ++- unsigned int num_points) { ++- unsigned int number = 0; ++- const float *complexVectorPtr = (float *)complexVector; ++- 
double *iBufferPtr = iBuffer; ++- double *qBufferPtr = qBuffer; ++- ++- for (number = 0; number < num_points; number++) { ++- *iBufferPtr++ = (double)*complexVectorPtr++; ++- *qBufferPtr++ = (double)*complexVectorPtr++; ++- } +++static inline void volk_32fc_deinterleave_64f_x2_generic(double* iBuffer, +++ double* qBuffer, +++ const lv_32fc_t* complexVector, +++ unsigned int num_points) +++{ +++ unsigned int number = 0; +++ const float* complexVectorPtr = (float*)complexVector; +++ double* iBufferPtr = iBuffer; +++ double* qBufferPtr = qBuffer; +++ +++ for (number = 0; number < num_points; number++) { +++ *iBufferPtr++ = (double)*complexVectorPtr++; +++ *qBufferPtr++ = (double)*complexVectorPtr++; +++ } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++@@ -196,146 +199,150 @@ volk_32fc_deinterleave_64f_x2_generic(double *iBuffer, double *qBuffer, ++ #ifdef LV_HAVE_AVX ++ #include ++ ++-static inline void ++-volk_32fc_deinterleave_64f_x2_a_avx(double *iBuffer, double *qBuffer, ++- const lv_32fc_t *complexVector, ++- unsigned int num_points) { ++- unsigned int number = 0; ++- ++- const float *complexVectorPtr = (float *)complexVector; ++- double *iBufferPtr = iBuffer; ++- double *qBufferPtr = qBuffer; ++- ++- const unsigned int quarterPoints = num_points / 4; ++- __m256 cplxValue; ++- __m128 complexH, complexL, fVal; ++- __m256d dVal; ++- ++- for (; number < quarterPoints; number++) { ++- ++- cplxValue = _mm256_load_ps(complexVectorPtr); ++- complexVectorPtr += 8; ++- ++- complexH = _mm256_extractf128_ps(cplxValue, 1); ++- complexL = _mm256_extractf128_ps(cplxValue, 0); ++- ++- // Arrange in i1i2i1i2 format ++- fVal = _mm_shuffle_ps(complexL, complexH, _MM_SHUFFLE(2, 0, 2, 0)); ++- dVal = _mm256_cvtps_pd(fVal); ++- _mm256_store_pd(iBufferPtr, dVal); ++- ++- // Arrange in q1q2q1q2 format ++- fVal = _mm_shuffle_ps(complexL, complexH, _MM_SHUFFLE(3, 1, 3, 1)); ++- dVal = _mm256_cvtps_pd(fVal); ++- _mm256_store_pd(qBufferPtr, dVal); ++- ++- iBufferPtr += 4; ++- qBufferPtr += 4; ++- } ++- ++- number = quarterPoints * 4; ++- for (; number < num_points; number++) { ++- *iBufferPtr++ = *complexVectorPtr++; ++- *qBufferPtr++ = *complexVectorPtr++; ++- } +++static inline void volk_32fc_deinterleave_64f_x2_a_avx(double* iBuffer, +++ double* qBuffer, +++ const lv_32fc_t* complexVector, +++ unsigned int num_points) +++{ +++ unsigned int number = 0; +++ +++ const float* complexVectorPtr = (float*)complexVector; +++ double* iBufferPtr = iBuffer; +++ double* qBufferPtr = qBuffer; +++ +++ const unsigned int quarterPoints = num_points / 4; +++ __m256 cplxValue; +++ __m128 complexH, complexL, fVal; +++ __m256d dVal; +++ +++ for (; number < quarterPoints; number++) { +++ +++ cplxValue = _mm256_load_ps(complexVectorPtr); +++ complexVectorPtr += 8; +++ +++ complexH = _mm256_extractf128_ps(cplxValue, 1); +++ complexL = _mm256_extractf128_ps(cplxValue, 0); +++ +++ // Arrange in i1i2i1i2 format +++ fVal = _mm_shuffle_ps(complexL, complexH, _MM_SHUFFLE(2, 0, 2, 0)); +++ dVal = _mm256_cvtps_pd(fVal); +++ _mm256_store_pd(iBufferPtr, dVal); +++ +++ // Arrange in q1q2q1q2 format +++ fVal = _mm_shuffle_ps(complexL, complexH, _MM_SHUFFLE(3, 1, 3, 1)); +++ dVal = _mm256_cvtps_pd(fVal); +++ _mm256_store_pd(qBufferPtr, dVal); +++ +++ iBufferPtr += 4; +++ qBufferPtr += 4; +++ } +++ +++ number = quarterPoints * 4; +++ for (; number < num_points; number++) { +++ *iBufferPtr++ = *complexVectorPtr++; +++ *qBufferPtr++ = *complexVectorPtr++; +++ } ++ } ++ #endif /* LV_HAVE_AVX */ ++ ++ #ifdef LV_HAVE_SSE2 ++ #include ++ ++-static inline 
void ++-volk_32fc_deinterleave_64f_x2_a_sse2(double *iBuffer, double *qBuffer, ++- const lv_32fc_t *complexVector, ++- unsigned int num_points) { ++- unsigned int number = 0; ++- ++- const float *complexVectorPtr = (float *)complexVector; ++- double *iBufferPtr = iBuffer; ++- double *qBufferPtr = qBuffer; ++- ++- const unsigned int halfPoints = num_points / 2; ++- __m128 cplxValue, fVal; ++- __m128d dVal; ++- ++- for (; number < halfPoints; number++) { ++- ++- cplxValue = _mm_load_ps(complexVectorPtr); ++- complexVectorPtr += 4; ++- ++- // Arrange in i1i2i1i2 format ++- fVal = _mm_shuffle_ps(cplxValue, cplxValue, _MM_SHUFFLE(2, 0, 2, 0)); ++- dVal = _mm_cvtps_pd(fVal); ++- _mm_store_pd(iBufferPtr, dVal); ++- ++- // Arrange in q1q2q1q2 format ++- fVal = _mm_shuffle_ps(cplxValue, cplxValue, _MM_SHUFFLE(3, 1, 3, 1)); ++- dVal = _mm_cvtps_pd(fVal); ++- _mm_store_pd(qBufferPtr, dVal); ++- ++- iBufferPtr += 2; ++- qBufferPtr += 2; ++- } ++- ++- number = halfPoints * 2; ++- for (; number < num_points; number++) { ++- *iBufferPtr++ = *complexVectorPtr++; ++- *qBufferPtr++ = *complexVectorPtr++; ++- } +++static inline void volk_32fc_deinterleave_64f_x2_a_sse2(double* iBuffer, +++ double* qBuffer, +++ const lv_32fc_t* complexVector, +++ unsigned int num_points) +++{ +++ unsigned int number = 0; +++ +++ const float* complexVectorPtr = (float*)complexVector; +++ double* iBufferPtr = iBuffer; +++ double* qBufferPtr = qBuffer; +++ +++ const unsigned int halfPoints = num_points / 2; +++ __m128 cplxValue, fVal; +++ __m128d dVal; +++ +++ for (; number < halfPoints; number++) { +++ +++ cplxValue = _mm_load_ps(complexVectorPtr); +++ complexVectorPtr += 4; +++ +++ // Arrange in i1i2i1i2 format +++ fVal = _mm_shuffle_ps(cplxValue, cplxValue, _MM_SHUFFLE(2, 0, 2, 0)); +++ dVal = _mm_cvtps_pd(fVal); +++ _mm_store_pd(iBufferPtr, dVal); +++ +++ // Arrange in q1q2q1q2 format +++ fVal = _mm_shuffle_ps(cplxValue, cplxValue, _MM_SHUFFLE(3, 1, 3, 1)); +++ dVal = _mm_cvtps_pd(fVal); +++ _mm_store_pd(qBufferPtr, dVal); +++ +++ iBufferPtr += 2; +++ qBufferPtr += 2; +++ } +++ +++ number = halfPoints * 2; +++ for (; number < num_points; number++) { +++ *iBufferPtr++ = *complexVectorPtr++; +++ *qBufferPtr++ = *complexVectorPtr++; +++ } ++ } ++ #endif /* LV_HAVE_SSE */ ++ ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void ++-volk_32fc_deinterleave_64f_x2_a_generic(double *iBuffer, double *qBuffer, ++- const lv_32fc_t *complexVector, ++- unsigned int num_points) { ++- unsigned int number = 0; ++- const float *complexVectorPtr = (float *)complexVector; ++- double *iBufferPtr = iBuffer; ++- double *qBufferPtr = qBuffer; ++- ++- for (number = 0; number < num_points; number++) { ++- *iBufferPtr++ = (double)*complexVectorPtr++; ++- *qBufferPtr++ = (double)*complexVectorPtr++; ++- } +++static inline void volk_32fc_deinterleave_64f_x2_a_generic(double* iBuffer, +++ double* qBuffer, +++ const lv_32fc_t* complexVector, +++ unsigned int num_points) +++{ +++ unsigned int number = 0; +++ const float* complexVectorPtr = (float*)complexVector; +++ double* iBufferPtr = iBuffer; +++ double* qBufferPtr = qBuffer; +++ +++ for (number = 0; number < num_points; number++) { +++ *iBufferPtr++ = (double)*complexVectorPtr++; +++ *qBufferPtr++ = (double)*complexVectorPtr++; +++ } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++ #ifdef LV_HAVE_NEONV8 ++ #include ++ ++-static inline void ++-volk_32fc_deinterleave_64f_x2_neon(double *iBuffer, double *qBuffer, ++- const lv_32fc_t *complexVector, ++- unsigned int num_points) { ++- unsigned int number = 0; ++- 
unsigned int half_points = num_points / 2; ++- const float *complexVectorPtr = (float *)complexVector; ++- double *iBufferPtr = iBuffer; ++- double *qBufferPtr = qBuffer; ++- float32x2x2_t complexInput; ++- float64x2_t iVal, qVal; ++- ++- for (number = 0; number < half_points; number++) { ++- complexInput = vld2_f32(complexVectorPtr); ++- ++- iVal = vcvt_f64_f32(complexInput.val[0]); ++- qVal = vcvt_f64_f32(complexInput.val[1]); ++- ++- vst1q_f64(iBufferPtr, iVal); ++- vst1q_f64(qBufferPtr, qVal); ++- ++- complexVectorPtr += 4; ++- iBufferPtr += 2; ++- qBufferPtr += 2; ++- } ++- ++- for (number = half_points * 2; number < num_points; number++) { ++- *iBufferPtr++ = (double)*complexVectorPtr++; ++- *qBufferPtr++ = (double)*complexVectorPtr++; ++- } +++static inline void volk_32fc_deinterleave_64f_x2_neon(double* iBuffer, +++ double* qBuffer, +++ const lv_32fc_t* complexVector, +++ unsigned int num_points) +++{ +++ unsigned int number = 0; +++ unsigned int half_points = num_points / 2; +++ const float* complexVectorPtr = (float*)complexVector; +++ double* iBufferPtr = iBuffer; +++ double* qBufferPtr = qBuffer; +++ float32x2x2_t complexInput; +++ float64x2_t iVal, qVal; +++ +++ for (number = 0; number < half_points; number++) { +++ complexInput = vld2_f32(complexVectorPtr); +++ +++ iVal = vcvt_f64_f32(complexInput.val[0]); +++ qVal = vcvt_f64_f32(complexInput.val[1]); +++ +++ vst1q_f64(iBufferPtr, iVal); +++ vst1q_f64(qBufferPtr, qVal); +++ +++ complexVectorPtr += 4; +++ iBufferPtr += 2; +++ qBufferPtr += 2; +++ } +++ +++ for (number = half_points * 2; number < num_points; number++) { +++ *iBufferPtr++ = (double)*complexVectorPtr++; +++ *qBufferPtr++ = (double)*complexVectorPtr++; +++ } ++ } ++ #endif /* LV_HAVE_NEONV8 */ ++ ++diff --git a/kernels/volk/volk_32fc_deinterleave_imag_32f.h b/kernels/volk/volk_32fc_deinterleave_imag_32f.h ++index 13f9764..e3dfa12 100644 ++--- a/kernels/volk/volk_32fc_deinterleave_imag_32f.h +++++ b/kernels/volk/volk_32fc_deinterleave_imag_32f.h ++@@ -30,8 +30,8 @@ ++ * ++ * Dispatcher Prototype ++ * \code ++- * void volk_32fc_deinterleave_image_32f(float* qBuffer, const lv_32fc_t* complexVector, unsigned int num_points) ++- * \endcode +++ * void volk_32fc_deinterleave_image_32f(float* qBuffer, const lv_32fc_t* complexVector, +++ * unsigned int num_points) \endcode ++ * ++ * \b Inputs ++ * \li complexVector: The complex input vector. 
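Note: the deinterleave kernels in this family are normally reached through their dispatcher, which selects an aligned or unaligned SIMD variant at run time. A hedged usage sketch for volk_32fc_deinterleave_64f_x2, using volk's own aligned allocator (buffer sizes and the helper name are illustrative):

#include <volk/volk.h>

/* Split an interleaved complex-float buffer into separate double-precision
 * I and Q buffers via the run-time dispatcher. */
static void split_iq(const lv_32fc_t* samples, unsigned int num_points)
{
    const size_t alignment = volk_get_alignment();
    double* i_buffer = (double*)volk_malloc(num_points * sizeof(double), alignment);
    double* q_buffer = (double*)volk_malloc(num_points * sizeof(double), alignment);

    /* The dispatcher picks e.g. an aligned AVX variant when both buffers are aligned. */
    volk_32fc_deinterleave_64f_x2(i_buffer, q_buffer, samples, num_points);

    /* ... consume i_buffer / q_buffer ... */

    volk_free(i_buffer);
    volk_free(q_buffer);
}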
++@@ -76,121 +76,121 @@ ++ #ifdef LV_HAVE_AVX ++ #include ++ ++-static inline void ++-volk_32fc_deinterleave_imag_32f_a_avx(float* qBuffer, const lv_32fc_t* complexVector, ++- unsigned int num_points) +++static inline void volk_32fc_deinterleave_imag_32f_a_avx(float* qBuffer, +++ const lv_32fc_t* complexVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int eighthPoints = num_points / 8; ++- const float* complexVectorPtr = (const float*)complexVector; ++- float* qBufferPtr = qBuffer; +++ unsigned int number = 0; +++ const unsigned int eighthPoints = num_points / 8; +++ const float* complexVectorPtr = (const float*)complexVector; +++ float* qBufferPtr = qBuffer; ++ ++- __m256 cplxValue1, cplxValue2, complex1, complex2, qValue; ++- for(;number < eighthPoints; number++){ +++ __m256 cplxValue1, cplxValue2, complex1, complex2, qValue; +++ for (; number < eighthPoints; number++) { ++ ++- cplxValue1 = _mm256_load_ps(complexVectorPtr); ++- complexVectorPtr += 8; +++ cplxValue1 = _mm256_load_ps(complexVectorPtr); +++ complexVectorPtr += 8; ++ ++- cplxValue2 = _mm256_load_ps(complexVectorPtr); ++- complexVectorPtr += 8; +++ cplxValue2 = _mm256_load_ps(complexVectorPtr); +++ complexVectorPtr += 8; ++ ++- complex1 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x20); ++- complex2 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x31); +++ complex1 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x20); +++ complex2 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x31); ++ ++- // Arrange in q1q2q3q4 format ++- qValue = _mm256_shuffle_ps(complex1, complex2, 0xdd); +++ // Arrange in q1q2q3q4 format +++ qValue = _mm256_shuffle_ps(complex1, complex2, 0xdd); ++ ++- _mm256_store_ps(qBufferPtr, qValue); +++ _mm256_store_ps(qBufferPtr, qValue); ++ ++- qBufferPtr += 8; ++- } +++ qBufferPtr += 8; +++ } ++ ++- number = eighthPoints * 8; ++- for(; number < num_points; number++){ ++- complexVectorPtr++; ++- *qBufferPtr++ = *complexVectorPtr++; ++- } +++ number = eighthPoints * 8; +++ for (; number < num_points; number++) { +++ complexVectorPtr++; +++ *qBufferPtr++ = *complexVectorPtr++; +++ } ++ } ++ #endif /* LV_HAVE_AVX */ ++ ++ #ifdef LV_HAVE_SSE ++ #include ++ ++-static inline void ++-volk_32fc_deinterleave_imag_32f_a_sse(float* qBuffer, const lv_32fc_t* complexVector, ++- unsigned int num_points) +++static inline void volk_32fc_deinterleave_imag_32f_a_sse(float* qBuffer, +++ const lv_32fc_t* complexVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int quarterPoints = num_points / 4; +++ unsigned int number = 0; +++ const unsigned int quarterPoints = num_points / 4; ++ ++- const float* complexVectorPtr = (const float*)complexVector; ++- float* qBufferPtr = qBuffer; +++ const float* complexVectorPtr = (const float*)complexVector; +++ float* qBufferPtr = qBuffer; ++ ++- __m128 cplxValue1, cplxValue2, iValue; ++- for(;number < quarterPoints; number++){ +++ __m128 cplxValue1, cplxValue2, iValue; +++ for (; number < quarterPoints; number++) { ++ ++- cplxValue1 = _mm_load_ps(complexVectorPtr); ++- complexVectorPtr += 4; +++ cplxValue1 = _mm_load_ps(complexVectorPtr); +++ complexVectorPtr += 4; ++ ++- cplxValue2 = _mm_load_ps(complexVectorPtr); ++- complexVectorPtr += 4; +++ cplxValue2 = _mm_load_ps(complexVectorPtr); +++ complexVectorPtr += 4; ++ ++- // Arrange in q1q2q3q4 format ++- iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1)); +++ // Arrange in q1q2q3q4 format +++ iValue = _mm_shuffle_ps(cplxValue1, 
cplxValue2, _MM_SHUFFLE(3, 1, 3, 1)); ++ ++- _mm_store_ps(qBufferPtr, iValue); +++ _mm_store_ps(qBufferPtr, iValue); ++ ++- qBufferPtr += 4; ++- } +++ qBufferPtr += 4; +++ } ++ ++- number = quarterPoints * 4; ++- for(; number < num_points; number++){ ++- complexVectorPtr++; ++- *qBufferPtr++ = *complexVectorPtr++; ++- } +++ number = quarterPoints * 4; +++ for (; number < num_points; number++) { +++ complexVectorPtr++; +++ *qBufferPtr++ = *complexVectorPtr++; +++ } ++ } ++ #endif /* LV_HAVE_SSE */ ++ ++ #ifdef LV_HAVE_NEON ++ #include ++ ++-static inline void ++-volk_32fc_deinterleave_imag_32f_neon(float* qBuffer, const lv_32fc_t* complexVector, ++- unsigned int num_points) +++static inline void volk_32fc_deinterleave_imag_32f_neon(float* qBuffer, +++ const lv_32fc_t* complexVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- unsigned int quarter_points = num_points / 4; ++- const float* complexVectorPtr = (float*)complexVector; ++- float* qBufferPtr = qBuffer; ++- float32x4x2_t complexInput; ++- ++- for(number = 0; number < quarter_points; number++){ ++- complexInput = vld2q_f32(complexVectorPtr); ++- vst1q_f32( qBufferPtr, complexInput.val[1] ); ++- complexVectorPtr += 8; ++- qBufferPtr += 4; ++- } ++- ++- for(number = quarter_points*4; number < num_points; number++){ ++- complexVectorPtr++; ++- *qBufferPtr++ = *complexVectorPtr++; ++- } +++ unsigned int number = 0; +++ unsigned int quarter_points = num_points / 4; +++ const float* complexVectorPtr = (float*)complexVector; +++ float* qBufferPtr = qBuffer; +++ float32x4x2_t complexInput; +++ +++ for (number = 0; number < quarter_points; number++) { +++ complexInput = vld2q_f32(complexVectorPtr); +++ vst1q_f32(qBufferPtr, complexInput.val[1]); +++ complexVectorPtr += 8; +++ qBufferPtr += 4; +++ } +++ +++ for (number = quarter_points * 4; number < num_points; number++) { +++ complexVectorPtr++; +++ *qBufferPtr++ = *complexVectorPtr++; +++ } ++ } ++ #endif /* LV_HAVE_NEON */ ++ ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void ++-volk_32fc_deinterleave_imag_32f_generic(float* qBuffer, const lv_32fc_t* complexVector, ++- unsigned int num_points) +++static inline void volk_32fc_deinterleave_imag_32f_generic(float* qBuffer, +++ const lv_32fc_t* complexVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const float* complexVectorPtr = (float*)complexVector; ++- float* qBufferPtr = qBuffer; ++- for(number = 0; number < num_points; number++){ ++- complexVectorPtr++; ++- *qBufferPtr++ = *complexVectorPtr++; ++- } +++ unsigned int number = 0; +++ const float* complexVectorPtr = (float*)complexVector; +++ float* qBufferPtr = qBuffer; +++ for (number = 0; number < num_points; number++) { +++ complexVectorPtr++; +++ *qBufferPtr++ = *complexVectorPtr++; +++ } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++@@ -206,40 +206,40 @@ volk_32fc_deinterleave_imag_32f_generic(float* qBuffer, const lv_32fc_t* complex ++ #ifdef LV_HAVE_AVX ++ #include ++ ++-static inline void ++-volk_32fc_deinterleave_imag_32f_u_avx(float* qBuffer, const lv_32fc_t* complexVector, ++- unsigned int num_points) +++static inline void volk_32fc_deinterleave_imag_32f_u_avx(float* qBuffer, +++ const lv_32fc_t* complexVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int eighthPoints = num_points / 8; ++- const float* complexVectorPtr = (const float*)complexVector; ++- float* qBufferPtr = qBuffer; +++ unsigned int number = 0; +++ const unsigned int eighthPoints = num_points / 8; +++ const float* 
complexVectorPtr = (const float*)complexVector; +++ float* qBufferPtr = qBuffer; ++ ++- __m256 cplxValue1, cplxValue2, complex1, complex2, qValue; ++- for(;number < eighthPoints; number++){ +++ __m256 cplxValue1, cplxValue2, complex1, complex2, qValue; +++ for (; number < eighthPoints; number++) { ++ ++- cplxValue1 = _mm256_loadu_ps(complexVectorPtr); ++- complexVectorPtr += 8; +++ cplxValue1 = _mm256_loadu_ps(complexVectorPtr); +++ complexVectorPtr += 8; ++ ++- cplxValue2 = _mm256_loadu_ps(complexVectorPtr); ++- complexVectorPtr += 8; +++ cplxValue2 = _mm256_loadu_ps(complexVectorPtr); +++ complexVectorPtr += 8; ++ ++- complex1 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x20); ++- complex2 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x31); +++ complex1 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x20); +++ complex2 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x31); ++ ++- // Arrange in q1q2q3q4 format ++- qValue = _mm256_shuffle_ps(complex1, complex2, 0xdd); +++ // Arrange in q1q2q3q4 format +++ qValue = _mm256_shuffle_ps(complex1, complex2, 0xdd); ++ ++- _mm256_storeu_ps(qBufferPtr, qValue); +++ _mm256_storeu_ps(qBufferPtr, qValue); ++ ++- qBufferPtr += 8; ++- } +++ qBufferPtr += 8; +++ } ++ ++- number = eighthPoints * 8; ++- for(; number < num_points; number++){ ++- complexVectorPtr++; ++- *qBufferPtr++ = *complexVectorPtr++; ++- } +++ number = eighthPoints * 8; +++ for (; number < num_points; number++) { +++ complexVectorPtr++; +++ *qBufferPtr++ = *complexVectorPtr++; +++ } ++ } ++ #endif /* LV_HAVE_AVX */ ++ #endif /* INCLUDED_volk_32fc_deinterleave_imag_32f_u_H */ ++diff --git a/kernels/volk/volk_32fc_deinterleave_real_32f.h b/kernels/volk/volk_32fc_deinterleave_real_32f.h ++index 92a94d3..2526a16 100644 ++--- a/kernels/volk/volk_32fc_deinterleave_real_32f.h +++++ b/kernels/volk/volk_32fc_deinterleave_real_32f.h ++@@ -30,8 +30,8 @@ ++ * ++ * Dispatcher Prototype ++ * \code ++- * void volk_32fc_deinterleave_real_32f(float* iBuffer, const lv_32fc_t* complexVector, unsigned int num_points) ++- * \endcode +++ * void volk_32fc_deinterleave_real_32f(float* iBuffer, const lv_32fc_t* complexVector, +++ * unsigned int num_points) \endcode ++ * ++ * \b Inputs ++ * \li complexVector: The complex input vector. 
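Note: the imag/real deinterleave kernels above and below differ only in which half of each interleaved (re, im) pair they keep; in the SSE variants that is the _MM_SHUFFLE(3, 1, 3, 1) (odd, imaginary lanes) versus _MM_SHUFFLE(2, 0, 2, 0) (even, real lanes) selection. A scalar sketch of the same even/odd split over a plain float view of the data (names are illustrative):

/* Even indices of the interleaved stream carry the real parts, odd indices
 * the imaginary parts; the SIMD shuffles in these kernels gather the same lanes. */
static void deinterleave_ref(float* re, float* im, const float* interleaved, unsigned int num_points)
{
    for (unsigned int k = 0; k < num_points; k++) {
        re[k] = interleaved[2 * k];
        im[k] = interleaved[2 * k + 1];
    }
}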
++@@ -76,96 +76,96 @@ ++ #ifdef LV_HAVE_AVX2 ++ #include ++ ++-static inline void ++-volk_32fc_deinterleave_real_32f_a_avx2(float* iBuffer, const lv_32fc_t* complexVector, ++- unsigned int num_points) +++static inline void volk_32fc_deinterleave_real_32f_a_avx2(float* iBuffer, +++ const lv_32fc_t* complexVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int eighthPoints = num_points / 8; +++ unsigned int number = 0; +++ const unsigned int eighthPoints = num_points / 8; ++ ++- const float* complexVectorPtr = (const float*)complexVector; ++- float* iBufferPtr = iBuffer; +++ const float* complexVectorPtr = (const float*)complexVector; +++ float* iBufferPtr = iBuffer; ++ ++- __m256 cplxValue1, cplxValue2; ++- __m256 iValue; ++- __m256i idx = _mm256_set_epi32(7,6,3,2,5,4,1,0); ++- for(;number < eighthPoints; number++){ +++ __m256 cplxValue1, cplxValue2; +++ __m256 iValue; +++ __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0); +++ for (; number < eighthPoints; number++) { ++ ++- cplxValue1 = _mm256_load_ps(complexVectorPtr); ++- complexVectorPtr += 8; +++ cplxValue1 = _mm256_load_ps(complexVectorPtr); +++ complexVectorPtr += 8; ++ ++- cplxValue2 = _mm256_load_ps(complexVectorPtr); ++- complexVectorPtr += 8; +++ cplxValue2 = _mm256_load_ps(complexVectorPtr); +++ complexVectorPtr += 8; ++ ++- // Arrange in i1i2i3i4 format ++- iValue = _mm256_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); ++- iValue = _mm256_permutevar8x32_ps(iValue,idx); +++ // Arrange in i1i2i3i4 format +++ iValue = _mm256_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2, 0, 2, 0)); +++ iValue = _mm256_permutevar8x32_ps(iValue, idx); ++ ++- _mm256_store_ps(iBufferPtr, iValue); +++ _mm256_store_ps(iBufferPtr, iValue); ++ ++- iBufferPtr += 8; ++- } +++ iBufferPtr += 8; +++ } ++ ++- number = eighthPoints * 8; ++- for(; number < num_points; number++){ ++- *iBufferPtr++ = *complexVectorPtr++; ++- complexVectorPtr++; ++- } +++ number = eighthPoints * 8; +++ for (; number < num_points; number++) { +++ *iBufferPtr++ = *complexVectorPtr++; +++ complexVectorPtr++; +++ } ++ } ++ #endif /* LV_HAVE_AVX2 */ ++ ++ #ifdef LV_HAVE_SSE ++ #include ++ ++-static inline void ++-volk_32fc_deinterleave_real_32f_a_sse(float* iBuffer, const lv_32fc_t* complexVector, ++- unsigned int num_points) +++static inline void volk_32fc_deinterleave_real_32f_a_sse(float* iBuffer, +++ const lv_32fc_t* complexVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int quarterPoints = num_points / 4; +++ unsigned int number = 0; +++ const unsigned int quarterPoints = num_points / 4; ++ ++- const float* complexVectorPtr = (const float*)complexVector; ++- float* iBufferPtr = iBuffer; +++ const float* complexVectorPtr = (const float*)complexVector; +++ float* iBufferPtr = iBuffer; ++ ++- __m128 cplxValue1, cplxValue2, iValue; ++- for(;number < quarterPoints; number++){ +++ __m128 cplxValue1, cplxValue2, iValue; +++ for (; number < quarterPoints; number++) { ++ ++- cplxValue1 = _mm_load_ps(complexVectorPtr); ++- complexVectorPtr += 4; +++ cplxValue1 = _mm_load_ps(complexVectorPtr); +++ complexVectorPtr += 4; ++ ++- cplxValue2 = _mm_load_ps(complexVectorPtr); ++- complexVectorPtr += 4; +++ cplxValue2 = _mm_load_ps(complexVectorPtr); +++ complexVectorPtr += 4; ++ ++- // Arrange in i1i2i3i4 format ++- iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); +++ // Arrange in i1i2i3i4 format +++ iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2, 0, 2, 0)); ++ 
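/* (Illustrative aside on the shuffle above, not taken from the upstream sources:
 * with interleaved input re0,im0,re1,im1,..., _mm_shuffle_ps(a, b, _MM_SHUFFLE(2, 0, 2, 0))
 * selects elements {0,2} of each operand, i.e. {a0, a2, b0, b2}, so a single shuffle
 * gathers the four real parts loaded by the two _mm_load_ps calls. A scalar sketch of the
 * same step, with pointers taken at the start of the iteration:
 *     for (int k = 0; k < 4; k++)
 *         iBufferPtr[k] = complexVectorPtr[2 * k];
 * ) */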
++- _mm_store_ps(iBufferPtr, iValue); +++ _mm_store_ps(iBufferPtr, iValue); ++ ++- iBufferPtr += 4; ++- } +++ iBufferPtr += 4; +++ } ++ ++- number = quarterPoints * 4; ++- for(; number < num_points; number++){ ++- *iBufferPtr++ = *complexVectorPtr++; ++- complexVectorPtr++; ++- } +++ number = quarterPoints * 4; +++ for (; number < num_points; number++) { +++ *iBufferPtr++ = *complexVectorPtr++; +++ complexVectorPtr++; +++ } ++ } ++ #endif /* LV_HAVE_SSE */ ++ ++ ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void ++-volk_32fc_deinterleave_real_32f_generic(float* iBuffer, const lv_32fc_t* complexVector, ++- unsigned int num_points) +++static inline void volk_32fc_deinterleave_real_32f_generic(float* iBuffer, +++ const lv_32fc_t* complexVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const float* complexVectorPtr = (float*)complexVector; ++- float* iBufferPtr = iBuffer; ++- for(number = 0; number < num_points; number++){ ++- *iBufferPtr++ = *complexVectorPtr++; ++- complexVectorPtr++; ++- } +++ unsigned int number = 0; +++ const float* complexVectorPtr = (float*)complexVector; +++ float* iBufferPtr = iBuffer; +++ for (number = 0; number < num_points; number++) { +++ *iBufferPtr++ = *complexVectorPtr++; +++ complexVectorPtr++; +++ } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++@@ -173,27 +173,27 @@ volk_32fc_deinterleave_real_32f_generic(float* iBuffer, const lv_32fc_t* complex ++ #ifdef LV_HAVE_NEON ++ #include ++ ++-static inline void ++-volk_32fc_deinterleave_real_32f_neon(float* iBuffer, const lv_32fc_t* complexVector, ++- unsigned int num_points) +++static inline void volk_32fc_deinterleave_real_32f_neon(float* iBuffer, +++ const lv_32fc_t* complexVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- unsigned int quarter_points = num_points / 4; ++- const float* complexVectorPtr = (float*)complexVector; ++- float* iBufferPtr = iBuffer; ++- float32x4x2_t complexInput; ++- ++- for(number = 0; number < quarter_points; number++){ ++- complexInput = vld2q_f32(complexVectorPtr); ++- vst1q_f32( iBufferPtr, complexInput.val[0] ); ++- complexVectorPtr += 8; ++- iBufferPtr += 4; ++- } ++- ++- for(number = quarter_points*4; number < num_points; number++){ ++- *iBufferPtr++ = *complexVectorPtr++; ++- complexVectorPtr++; ++- } +++ unsigned int number = 0; +++ unsigned int quarter_points = num_points / 4; +++ const float* complexVectorPtr = (float*)complexVector; +++ float* iBufferPtr = iBuffer; +++ float32x4x2_t complexInput; +++ +++ for (number = 0; number < quarter_points; number++) { +++ complexInput = vld2q_f32(complexVectorPtr); +++ vst1q_f32(iBufferPtr, complexInput.val[0]); +++ complexVectorPtr += 8; +++ iBufferPtr += 4; +++ } +++ +++ for (number = quarter_points * 4; number < num_points; number++) { +++ *iBufferPtr++ = *complexVectorPtr++; +++ complexVectorPtr++; +++ } ++ } ++ #endif /* LV_HAVE_NEON */ ++ ++@@ -209,41 +209,41 @@ volk_32fc_deinterleave_real_32f_neon(float* iBuffer, const lv_32fc_t* complexVec ++ #ifdef LV_HAVE_AVX2 ++ #include ++ ++-static inline void ++-volk_32fc_deinterleave_real_32f_u_avx2(float* iBuffer, const lv_32fc_t* complexVector, ++- unsigned int num_points) +++static inline void volk_32fc_deinterleave_real_32f_u_avx2(float* iBuffer, +++ const lv_32fc_t* complexVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int eighthPoints = num_points / 8; +++ unsigned int number = 0; +++ const unsigned int eighthPoints = num_points / 8; ++ ++- const float* complexVectorPtr = (const 
float*)complexVector; ++- float* iBufferPtr = iBuffer; +++ const float* complexVectorPtr = (const float*)complexVector; +++ float* iBufferPtr = iBuffer; ++ ++- __m256 cplxValue1, cplxValue2; ++- __m256 iValue; ++- __m256i idx = _mm256_set_epi32(7,6,3,2,5,4,1,0); ++- for(;number < eighthPoints; number++){ +++ __m256 cplxValue1, cplxValue2; +++ __m256 iValue; +++ __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0); +++ for (; number < eighthPoints; number++) { ++ ++- cplxValue1 = _mm256_loadu_ps(complexVectorPtr); ++- complexVectorPtr += 8; +++ cplxValue1 = _mm256_loadu_ps(complexVectorPtr); +++ complexVectorPtr += 8; ++ ++- cplxValue2 = _mm256_loadu_ps(complexVectorPtr); ++- complexVectorPtr += 8; +++ cplxValue2 = _mm256_loadu_ps(complexVectorPtr); +++ complexVectorPtr += 8; ++ ++- // Arrange in i1i2i3i4 format ++- iValue = _mm256_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); ++- iValue = _mm256_permutevar8x32_ps(iValue,idx); +++ // Arrange in i1i2i3i4 format +++ iValue = _mm256_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2, 0, 2, 0)); +++ iValue = _mm256_permutevar8x32_ps(iValue, idx); ++ ++- _mm256_storeu_ps(iBufferPtr, iValue); +++ _mm256_storeu_ps(iBufferPtr, iValue); ++ ++- iBufferPtr += 8; ++- } +++ iBufferPtr += 8; +++ } ++ ++- number = eighthPoints * 8; ++- for(; number < num_points; number++){ ++- *iBufferPtr++ = *complexVectorPtr++; ++- complexVectorPtr++; ++- } +++ number = eighthPoints * 8; +++ for (; number < num_points; number++) { +++ *iBufferPtr++ = *complexVectorPtr++; +++ complexVectorPtr++; +++ } ++ } ++ #endif /* LV_HAVE_AVX2 */ ++ ++diff --git a/kernels/volk/volk_32fc_deinterleave_real_64f.h b/kernels/volk/volk_32fc_deinterleave_real_64f.h ++index 3d6e901..9ec7769 100644 ++--- a/kernels/volk/volk_32fc_deinterleave_real_64f.h +++++ b/kernels/volk/volk_32fc_deinterleave_real_64f.h ++@@ -77,124 +77,132 @@ ++ #ifdef LV_HAVE_AVX2 ++ #include ++ ++-static inline void volk_32fc_deinterleave_real_64f_a_avx2( ++- double *iBuffer, const lv_32fc_t *complexVector, unsigned int num_points) { ++- unsigned int number = 0; ++- ++- const float *complexVectorPtr = (float *)complexVector; ++- double *iBufferPtr = iBuffer; ++- ++- const unsigned int quarterPoints = num_points / 4; ++- __m256 cplxValue; ++- __m128 fVal; ++- __m256d dVal; ++- __m256i idx = _mm256_set_epi32(0, 0, 0, 0, 6, 4, 2, 0); ++- for (; number < quarterPoints; number++) { ++- ++- cplxValue = _mm256_load_ps(complexVectorPtr); ++- complexVectorPtr += 8; ++- ++- // Arrange in i1i2i1i2 format ++- cplxValue = _mm256_permutevar8x32_ps(cplxValue, idx); ++- fVal = _mm256_extractf128_ps(cplxValue, 0); ++- dVal = _mm256_cvtps_pd(fVal); ++- _mm256_store_pd(iBufferPtr, dVal); ++- ++- iBufferPtr += 4; ++- } ++- ++- number = quarterPoints * 4; ++- for (; number < num_points; number++) { ++- *iBufferPtr++ = (double)*complexVectorPtr++; ++- complexVectorPtr++; ++- } +++static inline void volk_32fc_deinterleave_real_64f_a_avx2(double* iBuffer, +++ const lv_32fc_t* complexVector, +++ unsigned int num_points) +++{ +++ unsigned int number = 0; +++ +++ const float* complexVectorPtr = (float*)complexVector; +++ double* iBufferPtr = iBuffer; +++ +++ const unsigned int quarterPoints = num_points / 4; +++ __m256 cplxValue; +++ __m128 fVal; +++ __m256d dVal; +++ __m256i idx = _mm256_set_epi32(0, 0, 0, 0, 6, 4, 2, 0); +++ for (; number < quarterPoints; number++) { +++ +++ cplxValue = _mm256_load_ps(complexVectorPtr); +++ complexVectorPtr += 8; +++ +++ // Arrange in i1i2i1i2 format +++ cplxValue = 
_mm256_permutevar8x32_ps(cplxValue, idx); +++ fVal = _mm256_extractf128_ps(cplxValue, 0); +++ dVal = _mm256_cvtps_pd(fVal); +++ _mm256_store_pd(iBufferPtr, dVal); +++ +++ iBufferPtr += 4; +++ } +++ +++ number = quarterPoints * 4; +++ for (; number < num_points; number++) { +++ *iBufferPtr++ = (double)*complexVectorPtr++; +++ complexVectorPtr++; +++ } ++ } ++ #endif /* LV_HAVE_AVX2 */ ++ ++ #ifdef LV_HAVE_SSE2 ++ #include ++ ++-static inline void volk_32fc_deinterleave_real_64f_a_sse2( ++- double *iBuffer, const lv_32fc_t *complexVector, unsigned int num_points) { ++- unsigned int number = 0; +++static inline void volk_32fc_deinterleave_real_64f_a_sse2(double* iBuffer, +++ const lv_32fc_t* complexVector, +++ unsigned int num_points) +++{ +++ unsigned int number = 0; ++ ++- const float *complexVectorPtr = (float *)complexVector; ++- double *iBufferPtr = iBuffer; +++ const float* complexVectorPtr = (float*)complexVector; +++ double* iBufferPtr = iBuffer; ++ ++- const unsigned int halfPoints = num_points / 2; ++- __m128 cplxValue, fVal; ++- __m128d dVal; ++- for (; number < halfPoints; number++) { +++ const unsigned int halfPoints = num_points / 2; +++ __m128 cplxValue, fVal; +++ __m128d dVal; +++ for (; number < halfPoints; number++) { ++ ++- cplxValue = _mm_load_ps(complexVectorPtr); ++- complexVectorPtr += 4; +++ cplxValue = _mm_load_ps(complexVectorPtr); +++ complexVectorPtr += 4; ++ ++- // Arrange in i1i2i1i2 format ++- fVal = _mm_shuffle_ps(cplxValue, cplxValue, _MM_SHUFFLE(2, 0, 2, 0)); ++- dVal = _mm_cvtps_pd(fVal); ++- _mm_store_pd(iBufferPtr, dVal); +++ // Arrange in i1i2i1i2 format +++ fVal = _mm_shuffle_ps(cplxValue, cplxValue, _MM_SHUFFLE(2, 0, 2, 0)); +++ dVal = _mm_cvtps_pd(fVal); +++ _mm_store_pd(iBufferPtr, dVal); ++ ++- iBufferPtr += 2; ++- } +++ iBufferPtr += 2; +++ } ++ ++- number = halfPoints * 2; ++- for (; number < num_points; number++) { ++- *iBufferPtr++ = (double)*complexVectorPtr++; ++- complexVectorPtr++; ++- } +++ number = halfPoints * 2; +++ for (; number < num_points; number++) { +++ *iBufferPtr++ = (double)*complexVectorPtr++; +++ complexVectorPtr++; +++ } ++ } ++ #endif /* LV_HAVE_SSE */ ++ ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void volk_32fc_deinterleave_real_64f_generic( ++- double *iBuffer, const lv_32fc_t *complexVector, unsigned int num_points) { ++- unsigned int number = 0; ++- const float *complexVectorPtr = (float *)complexVector; ++- double *iBufferPtr = iBuffer; ++- for (number = 0; number < num_points; number++) { ++- *iBufferPtr++ = (double)*complexVectorPtr++; ++- complexVectorPtr++; ++- } +++static inline void volk_32fc_deinterleave_real_64f_generic(double* iBuffer, +++ const lv_32fc_t* complexVector, +++ unsigned int num_points) +++{ +++ unsigned int number = 0; +++ const float* complexVectorPtr = (float*)complexVector; +++ double* iBufferPtr = iBuffer; +++ for (number = 0; number < num_points; number++) { +++ *iBufferPtr++ = (double)*complexVectorPtr++; +++ complexVectorPtr++; +++ } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++ #ifdef LV_HAVE_NEONV8 ++ #include ++ ++-static inline void volk_32fc_deinterleave_real_64f_neon( ++- double *iBuffer, const lv_32fc_t *complexVector, unsigned int num_points) { ++- unsigned int number = 0; ++- unsigned int quarter_points = num_points / 4; ++- const float *complexVectorPtr = (float *)complexVector; ++- double *iBufferPtr = iBuffer; ++- float32x2x4_t complexInput; ++- float64x2_t iVal1; ++- float64x2_t iVal2; ++- float64x2x2_t iVal; ++- ++- for (number = 0; number < quarter_points; number++) { ++- // 
Load data into register ++- complexInput = vld4_f32(complexVectorPtr); ++- ++- // Perform single to double precision conversion ++- iVal1 = vcvt_f64_f32(complexInput.val[0]); ++- iVal2 = vcvt_f64_f32(complexInput.val[2]); ++- iVal.val[0] = iVal1; ++- iVal.val[1] = iVal2; ++- ++- // Store results into memory buffer ++- vst2q_f64(iBufferPtr, iVal); ++- ++- // Update pointers ++- iBufferPtr += 4; ++- complexVectorPtr += 8; ++- } ++- ++- for (number = quarter_points * 4; number < num_points; number++) { ++- *iBufferPtr++ = (double)*complexVectorPtr++; ++- complexVectorPtr++; ++- } +++static inline void volk_32fc_deinterleave_real_64f_neon(double* iBuffer, +++ const lv_32fc_t* complexVector, +++ unsigned int num_points) +++{ +++ unsigned int number = 0; +++ unsigned int quarter_points = num_points / 4; +++ const float* complexVectorPtr = (float*)complexVector; +++ double* iBufferPtr = iBuffer; +++ float32x2x4_t complexInput; +++ float64x2_t iVal1; +++ float64x2_t iVal2; +++ float64x2x2_t iVal; +++ +++ for (number = 0; number < quarter_points; number++) { +++ // Load data into register +++ complexInput = vld4_f32(complexVectorPtr); +++ +++ // Perform single to double precision conversion +++ iVal1 = vcvt_f64_f32(complexInput.val[0]); +++ iVal2 = vcvt_f64_f32(complexInput.val[2]); +++ iVal.val[0] = iVal1; +++ iVal.val[1] = iVal2; +++ +++ // Store results into memory buffer +++ vst2q_f64(iBufferPtr, iVal); +++ +++ // Update pointers +++ iBufferPtr += 4; +++ complexVectorPtr += 8; +++ } +++ +++ for (number = quarter_points * 4; number < num_points; number++) { +++ *iBufferPtr++ = (double)*complexVectorPtr++; +++ complexVectorPtr++; +++ } ++ } ++ #endif /* LV_HAVE_NEON */ ++ ++@@ -209,37 +217,39 @@ static inline void volk_32fc_deinterleave_real_64f_neon( ++ #ifdef LV_HAVE_AVX2 ++ #include ++ ++-static inline void volk_32fc_deinterleave_real_64f_u_avx2( ++- double *iBuffer, const lv_32fc_t *complexVector, unsigned int num_points) { ++- unsigned int number = 0; ++- ++- const float *complexVectorPtr = (float *)complexVector; ++- double *iBufferPtr = iBuffer; ++- ++- const unsigned int quarterPoints = num_points / 4; ++- __m256 cplxValue; ++- __m128 fVal; ++- __m256d dVal; ++- __m256i idx = _mm256_set_epi32(0, 0, 0, 0, 6, 4, 2, 0); ++- for (; number < quarterPoints; number++) { ++- ++- cplxValue = _mm256_loadu_ps(complexVectorPtr); ++- complexVectorPtr += 8; ++- ++- // Arrange in i1i2i1i2 format ++- cplxValue = _mm256_permutevar8x32_ps(cplxValue, idx); ++- fVal = _mm256_extractf128_ps(cplxValue, 0); ++- dVal = _mm256_cvtps_pd(fVal); ++- _mm256_storeu_pd(iBufferPtr, dVal); ++- ++- iBufferPtr += 4; ++- } ++- ++- number = quarterPoints * 4; ++- for (; number < num_points; number++) { ++- *iBufferPtr++ = (double)*complexVectorPtr++; ++- complexVectorPtr++; ++- } +++static inline void volk_32fc_deinterleave_real_64f_u_avx2(double* iBuffer, +++ const lv_32fc_t* complexVector, +++ unsigned int num_points) +++{ +++ unsigned int number = 0; +++ +++ const float* complexVectorPtr = (float*)complexVector; +++ double* iBufferPtr = iBuffer; +++ +++ const unsigned int quarterPoints = num_points / 4; +++ __m256 cplxValue; +++ __m128 fVal; +++ __m256d dVal; +++ __m256i idx = _mm256_set_epi32(0, 0, 0, 0, 6, 4, 2, 0); +++ for (; number < quarterPoints; number++) { +++ +++ cplxValue = _mm256_loadu_ps(complexVectorPtr); +++ complexVectorPtr += 8; +++ +++ // Arrange in i1i2i1i2 format +++ cplxValue = _mm256_permutevar8x32_ps(cplxValue, idx); +++ fVal = _mm256_extractf128_ps(cplxValue, 0); +++ dVal = _mm256_cvtps_pd(fVal); 
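/* (Illustrative aside, not taken from the upstream sources: in this kernel the permute
 * gathers the four real parts (floats 0, 2, 4, 6 of the load) into the low 128 bits,
 * _mm256_extractf128_ps takes that half, and _mm256_cvtps_pd widens it to four doubles.
 * A scalar sketch of the same step, with pointers taken at the start of the iteration:
 *     for (int k = 0; k < 4; k++)
 *         iBufferPtr[k] = (double)complexVectorPtr[2 * k];
 * ) */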
+++ _mm256_storeu_pd(iBufferPtr, dVal); +++ +++ iBufferPtr += 4; +++ } +++ +++ number = quarterPoints * 4; +++ for (; number < num_points; number++) { +++ *iBufferPtr++ = (double)*complexVectorPtr++; +++ complexVectorPtr++; +++ } ++ } ++ #endif /* LV_HAVE_AVX2 */ ++ ++diff --git a/kernels/volk/volk_32fc_index_max_16u.h b/kernels/volk/volk_32fc_index_max_16u.h ++index a9f9508..b9f9cfd 100644 ++--- a/kernels/volk/volk_32fc_index_max_16u.h +++++ b/kernels/volk/volk_32fc_index_max_16u.h ++@@ -76,346 +76,353 @@ ++ #ifndef INCLUDED_volk_32fc_index_max_16u_a_H ++ #define INCLUDED_volk_32fc_index_max_16u_a_H ++ ++-#include ++ #include ++-#include ++ #include +++#include +++#include ++ #include ++ ++ #ifdef LV_HAVE_AVX2 ++ #include ++ ++ static inline void ++-volk_32fc_index_max_16u_a_avx2(uint16_t* target, lv_32fc_t* src0, ++- uint32_t num_points) +++volk_32fc_index_max_16u_a_avx2(uint16_t* target, lv_32fc_t* src0, uint32_t num_points) ++ { ++- num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points; ++- // Branchless version, if we think it'll make a difference ++- //num_points = USHRT_MAX ^ ((num_points ^ USHRT_MAX) & -(num_points < USHRT_MAX)); ++- ++- const uint32_t num_bytes = num_points*8; ++- ++- union bit256 holderf; ++- union bit256 holderi; ++- float sq_dist = 0.0; +++ num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points; +++ // Branchless version, if we think it'll make a difference +++ // num_points = USHRT_MAX ^ ((num_points ^ USHRT_MAX) & -(num_points < USHRT_MAX)); ++ ++- union bit256 xmm5, xmm4; ++- __m256 xmm1, xmm2, xmm3; ++- __m256i xmm8, xmm11, xmm12, xmmfive, xmmfour, xmm9, holder0, holder1, xmm10; +++ const uint32_t num_bytes = num_points * 8; ++ ++- xmm5.int_vec = xmmfive = _mm256_setzero_si256(); ++- xmm4.int_vec = xmmfour = _mm256_setzero_si256(); ++- holderf.int_vec = holder0 = _mm256_setzero_si256(); ++- holderi.int_vec = holder1 = _mm256_setzero_si256(); +++ union bit256 holderf; +++ union bit256 holderi; +++ float sq_dist = 0.0; ++ ++- int bound = num_bytes >> 6; ++- int i = 0; +++ union bit256 xmm5, xmm4; +++ __m256 xmm1, xmm2, xmm3; +++ __m256i xmm8, xmm11, xmm12, xmmfive, xmmfour, xmm9, holder0, holder1, xmm10; ++ ++- xmm8 = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0); ++- xmm9 = _mm256_setzero_si256(); //=xmm8 ++- xmm10 = _mm256_set1_epi32(8); ++- xmm3 = _mm256_setzero_ps(); +++ xmm5.int_vec = xmmfive = _mm256_setzero_si256(); +++ xmm4.int_vec = xmmfour = _mm256_setzero_si256(); +++ holderf.int_vec = holder0 = _mm256_setzero_si256(); +++ holderi.int_vec = holder1 = _mm256_setzero_si256(); ++ ++- __m256i idx = _mm256_set_epi32(7,6,3,2,5,4,1,0); ++- for(; i < bound; ++i) { ++- xmm1 = _mm256_load_ps((float*)src0); ++- xmm2 = _mm256_load_ps((float*)&src0[4]); +++ int bound = num_bytes >> 6; +++ int i = 0; ++ ++- src0 += 8; +++ xmm8 = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0); +++ xmm9 = _mm256_setzero_si256(); //=xmm8 +++ xmm10 = _mm256_set1_epi32(8); +++ xmm3 = _mm256_setzero_ps(); ++ ++- xmm1 = _mm256_mul_ps(xmm1, xmm1); ++- xmm2 = _mm256_mul_ps(xmm2, xmm2); +++ __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0); +++ for (; i < bound; ++i) { +++ xmm1 = _mm256_load_ps((float*)src0); +++ xmm2 = _mm256_load_ps((float*)&src0[4]); ++ ++- xmm1 = _mm256_hadd_ps(xmm1, xmm2); ++- xmm1 = _mm256_permutevar8x32_ps(xmm1, idx); +++ src0 += 8; ++ ++- xmm3 = _mm256_max_ps(xmm1, xmm3); +++ xmm1 = _mm256_mul_ps(xmm1, xmm1); +++ xmm2 = _mm256_mul_ps(xmm2, xmm2); ++ ++- xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS); ++- xmm5.float_vec = _mm256_cmp_ps(xmm1, 
xmm3, _CMP_EQ_OQ); +++ xmm1 = _mm256_hadd_ps(xmm1, xmm2); +++ xmm1 = _mm256_permutevar8x32_ps(xmm1, idx); ++ ++- xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec); ++- xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec); +++ xmm3 = _mm256_max_ps(xmm1, xmm3); ++ ++- xmm9 = _mm256_add_epi32(xmm11, xmm12); +++ xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS); +++ xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ); ++ ++- xmm8 = _mm256_add_epi32(xmm8, xmm10); ++- } ++- xmm10 = _mm256_set1_epi32(4); ++- if (num_bytes >> 5 & 1) { ++- xmm1 = _mm256_load_ps((float*)src0); +++ xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec); +++ xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec); ++ ++- src0 += 4; +++ xmm9 = _mm256_add_epi32(xmm11, xmm12); ++ ++- xmm1 = _mm256_mul_ps(xmm1, xmm1); +++ xmm8 = _mm256_add_epi32(xmm8, xmm10); +++ } +++ xmm10 = _mm256_set1_epi32(4); +++ if (num_bytes >> 5 & 1) { +++ xmm1 = _mm256_load_ps((float*)src0); ++ ++- xmm1 = _mm256_hadd_ps(xmm1, xmm1); ++- xmm1 = _mm256_permutevar8x32_ps(xmm1, idx); +++ src0 += 4; ++ ++- xmm3 = _mm256_max_ps(xmm1, xmm3); +++ xmm1 = _mm256_mul_ps(xmm1, xmm1); ++ ++- xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS); ++- xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ); +++ xmm1 = _mm256_hadd_ps(xmm1, xmm1); +++ xmm1 = _mm256_permutevar8x32_ps(xmm1, idx); ++ ++- xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec); ++- xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec); +++ xmm3 = _mm256_max_ps(xmm1, xmm3); ++ ++- xmm9 = _mm256_add_epi32(xmm11, xmm12); +++ xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS); +++ xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ); ++ ++- xmm8 = _mm256_add_epi32(xmm8, xmm10); ++- } +++ xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec); +++ xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec); ++ ++- idx = _mm256_set_epi32(1,0,1,0,1,0,1,0); ++- xmm10 = _mm256_set1_epi32(2); ++- if (num_bytes >> 4 & 1) { ++- xmm2 = _mm256_load_ps((float*)src0); +++ xmm9 = _mm256_add_epi32(xmm11, xmm12); ++ ++- xmm1 = _mm256_permutevar8x32_ps(bit256_p(&xmm8)->float_vec, idx); ++- xmm8 = bit256_p(&xmm1)->int_vec; +++ xmm8 = _mm256_add_epi32(xmm8, xmm10); +++ } ++ ++- xmm2 = _mm256_mul_ps(xmm2, xmm2); +++ idx = _mm256_set_epi32(1, 0, 1, 0, 1, 0, 1, 0); +++ xmm10 = _mm256_set1_epi32(2); +++ if (num_bytes >> 4 & 1) { +++ xmm2 = _mm256_load_ps((float*)src0); ++ ++- src0 += 2; +++ xmm1 = _mm256_permutevar8x32_ps(bit256_p(&xmm8)->float_vec, idx); +++ xmm8 = bit256_p(&xmm1)->int_vec; ++ ++- xmm1 = _mm256_hadd_ps(xmm2, xmm2); +++ xmm2 = _mm256_mul_ps(xmm2, xmm2); ++ ++- xmm3 = _mm256_max_ps(xmm1, xmm3); +++ src0 += 2; ++ ++- xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS); ++- xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ); ++- ++- xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec); ++- xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec); +++ xmm1 = _mm256_hadd_ps(xmm2, xmm2); ++ ++- xmm9 = _mm256_add_epi32(xmm11, xmm12); +++ xmm3 = _mm256_max_ps(xmm1, xmm3); ++ ++- xmm8 = _mm256_add_epi32(xmm8, xmm10); ++- } +++ xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS); +++ xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ); ++ ++- /* ++- idx = _mm256_setzero_si256(); ++- for(i = 0; i < leftovers2; ++i) { ++- //printf("%u, %u, %u, %u\n", ((uint32_t*)&xmm9)[0], ((uint32_t*)&xmm9)[1], ((uint32_t*)&xmm9)[2], ((uint32_t*)&xmm9)[3]); +++ xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec); +++ xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec); ++ ++- sq_dist = lv_creal(src0[0]) * lv_creal(src0[0]) + lv_cimag(src0[0]) * lv_cimag(src0[0]); +++ xmm9 = _mm256_add_epi32(xmm11, 
xmm12); ++ ++- //xmm = _mm_load1_ps(&sq_dist);//insert? ++- xmm2 = _mm256_set1_ps(sq_dist); ++- //xmm2 = _mm256_insertf128_ps(xmm2, xmm, 0); +++ xmm8 = _mm256_add_epi32(xmm8, xmm10); +++ } ++ ++- xmm1 = xmm3; +++ /* +++ idx = _mm256_setzero_si256(); +++ for(i = 0; i < leftovers2; ++i) { +++ //printf("%u, %u, %u, %u\n", ((uint32_t*)&xmm9)[0], ((uint32_t*)&xmm9)[1], +++ ((uint32_t*)&xmm9)[2], ((uint32_t*)&xmm9)[3]); ++ ++- xmm3 = _mm256_max_ps(xmm3, xmm2);//only lowest 32bit value ++- xmm3 = _mm256_permutevar8x32_ps(xmm3, idx); +++ sq_dist = lv_creal(src0[0]) * lv_creal(src0[0]) + lv_cimag(src0[0]) * +++ lv_cimag(src0[0]); ++ ++- xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS); ++- xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ); +++ //xmm = _mm_load1_ps(&sq_dist);//insert? +++ xmm2 = _mm256_set1_ps(sq_dist); +++ //xmm2 = _mm256_insertf128_ps(xmm2, xmm, 0); ++ ++- xmm8 = _mm256_permutevar8x32_epi32(xmm8, idx); +++ xmm1 = xmm3; ++ ++- xmm11 = _mm256_and_si256(xmm8, xmm4.int_vec); ++- xmm12 = _mm256_and_si256(xmm9, xmm5.int_vec); +++ xmm3 = _mm256_max_ps(xmm3, xmm2);//only lowest 32bit value +++ xmm3 = _mm256_permutevar8x32_ps(xmm3, idx); ++ ++- xmm9 = _mm256_add_epi32(xmm11, xmm12); ++-}*/ +++ xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS); +++ xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ); ++ ++- _mm256_store_ps((float*)&(holderf.f), xmm3); ++- _mm256_store_si256(&(holderi.int_vec), xmm9); +++ xmm8 = _mm256_permutevar8x32_epi32(xmm8, idx); ++ ++- target[0] = holderi.i[0]; ++- sq_dist = holderf.f[0]; ++- target[0] = (holderf.f[1] > sq_dist) ? holderi.i[1] : target[0]; ++- sq_dist = (holderf.f[1] > sq_dist) ? holderf.f[1] : sq_dist; ++- target[0] = (holderf.f[2] > sq_dist) ? holderi.i[2] : target[0]; ++- sq_dist = (holderf.f[2] > sq_dist) ? holderf.f[2] : sq_dist; ++- target[0] = (holderf.f[3] > sq_dist) ? holderi.i[3] : target[0]; ++- sq_dist = (holderf.f[3] > sq_dist) ? holderf.f[3] : sq_dist; ++- target[0] = (holderf.f[4] > sq_dist) ? holderi.i[4] : target[0]; ++- sq_dist = (holderf.f[4] > sq_dist) ? holderf.f[4] : sq_dist; ++- target[0] = (holderf.f[5] > sq_dist) ? holderi.i[5] : target[0]; ++- sq_dist = (holderf.f[5] > sq_dist) ? holderf.f[5] : sq_dist; ++- target[0] = (holderf.f[6] > sq_dist) ? holderi.i[6] : target[0]; ++- sq_dist = (holderf.f[6] > sq_dist) ? holderf.f[6] : sq_dist; ++- target[0] = (holderf.f[7] > sq_dist) ? holderi.i[7] : target[0]; ++- sq_dist = (holderf.f[7] > sq_dist) ? holderf.f[7] : sq_dist; +++ xmm11 = _mm256_and_si256(xmm8, xmm4.int_vec); +++ xmm12 = _mm256_and_si256(xmm9, xmm5.int_vec); ++ +++ xmm9 = _mm256_add_epi32(xmm11, xmm12); +++ }*/ +++ +++ _mm256_store_ps((float*)&(holderf.f), xmm3); +++ _mm256_store_si256(&(holderi.int_vec), xmm9); +++ +++ target[0] = holderi.i[0]; +++ sq_dist = holderf.f[0]; +++ target[0] = (holderf.f[1] > sq_dist) ? holderi.i[1] : target[0]; +++ sq_dist = (holderf.f[1] > sq_dist) ? holderf.f[1] : sq_dist; +++ target[0] = (holderf.f[2] > sq_dist) ? holderi.i[2] : target[0]; +++ sq_dist = (holderf.f[2] > sq_dist) ? holderf.f[2] : sq_dist; +++ target[0] = (holderf.f[3] > sq_dist) ? holderi.i[3] : target[0]; +++ sq_dist = (holderf.f[3] > sq_dist) ? holderf.f[3] : sq_dist; +++ target[0] = (holderf.f[4] > sq_dist) ? holderi.i[4] : target[0]; +++ sq_dist = (holderf.f[4] > sq_dist) ? holderf.f[4] : sq_dist; +++ target[0] = (holderf.f[5] > sq_dist) ? holderi.i[5] : target[0]; +++ sq_dist = (holderf.f[5] > sq_dist) ? holderf.f[5] : sq_dist; +++ target[0] = (holderf.f[6] > sq_dist) ? 
holderi.i[6] : target[0]; +++ sq_dist = (holderf.f[6] > sq_dist) ? holderf.f[6] : sq_dist; +++ target[0] = (holderf.f[7] > sq_dist) ? holderi.i[7] : target[0]; +++ sq_dist = (holderf.f[7] > sq_dist) ? holderf.f[7] : sq_dist; ++ } ++ ++ #endif /*LV_HAVE_AVX2*/ ++ ++ #ifdef LV_HAVE_SSE3 ++-#include ++ #include +++#include ++ ++ static inline void ++-volk_32fc_index_max_16u_a_sse3(uint16_t* target, lv_32fc_t* src0, ++- uint32_t num_points) +++volk_32fc_index_max_16u_a_sse3(uint16_t* target, lv_32fc_t* src0, uint32_t num_points) ++ { ++- num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points; ++- // Branchless version, if we think it'll make a difference ++- //num_points = USHRT_MAX ^ ((num_points ^ USHRT_MAX) & -(num_points < USHRT_MAX)); +++ num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points; +++ // Branchless version, if we think it'll make a difference +++ // num_points = USHRT_MAX ^ ((num_points ^ USHRT_MAX) & -(num_points < USHRT_MAX)); ++ ++- const uint32_t num_bytes = num_points*8; +++ const uint32_t num_bytes = num_points * 8; ++ ++- union bit128 holderf; ++- union bit128 holderi; ++- float sq_dist = 0.0; +++ union bit128 holderf; +++ union bit128 holderi; +++ float sq_dist = 0.0; ++ ++- union bit128 xmm5, xmm4; ++- __m128 xmm1, xmm2, xmm3; ++- __m128i xmm8, xmm11, xmm12, xmmfive, xmmfour, xmm9, holder0, holder1, xmm10; +++ union bit128 xmm5, xmm4; +++ __m128 xmm1, xmm2, xmm3; +++ __m128i xmm8, xmm11, xmm12, xmmfive, xmmfour, xmm9, holder0, holder1, xmm10; ++ ++- xmm5.int_vec = xmmfive = _mm_setzero_si128(); ++- xmm4.int_vec = xmmfour = _mm_setzero_si128(); ++- holderf.int_vec = holder0 = _mm_setzero_si128(); ++- holderi.int_vec = holder1 = _mm_setzero_si128(); +++ xmm5.int_vec = xmmfive = _mm_setzero_si128(); +++ xmm4.int_vec = xmmfour = _mm_setzero_si128(); +++ holderf.int_vec = holder0 = _mm_setzero_si128(); +++ holderi.int_vec = holder1 = _mm_setzero_si128(); ++ ++- int bound = num_bytes >> 5; ++- int i = 0; +++ int bound = num_bytes >> 5; +++ int i = 0; ++ ++- xmm8 = _mm_set_epi32(3, 2, 1, 0);//remember the crazy reverse order! ++- xmm9 = _mm_setzero_si128(); ++- xmm10 = _mm_set_epi32(4, 4, 4, 4); ++- xmm3 = _mm_setzero_ps(); ++- //printf("%f, %f, %f, %f\n", ((float*)&xmm10)[0], ((float*)&xmm10)[1], ((float*)&xmm10)[2], ((float*)&xmm10)[3]); +++ xmm8 = _mm_set_epi32(3, 2, 1, 0); // remember the crazy reverse order! 
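/* (Illustrative aside, not taken from the upstream sources: the "crazy reverse order"
 * remark refers to _mm_set_epi32 listing its arguments from the highest lane down, so the
 * call above yields lanes {0, 1, 2, 3} in memory order, i.e. the starting per-position
 * indices that the kernel accumulates. An equivalent spelling would be:
 *     __m128i index_lanes = _mm_setr_epi32(0, 1, 2, 3);  // _mm_setr_* takes lanes in memory order
 * ) */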
+++ xmm9 = _mm_setzero_si128(); +++ xmm10 = _mm_set_epi32(4, 4, 4, 4); +++ xmm3 = _mm_setzero_ps(); +++ // printf("%f, %f, %f, %f\n", ((float*)&xmm10)[0], ((float*)&xmm10)[1], +++ // ((float*)&xmm10)[2], ((float*)&xmm10)[3]); ++ ++- for(; i < bound; ++i) { ++- xmm1 = _mm_load_ps((float*)src0); ++- xmm2 = _mm_load_ps((float*)&src0[2]); +++ for (; i < bound; ++i) { +++ xmm1 = _mm_load_ps((float*)src0); +++ xmm2 = _mm_load_ps((float*)&src0[2]); ++ ++- src0 += 4; +++ src0 += 4; ++ ++- xmm1 = _mm_mul_ps(xmm1, xmm1); ++- xmm2 = _mm_mul_ps(xmm2, xmm2); +++ xmm1 = _mm_mul_ps(xmm1, xmm1); +++ xmm2 = _mm_mul_ps(xmm2, xmm2); ++ ++- xmm1 = _mm_hadd_ps(xmm1, xmm2); +++ xmm1 = _mm_hadd_ps(xmm1, xmm2); ++ ++- xmm3 = _mm_max_ps(xmm1, xmm3); +++ xmm3 = _mm_max_ps(xmm1, xmm3); ++ ++- xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3); ++- xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3); +++ xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3); +++ xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3); ++ ++- xmm11 = _mm_and_si128(xmm8, xmm5.int_vec); ++- xmm12 = _mm_and_si128(xmm9, xmm4.int_vec); +++ xmm11 = _mm_and_si128(xmm8, xmm5.int_vec); +++ xmm12 = _mm_and_si128(xmm9, xmm4.int_vec); ++ ++- xmm9 = _mm_add_epi32(xmm11, xmm12); +++ xmm9 = _mm_add_epi32(xmm11, xmm12); ++ ++- xmm8 = _mm_add_epi32(xmm8, xmm10); +++ xmm8 = _mm_add_epi32(xmm8, xmm10); ++ ++- //printf("%f, %f, %f, %f\n", ((float*)&xmm3)[0], ((float*)&xmm3)[1], ((float*)&xmm3)[2], ((float*)&xmm3)[3]); ++- //printf("%u, %u, %u, %u\n", ((uint32_t*)&xmm10)[0], ((uint32_t*)&xmm10)[1], ((uint32_t*)&xmm10)[2], ((uint32_t*)&xmm10)[3]); ++- } +++ // printf("%f, %f, %f, %f\n", ((float*)&xmm3)[0], ((float*)&xmm3)[1], +++ // ((float*)&xmm3)[2], ((float*)&xmm3)[3]); printf("%u, %u, %u, %u\n", +++ // ((uint32_t*)&xmm10)[0], ((uint32_t*)&xmm10)[1], ((uint32_t*)&xmm10)[2], +++ // ((uint32_t*)&xmm10)[3]); +++ } ++ ++ ++- if (num_bytes >> 4 & 1) { ++- xmm2 = _mm_load_ps((float*)src0); +++ if (num_bytes >> 4 & 1) { +++ xmm2 = _mm_load_ps((float*)src0); ++ ++- xmm1 = _mm_movelh_ps(bit128_p(&xmm8)->float_vec, bit128_p(&xmm8)->float_vec); ++- xmm8 = bit128_p(&xmm1)->int_vec; +++ xmm1 = _mm_movelh_ps(bit128_p(&xmm8)->float_vec, bit128_p(&xmm8)->float_vec); +++ xmm8 = bit128_p(&xmm1)->int_vec; ++ ++- xmm2 = _mm_mul_ps(xmm2, xmm2); +++ xmm2 = _mm_mul_ps(xmm2, xmm2); ++ ++- src0 += 2; +++ src0 += 2; ++ ++- xmm1 = _mm_hadd_ps(xmm2, xmm2); +++ xmm1 = _mm_hadd_ps(xmm2, xmm2); ++ ++- xmm3 = _mm_max_ps(xmm1, xmm3); +++ xmm3 = _mm_max_ps(xmm1, xmm3); ++ ++- xmm10 = _mm_set_epi32(2, 2, 2, 2);//load1_ps((float*)&init[2]); +++ xmm10 = _mm_set_epi32(2, 2, 2, 2); // load1_ps((float*)&init[2]); ++ ++- xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3); ++- xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3); +++ xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3); +++ xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3); ++ ++- xmm11 = _mm_and_si128(xmm8, xmm5.int_vec); ++- xmm12 = _mm_and_si128(xmm9, xmm4.int_vec); +++ xmm11 = _mm_and_si128(xmm8, xmm5.int_vec); +++ xmm12 = _mm_and_si128(xmm9, xmm4.int_vec); ++ ++- xmm9 = _mm_add_epi32(xmm11, xmm12); +++ xmm9 = _mm_add_epi32(xmm11, xmm12); ++ ++- xmm8 = _mm_add_epi32(xmm8, xmm10); ++- //printf("egads%u, %u, %u, %u\n", ((uint32_t*)&xmm9)[0], ((uint32_t*)&xmm9)[1], ((uint32_t*)&xmm9)[2], ((uint32_t*)&xmm9)[3]); ++- } +++ xmm8 = _mm_add_epi32(xmm8, xmm10); +++ // printf("egads%u, %u, %u, %u\n", ((uint32_t*)&xmm9)[0], ((uint32_t*)&xmm9)[1], +++ // ((uint32_t*)&xmm9)[2], ((uint32_t*)&xmm9)[3]); +++ } ++ ++- if (num_bytes >> 3 & 1) { ++- //printf("%u, %u, %u, %u\n", ((uint32_t*)&xmm9)[0], ((uint32_t*)&xmm9)[1], 
((uint32_t*)&xmm9)[2], ((uint32_t*)&xmm9)[3]); +++ if (num_bytes >> 3 & 1) { +++ // printf("%u, %u, %u, %u\n", ((uint32_t*)&xmm9)[0], ((uint32_t*)&xmm9)[1], +++ // ((uint32_t*)&xmm9)[2], ((uint32_t*)&xmm9)[3]); ++ ++- sq_dist = lv_creal(src0[0]) * lv_creal(src0[0]) + lv_cimag(src0[0]) * lv_cimag(src0[0]); +++ sq_dist = +++ lv_creal(src0[0]) * lv_creal(src0[0]) + lv_cimag(src0[0]) * lv_cimag(src0[0]); ++ ++- xmm2 = _mm_load1_ps(&sq_dist); +++ xmm2 = _mm_load1_ps(&sq_dist); ++ ++- xmm1 = xmm3; +++ xmm1 = xmm3; ++ ++- xmm3 = _mm_max_ss(xmm3, xmm2); +++ xmm3 = _mm_max_ss(xmm3, xmm2); ++ ++- xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3); ++- xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3); +++ xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3); +++ xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3); ++ ++- xmm8 = _mm_shuffle_epi32(xmm8, 0x00); +++ xmm8 = _mm_shuffle_epi32(xmm8, 0x00); ++ ++- xmm11 = _mm_and_si128(xmm8, xmm4.int_vec); ++- xmm12 = _mm_and_si128(xmm9, xmm5.int_vec); +++ xmm11 = _mm_and_si128(xmm8, xmm4.int_vec); +++ xmm12 = _mm_and_si128(xmm9, xmm5.int_vec); ++ ++- xmm9 = _mm_add_epi32(xmm11, xmm12); ++- } +++ xmm9 = _mm_add_epi32(xmm11, xmm12); +++ } ++ ++- //printf("%f, %f, %f, %f\n", ((float*)&xmm3)[0], ((float*)&xmm3)[1], ((float*)&xmm3)[2], ((float*)&xmm3)[3]); ++- //printf("%u, %u, %u, %u\n", ((uint32_t*)&xmm9)[0], ((uint32_t*)&xmm9)[1], ((uint32_t*)&xmm9)[2], ((uint32_t*)&xmm9)[3]); +++ // printf("%f, %f, %f, %f\n", ((float*)&xmm3)[0], ((float*)&xmm3)[1], +++ // ((float*)&xmm3)[2], ((float*)&xmm3)[3]); printf("%u, %u, %u, %u\n", +++ // ((uint32_t*)&xmm9)[0], ((uint32_t*)&xmm9)[1], ((uint32_t*)&xmm9)[2], +++ // ((uint32_t*)&xmm9)[3]); ++ ++- _mm_store_ps((float*)&(holderf.f), xmm3); ++- _mm_store_si128(&(holderi.int_vec), xmm9); +++ _mm_store_ps((float*)&(holderf.f), xmm3); +++ _mm_store_si128(&(holderi.int_vec), xmm9); ++ ++- target[0] = holderi.i[0]; ++- sq_dist = holderf.f[0]; ++- target[0] = (holderf.f[1] > sq_dist) ? holderi.i[1] : target[0]; ++- sq_dist = (holderf.f[1] > sq_dist) ? holderf.f[1] : sq_dist; ++- target[0] = (holderf.f[2] > sq_dist) ? holderi.i[2] : target[0]; ++- sq_dist = (holderf.f[2] > sq_dist) ? holderf.f[2] : sq_dist; ++- target[0] = (holderf.f[3] > sq_dist) ? holderi.i[3] : target[0]; ++- sq_dist = (holderf.f[3] > sq_dist) ? holderf.f[3] : sq_dist; +++ target[0] = holderi.i[0]; +++ sq_dist = holderf.f[0]; +++ target[0] = (holderf.f[1] > sq_dist) ? holderi.i[1] : target[0]; +++ sq_dist = (holderf.f[1] > sq_dist) ? holderf.f[1] : sq_dist; +++ target[0] = (holderf.f[2] > sq_dist) ? holderi.i[2] : target[0]; +++ sq_dist = (holderf.f[2] > sq_dist) ? holderf.f[2] : sq_dist; +++ target[0] = (holderf.f[3] > sq_dist) ? holderi.i[3] : target[0]; +++ sq_dist = (holderf.f[3] > sq_dist) ? 
holderf.f[3] : sq_dist;
++
++- /*
++- float placeholder = 0.0;
++- uint32_t temp0, temp1;
++- uint32_t g0 = (((float*)&xmm3)[0] > ((float*)&xmm3)[1]);
++- uint32_t l0 = g0 ^ 1;
+++ /*
+++ float placeholder = 0.0;
+++ uint32_t temp0, temp1;
+++ uint32_t g0 = (((float*)&xmm3)[0] > ((float*)&xmm3)[1]);
+++ uint32_t l0 = g0 ^ 1;
++
++- uint32_t g1 = (((float*)&xmm3)[1] > ((float*)&xmm3)[2]);
++- uint32_t l1 = g1 ^ 1;
+++ uint32_t g1 = (((float*)&xmm3)[1] > ((float*)&xmm3)[2]);
+++ uint32_t l1 = g1 ^ 1;
++
++- temp0 = g0 * ((uint32_t*)&xmm9)[0] + l0 * ((uint32_t*)&xmm9)[1];
++- temp1 = g0 * ((uint32_t*)&xmm9)[2] + l0 * ((uint32_t*)&xmm9)[3];
++- sq_dist = g0 * ((float*)&xmm3)[0] + l0 * ((float*)&xmm3)[1];
++- placeholder = g0 * ((float*)&xmm3)[2] + l0 * ((float*)&xmm3)[3];
+++ temp0 = g0 * ((uint32_t*)&xmm9)[0] + l0 * ((uint32_t*)&xmm9)[1];
+++ temp1 = g0 * ((uint32_t*)&xmm9)[2] + l0 * ((uint32_t*)&xmm9)[3];
+++ sq_dist = g0 * ((float*)&xmm3)[0] + l0 * ((float*)&xmm3)[1];
+++ placeholder = g0 * ((float*)&xmm3)[2] + l0 * ((float*)&xmm3)[3];
++
++- g0 = (sq_dist > placeholder);
++- l0 = g0 ^ 1;
++- target[0] = g0 * temp0 + l0 * temp1;
++- */
+++ g0 = (sq_dist > placeholder);
+++ l0 = g0 ^ 1;
+++ target[0] = g0 * temp0 + l0 * temp1;
+++ */
++ }
++
++ #endif /*LV_HAVE_SSE3*/
++
++ #ifdef LV_HAVE_GENERIC
++ static inline void
++- volk_32fc_index_max_16u_generic(uint16_t* target, lv_32fc_t* src0,
++- uint32_t num_points)
+++volk_32fc_index_max_16u_generic(uint16_t* target, lv_32fc_t* src0, uint32_t num_points)
++ {
++- num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
+++ num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
++
++- const uint32_t num_bytes = num_points*8;
+++ const uint32_t num_bytes = num_points * 8;
++
++- float sq_dist = 0.0;
++- float max = 0.0;
++- uint16_t index = 0;
+++ float sq_dist = 0.0;
+++ float max = 0.0;
+++ uint16_t index = 0;
++
++- uint32_t i = 0;
+++ uint32_t i = 0;
++
++- for(; i < num_bytes >> 3; ++i) {
++- sq_dist = lv_creal(src0[i]) * lv_creal(src0[i]) + lv_cimag(src0[i]) * lv_cimag(src0[i]);
+++ for (; i < num_bytes >> 3; ++i) {
+++ sq_dist =
+++ lv_creal(src0[i]) * lv_creal(src0[i]) + lv_cimag(src0[i]) * lv_cimag(src0[i]);
++
++- index = sq_dist > max ? i : index;
++- max = sq_dist > max ? sq_dist : max;
++- }
++- target[0] = index;
+++ index = sq_dist > max ? i : index;
+++ max = sq_dist > max ? sq_dist : max;
+++ }
+++ target[0] = index;
++ }
++
++ #endif /*LV_HAVE_GENERIC*/
++@@ -427,142 +434,140 @@ static inline void
++ #ifndef INCLUDED_volk_32fc_index_max_16u_u_H
++ #define INCLUDED_volk_32fc_index_max_16u_u_H
++
++-#include
++ #include
++-#include
++ #include
+++#include
+++#include
++ #include
++
++ #ifdef LV_HAVE_AVX2
++ #include <immintrin.h>
++
++ static inline void
++-volk_32fc_index_max_16u_u_avx2(uint16_t* target, lv_32fc_t* src0,
++- uint32_t num_points)
+++volk_32fc_index_max_16u_u_avx2(uint16_t* target, lv_32fc_t* src0, uint32_t num_points)
++ {
++- num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
++- // Branchless version, if we think it'll make a difference
++- //num_points = USHRT_MAX ^ ((num_points ^ USHRT_MAX) & -(num_points < USHRT_MAX));
+++ num_points = (num_points > USHRT_MAX) ?
USHRT_MAX : num_points; +++ // Branchless version, if we think it'll make a difference +++ // num_points = USHRT_MAX ^ ((num_points ^ USHRT_MAX) & -(num_points < USHRT_MAX)); ++ ++- const uint32_t num_bytes = num_points*8; +++ const uint32_t num_bytes = num_points * 8; ++ ++- union bit256 holderf; ++- union bit256 holderi; ++- float sq_dist = 0.0; +++ union bit256 holderf; +++ union bit256 holderi; +++ float sq_dist = 0.0; ++ ++- union bit256 xmm5, xmm4; ++- __m256 xmm1, xmm2, xmm3; ++- __m256i xmm8, xmm11, xmm12, xmmfive, xmmfour, xmm9, holder0, holder1, xmm10; +++ union bit256 xmm5, xmm4; +++ __m256 xmm1, xmm2, xmm3; +++ __m256i xmm8, xmm11, xmm12, xmmfive, xmmfour, xmm9, holder0, holder1, xmm10; ++ ++- xmm5.int_vec = xmmfive = _mm256_setzero_si256(); ++- xmm4.int_vec = xmmfour = _mm256_setzero_si256(); ++- holderf.int_vec = holder0 = _mm256_setzero_si256(); ++- holderi.int_vec = holder1 = _mm256_setzero_si256(); +++ xmm5.int_vec = xmmfive = _mm256_setzero_si256(); +++ xmm4.int_vec = xmmfour = _mm256_setzero_si256(); +++ holderf.int_vec = holder0 = _mm256_setzero_si256(); +++ holderi.int_vec = holder1 = _mm256_setzero_si256(); ++ ++- int bound = num_bytes >> 6; ++- int i = 0; +++ int bound = num_bytes >> 6; +++ int i = 0; ++ ++- xmm8 = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0); ++- xmm9 = _mm256_setzero_si256(); //=xmm8 ++- xmm10 = _mm256_set1_epi32(8); ++- xmm3 = _mm256_setzero_ps(); +++ xmm8 = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0); +++ xmm9 = _mm256_setzero_si256(); //=xmm8 +++ xmm10 = _mm256_set1_epi32(8); +++ xmm3 = _mm256_setzero_ps(); ++ ++- __m256i idx = _mm256_set_epi32(7,6,3,2,5,4,1,0); ++- for(; i < bound; ++i) { ++- xmm1 = _mm256_loadu_ps((float*)src0); ++- xmm2 = _mm256_loadu_ps((float*)&src0[4]); +++ __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0); +++ for (; i < bound; ++i) { +++ xmm1 = _mm256_loadu_ps((float*)src0); +++ xmm2 = _mm256_loadu_ps((float*)&src0[4]); ++ ++- src0 += 8; +++ src0 += 8; ++ ++- xmm1 = _mm256_mul_ps(xmm1, xmm1); ++- xmm2 = _mm256_mul_ps(xmm2, xmm2); +++ xmm1 = _mm256_mul_ps(xmm1, xmm1); +++ xmm2 = _mm256_mul_ps(xmm2, xmm2); ++ ++- xmm1 = _mm256_hadd_ps(xmm1, xmm2); ++- xmm1 = _mm256_permutevar8x32_ps(xmm1, idx); +++ xmm1 = _mm256_hadd_ps(xmm1, xmm2); +++ xmm1 = _mm256_permutevar8x32_ps(xmm1, idx); ++ ++- xmm3 = _mm256_max_ps(xmm1, xmm3); +++ xmm3 = _mm256_max_ps(xmm1, xmm3); ++ ++- xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS); ++- xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ); +++ xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS); +++ xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ); ++ ++- xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec); ++- xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec); +++ xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec); +++ xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec); ++ ++- xmm9 = _mm256_add_epi32(xmm11, xmm12); +++ xmm9 = _mm256_add_epi32(xmm11, xmm12); ++ ++- xmm8 = _mm256_add_epi32(xmm8, xmm10); ++- } ++- xmm10 = _mm256_set1_epi32(4); ++- if (num_bytes >> 5 & 1) { ++- xmm1 = _mm256_loadu_ps((float*)src0); +++ xmm8 = _mm256_add_epi32(xmm8, xmm10); +++ } +++ xmm10 = _mm256_set1_epi32(4); +++ if (num_bytes >> 5 & 1) { +++ xmm1 = _mm256_loadu_ps((float*)src0); ++ ++- src0 += 4; +++ src0 += 4; ++ ++- xmm1 = _mm256_mul_ps(xmm1, xmm1); +++ xmm1 = _mm256_mul_ps(xmm1, xmm1); ++ ++- xmm1 = _mm256_hadd_ps(xmm1, xmm1); ++- xmm1 = _mm256_permutevar8x32_ps(xmm1, idx); +++ xmm1 = _mm256_hadd_ps(xmm1, xmm1); +++ xmm1 = _mm256_permutevar8x32_ps(xmm1, idx); ++ ++- xmm3 = _mm256_max_ps(xmm1, xmm3); +++ xmm3 = 
_mm256_max_ps(xmm1, xmm3); ++ ++- xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS); ++- xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ); +++ xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS); +++ xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ); ++ ++- xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec); ++- xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec); +++ xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec); +++ xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec); ++ ++- xmm9 = _mm256_add_epi32(xmm11, xmm12); +++ xmm9 = _mm256_add_epi32(xmm11, xmm12); ++ ++- xmm8 = _mm256_add_epi32(xmm8, xmm10); ++- } +++ xmm8 = _mm256_add_epi32(xmm8, xmm10); +++ } ++ ++- idx = _mm256_set_epi32(1,0,1,0,1,0,1,0); ++- xmm10 = _mm256_set1_epi32(2); ++- if (num_bytes >> 4 & 1) { ++- xmm2 = _mm256_loadu_ps((float*)src0); +++ idx = _mm256_set_epi32(1, 0, 1, 0, 1, 0, 1, 0); +++ xmm10 = _mm256_set1_epi32(2); +++ if (num_bytes >> 4 & 1) { +++ xmm2 = _mm256_loadu_ps((float*)src0); ++ ++- xmm1 = _mm256_permutevar8x32_ps(bit256_p(&xmm8)->float_vec, idx); ++- xmm8 = bit256_p(&xmm1)->int_vec; +++ xmm1 = _mm256_permutevar8x32_ps(bit256_p(&xmm8)->float_vec, idx); +++ xmm8 = bit256_p(&xmm1)->int_vec; ++ ++- xmm2 = _mm256_mul_ps(xmm2, xmm2); +++ xmm2 = _mm256_mul_ps(xmm2, xmm2); ++ ++- src0 += 2; +++ src0 += 2; ++ ++- xmm1 = _mm256_hadd_ps(xmm2, xmm2); +++ xmm1 = _mm256_hadd_ps(xmm2, xmm2); ++ ++- xmm3 = _mm256_max_ps(xmm1, xmm3); +++ xmm3 = _mm256_max_ps(xmm1, xmm3); ++ ++- xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS); ++- xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ); +++ xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS); +++ xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ); ++ ++- xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec); ++- xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec); +++ xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec); +++ xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec); ++ ++- xmm9 = _mm256_add_epi32(xmm11, xmm12); +++ xmm9 = _mm256_add_epi32(xmm11, xmm12); +++ +++ xmm8 = _mm256_add_epi32(xmm8, xmm10); +++ } ++ ++- xmm8 = _mm256_add_epi32(xmm8, xmm10); ++- } ++- ++- _mm256_storeu_ps((float*)&(holderf.f), xmm3); ++- _mm256_storeu_si256(&(holderi.int_vec), xmm9); ++- ++- target[0] = holderi.i[0]; ++- sq_dist = holderf.f[0]; ++- target[0] = (holderf.f[1] > sq_dist) ? holderi.i[1] : target[0]; ++- sq_dist = (holderf.f[1] > sq_dist) ? holderf.f[1] : sq_dist; ++- target[0] = (holderf.f[2] > sq_dist) ? holderi.i[2] : target[0]; ++- sq_dist = (holderf.f[2] > sq_dist) ? holderf.f[2] : sq_dist; ++- target[0] = (holderf.f[3] > sq_dist) ? holderi.i[3] : target[0]; ++- sq_dist = (holderf.f[3] > sq_dist) ? holderf.f[3] : sq_dist; ++- target[0] = (holderf.f[4] > sq_dist) ? holderi.i[4] : target[0]; ++- sq_dist = (holderf.f[4] > sq_dist) ? holderf.f[4] : sq_dist; ++- target[0] = (holderf.f[5] > sq_dist) ? holderi.i[5] : target[0]; ++- sq_dist = (holderf.f[5] > sq_dist) ? holderf.f[5] : sq_dist; ++- target[0] = (holderf.f[6] > sq_dist) ? holderi.i[6] : target[0]; ++- sq_dist = (holderf.f[6] > sq_dist) ? holderf.f[6] : sq_dist; ++- target[0] = (holderf.f[7] > sq_dist) ? holderi.i[7] : target[0]; ++- sq_dist = (holderf.f[7] > sq_dist) ? holderf.f[7] : sq_dist; +++ _mm256_storeu_ps((float*)&(holderf.f), xmm3); +++ _mm256_storeu_si256(&(holderi.int_vec), xmm9); ++ +++ target[0] = holderi.i[0]; +++ sq_dist = holderf.f[0]; +++ target[0] = (holderf.f[1] > sq_dist) ? holderi.i[1] : target[0]; +++ sq_dist = (holderf.f[1] > sq_dist) ? holderf.f[1] : sq_dist; +++ target[0] = (holderf.f[2] > sq_dist) ? 
holderi.i[2] : target[0]; +++ sq_dist = (holderf.f[2] > sq_dist) ? holderf.f[2] : sq_dist; +++ target[0] = (holderf.f[3] > sq_dist) ? holderi.i[3] : target[0]; +++ sq_dist = (holderf.f[3] > sq_dist) ? holderf.f[3] : sq_dist; +++ target[0] = (holderf.f[4] > sq_dist) ? holderi.i[4] : target[0]; +++ sq_dist = (holderf.f[4] > sq_dist) ? holderf.f[4] : sq_dist; +++ target[0] = (holderf.f[5] > sq_dist) ? holderi.i[5] : target[0]; +++ sq_dist = (holderf.f[5] > sq_dist) ? holderf.f[5] : sq_dist; +++ target[0] = (holderf.f[6] > sq_dist) ? holderi.i[6] : target[0]; +++ sq_dist = (holderf.f[6] > sq_dist) ? holderf.f[6] : sq_dist; +++ target[0] = (holderf.f[7] > sq_dist) ? holderi.i[7] : target[0]; +++ sq_dist = (holderf.f[7] > sq_dist) ? holderf.f[7] : sq_dist; ++ } ++ ++ #endif /*LV_HAVE_AVX2*/ ++diff --git a/kernels/volk/volk_32fc_index_max_32u.h b/kernels/volk/volk_32fc_index_max_32u.h ++index 67a3faa..7756fc6 100644 ++--- a/kernels/volk/volk_32fc_index_max_32u.h +++++ b/kernels/volk/volk_32fc_index_max_32u.h ++@@ -70,309 +70,314 @@ ++ #ifndef INCLUDED_volk_32fc_index_max_32u_a_H ++ #define INCLUDED_volk_32fc_index_max_32u_a_H ++ +++#include +++#include ++ #include ++-#include ++-#include ++-#include +++#include ++ ++ #ifdef LV_HAVE_AVX2 ++-#include +++#include ++ ++ static inline void ++-volk_32fc_index_max_32u_a_avx2(uint32_t* target, lv_32fc_t* src0, ++- uint32_t num_points) +++volk_32fc_index_max_32u_a_avx2(uint32_t* target, lv_32fc_t* src0, uint32_t num_points) ++ { ++- const uint32_t num_bytes = num_points*8; +++ const uint32_t num_bytes = num_points * 8; ++ ++- union bit256 holderf; ++- union bit256 holderi; ++- float sq_dist = 0.0; +++ union bit256 holderf; +++ union bit256 holderi; +++ float sq_dist = 0.0; ++ ++- union bit256 xmm5, xmm4; ++- __m256 xmm1, xmm2, xmm3; ++- __m256i xmm8, xmm11, xmm12, xmmfive, xmmfour, xmm9, holder0, holder1, xmm10; +++ union bit256 xmm5, xmm4; +++ __m256 xmm1, xmm2, xmm3; +++ __m256i xmm8, xmm11, xmm12, xmmfive, xmmfour, xmm9, holder0, holder1, xmm10; ++ ++- xmm5.int_vec = xmmfive = _mm256_setzero_si256(); ++- xmm4.int_vec = xmmfour = _mm256_setzero_si256(); ++- holderf.int_vec = holder0 = _mm256_setzero_si256(); ++- holderi.int_vec = holder1 = _mm256_setzero_si256(); +++ xmm5.int_vec = xmmfive = _mm256_setzero_si256(); +++ xmm4.int_vec = xmmfour = _mm256_setzero_si256(); +++ holderf.int_vec = holder0 = _mm256_setzero_si256(); +++ holderi.int_vec = holder1 = _mm256_setzero_si256(); ++ ++- int bound = num_bytes >> 6; ++- int i = 0; +++ int bound = num_bytes >> 6; +++ int i = 0; ++ ++- xmm8 = _mm256_set_epi32(7,6,5,4,3, 2, 1, 0); ++- xmm9 = _mm256_setzero_si256(); ++- xmm10 = _mm256_set1_epi32(8); ++- xmm3 = _mm256_setzero_ps(); ++- __m256i idx = _mm256_set_epi32(7,6,3,2,5,4,1,0); +++ xmm8 = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0); +++ xmm9 = _mm256_setzero_si256(); +++ xmm10 = _mm256_set1_epi32(8); +++ xmm3 = _mm256_setzero_ps(); +++ __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0); ++ ++- for(; i < bound; ++i) { ++- xmm1 = _mm256_load_ps((float*)src0); ++- xmm2 = _mm256_load_ps((float*)&src0[4]); +++ for (; i < bound; ++i) { +++ xmm1 = _mm256_load_ps((float*)src0); +++ xmm2 = _mm256_load_ps((float*)&src0[4]); ++ ++- src0 += 8; +++ src0 += 8; ++ ++- xmm1 = _mm256_mul_ps(xmm1, xmm1); ++- xmm2 = _mm256_mul_ps(xmm2, xmm2); +++ xmm1 = _mm256_mul_ps(xmm1, xmm1); +++ xmm2 = _mm256_mul_ps(xmm2, xmm2); ++ ++- xmm1 = _mm256_hadd_ps(xmm1, xmm2); ++- xmm1 = _mm256_permutevar8x32_ps(xmm1, idx); +++ xmm1 = _mm256_hadd_ps(xmm1, xmm2); +++ xmm1 = 
_mm256_permutevar8x32_ps(xmm1, idx); ++ ++- xmm3 = _mm256_max_ps(xmm1, xmm3); +++ xmm3 = _mm256_max_ps(xmm1, xmm3); ++ ++- xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS); ++- xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ); +++ xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS); +++ xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ); ++ ++- xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec); ++- xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec); +++ xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec); +++ xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec); ++ ++- xmm9 = _mm256_add_epi32(xmm11, xmm12); +++ xmm9 = _mm256_add_epi32(xmm11, xmm12); ++ ++- xmm8 = _mm256_add_epi32(xmm8, xmm10); ++- } ++- ++- xmm10 = _mm256_set1_epi32(4); ++- if (num_bytes >> 5 & 1) { ++- xmm1 = _mm256_load_ps((float*)src0); ++- ++- xmm1 = _mm256_mul_ps(xmm1, xmm1); +++ xmm8 = _mm256_add_epi32(xmm8, xmm10); +++ } ++ ++- src0 += 4; +++ xmm10 = _mm256_set1_epi32(4); +++ if (num_bytes >> 4 & 1) { +++ xmm1 = _mm256_load_ps((float*)src0); ++ ++- xmm1 = _mm256_hadd_ps(xmm1, xmm1); +++ xmm1 = _mm256_mul_ps(xmm1, xmm1); ++ ++- xmm3 = _mm256_max_ps(xmm1, xmm3); +++ src0 += 4; ++ ++- xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS); ++- xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ); +++ xmm1 = _mm256_hadd_ps(xmm1, xmm1); ++ ++- xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec); ++- xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec); +++ xmm3 = _mm256_max_ps(xmm1, xmm3); ++ ++- xmm9 = _mm256_add_epi32(xmm11, xmm12); +++ xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS); +++ xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ); ++ ++- xmm8 = _mm256_add_epi32(xmm8, xmm10); ++- } +++ xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec); +++ xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec); ++ ++- idx = _mm256_set_epi32(1,0,1,0,1,0,1,0); ++- xmm10 = _mm256_set1_epi32(2); ++- if (num_bytes >> 4 & 1) { ++- xmm2 = _mm256_load_ps((float*)src0); +++ xmm9 = _mm256_add_epi32(xmm11, xmm12); ++ ++- xmm1 = _mm256_permutevar8x32_ps(bit256_p(&xmm8)->float_vec, idx); ++- xmm8 = bit256_p(&xmm1)->int_vec; +++ xmm8 = _mm256_add_epi32(xmm8, xmm10); +++ } ++ ++- xmm2 = _mm256_mul_ps(xmm2, xmm2); +++ idx = _mm256_set_epi32(1, 0, 1, 0, 1, 0, 1, 0); +++ xmm10 = _mm256_set1_epi32(2); +++ if (num_bytes >> 4 & 1) { +++ xmm2 = _mm256_load_ps((float*)src0); ++ ++- src0 += 2; +++ xmm1 = _mm256_permutevar8x32_ps(bit256_p(&xmm8)->float_vec, idx); +++ xmm8 = bit256_p(&xmm1)->int_vec; ++ ++- xmm1 = _mm256_hadd_ps(xmm2, xmm2); +++ xmm2 = _mm256_mul_ps(xmm2, xmm2); ++ ++- xmm3 = _mm256_max_ps(xmm1, xmm3); +++ src0 += 2; ++ ++- xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS); ++- xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ); +++ xmm1 = _mm256_hadd_ps(xmm2, xmm2); ++ ++- xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec); ++- xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec); +++ xmm3 = _mm256_max_ps(xmm1, xmm3); ++ ++- xmm9 = _mm256_add_epi32(xmm11, xmm12); +++ xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS); +++ xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ); ++ ++- xmm8 = _mm256_add_epi32(xmm8, xmm10); ++- } +++ xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec); +++ xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec); ++ ++- _mm256_store_ps((float*)&(holderf.f), xmm3); ++- _mm256_store_si256(&(holderi.int_vec), xmm9); +++ xmm9 = _mm256_add_epi32(xmm11, xmm12); ++ ++- target[0] = holderi.i[0]; ++- sq_dist = holderf.f[0]; ++- target[0] = (holderf.f[1] > sq_dist) ? holderi.i[1] : target[0]; ++- sq_dist = (holderf.f[1] > sq_dist) ? 
holderf.f[1] : sq_dist; ++- target[0] = (holderf.f[2] > sq_dist) ? holderi.i[2] : target[0]; ++- sq_dist = (holderf.f[2] > sq_dist) ? holderf.f[2] : sq_dist; ++- target[0] = (holderf.f[3] > sq_dist) ? holderi.i[3] : target[0]; ++- sq_dist = (holderf.f[3] > sq_dist) ? holderf.f[3] : sq_dist; ++- target[0] = (holderf.f[4] > sq_dist) ? holderi.i[4] : target[0]; ++- sq_dist = (holderf.f[4] > sq_dist) ? holderf.f[4] : sq_dist; ++- target[0] = (holderf.f[5] > sq_dist) ? holderi.i[5] : target[0]; ++- sq_dist = (holderf.f[5] > sq_dist) ? holderf.f[5] : sq_dist; ++- target[0] = (holderf.f[6] > sq_dist) ? holderi.i[6] : target[0]; ++- sq_dist = (holderf.f[6] > sq_dist) ? holderf.f[6] : sq_dist; ++- target[0] = (holderf.f[7] > sq_dist) ? holderi.i[7] : target[0]; ++- sq_dist = (holderf.f[7] > sq_dist) ? holderf.f[7] : sq_dist; +++ xmm8 = _mm256_add_epi32(xmm8, xmm10); +++ } ++ +++ _mm256_store_ps((float*)&(holderf.f), xmm3); +++ _mm256_store_si256(&(holderi.int_vec), xmm9); +++ +++ target[0] = holderi.i[0]; +++ sq_dist = holderf.f[0]; +++ target[0] = (holderf.f[1] > sq_dist) ? holderi.i[1] : target[0]; +++ sq_dist = (holderf.f[1] > sq_dist) ? holderf.f[1] : sq_dist; +++ target[0] = (holderf.f[2] > sq_dist) ? holderi.i[2] : target[0]; +++ sq_dist = (holderf.f[2] > sq_dist) ? holderf.f[2] : sq_dist; +++ target[0] = (holderf.f[3] > sq_dist) ? holderi.i[3] : target[0]; +++ sq_dist = (holderf.f[3] > sq_dist) ? holderf.f[3] : sq_dist; +++ target[0] = (holderf.f[4] > sq_dist) ? holderi.i[4] : target[0]; +++ sq_dist = (holderf.f[4] > sq_dist) ? holderf.f[4] : sq_dist; +++ target[0] = (holderf.f[5] > sq_dist) ? holderi.i[5] : target[0]; +++ sq_dist = (holderf.f[5] > sq_dist) ? holderf.f[5] : sq_dist; +++ target[0] = (holderf.f[6] > sq_dist) ? holderi.i[6] : target[0]; +++ sq_dist = (holderf.f[6] > sq_dist) ? holderf.f[6] : sq_dist; +++ target[0] = (holderf.f[7] > sq_dist) ? holderi.i[7] : target[0]; +++ sq_dist = (holderf.f[7] > sq_dist) ? holderf.f[7] : sq_dist; ++ } ++ ++ #endif /*LV_HAVE_AVX2*/ ++ ++ #ifdef LV_HAVE_SSE3 ++-#include ++-#include +++#include +++#include ++ ++ static inline void ++-volk_32fc_index_max_32u_a_sse3(uint32_t* target, lv_32fc_t* src0, ++- uint32_t num_points) +++volk_32fc_index_max_32u_a_sse3(uint32_t* target, lv_32fc_t* src0, uint32_t num_points) ++ { ++- const uint32_t num_bytes = num_points*8; ++- ++- union bit128 holderf; ++- union bit128 holderi; ++- float sq_dist = 0.0; ++- ++- union bit128 xmm5, xmm4; ++- __m128 xmm1, xmm2, xmm3; ++- __m128i xmm8, xmm11, xmm12, xmmfive, xmmfour, xmm9, holder0, holder1, xmm10; +++ const uint32_t num_bytes = num_points * 8; ++ ++- xmm5.int_vec = xmmfive = _mm_setzero_si128(); ++- xmm4.int_vec = xmmfour = _mm_setzero_si128(); ++- holderf.int_vec = holder0 = _mm_setzero_si128(); ++- holderi.int_vec = holder1 = _mm_setzero_si128(); +++ union bit128 holderf; +++ union bit128 holderi; +++ float sq_dist = 0.0; ++ ++- int bound = num_bytes >> 5; ++- int i = 0; +++ union bit128 xmm5, xmm4; +++ __m128 xmm1, xmm2, xmm3; +++ __m128i xmm8, xmm11, xmm12, xmmfive, xmmfour, xmm9, holder0, holder1, xmm10; ++ ++- xmm8 = _mm_set_epi32(3, 2, 1, 0);//remember the crazy reverse order! 
++- xmm9 = _mm_setzero_si128(); ++- xmm10 = _mm_set_epi32(4, 4, 4, 4); ++- xmm3 = _mm_setzero_ps(); +++ xmm5.int_vec = xmmfive = _mm_setzero_si128(); +++ xmm4.int_vec = xmmfour = _mm_setzero_si128(); +++ holderf.int_vec = holder0 = _mm_setzero_si128(); +++ holderi.int_vec = holder1 = _mm_setzero_si128(); ++ ++- //printf("%f, %f, %f, %f\n", ((float*)&xmm10)[0], ((float*)&xmm10)[1], ((float*)&xmm10)[2], ((float*)&xmm10)[3]); +++ int bound = num_bytes >> 5; +++ int i = 0; ++ ++- for(; i < bound; ++i) { ++- xmm1 = _mm_load_ps((float*)src0); ++- xmm2 = _mm_load_ps((float*)&src0[2]); +++ xmm8 = _mm_set_epi32(3, 2, 1, 0); // remember the crazy reverse order! +++ xmm9 = _mm_setzero_si128(); +++ xmm10 = _mm_set_epi32(4, 4, 4, 4); +++ xmm3 = _mm_setzero_ps(); ++ ++- src0 += 4; +++ // printf("%f, %f, %f, %f\n", ((float*)&xmm10)[0], ((float*)&xmm10)[1], +++ // ((float*)&xmm10)[2], ((float*)&xmm10)[3]); ++ ++- xmm1 = _mm_mul_ps(xmm1, xmm1); ++- xmm2 = _mm_mul_ps(xmm2, xmm2); +++ for (; i < bound; ++i) { +++ xmm1 = _mm_load_ps((float*)src0); +++ xmm2 = _mm_load_ps((float*)&src0[2]); ++ ++- xmm1 = _mm_hadd_ps(xmm1, xmm2); +++ src0 += 4; ++ ++- xmm3 = _mm_max_ps(xmm1, xmm3); +++ xmm1 = _mm_mul_ps(xmm1, xmm1); +++ xmm2 = _mm_mul_ps(xmm2, xmm2); ++ ++- xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3); ++- xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3); +++ xmm1 = _mm_hadd_ps(xmm1, xmm2); ++ ++- xmm11 = _mm_and_si128(xmm8, xmm5.int_vec); ++- xmm12 = _mm_and_si128(xmm9, xmm4.int_vec); +++ xmm3 = _mm_max_ps(xmm1, xmm3); ++ ++- xmm9 = _mm_add_epi32(xmm11, xmm12); +++ xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3); +++ xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3); ++ ++- xmm8 = _mm_add_epi32(xmm8, xmm10); +++ xmm11 = _mm_and_si128(xmm8, xmm5.int_vec); +++ xmm12 = _mm_and_si128(xmm9, xmm4.int_vec); ++ ++- //printf("%f, %f, %f, %f\n", ((float*)&xmm3)[0], ((float*)&xmm3)[1], ((float*)&xmm3)[2], ((float*)&xmm3)[3]); ++- //printf("%u, %u, %u, %u\n", ((uint32_t*)&xmm10)[0], ((uint32_t*)&xmm10)[1], ((uint32_t*)&xmm10)[2], ((uint32_t*)&xmm10)[3]); ++- } +++ xmm9 = _mm_add_epi32(xmm11, xmm12); ++ +++ xmm8 = _mm_add_epi32(xmm8, xmm10); ++ ++- if (num_bytes >> 4 & 1) { ++- xmm2 = _mm_load_ps((float*)src0); ++- ++- xmm1 = _mm_movelh_ps(bit128_p(&xmm8)->float_vec, bit128_p(&xmm8)->float_vec); ++- xmm8 = bit128_p(&xmm1)->int_vec; ++- ++- xmm2 = _mm_mul_ps(xmm2, xmm2); ++- ++- src0 += 2; ++- ++- xmm1 = _mm_hadd_ps(xmm2, xmm2); +++ // printf("%f, %f, %f, %f\n", ((float*)&xmm3)[0], ((float*)&xmm3)[1], +++ // ((float*)&xmm3)[2], ((float*)&xmm3)[3]); printf("%u, %u, %u, %u\n", +++ // ((uint32_t*)&xmm10)[0], ((uint32_t*)&xmm10)[1], ((uint32_t*)&xmm10)[2], +++ // ((uint32_t*)&xmm10)[3]); +++ } ++ ++- xmm3 = _mm_max_ps(xmm1, xmm3); ++ ++- xmm10 = _mm_set_epi32(2, 2, 2, 2);//load1_ps((float*)&init[2]); +++ if (num_bytes >> 4 & 1) { +++ xmm2 = _mm_load_ps((float*)src0); ++ ++- xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3); ++- xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3); +++ xmm1 = _mm_movelh_ps(bit128_p(&xmm8)->float_vec, bit128_p(&xmm8)->float_vec); +++ xmm8 = bit128_p(&xmm1)->int_vec; ++ ++- xmm11 = _mm_and_si128(xmm8, xmm5.int_vec); ++- xmm12 = _mm_and_si128(xmm9, xmm4.int_vec); +++ xmm2 = _mm_mul_ps(xmm2, xmm2); ++ ++- xmm9 = _mm_add_epi32(xmm11, xmm12); +++ src0 += 2; ++ ++- xmm8 = _mm_add_epi32(xmm8, xmm10); ++- //printf("egads%u, %u, %u, %u\n", ((uint32_t*)&xmm9)[0], ((uint32_t*)&xmm9)[1], ((uint32_t*)&xmm9)[2], ((uint32_t*)&xmm9)[3]); ++- } +++ xmm1 = _mm_hadd_ps(xmm2, xmm2); ++ ++- if (num_bytes >> 3 & 1) { ++- //printf("%u, %u, %u, %u\n", 
((uint32_t*)&xmm9)[0], ((uint32_t*)&xmm9)[1], ((uint32_t*)&xmm9)[2], ((uint32_t*)&xmm9)[3]); +++ xmm3 = _mm_max_ps(xmm1, xmm3); ++ ++- sq_dist = lv_creal(src0[0]) * lv_creal(src0[0]) + lv_cimag(src0[0]) * lv_cimag(src0[0]); +++ xmm10 = _mm_set_epi32(2, 2, 2, 2); // load1_ps((float*)&init[2]); ++ ++- xmm2 = _mm_load1_ps(&sq_dist); +++ xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3); +++ xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3); ++ ++- xmm1 = xmm3; +++ xmm11 = _mm_and_si128(xmm8, xmm5.int_vec); +++ xmm12 = _mm_and_si128(xmm9, xmm4.int_vec); ++ ++- xmm3 = _mm_max_ss(xmm3, xmm2); +++ xmm9 = _mm_add_epi32(xmm11, xmm12); ++ ++- xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3); ++- xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3); +++ xmm8 = _mm_add_epi32(xmm8, xmm10); +++ // printf("egads%u, %u, %u, %u\n", ((uint32_t*)&xmm9)[0], ((uint32_t*)&xmm9)[1], +++ // ((uint32_t*)&xmm9)[2], ((uint32_t*)&xmm9)[3]); +++ } ++ ++- xmm8 = _mm_shuffle_epi32(xmm8, 0x00); +++ if (num_bytes >> 3 & 1) { +++ // printf("%u, %u, %u, %u\n", ((uint32_t*)&xmm9)[0], ((uint32_t*)&xmm9)[1], +++ // ((uint32_t*)&xmm9)[2], ((uint32_t*)&xmm9)[3]); ++ ++- xmm11 = _mm_and_si128(xmm8, xmm4.int_vec); ++- xmm12 = _mm_and_si128(xmm9, xmm5.int_vec); +++ sq_dist = +++ lv_creal(src0[0]) * lv_creal(src0[0]) + lv_cimag(src0[0]) * lv_cimag(src0[0]); ++ ++- xmm9 = _mm_add_epi32(xmm11, xmm12); ++- } +++ xmm2 = _mm_load1_ps(&sq_dist); ++ ++- //printf("%f, %f, %f, %f\n", ((float*)&xmm3)[0], ((float*)&xmm3)[1], ((float*)&xmm3)[2], ((float*)&xmm3)[3]); ++- //printf("%u, %u, %u, %u\n", ((uint32_t*)&xmm9)[0], ((uint32_t*)&xmm9)[1], ((uint32_t*)&xmm9)[2], ((uint32_t*)&xmm9)[3]); +++ xmm1 = xmm3; ++ ++- _mm_store_ps((float*)&(holderf.f), xmm3); ++- _mm_store_si128(&(holderi.int_vec), xmm9); +++ xmm3 = _mm_max_ss(xmm3, xmm2); ++ ++- target[0] = holderi.i[0]; ++- sq_dist = holderf.f[0]; ++- target[0] = (holderf.f[1] > sq_dist) ? holderi.i[1] : target[0]; ++- sq_dist = (holderf.f[1] > sq_dist) ? holderf.f[1] : sq_dist; ++- target[0] = (holderf.f[2] > sq_dist) ? holderi.i[2] : target[0]; ++- sq_dist = (holderf.f[2] > sq_dist) ? holderf.f[2] : sq_dist; ++- target[0] = (holderf.f[3] > sq_dist) ? holderi.i[3] : target[0]; ++- sq_dist = (holderf.f[3] > sq_dist) ? 
holderf.f[3] : sq_dist; +++ xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3); +++ xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3); ++ ++- /* ++- float placeholder = 0.0; ++- uint32_t temp0, temp1; ++- uint32_t g0 = (((float*)&xmm3)[0] > ((float*)&xmm3)[1]); ++- uint32_t l0 = g0 ^ 1; +++ xmm8 = _mm_shuffle_epi32(xmm8, 0x00); ++ ++- uint32_t g1 = (((float*)&xmm3)[1] > ((float*)&xmm3)[2]); ++- uint32_t l1 = g1 ^ 1; +++ xmm11 = _mm_and_si128(xmm8, xmm4.int_vec); +++ xmm12 = _mm_and_si128(xmm9, xmm5.int_vec); ++ ++- temp0 = g0 * ((uint32_t*)&xmm9)[0] + l0 * ((uint32_t*)&xmm9)[1]; ++- temp1 = g0 * ((uint32_t*)&xmm9)[2] + l0 * ((uint32_t*)&xmm9)[3]; ++- sq_dist = g0 * ((float*)&xmm3)[0] + l0 * ((float*)&xmm3)[1]; ++- placeholder = g0 * ((float*)&xmm3)[2] + l0 * ((float*)&xmm3)[3]; +++ xmm9 = _mm_add_epi32(xmm11, xmm12); +++ } ++ ++- g0 = (sq_dist > placeholder); ++- l0 = g0 ^ 1; ++- target[0] = g0 * temp0 + l0 * temp1; ++- */ +++ // printf("%f, %f, %f, %f\n", ((float*)&xmm3)[0], ((float*)&xmm3)[1], +++ // ((float*)&xmm3)[2], ((float*)&xmm3)[3]); printf("%u, %u, %u, %u\n", +++ // ((uint32_t*)&xmm9)[0], ((uint32_t*)&xmm9)[1], ((uint32_t*)&xmm9)[2], +++ // ((uint32_t*)&xmm9)[3]); +++ +++ _mm_store_ps((float*)&(holderf.f), xmm3); +++ _mm_store_si128(&(holderi.int_vec), xmm9); +++ +++ target[0] = holderi.i[0]; +++ sq_dist = holderf.f[0]; +++ target[0] = (holderf.f[1] > sq_dist) ? holderi.i[1] : target[0]; +++ sq_dist = (holderf.f[1] > sq_dist) ? holderf.f[1] : sq_dist; +++ target[0] = (holderf.f[2] > sq_dist) ? holderi.i[2] : target[0]; +++ sq_dist = (holderf.f[2] > sq_dist) ? holderf.f[2] : sq_dist; +++ target[0] = (holderf.f[3] > sq_dist) ? holderi.i[3] : target[0]; +++ sq_dist = (holderf.f[3] > sq_dist) ? holderf.f[3] : sq_dist; +++ +++ /* +++ float placeholder = 0.0; +++ uint32_t temp0, temp1; +++ uint32_t g0 = (((float*)&xmm3)[0] > ((float*)&xmm3)[1]); +++ uint32_t l0 = g0 ^ 1; +++ +++ uint32_t g1 = (((float*)&xmm3)[1] > ((float*)&xmm3)[2]); +++ uint32_t l1 = g1 ^ 1; +++ +++ temp0 = g0 * ((uint32_t*)&xmm9)[0] + l0 * ((uint32_t*)&xmm9)[1]; +++ temp1 = g0 * ((uint32_t*)&xmm9)[2] + l0 * ((uint32_t*)&xmm9)[3]; +++ sq_dist = g0 * ((float*)&xmm3)[0] + l0 * ((float*)&xmm3)[1]; +++ placeholder = g0 * ((float*)&xmm3)[2] + l0 * ((float*)&xmm3)[3]; +++ +++ g0 = (sq_dist > placeholder); +++ l0 = g0 ^ 1; +++ target[0] = g0 * temp0 + l0 * temp1; +++ */ ++ } ++ ++ #endif /*LV_HAVE_SSE3*/ ++ ++ #ifdef LV_HAVE_GENERIC ++ static inline void ++- volk_32fc_index_max_32u_generic(uint32_t* target, lv_32fc_t* src0, ++- uint32_t num_points) +++volk_32fc_index_max_32u_generic(uint32_t* target, lv_32fc_t* src0, uint32_t num_points) ++ { ++- const uint32_t num_bytes = num_points*8; +++ const uint32_t num_bytes = num_points * 8; ++ ++- float sq_dist = 0.0; ++- float max = 0.0; ++- uint32_t index = 0; +++ float sq_dist = 0.0; +++ float max = 0.0; +++ uint32_t index = 0; ++ ++- uint32_t i = 0; +++ uint32_t i = 0; ++ ++- for(; i < num_bytes >> 3; ++i) { ++- sq_dist = lv_creal(src0[i]) * lv_creal(src0[i]) + lv_cimag(src0[i]) * lv_cimag(src0[i]); +++ for (; i> 3; ++i) { +++ sq_dist = +++ lv_creal(src0[i]) * lv_creal(src0[i]) + lv_cimag(src0[i]) * lv_cimag(src0[i]); ++ ++- index = sq_dist > max ? i : index; ++- max = sq_dist > max ? sq_dist : max; ++- } ++- target[0] = index; +++ index = sq_dist > max ? i : index; +++ max = sq_dist > max ? 
sq_dist : max; +++ } +++ target[0] = index; ++ } ++ ++ #endif /*LV_HAVE_GENERIC*/ ++@@ -384,137 +389,135 @@ static inline void ++ #ifndef INCLUDED_volk_32fc_index_max_32u_u_H ++ #define INCLUDED_volk_32fc_index_max_32u_u_H ++ +++#include +++#include ++ #include ++-#include ++-#include ++-#include +++#include ++ ++ #ifdef LV_HAVE_AVX2 ++-#include +++#include ++ ++ static inline void ++-volk_32fc_index_max_32u_u_avx2(uint32_t* target, lv_32fc_t* src0, ++- uint32_t num_points) +++volk_32fc_index_max_32u_u_avx2(uint32_t* target, lv_32fc_t* src0, uint32_t num_points) ++ { ++- const uint32_t num_bytes = num_points*8; ++- ++- union bit256 holderf; ++- union bit256 holderi; ++- float sq_dist = 0.0; +++ const uint32_t num_bytes = num_points * 8; ++ ++- union bit256 xmm5, xmm4; ++- __m256 xmm1, xmm2, xmm3; ++- __m256i xmm8, xmm11, xmm12, xmmfive, xmmfour, xmm9, holder0, holder1, xmm10; +++ union bit256 holderf; +++ union bit256 holderi; +++ float sq_dist = 0.0; ++ ++- xmm5.int_vec = xmmfive = _mm256_setzero_si256(); ++- xmm4.int_vec = xmmfour = _mm256_setzero_si256(); ++- holderf.int_vec = holder0 = _mm256_setzero_si256(); ++- holderi.int_vec = holder1 = _mm256_setzero_si256(); +++ union bit256 xmm5, xmm4; +++ __m256 xmm1, xmm2, xmm3; +++ __m256i xmm8, xmm11, xmm12, xmmfive, xmmfour, xmm9, holder0, holder1, xmm10; ++ ++- int bound = num_bytes >> 6; ++- int i = 0; +++ xmm5.int_vec = xmmfive = _mm256_setzero_si256(); +++ xmm4.int_vec = xmmfour = _mm256_setzero_si256(); +++ holderf.int_vec = holder0 = _mm256_setzero_si256(); +++ holderi.int_vec = holder1 = _mm256_setzero_si256(); ++ ++- xmm8 = _mm256_set_epi32(7,6,5,4,3, 2, 1, 0); ++- xmm9 = _mm256_setzero_si256(); ++- xmm10 = _mm256_set1_epi32(8); ++- xmm3 = _mm256_setzero_ps(); ++- __m256i idx = _mm256_set_epi32(7,6,3,2,5,4,1,0); +++ int bound = num_bytes >> 6; +++ int i = 0; ++ ++- for(; i < bound; ++i) { ++- xmm1 = _mm256_loadu_ps((float*)src0); ++- xmm2 = _mm256_loadu_ps((float*)&src0[4]); +++ xmm8 = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0); +++ xmm9 = _mm256_setzero_si256(); +++ xmm10 = _mm256_set1_epi32(8); +++ xmm3 = _mm256_setzero_ps(); +++ __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0); ++ ++- src0 += 8; +++ for (; i < bound; ++i) { +++ xmm1 = _mm256_loadu_ps((float*)src0); +++ xmm2 = _mm256_loadu_ps((float*)&src0[4]); ++ ++- xmm1 = _mm256_mul_ps(xmm1, xmm1); ++- xmm2 = _mm256_mul_ps(xmm2, xmm2); +++ src0 += 8; ++ ++- xmm1 = _mm256_hadd_ps(xmm1, xmm2); ++- xmm1 = _mm256_permutevar8x32_ps(xmm1, idx); +++ xmm1 = _mm256_mul_ps(xmm1, xmm1); +++ xmm2 = _mm256_mul_ps(xmm2, xmm2); ++ ++- xmm3 = _mm256_max_ps(xmm1, xmm3); +++ xmm1 = _mm256_hadd_ps(xmm1, xmm2); +++ xmm1 = _mm256_permutevar8x32_ps(xmm1, idx); ++ ++- xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS); ++- xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ); +++ xmm3 = _mm256_max_ps(xmm1, xmm3); ++ ++- xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec); ++- xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec); +++ xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS); +++ xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ); ++ ++- xmm9 = _mm256_add_epi32(xmm11, xmm12); +++ xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec); +++ xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec); ++ ++- xmm8 = _mm256_add_epi32(xmm8, xmm10); ++- } +++ xmm9 = _mm256_add_epi32(xmm11, xmm12); ++ ++- xmm10 = _mm256_set1_epi32(4); ++- if (num_bytes >> 5 & 1) { ++- xmm1 = _mm256_loadu_ps((float*)src0); ++- ++- xmm1 = _mm256_mul_ps(xmm1, xmm1); +++ xmm8 = _mm256_add_epi32(xmm8, xmm10); +++ } ++ ++- src0 += 4; +++ xmm10 
= _mm256_set1_epi32(4); +++ if (num_bytes >> 4 & 1) { +++ xmm1 = _mm256_loadu_ps((float*)src0); ++ ++- xmm1 = _mm256_hadd_ps(xmm1, xmm1); +++ xmm1 = _mm256_mul_ps(xmm1, xmm1); ++ ++- xmm3 = _mm256_max_ps(xmm1, xmm3); +++ src0 += 4; ++ ++- xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS); ++- xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ); +++ xmm1 = _mm256_hadd_ps(xmm1, xmm1); ++ ++- xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec); ++- xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec); +++ xmm3 = _mm256_max_ps(xmm1, xmm3); ++ ++- xmm9 = _mm256_add_epi32(xmm11, xmm12); +++ xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS); +++ xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ); ++ ++- xmm8 = _mm256_add_epi32(xmm8, xmm10); ++- } +++ xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec); +++ xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec); ++ ++- idx = _mm256_set_epi32(1,0,1,0,1,0,1,0); ++- xmm10 = _mm256_set1_epi32(2); ++- if (num_bytes >> 4 & 1) { ++- xmm2 = _mm256_loadu_ps((float*)src0); +++ xmm9 = _mm256_add_epi32(xmm11, xmm12); ++ ++- xmm1 = _mm256_permutevar8x32_ps(bit256_p(&xmm8)->float_vec, idx); ++- xmm8 = bit256_p(&xmm1)->int_vec; +++ xmm8 = _mm256_add_epi32(xmm8, xmm10); +++ } ++ ++- xmm2 = _mm256_mul_ps(xmm2, xmm2); +++ idx = _mm256_set_epi32(1, 0, 1, 0, 1, 0, 1, 0); +++ xmm10 = _mm256_set1_epi32(2); +++ if (num_bytes >> 4 & 1) { +++ xmm2 = _mm256_loadu_ps((float*)src0); ++ ++- src0 += 2; +++ xmm1 = _mm256_permutevar8x32_ps(bit256_p(&xmm8)->float_vec, idx); +++ xmm8 = bit256_p(&xmm1)->int_vec; ++ ++- xmm1 = _mm256_hadd_ps(xmm2, xmm2); +++ xmm2 = _mm256_mul_ps(xmm2, xmm2); ++ ++- xmm3 = _mm256_max_ps(xmm1, xmm3); +++ src0 += 2; ++ ++- xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS); ++- xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ); +++ xmm1 = _mm256_hadd_ps(xmm2, xmm2); ++ ++- xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec); ++- xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec); +++ xmm3 = _mm256_max_ps(xmm1, xmm3); ++ ++- xmm9 = _mm256_add_epi32(xmm11, xmm12); +++ xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS); +++ xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ); ++ ++- xmm8 = _mm256_add_epi32(xmm8, xmm10); ++- } +++ xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec); +++ xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec); ++ ++- _mm256_storeu_ps((float*)&(holderf.f), xmm3); ++- _mm256_storeu_si256(&(holderi.int_vec), xmm9); +++ xmm9 = _mm256_add_epi32(xmm11, xmm12); ++ ++- target[0] = holderi.i[0]; ++- sq_dist = holderf.f[0]; ++- target[0] = (holderf.f[1] > sq_dist) ? holderi.i[1] : target[0]; ++- sq_dist = (holderf.f[1] > sq_dist) ? holderf.f[1] : sq_dist; ++- target[0] = (holderf.f[2] > sq_dist) ? holderi.i[2] : target[0]; ++- sq_dist = (holderf.f[2] > sq_dist) ? holderf.f[2] : sq_dist; ++- target[0] = (holderf.f[3] > sq_dist) ? holderi.i[3] : target[0]; ++- sq_dist = (holderf.f[3] > sq_dist) ? holderf.f[3] : sq_dist; ++- target[0] = (holderf.f[4] > sq_dist) ? holderi.i[4] : target[0]; ++- sq_dist = (holderf.f[4] > sq_dist) ? holderf.f[4] : sq_dist; ++- target[0] = (holderf.f[5] > sq_dist) ? holderi.i[5] : target[0]; ++- sq_dist = (holderf.f[5] > sq_dist) ? holderf.f[5] : sq_dist; ++- target[0] = (holderf.f[6] > sq_dist) ? holderi.i[6] : target[0]; ++- sq_dist = (holderf.f[6] > sq_dist) ? holderf.f[6] : sq_dist; ++- target[0] = (holderf.f[7] > sq_dist) ? holderi.i[7] : target[0]; ++- sq_dist = (holderf.f[7] > sq_dist) ? 
holderf.f[7] : sq_dist; +++ xmm8 = _mm256_add_epi32(xmm8, xmm10); +++ } ++ +++ _mm256_storeu_ps((float*)&(holderf.f), xmm3); +++ _mm256_storeu_si256(&(holderi.int_vec), xmm9); +++ +++ target[0] = holderi.i[0]; +++ sq_dist = holderf.f[0]; +++ target[0] = (holderf.f[1] > sq_dist) ? holderi.i[1] : target[0]; +++ sq_dist = (holderf.f[1] > sq_dist) ? holderf.f[1] : sq_dist; +++ target[0] = (holderf.f[2] > sq_dist) ? holderi.i[2] : target[0]; +++ sq_dist = (holderf.f[2] > sq_dist) ? holderf.f[2] : sq_dist; +++ target[0] = (holderf.f[3] > sq_dist) ? holderi.i[3] : target[0]; +++ sq_dist = (holderf.f[3] > sq_dist) ? holderf.f[3] : sq_dist; +++ target[0] = (holderf.f[4] > sq_dist) ? holderi.i[4] : target[0]; +++ sq_dist = (holderf.f[4] > sq_dist) ? holderf.f[4] : sq_dist; +++ target[0] = (holderf.f[5] > sq_dist) ? holderi.i[5] : target[0]; +++ sq_dist = (holderf.f[5] > sq_dist) ? holderf.f[5] : sq_dist; +++ target[0] = (holderf.f[6] > sq_dist) ? holderi.i[6] : target[0]; +++ sq_dist = (holderf.f[6] > sq_dist) ? holderf.f[6] : sq_dist; +++ target[0] = (holderf.f[7] > sq_dist) ? holderi.i[7] : target[0]; +++ sq_dist = (holderf.f[7] > sq_dist) ? holderf.f[7] : sq_dist; ++ } ++ ++ #endif /*LV_HAVE_AVX2*/ ++@@ -523,29 +526,29 @@ volk_32fc_index_max_32u_u_avx2(uint32_t* target, lv_32fc_t* src0, ++ #include ++ #include ++ ++-static inline void volk_32fc_index_max_32u_neon(uint32_t* target, lv_32fc_t* src0, uint32_t num_points) +++static inline void +++volk_32fc_index_max_32u_neon(uint32_t* target, lv_32fc_t* src0, uint32_t num_points) ++ { ++ unsigned int number = 0; ++ const uint32_t quarter_points = num_points / 4; ++ const lv_32fc_t* src0Ptr = src0; ++- ++- uint32_t indices[4] = {0, 1, 2, 3}; +++ +++ uint32_t indices[4] = { 0, 1, 2, 3 }; ++ const uint32x4_t vec_indices_incr = vdupq_n_u32(4); ++ uint32x4_t vec_indices = vld1q_u32(indices); ++ uint32x4_t vec_max_indices = vec_indices; ++- ++- if(num_points) ++- { +++ +++ if (num_points) { ++ float max = *src0Ptr; ++ uint32_t index = 0; ++- +++ ++ float32x4_t vec_max = vdupq_n_f32(*src0Ptr); ++- ++- for(;number < quarter_points; number++) ++- { +++ +++ for (; number < quarter_points; number++) { ++ // Load complex and compute magnitude squared ++- const float32x4_t vec_mag2 = _vmagnitudesquaredq_f32(vld2q_f32((float*)src0Ptr)); ++- __VOLK_PREFETCH(src0Ptr+=4); +++ const float32x4_t vec_mag2 = +++ _vmagnitudesquaredq_f32(vld2q_f32((float*)src0Ptr)); +++ __VOLK_PREFETCH(src0Ptr += 4); ++ // a > b? 
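/*
 * Editorial note, not part of the patch: the NEON index-max variant reaches the same
 * result with a bitwise select instead of the cmp/and/add trick.  In the lines just
 * below, vcgtq_f32 produces an all-ones lane mask wherever the new magnitude squared
 * beats the running maximum, and vbslq_f32 takes the new value in exactly those lanes:
 *
 *     gt  = (mag2 > max) ? 0xFFFFFFFF : 0;    (per lane)
 *     max = gt ? mag2 : max;                  (vbslq_f32(gt_mask, vec_mag2, vec_max))
 *
 * vec_indices advances by vec_indices_incr (4) each iteration so every lane knows
 * which sample it is looking at; presumably the winning indices are selected with the
 * same mask (that statement falls outside this hunk), and the four per-lane winners
 * are reduced to one (max, index) pair in the scalar epilogue that follows.
 */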
++ const uint32x4_t gt_mask = vcgtq_f32(vec_mag2, vec_max); ++ vec_max = vbslq_f32(gt_mask, vec_mag2, vec_max); ++@@ -556,20 +559,19 @@ static inline void volk_32fc_index_max_32u_neon(uint32_t* target, lv_32fc_t* src ++ float tmp_max[4]; ++ vst1q_u32(tmp_max_indices, vec_max_indices); ++ vst1q_f32(tmp_max, vec_max); ++- +++ ++ for (int i = 0; i < 4; i++) { ++ if (tmp_max[i] > max) { ++ max = tmp_max[i]; ++ index = tmp_max_indices[i]; ++ } ++ } ++- +++ ++ // Deal with the rest ++- for(number = quarter_points * 4;number < num_points; number++) ++- { +++ for (number = quarter_points * 4; number < num_points; number++) { ++ const float re = lv_creal(*src0Ptr); ++ const float im = lv_cimag(*src0Ptr); ++- if ((re*re+im*im) > max) { +++ if ((re * re + im * im) > max) { ++ max = *src0Ptr; ++ index = number; ++ } ++diff --git a/kernels/volk/volk_32fc_magnitude_32f.h b/kernels/volk/volk_32fc_magnitude_32f.h ++index 1ba6871..6a0a7d8 100644 ++--- a/kernels/volk/volk_32fc_magnitude_32f.h +++++ b/kernels/volk/volk_32fc_magnitude_32f.h ++@@ -30,8 +30,8 @@ ++ * ++ * Dispatcher Prototype ++ * \code ++- * void volk_32fc_magnitude_32f(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points) ++- * \endcode +++ * void volk_32fc_magnitude_32f(float* magnitudeVector, const lv_32fc_t* complexVector, +++ * unsigned int num_points) \endcode ++ * ++ * \b Inputs ++ * \li complexVector: The complex input vector. ++@@ -72,41 +72,41 @@ ++ #define INCLUDED_volk_32fc_magnitude_32f_u_H ++ ++ #include ++-#include ++ #include +++#include ++ ++ #ifdef LV_HAVE_AVX ++ #include ++ #include ++ ++-static inline void ++-volk_32fc_magnitude_32f_u_avx(float* magnitudeVector, const lv_32fc_t* complexVector, ++- unsigned int num_points) +++static inline void volk_32fc_magnitude_32f_u_avx(float* magnitudeVector, +++ const lv_32fc_t* complexVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int eighthPoints = num_points / 8; ++- ++- const float* complexVectorPtr = (float*) complexVector; ++- float* magnitudeVectorPtr = magnitudeVector; ++- ++- __m256 cplxValue1, cplxValue2, result; ++- ++- for(; number < eighthPoints; number++){ ++- cplxValue1 = _mm256_loadu_ps(complexVectorPtr); ++- cplxValue2 = _mm256_loadu_ps(complexVectorPtr + 8); ++- result = _mm256_magnitude_ps(cplxValue1, cplxValue2); ++- _mm256_storeu_ps(magnitudeVectorPtr, result); ++- ++- complexVectorPtr += 16; ++- magnitudeVectorPtr += 8; ++- } ++- ++- number = eighthPoints * 8; ++- for(; number < num_points; number++){ ++- float val1Real = *complexVectorPtr++; ++- float val1Imag = *complexVectorPtr++; ++- *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)); ++- } +++ unsigned int number = 0; +++ const unsigned int eighthPoints = num_points / 8; +++ +++ const float* complexVectorPtr = (float*)complexVector; +++ float* magnitudeVectorPtr = magnitudeVector; +++ +++ __m256 cplxValue1, cplxValue2, result; +++ +++ for (; number < eighthPoints; number++) { +++ cplxValue1 = _mm256_loadu_ps(complexVectorPtr); +++ cplxValue2 = _mm256_loadu_ps(complexVectorPtr + 8); +++ result = _mm256_magnitude_ps(cplxValue1, cplxValue2); +++ _mm256_storeu_ps(magnitudeVectorPtr, result); +++ +++ complexVectorPtr += 16; +++ magnitudeVectorPtr += 8; +++ } +++ +++ number = eighthPoints * 8; +++ for (; number < num_points; number++) { +++ float val1Real = *complexVectorPtr++; +++ float val1Imag = *complexVectorPtr++; +++ *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)); +++ } ++ } ++ #endif 
/* LV_HAVE_AVX */ ++ ++@@ -114,137 +114,137 @@ volk_32fc_magnitude_32f_u_avx(float* magnitudeVector, const lv_32fc_t* complexVe ++ #include ++ #include ++ ++-static inline void ++-volk_32fc_magnitude_32f_u_sse3(float* magnitudeVector, const lv_32fc_t* complexVector, ++- unsigned int num_points) +++static inline void volk_32fc_magnitude_32f_u_sse3(float* magnitudeVector, +++ const lv_32fc_t* complexVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int quarterPoints = num_points / 4; +++ unsigned int number = 0; +++ const unsigned int quarterPoints = num_points / 4; ++ ++- const float* complexVectorPtr = (float*) complexVector; ++- float* magnitudeVectorPtr = magnitudeVector; +++ const float* complexVectorPtr = (float*)complexVector; +++ float* magnitudeVectorPtr = magnitudeVector; ++ ++- __m128 cplxValue1, cplxValue2, result; ++- for(; number < quarterPoints; number++){ ++- cplxValue1 = _mm_loadu_ps(complexVectorPtr); ++- complexVectorPtr += 4; +++ __m128 cplxValue1, cplxValue2, result; +++ for (; number < quarterPoints; number++) { +++ cplxValue1 = _mm_loadu_ps(complexVectorPtr); +++ complexVectorPtr += 4; ++ ++- cplxValue2 = _mm_loadu_ps(complexVectorPtr); ++- complexVectorPtr += 4; +++ cplxValue2 = _mm_loadu_ps(complexVectorPtr); +++ complexVectorPtr += 4; ++ ++- result = _mm_magnitude_ps_sse3(cplxValue1, cplxValue2); +++ result = _mm_magnitude_ps_sse3(cplxValue1, cplxValue2); ++ ++- _mm_storeu_ps(magnitudeVectorPtr, result); ++- magnitudeVectorPtr += 4; ++- } +++ _mm_storeu_ps(magnitudeVectorPtr, result); +++ magnitudeVectorPtr += 4; +++ } ++ ++- number = quarterPoints * 4; ++- for(; number < num_points; number++){ ++- float val1Real = *complexVectorPtr++; ++- float val1Imag = *complexVectorPtr++; ++- *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)); ++- } +++ number = quarterPoints * 4; +++ for (; number < num_points; number++) { +++ float val1Real = *complexVectorPtr++; +++ float val1Imag = *complexVectorPtr++; +++ *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)); +++ } ++ } ++ #endif /* LV_HAVE_SSE3 */ ++ ++ ++ #ifdef LV_HAVE_SSE ++-#include ++ #include +++#include ++ ++-static inline void ++-volk_32fc_magnitude_32f_u_sse(float* magnitudeVector, const lv_32fc_t* complexVector, ++- unsigned int num_points) +++static inline void volk_32fc_magnitude_32f_u_sse(float* magnitudeVector, +++ const lv_32fc_t* complexVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int quarterPoints = num_points / 4; +++ unsigned int number = 0; +++ const unsigned int quarterPoints = num_points / 4; ++ ++- const float* complexVectorPtr = (float*) complexVector; ++- float* magnitudeVectorPtr = magnitudeVector; +++ const float* complexVectorPtr = (float*)complexVector; +++ float* magnitudeVectorPtr = magnitudeVector; ++ ++- __m128 cplxValue1, cplxValue2, result; +++ __m128 cplxValue1, cplxValue2, result; ++ ++- for(; number < quarterPoints; number++){ ++- cplxValue1 = _mm_loadu_ps(complexVectorPtr); ++- complexVectorPtr += 4; +++ for (; number < quarterPoints; number++) { +++ cplxValue1 = _mm_loadu_ps(complexVectorPtr); +++ complexVectorPtr += 4; ++ ++- cplxValue2 = _mm_loadu_ps(complexVectorPtr); ++- complexVectorPtr += 4; +++ cplxValue2 = _mm_loadu_ps(complexVectorPtr); +++ complexVectorPtr += 4; ++ ++- result = _mm_magnitude_ps(cplxValue1, cplxValue2); ++- _mm_storeu_ps(magnitudeVectorPtr, result); ++- magnitudeVectorPtr += 4; ++- } +++ result = _mm_magnitude_ps(cplxValue1, 
cplxValue2); +++ _mm_storeu_ps(magnitudeVectorPtr, result); +++ magnitudeVectorPtr += 4; +++ } ++ ++- number = quarterPoints * 4; ++- for(; number < num_points; number++){ ++- float val1Real = *complexVectorPtr++; ++- float val1Imag = *complexVectorPtr++; ++- *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)); ++- } +++ number = quarterPoints * 4; +++ for (; number < num_points; number++) { +++ float val1Real = *complexVectorPtr++; +++ float val1Imag = *complexVectorPtr++; +++ *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)); +++ } ++ } ++ #endif /* LV_HAVE_SSE */ ++ ++ ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void ++-volk_32fc_magnitude_32f_generic(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points) +++static inline void volk_32fc_magnitude_32f_generic(float* magnitudeVector, +++ const lv_32fc_t* complexVector, +++ unsigned int num_points) ++ { ++- const float* complexVectorPtr = (float*)complexVector; ++- float* magnitudeVectorPtr = magnitudeVector; ++- unsigned int number = 0; ++- for(number = 0; number < num_points; number++){ ++- const float real = *complexVectorPtr++; ++- const float imag = *complexVectorPtr++; ++- *magnitudeVectorPtr++ = sqrtf((real*real) + (imag*imag)); ++- } +++ const float* complexVectorPtr = (float*)complexVector; +++ float* magnitudeVectorPtr = magnitudeVector; +++ unsigned int number = 0; +++ for (number = 0; number < num_points; number++) { +++ const float real = *complexVectorPtr++; +++ const float imag = *complexVectorPtr++; +++ *magnitudeVectorPtr++ = sqrtf((real * real) + (imag * imag)); +++ } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++ ++- ++ #endif /* INCLUDED_volk_32fc_magnitude_32f_u_H */ ++ #ifndef INCLUDED_volk_32fc_magnitude_32f_a_H ++ #define INCLUDED_volk_32fc_magnitude_32f_a_H ++ ++ #include ++-#include ++ #include +++#include ++ ++ #ifdef LV_HAVE_AVX ++ #include ++ #include ++ ++-static inline void ++-volk_32fc_magnitude_32f_a_avx(float* magnitudeVector, const lv_32fc_t* complexVector, ++- unsigned int num_points) +++static inline void volk_32fc_magnitude_32f_a_avx(float* magnitudeVector, +++ const lv_32fc_t* complexVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int eighthPoints = num_points / 8; ++- ++- const float* complexVectorPtr = (float*) complexVector; ++- float* magnitudeVectorPtr = magnitudeVector; ++- ++- __m256 cplxValue1, cplxValue2, result; ++- for(; number < eighthPoints; number++){ ++- cplxValue1 = _mm256_load_ps(complexVectorPtr); ++- complexVectorPtr += 8; ++- ++- cplxValue2 = _mm256_load_ps(complexVectorPtr); ++- complexVectorPtr += 8; ++- ++- result = _mm256_magnitude_ps(cplxValue1, cplxValue2); ++- _mm256_store_ps(magnitudeVectorPtr, result); ++- magnitudeVectorPtr += 8; ++- } ++- ++- number = eighthPoints * 8; ++- for(; number < num_points; number++){ ++- float val1Real = *complexVectorPtr++; ++- float val1Imag = *complexVectorPtr++; ++- *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)); ++- } +++ unsigned int number = 0; +++ const unsigned int eighthPoints = num_points / 8; +++ +++ const float* complexVectorPtr = (float*)complexVector; +++ float* magnitudeVectorPtr = magnitudeVector; +++ +++ __m256 cplxValue1, cplxValue2, result; +++ for (; number < eighthPoints; number++) { +++ cplxValue1 = _mm256_load_ps(complexVectorPtr); +++ complexVectorPtr += 8; +++ +++ cplxValue2 = _mm256_load_ps(complexVectorPtr); +++ complexVectorPtr += 8; +++ +++ result = 
_mm256_magnitude_ps(cplxValue1, cplxValue2); +++ _mm256_store_ps(magnitudeVectorPtr, result); +++ magnitudeVectorPtr += 8; +++ } +++ +++ number = eighthPoints * 8; +++ for (; number < num_points; number++) { +++ float val1Real = *complexVectorPtr++; +++ float val1Imag = *complexVectorPtr++; +++ *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)); +++ } ++ } ++ #endif /* LV_HAVE_AVX */ ++ ++@@ -252,89 +252,89 @@ volk_32fc_magnitude_32f_a_avx(float* magnitudeVector, const lv_32fc_t* complexVe ++ #include ++ #include ++ ++-static inline void ++-volk_32fc_magnitude_32f_a_sse3(float* magnitudeVector, const lv_32fc_t* complexVector, ++- unsigned int num_points) +++static inline void volk_32fc_magnitude_32f_a_sse3(float* magnitudeVector, +++ const lv_32fc_t* complexVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int quarterPoints = num_points / 4; ++- ++- const float* complexVectorPtr = (float*) complexVector; ++- float* magnitudeVectorPtr = magnitudeVector; ++- ++- __m128 cplxValue1, cplxValue2, result; ++- for(; number < quarterPoints; number++){ ++- cplxValue1 = _mm_load_ps(complexVectorPtr); ++- complexVectorPtr += 4; ++- ++- cplxValue2 = _mm_load_ps(complexVectorPtr); ++- complexVectorPtr += 4; ++- ++- result = _mm_magnitude_ps_sse3(cplxValue1, cplxValue2); ++- _mm_store_ps(magnitudeVectorPtr, result); ++- magnitudeVectorPtr += 4; ++- } ++- ++- number = quarterPoints * 4; ++- for(; number < num_points; number++){ ++- float val1Real = *complexVectorPtr++; ++- float val1Imag = *complexVectorPtr++; ++- *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)); ++- } +++ unsigned int number = 0; +++ const unsigned int quarterPoints = num_points / 4; +++ +++ const float* complexVectorPtr = (float*)complexVector; +++ float* magnitudeVectorPtr = magnitudeVector; +++ +++ __m128 cplxValue1, cplxValue2, result; +++ for (; number < quarterPoints; number++) { +++ cplxValue1 = _mm_load_ps(complexVectorPtr); +++ complexVectorPtr += 4; +++ +++ cplxValue2 = _mm_load_ps(complexVectorPtr); +++ complexVectorPtr += 4; +++ +++ result = _mm_magnitude_ps_sse3(cplxValue1, cplxValue2); +++ _mm_store_ps(magnitudeVectorPtr, result); +++ magnitudeVectorPtr += 4; +++ } +++ +++ number = quarterPoints * 4; +++ for (; number < num_points; number++) { +++ float val1Real = *complexVectorPtr++; +++ float val1Imag = *complexVectorPtr++; +++ *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)); +++ } ++ } ++ #endif /* LV_HAVE_SSE3 */ ++ ++ #ifdef LV_HAVE_SSE ++-#include ++ #include +++#include ++ ++-static inline void ++-volk_32fc_magnitude_32f_a_sse(float* magnitudeVector, const lv_32fc_t* complexVector, ++- unsigned int num_points) +++static inline void volk_32fc_magnitude_32f_a_sse(float* magnitudeVector, +++ const lv_32fc_t* complexVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int quarterPoints = num_points / 4; ++- ++- const float* complexVectorPtr = (float*) complexVector; ++- float* magnitudeVectorPtr = magnitudeVector; ++- ++- __m128 cplxValue1, cplxValue2, result; ++- for(; number < quarterPoints; number++){ ++- cplxValue1 = _mm_load_ps(complexVectorPtr); ++- complexVectorPtr += 4; ++- ++- cplxValue2 = _mm_load_ps(complexVectorPtr); ++- complexVectorPtr += 4; ++- ++- result = _mm_magnitude_ps(cplxValue1, cplxValue2); ++- _mm_store_ps(magnitudeVectorPtr, result); ++- magnitudeVectorPtr += 4; ++- } ++- ++- number = quarterPoints * 4; ++- for(; number < num_points; 
number++){ ++- float val1Real = *complexVectorPtr++; ++- float val1Imag = *complexVectorPtr++; ++- *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)); ++- } +++ unsigned int number = 0; +++ const unsigned int quarterPoints = num_points / 4; +++ +++ const float* complexVectorPtr = (float*)complexVector; +++ float* magnitudeVectorPtr = magnitudeVector; +++ +++ __m128 cplxValue1, cplxValue2, result; +++ for (; number < quarterPoints; number++) { +++ cplxValue1 = _mm_load_ps(complexVectorPtr); +++ complexVectorPtr += 4; +++ +++ cplxValue2 = _mm_load_ps(complexVectorPtr); +++ complexVectorPtr += 4; +++ +++ result = _mm_magnitude_ps(cplxValue1, cplxValue2); +++ _mm_store_ps(magnitudeVectorPtr, result); +++ magnitudeVectorPtr += 4; +++ } +++ +++ number = quarterPoints * 4; +++ for (; number < num_points; number++) { +++ float val1Real = *complexVectorPtr++; +++ float val1Imag = *complexVectorPtr++; +++ *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)); +++ } ++ } ++ #endif /* LV_HAVE_SSE */ ++ ++ ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void ++-volk_32fc_magnitude_32f_a_generic(float* magnitudeVector, const lv_32fc_t* complexVector, ++- unsigned int num_points) +++static inline void volk_32fc_magnitude_32f_a_generic(float* magnitudeVector, +++ const lv_32fc_t* complexVector, +++ unsigned int num_points) ++ { ++- const float* complexVectorPtr = (float*)complexVector; ++- float* magnitudeVectorPtr = magnitudeVector; ++- unsigned int number = 0; ++- for(number = 0; number < num_points; number++){ ++- const float real = *complexVectorPtr++; ++- const float imag = *complexVectorPtr++; ++- *magnitudeVectorPtr++ = sqrtf((real*real) + (imag*imag)); ++- } +++ const float* complexVectorPtr = (float*)complexVector; +++ float* magnitudeVectorPtr = magnitudeVector; +++ unsigned int number = 0; +++ for (number = 0; number < num_points; number++) { +++ const float real = *complexVectorPtr++; +++ const float imag = *complexVectorPtr++; +++ *magnitudeVectorPtr++ = sqrtf((real * real) + (imag * imag)); +++ } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++@@ -342,41 +342,43 @@ volk_32fc_magnitude_32f_a_generic(float* magnitudeVector, const lv_32fc_t* compl ++ #ifdef LV_HAVE_NEON ++ #include ++ ++-static inline void ++-volk_32fc_magnitude_32f_neon(float* magnitudeVector, const lv_32fc_t* complexVector, ++- unsigned int num_points) +++static inline void volk_32fc_magnitude_32f_neon(float* magnitudeVector, +++ const lv_32fc_t* complexVector, +++ unsigned int num_points) ++ { ++- unsigned int number; ++- unsigned int quarter_points = num_points / 4; ++- const float* complexVectorPtr = (float*)complexVector; ++- float* magnitudeVectorPtr = magnitudeVector; ++- ++- float32x4x2_t complex_vec; ++- float32x4_t magnitude_vec; ++- for(number = 0; number < quarter_points; number++){ ++- complex_vec = vld2q_f32(complexVectorPtr); ++- complex_vec.val[0] = vmulq_f32(complex_vec.val[0], complex_vec.val[0]); ++- magnitude_vec = vmlaq_f32(complex_vec.val[0], complex_vec.val[1], complex_vec.val[1]); ++- magnitude_vec = vrsqrteq_f32(magnitude_vec); ++- magnitude_vec = vrecpeq_f32( magnitude_vec ); // no plain ol' sqrt ++- vst1q_f32(magnitudeVectorPtr, magnitude_vec); ++- ++- complexVectorPtr += 8; ++- magnitudeVectorPtr += 4; ++- } ++- ++- for(number = quarter_points*4; number < num_points; number++){ ++- const float real = *complexVectorPtr++; ++- const float imag = *complexVectorPtr++; ++- *magnitudeVectorPtr++ = sqrtf((real*real) + (imag*imag)); ++- } +++ unsigned int number; +++ 
unsigned int quarter_points = num_points / 4; +++ const float* complexVectorPtr = (float*)complexVector; +++ float* magnitudeVectorPtr = magnitudeVector; +++ +++ float32x4x2_t complex_vec; +++ float32x4_t magnitude_vec; +++ for (number = 0; number < quarter_points; number++) { +++ complex_vec = vld2q_f32(complexVectorPtr); +++ complex_vec.val[0] = vmulq_f32(complex_vec.val[0], complex_vec.val[0]); +++ magnitude_vec = +++ vmlaq_f32(complex_vec.val[0], complex_vec.val[1], complex_vec.val[1]); +++ magnitude_vec = vrsqrteq_f32(magnitude_vec); +++ magnitude_vec = vrecpeq_f32(magnitude_vec); // no plain ol' sqrt +++ vst1q_f32(magnitudeVectorPtr, magnitude_vec); +++ +++ complexVectorPtr += 8; +++ magnitudeVectorPtr += 4; +++ } +++ +++ for (number = quarter_points * 4; number < num_points; number++) { +++ const float real = *complexVectorPtr++; +++ const float imag = *complexVectorPtr++; +++ *magnitudeVectorPtr++ = sqrtf((real * real) + (imag * imag)); +++ } ++ } ++ #endif /* LV_HAVE_NEON */ ++ ++ ++ #ifdef LV_HAVE_NEON ++ /*! ++- \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector +++ \brief Calculates the magnitude of the complexVector and stores the results in the +++ magnitudeVector ++ ++ This is an approximation from "Streamlining Digital Signal Processing" by ++ Richard Lyons. Apparently max error is about 1% and mean error is about 0.6%. ++@@ -387,80 +389,80 @@ volk_32fc_magnitude_32f_neon(float* magnitudeVector, const lv_32fc_t* complexVec ++ ++ \param complexVector The vector containing the complex input values ++ \param magnitudeVector The vector containing the real output values ++- \param num_points The number of complex values in complexVector to be calculated and stored into cVector +++ \param num_points The number of complex values in complexVector to be calculated and +++ stored into cVector ++ */ ++-static inline void ++-volk_32fc_magnitude_32f_neon_fancy_sweet(float* magnitudeVector, const lv_32fc_t* complexVector, ++- unsigned int num_points) +++static inline void volk_32fc_magnitude_32f_neon_fancy_sweet( +++ float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points) ++ { ++- unsigned int number; ++- unsigned int quarter_points = num_points / 4; ++- const float* complexVectorPtr = (float*)complexVector; ++- float* magnitudeVectorPtr = magnitudeVector; ++- ++- const float threshold = 0.4142135; ++- ++- float32x4_t a_vec, b_vec, a_high, a_low, b_high, b_low; ++- a_high = vdupq_n_f32( 0.84 ); ++- b_high = vdupq_n_f32( 0.561); ++- a_low = vdupq_n_f32( 0.99 ); ++- b_low = vdupq_n_f32( 0.197); ++- ++- uint32x4_t comp0, comp1; ++- ++- float32x4x2_t complex_vec; ++- float32x4_t min_vec, max_vec, magnitude_vec; ++- float32x4_t real_abs, imag_abs; ++- for(number = 0; number < quarter_points; number++){ ++- complex_vec = vld2q_f32(complexVectorPtr); ++- ++- real_abs = vabsq_f32(complex_vec.val[0]); ++- imag_abs = vabsq_f32(complex_vec.val[1]); ++- ++- min_vec = vminq_f32(real_abs, imag_abs); ++- max_vec = vmaxq_f32(real_abs, imag_abs); ++- ++- // effective branch to choose coefficient pair. 
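/*
 * Editorial note, not part of the patch: volk_32fc_magnitude_32f_neon above never
 * takes a square root directly.  It relies on the identity sqrt(x) = 1 / (1 / sqrt(x))
 * and evaluates both steps with the hardware estimate instructions only:
 *
 *     x   = re * re + im * im;
 *     |z| is approximated by vrecpeq_f32(vrsqrteq_f32(x));
 *
 * No Newton-Raphson refinement iteration is applied to either estimate, so the result
 * is fast but of limited precision, which is why the scalar tail of the same kernel
 * still calls sqrtf().
 */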
++- comp0 = vcgtq_f32(min_vec, vmulq_n_f32(max_vec, threshold)); ++- comp1 = vcleq_f32(min_vec, vmulq_n_f32(max_vec, threshold)); ++- ++- // and 0s or 1s with coefficients from previous effective branch ++- a_vec = (float32x4_t)vaddq_s32(vandq_s32((int32x4_t)comp0, (int32x4_t)a_high), ++- vandq_s32((int32x4_t)comp1, (int32x4_t)a_low)); ++- b_vec = (float32x4_t)vaddq_s32(vandq_s32((int32x4_t)comp0, (int32x4_t)b_high), ++- vandq_s32((int32x4_t)comp1, (int32x4_t)b_low)); ++- ++- // coefficients chosen, do the weighted sum ++- min_vec = vmulq_f32(min_vec, b_vec); ++- max_vec = vmulq_f32(max_vec, a_vec); ++- ++- magnitude_vec = vaddq_f32(min_vec, max_vec); ++- vst1q_f32(magnitudeVectorPtr, magnitude_vec); ++- ++- complexVectorPtr += 8; ++- magnitudeVectorPtr += 4; ++- } ++- ++- for(number = quarter_points*4; number < num_points; number++){ ++- const float real = *complexVectorPtr++; ++- const float imag = *complexVectorPtr++; ++- *magnitudeVectorPtr++ = sqrtf((real*real) + (imag*imag)); ++- } +++ unsigned int number; +++ unsigned int quarter_points = num_points / 4; +++ const float* complexVectorPtr = (float*)complexVector; +++ float* magnitudeVectorPtr = magnitudeVector; +++ +++ const float threshold = 0.4142135; +++ +++ float32x4_t a_vec, b_vec, a_high, a_low, b_high, b_low; +++ a_high = vdupq_n_f32(0.84); +++ b_high = vdupq_n_f32(0.561); +++ a_low = vdupq_n_f32(0.99); +++ b_low = vdupq_n_f32(0.197); +++ +++ uint32x4_t comp0, comp1; +++ +++ float32x4x2_t complex_vec; +++ float32x4_t min_vec, max_vec, magnitude_vec; +++ float32x4_t real_abs, imag_abs; +++ for (number = 0; number < quarter_points; number++) { +++ complex_vec = vld2q_f32(complexVectorPtr); +++ +++ real_abs = vabsq_f32(complex_vec.val[0]); +++ imag_abs = vabsq_f32(complex_vec.val[1]); +++ +++ min_vec = vminq_f32(real_abs, imag_abs); +++ max_vec = vmaxq_f32(real_abs, imag_abs); +++ +++ // effective branch to choose coefficient pair. 
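/*
 * Editorial worked example, not part of the patch, for the Lyons "alpha max plus
 * beta min" approximation implemented by this kernel.  Using the constants from the
 * code above (threshold 0.4142135, which is sqrt(2) - 1, i.e. tan(22.5 degrees);
 * "high" pair a = 0.84, b = 0.561; "low" pair a = 0.99, b = 0.197), a sample
 * z = 3 + 4j gives:
 *
 *     max = 4, min = 3;  min > 0.4142135 * max (about 1.657), so the high pair is chosen
 *     |z| is approximated by 0.84 * 4 + 0.561 * 3 = 3.36 + 1.683 = 5.043 (exact value 5.0)
 *
 * an error of roughly 0.9 %, consistent with the "max error is about 1%" quoted in
 * the kernel documentation.  The comp0/comp1 masks and the and/add combination below
 * apply the same branch-free selection idea used in the SSE kernels.
 */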
+++ comp0 = vcgtq_f32(min_vec, vmulq_n_f32(max_vec, threshold)); +++ comp1 = vcleq_f32(min_vec, vmulq_n_f32(max_vec, threshold)); +++ +++ // and 0s or 1s with coefficients from previous effective branch +++ a_vec = (float32x4_t)vaddq_s32(vandq_s32((int32x4_t)comp0, (int32x4_t)a_high), +++ vandq_s32((int32x4_t)comp1, (int32x4_t)a_low)); +++ b_vec = (float32x4_t)vaddq_s32(vandq_s32((int32x4_t)comp0, (int32x4_t)b_high), +++ vandq_s32((int32x4_t)comp1, (int32x4_t)b_low)); +++ +++ // coefficients chosen, do the weighted sum +++ min_vec = vmulq_f32(min_vec, b_vec); +++ max_vec = vmulq_f32(max_vec, a_vec); +++ +++ magnitude_vec = vaddq_f32(min_vec, max_vec); +++ vst1q_f32(magnitudeVectorPtr, magnitude_vec); +++ +++ complexVectorPtr += 8; +++ magnitudeVectorPtr += 4; +++ } +++ +++ for (number = quarter_points * 4; number < num_points; number++) { +++ const float real = *complexVectorPtr++; +++ const float imag = *complexVectorPtr++; +++ *magnitudeVectorPtr++ = sqrtf((real * real) + (imag * imag)); +++ } ++ } ++ #endif /* LV_HAVE_NEON */ ++ ++ ++ #ifdef LV_HAVE_ORC ++ ++-extern void ++-volk_32fc_magnitude_32f_a_orc_impl(float* magnitudeVector, const lv_32fc_t* complexVector, ++- unsigned int num_points); +++extern void volk_32fc_magnitude_32f_a_orc_impl(float* magnitudeVector, +++ const lv_32fc_t* complexVector, +++ unsigned int num_points); ++ ++-static inline void ++-volk_32fc_magnitude_32f_u_orc(float* magnitudeVector, const lv_32fc_t* complexVector, ++- unsigned int num_points) +++static inline void volk_32fc_magnitude_32f_u_orc(float* magnitudeVector, +++ const lv_32fc_t* complexVector, +++ unsigned int num_points) ++ { ++- volk_32fc_magnitude_32f_a_orc_impl(magnitudeVector, complexVector, num_points); +++ volk_32fc_magnitude_32f_a_orc_impl(magnitudeVector, complexVector, num_points); ++ } ++ #endif /* LV_HAVE_ORC */ ++ ++diff --git a/kernels/volk/volk_32fc_magnitude_squared_32f.h b/kernels/volk/volk_32fc_magnitude_squared_32f.h ++index 51bb4df..cb093ca 100644 ++--- a/kernels/volk/volk_32fc_magnitude_squared_32f.h +++++ b/kernels/volk/volk_32fc_magnitude_squared_32f.h ++@@ -30,8 +30,8 @@ ++ * ++ * Dispatcher Prototype ++ * \code ++- * void volk_32fc_magnitude_squared_32f(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points) ++- * \endcode +++ * void volk_32fc_magnitude_squared_32f(float* magnitudeVector, const lv_32fc_t* +++ * complexVector, unsigned int num_points) \endcode ++ * ++ * \b Inputs ++ * \li complexVector: The complex input vector. 
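/*
 * Editorial usage sketch, not part of the patch, for the dispatcher whose prototype
 * is documented above.  The buffer name and the wrapper function are illustrative;
 * volk_malloc, volk_get_alignment and volk_free are the standard VOLK allocation
 * helpers, and an aligned buffer lets the dispatcher pick an aligned (_a_)
 * implementation when one is available for the running CPU.
 */
#include <volk/volk.h>

static void magnitude_squared_example(const lv_32fc_t* samples, unsigned int num_points)
{
    /* allocate the output on a VOLK-aligned boundary */
    float* mag2 = (float*)volk_malloc(num_points * sizeof(float), volk_get_alignment());

    /* mag2[i] = re(samples[i])^2 + im(samples[i])^2 */
    volk_32fc_magnitude_squared_32f(mag2, samples, num_points);

    /* ... consume mag2 ... */
    volk_free(mag2);
}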
++@@ -72,41 +72,41 @@ ++ #define INCLUDED_volk_32fc_magnitude_squared_32f_u_H ++ ++ #include ++-#include ++ #include +++#include ++ ++ #ifdef LV_HAVE_AVX ++ #include ++ #include ++ ++-static inline void ++-volk_32fc_magnitude_squared_32f_u_avx(float* magnitudeVector, const lv_32fc_t* complexVector, ++- unsigned int num_points) +++static inline void volk_32fc_magnitude_squared_32f_u_avx(float* magnitudeVector, +++ const lv_32fc_t* complexVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int eighthPoints = num_points / 8; ++- ++- const float* complexVectorPtr = (float*) complexVector; ++- float* magnitudeVectorPtr = magnitudeVector; ++- ++- __m256 cplxValue1, cplxValue2, result; ++- ++- for(; number < eighthPoints; number++){ ++- cplxValue1 = _mm256_loadu_ps(complexVectorPtr); ++- cplxValue2 = _mm256_loadu_ps(complexVectorPtr + 8); ++- result = _mm256_magnitudesquared_ps(cplxValue1, cplxValue2); ++- _mm256_storeu_ps(magnitudeVectorPtr, result); ++- ++- complexVectorPtr += 16; ++- magnitudeVectorPtr += 8; ++- } ++- ++- number = eighthPoints * 8; ++- for(; number < num_points; number++){ ++- float val1Real = *complexVectorPtr++; ++- float val1Imag = *complexVectorPtr++; ++- *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag); ++- } +++ unsigned int number = 0; +++ const unsigned int eighthPoints = num_points / 8; +++ +++ const float* complexVectorPtr = (float*)complexVector; +++ float* magnitudeVectorPtr = magnitudeVector; +++ +++ __m256 cplxValue1, cplxValue2, result; +++ +++ for (; number < eighthPoints; number++) { +++ cplxValue1 = _mm256_loadu_ps(complexVectorPtr); +++ cplxValue2 = _mm256_loadu_ps(complexVectorPtr + 8); +++ result = _mm256_magnitudesquared_ps(cplxValue1, cplxValue2); +++ _mm256_storeu_ps(magnitudeVectorPtr, result); +++ +++ complexVectorPtr += 16; +++ magnitudeVectorPtr += 8; +++ } +++ +++ number = eighthPoints * 8; +++ for (; number < num_points; number++) { +++ float val1Real = *complexVectorPtr++; +++ float val1Imag = *complexVectorPtr++; +++ *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag); +++ } ++ } ++ #endif /* LV_HAVE_AVX */ ++ ++@@ -115,137 +115,136 @@ volk_32fc_magnitude_squared_32f_u_avx(float* magnitudeVector, const lv_32fc_t* c ++ #include ++ #include ++ ++-static inline void ++-volk_32fc_magnitude_squared_32f_u_sse3(float* magnitudeVector, const lv_32fc_t* complexVector, ++- unsigned int num_points) +++static inline void volk_32fc_magnitude_squared_32f_u_sse3(float* magnitudeVector, +++ const lv_32fc_t* complexVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int quarterPoints = num_points / 4; ++- ++- const float* complexVectorPtr = (float*) complexVector; ++- float* magnitudeVectorPtr = magnitudeVector; ++- ++- __m128 cplxValue1, cplxValue2, result; ++- for(; number < quarterPoints; number++){ ++- cplxValue1 = _mm_loadu_ps(complexVectorPtr); ++- complexVectorPtr += 4; ++- ++- cplxValue2 = _mm_loadu_ps(complexVectorPtr); ++- complexVectorPtr += 4; ++- ++- result = _mm_magnitudesquared_ps_sse3(cplxValue1, cplxValue2); ++- _mm_storeu_ps(magnitudeVectorPtr, result); ++- magnitudeVectorPtr += 4; ++- } ++- ++- number = quarterPoints * 4; ++- for(; number < num_points; number++){ ++- float val1Real = *complexVectorPtr++; ++- float val1Imag = *complexVectorPtr++; ++- *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag); ++- } +++ unsigned int number = 0; +++ const unsigned int quarterPoints = num_points / 4; +++ +++ const float* 
complexVectorPtr = (float*)complexVector; +++ float* magnitudeVectorPtr = magnitudeVector; +++ +++ __m128 cplxValue1, cplxValue2, result; +++ for (; number < quarterPoints; number++) { +++ cplxValue1 = _mm_loadu_ps(complexVectorPtr); +++ complexVectorPtr += 4; +++ +++ cplxValue2 = _mm_loadu_ps(complexVectorPtr); +++ complexVectorPtr += 4; +++ +++ result = _mm_magnitudesquared_ps_sse3(cplxValue1, cplxValue2); +++ _mm_storeu_ps(magnitudeVectorPtr, result); +++ magnitudeVectorPtr += 4; +++ } +++ +++ number = quarterPoints * 4; +++ for (; number < num_points; number++) { +++ float val1Real = *complexVectorPtr++; +++ float val1Imag = *complexVectorPtr++; +++ *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag); +++ } ++ } ++ #endif /* LV_HAVE_SSE3 */ ++ ++ ++ #ifdef LV_HAVE_SSE ++-#include ++ #include +++#include ++ ++-static inline void ++-volk_32fc_magnitude_squared_32f_u_sse(float* magnitudeVector, const lv_32fc_t* complexVector, ++- unsigned int num_points) +++static inline void volk_32fc_magnitude_squared_32f_u_sse(float* magnitudeVector, +++ const lv_32fc_t* complexVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int quarterPoints = num_points / 4; +++ unsigned int number = 0; +++ const unsigned int quarterPoints = num_points / 4; ++ ++- const float* complexVectorPtr = (float*) complexVector; ++- float* magnitudeVectorPtr = magnitudeVector; +++ const float* complexVectorPtr = (float*)complexVector; +++ float* magnitudeVectorPtr = magnitudeVector; ++ ++- __m128 cplxValue1, cplxValue2, result; +++ __m128 cplxValue1, cplxValue2, result; ++ ++- for(; number < quarterPoints; number++){ ++- cplxValue1 = _mm_loadu_ps(complexVectorPtr); ++- complexVectorPtr += 4; +++ for (; number < quarterPoints; number++) { +++ cplxValue1 = _mm_loadu_ps(complexVectorPtr); +++ complexVectorPtr += 4; ++ ++- cplxValue2 = _mm_loadu_ps(complexVectorPtr); ++- complexVectorPtr += 4; +++ cplxValue2 = _mm_loadu_ps(complexVectorPtr); +++ complexVectorPtr += 4; ++ ++- result = _mm_magnitudesquared_ps(cplxValue1, cplxValue2); ++- _mm_storeu_ps(magnitudeVectorPtr, result); ++- magnitudeVectorPtr += 4; ++- } +++ result = _mm_magnitudesquared_ps(cplxValue1, cplxValue2); +++ _mm_storeu_ps(magnitudeVectorPtr, result); +++ magnitudeVectorPtr += 4; +++ } ++ ++- number = quarterPoints * 4; ++- for(; number < num_points; number++){ ++- float val1Real = *complexVectorPtr++; ++- float val1Imag = *complexVectorPtr++; ++- *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag); ++- } +++ number = quarterPoints * 4; +++ for (; number < num_points; number++) { +++ float val1Real = *complexVectorPtr++; +++ float val1Imag = *complexVectorPtr++; +++ *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag); +++ } ++ } ++ #endif /* LV_HAVE_SSE */ ++ ++ ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void ++-volk_32fc_magnitude_squared_32f_generic(float* magnitudeVector, const lv_32fc_t* complexVector, ++- unsigned int num_points) +++static inline void volk_32fc_magnitude_squared_32f_generic(float* magnitudeVector, +++ const lv_32fc_t* complexVector, +++ unsigned int num_points) ++ { ++- const float* complexVectorPtr = (float*)complexVector; ++- float* magnitudeVectorPtr = magnitudeVector; ++- unsigned int number = 0; ++- for(number = 0; number < num_points; number++){ ++- const float real = *complexVectorPtr++; ++- const float imag = *complexVectorPtr++; ++- *magnitudeVectorPtr++ = (real*real) + (imag*imag); ++- } +++ const float* complexVectorPtr = 
(float*)complexVector; +++ float* magnitudeVectorPtr = magnitudeVector; +++ unsigned int number = 0; +++ for (number = 0; number < num_points; number++) { +++ const float real = *complexVectorPtr++; +++ const float imag = *complexVectorPtr++; +++ *magnitudeVectorPtr++ = (real * real) + (imag * imag); +++ } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++ ++- ++ #endif /* INCLUDED_volk_32fc_magnitude_32f_u_H */ ++ #ifndef INCLUDED_volk_32fc_magnitude_squared_32f_a_H ++ #define INCLUDED_volk_32fc_magnitude_squared_32f_a_H ++ ++ #include ++-#include ++ #include +++#include ++ ++ #ifdef LV_HAVE_AVX ++ #include ++ #include ++ ++-static inline void ++-volk_32fc_magnitude_squared_32f_a_avx(float* magnitudeVector, const lv_32fc_t* complexVector, ++- unsigned int num_points) +++static inline void volk_32fc_magnitude_squared_32f_a_avx(float* magnitudeVector, +++ const lv_32fc_t* complexVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int eighthPoints = num_points / 8; ++- ++- const float* complexVectorPtr = (float*) complexVector; ++- float* magnitudeVectorPtr = magnitudeVector; ++- ++- __m256 cplxValue1, cplxValue2, result; ++- for(; number < eighthPoints; number++){ ++- cplxValue1 = _mm256_load_ps(complexVectorPtr); ++- complexVectorPtr += 8; ++- ++- cplxValue2 = _mm256_load_ps(complexVectorPtr); ++- complexVectorPtr += 8; ++- ++- result = _mm256_magnitudesquared_ps(cplxValue1, cplxValue2); ++- _mm256_store_ps(magnitudeVectorPtr, result); ++- magnitudeVectorPtr += 8; ++- } ++- ++- number = eighthPoints * 8; ++- for(; number < num_points; number++){ ++- float val1Real = *complexVectorPtr++; ++- float val1Imag = *complexVectorPtr++; ++- *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag); ++- } +++ unsigned int number = 0; +++ const unsigned int eighthPoints = num_points / 8; +++ +++ const float* complexVectorPtr = (float*)complexVector; +++ float* magnitudeVectorPtr = magnitudeVector; +++ +++ __m256 cplxValue1, cplxValue2, result; +++ for (; number < eighthPoints; number++) { +++ cplxValue1 = _mm256_load_ps(complexVectorPtr); +++ complexVectorPtr += 8; +++ +++ cplxValue2 = _mm256_load_ps(complexVectorPtr); +++ complexVectorPtr += 8; +++ +++ result = _mm256_magnitudesquared_ps(cplxValue1, cplxValue2); +++ _mm256_store_ps(magnitudeVectorPtr, result); +++ magnitudeVectorPtr += 8; +++ } +++ +++ number = eighthPoints * 8; +++ for (; number < num_points; number++) { +++ float val1Real = *complexVectorPtr++; +++ float val1Imag = *complexVectorPtr++; +++ *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag); +++ } ++ } ++ #endif /* LV_HAVE_AVX */ ++ ++@@ -254,72 +253,72 @@ volk_32fc_magnitude_squared_32f_a_avx(float* magnitudeVector, const lv_32fc_t* c ++ #include ++ #include ++ ++-static inline void ++-volk_32fc_magnitude_squared_32f_a_sse3(float* magnitudeVector, const lv_32fc_t* complexVector, ++- unsigned int num_points) +++static inline void volk_32fc_magnitude_squared_32f_a_sse3(float* magnitudeVector, +++ const lv_32fc_t* complexVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int quarterPoints = num_points / 4; ++- ++- const float* complexVectorPtr = (float*) complexVector; ++- float* magnitudeVectorPtr = magnitudeVector; ++- ++- __m128 cplxValue1, cplxValue2, result; ++- for(; number < quarterPoints; number++){ ++- cplxValue1 = _mm_load_ps(complexVectorPtr); ++- complexVectorPtr += 4; ++- ++- cplxValue2 = _mm_load_ps(complexVectorPtr); ++- complexVectorPtr += 4; ++- ++- result = 
_mm_magnitudesquared_ps_sse3(cplxValue1, cplxValue2); ++- _mm_store_ps(magnitudeVectorPtr, result); ++- magnitudeVectorPtr += 4; ++- } ++- ++- number = quarterPoints * 4; ++- for(; number < num_points; number++){ ++- float val1Real = *complexVectorPtr++; ++- float val1Imag = *complexVectorPtr++; ++- *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag); ++- } +++ unsigned int number = 0; +++ const unsigned int quarterPoints = num_points / 4; +++ +++ const float* complexVectorPtr = (float*)complexVector; +++ float* magnitudeVectorPtr = magnitudeVector; +++ +++ __m128 cplxValue1, cplxValue2, result; +++ for (; number < quarterPoints; number++) { +++ cplxValue1 = _mm_load_ps(complexVectorPtr); +++ complexVectorPtr += 4; +++ +++ cplxValue2 = _mm_load_ps(complexVectorPtr); +++ complexVectorPtr += 4; +++ +++ result = _mm_magnitudesquared_ps_sse3(cplxValue1, cplxValue2); +++ _mm_store_ps(magnitudeVectorPtr, result); +++ magnitudeVectorPtr += 4; +++ } +++ +++ number = quarterPoints * 4; +++ for (; number < num_points; number++) { +++ float val1Real = *complexVectorPtr++; +++ float val1Imag = *complexVectorPtr++; +++ *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag); +++ } ++ } ++ #endif /* LV_HAVE_SSE3 */ ++ ++ ++ #ifdef LV_HAVE_SSE ++-#include ++ #include +++#include ++ ++-static inline void ++-volk_32fc_magnitude_squared_32f_a_sse(float* magnitudeVector, const lv_32fc_t* complexVector, ++- unsigned int num_points) +++static inline void volk_32fc_magnitude_squared_32f_a_sse(float* magnitudeVector, +++ const lv_32fc_t* complexVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int quarterPoints = num_points / 4; ++- ++- const float* complexVectorPtr = (float*)complexVector; ++- float* magnitudeVectorPtr = magnitudeVector; ++- ++- __m128 cplxValue1, cplxValue2, result; ++- for(;number < quarterPoints; number++){ ++- cplxValue1 = _mm_load_ps(complexVectorPtr); ++- complexVectorPtr += 4; ++- ++- cplxValue2 = _mm_load_ps(complexVectorPtr); ++- complexVectorPtr += 4; ++- ++- result = _mm_magnitudesquared_ps(cplxValue1, cplxValue2); ++- _mm_store_ps(magnitudeVectorPtr, result); ++- magnitudeVectorPtr += 4; ++- } ++- ++- number = quarterPoints * 4; ++- for(; number < num_points; number++){ ++- float val1Real = *complexVectorPtr++; ++- float val1Imag = *complexVectorPtr++; ++- *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag); ++- } +++ unsigned int number = 0; +++ const unsigned int quarterPoints = num_points / 4; +++ +++ const float* complexVectorPtr = (float*)complexVector; +++ float* magnitudeVectorPtr = magnitudeVector; +++ +++ __m128 cplxValue1, cplxValue2, result; +++ for (; number < quarterPoints; number++) { +++ cplxValue1 = _mm_load_ps(complexVectorPtr); +++ complexVectorPtr += 4; +++ +++ cplxValue2 = _mm_load_ps(complexVectorPtr); +++ complexVectorPtr += 4; +++ +++ result = _mm_magnitudesquared_ps(cplxValue1, cplxValue2); +++ _mm_store_ps(magnitudeVectorPtr, result); +++ magnitudeVectorPtr += 4; +++ } +++ +++ number = quarterPoints * 4; +++ for (; number < num_points; number++) { +++ float val1Real = *complexVectorPtr++; +++ float val1Imag = *complexVectorPtr++; +++ *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag); +++ } ++ } ++ #endif /* LV_HAVE_SSE */ ++ ++@@ -327,55 +326,57 @@ volk_32fc_magnitude_squared_32f_a_sse(float* magnitudeVector, const lv_32fc_t* c ++ #ifdef LV_HAVE_NEON ++ #include ++ ++-static inline void ++-volk_32fc_magnitude_squared_32f_neon(float* magnitudeVector, 
const lv_32fc_t* complexVector, ++- unsigned int num_points) +++static inline void volk_32fc_magnitude_squared_32f_neon(float* magnitudeVector, +++ const lv_32fc_t* complexVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int quarterPoints = num_points / 4; ++- ++- const float* complexVectorPtr = (float*)complexVector; ++- float* magnitudeVectorPtr = magnitudeVector; ++- ++- float32x4x2_t cmplx_val; ++- float32x4_t result; ++- for(;number < quarterPoints; number++){ ++- cmplx_val = vld2q_f32(complexVectorPtr); ++- complexVectorPtr += 8; ++- ++- cmplx_val.val[0] = vmulq_f32(cmplx_val.val[0], cmplx_val.val[0]); // Square the values ++- cmplx_val.val[1] = vmulq_f32(cmplx_val.val[1], cmplx_val.val[1]); // Square the values ++- ++- result = vaddq_f32(cmplx_val.val[0], cmplx_val.val[1]); // Add the I2 and Q2 values ++- ++- vst1q_f32(magnitudeVectorPtr, result); ++- magnitudeVectorPtr += 4; ++- } ++- ++- number = quarterPoints * 4; ++- for(; number < num_points; number++){ ++- float val1Real = *complexVectorPtr++; ++- float val1Imag = *complexVectorPtr++; ++- *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag); ++- } +++ unsigned int number = 0; +++ const unsigned int quarterPoints = num_points / 4; +++ +++ const float* complexVectorPtr = (float*)complexVector; +++ float* magnitudeVectorPtr = magnitudeVector; +++ +++ float32x4x2_t cmplx_val; +++ float32x4_t result; +++ for (; number < quarterPoints; number++) { +++ cmplx_val = vld2q_f32(complexVectorPtr); +++ complexVectorPtr += 8; +++ +++ cmplx_val.val[0] = +++ vmulq_f32(cmplx_val.val[0], cmplx_val.val[0]); // Square the values +++ cmplx_val.val[1] = +++ vmulq_f32(cmplx_val.val[1], cmplx_val.val[1]); // Square the values +++ +++ result = +++ vaddq_f32(cmplx_val.val[0], cmplx_val.val[1]); // Add the I2 and Q2 values +++ +++ vst1q_f32(magnitudeVectorPtr, result); +++ magnitudeVectorPtr += 4; +++ } +++ +++ number = quarterPoints * 4; +++ for (; number < num_points; number++) { +++ float val1Real = *complexVectorPtr++; +++ float val1Imag = *complexVectorPtr++; +++ *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag); +++ } ++ } ++ #endif /* LV_HAVE_NEON */ ++ ++ ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void ++-volk_32fc_magnitude_squared_32f_a_generic(float* magnitudeVector, const lv_32fc_t* complexVector, ++- unsigned int num_points) +++static inline void volk_32fc_magnitude_squared_32f_a_generic( +++ float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points) ++ { ++- const float* complexVectorPtr = (float*)complexVector; ++- float* magnitudeVectorPtr = magnitudeVector; ++- unsigned int number = 0; ++- for(number = 0; number < num_points; number++){ ++- const float real = *complexVectorPtr++; ++- const float imag = *complexVectorPtr++; ++- *magnitudeVectorPtr++ = (real*real) + (imag*imag); ++- } +++ const float* complexVectorPtr = (float*)complexVector; +++ float* magnitudeVectorPtr = magnitudeVector; +++ unsigned int number = 0; +++ for (number = 0; number < num_points; number++) { +++ const float real = *complexVectorPtr++; +++ const float imag = *complexVectorPtr++; +++ *magnitudeVectorPtr++ = (real * real) + (imag * imag); +++ } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++diff --git a/kernels/volk/volk_32fc_s32f_atan2_32f.h b/kernels/volk/volk_32fc_s32f_atan2_32f.h ++index c169336..f08f793 100644 ++--- a/kernels/volk/volk_32fc_s32f_atan2_32f.h +++++ b/kernels/volk/volk_32fc_s32f_atan2_32f.h ++@@ -30,13 +30,13 @@ ++ * ++ * Dispatcher Prototype ++ 
* \code ++- * void volk_32fc_s32f_atan2_32f(float* outputVector, const lv_32fc_t* complexVector, const float normalizeFactor, unsigned int num_points) ++- * \endcode +++ * void volk_32fc_s32f_atan2_32f(float* outputVector, const lv_32fc_t* complexVector, +++ * const float normalizeFactor, unsigned int num_points) \endcode ++ * ++ * \b Inputs ++- * \li inputVector: The byte-aligned input vector containing interleaved IQ data (I = cos, Q = sin). ++- * \li normalizeFactor: The atan results are divided by this normalization factor. ++- * \li num_points: The number of complex values in \p inputVector. +++ * \li inputVector: The byte-aligned input vector containing interleaved IQ data (I = cos, +++ * Q = sin). \li normalizeFactor: The atan results are divided by this normalization +++ * factor. \li num_points: The number of complex values in \p inputVector. ++ * ++ * \b Outputs ++ * \li outputVector: The vector where the results will be stored. ++@@ -75,8 +75,8 @@ ++ #define INCLUDED_volk_32fc_s32f_atan2_32f_a_H ++ ++ #include ++-#include ++ #include +++#include ++ ++ #ifdef LV_HAVE_SSE4_1 ++ #include ++@@ -85,50 +85,54 @@ ++ #include ++ #endif /* LV_HAVE_LIB_SIMDMATH */ ++ ++-static inline void volk_32fc_s32f_atan2_32f_a_sse4_1(float* outputVector, const lv_32fc_t* complexVector, const float normalizeFactor, unsigned int num_points){ ++- const float* complexVectorPtr = (float*)complexVector; ++- float* outPtr = outputVector; +++static inline void volk_32fc_s32f_atan2_32f_a_sse4_1(float* outputVector, +++ const lv_32fc_t* complexVector, +++ const float normalizeFactor, +++ unsigned int num_points) +++{ +++ const float* complexVectorPtr = (float*)complexVector; +++ float* outPtr = outputVector; ++ ++- unsigned int number = 0; ++- const float invNormalizeFactor = 1.0 / normalizeFactor; +++ unsigned int number = 0; +++ const float invNormalizeFactor = 1.0 / normalizeFactor; ++ ++ #ifdef LV_HAVE_LIB_SIMDMATH ++- const unsigned int quarterPoints = num_points / 4; ++- __m128 testVector = _mm_set_ps1(2*M_PI); ++- __m128 correctVector = _mm_set_ps1(M_PI); ++- __m128 vNormalizeFactor = _mm_set_ps1(invNormalizeFactor); ++- __m128 phase; ++- __m128 complex1, complex2, iValue, qValue; ++- __m128 keepMask; ++- ++- for (; number < quarterPoints; number++) { ++- // Load IQ data: ++- complex1 = _mm_load_ps(complexVectorPtr); ++- complexVectorPtr += 4; ++- complex2 = _mm_load_ps(complexVectorPtr); ++- complexVectorPtr += 4; ++- // Deinterleave IQ data: ++- iValue = _mm_shuffle_ps(complex1, complex2, _MM_SHUFFLE(2,0,2,0)); ++- qValue = _mm_shuffle_ps(complex1, complex2, _MM_SHUFFLE(3,1,3,1)); ++- // Arctan to get phase: ++- phase = atan2f4(qValue, iValue); ++- // When Q = 0 and I < 0, atan2f4 sucks and returns 2pi vice pi. ++- // Compare to 2pi: ++- keepMask = _mm_cmpneq_ps(phase,testVector); ++- phase = _mm_blendv_ps(correctVector, phase, keepMask); ++- // done with above correction. 
++- phase = _mm_mul_ps(phase, vNormalizeFactor); ++- _mm_store_ps((float*)outPtr, phase); ++- outPtr += 4; ++- } ++- number = quarterPoints * 4; +++ const unsigned int quarterPoints = num_points / 4; +++ __m128 testVector = _mm_set_ps1(2 * M_PI); +++ __m128 correctVector = _mm_set_ps1(M_PI); +++ __m128 vNormalizeFactor = _mm_set_ps1(invNormalizeFactor); +++ __m128 phase; +++ __m128 complex1, complex2, iValue, qValue; +++ __m128 keepMask; +++ +++ for (; number < quarterPoints; number++) { +++ // Load IQ data: +++ complex1 = _mm_load_ps(complexVectorPtr); +++ complexVectorPtr += 4; +++ complex2 = _mm_load_ps(complexVectorPtr); +++ complexVectorPtr += 4; +++ // Deinterleave IQ data: +++ iValue = _mm_shuffle_ps(complex1, complex2, _MM_SHUFFLE(2, 0, 2, 0)); +++ qValue = _mm_shuffle_ps(complex1, complex2, _MM_SHUFFLE(3, 1, 3, 1)); +++ // Arctan to get phase: +++ phase = atan2f4(qValue, iValue); +++ // When Q = 0 and I < 0, atan2f4 sucks and returns 2pi vice pi. +++ // Compare to 2pi: +++ keepMask = _mm_cmpneq_ps(phase, testVector); +++ phase = _mm_blendv_ps(correctVector, phase, keepMask); +++ // done with above correction. +++ phase = _mm_mul_ps(phase, vNormalizeFactor); +++ _mm_store_ps((float*)outPtr, phase); +++ outPtr += 4; +++ } +++ number = quarterPoints * 4; ++ #endif /* LV_HAVE_SIMDMATH_H */ ++ ++- for (; number < num_points; number++) { ++- const float real = *complexVectorPtr++; ++- const float imag = *complexVectorPtr++; ++- *outPtr++ = atan2f(imag, real) * invNormalizeFactor; ++- } +++ for (; number < num_points; number++) { +++ const float real = *complexVectorPtr++; +++ const float imag = *complexVectorPtr++; +++ *outPtr++ = atan2f(imag, real) * invNormalizeFactor; +++ } ++ } ++ #endif /* LV_HAVE_SSE4_1 */ ++ ++@@ -140,72 +144,78 @@ static inline void volk_32fc_s32f_atan2_32f_a_sse4_1(float* outputVector, const ++ #include ++ #endif /* LV_HAVE_LIB_SIMDMATH */ ++ ++-static inline void volk_32fc_s32f_atan2_32f_a_sse(float* outputVector, const lv_32fc_t* complexVector, const float normalizeFactor, unsigned int num_points){ ++- const float* complexVectorPtr = (float*)complexVector; ++- float* outPtr = outputVector; +++static inline void volk_32fc_s32f_atan2_32f_a_sse(float* outputVector, +++ const lv_32fc_t* complexVector, +++ const float normalizeFactor, +++ unsigned int num_points) +++{ +++ const float* complexVectorPtr = (float*)complexVector; +++ float* outPtr = outputVector; ++ ++- unsigned int number = 0; ++- const float invNormalizeFactor = 1.0 / normalizeFactor; +++ unsigned int number = 0; +++ const float invNormalizeFactor = 1.0 / normalizeFactor; ++ ++ #ifdef LV_HAVE_LIB_SIMDMATH ++- const unsigned int quarterPoints = num_points / 4; ++- __m128 testVector = _mm_set_ps1(2*M_PI); ++- __m128 correctVector = _mm_set_ps1(M_PI); ++- __m128 vNormalizeFactor = _mm_set_ps1(invNormalizeFactor); ++- __m128 phase; ++- __m128 complex1, complex2, iValue, qValue; ++- __m128 mask; ++- __m128 keepMask; ++- ++- for (; number < quarterPoints; number++) { ++- // Load IQ data: ++- complex1 = _mm_load_ps(complexVectorPtr); ++- complexVectorPtr += 4; ++- complex2 = _mm_load_ps(complexVectorPtr); ++- complexVectorPtr += 4; ++- // Deinterleave IQ data: ++- iValue = _mm_shuffle_ps(complex1, complex2, _MM_SHUFFLE(2,0,2,0)); ++- qValue = _mm_shuffle_ps(complex1, complex2, _MM_SHUFFLE(3,1,3,1)); ++- // Arctan to get phase: ++- phase = atan2f4(qValue, iValue); ++- // When Q = 0 and I < 0, atan2f4 sucks and returns 2pi vice pi. 
++- // Compare to 2pi: ++- keepMask = _mm_cmpneq_ps(phase,testVector); ++- phase = _mm_and_ps(phase, keepMask); ++- mask = _mm_andnot_ps(keepMask, correctVector); ++- phase = _mm_or_ps(phase, mask); ++- // done with above correction. ++- phase = _mm_mul_ps(phase, vNormalizeFactor); ++- _mm_store_ps((float*)outPtr, phase); ++- outPtr += 4; ++- } ++- number = quarterPoints * 4; +++ const unsigned int quarterPoints = num_points / 4; +++ __m128 testVector = _mm_set_ps1(2 * M_PI); +++ __m128 correctVector = _mm_set_ps1(M_PI); +++ __m128 vNormalizeFactor = _mm_set_ps1(invNormalizeFactor); +++ __m128 phase; +++ __m128 complex1, complex2, iValue, qValue; +++ __m128 mask; +++ __m128 keepMask; +++ +++ for (; number < quarterPoints; number++) { +++ // Load IQ data: +++ complex1 = _mm_load_ps(complexVectorPtr); +++ complexVectorPtr += 4; +++ complex2 = _mm_load_ps(complexVectorPtr); +++ complexVectorPtr += 4; +++ // Deinterleave IQ data: +++ iValue = _mm_shuffle_ps(complex1, complex2, _MM_SHUFFLE(2, 0, 2, 0)); +++ qValue = _mm_shuffle_ps(complex1, complex2, _MM_SHUFFLE(3, 1, 3, 1)); +++ // Arctan to get phase: +++ phase = atan2f4(qValue, iValue); +++ // When Q = 0 and I < 0, atan2f4 sucks and returns 2pi vice pi. +++ // Compare to 2pi: +++ keepMask = _mm_cmpneq_ps(phase, testVector); +++ phase = _mm_and_ps(phase, keepMask); +++ mask = _mm_andnot_ps(keepMask, correctVector); +++ phase = _mm_or_ps(phase, mask); +++ // done with above correction. +++ phase = _mm_mul_ps(phase, vNormalizeFactor); +++ _mm_store_ps((float*)outPtr, phase); +++ outPtr += 4; +++ } +++ number = quarterPoints * 4; ++ #endif /* LV_HAVE_SIMDMATH_H */ ++ ++- for (; number < num_points; number++) { ++- const float real = *complexVectorPtr++; ++- const float imag = *complexVectorPtr++; ++- *outPtr++ = atan2f(imag, real) * invNormalizeFactor; ++- } +++ for (; number < num_points; number++) { +++ const float real = *complexVectorPtr++; +++ const float imag = *complexVectorPtr++; +++ *outPtr++ = atan2f(imag, real) * invNormalizeFactor; +++ } ++ } ++ #endif /* LV_HAVE_SSE */ ++ ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void volk_32fc_s32f_atan2_32f_generic(float* outputVector, const lv_32fc_t* inputVector, const float normalizeFactor, unsigned int num_points){ ++- float* outPtr = outputVector; ++- const float* inPtr = (float*)inputVector; ++- const float invNormalizeFactor = 1.0 / normalizeFactor; ++- unsigned int number; ++- for ( number = 0; number < num_points; number++) { ++- const float real = *inPtr++; ++- const float imag = *inPtr++; ++- *outPtr++ = atan2f(imag, real) * invNormalizeFactor; ++- } +++static inline void volk_32fc_s32f_atan2_32f_generic(float* outputVector, +++ const lv_32fc_t* inputVector, +++ const float normalizeFactor, +++ unsigned int num_points) +++{ +++ float* outPtr = outputVector; +++ const float* inPtr = (float*)inputVector; +++ const float invNormalizeFactor = 1.0 / normalizeFactor; +++ unsigned int number; +++ for (number = 0; number < num_points; number++) { +++ const float real = *inPtr++; +++ const float imag = *inPtr++; +++ *outPtr++ = atan2f(imag, real) * invNormalizeFactor; +++ } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++ ++- ++- ++ #endif /* INCLUDED_volk_32fc_s32f_atan2_32f_a_H */ ++diff --git a/kernels/volk/volk_32fc_s32f_deinterleave_real_16i.h b/kernels/volk/volk_32fc_s32f_deinterleave_real_16i.h ++index 64c6a8b..f70f494 100644 ++--- a/kernels/volk/volk_32fc_s32f_deinterleave_real_16i.h +++++ b/kernels/volk/volk_32fc_s32f_deinterleave_real_16i.h ++@@ -30,8 +30,8 @@ ++ * ++ * Dispatcher 
Prototype ++ * \code ++- * void volk_32fc_s32f_deinterleave_real_16i(int16_t* iBuffer, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points) ++- * \endcode +++ * void volk_32fc_s32f_deinterleave_real_16i(int16_t* iBuffer, const lv_32fc_t* +++ * complexVector, const float scalar, unsigned int num_points) \endcode ++ * ++ * \b Inputs ++ * \li complexVector: The complex input vector. ++@@ -73,61 +73,62 @@ ++ #ifndef INCLUDED_volk_32fc_s32f_deinterleave_real_16i_a_H ++ #define INCLUDED_volk_32fc_s32f_deinterleave_real_16i_a_H ++ ++-#include ++ #include ++ #include +++#include ++ ++ ++ #ifdef LV_HAVE_AVX2 ++ #include ++ ++ static inline void ++-volk_32fc_s32f_deinterleave_real_16i_a_avx2(int16_t* iBuffer, const lv_32fc_t* complexVector, ++- const float scalar, unsigned int num_points) +++volk_32fc_s32f_deinterleave_real_16i_a_avx2(int16_t* iBuffer, +++ const lv_32fc_t* complexVector, +++ const float scalar, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int eighthPoints = num_points / 8; ++- ++- const float* complexVectorPtr = (float*)complexVector; ++- int16_t* iBufferPtr = iBuffer; +++ unsigned int number = 0; +++ const unsigned int eighthPoints = num_points / 8; ++ ++- __m256 vScalar = _mm256_set1_ps(scalar); +++ const float* complexVectorPtr = (float*)complexVector; +++ int16_t* iBufferPtr = iBuffer; ++ ++- __m256 cplxValue1, cplxValue2, iValue; ++- __m256i a; ++- __m128i b; +++ __m256 vScalar = _mm256_set1_ps(scalar); ++ ++- __m256i idx = _mm256_set_epi32(3,3,3,3,5,1,4,0); +++ __m256 cplxValue1, cplxValue2, iValue; +++ __m256i a; +++ __m128i b; ++ ++- for(;number < eighthPoints; number++){ ++- cplxValue1 = _mm256_load_ps(complexVectorPtr); ++- complexVectorPtr += 8; +++ __m256i idx = _mm256_set_epi32(3, 3, 3, 3, 5, 1, 4, 0); ++ ++- cplxValue2 = _mm256_load_ps(complexVectorPtr); ++- complexVectorPtr += 8; +++ for (; number < eighthPoints; number++) { +++ cplxValue1 = _mm256_load_ps(complexVectorPtr); +++ complexVectorPtr += 8; ++ ++- // Arrange in i1i2i3i4 format ++- iValue = _mm256_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); +++ cplxValue2 = _mm256_load_ps(complexVectorPtr); +++ complexVectorPtr += 8; ++ ++- iValue = _mm256_mul_ps(iValue, vScalar); +++ // Arrange in i1i2i3i4 format +++ iValue = _mm256_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2, 0, 2, 0)); ++ ++- iValue = _mm256_round_ps(iValue, _MM_FROUND_TO_ZERO); ++- a = _mm256_cvtps_epi32(iValue); ++- a = _mm256_packs_epi32(a,a); ++- a = _mm256_permutevar8x32_epi32(a,idx); ++- b = _mm256_extracti128_si256(a,0); +++ iValue = _mm256_mul_ps(iValue, vScalar); ++ ++- _mm_store_si128((__m128i*)iBufferPtr,b); ++- iBufferPtr += 8; +++ iValue = _mm256_round_ps(iValue, _MM_FROUND_TO_ZERO); +++ a = _mm256_cvtps_epi32(iValue); +++ a = _mm256_packs_epi32(a, a); +++ a = _mm256_permutevar8x32_epi32(a, idx); +++ b = _mm256_extracti128_si256(a, 0); ++ ++- } +++ _mm_store_si128((__m128i*)iBufferPtr, b); +++ iBufferPtr += 8; +++ } ++ ++- number = eighthPoints * 8; ++- iBufferPtr = &iBuffer[number]; ++- for(; number < num_points; number++){ ++- *iBufferPtr++ = (int16_t)(*complexVectorPtr++ * scalar); ++- complexVectorPtr++; ++- } +++ number = eighthPoints * 8; +++ iBufferPtr = &iBuffer[number]; +++ for (; number < num_points; number++) { +++ *iBufferPtr++ = (int16_t)(*complexVectorPtr++ * scalar); +++ complexVectorPtr++; +++ } ++ } ++ ++ ++@@ -137,46 +138,48 @@ volk_32fc_s32f_deinterleave_real_16i_a_avx2(int16_t* iBuffer, const lv_32fc_t* c ++ #include ++ ++ static inline void 
++-volk_32fc_s32f_deinterleave_real_16i_a_sse(int16_t* iBuffer, const lv_32fc_t* complexVector, ++- const float scalar, unsigned int num_points) +++volk_32fc_s32f_deinterleave_real_16i_a_sse(int16_t* iBuffer, +++ const lv_32fc_t* complexVector, +++ const float scalar, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int quarterPoints = num_points / 4; +++ unsigned int number = 0; +++ const unsigned int quarterPoints = num_points / 4; ++ ++- const float* complexVectorPtr = (float*)complexVector; ++- int16_t* iBufferPtr = iBuffer; +++ const float* complexVectorPtr = (float*)complexVector; +++ int16_t* iBufferPtr = iBuffer; ++ ++- __m128 vScalar = _mm_set_ps1(scalar); +++ __m128 vScalar = _mm_set_ps1(scalar); ++ ++- __m128 cplxValue1, cplxValue2, iValue; +++ __m128 cplxValue1, cplxValue2, iValue; ++ ++- __VOLK_ATTR_ALIGNED(16) float floatBuffer[4]; +++ __VOLK_ATTR_ALIGNED(16) float floatBuffer[4]; ++ ++- for(;number < quarterPoints; number++){ ++- cplxValue1 = _mm_load_ps(complexVectorPtr); ++- complexVectorPtr += 4; +++ for (; number < quarterPoints; number++) { +++ cplxValue1 = _mm_load_ps(complexVectorPtr); +++ complexVectorPtr += 4; ++ ++- cplxValue2 = _mm_load_ps(complexVectorPtr); ++- complexVectorPtr += 4; +++ cplxValue2 = _mm_load_ps(complexVectorPtr); +++ complexVectorPtr += 4; ++ ++- // Arrange in i1i2i3i4 format ++- iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); +++ // Arrange in i1i2i3i4 format +++ iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2, 0, 2, 0)); ++ ++- iValue = _mm_mul_ps(iValue, vScalar); +++ iValue = _mm_mul_ps(iValue, vScalar); ++ ++- _mm_store_ps(floatBuffer, iValue); ++- *iBufferPtr++ = (int16_t)(floatBuffer[0]); ++- *iBufferPtr++ = (int16_t)(floatBuffer[1]); ++- *iBufferPtr++ = (int16_t)(floatBuffer[2]); ++- *iBufferPtr++ = (int16_t)(floatBuffer[3]); ++- } +++ _mm_store_ps(floatBuffer, iValue); +++ *iBufferPtr++ = (int16_t)(floatBuffer[0]); +++ *iBufferPtr++ = (int16_t)(floatBuffer[1]); +++ *iBufferPtr++ = (int16_t)(floatBuffer[2]); +++ *iBufferPtr++ = (int16_t)(floatBuffer[3]); +++ } ++ ++- number = quarterPoints * 4; ++- iBufferPtr = &iBuffer[number]; ++- for(; number < num_points; number++){ ++- *iBufferPtr++ = (int16_t)(*complexVectorPtr++ * scalar); ++- complexVectorPtr++; ++- } +++ number = quarterPoints * 4; +++ iBufferPtr = &iBuffer[number]; +++ for (; number < num_points; number++) { +++ *iBufferPtr++ = (int16_t)(*complexVectorPtr++ * scalar); +++ complexVectorPtr++; +++ } ++ } ++ ++ #endif /* LV_HAVE_SSE */ ++@@ -185,16 +188,18 @@ volk_32fc_s32f_deinterleave_real_16i_a_sse(int16_t* iBuffer, const lv_32fc_t* co ++ #ifdef LV_HAVE_GENERIC ++ ++ static inline void ++-volk_32fc_s32f_deinterleave_real_16i_generic(int16_t* iBuffer, const lv_32fc_t* complexVector, ++- const float scalar, unsigned int num_points) +++volk_32fc_s32f_deinterleave_real_16i_generic(int16_t* iBuffer, +++ const lv_32fc_t* complexVector, +++ const float scalar, +++ unsigned int num_points) ++ { ++- const float* complexVectorPtr = (float*)complexVector; ++- int16_t* iBufferPtr = iBuffer; ++- unsigned int number = 0; ++- for(number = 0; number < num_points; number++){ ++- *iBufferPtr++ = (int16_t)(*complexVectorPtr++ * scalar); ++- complexVectorPtr++; ++- } +++ const float* complexVectorPtr = (float*)complexVector; +++ int16_t* iBufferPtr = iBuffer; +++ unsigned int number = 0; +++ for (number = 0; number < num_points; number++) { +++ *iBufferPtr++ = (int16_t)(*complexVectorPtr++ * scalar); +++ complexVectorPtr++; +++ 
} ++ } ++ ++ #endif /* LV_HAVE_GENERIC */ ++@@ -204,60 +209,61 @@ volk_32fc_s32f_deinterleave_real_16i_generic(int16_t* iBuffer, const lv_32fc_t* ++ #ifndef INCLUDED_volk_32fc_s32f_deinterleave_real_16i_u_H ++ #define INCLUDED_volk_32fc_s32f_deinterleave_real_16i_u_H ++ ++-#include ++ #include ++ #include +++#include ++ ++ #ifdef LV_HAVE_AVX2 ++ #include ++ ++ static inline void ++-volk_32fc_s32f_deinterleave_real_16i_u_avx2(int16_t* iBuffer, const lv_32fc_t* complexVector, ++- const float scalar, unsigned int num_points) +++volk_32fc_s32f_deinterleave_real_16i_u_avx2(int16_t* iBuffer, +++ const lv_32fc_t* complexVector, +++ const float scalar, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int eighthPoints = num_points / 8; ++- ++- const float* complexVectorPtr = (float*)complexVector; ++- int16_t* iBufferPtr = iBuffer; +++ unsigned int number = 0; +++ const unsigned int eighthPoints = num_points / 8; ++ ++- __m256 vScalar = _mm256_set1_ps(scalar); +++ const float* complexVectorPtr = (float*)complexVector; +++ int16_t* iBufferPtr = iBuffer; ++ ++- __m256 cplxValue1, cplxValue2, iValue; ++- __m256i a; ++- __m128i b; +++ __m256 vScalar = _mm256_set1_ps(scalar); ++ ++- __m256i idx = _mm256_set_epi32(3,3,3,3,5,1,4,0); +++ __m256 cplxValue1, cplxValue2, iValue; +++ __m256i a; +++ __m128i b; ++ ++- for(;number < eighthPoints; number++){ ++- cplxValue1 = _mm256_loadu_ps(complexVectorPtr); ++- complexVectorPtr += 8; +++ __m256i idx = _mm256_set_epi32(3, 3, 3, 3, 5, 1, 4, 0); ++ ++- cplxValue2 = _mm256_loadu_ps(complexVectorPtr); ++- complexVectorPtr += 8; +++ for (; number < eighthPoints; number++) { +++ cplxValue1 = _mm256_loadu_ps(complexVectorPtr); +++ complexVectorPtr += 8; ++ ++- // Arrange in i1i2i3i4 format ++- iValue = _mm256_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); +++ cplxValue2 = _mm256_loadu_ps(complexVectorPtr); +++ complexVectorPtr += 8; ++ ++- iValue = _mm256_mul_ps(iValue, vScalar); +++ // Arrange in i1i2i3i4 format +++ iValue = _mm256_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2, 0, 2, 0)); ++ ++- iValue = _mm256_round_ps(iValue, _MM_FROUND_TO_ZERO); ++- a = _mm256_cvtps_epi32(iValue); ++- a = _mm256_packs_epi32(a,a); ++- a = _mm256_permutevar8x32_epi32(a,idx); ++- b = _mm256_extracti128_si256(a,0); +++ iValue = _mm256_mul_ps(iValue, vScalar); ++ ++- _mm_storeu_si128((__m128i*)iBufferPtr,b); ++- iBufferPtr += 8; +++ iValue = _mm256_round_ps(iValue, _MM_FROUND_TO_ZERO); +++ a = _mm256_cvtps_epi32(iValue); +++ a = _mm256_packs_epi32(a, a); +++ a = _mm256_permutevar8x32_epi32(a, idx); +++ b = _mm256_extracti128_si256(a, 0); ++ ++- } +++ _mm_storeu_si128((__m128i*)iBufferPtr, b); +++ iBufferPtr += 8; +++ } ++ ++- number = eighthPoints * 8; ++- iBufferPtr = &iBuffer[number]; ++- for(; number < num_points; number++){ ++- *iBufferPtr++ = (int16_t)(*complexVectorPtr++ * scalar); ++- complexVectorPtr++; ++- } +++ number = eighthPoints * 8; +++ iBufferPtr = &iBuffer[number]; +++ for (; number < num_points; number++) { +++ *iBufferPtr++ = (int16_t)(*complexVectorPtr++ * scalar); +++ complexVectorPtr++; +++ } ++ } ++ ++ #endif /* LV_HAVE_AVX2 */ ++diff --git a/kernels/volk/volk_32fc_s32f_magnitude_16i.h b/kernels/volk/volk_32fc_s32f_magnitude_16i.h ++index 6e7e7cb..91a5b8e 100644 ++--- a/kernels/volk/volk_32fc_s32f_magnitude_16i.h +++++ b/kernels/volk/volk_32fc_s32f_magnitude_16i.h ++@@ -31,8 +31,8 @@ ++ * ++ * Dispatcher Prototype ++ * \code ++- * void volk_32fc_s32f_magnitude_16i(int16_t* magnitudeVector, const lv_32fc_t* 
complexVector, unsigned int num_points) ++- * \endcode +++ * void volk_32fc_s32f_magnitude_16i(int16_t* magnitudeVector, const lv_32fc_t* +++ * complexVector, unsigned int num_points) \endcode ++ * ++ * \b Inputs ++ * \li complexVector: The complex input vector. ++@@ -73,123 +73,129 @@ ++ #ifdef LV_HAVE_GENERIC ++ #include ++ ++-static inline void ++-volk_32fc_s32f_magnitude_16i_generic(int16_t* magnitudeVector, const lv_32fc_t* complexVector, ++- const float scalar, unsigned int num_points) +++static inline void volk_32fc_s32f_magnitude_16i_generic(int16_t* magnitudeVector, +++ const lv_32fc_t* complexVector, +++ const float scalar, +++ unsigned int num_points) ++ { ++- const float* complexVectorPtr = (float*)complexVector; ++- int16_t* magnitudeVectorPtr = magnitudeVector; ++- unsigned int number = 0; ++- for(number = 0; number < num_points; number++){ ++- __VOLK_VOLATILE float real = *complexVectorPtr++; ++- __VOLK_VOLATILE float imag = *complexVectorPtr++; ++- real *= real; ++- imag *= imag; ++- *magnitudeVectorPtr++ = (int16_t)rintf(scalar*sqrtf(real + imag)); ++- } +++ const float* complexVectorPtr = (float*)complexVector; +++ int16_t* magnitudeVectorPtr = magnitudeVector; +++ unsigned int number = 0; +++ for (number = 0; number < num_points; number++) { +++ __VOLK_VOLATILE float real = *complexVectorPtr++; +++ __VOLK_VOLATILE float imag = *complexVectorPtr++; +++ real *= real; +++ imag *= imag; +++ *magnitudeVectorPtr++ = (int16_t)rintf(scalar * sqrtf(real + imag)); +++ } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++ #ifndef INCLUDED_volk_32fc_s32f_magnitude_16i_a_H ++ #define INCLUDED_volk_32fc_s32f_magnitude_16i_a_H ++ ++-#include ++ #include ++-#include ++ #include +++#include +++#include ++ ++ #ifdef LV_HAVE_AVX2 ++ #include ++ ++-static inline void ++-volk_32fc_s32f_magnitude_16i_a_avx2(int16_t* magnitudeVector, const lv_32fc_t* complexVector, ++- const float scalar, unsigned int num_points) +++static inline void volk_32fc_s32f_magnitude_16i_a_avx2(int16_t* magnitudeVector, +++ const lv_32fc_t* complexVector, +++ const float scalar, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int eighthPoints = num_points / 8; +++ unsigned int number = 0; +++ const unsigned int eighthPoints = num_points / 8; ++ ++- const float* complexVectorPtr = (const float*)complexVector; ++- int16_t* magnitudeVectorPtr = magnitudeVector; +++ const float* complexVectorPtr = (const float*)complexVector; +++ int16_t* magnitudeVectorPtr = magnitudeVector; ++ ++- __m256 vScalar = _mm256_set1_ps(scalar); ++- __m256i idx = _mm256_set_epi32(0,0,0,0,5,1,4,0); ++- __m256 cplxValue1, cplxValue2, result; ++- __m256i resultInt; ++- __m128i resultShort; +++ __m256 vScalar = _mm256_set1_ps(scalar); +++ __m256i idx = _mm256_set_epi32(0, 0, 0, 0, 5, 1, 4, 0); +++ __m256 cplxValue1, cplxValue2, result; +++ __m256i resultInt; +++ __m128i resultShort; ++ ++- for(;number < eighthPoints; number++){ ++- cplxValue1 = _mm256_load_ps(complexVectorPtr); ++- complexVectorPtr += 8; +++ for (; number < eighthPoints; number++) { +++ cplxValue1 = _mm256_load_ps(complexVectorPtr); +++ complexVectorPtr += 8; ++ ++- cplxValue2 = _mm256_load_ps(complexVectorPtr); ++- complexVectorPtr += 8; +++ cplxValue2 = _mm256_load_ps(complexVectorPtr); +++ complexVectorPtr += 8; ++ ++- cplxValue1 = _mm256_mul_ps(cplxValue1, cplxValue1); // Square the values ++- cplxValue2 = _mm256_mul_ps(cplxValue2, cplxValue2); // Square the Values +++ cplxValue1 = _mm256_mul_ps(cplxValue1, cplxValue1); // Square the values +++ 
cplxValue2 = _mm256_mul_ps(cplxValue2, cplxValue2); // Square the Values ++ ++- result = _mm256_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values +++ result = _mm256_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values ++ ++- result = _mm256_sqrt_ps(result); +++ result = _mm256_sqrt_ps(result); ++ ++- result = _mm256_mul_ps(result, vScalar); +++ result = _mm256_mul_ps(result, vScalar); ++ ++- resultInt = _mm256_cvtps_epi32(result); ++- resultInt = _mm256_packs_epi32(resultInt, resultInt); ++- resultInt = _mm256_permutevar8x32_epi32(resultInt, idx); //permute to compensate for shuffling in hadd and packs ++- resultShort = _mm256_extracti128_si256(resultInt,0); ++- _mm_store_si128((__m128i*)magnitudeVectorPtr,resultShort); ++- magnitudeVectorPtr += 8; ++- } +++ resultInt = _mm256_cvtps_epi32(result); +++ resultInt = _mm256_packs_epi32(resultInt, resultInt); +++ resultInt = _mm256_permutevar8x32_epi32( +++ resultInt, idx); // permute to compensate for shuffling in hadd and packs +++ resultShort = _mm256_extracti128_si256(resultInt, 0); +++ _mm_store_si128((__m128i*)magnitudeVectorPtr, resultShort); +++ magnitudeVectorPtr += 8; +++ } ++ ++- number = eighthPoints * 8; ++- volk_32fc_s32f_magnitude_16i_generic(magnitudeVector+number, complexVector+number, scalar, num_points-number); +++ number = eighthPoints * 8; +++ volk_32fc_s32f_magnitude_16i_generic( +++ magnitudeVector + number, complexVector + number, scalar, num_points - number); ++ } ++ #endif /* LV_HAVE_AVX2 */ ++ ++ #ifdef LV_HAVE_SSE3 ++ #include ++ ++-static inline void ++-volk_32fc_s32f_magnitude_16i_a_sse3(int16_t* magnitudeVector, const lv_32fc_t* complexVector, ++- const float scalar, unsigned int num_points) +++static inline void volk_32fc_s32f_magnitude_16i_a_sse3(int16_t* magnitudeVector, +++ const lv_32fc_t* complexVector, +++ const float scalar, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int quarterPoints = num_points / 4; +++ unsigned int number = 0; +++ const unsigned int quarterPoints = num_points / 4; ++ ++- const float* complexVectorPtr = (const float*)complexVector; ++- int16_t* magnitudeVectorPtr = magnitudeVector; +++ const float* complexVectorPtr = (const float*)complexVector; +++ int16_t* magnitudeVectorPtr = magnitudeVector; ++ ++- __m128 vScalar = _mm_set_ps1(scalar); +++ __m128 vScalar = _mm_set_ps1(scalar); ++ ++- __m128 cplxValue1, cplxValue2, result; +++ __m128 cplxValue1, cplxValue2, result; ++ ++- __VOLK_ATTR_ALIGNED(16) float floatBuffer[4]; +++ __VOLK_ATTR_ALIGNED(16) float floatBuffer[4]; ++ ++- for(;number < quarterPoints; number++){ ++- cplxValue1 = _mm_load_ps(complexVectorPtr); ++- complexVectorPtr += 4; +++ for (; number < quarterPoints; number++) { +++ cplxValue1 = _mm_load_ps(complexVectorPtr); +++ complexVectorPtr += 4; ++ ++- cplxValue2 = _mm_load_ps(complexVectorPtr); ++- complexVectorPtr += 4; +++ cplxValue2 = _mm_load_ps(complexVectorPtr); +++ complexVectorPtr += 4; ++ ++- cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values ++- cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values +++ cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values +++ cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values ++ ++- result = _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values +++ result = _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values ++ ++- result = _mm_sqrt_ps(result); +++ result = _mm_sqrt_ps(result); ++ ++- result = _mm_mul_ps(result, vScalar); +++ 
result = _mm_mul_ps(result, vScalar); ++ ++- _mm_store_ps(floatBuffer, result); ++- *magnitudeVectorPtr++ = (int16_t)rintf(floatBuffer[0]); ++- *magnitudeVectorPtr++ = (int16_t)rintf(floatBuffer[1]); ++- *magnitudeVectorPtr++ = (int16_t)rintf(floatBuffer[2]); ++- *magnitudeVectorPtr++ = (int16_t)rintf(floatBuffer[3]); ++- } +++ _mm_store_ps(floatBuffer, result); +++ *magnitudeVectorPtr++ = (int16_t)rintf(floatBuffer[0]); +++ *magnitudeVectorPtr++ = (int16_t)rintf(floatBuffer[1]); +++ *magnitudeVectorPtr++ = (int16_t)rintf(floatBuffer[2]); +++ *magnitudeVectorPtr++ = (int16_t)rintf(floatBuffer[3]); +++ } ++ ++- number = quarterPoints * 4; ++- volk_32fc_s32f_magnitude_16i_generic(magnitudeVector+number, complexVector+number, scalar, num_points-number); +++ number = quarterPoints * 4; +++ volk_32fc_s32f_magnitude_16i_generic( +++ magnitudeVector + number, complexVector + number, scalar, num_points - number); ++ } ++ #endif /* LV_HAVE_SSE3 */ ++ ++@@ -197,53 +203,57 @@ volk_32fc_s32f_magnitude_16i_a_sse3(int16_t* magnitudeVector, const lv_32fc_t* c ++ #ifdef LV_HAVE_SSE ++ #include ++ ++-static inline void ++-volk_32fc_s32f_magnitude_16i_a_sse(int16_t* magnitudeVector, const lv_32fc_t* complexVector, ++- const float scalar, unsigned int num_points) +++static inline void volk_32fc_s32f_magnitude_16i_a_sse(int16_t* magnitudeVector, +++ const lv_32fc_t* complexVector, +++ const float scalar, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int quarterPoints = num_points / 4; +++ unsigned int number = 0; +++ const unsigned int quarterPoints = num_points / 4; ++ ++- const float* complexVectorPtr = (const float*)complexVector; ++- int16_t* magnitudeVectorPtr = magnitudeVector; +++ const float* complexVectorPtr = (const float*)complexVector; +++ int16_t* magnitudeVectorPtr = magnitudeVector; ++ ++- __m128 vScalar = _mm_set_ps1(scalar); +++ __m128 vScalar = _mm_set_ps1(scalar); ++ ++- __m128 cplxValue1, cplxValue2, result; ++- __m128 iValue, qValue; +++ __m128 cplxValue1, cplxValue2, result; +++ __m128 iValue, qValue; ++ ++- __VOLK_ATTR_ALIGNED(16) float floatBuffer[4]; +++ __VOLK_ATTR_ALIGNED(16) float floatBuffer[4]; ++ ++- for(;number < quarterPoints; number++){ ++- cplxValue1 = _mm_load_ps(complexVectorPtr); ++- complexVectorPtr += 4; +++ for (; number < quarterPoints; number++) { +++ cplxValue1 = _mm_load_ps(complexVectorPtr); +++ complexVectorPtr += 4; ++ ++- cplxValue2 = _mm_load_ps(complexVectorPtr); ++- complexVectorPtr += 4; +++ cplxValue2 = _mm_load_ps(complexVectorPtr); +++ complexVectorPtr += 4; ++ ++- // Arrange in i1i2i3i4 format ++- iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); ++- // Arrange in q1q2q3q4 format ++- qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1)); +++ // Arrange in i1i2i3i4 format +++ iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2, 0, 2, 0)); +++ // Arrange in q1q2q3q4 format +++ qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3, 1, 3, 1)); ++ ++- __VOLK_VOLATILE __m128 iValue2 = _mm_mul_ps(iValue, iValue); // Square the I values ++- __VOLK_VOLATILE __m128 qValue2 = _mm_mul_ps(qValue, qValue); // Square the Q Values +++ __VOLK_VOLATILE __m128 iValue2 = +++ _mm_mul_ps(iValue, iValue); // Square the I values +++ __VOLK_VOLATILE __m128 qValue2 = +++ _mm_mul_ps(qValue, qValue); // Square the Q Values ++ ++- result = _mm_add_ps(iValue2, qValue2); // Add the I2 and Q2 values +++ result = _mm_add_ps(iValue2, qValue2); // Add the I2 and Q2 values ++ ++- result = 
_mm_sqrt_ps(result); +++ result = _mm_sqrt_ps(result); ++ ++- result = _mm_mul_ps(result, vScalar); +++ result = _mm_mul_ps(result, vScalar); ++ ++- _mm_store_ps(floatBuffer, result); ++- *magnitudeVectorPtr++ = (int16_t)rintf(floatBuffer[0]); ++- *magnitudeVectorPtr++ = (int16_t)rintf(floatBuffer[1]); ++- *magnitudeVectorPtr++ = (int16_t)rintf(floatBuffer[2]); ++- *magnitudeVectorPtr++ = (int16_t)rintf(floatBuffer[3]); ++- } +++ _mm_store_ps(floatBuffer, result); +++ *magnitudeVectorPtr++ = (int16_t)rintf(floatBuffer[0]); +++ *magnitudeVectorPtr++ = (int16_t)rintf(floatBuffer[1]); +++ *magnitudeVectorPtr++ = (int16_t)rintf(floatBuffer[2]); +++ *magnitudeVectorPtr++ = (int16_t)rintf(floatBuffer[3]); +++ } ++ ++- number = quarterPoints * 4; ++- volk_32fc_s32f_magnitude_16i_generic(magnitudeVector+number, complexVector+number, scalar, num_points-number); +++ number = quarterPoints * 4; +++ volk_32fc_s32f_magnitude_16i_generic( +++ magnitudeVector + number, complexVector + number, scalar, num_points - number); ++ } ++ #endif /* LV_HAVE_SSE */ ++ ++@@ -253,56 +263,59 @@ volk_32fc_s32f_magnitude_16i_a_sse(int16_t* magnitudeVector, const lv_32fc_t* co ++ #ifndef INCLUDED_volk_32fc_s32f_magnitude_16i_u_H ++ #define INCLUDED_volk_32fc_s32f_magnitude_16i_u_H ++ ++-#include ++ #include ++-#include ++ #include +++#include +++#include ++ ++ #ifdef LV_HAVE_AVX2 ++ #include ++ ++-static inline void ++-volk_32fc_s32f_magnitude_16i_u_avx2(int16_t* magnitudeVector, const lv_32fc_t* complexVector, ++- const float scalar, unsigned int num_points) +++static inline void volk_32fc_s32f_magnitude_16i_u_avx2(int16_t* magnitudeVector, +++ const lv_32fc_t* complexVector, +++ const float scalar, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int eighthPoints = num_points / 8; +++ unsigned int number = 0; +++ const unsigned int eighthPoints = num_points / 8; ++ ++- const float* complexVectorPtr = (const float*)complexVector; ++- int16_t* magnitudeVectorPtr = magnitudeVector; +++ const float* complexVectorPtr = (const float*)complexVector; +++ int16_t* magnitudeVectorPtr = magnitudeVector; ++ ++- __m256 vScalar = _mm256_set1_ps(scalar); ++- __m256i idx = _mm256_set_epi32(0,0,0,0,5,1,4,0); ++- __m256 cplxValue1, cplxValue2, result; ++- __m256i resultInt; ++- __m128i resultShort; +++ __m256 vScalar = _mm256_set1_ps(scalar); +++ __m256i idx = _mm256_set_epi32(0, 0, 0, 0, 5, 1, 4, 0); +++ __m256 cplxValue1, cplxValue2, result; +++ __m256i resultInt; +++ __m128i resultShort; ++ ++- for(;number < eighthPoints; number++){ ++- cplxValue1 = _mm256_loadu_ps(complexVectorPtr); ++- complexVectorPtr += 8; +++ for (; number < eighthPoints; number++) { +++ cplxValue1 = _mm256_loadu_ps(complexVectorPtr); +++ complexVectorPtr += 8; ++ ++- cplxValue2 = _mm256_loadu_ps(complexVectorPtr); ++- complexVectorPtr += 8; +++ cplxValue2 = _mm256_loadu_ps(complexVectorPtr); +++ complexVectorPtr += 8; ++ ++- cplxValue1 = _mm256_mul_ps(cplxValue1, cplxValue1); // Square the values ++- cplxValue2 = _mm256_mul_ps(cplxValue2, cplxValue2); // Square the Values +++ cplxValue1 = _mm256_mul_ps(cplxValue1, cplxValue1); // Square the values +++ cplxValue2 = _mm256_mul_ps(cplxValue2, cplxValue2); // Square the Values ++ ++- result = _mm256_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values +++ result = _mm256_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values ++ ++- result = _mm256_sqrt_ps(result); +++ result = _mm256_sqrt_ps(result); ++ ++- result = _mm256_mul_ps(result, vScalar); +++ result = 
_mm256_mul_ps(result, vScalar); ++ ++- resultInt = _mm256_cvtps_epi32(result); ++- resultInt = _mm256_packs_epi32(resultInt, resultInt); ++- resultInt = _mm256_permutevar8x32_epi32(resultInt, idx); //permute to compensate for shuffling in hadd and packs ++- resultShort = _mm256_extracti128_si256(resultInt,0); ++- _mm_storeu_si128((__m128i*)magnitudeVectorPtr,resultShort); ++- magnitudeVectorPtr += 8; ++- } +++ resultInt = _mm256_cvtps_epi32(result); +++ resultInt = _mm256_packs_epi32(resultInt, resultInt); +++ resultInt = _mm256_permutevar8x32_epi32( +++ resultInt, idx); // permute to compensate for shuffling in hadd and packs +++ resultShort = _mm256_extracti128_si256(resultInt, 0); +++ _mm_storeu_si128((__m128i*)magnitudeVectorPtr, resultShort); +++ magnitudeVectorPtr += 8; +++ } ++ ++- number = eighthPoints * 8; ++- volk_32fc_s32f_magnitude_16i_generic(magnitudeVector+number, complexVector+number, scalar, num_points-number); +++ number = eighthPoints * 8; +++ volk_32fc_s32f_magnitude_16i_generic( +++ magnitudeVector + number, complexVector + number, scalar, num_points - number); ++ } ++ #endif /* LV_HAVE_AVX2 */ ++ ++diff --git a/kernels/volk/volk_32fc_s32f_power_32fc.h b/kernels/volk/volk_32fc_s32f_power_32fc.h ++index d2803f2..b31179c 100644 ++--- a/kernels/volk/volk_32fc_s32f_power_32fc.h +++++ b/kernels/volk/volk_32fc_s32f_power_32fc.h ++@@ -31,8 +31,8 @@ ++ * ++ * Dispatcher Prototype ++ * \code ++- * void volk_32fc_s32f_power_32fc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const float power, unsigned int num_points) ++- * \endcode +++ * void volk_32fc_s32f_power_32fc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const +++ * float power, unsigned int num_points) \endcode ++ * ++ * \b Inputs ++ * \li aVector: The complex input vector. ++@@ -56,15 +56,17 @@ ++ #define INCLUDED_volk_32fc_s32f_power_32fc_a_H ++ ++ #include ++-#include ++ #include +++#include ++ ++ //! 
raise a complex float to a real float power ++-static inline lv_32fc_t __volk_s32fc_s32f_power_s32fc_a(const lv_32fc_t exp, const float power) +++static inline lv_32fc_t __volk_s32fc_s32f_power_s32fc_a(const lv_32fc_t exp, +++ const float power) ++ { ++- const float arg = power*atan2f(lv_creal(exp), lv_cimag(exp)); ++- const float mag = powf(lv_creal(exp)*lv_creal(exp) + lv_cimag(exp)*lv_cimag(exp), power/2); ++- return mag*lv_cmake(-cosf(arg), sinf(arg)); +++ const float arg = power * atan2f(lv_creal(exp), lv_cimag(exp)); +++ const float mag = +++ powf(lv_creal(exp) * lv_creal(exp) + lv_cimag(exp) * lv_cimag(exp), power / 2); +++ return mag * lv_cmake(-cosf(arg), sinf(arg)); ++ } ++ ++ #ifdef LV_HAVE_SSE ++@@ -74,83 +76,94 @@ static inline lv_32fc_t __volk_s32fc_s32f_power_s32fc_a(const lv_32fc_t exp, con ++ #include ++ #endif /* LV_HAVE_LIB_SIMDMATH */ ++ ++-static inline void ++-volk_32fc_s32f_power_32fc_a_sse(lv_32fc_t* cVector, const lv_32fc_t* aVector, ++- const float power, unsigned int num_points) +++static inline void volk_32fc_s32f_power_32fc_a_sse(lv_32fc_t* cVector, +++ const lv_32fc_t* aVector, +++ const float power, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; +++ unsigned int number = 0; ++ ++- lv_32fc_t* cPtr = cVector; ++- const lv_32fc_t* aPtr = aVector; +++ lv_32fc_t* cPtr = cVector; +++ const lv_32fc_t* aPtr = aVector; ++ ++ #ifdef LV_HAVE_LIB_SIMDMATH ++- const unsigned int quarterPoints = num_points / 4; ++- __m128 vPower = _mm_set_ps1(power); +++ const unsigned int quarterPoints = num_points / 4; +++ __m128 vPower = _mm_set_ps1(power); ++ ++- __m128 cplxValue1, cplxValue2, magnitude, phase, iValue, qValue; ++- for(;number < quarterPoints; number++){ +++ __m128 cplxValue1, cplxValue2, magnitude, phase, iValue, qValue; +++ for (; number < quarterPoints; number++) { ++ ++- cplxValue1 = _mm_load_ps((float*)aPtr); ++- aPtr += 2; +++ cplxValue1 = _mm_load_ps((float*)aPtr); +++ aPtr += 2; ++ ++- cplxValue2 = _mm_load_ps((float*)aPtr); ++- aPtr += 2; +++ cplxValue2 = _mm_load_ps((float*)aPtr); +++ aPtr += 2; ++ ++- // Convert to polar coordinates +++ // Convert to polar coordinates ++ ++- // Arrange in i1i2i3i4 format ++- iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); ++- // Arrange in q1q2q3q4 format ++- qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1)); +++ // Arrange in i1i2i3i4 format +++ iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2, 0, 2, 0)); +++ // Arrange in q1q2q3q4 format +++ qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3, 1, 3, 1)); ++ ++- phase = atan2f4(qValue, iValue); // Calculate the Phase +++ phase = atan2f4(qValue, iValue); // Calculate the Phase ++ ++- magnitude = _mm_sqrt_ps(_mm_add_ps(_mm_mul_ps(iValue, iValue), _mm_mul_ps(qValue, qValue))); // Calculate the magnitude by square rooting the added I2 and Q2 values +++ magnitude = _mm_sqrt_ps( +++ _mm_add_ps(_mm_mul_ps(iValue, iValue), +++ _mm_mul_ps(qValue, qValue))); // Calculate the magnitude by square +++ // rooting the added I2 and Q2 values ++ ++- // Now calculate the power of the polar coordinate data ++- magnitude = powf4(magnitude, vPower); // Take the magnitude to the specified power +++ // Now calculate the power of the polar coordinate data +++ magnitude = powf4(magnitude, vPower); // Take the magnitude to the specified power ++ ++- phase = _mm_mul_ps(phase, vPower); // Multiply the phase by the specified power +++ phase = _mm_mul_ps(phase, vPower); // Multiply the phase by the specified power ++ ++- // 
Convert back to cartesian coordinates ++- iValue = _mm_mul_ps( cosf4(phase), magnitude); // Multiply the cos of the phase by the magnitude ++- qValue = _mm_mul_ps( sinf4(phase), magnitude); // Multiply the sin of the phase by the magnitude +++ // Convert back to cartesian coordinates +++ iValue = _mm_mul_ps(cosf4(phase), +++ magnitude); // Multiply the cos of the phase by the magnitude +++ qValue = _mm_mul_ps(sinf4(phase), +++ magnitude); // Multiply the sin of the phase by the magnitude ++ ++- cplxValue1 = _mm_unpacklo_ps(iValue, qValue); // Interleave the lower two i & q values ++- cplxValue2 = _mm_unpackhi_ps(iValue, qValue); // Interleave the upper two i & q values +++ cplxValue1 = +++ _mm_unpacklo_ps(iValue, qValue); // Interleave the lower two i & q values +++ cplxValue2 = +++ _mm_unpackhi_ps(iValue, qValue); // Interleave the upper two i & q values ++ ++- _mm_store_ps((float*)cPtr,cplxValue1); // Store the results back into the C container +++ _mm_store_ps((float*)cPtr, +++ cplxValue1); // Store the results back into the C container ++ ++- cPtr += 2; +++ cPtr += 2; ++ ++- _mm_store_ps((float*)cPtr,cplxValue2); // Store the results back into the C container +++ _mm_store_ps((float*)cPtr, +++ cplxValue2); // Store the results back into the C container ++ ++- cPtr += 2; ++- } +++ cPtr += 2; +++ } ++ ++- number = quarterPoints * 4; +++ number = quarterPoints * 4; ++ #endif /* LV_HAVE_LIB_SIMDMATH */ ++ ++- for(;number < num_points; number++){ ++- *cPtr++ = __volk_s32fc_s32f_power_s32fc_a((*aPtr++), power); ++- } +++ for (; number < num_points; number++) { +++ *cPtr++ = __volk_s32fc_s32f_power_s32fc_a((*aPtr++), power); +++ } ++ } ++ #endif /* LV_HAVE_SSE */ ++ ++ ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void ++-volk_32fc_s32f_power_32fc_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, ++- const float power, unsigned int num_points) +++static inline void volk_32fc_s32f_power_32fc_generic(lv_32fc_t* cVector, +++ const lv_32fc_t* aVector, +++ const float power, +++ unsigned int num_points) ++ { ++- lv_32fc_t* cPtr = cVector; ++- const lv_32fc_t* aPtr = aVector; ++- unsigned int number = 0; +++ lv_32fc_t* cPtr = cVector; +++ const lv_32fc_t* aPtr = aVector; +++ unsigned int number = 0; ++ ++- for(number = 0; number < num_points; number++){ ++- *cPtr++ = __volk_s32fc_s32f_power_s32fc_a((*aPtr++), power); ++- } +++ for (number = 0; number < num_points; number++) { +++ *cPtr++ = __volk_s32fc_s32f_power_s32fc_a((*aPtr++), power); +++ } ++ } ++ ++ #endif /* LV_HAVE_GENERIC */ ++diff --git a/kernels/volk/volk_32fc_s32f_power_spectrum_32f.h b/kernels/volk/volk_32fc_s32f_power_spectrum_32f.h ++index abe4662..a1a036d 100644 ++--- a/kernels/volk/volk_32fc_s32f_power_spectrum_32f.h +++++ b/kernels/volk/volk_32fc_s32f_power_spectrum_32f.h ++@@ -29,13 +29,13 @@ ++ * ++ * Dispatcher Prototype ++ * \code ++- * void volk_32fc_s32f_power_spectrum_32f(float* logPowerOutput, const lv_32fc_t* complexFFTInput, const float normalizationFactor, unsigned int num_points) ++- * \endcode +++ * void volk_32fc_s32f_power_spectrum_32f(float* logPowerOutput, const lv_32fc_t* +++ * complexFFTInput, const float normalizationFactor, unsigned int num_points) \endcode ++ * ++ * \b Inputs ++ * \li complexFFTInput The complex data output from the FFT point. ++- * \li normalizationFactor: This value is divided against all the input values before the power is calculated. ++- * \li num_points: The number of fft data points. 
+++ * \li normalizationFactor: This value is divided against all the input values before the +++ * power is calculated. \li num_points: The number of fft data points. ++ * ++ * \b Outputs ++ * \li logPowerOutput: The 10.0 * log10(r*r + i*i) for each data point. ++@@ -54,8 +54,8 @@ ++ #define INCLUDED_volk_32fc_s32f_power_spectrum_32f_a_H ++ ++ #include ++-#include ++ #include +++#include ++ ++ #ifdef LV_HAVE_SSE3 ++ #include ++@@ -65,74 +65,75 @@ ++ #endif /* LV_HAVE_LIB_SIMDMATH */ ++ ++ static inline void ++-volk_32fc_s32f_power_spectrum_32f_a_sse3(float* logPowerOutput, const lv_32fc_t* complexFFTInput, ++- const float normalizationFactor, unsigned int num_points) +++volk_32fc_s32f_power_spectrum_32f_a_sse3(float* logPowerOutput, +++ const lv_32fc_t* complexFFTInput, +++ const float normalizationFactor, +++ unsigned int num_points) ++ { ++- const float* inputPtr = (const float*)complexFFTInput; ++- float* destPtr = logPowerOutput; ++- uint64_t number = 0; ++- const float iNormalizationFactor = 1.0 / normalizationFactor; +++ const float* inputPtr = (const float*)complexFFTInput; +++ float* destPtr = logPowerOutput; +++ uint64_t number = 0; +++ const float iNormalizationFactor = 1.0 / normalizationFactor; ++ #ifdef LV_HAVE_LIB_SIMDMATH ++- __m128 magScalar = _mm_set_ps1(10.0); ++- magScalar = _mm_div_ps(magScalar, logf4(magScalar)); +++ __m128 magScalar = _mm_set_ps1(10.0); +++ magScalar = _mm_div_ps(magScalar, logf4(magScalar)); ++ ++- __m128 invNormalizationFactor = _mm_set_ps1(iNormalizationFactor); +++ __m128 invNormalizationFactor = _mm_set_ps1(iNormalizationFactor); ++ ++- __m128 power; ++- __m128 input1, input2; ++- const uint64_t quarterPoints = num_points / 4; ++- for(;number < quarterPoints; number++){ ++- // Load the complex values ++- input1 =_mm_load_ps(inputPtr); ++- inputPtr += 4; ++- input2 =_mm_load_ps(inputPtr); ++- inputPtr += 4; +++ __m128 power; +++ __m128 input1, input2; +++ const uint64_t quarterPoints = num_points / 4; +++ for (; number < quarterPoints; number++) { +++ // Load the complex values +++ input1 = _mm_load_ps(inputPtr); +++ inputPtr += 4; +++ input2 = _mm_load_ps(inputPtr); +++ inputPtr += 4; ++ ++- // Apply the normalization factor ++- input1 = _mm_mul_ps(input1, invNormalizationFactor); ++- input2 = _mm_mul_ps(input2, invNormalizationFactor); +++ // Apply the normalization factor +++ input1 = _mm_mul_ps(input1, invNormalizationFactor); +++ input2 = _mm_mul_ps(input2, invNormalizationFactor); ++ ++- // Multiply each value by itself ++- // (r1*r1), (i1*i1), (r2*r2), (i2*i2) ++- input1 = _mm_mul_ps(input1, input1); ++- // (r3*r3), (i3*i3), (r4*r4), (i4*i4) ++- input2 = _mm_mul_ps(input2, input2); +++ // Multiply each value by itself +++ // (r1*r1), (i1*i1), (r2*r2), (i2*i2) +++ input1 = _mm_mul_ps(input1, input1); +++ // (r3*r3), (i3*i3), (r4*r4), (i4*i4) +++ input2 = _mm_mul_ps(input2, input2); ++ ++- // Horizontal add, to add (r*r) + (i*i) for each complex value ++- // (r1*r1)+(i1*i1), (r2*r2) + (i2*i2), (r3*r3)+(i3*i3), (r4*r4)+(i4*i4) ++- power = _mm_hadd_ps(input1, input2); +++ // Horizontal add, to add (r*r) + (i*i) for each complex value +++ // (r1*r1)+(i1*i1), (r2*r2) + (i2*i2), (r3*r3)+(i3*i3), (r4*r4)+(i4*i4) +++ power = _mm_hadd_ps(input1, input2); ++ ++- // Calculate the natural log power ++- power = logf4(power); +++ // Calculate the natural log power +++ power = logf4(power); ++ ++- // Convert to log10 and multiply by 10.0 ++- power = _mm_mul_ps(power, magScalar); +++ // Convert to log10 and multiply by 10.0 +++ power = _mm_mul_ps(power, 
magScalar); ++ ++- // Store the floating point results ++- _mm_store_ps(destPtr, power); +++ // Store the floating point results +++ _mm_store_ps(destPtr, power); ++ ++- destPtr += 4; ++- } +++ destPtr += 4; +++ } ++ ++- number = quarterPoints*4; +++ number = quarterPoints * 4; ++ #endif /* LV_HAVE_LIB_SIMDMATH */ ++- // Calculate the FFT for any remaining points ++- ++- for(; number < num_points; number++){ ++- // Calculate dBm ++- // 50 ohm load assumption ++- // 10 * log10 (v^2 / (2 * 50.0 * .001)) = 10 * log10( v^2 * 10) ++- // 75 ohm load assumption ++- // 10 * log10 (v^2 / (2 * 75.0 * .001)) = 10 * log10( v^2 * 15) +++ // Calculate the FFT for any remaining points ++ ++- const float real = *inputPtr++ * iNormalizationFactor; ++- const float imag = *inputPtr++ * iNormalizationFactor; +++ for (; number < num_points; number++) { +++ // Calculate dBm +++ // 50 ohm load assumption +++ // 10 * log10 (v^2 / (2 * 50.0 * .001)) = 10 * log10( v^2 * 10) +++ // 75 ohm load assumption +++ // 10 * log10 (v^2 / (2 * 75.0 * .001)) = 10 * log10( v^2 * 15) ++ ++- *destPtr = 10.0*log10f(((real * real) + (imag * imag)) + 1e-20); +++ const float real = *inputPtr++ * iNormalizationFactor; +++ const float imag = *inputPtr++ * iNormalizationFactor; ++ ++- destPtr++; ++- } +++ *destPtr = 10.0 * log10f(((real * real) + (imag * imag)) + 1e-20); ++ +++ destPtr++; +++ } ++ } ++ #endif /* LV_HAVE_SSE3 */ ++ ++@@ -141,7 +142,10 @@ volk_32fc_s32f_power_spectrum_32f_a_sse3(float* logPowerOutput, const lv_32fc_t* ++ #include ++ ++ static inline void ++-volk_32fc_s32f_power_spectrum_32f_neon(float* logPowerOutput, const lv_32fc_t* complexFFTInput, const float normalizationFactor, unsigned int num_points) +++volk_32fc_s32f_power_spectrum_32f_neon(float* logPowerOutput, +++ const lv_32fc_t* complexFFTInput, +++ const float normalizationFactor, +++ unsigned int num_points) ++ { ++ float* logPowerOutputPtr = logPowerOutput; ++ const lv_32fc_t* complexFFTInputPtr = complexFFTInput; ++@@ -151,14 +155,14 @@ volk_32fc_s32f_power_spectrum_32f_neon(float* logPowerOutput, const lv_32fc_t* c ++ float32x4x2_t fft_vec; ++ float32x4_t log_pwr_vec; ++ float32x4_t mag_squared_vec; ++- +++ ++ const float inv_ln10_10 = 4.34294481903f; // 10.0/ln(10.) 
++- ++- for(number = 0; number < quarter_points; number++) { +++ +++ for (number = 0; number < quarter_points; number++) { ++ // Load ++ fft_vec = vld2q_f32((float*)complexFFTInputPtr); ++ // Prefetch next 4 ++- __VOLK_PREFETCH(complexFFTInputPtr+4); +++ __VOLK_PREFETCH(complexFFTInputPtr + 4); ++ // Normalize ++ fft_vec.val[0] = vmulq_n_f32(fft_vec.val[0], iNormalizationFactor); ++ fft_vec.val[1] = vmulq_n_f32(fft_vec.val[1], iNormalizationFactor); ++@@ -167,12 +171,12 @@ volk_32fc_s32f_power_spectrum_32f_neon(float* logPowerOutput, const lv_32fc_t* c ++ // Store ++ vst1q_f32(logPowerOutputPtr, log_pwr_vec); ++ // Move pointers ahead ++- complexFFTInputPtr+=4; ++- logPowerOutputPtr+=4; +++ complexFFTInputPtr += 4; +++ logPowerOutputPtr += 4; ++ } ++- +++ ++ // deal with the rest ++- for(number = quarter_points * 4; number < num_points; number++) { +++ for (number = quarter_points * 4; number < num_points; number++) { ++ const float real = lv_creal(*complexFFTInputPtr) * iNormalizationFactor; ++ const float imag = lv_cimag(*complexFFTInputPtr) * iNormalizationFactor; ++ *logPowerOutputPtr = 10.0 * log10f(((real * real) + (imag * imag)) + 1e-20); ++@@ -186,27 +190,29 @@ volk_32fc_s32f_power_spectrum_32f_neon(float* logPowerOutput, const lv_32fc_t* c ++ #ifdef LV_HAVE_GENERIC ++ ++ static inline void ++-volk_32fc_s32f_power_spectrum_32f_generic(float* logPowerOutput, const lv_32fc_t* complexFFTInput, ++- const float normalizationFactor, unsigned int num_points) +++volk_32fc_s32f_power_spectrum_32f_generic(float* logPowerOutput, +++ const lv_32fc_t* complexFFTInput, +++ const float normalizationFactor, +++ unsigned int num_points) ++ { ++- // Calculate the Power of the complex point ++- const float* inputPtr = (float*)complexFFTInput; ++- float* realFFTDataPointsPtr = logPowerOutput; ++- const float iNormalizationFactor = 1.0 / normalizationFactor; ++- unsigned int point; ++- for(point = 0; point < num_points; point++){ ++- // Calculate dBm ++- // 50 ohm load assumption ++- // 10 * log10 (v^2 / (2 * 50.0 * .001)) = 10 * log10( v^2 * 10) ++- // 75 ohm load assumption ++- // 10 * log10 (v^2 / (2 * 75.0 * .001)) = 10 * log10( v^2 * 15) ++- ++- const float real = *inputPtr++ * iNormalizationFactor; ++- const float imag = *inputPtr++ * iNormalizationFactor; ++- ++- *realFFTDataPointsPtr = 10.0*log10f(((real * real) + (imag * imag)) + 1e-20); ++- realFFTDataPointsPtr++; ++- } +++ // Calculate the Power of the complex point +++ const float* inputPtr = (float*)complexFFTInput; +++ float* realFFTDataPointsPtr = logPowerOutput; +++ const float iNormalizationFactor = 1.0 / normalizationFactor; +++ unsigned int point; +++ for (point = 0; point < num_points; point++) { +++ // Calculate dBm +++ // 50 ohm load assumption +++ // 10 * log10 (v^2 / (2 * 50.0 * .001)) = 10 * log10( v^2 * 10) +++ // 75 ohm load assumption +++ // 10 * log10 (v^2 / (2 * 75.0 * .001)) = 10 * log10( v^2 * 15) +++ +++ const float real = *inputPtr++ * iNormalizationFactor; +++ const float imag = *inputPtr++ * iNormalizationFactor; +++ +++ *realFFTDataPointsPtr = 10.0 * log10f(((real * real) + (imag * imag)) + 1e-20); +++ realFFTDataPointsPtr++; +++ } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++diff --git a/kernels/volk/volk_32fc_s32f_x2_power_spectral_density_32f.h b/kernels/volk/volk_32fc_s32f_x2_power_spectral_density_32f.h ++index 3260b08..37ca43c 100644 ++--- a/kernels/volk/volk_32fc_s32f_x2_power_spectral_density_32f.h +++++ b/kernels/volk/volk_32fc_s32f_x2_power_spectral_density_32f.h ++@@ -29,14 +29,15 @@ ++ * ++ * Dispatcher 
Prototype ++ * \code ++- * void volk_32fc_s32f_x2_power_spectral_density_32f(float* logPowerOutput, const lv_32fc_t* complexFFTInput, const float normalizationFactor, const float rbw, unsigned int num_points) ++- * \endcode +++ * void volk_32fc_s32f_x2_power_spectral_density_32f(float* logPowerOutput, const +++ * lv_32fc_t* complexFFTInput, const float normalizationFactor, const float rbw, unsigned +++ * int num_points) \endcode ++ * ++ * \b Inputs ++ * \li complexFFTInput The complex data output from the FFT point. ++- * \li normalizationFactor: This value is divided against all the input values before the power is calculated. ++- * \li rbw: The resolution bandwidth of the fft spectrum ++- * \li num_points: The number of fft data points. +++ * \li normalizationFactor: This value is divided against all the input values before the +++ * power is calculated. \li rbw: The resolution bandwidth of the fft spectrum \li +++ * num_points: The number of fft data points. ++ * ++ * \b Outputs ++ * \li logPowerOutput: The 10.0 * log10((r*r + i*i)/RBW) for each data point. ++@@ -55,8 +56,8 @@ ++ #define INCLUDED_volk_32fc_s32f_x2_power_spectral_density_32f_a_H ++ ++ #include ++-#include ++ #include +++#include ++ ++ #ifdef LV_HAVE_AVX ++ #include ++@@ -66,83 +67,84 @@ ++ #endif /* LV_HAVE_LIB_SIMDMATH */ ++ ++ static inline void ++-volk_32fc_s32f_x2_power_spectral_density_32f_a_avx(float* logPowerOutput, const lv_32fc_t* complexFFTInput, ++- const float normalizationFactor, const float rbw, +++volk_32fc_s32f_x2_power_spectral_density_32f_a_avx(float* logPowerOutput, +++ const lv_32fc_t* complexFFTInput, +++ const float normalizationFactor, +++ const float rbw, ++ unsigned int num_points) ++ { ++- const float* inputPtr = (const float*)complexFFTInput; ++- float* destPtr = logPowerOutput; ++- uint64_t number = 0; ++- const float iRBW = 1.0 / rbw; ++- const float iNormalizationFactor = 1.0 / normalizationFactor; +++ const float* inputPtr = (const float*)complexFFTInput; +++ float* destPtr = logPowerOutput; +++ uint64_t number = 0; +++ const float iRBW = 1.0 / rbw; +++ const float iNormalizationFactor = 1.0 / normalizationFactor; ++ ++ #ifdef LV_HAVE_LIB_SIMDMATH ++- __m256 magScalar = _mm256_set1_ps(10.0); ++- magScalar = _mm256_div_ps(magScalar, logf4(magScalar)); +++ __m256 magScalar = _mm256_set1_ps(10.0); +++ magScalar = _mm256_div_ps(magScalar, logf4(magScalar)); ++ ++- __m256 invRBW = _mm256_set1_ps(iRBW); +++ __m256 invRBW = _mm256_set1_ps(iRBW); ++ ++- __m256 invNormalizationFactor = _mm256_set1_ps(iNormalizationFactor); +++ __m256 invNormalizationFactor = _mm256_set1_ps(iNormalizationFactor); ++ ++- __m256 power; ++- __m256 input1, input2; ++- const uint64_t eighthPoints = num_points / 8; ++- for(;number < eighthPoints; number++){ ++- // Load the complex values ++- input1 =_mm256_load_ps(inputPtr); ++- inputPtr += 8; ++- input2 =_mm256_load_ps(inputPtr); ++- inputPtr += 8; +++ __m256 power; +++ __m256 input1, input2; +++ const uint64_t eighthPoints = num_points / 8; +++ for (; number < eighthPoints; number++) { +++ // Load the complex values +++ input1 = _mm256_load_ps(inputPtr); +++ inputPtr += 8; +++ input2 = _mm256_load_ps(inputPtr); +++ inputPtr += 8; ++ ++- // Apply the normalization factor ++- input1 = _mm256_mul_ps(input1, invNormalizationFactor); ++- input2 = _mm256_mul_ps(input2, invNormalizationFactor); +++ // Apply the normalization factor +++ input1 = _mm256_mul_ps(input1, invNormalizationFactor); +++ input2 = _mm256_mul_ps(input2, invNormalizationFactor); ++ ++- // Multiply each 
value by itself ++- // (r1*r1), (i1*i1), (r2*r2), (i2*i2) ++- input1 = _mm256_mul_ps(input1, input1); ++- // (r3*r3), (i3*i3), (r4*r4), (i4*i4) ++- input2 = _mm256_mul_ps(input2, input2); +++ // Multiply each value by itself +++ // (r1*r1), (i1*i1), (r2*r2), (i2*i2) +++ input1 = _mm256_mul_ps(input1, input1); +++ // (r3*r3), (i3*i3), (r4*r4), (i4*i4) +++ input2 = _mm256_mul_ps(input2, input2); ++ ++- // Horizontal add, to add (r*r) + (i*i) for each complex value ++- // (r1*r1)+(i1*i1), (r2*r2) + (i2*i2), (r3*r3)+(i3*i3), (r4*r4)+(i4*i4) ++- inputVal1 = _mm256_permute2f128_ps(input1, input2, 0x20); ++- inputVal2 = _mm256_permute2f128_ps(input1, input2, 0x31); +++ // Horizontal add, to add (r*r) + (i*i) for each complex value +++ // (r1*r1)+(i1*i1), (r2*r2) + (i2*i2), (r3*r3)+(i3*i3), (r4*r4)+(i4*i4) +++ inputVal1 = _mm256_permute2f128_ps(input1, input2, 0x20); +++ inputVal2 = _mm256_permute2f128_ps(input1, input2, 0x31); ++ ++- power = _mm256_hadd_ps(inputVal1, inputVal2); +++ power = _mm256_hadd_ps(inputVal1, inputVal2); ++ ++- // Divide by the rbw ++- power = _mm256_mul_ps(power, invRBW); +++ // Divide by the rbw +++ power = _mm256_mul_ps(power, invRBW); ++ ++- // Calculate the natural log power ++- power = logf4(power); +++ // Calculate the natural log power +++ power = logf4(power); ++ ++- // Convert to log10 and multiply by 10.0 ++- power = _mm256_mul_ps(power, magScalar); +++ // Convert to log10 and multiply by 10.0 +++ power = _mm256_mul_ps(power, magScalar); ++ ++- // Store the floating point results ++- _mm256_store_ps(destPtr, power); +++ // Store the floating point results +++ _mm256_store_ps(destPtr, power); ++ ++- destPtr += 8; ++- } +++ destPtr += 8; +++ } ++ ++- number = eighthPoints*8; +++ number = eighthPoints * 8; ++ #endif /* LV_HAVE_LIB_SIMDMATH */ ++- // Calculate the FFT for any remaining points ++- for(; number < num_points; number++){ ++- // Calculate dBm ++- // 50 ohm load assumption ++- // 10 * log10 (v^2 / (2 * 50.0 * .001)) = 10 * log10( v^2 * 10) ++- // 75 ohm load assumption ++- // 10 * log10 (v^2 / (2 * 75.0 * .001)) = 10 * log10( v^2 * 15) ++- ++- const float real = *inputPtr++ * iNormalizationFactor; ++- const float imag = *inputPtr++ * iNormalizationFactor; ++- ++- *destPtr = 10.0*log10f((((real * real) + (imag * imag)) + 1e-20) * iRBW); ++- destPtr++; ++- } ++- +++ // Calculate the FFT for any remaining points +++ for (; number < num_points; number++) { +++ // Calculate dBm +++ // 50 ohm load assumption +++ // 10 * log10 (v^2 / (2 * 50.0 * .001)) = 10 * log10( v^2 * 10) +++ // 75 ohm load assumption +++ // 10 * log10 (v^2 / (2 * 75.0 * .001)) = 10 * log10( v^2 * 15) +++ +++ const float real = *inputPtr++ * iNormalizationFactor; +++ const float imag = *inputPtr++ * iNormalizationFactor; +++ +++ *destPtr = 10.0 * log10f((((real * real) + (imag * imag)) + 1e-20) * iRBW); +++ destPtr++; +++ } ++ } ++ #endif /* LV_HAVE_AVX */ ++ ++@@ -150,86 +152,86 @@ volk_32fc_s32f_x2_power_spectral_density_32f_a_avx(float* logPowerOutput, const ++ #include ++ ++ ++- ++ #ifdef LV_HAVE_LIB_SIMDMATH ++ #include ++ #endif /* LV_HAVE_LIB_SIMDMATH */ ++ ++ static inline void ++-volk_32fc_s32f_x2_power_spectral_density_32f_a_sse3(float* logPowerOutput, const lv_32fc_t* complexFFTInput, ++- const float normalizationFactor, const float rbw, +++volk_32fc_s32f_x2_power_spectral_density_32f_a_sse3(float* logPowerOutput, +++ const lv_32fc_t* complexFFTInput, +++ const float normalizationFactor, +++ const float rbw, ++ unsigned int num_points) ++ { ++- const float* inputPtr = (const 
float*)complexFFTInput; ++- float* destPtr = logPowerOutput; ++- uint64_t number = 0; ++- const float iRBW = 1.0 / rbw; ++- const float iNormalizationFactor = 1.0 / normalizationFactor; +++ const float* inputPtr = (const float*)complexFFTInput; +++ float* destPtr = logPowerOutput; +++ uint64_t number = 0; +++ const float iRBW = 1.0 / rbw; +++ const float iNormalizationFactor = 1.0 / normalizationFactor; ++ ++ #ifdef LV_HAVE_LIB_SIMDMATH ++- __m128 magScalar = _mm_set_ps1(10.0); ++- magScalar = _mm_div_ps(magScalar, logf4(magScalar)); +++ __m128 magScalar = _mm_set_ps1(10.0); +++ magScalar = _mm_div_ps(magScalar, logf4(magScalar)); ++ ++- __m128 invRBW = _mm_set_ps1(iRBW); +++ __m128 invRBW = _mm_set_ps1(iRBW); ++ ++- __m128 invNormalizationFactor = _mm_set_ps1(iNormalizationFactor); +++ __m128 invNormalizationFactor = _mm_set_ps1(iNormalizationFactor); ++ ++- __m128 power; ++- __m128 input1, input2; ++- const uint64_t quarterPoints = num_points / 4; ++- for(;number < quarterPoints; number++){ ++- // Load the complex values ++- input1 =_mm_load_ps(inputPtr); ++- inputPtr += 4; ++- input2 =_mm_load_ps(inputPtr); ++- inputPtr += 4; +++ __m128 power; +++ __m128 input1, input2; +++ const uint64_t quarterPoints = num_points / 4; +++ for (; number < quarterPoints; number++) { +++ // Load the complex values +++ input1 = _mm_load_ps(inputPtr); +++ inputPtr += 4; +++ input2 = _mm_load_ps(inputPtr); +++ inputPtr += 4; ++ ++- // Apply the normalization factor ++- input1 = _mm_mul_ps(input1, invNormalizationFactor); ++- input2 = _mm_mul_ps(input2, invNormalizationFactor); +++ // Apply the normalization factor +++ input1 = _mm_mul_ps(input1, invNormalizationFactor); +++ input2 = _mm_mul_ps(input2, invNormalizationFactor); ++ ++- // Multiply each value by itself ++- // (r1*r1), (i1*i1), (r2*r2), (i2*i2) ++- input1 = _mm_mul_ps(input1, input1); ++- // (r3*r3), (i3*i3), (r4*r4), (i4*i4) ++- input2 = _mm_mul_ps(input2, input2); +++ // Multiply each value by itself +++ // (r1*r1), (i1*i1), (r2*r2), (i2*i2) +++ input1 = _mm_mul_ps(input1, input1); +++ // (r3*r3), (i3*i3), (r4*r4), (i4*i4) +++ input2 = _mm_mul_ps(input2, input2); ++ ++- // Horizontal add, to add (r*r) + (i*i) for each complex value ++- // (r1*r1)+(i1*i1), (r2*r2) + (i2*i2), (r3*r3)+(i3*i3), (r4*r4)+(i4*i4) ++- power = _mm_hadd_ps(input1, input2); +++ // Horizontal add, to add (r*r) + (i*i) for each complex value +++ // (r1*r1)+(i1*i1), (r2*r2) + (i2*i2), (r3*r3)+(i3*i3), (r4*r4)+(i4*i4) +++ power = _mm_hadd_ps(input1, input2); ++ ++- // Divide by the rbw ++- power = _mm_mul_ps(power, invRBW); +++ // Divide by the rbw +++ power = _mm_mul_ps(power, invRBW); ++ ++- // Calculate the natural log power ++- power = logf4(power); +++ // Calculate the natural log power +++ power = logf4(power); ++ ++- // Convert to log10 and multiply by 10.0 ++- power = _mm_mul_ps(power, magScalar); +++ // Convert to log10 and multiply by 10.0 +++ power = _mm_mul_ps(power, magScalar); ++ ++- // Store the floating point results ++- _mm_store_ps(destPtr, power); +++ // Store the floating point results +++ _mm_store_ps(destPtr, power); ++ ++- destPtr += 4; ++- } +++ destPtr += 4; +++ } ++ ++- number = quarterPoints*4; +++ number = quarterPoints * 4; ++ #endif /* LV_HAVE_LIB_SIMDMATH */ ++- // Calculate the FFT for any remaining points ++- for(; number < num_points; number++){ ++- // Calculate dBm ++- // 50 ohm load assumption ++- // 10 * log10 (v^2 / (2 * 50.0 * .001)) = 10 * log10( v^2 * 10) ++- // 75 ohm load assumption ++- // 10 * log10 (v^2 / (2 * 75.0 * .001)) = 10 * 
log10( v^2 * 15) ++- ++- const float real = *inputPtr++ * iNormalizationFactor; ++- const float imag = *inputPtr++ * iNormalizationFactor; ++- ++- *destPtr = 10.0*log10f((((real * real) + (imag * imag)) + 1e-20) * iRBW); ++- destPtr++; ++- } ++- +++ // Calculate the FFT for any remaining points +++ for (; number < num_points; number++) { +++ // Calculate dBm +++ // 50 ohm load assumption +++ // 10 * log10 (v^2 / (2 * 50.0 * .001)) = 10 * log10( v^2 * 10) +++ // 75 ohm load assumption +++ // 10 * log10 (v^2 / (2 * 75.0 * .001)) = 10 * log10( v^2 * 15) +++ +++ const float real = *inputPtr++ * iNormalizationFactor; +++ const float imag = *inputPtr++ * iNormalizationFactor; +++ +++ *destPtr = 10.0 * log10f((((real * real) + (imag * imag)) + 1e-20) * iRBW); +++ destPtr++; +++ } ++ } ++ #endif /* LV_HAVE_SSE3 */ ++ ++@@ -237,31 +239,34 @@ volk_32fc_s32f_x2_power_spectral_density_32f_a_sse3(float* logPowerOutput, const ++ #ifdef LV_HAVE_GENERIC ++ ++ static inline void ++-volk_32fc_s32f_x2_power_spectral_density_32f_generic(float* logPowerOutput, const lv_32fc_t* complexFFTInput, ++- const float normalizationFactor, const float rbw, +++volk_32fc_s32f_x2_power_spectral_density_32f_generic(float* logPowerOutput, +++ const lv_32fc_t* complexFFTInput, +++ const float normalizationFactor, +++ const float rbw, ++ unsigned int num_points) ++ { ++- // Calculate the Power of the complex point ++- const float* inputPtr = (float*)complexFFTInput; ++- float* realFFTDataPointsPtr = logPowerOutput; ++- unsigned int point; ++- const float invRBW = 1.0 / rbw; ++- const float iNormalizationFactor = 1.0 / normalizationFactor; ++- ++- for(point = 0; point < num_points; point++){ ++- // Calculate dBm ++- // 50 ohm load assumption ++- // 10 * log10 (v^2 / (2 * 50.0 * .001)) = 10 * log10( v^2 * 10) ++- // 75 ohm load assumption ++- // 10 * log10 (v^2 / (2 * 75.0 * .001)) = 10 * log10( v^2 * 15) ++- ++- const float real = *inputPtr++ * iNormalizationFactor; ++- const float imag = *inputPtr++ * iNormalizationFactor; ++- ++- *realFFTDataPointsPtr = 10.0*log10f((((real * real) + (imag * imag)) + 1e-20) * invRBW); ++- ++- realFFTDataPointsPtr++; ++- } +++ // Calculate the Power of the complex point +++ const float* inputPtr = (float*)complexFFTInput; +++ float* realFFTDataPointsPtr = logPowerOutput; +++ unsigned int point; +++ const float invRBW = 1.0 / rbw; +++ const float iNormalizationFactor = 1.0 / normalizationFactor; +++ +++ for (point = 0; point < num_points; point++) { +++ // Calculate dBm +++ // 50 ohm load assumption +++ // 10 * log10 (v^2 / (2 * 50.0 * .001)) = 10 * log10( v^2 * 10) +++ // 75 ohm load assumption +++ // 10 * log10 (v^2 / (2 * 75.0 * .001)) = 10 * log10( v^2 * 15) +++ +++ const float real = *inputPtr++ * iNormalizationFactor; +++ const float imag = *inputPtr++ * iNormalizationFactor; +++ +++ *realFFTDataPointsPtr = +++ 10.0 * log10f((((real * real) + (imag * imag)) + 1e-20) * invRBW); +++ +++ realFFTDataPointsPtr++; +++ } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++diff --git a/kernels/volk/volk_32fc_s32fc_multiply_32fc.h b/kernels/volk/volk_32fc_s32fc_multiply_32fc.h ++index fe416b4..840008a 100644 ++--- a/kernels/volk/volk_32fc_s32fc_multiply_32fc.h +++++ b/kernels/volk/volk_32fc_s32fc_multiply_32fc.h ++@@ -30,8 +30,8 @@ ++ * ++ * Dispatcher Prototype ++ * \code ++- * void volk_32fc_s32fc_multiply_32fc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points); ++- * \endcode +++ * void volk_32fc_s32fc_multiply_32fc(lv_32fc_t* cVector, const lv_32fc_t* aVector, 
const +++ * lv_32fc_t scalar, unsigned int num_points); \endcode ++ * ++ * \b Inputs ++ * \li aVector: The input vector to be multiplied. ++@@ -76,15 +76,19 @@ ++ #ifndef INCLUDED_volk_32fc_s32fc_multiply_32fc_u_H ++ #define INCLUDED_volk_32fc_s32fc_multiply_32fc_u_H ++ +++#include ++ #include ++ #include ++ #include ++-#include ++ ++ #if LV_HAVE_AVX && LV_HAVE_FMA ++ #include ++ ++-static inline void volk_32fc_s32fc_multiply_32fc_u_avx_fma(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){ +++static inline void volk_32fc_s32fc_multiply_32fc_u_avx_fma(lv_32fc_t* cVector, +++ const lv_32fc_t* aVector, +++ const lv_32fc_t scalar, +++ unsigned int num_points) +++{ ++ unsigned int number = 0; ++ unsigned int i = 0; ++ const unsigned int quarterPoints = num_points / 4; ++@@ -97,34 +101,38 @@ static inline void volk_32fc_s32fc_multiply_32fc_u_avx_fma(lv_32fc_t* cVector, c ++ yl = _mm256_set1_ps(lv_creal(scalar)); ++ yh = _mm256_set1_ps(lv_cimag(scalar)); ++ ++- for(;number < quarterPoints; number++){ ++- x = _mm256_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi +++ for (; number < quarterPoints; number++) { +++ x = _mm256_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi ++ ++- tmp1 = x; +++ tmp1 = x; ++ ++- x = _mm256_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br +++ x = _mm256_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br ++ ++- tmp2 = _mm256_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di +++ tmp2 = _mm256_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di ++ ++- z = _mm256_fmaddsub_ps(tmp1, yl,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di +++ z = _mm256_fmaddsub_ps( +++ tmp1, yl, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di ++ ++- _mm256_storeu_ps((float*)c,z); // Store the results back into the C container +++ _mm256_storeu_ps((float*)c, z); // Store the results back into the C container ++ ++- a += 4; ++- c += 4; +++ a += 4; +++ c += 4; ++ } ++ ++- for(i = num_points-isodd; i < num_points; i++) { +++ for (i = num_points - isodd; i < num_points; i++) { ++ *c++ = (*a++) * scalar; ++ } ++- ++ } ++ #endif /* LV_HAVE_AVX && LV_HAVE_FMA */ ++ ++ #ifdef LV_HAVE_AVX ++ #include ++ ++-static inline void volk_32fc_s32fc_multiply_32fc_u_avx(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){ +++static inline void volk_32fc_s32fc_multiply_32fc_u_avx(lv_32fc_t* cVector, +++ const lv_32fc_t* aVector, +++ const lv_32fc_t scalar, +++ unsigned int num_points) +++{ ++ unsigned int number = 0; ++ unsigned int i = 0; ++ const unsigned int quarterPoints = num_points / 4; ++@@ -137,35 +145,39 @@ static inline void volk_32fc_s32fc_multiply_32fc_u_avx(lv_32fc_t* cVector, const ++ yl = _mm256_set1_ps(lv_creal(scalar)); ++ yh = _mm256_set1_ps(lv_cimag(scalar)); ++ ++- for(;number < quarterPoints; number++){ ++- x = _mm256_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi +++ for (; number < quarterPoints; number++) { +++ x = _mm256_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi ++ ++- tmp1 = _mm256_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr +++ tmp1 = _mm256_mul_ps(x, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr ++ ++- x = _mm256_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br +++ x = _mm256_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br ++ ++- tmp2 = _mm256_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di +++ tmp2 = _mm256_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di ++ ++- z = 
_mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di +++ z = _mm256_addsub_ps(tmp1, +++ tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di ++ ++- _mm256_storeu_ps((float*)c,z); // Store the results back into the C container +++ _mm256_storeu_ps((float*)c, z); // Store the results back into the C container ++ ++- a += 4; ++- c += 4; +++ a += 4; +++ c += 4; ++ } ++ ++- for(i = num_points-isodd; i < num_points; i++) { +++ for (i = num_points - isodd; i < num_points; i++) { ++ *c++ = (*a++) * scalar; ++ } ++- ++ } ++ #endif /* LV_HAVE_AVX */ ++ ++ #ifdef LV_HAVE_SSE3 ++ #include ++ ++-static inline void volk_32fc_s32fc_multiply_32fc_u_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){ ++- unsigned int number = 0; +++static inline void volk_32fc_s32fc_multiply_32fc_u_sse3(lv_32fc_t* cVector, +++ const lv_32fc_t* aVector, +++ const lv_32fc_t scalar, +++ unsigned int num_points) +++{ +++ unsigned int number = 0; ++ const unsigned int halfPoints = num_points / 2; ++ ++ __m128 x, yl, yh, z, tmp1, tmp2; ++@@ -176,53 +188,58 @@ static inline void volk_32fc_s32fc_multiply_32fc_u_sse3(lv_32fc_t* cVector, cons ++ yl = _mm_set_ps1(lv_creal(scalar)); ++ yh = _mm_set_ps1(lv_cimag(scalar)); ++ ++- for(;number < halfPoints; number++){ +++ for (; number < halfPoints; number++) { ++ ++- x = _mm_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi +++ x = _mm_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi ++ ++- tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr +++ tmp1 = _mm_mul_ps(x, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr ++ ++- x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br +++ x = _mm_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br ++ ++- tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di +++ tmp2 = _mm_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di ++ ++- z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di +++ z = _mm_addsub_ps(tmp1, +++ tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di ++ ++- _mm_storeu_ps((float*)c,z); // Store the results back into the C container +++ _mm_storeu_ps((float*)c, z); // Store the results back into the C container ++ ++- a += 2; ++- c += 2; +++ a += 2; +++ c += 2; ++ } ++ ++- if((num_points % 2) != 0) { ++- *c = (*a) * scalar; +++ if ((num_points % 2) != 0) { +++ *c = (*a) * scalar; ++ } ++ } ++ #endif /* LV_HAVE_SSE */ ++ ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void volk_32fc_s32fc_multiply_32fc_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){ +++static inline void volk_32fc_s32fc_multiply_32fc_generic(lv_32fc_t* cVector, +++ const lv_32fc_t* aVector, +++ const lv_32fc_t scalar, +++ unsigned int num_points) +++{ ++ lv_32fc_t* cPtr = cVector; ++ const lv_32fc_t* aPtr = aVector; ++ unsigned int number = num_points; ++ ++ // unwrap loop ++- while (number >= 8){ ++- *cPtr++ = (*aPtr++) * scalar; ++- *cPtr++ = (*aPtr++) * scalar; ++- *cPtr++ = (*aPtr++) * scalar; ++- *cPtr++ = (*aPtr++) * scalar; ++- *cPtr++ = (*aPtr++) * scalar; ++- *cPtr++ = (*aPtr++) * scalar; ++- *cPtr++ = (*aPtr++) * scalar; ++- *cPtr++ = (*aPtr++) * scalar; ++- number -= 8; +++ while (number >= 8) { +++ *cPtr++ = (*aPtr++) * scalar; +++ *cPtr++ = (*aPtr++) * scalar; +++ *cPtr++ = (*aPtr++) * scalar; +++ *cPtr++ = (*aPtr++) * scalar; +++ *cPtr++ = (*aPtr++) * scalar; +++ *cPtr++ = (*aPtr++) * scalar; +++ *cPtr++ = (*aPtr++) * scalar; +++ 
*cPtr++ = (*aPtr++) * scalar; +++ number -= 8; ++ } ++ ++ // clean up any remaining ++ while (number-- > 0) ++- *cPtr++ = *aPtr++ * scalar; +++ *cPtr++ = *aPtr++ * scalar; ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++@@ -231,15 +248,19 @@ static inline void volk_32fc_s32fc_multiply_32fc_generic(lv_32fc_t* cVector, con ++ #ifndef INCLUDED_volk_32fc_s32fc_multiply_32fc_a_H ++ #define INCLUDED_volk_32fc_s32fc_multiply_32fc_a_H ++ +++#include ++ #include ++ #include ++ #include ++-#include ++ ++ #if LV_HAVE_AVX && LV_HAVE_FMA ++ #include ++ ++-static inline void volk_32fc_s32fc_multiply_32fc_a_avx_fma(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){ +++static inline void volk_32fc_s32fc_multiply_32fc_a_avx_fma(lv_32fc_t* cVector, +++ const lv_32fc_t* aVector, +++ const lv_32fc_t scalar, +++ unsigned int num_points) +++{ ++ unsigned int number = 0; ++ unsigned int i = 0; ++ const unsigned int quarterPoints = num_points / 4; ++@@ -252,27 +273,27 @@ static inline void volk_32fc_s32fc_multiply_32fc_a_avx_fma(lv_32fc_t* cVector, c ++ yl = _mm256_set1_ps(lv_creal(scalar)); ++ yh = _mm256_set1_ps(lv_cimag(scalar)); ++ ++- for(;number < quarterPoints; number++){ ++- x = _mm256_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi +++ for (; number < quarterPoints; number++) { +++ x = _mm256_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi ++ ++- tmp1 = x; +++ tmp1 = x; ++ ++- x = _mm256_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br +++ x = _mm256_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br ++ ++- tmp2 = _mm256_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di +++ tmp2 = _mm256_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di ++ ++- z = _mm256_fmaddsub_ps(tmp1, yl,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di +++ z = _mm256_fmaddsub_ps( +++ tmp1, yl, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di ++ ++- _mm256_store_ps((float*)c,z); // Store the results back into the C container +++ _mm256_store_ps((float*)c, z); // Store the results back into the C container ++ ++- a += 4; ++- c += 4; +++ a += 4; +++ c += 4; ++ } ++ ++- for(i = num_points-isodd; i < num_points; i++) { +++ for (i = num_points - isodd; i < num_points; i++) { ++ *c++ = (*a++) * scalar; ++ } ++- ++ } ++ #endif /* LV_HAVE_AVX && LV_HAVE_FMA */ ++ ++@@ -280,7 +301,11 @@ static inline void volk_32fc_s32fc_multiply_32fc_a_avx_fma(lv_32fc_t* cVector, c ++ #ifdef LV_HAVE_AVX ++ #include ++ ++-static inline void volk_32fc_s32fc_multiply_32fc_a_avx(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){ +++static inline void volk_32fc_s32fc_multiply_32fc_a_avx(lv_32fc_t* cVector, +++ const lv_32fc_t* aVector, +++ const lv_32fc_t scalar, +++ unsigned int num_points) +++{ ++ unsigned int number = 0; ++ unsigned int i = 0; ++ const unsigned int quarterPoints = num_points / 4; ++@@ -293,35 +318,39 @@ static inline void volk_32fc_s32fc_multiply_32fc_a_avx(lv_32fc_t* cVector, const ++ yl = _mm256_set1_ps(lv_creal(scalar)); ++ yh = _mm256_set1_ps(lv_cimag(scalar)); ++ ++- for(;number < quarterPoints; number++){ ++- x = _mm256_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi +++ for (; number < quarterPoints; number++) { +++ x = _mm256_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi ++ ++- tmp1 = _mm256_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr +++ tmp1 = _mm256_mul_ps(x, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr ++ ++- x = _mm256_shuffle_ps(x,x,0xB1); // 
Re-arrange x to be ai,ar,bi,br +++ x = _mm256_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br ++ ++- tmp2 = _mm256_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di +++ tmp2 = _mm256_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di ++ ++- z = _mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di +++ z = _mm256_addsub_ps(tmp1, +++ tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di ++ ++- _mm256_store_ps((float*)c,z); // Store the results back into the C container +++ _mm256_store_ps((float*)c, z); // Store the results back into the C container ++ ++- a += 4; ++- c += 4; +++ a += 4; +++ c += 4; ++ } ++ ++- for(i = num_points-isodd; i < num_points; i++) { +++ for (i = num_points - isodd; i < num_points; i++) { ++ *c++ = (*a++) * scalar; ++ } ++- ++ } ++ #endif /* LV_HAVE_AVX */ ++ ++ #ifdef LV_HAVE_SSE3 ++ #include ++ ++-static inline void volk_32fc_s32fc_multiply_32fc_a_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){ ++- unsigned int number = 0; +++static inline void volk_32fc_s32fc_multiply_32fc_a_sse3(lv_32fc_t* cVector, +++ const lv_32fc_t* aVector, +++ const lv_32fc_t scalar, +++ unsigned int num_points) +++{ +++ unsigned int number = 0; ++ const unsigned int halfPoints = num_points / 2; ++ ++ __m128 x, yl, yh, z, tmp1, tmp2; ++@@ -332,26 +361,27 @@ static inline void volk_32fc_s32fc_multiply_32fc_a_sse3(lv_32fc_t* cVector, cons ++ yl = _mm_set_ps1(lv_creal(scalar)); ++ yh = _mm_set_ps1(lv_cimag(scalar)); ++ ++- for(;number < halfPoints; number++){ +++ for (; number < halfPoints; number++) { ++ ++- x = _mm_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi +++ x = _mm_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi ++ ++- tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr +++ tmp1 = _mm_mul_ps(x, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr ++ ++- x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br +++ x = _mm_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br ++ ++- tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di +++ tmp2 = _mm_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di ++ ++- z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di +++ z = _mm_addsub_ps(tmp1, +++ tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di ++ ++- _mm_store_ps((float*)c,z); // Store the results back into the C container +++ _mm_store_ps((float*)c, z); // Store the results back into the C container ++ ++- a += 2; ++- c += 2; +++ a += 2; +++ c += 2; ++ } ++ ++- if((num_points % 2) != 0) { ++- *c = (*a) * scalar; +++ if ((num_points % 2) != 0) { +++ *c = (*a) * scalar; ++ } ++ } ++ #endif /* LV_HAVE_SSE */ ++@@ -359,7 +389,11 @@ static inline void volk_32fc_s32fc_multiply_32fc_a_sse3(lv_32fc_t* cVector, cons ++ #ifdef LV_HAVE_NEON ++ #include ++ ++-static inline void volk_32fc_s32fc_multiply_32fc_neon(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){ +++static inline void volk_32fc_s32fc_multiply_32fc_neon(lv_32fc_t* cVector, +++ const lv_32fc_t* aVector, +++ const lv_32fc_t scalar, +++ unsigned int num_points) +++{ ++ lv_32fc_t* cPtr = cVector; ++ const lv_32fc_t* aPtr = aVector; ++ unsigned int number = num_points; ++@@ -370,7 +404,7 @@ static inline void volk_32fc_s32fc_multiply_32fc_neon(lv_32fc_t* cVector, const ++ ++ scalar_val.val[0] = vld1q_dup_f32((const float*)&scalar); ++ scalar_val.val[1] = vld1q_dup_f32(((const float*)&scalar) + 1); ++- for(number = 0; 
number < quarter_points; ++number) { +++ for (number = 0; number < quarter_points; ++number) { ++ a_val = vld2q_f32((float*)aPtr); ++ tmp_imag.val[1] = vmulq_f32(a_val.val[1], scalar_val.val[0]); ++ tmp_imag.val[0] = vmulq_f32(a_val.val[0], scalar_val.val[0]); ++@@ -383,35 +417,39 @@ static inline void volk_32fc_s32fc_multiply_32fc_neon(lv_32fc_t* cVector, const ++ cPtr += 4; ++ } ++ ++- for(number = quarter_points*4; number < num_points; number++){ ++- *cPtr++ = *aPtr++ * scalar; +++ for (number = quarter_points * 4; number < num_points; number++) { +++ *cPtr++ = *aPtr++ * scalar; ++ } ++ } ++ #endif /* LV_HAVE_NEON */ ++ ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void volk_32fc_s32fc_multiply_32fc_a_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){ +++static inline void volk_32fc_s32fc_multiply_32fc_a_generic(lv_32fc_t* cVector, +++ const lv_32fc_t* aVector, +++ const lv_32fc_t scalar, +++ unsigned int num_points) +++{ ++ lv_32fc_t* cPtr = cVector; ++ const lv_32fc_t* aPtr = aVector; ++ unsigned int number = num_points; ++ ++ // unwrap loop ++- while (number >= 8){ ++- *cPtr++ = (*aPtr++) * scalar; ++- *cPtr++ = (*aPtr++) * scalar; ++- *cPtr++ = (*aPtr++) * scalar; ++- *cPtr++ = (*aPtr++) * scalar; ++- *cPtr++ = (*aPtr++) * scalar; ++- *cPtr++ = (*aPtr++) * scalar; ++- *cPtr++ = (*aPtr++) * scalar; ++- *cPtr++ = (*aPtr++) * scalar; ++- number -= 8; +++ while (number >= 8) { +++ *cPtr++ = (*aPtr++) * scalar; +++ *cPtr++ = (*aPtr++) * scalar; +++ *cPtr++ = (*aPtr++) * scalar; +++ *cPtr++ = (*aPtr++) * scalar; +++ *cPtr++ = (*aPtr++) * scalar; +++ *cPtr++ = (*aPtr++) * scalar; +++ *cPtr++ = (*aPtr++) * scalar; +++ *cPtr++ = (*aPtr++) * scalar; +++ number -= 8; ++ } ++ ++ // clean up any remaining ++ while (number-- > 0) ++- *cPtr++ = *aPtr++ * scalar; +++ *cPtr++ = *aPtr++ * scalar; ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++diff --git a/kernels/volk/volk_32fc_s32fc_rotatorpuppet_32fc.h b/kernels/volk/volk_32fc_s32fc_rotatorpuppet_32fc.h ++index 181abc5..eba98fe 100644 ++--- a/kernels/volk/volk_32fc_s32fc_rotatorpuppet_32fc.h +++++ b/kernels/volk/volk_32fc_s32fc_rotatorpuppet_32fc.h ++@@ -25,19 +25,24 @@ ++ #define INCLUDED_volk_32fc_s32fc_rotatorpuppet_32fc_a_H ++ ++ ++-#include ++ #include ++ #include +++#include ++ ++ ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void volk_32fc_s32fc_rotatorpuppet_32fc_generic(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, unsigned int num_points){ ++- lv_32fc_t phase[1] = {lv_cmake(.3, 0.95393)}; +++static inline void volk_32fc_s32fc_rotatorpuppet_32fc_generic(lv_32fc_t* outVector, +++ const lv_32fc_t* inVector, +++ const lv_32fc_t phase_inc, +++ unsigned int num_points) +++{ +++ lv_32fc_t phase[1] = { lv_cmake(.3, 0.95393) }; ++ (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase)); ++- const lv_32fc_t phase_inc_n = phase_inc / hypotf(lv_creal(phase_inc), lv_cimag(phase_inc)); ++- volk_32fc_s32fc_x2_rotator_32fc_generic(outVector, inVector, phase_inc_n, phase, num_points); ++- +++ const lv_32fc_t phase_inc_n = +++ phase_inc / hypotf(lv_creal(phase_inc), lv_cimag(phase_inc)); +++ volk_32fc_s32fc_x2_rotator_32fc_generic( +++ outVector, inVector, phase_inc_n, phase, num_points); ++ } ++ ++ #endif /* LV_HAVE_GENERIC */ ++@@ -47,12 +52,17 @@ static inline void volk_32fc_s32fc_rotatorpuppet_32fc_generic(lv_32fc_t* outVect ++ #include ++ #include ++ ++-static inline void volk_32fc_s32fc_rotatorpuppet_32fc_neon(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t 
phase_inc, unsigned int num_points){ ++- lv_32fc_t phase[1] = {lv_cmake(.3, 0.95393)}; +++static inline void volk_32fc_s32fc_rotatorpuppet_32fc_neon(lv_32fc_t* outVector, +++ const lv_32fc_t* inVector, +++ const lv_32fc_t phase_inc, +++ unsigned int num_points) +++{ +++ lv_32fc_t phase[1] = { lv_cmake(.3, 0.95393) }; ++ (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase)); ++- const lv_32fc_t phase_inc_n = phase_inc / hypotf(lv_creal(phase_inc), lv_cimag(phase_inc)); ++- volk_32fc_s32fc_x2_rotator_32fc_neon(outVector, inVector, phase_inc_n, phase, num_points); ++- +++ const lv_32fc_t phase_inc_n = +++ phase_inc / hypotf(lv_creal(phase_inc), lv_cimag(phase_inc)); +++ volk_32fc_s32fc_x2_rotator_32fc_neon( +++ outVector, inVector, phase_inc_n, phase, num_points); ++ } ++ ++ #endif /* LV_HAVE_NEON */ ++@@ -61,12 +71,17 @@ static inline void volk_32fc_s32fc_rotatorpuppet_32fc_neon(lv_32fc_t* outVector, ++ #ifdef LV_HAVE_SSE4_1 ++ #include ++ ++-static inline void volk_32fc_s32fc_rotatorpuppet_32fc_a_sse4_1(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, unsigned int num_points){ ++- lv_32fc_t phase[1] = {lv_cmake(.3, .95393)}; +++static inline void volk_32fc_s32fc_rotatorpuppet_32fc_a_sse4_1(lv_32fc_t* outVector, +++ const lv_32fc_t* inVector, +++ const lv_32fc_t phase_inc, +++ unsigned int num_points) +++{ +++ lv_32fc_t phase[1] = { lv_cmake(.3, .95393) }; ++ (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase)); ++- const lv_32fc_t phase_inc_n = phase_inc / hypotf(lv_creal(phase_inc), lv_cimag(phase_inc)); ++- volk_32fc_s32fc_x2_rotator_32fc_a_sse4_1(outVector, inVector, phase_inc_n, phase, num_points); ++- +++ const lv_32fc_t phase_inc_n = +++ phase_inc / hypotf(lv_creal(phase_inc), lv_cimag(phase_inc)); +++ volk_32fc_s32fc_x2_rotator_32fc_a_sse4_1( +++ outVector, inVector, phase_inc_n, phase, num_points); ++ } ++ ++ #endif /* LV_HAVE_SSE4_1 */ ++@@ -74,12 +89,17 @@ static inline void volk_32fc_s32fc_rotatorpuppet_32fc_a_sse4_1(lv_32fc_t* outVec ++ ++ #ifdef LV_HAVE_SSE4_1 ++ #include ++-static inline void volk_32fc_s32fc_rotatorpuppet_32fc_u_sse4_1(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, unsigned int num_points){ ++- lv_32fc_t phase[1] = {lv_cmake(.3, .95393)}; +++static inline void volk_32fc_s32fc_rotatorpuppet_32fc_u_sse4_1(lv_32fc_t* outVector, +++ const lv_32fc_t* inVector, +++ const lv_32fc_t phase_inc, +++ unsigned int num_points) +++{ +++ lv_32fc_t phase[1] = { lv_cmake(.3, .95393) }; ++ (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase)); ++- const lv_32fc_t phase_inc_n = phase_inc / hypotf(lv_creal(phase_inc), lv_cimag(phase_inc)); ++- volk_32fc_s32fc_x2_rotator_32fc_u_sse4_1(outVector, inVector, phase_inc_n, phase, num_points); ++- +++ const lv_32fc_t phase_inc_n = +++ phase_inc / hypotf(lv_creal(phase_inc), lv_cimag(phase_inc)); +++ volk_32fc_s32fc_x2_rotator_32fc_u_sse4_1( +++ outVector, inVector, phase_inc_n, phase, num_points); ++ } ++ ++ #endif /* LV_HAVE_SSE4_1 */ ++@@ -88,11 +108,17 @@ static inline void volk_32fc_s32fc_rotatorpuppet_32fc_u_sse4_1(lv_32fc_t* outVec ++ #ifdef LV_HAVE_AVX ++ #include ++ ++-static inline void volk_32fc_s32fc_rotatorpuppet_32fc_a_avx(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, unsigned int num_points){ ++- lv_32fc_t phase[1] = {lv_cmake(.3, .95393)}; +++static inline void volk_32fc_s32fc_rotatorpuppet_32fc_a_avx(lv_32fc_t* outVector, +++ const lv_32fc_t* inVector, +++ const lv_32fc_t phase_inc, +++ unsigned int num_points) +++{ +++ lv_32fc_t 
phase[1] = { lv_cmake(.3, .95393) }; ++ (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase)); ++- const lv_32fc_t phase_inc_n = phase_inc / hypotf(lv_creal(phase_inc), lv_cimag(phase_inc)); ++- volk_32fc_s32fc_x2_rotator_32fc_a_avx(outVector, inVector, phase_inc_n, phase, num_points); +++ const lv_32fc_t phase_inc_n = +++ phase_inc / hypotf(lv_creal(phase_inc), lv_cimag(phase_inc)); +++ volk_32fc_s32fc_x2_rotator_32fc_a_avx( +++ outVector, inVector, phase_inc_n, phase, num_points); ++ } ++ ++ #endif /* LV_HAVE_AVX */ ++@@ -101,11 +127,17 @@ static inline void volk_32fc_s32fc_rotatorpuppet_32fc_a_avx(lv_32fc_t* outVector ++ #ifdef LV_HAVE_AVX ++ #include ++ ++-static inline void volk_32fc_s32fc_rotatorpuppet_32fc_u_avx(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, unsigned int num_points){ ++- lv_32fc_t phase[1] = {lv_cmake(.3, .95393)}; +++static inline void volk_32fc_s32fc_rotatorpuppet_32fc_u_avx(lv_32fc_t* outVector, +++ const lv_32fc_t* inVector, +++ const lv_32fc_t phase_inc, +++ unsigned int num_points) +++{ +++ lv_32fc_t phase[1] = { lv_cmake(.3, .95393) }; ++ (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase)); ++- const lv_32fc_t phase_inc_n = phase_inc / hypotf(lv_creal(phase_inc), lv_cimag(phase_inc)); ++- volk_32fc_s32fc_x2_rotator_32fc_u_avx(outVector, inVector, phase_inc_n, phase, num_points); +++ const lv_32fc_t phase_inc_n = +++ phase_inc / hypotf(lv_creal(phase_inc), lv_cimag(phase_inc)); +++ volk_32fc_s32fc_x2_rotator_32fc_u_avx( +++ outVector, inVector, phase_inc_n, phase, num_points); ++ } ++ ++ #endif /* LV_HAVE_AVX */ ++@@ -113,11 +145,17 @@ static inline void volk_32fc_s32fc_rotatorpuppet_32fc_u_avx(lv_32fc_t* outVector ++ #if LV_HAVE_AVX && LV_HAVE_FMA ++ #include ++ ++-static inline void volk_32fc_s32fc_rotatorpuppet_32fc_a_avx_fma(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, unsigned int num_points){ ++- lv_32fc_t phase[1] = {lv_cmake(.3, .95393)}; +++static inline void volk_32fc_s32fc_rotatorpuppet_32fc_a_avx_fma(lv_32fc_t* outVector, +++ const lv_32fc_t* inVector, +++ const lv_32fc_t phase_inc, +++ unsigned int num_points) +++{ +++ lv_32fc_t phase[1] = { lv_cmake(.3, .95393) }; ++ (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase)); ++- const lv_32fc_t phase_inc_n = phase_inc / hypotf(lv_creal(phase_inc), lv_cimag(phase_inc)); ++- volk_32fc_s32fc_x2_rotator_32fc_a_avx_fma(outVector, inVector, phase_inc_n, phase, num_points); +++ const lv_32fc_t phase_inc_n = +++ phase_inc / hypotf(lv_creal(phase_inc), lv_cimag(phase_inc)); +++ volk_32fc_s32fc_x2_rotator_32fc_a_avx_fma( +++ outVector, inVector, phase_inc_n, phase, num_points); ++ } ++ ++ #endif /* LV_HAVE_AVX && LV_HAVE_FMA*/ ++@@ -126,11 +164,17 @@ static inline void volk_32fc_s32fc_rotatorpuppet_32fc_a_avx_fma(lv_32fc_t* outVe ++ #if LV_HAVE_AVX && LV_HAVE_FMA ++ #include ++ ++-static inline void volk_32fc_s32fc_rotatorpuppet_32fc_u_avx_fma(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, unsigned int num_points){ ++- lv_32fc_t phase[1] = {lv_cmake(.3, .95393)}; +++static inline void volk_32fc_s32fc_rotatorpuppet_32fc_u_avx_fma(lv_32fc_t* outVector, +++ const lv_32fc_t* inVector, +++ const lv_32fc_t phase_inc, +++ unsigned int num_points) +++{ +++ lv_32fc_t phase[1] = { lv_cmake(.3, .95393) }; ++ (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase)); ++- const lv_32fc_t phase_inc_n = phase_inc / hypotf(lv_creal(phase_inc), lv_cimag(phase_inc)); ++- volk_32fc_s32fc_x2_rotator_32fc_u_avx_fma(outVector, inVector, 
phase_inc_n, phase, num_points); +++ const lv_32fc_t phase_inc_n = +++ phase_inc / hypotf(lv_creal(phase_inc), lv_cimag(phase_inc)); +++ volk_32fc_s32fc_x2_rotator_32fc_u_avx_fma( +++ outVector, inVector, phase_inc_n, phase, num_points); ++ } ++ ++ #endif /* LV_HAVE_AVX && LV_HAVE_FMA*/ ++diff --git a/kernels/volk/volk_32fc_s32fc_x2_rotator_32fc.h b/kernels/volk/volk_32fc_s32fc_x2_rotator_32fc.h ++index a886458..c97b8cb 100644 ++--- a/kernels/volk/volk_32fc_s32fc_x2_rotator_32fc.h +++++ b/kernels/volk/volk_32fc_s32fc_x2_rotator_32fc.h ++@@ -30,14 +30,15 @@ ++ * ++ * Dispatcher Prototype ++ * \code ++- * void volk_32fc_s32fc_x2_rotator_32fc(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, lv_32fc_t* phase, unsigned int num_points) ++- * \endcode +++ * void volk_32fc_s32fc_x2_rotator_32fc(lv_32fc_t* outVector, const lv_32fc_t* inVector, +++ * const lv_32fc_t phase_inc, lv_32fc_t* phase, unsigned int num_points) \endcode ++ * ++ * \b Inputs ++ * \li inVector: Vector to be rotated. ++ * \li phase_inc: rotational velocity. ++ * \li phase: initial phase offset. ++- * \li num_points: The number of values in inVector to be rotated and stored into outVector. +++ * \li num_points: The number of values in inVector to be rotated and stored into +++ * outVector. ++ * ++ * \b Outputs ++ * \li outVector: The vector where the results will be stored. ++@@ -81,31 +82,36 @@ ++ #define INCLUDED_volk_32fc_s32fc_rotator_32fc_a_H ++ ++ ++-#include +++#include ++ #include ++ #include ++-#include +++#include ++ #define ROTATOR_RELOAD 512 ++ ++ ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void volk_32fc_s32fc_x2_rotator_32fc_generic(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, lv_32fc_t* phase, unsigned int num_points){ +++static inline void volk_32fc_s32fc_x2_rotator_32fc_generic(lv_32fc_t* outVector, +++ const lv_32fc_t* inVector, +++ const lv_32fc_t phase_inc, +++ lv_32fc_t* phase, +++ unsigned int num_points) +++{ ++ unsigned int i = 0; ++ int j = 0; ++- for(i = 0; i < (unsigned int)(num_points/ROTATOR_RELOAD); ++i) { ++- for(j = 0; j < ROTATOR_RELOAD; ++j) { +++ for (i = 0; i < (unsigned int)(num_points / ROTATOR_RELOAD); ++i) { +++ for (j = 0; j < ROTATOR_RELOAD; ++j) { ++ *outVector++ = *inVector++ * (*phase); ++ (*phase) *= phase_inc; ++ } ++ ++ (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase)); ++ } ++- for(i = 0; i < num_points%ROTATOR_RELOAD; ++i) { +++ for (i = 0; i < num_points % ROTATOR_RELOAD; ++i) { ++ *outVector++ = *inVector++ * (*phase); ++ (*phase) *= phase_inc; ++ } ++- if(i){ +++ if (i) { ++ // Make sure, we normalize phase on every call! 
++ (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase)); ++ } ++@@ -118,43 +124,47 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_generic(lv_32fc_t* outVector, ++ #include ++ #include ++ ++-static inline void volk_32fc_s32fc_x2_rotator_32fc_neon(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, lv_32fc_t* phase, unsigned int num_points) +++static inline void volk_32fc_s32fc_x2_rotator_32fc_neon(lv_32fc_t* outVector, +++ const lv_32fc_t* inVector, +++ const lv_32fc_t phase_inc, +++ lv_32fc_t* phase, +++ unsigned int num_points) ++ ++ { ++ lv_32fc_t* outputVectorPtr = outVector; ++ const lv_32fc_t* inputVectorPtr = inVector; ++ lv_32fc_t incr = 1; ++- lv_32fc_t phasePtr[4] = {(*phase), (*phase), (*phase), (*phase)}; +++ lv_32fc_t phasePtr[4] = { (*phase), (*phase), (*phase), (*phase) }; ++ float32x4x2_t input_vec; ++ float32x4x2_t output_vec; ++- +++ ++ unsigned int i = 0, j = 0; ++ const unsigned int quarter_points = num_points / 4; ++- ++- for(i = 0; i < 4; ++i) { +++ +++ for (i = 0; i < 4; ++i) { ++ phasePtr[i] *= incr; ++ incr *= (phase_inc); ++ } ++- +++ ++ // Notice that incr has be incremented in the previous loop ++- const lv_32fc_t incrPtr[4] = {incr, incr, incr, incr}; ++- const float32x4x2_t incr_vec = vld2q_f32((float*) incrPtr); ++- float32x4x2_t phase_vec = vld2q_f32((float*) phasePtr); ++- ++- for(i = 0; i < (unsigned int)(quarter_points/ROTATOR_RELOAD); i++) { ++- for(j = 0; j < ROTATOR_RELOAD; j++) { ++- input_vec = vld2q_f32((float*) inputVectorPtr); +++ const lv_32fc_t incrPtr[4] = { incr, incr, incr, incr }; +++ const float32x4x2_t incr_vec = vld2q_f32((float*)incrPtr); +++ float32x4x2_t phase_vec = vld2q_f32((float*)phasePtr); +++ +++ for (i = 0; i < (unsigned int)(quarter_points / ROTATOR_RELOAD); i++) { +++ for (j = 0; j < ROTATOR_RELOAD; j++) { +++ input_vec = vld2q_f32((float*)inputVectorPtr); ++ // Prefetch next one, speeds things up ++- __VOLK_PREFETCH(inputVectorPtr+4); +++ __VOLK_PREFETCH(inputVectorPtr + 4); ++ // Rotate ++ output_vec = _vmultiply_complexq_f32(input_vec, phase_vec); ++ // Increase phase ++ phase_vec = _vmultiply_complexq_f32(phase_vec, incr_vec); ++ // Store output ++ vst2q_f32((float*)outputVectorPtr, output_vec); ++- ++- outputVectorPtr+=4; ++- inputVectorPtr+=4; +++ +++ outputVectorPtr += 4; +++ inputVectorPtr += 4; ++ } ++ // normalize phase so magnitude doesn't grow because of ++ // floating point rounding error ++@@ -164,20 +174,20 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_neon(lv_32fc_t* outVector, co ++ phase_vec.val[0] = vmulq_f32(phase_vec.val[0], inv_mag); ++ phase_vec.val[1] = vmulq_f32(phase_vec.val[1], inv_mag); ++ } ++- ++- for(i = 0; i < quarter_points % ROTATOR_RELOAD; i++) { ++- input_vec = vld2q_f32((float*) inputVectorPtr); +++ +++ for (i = 0; i < quarter_points % ROTATOR_RELOAD; i++) { +++ input_vec = vld2q_f32((float*)inputVectorPtr); ++ // Prefetch next one, speeds things up ++- __VOLK_PREFETCH(inputVectorPtr+4); +++ __VOLK_PREFETCH(inputVectorPtr + 4); ++ // Rotate ++ output_vec = _vmultiply_complexq_f32(input_vec, phase_vec); ++ // Increase phase ++ phase_vec = _vmultiply_complexq_f32(phase_vec, incr_vec); ++ // Store output ++ vst2q_f32((float*)outputVectorPtr, output_vec); ++- ++- outputVectorPtr+=4; ++- inputVectorPtr+=4; +++ +++ outputVectorPtr += 4; +++ inputVectorPtr += 4; ++ } ++ // if(i) == true means we looped above ++ if (i) { ++@@ -191,13 +201,13 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_neon(lv_32fc_t* outVector, co ++ } ++ // Store current phase ++ 
vst2q_f32((float*)phasePtr, phase_vec); ++- +++ ++ // Deal with the rest ++- for(i = 0; i < num_points % 4; i++) { +++ for (i = 0; i < num_points % 4; i++) { ++ *outputVectorPtr++ = *inputVectorPtr++ * phasePtr[0]; ++ phasePtr[0] *= (phase_inc); ++ } ++- +++ ++ // For continious phase next time we need to call this function ++ (*phase) = phasePtr[0]; ++ } ++@@ -208,15 +218,20 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_neon(lv_32fc_t* outVector, co ++ #ifdef LV_HAVE_SSE4_1 ++ #include ++ ++-static inline void volk_32fc_s32fc_x2_rotator_32fc_a_sse4_1(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, lv_32fc_t* phase, unsigned int num_points){ +++static inline void volk_32fc_s32fc_x2_rotator_32fc_a_sse4_1(lv_32fc_t* outVector, +++ const lv_32fc_t* inVector, +++ const lv_32fc_t phase_inc, +++ lv_32fc_t* phase, +++ unsigned int num_points) +++{ ++ lv_32fc_t* cPtr = outVector; ++ const lv_32fc_t* aPtr = inVector; ++ lv_32fc_t incr = 1; ++- lv_32fc_t phase_Ptr[2] = {(*phase), (*phase)}; +++ lv_32fc_t phase_Ptr[2] = { (*phase), (*phase) }; ++ ++ unsigned int i, j = 0; ++ ++- for(i = 0; i < 2; ++i) { +++ for (i = 0; i < 2; ++i) { ++ phase_Ptr[i] *= incr; ++ incr *= (phase_inc); ++ } ++@@ -227,13 +242,13 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_a_sse4_1(lv_32fc_t* outVector ++ __m128 aVal, phase_Val, inc_Val, yl, yh, tmp1, tmp2, z, ylp, yhp, tmp1p, tmp2p; ++ ++ phase_Val = _mm_loadu_ps((float*)phase_Ptr); ++- inc_Val = _mm_set_ps(lv_cimag(incr), lv_creal(incr),lv_cimag(incr), lv_creal(incr)); +++ inc_Val = _mm_set_ps(lv_cimag(incr), lv_creal(incr), lv_cimag(incr), lv_creal(incr)); ++ ++ const unsigned int halfPoints = num_points / 2; ++ ++ ++- for(i = 0; i < (unsigned int)(halfPoints/ROTATOR_RELOAD); i++) { ++- for(j = 0; j < ROTATOR_RELOAD; ++j) { +++ for (i = 0; i < (unsigned int)(halfPoints / ROTATOR_RELOAD); i++) { +++ for (j = 0; j < ROTATOR_RELOAD; ++j) { ++ ++ aVal = _mm_load_ps((float*)aPtr); ++ ++@@ -264,7 +279,7 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_a_sse4_1(lv_32fc_t* outVector ++ tmp2 = _mm_sqrt_ps(tmp1); ++ phase_Val = _mm_div_ps(phase_Val, tmp2); ++ } ++- for(i = 0; i < halfPoints%ROTATOR_RELOAD; ++i) { +++ for (i = 0; i < halfPoints % ROTATOR_RELOAD; ++i) { ++ aVal = _mm_load_ps((float*)aPtr); ++ ++ yl = _mm_moveldup_ps(phase_Val); ++@@ -304,7 +319,6 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_a_sse4_1(lv_32fc_t* outVector ++ } ++ ++ (*phase) = phase_Ptr[0]; ++- ++ } ++ ++ #endif /* LV_HAVE_SSE4_1 for aligned */ ++@@ -313,15 +327,20 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_a_sse4_1(lv_32fc_t* outVector ++ #ifdef LV_HAVE_SSE4_1 ++ #include ++ ++-static inline void volk_32fc_s32fc_x2_rotator_32fc_u_sse4_1(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, lv_32fc_t* phase, unsigned int num_points){ +++static inline void volk_32fc_s32fc_x2_rotator_32fc_u_sse4_1(lv_32fc_t* outVector, +++ const lv_32fc_t* inVector, +++ const lv_32fc_t phase_inc, +++ lv_32fc_t* phase, +++ unsigned int num_points) +++{ ++ lv_32fc_t* cPtr = outVector; ++ const lv_32fc_t* aPtr = inVector; ++ lv_32fc_t incr = 1; ++- lv_32fc_t phase_Ptr[2] = {(*phase), (*phase)}; +++ lv_32fc_t phase_Ptr[2] = { (*phase), (*phase) }; ++ ++ unsigned int i, j = 0; ++ ++- for(i = 0; i < 2; ++i) { +++ for (i = 0; i < 2; ++i) { ++ phase_Ptr[i] *= incr; ++ incr *= (phase_inc); ++ } ++@@ -332,13 +351,13 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_u_sse4_1(lv_32fc_t* outVector ++ __m128 aVal, phase_Val, inc_Val, yl, 
yh, tmp1, tmp2, z, ylp, yhp, tmp1p, tmp2p; ++ ++ phase_Val = _mm_loadu_ps((float*)phase_Ptr); ++- inc_Val = _mm_set_ps(lv_cimag(incr), lv_creal(incr),lv_cimag(incr), lv_creal(incr)); +++ inc_Val = _mm_set_ps(lv_cimag(incr), lv_creal(incr), lv_cimag(incr), lv_creal(incr)); ++ ++ const unsigned int halfPoints = num_points / 2; ++ ++ ++- for(i = 0; i < (unsigned int)(halfPoints/ROTATOR_RELOAD); i++) { ++- for(j = 0; j < ROTATOR_RELOAD; ++j) { +++ for (i = 0; i < (unsigned int)(halfPoints / ROTATOR_RELOAD); i++) { +++ for (j = 0; j < ROTATOR_RELOAD; ++j) { ++ ++ aVal = _mm_loadu_ps((float*)aPtr); ++ ++@@ -369,7 +388,7 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_u_sse4_1(lv_32fc_t* outVector ++ tmp2 = _mm_sqrt_ps(tmp1); ++ phase_Val = _mm_div_ps(phase_Val, tmp2); ++ } ++- for(i = 0; i < halfPoints%ROTATOR_RELOAD; ++i) { +++ for (i = 0; i < halfPoints % ROTATOR_RELOAD; ++i) { ++ aVal = _mm_loadu_ps((float*)aPtr); ++ ++ yl = _mm_moveldup_ps(phase_Val); ++@@ -409,7 +428,6 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_u_sse4_1(lv_32fc_t* outVector ++ } ++ ++ (*phase) = phase_Ptr[0]; ++- ++ } ++ ++ #endif /* LV_HAVE_SSE4_1 */ ++@@ -419,15 +437,20 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_u_sse4_1(lv_32fc_t* outVector ++ #include ++ #include ++ ++-static inline void volk_32fc_s32fc_x2_rotator_32fc_a_avx(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, lv_32fc_t* phase, unsigned int num_points){ +++static inline void volk_32fc_s32fc_x2_rotator_32fc_a_avx(lv_32fc_t* outVector, +++ const lv_32fc_t* inVector, +++ const lv_32fc_t phase_inc, +++ lv_32fc_t* phase, +++ unsigned int num_points) +++{ ++ lv_32fc_t* cPtr = outVector; ++ const lv_32fc_t* aPtr = inVector; ++ lv_32fc_t incr = lv_cmake(1.0, 0.0); ++- lv_32fc_t phase_Ptr[4] = {(*phase), (*phase), (*phase), (*phase)}; +++ lv_32fc_t phase_Ptr[4] = { (*phase), (*phase), (*phase), (*phase) }; ++ ++ unsigned int i, j = 0; ++ ++- for(i = 0; i < 4; ++i) { +++ for (i = 0; i < 4; ++i) { ++ phase_Ptr[i] *= incr; ++ incr *= (phase_inc); ++ } ++@@ -435,16 +458,20 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_a_avx(lv_32fc_t* outVector, c ++ __m256 aVal, phase_Val, z; ++ ++ phase_Val = _mm256_loadu_ps((float*)phase_Ptr); ++- ++- const __m256 inc_Val = _mm256_set_ps(lv_cimag(incr), lv_creal(incr), ++- lv_cimag(incr), lv_creal(incr), ++- lv_cimag(incr), lv_creal(incr), ++- lv_cimag(incr), lv_creal(incr)); +++ +++ const __m256 inc_Val = _mm256_set_ps(lv_cimag(incr), +++ lv_creal(incr), +++ lv_cimag(incr), +++ lv_creal(incr), +++ lv_cimag(incr), +++ lv_creal(incr), +++ lv_cimag(incr), +++ lv_creal(incr)); ++ ++ const unsigned int fourthPoints = num_points / 4; ++ ++- for(i = 0; i < (unsigned int)(fourthPoints/ROTATOR_RELOAD); i++) { ++- for(j = 0; j < ROTATOR_RELOAD; ++j) { +++ for (i = 0; i < (unsigned int)(fourthPoints / ROTATOR_RELOAD); i++) { +++ for (j = 0; j < ROTATOR_RELOAD; ++j) { ++ ++ aVal = _mm256_load_ps((float*)aPtr); ++ ++@@ -458,8 +485,8 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_a_avx(lv_32fc_t* outVector, c ++ } ++ phase_Val = _mm256_normalize_ps(phase_Val); ++ } ++- ++- for(i = 0; i < fourthPoints%ROTATOR_RELOAD; ++i) { +++ +++ for (i = 0; i < fourthPoints % ROTATOR_RELOAD; ++i) { ++ aVal = _mm256_load_ps((float*)aPtr); ++ ++ z = _mm256_complexmul_ps(aVal, phase_Val); ++@@ -473,10 +500,10 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_a_avx(lv_32fc_t* outVector, c ++ if (i) { ++ phase_Val = _mm256_normalize_ps(phase_Val); ++ } ++- +++ ++ 
_mm256_storeu_ps((float*)phase_Ptr, phase_Val); ++ (*phase) = phase_Ptr[0]; ++- volk_32fc_s32fc_x2_rotator_32fc_generic(cPtr, aPtr, phase_inc, phase, num_points%4); +++ volk_32fc_s32fc_x2_rotator_32fc_generic(cPtr, aPtr, phase_inc, phase, num_points % 4); ++ } ++ ++ #endif /* LV_HAVE_AVX for aligned */ ++@@ -486,15 +513,20 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_a_avx(lv_32fc_t* outVector, c ++ #include ++ #include ++ ++-static inline void volk_32fc_s32fc_x2_rotator_32fc_u_avx(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, lv_32fc_t* phase, unsigned int num_points){ +++static inline void volk_32fc_s32fc_x2_rotator_32fc_u_avx(lv_32fc_t* outVector, +++ const lv_32fc_t* inVector, +++ const lv_32fc_t phase_inc, +++ lv_32fc_t* phase, +++ unsigned int num_points) +++{ ++ lv_32fc_t* cPtr = outVector; ++ const lv_32fc_t* aPtr = inVector; ++ lv_32fc_t incr = lv_cmake(1.0, 0.0); ++- lv_32fc_t phase_Ptr[4] = {(*phase), (*phase), (*phase), (*phase)}; +++ lv_32fc_t phase_Ptr[4] = { (*phase), (*phase), (*phase), (*phase) }; ++ ++ unsigned int i, j = 0; ++ ++- for(i = 0; i < 4; ++i) { +++ for (i = 0; i < 4; ++i) { ++ phase_Ptr[i] *= incr; ++ incr *= (phase_inc); ++ } ++@@ -502,19 +534,23 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_u_avx(lv_32fc_t* outVector, c ++ __m256 aVal, phase_Val, z; ++ ++ phase_Val = _mm256_loadu_ps((float*)phase_Ptr); ++- ++- const __m256 inc_Val = _mm256_set_ps(lv_cimag(incr), lv_creal(incr), ++- lv_cimag(incr), lv_creal(incr), ++- lv_cimag(incr), lv_creal(incr), ++- lv_cimag(incr), lv_creal(incr)); ++- +++ +++ const __m256 inc_Val = _mm256_set_ps(lv_cimag(incr), +++ lv_creal(incr), +++ lv_cimag(incr), +++ lv_creal(incr), +++ lv_cimag(incr), +++ lv_creal(incr), +++ lv_cimag(incr), +++ lv_creal(incr)); +++ ++ const unsigned int fourthPoints = num_points / 4; ++ ++- for(i = 0; i < (unsigned int)(fourthPoints/ROTATOR_RELOAD); ++i) { ++- for(j = 0; j < ROTATOR_RELOAD; ++j) { +++ for (i = 0; i < (unsigned int)(fourthPoints / ROTATOR_RELOAD); ++i) { +++ for (j = 0; j < ROTATOR_RELOAD; ++j) { ++ ++ aVal = _mm256_loadu_ps((float*)aPtr); ++- +++ ++ z = _mm256_complexmul_ps(aVal, phase_Val); ++ phase_Val = _mm256_complexmul_ps(phase_Val, inc_Val); ++ ++@@ -524,10 +560,9 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_u_avx(lv_32fc_t* outVector, c ++ cPtr += 4; ++ } ++ phase_Val = _mm256_normalize_ps(phase_Val); ++- ++ } ++- ++- for(i = 0; i < fourthPoints%ROTATOR_RELOAD; ++i) { +++ +++ for (i = 0; i < num_points % ROTATOR_RELOAD; ++i) { ++ aVal = _mm256_loadu_ps((float*)aPtr); ++ ++ z = _mm256_complexmul_ps(aVal, phase_Val); ++@@ -544,7 +579,7 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_u_avx(lv_32fc_t* outVector, c ++ ++ _mm256_storeu_ps((float*)phase_Ptr, phase_Val); ++ (*phase) = phase_Ptr[0]; ++- volk_32fc_s32fc_x2_rotator_32fc_generic(cPtr, aPtr, phase_inc, phase, num_points%4); +++ volk_32fc_s32fc_x2_rotator_32fc_generic(cPtr, aPtr, phase_inc, phase, num_points % 4); ++ } ++ ++ #endif /* LV_HAVE_AVX */ ++@@ -552,15 +587,21 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_u_avx(lv_32fc_t* outVector, c ++ #if LV_HAVE_AVX && LV_HAVE_FMA ++ #include ++ ++-static inline void volk_32fc_s32fc_x2_rotator_32fc_a_avx_fma(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, lv_32fc_t* phase, unsigned int num_points){ +++static inline void volk_32fc_s32fc_x2_rotator_32fc_a_avx_fma(lv_32fc_t* outVector, +++ const lv_32fc_t* inVector, +++ const lv_32fc_t phase_inc, +++ lv_32fc_t* phase, +++ unsigned 
int num_points) +++{ ++ lv_32fc_t* cPtr = outVector; ++ const lv_32fc_t* aPtr = inVector; ++ lv_32fc_t incr = 1; ++- __VOLK_ATTR_ALIGNED(32) lv_32fc_t phase_Ptr[4] = {(*phase), (*phase), (*phase), (*phase)}; +++ __VOLK_ATTR_ALIGNED(32) +++ lv_32fc_t phase_Ptr[4] = { (*phase), (*phase), (*phase), (*phase) }; ++ ++ unsigned int i, j = 0; ++ ++- for(i = 0; i < 4; ++i) { +++ for (i = 0; i < 4; ++i) { ++ phase_Ptr[i] *= incr; ++ incr *= (phase_inc); ++ } ++@@ -568,11 +609,18 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_a_avx_fma(lv_32fc_t* outVecto ++ __m256 aVal, phase_Val, inc_Val, yl, yh, tmp1, tmp2, z, ylp, yhp, tmp1p, tmp2p; ++ ++ phase_Val = _mm256_load_ps((float*)phase_Ptr); ++- inc_Val = _mm256_set_ps(lv_cimag(incr), lv_creal(incr),lv_cimag(incr), lv_creal(incr),lv_cimag(incr), lv_creal(incr),lv_cimag(incr), lv_creal(incr)); +++ inc_Val = _mm256_set_ps(lv_cimag(incr), +++ lv_creal(incr), +++ lv_cimag(incr), +++ lv_creal(incr), +++ lv_cimag(incr), +++ lv_creal(incr), +++ lv_cimag(incr), +++ lv_creal(incr)); ++ const unsigned int fourthPoints = num_points / 4; ++ ++- for(i = 0; i < (unsigned int)(fourthPoints/ROTATOR_RELOAD); i++) { ++- for(j = 0; j < ROTATOR_RELOAD; ++j) { +++ for (i = 0; i < (unsigned int)(fourthPoints / ROTATOR_RELOAD); i++) { +++ for (j = 0; j < ROTATOR_RELOAD; ++j) { ++ ++ aVal = _mm256_load_ps((float*)aPtr); ++ ++@@ -603,7 +651,7 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_a_avx_fma(lv_32fc_t* outVecto ++ tmp2 = _mm256_sqrt_ps(tmp1); ++ phase_Val = _mm256_div_ps(phase_Val, tmp2); ++ } ++- for(i = 0; i < fourthPoints%ROTATOR_RELOAD; ++i) { +++ for (i = 0; i < fourthPoints % ROTATOR_RELOAD; ++i) { ++ aVal = _mm256_load_ps((float*)aPtr); ++ ++ yl = _mm256_moveldup_ps(phase_Val); ++@@ -636,13 +684,12 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_a_avx_fma(lv_32fc_t* outVecto ++ } ++ ++ _mm256_store_ps((float*)phase_Ptr, phase_Val); ++- for(i = 0; i < num_points%4; ++i) { +++ for (i = 0; i < num_points % 4; ++i) { ++ *cPtr++ = *aPtr++ * phase_Ptr[0]; ++ phase_Ptr[0] *= (phase_inc); ++ } ++ ++ (*phase) = phase_Ptr[0]; ++- ++ } ++ ++ #endif /* LV_HAVE_AVX && LV_HAVE_FMA for aligned*/ ++@@ -650,15 +697,20 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_a_avx_fma(lv_32fc_t* outVecto ++ #if LV_HAVE_AVX && LV_HAVE_FMA ++ #include ++ ++-static inline void volk_32fc_s32fc_x2_rotator_32fc_u_avx_fma(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, lv_32fc_t* phase, unsigned int num_points){ +++static inline void volk_32fc_s32fc_x2_rotator_32fc_u_avx_fma(lv_32fc_t* outVector, +++ const lv_32fc_t* inVector, +++ const lv_32fc_t phase_inc, +++ lv_32fc_t* phase, +++ unsigned int num_points) +++{ ++ lv_32fc_t* cPtr = outVector; ++ const lv_32fc_t* aPtr = inVector; ++ lv_32fc_t incr = 1; ++- lv_32fc_t phase_Ptr[4] = {(*phase), (*phase), (*phase), (*phase)}; +++ lv_32fc_t phase_Ptr[4] = { (*phase), (*phase), (*phase), (*phase) }; ++ ++ unsigned int i, j = 0; ++ ++- for(i = 0; i < 4; ++i) { +++ for (i = 0; i < 4; ++i) { ++ phase_Ptr[i] *= incr; ++ incr *= (phase_inc); ++ } ++@@ -666,11 +718,18 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_u_avx_fma(lv_32fc_t* outVecto ++ __m256 aVal, phase_Val, inc_Val, yl, yh, tmp1, tmp2, z, ylp, yhp, tmp1p, tmp2p; ++ ++ phase_Val = _mm256_loadu_ps((float*)phase_Ptr); ++- inc_Val = _mm256_set_ps(lv_cimag(incr), lv_creal(incr),lv_cimag(incr), lv_creal(incr),lv_cimag(incr), lv_creal(incr),lv_cimag(incr), lv_creal(incr)); +++ inc_Val = _mm256_set_ps(lv_cimag(incr), +++ lv_creal(incr), +++ 
lv_cimag(incr), +++ lv_creal(incr), +++ lv_cimag(incr), +++ lv_creal(incr), +++ lv_cimag(incr), +++ lv_creal(incr)); ++ const unsigned int fourthPoints = num_points / 4; ++ ++- for(i = 0; i < (unsigned int)(fourthPoints/ROTATOR_RELOAD); i++) { ++- for(j = 0; j < ROTATOR_RELOAD; ++j) { +++ for (i = 0; i < (unsigned int)(fourthPoints / ROTATOR_RELOAD); i++) { +++ for (j = 0; j < ROTATOR_RELOAD; ++j) { ++ ++ aVal = _mm256_loadu_ps((float*)aPtr); ++ ++@@ -701,7 +760,7 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_u_avx_fma(lv_32fc_t* outVecto ++ tmp2 = _mm256_sqrt_ps(tmp1); ++ phase_Val = _mm256_div_ps(phase_Val, tmp2); ++ } ++- for(i = 0; i < fourthPoints%ROTATOR_RELOAD; ++i) { +++ for (i = 0; i < fourthPoints % ROTATOR_RELOAD; ++i) { ++ aVal = _mm256_loadu_ps((float*)aPtr); ++ ++ yl = _mm256_moveldup_ps(phase_Val); ++@@ -734,13 +793,12 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_u_avx_fma(lv_32fc_t* outVecto ++ } ++ ++ _mm256_storeu_ps((float*)phase_Ptr, phase_Val); ++- for(i = 0; i < num_points%4; ++i) { +++ for (i = 0; i < num_points % 4; ++i) { ++ *cPtr++ = *aPtr++ * phase_Ptr[0]; ++ phase_Ptr[0] *= (phase_inc); ++ } ++ ++ (*phase) = phase_Ptr[0]; ++- ++ } ++ ++ #endif /* LV_HAVE_AVX && LV_HAVE_FMA*/ ++diff --git a/kernels/volk/volk_32fc_x2_add_32fc.h b/kernels/volk/volk_32fc_x2_add_32fc.h ++index 90ff787..e7356c3 100644 ++--- a/kernels/volk/volk_32fc_x2_add_32fc.h +++++ b/kernels/volk/volk_32fc_x2_add_32fc.h ++@@ -31,8 +31,8 @@ ++ * ++ * Dispatcher Prototype ++ * \code ++- * void volk_32fc_x2_add_32fc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points) ++- * \endcode +++ * void volk_32fc_x2_add_32fc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const +++ * lv_32fc_t* bVector, unsigned int num_points) \endcode ++ * ++ * \b Inputs ++ * \li aVector: First vector of input points. 
++@@ -44,7 +44,8 @@ ++ * ++ * \b Example ++ * ++- * The follow example adds the increasing and decreasing vectors such that the result of every summation pair is 10 +++ * The follow example adds the increasing and decreasing vectors such that the result of +++ * every summation pair is 10 ++ * ++ * \code ++ * int N = 10; ++@@ -76,36 +77,38 @@ ++ #ifdef LV_HAVE_AVX ++ #include ++ ++-static inline void ++-volk_32fc_x2_add_32fc_u_avx(lv_32fc_t* cVector, const lv_32fc_t* aVector, ++- const lv_32fc_t* bVector, unsigned int num_points) +++static inline void volk_32fc_x2_add_32fc_u_avx(lv_32fc_t* cVector, +++ const lv_32fc_t* aVector, +++ const lv_32fc_t* bVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int quarterPoints = num_points / 4; +++ unsigned int number = 0; +++ const unsigned int quarterPoints = num_points / 4; ++ ++- lv_32fc_t* cPtr = cVector; ++- const lv_32fc_t* aPtr = aVector; ++- const lv_32fc_t* bPtr= bVector; +++ lv_32fc_t* cPtr = cVector; +++ const lv_32fc_t* aPtr = aVector; +++ const lv_32fc_t* bPtr = bVector; ++ ++- __m256 aVal, bVal, cVal; ++- for(;number < quarterPoints; number++){ +++ __m256 aVal, bVal, cVal; +++ for (; number < quarterPoints; number++) { ++ ++- aVal = _mm256_loadu_ps((float *) aPtr); ++- bVal = _mm256_loadu_ps((float *) bPtr); +++ aVal = _mm256_loadu_ps((float*)aPtr); +++ bVal = _mm256_loadu_ps((float*)bPtr); ++ ++- cVal = _mm256_add_ps(aVal, bVal); +++ cVal = _mm256_add_ps(aVal, bVal); ++ ++- _mm256_storeu_ps((float *) cPtr,cVal); // Store the results back into the C container +++ _mm256_storeu_ps((float*)cPtr, +++ cVal); // Store the results back into the C container ++ ++- aPtr += 4; ++- bPtr += 4; ++- cPtr += 4; ++- } +++ aPtr += 4; +++ bPtr += 4; +++ cPtr += 4; +++ } ++ ++- number = quarterPoints * 4; ++- for(;number < num_points; number++){ ++- *cPtr++ = (*aPtr++) + (*bPtr++); ++- } +++ number = quarterPoints * 4; +++ for (; number < num_points; number++) { +++ *cPtr++ = (*aPtr++) + (*bPtr++); +++ } ++ } ++ #endif /* LV_HAVE_AVX */ ++ ++@@ -113,36 +116,38 @@ volk_32fc_x2_add_32fc_u_avx(lv_32fc_t* cVector, const lv_32fc_t* aVector, ++ #ifdef LV_HAVE_AVX ++ #include ++ ++-static inline void ++-volk_32fc_x2_add_32fc_a_avx(lv_32fc_t* cVector, const lv_32fc_t* aVector, ++- const lv_32fc_t* bVector, unsigned int num_points) +++static inline void volk_32fc_x2_add_32fc_a_avx(lv_32fc_t* cVector, +++ const lv_32fc_t* aVector, +++ const lv_32fc_t* bVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int quarterPoints = num_points / 4; +++ unsigned int number = 0; +++ const unsigned int quarterPoints = num_points / 4; ++ ++- lv_32fc_t* cPtr = cVector; ++- const lv_32fc_t* aPtr = aVector; ++- const lv_32fc_t* bPtr= bVector; +++ lv_32fc_t* cPtr = cVector; +++ const lv_32fc_t* aPtr = aVector; +++ const lv_32fc_t* bPtr = bVector; ++ ++- __m256 aVal, bVal, cVal; ++- for(;number < quarterPoints; number++){ +++ __m256 aVal, bVal, cVal; +++ for (; number < quarterPoints; number++) { ++ ++- aVal = _mm256_load_ps((float*) aPtr); ++- bVal = _mm256_load_ps((float*) bPtr); +++ aVal = _mm256_load_ps((float*)aPtr); +++ bVal = _mm256_load_ps((float*)bPtr); ++ ++- cVal = _mm256_add_ps(aVal, bVal); +++ cVal = _mm256_add_ps(aVal, bVal); ++ ++- _mm256_store_ps((float*) cPtr,cVal); // Store the results back into the C container +++ _mm256_store_ps((float*)cPtr, +++ cVal); // Store the results back into the C container ++ ++- aPtr += 4; ++- bPtr += 4; ++- cPtr += 4; ++- } +++ aPtr += 4; +++ bPtr += 4; +++ 
cPtr += 4; +++ } ++ ++- number = quarterPoints * 4; ++- for(;number < num_points; number++){ ++- *cPtr++ = (*aPtr++) + (*bPtr++); ++- } +++ number = quarterPoints * 4; +++ for (; number < num_points; number++) { +++ *cPtr++ = (*aPtr++) + (*bPtr++); +++ } ++ } ++ #endif /* LV_HAVE_AVX */ ++ ++@@ -150,54 +155,56 @@ volk_32fc_x2_add_32fc_a_avx(lv_32fc_t* cVector, const lv_32fc_t* aVector, ++ #ifdef LV_HAVE_SSE ++ #include ++ ++-static inline void ++-volk_32fc_x2_add_32fc_u_sse(lv_32fc_t* cVector, const lv_32fc_t* aVector, ++- const lv_32fc_t* bVector, unsigned int num_points) +++static inline void volk_32fc_x2_add_32fc_u_sse(lv_32fc_t* cVector, +++ const lv_32fc_t* aVector, +++ const lv_32fc_t* bVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int halfPoints = num_points / 2; +++ unsigned int number = 0; +++ const unsigned int halfPoints = num_points / 2; ++ ++- lv_32fc_t* cPtr = cVector; ++- const lv_32fc_t* aPtr = aVector; ++- const lv_32fc_t* bPtr= bVector; +++ lv_32fc_t* cPtr = cVector; +++ const lv_32fc_t* aPtr = aVector; +++ const lv_32fc_t* bPtr = bVector; ++ ++- __m128 aVal, bVal, cVal; ++- for(;number < halfPoints; number++){ +++ __m128 aVal, bVal, cVal; +++ for (; number < halfPoints; number++) { ++ ++- aVal = _mm_loadu_ps((float *) aPtr); ++- bVal = _mm_loadu_ps((float *) bPtr); +++ aVal = _mm_loadu_ps((float*)aPtr); +++ bVal = _mm_loadu_ps((float*)bPtr); ++ ++- cVal = _mm_add_ps(aVal, bVal); +++ cVal = _mm_add_ps(aVal, bVal); ++ ++- _mm_storeu_ps((float*) cPtr, cVal); // Store the results back into the C container +++ _mm_storeu_ps((float*)cPtr, cVal); // Store the results back into the C container ++ ++- aPtr += 2; ++- bPtr += 2; ++- cPtr += 2; ++- } +++ aPtr += 2; +++ bPtr += 2; +++ cPtr += 2; +++ } ++ ++- number = halfPoints * 2; ++- for(;number < num_points; number++){ ++- *cPtr++ = (*aPtr++) + (*bPtr++); ++- } +++ number = halfPoints * 2; +++ for (; number < num_points; number++) { +++ *cPtr++ = (*aPtr++) + (*bPtr++); +++ } ++ } ++ #endif /* LV_HAVE_SSE */ ++ ++ ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void ++-volk_32fc_x2_add_32fc_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, ++- const lv_32fc_t* bVector, unsigned int num_points) +++static inline void volk_32fc_x2_add_32fc_generic(lv_32fc_t* cVector, +++ const lv_32fc_t* aVector, +++ const lv_32fc_t* bVector, +++ unsigned int num_points) ++ { ++- lv_32fc_t* cPtr = cVector; ++- const lv_32fc_t* aPtr = aVector; ++- const lv_32fc_t* bPtr= bVector; ++- unsigned int number = 0; ++- ++- for(number = 0; number < num_points; number++){ ++- *cPtr++ = (*aPtr++) + (*bPtr++); ++- } +++ lv_32fc_t* cPtr = cVector; +++ const lv_32fc_t* aPtr = aVector; +++ const lv_32fc_t* bPtr = bVector; +++ unsigned int number = 0; +++ +++ for (number = 0; number < num_points; number++) { +++ *cPtr++ = (*aPtr++) + (*bPtr++); +++ } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++@@ -205,34 +212,36 @@ volk_32fc_x2_add_32fc_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, ++ #ifdef LV_HAVE_SSE ++ #include ++ ++-static inline void ++-volk_32fc_x2_add_32fc_a_sse(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points) +++static inline void volk_32fc_x2_add_32fc_a_sse(lv_32fc_t* cVector, +++ const lv_32fc_t* aVector, +++ const lv_32fc_t* bVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int halfPoints = num_points / 2; +++ unsigned int number = 0; +++ const unsigned int halfPoints = num_points / 2; ++ ++- lv_32fc_t* cPtr = 
cVector; ++- const lv_32fc_t* aPtr = aVector; ++- const lv_32fc_t* bPtr= bVector; +++ lv_32fc_t* cPtr = cVector; +++ const lv_32fc_t* aPtr = aVector; +++ const lv_32fc_t* bPtr = bVector; ++ ++- __m128 aVal, bVal, cVal; ++- for(;number < halfPoints; number++){ ++- aVal = _mm_load_ps((float *) aPtr); ++- bVal = _mm_load_ps((float *) bPtr); +++ __m128 aVal, bVal, cVal; +++ for (; number < halfPoints; number++) { +++ aVal = _mm_load_ps((float*)aPtr); +++ bVal = _mm_load_ps((float*)bPtr); ++ ++- cVal = _mm_add_ps(aVal, bVal); +++ cVal = _mm_add_ps(aVal, bVal); ++ ++- _mm_store_ps((float *) cPtr,cVal); // Store the results back into the C container +++ _mm_store_ps((float*)cPtr, cVal); // Store the results back into the C container ++ ++- aPtr += 2; ++- bPtr += 2; ++- cPtr += 2; ++- } +++ aPtr += 2; +++ bPtr += 2; +++ cPtr += 2; +++ } ++ ++- number = halfPoints * 2; ++- for(;number < num_points; number++){ ++- *cPtr++ = (*aPtr++) + (*bPtr++); ++- } +++ number = halfPoints * 2; +++ for (; number < num_points; number++) { +++ *cPtr++ = (*aPtr++) + (*bPtr++); +++ } ++ } ++ #endif /* LV_HAVE_SSE */ ++ ++@@ -240,38 +249,39 @@ volk_32fc_x2_add_32fc_a_sse(lv_32fc_t* cVector, const lv_32fc_t* aVector, const ++ #ifdef LV_HAVE_NEON ++ #include ++ ++-static inline void ++-volk_32fc_x2_add_32fc_u_neon(lv_32fc_t* cVector, const lv_32fc_t* aVector, ++- const lv_32fc_t* bVector, unsigned int num_points) +++static inline void volk_32fc_x2_add_32fc_u_neon(lv_32fc_t* cVector, +++ const lv_32fc_t* aVector, +++ const lv_32fc_t* bVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int halfPoints = num_points / 2; ++- ++- lv_32fc_t* cPtr = cVector; ++- const lv_32fc_t* aPtr = aVector; ++- const lv_32fc_t* bPtr= bVector; ++- float32x4_t aVal, bVal, cVal; ++- for(number=0; number < halfPoints; number++){ ++- // Load in to NEON registers ++- aVal = vld1q_f32((const float32_t*)(aPtr)); ++- bVal = vld1q_f32((const float32_t*)(bPtr)); ++- __VOLK_PREFETCH(aPtr+2); ++- __VOLK_PREFETCH(bPtr+2); ++- ++- // vector add ++- cVal = vaddq_f32(aVal, bVal); ++- // Store the results back into the C container ++- vst1q_f32((float*)(cPtr),cVal); ++- ++- aPtr += 2; // q uses quadwords, 4 lv_32fc_ts per vadd ++- bPtr += 2; ++- cPtr += 2; ++- } ++- ++- number = halfPoints * 2; // should be = num_points ++- for(;number < num_points; number++){ ++- *cPtr++ = (*aPtr++) + (*bPtr++); ++- } +++ unsigned int number = 0; +++ const unsigned int halfPoints = num_points / 2; +++ +++ lv_32fc_t* cPtr = cVector; +++ const lv_32fc_t* aPtr = aVector; +++ const lv_32fc_t* bPtr = bVector; +++ float32x4_t aVal, bVal, cVal; +++ for (number = 0; number < halfPoints; number++) { +++ // Load in to NEON registers +++ aVal = vld1q_f32((const float32_t*)(aPtr)); +++ bVal = vld1q_f32((const float32_t*)(bPtr)); +++ __VOLK_PREFETCH(aPtr + 2); +++ __VOLK_PREFETCH(bPtr + 2); +++ +++ // vector add +++ cVal = vaddq_f32(aVal, bVal); +++ // Store the results back into the C container +++ vst1q_f32((float*)(cPtr), cVal); +++ +++ aPtr += 2; // q uses quadwords, 4 lv_32fc_ts per vadd +++ bPtr += 2; +++ cPtr += 2; +++ } +++ +++ number = halfPoints * 2; // should be = num_points +++ for (; number < num_points; number++) { +++ *cPtr++ = (*aPtr++) + (*bPtr++); +++ } ++ } ++ ++ #endif /* LV_HAVE_NEON */ ++diff --git a/kernels/volk/volk_32fc_x2_conjugate_dot_prod_32fc.h b/kernels/volk/volk_32fc_x2_conjugate_dot_prod_32fc.h ++index 77432ec..0f69499 100644 ++--- a/kernels/volk/volk_32fc_x2_conjugate_dot_prod_32fc.h +++++ 
b/kernels/volk/volk_32fc_x2_conjugate_dot_prod_32fc.h ++@@ -34,8 +34,8 @@ ++ * ++ * Dispatcher Prototype ++ * \code ++- * void volk_32fc_x2_conjugate_dot_prod_32fc(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) ++- * \endcode +++ * void volk_32fc_x2_conjugate_dot_prod_32fc(lv_32fc_t* result, const lv_32fc_t* input, +++ * const lv_32fc_t* taps, unsigned int num_points) \endcode ++ * ++ * \b Inputs ++ * \li input: vector of complex floats. ++@@ -60,40 +60,44 @@ ++ #define INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_u_H ++ ++ ++-#include +++#include ++ ++ ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void volk_32fc_x2_conjugate_dot_prod_32fc_generic(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) { +++static inline void volk_32fc_x2_conjugate_dot_prod_32fc_generic(lv_32fc_t* result, +++ const lv_32fc_t* input, +++ const lv_32fc_t* taps, +++ unsigned int num_points) +++{ ++ ++- const unsigned int num_bytes = num_points*8; +++ const unsigned int num_bytes = num_points * 8; ++ ++- float * res = (float*) result; ++- float * in = (float*) input; ++- float * tp = (float*) taps; ++- unsigned int n_2_ccomplex_blocks = num_bytes >> 4; +++ float* res = (float*)result; +++ float* in = (float*)input; +++ float* tp = (float*)taps; +++ unsigned int n_2_ccomplex_blocks = num_bytes >> 4; ++ ++- float sum0[2] = {0,0}; ++- float sum1[2] = {0,0}; ++- unsigned int i = 0; +++ float sum0[2] = { 0, 0 }; +++ float sum1[2] = { 0, 0 }; +++ unsigned int i = 0; ++ ++- for(i = 0; i < n_2_ccomplex_blocks; ++i) { ++- sum0[0] += in[0] * tp[0] + in[1] * tp[1]; ++- sum0[1] += (-in[0] * tp[1]) + in[1] * tp[0]; ++- sum1[0] += in[2] * tp[2] + in[3] * tp[3]; ++- sum1[1] += (-in[2] * tp[3]) + in[3] * tp[2]; +++ for (i = 0; i < n_2_ccomplex_blocks; ++i) { +++ sum0[0] += in[0] * tp[0] + in[1] * tp[1]; +++ sum0[1] += (-in[0] * tp[1]) + in[1] * tp[0]; +++ sum1[0] += in[2] * tp[2] + in[3] * tp[3]; +++ sum1[1] += (-in[2] * tp[3]) + in[3] * tp[2]; ++ ++- in += 4; ++- tp += 4; ++- } +++ in += 4; +++ tp += 4; +++ } ++ ++- res[0] = sum0[0] + sum1[0]; ++- res[1] = sum0[1] + sum1[1]; +++ res[0] = sum0[0] + sum1[0]; +++ res[1] = sum0[1] + sum1[1]; ++ ++- if (num_bytes >> 3 & 1) { ++- *result += input[(num_bytes >> 3) - 1] * lv_conj(taps[(num_bytes >> 3) - 1]); ++- } +++ if (num_bytes >> 3 & 1) { +++ *result += input[(num_bytes >> 3) - 1] * lv_conj(taps[(num_bytes >> 3) - 1]); +++ } ++ } ++ ++ #endif /*LV_HAVE_GENERIC*/ ++@@ -103,125 +107,134 @@ static inline void volk_32fc_x2_conjugate_dot_prod_32fc_generic(lv_32fc_t* resul ++ #include ++ ++ static inline void volk_32fc_x2_conjugate_dot_prod_32fc_u_avx(lv_32fc_t* result, ++- const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) +++ const lv_32fc_t* input, +++ const lv_32fc_t* taps, +++ unsigned int num_points) ++ { ++- // Partial sums for indices i, i+1, i+2 and i+3. ++- __m256 sum_a_mult_b_real = _mm256_setzero_ps(); ++- __m256 sum_a_mult_b_imag = _mm256_setzero_ps(); ++- ++- for (long unsigned i = 0; i < (num_points & ~3u); i += 4) { ++- /* Four complex elements a time are processed. ++- * (ar + j⋅ai)*conj(br + j⋅bi) = ++- * ar⋅br + ai⋅bi + j⋅(ai⋅br − ar⋅bi) ++- */ +++ // Partial sums for indices i, i+1, i+2 and i+3. +++ __m256 sum_a_mult_b_real = _mm256_setzero_ps(); +++ __m256 sum_a_mult_b_imag = _mm256_setzero_ps(); +++ +++ for (long unsigned i = 0; i < (num_points & ~3u); i += 4) { +++ /* Four complex elements a time are processed. 
+++ * (ar + j⋅ai)*conj(br + j⋅bi) = +++ * ar⋅br + ai⋅bi + j⋅(ai⋅br − ar⋅bi) +++ */ +++ +++ /* Load input and taps, split and duplicate real und imaginary parts of taps. +++ * a: | ai,i+3 | ar,i+3 | … | ai,i+1 | ar,i+1 | ai,i+0 | ar,i+0 | +++ * b: | bi,i+3 | br,i+3 | … | bi,i+1 | br,i+1 | bi,i+0 | br,i+0 | +++ * b_real: | br,i+3 | br,i+3 | … | br,i+1 | br,i+1 | br,i+0 | br,i+0 | +++ * b_imag: | bi,i+3 | bi,i+3 | … | bi,i+1 | bi,i+1 | bi,i+0 | bi,i+0 | +++ */ +++ __m256 a = _mm256_loadu_ps((const float*)&input[i]); +++ __m256 b = _mm256_loadu_ps((const float*)&taps[i]); +++ __m256 b_real = _mm256_moveldup_ps(b); +++ __m256 b_imag = _mm256_movehdup_ps(b); +++ +++ // Add | ai⋅br,i+3 | ar⋅br,i+3 | … | ai⋅br,i+0 | ar⋅br,i+0 | to partial sum. +++ sum_a_mult_b_real = _mm256_add_ps(sum_a_mult_b_real, _mm256_mul_ps(a, b_real)); +++ // Add | ai⋅bi,i+3 | −ar⋅bi,i+3 | … | ai⋅bi,i+0 | −ar⋅bi,i+0 | to partial sum. +++ sum_a_mult_b_imag = _mm256_addsub_ps(sum_a_mult_b_imag, _mm256_mul_ps(a, b_imag)); +++ } ++ ++- /* Load input and taps, split and duplicate real und imaginary parts of taps. ++- * a: | ai,i+3 | ar,i+3 | … | ai,i+1 | ar,i+1 | ai,i+0 | ar,i+0 | ++- * b: | bi,i+3 | br,i+3 | … | bi,i+1 | br,i+1 | bi,i+0 | br,i+0 | ++- * b_real: | br,i+3 | br,i+3 | … | br,i+1 | br,i+1 | br,i+0 | br,i+0 | ++- * b_imag: | bi,i+3 | bi,i+3 | … | bi,i+1 | bi,i+1 | bi,i+0 | bi,i+0 | +++ // Swap position of −ar⋅bi and ai⋅bi. +++ sum_a_mult_b_imag = _mm256_permute_ps(sum_a_mult_b_imag, _MM_SHUFFLE(2, 3, 0, 1)); +++ // | ai⋅br + ai⋅bi | ai⋅br − ar⋅bi |, sum contains four such partial sums. +++ __m256 sum = _mm256_add_ps(sum_a_mult_b_real, sum_a_mult_b_imag); +++ /* Sum the four partial sums: Add high half of vector sum to the low one, i.e. +++ * s1 + s3 and s0 + s2 … ++ */ ++- __m256 a = _mm256_loadu_ps((const float *) &input[i]); ++- __m256 b = _mm256_loadu_ps((const float *) &taps[i]); ++- __m256 b_real = _mm256_moveldup_ps(b); ++- __m256 b_imag = _mm256_movehdup_ps(b); ++- ++- // Add | ai⋅br,i+3 | ar⋅br,i+3 | … | ai⋅br,i+0 | ar⋅br,i+0 | to partial sum. ++- sum_a_mult_b_real = _mm256_add_ps(sum_a_mult_b_real, _mm256_mul_ps(a, b_real)); ++- // Add | ai⋅bi,i+3 | −ar⋅bi,i+3 | … | ai⋅bi,i+0 | −ar⋅bi,i+0 | to partial sum. ++- sum_a_mult_b_imag = _mm256_addsub_ps(sum_a_mult_b_imag, _mm256_mul_ps(a, b_imag)); ++- } ++- ++- // Swap position of −ar⋅bi and ai⋅bi. ++- sum_a_mult_b_imag = _mm256_permute_ps(sum_a_mult_b_imag, _MM_SHUFFLE(2, 3, 0, 1)); ++- // | ai⋅br + ai⋅bi | ai⋅br − ar⋅bi |, sum contains four such partial sums. ++- __m256 sum = _mm256_add_ps(sum_a_mult_b_real, sum_a_mult_b_imag); ++- /* Sum the four partial sums: Add high half of vector sum to the low one, i.e. ++- * s1 + s3 and s0 + s2 … ++- */ ++- sum = _mm256_add_ps(sum, _mm256_permute2f128_ps(sum, sum, 0x01)); ++- // … and now (s0 + s2) + (s1 + s3) ++- sum = _mm256_add_ps(sum, _mm256_permute_ps(sum, _MM_SHUFFLE(1, 0, 3, 2))); ++- // Store result. ++- __m128 lower = _mm256_extractf128_ps(sum, 0); ++- _mm_storel_pi((__m64 *) result, lower); ++- ++- // Handle the last elements if num_points mod 4 is bigger than 0. 
++- for (long unsigned i = num_points & ~3u; i < num_points; ++i) { ++- *result += lv_cmake( ++- lv_creal(input[i]) * lv_creal(taps[i]) + lv_cimag(input[i]) * lv_cimag(taps[i]), ++- lv_cimag(input[i]) * lv_creal(taps[i]) - lv_creal(input[i]) * lv_cimag(taps[i])); ++- } +++ sum = _mm256_add_ps(sum, _mm256_permute2f128_ps(sum, sum, 0x01)); +++ // … and now (s0 + s2) + (s1 + s3) +++ sum = _mm256_add_ps(sum, _mm256_permute_ps(sum, _MM_SHUFFLE(1, 0, 3, 2))); +++ // Store result. +++ __m128 lower = _mm256_extractf128_ps(sum, 0); +++ _mm_storel_pi((__m64*)result, lower); +++ +++ // Handle the last elements if num_points mod 4 is bigger than 0. +++ for (long unsigned i = num_points & ~3u; i < num_points; ++i) { +++ *result += lv_cmake(lv_creal(input[i]) * lv_creal(taps[i]) + +++ lv_cimag(input[i]) * lv_cimag(taps[i]), +++ lv_cimag(input[i]) * lv_creal(taps[i]) - +++ lv_creal(input[i]) * lv_cimag(taps[i])); +++ } ++ } ++ ++ #endif /* LV_HAVE_AVX */ ++ ++ #ifdef LV_HAVE_SSE3 ++ ++-#include ++ #include +++#include ++ ++ static inline void volk_32fc_x2_conjugate_dot_prod_32fc_u_sse3(lv_32fc_t* result, ++- const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) +++ const lv_32fc_t* input, +++ const lv_32fc_t* taps, +++ unsigned int num_points) ++ { ++- // Partial sums for indices i and i+1. ++- __m128 sum_a_mult_b_real = _mm_setzero_ps(); ++- __m128 sum_a_mult_b_imag = _mm_setzero_ps(); ++- ++- for (long unsigned i = 0; i < (num_points & ~1u); i += 2) { ++- /* Two complex elements a time are processed. ++- * (ar + j⋅ai)*conj(br + j⋅bi) = ++- * ar⋅br + ai⋅bi + j⋅(ai⋅br − ar⋅bi) ++- */ +++ // Partial sums for indices i and i+1. +++ __m128 sum_a_mult_b_real = _mm_setzero_ps(); +++ __m128 sum_a_mult_b_imag = _mm_setzero_ps(); +++ +++ for (long unsigned i = 0; i < (num_points & ~1u); i += 2) { +++ /* Two complex elements a time are processed. +++ * (ar + j⋅ai)*conj(br + j⋅bi) = +++ * ar⋅br + ai⋅bi + j⋅(ai⋅br − ar⋅bi) +++ */ +++ +++ /* Load input and taps, split and duplicate real und imaginary parts of taps. +++ * a: | ai,i+1 | ar,i+1 | ai,i+0 | ar,i+0 | +++ * b: | bi,i+1 | br,i+1 | bi,i+0 | br,i+0 | +++ * b_real: | br,i+1 | br,i+1 | br,i+0 | br,i+0 | +++ * b_imag: | bi,i+1 | bi,i+1 | bi,i+0 | bi,i+0 | +++ */ +++ __m128 a = _mm_loadu_ps((const float*)&input[i]); +++ __m128 b = _mm_loadu_ps((const float*)&taps[i]); +++ __m128 b_real = _mm_moveldup_ps(b); +++ __m128 b_imag = _mm_movehdup_ps(b); +++ +++ // Add | ai⋅br,i+1 | ar⋅br,i+1 | ai⋅br,i+0 | ar⋅br,i+0 | to partial sum. +++ sum_a_mult_b_real = _mm_add_ps(sum_a_mult_b_real, _mm_mul_ps(a, b_real)); +++ // Add | ai⋅bi,i+1 | −ar⋅bi,i+1 | ai⋅bi,i+0 | −ar⋅bi,i+0 | to partial sum. +++ sum_a_mult_b_imag = _mm_addsub_ps(sum_a_mult_b_imag, _mm_mul_ps(a, b_imag)); +++ } ++ ++- /* Load input and taps, split and duplicate real und imaginary parts of taps. ++- * a: | ai,i+1 | ar,i+1 | ai,i+0 | ar,i+0 | ++- * b: | bi,i+1 | br,i+1 | bi,i+0 | br,i+0 | ++- * b_real: | br,i+1 | br,i+1 | br,i+0 | br,i+0 | ++- * b_imag: | bi,i+1 | bi,i+1 | bi,i+0 | bi,i+0 | ++- */ ++- __m128 a = _mm_loadu_ps((const float *) &input[i]); ++- __m128 b = _mm_loadu_ps((const float *) &taps[i]); ++- __m128 b_real = _mm_moveldup_ps(b); ++- __m128 b_imag = _mm_movehdup_ps(b); ++- ++- // Add | ai⋅br,i+1 | ar⋅br,i+1 | ai⋅br,i+0 | ar⋅br,i+0 | to partial sum. ++- sum_a_mult_b_real = _mm_add_ps(sum_a_mult_b_real, _mm_mul_ps(a, b_real)); ++- // Add | ai⋅bi,i+1 | −ar⋅bi,i+1 | ai⋅bi,i+0 | −ar⋅bi,i+0 | to partial sum. 
++- sum_a_mult_b_imag = _mm_addsub_ps(sum_a_mult_b_imag, _mm_mul_ps(a, b_imag)); ++- } ++- ++- // Swap position of −ar⋅bi and ai⋅bi. ++- sum_a_mult_b_imag = _mm_shuffle_ps(sum_a_mult_b_imag, sum_a_mult_b_imag, ++- _MM_SHUFFLE(2, 3, 0, 1)); ++- // | ai⋅br + ai⋅bi | ai⋅br − ar⋅bi |, sum contains two such partial sums. ++- __m128 sum = _mm_add_ps(sum_a_mult_b_real, sum_a_mult_b_imag); ++- // Sum the two partial sums. ++- sum = _mm_add_ps(sum, _mm_shuffle_ps(sum, sum, _MM_SHUFFLE(1, 0, 3, 2))); ++- // Store result. ++- _mm_storel_pi((__m64 *) result, sum); ++- ++- // Handle the last element if num_points mod 2 is 1. ++- if (num_points & 1u) { ++- *result += lv_cmake( ++- lv_creal(input[num_points - 1]) * lv_creal(taps[num_points - 1]) + ++- lv_cimag(input[num_points - 1]) * lv_cimag(taps[num_points - 1]), ++- lv_cimag(input[num_points - 1]) * lv_creal(taps[num_points - 1]) - ++- lv_creal(input[num_points - 1]) * lv_cimag(taps[num_points - 1])); ++- } +++ // Swap position of −ar⋅bi and ai⋅bi. +++ sum_a_mult_b_imag = +++ _mm_shuffle_ps(sum_a_mult_b_imag, sum_a_mult_b_imag, _MM_SHUFFLE(2, 3, 0, 1)); +++ // | ai⋅br + ai⋅bi | ai⋅br − ar⋅bi |, sum contains two such partial sums. +++ __m128 sum = _mm_add_ps(sum_a_mult_b_real, sum_a_mult_b_imag); +++ // Sum the two partial sums. +++ sum = _mm_add_ps(sum, _mm_shuffle_ps(sum, sum, _MM_SHUFFLE(1, 0, 3, 2))); +++ // Store result. +++ _mm_storel_pi((__m64*)result, sum); +++ +++ // Handle the last element if num_points mod 2 is 1. +++ if (num_points & 1u) { +++ *result += lv_cmake( +++ lv_creal(input[num_points - 1]) * lv_creal(taps[num_points - 1]) + +++ lv_cimag(input[num_points - 1]) * lv_cimag(taps[num_points - 1]), +++ lv_cimag(input[num_points - 1]) * lv_creal(taps[num_points - 1]) - +++ lv_creal(input[num_points - 1]) * lv_cimag(taps[num_points - 1])); +++ } ++ } ++ ++ #endif /*LV_HAVE_SSE3*/ ++ ++ #ifdef LV_HAVE_NEON ++ #include ++-static inline void volk_32fc_x2_conjugate_dot_prod_32fc_neon(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) { +++static inline void volk_32fc_x2_conjugate_dot_prod_32fc_neon(lv_32fc_t* result, +++ const lv_32fc_t* input, +++ const lv_32fc_t* taps, +++ unsigned int num_points) +++{ ++ ++ unsigned int quarter_points = num_points / 4; ++ unsigned int number; ++ ++- lv_32fc_t* a_ptr = (lv_32fc_t*) taps; ++- lv_32fc_t* b_ptr = (lv_32fc_t*) input; +++ lv_32fc_t* a_ptr = (lv_32fc_t*)taps; +++ lv_32fc_t* b_ptr = (lv_32fc_t*)input; ++ // for 2-lane vectors, 1st lane holds the real part, ++ // 2nd lane holds the imaginary part ++ float32x4x2_t a_val, b_val, accumulator; ++@@ -229,11 +242,11 @@ static inline void volk_32fc_x2_conjugate_dot_prod_32fc_neon(lv_32fc_t* result, ++ accumulator.val[0] = vdupq_n_f32(0); ++ accumulator.val[1] = vdupq_n_f32(0); ++ ++- for(number = 0; number < quarter_points; ++number) { +++ for (number = 0; number < quarter_points; ++number) { ++ a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i ++ b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i ++- __VOLK_PREFETCH(a_ptr+8); ++- __VOLK_PREFETCH(b_ptr+8); +++ __VOLK_PREFETCH(a_ptr + 8); +++ __VOLK_PREFETCH(b_ptr + 8); ++ ++ // do the first multiply ++ tmp_imag.val[1] = vmulq_f32(a_val.val[1], b_val.val[0]); ++@@ -255,11 +268,10 @@ static inline void volk_32fc_x2_conjugate_dot_prod_32fc_neon(lv_32fc_t* result, ++ *result = accum_result[0] + accum_result[1] + accum_result[2] + accum_result[3]; ++ ++ // tail case ++- for(number = quarter_points*4; number < num_points; 
++number) { ++- *result += (*a_ptr++) * lv_conj(*b_ptr++); +++ for (number = quarter_points * 4; number < num_points; ++number) { +++ *result += (*a_ptr++) * lv_conj(*b_ptr++); ++ } ++ *result = lv_conj(*result); ++- ++ } ++ #endif /*LV_HAVE_NEON*/ ++ ++@@ -268,120 +280,125 @@ static inline void volk_32fc_x2_conjugate_dot_prod_32fc_neon(lv_32fc_t* result, ++ #ifndef INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_a_H ++ #define INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_a_H ++ +++#include ++ #include ++-#include ++-#include +++#include ++ ++ ++ #ifdef LV_HAVE_AVX ++ #include ++ ++ static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a_avx(lv_32fc_t* result, ++- const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) +++ const lv_32fc_t* input, +++ const lv_32fc_t* taps, +++ unsigned int num_points) ++ { ++- // Partial sums for indices i, i+1, i+2 and i+3. ++- __m256 sum_a_mult_b_real = _mm256_setzero_ps(); ++- __m256 sum_a_mult_b_imag = _mm256_setzero_ps(); ++- ++- for (long unsigned i = 0; i < (num_points & ~3u); i += 4) { ++- /* Four complex elements a time are processed. ++- * (ar + j⋅ai)*conj(br + j⋅bi) = ++- * ar⋅br + ai⋅bi + j⋅(ai⋅br − ar⋅bi) ++- */ +++ // Partial sums for indices i, i+1, i+2 and i+3. +++ __m256 sum_a_mult_b_real = _mm256_setzero_ps(); +++ __m256 sum_a_mult_b_imag = _mm256_setzero_ps(); +++ +++ for (long unsigned i = 0; i < (num_points & ~3u); i += 4) { +++ /* Four complex elements a time are processed. +++ * (ar + j⋅ai)*conj(br + j⋅bi) = +++ * ar⋅br + ai⋅bi + j⋅(ai⋅br − ar⋅bi) +++ */ +++ +++ /* Load input and taps, split and duplicate real und imaginary parts of taps. +++ * a: | ai,i+3 | ar,i+3 | … | ai,i+1 | ar,i+1 | ai,i+0 | ar,i+0 | +++ * b: | bi,i+3 | br,i+3 | … | bi,i+1 | br,i+1 | bi,i+0 | br,i+0 | +++ * b_real: | br,i+3 | br,i+3 | … | br,i+1 | br,i+1 | br,i+0 | br,i+0 | +++ * b_imag: | bi,i+3 | bi,i+3 | … | bi,i+1 | bi,i+1 | bi,i+0 | bi,i+0 | +++ */ +++ __m256 a = _mm256_load_ps((const float*)&input[i]); +++ __m256 b = _mm256_load_ps((const float*)&taps[i]); +++ __m256 b_real = _mm256_moveldup_ps(b); +++ __m256 b_imag = _mm256_movehdup_ps(b); +++ +++ // Add | ai⋅br,i+3 | ar⋅br,i+3 | … | ai⋅br,i+0 | ar⋅br,i+0 | to partial sum. +++ sum_a_mult_b_real = _mm256_add_ps(sum_a_mult_b_real, _mm256_mul_ps(a, b_real)); +++ // Add | ai⋅bi,i+3 | −ar⋅bi,i+3 | … | ai⋅bi,i+0 | −ar⋅bi,i+0 | to partial sum. +++ sum_a_mult_b_imag = _mm256_addsub_ps(sum_a_mult_b_imag, _mm256_mul_ps(a, b_imag)); +++ } ++ ++- /* Load input and taps, split and duplicate real und imaginary parts of taps. ++- * a: | ai,i+3 | ar,i+3 | … | ai,i+1 | ar,i+1 | ai,i+0 | ar,i+0 | ++- * b: | bi,i+3 | br,i+3 | … | bi,i+1 | br,i+1 | bi,i+0 | br,i+0 | ++- * b_real: | br,i+3 | br,i+3 | … | br,i+1 | br,i+1 | br,i+0 | br,i+0 | ++- * b_imag: | bi,i+3 | bi,i+3 | … | bi,i+1 | bi,i+1 | bi,i+0 | bi,i+0 | +++ // Swap position of −ar⋅bi and ai⋅bi. +++ sum_a_mult_b_imag = _mm256_permute_ps(sum_a_mult_b_imag, _MM_SHUFFLE(2, 3, 0, 1)); +++ // | ai⋅br + ai⋅bi | ai⋅br − ar⋅bi |, sum contains four such partial sums. +++ __m256 sum = _mm256_add_ps(sum_a_mult_b_real, sum_a_mult_b_imag); +++ /* Sum the four partial sums: Add high half of vector sum to the low one, i.e. +++ * s1 + s3 and s0 + s2 … ++ */ ++- __m256 a = _mm256_load_ps((const float *) &input[i]); ++- __m256 b = _mm256_load_ps((const float *) &taps[i]); ++- __m256 b_real = _mm256_moveldup_ps(b); ++- __m256 b_imag = _mm256_movehdup_ps(b); ++- ++- // Add | ai⋅br,i+3 | ar⋅br,i+3 | … | ai⋅br,i+0 | ar⋅br,i+0 | to partial sum. 
++- sum_a_mult_b_real = _mm256_add_ps(sum_a_mult_b_real, _mm256_mul_ps(a, b_real)); ++- // Add | ai⋅bi,i+3 | −ar⋅bi,i+3 | … | ai⋅bi,i+0 | −ar⋅bi,i+0 | to partial sum. ++- sum_a_mult_b_imag = _mm256_addsub_ps(sum_a_mult_b_imag, _mm256_mul_ps(a, b_imag)); ++- } ++- ++- // Swap position of −ar⋅bi and ai⋅bi. ++- sum_a_mult_b_imag = _mm256_permute_ps(sum_a_mult_b_imag, _MM_SHUFFLE(2, 3, 0, 1)); ++- // | ai⋅br + ai⋅bi | ai⋅br − ar⋅bi |, sum contains four such partial sums. ++- __m256 sum = _mm256_add_ps(sum_a_mult_b_real, sum_a_mult_b_imag); ++- /* Sum the four partial sums: Add high half of vector sum to the low one, i.e. ++- * s1 + s3 and s0 + s2 … ++- */ ++- sum = _mm256_add_ps(sum, _mm256_permute2f128_ps(sum, sum, 0x01)); ++- // … and now (s0 + s2) + (s1 + s3) ++- sum = _mm256_add_ps(sum, _mm256_permute_ps(sum, _MM_SHUFFLE(1, 0, 3, 2))); ++- // Store result. ++- __m128 lower = _mm256_extractf128_ps(sum, 0); ++- _mm_storel_pi((__m64 *) result, lower); ++- ++- // Handle the last elements if num_points mod 4 is bigger than 0. ++- for (long unsigned i = num_points & ~3u; i < num_points; ++i) { ++- *result += lv_cmake( ++- lv_creal(input[i]) * lv_creal(taps[i]) + lv_cimag(input[i]) * lv_cimag(taps[i]), ++- lv_cimag(input[i]) * lv_creal(taps[i]) - lv_creal(input[i]) * lv_cimag(taps[i])); ++- } +++ sum = _mm256_add_ps(sum, _mm256_permute2f128_ps(sum, sum, 0x01)); +++ // … and now (s0 + s2) + (s1 + s3) +++ sum = _mm256_add_ps(sum, _mm256_permute_ps(sum, _MM_SHUFFLE(1, 0, 3, 2))); +++ // Store result. +++ __m128 lower = _mm256_extractf128_ps(sum, 0); +++ _mm_storel_pi((__m64*)result, lower); +++ +++ // Handle the last elements if num_points mod 4 is bigger than 0. +++ for (long unsigned i = num_points & ~3u; i < num_points; ++i) { +++ *result += lv_cmake(lv_creal(input[i]) * lv_creal(taps[i]) + +++ lv_cimag(input[i]) * lv_cimag(taps[i]), +++ lv_cimag(input[i]) * lv_creal(taps[i]) - +++ lv_creal(input[i]) * lv_cimag(taps[i])); +++ } ++ } ++ #endif /* LV_HAVE_AVX */ ++ ++ #ifdef LV_HAVE_SSE3 ++ ++-#include ++ #include +++#include ++ ++ static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a_sse3(lv_32fc_t* result, ++- const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) +++ const lv_32fc_t* input, +++ const lv_32fc_t* taps, +++ unsigned int num_points) ++ { ++- // Partial sums for indices i and i+1. ++- __m128 sum_a_mult_b_real = _mm_setzero_ps(); ++- __m128 sum_a_mult_b_imag = _mm_setzero_ps(); ++- ++- for (long unsigned i = 0; i < (num_points & ~1u); i += 2) { ++- /* Two complex elements a time are processed. ++- * (ar + j⋅ai)*conj(br + j⋅bi) = ++- * ar⋅br + ai⋅bi + j⋅(ai⋅br − ar⋅bi) ++- */ +++ // Partial sums for indices i and i+1. +++ __m128 sum_a_mult_b_real = _mm_setzero_ps(); +++ __m128 sum_a_mult_b_imag = _mm_setzero_ps(); +++ +++ for (long unsigned i = 0; i < (num_points & ~1u); i += 2) { +++ /* Two complex elements a time are processed. +++ * (ar + j⋅ai)*conj(br + j⋅bi) = +++ * ar⋅br + ai⋅bi + j⋅(ai⋅br − ar⋅bi) +++ */ +++ +++ /* Load input and taps, split and duplicate real und imaginary parts of taps. +++ * a: | ai,i+1 | ar,i+1 | ai,i+0 | ar,i+0 | +++ * b: | bi,i+1 | br,i+1 | bi,i+0 | br,i+0 | +++ * b_real: | br,i+1 | br,i+1 | br,i+0 | br,i+0 | +++ * b_imag: | bi,i+1 | bi,i+1 | bi,i+0 | bi,i+0 | +++ */ +++ __m128 a = _mm_load_ps((const float*)&input[i]); +++ __m128 b = _mm_load_ps((const float*)&taps[i]); +++ __m128 b_real = _mm_moveldup_ps(b); +++ __m128 b_imag = _mm_movehdup_ps(b); +++ +++ // Add | ai⋅br,i+1 | ar⋅br,i+1 | ai⋅br,i+0 | ar⋅br,i+0 | to partial sum. 
+++ sum_a_mult_b_real = _mm_add_ps(sum_a_mult_b_real, _mm_mul_ps(a, b_real)); +++ // Add | ai⋅bi,i+1 | −ar⋅bi,i+1 | ai⋅bi,i+0 | −ar⋅bi,i+0 | to partial sum. +++ sum_a_mult_b_imag = _mm_addsub_ps(sum_a_mult_b_imag, _mm_mul_ps(a, b_imag)); +++ } ++ ++- /* Load input and taps, split and duplicate real und imaginary parts of taps. ++- * a: | ai,i+1 | ar,i+1 | ai,i+0 | ar,i+0 | ++- * b: | bi,i+1 | br,i+1 | bi,i+0 | br,i+0 | ++- * b_real: | br,i+1 | br,i+1 | br,i+0 | br,i+0 | ++- * b_imag: | bi,i+1 | bi,i+1 | bi,i+0 | bi,i+0 | ++- */ ++- __m128 a = _mm_load_ps((const float *) &input[i]); ++- __m128 b = _mm_load_ps((const float *) &taps[i]); ++- __m128 b_real = _mm_moveldup_ps(b); ++- __m128 b_imag = _mm_movehdup_ps(b); ++- ++- // Add | ai⋅br,i+1 | ar⋅br,i+1 | ai⋅br,i+0 | ar⋅br,i+0 | to partial sum. ++- sum_a_mult_b_real = _mm_add_ps(sum_a_mult_b_real, _mm_mul_ps(a, b_real)); ++- // Add | ai⋅bi,i+1 | −ar⋅bi,i+1 | ai⋅bi,i+0 | −ar⋅bi,i+0 | to partial sum. ++- sum_a_mult_b_imag = _mm_addsub_ps(sum_a_mult_b_imag, _mm_mul_ps(a, b_imag)); ++- } ++- ++- // Swap position of −ar⋅bi and ai⋅bi. ++- sum_a_mult_b_imag = _mm_shuffle_ps(sum_a_mult_b_imag, sum_a_mult_b_imag, ++- _MM_SHUFFLE(2, 3, 0, 1)); ++- // | ai⋅br + ai⋅bi | ai⋅br − ar⋅bi |, sum contains two such partial sums. ++- __m128 sum = _mm_add_ps(sum_a_mult_b_real, sum_a_mult_b_imag); ++- // Sum the two partial sums. ++- sum = _mm_add_ps(sum, _mm_shuffle_ps(sum, sum, _MM_SHUFFLE(1, 0, 3, 2))); ++- // Store result. ++- _mm_storel_pi((__m64 *) result, sum); ++- ++- // Handle the last element if num_points mod 2 is 1. ++- if (num_points & 1u) { ++- *result += lv_cmake( ++- lv_creal(input[num_points - 1]) * lv_creal(taps[num_points - 1]) + ++- lv_cimag(input[num_points - 1]) * lv_cimag(taps[num_points - 1]), ++- lv_cimag(input[num_points - 1]) * lv_creal(taps[num_points - 1]) - ++- lv_creal(input[num_points - 1]) * lv_cimag(taps[num_points - 1])); ++- } +++ // Swap position of −ar⋅bi and ai⋅bi. +++ sum_a_mult_b_imag = +++ _mm_shuffle_ps(sum_a_mult_b_imag, sum_a_mult_b_imag, _MM_SHUFFLE(2, 3, 0, 1)); +++ // | ai⋅br + ai⋅bi | ai⋅br − ar⋅bi |, sum contains two such partial sums. +++ __m128 sum = _mm_add_ps(sum_a_mult_b_real, sum_a_mult_b_imag); +++ // Sum the two partial sums. +++ sum = _mm_add_ps(sum, _mm_shuffle_ps(sum, sum, _MM_SHUFFLE(1, 0, 3, 2))); +++ // Store result. +++ _mm_storel_pi((__m64*)result, sum); +++ +++ // Handle the last element if num_points mod 2 is 1. 
+++ if (num_points & 1u) { +++ *result += lv_cmake( +++ lv_creal(input[num_points - 1]) * lv_creal(taps[num_points - 1]) + +++ lv_cimag(input[num_points - 1]) * lv_cimag(taps[num_points - 1]), +++ lv_cimag(input[num_points - 1]) * lv_creal(taps[num_points - 1]) - +++ lv_creal(input[num_points - 1]) * lv_cimag(taps[num_points - 1])); +++ } ++ } ++ ++ #endif /*LV_HAVE_SSE3*/ ++@@ -390,35 +407,39 @@ static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a_sse3(lv_32fc_t* result ++ #ifdef LV_HAVE_GENERIC ++ ++ ++-static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a_generic(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) { +++static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a_generic(lv_32fc_t* result, +++ const lv_32fc_t* input, +++ const lv_32fc_t* taps, +++ unsigned int num_points) +++{ ++ ++- const unsigned int num_bytes = num_points*8; +++ const unsigned int num_bytes = num_points * 8; ++ ++- float * res = (float*) result; ++- float * in = (float*) input; ++- float * tp = (float*) taps; ++- unsigned int n_2_ccomplex_blocks = num_bytes >> 4; +++ float* res = (float*)result; +++ float* in = (float*)input; +++ float* tp = (float*)taps; +++ unsigned int n_2_ccomplex_blocks = num_bytes >> 4; ++ ++- float sum0[2] = {0,0}; ++- float sum1[2] = {0,0}; ++- unsigned int i = 0; +++ float sum0[2] = { 0, 0 }; +++ float sum1[2] = { 0, 0 }; +++ unsigned int i = 0; ++ ++- for(i = 0; i < n_2_ccomplex_blocks; ++i) { ++- sum0[0] += in[0] * tp[0] + in[1] * tp[1]; ++- sum0[1] += (-in[0] * tp[1]) + in[1] * tp[0]; ++- sum1[0] += in[2] * tp[2] + in[3] * tp[3]; ++- sum1[1] += (-in[2] * tp[3]) + in[3] * tp[2]; +++ for (i = 0; i < n_2_ccomplex_blocks; ++i) { +++ sum0[0] += in[0] * tp[0] + in[1] * tp[1]; +++ sum0[1] += (-in[0] * tp[1]) + in[1] * tp[0]; +++ sum1[0] += in[2] * tp[2] + in[3] * tp[3]; +++ sum1[1] += (-in[2] * tp[3]) + in[3] * tp[2]; ++ ++- in += 4; ++- tp += 4; ++- } +++ in += 4; +++ tp += 4; +++ } ++ ++- res[0] = sum0[0] + sum1[0]; ++- res[1] = sum0[1] + sum1[1]; +++ res[0] = sum0[0] + sum1[0]; +++ res[1] = sum0[1] + sum1[1]; ++ ++- if (num_bytes >> 3 & 1) { ++- *result += input[(num_bytes >> 3) - 1] * lv_conj(taps[(num_bytes >> 3) - 1]); ++- } +++ if (num_bytes >> 3 & 1) { +++ *result += input[(num_bytes >> 3) - 1] * lv_conj(taps[(num_bytes >> 3) - 1]); +++ } ++ } ++ ++ #endif /*LV_HAVE_GENERIC*/ ++@@ -426,256 +447,276 @@ static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a_generic(lv_32fc_t* res ++ ++ #if LV_HAVE_SSE && LV_HAVE_64 ++ ++-static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a_sse(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) { ++- ++- const unsigned int num_bytes = num_points*8; ++- ++- __VOLK_ATTR_ALIGNED(16) static const uint32_t conjugator[4]= {0x00000000, 0x80000000, 0x00000000, 0x80000000}; ++- ++- __VOLK_ASM __VOLK_VOLATILE ++- ( ++- "# ccomplex_conjugate_dotprod_generic (float* result, const float *input,\n\t" ++- "# const float *taps, unsigned num_bytes)\n\t" ++- "# float sum0 = 0;\n\t" ++- "# float sum1 = 0;\n\t" ++- "# float sum2 = 0;\n\t" ++- "# float sum3 = 0;\n\t" ++- "# do {\n\t" ++- "# sum0 += input[0] * taps[0] - input[1] * taps[1];\n\t" ++- "# sum1 += input[0] * taps[1] + input[1] * taps[0];\n\t" ++- "# sum2 += input[2] * taps[2] - input[3] * taps[3];\n\t" ++- "# sum3 += input[2] * taps[3] + input[3] * taps[2];\n\t" ++- "# input += 4;\n\t" ++- "# taps += 4; \n\t" ++- "# } while (--n_2_ccomplex_blocks != 0);\n\t" ++- "# result[0] = sum0 + sum2;\n\t" ++- "# 
result[1] = sum1 + sum3;\n\t" ++- "# TODO: prefetch and better scheduling\n\t" ++- " xor %%r9, %%r9\n\t" ++- " xor %%r10, %%r10\n\t" ++- " movq %[conjugator], %%r9\n\t" ++- " movq %%rcx, %%rax\n\t" ++- " movaps 0(%%r9), %%xmm8\n\t" ++- " movq %%rcx, %%r8\n\t" ++- " movq %[rsi], %%r9\n\t" ++- " movq %[rdx], %%r10\n\t" ++- " xorps %%xmm6, %%xmm6 # zero accumulators\n\t" ++- " movaps 0(%%r9), %%xmm0\n\t" ++- " xorps %%xmm7, %%xmm7 # zero accumulators\n\t" ++- " movups 0(%%r10), %%xmm2\n\t" ++- " shr $5, %%rax # rax = n_2_ccomplex_blocks / 2\n\t" ++- " shr $4, %%r8\n\t" ++- " xorps %%xmm8, %%xmm2\n\t" ++- " jmp .%=L1_test\n\t" ++- " # 4 taps / loop\n\t" ++- " # something like ?? cycles / loop\n\t" ++- ".%=Loop1: \n\t" ++- "# complex prod: C += A * B, w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t" ++- "# movaps (%%r9), %%xmmA\n\t" ++- "# movaps (%%r10), %%xmmB\n\t" ++- "# movaps %%xmmA, %%xmmZ\n\t" ++- "# shufps $0xb1, %%xmmZ, %%xmmZ # swap internals\n\t" ++- "# mulps %%xmmB, %%xmmA\n\t" ++- "# mulps %%xmmZ, %%xmmB\n\t" ++- "# # SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t" ++- "# xorps %%xmmPN, %%xmmA\n\t" ++- "# movaps %%xmmA, %%xmmZ\n\t" ++- "# unpcklps %%xmmB, %%xmmA\n\t" ++- "# unpckhps %%xmmB, %%xmmZ\n\t" ++- "# movaps %%xmmZ, %%xmmY\n\t" ++- "# shufps $0x44, %%xmmA, %%xmmZ # b01000100\n\t" ++- "# shufps $0xee, %%xmmY, %%xmmA # b11101110\n\t" ++- "# addps %%xmmZ, %%xmmA\n\t" ++- "# addps %%xmmA, %%xmmC\n\t" ++- "# A=xmm0, B=xmm2, Z=xmm4\n\t" ++- "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t" ++- " movaps 16(%%r9), %%xmm1\n\t" ++- " movaps %%xmm0, %%xmm4\n\t" ++- " mulps %%xmm2, %%xmm0\n\t" ++- " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" ++- " movaps 16(%%r10), %%xmm3\n\t" ++- " movaps %%xmm1, %%xmm5\n\t" ++- " xorps %%xmm8, %%xmm3\n\t" ++- " addps %%xmm0, %%xmm6\n\t" ++- " mulps %%xmm3, %%xmm1\n\t" ++- " shufps $0xb1, %%xmm5, %%xmm5 # swap internals\n\t" ++- " addps %%xmm1, %%xmm6\n\t" ++- " mulps %%xmm4, %%xmm2\n\t" ++- " movaps 32(%%r9), %%xmm0\n\t" ++- " addps %%xmm2, %%xmm7\n\t" ++- " mulps %%xmm5, %%xmm3\n\t" ++- " add $32, %%r9\n\t" ++- " movaps 32(%%r10), %%xmm2\n\t" ++- " addps %%xmm3, %%xmm7\n\t" ++- " add $32, %%r10\n\t" ++- " xorps %%xmm8, %%xmm2\n\t" ++- ".%=L1_test:\n\t" ++- " dec %%rax\n\t" ++- " jge .%=Loop1\n\t" ++- " # We've handled the bulk of multiplies up to here.\n\t" ++- " # Let's sse if original n_2_ccomplex_blocks was odd.\n\t" ++- " # If so, we've got 2 more taps to do.\n\t" ++- " and $1, %%r8\n\t" ++- " je .%=Leven\n\t" ++- " # The count was odd, do 2 more taps.\n\t" ++- " # Note that we've already got mm0/mm2 preloaded\n\t" ++- " # from the main loop.\n\t" ++- " movaps %%xmm0, %%xmm4\n\t" ++- " mulps %%xmm2, %%xmm0\n\t" ++- " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" ++- " addps %%xmm0, %%xmm6\n\t" ++- " mulps %%xmm4, %%xmm2\n\t" ++- " addps %%xmm2, %%xmm7\n\t" ++- ".%=Leven:\n\t" ++- " # neg inversor\n\t" ++- " xorps %%xmm1, %%xmm1\n\t" ++- " mov $0x80000000, %%r9\n\t" ++- " movd %%r9, %%xmm1\n\t" ++- " shufps $0x11, %%xmm1, %%xmm1 # b00010001 # 0 -0 0 -0\n\t" ++- " # pfpnacc\n\t" ++- " xorps %%xmm1, %%xmm6\n\t" ++- " movaps %%xmm6, %%xmm2\n\t" ++- " unpcklps %%xmm7, %%xmm6\n\t" ++- " unpckhps %%xmm7, %%xmm2\n\t" ++- " movaps %%xmm2, %%xmm3\n\t" ++- " shufps $0x44, %%xmm6, %%xmm2 # b01000100\n\t" ++- " shufps $0xee, %%xmm3, %%xmm6 # b11101110\n\t" ++- " addps %%xmm2, %%xmm6\n\t" ++- " # xmm6 = r1 i2 r3 i4\n\t" ++- " movhlps %%xmm6, %%xmm4 # xmm4 = r3 i4 ?? ??\n\t" ++- " addps %%xmm4, %%xmm6 # xmm6 = r1+r3 i2+i4 ?? 
??\n\t" ++- " movlps %%xmm6, (%[rdi]) # store low 2x32 bits (complex) to memory\n\t" ++- : ++- :[rsi] "r" (input), [rdx] "r" (taps), "c" (num_bytes), [rdi] "r" (result), [conjugator] "r" (conjugator) ++- :"rax", "r8", "r9", "r10" ++- ); ++- ++- int getem = num_bytes % 16; ++- ++- for(; getem > 0; getem -= 8) { ++- *result += (input[(num_bytes >> 3) - 1] * lv_conj(taps[(num_bytes >> 3) - 1])); ++- } +++static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a_sse(lv_32fc_t* result, +++ const lv_32fc_t* input, +++ const lv_32fc_t* taps, +++ unsigned int num_points) +++{ +++ +++ const unsigned int num_bytes = num_points * 8; +++ +++ __VOLK_ATTR_ALIGNED(16) +++ static const uint32_t conjugator[4] = { +++ 0x00000000, 0x80000000, 0x00000000, 0x80000000 +++ }; +++ +++ __VOLK_ASM __VOLK_VOLATILE( +++ "# ccomplex_conjugate_dotprod_generic (float* result, const float *input,\n\t" +++ "# const float *taps, unsigned num_bytes)\n\t" +++ "# float sum0 = 0;\n\t" +++ "# float sum1 = 0;\n\t" +++ "# float sum2 = 0;\n\t" +++ "# float sum3 = 0;\n\t" +++ "# do {\n\t" +++ "# sum0 += input[0] * taps[0] - input[1] * taps[1];\n\t" +++ "# sum1 += input[0] * taps[1] + input[1] * taps[0];\n\t" +++ "# sum2 += input[2] * taps[2] - input[3] * taps[3];\n\t" +++ "# sum3 += input[2] * taps[3] + input[3] * taps[2];\n\t" +++ "# input += 4;\n\t" +++ "# taps += 4; \n\t" +++ "# } while (--n_2_ccomplex_blocks != 0);\n\t" +++ "# result[0] = sum0 + sum2;\n\t" +++ "# result[1] = sum1 + sum3;\n\t" +++ "# TODO: prefetch and better scheduling\n\t" +++ " xor %%r9, %%r9\n\t" +++ " xor %%r10, %%r10\n\t" +++ " movq %[conjugator], %%r9\n\t" +++ " movq %%rcx, %%rax\n\t" +++ " movaps 0(%%r9), %%xmm8\n\t" +++ " movq %%rcx, %%r8\n\t" +++ " movq %[rsi], %%r9\n\t" +++ " movq %[rdx], %%r10\n\t" +++ " xorps %%xmm6, %%xmm6 # zero accumulators\n\t" +++ " movaps 0(%%r9), %%xmm0\n\t" +++ " xorps %%xmm7, %%xmm7 # zero accumulators\n\t" +++ " movups 0(%%r10), %%xmm2\n\t" +++ " shr $5, %%rax # rax = n_2_ccomplex_blocks / 2\n\t" +++ " shr $4, %%r8\n\t" +++ " xorps %%xmm8, %%xmm2\n\t" +++ " jmp .%=L1_test\n\t" +++ " # 4 taps / loop\n\t" +++ " # something like ?? 
cycles / loop\n\t" +++ ".%=Loop1: \n\t" +++ "# complex prod: C += A * B, w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t" +++ "# movaps (%%r9), %%xmmA\n\t" +++ "# movaps (%%r10), %%xmmB\n\t" +++ "# movaps %%xmmA, %%xmmZ\n\t" +++ "# shufps $0xb1, %%xmmZ, %%xmmZ # swap internals\n\t" +++ "# mulps %%xmmB, %%xmmA\n\t" +++ "# mulps %%xmmZ, %%xmmB\n\t" +++ "# # SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t" +++ "# xorps %%xmmPN, %%xmmA\n\t" +++ "# movaps %%xmmA, %%xmmZ\n\t" +++ "# unpcklps %%xmmB, %%xmmA\n\t" +++ "# unpckhps %%xmmB, %%xmmZ\n\t" +++ "# movaps %%xmmZ, %%xmmY\n\t" +++ "# shufps $0x44, %%xmmA, %%xmmZ # b01000100\n\t" +++ "# shufps $0xee, %%xmmY, %%xmmA # b11101110\n\t" +++ "# addps %%xmmZ, %%xmmA\n\t" +++ "# addps %%xmmA, %%xmmC\n\t" +++ "# A=xmm0, B=xmm2, Z=xmm4\n\t" +++ "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t" +++ " movaps 16(%%r9), %%xmm1\n\t" +++ " movaps %%xmm0, %%xmm4\n\t" +++ " mulps %%xmm2, %%xmm0\n\t" +++ " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" +++ " movaps 16(%%r10), %%xmm3\n\t" +++ " movaps %%xmm1, %%xmm5\n\t" +++ " xorps %%xmm8, %%xmm3\n\t" +++ " addps %%xmm0, %%xmm6\n\t" +++ " mulps %%xmm3, %%xmm1\n\t" +++ " shufps $0xb1, %%xmm5, %%xmm5 # swap internals\n\t" +++ " addps %%xmm1, %%xmm6\n\t" +++ " mulps %%xmm4, %%xmm2\n\t" +++ " movaps 32(%%r9), %%xmm0\n\t" +++ " addps %%xmm2, %%xmm7\n\t" +++ " mulps %%xmm5, %%xmm3\n\t" +++ " add $32, %%r9\n\t" +++ " movaps 32(%%r10), %%xmm2\n\t" +++ " addps %%xmm3, %%xmm7\n\t" +++ " add $32, %%r10\n\t" +++ " xorps %%xmm8, %%xmm2\n\t" +++ ".%=L1_test:\n\t" +++ " dec %%rax\n\t" +++ " jge .%=Loop1\n\t" +++ " # We've handled the bulk of multiplies up to here.\n\t" +++ " # Let's sse if original n_2_ccomplex_blocks was odd.\n\t" +++ " # If so, we've got 2 more taps to do.\n\t" +++ " and $1, %%r8\n\t" +++ " je .%=Leven\n\t" +++ " # The count was odd, do 2 more taps.\n\t" +++ " # Note that we've already got mm0/mm2 preloaded\n\t" +++ " # from the main loop.\n\t" +++ " movaps %%xmm0, %%xmm4\n\t" +++ " mulps %%xmm2, %%xmm0\n\t" +++ " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" +++ " addps %%xmm0, %%xmm6\n\t" +++ " mulps %%xmm4, %%xmm2\n\t" +++ " addps %%xmm2, %%xmm7\n\t" +++ ".%=Leven:\n\t" +++ " # neg inversor\n\t" +++ " xorps %%xmm1, %%xmm1\n\t" +++ " mov $0x80000000, %%r9\n\t" +++ " movd %%r9, %%xmm1\n\t" +++ " shufps $0x11, %%xmm1, %%xmm1 # b00010001 # 0 -0 0 -0\n\t" +++ " # pfpnacc\n\t" +++ " xorps %%xmm1, %%xmm6\n\t" +++ " movaps %%xmm6, %%xmm2\n\t" +++ " unpcklps %%xmm7, %%xmm6\n\t" +++ " unpckhps %%xmm7, %%xmm2\n\t" +++ " movaps %%xmm2, %%xmm3\n\t" +++ " shufps $0x44, %%xmm6, %%xmm2 # b01000100\n\t" +++ " shufps $0xee, %%xmm3, %%xmm6 # b11101110\n\t" +++ " addps %%xmm2, %%xmm6\n\t" +++ " # xmm6 = r1 i2 r3 i4\n\t" +++ " movhlps %%xmm6, %%xmm4 # xmm4 = r3 i4 ?? ??\n\t" +++ " addps %%xmm4, %%xmm6 # xmm6 = r1+r3 i2+i4 ?? 
??\n\t" +++ " movlps %%xmm6, (%[rdi]) # store low 2x32 bits (complex) " +++ "to memory\n\t" +++ : +++ : [rsi] "r"(input), +++ [rdx] "r"(taps), +++ "c"(num_bytes), +++ [rdi] "r"(result), +++ [conjugator] "r"(conjugator) +++ : "rax", "r8", "r9", "r10"); +++ +++ int getem = num_bytes % 16; +++ +++ for (; getem > 0; getem -= 8) { +++ *result += (input[(num_bytes >> 3) - 1] * lv_conj(taps[(num_bytes >> 3) - 1])); +++ } ++ } ++ #endif ++ ++ #if LV_HAVE_SSE && LV_HAVE_32 ++-static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a_sse_32(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) { ++- ++- const unsigned int num_bytes = num_points*8; ++- ++- __VOLK_ATTR_ALIGNED(16) static const uint32_t conjugator[4]= {0x00000000, 0x80000000, 0x00000000, 0x80000000}; ++- ++- int bound = num_bytes >> 4; ++- int leftovers = num_bytes % 16; ++- ++- __VOLK_ASM __VOLK_VOLATILE ++- ( ++- " #pushl %%ebp\n\t" ++- " #movl %%esp, %%ebp\n\t" ++- " #movl 12(%%ebp), %%eax # input\n\t" ++- " #movl 16(%%ebp), %%edx # taps\n\t" ++- " #movl 20(%%ebp), %%ecx # n_bytes\n\t" ++- " movaps 0(%[conjugator]), %%xmm1\n\t" ++- " xorps %%xmm6, %%xmm6 # zero accumulators\n\t" ++- " movaps 0(%[eax]), %%xmm0\n\t" ++- " xorps %%xmm7, %%xmm7 # zero accumulators\n\t" ++- " movaps 0(%[edx]), %%xmm2\n\t" ++- " movl %[ecx], (%[out])\n\t" ++- " shrl $5, %[ecx] # ecx = n_2_ccomplex_blocks / 2\n\t" ++- ++- " xorps %%xmm1, %%xmm2\n\t" ++- " jmp .%=L1_test\n\t" ++- " # 4 taps / loop\n\t" ++- " # something like ?? cycles / loop\n\t" ++- ".%=Loop1: \n\t" ++- "# complex prod: C += A * B, w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t" ++- "# movaps (%[eax]), %%xmmA\n\t" ++- "# movaps (%[edx]), %%xmmB\n\t" ++- "# movaps %%xmmA, %%xmmZ\n\t" ++- "# shufps $0xb1, %%xmmZ, %%xmmZ # swap internals\n\t" ++- "# mulps %%xmmB, %%xmmA\n\t" ++- "# mulps %%xmmZ, %%xmmB\n\t" ++- "# # SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t" ++- "# xorps %%xmmPN, %%xmmA\n\t" ++- "# movaps %%xmmA, %%xmmZ\n\t" ++- "# unpcklps %%xmmB, %%xmmA\n\t" ++- "# unpckhps %%xmmB, %%xmmZ\n\t" ++- "# movaps %%xmmZ, %%xmmY\n\t" ++- "# shufps $0x44, %%xmmA, %%xmmZ # b01000100\n\t" ++- "# shufps $0xee, %%xmmY, %%xmmA # b11101110\n\t" ++- "# addps %%xmmZ, %%xmmA\n\t" ++- "# addps %%xmmA, %%xmmC\n\t" ++- "# A=xmm0, B=xmm2, Z=xmm4\n\t" ++- "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t" ++- " movaps 16(%[edx]), %%xmm3\n\t" ++- " movaps %%xmm0, %%xmm4\n\t" ++- " xorps %%xmm1, %%xmm3\n\t" ++- " mulps %%xmm2, %%xmm0\n\t" ++- " movaps 16(%[eax]), %%xmm1\n\t" ++- " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" ++- " movaps %%xmm1, %%xmm5\n\t" ++- " addps %%xmm0, %%xmm6\n\t" ++- " mulps %%xmm3, %%xmm1\n\t" ++- " shufps $0xb1, %%xmm5, %%xmm5 # swap internals\n\t" ++- " addps %%xmm1, %%xmm6\n\t" ++- " movaps 0(%[conjugator]), %%xmm1\n\t" ++- " mulps %%xmm4, %%xmm2\n\t" ++- " movaps 32(%[eax]), %%xmm0\n\t" ++- " addps %%xmm2, %%xmm7\n\t" ++- " mulps %%xmm5, %%xmm3\n\t" ++- " addl $32, %[eax]\n\t" ++- " movaps 32(%[edx]), %%xmm2\n\t" ++- " addps %%xmm3, %%xmm7\n\t" ++- " xorps %%xmm1, %%xmm2\n\t" ++- " addl $32, %[edx]\n\t" ++- ".%=L1_test:\n\t" ++- " decl %[ecx]\n\t" ++- " jge .%=Loop1\n\t" ++- " # We've handled the bulk of multiplies up to here.\n\t" ++- " # Let's sse if original n_2_ccomplex_blocks was odd.\n\t" ++- " # If so, we've got 2 more taps to do.\n\t" ++- " movl 0(%[out]), %[ecx] # n_2_ccomplex_blocks\n\t" ++- " shrl $4, %[ecx]\n\t" ++- " andl $1, %[ecx]\n\t" ++- " je .%=Leven\n\t" ++- " # The count was odd, do 2 more taps.\n\t" ++- " # Note that we've 
already got mm0/mm2 preloaded\n\t" ++- " # from the main loop.\n\t" ++- " movaps %%xmm0, %%xmm4\n\t" ++- " mulps %%xmm2, %%xmm0\n\t" ++- " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" ++- " addps %%xmm0, %%xmm6\n\t" ++- " mulps %%xmm4, %%xmm2\n\t" ++- " addps %%xmm2, %%xmm7\n\t" ++- ".%=Leven:\n\t" ++- " # neg inversor\n\t" ++- " #movl 8(%%ebp), %[eax] \n\t" ++- " xorps %%xmm1, %%xmm1\n\t" ++- " movl $0x80000000, (%[out])\n\t" ++- " movss (%[out]), %%xmm1\n\t" ++- " shufps $0x11, %%xmm1, %%xmm1 # b00010001 # 0 -0 0 -0\n\t" ++- " # pfpnacc\n\t" ++- " xorps %%xmm1, %%xmm6\n\t" ++- " movaps %%xmm6, %%xmm2\n\t" ++- " unpcklps %%xmm7, %%xmm6\n\t" ++- " unpckhps %%xmm7, %%xmm2\n\t" ++- " movaps %%xmm2, %%xmm3\n\t" ++- " shufps $0x44, %%xmm6, %%xmm2 # b01000100\n\t" ++- " shufps $0xee, %%xmm3, %%xmm6 # b11101110\n\t" ++- " addps %%xmm2, %%xmm6\n\t" ++- " # xmm6 = r1 i2 r3 i4\n\t" ++- " #movl 8(%%ebp), %[eax] # @result\n\t" ++- " movhlps %%xmm6, %%xmm4 # xmm4 = r3 i4 ?? ??\n\t" ++- " addps %%xmm4, %%xmm6 # xmm6 = r1+r3 i2+i4 ?? ??\n\t" ++- " movlps %%xmm6, (%[out]) # store low 2x32 bits (complex) to memory\n\t" ++- " #popl %%ebp\n\t" ++- : ++- : [eax] "r" (input), [edx] "r" (taps), [ecx] "r" (num_bytes), [out] "r" (result), [conjugator] "r" (conjugator) ++- ); ++- ++- for(; leftovers > 0; leftovers -= 8) { ++- *result += (input[(bound << 1)] * lv_conj(taps[(bound << 1)])); ++- } +++static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a_sse_32(lv_32fc_t* result, +++ const lv_32fc_t* input, +++ const lv_32fc_t* taps, +++ unsigned int num_points) +++{ +++ +++ const unsigned int num_bytes = num_points * 8; +++ +++ __VOLK_ATTR_ALIGNED(16) +++ static const uint32_t conjugator[4] = { +++ 0x00000000, 0x80000000, 0x00000000, 0x80000000 +++ }; +++ +++ int bound = num_bytes >> 4; +++ int leftovers = num_bytes % 16; +++ +++ __VOLK_ASM __VOLK_VOLATILE( +++ " #pushl %%ebp\n\t" +++ " #movl %%esp, %%ebp\n\t" +++ " #movl 12(%%ebp), %%eax # input\n\t" +++ " #movl 16(%%ebp), %%edx # taps\n\t" +++ " #movl 20(%%ebp), %%ecx # n_bytes\n\t" +++ " movaps 0(%[conjugator]), %%xmm1\n\t" +++ " xorps %%xmm6, %%xmm6 # zero accumulators\n\t" +++ " movaps 0(%[eax]), %%xmm0\n\t" +++ " xorps %%xmm7, %%xmm7 # zero accumulators\n\t" +++ " movaps 0(%[edx]), %%xmm2\n\t" +++ " movl %[ecx], (%[out])\n\t" +++ " shrl $5, %[ecx] # ecx = n_2_ccomplex_blocks / 2\n\t" +++ +++ " xorps %%xmm1, %%xmm2\n\t" +++ " jmp .%=L1_test\n\t" +++ " # 4 taps / loop\n\t" +++ " # something like ?? 
cycles / loop\n\t" +++ ".%=Loop1: \n\t" +++ "# complex prod: C += A * B, w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t" +++ "# movaps (%[eax]), %%xmmA\n\t" +++ "# movaps (%[edx]), %%xmmB\n\t" +++ "# movaps %%xmmA, %%xmmZ\n\t" +++ "# shufps $0xb1, %%xmmZ, %%xmmZ # swap internals\n\t" +++ "# mulps %%xmmB, %%xmmA\n\t" +++ "# mulps %%xmmZ, %%xmmB\n\t" +++ "# # SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t" +++ "# xorps %%xmmPN, %%xmmA\n\t" +++ "# movaps %%xmmA, %%xmmZ\n\t" +++ "# unpcklps %%xmmB, %%xmmA\n\t" +++ "# unpckhps %%xmmB, %%xmmZ\n\t" +++ "# movaps %%xmmZ, %%xmmY\n\t" +++ "# shufps $0x44, %%xmmA, %%xmmZ # b01000100\n\t" +++ "# shufps $0xee, %%xmmY, %%xmmA # b11101110\n\t" +++ "# addps %%xmmZ, %%xmmA\n\t" +++ "# addps %%xmmA, %%xmmC\n\t" +++ "# A=xmm0, B=xmm2, Z=xmm4\n\t" +++ "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t" +++ " movaps 16(%[edx]), %%xmm3\n\t" +++ " movaps %%xmm0, %%xmm4\n\t" +++ " xorps %%xmm1, %%xmm3\n\t" +++ " mulps %%xmm2, %%xmm0\n\t" +++ " movaps 16(%[eax]), %%xmm1\n\t" +++ " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" +++ " movaps %%xmm1, %%xmm5\n\t" +++ " addps %%xmm0, %%xmm6\n\t" +++ " mulps %%xmm3, %%xmm1\n\t" +++ " shufps $0xb1, %%xmm5, %%xmm5 # swap internals\n\t" +++ " addps %%xmm1, %%xmm6\n\t" +++ " movaps 0(%[conjugator]), %%xmm1\n\t" +++ " mulps %%xmm4, %%xmm2\n\t" +++ " movaps 32(%[eax]), %%xmm0\n\t" +++ " addps %%xmm2, %%xmm7\n\t" +++ " mulps %%xmm5, %%xmm3\n\t" +++ " addl $32, %[eax]\n\t" +++ " movaps 32(%[edx]), %%xmm2\n\t" +++ " addps %%xmm3, %%xmm7\n\t" +++ " xorps %%xmm1, %%xmm2\n\t" +++ " addl $32, %[edx]\n\t" +++ ".%=L1_test:\n\t" +++ " decl %[ecx]\n\t" +++ " jge .%=Loop1\n\t" +++ " # We've handled the bulk of multiplies up to here.\n\t" +++ " # Let's sse if original n_2_ccomplex_blocks was odd.\n\t" +++ " # If so, we've got 2 more taps to do.\n\t" +++ " movl 0(%[out]), %[ecx] # n_2_ccomplex_blocks\n\t" +++ " shrl $4, %[ecx]\n\t" +++ " andl $1, %[ecx]\n\t" +++ " je .%=Leven\n\t" +++ " # The count was odd, do 2 more taps.\n\t" +++ " # Note that we've already got mm0/mm2 preloaded\n\t" +++ " # from the main loop.\n\t" +++ " movaps %%xmm0, %%xmm4\n\t" +++ " mulps %%xmm2, %%xmm0\n\t" +++ " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" +++ " addps %%xmm0, %%xmm6\n\t" +++ " mulps %%xmm4, %%xmm2\n\t" +++ " addps %%xmm2, %%xmm7\n\t" +++ ".%=Leven:\n\t" +++ " # neg inversor\n\t" +++ " #movl 8(%%ebp), %[eax] \n\t" +++ " xorps %%xmm1, %%xmm1\n\t" +++ " movl $0x80000000, (%[out])\n\t" +++ " movss (%[out]), %%xmm1\n\t" +++ " shufps $0x11, %%xmm1, %%xmm1 # b00010001 # 0 -0 0 -0\n\t" +++ " # pfpnacc\n\t" +++ " xorps %%xmm1, %%xmm6\n\t" +++ " movaps %%xmm6, %%xmm2\n\t" +++ " unpcklps %%xmm7, %%xmm6\n\t" +++ " unpckhps %%xmm7, %%xmm2\n\t" +++ " movaps %%xmm2, %%xmm3\n\t" +++ " shufps $0x44, %%xmm6, %%xmm2 # b01000100\n\t" +++ " shufps $0xee, %%xmm3, %%xmm6 # b11101110\n\t" +++ " addps %%xmm2, %%xmm6\n\t" +++ " # xmm6 = r1 i2 r3 i4\n\t" +++ " #movl 8(%%ebp), %[eax] # @result\n\t" +++ " movhlps %%xmm6, %%xmm4 # xmm4 = r3 i4 ?? ??\n\t" +++ " addps %%xmm4, %%xmm6 # xmm6 = r1+r3 i2+i4 ?? 
??\n\t" +++ " movlps %%xmm6, (%[out]) # store low 2x32 bits (complex) " +++ "to memory\n\t" +++ " #popl %%ebp\n\t" +++ : +++ : [eax] "r"(input), +++ [edx] "r"(taps), +++ [ecx] "r"(num_bytes), +++ [out] "r"(result), +++ [conjugator] "r"(conjugator)); +++ +++ for (; leftovers > 0; leftovers -= 8) { +++ *result += (input[(bound << 1)] * lv_conj(taps[(bound << 1)])); +++ } ++ } ++ #endif /*LV_HAVE_SSE*/ ++ ++diff --git a/kernels/volk/volk_32fc_x2_divide_32fc.h b/kernels/volk/volk_32fc_x2_divide_32fc.h ++index 3ce6ede..78c245a 100644 ++--- a/kernels/volk/volk_32fc_x2_divide_32fc.h +++++ b/kernels/volk/volk_32fc_x2_divide_32fc.h ++@@ -29,8 +29,8 @@ ++ * ++ * Dispatcher Prototype ++ * \code ++- * void volk_32fc_x2_divide_32fc(lv_32fc_t* cVector, const lv_32fc_t* numeratorVector, const lv_32fc_t* denumeratorVector, unsigned int num_points); ++- * \endcode +++ * void volk_32fc_x2_divide_32fc(lv_32fc_t* cVector, const lv_32fc_t* numeratorVector, +++ * const lv_32fc_t* denumeratorVector, unsigned int num_points); \endcode ++ * ++ * \b Inputs ++ * \li numeratorVector: The numerator complex values. ++@@ -41,7 +41,8 @@ ++ * \li outputVector: The output vector complex floats. ++ * ++ * \b Example ++- * divide a complex vector by itself, demonstrating the result should be pretty close to 1+0j. +++ * divide a complex vector by itself, demonstrating the result should be pretty close to +++ * 1+0j. ++ * ++ * \code ++ * int N = 10; ++@@ -71,17 +72,18 @@ ++ #ifndef INCLUDED_volk_32fc_x2_divide_32fc_u_H ++ #define INCLUDED_volk_32fc_x2_divide_32fc_u_H ++ +++#include ++ #include ++ #include ++-#include ++ ++ #ifdef LV_HAVE_SSE3 ++ #include ++ #include ++ ++-static inline void ++-volk_32fc_x2_divide_32fc_u_sse3(lv_32fc_t* cVector, const lv_32fc_t* numeratorVector, ++- const lv_32fc_t* denumeratorVector, unsigned int num_points) +++static inline void volk_32fc_x2_divide_32fc_u_sse3(lv_32fc_t* cVector, +++ const lv_32fc_t* numeratorVector, +++ const lv_32fc_t* denumeratorVector, +++ unsigned int num_points) ++ { ++ /* ++ * we'll do the "classical" ++@@ -89,44 +91,46 @@ volk_32fc_x2_divide_32fc_u_sse3(lv_32fc_t* cVector, const lv_32fc_t* numeratorVe ++ * --- = ------- ++ * b |b|^2 ++ * */ ++- unsigned int number = 0; ++- const unsigned int quarterPoints = num_points / 4; ++- ++- __m128 num01, num23, den01, den23, norm, result; ++- lv_32fc_t* c = cVector; ++- const lv_32fc_t* a = numeratorVector; ++- const lv_32fc_t* b = denumeratorVector; ++- ++- for(; number < quarterPoints; number++){ ++- num01 = _mm_loadu_ps((float*) a); // first pair ++- den01 = _mm_loadu_ps((float*) b); // first pair ++- num01 = _mm_complexconjugatemul_ps(num01, den01); // a conj(b) ++- a += 2; ++- b += 2; ++- ++- num23 = _mm_loadu_ps((float*) a); // second pair ++- den23 = _mm_loadu_ps((float*) b); // second pair ++- num23 = _mm_complexconjugatemul_ps(num23, den23); // a conj(b) ++- a += 2; ++- b += 2; ++- ++- norm = _mm_magnitudesquared_ps_sse3(den01, den23); ++- den01 = _mm_unpacklo_ps(norm,norm); ++- den23 = _mm_unpackhi_ps(norm,norm); ++- ++- result = _mm_div_ps(num01, den01); ++- _mm_storeu_ps((float*) c, result); // Store the results back into the C container ++- c += 2; ++- result = _mm_div_ps(num23, den23); ++- _mm_storeu_ps((float*) c, result); // Store the results back into the C container ++- c += 2; ++- } ++- ++- number *= 4; ++- for(;number < num_points; number++){ ++- *c = (*a) / (*b); ++- a++; b++; c++; ++- } +++ unsigned int number = 0; +++ const unsigned int quarterPoints = num_points / 4; +++ +++ __m128 num01, num23, den01, 
den23, norm, result; +++ lv_32fc_t* c = cVector; +++ const lv_32fc_t* a = numeratorVector; +++ const lv_32fc_t* b = denumeratorVector; +++ +++ for (; number < quarterPoints; number++) { +++ num01 = _mm_loadu_ps((float*)a); // first pair +++ den01 = _mm_loadu_ps((float*)b); // first pair +++ num01 = _mm_complexconjugatemul_ps(num01, den01); // a conj(b) +++ a += 2; +++ b += 2; +++ +++ num23 = _mm_loadu_ps((float*)a); // second pair +++ den23 = _mm_loadu_ps((float*)b); // second pair +++ num23 = _mm_complexconjugatemul_ps(num23, den23); // a conj(b) +++ a += 2; +++ b += 2; +++ +++ norm = _mm_magnitudesquared_ps_sse3(den01, den23); +++ den01 = _mm_unpacklo_ps(norm, norm); +++ den23 = _mm_unpackhi_ps(norm, norm); +++ +++ result = _mm_div_ps(num01, den01); +++ _mm_storeu_ps((float*)c, result); // Store the results back into the C container +++ c += 2; +++ result = _mm_div_ps(num23, den23); +++ _mm_storeu_ps((float*)c, result); // Store the results back into the C container +++ c += 2; +++ } +++ +++ number *= 4; +++ for (; number < num_points; number++) { +++ *c = (*a) / (*b); +++ a++; +++ b++; +++ c++; +++ } ++ } ++ #endif /* LV_HAVE_SSE3 */ ++ ++@@ -135,9 +139,10 @@ volk_32fc_x2_divide_32fc_u_sse3(lv_32fc_t* cVector, const lv_32fc_t* numeratorVe ++ #include ++ #include ++ ++-static inline void ++-volk_32fc_x2_divide_32fc_u_avx(lv_32fc_t* cVector, const lv_32fc_t* numeratorVector, ++- const lv_32fc_t* denumeratorVector, unsigned int num_points) +++static inline void volk_32fc_x2_divide_32fc_u_avx(lv_32fc_t* cVector, +++ const lv_32fc_t* numeratorVector, +++ const lv_32fc_t* denumeratorVector, +++ unsigned int num_points) ++ { ++ /* ++ * we'll do the "classical" ++@@ -153,17 +158,21 @@ volk_32fc_x2_divide_32fc_u_avx(lv_32fc_t* cVector, const lv_32fc_t* numeratorVec ++ const lv_32fc_t* a = numeratorVector; ++ const lv_32fc_t* b = denumeratorVector; ++ ++- for(; number < quarterPoints; number++){ ++- num = _mm256_loadu_ps((float*) a); // Load the ar + ai, br + bi ... as ar,ai,br,bi ... ++- denum = _mm256_loadu_ps((float*) b); // Load the cr + ci, dr + di ... as cr,ci,dr,di ... +++ for (; number < quarterPoints; number++) { +++ num = _mm256_loadu_ps( +++ (float*)a); // Load the ar + ai, br + bi ... as ar,ai,br,bi ... +++ denum = _mm256_loadu_ps( +++ (float*)b); // Load the cr + ci, dr + di ... as cr,ci,dr,di ... 
++ mul_conj = _mm256_complexconjugatemul_ps(num, denum); ++ sq = _mm256_mul_ps(denum, denum); // Square the values ++- mag_sq_un = _mm256_hadd_ps(sq,sq); // obtain the actual squared magnitude, although out of order +++ mag_sq_un = _mm256_hadd_ps( +++ sq, sq); // obtain the actual squared magnitude, although out of order ++ mag_sq = _mm256_permute_ps(mag_sq_un, 0xd8); // I order them ++- // best guide I found on using these functions: https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=2738,2059,2738,2738,3875,3874,3875,2738,3870 ++- div = _mm256_div_ps(mul_conj,mag_sq); +++ // best guide I found on using these functions: +++ // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=2738,2059,2738,2738,3875,3874,3875,2738,3870 +++ div = _mm256_div_ps(mul_conj, mag_sq); ++ ++- _mm256_storeu_ps((float*) c, div); // Store the results back into the C container +++ _mm256_storeu_ps((float*)c, div); // Store the results back into the C container ++ ++ a += 4; ++ b += 4; ++@@ -172,51 +181,51 @@ volk_32fc_x2_divide_32fc_u_avx(lv_32fc_t* cVector, const lv_32fc_t* numeratorVec ++ ++ number = quarterPoints * 4; ++ ++- for(; number < num_points; number++){ +++ for (; number < num_points; number++) { ++ *c++ = (*a++) / (*b++); ++ } ++- ++ } ++ #endif /* LV_HAVE_AVX */ ++ ++ ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void ++-volk_32fc_x2_divide_32fc_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, ++- const lv_32fc_t* bVector, unsigned int num_points) +++static inline void volk_32fc_x2_divide_32fc_generic(lv_32fc_t* cVector, +++ const lv_32fc_t* aVector, +++ const lv_32fc_t* bVector, +++ unsigned int num_points) ++ { ++- lv_32fc_t* cPtr = cVector; ++- const lv_32fc_t* aPtr = aVector; ++- const lv_32fc_t* bPtr= bVector; ++- unsigned int number = 0; ++- ++- for(number = 0; number < num_points; number++){ ++- *cPtr++ = (*aPtr++) / (*bPtr++); ++- } +++ lv_32fc_t* cPtr = cVector; +++ const lv_32fc_t* aPtr = aVector; +++ const lv_32fc_t* bPtr = bVector; +++ unsigned int number = 0; +++ +++ for (number = 0; number < num_points; number++) { +++ *cPtr++ = (*aPtr++) / (*bPtr++); +++ } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++ ++- ++ #endif /* INCLUDED_volk_32fc_x2_divide_32fc_u_H */ ++ ++ ++ #ifndef INCLUDED_volk_32fc_x2_divide_32fc_a_H ++ #define INCLUDED_volk_32fc_x2_divide_32fc_a_H ++ +++#include ++ #include ++ #include ++ #include ++-#include ++ ++ #ifdef LV_HAVE_SSE3 ++ #include ++ #include ++ ++-static inline void ++-volk_32fc_x2_divide_32fc_a_sse3(lv_32fc_t* cVector, const lv_32fc_t* numeratorVector, ++- const lv_32fc_t* denumeratorVector, unsigned int num_points) +++static inline void volk_32fc_x2_divide_32fc_a_sse3(lv_32fc_t* cVector, +++ const lv_32fc_t* numeratorVector, +++ const lv_32fc_t* denumeratorVector, +++ unsigned int num_points) ++ { ++ /* ++ * we'll do the "classical" ++@@ -224,45 +233,47 @@ volk_32fc_x2_divide_32fc_a_sse3(lv_32fc_t* cVector, const lv_32fc_t* numeratorVe ++ * --- = ------- ++ * b |b|^2 ++ * */ ++- unsigned int number = 0; ++- const unsigned int quarterPoints = num_points / 4; ++- ++- __m128 num01, num23, den01, den23, norm, result; ++- lv_32fc_t* c = cVector; ++- const lv_32fc_t* a = numeratorVector; ++- const lv_32fc_t* b = denumeratorVector; ++- ++- for(; number < quarterPoints; number++){ ++- num01 = _mm_load_ps((float*) a); // first pair ++- den01 = _mm_load_ps((float*) b); // first pair ++- num01 = _mm_complexconjugatemul_ps(num01, den01); // a conj(b) ++- a += 2; ++- b += 2; ++- ++- num23 = _mm_load_ps((float*) a); // second pair 
++- den23 = _mm_load_ps((float*) b); // second pair ++- num23 = _mm_complexconjugatemul_ps(num23, den23); // a conj(b) ++- a += 2; ++- b += 2; ++- ++- norm = _mm_magnitudesquared_ps_sse3(den01, den23); ++- ++- den01 = _mm_unpacklo_ps(norm,norm); // select the lower floats twice ++- den23 = _mm_unpackhi_ps(norm,norm); // select the upper floats twice ++- ++- result = _mm_div_ps(num01, den01); ++- _mm_store_ps((float*) c, result); // Store the results back into the C container ++- c += 2; ++- result = _mm_div_ps(num23, den23); ++- _mm_store_ps((float*) c, result); // Store the results back into the C container ++- c += 2; ++- } ++- ++- number *= 4; ++- for(;number < num_points; number++){ ++- *c = (*a) / (*b); ++- a++; b++; c++; ++- } +++ unsigned int number = 0; +++ const unsigned int quarterPoints = num_points / 4; +++ +++ __m128 num01, num23, den01, den23, norm, result; +++ lv_32fc_t* c = cVector; +++ const lv_32fc_t* a = numeratorVector; +++ const lv_32fc_t* b = denumeratorVector; +++ +++ for (; number < quarterPoints; number++) { +++ num01 = _mm_load_ps((float*)a); // first pair +++ den01 = _mm_load_ps((float*)b); // first pair +++ num01 = _mm_complexconjugatemul_ps(num01, den01); // a conj(b) +++ a += 2; +++ b += 2; +++ +++ num23 = _mm_load_ps((float*)a); // second pair +++ den23 = _mm_load_ps((float*)b); // second pair +++ num23 = _mm_complexconjugatemul_ps(num23, den23); // a conj(b) +++ a += 2; +++ b += 2; +++ +++ norm = _mm_magnitudesquared_ps_sse3(den01, den23); +++ +++ den01 = _mm_unpacklo_ps(norm, norm); // select the lower floats twice +++ den23 = _mm_unpackhi_ps(norm, norm); // select the upper floats twice +++ +++ result = _mm_div_ps(num01, den01); +++ _mm_store_ps((float*)c, result); // Store the results back into the C container +++ c += 2; +++ result = _mm_div_ps(num23, den23); +++ _mm_store_ps((float*)c, result); // Store the results back into the C container +++ c += 2; +++ } +++ +++ number *= 4; +++ for (; number < num_points; number++) { +++ *c = (*a) / (*b); +++ a++; +++ b++; +++ c++; +++ } ++ } ++ #endif /* LV_HAVE_SSE */ ++ ++@@ -270,9 +281,10 @@ volk_32fc_x2_divide_32fc_a_sse3(lv_32fc_t* cVector, const lv_32fc_t* numeratorVe ++ #include ++ #include ++ ++-static inline void ++-volk_32fc_x2_divide_32fc_a_avx(lv_32fc_t* cVector, const lv_32fc_t* numeratorVector, ++- const lv_32fc_t* denumeratorVector, unsigned int num_points) +++static inline void volk_32fc_x2_divide_32fc_a_avx(lv_32fc_t* cVector, +++ const lv_32fc_t* numeratorVector, +++ const lv_32fc_t* denumeratorVector, +++ unsigned int num_points) ++ { ++ /* ++ * we'll do the "classical" ++@@ -288,17 +300,21 @@ volk_32fc_x2_divide_32fc_a_avx(lv_32fc_t* cVector, const lv_32fc_t* numeratorVec ++ const lv_32fc_t* a = numeratorVector; ++ const lv_32fc_t* b = denumeratorVector; ++ ++- for(; number < quarterPoints; number++){ ++- num = _mm256_load_ps((float*) a); // Load the ar + ai, br + bi ... as ar,ai,br,bi ... ++- denum = _mm256_load_ps((float*) b); // Load the cr + ci, dr + di ... as cr,ci,dr,di ... +++ for (; number < quarterPoints; number++) { +++ num = +++ _mm256_load_ps((float*)a); // Load the ar + ai, br + bi ... as ar,ai,br,bi ... +++ denum = +++ _mm256_load_ps((float*)b); // Load the cr + ci, dr + di ... as cr,ci,dr,di ... 
++ mul_conj = _mm256_complexconjugatemul_ps(num, denum); ++ sq = _mm256_mul_ps(denum, denum); // Square the values ++- mag_sq_un = _mm256_hadd_ps(sq,sq); // obtain the actual squared magnitude, although out of order +++ mag_sq_un = _mm256_hadd_ps( +++ sq, sq); // obtain the actual squared magnitude, although out of order ++ mag_sq = _mm256_permute_ps(mag_sq_un, 0xd8); // I order them ++- // best guide I found on using these functions: https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=2738,2059,2738,2738,3875,3874,3875,2738,3870 ++- div = _mm256_div_ps(mul_conj,mag_sq); +++ // best guide I found on using these functions: +++ // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=2738,2059,2738,2738,3875,3874,3875,2738,3870 +++ div = _mm256_div_ps(mul_conj, mag_sq); ++ ++- _mm256_store_ps((float*) c, div); // Store the results back into the C container +++ _mm256_store_ps((float*)c, div); // Store the results back into the C container ++ ++ a += 4; ++ b += 4; ++@@ -307,78 +323,78 @@ volk_32fc_x2_divide_32fc_a_avx(lv_32fc_t* cVector, const lv_32fc_t* numeratorVec ++ ++ number = quarterPoints * 4; ++ ++- for(; number < num_points; number++){ +++ for (; number < num_points; number++) { ++ *c++ = (*a++) / (*b++); ++ } ++- ++- ++ } ++ #endif /* LV_HAVE_AVX */ ++ ++ #ifdef LV_HAVE_NEON ++ #include ++ ++-static inline void ++-volk_32fc_x2_divide_32fc_neon(lv_32fc_t* cVector, const lv_32fc_t* aVector, ++- const lv_32fc_t* bVector, unsigned int num_points) +++static inline void volk_32fc_x2_divide_32fc_neon(lv_32fc_t* cVector, +++ const lv_32fc_t* aVector, +++ const lv_32fc_t* bVector, +++ unsigned int num_points) ++ { ++- lv_32fc_t* cPtr = cVector; ++- const lv_32fc_t* aPtr = aVector; ++- const lv_32fc_t* bPtr = bVector; ++- ++- float32x4x2_t aVal, bVal, cVal; ++- float32x4_t bAbs, bAbsInv; ++- ++- const unsigned int quarterPoints = num_points / 4; ++- unsigned int number = 0; ++- for(; number < quarterPoints; number++){ ++- aVal = vld2q_f32((const float*)(aPtr)); ++- bVal = vld2q_f32((const float*)(bPtr)); ++- aPtr += 4; ++- bPtr += 4; ++- __VOLK_PREFETCH(aPtr+4); ++- __VOLK_PREFETCH(bPtr+4); ++- ++- bAbs = vmulq_f32( bVal.val[0], bVal.val[0]); ++- bAbs = vmlaq_f32(bAbs, bVal.val[1], bVal.val[1]); ++- ++- bAbsInv = vrecpeq_f32(bAbs); ++- bAbsInv = vmulq_f32(bAbsInv, vrecpsq_f32(bAbsInv, bAbs)); ++- bAbsInv = vmulq_f32(bAbsInv, vrecpsq_f32(bAbsInv, bAbs)); ++- ++- cVal.val[0] = vmulq_f32( aVal.val[0], bVal.val[0]); ++- cVal.val[0] = vmlaq_f32(cVal.val[0], aVal.val[1], bVal.val[1]); ++- cVal.val[0] = vmulq_f32(cVal.val[0], bAbsInv); ++- ++- cVal.val[1] = vmulq_f32( aVal.val[1], bVal.val[0]); ++- cVal.val[1] = vmlsq_f32(cVal.val[1], aVal.val[0], bVal.val[1]); ++- cVal.val[1] = vmulq_f32(cVal.val[1], bAbsInv); ++- ++- vst2q_f32((float*)(cPtr), cVal); ++- cPtr += 4; ++- } ++- ++- for(number = quarterPoints * 4; number < num_points; number++){ ++- *cPtr++ = (*aPtr++) / (*bPtr++); ++- } +++ lv_32fc_t* cPtr = cVector; +++ const lv_32fc_t* aPtr = aVector; +++ const lv_32fc_t* bPtr = bVector; +++ +++ float32x4x2_t aVal, bVal, cVal; +++ float32x4_t bAbs, bAbsInv; +++ +++ const unsigned int quarterPoints = num_points / 4; +++ unsigned int number = 0; +++ for (; number < quarterPoints; number++) { +++ aVal = vld2q_f32((const float*)(aPtr)); +++ bVal = vld2q_f32((const float*)(bPtr)); +++ aPtr += 4; +++ bPtr += 4; +++ __VOLK_PREFETCH(aPtr + 4); +++ __VOLK_PREFETCH(bPtr + 4); +++ +++ bAbs = vmulq_f32(bVal.val[0], bVal.val[0]); +++ bAbs = vmlaq_f32(bAbs, bVal.val[1], 
bVal.val[1]); +++ +++ bAbsInv = vrecpeq_f32(bAbs); +++ bAbsInv = vmulq_f32(bAbsInv, vrecpsq_f32(bAbsInv, bAbs)); +++ bAbsInv = vmulq_f32(bAbsInv, vrecpsq_f32(bAbsInv, bAbs)); +++ +++ cVal.val[0] = vmulq_f32(aVal.val[0], bVal.val[0]); +++ cVal.val[0] = vmlaq_f32(cVal.val[0], aVal.val[1], bVal.val[1]); +++ cVal.val[0] = vmulq_f32(cVal.val[0], bAbsInv); +++ +++ cVal.val[1] = vmulq_f32(aVal.val[1], bVal.val[0]); +++ cVal.val[1] = vmlsq_f32(cVal.val[1], aVal.val[0], bVal.val[1]); +++ cVal.val[1] = vmulq_f32(cVal.val[1], bAbsInv); +++ +++ vst2q_f32((float*)(cPtr), cVal); +++ cPtr += 4; +++ } +++ +++ for (number = quarterPoints * 4; number < num_points; number++) { +++ *cPtr++ = (*aPtr++) / (*bPtr++); +++ } ++ } ++ #endif /* LV_HAVE_NEON */ ++ ++ ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void ++-volk_32fc_x2_divide_32fc_a_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, ++- const lv_32fc_t* bVector, unsigned int num_points) +++static inline void volk_32fc_x2_divide_32fc_a_generic(lv_32fc_t* cVector, +++ const lv_32fc_t* aVector, +++ const lv_32fc_t* bVector, +++ unsigned int num_points) ++ { ++- lv_32fc_t* cPtr = cVector; ++- const lv_32fc_t* aPtr = aVector; ++- const lv_32fc_t* bPtr= bVector; ++- unsigned int number = 0; ++- ++- for(number = 0; number < num_points; number++){ ++- *cPtr++ = (*aPtr++) / (*bPtr++); ++- } +++ lv_32fc_t* cPtr = cVector; +++ const lv_32fc_t* aPtr = aVector; +++ const lv_32fc_t* bPtr = bVector; +++ unsigned int number = 0; +++ +++ for (number = 0; number < num_points; number++) { +++ *cPtr++ = (*aPtr++) / (*bPtr++); +++ } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++diff --git a/kernels/volk/volk_32fc_x2_dot_prod_32fc.h b/kernels/volk/volk_32fc_x2_dot_prod_32fc.h ++index f4a4469..b0b7fee 100644 ++--- a/kernels/volk/volk_32fc_x2_dot_prod_32fc.h +++++ b/kernels/volk/volk_32fc_x2_dot_prod_32fc.h ++@@ -33,8 +33,8 @@ ++ * ++ * Dispatcher Prototype ++ * \code ++- * void volk_32fc_x2_dot_prod_32fc(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) ++- * \endcode +++ * void volk_32fc_x2_dot_prod_32fc(lv_32fc_t* result, const lv_32fc_t* input, const +++ * lv_32fc_t* taps, unsigned int num_points) \endcode ++ * ++ * \b Inputs ++ * \li input: vector of complex floats. 
++@@ -58,236 +58,246 @@ ++ #ifndef INCLUDED_volk_32fc_x2_dot_prod_32fc_u_H ++ #define INCLUDED_volk_32fc_x2_dot_prod_32fc_u_H ++ ++-#include ++-#include ++ #include ++ #include +++#include +++#include ++ ++ ++ #ifdef LV_HAVE_GENERIC ++ ++ ++-static inline void volk_32fc_x2_dot_prod_32fc_generic(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) { +++static inline void volk_32fc_x2_dot_prod_32fc_generic(lv_32fc_t* result, +++ const lv_32fc_t* input, +++ const lv_32fc_t* taps, +++ unsigned int num_points) +++{ ++ ++- float * res = (float*) result; ++- float * in = (float*) input; ++- float * tp = (float*) taps; ++- unsigned int n_2_ccomplex_blocks = num_points/2; +++ float* res = (float*)result; +++ float* in = (float*)input; +++ float* tp = (float*)taps; +++ unsigned int n_2_ccomplex_blocks = num_points / 2; ++ ++- float sum0[2] = {0,0}; ++- float sum1[2] = {0,0}; ++- unsigned int i = 0; +++ float sum0[2] = { 0, 0 }; +++ float sum1[2] = { 0, 0 }; +++ unsigned int i = 0; ++ ++- for(i = 0; i < n_2_ccomplex_blocks; ++i) { ++- sum0[0] += in[0] * tp[0] - in[1] * tp[1]; ++- sum0[1] += in[0] * tp[1] + in[1] * tp[0]; ++- sum1[0] += in[2] * tp[2] - in[3] * tp[3]; ++- sum1[1] += in[2] * tp[3] + in[3] * tp[2]; +++ for (i = 0; i < n_2_ccomplex_blocks; ++i) { +++ sum0[0] += in[0] * tp[0] - in[1] * tp[1]; +++ sum0[1] += in[0] * tp[1] + in[1] * tp[0]; +++ sum1[0] += in[2] * tp[2] - in[3] * tp[3]; +++ sum1[1] += in[2] * tp[3] + in[3] * tp[2]; ++ ++- in += 4; ++- tp += 4; ++- } +++ in += 4; +++ tp += 4; +++ } ++ ++- res[0] = sum0[0] + sum1[0]; ++- res[1] = sum0[1] + sum1[1]; +++ res[0] = sum0[0] + sum1[0]; +++ res[1] = sum0[1] + sum1[1]; ++ ++- // Cleanup if we had an odd number of points ++- if (num_points & 1) { ++- *result += input[num_points - 1] * taps[num_points - 1]; ++- } +++ // Cleanup if we had an odd number of points +++ if (num_points & 1) { +++ *result += input[num_points - 1] * taps[num_points - 1]; +++ } ++ } ++ ++ #endif /*LV_HAVE_GENERIC*/ ++ ++ ++- ++ #if LV_HAVE_SSE && LV_HAVE_64 ++ ++-static inline void volk_32fc_x2_dot_prod_32fc_u_sse_64(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) { ++- ++- const unsigned int num_bytes = num_points*8; ++- unsigned int isodd = num_points & 1; ++- ++- __VOLK_ASM ++- ( ++- "# ccomplex_dotprod_generic (float* result, const float *input,\n\t" ++- "# const float *taps, unsigned num_bytes)\n\t" ++- "# float sum0 = 0;\n\t" ++- "# float sum1 = 0;\n\t" ++- "# float sum2 = 0;\n\t" ++- "# float sum3 = 0;\n\t" ++- "# do {\n\t" ++- "# sum0 += input[0] * taps[0] - input[1] * taps[1];\n\t" ++- "# sum1 += input[0] * taps[1] + input[1] * taps[0];\n\t" ++- "# sum2 += input[2] * taps[2] - input[3] * taps[3];\n\t" ++- "# sum3 += input[2] * taps[3] + input[3] * taps[2];\n\t" ++- "# input += 4;\n\t" ++- "# taps += 4; \n\t" ++- "# } while (--n_2_ccomplex_blocks != 0);\n\t" ++- "# result[0] = sum0 + sum2;\n\t" ++- "# result[1] = sum1 + sum3;\n\t" ++- "# TODO: prefetch and better scheduling\n\t" ++- " xor %%r9, %%r9\n\t" ++- " xor %%r10, %%r10\n\t" ++- " movq %%rcx, %%rax\n\t" ++- " movq %%rcx, %%r8\n\t" ++- " movq %[rsi], %%r9\n\t" ++- " movq %[rdx], %%r10\n\t" ++- " xorps %%xmm6, %%xmm6 # zero accumulators\n\t" ++- " movups 0(%%r9), %%xmm0\n\t" ++- " xorps %%xmm7, %%xmm7 # zero accumulators\n\t" ++- " movups 0(%%r10), %%xmm2\n\t" ++- " shr $5, %%rax # rax = n_2_ccomplex_blocks / 2\n\t" ++- " shr $4, %%r8\n\t" ++- " jmp .%=L1_test\n\t" ++- " # 4 taps / loop\n\t" ++- " # something like ?? 
cycles / loop\n\t" ++- ".%=Loop1: \n\t" ++- "# complex prod: C += A * B, w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t" ++- "# movups (%%r9), %%xmmA\n\t" ++- "# movups (%%r10), %%xmmB\n\t" ++- "# movups %%xmmA, %%xmmZ\n\t" ++- "# shufps $0xb1, %%xmmZ, %%xmmZ # swap internals\n\t" ++- "# mulps %%xmmB, %%xmmA\n\t" ++- "# mulps %%xmmZ, %%xmmB\n\t" ++- "# # SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t" ++- "# xorps %%xmmPN, %%xmmA\n\t" ++- "# movups %%xmmA, %%xmmZ\n\t" ++- "# unpcklps %%xmmB, %%xmmA\n\t" ++- "# unpckhps %%xmmB, %%xmmZ\n\t" ++- "# movups %%xmmZ, %%xmmY\n\t" ++- "# shufps $0x44, %%xmmA, %%xmmZ # b01000100\n\t" ++- "# shufps $0xee, %%xmmY, %%xmmA # b11101110\n\t" ++- "# addps %%xmmZ, %%xmmA\n\t" ++- "# addps %%xmmA, %%xmmC\n\t" ++- "# A=xmm0, B=xmm2, Z=xmm4\n\t" ++- "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t" ++- " movups 16(%%r9), %%xmm1\n\t" ++- " movups %%xmm0, %%xmm4\n\t" ++- " mulps %%xmm2, %%xmm0\n\t" ++- " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" ++- " movups 16(%%r10), %%xmm3\n\t" ++- " movups %%xmm1, %%xmm5\n\t" ++- " addps %%xmm0, %%xmm6\n\t" ++- " mulps %%xmm3, %%xmm1\n\t" ++- " shufps $0xb1, %%xmm5, %%xmm5 # swap internals\n\t" ++- " addps %%xmm1, %%xmm6\n\t" ++- " mulps %%xmm4, %%xmm2\n\t" ++- " movups 32(%%r9), %%xmm0\n\t" ++- " addps %%xmm2, %%xmm7\n\t" ++- " mulps %%xmm5, %%xmm3\n\t" ++- " add $32, %%r9\n\t" ++- " movups 32(%%r10), %%xmm2\n\t" ++- " addps %%xmm3, %%xmm7\n\t" ++- " add $32, %%r10\n\t" ++- ".%=L1_test:\n\t" ++- " dec %%rax\n\t" ++- " jge .%=Loop1\n\t" ++- " # We've handled the bulk of multiplies up to here.\n\t" ++- " # Let's sse if original n_2_ccomplex_blocks was odd.\n\t" ++- " # If so, we've got 2 more taps to do.\n\t" ++- " and $1, %%r8\n\t" ++- " je .%=Leven\n\t" ++- " # The count was odd, do 2 more taps.\n\t" ++- " # Note that we've already got mm0/mm2 preloaded\n\t" ++- " # from the main loop.\n\t" ++- " movups %%xmm0, %%xmm4\n\t" ++- " mulps %%xmm2, %%xmm0\n\t" ++- " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" ++- " addps %%xmm0, %%xmm6\n\t" ++- " mulps %%xmm4, %%xmm2\n\t" ++- " addps %%xmm2, %%xmm7\n\t" ++- ".%=Leven:\n\t" ++- " # neg inversor\n\t" ++- " xorps %%xmm1, %%xmm1\n\t" ++- " mov $0x80000000, %%r9\n\t" ++- " movd %%r9, %%xmm1\n\t" ++- " shufps $0x11, %%xmm1, %%xmm1 # b00010001 # 0 -0 0 -0\n\t" ++- " # pfpnacc\n\t" ++- " xorps %%xmm1, %%xmm6\n\t" ++- " movups %%xmm6, %%xmm2\n\t" ++- " unpcklps %%xmm7, %%xmm6\n\t" ++- " unpckhps %%xmm7, %%xmm2\n\t" ++- " movups %%xmm2, %%xmm3\n\t" ++- " shufps $0x44, %%xmm6, %%xmm2 # b01000100\n\t" ++- " shufps $0xee, %%xmm3, %%xmm6 # b11101110\n\t" ++- " addps %%xmm2, %%xmm6\n\t" ++- " # xmm6 = r1 i2 r3 i4\n\t" ++- " movhlps %%xmm6, %%xmm4 # xmm4 = r3 i4 ?? ??\n\t" ++- " addps %%xmm4, %%xmm6 # xmm6 = r1+r3 i2+i4 ?? 
??\n\t" ++- " movlps %%xmm6, (%[rdi]) # store low 2x32 bits (complex) to memory\n\t" ++- : ++- :[rsi] "r" (input), [rdx] "r" (taps), "c" (num_bytes), [rdi] "r" (result) ++- :"rax", "r8", "r9", "r10" ++- ); ++- ++- ++- if(isodd) { ++- *result += input[num_points - 1] * taps[num_points - 1]; ++- } ++- ++- return; +++static inline void volk_32fc_x2_dot_prod_32fc_u_sse_64(lv_32fc_t* result, +++ const lv_32fc_t* input, +++ const lv_32fc_t* taps, +++ unsigned int num_points) +++{ +++ +++ const unsigned int num_bytes = num_points * 8; +++ unsigned int isodd = num_points & 1; +++ +++ __VOLK_ASM( +++ "# ccomplex_dotprod_generic (float* result, const float *input,\n\t" +++ "# const float *taps, unsigned num_bytes)\n\t" +++ "# float sum0 = 0;\n\t" +++ "# float sum1 = 0;\n\t" +++ "# float sum2 = 0;\n\t" +++ "# float sum3 = 0;\n\t" +++ "# do {\n\t" +++ "# sum0 += input[0] * taps[0] - input[1] * taps[1];\n\t" +++ "# sum1 += input[0] * taps[1] + input[1] * taps[0];\n\t" +++ "# sum2 += input[2] * taps[2] - input[3] * taps[3];\n\t" +++ "# sum3 += input[2] * taps[3] + input[3] * taps[2];\n\t" +++ "# input += 4;\n\t" +++ "# taps += 4; \n\t" +++ "# } while (--n_2_ccomplex_blocks != 0);\n\t" +++ "# result[0] = sum0 + sum2;\n\t" +++ "# result[1] = sum1 + sum3;\n\t" +++ "# TODO: prefetch and better scheduling\n\t" +++ " xor %%r9, %%r9\n\t" +++ " xor %%r10, %%r10\n\t" +++ " movq %%rcx, %%rax\n\t" +++ " movq %%rcx, %%r8\n\t" +++ " movq %[rsi], %%r9\n\t" +++ " movq %[rdx], %%r10\n\t" +++ " xorps %%xmm6, %%xmm6 # zero accumulators\n\t" +++ " movups 0(%%r9), %%xmm0\n\t" +++ " xorps %%xmm7, %%xmm7 # zero accumulators\n\t" +++ " movups 0(%%r10), %%xmm2\n\t" +++ " shr $5, %%rax # rax = n_2_ccomplex_blocks / 2\n\t" +++ " shr $4, %%r8\n\t" +++ " jmp .%=L1_test\n\t" +++ " # 4 taps / loop\n\t" +++ " # something like ?? 
cycles / loop\n\t" +++ ".%=Loop1: \n\t" +++ "# complex prod: C += A * B, w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t" +++ "# movups (%%r9), %%xmmA\n\t" +++ "# movups (%%r10), %%xmmB\n\t" +++ "# movups %%xmmA, %%xmmZ\n\t" +++ "# shufps $0xb1, %%xmmZ, %%xmmZ # swap internals\n\t" +++ "# mulps %%xmmB, %%xmmA\n\t" +++ "# mulps %%xmmZ, %%xmmB\n\t" +++ "# # SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t" +++ "# xorps %%xmmPN, %%xmmA\n\t" +++ "# movups %%xmmA, %%xmmZ\n\t" +++ "# unpcklps %%xmmB, %%xmmA\n\t" +++ "# unpckhps %%xmmB, %%xmmZ\n\t" +++ "# movups %%xmmZ, %%xmmY\n\t" +++ "# shufps $0x44, %%xmmA, %%xmmZ # b01000100\n\t" +++ "# shufps $0xee, %%xmmY, %%xmmA # b11101110\n\t" +++ "# addps %%xmmZ, %%xmmA\n\t" +++ "# addps %%xmmA, %%xmmC\n\t" +++ "# A=xmm0, B=xmm2, Z=xmm4\n\t" +++ "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t" +++ " movups 16(%%r9), %%xmm1\n\t" +++ " movups %%xmm0, %%xmm4\n\t" +++ " mulps %%xmm2, %%xmm0\n\t" +++ " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" +++ " movups 16(%%r10), %%xmm3\n\t" +++ " movups %%xmm1, %%xmm5\n\t" +++ " addps %%xmm0, %%xmm6\n\t" +++ " mulps %%xmm3, %%xmm1\n\t" +++ " shufps $0xb1, %%xmm5, %%xmm5 # swap internals\n\t" +++ " addps %%xmm1, %%xmm6\n\t" +++ " mulps %%xmm4, %%xmm2\n\t" +++ " movups 32(%%r9), %%xmm0\n\t" +++ " addps %%xmm2, %%xmm7\n\t" +++ " mulps %%xmm5, %%xmm3\n\t" +++ " add $32, %%r9\n\t" +++ " movups 32(%%r10), %%xmm2\n\t" +++ " addps %%xmm3, %%xmm7\n\t" +++ " add $32, %%r10\n\t" +++ ".%=L1_test:\n\t" +++ " dec %%rax\n\t" +++ " jge .%=Loop1\n\t" +++ " # We've handled the bulk of multiplies up to here.\n\t" +++ " # Let's sse if original n_2_ccomplex_blocks was odd.\n\t" +++ " # If so, we've got 2 more taps to do.\n\t" +++ " and $1, %%r8\n\t" +++ " je .%=Leven\n\t" +++ " # The count was odd, do 2 more taps.\n\t" +++ " # Note that we've already got mm0/mm2 preloaded\n\t" +++ " # from the main loop.\n\t" +++ " movups %%xmm0, %%xmm4\n\t" +++ " mulps %%xmm2, %%xmm0\n\t" +++ " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" +++ " addps %%xmm0, %%xmm6\n\t" +++ " mulps %%xmm4, %%xmm2\n\t" +++ " addps %%xmm2, %%xmm7\n\t" +++ ".%=Leven:\n\t" +++ " # neg inversor\n\t" +++ " xorps %%xmm1, %%xmm1\n\t" +++ " mov $0x80000000, %%r9\n\t" +++ " movd %%r9, %%xmm1\n\t" +++ " shufps $0x11, %%xmm1, %%xmm1 # b00010001 # 0 -0 0 -0\n\t" +++ " # pfpnacc\n\t" +++ " xorps %%xmm1, %%xmm6\n\t" +++ " movups %%xmm6, %%xmm2\n\t" +++ " unpcklps %%xmm7, %%xmm6\n\t" +++ " unpckhps %%xmm7, %%xmm2\n\t" +++ " movups %%xmm2, %%xmm3\n\t" +++ " shufps $0x44, %%xmm6, %%xmm2 # b01000100\n\t" +++ " shufps $0xee, %%xmm3, %%xmm6 # b11101110\n\t" +++ " addps %%xmm2, %%xmm6\n\t" +++ " # xmm6 = r1 i2 r3 i4\n\t" +++ " movhlps %%xmm6, %%xmm4 # xmm4 = r3 i4 ?? ??\n\t" +++ " addps %%xmm4, %%xmm6 # xmm6 = r1+r3 i2+i4 ?? 
??\n\t" +++ " movlps %%xmm6, (%[rdi]) # store low 2x32 bits (complex) " +++ "to memory\n\t" +++ : +++ : [rsi] "r"(input), [rdx] "r"(taps), "c"(num_bytes), [rdi] "r"(result) +++ : "rax", "r8", "r9", "r10"); +++ +++ +++ if (isodd) { +++ *result += input[num_points - 1] * taps[num_points - 1]; +++ } ++ +++ return; ++ } ++ ++ #endif /* LV_HAVE_SSE && LV_HAVE_64 */ ++ ++ ++- ++- ++ #ifdef LV_HAVE_SSE3 ++ ++ #include ++ ++-static inline void volk_32fc_x2_dot_prod_32fc_u_sse3(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) { +++static inline void volk_32fc_x2_dot_prod_32fc_u_sse3(lv_32fc_t* result, +++ const lv_32fc_t* input, +++ const lv_32fc_t* taps, +++ unsigned int num_points) +++{ ++ ++- lv_32fc_t dotProduct; ++- memset(&dotProduct, 0x0, 2*sizeof(float)); +++ lv_32fc_t dotProduct; +++ memset(&dotProduct, 0x0, 2 * sizeof(float)); ++ ++- unsigned int number = 0; ++- const unsigned int halfPoints = num_points/2; ++- unsigned int isodd = num_points & 1; +++ unsigned int number = 0; +++ const unsigned int halfPoints = num_points / 2; +++ unsigned int isodd = num_points & 1; ++ ++- __m128 x, y, yl, yh, z, tmp1, tmp2, dotProdVal; +++ __m128 x, y, yl, yh, z, tmp1, tmp2, dotProdVal; ++ ++- const lv_32fc_t* a = input; ++- const lv_32fc_t* b = taps; +++ const lv_32fc_t* a = input; +++ const lv_32fc_t* b = taps; ++ ++- dotProdVal = _mm_setzero_ps(); +++ dotProdVal = _mm_setzero_ps(); ++ ++- for(;number < halfPoints; number++){ +++ for (; number < halfPoints; number++) { ++ ++- x = _mm_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi ++- y = _mm_loadu_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di +++ x = _mm_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi +++ y = _mm_loadu_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di ++ ++- yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr ++- yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di +++ yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr +++ yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di ++ ++- tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr +++ tmp1 = _mm_mul_ps(x, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr ++ ++- x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br +++ x = _mm_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br ++ ++- tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di +++ tmp2 = _mm_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di ++ ++- z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di +++ z = _mm_addsub_ps(tmp1, +++ tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di ++ ++- dotProdVal = _mm_add_ps(dotProdVal, z); // Add the complex multiplication results together +++ dotProdVal = +++ _mm_add_ps(dotProdVal, z); // Add the complex multiplication results together ++ ++- a += 2; ++- b += 2; ++- } +++ a += 2; +++ b += 2; +++ } ++ ++- __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector[2]; +++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector[2]; ++ ++- _mm_storeu_ps((float*)dotProductVector,dotProdVal); // Store the results back into the dot product vector +++ _mm_storeu_ps((float*)dotProductVector, +++ dotProdVal); // Store the results back into the dot product vector ++ ++- dotProduct += ( dotProductVector[0] + dotProductVector[1] ); +++ dotProduct += (dotProductVector[0] + dotProductVector[1]); ++ ++- if(isodd) { ++- dotProduct += input[num_points - 1] * taps[num_points - 1]; ++- } +++ if (isodd) { +++ dotProduct += input[num_points - 1] * taps[num_points 
- 1]; +++ } ++ ++- *result = dotProduct; +++ *result = dotProduct; ++ } ++ ++ #endif /*LV_HAVE_SSE3*/ ++@@ -296,78 +306,82 @@ static inline void volk_32fc_x2_dot_prod_32fc_u_sse3(lv_32fc_t* result, const lv ++ ++ #include ++ ++-static inline void volk_32fc_x2_dot_prod_32fc_u_sse4_1(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) { +++static inline void volk_32fc_x2_dot_prod_32fc_u_sse4_1(lv_32fc_t* result, +++ const lv_32fc_t* input, +++ const lv_32fc_t* taps, +++ unsigned int num_points) +++{ ++ ++- unsigned int i = 0; ++- const unsigned int qtr_points = num_points/4; ++- const unsigned int isodd = num_points & 3; +++ unsigned int i = 0; +++ const unsigned int qtr_points = num_points / 4; +++ const unsigned int isodd = num_points & 3; ++ ++- __m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, real0, real1, im0, im1; ++- float *p_input, *p_taps; ++- __m64 *p_result; +++ __m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, real0, real1, im0, im1; +++ float *p_input, *p_taps; +++ __m64* p_result; ++ ++- p_result = (__m64*)result; ++- p_input = (float*)input; ++- p_taps = (float*)taps; +++ p_result = (__m64*)result; +++ p_input = (float*)input; +++ p_taps = (float*)taps; ++ ++- static const __m128i neg = {0x000000000000000080000000}; +++ static const __m128i neg = { 0x000000000000000080000000 }; ++ ++- real0 = _mm_setzero_ps(); ++- real1 = _mm_setzero_ps(); ++- im0 = _mm_setzero_ps(); ++- im1 = _mm_setzero_ps(); +++ real0 = _mm_setzero_ps(); +++ real1 = _mm_setzero_ps(); +++ im0 = _mm_setzero_ps(); +++ im1 = _mm_setzero_ps(); ++ ++- for(; i < qtr_points; ++i) { ++- xmm0 = _mm_loadu_ps(p_input); ++- xmm1 = _mm_loadu_ps(p_taps); +++ for (; i < qtr_points; ++i) { +++ xmm0 = _mm_loadu_ps(p_input); +++ xmm1 = _mm_loadu_ps(p_taps); ++ ++- p_input += 4; ++- p_taps += 4; +++ p_input += 4; +++ p_taps += 4; ++ ++- xmm2 = _mm_loadu_ps(p_input); ++- xmm3 = _mm_loadu_ps(p_taps); +++ xmm2 = _mm_loadu_ps(p_input); +++ xmm3 = _mm_loadu_ps(p_taps); ++ ++- p_input += 4; ++- p_taps += 4; +++ p_input += 4; +++ p_taps += 4; ++ ++- xmm4 = _mm_unpackhi_ps(xmm0, xmm2); ++- xmm5 = _mm_unpackhi_ps(xmm1, xmm3); ++- xmm0 = _mm_unpacklo_ps(xmm0, xmm2); ++- xmm2 = _mm_unpacklo_ps(xmm1, xmm3); +++ xmm4 = _mm_unpackhi_ps(xmm0, xmm2); +++ xmm5 = _mm_unpackhi_ps(xmm1, xmm3); +++ xmm0 = _mm_unpacklo_ps(xmm0, xmm2); +++ xmm2 = _mm_unpacklo_ps(xmm1, xmm3); ++ ++- //imaginary vector from input ++- xmm1 = _mm_unpackhi_ps(xmm0, xmm4); ++- //real vector from input ++- xmm3 = _mm_unpacklo_ps(xmm0, xmm4); ++- //imaginary vector from taps ++- xmm0 = _mm_unpackhi_ps(xmm2, xmm5); ++- //real vector from taps ++- xmm2 = _mm_unpacklo_ps(xmm2, xmm5); +++ // imaginary vector from input +++ xmm1 = _mm_unpackhi_ps(xmm0, xmm4); +++ // real vector from input +++ xmm3 = _mm_unpacklo_ps(xmm0, xmm4); +++ // imaginary vector from taps +++ xmm0 = _mm_unpackhi_ps(xmm2, xmm5); +++ // real vector from taps +++ xmm2 = _mm_unpacklo_ps(xmm2, xmm5); ++ ++- xmm4 = _mm_dp_ps(xmm3, xmm2, 0xf1); ++- xmm5 = _mm_dp_ps(xmm1, xmm0, 0xf1); +++ xmm4 = _mm_dp_ps(xmm3, xmm2, 0xf1); +++ xmm5 = _mm_dp_ps(xmm1, xmm0, 0xf1); ++ ++- xmm6 = _mm_dp_ps(xmm3, xmm0, 0xf2); ++- xmm7 = _mm_dp_ps(xmm1, xmm2, 0xf2); +++ xmm6 = _mm_dp_ps(xmm3, xmm0, 0xf2); +++ xmm7 = _mm_dp_ps(xmm1, xmm2, 0xf2); ++ ++- real0 = _mm_add_ps(xmm4, real0); ++- real1 = _mm_add_ps(xmm5, real1); ++- im0 = _mm_add_ps(xmm6, im0); ++- im1 = _mm_add_ps(xmm7, im1); ++- } +++ real0 = _mm_add_ps(xmm4, real0); +++ real1 = _mm_add_ps(xmm5, real1); +++ im0 = 
_mm_add_ps(xmm6, im0); +++ im1 = _mm_add_ps(xmm7, im1); +++ } ++ ++- real1 = _mm_xor_ps(real1, bit128_p(&neg)->float_vec); +++ real1 = _mm_xor_ps(real1, bit128_p(&neg)->float_vec); ++ ++- im0 = _mm_add_ps(im0, im1); ++- real0 = _mm_add_ps(real0, real1); +++ im0 = _mm_add_ps(im0, im1); +++ real0 = _mm_add_ps(real0, real1); ++ ++- im0 = _mm_add_ps(im0, real0); +++ im0 = _mm_add_ps(im0, real0); ++ ++- _mm_storel_pi(p_result, im0); +++ _mm_storel_pi(p_result, im0); ++ ++- for(i = num_points-isodd; i < num_points; i++) { ++- *result += input[i] * taps[i]; ++- } +++ for (i = num_points - isodd; i < num_points; i++) { +++ *result += input[i] * taps[i]; +++ } ++ } ++ ++ #endif /*LV_HAVE_SSE4_1*/ ++@@ -376,55 +390,63 @@ static inline void volk_32fc_x2_dot_prod_32fc_u_sse4_1(lv_32fc_t* result, const ++ ++ #include ++ ++-static inline void volk_32fc_x2_dot_prod_32fc_u_avx(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) { +++static inline void volk_32fc_x2_dot_prod_32fc_u_avx(lv_32fc_t* result, +++ const lv_32fc_t* input, +++ const lv_32fc_t* taps, +++ unsigned int num_points) +++{ ++ ++- unsigned int isodd = num_points & 3; ++- unsigned int i = 0; ++- lv_32fc_t dotProduct; ++- memset(&dotProduct, 0x0, 2*sizeof(float)); +++ unsigned int isodd = num_points & 3; +++ unsigned int i = 0; +++ lv_32fc_t dotProduct; +++ memset(&dotProduct, 0x0, 2 * sizeof(float)); ++ ++- unsigned int number = 0; ++- const unsigned int quarterPoints = num_points / 4; +++ unsigned int number = 0; +++ const unsigned int quarterPoints = num_points / 4; ++ ++- __m256 x, y, yl, yh, z, tmp1, tmp2, dotProdVal; +++ __m256 x, y, yl, yh, z, tmp1, tmp2, dotProdVal; ++ ++- const lv_32fc_t* a = input; ++- const lv_32fc_t* b = taps; +++ const lv_32fc_t* a = input; +++ const lv_32fc_t* b = taps; ++ ++- dotProdVal = _mm256_setzero_ps(); +++ dotProdVal = _mm256_setzero_ps(); ++ ++- for(;number < quarterPoints; number++){ ++- x = _mm256_loadu_ps((float*)a); // Load a,b,e,f as ar,ai,br,bi,er,ei,fr,fi ++- y = _mm256_loadu_ps((float*)b); // Load c,d,g,h as cr,ci,dr,di,gr,gi,hr,hi +++ for (; number < quarterPoints; number++) { +++ x = _mm256_loadu_ps((float*)a); // Load a,b,e,f as ar,ai,br,bi,er,ei,fr,fi +++ y = _mm256_loadu_ps((float*)b); // Load c,d,g,h as cr,ci,dr,di,gr,gi,hr,hi ++ ++- yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr,gr,gr,hr,hr ++- yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di,gi,gi,hi,hi +++ yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr,gr,gr,hr,hr +++ yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di,gi,gi,hi,hi ++ ++- tmp1 = _mm256_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr ... +++ tmp1 = _mm256_mul_ps(x, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr ... ++ ++- x = _mm256_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br,ei,er,fi,fr +++ x = _mm256_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br,ei,er,fi,fr ++ ++- tmp2 = _mm256_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di ... +++ tmp2 = _mm256_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di ... 
++ ++- z = _mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di +++ z = _mm256_addsub_ps(tmp1, +++ tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di ++ ++- dotProdVal = _mm256_add_ps(dotProdVal, z); // Add the complex multiplication results together +++ dotProdVal = _mm256_add_ps(dotProdVal, +++ z); // Add the complex multiplication results together ++ ++- a += 4; ++- b += 4; ++- } +++ a += 4; +++ b += 4; +++ } ++ ++- __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector[4]; +++ __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector[4]; ++ ++- _mm256_storeu_ps((float*)dotProductVector,dotProdVal); // Store the results back into the dot product vector +++ _mm256_storeu_ps((float*)dotProductVector, +++ dotProdVal); // Store the results back into the dot product vector ++ ++- dotProduct += ( dotProductVector[0] + dotProductVector[1] + dotProductVector[2] + dotProductVector[3]); +++ dotProduct += (dotProductVector[0] + dotProductVector[1] + dotProductVector[2] + +++ dotProductVector[3]); ++ ++- for(i = num_points-isodd; i < num_points; i++) { ++- dotProduct += input[i] * taps[i]; ++- } +++ for (i = num_points - isodd; i < num_points; i++) { +++ dotProduct += input[i] * taps[i]; +++ } ++ ++- *result = dotProduct; +++ *result = dotProduct; ++ } ++ ++ #endif /*LV_HAVE_AVX*/ ++@@ -432,56 +454,64 @@ static inline void volk_32fc_x2_dot_prod_32fc_u_avx(lv_32fc_t* result, const lv_ ++ #if LV_HAVE_AVX && LV_HAVE_FMA ++ #include ++ ++-static inline void volk_32fc_x2_dot_prod_32fc_u_avx_fma(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) { +++static inline void volk_32fc_x2_dot_prod_32fc_u_avx_fma(lv_32fc_t* result, +++ const lv_32fc_t* input, +++ const lv_32fc_t* taps, +++ unsigned int num_points) +++{ ++ ++- unsigned int isodd = num_points & 3; ++- unsigned int i = 0; ++- lv_32fc_t dotProduct; ++- memset(&dotProduct, 0x0, 2*sizeof(float)); +++ unsigned int isodd = num_points & 3; +++ unsigned int i = 0; +++ lv_32fc_t dotProduct; +++ memset(&dotProduct, 0x0, 2 * sizeof(float)); ++ ++- unsigned int number = 0; ++- const unsigned int quarterPoints = num_points / 4; +++ unsigned int number = 0; +++ const unsigned int quarterPoints = num_points / 4; ++ ++- __m256 x, y, yl, yh, z, tmp1, tmp2, dotProdVal; +++ __m256 x, y, yl, yh, z, tmp1, tmp2, dotProdVal; ++ ++- const lv_32fc_t* a = input; ++- const lv_32fc_t* b = taps; +++ const lv_32fc_t* a = input; +++ const lv_32fc_t* b = taps; ++ ++- dotProdVal = _mm256_setzero_ps(); +++ dotProdVal = _mm256_setzero_ps(); ++ ++- for(;number < quarterPoints; number++){ +++ for (; number < quarterPoints; number++) { ++ ++- x = _mm256_loadu_ps((float*)a); // Load a,b,e,f as ar,ai,br,bi,er,ei,fr,fi ++- y = _mm256_loadu_ps((float*)b); // Load c,d,g,h as cr,ci,dr,di,gr,gi,hr,hi +++ x = _mm256_loadu_ps((float*)a); // Load a,b,e,f as ar,ai,br,bi,er,ei,fr,fi +++ y = _mm256_loadu_ps((float*)b); // Load c,d,g,h as cr,ci,dr,di,gr,gi,hr,hi ++ ++- yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr,gr,gr,hr,hr ++- yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di,gi,gi,hi,hi +++ yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr,gr,gr,hr,hr +++ yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di,gi,gi,hi,hi ++ ++- tmp1 = x; +++ tmp1 = x; ++ ++- x = _mm256_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br,ei,er,fi,fr +++ x = _mm256_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br,ei,er,fi,fr ++ ++- tmp2 = _mm256_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di ... 
+++ tmp2 = _mm256_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di ... ++ ++- z = _mm256_fmaddsub_ps(tmp1, yl,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di +++ z = _mm256_fmaddsub_ps( +++ tmp1, yl, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di ++ ++- dotProdVal = _mm256_add_ps(dotProdVal, z); // Add the complex multiplication results together +++ dotProdVal = _mm256_add_ps(dotProdVal, +++ z); // Add the complex multiplication results together ++ ++- a += 4; ++- b += 4; ++- } +++ a += 4; +++ b += 4; +++ } ++ ++- __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector[4]; +++ __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector[4]; ++ ++- _mm256_storeu_ps((float*)dotProductVector,dotProdVal); // Store the results back into the dot product vector +++ _mm256_storeu_ps((float*)dotProductVector, +++ dotProdVal); // Store the results back into the dot product vector ++ ++- dotProduct += ( dotProductVector[0] + dotProductVector[1] + dotProductVector[2] + dotProductVector[3]); +++ dotProduct += (dotProductVector[0] + dotProductVector[1] + dotProductVector[2] + +++ dotProductVector[3]); ++ ++- for(i = num_points-isodd; i < num_points; i++) { ++- dotProduct += input[i] * taps[i]; ++- } +++ for (i = num_points - isodd; i < num_points; i++) { +++ dotProduct += input[i] * taps[i]; +++ } ++ ++- *result = dotProduct; +++ *result = dotProduct; ++ } ++ ++ #endif /*LV_HAVE_AVX && LV_HAVE_FMA*/ ++@@ -491,44 +521,48 @@ static inline void volk_32fc_x2_dot_prod_32fc_u_avx_fma(lv_32fc_t* result, const ++ #ifndef INCLUDED_volk_32fc_x2_dot_prod_32fc_a_H ++ #define INCLUDED_volk_32fc_x2_dot_prod_32fc_a_H ++ ++-#include ++-#include ++ #include ++ #include +++#include +++#include ++ ++ ++ #ifdef LV_HAVE_GENERIC ++ ++ ++-static inline void volk_32fc_x2_dot_prod_32fc_a_generic(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) { +++static inline void volk_32fc_x2_dot_prod_32fc_a_generic(lv_32fc_t* result, +++ const lv_32fc_t* input, +++ const lv_32fc_t* taps, +++ unsigned int num_points) +++{ ++ ++- const unsigned int num_bytes = num_points*8; +++ const unsigned int num_bytes = num_points * 8; ++ ++- float * res = (float*) result; ++- float * in = (float*) input; ++- float * tp = (float*) taps; ++- unsigned int n_2_ccomplex_blocks = num_bytes >> 4; +++ float* res = (float*)result; +++ float* in = (float*)input; +++ float* tp = (float*)taps; +++ unsigned int n_2_ccomplex_blocks = num_bytes >> 4; ++ ++- float sum0[2] = {0,0}; ++- float sum1[2] = {0,0}; ++- unsigned int i = 0; +++ float sum0[2] = { 0, 0 }; +++ float sum1[2] = { 0, 0 }; +++ unsigned int i = 0; ++ ++- for(i = 0; i < n_2_ccomplex_blocks; ++i) { ++- sum0[0] += in[0] * tp[0] - in[1] * tp[1]; ++- sum0[1] += in[0] * tp[1] + in[1] * tp[0]; ++- sum1[0] += in[2] * tp[2] - in[3] * tp[3]; ++- sum1[1] += in[2] * tp[3] + in[3] * tp[2]; +++ for (i = 0; i < n_2_ccomplex_blocks; ++i) { +++ sum0[0] += in[0] * tp[0] - in[1] * tp[1]; +++ sum0[1] += in[0] * tp[1] + in[1] * tp[0]; +++ sum1[0] += in[2] * tp[2] - in[3] * tp[3]; +++ sum1[1] += in[2] * tp[3] + in[3] * tp[2]; ++ ++- in += 4; ++- tp += 4; ++- } +++ in += 4; +++ tp += 4; +++ } ++ ++- res[0] = sum0[0] + sum1[0]; ++- res[1] = sum0[1] + sum1[1]; +++ res[0] = sum0[0] + sum1[0]; +++ res[1] = sum0[1] + sum1[1]; ++ ++- if (num_points & 1) { ++- *result += input[num_points - 1] * taps[num_points - 1]; ++- } +++ if (num_points & 1) { +++ *result += input[num_points - 1] * taps[num_points - 1]; +++ } ++ } ++ ++ #endif /*LV_HAVE_GENERIC*/ ++@@ -537,140 +571,146 @@ 
static inline void volk_32fc_x2_dot_prod_32fc_a_generic(lv_32fc_t* result, const ++ #if LV_HAVE_SSE && LV_HAVE_64 ++ ++ ++-static inline void volk_32fc_x2_dot_prod_32fc_a_sse_64(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) { ++- ++- const unsigned int num_bytes = num_points*8; ++- unsigned int isodd = num_points & 1; ++- ++- __VOLK_ASM ++- ( ++- "# ccomplex_dotprod_generic (float* result, const float *input,\n\t" ++- "# const float *taps, unsigned num_bytes)\n\t" ++- "# float sum0 = 0;\n\t" ++- "# float sum1 = 0;\n\t" ++- "# float sum2 = 0;\n\t" ++- "# float sum3 = 0;\n\t" ++- "# do {\n\t" ++- "# sum0 += input[0] * taps[0] - input[1] * taps[1];\n\t" ++- "# sum1 += input[0] * taps[1] + input[1] * taps[0];\n\t" ++- "# sum2 += input[2] * taps[2] - input[3] * taps[3];\n\t" ++- "# sum3 += input[2] * taps[3] + input[3] * taps[2];\n\t" ++- "# input += 4;\n\t" ++- "# taps += 4; \n\t" ++- "# } while (--n_2_ccomplex_blocks != 0);\n\t" ++- "# result[0] = sum0 + sum2;\n\t" ++- "# result[1] = sum1 + sum3;\n\t" ++- "# TODO: prefetch and better scheduling\n\t" ++- " xor %%r9, %%r9\n\t" ++- " xor %%r10, %%r10\n\t" ++- " movq %%rcx, %%rax\n\t" ++- " movq %%rcx, %%r8\n\t" ++- " movq %[rsi], %%r9\n\t" ++- " movq %[rdx], %%r10\n\t" ++- " xorps %%xmm6, %%xmm6 # zero accumulators\n\t" ++- " movaps 0(%%r9), %%xmm0\n\t" ++- " xorps %%xmm7, %%xmm7 # zero accumulators\n\t" ++- " movaps 0(%%r10), %%xmm2\n\t" ++- " shr $5, %%rax # rax = n_2_ccomplex_blocks / 2\n\t" ++- " shr $4, %%r8\n\t" ++- " jmp .%=L1_test\n\t" ++- " # 4 taps / loop\n\t" ++- " # something like ?? cycles / loop\n\t" ++- ".%=Loop1: \n\t" ++- "# complex prod: C += A * B, w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t" ++- "# movaps (%%r9), %%xmmA\n\t" ++- "# movaps (%%r10), %%xmmB\n\t" ++- "# movaps %%xmmA, %%xmmZ\n\t" ++- "# shufps $0xb1, %%xmmZ, %%xmmZ # swap internals\n\t" ++- "# mulps %%xmmB, %%xmmA\n\t" ++- "# mulps %%xmmZ, %%xmmB\n\t" ++- "# # SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t" ++- "# xorps %%xmmPN, %%xmmA\n\t" ++- "# movaps %%xmmA, %%xmmZ\n\t" ++- "# unpcklps %%xmmB, %%xmmA\n\t" ++- "# unpckhps %%xmmB, %%xmmZ\n\t" ++- "# movaps %%xmmZ, %%xmmY\n\t" ++- "# shufps $0x44, %%xmmA, %%xmmZ # b01000100\n\t" ++- "# shufps $0xee, %%xmmY, %%xmmA # b11101110\n\t" ++- "# addps %%xmmZ, %%xmmA\n\t" ++- "# addps %%xmmA, %%xmmC\n\t" ++- "# A=xmm0, B=xmm2, Z=xmm4\n\t" ++- "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t" ++- " movaps 16(%%r9), %%xmm1\n\t" ++- " movaps %%xmm0, %%xmm4\n\t" ++- " mulps %%xmm2, %%xmm0\n\t" ++- " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" ++- " movaps 16(%%r10), %%xmm3\n\t" ++- " movaps %%xmm1, %%xmm5\n\t" ++- " addps %%xmm0, %%xmm6\n\t" ++- " mulps %%xmm3, %%xmm1\n\t" ++- " shufps $0xb1, %%xmm5, %%xmm5 # swap internals\n\t" ++- " addps %%xmm1, %%xmm6\n\t" ++- " mulps %%xmm4, %%xmm2\n\t" ++- " movaps 32(%%r9), %%xmm0\n\t" ++- " addps %%xmm2, %%xmm7\n\t" ++- " mulps %%xmm5, %%xmm3\n\t" ++- " add $32, %%r9\n\t" ++- " movaps 32(%%r10), %%xmm2\n\t" ++- " addps %%xmm3, %%xmm7\n\t" ++- " add $32, %%r10\n\t" ++- ".%=L1_test:\n\t" ++- " dec %%rax\n\t" ++- " jge .%=Loop1\n\t" ++- " # We've handled the bulk of multiplies up to here.\n\t" ++- " # Let's sse if original n_2_ccomplex_blocks was odd.\n\t" ++- " # If so, we've got 2 more taps to do.\n\t" ++- " and $1, %%r8\n\t" ++- " je .%=Leven\n\t" ++- " # The count was odd, do 2 more taps.\n\t" ++- " # Note that we've already got mm0/mm2 preloaded\n\t" ++- " # from the main loop.\n\t" ++- " movaps %%xmm0, %%xmm4\n\t" ++- " mulps %%xmm2, 
%%xmm0\n\t" ++- " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" ++- " addps %%xmm0, %%xmm6\n\t" ++- " mulps %%xmm4, %%xmm2\n\t" ++- " addps %%xmm2, %%xmm7\n\t" ++- ".%=Leven:\n\t" ++- " # neg inversor\n\t" ++- " xorps %%xmm1, %%xmm1\n\t" ++- " mov $0x80000000, %%r9\n\t" ++- " movd %%r9, %%xmm1\n\t" ++- " shufps $0x11, %%xmm1, %%xmm1 # b00010001 # 0 -0 0 -0\n\t" ++- " # pfpnacc\n\t" ++- " xorps %%xmm1, %%xmm6\n\t" ++- " movaps %%xmm6, %%xmm2\n\t" ++- " unpcklps %%xmm7, %%xmm6\n\t" ++- " unpckhps %%xmm7, %%xmm2\n\t" ++- " movaps %%xmm2, %%xmm3\n\t" ++- " shufps $0x44, %%xmm6, %%xmm2 # b01000100\n\t" ++- " shufps $0xee, %%xmm3, %%xmm6 # b11101110\n\t" ++- " addps %%xmm2, %%xmm6\n\t" ++- " # xmm6 = r1 i2 r3 i4\n\t" ++- " movhlps %%xmm6, %%xmm4 # xmm4 = r3 i4 ?? ??\n\t" ++- " addps %%xmm4, %%xmm6 # xmm6 = r1+r3 i2+i4 ?? ??\n\t" ++- " movlps %%xmm6, (%[rdi]) # store low 2x32 bits (complex) to memory\n\t" ++- : ++- :[rsi] "r" (input), [rdx] "r" (taps), "c" (num_bytes), [rdi] "r" (result) ++- :"rax", "r8", "r9", "r10" ++- ); ++- ++- ++- if(isodd) { ++- *result += input[num_points - 1] * taps[num_points - 1]; ++- } ++- ++- return; +++static inline void volk_32fc_x2_dot_prod_32fc_a_sse_64(lv_32fc_t* result, +++ const lv_32fc_t* input, +++ const lv_32fc_t* taps, +++ unsigned int num_points) +++{ +++ +++ const unsigned int num_bytes = num_points * 8; +++ unsigned int isodd = num_points & 1; +++ +++ __VOLK_ASM( +++ "# ccomplex_dotprod_generic (float* result, const float *input,\n\t" +++ "# const float *taps, unsigned num_bytes)\n\t" +++ "# float sum0 = 0;\n\t" +++ "# float sum1 = 0;\n\t" +++ "# float sum2 = 0;\n\t" +++ "# float sum3 = 0;\n\t" +++ "# do {\n\t" +++ "# sum0 += input[0] * taps[0] - input[1] * taps[1];\n\t" +++ "# sum1 += input[0] * taps[1] + input[1] * taps[0];\n\t" +++ "# sum2 += input[2] * taps[2] - input[3] * taps[3];\n\t" +++ "# sum3 += input[2] * taps[3] + input[3] * taps[2];\n\t" +++ "# input += 4;\n\t" +++ "# taps += 4; \n\t" +++ "# } while (--n_2_ccomplex_blocks != 0);\n\t" +++ "# result[0] = sum0 + sum2;\n\t" +++ "# result[1] = sum1 + sum3;\n\t" +++ "# TODO: prefetch and better scheduling\n\t" +++ " xor %%r9, %%r9\n\t" +++ " xor %%r10, %%r10\n\t" +++ " movq %%rcx, %%rax\n\t" +++ " movq %%rcx, %%r8\n\t" +++ " movq %[rsi], %%r9\n\t" +++ " movq %[rdx], %%r10\n\t" +++ " xorps %%xmm6, %%xmm6 # zero accumulators\n\t" +++ " movaps 0(%%r9), %%xmm0\n\t" +++ " xorps %%xmm7, %%xmm7 # zero accumulators\n\t" +++ " movaps 0(%%r10), %%xmm2\n\t" +++ " shr $5, %%rax # rax = n_2_ccomplex_blocks / 2\n\t" +++ " shr $4, %%r8\n\t" +++ " jmp .%=L1_test\n\t" +++ " # 4 taps / loop\n\t" +++ " # something like ?? 
cycles / loop\n\t" +++ ".%=Loop1: \n\t" +++ "# complex prod: C += A * B, w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t" +++ "# movaps (%%r9), %%xmmA\n\t" +++ "# movaps (%%r10), %%xmmB\n\t" +++ "# movaps %%xmmA, %%xmmZ\n\t" +++ "# shufps $0xb1, %%xmmZ, %%xmmZ # swap internals\n\t" +++ "# mulps %%xmmB, %%xmmA\n\t" +++ "# mulps %%xmmZ, %%xmmB\n\t" +++ "# # SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t" +++ "# xorps %%xmmPN, %%xmmA\n\t" +++ "# movaps %%xmmA, %%xmmZ\n\t" +++ "# unpcklps %%xmmB, %%xmmA\n\t" +++ "# unpckhps %%xmmB, %%xmmZ\n\t" +++ "# movaps %%xmmZ, %%xmmY\n\t" +++ "# shufps $0x44, %%xmmA, %%xmmZ # b01000100\n\t" +++ "# shufps $0xee, %%xmmY, %%xmmA # b11101110\n\t" +++ "# addps %%xmmZ, %%xmmA\n\t" +++ "# addps %%xmmA, %%xmmC\n\t" +++ "# A=xmm0, B=xmm2, Z=xmm4\n\t" +++ "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t" +++ " movaps 16(%%r9), %%xmm1\n\t" +++ " movaps %%xmm0, %%xmm4\n\t" +++ " mulps %%xmm2, %%xmm0\n\t" +++ " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" +++ " movaps 16(%%r10), %%xmm3\n\t" +++ " movaps %%xmm1, %%xmm5\n\t" +++ " addps %%xmm0, %%xmm6\n\t" +++ " mulps %%xmm3, %%xmm1\n\t" +++ " shufps $0xb1, %%xmm5, %%xmm5 # swap internals\n\t" +++ " addps %%xmm1, %%xmm6\n\t" +++ " mulps %%xmm4, %%xmm2\n\t" +++ " movaps 32(%%r9), %%xmm0\n\t" +++ " addps %%xmm2, %%xmm7\n\t" +++ " mulps %%xmm5, %%xmm3\n\t" +++ " add $32, %%r9\n\t" +++ " movaps 32(%%r10), %%xmm2\n\t" +++ " addps %%xmm3, %%xmm7\n\t" +++ " add $32, %%r10\n\t" +++ ".%=L1_test:\n\t" +++ " dec %%rax\n\t" +++ " jge .%=Loop1\n\t" +++ " # We've handled the bulk of multiplies up to here.\n\t" +++ " # Let's sse if original n_2_ccomplex_blocks was odd.\n\t" +++ " # If so, we've got 2 more taps to do.\n\t" +++ " and $1, %%r8\n\t" +++ " je .%=Leven\n\t" +++ " # The count was odd, do 2 more taps.\n\t" +++ " # Note that we've already got mm0/mm2 preloaded\n\t" +++ " # from the main loop.\n\t" +++ " movaps %%xmm0, %%xmm4\n\t" +++ " mulps %%xmm2, %%xmm0\n\t" +++ " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" +++ " addps %%xmm0, %%xmm6\n\t" +++ " mulps %%xmm4, %%xmm2\n\t" +++ " addps %%xmm2, %%xmm7\n\t" +++ ".%=Leven:\n\t" +++ " # neg inversor\n\t" +++ " xorps %%xmm1, %%xmm1\n\t" +++ " mov $0x80000000, %%r9\n\t" +++ " movd %%r9, %%xmm1\n\t" +++ " shufps $0x11, %%xmm1, %%xmm1 # b00010001 # 0 -0 0 -0\n\t" +++ " # pfpnacc\n\t" +++ " xorps %%xmm1, %%xmm6\n\t" +++ " movaps %%xmm6, %%xmm2\n\t" +++ " unpcklps %%xmm7, %%xmm6\n\t" +++ " unpckhps %%xmm7, %%xmm2\n\t" +++ " movaps %%xmm2, %%xmm3\n\t" +++ " shufps $0x44, %%xmm6, %%xmm2 # b01000100\n\t" +++ " shufps $0xee, %%xmm3, %%xmm6 # b11101110\n\t" +++ " addps %%xmm2, %%xmm6\n\t" +++ " # xmm6 = r1 i2 r3 i4\n\t" +++ " movhlps %%xmm6, %%xmm4 # xmm4 = r3 i4 ?? ??\n\t" +++ " addps %%xmm4, %%xmm6 # xmm6 = r1+r3 i2+i4 ?? 
??\n\t" +++ " movlps %%xmm6, (%[rdi]) # store low 2x32 bits (complex) " +++ "to memory\n\t" +++ : +++ : [rsi] "r"(input), [rdx] "r"(taps), "c"(num_bytes), [rdi] "r"(result) +++ : "rax", "r8", "r9", "r10"); +++ +++ +++ if (isodd) { +++ *result += input[num_points - 1] * taps[num_points - 1]; +++ } ++ +++ return; ++ } ++ ++ #endif ++ ++ #if LV_HAVE_SSE && LV_HAVE_32 ++ ++-static inline void volk_32fc_x2_dot_prod_32fc_a_sse_32(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) { +++static inline void volk_32fc_x2_dot_prod_32fc_a_sse_32(lv_32fc_t* result, +++ const lv_32fc_t* input, +++ const lv_32fc_t* taps, +++ unsigned int num_points) +++{ ++ ++- volk_32fc_x2_dot_prod_32fc_a_generic(result, input, taps, num_points); +++ volk_32fc_x2_dot_prod_32fc_a_generic(result, input, taps, num_points); ++ ++ #if 0 ++ const unsigned int num_bytes = num_points*8; ++@@ -792,57 +832,64 @@ static inline void volk_32fc_x2_dot_prod_32fc_a_sse_32(lv_32fc_t* result, const ++ ++ #include ++ ++-static inline void volk_32fc_x2_dot_prod_32fc_a_sse3(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) { +++static inline void volk_32fc_x2_dot_prod_32fc_a_sse3(lv_32fc_t* result, +++ const lv_32fc_t* input, +++ const lv_32fc_t* taps, +++ unsigned int num_points) +++{ ++ ++- const unsigned int num_bytes = num_points*8; ++- unsigned int isodd = num_points & 1; +++ const unsigned int num_bytes = num_points * 8; +++ unsigned int isodd = num_points & 1; ++ ++- lv_32fc_t dotProduct; ++- memset(&dotProduct, 0x0, 2*sizeof(float)); +++ lv_32fc_t dotProduct; +++ memset(&dotProduct, 0x0, 2 * sizeof(float)); ++ ++- unsigned int number = 0; ++- const unsigned int halfPoints = num_bytes >> 4; +++ unsigned int number = 0; +++ const unsigned int halfPoints = num_bytes >> 4; ++ ++- __m128 x, y, yl, yh, z, tmp1, tmp2, dotProdVal; +++ __m128 x, y, yl, yh, z, tmp1, tmp2, dotProdVal; ++ ++- const lv_32fc_t* a = input; ++- const lv_32fc_t* b = taps; +++ const lv_32fc_t* a = input; +++ const lv_32fc_t* b = taps; ++ ++- dotProdVal = _mm_setzero_ps(); +++ dotProdVal = _mm_setzero_ps(); ++ ++- for(;number < halfPoints; number++){ +++ for (; number < halfPoints; number++) { ++ ++- x = _mm_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi ++- y = _mm_load_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di +++ x = _mm_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi +++ y = _mm_load_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di ++ ++- yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr ++- yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di +++ yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr +++ yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di ++ ++- tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr +++ tmp1 = _mm_mul_ps(x, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr ++ ++- x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br +++ x = _mm_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br ++ ++- tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di +++ tmp2 = _mm_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di ++ ++- z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di +++ z = _mm_addsub_ps(tmp1, +++ tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di ++ ++- dotProdVal = _mm_add_ps(dotProdVal, z); // Add the complex multiplication results together +++ dotProdVal = +++ _mm_add_ps(dotProdVal, z); // Add the complex multiplication results together 
++ ++- a += 2; ++- b += 2; ++- } +++ a += 2; +++ b += 2; +++ } ++ ++- __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector[2]; +++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector[2]; ++ ++- _mm_store_ps((float*)dotProductVector,dotProdVal); // Store the results back into the dot product vector +++ _mm_store_ps((float*)dotProductVector, +++ dotProdVal); // Store the results back into the dot product vector ++ ++- dotProduct += ( dotProductVector[0] + dotProductVector[1] ); +++ dotProduct += (dotProductVector[0] + dotProductVector[1]); ++ ++- if(isodd) { ++- dotProduct += input[num_points - 1] * taps[num_points - 1]; ++- } +++ if (isodd) { +++ dotProduct += input[num_points - 1] * taps[num_points - 1]; +++ } ++ ++- *result = dotProduct; +++ *result = dotProduct; ++ } ++ ++ #endif /*LV_HAVE_SSE3*/ ++@@ -852,78 +899,82 @@ static inline void volk_32fc_x2_dot_prod_32fc_a_sse3(lv_32fc_t* result, const lv ++ ++ #include ++ ++-static inline void volk_32fc_x2_dot_prod_32fc_a_sse4_1(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) { +++static inline void volk_32fc_x2_dot_prod_32fc_a_sse4_1(lv_32fc_t* result, +++ const lv_32fc_t* input, +++ const lv_32fc_t* taps, +++ unsigned int num_points) +++{ ++ ++- unsigned int i = 0; ++- const unsigned int qtr_points = num_points/4; ++- const unsigned int isodd = num_points & 3; +++ unsigned int i = 0; +++ const unsigned int qtr_points = num_points / 4; +++ const unsigned int isodd = num_points & 3; ++ ++- __m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, real0, real1, im0, im1; ++- float *p_input, *p_taps; ++- __m64 *p_result; +++ __m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, real0, real1, im0, im1; +++ float *p_input, *p_taps; +++ __m64* p_result; ++ ++- static const __m128i neg = {0x000000000000000080000000}; +++ static const __m128i neg = { 0x000000000000000080000000 }; ++ ++- p_result = (__m64*)result; ++- p_input = (float*)input; ++- p_taps = (float*)taps; +++ p_result = (__m64*)result; +++ p_input = (float*)input; +++ p_taps = (float*)taps; ++ ++- real0 = _mm_setzero_ps(); ++- real1 = _mm_setzero_ps(); ++- im0 = _mm_setzero_ps(); ++- im1 = _mm_setzero_ps(); +++ real0 = _mm_setzero_ps(); +++ real1 = _mm_setzero_ps(); +++ im0 = _mm_setzero_ps(); +++ im1 = _mm_setzero_ps(); ++ ++- for(; i < qtr_points; ++i) { ++- xmm0 = _mm_load_ps(p_input); ++- xmm1 = _mm_load_ps(p_taps); +++ for (; i < qtr_points; ++i) { +++ xmm0 = _mm_load_ps(p_input); +++ xmm1 = _mm_load_ps(p_taps); ++ ++- p_input += 4; ++- p_taps += 4; +++ p_input += 4; +++ p_taps += 4; ++ ++- xmm2 = _mm_load_ps(p_input); ++- xmm3 = _mm_load_ps(p_taps); +++ xmm2 = _mm_load_ps(p_input); +++ xmm3 = _mm_load_ps(p_taps); ++ ++- p_input += 4; ++- p_taps += 4; +++ p_input += 4; +++ p_taps += 4; ++ ++- xmm4 = _mm_unpackhi_ps(xmm0, xmm2); ++- xmm5 = _mm_unpackhi_ps(xmm1, xmm3); ++- xmm0 = _mm_unpacklo_ps(xmm0, xmm2); ++- xmm2 = _mm_unpacklo_ps(xmm1, xmm3); +++ xmm4 = _mm_unpackhi_ps(xmm0, xmm2); +++ xmm5 = _mm_unpackhi_ps(xmm1, xmm3); +++ xmm0 = _mm_unpacklo_ps(xmm0, xmm2); +++ xmm2 = _mm_unpacklo_ps(xmm1, xmm3); ++ ++- //imaginary vector from input ++- xmm1 = _mm_unpackhi_ps(xmm0, xmm4); ++- //real vector from input ++- xmm3 = _mm_unpacklo_ps(xmm0, xmm4); ++- //imaginary vector from taps ++- xmm0 = _mm_unpackhi_ps(xmm2, xmm5); ++- //real vector from taps ++- xmm2 = _mm_unpacklo_ps(xmm2, xmm5); +++ // imaginary vector from input +++ xmm1 = _mm_unpackhi_ps(xmm0, xmm4); +++ // real vector from input +++ xmm3 = _mm_unpacklo_ps(xmm0, xmm4); +++ // imaginary 
vector from taps +++ xmm0 = _mm_unpackhi_ps(xmm2, xmm5); +++ // real vector from taps +++ xmm2 = _mm_unpacklo_ps(xmm2, xmm5); ++ ++- xmm4 = _mm_dp_ps(xmm3, xmm2, 0xf1); ++- xmm5 = _mm_dp_ps(xmm1, xmm0, 0xf1); +++ xmm4 = _mm_dp_ps(xmm3, xmm2, 0xf1); +++ xmm5 = _mm_dp_ps(xmm1, xmm0, 0xf1); ++ ++- xmm6 = _mm_dp_ps(xmm3, xmm0, 0xf2); ++- xmm7 = _mm_dp_ps(xmm1, xmm2, 0xf2); +++ xmm6 = _mm_dp_ps(xmm3, xmm0, 0xf2); +++ xmm7 = _mm_dp_ps(xmm1, xmm2, 0xf2); ++ ++- real0 = _mm_add_ps(xmm4, real0); ++- real1 = _mm_add_ps(xmm5, real1); ++- im0 = _mm_add_ps(xmm6, im0); ++- im1 = _mm_add_ps(xmm7, im1); ++- } +++ real0 = _mm_add_ps(xmm4, real0); +++ real1 = _mm_add_ps(xmm5, real1); +++ im0 = _mm_add_ps(xmm6, im0); +++ im1 = _mm_add_ps(xmm7, im1); +++ } ++ ++- real1 = _mm_xor_ps(real1, bit128_p(&neg)->float_vec); +++ real1 = _mm_xor_ps(real1, bit128_p(&neg)->float_vec); ++ ++- im0 = _mm_add_ps(im0, im1); ++- real0 = _mm_add_ps(real0, real1); +++ im0 = _mm_add_ps(im0, im1); +++ real0 = _mm_add_ps(real0, real1); ++ ++- im0 = _mm_add_ps(im0, real0); +++ im0 = _mm_add_ps(im0, real0); ++ ++- _mm_storel_pi(p_result, im0); +++ _mm_storel_pi(p_result, im0); ++ ++- for(i = num_points-isodd; i < num_points; i++) { ++- *result += input[i] * taps[i]; ++- } +++ for (i = num_points - isodd; i < num_points; i++) { +++ *result += input[i] * taps[i]; +++ } ++ } ++ ++ #endif /*LV_HAVE_SSE4_1*/ ++@@ -931,13 +982,17 @@ static inline void volk_32fc_x2_dot_prod_32fc_a_sse4_1(lv_32fc_t* result, const ++ #ifdef LV_HAVE_NEON ++ #include ++ ++-static inline void volk_32fc_x2_dot_prod_32fc_neon(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) { +++static inline void volk_32fc_x2_dot_prod_32fc_neon(lv_32fc_t* result, +++ const lv_32fc_t* input, +++ const lv_32fc_t* taps, +++ unsigned int num_points) +++{ ++ ++ unsigned int quarter_points = num_points / 4; ++ unsigned int number; ++ ++- lv_32fc_t* a_ptr = (lv_32fc_t*) taps; ++- lv_32fc_t* b_ptr = (lv_32fc_t*) input; +++ lv_32fc_t* a_ptr = (lv_32fc_t*)taps; +++ lv_32fc_t* b_ptr = (lv_32fc_t*)input; ++ // for 2-lane vectors, 1st lane holds the real part, ++ // 2nd lane holds the imaginary part ++ float32x4x2_t a_val, b_val, c_val, accumulator; ++@@ -945,11 +1000,11 @@ static inline void volk_32fc_x2_dot_prod_32fc_neon(lv_32fc_t* result, const lv_3 ++ accumulator.val[0] = vdupq_n_f32(0); ++ accumulator.val[1] = vdupq_n_f32(0); ++ ++- for(number = 0; number < quarter_points; ++number) { +++ for (number = 0; number < quarter_points; ++number) { ++ a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i ++ b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i ++- __VOLK_PREFETCH(a_ptr+8); ++- __VOLK_PREFETCH(b_ptr+8); +++ __VOLK_PREFETCH(a_ptr + 8); +++ __VOLK_PREFETCH(b_ptr + 8); ++ ++ // multiply the real*real and imag*imag to get real result ++ // a0r*b0r|a1r*b1r|a2r*b2r|a3r*b3r ++@@ -977,22 +1032,25 @@ static inline void volk_32fc_x2_dot_prod_32fc_neon(lv_32fc_t* result, const lv_3 ++ *result = accum_result[0] + accum_result[1] + accum_result[2] + accum_result[3]; ++ ++ // tail case ++- for(number = quarter_points*4; number < num_points; ++number) { +++ for (number = quarter_points * 4; number < num_points; ++number) { ++ *result += (*a_ptr++) * (*b_ptr++); ++ } ++- ++ } ++ #endif /*LV_HAVE_NEON*/ ++ ++ #ifdef LV_HAVE_NEON ++ #include ++-static inline void volk_32fc_x2_dot_prod_32fc_neon_opttests(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) { +++static inline void 
volk_32fc_x2_dot_prod_32fc_neon_opttests(lv_32fc_t* result, +++ const lv_32fc_t* input, +++ const lv_32fc_t* taps, +++ unsigned int num_points) +++{ ++ ++ unsigned int quarter_points = num_points / 4; ++ unsigned int number; ++ ++- lv_32fc_t* a_ptr = (lv_32fc_t*) taps; ++- lv_32fc_t* b_ptr = (lv_32fc_t*) input; +++ lv_32fc_t* a_ptr = (lv_32fc_t*)taps; +++ lv_32fc_t* b_ptr = (lv_32fc_t*)input; ++ // for 2-lane vectors, 1st lane holds the real part, ++ // 2nd lane holds the imaginary part ++ float32x4x2_t a_val, b_val, accumulator; ++@@ -1000,11 +1058,11 @@ static inline void volk_32fc_x2_dot_prod_32fc_neon_opttests(lv_32fc_t* result, c ++ accumulator.val[0] = vdupq_n_f32(0); ++ accumulator.val[1] = vdupq_n_f32(0); ++ ++- for(number = 0; number < quarter_points; ++number) { +++ for (number = 0; number < quarter_points; ++number) { ++ a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i ++ b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i ++- __VOLK_PREFETCH(a_ptr+8); ++- __VOLK_PREFETCH(b_ptr+8); +++ __VOLK_PREFETCH(a_ptr + 8); +++ __VOLK_PREFETCH(b_ptr + 8); ++ ++ // do the first multiply ++ tmp_imag.val[1] = vmulq_f32(a_val.val[1], b_val.val[0]); ++@@ -1026,21 +1084,24 @@ static inline void volk_32fc_x2_dot_prod_32fc_neon_opttests(lv_32fc_t* result, c ++ *result = accum_result[0] + accum_result[1] + accum_result[2] + accum_result[3]; ++ ++ // tail case ++- for(number = quarter_points*4; number < num_points; ++number) { +++ for (number = quarter_points * 4; number < num_points; ++number) { ++ *result += (*a_ptr++) * (*b_ptr++); ++ } ++- ++ } ++ #endif /*LV_HAVE_NEON*/ ++ ++ #ifdef LV_HAVE_NEON ++-static inline void volk_32fc_x2_dot_prod_32fc_neon_optfma(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) { +++static inline void volk_32fc_x2_dot_prod_32fc_neon_optfma(lv_32fc_t* result, +++ const lv_32fc_t* input, +++ const lv_32fc_t* taps, +++ unsigned int num_points) +++{ ++ ++ unsigned int quarter_points = num_points / 4; ++ unsigned int number; ++ ++- lv_32fc_t* a_ptr = (lv_32fc_t*) taps; ++- lv_32fc_t* b_ptr = (lv_32fc_t*) input; +++ lv_32fc_t* a_ptr = (lv_32fc_t*)taps; +++ lv_32fc_t* b_ptr = (lv_32fc_t*)input; ++ // for 2-lane vectors, 1st lane holds the real part, ++ // 2nd lane holds the imaginary part ++ float32x4x2_t a_val, b_val, accumulator1, accumulator2; ++@@ -1049,11 +1110,11 @@ static inline void volk_32fc_x2_dot_prod_32fc_neon_optfma(lv_32fc_t* result, con ++ accumulator2.val[0] = vdupq_n_f32(0); ++ accumulator2.val[1] = vdupq_n_f32(0); ++ ++- for(number = 0; number < quarter_points; ++number) { +++ for (number = 0; number < quarter_points; ++number) { ++ a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i ++ b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i ++- __VOLK_PREFETCH(a_ptr+8); ++- __VOLK_PREFETCH(b_ptr+8); +++ __VOLK_PREFETCH(a_ptr + 8); +++ __VOLK_PREFETCH(b_ptr + 8); ++ ++ // use 2 accumulators to remove inter-instruction data dependencies ++ accumulator1.val[0] = vmlaq_f32(accumulator1.val[0], a_val.val[0], b_val.val[0]); ++@@ -1071,22 +1132,26 @@ static inline void volk_32fc_x2_dot_prod_32fc_neon_optfma(lv_32fc_t* result, con ++ *result = accum_result[0] + accum_result[1] + accum_result[2] + accum_result[3]; ++ ++ // tail case ++- for(number = quarter_points*4; number < num_points; ++number) { +++ for (number = quarter_points * 4; number < num_points; ++number) { ++ *result += (*a_ptr++) * (*b_ptr++); ++ } ++- ++ } ++ #endif /*LV_HAVE_NEON*/ 
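
Aside, for readers of this hunk: every volk_32fc_x2_dot_prod_32fc_* variant touched above and below (SSE3, SSE4.1, NEON, AVX, AVX+FMA) computes the same quantity, the complex dot product sum over i of input[i] * taps[i]; the variants differ only in how the real/imaginary lanes are shuffled per iteration and in how the odd tail elements are folded in. A minimal plain-C sketch of that arithmetic, illustrative only and not part of the patch (VOLK's lv_32fc_t corresponds to C99 float complex, so the sketch uses <complex.h> directly):

/*
 * Reference sketch (assumed names, not from the diff): accumulate
 * input[i] * taps[i] over complex floats. The SIMD kernels in this
 * file reach the same result four or eight points at a time and then
 * reduce their per-lane partial sums, which is why each one ends with
 * a scalar tail loop equivalent to the loop below.
 */
#include <complex.h>

static inline void dot_prod_32fc_ref(float complex* result,
                                     const float complex* input,
                                     const float complex* taps,
                                     unsigned int num_points)
{
    float complex acc = 0.0f + 0.0f * I;
    for (unsigned int i = 0; i < num_points; i++) {
        acc += input[i] * taps[i]; /* complex multiply-accumulate */
    }
    *result = acc;
}
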
++ ++ #ifdef LV_HAVE_NEON ++-static inline void volk_32fc_x2_dot_prod_32fc_neon_optfmaunroll(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) { ++-// NOTE: GCC does a poor job with this kernel, but the equivalent ASM code is very fast +++static inline void volk_32fc_x2_dot_prod_32fc_neon_optfmaunroll(lv_32fc_t* result, +++ const lv_32fc_t* input, +++ const lv_32fc_t* taps, +++ unsigned int num_points) +++{ +++ // NOTE: GCC does a poor job with this kernel, but the equivalent ASM code is very +++ // fast ++ ++ unsigned int quarter_points = num_points / 8; ++ unsigned int number; ++ ++- lv_32fc_t* a_ptr = (lv_32fc_t*) taps; ++- lv_32fc_t* b_ptr = (lv_32fc_t*) input; +++ lv_32fc_t* a_ptr = (lv_32fc_t*)taps; +++ lv_32fc_t* b_ptr = (lv_32fc_t*)input; ++ // for 2-lane vectors, 1st lane holds the real part, ++ // 2nd lane holds the imaginary part ++ float32x4x4_t a_val, b_val, accumulator1, accumulator2; ++@@ -1101,11 +1166,11 @@ static inline void volk_32fc_x2_dot_prod_32fc_neon_optfmaunroll(lv_32fc_t* resul ++ accumulator2.val[3] = vdupq_n_f32(0); ++ ++ // 8 input regs, 8 accumulators -> 16/16 neon regs are used ++- for(number = 0; number < quarter_points; ++number) { +++ for (number = 0; number < quarter_points; ++number) { ++ a_val = vld4q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i ++ b_val = vld4q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i ++- __VOLK_PREFETCH(a_ptr+8); ++- __VOLK_PREFETCH(b_ptr+8); +++ __VOLK_PREFETCH(a_ptr + 8); +++ __VOLK_PREFETCH(b_ptr + 8); ++ ++ // use 2 accumulators to remove inter-instruction data dependencies ++ accumulator1.val[0] = vmlaq_f32(accumulator1.val[0], a_val.val[0], b_val.val[0]); ++@@ -1136,10 +1201,9 @@ static inline void volk_32fc_x2_dot_prod_32fc_neon_optfmaunroll(lv_32fc_t* resul ++ *result = accum_result[0] + accum_result[1] + accum_result[2] + accum_result[3]; ++ ++ // tail case ++- for(number = quarter_points*8; number < num_points; ++number) { +++ for (number = quarter_points * 8; number < num_points; ++number) { ++ *result += (*a_ptr++) * (*b_ptr++); ++ } ++- ++ } ++ #endif /*LV_HAVE_NEON*/ ++ ++@@ -1148,56 +1212,64 @@ static inline void volk_32fc_x2_dot_prod_32fc_neon_optfmaunroll(lv_32fc_t* resul ++ ++ #include ++ ++-static inline void volk_32fc_x2_dot_prod_32fc_a_avx(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) { +++static inline void volk_32fc_x2_dot_prod_32fc_a_avx(lv_32fc_t* result, +++ const lv_32fc_t* input, +++ const lv_32fc_t* taps, +++ unsigned int num_points) +++{ ++ ++- unsigned int isodd = num_points & 3; ++- unsigned int i = 0; ++- lv_32fc_t dotProduct; ++- memset(&dotProduct, 0x0, 2*sizeof(float)); +++ unsigned int isodd = num_points & 3; +++ unsigned int i = 0; +++ lv_32fc_t dotProduct; +++ memset(&dotProduct, 0x0, 2 * sizeof(float)); ++ ++- unsigned int number = 0; ++- const unsigned int quarterPoints = num_points / 4; +++ unsigned int number = 0; +++ const unsigned int quarterPoints = num_points / 4; ++ ++- __m256 x, y, yl, yh, z, tmp1, tmp2, dotProdVal; +++ __m256 x, y, yl, yh, z, tmp1, tmp2, dotProdVal; ++ ++- const lv_32fc_t* a = input; ++- const lv_32fc_t* b = taps; +++ const lv_32fc_t* a = input; +++ const lv_32fc_t* b = taps; ++ ++- dotProdVal = _mm256_setzero_ps(); +++ dotProdVal = _mm256_setzero_ps(); ++ ++- for(;number < quarterPoints; number++){ +++ for (; number < quarterPoints; number++) { ++ ++- x = _mm256_load_ps((float*)a); // Load a,b,e,f as ar,ai,br,bi,er,ei,fr,fi ++- y = 
_mm256_load_ps((float*)b); // Load c,d,g,h as cr,ci,dr,di,gr,gi,hr,hi +++ x = _mm256_load_ps((float*)a); // Load a,b,e,f as ar,ai,br,bi,er,ei,fr,fi +++ y = _mm256_load_ps((float*)b); // Load c,d,g,h as cr,ci,dr,di,gr,gi,hr,hi ++ ++- yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr,gr,gr,hr,hr ++- yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di,gi,gi,hi,hi +++ yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr,gr,gr,hr,hr +++ yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di,gi,gi,hi,hi ++ ++- tmp1 = _mm256_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr ... +++ tmp1 = _mm256_mul_ps(x, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr ... ++ ++- x = _mm256_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br,ei,er,fi,fr +++ x = _mm256_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br,ei,er,fi,fr ++ ++- tmp2 = _mm256_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di ... +++ tmp2 = _mm256_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di ... ++ ++- z = _mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di +++ z = _mm256_addsub_ps(tmp1, +++ tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di ++ ++- dotProdVal = _mm256_add_ps(dotProdVal, z); // Add the complex multiplication results together +++ dotProdVal = _mm256_add_ps(dotProdVal, +++ z); // Add the complex multiplication results together ++ ++- a += 4; ++- b += 4; ++- } +++ a += 4; +++ b += 4; +++ } ++ ++- __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector[4]; +++ __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector[4]; ++ ++- _mm256_store_ps((float*)dotProductVector,dotProdVal); // Store the results back into the dot product vector +++ _mm256_store_ps((float*)dotProductVector, +++ dotProdVal); // Store the results back into the dot product vector ++ ++- dotProduct += ( dotProductVector[0] + dotProductVector[1] + dotProductVector[2] + dotProductVector[3]); +++ dotProduct += (dotProductVector[0] + dotProductVector[1] + dotProductVector[2] + +++ dotProductVector[3]); ++ ++- for(i = num_points-isodd; i < num_points; i++) { ++- dotProduct += input[i] * taps[i]; ++- } +++ for (i = num_points - isodd; i < num_points; i++) { +++ dotProduct += input[i] * taps[i]; +++ } ++ ++- *result = dotProduct; +++ *result = dotProduct; ++ } ++ ++ #endif /*LV_HAVE_AVX*/ ++@@ -1205,56 +1277,64 @@ static inline void volk_32fc_x2_dot_prod_32fc_a_avx(lv_32fc_t* result, const lv_ ++ #if LV_HAVE_AVX && LV_HAVE_FMA ++ #include ++ ++-static inline void volk_32fc_x2_dot_prod_32fc_a_avx_fma(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) { +++static inline void volk_32fc_x2_dot_prod_32fc_a_avx_fma(lv_32fc_t* result, +++ const lv_32fc_t* input, +++ const lv_32fc_t* taps, +++ unsigned int num_points) +++{ ++ ++- unsigned int isodd = num_points & 3; ++- unsigned int i = 0; ++- lv_32fc_t dotProduct; ++- memset(&dotProduct, 0x0, 2*sizeof(float)); +++ unsigned int isodd = num_points & 3; +++ unsigned int i = 0; +++ lv_32fc_t dotProduct; +++ memset(&dotProduct, 0x0, 2 * sizeof(float)); ++ ++- unsigned int number = 0; ++- const unsigned int quarterPoints = num_points / 4; +++ unsigned int number = 0; +++ const unsigned int quarterPoints = num_points / 4; ++ ++- __m256 x, y, yl, yh, z, tmp1, tmp2, dotProdVal; +++ __m256 x, y, yl, yh, z, tmp1, tmp2, dotProdVal; ++ ++- const lv_32fc_t* a = input; ++- const lv_32fc_t* b = taps; +++ const lv_32fc_t* a = input; +++ const lv_32fc_t* b = taps; ++ ++- dotProdVal = _mm256_setzero_ps(); +++ dotProdVal = _mm256_setzero_ps(); ++ ++- 
for(;number < quarterPoints; number++){ +++ for (; number < quarterPoints; number++) { ++ ++- x = _mm256_load_ps((float*)a); // Load a,b,e,f as ar,ai,br,bi,er,ei,fr,fi ++- y = _mm256_load_ps((float*)b); // Load c,d,g,h as cr,ci,dr,di,gr,gi,hr,hi +++ x = _mm256_load_ps((float*)a); // Load a,b,e,f as ar,ai,br,bi,er,ei,fr,fi +++ y = _mm256_load_ps((float*)b); // Load c,d,g,h as cr,ci,dr,di,gr,gi,hr,hi ++ ++- yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr,gr,gr,hr,hr ++- yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di,gi,gi,hi,hi +++ yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr,gr,gr,hr,hr +++ yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di,gi,gi,hi,hi ++ ++- tmp1 = x; +++ tmp1 = x; ++ ++- x = _mm256_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br,ei,er,fi,fr +++ x = _mm256_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br,ei,er,fi,fr ++ ++- tmp2 = _mm256_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di ... +++ tmp2 = _mm256_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di ... ++ ++- z = _mm256_fmaddsub_ps(tmp1, yl,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di +++ z = _mm256_fmaddsub_ps( +++ tmp1, yl, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di ++ ++- dotProdVal = _mm256_add_ps(dotProdVal, z); // Add the complex multiplication results together +++ dotProdVal = _mm256_add_ps(dotProdVal, +++ z); // Add the complex multiplication results together ++ ++- a += 4; ++- b += 4; ++- } +++ a += 4; +++ b += 4; +++ } ++ ++- __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector[4]; +++ __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector[4]; ++ ++- _mm256_store_ps((float*)dotProductVector,dotProdVal); // Store the results back into the dot product vector +++ _mm256_store_ps((float*)dotProductVector, +++ dotProdVal); // Store the results back into the dot product vector ++ ++- dotProduct += ( dotProductVector[0] + dotProductVector[1] + dotProductVector[2] + dotProductVector[3]); +++ dotProduct += (dotProductVector[0] + dotProductVector[1] + dotProductVector[2] + +++ dotProductVector[3]); ++ ++- for(i = num_points-isodd; i < num_points; i++) { ++- dotProduct += input[i] * taps[i]; ++- } +++ for (i = num_points - isodd; i < num_points; i++) { +++ dotProduct += input[i] * taps[i]; +++ } ++ ++- *result = dotProduct; +++ *result = dotProduct; ++ } ++ ++ #endif /*LV_HAVE_AVX && LV_HAVE_FMA*/ ++diff --git a/kernels/volk/volk_32fc_x2_multiply_32fc.h b/kernels/volk/volk_32fc_x2_multiply_32fc.h ++index 6bf428b..6cb6907 100644 ++--- a/kernels/volk/volk_32fc_x2_multiply_32fc.h +++++ b/kernels/volk/volk_32fc_x2_multiply_32fc.h ++@@ -29,8 +29,8 @@ ++ * ++ * Dispatcher Prototype ++ * \code ++- * void volk_32fc_x2_multiply_32fc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points); ++- * \endcode +++ * void volk_32fc_x2_multiply_32fc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const +++ * lv_32fc_t* bVector, unsigned int num_points); \endcode ++ * ++ * \b Inputs ++ * \li aVector: The first input vector of complex floats. ++@@ -70,55 +70,62 @@ ++ #ifndef INCLUDED_volk_32fc_x2_multiply_32fc_u_H ++ #define INCLUDED_volk_32fc_x2_multiply_32fc_u_H ++ +++#include ++ #include ++ #include ++ #include ++-#include ++ ++ #if LV_HAVE_AVX2 && LV_HAVE_FMA ++ #include ++ /*! 
++- \brief Multiplies the two input complex vectors and stores their results in the third vector ++- \param cVector The vector where the results will be stored ++- \param aVector One of the vectors to be multiplied ++- \param bVector One of the vectors to be multiplied ++- \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector +++ \brief Multiplies the two input complex vectors and stores their results in the third +++ vector \param cVector The vector where the results will be stored \param aVector One of +++ the vectors to be multiplied \param bVector One of the vectors to be multiplied \param +++ num_points The number of complex values in aVector and bVector to be multiplied together +++ and stored into cVector ++ */ ++-static inline void volk_32fc_x2_multiply_32fc_u_avx2_fma(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){ ++- unsigned int number = 0; ++- const unsigned int quarterPoints = num_points / 4; +++static inline void volk_32fc_x2_multiply_32fc_u_avx2_fma(lv_32fc_t* cVector, +++ const lv_32fc_t* aVector, +++ const lv_32fc_t* bVector, +++ unsigned int num_points) +++{ +++ unsigned int number = 0; +++ const unsigned int quarterPoints = num_points / 4; ++ ++- lv_32fc_t* c = cVector; ++- const lv_32fc_t* a = aVector; ++- const lv_32fc_t* b = bVector; +++ lv_32fc_t* c = cVector; +++ const lv_32fc_t* a = aVector; +++ const lv_32fc_t* b = bVector; ++ ++- for(;number < quarterPoints; number++){ +++ for (; number < quarterPoints; number++) { ++ ++- const __m256 x = _mm256_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi ++- const __m256 y = _mm256_loadu_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di +++ const __m256 x = +++ _mm256_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi +++ const __m256 y = +++ _mm256_loadu_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di ++ ++- const __m256 yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr ++- const __m256 yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di +++ const __m256 yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr +++ const __m256 yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di ++ ++- const __m256 tmp2x = _mm256_permute_ps(x,0xB1); // Re-arrange x to be ai,ar,bi,br +++ const __m256 tmp2x = _mm256_permute_ps(x, 0xB1); // Re-arrange x to be ai,ar,bi,br ++ ++- const __m256 tmp2 = _mm256_mul_ps(tmp2x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di +++ const __m256 tmp2 = _mm256_mul_ps(tmp2x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di ++ ++- const __m256 z = _mm256_fmaddsub_ps(x, yl, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di +++ const __m256 z = _mm256_fmaddsub_ps( +++ x, yl, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di ++ ++- _mm256_storeu_ps((float*)c,z); // Store the results back into the C container +++ _mm256_storeu_ps((float*)c, z); // Store the results back into the C container ++ ++- a += 4; ++- b += 4; ++- c += 4; ++- } +++ a += 4; +++ b += 4; +++ c += 4; +++ } ++ ++- _mm256_zeroupper(); +++ _mm256_zeroupper(); ++ ++- number = quarterPoints * 4; ++- for(;number < num_points; number++){ ++- *c++ = (*a++) * (*b++); ++- } +++ number = quarterPoints * 4; +++ for (; number < num_points; number++) { +++ *c++ = (*a++) * (*b++); +++ } ++ } ++ #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA */ ++ ++@@ -127,34 +134,37 @@ static inline void volk_32fc_x2_multiply_32fc_u_avx2_fma(lv_32fc_t* cVector, con ++ #include ++ #include ++ ++-static 
inline void ++-volk_32fc_x2_multiply_32fc_u_avx(lv_32fc_t* cVector, const lv_32fc_t* aVector, ++- const lv_32fc_t* bVector, unsigned int num_points) +++static inline void volk_32fc_x2_multiply_32fc_u_avx(lv_32fc_t* cVector, +++ const lv_32fc_t* aVector, +++ const lv_32fc_t* bVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int quarterPoints = num_points / 4; ++- ++- __m256 x, y, z; ++- lv_32fc_t* c = cVector; ++- const lv_32fc_t* a = aVector; ++- const lv_32fc_t* b = bVector; ++- ++- for(; number < quarterPoints; number++){ ++- x = _mm256_loadu_ps((float*) a); // Load the ar + ai, br + bi ... as ar,ai,br,bi ... ++- y = _mm256_loadu_ps((float*) b); // Load the cr + ci, dr + di ... as cr,ci,dr,di ... ++- z = _mm256_complexmul_ps(x, y); ++- _mm256_storeu_ps((float*) c, z); // Store the results back into the C container ++- ++- a += 4; ++- b += 4; ++- c += 4; ++- } ++- ++- number = quarterPoints * 4; ++- ++- for(; number < num_points; number++){ ++- *c++ = (*a++) * (*b++); ++- } +++ unsigned int number = 0; +++ const unsigned int quarterPoints = num_points / 4; +++ +++ __m256 x, y, z; +++ lv_32fc_t* c = cVector; +++ const lv_32fc_t* a = aVector; +++ const lv_32fc_t* b = bVector; +++ +++ for (; number < quarterPoints; number++) { +++ x = _mm256_loadu_ps( +++ (float*)a); // Load the ar + ai, br + bi ... as ar,ai,br,bi ... +++ y = _mm256_loadu_ps( +++ (float*)b); // Load the cr + ci, dr + di ... as cr,ci,dr,di ... +++ z = _mm256_complexmul_ps(x, y); +++ _mm256_storeu_ps((float*)c, z); // Store the results back into the C container +++ +++ a += 4; +++ b += 4; +++ c += 4; +++ } +++ +++ number = quarterPoints * 4; +++ +++ for (; number < num_points; number++) { +++ *c++ = (*a++) * (*b++); +++ } ++ } ++ #endif /* LV_HAVE_AVX */ ++ ++@@ -163,50 +173,52 @@ volk_32fc_x2_multiply_32fc_u_avx(lv_32fc_t* cVector, const lv_32fc_t* aVector, ++ #include ++ #include ++ ++-static inline void ++-volk_32fc_x2_multiply_32fc_u_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, ++- const lv_32fc_t* bVector, unsigned int num_points) +++static inline void volk_32fc_x2_multiply_32fc_u_sse3(lv_32fc_t* cVector, +++ const lv_32fc_t* aVector, +++ const lv_32fc_t* bVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int halfPoints = num_points / 2; ++- ++- __m128 x, y, z; ++- lv_32fc_t* c = cVector; ++- const lv_32fc_t* a = aVector; ++- const lv_32fc_t* b = bVector; ++- ++- for(; number < halfPoints; number++){ ++- x = _mm_loadu_ps((float*) a); // Load the ar + ai, br + bi as ar,ai,br,bi ++- y = _mm_loadu_ps((float*) b); // Load the cr + ci, dr + di as cr,ci,dr,di ++- z = _mm_complexmul_ps(x, y); ++- _mm_storeu_ps((float*) c, z); // Store the results back into the C container ++- ++- a += 2; ++- b += 2; ++- c += 2; ++- } ++- ++- if((num_points % 2) != 0){ ++- *c = (*a) * (*b); ++- } +++ unsigned int number = 0; +++ const unsigned int halfPoints = num_points / 2; +++ +++ __m128 x, y, z; +++ lv_32fc_t* c = cVector; +++ const lv_32fc_t* a = aVector; +++ const lv_32fc_t* b = bVector; +++ +++ for (; number < halfPoints; number++) { +++ x = _mm_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi +++ y = _mm_loadu_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di +++ z = _mm_complexmul_ps(x, y); +++ _mm_storeu_ps((float*)c, z); // Store the results back into the C container +++ +++ a += 2; +++ b += 2; +++ c += 2; +++ } +++ +++ if ((num_points % 2) != 0) { +++ *c = (*a) * (*b); +++ } ++ } ++ #endif /* LV_HAVE_SSE */ ++ ++ ++ #ifdef 
LV_HAVE_GENERIC ++ ++-static inline void ++-volk_32fc_x2_multiply_32fc_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, ++- const lv_32fc_t* bVector, unsigned int num_points) +++static inline void volk_32fc_x2_multiply_32fc_generic(lv_32fc_t* cVector, +++ const lv_32fc_t* aVector, +++ const lv_32fc_t* bVector, +++ unsigned int num_points) ++ { ++- lv_32fc_t* cPtr = cVector; ++- const lv_32fc_t* aPtr = aVector; ++- const lv_32fc_t* bPtr= bVector; ++- unsigned int number = 0; ++- ++- for(number = 0; number < num_points; number++){ ++- *cPtr++ = (*aPtr++) * (*bPtr++); ++- } +++ lv_32fc_t* cPtr = cVector; +++ const lv_32fc_t* aPtr = aVector; +++ const lv_32fc_t* bPtr = bVector; +++ unsigned int number = 0; +++ +++ for (number = 0; number < num_points; number++) { +++ *cPtr++ = (*aPtr++) * (*bPtr++); +++ } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++@@ -215,55 +227,62 @@ volk_32fc_x2_multiply_32fc_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, ++ #ifndef INCLUDED_volk_32fc_x2_multiply_32fc_a_H ++ #define INCLUDED_volk_32fc_x2_multiply_32fc_a_H ++ +++#include ++ #include ++ #include ++ #include ++-#include ++ ++ #if LV_HAVE_AVX2 && LV_HAVE_FMA ++ #include ++ /*! ++- \brief Multiplies the two input complex vectors and stores their results in the third vector ++- \param cVector The vector where the results will be stored ++- \param aVector One of the vectors to be multiplied ++- \param bVector One of the vectors to be multiplied ++- \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector +++ \brief Multiplies the two input complex vectors and stores their results in the third +++ vector \param cVector The vector where the results will be stored \param aVector One of +++ the vectors to be multiplied \param bVector One of the vectors to be multiplied \param +++ num_points The number of complex values in aVector and bVector to be multiplied together +++ and stored into cVector ++ */ ++-static inline void volk_32fc_x2_multiply_32fc_a_avx2_fma(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){ ++- unsigned int number = 0; ++- const unsigned int quarterPoints = num_points / 4; +++static inline void volk_32fc_x2_multiply_32fc_a_avx2_fma(lv_32fc_t* cVector, +++ const lv_32fc_t* aVector, +++ const lv_32fc_t* bVector, +++ unsigned int num_points) +++{ +++ unsigned int number = 0; +++ const unsigned int quarterPoints = num_points / 4; ++ ++- lv_32fc_t* c = cVector; ++- const lv_32fc_t* a = aVector; ++- const lv_32fc_t* b = bVector; +++ lv_32fc_t* c = cVector; +++ const lv_32fc_t* a = aVector; +++ const lv_32fc_t* b = bVector; ++ ++- for(;number < quarterPoints; number++){ +++ for (; number < quarterPoints; number++) { ++ ++- const __m256 x = _mm256_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi ++- const __m256 y = _mm256_load_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di +++ const __m256 x = +++ _mm256_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi +++ const __m256 y = +++ _mm256_load_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di ++ ++- const __m256 yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr ++- const __m256 yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di +++ const __m256 yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr +++ const __m256 yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di ++ ++- const __m256 tmp2x = _mm256_permute_ps(x,0xB1); // Re-arrange x to be ai,ar,bi,br +++ const __m256 tmp2x 
= _mm256_permute_ps(x, 0xB1); // Re-arrange x to be ai,ar,bi,br ++ ++- const __m256 tmp2 = _mm256_mul_ps(tmp2x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di +++ const __m256 tmp2 = _mm256_mul_ps(tmp2x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di ++ ++- const __m256 z = _mm256_fmaddsub_ps(x, yl, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di +++ const __m256 z = _mm256_fmaddsub_ps( +++ x, yl, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di ++ ++- _mm256_store_ps((float*)c,z); // Store the results back into the C container +++ _mm256_store_ps((float*)c, z); // Store the results back into the C container ++ ++- a += 4; ++- b += 4; ++- c += 4; ++- } +++ a += 4; +++ b += 4; +++ c += 4; +++ } ++ ++- _mm256_zeroupper(); +++ _mm256_zeroupper(); ++ ++- number = quarterPoints * 4; ++- for(;number < num_points; number++){ ++- *c++ = (*a++) * (*b++); ++- } +++ number = quarterPoints * 4; +++ for (; number < num_points; number++) { +++ *c++ = (*a++) * (*b++); +++ } ++ } ++ #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA */ ++ ++@@ -272,34 +291,35 @@ static inline void volk_32fc_x2_multiply_32fc_a_avx2_fma(lv_32fc_t* cVector, con ++ #include ++ #include ++ ++-static inline void ++-volk_32fc_x2_multiply_32fc_a_avx(lv_32fc_t* cVector, const lv_32fc_t* aVector, ++- const lv_32fc_t* bVector, unsigned int num_points) +++static inline void volk_32fc_x2_multiply_32fc_a_avx(lv_32fc_t* cVector, +++ const lv_32fc_t* aVector, +++ const lv_32fc_t* bVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int quarterPoints = num_points / 4; ++- ++- __m256 x, y, z; ++- lv_32fc_t* c = cVector; ++- const lv_32fc_t* a = aVector; ++- const lv_32fc_t* b = bVector; ++- ++- for(; number < quarterPoints; number++){ ++- x = _mm256_load_ps((float*) a); // Load the ar + ai, br + bi ... as ar,ai,br,bi ... ++- y = _mm256_load_ps((float*) b); // Load the cr + ci, dr + di ... as cr,ci,dr,di ... ++- z = _mm256_complexmul_ps(x, y); ++- _mm256_store_ps((float*) c, z); // Store the results back into the C container ++- ++- a += 4; ++- b += 4; ++- c += 4; ++- } ++- ++- number = quarterPoints * 4; ++- ++- for(; number < num_points; number++){ ++- *c++ = (*a++) * (*b++); ++- } +++ unsigned int number = 0; +++ const unsigned int quarterPoints = num_points / 4; +++ +++ __m256 x, y, z; +++ lv_32fc_t* c = cVector; +++ const lv_32fc_t* a = aVector; +++ const lv_32fc_t* b = bVector; +++ +++ for (; number < quarterPoints; number++) { +++ x = _mm256_load_ps((float*)a); // Load the ar + ai, br + bi ... as ar,ai,br,bi ... +++ y = _mm256_load_ps((float*)b); // Load the cr + ci, dr + di ... as cr,ci,dr,di ... 
+++ z = _mm256_complexmul_ps(x, y); +++ _mm256_store_ps((float*)c, z); // Store the results back into the C container +++ +++ a += 4; +++ b += 4; +++ c += 4; +++ } +++ +++ number = quarterPoints * 4; +++ +++ for (; number < num_points; number++) { +++ *c++ = (*a++) * (*b++); +++ } ++ } ++ #endif /* LV_HAVE_AVX */ ++ ++@@ -307,50 +327,52 @@ volk_32fc_x2_multiply_32fc_a_avx(lv_32fc_t* cVector, const lv_32fc_t* aVector, ++ #include ++ #include ++ ++-static inline void ++-volk_32fc_x2_multiply_32fc_a_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, ++- const lv_32fc_t* bVector, unsigned int num_points) +++static inline void volk_32fc_x2_multiply_32fc_a_sse3(lv_32fc_t* cVector, +++ const lv_32fc_t* aVector, +++ const lv_32fc_t* bVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int halfPoints = num_points / 2; ++- ++- __m128 x, y, z; ++- lv_32fc_t* c = cVector; ++- const lv_32fc_t* a = aVector; ++- const lv_32fc_t* b = bVector; ++- ++- for(; number < halfPoints; number++){ ++- x = _mm_load_ps((float*) a); // Load the ar + ai, br + bi as ar,ai,br,bi ++- y = _mm_load_ps((float*) b); // Load the cr + ci, dr + di as cr,ci,dr,di ++- z = _mm_complexmul_ps(x, y); ++- _mm_store_ps((float*) c, z); // Store the results back into the C container ++- ++- a += 2; ++- b += 2; ++- c += 2; ++- } ++- ++- if((num_points % 2) != 0){ ++- *c = (*a) * (*b); ++- } +++ unsigned int number = 0; +++ const unsigned int halfPoints = num_points / 2; +++ +++ __m128 x, y, z; +++ lv_32fc_t* c = cVector; +++ const lv_32fc_t* a = aVector; +++ const lv_32fc_t* b = bVector; +++ +++ for (; number < halfPoints; number++) { +++ x = _mm_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi +++ y = _mm_load_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di +++ z = _mm_complexmul_ps(x, y); +++ _mm_store_ps((float*)c, z); // Store the results back into the C container +++ +++ a += 2; +++ b += 2; +++ c += 2; +++ } +++ +++ if ((num_points % 2) != 0) { +++ *c = (*a) * (*b); +++ } ++ } ++ #endif /* LV_HAVE_SSE */ ++ ++ ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void ++-volk_32fc_x2_multiply_32fc_a_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, ++- const lv_32fc_t* bVector, unsigned int num_points) +++static inline void volk_32fc_x2_multiply_32fc_a_generic(lv_32fc_t* cVector, +++ const lv_32fc_t* aVector, +++ const lv_32fc_t* bVector, +++ unsigned int num_points) ++ { ++- lv_32fc_t* cPtr = cVector; ++- const lv_32fc_t* aPtr = aVector; ++- const lv_32fc_t* bPtr= bVector; ++- unsigned int number = 0; ++- ++- for(number = 0; number < num_points; number++){ ++- *cPtr++ = (*aPtr++) * (*bPtr++); ++- } +++ lv_32fc_t* cPtr = cVector; +++ const lv_32fc_t* aPtr = aVector; +++ const lv_32fc_t* bPtr = bVector; +++ unsigned int number = 0; +++ +++ for (number = 0; number < num_points; number++) { +++ *cPtr++ = (*aPtr++) * (*bPtr++); +++ } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++@@ -358,113 +380,118 @@ volk_32fc_x2_multiply_32fc_a_generic(lv_32fc_t* cVector, const lv_32fc_t* aVecto ++ #ifdef LV_HAVE_NEON ++ #include ++ ++-static inline void ++-volk_32fc_x2_multiply_32fc_neon(lv_32fc_t* cVector, const lv_32fc_t* aVector, ++- const lv_32fc_t* bVector, unsigned int num_points) +++static inline void volk_32fc_x2_multiply_32fc_neon(lv_32fc_t* cVector, +++ const lv_32fc_t* aVector, +++ const lv_32fc_t* bVector, +++ unsigned int num_points) ++ { ++- lv_32fc_t *a_ptr = (lv_32fc_t*) aVector; ++- lv_32fc_t *b_ptr = (lv_32fc_t*) bVector; ++- unsigned int quarter_points = num_points / 4; ++- 
float32x4x2_t a_val, b_val, c_val; ++- float32x4x2_t tmp_real, tmp_imag; ++- unsigned int number = 0; ++- ++- for(number = 0; number < quarter_points; ++number) { ++- a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i ++- b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i ++- __VOLK_PREFETCH(a_ptr+4); ++- __VOLK_PREFETCH(b_ptr+4); ++- ++- // multiply the real*real and imag*imag to get real result ++- // a0r*b0r|a1r*b1r|a2r*b2r|a3r*b3r ++- tmp_real.val[0] = vmulq_f32(a_val.val[0], b_val.val[0]); ++- // a0i*b0i|a1i*b1i|a2i*b2i|a3i*b3i ++- tmp_real.val[1] = vmulq_f32(a_val.val[1], b_val.val[1]); ++- ++- // Multiply cross terms to get the imaginary result ++- // a0r*b0i|a1r*b1i|a2r*b2i|a3r*b3i ++- tmp_imag.val[0] = vmulq_f32(a_val.val[0], b_val.val[1]); ++- // a0i*b0r|a1i*b1r|a2i*b2r|a3i*b3r ++- tmp_imag.val[1] = vmulq_f32(a_val.val[1], b_val.val[0]); ++- ++- // store the results ++- c_val.val[0] = vsubq_f32(tmp_real.val[0], tmp_real.val[1]); ++- c_val.val[1] = vaddq_f32(tmp_imag.val[0], tmp_imag.val[1]); ++- vst2q_f32((float*)cVector, c_val); ++- ++- a_ptr += 4; ++- b_ptr += 4; ++- cVector += 4; ++- } ++- ++- for(number = quarter_points*4; number < num_points; number++){ ++- *cVector++ = (*a_ptr++) * (*b_ptr++); ++- } +++ lv_32fc_t* a_ptr = (lv_32fc_t*)aVector; +++ lv_32fc_t* b_ptr = (lv_32fc_t*)bVector; +++ unsigned int quarter_points = num_points / 4; +++ float32x4x2_t a_val, b_val, c_val; +++ float32x4x2_t tmp_real, tmp_imag; +++ unsigned int number = 0; +++ +++ for (number = 0; number < quarter_points; ++number) { +++ a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i +++ b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i +++ __VOLK_PREFETCH(a_ptr + 4); +++ __VOLK_PREFETCH(b_ptr + 4); +++ +++ // multiply the real*real and imag*imag to get real result +++ // a0r*b0r|a1r*b1r|a2r*b2r|a3r*b3r +++ tmp_real.val[0] = vmulq_f32(a_val.val[0], b_val.val[0]); +++ // a0i*b0i|a1i*b1i|a2i*b2i|a3i*b3i +++ tmp_real.val[1] = vmulq_f32(a_val.val[1], b_val.val[1]); +++ +++ // Multiply cross terms to get the imaginary result +++ // a0r*b0i|a1r*b1i|a2r*b2i|a3r*b3i +++ tmp_imag.val[0] = vmulq_f32(a_val.val[0], b_val.val[1]); +++ // a0i*b0r|a1i*b1r|a2i*b2r|a3i*b3r +++ tmp_imag.val[1] = vmulq_f32(a_val.val[1], b_val.val[0]); +++ +++ // store the results +++ c_val.val[0] = vsubq_f32(tmp_real.val[0], tmp_real.val[1]); +++ c_val.val[1] = vaddq_f32(tmp_imag.val[0], tmp_imag.val[1]); +++ vst2q_f32((float*)cVector, c_val); +++ +++ a_ptr += 4; +++ b_ptr += 4; +++ cVector += 4; +++ } +++ +++ for (number = quarter_points * 4; number < num_points; number++) { +++ *cVector++ = (*a_ptr++) * (*b_ptr++); +++ } ++ } ++ #endif /* LV_HAVE_NEON */ ++ ++ ++ #ifdef LV_HAVE_NEON ++ ++-static inline void ++-volk_32fc_x2_multiply_32fc_neon_opttests(lv_32fc_t* cVector, const lv_32fc_t* aVector, ++- const lv_32fc_t* bVector, unsigned int num_points) +++static inline void volk_32fc_x2_multiply_32fc_neon_opttests(lv_32fc_t* cVector, +++ const lv_32fc_t* aVector, +++ const lv_32fc_t* bVector, +++ unsigned int num_points) ++ { ++- lv_32fc_t *a_ptr = (lv_32fc_t*) aVector; ++- lv_32fc_t *b_ptr = (lv_32fc_t*) bVector; ++- unsigned int quarter_points = num_points / 4; ++- float32x4x2_t a_val, b_val; ++- float32x4x2_t tmp_imag; ++- unsigned int number = 0; ++- ++- for(number = 0; number < quarter_points; ++number) { ++- a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i ++- b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || 
b0i|b1i|b2i|b3i ++- __VOLK_PREFETCH(a_ptr+4); ++- __VOLK_PREFETCH(b_ptr+4); ++- ++- // do the first multiply ++- tmp_imag.val[1] = vmulq_f32(a_val.val[1], b_val.val[0]); ++- tmp_imag.val[0] = vmulq_f32(a_val.val[0], b_val.val[0]); ++- ++- // use multiply accumulate/subtract to get result ++- tmp_imag.val[1] = vmlaq_f32(tmp_imag.val[1], a_val.val[0], b_val.val[1]); ++- tmp_imag.val[0] = vmlsq_f32(tmp_imag.val[0], a_val.val[1], b_val.val[1]); ++- ++- // store ++- vst2q_f32((float*)cVector, tmp_imag); ++- // increment pointers ++- a_ptr += 4; ++- b_ptr += 4; ++- cVector += 4; ++- } ++- ++- for(number = quarter_points*4; number < num_points; number++){ ++- *cVector++ = (*a_ptr++) * (*b_ptr++); ++- } +++ lv_32fc_t* a_ptr = (lv_32fc_t*)aVector; +++ lv_32fc_t* b_ptr = (lv_32fc_t*)bVector; +++ unsigned int quarter_points = num_points / 4; +++ float32x4x2_t a_val, b_val; +++ float32x4x2_t tmp_imag; +++ unsigned int number = 0; +++ +++ for (number = 0; number < quarter_points; ++number) { +++ a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i +++ b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i +++ __VOLK_PREFETCH(a_ptr + 4); +++ __VOLK_PREFETCH(b_ptr + 4); +++ +++ // do the first multiply +++ tmp_imag.val[1] = vmulq_f32(a_val.val[1], b_val.val[0]); +++ tmp_imag.val[0] = vmulq_f32(a_val.val[0], b_val.val[0]); +++ +++ // use multiply accumulate/subtract to get result +++ tmp_imag.val[1] = vmlaq_f32(tmp_imag.val[1], a_val.val[0], b_val.val[1]); +++ tmp_imag.val[0] = vmlsq_f32(tmp_imag.val[0], a_val.val[1], b_val.val[1]); +++ +++ // store +++ vst2q_f32((float*)cVector, tmp_imag); +++ // increment pointers +++ a_ptr += 4; +++ b_ptr += 4; +++ cVector += 4; +++ } +++ +++ for (number = quarter_points * 4; number < num_points; number++) { +++ *cVector++ = (*a_ptr++) * (*b_ptr++); +++ } ++ } ++ #endif /* LV_HAVE_NEON */ ++ ++ ++ #ifdef LV_HAVE_NEONV7 ++ ++-extern void ++-volk_32fc_x2_multiply_32fc_a_neonasm(lv_32fc_t* cVector, const lv_32fc_t* aVector, ++- const lv_32fc_t* bVector, unsigned int num_points); +++extern void volk_32fc_x2_multiply_32fc_a_neonasm(lv_32fc_t* cVector, +++ const lv_32fc_t* aVector, +++ const lv_32fc_t* bVector, +++ unsigned int num_points); ++ #endif /* LV_HAVE_NEONV7 */ ++ ++ ++ #ifdef LV_HAVE_ORC ++ ++-extern void ++-volk_32fc_x2_multiply_32fc_a_orc_impl(lv_32fc_t* cVector, const lv_32fc_t* aVector, ++- const lv_32fc_t* bVector, unsigned int num_points); +++extern void volk_32fc_x2_multiply_32fc_a_orc_impl(lv_32fc_t* cVector, +++ const lv_32fc_t* aVector, +++ const lv_32fc_t* bVector, +++ unsigned int num_points); ++ ++-static inline void ++-volk_32fc_x2_multiply_32fc_u_orc(lv_32fc_t* cVector, const lv_32fc_t* aVector, ++- const lv_32fc_t* bVector, unsigned int num_points) +++static inline void volk_32fc_x2_multiply_32fc_u_orc(lv_32fc_t* cVector, +++ const lv_32fc_t* aVector, +++ const lv_32fc_t* bVector, +++ unsigned int num_points) ++ { ++- volk_32fc_x2_multiply_32fc_a_orc_impl(cVector, aVector, bVector, num_points); +++ volk_32fc_x2_multiply_32fc_a_orc_impl(cVector, aVector, bVector, num_points); ++ } ++ ++ #endif /* LV_HAVE_ORC */ ++diff --git a/kernels/volk/volk_32fc_x2_multiply_conjugate_32fc.h b/kernels/volk/volk_32fc_x2_multiply_conjugate_32fc.h ++index 1b1a8b3..4f834c2 100644 ++--- a/kernels/volk/volk_32fc_x2_multiply_conjugate_32fc.h +++++ b/kernels/volk/volk_32fc_x2_multiply_conjugate_32fc.h ++@@ -30,8 +30,8 @@ ++ * ++ * Dispatcher Prototype ++ * \code ++- * void volk_32fc_x2_multiply_conjugate_32fc(lv_32fc_t* cVector, const 
lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points); ++- * \endcode +++ * void volk_32fc_x2_multiply_conjugate_32fc(lv_32fc_t* cVector, const lv_32fc_t* aVector, +++ * const lv_32fc_t* bVector, unsigned int num_points); \endcode ++ * ++ * \b Inputs ++ * \li aVector: The first input vector of complex floats. ++@@ -71,43 +71,46 @@ ++ #ifndef INCLUDED_volk_32fc_x2_multiply_conjugate_32fc_u_H ++ #define INCLUDED_volk_32fc_x2_multiply_conjugate_32fc_u_H ++ +++#include ++ #include ++ #include ++ #include ++-#include ++ ++ #ifdef LV_HAVE_AVX ++ #include ++ #include ++ ++-static inline void ++-volk_32fc_x2_multiply_conjugate_32fc_u_avx(lv_32fc_t* cVector, const lv_32fc_t* aVector, ++- const lv_32fc_t* bVector, unsigned int num_points) +++static inline void volk_32fc_x2_multiply_conjugate_32fc_u_avx(lv_32fc_t* cVector, +++ const lv_32fc_t* aVector, +++ const lv_32fc_t* bVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int quarterPoints = num_points / 4; ++- ++- __m256 x, y, z; ++- lv_32fc_t* c = cVector; ++- const lv_32fc_t* a = aVector; ++- const lv_32fc_t* b = bVector; ++- ++- for(; number < quarterPoints; number++){ ++- x = _mm256_loadu_ps((float*) a); // Load the ar + ai, br + bi ... as ar,ai,br,bi ... ++- y = _mm256_loadu_ps((float*) b); // Load the cr + ci, dr + di ... as cr,ci,dr,di ... ++- z = _mm256_complexconjugatemul_ps(x, y); ++- _mm256_storeu_ps((float*) c, z); // Store the results back into the C container ++- ++- a += 4; ++- b += 4; ++- c += 4; ++- } ++- ++- number = quarterPoints * 4; ++- ++- for(; number < num_points; number++){ ++- *c++ = (*a++) * lv_conj(*b++); ++- } +++ unsigned int number = 0; +++ const unsigned int quarterPoints = num_points / 4; +++ +++ __m256 x, y, z; +++ lv_32fc_t* c = cVector; +++ const lv_32fc_t* a = aVector; +++ const lv_32fc_t* b = bVector; +++ +++ for (; number < quarterPoints; number++) { +++ x = _mm256_loadu_ps( +++ (float*)a); // Load the ar + ai, br + bi ... as ar,ai,br,bi ... +++ y = _mm256_loadu_ps( +++ (float*)b); // Load the cr + ci, dr + di ... as cr,ci,dr,di ... 
+++ z = _mm256_complexconjugatemul_ps(x, y); +++ _mm256_storeu_ps((float*)c, z); // Store the results back into the C container +++ +++ a += 4; +++ b += 4; +++ c += 4; +++ } +++ +++ number = quarterPoints * 4; +++ +++ for (; number < num_points; number++) { +++ *c++ = (*a++) * lv_conj(*b++); +++ } ++ } ++ #endif /* LV_HAVE_AVX */ ++ ++@@ -116,96 +119,98 @@ volk_32fc_x2_multiply_conjugate_32fc_u_avx(lv_32fc_t* cVector, const lv_32fc_t* ++ #include ++ #include ++ ++-static inline void ++-volk_32fc_x2_multiply_conjugate_32fc_u_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, ++- const lv_32fc_t* bVector, unsigned int num_points) +++static inline void volk_32fc_x2_multiply_conjugate_32fc_u_sse3(lv_32fc_t* cVector, +++ const lv_32fc_t* aVector, +++ const lv_32fc_t* bVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int halfPoints = num_points / 2; ++- ++- __m128 x, y, z; ++- lv_32fc_t* c = cVector; ++- const lv_32fc_t* a = aVector; ++- const lv_32fc_t* b = bVector; ++- ++- for(; number < halfPoints; number++){ ++- x = _mm_loadu_ps((float*) a); // Load the ar + ai, br + bi as ar,ai,br,bi ++- y = _mm_loadu_ps((float*) b); // Load the cr + ci, dr + di as cr,ci,dr,di ++- z = _mm_complexconjugatemul_ps(x, y); ++- _mm_storeu_ps((float*) c, z); // Store the results back into the C container ++- ++- a += 2; ++- b += 2; ++- c += 2; ++- } ++- ++- if((num_points % 2) != 0){ ++- *c = (*a) * lv_conj(*b); ++- } +++ unsigned int number = 0; +++ const unsigned int halfPoints = num_points / 2; +++ +++ __m128 x, y, z; +++ lv_32fc_t* c = cVector; +++ const lv_32fc_t* a = aVector; +++ const lv_32fc_t* b = bVector; +++ +++ for (; number < halfPoints; number++) { +++ x = _mm_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi +++ y = _mm_loadu_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di +++ z = _mm_complexconjugatemul_ps(x, y); +++ _mm_storeu_ps((float*)c, z); // Store the results back into the C container +++ +++ a += 2; +++ b += 2; +++ c += 2; +++ } +++ +++ if ((num_points % 2) != 0) { +++ *c = (*a) * lv_conj(*b); +++ } ++ } ++ #endif /* LV_HAVE_SSE */ ++ ++ ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void ++-volk_32fc_x2_multiply_conjugate_32fc_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, ++- const lv_32fc_t* bVector, unsigned int num_points) +++static inline void volk_32fc_x2_multiply_conjugate_32fc_generic(lv_32fc_t* cVector, +++ const lv_32fc_t* aVector, +++ const lv_32fc_t* bVector, +++ unsigned int num_points) ++ { ++- lv_32fc_t* cPtr = cVector; ++- const lv_32fc_t* aPtr = aVector; ++- const lv_32fc_t* bPtr= bVector; ++- unsigned int number = 0; ++- ++- for(number = 0; number < num_points; number++){ ++- *cPtr++ = (*aPtr++) * lv_conj(*bPtr++); ++- } +++ lv_32fc_t* cPtr = cVector; +++ const lv_32fc_t* aPtr = aVector; +++ const lv_32fc_t* bPtr = bVector; +++ unsigned int number = 0; +++ +++ for (number = 0; number < num_points; number++) { +++ *cPtr++ = (*aPtr++) * lv_conj(*bPtr++); +++ } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++ ++- ++ #endif /* INCLUDED_volk_32fc_x2_multiply_conjugate_32fc_u_H */ ++ #ifndef INCLUDED_volk_32fc_x2_multiply_conjugate_32fc_a_H ++ #define INCLUDED_volk_32fc_x2_multiply_conjugate_32fc_a_H ++ +++#include ++ #include ++ #include ++ #include ++-#include ++ ++ #ifdef LV_HAVE_AVX ++ #include ++ #include ++ ++-static inline void ++-volk_32fc_x2_multiply_conjugate_32fc_a_avx(lv_32fc_t* cVector, const lv_32fc_t* aVector, ++- const lv_32fc_t* bVector, unsigned int num_points) +++static inline void 
volk_32fc_x2_multiply_conjugate_32fc_a_avx(lv_32fc_t* cVector, +++ const lv_32fc_t* aVector, +++ const lv_32fc_t* bVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int quarterPoints = num_points / 4; ++- ++- __m256 x, y, z; ++- lv_32fc_t* c = cVector; ++- const lv_32fc_t* a = aVector; ++- const lv_32fc_t* b = bVector; ++- ++- for(; number < quarterPoints; number++){ ++- x = _mm256_load_ps((float*) a); // Load the ar + ai, br + bi ... as ar,ai,br,bi ... ++- y = _mm256_load_ps((float*) b); // Load the cr + ci, dr + di ... as cr,ci,dr,di ... ++- z = _mm256_complexconjugatemul_ps(x, y); ++- _mm256_store_ps((float*) c, z); // Store the results back into the C container ++- ++- a += 4; ++- b += 4; ++- c += 4; ++- } ++- ++- number = quarterPoints * 4; ++- ++- for(; number < num_points; number++){ ++- *c++ = (*a++) * lv_conj(*b++); ++- } +++ unsigned int number = 0; +++ const unsigned int quarterPoints = num_points / 4; +++ +++ __m256 x, y, z; +++ lv_32fc_t* c = cVector; +++ const lv_32fc_t* a = aVector; +++ const lv_32fc_t* b = bVector; +++ +++ for (; number < quarterPoints; number++) { +++ x = _mm256_load_ps((float*)a); // Load the ar + ai, br + bi ... as ar,ai,br,bi ... +++ y = _mm256_load_ps((float*)b); // Load the cr + ci, dr + di ... as cr,ci,dr,di ... +++ z = _mm256_complexconjugatemul_ps(x, y); +++ _mm256_store_ps((float*)c, z); // Store the results back into the C container +++ +++ a += 4; +++ b += 4; +++ c += 4; +++ } +++ +++ number = quarterPoints * 4; +++ +++ for (; number < num_points; number++) { +++ *c++ = (*a++) * lv_conj(*b++); +++ } ++ } ++ #endif /* LV_HAVE_AVX */ ++ ++@@ -214,32 +219,33 @@ volk_32fc_x2_multiply_conjugate_32fc_a_avx(lv_32fc_t* cVector, const lv_32fc_t* ++ #include ++ #include ++ ++-static inline void ++-volk_32fc_x2_multiply_conjugate_32fc_a_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, ++- const lv_32fc_t* bVector, unsigned int num_points) +++static inline void volk_32fc_x2_multiply_conjugate_32fc_a_sse3(lv_32fc_t* cVector, +++ const lv_32fc_t* aVector, +++ const lv_32fc_t* bVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int halfPoints = num_points / 2; ++- ++- __m128 x, y, z; ++- lv_32fc_t* c = cVector; ++- const lv_32fc_t* a = aVector; ++- const lv_32fc_t* b = bVector; ++- ++- for(; number < halfPoints; number++){ ++- x = _mm_load_ps((float*) a); // Load the ar + ai, br + bi as ar,ai,br,bi ++- y = _mm_load_ps((float*) b); // Load the cr + ci, dr + di as cr,ci,dr,di ++- z = _mm_complexconjugatemul_ps(x, y); ++- _mm_store_ps((float*) c, z); // Store the results back into the C container ++- ++- a += 2; ++- b += 2; ++- c += 2; ++- } ++- ++- if((num_points % 2) != 0){ ++- *c = (*a) * lv_conj(*b); ++- } +++ unsigned int number = 0; +++ const unsigned int halfPoints = num_points / 2; +++ +++ __m128 x, y, z; +++ lv_32fc_t* c = cVector; +++ const lv_32fc_t* a = aVector; +++ const lv_32fc_t* b = bVector; +++ +++ for (; number < halfPoints; number++) { +++ x = _mm_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi +++ y = _mm_load_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di +++ z = _mm_complexconjugatemul_ps(x, y); +++ _mm_store_ps((float*)c, z); // Store the results back into the C container +++ +++ a += 2; +++ b += 2; +++ c += 2; +++ } +++ +++ if ((num_points % 2) != 0) { +++ *c = (*a) * lv_conj(*b); +++ } ++ } ++ #endif /* LV_HAVE_SSE */ ++ ++@@ -247,49 +253,50 @@ volk_32fc_x2_multiply_conjugate_32fc_a_sse3(lv_32fc_t* cVector, const lv_32fc_t* ++ 
#ifdef LV_HAVE_NEON ++ #include ++ ++-static inline void ++-volk_32fc_x2_multiply_conjugate_32fc_neon(lv_32fc_t* cVector, const lv_32fc_t* aVector, ++- const lv_32fc_t* bVector, unsigned int num_points) +++static inline void volk_32fc_x2_multiply_conjugate_32fc_neon(lv_32fc_t* cVector, +++ const lv_32fc_t* aVector, +++ const lv_32fc_t* bVector, +++ unsigned int num_points) ++ { ++- lv_32fc_t *a_ptr = (lv_32fc_t*) aVector; ++- lv_32fc_t *b_ptr = (lv_32fc_t*) bVector; ++- unsigned int quarter_points = num_points / 4; ++- float32x4x2_t a_val, b_val, c_val; ++- float32x4x2_t tmp_real, tmp_imag; ++- unsigned int number = 0; ++- ++- for(number = 0; number < quarter_points; ++number) { ++- a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i ++- b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i ++- b_val.val[1] = vnegq_f32(b_val.val[1]); ++- __VOLK_PREFETCH(a_ptr+4); ++- __VOLK_PREFETCH(b_ptr+4); ++- ++- // multiply the real*real and imag*imag to get real result ++- // a0r*b0r|a1r*b1r|a2r*b2r|a3r*b3r ++- tmp_real.val[0] = vmulq_f32(a_val.val[0], b_val.val[0]); ++- // a0i*b0i|a1i*b1i|a2i*b2i|a3i*b3i ++- tmp_real.val[1] = vmulq_f32(a_val.val[1], b_val.val[1]); ++- ++- // Multiply cross terms to get the imaginary result +++ lv_32fc_t* a_ptr = (lv_32fc_t*)aVector; +++ lv_32fc_t* b_ptr = (lv_32fc_t*)bVector; +++ unsigned int quarter_points = num_points / 4; +++ float32x4x2_t a_val, b_val, c_val; +++ float32x4x2_t tmp_real, tmp_imag; +++ unsigned int number = 0; +++ +++ for (number = 0; number < quarter_points; ++number) { +++ a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i +++ b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i +++ b_val.val[1] = vnegq_f32(b_val.val[1]); +++ __VOLK_PREFETCH(a_ptr + 4); +++ __VOLK_PREFETCH(b_ptr + 4); +++ +++ // multiply the real*real and imag*imag to get real result +++ // a0r*b0r|a1r*b1r|a2r*b2r|a3r*b3r +++ tmp_real.val[0] = vmulq_f32(a_val.val[0], b_val.val[0]); +++ // a0i*b0i|a1i*b1i|a2i*b2i|a3i*b3i +++ tmp_real.val[1] = vmulq_f32(a_val.val[1], b_val.val[1]); +++ +++ // Multiply cross terms to get the imaginary result ++ // a0r*b0i|a1r*b1i|a2r*b2i|a3r*b3i ++- tmp_imag.val[0] = vmulq_f32(a_val.val[0], b_val.val[1]); ++- // a0i*b0r|a1i*b1r|a2i*b2r|a3i*b3r ++- tmp_imag.val[1] = vmulq_f32(a_val.val[1], b_val.val[0]); ++- ++- // store the results ++- c_val.val[0] = vsubq_f32(tmp_real.val[0], tmp_real.val[1]); ++- c_val.val[1] = vaddq_f32(tmp_imag.val[0], tmp_imag.val[1]); ++- vst2q_f32((float*)cVector, c_val); ++- ++- a_ptr += 4; ++- b_ptr += 4; ++- cVector += 4; +++ tmp_imag.val[0] = vmulq_f32(a_val.val[0], b_val.val[1]); +++ // a0i*b0r|a1i*b1r|a2i*b2r|a3i*b3r +++ tmp_imag.val[1] = vmulq_f32(a_val.val[1], b_val.val[0]); +++ +++ // store the results +++ c_val.val[0] = vsubq_f32(tmp_real.val[0], tmp_real.val[1]); +++ c_val.val[1] = vaddq_f32(tmp_imag.val[0], tmp_imag.val[1]); +++ vst2q_f32((float*)cVector, c_val); +++ +++ a_ptr += 4; +++ b_ptr += 4; +++ cVector += 4; ++ } ++ ++- for(number = quarter_points*4; number < num_points; number++){ ++- *cVector++ = (*a_ptr++) * conj(*b_ptr++); ++- } +++ for (number = quarter_points * 4; number < num_points; number++) { +++ *cVector++ = (*a_ptr++) * conj(*b_ptr++); +++ } ++ } ++ #endif /* LV_HAVE_NEON */ ++ ++@@ -297,17 +304,19 @@ volk_32fc_x2_multiply_conjugate_32fc_neon(lv_32fc_t* cVector, const lv_32fc_t* a ++ #ifdef LV_HAVE_GENERIC ++ ++ static inline void ++-volk_32fc_x2_multiply_conjugate_32fc_a_generic(lv_32fc_t* cVector, const lv_32fc_t* 
aVector, ++- const lv_32fc_t* bVector, unsigned int num_points) +++volk_32fc_x2_multiply_conjugate_32fc_a_generic(lv_32fc_t* cVector, +++ const lv_32fc_t* aVector, +++ const lv_32fc_t* bVector, +++ unsigned int num_points) ++ { ++- lv_32fc_t* cPtr = cVector; ++- const lv_32fc_t* aPtr = aVector; ++- const lv_32fc_t* bPtr= bVector; ++- unsigned int number = 0; ++- ++- for(number = 0; number < num_points; number++){ ++- *cPtr++ = (*aPtr++) * lv_conj(*bPtr++); ++- } +++ lv_32fc_t* cPtr = cVector; +++ const lv_32fc_t* aPtr = aVector; +++ const lv_32fc_t* bPtr = bVector; +++ unsigned int number = 0; +++ +++ for (number = 0; number < num_points; number++) { +++ *cPtr++ = (*aPtr++) * lv_conj(*bPtr++); +++ } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++diff --git a/kernels/volk/volk_32fc_x2_s32f_square_dist_scalar_mult_32f.h b/kernels/volk/volk_32fc_x2_s32f_square_dist_scalar_mult_32f.h ++index 1c65f23..1d10561 100644 ++--- a/kernels/volk/volk_32fc_x2_s32f_square_dist_scalar_mult_32f.h +++++ b/kernels/volk/volk_32fc_x2_s32f_square_dist_scalar_mult_32f.h ++@@ -30,8 +30,8 @@ ++ * ++ * Dispatcher Prototype ++ * \code ++- * void volk_32fc_x2_s32f_square_dist_scalar_mult_32f(float* target, lv_32fc_t* src0, lv_32fc_t* points, float scalar, unsigned int num_points) ++- * \endcode +++ * void volk_32fc_x2_s32f_square_dist_scalar_mult_32f(float* target, lv_32fc_t* src0, +++ * lv_32fc_t* points, float scalar, unsigned int num_points) \endcode ++ * ++ * \b Inputs ++ * \li src0: The complex input. Only the first point is used. ++@@ -79,103 +79,107 @@ ++ #ifndef INCLUDED_volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_H ++ #define INCLUDED_volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_H ++ ++-#include +++#include ++ ++ ++-static inline void ++-calculate_scaled_distances(float* target, const lv_32fc_t symbol, const lv_32fc_t* points, ++- const float scalar, const unsigned int num_points) +++static inline void calculate_scaled_distances(float* target, +++ const lv_32fc_t symbol, +++ const lv_32fc_t* points, +++ const float scalar, +++ const unsigned int num_points) ++ { ++- lv_32fc_t diff; ++- for(unsigned int i = 0; i < num_points; ++i) { ++- /* ++- * Calculate: |y - x|^2 * SNR_lin ++- * Compare C++: *target++ = scalar * std::norm(symbol - *constellation++); ++- */ ++- diff = symbol - *points++; ++- *target++ = scalar * (lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff)); ++- } +++ lv_32fc_t diff; +++ for (unsigned int i = 0; i < num_points; ++i) { +++ /* +++ * Calculate: |y - x|^2 * SNR_lin +++ * Compare C++: *target++ = scalar * std::norm(symbol - *constellation++); +++ */ +++ diff = symbol - *points++; +++ *target++ = +++ scalar * (lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff)); +++ } ++ } ++ ++ ++ #ifdef LV_HAVE_AVX2 ++-#include ++-#include +++#include +++#include ++ ++ static inline void ++-volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_avx2(float* target, lv_32fc_t* src0, ++- lv_32fc_t* points, float scalar, +++volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_avx2(float* target, +++ lv_32fc_t* src0, +++ lv_32fc_t* points, +++ float scalar, ++ unsigned int num_points) ++ { ++- const unsigned int num_bytes = num_points*8; ++- __m128 xmm9, xmm10; ++- __m256 xmm4, xmm6; ++- __m256 xmm_points0, xmm_points1, xmm_result; +++ const unsigned int num_bytes = num_points * 8; +++ __m128 xmm9, xmm10; +++ __m256 xmm4, xmm6; +++ __m256 xmm_points0, xmm_points1, xmm_result; ++ ++- const unsigned int bound = num_bytes >> 6; ++- ++- // load complex value into all parts of the register. 
++- const __m256 xmm_symbol = _mm256_castpd_ps(_mm256_broadcast_sd((const double*)src0)); ++- const __m128 xmm128_symbol = _mm256_extractf128_ps(xmm_symbol, 1); ++- ++- // Load scalar into all 8 parts of the register ++- const __m256 xmm_scalar = _mm256_broadcast_ss(&scalar); ++- const __m128 xmm128_scalar = _mm256_extractf128_ps(xmm_scalar, 1); +++ const unsigned int bound = num_bytes >> 6; ++ ++- // Set permutation constant ++- const __m256i idx = _mm256_set_epi32(7,6,3,2,5,4,1,0); ++- ++- for(unsigned int i = 0; i < bound; ++i) { ++- xmm_points0 = _mm256_load_ps((float*)points); ++- xmm_points1 = _mm256_load_ps((float*)(points + 4)); ++- points += 8; ++- __VOLK_PREFETCH(points); +++ // load complex value into all parts of the register. +++ const __m256 xmm_symbol = _mm256_castpd_ps(_mm256_broadcast_sd((const double*)src0)); +++ const __m128 xmm128_symbol = _mm256_extractf128_ps(xmm_symbol, 1); ++ ++- xmm_result = _mm256_scaled_norm_dist_ps_avx2(xmm_symbol, xmm_symbol, ++- xmm_points0, xmm_points1, ++- xmm_scalar); ++- ++- _mm256_store_ps(target, xmm_result); ++- target += 8; ++- } +++ // Load scalar into all 8 parts of the register +++ const __m256 xmm_scalar = _mm256_broadcast_ss(&scalar); +++ const __m128 xmm128_scalar = _mm256_extractf128_ps(xmm_scalar, 1); ++ ++- if (num_bytes >> 5 & 1) { ++- xmm_points0 = _mm256_load_ps((float*)points); +++ // Set permutation constant +++ const __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0); ++ ++- xmm4 = _mm256_sub_ps(xmm_symbol, xmm_points0); +++ for (unsigned int i = 0; i < bound; ++i) { +++ xmm_points0 = _mm256_load_ps((float*)points); +++ xmm_points1 = _mm256_load_ps((float*)(points + 4)); +++ points += 8; +++ __VOLK_PREFETCH(points); ++ ++- points += 4; +++ xmm_result = _mm256_scaled_norm_dist_ps_avx2( +++ xmm_symbol, xmm_symbol, xmm_points0, xmm_points1, xmm_scalar); ++ ++- xmm6 = _mm256_mul_ps(xmm4, xmm4); +++ _mm256_store_ps(target, xmm_result); +++ target += 8; +++ } ++ ++- xmm4 = _mm256_hadd_ps(xmm6, xmm6); ++- xmm4 = _mm256_permutevar8x32_ps(xmm4, idx); +++ if (num_bytes >> 5 & 1) { +++ xmm_points0 = _mm256_load_ps((float*)points); ++ ++- xmm_result = _mm256_mul_ps(xmm4, xmm_scalar); +++ xmm4 = _mm256_sub_ps(xmm_symbol, xmm_points0); ++ ++- xmm9 = _mm256_extractf128_ps(xmm_result, 1); ++- _mm_store_ps(target,xmm9); ++- target += 4; ++- } +++ points += 4; ++ ++- if (num_bytes >> 4 & 1) { ++- xmm9 = _mm_load_ps((float*)points); +++ xmm6 = _mm256_mul_ps(xmm4, xmm4); ++ ++- xmm10 = _mm_sub_ps(xmm128_symbol, xmm9); +++ xmm4 = _mm256_hadd_ps(xmm6, xmm6); +++ xmm4 = _mm256_permutevar8x32_ps(xmm4, idx); ++ ++- points += 2; +++ xmm_result = _mm256_mul_ps(xmm4, xmm_scalar); ++ ++- xmm9 = _mm_mul_ps(xmm10, xmm10); +++ xmm9 = _mm256_extractf128_ps(xmm_result, 1); +++ _mm_store_ps(target, xmm9); +++ target += 4; +++ } ++ ++- xmm10 = _mm_hadd_ps(xmm9, xmm9); +++ if (num_bytes >> 4 & 1) { +++ xmm9 = _mm_load_ps((float*)points); ++ ++- xmm10 = _mm_mul_ps(xmm10, xmm128_scalar); +++ xmm10 = _mm_sub_ps(xmm128_symbol, xmm9); ++ ++- _mm_storeh_pi((__m64*)target, xmm10); ++- target += 2; ++- } +++ points += 2; ++ ++- calculate_scaled_distances(target, src0[0], points, scalar, (num_bytes >> 3) & 1); +++ xmm9 = _mm_mul_ps(xmm10, xmm10); +++ +++ xmm10 = _mm_hadd_ps(xmm9, xmm9); +++ +++ xmm10 = _mm_mul_ps(xmm10, xmm128_scalar); +++ +++ _mm_storeh_pi((__m64*)target, xmm10); +++ target += 2; +++ } +++ +++ calculate_scaled_distances(target, src0[0], points, scalar, (num_bytes >> 3) & 1); ++ } ++ ++ #endif /*LV_HAVE_AVX2*/ ++@@ -186,131 +190,139 @@ 
volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_avx2(float* target, lv_32fc_t* s ++ #include ++ ++ static inline void ++-volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_avx(float *target, lv_32fc_t *src0, ++- lv_32fc_t *points, float scalar, ++- unsigned int num_points) { ++- const int eightsPoints = num_points / 8; ++- const int remainder = num_points - 8 * eightsPoints; ++- ++- __m256 xmm_points0, xmm_points1, xmm_result; ++- ++- // load complex value into all parts of the register. ++- const __m256 xmm_symbol = _mm256_castpd_ps(_mm256_broadcast_sd((const double*)src0)); ++- ++- // Load scalar into all 8 parts of the register ++- const __m256 xmm_scalar = _mm256_broadcast_ss(&scalar); ++- ++- for(int i = 0; i < eightsPoints; ++i){ ++- xmm_points0 = _mm256_load_ps((float*)points); ++- xmm_points1 = _mm256_load_ps((float*)(points + 4)); ++- points += 8; ++- ++- xmm_result = _mm256_scaled_norm_dist_ps(xmm_symbol, xmm_symbol, xmm_points0, ++- xmm_points1, xmm_scalar); ++- ++- _mm256_store_ps(target, xmm_result); ++- target += 8; ++- } ++- ++- const lv_32fc_t symbol = *src0; ++- calculate_scaled_distances(target, symbol, points, scalar, remainder); +++volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_avx(float* target, +++ lv_32fc_t* src0, +++ lv_32fc_t* points, +++ float scalar, +++ unsigned int num_points) +++{ +++ const int eightsPoints = num_points / 8; +++ const int remainder = num_points - 8 * eightsPoints; +++ +++ __m256 xmm_points0, xmm_points1, xmm_result; +++ +++ // load complex value into all parts of the register. +++ const __m256 xmm_symbol = _mm256_castpd_ps(_mm256_broadcast_sd((const double*)src0)); +++ +++ // Load scalar into all 8 parts of the register +++ const __m256 xmm_scalar = _mm256_broadcast_ss(&scalar); +++ +++ for (int i = 0; i < eightsPoints; ++i) { +++ xmm_points0 = _mm256_load_ps((float*)points); +++ xmm_points1 = _mm256_load_ps((float*)(points + 4)); +++ points += 8; +++ +++ xmm_result = _mm256_scaled_norm_dist_ps( +++ xmm_symbol, xmm_symbol, xmm_points0, xmm_points1, xmm_scalar); +++ +++ _mm256_store_ps(target, xmm_result); +++ target += 8; +++ } +++ +++ const lv_32fc_t symbol = *src0; +++ calculate_scaled_distances(target, symbol, points, scalar, remainder); ++ } ++ ++ #endif /* LV_HAVE_AVX */ ++ ++ ++ #ifdef LV_HAVE_SSE3 ++-#include ++-#include +++#include +++#include ++ ++ static inline void ++-volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_sse3(float* target, lv_32fc_t* src0, ++- lv_32fc_t* points, float scalar, +++volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_sse3(float* target, +++ lv_32fc_t* src0, +++ lv_32fc_t* points, +++ float scalar, ++ unsigned int num_points) ++ { ++- __m128 xmm_points0, xmm_points1, xmm_result; ++- ++- /* ++- * First do 4 values in every loop iteration. ++- * There may be up to 3 values left. ++- * leftovers0 indicates if at least 2 more are available for SSE execution. ++- * leftovers1 indicates if there is a single element left. ++- */ ++- const int quarterPoints = num_points / 4; ++- const int leftovers0 = (num_points / 2) - 2 * quarterPoints; ++- const int leftovers1 = num_points % 2; ++- ++- // load complex value into both parts of the register. 
++- const __m128 xmm_symbol = _mm_castpd_ps(_mm_load1_pd((const double*)src0)); ++- ++- // Load scalar into all 4 parts of the register ++- const __m128 xmm_scalar = _mm_load1_ps(&scalar); ++- ++- for(int i = 0; i < quarterPoints; ++i) { ++- xmm_points0 = _mm_load_ps((float*)points); ++- xmm_points1 = _mm_load_ps((float*)(points + 2)); ++- points += 4; ++- __VOLK_PREFETCH(points); ++- // calculate distances ++- xmm_result = _mm_scaled_norm_dist_ps_sse3(xmm_symbol, xmm_symbol, xmm_points0, ++- xmm_points1, xmm_scalar); ++- ++- _mm_store_ps(target, xmm_result); ++- target += 4; ++- } ++- ++- for(int i = 0; i < leftovers0; ++i) { ++- xmm_points0 = _mm_load_ps((float*)points); ++- points += 2; ++- ++- xmm_points0 = _mm_sub_ps(xmm_symbol, xmm_points0); ++- xmm_points0 = _mm_mul_ps(xmm_points0, xmm_points0); ++- xmm_points0 = _mm_hadd_ps(xmm_points0, xmm_points0); ++- xmm_result = _mm_mul_ps(xmm_points0, xmm_scalar); ++- ++- _mm_storeh_pi((__m64*)target, xmm_result); ++- target += 2; ++- } ++- ++- calculate_scaled_distances(target, src0[0], points, scalar, leftovers1); +++ __m128 xmm_points0, xmm_points1, xmm_result; +++ +++ /* +++ * First do 4 values in every loop iteration. +++ * There may be up to 3 values left. +++ * leftovers0 indicates if at least 2 more are available for SSE execution. +++ * leftovers1 indicates if there is a single element left. +++ */ +++ const int quarterPoints = num_points / 4; +++ const int leftovers0 = (num_points / 2) - 2 * quarterPoints; +++ const int leftovers1 = num_points % 2; +++ +++ // load complex value into both parts of the register. +++ const __m128 xmm_symbol = _mm_castpd_ps(_mm_load1_pd((const double*)src0)); +++ +++ // Load scalar into all 4 parts of the register +++ const __m128 xmm_scalar = _mm_load1_ps(&scalar); +++ +++ for (int i = 0; i < quarterPoints; ++i) { +++ xmm_points0 = _mm_load_ps((float*)points); +++ xmm_points1 = _mm_load_ps((float*)(points + 2)); +++ points += 4; +++ __VOLK_PREFETCH(points); +++ // calculate distances +++ xmm_result = _mm_scaled_norm_dist_ps_sse3( +++ xmm_symbol, xmm_symbol, xmm_points0, xmm_points1, xmm_scalar); +++ +++ _mm_store_ps(target, xmm_result); +++ target += 4; +++ } +++ +++ for (int i = 0; i < leftovers0; ++i) { +++ xmm_points0 = _mm_load_ps((float*)points); +++ points += 2; +++ +++ xmm_points0 = _mm_sub_ps(xmm_symbol, xmm_points0); +++ xmm_points0 = _mm_mul_ps(xmm_points0, xmm_points0); +++ xmm_points0 = _mm_hadd_ps(xmm_points0, xmm_points0); +++ xmm_result = _mm_mul_ps(xmm_points0, xmm_scalar); +++ +++ _mm_storeh_pi((__m64*)target, xmm_result); +++ target += 2; +++ } +++ +++ calculate_scaled_distances(target, src0[0], points, scalar, leftovers1); ++ } ++ ++ #endif /*LV_HAVE_SSE3*/ ++ ++ #ifdef LV_HAVE_SSE ++-#include ++ #include +++#include ++ static inline void ++-volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_sse(float* target, lv_32fc_t* src0, ++- lv_32fc_t* points, float scalar, +++volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_sse(float* target, +++ lv_32fc_t* src0, +++ lv_32fc_t* points, +++ float scalar, ++ unsigned int num_points) ++ { ++- const __m128 xmm_scalar = _mm_set1_ps(scalar); ++- const __m128 xmm_symbol = _mm_castpd_ps(_mm_load1_pd((const double*)src0)); ++- ++- for (unsigned i = 0; i < num_points / 4; ++i) { ++- __m128 xmm_points0 = _mm_load_ps((float *) points); ++- __m128 xmm_points1 = _mm_load_ps((float *) (points + 2)); ++- points += 4; ++- __m128 xmm_result = _mm_scaled_norm_dist_ps_sse(xmm_symbol, xmm_symbol, ++- xmm_points0, xmm_points1, ++- xmm_scalar); ++- _mm_store_ps((float 
*) target, xmm_result); ++- target += 4; ++- } ++- ++- calculate_scaled_distances(target, src0[0], points, scalar, num_points % 4); +++ const __m128 xmm_scalar = _mm_set1_ps(scalar); +++ const __m128 xmm_symbol = _mm_castpd_ps(_mm_load1_pd((const double*)src0)); +++ +++ for (unsigned i = 0; i < num_points / 4; ++i) { +++ __m128 xmm_points0 = _mm_load_ps((float*)points); +++ __m128 xmm_points1 = _mm_load_ps((float*)(points + 2)); +++ points += 4; +++ __m128 xmm_result = _mm_scaled_norm_dist_ps_sse( +++ xmm_symbol, xmm_symbol, xmm_points0, xmm_points1, xmm_scalar); +++ _mm_store_ps((float*)target, xmm_result); +++ target += 4; +++ } +++ +++ calculate_scaled_distances(target, src0[0], points, scalar, num_points % 4); ++ } ++ #endif // LV_HAVE_SSE ++ ++ #ifdef LV_HAVE_GENERIC ++ static inline void ++-volk_32fc_x2_s32f_square_dist_scalar_mult_32f_generic(float* target, lv_32fc_t* src0, ++- lv_32fc_t* points, float scalar, +++volk_32fc_x2_s32f_square_dist_scalar_mult_32f_generic(float* target, +++ lv_32fc_t* src0, +++ lv_32fc_t* points, +++ float scalar, ++ unsigned int num_points) ++ { ++- const lv_32fc_t symbol = *src0; ++- calculate_scaled_distances(target, symbol, points, scalar, num_points); +++ const lv_32fc_t symbol = *src0; +++ calculate_scaled_distances(target, symbol, points, scalar, num_points); ++ } ++ ++ #endif /*LV_HAVE_GENERIC*/ ++@@ -321,87 +333,88 @@ volk_32fc_x2_s32f_square_dist_scalar_mult_32f_generic(float* target, lv_32fc_t* ++ #ifndef INCLUDED_volk_32fc_x2_s32f_square_dist_scalar_mult_32f_u_H ++ #define INCLUDED_volk_32fc_x2_s32f_square_dist_scalar_mult_32f_u_H ++ ++-#include +++#include ++ ++ ++ #ifdef LV_HAVE_AVX2 ++-#include +++#include ++ #include ++ ++ static inline void ++-volk_32fc_x2_s32f_square_dist_scalar_mult_32f_u_avx2(float* target, lv_32fc_t* src0, ++- lv_32fc_t* points, float scalar, +++volk_32fc_x2_s32f_square_dist_scalar_mult_32f_u_avx2(float* target, +++ lv_32fc_t* src0, +++ lv_32fc_t* points, +++ float scalar, ++ unsigned int num_points) ++ { ++- const unsigned int num_bytes = num_points*8; ++- __m128 xmm9, xmm10; ++- __m256 xmm4, xmm6; ++- __m256 xmm_points0, xmm_points1, xmm_result; +++ const unsigned int num_bytes = num_points * 8; +++ __m128 xmm9, xmm10; +++ __m256 xmm4, xmm6; +++ __m256 xmm_points0, xmm_points1, xmm_result; +++ +++ const unsigned int bound = num_bytes >> 6; +++ +++ // load complex value into all parts of the register. +++ const __m256 xmm_symbol = _mm256_castpd_ps(_mm256_broadcast_sd((const double*)src0)); +++ const __m128 xmm128_symbol = _mm256_extractf128_ps(xmm_symbol, 1); +++ +++ // Load scalar into all 8 parts of the register +++ const __m256 xmm_scalar = _mm256_broadcast_ss(&scalar); +++ const __m128 xmm128_scalar = _mm256_extractf128_ps(xmm_scalar, 1); ++ ++- const unsigned int bound = num_bytes >> 6; ++- ++- // load complex value into all parts of the register. 
++- const __m256 xmm_symbol = _mm256_castpd_ps(_mm256_broadcast_sd((const double*)src0)); ++- const __m128 xmm128_symbol = _mm256_extractf128_ps(xmm_symbol, 1); ++- ++- // Load scalar into all 8 parts of the register ++- const __m256 xmm_scalar = _mm256_broadcast_ss(&scalar); ++- const __m128 xmm128_scalar = _mm256_extractf128_ps(xmm_scalar, 1); +++ // Set permutation constant +++ const __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0); ++ ++- // Set permutation constant ++- const __m256i idx = _mm256_set_epi32(7,6,3,2,5,4,1,0); ++- ++- for(unsigned int i = 0; i < bound; ++i) { ++- xmm_points0 = _mm256_loadu_ps((float*)points); ++- xmm_points1 = _mm256_loadu_ps((float*)(points + 4)); ++- points += 8; ++- __VOLK_PREFETCH(points); +++ for (unsigned int i = 0; i < bound; ++i) { +++ xmm_points0 = _mm256_loadu_ps((float*)points); +++ xmm_points1 = _mm256_loadu_ps((float*)(points + 4)); +++ points += 8; +++ __VOLK_PREFETCH(points); ++ ++- xmm_result = _mm256_scaled_norm_dist_ps_avx2(xmm_symbol, xmm_symbol, ++- xmm_points0, xmm_points1, ++- xmm_scalar); ++- ++- _mm256_storeu_ps(target, xmm_result); ++- target += 8; ++- } +++ xmm_result = _mm256_scaled_norm_dist_ps_avx2( +++ xmm_symbol, xmm_symbol, xmm_points0, xmm_points1, xmm_scalar); ++ ++- if (num_bytes >> 5 & 1) { ++- xmm_points0 = _mm256_loadu_ps((float*)points); +++ _mm256_storeu_ps(target, xmm_result); +++ target += 8; +++ } ++ ++- xmm4 = _mm256_sub_ps(xmm_symbol, xmm_points0); +++ if (num_bytes >> 5 & 1) { +++ xmm_points0 = _mm256_loadu_ps((float*)points); ++ ++- points += 4; +++ xmm4 = _mm256_sub_ps(xmm_symbol, xmm_points0); ++ ++- xmm6 = _mm256_mul_ps(xmm4, xmm4); +++ points += 4; ++ ++- xmm4 = _mm256_hadd_ps(xmm6, xmm6); ++- xmm4 = _mm256_permutevar8x32_ps(xmm4, idx); +++ xmm6 = _mm256_mul_ps(xmm4, xmm4); ++ ++- xmm_result = _mm256_mul_ps(xmm4, xmm_scalar); +++ xmm4 = _mm256_hadd_ps(xmm6, xmm6); +++ xmm4 = _mm256_permutevar8x32_ps(xmm4, idx); ++ ++- xmm9 = _mm256_extractf128_ps(xmm_result, 1); ++- _mm_storeu_ps(target,xmm9); ++- target += 4; ++- } +++ xmm_result = _mm256_mul_ps(xmm4, xmm_scalar); ++ ++- if (num_bytes >> 4 & 1) { ++- xmm9 = _mm_loadu_ps((float*)points); +++ xmm9 = _mm256_extractf128_ps(xmm_result, 1); +++ _mm_storeu_ps(target, xmm9); +++ target += 4; +++ } ++ ++- xmm10 = _mm_sub_ps(xmm128_symbol, xmm9); +++ if (num_bytes >> 4 & 1) { +++ xmm9 = _mm_loadu_ps((float*)points); ++ ++- points += 2; +++ xmm10 = _mm_sub_ps(xmm128_symbol, xmm9); ++ ++- xmm9 = _mm_mul_ps(xmm10, xmm10); +++ points += 2; ++ ++- xmm10 = _mm_hadd_ps(xmm9, xmm9); +++ xmm9 = _mm_mul_ps(xmm10, xmm10); ++ ++- xmm10 = _mm_mul_ps(xmm10, xmm128_scalar); +++ xmm10 = _mm_hadd_ps(xmm9, xmm9); ++ ++- _mm_storeh_pi((__m64*)target, xmm10); ++- target += 2; ++- } +++ xmm10 = _mm_mul_ps(xmm10, xmm128_scalar); ++ ++- calculate_scaled_distances(target, src0[0], points, scalar, (num_bytes >> 3) & 1); +++ _mm_storeh_pi((__m64*)target, xmm10); +++ target += 2; +++ } +++ +++ calculate_scaled_distances(target, src0[0], points, scalar, (num_bytes >> 3) & 1); ++ } ++ ++ #endif /*LV_HAVE_AVX2*/ ++@@ -412,120 +425,126 @@ volk_32fc_x2_s32f_square_dist_scalar_mult_32f_u_avx2(float* target, lv_32fc_t* s ++ #include ++ ++ static inline void ++-volk_32fc_x2_s32f_square_dist_scalar_mult_32f_u_avx(float *target, lv_32fc_t *src0, ++- lv_32fc_t *points, float scalar, ++- unsigned int num_points) { ++- const int eightsPoints = num_points / 8; ++- const int remainder = num_points - 8 * eightsPoints; ++- ++- __m256 xmm_points0, xmm_points1, xmm_result; ++- ++- // load complex value 
into all parts of the register. ++- const __m256 xmm_symbol = _mm256_castpd_ps(_mm256_broadcast_sd((const double*)src0)); ++- ++- // Load scalar into all 8 parts of the register ++- const __m256 xmm_scalar = _mm256_broadcast_ss(&scalar); ++- ++- for(int i = 0; i < eightsPoints; ++i){ ++- xmm_points0 = _mm256_loadu_ps((float*)points); ++- xmm_points1 = _mm256_loadu_ps((float*)(points + 4)); ++- points += 8; ++- ++- xmm_result = _mm256_scaled_norm_dist_ps(xmm_symbol, xmm_symbol, xmm_points0, ++- xmm_points1, xmm_scalar); ++- ++- _mm256_storeu_ps(target, xmm_result); ++- target += 8; ++- } ++- ++- const lv_32fc_t symbol = *src0; ++- calculate_scaled_distances(target, symbol, points, scalar, remainder); +++volk_32fc_x2_s32f_square_dist_scalar_mult_32f_u_avx(float* target, +++ lv_32fc_t* src0, +++ lv_32fc_t* points, +++ float scalar, +++ unsigned int num_points) +++{ +++ const int eightsPoints = num_points / 8; +++ const int remainder = num_points - 8 * eightsPoints; +++ +++ __m256 xmm_points0, xmm_points1, xmm_result; +++ +++ // load complex value into all parts of the register. +++ const __m256 xmm_symbol = _mm256_castpd_ps(_mm256_broadcast_sd((const double*)src0)); +++ +++ // Load scalar into all 8 parts of the register +++ const __m256 xmm_scalar = _mm256_broadcast_ss(&scalar); +++ +++ for (int i = 0; i < eightsPoints; ++i) { +++ xmm_points0 = _mm256_loadu_ps((float*)points); +++ xmm_points1 = _mm256_loadu_ps((float*)(points + 4)); +++ points += 8; +++ +++ xmm_result = _mm256_scaled_norm_dist_ps( +++ xmm_symbol, xmm_symbol, xmm_points0, xmm_points1, xmm_scalar); +++ +++ _mm256_storeu_ps(target, xmm_result); +++ target += 8; +++ } +++ +++ const lv_32fc_t symbol = *src0; +++ calculate_scaled_distances(target, symbol, points, scalar, remainder); ++ } ++ ++ #endif /* LV_HAVE_AVX */ ++ ++ ++ #ifdef LV_HAVE_SSE3 ++-#include ++-#include +++#include +++#include ++ ++ static inline void ++-volk_32fc_x2_s32f_square_dist_scalar_mult_32f_u_sse3(float* target, lv_32fc_t* src0, ++- lv_32fc_t* points, float scalar, +++volk_32fc_x2_s32f_square_dist_scalar_mult_32f_u_sse3(float* target, +++ lv_32fc_t* src0, +++ lv_32fc_t* points, +++ float scalar, ++ unsigned int num_points) ++ { ++- __m128 xmm_points0, xmm_points1, xmm_result; ++- ++- /* ++- * First do 4 values in every loop iteration. ++- * There may be up to 3 values left. ++- * leftovers0 indicates if at least 2 more are available for SSE execution. ++- * leftovers1 indicates if there is a single element left. ++- */ ++- const int quarterPoints = num_points / 4; ++- const int leftovers0 = (num_points / 2) - 2 * quarterPoints; ++- const int leftovers1 = num_points % 2; ++- ++- // load complex value into both parts of the register. 
++- const __m128 xmm_symbol = _mm_castpd_ps(_mm_load1_pd((const double*)src0)); ++- ++- // Load scalar into all 4 parts of the register ++- const __m128 xmm_scalar = _mm_load1_ps(&scalar); ++- ++- for(int i = 0; i < quarterPoints; ++i) { ++- xmm_points0 = _mm_loadu_ps((float*)points); ++- xmm_points1 = _mm_loadu_ps((float*)(points + 2)); ++- points += 4; ++- __VOLK_PREFETCH(points); ++- // calculate distances ++- xmm_result = _mm_scaled_norm_dist_ps_sse3(xmm_symbol, xmm_symbol, xmm_points0, ++- xmm_points1, xmm_scalar); ++- ++- _mm_storeu_ps(target, xmm_result); ++- target += 4; ++- } ++- ++- for(int i = 0; i < leftovers0; ++i) { ++- xmm_points0 = _mm_loadu_ps((float*)points); ++- points += 2; ++- ++- xmm_points0 = _mm_sub_ps(xmm_symbol, xmm_points0); ++- xmm_points0 = _mm_mul_ps(xmm_points0, xmm_points0); ++- xmm_points0 = _mm_hadd_ps(xmm_points0, xmm_points0); ++- xmm_result = _mm_mul_ps(xmm_points0, xmm_scalar); ++- ++- _mm_storeh_pi((__m64*)target, xmm_result); ++- target += 2; ++- } ++- ++- calculate_scaled_distances(target, src0[0], points, scalar, leftovers1); +++ __m128 xmm_points0, xmm_points1, xmm_result; +++ +++ /* +++ * First do 4 values in every loop iteration. +++ * There may be up to 3 values left. +++ * leftovers0 indicates if at least 2 more are available for SSE execution. +++ * leftovers1 indicates if there is a single element left. +++ */ +++ const int quarterPoints = num_points / 4; +++ const int leftovers0 = (num_points / 2) - 2 * quarterPoints; +++ const int leftovers1 = num_points % 2; +++ +++ // load complex value into both parts of the register. +++ const __m128 xmm_symbol = _mm_castpd_ps(_mm_load1_pd((const double*)src0)); +++ +++ // Load scalar into all 4 parts of the register +++ const __m128 xmm_scalar = _mm_load1_ps(&scalar); +++ +++ for (int i = 0; i < quarterPoints; ++i) { +++ xmm_points0 = _mm_loadu_ps((float*)points); +++ xmm_points1 = _mm_loadu_ps((float*)(points + 2)); +++ points += 4; +++ __VOLK_PREFETCH(points); +++ // calculate distances +++ xmm_result = _mm_scaled_norm_dist_ps_sse3( +++ xmm_symbol, xmm_symbol, xmm_points0, xmm_points1, xmm_scalar); +++ +++ _mm_storeu_ps(target, xmm_result); +++ target += 4; +++ } +++ +++ for (int i = 0; i < leftovers0; ++i) { +++ xmm_points0 = _mm_loadu_ps((float*)points); +++ points += 2; +++ +++ xmm_points0 = _mm_sub_ps(xmm_symbol, xmm_points0); +++ xmm_points0 = _mm_mul_ps(xmm_points0, xmm_points0); +++ xmm_points0 = _mm_hadd_ps(xmm_points0, xmm_points0); +++ xmm_result = _mm_mul_ps(xmm_points0, xmm_scalar); +++ +++ _mm_storeh_pi((__m64*)target, xmm_result); +++ target += 2; +++ } +++ +++ calculate_scaled_distances(target, src0[0], points, scalar, leftovers1); ++ } ++ ++ #endif /*LV_HAVE_SSE3*/ ++ ++ #ifdef LV_HAVE_SSE ++-#include ++ #include +++#include ++ static inline void ++-volk_32fc_x2_s32f_square_dist_scalar_mult_32f_u_sse(float* target, lv_32fc_t* src0, ++- lv_32fc_t* points, float scalar, +++volk_32fc_x2_s32f_square_dist_scalar_mult_32f_u_sse(float* target, +++ lv_32fc_t* src0, +++ lv_32fc_t* points, +++ float scalar, ++ unsigned int num_points) ++ { ++- const __m128 xmm_scalar = _mm_set1_ps(scalar); ++- const __m128 xmm_symbol = _mm_castpd_ps(_mm_load1_pd((const double*)src0)); ++- ++- for (unsigned i = 0; i < num_points / 4; ++i) { ++- __m128 xmm_points0 = _mm_loadu_ps((float *) points); ++- __m128 xmm_points1 = _mm_loadu_ps((float *) (points + 2)); ++- points += 4; ++- __m128 xmm_result = _mm_scaled_norm_dist_ps_sse(xmm_symbol, xmm_symbol, ++- xmm_points0, xmm_points1, ++- xmm_scalar); ++- 
_mm_storeu_ps((float *) target, xmm_result); ++- target += 4; ++- } ++- ++- calculate_scaled_distances(target, src0[0], points, scalar, num_points % 4); +++ const __m128 xmm_scalar = _mm_set1_ps(scalar); +++ const __m128 xmm_symbol = _mm_castpd_ps(_mm_load1_pd((const double*)src0)); +++ +++ for (unsigned i = 0; i < num_points / 4; ++i) { +++ __m128 xmm_points0 = _mm_loadu_ps((float*)points); +++ __m128 xmm_points1 = _mm_loadu_ps((float*)(points + 2)); +++ points += 4; +++ __m128 xmm_result = _mm_scaled_norm_dist_ps_sse( +++ xmm_symbol, xmm_symbol, xmm_points0, xmm_points1, xmm_scalar); +++ _mm_storeu_ps((float*)target, xmm_result); +++ target += 4; +++ } +++ +++ calculate_scaled_distances(target, src0[0], points, scalar, num_points % 4); ++ } ++ #endif // LV_HAVE_SSE ++ ++diff --git a/kernels/volk/volk_32fc_x2_s32fc_multiply_conjugate_add_32fc.h b/kernels/volk/volk_32fc_x2_s32fc_multiply_conjugate_add_32fc.h ++index 6c7f4d3..1fb9b68 100644 ++--- a/kernels/volk/volk_32fc_x2_s32fc_multiply_conjugate_add_32fc.h +++++ b/kernels/volk/volk_32fc_x2_s32fc_multiply_conjugate_add_32fc.h ++@@ -32,14 +32,16 @@ ++ * ++ * Dispatcher Prototype ++ * \code ++- * void volk_32fc_x2_s32fc_multiply_conjugate_add_32fc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, const lv_32fc_t scalar, unsigned int num_points); ++- * \endcode +++ * void volk_32fc_x2_s32fc_multiply_conjugate_add_32fc(lv_32fc_t* cVector, const +++ * lv_32fc_t* aVector, const lv_32fc_t* bVector, const lv_32fc_t scalar, unsigned int +++ * num_points); \endcode ++ * ++ * \b Inputs ++ * \li aVector: The input vector to be added. ++ * \li bVector: The input vector to be conjugate and multiplied. ++ * \li scalar: The complex scalar to multiply against conjugated bVector. ++- * \li num_points: The number of complex values in aVector and bVector to be conjugate, multiplied and stored into cVector. +++ * \li num_points: The number of complex values in aVector and bVector to be conjugate, +++ * multiplied and stored into cVector. ++ * ++ * \b Outputs ++ * \li cVector: The vector where the results will be stored. 
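
For context on volk_32fc_x2_s32fc_multiply_conjugate_add_32fc, whose dispatcher prototype and parameters are documented above: a minimal usage sketch follows. The buffer contents are illustrative, and volk_malloc()/volk_get_alignment() are assumed here only to obtain suitably aligned storage; the call itself computes c[i] = a[i] + lv_conj(b[i]) * scalar for every point.

    #include <volk/volk.h>

    /* Illustrative call of the dispatcher; VOLK picks the best available
     * implementation (generic, SSE3, AVX or NEON) at runtime. */
    void example_multiply_conjugate_add(unsigned int num_points)
    {
        const size_t alignment = volk_get_alignment();
        lv_32fc_t* a = (lv_32fc_t*)volk_malloc(num_points * sizeof(lv_32fc_t), alignment);
        lv_32fc_t* b = (lv_32fc_t*)volk_malloc(num_points * sizeof(lv_32fc_t), alignment);
        lv_32fc_t* c = (lv_32fc_t*)volk_malloc(num_points * sizeof(lv_32fc_t), alignment);
        const lv_32fc_t scalar = lv_cmake(0.5f, -0.5f);

        for (unsigned int i = 0; i < num_points; ++i) {
            a[i] = lv_cmake((float)i, 0.0f); /* vector to be added */
            b[i] = lv_cmake(1.0f, (float)i); /* vector to be conjugated and scaled */
        }

        /* c[i] = a[i] + lv_conj(b[i]) * scalar for all num_points elements */
        volk_32fc_x2_s32fc_multiply_conjugate_add_32fc(c, a, b, scalar, num_points);

        volk_free(a);
        volk_free(b);
        volk_free(c);
    }
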
++@@ -84,15 +86,21 @@ ++ #ifndef INCLUDED_volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_H ++ #define INCLUDED_volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_H ++ +++#include ++ #include ++ #include ++ #include ++-#include ++ ++ ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, const lv_32fc_t scalar, unsigned int num_points){ +++static inline void +++volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_generic(lv_32fc_t* cVector, +++ const lv_32fc_t* aVector, +++ const lv_32fc_t* bVector, +++ const lv_32fc_t scalar, +++ unsigned int num_points) +++{ ++ const lv_32fc_t* aPtr = aVector; ++ const lv_32fc_t* bPtr = bVector; ++ lv_32fc_t* cPtr = cVector; ++@@ -123,14 +131,20 @@ static inline void volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_generic(lv_32f ++ #include ++ #include ++ ++-static inline void volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_u_avx(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, const lv_32fc_t scalar, unsigned int num_points) { +++static inline void +++volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_u_avx(lv_32fc_t* cVector, +++ const lv_32fc_t* aVector, +++ const lv_32fc_t* bVector, +++ const lv_32fc_t scalar, +++ unsigned int num_points) +++{ ++ unsigned int number = 0; ++ unsigned int i = 0; ++ const unsigned int quarterPoints = num_points / 4; ++ unsigned int isodd = num_points & 3; ++ ++ __m256 x, y, s, z; ++- lv_32fc_t v_scalar[4] = {scalar, scalar, scalar, scalar}; +++ lv_32fc_t v_scalar[4] = { scalar, scalar, scalar, scalar }; ++ ++ const lv_32fc_t* a = aVector; ++ const lv_32fc_t* b = bVector; ++@@ -139,19 +153,19 @@ static inline void volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_u_avx(lv_32fc_ ++ // Set up constant scalar vector ++ s = _mm256_loadu_ps((float*)v_scalar); ++ ++- for(;number < quarterPoints; number++) { +++ for (; number < quarterPoints; number++) { ++ x = _mm256_loadu_ps((float*)b); ++ y = _mm256_loadu_ps((float*)a); ++ z = _mm256_complexconjugatemul_ps(s, x); ++ z = _mm256_add_ps(y, z); ++- _mm256_storeu_ps((float*)c,z); +++ _mm256_storeu_ps((float*)c, z); ++ ++ a += 4; ++ b += 4; ++ c += 4; ++ } ++ ++- for(i = num_points-isodd; i < num_points; i++) { +++ for (i = num_points - isodd; i < num_points; i++) { ++ *c++ = (*a++) + lv_conj(*b++) * scalar; ++ } ++ } ++@@ -162,12 +176,18 @@ static inline void volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_u_avx(lv_32fc_ ++ #include ++ #include ++ ++-static inline void volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_u_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, const lv_32fc_t scalar, unsigned int num_points) { +++static inline void +++volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_u_sse3(lv_32fc_t* cVector, +++ const lv_32fc_t* aVector, +++ const lv_32fc_t* bVector, +++ const lv_32fc_t scalar, +++ unsigned int num_points) +++{ ++ unsigned int number = 0; ++ const unsigned int halfPoints = num_points / 2; ++ ++ __m128 x, y, s, z; ++- lv_32fc_t v_scalar[2] = {scalar, scalar}; +++ lv_32fc_t v_scalar[2] = { scalar, scalar }; ++ ++ const lv_32fc_t* a = aVector; ++ const lv_32fc_t* b = bVector; ++@@ -176,19 +196,19 @@ static inline void volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_u_sse3(lv_32fc ++ // Set up constant scalar vector ++ s = _mm_loadu_ps((float*)v_scalar); ++ ++- for(;number < halfPoints; number++){ +++ for (; number < halfPoints; number++) { ++ x = _mm_loadu_ps((float*)b); ++ y = _mm_loadu_ps((float*)a); ++ z = 
_mm_complexconjugatemul_ps(s, x); ++ z = _mm_add_ps(y, z); ++- _mm_storeu_ps((float*)c,z); +++ _mm_storeu_ps((float*)c, z); ++ ++ a += 2; ++ b += 2; ++ c += 2; ++ } ++ ++- if((num_points % 2) != 0) { +++ if ((num_points % 2) != 0) { ++ *c = *a + lv_conj(*b) * scalar; ++ } ++ } ++@@ -199,14 +219,20 @@ static inline void volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_u_sse3(lv_32fc ++ #include ++ #include ++ ++-static inline void volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_a_avx(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, const lv_32fc_t scalar, unsigned int num_points) { +++static inline void +++volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_a_avx(lv_32fc_t* cVector, +++ const lv_32fc_t* aVector, +++ const lv_32fc_t* bVector, +++ const lv_32fc_t scalar, +++ unsigned int num_points) +++{ ++ unsigned int number = 0; ++ unsigned int i = 0; ++ const unsigned int quarterPoints = num_points / 4; ++ unsigned int isodd = num_points & 3; ++ ++ __m256 x, y, s, z; ++- lv_32fc_t v_scalar[4] = {scalar, scalar, scalar, scalar}; +++ lv_32fc_t v_scalar[4] = { scalar, scalar, scalar, scalar }; ++ ++ const lv_32fc_t* a = aVector; ++ const lv_32fc_t* b = bVector; ++@@ -215,19 +241,19 @@ static inline void volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_a_avx(lv_32fc_ ++ // Set up constant scalar vector ++ s = _mm256_load_ps((float*)v_scalar); ++ ++- for(;number < quarterPoints; number++) { +++ for (; number < quarterPoints; number++) { ++ x = _mm256_load_ps((float*)b); ++ y = _mm256_load_ps((float*)a); ++ z = _mm256_complexconjugatemul_ps(s, x); ++ z = _mm256_add_ps(y, z); ++- _mm256_store_ps((float*)c,z); +++ _mm256_store_ps((float*)c, z); ++ ++ a += 4; ++ b += 4; ++ c += 4; ++ } ++ ++- for(i = num_points-isodd; i < num_points; i++) { +++ for (i = num_points - isodd; i < num_points; i++) { ++ *c++ = (*a++) + lv_conj(*b++) * scalar; ++ } ++ } ++@@ -238,12 +264,18 @@ static inline void volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_a_avx(lv_32fc_ ++ #include ++ #include ++ ++-static inline void volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_a_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, const lv_32fc_t scalar, unsigned int num_points) { +++static inline void +++volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_a_sse3(lv_32fc_t* cVector, +++ const lv_32fc_t* aVector, +++ const lv_32fc_t* bVector, +++ const lv_32fc_t scalar, +++ unsigned int num_points) +++{ ++ unsigned int number = 0; ++ const unsigned int halfPoints = num_points / 2; ++ ++ __m128 x, y, s, z; ++- lv_32fc_t v_scalar[2] = {scalar, scalar}; +++ lv_32fc_t v_scalar[2] = { scalar, scalar }; ++ ++ const lv_32fc_t* a = aVector; ++ const lv_32fc_t* b = bVector; ++@@ -252,19 +284,19 @@ static inline void volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_a_sse3(lv_32fc ++ // Set up constant scalar vector ++ s = _mm_load_ps((float*)v_scalar); ++ ++- for(;number < halfPoints; number++){ +++ for (; number < halfPoints; number++) { ++ x = _mm_load_ps((float*)b); ++ y = _mm_load_ps((float*)a); ++ z = _mm_complexconjugatemul_ps(s, x); ++ z = _mm_add_ps(y, z); ++- _mm_store_ps((float*)c,z); +++ _mm_store_ps((float*)c, z); ++ ++ a += 2; ++ b += 2; ++ c += 2; ++ } ++ ++- if((num_points % 2) != 0) { +++ if ((num_points % 2) != 0) { ++ *c = *a + lv_conj(*b) * scalar; ++ } ++ } ++@@ -272,9 +304,15 @@ static inline void volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_a_sse3(lv_32fc ++ ++ ++ #ifdef LV_HAVE_NEON ++-#include ++- ++-static inline void volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_neon(lv_32fc_t* 
cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, const lv_32fc_t scalar, unsigned int num_points){ +++#include +++ +++static inline void +++volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_neon(lv_32fc_t* cVector, +++ const lv_32fc_t* aVector, +++ const lv_32fc_t* bVector, +++ const lv_32fc_t scalar, +++ unsigned int num_points) +++{ ++ const lv_32fc_t* bPtr = bVector; ++ const lv_32fc_t* aPtr = aVector; ++ lv_32fc_t* cPtr = cVector; ++@@ -287,7 +325,7 @@ static inline void volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_neon(lv_32fc_t ++ scalar_val.val[0] = vld1q_dup_f32((const float*)&scalar); ++ scalar_val.val[1] = vld1q_dup_f32(((const float*)&scalar) + 1); ++ ++- for(number = 0; number < quarter_points; ++number) { +++ for (number = 0; number < quarter_points; ++number) { ++ a_val = vld2q_f32((float*)aPtr); ++ b_val = vld2q_f32((float*)bPtr); ++ b_val.val[1] = vnegq_f32(b_val.val[1]); ++@@ -310,7 +348,7 @@ static inline void volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_neon(lv_32fc_t ++ cPtr += 4; ++ } ++ ++- for(number = quarter_points*4; number < num_points; number++){ +++ for (number = quarter_points * 4; number < num_points; number++) { ++ *cPtr++ = (*aPtr++) + lv_conj(*bPtr++) * scalar; ++ } ++ } ++diff --git a/kernels/volk/volk_32fc_x2_square_dist_32f.h b/kernels/volk/volk_32fc_x2_square_dist_32f.h ++index d6c6dff..75f4072 100644 ++--- a/kernels/volk/volk_32fc_x2_square_dist_32f.h +++++ b/kernels/volk/volk_32fc_x2_square_dist_32f.h ++@@ -30,8 +30,8 @@ ++ * ++ * Dispatcher Prototype ++ * \code ++- * void volk_32fc_x2_square_dist_32f(float* target, lv_32fc_t* src0, lv_32fc_t* points, unsigned int num_points) { ++- * \endcode +++ * void volk_32fc_x2_square_dist_32f(float* target, lv_32fc_t* src0, lv_32fc_t* points, +++ * unsigned int num_points) { \endcode ++ * ++ * \b Inputs ++ * \li src0: The complex input. Only the first point is used. 
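
For context on the volk_32fc_x2_square_dist_32f hunks that follow: every variant (AVX2, SSE3, NEON, generic) computes, for each input point, the squared Euclidean distance to the single symbol src0[0]. A minimal reference loop mirroring the _generic path (the names here are illustrative) is:

    /* target[i] = |src0[0] - points[i]|^2 for i = 0 .. num_points - 1 */
    static void square_dist_reference(float* target,
                                      const lv_32fc_t* src0,
                                      const lv_32fc_t* points,
                                      unsigned int num_points)
    {
        for (unsigned int i = 0; i < num_points; ++i) {
            const lv_32fc_t diff = src0[0] - points[i];
            target[i] =
                lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff);
        }
    }
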
++@@ -78,183 +78,185 @@ ++ #ifndef INCLUDED_volk_32fc_x2_square_dist_32f_a_H ++ #define INCLUDED_volk_32fc_x2_square_dist_32f_a_H ++ ++-#include ++-#include ++-#include +++#include +++#include +++#include ++ ++ #ifdef LV_HAVE_AVX2 ++-#include +++#include ++ ++-static inline void ++-volk_32fc_x2_square_dist_32f_a_avx2(float* target, lv_32fc_t* src0, lv_32fc_t* points, ++- unsigned int num_points) +++static inline void volk_32fc_x2_square_dist_32f_a_avx2(float* target, +++ lv_32fc_t* src0, +++ lv_32fc_t* points, +++ unsigned int num_points) ++ { ++- const unsigned int num_bytes = num_points*8; ++- __m128 xmm0, xmm9, xmm10; ++- __m256 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; ++- ++- lv_32fc_t diff; ++- float sq_dist; ++- int bound = num_bytes >> 6; ++- int leftovers0 = (num_bytes >> 5) & 1; ++- int leftovers1 = (num_bytes >> 4) & 1; ++- int leftovers2 = (num_bytes >> 3) & 1; ++- int i = 0; ++- ++- __m256i idx = _mm256_set_epi32(7,6,3,2,5,4,1,0); ++- xmm1 = _mm256_setzero_ps(); ++- xmm2 = _mm256_load_ps((float*)&points[0]); ++- xmm0 = _mm_load_ps((float*)src0); ++- xmm0 = _mm_permute_ps(xmm0, 0b01000100); ++- xmm1 = _mm256_insertf128_ps(xmm1, xmm0, 0); ++- xmm1 = _mm256_insertf128_ps(xmm1, xmm0, 1); ++- xmm3 = _mm256_load_ps((float*)&points[4]); ++- ++- for(; i < bound; ++i) { ++- xmm4 = _mm256_sub_ps(xmm1, xmm2); ++- xmm5 = _mm256_sub_ps(xmm1, xmm3); ++- points += 8; ++- xmm6 = _mm256_mul_ps(xmm4, xmm4); ++- xmm7 = _mm256_mul_ps(xmm5, xmm5); ++- +++ const unsigned int num_bytes = num_points * 8; +++ __m128 xmm0, xmm9, xmm10; +++ __m256 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; +++ +++ lv_32fc_t diff; +++ float sq_dist; +++ int bound = num_bytes >> 6; +++ int leftovers0 = (num_bytes >> 5) & 1; +++ int leftovers1 = (num_bytes >> 4) & 1; +++ int leftovers2 = (num_bytes >> 3) & 1; +++ int i = 0; +++ +++ __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0); +++ xmm1 = _mm256_setzero_ps(); ++ xmm2 = _mm256_load_ps((float*)&points[0]); +++ xmm0 = _mm_load_ps((float*)src0); +++ xmm0 = _mm_permute_ps(xmm0, 0b01000100); +++ xmm1 = _mm256_insertf128_ps(xmm1, xmm0, 0); +++ xmm1 = _mm256_insertf128_ps(xmm1, xmm0, 1); +++ xmm3 = _mm256_load_ps((float*)&points[4]); ++ ++- xmm4 = _mm256_hadd_ps(xmm6, xmm7); ++- xmm4 = _mm256_permutevar8x32_ps(xmm4, idx); +++ for (; i < bound; ++i) { +++ xmm4 = _mm256_sub_ps(xmm1, xmm2); +++ xmm5 = _mm256_sub_ps(xmm1, xmm3); +++ points += 8; +++ xmm6 = _mm256_mul_ps(xmm4, xmm4); +++ xmm7 = _mm256_mul_ps(xmm5, xmm5); ++ ++- xmm3 = _mm256_load_ps((float*)&points[4]); +++ xmm2 = _mm256_load_ps((float*)&points[0]); ++ ++- _mm256_store_ps(target, xmm4); +++ xmm4 = _mm256_hadd_ps(xmm6, xmm7); +++ xmm4 = _mm256_permutevar8x32_ps(xmm4, idx); ++ ++- target += 8; ++- } +++ xmm3 = _mm256_load_ps((float*)&points[4]); ++ ++- for(i = 0; i < leftovers0; ++i) { +++ _mm256_store_ps(target, xmm4); ++ ++- xmm2 = _mm256_load_ps((float*)&points[0]); +++ target += 8; +++ } ++ ++- xmm4 = _mm256_sub_ps(xmm1, xmm2); +++ for (i = 0; i < leftovers0; ++i) { ++ ++- points += 4; +++ xmm2 = _mm256_load_ps((float*)&points[0]); ++ ++- xmm6 = _mm256_mul_ps(xmm4, xmm4); +++ xmm4 = _mm256_sub_ps(xmm1, xmm2); ++ ++- xmm4 = _mm256_hadd_ps(xmm6, xmm6); ++- xmm4 = _mm256_permutevar8x32_ps(xmm4, idx); +++ points += 4; ++ ++- xmm9 = _mm256_extractf128_ps(xmm4, 1); ++- _mm_store_ps(target,xmm9); +++ xmm6 = _mm256_mul_ps(xmm4, xmm4); ++ ++- target += 4; ++- } +++ xmm4 = _mm256_hadd_ps(xmm6, xmm6); +++ xmm4 = _mm256_permutevar8x32_ps(xmm4, idx); +++ +++ xmm9 = _mm256_extractf128_ps(xmm4, 1); +++ _mm_store_ps(target, 
xmm9); ++ ++- for(i = 0; i < leftovers1; ++i) { ++- xmm9 = _mm_load_ps((float*)&points[0]); +++ target += 4; +++ } ++ ++- xmm10 = _mm_sub_ps(xmm0, xmm9); +++ for (i = 0; i < leftovers1; ++i) { +++ xmm9 = _mm_load_ps((float*)&points[0]); ++ ++- points += 2; +++ xmm10 = _mm_sub_ps(xmm0, xmm9); ++ ++- xmm9 = _mm_mul_ps(xmm10, xmm10); +++ points += 2; ++ ++- xmm10 = _mm_hadd_ps(xmm9, xmm9); +++ xmm9 = _mm_mul_ps(xmm10, xmm10); ++ ++- _mm_storeh_pi((__m64*)target, xmm10); +++ xmm10 = _mm_hadd_ps(xmm9, xmm9); ++ ++- target += 2; ++- } +++ _mm_storeh_pi((__m64*)target, xmm10); ++ ++- for(i = 0; i < leftovers2; ++i) { +++ target += 2; +++ } ++ ++- diff = src0[0] - points[0]; +++ for (i = 0; i < leftovers2; ++i) { ++ ++- sq_dist = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff); +++ diff = src0[0] - points[0]; ++ ++- target[0] = sq_dist; ++- } +++ sq_dist = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff); +++ +++ target[0] = sq_dist; +++ } ++ } ++ ++ #endif /*LV_HAVE_AVX2*/ ++ ++ #ifdef LV_HAVE_SSE3 ++-#include ++-#include +++#include +++#include ++ ++-static inline void ++-volk_32fc_x2_square_dist_32f_a_sse3(float* target, lv_32fc_t* src0, lv_32fc_t* points, ++- unsigned int num_points) +++static inline void volk_32fc_x2_square_dist_32f_a_sse3(float* target, +++ lv_32fc_t* src0, +++ lv_32fc_t* points, +++ unsigned int num_points) ++ { ++- const unsigned int num_bytes = num_points*8; +++ const unsigned int num_bytes = num_points * 8; ++ ++- __m128 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; +++ __m128 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; ++ ++- lv_32fc_t diff; ++- float sq_dist; ++- int bound = num_bytes >> 5; ++- int i = 0; +++ lv_32fc_t diff; +++ float sq_dist; +++ int bound = num_bytes >> 5; +++ int i = 0; ++ ++- xmm1 = _mm_setzero_ps(); ++- xmm1 = _mm_loadl_pi(xmm1, (__m64*)src0); ++- xmm2 = _mm_load_ps((float*)&points[0]); ++- xmm1 = _mm_movelh_ps(xmm1, xmm1); ++- xmm3 = _mm_load_ps((float*)&points[2]); +++ xmm1 = _mm_setzero_ps(); +++ xmm1 = _mm_loadl_pi(xmm1, (__m64*)src0); +++ xmm2 = _mm_load_ps((float*)&points[0]); +++ xmm1 = _mm_movelh_ps(xmm1, xmm1); +++ xmm3 = _mm_load_ps((float*)&points[2]); +++ +++ for (; i < bound - 1; ++i) { +++ xmm4 = _mm_sub_ps(xmm1, xmm2); +++ xmm5 = _mm_sub_ps(xmm1, xmm3); +++ points += 4; +++ xmm6 = _mm_mul_ps(xmm4, xmm4); +++ xmm7 = _mm_mul_ps(xmm5, xmm5); +++ +++ xmm2 = _mm_load_ps((float*)&points[0]); +++ +++ xmm4 = _mm_hadd_ps(xmm6, xmm7); +++ +++ xmm3 = _mm_load_ps((float*)&points[2]); +++ +++ _mm_store_ps(target, xmm4); +++ +++ target += 4; +++ } ++ ++- for(; i < bound - 1; ++i) { ++ xmm4 = _mm_sub_ps(xmm1, xmm2); ++ xmm5 = _mm_sub_ps(xmm1, xmm3); +++ ++ points += 4; ++ xmm6 = _mm_mul_ps(xmm4, xmm4); ++ xmm7 = _mm_mul_ps(xmm5, xmm5); ++ ++- xmm2 = _mm_load_ps((float*)&points[0]); ++- ++ xmm4 = _mm_hadd_ps(xmm6, xmm7); ++ ++- xmm3 = _mm_load_ps((float*)&points[2]); ++- ++ _mm_store_ps(target, xmm4); ++ ++ target += 4; ++- } ++- ++- xmm4 = _mm_sub_ps(xmm1, xmm2); ++- xmm5 = _mm_sub_ps(xmm1, xmm3); ++- ++- points += 4; ++- xmm6 = _mm_mul_ps(xmm4, xmm4); ++- xmm7 = _mm_mul_ps(xmm5, xmm5); ++ ++- xmm4 = _mm_hadd_ps(xmm6, xmm7); +++ if (num_bytes >> 4 & 1) { ++ ++- _mm_store_ps(target, xmm4); +++ xmm2 = _mm_load_ps((float*)&points[0]); ++ ++- target += 4; +++ xmm4 = _mm_sub_ps(xmm1, xmm2); ++ ++- if (num_bytes >> 4 & 1) { +++ points += 2; ++ ++- xmm2 = _mm_load_ps((float*)&points[0]); ++- ++- xmm4 = _mm_sub_ps(xmm1, xmm2); +++ xmm6 = _mm_mul_ps(xmm4, xmm4); ++ ++- points += 2; ++- ++- xmm6 = _mm_mul_ps(xmm4, xmm4); +++ 
xmm4 = _mm_hadd_ps(xmm6, xmm6); ++ ++- xmm4 = _mm_hadd_ps(xmm6, xmm6); +++ _mm_storeh_pi((__m64*)target, xmm4); ++ ++- _mm_storeh_pi((__m64*)target, xmm4); +++ target += 2; +++ } ++ ++- target += 2; ++- } +++ if (num_bytes >> 3 & 1) { ++ ++- if (num_bytes >> 3 & 1) { +++ diff = src0[0] - points[0]; ++ ++- diff = src0[0] - points[0]; +++ sq_dist = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff); ++ ++- sq_dist = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff); ++- ++- target[0] = sq_dist; ++- } +++ target[0] = sq_dist; +++ } ++ } ++ ++ #endif /*LV_HAVE_SSE3*/ ++@@ -262,55 +264,58 @@ volk_32fc_x2_square_dist_32f_a_sse3(float* target, lv_32fc_t* src0, lv_32fc_t* p ++ ++ #ifdef LV_HAVE_NEON ++ #include ++-static inline void ++-volk_32fc_x2_square_dist_32f_neon(float* target, lv_32fc_t* src0, lv_32fc_t* points, unsigned int num_points) +++static inline void volk_32fc_x2_square_dist_32f_neon(float* target, +++ lv_32fc_t* src0, +++ lv_32fc_t* points, +++ unsigned int num_points) ++ { ++- const unsigned int quarter_points = num_points / 4; ++- unsigned int number; ++- ++- float32x4x2_t a_vec, b_vec; ++- float32x4x2_t diff_vec; ++- float32x4_t tmp, tmp1, dist_sq; ++- a_vec.val[0] = vdupq_n_f32( lv_creal(src0[0]) ); ++- a_vec.val[1] = vdupq_n_f32( lv_cimag(src0[0]) ); ++- for(number=0; number < quarter_points; ++number) { ++- b_vec = vld2q_f32((float*)points); ++- diff_vec.val[0] = vsubq_f32(a_vec.val[0], b_vec.val[0]); ++- diff_vec.val[1] = vsubq_f32(a_vec.val[1], b_vec.val[1]); ++- tmp = vmulq_f32(diff_vec.val[0], diff_vec.val[0]); ++- tmp1 = vmulq_f32(diff_vec.val[1], diff_vec.val[1]); ++- ++- dist_sq = vaddq_f32(tmp, tmp1); ++- vst1q_f32(target, dist_sq); ++- points += 4; ++- target += 4; ++- } ++- for(number=quarter_points*4; number < num_points; ++number) { ++- lv_32fc_t diff = src0[0] - *points++; ++- *target++ = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff); ++- } +++ const unsigned int quarter_points = num_points / 4; +++ unsigned int number; +++ +++ float32x4x2_t a_vec, b_vec; +++ float32x4x2_t diff_vec; +++ float32x4_t tmp, tmp1, dist_sq; +++ a_vec.val[0] = vdupq_n_f32(lv_creal(src0[0])); +++ a_vec.val[1] = vdupq_n_f32(lv_cimag(src0[0])); +++ for (number = 0; number < quarter_points; ++number) { +++ b_vec = vld2q_f32((float*)points); +++ diff_vec.val[0] = vsubq_f32(a_vec.val[0], b_vec.val[0]); +++ diff_vec.val[1] = vsubq_f32(a_vec.val[1], b_vec.val[1]); +++ tmp = vmulq_f32(diff_vec.val[0], diff_vec.val[0]); +++ tmp1 = vmulq_f32(diff_vec.val[1], diff_vec.val[1]); +++ +++ dist_sq = vaddq_f32(tmp, tmp1); +++ vst1q_f32(target, dist_sq); +++ points += 4; +++ target += 4; +++ } +++ for (number = quarter_points * 4; number < num_points; ++number) { +++ lv_32fc_t diff = src0[0] - *points++; +++ *target++ = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff); +++ } ++ } ++ #endif /* LV_HAVE_NEON */ ++ ++ ++ #ifdef LV_HAVE_GENERIC ++-static inline void ++-volk_32fc_x2_square_dist_32f_generic(float* target, lv_32fc_t* src0, lv_32fc_t* points, ++- unsigned int num_points) +++static inline void volk_32fc_x2_square_dist_32f_generic(float* target, +++ lv_32fc_t* src0, +++ lv_32fc_t* points, +++ unsigned int num_points) ++ { ++- const unsigned int num_bytes = num_points*8; +++ const unsigned int num_bytes = num_points * 8; ++ ++- lv_32fc_t diff; ++- float sq_dist; ++- unsigned int i = 0; +++ lv_32fc_t diff; +++ float sq_dist; +++ unsigned int i = 0; ++ ++- for(; i < num_bytes >> 3; ++i) { ++- diff = src0[0] - points[i]; +++ for (; i> 
3; ++i) { +++ diff = src0[0] - points[i]; ++ ++- sq_dist = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff); +++ sq_dist = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff); ++ ++- target[i] = sq_dist; ++- } +++ target[i] = sq_dist; +++ } ++ } ++ ++ #endif /*LV_HAVE_GENERIC*/ ++@@ -321,80 +326,85 @@ volk_32fc_x2_square_dist_32f_generic(float* target, lv_32fc_t* src0, lv_32fc_t* ++ #ifndef INCLUDED_volk_32fc_x2_square_dist_32f_u_H ++ #define INCLUDED_volk_32fc_x2_square_dist_32f_u_H ++ ++-#include ++-#include ++-#include +++#include +++#include +++#include ++ ++ #ifdef LV_HAVE_AVX2 ++-#include +++#include ++ ++-static inline void ++-volk_32fc_x2_square_dist_32f_u_avx2(float* target, lv_32fc_t* src0, lv_32fc_t* points, ++- unsigned int num_points) +++static inline void volk_32fc_x2_square_dist_32f_u_avx2(float* target, +++ lv_32fc_t* src0, +++ lv_32fc_t* points, +++ unsigned int num_points) ++ { ++- const unsigned int num_bytes = num_points*8; ++- __m128 xmm0, xmm9; ++- __m256 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; ++- ++- lv_32fc_t diff; ++- float sq_dist; ++- int bound = num_bytes >> 6; ++- int leftovers1 = (num_bytes >> 3) & 0b11; ++- int i = 0; ++- ++- __m256i idx = _mm256_set_epi32(7,6,3,2,5,4,1,0); ++- xmm1 = _mm256_setzero_ps(); ++- xmm0 = _mm_loadu_ps((float*)src0); ++- xmm0 = _mm_permute_ps(xmm0, 0b01000100); ++- xmm1 = _mm256_insertf128_ps(xmm1, xmm0, 0); ++- xmm1 = _mm256_insertf128_ps(xmm1, xmm0, 1); ++- ++- for(; i < bound; ++i) { +++ const unsigned int num_bytes = num_points * 8; +++ __m128 xmm0, xmm9; +++ __m256 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; +++ +++ lv_32fc_t diff; +++ float sq_dist; +++ int bound = num_bytes >> 6; +++ int leftovers1 = (num_bytes >> 3) & 0b11; +++ int i = 0; +++ +++ __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0); +++ xmm1 = _mm256_setzero_ps(); ++ xmm2 = _mm256_loadu_ps((float*)&points[0]); +++ xmm0 = _mm_loadu_ps((float*)src0); +++ xmm0 = _mm_permute_ps(xmm0, 0b01000100); +++ xmm1 = _mm256_insertf128_ps(xmm1, xmm0, 0); +++ xmm1 = _mm256_insertf128_ps(xmm1, xmm0, 1); ++ xmm3 = _mm256_loadu_ps((float*)&points[4]); ++- xmm4 = _mm256_sub_ps(xmm1, xmm2); ++- xmm5 = _mm256_sub_ps(xmm1, xmm3); ++- points += 8; ++- xmm6 = _mm256_mul_ps(xmm4, xmm4); ++- xmm7 = _mm256_mul_ps(xmm5, xmm5); ++ ++- xmm4 = _mm256_hadd_ps(xmm6, xmm7); ++- xmm4 = _mm256_permutevar8x32_ps(xmm4, idx); +++ for (; i < bound; ++i) { +++ xmm4 = _mm256_sub_ps(xmm1, xmm2); +++ xmm5 = _mm256_sub_ps(xmm1, xmm3); +++ points += 8; +++ xmm6 = _mm256_mul_ps(xmm4, xmm4); +++ xmm7 = _mm256_mul_ps(xmm5, xmm5); ++ ++- _mm256_storeu_ps(target, xmm4); +++ xmm2 = _mm256_loadu_ps((float*)&points[0]); ++ ++- target += 8; ++- } +++ xmm4 = _mm256_hadd_ps(xmm6, xmm7); +++ xmm4 = _mm256_permutevar8x32_ps(xmm4, idx); ++ ++- if (num_bytes >> 5 & 1) { +++ xmm3 = _mm256_loadu_ps((float*)&points[4]); ++ ++- xmm2 = _mm256_loadu_ps((float*)&points[0]); +++ _mm256_storeu_ps(target, xmm4); ++ ++- xmm4 = _mm256_sub_ps(xmm1, xmm2); +++ target += 8; +++ } ++ ++- points += 4; +++ if (num_bytes >> 5 & 1) { ++ ++- xmm6 = _mm256_mul_ps(xmm4, xmm4); +++ xmm2 = _mm256_loadu_ps((float*)&points[0]); ++ ++- xmm4 = _mm256_hadd_ps(xmm6, xmm6); ++- xmm4 = _mm256_permutevar8x32_ps(xmm4, idx); +++ xmm4 = _mm256_sub_ps(xmm1, xmm2); ++ ++- xmm9 = _mm256_extractf128_ps(xmm4, 1); ++- _mm_storeu_ps(target,xmm9); +++ points += 4; ++ ++- target += 4; ++- } +++ xmm6 = _mm256_mul_ps(xmm4, xmm4); +++ +++ xmm4 = _mm256_hadd_ps(xmm6, xmm6); +++ xmm4 = _mm256_permutevar8x32_ps(xmm4, idx); +++ +++ xmm9 
= _mm256_extractf128_ps(xmm4, 1); +++ _mm_storeu_ps(target, xmm9); +++ +++ target += 4; +++ } ++ ++- for(i = 0; i < leftovers1; ++i) { +++ for (i = 0; i < leftovers1; ++i) { ++ ++- diff = src0[0] - points[0]; ++- points += 1; +++ diff = src0[0] - points[0]; +++ points += 1; ++ ++- sq_dist = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff); +++ sq_dist = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff); ++ ++- target[0] = sq_dist; ++- target += 1; ++- } +++ target[0] = sq_dist; +++ target += 1; +++ } ++ } ++ ++ #endif /*LV_HAVE_AVX2*/ ++diff --git a/kernels/volk/volk_32i_s32f_convert_32f.h b/kernels/volk/volk_32i_s32f_convert_32f.h ++index 87d94f9..6b67cdb 100644 ++--- a/kernels/volk/volk_32i_s32f_convert_32f.h +++++ b/kernels/volk/volk_32i_s32f_convert_32f.h ++@@ -30,8 +30,8 @@ ++ * ++ * Dispatcher Prototype ++ * \code ++- * void volk_32i_s32f_convert_32f(float* outputVector, const int32_t* inputVector, const float scalar, unsigned int num_points) ++- * \endcode +++ * void volk_32i_s32f_convert_32f(float* outputVector, const int32_t* inputVector, const +++ * float scalar, unsigned int num_points) \endcode ++ * ++ * \b Inputs ++ * \li inputVector: The vector of 32-bit integers. ++@@ -70,37 +70,38 @@ ++ #ifdef LV_HAVE_AVX512F ++ #include ++ ++-static inline void ++-volk_32i_s32f_convert_32f_u_avx512f(float* outputVector, const int32_t* inputVector, ++- const float scalar, unsigned int num_points) +++static inline void volk_32i_s32f_convert_32f_u_avx512f(float* outputVector, +++ const int32_t* inputVector, +++ const float scalar, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int onesixteenthPoints = num_points / 16; +++ unsigned int number = 0; +++ const unsigned int onesixteenthPoints = num_points / 16; ++ ++- float* outputVectorPtr = outputVector; ++- const float iScalar = 1.0 / scalar; ++- __m512 invScalar = _mm512_set1_ps(iScalar); ++- int32_t* inputPtr = (int32_t*)inputVector; ++- __m512i inputVal; ++- __m512 ret; +++ float* outputVectorPtr = outputVector; +++ const float iScalar = 1.0 / scalar; +++ __m512 invScalar = _mm512_set1_ps(iScalar); +++ int32_t* inputPtr = (int32_t*)inputVector; +++ __m512i inputVal; +++ __m512 ret; ++ ++- for(;number < onesixteenthPoints; number++){ ++- // Load the values ++- inputVal = _mm512_loadu_si512((__m512i*)inputPtr); +++ for (; number < onesixteenthPoints; number++) { +++ // Load the values +++ inputVal = _mm512_loadu_si512((__m512i*)inputPtr); ++ ++- ret = _mm512_cvtepi32_ps(inputVal); ++- ret = _mm512_mul_ps(ret, invScalar); +++ ret = _mm512_cvtepi32_ps(inputVal); +++ ret = _mm512_mul_ps(ret, invScalar); ++ ++- _mm512_storeu_ps(outputVectorPtr, ret); +++ _mm512_storeu_ps(outputVectorPtr, ret); ++ ++- outputVectorPtr += 16; ++- inputPtr += 16; ++- } +++ outputVectorPtr += 16; +++ inputPtr += 16; +++ } ++ ++- number = onesixteenthPoints * 16; ++- for(; number < num_points; number++){ ++- outputVector[number] =((float)(inputVector[number])) * iScalar; ++- } +++ number = onesixteenthPoints * 16; +++ for (; number < num_points; number++) { +++ outputVector[number] = ((float)(inputVector[number])) * iScalar; +++ } ++ } ++ #endif /* LV_HAVE_AVX512F */ ++ ++@@ -108,37 +109,38 @@ volk_32i_s32f_convert_32f_u_avx512f(float* outputVector, const int32_t* inputVec ++ #ifdef LV_HAVE_AVX2 ++ #include ++ ++-static inline void ++-volk_32i_s32f_convert_32f_u_avx2(float* outputVector, const int32_t* inputVector, ++- const float scalar, unsigned int num_points) +++static inline void 
volk_32i_s32f_convert_32f_u_avx2(float* outputVector, +++ const int32_t* inputVector, +++ const float scalar, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int oneEightPoints = num_points / 8; +++ unsigned int number = 0; +++ const unsigned int oneEightPoints = num_points / 8; ++ ++- float* outputVectorPtr = outputVector; ++- const float iScalar = 1.0 / scalar; ++- __m256 invScalar = _mm256_set1_ps(iScalar); ++- int32_t* inputPtr = (int32_t*)inputVector; ++- __m256i inputVal; ++- __m256 ret; +++ float* outputVectorPtr = outputVector; +++ const float iScalar = 1.0 / scalar; +++ __m256 invScalar = _mm256_set1_ps(iScalar); +++ int32_t* inputPtr = (int32_t*)inputVector; +++ __m256i inputVal; +++ __m256 ret; ++ ++- for(;number < oneEightPoints; number++){ ++- // Load the 4 values ++- inputVal = _mm256_loadu_si256((__m256i*)inputPtr); +++ for (; number < oneEightPoints; number++) { +++ // Load the 4 values +++ inputVal = _mm256_loadu_si256((__m256i*)inputPtr); ++ ++- ret = _mm256_cvtepi32_ps(inputVal); ++- ret = _mm256_mul_ps(ret, invScalar); +++ ret = _mm256_cvtepi32_ps(inputVal); +++ ret = _mm256_mul_ps(ret, invScalar); ++ ++- _mm256_storeu_ps(outputVectorPtr, ret); +++ _mm256_storeu_ps(outputVectorPtr, ret); ++ ++- outputVectorPtr += 8; ++- inputPtr += 8; ++- } +++ outputVectorPtr += 8; +++ inputPtr += 8; +++ } ++ ++- number = oneEightPoints * 8; ++- for(; number < num_points; number++){ ++- outputVector[number] =((float)(inputVector[number])) * iScalar; ++- } +++ number = oneEightPoints * 8; +++ for (; number < num_points; number++) { +++ outputVector[number] = ((float)(inputVector[number])) * iScalar; +++ } ++ } ++ #endif /* LV_HAVE_AVX2 */ ++ ++@@ -146,62 +148,63 @@ volk_32i_s32f_convert_32f_u_avx2(float* outputVector, const int32_t* inputVector ++ #ifdef LV_HAVE_SSE2 ++ #include ++ ++-static inline void ++-volk_32i_s32f_convert_32f_u_sse2(float* outputVector, const int32_t* inputVector, ++- const float scalar, unsigned int num_points) +++static inline void volk_32i_s32f_convert_32f_u_sse2(float* outputVector, +++ const int32_t* inputVector, +++ const float scalar, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int quarterPoints = num_points / 4; +++ unsigned int number = 0; +++ const unsigned int quarterPoints = num_points / 4; ++ ++- float* outputVectorPtr = outputVector; ++- const float iScalar = 1.0 / scalar; ++- __m128 invScalar = _mm_set_ps1(iScalar); ++- int32_t* inputPtr = (int32_t*)inputVector; ++- __m128i inputVal; ++- __m128 ret; +++ float* outputVectorPtr = outputVector; +++ const float iScalar = 1.0 / scalar; +++ __m128 invScalar = _mm_set_ps1(iScalar); +++ int32_t* inputPtr = (int32_t*)inputVector; +++ __m128i inputVal; +++ __m128 ret; ++ ++- for(;number < quarterPoints; number++){ ++- // Load the 4 values ++- inputVal = _mm_loadu_si128((__m128i*)inputPtr); +++ for (; number < quarterPoints; number++) { +++ // Load the 4 values +++ inputVal = _mm_loadu_si128((__m128i*)inputPtr); ++ ++- ret = _mm_cvtepi32_ps(inputVal); ++- ret = _mm_mul_ps(ret, invScalar); +++ ret = _mm_cvtepi32_ps(inputVal); +++ ret = _mm_mul_ps(ret, invScalar); ++ ++- _mm_storeu_ps(outputVectorPtr, ret); +++ _mm_storeu_ps(outputVectorPtr, ret); ++ ++- outputVectorPtr += 4; ++- inputPtr += 4; ++- } +++ outputVectorPtr += 4; +++ inputPtr += 4; +++ } ++ ++- number = quarterPoints * 4; ++- for(; number < num_points; number++){ ++- outputVector[number] =((float)(inputVector[number])) * iScalar; ++- } +++ number = quarterPoints * 4; +++ for (; 
number < num_points; number++) { +++ outputVector[number] = ((float)(inputVector[number])) * iScalar; +++ } ++ } ++ #endif /* LV_HAVE_SSE2 */ ++ ++ ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void ++-volk_32i_s32f_convert_32f_generic(float* outputVector, const int32_t* inputVector, ++- const float scalar, unsigned int num_points) +++static inline void volk_32i_s32f_convert_32f_generic(float* outputVector, +++ const int32_t* inputVector, +++ const float scalar, +++ unsigned int num_points) ++ { ++- float* outputVectorPtr = outputVector; ++- const int32_t* inputVectorPtr = inputVector; ++- unsigned int number = 0; ++- const float iScalar = 1.0 / scalar; ++- ++- for(number = 0; number < num_points; number++){ ++- *outputVectorPtr++ = ((float)(*inputVectorPtr++)) * iScalar; ++- } +++ float* outputVectorPtr = outputVector; +++ const int32_t* inputVectorPtr = inputVector; +++ unsigned int number = 0; +++ const float iScalar = 1.0 / scalar; +++ +++ for (number = 0; number < num_points; number++) { +++ *outputVectorPtr++ = ((float)(*inputVectorPtr++)) * iScalar; +++ } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++ #endif /* INCLUDED_volk_32i_s32f_convert_32f_u_H */ ++ ++ ++- ++ #ifndef INCLUDED_volk_32i_s32f_convert_32f_a_H ++ #define INCLUDED_volk_32i_s32f_convert_32f_a_H ++ ++@@ -211,74 +214,76 @@ volk_32i_s32f_convert_32f_generic(float* outputVector, const int32_t* inputVecto ++ #ifdef LV_HAVE_AVX512F ++ #include ++ ++-static inline void ++-volk_32i_s32f_convert_32f_a_avx512f(float* outputVector, const int32_t* inputVector, ++- const float scalar, unsigned int num_points) +++static inline void volk_32i_s32f_convert_32f_a_avx512f(float* outputVector, +++ const int32_t* inputVector, +++ const float scalar, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int onesixteenthPoints = num_points / 16; +++ unsigned int number = 0; +++ const unsigned int onesixteenthPoints = num_points / 16; ++ ++- float* outputVectorPtr = outputVector; ++- const float iScalar = 1.0 / scalar; ++- __m512 invScalar = _mm512_set1_ps(iScalar); ++- int32_t* inputPtr = (int32_t*)inputVector; ++- __m512i inputVal; ++- __m512 ret; +++ float* outputVectorPtr = outputVector; +++ const float iScalar = 1.0 / scalar; +++ __m512 invScalar = _mm512_set1_ps(iScalar); +++ int32_t* inputPtr = (int32_t*)inputVector; +++ __m512i inputVal; +++ __m512 ret; ++ ++- for(;number < onesixteenthPoints; number++){ ++- // Load the values ++- inputVal = _mm512_load_si512((__m512i*)inputPtr); +++ for (; number < onesixteenthPoints; number++) { +++ // Load the values +++ inputVal = _mm512_load_si512((__m512i*)inputPtr); ++ ++- ret = _mm512_cvtepi32_ps(inputVal); ++- ret = _mm512_mul_ps(ret, invScalar); +++ ret = _mm512_cvtepi32_ps(inputVal); +++ ret = _mm512_mul_ps(ret, invScalar); ++ ++- _mm512_store_ps(outputVectorPtr, ret); +++ _mm512_store_ps(outputVectorPtr, ret); ++ ++- outputVectorPtr += 16; ++- inputPtr += 16; ++- } +++ outputVectorPtr += 16; +++ inputPtr += 16; +++ } ++ ++- number = onesixteenthPoints * 16; ++- for(; number < num_points; number++){ ++- outputVector[number] =((float)(inputVector[number])) * iScalar; ++- } +++ number = onesixteenthPoints * 16; +++ for (; number < num_points; number++) { +++ outputVector[number] = ((float)(inputVector[number])) * iScalar; +++ } ++ } ++ #endif /* LV_HAVE_AVX512F */ ++ ++ #ifdef LV_HAVE_AVX2 ++ #include ++ ++-static inline void ++-volk_32i_s32f_convert_32f_a_avx2(float* outputVector, const int32_t* inputVector, ++- const float scalar, unsigned int num_points) +++static 
inline void volk_32i_s32f_convert_32f_a_avx2(float* outputVector, +++ const int32_t* inputVector, +++ const float scalar, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int oneEightPoints = num_points / 8; +++ unsigned int number = 0; +++ const unsigned int oneEightPoints = num_points / 8; ++ ++- float* outputVectorPtr = outputVector; ++- const float iScalar = 1.0 / scalar; ++- __m256 invScalar = _mm256_set1_ps(iScalar); ++- int32_t* inputPtr = (int32_t*)inputVector; ++- __m256i inputVal; ++- __m256 ret; +++ float* outputVectorPtr = outputVector; +++ const float iScalar = 1.0 / scalar; +++ __m256 invScalar = _mm256_set1_ps(iScalar); +++ int32_t* inputPtr = (int32_t*)inputVector; +++ __m256i inputVal; +++ __m256 ret; ++ ++- for(;number < oneEightPoints; number++){ ++- // Load the 4 values ++- inputVal = _mm256_load_si256((__m256i*)inputPtr); +++ for (; number < oneEightPoints; number++) { +++ // Load the 4 values +++ inputVal = _mm256_load_si256((__m256i*)inputPtr); ++ ++- ret = _mm256_cvtepi32_ps(inputVal); ++- ret = _mm256_mul_ps(ret, invScalar); +++ ret = _mm256_cvtepi32_ps(inputVal); +++ ret = _mm256_mul_ps(ret, invScalar); ++ ++- _mm256_store_ps(outputVectorPtr, ret); +++ _mm256_store_ps(outputVectorPtr, ret); ++ ++- outputVectorPtr += 8; ++- inputPtr += 8; ++- } +++ outputVectorPtr += 8; +++ inputPtr += 8; +++ } ++ ++- number = oneEightPoints * 8; ++- for(; number < num_points; number++){ ++- outputVector[number] =((float)(inputVector[number])) * iScalar; ++- } +++ number = oneEightPoints * 8; +++ for (; number < num_points; number++) { +++ outputVector[number] = ((float)(inputVector[number])) * iScalar; +++ } ++ } ++ #endif /* LV_HAVE_AVX2 */ ++ ++@@ -286,59 +291,59 @@ volk_32i_s32f_convert_32f_a_avx2(float* outputVector, const int32_t* inputVector ++ #ifdef LV_HAVE_SSE2 ++ #include ++ ++-static inline void ++-volk_32i_s32f_convert_32f_a_sse2(float* outputVector, const int32_t* inputVector, ++- const float scalar, unsigned int num_points) +++static inline void volk_32i_s32f_convert_32f_a_sse2(float* outputVector, +++ const int32_t* inputVector, +++ const float scalar, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int quarterPoints = num_points / 4; +++ unsigned int number = 0; +++ const unsigned int quarterPoints = num_points / 4; ++ ++- float* outputVectorPtr = outputVector; ++- const float iScalar = 1.0 / scalar; ++- __m128 invScalar = _mm_set_ps1(iScalar); ++- int32_t* inputPtr = (int32_t*)inputVector; ++- __m128i inputVal; ++- __m128 ret; +++ float* outputVectorPtr = outputVector; +++ const float iScalar = 1.0 / scalar; +++ __m128 invScalar = _mm_set_ps1(iScalar); +++ int32_t* inputPtr = (int32_t*)inputVector; +++ __m128i inputVal; +++ __m128 ret; ++ ++- for(;number < quarterPoints; number++){ ++- // Load the 4 values ++- inputVal = _mm_load_si128((__m128i*)inputPtr); +++ for (; number < quarterPoints; number++) { +++ // Load the 4 values +++ inputVal = _mm_load_si128((__m128i*)inputPtr); ++ ++- ret = _mm_cvtepi32_ps(inputVal); ++- ret = _mm_mul_ps(ret, invScalar); +++ ret = _mm_cvtepi32_ps(inputVal); +++ ret = _mm_mul_ps(ret, invScalar); ++ ++- _mm_store_ps(outputVectorPtr, ret); +++ _mm_store_ps(outputVectorPtr, ret); ++ ++- outputVectorPtr += 4; ++- inputPtr += 4; ++- } +++ outputVectorPtr += 4; +++ inputPtr += 4; +++ } ++ ++- number = quarterPoints * 4; ++- for(; number < num_points; number++){ ++- outputVector[number] =((float)(inputVector[number])) * iScalar; ++- } +++ number = quarterPoints * 4; +++ for 
(; number < num_points; number++) { +++ outputVector[number] = ((float)(inputVector[number])) * iScalar; +++ } ++ } ++ #endif /* LV_HAVE_SSE2 */ ++ ++ ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void ++-volk_32i_s32f_convert_32f_a_generic(float* outputVector, const int32_t* inputVector, ++- const float scalar, unsigned int num_points) +++static inline void volk_32i_s32f_convert_32f_a_generic(float* outputVector, +++ const int32_t* inputVector, +++ const float scalar, +++ unsigned int num_points) ++ { ++- float* outputVectorPtr = outputVector; ++- const int32_t* inputVectorPtr = inputVector; ++- unsigned int number = 0; ++- const float iScalar = 1.0 / scalar; ++- ++- for(number = 0; number < num_points; number++){ ++- *outputVectorPtr++ = ((float)(*inputVectorPtr++)) * iScalar; ++- } +++ float* outputVectorPtr = outputVector; +++ const int32_t* inputVectorPtr = inputVector; +++ unsigned int number = 0; +++ const float iScalar = 1.0 / scalar; +++ +++ for (number = 0; number < num_points; number++) { +++ *outputVectorPtr++ = ((float)(*inputVectorPtr++)) * iScalar; +++ } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++ ++- ++- ++ #endif /* INCLUDED_volk_32i_s32f_convert_32f_a_H */ ++diff --git a/kernels/volk/volk_32i_x2_and_32i.h b/kernels/volk/volk_32i_x2_and_32i.h ++index 76f0175..755cfdc 100644 ++--- a/kernels/volk/volk_32i_x2_and_32i.h +++++ b/kernels/volk/volk_32i_x2_and_32i.h ++@@ -29,8 +29,8 @@ ++ * ++ * Dispatcher Prototype ++ * \code ++- * void volk_32i_x2_and_32i(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points) ++- * \endcode +++ * void volk_32i_x2_and_32i(int32_t* cVector, const int32_t* aVector, const int32_t* +++ * bVector, unsigned int num_points) \endcode ++ * ++ * \b Inputs ++ * \li aVector: Input vector of samples. 
++@@ -87,72 +87,75 @@ ++ #ifdef LV_HAVE_AVX512F ++ #include ++ ++-static inline void ++-volk_32i_x2_and_32i_a_avx512f(int32_t* cVector, const int32_t* aVector, ++- const int32_t* bVector, unsigned int num_points) +++static inline void volk_32i_x2_and_32i_a_avx512f(int32_t* cVector, +++ const int32_t* aVector, +++ const int32_t* bVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int sixteenthPoints = num_points / 16; +++ unsigned int number = 0; +++ const unsigned int sixteenthPoints = num_points / 16; ++ ++- int32_t* cPtr = (int32_t*)cVector; ++- const int32_t* aPtr = (int32_t*)aVector; ++- const int32_t* bPtr = (int32_t*)bVector; +++ int32_t* cPtr = (int32_t*)cVector; +++ const int32_t* aPtr = (int32_t*)aVector; +++ const int32_t* bPtr = (int32_t*)bVector; ++ ++- __m512i aVal, bVal, cVal; ++- for(;number < sixteenthPoints; number++){ +++ __m512i aVal, bVal, cVal; +++ for (; number < sixteenthPoints; number++) { ++ ++- aVal = _mm512_load_si512(aPtr); ++- bVal = _mm512_load_si512(bPtr); +++ aVal = _mm512_load_si512(aPtr); +++ bVal = _mm512_load_si512(bPtr); ++ ++- cVal = _mm512_and_si512(aVal, bVal); +++ cVal = _mm512_and_si512(aVal, bVal); ++ ++- _mm512_store_si512(cPtr,cVal); // Store the results back into the C container +++ _mm512_store_si512(cPtr, cVal); // Store the results back into the C container ++ ++- aPtr += 16; ++- bPtr += 16; ++- cPtr += 16; ++- } +++ aPtr += 16; +++ bPtr += 16; +++ cPtr += 16; +++ } ++ ++- number = sixteenthPoints * 16; ++- for(;number < num_points; number++){ ++- cVector[number] = aVector[number] & bVector[number]; ++- } +++ number = sixteenthPoints * 16; +++ for (; number < num_points; number++) { +++ cVector[number] = aVector[number] & bVector[number]; +++ } ++ } ++ #endif /* LV_HAVE_AVX512F */ ++ ++ #ifdef LV_HAVE_AVX2 ++ #include ++ ++-static inline void ++-volk_32i_x2_and_32i_a_avx2(int32_t* cVector, const int32_t* aVector, ++- const int32_t* bVector, unsigned int num_points) +++static inline void volk_32i_x2_and_32i_a_avx2(int32_t* cVector, +++ const int32_t* aVector, +++ const int32_t* bVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int oneEightPoints = num_points / 8; +++ unsigned int number = 0; +++ const unsigned int oneEightPoints = num_points / 8; ++ ++- int32_t* cPtr = cVector; ++- const int32_t* aPtr = aVector; ++- const int32_t* bPtr = bVector; +++ int32_t* cPtr = cVector; +++ const int32_t* aPtr = aVector; +++ const int32_t* bPtr = bVector; ++ ++- __m256i aVal, bVal, cVal; ++- for(;number < oneEightPoints; number++){ +++ __m256i aVal, bVal, cVal; +++ for (; number < oneEightPoints; number++) { ++ ++- aVal = _mm256_load_si256((__m256i*)aPtr); ++- bVal = _mm256_load_si256((__m256i*)bPtr); +++ aVal = _mm256_load_si256((__m256i*)aPtr); +++ bVal = _mm256_load_si256((__m256i*)bPtr); ++ ++- cVal = _mm256_and_si256(aVal, bVal); +++ cVal = _mm256_and_si256(aVal, bVal); ++ ++- _mm256_store_si256((__m256i*)cPtr,cVal); // Store the results back into the C container +++ _mm256_store_si256((__m256i*)cPtr, +++ cVal); // Store the results back into the C container ++ ++- aPtr += 8; ++- bPtr += 8; ++- cPtr += 8; ++- } +++ aPtr += 8; +++ bPtr += 8; +++ cPtr += 8; +++ } ++ ++- number = oneEightPoints * 8; ++- for(;number < num_points; number++){ ++- cVector[number] = aVector[number] & bVector[number]; ++- } +++ number = oneEightPoints * 8; +++ for (; number < num_points; number++) { +++ cVector[number] = aVector[number] & bVector[number]; +++ } ++ } ++ #endif /* LV_HAVE_AVX2 */ 
++ ++@@ -160,36 +163,37 @@ volk_32i_x2_and_32i_a_avx2(int32_t* cVector, const int32_t* aVector, ++ #ifdef LV_HAVE_SSE ++ #include ++ ++-static inline void ++-volk_32i_x2_and_32i_a_sse(int32_t* cVector, const int32_t* aVector, ++- const int32_t* bVector, unsigned int num_points) +++static inline void volk_32i_x2_and_32i_a_sse(int32_t* cVector, +++ const int32_t* aVector, +++ const int32_t* bVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int quarterPoints = num_points / 4; +++ unsigned int number = 0; +++ const unsigned int quarterPoints = num_points / 4; ++ ++- float* cPtr = (float*)cVector; ++- const float* aPtr = (float*)aVector; ++- const float* bPtr = (float*)bVector; +++ float* cPtr = (float*)cVector; +++ const float* aPtr = (float*)aVector; +++ const float* bPtr = (float*)bVector; ++ ++- __m128 aVal, bVal, cVal; ++- for(;number < quarterPoints; number++){ +++ __m128 aVal, bVal, cVal; +++ for (; number < quarterPoints; number++) { ++ ++- aVal = _mm_load_ps(aPtr); ++- bVal = _mm_load_ps(bPtr); +++ aVal = _mm_load_ps(aPtr); +++ bVal = _mm_load_ps(bPtr); ++ ++- cVal = _mm_and_ps(aVal, bVal); +++ cVal = _mm_and_ps(aVal, bVal); ++ ++- _mm_store_ps(cPtr,cVal); // Store the results back into the C container +++ _mm_store_ps(cPtr, cVal); // Store the results back into the C container ++ ++- aPtr += 4; ++- bPtr += 4; ++- cPtr += 4; ++- } +++ aPtr += 4; +++ bPtr += 4; +++ cPtr += 4; +++ } ++ ++- number = quarterPoints * 4; ++- for(;number < num_points; number++){ ++- cVector[number] = aVector[number] & bVector[number]; ++- } +++ number = quarterPoints * 4; +++ for (; number < num_points; number++) { +++ cVector[number] = aVector[number] & bVector[number]; +++ } ++ } ++ #endif /* LV_HAVE_SSE */ ++ ++@@ -197,62 +201,67 @@ volk_32i_x2_and_32i_a_sse(int32_t* cVector, const int32_t* aVector, ++ #ifdef LV_HAVE_NEON ++ #include ++ ++-static inline void ++-volk_32i_x2_and_32i_neon(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points) +++static inline void volk_32i_x2_and_32i_neon(int32_t* cVector, +++ const int32_t* aVector, +++ const int32_t* bVector, +++ unsigned int num_points) ++ { ++- int32_t* cPtr = cVector; ++- const int32_t* aPtr = aVector; ++- const int32_t* bPtr= bVector; ++- unsigned int number = 0; ++- unsigned int quarter_points = num_points / 4; ++- ++- int32x4_t a_val, b_val, c_val; ++- ++- for(number = 0; number < quarter_points; number++){ ++- a_val = vld1q_s32(aPtr); ++- b_val = vld1q_s32(bPtr); ++- c_val = vandq_s32(a_val, b_val); ++- vst1q_s32(cPtr, c_val); ++- aPtr += 4; ++- bPtr += 4; ++- cPtr += 4; ++- } ++- ++- for(number = quarter_points * 4; number < num_points; number++){ ++- *cPtr++ = (*aPtr++) & (*bPtr++); ++- } +++ int32_t* cPtr = cVector; +++ const int32_t* aPtr = aVector; +++ const int32_t* bPtr = bVector; +++ unsigned int number = 0; +++ unsigned int quarter_points = num_points / 4; +++ +++ int32x4_t a_val, b_val, c_val; +++ +++ for (number = 0; number < quarter_points; number++) { +++ a_val = vld1q_s32(aPtr); +++ b_val = vld1q_s32(bPtr); +++ c_val = vandq_s32(a_val, b_val); +++ vst1q_s32(cPtr, c_val); +++ aPtr += 4; +++ bPtr += 4; +++ cPtr += 4; +++ } +++ +++ for (number = quarter_points * 4; number < num_points; number++) { +++ *cPtr++ = (*aPtr++) & (*bPtr++); +++ } ++ } ++ #endif /* LV_HAVE_NEON */ ++ ++ ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void ++-volk_32i_x2_and_32i_generic(int32_t* cVector, const int32_t* aVector, ++- const int32_t* bVector, unsigned int num_points) +++static 
inline void volk_32i_x2_and_32i_generic(int32_t* cVector, +++ const int32_t* aVector, +++ const int32_t* bVector, +++ unsigned int num_points) ++ { ++- int32_t* cPtr = cVector; ++- const int32_t* aPtr = aVector; ++- const int32_t* bPtr= bVector; ++- unsigned int number = 0; ++- ++- for(number = 0; number < num_points; number++){ ++- *cPtr++ = (*aPtr++) & (*bPtr++); ++- } +++ int32_t* cPtr = cVector; +++ const int32_t* aPtr = aVector; +++ const int32_t* bPtr = bVector; +++ unsigned int number = 0; +++ +++ for (number = 0; number < num_points; number++) { +++ *cPtr++ = (*aPtr++) & (*bPtr++); +++ } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++ ++ #ifdef LV_HAVE_ORC ++-extern void ++-volk_32i_x2_and_32i_a_orc_impl(int32_t* cVector, const int32_t* aVector, ++- const int32_t* bVector, unsigned int num_points); ++- ++-static inline void ++-volk_32i_x2_and_32i_u_orc(int32_t* cVector, const int32_t* aVector, ++- const int32_t* bVector, unsigned int num_points) +++extern void volk_32i_x2_and_32i_a_orc_impl(int32_t* cVector, +++ const int32_t* aVector, +++ const int32_t* bVector, +++ unsigned int num_points); +++ +++static inline void volk_32i_x2_and_32i_u_orc(int32_t* cVector, +++ const int32_t* aVector, +++ const int32_t* bVector, +++ unsigned int num_points) ++ { ++- volk_32i_x2_and_32i_a_orc_impl(cVector, aVector, bVector, num_points); +++ volk_32i_x2_and_32i_a_orc_impl(cVector, aVector, bVector, num_points); ++ } ++ #endif /* LV_HAVE_ORC */ ++ ++@@ -269,72 +278,75 @@ volk_32i_x2_and_32i_u_orc(int32_t* cVector, const int32_t* aVector, ++ #ifdef LV_HAVE_AVX512F ++ #include ++ ++-static inline void ++-volk_32i_x2_and_32i_u_avx512f(int32_t* cVector, const int32_t* aVector, ++- const int32_t* bVector, unsigned int num_points) +++static inline void volk_32i_x2_and_32i_u_avx512f(int32_t* cVector, +++ const int32_t* aVector, +++ const int32_t* bVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int sixteenthPoints = num_points / 16; +++ unsigned int number = 0; +++ const unsigned int sixteenthPoints = num_points / 16; ++ ++- int32_t* cPtr = (int32_t*)cVector; ++- const int32_t* aPtr = (int32_t*)aVector; ++- const int32_t* bPtr = (int32_t*)bVector; +++ int32_t* cPtr = (int32_t*)cVector; +++ const int32_t* aPtr = (int32_t*)aVector; +++ const int32_t* bPtr = (int32_t*)bVector; ++ ++- __m512i aVal, bVal, cVal; ++- for(;number < sixteenthPoints; number++){ +++ __m512i aVal, bVal, cVal; +++ for (; number < sixteenthPoints; number++) { ++ ++- aVal = _mm512_loadu_si512(aPtr); ++- bVal = _mm512_loadu_si512(bPtr); +++ aVal = _mm512_loadu_si512(aPtr); +++ bVal = _mm512_loadu_si512(bPtr); ++ ++- cVal = _mm512_and_si512(aVal, bVal); +++ cVal = _mm512_and_si512(aVal, bVal); ++ ++- _mm512_storeu_si512(cPtr,cVal); // Store the results back into the C container +++ _mm512_storeu_si512(cPtr, cVal); // Store the results back into the C container ++ ++- aPtr += 16; ++- bPtr += 16; ++- cPtr += 16; ++- } +++ aPtr += 16; +++ bPtr += 16; +++ cPtr += 16; +++ } ++ ++- number = sixteenthPoints * 16; ++- for(;number < num_points; number++){ ++- cVector[number] = aVector[number] & bVector[number]; ++- } +++ number = sixteenthPoints * 16; +++ for (; number < num_points; number++) { +++ cVector[number] = aVector[number] & bVector[number]; +++ } ++ } ++ #endif /* LV_HAVE_AVX512F */ ++ ++ #ifdef LV_HAVE_AVX2 ++ #include ++ ++-static inline void ++-volk_32i_x2_and_32i_u_avx2(int32_t* cVector, const int32_t* aVector, ++- const int32_t* bVector, unsigned int num_points) +++static inline void 
volk_32i_x2_and_32i_u_avx2(int32_t* cVector, +++ const int32_t* aVector, +++ const int32_t* bVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int oneEightPoints = num_points / 8; +++ unsigned int number = 0; +++ const unsigned int oneEightPoints = num_points / 8; ++ ++- int32_t* cPtr = cVector; ++- const int32_t* aPtr = aVector; ++- const int32_t* bPtr = bVector; +++ int32_t* cPtr = cVector; +++ const int32_t* aPtr = aVector; +++ const int32_t* bPtr = bVector; ++ ++- __m256i aVal, bVal, cVal; ++- for(;number < oneEightPoints; number++){ +++ __m256i aVal, bVal, cVal; +++ for (; number < oneEightPoints; number++) { ++ ++- aVal = _mm256_loadu_si256((__m256i*)aPtr); ++- bVal = _mm256_loadu_si256((__m256i*)bPtr); +++ aVal = _mm256_loadu_si256((__m256i*)aPtr); +++ bVal = _mm256_loadu_si256((__m256i*)bPtr); ++ ++- cVal = _mm256_and_si256(aVal, bVal); +++ cVal = _mm256_and_si256(aVal, bVal); ++ ++- _mm256_storeu_si256((__m256i*)cPtr,cVal); // Store the results back into the C container +++ _mm256_storeu_si256((__m256i*)cPtr, +++ cVal); // Store the results back into the C container ++ ++- aPtr += 8; ++- bPtr += 8; ++- cPtr += 8; ++- } +++ aPtr += 8; +++ bPtr += 8; +++ cPtr += 8; +++ } ++ ++- number = oneEightPoints * 8; ++- for(;number < num_points; number++){ ++- cVector[number] = aVector[number] & bVector[number]; ++- } +++ number = oneEightPoints * 8; +++ for (; number < num_points; number++) { +++ cVector[number] = aVector[number] & bVector[number]; +++ } ++ } ++ #endif /* LV_HAVE_AVX2 */ ++ ++diff --git a/kernels/volk/volk_32i_x2_or_32i.h b/kernels/volk/volk_32i_x2_or_32i.h ++index be4c086..b03db89 100644 ++--- a/kernels/volk/volk_32i_x2_or_32i.h +++++ b/kernels/volk/volk_32i_x2_or_32i.h ++@@ -29,8 +29,8 @@ ++ * ++ * Dispatcher Prototype ++ * \code ++- * void volk_32i_x2_or_32i(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points) ++- * \endcode +++ * void volk_32i_x2_or_32i(int32_t* cVector, const int32_t* aVector, const int32_t* +++ * bVector, unsigned int num_points) \endcode ++ * ++ * \b Inputs ++ * \li aVector: Input vector of samples. 
++@@ -87,72 +87,75 @@ ++ #ifdef LV_HAVE_AVX512F ++ #include ++ ++-static inline void ++-volk_32i_x2_or_32i_a_avx512f(int32_t* cVector, const int32_t* aVector, ++- const int32_t* bVector, unsigned int num_points) +++static inline void volk_32i_x2_or_32i_a_avx512f(int32_t* cVector, +++ const int32_t* aVector, +++ const int32_t* bVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int sixteenthPoints = num_points / 16; +++ unsigned int number = 0; +++ const unsigned int sixteenthPoints = num_points / 16; ++ ++- int32_t* cPtr = (int32_t*)cVector; ++- const int32_t* aPtr = (int32_t*)aVector; ++- const int32_t* bPtr = (int32_t*)bVector; +++ int32_t* cPtr = (int32_t*)cVector; +++ const int32_t* aPtr = (int32_t*)aVector; +++ const int32_t* bPtr = (int32_t*)bVector; ++ ++- __m512i aVal, bVal, cVal; ++- for(;number < sixteenthPoints; number++){ +++ __m512i aVal, bVal, cVal; +++ for (; number < sixteenthPoints; number++) { ++ ++- aVal = _mm512_load_si512(aPtr); ++- bVal = _mm512_load_si512(bPtr); +++ aVal = _mm512_load_si512(aPtr); +++ bVal = _mm512_load_si512(bPtr); ++ ++- cVal = _mm512_or_si512(aVal, bVal); +++ cVal = _mm512_or_si512(aVal, bVal); ++ ++- _mm512_store_si512(cPtr,cVal); // Store the results back into the C container +++ _mm512_store_si512(cPtr, cVal); // Store the results back into the C container ++ ++- aPtr += 16; ++- bPtr += 16; ++- cPtr += 16; ++- } +++ aPtr += 16; +++ bPtr += 16; +++ cPtr += 16; +++ } ++ ++- number = sixteenthPoints * 16; ++- for(;number < num_points; number++){ ++- cVector[number] = aVector[number] | bVector[number]; ++- } +++ number = sixteenthPoints * 16; +++ for (; number < num_points; number++) { +++ cVector[number] = aVector[number] | bVector[number]; +++ } ++ } ++ #endif /* LV_HAVE_AVX512F */ ++ ++ #ifdef LV_HAVE_AVX2 ++ #include ++ ++-static inline void ++-volk_32i_x2_or_32i_a_avx2(int32_t* cVector, const int32_t* aVector, ++- const int32_t* bVector, unsigned int num_points) +++static inline void volk_32i_x2_or_32i_a_avx2(int32_t* cVector, +++ const int32_t* aVector, +++ const int32_t* bVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int oneEightPoints = num_points / 8; +++ unsigned int number = 0; +++ const unsigned int oneEightPoints = num_points / 8; ++ ++- int32_t* cPtr = cVector; ++- const int32_t* aPtr = aVector; ++- const int32_t* bPtr = bVector; +++ int32_t* cPtr = cVector; +++ const int32_t* aPtr = aVector; +++ const int32_t* bPtr = bVector; ++ ++- __m256i aVal, bVal, cVal; ++- for(;number < oneEightPoints; number++){ +++ __m256i aVal, bVal, cVal; +++ for (; number < oneEightPoints; number++) { ++ ++- aVal = _mm256_load_si256((__m256i*)aPtr); ++- bVal = _mm256_load_si256((__m256i*)bPtr); +++ aVal = _mm256_load_si256((__m256i*)aPtr); +++ bVal = _mm256_load_si256((__m256i*)bPtr); ++ ++- cVal = _mm256_or_si256(aVal, bVal); +++ cVal = _mm256_or_si256(aVal, bVal); ++ ++- _mm256_store_si256((__m256i*)cPtr,cVal); // Store the results back into the C container +++ _mm256_store_si256((__m256i*)cPtr, +++ cVal); // Store the results back into the C container ++ ++- aPtr += 8; ++- bPtr += 8; ++- cPtr += 8; ++- } +++ aPtr += 8; +++ bPtr += 8; +++ cPtr += 8; +++ } ++ ++- number = oneEightPoints * 8; ++- for(;number < num_points; number++){ ++- cVector[number] = aVector[number] | bVector[number]; ++- } +++ number = oneEightPoints * 8; +++ for (; number < num_points; number++) { +++ cVector[number] = aVector[number] | bVector[number]; +++ } ++ } ++ #endif /* LV_HAVE_AVX2 */ ++ ++@@ 
-160,35 +163,36 @@ volk_32i_x2_or_32i_a_avx2(int32_t* cVector, const int32_t* aVector, ++ #ifdef LV_HAVE_SSE ++ #include ++ ++-static inline void ++-volk_32i_x2_or_32i_a_sse(int32_t* cVector, const int32_t* aVector, ++- const int32_t* bVector, unsigned int num_points) +++static inline void volk_32i_x2_or_32i_a_sse(int32_t* cVector, +++ const int32_t* aVector, +++ const int32_t* bVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int quarterPoints = num_points / 4; +++ unsigned int number = 0; +++ const unsigned int quarterPoints = num_points / 4; ++ ++- float* cPtr = (float*)cVector; ++- const float* aPtr = (float*)aVector; ++- const float* bPtr = (float*)bVector; +++ float* cPtr = (float*)cVector; +++ const float* aPtr = (float*)aVector; +++ const float* bPtr = (float*)bVector; ++ ++- __m128 aVal, bVal, cVal; ++- for(;number < quarterPoints; number++){ ++- aVal = _mm_load_ps(aPtr); ++- bVal = _mm_load_ps(bPtr); +++ __m128 aVal, bVal, cVal; +++ for (; number < quarterPoints; number++) { +++ aVal = _mm_load_ps(aPtr); +++ bVal = _mm_load_ps(bPtr); ++ ++- cVal = _mm_or_ps(aVal, bVal); +++ cVal = _mm_or_ps(aVal, bVal); ++ ++- _mm_store_ps(cPtr,cVal); // Store the results back into the C container +++ _mm_store_ps(cPtr, cVal); // Store the results back into the C container ++ ++- aPtr += 4; ++- bPtr += 4; ++- cPtr += 4; ++- } +++ aPtr += 4; +++ bPtr += 4; +++ cPtr += 4; +++ } ++ ++- number = quarterPoints * 4; ++- for(;number < num_points; number++){ ++- cVector[number] = aVector[number] | bVector[number]; ++- } +++ number = quarterPoints * 4; +++ for (; number < num_points; number++) { +++ cVector[number] = aVector[number] | bVector[number]; +++ } ++ } ++ #endif /* LV_HAVE_SSE */ ++ ++@@ -196,63 +200,67 @@ volk_32i_x2_or_32i_a_sse(int32_t* cVector, const int32_t* aVector, ++ #ifdef LV_HAVE_NEON ++ #include ++ ++-static inline void ++-volk_32i_x2_or_32i_neon(int32_t* cVector, const int32_t* aVector, ++- const int32_t* bVector, unsigned int num_points) +++static inline void volk_32i_x2_or_32i_neon(int32_t* cVector, +++ const int32_t* aVector, +++ const int32_t* bVector, +++ unsigned int num_points) ++ { ++- int32_t* cPtr = cVector; ++- const int32_t* aPtr = aVector; ++- const int32_t* bPtr= bVector; ++- unsigned int number = 0; ++- unsigned int quarter_points = num_points / 4; ++- ++- int32x4_t a_val, b_val, c_val; ++- ++- for(number = 0; number < quarter_points; number++){ ++- a_val = vld1q_s32(aPtr); ++- b_val = vld1q_s32(bPtr); ++- c_val = vorrq_s32(a_val, b_val); ++- vst1q_s32(cPtr, c_val); ++- aPtr += 4; ++- bPtr += 4; ++- cPtr += 4; ++- } ++- ++- for(number = quarter_points * 4; number < num_points; number++){ ++- *cPtr++ = (*aPtr++) | (*bPtr++); ++- } +++ int32_t* cPtr = cVector; +++ const int32_t* aPtr = aVector; +++ const int32_t* bPtr = bVector; +++ unsigned int number = 0; +++ unsigned int quarter_points = num_points / 4; +++ +++ int32x4_t a_val, b_val, c_val; +++ +++ for (number = 0; number < quarter_points; number++) { +++ a_val = vld1q_s32(aPtr); +++ b_val = vld1q_s32(bPtr); +++ c_val = vorrq_s32(a_val, b_val); +++ vst1q_s32(cPtr, c_val); +++ aPtr += 4; +++ bPtr += 4; +++ cPtr += 4; +++ } +++ +++ for (number = quarter_points * 4; number < num_points; number++) { +++ *cPtr++ = (*aPtr++) | (*bPtr++); +++ } ++ } ++ #endif /* LV_HAVE_NEON */ ++ ++ ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void ++-volk_32i_x2_or_32i_generic(int32_t* cVector, const int32_t* aVector, ++- const int32_t* bVector, unsigned int num_points) +++static inline void 
volk_32i_x2_or_32i_generic(int32_t* cVector, +++ const int32_t* aVector, +++ const int32_t* bVector, +++ unsigned int num_points) ++ { ++- int32_t* cPtr = cVector; ++- const int32_t* aPtr = aVector; ++- const int32_t* bPtr= bVector; ++- unsigned int number = 0; ++- ++- for(number = 0; number < num_points; number++){ ++- *cPtr++ = (*aPtr++) | (*bPtr++); ++- } +++ int32_t* cPtr = cVector; +++ const int32_t* aPtr = aVector; +++ const int32_t* bPtr = bVector; +++ unsigned int number = 0; +++ +++ for (number = 0; number < num_points; number++) { +++ *cPtr++ = (*aPtr++) | (*bPtr++); +++ } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++ ++ #ifdef LV_HAVE_ORC ++-extern void ++-volk_32i_x2_or_32i_a_orc_impl(int32_t* cVector, const int32_t* aVector, ++- const int32_t* bVector, unsigned int num_points); ++- ++-static inline void ++-volk_32i_x2_or_32i_u_orc(int32_t* cVector, const int32_t* aVector, ++- const int32_t* bVector, unsigned int num_points) +++extern void volk_32i_x2_or_32i_a_orc_impl(int32_t* cVector, +++ const int32_t* aVector, +++ const int32_t* bVector, +++ unsigned int num_points); +++ +++static inline void volk_32i_x2_or_32i_u_orc(int32_t* cVector, +++ const int32_t* aVector, +++ const int32_t* bVector, +++ unsigned int num_points) ++ { ++- volk_32i_x2_or_32i_a_orc_impl(cVector, aVector, bVector, num_points); +++ volk_32i_x2_or_32i_a_orc_impl(cVector, aVector, bVector, num_points); ++ } ++ #endif /* LV_HAVE_ORC */ ++ ++@@ -269,72 +277,75 @@ volk_32i_x2_or_32i_u_orc(int32_t* cVector, const int32_t* aVector, ++ #ifdef LV_HAVE_AVX512F ++ #include ++ ++-static inline void ++-volk_32i_x2_or_32i_u_avx512f(int32_t* cVector, const int32_t* aVector, ++- const int32_t* bVector, unsigned int num_points) +++static inline void volk_32i_x2_or_32i_u_avx512f(int32_t* cVector, +++ const int32_t* aVector, +++ const int32_t* bVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int sixteenthPoints = num_points / 16; +++ unsigned int number = 0; +++ const unsigned int sixteenthPoints = num_points / 16; ++ ++- int32_t* cPtr = (int32_t*)cVector; ++- const int32_t* aPtr = (int32_t*)aVector; ++- const int32_t* bPtr = (int32_t*)bVector; +++ int32_t* cPtr = (int32_t*)cVector; +++ const int32_t* aPtr = (int32_t*)aVector; +++ const int32_t* bPtr = (int32_t*)bVector; ++ ++- __m512i aVal, bVal, cVal; ++- for(;number < sixteenthPoints; number++){ +++ __m512i aVal, bVal, cVal; +++ for (; number < sixteenthPoints; number++) { ++ ++- aVal = _mm512_loadu_si512(aPtr); ++- bVal = _mm512_loadu_si512(bPtr); +++ aVal = _mm512_loadu_si512(aPtr); +++ bVal = _mm512_loadu_si512(bPtr); ++ ++- cVal = _mm512_or_si512(aVal, bVal); +++ cVal = _mm512_or_si512(aVal, bVal); ++ ++- _mm512_storeu_si512(cPtr,cVal); // Store the results back into the C container +++ _mm512_storeu_si512(cPtr, cVal); // Store the results back into the C container ++ ++- aPtr += 16; ++- bPtr += 16; ++- cPtr += 16; ++- } +++ aPtr += 16; +++ bPtr += 16; +++ cPtr += 16; +++ } ++ ++- number = sixteenthPoints * 16; ++- for(;number < num_points; number++){ ++- cVector[number] = aVector[number] | bVector[number]; ++- } +++ number = sixteenthPoints * 16; +++ for (; number < num_points; number++) { +++ cVector[number] = aVector[number] | bVector[number]; +++ } ++ } ++ #endif /* LV_HAVE_AVX512F */ ++ ++ #ifdef LV_HAVE_AVX2 ++ #include ++ ++-static inline void ++-volk_32i_x2_or_32i_u_avx2(int32_t* cVector, const int32_t* aVector, ++- const int32_t* bVector, unsigned int num_points) +++static inline void volk_32i_x2_or_32i_u_avx2(int32_t* 
cVector, +++ const int32_t* aVector, +++ const int32_t* bVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int oneEightPoints = num_points / 8; +++ unsigned int number = 0; +++ const unsigned int oneEightPoints = num_points / 8; ++ ++- int32_t* cPtr = cVector; ++- const int32_t* aPtr = aVector; ++- const int32_t* bPtr = bVector; +++ int32_t* cPtr = cVector; +++ const int32_t* aPtr = aVector; +++ const int32_t* bPtr = bVector; ++ ++- __m256i aVal, bVal, cVal; ++- for(;number < oneEightPoints; number++){ +++ __m256i aVal, bVal, cVal; +++ for (; number < oneEightPoints; number++) { ++ ++- aVal = _mm256_loadu_si256((__m256i*)aPtr); ++- bVal = _mm256_loadu_si256((__m256i*)bPtr); +++ aVal = _mm256_loadu_si256((__m256i*)aPtr); +++ bVal = _mm256_loadu_si256((__m256i*)bPtr); ++ ++- cVal = _mm256_or_si256(aVal, bVal); +++ cVal = _mm256_or_si256(aVal, bVal); ++ ++- _mm256_storeu_si256((__m256i*)cPtr,cVal); // Store the results back into the C container +++ _mm256_storeu_si256((__m256i*)cPtr, +++ cVal); // Store the results back into the C container ++ ++- aPtr += 8; ++- bPtr += 8; ++- cPtr += 8; ++- } +++ aPtr += 8; +++ bPtr += 8; +++ cPtr += 8; +++ } ++ ++- number = oneEightPoints * 8; ++- for(;number < num_points; number++){ ++- cVector[number] = aVector[number] | bVector[number]; ++- } +++ number = oneEightPoints * 8; +++ for (; number < num_points; number++) { +++ cVector[number] = aVector[number] | bVector[number]; +++ } ++ } ++ #endif /* LV_HAVE_AVX2 */ ++ ++diff --git a/kernels/volk/volk_32u_byteswap.h b/kernels/volk/volk_32u_byteswap.h ++index f5e6f11..185047c 100644 ++--- a/kernels/volk/volk_32u_byteswap.h +++++ b/kernels/volk/volk_32u_byteswap.h ++@@ -71,38 +71,42 @@ ++ ++ #if LV_HAVE_AVX2 ++ #include ++-static inline void volk_32u_byteswap_u_avx2(uint32_t* intsToSwap, unsigned int num_points){ +++static inline void volk_32u_byteswap_u_avx2(uint32_t* intsToSwap, unsigned int num_points) +++{ ++ ++- unsigned int number; +++ unsigned int number; ++ ++- const unsigned int nPerSet = 8; ++- const uint64_t nSets = num_points / nPerSet; +++ const unsigned int nPerSet = 8; +++ const uint64_t nSets = num_points / nPerSet; ++ ++- uint32_t* inputPtr = intsToSwap; +++ uint32_t* inputPtr = intsToSwap; ++ ++- const uint8_t shuffleVector[32] = { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12, 19, 18, 17, 16, 23, 22, 21, 20, 27, 26, 25, 24, 31, 30, 29, 28 }; +++ const uint8_t shuffleVector[32] = { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, +++ 8, 15, 14, 13, 12, 19, 18, 17, 16, 23, 22, +++ 21, 20, 27, 26, 25, 24, 31, 30, 29, 28 }; ++ ++- const __m256i myShuffle = _mm256_loadu_si256((__m256i*) &shuffleVector); +++ const __m256i myShuffle = _mm256_loadu_si256((__m256i*)&shuffleVector); ++ ++- for (number = 0 ;number < nSets; number++) { +++ for (number = 0; number < nSets; number++) { ++ ++- // Load the 32t values, increment inputPtr later since we're doing it in-place. ++- const __m256i input = _mm256_loadu_si256((__m256i*)inputPtr); ++- const __m256i output = _mm256_shuffle_epi8(input,myShuffle); +++ // Load the 32t values, increment inputPtr later since we're doing it in-place. 
+++ const __m256i input = _mm256_loadu_si256((__m256i*)inputPtr); +++ const __m256i output = _mm256_shuffle_epi8(input, myShuffle); ++ ++- // Store the results ++- _mm256_storeu_si256((__m256i*)inputPtr, output); ++- inputPtr += nPerSet; ++- } ++- _mm256_zeroupper(); ++- ++- // Byteswap any remaining points: ++- for(number = nSets * nPerSet; number < num_points; number++){ ++- uint32_t outputVal = *inputPtr; ++- outputVal = (((outputVal >> 24) & 0xff) | ((outputVal >> 8) & 0x0000ff00) | ((outputVal << 8) & 0x00ff0000) | ((outputVal << 24) & 0xff000000)); ++- *inputPtr = outputVal; ++- inputPtr++; ++- } +++ // Store the results +++ _mm256_storeu_si256((__m256i*)inputPtr, output); +++ inputPtr += nPerSet; +++ } +++ _mm256_zeroupper(); +++ +++ // Byteswap any remaining points: +++ for (number = nSets * nPerSet; number < num_points; number++) { +++ uint32_t outputVal = *inputPtr; +++ outputVal = (((outputVal >> 24) & 0xff) | ((outputVal >> 8) & 0x0000ff00) | +++ ((outputVal << 8) & 0x00ff0000) | ((outputVal << 24) & 0xff000000)); +++ *inputPtr = outputVal; +++ inputPtr++; +++ } ++ } ++ #endif /* LV_HAVE_AVX2 */ ++ ++@@ -110,42 +114,44 @@ static inline void volk_32u_byteswap_u_avx2(uint32_t* intsToSwap, unsigned int n ++ #ifdef LV_HAVE_SSE2 ++ #include ++ ++-static inline void volk_32u_byteswap_u_sse2(uint32_t* intsToSwap, unsigned int num_points){ ++- unsigned int number = 0; ++- ++- uint32_t* inputPtr = intsToSwap; ++- __m128i input, byte1, byte2, byte3, byte4, output; ++- __m128i byte2mask = _mm_set1_epi32(0x00FF0000); ++- __m128i byte3mask = _mm_set1_epi32(0x0000FF00); ++- ++- const uint64_t quarterPoints = num_points / 4; ++- for(;number < quarterPoints; number++){ ++- // Load the 32t values, increment inputPtr later since we're doing it in-place. ++- input = _mm_loadu_si128((__m128i*)inputPtr); ++- // Do the four shifts ++- byte1 = _mm_slli_epi32(input, 24); ++- byte2 = _mm_slli_epi32(input, 8); ++- byte3 = _mm_srli_epi32(input, 8); ++- byte4 = _mm_srli_epi32(input, 24); ++- // Or bytes together ++- output = _mm_or_si128(byte1, byte4); ++- byte2 = _mm_and_si128(byte2, byte2mask); ++- output = _mm_or_si128(output, byte2); ++- byte3 = _mm_and_si128(byte3, byte3mask); ++- output = _mm_or_si128(output, byte3); ++- // Store the results ++- _mm_storeu_si128((__m128i*)inputPtr, output); ++- inputPtr += 4; ++- } ++- ++- // Byteswap any remaining points: ++- number = quarterPoints*4; ++- for(; number < num_points; number++){ ++- uint32_t outputVal = *inputPtr; ++- outputVal = (((outputVal >> 24) & 0xff) | ((outputVal >> 8) & 0x0000ff00) | ((outputVal << 8) & 0x00ff0000) | ((outputVal << 24) & 0xff000000)); ++- *inputPtr = outputVal; ++- inputPtr++; ++- } +++static inline void volk_32u_byteswap_u_sse2(uint32_t* intsToSwap, unsigned int num_points) +++{ +++ unsigned int number = 0; +++ +++ uint32_t* inputPtr = intsToSwap; +++ __m128i input, byte1, byte2, byte3, byte4, output; +++ __m128i byte2mask = _mm_set1_epi32(0x00FF0000); +++ __m128i byte3mask = _mm_set1_epi32(0x0000FF00); +++ +++ const uint64_t quarterPoints = num_points / 4; +++ for (; number < quarterPoints; number++) { +++ // Load the 32t values, increment inputPtr later since we're doing it in-place. 
+++ input = _mm_loadu_si128((__m128i*)inputPtr); +++ // Do the four shifts +++ byte1 = _mm_slli_epi32(input, 24); +++ byte2 = _mm_slli_epi32(input, 8); +++ byte3 = _mm_srli_epi32(input, 8); +++ byte4 = _mm_srli_epi32(input, 24); +++ // Or bytes together +++ output = _mm_or_si128(byte1, byte4); +++ byte2 = _mm_and_si128(byte2, byte2mask); +++ output = _mm_or_si128(output, byte2); +++ byte3 = _mm_and_si128(byte3, byte3mask); +++ output = _mm_or_si128(output, byte3); +++ // Store the results +++ _mm_storeu_si128((__m128i*)inputPtr, output); +++ inputPtr += 4; +++ } +++ +++ // Byteswap any remaining points: +++ number = quarterPoints * 4; +++ for (; number < num_points; number++) { +++ uint32_t outputVal = *inputPtr; +++ outputVal = (((outputVal >> 24) & 0xff) | ((outputVal >> 8) & 0x0000ff00) | +++ ((outputVal << 8) & 0x00ff0000) | ((outputVal << 24) & 0xff000000)); +++ *inputPtr = outputVal; +++ inputPtr++; +++ } ++ } ++ #endif /* LV_HAVE_SSE2 */ ++ ++@@ -153,100 +159,106 @@ static inline void volk_32u_byteswap_u_sse2(uint32_t* intsToSwap, unsigned int n ++ #ifdef LV_HAVE_NEON ++ #include ++ ++-static inline void volk_32u_byteswap_neon(uint32_t* intsToSwap, unsigned int num_points){ ++- uint32_t* inputPtr = intsToSwap; ++- unsigned int number = 0; ++- unsigned int n8points = num_points / 8; ++- ++- uint8x8x4_t input_table; ++- uint8x8_t int_lookup01, int_lookup23, int_lookup45, int_lookup67; ++- uint8x8_t swapped_int01, swapped_int23, swapped_int45, swapped_int67; ++- ++- /* these magic numbers are used as byte-indices in the LUT. ++- they are pre-computed to save time. A simple C program ++- can calculate them; for example for lookup01: ++- uint8_t chars[8] = {24, 16, 8, 0, 25, 17, 9, 1}; ++- for(ii=0; ii < 8; ++ii) { ++- index += ((uint64_t)(*(chars+ii))) << (ii*8); +++static inline void volk_32u_byteswap_neon(uint32_t* intsToSwap, unsigned int num_points) +++{ +++ uint32_t* inputPtr = intsToSwap; +++ unsigned int number = 0; +++ unsigned int n8points = num_points / 8; +++ +++ uint8x8x4_t input_table; +++ uint8x8_t int_lookup01, int_lookup23, int_lookup45, int_lookup67; +++ uint8x8_t swapped_int01, swapped_int23, swapped_int45, swapped_int67; +++ +++ /* these magic numbers are used as byte-indices in the LUT. +++ they are pre-computed to save time. 
A simple C program +++ can calculate them; for example for lookup01: +++ uint8_t chars[8] = {24, 16, 8, 0, 25, 17, 9, 1}; +++ for(ii=0; ii < 8; ++ii) { +++ index += ((uint64_t)(*(chars+ii))) << (ii*8); +++ } +++ */ +++ int_lookup01 = vcreate_u8(74609667900706840); +++ int_lookup23 = vcreate_u8(219290013576860186); +++ int_lookup45 = vcreate_u8(363970359253013532); +++ int_lookup67 = vcreate_u8(508650704929166878); +++ +++ for (number = 0; number < n8points; ++number) { +++ input_table = vld4_u8((uint8_t*)inputPtr); +++ swapped_int01 = vtbl4_u8(input_table, int_lookup01); +++ swapped_int23 = vtbl4_u8(input_table, int_lookup23); +++ swapped_int45 = vtbl4_u8(input_table, int_lookup45); +++ swapped_int67 = vtbl4_u8(input_table, int_lookup67); +++ vst1_u8((uint8_t*)inputPtr, swapped_int01); +++ vst1_u8((uint8_t*)(inputPtr + 2), swapped_int23); +++ vst1_u8((uint8_t*)(inputPtr + 4), swapped_int45); +++ vst1_u8((uint8_t*)(inputPtr + 6), swapped_int67); +++ +++ inputPtr += 8; +++ } +++ +++ for (number = n8points * 8; number < num_points; ++number) { +++ uint32_t output = *inputPtr; +++ output = (((output >> 24) & 0xff) | ((output >> 8) & 0x0000ff00) | +++ ((output << 8) & 0x00ff0000) | ((output << 24) & 0xff000000)); +++ +++ *inputPtr = output; +++ inputPtr++; ++ } ++- */ ++- int_lookup01 = vcreate_u8(74609667900706840); ++- int_lookup23 = vcreate_u8(219290013576860186); ++- int_lookup45 = vcreate_u8(363970359253013532); ++- int_lookup67 = vcreate_u8(508650704929166878); ++- ++- for(number = 0; number < n8points; ++number){ ++- input_table = vld4_u8((uint8_t*) inputPtr); ++- swapped_int01 = vtbl4_u8(input_table, int_lookup01); ++- swapped_int23 = vtbl4_u8(input_table, int_lookup23); ++- swapped_int45 = vtbl4_u8(input_table, int_lookup45); ++- swapped_int67 = vtbl4_u8(input_table, int_lookup67); ++- vst1_u8((uint8_t*) inputPtr, swapped_int01); ++- vst1_u8((uint8_t*) (inputPtr+2), swapped_int23); ++- vst1_u8((uint8_t*) (inputPtr+4), swapped_int45); ++- vst1_u8((uint8_t*) (inputPtr+6), swapped_int67); ++- ++- inputPtr += 8; ++- } ++- ++- for(number = n8points * 8; number < num_points; ++number){ ++- uint32_t output = *inputPtr; ++- output = (((output >> 24) & 0xff) | ((output >> 8) & 0x0000ff00) | ((output << 8) & 0x00ff0000) | ((output << 24) & 0xff000000)); ++- ++- *inputPtr = output; ++- inputPtr++; ++- } ++ } ++ #endif /* LV_HAVE_NEON */ ++ ++ #ifdef LV_HAVE_NEONV8 ++ #include ++ ++-static inline void volk_32u_byteswap_neonv8(uint32_t* intsToSwap, unsigned int num_points){ ++- uint32_t* inputPtr = (uint32_t*)intsToSwap; ++- const unsigned int n8points = num_points / 8; ++- uint8x16_t input; ++- uint8x16_t idx = { 3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12 }; ++- ++- unsigned int number = 0; ++- for(number = 0; number < n8points; ++number){ ++- __VOLK_PREFETCH(inputPtr+8); ++- input = vld1q_u8((uint8_t*) inputPtr); ++- input = vqtbl1q_u8(input, idx); ++- vst1q_u8((uint8_t*) inputPtr, input); ++- inputPtr += 4; ++- ++- input = vld1q_u8((uint8_t*) inputPtr); ++- input = vqtbl1q_u8(input, idx); ++- vst1q_u8((uint8_t*) inputPtr, input); ++- inputPtr += 4; ++- } ++- ++- for(number = n8points * 8; number < num_points; ++number){ ++- uint32_t output = *inputPtr; +++static inline void volk_32u_byteswap_neonv8(uint32_t* intsToSwap, unsigned int num_points) +++{ +++ uint32_t* inputPtr = (uint32_t*)intsToSwap; +++ const unsigned int n8points = num_points / 8; +++ uint8x16_t input; +++ uint8x16_t idx = { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 }; +++ +++ unsigned int number = 0; +++ for (number = 0; 
number < n8points; ++number) { +++ __VOLK_PREFETCH(inputPtr + 8); +++ input = vld1q_u8((uint8_t*)inputPtr); +++ input = vqtbl1q_u8(input, idx); +++ vst1q_u8((uint8_t*)inputPtr, input); +++ inputPtr += 4; +++ +++ input = vld1q_u8((uint8_t*)inputPtr); +++ input = vqtbl1q_u8(input, idx); +++ vst1q_u8((uint8_t*)inputPtr, input); +++ inputPtr += 4; +++ } ++ ++- output = (((output >> 24) & 0xff) | ((output >> 8) & 0x0000ff00) | ((output << 8) & 0x00ff0000) | ((output << 24) & 0xff000000)); +++ for (number = n8points * 8; number < num_points; ++number) { +++ uint32_t output = *inputPtr; ++ ++- *inputPtr++ = output; ++- } +++ output = (((output >> 24) & 0xff) | ((output >> 8) & 0x0000ff00) | +++ ((output << 8) & 0x00ff0000) | ((output << 24) & 0xff000000)); ++ +++ *inputPtr++ = output; +++ } ++ } ++ #endif /* LV_HAVE_NEONV8 */ ++ ++ ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void volk_32u_byteswap_generic(uint32_t* intsToSwap, unsigned int num_points){ ++- uint32_t* inputPtr = intsToSwap; +++static inline void volk_32u_byteswap_generic(uint32_t* intsToSwap, +++ unsigned int num_points) +++{ +++ uint32_t* inputPtr = intsToSwap; ++ ++- unsigned int point; ++- for(point = 0; point < num_points; point++){ ++- uint32_t output = *inputPtr; ++- output = (((output >> 24) & 0xff) | ((output >> 8) & 0x0000ff00) | ((output << 8) & 0x00ff0000) | ((output << 24) & 0xff000000)); +++ unsigned int point; +++ for (point = 0; point < num_points; point++) { +++ uint32_t output = *inputPtr; +++ output = (((output >> 24) & 0xff) | ((output >> 8) & 0x0000ff00) | +++ ((output << 8) & 0x00ff0000) | ((output << 24) & 0xff000000)); ++ ++- *inputPtr = output; ++- inputPtr++; ++- } +++ *inputPtr = output; +++ inputPtr++; +++ } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++@@ -261,38 +273,42 @@ static inline void volk_32u_byteswap_generic(uint32_t* intsToSwap, unsigned int ++ ++ #if LV_HAVE_AVX2 ++ #include ++-static inline void volk_32u_byteswap_a_avx2(uint32_t* intsToSwap, unsigned int num_points){ +++static inline void volk_32u_byteswap_a_avx2(uint32_t* intsToSwap, unsigned int num_points) +++{ ++ ++- unsigned int number; +++ unsigned int number; ++ ++- const unsigned int nPerSet = 8; ++- const uint64_t nSets = num_points / nPerSet; +++ const unsigned int nPerSet = 8; +++ const uint64_t nSets = num_points / nPerSet; ++ ++- uint32_t* inputPtr = intsToSwap; +++ uint32_t* inputPtr = intsToSwap; ++ ++- const uint8_t shuffleVector[32] = { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12, 19, 18, 17, 16, 23, 22, 21, 20, 27, 26, 25, 24, 31, 30, 29, 28 }; +++ const uint8_t shuffleVector[32] = { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, +++ 8, 15, 14, 13, 12, 19, 18, 17, 16, 23, 22, +++ 21, 20, 27, 26, 25, 24, 31, 30, 29, 28 }; ++ ++- const __m256i myShuffle = _mm256_loadu_si256((__m256i*) &shuffleVector); +++ const __m256i myShuffle = _mm256_loadu_si256((__m256i*)&shuffleVector); ++ ++- for (number = 0 ;number < nSets; number++) { +++ for (number = 0; number < nSets; number++) { ++ ++- // Load the 32t values, increment inputPtr later since we're doing it in-place. ++- const __m256i input = _mm256_load_si256((__m256i*)inputPtr); ++- const __m256i output = _mm256_shuffle_epi8(input,myShuffle); +++ // Load the 32t values, increment inputPtr later since we're doing it in-place. 
+++ const __m256i input = _mm256_load_si256((__m256i*)inputPtr); +++ const __m256i output = _mm256_shuffle_epi8(input, myShuffle); ++ ++- // Store the results ++- _mm256_store_si256((__m256i*)inputPtr, output); ++- inputPtr += nPerSet; ++- } ++- _mm256_zeroupper(); ++- ++- // Byteswap any remaining points: ++- for(number = nSets * nPerSet; number < num_points; number++){ ++- uint32_t outputVal = *inputPtr; ++- outputVal = (((outputVal >> 24) & 0xff) | ((outputVal >> 8) & 0x0000ff00) | ((outputVal << 8) & 0x00ff0000) | ((outputVal << 24) & 0xff000000)); ++- *inputPtr = outputVal; ++- inputPtr++; ++- } +++ // Store the results +++ _mm256_store_si256((__m256i*)inputPtr, output); +++ inputPtr += nPerSet; +++ } +++ _mm256_zeroupper(); +++ +++ // Byteswap any remaining points: +++ for (number = nSets * nPerSet; number < num_points; number++) { +++ uint32_t outputVal = *inputPtr; +++ outputVal = (((outputVal >> 24) & 0xff) | ((outputVal >> 8) & 0x0000ff00) | +++ ((outputVal << 8) & 0x00ff0000) | ((outputVal << 24) & 0xff000000)); +++ *inputPtr = outputVal; +++ inputPtr++; +++ } ++ } ++ #endif /* LV_HAVE_AVX2 */ ++ ++@@ -301,63 +317,66 @@ static inline void volk_32u_byteswap_a_avx2(uint32_t* intsToSwap, unsigned int n ++ #include ++ ++ ++-static inline void volk_32u_byteswap_a_sse2(uint32_t* intsToSwap, unsigned int num_points){ ++- unsigned int number = 0; ++- ++- uint32_t* inputPtr = intsToSwap; ++- __m128i input, byte1, byte2, byte3, byte4, output; ++- __m128i byte2mask = _mm_set1_epi32(0x00FF0000); ++- __m128i byte3mask = _mm_set1_epi32(0x0000FF00); ++- ++- const uint64_t quarterPoints = num_points / 4; ++- for(;number < quarterPoints; number++){ ++- // Load the 32t values, increment inputPtr later since we're doing it in-place. ++- input = _mm_load_si128((__m128i*)inputPtr); ++- // Do the four shifts ++- byte1 = _mm_slli_epi32(input, 24); ++- byte2 = _mm_slli_epi32(input, 8); ++- byte3 = _mm_srli_epi32(input, 8); ++- byte4 = _mm_srli_epi32(input, 24); ++- // Or bytes together ++- output = _mm_or_si128(byte1, byte4); ++- byte2 = _mm_and_si128(byte2, byte2mask); ++- output = _mm_or_si128(output, byte2); ++- byte3 = _mm_and_si128(byte3, byte3mask); ++- output = _mm_or_si128(output, byte3); ++- // Store the results ++- _mm_store_si128((__m128i*)inputPtr, output); ++- inputPtr += 4; ++- } ++- ++- // Byteswap any remaining points: ++- number = quarterPoints*4; ++- for(; number < num_points; number++){ ++- uint32_t outputVal = *inputPtr; ++- outputVal = (((outputVal >> 24) & 0xff) | ((outputVal >> 8) & 0x0000ff00) | ((outputVal << 8) & 0x00ff0000) | ((outputVal << 24) & 0xff000000)); ++- *inputPtr = outputVal; ++- inputPtr++; ++- } +++static inline void volk_32u_byteswap_a_sse2(uint32_t* intsToSwap, unsigned int num_points) +++{ +++ unsigned int number = 0; +++ +++ uint32_t* inputPtr = intsToSwap; +++ __m128i input, byte1, byte2, byte3, byte4, output; +++ __m128i byte2mask = _mm_set1_epi32(0x00FF0000); +++ __m128i byte3mask = _mm_set1_epi32(0x0000FF00); +++ +++ const uint64_t quarterPoints = num_points / 4; +++ for (; number < quarterPoints; number++) { +++ // Load the 32t values, increment inputPtr later since we're doing it in-place. 
+++ input = _mm_load_si128((__m128i*)inputPtr); +++ // Do the four shifts +++ byte1 = _mm_slli_epi32(input, 24); +++ byte2 = _mm_slli_epi32(input, 8); +++ byte3 = _mm_srli_epi32(input, 8); +++ byte4 = _mm_srli_epi32(input, 24); +++ // Or bytes together +++ output = _mm_or_si128(byte1, byte4); +++ byte2 = _mm_and_si128(byte2, byte2mask); +++ output = _mm_or_si128(output, byte2); +++ byte3 = _mm_and_si128(byte3, byte3mask); +++ output = _mm_or_si128(output, byte3); +++ // Store the results +++ _mm_store_si128((__m128i*)inputPtr, output); +++ inputPtr += 4; +++ } +++ +++ // Byteswap any remaining points: +++ number = quarterPoints * 4; +++ for (; number < num_points; number++) { +++ uint32_t outputVal = *inputPtr; +++ outputVal = (((outputVal >> 24) & 0xff) | ((outputVal >> 8) & 0x0000ff00) | +++ ((outputVal << 8) & 0x00ff0000) | ((outputVal << 24) & 0xff000000)); +++ *inputPtr = outputVal; +++ inputPtr++; +++ } ++ } ++ #endif /* LV_HAVE_SSE2 */ ++ ++ ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void volk_32u_byteswap_a_generic(uint32_t* intsToSwap, unsigned int num_points){ ++- uint32_t* inputPtr = intsToSwap; +++static inline void volk_32u_byteswap_a_generic(uint32_t* intsToSwap, +++ unsigned int num_points) +++{ +++ uint32_t* inputPtr = intsToSwap; ++ ++- unsigned int point; ++- for(point = 0; point < num_points; point++){ ++- uint32_t output = *inputPtr; ++- output = (((output >> 24) & 0xff) | ((output >> 8) & 0x0000ff00) | ((output << 8) & 0x00ff0000) | ((output << 24) & 0xff000000)); +++ unsigned int point; +++ for (point = 0; point < num_points; point++) { +++ uint32_t output = *inputPtr; +++ output = (((output >> 24) & 0xff) | ((output >> 8) & 0x0000ff00) | +++ ((output << 8) & 0x00ff0000) | ((output << 24) & 0xff000000)); ++ ++- *inputPtr = output; ++- inputPtr++; ++- } +++ *inputPtr = output; +++ inputPtr++; +++ } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++ ++- ++- ++ #endif /* INCLUDED_volk_32u_byteswap_a_H */ ++diff --git a/kernels/volk/volk_32u_byteswappuppet_32u.h b/kernels/volk/volk_32u_byteswappuppet_32u.h ++index c33a5fc..ca5ca17 100644 ++--- a/kernels/volk/volk_32u_byteswappuppet_32u.h +++++ b/kernels/volk/volk_32u_byteswappuppet_32u.h ++@@ -1,70 +1,84 @@ ++ #ifndef INCLUDED_volk_32u_byteswappuppet_32u_H ++ #define INCLUDED_volk_32u_byteswappuppet_32u_H ++ ++-#include ++ #include ++ #include +++#include ++ ++ #ifdef LV_HAVE_GENERIC ++-static inline void volk_32u_byteswappuppet_32u_generic(uint32_t*output, uint32_t* intsToSwap, unsigned int num_points){ +++static inline void volk_32u_byteswappuppet_32u_generic(uint32_t* output, +++ uint32_t* intsToSwap, +++ unsigned int num_points) +++{ ++ ++ volk_32u_byteswap_generic((uint32_t*)intsToSwap, num_points); ++ memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint32_t)); ++- ++ } ++ #endif ++ ++ #ifdef LV_HAVE_NEON ++-static inline void volk_32u_byteswappuppet_32u_neon(uint32_t*output, uint32_t* intsToSwap, unsigned int num_points){ +++static inline void volk_32u_byteswappuppet_32u_neon(uint32_t* output, +++ uint32_t* intsToSwap, +++ unsigned int num_points) +++{ ++ ++ volk_32u_byteswap_neon((uint32_t*)intsToSwap, num_points); ++ memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint32_t)); ++- ++ } ++ #endif ++ ++ #ifdef LV_HAVE_NEONV8 ++-static inline void volk_32u_byteswappuppet_32u_neonv8(uint32_t*output, uint32_t* intsToSwap, unsigned int num_points){ +++static inline void volk_32u_byteswappuppet_32u_neonv8(uint32_t* output, +++ uint32_t* intsToSwap, +++ unsigned int num_points) +++{ ++ ++ 
volk_32u_byteswap_neonv8((uint32_t*)intsToSwap, num_points); ++ memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint32_t)); ++- ++ } ++ #endif ++ ++ #ifdef LV_HAVE_SSE2 ++-static inline void volk_32u_byteswappuppet_32u_u_sse2(uint32_t *output, uint32_t* intsToSwap, unsigned int num_points){ +++static inline void volk_32u_byteswappuppet_32u_u_sse2(uint32_t* output, +++ uint32_t* intsToSwap, +++ unsigned int num_points) +++{ ++ ++ volk_32u_byteswap_u_sse2((uint32_t*)intsToSwap, num_points); ++ memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint32_t)); ++- ++ } ++ #endif ++ ++ #ifdef LV_HAVE_SSE2 ++-static inline void volk_32u_byteswappuppet_32u_a_sse2(uint32_t* output, uint32_t* intsToSwap, unsigned int num_points){ +++static inline void volk_32u_byteswappuppet_32u_a_sse2(uint32_t* output, +++ uint32_t* intsToSwap, +++ unsigned int num_points) +++{ ++ ++ volk_32u_byteswap_a_sse2((uint32_t*)intsToSwap, num_points); ++ memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint32_t)); ++- ++ } ++ #endif ++ ++ #ifdef LV_HAVE_AVX2 ++-static inline void volk_32u_byteswappuppet_32u_u_avx2(uint32_t* output, uint32_t* intsToSwap, unsigned int num_points){ +++static inline void volk_32u_byteswappuppet_32u_u_avx2(uint32_t* output, +++ uint32_t* intsToSwap, +++ unsigned int num_points) +++{ ++ ++ volk_32u_byteswap_u_avx2((uint32_t*)intsToSwap, num_points); ++ memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint32_t)); ++- ++ } ++ #endif ++ ++ #ifdef LV_HAVE_AVX2 ++-static inline void volk_32u_byteswappuppet_32u_a_avx2(uint32_t* output, uint32_t* intsToSwap, unsigned int num_points){ +++static inline void volk_32u_byteswappuppet_32u_a_avx2(uint32_t* output, +++ uint32_t* intsToSwap, +++ unsigned int num_points) +++{ ++ ++ volk_32u_byteswap_a_avx2((uint32_t*)intsToSwap, num_points); ++ memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint32_t)); ++- ++ } ++ #endif ++ ++diff --git a/kernels/volk/volk_32u_popcnt.h b/kernels/volk/volk_32u_popcnt.h ++index 7aa4d43..f6f0c10 100644 ++--- a/kernels/volk/volk_32u_popcnt.h +++++ b/kernels/volk/volk_32u_popcnt.h ++@@ -56,24 +56,23 @@ ++ #ifndef INCLUDED_VOLK_32u_POPCNT_A16_H ++ #define INCLUDED_VOLK_32u_POPCNT_A16_H ++ ++-#include ++ #include +++#include ++ ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void ++-volk_32u_popcnt_generic(uint32_t* ret, const uint32_t value) +++static inline void volk_32u_popcnt_generic(uint32_t* ret, const uint32_t value) ++ { ++- // This is faster than a lookup table ++- uint32_t retVal = value; +++ // This is faster than a lookup table +++ uint32_t retVal = value; ++ ++- retVal = (retVal & 0x55555555) + (retVal >> 1 & 0x55555555); ++- retVal = (retVal & 0x33333333) + (retVal >> 2 & 0x33333333); ++- retVal = (retVal + (retVal >> 4)) & 0x0F0F0F0F; ++- retVal = (retVal + (retVal >> 8)); ++- retVal = (retVal + (retVal >> 16)) & 0x0000003F; +++ retVal = (retVal & 0x55555555) + (retVal >> 1 & 0x55555555); +++ retVal = (retVal & 0x33333333) + (retVal >> 2 & 0x33333333); +++ retVal = (retVal + (retVal >> 4)) & 0x0F0F0F0F; +++ retVal = (retVal + (retVal >> 8)); +++ retVal = (retVal + (retVal >> 16)) & 0x0000003F; ++ ++- *ret = retVal; +++ *ret = retVal; ++ } ++ ++ #endif /*LV_HAVE_GENERIC*/ ++@@ -83,10 +82,9 @@ volk_32u_popcnt_generic(uint32_t* ret, const uint32_t value) ++ ++ #include ++ ++-static inline void ++-volk_32u_popcnt_a_sse4_2(uint32_t* ret, const uint32_t value) +++static inline void volk_32u_popcnt_a_sse4_2(uint32_t* ret, const uint32_t value) ++ { ++- *ret = 
_mm_popcnt_u32(value); +++ *ret = _mm_popcnt_u32(value); ++ } ++ ++ #endif /*LV_HAVE_SSE4_2*/ ++diff --git a/kernels/volk/volk_32u_popcntpuppet_32u.h b/kernels/volk/volk_32u_popcntpuppet_32u.h ++index d5edd35..c0389cc 100644 ++--- a/kernels/volk/volk_32u_popcntpuppet_32u.h +++++ b/kernels/volk/volk_32u_popcntpuppet_32u.h ++@@ -27,19 +27,25 @@ ++ #include ++ ++ #ifdef LV_HAVE_GENERIC ++-static inline void volk_32u_popcntpuppet_32u_generic(uint32_t* outVector, const uint32_t* inVector, unsigned int num_points){ +++static inline void volk_32u_popcntpuppet_32u_generic(uint32_t* outVector, +++ const uint32_t* inVector, +++ unsigned int num_points) +++{ ++ unsigned int ii; ++- for(ii=0; ii < num_points; ++ii) { ++- volk_32u_popcnt_generic(outVector+ii, *(inVector+ii) ); +++ for (ii = 0; ii < num_points; ++ii) { +++ volk_32u_popcnt_generic(outVector + ii, *(inVector + ii)); ++ } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++ #ifdef LV_HAVE_SSE4_2 ++-static inline void volk_32u_popcntpuppet_32u_a_sse4_2(uint32_t* outVector, const uint32_t* inVector, unsigned int num_points){ +++static inline void volk_32u_popcntpuppet_32u_a_sse4_2(uint32_t* outVector, +++ const uint32_t* inVector, +++ unsigned int num_points) +++{ ++ unsigned int ii; ++- for(ii=0; ii < num_points; ++ii) { ++- volk_32u_popcnt_a_sse4_2(outVector+ii, *(inVector+ii) ); +++ for (ii = 0; ii < num_points; ++ii) { +++ volk_32u_popcnt_a_sse4_2(outVector + ii, *(inVector + ii)); ++ } ++ } ++ #endif /* LV_HAVE_SSE4_2 */ ++diff --git a/kernels/volk/volk_32u_reverse_32u.h b/kernels/volk/volk_32u_reverse_32u.h ++index b670b13..aff0a9e 100644 ++--- a/kernels/volk/volk_32u_reverse_32u.h +++++ b/kernels/volk/volk_32u_reverse_32u.h ++@@ -24,7 +24,8 @@ ++ * \b bit reversal of the input 32 bit word ++ ++ * Dispatcher Prototype ++- * \code volk_32u_reverse_32u(uint32_t *outputVector, uint32_t *inputVector; unsigned int num_points); +++ * \code volk_32u_reverse_32u(uint32_t *outputVector, uint32_t *inputVector; unsigned int +++ num_points); ++ * \endcode ++ * ++ * \b Inputs ++@@ -32,338 +33,344 @@ ++ * \li num_points The number of data points. 
++ * ++ * \b Outputs ++- * \li outputVector: The vector where the results will be stored, which is the bit-reversed input +++ * \li outputVector: The vector where the results will be stored, which is the +++ bit-reversed input ++ * ++ * \endcode ++ */ ++ #ifndef INCLUDED_VOLK_32u_REVERSE_32u_U_H ++ struct dword_split { ++- int b00: 1; ++- int b01: 1; ++- int b02: 1; ++- int b03: 1; ++- int b04: 1; ++- int b05: 1; ++- int b06: 1; ++- int b07: 1; ++- int b08: 1; ++- int b09: 1; ++- int b10: 1; ++- int b11: 1; ++- int b12: 1; ++- int b13: 1; ++- int b14: 1; ++- int b15: 1; ++- int b16: 1; ++- int b17: 1; ++- int b18: 1; ++- int b19: 1; ++- int b20: 1; ++- int b21: 1; ++- int b22: 1; ++- int b23: 1; ++- int b24: 1; ++- int b25: 1; ++- int b26: 1; ++- int b27: 1; ++- int b28: 1; ++- int b29: 1; ++- int b30: 1; ++- int b31: 1; +++ int b00 : 1; +++ int b01 : 1; +++ int b02 : 1; +++ int b03 : 1; +++ int b04 : 1; +++ int b05 : 1; +++ int b06 : 1; +++ int b07 : 1; +++ int b08 : 1; +++ int b09 : 1; +++ int b10 : 1; +++ int b11 : 1; +++ int b12 : 1; +++ int b13 : 1; +++ int b14 : 1; +++ int b15 : 1; +++ int b16 : 1; +++ int b17 : 1; +++ int b18 : 1; +++ int b19 : 1; +++ int b20 : 1; +++ int b21 : 1; +++ int b22 : 1; +++ int b23 : 1; +++ int b24 : 1; +++ int b25 : 1; +++ int b26 : 1; +++ int b27 : 1; +++ int b28 : 1; +++ int b29 : 1; +++ int b30 : 1; +++ int b31 : 1; ++ }; ++ struct char_split { ++- uint8_t b00: 1; ++- uint8_t b01: 1; ++- uint8_t b02: 1; ++- uint8_t b03: 1; ++- uint8_t b04: 1; ++- uint8_t b05: 1; ++- uint8_t b06: 1; ++- uint8_t b07: 1; +++ uint8_t b00 : 1; +++ uint8_t b01 : 1; +++ uint8_t b02 : 1; +++ uint8_t b03 : 1; +++ uint8_t b04 : 1; +++ uint8_t b05 : 1; +++ uint8_t b06 : 1; +++ uint8_t b07 : 1; ++ }; ++ ++-//Idea from "Bit Twiddling Hacks", which dedicates this method to public domain ++-//http://graphics.stanford.edu/~seander/bithacks.html#BitReverseTable +++// Idea from "Bit Twiddling Hacks", which dedicates this method to public domain +++// http://graphics.stanford.edu/~seander/bithacks.html#BitReverseTable ++ static const unsigned char BitReverseTable256[] = { ++- 0x00, 0x80, 0x40, 0xC0, 0x20, 0xA0, 0x60, 0xE0, 0x10, 0x90, 0x50, 0xD0, 0x30, ++- 0xB0, 0x70, 0xF0, 0x08, 0x88, 0x48, 0xC8, 0x28, 0xA8, 0x68, 0xE8, 0x18, 0x98, ++- 0x58, 0xD8, 0x38, 0xB8, 0x78, 0xF8, 0x04, 0x84, 0x44, 0xC4, 0x24, 0xA4, 0x64, ++- 0xE4, 0x14, 0x94, 0x54, 0xD4, 0x34, 0xB4, 0x74, 0xF4, 0x0C, 0x8C, 0x4C, 0xCC, ++- 0x2C, 0xAC, 0x6C, 0xEC, 0x1C, 0x9C, 0x5C, 0xDC, 0x3C, 0xBC, 0x7C, 0xFC, 0x02, ++- 0x82, 0x42, 0xC2, 0x22, 0xA2, 0x62, 0xE2, 0x12, 0x92, 0x52, 0xD2, 0x32, 0xB2, ++- 0x72, 0xF2, 0x0A, 0x8A, 0x4A, 0xCA, 0x2A, 0xAA, 0x6A, 0xEA, 0x1A, 0x9A, 0x5A, ++- 0xDA, 0x3A, 0xBA, 0x7A, 0xFA, 0x06, 0x86, 0x46, 0xC6, 0x26, 0xA6, 0x66, 0xE6, ++- 0x16, 0x96, 0x56, 0xD6, 0x36, 0xB6, 0x76, 0xF6, 0x0E, 0x8E, 0x4E, 0xCE, 0x2E, ++- 0xAE, 0x6E, 0xEE, 0x1E, 0x9E, 0x5E, 0xDE, 0x3E, 0xBE, 0x7E, 0xFE, 0x01, 0x81, ++- 0x41, 0xC1, 0x21, 0xA1, 0x61, 0xE1, 0x11, 0x91, 0x51, 0xD1, 0x31, 0xB1, 0x71, ++- 0xF1, 0x09, 0x89, 0x49, 0xC9, 0x29, 0xA9, 0x69, 0xE9, 0x19, 0x99, 0x59, 0xD9, ++- 0x39, 0xB9, 0x79, 0xF9, 0x05, 0x85, 0x45, 0xC5, 0x25, 0xA5, 0x65, 0xE5, 0x15, ++- 0x95, 0x55, 0xD5, 0x35, 0xB5, 0x75, 0xF5, 0x0D, 0x8D, 0x4D, 0xCD, 0x2D, 0xAD, ++- 0x6D, 0xED, 0x1D, 0x9D, 0x5D, 0xDD, 0x3D, 0xBD, 0x7D, 0xFD, 0x03, 0x83, 0x43, ++- 0xC3, 0x23, 0xA3, 0x63, 0xE3, 0x13, 0x93, 0x53, 0xD3, 0x33, 0xB3, 0x73, 0xF3, ++- 0x0B, 0x8B, 0x4B, 0xCB, 0x2B, 0xAB, 0x6B, 0xEB, 0x1B, 0x9B, 0x5B, 0xDB, 0x3B, ++- 0xBB, 0x7B, 0xFB, 0x07, 0x87, 0x47, 0xC7, 
0x27, 0xA7, 0x67, 0xE7, 0x17, 0x97, ++- 0x57, 0xD7, 0x37, 0xB7, 0x77, 0xF7, 0x0F, 0x8F, 0x4F, 0xCF, 0x2F, 0xAF, 0x6F, ++- 0xEF, 0x1F, 0x9F, 0x5F, 0xDF, 0x3F, 0xBF, 0x7F, 0xFF +++ 0x00, 0x80, 0x40, 0xC0, 0x20, 0xA0, 0x60, 0xE0, 0x10, 0x90, 0x50, 0xD0, 0x30, 0xB0, +++ 0x70, 0xF0, 0x08, 0x88, 0x48, 0xC8, 0x28, 0xA8, 0x68, 0xE8, 0x18, 0x98, 0x58, 0xD8, +++ 0x38, 0xB8, 0x78, 0xF8, 0x04, 0x84, 0x44, 0xC4, 0x24, 0xA4, 0x64, 0xE4, 0x14, 0x94, +++ 0x54, 0xD4, 0x34, 0xB4, 0x74, 0xF4, 0x0C, 0x8C, 0x4C, 0xCC, 0x2C, 0xAC, 0x6C, 0xEC, +++ 0x1C, 0x9C, 0x5C, 0xDC, 0x3C, 0xBC, 0x7C, 0xFC, 0x02, 0x82, 0x42, 0xC2, 0x22, 0xA2, +++ 0x62, 0xE2, 0x12, 0x92, 0x52, 0xD2, 0x32, 0xB2, 0x72, 0xF2, 0x0A, 0x8A, 0x4A, 0xCA, +++ 0x2A, 0xAA, 0x6A, 0xEA, 0x1A, 0x9A, 0x5A, 0xDA, 0x3A, 0xBA, 0x7A, 0xFA, 0x06, 0x86, +++ 0x46, 0xC6, 0x26, 0xA6, 0x66, 0xE6, 0x16, 0x96, 0x56, 0xD6, 0x36, 0xB6, 0x76, 0xF6, +++ 0x0E, 0x8E, 0x4E, 0xCE, 0x2E, 0xAE, 0x6E, 0xEE, 0x1E, 0x9E, 0x5E, 0xDE, 0x3E, 0xBE, +++ 0x7E, 0xFE, 0x01, 0x81, 0x41, 0xC1, 0x21, 0xA1, 0x61, 0xE1, 0x11, 0x91, 0x51, 0xD1, +++ 0x31, 0xB1, 0x71, 0xF1, 0x09, 0x89, 0x49, 0xC9, 0x29, 0xA9, 0x69, 0xE9, 0x19, 0x99, +++ 0x59, 0xD9, 0x39, 0xB9, 0x79, 0xF9, 0x05, 0x85, 0x45, 0xC5, 0x25, 0xA5, 0x65, 0xE5, +++ 0x15, 0x95, 0x55, 0xD5, 0x35, 0xB5, 0x75, 0xF5, 0x0D, 0x8D, 0x4D, 0xCD, 0x2D, 0xAD, +++ 0x6D, 0xED, 0x1D, 0x9D, 0x5D, 0xDD, 0x3D, 0xBD, 0x7D, 0xFD, 0x03, 0x83, 0x43, 0xC3, +++ 0x23, 0xA3, 0x63, 0xE3, 0x13, 0x93, 0x53, 0xD3, 0x33, 0xB3, 0x73, 0xF3, 0x0B, 0x8B, +++ 0x4B, 0xCB, 0x2B, 0xAB, 0x6B, 0xEB, 0x1B, 0x9B, 0x5B, 0xDB, 0x3B, 0xBB, 0x7B, 0xFB, +++ 0x07, 0x87, 0x47, 0xC7, 0x27, 0xA7, 0x67, 0xE7, 0x17, 0x97, 0x57, 0xD7, 0x37, 0xB7, +++ 0x77, 0xF7, 0x0F, 0x8F, 0x4F, 0xCF, 0x2F, 0xAF, 0x6F, 0xEF, 0x1F, 0x9F, 0x5F, 0xDF, +++ 0x3F, 0xBF, 0x7F, 0xFF ++ }; ++ #ifdef LV_HAVE_GENERIC ++-static inline void volk_32u_reverse_32u_dword_shuffle(uint32_t* out, const uint32_t* in, ++- unsigned int num_points) +++static inline void volk_32u_reverse_32u_dword_shuffle(uint32_t* out, +++ const uint32_t* in, +++ unsigned int num_points) ++ { ++- const struct dword_split *in_ptr = (const struct dword_split*)in; ++- struct dword_split * out_ptr = (struct dword_split*)out; ++- unsigned int number = 0; ++- for(; number < num_points; ++number){ ++- out_ptr->b00 = in_ptr->b31; ++- out_ptr->b01 = in_ptr->b30; ++- out_ptr->b02 = in_ptr->b29; ++- out_ptr->b03 = in_ptr->b28; ++- out_ptr->b04 = in_ptr->b27; ++- out_ptr->b05 = in_ptr->b26; ++- out_ptr->b06 = in_ptr->b25; ++- out_ptr->b07 = in_ptr->b24; ++- out_ptr->b08 = in_ptr->b23; ++- out_ptr->b09 = in_ptr->b22; ++- out_ptr->b10 = in_ptr->b21; ++- out_ptr->b11 = in_ptr->b20; ++- out_ptr->b12 = in_ptr->b19; ++- out_ptr->b13 = in_ptr->b18; ++- out_ptr->b14 = in_ptr->b17; ++- out_ptr->b15 = in_ptr->b16; ++- out_ptr->b16 = in_ptr->b15; ++- out_ptr->b17 = in_ptr->b14; ++- out_ptr->b18 = in_ptr->b13; ++- out_ptr->b19 = in_ptr->b12; ++- out_ptr->b20 = in_ptr->b11; ++- out_ptr->b21 = in_ptr->b10; ++- out_ptr->b22 = in_ptr->b09; ++- out_ptr->b23 = in_ptr->b08; ++- out_ptr->b24 = in_ptr->b07; ++- out_ptr->b25 = in_ptr->b06; ++- out_ptr->b26 = in_ptr->b05; ++- out_ptr->b27 = in_ptr->b04; ++- out_ptr->b28 = in_ptr->b03; ++- out_ptr->b29 = in_ptr->b02; ++- out_ptr->b30 = in_ptr->b01; ++- out_ptr->b31 = in_ptr->b00; ++- ++in_ptr; ++- ++out_ptr; ++- } +++ const struct dword_split* in_ptr = (const struct dword_split*)in; +++ struct dword_split* out_ptr = (struct dword_split*)out; +++ unsigned int number = 0; +++ for (; number < num_points; ++number) { +++ out_ptr->b00 = 
in_ptr->b31; +++ out_ptr->b01 = in_ptr->b30; +++ out_ptr->b02 = in_ptr->b29; +++ out_ptr->b03 = in_ptr->b28; +++ out_ptr->b04 = in_ptr->b27; +++ out_ptr->b05 = in_ptr->b26; +++ out_ptr->b06 = in_ptr->b25; +++ out_ptr->b07 = in_ptr->b24; +++ out_ptr->b08 = in_ptr->b23; +++ out_ptr->b09 = in_ptr->b22; +++ out_ptr->b10 = in_ptr->b21; +++ out_ptr->b11 = in_ptr->b20; +++ out_ptr->b12 = in_ptr->b19; +++ out_ptr->b13 = in_ptr->b18; +++ out_ptr->b14 = in_ptr->b17; +++ out_ptr->b15 = in_ptr->b16; +++ out_ptr->b16 = in_ptr->b15; +++ out_ptr->b17 = in_ptr->b14; +++ out_ptr->b18 = in_ptr->b13; +++ out_ptr->b19 = in_ptr->b12; +++ out_ptr->b20 = in_ptr->b11; +++ out_ptr->b21 = in_ptr->b10; +++ out_ptr->b22 = in_ptr->b09; +++ out_ptr->b23 = in_ptr->b08; +++ out_ptr->b24 = in_ptr->b07; +++ out_ptr->b25 = in_ptr->b06; +++ out_ptr->b26 = in_ptr->b05; +++ out_ptr->b27 = in_ptr->b04; +++ out_ptr->b28 = in_ptr->b03; +++ out_ptr->b29 = in_ptr->b02; +++ out_ptr->b30 = in_ptr->b01; +++ out_ptr->b31 = in_ptr->b00; +++ ++in_ptr; +++ ++out_ptr; +++ } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++ #ifdef LV_HAVE_GENERIC ++-static inline void volk_32u_reverse_32u_byte_shuffle(uint32_t* out, const uint32_t* in, ++- unsigned int num_points) +++static inline void volk_32u_reverse_32u_byte_shuffle(uint32_t* out, +++ const uint32_t* in, +++ unsigned int num_points) ++ { ++- const uint32_t *in_ptr = in; ++- uint32_t *out_ptr = out; ++- unsigned int number = 0; ++- for(; number < num_points; ++number){ ++- const struct char_split *in8 = (const struct char_split*)in_ptr; ++- struct char_split *out8 = (struct char_split*)out_ptr; +++ const uint32_t* in_ptr = in; +++ uint32_t* out_ptr = out; +++ unsigned int number = 0; +++ for (; number < num_points; ++number) { +++ const struct char_split* in8 = (const struct char_split*)in_ptr; +++ struct char_split* out8 = (struct char_split*)out_ptr; ++ ++- out8[3].b00 = in8[0].b07; ++- out8[3].b01 = in8[0].b06; ++- out8[3].b02 = in8[0].b05; ++- out8[3].b03 = in8[0].b04; ++- out8[3].b04 = in8[0].b03; ++- out8[3].b05 = in8[0].b02; ++- out8[3].b06 = in8[0].b01; ++- out8[3].b07 = in8[0].b00; +++ out8[3].b00 = in8[0].b07; +++ out8[3].b01 = in8[0].b06; +++ out8[3].b02 = in8[0].b05; +++ out8[3].b03 = in8[0].b04; +++ out8[3].b04 = in8[0].b03; +++ out8[3].b05 = in8[0].b02; +++ out8[3].b06 = in8[0].b01; +++ out8[3].b07 = in8[0].b00; ++ ++- out8[2].b00 = in8[1].b07; ++- out8[2].b01 = in8[1].b06; ++- out8[2].b02 = in8[1].b05; ++- out8[2].b03 = in8[1].b04; ++- out8[2].b04 = in8[1].b03; ++- out8[2].b05 = in8[1].b02; ++- out8[2].b06 = in8[1].b01; ++- out8[2].b07 = in8[1].b00; +++ out8[2].b00 = in8[1].b07; +++ out8[2].b01 = in8[1].b06; +++ out8[2].b02 = in8[1].b05; +++ out8[2].b03 = in8[1].b04; +++ out8[2].b04 = in8[1].b03; +++ out8[2].b05 = in8[1].b02; +++ out8[2].b06 = in8[1].b01; +++ out8[2].b07 = in8[1].b00; ++ ++- out8[1].b00 = in8[2].b07; ++- out8[1].b01 = in8[2].b06; ++- out8[1].b02 = in8[2].b05; ++- out8[1].b03 = in8[2].b04; ++- out8[1].b04 = in8[2].b03; ++- out8[1].b05 = in8[2].b02; ++- out8[1].b06 = in8[2].b01; ++- out8[1].b07 = in8[2].b00; +++ out8[1].b00 = in8[2].b07; +++ out8[1].b01 = in8[2].b06; +++ out8[1].b02 = in8[2].b05; +++ out8[1].b03 = in8[2].b04; +++ out8[1].b04 = in8[2].b03; +++ out8[1].b05 = in8[2].b02; +++ out8[1].b06 = in8[2].b01; +++ out8[1].b07 = in8[2].b00; ++ ++- out8[0].b00 = in8[3].b07; ++- out8[0].b01 = in8[3].b06; ++- out8[0].b02 = in8[3].b05; ++- out8[0].b03 = in8[3].b04; ++- out8[0].b04 = in8[3].b03; ++- out8[0].b05 = in8[3].b02; ++- out8[0].b06 = in8[3].b01; ++- 
out8[0].b07 = in8[3].b00; ++- ++in_ptr; ++- ++out_ptr; ++- } +++ out8[0].b00 = in8[3].b07; +++ out8[0].b01 = in8[3].b06; +++ out8[0].b02 = in8[3].b05; +++ out8[0].b03 = in8[3].b04; +++ out8[0].b04 = in8[3].b03; +++ out8[0].b05 = in8[3].b02; +++ out8[0].b06 = in8[3].b01; +++ out8[0].b07 = in8[3].b00; +++ ++in_ptr; +++ ++out_ptr; +++ } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++-//Idea from "Bit Twiddling Hacks", which dedicates this method to public domain ++-//http://graphics.stanford.edu/~seander/bithacks.html#BitReverseTable +++// Idea from "Bit Twiddling Hacks", which dedicates this method to public domain +++// http://graphics.stanford.edu/~seander/bithacks.html#BitReverseTable ++ #ifdef LV_HAVE_GENERIC ++-static inline void volk_32u_reverse_32u_lut(uint32_t* out, const uint32_t* in, ++- unsigned int num_points) +++static inline void +++volk_32u_reverse_32u_lut(uint32_t* out, const uint32_t* in, unsigned int num_points) ++ { ++- const uint32_t *in_ptr = in; ++- uint32_t *out_ptr = out; ++- unsigned int number = 0; ++- for(; number < num_points; ++number){ ++- *out_ptr = ++- (BitReverseTable256[*in_ptr & 0xff] << 24) | ++- (BitReverseTable256[(*in_ptr >> 8) & 0xff] << 16) | ++- (BitReverseTable256[(*in_ptr >> 16) & 0xff] << 8) | ++- (BitReverseTable256[(*in_ptr >> 24) & 0xff]); ++- ++in_ptr; ++- ++out_ptr; ++- } +++ const uint32_t* in_ptr = in; +++ uint32_t* out_ptr = out; +++ unsigned int number = 0; +++ for (; number < num_points; ++number) { +++ *out_ptr = (BitReverseTable256[*in_ptr & 0xff] << 24) | +++ (BitReverseTable256[(*in_ptr >> 8) & 0xff] << 16) | +++ (BitReverseTable256[(*in_ptr >> 16) & 0xff] << 8) | +++ (BitReverseTable256[(*in_ptr >> 24) & 0xff]); +++ ++in_ptr; +++ ++out_ptr; +++ } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++-//Single-Byte code from "Bit Twiddling Hacks", which dedicates this method to public domain ++-//http://graphics.stanford.edu/~seander/bithacks.html#ReverseByteWith64Bits +++// Single-Byte code from "Bit Twiddling Hacks", which dedicates this method to public +++// domain http://graphics.stanford.edu/~seander/bithacks.html#ReverseByteWith64Bits ++ #ifdef LV_HAVE_GENERIC ++-static inline void volk_32u_reverse_32u_2001magic(uint32_t* out, const uint32_t* in, ++- unsigned int num_points) +++static inline void +++volk_32u_reverse_32u_2001magic(uint32_t* out, const uint32_t* in, unsigned int num_points) ++ { ++- const uint32_t *in_ptr = in; ++- uint32_t *out_ptr = out; ++- const uint8_t *in8; ++- uint8_t *out8; ++- unsigned int number = 0; ++- for(; number < num_points; ++number){ ++- in8 = (const uint8_t*)in_ptr; ++- out8 = (uint8_t*)out_ptr; ++- out8[3] = ((in8[0] * 0x80200802ULL) & 0x0884422110ULL) * 0x0101010101ULL >> 32; ++- out8[2] = ((in8[1] * 0x80200802ULL) & 0x0884422110ULL) * 0x0101010101ULL >> 32; ++- out8[1] = ((in8[2] * 0x80200802ULL) & 0x0884422110ULL) * 0x0101010101ULL >> 32; ++- out8[0] = ((in8[3] * 0x80200802ULL) & 0x0884422110ULL) * 0x0101010101ULL >> 32; ++- ++in_ptr; ++- ++out_ptr; ++- } +++ const uint32_t* in_ptr = in; +++ uint32_t* out_ptr = out; +++ const uint8_t* in8; +++ uint8_t* out8; +++ unsigned int number = 0; +++ for (; number < num_points; ++number) { +++ in8 = (const uint8_t*)in_ptr; +++ out8 = (uint8_t*)out_ptr; +++ out8[3] = ((in8[0] * 0x80200802ULL) & 0x0884422110ULL) * 0x0101010101ULL >> 32; +++ out8[2] = ((in8[1] * 0x80200802ULL) & 0x0884422110ULL) * 0x0101010101ULL >> 32; +++ out8[1] = ((in8[2] * 0x80200802ULL) & 0x0884422110ULL) * 0x0101010101ULL >> 32; +++ out8[0] = ((in8[3] * 0x80200802ULL) & 0x0884422110ULL) * 
0x0101010101ULL >> 32; +++ ++in_ptr; +++ ++out_ptr; +++ } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++ #ifdef LV_HAVE_GENERIC ++ // Current gr-pager implementation ++-static inline void volk_32u_reverse_32u_1972magic(uint32_t* out, const uint32_t* in, ++- unsigned int num_points) +++static inline void +++volk_32u_reverse_32u_1972magic(uint32_t* out, const uint32_t* in, unsigned int num_points) ++ { ++- const uint32_t *in_ptr = in; ++- uint32_t *out_ptr = out; ++- const uint8_t *in8; ++- uint8_t *out8; ++- unsigned int number = 0; ++- for(; number < num_points; ++number){ ++- in8 = (const uint8_t*)in_ptr; ++- out8 = (uint8_t*)out_ptr; ++- out8[3] = (in8[0] * 0x0202020202ULL & 0x010884422010ULL) % 1023; ++- out8[2] = (in8[1] * 0x0202020202ULL & 0x010884422010ULL) % 1023; ++- out8[1] = (in8[2] * 0x0202020202ULL & 0x010884422010ULL) % 1023; ++- out8[0] = (in8[3] * 0x0202020202ULL & 0x010884422010ULL) % 1023; ++- ++in_ptr; ++- ++out_ptr; ++- } +++ const uint32_t* in_ptr = in; +++ uint32_t* out_ptr = out; +++ const uint8_t* in8; +++ uint8_t* out8; +++ unsigned int number = 0; +++ for (; number < num_points; ++number) { +++ in8 = (const uint8_t*)in_ptr; +++ out8 = (uint8_t*)out_ptr; +++ out8[3] = (in8[0] * 0x0202020202ULL & 0x010884422010ULL) % 1023; +++ out8[2] = (in8[1] * 0x0202020202ULL & 0x010884422010ULL) % 1023; +++ out8[1] = (in8[2] * 0x0202020202ULL & 0x010884422010ULL) % 1023; +++ out8[0] = (in8[3] * 0x0202020202ULL & 0x010884422010ULL) % 1023; +++ ++in_ptr; +++ ++out_ptr; +++ } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++-//After lengthy thought and quite a bit of whiteboarding: +++// After lengthy thought and quite a bit of whiteboarding: ++ #ifdef LV_HAVE_GENERIC ++-static inline void volk_32u_reverse_32u_bintree_permute_top_down(uint32_t* out, const uint32_t* in, ++- unsigned int num_points) +++static inline void volk_32u_reverse_32u_bintree_permute_top_down(uint32_t* out, +++ const uint32_t* in, +++ unsigned int num_points) ++ { ++- const uint32_t *in_ptr = in; ++- uint32_t *out_ptr = out; ++- unsigned int number = 0; ++- for(; number < num_points; ++number){ ++- uint32_t tmp = *in_ptr; ++- /* permute uint16: ++- The idea is to simply shift the lower 16 bit up, and the upper 16 bit down. ++- */ ++- tmp = ( tmp << 16 ) | ( tmp >> 16 ); ++- /* permute bytes: ++- shift up by 1 B first, then only consider even bytes, and OR with the unshifted even bytes ++- */ ++- tmp = ((tmp & (0xFF | 0xFF << 16)) << 8) | ((tmp >> 8) & (0xFF | 0xFF << 16)); ++- /* permute 4bit tuples: ++- Same idea, but the "consideration" mask expression becomes unwieldy ++- */ ++- tmp = ((tmp & (0xF | 0xF << 8 | 0xF << 16 | 0xF << 24)) << 4) | ((tmp >> 4) & (0xF | 0xF << 8 | 0xF << 16 | 0xF << 24)); ++- /* permute 2bit tuples: ++- Here, we collapsed the "consideration" mask to a simple hexmask: 0b0011 = ++- 3; we need those every 4b, which coincides with a hex digit! ++- */ ++- tmp = ((tmp & (0x33333333)) << 2) | ((tmp >> 2) & (0x33333333)); ++- /* permute odd/even: ++- 0x01 = 0x1; we need these every 2b, which works out: 0x01 | (0x01 << 2) = 0x05! ++- */ ++- tmp = ((tmp & (0x55555555)) << 1) | ((tmp >> 1) & (0x55555555)); +++ const uint32_t* in_ptr = in; +++ uint32_t* out_ptr = out; +++ unsigned int number = 0; +++ for (; number < num_points; ++number) { +++ uint32_t tmp = *in_ptr; +++ /* permute uint16: +++ The idea is to simply shift the lower 16 bit up, and the upper 16 bit down. 
+++ */ +++ tmp = (tmp << 16) | (tmp >> 16); +++ /* permute bytes: +++ shift up by 1 B first, then only consider even bytes, and OR with the unshifted +++ even bytes +++ */ +++ tmp = ((tmp & (0xFF | 0xFF << 16)) << 8) | ((tmp >> 8) & (0xFF | 0xFF << 16)); +++ /* permute 4bit tuples: +++ Same idea, but the "consideration" mask expression becomes unwieldy +++ */ +++ tmp = ((tmp & (0xF | 0xF << 8 | 0xF << 16 | 0xF << 24)) << 4) | +++ ((tmp >> 4) & (0xF | 0xF << 8 | 0xF << 16 | 0xF << 24)); +++ /* permute 2bit tuples: +++ Here, we collapsed the "consideration" mask to a simple hexmask: 0b0011 = +++ 3; we need those every 4b, which coincides with a hex digit! +++ */ +++ tmp = ((tmp & (0x33333333)) << 2) | ((tmp >> 2) & (0x33333333)); +++ /* permute odd/even: +++ 0x01 = 0x1; we need these every 2b, which works out: 0x01 | (0x01 << 2) = +++ 0x05! +++ */ +++ tmp = ((tmp & (0x55555555)) << 1) | ((tmp >> 1) & (0x55555555)); ++ ++- *out_ptr = tmp; ++- ++in_ptr; ++- ++out_ptr; ++- } +++ *out_ptr = tmp; +++ ++in_ptr; +++ ++out_ptr; +++ } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ #ifdef LV_HAVE_GENERIC ++-static inline void volk_32u_reverse_32u_bintree_permute_bottom_up(uint32_t* out, const uint32_t* in, ++- unsigned int num_points) +++static inline void volk_32u_reverse_32u_bintree_permute_bottom_up(uint32_t* out, +++ const uint32_t* in, +++ unsigned int num_points) ++ { ++- //same stuff as top_down, inverted order (permutation matrices don't care, you know!) ++- const uint32_t *in_ptr = in; ++- uint32_t *out_ptr = out; ++- unsigned int number = 0; ++- for(; number < num_points; ++number){ ++- uint32_t tmp = *in_ptr; ++- tmp = ((tmp & (0x55555555)) << 1) | ((tmp >> 1) & (0x55555555)); ++- tmp = ((tmp & (0x33333333)) << 2) | ((tmp >> 2) & (0x33333333)); ++- tmp = ((tmp & (0xF | 0xF << 8 | 0xF << 16 | 0xF << 24)) << 4) | ((tmp >> 4) & (0xF | 0xF << 8 | 0xF << 16 | 0xF << 24)); ++- tmp = ((tmp & (0xFF | 0xFF << 16)) << 8) | ((tmp >> 8) & (0xFF | 0xFF << 16)); ++- tmp = ( tmp << 16 ) | ( tmp >> 16 ); +++ // same stuff as top_down, inverted order (permutation matrices don't care, you know!) 
+++ const uint32_t* in_ptr = in; +++ uint32_t* out_ptr = out; +++ unsigned int number = 0; +++ for (; number < num_points; ++number) { +++ uint32_t tmp = *in_ptr; +++ tmp = ((tmp & (0x55555555)) << 1) | ((tmp >> 1) & (0x55555555)); +++ tmp = ((tmp & (0x33333333)) << 2) | ((tmp >> 2) & (0x33333333)); +++ tmp = ((tmp & (0xF | 0xF << 8 | 0xF << 16 | 0xF << 24)) << 4) | +++ ((tmp >> 4) & (0xF | 0xF << 8 | 0xF << 16 | 0xF << 24)); +++ tmp = ((tmp & (0xFF | 0xFF << 16)) << 8) | ((tmp >> 8) & (0xFF | 0xFF << 16)); +++ tmp = (tmp << 16) | (tmp >> 16); ++ ++- *out_ptr = tmp; ++- ++in_ptr; ++- ++out_ptr; ++- } +++ *out_ptr = tmp; +++ ++in_ptr; +++ ++out_ptr; +++ } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++ #ifdef LV_HAVE_NEONV8 ++ #include ++ ++-static inline void volk_32u_reverse_32u_neonv8(uint32_t* out, const uint32_t* in, ++- unsigned int num_points) ++-{ ++- const uint32_t *in_ptr = in; ++- uint32_t *out_ptr = out; +++static inline void +++volk_32u_reverse_32u_neonv8(uint32_t* out, const uint32_t* in, unsigned int num_points) +++{ +++ const uint32_t* in_ptr = in; +++ uint32_t* out_ptr = out; ++ ++- const uint8x16_t idx = { 3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12 }; +++ const uint8x16_t idx = { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 }; ++ ++- const unsigned int quarterPoints = num_points/4; +++ const unsigned int quarterPoints = num_points / 4; ++ unsigned int number = 0; ++- for(; number < quarterPoints; ++number){ ++- __VOLK_PREFETCH(in_ptr+4); ++- uint32x4_t x = vld1q_u32(in_ptr); ++- uint32x4_t z = vreinterpretq_u32_u8(vqtbl1q_u8(vrbitq_u8(vreinterpretq_u8_u32 (x)), ++- idx)); ++- vst1q_u32 (out_ptr, z); ++- in_ptr += 4; ++- out_ptr += 4; +++ for (; number < quarterPoints; ++number) { +++ __VOLK_PREFETCH(in_ptr + 4); +++ uint32x4_t x = vld1q_u32(in_ptr); +++ uint32x4_t z = +++ vreinterpretq_u32_u8(vqtbl1q_u8(vrbitq_u8(vreinterpretq_u8_u32(x)), idx)); +++ vst1q_u32(out_ptr, z); +++ in_ptr += 4; +++ out_ptr += 4; ++ } ++- number = quarterPoints*4; ++- for(; number < num_points; ++number){ ++- *out_ptr = ++- (BitReverseTable256[*in_ptr & 0xff] << 24) | ++- (BitReverseTable256[(*in_ptr >> 8) & 0xff] << 16) | ++- (BitReverseTable256[(*in_ptr >> 16) & 0xff] << 8) | ++- (BitReverseTable256[(*in_ptr >> 24) & 0xff]); ++- ++in_ptr; ++- ++out_ptr; +++ number = quarterPoints * 4; +++ for (; number < num_points; ++number) { +++ *out_ptr = (BitReverseTable256[*in_ptr & 0xff] << 24) | +++ (BitReverseTable256[(*in_ptr >> 8) & 0xff] << 16) | +++ (BitReverseTable256[(*in_ptr >> 16) & 0xff] << 8) | +++ (BitReverseTable256[(*in_ptr >> 24) & 0xff]); +++ ++in_ptr; +++ ++out_ptr; ++ } ++ } ++ ++@@ -371,29 +378,35 @@ static inline void volk_32u_reverse_32u_neonv8(uint32_t* out, const uint32_t* in ++ #ifdef LV_HAVE_NEON ++ #include ++ ++-#define DO_RBIT \ ++- __VOLK_ASM("rbit %[result], %[value]" \ ++- : [result]"=r" (*out_ptr) \ ++- : [value] "r" (*in_ptr) \ ++- : ); \ ++- in_ptr++; \ ++- out_ptr++; +++#define DO_RBIT \ +++ __VOLK_ASM("rbit %[result], %[value]" \ +++ : [result] "=r"(*out_ptr) \ +++ : [value] "r"(*in_ptr) \ +++ :); \ +++ in_ptr++; \ +++ out_ptr++; ++ ++-static inline void volk_32u_reverse_32u_arm(uint32_t* out, const uint32_t* in, ++- unsigned int num_points) +++static inline void +++volk_32u_reverse_32u_arm(uint32_t* out, const uint32_t* in, unsigned int num_points) ++ { ++ ++- const uint32_t *in_ptr = in; ++- uint32_t *out_ptr = out; ++- const unsigned int eighthPoints = num_points/8; +++ const uint32_t* in_ptr = in; +++ uint32_t* out_ptr = out; +++ const unsigned int eighthPoints = 
num_points / 8; ++ unsigned int number = 0; ++- for(; number < eighthPoints; ++number){ ++- __VOLK_PREFETCH(in_ptr+8); ++- DO_RBIT; DO_RBIT; DO_RBIT; DO_RBIT; ++- DO_RBIT; DO_RBIT; DO_RBIT; DO_RBIT; +++ for (; number < eighthPoints; ++number) { +++ __VOLK_PREFETCH(in_ptr + 8); +++ DO_RBIT; +++ DO_RBIT; +++ DO_RBIT; +++ DO_RBIT; +++ DO_RBIT; +++ DO_RBIT; +++ DO_RBIT; +++ DO_RBIT; ++ } ++- number = eighthPoints*8; ++- for(; number < num_points; ++number){ +++ number = eighthPoints * 8; +++ for (; number < num_points; ++number) { ++ DO_RBIT; ++ } ++ } ++@@ -403,4 +416,3 @@ static inline void volk_32u_reverse_32u_arm(uint32_t* out, const uint32_t* in, ++ ++ ++ #endif /* INCLUDED_volk_32u_reverse_32u_u_H */ ++- ++diff --git a/kernels/volk/volk_64f_convert_32f.h b/kernels/volk/volk_64f_convert_32f.h ++index 20422cf..4ebccc0 100644 ++--- a/kernels/volk/volk_64f_convert_32f.h +++++ b/kernels/volk/volk_64f_convert_32f.h ++@@ -29,8 +29,8 @@ ++ * ++ * Dispatcher Prototype ++ * \code ++- * void volk_64f_convert_32f(float* outputVector, const double* inputVector, unsigned int num_points) ++- * \endcode +++ * void volk_64f_convert_32f(float* outputVector, const double* inputVector, unsigned int +++ * num_points) \endcode ++ * ++ * \b Inputs ++ * \li inputVector: The vector of doubles to convert to floats. ++@@ -70,34 +70,39 @@ ++ #ifdef LV_HAVE_AVX512F ++ #include ++ ++-static inline void volk_64f_convert_32f_u_avx512f(float* outputVector, const double* inputVector, unsigned int num_points){ ++- unsigned int number = 0; +++static inline void volk_64f_convert_32f_u_avx512f(float* outputVector, +++ const double* inputVector, +++ unsigned int num_points) +++{ +++ unsigned int number = 0; ++ ++- const unsigned int oneSixteenthPoints = num_points / 16; +++ const unsigned int oneSixteenthPoints = num_points / 16; ++ ++- const double* inputVectorPtr = (const double*)inputVector; ++- float* outputVectorPtr = outputVector; ++- __m256 ret1, ret2; ++- __m512d inputVal1, inputVal2; +++ const double* inputVectorPtr = (const double*)inputVector; +++ float* outputVectorPtr = outputVector; +++ __m256 ret1, ret2; +++ __m512d inputVal1, inputVal2; ++ ++- for(;number < oneSixteenthPoints; number++){ ++- inputVal1 = _mm512_loadu_pd(inputVectorPtr); inputVectorPtr += 8; ++- inputVal2 = _mm512_loadu_pd(inputVectorPtr); inputVectorPtr += 8; +++ for (; number < oneSixteenthPoints; number++) { +++ inputVal1 = _mm512_loadu_pd(inputVectorPtr); +++ inputVectorPtr += 8; +++ inputVal2 = _mm512_loadu_pd(inputVectorPtr); +++ inputVectorPtr += 8; ++ ++- ret1 = _mm512_cvtpd_ps(inputVal1); ++- ret2 = _mm512_cvtpd_ps(inputVal2); +++ ret1 = _mm512_cvtpd_ps(inputVal1); +++ ret2 = _mm512_cvtpd_ps(inputVal2); ++ ++- _mm256_storeu_ps(outputVectorPtr, ret1); ++- outputVectorPtr += 8; +++ _mm256_storeu_ps(outputVectorPtr, ret1); +++ outputVectorPtr += 8; ++ ++- _mm256_storeu_ps(outputVectorPtr, ret2); ++- outputVectorPtr += 8; ++- } +++ _mm256_storeu_ps(outputVectorPtr, ret2); +++ outputVectorPtr += 8; +++ } ++ ++- number = oneSixteenthPoints * 16; ++- for(; number < num_points; number++){ ++- outputVector[number] = (float)(inputVector[number]); ++- } +++ number = oneSixteenthPoints * 16; +++ for (; number < num_points; number++) { +++ outputVector[number] = (float)(inputVector[number]); +++ } ++ } ++ #endif /* LV_HAVE_AVX512F */ ++ ++@@ -105,34 +110,39 @@ static inline void volk_64f_convert_32f_u_avx512f(float* outputVector, const dou ++ #ifdef LV_HAVE_AVX ++ #include ++ ++-static inline void volk_64f_convert_32f_u_avx(float* outputVector, 
const double* inputVector, unsigned int num_points){ ++- unsigned int number = 0; +++static inline void volk_64f_convert_32f_u_avx(float* outputVector, +++ const double* inputVector, +++ unsigned int num_points) +++{ +++ unsigned int number = 0; ++ ++- const unsigned int oneEightPoints = num_points / 8; +++ const unsigned int oneEightPoints = num_points / 8; ++ ++- const double* inputVectorPtr = (const double*)inputVector; ++- float* outputVectorPtr = outputVector; ++- __m128 ret1, ret2; ++- __m256d inputVal1, inputVal2; +++ const double* inputVectorPtr = (const double*)inputVector; +++ float* outputVectorPtr = outputVector; +++ __m128 ret1, ret2; +++ __m256d inputVal1, inputVal2; ++ ++- for(;number < oneEightPoints; number++){ ++- inputVal1 = _mm256_loadu_pd(inputVectorPtr); inputVectorPtr += 4; ++- inputVal2 = _mm256_loadu_pd(inputVectorPtr); inputVectorPtr += 4; +++ for (; number < oneEightPoints; number++) { +++ inputVal1 = _mm256_loadu_pd(inputVectorPtr); +++ inputVectorPtr += 4; +++ inputVal2 = _mm256_loadu_pd(inputVectorPtr); +++ inputVectorPtr += 4; ++ ++- ret1 = _mm256_cvtpd_ps(inputVal1); ++- ret2 = _mm256_cvtpd_ps(inputVal2); +++ ret1 = _mm256_cvtpd_ps(inputVal1); +++ ret2 = _mm256_cvtpd_ps(inputVal2); ++ ++- _mm_storeu_ps(outputVectorPtr, ret1); ++- outputVectorPtr += 4; +++ _mm_storeu_ps(outputVectorPtr, ret1); +++ outputVectorPtr += 4; ++ ++- _mm_storeu_ps(outputVectorPtr, ret2); ++- outputVectorPtr += 4; ++- } +++ _mm_storeu_ps(outputVectorPtr, ret2); +++ outputVectorPtr += 4; +++ } ++ ++- number = oneEightPoints * 8; ++- for(; number < num_points; number++){ ++- outputVector[number] = (float)(inputVector[number]); ++- } +++ number = oneEightPoints * 8; +++ for (; number < num_points; number++) { +++ outputVector[number] = (float)(inputVector[number]); +++ } ++ } ++ #endif /* LV_HAVE_AVX */ ++ ++@@ -140,53 +150,59 @@ static inline void volk_64f_convert_32f_u_avx(float* outputVector, const double* ++ #ifdef LV_HAVE_SSE2 ++ #include ++ ++-static inline void volk_64f_convert_32f_u_sse2(float* outputVector, const double* inputVector, unsigned int num_points){ ++- unsigned int number = 0; +++static inline void volk_64f_convert_32f_u_sse2(float* outputVector, +++ const double* inputVector, +++ unsigned int num_points) +++{ +++ unsigned int number = 0; ++ ++- const unsigned int quarterPoints = num_points / 4; +++ const unsigned int quarterPoints = num_points / 4; ++ ++- const double* inputVectorPtr = (const double*)inputVector; ++- float* outputVectorPtr = outputVector; ++- __m128 ret, ret2; ++- __m128d inputVal1, inputVal2; +++ const double* inputVectorPtr = (const double*)inputVector; +++ float* outputVectorPtr = outputVector; +++ __m128 ret, ret2; +++ __m128d inputVal1, inputVal2; ++ ++- for(;number < quarterPoints; number++){ ++- inputVal1 = _mm_loadu_pd(inputVectorPtr); inputVectorPtr += 2; ++- inputVal2 = _mm_loadu_pd(inputVectorPtr); inputVectorPtr += 2; +++ for (; number < quarterPoints; number++) { +++ inputVal1 = _mm_loadu_pd(inputVectorPtr); +++ inputVectorPtr += 2; +++ inputVal2 = _mm_loadu_pd(inputVectorPtr); +++ inputVectorPtr += 2; ++ ++- ret = _mm_cvtpd_ps(inputVal1); ++- ret2 = _mm_cvtpd_ps(inputVal2); +++ ret = _mm_cvtpd_ps(inputVal1); +++ ret2 = _mm_cvtpd_ps(inputVal2); ++ ++- ret = _mm_movelh_ps(ret, ret2); +++ ret = _mm_movelh_ps(ret, ret2); ++ ++- _mm_storeu_ps(outputVectorPtr, ret); ++- outputVectorPtr += 4; ++- } +++ _mm_storeu_ps(outputVectorPtr, ret); +++ outputVectorPtr += 4; +++ } ++ ++- number = quarterPoints * 4; ++- for(; number < num_points; 
number++){ ++- outputVector[number] = (float)(inputVector[number]); ++- } +++ number = quarterPoints * 4; +++ for (; number < num_points; number++) { +++ outputVector[number] = (float)(inputVector[number]); +++ } ++ } ++ #endif /* LV_HAVE_SSE2 */ ++ ++ ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void volk_64f_convert_32f_generic(float* outputVector, const double* inputVector, unsigned int num_points){ ++- float* outputVectorPtr = outputVector; ++- const double* inputVectorPtr = inputVector; ++- unsigned int number = 0; ++- ++- for(number = 0; number < num_points; number++){ ++- *outputVectorPtr++ = ((float)(*inputVectorPtr++)); ++- } +++static inline void volk_64f_convert_32f_generic(float* outputVector, +++ const double* inputVector, +++ unsigned int num_points) +++{ +++ float* outputVectorPtr = outputVector; +++ const double* inputVectorPtr = inputVector; +++ unsigned int number = 0; +++ +++ for (number = 0; number < num_points; number++) { +++ *outputVectorPtr++ = ((float)(*inputVectorPtr++)); +++ } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++ ++- ++- ++ #endif /* INCLUDED_volk_64f_convert_32f_u_H */ ++ #ifndef INCLUDED_volk_64f_convert_32f_a_H ++ #define INCLUDED_volk_64f_convert_32f_a_H ++@@ -197,34 +213,39 @@ static inline void volk_64f_convert_32f_generic(float* outputVector, const doubl ++ #ifdef LV_HAVE_AVX512F ++ #include ++ ++-static inline void volk_64f_convert_32f_a_avx512f(float* outputVector, const double* inputVector, unsigned int num_points){ ++- unsigned int number = 0; +++static inline void volk_64f_convert_32f_a_avx512f(float* outputVector, +++ const double* inputVector, +++ unsigned int num_points) +++{ +++ unsigned int number = 0; ++ ++- const unsigned int oneSixteenthPoints = num_points / 16; +++ const unsigned int oneSixteenthPoints = num_points / 16; ++ ++- const double* inputVectorPtr = (const double*)inputVector; ++- float* outputVectorPtr = outputVector; ++- __m256 ret1, ret2; ++- __m512d inputVal1, inputVal2; +++ const double* inputVectorPtr = (const double*)inputVector; +++ float* outputVectorPtr = outputVector; +++ __m256 ret1, ret2; +++ __m512d inputVal1, inputVal2; ++ ++- for(;number < oneSixteenthPoints; number++){ ++- inputVal1 = _mm512_load_pd(inputVectorPtr); inputVectorPtr += 8; ++- inputVal2 = _mm512_load_pd(inputVectorPtr); inputVectorPtr += 8; +++ for (; number < oneSixteenthPoints; number++) { +++ inputVal1 = _mm512_load_pd(inputVectorPtr); +++ inputVectorPtr += 8; +++ inputVal2 = _mm512_load_pd(inputVectorPtr); +++ inputVectorPtr += 8; ++ ++- ret1 = _mm512_cvtpd_ps(inputVal1); ++- ret2 = _mm512_cvtpd_ps(inputVal2); +++ ret1 = _mm512_cvtpd_ps(inputVal1); +++ ret2 = _mm512_cvtpd_ps(inputVal2); ++ ++- _mm256_store_ps(outputVectorPtr, ret1); ++- outputVectorPtr += 8; +++ _mm256_store_ps(outputVectorPtr, ret1); +++ outputVectorPtr += 8; ++ ++- _mm256_store_ps(outputVectorPtr, ret2); ++- outputVectorPtr += 8; ++- } +++ _mm256_store_ps(outputVectorPtr, ret2); +++ outputVectorPtr += 8; +++ } ++ ++- number = oneSixteenthPoints * 16; ++- for(; number < num_points; number++){ ++- outputVector[number] = (float)(inputVector[number]); ++- } +++ number = oneSixteenthPoints * 16; +++ for (; number < num_points; number++) { +++ outputVector[number] = (float)(inputVector[number]); +++ } ++ } ++ #endif /* LV_HAVE_AVX512F */ ++ ++@@ -232,34 +253,39 @@ static inline void volk_64f_convert_32f_a_avx512f(float* outputVector, const dou ++ #ifdef LV_HAVE_AVX ++ #include ++ ++-static inline void volk_64f_convert_32f_a_avx(float* outputVector, const double* inputVector, 
unsigned int num_points){ ++- unsigned int number = 0; +++static inline void volk_64f_convert_32f_a_avx(float* outputVector, +++ const double* inputVector, +++ unsigned int num_points) +++{ +++ unsigned int number = 0; ++ ++- const unsigned int oneEightPoints = num_points / 8; +++ const unsigned int oneEightPoints = num_points / 8; ++ ++- const double* inputVectorPtr = (const double*)inputVector; ++- float* outputVectorPtr = outputVector; ++- __m128 ret1, ret2; ++- __m256d inputVal1, inputVal2; +++ const double* inputVectorPtr = (const double*)inputVector; +++ float* outputVectorPtr = outputVector; +++ __m128 ret1, ret2; +++ __m256d inputVal1, inputVal2; ++ ++- for(;number < oneEightPoints; number++){ ++- inputVal1 = _mm256_load_pd(inputVectorPtr); inputVectorPtr += 4; ++- inputVal2 = _mm256_load_pd(inputVectorPtr); inputVectorPtr += 4; +++ for (; number < oneEightPoints; number++) { +++ inputVal1 = _mm256_load_pd(inputVectorPtr); +++ inputVectorPtr += 4; +++ inputVal2 = _mm256_load_pd(inputVectorPtr); +++ inputVectorPtr += 4; ++ ++- ret1 = _mm256_cvtpd_ps(inputVal1); ++- ret2 = _mm256_cvtpd_ps(inputVal2); +++ ret1 = _mm256_cvtpd_ps(inputVal1); +++ ret2 = _mm256_cvtpd_ps(inputVal2); ++ ++- _mm_store_ps(outputVectorPtr, ret1); ++- outputVectorPtr += 4; +++ _mm_store_ps(outputVectorPtr, ret1); +++ outputVectorPtr += 4; ++ ++- _mm_store_ps(outputVectorPtr, ret2); ++- outputVectorPtr += 4; ++- } +++ _mm_store_ps(outputVectorPtr, ret2); +++ outputVectorPtr += 4; +++ } ++ ++- number = oneEightPoints * 8; ++- for(; number < num_points; number++){ ++- outputVector[number] = (float)(inputVector[number]); ++- } +++ number = oneEightPoints * 8; +++ for (; number < num_points; number++) { +++ outputVector[number] = (float)(inputVector[number]); +++ } ++ } ++ #endif /* LV_HAVE_AVX */ ++ ++@@ -267,51 +293,57 @@ static inline void volk_64f_convert_32f_a_avx(float* outputVector, const double* ++ #ifdef LV_HAVE_SSE2 ++ #include ++ ++-static inline void volk_64f_convert_32f_a_sse2(float* outputVector, const double* inputVector, unsigned int num_points){ ++- unsigned int number = 0; +++static inline void volk_64f_convert_32f_a_sse2(float* outputVector, +++ const double* inputVector, +++ unsigned int num_points) +++{ +++ unsigned int number = 0; ++ ++- const unsigned int quarterPoints = num_points / 4; +++ const unsigned int quarterPoints = num_points / 4; ++ ++- const double* inputVectorPtr = (const double*)inputVector; ++- float* outputVectorPtr = outputVector; ++- __m128 ret, ret2; ++- __m128d inputVal1, inputVal2; +++ const double* inputVectorPtr = (const double*)inputVector; +++ float* outputVectorPtr = outputVector; +++ __m128 ret, ret2; +++ __m128d inputVal1, inputVal2; ++ ++- for(;number < quarterPoints; number++){ ++- inputVal1 = _mm_load_pd(inputVectorPtr); inputVectorPtr += 2; ++- inputVal2 = _mm_load_pd(inputVectorPtr); inputVectorPtr += 2; +++ for (; number < quarterPoints; number++) { +++ inputVal1 = _mm_load_pd(inputVectorPtr); +++ inputVectorPtr += 2; +++ inputVal2 = _mm_load_pd(inputVectorPtr); +++ inputVectorPtr += 2; ++ ++- ret = _mm_cvtpd_ps(inputVal1); ++- ret2 = _mm_cvtpd_ps(inputVal2); +++ ret = _mm_cvtpd_ps(inputVal1); +++ ret2 = _mm_cvtpd_ps(inputVal2); ++ ++- ret = _mm_movelh_ps(ret, ret2); +++ ret = _mm_movelh_ps(ret, ret2); ++ ++- _mm_store_ps(outputVectorPtr, ret); ++- outputVectorPtr += 4; ++- } +++ _mm_store_ps(outputVectorPtr, ret); +++ outputVectorPtr += 4; +++ } ++ ++- number = quarterPoints * 4; ++- for(; number < num_points; number++){ ++- outputVector[number] = 
(float)(inputVector[number]); ++- } +++ number = quarterPoints * 4; +++ for (; number < num_points; number++) { +++ outputVector[number] = (float)(inputVector[number]); +++ } ++ } ++ #endif /* LV_HAVE_SSE2 */ ++ ++ ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void volk_64f_convert_32f_a_generic(float* outputVector, const double* inputVector, unsigned int num_points){ ++- float* outputVectorPtr = outputVector; ++- const double* inputVectorPtr = inputVector; ++- unsigned int number = 0; ++- ++- for(number = 0; number < num_points; number++){ ++- *outputVectorPtr++ = ((float)(*inputVectorPtr++)); ++- } +++static inline void volk_64f_convert_32f_a_generic(float* outputVector, +++ const double* inputVector, +++ unsigned int num_points) +++{ +++ float* outputVectorPtr = outputVector; +++ const double* inputVectorPtr = inputVector; +++ unsigned int number = 0; +++ +++ for (number = 0; number < num_points; number++) { +++ *outputVectorPtr++ = ((float)(*inputVectorPtr++)); +++ } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++ ++- ++- ++ #endif /* INCLUDED_volk_64f_convert_32f_a_H */ ++diff --git a/kernels/volk/volk_64f_x2_add_64f.h b/kernels/volk/volk_64f_x2_add_64f.h ++index 03b8e4c..5c512cc 100644 ++--- a/kernels/volk/volk_64f_x2_add_64f.h +++++ b/kernels/volk/volk_64f_x2_add_64f.h ++@@ -31,8 +31,8 @@ ++ * ++ * Dispatcher Prototype ++ * \code ++- * void volk_64f_x2_add_64f(float* cVector, const float* aVector, const float* bVector, unsigned int num_points) ++- * \endcode +++ * void volk_64f_x2_add_64f(float* cVector, const float* aVector, const float* bVector, +++ * unsigned int num_points) \endcode ++ * ++ * \b Inputs ++ * \li aVector: First input vector. ++@@ -76,18 +76,19 @@ ++ ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void ++-volk_64f_x2_add_64f_generic(double *cVector, const double *aVector, ++- const double *bVector, unsigned int num_points) +++static inline void volk_64f_x2_add_64f_generic(double* cVector, +++ const double* aVector, +++ const double* bVector, +++ unsigned int num_points) ++ { ++- double *cPtr = cVector; ++- const double *aPtr = aVector; ++- const double *bPtr = bVector; ++- unsigned int number = 0; ++- ++- for (number = 0; number < num_points; number++) { ++- *cPtr++ = (*aPtr++) + (*bPtr++); ++- } +++ double* cPtr = cVector; +++ const double* aPtr = aVector; +++ const double* bPtr = bVector; +++ unsigned int number = 0; +++ +++ for (number = 0; number < num_points; number++) { +++ *cPtr++ = (*aPtr++) + (*bPtr++); +++ } ++ } ++ ++ #endif /* LV_HAVE_GENERIC */ ++@@ -100,35 +101,36 @@ volk_64f_x2_add_64f_generic(double *cVector, const double *aVector, ++ ++ #include ++ ++-static inline void ++-volk_64f_x2_add_64f_u_sse2(double *cVector, const double *aVector, ++- const double *bVector, unsigned int num_points) +++static inline void volk_64f_x2_add_64f_u_sse2(double* cVector, +++ const double* aVector, +++ const double* bVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int half_points = num_points / 2; +++ unsigned int number = 0; +++ const unsigned int half_points = num_points / 2; ++ ++- double *cPtr = cVector; ++- const double *aPtr = aVector; ++- const double *bPtr = bVector; +++ double* cPtr = cVector; +++ const double* aPtr = aVector; +++ const double* bPtr = bVector; ++ ++- __m128d aVal, bVal, cVal; ++- for (; number < half_points; number++) { ++- aVal = _mm_loadu_pd(aPtr); ++- bVal = _mm_loadu_pd(bPtr); +++ __m128d aVal, bVal, cVal; +++ for (; number < half_points; number++) { +++ aVal = _mm_loadu_pd(aPtr); +++ bVal = 
_mm_loadu_pd(bPtr); ++ ++- cVal = _mm_add_pd(aVal, bVal); +++ cVal = _mm_add_pd(aVal, bVal); ++ ++- _mm_storeu_pd(cPtr, cVal); // Store the results back into the C container +++ _mm_storeu_pd(cPtr, cVal); // Store the results back into the C container ++ ++- aPtr += 2; ++- bPtr += 2; ++- cPtr += 2; ++- } +++ aPtr += 2; +++ bPtr += 2; +++ cPtr += 2; +++ } ++ ++- number = half_points * 2; ++- for (; number < num_points; number++) { ++- *cPtr++ = (*aPtr++) + (*bPtr++); ++- } +++ number = half_points * 2; +++ for (; number < num_points; number++) { +++ *cPtr++ = (*aPtr++) + (*bPtr++); +++ } ++ } ++ ++ #endif /* LV_HAVE_SSE2 */ ++@@ -138,36 +140,37 @@ volk_64f_x2_add_64f_u_sse2(double *cVector, const double *aVector, ++ ++ #include ++ ++-static inline void ++-volk_64f_x2_add_64f_u_avx(double *cVector, const double *aVector, ++- const double *bVector, unsigned int num_points) +++static inline void volk_64f_x2_add_64f_u_avx(double* cVector, +++ const double* aVector, +++ const double* bVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int quarter_points = num_points / 4; +++ unsigned int number = 0; +++ const unsigned int quarter_points = num_points / 4; ++ ++- double *cPtr = cVector; ++- const double *aPtr = aVector; ++- const double *bPtr = bVector; +++ double* cPtr = cVector; +++ const double* aPtr = aVector; +++ const double* bPtr = bVector; ++ ++- __m256d aVal, bVal, cVal; ++- for (; number < quarter_points; number++) { +++ __m256d aVal, bVal, cVal; +++ for (; number < quarter_points; number++) { ++ ++- aVal = _mm256_loadu_pd(aPtr); ++- bVal = _mm256_loadu_pd(bPtr); +++ aVal = _mm256_loadu_pd(aPtr); +++ bVal = _mm256_loadu_pd(bPtr); ++ ++- cVal = _mm256_add_pd(aVal, bVal); +++ cVal = _mm256_add_pd(aVal, bVal); ++ ++- _mm256_storeu_pd(cPtr, cVal); // Store the results back into the C container +++ _mm256_storeu_pd(cPtr, cVal); // Store the results back into the C container ++ ++- aPtr += 4; ++- bPtr += 4; ++- cPtr += 4; ++- } +++ aPtr += 4; +++ bPtr += 4; +++ cPtr += 4; +++ } ++ ++- number = quarter_points * 4; ++- for (; number < num_points; number++) { ++- *cPtr++ = (*aPtr++) + (*bPtr++); ++- } +++ number = quarter_points * 4; +++ for (; number < num_points; number++) { +++ *cPtr++ = (*aPtr++) + (*bPtr++); +++ } ++ } ++ ++ #endif /* LV_HAVE_AVX */ ++@@ -180,35 +183,36 @@ volk_64f_x2_add_64f_u_avx(double *cVector, const double *aVector, ++ ++ #include ++ ++-static inline void ++-volk_64f_x2_add_64f_a_sse2(double *cVector, const double *aVector, ++- const double *bVector, unsigned int num_points) +++static inline void volk_64f_x2_add_64f_a_sse2(double* cVector, +++ const double* aVector, +++ const double* bVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int half_points = num_points / 2; +++ unsigned int number = 0; +++ const unsigned int half_points = num_points / 2; ++ ++- double *cPtr = cVector; ++- const double *aPtr = aVector; ++- const double *bPtr = bVector; +++ double* cPtr = cVector; +++ const double* aPtr = aVector; +++ const double* bPtr = bVector; ++ ++- __m128d aVal, bVal, cVal; ++- for (; number < half_points; number++) { ++- aVal = _mm_load_pd(aPtr); ++- bVal = _mm_load_pd(bPtr); +++ __m128d aVal, bVal, cVal; +++ for (; number < half_points; number++) { +++ aVal = _mm_load_pd(aPtr); +++ bVal = _mm_load_pd(bPtr); ++ ++- cVal = _mm_add_pd(aVal, bVal); +++ cVal = _mm_add_pd(aVal, bVal); ++ ++- _mm_store_pd(cPtr, cVal); // Store the results back into the C container +++ _mm_store_pd(cPtr, cVal); // 
Store the results back into the C container ++ ++- aPtr += 2; ++- bPtr += 2; ++- cPtr += 2; ++- } +++ aPtr += 2; +++ bPtr += 2; +++ cPtr += 2; +++ } ++ ++- number = half_points * 2; ++- for (; number < num_points; number++) { ++- *cPtr++ = (*aPtr++) + (*bPtr++); ++- } +++ number = half_points * 2; +++ for (; number < num_points; number++) { +++ *cPtr++ = (*aPtr++) + (*bPtr++); +++ } ++ } ++ ++ #endif /* LV_HAVE_SSE2 */ ++@@ -218,36 +222,37 @@ volk_64f_x2_add_64f_a_sse2(double *cVector, const double *aVector, ++ ++ #include ++ ++-static inline void ++-volk_64f_x2_add_64f_a_avx(double *cVector, const double *aVector, ++- const double *bVector, unsigned int num_points) +++static inline void volk_64f_x2_add_64f_a_avx(double* cVector, +++ const double* aVector, +++ const double* bVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int quarter_points = num_points / 4; +++ unsigned int number = 0; +++ const unsigned int quarter_points = num_points / 4; ++ ++- double *cPtr = cVector; ++- const double *aPtr = aVector; ++- const double *bPtr = bVector; +++ double* cPtr = cVector; +++ const double* aPtr = aVector; +++ const double* bPtr = bVector; ++ ++- __m256d aVal, bVal, cVal; ++- for (; number < quarter_points; number++) { +++ __m256d aVal, bVal, cVal; +++ for (; number < quarter_points; number++) { ++ ++- aVal = _mm256_load_pd(aPtr); ++- bVal = _mm256_load_pd(bPtr); +++ aVal = _mm256_load_pd(aPtr); +++ bVal = _mm256_load_pd(bPtr); ++ ++- cVal = _mm256_add_pd(aVal, bVal); +++ cVal = _mm256_add_pd(aVal, bVal); ++ ++- _mm256_store_pd(cPtr, cVal); // Store the results back into the C container +++ _mm256_store_pd(cPtr, cVal); // Store the results back into the C container ++ ++- aPtr += 4; ++- bPtr += 4; ++- cPtr += 4; ++- } +++ aPtr += 4; +++ bPtr += 4; +++ cPtr += 4; +++ } ++ ++- number = quarter_points * 4; ++- for (; number < num_points; number++) { ++- *cPtr++ = (*aPtr++) + (*bPtr++); ++- } +++ number = quarter_points * 4; +++ for (; number < num_points; number++) { +++ *cPtr++ = (*aPtr++) + (*bPtr++); +++ } ++ } ++ ++ #endif /* LV_HAVE_AVX */ ++diff --git a/kernels/volk/volk_64f_x2_max_64f.h b/kernels/volk/volk_64f_x2_max_64f.h ++index d4464b7..8f7f743 100644 ++--- a/kernels/volk/volk_64f_x2_max_64f.h +++++ b/kernels/volk/volk_64f_x2_max_64f.h ++@@ -32,8 +32,8 @@ ++ * ++ * Dispatcher Prototype ++ * \code ++- * void volk_64f_x2_max_64f(double* cVector, const double* aVector, const double* bVector, unsigned int num_points) ++- * \endcode +++ * void volk_64f_x2_max_64f(double* cVector, const double* aVector, const double* bVector, +++ * unsigned int num_points) \endcode ++ * ++ * \b Inputs ++ * \li aVector: First input vector. 
++@@ -77,38 +77,39 @@ ++ #ifdef LV_HAVE_AVX512F ++ #include ++ ++-static inline void ++-volk_64f_x2_max_64f_a_avx512f(double* cVector, const double* aVector, ++- const double* bVector, unsigned int num_points) +++static inline void volk_64f_x2_max_64f_a_avx512f(double* cVector, +++ const double* aVector, +++ const double* bVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int eigthPoints = num_points / 8; +++ unsigned int number = 0; +++ const unsigned int eigthPoints = num_points / 8; ++ ++- double* cPtr = cVector; ++- const double* aPtr = aVector; ++- const double* bPtr= bVector; +++ double* cPtr = cVector; +++ const double* aPtr = aVector; +++ const double* bPtr = bVector; ++ ++- __m512d aVal, bVal, cVal; ++- for(;number < eigthPoints; number++){ +++ __m512d aVal, bVal, cVal; +++ for (; number < eigthPoints; number++) { ++ ++- aVal = _mm512_load_pd(aPtr); ++- bVal = _mm512_load_pd(bPtr); +++ aVal = _mm512_load_pd(aPtr); +++ bVal = _mm512_load_pd(bPtr); ++ ++- cVal = _mm512_max_pd(aVal, bVal); +++ cVal = _mm512_max_pd(aVal, bVal); ++ ++- _mm512_store_pd(cPtr,cVal); // Store the results back into the C container +++ _mm512_store_pd(cPtr, cVal); // Store the results back into the C container ++ ++- aPtr += 8; ++- bPtr += 8; ++- cPtr += 8; ++- } +++ aPtr += 8; +++ bPtr += 8; +++ cPtr += 8; +++ } ++ ++- number = eigthPoints * 8; ++- for(;number < num_points; number++){ ++- const double a = *aPtr++; ++- const double b = *bPtr++; ++- *cPtr++ = ( a > b ? a : b); ++- } +++ number = eigthPoints * 8; +++ for (; number < num_points; number++) { +++ const double a = *aPtr++; +++ const double b = *bPtr++; +++ *cPtr++ = (a > b ? a : b); +++ } ++ } ++ #endif /* LV_HAVE_AVX512F */ ++ ++@@ -116,38 +117,39 @@ volk_64f_x2_max_64f_a_avx512f(double* cVector, const double* aVector, ++ #ifdef LV_HAVE_AVX ++ #include ++ ++-static inline void ++-volk_64f_x2_max_64f_a_avx(double* cVector, const double* aVector, ++- const double* bVector, unsigned int num_points) +++static inline void volk_64f_x2_max_64f_a_avx(double* cVector, +++ const double* aVector, +++ const double* bVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int quarterPoints = num_points / 4; +++ unsigned int number = 0; +++ const unsigned int quarterPoints = num_points / 4; ++ ++- double* cPtr = cVector; ++- const double* aPtr = aVector; ++- const double* bPtr= bVector; +++ double* cPtr = cVector; +++ const double* aPtr = aVector; +++ const double* bPtr = bVector; ++ ++- __m256d aVal, bVal, cVal; ++- for(;number < quarterPoints; number++){ +++ __m256d aVal, bVal, cVal; +++ for (; number < quarterPoints; number++) { ++ ++- aVal = _mm256_load_pd(aPtr); ++- bVal = _mm256_load_pd(bPtr); +++ aVal = _mm256_load_pd(aPtr); +++ bVal = _mm256_load_pd(bPtr); ++ ++- cVal = _mm256_max_pd(aVal, bVal); +++ cVal = _mm256_max_pd(aVal, bVal); ++ ++- _mm256_store_pd(cPtr,cVal); // Store the results back into the C container +++ _mm256_store_pd(cPtr, cVal); // Store the results back into the C container ++ ++- aPtr += 4; ++- bPtr += 4; ++- cPtr += 4; ++- } +++ aPtr += 4; +++ bPtr += 4; +++ cPtr += 4; +++ } ++ ++- number = quarterPoints * 4; ++- for(;number < num_points; number++){ ++- const double a = *aPtr++; ++- const double b = *bPtr++; ++- *cPtr++ = ( a > b ? a : b); ++- } +++ number = quarterPoints * 4; +++ for (; number < num_points; number++) { +++ const double a = *aPtr++; +++ const double b = *bPtr++; +++ *cPtr++ = (a > b ? 
a : b); +++ } ++ } ++ #endif /* LV_HAVE_AVX */ ++ ++@@ -155,58 +157,60 @@ volk_64f_x2_max_64f_a_avx(double* cVector, const double* aVector, ++ #ifdef LV_HAVE_SSE2 ++ #include ++ ++-static inline void ++-volk_64f_x2_max_64f_a_sse2(double* cVector, const double* aVector, ++- const double* bVector, unsigned int num_points) +++static inline void volk_64f_x2_max_64f_a_sse2(double* cVector, +++ const double* aVector, +++ const double* bVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int halfPoints = num_points / 2; +++ unsigned int number = 0; +++ const unsigned int halfPoints = num_points / 2; ++ ++- double* cPtr = cVector; ++- const double* aPtr = aVector; ++- const double* bPtr= bVector; +++ double* cPtr = cVector; +++ const double* aPtr = aVector; +++ const double* bPtr = bVector; ++ ++- __m128d aVal, bVal, cVal; ++- for(;number < halfPoints; number++){ +++ __m128d aVal, bVal, cVal; +++ for (; number < halfPoints; number++) { ++ ++- aVal = _mm_load_pd(aPtr); ++- bVal = _mm_load_pd(bPtr); +++ aVal = _mm_load_pd(aPtr); +++ bVal = _mm_load_pd(bPtr); ++ ++- cVal = _mm_max_pd(aVal, bVal); +++ cVal = _mm_max_pd(aVal, bVal); ++ ++- _mm_store_pd(cPtr,cVal); // Store the results back into the C container +++ _mm_store_pd(cPtr, cVal); // Store the results back into the C container ++ ++- aPtr += 2; ++- bPtr += 2; ++- cPtr += 2; ++- } +++ aPtr += 2; +++ bPtr += 2; +++ cPtr += 2; +++ } ++ ++- number = halfPoints * 2; ++- for(;number < num_points; number++){ ++- const double a = *aPtr++; ++- const double b = *bPtr++; ++- *cPtr++ = ( a > b ? a : b); ++- } +++ number = halfPoints * 2; +++ for (; number < num_points; number++) { +++ const double a = *aPtr++; +++ const double b = *bPtr++; +++ *cPtr++ = (a > b ? a : b); +++ } ++ } ++ #endif /* LV_HAVE_SSE2 */ ++ ++ ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void ++-volk_64f_x2_max_64f_generic(double* cVector, const double* aVector, ++- const double* bVector, unsigned int num_points) +++static inline void volk_64f_x2_max_64f_generic(double* cVector, +++ const double* aVector, +++ const double* bVector, +++ unsigned int num_points) ++ { ++- double* cPtr = cVector; ++- const double* aPtr = aVector; ++- const double* bPtr= bVector; ++- unsigned int number = 0; ++- ++- for(number = 0; number < num_points; number++){ ++- const double a = *aPtr++; ++- const double b = *bPtr++; ++- *cPtr++ = ( a > b ? a : b); ++- } +++ double* cPtr = cVector; +++ const double* aPtr = aVector; +++ const double* bPtr = bVector; +++ unsigned int number = 0; +++ +++ for (number = 0; number < num_points; number++) { +++ const double a = *aPtr++; +++ const double b = *bPtr++; +++ *cPtr++ = (a > b ? 
a : b); +++ } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++@@ -223,38 +227,39 @@ volk_64f_x2_max_64f_generic(double* cVector, const double* aVector, ++ #ifdef LV_HAVE_AVX512F ++ #include ++ ++-static inline void ++-volk_64f_x2_max_64f_u_avx512f(double* cVector, const double* aVector, ++- const double* bVector, unsigned int num_points) +++static inline void volk_64f_x2_max_64f_u_avx512f(double* cVector, +++ const double* aVector, +++ const double* bVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int eigthPoints = num_points / 8; +++ unsigned int number = 0; +++ const unsigned int eigthPoints = num_points / 8; ++ ++- double* cPtr = cVector; ++- const double* aPtr = aVector; ++- const double* bPtr= bVector; +++ double* cPtr = cVector; +++ const double* aPtr = aVector; +++ const double* bPtr = bVector; ++ ++- __m512d aVal, bVal, cVal; ++- for(;number < eigthPoints; number++){ +++ __m512d aVal, bVal, cVal; +++ for (; number < eigthPoints; number++) { ++ ++- aVal = _mm512_loadu_pd(aPtr); ++- bVal = _mm512_loadu_pd(bPtr); +++ aVal = _mm512_loadu_pd(aPtr); +++ bVal = _mm512_loadu_pd(bPtr); ++ ++- cVal = _mm512_max_pd(aVal, bVal); +++ cVal = _mm512_max_pd(aVal, bVal); ++ ++- _mm512_storeu_pd(cPtr,cVal); // Store the results back into the C container +++ _mm512_storeu_pd(cPtr, cVal); // Store the results back into the C container ++ ++- aPtr += 8; ++- bPtr += 8; ++- cPtr += 8; ++- } +++ aPtr += 8; +++ bPtr += 8; +++ cPtr += 8; +++ } ++ ++- number = eigthPoints * 8; ++- for(;number < num_points; number++){ ++- const double a = *aPtr++; ++- const double b = *bPtr++; ++- *cPtr++ = ( a > b ? a : b); ++- } +++ number = eigthPoints * 8; +++ for (; number < num_points; number++) { +++ const double a = *aPtr++; +++ const double b = *bPtr++; +++ *cPtr++ = (a > b ? a : b); +++ } ++ } ++ #endif /* LV_HAVE_AVX512F */ ++ ++@@ -262,38 +267,39 @@ volk_64f_x2_max_64f_u_avx512f(double* cVector, const double* aVector, ++ #ifdef LV_HAVE_AVX ++ #include ++ ++-static inline void ++-volk_64f_x2_max_64f_u_avx(double* cVector, const double* aVector, ++- const double* bVector, unsigned int num_points) +++static inline void volk_64f_x2_max_64f_u_avx(double* cVector, +++ const double* aVector, +++ const double* bVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int quarterPoints = num_points / 4; +++ unsigned int number = 0; +++ const unsigned int quarterPoints = num_points / 4; ++ ++- double* cPtr = cVector; ++- const double* aPtr = aVector; ++- const double* bPtr= bVector; +++ double* cPtr = cVector; +++ const double* aPtr = aVector; +++ const double* bPtr = bVector; ++ ++- __m256d aVal, bVal, cVal; ++- for(;number < quarterPoints; number++){ +++ __m256d aVal, bVal, cVal; +++ for (; number < quarterPoints; number++) { ++ ++- aVal = _mm256_loadu_pd(aPtr); ++- bVal = _mm256_loadu_pd(bPtr); +++ aVal = _mm256_loadu_pd(aPtr); +++ bVal = _mm256_loadu_pd(bPtr); ++ ++- cVal = _mm256_max_pd(aVal, bVal); +++ cVal = _mm256_max_pd(aVal, bVal); ++ ++- _mm256_storeu_pd(cPtr,cVal); // Store the results back into the C container +++ _mm256_storeu_pd(cPtr, cVal); // Store the results back into the C container ++ ++- aPtr += 4; ++- bPtr += 4; ++- cPtr += 4; ++- } +++ aPtr += 4; +++ bPtr += 4; +++ cPtr += 4; +++ } ++ ++- number = quarterPoints * 4; ++- for(;number < num_points; number++){ ++- const double a = *aPtr++; ++- const double b = *bPtr++; ++- *cPtr++ = ( a > b ? 
a : b); ++- } +++ number = quarterPoints * 4; +++ for (; number < num_points; number++) { +++ const double a = *aPtr++; +++ const double b = *bPtr++; +++ *cPtr++ = (a > b ? a : b); +++ } ++ } ++ #endif /* LV_HAVE_AVX */ ++ ++diff --git a/kernels/volk/volk_64f_x2_min_64f.h b/kernels/volk/volk_64f_x2_min_64f.h ++index 0ffa305..7dc4d59 100644 ++--- a/kernels/volk/volk_64f_x2_min_64f.h +++++ b/kernels/volk/volk_64f_x2_min_64f.h ++@@ -32,7 +32,8 @@ ++ * ++ * Dispatcher Prototype ++ * \code ++- * void volk_64f_x2_min_64f(double* cVector, const double* aVector, const double* bVector, unsigned int num_points) +++ * void volk_64f_x2_min_64f(double* cVector, const double* aVector, const double* bVector, +++ unsigned int num_points) ++ * \endcode ++ * ++ * \b Inputs ++@@ -77,38 +78,39 @@ ++ #ifdef LV_HAVE_AVX512F ++ #include ++ ++-static inline void ++-volk_64f_x2_min_64f_a_avx512f(double* cVector, const double* aVector, ++- const double* bVector, unsigned int num_points) +++static inline void volk_64f_x2_min_64f_a_avx512f(double* cVector, +++ const double* aVector, +++ const double* bVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int eigthPoints = num_points / 8; +++ unsigned int number = 0; +++ const unsigned int eigthPoints = num_points / 8; ++ ++- double* cPtr = cVector; ++- const double* aPtr = aVector; ++- const double* bPtr= bVector; +++ double* cPtr = cVector; +++ const double* aPtr = aVector; +++ const double* bPtr = bVector; ++ ++- __m512d aVal, bVal, cVal; ++- for(;number < eigthPoints; number++){ +++ __m512d aVal, bVal, cVal; +++ for (; number < eigthPoints; number++) { ++ ++- aVal = _mm512_load_pd(aPtr); ++- bVal = _mm512_load_pd(bPtr); +++ aVal = _mm512_load_pd(aPtr); +++ bVal = _mm512_load_pd(bPtr); ++ ++- cVal = _mm512_min_pd(aVal, bVal); +++ cVal = _mm512_min_pd(aVal, bVal); ++ ++- _mm512_store_pd(cPtr,cVal); // Store the results back into the C container +++ _mm512_store_pd(cPtr, cVal); // Store the results back into the C container ++ ++- aPtr += 8; ++- bPtr += 8; ++- cPtr += 8; ++- } +++ aPtr += 8; +++ bPtr += 8; +++ cPtr += 8; +++ } ++ ++- number = eigthPoints * 8; ++- for(;number < num_points; number++){ ++- const double a = *aPtr++; ++- const double b = *bPtr++; ++- *cPtr++ = ( a < b ? a : b); ++- } +++ number = eigthPoints * 8; +++ for (; number < num_points; number++) { +++ const double a = *aPtr++; +++ const double b = *bPtr++; +++ *cPtr++ = (a < b ? 
a : b); +++ } ++ } ++ #endif /* LV_HAVE_AVX512F */ ++ ++@@ -116,38 +118,39 @@ volk_64f_x2_min_64f_a_avx512f(double* cVector, const double* aVector, ++ #ifdef LV_HAVE_AVX ++ #include ++ ++-static inline void ++-volk_64f_x2_min_64f_a_avx(double* cVector, const double* aVector, ++- const double* bVector, unsigned int num_points) +++static inline void volk_64f_x2_min_64f_a_avx(double* cVector, +++ const double* aVector, +++ const double* bVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int quarterPoints = num_points / 4; +++ unsigned int number = 0; +++ const unsigned int quarterPoints = num_points / 4; ++ ++- double* cPtr = cVector; ++- const double* aPtr = aVector; ++- const double* bPtr= bVector; +++ double* cPtr = cVector; +++ const double* aPtr = aVector; +++ const double* bPtr = bVector; ++ ++- __m256d aVal, bVal, cVal; ++- for(;number < quarterPoints; number++){ +++ __m256d aVal, bVal, cVal; +++ for (; number < quarterPoints; number++) { ++ ++- aVal = _mm256_load_pd(aPtr); ++- bVal = _mm256_load_pd(bPtr); +++ aVal = _mm256_load_pd(aPtr); +++ bVal = _mm256_load_pd(bPtr); ++ ++- cVal = _mm256_min_pd(aVal, bVal); +++ cVal = _mm256_min_pd(aVal, bVal); ++ ++- _mm256_store_pd(cPtr,cVal); // Store the results back into the C container +++ _mm256_store_pd(cPtr, cVal); // Store the results back into the C container ++ ++- aPtr += 4; ++- bPtr += 4; ++- cPtr += 4; ++- } +++ aPtr += 4; +++ bPtr += 4; +++ cPtr += 4; +++ } ++ ++- number = quarterPoints * 4; ++- for(;number < num_points; number++){ ++- const double a = *aPtr++; ++- const double b = *bPtr++; ++- *cPtr++ = ( a < b ? a : b); ++- } +++ number = quarterPoints * 4; +++ for (; number < num_points; number++) { +++ const double a = *aPtr++; +++ const double b = *bPtr++; +++ *cPtr++ = (a < b ? a : b); +++ } ++ } ++ #endif /* LV_HAVE_AVX */ ++ ++@@ -155,58 +158,60 @@ volk_64f_x2_min_64f_a_avx(double* cVector, const double* aVector, ++ #ifdef LV_HAVE_SSE2 ++ #include ++ ++-static inline void ++-volk_64f_x2_min_64f_a_sse2(double* cVector, const double* aVector, ++- const double* bVector, unsigned int num_points) +++static inline void volk_64f_x2_min_64f_a_sse2(double* cVector, +++ const double* aVector, +++ const double* bVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int halfPoints = num_points / 2; +++ unsigned int number = 0; +++ const unsigned int halfPoints = num_points / 2; ++ ++- double* cPtr = cVector; ++- const double* aPtr = aVector; ++- const double* bPtr= bVector; +++ double* cPtr = cVector; +++ const double* aPtr = aVector; +++ const double* bPtr = bVector; ++ ++- __m128d aVal, bVal, cVal; ++- for(;number < halfPoints; number++){ +++ __m128d aVal, bVal, cVal; +++ for (; number < halfPoints; number++) { ++ ++- aVal = _mm_load_pd(aPtr); ++- bVal = _mm_load_pd(bPtr); +++ aVal = _mm_load_pd(aPtr); +++ bVal = _mm_load_pd(bPtr); ++ ++- cVal = _mm_min_pd(aVal, bVal); +++ cVal = _mm_min_pd(aVal, bVal); ++ ++- _mm_store_pd(cPtr,cVal); // Store the results back into the C container +++ _mm_store_pd(cPtr, cVal); // Store the results back into the C container ++ ++- aPtr += 2; ++- bPtr += 2; ++- cPtr += 2; ++- } +++ aPtr += 2; +++ bPtr += 2; +++ cPtr += 2; +++ } ++ ++- number = halfPoints * 2; ++- for(;number < num_points; number++){ ++- const double a = *aPtr++; ++- const double b = *bPtr++; ++- *cPtr++ = ( a < b ? 
a : b); ++- } +++ number = halfPoints * 2; +++ for (; number < num_points; number++) { +++ const double a = *aPtr++; +++ const double b = *bPtr++; +++ *cPtr++ = (a < b ? a : b); +++ } ++ } ++ #endif /* LV_HAVE_SSE2 */ ++ ++ ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void ++-volk_64f_x2_min_64f_generic(double* cVector, const double* aVector, ++- const double* bVector, unsigned int num_points) +++static inline void volk_64f_x2_min_64f_generic(double* cVector, +++ const double* aVector, +++ const double* bVector, +++ unsigned int num_points) ++ { ++- double* cPtr = cVector; ++- const double* aPtr = aVector; ++- const double* bPtr= bVector; ++- unsigned int number = 0; ++- ++- for(number = 0; number < num_points; number++){ ++- const double a = *aPtr++; ++- const double b = *bPtr++; ++- *cPtr++ = ( a < b ? a : b); ++- } +++ double* cPtr = cVector; +++ const double* aPtr = aVector; +++ const double* bPtr = bVector; +++ unsigned int number = 0; +++ +++ for (number = 0; number < num_points; number++) { +++ const double a = *aPtr++; +++ const double b = *bPtr++; +++ *cPtr++ = (a < b ? a : b); +++ } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++@@ -222,38 +227,39 @@ volk_64f_x2_min_64f_generic(double* cVector, const double* aVector, ++ #ifdef LV_HAVE_AVX512F ++ #include ++ ++-static inline void ++-volk_64f_x2_min_64f_u_avx512f(double* cVector, const double* aVector, ++- const double* bVector, unsigned int num_points) +++static inline void volk_64f_x2_min_64f_u_avx512f(double* cVector, +++ const double* aVector, +++ const double* bVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int eigthPoints = num_points / 8; +++ unsigned int number = 0; +++ const unsigned int eigthPoints = num_points / 8; ++ ++- double* cPtr = cVector; ++- const double* aPtr = aVector; ++- const double* bPtr= bVector; +++ double* cPtr = cVector; +++ const double* aPtr = aVector; +++ const double* bPtr = bVector; ++ ++- __m512d aVal, bVal, cVal; ++- for(;number < eigthPoints; number++){ +++ __m512d aVal, bVal, cVal; +++ for (; number < eigthPoints; number++) { ++ ++- aVal = _mm512_loadu_pd(aPtr); ++- bVal = _mm512_loadu_pd(bPtr); +++ aVal = _mm512_loadu_pd(aPtr); +++ bVal = _mm512_loadu_pd(bPtr); ++ ++- cVal = _mm512_min_pd(aVal, bVal); +++ cVal = _mm512_min_pd(aVal, bVal); ++ ++- _mm512_storeu_pd(cPtr,cVal); // Store the results back into the C container +++ _mm512_storeu_pd(cPtr, cVal); // Store the results back into the C container ++ ++- aPtr += 8; ++- bPtr += 8; ++- cPtr += 8; ++- } +++ aPtr += 8; +++ bPtr += 8; +++ cPtr += 8; +++ } ++ ++- number = eigthPoints * 8; ++- for(;number < num_points; number++){ ++- const double a = *aPtr++; ++- const double b = *bPtr++; ++- *cPtr++ = ( a < b ? a : b); ++- } +++ number = eigthPoints * 8; +++ for (; number < num_points; number++) { +++ const double a = *aPtr++; +++ const double b = *bPtr++; +++ *cPtr++ = (a < b ? 
a : b); +++ } ++ } ++ #endif /* LV_HAVE_AVX512F */ ++ ++@@ -261,38 +267,39 @@ volk_64f_x2_min_64f_u_avx512f(double* cVector, const double* aVector, ++ #ifdef LV_HAVE_AVX ++ #include ++ ++-static inline void ++-volk_64f_x2_min_64f_u_avx(double* cVector, const double* aVector, ++- const double* bVector, unsigned int num_points) +++static inline void volk_64f_x2_min_64f_u_avx(double* cVector, +++ const double* aVector, +++ const double* bVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int quarterPoints = num_points / 4; +++ unsigned int number = 0; +++ const unsigned int quarterPoints = num_points / 4; ++ ++- double* cPtr = cVector; ++- const double* aPtr = aVector; ++- const double* bPtr= bVector; +++ double* cPtr = cVector; +++ const double* aPtr = aVector; +++ const double* bPtr = bVector; ++ ++- __m256d aVal, bVal, cVal; ++- for(;number < quarterPoints; number++){ +++ __m256d aVal, bVal, cVal; +++ for (; number < quarterPoints; number++) { ++ ++- aVal = _mm256_loadu_pd(aPtr); ++- bVal = _mm256_loadu_pd(bPtr); +++ aVal = _mm256_loadu_pd(aPtr); +++ bVal = _mm256_loadu_pd(bPtr); ++ ++- cVal = _mm256_min_pd(aVal, bVal); +++ cVal = _mm256_min_pd(aVal, bVal); ++ ++- _mm256_storeu_pd(cPtr,cVal); // Store the results back into the C container +++ _mm256_storeu_pd(cPtr, cVal); // Store the results back into the C container ++ ++- aPtr += 4; ++- bPtr += 4; ++- cPtr += 4; ++- } +++ aPtr += 4; +++ bPtr += 4; +++ cPtr += 4; +++ } ++ ++- number = quarterPoints * 4; ++- for(;number < num_points; number++){ ++- const double a = *aPtr++; ++- const double b = *bPtr++; ++- *cPtr++ = ( a < b ? a : b); ++- } +++ number = quarterPoints * 4; +++ for (; number < num_points; number++) { +++ const double a = *aPtr++; +++ const double b = *bPtr++; +++ *cPtr++ = (a < b ? a : b); +++ } ++ } ++ #endif /* LV_HAVE_AVX */ ++ ++diff --git a/kernels/volk/volk_64f_x2_multiply_64f.h b/kernels/volk/volk_64f_x2_multiply_64f.h ++index 6fa9e8e..39a155d 100644 ++--- a/kernels/volk/volk_64f_x2_multiply_64f.h +++++ b/kernels/volk/volk_64f_x2_multiply_64f.h ++@@ -31,8 +31,8 @@ ++ * ++ * Dispatcher Prototype ++ * \code ++- * void volk_64f_x2_multiply_64f(float* cVector, const float* aVector, const float* bVector, unsigned int num_points) ++- * \endcode +++ * void volk_64f_x2_multiply_64f(float* cVector, const float* aVector, const float* +++ * bVector, unsigned int num_points) \endcode ++ * ++ * \b Inputs ++ * \li aVector: First input vector. 
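(Aside, not part of the patch: the hunk above documents the public dispatcher for this kernel. A minimal, illustrative caller is sketched below; it assumes only the standard VOLK entry points from <volk/volk.h> — volk_get_alignment, volk_malloc, volk_free — and uses double pointers, matching the 64f implementations shown in the following hunks even though the Doxygen prototype above still reads float*.)

/* Illustrative sketch only: element-wise c[i] = a[i] * b[i] through the
 * VOLK dispatcher, which selects the best SIMD implementation at runtime. */
#include <stdio.h>
#include <volk/volk.h>

int main(void)
{
    const unsigned int num_points = 1000;
    const size_t alignment = volk_get_alignment();

    /* volk_malloc returns buffers aligned for the widest enabled SIMD ISA,
     * so the aligned (_a) kernels can be dispatched. */
    double* a = (double*)volk_malloc(num_points * sizeof(double), alignment);
    double* b = (double*)volk_malloc(num_points * sizeof(double), alignment);
    double* c = (double*)volk_malloc(num_points * sizeof(double), alignment);

    for (unsigned int i = 0; i < num_points; i++) {
        a[i] = 0.5 * i;
        b[i] = 2.0;
    }

    volk_64f_x2_multiply_64f(c, a, b, num_points);
    printf("c[10] = %f\n", c[10]); /* expect 10.0 */

    volk_free(a);
    volk_free(b);
    volk_free(c);
    return 0;
}

(End of aside; the reformatting diff continues below.)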
++@@ -76,18 +76,19 @@ ++ ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void ++-volk_64f_x2_multiply_64f_generic(double *cVector, const double *aVector, ++- const double *bVector, unsigned int num_points) +++static inline void volk_64f_x2_multiply_64f_generic(double* cVector, +++ const double* aVector, +++ const double* bVector, +++ unsigned int num_points) ++ { ++- double *cPtr = cVector; ++- const double *aPtr = aVector; ++- const double *bPtr = bVector; ++- unsigned int number = 0; ++- ++- for (number = 0; number < num_points; number++) { ++- *cPtr++ = (*aPtr++) * (*bPtr++); ++- } +++ double* cPtr = cVector; +++ const double* aPtr = aVector; +++ const double* bPtr = bVector; +++ unsigned int number = 0; +++ +++ for (number = 0; number < num_points; number++) { +++ *cPtr++ = (*aPtr++) * (*bPtr++); +++ } ++ } ++ ++ #endif /* LV_HAVE_GENERIC */ ++@@ -100,35 +101,36 @@ volk_64f_x2_multiply_64f_generic(double *cVector, const double *aVector, ++ ++ #include ++ ++-static inline void ++-volk_64f_x2_multiply_64f_u_sse2(double *cVector, const double *aVector, ++- const double *bVector, unsigned int num_points) +++static inline void volk_64f_x2_multiply_64f_u_sse2(double* cVector, +++ const double* aVector, +++ const double* bVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int half_points = num_points / 2; +++ unsigned int number = 0; +++ const unsigned int half_points = num_points / 2; ++ ++- double *cPtr = cVector; ++- const double *aPtr = aVector; ++- const double *bPtr = bVector; +++ double* cPtr = cVector; +++ const double* aPtr = aVector; +++ const double* bPtr = bVector; ++ ++- __m128d aVal, bVal, cVal; ++- for (; number < half_points; number++) { ++- aVal = _mm_loadu_pd(aPtr); ++- bVal = _mm_loadu_pd(bPtr); +++ __m128d aVal, bVal, cVal; +++ for (; number < half_points; number++) { +++ aVal = _mm_loadu_pd(aPtr); +++ bVal = _mm_loadu_pd(bPtr); ++ ++- cVal = _mm_mul_pd(aVal, bVal); +++ cVal = _mm_mul_pd(aVal, bVal); ++ ++- _mm_storeu_pd(cPtr, cVal); // Store the results back into the C container +++ _mm_storeu_pd(cPtr, cVal); // Store the results back into the C container ++ ++- aPtr += 2; ++- bPtr += 2; ++- cPtr += 2; ++- } +++ aPtr += 2; +++ bPtr += 2; +++ cPtr += 2; +++ } ++ ++- number = half_points * 2; ++- for (; number < num_points; number++) { ++- *cPtr++ = (*aPtr++) * (*bPtr++); ++- } +++ number = half_points * 2; +++ for (; number < num_points; number++) { +++ *cPtr++ = (*aPtr++) * (*bPtr++); +++ } ++ } ++ ++ #endif /* LV_HAVE_SSE2 */ ++@@ -138,36 +140,37 @@ volk_64f_x2_multiply_64f_u_sse2(double *cVector, const double *aVector, ++ ++ #include ++ ++-static inline void ++-volk_64f_x2_multiply_64f_u_avx(double *cVector, const double *aVector, ++- const double *bVector, unsigned int num_points) +++static inline void volk_64f_x2_multiply_64f_u_avx(double* cVector, +++ const double* aVector, +++ const double* bVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int quarter_points = num_points / 4; +++ unsigned int number = 0; +++ const unsigned int quarter_points = num_points / 4; ++ ++- double *cPtr = cVector; ++- const double *aPtr = aVector; ++- const double *bPtr = bVector; +++ double* cPtr = cVector; +++ const double* aPtr = aVector; +++ const double* bPtr = bVector; ++ ++- __m256d aVal, bVal, cVal; ++- for (; number < quarter_points; number++) { +++ __m256d aVal, bVal, cVal; +++ for (; number < quarter_points; number++) { ++ ++- aVal = _mm256_loadu_pd(aPtr); ++- bVal = _mm256_loadu_pd(bPtr); +++ aVal = 
_mm256_loadu_pd(aPtr); +++ bVal = _mm256_loadu_pd(bPtr); ++ ++- cVal = _mm256_mul_pd(aVal, bVal); +++ cVal = _mm256_mul_pd(aVal, bVal); ++ ++- _mm256_storeu_pd(cPtr, cVal); // Store the results back into the C container +++ _mm256_storeu_pd(cPtr, cVal); // Store the results back into the C container ++ ++- aPtr += 4; ++- bPtr += 4; ++- cPtr += 4; ++- } +++ aPtr += 4; +++ bPtr += 4; +++ cPtr += 4; +++ } ++ ++- number = quarter_points * 4; ++- for (; number < num_points; number++) { ++- *cPtr++ = (*aPtr++) * (*bPtr++); ++- } +++ number = quarter_points * 4; +++ for (; number < num_points; number++) { +++ *cPtr++ = (*aPtr++) * (*bPtr++); +++ } ++ } ++ ++ #endif /* LV_HAVE_AVX */ ++@@ -180,35 +183,36 @@ volk_64f_x2_multiply_64f_u_avx(double *cVector, const double *aVector, ++ ++ #include ++ ++-static inline void ++-volk_64f_x2_multiply_64f_a_sse2(double *cVector, const double *aVector, ++- const double *bVector, unsigned int num_points) +++static inline void volk_64f_x2_multiply_64f_a_sse2(double* cVector, +++ const double* aVector, +++ const double* bVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int half_points = num_points / 2; +++ unsigned int number = 0; +++ const unsigned int half_points = num_points / 2; ++ ++- double *cPtr = cVector; ++- const double *aPtr = aVector; ++- const double *bPtr = bVector; +++ double* cPtr = cVector; +++ const double* aPtr = aVector; +++ const double* bPtr = bVector; ++ ++- __m128d aVal, bVal, cVal; ++- for (; number < half_points; number++) { ++- aVal = _mm_load_pd(aPtr); ++- bVal = _mm_load_pd(bPtr); +++ __m128d aVal, bVal, cVal; +++ for (; number < half_points; number++) { +++ aVal = _mm_load_pd(aPtr); +++ bVal = _mm_load_pd(bPtr); ++ ++- cVal = _mm_mul_pd(aVal, bVal); +++ cVal = _mm_mul_pd(aVal, bVal); ++ ++- _mm_store_pd(cPtr, cVal); // Store the results back into the C container +++ _mm_store_pd(cPtr, cVal); // Store the results back into the C container ++ ++- aPtr += 2; ++- bPtr += 2; ++- cPtr += 2; ++- } +++ aPtr += 2; +++ bPtr += 2; +++ cPtr += 2; +++ } ++ ++- number = half_points * 2; ++- for (; number < num_points; number++) { ++- *cPtr++ = (*aPtr++) * (*bPtr++); ++- } +++ number = half_points * 2; +++ for (; number < num_points; number++) { +++ *cPtr++ = (*aPtr++) * (*bPtr++); +++ } ++ } ++ ++ #endif /* LV_HAVE_SSE2 */ ++@@ -218,36 +222,37 @@ volk_64f_x2_multiply_64f_a_sse2(double *cVector, const double *aVector, ++ ++ #include ++ ++-static inline void ++-volk_64f_x2_multiply_64f_a_avx(double *cVector, const double *aVector, ++- const double *bVector, unsigned int num_points) +++static inline void volk_64f_x2_multiply_64f_a_avx(double* cVector, +++ const double* aVector, +++ const double* bVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int quarter_points = num_points / 4; +++ unsigned int number = 0; +++ const unsigned int quarter_points = num_points / 4; ++ ++- double *cPtr = cVector; ++- const double *aPtr = aVector; ++- const double *bPtr = bVector; +++ double* cPtr = cVector; +++ const double* aPtr = aVector; +++ const double* bPtr = bVector; ++ ++- __m256d aVal, bVal, cVal; ++- for (; number < quarter_points; number++) { +++ __m256d aVal, bVal, cVal; +++ for (; number < quarter_points; number++) { ++ ++- aVal = _mm256_load_pd(aPtr); ++- bVal = _mm256_load_pd(bPtr); +++ aVal = _mm256_load_pd(aPtr); +++ bVal = _mm256_load_pd(bPtr); ++ ++- cVal = _mm256_mul_pd(aVal, bVal); +++ cVal = _mm256_mul_pd(aVal, bVal); ++ ++- _mm256_store_pd(cPtr, cVal); // Store the 
results back into the C container +++ _mm256_store_pd(cPtr, cVal); // Store the results back into the C container ++ ++- aPtr += 4; ++- bPtr += 4; ++- cPtr += 4; ++- } +++ aPtr += 4; +++ bPtr += 4; +++ cPtr += 4; +++ } ++ ++- number = quarter_points * 4; ++- for (; number < num_points; number++) { ++- *cPtr++ = (*aPtr++) * (*bPtr++); ++- } +++ number = quarter_points * 4; +++ for (; number < num_points; number++) { +++ *cPtr++ = (*aPtr++) * (*bPtr++); +++ } ++ } ++ ++ #endif /* LV_HAVE_AVX */ ++diff --git a/kernels/volk/volk_64u_byteswap.h b/kernels/volk/volk_64u_byteswap.h ++index 96e0661..38621a4 100644 ++--- a/kernels/volk/volk_64u_byteswap.h +++++ b/kernels/volk/volk_64u_byteswap.h ++@@ -72,71 +72,77 @@ ++ #ifdef LV_HAVE_SSE2 ++ #include ++ ++-static inline void volk_64u_byteswap_u_sse2(uint64_t* intsToSwap, unsigned int num_points){ +++static inline void volk_64u_byteswap_u_sse2(uint64_t* intsToSwap, unsigned int num_points) +++{ ++ uint32_t* inputPtr = (uint32_t*)intsToSwap; ++ __m128i input, byte1, byte2, byte3, byte4, output; ++ __m128i byte2mask = _mm_set1_epi32(0x00FF0000); ++ __m128i byte3mask = _mm_set1_epi32(0x0000FF00); ++ uint64_t number = 0; ++ const unsigned int halfPoints = num_points / 2; ++- for(;number < halfPoints; number++){ ++- // Load the 32t values, increment inputPtr later since we're doing it in-place. ++- input = _mm_loadu_si128((__m128i*)inputPtr); ++- ++- // Do the four shifts ++- byte1 = _mm_slli_epi32(input, 24); ++- byte2 = _mm_slli_epi32(input, 8); ++- byte3 = _mm_srli_epi32(input, 8); ++- byte4 = _mm_srli_epi32(input, 24); ++- // Or bytes together ++- output = _mm_or_si128(byte1, byte4); ++- byte2 = _mm_and_si128(byte2, byte2mask); ++- output = _mm_or_si128(output, byte2); ++- byte3 = _mm_and_si128(byte3, byte3mask); ++- output = _mm_or_si128(output, byte3); ++- ++- // Reorder the two words ++- output = _mm_shuffle_epi32(output, _MM_SHUFFLE(2, 3, 0, 1)); ++- ++- // Store the results ++- _mm_storeu_si128((__m128i*)inputPtr, output); ++- inputPtr += 4; +++ for (; number < halfPoints; number++) { +++ // Load the 32t values, increment inputPtr later since we're doing it in-place. 
+++ input = _mm_loadu_si128((__m128i*)inputPtr); +++ +++ // Do the four shifts +++ byte1 = _mm_slli_epi32(input, 24); +++ byte2 = _mm_slli_epi32(input, 8); +++ byte3 = _mm_srli_epi32(input, 8); +++ byte4 = _mm_srli_epi32(input, 24); +++ // Or bytes together +++ output = _mm_or_si128(byte1, byte4); +++ byte2 = _mm_and_si128(byte2, byte2mask); +++ output = _mm_or_si128(output, byte2); +++ byte3 = _mm_and_si128(byte3, byte3mask); +++ output = _mm_or_si128(output, byte3); +++ +++ // Reorder the two words +++ output = _mm_shuffle_epi32(output, _MM_SHUFFLE(2, 3, 0, 1)); +++ +++ // Store the results +++ _mm_storeu_si128((__m128i*)inputPtr, output); +++ inputPtr += 4; ++ } ++ ++ // Byteswap any remaining points: ++- number = halfPoints*2; ++- for(; number < num_points; number++){ ++- uint32_t output1 = *inputPtr; ++- uint32_t output2 = inputPtr[1]; +++ number = halfPoints * 2; +++ for (; number < num_points; number++) { +++ uint32_t output1 = *inputPtr; +++ uint32_t output2 = inputPtr[1]; ++ ++- output1 = (((output1 >> 24) & 0xff) | ((output1 >> 8) & 0x0000ff00) | ((output1 << 8) & 0x00ff0000) | ((output1 << 24) & 0xff000000)); +++ output1 = (((output1 >> 24) & 0xff) | ((output1 >> 8) & 0x0000ff00) | +++ ((output1 << 8) & 0x00ff0000) | ((output1 << 24) & 0xff000000)); ++ ++- output2 = (((output2 >> 24) & 0xff) | ((output2 >> 8) & 0x0000ff00) | ((output2 << 8) & 0x00ff0000) | ((output2 << 24) & 0xff000000)); +++ output2 = (((output2 >> 24) & 0xff) | ((output2 >> 8) & 0x0000ff00) | +++ ((output2 << 8) & 0x00ff0000) | ((output2 << 24) & 0xff000000)); ++ ++- *inputPtr++ = output2; ++- *inputPtr++ = output1; +++ *inputPtr++ = output2; +++ *inputPtr++ = output1; ++ } ++ } ++ #endif /* LV_HAVE_SSE2 */ ++ ++ ++- ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void volk_64u_byteswap_generic(uint64_t* intsToSwap, unsigned int num_points){ ++- uint32_t* inputPtr = (uint32_t*)intsToSwap; ++- unsigned int point; ++- for(point = 0; point < num_points; point++){ ++- uint32_t output1 = *inputPtr; ++- uint32_t output2 = inputPtr[1]; +++static inline void volk_64u_byteswap_generic(uint64_t* intsToSwap, +++ unsigned int num_points) +++{ +++ uint32_t* inputPtr = (uint32_t*)intsToSwap; +++ unsigned int point; +++ for (point = 0; point < num_points; point++) { +++ uint32_t output1 = *inputPtr; +++ uint32_t output2 = inputPtr[1]; ++ ++- output1 = (((output1 >> 24) & 0xff) | ((output1 >> 8) & 0x0000ff00) | ((output1 << 8) & 0x00ff0000) | ((output1 << 24) & 0xff000000)); +++ output1 = (((output1 >> 24) & 0xff) | ((output1 >> 8) & 0x0000ff00) | +++ ((output1 << 8) & 0x00ff0000) | ((output1 << 24) & 0xff000000)); ++ ++- output2 = (((output2 >> 24) & 0xff) | ((output2 >> 8) & 0x0000ff00) | ((output2 << 8) & 0x00ff0000) | ((output2 << 24) & 0xff000000)); +++ output2 = (((output2 >> 24) & 0xff) | ((output2 >> 8) & 0x0000ff00) | +++ ((output2 << 8) & 0x00ff0000) | ((output2 << 24) & 0xff000000)); ++ ++- *inputPtr++ = output2; ++- *inputPtr++ = output1; ++- } +++ *inputPtr++ = output2; +++ *inputPtr++ = output1; +++ } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++@@ -144,47 +150,47 @@ static inline void volk_64u_byteswap_generic(uint64_t* intsToSwap, unsigned int ++ #include ++ static inline void volk_64u_byteswap_a_avx2(uint64_t* intsToSwap, unsigned int num_points) ++ { ++- unsigned int number = 0; ++- ++- const unsigned int nPerSet = 4; ++- const uint64_t nSets = num_points / nPerSet; +++ unsigned int number = 0; ++ ++- uint32_t* inputPtr = (uint32_t*)intsToSwap; +++ const unsigned int nPerSet = 4; +++ const uint64_t nSets = 
num_points / nPerSet; ++ ++- const uint8_t shuffleVector[32] = { 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8, 23, 22, 21, 20, 19, 18, 17, 16, 31, 30, 29, 28, 27, 26, 25, 24 }; +++ uint32_t* inputPtr = (uint32_t*)intsToSwap; ++ ++- const __m256i myShuffle = _mm256_loadu_si256((__m256i*) &shuffleVector[0]); +++ const uint8_t shuffleVector[32] = { 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, +++ 12, 11, 10, 9, 8, 23, 22, 21, 20, 19, 18, +++ 17, 16, 31, 30, 29, 28, 27, 26, 25, 24 }; ++ ++- for ( ;number < nSets; number++ ) { +++ const __m256i myShuffle = _mm256_loadu_si256((__m256i*)&shuffleVector[0]); ++ ++- // Load the 32t values, increment inputPtr later since we're doing it in-place. ++- const __m256i input = _mm256_load_si256((__m256i*)inputPtr); ++- const __m256i output = _mm256_shuffle_epi8(input, myShuffle); +++ for (; number < nSets; number++) { ++ ++- // Store the results ++- _mm256_store_si256((__m256i*)inputPtr, output); +++ // Load the 32t values, increment inputPtr later since we're doing it in-place. +++ const __m256i input = _mm256_load_si256((__m256i*)inputPtr); +++ const __m256i output = _mm256_shuffle_epi8(input, myShuffle); ++ ++- /* inputPtr is 32bit so increment twice */ ++- inputPtr += 2 * nPerSet; ++- } ++- _mm256_zeroupper(); +++ // Store the results +++ _mm256_store_si256((__m256i*)inputPtr, output); ++ ++- // Byteswap any remaining points: ++- for(number = nSets * nPerSet; number < num_points; ++number ) { ++- uint32_t output1 = *inputPtr; ++- uint32_t output2 = inputPtr[1]; ++- uint32_t out1 = ((((output1) >> 24) & 0x000000ff) | ++- (((output1) >> 8) & 0x0000ff00) | ++- (((output1) << 8) & 0x00ff0000) | ++- (((output1) << 24) & 0xff000000) ); +++ /* inputPtr is 32bit so increment twice */ +++ inputPtr += 2 * nPerSet; +++ } +++ _mm256_zeroupper(); ++ ++- uint32_t out2 = ((((output2) >> 24) & 0x000000ff) | ++- (((output2) >> 8) & 0x0000ff00) | ++- (((output2) << 8) & 0x00ff0000) | ++- (((output2) << 24) & 0xff000000) ); ++- *inputPtr++ = out2; ++- *inputPtr++ = out1; ++- } +++ // Byteswap any remaining points: +++ for (number = nSets * nPerSet; number < num_points; ++number) { +++ uint32_t output1 = *inputPtr; +++ uint32_t output2 = inputPtr[1]; +++ uint32_t out1 = +++ ((((output1) >> 24) & 0x000000ff) | (((output1) >> 8) & 0x0000ff00) | +++ (((output1) << 8) & 0x00ff0000) | (((output1) << 24) & 0xff000000)); +++ +++ uint32_t out2 = +++ ((((output2) >> 24) & 0x000000ff) | (((output2) >> 8) & 0x0000ff00) | +++ (((output2) << 8) & 0x00ff0000) | (((output2) << 24) & 0xff000000)); +++ *inputPtr++ = out2; +++ *inputPtr++ = out1; +++ } ++ } ++ ++ #endif /* LV_HAVE_AVX2 */ ++@@ -192,48 +198,47 @@ static inline void volk_64u_byteswap_a_avx2(uint64_t* intsToSwap, unsigned int n ++ ++ #if LV_HAVE_SSSE3 ++ #include ++-static inline void volk_64u_byteswap_a_ssse3(uint64_t* intsToSwap, unsigned int num_points) +++static inline void volk_64u_byteswap_a_ssse3(uint64_t* intsToSwap, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; +++ unsigned int number = 0; ++ ++- const unsigned int nPerSet = 2; ++- const uint64_t nSets = num_points / nPerSet; +++ const unsigned int nPerSet = 2; +++ const uint64_t nSets = num_points / nPerSet; ++ ++- uint32_t* inputPtr = (uint32_t*)intsToSwap; ++- ++- uint8_t shuffleVector[16] = { 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 }; +++ uint32_t* inputPtr = (uint32_t*)intsToSwap; ++ ++- const __m128i myShuffle = _mm_loadu_si128((__m128i*) &shuffleVector); +++ uint8_t shuffleVector[16] = { 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 
10, 9, 8 }; ++ ++- for ( ;number < nSets; number++ ) { +++ const __m128i myShuffle = _mm_loadu_si128((__m128i*)&shuffleVector); ++ ++- // Load the 32t values, increment inputPtr later since we're doing it in-place. ++- const __m128i input = _mm_load_si128((__m128i*)inputPtr); ++- const __m128i output = _mm_shuffle_epi8(input,myShuffle); +++ for (; number < nSets; number++) { ++ ++- // Store the results ++- _mm_store_si128((__m128i*)inputPtr, output); +++ // Load the 32t values, increment inputPtr later since we're doing it in-place. +++ const __m128i input = _mm_load_si128((__m128i*)inputPtr); +++ const __m128i output = _mm_shuffle_epi8(input, myShuffle); ++ ++- /* inputPtr is 32bit so increment twice */ ++- inputPtr += 2 * nPerSet; ++- } +++ // Store the results +++ _mm_store_si128((__m128i*)inputPtr, output); ++ ++- // Byteswap any remaining points: ++- for(number = nSets * nPerSet; number < num_points; ++number ) { ++- uint32_t output1 = *inputPtr; ++- uint32_t output2 = inputPtr[1]; ++- uint32_t out1 = ((((output1) >> 24) & 0x000000ff) | ++- (((output1) >> 8) & 0x0000ff00) | ++- (((output1) << 8) & 0x00ff0000) | ++- (((output1) << 24) & 0xff000000) ); +++ /* inputPtr is 32bit so increment twice */ +++ inputPtr += 2 * nPerSet; +++ } ++ ++- uint32_t out2 = ((((output2) >> 24) & 0x000000ff) | ++- (((output2) >> 8) & 0x0000ff00) | ++- (((output2) << 8) & 0x00ff0000) | ++- (((output2) << 24) & 0xff000000) ); ++- *inputPtr++ = out2; ++- *inputPtr++ = out1; ++- } +++ // Byteswap any remaining points: +++ for (number = nSets * nPerSet; number < num_points; ++number) { +++ uint32_t output1 = *inputPtr; +++ uint32_t output2 = inputPtr[1]; +++ uint32_t out1 = +++ ((((output1) >> 24) & 0x000000ff) | (((output1) >> 8) & 0x0000ff00) | +++ (((output1) << 8) & 0x00ff0000) | (((output1) << 24) & 0xff000000)); +++ +++ uint32_t out2 = +++ ((((output2) >> 24) & 0x000000ff) | (((output2) >> 8) & 0x0000ff00) | +++ (((output2) << 8) & 0x00ff0000) | (((output2) << 24) & 0xff000000)); +++ *inputPtr++ = out2; +++ *inputPtr++ = out1; +++ } ++ } ++ #endif /* LV_HAVE_SSSE3 */ ++ ++@@ -241,86 +246,90 @@ static inline void volk_64u_byteswap_a_ssse3(uint64_t* intsToSwap, unsigned int ++ #ifdef LV_HAVE_NEONV8 ++ #include ++ ++-static inline void volk_64u_byteswap_neonv8(uint64_t* intsToSwap, unsigned int num_points){ ++- uint32_t* inputPtr = (uint32_t*)intsToSwap; ++- const unsigned int n4points = num_points / 4; ++- uint8x16x2_t input; ++- uint8x16_t idx = { 7,6,5,4, 3,2,1,0, 15,14,13,12, 11,10,9,8 }; ++- ++- unsigned int number = 0; ++- for(number = 0; number < n4points; ++number){ ++- __VOLK_PREFETCH(inputPtr+8); ++- input = vld2q_u8((uint8_t*) inputPtr); ++- input.val[0] = vqtbl1q_u8(input.val[0], idx); ++- input.val[1] = vqtbl1q_u8(input.val[1], idx); ++- vst2q_u8((uint8_t*) inputPtr, input); ++- ++- inputPtr += 8; ++- } ++- ++- for(number = n4points * 4; number < num_points; ++number){ ++- uint32_t output1 = *inputPtr; ++- uint32_t output2 = inputPtr[1]; +++static inline void volk_64u_byteswap_neonv8(uint64_t* intsToSwap, unsigned int num_points) +++{ +++ uint32_t* inputPtr = (uint32_t*)intsToSwap; +++ const unsigned int n4points = num_points / 4; +++ uint8x16x2_t input; +++ uint8x16_t idx = { 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 }; +++ +++ unsigned int number = 0; +++ for (number = 0; number < n4points; ++number) { +++ __VOLK_PREFETCH(inputPtr + 8); +++ input = vld2q_u8((uint8_t*)inputPtr); +++ input.val[0] = vqtbl1q_u8(input.val[0], idx); +++ input.val[1] = vqtbl1q_u8(input.val[1], idx); +++ 
vst2q_u8((uint8_t*)inputPtr, input); +++ +++ inputPtr += 8; +++ } ++ ++- output1 = (((output1 >> 24) & 0xff) | ((output1 >> 8) & 0x0000ff00) | ((output1 << 8) & 0x00ff0000) | ((output1 << 24) & 0xff000000)); ++- output2 = (((output2 >> 24) & 0xff) | ((output2 >> 8) & 0x0000ff00) | ((output2 << 8) & 0x00ff0000) | ((output2 << 24) & 0xff000000)); +++ for (number = n4points * 4; number < num_points; ++number) { +++ uint32_t output1 = *inputPtr; +++ uint32_t output2 = inputPtr[1]; ++ ++- *inputPtr++ = output2; ++- *inputPtr++ = output1; ++- } +++ output1 = (((output1 >> 24) & 0xff) | ((output1 >> 8) & 0x0000ff00) | +++ ((output1 << 8) & 0x00ff0000) | ((output1 << 24) & 0xff000000)); +++ output2 = (((output2 >> 24) & 0xff) | ((output2 >> 8) & 0x0000ff00) | +++ ((output2 << 8) & 0x00ff0000) | ((output2 << 24) & 0xff000000)); ++ +++ *inputPtr++ = output2; +++ *inputPtr++ = output1; +++ } ++ } ++ #else ++ #ifdef LV_HAVE_NEON ++ #include ++ ++-static inline void volk_64u_byteswap_neon(uint64_t* intsToSwap, unsigned int num_points){ ++- uint32_t* inputPtr = (uint32_t*)intsToSwap; ++- unsigned int number = 0; ++- unsigned int n8points = num_points / 4; ++- ++- uint8x8x4_t input_table; ++- uint8x8_t int_lookup01, int_lookup23, int_lookup45, int_lookup67; ++- uint8x8_t swapped_int01, swapped_int23, swapped_int45, swapped_int67; ++- ++- /* these magic numbers are used as byte-indices in the LUT. ++- they are pre-computed to save time. A simple C program ++- can calculate them; for example for lookup01: ++- uint8_t chars[8] = {24, 16, 8, 0, 25, 17, 9, 1}; ++- for(ii=0; ii < 8; ++ii) { ++- index += ((uint64_t)(*(chars+ii))) << (ii*8); +++static inline void volk_64u_byteswap_neon(uint64_t* intsToSwap, unsigned int num_points) +++{ +++ uint32_t* inputPtr = (uint32_t*)intsToSwap; +++ unsigned int number = 0; +++ unsigned int n8points = num_points / 4; +++ +++ uint8x8x4_t input_table; +++ uint8x8_t int_lookup01, int_lookup23, int_lookup45, int_lookup67; +++ uint8x8_t swapped_int01, swapped_int23, swapped_int45, swapped_int67; +++ +++ /* these magic numbers are used as byte-indices in the LUT. +++ they are pre-computed to save time. 
A simple C program +++ can calculate them; for example for lookup01: +++ uint8_t chars[8] = {24, 16, 8, 0, 25, 17, 9, 1}; +++ for(ii=0; ii < 8; ++ii) { +++ index += ((uint64_t)(*(chars+ii))) << (ii*8); +++ } +++ */ +++ int_lookup01 = vcreate_u8(2269495096316185); +++ int_lookup23 = vcreate_u8(146949840772469531); +++ int_lookup45 = vcreate_u8(291630186448622877); +++ int_lookup67 = vcreate_u8(436310532124776223); +++ +++ for (number = 0; number < n8points; ++number) { +++ input_table = vld4_u8((uint8_t*)inputPtr); +++ swapped_int01 = vtbl4_u8(input_table, int_lookup01); +++ swapped_int23 = vtbl4_u8(input_table, int_lookup23); +++ swapped_int45 = vtbl4_u8(input_table, int_lookup45); +++ swapped_int67 = vtbl4_u8(input_table, int_lookup67); +++ vst1_u8((uint8_t*)inputPtr, swapped_int01); +++ vst1_u8((uint8_t*)(inputPtr + 2), swapped_int23); +++ vst1_u8((uint8_t*)(inputPtr + 4), swapped_int45); +++ vst1_u8((uint8_t*)(inputPtr + 6), swapped_int67); +++ +++ inputPtr += 4; ++ } ++- */ ++- int_lookup01 = vcreate_u8(2269495096316185); ++- int_lookup23 = vcreate_u8(146949840772469531); ++- int_lookup45 = vcreate_u8(291630186448622877); ++- int_lookup67 = vcreate_u8(436310532124776223); ++- ++- for(number = 0; number < n8points; ++number){ ++- input_table = vld4_u8((uint8_t*) inputPtr); ++- swapped_int01 = vtbl4_u8(input_table, int_lookup01); ++- swapped_int23 = vtbl4_u8(input_table, int_lookup23); ++- swapped_int45 = vtbl4_u8(input_table, int_lookup45); ++- swapped_int67 = vtbl4_u8(input_table, int_lookup67); ++- vst1_u8((uint8_t*) inputPtr, swapped_int01); ++- vst1_u8((uint8_t*) (inputPtr+2), swapped_int23); ++- vst1_u8((uint8_t*) (inputPtr+4), swapped_int45); ++- vst1_u8((uint8_t*) (inputPtr+6), swapped_int67); ++- ++- inputPtr += 4; ++- } ++- ++- for(number = n8points * 4; number < num_points; ++number){ ++- uint32_t output1 = *inputPtr; ++- uint32_t output2 = inputPtr[1]; ++- ++- output1 = (((output1 >> 24) & 0xff) | ((output1 >> 8) & 0x0000ff00) | ((output1 << 8) & 0x00ff0000) | ((output1 << 24) & 0xff000000)); ++- output2 = (((output2 >> 24) & 0xff) | ((output2 >> 8) & 0x0000ff00) | ((output2 << 8) & 0x00ff0000) | ((output2 << 24) & 0xff000000)); ++- ++- *inputPtr++ = output2; ++- *inputPtr++ = output1; ++- } ++ +++ for (number = n8points * 4; number < num_points; ++number) { +++ uint32_t output1 = *inputPtr; +++ uint32_t output2 = inputPtr[1]; +++ +++ output1 = (((output1 >> 24) & 0xff) | ((output1 >> 8) & 0x0000ff00) | +++ ((output1 << 8) & 0x00ff0000) | ((output1 << 24) & 0xff000000)); +++ output2 = (((output2 >> 24) & 0xff) | ((output2 >> 8) & 0x0000ff00) | +++ ((output2 << 8) & 0x00ff0000) | ((output2 << 24) & 0xff000000)); +++ +++ *inputPtr++ = output2; +++ *inputPtr++ = output1; +++ } ++ } ++ #endif /* LV_HAVE_NEON */ ++ #endif ++@@ -336,49 +345,52 @@ static inline void volk_64u_byteswap_neon(uint64_t* intsToSwap, unsigned int num ++ #ifdef LV_HAVE_SSE2 ++ #include ++ ++-static inline void volk_64u_byteswap_a_sse2(uint64_t* intsToSwap, unsigned int num_points){ +++static inline void volk_64u_byteswap_a_sse2(uint64_t* intsToSwap, unsigned int num_points) +++{ ++ uint32_t* inputPtr = (uint32_t*)intsToSwap; ++ __m128i input, byte1, byte2, byte3, byte4, output; ++ __m128i byte2mask = _mm_set1_epi32(0x00FF0000); ++ __m128i byte3mask = _mm_set1_epi32(0x0000FF00); ++ uint64_t number = 0; ++ const unsigned int halfPoints = num_points / 2; ++- for(;number < halfPoints; number++){ ++- // Load the 32t values, increment inputPtr later since we're doing it in-place. 
++- input = _mm_load_si128((__m128i*)inputPtr); ++- ++- // Do the four shifts ++- byte1 = _mm_slli_epi32(input, 24); ++- byte2 = _mm_slli_epi32(input, 8); ++- byte3 = _mm_srli_epi32(input, 8); ++- byte4 = _mm_srli_epi32(input, 24); ++- // Or bytes together ++- output = _mm_or_si128(byte1, byte4); ++- byte2 = _mm_and_si128(byte2, byte2mask); ++- output = _mm_or_si128(output, byte2); ++- byte3 = _mm_and_si128(byte3, byte3mask); ++- output = _mm_or_si128(output, byte3); ++- ++- // Reorder the two words ++- output = _mm_shuffle_epi32(output, _MM_SHUFFLE(2, 3, 0, 1)); ++- ++- // Store the results ++- _mm_store_si128((__m128i*)inputPtr, output); ++- inputPtr += 4; +++ for (; number < halfPoints; number++) { +++ // Load the 32t values, increment inputPtr later since we're doing it in-place. +++ input = _mm_load_si128((__m128i*)inputPtr); +++ +++ // Do the four shifts +++ byte1 = _mm_slli_epi32(input, 24); +++ byte2 = _mm_slli_epi32(input, 8); +++ byte3 = _mm_srli_epi32(input, 8); +++ byte4 = _mm_srli_epi32(input, 24); +++ // Or bytes together +++ output = _mm_or_si128(byte1, byte4); +++ byte2 = _mm_and_si128(byte2, byte2mask); +++ output = _mm_or_si128(output, byte2); +++ byte3 = _mm_and_si128(byte3, byte3mask); +++ output = _mm_or_si128(output, byte3); +++ +++ // Reorder the two words +++ output = _mm_shuffle_epi32(output, _MM_SHUFFLE(2, 3, 0, 1)); +++ +++ // Store the results +++ _mm_store_si128((__m128i*)inputPtr, output); +++ inputPtr += 4; ++ } ++ ++ // Byteswap any remaining points: ++- number = halfPoints*2; ++- for(; number < num_points; number++){ ++- uint32_t output1 = *inputPtr; ++- uint32_t output2 = inputPtr[1]; +++ number = halfPoints * 2; +++ for (; number < num_points; number++) { +++ uint32_t output1 = *inputPtr; +++ uint32_t output2 = inputPtr[1]; ++ ++- output1 = (((output1 >> 24) & 0xff) | ((output1 >> 8) & 0x0000ff00) | ((output1 << 8) & 0x00ff0000) | ((output1 << 24) & 0xff000000)); +++ output1 = (((output1 >> 24) & 0xff) | ((output1 >> 8) & 0x0000ff00) | +++ ((output1 << 8) & 0x00ff0000) | ((output1 << 24) & 0xff000000)); ++ ++- output2 = (((output2 >> 24) & 0xff) | ((output2 >> 8) & 0x0000ff00) | ((output2 << 8) & 0x00ff0000) | ((output2 << 24) & 0xff000000)); +++ output2 = (((output2 >> 24) & 0xff) | ((output2 >> 8) & 0x0000ff00) | +++ ((output2 << 8) & 0x00ff0000) | ((output2 << 24) & 0xff000000)); ++ ++- *inputPtr++ = output2; ++- *inputPtr++ = output1; +++ *inputPtr++ = output2; +++ *inputPtr++ = output1; ++ } ++ } ++ #endif /* LV_HAVE_SSE2 */ ++@@ -387,46 +399,46 @@ static inline void volk_64u_byteswap_a_sse2(uint64_t* intsToSwap, unsigned int n ++ #include ++ static inline void volk_64u_byteswap_u_avx2(uint64_t* intsToSwap, unsigned int num_points) ++ { ++- unsigned int number = 0; ++- ++- const unsigned int nPerSet = 4; ++- const uint64_t nSets = num_points / nPerSet; ++- ++- uint32_t* inputPtr = (uint32_t*)intsToSwap; ++- ++- const uint8_t shuffleVector[32] = { 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8, 23, 22, 21, 20, 19, 18, 17, 16, 31, 30, 29, 28, 27, 26, 25, 24 }; ++- ++- const __m256i myShuffle = _mm256_loadu_si256((__m256i*) &shuffleVector[0]); ++- ++- for ( ;number < nSets; number++ ) { ++- // Load the 32t values, increment inputPtr later since we're doing it in-place. 
++- const __m256i input = _mm256_loadu_si256((__m256i*)inputPtr); ++- const __m256i output = _mm256_shuffle_epi8(input,myShuffle); ++- ++- // Store the results ++- _mm256_storeu_si256((__m256i*)inputPtr, output); ++- ++- /* inputPtr is 32bit so increment twice */ ++- inputPtr += 2 * nPerSet; ++- } ++- _mm256_zeroupper(); ++- ++- // Byteswap any remaining points: ++- for(number = nSets * nPerSet; number < num_points; ++number ) { ++- uint32_t output1 = *inputPtr; ++- uint32_t output2 = inputPtr[1]; ++- uint32_t out1 = ((((output1) >> 24) & 0x000000ff) | ++- (((output1) >> 8) & 0x0000ff00) | ++- (((output1) << 8) & 0x00ff0000) | ++- (((output1) << 24) & 0xff000000) ); ++- ++- uint32_t out2 = ((((output2) >> 24) & 0x000000ff) | ++- (((output2) >> 8) & 0x0000ff00) | ++- (((output2) << 8) & 0x00ff0000) | ++- (((output2) << 24) & 0xff000000) ); ++- *inputPtr++ = out2; ++- *inputPtr++ = out1; ++- } +++ unsigned int number = 0; +++ +++ const unsigned int nPerSet = 4; +++ const uint64_t nSets = num_points / nPerSet; +++ +++ uint32_t* inputPtr = (uint32_t*)intsToSwap; +++ +++ const uint8_t shuffleVector[32] = { 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, +++ 12, 11, 10, 9, 8, 23, 22, 21, 20, 19, 18, +++ 17, 16, 31, 30, 29, 28, 27, 26, 25, 24 }; +++ +++ const __m256i myShuffle = _mm256_loadu_si256((__m256i*)&shuffleVector[0]); +++ +++ for (; number < nSets; number++) { +++ // Load the 32t values, increment inputPtr later since we're doing it in-place. +++ const __m256i input = _mm256_loadu_si256((__m256i*)inputPtr); +++ const __m256i output = _mm256_shuffle_epi8(input, myShuffle); +++ +++ // Store the results +++ _mm256_storeu_si256((__m256i*)inputPtr, output); +++ +++ /* inputPtr is 32bit so increment twice */ +++ inputPtr += 2 * nPerSet; +++ } +++ _mm256_zeroupper(); +++ +++ // Byteswap any remaining points: +++ for (number = nSets * nPerSet; number < num_points; ++number) { +++ uint32_t output1 = *inputPtr; +++ uint32_t output2 = inputPtr[1]; +++ uint32_t out1 = +++ ((((output1) >> 24) & 0x000000ff) | (((output1) >> 8) & 0x0000ff00) | +++ (((output1) << 8) & 0x00ff0000) | (((output1) << 24) & 0xff000000)); +++ +++ uint32_t out2 = +++ ((((output2) >> 24) & 0x000000ff) | (((output2) >> 8) & 0x0000ff00) | +++ (((output2) << 8) & 0x00ff0000) | (((output2) << 24) & 0xff000000)); +++ *inputPtr++ = out2; +++ *inputPtr++ = out1; +++ } ++ } ++ ++ #endif /* LV_HAVE_AVX2 */ ++@@ -434,70 +446,71 @@ static inline void volk_64u_byteswap_u_avx2(uint64_t* intsToSwap, unsigned int n ++ ++ #if LV_HAVE_SSSE3 ++ #include ++-static inline void volk_64u_byteswap_u_ssse3(uint64_t* intsToSwap, unsigned int num_points) +++static inline void volk_64u_byteswap_u_ssse3(uint64_t* intsToSwap, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- ++- const unsigned int nPerSet = 2; ++- const uint64_t nSets = num_points / nPerSet; +++ unsigned int number = 0; ++ ++- uint32_t* inputPtr = (uint32_t*)intsToSwap; +++ const unsigned int nPerSet = 2; +++ const uint64_t nSets = num_points / nPerSet; ++ ++- uint8_t shuffleVector[16] = { 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 }; +++ uint32_t* inputPtr = (uint32_t*)intsToSwap; ++ ++- const __m128i myShuffle = _mm_loadu_si128((__m128i*) &shuffleVector); +++ uint8_t shuffleVector[16] = { 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 }; ++ ++- for ( ;number < nSets; number++ ) { ++- // Load the 32t values, increment inputPtr later since we're doing it in-place. 
++- const __m128i input = _mm_loadu_si128((__m128i*)inputPtr); ++- const __m128i output = _mm_shuffle_epi8(input,myShuffle); +++ const __m128i myShuffle = _mm_loadu_si128((__m128i*)&shuffleVector); ++ ++- // Store the results ++- _mm_storeu_si128((__m128i*)inputPtr, output); +++ for (; number < nSets; number++) { +++ // Load the 32t values, increment inputPtr later since we're doing it in-place. +++ const __m128i input = _mm_loadu_si128((__m128i*)inputPtr); +++ const __m128i output = _mm_shuffle_epi8(input, myShuffle); ++ ++- /* inputPtr is 32bit so increment twice */ ++- inputPtr += 2 * nPerSet; ++- } +++ // Store the results +++ _mm_storeu_si128((__m128i*)inputPtr, output); ++ ++- // Byteswap any remaining points: ++- for(number = nSets * nPerSet; number < num_points; ++number ) { ++- uint32_t output1 = *inputPtr; ++- uint32_t output2 = inputPtr[1]; ++- uint32_t out1 = ((((output1) >> 24) & 0x000000ff) | ++- (((output1) >> 8) & 0x0000ff00) | ++- (((output1) << 8) & 0x00ff0000) | ++- (((output1) << 24) & 0xff000000) ); +++ /* inputPtr is 32bit so increment twice */ +++ inputPtr += 2 * nPerSet; +++ } ++ ++- uint32_t out2 = ((((output2) >> 24) & 0x000000ff) | ++- (((output2) >> 8) & 0x0000ff00) | ++- (((output2) << 8) & 0x00ff0000) | ++- (((output2) << 24) & 0xff000000) ); ++- *inputPtr++ = out2; ++- *inputPtr++ = out1; ++- } +++ // Byteswap any remaining points: +++ for (number = nSets * nPerSet; number < num_points; ++number) { +++ uint32_t output1 = *inputPtr; +++ uint32_t output2 = inputPtr[1]; +++ uint32_t out1 = +++ ((((output1) >> 24) & 0x000000ff) | (((output1) >> 8) & 0x0000ff00) | +++ (((output1) << 8) & 0x00ff0000) | (((output1) << 24) & 0xff000000)); +++ +++ uint32_t out2 = +++ ((((output2) >> 24) & 0x000000ff) | (((output2) >> 8) & 0x0000ff00) | +++ (((output2) << 8) & 0x00ff0000) | (((output2) << 24) & 0xff000000)); +++ *inputPtr++ = out2; +++ *inputPtr++ = out1; +++ } ++ } ++ #endif /* LV_HAVE_SSSE3 */ ++ ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void volk_64u_byteswap_a_generic(uint64_t* intsToSwap, unsigned int num_points){ ++- uint32_t* inputPtr = (uint32_t*)intsToSwap; ++- unsigned int point; ++- for(point = 0; point < num_points; point++){ ++- uint32_t output1 = *inputPtr; ++- uint32_t output2 = inputPtr[1]; +++static inline void volk_64u_byteswap_a_generic(uint64_t* intsToSwap, +++ unsigned int num_points) +++{ +++ uint32_t* inputPtr = (uint32_t*)intsToSwap; +++ unsigned int point; +++ for (point = 0; point < num_points; point++) { +++ uint32_t output1 = *inputPtr; +++ uint32_t output2 = inputPtr[1]; ++ ++- output1 = (((output1 >> 24) & 0xff) | ((output1 >> 8) & 0x0000ff00) | ((output1 << 8) & 0x00ff0000) | ((output1 << 24) & 0xff000000)); +++ output1 = (((output1 >> 24) & 0xff) | ((output1 >> 8) & 0x0000ff00) | +++ ((output1 << 8) & 0x00ff0000) | ((output1 << 24) & 0xff000000)); ++ ++- output2 = (((output2 >> 24) & 0xff) | ((output2 >> 8) & 0x0000ff00) | ((output2 << 8) & 0x00ff0000) | ((output2 << 24) & 0xff000000)); +++ output2 = (((output2 >> 24) & 0xff) | ((output2 >> 8) & 0x0000ff00) | +++ ((output2 << 8) & 0x00ff0000) | ((output2 << 24) & 0xff000000)); ++ ++- *inputPtr++ = output2; ++- *inputPtr++ = output1; ++- } +++ *inputPtr++ = output2; +++ *inputPtr++ = output1; +++ } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++ ++- ++- ++ #endif /* INCLUDED_volk_64u_byteswap_a_H */ ++diff --git a/kernels/volk/volk_64u_byteswappuppet_64u.h b/kernels/volk/volk_64u_byteswappuppet_64u.h ++index 2db0171..ded54ee 100644 ++--- a/kernels/volk/volk_64u_byteswappuppet_64u.h +++++ 
b/kernels/volk/volk_64u_byteswappuppet_64u.h ++@@ -3,87 +3,105 @@ ++ ++ ++ #include ++-#include ++ #include +++#include ++ ++ #ifdef LV_HAVE_GENERIC ++-static inline void volk_64u_byteswappuppet_64u_generic(uint64_t*output, uint64_t* intsToSwap, unsigned int num_points){ +++static inline void volk_64u_byteswappuppet_64u_generic(uint64_t* output, +++ uint64_t* intsToSwap, +++ unsigned int num_points) +++{ ++ ++ volk_64u_byteswap_generic((uint64_t*)intsToSwap, num_points); ++ memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint64_t)); ++- ++ } ++ #endif ++ ++ #ifdef LV_HAVE_NEONV8 ++-static inline void volk_64u_byteswappuppet_64u_neonv8(uint64_t*output, uint64_t* intsToSwap, unsigned int num_points){ +++static inline void volk_64u_byteswappuppet_64u_neonv8(uint64_t* output, +++ uint64_t* intsToSwap, +++ unsigned int num_points) +++{ ++ ++ volk_64u_byteswap_neonv8((uint64_t*)intsToSwap, num_points); ++ memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint64_t)); ++- ++ } ++ #else ++ #ifdef LV_HAVE_NEON ++-static inline void volk_64u_byteswappuppet_64u_neon(uint64_t*output, uint64_t* intsToSwap, unsigned int num_points){ +++static inline void volk_64u_byteswappuppet_64u_neon(uint64_t* output, +++ uint64_t* intsToSwap, +++ unsigned int num_points) +++{ ++ ++ volk_64u_byteswap_neon((uint64_t*)intsToSwap, num_points); ++ memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint64_t)); ++- ++ } ++ #endif ++ #endif ++ ++ #ifdef LV_HAVE_SSE2 ++-static inline void volk_64u_byteswappuppet_64u_u_sse2(uint64_t* output, uint64_t* intsToSwap, unsigned int num_points){ +++static inline void volk_64u_byteswappuppet_64u_u_sse2(uint64_t* output, +++ uint64_t* intsToSwap, +++ unsigned int num_points) +++{ ++ ++ volk_64u_byteswap_u_sse2((uint64_t*)intsToSwap, num_points); ++ memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint64_t)); ++- ++ } ++ #endif ++ ++ #ifdef LV_HAVE_SSE2 ++-static inline void volk_64u_byteswappuppet_64u_a_sse2(uint64_t* output, uint64_t* intsToSwap, unsigned int num_points){ +++static inline void volk_64u_byteswappuppet_64u_a_sse2(uint64_t* output, +++ uint64_t* intsToSwap, +++ unsigned int num_points) +++{ ++ ++ volk_64u_byteswap_a_sse2((uint64_t*)intsToSwap, num_points); ++ memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint64_t)); ++- ++ } ++ #endif ++ ++ #ifdef LV_HAVE_SSSE3 ++-static inline void volk_64u_byteswappuppet_64u_u_ssse3(uint64_t* output, uint64_t* intsToSwap, unsigned int num_points){ +++static inline void volk_64u_byteswappuppet_64u_u_ssse3(uint64_t* output, +++ uint64_t* intsToSwap, +++ unsigned int num_points) +++{ ++ ++ volk_64u_byteswap_u_ssse3((uint64_t*)intsToSwap, num_points); ++ memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint64_t)); ++- ++ } ++ #endif ++ ++ #ifdef LV_HAVE_SSSE3 ++-static inline void volk_64u_byteswappuppet_64u_a_ssse3(uint64_t* output, uint64_t* intsToSwap, unsigned int num_points){ +++static inline void volk_64u_byteswappuppet_64u_a_ssse3(uint64_t* output, +++ uint64_t* intsToSwap, +++ unsigned int num_points) +++{ ++ ++ volk_64u_byteswap_a_ssse3((uint64_t*)intsToSwap, num_points); ++ memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint64_t)); ++- ++ } ++ #endif ++ ++ #ifdef LV_HAVE_AVX2 ++-static inline void volk_64u_byteswappuppet_64u_u_avx2(uint64_t* output, uint64_t* intsToSwap, unsigned int num_points){ +++static inline void volk_64u_byteswappuppet_64u_u_avx2(uint64_t* output, +++ uint64_t* intsToSwap, +++ unsigned int num_points) +++{ ++ ++ 
volk_64u_byteswap_u_avx2((uint64_t*)intsToSwap, num_points); ++ memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint64_t)); ++- ++ } ++ #endif ++ ++ #ifdef LV_HAVE_AVX2 ++-static inline void volk_64u_byteswappuppet_64u_a_avx2(uint64_t* output, uint64_t* intsToSwap, unsigned int num_points){ +++static inline void volk_64u_byteswappuppet_64u_a_avx2(uint64_t* output, +++ uint64_t* intsToSwap, +++ unsigned int num_points) +++{ ++ ++ volk_64u_byteswap_a_avx2((uint64_t*)intsToSwap, num_points); ++ memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint64_t)); ++- ++ } ++ #endif ++ ++diff --git a/kernels/volk/volk_64u_popcnt.h b/kernels/volk/volk_64u_popcnt.h ++index cbce2ec..43c2ae0 100644 ++--- a/kernels/volk/volk_64u_popcnt.h +++++ b/kernels/volk/volk_64u_popcnt.h ++@@ -60,39 +60,38 @@ ++ #ifndef INCLUDED_volk_64u_popcnt_a_H ++ #define INCLUDED_volk_64u_popcnt_a_H ++ ++-#include ++ #include +++#include ++ ++ ++ #ifdef LV_HAVE_GENERIC ++ ++ ++-static inline void ++-volk_64u_popcnt_generic(uint64_t* ret, const uint64_t value) +++static inline void volk_64u_popcnt_generic(uint64_t* ret, const uint64_t value) ++ { ++- //const uint32_t* valueVector = (const uint32_t*)&value; ++- ++- // This is faster than a lookup table ++- //uint32_t retVal = valueVector[0]; ++- uint32_t retVal = (uint32_t)(value & 0x00000000FFFFFFFFull); ++- ++- retVal = (retVal & 0x55555555) + (retVal >> 1 & 0x55555555); ++- retVal = (retVal & 0x33333333) + (retVal >> 2 & 0x33333333); ++- retVal = (retVal + (retVal >> 4)) & 0x0F0F0F0F; ++- retVal = (retVal + (retVal >> 8)); ++- retVal = (retVal + (retVal >> 16)) & 0x0000003F; ++- uint64_t retVal64 = retVal; ++- ++- //retVal = valueVector[1]; ++- retVal = (uint32_t)((value & 0xFFFFFFFF00000000ull) >> 32); ++- retVal = (retVal & 0x55555555) + (retVal >> 1 & 0x55555555); ++- retVal = (retVal & 0x33333333) + (retVal >> 2 & 0x33333333); ++- retVal = (retVal + (retVal >> 4)) & 0x0F0F0F0F; ++- retVal = (retVal + (retVal >> 8)); ++- retVal = (retVal + (retVal >> 16)) & 0x0000003F; ++- retVal64 += retVal; ++- ++- *ret = retVal64; +++ // const uint32_t* valueVector = (const uint32_t*)&value; +++ +++ // This is faster than a lookup table +++ // uint32_t retVal = valueVector[0]; +++ uint32_t retVal = (uint32_t)(value & 0x00000000FFFFFFFFull); +++ +++ retVal = (retVal & 0x55555555) + (retVal >> 1 & 0x55555555); +++ retVal = (retVal & 0x33333333) + (retVal >> 2 & 0x33333333); +++ retVal = (retVal + (retVal >> 4)) & 0x0F0F0F0F; +++ retVal = (retVal + (retVal >> 8)); +++ retVal = (retVal + (retVal >> 16)) & 0x0000003F; +++ uint64_t retVal64 = retVal; +++ +++ // retVal = valueVector[1]; +++ retVal = (uint32_t)((value & 0xFFFFFFFF00000000ull) >> 32); +++ retVal = (retVal & 0x55555555) + (retVal >> 1 & 0x55555555); +++ retVal = (retVal & 0x33333333) + (retVal >> 2 & 0x33333333); +++ retVal = (retVal + (retVal >> 4)) & 0x0F0F0F0F; +++ retVal = (retVal + (retVal >> 8)); +++ retVal = (retVal + (retVal >> 16)) & 0x0000003F; +++ retVal64 += retVal; +++ +++ *ret = retVal64; ++ } ++ ++ #endif /*LV_HAVE_GENERIC*/ ++@@ -104,7 +103,7 @@ volk_64u_popcnt_generic(uint64_t* ret, const uint64_t value) ++ ++ static inline void volk_64u_popcnt_a_sse4_2(uint64_t* ret, const uint64_t value) ++ { ++- *ret = _mm_popcnt_u64(value); +++ *ret = _mm_popcnt_u64(value); ++ } ++ ++ #endif /*LV_HAVE_SSE4_2*/ ++@@ -114,19 +113,19 @@ static inline void volk_64u_popcnt_a_sse4_2(uint64_t* ret, const uint64_t value) ++ #include ++ static inline void volk_64u_popcnt_neon(uint64_t* ret, const uint64_t value) 
++ { ++- uint8x8_t input_val, count8x8_val; ++- uint16x4_t count16x4_val; ++- uint32x2_t count32x2_val; ++- uint64x1_t count64x1_val; ++- ++- input_val = vld1_u8((unsigned char *) &value); ++- count8x8_val = vcnt_u8(input_val); ++- count16x4_val = vpaddl_u8(count8x8_val); ++- count32x2_val = vpaddl_u16(count16x4_val); ++- count64x1_val = vpaddl_u32(count32x2_val); ++- vst1_u64(ret, count64x1_val); ++- ++- //*ret = _mm_popcnt_u64(value); +++ uint8x8_t input_val, count8x8_val; +++ uint16x4_t count16x4_val; +++ uint32x2_t count32x2_val; +++ uint64x1_t count64x1_val; +++ +++ input_val = vld1_u8((unsigned char*)&value); +++ count8x8_val = vcnt_u8(input_val); +++ count16x4_val = vpaddl_u8(count8x8_val); +++ count32x2_val = vpaddl_u16(count16x4_val); +++ count64x1_val = vpaddl_u32(count32x2_val); +++ vst1_u64(ret, count64x1_val); +++ +++ //*ret = _mm_popcnt_u64(value); ++ } ++ #endif /*LV_HAVE_NEON*/ ++ ++diff --git a/kernels/volk/volk_64u_popcntpuppet_64u.h b/kernels/volk/volk_64u_popcntpuppet_64u.h ++index e38ebb3..688281a 100644 ++--- a/kernels/volk/volk_64u_popcntpuppet_64u.h +++++ b/kernels/volk/volk_64u_popcntpuppet_64u.h ++@@ -23,35 +23,44 @@ ++ #ifndef INCLUDED_volk_64u_popcntpuppet_64u_H ++ #define INCLUDED_volk_64u_popcntpuppet_64u_H ++ ++-#include ++ #include ++ #include +++#include ++ ++ #ifdef LV_HAVE_GENERIC ++-static inline void volk_64u_popcntpuppet_64u_generic(uint64_t* outVector, const uint64_t* inVector, unsigned int num_points){ +++static inline void volk_64u_popcntpuppet_64u_generic(uint64_t* outVector, +++ const uint64_t* inVector, +++ unsigned int num_points) +++{ ++ unsigned int ii; ++- for(ii=0; ii < num_points; ++ii) { ++- volk_64u_popcnt_generic(outVector+ii, num_points ); +++ for (ii = 0; ii < num_points; ++ii) { +++ volk_64u_popcnt_generic(outVector + ii, num_points); ++ } ++ memcpy((void*)outVector, (void*)inVector, num_points * sizeof(uint64_t)); ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++ #if LV_HAVE_SSE4_2 && LV_HAVE_64 ++-static inline void volk_64u_popcntpuppet_64u_a_sse4_2(uint64_t* outVector, const uint64_t* inVector, unsigned int num_points){ +++static inline void volk_64u_popcntpuppet_64u_a_sse4_2(uint64_t* outVector, +++ const uint64_t* inVector, +++ unsigned int num_points) +++{ ++ unsigned int ii; ++- for(ii=0; ii < num_points; ++ii) { ++- volk_64u_popcnt_a_sse4_2(outVector+ii, num_points ); +++ for (ii = 0; ii < num_points; ++ii) { +++ volk_64u_popcnt_a_sse4_2(outVector + ii, num_points); ++ } ++ memcpy((void*)outVector, (void*)inVector, num_points * sizeof(uint64_t)); ++ } ++ #endif /* LV_HAVE_SSE4_2 */ ++ ++ #ifdef LV_HAVE_NEON ++-static inline void volk_64u_popcntpuppet_64u_neon(uint64_t* outVector, const uint64_t* inVector, unsigned int num_points){ +++static inline void volk_64u_popcntpuppet_64u_neon(uint64_t* outVector, +++ const uint64_t* inVector, +++ unsigned int num_points) +++{ ++ unsigned int ii; ++- for(ii=0; ii < num_points; ++ii) { ++- volk_64u_popcnt_neon(outVector+ii, num_points ); +++ for (ii = 0; ii < num_points; ++ii) { +++ volk_64u_popcnt_neon(outVector + ii, num_points); ++ } ++ memcpy((void*)outVector, (void*)inVector, num_points * sizeof(uint64_t)); ++ } ++diff --git a/kernels/volk/volk_8i_convert_16i.h b/kernels/volk/volk_8i_convert_16i.h ++index 40400c3..69d8f6a 100644 ++--- a/kernels/volk/volk_8i_convert_16i.h +++++ b/kernels/volk/volk_8i_convert_16i.h ++@@ -30,8 +30,8 @@ ++ * ++ * Dispatcher Prototype ++ * \code ++- * void volk_8i_convert_16i(int16_t* outputVector, const int8_t* inputVector, unsigned int num_points) ++- * 
\endcode +++ * void volk_8i_convert_16i(int16_t* outputVector, const int8_t* inputVector, unsigned int +++ * num_points) \endcode ++ * ++ * \b Inputs ++ * \li inputVector: The input vector of 8-bit chars. ++@@ -59,32 +59,32 @@ ++ #ifdef LV_HAVE_AVX2 ++ #include ++ ++-static inline void ++-volk_8i_convert_16i_u_avx2(int16_t* outputVector, const int8_t* inputVector, ++- unsigned int num_points) +++static inline void volk_8i_convert_16i_u_avx2(int16_t* outputVector, +++ const int8_t* inputVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int sixteenthPoints = num_points / 16; ++- ++- const __m128i* inputVectorPtr = (const __m128i*)inputVector; ++- __m256i* outputVectorPtr = (__m256i*)outputVector; ++- __m128i inputVal; ++- __m256i ret; ++- ++- for(;number < sixteenthPoints; number++){ ++- inputVal = _mm_loadu_si128(inputVectorPtr); ++- ret = _mm256_cvtepi8_epi16(inputVal); ++- ret = _mm256_slli_epi16(ret, 8); // Multiply by 256 ++- _mm256_storeu_si256(outputVectorPtr, ret); ++- ++- outputVectorPtr++; ++- inputVectorPtr++; ++- } ++- ++- number = sixteenthPoints * 16; ++- for(; number < num_points; number++){ ++- outputVector[number] = (int16_t)(inputVector[number])*256; ++- } +++ unsigned int number = 0; +++ const unsigned int sixteenthPoints = num_points / 16; +++ +++ const __m128i* inputVectorPtr = (const __m128i*)inputVector; +++ __m256i* outputVectorPtr = (__m256i*)outputVector; +++ __m128i inputVal; +++ __m256i ret; +++ +++ for (; number < sixteenthPoints; number++) { +++ inputVal = _mm_loadu_si128(inputVectorPtr); +++ ret = _mm256_cvtepi8_epi16(inputVal); +++ ret = _mm256_slli_epi16(ret, 8); // Multiply by 256 +++ _mm256_storeu_si256(outputVectorPtr, ret); +++ +++ outputVectorPtr++; +++ inputVectorPtr++; +++ } +++ +++ number = sixteenthPoints * 16; +++ for (; number < num_points; number++) { +++ outputVector[number] = (int16_t)(inputVector[number]) * 256; +++ } ++ } ++ #endif /* LV_HAVE_AVX2 */ ++ ++@@ -92,57 +92,57 @@ volk_8i_convert_16i_u_avx2(int16_t* outputVector, const int8_t* inputVector, ++ #ifdef LV_HAVE_SSE4_1 ++ #include ++ ++-static inline void ++-volk_8i_convert_16i_u_sse4_1(int16_t* outputVector, const int8_t* inputVector, ++- unsigned int num_points) +++static inline void volk_8i_convert_16i_u_sse4_1(int16_t* outputVector, +++ const int8_t* inputVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int sixteenthPoints = num_points / 16; +++ unsigned int number = 0; +++ const unsigned int sixteenthPoints = num_points / 16; ++ ++- const __m128i* inputVectorPtr = (const __m128i*)inputVector; ++- __m128i* outputVectorPtr = (__m128i*)outputVector; ++- __m128i inputVal; ++- __m128i ret; +++ const __m128i* inputVectorPtr = (const __m128i*)inputVector; +++ __m128i* outputVectorPtr = (__m128i*)outputVector; +++ __m128i inputVal; +++ __m128i ret; ++ ++- for(;number < sixteenthPoints; number++){ ++- inputVal = _mm_loadu_si128(inputVectorPtr); ++- ret = _mm_cvtepi8_epi16(inputVal); ++- ret = _mm_slli_epi16(ret, 8); // Multiply by 256 ++- _mm_storeu_si128(outputVectorPtr, ret); +++ for (; number < sixteenthPoints; number++) { +++ inputVal = _mm_loadu_si128(inputVectorPtr); +++ ret = _mm_cvtepi8_epi16(inputVal); +++ ret = _mm_slli_epi16(ret, 8); // Multiply by 256 +++ _mm_storeu_si128(outputVectorPtr, ret); ++ ++- outputVectorPtr++; +++ outputVectorPtr++; ++ ++- inputVal = _mm_srli_si128(inputVal, 8); ++- ret = _mm_cvtepi8_epi16(inputVal); ++- ret = _mm_slli_epi16(ret, 8); // Multiply by 256 ++- 
_mm_storeu_si128(outputVectorPtr, ret); +++ inputVal = _mm_srli_si128(inputVal, 8); +++ ret = _mm_cvtepi8_epi16(inputVal); +++ ret = _mm_slli_epi16(ret, 8); // Multiply by 256 +++ _mm_storeu_si128(outputVectorPtr, ret); ++ ++- outputVectorPtr++; +++ outputVectorPtr++; ++ ++- inputVectorPtr++; ++- } +++ inputVectorPtr++; +++ } ++ ++- number = sixteenthPoints * 16; ++- for(; number < num_points; number++){ ++- outputVector[number] = (int16_t)(inputVector[number])*256; ++- } +++ number = sixteenthPoints * 16; +++ for (; number < num_points; number++) { +++ outputVector[number] = (int16_t)(inputVector[number]) * 256; +++ } ++ } ++ #endif /* LV_HAVE_SSE4_1 */ ++ ++ ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void ++-volk_8i_convert_16i_generic(int16_t* outputVector, const int8_t* inputVector, ++- unsigned int num_points) +++static inline void volk_8i_convert_16i_generic(int16_t* outputVector, +++ const int8_t* inputVector, +++ unsigned int num_points) ++ { ++- int16_t* outputVectorPtr = outputVector; ++- const int8_t* inputVectorPtr = inputVector; ++- unsigned int number = 0; +++ int16_t* outputVectorPtr = outputVector; +++ const int8_t* inputVectorPtr = inputVector; +++ unsigned int number = 0; ++ ++- for(number = 0; number < num_points; number++){ ++- *outputVectorPtr++ = ((int16_t)(*inputVectorPtr++)) * 256; ++- } +++ for (number = 0; number < num_points; number++) { +++ *outputVectorPtr++ = ((int16_t)(*inputVectorPtr++)) * 256; +++ } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++@@ -150,7 +150,6 @@ volk_8i_convert_16i_generic(int16_t* outputVector, const int8_t* inputVector, ++ #endif /* INCLUDED_VOLK_8s_CONVERT_16s_UNALIGNED8_H */ ++ ++ ++- ++ #ifndef INCLUDED_volk_8i_convert_16i_a_H ++ #define INCLUDED_volk_8i_convert_16i_a_H ++ ++@@ -160,32 +159,32 @@ volk_8i_convert_16i_generic(int16_t* outputVector, const int8_t* inputVector, ++ #ifdef LV_HAVE_AVX2 ++ #include ++ ++-static inline void ++-volk_8i_convert_16i_a_avx2(int16_t* outputVector, const int8_t* inputVector, ++- unsigned int num_points) +++static inline void volk_8i_convert_16i_a_avx2(int16_t* outputVector, +++ const int8_t* inputVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int sixteenthPoints = num_points / 16; ++- ++- const __m128i* inputVectorPtr = (const __m128i*)inputVector; ++- __m256i* outputVectorPtr = (__m256i*)outputVector; ++- __m128i inputVal; ++- __m256i ret; ++- ++- for(;number < sixteenthPoints; number++){ ++- inputVal = _mm_load_si128(inputVectorPtr); ++- ret = _mm256_cvtepi8_epi16(inputVal); ++- ret = _mm256_slli_epi16(ret, 8); // Multiply by 256 ++- _mm256_store_si256(outputVectorPtr, ret); ++- ++- outputVectorPtr++; ++- inputVectorPtr++; ++- } ++- ++- number = sixteenthPoints * 16; ++- for(; number < num_points; number++){ ++- outputVector[number] = (int16_t)(inputVector[number])*256; ++- } +++ unsigned int number = 0; +++ const unsigned int sixteenthPoints = num_points / 16; +++ +++ const __m128i* inputVectorPtr = (const __m128i*)inputVector; +++ __m256i* outputVectorPtr = (__m256i*)outputVector; +++ __m128i inputVal; +++ __m256i ret; +++ +++ for (; number < sixteenthPoints; number++) { +++ inputVal = _mm_load_si128(inputVectorPtr); +++ ret = _mm256_cvtepi8_epi16(inputVal); +++ ret = _mm256_slli_epi16(ret, 8); // Multiply by 256 +++ _mm256_store_si256(outputVectorPtr, ret); +++ +++ outputVectorPtr++; +++ inputVectorPtr++; +++ } +++ +++ number = sixteenthPoints * 16; +++ for (; number < num_points; number++) { +++ outputVector[number] = (int16_t)(inputVector[number]) * 
256; +++ } ++ } ++ #endif /* LV_HAVE_AVX2 */ ++ ++@@ -193,57 +192,57 @@ volk_8i_convert_16i_a_avx2(int16_t* outputVector, const int8_t* inputVector, ++ #ifdef LV_HAVE_SSE4_1 ++ #include ++ ++-static inline void ++-volk_8i_convert_16i_a_sse4_1(int16_t* outputVector, const int8_t* inputVector, ++- unsigned int num_points) +++static inline void volk_8i_convert_16i_a_sse4_1(int16_t* outputVector, +++ const int8_t* inputVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int sixteenthPoints = num_points / 16; +++ unsigned int number = 0; +++ const unsigned int sixteenthPoints = num_points / 16; ++ ++- const __m128i* inputVectorPtr = (const __m128i*)inputVector; ++- __m128i* outputVectorPtr = (__m128i*)outputVector; ++- __m128i inputVal; ++- __m128i ret; +++ const __m128i* inputVectorPtr = (const __m128i*)inputVector; +++ __m128i* outputVectorPtr = (__m128i*)outputVector; +++ __m128i inputVal; +++ __m128i ret; ++ ++- for(;number < sixteenthPoints; number++){ ++- inputVal = _mm_load_si128(inputVectorPtr); ++- ret = _mm_cvtepi8_epi16(inputVal); ++- ret = _mm_slli_epi16(ret, 8); // Multiply by 256 ++- _mm_store_si128(outputVectorPtr, ret); +++ for (; number < sixteenthPoints; number++) { +++ inputVal = _mm_load_si128(inputVectorPtr); +++ ret = _mm_cvtepi8_epi16(inputVal); +++ ret = _mm_slli_epi16(ret, 8); // Multiply by 256 +++ _mm_store_si128(outputVectorPtr, ret); ++ ++- outputVectorPtr++; +++ outputVectorPtr++; ++ ++- inputVal = _mm_srli_si128(inputVal, 8); ++- ret = _mm_cvtepi8_epi16(inputVal); ++- ret = _mm_slli_epi16(ret, 8); // Multiply by 256 ++- _mm_store_si128(outputVectorPtr, ret); +++ inputVal = _mm_srli_si128(inputVal, 8); +++ ret = _mm_cvtepi8_epi16(inputVal); +++ ret = _mm_slli_epi16(ret, 8); // Multiply by 256 +++ _mm_store_si128(outputVectorPtr, ret); ++ ++- outputVectorPtr++; +++ outputVectorPtr++; ++ ++- inputVectorPtr++; ++- } +++ inputVectorPtr++; +++ } ++ ++- number = sixteenthPoints * 16; ++- for(; number < num_points; number++){ ++- outputVector[number] = (int16_t)(inputVector[number])*256; ++- } +++ number = sixteenthPoints * 16; +++ for (; number < num_points; number++) { +++ outputVector[number] = (int16_t)(inputVector[number]) * 256; +++ } ++ } ++ #endif /* LV_HAVE_SSE4_1 */ ++ ++ ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void ++-volk_8i_convert_16i_a_generic(int16_t* outputVector, const int8_t* inputVector, ++- unsigned int num_points) +++static inline void volk_8i_convert_16i_a_generic(int16_t* outputVector, +++ const int8_t* inputVector, +++ unsigned int num_points) ++ { ++- int16_t* outputVectorPtr = outputVector; ++- const int8_t* inputVectorPtr = inputVector; ++- unsigned int number = 0; +++ int16_t* outputVectorPtr = outputVector; +++ const int8_t* inputVectorPtr = inputVector; +++ unsigned int number = 0; ++ ++- for(number = 0; number < num_points; number++){ ++- *outputVectorPtr++ = ((int16_t)(*inputVectorPtr++)) * 256; ++- } +++ for (number = 0; number < num_points; number++) { +++ *outputVectorPtr++ = ((int16_t)(*inputVectorPtr++)) * 256; +++ } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++@@ -251,51 +250,51 @@ volk_8i_convert_16i_a_generic(int16_t* outputVector, const int8_t* inputVector, ++ #ifdef LV_HAVE_NEON ++ #include ++ ++-static inline void ++-volk_8i_convert_16i_neon(int16_t* outputVector, const int8_t* inputVector, unsigned int num_points) +++static inline void volk_8i_convert_16i_neon(int16_t* outputVector, +++ const int8_t* inputVector, +++ unsigned int num_points) ++ { ++- int16_t* outputVectorPtr = 
outputVector; ++- const int8_t* inputVectorPtr = inputVector; ++- unsigned int number; ++- const unsigned int eighth_points = num_points / 8; ++- ++- int8x8_t input_vec ; ++- int16x8_t converted_vec; ++- ++- // NEON doesn't have a concept of 8 bit registers, so we are really ++- // dealing with the low half of 16-bit registers. Since this requires ++- // a move instruction we likely do better with ASM here. ++- for(number = 0; number < eighth_points; ++number) { ++- input_vec = vld1_s8(inputVectorPtr); ++- converted_vec = vmovl_s8(input_vec); ++- //converted_vec = vmulq_s16(converted_vec, scale_factor); ++- converted_vec = vshlq_n_s16(converted_vec, 8); ++- vst1q_s16( outputVectorPtr, converted_vec); ++- ++- inputVectorPtr += 8; ++- outputVectorPtr += 8; ++- } ++- ++- for(number = eighth_points * 8; number < num_points; number++){ ++- *outputVectorPtr++ = ((int16_t)(*inputVectorPtr++)) * 256; ++- } +++ int16_t* outputVectorPtr = outputVector; +++ const int8_t* inputVectorPtr = inputVector; +++ unsigned int number; +++ const unsigned int eighth_points = num_points / 8; +++ +++ int8x8_t input_vec; +++ int16x8_t converted_vec; +++ +++ // NEON doesn't have a concept of 8 bit registers, so we are really +++ // dealing with the low half of 16-bit registers. Since this requires +++ // a move instruction we likely do better with ASM here. +++ for (number = 0; number < eighth_points; ++number) { +++ input_vec = vld1_s8(inputVectorPtr); +++ converted_vec = vmovl_s8(input_vec); +++ // converted_vec = vmulq_s16(converted_vec, scale_factor); +++ converted_vec = vshlq_n_s16(converted_vec, 8); +++ vst1q_s16(outputVectorPtr, converted_vec); +++ +++ inputVectorPtr += 8; +++ outputVectorPtr += 8; +++ } +++ +++ for (number = eighth_points * 8; number < num_points; number++) { +++ *outputVectorPtr++ = ((int16_t)(*inputVectorPtr++)) * 256; +++ } ++ } ++ #endif /* LV_HAVE_NEON */ ++ ++ ++ #ifdef LV_HAVE_ORC ++-extern void ++-volk_8i_convert_16i_a_orc_impl(int16_t* outputVector, const int8_t* inputVector, ++- unsigned int num_points); +++extern void volk_8i_convert_16i_a_orc_impl(int16_t* outputVector, +++ const int8_t* inputVector, +++ unsigned int num_points); ++ ++-static inline void ++-volk_8i_convert_16i_u_orc(int16_t* outputVector, const int8_t* inputVector, ++- unsigned int num_points) +++static inline void volk_8i_convert_16i_u_orc(int16_t* outputVector, +++ const int8_t* inputVector, +++ unsigned int num_points) ++ { ++- volk_8i_convert_16i_a_orc_impl(outputVector, inputVector, num_points); +++ volk_8i_convert_16i_a_orc_impl(outputVector, inputVector, num_points); ++ } ++ #endif /* LV_HAVE_ORC */ ++ ++ ++- ++ #endif /* INCLUDED_VOLK_8s_CONVERT_16s_ALIGNED8_H */ ++diff --git a/kernels/volk/volk_8i_s32f_convert_32f.h b/kernels/volk/volk_8i_s32f_convert_32f.h ++index 97d160b..c3d5666 100644 ++--- a/kernels/volk/volk_8i_s32f_convert_32f.h +++++ b/kernels/volk/volk_8i_s32f_convert_32f.h ++@@ -30,8 +30,8 @@ ++ * ++ * Dispatcher Prototype ++ * \code ++- * void volk_8i_s32f_convert_32f(float* outputVector, const int8_t* inputVector, const float scalar, unsigned int num_points) ++- * \endcode +++ * void volk_8i_s32f_convert_32f(float* outputVector, const int8_t* inputVector, const +++ * float scalar, unsigned int num_points) \endcode ++ * ++ * \b Inputs ++ * \li inputVector: The input vector of 8-bit chars. 
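For reference, a minimal usage sketch of the volk_8i_s32f_convert_32f dispatcher whose prototype is quoted in the hunk above; the helper name, buffer size, sample data, and the scale value 128.0f are illustrative assumptions, not part of this patch. As the kernel bodies in the next hunk show, the kernels compute out[i] = in[i] * (1.0 / scalar), so scalar acts as a divisor.

#include <volk/volk.h>

static void example_8i_s32f_convert_32f(void)
{
    const unsigned int num_points = 1024;
    const size_t alignment = volk_get_alignment();

    int8_t* in = (int8_t*)volk_malloc(num_points * sizeof(int8_t), alignment);
    float* out = (float*)volk_malloc(num_points * sizeof(float), alignment);

    for (unsigned int i = 0; i < num_points; i++) {
        in[i] = (int8_t)(i & 0x7f); /* illustrative sample data */
    }

    /* scalar is the divisor: out[i] = in[i] / 128.0f, mapping int8 into roughly [-1, 1) */
    volk_8i_s32f_convert_32f(out, in, 128.0f, num_points);

    volk_free(in);
    volk_free(out);
}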
++@@ -60,44 +60,45 @@ ++ #ifdef LV_HAVE_AVX2 ++ #include ++ ++-static inline void ++-volk_8i_s32f_convert_32f_u_avx2(float* outputVector, const int8_t* inputVector, ++- const float scalar, unsigned int num_points) +++static inline void volk_8i_s32f_convert_32f_u_avx2(float* outputVector, +++ const int8_t* inputVector, +++ const float scalar, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int sixteenthPoints = num_points / 16; ++- ++- float* outputVectorPtr = outputVector; ++- const float iScalar = 1.0 / scalar; ++- __m256 invScalar = _mm256_set1_ps( iScalar ); ++- const int8_t* inputVectorPtr = inputVector; ++- __m256 ret; ++- __m128i inputVal128; ++- __m256i interimVal; ++- ++- for(;number < sixteenthPoints; number++){ ++- inputVal128 = _mm_loadu_si128((__m128i*)inputVectorPtr); ++- ++- interimVal = _mm256_cvtepi8_epi32(inputVal128); ++- ret = _mm256_cvtepi32_ps(interimVal); ++- ret = _mm256_mul_ps(ret, invScalar); ++- _mm256_storeu_ps(outputVectorPtr, ret); ++- outputVectorPtr += 8; ++- ++- inputVal128 = _mm_srli_si128(inputVal128, 8); ++- interimVal = _mm256_cvtepi8_epi32(inputVal128); ++- ret = _mm256_cvtepi32_ps(interimVal); ++- ret = _mm256_mul_ps(ret, invScalar); ++- _mm256_storeu_ps(outputVectorPtr, ret); ++- outputVectorPtr += 8; ++- ++- inputVectorPtr += 16; ++- } ++- ++- number = sixteenthPoints * 16; ++- for(; number < num_points; number++){ ++- outputVector[number] = (float)(inputVector[number]) * iScalar; ++- } +++ unsigned int number = 0; +++ const unsigned int sixteenthPoints = num_points / 16; +++ +++ float* outputVectorPtr = outputVector; +++ const float iScalar = 1.0 / scalar; +++ __m256 invScalar = _mm256_set1_ps(iScalar); +++ const int8_t* inputVectorPtr = inputVector; +++ __m256 ret; +++ __m128i inputVal128; +++ __m256i interimVal; +++ +++ for (; number < sixteenthPoints; number++) { +++ inputVal128 = _mm_loadu_si128((__m128i*)inputVectorPtr); +++ +++ interimVal = _mm256_cvtepi8_epi32(inputVal128); +++ ret = _mm256_cvtepi32_ps(interimVal); +++ ret = _mm256_mul_ps(ret, invScalar); +++ _mm256_storeu_ps(outputVectorPtr, ret); +++ outputVectorPtr += 8; +++ +++ inputVal128 = _mm_srli_si128(inputVal128, 8); +++ interimVal = _mm256_cvtepi8_epi32(inputVal128); +++ ret = _mm256_cvtepi32_ps(interimVal); +++ ret = _mm256_mul_ps(ret, invScalar); +++ _mm256_storeu_ps(outputVectorPtr, ret); +++ outputVectorPtr += 8; +++ +++ inputVectorPtr += 16; +++ } +++ +++ number = sixteenthPoints * 16; +++ for (; number < num_points; number++) { +++ outputVector[number] = (float)(inputVector[number]) * iScalar; +++ } ++ } ++ #endif /* LV_HAVE_AVX2 */ ++ ++@@ -105,80 +106,81 @@ volk_8i_s32f_convert_32f_u_avx2(float* outputVector, const int8_t* inputVector, ++ #ifdef LV_HAVE_SSE4_1 ++ #include ++ ++-static inline void ++-volk_8i_s32f_convert_32f_u_sse4_1(float* outputVector, const int8_t* inputVector, ++- const float scalar, unsigned int num_points) +++static inline void volk_8i_s32f_convert_32f_u_sse4_1(float* outputVector, +++ const int8_t* inputVector, +++ const float scalar, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int sixteenthPoints = num_points / 16; ++- ++- float* outputVectorPtr = outputVector; ++- const float iScalar = 1.0 / scalar; ++- __m128 invScalar = _mm_set_ps1( iScalar ); ++- const int8_t* inputVectorPtr = inputVector; ++- __m128 ret; ++- __m128i inputVal; ++- __m128i interimVal; ++- ++- for(;number < sixteenthPoints; number++){ ++- inputVal = _mm_loadu_si128((__m128i*)inputVectorPtr); ++- ++- interimVal 
= _mm_cvtepi8_epi32(inputVal); ++- ret = _mm_cvtepi32_ps(interimVal); ++- ret = _mm_mul_ps(ret, invScalar); ++- _mm_storeu_ps(outputVectorPtr, ret); ++- outputVectorPtr += 4; ++- ++- inputVal = _mm_srli_si128(inputVal, 4); ++- interimVal = _mm_cvtepi8_epi32(inputVal); ++- ret = _mm_cvtepi32_ps(interimVal); ++- ret = _mm_mul_ps(ret, invScalar); ++- _mm_storeu_ps(outputVectorPtr, ret); ++- outputVectorPtr += 4; ++- ++- inputVal = _mm_srli_si128(inputVal, 4); ++- interimVal = _mm_cvtepi8_epi32(inputVal); ++- ret = _mm_cvtepi32_ps(interimVal); ++- ret = _mm_mul_ps(ret, invScalar); ++- _mm_storeu_ps(outputVectorPtr, ret); ++- outputVectorPtr += 4; ++- ++- inputVal = _mm_srli_si128(inputVal, 4); ++- interimVal = _mm_cvtepi8_epi32(inputVal); ++- ret = _mm_cvtepi32_ps(interimVal); ++- ret = _mm_mul_ps(ret, invScalar); ++- _mm_storeu_ps(outputVectorPtr, ret); ++- outputVectorPtr += 4; ++- ++- inputVectorPtr += 16; ++- } ++- ++- number = sixteenthPoints * 16; ++- for(; number < num_points; number++){ ++- outputVector[number] = (float)(inputVector[number]) * iScalar; ++- } +++ unsigned int number = 0; +++ const unsigned int sixteenthPoints = num_points / 16; +++ +++ float* outputVectorPtr = outputVector; +++ const float iScalar = 1.0 / scalar; +++ __m128 invScalar = _mm_set_ps1(iScalar); +++ const int8_t* inputVectorPtr = inputVector; +++ __m128 ret; +++ __m128i inputVal; +++ __m128i interimVal; +++ +++ for (; number < sixteenthPoints; number++) { +++ inputVal = _mm_loadu_si128((__m128i*)inputVectorPtr); +++ +++ interimVal = _mm_cvtepi8_epi32(inputVal); +++ ret = _mm_cvtepi32_ps(interimVal); +++ ret = _mm_mul_ps(ret, invScalar); +++ _mm_storeu_ps(outputVectorPtr, ret); +++ outputVectorPtr += 4; +++ +++ inputVal = _mm_srli_si128(inputVal, 4); +++ interimVal = _mm_cvtepi8_epi32(inputVal); +++ ret = _mm_cvtepi32_ps(interimVal); +++ ret = _mm_mul_ps(ret, invScalar); +++ _mm_storeu_ps(outputVectorPtr, ret); +++ outputVectorPtr += 4; +++ +++ inputVal = _mm_srli_si128(inputVal, 4); +++ interimVal = _mm_cvtepi8_epi32(inputVal); +++ ret = _mm_cvtepi32_ps(interimVal); +++ ret = _mm_mul_ps(ret, invScalar); +++ _mm_storeu_ps(outputVectorPtr, ret); +++ outputVectorPtr += 4; +++ +++ inputVal = _mm_srli_si128(inputVal, 4); +++ interimVal = _mm_cvtepi8_epi32(inputVal); +++ ret = _mm_cvtepi32_ps(interimVal); +++ ret = _mm_mul_ps(ret, invScalar); +++ _mm_storeu_ps(outputVectorPtr, ret); +++ outputVectorPtr += 4; +++ +++ inputVectorPtr += 16; +++ } +++ +++ number = sixteenthPoints * 16; +++ for (; number < num_points; number++) { +++ outputVector[number] = (float)(inputVector[number]) * iScalar; +++ } ++ } ++ #endif /* LV_HAVE_SSE4_1 */ ++ ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void ++-volk_8i_s32f_convert_32f_generic(float* outputVector, const int8_t* inputVector, ++- const float scalar, unsigned int num_points) +++static inline void volk_8i_s32f_convert_32f_generic(float* outputVector, +++ const int8_t* inputVector, +++ const float scalar, +++ unsigned int num_points) ++ { ++- float* outputVectorPtr = outputVector; ++- const int8_t* inputVectorPtr = inputVector; ++- unsigned int number = 0; ++- const float iScalar = 1.0 / scalar; ++- ++- for(number = 0; number < num_points; number++){ ++- *outputVectorPtr++ = ((float)(*inputVectorPtr++)) * iScalar; ++- } +++ float* outputVectorPtr = outputVector; +++ const int8_t* inputVectorPtr = inputVector; +++ unsigned int number = 0; +++ const float iScalar = 1.0 / scalar; +++ +++ for (number = 0; number < num_points; number++) { +++ *outputVectorPtr++ = 
((float)(*inputVectorPtr++)) * iScalar; +++ } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++ ++- ++ #endif /* INCLUDED_VOLK_8s_CONVERT_32f_UNALIGNED8_H */ ++ ++ #ifndef INCLUDED_volk_8i_s32f_convert_32f_a_H ++@@ -190,195 +192,199 @@ volk_8i_s32f_convert_32f_generic(float* outputVector, const int8_t* inputVector, ++ #ifdef LV_HAVE_AVX2 ++ #include ++ ++-static inline void ++-volk_8i_s32f_convert_32f_a_avx2(float* outputVector, const int8_t* inputVector, ++- const float scalar, unsigned int num_points) +++static inline void volk_8i_s32f_convert_32f_a_avx2(float* outputVector, +++ const int8_t* inputVector, +++ const float scalar, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int sixteenthPoints = num_points / 16; ++- ++- float* outputVectorPtr = outputVector; ++- const float iScalar = 1.0 / scalar; ++- __m256 invScalar = _mm256_set1_ps( iScalar ); ++- const int8_t* inputVectorPtr = inputVector; ++- __m256 ret; ++- __m128i inputVal128; ++- __m256i interimVal; ++- ++- for(;number < sixteenthPoints; number++){ ++- inputVal128 = _mm_load_si128((__m128i*)inputVectorPtr); ++- ++- interimVal = _mm256_cvtepi8_epi32(inputVal128); ++- ret = _mm256_cvtepi32_ps(interimVal); ++- ret = _mm256_mul_ps(ret, invScalar); ++- _mm256_store_ps(outputVectorPtr, ret); ++- outputVectorPtr += 8; ++- ++- inputVal128 = _mm_srli_si128(inputVal128, 8); ++- interimVal = _mm256_cvtepi8_epi32(inputVal128); ++- ret = _mm256_cvtepi32_ps(interimVal); ++- ret = _mm256_mul_ps(ret, invScalar); ++- _mm256_store_ps(outputVectorPtr, ret); ++- outputVectorPtr += 8; ++- ++- inputVectorPtr += 16; ++- } ++- ++- number = sixteenthPoints * 16; ++- for(; number < num_points; number++){ ++- outputVector[number] = (float)(inputVector[number]) * iScalar; ++- } +++ unsigned int number = 0; +++ const unsigned int sixteenthPoints = num_points / 16; +++ +++ float* outputVectorPtr = outputVector; +++ const float iScalar = 1.0 / scalar; +++ __m256 invScalar = _mm256_set1_ps(iScalar); +++ const int8_t* inputVectorPtr = inputVector; +++ __m256 ret; +++ __m128i inputVal128; +++ __m256i interimVal; +++ +++ for (; number < sixteenthPoints; number++) { +++ inputVal128 = _mm_load_si128((__m128i*)inputVectorPtr); +++ +++ interimVal = _mm256_cvtepi8_epi32(inputVal128); +++ ret = _mm256_cvtepi32_ps(interimVal); +++ ret = _mm256_mul_ps(ret, invScalar); +++ _mm256_store_ps(outputVectorPtr, ret); +++ outputVectorPtr += 8; +++ +++ inputVal128 = _mm_srli_si128(inputVal128, 8); +++ interimVal = _mm256_cvtepi8_epi32(inputVal128); +++ ret = _mm256_cvtepi32_ps(interimVal); +++ ret = _mm256_mul_ps(ret, invScalar); +++ _mm256_store_ps(outputVectorPtr, ret); +++ outputVectorPtr += 8; +++ +++ inputVectorPtr += 16; +++ } +++ +++ number = sixteenthPoints * 16; +++ for (; number < num_points; number++) { +++ outputVector[number] = (float)(inputVector[number]) * iScalar; +++ } ++ } ++ #endif /* LV_HAVE_AVX2 */ ++ ++ #ifdef LV_HAVE_SSE4_1 ++ #include ++ ++-static inline void ++-volk_8i_s32f_convert_32f_a_sse4_1(float* outputVector, const int8_t* inputVector, ++- const float scalar, unsigned int num_points) +++static inline void volk_8i_s32f_convert_32f_a_sse4_1(float* outputVector, +++ const int8_t* inputVector, +++ const float scalar, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int sixteenthPoints = num_points / 16; ++- ++- float* outputVectorPtr = outputVector; ++- const float iScalar = 1.0 / scalar; ++- __m128 invScalar = _mm_set_ps1(iScalar); ++- const int8_t* inputVectorPtr = inputVector; ++- __m128 
ret; ++- __m128i inputVal; ++- __m128i interimVal; ++- ++- for(;number < sixteenthPoints; number++){ ++- inputVal = _mm_load_si128((__m128i*)inputVectorPtr); ++- ++- interimVal = _mm_cvtepi8_epi32(inputVal); ++- ret = _mm_cvtepi32_ps(interimVal); ++- ret = _mm_mul_ps(ret, invScalar); ++- _mm_store_ps(outputVectorPtr, ret); ++- outputVectorPtr += 4; ++- ++- inputVal = _mm_srli_si128(inputVal, 4); ++- interimVal = _mm_cvtepi8_epi32(inputVal); ++- ret = _mm_cvtepi32_ps(interimVal); ++- ret = _mm_mul_ps(ret, invScalar); ++- _mm_store_ps(outputVectorPtr, ret); ++- outputVectorPtr += 4; ++- ++- inputVal = _mm_srli_si128(inputVal, 4); ++- interimVal = _mm_cvtepi8_epi32(inputVal); ++- ret = _mm_cvtepi32_ps(interimVal); ++- ret = _mm_mul_ps(ret, invScalar); ++- _mm_store_ps(outputVectorPtr, ret); ++- outputVectorPtr += 4; ++- ++- inputVal = _mm_srli_si128(inputVal, 4); ++- interimVal = _mm_cvtepi8_epi32(inputVal); ++- ret = _mm_cvtepi32_ps(interimVal); ++- ret = _mm_mul_ps(ret, invScalar); ++- _mm_store_ps(outputVectorPtr, ret); ++- outputVectorPtr += 4; ++- ++- inputVectorPtr += 16; ++- } ++- ++- number = sixteenthPoints * 16; ++- for(; number < num_points; number++){ ++- outputVector[number] = (float)(inputVector[number]) * iScalar; ++- } +++ unsigned int number = 0; +++ const unsigned int sixteenthPoints = num_points / 16; +++ +++ float* outputVectorPtr = outputVector; +++ const float iScalar = 1.0 / scalar; +++ __m128 invScalar = _mm_set_ps1(iScalar); +++ const int8_t* inputVectorPtr = inputVector; +++ __m128 ret; +++ __m128i inputVal; +++ __m128i interimVal; +++ +++ for (; number < sixteenthPoints; number++) { +++ inputVal = _mm_load_si128((__m128i*)inputVectorPtr); +++ +++ interimVal = _mm_cvtepi8_epi32(inputVal); +++ ret = _mm_cvtepi32_ps(interimVal); +++ ret = _mm_mul_ps(ret, invScalar); +++ _mm_store_ps(outputVectorPtr, ret); +++ outputVectorPtr += 4; +++ +++ inputVal = _mm_srli_si128(inputVal, 4); +++ interimVal = _mm_cvtepi8_epi32(inputVal); +++ ret = _mm_cvtepi32_ps(interimVal); +++ ret = _mm_mul_ps(ret, invScalar); +++ _mm_store_ps(outputVectorPtr, ret); +++ outputVectorPtr += 4; +++ +++ inputVal = _mm_srli_si128(inputVal, 4); +++ interimVal = _mm_cvtepi8_epi32(inputVal); +++ ret = _mm_cvtepi32_ps(interimVal); +++ ret = _mm_mul_ps(ret, invScalar); +++ _mm_store_ps(outputVectorPtr, ret); +++ outputVectorPtr += 4; +++ +++ inputVal = _mm_srli_si128(inputVal, 4); +++ interimVal = _mm_cvtepi8_epi32(inputVal); +++ ret = _mm_cvtepi32_ps(interimVal); +++ ret = _mm_mul_ps(ret, invScalar); +++ _mm_store_ps(outputVectorPtr, ret); +++ outputVectorPtr += 4; +++ +++ inputVectorPtr += 16; +++ } +++ +++ number = sixteenthPoints * 16; +++ for (; number < num_points; number++) { +++ outputVector[number] = (float)(inputVector[number]) * iScalar; +++ } ++ } ++ #endif /* LV_HAVE_SSE4_1 */ ++ ++ #ifdef LV_HAVE_NEON ++ #include ++ ++-static inline void ++-volk_8i_s32f_convert_32f_neon(float* outputVector, const int8_t* inputVector, ++- const float scalar, unsigned int num_points) +++static inline void volk_8i_s32f_convert_32f_neon(float* outputVector, +++ const int8_t* inputVector, +++ const float scalar, +++ unsigned int num_points) ++ { ++- float* outputVectorPtr = outputVector; ++- const int8_t* inputVectorPtr = inputVector; ++- ++- const float iScalar = 1.0 / scalar; ++- const float32x4_t qiScalar = vdupq_n_f32(iScalar); ++- ++- int8x8x2_t inputVal; ++- float32x4x2_t outputFloat; ++- int16x8_t tmp; ++- ++- unsigned int number = 0; ++- const unsigned int sixteenthPoints = num_points / 16; ++- for(;number 
< sixteenthPoints; number++){ ++- __VOLK_PREFETCH(inputVectorPtr+16); ++- ++- inputVal = vld2_s8(inputVectorPtr); ++- inputVal = vzip_s8(inputVal.val[0], inputVal.val[1]); ++- inputVectorPtr += 16; ++- ++- tmp = vmovl_s8(inputVal.val[0]); ++- ++- outputFloat.val[0] = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp))); ++- outputFloat.val[0] = vmulq_f32(outputFloat.val[0], qiScalar); ++- vst1q_f32(outputVectorPtr, outputFloat.val[0]); ++- outputVectorPtr += 4; ++- ++- outputFloat.val[1] = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp))); ++- outputFloat.val[1] = vmulq_f32(outputFloat.val[1], qiScalar); ++- vst1q_f32(outputVectorPtr, outputFloat.val[1]); ++- outputVectorPtr += 4; ++- ++- tmp = vmovl_s8(inputVal.val[1]); ++- ++- outputFloat.val[0] = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp))); ++- outputFloat.val[0] = vmulq_f32(outputFloat.val[0], qiScalar); ++- vst1q_f32(outputVectorPtr, outputFloat.val[0]); ++- outputVectorPtr += 4; ++- ++- outputFloat.val[1] = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp))); ++- outputFloat.val[1] = vmulq_f32(outputFloat.val[1], qiScalar); ++- vst1q_f32(outputVectorPtr, outputFloat.val[1]); ++- outputVectorPtr += 4; ++- } ++- for(number = sixteenthPoints * 16; number < num_points; number++){ ++- *outputVectorPtr++ = ((float)(*inputVectorPtr++)) * iScalar; ++- } +++ float* outputVectorPtr = outputVector; +++ const int8_t* inputVectorPtr = inputVector; +++ +++ const float iScalar = 1.0 / scalar; +++ const float32x4_t qiScalar = vdupq_n_f32(iScalar); +++ +++ int8x8x2_t inputVal; +++ float32x4x2_t outputFloat; +++ int16x8_t tmp; +++ +++ unsigned int number = 0; +++ const unsigned int sixteenthPoints = num_points / 16; +++ for (; number < sixteenthPoints; number++) { +++ __VOLK_PREFETCH(inputVectorPtr + 16); +++ +++ inputVal = vld2_s8(inputVectorPtr); +++ inputVal = vzip_s8(inputVal.val[0], inputVal.val[1]); +++ inputVectorPtr += 16; +++ +++ tmp = vmovl_s8(inputVal.val[0]); +++ +++ outputFloat.val[0] = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp))); +++ outputFloat.val[0] = vmulq_f32(outputFloat.val[0], qiScalar); +++ vst1q_f32(outputVectorPtr, outputFloat.val[0]); +++ outputVectorPtr += 4; +++ +++ outputFloat.val[1] = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp))); +++ outputFloat.val[1] = vmulq_f32(outputFloat.val[1], qiScalar); +++ vst1q_f32(outputVectorPtr, outputFloat.val[1]); +++ outputVectorPtr += 4; +++ +++ tmp = vmovl_s8(inputVal.val[1]); +++ +++ outputFloat.val[0] = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp))); +++ outputFloat.val[0] = vmulq_f32(outputFloat.val[0], qiScalar); +++ vst1q_f32(outputVectorPtr, outputFloat.val[0]); +++ outputVectorPtr += 4; +++ +++ outputFloat.val[1] = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp))); +++ outputFloat.val[1] = vmulq_f32(outputFloat.val[1], qiScalar); +++ vst1q_f32(outputVectorPtr, outputFloat.val[1]); +++ outputVectorPtr += 4; +++ } +++ for (number = sixteenthPoints * 16; number < num_points; number++) { +++ *outputVectorPtr++ = ((float)(*inputVectorPtr++)) * iScalar; +++ } ++ } ++ ++ #endif /* LV_HAVE_NEON */ ++ ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void ++-volk_8i_s32f_convert_32f_a_generic(float* outputVector, const int8_t* inputVector, ++- const float scalar, unsigned int num_points) +++static inline void volk_8i_s32f_convert_32f_a_generic(float* outputVector, +++ const int8_t* inputVector, +++ const float scalar, +++ unsigned int num_points) ++ { ++- float* outputVectorPtr = outputVector; ++- const int8_t* inputVectorPtr = inputVector; ++- unsigned int number = 0; ++- const float iScalar = 1.0 / scalar; ++- ++- 
for(number = 0; number < num_points; number++){ ++- *outputVectorPtr++ = ((float)(*inputVectorPtr++)) * iScalar; ++- } +++ float* outputVectorPtr = outputVector; +++ const int8_t* inputVectorPtr = inputVector; +++ unsigned int number = 0; +++ const float iScalar = 1.0 / scalar; +++ +++ for (number = 0; number < num_points; number++) { +++ *outputVectorPtr++ = ((float)(*inputVectorPtr++)) * iScalar; +++ } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++ ++ #ifdef LV_HAVE_ORC ++-extern void ++-volk_8i_s32f_convert_32f_a_orc_impl(float* outputVector, const int8_t* inputVector, ++- const float scalar, unsigned int num_points); ++- ++-static inline void ++-volk_8i_s32f_convert_32f_u_orc(float* outputVector, const int8_t* inputVector, ++- const float scalar, unsigned int num_points) +++extern void volk_8i_s32f_convert_32f_a_orc_impl(float* outputVector, +++ const int8_t* inputVector, +++ const float scalar, +++ unsigned int num_points); +++ +++static inline void volk_8i_s32f_convert_32f_u_orc(float* outputVector, +++ const int8_t* inputVector, +++ const float scalar, +++ unsigned int num_points) ++ { ++- float invscalar = 1.0 / scalar; ++- volk_8i_s32f_convert_32f_a_orc_impl(outputVector, inputVector, invscalar, num_points); +++ float invscalar = 1.0 / scalar; +++ volk_8i_s32f_convert_32f_a_orc_impl(outputVector, inputVector, invscalar, num_points); ++ } ++ #endif /* LV_HAVE_ORC */ ++ ++ ++- ++ #endif /* INCLUDED_VOLK_8s_CONVERT_32f_ALIGNED8_H */ ++- ++diff --git a/kernels/volk/volk_8ic_deinterleave_16i_x2.h b/kernels/volk/volk_8ic_deinterleave_16i_x2.h ++index b4cf251..fa998a0 100644 ++--- a/kernels/volk/volk_8ic_deinterleave_16i_x2.h +++++ b/kernels/volk/volk_8ic_deinterleave_16i_x2.h ++@@ -30,8 +30,8 @@ ++ * ++ * Dispatcher Prototype ++ * \code ++- * void volk_8ic_deinterleave_16i_x2(int16_t* iBuffer, int16_t* qBuffer, const lv_8sc_t* complexVector, unsigned int num_points) ++- * \endcode +++ * void volk_8ic_deinterleave_16i_x2(int16_t* iBuffer, int16_t* qBuffer, const lv_8sc_t* +++ * complexVector, unsigned int num_points) \endcode ++ * ++ * \b Inputs ++ * \li complexVector: The complex input vector. 
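For reference, a minimal usage sketch of the volk_8ic_deinterleave_16i_x2 dispatcher quoted above; the helper name and buffer handling are illustrative assumptions, not part of this patch. Per the scalar tail loops in the following hunk, each 8-bit I/Q component is widened to 16 bits and scaled by 256 (a left shift by 8).

#include <volk/volk.h>

static void example_8ic_deinterleave_16i_x2(const lv_8sc_t* complex_in,
                                            unsigned int num_points)
{
    const size_t alignment = volk_get_alignment();
    int16_t* i_buf = (int16_t*)volk_malloc(num_points * sizeof(int16_t), alignment);
    int16_t* q_buf = (int16_t*)volk_malloc(num_points * sizeof(int16_t), alignment);

    /* Splits interleaved 8-bit I/Q into two 16-bit buffers, each sample scaled by 256 */
    volk_8ic_deinterleave_16i_x2(i_buf, q_buf, complex_in, num_points);

    volk_free(i_buf);
    volk_free(q_buf);
}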
++@@ -60,91 +60,150 @@ ++ #ifdef LV_HAVE_AVX2 ++ #include ++ ++-static inline void ++-volk_8ic_deinterleave_16i_x2_a_avx2(int16_t* iBuffer, int16_t* qBuffer, ++- const lv_8sc_t* complexVector, unsigned int num_points) +++static inline void volk_8ic_deinterleave_16i_x2_a_avx2(int16_t* iBuffer, +++ int16_t* qBuffer, +++ const lv_8sc_t* complexVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const int8_t* complexVectorPtr = (int8_t*)complexVector; ++- int16_t* iBufferPtr = iBuffer; ++- int16_t* qBufferPtr = qBuffer; ++- __m256i MoveMask = _mm256_set_epi8(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0, 15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0); ++- __m256i complexVal, iOutputVal, qOutputVal; ++- __m128i iOutputVal0, qOutputVal0; ++- ++- unsigned int sixteenthPoints = num_points / 16; ++- ++- for(number = 0; number < sixteenthPoints; number++){ ++- complexVal = _mm256_load_si256((__m256i*)complexVectorPtr); complexVectorPtr += 32; ++- ++- complexVal = _mm256_shuffle_epi8(complexVal, MoveMask); ++- complexVal = _mm256_permute4x64_epi64(complexVal, 0xd8); ++- ++- iOutputVal0 = _mm256_extracti128_si256(complexVal, 0); ++- qOutputVal0 = _mm256_extracti128_si256(complexVal, 1); ++- ++- iOutputVal = _mm256_cvtepi8_epi16(iOutputVal0); ++- iOutputVal = _mm256_slli_epi16(iOutputVal, 8); ++- ++- qOutputVal = _mm256_cvtepi8_epi16(qOutputVal0); ++- qOutputVal = _mm256_slli_epi16(qOutputVal, 8); ++- ++- _mm256_store_si256((__m256i*)iBufferPtr, iOutputVal); ++- _mm256_store_si256((__m256i*)qBufferPtr, qOutputVal); ++- ++- iBufferPtr += 16; ++- qBufferPtr += 16; ++- } ++- ++- number = sixteenthPoints * 16; ++- for(; number < num_points; number++){ ++- *iBufferPtr++ = ((int16_t)*complexVectorPtr++) * 256; // load 8 bit Complexvector into 16 bit, shift left by 8 bits and store ++- *qBufferPtr++ = ((int16_t)*complexVectorPtr++) * 256; ++- } +++ unsigned int number = 0; +++ const int8_t* complexVectorPtr = (int8_t*)complexVector; +++ int16_t* iBufferPtr = iBuffer; +++ int16_t* qBufferPtr = qBuffer; +++ __m256i MoveMask = _mm256_set_epi8(15, +++ 13, +++ 11, +++ 9, +++ 7, +++ 5, +++ 3, +++ 1, +++ 14, +++ 12, +++ 10, +++ 8, +++ 6, +++ 4, +++ 2, +++ 0, +++ 15, +++ 13, +++ 11, +++ 9, +++ 7, +++ 5, +++ 3, +++ 1, +++ 14, +++ 12, +++ 10, +++ 8, +++ 6, +++ 4, +++ 2, +++ 0); +++ __m256i complexVal, iOutputVal, qOutputVal; +++ __m128i iOutputVal0, qOutputVal0; +++ +++ unsigned int sixteenthPoints = num_points / 16; +++ +++ for (number = 0; number < sixteenthPoints; number++) { +++ complexVal = _mm256_load_si256((__m256i*)complexVectorPtr); +++ complexVectorPtr += 32; +++ +++ complexVal = _mm256_shuffle_epi8(complexVal, MoveMask); +++ complexVal = _mm256_permute4x64_epi64(complexVal, 0xd8); +++ +++ iOutputVal0 = _mm256_extracti128_si256(complexVal, 0); +++ qOutputVal0 = _mm256_extracti128_si256(complexVal, 1); +++ +++ iOutputVal = _mm256_cvtepi8_epi16(iOutputVal0); +++ iOutputVal = _mm256_slli_epi16(iOutputVal, 8); +++ +++ qOutputVal = _mm256_cvtepi8_epi16(qOutputVal0); +++ qOutputVal = _mm256_slli_epi16(qOutputVal, 8); +++ +++ _mm256_store_si256((__m256i*)iBufferPtr, iOutputVal); +++ _mm256_store_si256((__m256i*)qBufferPtr, qOutputVal); +++ +++ iBufferPtr += 16; +++ qBufferPtr += 16; +++ } +++ +++ number = sixteenthPoints * 16; +++ for (; number < num_points; number++) { +++ *iBufferPtr++ = +++ ((int16_t)*complexVectorPtr++) * +++ 256; // load 8 bit Complexvector into 16 bit, shift left by 8 bits and store +++ *qBufferPtr++ = ((int16_t)*complexVectorPtr++) * 256; +++ } ++ } ++ 
#endif /* LV_HAVE_AVX2 */ ++ ++ #ifdef LV_HAVE_SSE4_1 ++ #include ++ ++-static inline void ++-volk_8ic_deinterleave_16i_x2_a_sse4_1(int16_t* iBuffer, int16_t* qBuffer, ++- const lv_8sc_t* complexVector, unsigned int num_points) +++static inline void volk_8ic_deinterleave_16i_x2_a_sse4_1(int16_t* iBuffer, +++ int16_t* qBuffer, +++ const lv_8sc_t* complexVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const int8_t* complexVectorPtr = (int8_t*)complexVector; ++- int16_t* iBufferPtr = iBuffer; ++- int16_t* qBufferPtr = qBuffer; ++- __m128i iMoveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0); // set 16 byte values ++- __m128i qMoveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 15, 13, 11, 9, 7, 5, 3, 1); ++- __m128i complexVal, iOutputVal, qOutputVal; ++- ++- unsigned int eighthPoints = num_points / 8; ++- ++- for(number = 0; number < eighthPoints; number++){ ++- complexVal = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; // aligned load ++- ++- iOutputVal = _mm_shuffle_epi8(complexVal, iMoveMask); // shuffle 16 bytes of 128bit complexVal ++- qOutputVal = _mm_shuffle_epi8(complexVal, qMoveMask); ++- ++- iOutputVal = _mm_cvtepi8_epi16(iOutputVal); // fills 2-byte sign extended versions of lower 8 bytes of input to output ++- iOutputVal = _mm_slli_epi16(iOutputVal, 8); // shift in left by 8 bits, each of the 8 16-bit integers, shift in with zeros ++- ++- qOutputVal = _mm_cvtepi8_epi16(qOutputVal); ++- qOutputVal = _mm_slli_epi16(qOutputVal, 8); ++- ++- _mm_store_si128((__m128i*)iBufferPtr, iOutputVal); // aligned store ++- _mm_store_si128((__m128i*)qBufferPtr, qOutputVal); ++- ++- iBufferPtr += 8; ++- qBufferPtr += 8; ++- } ++- ++- number = eighthPoints * 8; ++- for(; number < num_points; number++){ ++- *iBufferPtr++ = ((int16_t)*complexVectorPtr++) * 256; // load 8 bit Complexvector into 16 bit, shift left by 8 bits and store ++- *qBufferPtr++ = ((int16_t)*complexVectorPtr++) * 256; ++- } +++ unsigned int number = 0; +++ const int8_t* complexVectorPtr = (int8_t*)complexVector; +++ int16_t* iBufferPtr = iBuffer; +++ int16_t* qBufferPtr = qBuffer; +++ __m128i iMoveMask = _mm_set_epi8(0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 14, +++ 12, +++ 10, +++ 8, +++ 6, +++ 4, +++ 2, +++ 0); // set 16 byte values +++ __m128i qMoveMask = _mm_set_epi8( +++ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 15, 13, 11, 9, 7, 5, 3, 1); +++ __m128i complexVal, iOutputVal, qOutputVal; +++ +++ unsigned int eighthPoints = num_points / 8; +++ +++ for (number = 0; number < eighthPoints; number++) { +++ complexVal = _mm_load_si128((__m128i*)complexVectorPtr); +++ complexVectorPtr += 16; // aligned load +++ +++ iOutputVal = _mm_shuffle_epi8(complexVal, +++ iMoveMask); // shuffle 16 bytes of 128bit complexVal +++ qOutputVal = _mm_shuffle_epi8(complexVal, qMoveMask); +++ +++ iOutputVal = _mm_cvtepi8_epi16(iOutputVal); // fills 2-byte sign extended versions +++ // of lower 8 bytes of input to output +++ iOutputVal = +++ _mm_slli_epi16(iOutputVal, 8); // shift in left by 8 bits, each of the 8 +++ // 16-bit integers, shift in with zeros +++ +++ qOutputVal = _mm_cvtepi8_epi16(qOutputVal); +++ qOutputVal = _mm_slli_epi16(qOutputVal, 8); +++ +++ _mm_store_si128((__m128i*)iBufferPtr, iOutputVal); // aligned store +++ _mm_store_si128((__m128i*)qBufferPtr, qOutputVal); +++ +++ iBufferPtr += 8; +++ qBufferPtr += 8; +++ } +++ +++ number = eighthPoints * 8; +++ for (; number < 
num_points; number++) { +++ *iBufferPtr++ = +++ ((int16_t)*complexVectorPtr++) * +++ 256; // load 8 bit Complexvector into 16 bit, shift left by 8 bits and store +++ *qBufferPtr++ = ((int16_t)*complexVectorPtr++) * 256; +++ } ++ } ++ #endif /* LV_HAVE_SSE4_1 */ ++ ++@@ -152,86 +211,111 @@ volk_8ic_deinterleave_16i_x2_a_sse4_1(int16_t* iBuffer, int16_t* qBuffer, ++ #ifdef LV_HAVE_AVX ++ #include ++ ++-static inline void ++-volk_8ic_deinterleave_16i_x2_a_avx(int16_t* iBuffer, int16_t* qBuffer, ++- const lv_8sc_t* complexVector, unsigned int num_points) +++static inline void volk_8ic_deinterleave_16i_x2_a_avx(int16_t* iBuffer, +++ int16_t* qBuffer, +++ const lv_8sc_t* complexVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const int8_t* complexVectorPtr = (int8_t*)complexVector; ++- int16_t* iBufferPtr = iBuffer; ++- int16_t* qBufferPtr = qBuffer; ++- __m128i iMoveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0); // set 16 byte values ++- __m128i qMoveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 15, 13, 11, 9, 7, 5, 3, 1); ++- __m256i complexVal, iOutputVal, qOutputVal; ++- __m128i complexVal1, complexVal0; ++- __m128i iOutputVal1, iOutputVal0, qOutputVal1, qOutputVal0; ++- ++- unsigned int sixteenthPoints = num_points / 16; ++- ++- for(number = 0; number < sixteenthPoints; number++){ ++- complexVal = _mm256_load_si256((__m256i*)complexVectorPtr); complexVectorPtr += 32; // aligned load ++- ++- // Extract from complexVal to iOutputVal and qOutputVal ++- complexVal1 = _mm256_extractf128_si256(complexVal, 1); ++- complexVal0 = _mm256_extractf128_si256(complexVal, 0); ++- ++- iOutputVal1 = _mm_shuffle_epi8(complexVal1, iMoveMask); // shuffle 16 bytes of 128bit complexVal ++- iOutputVal0 = _mm_shuffle_epi8(complexVal0, iMoveMask); ++- qOutputVal1 = _mm_shuffle_epi8(complexVal1, qMoveMask); ++- qOutputVal0 = _mm_shuffle_epi8(complexVal0, qMoveMask); ++- ++- iOutputVal1 = _mm_cvtepi8_epi16(iOutputVal1); // fills 2-byte sign extended versions of lower 8 bytes of input to output ++- iOutputVal1 = _mm_slli_epi16(iOutputVal1, 8); // shift in left by 8 bits, each of the 8 16-bit integers, shift in with zeros ++- iOutputVal0 = _mm_cvtepi8_epi16(iOutputVal0); ++- iOutputVal0 = _mm_slli_epi16(iOutputVal0, 8); ++- ++- qOutputVal1 = _mm_cvtepi8_epi16(qOutputVal1); ++- qOutputVal1 = _mm_slli_epi16(qOutputVal1, 8); ++- qOutputVal0 = _mm_cvtepi8_epi16(qOutputVal0); ++- qOutputVal0 = _mm_slli_epi16(qOutputVal0, 8); ++- ++- // Pack iOutputVal0,1 to iOutputVal ++- __m256i dummy = _mm256_setzero_si256(); ++- iOutputVal = _mm256_insertf128_si256(dummy, iOutputVal0, 0); ++- iOutputVal = _mm256_insertf128_si256(iOutputVal, iOutputVal1, 1); ++- qOutputVal = _mm256_insertf128_si256(dummy, qOutputVal0, 0); ++- qOutputVal = _mm256_insertf128_si256(qOutputVal, qOutputVal1, 1); ++- ++- _mm256_store_si256((__m256i*)iBufferPtr, iOutputVal); // aligned store ++- _mm256_store_si256((__m256i*)qBufferPtr, qOutputVal); ++- ++- iBufferPtr += 16; ++- qBufferPtr += 16; ++- } ++- ++- number = sixteenthPoints * 16; ++- for(; number < num_points; number++){ ++- *iBufferPtr++ = ((int16_t)*complexVectorPtr++) * 256; // load 8 bit Complexvector into 16 bit, shift left by 8 bits and store ++- *qBufferPtr++ = ((int16_t)*complexVectorPtr++) * 256; ++- } +++ unsigned int number = 0; +++ const int8_t* complexVectorPtr = (int8_t*)complexVector; +++ int16_t* iBufferPtr = iBuffer; +++ int16_t* qBufferPtr = qBuffer; +++ __m128i iMoveMask = 
_mm_set_epi8(0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 14, +++ 12, +++ 10, +++ 8, +++ 6, +++ 4, +++ 2, +++ 0); // set 16 byte values +++ __m128i qMoveMask = _mm_set_epi8( +++ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 15, 13, 11, 9, 7, 5, 3, 1); +++ __m256i complexVal, iOutputVal, qOutputVal; +++ __m128i complexVal1, complexVal0; +++ __m128i iOutputVal1, iOutputVal0, qOutputVal1, qOutputVal0; +++ +++ unsigned int sixteenthPoints = num_points / 16; +++ +++ for (number = 0; number < sixteenthPoints; number++) { +++ complexVal = _mm256_load_si256((__m256i*)complexVectorPtr); +++ complexVectorPtr += 32; // aligned load +++ +++ // Extract from complexVal to iOutputVal and qOutputVal +++ complexVal1 = _mm256_extractf128_si256(complexVal, 1); +++ complexVal0 = _mm256_extractf128_si256(complexVal, 0); +++ +++ iOutputVal1 = _mm_shuffle_epi8( +++ complexVal1, iMoveMask); // shuffle 16 bytes of 128bit complexVal +++ iOutputVal0 = _mm_shuffle_epi8(complexVal0, iMoveMask); +++ qOutputVal1 = _mm_shuffle_epi8(complexVal1, qMoveMask); +++ qOutputVal0 = _mm_shuffle_epi8(complexVal0, qMoveMask); +++ +++ iOutputVal1 = +++ _mm_cvtepi8_epi16(iOutputVal1); // fills 2-byte sign extended versions of +++ // lower 8 bytes of input to output +++ iOutputVal1 = +++ _mm_slli_epi16(iOutputVal1, 8); // shift in left by 8 bits, each of the 8 +++ // 16-bit integers, shift in with zeros +++ iOutputVal0 = _mm_cvtepi8_epi16(iOutputVal0); +++ iOutputVal0 = _mm_slli_epi16(iOutputVal0, 8); +++ +++ qOutputVal1 = _mm_cvtepi8_epi16(qOutputVal1); +++ qOutputVal1 = _mm_slli_epi16(qOutputVal1, 8); +++ qOutputVal0 = _mm_cvtepi8_epi16(qOutputVal0); +++ qOutputVal0 = _mm_slli_epi16(qOutputVal0, 8); +++ +++ // Pack iOutputVal0,1 to iOutputVal +++ __m256i dummy = _mm256_setzero_si256(); +++ iOutputVal = _mm256_insertf128_si256(dummy, iOutputVal0, 0); +++ iOutputVal = _mm256_insertf128_si256(iOutputVal, iOutputVal1, 1); +++ qOutputVal = _mm256_insertf128_si256(dummy, qOutputVal0, 0); +++ qOutputVal = _mm256_insertf128_si256(qOutputVal, qOutputVal1, 1); +++ +++ _mm256_store_si256((__m256i*)iBufferPtr, iOutputVal); // aligned store +++ _mm256_store_si256((__m256i*)qBufferPtr, qOutputVal); +++ +++ iBufferPtr += 16; +++ qBufferPtr += 16; +++ } +++ +++ number = sixteenthPoints * 16; +++ for (; number < num_points; number++) { +++ *iBufferPtr++ = +++ ((int16_t)*complexVectorPtr++) * +++ 256; // load 8 bit Complexvector into 16 bit, shift left by 8 bits and store +++ *qBufferPtr++ = ((int16_t)*complexVectorPtr++) * 256; +++ } ++ } ++ #endif /* LV_HAVE_AVX */ ++ ++ ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void ++-volk_8ic_deinterleave_16i_x2_generic(int16_t* iBuffer, int16_t* qBuffer, ++- const lv_8sc_t* complexVector, unsigned int num_points) +++static inline void volk_8ic_deinterleave_16i_x2_generic(int16_t* iBuffer, +++ int16_t* qBuffer, +++ const lv_8sc_t* complexVector, +++ unsigned int num_points) ++ { ++- const int8_t* complexVectorPtr = (const int8_t*)complexVector; ++- int16_t* iBufferPtr = iBuffer; ++- int16_t* qBufferPtr = qBuffer; ++- unsigned int number; ++- for(number = 0; number < num_points; number++){ ++- *iBufferPtr++ = (int16_t)(*complexVectorPtr++)*256; ++- *qBufferPtr++ = (int16_t)(*complexVectorPtr++)*256; ++- } +++ const int8_t* complexVectorPtr = (const int8_t*)complexVector; +++ int16_t* iBufferPtr = iBuffer; +++ int16_t* qBufferPtr = qBuffer; +++ unsigned int number; +++ for (number = 0; number < num_points; number++) { +++ *iBufferPtr++ = (int16_t)(*complexVectorPtr++) * 
256; +++ *qBufferPtr++ = (int16_t)(*complexVectorPtr++) * 256; +++ } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++ ++- ++ #endif /* INCLUDED_volk_8ic_deinterleave_16i_x2_a_H */ ++ ++ #ifndef INCLUDED_volk_8ic_deinterleave_16i_x2_u_H ++@@ -243,47 +327,82 @@ volk_8ic_deinterleave_16i_x2_generic(int16_t* iBuffer, int16_t* qBuffer, ++ #ifdef LV_HAVE_AVX2 ++ #include ++ ++-static inline void ++-volk_8ic_deinterleave_16i_x2_u_avx2(int16_t* iBuffer, int16_t* qBuffer, ++- const lv_8sc_t* complexVector, unsigned int num_points) +++static inline void volk_8ic_deinterleave_16i_x2_u_avx2(int16_t* iBuffer, +++ int16_t* qBuffer, +++ const lv_8sc_t* complexVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const int8_t* complexVectorPtr = (int8_t*)complexVector; ++- int16_t* iBufferPtr = iBuffer; ++- int16_t* qBufferPtr = qBuffer; ++- __m256i MoveMask = _mm256_set_epi8(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0, 15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0); ++- __m256i complexVal, iOutputVal, qOutputVal; ++- __m128i iOutputVal0, qOutputVal0; ++- ++- unsigned int sixteenthPoints = num_points / 16; ++- ++- for(number = 0; number < sixteenthPoints; number++){ ++- complexVal = _mm256_loadu_si256((__m256i*)complexVectorPtr); complexVectorPtr += 32; ++- ++- complexVal = _mm256_shuffle_epi8(complexVal, MoveMask); ++- complexVal = _mm256_permute4x64_epi64(complexVal, 0xd8); ++- ++- iOutputVal0 = _mm256_extracti128_si256(complexVal, 0); ++- qOutputVal0 = _mm256_extracti128_si256(complexVal, 1); ++- ++- iOutputVal = _mm256_cvtepi8_epi16(iOutputVal0); ++- iOutputVal = _mm256_slli_epi16(iOutputVal, 8); ++- ++- qOutputVal = _mm256_cvtepi8_epi16(qOutputVal0); ++- qOutputVal = _mm256_slli_epi16(qOutputVal, 8); ++- ++- _mm256_storeu_si256((__m256i*)iBufferPtr, iOutputVal); ++- _mm256_storeu_si256((__m256i*)qBufferPtr, qOutputVal); ++- ++- iBufferPtr += 16; ++- qBufferPtr += 16; ++- } ++- ++- number = sixteenthPoints * 16; ++- for(; number < num_points; number++){ ++- *iBufferPtr++ = ((int16_t)*complexVectorPtr++) * 256; // load 8 bit Complexvector into 16 bit, shift left by 8 bits and store ++- *qBufferPtr++ = ((int16_t)*complexVectorPtr++) * 256; ++- } +++ unsigned int number = 0; +++ const int8_t* complexVectorPtr = (int8_t*)complexVector; +++ int16_t* iBufferPtr = iBuffer; +++ int16_t* qBufferPtr = qBuffer; +++ __m256i MoveMask = _mm256_set_epi8(15, +++ 13, +++ 11, +++ 9, +++ 7, +++ 5, +++ 3, +++ 1, +++ 14, +++ 12, +++ 10, +++ 8, +++ 6, +++ 4, +++ 2, +++ 0, +++ 15, +++ 13, +++ 11, +++ 9, +++ 7, +++ 5, +++ 3, +++ 1, +++ 14, +++ 12, +++ 10, +++ 8, +++ 6, +++ 4, +++ 2, +++ 0); +++ __m256i complexVal, iOutputVal, qOutputVal; +++ __m128i iOutputVal0, qOutputVal0; +++ +++ unsigned int sixteenthPoints = num_points / 16; +++ +++ for (number = 0; number < sixteenthPoints; number++) { +++ complexVal = _mm256_loadu_si256((__m256i*)complexVectorPtr); +++ complexVectorPtr += 32; +++ +++ complexVal = _mm256_shuffle_epi8(complexVal, MoveMask); +++ complexVal = _mm256_permute4x64_epi64(complexVal, 0xd8); +++ +++ iOutputVal0 = _mm256_extracti128_si256(complexVal, 0); +++ qOutputVal0 = _mm256_extracti128_si256(complexVal, 1); +++ +++ iOutputVal = _mm256_cvtepi8_epi16(iOutputVal0); +++ iOutputVal = _mm256_slli_epi16(iOutputVal, 8); +++ +++ qOutputVal = _mm256_cvtepi8_epi16(qOutputVal0); +++ qOutputVal = _mm256_slli_epi16(qOutputVal, 8); +++ +++ _mm256_storeu_si256((__m256i*)iBufferPtr, iOutputVal); +++ _mm256_storeu_si256((__m256i*)qBufferPtr, qOutputVal); +++ +++ iBufferPtr += 16; +++ qBufferPtr 
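As above, a minimal usage sketch for volk_8ic_deinterleave_real_16i based on the prototype quoted in the preceding hunk; the helper name and buffer size are illustrative assumptions. The kernels in the following hunk keep only the real (I) component and scale it by 128 (a left shift by 7).

#include <volk/volk.h>

static void example_8ic_deinterleave_real_16i(const lv_8sc_t* complex_in,
                                              unsigned int num_points)
{
    int16_t* i_buf =
        (int16_t*)volk_malloc(num_points * sizeof(int16_t), volk_get_alignment());

    /* Extracts the real part of each 8-bit complex sample as int16, scaled by 128 */
    volk_8ic_deinterleave_real_16i(i_buf, complex_in, num_points);

    volk_free(i_buf);
}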
+= 16; +++ } +++ +++ number = sixteenthPoints * 16; +++ for (; number < num_points; number++) { +++ *iBufferPtr++ = +++ ((int16_t)*complexVectorPtr++) * +++ 256; // load 8 bit Complexvector into 16 bit, shift left by 8 bits and store +++ *qBufferPtr++ = ((int16_t)*complexVectorPtr++) * 256; +++ } ++ } ++ #endif /* LV_HAVE_AVX2 */ ++ #endif /* INCLUDED_volk_8ic_deinterleave_16i_x2_u_H */ ++diff --git a/kernels/volk/volk_8ic_deinterleave_real_16i.h b/kernels/volk/volk_8ic_deinterleave_real_16i.h ++index f15879a..aaebb47 100644 ++--- a/kernels/volk/volk_8ic_deinterleave_real_16i.h +++++ b/kernels/volk/volk_8ic_deinterleave_real_16i.h ++@@ -30,8 +30,8 @@ ++ * ++ * Dispatcher Prototype ++ * \code ++- * void volk_8ic_deinterleave_real_16i(int16_t* iBuffer, const lv_8sc_t* complexVector, unsigned int num_points) ++- * \endcode +++ * void volk_8ic_deinterleave_real_16i(int16_t* iBuffer, const lv_8sc_t* complexVector, +++ * unsigned int num_points) \endcode ++ * ++ * \b Inputs ++ * \li complexVector: The complex input vector. ++@@ -60,75 +60,109 @@ ++ #ifdef LV_HAVE_AVX2 ++ #include ++ ++-static inline void ++-volk_8ic_deinterleave_real_16i_a_avx2(int16_t* iBuffer, const lv_8sc_t* complexVector, ++- unsigned int num_points) +++static inline void volk_8ic_deinterleave_real_16i_a_avx2(int16_t* iBuffer, +++ const lv_8sc_t* complexVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const int8_t* complexVectorPtr = (int8_t*)complexVector; ++- int16_t* iBufferPtr = iBuffer; ++- __m256i moveMask = _mm256_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0); ++- __m256i complexVal, outputVal; ++- __m128i outputVal0; ++- ++- unsigned int sixteenthPoints = num_points / 16; ++- ++- for(number = 0; number < sixteenthPoints; number++){ ++- complexVal = _mm256_load_si256((__m256i*)complexVectorPtr); complexVectorPtr += 32; ++- ++- complexVal = _mm256_shuffle_epi8(complexVal, moveMask); ++- complexVal = _mm256_permute4x64_epi64(complexVal, 0xd8); ++- ++- outputVal0 = _mm256_extractf128_si256(complexVal, 0); ++- ++- outputVal = _mm256_cvtepi8_epi16(outputVal0); ++- outputVal = _mm256_slli_epi16(outputVal, 7); ++- ++- _mm256_store_si256((__m256i*)iBufferPtr, outputVal); ++- ++- iBufferPtr += 16; ++- } ++- ++- number = sixteenthPoints * 16; ++- for(; number < num_points; number++){ ++- *iBufferPtr++ = ((int16_t)*complexVectorPtr++) * 128; ++- complexVectorPtr++; ++- } +++ unsigned int number = 0; +++ const int8_t* complexVectorPtr = (int8_t*)complexVector; +++ int16_t* iBufferPtr = iBuffer; +++ __m256i moveMask = _mm256_set_epi8(0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 14, +++ 12, +++ 10, +++ 8, +++ 6, +++ 4, +++ 2, +++ 0, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 14, +++ 12, +++ 10, +++ 8, +++ 6, +++ 4, +++ 2, +++ 0); +++ __m256i complexVal, outputVal; +++ __m128i outputVal0; +++ +++ unsigned int sixteenthPoints = num_points / 16; +++ +++ for (number = 0; number < sixteenthPoints; number++) { +++ complexVal = _mm256_load_si256((__m256i*)complexVectorPtr); +++ complexVectorPtr += 32; +++ +++ complexVal = _mm256_shuffle_epi8(complexVal, moveMask); +++ complexVal = _mm256_permute4x64_epi64(complexVal, 0xd8); +++ +++ outputVal0 = _mm256_extractf128_si256(complexVal, 0); +++ +++ outputVal = _mm256_cvtepi8_epi16(outputVal0); +++ outputVal = _mm256_slli_epi16(outputVal, 7); +++ +++ 
_mm256_store_si256((__m256i*)iBufferPtr, outputVal); +++ +++ iBufferPtr += 16; +++ } +++ +++ number = sixteenthPoints * 16; +++ for (; number < num_points; number++) { +++ *iBufferPtr++ = ((int16_t)*complexVectorPtr++) * 128; +++ complexVectorPtr++; +++ } ++ } ++ #endif /* LV_HAVE_AVX2 */ ++ ++ #ifdef LV_HAVE_SSE4_1 ++ #include ++ ++-static inline void ++-volk_8ic_deinterleave_real_16i_a_sse4_1(int16_t* iBuffer, const lv_8sc_t* complexVector, ++- unsigned int num_points) +++static inline void volk_8ic_deinterleave_real_16i_a_sse4_1(int16_t* iBuffer, +++ const lv_8sc_t* complexVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const int8_t* complexVectorPtr = (int8_t*)complexVector; ++- int16_t* iBufferPtr = iBuffer; ++- __m128i moveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0); ++- __m128i complexVal, outputVal; +++ unsigned int number = 0; +++ const int8_t* complexVectorPtr = (int8_t*)complexVector; +++ int16_t* iBufferPtr = iBuffer; +++ __m128i moveMask = _mm_set_epi8( +++ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0); +++ __m128i complexVal, outputVal; ++ ++- unsigned int eighthPoints = num_points / 8; +++ unsigned int eighthPoints = num_points / 8; ++ ++- for(number = 0; number < eighthPoints; number++){ ++- complexVal = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; +++ for (number = 0; number < eighthPoints; number++) { +++ complexVal = _mm_load_si128((__m128i*)complexVectorPtr); +++ complexVectorPtr += 16; ++ ++- complexVal = _mm_shuffle_epi8(complexVal, moveMask); +++ complexVal = _mm_shuffle_epi8(complexVal, moveMask); ++ ++- outputVal = _mm_cvtepi8_epi16(complexVal); ++- outputVal = _mm_slli_epi16(outputVal, 7); +++ outputVal = _mm_cvtepi8_epi16(complexVal); +++ outputVal = _mm_slli_epi16(outputVal, 7); ++ ++- _mm_store_si128((__m128i*)iBufferPtr, outputVal); ++- iBufferPtr += 8; ++- } +++ _mm_store_si128((__m128i*)iBufferPtr, outputVal); +++ iBufferPtr += 8; +++ } ++ ++- number = eighthPoints * 8; ++- for(; number < num_points; number++){ ++- *iBufferPtr++ = ((int16_t)*complexVectorPtr++) * 128; ++- complexVectorPtr++; ++- } +++ number = eighthPoints * 8; +++ for (; number < num_points; number++) { +++ *iBufferPtr++ = ((int16_t)*complexVectorPtr++) * 128; +++ complexVectorPtr++; +++ } ++ } ++ #endif /* LV_HAVE_SSE4_1 */ ++ ++@@ -136,63 +170,65 @@ volk_8ic_deinterleave_real_16i_a_sse4_1(int16_t* iBuffer, const lv_8sc_t* comple ++ #ifdef LV_HAVE_AVX ++ #include ++ ++-static inline void ++-volk_8ic_deinterleave_real_16i_a_avx(int16_t* iBuffer, const lv_8sc_t* complexVector, ++- unsigned int num_points) +++static inline void volk_8ic_deinterleave_real_16i_a_avx(int16_t* iBuffer, +++ const lv_8sc_t* complexVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const int8_t* complexVectorPtr = (int8_t*)complexVector; ++- int16_t* iBufferPtr = iBuffer; ++- __m128i moveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0); ++- __m256i complexVal, outputVal; ++- __m128i complexVal1, complexVal0, outputVal1, outputVal0; ++- ++- unsigned int sixteenthPoints = num_points / 16; ++- ++- for(number = 0; number < sixteenthPoints; number++){ ++- complexVal = _mm256_load_si256((__m256i*)complexVectorPtr); complexVectorPtr += 32; ++- ++- complexVal1 = _mm256_extractf128_si256(complexVal, 1); ++- complexVal0 = _mm256_extractf128_si256(complexVal, 0); ++- ++- outputVal1 = _mm_shuffle_epi8(complexVal1, moveMask); ++- 
outputVal0 = _mm_shuffle_epi8(complexVal0, moveMask); ++- ++- outputVal1 = _mm_cvtepi8_epi16(outputVal1); ++- outputVal1 = _mm_slli_epi16(outputVal1, 7); ++- outputVal0 = _mm_cvtepi8_epi16(outputVal0); ++- outputVal0 = _mm_slli_epi16(outputVal0, 7); ++- ++- __m256i dummy = _mm256_setzero_si256(); ++- outputVal = _mm256_insertf128_si256(dummy, outputVal0, 0); ++- outputVal = _mm256_insertf128_si256(outputVal, outputVal1, 1); ++- _mm256_store_si256((__m256i*)iBufferPtr, outputVal); ++- ++- iBufferPtr += 16; ++- } ++- ++- number = sixteenthPoints * 16; ++- for(; number < num_points; number++){ ++- *iBufferPtr++ = ((int16_t)*complexVectorPtr++) * 128; ++- complexVectorPtr++; ++- } +++ unsigned int number = 0; +++ const int8_t* complexVectorPtr = (int8_t*)complexVector; +++ int16_t* iBufferPtr = iBuffer; +++ __m128i moveMask = _mm_set_epi8( +++ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0); +++ __m256i complexVal, outputVal; +++ __m128i complexVal1, complexVal0, outputVal1, outputVal0; +++ +++ unsigned int sixteenthPoints = num_points / 16; +++ +++ for (number = 0; number < sixteenthPoints; number++) { +++ complexVal = _mm256_load_si256((__m256i*)complexVectorPtr); +++ complexVectorPtr += 32; +++ +++ complexVal1 = _mm256_extractf128_si256(complexVal, 1); +++ complexVal0 = _mm256_extractf128_si256(complexVal, 0); +++ +++ outputVal1 = _mm_shuffle_epi8(complexVal1, moveMask); +++ outputVal0 = _mm_shuffle_epi8(complexVal0, moveMask); +++ +++ outputVal1 = _mm_cvtepi8_epi16(outputVal1); +++ outputVal1 = _mm_slli_epi16(outputVal1, 7); +++ outputVal0 = _mm_cvtepi8_epi16(outputVal0); +++ outputVal0 = _mm_slli_epi16(outputVal0, 7); +++ +++ __m256i dummy = _mm256_setzero_si256(); +++ outputVal = _mm256_insertf128_si256(dummy, outputVal0, 0); +++ outputVal = _mm256_insertf128_si256(outputVal, outputVal1, 1); +++ _mm256_store_si256((__m256i*)iBufferPtr, outputVal); +++ +++ iBufferPtr += 16; +++ } +++ +++ number = sixteenthPoints * 16; +++ for (; number < num_points; number++) { +++ *iBufferPtr++ = ((int16_t)*complexVectorPtr++) * 128; +++ complexVectorPtr++; +++ } ++ } ++ #endif /* LV_HAVE_AVX */ ++ ++ ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void ++-volk_8ic_deinterleave_real_16i_generic(int16_t* iBuffer, const lv_8sc_t* complexVector, ++- unsigned int num_points) +++static inline void volk_8ic_deinterleave_real_16i_generic(int16_t* iBuffer, +++ const lv_8sc_t* complexVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const int8_t* complexVectorPtr = (const int8_t*)complexVector; ++- int16_t* iBufferPtr = iBuffer; ++- for(number = 0; number < num_points; number++){ ++- *iBufferPtr++ = ((int16_t)(*complexVectorPtr++)) * 128; ++- complexVectorPtr++; ++- } +++ unsigned int number = 0; +++ const int8_t* complexVectorPtr = (const int8_t*)complexVector; +++ int16_t* iBufferPtr = iBuffer; +++ for (number = 0; number < num_points; number++) { +++ *iBufferPtr++ = ((int16_t)(*complexVectorPtr++)) * 128; +++ complexVectorPtr++; +++ } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++@@ -209,40 +245,72 @@ volk_8ic_deinterleave_real_16i_generic(int16_t* iBuffer, const lv_8sc_t* complex ++ #ifdef LV_HAVE_AVX2 ++ #include ++ ++-static inline void ++-volk_8ic_deinterleave_real_16i_u_avx2(int16_t* iBuffer, const lv_8sc_t* complexVector, ++- unsigned int num_points) +++static inline void volk_8ic_deinterleave_real_16i_u_avx2(int16_t* iBuffer, +++ const lv_8sc_t* complexVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const int8_t* complexVectorPtr = 
(int8_t*)complexVector; ++- int16_t* iBufferPtr = iBuffer; ++- __m256i moveMask = _mm256_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0); ++- __m256i complexVal, outputVal; ++- __m128i outputVal0; ++- ++- unsigned int sixteenthPoints = num_points / 16; ++- ++- for(number = 0; number < sixteenthPoints; number++){ ++- complexVal = _mm256_loadu_si256((__m256i*)complexVectorPtr); complexVectorPtr += 32; ++- ++- complexVal = _mm256_shuffle_epi8(complexVal, moveMask); ++- complexVal = _mm256_permute4x64_epi64(complexVal, 0xd8); ++- ++- outputVal0 = _mm256_extractf128_si256(complexVal, 0); ++- ++- outputVal = _mm256_cvtepi8_epi16(outputVal0); ++- outputVal = _mm256_slli_epi16(outputVal, 7); ++- ++- _mm256_storeu_si256((__m256i*)iBufferPtr, outputVal); ++- ++- iBufferPtr += 16; ++- } ++- ++- number = sixteenthPoints * 16; ++- for(; number < num_points; number++){ ++- *iBufferPtr++ = ((int16_t)*complexVectorPtr++) * 128; ++- complexVectorPtr++; ++- } +++ unsigned int number = 0; +++ const int8_t* complexVectorPtr = (int8_t*)complexVector; +++ int16_t* iBufferPtr = iBuffer; +++ __m256i moveMask = _mm256_set_epi8(0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 14, +++ 12, +++ 10, +++ 8, +++ 6, +++ 4, +++ 2, +++ 0, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 14, +++ 12, +++ 10, +++ 8, +++ 6, +++ 4, +++ 2, +++ 0); +++ __m256i complexVal, outputVal; +++ __m128i outputVal0; +++ +++ unsigned int sixteenthPoints = num_points / 16; +++ +++ for (number = 0; number < sixteenthPoints; number++) { +++ complexVal = _mm256_loadu_si256((__m256i*)complexVectorPtr); +++ complexVectorPtr += 32; +++ +++ complexVal = _mm256_shuffle_epi8(complexVal, moveMask); +++ complexVal = _mm256_permute4x64_epi64(complexVal, 0xd8); +++ +++ outputVal0 = _mm256_extractf128_si256(complexVal, 0); +++ +++ outputVal = _mm256_cvtepi8_epi16(outputVal0); +++ outputVal = _mm256_slli_epi16(outputVal, 7); +++ +++ _mm256_storeu_si256((__m256i*)iBufferPtr, outputVal); +++ +++ iBufferPtr += 16; +++ } +++ +++ number = sixteenthPoints * 16; +++ for (; number < num_points; number++) { +++ *iBufferPtr++ = ((int16_t)*complexVectorPtr++) * 128; +++ complexVectorPtr++; +++ } ++ } ++ #endif /* LV_HAVE_AVX2 */ ++ #endif /* INCLUDED_volk_8ic_deinterleave_real_16i_u_H */ ++diff --git a/kernels/volk/volk_8ic_deinterleave_real_8i.h b/kernels/volk/volk_8ic_deinterleave_real_8i.h ++index 6cc3f15..a1a835d 100644 ++--- a/kernels/volk/volk_8ic_deinterleave_real_8i.h +++++ b/kernels/volk/volk_8ic_deinterleave_real_8i.h ++@@ -30,8 +30,8 @@ ++ * ++ * Dispatcher Prototype ++ * \code ++- * void volk_8ic_deinterleave_real_8i(int8_t* iBuffer, const lv_8sc_t* complexVector, unsigned int num_points) ++- * \endcode +++ * void volk_8ic_deinterleave_real_8i(int8_t* iBuffer, const lv_8sc_t* complexVector, +++ * unsigned int num_points) \endcode ++ * ++ * \b Inputs ++ * \li complexVector: The complex input vector. 
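Likewise, a minimal usage sketch for volk_8ic_deinterleave_real_8i from the prototype quoted above; the helper name is an illustrative assumption. Unlike the 16i variant, the scalar tail loops in the following hunk copy the real byte through unscaled.

#include <volk/volk.h>

static void example_8ic_deinterleave_real_8i(const lv_8sc_t* complex_in,
                                             unsigned int num_points)
{
    int8_t* i_buf =
        (int8_t*)volk_malloc(num_points * sizeof(int8_t), volk_get_alignment());

    /* Copies the real (I) byte of each 8-bit complex sample, no scaling */
    volk_8ic_deinterleave_real_8i(i_buf, complex_in, num_points);

    volk_free(i_buf);
}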
++@@ -59,40 +59,102 @@ ++ #ifdef LV_HAVE_AVX2 ++ #include ++ ++-static inline void ++-volk_8ic_deinterleave_real_8i_a_avx2(int8_t* iBuffer, const lv_8sc_t* complexVector, ++- unsigned int num_points) +++static inline void volk_8ic_deinterleave_real_8i_a_avx2(int8_t* iBuffer, +++ const lv_8sc_t* complexVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const int8_t* complexVectorPtr = (int8_t*)complexVector; ++- int8_t* iBufferPtr = iBuffer; ++- __m256i moveMask1 = _mm256_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0); ++- __m256i moveMask2 = _mm256_set_epi8(14, 12, 10, 8, 6, 4, 2, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80); ++- __m256i complexVal1, complexVal2, outputVal; ++- ++- unsigned int thirtysecondPoints = num_points / 32; ++- ++- for(number = 0; number < thirtysecondPoints; number++){ ++- ++- complexVal1 = _mm256_load_si256((__m256i*)complexVectorPtr); ++- complexVectorPtr += 32; ++- complexVal2 = _mm256_load_si256((__m256i*)complexVectorPtr); ++- complexVectorPtr += 32; ++- ++- complexVal1 = _mm256_shuffle_epi8(complexVal1, moveMask1); ++- complexVal2 = _mm256_shuffle_epi8(complexVal2, moveMask2); ++- outputVal = _mm256_or_si256(complexVal1, complexVal2); ++- outputVal = _mm256_permute4x64_epi64(outputVal, 0xd8); ++- ++- _mm256_store_si256((__m256i*)iBufferPtr, outputVal); ++- iBufferPtr += 32; ++- } ++- ++- number = thirtysecondPoints * 32; ++- for(; number < num_points; number++){ ++- *iBufferPtr++ = *complexVectorPtr++; ++- complexVectorPtr++; ++- } +++ unsigned int number = 0; +++ const int8_t* complexVectorPtr = (int8_t*)complexVector; +++ int8_t* iBufferPtr = iBuffer; +++ __m256i moveMask1 = _mm256_set_epi8(0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 14, +++ 12, +++ 10, +++ 8, +++ 6, +++ 4, +++ 2, +++ 0, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 14, +++ 12, +++ 10, +++ 8, +++ 6, +++ 4, +++ 2, +++ 0); +++ __m256i moveMask2 = _mm256_set_epi8(14, +++ 12, +++ 10, +++ 8, +++ 6, +++ 4, +++ 2, +++ 0, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 14, +++ 12, +++ 10, +++ 8, +++ 6, +++ 4, +++ 2, +++ 0, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80); +++ __m256i complexVal1, complexVal2, outputVal; +++ +++ unsigned int thirtysecondPoints = num_points / 32; +++ +++ for (number = 0; number < thirtysecondPoints; number++) { +++ +++ complexVal1 = _mm256_load_si256((__m256i*)complexVectorPtr); +++ complexVectorPtr += 32; +++ complexVal2 = _mm256_load_si256((__m256i*)complexVectorPtr); +++ complexVectorPtr += 32; +++ +++ complexVal1 = _mm256_shuffle_epi8(complexVal1, moveMask1); +++ complexVal2 = _mm256_shuffle_epi8(complexVal2, moveMask2); +++ outputVal = _mm256_or_si256(complexVal1, complexVal2); +++ outputVal = _mm256_permute4x64_epi64(outputVal, 0xd8); +++ +++ _mm256_store_si256((__m256i*)iBufferPtr, outputVal); +++ iBufferPtr += 32; +++ } +++ +++ number = thirtysecondPoints * 32; +++ for (; number < num_points; number++) { +++ *iBufferPtr++ = *complexVectorPtr++; +++ complexVectorPtr++; +++ } ++ } ++ #endif /* LV_HAVE_AVX2 */ ++ ++@@ -100,37 +162,41 @@ volk_8ic_deinterleave_real_8i_a_avx2(int8_t* iBuffer, const lv_8sc_t* complexVec ++ #ifdef LV_HAVE_SSSE3 ++ #include ++ ++-static inline void 
++-volk_8ic_deinterleave_real_8i_a_ssse3(int8_t* iBuffer, const lv_8sc_t* complexVector, ++- unsigned int num_points) +++static inline void volk_8ic_deinterleave_real_8i_a_ssse3(int8_t* iBuffer, +++ const lv_8sc_t* complexVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const int8_t* complexVectorPtr = (int8_t*)complexVector; ++- int8_t* iBufferPtr = iBuffer; ++- __m128i moveMask1 = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0); ++- __m128i moveMask2 = _mm_set_epi8(14, 12, 10, 8, 6, 4, 2, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80); ++- __m128i complexVal1, complexVal2, outputVal; ++- ++- unsigned int sixteenthPoints = num_points / 16; ++- ++- for(number = 0; number < sixteenthPoints; number++){ ++- complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; ++- complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; ++- ++- complexVal1 = _mm_shuffle_epi8(complexVal1, moveMask1); ++- complexVal2 = _mm_shuffle_epi8(complexVal2, moveMask2); ++- ++- outputVal = _mm_or_si128(complexVal1, complexVal2); ++- ++- _mm_store_si128((__m128i*)iBufferPtr, outputVal); ++- iBufferPtr += 16; ++- } ++- ++- number = sixteenthPoints * 16; ++- for(; number < num_points; number++){ ++- *iBufferPtr++ = *complexVectorPtr++; ++- complexVectorPtr++; ++- } +++ unsigned int number = 0; +++ const int8_t* complexVectorPtr = (int8_t*)complexVector; +++ int8_t* iBufferPtr = iBuffer; +++ __m128i moveMask1 = _mm_set_epi8( +++ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0); +++ __m128i moveMask2 = _mm_set_epi8( +++ 14, 12, 10, 8, 6, 4, 2, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80); +++ __m128i complexVal1, complexVal2, outputVal; +++ +++ unsigned int sixteenthPoints = num_points / 16; +++ +++ for (number = 0; number < sixteenthPoints; number++) { +++ complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr); +++ complexVectorPtr += 16; +++ complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr); +++ complexVectorPtr += 16; +++ +++ complexVal1 = _mm_shuffle_epi8(complexVal1, moveMask1); +++ complexVal2 = _mm_shuffle_epi8(complexVal2, moveMask2); +++ +++ outputVal = _mm_or_si128(complexVal1, complexVal2); +++ +++ _mm_store_si128((__m128i*)iBufferPtr, outputVal); +++ iBufferPtr += 16; +++ } +++ +++ number = sixteenthPoints * 16; +++ for (; number < num_points; number++) { +++ *iBufferPtr++ = *complexVectorPtr++; +++ complexVectorPtr++; +++ } ++ } ++ #endif /* LV_HAVE_SSSE3 */ ++ ++@@ -138,72 +204,75 @@ volk_8ic_deinterleave_real_8i_a_ssse3(int8_t* iBuffer, const lv_8sc_t* complexVe ++ #ifdef LV_HAVE_AVX ++ #include ++ ++-static inline void ++-volk_8ic_deinterleave_real_8i_a_avx(int8_t* iBuffer, const lv_8sc_t* complexVector, ++- unsigned int num_points) +++static inline void volk_8ic_deinterleave_real_8i_a_avx(int8_t* iBuffer, +++ const lv_8sc_t* complexVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const int8_t* complexVectorPtr = (int8_t*)complexVector; ++- int8_t* iBufferPtr = iBuffer; ++- __m128i moveMaskL = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0); ++- __m128i moveMaskH = _mm_set_epi8(14, 12, 10, 8, 6, 4, 2, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80); ++- __m256i complexVal1, complexVal2, outputVal; ++- __m128i complexVal1H, complexVal1L, complexVal2H, complexVal2L, outputVal1, outputVal2; ++- ++- unsigned int thirtysecondPoints = num_points / 32; ++- ++- for(number = 0; number < 
thirtysecondPoints; number++){ ++- ++- complexVal1 = _mm256_load_si256((__m256i*)complexVectorPtr); ++- complexVectorPtr += 32; ++- complexVal2 = _mm256_load_si256((__m256i*)complexVectorPtr); ++- complexVectorPtr += 32; ++- ++- complexVal1H = _mm256_extractf128_si256(complexVal1, 1); ++- complexVal1L = _mm256_extractf128_si256(complexVal1, 0); ++- complexVal2H = _mm256_extractf128_si256(complexVal2, 1); ++- complexVal2L = _mm256_extractf128_si256(complexVal2, 0); ++- ++- complexVal1H = _mm_shuffle_epi8(complexVal1H, moveMaskH); ++- complexVal1L = _mm_shuffle_epi8(complexVal1L, moveMaskL); ++- outputVal1 = _mm_or_si128(complexVal1H, complexVal1L); ++- ++- ++- complexVal2H = _mm_shuffle_epi8(complexVal2H, moveMaskH); ++- complexVal2L = _mm_shuffle_epi8(complexVal2L, moveMaskL); ++- outputVal2 = _mm_or_si128(complexVal2H, complexVal2L); ++- ++- __m256i dummy = _mm256_setzero_si256(); ++- outputVal = _mm256_insertf128_si256(dummy, outputVal1, 0); ++- outputVal = _mm256_insertf128_si256(outputVal, outputVal2, 1); ++- ++- ++- _mm256_store_si256((__m256i*)iBufferPtr, outputVal); ++- iBufferPtr += 32; ++- } ++- ++- number = thirtysecondPoints * 32; ++- for(; number < num_points; number++){ ++- *iBufferPtr++ = *complexVectorPtr++; ++- complexVectorPtr++; ++- } +++ unsigned int number = 0; +++ const int8_t* complexVectorPtr = (int8_t*)complexVector; +++ int8_t* iBufferPtr = iBuffer; +++ __m128i moveMaskL = _mm_set_epi8( +++ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0); +++ __m128i moveMaskH = _mm_set_epi8( +++ 14, 12, 10, 8, 6, 4, 2, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80); +++ __m256i complexVal1, complexVal2, outputVal; +++ __m128i complexVal1H, complexVal1L, complexVal2H, complexVal2L, outputVal1, +++ outputVal2; +++ +++ unsigned int thirtysecondPoints = num_points / 32; +++ +++ for (number = 0; number < thirtysecondPoints; number++) { +++ +++ complexVal1 = _mm256_load_si256((__m256i*)complexVectorPtr); +++ complexVectorPtr += 32; +++ complexVal2 = _mm256_load_si256((__m256i*)complexVectorPtr); +++ complexVectorPtr += 32; +++ +++ complexVal1H = _mm256_extractf128_si256(complexVal1, 1); +++ complexVal1L = _mm256_extractf128_si256(complexVal1, 0); +++ complexVal2H = _mm256_extractf128_si256(complexVal2, 1); +++ complexVal2L = _mm256_extractf128_si256(complexVal2, 0); +++ +++ complexVal1H = _mm_shuffle_epi8(complexVal1H, moveMaskH); +++ complexVal1L = _mm_shuffle_epi8(complexVal1L, moveMaskL); +++ outputVal1 = _mm_or_si128(complexVal1H, complexVal1L); +++ +++ +++ complexVal2H = _mm_shuffle_epi8(complexVal2H, moveMaskH); +++ complexVal2L = _mm_shuffle_epi8(complexVal2L, moveMaskL); +++ outputVal2 = _mm_or_si128(complexVal2H, complexVal2L); +++ +++ __m256i dummy = _mm256_setzero_si256(); +++ outputVal = _mm256_insertf128_si256(dummy, outputVal1, 0); +++ outputVal = _mm256_insertf128_si256(outputVal, outputVal2, 1); +++ +++ +++ _mm256_store_si256((__m256i*)iBufferPtr, outputVal); +++ iBufferPtr += 32; +++ } +++ +++ number = thirtysecondPoints * 32; +++ for (; number < num_points; number++) { +++ *iBufferPtr++ = *complexVectorPtr++; +++ complexVectorPtr++; +++ } ++ } ++ #endif /* LV_HAVE_AVX */ ++ ++ ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void ++-volk_8ic_deinterleave_real_8i_generic(int8_t* iBuffer, const lv_8sc_t* complexVector, ++- unsigned int num_points) +++static inline void volk_8ic_deinterleave_real_8i_generic(int8_t* iBuffer, +++ const lv_8sc_t* complexVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const int8_t* 
complexVectorPtr = (int8_t*)complexVector; ++- int8_t* iBufferPtr = iBuffer; ++- for(number = 0; number < num_points; number++){ ++- *iBufferPtr++ = *complexVectorPtr++; ++- complexVectorPtr++; ++- } +++ unsigned int number = 0; +++ const int8_t* complexVectorPtr = (int8_t*)complexVector; +++ int8_t* iBufferPtr = iBuffer; +++ for (number = 0; number < num_points; number++) { +++ *iBufferPtr++ = *complexVectorPtr++; +++ complexVectorPtr++; +++ } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++@@ -211,26 +280,27 @@ volk_8ic_deinterleave_real_8i_generic(int8_t* iBuffer, const lv_8sc_t* complexVe ++ #ifdef LV_HAVE_NEON ++ #include ++ ++-static inline void ++-volk_8ic_deinterleave_real_8i_neon(int8_t* iBuffer, const lv_8sc_t* complexVector, unsigned int num_points) +++static inline void volk_8ic_deinterleave_real_8i_neon(int8_t* iBuffer, +++ const lv_8sc_t* complexVector, +++ unsigned int num_points) ++ { ++- unsigned int number; ++- unsigned int sixteenth_points = num_points / 16; ++- ++- int8x16x2_t input_vector; ++- for(number=0; number < sixteenth_points; ++number) { ++- input_vector = vld2q_s8((int8_t*) complexVector ); ++- vst1q_s8(iBuffer, input_vector.val[0]); ++- iBuffer += 16; ++- complexVector += 16; ++- } ++- ++- const int8_t* complexVectorPtr = (int8_t*)complexVector; ++- int8_t* iBufferPtr = iBuffer; ++- for(number = sixteenth_points*16; number < num_points; number++){ ++- *iBufferPtr++ = *complexVectorPtr++; ++- complexVectorPtr++; ++- } +++ unsigned int number; +++ unsigned int sixteenth_points = num_points / 16; +++ +++ int8x16x2_t input_vector; +++ for (number = 0; number < sixteenth_points; ++number) { +++ input_vector = vld2q_s8((int8_t*)complexVector); +++ vst1q_s8(iBuffer, input_vector.val[0]); +++ iBuffer += 16; +++ complexVector += 16; +++ } +++ +++ const int8_t* complexVectorPtr = (int8_t*)complexVector; +++ int8_t* iBufferPtr = iBuffer; +++ for (number = sixteenth_points * 16; number < num_points; number++) { +++ *iBufferPtr++ = *complexVectorPtr++; +++ complexVectorPtr++; +++ } ++ } ++ #endif /* LV_HAVE_NEON */ ++ ++@@ -246,40 +316,102 @@ volk_8ic_deinterleave_real_8i_neon(int8_t* iBuffer, const lv_8sc_t* complexVecto ++ #ifdef LV_HAVE_AVX2 ++ #include ++ ++-static inline void ++-volk_8ic_deinterleave_real_8i_u_avx2(int8_t* iBuffer, const lv_8sc_t* complexVector, ++- unsigned int num_points) +++static inline void volk_8ic_deinterleave_real_8i_u_avx2(int8_t* iBuffer, +++ const lv_8sc_t* complexVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const int8_t* complexVectorPtr = (int8_t*)complexVector; ++- int8_t* iBufferPtr = iBuffer; ++- __m256i moveMask1 = _mm256_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0); ++- __m256i moveMask2 = _mm256_set_epi8(14, 12, 10, 8, 6, 4, 2, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80); ++- __m256i complexVal1, complexVal2, outputVal; ++- ++- unsigned int thirtysecondPoints = num_points / 32; ++- ++- for(number = 0; number < thirtysecondPoints; number++){ ++- ++- complexVal1 = _mm256_loadu_si256((__m256i*)complexVectorPtr); ++- complexVectorPtr += 32; ++- complexVal2 = _mm256_loadu_si256((__m256i*)complexVectorPtr); ++- complexVectorPtr += 32; ++- ++- complexVal1 = _mm256_shuffle_epi8(complexVal1, moveMask1); ++- complexVal2 = _mm256_shuffle_epi8(complexVal2, moveMask2); ++- outputVal = _mm256_or_si256(complexVal1, complexVal2); 
++- outputVal = _mm256_permute4x64_epi64(outputVal, 0xd8); ++- ++- _mm256_storeu_si256((__m256i*)iBufferPtr, outputVal); ++- iBufferPtr += 32; ++- } ++- ++- number = thirtysecondPoints * 32; ++- for(; number < num_points; number++){ ++- *iBufferPtr++ = *complexVectorPtr++; ++- complexVectorPtr++; ++- } +++ unsigned int number = 0; +++ const int8_t* complexVectorPtr = (int8_t*)complexVector; +++ int8_t* iBufferPtr = iBuffer; +++ __m256i moveMask1 = _mm256_set_epi8(0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 14, +++ 12, +++ 10, +++ 8, +++ 6, +++ 4, +++ 2, +++ 0, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 14, +++ 12, +++ 10, +++ 8, +++ 6, +++ 4, +++ 2, +++ 0); +++ __m256i moveMask2 = _mm256_set_epi8(14, +++ 12, +++ 10, +++ 8, +++ 6, +++ 4, +++ 2, +++ 0, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 14, +++ 12, +++ 10, +++ 8, +++ 6, +++ 4, +++ 2, +++ 0, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80); +++ __m256i complexVal1, complexVal2, outputVal; +++ +++ unsigned int thirtysecondPoints = num_points / 32; +++ +++ for (number = 0; number < thirtysecondPoints; number++) { +++ +++ complexVal1 = _mm256_loadu_si256((__m256i*)complexVectorPtr); +++ complexVectorPtr += 32; +++ complexVal2 = _mm256_loadu_si256((__m256i*)complexVectorPtr); +++ complexVectorPtr += 32; +++ +++ complexVal1 = _mm256_shuffle_epi8(complexVal1, moveMask1); +++ complexVal2 = _mm256_shuffle_epi8(complexVal2, moveMask2); +++ outputVal = _mm256_or_si256(complexVal1, complexVal2); +++ outputVal = _mm256_permute4x64_epi64(outputVal, 0xd8); +++ +++ _mm256_storeu_si256((__m256i*)iBufferPtr, outputVal); +++ iBufferPtr += 32; +++ } +++ +++ number = thirtysecondPoints * 32; +++ for (; number < num_points; number++) { +++ *iBufferPtr++ = *complexVectorPtr++; +++ complexVectorPtr++; +++ } ++ } ++ #endif /* LV_HAVE_AVX2 */ ++ ++diff --git a/kernels/volk/volk_8ic_s32f_deinterleave_32f_x2.h b/kernels/volk/volk_8ic_s32f_deinterleave_32f_x2.h ++index 736f7c0..f622752 100644 ++--- a/kernels/volk/volk_8ic_s32f_deinterleave_32f_x2.h +++++ b/kernels/volk/volk_8ic_s32f_deinterleave_32f_x2.h ++@@ -31,8 +31,8 @@ ++ * ++ * Dispatcher Prototype ++ * \code ++- * void volk_8ic_s32f_deinterleave_32f_x2(float* iBuffer, float* qBuffer, const lv_8sc_t* complexVector, const float scalar, unsigned int num_points) ++- * \endcode +++ * void volk_8ic_s32f_deinterleave_32f_x2(float* iBuffer, float* qBuffer, const lv_8sc_t* +++ * complexVector, const float scalar, unsigned int num_points) \endcode ++ * ++ * \b Inputs ++ * \li complexVector: The complex input vector. 
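Before the reformatted SIMD bodies below, it may help to restate what every implementation of volk_8ic_s32f_deinterleave_32f_x2 computes; the arithmetic is exactly that of the generic kernel that appears further down in this file. A minimal scalar sketch follows (the function name is illustrative, not part of the library):

    #include <volk/volk_complex.h> /* lv_8sc_t */

    /* Split interleaved 8-bit complex samples into float I and Q buffers,
     * scaling each component by 1/scalar. */
    static void deinterleave_32f_x2_reference(float* iBuffer,
                                              float* qBuffer,
                                              const lv_8sc_t* complexVector,
                                              const float scalar,
                                              unsigned int num_points)
    {
        const int8_t* in = (const int8_t*)complexVector;
        const float invScalar = 1.0f / scalar;
        for (unsigned int n = 0; n < num_points; n++) {
            *iBuffer++ = (float)(*in++) * invScalar; /* real part */
            *qBuffer++ = (float)(*in++) * invScalar; /* imaginary part */
        }
    }
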
++@@ -56,74 +56,79 @@ ++ #ifndef INCLUDED_volk_8ic_s32f_deinterleave_32f_x2_a_H ++ #define INCLUDED_volk_8ic_s32f_deinterleave_32f_x2_a_H ++ ++-#include ++ #include ++ #include +++#include ++ ++ ++ #ifdef LV_HAVE_SSE4_1 ++ #include ++ ++ static inline void ++-volk_8ic_s32f_deinterleave_32f_x2_a_sse4_1(float* iBuffer, float* qBuffer, const lv_8sc_t* complexVector, ++- const float scalar, unsigned int num_points) +++volk_8ic_s32f_deinterleave_32f_x2_a_sse4_1(float* iBuffer, +++ float* qBuffer, +++ const lv_8sc_t* complexVector, +++ const float scalar, +++ unsigned int num_points) ++ { ++- float* iBufferPtr = iBuffer; ++- float* qBufferPtr = qBuffer; ++- ++- unsigned int number = 0; ++- const unsigned int eighthPoints = num_points / 8; ++- __m128 iFloatValue, qFloatValue; ++- ++- const float iScalar= 1.0 / scalar; ++- __m128 invScalar = _mm_set_ps1(iScalar); ++- __m128i complexVal, iIntVal, qIntVal, iComplexVal, qComplexVal; ++- int8_t* complexVectorPtr = (int8_t*)complexVector; ++- ++- __m128i iMoveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0); ++- __m128i qMoveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 15, 13, 11, 9, 7, 5, 3, 1); ++- ++- for(;number < eighthPoints; number++){ ++- complexVal = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; ++- iComplexVal = _mm_shuffle_epi8(complexVal, iMoveMask); ++- qComplexVal = _mm_shuffle_epi8(complexVal, qMoveMask); ++- ++- iIntVal = _mm_cvtepi8_epi32(iComplexVal); ++- iFloatValue = _mm_cvtepi32_ps(iIntVal); ++- iFloatValue = _mm_mul_ps(iFloatValue, invScalar); ++- _mm_store_ps(iBufferPtr, iFloatValue); ++- iBufferPtr += 4; ++- ++- iComplexVal = _mm_srli_si128(iComplexVal, 4); ++- ++- iIntVal = _mm_cvtepi8_epi32(iComplexVal); ++- iFloatValue = _mm_cvtepi32_ps(iIntVal); ++- iFloatValue = _mm_mul_ps(iFloatValue, invScalar); ++- _mm_store_ps(iBufferPtr, iFloatValue); ++- iBufferPtr += 4; ++- ++- qIntVal = _mm_cvtepi8_epi32(qComplexVal); ++- qFloatValue = _mm_cvtepi32_ps(qIntVal); ++- qFloatValue = _mm_mul_ps(qFloatValue, invScalar); ++- _mm_store_ps(qBufferPtr, qFloatValue); ++- qBufferPtr += 4; ++- ++- qComplexVal = _mm_srli_si128(qComplexVal, 4); ++- ++- qIntVal = _mm_cvtepi8_epi32(qComplexVal); ++- qFloatValue = _mm_cvtepi32_ps(qIntVal); ++- qFloatValue = _mm_mul_ps(qFloatValue, invScalar); ++- _mm_store_ps(qBufferPtr, qFloatValue); ++- ++- qBufferPtr += 4; ++- } ++- ++- number = eighthPoints * 8; ++- for(; number < num_points; number++){ ++- *iBufferPtr++ = (float)(*complexVectorPtr++) * iScalar; ++- *qBufferPtr++ = (float)(*complexVectorPtr++) * iScalar; ++- } ++- +++ float* iBufferPtr = iBuffer; +++ float* qBufferPtr = qBuffer; +++ +++ unsigned int number = 0; +++ const unsigned int eighthPoints = num_points / 8; +++ __m128 iFloatValue, qFloatValue; +++ +++ const float iScalar = 1.0 / scalar; +++ __m128 invScalar = _mm_set_ps1(iScalar); +++ __m128i complexVal, iIntVal, qIntVal, iComplexVal, qComplexVal; +++ int8_t* complexVectorPtr = (int8_t*)complexVector; +++ +++ __m128i iMoveMask = _mm_set_epi8( +++ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0); +++ __m128i qMoveMask = _mm_set_epi8( +++ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 15, 13, 11, 9, 7, 5, 3, 1); +++ +++ for (; number < eighthPoints; number++) { +++ complexVal = _mm_load_si128((__m128i*)complexVectorPtr); +++ complexVectorPtr += 16; +++ iComplexVal = _mm_shuffle_epi8(complexVal, iMoveMask); +++ qComplexVal = _mm_shuffle_epi8(complexVal, qMoveMask); +++ +++ 
iIntVal = _mm_cvtepi8_epi32(iComplexVal); +++ iFloatValue = _mm_cvtepi32_ps(iIntVal); +++ iFloatValue = _mm_mul_ps(iFloatValue, invScalar); +++ _mm_store_ps(iBufferPtr, iFloatValue); +++ iBufferPtr += 4; +++ +++ iComplexVal = _mm_srli_si128(iComplexVal, 4); +++ +++ iIntVal = _mm_cvtepi8_epi32(iComplexVal); +++ iFloatValue = _mm_cvtepi32_ps(iIntVal); +++ iFloatValue = _mm_mul_ps(iFloatValue, invScalar); +++ _mm_store_ps(iBufferPtr, iFloatValue); +++ iBufferPtr += 4; +++ +++ qIntVal = _mm_cvtepi8_epi32(qComplexVal); +++ qFloatValue = _mm_cvtepi32_ps(qIntVal); +++ qFloatValue = _mm_mul_ps(qFloatValue, invScalar); +++ _mm_store_ps(qBufferPtr, qFloatValue); +++ qBufferPtr += 4; +++ +++ qComplexVal = _mm_srli_si128(qComplexVal, 4); +++ +++ qIntVal = _mm_cvtepi8_epi32(qComplexVal); +++ qFloatValue = _mm_cvtepi32_ps(qIntVal); +++ qFloatValue = _mm_mul_ps(qFloatValue, invScalar); +++ _mm_store_ps(qBufferPtr, qFloatValue); +++ +++ qBufferPtr += 4; +++ } +++ +++ number = eighthPoints * 8; +++ for (; number < num_points; number++) { +++ *iBufferPtr++ = (float)(*complexVectorPtr++) * iScalar; +++ *qBufferPtr++ = (float)(*complexVectorPtr++) * iScalar; +++ } ++ } ++ #endif /* LV_HAVE_SSE4_1 */ ++ ++@@ -131,59 +136,60 @@ volk_8ic_s32f_deinterleave_32f_x2_a_sse4_1(float* iBuffer, float* qBuffer, const ++ #ifdef LV_HAVE_SSE ++ #include ++ ++-static inline void ++-volk_8ic_s32f_deinterleave_32f_x2_a_sse(float* iBuffer, float* qBuffer, ++- const lv_8sc_t* complexVector, ++- const float scalar, unsigned int num_points) +++static inline void volk_8ic_s32f_deinterleave_32f_x2_a_sse(float* iBuffer, +++ float* qBuffer, +++ const lv_8sc_t* complexVector, +++ const float scalar, +++ unsigned int num_points) ++ { ++- float* iBufferPtr = iBuffer; ++- float* qBufferPtr = qBuffer; +++ float* iBufferPtr = iBuffer; +++ float* qBufferPtr = qBuffer; ++ ++- unsigned int number = 0; ++- const unsigned int quarterPoints = num_points / 4; ++- __m128 cplxValue1, cplxValue2, iValue, qValue; +++ unsigned int number = 0; +++ const unsigned int quarterPoints = num_points / 4; +++ __m128 cplxValue1, cplxValue2, iValue, qValue; ++ ++- __m128 invScalar = _mm_set_ps1(1.0/scalar); ++- int8_t* complexVectorPtr = (int8_t*)complexVector; +++ __m128 invScalar = _mm_set_ps1(1.0 / scalar); +++ int8_t* complexVectorPtr = (int8_t*)complexVector; ++ ++- __VOLK_ATTR_ALIGNED(16) float floatBuffer[8]; +++ __VOLK_ATTR_ALIGNED(16) float floatBuffer[8]; ++ ++- for(;number < quarterPoints; number++){ ++- floatBuffer[0] = (float)(complexVectorPtr[0]); ++- floatBuffer[1] = (float)(complexVectorPtr[1]); ++- floatBuffer[2] = (float)(complexVectorPtr[2]); ++- floatBuffer[3] = (float)(complexVectorPtr[3]); +++ for (; number < quarterPoints; number++) { +++ floatBuffer[0] = (float)(complexVectorPtr[0]); +++ floatBuffer[1] = (float)(complexVectorPtr[1]); +++ floatBuffer[2] = (float)(complexVectorPtr[2]); +++ floatBuffer[3] = (float)(complexVectorPtr[3]); ++ ++- floatBuffer[4] = (float)(complexVectorPtr[4]); ++- floatBuffer[5] = (float)(complexVectorPtr[5]); ++- floatBuffer[6] = (float)(complexVectorPtr[6]); ++- floatBuffer[7] = (float)(complexVectorPtr[7]); +++ floatBuffer[4] = (float)(complexVectorPtr[4]); +++ floatBuffer[5] = (float)(complexVectorPtr[5]); +++ floatBuffer[6] = (float)(complexVectorPtr[6]); +++ floatBuffer[7] = (float)(complexVectorPtr[7]); ++ ++- cplxValue1 = _mm_load_ps(&floatBuffer[0]); ++- cplxValue2 = _mm_load_ps(&floatBuffer[4]); +++ cplxValue1 = _mm_load_ps(&floatBuffer[0]); +++ cplxValue2 = _mm_load_ps(&floatBuffer[4]); ++ ++- 
complexVectorPtr += 8; +++ complexVectorPtr += 8; ++ ++- cplxValue1 = _mm_mul_ps(cplxValue1, invScalar); ++- cplxValue2 = _mm_mul_ps(cplxValue2, invScalar); +++ cplxValue1 = _mm_mul_ps(cplxValue1, invScalar); +++ cplxValue2 = _mm_mul_ps(cplxValue2, invScalar); ++ ++- // Arrange in i1i2i3i4 format ++- iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); ++- qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1)); +++ // Arrange in i1i2i3i4 format +++ iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2, 0, 2, 0)); +++ qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3, 1, 3, 1)); ++ ++- _mm_store_ps(iBufferPtr, iValue); ++- _mm_store_ps(qBufferPtr, qValue); +++ _mm_store_ps(iBufferPtr, iValue); +++ _mm_store_ps(qBufferPtr, qValue); ++ ++- iBufferPtr += 4; ++- qBufferPtr += 4; ++- } +++ iBufferPtr += 4; +++ qBufferPtr += 4; +++ } ++ ++- number = quarterPoints * 4; ++- complexVectorPtr = (int8_t*)&complexVector[number]; ++- for(; number < num_points; number++){ ++- *iBufferPtr++ = (float)(*complexVectorPtr++) / scalar; ++- *qBufferPtr++ = (float)(*complexVectorPtr++) / scalar; ++- } +++ number = quarterPoints * 4; +++ complexVectorPtr = (int8_t*)&complexVector[number]; +++ for (; number < num_points; number++) { +++ *iBufferPtr++ = (float)(*complexVectorPtr++) / scalar; +++ *qBufferPtr++ = (float)(*complexVectorPtr++) / scalar; +++ } ++ } ++ #endif /* LV_HAVE_SSE */ ++ ++@@ -191,70 +197,127 @@ volk_8ic_s32f_deinterleave_32f_x2_a_sse(float* iBuffer, float* qBuffer, ++ #ifdef LV_HAVE_AVX2 ++ #include ++ ++-static inline void ++-volk_8ic_s32f_deinterleave_32f_x2_a_avx2(float* iBuffer, float* qBuffer, const lv_8sc_t* complexVector, ++- const float scalar, unsigned int num_points) +++static inline void volk_8ic_s32f_deinterleave_32f_x2_a_avx2(float* iBuffer, +++ float* qBuffer, +++ const lv_8sc_t* complexVector, +++ const float scalar, +++ unsigned int num_points) ++ { ++- float* iBufferPtr = iBuffer; ++- float* qBufferPtr = qBuffer; ++- ++- unsigned int number = 0; ++- const unsigned int sixteenthPoints = num_points / 16; ++- __m256 iFloatValue, qFloatValue; ++- ++- const float iScalar= 1.0 / scalar; ++- __m256 invScalar = _mm256_set1_ps(iScalar); ++- __m256i complexVal, iIntVal, qIntVal, iComplexVal, qComplexVal; ++- int8_t* complexVectorPtr = (int8_t*)complexVector; ++- ++- __m256i iMoveMask = _mm256_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, ++- 14, 12, 10, 8, 6, 4, 2, 0, ++- 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, ++- 14, 12, 10, 8, 6, 4, 2, 0); ++- __m256i qMoveMask = _mm256_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, ++- 15, 13, 11, 9, 7, 5, 3, 1, ++- 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, ++- 15, 13, 11, 9, 7, 5, 3, 1); ++- ++- for(;number < sixteenthPoints; number++){ ++- complexVal = _mm256_load_si256((__m256i*)complexVectorPtr); ++- complexVectorPtr += 32; ++- iComplexVal = _mm256_shuffle_epi8(complexVal, iMoveMask); ++- qComplexVal = _mm256_shuffle_epi8(complexVal, qMoveMask); ++- ++- iIntVal = _mm256_cvtepi8_epi32(_mm256_castsi256_si128(iComplexVal)); ++- iFloatValue = _mm256_cvtepi32_ps(iIntVal); ++- iFloatValue = _mm256_mul_ps(iFloatValue, invScalar); ++- _mm256_store_ps(iBufferPtr, iFloatValue); ++- iBufferPtr += 8; ++- ++- iComplexVal = _mm256_permute4x64_epi64(iComplexVal, 0b11000110); ++- iIntVal = _mm256_cvtepi8_epi32(_mm256_castsi256_si128(iComplexVal)); ++- iFloatValue = _mm256_cvtepi32_ps(iIntVal); ++- iFloatValue = _mm256_mul_ps(iFloatValue, invScalar); ++- 
_mm256_store_ps(iBufferPtr, iFloatValue); ++- iBufferPtr += 8; ++- ++- qIntVal = _mm256_cvtepi8_epi32(_mm256_castsi256_si128(qComplexVal)); ++- qFloatValue = _mm256_cvtepi32_ps(qIntVal); ++- qFloatValue = _mm256_mul_ps(qFloatValue, invScalar); ++- _mm256_store_ps(qBufferPtr, qFloatValue); ++- qBufferPtr += 8; ++- ++- qComplexVal = _mm256_permute4x64_epi64(qComplexVal, 0b11000110); ++- qIntVal = _mm256_cvtepi8_epi32(_mm256_castsi256_si128(qComplexVal)); ++- qFloatValue = _mm256_cvtepi32_ps(qIntVal); ++- qFloatValue = _mm256_mul_ps(qFloatValue, invScalar); ++- _mm256_store_ps(qBufferPtr, qFloatValue); ++- qBufferPtr += 8; ++- } ++- ++- number = sixteenthPoints * 16; ++- for(; number < num_points; number++){ ++- *iBufferPtr++ = (float)(*complexVectorPtr++) * iScalar; ++- *qBufferPtr++ = (float)(*complexVectorPtr++) * iScalar; ++- } ++- +++ float* iBufferPtr = iBuffer; +++ float* qBufferPtr = qBuffer; +++ +++ unsigned int number = 0; +++ const unsigned int sixteenthPoints = num_points / 16; +++ __m256 iFloatValue, qFloatValue; +++ +++ const float iScalar = 1.0 / scalar; +++ __m256 invScalar = _mm256_set1_ps(iScalar); +++ __m256i complexVal, iIntVal, qIntVal, iComplexVal, qComplexVal; +++ int8_t* complexVectorPtr = (int8_t*)complexVector; +++ +++ __m256i iMoveMask = _mm256_set_epi8(0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 14, +++ 12, +++ 10, +++ 8, +++ 6, +++ 4, +++ 2, +++ 0, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 14, +++ 12, +++ 10, +++ 8, +++ 6, +++ 4, +++ 2, +++ 0); +++ __m256i qMoveMask = _mm256_set_epi8(0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 15, +++ 13, +++ 11, +++ 9, +++ 7, +++ 5, +++ 3, +++ 1, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 15, +++ 13, +++ 11, +++ 9, +++ 7, +++ 5, +++ 3, +++ 1); +++ +++ for (; number < sixteenthPoints; number++) { +++ complexVal = _mm256_load_si256((__m256i*)complexVectorPtr); +++ complexVectorPtr += 32; +++ iComplexVal = _mm256_shuffle_epi8(complexVal, iMoveMask); +++ qComplexVal = _mm256_shuffle_epi8(complexVal, qMoveMask); +++ +++ iIntVal = _mm256_cvtepi8_epi32(_mm256_castsi256_si128(iComplexVal)); +++ iFloatValue = _mm256_cvtepi32_ps(iIntVal); +++ iFloatValue = _mm256_mul_ps(iFloatValue, invScalar); +++ _mm256_store_ps(iBufferPtr, iFloatValue); +++ iBufferPtr += 8; +++ +++ iComplexVal = _mm256_permute4x64_epi64(iComplexVal, 0b11000110); +++ iIntVal = _mm256_cvtepi8_epi32(_mm256_castsi256_si128(iComplexVal)); +++ iFloatValue = _mm256_cvtepi32_ps(iIntVal); +++ iFloatValue = _mm256_mul_ps(iFloatValue, invScalar); +++ _mm256_store_ps(iBufferPtr, iFloatValue); +++ iBufferPtr += 8; +++ +++ qIntVal = _mm256_cvtepi8_epi32(_mm256_castsi256_si128(qComplexVal)); +++ qFloatValue = _mm256_cvtepi32_ps(qIntVal); +++ qFloatValue = _mm256_mul_ps(qFloatValue, invScalar); +++ _mm256_store_ps(qBufferPtr, qFloatValue); +++ qBufferPtr += 8; +++ +++ qComplexVal = _mm256_permute4x64_epi64(qComplexVal, 0b11000110); +++ qIntVal = _mm256_cvtepi8_epi32(_mm256_castsi256_si128(qComplexVal)); +++ qFloatValue = _mm256_cvtepi32_ps(qIntVal); +++ qFloatValue = _mm256_mul_ps(qFloatValue, invScalar); +++ _mm256_store_ps(qBufferPtr, qFloatValue); +++ qBufferPtr += 8; +++ } +++ +++ number = sixteenthPoints * 16; +++ for (; number < num_points; number++) { +++ *iBufferPtr++ = (float)(*complexVectorPtr++) * iScalar; +++ *qBufferPtr++ = (float)(*complexVectorPtr++) * iScalar; +++ } ++ } ++ #endif /* LV_HAVE_AVX2 */ ++ ++@@ -262,19 
+325,21 @@ volk_8ic_s32f_deinterleave_32f_x2_a_avx2(float* iBuffer, float* qBuffer, const l ++ #ifdef LV_HAVE_GENERIC ++ ++ static inline void ++-volk_8ic_s32f_deinterleave_32f_x2_generic(float* iBuffer, float* qBuffer, +++volk_8ic_s32f_deinterleave_32f_x2_generic(float* iBuffer, +++ float* qBuffer, ++ const lv_8sc_t* complexVector, ++- const float scalar, unsigned int num_points) +++ const float scalar, +++ unsigned int num_points) ++ { ++- const int8_t* complexVectorPtr = (const int8_t*)complexVector; ++- float* iBufferPtr = iBuffer; ++- float* qBufferPtr = qBuffer; ++- unsigned int number; ++- const float invScalar = 1.0 / scalar; ++- for(number = 0; number < num_points; number++){ ++- *iBufferPtr++ = (float)(*complexVectorPtr++)*invScalar; ++- *qBufferPtr++ = (float)(*complexVectorPtr++)*invScalar; ++- } +++ const int8_t* complexVectorPtr = (const int8_t*)complexVector; +++ float* iBufferPtr = iBuffer; +++ float* qBufferPtr = qBuffer; +++ unsigned int number; +++ const float invScalar = 1.0 / scalar; +++ for (number = 0; number < num_points; number++) { +++ *iBufferPtr++ = (float)(*complexVectorPtr++) * invScalar; +++ *qBufferPtr++ = (float)(*complexVectorPtr++) * invScalar; +++ } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++@@ -285,75 +350,107 @@ volk_8ic_s32f_deinterleave_32f_x2_generic(float* iBuffer, float* qBuffer, ++ #ifndef INCLUDED_volk_8ic_s32f_deinterleave_32f_x2_u_H ++ #define INCLUDED_volk_8ic_s32f_deinterleave_32f_x2_u_H ++ ++-#include ++ #include ++ #include +++#include ++ ++ #ifdef LV_HAVE_AVX2 ++ #include ++ ++-static inline void ++-volk_8ic_s32f_deinterleave_32f_x2_u_avx2(float* iBuffer, float* qBuffer, const lv_8sc_t* complexVector, ++- const float scalar, unsigned int num_points) +++static inline void volk_8ic_s32f_deinterleave_32f_x2_u_avx2(float* iBuffer, +++ float* qBuffer, +++ const lv_8sc_t* complexVector, +++ const float scalar, +++ unsigned int num_points) ++ { ++- float* iBufferPtr = iBuffer; ++- float* qBufferPtr = qBuffer; ++- ++- unsigned int number = 0; ++- const unsigned int sixteenthPoints = num_points / 16; ++- __m256 iFloatValue, qFloatValue; ++- ++- const float iScalar= 1.0 / scalar; ++- __m256 invScalar = _mm256_set1_ps(iScalar); ++- __m256i complexVal, iIntVal, qIntVal; ++- __m128i iComplexVal, qComplexVal; ++- int8_t* complexVectorPtr = (int8_t*)complexVector; ++- ++- __m256i MoveMask = _mm256_set_epi8(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, ++- 6, 4, 2, 0,15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0); ++- ++- for(;number < sixteenthPoints; number++){ ++- complexVal = _mm256_loadu_si256((__m256i*)complexVectorPtr); complexVectorPtr += 32; ++- complexVal = _mm256_shuffle_epi8(complexVal, MoveMask); ++- complexVal = _mm256_permute4x64_epi64(complexVal,0xd8); ++- iComplexVal = _mm256_extractf128_si256(complexVal,0); ++- qComplexVal = _mm256_extractf128_si256(complexVal,1); ++- ++- iIntVal = _mm256_cvtepi8_epi32(iComplexVal); ++- iFloatValue = _mm256_cvtepi32_ps(iIntVal); ++- iFloatValue = _mm256_mul_ps(iFloatValue, invScalar); ++- _mm256_storeu_ps(iBufferPtr, iFloatValue); ++- iBufferPtr += 8; ++- ++- qIntVal = _mm256_cvtepi8_epi32(qComplexVal); ++- qFloatValue = _mm256_cvtepi32_ps(qIntVal); ++- qFloatValue = _mm256_mul_ps(qFloatValue, invScalar); ++- _mm256_storeu_ps(qBufferPtr, qFloatValue); ++- qBufferPtr += 8; ++- ++- complexVal = _mm256_srli_si256(complexVal, 8); ++- iComplexVal = _mm256_extractf128_si256(complexVal,0); ++- qComplexVal = _mm256_extractf128_si256(complexVal,1); ++- ++- iIntVal = _mm256_cvtepi8_epi32(iComplexVal); ++- 
iFloatValue = _mm256_cvtepi32_ps(iIntVal); ++- iFloatValue = _mm256_mul_ps(iFloatValue, invScalar); ++- _mm256_storeu_ps(iBufferPtr, iFloatValue); ++- iBufferPtr += 8; ++- ++- qIntVal = _mm256_cvtepi8_epi32(qComplexVal); ++- qFloatValue = _mm256_cvtepi32_ps(qIntVal); ++- qFloatValue = _mm256_mul_ps(qFloatValue, invScalar); ++- _mm256_storeu_ps(qBufferPtr, qFloatValue); ++- qBufferPtr += 8; ++- } ++- ++- number = sixteenthPoints * 16; ++- for(; number < num_points; number++){ ++- *iBufferPtr++ = (float)(*complexVectorPtr++) * iScalar; ++- *qBufferPtr++ = (float)(*complexVectorPtr++) * iScalar; ++- } ++- +++ float* iBufferPtr = iBuffer; +++ float* qBufferPtr = qBuffer; +++ +++ unsigned int number = 0; +++ const unsigned int sixteenthPoints = num_points / 16; +++ __m256 iFloatValue, qFloatValue; +++ +++ const float iScalar = 1.0 / scalar; +++ __m256 invScalar = _mm256_set1_ps(iScalar); +++ __m256i complexVal, iIntVal, qIntVal; +++ __m128i iComplexVal, qComplexVal; +++ int8_t* complexVectorPtr = (int8_t*)complexVector; +++ +++ __m256i MoveMask = _mm256_set_epi8(15, +++ 13, +++ 11, +++ 9, +++ 7, +++ 5, +++ 3, +++ 1, +++ 14, +++ 12, +++ 10, +++ 8, +++ 6, +++ 4, +++ 2, +++ 0, +++ 15, +++ 13, +++ 11, +++ 9, +++ 7, +++ 5, +++ 3, +++ 1, +++ 14, +++ 12, +++ 10, +++ 8, +++ 6, +++ 4, +++ 2, +++ 0); +++ +++ for (; number < sixteenthPoints; number++) { +++ complexVal = _mm256_loadu_si256((__m256i*)complexVectorPtr); +++ complexVectorPtr += 32; +++ complexVal = _mm256_shuffle_epi8(complexVal, MoveMask); +++ complexVal = _mm256_permute4x64_epi64(complexVal, 0xd8); +++ iComplexVal = _mm256_extractf128_si256(complexVal, 0); +++ qComplexVal = _mm256_extractf128_si256(complexVal, 1); +++ +++ iIntVal = _mm256_cvtepi8_epi32(iComplexVal); +++ iFloatValue = _mm256_cvtepi32_ps(iIntVal); +++ iFloatValue = _mm256_mul_ps(iFloatValue, invScalar); +++ _mm256_storeu_ps(iBufferPtr, iFloatValue); +++ iBufferPtr += 8; +++ +++ qIntVal = _mm256_cvtepi8_epi32(qComplexVal); +++ qFloatValue = _mm256_cvtepi32_ps(qIntVal); +++ qFloatValue = _mm256_mul_ps(qFloatValue, invScalar); +++ _mm256_storeu_ps(qBufferPtr, qFloatValue); +++ qBufferPtr += 8; +++ +++ complexVal = _mm256_srli_si256(complexVal, 8); +++ iComplexVal = _mm256_extractf128_si256(complexVal, 0); +++ qComplexVal = _mm256_extractf128_si256(complexVal, 1); +++ +++ iIntVal = _mm256_cvtepi8_epi32(iComplexVal); +++ iFloatValue = _mm256_cvtepi32_ps(iIntVal); +++ iFloatValue = _mm256_mul_ps(iFloatValue, invScalar); +++ _mm256_storeu_ps(iBufferPtr, iFloatValue); +++ iBufferPtr += 8; +++ +++ qIntVal = _mm256_cvtepi8_epi32(qComplexVal); +++ qFloatValue = _mm256_cvtepi32_ps(qIntVal); +++ qFloatValue = _mm256_mul_ps(qFloatValue, invScalar); +++ _mm256_storeu_ps(qBufferPtr, qFloatValue); +++ qBufferPtr += 8; +++ } +++ +++ number = sixteenthPoints * 16; +++ for (; number < num_points; number++) { +++ *iBufferPtr++ = (float)(*complexVectorPtr++) * iScalar; +++ *qBufferPtr++ = (float)(*complexVectorPtr++) * iScalar; +++ } ++ } ++ #endif /* LV_HAVE_AVX2 */ ++ ++diff --git a/kernels/volk/volk_8ic_s32f_deinterleave_real_32f.h b/kernels/volk/volk_8ic_s32f_deinterleave_real_32f.h ++index 0c85ee9..4c1afe7 100644 ++--- a/kernels/volk/volk_8ic_s32f_deinterleave_real_32f.h +++++ b/kernels/volk/volk_8ic_s32f_deinterleave_real_32f.h ++@@ -31,8 +31,8 @@ ++ * ++ * Dispatcher Prototype ++ * \code ++- * void volk_8ic_s32f_deinterleave_real_32f(float* iBuffer, const lv_8sc_t* complexVector, const float scalar, unsigned int num_points) ++- * \endcode +++ * void 
volk_8ic_s32f_deinterleave_real_32f(float* iBuffer, const lv_8sc_t* complexVector, +++ * const float scalar, unsigned int num_points) \endcode ++ * ++ * \b Inputs ++ * \li complexVector: The complex input vector. ++@@ -55,57 +55,86 @@ ++ #ifndef INCLUDED_volk_8ic_s32f_deinterleave_real_32f_a_H ++ #define INCLUDED_volk_8ic_s32f_deinterleave_real_32f_a_H ++ ++-#include ++ #include ++ #include +++#include ++ ++ #ifdef LV_HAVE_AVX2 ++ #include ++ ++ static inline void ++-volk_8ic_s32f_deinterleave_real_32f_a_avx2(float* iBuffer, const lv_8sc_t* complexVector, ++- const float scalar, unsigned int num_points) +++volk_8ic_s32f_deinterleave_real_32f_a_avx2(float* iBuffer, +++ const lv_8sc_t* complexVector, +++ const float scalar, +++ unsigned int num_points) ++ { ++- float* iBufferPtr = iBuffer; ++- ++- unsigned int number = 0; ++- const unsigned int sixteenthPoints = num_points / 16; ++- __m256 iFloatValue; ++- ++- const float iScalar= 1.0 / scalar; ++- __m256 invScalar = _mm256_set1_ps(iScalar); ++- __m256i complexVal, iIntVal; ++- int8_t* complexVectorPtr = (int8_t*)complexVector; ++- ++- __m256i moveMask = _mm256_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, ++- 14, 12, 10, 8, 6, 4, 2, 0, ++- 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, ++- 14, 12, 10, 8, 6, 4, 2, 0); ++- for(;number < sixteenthPoints; number++){ ++- complexVal = _mm256_load_si256((__m256i*)complexVectorPtr); ++- complexVectorPtr += 32; ++- complexVal = _mm256_shuffle_epi8(complexVal, moveMask); ++- ++- iIntVal = _mm256_cvtepi8_epi32(_mm256_castsi256_si128(complexVal)); ++- iFloatValue = _mm256_cvtepi32_ps(iIntVal); ++- iFloatValue = _mm256_mul_ps(iFloatValue, invScalar); ++- _mm256_store_ps(iBufferPtr, iFloatValue); ++- iBufferPtr += 8; ++- ++- complexVal = _mm256_permute4x64_epi64(complexVal, 0b11000110); ++- iIntVal = _mm256_cvtepi8_epi32(_mm256_castsi256_si128(complexVal)); ++- iFloatValue = _mm256_cvtepi32_ps(iIntVal); ++- iFloatValue = _mm256_mul_ps(iFloatValue, invScalar); ++- _mm256_store_ps(iBufferPtr, iFloatValue); ++- iBufferPtr += 8; ++- } ++- ++- number = sixteenthPoints * 16; ++- for(; number < num_points; number++){ ++- *iBufferPtr++ = (float)(*complexVectorPtr++) * iScalar; ++- complexVectorPtr++; ++- } ++- +++ float* iBufferPtr = iBuffer; +++ +++ unsigned int number = 0; +++ const unsigned int sixteenthPoints = num_points / 16; +++ __m256 iFloatValue; +++ +++ const float iScalar = 1.0 / scalar; +++ __m256 invScalar = _mm256_set1_ps(iScalar); +++ __m256i complexVal, iIntVal; +++ int8_t* complexVectorPtr = (int8_t*)complexVector; +++ +++ __m256i moveMask = _mm256_set_epi8(0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 14, +++ 12, +++ 10, +++ 8, +++ 6, +++ 4, +++ 2, +++ 0, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 14, +++ 12, +++ 10, +++ 8, +++ 6, +++ 4, +++ 2, +++ 0); +++ for (; number < sixteenthPoints; number++) { +++ complexVal = _mm256_load_si256((__m256i*)complexVectorPtr); +++ complexVectorPtr += 32; +++ complexVal = _mm256_shuffle_epi8(complexVal, moveMask); +++ +++ iIntVal = _mm256_cvtepi8_epi32(_mm256_castsi256_si128(complexVal)); +++ iFloatValue = _mm256_cvtepi32_ps(iIntVal); +++ iFloatValue = _mm256_mul_ps(iFloatValue, invScalar); +++ _mm256_store_ps(iBufferPtr, iFloatValue); +++ iBufferPtr += 8; +++ +++ complexVal = _mm256_permute4x64_epi64(complexVal, 0b11000110); +++ iIntVal = _mm256_cvtepi8_epi32(_mm256_castsi256_si128(complexVal)); +++ iFloatValue = _mm256_cvtepi32_ps(iIntVal); +++ iFloatValue = 
_mm256_mul_ps(iFloatValue, invScalar); +++ _mm256_store_ps(iBufferPtr, iFloatValue); +++ iBufferPtr += 8; +++ } +++ +++ number = sixteenthPoints * 16; +++ for (; number < num_points; number++) { +++ *iBufferPtr++ = (float)(*complexVectorPtr++) * iScalar; +++ complexVectorPtr++; +++ } ++ } ++ #endif /* LV_HAVE_AVX2 */ ++ ++@@ -114,52 +143,55 @@ volk_8ic_s32f_deinterleave_real_32f_a_avx2(float* iBuffer, const lv_8sc_t* compl ++ #include ++ ++ static inline void ++-volk_8ic_s32f_deinterleave_real_32f_a_sse4_1(float* iBuffer, const lv_8sc_t* complexVector, ++- const float scalar, unsigned int num_points) +++volk_8ic_s32f_deinterleave_real_32f_a_sse4_1(float* iBuffer, +++ const lv_8sc_t* complexVector, +++ const float scalar, +++ unsigned int num_points) ++ { ++- float* iBufferPtr = iBuffer; ++- ++- unsigned int number = 0; ++- const unsigned int eighthPoints = num_points / 8; ++- __m128 iFloatValue; +++ float* iBufferPtr = iBuffer; ++ ++- const float iScalar= 1.0 / scalar; ++- __m128 invScalar = _mm_set_ps1(iScalar); ++- __m128i complexVal, iIntVal; ++- int8_t* complexVectorPtr = (int8_t*)complexVector; +++ unsigned int number = 0; +++ const unsigned int eighthPoints = num_points / 8; +++ __m128 iFloatValue; ++ ++- __m128i moveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0); +++ const float iScalar = 1.0 / scalar; +++ __m128 invScalar = _mm_set_ps1(iScalar); +++ __m128i complexVal, iIntVal; +++ int8_t* complexVectorPtr = (int8_t*)complexVector; ++ ++- for(;number < eighthPoints; number++){ ++- complexVal = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; ++- complexVal = _mm_shuffle_epi8(complexVal, moveMask); +++ __m128i moveMask = _mm_set_epi8( +++ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0); ++ ++- iIntVal = _mm_cvtepi8_epi32(complexVal); ++- iFloatValue = _mm_cvtepi32_ps(iIntVal); +++ for (; number < eighthPoints; number++) { +++ complexVal = _mm_load_si128((__m128i*)complexVectorPtr); +++ complexVectorPtr += 16; +++ complexVal = _mm_shuffle_epi8(complexVal, moveMask); ++ ++- iFloatValue = _mm_mul_ps(iFloatValue, invScalar); +++ iIntVal = _mm_cvtepi8_epi32(complexVal); +++ iFloatValue = _mm_cvtepi32_ps(iIntVal); ++ ++- _mm_store_ps(iBufferPtr, iFloatValue); +++ iFloatValue = _mm_mul_ps(iFloatValue, invScalar); ++ ++- iBufferPtr += 4; +++ _mm_store_ps(iBufferPtr, iFloatValue); ++ ++- complexVal = _mm_srli_si128(complexVal, 4); ++- iIntVal = _mm_cvtepi8_epi32(complexVal); ++- iFloatValue = _mm_cvtepi32_ps(iIntVal); +++ iBufferPtr += 4; ++ ++- iFloatValue = _mm_mul_ps(iFloatValue, invScalar); +++ complexVal = _mm_srli_si128(complexVal, 4); +++ iIntVal = _mm_cvtepi8_epi32(complexVal); +++ iFloatValue = _mm_cvtepi32_ps(iIntVal); ++ ++- _mm_store_ps(iBufferPtr, iFloatValue); +++ iFloatValue = _mm_mul_ps(iFloatValue, invScalar); ++ ++- iBufferPtr += 4; ++- } +++ _mm_store_ps(iBufferPtr, iFloatValue); ++ ++- number = eighthPoints * 8; ++- for(; number < num_points; number++){ ++- *iBufferPtr++ = (float)(*complexVectorPtr++) * iScalar; ++- complexVectorPtr++; ++- } +++ iBufferPtr += 4; +++ } ++ +++ number = eighthPoints * 8; +++ for (; number < num_points; number++) { +++ *iBufferPtr++ = (float)(*complexVectorPtr++) * iScalar; +++ complexVectorPtr++; +++ } ++ } ++ #endif /* LV_HAVE_SSE4_1 */ ++ ++@@ -168,42 +200,47 @@ volk_8ic_s32f_deinterleave_real_32f_a_sse4_1(float* iBuffer, const lv_8sc_t* com ++ #include ++ ++ static inline void ++-volk_8ic_s32f_deinterleave_real_32f_a_sse(float* iBuffer, const 
lv_8sc_t* complexVector, ++- const float scalar, unsigned int num_points) +++volk_8ic_s32f_deinterleave_real_32f_a_sse(float* iBuffer, +++ const lv_8sc_t* complexVector, +++ const float scalar, +++ unsigned int num_points) ++ { ++- float* iBufferPtr = iBuffer; ++- ++- unsigned int number = 0; ++- const unsigned int quarterPoints = num_points / 4; ++- __m128 iValue; +++ float* iBufferPtr = iBuffer; ++ ++- const float iScalar= 1.0 / scalar; ++- __m128 invScalar = _mm_set_ps1(iScalar); ++- int8_t* complexVectorPtr = (int8_t*)complexVector; +++ unsigned int number = 0; +++ const unsigned int quarterPoints = num_points / 4; +++ __m128 iValue; ++ ++- __VOLK_ATTR_ALIGNED(16) float floatBuffer[4]; +++ const float iScalar = 1.0 / scalar; +++ __m128 invScalar = _mm_set_ps1(iScalar); +++ int8_t* complexVectorPtr = (int8_t*)complexVector; ++ ++- for(;number < quarterPoints; number++){ ++- floatBuffer[0] = (float)(*complexVectorPtr); complexVectorPtr += 2; ++- floatBuffer[1] = (float)(*complexVectorPtr); complexVectorPtr += 2; ++- floatBuffer[2] = (float)(*complexVectorPtr); complexVectorPtr += 2; ++- floatBuffer[3] = (float)(*complexVectorPtr); complexVectorPtr += 2; +++ __VOLK_ATTR_ALIGNED(16) float floatBuffer[4]; ++ ++- iValue = _mm_load_ps(floatBuffer); +++ for (; number < quarterPoints; number++) { +++ floatBuffer[0] = (float)(*complexVectorPtr); +++ complexVectorPtr += 2; +++ floatBuffer[1] = (float)(*complexVectorPtr); +++ complexVectorPtr += 2; +++ floatBuffer[2] = (float)(*complexVectorPtr); +++ complexVectorPtr += 2; +++ floatBuffer[3] = (float)(*complexVectorPtr); +++ complexVectorPtr += 2; ++ ++- iValue = _mm_mul_ps(iValue, invScalar); +++ iValue = _mm_load_ps(floatBuffer); ++ ++- _mm_store_ps(iBufferPtr, iValue); +++ iValue = _mm_mul_ps(iValue, invScalar); ++ ++- iBufferPtr += 4; ++- } +++ _mm_store_ps(iBufferPtr, iValue); ++ ++- number = quarterPoints * 4; ++- for(; number < num_points; number++){ ++- *iBufferPtr++ = (float)(*complexVectorPtr++) * iScalar; ++- complexVectorPtr++; ++- } +++ iBufferPtr += 4; +++ } ++ +++ number = quarterPoints * 4; +++ for (; number < num_points; number++) { +++ *iBufferPtr++ = (float)(*complexVectorPtr++) * iScalar; +++ complexVectorPtr++; +++ } ++ } ++ #endif /* LV_HAVE_SSE */ ++ ++@@ -211,83 +248,117 @@ volk_8ic_s32f_deinterleave_real_32f_a_sse(float* iBuffer, const lv_8sc_t* comple ++ #ifdef LV_HAVE_GENERIC ++ ++ static inline void ++-volk_8ic_s32f_deinterleave_real_32f_generic(float* iBuffer, const lv_8sc_t* complexVector, ++- const float scalar, unsigned int num_points) +++volk_8ic_s32f_deinterleave_real_32f_generic(float* iBuffer, +++ const lv_8sc_t* complexVector, +++ const float scalar, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const int8_t* complexVectorPtr = (const int8_t*)complexVector; ++- float* iBufferPtr = iBuffer; ++- const float invScalar = 1.0 / scalar; ++- for(number = 0; number < num_points; number++){ ++- *iBufferPtr++ = ((float)(*complexVectorPtr++)) * invScalar; ++- complexVectorPtr++; ++- } +++ unsigned int number = 0; +++ const int8_t* complexVectorPtr = (const int8_t*)complexVector; +++ float* iBufferPtr = iBuffer; +++ const float invScalar = 1.0 / scalar; +++ for (number = 0; number < num_points; number++) { +++ *iBufferPtr++ = ((float)(*complexVectorPtr++)) * invScalar; +++ complexVectorPtr++; +++ } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++ ++- ++ #endif /* INCLUDED_volk_8ic_s32f_deinterleave_real_32f_a_H */ ++ ++ #ifndef INCLUDED_volk_8ic_s32f_deinterleave_real_32f_u_H ++ #define 
INCLUDED_volk_8ic_s32f_deinterleave_real_32f_u_H ++ ++-#include ++ #include ++ #include +++#include ++ ++ #ifdef LV_HAVE_AVX2 ++ #include ++ ++ static inline void ++-volk_8ic_s32f_deinterleave_real_32f_u_avx2(float* iBuffer, const lv_8sc_t* complexVector, ++- const float scalar, unsigned int num_points) +++volk_8ic_s32f_deinterleave_real_32f_u_avx2(float* iBuffer, +++ const lv_8sc_t* complexVector, +++ const float scalar, +++ unsigned int num_points) ++ { ++- float* iBufferPtr = iBuffer; ++- ++- unsigned int number = 0; ++- const unsigned int sixteenthPoints = num_points / 16; ++- __m256 iFloatValue; ++- ++- const float iScalar= 1.0 / scalar; ++- __m256 invScalar = _mm256_set1_ps(iScalar); ++- __m256i complexVal, iIntVal; ++- __m128i hcomplexVal; ++- int8_t* complexVectorPtr = (int8_t*)complexVector; ++- ++- __m256i moveMask = _mm256_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0); ++- ++- for(;number < sixteenthPoints; number++){ ++- complexVal = _mm256_loadu_si256((__m256i*)complexVectorPtr); complexVectorPtr += 32; ++- complexVal = _mm256_shuffle_epi8(complexVal, moveMask); ++- ++- hcomplexVal = _mm256_extracti128_si256(complexVal,0); ++- iIntVal = _mm256_cvtepi8_epi32(hcomplexVal); ++- iFloatValue = _mm256_cvtepi32_ps(iIntVal); ++- ++- iFloatValue = _mm256_mul_ps(iFloatValue, invScalar); ++- ++- _mm256_storeu_ps(iBufferPtr, iFloatValue); ++- ++- iBufferPtr += 8; ++- ++- hcomplexVal = _mm256_extracti128_si256(complexVal,1); ++- iIntVal = _mm256_cvtepi8_epi32(hcomplexVal); ++- iFloatValue = _mm256_cvtepi32_ps(iIntVal); ++- ++- iFloatValue = _mm256_mul_ps(iFloatValue, invScalar); ++- ++- _mm256_storeu_ps(iBufferPtr, iFloatValue); ++- ++- iBufferPtr += 8; ++- } ++- ++- number = sixteenthPoints * 16; ++- for(; number < num_points; number++){ ++- *iBufferPtr++ = (float)(*complexVectorPtr++) * iScalar; ++- complexVectorPtr++; ++- } ++- +++ float* iBufferPtr = iBuffer; +++ +++ unsigned int number = 0; +++ const unsigned int sixteenthPoints = num_points / 16; +++ __m256 iFloatValue; +++ +++ const float iScalar = 1.0 / scalar; +++ __m256 invScalar = _mm256_set1_ps(iScalar); +++ __m256i complexVal, iIntVal; +++ __m128i hcomplexVal; +++ int8_t* complexVectorPtr = (int8_t*)complexVector; +++ +++ __m256i moveMask = _mm256_set_epi8(0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 14, +++ 12, +++ 10, +++ 8, +++ 6, +++ 4, +++ 2, +++ 0, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 14, +++ 12, +++ 10, +++ 8, +++ 6, +++ 4, +++ 2, +++ 0); +++ +++ for (; number < sixteenthPoints; number++) { +++ complexVal = _mm256_loadu_si256((__m256i*)complexVectorPtr); +++ complexVectorPtr += 32; +++ complexVal = _mm256_shuffle_epi8(complexVal, moveMask); +++ +++ hcomplexVal = _mm256_extracti128_si256(complexVal, 0); +++ iIntVal = _mm256_cvtepi8_epi32(hcomplexVal); +++ iFloatValue = _mm256_cvtepi32_ps(iIntVal); +++ +++ iFloatValue = _mm256_mul_ps(iFloatValue, invScalar); +++ +++ _mm256_storeu_ps(iBufferPtr, iFloatValue); +++ +++ iBufferPtr += 8; +++ +++ hcomplexVal = _mm256_extracti128_si256(complexVal, 1); +++ iIntVal = _mm256_cvtepi8_epi32(hcomplexVal); +++ iFloatValue = _mm256_cvtepi32_ps(iIntVal); +++ +++ iFloatValue = _mm256_mul_ps(iFloatValue, invScalar); +++ +++ _mm256_storeu_ps(iBufferPtr, iFloatValue); +++ +++ iBufferPtr += 8; +++ } +++ +++ number = sixteenthPoints * 16; +++ for (; number < num_points; number++) { +++ *iBufferPtr++ = 
(float)(*complexVectorPtr++) * iScalar; +++ complexVectorPtr++; +++ } ++ } ++ #endif /* LV_HAVE_AVX2 */ ++ ++diff --git a/kernels/volk/volk_8ic_x2_multiply_conjugate_16ic.h b/kernels/volk/volk_8ic_x2_multiply_conjugate_16ic.h ++index 6762658..7f9fd96 100644 ++--- a/kernels/volk/volk_8ic_x2_multiply_conjugate_16ic.h +++++ b/kernels/volk/volk_8ic_x2_multiply_conjugate_16ic.h ++@@ -30,64 +30,73 @@ ++ #ifdef LV_HAVE_AVX2 ++ #include ++ /*! ++- \brief Multiplys the one complex vector with the complex conjugate of the second complex vector and stores their results in the third vector ++- \param cVector The complex vector where the results will be stored ++- \param aVector One of the complex vectors to be multiplied ++- \param bVector The complex vector which will be converted to complex conjugate and multiplied ++- \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector +++ \brief Multiplys the one complex vector with the complex conjugate of the second complex +++ vector and stores their results in the third vector \param cVector The complex vector +++ where the results will be stored \param aVector One of the complex vectors to be +++ multiplied \param bVector The complex vector which will be converted to complex +++ conjugate and multiplied \param num_points The number of complex values in aVector and +++ bVector to be multiplied together and stored into cVector ++ */ ++-static inline void volk_8ic_x2_multiply_conjugate_16ic_a_avx2(lv_16sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, unsigned int num_points){ ++- unsigned int number = 0; ++- const unsigned int quarterPoints = num_points / 8; ++- ++- __m256i x, y, realz, imagz; ++- lv_16sc_t* c = cVector; ++- const lv_8sc_t* a = aVector; ++- const lv_8sc_t* b = bVector; ++- __m256i conjugateSign = _mm256_set_epi16(-1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1); ++- ++- for(;number < quarterPoints; number++){ ++- // Convert 8 bit values into 16 bit values ++- x = _mm256_cvtepi8_epi16(_mm_load_si128((__m128i*)a)); ++- y = _mm256_cvtepi8_epi16(_mm_load_si128((__m128i*)b)); ++- ++- // Calculate the ar*cr - ai*(-ci) portions ++- realz = _mm256_madd_epi16(x,y); ++- ++- // Calculate the complex conjugate of the cr + ci j values ++- y = _mm256_sign_epi16(y, conjugateSign); ++- ++- // Shift the order of the cr and ci values ++- y = _mm256_shufflehi_epi16(_mm256_shufflelo_epi16(y, _MM_SHUFFLE(2,3,0,1) ), _MM_SHUFFLE(2,3,0,1)); ++- ++- // Calculate the ar*(-ci) + cr*(ai) ++- imagz = _mm256_madd_epi16(x,y); ++- ++- // Perform the addition of products ++- ++- _mm256_store_si256((__m256i*)c, _mm256_packs_epi32(_mm256_unpacklo_epi32(realz, imagz), _mm256_unpackhi_epi32(realz, imagz))); ++- ++- a += 8; ++- b += 8; ++- c += 8; ++- } ++- ++- number = quarterPoints * 8; ++- int16_t* c16Ptr = (int16_t*)&cVector[number]; ++- int8_t* a8Ptr = (int8_t*)&aVector[number]; ++- int8_t* b8Ptr = (int8_t*)&bVector[number]; ++- for(; number < num_points; number++){ ++- float aReal = (float)*a8Ptr++; ++- float aImag = (float)*a8Ptr++; ++- lv_32fc_t aVal = lv_cmake(aReal, aImag ); ++- float bReal = (float)*b8Ptr++; ++- float bImag = (float)*b8Ptr++; ++- lv_32fc_t bVal = lv_cmake( bReal, -bImag ); ++- lv_32fc_t temp = aVal * bVal; ++- ++- *c16Ptr++ = (int16_t)lv_creal(temp); ++- *c16Ptr++ = (int16_t)lv_cimag(temp); ++- } +++static inline void volk_8ic_x2_multiply_conjugate_16ic_a_avx2(lv_16sc_t* cVector, +++ const lv_8sc_t* aVector, +++ const lv_8sc_t* bVector, +++ unsigned int num_points) 
+++{ +++ unsigned int number = 0; +++ const unsigned int quarterPoints = num_points / 8; +++ +++ __m256i x, y, realz, imagz; +++ lv_16sc_t* c = cVector; +++ const lv_8sc_t* a = aVector; +++ const lv_8sc_t* b = bVector; +++ __m256i conjugateSign = +++ _mm256_set_epi16(-1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1); +++ +++ for (; number < quarterPoints; number++) { +++ // Convert 8 bit values into 16 bit values +++ x = _mm256_cvtepi8_epi16(_mm_load_si128((__m128i*)a)); +++ y = _mm256_cvtepi8_epi16(_mm_load_si128((__m128i*)b)); +++ +++ // Calculate the ar*cr - ai*(-ci) portions +++ realz = _mm256_madd_epi16(x, y); +++ +++ // Calculate the complex conjugate of the cr + ci j values +++ y = _mm256_sign_epi16(y, conjugateSign); +++ +++ // Shift the order of the cr and ci values +++ y = _mm256_shufflehi_epi16(_mm256_shufflelo_epi16(y, _MM_SHUFFLE(2, 3, 0, 1)), +++ _MM_SHUFFLE(2, 3, 0, 1)); +++ +++ // Calculate the ar*(-ci) + cr*(ai) +++ imagz = _mm256_madd_epi16(x, y); +++ +++ // Perform the addition of products +++ +++ _mm256_store_si256((__m256i*)c, +++ _mm256_packs_epi32(_mm256_unpacklo_epi32(realz, imagz), +++ _mm256_unpackhi_epi32(realz, imagz))); +++ +++ a += 8; +++ b += 8; +++ c += 8; +++ } +++ +++ number = quarterPoints * 8; +++ int16_t* c16Ptr = (int16_t*)&cVector[number]; +++ int8_t* a8Ptr = (int8_t*)&aVector[number]; +++ int8_t* b8Ptr = (int8_t*)&bVector[number]; +++ for (; number < num_points; number++) { +++ float aReal = (float)*a8Ptr++; +++ float aImag = (float)*a8Ptr++; +++ lv_32fc_t aVal = lv_cmake(aReal, aImag); +++ float bReal = (float)*b8Ptr++; +++ float bImag = (float)*b8Ptr++; +++ lv_32fc_t bVal = lv_cmake(bReal, -bImag); +++ lv_32fc_t temp = aVal * bVal; +++ +++ *c16Ptr++ = (int16_t)lv_creal(temp); +++ *c16Ptr++ = (int16_t)lv_cimag(temp); +++ } ++ } ++ #endif /* LV_HAVE_AVX2 */ ++ ++@@ -95,90 +104,103 @@ static inline void volk_8ic_x2_multiply_conjugate_16ic_a_avx2(lv_16sc_t* cVector ++ #ifdef LV_HAVE_SSE4_1 ++ #include ++ /*! 
++- \brief Multiplys the one complex vector with the complex conjugate of the second complex vector and stores their results in the third vector ++- \param cVector The complex vector where the results will be stored ++- \param aVector One of the complex vectors to be multiplied ++- \param bVector The complex vector which will be converted to complex conjugate and multiplied ++- \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector +++ \brief Multiplys the one complex vector with the complex conjugate of the second complex +++ vector and stores their results in the third vector \param cVector The complex vector +++ where the results will be stored \param aVector One of the complex vectors to be +++ multiplied \param bVector The complex vector which will be converted to complex +++ conjugate and multiplied \param num_points The number of complex values in aVector and +++ bVector to be multiplied together and stored into cVector ++ */ ++-static inline void volk_8ic_x2_multiply_conjugate_16ic_a_sse4_1(lv_16sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, unsigned int num_points){ ++- unsigned int number = 0; ++- const unsigned int quarterPoints = num_points / 4; ++- ++- __m128i x, y, realz, imagz; ++- lv_16sc_t* c = cVector; ++- const lv_8sc_t* a = aVector; ++- const lv_8sc_t* b = bVector; ++- __m128i conjugateSign = _mm_set_epi16(-1, 1, -1, 1, -1, 1, -1, 1); ++- ++- for(;number < quarterPoints; number++){ ++- // Convert into 8 bit values into 16 bit values ++- x = _mm_cvtepi8_epi16(_mm_loadl_epi64((__m128i*)a)); ++- y = _mm_cvtepi8_epi16(_mm_loadl_epi64((__m128i*)b)); ++- ++- // Calculate the ar*cr - ai*(-ci) portions ++- realz = _mm_madd_epi16(x,y); ++- ++- // Calculate the complex conjugate of the cr + ci j values ++- y = _mm_sign_epi16(y, conjugateSign); ++- ++- // Shift the order of the cr and ci values ++- y = _mm_shufflehi_epi16(_mm_shufflelo_epi16(y, _MM_SHUFFLE(2,3,0,1) ), _MM_SHUFFLE(2,3,0,1)); ++- ++- // Calculate the ar*(-ci) + cr*(ai) ++- imagz = _mm_madd_epi16(x,y); ++- ++- _mm_store_si128((__m128i*)c, _mm_packs_epi32(_mm_unpacklo_epi32(realz, imagz), _mm_unpackhi_epi32(realz, imagz))); ++- ++- a += 4; ++- b += 4; ++- c += 4; ++- } ++- ++- number = quarterPoints * 4; ++- int16_t* c16Ptr = (int16_t*)&cVector[number]; ++- int8_t* a8Ptr = (int8_t*)&aVector[number]; ++- int8_t* b8Ptr = (int8_t*)&bVector[number]; ++- for(; number < num_points; number++){ ++- float aReal = (float)*a8Ptr++; ++- float aImag = (float)*a8Ptr++; ++- lv_32fc_t aVal = lv_cmake(aReal, aImag ); ++- float bReal = (float)*b8Ptr++; ++- float bImag = (float)*b8Ptr++; ++- lv_32fc_t bVal = lv_cmake( bReal, -bImag ); ++- lv_32fc_t temp = aVal * bVal; ++- ++- *c16Ptr++ = (int16_t)lv_creal(temp); ++- *c16Ptr++ = (int16_t)lv_cimag(temp); ++- } +++static inline void volk_8ic_x2_multiply_conjugate_16ic_a_sse4_1(lv_16sc_t* cVector, +++ const lv_8sc_t* aVector, +++ const lv_8sc_t* bVector, +++ unsigned int num_points) +++{ +++ unsigned int number = 0; +++ const unsigned int quarterPoints = num_points / 4; +++ +++ __m128i x, y, realz, imagz; +++ lv_16sc_t* c = cVector; +++ const lv_8sc_t* a = aVector; +++ const lv_8sc_t* b = bVector; +++ __m128i conjugateSign = _mm_set_epi16(-1, 1, -1, 1, -1, 1, -1, 1); +++ +++ for (; number < quarterPoints; number++) { +++ // Convert into 8 bit values into 16 bit values +++ x = _mm_cvtepi8_epi16(_mm_loadl_epi64((__m128i*)a)); +++ y = _mm_cvtepi8_epi16(_mm_loadl_epi64((__m128i*)b)); +++ +++ // Calculate the 
ar*cr - ai*(-ci) portions +++ realz = _mm_madd_epi16(x, y); +++ +++ // Calculate the complex conjugate of the cr + ci j values +++ y = _mm_sign_epi16(y, conjugateSign); +++ +++ // Shift the order of the cr and ci values +++ y = _mm_shufflehi_epi16(_mm_shufflelo_epi16(y, _MM_SHUFFLE(2, 3, 0, 1)), +++ _MM_SHUFFLE(2, 3, 0, 1)); +++ +++ // Calculate the ar*(-ci) + cr*(ai) +++ imagz = _mm_madd_epi16(x, y); +++ +++ _mm_store_si128((__m128i*)c, +++ _mm_packs_epi32(_mm_unpacklo_epi32(realz, imagz), +++ _mm_unpackhi_epi32(realz, imagz))); +++ +++ a += 4; +++ b += 4; +++ c += 4; +++ } +++ +++ number = quarterPoints * 4; +++ int16_t* c16Ptr = (int16_t*)&cVector[number]; +++ int8_t* a8Ptr = (int8_t*)&aVector[number]; +++ int8_t* b8Ptr = (int8_t*)&bVector[number]; +++ for (; number < num_points; number++) { +++ float aReal = (float)*a8Ptr++; +++ float aImag = (float)*a8Ptr++; +++ lv_32fc_t aVal = lv_cmake(aReal, aImag); +++ float bReal = (float)*b8Ptr++; +++ float bImag = (float)*b8Ptr++; +++ lv_32fc_t bVal = lv_cmake(bReal, -bImag); +++ lv_32fc_t temp = aVal * bVal; +++ +++ *c16Ptr++ = (int16_t)lv_creal(temp); +++ *c16Ptr++ = (int16_t)lv_cimag(temp); +++ } ++ } ++ #endif /* LV_HAVE_SSE4_1 */ ++ ++ #ifdef LV_HAVE_GENERIC ++ /*! ++- \brief Multiplys the one complex vector with the complex conjugate of the second complex vector and stores their results in the third vector ++- \param cVector The complex vector where the results will be stored ++- \param aVector One of the complex vectors to be multiplied ++- \param bVector The complex vector which will be converted to complex conjugate and multiplied ++- \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector +++ \brief Multiplys the one complex vector with the complex conjugate of the second complex +++ vector and stores their results in the third vector \param cVector The complex vector +++ where the results will be stored \param aVector One of the complex vectors to be +++ multiplied \param bVector The complex vector which will be converted to complex +++ conjugate and multiplied \param num_points The number of complex values in aVector and +++ bVector to be multiplied together and stored into cVector ++ */ ++-static inline void volk_8ic_x2_multiply_conjugate_16ic_generic(lv_16sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, unsigned int num_points){ ++- unsigned int number = 0; ++- int16_t* c16Ptr = (int16_t*)cVector; ++- int8_t* a8Ptr = (int8_t*)aVector; ++- int8_t* b8Ptr = (int8_t*)bVector; ++- for(number =0; number < num_points; number++){ ++- float aReal = (float)*a8Ptr++; ++- float aImag = (float)*a8Ptr++; ++- lv_32fc_t aVal = lv_cmake(aReal, aImag ); ++- float bReal = (float)*b8Ptr++; ++- float bImag = (float)*b8Ptr++; ++- lv_32fc_t bVal = lv_cmake( bReal, -bImag ); ++- lv_32fc_t temp = aVal * bVal; ++- ++- *c16Ptr++ = (int16_t)lv_creal(temp); ++- *c16Ptr++ = (int16_t)lv_cimag(temp); ++- } +++static inline void volk_8ic_x2_multiply_conjugate_16ic_generic(lv_16sc_t* cVector, +++ const lv_8sc_t* aVector, +++ const lv_8sc_t* bVector, +++ unsigned int num_points) +++{ +++ unsigned int number = 0; +++ int16_t* c16Ptr = (int16_t*)cVector; +++ int8_t* a8Ptr = (int8_t*)aVector; +++ int8_t* b8Ptr = (int8_t*)bVector; +++ for (number = 0; number < num_points; number++) { +++ float aReal = (float)*a8Ptr++; +++ float aImag = (float)*a8Ptr++; +++ lv_32fc_t aVal = lv_cmake(aReal, aImag); +++ float bReal = (float)*b8Ptr++; +++ float bImag = (float)*b8Ptr++; +++ lv_32fc_t bVal = 
lv_cmake(bReal, -bImag); +++ lv_32fc_t temp = aVal * bVal; +++ +++ *c16Ptr++ = (int16_t)lv_creal(temp); +++ *c16Ptr++ = (int16_t)lv_cimag(temp); +++ } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++@@ -194,64 +216,73 @@ static inline void volk_8ic_x2_multiply_conjugate_16ic_generic(lv_16sc_t* cVecto ++ #ifdef LV_HAVE_AVX2 ++ #include ++ /*! ++- \brief Multiplys the one complex vector with the complex conjugate of the second complex vector and stores their results in the third vector ++- \param cVector The complex vector where the results will be stored ++- \param aVector One of the complex vectors to be multiplied ++- \param bVector The complex vector which will be converted to complex conjugate and multiplied ++- \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector +++ \brief Multiplys the one complex vector with the complex conjugate of the second complex +++ vector and stores their results in the third vector \param cVector The complex vector +++ where the results will be stored \param aVector One of the complex vectors to be +++ multiplied \param bVector The complex vector which will be converted to complex +++ conjugate and multiplied \param num_points The number of complex values in aVector and +++ bVector to be multiplied together and stored into cVector ++ */ ++-static inline void volk_8ic_x2_multiply_conjugate_16ic_u_avx2(lv_16sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, unsigned int num_points){ ++- unsigned int number = 0; ++- const unsigned int oneEigthPoints = num_points / 8; ++- ++- __m256i x, y, realz, imagz; ++- lv_16sc_t* c = cVector; ++- const lv_8sc_t* a = aVector; ++- const lv_8sc_t* b = bVector; ++- __m256i conjugateSign = _mm256_set_epi16(-1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1); ++- ++- for(;number < oneEigthPoints; number++){ ++- // Convert 8 bit values into 16 bit values ++- x = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)a)); ++- y = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)b)); ++- ++- // Calculate the ar*cr - ai*(-ci) portions ++- realz = _mm256_madd_epi16(x,y); ++- ++- // Calculate the complex conjugate of the cr + ci j values ++- y = _mm256_sign_epi16(y, conjugateSign); ++- ++- // Shift the order of the cr and ci values ++- y = _mm256_shufflehi_epi16(_mm256_shufflelo_epi16(y, _MM_SHUFFLE(2,3,0,1) ), _MM_SHUFFLE(2,3,0,1)); ++- ++- // Calculate the ar*(-ci) + cr*(ai) ++- imagz = _mm256_madd_epi16(x,y); ++- ++- // Perform the addition of products ++- ++- _mm256_storeu_si256((__m256i*)c, _mm256_packs_epi32(_mm256_unpacklo_epi32(realz, imagz), _mm256_unpackhi_epi32(realz, imagz))); ++- ++- a += 8; ++- b += 8; ++- c += 8; ++- } ++- ++- number = oneEigthPoints * 8; ++- int16_t* c16Ptr = (int16_t*)&cVector[number]; ++- int8_t* a8Ptr = (int8_t*)&aVector[number]; ++- int8_t* b8Ptr = (int8_t*)&bVector[number]; ++- for(; number < num_points; number++){ ++- float aReal = (float)*a8Ptr++; ++- float aImag = (float)*a8Ptr++; ++- lv_32fc_t aVal = lv_cmake(aReal, aImag ); ++- float bReal = (float)*b8Ptr++; ++- float bImag = (float)*b8Ptr++; ++- lv_32fc_t bVal = lv_cmake( bReal, -bImag ); ++- lv_32fc_t temp = aVal * bVal; ++- ++- *c16Ptr++ = (int16_t)lv_creal(temp); ++- *c16Ptr++ = (int16_t)lv_cimag(temp); ++- } +++static inline void volk_8ic_x2_multiply_conjugate_16ic_u_avx2(lv_16sc_t* cVector, +++ const lv_8sc_t* aVector, +++ const lv_8sc_t* bVector, +++ unsigned int num_points) +++{ +++ unsigned int number = 0; +++ const unsigned int oneEigthPoints = num_points / 8; +++ 
+++ __m256i x, y, realz, imagz; +++ lv_16sc_t* c = cVector; +++ const lv_8sc_t* a = aVector; +++ const lv_8sc_t* b = bVector; +++ __m256i conjugateSign = +++ _mm256_set_epi16(-1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1); +++ +++ for (; number < oneEigthPoints; number++) { +++ // Convert 8 bit values into 16 bit values +++ x = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)a)); +++ y = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)b)); +++ +++ // Calculate the ar*cr - ai*(-ci) portions +++ realz = _mm256_madd_epi16(x, y); +++ +++ // Calculate the complex conjugate of the cr + ci j values +++ y = _mm256_sign_epi16(y, conjugateSign); +++ +++ // Shift the order of the cr and ci values +++ y = _mm256_shufflehi_epi16(_mm256_shufflelo_epi16(y, _MM_SHUFFLE(2, 3, 0, 1)), +++ _MM_SHUFFLE(2, 3, 0, 1)); +++ +++ // Calculate the ar*(-ci) + cr*(ai) +++ imagz = _mm256_madd_epi16(x, y); +++ +++ // Perform the addition of products +++ +++ _mm256_storeu_si256((__m256i*)c, +++ _mm256_packs_epi32(_mm256_unpacklo_epi32(realz, imagz), +++ _mm256_unpackhi_epi32(realz, imagz))); +++ +++ a += 8; +++ b += 8; +++ c += 8; +++ } +++ +++ number = oneEigthPoints * 8; +++ int16_t* c16Ptr = (int16_t*)&cVector[number]; +++ int8_t* a8Ptr = (int8_t*)&aVector[number]; +++ int8_t* b8Ptr = (int8_t*)&bVector[number]; +++ for (; number < num_points; number++) { +++ float aReal = (float)*a8Ptr++; +++ float aImag = (float)*a8Ptr++; +++ lv_32fc_t aVal = lv_cmake(aReal, aImag); +++ float bReal = (float)*b8Ptr++; +++ float bImag = (float)*b8Ptr++; +++ lv_32fc_t bVal = lv_cmake(bReal, -bImag); +++ lv_32fc_t temp = aVal * bVal; +++ +++ *c16Ptr++ = (int16_t)lv_creal(temp); +++ *c16Ptr++ = (int16_t)lv_cimag(temp); +++ } ++ } ++ #endif /* LV_HAVE_AVX2 */ ++ ++diff --git a/kernels/volk/volk_8ic_x2_s32f_multiply_conjugate_32fc.h b/kernels/volk/volk_8ic_x2_s32f_multiply_conjugate_32fc.h ++index 82e40c8..db6bd7a 100644 ++--- a/kernels/volk/volk_8ic_x2_s32f_multiply_conjugate_32fc.h +++++ b/kernels/volk/volk_8ic_x2_s32f_multiply_conjugate_32fc.h ++@@ -30,14 +30,15 @@ ++ * ++ * Dispatcher Prototype ++ * \code ++- * void volk_8ic_x2_s32f_multiply_conjugate_32fc(lv_32fc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, const float scalar, unsigned int num_points) ++- * \endcode +++ * void volk_8ic_x2_s32f_multiply_conjugate_32fc(lv_32fc_t* cVector, const lv_8sc_t* +++ * aVector, const lv_8sc_t* bVector, const float scalar, unsigned int num_points) \endcode ++ * ++ * \b Inputs ++ * \li aVector: One of the complex vectors to be multiplied. ++- * \li bVector: The complex vector which will be converted to complex conjugate and multiplied. ++- * \li scalar: each output value is scaled by 1/scalar. ++- * \li num_points: The number of complex values in aVector and bVector to be multiplied together and stored into cVector. +++ * \li bVector: The complex vector which will be converted to complex conjugate and +++ * multiplied. \li scalar: each output value is scaled by 1/scalar. \li num_points: The +++ * number of complex values in aVector and bVector to be multiplied together and stored +++ * into cVector. ++ * ++ * \b Outputs ++ * \li cVector: The complex vector where the results will be stored. 
++@@ -64,160 +65,167 @@ ++ #include ++ ++ static inline void ++-volk_8ic_x2_s32f_multiply_conjugate_32fc_a_avx2(lv_32fc_t* cVector, const lv_8sc_t* aVector, ++- const lv_8sc_t* bVector, const float scalar, ++- unsigned int num_points) +++volk_8ic_x2_s32f_multiply_conjugate_32fc_a_avx2(lv_32fc_t* cVector, +++ const lv_8sc_t* aVector, +++ const lv_8sc_t* bVector, +++ const float scalar, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int oneEigthPoints = num_points / 8; ++- ++- __m256i x, y, realz, imagz; ++- __m256 ret, retlo, rethi; ++- lv_32fc_t* c = cVector; ++- const lv_8sc_t* a = aVector; ++- const lv_8sc_t* b = bVector; ++- __m256i conjugateSign = _mm256_set_epi16(-1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1); ++- ++- __m256 invScalar = _mm256_set1_ps(1.0/scalar); ++- ++- for(;number < oneEigthPoints; number++){ ++- // Convert 8 bit values into 16 bit values ++- x = _mm256_cvtepi8_epi16(_mm_load_si128((__m128i*)a)); ++- y = _mm256_cvtepi8_epi16(_mm_load_si128((__m128i*)b)); ++- ++- // Calculate the ar*cr - ai*(-ci) portions ++- realz = _mm256_madd_epi16(x,y); ++- ++- // Calculate the complex conjugate of the cr + ci j values ++- y = _mm256_sign_epi16(y, conjugateSign); ++- ++- // Shift the order of the cr and ci values ++- y = _mm256_shufflehi_epi16(_mm256_shufflelo_epi16(y, _MM_SHUFFLE(2,3,0,1) ), _MM_SHUFFLE(2,3,0,1)); ++- ++- // Calculate the ar*(-ci) + cr*(ai) ++- imagz = _mm256_madd_epi16(x,y); ++- ++- // Interleave real and imaginary and then convert to float values ++- retlo = _mm256_cvtepi32_ps(_mm256_unpacklo_epi32(realz, imagz)); ++- ++- // Normalize the floating point values ++- retlo = _mm256_mul_ps(retlo, invScalar); ++- ++- // Interleave real and imaginary and then convert to float values ++- rethi = _mm256_cvtepi32_ps(_mm256_unpackhi_epi32(realz, imagz)); ++- ++- // Normalize the floating point values ++- rethi = _mm256_mul_ps(rethi, invScalar); ++- ++- ret = _mm256_permute2f128_ps(retlo, rethi, 0b00100000); ++- _mm256_store_ps((float*)c, ret); ++- c += 4; ++- ++- ret = _mm256_permute2f128_ps(retlo, rethi, 0b00110001); ++- _mm256_store_ps((float*)c, ret); ++- c += 4; ++- ++- a += 8; ++- b += 8; ++- } ++- ++- number = oneEigthPoints * 8; ++- float* cFloatPtr = (float*)&cVector[number]; ++- int8_t* a8Ptr = (int8_t*)&aVector[number]; ++- int8_t* b8Ptr = (int8_t*)&bVector[number]; ++- for(; number < num_points; number++){ ++- float aReal = (float)*a8Ptr++; ++- float aImag = (float)*a8Ptr++; ++- lv_32fc_t aVal = lv_cmake(aReal, aImag ); ++- float bReal = (float)*b8Ptr++; ++- float bImag = (float)*b8Ptr++; ++- lv_32fc_t bVal = lv_cmake( bReal, -bImag ); ++- lv_32fc_t temp = aVal * bVal; ++- ++- *cFloatPtr++ = lv_creal(temp) / scalar; ++- *cFloatPtr++ = lv_cimag(temp) / scalar; ++- } +++ unsigned int number = 0; +++ const unsigned int oneEigthPoints = num_points / 8; +++ +++ __m256i x, y, realz, imagz; +++ __m256 ret, retlo, rethi; +++ lv_32fc_t* c = cVector; +++ const lv_8sc_t* a = aVector; +++ const lv_8sc_t* b = bVector; +++ __m256i conjugateSign = +++ _mm256_set_epi16(-1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1); +++ +++ __m256 invScalar = _mm256_set1_ps(1.0 / scalar); +++ +++ for (; number < oneEigthPoints; number++) { +++ // Convert 8 bit values into 16 bit values +++ x = _mm256_cvtepi8_epi16(_mm_load_si128((__m128i*)a)); +++ y = _mm256_cvtepi8_epi16(_mm_load_si128((__m128i*)b)); +++ +++ // Calculate the ar*cr - ai*(-ci) portions +++ realz = _mm256_madd_epi16(x, y); +++ +++ // Calculate the complex conjugate of the 
cr + ci j values +++ y = _mm256_sign_epi16(y, conjugateSign); +++ +++ // Shift the order of the cr and ci values +++ y = _mm256_shufflehi_epi16(_mm256_shufflelo_epi16(y, _MM_SHUFFLE(2, 3, 0, 1)), +++ _MM_SHUFFLE(2, 3, 0, 1)); +++ +++ // Calculate the ar*(-ci) + cr*(ai) +++ imagz = _mm256_madd_epi16(x, y); +++ +++ // Interleave real and imaginary and then convert to float values +++ retlo = _mm256_cvtepi32_ps(_mm256_unpacklo_epi32(realz, imagz)); +++ +++ // Normalize the floating point values +++ retlo = _mm256_mul_ps(retlo, invScalar); +++ +++ // Interleave real and imaginary and then convert to float values +++ rethi = _mm256_cvtepi32_ps(_mm256_unpackhi_epi32(realz, imagz)); +++ +++ // Normalize the floating point values +++ rethi = _mm256_mul_ps(rethi, invScalar); +++ +++ ret = _mm256_permute2f128_ps(retlo, rethi, 0b00100000); +++ _mm256_store_ps((float*)c, ret); +++ c += 4; +++ +++ ret = _mm256_permute2f128_ps(retlo, rethi, 0b00110001); +++ _mm256_store_ps((float*)c, ret); +++ c += 4; +++ +++ a += 8; +++ b += 8; +++ } +++ +++ number = oneEigthPoints * 8; +++ float* cFloatPtr = (float*)&cVector[number]; +++ int8_t* a8Ptr = (int8_t*)&aVector[number]; +++ int8_t* b8Ptr = (int8_t*)&bVector[number]; +++ for (; number < num_points; number++) { +++ float aReal = (float)*a8Ptr++; +++ float aImag = (float)*a8Ptr++; +++ lv_32fc_t aVal = lv_cmake(aReal, aImag); +++ float bReal = (float)*b8Ptr++; +++ float bImag = (float)*b8Ptr++; +++ lv_32fc_t bVal = lv_cmake(bReal, -bImag); +++ lv_32fc_t temp = aVal * bVal; +++ +++ *cFloatPtr++ = lv_creal(temp) / scalar; +++ *cFloatPtr++ = lv_cimag(temp) / scalar; +++ } ++ } ++-#endif /* LV_HAVE_AVX2*/ +++#endif /* LV_HAVE_AVX2*/ ++ ++ ++ #ifdef LV_HAVE_SSE4_1 ++ #include ++ ++ static inline void ++-volk_8ic_x2_s32f_multiply_conjugate_32fc_a_sse4_1(lv_32fc_t* cVector, const lv_8sc_t* aVector, ++- const lv_8sc_t* bVector, const float scalar, +++volk_8ic_x2_s32f_multiply_conjugate_32fc_a_sse4_1(lv_32fc_t* cVector, +++ const lv_8sc_t* aVector, +++ const lv_8sc_t* bVector, +++ const float scalar, ++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int quarterPoints = num_points / 4; ++- ++- __m128i x, y, realz, imagz; ++- __m128 ret; ++- lv_32fc_t* c = cVector; ++- const lv_8sc_t* a = aVector; ++- const lv_8sc_t* b = bVector; ++- __m128i conjugateSign = _mm_set_epi16(-1, 1, -1, 1, -1, 1, -1, 1); ++- ++- __m128 invScalar = _mm_set_ps1(1.0/scalar); ++- ++- for(;number < quarterPoints; number++){ ++- // Convert into 8 bit values into 16 bit values ++- x = _mm_cvtepi8_epi16(_mm_loadl_epi64((__m128i*)a)); ++- y = _mm_cvtepi8_epi16(_mm_loadl_epi64((__m128i*)b)); ++- ++- // Calculate the ar*cr - ai*(-ci) portions ++- realz = _mm_madd_epi16(x,y); ++- ++- // Calculate the complex conjugate of the cr + ci j values ++- y = _mm_sign_epi16(y, conjugateSign); ++- ++- // Shift the order of the cr and ci values ++- y = _mm_shufflehi_epi16(_mm_shufflelo_epi16(y, _MM_SHUFFLE(2,3,0,1) ), _MM_SHUFFLE(2,3,0,1)); ++- ++- // Calculate the ar*(-ci) + cr*(ai) ++- imagz = _mm_madd_epi16(x,y); ++- ++- // Interleave real and imaginary and then convert to float values ++- ret = _mm_cvtepi32_ps(_mm_unpacklo_epi32(realz, imagz)); ++- ++- // Normalize the floating point values ++- ret = _mm_mul_ps(ret, invScalar); ++- ++- // Store the floating point values ++- _mm_store_ps((float*)c, ret); ++- c += 2; ++- ++- // Interleave real and imaginary and then convert to float values ++- ret = _mm_cvtepi32_ps(_mm_unpackhi_epi32(realz, imagz)); ++- ++- // Normalize the floating 
point values ++- ret = _mm_mul_ps(ret, invScalar); ++- ++- // Store the floating point values ++- _mm_store_ps((float*)c, ret); ++- c += 2; ++- ++- a += 4; ++- b += 4; ++- } ++- ++- number = quarterPoints * 4; ++- float* cFloatPtr = (float*)&cVector[number]; ++- int8_t* a8Ptr = (int8_t*)&aVector[number]; ++- int8_t* b8Ptr = (int8_t*)&bVector[number]; ++- for(; number < num_points; number++){ ++- float aReal = (float)*a8Ptr++; ++- float aImag = (float)*a8Ptr++; ++- lv_32fc_t aVal = lv_cmake(aReal, aImag ); ++- float bReal = (float)*b8Ptr++; ++- float bImag = (float)*b8Ptr++; ++- lv_32fc_t bVal = lv_cmake( bReal, -bImag ); ++- lv_32fc_t temp = aVal * bVal; ++- ++- *cFloatPtr++ = lv_creal(temp) / scalar; ++- *cFloatPtr++ = lv_cimag(temp) / scalar; ++- } +++ unsigned int number = 0; +++ const unsigned int quarterPoints = num_points / 4; +++ +++ __m128i x, y, realz, imagz; +++ __m128 ret; +++ lv_32fc_t* c = cVector; +++ const lv_8sc_t* a = aVector; +++ const lv_8sc_t* b = bVector; +++ __m128i conjugateSign = _mm_set_epi16(-1, 1, -1, 1, -1, 1, -1, 1); +++ +++ __m128 invScalar = _mm_set_ps1(1.0 / scalar); +++ +++ for (; number < quarterPoints; number++) { +++ // Convert into 8 bit values into 16 bit values +++ x = _mm_cvtepi8_epi16(_mm_loadl_epi64((__m128i*)a)); +++ y = _mm_cvtepi8_epi16(_mm_loadl_epi64((__m128i*)b)); +++ +++ // Calculate the ar*cr - ai*(-ci) portions +++ realz = _mm_madd_epi16(x, y); +++ +++ // Calculate the complex conjugate of the cr + ci j values +++ y = _mm_sign_epi16(y, conjugateSign); +++ +++ // Shift the order of the cr and ci values +++ y = _mm_shufflehi_epi16(_mm_shufflelo_epi16(y, _MM_SHUFFLE(2, 3, 0, 1)), +++ _MM_SHUFFLE(2, 3, 0, 1)); +++ +++ // Calculate the ar*(-ci) + cr*(ai) +++ imagz = _mm_madd_epi16(x, y); +++ +++ // Interleave real and imaginary and then convert to float values +++ ret = _mm_cvtepi32_ps(_mm_unpacklo_epi32(realz, imagz)); +++ +++ // Normalize the floating point values +++ ret = _mm_mul_ps(ret, invScalar); +++ +++ // Store the floating point values +++ _mm_store_ps((float*)c, ret); +++ c += 2; +++ +++ // Interleave real and imaginary and then convert to float values +++ ret = _mm_cvtepi32_ps(_mm_unpackhi_epi32(realz, imagz)); +++ +++ // Normalize the floating point values +++ ret = _mm_mul_ps(ret, invScalar); +++ +++ // Store the floating point values +++ _mm_store_ps((float*)c, ret); +++ c += 2; +++ +++ a += 4; +++ b += 4; +++ } +++ +++ number = quarterPoints * 4; +++ float* cFloatPtr = (float*)&cVector[number]; +++ int8_t* a8Ptr = (int8_t*)&aVector[number]; +++ int8_t* b8Ptr = (int8_t*)&bVector[number]; +++ for (; number < num_points; number++) { +++ float aReal = (float)*a8Ptr++; +++ float aImag = (float)*a8Ptr++; +++ lv_32fc_t aVal = lv_cmake(aReal, aImag); +++ float bReal = (float)*b8Ptr++; +++ float bImag = (float)*b8Ptr++; +++ lv_32fc_t bVal = lv_cmake(bReal, -bImag); +++ lv_32fc_t temp = aVal * bVal; +++ +++ *cFloatPtr++ = lv_creal(temp) / scalar; +++ *cFloatPtr++ = lv_cimag(temp) / scalar; +++ } ++ } ++ #endif /* LV_HAVE_SSE4_1 */ ++ ++@@ -225,27 +233,29 @@ volk_8ic_x2_s32f_multiply_conjugate_32fc_a_sse4_1(lv_32fc_t* cVector, const lv_8 ++ #ifdef LV_HAVE_GENERIC ++ ++ static inline void ++-volk_8ic_x2_s32f_multiply_conjugate_32fc_generic(lv_32fc_t* cVector, const lv_8sc_t* aVector, ++- const lv_8sc_t* bVector, const float scalar, +++volk_8ic_x2_s32f_multiply_conjugate_32fc_generic(lv_32fc_t* cVector, +++ const lv_8sc_t* aVector, +++ const lv_8sc_t* bVector, +++ const float scalar, ++ unsigned int num_points) ++ { ++- unsigned int number = 
0; ++- float* cPtr = (float*)cVector; ++- const float invScalar = 1.0 / scalar; ++- int8_t* a8Ptr = (int8_t*)aVector; ++- int8_t* b8Ptr = (int8_t*)bVector; ++- for(number = 0; number < num_points; number++){ ++- float aReal = (float)*a8Ptr++; ++- float aImag = (float)*a8Ptr++; ++- lv_32fc_t aVal = lv_cmake(aReal, aImag ); ++- float bReal = (float)*b8Ptr++; ++- float bImag = (float)*b8Ptr++; ++- lv_32fc_t bVal = lv_cmake( bReal, -bImag ); ++- lv_32fc_t temp = aVal * bVal; ++- ++- *cPtr++ = (lv_creal(temp) * invScalar); ++- *cPtr++ = (lv_cimag(temp) * invScalar); ++- } +++ unsigned int number = 0; +++ float* cPtr = (float*)cVector; +++ const float invScalar = 1.0 / scalar; +++ int8_t* a8Ptr = (int8_t*)aVector; +++ int8_t* b8Ptr = (int8_t*)bVector; +++ for (number = 0; number < num_points; number++) { +++ float aReal = (float)*a8Ptr++; +++ float aImag = (float)*a8Ptr++; +++ lv_32fc_t aVal = lv_cmake(aReal, aImag); +++ float bReal = (float)*b8Ptr++; +++ float bImag = (float)*b8Ptr++; +++ lv_32fc_t bVal = lv_cmake(bReal, -bImag); +++ lv_32fc_t temp = aVal * bVal; +++ +++ *cPtr++ = (lv_creal(temp) * invScalar); +++ *cPtr++ = (lv_cimag(temp) * invScalar); +++ } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++@@ -263,81 +273,85 @@ volk_8ic_x2_s32f_multiply_conjugate_32fc_generic(lv_32fc_t* cVector, const lv_8s ++ #include ++ ++ static inline void ++-volk_8ic_x2_s32f_multiply_conjugate_32fc_u_avx2(lv_32fc_t* cVector, const lv_8sc_t* aVector, ++- const lv_8sc_t* bVector, const float scalar, ++- unsigned int num_points) +++volk_8ic_x2_s32f_multiply_conjugate_32fc_u_avx2(lv_32fc_t* cVector, +++ const lv_8sc_t* aVector, +++ const lv_8sc_t* bVector, +++ const float scalar, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int oneEigthPoints = num_points / 8; ++- ++- __m256i x, y, realz, imagz; ++- __m256 ret, retlo, rethi; ++- lv_32fc_t* c = cVector; ++- const lv_8sc_t* a = aVector; ++- const lv_8sc_t* b = bVector; ++- __m256i conjugateSign = _mm256_set_epi16(-1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1); ++- ++- __m256 invScalar = _mm256_set1_ps(1.0/scalar); ++- ++- for(;number < oneEigthPoints; number++){ ++- // Convert 8 bit values into 16 bit values ++- x = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)a)); ++- y = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)b)); ++- ++- // Calculate the ar*cr - ai*(-ci) portions ++- realz = _mm256_madd_epi16(x,y); ++- ++- // Calculate the complex conjugate of the cr + ci j values ++- y = _mm256_sign_epi16(y, conjugateSign); ++- ++- // Shift the order of the cr and ci values ++- y = _mm256_shufflehi_epi16(_mm256_shufflelo_epi16(y, _MM_SHUFFLE(2,3,0,1) ), _MM_SHUFFLE(2,3,0,1)); ++- ++- // Calculate the ar*(-ci) + cr*(ai) ++- imagz = _mm256_madd_epi16(x,y); ++- ++- // Interleave real and imaginary and then convert to float values ++- retlo = _mm256_cvtepi32_ps(_mm256_unpacklo_epi32(realz, imagz)); ++- ++- // Normalize the floating point values ++- retlo = _mm256_mul_ps(retlo, invScalar); ++- ++- // Interleave real and imaginary and then convert to float values ++- rethi = _mm256_cvtepi32_ps(_mm256_unpackhi_epi32(realz, imagz)); ++- ++- // Normalize the floating point values ++- rethi = _mm256_mul_ps(rethi, invScalar); ++- ++- ret = _mm256_permute2f128_ps(retlo, rethi, 0b00100000); ++- _mm256_storeu_ps((float*)c, ret); ++- c += 4; ++- ++- ret = _mm256_permute2f128_ps(retlo, rethi, 0b00110001); ++- _mm256_storeu_ps((float*)c, ret); ++- c += 4; ++- ++- a += 8; ++- b += 8; ++- } ++- ++- number = oneEigthPoints * 8; ++- float* 
cFloatPtr = (float*)&cVector[number]; ++- int8_t* a8Ptr = (int8_t*)&aVector[number]; ++- int8_t* b8Ptr = (int8_t*)&bVector[number]; ++- for(; number < num_points; number++){ ++- float aReal = (float)*a8Ptr++; ++- float aImag = (float)*a8Ptr++; ++- lv_32fc_t aVal = lv_cmake(aReal, aImag ); ++- float bReal = (float)*b8Ptr++; ++- float bImag = (float)*b8Ptr++; ++- lv_32fc_t bVal = lv_cmake( bReal, -bImag ); ++- lv_32fc_t temp = aVal * bVal; ++- ++- *cFloatPtr++ = lv_creal(temp) / scalar; ++- *cFloatPtr++ = lv_cimag(temp) / scalar; ++- } +++ unsigned int number = 0; +++ const unsigned int oneEigthPoints = num_points / 8; +++ +++ __m256i x, y, realz, imagz; +++ __m256 ret, retlo, rethi; +++ lv_32fc_t* c = cVector; +++ const lv_8sc_t* a = aVector; +++ const lv_8sc_t* b = bVector; +++ __m256i conjugateSign = +++ _mm256_set_epi16(-1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1); +++ +++ __m256 invScalar = _mm256_set1_ps(1.0 / scalar); +++ +++ for (; number < oneEigthPoints; number++) { +++ // Convert 8 bit values into 16 bit values +++ x = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)a)); +++ y = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)b)); +++ +++ // Calculate the ar*cr - ai*(-ci) portions +++ realz = _mm256_madd_epi16(x, y); +++ +++ // Calculate the complex conjugate of the cr + ci j values +++ y = _mm256_sign_epi16(y, conjugateSign); +++ +++ // Shift the order of the cr and ci values +++ y = _mm256_shufflehi_epi16(_mm256_shufflelo_epi16(y, _MM_SHUFFLE(2, 3, 0, 1)), +++ _MM_SHUFFLE(2, 3, 0, 1)); +++ +++ // Calculate the ar*(-ci) + cr*(ai) +++ imagz = _mm256_madd_epi16(x, y); +++ +++ // Interleave real and imaginary and then convert to float values +++ retlo = _mm256_cvtepi32_ps(_mm256_unpacklo_epi32(realz, imagz)); +++ +++ // Normalize the floating point values +++ retlo = _mm256_mul_ps(retlo, invScalar); +++ +++ // Interleave real and imaginary and then convert to float values +++ rethi = _mm256_cvtepi32_ps(_mm256_unpackhi_epi32(realz, imagz)); +++ +++ // Normalize the floating point values +++ rethi = _mm256_mul_ps(rethi, invScalar); +++ +++ ret = _mm256_permute2f128_ps(retlo, rethi, 0b00100000); +++ _mm256_storeu_ps((float*)c, ret); +++ c += 4; +++ +++ ret = _mm256_permute2f128_ps(retlo, rethi, 0b00110001); +++ _mm256_storeu_ps((float*)c, ret); +++ c += 4; +++ +++ a += 8; +++ b += 8; +++ } +++ +++ number = oneEigthPoints * 8; +++ float* cFloatPtr = (float*)&cVector[number]; +++ int8_t* a8Ptr = (int8_t*)&aVector[number]; +++ int8_t* b8Ptr = (int8_t*)&bVector[number]; +++ for (; number < num_points; number++) { +++ float aReal = (float)*a8Ptr++; +++ float aImag = (float)*a8Ptr++; +++ lv_32fc_t aVal = lv_cmake(aReal, aImag); +++ float bReal = (float)*b8Ptr++; +++ float bImag = (float)*b8Ptr++; +++ lv_32fc_t bVal = lv_cmake(bReal, -bImag); +++ lv_32fc_t temp = aVal * bVal; +++ +++ *cFloatPtr++ = lv_creal(temp) / scalar; +++ *cFloatPtr++ = lv_cimag(temp) / scalar; +++ } ++ } ++-#endif /* LV_HAVE_AVX2*/ +++#endif /* LV_HAVE_AVX2*/ ++ ++ ++ #endif /* INCLUDED_volk_8ic_x2_s32f_multiply_conjugate_32fc_u_H */ ++diff --git a/kernels/volk/volk_8u_conv_k7_r2puppet_8u.h b/kernels/volk/volk_8u_conv_k7_r2puppet_8u.h ++index 00f83de..69287cd 100644 ++--- a/kernels/volk/volk_8u_conv_k7_r2puppet_8u.h +++++ b/kernels/volk/volk_8u_conv_k7_r2puppet_8u.h ++@@ -23,21 +23,21 @@ ++ #ifndef INCLUDED_volk_8u_conv_k7_r2puppet_8u_H ++ #define INCLUDED_volk_8u_conv_k7_r2puppet_8u_H ++ +++#include ++ #include ++ #include ++-#include ++ ++ typedef union { ++- //decision_t is a BIT vector ++- unsigned char* t; 
++- unsigned int* w; +++ // decision_t is a BIT vector +++ unsigned char* t; +++ unsigned int* w; ++ } p_decision_t; ++ ++ static inline int parity(int x, unsigned char* Partab) ++ { ++- x ^= (x >> 16); ++- x ^= (x >> 8); ++- return Partab[x]; +++ x ^= (x >> 16); +++ x ^= (x >> 8); +++ return Partab[x]; ++ } ++ ++ static inline int chainback_viterbi(unsigned char* data, ++@@ -46,135 +46,143 @@ static inline int chainback_viterbi(unsigned char* data, ++ unsigned int tailsize, ++ unsigned char* decisions) ++ { ++- unsigned char* d; ++- int d_ADDSHIFT = 0; ++- int d_numstates = (1 << 6); ++- int d_decision_t_size = d_numstates/8; ++- unsigned int d_k = 7; ++- int d_framebits = nbits; ++- /* ADDSHIFT and SUBSHIFT make sure that the thing returned is a byte. */ ++- d = decisions; ++- /* Make room beyond the end of the encoder register so we can ++- * accumulate a full byte of decoded data ++- */ ++- ++- endstate = (endstate%d_numstates) << d_ADDSHIFT; ++- ++- /* The store into data[] only needs to be done every 8 bits. ++- * But this avoids a conditional branch, and the writes will ++- * combine in the cache anyway ++- */ ++- ++- d += tailsize * d_decision_t_size ; /* Look past tail */ ++- int retval; ++- int dif = tailsize - (d_k - 1); ++- //printf("break, %d, %d\n", dif, (nbits+dif)%d_framebits); ++- p_decision_t dec; ++- while(nbits-- > d_framebits - (d_k - 1)) { ++- int k; ++- dec.t = &d[nbits * d_decision_t_size]; ++- k = (dec.w[(endstate>>d_ADDSHIFT)/32] >> ((endstate>>d_ADDSHIFT)%32)) & 1; ++- ++- endstate = (endstate >> 1) | (k << (d_k-2+d_ADDSHIFT)); ++- //data[((nbits+dif)%nbits)>>3] = endstate>>d_SUBSHIFT; ++- //printf("%d, %d\n", k, (nbits+dif)%d_framebits); ++- data[((nbits+dif)%d_framebits)] = k; ++- ++- retval = endstate; ++- } ++- nbits += 1; ++- ++- while(nbits-- != 0) { ++- int k; ++- ++- dec.t = &d[nbits * d_decision_t_size]; ++- ++- k = (dec.w[(endstate>>d_ADDSHIFT)/32] >> ((endstate>>d_ADDSHIFT)%32)) & 1; ++- ++- endstate = (endstate >> 1) | (k << (d_k-2+d_ADDSHIFT)); ++- data[((nbits+dif)%d_framebits)] = k; ++- } ++- //printf("%d, %d, %d, %d, %d, %d, %d, %d\n", data[4095],data[4094],data[4093],data[4092],data[4091],data[4090],data[4089],data[4088]); ++- ++- ++- return retval >> d_ADDSHIFT; +++ unsigned char* d; +++ int d_ADDSHIFT = 0; +++ int d_numstates = (1 << 6); +++ int d_decision_t_size = d_numstates / 8; +++ unsigned int d_k = 7; +++ int d_framebits = nbits; +++ /* ADDSHIFT and SUBSHIFT make sure that the thing returned is a byte. */ +++ d = decisions; +++ /* Make room beyond the end of the encoder register so we can +++ * accumulate a full byte of decoded data +++ */ +++ +++ endstate = (endstate % d_numstates) << d_ADDSHIFT; +++ +++ /* The store into data[] only needs to be done every 8 bits. 
+++ * But this avoids a conditional branch, and the writes will +++ * combine in the cache anyway +++ */ +++ +++ d += tailsize * d_decision_t_size; /* Look past tail */ +++ int retval; +++ int dif = tailsize - (d_k - 1); +++ // printf("break, %d, %d\n", dif, (nbits+dif)%d_framebits); +++ p_decision_t dec; +++ while (nbits-- > d_framebits - (d_k - 1)) { +++ int k; +++ dec.t = &d[nbits * d_decision_t_size]; +++ k = (dec.w[(endstate >> d_ADDSHIFT) / 32] >> ((endstate >> d_ADDSHIFT) % 32)) & 1; +++ +++ endstate = (endstate >> 1) | (k << (d_k - 2 + d_ADDSHIFT)); +++ // data[((nbits+dif)%nbits)>>3] = endstate>>d_SUBSHIFT; +++ // printf("%d, %d\n", k, (nbits+dif)%d_framebits); +++ data[((nbits + dif) % d_framebits)] = k; +++ +++ retval = endstate; +++ } +++ nbits += 1; +++ +++ while (nbits-- != 0) { +++ int k; +++ +++ dec.t = &d[nbits * d_decision_t_size]; +++ +++ k = (dec.w[(endstate >> d_ADDSHIFT) / 32] >> ((endstate >> d_ADDSHIFT) % 32)) & 1; +++ +++ endstate = (endstate >> 1) | (k << (d_k - 2 + d_ADDSHIFT)); +++ data[((nbits + dif) % d_framebits)] = k; +++ } +++ // printf("%d, %d, %d, %d, %d, %d, %d, %d\n", +++ // data[4095],data[4094],data[4093],data[4092],data[4091],data[4090],data[4089],data[4088]); +++ +++ +++ return retval >> d_ADDSHIFT; ++ } ++ ++ ++ #if LV_HAVE_SSE3 ++ ++-#include ++ #include ++-#include ++ #include +++#include ++ #include +++#include ++ ++-static inline void volk_8u_conv_k7_r2puppet_8u_spiral(unsigned char* syms, unsigned char* dec, unsigned int framebits) { ++- ++- ++- static int once = 1; ++- int d_numstates = (1 << 6); ++- int rate = 2; ++- static unsigned char* D; ++- static unsigned char* Y; ++- static unsigned char* X; ++- static unsigned int excess = 6; ++- static unsigned char* Branchtab; ++- static unsigned char Partab[256]; ++- ++- int d_polys[2] = {79, 109}; ++- ++- ++- if(once) { ++- ++- X = (unsigned char*)volk_malloc(2*d_numstates, volk_get_alignment()); ++- Y = X + d_numstates; ++- Branchtab = (unsigned char*)volk_malloc(d_numstates/2*rate, volk_get_alignment()); ++- D = (unsigned char*)volk_malloc((d_numstates/8) * (framebits + 6), volk_get_alignment()); ++- int state, i; ++- int cnt,ti; ++- ++- /* Initialize parity lookup table */ ++- for(i=0;i<256;i++){ ++- cnt = 0; ++- ti = i; ++- while(ti){ ++- if(ti & 1) ++- cnt++; ++- ti >>= 1; ++- } ++- Partab[i] = cnt & 1; ++- } ++- /* Initialize the branch table */ ++- for(state=0;state < d_numstates/2;state++){ ++- for(i=0; i>= 1; +++ } +++ Partab[i] = cnt & 1; +++ } +++ /* Initialize the branch table */ +++ for (state = 0; state < d_numstates / 2; state++) { +++ for (i = 0; i < rate; i++) { +++ Branchtab[i * d_numstates / 2 + state] = +++ parity((2 * state) & d_polys[i], Partab) ? 
255 : 0; +++ } +++ } +++ +++ once = 0; +++ } +++ +++ // unbias the old_metrics +++ memset(X, 31, d_numstates); ++ ++- // initialize decisions ++- memset(D, 0, (d_numstates/8) * (framebits + 6)); +++ // initialize decisions +++ memset(D, 0, (d_numstates / 8) * (framebits + 6)); ++ ++- volk_8u_x4_conv_k7_r2_8u_spiral(Y, X, syms, D, framebits/2 - excess, excess, Branchtab); +++ volk_8u_x4_conv_k7_r2_8u_spiral( +++ Y, X, syms, D, framebits / 2 - excess, excess, Branchtab); ++ ++- unsigned int min = X[0]; ++- int i = 0, state = 0; ++- for(i = 0; i < (d_numstates); ++i) { ++- if(X[i] < min) { ++- min = X[i]; ++- state = i; +++ unsigned int min = X[0]; +++ int i = 0, state = 0; +++ for (i = 0; i < (d_numstates); ++i) { +++ if (X[i] < min) { +++ min = X[i]; +++ state = i; +++ } ++ } ++- } ++ ++- chainback_viterbi(dec, framebits/2 -excess, state, excess, D); +++ chainback_viterbi(dec, framebits / 2 - excess, state, excess, D); ++ ++- return; +++ return; ++ } ++ ++ #endif /*LV_HAVE_SSE3*/ ++@@ -185,151 +193,161 @@ static inline void volk_8u_conv_k7_r2puppet_8u_spiral(unsigned char* syms, unsig ++ #include ++ #include ++ ++-static inline void volk_8u_conv_k7_r2puppet_8u_avx2(unsigned char* syms, unsigned char* dec, unsigned int framebits) { ++- ++- ++- static int once = 1; ++- int d_numstates = (1 << 6); ++- int rate = 2; ++- static unsigned char* D; ++- static unsigned char* Y; ++- static unsigned char* X; ++- static unsigned int excess = 6; ++- static unsigned char* Branchtab; ++- static unsigned char Partab[256]; ++- ++- int d_polys[2] = {79, 109}; ++- ++- ++- if(once) { ++- ++- X = (unsigned char*)volk_malloc(2*d_numstates, volk_get_alignment()); ++- Y = X + d_numstates; ++- Branchtab = (unsigned char*)volk_malloc(d_numstates/2*rate, volk_get_alignment()); ++- D = (unsigned char*)volk_malloc((d_numstates/8) * (framebits + 6), volk_get_alignment()); ++- int state, i; ++- int cnt,ti; ++- ++- /* Initialize parity lookup table */ ++- for(i=0;i<256;i++){ ++- cnt = 0; ++- ti = i; ++- while(ti){ ++- if(ti & 1) ++- cnt++; ++- ti >>= 1; ++- } ++- Partab[i] = cnt & 1; ++- } ++- /* Initialize the branch table */ ++- for(state=0;state < d_numstates/2;state++){ ++- for(i=0; i>= 1; +++ } +++ Partab[i] = cnt & 1; +++ } +++ /* Initialize the branch table */ +++ for (state = 0; state < d_numstates / 2; state++) { +++ for (i = 0; i < rate; i++) { +++ Branchtab[i * d_numstates / 2 + state] = +++ parity((2 * state) & d_polys[i], Partab) ? 
255 : 0; +++ } +++ } +++ +++ once = 0; +++ } +++ +++ // unbias the old_metrics +++ memset(X, 31, d_numstates); ++ ++- // initialize decisions ++- memset(D, 0, (d_numstates/8) * (framebits + 6)); +++ // initialize decisions +++ memset(D, 0, (d_numstates / 8) * (framebits + 6)); ++ ++- volk_8u_x4_conv_k7_r2_8u_avx2(Y, X, syms, D, framebits/2 - excess, excess, Branchtab); +++ volk_8u_x4_conv_k7_r2_8u_avx2( +++ Y, X, syms, D, framebits / 2 - excess, excess, Branchtab); ++ ++- unsigned int min = X[0]; ++- int i = 0, state = 0; ++- for(i = 0; i < (d_numstates); ++i) { ++- if(X[i] < min) { ++- min = X[i]; ++- state = i; +++ unsigned int min = X[0]; +++ int i = 0, state = 0; +++ for (i = 0; i < (d_numstates); ++i) { +++ if (X[i] < min) { +++ min = X[i]; +++ state = i; +++ } ++ } ++- } ++ ++- chainback_viterbi(dec, framebits/2 -excess, state, excess, D); +++ chainback_viterbi(dec, framebits / 2 - excess, state, excess, D); ++ ++- return; +++ return; ++ } ++ ++ #endif /*LV_HAVE_AVX2*/ ++ ++ ++- ++ #if LV_HAVE_GENERIC ++ ++ ++-static inline void volk_8u_conv_k7_r2puppet_8u_generic(unsigned char* syms, unsigned char* dec, unsigned int framebits) { ++- ++- ++- ++- static int once = 1; ++- int d_numstates = (1 << 6); ++- int rate = 2; ++- static unsigned char* Y; ++- static unsigned char* X; ++- static unsigned char* D; ++- static unsigned int excess = 6; ++- static unsigned char* Branchtab; ++- static unsigned char Partab[256]; ++- ++- int d_polys[2] = {79, 109}; ++- ++- ++- if(once) { ++- ++- X = (unsigned char*)volk_malloc(2*d_numstates, volk_get_alignment()); ++- Y = X + d_numstates; ++- Branchtab = (unsigned char*)volk_malloc(d_numstates/2*rate, volk_get_alignment()); ++- D = (unsigned char*)volk_malloc((d_numstates/8) * (framebits + 6), volk_get_alignment()); +++static inline void volk_8u_conv_k7_r2puppet_8u_generic(unsigned char* syms, +++ unsigned char* dec, +++ unsigned int framebits) +++{ ++ ++- int state, i; ++- int cnt,ti; ++ ++- /* Initialize parity lookup table */ ++- for(i=0;i<256;i++){ ++- cnt = 0; ++- ti = i; ++- while(ti){ ++- if(ti & 1) ++- cnt++; ++- ti >>= 1; ++- } ++- Partab[i] = cnt & 1; +++ static int once = 1; +++ int d_numstates = (1 << 6); +++ int rate = 2; +++ static unsigned char* Y; +++ static unsigned char* X; +++ static unsigned char* D; +++ static unsigned int excess = 6; +++ static unsigned char* Branchtab; +++ static unsigned char Partab[256]; +++ +++ int d_polys[2] = { 79, 109 }; +++ +++ +++ if (once) { +++ +++ X = (unsigned char*)volk_malloc(2 * d_numstates, volk_get_alignment()); +++ Y = X + d_numstates; +++ Branchtab = +++ (unsigned char*)volk_malloc(d_numstates / 2 * rate, volk_get_alignment()); +++ D = (unsigned char*)volk_malloc((d_numstates / 8) * (framebits + 6), +++ volk_get_alignment()); +++ +++ int state, i; +++ int cnt, ti; +++ +++ /* Initialize parity lookup table */ +++ for (i = 0; i < 256; i++) { +++ cnt = 0; +++ ti = i; +++ while (ti) { +++ if (ti & 1) +++ cnt++; +++ ti >>= 1; +++ } +++ Partab[i] = cnt & 1; +++ } +++ /* Initialize the branch table */ +++ for (state = 0; state < d_numstates / 2; state++) { +++ for (i = 0; i < rate; i++) { +++ Branchtab[i * d_numstates / 2 + state] = +++ parity((2 * state) & d_polys[i], Partab) ? 
255 : 0; +++ } +++ } +++ +++ once = 0; ++ } ++- /* Initialize the branch table */ ++- for(state=0;state < d_numstates/2;state++){ ++- for(i=0; i ++ ++-static inline unsigned int ++-log2_of_power_of_2(unsigned int val){ ++- // algorithm from: http://graphics.stanford.edu/~seander/bithacks.html#IntegerLog ++- static const unsigned int b[] = {0xAAAAAAAA, 0xCCCCCCCC, 0xF0F0F0F0, ++- 0xFF00FF00, 0xFFFF0000}; ++- ++- unsigned int res = (val & b[0]) != 0; ++- res |= ((val & b[4]) != 0) << 4; ++- res |= ((val & b[3]) != 0) << 3; ++- res |= ((val & b[2]) != 0) << 2; ++- res |= ((val & b[1]) != 0) << 1; ++- return res; +++static inline unsigned int log2_of_power_of_2(unsigned int val) +++{ +++ // algorithm from: http://graphics.stanford.edu/~seander/bithacks.html#IntegerLog +++ static const unsigned int b[] = { +++ 0xAAAAAAAA, 0xCCCCCCCC, 0xF0F0F0F0, 0xFF00FF00, 0xFFFF0000 +++ }; +++ +++ unsigned int res = (val & b[0]) != 0; +++ res |= ((val & b[4]) != 0) << 4; +++ res |= ((val & b[3]) != 0) << 3; +++ res |= ((val & b[2]) != 0) << 2; +++ res |= ((val & b[1]) != 0) << 1; +++ return res; ++ } ++ ++-static inline void ++-encodepolar_single_stage(unsigned char* frame_ptr, const unsigned char* temp_ptr, ++- const unsigned int num_branches, const unsigned int frame_half) +++static inline void encodepolar_single_stage(unsigned char* frame_ptr, +++ const unsigned char* temp_ptr, +++ const unsigned int num_branches, +++ const unsigned int frame_half) ++ { ++- unsigned int branch, bit; ++- for(branch = 0; branch < num_branches; ++branch){ ++- for(bit = 0; bit < frame_half; ++bit){ ++- *frame_ptr = *temp_ptr ^ *(temp_ptr + 1); ++- *(frame_ptr + frame_half) = *(temp_ptr + 1); ++- ++frame_ptr; ++- temp_ptr += 2; +++ unsigned int branch, bit; +++ for (branch = 0; branch < num_branches; ++branch) { +++ for (bit = 0; bit < frame_half; ++bit) { +++ *frame_ptr = *temp_ptr ^ *(temp_ptr + 1); +++ *(frame_ptr + frame_half) = *(temp_ptr + 1); +++ ++frame_ptr; +++ temp_ptr += 2; +++ } +++ frame_ptr += frame_half; ++ } ++- frame_ptr += frame_half; ++- } ++ } ++ ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void ++-volk_8u_x2_encodeframepolar_8u_generic(unsigned char* frame, unsigned char* temp, ++- unsigned int frame_size) +++static inline void volk_8u_x2_encodeframepolar_8u_generic(unsigned char* frame, +++ unsigned char* temp, +++ unsigned int frame_size) ++ { ++- unsigned int stage = log2_of_power_of_2(frame_size); ++- unsigned int frame_half = frame_size >> 1; ++- unsigned int num_branches = 1; ++- ++- while(stage){ ++- // encode stage ++- encodepolar_single_stage(frame, temp, num_branches, frame_half); ++- memcpy(temp, frame, sizeof(unsigned char) * frame_size); ++- ++- // update all the parameters. ++- num_branches = num_branches << 1; ++- frame_half = frame_half >> 1; ++- --stage; ++- } +++ unsigned int stage = log2_of_power_of_2(frame_size); +++ unsigned int frame_half = frame_size >> 1; +++ unsigned int num_branches = 1; +++ +++ while (stage) { +++ // encode stage +++ encodepolar_single_stage(frame, temp, num_branches, frame_half); +++ memcpy(temp, frame, sizeof(unsigned char) * frame_size); +++ +++ // update all the parameters. 
+++ num_branches = num_branches << 1; +++ frame_half = frame_half >> 1; +++ --stage; +++ } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++ #ifdef LV_HAVE_SSSE3 ++ #include ++ ++-static inline void ++-volk_8u_x2_encodeframepolar_8u_u_ssse3(unsigned char* frame, unsigned char* temp, ++- unsigned int frame_size) +++static inline void volk_8u_x2_encodeframepolar_8u_u_ssse3(unsigned char* frame, +++ unsigned char* temp, +++ unsigned int frame_size) ++ { ++- const unsigned int po2 = log2_of_power_of_2(frame_size); ++- ++- unsigned int stage = po2; ++- unsigned char* frame_ptr = frame; ++- unsigned char* temp_ptr = temp; ++- ++- unsigned int frame_half = frame_size >> 1; ++- unsigned int num_branches = 1; ++- unsigned int branch; ++- unsigned int bit; ++- ++- // prepare constants ++- const __m128i mask_stage1 = _mm_set_epi8(0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF); ++- ++- // get some SIMD registers to play with. ++- __m128i r_frame0, r_temp0, shifted; ++- ++- { ++- __m128i r_frame1, r_temp1; ++- const __m128i shuffle_separate = _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15); ++- ++- while(stage > 4){ ++- frame_ptr = frame; ++- temp_ptr = temp; ++- ++- // for stage = 5 a branch has 32 elements. So upper stages are even bigger. ++- for(branch = 0; branch < num_branches; ++branch){ ++- for(bit = 0; bit < frame_half; bit += 16){ ++- r_temp0 = _mm_loadu_si128((__m128i *) temp_ptr); ++- temp_ptr += 16; ++- r_temp1 = _mm_loadu_si128((__m128i *) temp_ptr); ++- temp_ptr += 16; ++- ++- shifted = _mm_srli_si128(r_temp0, 1); ++- shifted = _mm_and_si128(shifted, mask_stage1); ++- r_temp0 = _mm_xor_si128(shifted, r_temp0); ++- r_temp0 = _mm_shuffle_epi8(r_temp0, shuffle_separate); ++- ++- shifted = _mm_srli_si128(r_temp1, 1); ++- shifted = _mm_and_si128(shifted, mask_stage1); ++- r_temp1 = _mm_xor_si128(shifted, r_temp1); ++- r_temp1 = _mm_shuffle_epi8(r_temp1, shuffle_separate); ++- ++- r_frame0 = _mm_unpacklo_epi64(r_temp0, r_temp1); ++- _mm_storeu_si128((__m128i*) frame_ptr, r_frame0); ++- ++- r_frame1 = _mm_unpackhi_epi64(r_temp0, r_temp1); ++- _mm_storeu_si128((__m128i*) (frame_ptr + frame_half), r_frame1); ++- frame_ptr += 16; +++ const unsigned int po2 = log2_of_power_of_2(frame_size); +++ +++ unsigned int stage = po2; +++ unsigned char* frame_ptr = frame; +++ unsigned char* temp_ptr = temp; +++ +++ unsigned int frame_half = frame_size >> 1; +++ unsigned int num_branches = 1; +++ unsigned int branch; +++ unsigned int bit; +++ +++ // prepare constants +++ const __m128i mask_stage1 = _mm_set_epi8(0x0, +++ 0xFF, +++ 0x0, +++ 0xFF, +++ 0x0, +++ 0xFF, +++ 0x0, +++ 0xFF, +++ 0x0, +++ 0xFF, +++ 0x0, +++ 0xFF, +++ 0x0, +++ 0xFF, +++ 0x0, +++ 0xFF); +++ +++ // get some SIMD registers to play with. +++ __m128i r_frame0, r_temp0, shifted; +++ +++ { +++ __m128i r_frame1, r_temp1; +++ const __m128i shuffle_separate = +++ _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15); +++ +++ while (stage > 4) { +++ frame_ptr = frame; +++ temp_ptr = temp; +++ +++ // for stage = 5 a branch has 32 elements. So upper stages are even bigger. 
+++ for (branch = 0; branch < num_branches; ++branch) { +++ for (bit = 0; bit < frame_half; bit += 16) { +++ r_temp0 = _mm_loadu_si128((__m128i*)temp_ptr); +++ temp_ptr += 16; +++ r_temp1 = _mm_loadu_si128((__m128i*)temp_ptr); +++ temp_ptr += 16; +++ +++ shifted = _mm_srli_si128(r_temp0, 1); +++ shifted = _mm_and_si128(shifted, mask_stage1); +++ r_temp0 = _mm_xor_si128(shifted, r_temp0); +++ r_temp0 = _mm_shuffle_epi8(r_temp0, shuffle_separate); +++ +++ shifted = _mm_srli_si128(r_temp1, 1); +++ shifted = _mm_and_si128(shifted, mask_stage1); +++ r_temp1 = _mm_xor_si128(shifted, r_temp1); +++ r_temp1 = _mm_shuffle_epi8(r_temp1, shuffle_separate); +++ +++ r_frame0 = _mm_unpacklo_epi64(r_temp0, r_temp1); +++ _mm_storeu_si128((__m128i*)frame_ptr, r_frame0); +++ +++ r_frame1 = _mm_unpackhi_epi64(r_temp0, r_temp1); +++ _mm_storeu_si128((__m128i*)(frame_ptr + frame_half), r_frame1); +++ frame_ptr += 16; +++ } +++ +++ frame_ptr += frame_half; +++ } +++ memcpy(temp, frame, sizeof(unsigned char) * frame_size); +++ +++ num_branches = num_branches << 1; +++ frame_half = frame_half >> 1; +++ stage--; ++ } ++- ++- frame_ptr += frame_half; ++- } ++- memcpy(temp, frame, sizeof(unsigned char) * frame_size); ++- ++- num_branches = num_branches << 1; ++- frame_half = frame_half >> 1; ++- stage--; ++ } ++- } ++ ++- // This last part requires at least 16-bit frames. ++- // Smaller frames are useless for SIMD optimization anyways. Just choose GENERIC! +++ // This last part requires at least 16-bit frames. +++ // Smaller frames are useless for SIMD optimization anyways. Just choose GENERIC! ++ ++- // reset pointers to correct positions. ++- frame_ptr = frame; ++- temp_ptr = temp; +++ // reset pointers to correct positions. +++ frame_ptr = frame; +++ temp_ptr = temp; ++ ++- // prefetch first chunk ++- __VOLK_PREFETCH(temp_ptr); ++- ++- const __m128i shuffle_stage4 = _mm_setr_epi8(0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15); ++- const __m128i mask_stage4 = _mm_set_epi8(0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF); ++- const __m128i mask_stage3 = _mm_set_epi8(0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF); ++- const __m128i mask_stage2 = _mm_set_epi8(0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF); ++- ++- for(branch = 0; branch < num_branches; ++branch){ ++- r_temp0 = _mm_loadu_si128((__m128i*) temp_ptr); ++- ++- // prefetch next chunk ++- temp_ptr += 16; +++ // prefetch first chunk ++ __VOLK_PREFETCH(temp_ptr); ++ ++- // shuffle once for bit-reversal. ++- r_temp0 = _mm_shuffle_epi8(r_temp0, shuffle_stage4); ++- ++- shifted = _mm_srli_si128(r_temp0, 8); ++- shifted = _mm_and_si128(shifted, mask_stage4); ++- r_frame0 = _mm_xor_si128(shifted, r_temp0); ++- ++- shifted = _mm_srli_si128(r_frame0, 4); ++- shifted = _mm_and_si128(shifted, mask_stage3); ++- r_frame0 = _mm_xor_si128(shifted, r_frame0); ++- ++- shifted = _mm_srli_si128(r_frame0, 2); ++- shifted = _mm_and_si128(shifted, mask_stage2); ++- r_frame0 = _mm_xor_si128(shifted, r_frame0); ++- ++- shifted = _mm_srli_si128(r_frame0, 1); ++- shifted = _mm_and_si128(shifted, mask_stage1); ++- r_frame0 = _mm_xor_si128(shifted, r_frame0); ++- ++- // store result of chunk. 
++- _mm_storeu_si128((__m128i*)frame_ptr, r_frame0); ++- frame_ptr += 16; ++- } +++ const __m128i shuffle_stage4 = +++ _mm_setr_epi8(0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15); +++ const __m128i mask_stage4 = _mm_set_epi8(0x0, +++ 0x0, +++ 0x0, +++ 0x0, +++ 0x0, +++ 0x0, +++ 0x0, +++ 0x0, +++ 0xFF, +++ 0xFF, +++ 0xFF, +++ 0xFF, +++ 0xFF, +++ 0xFF, +++ 0xFF, +++ 0xFF); +++ const __m128i mask_stage3 = _mm_set_epi8(0x0, +++ 0x0, +++ 0x0, +++ 0x0, +++ 0xFF, +++ 0xFF, +++ 0xFF, +++ 0xFF, +++ 0x0, +++ 0x0, +++ 0x0, +++ 0x0, +++ 0xFF, +++ 0xFF, +++ 0xFF, +++ 0xFF); +++ const __m128i mask_stage2 = _mm_set_epi8(0x0, +++ 0x0, +++ 0xFF, +++ 0xFF, +++ 0x0, +++ 0x0, +++ 0xFF, +++ 0xFF, +++ 0x0, +++ 0x0, +++ 0xFF, +++ 0xFF, +++ 0x0, +++ 0x0, +++ 0xFF, +++ 0xFF); +++ +++ for (branch = 0; branch < num_branches; ++branch) { +++ r_temp0 = _mm_loadu_si128((__m128i*)temp_ptr); +++ +++ // prefetch next chunk +++ temp_ptr += 16; +++ __VOLK_PREFETCH(temp_ptr); +++ +++ // shuffle once for bit-reversal. +++ r_temp0 = _mm_shuffle_epi8(r_temp0, shuffle_stage4); +++ +++ shifted = _mm_srli_si128(r_temp0, 8); +++ shifted = _mm_and_si128(shifted, mask_stage4); +++ r_frame0 = _mm_xor_si128(shifted, r_temp0); +++ +++ shifted = _mm_srli_si128(r_frame0, 4); +++ shifted = _mm_and_si128(shifted, mask_stage3); +++ r_frame0 = _mm_xor_si128(shifted, r_frame0); +++ +++ shifted = _mm_srli_si128(r_frame0, 2); +++ shifted = _mm_and_si128(shifted, mask_stage2); +++ r_frame0 = _mm_xor_si128(shifted, r_frame0); +++ +++ shifted = _mm_srli_si128(r_frame0, 1); +++ shifted = _mm_and_si128(shifted, mask_stage1); +++ r_frame0 = _mm_xor_si128(shifted, r_frame0); +++ +++ // store result of chunk. +++ _mm_storeu_si128((__m128i*)frame_ptr, r_frame0); +++ frame_ptr += 16; +++ } ++ } ++ ++ #endif /* LV_HAVE_SSSE3 */ ++@@ -201,154 +265,351 @@ volk_8u_x2_encodeframepolar_8u_u_ssse3(unsigned char* frame, unsigned char* temp ++ #ifdef LV_HAVE_AVX2 ++ #include ++ ++-static inline void ++-volk_8u_x2_encodeframepolar_8u_u_avx2(unsigned char* frame, unsigned char* temp, ++- unsigned int frame_size) +++static inline void volk_8u_x2_encodeframepolar_8u_u_avx2(unsigned char* frame, +++ unsigned char* temp, +++ unsigned int frame_size) ++ { ++- const unsigned int po2 = log2_of_power_of_2(frame_size); ++- ++- unsigned int stage = po2; ++- unsigned char* frame_ptr = frame; ++- unsigned char* temp_ptr = temp; ++- ++- unsigned int frame_half = frame_size >> 1; ++- unsigned int num_branches = 1; ++- unsigned int branch; ++- unsigned int bit; ++- ++- // prepare constants ++- const __m256i mask_stage1 = _mm256_set_epi8(0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, ++- 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF); ++- ++- const __m128i mask_stage0 = _mm_set_epi8(0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF); ++- // get some SIMD registers to play with. ++- __m256i r_frame0, r_temp0, shifted; ++- __m128i r_temp2, r_frame2, shifted2; ++- { ++- __m256i r_frame1, r_temp1; ++- __m128i r_frame3, r_temp3; ++- const __m256i shuffle_separate = _mm256_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15, ++- 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15); ++- const __m128i shuffle_separate128 = _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15); ++- ++- while(stage > 4){ ++- frame_ptr = frame; ++- temp_ptr = temp; ++- ++- // for stage = 5 a branch has 32 elements. So upper stages are even bigger. 
++- for(branch = 0; branch < num_branches; ++branch){ ++- for(bit = 0; bit < frame_half; bit += 32){ ++- if ((frame_half-bit)<32) //if only 16 bits remaining in frame, not 32 ++- { ++- r_temp2 = _mm_loadu_si128((__m128i *) temp_ptr); ++- temp_ptr += 16; ++- r_temp3 = _mm_loadu_si128((__m128i *) temp_ptr); ++- temp_ptr += 16; ++- ++- shifted2 = _mm_srli_si128(r_temp2, 1); ++- shifted2 = _mm_and_si128(shifted2, mask_stage0); ++- r_temp2 = _mm_xor_si128(shifted2, r_temp2); ++- r_temp2 = _mm_shuffle_epi8(r_temp2, shuffle_separate128); ++- ++- shifted2 = _mm_srli_si128(r_temp3, 1); ++- shifted2 = _mm_and_si128(shifted2, mask_stage0); ++- r_temp3 = _mm_xor_si128(shifted2, r_temp3); ++- r_temp3 = _mm_shuffle_epi8(r_temp3, shuffle_separate128); ++- ++- r_frame2 = _mm_unpacklo_epi64(r_temp2, r_temp3); ++- _mm_storeu_si128((__m128i*) frame_ptr, r_frame2); ++- ++- r_frame3 = _mm_unpackhi_epi64(r_temp2, r_temp3); ++- _mm_storeu_si128((__m128i*) (frame_ptr + frame_half), r_frame3); ++- frame_ptr += 16; ++- break; ++- } ++- r_temp0 = _mm256_loadu_si256((__m256i *) temp_ptr); ++- temp_ptr += 32; ++- r_temp1 = _mm256_loadu_si256((__m256i *) temp_ptr); ++- temp_ptr += 32; ++- ++- shifted = _mm256_srli_si256(r_temp0, 1);//operate on 128 bit lanes ++- shifted = _mm256_and_si256(shifted, mask_stage1); ++- r_temp0 = _mm256_xor_si256(shifted, r_temp0); ++- r_temp0 = _mm256_shuffle_epi8(r_temp0, shuffle_separate); ++- ++- shifted = _mm256_srli_si256(r_temp1, 1); ++- shifted = _mm256_and_si256(shifted, mask_stage1); ++- r_temp1 = _mm256_xor_si256(shifted, r_temp1); ++- r_temp1 = _mm256_shuffle_epi8(r_temp1, shuffle_separate); ++- ++- r_frame0 = _mm256_unpacklo_epi64(r_temp0, r_temp1); ++- r_temp1 = _mm256_unpackhi_epi64(r_temp0, r_temp1); ++- r_frame0 = _mm256_permute4x64_epi64(r_frame0, 0xd8); ++- r_frame1 = _mm256_permute4x64_epi64(r_temp1, 0xd8); ++- ++- _mm256_storeu_si256((__m256i*) frame_ptr, r_frame0); ++- ++- _mm256_storeu_si256((__m256i*) (frame_ptr + frame_half), r_frame1); ++- frame_ptr += 32; +++ const unsigned int po2 = log2_of_power_of_2(frame_size); +++ +++ unsigned int stage = po2; +++ unsigned char* frame_ptr = frame; +++ unsigned char* temp_ptr = temp; +++ +++ unsigned int frame_half = frame_size >> 1; +++ unsigned int num_branches = 1; +++ unsigned int branch; +++ unsigned int bit; +++ +++ // prepare constants +++ const __m256i mask_stage1 = _mm256_set_epi8(0x0, +++ 0xFF, +++ 0x0, +++ 0xFF, +++ 0x0, +++ 0xFF, +++ 0x0, +++ 0xFF, +++ 0x0, +++ 0xFF, +++ 0x0, +++ 0xFF, +++ 0x0, +++ 0xFF, +++ 0x0, +++ 0xFF, +++ 0x0, +++ 0xFF, +++ 0x0, +++ 0xFF, +++ 0x0, +++ 0xFF, +++ 0x0, +++ 0xFF, +++ 0x0, +++ 0xFF, +++ 0x0, +++ 0xFF, +++ 0x0, +++ 0xFF, +++ 0x0, +++ 0xFF); +++ +++ const __m128i mask_stage0 = _mm_set_epi8(0x0, +++ 0xFF, +++ 0x0, +++ 0xFF, +++ 0x0, +++ 0xFF, +++ 0x0, +++ 0xFF, +++ 0x0, +++ 0xFF, +++ 0x0, +++ 0xFF, +++ 0x0, +++ 0xFF, +++ 0x0, +++ 0xFF); +++ // get some SIMD registers to play with. 
+++ __m256i r_frame0, r_temp0, shifted; +++ __m128i r_temp2, r_frame2, shifted2; +++ { +++ __m256i r_frame1, r_temp1; +++ __m128i r_frame3, r_temp3; +++ const __m256i shuffle_separate = _mm256_setr_epi8(0, +++ 2, +++ 4, +++ 6, +++ 8, +++ 10, +++ 12, +++ 14, +++ 1, +++ 3, +++ 5, +++ 7, +++ 9, +++ 11, +++ 13, +++ 15, +++ 0, +++ 2, +++ 4, +++ 6, +++ 8, +++ 10, +++ 12, +++ 14, +++ 1, +++ 3, +++ 5, +++ 7, +++ 9, +++ 11, +++ 13, +++ 15); +++ const __m128i shuffle_separate128 = +++ _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15); +++ +++ while (stage > 4) { +++ frame_ptr = frame; +++ temp_ptr = temp; +++ +++ // for stage = 5 a branch has 32 elements. So upper stages are even bigger. +++ for (branch = 0; branch < num_branches; ++branch) { +++ for (bit = 0; bit < frame_half; bit += 32) { +++ if ((frame_half - bit) < +++ 32) // if only 16 bits remaining in frame, not 32 +++ { +++ r_temp2 = _mm_loadu_si128((__m128i*)temp_ptr); +++ temp_ptr += 16; +++ r_temp3 = _mm_loadu_si128((__m128i*)temp_ptr); +++ temp_ptr += 16; +++ +++ shifted2 = _mm_srli_si128(r_temp2, 1); +++ shifted2 = _mm_and_si128(shifted2, mask_stage0); +++ r_temp2 = _mm_xor_si128(shifted2, r_temp2); +++ r_temp2 = _mm_shuffle_epi8(r_temp2, shuffle_separate128); +++ +++ shifted2 = _mm_srli_si128(r_temp3, 1); +++ shifted2 = _mm_and_si128(shifted2, mask_stage0); +++ r_temp3 = _mm_xor_si128(shifted2, r_temp3); +++ r_temp3 = _mm_shuffle_epi8(r_temp3, shuffle_separate128); +++ +++ r_frame2 = _mm_unpacklo_epi64(r_temp2, r_temp3); +++ _mm_storeu_si128((__m128i*)frame_ptr, r_frame2); +++ +++ r_frame3 = _mm_unpackhi_epi64(r_temp2, r_temp3); +++ _mm_storeu_si128((__m128i*)(frame_ptr + frame_half), r_frame3); +++ frame_ptr += 16; +++ break; +++ } +++ r_temp0 = _mm256_loadu_si256((__m256i*)temp_ptr); +++ temp_ptr += 32; +++ r_temp1 = _mm256_loadu_si256((__m256i*)temp_ptr); +++ temp_ptr += 32; +++ +++ shifted = _mm256_srli_si256(r_temp0, 1); // operate on 128 bit lanes +++ shifted = _mm256_and_si256(shifted, mask_stage1); +++ r_temp0 = _mm256_xor_si256(shifted, r_temp0); +++ r_temp0 = _mm256_shuffle_epi8(r_temp0, shuffle_separate); +++ +++ shifted = _mm256_srli_si256(r_temp1, 1); +++ shifted = _mm256_and_si256(shifted, mask_stage1); +++ r_temp1 = _mm256_xor_si256(shifted, r_temp1); +++ r_temp1 = _mm256_shuffle_epi8(r_temp1, shuffle_separate); +++ +++ r_frame0 = _mm256_unpacklo_epi64(r_temp0, r_temp1); +++ r_temp1 = _mm256_unpackhi_epi64(r_temp0, r_temp1); +++ r_frame0 = _mm256_permute4x64_epi64(r_frame0, 0xd8); +++ r_frame1 = _mm256_permute4x64_epi64(r_temp1, 0xd8); +++ +++ _mm256_storeu_si256((__m256i*)frame_ptr, r_frame0); +++ +++ _mm256_storeu_si256((__m256i*)(frame_ptr + frame_half), r_frame1); +++ frame_ptr += 32; +++ } +++ +++ frame_ptr += frame_half; +++ } +++ memcpy(temp, frame, sizeof(unsigned char) * frame_size); +++ +++ num_branches = num_branches << 1; +++ frame_half = frame_half >> 1; +++ stage--; ++ } ++- ++- frame_ptr += frame_half; ++- } ++- memcpy(temp, frame, sizeof(unsigned char) * frame_size); ++- ++- num_branches = num_branches << 1; ++- frame_half = frame_half >> 1; ++- stage--; ++ } ++- } ++- ++- // This last part requires at least 32-bit frames. ++- // Smaller frames are useless for SIMD optimization anyways. Just choose GENERIC! ++- ++- // reset pointers to correct positions. ++- frame_ptr = frame; ++- temp_ptr = temp; ++ ++- // prefetch first chunk ++- __VOLK_PREFETCH(temp_ptr); +++ // This last part requires at least 32-bit frames. +++ // Smaller frames are useless for SIMD optimization anyways. 
Just choose GENERIC! ++ ++- const __m256i shuffle_stage4 = _mm256_setr_epi8(0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15, ++- 0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15); ++- const __m256i mask_stage4 = _mm256_set_epi8(0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, ++- 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF); ++- const __m256i mask_stage3 = _mm256_set_epi8(0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF, ++- 0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF); ++- const __m256i mask_stage2 = _mm256_set_epi8(0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF, ++- 0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF); +++ // reset pointers to correct positions. +++ frame_ptr = frame; +++ temp_ptr = temp; ++ ++- for(branch = 0; branch < num_branches/2; ++branch){ ++- r_temp0 = _mm256_loadu_si256((__m256i*) temp_ptr); ++- ++- // prefetch next chunk ++- temp_ptr += 32; +++ // prefetch first chunk ++ __VOLK_PREFETCH(temp_ptr); ++ ++- // shuffle once for bit-reversal. ++- r_temp0 = _mm256_shuffle_epi8(r_temp0, shuffle_stage4); ++- ++- shifted = _mm256_srli_si256(r_temp0, 8); //128 bit lanes ++- shifted = _mm256_and_si256(shifted, mask_stage4); ++- r_frame0 = _mm256_xor_si256(shifted, r_temp0); ++- ++- ++- shifted = _mm256_srli_si256(r_frame0, 4); ++- shifted = _mm256_and_si256(shifted, mask_stage3); ++- r_frame0 = _mm256_xor_si256(shifted, r_frame0); ++- ++- shifted = _mm256_srli_si256(r_frame0, 2); ++- shifted = _mm256_and_si256(shifted, mask_stage2); ++- r_frame0 = _mm256_xor_si256(shifted, r_frame0); ++- ++- shifted = _mm256_srli_si256(r_frame0, 1); ++- shifted = _mm256_and_si256(shifted, mask_stage1); ++- r_frame0 = _mm256_xor_si256(shifted, r_frame0); ++- ++- // store result of chunk. 
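The AVX2 paths differ from the SSSE3 ones mostly because byte shifts, shuffles and unpacks act independently on each 128-bit lane of a __m256i (hence the "128 bit lanes" comments); the _mm256_permute2x128_si256 calls with immediates 0x20 and 0x31 are what stitch the two lanes back into the right order. A rough scalar model of those two permutes, given only as an illustration, is:

/* Rough model of the two lane-recombining permutes used in the AVX2 paths
 * (illustrative only): a 256-bit register is treated as two 16-byte lanes. */
#include <string.h>

typedef struct { unsigned char lo[16]; unsigned char hi[16]; } v256_model;

/* _mm256_permute2x128_si256(a, b, 0x20): low lane of a, then low lane of b */
static v256_model permute_0x20(v256_model a, v256_model b)
{
    v256_model r;
    memcpy(r.lo, a.lo, 16);
    memcpy(r.hi, b.lo, 16);
    return r;
}

/* _mm256_permute2x128_si256(a, b, 0x31): high lane of a, then high lane of b */
static v256_model permute_0x31(v256_model a, v256_model b)
{
    v256_model r;
    memcpy(r.lo, a.hi, 16);
    memcpy(r.hi, b.hi, 16);
    return r;
}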
++- _mm256_storeu_si256((__m256i*)frame_ptr, r_frame0); ++- frame_ptr += 32; ++- } +++ const __m256i shuffle_stage4 = _mm256_setr_epi8(0, +++ 8, +++ 4, +++ 12, +++ 2, +++ 10, +++ 6, +++ 14, +++ 1, +++ 9, +++ 5, +++ 13, +++ 3, +++ 11, +++ 7, +++ 15, +++ 0, +++ 8, +++ 4, +++ 12, +++ 2, +++ 10, +++ 6, +++ 14, +++ 1, +++ 9, +++ 5, +++ 13, +++ 3, +++ 11, +++ 7, +++ 15); +++ const __m256i mask_stage4 = _mm256_set_epi8(0x0, +++ 0x0, +++ 0x0, +++ 0x0, +++ 0x0, +++ 0x0, +++ 0x0, +++ 0x0, +++ 0xFF, +++ 0xFF, +++ 0xFF, +++ 0xFF, +++ 0xFF, +++ 0xFF, +++ 0xFF, +++ 0xFF, +++ 0x0, +++ 0x0, +++ 0x0, +++ 0x0, +++ 0x0, +++ 0x0, +++ 0x0, +++ 0x0, +++ 0xFF, +++ 0xFF, +++ 0xFF, +++ 0xFF, +++ 0xFF, +++ 0xFF, +++ 0xFF, +++ 0xFF); +++ const __m256i mask_stage3 = _mm256_set_epi8(0x0, +++ 0x0, +++ 0x0, +++ 0x0, +++ 0xFF, +++ 0xFF, +++ 0xFF, +++ 0xFF, +++ 0x0, +++ 0x0, +++ 0x0, +++ 0x0, +++ 0xFF, +++ 0xFF, +++ 0xFF, +++ 0xFF, +++ 0x0, +++ 0x0, +++ 0x0, +++ 0x0, +++ 0xFF, +++ 0xFF, +++ 0xFF, +++ 0xFF, +++ 0x0, +++ 0x0, +++ 0x0, +++ 0x0, +++ 0xFF, +++ 0xFF, +++ 0xFF, +++ 0xFF); +++ const __m256i mask_stage2 = _mm256_set_epi8(0x0, +++ 0x0, +++ 0xFF, +++ 0xFF, +++ 0x0, +++ 0x0, +++ 0xFF, +++ 0xFF, +++ 0x0, +++ 0x0, +++ 0xFF, +++ 0xFF, +++ 0x0, +++ 0x0, +++ 0xFF, +++ 0xFF, +++ 0x0, +++ 0x0, +++ 0xFF, +++ 0xFF, +++ 0x0, +++ 0x0, +++ 0xFF, +++ 0xFF, +++ 0x0, +++ 0x0, +++ 0xFF, +++ 0xFF, +++ 0x0, +++ 0x0, +++ 0xFF, +++ 0xFF); +++ +++ for (branch = 0; branch < num_branches / 2; ++branch) { +++ r_temp0 = _mm256_loadu_si256((__m256i*)temp_ptr); +++ +++ // prefetch next chunk +++ temp_ptr += 32; +++ __VOLK_PREFETCH(temp_ptr); +++ +++ // shuffle once for bit-reversal. +++ r_temp0 = _mm256_shuffle_epi8(r_temp0, shuffle_stage4); +++ +++ shifted = _mm256_srli_si256(r_temp0, 8); // 128 bit lanes +++ shifted = _mm256_and_si256(shifted, mask_stage4); +++ r_frame0 = _mm256_xor_si256(shifted, r_temp0); +++ +++ +++ shifted = _mm256_srli_si256(r_frame0, 4); +++ shifted = _mm256_and_si256(shifted, mask_stage3); +++ r_frame0 = _mm256_xor_si256(shifted, r_frame0); +++ +++ shifted = _mm256_srli_si256(r_frame0, 2); +++ shifted = _mm256_and_si256(shifted, mask_stage2); +++ r_frame0 = _mm256_xor_si256(shifted, r_frame0); +++ +++ shifted = _mm256_srli_si256(r_frame0, 1); +++ shifted = _mm256_and_si256(shifted, mask_stage1); +++ r_frame0 = _mm256_xor_si256(shifted, r_frame0); +++ +++ // store result of chunk. +++ _mm256_storeu_si256((__m256i*)frame_ptr, r_frame0); +++ frame_ptr += 32; +++ } ++ } ++ #endif /* LV_HAVE_AVX2 */ ++ ++@@ -360,272 +621,530 @@ volk_8u_x2_encodeframepolar_8u_u_avx2(unsigned char* frame, unsigned char* temp, ++ #ifdef LV_HAVE_SSSE3 ++ #include ++ ++-static inline void ++-volk_8u_x2_encodeframepolar_8u_a_ssse3(unsigned char* frame, unsigned char* temp, ++- unsigned int frame_size) +++static inline void volk_8u_x2_encodeframepolar_8u_a_ssse3(unsigned char* frame, +++ unsigned char* temp, +++ unsigned int frame_size) ++ { ++- const unsigned int po2 = log2_of_power_of_2(frame_size); ++- ++- unsigned int stage = po2; ++- unsigned char* frame_ptr = frame; ++- unsigned char* temp_ptr = temp; ++- ++- unsigned int frame_half = frame_size >> 1; ++- unsigned int num_branches = 1; ++- unsigned int branch; ++- unsigned int bit; ++- ++- // prepare constants ++- const __m128i mask_stage1 = _mm_set_epi8(0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF); ++- ++- // get some SIMD registers to play with. 
++- __m128i r_frame0, r_temp0, shifted; ++- ++- { ++- __m128i r_frame1, r_temp1; ++- const __m128i shuffle_separate = _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15); ++- ++- while(stage > 4){ ++- frame_ptr = frame; ++- temp_ptr = temp; ++- ++- // for stage = 5 a branch has 32 elements. So upper stages are even bigger. ++- for(branch = 0; branch < num_branches; ++branch){ ++- for(bit = 0; bit < frame_half; bit += 16){ ++- r_temp0 = _mm_load_si128((__m128i *) temp_ptr); ++- temp_ptr += 16; ++- r_temp1 = _mm_load_si128((__m128i *) temp_ptr); ++- temp_ptr += 16; ++- ++- shifted = _mm_srli_si128(r_temp0, 1); ++- shifted = _mm_and_si128(shifted, mask_stage1); ++- r_temp0 = _mm_xor_si128(shifted, r_temp0); ++- r_temp0 = _mm_shuffle_epi8(r_temp0, shuffle_separate); ++- ++- shifted = _mm_srli_si128(r_temp1, 1); ++- shifted = _mm_and_si128(shifted, mask_stage1); ++- r_temp1 = _mm_xor_si128(shifted, r_temp1); ++- r_temp1 = _mm_shuffle_epi8(r_temp1, shuffle_separate); ++- ++- r_frame0 = _mm_unpacklo_epi64(r_temp0, r_temp1); ++- _mm_store_si128((__m128i*) frame_ptr, r_frame0); ++- ++- r_frame1 = _mm_unpackhi_epi64(r_temp0, r_temp1); ++- _mm_store_si128((__m128i*) (frame_ptr + frame_half), r_frame1); ++- frame_ptr += 16; +++ const unsigned int po2 = log2_of_power_of_2(frame_size); +++ +++ unsigned int stage = po2; +++ unsigned char* frame_ptr = frame; +++ unsigned char* temp_ptr = temp; +++ +++ unsigned int frame_half = frame_size >> 1; +++ unsigned int num_branches = 1; +++ unsigned int branch; +++ unsigned int bit; +++ +++ // prepare constants +++ const __m128i mask_stage1 = _mm_set_epi8(0x0, +++ 0xFF, +++ 0x0, +++ 0xFF, +++ 0x0, +++ 0xFF, +++ 0x0, +++ 0xFF, +++ 0x0, +++ 0xFF, +++ 0x0, +++ 0xFF, +++ 0x0, +++ 0xFF, +++ 0x0, +++ 0xFF); +++ +++ // get some SIMD registers to play with. +++ __m128i r_frame0, r_temp0, shifted; +++ +++ { +++ __m128i r_frame1, r_temp1; +++ const __m128i shuffle_separate = +++ _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15); +++ +++ while (stage > 4) { +++ frame_ptr = frame; +++ temp_ptr = temp; +++ +++ // for stage = 5 a branch has 32 elements. So upper stages are even bigger. +++ for (branch = 0; branch < num_branches; ++branch) { +++ for (bit = 0; bit < frame_half; bit += 16) { +++ r_temp0 = _mm_load_si128((__m128i*)temp_ptr); +++ temp_ptr += 16; +++ r_temp1 = _mm_load_si128((__m128i*)temp_ptr); +++ temp_ptr += 16; +++ +++ shifted = _mm_srli_si128(r_temp0, 1); +++ shifted = _mm_and_si128(shifted, mask_stage1); +++ r_temp0 = _mm_xor_si128(shifted, r_temp0); +++ r_temp0 = _mm_shuffle_epi8(r_temp0, shuffle_separate); +++ +++ shifted = _mm_srli_si128(r_temp1, 1); +++ shifted = _mm_and_si128(shifted, mask_stage1); +++ r_temp1 = _mm_xor_si128(shifted, r_temp1); +++ r_temp1 = _mm_shuffle_epi8(r_temp1, shuffle_separate); +++ +++ r_frame0 = _mm_unpacklo_epi64(r_temp0, r_temp1); +++ _mm_store_si128((__m128i*)frame_ptr, r_frame0); +++ +++ r_frame1 = _mm_unpackhi_epi64(r_temp0, r_temp1); +++ _mm_store_si128((__m128i*)(frame_ptr + frame_half), r_frame1); +++ frame_ptr += 16; +++ } +++ +++ frame_ptr += frame_half; +++ } +++ memcpy(temp, frame, sizeof(unsigned char) * frame_size); +++ +++ num_branches = num_branches << 1; +++ frame_half = frame_half >> 1; +++ stage--; ++ } ++- ++- frame_ptr += frame_half; ++- } ++- memcpy(temp, frame, sizeof(unsigned char) * frame_size); ++- ++- num_branches = num_branches << 1; ++- frame_half = frame_half >> 1; ++- stage--; ++ } ++- } ++- ++- // This last part requires at least 16-bit frames. 
++- // Smaller frames are useless for SIMD optimization anyways. Just choose GENERIC! ++- ++- // reset pointers to correct positions. ++- frame_ptr = frame; ++- temp_ptr = temp; ++ ++- // prefetch first chunk ++- __VOLK_PREFETCH(temp_ptr); +++ // This last part requires at least 16-bit frames. +++ // Smaller frames are useless for SIMD optimization anyways. Just choose GENERIC! ++ ++- const __m128i shuffle_stage4 = _mm_setr_epi8(0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15); ++- const __m128i mask_stage4 = _mm_set_epi8(0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF); ++- const __m128i mask_stage3 = _mm_set_epi8(0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF); ++- const __m128i mask_stage2 = _mm_set_epi8(0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF); +++ // reset pointers to correct positions. +++ frame_ptr = frame; +++ temp_ptr = temp; ++ ++- for(branch = 0; branch < num_branches; ++branch){ ++- r_temp0 = _mm_load_si128((__m128i*) temp_ptr); ++- ++- // prefetch next chunk ++- temp_ptr += 16; +++ // prefetch first chunk ++ __VOLK_PREFETCH(temp_ptr); ++ ++- // shuffle once for bit-reversal. ++- r_temp0 = _mm_shuffle_epi8(r_temp0, shuffle_stage4); ++- ++- shifted = _mm_srli_si128(r_temp0, 8); ++- shifted = _mm_and_si128(shifted, mask_stage4); ++- r_frame0 = _mm_xor_si128(shifted, r_temp0); ++- ++- shifted = _mm_srli_si128(r_frame0, 4); ++- shifted = _mm_and_si128(shifted, mask_stage3); ++- r_frame0 = _mm_xor_si128(shifted, r_frame0); ++- ++- shifted = _mm_srli_si128(r_frame0, 2); ++- shifted = _mm_and_si128(shifted, mask_stage2); ++- r_frame0 = _mm_xor_si128(shifted, r_frame0); ++- ++- shifted = _mm_srli_si128(r_frame0, 1); ++- shifted = _mm_and_si128(shifted, mask_stage1); ++- r_frame0 = _mm_xor_si128(shifted, r_frame0); ++- ++- // store result of chunk. ++- _mm_store_si128((__m128i*)frame_ptr, r_frame0); ++- frame_ptr += 16; ++- } +++ const __m128i shuffle_stage4 = +++ _mm_setr_epi8(0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15); +++ const __m128i mask_stage4 = _mm_set_epi8(0x0, +++ 0x0, +++ 0x0, +++ 0x0, +++ 0x0, +++ 0x0, +++ 0x0, +++ 0x0, +++ 0xFF, +++ 0xFF, +++ 0xFF, +++ 0xFF, +++ 0xFF, +++ 0xFF, +++ 0xFF, +++ 0xFF); +++ const __m128i mask_stage3 = _mm_set_epi8(0x0, +++ 0x0, +++ 0x0, +++ 0x0, +++ 0xFF, +++ 0xFF, +++ 0xFF, +++ 0xFF, +++ 0x0, +++ 0x0, +++ 0x0, +++ 0x0, +++ 0xFF, +++ 0xFF, +++ 0xFF, +++ 0xFF); +++ const __m128i mask_stage2 = _mm_set_epi8(0x0, +++ 0x0, +++ 0xFF, +++ 0xFF, +++ 0x0, +++ 0x0, +++ 0xFF, +++ 0xFF, +++ 0x0, +++ 0x0, +++ 0xFF, +++ 0xFF, +++ 0x0, +++ 0x0, +++ 0xFF, +++ 0xFF); +++ +++ for (branch = 0; branch < num_branches; ++branch) { +++ r_temp0 = _mm_load_si128((__m128i*)temp_ptr); +++ +++ // prefetch next chunk +++ temp_ptr += 16; +++ __VOLK_PREFETCH(temp_ptr); +++ +++ // shuffle once for bit-reversal. 
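The tail that follows handles the last four stages within each 16-byte chunk: a single byte shuffle applies the bit-reversal permutation, after which every remaining stage reduces to XOR-ing the upper half of each 2d-byte block into its lower half. A scalar sketch of that tail, assuming the same shuffle table as the kernels, could be:

/* Scalar sketch of the final four stages on one 16-byte chunk (illustrative). */
static void encode_tail16_scalar(unsigned char* chunk /* 16 bytes, in place */)
{
    static const unsigned char bitrev[16] = { 0, 8, 4, 12, 2, 10, 6, 14,
                                              1, 9, 5, 13, 3, 11, 7, 15 };
    unsigned char x[16];
    for (int i = 0; i < 16; ++i)
        x[i] = chunk[bitrev[i]];          /* shuffle once for bit-reversal */

    for (int d = 8; d >= 1; d >>= 1)      /* stage widths 8, 4, 2, 1       */
        for (int i = 0; i < 16; ++i)
            if ((i & d) == 0)             /* lower half of each 2d block   */
                x[i] ^= x[i + d];

    for (int i = 0; i < 16; ++i)
        chunk[i] = x[i];
}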
+++ r_temp0 = _mm_shuffle_epi8(r_temp0, shuffle_stage4); +++ +++ shifted = _mm_srli_si128(r_temp0, 8); +++ shifted = _mm_and_si128(shifted, mask_stage4); +++ r_frame0 = _mm_xor_si128(shifted, r_temp0); +++ +++ shifted = _mm_srli_si128(r_frame0, 4); +++ shifted = _mm_and_si128(shifted, mask_stage3); +++ r_frame0 = _mm_xor_si128(shifted, r_frame0); +++ +++ shifted = _mm_srli_si128(r_frame0, 2); +++ shifted = _mm_and_si128(shifted, mask_stage2); +++ r_frame0 = _mm_xor_si128(shifted, r_frame0); +++ +++ shifted = _mm_srli_si128(r_frame0, 1); +++ shifted = _mm_and_si128(shifted, mask_stage1); +++ r_frame0 = _mm_xor_si128(shifted, r_frame0); +++ +++ // store result of chunk. +++ _mm_store_si128((__m128i*)frame_ptr, r_frame0); +++ frame_ptr += 16; +++ } ++ } ++ #endif /* LV_HAVE_SSSE3 */ ++ ++ #ifdef LV_HAVE_AVX2 ++ #include ++ ++-static inline void ++-volk_8u_x2_encodeframepolar_8u_a_avx2(unsigned char* frame, unsigned char* temp, ++- unsigned int frame_size) +++static inline void volk_8u_x2_encodeframepolar_8u_a_avx2(unsigned char* frame, +++ unsigned char* temp, +++ unsigned int frame_size) ++ { ++- const unsigned int po2 = log2_of_power_of_2(frame_size); ++- ++- unsigned int stage = po2; ++- unsigned char* frame_ptr = frame; ++- unsigned char* temp_ptr = temp; ++- ++- unsigned int frame_half = frame_size >> 1; ++- unsigned int num_branches = 1; ++- unsigned int branch; ++- unsigned int bit; ++- ++- // prepare constants ++- const __m256i mask_stage1 = _mm256_set_epi8(0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, ++- 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF); ++- ++- const __m128i mask_stage0 = _mm_set_epi8(0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF); ++- // get some SIMD registers to play with. ++- __m256i r_frame0, r_temp0, shifted; ++- __m128i r_temp2, r_frame2, shifted2; ++- { ++- __m256i r_frame1, r_temp1; ++- __m128i r_frame3, r_temp3; ++- const __m256i shuffle_separate = _mm256_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15, ++- 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15); ++- const __m128i shuffle_separate128 = _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15); ++- ++- while(stage > 4){ ++- frame_ptr = frame; ++- temp_ptr = temp; ++- ++- // for stage = 5 a branch has 32 elements. So upper stages are even bigger. 
++- for(branch = 0; branch < num_branches; ++branch){ ++- for(bit = 0; bit < frame_half; bit += 32){ ++- if ((frame_half-bit)<32) //if only 16 bits remaining in frame, not 32 ++- { ++- r_temp2 = _mm_load_si128((__m128i *) temp_ptr); ++- temp_ptr += 16; ++- r_temp3 = _mm_load_si128((__m128i *) temp_ptr); ++- temp_ptr += 16; ++- ++- shifted2 = _mm_srli_si128(r_temp2, 1); ++- shifted2 = _mm_and_si128(shifted2, mask_stage0); ++- r_temp2 = _mm_xor_si128(shifted2, r_temp2); ++- r_temp2 = _mm_shuffle_epi8(r_temp2, shuffle_separate128); ++- ++- shifted2 = _mm_srli_si128(r_temp3, 1); ++- shifted2 = _mm_and_si128(shifted2, mask_stage0); ++- r_temp3 = _mm_xor_si128(shifted2, r_temp3); ++- r_temp3 = _mm_shuffle_epi8(r_temp3, shuffle_separate128); ++- ++- r_frame2 = _mm_unpacklo_epi64(r_temp2, r_temp3); ++- _mm_store_si128((__m128i*) frame_ptr, r_frame2); ++- ++- r_frame3 = _mm_unpackhi_epi64(r_temp2, r_temp3); ++- _mm_store_si128((__m128i*) (frame_ptr + frame_half), r_frame3); ++- frame_ptr += 16; ++- break; ++- } ++- r_temp0 = _mm256_load_si256((__m256i *) temp_ptr); ++- temp_ptr += 32; ++- r_temp1 = _mm256_load_si256((__m256i *) temp_ptr); ++- temp_ptr += 32; ++- ++- shifted = _mm256_srli_si256(r_temp0, 1);//operate on 128 bit lanes ++- shifted = _mm256_and_si256(shifted, mask_stage1); ++- r_temp0 = _mm256_xor_si256(shifted, r_temp0); ++- r_temp0 = _mm256_shuffle_epi8(r_temp0, shuffle_separate); ++- ++- shifted = _mm256_srli_si256(r_temp1, 1); ++- shifted = _mm256_and_si256(shifted, mask_stage1); ++- r_temp1 = _mm256_xor_si256(shifted, r_temp1); ++- r_temp1 = _mm256_shuffle_epi8(r_temp1, shuffle_separate); ++- ++- r_frame0 = _mm256_unpacklo_epi64(r_temp0, r_temp1); ++- r_temp1 = _mm256_unpackhi_epi64(r_temp0, r_temp1); ++- r_frame0 = _mm256_permute4x64_epi64(r_frame0, 0xd8); ++- r_frame1 = _mm256_permute4x64_epi64(r_temp1, 0xd8); ++- ++- _mm256_store_si256((__m256i*) frame_ptr, r_frame0); ++- ++- _mm256_store_si256((__m256i*) (frame_ptr + frame_half), r_frame1); ++- frame_ptr += 32; +++ const unsigned int po2 = log2_of_power_of_2(frame_size); +++ +++ unsigned int stage = po2; +++ unsigned char* frame_ptr = frame; +++ unsigned char* temp_ptr = temp; +++ +++ unsigned int frame_half = frame_size >> 1; +++ unsigned int num_branches = 1; +++ unsigned int branch; +++ unsigned int bit; +++ +++ // prepare constants +++ const __m256i mask_stage1 = _mm256_set_epi8(0x0, +++ 0xFF, +++ 0x0, +++ 0xFF, +++ 0x0, +++ 0xFF, +++ 0x0, +++ 0xFF, +++ 0x0, +++ 0xFF, +++ 0x0, +++ 0xFF, +++ 0x0, +++ 0xFF, +++ 0x0, +++ 0xFF, +++ 0x0, +++ 0xFF, +++ 0x0, +++ 0xFF, +++ 0x0, +++ 0xFF, +++ 0x0, +++ 0xFF, +++ 0x0, +++ 0xFF, +++ 0x0, +++ 0xFF, +++ 0x0, +++ 0xFF, +++ 0x0, +++ 0xFF); +++ +++ const __m128i mask_stage0 = _mm_set_epi8(0x0, +++ 0xFF, +++ 0x0, +++ 0xFF, +++ 0x0, +++ 0xFF, +++ 0x0, +++ 0xFF, +++ 0x0, +++ 0xFF, +++ 0x0, +++ 0xFF, +++ 0x0, +++ 0xFF, +++ 0x0, +++ 0xFF); +++ // get some SIMD registers to play with. 
+++ __m256i r_frame0, r_temp0, shifted; +++ __m128i r_temp2, r_frame2, shifted2; +++ { +++ __m256i r_frame1, r_temp1; +++ __m128i r_frame3, r_temp3; +++ const __m256i shuffle_separate = _mm256_setr_epi8(0, +++ 2, +++ 4, +++ 6, +++ 8, +++ 10, +++ 12, +++ 14, +++ 1, +++ 3, +++ 5, +++ 7, +++ 9, +++ 11, +++ 13, +++ 15, +++ 0, +++ 2, +++ 4, +++ 6, +++ 8, +++ 10, +++ 12, +++ 14, +++ 1, +++ 3, +++ 5, +++ 7, +++ 9, +++ 11, +++ 13, +++ 15); +++ const __m128i shuffle_separate128 = +++ _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15); +++ +++ while (stage > 4) { +++ frame_ptr = frame; +++ temp_ptr = temp; +++ +++ // for stage = 5 a branch has 32 elements. So upper stages are even bigger. +++ for (branch = 0; branch < num_branches; ++branch) { +++ for (bit = 0; bit < frame_half; bit += 32) { +++ if ((frame_half - bit) < +++ 32) // if only 16 bits remaining in frame, not 32 +++ { +++ r_temp2 = _mm_load_si128((__m128i*)temp_ptr); +++ temp_ptr += 16; +++ r_temp3 = _mm_load_si128((__m128i*)temp_ptr); +++ temp_ptr += 16; +++ +++ shifted2 = _mm_srli_si128(r_temp2, 1); +++ shifted2 = _mm_and_si128(shifted2, mask_stage0); +++ r_temp2 = _mm_xor_si128(shifted2, r_temp2); +++ r_temp2 = _mm_shuffle_epi8(r_temp2, shuffle_separate128); +++ +++ shifted2 = _mm_srli_si128(r_temp3, 1); +++ shifted2 = _mm_and_si128(shifted2, mask_stage0); +++ r_temp3 = _mm_xor_si128(shifted2, r_temp3); +++ r_temp3 = _mm_shuffle_epi8(r_temp3, shuffle_separate128); +++ +++ r_frame2 = _mm_unpacklo_epi64(r_temp2, r_temp3); +++ _mm_store_si128((__m128i*)frame_ptr, r_frame2); +++ +++ r_frame3 = _mm_unpackhi_epi64(r_temp2, r_temp3); +++ _mm_store_si128((__m128i*)(frame_ptr + frame_half), r_frame3); +++ frame_ptr += 16; +++ break; +++ } +++ r_temp0 = _mm256_load_si256((__m256i*)temp_ptr); +++ temp_ptr += 32; +++ r_temp1 = _mm256_load_si256((__m256i*)temp_ptr); +++ temp_ptr += 32; +++ +++ shifted = _mm256_srli_si256(r_temp0, 1); // operate on 128 bit lanes +++ shifted = _mm256_and_si256(shifted, mask_stage1); +++ r_temp0 = _mm256_xor_si256(shifted, r_temp0); +++ r_temp0 = _mm256_shuffle_epi8(r_temp0, shuffle_separate); +++ +++ shifted = _mm256_srli_si256(r_temp1, 1); +++ shifted = _mm256_and_si256(shifted, mask_stage1); +++ r_temp1 = _mm256_xor_si256(shifted, r_temp1); +++ r_temp1 = _mm256_shuffle_epi8(r_temp1, shuffle_separate); +++ +++ r_frame0 = _mm256_unpacklo_epi64(r_temp0, r_temp1); +++ r_temp1 = _mm256_unpackhi_epi64(r_temp0, r_temp1); +++ r_frame0 = _mm256_permute4x64_epi64(r_frame0, 0xd8); +++ r_frame1 = _mm256_permute4x64_epi64(r_temp1, 0xd8); +++ +++ _mm256_store_si256((__m256i*)frame_ptr, r_frame0); +++ +++ _mm256_store_si256((__m256i*)(frame_ptr + frame_half), r_frame1); +++ frame_ptr += 32; +++ } +++ +++ frame_ptr += frame_half; +++ } +++ memcpy(temp, frame, sizeof(unsigned char) * frame_size); +++ +++ num_branches = num_branches << 1; +++ frame_half = frame_half >> 1; +++ stage--; ++ } ++- ++- frame_ptr += frame_half; ++- } ++- memcpy(temp, frame, sizeof(unsigned char) * frame_size); ++- ++- num_branches = num_branches << 1; ++- frame_half = frame_half >> 1; ++- stage--; ++ } ++- } ++- ++- // This last part requires at least 32-bit frames. ++- // Smaller frames are useless for SIMD optimization anyways. Just choose GENERIC! ++ ++- // reset pointers to correct positions. ++- frame_ptr = frame; ++- temp_ptr = temp; +++ // This last part requires at least 32-bit frames. +++ // Smaller frames are useless for SIMD optimization anyways. Just choose GENERIC! ++ ++- // prefetch first chunk. 
++- __VOLK_PREFETCH(temp_ptr); +++ // reset pointers to correct positions. +++ frame_ptr = frame; +++ temp_ptr = temp; ++ ++- const __m256i shuffle_stage4 = _mm256_setr_epi8(0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15, ++- 0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15); ++- const __m256i mask_stage4 = _mm256_set_epi8(0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, ++- 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF); ++- const __m256i mask_stage3 = _mm256_set_epi8(0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF, ++- 0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF); ++- const __m256i mask_stage2 = _mm256_set_epi8(0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF, ++- 0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF); ++- ++- for(branch = 0; branch < num_branches/2; ++branch){ ++- r_temp0 = _mm256_load_si256((__m256i*) temp_ptr); ++- ++- // prefetch next chunk ++- temp_ptr += 32; +++ // prefetch first chunk. ++ __VOLK_PREFETCH(temp_ptr); ++ ++- // shuffle once for bit-reversal. ++- r_temp0 = _mm256_shuffle_epi8(r_temp0, shuffle_stage4); ++- ++- shifted = _mm256_srli_si256(r_temp0, 8); //128 bit lanes ++- shifted = _mm256_and_si256(shifted, mask_stage4); ++- r_frame0 = _mm256_xor_si256(shifted, r_temp0); ++- ++- shifted = _mm256_srli_si256(r_frame0, 4); ++- shifted = _mm256_and_si256(shifted, mask_stage3); ++- r_frame0 = _mm256_xor_si256(shifted, r_frame0); ++- ++- shifted = _mm256_srli_si256(r_frame0, 2); ++- shifted = _mm256_and_si256(shifted, mask_stage2); ++- r_frame0 = _mm256_xor_si256(shifted, r_frame0); ++- ++- shifted = _mm256_srli_si256(r_frame0, 1); ++- shifted = _mm256_and_si256(shifted, mask_stage1); ++- r_frame0 = _mm256_xor_si256(shifted, r_frame0); ++- ++- // store result of chunk. 
++- _mm256_store_si256((__m256i*)frame_ptr, r_frame0); ++- frame_ptr += 32; ++- } +++ const __m256i shuffle_stage4 = _mm256_setr_epi8(0, +++ 8, +++ 4, +++ 12, +++ 2, +++ 10, +++ 6, +++ 14, +++ 1, +++ 9, +++ 5, +++ 13, +++ 3, +++ 11, +++ 7, +++ 15, +++ 0, +++ 8, +++ 4, +++ 12, +++ 2, +++ 10, +++ 6, +++ 14, +++ 1, +++ 9, +++ 5, +++ 13, +++ 3, +++ 11, +++ 7, +++ 15); +++ const __m256i mask_stage4 = _mm256_set_epi8(0x0, +++ 0x0, +++ 0x0, +++ 0x0, +++ 0x0, +++ 0x0, +++ 0x0, +++ 0x0, +++ 0xFF, +++ 0xFF, +++ 0xFF, +++ 0xFF, +++ 0xFF, +++ 0xFF, +++ 0xFF, +++ 0xFF, +++ 0x0, +++ 0x0, +++ 0x0, +++ 0x0, +++ 0x0, +++ 0x0, +++ 0x0, +++ 0x0, +++ 0xFF, +++ 0xFF, +++ 0xFF, +++ 0xFF, +++ 0xFF, +++ 0xFF, +++ 0xFF, +++ 0xFF); +++ const __m256i mask_stage3 = _mm256_set_epi8(0x0, +++ 0x0, +++ 0x0, +++ 0x0, +++ 0xFF, +++ 0xFF, +++ 0xFF, +++ 0xFF, +++ 0x0, +++ 0x0, +++ 0x0, +++ 0x0, +++ 0xFF, +++ 0xFF, +++ 0xFF, +++ 0xFF, +++ 0x0, +++ 0x0, +++ 0x0, +++ 0x0, +++ 0xFF, +++ 0xFF, +++ 0xFF, +++ 0xFF, +++ 0x0, +++ 0x0, +++ 0x0, +++ 0x0, +++ 0xFF, +++ 0xFF, +++ 0xFF, +++ 0xFF); +++ const __m256i mask_stage2 = _mm256_set_epi8(0x0, +++ 0x0, +++ 0xFF, +++ 0xFF, +++ 0x0, +++ 0x0, +++ 0xFF, +++ 0xFF, +++ 0x0, +++ 0x0, +++ 0xFF, +++ 0xFF, +++ 0x0, +++ 0x0, +++ 0xFF, +++ 0xFF, +++ 0x0, +++ 0x0, +++ 0xFF, +++ 0xFF, +++ 0x0, +++ 0x0, +++ 0xFF, +++ 0xFF, +++ 0x0, +++ 0x0, +++ 0xFF, +++ 0xFF, +++ 0x0, +++ 0x0, +++ 0xFF, +++ 0xFF); +++ +++ for (branch = 0; branch < num_branches / 2; ++branch) { +++ r_temp0 = _mm256_load_si256((__m256i*)temp_ptr); +++ +++ // prefetch next chunk +++ temp_ptr += 32; +++ __VOLK_PREFETCH(temp_ptr); +++ +++ // shuffle once for bit-reversal. +++ r_temp0 = _mm256_shuffle_epi8(r_temp0, shuffle_stage4); +++ +++ shifted = _mm256_srli_si256(r_temp0, 8); // 128 bit lanes +++ shifted = _mm256_and_si256(shifted, mask_stage4); +++ r_frame0 = _mm256_xor_si256(shifted, r_temp0); +++ +++ shifted = _mm256_srli_si256(r_frame0, 4); +++ shifted = _mm256_and_si256(shifted, mask_stage3); +++ r_frame0 = _mm256_xor_si256(shifted, r_frame0); +++ +++ shifted = _mm256_srli_si256(r_frame0, 2); +++ shifted = _mm256_and_si256(shifted, mask_stage2); +++ r_frame0 = _mm256_xor_si256(shifted, r_frame0); +++ +++ shifted = _mm256_srli_si256(r_frame0, 1); +++ shifted = _mm256_and_si256(shifted, mask_stage1); +++ r_frame0 = _mm256_xor_si256(shifted, r_frame0); +++ +++ // store result of chunk. 
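As a usage note, the _a_ variants in this hunk assume buffers aligned to volk_get_alignment(), while callers normally go through the dispatcher and VOLK's allocator; a rough sketch, with the frame size and the fill step left as placeholders, might look like:

#include <volk/volk.h>

/* Sketch: encode one frame through the dispatcher, which picks the best
 * (aligned or unaligned) implementation available on the running machine. */
static void encode_one_frame(unsigned int frame_size /* power of two */)
{
    unsigned char* frame =
        (unsigned char*)volk_malloc(frame_size, volk_get_alignment());
    unsigned char* temp =
        (unsigned char*)volk_malloc(frame_size, volk_get_alignment());

    /* ... fill temp with the interleaved frozen/info bits ... */

    volk_8u_x2_encodeframepolar_8u(frame, temp, frame_size);

    volk_free(temp);
    volk_free(frame);
}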
+++ _mm256_store_si256((__m256i*)frame_ptr, r_frame0); +++ frame_ptr += 32; +++ } ++ } ++ #endif /* LV_HAVE_AVX2 */ ++ ++ ++- ++ #endif /* VOLK_KERNELS_VOLK_VOLK_8U_X2_ENCODEFRAMEPOLAR_8U_A_H_ */ ++diff --git a/kernels/volk/volk_8u_x3_encodepolar_8u_x2.h b/kernels/volk/volk_8u_x3_encodepolar_8u_x2.h ++index 5bccd95..413836e 100644 ++--- a/kernels/volk/volk_8u_x3_encodepolar_8u_x2.h +++++ b/kernels/volk/volk_8u_x3_encodepolar_8u_x2.h ++@@ -29,9 +29,9 @@ ++ * ++ * Dispatcher Prototype ++ * \code ++- * void volk_8u_x3_encodepolar_8u(unsigned char* frame, const unsigned char* frozen_bit_mask, const unsigned char* frozen_bits, ++- * const unsigned char* info_bits, unsigned int frame_size, unsigned int info_bit_size) ++- * \endcode +++ * void volk_8u_x3_encodepolar_8u(unsigned char* frame, const unsigned char* +++ * frozen_bit_mask, const unsigned char* frozen_bits, const unsigned char* info_bits, +++ * unsigned int frame_size, unsigned int info_bit_size) \endcode ++ * ++ * \b Inputs ++ * \li frame: buffer for encoded frame ++@@ -55,14 +55,17 @@ ++ * unsigned char* frozen_bit_mask = get_frozen_bit_mask(frame_size, num_frozen_bits); ++ * ++ * // set elements to desired values. Typically all zero. ++- * unsigned char* frozen_bits = (unsigned char) volk_malloc(sizeof(unsigned char) * num_frozen_bits, volk_get_alignment()); +++ * unsigned char* frozen_bits = (unsigned char) volk_malloc(sizeof(unsigned char) * +++ * num_frozen_bits, volk_get_alignment()); ++ * ++- * unsigned char* frame = (unsigned char) volk_malloc(sizeof(unsigned char) * frame_size, volk_get_alignment()); ++- * unsigned char* temp = (unsigned char) volk_malloc(sizeof(unsigned char) * frame_size, volk_get_alignment()); +++ * unsigned char* frame = (unsigned char) volk_malloc(sizeof(unsigned char) * frame_size, +++ * volk_get_alignment()); unsigned char* temp = (unsigned char) +++ * volk_malloc(sizeof(unsigned char) * frame_size, volk_get_alignment()); ++ * ++ * unsigned char* info_bits = get_info_bits_to_encode(num_info_bits); ++ * ++- * volk_8u_x3_encodepolar_8u_x2_generic(frame, temp, frozen_bit_mask, frozen_bits, info_bits, frame_size); +++ * volk_8u_x3_encodepolar_8u_x2_generic(frame, temp, frozen_bit_mask, frozen_bits, +++ * info_bits, frame_size); ++ * ++ * volk_free(frozen_bit_mask); ++ * volk_free(frozen_bits); ++@@ -77,27 +80,32 @@ ++ #include ++ #include ++ ++-static inline void ++-interleave_frozen_and_info_bits(unsigned char* target, const unsigned char* frozen_bit_mask, ++- const unsigned char* frozen_bits, const unsigned char* info_bits, ++- const unsigned int frame_size) +++static inline void interleave_frozen_and_info_bits(unsigned char* target, +++ const unsigned char* frozen_bit_mask, +++ const unsigned char* frozen_bits, +++ const unsigned char* info_bits, +++ const unsigned int frame_size) ++ { ++- unsigned int bit; ++- for(bit = 0; bit < frame_size; ++bit){ ++- *target++ = *frozen_bit_mask++ ? *frozen_bits++ : *info_bits++; ++- } +++ unsigned int bit; +++ for (bit = 0; bit < frame_size; ++bit) { +++ *target++ = *frozen_bit_mask++ ? 
*frozen_bits++ : *info_bits++; +++ } ++ } ++ ++ #ifdef LV_HAVE_GENERIC ++ ++ static inline void ++-volk_8u_x3_encodepolar_8u_x2_generic(unsigned char* frame, unsigned char* temp, const unsigned char* frozen_bit_mask, ++- const unsigned char* frozen_bits, const unsigned char* info_bits, +++volk_8u_x3_encodepolar_8u_x2_generic(unsigned char* frame, +++ unsigned char* temp, +++ const unsigned char* frozen_bit_mask, +++ const unsigned char* frozen_bits, +++ const unsigned char* info_bits, ++ unsigned int frame_size) ++ { ++- // interleave ++- interleave_frozen_and_info_bits(temp, frozen_bit_mask, frozen_bits, info_bits, frame_size); ++- volk_8u_x2_encodeframepolar_8u_generic(frame, temp, frame_size); +++ // interleave +++ interleave_frozen_and_info_bits( +++ temp, frozen_bit_mask, frozen_bits, info_bits, frame_size); +++ volk_8u_x2_encodeframepolar_8u_generic(frame, temp, frame_size); ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++@@ -106,14 +114,17 @@ volk_8u_x3_encodepolar_8u_x2_generic(unsigned char* frame, unsigned char* temp, ++ #include ++ ++ static inline void ++-volk_8u_x3_encodepolar_8u_x2_u_ssse3(unsigned char* frame, unsigned char* temp, ++- const unsigned char* frozen_bit_mask, ++- const unsigned char* frozen_bits, const unsigned char* info_bits, ++- unsigned int frame_size) +++volk_8u_x3_encodepolar_8u_x2_u_ssse3(unsigned char* frame, +++ unsigned char* temp, +++ const unsigned char* frozen_bit_mask, +++ const unsigned char* frozen_bits, +++ const unsigned char* info_bits, +++ unsigned int frame_size) ++ { ++- // interleave ++- interleave_frozen_and_info_bits(temp, frozen_bit_mask, frozen_bits, info_bits, frame_size); ++- volk_8u_x2_encodeframepolar_8u_u_ssse3(frame, temp, frame_size); +++ // interleave +++ interleave_frozen_and_info_bits( +++ temp, frozen_bit_mask, frozen_bits, info_bits, frame_size); +++ volk_8u_x2_encodeframepolar_8u_u_ssse3(frame, temp, frame_size); ++ } ++ ++ #endif /* LV_HAVE_SSSE3 */ ++@@ -121,13 +132,16 @@ volk_8u_x3_encodepolar_8u_x2_u_ssse3(unsigned char* frame, unsigned char* temp, ++ #ifdef LV_HAVE_AVX2 ++ #include ++ static inline void ++-volk_8u_x3_encodepolar_8u_x2_u_avx2(unsigned char* frame, unsigned char* temp, ++- const unsigned char* frozen_bit_mask, ++- const unsigned char* frozen_bits, const unsigned char* info_bits, ++- unsigned int frame_size) +++volk_8u_x3_encodepolar_8u_x2_u_avx2(unsigned char* frame, +++ unsigned char* temp, +++ const unsigned char* frozen_bit_mask, +++ const unsigned char* frozen_bits, +++ const unsigned char* info_bits, +++ unsigned int frame_size) ++ { ++- interleave_frozen_and_info_bits(temp, frozen_bit_mask, frozen_bits, info_bits, frame_size); ++- volk_8u_x2_encodeframepolar_8u_u_avx2(frame, temp, frame_size); +++ interleave_frozen_and_info_bits( +++ temp, frozen_bit_mask, frozen_bits, info_bits, frame_size); +++ volk_8u_x2_encodeframepolar_8u_u_avx2(frame, temp, frame_size); ++ } ++ #endif /* LV_HAVE_AVX2 */ ++ ++@@ -139,26 +153,32 @@ volk_8u_x3_encodepolar_8u_x2_u_avx2(unsigned char* frame, unsigned char* temp, ++ #ifdef LV_HAVE_SSSE3 ++ #include ++ static inline void ++-volk_8u_x3_encodepolar_8u_x2_a_ssse3(unsigned char* frame, unsigned char* temp, ++- const unsigned char* frozen_bit_mask, ++- const unsigned char* frozen_bits, const unsigned char* info_bits, ++- unsigned int frame_size) +++volk_8u_x3_encodepolar_8u_x2_a_ssse3(unsigned char* frame, +++ unsigned char* temp, +++ const unsigned char* frozen_bit_mask, +++ const unsigned char* frozen_bits, +++ const unsigned char* info_bits, +++ unsigned int frame_size) ++ 
{ ++- interleave_frozen_and_info_bits(temp, frozen_bit_mask, frozen_bits, info_bits, frame_size); ++- volk_8u_x2_encodeframepolar_8u_a_ssse3(frame, temp, frame_size); +++ interleave_frozen_and_info_bits( +++ temp, frozen_bit_mask, frozen_bits, info_bits, frame_size); +++ volk_8u_x2_encodeframepolar_8u_a_ssse3(frame, temp, frame_size); ++ } ++ #endif /* LV_HAVE_SSSE3 */ ++ ++ #ifdef LV_HAVE_AVX2 ++ #include ++ static inline void ++-volk_8u_x3_encodepolar_8u_x2_a_avx2(unsigned char* frame, unsigned char* temp, ++- const unsigned char* frozen_bit_mask, ++- const unsigned char* frozen_bits, const unsigned char* info_bits, ++- unsigned int frame_size) +++volk_8u_x3_encodepolar_8u_x2_a_avx2(unsigned char* frame, +++ unsigned char* temp, +++ const unsigned char* frozen_bit_mask, +++ const unsigned char* frozen_bits, +++ const unsigned char* info_bits, +++ unsigned int frame_size) ++ { ++- interleave_frozen_and_info_bits(temp, frozen_bit_mask, frozen_bits, info_bits, frame_size); ++- volk_8u_x2_encodeframepolar_8u_a_avx2(frame, temp, frame_size); +++ interleave_frozen_and_info_bits( +++ temp, frozen_bit_mask, frozen_bits, info_bits, frame_size); +++ volk_8u_x2_encodeframepolar_8u_a_avx2(frame, temp, frame_size); ++ } ++ #endif /* LV_HAVE_AVX2 */ ++ ++diff --git a/kernels/volk/volk_8u_x3_encodepolarpuppet_8u.h b/kernels/volk/volk_8u_x3_encodepolarpuppet_8u.h ++index 1f6be2c..1badbf1 100644 ++--- a/kernels/volk/volk_8u_x3_encodepolarpuppet_8u.h +++++ b/kernels/volk/volk_8u_x3_encodepolarpuppet_8u.h ++@@ -29,71 +29,82 @@ ++ #include ++ #include ++ ++-static inline unsigned int ++-next_lower_power_of_two(const unsigned int val) +++static inline unsigned int next_lower_power_of_two(const unsigned int val) ++ { ++- // algorithm found and adopted from: http://acius2.blogspot.de/2007/11/calculating-next-power-of-2.html ++- unsigned int res = val; ++- res = (res >> 1) | res; ++- res = (res >> 2) | res; ++- res = (res >> 4) | res; ++- res = (res >> 8) | res; ++- res = (res >> 16) | res; ++- res += 1; ++- return res >> 1; +++ // algorithm found and adopted from: +++ // http://acius2.blogspot.de/2007/11/calculating-next-power-of-2.html +++ unsigned int res = val; +++ res = (res >> 1) | res; +++ res = (res >> 2) | res; +++ res = (res >> 4) | res; +++ res = (res >> 8) | res; +++ res = (res >> 16) | res; +++ res += 1; +++ return res >> 1; ++ } ++ ++-static inline void ++-adjust_frozen_mask(unsigned char* mask, const unsigned int frame_size) +++static inline void adjust_frozen_mask(unsigned char* mask, const unsigned int frame_size) ++ { ++- // just like the rest of the puppet this function exists for test purposes only. ++- unsigned int i; ++- for(i = 0; i < frame_size; ++i){ ++- *mask = (*mask & 0x80) ? 0xFF : 0x00; ++- mask++; ++- } +++ // just like the rest of the puppet this function exists for test purposes only. +++ unsigned int i; +++ for (i = 0; i < frame_size; ++i) { +++ *mask = (*mask & 0x80) ? 
0xFF : 0x00; +++ mask++; +++ } ++ } ++ ++ #ifdef LV_HAVE_GENERIC ++ static inline void ++-volk_8u_x3_encodepolarpuppet_8u_generic(unsigned char* frame, unsigned char* frozen_bit_mask, ++- const unsigned char* frozen_bits, const unsigned char* info_bits, ++- unsigned int frame_size) +++volk_8u_x3_encodepolarpuppet_8u_generic(unsigned char* frame, +++ unsigned char* frozen_bit_mask, +++ const unsigned char* frozen_bits, +++ const unsigned char* info_bits, +++ unsigned int frame_size) ++ { ++- frame_size = next_lower_power_of_two(frame_size); ++- unsigned char* temp = (unsigned char*) volk_malloc(sizeof(unsigned char) * frame_size, volk_get_alignment()); ++- adjust_frozen_mask(frozen_bit_mask, frame_size); ++- volk_8u_x3_encodepolar_8u_x2_generic(frame, temp, frozen_bit_mask, frozen_bits, info_bits, frame_size); ++- volk_free(temp); +++ frame_size = next_lower_power_of_two(frame_size); +++ unsigned char* temp = (unsigned char*)volk_malloc(sizeof(unsigned char) * frame_size, +++ volk_get_alignment()); +++ adjust_frozen_mask(frozen_bit_mask, frame_size); +++ volk_8u_x3_encodepolar_8u_x2_generic( +++ frame, temp, frozen_bit_mask, frozen_bits, info_bits, frame_size); +++ volk_free(temp); ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++ ++ #ifdef LV_HAVE_SSSE3 ++ static inline void ++-volk_8u_x3_encodepolarpuppet_8u_u_ssse3(unsigned char* frame, unsigned char* frozen_bit_mask, ++- const unsigned char* frozen_bits, const unsigned char* info_bits, ++- unsigned int frame_size) +++volk_8u_x3_encodepolarpuppet_8u_u_ssse3(unsigned char* frame, +++ unsigned char* frozen_bit_mask, +++ const unsigned char* frozen_bits, +++ const unsigned char* info_bits, +++ unsigned int frame_size) ++ { ++- frame_size = next_lower_power_of_two(frame_size); ++- unsigned char* temp = (unsigned char*) volk_malloc(sizeof(unsigned char) * frame_size, volk_get_alignment()); ++- adjust_frozen_mask(frozen_bit_mask, frame_size); ++- volk_8u_x3_encodepolar_8u_x2_u_ssse3(frame, temp, frozen_bit_mask, frozen_bits, info_bits, frame_size); ++- volk_free(temp); +++ frame_size = next_lower_power_of_two(frame_size); +++ unsigned char* temp = (unsigned char*)volk_malloc(sizeof(unsigned char) * frame_size, +++ volk_get_alignment()); +++ adjust_frozen_mask(frozen_bit_mask, frame_size); +++ volk_8u_x3_encodepolar_8u_x2_u_ssse3( +++ frame, temp, frozen_bit_mask, frozen_bits, info_bits, frame_size); +++ volk_free(temp); ++ } ++ #endif /* LV_HAVE_SSSE3 */ ++ ++ #ifdef LV_HAVE_AVX2 ++ static inline void ++-volk_8u_x3_encodepolarpuppet_8u_u_avx2(unsigned char* frame, unsigned char* frozen_bit_mask, ++- const unsigned char* frozen_bits, const unsigned char* info_bits, ++- unsigned int frame_size) +++volk_8u_x3_encodepolarpuppet_8u_u_avx2(unsigned char* frame, +++ unsigned char* frozen_bit_mask, +++ const unsigned char* frozen_bits, +++ const unsigned char* info_bits, +++ unsigned int frame_size) ++ { ++- frame_size = next_lower_power_of_two(frame_size); ++- unsigned char* temp = (unsigned char*) volk_malloc(sizeof(unsigned char) * frame_size, volk_get_alignment()); ++- adjust_frozen_mask(frozen_bit_mask, frame_size); ++- volk_8u_x3_encodepolar_8u_x2_u_avx2(frame, temp, frozen_bit_mask, frozen_bits, info_bits, frame_size); ++- volk_free(temp); +++ frame_size = next_lower_power_of_two(frame_size); +++ unsigned char* temp = (unsigned char*)volk_malloc(sizeof(unsigned char) * frame_size, +++ volk_get_alignment()); +++ adjust_frozen_mask(frozen_bit_mask, frame_size); +++ volk_8u_x3_encodepolar_8u_x2_u_avx2( +++ frame, temp, frozen_bit_mask, frozen_bits, 
info_bits, frame_size); +++ volk_free(temp); ++ } ++ #endif /* LV_HAVE_AVX2 */ ++ ++@@ -104,29 +115,37 @@ volk_8u_x3_encodepolarpuppet_8u_u_avx2(unsigned char* frame, unsigned char* froz ++ ++ #ifdef LV_HAVE_SSSE3 ++ static inline void ++-volk_8u_x3_encodepolarpuppet_8u_a_ssse3(unsigned char* frame, unsigned char* frozen_bit_mask, ++- const unsigned char* frozen_bits, const unsigned char* info_bits, ++- unsigned int frame_size) +++volk_8u_x3_encodepolarpuppet_8u_a_ssse3(unsigned char* frame, +++ unsigned char* frozen_bit_mask, +++ const unsigned char* frozen_bits, +++ const unsigned char* info_bits, +++ unsigned int frame_size) ++ { ++- frame_size = next_lower_power_of_two(frame_size); ++- unsigned char* temp = (unsigned char*) volk_malloc(sizeof(unsigned char) * frame_size, volk_get_alignment()); ++- adjust_frozen_mask(frozen_bit_mask, frame_size); ++- volk_8u_x3_encodepolar_8u_x2_a_ssse3(frame, temp, frozen_bit_mask, frozen_bits, info_bits, frame_size); ++- volk_free(temp); +++ frame_size = next_lower_power_of_two(frame_size); +++ unsigned char* temp = (unsigned char*)volk_malloc(sizeof(unsigned char) * frame_size, +++ volk_get_alignment()); +++ adjust_frozen_mask(frozen_bit_mask, frame_size); +++ volk_8u_x3_encodepolar_8u_x2_a_ssse3( +++ frame, temp, frozen_bit_mask, frozen_bits, info_bits, frame_size); +++ volk_free(temp); ++ } ++ #endif /* LV_HAVE_SSSE3 */ ++ ++ #ifdef LV_HAVE_AVX2 ++ static inline void ++-volk_8u_x3_encodepolarpuppet_8u_a_avx2(unsigned char* frame, unsigned char* frozen_bit_mask, ++- const unsigned char* frozen_bits, const unsigned char* info_bits, ++- unsigned int frame_size) +++volk_8u_x3_encodepolarpuppet_8u_a_avx2(unsigned char* frame, +++ unsigned char* frozen_bit_mask, +++ const unsigned char* frozen_bits, +++ const unsigned char* info_bits, +++ unsigned int frame_size) ++ { ++- frame_size = next_lower_power_of_two(frame_size); ++- unsigned char* temp = (unsigned char*) volk_malloc(sizeof(unsigned char) * frame_size, volk_get_alignment()); ++- adjust_frozen_mask(frozen_bit_mask, frame_size); ++- volk_8u_x3_encodepolar_8u_x2_a_avx2(frame, temp, frozen_bit_mask, frozen_bits, info_bits, frame_size); ++- volk_free(temp); +++ frame_size = next_lower_power_of_two(frame_size); +++ unsigned char* temp = (unsigned char*)volk_malloc(sizeof(unsigned char) * frame_size, +++ volk_get_alignment()); +++ adjust_frozen_mask(frozen_bit_mask, frame_size); +++ volk_8u_x3_encodepolar_8u_x2_a_avx2( +++ frame, temp, frozen_bit_mask, frozen_bits, info_bits, frame_size); +++ volk_free(temp); ++ } ++ #endif /* LV_HAVE_AVX2 */ ++ ++diff --git a/kernels/volk/volk_8u_x4_conv_k7_r2_8u.h b/kernels/volk/volk_8u_x4_conv_k7_r2_8u.h ++index 029ba75..89460a6 100644 ++--- a/kernels/volk/volk_8u_x4_conv_k7_r2_8u.h +++++ b/kernels/volk/volk_8u_x4_conv_k7_r2_8u.h ++@@ -30,8 +30,9 @@ ++ * ++ * Dispatcher Prototype ++ * \code ++- * void volk_8u_x4_conv_k7_r2_8u(unsigned char* Y, unsigned char* X, unsigned char* syms, unsigned char* dec, unsigned int framebits, unsigned int excess, unsigned char* Branchtab) ++- * \endcode +++ * void volk_8u_x4_conv_k7_r2_8u(unsigned char* Y, unsigned char* X, unsigned char* syms, +++ * unsigned char* dec, unsigned int framebits, unsigned int excess, unsigned char* +++ * Branchtab) \endcode ++ * ++ * \b Inputs ++ * \li X: ++@@ -58,67 +59,71 @@ ++ #define INCLUDED_volk_8u_x4_conv_k7_r2_8u_H ++ ++ typedef union { ++- unsigned char/*DECISIONTYPE*/ t[64/*NUMSTATES*//8/*DECISIONTYPE_BITSIZE*/]; ++- unsigned int w[64/*NUMSTATES*//32]; ++- unsigned short s[64/*NUMSTATES*//16]; 
++- unsigned char c[64/*NUMSTATES*//8]; +++ unsigned char /*DECISIONTYPE*/ t[64 /*NUMSTATES*/ / 8 /*DECISIONTYPE_BITSIZE*/]; +++ unsigned int w[64 /*NUMSTATES*/ / 32]; +++ unsigned short s[64 /*NUMSTATES*/ / 16]; +++ unsigned char c[64 /*NUMSTATES*/ / 8]; ++ #ifdef _MSC_VER ++ } decision_t; ++ #else ++-} decision_t __attribute__ ((aligned (16))); +++} decision_t __attribute__((aligned(16))); ++ #endif ++ ++ ++-static inline void ++-renormalize(unsigned char* X, unsigned char threshold) +++static inline void renormalize(unsigned char* X, unsigned char threshold) ++ { ++- int NUMSTATES = 64; ++- int i; ++- ++- unsigned char min=X[0]; ++- //if(min > threshold) { ++- for(i=0;iX[i]) ++- min=X[i]; ++- for(i=0;i threshold) { +++ for (i = 0; i < NUMSTATES; i++) +++ if (min > X[i]) +++ min = X[i]; +++ for (i = 0; i < NUMSTATES; i++) +++ X[i] -= min; +++ //} ++ } ++ ++ ++-//helper BFLY for GENERIC version ++-static inline void ++-BFLY(int i, int s, unsigned char * syms, unsigned char *Y, ++- unsigned char *X, decision_t * d, unsigned char* Branchtab) +++// helper BFLY for GENERIC version +++static inline void BFLY(int i, +++ int s, +++ unsigned char* syms, +++ unsigned char* Y, +++ unsigned char* X, +++ decision_t* d, +++ unsigned char* Branchtab) ++ { ++- int j, decision0, decision1; ++- unsigned char metric,m0,m1,m2,m3; +++ int j, decision0, decision1; +++ unsigned char metric, m0, m1, m2, m3; ++ ++- int NUMSTATES = 64; ++- int RATE = 2; ++- int METRICSHIFT = 1; ++- int PRECISIONSHIFT = 2; +++ int NUMSTATES = 64; +++ int RATE = 2; +++ int METRICSHIFT = 1; +++ int PRECISIONSHIFT = 2; ++ ++- metric =0; ++- for(j=0;j>METRICSHIFT; ++- metric=metric>>PRECISIONSHIFT; +++ metric = 0; +++ for (j = 0; j < RATE; j++) +++ metric += (Branchtab[i + j * NUMSTATES / 2] ^ syms[s * RATE + j]) >> METRICSHIFT; +++ metric = metric >> PRECISIONSHIFT; ++ ++- unsigned char max = ((RATE*((256 -1)>>METRICSHIFT))>>PRECISIONSHIFT); +++ unsigned char max = ((RATE * ((256 - 1) >> METRICSHIFT)) >> PRECISIONSHIFT); ++ ++- m0 = X[i] + metric; ++- m1 = X[i+NUMSTATES/2] + (max - metric); ++- m2 = X[i] + (max - metric); ++- m3 = X[i+NUMSTATES/2] + metric; +++ m0 = X[i] + metric; +++ m1 = X[i + NUMSTATES / 2] + (max - metric); +++ m2 = X[i] + (max - metric); +++ m3 = X[i + NUMSTATES / 2] + metric; ++ ++- decision0 = (signed int)(m0-m1) > 0; ++- decision1 = (signed int)(m2-m3) > 0; +++ decision0 = (signed int)(m0 - m1) > 0; +++ decision1 = (signed int)(m2 - m3) > 0; ++ ++- Y[2*i] = decision0 ? m1 : m0; ++- Y[2*i+1] = decision1 ? m3 : m2; +++ Y[2 * i] = decision0 ? m1 : m0; +++ Y[2 * i + 1] = decision1 ? 
m3 : m2; ++ ++- d->w[i/(sizeof(unsigned int)*8/2)+s*(sizeof(decision_t)/sizeof(unsigned int))] |= ++- (decision0|decision1<<1) << ((2*i)&(sizeof(unsigned int)*8-1)); +++ d->w[i / (sizeof(unsigned int) * 8 / 2) + +++ s * (sizeof(decision_t) / sizeof(unsigned int))] |= +++ (decision0 | decision1 << 1) << ((2 * i) & (sizeof(unsigned int) * 8 - 1)); ++ } ++ ++ ++@@ -127,188 +132,199 @@ BFLY(int i, int s, unsigned char * syms, unsigned char *Y, ++ #include ++ #include ++ ++-static inline void ++-volk_8u_x4_conv_k7_r2_8u_avx2(unsigned char* Y, unsigned char* X, ++- unsigned char* syms, unsigned char* dec, ++- unsigned int framebits, unsigned int excess, ++- unsigned char* Branchtab) +++static inline void volk_8u_x4_conv_k7_r2_8u_avx2(unsigned char* Y, +++ unsigned char* X, +++ unsigned char* syms, +++ unsigned char* dec, +++ unsigned int framebits, +++ unsigned int excess, +++ unsigned char* Branchtab) ++ { ++- unsigned int i9; ++- for(i9 = 0; i9 < ((framebits + excess)>>1); i9++) { ++- unsigned char a75, a81; ++- int a73, a92; ++- int s20, s21; ++- unsigned char *a80, *b6; ++- int *a110, *a91, *a93; ++- __m256i *a112, *a71, *a72, *a77, *a83, *a95; ++- __m256i a86, a87; ++- __m256i a76, a78, a79, a82, a84, a85, a88, a89 ++- , a90, d10, d9, m23, m24, m25 ++- , m26, s18, s19, s22 ++- , s23, s24, s25, t13, t14, t15; ++- a71 = ((__m256i *) X); ++- s18 = *(a71); ++- a72 = (a71 + 1); ++- s19 = *(a72); ++- s22 = _mm256_permute2x128_si256(s18,s19,0x20); ++- s19 = _mm256_permute2x128_si256(s18,s19,0x31); ++- s18 = s22; ++- a73 = (4 * i9); ++- b6 = (syms + a73); ++- a75 = *(b6); ++- a76 = _mm256_set1_epi8(a75); ++- a77 = ((__m256i *) Branchtab); ++- a78 = *(a77); ++- a79 = _mm256_xor_si256(a76, a78); ++- a80 = (b6 + 1); ++- a81 = *(a80); ++- a82 = _mm256_set1_epi8(a81); ++- a83 = (a77 + 1); ++- a84 = *(a83); ++- a85 = _mm256_xor_si256(a82, a84); ++- t13 = _mm256_avg_epu8(a79,a85); ++- a86 = ((__m256i ) t13); ++- a87 = _mm256_srli_epi16(a86, 2); ++- a88 = ((__m256i ) a87); ++- t14 = _mm256_and_si256(a88, _mm256_set1_epi8(63)); ++- t15 = _mm256_subs_epu8(_mm256_set1_epi8(63), t14); ++- m23 = _mm256_adds_epu8(s18, t14); ++- m24 = _mm256_adds_epu8(s19, t15); ++- m25 = _mm256_adds_epu8(s18, t15); ++- m26 = _mm256_adds_epu8(s19, t14); ++- a89 = _mm256_min_epu8(m24, m23); ++- d9 = _mm256_cmpeq_epi8(a89, m24); ++- a90 = _mm256_min_epu8(m26, m25); ++- d10 = _mm256_cmpeq_epi8(a90, m26); ++- s22 = _mm256_unpacklo_epi8(d9,d10); ++- s23 = _mm256_unpackhi_epi8(d9,d10); ++- s20 = _mm256_movemask_epi8(_mm256_permute2x128_si256(s22, s23, 0x20)); ++- a91 = ((int *) dec); ++- a92 = (4 * i9); ++- a93 = (a91 + a92); ++- *(a93) = s20; ++- s21 = _mm256_movemask_epi8(_mm256_permute2x128_si256(s22, s23, 0x31)); ++- a110 = (a93 + 1); ++- *(a110) = s21; ++- s22 = _mm256_unpacklo_epi8(a89, a90); ++- s23 = _mm256_unpackhi_epi8(a89, a90); ++- a95 = ((__m256i *) Y); ++- s24 = _mm256_permute2x128_si256(s22, s23, 0x20); ++- *(a95) = s24; ++- s23 = _mm256_permute2x128_si256(s22, s23, 0x31); ++- a112 = (a95 + 1); ++- *(a112) = s23; ++- if ((((unsigned char *) Y)[0]>210)) { ++- __m256i m5, m6; ++- m5 = ((__m256i *) Y)[0]; ++- m5 = _mm256_min_epu8(m5, ((__m256i *) Y)[1]); ++- __m256i m7; ++- m7 = _mm256_min_epu8(_mm256_srli_si256(m5, 8), m5); ++- m7 = ((__m256i ) _mm256_min_epu8(((__m256i ) _mm256_srli_epi64(m7, 32)), ((__m256i ) m7))); ++- m7 = ((__m256i ) _mm256_min_epu8(((__m256i ) _mm256_srli_epi64(m7, 16)), ((__m256i ) m7))); ++- m7 = ((__m256i ) _mm256_min_epu8(((__m256i ) _mm256_srli_epi64(m7, 8)), ((__m256i ) m7))); ++- m7 = 
_mm256_unpacklo_epi8(m7, m7); ++- m7 = _mm256_shufflelo_epi16(m7, 0); ++- m6 = _mm256_unpacklo_epi64(m7, m7); ++- m6 = _mm256_permute2x128_si256(m6, m6, 0); //copy lower half of m6 to upper half, since above ops operate on 128 bit lanes ++- ((__m256i *) Y)[0] = _mm256_subs_epu8(((__m256i *) Y)[0], m6); ++- ((__m256i *) Y)[1] = _mm256_subs_epu8(((__m256i *) Y)[1], m6); +++ unsigned int i9; +++ for (i9 = 0; i9 < ((framebits + excess) >> 1); i9++) { +++ unsigned char a75, a81; +++ int a73, a92; +++ int s20, s21; +++ unsigned char *a80, *b6; +++ int *a110, *a91, *a93; +++ __m256i *a112, *a71, *a72, *a77, *a83, *a95; +++ __m256i a86, a87; +++ __m256i a76, a78, a79, a82, a84, a85, a88, a89, a90, d10, d9, m23, m24, m25, m26, +++ s18, s19, s22, s23, s24, s25, t13, t14, t15; +++ a71 = ((__m256i*)X); +++ s18 = *(a71); +++ a72 = (a71 + 1); +++ s19 = *(a72); +++ s22 = _mm256_permute2x128_si256(s18, s19, 0x20); +++ s19 = _mm256_permute2x128_si256(s18, s19, 0x31); +++ s18 = s22; +++ a73 = (4 * i9); +++ b6 = (syms + a73); +++ a75 = *(b6); +++ a76 = _mm256_set1_epi8(a75); +++ a77 = ((__m256i*)Branchtab); +++ a78 = *(a77); +++ a79 = _mm256_xor_si256(a76, a78); +++ a80 = (b6 + 1); +++ a81 = *(a80); +++ a82 = _mm256_set1_epi8(a81); +++ a83 = (a77 + 1); +++ a84 = *(a83); +++ a85 = _mm256_xor_si256(a82, a84); +++ t13 = _mm256_avg_epu8(a79, a85); +++ a86 = ((__m256i)t13); +++ a87 = _mm256_srli_epi16(a86, 2); +++ a88 = ((__m256i)a87); +++ t14 = _mm256_and_si256(a88, _mm256_set1_epi8(63)); +++ t15 = _mm256_subs_epu8(_mm256_set1_epi8(63), t14); +++ m23 = _mm256_adds_epu8(s18, t14); +++ m24 = _mm256_adds_epu8(s19, t15); +++ m25 = _mm256_adds_epu8(s18, t15); +++ m26 = _mm256_adds_epu8(s19, t14); +++ a89 = _mm256_min_epu8(m24, m23); +++ d9 = _mm256_cmpeq_epi8(a89, m24); +++ a90 = _mm256_min_epu8(m26, m25); +++ d10 = _mm256_cmpeq_epi8(a90, m26); +++ s22 = _mm256_unpacklo_epi8(d9, d10); +++ s23 = _mm256_unpackhi_epi8(d9, d10); +++ s20 = _mm256_movemask_epi8(_mm256_permute2x128_si256(s22, s23, 0x20)); +++ a91 = ((int*)dec); +++ a92 = (4 * i9); +++ a93 = (a91 + a92); +++ *(a93) = s20; +++ s21 = _mm256_movemask_epi8(_mm256_permute2x128_si256(s22, s23, 0x31)); +++ a110 = (a93 + 1); +++ *(a110) = s21; +++ s22 = _mm256_unpacklo_epi8(a89, a90); +++ s23 = _mm256_unpackhi_epi8(a89, a90); +++ a95 = ((__m256i*)Y); +++ s24 = _mm256_permute2x128_si256(s22, s23, 0x20); +++ *(a95) = s24; +++ s23 = _mm256_permute2x128_si256(s22, s23, 0x31); +++ a112 = (a95 + 1); +++ *(a112) = s23; +++ if ((((unsigned char*)Y)[0] > 210)) { +++ __m256i m5, m6; +++ m5 = ((__m256i*)Y)[0]; +++ m5 = _mm256_min_epu8(m5, ((__m256i*)Y)[1]); +++ __m256i m7; +++ m7 = _mm256_min_epu8(_mm256_srli_si256(m5, 8), m5); +++ m7 = ((__m256i)_mm256_min_epu8(((__m256i)_mm256_srli_epi64(m7, 32)), +++ ((__m256i)m7))); +++ m7 = ((__m256i)_mm256_min_epu8(((__m256i)_mm256_srli_epi64(m7, 16)), +++ ((__m256i)m7))); +++ m7 = ((__m256i)_mm256_min_epu8(((__m256i)_mm256_srli_epi64(m7, 8)), +++ ((__m256i)m7))); +++ m7 = _mm256_unpacklo_epi8(m7, m7); +++ m7 = _mm256_shufflelo_epi16(m7, 0); +++ m6 = _mm256_unpacklo_epi64(m7, m7); +++ m6 = _mm256_permute2x128_si256( +++ m6, m6, 0); // copy lower half of m6 to upper half, since above ops +++ // operate on 128 bit lanes +++ ((__m256i*)Y)[0] = _mm256_subs_epu8(((__m256i*)Y)[0], m6); +++ ((__m256i*)Y)[1] = _mm256_subs_epu8(((__m256i*)Y)[1], m6); +++ } +++ unsigned char a188, a194; +++ int a205; +++ int s48, s54; +++ unsigned char *a187, *a193; +++ int *a204, *a206, *a223, *b16; +++ __m256i *a184, *a185, *a190, *a196, *a208, *a225; +++ 
__m256i a199, a200; +++ __m256i a189, a191, a192, a195, a197, a198, a201, a202, a203, d17, d18, m39, m40, +++ m41, m42, s46, s47, s50, s51, t25, t26, t27; +++ a184 = ((__m256i*)Y); +++ s46 = *(a184); +++ a185 = (a184 + 1); +++ s47 = *(a185); +++ s50 = _mm256_permute2x128_si256(s46, s47, 0x20); +++ s47 = _mm256_permute2x128_si256(s46, s47, 0x31); +++ s46 = s50; +++ a187 = (b6 + 2); +++ a188 = *(a187); +++ a189 = _mm256_set1_epi8(a188); +++ a190 = ((__m256i*)Branchtab); +++ a191 = *(a190); +++ a192 = _mm256_xor_si256(a189, a191); +++ a193 = (b6 + 3); +++ a194 = *(a193); +++ a195 = _mm256_set1_epi8(a194); +++ a196 = (a190 + 1); +++ a197 = *(a196); +++ a198 = _mm256_xor_si256(a195, a197); +++ t25 = _mm256_avg_epu8(a192, a198); +++ a199 = ((__m256i)t25); +++ a200 = _mm256_srli_epi16(a199, 2); +++ a201 = ((__m256i)a200); +++ t26 = _mm256_and_si256(a201, _mm256_set1_epi8(63)); +++ t27 = _mm256_subs_epu8(_mm256_set1_epi8(63), t26); +++ m39 = _mm256_adds_epu8(s46, t26); +++ m40 = _mm256_adds_epu8(s47, t27); +++ m41 = _mm256_adds_epu8(s46, t27); +++ m42 = _mm256_adds_epu8(s47, t26); +++ a202 = _mm256_min_epu8(m40, m39); +++ d17 = _mm256_cmpeq_epi8(a202, m40); +++ a203 = _mm256_min_epu8(m42, m41); +++ d18 = _mm256_cmpeq_epi8(a203, m42); +++ s24 = _mm256_unpacklo_epi8(d17, d18); +++ s25 = _mm256_unpackhi_epi8(d17, d18); +++ s48 = _mm256_movemask_epi8(_mm256_permute2x128_si256(s24, s25, 0x20)); +++ a204 = ((int*)dec); +++ a205 = (4 * i9); +++ b16 = (a204 + a205); +++ a206 = (b16 + 2); +++ *(a206) = s48; +++ s54 = _mm256_movemask_epi8(_mm256_permute2x128_si256(s24, s25, 0x31)); +++ a223 = (b16 + 3); +++ *(a223) = s54; +++ s50 = _mm256_unpacklo_epi8(a202, a203); +++ s51 = _mm256_unpackhi_epi8(a202, a203); +++ s25 = _mm256_permute2x128_si256(s50, s51, 0x20); +++ s51 = _mm256_permute2x128_si256(s50, s51, 0x31); +++ a208 = ((__m256i*)X); +++ *(a208) = s25; +++ a225 = (a208 + 1); +++ *(a225) = s51; +++ +++ if ((((unsigned char*)X)[0] > 210)) { +++ __m256i m12, m13; +++ m12 = ((__m256i*)X)[0]; +++ m12 = _mm256_min_epu8(m12, ((__m256i*)X)[1]); +++ __m256i m14; +++ m14 = _mm256_min_epu8(_mm256_srli_si256(m12, 8), m12); +++ m14 = ((__m256i)_mm256_min_epu8(((__m256i)_mm256_srli_epi64(m14, 32)), +++ ((__m256i)m14))); +++ m14 = ((__m256i)_mm256_min_epu8(((__m256i)_mm256_srli_epi64(m14, 16)), +++ ((__m256i)m14))); +++ m14 = ((__m256i)_mm256_min_epu8(((__m256i)_mm256_srli_epi64(m14, 8)), +++ ((__m256i)m14))); +++ m14 = _mm256_unpacklo_epi8(m14, m14); +++ m14 = _mm256_shufflelo_epi16(m14, 0); +++ m13 = _mm256_unpacklo_epi64(m14, m14); +++ m13 = _mm256_permute2x128_si256(m13, m13, 0); +++ ((__m256i*)X)[0] = _mm256_subs_epu8(((__m256i*)X)[0], m13); +++ ((__m256i*)X)[1] = _mm256_subs_epu8(((__m256i*)X)[1], m13); +++ } ++ } ++- unsigned char a188, a194; ++- int a205; ++- int s48, s54; ++- unsigned char *a187, *a193; ++- int *a204, *a206, *a223, *b16; ++- __m256i *a184, *a185, *a190, *a196, *a208, *a225; ++- __m256i a199, a200; ++- __m256i a189, a191, a192, a195, a197, a198, a201 ++- , a202, a203, d17, d18, m39, m40, m41 ++- , m42, s46, s47, s50 ++- , s51, t25, t26, t27; ++- a184 = ((__m256i *) Y); ++- s46 = *(a184); ++- a185 = (a184 + 1); ++- s47 = *(a185); ++- s50 = _mm256_permute2x128_si256(s46,s47,0x20); ++- s47 = _mm256_permute2x128_si256(s46,s47,0x31); ++- s46 = s50; ++- a187 = (b6 + 2); ++- a188 = *(a187); ++- a189 = _mm256_set1_epi8(a188); ++- a190 = ((__m256i *) Branchtab); ++- a191 = *(a190); ++- a192 = _mm256_xor_si256(a189, a191); ++- a193 = (b6 + 3); ++- a194 = *(a193); ++- a195 = _mm256_set1_epi8(a194); ++- 
a196 = (a190 + 1); ++- a197 = *(a196); ++- a198 = _mm256_xor_si256(a195, a197); ++- t25 = _mm256_avg_epu8(a192,a198); ++- a199 = ((__m256i ) t25); ++- a200 = _mm256_srli_epi16(a199, 2); ++- a201 = ((__m256i ) a200); ++- t26 = _mm256_and_si256(a201, _mm256_set1_epi8(63)); ++- t27 = _mm256_subs_epu8(_mm256_set1_epi8(63), t26); ++- m39 = _mm256_adds_epu8(s46, t26); ++- m40 = _mm256_adds_epu8(s47, t27); ++- m41 = _mm256_adds_epu8(s46, t27); ++- m42 = _mm256_adds_epu8(s47, t26); ++- a202 = _mm256_min_epu8(m40, m39); ++- d17 = _mm256_cmpeq_epi8(a202, m40); ++- a203 = _mm256_min_epu8(m42, m41); ++- d18 = _mm256_cmpeq_epi8(a203, m42); ++- s24 = _mm256_unpacklo_epi8(d17,d18); ++- s25 = _mm256_unpackhi_epi8(d17,d18); ++- s48 = _mm256_movemask_epi8(_mm256_permute2x128_si256(s24, s25, 0x20)); ++- a204 = ((int *) dec); ++- a205 = (4 * i9); ++- b16 = (a204 + a205); ++- a206 = (b16 + 2); ++- *(a206) = s48; ++- s54 = _mm256_movemask_epi8(_mm256_permute2x128_si256(s24, s25, 0x31)); ++- a223 = (b16 + 3); ++- *(a223) = s54; ++- s50 = _mm256_unpacklo_epi8(a202, a203); ++- s51 = _mm256_unpackhi_epi8(a202, a203); ++- s25 = _mm256_permute2x128_si256(s50, s51, 0x20); ++- s51 = _mm256_permute2x128_si256(s50, s51, 0x31); ++- a208 = ((__m256i *) X); ++- *(a208) = s25; ++- a225 = (a208 + 1); ++- *(a225) = s51; ++- ++- if ((((unsigned char *) X)[0]>210)) { ++- __m256i m12, m13; ++- m12 = ((__m256i *) X)[0]; ++- m12 = _mm256_min_epu8(m12, ((__m256i *) X)[1]); ++- __m256i m14; ++- m14 = _mm256_min_epu8(_mm256_srli_si256(m12, 8), m12); ++- m14 = ((__m256i ) _mm256_min_epu8(((__m256i ) _mm256_srli_epi64(m14, 32)), ((__m256i ) m14))); ++- m14 = ((__m256i ) _mm256_min_epu8(((__m256i ) _mm256_srli_epi64(m14, 16)), ((__m256i ) m14))); ++- m14 = ((__m256i ) _mm256_min_epu8(((__m256i ) _mm256_srli_epi64(m14, 8)), ((__m256i ) m14))); ++- m14 = _mm256_unpacklo_epi8(m14, m14); ++- m14 = _mm256_shufflelo_epi16(m14, 0); ++- m13 = _mm256_unpacklo_epi64(m14, m14); ++- m13 = _mm256_permute2x128_si256(m13, m13, 0); ++- ((__m256i *) X)[0] = _mm256_subs_epu8(((__m256i *) X)[0], m13); ++- ((__m256i *) X)[1] = _mm256_subs_epu8(((__m256i *) X)[1], m13); ++- } ++- } ++- ++- renormalize(X, 210); ++ ++- unsigned int j; ++- for(j=0; j < (framebits + excess) % 2; ++j) { ++- int i; ++- for(i=0;i<64/2;i++){ ++- BFLY(i, (((framebits+excess) >> 1) << 1) + j , syms, Y, X, (decision_t *)dec, Branchtab); +++ renormalize(X, 210); +++ +++ unsigned int j; +++ for (j = 0; j < (framebits + excess) % 2; ++j) { +++ int i; +++ for (i = 0; i < 64 / 2; i++) { +++ BFLY(i, +++ (((framebits + excess) >> 1) << 1) + j, +++ syms, +++ Y, +++ X, +++ (decision_t*)dec, +++ Branchtab); +++ } +++ +++ renormalize(Y, 210); ++ } ++- ++- renormalize(Y, 210); ++- ++- } ++- /*skip*/ +++ /*skip*/ ++ } ++ ++ #endif /*LV_HAVE_AVX2*/ ++@@ -316,295 +332,300 @@ volk_8u_x4_conv_k7_r2_8u_avx2(unsigned char* Y, unsigned char* X, ++ ++ #if LV_HAVE_SSE3 ++ ++-#include ++ #include ++-#include ++ #include +++#include ++ #include +++#include ++ ++-static inline void ++-volk_8u_x4_conv_k7_r2_8u_spiral(unsigned char* Y, unsigned char* X, ++- unsigned char* syms, unsigned char* dec, ++- unsigned int framebits, unsigned int excess, ++- unsigned char* Branchtab) +++static inline void volk_8u_x4_conv_k7_r2_8u_spiral(unsigned char* Y, +++ unsigned char* X, +++ unsigned char* syms, +++ unsigned char* dec, +++ unsigned int framebits, +++ unsigned int excess, +++ unsigned char* Branchtab) ++ { ++- unsigned int i9; ++- for(i9 = 0; i9 < ((framebits + excess) >> 1); i9++) { ++- unsigned char a75, a81; ++- 
int a73, a92; ++- short int s20, s21, s26, s27; ++- unsigned char *a74, *a80, *b6; ++- short int *a110, *a111, *a91, *a93, *a94; ++- __m128i *a102, *a112, *a113, *a71, *a72, *a77, *a83 ++- , *a95, *a96, *a97, *a98, *a99; ++- __m128i a105, a106, a86, a87; ++- __m128i a100, a101, a103, a104, a107, a108, a109 ++- , a76, a78, a79, a82, a84, a85, a88, a89 ++- , a90, d10, d11, d12, d9, m23, m24, m25 ++- , m26, m27, m28, m29, m30, s18, s19, s22 ++- , s23, s24, s25, s28, s29, t13, t14, t15 ++- , t16, t17, t18; ++- a71 = ((__m128i *) X); ++- s18 = *(a71); ++- a72 = (a71 + 2); ++- s19 = *(a72); ++- a73 = (4 * i9); ++- a74 = (syms + a73); ++- a75 = *(a74); ++- a76 = _mm_set1_epi8(a75); ++- a77 = ((__m128i *) Branchtab); ++- a78 = *(a77); ++- a79 = _mm_xor_si128(a76, a78); ++- b6 = (a73 + syms); ++- a80 = (b6 + 1); ++- a81 = *(a80); ++- a82 = _mm_set1_epi8(a81); ++- a83 = (a77 + 2); ++- a84 = *(a83); ++- a85 = _mm_xor_si128(a82, a84); ++- t13 = _mm_avg_epu8(a79,a85); ++- a86 = ((__m128i ) t13); ++- a87 = _mm_srli_epi16(a86, 2); ++- a88 = ((__m128i ) a87); ++- t14 = _mm_and_si128(a88, _mm_set_epi8(63, 63, 63, 63, 63, 63, 63 ++- , 63, 63, 63, 63, 63, 63, 63, 63 ++- , 63)); ++- t15 = _mm_subs_epu8(_mm_set_epi8(63, 63, 63, 63, 63, 63, 63 ++- , 63, 63, 63, 63, 63, 63, 63, 63 ++- , 63), t14); ++- m23 = _mm_adds_epu8(s18, t14); ++- m24 = _mm_adds_epu8(s19, t15); ++- m25 = _mm_adds_epu8(s18, t15); ++- m26 = _mm_adds_epu8(s19, t14); ++- a89 = _mm_min_epu8(m24, m23); ++- d9 = _mm_cmpeq_epi8(a89, m24); ++- a90 = _mm_min_epu8(m26, m25); ++- d10 = _mm_cmpeq_epi8(a90, m26); ++- s20 = _mm_movemask_epi8(_mm_unpacklo_epi8(d9,d10)); ++- a91 = ((short int *) dec); ++- a92 = (8 * i9); ++- a93 = (a91 + a92); ++- *(a93) = s20; ++- s21 = _mm_movemask_epi8(_mm_unpackhi_epi8(d9,d10)); ++- a94 = (a93 + 1); ++- *(a94) = s21; ++- s22 = _mm_unpacklo_epi8(a89, a90); ++- s23 = _mm_unpackhi_epi8(a89, a90); ++- a95 = ((__m128i *) Y); ++- *(a95) = s22; ++- a96 = (a95 + 1); ++- *(a96) = s23; ++- a97 = (a71 + 1); ++- s24 = *(a97); ++- a98 = (a71 + 3); ++- s25 = *(a98); ++- a99 = (a77 + 1); ++- a100 = *(a99); ++- a101 = _mm_xor_si128(a76, a100); ++- a102 = (a77 + 3); ++- a103 = *(a102); ++- a104 = _mm_xor_si128(a82, a103); ++- t16 = _mm_avg_epu8(a101,a104); ++- a105 = ((__m128i ) t16); ++- a106 = _mm_srli_epi16(a105, 2); ++- a107 = ((__m128i ) a106); ++- t17 = _mm_and_si128(a107, _mm_set_epi8(63, 63, 63, 63, 63, 63, 63 ++- , 63, 63, 63, 63, 63, 63, 63, 63 ++- , 63)); ++- t18 = _mm_subs_epu8(_mm_set_epi8(63, 63, 63, 63, 63, 63, 63 ++- , 63, 63, 63, 63, 63, 63, 63, 63 ++- , 63), t17); ++- m27 = _mm_adds_epu8(s24, t17); ++- m28 = _mm_adds_epu8(s25, t18); ++- m29 = _mm_adds_epu8(s24, t18); ++- m30 = _mm_adds_epu8(s25, t17); ++- a108 = _mm_min_epu8(m28, m27); ++- d11 = _mm_cmpeq_epi8(a108, m28); ++- a109 = _mm_min_epu8(m30, m29); ++- d12 = _mm_cmpeq_epi8(a109, m30); ++- s26 = _mm_movemask_epi8(_mm_unpacklo_epi8(d11,d12)); ++- a110 = (a93 + 2); ++- *(a110) = s26; ++- s27 = _mm_movemask_epi8(_mm_unpackhi_epi8(d11,d12)); ++- a111 = (a93 + 3); ++- *(a111) = s27; ++- s28 = _mm_unpacklo_epi8(a108, a109); ++- s29 = _mm_unpackhi_epi8(a108, a109); ++- a112 = (a95 + 2); ++- *(a112) = s28; ++- a113 = (a95 + 3); ++- *(a113) = s29; ++- if ((((unsigned char *) Y)[0]>210)) { ++- __m128i m5, m6; ++- m5 = ((__m128i *) Y)[0]; ++- m5 = _mm_min_epu8(m5, ((__m128i *) Y)[1]); ++- m5 = _mm_min_epu8(m5, ((__m128i *) Y)[2]); ++- m5 = _mm_min_epu8(m5, ((__m128i *) Y)[3]); ++- __m128i m7; ++- m7 = _mm_min_epu8(_mm_srli_si128(m5, 8), m5); ++- m7 = ((__m128i ) 
_mm_min_epu8(((__m128i ) _mm_srli_epi64(m7, 32)), ((__m128i ) m7))); ++- m7 = ((__m128i ) _mm_min_epu8(((__m128i ) _mm_srli_epi64(m7, 16)), ((__m128i ) m7))); ++- m7 = ((__m128i ) _mm_min_epu8(((__m128i ) _mm_srli_epi64(m7, 8)), ((__m128i ) m7))); ++- m7 = _mm_unpacklo_epi8(m7, m7); ++- m7 = _mm_shufflelo_epi16(m7, _MM_SHUFFLE(0, 0, 0, 0)); ++- m6 = _mm_unpacklo_epi64(m7, m7); ++- ((__m128i *) Y)[0] = _mm_subs_epu8(((__m128i *) Y)[0], m6); ++- ((__m128i *) Y)[1] = _mm_subs_epu8(((__m128i *) Y)[1], m6); ++- ((__m128i *) Y)[2] = _mm_subs_epu8(((__m128i *) Y)[2], m6); ++- ((__m128i *) Y)[3] = _mm_subs_epu8(((__m128i *) Y)[3], m6); ++- } ++- unsigned char a188, a194; ++- int a186, a205; ++- short int s48, s49, s54, s55; ++- unsigned char *a187, *a193, *b15; ++- short int *a204, *a206, *a207, *a223, *a224, *b16; ++- __m128i *a184, *a185, *a190, *a196, *a208, *a209, *a210 ++- , *a211, *a212, *a215, *a225, *a226; ++- __m128i a199, a200, a218, a219; ++- __m128i a189, a191, a192, a195, a197, a198, a201 ++- , a202, a203, a213, a214, a216, a217, a220, a221 ++- , a222, d17, d18, d19, d20, m39, m40, m41 ++- , m42, m43, m44, m45, m46, s46, s47, s50 ++- , s51, s52, s53, s56, s57, t25, t26, t27 ++- , t28, t29, t30; ++- a184 = ((__m128i *) Y); ++- s46 = *(a184); ++- a185 = (a184 + 2); ++- s47 = *(a185); ++- a186 = (4 * i9); ++- b15 = (a186 + syms); ++- a187 = (b15 + 2); ++- a188 = *(a187); ++- a189 = _mm_set1_epi8(a188); ++- a190 = ((__m128i *) Branchtab); ++- a191 = *(a190); ++- a192 = _mm_xor_si128(a189, a191); ++- a193 = (b15 + 3); ++- a194 = *(a193); ++- a195 = _mm_set1_epi8(a194); ++- a196 = (a190 + 2); ++- a197 = *(a196); ++- a198 = _mm_xor_si128(a195, a197); ++- t25 = _mm_avg_epu8(a192,a198); ++- a199 = ((__m128i ) t25); ++- a200 = _mm_srli_epi16(a199, 2); ++- a201 = ((__m128i ) a200); ++- t26 = _mm_and_si128(a201, _mm_set_epi8(63, 63, 63, 63, 63, 63, 63 ++- , 63, 63, 63, 63, 63, 63, 63, 63 ++- , 63)); ++- t27 = _mm_subs_epu8(_mm_set_epi8(63, 63, 63, 63, 63, 63, 63 ++- , 63, 63, 63, 63, 63, 63, 63, 63 ++- , 63), t26); ++- m39 = _mm_adds_epu8(s46, t26); ++- m40 = _mm_adds_epu8(s47, t27); ++- m41 = _mm_adds_epu8(s46, t27); ++- m42 = _mm_adds_epu8(s47, t26); ++- a202 = _mm_min_epu8(m40, m39); ++- d17 = _mm_cmpeq_epi8(a202, m40); ++- a203 = _mm_min_epu8(m42, m41); ++- d18 = _mm_cmpeq_epi8(a203, m42); ++- s48 = _mm_movemask_epi8(_mm_unpacklo_epi8(d17,d18)); ++- a204 = ((short int *) dec); ++- a205 = (8 * i9); ++- b16 = (a204 + a205); ++- a206 = (b16 + 4); ++- *(a206) = s48; ++- s49 = _mm_movemask_epi8(_mm_unpackhi_epi8(d17,d18)); ++- a207 = (b16 + 5); ++- *(a207) = s49; ++- s50 = _mm_unpacklo_epi8(a202, a203); ++- s51 = _mm_unpackhi_epi8(a202, a203); ++- a208 = ((__m128i *) X); ++- *(a208) = s50; ++- a209 = (a208 + 1); ++- *(a209) = s51; ++- a210 = (a184 + 1); ++- s52 = *(a210); ++- a211 = (a184 + 3); ++- s53 = *(a211); ++- a212 = (a190 + 1); ++- a213 = *(a212); ++- a214 = _mm_xor_si128(a189, a213); ++- a215 = (a190 + 3); ++- a216 = *(a215); ++- a217 = _mm_xor_si128(a195, a216); ++- t28 = _mm_avg_epu8(a214,a217); ++- a218 = ((__m128i ) t28); ++- a219 = _mm_srli_epi16(a218, 2); ++- a220 = ((__m128i ) a219); ++- t29 = _mm_and_si128(a220, _mm_set_epi8(63, 63, 63, 63, 63, 63, 63 ++- , 63, 63, 63, 63, 63, 63, 63, 63 ++- , 63)); ++- t30 = _mm_subs_epu8(_mm_set_epi8(63, 63, 63, 63, 63, 63, 63 ++- , 63, 63, 63, 63, 63, 63, 63, 63 ++- , 63), t29); ++- m43 = _mm_adds_epu8(s52, t29); ++- m44 = _mm_adds_epu8(s53, t30); ++- m45 = _mm_adds_epu8(s52, t30); ++- m46 = _mm_adds_epu8(s53, t29); ++- a221 = _mm_min_epu8(m44, 
m43); ++- d19 = _mm_cmpeq_epi8(a221, m44); ++- a222 = _mm_min_epu8(m46, m45); ++- d20 = _mm_cmpeq_epi8(a222, m46); ++- s54 = _mm_movemask_epi8(_mm_unpacklo_epi8(d19,d20)); ++- a223 = (b16 + 6); ++- *(a223) = s54; ++- s55 = _mm_movemask_epi8(_mm_unpackhi_epi8(d19,d20)); ++- a224 = (b16 + 7); ++- *(a224) = s55; ++- s56 = _mm_unpacklo_epi8(a221, a222); ++- s57 = _mm_unpackhi_epi8(a221, a222); ++- a225 = (a208 + 2); ++- *(a225) = s56; ++- a226 = (a208 + 3); ++- *(a226) = s57; ++- if ((((unsigned char *) X)[0]>210)) { ++- __m128i m12, m13; ++- m12 = ((__m128i *) X)[0]; ++- m12 = _mm_min_epu8(m12, ((__m128i *) X)[1]); ++- m12 = _mm_min_epu8(m12, ((__m128i *) X)[2]); ++- m12 = _mm_min_epu8(m12, ((__m128i *) X)[3]); ++- __m128i m14; ++- m14 = _mm_min_epu8(_mm_srli_si128(m12, 8), m12); ++- m14 = ((__m128i ) _mm_min_epu8(((__m128i ) _mm_srli_epi64(m14, 32)), ((__m128i ) m14))); ++- m14 = ((__m128i ) _mm_min_epu8(((__m128i ) _mm_srli_epi64(m14, 16)), ((__m128i ) m14))); ++- m14 = ((__m128i ) _mm_min_epu8(((__m128i ) _mm_srli_epi64(m14, 8)), ((__m128i ) m14))); ++- m14 = _mm_unpacklo_epi8(m14, m14); ++- m14 = _mm_shufflelo_epi16(m14, _MM_SHUFFLE(0, 0, 0, 0)); ++- m13 = _mm_unpacklo_epi64(m14, m14); ++- ((__m128i *) X)[0] = _mm_subs_epu8(((__m128i *) X)[0], m13); ++- ((__m128i *) X)[1] = _mm_subs_epu8(((__m128i *) X)[1], m13); ++- ((__m128i *) X)[2] = _mm_subs_epu8(((__m128i *) X)[2], m13); ++- ((__m128i *) X)[3] = _mm_subs_epu8(((__m128i *) X)[3], m13); +++ unsigned int i9; +++ for (i9 = 0; i9 < ((framebits + excess) >> 1); i9++) { +++ unsigned char a75, a81; +++ int a73, a92; +++ short int s20, s21, s26, s27; +++ unsigned char *a74, *a80, *b6; +++ short int *a110, *a111, *a91, *a93, *a94; +++ __m128i *a102, *a112, *a113, *a71, *a72, *a77, *a83, *a95, *a96, *a97, *a98, *a99; +++ __m128i a105, a106, a86, a87; +++ __m128i a100, a101, a103, a104, a107, a108, a109, a76, a78, a79, a82, a84, a85, +++ a88, a89, a90, d10, d11, d12, d9, m23, m24, m25, m26, m27, m28, m29, m30, s18, +++ s19, s22, s23, s24, s25, s28, s29, t13, t14, t15, t16, t17, t18; +++ a71 = ((__m128i*)X); +++ s18 = *(a71); +++ a72 = (a71 + 2); +++ s19 = *(a72); +++ a73 = (4 * i9); +++ a74 = (syms + a73); +++ a75 = *(a74); +++ a76 = _mm_set1_epi8(a75); +++ a77 = ((__m128i*)Branchtab); +++ a78 = *(a77); +++ a79 = _mm_xor_si128(a76, a78); +++ b6 = (a73 + syms); +++ a80 = (b6 + 1); +++ a81 = *(a80); +++ a82 = _mm_set1_epi8(a81); +++ a83 = (a77 + 2); +++ a84 = *(a83); +++ a85 = _mm_xor_si128(a82, a84); +++ t13 = _mm_avg_epu8(a79, a85); +++ a86 = ((__m128i)t13); +++ a87 = _mm_srli_epi16(a86, 2); +++ a88 = ((__m128i)a87); +++ t14 = _mm_and_si128( +++ a88, +++ _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63)); +++ t15 = _mm_subs_epu8( +++ _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63), +++ t14); +++ m23 = _mm_adds_epu8(s18, t14); +++ m24 = _mm_adds_epu8(s19, t15); +++ m25 = _mm_adds_epu8(s18, t15); +++ m26 = _mm_adds_epu8(s19, t14); +++ a89 = _mm_min_epu8(m24, m23); +++ d9 = _mm_cmpeq_epi8(a89, m24); +++ a90 = _mm_min_epu8(m26, m25); +++ d10 = _mm_cmpeq_epi8(a90, m26); +++ s20 = _mm_movemask_epi8(_mm_unpacklo_epi8(d9, d10)); +++ a91 = ((short int*)dec); +++ a92 = (8 * i9); +++ a93 = (a91 + a92); +++ *(a93) = s20; +++ s21 = _mm_movemask_epi8(_mm_unpackhi_epi8(d9, d10)); +++ a94 = (a93 + 1); +++ *(a94) = s21; +++ s22 = _mm_unpacklo_epi8(a89, a90); +++ s23 = _mm_unpackhi_epi8(a89, a90); +++ a95 = ((__m128i*)Y); +++ *(a95) = s22; +++ a96 = (a95 + 1); +++ *(a96) = s23; +++ a97 = (a71 + 1); +++ s24 
= *(a97); +++ a98 = (a71 + 3); +++ s25 = *(a98); +++ a99 = (a77 + 1); +++ a100 = *(a99); +++ a101 = _mm_xor_si128(a76, a100); +++ a102 = (a77 + 3); +++ a103 = *(a102); +++ a104 = _mm_xor_si128(a82, a103); +++ t16 = _mm_avg_epu8(a101, a104); +++ a105 = ((__m128i)t16); +++ a106 = _mm_srli_epi16(a105, 2); +++ a107 = ((__m128i)a106); +++ t17 = _mm_and_si128( +++ a107, +++ _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63)); +++ t18 = _mm_subs_epu8( +++ _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63), +++ t17); +++ m27 = _mm_adds_epu8(s24, t17); +++ m28 = _mm_adds_epu8(s25, t18); +++ m29 = _mm_adds_epu8(s24, t18); +++ m30 = _mm_adds_epu8(s25, t17); +++ a108 = _mm_min_epu8(m28, m27); +++ d11 = _mm_cmpeq_epi8(a108, m28); +++ a109 = _mm_min_epu8(m30, m29); +++ d12 = _mm_cmpeq_epi8(a109, m30); +++ s26 = _mm_movemask_epi8(_mm_unpacklo_epi8(d11, d12)); +++ a110 = (a93 + 2); +++ *(a110) = s26; +++ s27 = _mm_movemask_epi8(_mm_unpackhi_epi8(d11, d12)); +++ a111 = (a93 + 3); +++ *(a111) = s27; +++ s28 = _mm_unpacklo_epi8(a108, a109); +++ s29 = _mm_unpackhi_epi8(a108, a109); +++ a112 = (a95 + 2); +++ *(a112) = s28; +++ a113 = (a95 + 3); +++ *(a113) = s29; +++ if ((((unsigned char*)Y)[0] > 210)) { +++ __m128i m5, m6; +++ m5 = ((__m128i*)Y)[0]; +++ m5 = _mm_min_epu8(m5, ((__m128i*)Y)[1]); +++ m5 = _mm_min_epu8(m5, ((__m128i*)Y)[2]); +++ m5 = _mm_min_epu8(m5, ((__m128i*)Y)[3]); +++ __m128i m7; +++ m7 = _mm_min_epu8(_mm_srli_si128(m5, 8), m5); +++ m7 = +++ ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m7, 32)), ((__m128i)m7))); +++ m7 = +++ ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m7, 16)), ((__m128i)m7))); +++ m7 = ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m7, 8)), ((__m128i)m7))); +++ m7 = _mm_unpacklo_epi8(m7, m7); +++ m7 = _mm_shufflelo_epi16(m7, _MM_SHUFFLE(0, 0, 0, 0)); +++ m6 = _mm_unpacklo_epi64(m7, m7); +++ ((__m128i*)Y)[0] = _mm_subs_epu8(((__m128i*)Y)[0], m6); +++ ((__m128i*)Y)[1] = _mm_subs_epu8(((__m128i*)Y)[1], m6); +++ ((__m128i*)Y)[2] = _mm_subs_epu8(((__m128i*)Y)[2], m6); +++ ((__m128i*)Y)[3] = _mm_subs_epu8(((__m128i*)Y)[3], m6); +++ } +++ unsigned char a188, a194; +++ int a186, a205; +++ short int s48, s49, s54, s55; +++ unsigned char *a187, *a193, *b15; +++ short int *a204, *a206, *a207, *a223, *a224, *b16; +++ __m128i *a184, *a185, *a190, *a196, *a208, *a209, *a210, *a211, *a212, *a215, +++ *a225, *a226; +++ __m128i a199, a200, a218, a219; +++ __m128i a189, a191, a192, a195, a197, a198, a201, a202, a203, a213, a214, a216, +++ a217, a220, a221, a222, d17, d18, d19, d20, m39, m40, m41, m42, m43, m44, m45, +++ m46, s46, s47, s50, s51, s52, s53, s56, s57, t25, t26, t27, t28, t29, t30; +++ a184 = ((__m128i*)Y); +++ s46 = *(a184); +++ a185 = (a184 + 2); +++ s47 = *(a185); +++ a186 = (4 * i9); +++ b15 = (a186 + syms); +++ a187 = (b15 + 2); +++ a188 = *(a187); +++ a189 = _mm_set1_epi8(a188); +++ a190 = ((__m128i*)Branchtab); +++ a191 = *(a190); +++ a192 = _mm_xor_si128(a189, a191); +++ a193 = (b15 + 3); +++ a194 = *(a193); +++ a195 = _mm_set1_epi8(a194); +++ a196 = (a190 + 2); +++ a197 = *(a196); +++ a198 = _mm_xor_si128(a195, a197); +++ t25 = _mm_avg_epu8(a192, a198); +++ a199 = ((__m128i)t25); +++ a200 = _mm_srli_epi16(a199, 2); +++ a201 = ((__m128i)a200); +++ t26 = _mm_and_si128( +++ a201, +++ _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63)); +++ t27 = _mm_subs_epu8( +++ _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63), +++ t26); +++ m39 = _mm_adds_epu8(s46, t26); +++ 
m40 = _mm_adds_epu8(s47, t27); +++ m41 = _mm_adds_epu8(s46, t27); +++ m42 = _mm_adds_epu8(s47, t26); +++ a202 = _mm_min_epu8(m40, m39); +++ d17 = _mm_cmpeq_epi8(a202, m40); +++ a203 = _mm_min_epu8(m42, m41); +++ d18 = _mm_cmpeq_epi8(a203, m42); +++ s48 = _mm_movemask_epi8(_mm_unpacklo_epi8(d17, d18)); +++ a204 = ((short int*)dec); +++ a205 = (8 * i9); +++ b16 = (a204 + a205); +++ a206 = (b16 + 4); +++ *(a206) = s48; +++ s49 = _mm_movemask_epi8(_mm_unpackhi_epi8(d17, d18)); +++ a207 = (b16 + 5); +++ *(a207) = s49; +++ s50 = _mm_unpacklo_epi8(a202, a203); +++ s51 = _mm_unpackhi_epi8(a202, a203); +++ a208 = ((__m128i*)X); +++ *(a208) = s50; +++ a209 = (a208 + 1); +++ *(a209) = s51; +++ a210 = (a184 + 1); +++ s52 = *(a210); +++ a211 = (a184 + 3); +++ s53 = *(a211); +++ a212 = (a190 + 1); +++ a213 = *(a212); +++ a214 = _mm_xor_si128(a189, a213); +++ a215 = (a190 + 3); +++ a216 = *(a215); +++ a217 = _mm_xor_si128(a195, a216); +++ t28 = _mm_avg_epu8(a214, a217); +++ a218 = ((__m128i)t28); +++ a219 = _mm_srli_epi16(a218, 2); +++ a220 = ((__m128i)a219); +++ t29 = _mm_and_si128( +++ a220, +++ _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63)); +++ t30 = _mm_subs_epu8( +++ _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63), +++ t29); +++ m43 = _mm_adds_epu8(s52, t29); +++ m44 = _mm_adds_epu8(s53, t30); +++ m45 = _mm_adds_epu8(s52, t30); +++ m46 = _mm_adds_epu8(s53, t29); +++ a221 = _mm_min_epu8(m44, m43); +++ d19 = _mm_cmpeq_epi8(a221, m44); +++ a222 = _mm_min_epu8(m46, m45); +++ d20 = _mm_cmpeq_epi8(a222, m46); +++ s54 = _mm_movemask_epi8(_mm_unpacklo_epi8(d19, d20)); +++ a223 = (b16 + 6); +++ *(a223) = s54; +++ s55 = _mm_movemask_epi8(_mm_unpackhi_epi8(d19, d20)); +++ a224 = (b16 + 7); +++ *(a224) = s55; +++ s56 = _mm_unpacklo_epi8(a221, a222); +++ s57 = _mm_unpackhi_epi8(a221, a222); +++ a225 = (a208 + 2); +++ *(a225) = s56; +++ a226 = (a208 + 3); +++ *(a226) = s57; +++ if ((((unsigned char*)X)[0] > 210)) { +++ __m128i m12, m13; +++ m12 = ((__m128i*)X)[0]; +++ m12 = _mm_min_epu8(m12, ((__m128i*)X)[1]); +++ m12 = _mm_min_epu8(m12, ((__m128i*)X)[2]); +++ m12 = _mm_min_epu8(m12, ((__m128i*)X)[3]); +++ __m128i m14; +++ m14 = _mm_min_epu8(_mm_srli_si128(m12, 8), m12); +++ m14 = ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m14, 32)), +++ ((__m128i)m14))); +++ m14 = ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m14, 16)), +++ ((__m128i)m14))); +++ m14 = ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m14, 8)), +++ ((__m128i)m14))); +++ m14 = _mm_unpacklo_epi8(m14, m14); +++ m14 = _mm_shufflelo_epi16(m14, _MM_SHUFFLE(0, 0, 0, 0)); +++ m13 = _mm_unpacklo_epi64(m14, m14); +++ ((__m128i*)X)[0] = _mm_subs_epu8(((__m128i*)X)[0], m13); +++ ((__m128i*)X)[1] = _mm_subs_epu8(((__m128i*)X)[1], m13); +++ ((__m128i*)X)[2] = _mm_subs_epu8(((__m128i*)X)[2], m13); +++ ((__m128i*)X)[3] = _mm_subs_epu8(((__m128i*)X)[3], m13); +++ } ++ } ++- } ++- ++- renormalize(X, 210); ++ ++- /*int ch; ++- for(ch = 0; ch < 64; ch++) { ++- printf("%d,", X[ch]); ++- } ++- printf("\n");*/ ++- ++- unsigned int j; ++- for(j=0; j < (framebits + excess) % 2; ++j) { ++- int i; ++- for(i=0;i<64/2;i++){ ++- BFLY(i, (((framebits+excess) >> 1) << 1) + j , syms, Y, X, (decision_t *)dec, Branchtab); ++- } +++ renormalize(X, 210); ++ ++- ++- renormalize(Y, 210); ++- ++- /*printf("\n"); +++ /*int ch; ++ for(ch = 0; ch < 64; ch++) { ++- printf("%d,", Y[ch]); +++ printf("%d,", X[ch]); ++ } ++ printf("\n");*/ ++ ++- } ++- /*skip*/ +++ unsigned int j; +++ for (j = 0; j < (framebits + excess) % 2; 
++j) { +++ int i; +++ for (i = 0; i < 64 / 2; i++) { +++ BFLY(i, +++ (((framebits + excess) >> 1) << 1) + j, +++ syms, +++ Y, +++ X, +++ (decision_t*)dec, +++ Branchtab); +++ } +++ +++ +++ renormalize(Y, 210); +++ +++ /*printf("\n"); +++ for(ch = 0; ch < 64; ch++) { +++ printf("%d,", Y[ch]); +++ } +++ printf("\n");*/ +++ } +++ /*skip*/ ++ } ++ ++ #endif /*LV_HAVE_SSE3*/ ++@@ -612,30 +633,32 @@ volk_8u_x4_conv_k7_r2_8u_spiral(unsigned char* Y, unsigned char* X, ++ ++ #if LV_HAVE_GENERIC ++ ++-static inline void ++-volk_8u_x4_conv_k7_r2_8u_generic(unsigned char* Y, unsigned char* X, ++- unsigned char* syms, unsigned char* dec, ++- unsigned int framebits, unsigned int excess, ++- unsigned char* Branchtab) +++static inline void volk_8u_x4_conv_k7_r2_8u_generic(unsigned char* Y, +++ unsigned char* X, +++ unsigned char* syms, +++ unsigned char* dec, +++ unsigned int framebits, +++ unsigned int excess, +++ unsigned char* Branchtab) ++ { ++- int nbits = framebits + excess; ++- int NUMSTATES = 64; ++- int RENORMALIZE_THRESHOLD = 210; ++- ++- int s,i; ++- for (s=0;s init_test_list(volk_test_params_t test_params) ++@@ -32,127 +37,135 @@ std::vector init_test_list(volk_test_params_t test_params) ++ test_params_rotator.set_tol(1e-3); ++ ++ std::vector test_cases; ++- QA(VOLK_INIT_PUPP(volk_64u_popcntpuppet_64u, volk_64u_popcnt, test_params)) ++- QA(VOLK_INIT_PUPP(volk_64u_popcntpuppet_64u, volk_64u_popcnt, test_params)) ++- QA(VOLK_INIT_PUPP(volk_64u_popcntpuppet_64u, volk_64u_popcnt, test_params)) +++ QA(VOLK_INIT_PUPP(volk_64u_popcntpuppet_64u, volk_64u_popcnt, test_params)) +++ QA(VOLK_INIT_PUPP(volk_64u_popcntpuppet_64u, volk_64u_popcnt, test_params)) +++ QA(VOLK_INIT_PUPP(volk_64u_popcntpuppet_64u, volk_64u_popcnt, test_params)) ++ QA(VOLK_INIT_PUPP(volk_16u_byteswappuppet_16u, volk_16u_byteswap, test_params)) ++ QA(VOLK_INIT_PUPP(volk_32u_byteswappuppet_32u, volk_32u_byteswap, test_params)) ++- QA(VOLK_INIT_PUPP(volk_32u_popcntpuppet_32u, volk_32u_popcnt_32u, test_params)) +++ QA(VOLK_INIT_PUPP(volk_32u_popcntpuppet_32u, volk_32u_popcnt_32u, test_params)) ++ QA(VOLK_INIT_PUPP(volk_64u_byteswappuppet_64u, volk_64u_byteswap, test_params)) ++- QA(VOLK_INIT_PUPP(volk_32fc_s32fc_rotatorpuppet_32fc, volk_32fc_s32fc_x2_rotator_32fc, test_params_rotator)) ++- QA(VOLK_INIT_PUPP(volk_8u_conv_k7_r2puppet_8u, volk_8u_x4_conv_k7_r2_8u, test_params.make_tol(0))) ++- QA(VOLK_INIT_PUPP(volk_32f_x2_fm_detectpuppet_32f, volk_32f_s32f_32f_fm_detect_32f, test_params)) ++- QA(VOLK_INIT_TEST(volk_16ic_s32f_deinterleave_real_32f, test_params)) ++- QA(VOLK_INIT_TEST(volk_16ic_deinterleave_real_8i, test_params)) ++- QA(VOLK_INIT_TEST(volk_16ic_deinterleave_16i_x2, test_params)) ++- QA(VOLK_INIT_TEST(volk_16ic_s32f_deinterleave_32f_x2, test_params)) ++- QA(VOLK_INIT_TEST(volk_16ic_deinterleave_real_16i, test_params)) ++- QA(VOLK_INIT_TEST(volk_16ic_magnitude_16i, test_params)) ++- QA(VOLK_INIT_TEST(volk_16ic_s32f_magnitude_32f, test_params)) ++- QA(VOLK_INIT_TEST(volk_16ic_convert_32fc, test_params)) ++- QA(VOLK_INIT_TEST(volk_16ic_x2_multiply_16ic, test_params)) ++- QA(VOLK_INIT_TEST(volk_16ic_x2_dot_prod_16ic, test_params)) ++- QA(VOLK_INIT_TEST(volk_16i_s32f_convert_32f, test_params)) ++- QA(VOLK_INIT_TEST(volk_16i_convert_8i, test_params)) ++- QA(VOLK_INIT_TEST(volk_16i_32fc_dot_prod_32fc, test_params_inacc)) ++- QA(VOLK_INIT_TEST(volk_32f_accumulator_s32f, test_params_inacc)) ++- QA(VOLK_INIT_TEST(volk_32f_x2_add_32f, test_params)) ++- QA(VOLK_INIT_TEST(volk_32f_index_max_16u, test_params)) ++- 
QA(VOLK_INIT_TEST(volk_32f_index_max_32u, test_params)) ++- QA(VOLK_INIT_TEST(volk_32fc_32f_multiply_32fc, test_params)) ++- QA(VOLK_INIT_TEST(volk_32fc_32f_add_32fc, test_params)) ++- QA(VOLK_INIT_TEST(volk_32f_log2_32f, test_params.make_absolute(1e-5))) ++- QA(VOLK_INIT_TEST(volk_32f_expfast_32f, test_params_inacc_tenth)) ++- QA(VOLK_INIT_TEST(volk_32f_x2_pow_32f, test_params_inacc)) ++- QA(VOLK_INIT_TEST(volk_32f_sin_32f, test_params_inacc)) ++- QA(VOLK_INIT_TEST(volk_32f_cos_32f, test_params_inacc)) ++- QA(VOLK_INIT_TEST(volk_32f_tan_32f, test_params_inacc)) ++- QA(VOLK_INIT_TEST(volk_32f_atan_32f, test_params_inacc)) ++- QA(VOLK_INIT_TEST(volk_32f_asin_32f, test_params_inacc)) ++- QA(VOLK_INIT_TEST(volk_32f_acos_32f, test_params_inacc)) ++- QA(VOLK_INIT_TEST(volk_32fc_s32f_power_32fc, test_params_power)) ++- QA(VOLK_INIT_TEST(volk_32f_s32f_calc_spectral_noise_floor_32f, test_params_inacc)) ++- QA(VOLK_INIT_TEST(volk_32fc_s32f_atan2_32f, test_params)) ++- QA(VOLK_INIT_TEST(volk_32fc_x2_conjugate_dot_prod_32fc, test_params_inacc_tenth)) ++- QA(VOLK_INIT_TEST(volk_32fc_deinterleave_32f_x2, test_params)) ++- QA(VOLK_INIT_TEST(volk_32fc_deinterleave_64f_x2, test_params)) ++- QA(VOLK_INIT_TEST(volk_32fc_s32f_deinterleave_real_16i, test_params)) ++- QA(VOLK_INIT_TEST(volk_32fc_deinterleave_imag_32f, test_params)) ++- QA(VOLK_INIT_TEST(volk_32fc_deinterleave_real_32f, test_params)) ++- QA(VOLK_INIT_TEST(volk_32fc_deinterleave_real_64f, test_params)) ++- QA(VOLK_INIT_TEST(volk_32fc_x2_dot_prod_32fc, test_params_inacc)) ++- QA(VOLK_INIT_TEST(volk_32fc_32f_dot_prod_32fc, test_params_inacc)) ++- QA(VOLK_INIT_TEST(volk_32fc_index_max_16u, test_params)) ++- QA(VOLK_INIT_TEST(volk_32fc_index_max_32u, test_params)) ++- QA(VOLK_INIT_TEST(volk_32fc_s32f_magnitude_16i, test_params)) ++- QA(VOLK_INIT_TEST(volk_32fc_magnitude_32f, test_params_inacc_tenth)) ++- QA(VOLK_INIT_TEST(volk_32fc_magnitude_squared_32f, test_params)) ++- QA(VOLK_INIT_TEST(volk_32fc_x2_add_32fc, test_params)) ++- QA(VOLK_INIT_TEST(volk_32fc_x2_multiply_32fc, test_params)) ++- QA(VOLK_INIT_TEST(volk_32fc_x2_multiply_conjugate_32fc, test_params)) ++- QA(VOLK_INIT_TEST(volk_32fc_x2_divide_32fc, test_params)) ++- QA(VOLK_INIT_TEST(volk_32fc_conjugate_32fc, test_params)) ++- QA(VOLK_INIT_TEST(volk_32f_s32f_convert_16i, test_params)) ++- QA(VOLK_INIT_TEST(volk_32f_s32f_convert_32i, test_params)) ++- QA(VOLK_INIT_TEST(volk_32f_convert_64f, test_params)) ++- QA(VOLK_INIT_TEST(volk_32f_s32f_convert_8i, test_params)) ++- QA(VOLK_INIT_TEST(volk_32fc_convert_16ic, test_params)) ++- QA(VOLK_INIT_TEST(volk_32fc_s32f_power_spectrum_32f, test_params)) ++- QA(VOLK_INIT_TEST(volk_32fc_x2_square_dist_32f, test_params)) ++- QA(VOLK_INIT_TEST(volk_32fc_x2_s32f_square_dist_scalar_mult_32f, test_params)) ++- QA(VOLK_INIT_TEST(volk_32f_x2_divide_32f, test_params)) ++- QA(VOLK_INIT_TEST(volk_32f_x2_dot_prod_32f, test_params_inacc)) ++- QA(VOLK_INIT_TEST(volk_32f_x2_s32f_interleave_16ic, test_params)) ++- QA(VOLK_INIT_TEST(volk_32f_x2_interleave_32fc, test_params)) ++- QA(VOLK_INIT_TEST(volk_32f_x2_max_32f, test_params)) ++- QA(VOLK_INIT_TEST(volk_32f_x2_min_32f, test_params)) ++- QA(VOLK_INIT_TEST(volk_32f_x2_multiply_32f, test_params)) ++- QA(VOLK_INIT_TEST(volk_32f_64f_multiply_64f, test_params)) ++- QA(VOLK_INIT_TEST(volk_32f_64f_add_64f, test_params)) ++- QA(VOLK_INIT_TEST(volk_32f_s32f_normalize, test_params)) ++- QA(VOLK_INIT_TEST(volk_32f_s32f_power_32f, test_params)) ++- QA(VOLK_INIT_TEST(volk_32f_sqrt_32f, test_params_inacc)) ++- 
QA(VOLK_INIT_TEST(volk_32f_s32f_stddev_32f, test_params_inacc)) ++- QA(VOLK_INIT_TEST(volk_32f_stddev_and_mean_32f_x2, test_params_inacc)) ++- QA(VOLK_INIT_TEST(volk_32f_x2_subtract_32f, test_params)) ++- QA(VOLK_INIT_TEST(volk_32f_x3_sum_of_poly_32f, test_params_inacc)) ++- QA(VOLK_INIT_TEST(volk_32i_x2_and_32i, test_params)) ++- QA(VOLK_INIT_TEST(volk_32i_s32f_convert_32f, test_params)) ++- QA(VOLK_INIT_TEST(volk_32i_x2_or_32i, test_params)) ++- QA(VOLK_INIT_TEST(volk_32f_x2_dot_prod_16i, test_params)) ++- QA(VOLK_INIT_TEST(volk_64f_convert_32f, test_params)) ++- QA(VOLK_INIT_TEST(volk_64f_x2_max_64f, test_params)) ++- QA(VOLK_INIT_TEST(volk_64f_x2_min_64f, test_params)) ++- QA(VOLK_INIT_TEST(volk_64f_x2_multiply_64f, test_params)) ++- QA(VOLK_INIT_TEST(volk_64f_x2_add_64f, test_params)) ++- QA(VOLK_INIT_TEST(volk_8ic_deinterleave_16i_x2, test_params)) ++- QA(VOLK_INIT_TEST(volk_8ic_s32f_deinterleave_32f_x2, test_params)) ++- QA(VOLK_INIT_TEST(volk_8ic_deinterleave_real_16i, test_params)) ++- QA(VOLK_INIT_TEST(volk_8ic_s32f_deinterleave_real_32f, test_params)) ++- QA(VOLK_INIT_TEST(volk_8ic_deinterleave_real_8i, test_params)) ++- QA(VOLK_INIT_TEST(volk_8ic_x2_multiply_conjugate_16ic, test_params)) ++- QA(VOLK_INIT_TEST(volk_8ic_x2_s32f_multiply_conjugate_32fc, test_params)) ++- QA(VOLK_INIT_TEST(volk_8i_convert_16i, test_params)) ++- QA(VOLK_INIT_TEST(volk_8i_s32f_convert_32f, test_params)) ++- QA(VOLK_INIT_TEST(volk_32fc_s32fc_multiply_32fc, test_params)) ++- QA(VOLK_INIT_TEST(volk_32f_s32f_multiply_32f, test_params)) ++- QA(VOLK_INIT_TEST(volk_32f_binary_slicer_32i, test_params)) ++- QA(VOLK_INIT_TEST(volk_32f_binary_slicer_8i, test_params)) ++- QA(VOLK_INIT_TEST(volk_32u_reverse_32u, test_params)) ++- QA(VOLK_INIT_TEST(volk_32f_tanh_32f, test_params_inacc)) ++- QA(VOLK_INIT_TEST(volk_32f_s32f_mod_rangepuppet_32f, test_params)) +++ QA(VOLK_INIT_PUPP(volk_32fc_s32fc_rotatorpuppet_32fc, +++ volk_32fc_s32fc_x2_rotator_32fc, +++ test_params_rotator)) +++ QA(VOLK_INIT_PUPP( +++ volk_8u_conv_k7_r2puppet_8u, volk_8u_x4_conv_k7_r2_8u, test_params.make_tol(0))) +++ QA(VOLK_INIT_PUPP( +++ volk_32f_x2_fm_detectpuppet_32f, volk_32f_s32f_32f_fm_detect_32f, test_params)) +++ QA(VOLK_INIT_TEST(volk_16ic_s32f_deinterleave_real_32f, test_params)) +++ QA(VOLK_INIT_TEST(volk_16ic_deinterleave_real_8i, test_params)) +++ QA(VOLK_INIT_TEST(volk_16ic_deinterleave_16i_x2, test_params)) +++ QA(VOLK_INIT_TEST(volk_16ic_s32f_deinterleave_32f_x2, test_params)) +++ QA(VOLK_INIT_TEST(volk_16ic_deinterleave_real_16i, test_params)) +++ QA(VOLK_INIT_TEST(volk_16ic_magnitude_16i, test_params)) +++ QA(VOLK_INIT_TEST(volk_16ic_s32f_magnitude_32f, test_params)) +++ QA(VOLK_INIT_TEST(volk_16ic_convert_32fc, test_params)) +++ QA(VOLK_INIT_TEST(volk_16ic_x2_multiply_16ic, test_params)) +++ QA(VOLK_INIT_TEST(volk_16ic_x2_dot_prod_16ic, test_params)) +++ QA(VOLK_INIT_TEST(volk_16i_s32f_convert_32f, test_params)) +++ QA(VOLK_INIT_TEST(volk_16i_convert_8i, test_params)) +++ QA(VOLK_INIT_TEST(volk_16i_32fc_dot_prod_32fc, test_params_inacc)) +++ QA(VOLK_INIT_TEST(volk_32f_accumulator_s32f, test_params_inacc)) +++ QA(VOLK_INIT_TEST(volk_32f_x2_add_32f, test_params)) +++ QA(VOLK_INIT_TEST(volk_32f_index_max_16u, test_params)) +++ QA(VOLK_INIT_TEST(volk_32f_index_max_32u, test_params)) +++ QA(VOLK_INIT_TEST(volk_32fc_32f_multiply_32fc, test_params)) +++ QA(VOLK_INIT_TEST(volk_32fc_32f_add_32fc, test_params)) +++ QA(VOLK_INIT_TEST(volk_32f_log2_32f, test_params.make_absolute(1e-5))) +++ QA(VOLK_INIT_TEST(volk_32f_expfast_32f, 
test_params_inacc_tenth)) +++ QA(VOLK_INIT_TEST(volk_32f_x2_pow_32f, test_params_inacc)) +++ QA(VOLK_INIT_TEST(volk_32f_sin_32f, test_params_inacc)) +++ QA(VOLK_INIT_TEST(volk_32f_cos_32f, test_params_inacc)) +++ QA(VOLK_INIT_TEST(volk_32f_tan_32f, test_params_inacc)) +++ QA(VOLK_INIT_TEST(volk_32f_atan_32f, test_params_inacc)) +++ QA(VOLK_INIT_TEST(volk_32f_asin_32f, test_params_inacc)) +++ QA(VOLK_INIT_TEST(volk_32f_acos_32f, test_params_inacc)) +++ QA(VOLK_INIT_TEST(volk_32fc_s32f_power_32fc, test_params_power)) +++ QA(VOLK_INIT_TEST(volk_32f_s32f_calc_spectral_noise_floor_32f, test_params_inacc)) +++ QA(VOLK_INIT_TEST(volk_32fc_s32f_atan2_32f, test_params)) +++ QA(VOLK_INIT_TEST(volk_32fc_x2_conjugate_dot_prod_32fc, test_params_inacc_tenth)) +++ QA(VOLK_INIT_TEST(volk_32fc_deinterleave_32f_x2, test_params)) +++ QA(VOLK_INIT_TEST(volk_32fc_deinterleave_64f_x2, test_params)) +++ QA(VOLK_INIT_TEST(volk_32fc_s32f_deinterleave_real_16i, test_params)) +++ QA(VOLK_INIT_TEST(volk_32fc_deinterleave_imag_32f, test_params)) +++ QA(VOLK_INIT_TEST(volk_32fc_deinterleave_real_32f, test_params)) +++ QA(VOLK_INIT_TEST(volk_32fc_deinterleave_real_64f, test_params)) +++ QA(VOLK_INIT_TEST(volk_32fc_x2_dot_prod_32fc, test_params_inacc)) +++ QA(VOLK_INIT_TEST(volk_32fc_32f_dot_prod_32fc, test_params_inacc)) +++ QA(VOLK_INIT_TEST(volk_32fc_index_max_16u, test_params)) +++ QA(VOLK_INIT_TEST(volk_32fc_index_max_32u, test_params)) +++ QA(VOLK_INIT_TEST(volk_32fc_s32f_magnitude_16i, test_params)) +++ QA(VOLK_INIT_TEST(volk_32fc_magnitude_32f, test_params_inacc_tenth)) +++ QA(VOLK_INIT_TEST(volk_32fc_magnitude_squared_32f, test_params)) +++ QA(VOLK_INIT_TEST(volk_32fc_x2_add_32fc, test_params)) +++ QA(VOLK_INIT_TEST(volk_32fc_x2_multiply_32fc, test_params)) +++ QA(VOLK_INIT_TEST(volk_32fc_x2_multiply_conjugate_32fc, test_params)) +++ QA(VOLK_INIT_TEST(volk_32fc_x2_divide_32fc, test_params)) +++ QA(VOLK_INIT_TEST(volk_32fc_conjugate_32fc, test_params)) +++ QA(VOLK_INIT_TEST(volk_32f_s32f_convert_16i, test_params)) +++ QA(VOLK_INIT_TEST(volk_32f_s32f_convert_32i, test_params)) +++ QA(VOLK_INIT_TEST(volk_32f_convert_64f, test_params)) +++ QA(VOLK_INIT_TEST(volk_32f_s32f_convert_8i, test_params)) +++ QA(VOLK_INIT_TEST(volk_32fc_convert_16ic, test_params)) +++ QA(VOLK_INIT_TEST(volk_32fc_s32f_power_spectrum_32f, test_params)) +++ QA(VOLK_INIT_TEST(volk_32fc_x2_square_dist_32f, test_params)) +++ QA(VOLK_INIT_TEST(volk_32fc_x2_s32f_square_dist_scalar_mult_32f, test_params)) +++ QA(VOLK_INIT_TEST(volk_32f_x2_divide_32f, test_params)) +++ QA(VOLK_INIT_TEST(volk_32f_x2_dot_prod_32f, test_params_inacc)) +++ QA(VOLK_INIT_TEST(volk_32f_x2_s32f_interleave_16ic, test_params)) +++ QA(VOLK_INIT_TEST(volk_32f_x2_interleave_32fc, test_params)) +++ QA(VOLK_INIT_TEST(volk_32f_x2_max_32f, test_params)) +++ QA(VOLK_INIT_TEST(volk_32f_x2_min_32f, test_params)) +++ QA(VOLK_INIT_TEST(volk_32f_x2_multiply_32f, test_params)) +++ QA(VOLK_INIT_TEST(volk_32f_64f_multiply_64f, test_params)) +++ QA(VOLK_INIT_TEST(volk_32f_64f_add_64f, test_params)) +++ QA(VOLK_INIT_TEST(volk_32f_s32f_normalize, test_params)) +++ QA(VOLK_INIT_TEST(volk_32f_s32f_power_32f, test_params)) +++ QA(VOLK_INIT_TEST(volk_32f_sqrt_32f, test_params_inacc)) +++ QA(VOLK_INIT_TEST(volk_32f_s32f_stddev_32f, test_params_inacc)) +++ QA(VOLK_INIT_TEST(volk_32f_stddev_and_mean_32f_x2, test_params_inacc)) +++ QA(VOLK_INIT_TEST(volk_32f_x2_subtract_32f, test_params)) +++ QA(VOLK_INIT_TEST(volk_32f_x3_sum_of_poly_32f, test_params_inacc)) +++ QA(VOLK_INIT_TEST(volk_32i_x2_and_32i, 
test_params)) +++ QA(VOLK_INIT_TEST(volk_32i_s32f_convert_32f, test_params)) +++ QA(VOLK_INIT_TEST(volk_32i_x2_or_32i, test_params)) +++ QA(VOLK_INIT_TEST(volk_32f_x2_dot_prod_16i, test_params)) +++ QA(VOLK_INIT_TEST(volk_64f_convert_32f, test_params)) +++ QA(VOLK_INIT_TEST(volk_64f_x2_max_64f, test_params)) +++ QA(VOLK_INIT_TEST(volk_64f_x2_min_64f, test_params)) +++ QA(VOLK_INIT_TEST(volk_64f_x2_multiply_64f, test_params)) +++ QA(VOLK_INIT_TEST(volk_64f_x2_add_64f, test_params)) +++ QA(VOLK_INIT_TEST(volk_8ic_deinterleave_16i_x2, test_params)) +++ QA(VOLK_INIT_TEST(volk_8ic_s32f_deinterleave_32f_x2, test_params)) +++ QA(VOLK_INIT_TEST(volk_8ic_deinterleave_real_16i, test_params)) +++ QA(VOLK_INIT_TEST(volk_8ic_s32f_deinterleave_real_32f, test_params)) +++ QA(VOLK_INIT_TEST(volk_8ic_deinterleave_real_8i, test_params)) +++ QA(VOLK_INIT_TEST(volk_8ic_x2_multiply_conjugate_16ic, test_params)) +++ QA(VOLK_INIT_TEST(volk_8ic_x2_s32f_multiply_conjugate_32fc, test_params)) +++ QA(VOLK_INIT_TEST(volk_8i_convert_16i, test_params)) +++ QA(VOLK_INIT_TEST(volk_8i_s32f_convert_32f, test_params)) +++ QA(VOLK_INIT_TEST(volk_32fc_s32fc_multiply_32fc, test_params)) +++ QA(VOLK_INIT_TEST(volk_32f_s32f_multiply_32f, test_params)) +++ QA(VOLK_INIT_TEST(volk_32f_binary_slicer_32i, test_params)) +++ QA(VOLK_INIT_TEST(volk_32f_binary_slicer_8i, test_params)) +++ QA(VOLK_INIT_TEST(volk_32u_reverse_32u, test_params)) +++ QA(VOLK_INIT_TEST(volk_32f_tanh_32f, test_params_inacc)) +++ QA(VOLK_INIT_TEST(volk_32f_s32f_mod_rangepuppet_32f, test_params)) ++ QA(VOLK_INIT_TEST(volk_32fc_x2_s32fc_multiply_conjugate_add_32fc, test_params)) ++- QA(VOLK_INIT_PUPP(volk_8u_x3_encodepolarpuppet_8u, volk_8u_x3_encodepolar_8u_x2, test_params)) ++- QA(VOLK_INIT_PUPP(volk_32f_8u_polarbutterflypuppet_32f, volk_32f_8u_polarbutterfly_32f, test_params)) ++- QA(VOLK_INIT_TEST(volk_32f_exp_32f, test_params)) ++- +++ QA(VOLK_INIT_PUPP( +++ volk_8u_x3_encodepolarpuppet_8u, volk_8u_x3_encodepolar_8u_x2, test_params)) +++ QA(VOLK_INIT_PUPP(volk_32f_8u_polarbutterflypuppet_32f, +++ volk_32f_8u_polarbutterfly_32f, +++ test_params)) ++ // no one uses these, so don't test them ++- //VOLK_PROFILE(volk_16i_x5_add_quad_16i_x4, 1e-4, 2046, 10000, &results, benchmark_mode, kernel_regex); ++- //VOLK_PROFILE(volk_16i_branch_4_state_8, 1e-4, 2046, 10000, &results, benchmark_mode, kernel_regex); ++- //VOLK_PROFILE(volk_16i_max_star_16i, 0, 0, 204602, 10000, &results, benchmark_mode, kernel_regex); ++- //VOLK_PROFILE(volk_16i_max_star_horizontal_16i, 0, 0, 204602, 10000, &results, benchmark_mode, kernel_regex); ++- //VOLK_PROFILE(volk_16i_permute_and_scalar_add, 1e-4, 0, 2046, 10000, &results, benchmark_mode, kernel_regex); ++- //VOLK_PROFILE(volk_16i_x4_quad_max_star_16i, 1e-4, 0, 2046, 10000, &results, benchmark_mode, kernel_regex); +++ // VOLK_PROFILE(volk_16i_x5_add_quad_16i_x4, 1e-4, 2046, 10000, &results, +++ // benchmark_mode, kernel_regex); VOLK_PROFILE(volk_16i_branch_4_state_8, 1e-4, 2046, +++ // 10000, &results, benchmark_mode, kernel_regex); VOLK_PROFILE(volk_16i_max_star_16i, +++ // 0, 0, 204602, 10000, &results, benchmark_mode, kernel_regex); +++ // VOLK_PROFILE(volk_16i_max_star_horizontal_16i, 0, 0, 204602, 10000, &results, +++ // benchmark_mode, kernel_regex); VOLK_PROFILE(volk_16i_permute_and_scalar_add, 1e-4, +++ // 0, 2046, 10000, &results, benchmark_mode, kernel_regex); +++ // VOLK_PROFILE(volk_16i_x4_quad_max_star_16i, 1e-4, 0, 2046, 10000, &results, +++ // benchmark_mode, kernel_regex); ++ // we need a puppet for this one ++ 
//(VOLK_INIT_TEST(volk_32fc_s32f_x2_power_spectral_density_32f, test_params)) ++ ++diff --git a/lib/qa_utils.cc b/lib/qa_utils.cc ++index 76df069..1dcee6e 100644 ++--- a/lib/qa_utils.cc +++++ b/lib/qa_utils.cc ++@@ -1,79 +1,94 @@ ++-#include ++ #include "qa_utils.h" +++#include ++ ++-#include // for volk_func_desc_t ++-#include // for volk_free, volk_m... +++#include // for volk_func_desc_t +++#include // for volk_free, volk_m... ++ ++-#include // for assert ++-#include // for uint16_t, uint64_t ++-#include // for CLOCKS_PER_SEC ++-#include // for int16_t, int32_t +++#include // for assert +++#include // for uint16_t, uint64_t +++#include // for CLOCKS_PER_SEC +++#include // for int16_t, int32_t ++ #include ++-#include // for sqrt, fabs, abs ++-#include // for memcpy, memset ++-#include // for clock ++-#include // for operator<<, basic... ++-#include // for cout, cerr ++-#include // for numeric_limits ++-#include // for map, map<>::mappe... +++#include // for sqrt, fabs, abs +++#include // for memcpy, memset +++#include // for clock +++#include // for operator<<, basic... +++#include // for cout, cerr +++#include // for numeric_limits +++#include // for map, map<>::mappe... ++ #include ++-#include // for vector, _Bit_refe... +++#include // for vector, _Bit_refe... ++ ++ template ++-void random_floats(void *buf, unsigned int n, std::default_random_engine& rnd_engine) +++void random_floats(void* buf, unsigned int n, std::default_random_engine& rnd_engine) ++ { ++- T *array = static_cast(buf); +++ T* array = static_cast(buf); ++ std::uniform_real_distribution uniform_dist(T(-1), T(1)); ++- for(unsigned int i = 0; i < n; i++) { +++ for (unsigned int i = 0; i < n; i++) { ++ array[i] = uniform_dist(rnd_engine); ++ } ++ } ++ ++-void load_random_data(void *data, volk_type_t type, unsigned int n) { +++void load_random_data(void* data, volk_type_t type, unsigned int n) +++{ ++ std::random_device rnd_device; ++ std::default_random_engine rnd_engine(rnd_device()); ++- if(type.is_complex) n *= 2; ++- if(type.is_float) { ++- if(type.size == 8) { +++ if (type.is_complex) +++ n *= 2; +++ if (type.is_float) { +++ if (type.size == 8) { ++ random_floats(data, n, rnd_engine); ++ } else { ++- random_floats (data, n, rnd_engine); +++ random_floats(data, n, rnd_engine); ++ } ++ } else { ++- float int_max = float(uint64_t(2) << (type.size*8)); ++- if(type.is_signed) int_max /= 2.0; +++ float int_max = float(uint64_t(2) << (type.size * 8)); +++ if (type.is_signed) +++ int_max /= 2.0; ++ std::uniform_real_distribution uniform_dist(-int_max, int_max); ++- for(unsigned int i=0; i 8 or < 1"; //no shenanigans here +++ throw "load_random_data: no support for data size > 8 or < 1"; // no +++ // shenanigans +++ // here ++ } ++ } ++ } ++ } ++ ++-static std::vector get_arch_list(volk_func_desc_t desc) { +++static std::vector get_arch_list(volk_func_desc_t desc) +++{ ++ std::vector archlist; ++ ++- for(size_t i = 0; i < desc.n_impls; i++) { +++ for (size_t i = 0; i < desc.n_impls; i++) { ++ archlist.push_back(std::string(desc.impl_names[i])); ++ } ++ ++@@ -96,7 +111,8 @@ T volk_lexical_cast(const std::string& str) ++ return var; ++ } ++ ++-volk_type_t volk_type_from_string(std::string name) { +++volk_type_t volk_type_from_string(std::string name) +++{ ++ volk_type_t type; ++ type.is_float = false; ++ type.is_scalar = false; ++@@ -105,28 +121,28 @@ volk_type_t volk_type_from_string(std::string name) { ++ type.size = 0; ++ type.str = name; ++ ++- if(name.size() < 2) { +++ if (name.size() < 2) { ++ throw std::string("name too 
short to be a datatype"); ++ } ++ ++- //is it a scalar? ++- if(name[0] == 's') { +++ // is it a scalar? +++ if (name[0] == 's') { ++ type.is_scalar = true; ++- name = name.substr(1, name.size()-1); +++ name = name.substr(1, name.size() - 1); ++ } ++ ++- //get the data size +++ // get the data size ++ size_t last_size_pos = name.find_last_of("0123456789"); ++- if(last_size_pos == std::string::npos) { +++ if (last_size_pos == std::string::npos) { ++ throw std::string("no size spec in type ").append(name); ++ } ++- //will throw if malformed ++- int size = volk_lexical_cast(name.substr(0, last_size_pos+1)); +++ // will throw if malformed +++ int size = volk_lexical_cast(name.substr(0, last_size_pos + 1)); ++ ++ assert(((size % 8) == 0) && (size <= 64) && (size != 0)); ++- type.size = size/8; //in bytes +++ type.size = size / 8; // in bytes ++ ++- for(size_t i=last_size_pos+1; i < name.size(); i++) { +++ for (size_t i = last_size_pos + 1; i < name.size(); i++) { ++ switch (name[i]) { ++ case 'f': ++ type.is_float = true; ++@@ -148,7 +164,8 @@ volk_type_t volk_type_from_string(std::string name) { ++ return type; ++ } ++ ++-std::vector split_signature(const std::string &protokernel_signature) { +++std::vector split_signature(const std::string& protokernel_signature) +++{ ++ std::vector signature_tokens; ++ std::string token; ++ for (unsigned int loc = 0; loc < protokernel_signature.size(); ++loc) { ++@@ -165,16 +182,17 @@ std::vector split_signature(const std::string &protokernel_signatur ++ return signature_tokens; ++ } ++ ++-static void get_signatures_from_name(std::vector &inputsig, ++- std::vector &outputsig, ++- std::string name) { +++static void get_signatures_from_name(std::vector& inputsig, +++ std::vector& outputsig, +++ std::string name) +++{ ++ ++ std::vector toked = split_signature(name); ++ ++ assert(toked[0] == "volk"); ++ toked.erase(toked.begin()); ++ ++- //ok. we're assuming a string in the form +++ // ok. we're assuming a string in the form ++ //(sig)_(multiplier-opt)_..._(name)_(sig)_(multiplier-opt)_..._(alignment) ++ ++ enum { SIDE_INPUT, SIDE_NAME, SIDE_OUTPUT } side = SIDE_INPUT; ++@@ -184,106 +202,184 @@ static void get_signatures_from_name(std::vector &inputsig, ++ std::string token = toked[token_index]; ++ try { ++ type = volk_type_from_string(token); ++- if(side == SIDE_NAME) side = SIDE_OUTPUT; //if this is the first one after the name... 
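/* Editorial note -- illustrative sketch only, not part of the patch above.
 * The assumed naming convention documented in get_signatures_from_name()
 * ("(sig)_(multiplier-opt)_..._(name)_(sig)_..._(alignment)") can be seen with a
 * kernel that appears in this QA list, volk_32f_s32f_multiply_32f: type tokens
 * before the operation name become the input signature (a leading 's' marking a
 * scalar argument) and the type token after it becomes the output signature.
 * The self-contained demo below only mirrors that token classification; it omits
 * the "xN" multiplier handling and uses no libvolk APIs. */
#include <cctype>
#include <iostream>
#include <sstream>
#include <string>
#include <vector>

int main()
{
    // Split the kernel name on '_' the way split_signature() does.
    std::string name = "volk_32f_s32f_multiply_32f";
    std::vector<std::string> tokens;
    std::stringstream ss(name);
    for (std::string tok; std::getline(ss, tok, '_');)
        tokens.push_back(tok);

    // Type tokens seen before the operation name are inputs, type tokens seen
    // after it are outputs; anything else is treated as part of the name.
    bool seen_name = false;
    std::vector<std::string> inputs, outputs;
    for (size_t i = 1; i < tokens.size(); i++) { // skip the leading "volk"
        const std::string& t = tokens[i];
        bool is_type =
            std::isdigit((unsigned char)t[0]) ||
            (t[0] == 's' && t.size() > 1 && std::isdigit((unsigned char)t[1]));
        if (is_type)
            (seen_name ? outputs : inputs).push_back(t);
        else
            seen_name = true;
    }

    // Prints "inputs: 32f s32f | outputs: 32f" -- one input vector, one input
    // scalar (later pulled into its own vector, inputsc) and one output vector.
    std::cout << "inputs:";
    for (auto& t : inputs)
        std::cout << ' ' << t;
    std::cout << " | outputs:";
    for (auto& t : outputs)
        std::cout << ' ' << t;
    std::cout << std::endl;
    return 0;
}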
++- ++- if(side == SIDE_INPUT) inputsig.push_back(type); ++- else outputsig.push_back(type); ++- } catch (...){ ++- if(token[0] == 'x' && (token.size() > 1) && (token[1] > '0' && token[1] < '9')) { //it's a multiplier ++- if(side == SIDE_INPUT) assert(inputsig.size() > 0); ++- else assert(outputsig.size() > 0); ++- int multiplier = volk_lexical_cast(token.substr(1, token.size()-1)); //will throw if invalid ++- for(int i=1; i 1) && +++ (token[1] > '0' && token[1] < '9')) { // it's a multiplier +++ if (side == SIDE_INPUT) +++ assert(inputsig.size() > 0); +++ else +++ assert(outputsig.size() > 0); +++ int multiplier = volk_lexical_cast( +++ token.substr(1, token.size() - 1)); // will throw if invalid +++ for (int i = 1; i < multiplier; i++) { +++ if (side == SIDE_INPUT) +++ inputsig.push_back(inputsig.back()); +++ else +++ outputsig.push_back(outputsig.back()); ++ } ++- } ++- else if(side == SIDE_INPUT) { //it's the function name, at least it better be +++ } else if (side == +++ SIDE_INPUT) { // it's the function name, at least it better be ++ side = SIDE_NAME; ++ fn_name.append("_"); ++ fn_name.append(token); ++- } ++- else if(side == SIDE_OUTPUT) { ++- if(token != toked.back()) throw; //the last token in the name is the alignment +++ } else if (side == SIDE_OUTPUT) { +++ if (token != toked.back()) +++ throw; // the last token in the name is the alignment ++ } ++ } ++ } ++- //we don't need an output signature (some fn's operate on the input data, "in place"), but we do need at least one input! +++ // we don't need an output signature (some fn's operate on the input data, "in +++ // place"), but we do need at least one input! ++ assert(inputsig.size() != 0); ++- ++ } ++ ++-inline void run_cast_test1(volk_fn_1arg func, std::vector &buffs, unsigned int vlen, unsigned int iter, std::string arch) { ++- while(iter--) func(buffs[0], vlen, arch.c_str()); +++inline void run_cast_test1(volk_fn_1arg func, +++ std::vector& buffs, +++ unsigned int vlen, +++ unsigned int iter, +++ std::string arch) +++{ +++ while (iter--) +++ func(buffs[0], vlen, arch.c_str()); ++ } ++ ++-inline void run_cast_test2(volk_fn_2arg func, std::vector &buffs, unsigned int vlen, unsigned int iter, std::string arch) { ++- while(iter--) func(buffs[0], buffs[1], vlen, arch.c_str()); +++inline void run_cast_test2(volk_fn_2arg func, +++ std::vector& buffs, +++ unsigned int vlen, +++ unsigned int iter, +++ std::string arch) +++{ +++ while (iter--) +++ func(buffs[0], buffs[1], vlen, arch.c_str()); ++ } ++ ++-inline void run_cast_test3(volk_fn_3arg func, std::vector &buffs, unsigned int vlen, unsigned int iter, std::string arch) { ++- while(iter--) func(buffs[0], buffs[1], buffs[2], vlen, arch.c_str()); +++inline void run_cast_test3(volk_fn_3arg func, +++ std::vector& buffs, +++ unsigned int vlen, +++ unsigned int iter, +++ std::string arch) +++{ +++ while (iter--) +++ func(buffs[0], buffs[1], buffs[2], vlen, arch.c_str()); ++ } ++ ++-inline void run_cast_test4(volk_fn_4arg func, std::vector &buffs, unsigned int vlen, unsigned int iter, std::string arch) { ++- while(iter--) func(buffs[0], buffs[1], buffs[2], buffs[3], vlen, arch.c_str()); +++inline void run_cast_test4(volk_fn_4arg func, +++ std::vector& buffs, +++ unsigned int vlen, +++ unsigned int iter, +++ std::string arch) +++{ +++ while (iter--) +++ func(buffs[0], buffs[1], buffs[2], buffs[3], vlen, arch.c_str()); ++ } ++ ++-inline void run_cast_test1_s32f(volk_fn_1arg_s32f func, std::vector &buffs, float scalar, unsigned int vlen, unsigned int iter, std::string arch) { ++- 
while(iter--) func(buffs[0], scalar, vlen, arch.c_str()); +++inline void run_cast_test1_s32f(volk_fn_1arg_s32f func, +++ std::vector& buffs, +++ float scalar, +++ unsigned int vlen, +++ unsigned int iter, +++ std::string arch) +++{ +++ while (iter--) +++ func(buffs[0], scalar, vlen, arch.c_str()); ++ } ++ ++-inline void run_cast_test2_s32f(volk_fn_2arg_s32f func, std::vector &buffs, float scalar, unsigned int vlen, unsigned int iter, std::string arch) { ++- while(iter--) func(buffs[0], buffs[1], scalar, vlen, arch.c_str()); +++inline void run_cast_test2_s32f(volk_fn_2arg_s32f func, +++ std::vector& buffs, +++ float scalar, +++ unsigned int vlen, +++ unsigned int iter, +++ std::string arch) +++{ +++ while (iter--) +++ func(buffs[0], buffs[1], scalar, vlen, arch.c_str()); ++ } ++ ++-inline void run_cast_test3_s32f(volk_fn_3arg_s32f func, std::vector &buffs, float scalar, unsigned int vlen, unsigned int iter, std::string arch) { ++- while(iter--) func(buffs[0], buffs[1], buffs[2], scalar, vlen, arch.c_str()); +++inline void run_cast_test3_s32f(volk_fn_3arg_s32f func, +++ std::vector& buffs, +++ float scalar, +++ unsigned int vlen, +++ unsigned int iter, +++ std::string arch) +++{ +++ while (iter--) +++ func(buffs[0], buffs[1], buffs[2], scalar, vlen, arch.c_str()); ++ } ++ ++-inline void run_cast_test1_s32fc(volk_fn_1arg_s32fc func, std::vector &buffs, lv_32fc_t scalar, unsigned int vlen, unsigned int iter, std::string arch) { ++- while(iter--) func(buffs[0], scalar, vlen, arch.c_str()); +++inline void run_cast_test1_s32fc(volk_fn_1arg_s32fc func, +++ std::vector& buffs, +++ lv_32fc_t scalar, +++ unsigned int vlen, +++ unsigned int iter, +++ std::string arch) +++{ +++ while (iter--) +++ func(buffs[0], scalar, vlen, arch.c_str()); ++ } ++ ++-inline void run_cast_test2_s32fc(volk_fn_2arg_s32fc func, std::vector &buffs, lv_32fc_t scalar, unsigned int vlen, unsigned int iter, std::string arch) { ++- while(iter--) func(buffs[0], buffs[1], scalar, vlen, arch.c_str()); +++inline void run_cast_test2_s32fc(volk_fn_2arg_s32fc func, +++ std::vector& buffs, +++ lv_32fc_t scalar, +++ unsigned int vlen, +++ unsigned int iter, +++ std::string arch) +++{ +++ while (iter--) +++ func(buffs[0], buffs[1], scalar, vlen, arch.c_str()); ++ } ++ ++-inline void run_cast_test3_s32fc(volk_fn_3arg_s32fc func, std::vector &buffs, lv_32fc_t scalar, unsigned int vlen, unsigned int iter, std::string arch) { ++- while(iter--) func(buffs[0], buffs[1], buffs[2], scalar, vlen, arch.c_str()); +++inline void run_cast_test3_s32fc(volk_fn_3arg_s32fc func, +++ std::vector& buffs, +++ lv_32fc_t scalar, +++ unsigned int vlen, +++ unsigned int iter, +++ std::string arch) +++{ +++ while (iter--) +++ func(buffs[0], buffs[1], buffs[2], scalar, vlen, arch.c_str()); ++ } ++ ++ template ++-bool fcompare(t *in1, t *in2, unsigned int vlen, float tol, bool absolute_mode) { +++bool fcompare(t* in1, t* in2, unsigned int vlen, float tol, bool absolute_mode) +++{ ++ bool fail = false; ++ int print_max_errs = 10; ++- for(unsigned int i=0; i tol) { ++- fail=true; ++- if(print_max_errs-- > 0) { ++- std::cout << "offset " << i << " in1: " << t(((t *)(in1))[i]) << " in2: " << t(((t *)(in2))[i]); +++ if (fabs(((t*)(in1))[i] - ((t*)(in2))[i]) > tol) { +++ fail = true; +++ if (print_max_errs-- > 0) { +++ std::cout << "offset " << i << " in1: " << t(((t*)(in1))[i]) +++ << " in2: " << t(((t*)(in2))[i]); ++ std::cout << " tolerance was: " << tol << std::endl; ++ } ++ } ++ } else { ++ // for very small numbers we'll see round off errors due to limited ++ // 
precision. So a special test case... ++- if(fabs(((t *)(in1))[i]) < 1e-30) { ++- if( fabs( ((t *)(in2))[i] ) > tol ) ++- { ++- fail=true; ++- if(print_max_errs-- > 0) { ++- std::cout << "offset " << i << " in1: " << t(((t *)(in1))[i]) << " in2: " << t(((t *)(in2))[i]); +++ if (fabs(((t*)(in1))[i]) < 1e-30) { +++ if (fabs(((t*)(in2))[i]) > tol) { +++ fail = true; +++ if (print_max_errs-- > 0) { +++ std::cout << "offset " << i << " in1: " << t(((t*)(in1))[i]) +++ << " in2: " << t(((t*)(in2))[i]); ++ std::cout << " tolerance was: " << tol << std::endl; ++ } ++ } ++ } ++ // the primary test is the percent different greater than given tol ++- else if(fabs(((t *)(in1))[i] - ((t *)(in2))[i])/fabs(((t *)in1)[i]) > tol) { ++- fail=true; ++- if(print_max_errs-- > 0) { ++- std::cout << "offset " << i << " in1: " << t(((t *)(in1))[i]) << " in2: " << t(((t *)(in2))[i]); +++ else if (fabs(((t*)(in1))[i] - ((t*)(in2))[i]) / fabs(((t*)in1)[i]) > tol) { +++ fail = true; +++ if (print_max_errs-- > 0) { +++ std::cout << "offset " << i << " in1: " << t(((t*)(in1))[i]) +++ << " in2: " << t(((t*)(in2))[i]); ++ std::cout << " tolerance was: " << tol << std::endl; ++ } ++ } ++@@ -294,43 +390,50 @@ bool fcompare(t *in1, t *in2, unsigned int vlen, float tol, bool absolute_mode) ++ } ++ ++ template ++-bool ccompare(t *in1, t *in2, unsigned int vlen, float tol, bool absolute_mode) { +++bool ccompare(t* in1, t* in2, unsigned int vlen, float tol, bool absolute_mode) +++{ ++ if (absolute_mode) { ++- std::cout << "ccompare does not support absolute mode" << std::endl; ++- return true; +++ std::cout << "ccompare does not support absolute mode" << std::endl; +++ return true; ++ } ++ bool fail = false; ++ int print_max_errs = 10; ++- for(unsigned int i=0; i<2*vlen; i+=2) { ++- if (std::isnan(in1[i]) || std::isnan(in1[i+1]) || std::isnan(in2[i]) || std::isnan(in2[i+1]) ++- || std::isinf(in1[i]) || std::isinf(in1[i+1]) || std::isinf(in2[i]) || std::isinf(in2[i+1])) { ++- fail=true; ++- if(print_max_errs-- > 0) { ++- std::cout << "offset " << i/2 << " in1: " << in1[i] << " + " << in1[i+1] << "j in2: " << in2[i] << " + " << in2[i+1] << "j"; +++ for (unsigned int i = 0; i < 2 * vlen; i += 2) { +++ if (std::isnan(in1[i]) || std::isnan(in1[i + 1]) || std::isnan(in2[i]) || +++ std::isnan(in2[i + 1]) || std::isinf(in1[i]) || std::isinf(in1[i + 1]) || +++ std::isinf(in2[i]) || std::isinf(in2[i + 1])) { +++ fail = true; +++ if (print_max_errs-- > 0) { +++ std::cout << "offset " << i / 2 << " in1: " << in1[i] << " + " +++ << in1[i + 1] << "j in2: " << in2[i] << " + " << in2[i + 1] +++ << "j"; ++ std::cout << " tolerance was: " << tol << std::endl; ++ } ++ } ++- t diff[2] = { in1[i] - in2[i], in1[i+1] - in2[i+1] }; ++- t err = std::sqrt(diff[0] * diff[0] + diff[1] * diff[1]); ++- t norm = std::sqrt(in1[i] * in1[i] + in1[i+1] * in1[i+1]); +++ t diff[2] = { in1[i] - in2[i], in1[i + 1] - in2[i + 1] }; +++ t err = std::sqrt(diff[0] * diff[0] + diff[1] * diff[1]); +++ t norm = std::sqrt(in1[i] * in1[i] + in1[i + 1] * in1[i + 1]); ++ ++ // for very small numbers we'll see round off errors due to limited ++ // precision. So a special test case... 
++ if (norm < 1e-30) { ++- if (err > tol) ++- { ++- fail=true; ++- if(print_max_errs-- > 0) { ++- std::cout << "offset " << i/2 << " in1: " << in1[i] << " + " << in1[i+1] << "j in2: " << in2[i] << " + " << in2[i+1] << "j"; +++ if (err > tol) { +++ fail = true; +++ if (print_max_errs-- > 0) { +++ std::cout << "offset " << i / 2 << " in1: " << in1[i] << " + " +++ << in1[i + 1] << "j in2: " << in2[i] << " + " << in2[i + 1] +++ << "j"; ++ std::cout << " tolerance was: " << tol << std::endl; ++ } ++ } ++ } ++ // the primary test is the percent different greater than given tol ++- else if((err / norm) > tol) { ++- fail=true; ++- if(print_max_errs-- > 0) { ++- std::cout << "offset " << i/2 << " in1: " << in1[i] << " + " << in1[i+1] << "j in2: " << in2[i] << " + " << in2[i+1] << "j"; +++ else if ((err / norm) > tol) { +++ fail = true; +++ if (print_max_errs-- > 0) { +++ std::cout << "offset " << i / 2 << " in1: " << in1[i] << " + " +++ << in1[i + 1] << "j in2: " << in2[i] << " + " << in2[i + 1] +++ << "j"; ++ std::cout << " tolerance was: " << tol << std::endl; ++ } ++ } ++@@ -340,18 +443,21 @@ bool ccompare(t *in1, t *in2, unsigned int vlen, float tol, bool absolute_mode) ++ } ++ ++ template ++-bool icompare(t *in1, t *in2, unsigned int vlen, unsigned int tol, bool absolute_mode) { +++bool icompare(t* in1, t* in2, unsigned int vlen, unsigned int tol, bool absolute_mode) +++{ ++ if (absolute_mode) { ++- std::cout << "icompare does not support absolute mode" << std::endl; ++- return true; +++ std::cout << "icompare does not support absolute mode" << std::endl; +++ return true; ++ } ++ bool fail = false; ++ int print_max_errs = 10; ++- for(unsigned int i=0; i tol) { ++- fail=true; ++- if(print_max_errs-- > 0) { ++- std::cout << "offset " << i << " in1: " << static_cast(t(((t *)(in1))[i])) << " in2: " << static_cast(t(((t *)(in2))[i])); +++ for (unsigned int i = 0; i < vlen; i++) { +++ if (((unsigned int)abs(int(((t*)(in1))[i]) - int(((t*)(in2))[i]))) > tol) { +++ fail = true; +++ if (print_max_errs-- > 0) { +++ std::cout << "offset " << i +++ << " in1: " << static_cast(t(((t*)(in1))[i])) +++ << " in2: " << static_cast(t(((t*)(in2))[i])); ++ std::cout << " tolerance was: " << tol << std::endl; ++ } ++ } ++@@ -360,34 +466,46 @@ bool icompare(t *in1, t *in2, unsigned int vlen, unsigned int tol, bool absolute ++ return fail; ++ } ++ ++-class volk_qa_aligned_mem_pool{ +++class volk_qa_aligned_mem_pool +++{ ++ public: ++- void *get_new(size_t size){ +++ void* get_new(size_t size) +++ { ++ size_t alignment = volk_get_alignment(); ++ void* ptr = volk_malloc(size, alignment); ++ memset(ptr, 0x00, size); ++ _mems.push_back(ptr); ++ return ptr; ++ } ++- ~volk_qa_aligned_mem_pool() { ++- for(unsigned int ii = 0; ii < _mems.size(); ++ii) { +++ ~volk_qa_aligned_mem_pool() +++ { +++ for (unsigned int ii = 0; ii < _mems.size(); ++ii) { ++ volk_free(_mems[ii]); ++ } ++ } ++-private: std::vector _mems; +++ +++private: +++ std::vector _mems; ++ }; ++ ++ bool run_volk_tests(volk_func_desc_t desc, ++ void (*manual_func)(), ++ std::string name, ++ volk_test_params_t test_params, ++- std::vector *results, ++- std::string puppet_master_name ++-) +++ std::vector* results, +++ std::string puppet_master_name) ++ { ++- return run_volk_tests(desc, manual_func, name, test_params.tol(), test_params.scalar(), ++- test_params.vlen(), test_params.iter(), results, puppet_master_name, ++- test_params.absolute_mode(), test_params.benchmark_mode()); +++ return run_volk_tests(desc, +++ manual_func, +++ name, +++ test_params.tol(), +++ 
test_params.scalar(), +++ test_params.vlen(), +++ test_params.iter(), +++ results, +++ puppet_master_name, +++ test_params.absolute_mode(), +++ test_params.benchmark_mode()); ++ } ++ ++ bool run_volk_tests(volk_func_desc_t desc, ++@@ -397,17 +515,18 @@ bool run_volk_tests(volk_func_desc_t desc, ++ lv_32fc_t scalar, ++ unsigned int vlen, ++ unsigned int iter, ++- std::vector *results, +++ std::vector* results, ++ std::string puppet_master_name, ++ bool absolute_mode, ++- bool benchmark_mode ++-) { +++ bool benchmark_mode) +++{ ++ // Initialize this entry in results vector ++ results->push_back(volk_test_results_t()); ++ results->back().name = name; ++ results->back().vlen = vlen; ++ results->back().iter = iter; ++- std::cout << "RUN_VOLK_TESTS: " << name << "(" << vlen << "," << iter << ")" << std::endl; +++ std::cout << "RUN_VOLK_TESTS: " << name << "(" << vlen << "," << iter << ")" +++ << std::endl; ++ ++ // vlen_twiddle will increase vlen for malloc and data generation ++ // but kernels will still be called with the user provided vlen. ++@@ -418,57 +537,64 @@ bool run_volk_tests(volk_func_desc_t desc, ++ const float tol_f = tol; ++ const unsigned int tol_i = static_cast(tol); ++ ++- //first let's get a list of available architectures for the test +++ // first let's get a list of available architectures for the test ++ std::vector arch_list = get_arch_list(desc); ++ ++- if((!benchmark_mode) && (arch_list.size() < 2)) { +++ if ((!benchmark_mode) && (arch_list.size() < 2)) { ++ std::cout << "no architectures to test" << std::endl; ++ return false; ++ } ++ ++- //something that can hang onto memory and cleanup when this function exits +++ // something that can hang onto memory and cleanup when this function exits ++ volk_qa_aligned_mem_pool mem_pool; ++ ++- //now we have to get a function signature by parsing the name +++ // now we have to get a function signature by parsing the name ++ std::vector inputsig, outputsig; ++ try { ++ get_signatures_from_name(inputsig, outputsig, name); ++- } ++- catch (std::exception &error) { ++- std::cerr << "Error: unable to get function signature from kernel name" << std::endl; +++ } catch (std::exception& error) { +++ std::cerr << "Error: unable to get function signature from kernel name" +++ << std::endl; ++ std::cerr << " - " << name << std::endl; ++ return false; ++ } ++ ++- //pull the input scalars into their own vector +++ // pull the input scalars into their own vector ++ std::vector inputsc; ++- for(size_t i=0; i inbuffs; ++- for (unsigned int inputsig_index = 0; inputsig_index < inputsig.size(); ++ inputsig_index) { +++ std::vector inbuffs; +++ for (unsigned int inputsig_index = 0; inputsig_index < inputsig.size(); +++ ++inputsig_index) { ++ volk_type_t sig = inputsig[inputsig_index]; ++- if(!sig.is_scalar) //we don't make buffers for scalars ++- inbuffs.push_back(mem_pool.get_new(vlen*sig.size*(sig.is_complex ? 2 : 1))); +++ if (!sig.is_scalar) // we don't make buffers for scalars +++ inbuffs.push_back( +++ mem_pool.get_new(vlen * sig.size * (sig.is_complex ? 2 : 1))); ++ } ++- for(size_t i=0; i > test_data; ++- for(size_t i=0; i arch_buffs; ++- for(size_t j=0; j> test_data; +++ for (size_t i = 0; i < arch_list.size(); i++) { +++ std::vector arch_buffs; +++ for (size_t j = 0; j < outputsig.size(); j++) { +++ arch_buffs.push_back(mem_pool.get_new(vlen * outputsig[j].size * +++ (outputsig[j].is_complex ? 
2 : 1))); ++ } ++- for(size_t j=0; j start, end; ++ std::vector profile_times; ++- for(size_t i = 0; i < arch_list.size(); i++) { +++ for (size_t i = 0; i < arch_list.size(); i++) { ++ start = std::chrono::system_clock::now(); ++ ++- switch(both_sigs.size()) { ++- case 1: ++- if(inputsc.size() == 0) { ++- run_cast_test1((volk_fn_1arg)(manual_func), test_data[i], vlen, iter, arch_list[i]); ++- } else if(inputsc.size() == 1 && inputsc[0].is_float) { ++- if(inputsc[0].is_complex) { ++- run_cast_test1_s32fc((volk_fn_1arg_s32fc)(manual_func), test_data[i], scalar, vlen, iter, arch_list[i]); ++- } else { ++- run_cast_test1_s32f((volk_fn_1arg_s32f)(manual_func), test_data[i], scalar.real(), vlen, iter, arch_list[i]); ++- } ++- } else throw "unsupported 1 arg function >1 scalars"; ++- break; ++- case 2: ++- if(inputsc.size() == 0) { ++- run_cast_test2((volk_fn_2arg)(manual_func), test_data[i], vlen, iter, arch_list[i]); ++- } else if(inputsc.size() == 1 && inputsc[0].is_float) { ++- if(inputsc[0].is_complex) { ++- run_cast_test2_s32fc((volk_fn_2arg_s32fc)(manual_func), test_data[i], scalar, vlen, iter, arch_list[i]); ++- } else { ++- run_cast_test2_s32f((volk_fn_2arg_s32f)(manual_func), test_data[i], scalar.real(), vlen, iter, arch_list[i]); ++- } ++- } else throw "unsupported 2 arg function >1 scalars"; ++- break; ++- case 3: ++- if(inputsc.size() == 0) { ++- run_cast_test3((volk_fn_3arg)(manual_func), test_data[i], vlen, iter, arch_list[i]); ++- } else if(inputsc.size() == 1 && inputsc[0].is_float) { ++- if(inputsc[0].is_complex) { ++- run_cast_test3_s32fc((volk_fn_3arg_s32fc)(manual_func), test_data[i], scalar, vlen, iter, arch_list[i]); ++- } else { ++- run_cast_test3_s32f((volk_fn_3arg_s32f)(manual_func), test_data[i], scalar.real(), vlen, iter, arch_list[i]); ++- } ++- } else throw "unsupported 3 arg function >1 scalars"; ++- break; ++- case 4: ++- run_cast_test4((volk_fn_4arg)(manual_func), test_data[i], vlen, iter, arch_list[i]); ++- break; ++- default: ++- throw "no function handler for this signature"; ++- break; +++ switch (both_sigs.size()) { +++ case 1: +++ if (inputsc.size() == 0) { +++ run_cast_test1( +++ (volk_fn_1arg)(manual_func), test_data[i], vlen, iter, arch_list[i]); +++ } else if (inputsc.size() == 1 && inputsc[0].is_float) { +++ if (inputsc[0].is_complex) { +++ run_cast_test1_s32fc((volk_fn_1arg_s32fc)(manual_func), +++ test_data[i], +++ scalar, +++ vlen, +++ iter, +++ arch_list[i]); +++ } else { +++ run_cast_test1_s32f((volk_fn_1arg_s32f)(manual_func), +++ test_data[i], +++ scalar.real(), +++ vlen, +++ iter, +++ arch_list[i]); +++ } +++ } else +++ throw "unsupported 1 arg function >1 scalars"; +++ break; +++ case 2: +++ if (inputsc.size() == 0) { +++ run_cast_test2( +++ (volk_fn_2arg)(manual_func), test_data[i], vlen, iter, arch_list[i]); +++ } else if (inputsc.size() == 1 && inputsc[0].is_float) { +++ if (inputsc[0].is_complex) { +++ run_cast_test2_s32fc((volk_fn_2arg_s32fc)(manual_func), +++ test_data[i], +++ scalar, +++ vlen, +++ iter, +++ arch_list[i]); +++ } else { +++ run_cast_test2_s32f((volk_fn_2arg_s32f)(manual_func), +++ test_data[i], +++ scalar.real(), +++ vlen, +++ iter, +++ arch_list[i]); +++ } +++ } else +++ throw "unsupported 2 arg function >1 scalars"; +++ break; +++ case 3: +++ if (inputsc.size() == 0) { +++ run_cast_test3( +++ (volk_fn_3arg)(manual_func), test_data[i], vlen, iter, arch_list[i]); +++ } else if (inputsc.size() == 1 && inputsc[0].is_float) { +++ if (inputsc[0].is_complex) { +++ run_cast_test3_s32fc((volk_fn_3arg_s32fc)(manual_func), +++ 
test_data[i], +++ scalar, +++ vlen, +++ iter, +++ arch_list[i]); +++ } else { +++ run_cast_test3_s32f((volk_fn_3arg_s32f)(manual_func), +++ test_data[i], +++ scalar.real(), +++ vlen, +++ iter, +++ arch_list[i]); +++ } +++ } else +++ throw "unsupported 3 arg function >1 scalars"; +++ break; +++ case 4: +++ run_cast_test4( +++ (volk_fn_4arg)(manual_func), test_data[i], vlen, iter, arch_list[i]); +++ break; +++ default: +++ throw "no function handler for this signature"; +++ break; ++ } ++ ++ end = std::chrono::system_clock::now(); ++@@ -541,10 +704,10 @@ bool run_volk_tests(volk_func_desc_t desc, ++ profile_times.push_back(arch_time); ++ } ++ ++- //and now compare each output to the generic output ++- //first we have to know which output is the generic one, they aren't in order... ++- size_t generic_offset=0; ++- for(size_t i=0; i arch_results; ++- for(size_t i=0; iback().results[arch_list[i]]; +++ if (fail) { +++ volk_test_time_t* result = &results->back().results[arch_list[i]]; ++ result->pass = false; ++ fail_global = true; ++ std::cout << name << ": fail on arch " << arch_list[i] << std::endl; ++@@ -634,15 +851,13 @@ bool run_volk_tests(volk_func_desc_t desc, ++ double best_time_u = std::numeric_limits::max(); ++ std::string best_arch_a = "generic"; ++ std::string best_arch_u = "generic"; ++- for(size_t i=0; i < arch_list.size(); i++) ++- { ++- if((profile_times[i] < best_time_u) && arch_results[i] && desc.impl_alignment[i] == 0) ++- { +++ for (size_t i = 0; i < arch_list.size(); i++) { +++ if ((profile_times[i] < best_time_u) && arch_results[i] && +++ desc.impl_alignment[i] == 0) { ++ best_time_u = profile_times[i]; ++ best_arch_u = arch_list[i]; ++ } ++- if((profile_times[i] < best_time_a) && arch_results[i]) ++- { +++ if ((profile_times[i] < best_time_a) && arch_results[i]) { ++ best_time_a = profile_times[i]; ++ best_arch_a = arch_list[i]; ++ } ++@@ -651,7 +866,7 @@ bool run_volk_tests(volk_func_desc_t desc, ++ std::cout << "Best aligned arch: " << best_arch_a << std::endl; ++ std::cout << "Best unaligned arch: " << best_arch_u << std::endl; ++ ++- if(puppet_master_name == "NULL") { +++ if (puppet_master_name == "NULL") { ++ results->back().config_name = name; ++ } else { ++ results->back().config_name = puppet_master_name; ++diff --git a/lib/qa_utils.h b/lib/qa_utils.h ++index 2d8458b..74c3db4 100644 ++--- a/lib/qa_utils.h +++++ b/lib/qa_utils.h ++@@ -1,14 +1,14 @@ ++ #ifndef VOLK_QA_UTILS_H ++ #define VOLK_QA_UTILS_H ++ ++-#include // for bool, false ++-#include // for volk_func_desc_t ++-#include // for NULL ++-#include // for map ++-#include // for string, basic_string ++-#include // for vector +++#include // for bool, false +++#include // for volk_func_desc_t +++#include // for NULL +++#include // for map +++#include // for string, basic_string +++#include // for vector ++ ++-#include "volk/volk_complex.h" // for lv_32fc_t +++#include "volk/volk_complex.h" // for lv_32fc_t ++ ++ /************************************************ ++ * VOLK QA type definitions * ++@@ -22,93 +22,119 @@ struct volk_type_t { ++ std::string str; ++ }; ++ ++-class volk_test_time_t { ++- public: ++- std::string name; ++- double time; ++- std::string units; ++- bool pass; +++class volk_test_time_t +++{ +++public: +++ std::string name; +++ double time; +++ std::string units; +++ bool pass; ++ }; ++ ++-class volk_test_results_t { ++- public: ++- std::string name; ++- std::string config_name; ++- unsigned int vlen; ++- unsigned int iter; ++- std::map results; ++- std::string best_arch_a; ++- std::string 
best_arch_u; +++class volk_test_results_t +++{ +++public: +++ std::string name; +++ std::string config_name; +++ unsigned int vlen; +++ unsigned int iter; +++ std::map results; +++ std::string best_arch_a; +++ std::string best_arch_u; ++ }; ++ ++-class volk_test_params_t { ++- private: ++- float _tol; ++- lv_32fc_t _scalar; ++- unsigned int _vlen; ++- unsigned int _iter; ++- bool _benchmark_mode; ++- bool _absolute_mode; ++- std::string _kernel_regex; ++- public: ++- // ctor ++- volk_test_params_t(float tol, lv_32fc_t scalar, unsigned int vlen, unsigned int iter, ++- bool benchmark_mode, std::string kernel_regex) : ++- _tol(tol), _scalar(scalar), _vlen(vlen), _iter(iter), ++- _benchmark_mode(benchmark_mode), _absolute_mode(false), _kernel_regex(kernel_regex) {}; ++- // setters ++- void set_tol(float tol) {_tol=tol;}; ++- void set_scalar(lv_32fc_t scalar) {_scalar=scalar;}; ++- void set_vlen(unsigned int vlen) {_vlen=vlen;}; ++- void set_iter(unsigned int iter) {_iter=iter;}; ++- void set_benchmark(bool benchmark) {_benchmark_mode=benchmark;}; ++- void set_regex(std::string regex) {_kernel_regex=regex;}; ++- // getters ++- float tol() {return _tol;}; ++- lv_32fc_t scalar() {return _scalar;}; ++- unsigned int vlen() {return _vlen;}; ++- unsigned int iter() {return _iter;}; ++- bool benchmark_mode() {return _benchmark_mode;}; ++- bool absolute_mode() {return _absolute_mode;}; ++- std::string kernel_regex() {return _kernel_regex;}; ++- volk_test_params_t make_absolute(float tol) { ++- volk_test_params_t t(*this); ++- t._tol = tol; ++- t._absolute_mode = true; ++- return t; ++- } ++- volk_test_params_t make_tol(float tol) { ++- volk_test_params_t t(*this); ++- t._tol = tol; ++- return t; ++- } +++class volk_test_params_t +++{ +++private: +++ float _tol; +++ lv_32fc_t _scalar; +++ unsigned int _vlen; +++ unsigned int _iter; +++ bool _benchmark_mode; +++ bool _absolute_mode; +++ std::string _kernel_regex; +++ +++public: +++ // ctor +++ volk_test_params_t(float tol, +++ lv_32fc_t scalar, +++ unsigned int vlen, +++ unsigned int iter, +++ bool benchmark_mode, +++ std::string kernel_regex) +++ : _tol(tol), +++ _scalar(scalar), +++ _vlen(vlen), +++ _iter(iter), +++ _benchmark_mode(benchmark_mode), +++ _absolute_mode(false), +++ _kernel_regex(kernel_regex){}; +++ // setters +++ void set_tol(float tol) { _tol = tol; }; +++ void set_scalar(lv_32fc_t scalar) { _scalar = scalar; }; +++ void set_vlen(unsigned int vlen) { _vlen = vlen; }; +++ void set_iter(unsigned int iter) { _iter = iter; }; +++ void set_benchmark(bool benchmark) { _benchmark_mode = benchmark; }; +++ void set_regex(std::string regex) { _kernel_regex = regex; }; +++ // getters +++ float tol() { return _tol; }; +++ lv_32fc_t scalar() { return _scalar; }; +++ unsigned int vlen() { return _vlen; }; +++ unsigned int iter() { return _iter; }; +++ bool benchmark_mode() { return _benchmark_mode; }; +++ bool absolute_mode() { return _absolute_mode; }; +++ std::string kernel_regex() { return _kernel_regex; }; +++ volk_test_params_t make_absolute(float tol) +++ { +++ volk_test_params_t t(*this); +++ t._tol = tol; +++ t._absolute_mode = true; +++ return t; +++ } +++ volk_test_params_t make_tol(float tol) +++ { +++ volk_test_params_t t(*this); +++ t._tol = tol; +++ return t; +++ } ++ }; ++ ++-class volk_test_case_t { ++- private: ++- volk_func_desc_t _desc; ++- void(*_kernel_ptr)(); ++- std::string _name; ++- volk_test_params_t _test_parameters; ++- std::string _puppet_master_name; ++- public: ++- volk_func_desc_t desc() {return _desc;}; ++- void 
(*kernel_ptr()) () {return _kernel_ptr;}; ++- std::string name() {return _name;}; ++- std::string puppet_master_name() {return _puppet_master_name;}; ++- volk_test_params_t test_parameters() {return _test_parameters;}; ++- // normal ctor ++- volk_test_case_t(volk_func_desc_t desc, void(*kernel_ptr)(), std::string name, ++- volk_test_params_t test_parameters) : ++- _desc(desc), _kernel_ptr(kernel_ptr), _name(name), _test_parameters(test_parameters), ++- _puppet_master_name("NULL") ++- {}; ++- // ctor for puppets ++- volk_test_case_t(volk_func_desc_t desc, void(*kernel_ptr)(), std::string name, ++- std::string puppet_master_name, volk_test_params_t test_parameters) : ++- _desc(desc), _kernel_ptr(kernel_ptr), _name(name), _test_parameters(test_parameters), ++- _puppet_master_name(puppet_master_name) ++- {}; +++class volk_test_case_t +++{ +++private: +++ volk_func_desc_t _desc; +++ void (*_kernel_ptr)(); +++ std::string _name; +++ volk_test_params_t _test_parameters; +++ std::string _puppet_master_name; +++ +++public: +++ volk_func_desc_t desc() { return _desc; }; +++ void (*kernel_ptr())() { return _kernel_ptr; }; +++ std::string name() { return _name; }; +++ std::string puppet_master_name() { return _puppet_master_name; }; +++ volk_test_params_t test_parameters() { return _test_parameters; }; +++ // normal ctor +++ volk_test_case_t(volk_func_desc_t desc, +++ void (*kernel_ptr)(), +++ std::string name, +++ volk_test_params_t test_parameters) +++ : _desc(desc), +++ _kernel_ptr(kernel_ptr), +++ _name(name), +++ _test_parameters(test_parameters), +++ _puppet_master_name("NULL"){}; +++ // ctor for puppets +++ volk_test_case_t(volk_func_desc_t desc, +++ void (*kernel_ptr)(), +++ std::string name, +++ std::string puppet_master_name, +++ volk_test_params_t test_parameters) +++ : _desc(desc), +++ _kernel_ptr(kernel_ptr), +++ _name(name), +++ _test_parameters(test_parameters), +++ _puppet_master_name(puppet_master_name){}; ++ }; ++ ++ /************************************************ ++@@ -117,42 +143,58 @@ class volk_test_case_t { ++ volk_type_t volk_type_from_string(std::string); ++ ++ float uniform(void); ++-void random_floats(float *buf, unsigned n); +++void random_floats(float* buf, unsigned n); ++ ++-bool run_volk_tests( ++- volk_func_desc_t, ++- void(*)(), ++- std::string, ++- volk_test_params_t, ++- std::vector *results = NULL, ++- std::string puppet_master_name = "NULL" ++- ); +++bool run_volk_tests(volk_func_desc_t, +++ void (*)(), +++ std::string, +++ volk_test_params_t, +++ std::vector* results = NULL, +++ std::string puppet_master_name = "NULL"); ++ ++-bool run_volk_tests( ++- volk_func_desc_t, ++- void(*)(), ++- std::string, ++- float, ++- lv_32fc_t, ++- unsigned int, ++- unsigned int, ++- std::vector *results = NULL, ++- std::string puppet_master_name = "NULL", ++- bool absolute_mode = false, ++- bool benchmark_mode = false ++-); +++bool run_volk_tests(volk_func_desc_t, +++ void (*)(), +++ std::string, +++ float, +++ lv_32fc_t, +++ unsigned int, +++ unsigned int, +++ std::vector* results = NULL, +++ std::string puppet_master_name = "NULL", +++ bool absolute_mode = false, +++ bool benchmark_mode = false); ++ ++-#define VOLK_PROFILE(func, test_params, results) run_volk_tests(func##_get_func_desc(), (void (*)())func##_manual, std::string(#func), test_params, results, "NULL") ++-#define VOLK_PUPPET_PROFILE(func, puppet_master_func, test_params, results) run_volk_tests(func##_get_func_desc(), (void (*)())func##_manual, std::string(#func), test_params, results, 
std::string(#puppet_master_func)) ++-typedef void (*volk_fn_1arg)(void *, unsigned int, const char*); //one input, operate in place ++-typedef void (*volk_fn_2arg)(void *, void *, unsigned int, const char*); ++-typedef void (*volk_fn_3arg)(void *, void *, void *, unsigned int, const char*); ++-typedef void (*volk_fn_4arg)(void *, void *, void *, void *, unsigned int, const char*); ++-typedef void (*volk_fn_1arg_s32f)(void *, float, unsigned int, const char*); //one input vector, one scalar float input ++-typedef void (*volk_fn_2arg_s32f)(void *, void *, float, unsigned int, const char*); ++-typedef void (*volk_fn_3arg_s32f)(void *, void *, void *, float, unsigned int, const char*); ++-typedef void (*volk_fn_1arg_s32fc)(void *, lv_32fc_t, unsigned int, const char*); //one input vector, one scalar float input ++-typedef void (*volk_fn_2arg_s32fc)(void *, void *, lv_32fc_t, unsigned int, const char*); ++-typedef void (*volk_fn_3arg_s32fc)(void *, void *, void *, lv_32fc_t, unsigned int, const char*); +++#define VOLK_PROFILE(func, test_params, results) \ +++ run_volk_tests(func##_get_func_desc(), \ +++ (void (*)())func##_manual, \ +++ std::string(#func), \ +++ test_params, \ +++ results, \ +++ "NULL") +++#define VOLK_PUPPET_PROFILE(func, puppet_master_func, test_params, results) \ +++ run_volk_tests(func##_get_func_desc(), \ +++ (void (*)())func##_manual, \ +++ std::string(#func), \ +++ test_params, \ +++ results, \ +++ std::string(#puppet_master_func)) +++typedef void (*volk_fn_1arg)(void*, +++ unsigned int, +++ const char*); // one input, operate in place +++typedef void (*volk_fn_2arg)(void*, void*, unsigned int, const char*); +++typedef void (*volk_fn_3arg)(void*, void*, void*, unsigned int, const char*); +++typedef void (*volk_fn_4arg)(void*, void*, void*, void*, unsigned int, const char*); +++typedef void (*volk_fn_1arg_s32f)( +++ void*, float, unsigned int, const char*); // one input vector, one scalar float input +++typedef void (*volk_fn_2arg_s32f)(void*, void*, float, unsigned int, const char*); +++typedef void (*volk_fn_3arg_s32f)(void*, void*, void*, float, unsigned int, const char*); +++typedef void (*volk_fn_1arg_s32fc)( +++ void*, +++ lv_32fc_t, +++ unsigned int, +++ const char*); // one input vector, one scalar float input +++typedef void (*volk_fn_2arg_s32fc)(void*, void*, lv_32fc_t, unsigned int, const char*); +++typedef void (*volk_fn_3arg_s32fc)( +++ void*, void*, void*, lv_32fc_t, unsigned int, const char*); ++ ++-#endif //VOLK_QA_UTILS_H +++#endif // VOLK_QA_UTILS_H ++diff --git a/lib/testqa.cc b/lib/testqa.cc ++index 8b0f4d6..c885383 100644 ++--- a/lib/testqa.cc +++++ b/lib/testqa.cc ++@@ -20,18 +20,18 @@ ++ * Boston, MA 02110-1301, USA. ++ */ ++ ++-#include // for bool, false, true ++-#include // for operator<<, basic_ostream, endl, char... ++-#include // IWYU pragma: keep ++-#include // for map, map<>::iterator, _Rb_tree_iterator ++-#include // for string, operator<< ++-#include // for pair ++-#include // for vector ++- +++#include // for bool, false, true +++#include // IWYU pragma: keep +++#include // for operator<<, basic_ostream, endl, char... 
+++#include // for map, map<>::iterator, _Rb_tree_iterator +++#include // for string, operator<< +++#include // for pair +++#include // for vector +++ +++#include "kernel_tests.h" // for init_test_list +++#include "qa_utils.h" // for volk_test_case_t, volk_test_results_t +++#include "volk/volk_complex.h" // for lv_32fc_t ++ #include ++-#include "kernel_tests.h" // for init_test_list ++-#include "qa_utils.h" // for volk_test_case_t, volk_test_results_t ++-#include "volk/volk_complex.h" // for lv_32fc_t ++ ++ void print_qa_xml(std::vector results, unsigned int nfails); ++ ++@@ -46,45 +46,52 @@ int main(int argc, char* argv[]) ++ bool def_benchmark_mode = true; ++ std::string def_kernel_regex = ""; ++ ++- volk_test_params_t test_params(def_tol, def_scalar, def_vlen, def_iter, ++- def_benchmark_mode, def_kernel_regex); +++ volk_test_params_t test_params( +++ def_tol, def_scalar, def_vlen, def_iter, def_benchmark_mode, def_kernel_regex); ++ std::vector test_cases = init_test_list(test_params); ++ std::vector results; ++ ++- if (argc > 1){ ++- for(unsigned int ii = 0; ii < test_cases.size(); ++ii){ ++- if (std::string(argv[1]) == test_cases[ii].name()){ +++ if (argc > 1) { +++ for (unsigned int ii = 0; ii < test_cases.size(); ++ii) { +++ if (std::string(argv[1]) == test_cases[ii].name()) { ++ volk_test_case_t test_case = test_cases[ii]; ++- if (run_volk_tests(test_case.desc(), test_case.kernel_ptr(), +++ if (run_volk_tests(test_case.desc(), +++ test_case.kernel_ptr(), ++ test_case.name(), ++- test_case.test_parameters(), &results, +++ test_case.test_parameters(), +++ &results, ++ test_case.puppet_master_name())) { ++- return 1; +++ return 1; ++ } else { ++- return 0; +++ return 0; ++ } ++ } ++ } ++- std::cerr << "Did not run a test for kernel: " << std::string(argv[1]) << " !" << std::endl; +++ std::cerr << "Did not run a test for kernel: " << std::string(argv[1]) << " !" +++ << std::endl; ++ return 0; ++ ++- }else{ +++ } else { ++ std::vector qa_failures; ++ // Test every kernel reporting failures when they occur ++- for(unsigned int ii = 0; ii < test_cases.size(); ++ii) { +++ for (unsigned int ii = 0; ii < test_cases.size(); ++ii) { ++ bool qa_result = false; ++ volk_test_case_t test_case = test_cases[ii]; ++ try { ++- qa_result = run_volk_tests(test_case.desc(), test_case.kernel_ptr(), test_case.name(), ++- test_case.test_parameters(), &results, test_case.puppet_master_name()); ++- } ++- catch(...) { +++ qa_result = run_volk_tests(test_case.desc(), +++ test_case.kernel_ptr(), +++ test_case.name(), +++ test_case.test_parameters(), +++ &results, +++ test_case.puppet_master_name()); +++ } catch (...) { ++ // TODO: what exceptions might we need to catch and how do we handle them? ++- std::cerr << "Exception found on kernel: " << test_case.name() << std::endl; +++ std::cerr << "Exception found on kernel: " << test_case.name() +++ << std::endl; ++ qa_result = false; ++ } ++ ++- if(qa_result) { +++ if (qa_result) { ++ std::cerr << "Failure on " << test_case.name() << std::endl; ++ qa_failures.push_back(test_case.name()); ++ } ++@@ -96,9 +103,9 @@ int main(int argc, char* argv[]) ++ // Summarize QA results ++ std::cerr << "Kernel QA finished: " << qa_failures.size() << " failures out of " ++ << test_cases.size() << " tests." 
<< std::endl; ++- if(qa_failures.size() > 0) { +++ if (qa_failures.size() > 0) { ++ std::cerr << "The following kernels failed QA:" << std::endl; ++- for(unsigned int ii = 0; ii < qa_failures.size(); ++ii) { +++ for (unsigned int ii = 0; ii < qa_failures.size(); ++ii) { ++ std::cerr << " " << qa_failures[ii] << std::endl; ++ } ++ qa_ret_val = 1; ++@@ -118,26 +125,28 @@ void print_qa_xml(std::vector results, unsigned int nfails) ++ qa_file.open(".unittest/kernels.xml"); ++ ++ qa_file << "" << std::endl; ++- qa_file << "" << std::endl; +++ qa_file << "" << std::endl; ++ ++ // Results are in a vector by kernel. Each element has a result ++ // map containing time and arch name with test result ++- for(unsigned int ii=0; ii < results.size(); ++ii) { +++ for (unsigned int ii = 0; ii < results.size(); ++ii) { ++ volk_test_results_t result = results[ii]; ++ qa_file << " " << std::endl; ++ ++ std::map::iterator kernel_time_pair; ++- for(kernel_time_pair = result.results.begin(); kernel_time_pair != result.results.end(); ++kernel_time_pair) { +++ for (kernel_time_pair = result.results.begin(); +++ kernel_time_pair != result.results.end(); +++ ++kernel_time_pair) { ++ volk_test_time_t test_time = kernel_time_pair->second; ++- qa_file << " " << std::endl; ++- if(!test_time.pass) ++- qa_file << " " << ++- "" << std::endl; +++ qa_file << " " << std::endl; +++ if (!test_time.pass) +++ qa_file << " " +++ << "" << std::endl; ++ qa_file << " " << std::endl; ++ } ++ qa_file << " " << std::endl; ++@@ -146,5 +155,4 @@ void print_qa_xml(std::vector results, unsigned int nfails) ++ ++ qa_file << "" << std::endl; ++ qa_file.close(); ++- ++ } ++diff --git a/lib/volk_malloc.c b/lib/volk_malloc.c ++index df36240..b3779e1 100644 ++--- a/lib/volk_malloc.c +++++ b/lib/volk_malloc.c ++@@ -31,7 +31,8 @@ ++ * see: https://en.cppreference.com/w/c/memory/aligned_alloc ++ * ++ * MSVC is broken ++- * see: https://docs.microsoft.com/en-us/cpp/overview/visual-cpp-language-conformance?view=vs-2019 +++ * see: +++ * https://docs.microsoft.com/en-us/cpp/overview/visual-cpp-language-conformance?view=vs-2019 ++ * This section: ++ * C11 The Universal CRT implemented the parts of the ++ * C11 Standard Library that are required by C++17, ++@@ -46,39 +47,43 @@ ++ * We must work around this problem because MSVC is non-compliant! ++ */ ++ ++-void *volk_malloc(size_t size, size_t alignment) +++ +++void* volk_malloc(size_t size, size_t alignment) ++ { ++ #if HAVE_POSIX_MEMALIGN ++- // quoting posix_memalign() man page: ++- // "alignment must be a power of two and a multiple of sizeof(void *)" ++- // volk_get_alignment() could return 1 for some machines (e.g. generic_orc) ++- if (alignment == 1){ ++- return malloc(size); ++- } ++- void *ptr; ++- int err = posix_memalign(&ptr, alignment, size); ++- if(err != 0) { ++- ptr = NULL; ++- fprintf(stderr, ++- "VOLK: Error allocating memory " ++- "(posix_memalign: error %d: %s)\n", err, strerror(err)); ++- } +++ // quoting posix_memalign() man page: +++ // "alignment must be a power of two and a multiple of sizeof(void *)" +++ // volk_get_alignment() could return 1 for some machines (e.g. 
generic_orc) +++ if (alignment == 1) { +++ return malloc(size); +++ } +++ void* ptr; +++ int err = posix_memalign(&ptr, alignment, size); +++ if (err != 0) { +++ ptr = NULL; +++ fprintf(stderr, +++ "VOLK: Error allocating memory " +++ "(posix_memalign: error %d: %s)\n", +++ err, +++ strerror(err)); +++ } ++ #elif defined(_MSC_VER) ++- void *ptr = _aligned_malloc(size, alignment); +++ void* ptr = _aligned_malloc(size, alignment); ++ #else ++- void *ptr = aligned_alloc(alignment, size); +++ void* ptr = aligned_alloc(alignment, size); ++ #endif ++- if(ptr == NULL) { ++- fprintf(stderr, "VOLK: Error allocating memory (aligned_alloc/_aligned_malloc)\n"); ++- } ++- return ptr; +++ if (ptr == NULL) { +++ fprintf(stderr, +++ "VOLK: Error allocating memory (aligned_alloc/_aligned_malloc)\n"); +++ } +++ return ptr; ++ } ++ ++-void volk_free(void *ptr) +++void volk_free(void* ptr) ++ { ++ #if defined(_MSC_VER) ++- _aligned_free(ptr); +++ _aligned_free(ptr); ++ #else ++- free(ptr); +++ free(ptr); ++ #endif ++ } ++diff --git a/lib/volk_prefs.c b/lib/volk_prefs.c ++index 0b5fe8e..8934bf7 100644 ++--- a/lib/volk_prefs.c +++++ b/lib/volk_prefs.c ++@@ -1,6 +1,6 @@ +++#include ++ #include ++ #include ++-#include ++ #include ++ #if defined(_MSC_VER) ++ #include ++@@ -11,82 +11,84 @@ ++ #endif ++ #include ++ ++-void volk_get_config_path(char *path, bool read) +++void volk_get_config_path(char* path, bool read) ++ { ++- if (!path) return; ++- const char *suffix = "/.volk/volk_config"; ++- const char *suffix2 = "/volk/volk_config"; //non-hidden ++- char *home = NULL; +++ if (!path) +++ return; +++ const char* suffix = "/.volk/volk_config"; +++ const char* suffix2 = "/volk/volk_config"; // non-hidden +++ char* home = NULL; ++ ++- //allows config redirection via env variable +++ // allows config redirection via env variable ++ home = getenv("VOLK_CONFIGPATH"); ++- if(home!=NULL){ ++- strncpy(path,home,512); ++- strcat(path,suffix2); ++- if (!read || access(path, F_OK) != -1){ +++ if (home != NULL) { +++ strncpy(path, home, 512); +++ strcat(path, suffix2); +++ if (!read || access(path, F_OK) != -1) { ++ return; ++ } ++ } ++ ++- //check for user-local config file +++ // check for user-local config file ++ home = getenv("HOME"); ++- if (home != NULL){ +++ if (home != NULL) { ++ strncpy(path, home, 512); ++ strcat(path, suffix); ++- if (!read || (access(path, F_OK) != -1)){ +++ if (!read || (access(path, F_OK) != -1)) { ++ return; ++ } ++ } ++ ++- //check for config file in APPDATA (Windows) +++ // check for config file in APPDATA (Windows) ++ home = getenv("APPDATA"); ++- if (home != NULL){ +++ if (home != NULL) { ++ strncpy(path, home, 512); ++ strcat(path, suffix); ++- if (!read || (access(path, F_OK) != -1)){ +++ if (!read || (access(path, F_OK) != -1)) { ++ return; ++ } ++ } ++ ++- //check for system-wide config file ++- if (access("/etc/volk/volk_config", F_OK) != -1){ +++ // check for system-wide config file +++ if (access("/etc/volk/volk_config", F_OK) != -1) { ++ strncpy(path, "/etc", 512); ++ strcat(path, suffix2); ++- if (!read || (access(path, F_OK) != -1)){ +++ if (!read || (access(path, F_OK) != -1)) { ++ return; ++ } ++ } ++ ++- //If still no path was found set path[0] to '0' and fall through +++ // If still no path was found set path[0] to '0' and fall through ++ path[0] = 0; ++ return; ++ } ++ ++-size_t volk_load_preferences(volk_arch_pref_t **prefs_res) +++size_t volk_load_preferences(volk_arch_pref_t** prefs_res) ++ { ++- FILE *config_file; +++ FILE* config_file; ++ char path[512], line[512]; ++ 
size_t n_arch_prefs = 0; ++- volk_arch_pref_t *prefs = NULL; +++ volk_arch_pref_t* prefs = NULL; ++ ++- //get the config path +++ // get the config path ++ volk_get_config_path(path, true); ++- if (!path[0]) return n_arch_prefs; //no prefs found +++ if (!path[0]) +++ return n_arch_prefs; // no prefs found ++ config_file = fopen(path, "r"); ++- if(!config_file) return n_arch_prefs; //no prefs found +++ if (!config_file) +++ return n_arch_prefs; // no prefs found ++ ++- //reset the file pointer and write the prefs into volk_arch_prefs ++- while(fgets(line, sizeof(line), config_file) != NULL) ++- { ++- void *new_prefs = realloc(prefs, (n_arch_prefs + 1) * sizeof(*prefs)); +++ // reset the file pointer and write the prefs into volk_arch_prefs +++ while (fgets(line, sizeof(line), config_file) != NULL) { +++ void* new_prefs = realloc(prefs, (n_arch_prefs + 1) * sizeof(*prefs)); ++ if (!new_prefs) { ++- printf ("volk_load_preferences: bad malloc\n"); +++ printf("volk_load_preferences: bad malloc\n"); ++ break; ++ } ++- prefs = (volk_arch_pref_t *) new_prefs; ++- volk_arch_pref_t *p = prefs + n_arch_prefs; ++- if(sscanf(line, "%s %s %s", p->name, p->impl_a, p->impl_u) == 3 && !strncmp(p->name, "volk_", 5)) ++- { +++ prefs = (volk_arch_pref_t*)new_prefs; +++ volk_arch_pref_t* p = prefs + n_arch_prefs; +++ if (sscanf(line, "%s %s %s", p->name, p->impl_a, p->impl_u) == 3 && +++ !strncmp(p->name, "volk_", 5)) { ++ n_arch_prefs++; ++ } ++ } ++diff --git a/lib/volk_rank_archs.c b/lib/volk_rank_archs.c ++index 346619e..7cf3fd7 100644 ++--- a/lib/volk_rank_archs.c +++++ b/lib/volk_rank_archs.c ++@@ -24,84 +24,83 @@ ++ #include ++ #include ++ ++-#include ++ #include +++#include ++ ++-int volk_get_index( ++- const char *impl_names[], //list of implementations by name ++- const size_t n_impls, //number of implementations available ++- const char *impl_name //the implementation name to find ++-){ +++int volk_get_index(const char* impl_names[], // list of implementations by name +++ const size_t n_impls, // number of implementations available +++ const char* impl_name // the implementation name to find +++) +++{ ++ unsigned int i; ++ for (i = 0; i < n_impls; i++) { ++- if(!strncmp(impl_names[i], impl_name, 20)) { +++ if (!strncmp(impl_names[i], impl_name, 20)) { ++ return i; ++ } ++ } ++- //TODO return -1; ++- //something terrible should happen here +++ // TODO return -1; +++ // something terrible should happen here ++ fprintf(stderr, "Volk warning: no arch found, returning generic impl\n"); ++- return volk_get_index(impl_names, n_impls, "generic"); //but we'll fake it for now +++ return volk_get_index(impl_names, n_impls, "generic"); // but we'll fake it for now ++ } ++ ++-int volk_rank_archs( ++- const char *kern_name, //name of the kernel to rank ++- const char *impl_names[], //list of implementations by name ++- const int* impl_deps, //requirement mask per implementation ++- const bool* alignment, //alignment status of each implementation ++- size_t n_impls, //number of implementations available ++- const bool align //if false, filter aligned implementations +++int volk_rank_archs(const char* kern_name, // name of the kernel to rank +++ const char* impl_names[], // list of implementations by name +++ const int* impl_deps, // requirement mask per implementation +++ const bool* alignment, // alignment status of each implementation +++ size_t n_impls, // number of implementations available +++ const bool align // if false, filter aligned implementations ++ ) ++ { ++ size_t i; ++- static volk_arch_pref_t 
*volk_arch_prefs; +++ static volk_arch_pref_t* volk_arch_prefs; ++ static size_t n_arch_prefs = 0; ++ static int prefs_loaded = 0; ++- if(!prefs_loaded) { +++ if (!prefs_loaded) { ++ n_arch_prefs = volk_load_preferences(&volk_arch_prefs); ++ prefs_loaded = 1; ++ } ++ ++ // If we've defined VOLK_GENERIC to be anything, always return the ++ // 'generic' kernel. Used in GR's QA code. ++- char *gen_env = getenv("VOLK_GENERIC"); ++- if(gen_env) { ++- return volk_get_index(impl_names, n_impls, "generic"); +++ char* gen_env = getenv("VOLK_GENERIC"); +++ if (gen_env) { +++ return volk_get_index(impl_names, n_impls, "generic"); ++ } ++ ++- //now look for the function name in the prefs list ++- for(i = 0; i < n_arch_prefs; i++) ++- { ++- if(!strncmp(kern_name, volk_arch_prefs[i].name, sizeof(volk_arch_prefs[i].name))) //found it +++ // now look for the function name in the prefs list +++ for (i = 0; i < n_arch_prefs; i++) { +++ if (!strncmp(kern_name, +++ volk_arch_prefs[i].name, +++ sizeof(volk_arch_prefs[i].name))) // found it ++ { ++- const char *impl_name = align? volk_arch_prefs[i].impl_a : volk_arch_prefs[i].impl_u; +++ const char* impl_name = +++ align ? volk_arch_prefs[i].impl_a : volk_arch_prefs[i].impl_u; ++ return volk_get_index(impl_names, n_impls, impl_name); ++ } ++ } ++ ++- //return the best index with the largest deps +++ // return the best index with the largest deps ++ size_t best_index_a = 0; ++ size_t best_index_u = 0; ++ int best_value_a = -1; ++ int best_value_u = -1; ++- for(i = 0; i < n_impls; i++) ++- { +++ for (i = 0; i < n_impls; i++) { ++ const signed val = impl_deps[i]; ++- if (alignment[i] && val > best_value_a) ++- { +++ if (alignment[i] && val > best_value_a) { ++ best_index_a = i; ++ best_value_a = val; ++ } ++- if (!alignment[i] && val > best_value_u) ++- { +++ if (!alignment[i] && val > best_value_u) { ++ best_index_u = i; ++ best_value_u = val; ++ } ++ } ++ ++- //when align and we found a best aligned, use it ++- if (align && best_value_a != -1) return best_index_a; +++ // when align and we found a best aligned, use it +++ if (align && best_value_a != -1) +++ return best_index_a; ++ ++- //otherwise return the best unaligned +++ // otherwise return the best unaligned ++ return best_index_u; ++ } ++diff --git a/lib/volk_rank_archs.h b/lib/volk_rank_archs.h ++index b3bf8ff..9434778 100644 ++--- a/lib/volk_rank_archs.h +++++ b/lib/volk_rank_archs.h ++@@ -22,26 +22,24 @@ ++ #ifndef INCLUDED_VOLK_RANK_ARCHS_H ++ #define INCLUDED_VOLK_RANK_ARCHS_H ++ ++-#include ++ #include +++#include ++ ++ #ifdef __cplusplus ++ extern "C" { ++ #endif ++ ++-int volk_get_index( ++- const char *impl_names[], //list of implementations by name ++- const size_t n_impls, //number of implementations available ++- const char *impl_name //the implementation name to find +++int volk_get_index(const char* impl_names[], // list of implementations by name +++ const size_t n_impls, // number of implementations available +++ const char* impl_name // the implementation name to find ++ ); ++ ++-int volk_rank_archs( ++- const char *kern_name, //name of the kernel to rank ++- const char *impl_names[], //list of implementations by name ++- const int* impl_deps, //requirement mask per implementation ++- const bool* alignment, //alignment status of each implementation ++- size_t n_impls, //number of implementations available ++- const bool align //if false, filter aligned implementations +++int volk_rank_archs(const char* kern_name, // name of the kernel to rank +++ const char* impl_names[], // list of 
implementations by name +++ const int* impl_deps, // requirement mask per implementation +++ const bool* alignment, // alignment status of each implementation +++ size_t n_impls, // number of implementations available +++ const bool align // if false, filter aligned implementations ++ ); ++ ++ #ifdef __cplusplus ++-- ++2.20.1 ++ diff --cc debian/patches/0004-clang-format-Update-PR-with-GitHub-Action.patch index 0000000,0000000..6db7e6c new file mode 100644 --- /dev/null +++ b/debian/patches/0004-clang-format-Update-PR-with-GitHub-Action.patch @@@ -1,0 -1,0 +1,53 @@@ ++From d1a4cc1f775b73c8a14ec2a27513f1d1cc977513 Mon Sep 17 00:00:00 2001 ++From: Johannes Demel ++Date: Tue, 17 Mar 2020 21:53:08 +0100 ++Subject: [PATCH 4/7] clang-format: Update PR with GitHub Action ++ ++--- ++ .github/workflows/check-pr-formatting.yml | 19 +++++++++++++++++++ ++ include/volk/volk_common.h | 2 +- ++ 2 files changed, 20 insertions(+), 1 deletion(-) ++ create mode 100644 .github/workflows/check-pr-formatting.yml ++ ++diff --git a/.github/workflows/check-pr-formatting.yml b/.github/workflows/check-pr-formatting.yml ++new file mode 100644 ++index 0000000..b1d2d83 ++--- /dev/null +++++ b/.github/workflows/check-pr-formatting.yml ++@@ -0,0 +1,19 @@ +++name: Check PR Formatting +++ +++on: +++ push: +++ pull_request: +++ paths-ignore: +++ - 'tmpl/' +++ +++jobs: +++ build: +++ runs-on: ubuntu-latest +++ +++ steps: +++ - uses: actions/checkout@v2 +++ - uses: gnuradio/clang-format-lint-action@v0.5-4 +++ with: +++ source: '.' +++ exclude: './volk' +++ extensions: 'c,cc,cpp,cxx,h,hh' ++\ No newline at end of file ++diff --git a/include/volk/volk_common.h b/include/volk/volk_common.h ++index 8167d23..7e78358 100644 ++--- a/include/volk/volk_common.h +++++ b/include/volk/volk_common.h ++@@ -69,7 +69,7 @@ ++ //////////////////////////////////////////////////////////////////////// ++ #if defined(_MSC_VER) ++ #pragma warning(disable : 4244) //'conversion' conversion from 'type1' to 'type2', ++- //possible loss of data +++ // possible loss of data ++ #pragma warning(disable : 4305) //'identifier' : truncation from 'type1' to 'type2' ++ #endif ++ ++-- ++2.20.1 ++ diff --cc debian/patches/0005-clang-format-Rebase-onto-current-master.patch index 0000000,0000000..b83819a new file mode 100644 --- /dev/null +++ b/debian/patches/0005-clang-format-Rebase-onto-current-master.patch @@@ -1,0 -1,0 +1,409 @@@ ++From 1ed5fa23ad4b298bd2685d2891abfabf14b601e0 Mon Sep 17 00:00:00 2001 ++From: Johannes Demel ++Date: Tue, 17 Mar 2020 22:07:07 +0100 ++Subject: [PATCH 5/7] clang-format: Rebase onto current master ++ ++This commit applies clang format to the latest master branch. ++--- ++ .github/workflows/check-pr-formatting.yml | 4 +- ++ include/volk/volk_common.h | 18 +- ++ kernels/volk/volk_32f_exp_32f.h | 302 +++++++++++----------- ++ 3 files changed, 163 insertions(+), 161 deletions(-) ++ ++diff --git a/.github/workflows/check-pr-formatting.yml b/.github/workflows/check-pr-formatting.yml ++index b1d2d83..9c7a286 100644 ++--- a/.github/workflows/check-pr-formatting.yml +++++ b/.github/workflows/check-pr-formatting.yml ++@@ -2,6 +2,8 @@ name: Check PR Formatting ++ ++ on: ++ push: +++ paths-ignore: +++ - 'tmpl/' ++ pull_request: ++ paths-ignore: ++ - 'tmpl/' ++@@ -15,5 +17,5 @@ jobs: ++ - uses: gnuradio/clang-format-lint-action@v0.5-4 ++ with: ++ source: '.' 
++- exclude: './volk' +++ exclude: './tmpl' ++ extensions: 'c,cc,cpp,cxx,h,hh' ++\ No newline at end of file ++diff --git a/include/volk/volk_common.h b/include/volk/volk_common.h ++index 7e78358..4e14982 100644 ++--- a/include/volk/volk_common.h +++++ b/include/volk/volk_common.h ++@@ -5,15 +5,15 @@ ++ // Cross-platform attribute macros ++ //////////////////////////////////////////////////////////////////////// ++ #if _MSC_VER ++-# define __VOLK_ATTR_ALIGNED(x) __declspec(align(x)) ++-# define __VOLK_ATTR_UNUSED ++-# define __VOLK_ATTR_INLINE __forceinline ++-# define __VOLK_ATTR_DEPRECATED __declspec(deprecated) ++-# define __VOLK_ATTR_EXPORT __declspec(dllexport) ++-# define __VOLK_ATTR_IMPORT __declspec(dllimport) ++-# define __VOLK_PREFETCH(addr) ++-# define __VOLK_ASM __asm ++-# define __VOLK_VOLATILE +++#define __VOLK_ATTR_ALIGNED(x) __declspec(align(x)) +++#define __VOLK_ATTR_UNUSED +++#define __VOLK_ATTR_INLINE __forceinline +++#define __VOLK_ATTR_DEPRECATED __declspec(deprecated) +++#define __VOLK_ATTR_EXPORT __declspec(dllexport) +++#define __VOLK_ATTR_IMPORT __declspec(dllimport) +++#define __VOLK_PREFETCH(addr) +++#define __VOLK_ASM __asm +++#define __VOLK_VOLATILE ++ #elif defined(__clang__) ++ // AppleClang also defines __GNUC__, so do this check first. These ++ // will probably be the same as for __GNUC__, but let's keep them ++diff --git a/kernels/volk/volk_32f_exp_32f.h b/kernels/volk/volk_32f_exp_32f.h ++index 26fdf02..da4ada7 100644 ++--- a/kernels/volk/volk_32f_exp_32f.h +++++ b/kernels/volk/volk_32f_exp_32f.h ++@@ -92,9 +92,9 @@ ++ * \endcode ++ */ ++ ++-#include ++-#include ++ #include +++#include +++#include ++ ++ #ifndef INCLUDED_volk_32f_exp_32f_a_H ++ #define INCLUDED_volk_32f_exp_32f_a_H ++@@ -105,74 +105,74 @@ ++ static inline void ++ volk_32f_exp_32f_a_sse2(float* bVector, const float* aVector, unsigned int num_points) ++ { ++- float* bPtr = bVector; ++- const float* aPtr = aVector; ++- ++- unsigned int number = 0; ++- unsigned int quarterPoints = num_points / 4; ++- ++- // Declare variables and constants ++- __m128 aVal, bVal, tmp, fx, mask, pow2n, z, y; ++- __m128 one, exp_hi, exp_lo, log2EF, half, exp_C1, exp_C2; ++- __m128 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5; ++- __m128i emm0, pi32_0x7f; ++- ++- one = _mm_set1_ps(1.0); ++- exp_hi = _mm_set1_ps(88.3762626647949); ++- exp_lo = _mm_set1_ps(-88.3762626647949); ++- log2EF = _mm_set1_ps(1.44269504088896341); ++- half = _mm_set1_ps(0.5); ++- exp_C1 = _mm_set1_ps(0.693359375); ++- exp_C2 = _mm_set1_ps(-2.12194440e-4); ++- pi32_0x7f = _mm_set1_epi32(0x7f); ++- ++- exp_p0 = _mm_set1_ps(1.9875691500e-4); ++- exp_p1 = _mm_set1_ps(1.3981999507e-3); ++- exp_p2 = _mm_set1_ps(8.3334519073e-3); ++- exp_p3 = _mm_set1_ps(4.1665795894e-2); ++- exp_p4 = _mm_set1_ps(1.6666665459e-1); ++- exp_p5 = _mm_set1_ps(5.0000001201e-1); ++- ++- for(;number < quarterPoints; number++) { ++- aVal = _mm_load_ps(aPtr); ++- tmp = _mm_setzero_ps(); ++- ++- aVal = _mm_max_ps(_mm_min_ps(aVal, exp_hi), exp_lo); ++- ++- /* express exp(x) as exp(g + n*log(2)) */ ++- fx = _mm_add_ps(_mm_mul_ps(aVal, log2EF), half); ++- ++- emm0 = _mm_cvttps_epi32(fx); ++- tmp = _mm_cvtepi32_ps(emm0); ++- ++- mask = _mm_and_ps(_mm_cmpgt_ps(tmp, fx), one); ++- fx = _mm_sub_ps(tmp, mask); ++- ++- tmp = _mm_mul_ps(fx, exp_C1); ++- z = _mm_mul_ps(fx, exp_C2); ++- aVal = _mm_sub_ps(_mm_sub_ps(aVal, tmp), z); ++- z = _mm_mul_ps(aVal, aVal); ++- ++- y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(exp_p0, aVal), exp_p1), aVal); ++- y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, 
exp_p2), aVal), exp_p3); ++- y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(y, aVal), exp_p4), aVal); ++- y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p5), z), aVal); ++- y = _mm_add_ps(y, one); ++- ++- emm0 = _mm_slli_epi32(_mm_add_epi32(_mm_cvttps_epi32(fx), pi32_0x7f), 23); ++- ++- pow2n = _mm_castsi128_ps(emm0); ++- bVal = _mm_mul_ps(y, pow2n); ++- ++- _mm_store_ps(bPtr, bVal); ++- aPtr += 4; ++- bPtr += 4; ++- } ++- ++- number = quarterPoints * 4; ++- for(;number < num_points; number++) { ++- *bPtr++ = expf(*aPtr++); ++- } +++ float* bPtr = bVector; +++ const float* aPtr = aVector; +++ +++ unsigned int number = 0; +++ unsigned int quarterPoints = num_points / 4; +++ +++ // Declare variables and constants +++ __m128 aVal, bVal, tmp, fx, mask, pow2n, z, y; +++ __m128 one, exp_hi, exp_lo, log2EF, half, exp_C1, exp_C2; +++ __m128 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5; +++ __m128i emm0, pi32_0x7f; +++ +++ one = _mm_set1_ps(1.0); +++ exp_hi = _mm_set1_ps(88.3762626647949); +++ exp_lo = _mm_set1_ps(-88.3762626647949); +++ log2EF = _mm_set1_ps(1.44269504088896341); +++ half = _mm_set1_ps(0.5); +++ exp_C1 = _mm_set1_ps(0.693359375); +++ exp_C2 = _mm_set1_ps(-2.12194440e-4); +++ pi32_0x7f = _mm_set1_epi32(0x7f); +++ +++ exp_p0 = _mm_set1_ps(1.9875691500e-4); +++ exp_p1 = _mm_set1_ps(1.3981999507e-3); +++ exp_p2 = _mm_set1_ps(8.3334519073e-3); +++ exp_p3 = _mm_set1_ps(4.1665795894e-2); +++ exp_p4 = _mm_set1_ps(1.6666665459e-1); +++ exp_p5 = _mm_set1_ps(5.0000001201e-1); +++ +++ for (; number < quarterPoints; number++) { +++ aVal = _mm_load_ps(aPtr); +++ tmp = _mm_setzero_ps(); +++ +++ aVal = _mm_max_ps(_mm_min_ps(aVal, exp_hi), exp_lo); +++ +++ /* express exp(x) as exp(g + n*log(2)) */ +++ fx = _mm_add_ps(_mm_mul_ps(aVal, log2EF), half); +++ +++ emm0 = _mm_cvttps_epi32(fx); +++ tmp = _mm_cvtepi32_ps(emm0); +++ +++ mask = _mm_and_ps(_mm_cmpgt_ps(tmp, fx), one); +++ fx = _mm_sub_ps(tmp, mask); +++ +++ tmp = _mm_mul_ps(fx, exp_C1); +++ z = _mm_mul_ps(fx, exp_C2); +++ aVal = _mm_sub_ps(_mm_sub_ps(aVal, tmp), z); +++ z = _mm_mul_ps(aVal, aVal); +++ +++ y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(exp_p0, aVal), exp_p1), aVal); +++ y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p2), aVal), exp_p3); +++ y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(y, aVal), exp_p4), aVal); +++ y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p5), z), aVal); +++ y = _mm_add_ps(y, one); +++ +++ emm0 = _mm_slli_epi32(_mm_add_epi32(_mm_cvttps_epi32(fx), pi32_0x7f), 23); +++ +++ pow2n = _mm_castsi128_ps(emm0); +++ bVal = _mm_mul_ps(y, pow2n); +++ +++ _mm_store_ps(bPtr, bVal); +++ aPtr += 4; +++ bPtr += 4; +++ } +++ +++ number = quarterPoints * 4; +++ for (; number < num_points; number++) { +++ *bPtr++ = expf(*aPtr++); +++ } ++ } ++ ++ #endif /* LV_HAVE_SSE2 for aligned */ ++@@ -183,13 +183,13 @@ volk_32f_exp_32f_a_sse2(float* bVector, const float* aVector, unsigned int num_p ++ static inline void ++ volk_32f_exp_32f_a_generic(float* bVector, const float* aVector, unsigned int num_points) ++ { ++- float* bPtr = bVector; ++- const float* aPtr = aVector; ++- unsigned int number = 0; +++ float* bPtr = bVector; +++ const float* aPtr = aVector; +++ unsigned int number = 0; ++ ++- for(number = 0; number < num_points; number++) { ++- *bPtr++ = expf(*aPtr++); ++- } +++ for (number = 0; number < num_points; number++) { +++ *bPtr++ = expf(*aPtr++); +++ } ++ } ++ ++ #endif /* LV_HAVE_GENERIC */ ++@@ -205,75 +205,75 @@ volk_32f_exp_32f_a_generic(float* bVector, const float* aVector, unsigned int nu ++ static inline void ++ volk_32f_exp_32f_u_sse2(float* bVector, 
const float* aVector, unsigned int num_points) ++ { ++- float* bPtr = bVector; ++- const float* aPtr = aVector; ++- ++- unsigned int number = 0; ++- unsigned int quarterPoints = num_points / 4; ++- ++- // Declare variables and constants ++- __m128 aVal, bVal, tmp, fx, mask, pow2n, z, y; ++- __m128 one, exp_hi, exp_lo, log2EF, half, exp_C1, exp_C2; ++- __m128 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5; ++- __m128i emm0, pi32_0x7f; ++- ++- one = _mm_set1_ps(1.0); ++- exp_hi = _mm_set1_ps(88.3762626647949); ++- exp_lo = _mm_set1_ps(-88.3762626647949); ++- log2EF = _mm_set1_ps(1.44269504088896341); ++- half = _mm_set1_ps(0.5); ++- exp_C1 = _mm_set1_ps(0.693359375); ++- exp_C2 = _mm_set1_ps(-2.12194440e-4); ++- pi32_0x7f = _mm_set1_epi32(0x7f); ++- ++- exp_p0 = _mm_set1_ps(1.9875691500e-4); ++- exp_p1 = _mm_set1_ps(1.3981999507e-3); ++- exp_p2 = _mm_set1_ps(8.3334519073e-3); ++- exp_p3 = _mm_set1_ps(4.1665795894e-2); ++- exp_p4 = _mm_set1_ps(1.6666665459e-1); ++- exp_p5 = _mm_set1_ps(5.0000001201e-1); ++- ++- ++- for(;number < quarterPoints; number++) { ++- aVal = _mm_loadu_ps(aPtr); ++- tmp = _mm_setzero_ps(); ++- ++- aVal = _mm_max_ps(_mm_min_ps(aVal, exp_hi), exp_lo); ++- ++- /* express exp(x) as exp(g + n*log(2)) */ ++- fx = _mm_add_ps(_mm_mul_ps(aVal, log2EF), half); ++- ++- emm0 = _mm_cvttps_epi32(fx); ++- tmp = _mm_cvtepi32_ps(emm0); ++- ++- mask = _mm_and_ps(_mm_cmpgt_ps(tmp, fx), one); ++- fx = _mm_sub_ps(tmp, mask); ++- ++- tmp = _mm_mul_ps(fx, exp_C1); ++- z = _mm_mul_ps(fx, exp_C2); ++- aVal = _mm_sub_ps(_mm_sub_ps(aVal, tmp), z); ++- z = _mm_mul_ps(aVal, aVal); ++- ++- y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(exp_p0, aVal), exp_p1), aVal); ++- y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p2), aVal), exp_p3); ++- y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(y, aVal), exp_p4), aVal); ++- y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p5), z), aVal); ++- y = _mm_add_ps(y, one); ++- ++- emm0 = _mm_slli_epi32(_mm_add_epi32(_mm_cvttps_epi32(fx), pi32_0x7f), 23); ++- ++- pow2n = _mm_castsi128_ps(emm0); ++- bVal = _mm_mul_ps(y, pow2n); ++- ++- _mm_storeu_ps(bPtr, bVal); ++- aPtr += 4; ++- bPtr += 4; ++- } ++- ++- number = quarterPoints * 4; ++- for(;number < num_points; number++){ ++- *bPtr++ = expf(*aPtr++); ++- } +++ float* bPtr = bVector; +++ const float* aPtr = aVector; +++ +++ unsigned int number = 0; +++ unsigned int quarterPoints = num_points / 4; +++ +++ // Declare variables and constants +++ __m128 aVal, bVal, tmp, fx, mask, pow2n, z, y; +++ __m128 one, exp_hi, exp_lo, log2EF, half, exp_C1, exp_C2; +++ __m128 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5; +++ __m128i emm0, pi32_0x7f; +++ +++ one = _mm_set1_ps(1.0); +++ exp_hi = _mm_set1_ps(88.3762626647949); +++ exp_lo = _mm_set1_ps(-88.3762626647949); +++ log2EF = _mm_set1_ps(1.44269504088896341); +++ half = _mm_set1_ps(0.5); +++ exp_C1 = _mm_set1_ps(0.693359375); +++ exp_C2 = _mm_set1_ps(-2.12194440e-4); +++ pi32_0x7f = _mm_set1_epi32(0x7f); +++ +++ exp_p0 = _mm_set1_ps(1.9875691500e-4); +++ exp_p1 = _mm_set1_ps(1.3981999507e-3); +++ exp_p2 = _mm_set1_ps(8.3334519073e-3); +++ exp_p3 = _mm_set1_ps(4.1665795894e-2); +++ exp_p4 = _mm_set1_ps(1.6666665459e-1); +++ exp_p5 = _mm_set1_ps(5.0000001201e-1); +++ +++ +++ for (; number < quarterPoints; number++) { +++ aVal = _mm_loadu_ps(aPtr); +++ tmp = _mm_setzero_ps(); +++ +++ aVal = _mm_max_ps(_mm_min_ps(aVal, exp_hi), exp_lo); +++ +++ /* express exp(x) as exp(g + n*log(2)) */ +++ fx = _mm_add_ps(_mm_mul_ps(aVal, log2EF), half); +++ +++ emm0 = _mm_cvttps_epi32(fx); +++ tmp = _mm_cvtepi32_ps(emm0); 
+++ +++ mask = _mm_and_ps(_mm_cmpgt_ps(tmp, fx), one); +++ fx = _mm_sub_ps(tmp, mask); +++ +++ tmp = _mm_mul_ps(fx, exp_C1); +++ z = _mm_mul_ps(fx, exp_C2); +++ aVal = _mm_sub_ps(_mm_sub_ps(aVal, tmp), z); +++ z = _mm_mul_ps(aVal, aVal); +++ +++ y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(exp_p0, aVal), exp_p1), aVal); +++ y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p2), aVal), exp_p3); +++ y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(y, aVal), exp_p4), aVal); +++ y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p5), z), aVal); +++ y = _mm_add_ps(y, one); +++ +++ emm0 = _mm_slli_epi32(_mm_add_epi32(_mm_cvttps_epi32(fx), pi32_0x7f), 23); +++ +++ pow2n = _mm_castsi128_ps(emm0); +++ bVal = _mm_mul_ps(y, pow2n); +++ +++ _mm_storeu_ps(bPtr, bVal); +++ aPtr += 4; +++ bPtr += 4; +++ } +++ +++ number = quarterPoints * 4; +++ for (; number < num_points; number++) { +++ *bPtr++ = expf(*aPtr++); +++ } ++ } ++ ++ #endif /* LV_HAVE_SSE2 for unaligned */ ++@@ -284,13 +284,13 @@ volk_32f_exp_32f_u_sse2(float* bVector, const float* aVector, unsigned int num_p ++ static inline void ++ volk_32f_exp_32f_u_generic(float* bVector, const float* aVector, unsigned int num_points) ++ { ++- float* bPtr = bVector; ++- const float* aPtr = aVector; ++- unsigned int number = 0; +++ float* bPtr = bVector; +++ const float* aPtr = aVector; +++ unsigned int number = 0; ++ ++- for(number = 0; number < num_points; number++){ ++- *bPtr++ = expf(*aPtr++); ++- } +++ for (number = 0; number < num_points; number++) { +++ *bPtr++ = expf(*aPtr++); +++ } ++ } ++ ++ #endif /* LV_HAVE_GENERIC */ ++-- ++2.20.1 ++ diff --cc debian/patches/0006-Fix-the-broken-index-max-kernels.patch index 0000000,0000000..fdeed34 new file mode 100644 --- /dev/null +++ b/debian/patches/0006-Fix-the-broken-index-max-kernels.patch @@@ -1,0 -1,0 +1,882 @@@ ++From 67cbe6fe2aa73608a07c8c294313c42e8ff4d661 Mon Sep 17 00:00:00 2001 ++From: Clayton Smith ++Date: Sat, 21 Mar 2020 14:59:24 -0400 ++Subject: [PATCH 6/7] Fix the broken index max kernels ++ ++--- ++ kernels/volk/volk_32fc_index_max_16u.h | 299 ++++++------------------- ++ kernels/volk/volk_32fc_index_max_32u.h | 258 ++++++--------------- ++ 2 files changed, 128 insertions(+), 429 deletions(-) ++ ++diff --git a/kernels/volk/volk_32fc_index_max_16u.h b/kernels/volk/volk_32fc_index_max_16u.h ++index b9f9cfd..16e76cd 100644 ++--- a/kernels/volk/volk_32fc_index_max_16u.h +++++ b/kernels/volk/volk_32fc_index_max_16u.h ++@@ -1,6 +1,6 @@ ++ /* -*- c++ -*- */ ++ /* ++- * Copyright 2012, 2014 Free Software Foundation, Inc. +++ * Copyright 2012, 2014-2016, 2018-2020 Free Software Foundation, Inc. ++ * ++ * This file is part of GNU Radio ++ * ++@@ -36,8 +36,8 @@ ++ * ++ * Dispatcher Prototype ++ * \code ++- * void volk_32fc_index_max_16u(uint16_t* target, lv_32fc_t* src0, uint32_t num_points) ++- * \endcode +++ * void volk_32fc_index_max_16u(uint16_t* target, lv_32fc_t* src0, uint32_t +++ * num_points) \endcode ++ * ++ * \b Inputs ++ * \li src0: The complex input vector. ++@@ -89,33 +89,32 @@ static inline void ++ volk_32fc_index_max_16u_a_avx2(uint16_t* target, lv_32fc_t* src0, uint32_t num_points) ++ { ++ num_points = (num_points > USHRT_MAX) ? 
USHRT_MAX : num_points; ++- // Branchless version, if we think it'll make a difference ++- // num_points = USHRT_MAX ^ ((num_points ^ USHRT_MAX) & -(num_points < USHRT_MAX)); ++- ++ const uint32_t num_bytes = num_points * 8; ++ ++ union bit256 holderf; ++ union bit256 holderi; ++ float sq_dist = 0.0; +++ float max = 0.0; +++ uint16_t index = 0; ++ ++ union bit256 xmm5, xmm4; ++ __m256 xmm1, xmm2, xmm3; ++- __m256i xmm8, xmm11, xmm12, xmmfive, xmmfour, xmm9, holder0, holder1, xmm10; +++ __m256i xmm8, xmm11, xmm12, xmm9, xmm10; ++ ++- xmm5.int_vec = xmmfive = _mm256_setzero_si256(); ++- xmm4.int_vec = xmmfour = _mm256_setzero_si256(); ++- holderf.int_vec = holder0 = _mm256_setzero_si256(); ++- holderi.int_vec = holder1 = _mm256_setzero_si256(); +++ xmm5.int_vec = _mm256_setzero_si256(); +++ xmm4.int_vec = _mm256_setzero_si256(); +++ holderf.int_vec = _mm256_setzero_si256(); +++ holderi.int_vec = _mm256_setzero_si256(); ++ ++ int bound = num_bytes >> 6; ++ int i = 0; ++ ++- xmm8 = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0); ++- xmm9 = _mm256_setzero_si256(); //=xmm8 +++ xmm8 = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7); +++ xmm9 = _mm256_setzero_si256(); ++ xmm10 = _mm256_set1_epi32(8); ++ xmm3 = _mm256_setzero_ps(); ++ ++- __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0); +++ __m256i idx = _mm256_setr_epi32(0, 1, 4, 5, 2, 3, 6, 7); ++ for (; i < bound; ++i) { ++ xmm1 = _mm256_load_ps((float*)src0); ++ xmm2 = _mm256_load_ps((float*)&src0[4]); ++@@ -140,105 +139,27 @@ volk_32fc_index_max_16u_a_avx2(uint16_t* target, lv_32fc_t* src0, uint32_t num_p ++ ++ xmm8 = _mm256_add_epi32(xmm8, xmm10); ++ } ++- xmm10 = _mm256_set1_epi32(4); ++- if (num_bytes >> 5 & 1) { ++- xmm1 = _mm256_load_ps((float*)src0); ++- ++- src0 += 4; ++- ++- xmm1 = _mm256_mul_ps(xmm1, xmm1); ++ ++- xmm1 = _mm256_hadd_ps(xmm1, xmm1); ++- xmm1 = _mm256_permutevar8x32_ps(xmm1, idx); ++- ++- xmm3 = _mm256_max_ps(xmm1, xmm3); ++- ++- xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS); ++- xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ); ++- ++- xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec); ++- xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec); ++- ++- xmm9 = _mm256_add_epi32(xmm11, xmm12); +++ _mm256_store_ps((float*)&(holderf.f), xmm3); +++ _mm256_store_si256(&(holderi.int_vec), xmm9); ++ ++- xmm8 = _mm256_add_epi32(xmm8, xmm10); +++ for (i = 0; i < 8; i++) { +++ if (holderf.f[i] > max) { +++ index = holderi.i[i]; +++ max = holderf.f[i]; +++ } ++ } ++ ++- idx = _mm256_set_epi32(1, 0, 1, 0, 1, 0, 1, 0); ++- xmm10 = _mm256_set1_epi32(2); ++- if (num_bytes >> 4 & 1) { ++- xmm2 = _mm256_load_ps((float*)src0); ++- ++- xmm1 = _mm256_permutevar8x32_ps(bit256_p(&xmm8)->float_vec, idx); ++- xmm8 = bit256_p(&xmm1)->int_vec; ++- ++- xmm2 = _mm256_mul_ps(xmm2, xmm2); ++- ++- src0 += 2; ++- ++- xmm1 = _mm256_hadd_ps(xmm2, xmm2); ++- ++- xmm3 = _mm256_max_ps(xmm1, xmm3); ++- ++- xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS); ++- xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ); ++- ++- xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec); ++- xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec); ++- ++- xmm9 = _mm256_add_epi32(xmm11, xmm12); +++ for (i = bound * 8; i < num_points; i++, src0++) { +++ sq_dist = +++ lv_creal(src0[0]) * lv_creal(src0[0]) + lv_cimag(src0[0]) * lv_cimag(src0[0]); ++ ++- xmm8 = _mm256_add_epi32(xmm8, xmm10); +++ if (sq_dist > max) { +++ index = i; +++ max = sq_dist; +++ } ++ } ++- ++- /* ++- idx = _mm256_setzero_si256(); ++- for(i = 0; i < leftovers2; ++i) { ++- //printf("%u, %u, %u, %u\n", 
((uint32_t*)&xmm9)[0], ((uint32_t*)&xmm9)[1], ++- ((uint32_t*)&xmm9)[2], ((uint32_t*)&xmm9)[3]); ++- ++- sq_dist = lv_creal(src0[0]) * lv_creal(src0[0]) + lv_cimag(src0[0]) * ++- lv_cimag(src0[0]); ++- ++- //xmm = _mm_load1_ps(&sq_dist);//insert? ++- xmm2 = _mm256_set1_ps(sq_dist); ++- //xmm2 = _mm256_insertf128_ps(xmm2, xmm, 0); ++- ++- xmm1 = xmm3; ++- ++- xmm3 = _mm256_max_ps(xmm3, xmm2);//only lowest 32bit value ++- xmm3 = _mm256_permutevar8x32_ps(xmm3, idx); ++- ++- xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS); ++- xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ); ++- ++- xmm8 = _mm256_permutevar8x32_epi32(xmm8, idx); ++- ++- xmm11 = _mm256_and_si256(xmm8, xmm4.int_vec); ++- xmm12 = _mm256_and_si256(xmm9, xmm5.int_vec); ++- ++- xmm9 = _mm256_add_epi32(xmm11, xmm12); ++- }*/ ++- ++- _mm256_store_ps((float*)&(holderf.f), xmm3); ++- _mm256_store_si256(&(holderi.int_vec), xmm9); ++- ++- target[0] = holderi.i[0]; ++- sq_dist = holderf.f[0]; ++- target[0] = (holderf.f[1] > sq_dist) ? holderi.i[1] : target[0]; ++- sq_dist = (holderf.f[1] > sq_dist) ? holderf.f[1] : sq_dist; ++- target[0] = (holderf.f[2] > sq_dist) ? holderi.i[2] : target[0]; ++- sq_dist = (holderf.f[2] > sq_dist) ? holderf.f[2] : sq_dist; ++- target[0] = (holderf.f[3] > sq_dist) ? holderi.i[3] : target[0]; ++- sq_dist = (holderf.f[3] > sq_dist) ? holderf.f[3] : sq_dist; ++- target[0] = (holderf.f[4] > sq_dist) ? holderi.i[4] : target[0]; ++- sq_dist = (holderf.f[4] > sq_dist) ? holderf.f[4] : sq_dist; ++- target[0] = (holderf.f[5] > sq_dist) ? holderi.i[5] : target[0]; ++- sq_dist = (holderf.f[5] > sq_dist) ? holderf.f[5] : sq_dist; ++- target[0] = (holderf.f[6] > sq_dist) ? holderi.i[6] : target[0]; ++- sq_dist = (holderf.f[6] > sq_dist) ? holderf.f[6] : sq_dist; ++- target[0] = (holderf.f[7] > sq_dist) ? holderi.i[7] : target[0]; ++- sq_dist = (holderf.f[7] > sq_dist) ? holderf.f[7] : sq_dist; +++ target[0] = index; ++ } ++ ++ #endif /*LV_HAVE_AVX2*/ ++@@ -251,9 +172,6 @@ static inline void ++ volk_32fc_index_max_16u_a_sse3(uint16_t* target, lv_32fc_t* src0, uint32_t num_points) ++ { ++ num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points; ++- // Branchless version, if we think it'll make a difference ++- // num_points = USHRT_MAX ^ ((num_points ^ USHRT_MAX) & -(num_points < USHRT_MAX)); ++- ++ const uint32_t num_bytes = num_points * 8; ++ ++ union bit128 holderf; ++@@ -262,22 +180,20 @@ volk_32fc_index_max_16u_a_sse3(uint16_t* target, lv_32fc_t* src0, uint32_t num_p ++ ++ union bit128 xmm5, xmm4; ++ __m128 xmm1, xmm2, xmm3; ++- __m128i xmm8, xmm11, xmm12, xmmfive, xmmfour, xmm9, holder0, holder1, xmm10; +++ __m128i xmm8, xmm11, xmm12, xmm9, xmm10; ++ ++- xmm5.int_vec = xmmfive = _mm_setzero_si128(); ++- xmm4.int_vec = xmmfour = _mm_setzero_si128(); ++- holderf.int_vec = holder0 = _mm_setzero_si128(); ++- holderi.int_vec = holder1 = _mm_setzero_si128(); +++ xmm5.int_vec = _mm_setzero_si128(); +++ xmm4.int_vec = _mm_setzero_si128(); +++ holderf.int_vec = _mm_setzero_si128(); +++ holderi.int_vec = _mm_setzero_si128(); ++ ++ int bound = num_bytes >> 5; ++ int i = 0; ++ ++- xmm8 = _mm_set_epi32(3, 2, 1, 0); // remember the crazy reverse order! 
+++ xmm8 = _mm_setr_epi32(0, 1, 2, 3); ++ xmm9 = _mm_setzero_si128(); ++- xmm10 = _mm_set_epi32(4, 4, 4, 4); +++ xmm10 = _mm_setr_epi32(4, 4, 4, 4); ++ xmm3 = _mm_setzero_ps(); ++- // printf("%f, %f, %f, %f\n", ((float*)&xmm10)[0], ((float*)&xmm10)[1], ++- // ((float*)&xmm10)[2], ((float*)&xmm10)[3]); ++ ++ for (; i < bound; ++i) { ++ xmm1 = _mm_load_ps((float*)src0); ++@@ -301,14 +217,8 @@ volk_32fc_index_max_16u_a_sse3(uint16_t* target, lv_32fc_t* src0, uint32_t num_p ++ xmm9 = _mm_add_epi32(xmm11, xmm12); ++ ++ xmm8 = _mm_add_epi32(xmm8, xmm10); ++- ++- // printf("%f, %f, %f, %f\n", ((float*)&xmm3)[0], ((float*)&xmm3)[1], ++- // ((float*)&xmm3)[2], ((float*)&xmm3)[3]); printf("%u, %u, %u, %u\n", ++- // ((uint32_t*)&xmm10)[0], ((uint32_t*)&xmm10)[1], ((uint32_t*)&xmm10)[2], ++- // ((uint32_t*)&xmm10)[3]); ++ } ++ ++- ++ if (num_bytes >> 4 & 1) { ++ xmm2 = _mm_load_ps((float*)src0); ++ ++@@ -323,7 +233,7 @@ volk_32fc_index_max_16u_a_sse3(uint16_t* target, lv_32fc_t* src0, uint32_t num_p ++ ++ xmm3 = _mm_max_ps(xmm1, xmm3); ++ ++- xmm10 = _mm_set_epi32(2, 2, 2, 2); // load1_ps((float*)&init[2]); +++ xmm10 = _mm_setr_epi32(2, 2, 2, 2); ++ ++ xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3); ++ xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3); ++@@ -334,14 +244,9 @@ volk_32fc_index_max_16u_a_sse3(uint16_t* target, lv_32fc_t* src0, uint32_t num_p ++ xmm9 = _mm_add_epi32(xmm11, xmm12); ++ ++ xmm8 = _mm_add_epi32(xmm8, xmm10); ++- // printf("egads%u, %u, %u, %u\n", ((uint32_t*)&xmm9)[0], ((uint32_t*)&xmm9)[1], ++- // ((uint32_t*)&xmm9)[2], ((uint32_t*)&xmm9)[3]); ++ } ++ ++ if (num_bytes >> 3 & 1) { ++- // printf("%u, %u, %u, %u\n", ((uint32_t*)&xmm9)[0], ((uint32_t*)&xmm9)[1], ++- // ((uint32_t*)&xmm9)[2], ((uint32_t*)&xmm9)[3]); ++- ++ sq_dist = ++ lv_creal(src0[0]) * lv_creal(src0[0]) + lv_cimag(src0[0]) * lv_cimag(src0[0]); ++ ++@@ -362,11 +267,6 @@ volk_32fc_index_max_16u_a_sse3(uint16_t* target, lv_32fc_t* src0, uint32_t num_p ++ xmm9 = _mm_add_epi32(xmm11, xmm12); ++ } ++ ++- // printf("%f, %f, %f, %f\n", ((float*)&xmm3)[0], ((float*)&xmm3)[1], ++- // ((float*)&xmm3)[2], ((float*)&xmm3)[3]); printf("%u, %u, %u, %u\n", ++- // ((uint32_t*)&xmm9)[0], ((uint32_t*)&xmm9)[1], ((uint32_t*)&xmm9)[2], ++- // ((uint32_t*)&xmm9)[3]); ++- ++ _mm_store_ps((float*)&(holderf.f), xmm3); ++ _mm_store_si128(&(holderi.int_vec), xmm9); ++ ++@@ -378,25 +278,6 @@ volk_32fc_index_max_16u_a_sse3(uint16_t* target, lv_32fc_t* src0, uint32_t num_p ++ sq_dist = (holderf.f[2] > sq_dist) ? holderf.f[2] : sq_dist; ++ target[0] = (holderf.f[3] > sq_dist) ? holderi.i[3] : target[0]; ++ sq_dist = (holderf.f[3] > sq_dist) ? holderf.f[3] : sq_dist; ++- ++- /* ++- float placeholder = 0.0; ++- uint32_t temp0, temp1; ++- uint32_t g0 = (((float*)&xmm3)[0] > ((float*)&xmm3)[1]); ++- uint32_t l0 = g0 ^ 1; ++- ++- uint32_t g1 = (((float*)&xmm3)[1] > ((float*)&xmm3)[2]); ++- uint32_t l1 = g1 ^ 1; ++- ++- temp0 = g0 * ((uint32_t*)&xmm9)[0] + l0 * ((uint32_t*)&xmm9)[1]; ++- temp1 = g0 * ((uint32_t*)&xmm9)[2] + l0 * ((uint32_t*)&xmm9)[3]; ++- sq_dist = g0 * ((float*)&xmm3)[0] + l0 * ((float*)&xmm3)[1]; ++- placeholder = g0 * ((float*)&xmm3)[2] + l0 * ((float*)&xmm3)[3]; ++- ++- g0 = (sq_dist > placeholder); ++- l0 = g0 ^ 1; ++- target[0] = g0 * temp0 + l0 * temp1; ++- */ ++ } ++ ++ #endif /*LV_HAVE_SSE3*/ ++@@ -419,18 +300,18 @@ volk_32fc_index_max_16u_generic(uint16_t* target, lv_32fc_t* src0, uint32_t num_ ++ sq_dist = ++ lv_creal(src0[i]) * lv_creal(src0[i]) + lv_cimag(src0[i]) * lv_cimag(src0[i]); ++ ++- index = sq_dist > max ? 
i : index; ++- max = sq_dist > max ? sq_dist : max; +++ if (sq_dist > max) { +++ index = i; +++ max = sq_dist; +++ } ++ } ++ target[0] = index; ++ } ++ ++ #endif /*LV_HAVE_GENERIC*/ ++ ++- ++ #endif /*INCLUDED_volk_32fc_index_max_16u_a_H*/ ++ ++- ++ #ifndef INCLUDED_volk_32fc_index_max_16u_u_H ++ #define INCLUDED_volk_32fc_index_max_16u_u_H ++ ++@@ -447,33 +328,32 @@ static inline void ++ volk_32fc_index_max_16u_u_avx2(uint16_t* target, lv_32fc_t* src0, uint32_t num_points) ++ { ++ num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points; ++- // Branchless version, if we think it'll make a difference ++- // num_points = USHRT_MAX ^ ((num_points ^ USHRT_MAX) & -(num_points < USHRT_MAX)); ++- ++ const uint32_t num_bytes = num_points * 8; ++ ++ union bit256 holderf; ++ union bit256 holderi; ++ float sq_dist = 0.0; +++ float max = 0.0; +++ uint16_t index = 0; ++ ++ union bit256 xmm5, xmm4; ++ __m256 xmm1, xmm2, xmm3; ++- __m256i xmm8, xmm11, xmm12, xmmfive, xmmfour, xmm9, holder0, holder1, xmm10; +++ __m256i xmm8, xmm11, xmm12, xmm9, xmm10; ++ ++- xmm5.int_vec = xmmfive = _mm256_setzero_si256(); ++- xmm4.int_vec = xmmfour = _mm256_setzero_si256(); ++- holderf.int_vec = holder0 = _mm256_setzero_si256(); ++- holderi.int_vec = holder1 = _mm256_setzero_si256(); +++ xmm5.int_vec = _mm256_setzero_si256(); +++ xmm4.int_vec = _mm256_setzero_si256(); +++ holderf.int_vec = _mm256_setzero_si256(); +++ holderi.int_vec = _mm256_setzero_si256(); ++ ++ int bound = num_bytes >> 6; ++ int i = 0; ++ ++- xmm8 = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0); ++- xmm9 = _mm256_setzero_si256(); //=xmm8 +++ xmm8 = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7); +++ xmm9 = _mm256_setzero_si256(); ++ xmm10 = _mm256_set1_epi32(8); ++ xmm3 = _mm256_setzero_ps(); ++ ++- __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0); +++ __m256i idx = _mm256_setr_epi32(0, 1, 4, 5, 2, 3, 6, 7); ++ for (; i < bound; ++i) { ++ xmm1 = _mm256_loadu_ps((float*)src0); ++ xmm2 = _mm256_loadu_ps((float*)&src0[4]); ++@@ -498,76 +378,27 @@ volk_32fc_index_max_16u_u_avx2(uint16_t* target, lv_32fc_t* src0, uint32_t num_p ++ ++ xmm8 = _mm256_add_epi32(xmm8, xmm10); ++ } ++- xmm10 = _mm256_set1_epi32(4); ++- if (num_bytes >> 5 & 1) { ++- xmm1 = _mm256_loadu_ps((float*)src0); ++- ++- src0 += 4; ++ ++- xmm1 = _mm256_mul_ps(xmm1, xmm1); ++- ++- xmm1 = _mm256_hadd_ps(xmm1, xmm1); ++- xmm1 = _mm256_permutevar8x32_ps(xmm1, idx); ++- ++- xmm3 = _mm256_max_ps(xmm1, xmm3); ++- ++- xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS); ++- xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ); ++- ++- xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec); ++- xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec); ++- ++- xmm9 = _mm256_add_epi32(xmm11, xmm12); +++ _mm256_storeu_ps((float*)&(holderf.f), xmm3); +++ _mm256_storeu_si256(&(holderi.int_vec), xmm9); ++ ++- xmm8 = _mm256_add_epi32(xmm8, xmm10); +++ for (i = 0; i < 8; i++) { +++ if (holderf.f[i] > max) { +++ index = holderi.i[i]; +++ max = holderf.f[i]; +++ } ++ } ++ ++- idx = _mm256_set_epi32(1, 0, 1, 0, 1, 0, 1, 0); ++- xmm10 = _mm256_set1_epi32(2); ++- if (num_bytes >> 4 & 1) { ++- xmm2 = _mm256_loadu_ps((float*)src0); ++- ++- xmm1 = _mm256_permutevar8x32_ps(bit256_p(&xmm8)->float_vec, idx); ++- xmm8 = bit256_p(&xmm1)->int_vec; ++- ++- xmm2 = _mm256_mul_ps(xmm2, xmm2); ++- ++- src0 += 2; ++- ++- xmm1 = _mm256_hadd_ps(xmm2, xmm2); ++- ++- xmm3 = _mm256_max_ps(xmm1, xmm3); ++- ++- xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS); ++- xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ); ++- ++- xmm11 = 
_mm256_and_si256(xmm8, xmm5.int_vec); ++- xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec); ++- ++- xmm9 = _mm256_add_epi32(xmm11, xmm12); +++ for (i = bound * 8; i < num_points; i++, src0++) { +++ sq_dist = +++ lv_creal(src0[0]) * lv_creal(src0[0]) + lv_cimag(src0[0]) * lv_cimag(src0[0]); ++ ++- xmm8 = _mm256_add_epi32(xmm8, xmm10); +++ if (sq_dist > max) { +++ index = i; +++ max = sq_dist; +++ } ++ } ++- ++- _mm256_storeu_ps((float*)&(holderf.f), xmm3); ++- _mm256_storeu_si256(&(holderi.int_vec), xmm9); ++- ++- target[0] = holderi.i[0]; ++- sq_dist = holderf.f[0]; ++- target[0] = (holderf.f[1] > sq_dist) ? holderi.i[1] : target[0]; ++- sq_dist = (holderf.f[1] > sq_dist) ? holderf.f[1] : sq_dist; ++- target[0] = (holderf.f[2] > sq_dist) ? holderi.i[2] : target[0]; ++- sq_dist = (holderf.f[2] > sq_dist) ? holderf.f[2] : sq_dist; ++- target[0] = (holderf.f[3] > sq_dist) ? holderi.i[3] : target[0]; ++- sq_dist = (holderf.f[3] > sq_dist) ? holderf.f[3] : sq_dist; ++- target[0] = (holderf.f[4] > sq_dist) ? holderi.i[4] : target[0]; ++- sq_dist = (holderf.f[4] > sq_dist) ? holderf.f[4] : sq_dist; ++- target[0] = (holderf.f[5] > sq_dist) ? holderi.i[5] : target[0]; ++- sq_dist = (holderf.f[5] > sq_dist) ? holderf.f[5] : sq_dist; ++- target[0] = (holderf.f[6] > sq_dist) ? holderi.i[6] : target[0]; ++- sq_dist = (holderf.f[6] > sq_dist) ? holderf.f[6] : sq_dist; ++- target[0] = (holderf.f[7] > sq_dist) ? holderi.i[7] : target[0]; ++- sq_dist = (holderf.f[7] > sq_dist) ? holderf.f[7] : sq_dist; +++ target[0] = index; ++ } ++ ++ #endif /*LV_HAVE_AVX2*/ ++diff --git a/kernels/volk/volk_32fc_index_max_32u.h b/kernels/volk/volk_32fc_index_max_32u.h ++index 7756fc6..556b5fc 100644 ++--- a/kernels/volk/volk_32fc_index_max_32u.h +++++ b/kernels/volk/volk_32fc_index_max_32u.h ++@@ -1,6 +1,6 @@ ++ /* -*- c++ -*- */ ++ /* ++- * Copyright 2016 Free Software Foundation, Inc. +++ * Copyright 2016, 2018-2020 Free Software Foundation, Inc. ++ * ++ * This file is part of GNU Radio ++ * ++@@ -30,8 +30,8 @@ ++ * ++ * Dispatcher Prototype ++ * \code ++- * void volk_32fc_index_max_32u(uint32_t* target, lv_32fc_t* src0, uint32_t num_points) ++- * \endcode +++ * void volk_32fc_index_max_32u(uint32_t* target, lv_32fc_t* src0, uint32_t +++ * num_points) \endcode ++ * ++ * \b Inputs ++ * \li src0: The complex input vector. 
++@@ -86,24 +86,26 @@ volk_32fc_index_max_32u_a_avx2(uint32_t* target, lv_32fc_t* src0, uint32_t num_p ++ union bit256 holderf; ++ union bit256 holderi; ++ float sq_dist = 0.0; +++ float max = 0.0; +++ uint32_t index = 0; ++ ++ union bit256 xmm5, xmm4; ++ __m256 xmm1, xmm2, xmm3; ++- __m256i xmm8, xmm11, xmm12, xmmfive, xmmfour, xmm9, holder0, holder1, xmm10; +++ __m256i xmm8, xmm11, xmm12, xmm9, xmm10; ++ ++- xmm5.int_vec = xmmfive = _mm256_setzero_si256(); ++- xmm4.int_vec = xmmfour = _mm256_setzero_si256(); ++- holderf.int_vec = holder0 = _mm256_setzero_si256(); ++- holderi.int_vec = holder1 = _mm256_setzero_si256(); +++ xmm5.int_vec = _mm256_setzero_si256(); +++ xmm4.int_vec = _mm256_setzero_si256(); +++ holderf.int_vec = _mm256_setzero_si256(); +++ holderi.int_vec = _mm256_setzero_si256(); ++ ++ int bound = num_bytes >> 6; ++ int i = 0; ++ ++- xmm8 = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0); +++ xmm8 = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7); ++ xmm9 = _mm256_setzero_si256(); ++ xmm10 = _mm256_set1_epi32(8); ++ xmm3 = _mm256_setzero_ps(); ++- __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0); +++ __m256i idx = _mm256_setr_epi32(0, 1, 4, 5, 2, 3, 6, 7); ++ ++ for (; i < bound; ++i) { ++ xmm1 = _mm256_load_ps((float*)src0); ++@@ -130,75 +132,26 @@ volk_32fc_index_max_32u_a_avx2(uint32_t* target, lv_32fc_t* src0, uint32_t num_p ++ xmm8 = _mm256_add_epi32(xmm8, xmm10); ++ } ++ ++- xmm10 = _mm256_set1_epi32(4); ++- if (num_bytes >> 4 & 1) { ++- xmm1 = _mm256_load_ps((float*)src0); ++- ++- xmm1 = _mm256_mul_ps(xmm1, xmm1); ++- ++- src0 += 4; ++- ++- xmm1 = _mm256_hadd_ps(xmm1, xmm1); ++- ++- xmm3 = _mm256_max_ps(xmm1, xmm3); ++- ++- xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS); ++- xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ); ++- ++- xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec); ++- xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec); ++- ++- xmm9 = _mm256_add_epi32(xmm11, xmm12); +++ _mm256_store_ps((float*)&(holderf.f), xmm3); +++ _mm256_store_si256(&(holderi.int_vec), xmm9); ++ ++- xmm8 = _mm256_add_epi32(xmm8, xmm10); +++ for (i = 0; i < 8; i++) { +++ if (holderf.f[i] > max) { +++ index = holderi.i[i]; +++ max = holderf.f[i]; +++ } ++ } ++ ++- idx = _mm256_set_epi32(1, 0, 1, 0, 1, 0, 1, 0); ++- xmm10 = _mm256_set1_epi32(2); ++- if (num_bytes >> 4 & 1) { ++- xmm2 = _mm256_load_ps((float*)src0); ++- ++- xmm1 = _mm256_permutevar8x32_ps(bit256_p(&xmm8)->float_vec, idx); ++- xmm8 = bit256_p(&xmm1)->int_vec; ++- ++- xmm2 = _mm256_mul_ps(xmm2, xmm2); ++- ++- src0 += 2; ++- ++- xmm1 = _mm256_hadd_ps(xmm2, xmm2); ++- ++- xmm3 = _mm256_max_ps(xmm1, xmm3); ++- ++- xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS); ++- xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ); ++- ++- xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec); ++- xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec); ++- ++- xmm9 = _mm256_add_epi32(xmm11, xmm12); +++ for (i = bound * 8; i < num_points; i++, src0++) { +++ sq_dist = +++ lv_creal(src0[0]) * lv_creal(src0[0]) + lv_cimag(src0[0]) * lv_cimag(src0[0]); ++ ++- xmm8 = _mm256_add_epi32(xmm8, xmm10); +++ if (sq_dist > max) { +++ index = i; +++ max = sq_dist; +++ } ++ } ++- ++- _mm256_store_ps((float*)&(holderf.f), xmm3); ++- _mm256_store_si256(&(holderi.int_vec), xmm9); ++- ++- target[0] = holderi.i[0]; ++- sq_dist = holderf.f[0]; ++- target[0] = (holderf.f[1] > sq_dist) ? holderi.i[1] : target[0]; ++- sq_dist = (holderf.f[1] > sq_dist) ? holderf.f[1] : sq_dist; ++- target[0] = (holderf.f[2] > sq_dist) ? 
holderi.i[2] : target[0]; ++- sq_dist = (holderf.f[2] > sq_dist) ? holderf.f[2] : sq_dist; ++- target[0] = (holderf.f[3] > sq_dist) ? holderi.i[3] : target[0]; ++- sq_dist = (holderf.f[3] > sq_dist) ? holderf.f[3] : sq_dist; ++- target[0] = (holderf.f[4] > sq_dist) ? holderi.i[4] : target[0]; ++- sq_dist = (holderf.f[4] > sq_dist) ? holderf.f[4] : sq_dist; ++- target[0] = (holderf.f[5] > sq_dist) ? holderi.i[5] : target[0]; ++- sq_dist = (holderf.f[5] > sq_dist) ? holderf.f[5] : sq_dist; ++- target[0] = (holderf.f[6] > sq_dist) ? holderi.i[6] : target[0]; ++- sq_dist = (holderf.f[6] > sq_dist) ? holderf.f[6] : sq_dist; ++- target[0] = (holderf.f[7] > sq_dist) ? holderi.i[7] : target[0]; ++- sq_dist = (holderf.f[7] > sq_dist) ? holderf.f[7] : sq_dist; +++ target[0] = index; ++ } ++ ++ #endif /*LV_HAVE_AVX2*/ ++@@ -218,24 +171,21 @@ volk_32fc_index_max_32u_a_sse3(uint32_t* target, lv_32fc_t* src0, uint32_t num_p ++ ++ union bit128 xmm5, xmm4; ++ __m128 xmm1, xmm2, xmm3; ++- __m128i xmm8, xmm11, xmm12, xmmfive, xmmfour, xmm9, holder0, holder1, xmm10; +++ __m128i xmm8, xmm11, xmm12, xmm9, xmm10; ++ ++- xmm5.int_vec = xmmfive = _mm_setzero_si128(); ++- xmm4.int_vec = xmmfour = _mm_setzero_si128(); ++- holderf.int_vec = holder0 = _mm_setzero_si128(); ++- holderi.int_vec = holder1 = _mm_setzero_si128(); +++ xmm5.int_vec = _mm_setzero_si128(); +++ xmm4.int_vec = _mm_setzero_si128(); +++ holderf.int_vec = _mm_setzero_si128(); +++ holderi.int_vec = _mm_setzero_si128(); ++ ++ int bound = num_bytes >> 5; ++ int i = 0; ++ ++- xmm8 = _mm_set_epi32(3, 2, 1, 0); // remember the crazy reverse order! +++ xmm8 = _mm_setr_epi32(0, 1, 2, 3); ++ xmm9 = _mm_setzero_si128(); ++- xmm10 = _mm_set_epi32(4, 4, 4, 4); +++ xmm10 = _mm_setr_epi32(4, 4, 4, 4); ++ xmm3 = _mm_setzero_ps(); ++ ++- // printf("%f, %f, %f, %f\n", ((float*)&xmm10)[0], ((float*)&xmm10)[1], ++- // ((float*)&xmm10)[2], ((float*)&xmm10)[3]); ++- ++ for (; i < bound; ++i) { ++ xmm1 = _mm_load_ps((float*)src0); ++ xmm2 = _mm_load_ps((float*)&src0[2]); ++@@ -258,14 +208,8 @@ volk_32fc_index_max_32u_a_sse3(uint32_t* target, lv_32fc_t* src0, uint32_t num_p ++ xmm9 = _mm_add_epi32(xmm11, xmm12); ++ ++ xmm8 = _mm_add_epi32(xmm8, xmm10); ++- ++- // printf("%f, %f, %f, %f\n", ((float*)&xmm3)[0], ((float*)&xmm3)[1], ++- // ((float*)&xmm3)[2], ((float*)&xmm3)[3]); printf("%u, %u, %u, %u\n", ++- // ((uint32_t*)&xmm10)[0], ((uint32_t*)&xmm10)[1], ((uint32_t*)&xmm10)[2], ++- // ((uint32_t*)&xmm10)[3]); ++ } ++ ++- ++ if (num_bytes >> 4 & 1) { ++ xmm2 = _mm_load_ps((float*)src0); ++ ++@@ -280,7 +224,7 @@ volk_32fc_index_max_32u_a_sse3(uint32_t* target, lv_32fc_t* src0, uint32_t num_p ++ ++ xmm3 = _mm_max_ps(xmm1, xmm3); ++ ++- xmm10 = _mm_set_epi32(2, 2, 2, 2); // load1_ps((float*)&init[2]); +++ xmm10 = _mm_setr_epi32(2, 2, 2, 2); ++ ++ xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3); ++ xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3); ++@@ -291,14 +235,9 @@ volk_32fc_index_max_32u_a_sse3(uint32_t* target, lv_32fc_t* src0, uint32_t num_p ++ xmm9 = _mm_add_epi32(xmm11, xmm12); ++ ++ xmm8 = _mm_add_epi32(xmm8, xmm10); ++- // printf("egads%u, %u, %u, %u\n", ((uint32_t*)&xmm9)[0], ((uint32_t*)&xmm9)[1], ++- // ((uint32_t*)&xmm9)[2], ((uint32_t*)&xmm9)[3]); ++ } ++ ++ if (num_bytes >> 3 & 1) { ++- // printf("%u, %u, %u, %u\n", ((uint32_t*)&xmm9)[0], ((uint32_t*)&xmm9)[1], ++- // ((uint32_t*)&xmm9)[2], ((uint32_t*)&xmm9)[3]); ++- ++ sq_dist = ++ lv_creal(src0[0]) * lv_creal(src0[0]) + lv_cimag(src0[0]) * lv_cimag(src0[0]); ++ ++@@ -319,11 +258,6 @@ 
volk_32fc_index_max_32u_a_sse3(uint32_t* target, lv_32fc_t* src0, uint32_t num_p ++ xmm9 = _mm_add_epi32(xmm11, xmm12); ++ } ++ ++- // printf("%f, %f, %f, %f\n", ((float*)&xmm3)[0], ((float*)&xmm3)[1], ++- // ((float*)&xmm3)[2], ((float*)&xmm3)[3]); printf("%u, %u, %u, %u\n", ++- // ((uint32_t*)&xmm9)[0], ((uint32_t*)&xmm9)[1], ((uint32_t*)&xmm9)[2], ++- // ((uint32_t*)&xmm9)[3]); ++- ++ _mm_store_ps((float*)&(holderf.f), xmm3); ++ _mm_store_si128(&(holderi.int_vec), xmm9); ++ ++@@ -335,25 +269,6 @@ volk_32fc_index_max_32u_a_sse3(uint32_t* target, lv_32fc_t* src0, uint32_t num_p ++ sq_dist = (holderf.f[2] > sq_dist) ? holderf.f[2] : sq_dist; ++ target[0] = (holderf.f[3] > sq_dist) ? holderi.i[3] : target[0]; ++ sq_dist = (holderf.f[3] > sq_dist) ? holderf.f[3] : sq_dist; ++- ++- /* ++- float placeholder = 0.0; ++- uint32_t temp0, temp1; ++- uint32_t g0 = (((float*)&xmm3)[0] > ((float*)&xmm3)[1]); ++- uint32_t l0 = g0 ^ 1; ++- ++- uint32_t g1 = (((float*)&xmm3)[1] > ((float*)&xmm3)[2]); ++- uint32_t l1 = g1 ^ 1; ++- ++- temp0 = g0 * ((uint32_t*)&xmm9)[0] + l0 * ((uint32_t*)&xmm9)[1]; ++- temp1 = g0 * ((uint32_t*)&xmm9)[2] + l0 * ((uint32_t*)&xmm9)[3]; ++- sq_dist = g0 * ((float*)&xmm3)[0] + l0 * ((float*)&xmm3)[1]; ++- placeholder = g0 * ((float*)&xmm3)[2] + l0 * ((float*)&xmm3)[3]; ++- ++- g0 = (sq_dist > placeholder); ++- l0 = g0 ^ 1; ++- target[0] = g0 * temp0 + l0 * temp1; ++- */ ++ } ++ ++ #endif /*LV_HAVE_SSE3*/ ++@@ -374,18 +289,18 @@ volk_32fc_index_max_32u_generic(uint32_t* target, lv_32fc_t* src0, uint32_t num_ ++ sq_dist = ++ lv_creal(src0[i]) * lv_creal(src0[i]) + lv_cimag(src0[i]) * lv_cimag(src0[i]); ++ ++- index = sq_dist > max ? i : index; ++- max = sq_dist > max ? sq_dist : max; +++ if (sq_dist > max) { +++ index = i; +++ max = sq_dist; +++ } ++ } ++ target[0] = index; ++ } ++ ++ #endif /*LV_HAVE_GENERIC*/ ++ ++- ++ #endif /*INCLUDED_volk_32fc_index_max_32u_a_H*/ ++ ++- ++ #ifndef INCLUDED_volk_32fc_index_max_32u_u_H ++ #define INCLUDED_volk_32fc_index_max_32u_u_H ++ ++@@ -405,24 +320,26 @@ volk_32fc_index_max_32u_u_avx2(uint32_t* target, lv_32fc_t* src0, uint32_t num_p ++ union bit256 holderf; ++ union bit256 holderi; ++ float sq_dist = 0.0; +++ float max = 0.0; +++ uint32_t index = 0; ++ ++ union bit256 xmm5, xmm4; ++ __m256 xmm1, xmm2, xmm3; ++- __m256i xmm8, xmm11, xmm12, xmmfive, xmmfour, xmm9, holder0, holder1, xmm10; +++ __m256i xmm8, xmm11, xmm12, xmm9, xmm10; ++ ++- xmm5.int_vec = xmmfive = _mm256_setzero_si256(); ++- xmm4.int_vec = xmmfour = _mm256_setzero_si256(); ++- holderf.int_vec = holder0 = _mm256_setzero_si256(); ++- holderi.int_vec = holder1 = _mm256_setzero_si256(); +++ xmm5.int_vec = _mm256_setzero_si256(); +++ xmm4.int_vec = _mm256_setzero_si256(); +++ holderf.int_vec = _mm256_setzero_si256(); +++ holderi.int_vec = _mm256_setzero_si256(); ++ ++ int bound = num_bytes >> 6; ++ int i = 0; ++ ++- xmm8 = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0); +++ xmm8 = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7); ++ xmm9 = _mm256_setzero_si256(); ++ xmm10 = _mm256_set1_epi32(8); ++ xmm3 = _mm256_setzero_ps(); ++- __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0); +++ __m256i idx = _mm256_setr_epi32(0, 1, 4, 5, 2, 3, 6, 7); ++ ++ for (; i < bound; ++i) { ++ xmm1 = _mm256_loadu_ps((float*)src0); ++@@ -449,75 +366,26 @@ volk_32fc_index_max_32u_u_avx2(uint32_t* target, lv_32fc_t* src0, uint32_t num_p ++ xmm8 = _mm256_add_epi32(xmm8, xmm10); ++ } ++ ++- xmm10 = _mm256_set1_epi32(4); ++- if (num_bytes >> 4 & 1) { ++- xmm1 = _mm256_loadu_ps((float*)src0); ++- ++- xmm1 = 
_mm256_mul_ps(xmm1, xmm1); ++- ++- src0 += 4; ++- ++- xmm1 = _mm256_hadd_ps(xmm1, xmm1); ++- ++- xmm3 = _mm256_max_ps(xmm1, xmm3); ++- ++- xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS); ++- xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ); ++- ++- xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec); ++- xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec); ++- ++- xmm9 = _mm256_add_epi32(xmm11, xmm12); +++ _mm256_storeu_ps((float*)&(holderf.f), xmm3); +++ _mm256_storeu_si256(&(holderi.int_vec), xmm9); ++ ++- xmm8 = _mm256_add_epi32(xmm8, xmm10); +++ for (i = 0; i < 8; i++) { +++ if (holderf.f[i] > max) { +++ index = holderi.i[i]; +++ max = holderf.f[i]; +++ } ++ } ++ ++- idx = _mm256_set_epi32(1, 0, 1, 0, 1, 0, 1, 0); ++- xmm10 = _mm256_set1_epi32(2); ++- if (num_bytes >> 4 & 1) { ++- xmm2 = _mm256_loadu_ps((float*)src0); ++- ++- xmm1 = _mm256_permutevar8x32_ps(bit256_p(&xmm8)->float_vec, idx); ++- xmm8 = bit256_p(&xmm1)->int_vec; ++- ++- xmm2 = _mm256_mul_ps(xmm2, xmm2); ++- ++- src0 += 2; ++- ++- xmm1 = _mm256_hadd_ps(xmm2, xmm2); ++- ++- xmm3 = _mm256_max_ps(xmm1, xmm3); ++- ++- xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS); ++- xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ); ++- ++- xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec); ++- xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec); ++- ++- xmm9 = _mm256_add_epi32(xmm11, xmm12); +++ for (i = bound * 8; i < num_points; i++, src0++) { +++ sq_dist = +++ lv_creal(src0[0]) * lv_creal(src0[0]) + lv_cimag(src0[0]) * lv_cimag(src0[0]); ++ ++- xmm8 = _mm256_add_epi32(xmm8, xmm10); +++ if (sq_dist > max) { +++ index = i; +++ max = sq_dist; +++ } ++ } ++- ++- _mm256_storeu_ps((float*)&(holderf.f), xmm3); ++- _mm256_storeu_si256(&(holderi.int_vec), xmm9); ++- ++- target[0] = holderi.i[0]; ++- sq_dist = holderf.f[0]; ++- target[0] = (holderf.f[1] > sq_dist) ? holderi.i[1] : target[0]; ++- sq_dist = (holderf.f[1] > sq_dist) ? holderf.f[1] : sq_dist; ++- target[0] = (holderf.f[2] > sq_dist) ? holderi.i[2] : target[0]; ++- sq_dist = (holderf.f[2] > sq_dist) ? holderf.f[2] : sq_dist; ++- target[0] = (holderf.f[3] > sq_dist) ? holderi.i[3] : target[0]; ++- sq_dist = (holderf.f[3] > sq_dist) ? holderf.f[3] : sq_dist; ++- target[0] = (holderf.f[4] > sq_dist) ? holderi.i[4] : target[0]; ++- sq_dist = (holderf.f[4] > sq_dist) ? holderf.f[4] : sq_dist; ++- target[0] = (holderf.f[5] > sq_dist) ? holderi.i[5] : target[0]; ++- sq_dist = (holderf.f[5] > sq_dist) ? holderf.f[5] : sq_dist; ++- target[0] = (holderf.f[6] > sq_dist) ? holderi.i[6] : target[0]; ++- sq_dist = (holderf.f[6] > sq_dist) ? holderf.f[6] : sq_dist; ++- target[0] = (holderf.f[7] > sq_dist) ? holderi.i[7] : target[0]; ++- sq_dist = (holderf.f[7] > sq_dist) ? holderf.f[7] : sq_dist; +++ target[0] = index; ++ } ++ ++ #endif /*LV_HAVE_AVX2*/ ++-- ++2.20.1 ++ diff --cc debian/patches/0007-cmake-Remove-the-ORC-from-the-VOLK-public-link-inter.patch index 0000000,0000000..30ab9cb new file mode 100644 --- /dev/null +++ b/debian/patches/0007-cmake-Remove-the-ORC-from-the-VOLK-public-link-inter.patch @@@ -1,0 -1,0 +1,53 @@@ ++From d214a7f62554341aaee7f66ec259131b5cbe84e3 Mon Sep 17 00:00:00 2001 ++From: Vasil Velichkov ++Date: Sun, 22 Mar 2020 22:22:13 +0200 ++Subject: [PATCH 7/7] cmake: Remove the ORC from the VOLK public link interface ++ ++The ORC is an internal dependency that is used to generate SIMD ++implementations of some the kernels and no ORC types or functions are ++exposed by the VOLK library so adding it to the public link interface is ++unnecessary when linking dynamically. 
++ ++Currently the ORC is added to the INTERFACE_LINK_LIBRARIES property of ++the Volk::volk target in VolkTargets.cmake and you need to have the ORC ++development files (liborc-*-dev) installed on your system in order to ++successfully link a program or library that uses VOLK. ++--- ++ lib/CMakeLists.txt | 7 ++++++- ++ 1 file changed, 6 insertions(+), 1 deletion(-) ++ ++diff --git a/lib/CMakeLists.txt b/lib/CMakeLists.txt ++index c5c784a..02ffa40 100644 ++--- a/lib/CMakeLists.txt +++++ b/lib/CMakeLists.txt ++@@ -465,7 +465,6 @@ if(ORC_FOUND) ++ #setup orc library usage ++ include_directories(${ORC_INCLUDE_DIRS}) ++ link_directories(${ORC_LIBRARY_DIRS}) ++- list(APPEND volk_libraries ${ORC_LIBRARIES}) ++ ++ #setup orc functions ++ file(GLOB orc_files ${PROJECT_SOURCE_DIR}/kernels/volk/asm/orc/*.orc) ++@@ -572,6 +571,9 @@ target_include_directories(volk ++ ) ++ ++ #Configure target properties +++if(ORC_FOUND) +++ target_link_libraries(volk PRIVATE ${ORC_LIBRARIES}) +++endif() ++ if(NOT MSVC) ++ target_link_libraries(volk PUBLIC m) ++ endif() ++@@ -597,6 +599,9 @@ install(TARGETS volk ++ if(ENABLE_STATIC_LIBS) ++ add_library(volk_static STATIC $) ++ target_link_libraries(volk_static PUBLIC ${volk_libraries} pthread) +++ if(ORC_FOUND) +++ target_link_libraries(volk_static PUBLIC ${ORC_LIBRARIES}) +++ endif() ++ if(NOT MSVC) ++ target_link_libraries(volk_static PUBLIC m) ++ endif() ++-- ++2.20.1 ++ diff --cc debian/patches/avoid-unnecessary-soversion-bump index 0000000,0000000..63865b6 new file mode 100644 --- /dev/null +++ b/debian/patches/avoid-unnecessary-soversion-bump @@@ -1,0 -1,0 +1,11 @@@ ++--- a/CMakeLists.txt +++++ b/CMakeLists.txt ++@@ -67,7 +67,7 @@ ++ ++ set(VERSION_INFO_MAJOR_VERSION 2) ++ set(VERSION_INFO_MINOR_VERSION 2) ++-set(VERSION_INFO_MAINT_VERSION 1) +++set(VERSION_INFO_MAINT_VERSION 0) ++ include(VolkVersion) #setup version info ++ ++ macro(set_version_str VAR) diff --cc debian/patches/make-acc-happy index 0000000,0000000..7c5c767 new file mode 100644 --- /dev/null +++ b/debian/patches/make-acc-happy @@@ -1,0 -1,0 +1,60 @@@ ++From 799245ea6e9e05cc0ed0fabe783fbbe1a5054fd4 Mon Sep 17 00:00:00 2001 ++From: "A. Maitland Bottoms" ++Date: Tue, 27 Mar 2018 22:02:59 -0400 ++Subject: [PATCH 2/6] make acc happy ++ ++The abi-compliance-checker grabs all the .h files it finds ++and tries to compile them all. Even though some are not ++appropriate for the architecture being run on. Being careful ++with preprocessor protections avoids problems. 
++--- ++ include/volk/volk_neon_intrinsics.h | 2 ++ ++ kernels/volk/volk_32f_8u_polarbutterflypuppet_32f.h | 1 + ++ kernels/volk/volk_8u_x2_encodeframepolar_8u.h | 3 --- ++ 3 files changed, 3 insertions(+), 3 deletions(-) ++ ++--- a/include/volk/volk_neon_intrinsics.h +++++ b/include/volk/volk_neon_intrinsics.h ++@@ -79,6 +79,7 @@ ++ ++ #ifndef INCLUDE_VOLK_VOLK_NEON_INTRINSICS_H_ ++ #define INCLUDE_VOLK_VOLK_NEON_INTRINSICS_H_ +++#ifdef LV_HAVE_NEON ++ #include ++ ++ ++@@ -278,4 +279,5 @@ ++ } ++ ++ +++#endif /*LV_HAVE_NEON*/ ++ #endif /* INCLUDE_VOLK_VOLK_NEON_INTRINSICS_H_ */ ++--- a/kernels/volk/volk_32f_8u_polarbutterflypuppet_32f.h +++++ b/kernels/volk/volk_32f_8u_polarbutterflypuppet_32f.h ++@@ -31,6 +31,7 @@ ++ #include ++ #include ++ #include +++#include ++ ++ ++ static inline void sanitize_bytes(unsigned char* u, const int elements) ++--- a/kernels/volk/volk_8u_x2_encodeframepolar_8u.h +++++ b/kernels/volk/volk_8u_x2_encodeframepolar_8u.h ++@@ -60,8 +60,6 @@ ++ } ++ } ++ ++-#ifdef LV_HAVE_GENERIC ++- ++ static inline void volk_8u_x2_encodeframepolar_8u_generic(unsigned char* frame, ++ unsigned char* temp, ++ unsigned int frame_size) ++@@ -81,7 +79,6 @@ ++ --stage; ++ } ++ } ++-#endif /* LV_HAVE_GENERIC */ ++ ++ #ifdef LV_HAVE_SSSE3 ++ #include diff --cc debian/patches/optional-static-apps index 0000000,0000000..399ee9b new file mode 100644 --- /dev/null +++ b/debian/patches/optional-static-apps @@@ -1,0 -1,0 +1,20 @@@ ++--- a/apps/CMakeLists.txt +++++ b/apps/CMakeLists.txt ++@@ -62,7 +62,7 @@ ++ target_link_libraries(volk_profile PRIVATE std::filesystem) ++ endif() ++ ++-if(ENABLE_STATIC_LIBS) +++if(ENABLE_STATIC_LIBS AND ENABLE_STATIC_APPS) ++ target_link_libraries(volk_profile PRIVATE volk_static) ++ set_target_properties(volk_profile PROPERTIES LINK_FLAGS "-static") ++ else() ++@@ -79,7 +79,7 @@ ++ add_executable(volk-config-info volk-config-info.cc ${CMAKE_CURRENT_SOURCE_DIR}/volk_option_helpers.cc ++ ) ++ ++-if(ENABLE_STATIC_LIBS) +++if(ENABLE_STATIC_LIBS AND ENABLE_STATIC_APPS) ++ target_link_libraries(volk-config-info volk_static) ++ set_target_properties(volk-config-info PROPERTIES LINK_FLAGS "-static") ++ else() diff --cc debian/patches/remove-external-HTML-resources index 0000000,0000000..493356f new file mode 100644 --- /dev/null +++ b/debian/patches/remove-external-HTML-resources @@@ -1,0 -1,0 +1,8 @@@ ++--- a/README.md +++++ b/README.md ++@@ -1,5 +1,3 @@ ++-[![Build Status](https://travis-ci.org/gnuradio/volk.svg?branch=master)](https://travis-ci.org/gnuradio/volk) [![Build status](https://ci.appveyor.com/api/projects/status/5o56mgw0do20jlh3/branch/master?svg=true)](https://ci.appveyor.com/project/gnuradio/volk/branch/master) ++- ++ ![VOLK Logo](/docs/volk_logo.png) ++ ++ # Welcome to VOLK! 
diff --cc debian/patches/series index 0000000,0000000..1cb8e3d new file mode 100644 --- /dev/null +++ b/debian/patches/series @@@ -1,0 -1,0 +1,11 @@@ ++0001-volk-accurate-exp-kernel.patch ++0002-exp-Rename-SSE4.1-to-SSE2-kernel.patch ++0003-clang-format-Apply-clang-format.patch ++0004-clang-format-Update-PR-with-GitHub-Action.patch ++0005-clang-format-Rebase-onto-current-master.patch ++0006-Fix-the-broken-index-max-kernels.patch ++0007-cmake-Remove-the-ORC-from-the-VOLK-public-link-inter.patch ++avoid-unnecessary-soversion-bump ++make-acc-happy ++optional-static-apps ++remove-external-HTML-resources diff --cc debian/rules index 0000000,0000000..f80dbba new file mode 100755 --- /dev/null +++ b/debian/rules @@@ -1,0 -1,0 +1,23 @@@ ++#!/usr/bin/make -f ++DEB_HOST_MULTIARCH ?= $(shell dpkg-architecture -qDEB_HOST_MULTIARCH) ++export DEB_HOST_MULTIARCH ++export DH_VERBOSE=1 ++ ++%: ++ dh $@ --with python3 ++ ++override_dh_auto_configure: ++ dh_auto_configure -- -DLIB_SUFFIX="/$(DEB_HOST_MULTIARCH)" \ ++ -DENABLE_STATIC_LIBS=On, -DPYTHON_EXECUTABLE=/usr/bin/python3 \ ++ -DCMAKE_BUILD_TYPE=RelWithDebInfo ++ ++override_dh_auto_build-indep: ++ cmake --build obj-* --target volk_doc ++ ++override_dh_auto_test: ++ - dh_auto_test -- CTEST_TEST_TIMEOUT=60 ++ ++override_dh_acc: ++ - abi-compliance-checker -l libvolk2-dev -v1 2.0.0-1 -dump debian/libvolk2-dev.acc -dump-path debian/libvolk2-dev/usr/lib/x86_64-linux-gnu/dh-acc/libvolk2-dev_2.0.0-1.abi.tar.gz ++ - cat logs/libvolk2-dev/2.0.0-1/log.txt ++ - dh_acc diff --cc debian/source/format index 0000000,0000000..163aaf8 new file mode 100644 --- /dev/null +++ b/debian/source/format @@@ -1,0 -1,0 +1,1 @@@ ++3.0 (quilt) diff --cc debian/source/include-binaries index 0000000,0000000..2a77b05 new file mode 100644 --- /dev/null +++ b/debian/source/include-binaries @@@ -1,0 -1,0 +1,1 @@@ ++debian/libvolk2-dev.abi.tar.gz.amd64 diff --cc debian/upstream/signing-key.asc index 0000000,0000000..f6d7f93 new file mode 100644 --- /dev/null +++ b/debian/upstream/signing-key.asc @@@ -1,0 -1,0 +1,52 @@@ ++-----BEGIN PGP PUBLIC KEY BLOCK----- ++Version: GnuPG v1 ++ ++mQINBFcTzE0BEACWkwa+pAwjBPwUvL8E9adB6sFlH/bw/3Dj2Vr/bXDkNrZDEQzc ++C3wmoX3AZo0GSWpjlmlOGOPy6u4wZxEPfilKs+eDNnuIZN3gmLoRTThgbbrnH9bw ++kIaUMiUn8VJ0pk5ULaygG6APxl4EOVrMfzgRnxmIbUfggiBLaW/xq2a/BaVrUAuA ++oHv1GTGJkwcK0RfYigJMfZl9iHVJVopffexBt1hOeGYxiyLXSDWjOhLLVzhlfgTE ++T9YdLGyjoXFmImsCvkAA2MA52e5YGUQIBrqmiXdHFit7sve0e5Dw0aLyuTnMR0MO ++a2eIHWU6TYYv5GTJPzjBbWM1pRCgtupNilg2+RfN0tOTp27RQnUtgcCo26uBU+jV ++pyvnidpDGnuUBL3WNLZlUiqmiZs8Hc9BGNw3rKB37sUOMXz6XessnhRspXC1Mot4 ++V3I1NoKwb0wjgqlkAYIGCCSuySosC5HH2OssopBUH6U5QXjFp11QbP2e+QkvKPKA ++S9V4ouSMrIDZ4krtu6QFDYsHa0zZ54yRl3O4UpfISlz3yngO2eKM019C5n51kd62 ++Ia00rtx8ypvUxMy67PTEFdCKLJ6Ua/hEGcpxGygFMRa0pjHSrC6e9LvPudK92jsq ++qO0TjhUytig5k9YPoEa2JGn/kqP+K1HGAdJPay/HmcNTZWh0hoamhuJ6NwARAQAB ++tCZOYXRoYW4gV2VzdCA8bmF0aGFuLndlc3RAZ251cmFkaW8ub3JnPokCPgQTAQIA ++KAUCVxPMTQIbAwUJA8JnAAYLCQgHAwIGFQgCCQoLBBYCAwECHgECF4AACgkQOFMj ++7mQCCR20CA//VJfDu8W8BI/44JkucC+XBVqwOcfg/rcSHflgi0mNNz7hyJ+idwcB ++JVFSbhSpXucl6baJ0nDe8gcMuGFLyF4uLwCByX3ExDAnFL3Mu/jIyOUX8TGudZU7 ++wTEhzOLPxmXfbo8lw3TETC1Xsl8g1gU/KBJnTl3WbdGZUlKW6fP0TR5BMdYskNHm ++CCqAvXWniZwjSX/jlpWremfTU9i9DUad8ufcdJue7uiZRNq4JLaWmSbtGNzDzJIq ++6csHc3GFcd0Q/LDEDcm1AG081yLEmRnbTstZo+xW27yaRyoe1Dpm9ehsl19dVaO7 ++9ek2CEarqHjtRfO1MJMSBGiaS1lvujukYKZQRGNDKemDJwuQCVkxBMEef7SNX8XG ++2OPTARVp0hlrhMVFUk3hScekrKobq81YyCfWxBxxjRWySdInFhuT29cxxRLUxb69 ++3MKLzFJRlq+oEbWJN8QGqILQ785TZA8MdnMsGywPk43x9spgYbwPhtJYb/Aes9B9 
++NFkZ6EzVtzV7ztITuGhefRxt3eEmdFYNDHooWNFQdifcUgLoBgKOkP+oHOc+9mx7 ++6CDN9ZJTHb87W3ISw7SLI4YcMPYipEN5g51ceInDc3kXFYQ+EqU691kOuGNtx3ov ++qqvPm9PBR00GSwhLQt7s127MFpYx9+in87+UMBFXyo/VstVBPQW2GLq5Ag0EVxPM ++TQEQAK+fh+ckP728ZVRn5mr8PtsG3gktyS6LlH7EjMsHnvQR16EVAjn5G915OQUY ++Bk6yk9l0VRX0NLautc41NwVlHI4FYBBjz6mEnDocvo+BT0g5KYTyjJPOxmEzgVZW ++3Zp/jPjK5Z9YZTCIalrk2iHVQCe8fFCnaXNGNQoku1jBPRUOOTI979LWPx4d7MI0 ++7Yy+8xp5ogCrcTxea9VrMeXqnXzvy2peiceZDlvNmcEUCz222i6t2k9rUwY0+ozg ++TbsorE42h4B+a49ylY4zOX9fTPfsUj59/z/ilrxZy2qP2lBIFC+wFphKF3Qkilxd ++dnVGTsb9oKCQjuMcvh7MR27RVGLjW1pVMWGMmXBkIDu0U88Hn91XKfm1ZmWgksoU ++MC7BZocvUxIKnV+WiKy9ooP/HSzgP7ggdG+16B3yDdicB0DiBFEKZEmIWCBt5NXR ++q853WwFSH7xcrEOTXnqtkRUX4+obdwQhtqTueSC4xqX0+YVixZUC6ewqueFmPn+l ++WItCV7XU67NXTJNRC3i4kIF+hpT5YWtx56NuNcvhN25bZr1frTChOuXcCBNrOU+b ++yo2wpXAcfq+YmnaP0ZFFh7wKRi4leEPL/+JyitQbvSQU4Lejwanzvv7Ug1j4qZo1 ++A6WSxXYUWJY5rhh8nWYtJJOn5Wj4Y3gWa1taUpYw1g2lf0o5ABEBAAGJAiUEGAEC ++AA8FAlcTzE0CGwwFCQPCZwAACgkQOFMj7mQCCR2uXRAAiBsOfqp+QuQqO3OPW8OZ ++I2+JNbaaFEC1TorUhGs5XiT4wKyn1wDni4mavO4kJ8nK4Zc1qBYWeMOClj6JySJL ++yf0aVTjLyn+4Q4jt/9Dmn15wbOWZvdSICipfcLWmPLYniizsJWA4Mqoefcztmyxk ++FrJZ+Vri6MH5PxVuZjHhOUVfXIsqRhqqrpRjVnjzGvNxLgP3aLHfQPim/jbxaeRK ++oVtDNDLA+1nwdpZ8Hehe5OVfUKWuz1DXrdM0eY7pTRcms8+7y//AXpRqygH7TLx5 ++mXavdmAzgYcamQGfu/K4Mq9Bkgr1BNasgkxnPu+J0Z4jO9HsRBCJWf2BLKXmYedD ++5t0LR8bJHUTV7lsIifo0Ev47qsk1QX41KSKPAMwSzmtTLA0wzPJrkUEeVgm075N7 ++btLneqw5EyDcz3pJ7aD3HceWh+HZOREnfYXyMLxWTND7SKx0k6nmM8xasYHP0/6y ++mR8picMjbPlyoETe6B6yKi5rDjOrDwrKqBjulcUHsRhjAAUUI6IHgj4v5gCfTPS7 ++WrV98icGSHYnuxV40NT8Nt0lWNrPJhIUm1nu3UkEInznxMii1h6ga6REE/TJsStD ++C46x7fsiH4HkK1FJ+owoLhsVQo0OE4nWh8lWIBhTpR4wxThwfVHKt/H12st3tHuI ++CLIM6szb01rYgHTn9/vDgJE= ++=MlbD ++-----END PGP PUBLIC KEY BLOCK----- diff --cc debian/volk-config-info.1 index 0000000,0000000..e8d6efd new file mode 100644 --- /dev/null +++ b/debian/volk-config-info.1 @@@ -1,0 -1,0 +1,45 @@@ ++.\" DO NOT MODIFY THIS FILE! It was generated by help2man 1.40.10. ++.TH VOLK-CONFIG-INFO "1" "July 2014" "volk-config-info 0.1" "User Commands" ++.SH NAME ++volk-config-info \- pkgconfig-like tool for Vector Optimized Library of Kernels 0.1 ++.SH DESCRIPTION ++.SS "Program options: volk-config-info [options]:" ++.TP ++\fB\-h\fR [ \fB\-\-help\fR ] ++print help message ++.TP ++\fB\-\-prefix\fR ++print VOLK installation prefix ++.TP ++\fB\-\-builddate\fR ++print VOLK build date (RFC2822 format) ++.TP ++\fB\-\-cc\fR ++print VOLK C compiler version ++.TP ++\fB\-\-cflags\fR ++print VOLK CFLAGS ++.TP ++\fB\-\-all\-machines\fR ++print VOLK machines built into library ++.TP ++\fB\-\-avail\-machines\fR ++print VOLK machines the current platform can use ++.TP ++\fB\-\-machine\fR ++print the VOLK machine that will be used ++.TP ++\fB\-v\fR [ \fB\-\-version\fR ] ++print VOLK version ++.SH "SEE ALSO" ++The full documentation for ++.B volk-config-info ++is maintained as a Texinfo manual. If the ++.B info ++and ++.B volk-config-info ++programs are properly installed at your site, the command ++.IP ++.B info volk-config-info ++.PP ++should give you access to the complete manual. diff --cc debian/volk_modtool.1 index 0000000,0000000..752e7f5 new file mode 100644 --- /dev/null +++ b/debian/volk_modtool.1 @@@ -1,0 -1,0 +1,112 @@@ ++.TH GNURADIO "1" "August 2013" "volk_modtool 3.7" "User Commands" ++.SH NAME ++volk_modtool \- tailor VOLK modules ++.SH DESCRIPTION ++The volk_modtool tool is installed along with VOLK as a way of helping ++to construct, add to, and interogate the VOLK library or companion ++libraries. 
++.P ++volk_modtool is installed into $prefix/bin. ++.P ++VOLK modtool enables creating standalone (out-of-tree) VOLK modules ++and provides a few tools for sharing VOLK kernels between VOLK ++modules. If you need to design or work with VOLK kernels away from ++the canonical VOLK library, this is the tool. If you need to tailor ++your own VOLK library for whatever reason, this is the tool. ++.P ++The canonical VOLK library installs a volk.h and a libvolk.so. Your ++own library will install volk_$name.h and libvolk_$name.so. Ya Gronk? ++Good. ++.P ++There isn't a substantial difference between the canonical VOLK ++module and any other VOLK module. They're all peers. Any module ++created via VOLK modtool will come complete with a default ++volk_modtool.cfg file associating the module with the base from which ++it came, its distinctive $name and its destination (or path). These ++values (created from user input if VOLK modtool runs without a ++user-supplied config file or a default config file) serve as default ++values for some VOLK modtool actions. It's more or less intended for ++the user to change directories to the top level of a created VOLK ++module and then run volk_modtool to take advantage of the values ++stored in the default volk_modtool.cfg file. ++.P ++Apart from creating new VOLK modules, VOLK modtool allows you to list ++the names of kernels in other modules, list the names of kernels in ++the current module, add kernels from another module into the current ++module, and remove kernels from the current module. When moving ++kernels between modules, VOLK modtool does its best to keep the qa ++and profiling code for those kernels intact. If the base has a test ++or a profiling call for some kernel, those calls will follow the ++kernel when VOLK modtool adds that kernel. If QA or profiling ++requires a puppet kernel, the puppet kernel will follow the original ++kernel when VOLK modtool adds that original kernel. VOLK modtool ++respects puppets. ++.P ++====================================================================== ++.P ++.SH Installing a new VOLK Library: ++.P ++Run the command "volk_modtool -i". This will ask you three questions: ++.P ++ name: // the name to give your VOLK library: volk_ ++ destination: // directory new source tree is built under -- must exists. ++ // It will create /volk_ ++ base: // the directory containing the original VOLK source code ++.P ++This will build a new skeleton directory in the destination provided ++with the name volk_. It will contain the necessary structure to ++build: ++.P ++ mkdir build ++ cd build ++ cmake -DCMAKE_INSTALL_PREFIX=/opt/volk ../ ++ make ++ sudo make install ++.P ++Right now, the library is empty and contains no kernels. Kernels can ++be added from another VOLK library using the '-a' option. If not ++specified, the kernel will be extracted from the base VOLK ++directory. Using the '-b' allows us to specify another VOLK library to ++use for this purpose. ++.P ++ volk_modtool -a -n 32fc_x2_conjugate_dot_prod_32fc ++.P ++This will put the code for the new kernel into ++/volk_/kernels/volk_/ ++.P ++Other kernels must be added by hand. See the following webpages for ++more information about creating VOLK kernels: ++ http://gnuradio.org/doc/doxygen/volk_guide.html ++ http://gnuradio.org/redmine/projects/gnuradio/wiki/Volk ++.P ++====================================================================== ++.P ++.SH OPTIONS ++.P ++Options for Adding and Removing Kernels: ++ -a, --add_kernel ++ Add kernel from existing VOLK module. 
Uses the base VOLK module ++ unless -b is used. Use -n to specify the kernel name. ++ Requires: -n. ++ Optional: -b ++.P ++ -A, --add_all_kernels ++ Add all kernels from existing VOLK module. Uses the base VOLK ++ module unless -b is used. ++ Optional: -b ++.P ++ -x, --remove_kernel ++ Remove kernel from module. ++ Required: -n. ++ Optional: -b ++.P ++Options for Listing Kernels: ++ -l, --list ++ Lists all kernels available in the base VOLK module. ++.P ++ -k, --kernels ++ Lists all kernels in this VOLK module. ++.P ++ -r, --remote-list ++ Lists all kernels in another VOLK module that is specified ++ using the -b option. diff --cc debian/volk_profile.1 index 0000000,0000000..405facb new file mode 100644 --- /dev/null +++ b/debian/volk_profile.1 @@@ -1,0 -1,0 +1,5 @@@ ++.TH UHD_FFT "1" "March 2012" "volk_profile 3.5" "User Commands" ++.SH NAME ++volk_profile \- Quality Assurance application for libvolk functions ++.SH DESCRIPTION ++Writes profile results to a file. diff --cc debian/watch index 0000000,0000000..1339ebb new file mode 100644 --- /dev/null +++ b/debian/watch @@@ -1,0 -1,0 +1,4 @@@ ++version=4 ++ opts="pgpsigurlmangle=s%$%.asc%,filenamemangle=s%(?:.*?)?volk-?(\d[\d.]*)\.tar\.xz%volk_$1.orig.tar.xz%" \ ++ https://github.com/gnuradio/volk/releases \ ++ (?:.*?/)?volk-?(\d[\d.]*)\.tar\.xz debian uupdate