From: A. Maitland Bottoms
Date: Sat, 28 Mar 2020 01:48:10 +0000 (+0000)
Subject: volk (2.2.1-2) unstable; urgency=medium
X-Git-Tag: archive/raspbian/2.2.1-2+rpi1^2~12
X-Git-Url: https://dgit.raspbian.org/?a=commitdiff_plain;h=1f718f06776ad35df030ae539e73a1c6d41adfaa;p=volk.git

volk (2.2.1-2) unstable; urgency=medium

  * update to v2.2.1-11-gfaf230e
  * cmake: Remove the ORC from the VOLK public link interface
  * Fix the broken index max kernels

[dgit import unpatched volk 2.2.1-2]
---

1f718f06776ad35df030ae539e73a1c6d41adfaa

diff --cc debian/1.3_to_1.4_compat_report.html
index 0000000,0000000..f9614d6
new file mode 100644
--- /dev/null
+++ b/debian/1.3_to_1.4_compat_report.html
@@@ -1,0 -1,0 +1,1069 @@@
++libvolk1-dev: 1.3-3 to 1.4-1 compatibility report

API compatibility report for the libvolk1-dev library between 1.3-3 and 1.4-1 versions on x86_64

++ Binary Compatibility | Source Compatibility

Test Info


Library Name: libvolk1-dev
Version #1: 1.3-3
Version #2: 1.4-1
Arch: x86_64
GCC Version: 7
Subject: Binary Compatibility

Test Results


Total Header Files: 135
Total Libraries: 1
Total Symbols / Types: 614 / 233
Compatibility: 99.8%

Problem Summary


Added Symbols: 45
Removed Symbols (High): 0
Problems with Data Types: High 0, Medium 0, Low 1
Problems with Symbols: High 1, Medium 0, Low 0
Problems with Constants: Low 1

Added Symbols  45 


++volk.h, libvolk.so.1.4
++volk_32f_64f_add_64f [data]
++volk_32f_64f_add_64f_a [data]
++volk_32f_64f_add_64f_get_func_desc ( )
++volk_32f_64f_add_64f_manual ( double* cVector, float const* aVector, double const* bVector, unsigned int num_points, char const* impl_name )
++volk_32f_64f_add_64f_u [data]
++volk_32f_64f_multiply_64f [data]
++volk_32f_64f_multiply_64f_a [data]
++volk_32f_64f_multiply_64f_get_func_desc ( )
++volk_32f_64f_multiply_64f_manual ( double* cVector, float const* aVector, double const* bVector, unsigned int num_points, char const* impl_name )
++volk_32f_64f_multiply_64f_u [data]
++volk_32f_s32f_mod_rangepuppet_32f [data]
++volk_32f_s32f_mod_rangepuppet_32f_a [data]
++volk_32f_s32f_mod_rangepuppet_32f_get_func_desc ( )
++volk_32f_s32f_mod_rangepuppet_32f_manual ( float* output, float const* input, float bound, unsigned int num_points, char const* impl_name )
++volk_32f_s32f_mod_rangepuppet_32f_u [data]
++volk_32f_s32f_s32f_mod_range_32f [data]
++volk_32f_s32f_s32f_mod_range_32f_a [data]
++volk_32f_s32f_s32f_mod_range_32f_get_func_desc ( )
++volk_32f_s32f_s32f_mod_range_32f_manual ( float* outputVector, float const* inputVector, float const lower_bound, float const upper_bound, unsigned int num_points, char const* impl_name )
++volk_32f_s32f_s32f_mod_range_32f_u [data]
++volk_32fc_32f_add_32fc [data]
++volk_32fc_32f_add_32fc_a [data]
++volk_32fc_32f_add_32fc_get_func_desc ( )
++volk_32fc_32f_add_32fc_manual ( lv_32fc_t* cVector, lv_32fc_t const* aVector, float const* bVector, unsigned int num_points, char const* impl_name )
++volk_32fc_32f_add_32fc_u [data]
++volk_32fc_x2_add_32fc [data]
++volk_32fc_x2_add_32fc_a [data]
++volk_32fc_x2_add_32fc_get_func_desc ( )
++volk_32fc_x2_add_32fc_manual ( lv_32fc_t* cVector, lv_32fc_t const* aVector, lv_32fc_t const* bVector, unsigned int num_points, char const* impl_name )
++volk_32fc_x2_add_32fc_u [data]
++volk_32u_reverse_32u [data]
++volk_32u_reverse_32u_a [data]
++volk_32u_reverse_32u_get_func_desc ( )
++volk_32u_reverse_32u_manual ( uint32_t* out, uint32_t const* in, unsigned int num_points, char const* impl_name )
++volk_32u_reverse_32u_u [data]
++volk_64f_x2_add_64f [data]
++volk_64f_x2_add_64f_a [data]
++volk_64f_x2_add_64f_get_func_desc ( )
++volk_64f_x2_add_64f_manual ( double* cVector, double const* aVector, double const* bVector, unsigned int num_points, char const* impl_name )
++volk_64f_x2_add_64f_u [data]
++volk_64f_x2_multiply_64f [data]
++volk_64f_x2_multiply_64f_a [data]
++volk_64f_x2_multiply_64f_get_func_desc ( )
++volk_64f_x2_multiply_64f_manual ( double* cVector, double const* aVector, double const* bVector, unsigned int num_points, char const* impl_name )
++volk_64f_x2_multiply_64f_u [data]
++
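For orientation (an added note, not part of the generated report): each kernel above is exported as a dispatcher symbol plus aligned (_a), unaligned (_u), _manual and _get_func_desc variants. A minimal C sketch of how one of the newly added kernels, volk_32fc_x2_add_32fc, is typically called; the buffer size and the "generic" implementation name are illustrative assumptions:

    #include <volk/volk.h>
    #include <stdio.h>

    int main(void)
    {
        unsigned int num_points = 1024;        /* illustrative size */
        size_t alignment = volk_get_alignment();

        /* volk_malloc returns SIMD-aligned buffers, so the dispatcher can
           pick an aligned (_a) protokernel at run time. */
        lv_32fc_t* a = (lv_32fc_t*)volk_malloc(num_points * sizeof(lv_32fc_t), alignment);
        lv_32fc_t* b = (lv_32fc_t*)volk_malloc(num_points * sizeof(lv_32fc_t), alignment);
        lv_32fc_t* c = (lv_32fc_t*)volk_malloc(num_points * sizeof(lv_32fc_t), alignment);

        for (unsigned int i = 0; i < num_points; i++) {
            a[i] = lv_cmake(1.0f, 0.0f);
            b[i] = lv_cmake(0.0f, 1.0f);
        }

        /* dispatcher symbol: VOLK selects the best implementation */
        volk_32fc_x2_add_32fc(c, a, b, num_points);

        /* _manual variant: force a named implementation, e.g. "generic" */
        volk_32fc_x2_add_32fc_manual(c, a, b, num_points, "generic");

        printf("c[0] = %f%+fi\n", lv_creal(c[0]), lv_cimag(c[0]));

        volk_free(a);
        volk_free(b);
        volk_free(c);
        return 0;
    }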
++ ++

Problems with Symbols, High Severity  1 


++volk.h, libvolk.so.1.3
++ ++[+] volk_32f_8u_polarbutterfly_32f_manual ( float* llrs, unsigned char* u, int const frame_size, int const frame_exp, int const stage, int const u_num, int const row, char const* impl_name )  1  ++
++ ++
++ ++

Problems with Data Types, Low Severity  1 


++volk_typedefs.h
++ ++[+] typedef p_32f_8u_polarbutterfly_32f  1  ++
++ ++ ++
++ ++

Problems with Constants, Low Severity  1 


++volk_32f_log2_32f.h
++ ++[+] LOG_POLY_DEGREE ++
++ ++ ++
++

Header Files  135 


++
++constants.h
++saturation_arithmetic.h
++volk.h
++volk_16i_32fc_dot_prod_32fc.h
++volk_16i_branch_4_state_8.h
++volk_16i_convert_8i.h
++volk_16i_max_star_16i.h
++volk_16i_max_star_horizontal_16i.h
++volk_16i_permute_and_scalar_add.h
++volk_16i_s32f_convert_32f.h
++volk_16i_x4_quad_max_star_16i.h
++volk_16i_x5_add_quad_16i_x4.h
++volk_16ic_convert_32fc.h
++volk_16ic_deinterleave_16i_x2.h
++volk_16ic_deinterleave_real_16i.h
++volk_16ic_deinterleave_real_8i.h
++volk_16ic_magnitude_16i.h
++volk_16ic_s32f_deinterleave_32f_x2.h
++volk_16ic_s32f_deinterleave_real_32f.h
++volk_16ic_s32f_magnitude_32f.h
++volk_16ic_x2_dot_prod_16ic.h
++volk_16ic_x2_multiply_16ic.h
++volk_16u_byteswap.h
++volk_16u_byteswappuppet_16u.h
++volk_32f_8u_polarbutterfly_32f.h
++volk_32f_8u_polarbutterflypuppet_32f.h
++volk_32f_accumulator_s32f.h
++volk_32f_acos_32f.h
++volk_32f_asin_32f.h
++volk_32f_atan_32f.h
++volk_32f_binary_slicer_32i.h
++volk_32f_binary_slicer_8i.h
++volk_32f_convert_64f.h
++volk_32f_cos_32f.h
++volk_32f_expfast_32f.h
++volk_32f_index_max_16u.h
++volk_32f_index_max_32u.h
++volk_32f_invsqrt_32f.h
++volk_32f_log2_32f.h
++volk_32f_null_32f.h
++volk_32f_s32f_32f_fm_detect_32f.h
++volk_32f_s32f_calc_spectral_noise_floor_32f.h
++volk_32f_s32f_convert_16i.h
++volk_32f_s32f_convert_32i.h
++volk_32f_s32f_convert_8i.h
++volk_32f_s32f_multiply_32f.h
++volk_32f_s32f_normalize.h
++volk_32f_s32f_power_32f.h
++volk_32f_s32f_stddev_32f.h
++volk_32f_sin_32f.h
++volk_32f_sqrt_32f.h
++volk_32f_stddev_and_mean_32f_x2.h
++volk_32f_tan_32f.h
++volk_32f_tanh_32f.h
++volk_32f_x2_add_32f.h
++volk_32f_x2_divide_32f.h
++volk_32f_x2_dot_prod_16i.h
++volk_32f_x2_dot_prod_32f.h
++volk_32f_x2_fm_detectpuppet_32f.h
++volk_32f_x2_interleave_32fc.h
++volk_32f_x2_max_32f.h
++volk_32f_x2_min_32f.h
++volk_32f_x2_multiply_32f.h
++volk_32f_x2_pow_32f.h
++volk_32f_x2_s32f_interleave_16ic.h
++volk_32f_x2_subtract_32f.h
++volk_32f_x3_sum_of_poly_32f.h
++volk_32fc_32f_dot_prod_32fc.h
++volk_32fc_32f_multiply_32fc.h
++volk_32fc_conjugate_32fc.h
++volk_32fc_convert_16ic.h
++volk_32fc_deinterleave_32f_x2.h
++volk_32fc_deinterleave_64f_x2.h
++volk_32fc_deinterleave_imag_32f.h
++volk_32fc_deinterleave_real_32f.h
++volk_32fc_deinterleave_real_64f.h
++volk_32fc_index_max_16u.h
++volk_32fc_index_max_32u.h
++volk_32fc_magnitude_32f.h
++volk_32fc_magnitude_squared_32f.h
++volk_32fc_s32f_atan2_32f.h
++volk_32fc_s32f_deinterleave_real_16i.h
++volk_32fc_s32f_magnitude_16i.h
++volk_32fc_s32f_power_32fc.h
++volk_32fc_s32f_power_spectrum_32f.h
++volk_32fc_s32f_x2_power_spectral_density_32f.h
++volk_32fc_s32fc_multiply_32fc.h
++volk_32fc_s32fc_rotatorpuppet_32fc.h
++volk_32fc_s32fc_x2_rotator_32fc.h
++volk_32fc_x2_conjugate_dot_prod_32fc.h
++volk_32fc_x2_divide_32fc.h
++volk_32fc_x2_dot_prod_32fc.h
++volk_32fc_x2_multiply_32fc.h
++volk_32fc_x2_multiply_conjugate_32fc.h
++volk_32fc_x2_s32f_square_dist_scalar_mult_32f.h
++volk_32fc_x2_square_dist_32f.h
++volk_32i_s32f_convert_32f.h
++volk_32i_x2_and_32i.h
++volk_32i_x2_or_32i.h
++volk_32u_byteswap.h
++volk_32u_byteswappuppet_32u.h
++volk_32u_popcnt.h
++volk_32u_popcntpuppet_32u.h
++volk_64f_convert_32f.h
++volk_64f_x2_max_64f.h
++volk_64f_x2_min_64f.h
++volk_64u_byteswap.h
++volk_64u_byteswappuppet_64u.h
++volk_64u_popcnt.h
++volk_64u_popcntpuppet_64u.h
++volk_8i_convert_16i.h
++volk_8i_s32f_convert_32f.h
++volk_8ic_deinterleave_16i_x2.h
++volk_8ic_deinterleave_real_16i.h
++volk_8ic_deinterleave_real_8i.h
++volk_8ic_s32f_deinterleave_32f_x2.h
++volk_8ic_s32f_deinterleave_real_32f.h
++volk_8ic_x2_multiply_conjugate_16ic.h
++volk_8ic_x2_s32f_multiply_conjugate_32fc.h
++volk_8u_conv_k7_r2puppet_8u.h
++volk_8u_x2_encodeframepolar_8u.h
++volk_8u_x3_encodepolar_8u_x2.h
++volk_8u_x3_encodepolarpuppet_8u.h
++volk_8u_x4_conv_k7_r2_8u.h
++volk_avx_intrinsics.h
++volk_common.h
++volk_complex.h
++volk_config_fixed.h
++volk_cpu.h
++volk_malloc.h
++volk_neon_intrinsics.h
++volk_prefs.h
++volk_sse3_intrinsics.h
++volk_sse_intrinsics.h
++volk_typedefs.h
++
++
++

Libraries  1 


++
++libvolk.so.1.3
++
++
++


++

Test Info


Library Name: libvolk1-dev
Version #1: 1.3-3
Version #2: 1.4-1
Arch: x86_64
Subject: Source Compatibility

Test Results


Total Header Files: 135
Total Libraries: 1
Total Symbols / Types: 660 / 235
Compatibility: 99.1%

Problem Summary


Added Symbols: 46
Removed Symbols (High): 5
Problems with Data Types: High 0, Medium 0, Low 1
Problems with Symbols: High 1, Medium 0, Low 0
Problems with Constants: Low 1
Other Changes in Constants: 2

Added Symbols  46 


++volk.h
++volk_32f_64f_add_64f [data]
++volk_32f_64f_add_64f_a [data]
++volk_32f_64f_add_64f_get_func_desc ( )
++volk_32f_64f_add_64f_manual ( double* cVector, float const* aVector, double const* bVector, unsigned int num_points, char const* impl_name )
++volk_32f_64f_add_64f_u [data]
++volk_32f_64f_multiply_64f [data]
++volk_32f_64f_multiply_64f_a [data]
++volk_32f_64f_multiply_64f_get_func_desc ( )
++volk_32f_64f_multiply_64f_manual ( double* cVector, float const* aVector, double const* bVector, unsigned int num_points, char const* impl_name )
++volk_32f_64f_multiply_64f_u [data]
++volk_32f_s32f_mod_rangepuppet_32f [data]
++volk_32f_s32f_mod_rangepuppet_32f_a [data]
++volk_32f_s32f_mod_rangepuppet_32f_get_func_desc ( )
++volk_32f_s32f_mod_rangepuppet_32f_manual ( float* output, float const* input, float bound, unsigned int num_points, char const* impl_name )
++volk_32f_s32f_mod_rangepuppet_32f_u [data]
++volk_32f_s32f_s32f_mod_range_32f [data]
++volk_32f_s32f_s32f_mod_range_32f_a [data]
++volk_32f_s32f_s32f_mod_range_32f_get_func_desc ( )
++volk_32f_s32f_s32f_mod_range_32f_manual ( float* outputVector, float const* inputVector, float const lower_bound, float const upper_bound, unsigned int num_points, char const* impl_name )
++volk_32f_s32f_s32f_mod_range_32f_u [data]
++volk_32fc_32f_add_32fc [data]
++volk_32fc_32f_add_32fc_a [data]
++volk_32fc_32f_add_32fc_get_func_desc ( )
++volk_32fc_32f_add_32fc_manual ( lv_32fc_t* cVector, lv_32fc_t const* aVector, float const* bVector, unsigned int num_points, char const* impl_name )
++volk_32fc_32f_add_32fc_u [data]
++volk_32fc_x2_add_32fc [data]
++volk_32fc_x2_add_32fc_a [data]
++volk_32fc_x2_add_32fc_get_func_desc ( )
++volk_32fc_x2_add_32fc_manual ( lv_32fc_t* cVector, lv_32fc_t const* aVector, lv_32fc_t const* bVector, unsigned int num_points, char const* impl_name )
++volk_32fc_x2_add_32fc_u [data]
++volk_32u_reverse_32u [data]
++volk_32u_reverse_32u_a [data]
++volk_32u_reverse_32u_get_func_desc ( )
++volk_32u_reverse_32u_manual ( uint32_t* out, uint32_t const* in, unsigned int num_points, char const* impl_name )
++volk_32u_reverse_32u_u [data]
++volk_64f_x2_add_64f [data]
++volk_64f_x2_add_64f_a [data]
++volk_64f_x2_add_64f_get_func_desc ( )
++volk_64f_x2_add_64f_manual ( double* cVector, double const* aVector, double const* bVector, unsigned int num_points, char const* impl_name )
++volk_64f_x2_add_64f_u [data]
++volk_64f_x2_multiply_64f [data]
++volk_64f_x2_multiply_64f_a [data]
++volk_64f_x2_multiply_64f_get_func_desc ( )
++volk_64f_x2_multiply_64f_manual ( double* cVector, double const* aVector, double const* bVector, unsigned int num_points, char const* impl_name )
++volk_64f_x2_multiply_64f_u [data]
++
++volk_32u_reverse_32u.h
++ ++BitReverseTable256 [data] ++
++ ++ ++
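BitReverseTable256 is a 256-entry byte-reversal lookup table added alongside the new volk_32u_reverse_32u kernel. The helper below is a hypothetical illustration (not code from VOLK) of how such a table reverses the bits of one 32-bit word, which is the per-element operation the kernel performs:

    #include <stdint.h>

    /* Hypothetical helper: reverse all 32 bits of v using a 256-entry
       byte-reversal table such as BitReverseTable256. */
    static uint32_t reverse_one_word(const unsigned char table[256], uint32_t v)
    {
        return ((uint32_t)table[v & 0xff] << 24) |
               ((uint32_t)table[(v >> 8) & 0xff] << 16) |
               ((uint32_t)table[(v >> 16) & 0xff] << 8) |
               ((uint32_t)table[(v >> 24) & 0xff]);
    }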
++

Removed Symbols  5 


++constants.h
++volk_available_machines ( )
++volk_c_compiler ( )
++volk_compiler_flags ( )
++volk_prefix ( )
++volk_version ( )
++
++ ++

Problems with Symbols, High Severity  1 


++volk.h
++ ++[+] volk_32f_8u_polarbutterfly_32f_manual ( float* llrs, unsigned char* u, int const frame_size, int const frame_exp, int const stage, int const u_num, int const row, char const* impl_name )  1  ++
++ ++
++ ++

Problems with Data Types, Low Severity  1 


++volk_typedefs.h
++ ++[+] typedef p_32f_8u_polarbutterfly_32f  1  ++
++ ++ ++
++ ++

Problems with Constants, Low Severity  1 


++volk_32f_log2_32f.h
++ ++[+] LOG_POLY_DEGREE ++
++ ++ ++
++ ++

Other Changes in Constants  2 


++volk_common.h
++ ++[+] __VOLK_ASM ++
++ ++ ++ ++[+] __VOLK_VOLATILE ++
++ ++ ++
++

Header Files  135 


++
++constants.h
++saturation_arithmetic.h
++volk.h
++volk_16i_32fc_dot_prod_32fc.h
++volk_16i_branch_4_state_8.h
++volk_16i_convert_8i.h
++volk_16i_max_star_16i.h
++volk_16i_max_star_horizontal_16i.h
++volk_16i_permute_and_scalar_add.h
++volk_16i_s32f_convert_32f.h
++volk_16i_x4_quad_max_star_16i.h
++volk_16i_x5_add_quad_16i_x4.h
++volk_16ic_convert_32fc.h
++volk_16ic_deinterleave_16i_x2.h
++volk_16ic_deinterleave_real_16i.h
++volk_16ic_deinterleave_real_8i.h
++volk_16ic_magnitude_16i.h
++volk_16ic_s32f_deinterleave_32f_x2.h
++volk_16ic_s32f_deinterleave_real_32f.h
++volk_16ic_s32f_magnitude_32f.h
++volk_16ic_x2_dot_prod_16ic.h
++volk_16ic_x2_multiply_16ic.h
++volk_16u_byteswap.h
++volk_16u_byteswappuppet_16u.h
++volk_32f_8u_polarbutterfly_32f.h
++volk_32f_8u_polarbutterflypuppet_32f.h
++volk_32f_accumulator_s32f.h
++volk_32f_acos_32f.h
++volk_32f_asin_32f.h
++volk_32f_atan_32f.h
++volk_32f_binary_slicer_32i.h
++volk_32f_binary_slicer_8i.h
++volk_32f_convert_64f.h
++volk_32f_cos_32f.h
++volk_32f_expfast_32f.h
++volk_32f_index_max_16u.h
++volk_32f_index_max_32u.h
++volk_32f_invsqrt_32f.h
++volk_32f_log2_32f.h
++volk_32f_null_32f.h
++volk_32f_s32f_32f_fm_detect_32f.h
++volk_32f_s32f_calc_spectral_noise_floor_32f.h
++volk_32f_s32f_convert_16i.h
++volk_32f_s32f_convert_32i.h
++volk_32f_s32f_convert_8i.h
++volk_32f_s32f_multiply_32f.h
++volk_32f_s32f_normalize.h
++volk_32f_s32f_power_32f.h
++volk_32f_s32f_stddev_32f.h
++volk_32f_sin_32f.h
++volk_32f_sqrt_32f.h
++volk_32f_stddev_and_mean_32f_x2.h
++volk_32f_tan_32f.h
++volk_32f_tanh_32f.h
++volk_32f_x2_add_32f.h
++volk_32f_x2_divide_32f.h
++volk_32f_x2_dot_prod_16i.h
++volk_32f_x2_dot_prod_32f.h
++volk_32f_x2_fm_detectpuppet_32f.h
++volk_32f_x2_interleave_32fc.h
++volk_32f_x2_max_32f.h
++volk_32f_x2_min_32f.h
++volk_32f_x2_multiply_32f.h
++volk_32f_x2_pow_32f.h
++volk_32f_x2_s32f_interleave_16ic.h
++volk_32f_x2_subtract_32f.h
++volk_32f_x3_sum_of_poly_32f.h
++volk_32fc_32f_dot_prod_32fc.h
++volk_32fc_32f_multiply_32fc.h
++volk_32fc_conjugate_32fc.h
++volk_32fc_convert_16ic.h
++volk_32fc_deinterleave_32f_x2.h
++volk_32fc_deinterleave_64f_x2.h
++volk_32fc_deinterleave_imag_32f.h
++volk_32fc_deinterleave_real_32f.h
++volk_32fc_deinterleave_real_64f.h
++volk_32fc_index_max_16u.h
++volk_32fc_index_max_32u.h
++volk_32fc_magnitude_32f.h
++volk_32fc_magnitude_squared_32f.h
++volk_32fc_s32f_atan2_32f.h
++volk_32fc_s32f_deinterleave_real_16i.h
++volk_32fc_s32f_magnitude_16i.h
++volk_32fc_s32f_power_32fc.h
++volk_32fc_s32f_power_spectrum_32f.h
++volk_32fc_s32f_x2_power_spectral_density_32f.h
++volk_32fc_s32fc_multiply_32fc.h
++volk_32fc_s32fc_rotatorpuppet_32fc.h
++volk_32fc_s32fc_x2_rotator_32fc.h
++volk_32fc_x2_conjugate_dot_prod_32fc.h
++volk_32fc_x2_divide_32fc.h
++volk_32fc_x2_dot_prod_32fc.h
++volk_32fc_x2_multiply_32fc.h
++volk_32fc_x2_multiply_conjugate_32fc.h
++volk_32fc_x2_s32f_square_dist_scalar_mult_32f.h
++volk_32fc_x2_square_dist_32f.h
++volk_32i_s32f_convert_32f.h
++volk_32i_x2_and_32i.h
++volk_32i_x2_or_32i.h
++volk_32u_byteswap.h
++volk_32u_byteswappuppet_32u.h
++volk_32u_popcnt.h
++volk_32u_popcntpuppet_32u.h
++volk_64f_convert_32f.h
++volk_64f_x2_max_64f.h
++volk_64f_x2_min_64f.h
++volk_64u_byteswap.h
++volk_64u_byteswappuppet_64u.h
++volk_64u_popcnt.h
++volk_64u_popcntpuppet_64u.h
++volk_8i_convert_16i.h
++volk_8i_s32f_convert_32f.h
++volk_8ic_deinterleave_16i_x2.h
++volk_8ic_deinterleave_real_16i.h
++volk_8ic_deinterleave_real_8i.h
++volk_8ic_s32f_deinterleave_32f_x2.h
++volk_8ic_s32f_deinterleave_real_32f.h
++volk_8ic_x2_multiply_conjugate_16ic.h
++volk_8ic_x2_s32f_multiply_conjugate_32fc.h
++volk_8u_conv_k7_r2puppet_8u.h
++volk_8u_x2_encodeframepolar_8u.h
++volk_8u_x3_encodepolar_8u_x2.h
++volk_8u_x3_encodepolarpuppet_8u.h
++volk_8u_x4_conv_k7_r2_8u.h
++volk_avx_intrinsics.h
++volk_common.h
++volk_complex.h
++volk_config_fixed.h
++volk_cpu.h
++volk_malloc.h
++volk_neon_intrinsics.h
++volk_prefs.h
++volk_sse3_intrinsics.h
++volk_sse_intrinsics.h
++volk_typedefs.h
++
++
++

Libraries  1 


++
++libvolk.so.1.3
++
++
++



++ ++
diff --cc debian/1.4_to_2.0_compat_report.html
index 0000000,0000000..0bb6275
new file mode 100644
--- /dev/null
+++ b/debian/1.4_to_2.0_compat_report.html
@@@ -1,0 -1,0 +1,1855 @@@
++volk: 1.4 to 2.0 compatibility report

API compatibility report for the volk library between 1.4 and 2.0 versions on x86_64

++ Binary Compatibility | Source Compatibility

Test Info


Library Name: volk
Version #1: 1.4
Version #2: 2.0
Arch: x86_64
GCC Version: 8
Subject: Binary Compatibility

Test Results


Total Header Files: 143
Total Libraries: 1
Total Symbols / Types: 660 / 244
Compatibility: 99.8%

Problem Summary


Added Symbols: 0
Removed Symbols (High): 0
Problems with Data Types: High 0, Medium 2, Low 3
Problems with Symbols: High 0, Medium 2, Low 0
Problems with Constants: Low 18
Other Changes in Constants: 5

Problems with Data Types, Medium Severity  2 


++volk_cpu.h
++ ++[+] struct VOLK_CPU  2  ++
++ ++ ++
++ ++

Problems with Symbols, Medium Severity  2 


++volk_cpu.h
++ ++[+] volk_cpu [data]  1  ++
++ ++
++volk_prefs.h, libvolk.so.1.4
++ ++[+] volk_get_config_path ( char* p1 )  1  ++
++ ++
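For context (an added note, not from the report): in the 1.4 API shown above, volk_get_config_path fills a caller-supplied buffer with the location of the volk_config preferences file, and the flagged problem indicates 2.0 changes this symbol, so code built against the old declaration needs a rebuild. A minimal sketch against the 1.4 signature, with an assumed buffer size:

    #include <volk/volk_prefs.h>
    #include <stdio.h>

    int main(void)
    {
        char path[512];             /* assumed large enough for the path */
        volk_get_config_path(path); /* VOLK 1.4 signature: void (char*) */
        printf("volk_config expected at: %s\n", path);
        return 0;
    }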
++ ++

Problems with Data Types, Low Severity  3 


++volk_cpu.h
++ ++[+] struct VOLK_CPU  3  ++
++ ++ ++
++ ++

Problems with Constants, Low Severity  18 


++volk_config_fixed.h
++ ++[+] LV_32 ++
++ ++ ++ ++[+] LV_3DNOW ++
++ ++ ++ ++[+] LV_64 ++
++ ++ ++ ++[+] LV_ABM ++
++ ++ ++ ++[+] LV_AVX ++
++ ++ ++ ++[+] LV_AVX2 ++
++ ++ ++ ++[+] LV_FMA ++
++ ++ ++ ++[+] LV_MMX ++
++ ++ ++ ++[+] LV_NORC ++
++ ++ ++ ++[+] LV_ORC ++
++ ++ ++ ++[+] LV_POPCOUNT ++
++ ++ ++ ++[+] LV_SSE ++
++ ++ ++ ++[+] LV_SSE2 ++
++ ++ ++ ++[+] LV_SSE3 ++
++ ++ ++ ++[+] LV_SSE4_1 ++
++ ++ ++ ++[+] LV_SSE4_2 ++
++ ++ ++ ++[+] LV_SSE4_A ++
++ ++ ++ ++[+] LV_SSSE3 ++
++ ++ ++
++ ++

Other Changes in Constants  5 


++volk_32f_log2_32f.h
++ ++[+] LOG_POLY_DEGREE ++
++ ++ ++
++volk_config_fixed.h
++ ++[+] LV_AVX512CD ++
++ ++ ++ ++[+] LV_AVX512F ++
++ ++ ++ ++[+] LV_NEONV7 ++
++ ++ ++ ++[+] LV_NEONV8 ++
++ ++ ++
++

Header Files  143 


++
++saturation_arithmetic.h
++volk.h
++volk_16i_32fc_dot_prod_32fc.h
++volk_16i_branch_4_state_8.h
++volk_16i_convert_8i.h
++volk_16i_max_star_16i.h
++volk_16i_max_star_horizontal_16i.h
++volk_16i_permute_and_scalar_add.h
++volk_16i_s32f_convert_32f.h
++volk_16i_x4_quad_max_star_16i.h
++volk_16i_x5_add_quad_16i_x4.h
++volk_16ic_convert_32fc.h
++volk_16ic_deinterleave_16i_x2.h
++volk_16ic_deinterleave_real_16i.h
++volk_16ic_deinterleave_real_8i.h
++volk_16ic_magnitude_16i.h
++volk_16ic_s32f_deinterleave_32f_x2.h
++volk_16ic_s32f_deinterleave_real_32f.h
++volk_16ic_s32f_magnitude_32f.h
++volk_16ic_x2_dot_prod_16ic.h
++volk_16ic_x2_multiply_16ic.h
++volk_16u_byteswap.h
++volk_16u_byteswappuppet_16u.h
++volk_32f_64f_add_64f.h
++volk_32f_64f_multiply_64f.h
++volk_32f_8u_polarbutterfly_32f.h
++volk_32f_8u_polarbutterflypuppet_32f.h
++volk_32f_accumulator_s32f.h
++volk_32f_acos_32f.h
++volk_32f_asin_32f.h
++volk_32f_atan_32f.h
++volk_32f_binary_slicer_32i.h
++volk_32f_binary_slicer_8i.h
++volk_32f_convert_64f.h
++volk_32f_cos_32f.h
++volk_32f_expfast_32f.h
++volk_32f_index_max_16u.h
++volk_32f_index_max_32u.h
++volk_32f_invsqrt_32f.h
++volk_32f_log2_32f.h
++volk_32f_null_32f.h
++volk_32f_s32f_32f_fm_detect_32f.h
++volk_32f_s32f_calc_spectral_noise_floor_32f.h
++volk_32f_s32f_convert_16i.h
++volk_32f_s32f_convert_32i.h
++volk_32f_s32f_convert_8i.h
++volk_32f_s32f_mod_rangepuppet_32f.h
++volk_32f_s32f_multiply_32f.h
++volk_32f_s32f_normalize.h
++volk_32f_s32f_power_32f.h
++volk_32f_s32f_s32f_mod_range_32f.h
++volk_32f_s32f_stddev_32f.h
++volk_32f_sin_32f.h
++volk_32f_sqrt_32f.h
++volk_32f_stddev_and_mean_32f_x2.h
++volk_32f_tan_32f.h
++volk_32f_tanh_32f.h
++volk_32f_x2_add_32f.h
++volk_32f_x2_divide_32f.h
++volk_32f_x2_dot_prod_16i.h
++volk_32f_x2_dot_prod_32f.h
++volk_32f_x2_fm_detectpuppet_32f.h
++volk_32f_x2_interleave_32fc.h
++volk_32f_x2_max_32f.h
++volk_32f_x2_min_32f.h
++volk_32f_x2_multiply_32f.h
++volk_32f_x2_pow_32f.h
++volk_32f_x2_s32f_interleave_16ic.h
++volk_32f_x2_subtract_32f.h
++volk_32f_x3_sum_of_poly_32f.h
++volk_32fc_32f_add_32fc.h
++volk_32fc_32f_dot_prod_32fc.h
++volk_32fc_32f_multiply_32fc.h
++volk_32fc_conjugate_32fc.h
++volk_32fc_convert_16ic.h
++volk_32fc_deinterleave_32f_x2.h
++volk_32fc_deinterleave_64f_x2.h
++volk_32fc_deinterleave_imag_32f.h
++volk_32fc_deinterleave_real_32f.h
++volk_32fc_deinterleave_real_64f.h
++volk_32fc_index_max_16u.h
++volk_32fc_index_max_32u.h
++volk_32fc_magnitude_32f.h
++volk_32fc_magnitude_squared_32f.h
++volk_32fc_s32f_atan2_32f.h
++volk_32fc_s32f_deinterleave_real_16i.h
++volk_32fc_s32f_magnitude_16i.h
++volk_32fc_s32f_power_32fc.h
++volk_32fc_s32f_power_spectrum_32f.h
++volk_32fc_s32f_x2_power_spectral_density_32f.h
++volk_32fc_s32fc_multiply_32fc.h
++volk_32fc_s32fc_rotatorpuppet_32fc.h
++volk_32fc_s32fc_x2_rotator_32fc.h
++volk_32fc_x2_add_32fc.h
++volk_32fc_x2_conjugate_dot_prod_32fc.h
++volk_32fc_x2_divide_32fc.h
++volk_32fc_x2_dot_prod_32fc.h
++volk_32fc_x2_multiply_32fc.h
++volk_32fc_x2_multiply_conjugate_32fc.h
++volk_32fc_x2_s32f_square_dist_scalar_mult_32f.h
++volk_32fc_x2_square_dist_32f.h
++volk_32i_s32f_convert_32f.h
++volk_32i_x2_and_32i.h
++volk_32i_x2_or_32i.h
++volk_32u_byteswap.h
++volk_32u_byteswappuppet_32u.h
++volk_32u_popcnt.h
++volk_32u_popcntpuppet_32u.h
++volk_32u_reverse_32u.h
++volk_64f_convert_32f.h
++volk_64f_x2_add_64f.h
++volk_64f_x2_max_64f.h
++volk_64f_x2_min_64f.h
++volk_64f_x2_multiply_64f.h
++volk_64u_byteswap.h
++volk_64u_byteswappuppet_64u.h
++volk_64u_popcnt.h
++volk_64u_popcntpuppet_64u.h
++volk_8i_convert_16i.h
++volk_8i_s32f_convert_32f.h
++volk_8ic_deinterleave_16i_x2.h
++volk_8ic_deinterleave_real_16i.h
++volk_8ic_deinterleave_real_8i.h
++volk_8ic_s32f_deinterleave_32f_x2.h
++volk_8ic_s32f_deinterleave_real_32f.h
++volk_8ic_x2_multiply_conjugate_16ic.h
++volk_8ic_x2_s32f_multiply_conjugate_32fc.h
++volk_8u_conv_k7_r2puppet_8u.h
++volk_8u_x2_encodeframepolar_8u.h
++volk_8u_x3_encodepolar_8u_x2.h
++volk_8u_x3_encodepolarpuppet_8u.h
++volk_8u_x4_conv_k7_r2_8u.h
++volk_avx_intrinsics.h
++volk_common.h
++volk_complex.h
++volk_config_fixed.h
++volk_cpu.h
++volk_malloc.h
++volk_neon_intrinsics.h
++volk_prefs.h
++volk_sse3_intrinsics.h
++volk_sse_intrinsics.h
++volk_typedefs.h
++
++
++

Libraries  1 


++
++libvolk.so.1.4
++
++
++


++

Test Info


Library Name: volk
Version #1: 1.4
Version #2: 2.0
Arch: x86_64
Subject: Source Compatibility

Test Results


Total Header Files: 143
Total Libraries: 1
Total Symbols / Types: 705 / 246
Compatibility: 99.9%

Problem Summary


Added Symbols: 5
Removed Symbols (High): 0
Problems with Data Types: High 0, Medium 0, Low 4
Problems with Symbols: High 0, Medium 1, Low 0
Problems with Constants: Low 18
Other Changes in Constants: 5

Added Symbols  5 


++constants.h
++volk_available_machines ( )
++volk_c_compiler ( )
++volk_compiler_flags ( )
++volk_prefix ( )
++volk_version ( )
++
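These constants.h helpers report how the installed VOLK was built, and the 1.4-to-2.0 source view lists them as newly visible symbols. A small usage sketch (added for illustration, not part of the report):

    #include <volk/constants.h>
    #include <stdio.h>

    int main(void)
    {
        printf("VOLK version:       %s\n", volk_version());
        printf("C compiler:         %s\n", volk_c_compiler());
        printf("Compiler flags:     %s\n", volk_compiler_flags());
        printf("Install prefix:     %s\n", volk_prefix());
        printf("Available machines: %s\n", volk_available_machines());
        return 0;
    }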
++ ++

Problems with Symbols, Medium Severity  1 


++volk_prefs.h
++ ++[+] volk_get_config_path ( char* p1 )  1  ++
++ ++
++ ++

Problems with Data Types, Low Severity  4 


++volk_cpu.h
++ ++[+] struct VOLK_CPU  4  ++
++ ++ ++
++ ++

Problems with Constants, Low Severity  18 


++volk_config_fixed.h
++ ++[+] LV_32 ++
++ ++ ++ ++[+] LV_3DNOW ++
++ ++ ++ ++[+] LV_64 ++
++ ++ ++ ++[+] LV_ABM ++
++ ++ ++ ++[+] LV_AVX ++
++ ++ ++ ++[+] LV_AVX2 ++
++ ++ ++ ++[+] LV_FMA ++
++ ++ ++ ++[+] LV_MMX ++
++ ++ ++ ++[+] LV_NORC ++
++ ++ ++ ++[+] LV_ORC ++
++ ++ ++ ++[+] LV_POPCOUNT ++
++ ++ ++ ++[+] LV_SSE ++
++ ++ ++ ++[+] LV_SSE2 ++
++ ++ ++ ++[+] LV_SSE3 ++
++ ++ ++ ++[+] LV_SSE4_1 ++
++ ++ ++ ++[+] LV_SSE4_2 ++
++ ++ ++ ++[+] LV_SSE4_A ++
++ ++ ++ ++[+] LV_SSSE3 ++
++ ++ ++
++ ++

Other Changes in Constants  5 


++volk_32f_log2_32f.h
++ ++[+] LOG_POLY_DEGREE ++
++ ++ ++
++volk_config_fixed.h
++ ++[+] LV_AVX512CD ++
++ ++ ++ ++[+] LV_AVX512F ++
++ ++ ++ ++[+] LV_NEONV7 ++
++ ++ ++ ++[+] LV_NEONV8 ++
++ ++ ++
++

Header Files  143 


++
++saturation_arithmetic.h
++volk.h
++volk_16i_32fc_dot_prod_32fc.h
++volk_16i_branch_4_state_8.h
++volk_16i_convert_8i.h
++volk_16i_max_star_16i.h
++volk_16i_max_star_horizontal_16i.h
++volk_16i_permute_and_scalar_add.h
++volk_16i_s32f_convert_32f.h
++volk_16i_x4_quad_max_star_16i.h
++volk_16i_x5_add_quad_16i_x4.h
++volk_16ic_convert_32fc.h
++volk_16ic_deinterleave_16i_x2.h
++volk_16ic_deinterleave_real_16i.h
++volk_16ic_deinterleave_real_8i.h
++volk_16ic_magnitude_16i.h
++volk_16ic_s32f_deinterleave_32f_x2.h
++volk_16ic_s32f_deinterleave_real_32f.h
++volk_16ic_s32f_magnitude_32f.h
++volk_16ic_x2_dot_prod_16ic.h
++volk_16ic_x2_multiply_16ic.h
++volk_16u_byteswap.h
++volk_16u_byteswappuppet_16u.h
++volk_32f_64f_add_64f.h
++volk_32f_64f_multiply_64f.h
++volk_32f_8u_polarbutterfly_32f.h
++volk_32f_8u_polarbutterflypuppet_32f.h
++volk_32f_accumulator_s32f.h
++volk_32f_acos_32f.h
++volk_32f_asin_32f.h
++volk_32f_atan_32f.h
++volk_32f_binary_slicer_32i.h
++volk_32f_binary_slicer_8i.h
++volk_32f_convert_64f.h
++volk_32f_cos_32f.h
++volk_32f_expfast_32f.h
++volk_32f_index_max_16u.h
++volk_32f_index_max_32u.h
++volk_32f_invsqrt_32f.h
++volk_32f_log2_32f.h
++volk_32f_null_32f.h
++volk_32f_s32f_32f_fm_detect_32f.h
++volk_32f_s32f_calc_spectral_noise_floor_32f.h
++volk_32f_s32f_convert_16i.h
++volk_32f_s32f_convert_32i.h
++volk_32f_s32f_convert_8i.h
++volk_32f_s32f_mod_rangepuppet_32f.h
++volk_32f_s32f_multiply_32f.h
++volk_32f_s32f_normalize.h
++volk_32f_s32f_power_32f.h
++volk_32f_s32f_s32f_mod_range_32f.h
++volk_32f_s32f_stddev_32f.h
++volk_32f_sin_32f.h
++volk_32f_sqrt_32f.h
++volk_32f_stddev_and_mean_32f_x2.h
++volk_32f_tan_32f.h
++volk_32f_tanh_32f.h
++volk_32f_x2_add_32f.h
++volk_32f_x2_divide_32f.h
++volk_32f_x2_dot_prod_16i.h
++volk_32f_x2_dot_prod_32f.h
++volk_32f_x2_fm_detectpuppet_32f.h
++volk_32f_x2_interleave_32fc.h
++volk_32f_x2_max_32f.h
++volk_32f_x2_min_32f.h
++volk_32f_x2_multiply_32f.h
++volk_32f_x2_pow_32f.h
++volk_32f_x2_s32f_interleave_16ic.h
++volk_32f_x2_subtract_32f.h
++volk_32f_x3_sum_of_poly_32f.h
++volk_32fc_32f_add_32fc.h
++volk_32fc_32f_dot_prod_32fc.h
++volk_32fc_32f_multiply_32fc.h
++volk_32fc_conjugate_32fc.h
++volk_32fc_convert_16ic.h
++volk_32fc_deinterleave_32f_x2.h
++volk_32fc_deinterleave_64f_x2.h
++volk_32fc_deinterleave_imag_32f.h
++volk_32fc_deinterleave_real_32f.h
++volk_32fc_deinterleave_real_64f.h
++volk_32fc_index_max_16u.h
++volk_32fc_index_max_32u.h
++volk_32fc_magnitude_32f.h
++volk_32fc_magnitude_squared_32f.h
++volk_32fc_s32f_atan2_32f.h
++volk_32fc_s32f_deinterleave_real_16i.h
++volk_32fc_s32f_magnitude_16i.h
++volk_32fc_s32f_power_32fc.h
++volk_32fc_s32f_power_spectrum_32f.h
++volk_32fc_s32f_x2_power_spectral_density_32f.h
++volk_32fc_s32fc_multiply_32fc.h
++volk_32fc_s32fc_rotatorpuppet_32fc.h
++volk_32fc_s32fc_x2_rotator_32fc.h
++volk_32fc_x2_add_32fc.h
++volk_32fc_x2_conjugate_dot_prod_32fc.h
++volk_32fc_x2_divide_32fc.h
++volk_32fc_x2_dot_prod_32fc.h
++volk_32fc_x2_multiply_32fc.h
++volk_32fc_x2_multiply_conjugate_32fc.h
++volk_32fc_x2_s32f_square_dist_scalar_mult_32f.h
++volk_32fc_x2_square_dist_32f.h
++volk_32i_s32f_convert_32f.h
++volk_32i_x2_and_32i.h
++volk_32i_x2_or_32i.h
++volk_32u_byteswap.h
++volk_32u_byteswappuppet_32u.h
++volk_32u_popcnt.h
++volk_32u_popcntpuppet_32u.h
++volk_32u_reverse_32u.h
++volk_64f_convert_32f.h
++volk_64f_x2_add_64f.h
++volk_64f_x2_max_64f.h
++volk_64f_x2_min_64f.h
++volk_64f_x2_multiply_64f.h
++volk_64u_byteswap.h
++volk_64u_byteswappuppet_64u.h
++volk_64u_popcnt.h
++volk_64u_popcntpuppet_64u.h
++volk_8i_convert_16i.h
++volk_8i_s32f_convert_32f.h
++volk_8ic_deinterleave_16i_x2.h
++volk_8ic_deinterleave_real_16i.h
++volk_8ic_deinterleave_real_8i.h
++volk_8ic_s32f_deinterleave_32f_x2.h
++volk_8ic_s32f_deinterleave_real_32f.h
++volk_8ic_x2_multiply_conjugate_16ic.h
++volk_8ic_x2_s32f_multiply_conjugate_32fc.h
++volk_8u_conv_k7_r2puppet_8u.h
++volk_8u_x2_encodeframepolar_8u.h
++volk_8u_x3_encodepolar_8u_x2.h
++volk_8u_x3_encodepolarpuppet_8u.h
++volk_8u_x4_conv_k7_r2_8u.h
++volk_avx_intrinsics.h
++volk_common.h
++volk_complex.h
++volk_config_fixed.h
++volk_cpu.h
++volk_malloc.h
++volk_neon_intrinsics.h
++volk_prefs.h
++volk_sse3_intrinsics.h
++volk_sse_intrinsics.h
++volk_typedefs.h
++
++
++

Libraries  1 


++
++libvolk.so.1.4
++
++
++



++ ++
diff --cc debian/2.2.0_to_2.2.1_compat_report.html
index 0000000,0000000..3c8e2f0
new file mode 100644
--- /dev/null
+++ b/debian/2.2.0_to_2.2.1_compat_report.html
@@@ -1,0 -1,0 +1,769 @@@
++volk: 2.2.0-3 to 2.2.1-1 compatibility report

API compatibility report for the volk library between 2.2.0-3 and 2.2.1-1 versions on x86_64

++ Binary Compatibility | Source Compatibility

Test Info


Library Name: volk
Version #1: 2.2.0-3
Version #2: 2.2.1-1
Arch: x86_64
GCC Version: 8
Subject: Binary Compatibility

Test Results


Total Header Files: 148
Total Libraries: 1
Total Symbols / Types: 670 / 246
Compatibility: 100%

Problem Summary


Added Symbols: 0
Removed Symbols (High): 0
Problems with Data Types: High 0, Medium 0, Low 0
Problems with Symbols: High 0, Medium 0, Low 0
Problems with Constants: Low 0

Header Files  148 


++
++constants.h
++saturation_arithmetic.h
++volk.h
++volk_16i_32fc_dot_prod_32fc.h
++volk_16i_branch_4_state_8.h
++volk_16i_convert_8i.h
++volk_16i_max_star_16i.h
++volk_16i_max_star_horizontal_16i.h
++volk_16i_permute_and_scalar_add.h
++volk_16i_s32f_convert_32f.h
++volk_16i_x4_quad_max_star_16i.h
++volk_16i_x5_add_quad_16i_x4.h
++volk_16ic_convert_32fc.h
++volk_16ic_deinterleave_16i_x2.h
++volk_16ic_deinterleave_real_16i.h
++volk_16ic_deinterleave_real_8i.h
++volk_16ic_magnitude_16i.h
++volk_16ic_s32f_deinterleave_32f_x2.h
++volk_16ic_s32f_deinterleave_real_32f.h
++volk_16ic_s32f_magnitude_32f.h
++volk_16ic_x2_dot_prod_16ic.h
++volk_16ic_x2_multiply_16ic.h
++volk_16u_byteswap.h
++volk_16u_byteswappuppet_16u.h
++volk_32f_64f_add_64f.h
++volk_32f_64f_multiply_64f.h
++volk_32f_8u_polarbutterfly_32f.h
++volk_32f_8u_polarbutterflypuppet_32f.h
++volk_32f_accumulator_s32f.h
++volk_32f_acos_32f.h
++volk_32f_asin_32f.h
++volk_32f_atan_32f.h
++volk_32f_binary_slicer_32i.h
++volk_32f_binary_slicer_8i.h
++volk_32f_convert_64f.h
++volk_32f_cos_32f.h
++volk_32f_expfast_32f.h
++volk_32f_index_max_16u.h
++volk_32f_index_max_32u.h
++volk_32f_invsqrt_32f.h
++volk_32f_log2_32f.h
++volk_32f_null_32f.h
++volk_32f_s32f_32f_fm_detect_32f.h
++volk_32f_s32f_calc_spectral_noise_floor_32f.h
++volk_32f_s32f_convert_16i.h
++volk_32f_s32f_convert_32i.h
++volk_32f_s32f_convert_8i.h
++volk_32f_s32f_mod_rangepuppet_32f.h
++volk_32f_s32f_multiply_32f.h
++volk_32f_s32f_normalize.h
++volk_32f_s32f_power_32f.h
++volk_32f_s32f_s32f_mod_range_32f.h
++volk_32f_s32f_stddev_32f.h
++volk_32f_sin_32f.h
++volk_32f_sqrt_32f.h
++volk_32f_stddev_and_mean_32f_x2.h
++volk_32f_tan_32f.h
++volk_32f_tanh_32f.h
++volk_32f_x2_add_32f.h
++volk_32f_x2_divide_32f.h
++volk_32f_x2_dot_prod_16i.h
++volk_32f_x2_dot_prod_32f.h
++volk_32f_x2_fm_detectpuppet_32f.h
++volk_32f_x2_interleave_32fc.h
++volk_32f_x2_max_32f.h
++volk_32f_x2_min_32f.h
++volk_32f_x2_multiply_32f.h
++volk_32f_x2_pow_32f.h
++volk_32f_x2_s32f_interleave_16ic.h
++volk_32f_x2_subtract_32f.h
++volk_32f_x3_sum_of_poly_32f.h
++volk_32fc_32f_add_32fc.h
++volk_32fc_32f_dot_prod_32fc.h
++volk_32fc_32f_multiply_32fc.h
++volk_32fc_conjugate_32fc.h
++volk_32fc_convert_16ic.h
++volk_32fc_deinterleave_32f_x2.h
++volk_32fc_deinterleave_64f_x2.h
++volk_32fc_deinterleave_imag_32f.h
++volk_32fc_deinterleave_real_32f.h
++volk_32fc_deinterleave_real_64f.h
++volk_32fc_index_max_16u.h
++volk_32fc_index_max_32u.h
++volk_32fc_magnitude_32f.h
++volk_32fc_magnitude_squared_32f.h
++volk_32fc_s32f_atan2_32f.h
++volk_32fc_s32f_deinterleave_real_16i.h
++volk_32fc_s32f_magnitude_16i.h
++volk_32fc_s32f_power_32fc.h
++volk_32fc_s32f_power_spectrum_32f.h
++volk_32fc_s32f_x2_power_spectral_density_32f.h
++volk_32fc_s32fc_multiply_32fc.h
++volk_32fc_s32fc_rotatorpuppet_32fc.h
++volk_32fc_s32fc_x2_rotator_32fc.h
++volk_32fc_x2_add_32fc.h
++volk_32fc_x2_conjugate_dot_prod_32fc.h
++volk_32fc_x2_divide_32fc.h
++volk_32fc_x2_dot_prod_32fc.h
++volk_32fc_x2_multiply_32fc.h
++volk_32fc_x2_multiply_conjugate_32fc.h
++volk_32fc_x2_s32f_square_dist_scalar_mult_32f.h
++volk_32fc_x2_s32fc_multiply_conjugate_add_32fc.h
++volk_32fc_x2_square_dist_32f.h
++volk_32i_s32f_convert_32f.h
++volk_32i_x2_and_32i.h
++volk_32i_x2_or_32i.h
++volk_32u_byteswap.h
++volk_32u_byteswappuppet_32u.h
++volk_32u_popcnt.h
++volk_32u_popcntpuppet_32u.h
++volk_32u_reverse_32u.h
++volk_64f_convert_32f.h
++volk_64f_x2_add_64f.h
++volk_64f_x2_max_64f.h
++volk_64f_x2_min_64f.h
++volk_64f_x2_multiply_64f.h
++volk_64u_byteswap.h
++volk_64u_byteswappuppet_64u.h
++volk_64u_popcnt.h
++volk_64u_popcntpuppet_64u.h
++volk_8i_convert_16i.h
++volk_8i_s32f_convert_32f.h
++volk_8ic_deinterleave_16i_x2.h
++volk_8ic_deinterleave_real_16i.h
++volk_8ic_deinterleave_real_8i.h
++volk_8ic_s32f_deinterleave_32f_x2.h
++volk_8ic_s32f_deinterleave_real_32f.h
++volk_8ic_x2_multiply_conjugate_16ic.h
++volk_8ic_x2_s32f_multiply_conjugate_32fc.h
++volk_8u_conv_k7_r2puppet_8u.h
++volk_8u_x2_encodeframepolar_8u.h
++volk_8u_x3_encodepolar_8u_x2.h
++volk_8u_x3_encodepolarpuppet_8u.h
++volk_8u_x4_conv_k7_r2_8u.h
++volk_alloc.hh
++volk_avx2_intrinsics.h
++volk_avx_intrinsics.h
++volk_common.h
++volk_complex.h
++volk_config_fixed.h
++volk_cpu.h
++volk_malloc.h
++volk_neon_intrinsics.h
++volk_prefs.h
++volk_sse3_intrinsics.h
++volk_sse_intrinsics.h
++volk_typedefs.h
++volk_version.h
++
++
++

Libraries  1 


++
++libvolk.so.2.2
++
++
++


++

Test Info


Library Name: volk
Version #1: 2.2.0-3
Version #2: 2.2.1-1
Arch: x86_64
Subject: Source Compatibility

Test Results


Total Header Files: 148
Total Libraries: 1
Total Symbols / Types: 730 / 249
Compatibility: 100%

Problem Summary


Added Symbols: 0
Removed Symbols (High): 0
Problems with Data Types: High 0, Medium 0, Low 0
Problems with Symbols: High 0, Medium 0, Low 0
Problems with Constants: Low 0

Header Files  148 


++
++constants.h
++saturation_arithmetic.h
++volk.h
++volk_16i_32fc_dot_prod_32fc.h
++volk_16i_branch_4_state_8.h
++volk_16i_convert_8i.h
++volk_16i_max_star_16i.h
++volk_16i_max_star_horizontal_16i.h
++volk_16i_permute_and_scalar_add.h
++volk_16i_s32f_convert_32f.h
++volk_16i_x4_quad_max_star_16i.h
++volk_16i_x5_add_quad_16i_x4.h
++volk_16ic_convert_32fc.h
++volk_16ic_deinterleave_16i_x2.h
++volk_16ic_deinterleave_real_16i.h
++volk_16ic_deinterleave_real_8i.h
++volk_16ic_magnitude_16i.h
++volk_16ic_s32f_deinterleave_32f_x2.h
++volk_16ic_s32f_deinterleave_real_32f.h
++volk_16ic_s32f_magnitude_32f.h
++volk_16ic_x2_dot_prod_16ic.h
++volk_16ic_x2_multiply_16ic.h
++volk_16u_byteswap.h
++volk_16u_byteswappuppet_16u.h
++volk_32f_64f_add_64f.h
++volk_32f_64f_multiply_64f.h
++volk_32f_8u_polarbutterfly_32f.h
++volk_32f_8u_polarbutterflypuppet_32f.h
++volk_32f_accumulator_s32f.h
++volk_32f_acos_32f.h
++volk_32f_asin_32f.h
++volk_32f_atan_32f.h
++volk_32f_binary_slicer_32i.h
++volk_32f_binary_slicer_8i.h
++volk_32f_convert_64f.h
++volk_32f_cos_32f.h
++volk_32f_expfast_32f.h
++volk_32f_index_max_16u.h
++volk_32f_index_max_32u.h
++volk_32f_invsqrt_32f.h
++volk_32f_log2_32f.h
++volk_32f_null_32f.h
++volk_32f_s32f_32f_fm_detect_32f.h
++volk_32f_s32f_calc_spectral_noise_floor_32f.h
++volk_32f_s32f_convert_16i.h
++volk_32f_s32f_convert_32i.h
++volk_32f_s32f_convert_8i.h
++volk_32f_s32f_mod_rangepuppet_32f.h
++volk_32f_s32f_multiply_32f.h
++volk_32f_s32f_normalize.h
++volk_32f_s32f_power_32f.h
++volk_32f_s32f_s32f_mod_range_32f.h
++volk_32f_s32f_stddev_32f.h
++volk_32f_sin_32f.h
++volk_32f_sqrt_32f.h
++volk_32f_stddev_and_mean_32f_x2.h
++volk_32f_tan_32f.h
++volk_32f_tanh_32f.h
++volk_32f_x2_add_32f.h
++volk_32f_x2_divide_32f.h
++volk_32f_x2_dot_prod_16i.h
++volk_32f_x2_dot_prod_32f.h
++volk_32f_x2_fm_detectpuppet_32f.h
++volk_32f_x2_interleave_32fc.h
++volk_32f_x2_max_32f.h
++volk_32f_x2_min_32f.h
++volk_32f_x2_multiply_32f.h
++volk_32f_x2_pow_32f.h
++volk_32f_x2_s32f_interleave_16ic.h
++volk_32f_x2_subtract_32f.h
++volk_32f_x3_sum_of_poly_32f.h
++volk_32fc_32f_add_32fc.h
++volk_32fc_32f_dot_prod_32fc.h
++volk_32fc_32f_multiply_32fc.h
++volk_32fc_conjugate_32fc.h
++volk_32fc_convert_16ic.h
++volk_32fc_deinterleave_32f_x2.h
++volk_32fc_deinterleave_64f_x2.h
++volk_32fc_deinterleave_imag_32f.h
++volk_32fc_deinterleave_real_32f.h
++volk_32fc_deinterleave_real_64f.h
++volk_32fc_index_max_16u.h
++volk_32fc_index_max_32u.h
++volk_32fc_magnitude_32f.h
++volk_32fc_magnitude_squared_32f.h
++volk_32fc_s32f_atan2_32f.h
++volk_32fc_s32f_deinterleave_real_16i.h
++volk_32fc_s32f_magnitude_16i.h
++volk_32fc_s32f_power_32fc.h
++volk_32fc_s32f_power_spectrum_32f.h
++volk_32fc_s32f_x2_power_spectral_density_32f.h
++volk_32fc_s32fc_multiply_32fc.h
++volk_32fc_s32fc_rotatorpuppet_32fc.h
++volk_32fc_s32fc_x2_rotator_32fc.h
++volk_32fc_x2_add_32fc.h
++volk_32fc_x2_conjugate_dot_prod_32fc.h
++volk_32fc_x2_divide_32fc.h
++volk_32fc_x2_dot_prod_32fc.h
++volk_32fc_x2_multiply_32fc.h
++volk_32fc_x2_multiply_conjugate_32fc.h
++volk_32fc_x2_s32f_square_dist_scalar_mult_32f.h
++volk_32fc_x2_s32fc_multiply_conjugate_add_32fc.h
++volk_32fc_x2_square_dist_32f.h
++volk_32i_s32f_convert_32f.h
++volk_32i_x2_and_32i.h
++volk_32i_x2_or_32i.h
++volk_32u_byteswap.h
++volk_32u_byteswappuppet_32u.h
++volk_32u_popcnt.h
++volk_32u_popcntpuppet_32u.h
++volk_32u_reverse_32u.h
++volk_64f_convert_32f.h
++volk_64f_x2_add_64f.h
++volk_64f_x2_max_64f.h
++volk_64f_x2_min_64f.h
++volk_64f_x2_multiply_64f.h
++volk_64u_byteswap.h
++volk_64u_byteswappuppet_64u.h
++volk_64u_popcnt.h
++volk_64u_popcntpuppet_64u.h
++volk_8i_convert_16i.h
++volk_8i_s32f_convert_32f.h
++volk_8ic_deinterleave_16i_x2.h
++volk_8ic_deinterleave_real_16i.h
++volk_8ic_deinterleave_real_8i.h
++volk_8ic_s32f_deinterleave_32f_x2.h
++volk_8ic_s32f_deinterleave_real_32f.h
++volk_8ic_x2_multiply_conjugate_16ic.h
++volk_8ic_x2_s32f_multiply_conjugate_32fc.h
++volk_8u_conv_k7_r2puppet_8u.h
++volk_8u_x2_encodeframepolar_8u.h
++volk_8u_x3_encodepolar_8u_x2.h
++volk_8u_x3_encodepolarpuppet_8u.h
++volk_8u_x4_conv_k7_r2_8u.h
++volk_alloc.hh
++volk_avx2_intrinsics.h
++volk_avx_intrinsics.h
++volk_common.h
++volk_complex.h
++volk_config_fixed.h
++volk_cpu.h
++volk_malloc.h
++volk_neon_intrinsics.h
++volk_prefs.h
++volk_sse3_intrinsics.h
++volk_sse_intrinsics.h
++volk_typedefs.h
++volk_version.h
++
++
++

Libraries  1 


++
++libvolk.so.2.2
++
++
++



++ ++
++ ++ diff --cc debian/changelog index 0000000,0000000..2553b1e new file mode 100644 --- /dev/null +++ b/debian/changelog @@@ -1,0 -1,0 +1,434 @@@ ++volk (2.2.1-2) unstable; urgency=medium ++ ++ * update to v2.2.1-11-gfaf230e ++ * cmake: Remove the ORC from the VOLK public link interface ++ * Fix the broken index max kernels ++ ++ -- A. Maitland Bottoms Fri, 27 Mar 2020 21:48:10 -0400 ++ ++volk (2.2.1-1) unstable; urgency=high ++ ++ * New upstream bugfix release ++ reason for high urgency: ++ - Fix loop bound in AVX rotator (only one fixed in 2.2.0-3) ++ - Fix out-of-bounds read in AVX2 square dist kernel ++ - Fix length checks in AVX2 index max kernels ++ ++ -- A. Maitland Bottoms Mon, 24 Feb 2020 18:08:05 -0500 ++ ++volk (2.2.0-3) unstable; urgency=high ++ ++ * Update to v2.2.0-6-g5701f8f ++ reason for high urgency: ++ - Fix loop bound in AVX rotator ++ ++ -- A. Maitland Bottoms Sun, 23 Feb 2020 23:49:18 -0500 ++ ++volk (2.2.0-2) unstable; urgency=medium ++ ++ * Upload to unstable ++ ++ -- A. Maitland Bottoms Tue, 18 Feb 2020 17:56:58 -0500 ++ ++volk (2.2.0-1) experimental; urgency=medium ++ ++ * New upstream release ++ - Remove build dependency on python six ++ - Fixup VolkConfigVersion ++ - add volk_version.h ++ ++ -- A. Maitland Bottoms Sun, 16 Feb 2020 18:25:20 -0500 ++ ++volk (2.1.0-2) unstable; urgency=medium ++ ++ * Upload to unstable ++ ++ -- A. Maitland Bottoms Sun, 05 Jan 2020 23:17:57 -0500 ++ ++volk (2.1.0-1) experimental; urgency=medium ++ ++ * New upstream release ++ - The AVX FMA rotator bug is fixed ++ - VOLK offers `volk::vector<>` for C++ to follow RAII ++ - Use C++17 `std::filesystem` ++ - This enables VOLK to be built without Boost if available! ++ - lots of bugfixes ++ - more optimized kernels, especially more NEON versions ++ * Upload to experimental for new ABI library package libvolk2.1 ++ ++ -- A. Maitland Bottoms Sun, 22 Dec 2019 10:27:36 -0500 ++ ++volk (2.0.0-3) unstable; urgency=medium ++ ++ * update to v2.0.0-4-gf04a46f ++ ++ -- A. Maitland Bottoms Thu, 14 Nov 2019 22:47:23 -0500 ++ ++volk (2.0.0-2) unstable; urgency=medium ++ ++ * Upload to unstable ++ ++ -- A. Maitland Bottoms Mon, 12 Aug 2019 22:49:11 -0400 ++ ++volk (2.0.0-1) experimental; urgency=medium ++ ++ * New upstream release ++ ++ -- A. Maitland Bottoms Wed, 07 Aug 2019 23:31:20 -0400 ++ ++volk (1.4-4) unstable; urgency=medium ++ ++ * working volk_modtool with Python 3 ++ * build and install libvolk.a ++ ++ -- A. Maitland Bottoms Mon, 29 Oct 2018 01:32:05 -0400 ++ ++volk (1.4-3) unstable; urgency=medium ++ ++ * update to v1.4-9-g297fefd ++ Added an AVX protokernel for volk_32fc_x2_32f_square_dist_scalar_mult_32f ++ fixed a buffer over-read and over-write in ++ volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_avx ++ Fix 32u_reverse_32u for ARM ++ ++ -- A. Maitland Bottoms Sat, 12 May 2018 15:25:04 -0400 ++ ++volk (1.4-2) unstable; urgency=medium ++ ++ * Upload to unstable, needed by gnuradio (>= 3.7.12.0) ++ ++ -- A. Maitland Bottoms Tue, 03 Apr 2018 01:03:19 -0400 ++ ++volk (1.4-1) experimental; urgency=medium ++ ++ * New upstream release ++ upstream changelog http://libvolk.org/release-v14.html ++ ++ -- A. Maitland Bottoms Tue, 27 Mar 2018 22:57:42 -0400 ++ ++volk (1.3.1-1) unstable; urgency=medium ++ ++ * New upstream bugfix release ++ * Refresh all debian patches for use with git am ++ ++ -- A. 
Maitland Bottoms Tue, 27 Mar 2018 21:54:29 -0400 ++ ++volk (1.3-3) unstable; urgency=medium ++ ++ * update to v1.3-23-g0109b2e ++ * update debian/libvolk1-dev.abi.tar.gz.amd64 ++ * Add breaks/replaces gnuradio (<=3.7.2.1) (LP: #1614235) ++ ++ -- A. Maitland Bottoms Sun, 04 Feb 2018 13:12:21 -0500 ++ ++volk (1.3-2) unstable; urgency=medium ++ ++ * update to v1.3-16-g28b03a9 ++ apps: fix profile update reading end of lines ++ qa: lower tolerance for 32fc_mag to fix issue #96 ++ * include upstream master patch to sort input files ++ ++ -- A. Maitland Bottoms Sun, 27 Aug 2017 13:44:55 -0400 ++ ++volk (1.3-1) unstable; urgency=medium ++ ++ * New upstream release ++ * The index_max kernels were named with the wrong output datatype. To ++ fix this there are new kernels that return a 32u (int32_t) and the ++ existing kernels had their signatures changed to return 16u (int16_t). ++ * The output to stdout and stderr has been shuffled around. There is no ++ longer a message that prints what VOLK machine is being used and the ++ warning messages go to stderr rather than stdout. ++ * The 32fc_index_max kernels previously were only accurate to the SSE ++ register width (4 points). This was a pretty serious and long-lived ++ bug that's been fixed and the QA updated appropriately. ++ ++ -- A. Maitland Bottoms Sat, 02 Jul 2016 16:30:47 -0400 ++ ++volk (1.2.2-2) unstable; urgency=medium ++ ++ * update to v1.2.2-11-g78c8bc4 (to follow gnuradio maint branch) ++ ++ -- A. Maitland Bottoms Sun, 19 Jun 2016 14:44:15 -0400 ++ ++volk (1.2.2-1) unstable; urgency=medium ++ ++ * New upstream release ++ ++ -- A. Maitland Bottoms Fri, 08 Apr 2016 00:12:10 -0400 ++ ++volk (1.2.1-2) unstable; urgency=medium ++ ++ * Upstream patches: ++ Fix some CMake complaints ++ The fix for compilation with cmake 3.5 ++ ++ -- A. Maitland Bottoms Wed, 23 Mar 2016 17:47:54 -0400 ++ ++volk (1.2.1-1) unstable; urgency=medium ++ ++ * New upstream release ++ ++ -- A. Maitland Bottoms Sun, 07 Feb 2016 19:38:32 -0500 ++ ++volk (1.2-1) unstable; urgency=medium ++ ++ * New upstream release ++ ++ -- A. Maitland Bottoms Thu, 24 Dec 2015 20:28:13 -0500 ++ ++volk (1.1.1-5) experimental; urgency=medium ++ ++ * update to v1.1.1-22-gef53547 to support gnuradio 3.7.9 ++ ++ -- A. Maitland Bottoms Fri, 11 Dec 2015 13:12:55 -0500 ++ ++volk (1.1.1-4) unstable; urgency=medium ++ ++ * more lintian fixes ++ ++ -- A. Maitland Bottoms Wed, 25 Nov 2015 21:49:58 -0500 ++ ++volk (1.1.1-3) unstable; urgency=medium ++ ++ * Lintian fixes Pre-Depends ++ ++ -- A. Maitland Bottoms Thu, 19 Nov 2015 21:24:27 -0500 ++ ++volk (1.1.1-2) unstable; urgency=medium ++ ++ * Note that libvolk1-dev replaces files in gnuradio-dev versions <<3.7.8 ++ (Closes: #802646) again. Thanks Andreas Beckmann. ++ ++ -- A. Maitland Bottoms Fri, 13 Nov 2015 18:45:49 -0500 ++ ++volk (1.1.1-1) unstable; urgency=medium ++ ++ * New upstream release ++ * New architectures exist for the AVX2 and FMA ISAs. ++ * The profiler now generates buffers that are vlen + a tiny amount and ++ generates random data to fill buffers. This is intended to catch bugs ++ in protokernels that write beyond num_points. ++ * Note that libvolk1-dev replaces files in earlier gnuradio-dev versions ++ (Closes: #802646) ++ ++ -- A. Maitland Bottoms Sun, 01 Nov 2015 18:45:43 -0500 ++ ++volk (1.1-4) unstable; urgency=medium ++ ++ * update to v1.1-12-g264addc ++ ++ -- A. Maitland Bottoms Tue, 29 Sep 2015 23:41:50 -0400 ++ ++volk (1.1-3) unstable; urgency=low ++ ++ * drop dh_acc to get reproducible builds ++ ++ -- A. 
Maitland Bottoms Fri, 11 Sep 2015 22:57:06 -0400 ++ ++volk (1.1-2) unstable; urgency=low ++ ++ * use dh-acc ++ ++ -- A. Maitland Bottoms Mon, 07 Sep 2015 15:45:20 -0400 ++ ++volk (1.1-1) unstable; urgency=medium ++ ++ * re-organize package naming convention ++ * New upstream release tag v1.1 ++ New architectures exist for the AVX2 and FMA ISAs. Along ++ with the build-system support the following kernels have ++ no proto-kernels taking advantage of these architectures: ++ ++ * 32f_x2_dot_prod_32f ++ * 32fc_x2_multiply_32fc ++ * 64_byteswap ++ * 32f_binary_slicer_8i ++ * 16u_byteswap ++ * 32u_byteswap ++ ++ QA/profiler ++ ----------- ++ ++ The profiler now generates buffers that are vlen + a tiny ++ amount and generates random data to fill buffers. This is ++ intended to catch bugs in protokernels that write beyond ++ num_points. ++ ++ -- A. Maitland Bottoms Wed, 26 Aug 2015 09:22:48 -0400 ++ ++volk (1.0.2-2) unstable; urgency=low ++ ++ * Use SOURCE_DATE_EPOCH from the environment, if defined, ++ rather than current date and time to implement volk_build_date() ++ (embedding build date in a library does not help reproducible builds) ++ * add watch file ++ ++ -- A. Maitland Bottoms Sat, 15 Aug 2015 17:43:15 -0400 ++ ++volk (1.0.2-1) unstable; urgency=medium ++ ++ * Maintenance release 24 Jul 2015 by Nathan West ++ * The major change is the CMake logic to add ASM protokernels. Rather ++ than depending on CFLAGS and ASMFLAGS we use the results of VOLK's ++ built in has_ARCH tests. All configurations should work the same as ++ before, but manually specifying CFLAGS and ASMFLAGS on the cmake call ++ for ARM native builds should no longer be necessary. ++ * The 32fc_s32fc_x2_rotator_32fc generic protokernel now includes a ++ previously implied header. ++ * Finally, there is a fix to return the "best" protokernel to the ++ dispatcher when no volk_config exists. Thanks to Alexandre Raymond for ++ pointing this out. ++ * with maint branch patch: ++ kernels-add-missing-include-arm_neon.h ++ * removed unused build-dependency on liboil0.3-dev (closes: #793626) ++ ++ -- A. Maitland Bottoms Wed, 05 Aug 2015 00:43:40 -0400 ++ ++volk (1.0.1-1) unstable; urgency=low ++ ++ * Maintenance Release v1.0.1 08 Jul 2015 by Nathan West ++ This is a maintenance release with bug fixes since the initial release of ++ v1.0 in April. ++ ++ * Contributors ++ ++ The following authors have contributed code to this release: ++ ++ Doug Geiger doug.geiger@bioradiation.net ++ Elliot Briggs elliot.briggs@gmail.com ++ Marcus Mueller marcus@hostalia.de ++ Nathan West nathan.west@okstate.edu ++ Tom Rondeau tom@trondeau.com ++ ++ * Kernels ++ ++ Several bug fixes in different kernels. The NEON implementations of the ++ following kernels have been fixed: ++ ++ 32f_x2_add_32f ++ 32f_x2_dot_prod_32f ++ 32fc_s32fc_multiply_32fc ++ 32fc_x2_multiply_32fc ++ ++ Additionally the NEON asm based 32f_x2_add_32f protokernels were not being ++ used and are now included and available for use via the dispatcher. ++ ++ The 32f_s32f_x2_fm_detect_32f kernel now has a puppet. This solves QA seg ++ faults on 32-bit machines and provide a better test for this kernel. ++ ++ The 32fc_s32fc_x2_rotator_32fc generic protokernel replaced cabsf with ++ hypotf for better Android support. ++ ++ * Building ++ ++ Static builds now trigger the applications (volk_profile and ++ volk-config-info) to be statically linked. ++ ++ The file gcc_x86_cpuid.h has been removed since it was no longer being ++ used. 
Previously it provided cpuid functionality for ancient compilers ++ that we do not support. ++ ++ All build types now use -Wall. ++ ++ * QA and Testing ++ ++ The documentation around the --update option to volk_profile now makes it ++ clear that the option will only profile kernels without entries in ++ volk_profile. The signature of run_volk_tests with expanded args changed ++ signed types to unsigned types to reflect the actual input. ++ ++ The remaining changes are all non-functional changes to address issues ++ from Coverity. ++ ++ -- A. Maitland Bottoms Fri, 10 Jul 2015 17:57:42 -0400 ++ ++volk (1.0-5) unstable; urgency=medium ++ ++ * native-armv7-build-support skips neon on Debian armel (Closes: #789972) ++ ++ -- A. Maitland Bottoms Sat, 04 Jul 2015 12:36:36 -0400 ++ ++volk (1.0-4) unstable; urgency=low ++ ++ * update native-armv7-build-support patch from gnuradio volk package ++ ++ -- A. Maitland Bottoms Thu, 25 Jun 2015 16:38:49 -0400 ++ ++volk (1.0-3) unstable; urgency=medium ++ ++ * Add Breaks/Replaces (Closes: #789893, #789894) ++ * Allow failing tests ++ ++ -- A. Maitland Bottoms Thu, 25 Jun 2015 12:46:06 -0400 ++ ++volk (1.0-2) unstable; urgency=medium ++ ++ * kernels-add-missing-math.h-include-to-rotator ++ ++ -- A. Maitland Bottoms Wed, 24 Jun 2015 21:09:32 -0400 ++ ++volk (1.0-1) unstable; urgency=low ++ ++ * Initial package (Closes: #782417) ++ Initial Release 11 Apr 2015 by Nathan West ++ ++ VOLK 1.0 is available. This is the first release of VOLK as an independently ++ tracked sub-project of GNU Radio. ++ ++ * Contributors ++ ++ VOLK has been tracked separately from GNU Radio since 2014 Dec 23. ++ Contributors between the split and the initial release are ++ ++ Albert Holguin aholguin_77@yahoo.com ++ Doug Geiger doug.geiger@bioradiation.net ++ Elliot Briggs elliot.briggs@gmail.com ++ Julien Olivain julien.olivain@lsv.ens-cachan.fr ++ Michael Dickens michael.dickens@ettus.com ++ Nathan West nathan.west@okstate.edu ++ Tom Rondeau tom@trondeau.com ++ ++ * QA ++ ++ The test and profiler have significantly changed. The profiler supports ++ run-time changes to vlen and iters to help kernel development and provide ++ more flexibility on embedded systems. Additionally there is a new option ++ to update an existing volk_profile results file with only new kernels which ++ will save time when updating to newer versions of VOLK ++ ++ The QA system creates a static list of kernels and test cases. The QA ++ testing and profiler iterate over this static list rather than each source ++ file keeping its own list. The QA also emits XML results to ++ lib/.unittest/kernels.xml which is formatted similarly to JUnit results. ++ ++ * Modtool ++ ++ Modtool was updated to support the QA and profiler changes. ++ ++ * Kernels ++ ++ New proto-kernels: ++ ++ 16ic_deinterleave_real_8i_neon ++ 16ic_s32f_deinterleave_32f_neon ++ fix preprocessor errors for some compilers on byteswap and popcount puppets ++ ++ ORC was moved to the asm kernels directory. ++ volk_malloc ++ ++ The posix_memalign implementation of Volk_malloc now falls back to a standard ++ malloc if alignment is 1. ++ ++ * Miscellaneous ++ ++ Several build system and cmake changes have made it possible to build VOLK ++ both independently with proper soname versions and in-tree for projects ++ such as GNU Radio. ++ ++ The static builds take advantage of cmake object libraries to speed up builds. ++ ++ Finally, there are a number of changes to satisfy compiler warnings and make ++ QA work on multiple machines. ++ ++ -- A. 
Maitland Bottoms Sun, 12 Apr 2015 23:20:41 -0400 diff --cc debian/compat index 0000000,0000000..48082f7 new file mode 100644 --- /dev/null +++ b/debian/compat @@@ -1,0 -1,0 +1,1 @@@ ++12 diff --cc debian/control index 0000000,0000000..d53a4a2 new file mode 100644 --- /dev/null +++ b/debian/control @@@ -1,0 -1,0 +1,80 @@@ ++ ++Source: volk ++Section: libdevel ++Priority: optional ++Maintainer: A. Maitland Bottoms ++Build-Depends: cmake, ++ debhelper (>= 12~), ++ dh-python, ++ liborc-0.4-dev, ++ python3-dev, ++ python3-mako ++Build-Depends-Indep: doxygen ++Standards-Version: 4.5.0 ++Homepage: http://libvolk.org ++Vcs-Browser: https://salsa.debian.org/bottoms/pkg-volk ++Vcs-Git: https://salsa.debian.org/bottoms/pkg-volk.git ++ ++Package: libvolk2.2 ++Section: libs ++Architecture: any ++Pre-Depends: ${misc:Pre-Depends} ++Depends: ${misc:Depends}, ${shlibs:Depends} ++Multi-Arch: same ++Recommends: libvolk2-bin ++Suggests: libvolk2-dev ++Description: vector optimized functions ++ Vector-Optimized Library of Kernels is designed to help ++ applications work with the processor's SIMD instruction sets. These are ++ very powerful vector operations that can give signal processing a ++ huge boost in performance. ++ ++Package: libvolk2-dev ++Architecture: any ++Pre-Depends: ${misc:Pre-Depends} ++Depends: libvolk2.2 (=${binary:Version}), ${misc:Depends} ++Breaks: gnuradio-dev (<<3.7.8), libvolk-dev, libvolk1.0-dev, libvolk1-dev ++Replaces: gnuradio-dev (<<3.7.8), libvolk-dev, libvolk1.0-dev, libvolk1-dev ++Suggests: libvolk2-doc ++Multi-Arch: same ++Description: vector optimized function headers ++ Vector-Optimized Library of Kernels is designed to help ++ applications work with the processor's SIMD instruction sets. These are ++ very powerful vector operations that can give signal processing a ++ huge boost in performance. ++ . ++ This package contains the header files. ++ For documentation, see libvolk-doc. ++ ++Package: libvolk2-bin ++Section: libs ++Architecture: any ++Pre-Depends: ${misc:Pre-Depends} ++Depends: libvolk2.2 (=${binary:Version}), ++ ${misc:Depends}, ++ ${python3:Depends}, ++ ${shlibs:Depends} ++Breaks: libvolk1-bin, libvolk-bin, libvolk1.0-bin, gnuradio (<=3.7.2.1) ++Replaces: libvolk1-bin, libvolk-bin, libvolk1.0-bin, gnuradio (<=3.7.2.1) ++Description: vector optimized runtime tools ++ Vector-Optimized Library of Kernels is designed to help ++ applications work with the processor's SIMD instruction sets. These are ++ very powerful vector operations that can give signal processing a ++ huge boost in performance. ++ . ++ This package includes the volk_profile tool. ++ ++Package: libvolk2-doc ++Section: doc ++Architecture: all ++Multi-Arch: foreign ++Depends: ${misc:Depends} ++Recommends: lynx | www-browser ++Description: vector optimized library documentation ++ Vector-Optimized Library of Kernels is designed to help ++ applications work with the processor's SIMD instruction sets. These are ++ very powerful vector operations that can give signal processing a ++ huge boost in performance. ++ . ++ This package includes the Doxygen generated documentation in ++ /usr/share/doc/libvolk2-dev/html/index.html diff --cc debian/copyright index 0000000,0000000..0dc7d72 new file mode 100644 --- /dev/null +++ b/debian/copyright @@@ -1,0 -1,0 +1,187 @@@ ++Format: https://www.debian.org/doc/packaging-manuals/copyright-format/1.0/ ++Upstream-Name: volk ++Upstream-Contact: http://libvolk.org/ ++Source: ++ https://github.com/gnuradio/volk ++Comment: ++ Debian packages by A. 
Maitland Bottoms ++ git archive --format=tar --prefix=volk-2.1.0/ v2.1.0 | xz > ../volk_2.1.0.orig.tar.xz ++ . ++ Upstream Maintainers: ++ Johannes Demel ++ Michael Dickens ++Copyright: 2014-2019 Free Software Foundation, Inc. ++License: GPL-3+ ++ ++Files: * ++Copyright: 2006, 2009-2020, Free Software Foundation, Inc. ++License: GPL-3+ ++ ++Files: Doxyfile.in ++ DoxygenLayout.xml ++ volk.pc.in ++Copyright: 2014-2020 Free Software Foundation, Inc. ++License: GPL-3+ ++ ++Files: apps/volk_profile.h ++Copyright: 2014-2020 Free Software Foundation, Inc. ++License: GPL-3+ ++ ++Files: appveyor.yml ++Copyright: 2016 Paul Cercueil ++License: GPL-3+ ++ ++Files: cmake/* ++Copyright: 2014-2020 Free Software Foundation, Inc. ++License: GPL-3+ ++ ++Files: cmake/Modules/* ++Copyright: 2006, 2009-2020, Free Software Foundation, Inc. ++License: GPL-3+ ++ ++Files: cmake/Modules/CMakeParseArgumentsCopy.cmake ++Copyright: 2010 Alexander Neundorf ++License: Kitware-BSD ++ All rights reserved. ++ . ++ Redistribution and use in source and binary forms, with or without ++ modification, are permitted provided that the following conditions ++ are met: ++ . ++ * Redistributions of source code must retain the above copyright ++ notice, this list of conditions and the following disclaimer. ++ . ++ * Redistributions in binary form must reproduce the above copyright ++ notice, this list of conditions and the following disclaimer in the ++ documentation and/or other materials provided with the distribution. ++ . ++ * Neither the names of Kitware, Inc., the Insight Software Consortium, ++ nor the names of their contributors may be used to endorse or promote ++ products derived from this software without specific prior written ++ permission. ++ . ++ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ++ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT ++ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR ++ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT ++ HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, ++ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT ++ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, ++ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY ++ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ++ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ++ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ ++Files: cmake/Modules/FindORC.cmake ++ cmake/Modules/VolkConfig.cmake.in ++Copyright: 2014-2015 Free Software Foundation, Inc. ++License: GPL-3+ ++ ++Files: cmake/msvc/* ++Copyright: 2006-2008, Alexander Chemeris ++License: BSD-2-clause ++ Redistribution and use in source and binary forms, with or without ++ modification, are permitted provided that the following conditions are met: ++ . ++ 1. Redistributions of source code must retain the above copyright notice, ++ this list of conditions and the following disclaimer. ++ . ++ 2. Redistributions in binary form must reproduce the above copyright ++ notice, this list of conditions and the following disclaimer in the ++ documentation and/or other materials provided with the distribution. ++ . ++ 3. The name of the author may be used to endorse or promote products ++ derived from this software without specific prior written permission. ++ . 
++ THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED ++ WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF ++ MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO ++ EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, ++ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, ++ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; ++ OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, ++ WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR ++ OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ++ ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ ++Files: cmake/msvc/config.h ++Copyright: 2005, 2006 Apple Computer, Inc. ++License: LGPL-2+ ++ ++Files: cmake/msvc/stdbool.h ++Copyright: 2005, 2006, Apple Computer, Inc. ++License: LGPL-2+ ++ ++Files: debian/* ++Copyright: 2015-2020 Free Software Foundation, Inc ++License: GPL-3+ ++Comment: assigned by A. Maitland Bottoms ++ ++Files: debian/libvolk2-dev.abi.tar.gz.amd64 ++Copyright: 2019 Free Software Foundation, Inc ++License: GPL-3+ ++ ++Files: docs/* ++Copyright: 2014-2015 Free Software Foundation, Inc. ++License: GPL-3+ ++ ++Files: gen/archs.xml ++ gen/machines.xml ++Copyright: 2014-2015 Free Software Foundation, Inc. ++License: GPL-3+ ++ ++Files: include/volk/volk_common.h ++ include/volk/volk_complex.h ++ include/volk/volk_prefs.h ++Copyright: 2014-2015 Free Software Foundation, Inc. ++License: GPL-3+ ++ ++Files: kernels/volk/asm/* ++Copyright: 2014-2015 Free Software Foundation, Inc. ++License: GPL-3+ ++ ++Files: kernels/volk/volk_16u_byteswappuppet_16u.h ++ kernels/volk/volk_32u_byteswappuppet_32u.h ++ kernels/volk/volk_64u_byteswappuppet_64u.h ++Copyright: 2014-2015 Free Software Foundation, Inc. ++License: GPL-3+ ++ ++Files: lib/kernel_tests.h ++ lib/qa_utils.cc ++ lib/qa_utils.h ++ lib/volk_prefs.c ++Copyright: 2014-2015 Free Software Foundation, Inc. ++License: GPL-3+ ++ ++License: LGPL-2+ ++ This library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Library General Public ++ License as published by the Free Software Foundation; either ++ version 2 of the License, or (at your option) any later version. ++ . ++ This library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Library General Public License for more details. ++ . ++ You should have received a copy of the GNU Library General Public License ++ along with this library; see the file COPYING.LIB. If not, write to ++ the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, ++ Boston, MA 02110-1301, USA. ++ ++License: GPL-3+ ++ This program is free software: you can redistribute it and/or modify ++ it under the terms of the GNU General Public License as published by ++ the Free Software Foundation; either version 3 of the License, or ++ (at your option) any later version. ++ . ++ This program is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ GNU General Public License for more details. ++ . ++ You should have received a copy of the GNU General Public License ++ along with this program. If not, see . ++ . 
++ On Debian systems, the complete text of the GNU General ++ Public License version 3 can be found in "/usr/share/common-licenses/GPL-3". diff --cc debian/libvolk2-bin.install index 0000000,0000000..7221b71 new file mode 100644 --- /dev/null +++ b/debian/libvolk2-bin.install @@@ -1,0 -1,0 +1,2 @@@ ++usr/bin/volk* ++usr/lib/python3/dist-packages diff --cc debian/libvolk2-bin.manpages index 0000000,0000000..95bae9e new file mode 100644 --- /dev/null +++ b/debian/libvolk2-bin.manpages @@@ -1,0 -1,0 +1,3 @@@ ++debian/volk-config-info.1 ++debian/volk_modtool.1 ++debian/volk_profile.1 diff --cc debian/libvolk2-dev.abi.tar.gz.amd64 index 0000000,0000000..ff8acb1 new file mode 100644 Binary files differ diff --cc debian/libvolk2-dev.acc index 0000000,0000000..37f5a79 new file mode 100644 --- /dev/null +++ b/debian/libvolk2-dev.acc @@@ -1,0 -1,0 +1,50 @@@ ++ ++ ++ ++ ++ -DHAVE_CPUID_H ++ -DHAVE_DLFCN_H ++ -DHAVE_FENV_H ++ -DHAVE_POSIX_MEMALIGN ++ -DHAVE_XGETBV ++ -D_GLIBCXX_USE_CXX11_ABI=1 ++ -I/usr/include/orc-0.4 ++ -DNDEBUG ++ -std=gnu11 ++ -m64 ++ -mmmx ++ -msse ++ -msse2 ++ -msse3 ++ -mssse3 ++ -msse4.1 ++ -msse4.2 ++ -mpopcnt ++ -mavx ++ -mfma ++ -mavx2 ++ -mavx512f ++ -mavx512cd ++ -fPIC ++ -g ++ -O2 ++ -fstack-protector-strong ++ -Wformat ++ -Werror=format-security ++ -Wdate-time ++ -D_FORTIFY_SOURCE=2 ++ -fvisibility=hidden ++ -Wsign-compare ++ -Wall ++ -Wno-uninitialized ++ ++ ++ ++debian/libvolk2-dev/usr/include/volk/ ++ ++ ++ ++debian/libvolk2.0/usr/lib/ ++ ++ ++ diff --cc debian/libvolk2-dev.docs index 0000000,0000000..47699cc new file mode 100644 --- /dev/null +++ b/debian/libvolk2-dev.docs @@@ -1,0 -1,0 +1,3 @@@ ++debian/1.3_to_1.4_compat_report.html ++debian/1.4_to_2.0_compat_report.html ++debian/2.2.0_to_2.2.1_compat_report.html diff --cc debian/libvolk2-dev.install index 0000000,0000000..8b14c56 new file mode 100644 --- /dev/null +++ b/debian/libvolk2-dev.install @@@ -1,0 -1,0 +1,5 @@@ ++usr/include/* ++usr/lib/*/*volk.a ++usr/lib/*/*volk*so ++usr/lib/*/cmake/volk ++usr/lib/*/pkgconfig/*volk* diff --cc debian/libvolk2-doc.doc-base index 0000000,0000000..3d5fdc8 new file mode 100644 --- /dev/null +++ b/debian/libvolk2-doc.doc-base @@@ -1,0 -1,0 +1,19 @@@ ++Document: libvolk2-doc ++Title: Vector-Optimized Library of Kernels Reference Manual ++Author: GNU Radio Developers ++Abstract: VOLK is the Vector-Optimized Library of Kernels. ++ It is a library that contains kernels of hand-written SIMD code for ++ different mathematical operations. Since each SIMD architecture can ++ be very different and no compiler has yet come along to handle ++ vectorization properly or highly efficiently, VOLK approaches the ++ problem differently. For each architecture or platform that a ++ developer wishes to vectorize for, a new proto-kernel is added to ++ VOLK. At runtime, VOLK will select the correct proto-kernel. In this ++ way, the users of VOLK call a kernel for performing the operation ++ that is platform/architecture agnostic. This allows us to write ++ portable SIMD code. 
++Section: Programming/C++ ++ ++Format: HTML ++Index: /usr/share/doc/libvolk2-dev/html/index.html ++Files: /usr/share/doc/libvolk2-dev/html/*.html diff --cc debian/libvolk2-doc.docs index 0000000,0000000..87dd314 new file mode 100644 --- /dev/null +++ b/debian/libvolk2-doc.docs @@@ -1,0 -1,0 +1,1 @@@ ++obj-*/html diff --cc debian/libvolk2.2.install index 0000000,0000000..e4252f4 new file mode 100644 --- /dev/null +++ b/debian/libvolk2.2.install @@@ -1,0 -1,0 +1,1 @@@ ++usr/lib/*/libvolk.so.* diff --cc debian/patches/0001-volk-accurate-exp-kernel.patch index 0000000,0000000..53df58a new file mode 100644 --- /dev/null +++ b/debian/patches/0001-volk-accurate-exp-kernel.patch @@@ -1,0 -1,0 +1,333 @@@ ++From 9b5abaa62ce3b5d5379899d30afe1964eb63d86d Mon Sep 17 00:00:00 2001 ++From: Tom Rondeau ++Date: Tue, 7 Apr 2015 14:37:28 -0400 ++Subject: [PATCH 1/7] volk: accurate exp kernel. ++ ++A more accurate exp VOLK kernel than volk_32f_expfast_32f.Taken from ++code licensed with zlib. ++--- ++ kernels/volk/volk_32f_exp_32f.h | 298 ++++++++++++++++++++++++++++++++ ++ lib/kernel_tests.h | 2 + ++ 2 files changed, 300 insertions(+) ++ create mode 100644 kernels/volk/volk_32f_exp_32f.h ++ ++diff --git a/kernels/volk/volk_32f_exp_32f.h b/kernels/volk/volk_32f_exp_32f.h ++new file mode 100644 ++index 0000000..19c3d9d ++--- /dev/null +++++ b/kernels/volk/volk_32f_exp_32f.h ++@@ -0,0 +1,298 @@ +++/* -*- c++ -*- */ +++/* +++ * Copyright 2015-2020 Free Software Foundation, Inc. +++ * +++ * This file is part of GNU Radio +++ * +++ * GNU Radio is free software; you can redistribute it and/or modify +++ * it under the terms of the GNU General Public License as published by +++ * the Free Software Foundation; either version 3, or (at your option) +++ * any later version. +++ * +++ * GNU Radio is distributed in the hope that it will be useful, +++ * but WITHOUT ANY WARRANTY; without even the implied warranty of +++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +++ * GNU General Public License for more details. +++ * +++ * You should have received a copy of the GNU General Public License +++ * along with GNU Radio; see the file COPYING. If not, write to +++ * the Free Software Foundation, Inc., 51 Franklin Street, +++ * Boston, MA 02110-1301, USA. +++ */ +++ +++/* SIMD (SSE4) implementation of exp +++ Inspired by Intel Approximate Math library, and based on the +++ corresponding algorithms of the cephes math library +++*/ +++ +++/* Copyright (C) 2007 Julien Pommier +++ +++ This software is provided 'as-is', without any express or implied +++ warranty. In no event will the authors be held liable for any damages +++ arising from the use of this software. +++ +++ Permission is granted to anyone to use this software for any purpose, +++ including commercial applications, and to alter it and redistribute it +++ freely, subject to the following restrictions: +++ +++ 1. The origin of this software must not be misrepresented; you must not +++ claim that you wrote the original software. If you use this software +++ in a product, an acknowledgment in the product documentation would be +++ appreciated but is not required. +++ 2. Altered source versions must be plainly marked as such, and must not be +++ misrepresented as being the original software. +++ 3. This notice may not be removed or altered from any source distribution. +++ +++ (this is the zlib license) +++*/ +++ +++/*! 
+++ * \page volk_32f_exp_32f +++ * +++ * \b Overview +++ * +++ * Computes exponential of input vector and stores results in output vector. +++ * +++ * Dispatcher Prototype +++ * \code +++ * void volk_32f_exp_32f(float* bVector, const float* aVector, unsigned int num_points) +++ * \endcode +++ * +++ * \b Inputs +++ * \li aVector: The input vector of floats. +++ * \li num_points: The number of data points. +++ * +++ * \b Outputs +++ * \li bVector: The vector where results will be stored. +++ * +++ * \b Example +++ * \code +++ * int N = 10; +++ * unsigned int alignment = volk_get_alignment(); +++ * float* in = (float*)volk_malloc(sizeof(float)*N, alignment); +++ * float* out = (float*)volk_malloc(sizeof(float)*N, alignment); +++ * +++ * in[0] = 0; +++ * in[1] = 0.5; +++ * in[2] = std::sqrt(2.f)/2.f; +++ * in[3] = std::sqrt(3.f)/2.f; +++ * in[4] = in[5] = 1; +++ * for(unsigned int ii = 6; ii < N; ++ii){ +++ * in[ii] = - in[N-ii-1]; +++ * } +++ * +++ * volk_32f_exp_32f(out, in, N); +++ * +++ * for(unsigned int ii = 0; ii < N; ++ii){ +++ * printf("exp(%1.3f) = %1.3f\n", in[ii], out[ii]); +++ * } +++ * +++ * volk_free(in); +++ * volk_free(out); +++ * \endcode +++ */ +++ +++#include +++#include +++#include +++ +++#ifndef INCLUDED_volk_32f_exp_32f_a_H +++#define INCLUDED_volk_32f_exp_32f_a_H +++ +++#ifdef LV_HAVE_SSE4_1 +++#include +++ +++static inline void +++volk_32f_exp_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int num_points) +++{ +++ float* bPtr = bVector; +++ const float* aPtr = aVector; +++ +++ unsigned int number = 0; +++ unsigned int quarterPoints = num_points / 4; +++ +++ // Declare variables and constants +++ __m128 aVal, bVal, tmp, fx, mask, pow2n, z, y; +++ __m128 one, exp_hi, exp_lo, log2EF, half, exp_C1, exp_C2; +++ __m128 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5; +++ __m128i emm0, pi32_0x7f; +++ +++ one = _mm_set1_ps(1.0); +++ exp_hi = _mm_set1_ps(88.3762626647949); +++ exp_lo = _mm_set1_ps(-88.3762626647949); +++ log2EF = _mm_set1_ps(1.44269504088896341); +++ half = _mm_set1_ps(0.5); +++ exp_C1 = _mm_set1_ps(0.693359375); +++ exp_C2 = _mm_set1_ps(-2.12194440e-4); +++ pi32_0x7f = _mm_set1_epi32(0x7f); +++ +++ exp_p0 = _mm_set1_ps(1.9875691500e-4); +++ exp_p1 = _mm_set1_ps(1.3981999507e-3); +++ exp_p2 = _mm_set1_ps(8.3334519073e-3); +++ exp_p3 = _mm_set1_ps(4.1665795894e-2); +++ exp_p4 = _mm_set1_ps(1.6666665459e-1); +++ exp_p5 = _mm_set1_ps(5.0000001201e-1); +++ +++ for(;number < quarterPoints; number++) { +++ aVal = _mm_load_ps(aPtr); +++ tmp = _mm_setzero_ps(); +++ +++ aVal = _mm_max_ps(_mm_min_ps(aVal, exp_hi), exp_lo); +++ +++ /* express exp(x) as exp(g + n*log(2)) */ +++ fx = _mm_add_ps(_mm_mul_ps(aVal, log2EF), half); +++ +++ emm0 = _mm_cvttps_epi32(fx); +++ tmp = _mm_cvtepi32_ps(emm0); +++ +++ mask = _mm_and_ps(_mm_cmpgt_ps(tmp, fx), one); +++ fx = _mm_sub_ps(tmp, mask); +++ +++ tmp = _mm_mul_ps(fx, exp_C1); +++ z = _mm_mul_ps(fx, exp_C2); +++ aVal = _mm_sub_ps(_mm_sub_ps(aVal, tmp), z); +++ z = _mm_mul_ps(aVal, aVal); +++ +++ y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(exp_p0, aVal), exp_p1), aVal); +++ y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p2), aVal), exp_p3); +++ y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(y, aVal), exp_p4), aVal); +++ y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p5), z), aVal); +++ y = _mm_add_ps(y, one); +++ +++ emm0 = _mm_slli_epi32(_mm_add_epi32(_mm_cvttps_epi32(fx), pi32_0x7f), 23); +++ +++ pow2n = _mm_castsi128_ps(emm0); +++ bVal = _mm_mul_ps(y, pow2n); +++ +++ _mm_store_ps(bPtr, bVal); +++ aPtr += 4; +++ bPtr += 4; +++ } +++ +++ 
number = quarterPoints * 4; +++ for(;number < num_points; number++) { +++ *bPtr++ = expf(*aPtr++); +++ } +++} +++ +++#endif /* LV_HAVE_SSE4_1 for aligned */ +++ +++ +++#ifdef LV_HAVE_GENERIC +++ +++static inline void +++volk_32f_exp_32f_a_generic(float* bVector, const float* aVector, unsigned int num_points) +++{ +++ float* bPtr = bVector; +++ const float* aPtr = aVector; +++ unsigned int number = 0; +++ +++ for(number = 0; number < num_points; number++) { +++ *bPtr++ = expf(*aPtr++); +++ } +++} +++ +++#endif /* LV_HAVE_GENERIC */ +++ +++#endif /* INCLUDED_volk_32f_exp_32f_a_H */ +++ +++#ifndef INCLUDED_volk_32f_exp_32f_u_H +++#define INCLUDED_volk_32f_exp_32f_u_H +++ +++#ifdef LV_HAVE_SSE4_1 +++#include +++ +++static inline void +++volk_32f_exp_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int num_points) +++{ +++ float* bPtr = bVector; +++ const float* aPtr = aVector; +++ +++ unsigned int number = 0; +++ unsigned int quarterPoints = num_points / 4; +++ +++ // Declare variables and constants +++ __m128 aVal, bVal, tmp, fx, mask, pow2n, z, y; +++ __m128 one, exp_hi, exp_lo, log2EF, half, exp_C1, exp_C2; +++ __m128 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5; +++ __m128i emm0, pi32_0x7f; +++ +++ one = _mm_set1_ps(1.0); +++ exp_hi = _mm_set1_ps(88.3762626647949); +++ exp_lo = _mm_set1_ps(-88.3762626647949); +++ log2EF = _mm_set1_ps(1.44269504088896341); +++ half = _mm_set1_ps(0.5); +++ exp_C1 = _mm_set1_ps(0.693359375); +++ exp_C2 = _mm_set1_ps(-2.12194440e-4); +++ pi32_0x7f = _mm_set1_epi32(0x7f); +++ +++ exp_p0 = _mm_set1_ps(1.9875691500e-4); +++ exp_p1 = _mm_set1_ps(1.3981999507e-3); +++ exp_p2 = _mm_set1_ps(8.3334519073e-3); +++ exp_p3 = _mm_set1_ps(4.1665795894e-2); +++ exp_p4 = _mm_set1_ps(1.6666665459e-1); +++ exp_p5 = _mm_set1_ps(5.0000001201e-1); +++ +++ +++ for(;number < quarterPoints; number++) { +++ aVal = _mm_loadu_ps(aPtr); +++ tmp = _mm_setzero_ps(); +++ +++ aVal = _mm_max_ps(_mm_min_ps(aVal, exp_hi), exp_lo); +++ +++ /* express exp(x) as exp(g + n*log(2)) */ +++ fx = _mm_add_ps(_mm_mul_ps(aVal, log2EF), half); +++ +++ emm0 = _mm_cvttps_epi32(fx); +++ tmp = _mm_cvtepi32_ps(emm0); +++ +++ mask = _mm_and_ps(_mm_cmpgt_ps(tmp, fx), one); +++ fx = _mm_sub_ps(tmp, mask); +++ +++ tmp = _mm_mul_ps(fx, exp_C1); +++ z = _mm_mul_ps(fx, exp_C2); +++ aVal = _mm_sub_ps(_mm_sub_ps(aVal, tmp), z); +++ z = _mm_mul_ps(aVal, aVal); +++ +++ y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(exp_p0, aVal), exp_p1), aVal); +++ y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p2), aVal), exp_p3); +++ y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(y, aVal), exp_p4), aVal); +++ y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p5), z), aVal); +++ y = _mm_add_ps(y, one); +++ +++ emm0 = _mm_slli_epi32(_mm_add_epi32(_mm_cvttps_epi32(fx), pi32_0x7f), 23); +++ +++ pow2n = _mm_castsi128_ps(emm0); +++ bVal = _mm_mul_ps(y, pow2n); +++ +++ _mm_storeu_ps(bPtr, bVal); +++ aPtr += 4; +++ bPtr += 4; +++ } +++ +++ number = quarterPoints * 4; +++ for(;number < num_points; number++){ +++ *bPtr++ = expf(*aPtr++); +++ } +++} +++ +++#endif /* LV_HAVE_SSE4_1 for unaligned */ +++ +++ +++#ifdef LV_HAVE_GENERIC +++ +++static inline void +++volk_32f_exp_32f_u_generic(float* bVector, const float* aVector, unsigned int num_points) +++{ +++ float* bPtr = bVector; +++ const float* aPtr = aVector; +++ unsigned int number = 0; +++ +++ for(number = 0; number < num_points; number++){ +++ *bPtr++ = expf(*aPtr++); +++ } +++} +++ +++#endif /* LV_HAVE_GENERIC */ +++ +++#endif /* INCLUDED_volk_32f_exp_32f_u_H */ ++diff --git a/lib/kernel_tests.h 
b/lib/kernel_tests.h ++index c009c3f..8552488 100644 ++--- a/lib/kernel_tests.h +++++ b/lib/kernel_tests.h ++@@ -144,6 +144,8 @@ std::vector init_test_list(volk_test_params_t test_params) ++ QA(VOLK_INIT_TEST(volk_32fc_x2_s32fc_multiply_conjugate_add_32fc, test_params)) ++ QA(VOLK_INIT_PUPP(volk_8u_x3_encodepolarpuppet_8u, volk_8u_x3_encodepolar_8u_x2, test_params)) ++ QA(VOLK_INIT_PUPP(volk_32f_8u_polarbutterflypuppet_32f, volk_32f_8u_polarbutterfly_32f, test_params)) +++ QA(VOLK_INIT_TEST(volk_32f_exp_32f, test_params)) +++ ++ // no one uses these, so don't test them ++ //VOLK_PROFILE(volk_16i_x5_add_quad_16i_x4, 1e-4, 2046, 10000, &results, benchmark_mode, kernel_regex); ++ //VOLK_PROFILE(volk_16i_branch_4_state_8, 1e-4, 2046, 10000, &results, benchmark_mode, kernel_regex); ++-- ++2.20.1 ++ diff --cc debian/patches/0002-exp-Rename-SSE4.1-to-SSE2-kernel.patch index 0000000,0000000..94d3281 new file mode 100644 --- /dev/null +++ b/debian/patches/0002-exp-Rename-SSE4.1-to-SSE2-kernel.patch @@@ -1,0 -1,0 +1,66 @@@ ++From 52bfb2f049b534aca5b6d3e7475c9b2dd97c55a3 Mon Sep 17 00:00:00 2001 ++From: Johannes Demel ++Date: Tue, 17 Mar 2020 21:20:51 +0100 ++Subject: [PATCH 2/7] exp: Rename SSE4.1 to SSE2 kernel ++ ++The SSE kernel only requires SSE2 instructions. Thus, we can just use ++this instruction level. ++--- ++ kernels/volk/volk_32f_exp_32f.h | 16 ++++++++-------- ++ 1 file changed, 8 insertions(+), 8 deletions(-) ++ ++diff --git a/kernels/volk/volk_32f_exp_32f.h b/kernels/volk/volk_32f_exp_32f.h ++index 19c3d9d..26fdf02 100644 ++--- a/kernels/volk/volk_32f_exp_32f.h +++++ b/kernels/volk/volk_32f_exp_32f.h ++@@ -99,11 +99,11 @@ ++ #ifndef INCLUDED_volk_32f_exp_32f_a_H ++ #define INCLUDED_volk_32f_exp_32f_a_H ++ ++-#ifdef LV_HAVE_SSE4_1 ++-#include +++#ifdef LV_HAVE_SSE2 +++#include ++ ++ static inline void ++-volk_32f_exp_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int num_points) +++volk_32f_exp_32f_a_sse2(float* bVector, const float* aVector, unsigned int num_points) ++ { ++ float* bPtr = bVector; ++ const float* aPtr = aVector; ++@@ -175,7 +175,7 @@ volk_32f_exp_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int num ++ } ++ } ++ ++-#endif /* LV_HAVE_SSE4_1 for aligned */ +++#endif /* LV_HAVE_SSE2 for aligned */ ++ ++ ++ #ifdef LV_HAVE_GENERIC ++@@ -199,11 +199,11 @@ volk_32f_exp_32f_a_generic(float* bVector, const float* aVector, unsigned int nu ++ #ifndef INCLUDED_volk_32f_exp_32f_u_H ++ #define INCLUDED_volk_32f_exp_32f_u_H ++ ++-#ifdef LV_HAVE_SSE4_1 ++-#include +++#ifdef LV_HAVE_SSE2 +++#include ++ ++ static inline void ++-volk_32f_exp_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int num_points) +++volk_32f_exp_32f_u_sse2(float* bVector, const float* aVector, unsigned int num_points) ++ { ++ float* bPtr = bVector; ++ const float* aPtr = aVector; ++@@ -276,7 +276,7 @@ volk_32f_exp_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int num ++ } ++ } ++ ++-#endif /* LV_HAVE_SSE4_1 for unaligned */ +++#endif /* LV_HAVE_SSE2 for unaligned */ ++ ++ ++ #ifdef LV_HAVE_GENERIC ++-- ++2.20.1 ++ diff --cc debian/patches/0003-clang-format-Apply-clang-format.patch index 0000000,0000000..1873202 new file mode 100644 --- /dev/null +++ b/debian/patches/0003-clang-format-Apply-clang-format.patch @@@ -1,0 -1,0 +1,74061 @@@ ++From 092a59997a1e1d5f421a0a5f87ee655ad173b93f Mon Sep 17 00:00:00 2001 ++From: Johannes Demel ++Date: Sun, 23 Feb 2020 15:03:47 +0100 ++Subject: [PATCH 3/7] clang-format: Apply clang-format ++ ++This commit adds `.clang-format` 
from GNU Radio and apply clang-format. ++ ++Run: ++`find . -regex '.*\.\(c\|cc\|cpp\|cxx\|h\|hh\)' -exec clang-format \ ++-style=file -i {} \;` ++in `.`. ++--- ++ .clang-format | 106 ++ ++ apps/volk-config-info.cc | 77 +- ++ apps/volk_option_helpers.cc | 268 +-- ++ apps/volk_option_helpers.h | 84 +- ++ apps/volk_profile.cc | 205 ++- ++ apps/volk_profile.h | 20 +- ++ cmake/msvc/config.h | 27 +- ++ cmake/msvc/sys/time.h | 77 +- ++ include/volk/saturation_arithmetic.h | 16 +- ++ include/volk/volk_alloc.hh | 42 +- ++ include/volk/volk_avx2_intrinsics.h | 114 +- ++ include/volk/volk_avx_intrinsics.h | 193 +- ++ include/volk/volk_common.h | 148 +- ++ include/volk/volk_complex.h | 41 +- ++ include/volk/volk_malloc.h | 12 +- ++ include/volk/volk_neon_intrinsics.h | 115 +- ++ include/volk/volk_prefs.h | 17 +- ++ include/volk/volk_sse3_intrinsics.h | 79 +- ++ include/volk/volk_sse_intrinsics.h | 53 +- ++ kernels/volk/volk_16i_32fc_dot_prod_32fc.h | 1118 ++++++------ ++ kernels/volk/volk_16i_branch_4_state_8.h | 219 ++- ++ kernels/volk/volk_16i_convert_8i.h | 301 ++-- ++ kernels/volk/volk_16i_max_star_16i.h | 158 +- ++ .../volk/volk_16i_max_star_horizontal_16i.h | 214 +-- ++ .../volk/volk_16i_permute_and_scalar_add.h | 187 +- ++ kernels/volk/volk_16i_s32f_convert_32f.h | 609 +++---- ++ kernels/volk/volk_16i_x4_quad_max_star_16i.h | 357 ++-- ++ kernels/volk/volk_16i_x5_add_quad_16i_x4.h | 336 ++-- ++ kernels/volk/volk_16ic_convert_32fc.h | 241 +-- ++ kernels/volk/volk_16ic_deinterleave_16i_x2.h | 431 +++-- ++ .../volk/volk_16ic_deinterleave_real_16i.h | 397 +++-- ++ kernels/volk/volk_16ic_deinterleave_real_8i.h | 469 +++-- ++ kernels/volk/volk_16ic_magnitude_16i.h | 506 +++--- ++ .../volk/volk_16ic_s32f_deinterleave_32f_x2.h | 418 ++--- ++ .../volk_16ic_s32f_deinterleave_real_32f.h | 372 ++-- ++ kernels/volk/volk_16ic_s32f_magnitude_32f.h | 381 ++-- ++ kernels/volk/volk_16ic_x2_dot_prod_16ic.h | 750 ++++---- ++ kernels/volk/volk_16ic_x2_multiply_16ic.h | 504 ++++-- ++ kernels/volk/volk_16u_byteswap.h | 378 ++-- ++ kernels/volk/volk_16u_byteswappuppet_16u.h | 44 +- ++ kernels/volk/volk_32f_64f_add_64f.h | 270 +-- ++ kernels/volk/volk_32f_64f_multiply_64f.h | 154 +- ++ kernels/volk/volk_32f_8u_polarbutterfly_32f.h | 478 ++--- ++ .../volk_32f_8u_polarbutterflypuppet_32f.h | 155 +- ++ kernels/volk/volk_32f_accumulator_s32f.h | 287 +-- ++ kernels/volk/volk_32f_acos_32f.h | 700 ++++---- ++ kernels/volk/volk_32f_asin_32f.h | 647 +++---- ++ kernels/volk/volk_32f_atan_32f.h | 625 +++---- ++ kernels/volk/volk_32f_binary_slicer_32i.h | 259 +-- ++ kernels/volk/volk_32f_binary_slicer_8i.h | 706 ++++---- ++ kernels/volk/volk_32f_convert_64f.h | 214 ++- ++ kernels/volk/volk_32f_cos_32f.h | 1159 ++++++------ ++ kernels/volk/volk_32f_expfast_32f.h | 347 ++-- ++ kernels/volk/volk_32f_index_max_16u.h | 370 ++-- ++ kernels/volk/volk_32f_index_max_32u.h | 770 ++++---- ++ kernels/volk/volk_32f_invsqrt_32f.h | 189 +- ++ kernels/volk/volk_32f_log2_32f.h | 719 +++++--- ++ kernels/volk/volk_32f_null_32f.h | 16 +- ++ .../volk/volk_32f_s32f_32f_fm_detect_32f.h | 457 ++--- ++ ...k_32f_s32f_calc_spectral_noise_floor_32f.h | 683 +++---- ++ kernels/volk/volk_32f_s32f_convert_16i.h | 815 ++++----- ++ kernels/volk/volk_32f_s32f_convert_32i.h | 579 +++--- ++ kernels/volk/volk_32f_s32f_convert_8i.h | 642 +++---- ++ .../volk/volk_32f_s32f_mod_rangepuppet_32f.h | 63 +- ++ kernels/volk/volk_32f_s32f_multiply_32f.h | 271 +-- ++ kernels/volk/volk_32f_s32f_normalize.h | 150 +- ++ kernels/volk/volk_32f_s32f_power_32f.h | 166 +- ++ 
.../volk/volk_32f_s32f_s32f_mod_range_32f.h | 718 ++++---- ++ kernels/volk/volk_32f_s32f_stddev_32f.h | 449 ++--- ++ kernels/volk/volk_32f_sin_32f.h | 945 +++++----- ++ kernels/volk/volk_32f_sqrt_32f.h | 153 +- ++ .../volk/volk_32f_stddev_and_mean_32f_x2.h | 583 +++--- ++ kernels/volk/volk_32f_tan_32f.h | 1023 ++++++----- ++ kernels/volk/volk_32f_tanh_32f.h | 631 ++++--- ++ kernels/volk/volk_32f_x2_add_32f.h | 412 +++-- ++ kernels/volk/volk_32f_x2_divide_32f.h | 364 ++-- ++ kernels/volk/volk_32f_x2_dot_prod_16i.h | 1092 ++++++------ ++ kernels/volk/volk_32f_x2_dot_prod_32f.h | 1186 +++++++------ ++ .../volk/volk_32f_x2_fm_detectpuppet_32f.h | 40 +- ++ kernels/volk/volk_32f_x2_interleave_32fc.h | 292 +-- ++ kernels/volk/volk_32f_x2_max_32f.h | 345 ++-- ++ kernels/volk/volk_32f_x2_min_32f.h | 347 ++-- ++ kernels/volk/volk_32f_x2_multiply_32f.h | 375 ++-- ++ kernels/volk/volk_32f_x2_pow_32f.h | 1175 ++++++------ ++ .../volk/volk_32f_x2_s32f_interleave_16ic.h | 324 ++-- ++ kernels/volk/volk_32f_x2_subtract_32f.h | 319 ++-- ++ kernels/volk/volk_32f_x3_sum_of_poly_32f.h | 1026 +++++------ ++ kernels/volk/volk_32fc_32f_add_32fc.h | 281 +-- ++ kernels/volk/volk_32fc_32f_dot_prod_32fc.h | 1205 +++++++------ ++ kernels/volk/volk_32fc_32f_multiply_32fc.h | 226 +-- ++ kernels/volk/volk_32fc_conjugate_32fc.h | 233 +-- ++ kernels/volk/volk_32fc_convert_16ic.h | 439 ++--- ++ kernels/volk/volk_32fc_deinterleave_32f_x2.h | 297 ++-- ++ kernels/volk/volk_32fc_deinterleave_64f_x2.h | 439 ++--- ++ .../volk/volk_32fc_deinterleave_imag_32f.h | 210 +-- ++ .../volk/volk_32fc_deinterleave_real_32f.h | 214 +-- ++ .../volk/volk_32fc_deinterleave_real_64f.h | 262 +-- ++ kernels/volk/volk_32fc_index_max_16u.h | 639 +++---- ++ kernels/volk/volk_32fc_index_max_32u.h | 630 +++---- ++ kernels/volk/volk_32fc_magnitude_32f.h | 556 +++--- ++ .../volk/volk_32fc_magnitude_squared_32f.h | 443 ++--- ++ kernels/volk/volk_32fc_s32f_atan2_32f.h | 208 +-- ++ .../volk_32fc_s32f_deinterleave_real_16i.h | 226 +-- ++ kernels/volk/volk_32fc_s32f_magnitude_16i.h | 297 ++-- ++ kernels/volk/volk_32fc_s32f_power_32fc.h | 121 +- ++ .../volk/volk_32fc_s32f_power_spectrum_32f.h | 176 +- ++ ..._32fc_s32f_x2_power_spectral_density_32f.h | 297 ++-- ++ kernels/volk/volk_32fc_s32fc_multiply_32fc.h | 250 +-- ++ .../volk/volk_32fc_s32fc_rotatorpuppet_32fc.h | 118 +- ++ .../volk/volk_32fc_s32fc_x2_rotator_32fc.h | 260 +-- ++ kernels/volk/volk_32fc_x2_add_32fc.h | 274 +-- ++ .../volk_32fc_x2_conjugate_dot_prod_32fc.h | 1017 ++++++----- ++ kernels/volk/volk_32fc_x2_divide_32fc.h | 372 ++-- ++ kernels/volk/volk_32fc_x2_dot_prod_32fc.h | 1334 +++++++------- ++ kernels/volk/volk_32fc_x2_multiply_32fc.h | 575 +++--- ++ .../volk_32fc_x2_multiply_conjugate_32fc.h | 347 ++-- ++ ...32fc_x2_s32f_square_dist_scalar_mult_32f.h | 657 +++---- ++ ...2fc_x2_s32fc_multiply_conjugate_add_32fc.h | 98 +- ++ kernels/volk/volk_32fc_x2_square_dist_32f.h | 426 ++--- ++ kernels/volk/volk_32i_s32f_convert_32f.h | 347 ++-- ++ kernels/volk/volk_32i_x2_and_32i.h | 320 ++-- ++ kernels/volk/volk_32i_x2_or_32i.h | 321 ++-- ++ kernels/volk/volk_32u_byteswap.h | 433 ++--- ++ kernels/volk/volk_32u_byteswappuppet_32u.h | 44 +- ++ kernels/volk/volk_32u_popcnt.h | 26 +- ++ kernels/volk/volk_32u_popcntpuppet_32u.h | 18 +- ++ kernels/volk/volk_32u_reverse_32u.h | 598 ++++--- ++ kernels/volk/volk_64f_convert_32f.h | 324 ++-- ++ kernels/volk/volk_64f_x2_add_64f.h | 207 +-- ++ kernels/volk/volk_64f_x2_max_64f.h | 276 +-- ++ kernels/volk/volk_64f_x2_min_64f.h | 275 +-- ++ 
kernels/volk/volk_64f_x2_multiply_64f.h | 207 +-- ++ kernels/volk/volk_64u_byteswap.h | 599 ++++--- ++ kernels/volk/volk_64u_byteswappuppet_64u.h | 56 +- ++ kernels/volk/volk_64u_popcnt.h | 79 +- ++ kernels/volk/volk_64u_popcntpuppet_64u.h | 29 +- ++ kernels/volk/volk_8i_convert_16i.h | 315 ++-- ++ kernels/volk/volk_8i_s32f_convert_32f.h | 528 +++--- ++ kernels/volk/volk_8ic_deinterleave_16i_x2.h | 493 ++++-- ++ kernels/volk/volk_8ic_deinterleave_real_16i.h | 346 ++-- ++ kernels/volk/volk_8ic_deinterleave_real_8i.h | 482 +++-- ++ .../volk/volk_8ic_s32f_deinterleave_32f_x2.h | 571 +++--- ++ .../volk_8ic_s32f_deinterleave_real_32f.h | 395 +++-- ++ .../volk_8ic_x2_multiply_conjugate_16ic.h | 413 +++-- ++ ...volk_8ic_x2_s32f_multiply_conjugate_32fc.h | 496 +++--- ++ kernels/volk/volk_8u_conv_k7_r2puppet_8u.h | 494 +++--- ++ kernels/volk/volk_8u_x2_encodeframepolar_8u.h | 1569 +++++++++++------ ++ kernels/volk/volk_8u_x3_encodepolar_8u_x2.h | 110 +- ++ .../volk/volk_8u_x3_encodepolarpuppet_8u.h | 137 +- ++ kernels/volk/volk_8u_x4_conv_k7_r2_8u.h | 1067 +++++------ ++ lib/kernel_tests.h | 257 +-- ++ lib/qa_utils.cc | 751 +++++--- ++ lib/qa_utils.h | 288 +-- ++ lib/testqa.cc | 96 +- ++ lib/volk_malloc.c | 55 +- ++ lib/volk_prefs.c | 74 +- ++ lib/volk_rank_archs.c | 73 +- ++ lib/volk_rank_archs.h | 22 +- ++ 158 files changed, 32509 insertions(+), 27583 deletions(-) ++ create mode 100644 .clang-format ++ ++diff --git a/.clang-format b/.clang-format ++new file mode 100644 ++index 0000000..285b68d ++--- /dev/null +++++ b/.clang-format ++@@ -0,0 +1,106 @@ +++--- +++Language: Cpp +++# BasedOnStyle: LLVM +++AccessModifierOffset: -4 +++AlignAfterOpenBracket: Align +++AlignConsecutiveAssignments: false +++AlignConsecutiveDeclarations: false +++AlignEscapedNewlinesLeft: true +++AlignOperands: true +++AlignTrailingComments: true +++AllowAllParametersOfDeclarationOnNextLine: true +++AllowShortBlocksOnASingleLine: false +++AllowShortCaseLabelsOnASingleLine: false +++AllowShortFunctionsOnASingleLine: All +++AllowShortIfStatementsOnASingleLine: false +++AllowShortLoopsOnASingleLine: false +++AlwaysBreakAfterDefinitionReturnType: None +++AlwaysBreakAfterReturnType: None +++AlwaysBreakBeforeMultilineStrings: false +++AlwaysBreakTemplateDeclarations: true +++BinPackArguments: false +++BinPackParameters: false +++BreakBeforeBraces: Custom +++BraceWrapping: +++ AfterClass: true +++ AfterControlStatement: false +++ AfterEnum: false +++ AfterFunction: true +++ AfterNamespace: false +++ AfterObjCDeclaration: false +++ AfterStruct: false +++ AfterUnion: false +++ BeforeCatch: false +++ BeforeElse: false +++ IndentBraces: false +++BreakBeforeBinaryOperators: None +++BreakBeforeTernaryOperators: true +++BreakConstructorInitializersBeforeComma: false +++BreakAfterJavaFieldAnnotations: false +++BreakStringLiterals: true +++ColumnLimit: 90 +++CommentPragmas: '^ IWYU pragma:' +++ConstructorInitializerAllOnOneLineOrOnePerLine: true +++ConstructorInitializerIndentWidth: 4 +++ContinuationIndentWidth: 4 +++Cpp11BracedListStyle: false +++DerivePointerAlignment: false +++DisableFormat: false +++ExperimentalAutoDetectBinPacking: false +++ForEachMacros: +++ - foreach +++ - Q_FOREACH +++ - BOOST_FOREACH +++IncludeCategories: +++ - Regex: '^"(gnuradio)/' +++ Priority: 1 +++ - Regex: '^<(gnuradio)/' +++ Priority: 2 +++ - Regex: '^<(boost)/' +++ Priority: 98 +++ - Regex: '^<[a-z]*>$' +++ Priority: 99 +++ - Regex: '^".*"$' +++ Priority: 0 +++ - Regex: '.*' +++ Priority: 10 +++ +++IncludeIsMainRegex: '(Test)?$' +++IndentCaseLabels: false 
+++IndentWidth: 4 +++IndentWrappedFunctionNames: false +++JavaScriptQuotes: Leave +++JavaScriptWrapImports: true +++KeepEmptyLinesAtTheStartOfBlocks: true +++MacroBlockBegin: '' +++MacroBlockEnd: '' +++MaxEmptyLinesToKeep: 2 +++NamespaceIndentation: None +++ObjCBlockIndentWidth: 2 +++ObjCSpaceAfterProperty: false +++ObjCSpaceBeforeProtocolList: true +++PenaltyBreakBeforeFirstCallParameter: 19 +++PenaltyBreakComment: 300 +++PenaltyBreakFirstLessLess: 120 +++PenaltyBreakString: 1000 +++PenaltyExcessCharacter: 1000000 +++PenaltyReturnTypeOnItsOwnLine: 60 +++PointerAlignment: Left +++ReflowComments: true +++SortIncludes: true +++SpaceAfterCStyleCast: false +++SpaceAfterTemplateKeyword: true +++SpaceBeforeAssignmentOperators: true +++SpaceBeforeParens: ControlStatements +++SpaceInEmptyParentheses: false +++SpacesBeforeTrailingComments: 1 +++SpacesInAngles: false +++SpacesInContainerLiterals: true +++SpacesInCStyleCastParentheses: false +++SpacesInParentheses: false +++SpacesInSquareBrackets: false +++Standard: Cpp11 +++TabWidth: 8 +++UseTab: Never +++ +++ ++diff --git a/apps/volk-config-info.cc b/apps/volk-config-info.cc ++index 4eedcb7..2521993 100644 ++--- a/apps/volk-config-info.cc +++++ b/apps/volk-config-info.cc ++@@ -24,52 +24,63 @@ ++ #include ++ #endif ++ ++-#include // for volk_available_machines, volk_c_com... ++-#include // for operator<<, endl, cout, ostream ++-#include // for string +++#include // for volk_available_machines, volk_c_com... +++#include // for operator<<, endl, cout, ostream +++#include // for string ++ ++-#include "volk/volk.h" // for volk_get_alignment, volk_get_machine ++-#include "volk_option_helpers.h" // for option_list, option_t +++#include "volk/volk.h" // for volk_get_alignment, volk_get_machine +++#include "volk_option_helpers.h" // for option_list, option_t ++ ++ void print_alignment() ++ { ++- std::cout << "Alignment in bytes: " << volk_get_alignment() << std::endl; +++ std::cout << "Alignment in bytes: " << volk_get_alignment() << std::endl; ++ } ++ ++ void print_malloc() ++ { ++- // You don't want to change the volk_malloc code, so just copy the if/else ++- // structure from there and give an explanation for the implementations ++- std::cout << "Used malloc implementation: "; ++- #if HAVE_POSIX_MEMALIGN ++- std::cout << "posix_memalign" << std::endl; ++- #elif defined(_MSC_VER) ++- std::cout << "_aligned_malloc" << std::endl; ++- #else ++- std::cout << "C11 aligned_alloc" << std::endl; ++- #endif +++ // You don't want to change the volk_malloc code, so just copy the if/else +++ // structure from there and give an explanation for the implementations +++ std::cout << "Used malloc implementation: "; +++#if HAVE_POSIX_MEMALIGN +++ std::cout << "posix_memalign" << std::endl; +++#elif defined(_MSC_VER) +++ std::cout << "_aligned_malloc" << std::endl; +++#else +++ std::cout << "C11 aligned_alloc" << std::endl; +++#endif ++ } ++ ++ ++-int ++-main(int argc, char **argv) +++int main(int argc, char** argv) ++ { ++ ++- option_list our_options("volk-config-info"); ++- our_options.add(option_t("prefix", "", "print the VOLK installation prefix", volk_prefix())); ++- our_options.add(option_t("cc", "", "print the VOLK C compiler version", volk_c_compiler())); ++- our_options.add(option_t("cflags", "", "print the VOLK CFLAGS", volk_compiler_flags())); ++- our_options.add(option_t("all-machines", "", "print VOLK machines built", volk_available_machines())); ++- our_options.add(option_t("avail-machines", "", "print VOLK machines on the current " ++- "platform", 
volk_list_machines)); ++- our_options.add(option_t("machine", "", "print the current VOLK machine that will be used", ++- volk_get_machine())); ++- our_options.add(option_t("alignment", "", "print the memory alignment", print_alignment)); ++- our_options.add(option_t("malloc", "", "print the malloc implementation used in volk_malloc", ++- print_malloc)); ++- our_options.add(option_t("version", "v", "print the VOLK version", volk_version())); +++ option_list our_options("volk-config-info"); +++ our_options.add( +++ option_t("prefix", "", "print the VOLK installation prefix", volk_prefix())); +++ our_options.add( +++ option_t("cc", "", "print the VOLK C compiler version", volk_c_compiler())); +++ our_options.add( +++ option_t("cflags", "", "print the VOLK CFLAGS", volk_compiler_flags())); +++ our_options.add(option_t( +++ "all-machines", "", "print VOLK machines built", volk_available_machines())); +++ our_options.add(option_t("avail-machines", +++ "", +++ "print VOLK machines on the current " +++ "platform", +++ volk_list_machines)); +++ our_options.add(option_t("machine", +++ "", +++ "print the current VOLK machine that will be used", +++ volk_get_machine())); +++ our_options.add( +++ option_t("alignment", "", "print the memory alignment", print_alignment)); +++ our_options.add(option_t("malloc", +++ "", +++ "print the malloc implementation used in volk_malloc", +++ print_malloc)); +++ our_options.add(option_t("version", "v", "print the VOLK version", volk_version())); ++ ++- our_options.parse(argc, argv); +++ our_options.parse(argc, argv); ++ ++- return 0; +++ return 0; ++ } ++diff --git a/apps/volk_option_helpers.cc b/apps/volk_option_helpers.cc ++index 4299709..73d51da 100644 ++--- a/apps/volk_option_helpers.cc +++++ b/apps/volk_option_helpers.cc ++@@ -4,66 +4,97 @@ ++ ++ #include "volk_option_helpers.h" ++ ++-#include // for exception ++-#include // for operator<<, endl, basic_ostream, cout, ostream ++-#include // for pair ++-#include // IWYU pragma: keep ++-#include // IWYU pragma: keep ++-#include // IWYU pragma: keep +++#include // IWYU pragma: keep +++#include // IWYU pragma: keep +++#include // IWYU pragma: keep +++#include // for exception +++#include // for operator<<, endl, basic_ostream, cout, ostream +++#include // for pair ++ ++ /* ++ * Option type ++ */ ++-option_t::option_t(std::string longform, std::string shortform, std::string msg, void (*callback)()) ++- : longform("--" + longform), ++- shortform("-" + shortform), ++- msg(msg), ++- callback(callback) { option_type = VOID_CALLBACK; } ++- ++-option_t::option_t(std::string longform, std::string shortform, std::string msg, void (*callback)(int)) ++- : longform("--" + longform), ++- shortform("-" + shortform), ++- msg(msg), ++- callback((void (*)()) callback) { option_type = INT_CALLBACK; } ++- ++-option_t::option_t(std::string longform, std::string shortform, std::string msg, void (*callback)(float)) ++- : longform("--" + longform), ++- shortform("-" + shortform), ++- msg(msg), ++- callback((void (*)()) callback) { option_type = FLOAT_CALLBACK; } ++- ++-option_t::option_t(std::string longform, std::string shortform, std::string msg, void (*callback)(bool)) ++- : longform("--" + longform), ++- shortform("-" + shortform), ++- msg(msg), ++- callback((void (*)()) callback) { option_type = BOOL_CALLBACK; } ++- ++-option_t::option_t(std::string longform, std::string shortform, std::string msg, void (*callback)(std::string)) ++- : longform("--" + longform), ++- shortform("-" + shortform), ++- msg(msg), ++- callback((void (*)()) 
callback) { option_type = STRING_CALLBACK; } ++- ++-option_t::option_t(std::string longform, std::string shortform, std::string msg, std::string printval) ++- : longform("--" + longform), ++- shortform("-" + shortform), ++- msg(msg), ++- printval(printval) { option_type = STRING; } +++option_t::option_t(std::string longform, +++ std::string shortform, +++ std::string msg, +++ void (*callback)()) +++ : longform("--" + longform), shortform("-" + shortform), msg(msg), callback(callback) +++{ +++ option_type = VOID_CALLBACK; +++} +++ +++option_t::option_t(std::string longform, +++ std::string shortform, +++ std::string msg, +++ void (*callback)(int)) +++ : longform("--" + longform), +++ shortform("-" + shortform), +++ msg(msg), +++ callback((void (*)())callback) +++{ +++ option_type = INT_CALLBACK; +++} +++ +++option_t::option_t(std::string longform, +++ std::string shortform, +++ std::string msg, +++ void (*callback)(float)) +++ : longform("--" + longform), +++ shortform("-" + shortform), +++ msg(msg), +++ callback((void (*)())callback) +++{ +++ option_type = FLOAT_CALLBACK; +++} +++ +++option_t::option_t(std::string longform, +++ std::string shortform, +++ std::string msg, +++ void (*callback)(bool)) +++ : longform("--" + longform), +++ shortform("-" + shortform), +++ msg(msg), +++ callback((void (*)())callback) +++{ +++ option_type = BOOL_CALLBACK; +++} +++ +++option_t::option_t(std::string longform, +++ std::string shortform, +++ std::string msg, +++ void (*callback)(std::string)) +++ : longform("--" + longform), +++ shortform("-" + shortform), +++ msg(msg), +++ callback((void (*)())callback) +++{ +++ option_type = STRING_CALLBACK; +++} +++ +++option_t::option_t(std::string longform, +++ std::string shortform, +++ std::string msg, +++ std::string printval) +++ : longform("--" + longform), shortform("-" + shortform), msg(msg), printval(printval) +++{ +++ option_type = STRING; +++} ++ ++ ++ /* ++ * Option List ++ */ ++ ++-option_list::option_list(std::string program_name) : ++- program_name(program_name) { +++option_list::option_list(std::string program_name) : program_name(program_name) +++{ ++ internal_list = std::vector(); ++ } ++ ++ ++ void option_list::add(option_t opt) { internal_list.push_back(opt); } ++ ++-void option_list::parse(int argc, char **argv) { +++void option_list::parse(int argc, char** argv) +++{ ++ for (int arg_number = 0; arg_number < argc; ++arg_number) { ++ for (std::vector::iterator this_option = internal_list.begin(); ++ this_option != internal_list.end(); ++@@ -73,74 +104,83 @@ void option_list::parse(int argc, char **argv) { ++ this_option->shortform == std::string(argv[arg_number])) { ++ ++ if (present_options.count(this_option->longform) == 0) { ++- present_options.insert(std::pair(this_option->longform, 1)); +++ present_options.insert( +++ std::pair(this_option->longform, 1)); ++ } else { ++ present_options[this_option->longform] += 1; ++ } ++ switch (this_option->option_type) { ++- case VOID_CALLBACK: ++- this_option->callback(); ++- break; ++- case INT_CALLBACK: ++- try { ++- int_val = atoi(argv[++arg_number]); ++- ((void (*)(int)) this_option->callback)(int_val); ++- } catch (std::exception &exc) { ++- std::cout << "An int option can only receive a number" << std::endl; ++- throw std::exception(); ++- }; ++- break; ++- case FLOAT_CALLBACK: ++- try { ++- double double_val = atof(argv[++arg_number]); ++- ((void (*)(float)) this_option->callback)(double_val); ++- } catch (std::exception &exc) { ++- std::cout << "A float option can only receive a number" << 
std::endl; ++- throw std::exception(); ++- }; ++- break; ++- case BOOL_CALLBACK: ++- try { ++- if (arg_number == (argc - 1)) { // this is the last arg +++ case VOID_CALLBACK: +++ this_option->callback(); +++ break; +++ case INT_CALLBACK: +++ try { +++ int_val = atoi(argv[++arg_number]); +++ ((void (*)(int))this_option->callback)(int_val); +++ } catch (std::exception& exc) { +++ std::cout << "An int option can only receive a number" +++ << std::endl; +++ throw std::exception(); +++ }; +++ break; +++ case FLOAT_CALLBACK: +++ try { +++ double double_val = atof(argv[++arg_number]); +++ ((void (*)(float))this_option->callback)(double_val); +++ } catch (std::exception& exc) { +++ std::cout << "A float option can only receive a number" +++ << std::endl; +++ throw std::exception(); +++ }; +++ break; +++ case BOOL_CALLBACK: +++ try { +++ if (arg_number == (argc - 1)) { // this is the last arg +++ int_val = 1; +++ } else { // sneak a look at the next arg since it's present +++ char* next_arg = argv[arg_number + 1]; +++ if ((strncmp(next_arg, "-", 1) == 0) || +++ (strncmp(next_arg, "--", 2) == 0)) { +++ // the next arg is actually an arg, the bool is just +++ // present, set to true +++ int_val = 1; +++ } else if (strncmp(next_arg, "true", 4) == 0) { ++ int_val = 1; ++- } else { // sneak a look at the next arg since it's present ++- char *next_arg = argv[arg_number + 1]; ++- if ((strncmp(next_arg, "-", 1) == 0) || (strncmp(next_arg, "--", 2) == 0)) { ++- // the next arg is actually an arg, the bool is just present, set to true ++- int_val = 1; ++- } else if (strncmp(next_arg, "true", 4) == 0) { ++- int_val = 1; ++- } else if (strncmp(next_arg, "false", 5) == 0) { ++- int_val = 0; ++- } else { ++- // we got a number or a string. ++- // convert it to a number and depend on the catch to report an error condition ++- int_val = (bool) atoi(argv[++arg_number]); ++- } +++ } else if (strncmp(next_arg, "false", 5) == 0) { +++ int_val = 0; +++ } else { +++ // we got a number or a string. +++ // convert it to a number and depend on the catch to +++ // report an error condition +++ int_val = (bool)atoi(argv[++arg_number]); ++ } ++- } catch (std::exception &e) { ++- int_val = INT_MIN; ++- }; ++- if (int_val == INT_MIN) { ++- std::cout << "option: '" << argv[arg_number - 1] << "' -> received an unknown value. Boolean " ++- "options should receive one of '0', '1', 'true', 'false'." << std::endl; ++- throw std::exception(); ++- } else if (int_val) { ++- ((void (*)(bool)) this_option->callback)(int_val); ++ } ++- break; ++- case STRING_CALLBACK: ++- try { ++- ((void (*)(std::string)) this_option->callback)(argv[++arg_number]); ++- } catch (std::exception &exc) { ++- throw std::exception(); ++- }; ++- case STRING: ++- std::cout << this_option->printval << std::endl; ++- break; +++ } catch (std::exception& e) { +++ int_val = INT_MIN; +++ }; +++ if (int_val == INT_MIN) { +++ std::cout +++ << "option: '" << argv[arg_number - 1] +++ << "' -> received an unknown value. Boolean " +++ "options should receive one of '0', '1', 'true', 'false'." 
+++ << std::endl; +++ throw std::exception(); +++ } else if (int_val) { +++ ((void (*)(bool))this_option->callback)(int_val); +++ } +++ break; +++ case STRING_CALLBACK: +++ try { +++ ((void (*)(std::string))this_option->callback)( +++ argv[++arg_number]); +++ } catch (std::exception& exc) { +++ throw std::exception(); +++ }; +++ case STRING: +++ std::cout << this_option->printval << std::endl; +++ break; ++ } ++ } ++- ++ } ++ if (std::string("--help") == std::string(argv[arg_number]) || ++ std::string("-h") == std::string(argv[arg_number])) { ++@@ -150,7 +190,8 @@ void option_list::parse(int argc, char **argv) { ++ } ++ } ++ ++-bool option_list::present(std::string option_name) { +++bool option_list::present(std::string option_name) +++{ ++ if (present_options.count("--" + option_name)) { ++ return true; ++ } else { ++@@ -158,7 +199,8 @@ bool option_list::present(std::string option_name) { ++ } ++ } ++ ++-void option_list::help() { +++void option_list::help() +++{ ++ std::cout << program_name << std::endl; ++ std::cout << " -h [ --help ] \t\tdisplay this help message" << std::endl; ++ for (std::vector::iterator this_option = internal_list.begin(); ++@@ -172,14 +214,14 @@ void option_list::help() { ++ } ++ ++ switch (help_line.size() / 8) { ++- case 0: ++- help_line += "\t"; ++- case 1: ++- help_line += "\t"; ++- case 2: ++- help_line += "\t"; ++- case 3: ++- help_line += "\t"; +++ case 0: +++ help_line += "\t"; +++ case 1: +++ help_line += "\t"; +++ case 2: +++ help_line += "\t"; +++ case 3: +++ help_line += "\t"; ++ } ++ help_line += this_option->msg; ++ std::cout << help_line << std::endl; ++diff --git a/apps/volk_option_helpers.h b/apps/volk_option_helpers.h ++index 8a71547..0756caf 100644 ++--- a/apps/volk_option_helpers.h +++++ b/apps/volk_option_helpers.h ++@@ -5,56 +5,74 @@ ++ #ifndef VOLK_VOLK_OPTION_HELPERS_H ++ #define VOLK_VOLK_OPTION_HELPERS_H ++ ++-#include ++-#include ++ #include ++-#include +++#include ++ #include +++#include +++#include ++ ++-typedef enum ++-{ ++- VOID_CALLBACK, +++typedef enum { +++ VOID_CALLBACK, ++ INT_CALLBACK, ++ BOOL_CALLBACK, ++ STRING_CALLBACK, ++ FLOAT_CALLBACK, ++- STRING, +++ STRING, ++ } VOLK_OPTYPE; ++ ++-class option_t { ++- public: ++- option_t(std::string longform, std::string shortform, std::string msg, void (*callback)()); ++- option_t(std::string longform, std::string shortform, std::string msg, void (*callback)(int)); ++- option_t(std::string longform, std::string shortform, std::string msg, void (*callback)(float)); ++- option_t(std::string longform, std::string shortform, std::string msg, void (*callback)(bool)); ++- option_t(std::string longform, std::string shortform, std::string msg, void (*callback)(std::string)); ++- option_t(std::string longform, std::string shortform, std::string msg, std::string printval); ++- ++- std::string longform; ++- std::string shortform; ++- std::string msg; ++- VOLK_OPTYPE option_type; ++- std::string printval; ++- void (*callback)(); +++class option_t +++{ +++public: +++ option_t(std::string longform, +++ std::string shortform, +++ std::string msg, +++ void (*callback)()); +++ option_t(std::string longform, +++ std::string shortform, +++ std::string msg, +++ void (*callback)(int)); +++ option_t(std::string longform, +++ std::string shortform, +++ std::string msg, +++ void (*callback)(float)); +++ option_t(std::string longform, +++ std::string shortform, +++ std::string msg, +++ void (*callback)(bool)); +++ option_t(std::string longform, +++ std::string shortform, +++ std::string msg, +++ void 
(*callback)(std::string)); +++ option_t(std::string longform, +++ std::string shortform, +++ std::string msg, +++ std::string printval); ++ +++ std::string longform; +++ std::string shortform; +++ std::string msg; +++ VOLK_OPTYPE option_type; +++ std::string printval; +++ void (*callback)(); ++ }; ++ ++ class option_list ++ { ++- public: ++- option_list(std::string program_name); ++- bool present(std::string option_name); +++public: +++ option_list(std::string program_name); +++ bool present(std::string option_name); +++ +++ void add(option_t opt); ++ ++- void add(option_t opt); +++ void parse(int argc, char** argv); ++ ++- void parse(int argc, char **argv); +++ void help(); ++ ++- void help(); ++- private: ++- std::string program_name; ++- std::vector internal_list; ++- std::map present_options; +++private: +++ std::string program_name; +++ std::vector internal_list; +++ std::map present_options; ++ }; ++ ++ ++-#endif //VOLK_VOLK_OPTION_HELPERS_H +++#endif // VOLK_VOLK_OPTION_HELPERS_H ++diff --git a/apps/volk_profile.cc b/apps/volk_profile.cc ++index 4ef5aeb..3c2e324 100644 ++--- a/apps/volk_profile.cc +++++ b/apps/volk_profile.cc ++@@ -27,23 +27,23 @@ ++ #include ++ #endif ++ #else ++-#include // for create_directories, exists ++-#include // for path, operator<< ++-#include // for filesystem +++#include // for create_directories, exists +++#include // for path, operator<< +++#include // for filesystem ++ #endif ++-#include // for size_t ++-#include // for stat ++-#include // for volk_get_config_path ++-#include // for operator<<, basic_ostream ++-#include // IWYU pragma: keep ++-#include // for map, map<>::iterator ++-#include // for pair ++-#include // for vector, vector<>::const_... ++- ++-#include "kernel_tests.h" // for init_test_list ++-#include "qa_utils.h" // for volk_test_results_t, vol... ++-#include "volk/volk_complex.h" // for lv_32fc_t ++-#include "volk_option_helpers.h" // for option_list, option_t +++#include // for size_t +++#include // for stat +++#include // for volk_get_config_path +++#include // IWYU pragma: keep +++#include // for operator<<, basic_ostream +++#include // for map, map<>::iterator +++#include // for pair +++#include // for vector, vector<>::const_... +++ +++#include "kernel_tests.h" // for init_test_list +++#include "qa_utils.h" // for volk_test_results_t, vol... +++#include "volk/volk_complex.h" // for lv_32fc_t +++#include "volk_option_helpers.h" // for option_list, option_t ++ #include "volk_profile.h" ++ ++ #if HAS_STD_FILESYSTEM ++@@ -72,45 +72,61 @@ void set_json(std::string val) { json_filename = val; } ++ std::string volk_config_path(""); ++ void set_volk_config(std::string val) { volk_config_path = val; } ++ ++-int main(int argc, char *argv[]) { +++int main(int argc, char* argv[]) +++{ ++ ++ option_list profile_options("volk_profile"); ++- profile_options.add(option_t("benchmark", "b", "Run all kernels (benchmark mode)", set_benchmark)); ++- profile_options.add(option_t("tol", "t", "Set the default tolerance for all tests", set_tolerance)); ++- profile_options.add(option_t("vlen", "v", "Set the default vector length for tests", set_vlen)); ++- profile_options.add((option_t("iter", "i", "Set the default number of test iterations per kernel", set_iter))); ++- profile_options.add((option_t("tests-substr", "R", "Run tests matching substring", set_substr))); ++- profile_options.add((option_t("update", "u", "Run only kernels missing from config", set_update))); ++- profile_options.add((option_t("dry-run", "n", "Dry run. 
Respect other options, but don't write to file", set_dryrun))); ++- profile_options.add((option_t("json", "j", "Write results to JSON file named as argument value", set_json))); ++- profile_options.add((option_t("path", "p", "Specify the volk_config path", set_volk_config))); +++ profile_options.add( +++ option_t("benchmark", "b", "Run all kernels (benchmark mode)", set_benchmark)); +++ profile_options.add( +++ option_t("tol", "t", "Set the default tolerance for all tests", set_tolerance)); +++ profile_options.add( +++ option_t("vlen", "v", "Set the default vector length for tests", set_vlen)); +++ profile_options.add((option_t( +++ "iter", "i", "Set the default number of test iterations per kernel", set_iter))); +++ profile_options.add( +++ (option_t("tests-substr", "R", "Run tests matching substring", set_substr))); +++ profile_options.add( +++ (option_t("update", "u", "Run only kernels missing from config", set_update))); +++ profile_options.add( +++ (option_t("dry-run", +++ "n", +++ "Dry run. Respect other options, but don't write to file", +++ set_dryrun))); +++ profile_options.add((option_t( +++ "json", "j", "Write results to JSON file named as argument value", set_json))); +++ profile_options.add( +++ (option_t("path", "p", "Specify the volk_config path", set_volk_config))); ++ profile_options.parse(argc, argv); ++ ++ if (profile_options.present("help")) { ++ return 0; ++ } ++ ++- if(dry_run) { ++- std::cout << "Warning: this IS a dry-run. Config will not be written!" << std::endl; +++ if (dry_run) { +++ std::cout << "Warning: this IS a dry-run. Config will not be written!" +++ << std::endl; ++ } ++ ++ // Adding program options ++ std::ofstream json_file; ++ std::string config_file; ++ ++- if ( json_filename != "" ) { ++- json_file.open( json_filename.c_str() ); +++ if (json_filename != "") { +++ json_file.open(json_filename.c_str()); ++ } ++ ++- if ( volk_config_path != "" ) { +++ if (volk_config_path != "") { ++ config_file = volk_config_path + "/volk_config"; ++ } ++ ++ // Run tests ++ std::vector results; ++- if(update_mode) { ++- if( config_file != "" ) read_results(&results, config_file); ++- else read_results(&results); +++ if (update_mode) { +++ if (config_file != "") +++ read_results(&results, config_file); +++ else +++ read_results(&results); ++ } ++ ++ // Initialize the list of tests ++@@ -118,22 +134,22 @@ int main(int argc, char *argv[]) { ++ ++ // Iterate through list of tests running each one ++ std::string substr_to_match(test_params.kernel_regex()); ++- for(unsigned int ii = 0; ii < test_cases.size(); ++ii) { +++ for (unsigned int ii = 0; ii < test_cases.size(); ++ii) { ++ bool regex_match = true; ++ ++ volk_test_case_t test_case = test_cases[ii]; ++ // if the kernel name matches regex then do the test ++ std::string test_case_name = test_case.name(); ++- if(test_case_name.find(substr_to_match) == std::string::npos) { +++ if (test_case_name.find(substr_to_match) == std::string::npos) { ++ regex_match = false; ++ } ++ ++ // if we are in update mode check if we've already got results ++ // if we have any, then no need to test that kernel ++ bool update = true; ++- if(update_mode) { ++- for(unsigned int jj=0; jj < results.size(); ++jj) { ++- if(results[jj].name == test_case.name() || +++ if (update_mode) { +++ for (unsigned int jj = 0; jj < results.size(); ++jj) { +++ if (results[jj].name == test_case.name() || ++ results[jj].name == test_case.puppet_master_name()) { ++ update = false; ++ break; ++@@ -141,39 +157,44 @@ int main(int argc, char *argv[]) { ++ } ++ } ++ 
++- if( regex_match && update ) { +++ if (regex_match && update) { ++ try { ++- run_volk_tests(test_case.desc(), test_case.kernel_ptr(), test_case.name(), ++- test_case.test_parameters(), &results, test_case.puppet_master_name()); ++- } ++- catch (std::string &error) { ++- std::cerr << "Caught Exception in 'run_volk_tests': " << error << std::endl; +++ run_volk_tests(test_case.desc(), +++ test_case.kernel_ptr(), +++ test_case.name(), +++ test_case.test_parameters(), +++ &results, +++ test_case.puppet_master_name()); +++ } catch (std::string& error) { +++ std::cerr << "Caught Exception in 'run_volk_tests': " << error +++ << std::endl; ++ } ++ } ++ } ++ ++ ++ // Output results according to provided options ++- if(json_filename != "") { +++ if (json_filename != "") { ++ write_json(json_file, results); ++ json_file.close(); ++ } ++ ++- if(!dry_run) { ++- if(config_file != "") write_results(&results, false, config_file); ++- else write_results(&results, false); ++- } ++- else { +++ if (!dry_run) { +++ if (config_file != "") +++ write_results(&results, false, config_file); +++ else +++ write_results(&results, false); +++ } else { ++ std::cout << "Warning: this was a dry-run. Config not generated" << std::endl; ++ } ++ return 0; ++ } ++ ++-void read_results(std::vector *results) +++void read_results(std::vector* results) ++ { ++ char path[1024]; ++ volk_get_config_path(path, true); ++- if(path[0] == 0){ +++ if (path[0] == 0) { ++ std::cout << "No prior test results found ..." << std::endl; ++ return; ++ } ++@@ -181,16 +202,16 @@ void read_results(std::vector *results) ++ read_results(results, std::string(path)); ++ } ++ ++-void read_results(std::vector *results, std::string path) +++void read_results(std::vector* results, std::string path) ++ { ++ struct stat buffer; ++- bool config_status = (stat (path.c_str(), &buffer) == 0); +++ bool config_status = (stat(path.c_str(), &buffer) == 0); ++ ++- if( config_status ) { +++ if (config_status) { ++ // a config exists and we are reading results from it ++ std::ifstream config(path.c_str()); ++ char config_line[256]; ++- while(config.getline(config_line, 255)) { +++ while (config.getline(config_line, 255)) { ++ // tokenize the input line by kernel_name unaligned aligned ++ // then push back in the results vector with fields filled in ++ ++@@ -198,26 +219,26 @@ void read_results(std::vector *results, std::string path) ++ std::string config_str(config_line); ++ std::size_t str_size = config_str.size(); ++ std::size_t found = config_str.find(' '); ++- +++ ++ // Split line by spaces ++- while(found && found < str_size) { +++ while (found && found < str_size) { ++ found = config_str.find(' '); ++ // kernel names MUST be less than 128 chars, which is ++ // a length restricted by volk/volk_prefs.c ++ // on the last token in the parsed string we won't find a space ++ // so make sure we copy at most 128 chars. 
++- if(found > 127) { +++ if (found > 127) { ++ found = 127; ++ } ++ str_size = config_str.size(); ++- char buffer[128] = {'\0'}; +++ char buffer[128] = { '\0' }; ++ config_str.copy(buffer, found + 1, 0); ++ buffer[found] = '\0'; ++ single_kernel_result.push_back(std::string(buffer)); ++- config_str.erase(0, found+1); +++ config_str.erase(0, found + 1); ++ } ++ ++- if(single_kernel_result.size() == 3) { +++ if (single_kernel_result.size() == 3) { ++ volk_test_results_t kernel_result; ++ kernel_result.name = std::string(single_kernel_result[0]); ++ kernel_result.config_name = std::string(single_kernel_result[0]); ++@@ -229,45 +250,47 @@ void read_results(std::vector *results, std::string path) ++ } ++ } ++ ++-void write_results(const std::vector *results, bool update_result) +++void write_results(const std::vector* results, bool update_result) ++ { ++ char path[1024]; ++ volk_get_config_path(path, false); ++- if(path[0] == 0){ +++ if (path[0] == 0) { ++ std::cout << "Aborting 'No config save path found' ..." << std::endl; ++ return; ++ } ++ ++- write_results( results, update_result, std::string(path)); +++ write_results(results, update_result, std::string(path)); ++ } ++ ++-void write_results(const std::vector *results, bool update_result, const std::string path) +++void write_results(const std::vector* results, +++ bool update_result, +++ const std::string path) ++ { ++-// struct stat buffer; ++-// bool config_status = (stat (path.c_str(), &buffer) == 0); +++ // struct stat buffer; +++ // bool config_status = (stat (path.c_str(), &buffer) == 0); ++ ++ /* ++ * These ++ */ ++ const fs::path config_path(path); ++- if (! fs::exists(config_path.parent_path())) ++- { +++ if (!fs::exists(config_path.parent_path())) { ++ std::cout << "Creating " << config_path.parent_path() << "..." << std::endl; ++ fs::create_directories(config_path.parent_path()); ++ } ++ ++ std::ofstream config; ++- if(update_result) { +++ if (update_result) { ++ std::cout << "Updating " << path << "..." << std::endl; ++ config.open(path.c_str(), std::ofstream::app); ++- if (!config.is_open()) { //either we don't have write access or we don't have the dir yet +++ if (!config.is_open()) { // either we don't have write access or we don't have the +++ // dir yet ++ std::cout << "Error opening file " << path << std::endl; ++ } ++- } ++- else { +++ } else { ++ std::cout << "Writing " << path << "..." 
<< std::endl; ++ config.open(path.c_str()); ++- if (!config.is_open()) { //either we don't have write access or we don't have the dir yet +++ if (!config.is_open()) { // either we don't have write access or we don't have the +++ // dir yet ++ std::cout << "Error opening file " << path << std::endl; ++ } ++ ++@@ -278,43 +301,45 @@ void write_results(const std::vector *results, bool update_ ++ } ++ ++ std::vector::const_iterator profile_results; ++- for(profile_results = results->begin(); profile_results != results->end(); ++profile_results) { ++- config << profile_results->config_name << " " ++- << profile_results->best_arch_a << " " ++- << profile_results->best_arch_u << std::endl; +++ for (profile_results = results->begin(); profile_results != results->end(); +++ ++profile_results) { +++ config << profile_results->config_name << " " << profile_results->best_arch_a +++ << " " << profile_results->best_arch_u << std::endl; ++ } ++ config.close(); ++ } ++ ++-void write_json(std::ofstream &json_file, std::vector results) +++void write_json(std::ofstream& json_file, std::vector results) ++ { ++ json_file << "{" << std::endl; ++ json_file << " \"volk_tests\": [" << std::endl; ++ size_t len = results.size(); ++ size_t i = 0; ++ std::vector::iterator result; ++- for(result = results.begin(); result != results.end(); ++result) { +++ for (result = results.begin(); result != results.end(); ++result) { ++ json_file << " {" << std::endl; ++ json_file << " \"name\": \"" << result->name << "\"," << std::endl; ++ json_file << " \"vlen\": " << (int)(result->vlen) << "," << std::endl; ++ json_file << " \"iter\": " << result->iter << "," << std::endl; ++- json_file << " \"best_arch_a\": \"" << result->best_arch_a ++- << "\"," << std::endl; ++- json_file << " \"best_arch_u\": \"" << result->best_arch_u ++- << "\"," << std::endl; +++ json_file << " \"best_arch_a\": \"" << result->best_arch_a << "\"," +++ << std::endl; +++ json_file << " \"best_arch_u\": \"" << result->best_arch_u << "\"," +++ << std::endl; ++ json_file << " \"results\": {" << std::endl; ++ size_t results_len = result->results.size(); ++ size_t ri = 0; ++ ++ std::map::iterator kernel_time_pair; ++- for(kernel_time_pair = result->results.begin(); kernel_time_pair != result->results.end(); ++kernel_time_pair) { +++ for (kernel_time_pair = result->results.begin(); +++ kernel_time_pair != result->results.end(); +++ ++kernel_time_pair) { ++ volk_test_time_t time = kernel_time_pair->second; ++ json_file << " \"" << time.name << "\": {" << std::endl; ++ json_file << " \"name\": \"" << time.name << "\"," << std::endl; ++ json_file << " \"time\": " << time.time << "," << std::endl; ++ json_file << " \"units\": \"" << time.units << "\"" << std::endl; ++- json_file << " }" ; ++- if(ri+1 != results_len) { +++ json_file << " }"; +++ if (ri + 1 != results_len) { ++ json_file << ","; ++ } ++ json_file << std::endl; ++@@ -322,7 +347,7 @@ void write_json(std::ofstream &json_file, std::vector resul ++ } ++ json_file << " }" << std::endl; ++ json_file << " }"; ++- if(i+1 != len) { +++ if (i + 1 != len) { ++ json_file << ","; ++ } ++ json_file << std::endl; ++diff --git a/apps/volk_profile.h b/apps/volk_profile.h ++index 51629ab..ae3b474 100644 ++--- a/apps/volk_profile.h +++++ b/apps/volk_profile.h ++@@ -1,14 +1,16 @@ ++ ++ ++-#include // for bool ++-#include // for ofstream ++-#include // for string ++-#include // for vector +++#include // for bool +++#include // for ofstream +++#include // for string +++#include // for vector ++ ++ class volk_test_results_t; ++ 
++-void read_results(std::vector *results); ++-void read_results(std::vector *results, std::string path); ++-void write_results(const std::vector *results, bool update_result); ++-void write_results(const std::vector *results, bool update_result, const std::string path); ++-void write_json(std::ofstream &json_file, std::vector results); +++void read_results(std::vector* results); +++void read_results(std::vector* results, std::string path); +++void write_results(const std::vector* results, bool update_result); +++void write_results(const std::vector* results, +++ bool update_result, +++ const std::string path); +++void write_json(std::ofstream& json_file, std::vector results); ++diff --git a/cmake/msvc/config.h b/cmake/msvc/config.h ++index 8b12c2a..68f716e 100644 ++--- a/cmake/msvc/config.h +++++ b/cmake/msvc/config.h ++@@ -9,7 +9,7 @@ ++ // enable inline functions for C code ++ //////////////////////////////////////////////////////////////////////// ++ #ifndef __cplusplus ++-# define inline __inline +++#define inline __inline ++ #endif ++ ++ //////////////////////////////////////////////////////////////////////// ++@@ -23,12 +23,21 @@ typedef ptrdiff_t ssize_t; ++ //////////////////////////////////////////////////////////////////////// ++ #if _MSC_VER < 1800 ++ #include ++-static inline long lrint(double x){return (long)(x > 0.0 ? x + 0.5 : x - 0.5);} ++-static inline long lrintf(float x){return (long)(x > 0.0f ? x + 0.5f : x - 0.5f);} ++-static inline long long llrint(double x){return (long long)(x > 0.0 ? x + 0.5 : x - 0.5);} ++-static inline long long llrintf(float x){return (long long)(x > 0.0f ? x + 0.5f : x - 0.5f);} ++-static inline double rint(double x){return (x > 0.0)? floor(x + 0.5) : ceil(x - 0.5);} ++-static inline float rintf(float x){return (x > 0.0f)? floorf(x + 0.5f) : ceilf(x - 0.5f);} +++static inline long lrint(double x) { return (long)(x > 0.0 ? x + 0.5 : x - 0.5); } +++static inline long lrintf(float x) { return (long)(x > 0.0f ? x + 0.5f : x - 0.5f); } +++static inline long long llrint(double x) +++{ +++ return (long long)(x > 0.0 ? x + 0.5 : x - 0.5); +++} +++static inline long long llrintf(float x) +++{ +++ return (long long)(x > 0.0f ? x + 0.5f : x - 0.5f); +++} +++static inline double rint(double x) { return (x > 0.0) ? floor(x + 0.5) : ceil(x - 0.5); } +++static inline float rintf(float x) +++{ +++ return (x > 0.0f) ? floorf(x + 0.5f) : ceilf(x - 0.5f); +++} ++ #endif ++ ++ //////////////////////////////////////////////////////////////////////// ++@@ -43,7 +52,7 @@ static inline float rintf(float x){return (x > 0.0f)? floorf(x + 0.5f) : ceilf(x ++ // random and srandom ++ //////////////////////////////////////////////////////////////////////// ++ #include ++-static inline long int random (void) { return rand(); } ++-static inline void srandom (unsigned int seed) { srand(seed); } +++static inline long int random(void) { return rand(); } +++static inline void srandom(unsigned int seed) { srand(seed); } ++ ++ #endif // _MSC_CONFIG_H_ ] ++diff --git a/cmake/msvc/sys/time.h b/cmake/msvc/sys/time.h ++index aa0f5dc..4bda1ba 100644 ++--- a/cmake/msvc/sys/time.h +++++ b/cmake/msvc/sys/time.h ++@@ -10,67 +10,62 @@ ++ #define NOMINMAX ++ #endif ++ ++-//http://social.msdn.microsoft.com/Forums/en/vcgeneral/thread/430449b3-f6dd-4e18-84de-eebd26a8d668 +++// http://social.msdn.microsoft.com/Forums/en/vcgeneral/thread/430449b3-f6dd-4e18-84de-eebd26a8d668 ++ #include < time.h > ++ #include //I've omitted this line. 
++ #if defined(_MSC_VER) || defined(_MSC_EXTENSIONS) ++- #define DELTA_EPOCH_IN_MICROSECS 11644473600000000Ui64 +++#define DELTA_EPOCH_IN_MICROSECS 11644473600000000Ui64 ++ #else ++- #define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL +++#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL ++ #endif ++ ++ #if _MSC_VER < 1900 ++ struct timespec { ++ ++-time_t tv_sec; /* Seconds since 00:00:00 GMT, */ +++ time_t tv_sec; /* Seconds since 00:00:00 GMT, */ ++ ++-/* 1 January 1970 */ +++ /* 1 January 1970 */ ++ ++-long tv_nsec; /* Additional nanoseconds since */ ++- ++-/* tv_sec */ +++ long tv_nsec; /* Additional nanoseconds since */ ++ +++ /* tv_sec */ ++ }; ++ #endif ++ ++-struct timezone ++-{ ++- int tz_minuteswest; /* minutes W of Greenwich */ ++- int tz_dsttime; /* type of dst correction */ +++struct timezone { +++ int tz_minuteswest; /* minutes W of Greenwich */ +++ int tz_dsttime; /* type of dst correction */ ++ }; ++ ++-static inline int gettimeofday(struct timeval *tv, struct timezone *tz) +++static inline int gettimeofday(struct timeval* tv, struct timezone* tz) ++ { ++- FILETIME ft; ++- unsigned __int64 tmpres = 0; ++- static int tzflag; ++- ++- if (NULL != tv) ++- { ++- GetSystemTimeAsFileTime(&ft); ++- ++- tmpres |= ft.dwHighDateTime; ++- tmpres <<= 32; ++- tmpres |= ft.dwLowDateTime; ++- ++- /*converting file time to unix epoch*/ ++- tmpres -= DELTA_EPOCH_IN_MICROSECS; ++- tv->tv_sec = (long)(tmpres / 1000000UL); ++- tv->tv_usec = (long)(tmpres % 1000000UL); ++- } ++- ++- if (NULL != tz) ++- { ++- if (!tzflag) ++- { ++- _tzset(); ++- tzflag++; +++ FILETIME ft; +++ unsigned __int64 tmpres = 0; +++ static int tzflag; +++ +++ if (NULL != tv) { +++ GetSystemTimeAsFileTime(&ft); +++ +++ tmpres |= ft.dwHighDateTime; +++ tmpres <<= 32; +++ tmpres |= ft.dwLowDateTime; +++ +++ /*converting file time to unix epoch*/ +++ tmpres -= DELTA_EPOCH_IN_MICROSECS; +++ tv->tv_sec = (long)(tmpres / 1000000UL); +++ tv->tv_usec = (long)(tmpres % 1000000UL); +++ } +++ +++ if (NULL != tz) { +++ if (!tzflag) { +++ _tzset(); +++ tzflag++; +++ } +++ tz->tz_minuteswest = _timezone / 60; +++ tz->tz_dsttime = _daylight; ++ } ++- tz->tz_minuteswest = _timezone / 60; ++- tz->tz_dsttime = _daylight; ++- } ++ ++- return 0; +++ return 0; ++ } ++ ++ #endif //_MSC_SYS_TIME_H_ ++diff --git a/include/volk/saturation_arithmetic.h b/include/volk/saturation_arithmetic.h ++index 0886844..7b95ba2 100644 ++--- a/include/volk/saturation_arithmetic.h +++++ b/include/volk/saturation_arithmetic.h ++@@ -28,20 +28,24 @@ ++ ++ static inline int16_t sat_adds16i(int16_t x, int16_t y) ++ { ++- int32_t res = (int32_t) x + (int32_t) y; +++ int32_t res = (int32_t)x + (int32_t)y; ++ ++- if (res < SHRT_MIN) res = SHRT_MIN; ++- if (res > SHRT_MAX) res = SHRT_MAX; +++ if (res < SHRT_MIN) +++ res = SHRT_MIN; +++ if (res > SHRT_MAX) +++ res = SHRT_MAX; ++ ++ return res; ++ } ++ ++ static inline int16_t sat_muls16i(int16_t x, int16_t y) ++ { ++- int32_t res = (int32_t) x * (int32_t) y; +++ int32_t res = (int32_t)x * (int32_t)y; ++ ++- if (res < SHRT_MIN) res = SHRT_MIN; ++- if (res > SHRT_MAX) res = SHRT_MAX; +++ if (res < SHRT_MIN) +++ res = SHRT_MIN; +++ if (res > SHRT_MAX) +++ res = SHRT_MAX; ++ ++ return res; ++ } ++diff --git a/include/volk/volk_alloc.hh b/include/volk/volk_alloc.hh ++index a2975da..44bcfaf 100644 ++--- a/include/volk/volk_alloc.hh +++++ b/include/volk/volk_alloc.hh ++@@ -40,30 +40,40 @@ namespace volk { ++ */ ++ template ++ struct alloc { ++- typedef T value_type; +++ typedef T value_type; ++ ++- alloc() = default; +++ 
alloc() = default; ++ ++- template constexpr alloc(alloc const&) noexcept {} +++ template +++ constexpr alloc(alloc const&) noexcept +++ { +++ } ++ ++- T* allocate(std::size_t n) { ++- if (n > std::numeric_limits::max() / sizeof(T)) throw std::bad_alloc(); +++ T* allocate(std::size_t n) +++ { +++ if (n > std::numeric_limits::max() / sizeof(T)) +++ throw std::bad_alloc(); ++ ++- if (auto p = static_cast(volk_malloc(n*sizeof(T), volk_get_alignment()))) ++- return p; +++ if (auto p = static_cast(volk_malloc(n * sizeof(T), volk_get_alignment()))) +++ return p; ++ ++- throw std::bad_alloc(); ++- } +++ throw std::bad_alloc(); +++ } ++ ++- void deallocate(T* p, std::size_t) noexcept { volk_free(p); } ++- ++-} ; +++ void deallocate(T* p, std::size_t) noexcept { volk_free(p); } +++}; ++ ++ template ++-bool operator==(alloc const&, alloc const&) { return true; } +++bool operator==(alloc const&, alloc const&) +++{ +++ return true; +++} ++ ++ template ++-bool operator!=(alloc const&, alloc const&) { return false; } +++bool operator!=(alloc const&, alloc const&) +++{ +++ return false; +++} ++ ++ ++ /*! ++@@ -73,8 +83,8 @@ bool operator!=(alloc const&, alloc const&) { return false; } ++ * example code: ++ * volk::vector v(100); // vector using volk_malloc, volk_free ++ */ ++-template ++-using vector = std::vector >; +++template +++using vector = std::vector>; ++ ++ } // namespace volk ++ #endif // INCLUDED_VOLK_ALLOC_H ++diff --git a/include/volk/volk_avx2_intrinsics.h b/include/volk/volk_avx2_intrinsics.h ++index 17badc4..00f3b52 100644 ++--- a/include/volk/volk_avx2_intrinsics.h +++++ b/include/volk/volk_avx2_intrinsics.h ++@@ -1,19 +1,19 @@ ++ /* -*- c++ -*- */ ++-/* +++/* ++ * Copyright 2015 Free Software Foundation, Inc. ++- * +++ * ++ * This file is part of GNU Radio ++- * +++ * ++ * GNU Radio is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 3, or (at your option) ++ * any later version. ++- * +++ * ++ * GNU Radio is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++- * +++ * ++ * You should have received a copy of the GNU General Public License ++ * along with GNU Radio; see the file COPYING. 
If not, write to ++ * the Free Software Foundation, Inc., 51 Franklin Street, ++@@ -27,28 +27,59 @@ ++ ++ #ifndef INCLUDE_VOLK_VOLK_AVX2_INTRINSICS_H_ ++ #define INCLUDE_VOLK_VOLK_AVX2_INTRINSICS_H_ ++-#include ++ #include "volk/volk_avx_intrinsics.h" +++#include ++ ++-static inline __m256 ++-_mm256_polar_sign_mask_avx2(__m128i fbits){ ++- const __m128i zeros = _mm_set1_epi8(0x00); ++- const __m128i sign_extract = _mm_set1_epi8(0x80); ++- const __m256i shuffle_mask = _mm256_setr_epi8(0xff, 0xff, 0xff, 0x00, 0xff, 0xff, 0xff, 0x01, 0xff, 0xff, 0xff, 0x02, 0xff, 0xff, 0xff, 0x03, ++- 0xff, 0xff, 0xff, 0x04, 0xff, 0xff, 0xff, 0x05, 0xff, 0xff, 0xff, 0x06, 0xff, 0xff, 0xff, 0x07); ++- __m256i sign_bits = _mm256_setzero_si256(); ++- ++- fbits = _mm_cmpgt_epi8(fbits, zeros); ++- fbits = _mm_and_si128(fbits, sign_extract); ++- sign_bits = _mm256_insertf128_si256(sign_bits,fbits,0); ++- sign_bits = _mm256_insertf128_si256(sign_bits,fbits,1); ++- sign_bits = _mm256_shuffle_epi8(sign_bits, shuffle_mask); +++static inline __m256 _mm256_polar_sign_mask_avx2(__m128i fbits) +++{ +++ const __m128i zeros = _mm_set1_epi8(0x00); +++ const __m128i sign_extract = _mm_set1_epi8(0x80); +++ const __m256i shuffle_mask = _mm256_setr_epi8(0xff, +++ 0xff, +++ 0xff, +++ 0x00, +++ 0xff, +++ 0xff, +++ 0xff, +++ 0x01, +++ 0xff, +++ 0xff, +++ 0xff, +++ 0x02, +++ 0xff, +++ 0xff, +++ 0xff, +++ 0x03, +++ 0xff, +++ 0xff, +++ 0xff, +++ 0x04, +++ 0xff, +++ 0xff, +++ 0xff, +++ 0x05, +++ 0xff, +++ 0xff, +++ 0xff, +++ 0x06, +++ 0xff, +++ 0xff, +++ 0xff, +++ 0x07); +++ __m256i sign_bits = _mm256_setzero_si256(); ++ ++- return _mm256_castsi256_ps(sign_bits); +++ fbits = _mm_cmpgt_epi8(fbits, zeros); +++ fbits = _mm_and_si128(fbits, sign_extract); +++ sign_bits = _mm256_insertf128_si256(sign_bits, fbits, 0); +++ sign_bits = _mm256_insertf128_si256(sign_bits, fbits, 1); +++ sign_bits = _mm256_shuffle_epi8(sign_bits, shuffle_mask); +++ +++ return _mm256_castsi256_ps(sign_bits); ++ } ++ ++ static inline __m256 ++-_mm256_polar_fsign_add_llrs_avx2(__m256 src0, __m256 src1, __m128i fbits){ +++_mm256_polar_fsign_add_llrs_avx2(__m256 src0, __m256 src1, __m128i fbits) +++{ ++ // prepare sign mask for correct +- ++ __m256 sign_mask = _mm256_polar_sign_mask_avx2(fbits); ++ ++@@ -61,26 +92,31 @@ _mm256_polar_fsign_add_llrs_avx2(__m256 src0, __m256 src1, __m128i fbits){ ++ return dst; ++ } ++ ++-static inline __m256 ++-_mm256_magnitudesquared_ps_avx2(const __m256 cplxValue0, const __m256 cplxValue1){ ++- const __m256i idx = _mm256_set_epi32(7,6,3,2,5,4,1,0); ++- const __m256 squared0 = _mm256_mul_ps(cplxValue0, cplxValue0); // Square the values ++- const __m256 squared1 = _mm256_mul_ps(cplxValue1, cplxValue1); // Square the Values ++- const __m256 complex_result = _mm256_hadd_ps(squared0, squared1); ++- return _mm256_permutevar8x32_ps(complex_result, idx); +++static inline __m256 _mm256_magnitudesquared_ps_avx2(const __m256 cplxValue0, +++ const __m256 cplxValue1) +++{ +++ const __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0); +++ const __m256 squared0 = _mm256_mul_ps(cplxValue0, cplxValue0); // Square the values +++ const __m256 squared1 = _mm256_mul_ps(cplxValue1, cplxValue1); // Square the Values +++ const __m256 complex_result = _mm256_hadd_ps(squared0, squared1); +++ return _mm256_permutevar8x32_ps(complex_result, idx); ++ } ++ ++-static inline __m256 ++-_mm256_scaled_norm_dist_ps_avx2(const __m256 symbols0, const __m256 symbols1, const __m256 points0, const __m256 points1, const __m256 scalar){ ++- /* ++- * Calculate: |y - x|^2 * 
SNR_lin ++- * Consider 'symbolsX' and 'pointsX' to be complex float ++- * 'symbolsX' are 'y' and 'pointsX' are 'x' ++- */ ++- const __m256 diff0 = _mm256_sub_ps(symbols0, points0); ++- const __m256 diff1 = _mm256_sub_ps(symbols1, points1); ++- const __m256 norms = _mm256_magnitudesquared_ps_avx2(diff0, diff1); ++- return _mm256_mul_ps(norms, scalar); +++static inline __m256 _mm256_scaled_norm_dist_ps_avx2(const __m256 symbols0, +++ const __m256 symbols1, +++ const __m256 points0, +++ const __m256 points1, +++ const __m256 scalar) +++{ +++ /* +++ * Calculate: |y - x|^2 * SNR_lin +++ * Consider 'symbolsX' and 'pointsX' to be complex float +++ * 'symbolsX' are 'y' and 'pointsX' are 'x' +++ */ +++ const __m256 diff0 = _mm256_sub_ps(symbols0, points0); +++ const __m256 diff1 = _mm256_sub_ps(symbols1, points1); +++ const __m256 norms = _mm256_magnitudesquared_ps_avx2(diff0, diff1); +++ return _mm256_mul_ps(norms, scalar); ++ } ++ ++ #endif /* INCLUDE_VOLK_VOLK_AVX2_INTRINSICS_H_ */ ++diff --git a/include/volk/volk_avx_intrinsics.h b/include/volk/volk_avx_intrinsics.h ++index 808799f..bec846d 100644 ++--- a/include/volk/volk_avx_intrinsics.h +++++ b/include/volk/volk_avx_intrinsics.h ++@@ -1,19 +1,19 @@ ++ /* -*- c++ -*- */ ++-/* +++/* ++ * Copyright 2015 Free Software Foundation, Inc. ++- * +++ * ++ * This file is part of GNU Radio ++- * +++ * ++ * GNU Radio is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 3, or (at your option) ++ * any later version. ++- * +++ * ++ * GNU Radio is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++- * +++ * ++ * You should have received a copy of the GNU General Public License ++ * along with GNU Radio; see the file COPYING. If not, write to ++ * the Free Software Foundation, Inc., 51 Franklin Street, ++@@ -29,90 +29,126 @@ ++ #define INCLUDE_VOLK_VOLK_AVX_INTRINSICS_H_ ++ #include ++ ++-static inline __m256 ++-_mm256_complexmul_ps(__m256 x, __m256 y) +++static inline __m256 _mm256_complexmul_ps(__m256 x, __m256 y) ++ { ++- __m256 yl, yh, tmp1, tmp2; ++- yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr ... ++- yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di ... ++- tmp1 = _mm256_mul_ps(x, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr ... ++- x = _mm256_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br ... ++- tmp2 = _mm256_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di ++- return _mm256_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di +++ __m256 yl, yh, tmp1, tmp2; +++ yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr ... +++ yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di ... +++ tmp1 = _mm256_mul_ps(x, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr ... +++ x = _mm256_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br ... 
+++ tmp2 = _mm256_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di +++ return _mm256_addsub_ps(tmp1, +++ tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di ++ } ++ ++-static inline __m256 ++-_mm256_conjugate_ps(__m256 x){ ++- const __m256 conjugator = _mm256_setr_ps(0, -0.f, 0, -0.f, 0, -0.f, 0, -0.f); ++- return _mm256_xor_ps(x, conjugator); // conjugate y +++static inline __m256 _mm256_conjugate_ps(__m256 x) +++{ +++ const __m256 conjugator = _mm256_setr_ps(0, -0.f, 0, -0.f, 0, -0.f, 0, -0.f); +++ return _mm256_xor_ps(x, conjugator); // conjugate y ++ } ++ ++-static inline __m256 ++-_mm256_complexconjugatemul_ps(__m256 x, __m256 y){ ++- y = _mm256_conjugate_ps(y); ++- return _mm256_complexmul_ps(x, y); +++static inline __m256 _mm256_complexconjugatemul_ps(__m256 x, __m256 y) +++{ +++ y = _mm256_conjugate_ps(y); +++ return _mm256_complexmul_ps(x, y); ++ } ++ ++-static inline __m256 ++-_mm256_normalize_ps(__m256 val) +++static inline __m256 _mm256_normalize_ps(__m256 val) ++ { ++- __m256 tmp1 = _mm256_mul_ps(val, val); ++- tmp1 = _mm256_hadd_ps(tmp1, tmp1); ++- tmp1 = _mm256_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(3, 1, 2, 0)); // equals 0xD8 ++- tmp1 = _mm256_sqrt_ps(tmp1); ++- return _mm256_div_ps(val, tmp1); +++ __m256 tmp1 = _mm256_mul_ps(val, val); +++ tmp1 = _mm256_hadd_ps(tmp1, tmp1); +++ tmp1 = _mm256_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(3, 1, 2, 0)); // equals 0xD8 +++ tmp1 = _mm256_sqrt_ps(tmp1); +++ return _mm256_div_ps(val, tmp1); ++ } ++ ++-static inline __m256 ++-_mm256_magnitudesquared_ps(__m256 cplxValue1, __m256 cplxValue2){ ++- __m256 complex1, complex2; ++- cplxValue1 = _mm256_mul_ps(cplxValue1, cplxValue1); // Square the values ++- cplxValue2 = _mm256_mul_ps(cplxValue2, cplxValue2); // Square the Values ++- complex1 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x20); ++- complex2 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x31); ++- return _mm256_hadd_ps(complex1, complex2); // Add the I2 and Q2 values +++static inline __m256 _mm256_magnitudesquared_ps(__m256 cplxValue1, __m256 cplxValue2) +++{ +++ __m256 complex1, complex2; +++ cplxValue1 = _mm256_mul_ps(cplxValue1, cplxValue1); // Square the values +++ cplxValue2 = _mm256_mul_ps(cplxValue2, cplxValue2); // Square the Values +++ complex1 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x20); +++ complex2 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x31); +++ return _mm256_hadd_ps(complex1, complex2); // Add the I2 and Q2 values ++ } ++ ++-static inline __m256 ++-_mm256_magnitude_ps(__m256 cplxValue1, __m256 cplxValue2){ ++- return _mm256_sqrt_ps(_mm256_magnitudesquared_ps(cplxValue1, cplxValue2)); +++static inline __m256 _mm256_magnitude_ps(__m256 cplxValue1, __m256 cplxValue2) +++{ +++ return _mm256_sqrt_ps(_mm256_magnitudesquared_ps(cplxValue1, cplxValue2)); ++ } ++ ++-static inline __m256 ++-_mm256_scaled_norm_dist_ps(const __m256 symbols0, const __m256 symbols1, const __m256 points0, const __m256 points1, const __m256 scalar){ ++- /* ++- * Calculate: |y - x|^2 * SNR_lin ++- * Consider 'symbolsX' and 'pointsX' to be complex float ++- * 'symbolsX' are 'y' and 'pointsX' are 'x' ++- */ ++- const __m256 diff0 = _mm256_sub_ps(symbols0, points0); ++- const __m256 diff1 = _mm256_sub_ps(symbols1, points1); ++- const __m256 norms = _mm256_magnitudesquared_ps(diff0, diff1); ++- return _mm256_mul_ps(norms, scalar); +++static inline __m256 _mm256_scaled_norm_dist_ps(const __m256 symbols0, +++ const __m256 symbols1, +++ const __m256 points0, +++ const __m256 points1, +++ const __m256 scalar) +++{ +++ /* +++ * 
Calculate: |y - x|^2 * SNR_lin +++ * Consider 'symbolsX' and 'pointsX' to be complex float +++ * 'symbolsX' are 'y' and 'pointsX' are 'x' +++ */ +++ const __m256 diff0 = _mm256_sub_ps(symbols0, points0); +++ const __m256 diff1 = _mm256_sub_ps(symbols1, points1); +++ const __m256 norms = _mm256_magnitudesquared_ps(diff0, diff1); +++ return _mm256_mul_ps(norms, scalar); ++ } ++ ++-static inline __m256 ++-_mm256_polar_sign_mask(__m128i fbits){ ++- __m256 sign_mask_dummy = _mm256_setzero_ps(); ++- const __m128i zeros = _mm_set1_epi8(0x00); ++- const __m128i sign_extract = _mm_set1_epi8(0x80); ++- const __m128i shuffle_mask0 = _mm_setr_epi8(0xff, 0xff, 0xff, 0x00, 0xff, 0xff, 0xff, 0x01, 0xff, 0xff, 0xff, 0x02, 0xff, 0xff, 0xff, 0x03); ++- const __m128i shuffle_mask1 = _mm_setr_epi8(0xff, 0xff, 0xff, 0x04, 0xff, 0xff, 0xff, 0x05, 0xff, 0xff, 0xff, 0x06, 0xff, 0xff, 0xff, 0x07); ++- ++- fbits = _mm_cmpgt_epi8(fbits, zeros); ++- fbits = _mm_and_si128(fbits, sign_extract); ++- __m128i sign_bits0 = _mm_shuffle_epi8(fbits, shuffle_mask0); ++- __m128i sign_bits1 = _mm_shuffle_epi8(fbits, shuffle_mask1); ++- ++- __m256 sign_mask = _mm256_insertf128_ps(sign_mask_dummy, _mm_castsi128_ps(sign_bits0), 0x0); ++- return _mm256_insertf128_ps(sign_mask, _mm_castsi128_ps(sign_bits1), 0x1); ++-// // This is the desired function call. Though it seems to be missing in GCC. ++-// // Compare: https://software.intel.com/sites/landingpage/IntrinsicsGuide/# ++-// return _mm256_set_m128(_mm_castsi128_ps(sign_bits1), _mm_castsi128_ps(sign_bits0)); +++static inline __m256 _mm256_polar_sign_mask(__m128i fbits) +++{ +++ __m256 sign_mask_dummy = _mm256_setzero_ps(); +++ const __m128i zeros = _mm_set1_epi8(0x00); +++ const __m128i sign_extract = _mm_set1_epi8(0x80); +++ const __m128i shuffle_mask0 = _mm_setr_epi8(0xff, +++ 0xff, +++ 0xff, +++ 0x00, +++ 0xff, +++ 0xff, +++ 0xff, +++ 0x01, +++ 0xff, +++ 0xff, +++ 0xff, +++ 0x02, +++ 0xff, +++ 0xff, +++ 0xff, +++ 0x03); +++ const __m128i shuffle_mask1 = _mm_setr_epi8(0xff, +++ 0xff, +++ 0xff, +++ 0x04, +++ 0xff, +++ 0xff, +++ 0xff, +++ 0x05, +++ 0xff, +++ 0xff, +++ 0xff, +++ 0x06, +++ 0xff, +++ 0xff, +++ 0xff, +++ 0x07); +++ +++ fbits = _mm_cmpgt_epi8(fbits, zeros); +++ fbits = _mm_and_si128(fbits, sign_extract); +++ __m128i sign_bits0 = _mm_shuffle_epi8(fbits, shuffle_mask0); +++ __m128i sign_bits1 = _mm_shuffle_epi8(fbits, shuffle_mask1); +++ +++ __m256 sign_mask = +++ _mm256_insertf128_ps(sign_mask_dummy, _mm_castsi128_ps(sign_bits0), 0x0); +++ return _mm256_insertf128_ps(sign_mask, _mm_castsi128_ps(sign_bits1), 0x1); +++ // // This is the desired function call. Though it seems to be missing in GCC. 
+++ // // Compare: https://software.intel.com/sites/landingpage/IntrinsicsGuide/# +++ // return _mm256_set_m128(_mm_castsi128_ps(sign_bits1), +++ // _mm_castsi128_ps(sign_bits0)); ++ } ++ ++ static inline void ++-_mm256_polar_deinterleave(__m256 *llr0, __m256 *llr1, __m256 src0, __m256 src1){ +++_mm256_polar_deinterleave(__m256* llr0, __m256* llr1, __m256 src0, __m256 src1) +++{ ++ // deinterleave values ++ __m256 part0 = _mm256_permute2f128_ps(src0, src1, 0x20); ++ __m256 part1 = _mm256_permute2f128_ps(src0, src1, 0x31); ++@@ -120,22 +156,25 @@ _mm256_polar_deinterleave(__m256 *llr0, __m256 *llr1, __m256 src0, __m256 src1){ ++ *llr1 = _mm256_shuffle_ps(part0, part1, 0xdd); ++ } ++ ++-static inline __m256 ++-_mm256_polar_minsum_llrs(__m256 src0, __m256 src1){ +++static inline __m256 _mm256_polar_minsum_llrs(__m256 src0, __m256 src1) +++{ ++ const __m256 sign_mask = _mm256_set1_ps(-0.0f); ++- const __m256 abs_mask = _mm256_andnot_ps(sign_mask, _mm256_castsi256_ps(_mm256_set1_epi8(0xff))); +++ const __m256 abs_mask = +++ _mm256_andnot_ps(sign_mask, _mm256_castsi256_ps(_mm256_set1_epi8(0xff))); ++ ++ __m256 llr0, llr1; ++ _mm256_polar_deinterleave(&llr0, &llr1, src0, src1); ++ ++ // calculate result ++- __m256 sign = _mm256_xor_ps(_mm256_and_ps(llr0, sign_mask), _mm256_and_ps(llr1, sign_mask)); ++- __m256 dst = _mm256_min_ps(_mm256_and_ps(llr0, abs_mask), _mm256_and_ps(llr1, abs_mask)); +++ __m256 sign = +++ _mm256_xor_ps(_mm256_and_ps(llr0, sign_mask), _mm256_and_ps(llr1, sign_mask)); +++ __m256 dst = +++ _mm256_min_ps(_mm256_and_ps(llr0, abs_mask), _mm256_and_ps(llr1, abs_mask)); ++ return _mm256_or_ps(dst, sign); ++ } ++ ++-static inline __m256 ++-_mm256_polar_fsign_add_llrs(__m256 src0, __m256 src1, __m128i fbits){ +++static inline __m256 _mm256_polar_fsign_add_llrs(__m256 src0, __m256 src1, __m128i fbits) +++{ ++ // prepare sign mask for correct +- ++ __m256 sign_mask = _mm256_polar_sign_mask(fbits); ++ ++diff --git a/include/volk/volk_common.h b/include/volk/volk_common.h ++index 50ea07b..8167d23 100644 ++--- a/include/volk/volk_common.h +++++ b/include/volk/volk_common.h ++@@ -18,61 +18,71 @@ ++ // AppleClang also defines __GNUC__, so do this check first. These ++ // will probably be the same as for __GNUC__, but let's keep them ++ // separate just to be safe. 
++-# define __VOLK_ATTR_ALIGNED(x) __attribute__((aligned(x))) ++-# define __VOLK_ATTR_UNUSED __attribute__((unused)) ++-# define __VOLK_ATTR_INLINE __attribute__((always_inline)) ++-# define __VOLK_ATTR_DEPRECATED __attribute__((deprecated)) ++-# define __VOLK_ASM __asm__ ++-# define __VOLK_VOLATILE __volatile__ ++-# define __VOLK_ATTR_EXPORT __attribute__((visibility("default"))) ++-# define __VOLK_ATTR_IMPORT __attribute__((visibility("default"))) ++-# define __VOLK_PREFETCH(addr) __builtin_prefetch(addr) ++-#elif defined(__GNUC__) ++-# define __VOLK_ATTR_ALIGNED(x) __attribute__((aligned(x))) ++-# define __VOLK_ATTR_UNUSED __attribute__((unused)) ++-# define __VOLK_ATTR_INLINE __attribute__((always_inline)) ++-# define __VOLK_ATTR_DEPRECATED __attribute__((deprecated)) ++-# define __VOLK_ASM __asm__ ++-# define __VOLK_VOLATILE __volatile__ ++-# if __GNUC__ >= 4 ++-# define __VOLK_ATTR_EXPORT __attribute__((visibility("default"))) ++-# define __VOLK_ATTR_IMPORT __attribute__((visibility("default"))) ++-# else ++-# define __VOLK_ATTR_EXPORT ++-# define __VOLK_ATTR_IMPORT ++-# endif ++-# define __VOLK_PREFETCH(addr) __builtin_prefetch(addr) +++#define __VOLK_ATTR_ALIGNED(x) __attribute__((aligned(x))) +++#define __VOLK_ATTR_UNUSED __attribute__((unused)) +++#define __VOLK_ATTR_INLINE __attribute__((always_inline)) +++#define __VOLK_ATTR_DEPRECATED __attribute__((deprecated)) +++#define __VOLK_ASM __asm__ +++#define __VOLK_VOLATILE __volatile__ +++#define __VOLK_ATTR_EXPORT __attribute__((visibility("default"))) +++#define __VOLK_ATTR_IMPORT __attribute__((visibility("default"))) +++#define __VOLK_PREFETCH(addr) __builtin_prefetch(addr) +++#elif defined __GNUC__ +++#define __VOLK_ATTR_ALIGNED(x) __attribute__((aligned(x))) +++#define __VOLK_ATTR_UNUSED __attribute__((unused)) +++#define __VOLK_ATTR_INLINE __attribute__((always_inline)) +++#define __VOLK_ATTR_DEPRECATED __attribute__((deprecated)) +++#define __VOLK_ASM __asm__ +++#define __VOLK_VOLATILE __volatile__ +++#if __GNUC__ >= 4 +++#define __VOLK_ATTR_EXPORT __attribute__((visibility("default"))) +++#define __VOLK_ATTR_IMPORT __attribute__((visibility("default"))) ++ #else ++-# warning "Unknown compiler. Using default VOLK macros, which may or not work." 
++-# define __VOLK_ATTR_ALIGNED(x) ++-# define __VOLK_ATTR_UNUSED ++-# define __VOLK_ATTR_INLINE ++-# define __VOLK_ATTR_DEPRECATED ++-# define __VOLK_ATTR_EXPORT ++-# define __VOLK_ATTR_IMPORT ++-# define __VOLK_PREFETCH(addr) ++-# define __VOLK_ASM __asm__ ++-# define __VOLK_VOLATILE __volatile__ +++#define __VOLK_ATTR_EXPORT +++#define __VOLK_ATTR_IMPORT +++#endif +++#define __VOLK_PREFETCH(addr) __builtin_prefetch(addr) +++#elif _MSC_VER +++#define __VOLK_ATTR_ALIGNED(x) __declspec(align(x)) +++#define __VOLK_ATTR_UNUSED +++#define __VOLK_ATTR_INLINE __forceinline +++#define __VOLK_ATTR_DEPRECATED __declspec(deprecated) +++#define __VOLK_ATTR_EXPORT __declspec(dllexport) +++#define __VOLK_ATTR_IMPORT __declspec(dllimport) +++#define __VOLK_PREFETCH(addr) +++#define __VOLK_ASM __asm +++#define __VOLK_VOLATILE +++#else +++#define __VOLK_ATTR_ALIGNED(x) +++#define __VOLK_ATTR_UNUSED +++#define __VOLK_ATTR_INLINE +++#define __VOLK_ATTR_DEPRECATED +++#define __VOLK_ATTR_EXPORT +++#define __VOLK_ATTR_IMPORT +++#define __VOLK_PREFETCH(addr) +++#define __VOLK_ASM __asm__ +++#define __VOLK_VOLATILE __volatile__ ++ #endif ++ ++ //////////////////////////////////////////////////////////////////////// ++ // Ignore annoying warnings in MSVC ++ //////////////////////////////////////////////////////////////////////// ++ #if defined(_MSC_VER) ++-# pragma warning(disable: 4244) //'conversion' conversion from 'type1' to 'type2', possible loss of data ++-# pragma warning(disable: 4305) //'identifier' : truncation from 'type1' to 'type2' +++#pragma warning(disable : 4244) //'conversion' conversion from 'type1' to 'type2', +++ //possible loss of data +++#pragma warning(disable : 4305) //'identifier' : truncation from 'type1' to 'type2' ++ #endif ++ ++ //////////////////////////////////////////////////////////////////////// ++ // C-linkage declaration macros ++ // FIXME: due to the usage of complex.h, require gcc for c-linkage ++ //////////////////////////////////////////////////////////////////////// ++-#if defined(__cplusplus) && (defined(__GNUC__) || defined(__clang__)) ++-# define __VOLK_DECL_BEGIN extern "C" { ++-# define __VOLK_DECL_END } +++#if defined(__cplusplus) && (__GNUC__) +++#define __VOLK_DECL_BEGIN extern "C" { +++#define __VOLK_DECL_END } ++ #else ++-# define __VOLK_DECL_BEGIN ++-# define __VOLK_DECL_END +++#define __VOLK_DECL_BEGIN +++#define __VOLK_DECL_END ++ #endif ++ ++ //////////////////////////////////////////////////////////////////////// ++@@ -80,9 +90,9 @@ ++ // http://gcc.gnu.org/wiki/Visibility ++ //////////////////////////////////////////////////////////////////////// ++ #ifdef volk_EXPORTS ++-# define VOLK_API __VOLK_ATTR_EXPORT +++#define VOLK_API __VOLK_ATTR_EXPORT ++ #else ++-# define VOLK_API __VOLK_ATTR_IMPORT +++#define VOLK_API __VOLK_ATTR_IMPORT ++ #endif ++ ++ //////////////////////////////////////////////////////////////////////// ++@@ -98,38 +108,38 @@ ++ #endif ++ #endif ++ ++-union bit128{ ++- uint8_t i8[16]; ++- uint16_t i16[8]; ++- uint32_t i[4]; ++- float f[4]; ++- double d[2]; +++union bit128 { +++ uint8_t i8[16]; +++ uint16_t i16[8]; +++ uint32_t i[4]; +++ float f[4]; +++ double d[2]; ++ ++- #ifdef LV_HAVE_SSE ++- __m128 float_vec; ++- #endif +++#ifdef LV_HAVE_SSE +++ __m128 float_vec; +++#endif ++ ++- #ifdef LV_HAVE_SSE2 ++- __m128i int_vec; ++- __m128d double_vec; ++- #endif +++#ifdef LV_HAVE_SSE2 +++ __m128i int_vec; +++ __m128d double_vec; +++#endif ++ }; ++ ++-union bit256{ ++- uint8_t i8[32]; ++- uint16_t i16[16]; ++- uint32_t i[8]; ++- float f[8]; ++- 
double d[4]; +++union bit256 { +++ uint8_t i8[32]; +++ uint16_t i16[16]; +++ uint32_t i[8]; +++ float f[8]; +++ double d[4]; ++ ++- #ifdef LV_HAVE_AVX ++- __m256 float_vec; ++- __m256i int_vec; ++- __m256d double_vec; ++- #endif +++#ifdef LV_HAVE_AVX +++ __m256 float_vec; +++ __m256i int_vec; +++ __m256d double_vec; +++#endif ++ }; ++ ++-#define bit128_p(x) ((union bit128 *)(x)) ++-#define bit256_p(x) ((union bit256 *)(x)) +++#define bit128_p(x) ((union bit128*)(x)) +++#define bit256_p(x) ((union bit256*)(x)) ++ ++ #endif /*INCLUDED_LIBVOLK_COMMON_H*/ ++diff --git a/include/volk/volk_complex.h b/include/volk/volk_complex.h ++index 1d61d78..ae78873 100644 ++--- a/include/volk/volk_complex.h +++++ b/include/volk/volk_complex.h ++@@ -19,49 +19,58 @@ ++ ++ #ifdef __cplusplus ++ ++-#include ++ #include +++#include ++ ++-typedef std::complex lv_8sc_t; +++typedef std::complex lv_8sc_t; ++ typedef std::complex lv_16sc_t; ++ typedef std::complex lv_32sc_t; ++ typedef std::complex lv_64sc_t; ++-typedef std::complex lv_32fc_t; ++-typedef std::complex lv_64fc_t; +++typedef std::complex lv_32fc_t; +++typedef std::complex lv_64fc_t; ++ ++-template inline std::complex lv_cmake(const T &r, const T &i){ +++template +++inline std::complex lv_cmake(const T& r, const T& i) +++{ ++ return std::complex(r, i); ++ } ++ ++-template inline typename T::value_type lv_creal(const T &x){ +++template +++inline typename T::value_type lv_creal(const T& x) +++{ ++ return x.real(); ++ } ++ ++-template inline typename T::value_type lv_cimag(const T &x){ +++template +++inline typename T::value_type lv_cimag(const T& x) +++{ ++ return x.imag(); ++ } ++ ++-template inline T lv_conj(const T &x){ +++template +++inline T lv_conj(const T& x) +++{ ++ return std::conj(x); ++ } ++ ++ #else /* __cplusplus */ ++ ++ #if __STDC_VERSION__ >= 199901L /* C99 check */ ++-/* this allows us to conj in lv_conj without the double detour for single-precision floats */ +++/* this allows us to conj in lv_conj without the double detour for single-precision floats +++ */ ++ #include ++ #endif /* C99 check */ ++ ++ #include ++ ++-typedef char complex lv_8sc_t; ++-typedef short complex lv_16sc_t; ++-typedef long complex lv_32sc_t; ++-typedef long long complex lv_64sc_t; ++-typedef float complex lv_32fc_t; ++-typedef double complex lv_64fc_t; +++typedef char complex lv_8sc_t; +++typedef short complex lv_16sc_t; +++typedef long complex lv_32sc_t; +++typedef long long complex lv_64sc_t; +++typedef float complex lv_32fc_t; +++typedef double complex lv_64fc_t; ++ ++-#define lv_cmake(r, i) ((r) + _Complex_I*(i)) +++#define lv_cmake(r, i) ((r) + _Complex_I * (i)) ++ ++ // When GNUC is available, use the complex extensions. ++ // The extensions always return the correct value type. ++diff --git a/include/volk/volk_malloc.h b/include/volk/volk_malloc.h ++index 3477b27..42ca2b0 100644 ++--- a/include/volk/volk_malloc.h +++++ b/include/volk/volk_malloc.h ++@@ -23,8 +23,8 @@ ++ #ifndef INCLUDED_VOLK_MALLOC_H ++ #define INCLUDED_VOLK_MALLOC_H ++ ++-#include ++ #include +++#include ++ ++ __VOLK_DECL_BEGIN ++ ++@@ -40,7 +40,8 @@ __VOLK_DECL_BEGIN ++ * For Apple Clang, we fall back to `posix_memalign`. ++ * see: https://linux.die.net/man/3/aligned_alloc ++ * For MSVC, we fall back to `_aligned_malloc`. 
++- * see: https://docs.microsoft.com/en-us/cpp/c-runtime-library/reference/aligned-malloc?view=vs-2019 +++ * see: +++ * https://docs.microsoft.com/en-us/cpp/c-runtime-library/reference/aligned-malloc?view=vs-2019 ++ * ++ * Because of the ways in which volk_malloc may allocate memory, it is ++ * important to always free volk_malloc pointers using volk_free. ++@@ -51,7 +52,7 @@ __VOLK_DECL_BEGIN ++ * \param alignment The byte alignment of the allocated memory. ++ * \return pointer to aligned memory. ++ */ ++-VOLK_API void *volk_malloc(size_t size, size_t alignment); +++VOLK_API void* volk_malloc(size_t size, size_t alignment); ++ ++ /*! ++ * \brief Free's memory allocated by volk_malloc. ++@@ -62,11 +63,12 @@ VOLK_API void *volk_malloc(size_t size, size_t alignment); ++ * Thus, in this case `volk_free` inherits the same behavior `free` exhibits. ++ * see: https://en.cppreference.com/w/c/memory/free ++ * In case `_aligned_malloc` was used, we call `_aligned_free`. ++- * see: https://docs.microsoft.com/en-us/cpp/c-runtime-library/reference/aligned-free?view=vs-2019 +++ * see: +++ * https://docs.microsoft.com/en-us/cpp/c-runtime-library/reference/aligned-free?view=vs-2019 ++ * ++ * \param aptr The aligned pointer allocated by volk_malloc. ++ */ ++-VOLK_API void volk_free(void *aptr); +++VOLK_API void volk_free(void* aptr); ++ ++ __VOLK_DECL_END ++ ++diff --git a/include/volk/volk_neon_intrinsics.h b/include/volk/volk_neon_intrinsics.h ++index 90e7b54..302bd30 100644 ++--- a/include/volk/volk_neon_intrinsics.h +++++ b/include/volk/volk_neon_intrinsics.h ++@@ -67,9 +67,9 @@ ++ 3. This notice may not be removed or altered from any source distribution. ++ ++ (this is the zlib license) ++- +++ ++ _vsincosq_f32 ++- +++ ++ */ ++ ++ /* ++@@ -83,13 +83,12 @@ ++ ++ ++ /* Magnitude squared for float32x4x2_t */ ++-static inline float32x4_t ++-_vmagnitudesquaredq_f32(float32x4x2_t cmplxValue) +++static inline float32x4_t _vmagnitudesquaredq_f32(float32x4x2_t cmplxValue) ++ { ++ float32x4_t iValue, qValue, result; ++ iValue = vmulq_f32(cmplxValue.val[0], cmplxValue.val[0]); // Square the values ++ qValue = vmulq_f32(cmplxValue.val[1], cmplxValue.val[1]); // Square the values ++- result = vaddq_f32(iValue, qValue); // Add the I2 and Q2 values +++ result = vaddq_f32(iValue, qValue); // Add the I2 and Q2 values ++ return result; ++ } ++ ++@@ -97,9 +96,11 @@ _vmagnitudesquaredq_f32(float32x4x2_t cmplxValue) ++ static inline float32x4_t _vinvsqrtq_f32(float32x4_t x) ++ { ++ float32x4_t sqrt_reciprocal = vrsqrteq_f32(x); ++- sqrt_reciprocal = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal); ++- sqrt_reciprocal = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal); ++- +++ sqrt_reciprocal = vmulq_f32( +++ vrsqrtsq_f32(vmulq_f32(x, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal); +++ sqrt_reciprocal = vmulq_f32( +++ vrsqrtsq_f32(vmulq_f32(x, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal); +++ ++ return sqrt_reciprocal; ++ } ++ ++@@ -108,19 +109,19 @@ static inline float32x4_t _vinvq_f32(float32x4_t x) ++ { ++ // Newton's method ++ float32x4_t recip = vrecpeq_f32(x); ++- recip = vmulq_f32(vrecpsq_f32(x, recip), recip); ++- recip = vmulq_f32(vrecpsq_f32(x, recip), recip); +++ recip = vmulq_f32(vrecpsq_f32(x, recip), recip); +++ recip = vmulq_f32(vrecpsq_f32(x, recip), recip); ++ return recip; ++ } ++ ++ /* Complex multiplication for float32x4x2_t */ ++-static inline float32x4x2_t ++-_vmultiply_complexq_f32(float32x4x2_t a_val, 
float32x4x2_t b_val) +++static inline float32x4x2_t _vmultiply_complexq_f32(float32x4x2_t a_val, +++ float32x4x2_t b_val) ++ { ++ float32x4x2_t tmp_real; ++ float32x4x2_t tmp_imag; ++ float32x4x2_t c_val; ++- +++ ++ // multiply the real*real and imag*imag to get real result ++ // a0r*b0r|a1r*b1r|a2r*b2r|a3r*b3r ++ tmp_real.val[0] = vmulq_f32(a_val.val[0], b_val.val[0]); ++@@ -140,12 +141,12 @@ _vmultiply_complexq_f32(float32x4x2_t a_val, float32x4x2_t b_val) ++ /* From ARM Compute Library, MIT license */ ++ static inline float32x4_t _vtaylor_polyq_f32(float32x4_t x, const float32x4_t coeffs[8]) ++ { ++- float32x4_t cA = vmlaq_f32(coeffs[0], coeffs[4], x); ++- float32x4_t cB = vmlaq_f32(coeffs[2], coeffs[6], x); ++- float32x4_t cC = vmlaq_f32(coeffs[1], coeffs[5], x); ++- float32x4_t cD = vmlaq_f32(coeffs[3], coeffs[7], x); ++- float32x4_t x2 = vmulq_f32(x, x); ++- float32x4_t x4 = vmulq_f32(x2, x2); +++ float32x4_t cA = vmlaq_f32(coeffs[0], coeffs[4], x); +++ float32x4_t cB = vmlaq_f32(coeffs[2], coeffs[6], x); +++ float32x4_t cC = vmlaq_f32(coeffs[1], coeffs[5], x); +++ float32x4_t cD = vmlaq_f32(coeffs[3], coeffs[7], x); +++ float32x4_t x2 = vmulq_f32(x, x); +++ float32x4_t x4 = vmulq_f32(x2, x2); ++ float32x4_t res = vmlaq_f32(vmlaq_f32(cA, cB, x2), vmlaq_f32(cC, cD, x2), x4); ++ return res; ++ } ++@@ -155,121 +156,123 @@ static inline float32x4_t _vtaylor_polyq_f32(float32x4_t x, const float32x4_t co ++ static inline float32x4_t _vlogq_f32(float32x4_t x) ++ { ++ const float32x4_t log_tab[8] = { ++- vdupq_n_f32(-2.29561495781f), ++- vdupq_n_f32(-2.47071170807f), ++- vdupq_n_f32(-5.68692588806f), ++- vdupq_n_f32(-0.165253549814f), ++- vdupq_n_f32(5.17591238022f), ++- vdupq_n_f32(0.844007015228f), ++- vdupq_n_f32(4.58445882797f), ++- vdupq_n_f32(0.0141278216615f), +++ vdupq_n_f32(-2.29561495781f), vdupq_n_f32(-2.47071170807f), +++ vdupq_n_f32(-5.68692588806f), vdupq_n_f32(-0.165253549814f), +++ vdupq_n_f32(5.17591238022f), vdupq_n_f32(0.844007015228f), +++ vdupq_n_f32(4.58445882797f), vdupq_n_f32(0.0141278216615f), ++ }; ++- ++- const int32x4_t CONST_127 = vdupq_n_s32(127); // 127 +++ +++ const int32x4_t CONST_127 = vdupq_n_s32(127); // 127 ++ const float32x4_t CONST_LN2 = vdupq_n_f32(0.6931471805f); // ln(2) ++- +++ ++ // Extract exponent ++- int32x4_t m = vsubq_s32(vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_f32(x), 23)), CONST_127); ++- float32x4_t val = vreinterpretq_f32_s32(vsubq_s32(vreinterpretq_s32_f32(x), vshlq_n_s32(m, 23))); ++- +++ int32x4_t m = vsubq_s32( +++ vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_f32(x), 23)), CONST_127); +++ float32x4_t val = +++ vreinterpretq_f32_s32(vsubq_s32(vreinterpretq_s32_f32(x), vshlq_n_s32(m, 23))); +++ ++ // Polynomial Approximation ++ float32x4_t poly = _vtaylor_polyq_f32(val, log_tab); ++- +++ ++ // Reconstruct ++ poly = vmlaq_f32(poly, vcvtq_f32_s32(m), CONST_LN2); ++- +++ ++ return poly; ++ } ++ ++ /* Evaluation of 4 sines & cosines at once. 
++ * Optimized from here (zlib license) ++ * http://gruntthepeon.free.fr/ssemath/ */ ++-static inline float32x4x2_t _vsincosq_f32(float32x4_t x) { +++static inline float32x4x2_t _vsincosq_f32(float32x4_t x) +++{ ++ const float32x4_t c_minus_cephes_DP1 = vdupq_n_f32(-0.78515625); ++ const float32x4_t c_minus_cephes_DP2 = vdupq_n_f32(-2.4187564849853515625e-4); ++ const float32x4_t c_minus_cephes_DP3 = vdupq_n_f32(-3.77489497744594108e-8); ++ const float32x4_t c_sincof_p0 = vdupq_n_f32(-1.9515295891e-4); ++- const float32x4_t c_sincof_p1 = vdupq_n_f32(8.3321608736e-3); +++ const float32x4_t c_sincof_p1 = vdupq_n_f32(8.3321608736e-3); ++ const float32x4_t c_sincof_p2 = vdupq_n_f32(-1.6666654611e-1); ++ const float32x4_t c_coscof_p0 = vdupq_n_f32(2.443315711809948e-005); ++ const float32x4_t c_coscof_p1 = vdupq_n_f32(-1.388731625493765e-003); ++ const float32x4_t c_coscof_p2 = vdupq_n_f32(4.166664568298827e-002); ++ const float32x4_t c_cephes_FOPI = vdupq_n_f32(1.27323954473516); // 4 / M_PI ++- +++ ++ const float32x4_t CONST_1 = vdupq_n_f32(1.f); ++ const float32x4_t CONST_1_2 = vdupq_n_f32(0.5f); ++ const float32x4_t CONST_0 = vdupq_n_f32(0.f); ++- const uint32x4_t CONST_2 = vdupq_n_u32(2); ++- const uint32x4_t CONST_4 = vdupq_n_u32(4); ++- +++ const uint32x4_t CONST_2 = vdupq_n_u32(2); +++ const uint32x4_t CONST_4 = vdupq_n_u32(4); +++ ++ uint32x4_t emm2; ++- +++ ++ uint32x4_t sign_mask_sin, sign_mask_cos; ++ sign_mask_sin = vcltq_f32(x, CONST_0); ++ x = vabsq_f32(x); ++ // scale by 4/pi ++ float32x4_t y = vmulq_f32(x, c_cephes_FOPI); ++- +++ ++ // store the integer part of y in mm0 ++ emm2 = vcvtq_u32_f32(y); ++ /* j=(j+1) & (~1) (see the cephes sources) */ ++ emm2 = vaddq_u32(emm2, vdupq_n_u32(1)); ++ emm2 = vandq_u32(emm2, vdupq_n_u32(~1)); ++ y = vcvtq_f32_u32(emm2); ++- +++ ++ /* get the polynom selection mask ++ there is one polynom for 0 <= x <= Pi/4 ++ and another one for Pi/4 ++ #include ++ #include +++#include ++ ++ __VOLK_DECL_BEGIN ++ ++-typedef struct volk_arch_pref ++-{ ++- char name[128]; //name of the kernel ++- char impl_a[128]; //best aligned impl ++- char impl_u[128]; //best unaligned impl +++typedef struct volk_arch_pref { +++ char name[128]; // name of the kernel +++ char impl_a[128]; // best aligned impl +++ char impl_u[128]; // best unaligned impl ++ } volk_arch_pref_t; ++ ++ //////////////////////////////////////////////////////////////////////// ++@@ -19,13 +18,13 @@ typedef struct volk_arch_pref ++ // if config file should be tested on existence for reading. ++ // returns \0 in the argument on failure. ++ //////////////////////////////////////////////////////////////////////// ++-VOLK_API void volk_get_config_path(char *, bool); +++VOLK_API void volk_get_config_path(char*, bool); ++ ++ //////////////////////////////////////////////////////////////////////// ++ // load prefs into global prefs struct ++ //////////////////////////////////////////////////////////////////////// ++-VOLK_API size_t volk_load_preferences(volk_arch_pref_t **); +++VOLK_API size_t volk_load_preferences(volk_arch_pref_t**); ++ ++ __VOLK_DECL_END ++ ++-#endif //INCLUDED_VOLK_PREFS_H +++#endif // INCLUDED_VOLK_PREFS_H ++diff --git a/include/volk/volk_sse3_intrinsics.h b/include/volk/volk_sse3_intrinsics.h ++index 6b53a2a..6bdc8d8 100644 ++--- a/include/volk/volk_sse3_intrinsics.h +++++ b/include/volk/volk_sse3_intrinsics.h ++@@ -1,19 +1,19 @@ ++ /* -*- c++ -*- */ ++-/* +++/* ++ * Copyright 2015 Free Software Foundation, Inc. 
++- * +++ * ++ * This file is part of GNU Radio ++- * +++ * ++ * GNU Radio is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 3, or (at your option) ++ * any later version. ++- * +++ * ++ * GNU Radio is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++- * +++ * ++ * You should have received a copy of the GNU General Public License ++ * along with GNU Radio; see the file COPYING. If not, write to ++ * the Free Software Foundation, Inc., 51 Franklin Street, ++@@ -29,49 +29,52 @@ ++ #define INCLUDE_VOLK_VOLK_SSE3_INTRINSICS_H_ ++ #include ++ ++-static inline __m128 ++-_mm_complexmul_ps(__m128 x, __m128 y) +++static inline __m128 _mm_complexmul_ps(__m128 x, __m128 y) ++ { ++- __m128 yl, yh, tmp1, tmp2; ++- yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr ++- yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di ++- tmp1 = _mm_mul_ps(x, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr ++- x = _mm_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br ++- tmp2 = _mm_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di ++- return _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di +++ __m128 yl, yh, tmp1, tmp2; +++ yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr +++ yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di +++ tmp1 = _mm_mul_ps(x, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr +++ x = _mm_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br +++ tmp2 = _mm_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di +++ return _mm_addsub_ps(tmp1, +++ tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di ++ } ++ ++-static inline __m128 ++-_mm_complexconjugatemul_ps(__m128 x, __m128 y) +++static inline __m128 _mm_complexconjugatemul_ps(__m128 x, __m128 y) ++ { ++- const __m128 conjugator = _mm_setr_ps(0, -0.f, 0, -0.f); ++- y = _mm_xor_ps(y, conjugator); // conjugate y ++- return _mm_complexmul_ps(x, y); +++ const __m128 conjugator = _mm_setr_ps(0, -0.f, 0, -0.f); +++ y = _mm_xor_ps(y, conjugator); // conjugate y +++ return _mm_complexmul_ps(x, y); ++ } ++ ++-static inline __m128 ++-_mm_magnitudesquared_ps_sse3(__m128 cplxValue1, __m128 cplxValue2){ ++- cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values ++- cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values ++- return _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values +++static inline __m128 _mm_magnitudesquared_ps_sse3(__m128 cplxValue1, __m128 cplxValue2) +++{ +++ cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values +++ cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values +++ return _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values ++ } ++ ++-static inline __m128 ++-_mm_magnitude_ps_sse3(__m128 cplxValue1, __m128 cplxValue2){ ++- return _mm_sqrt_ps(_mm_magnitudesquared_ps_sse3(cplxValue1, cplxValue2)); +++static inline __m128 _mm_magnitude_ps_sse3(__m128 cplxValue1, __m128 cplxValue2) +++{ +++ return _mm_sqrt_ps(_mm_magnitudesquared_ps_sse3(cplxValue1, cplxValue2)); ++ } ++ ++-static inline __m128 ++-_mm_scaled_norm_dist_ps_sse3(const __m128 symbols0, const __m128 symbols1, const __m128 points0, const __m128 points1, const __m128 scalar){ ++- /* ++- * Calculate: |y - x|^2 * SNR_lin ++- * Consider 'symbolsX' and 
'pointsX' to be complex float ++- * 'symbolsX' are 'y' and 'pointsX' are 'x' ++- */ ++- const __m128 diff0 = _mm_sub_ps(symbols0, points0); ++- const __m128 diff1 = _mm_sub_ps(symbols1, points1); ++- const __m128 norms = _mm_magnitudesquared_ps_sse3(diff0, diff1); ++- return _mm_mul_ps(norms, scalar); +++static inline __m128 _mm_scaled_norm_dist_ps_sse3(const __m128 symbols0, +++ const __m128 symbols1, +++ const __m128 points0, +++ const __m128 points1, +++ const __m128 scalar) +++{ +++ /* +++ * Calculate: |y - x|^2 * SNR_lin +++ * Consider 'symbolsX' and 'pointsX' to be complex float +++ * 'symbolsX' are 'y' and 'pointsX' are 'x' +++ */ +++ const __m128 diff0 = _mm_sub_ps(symbols0, points0); +++ const __m128 diff1 = _mm_sub_ps(symbols1, points1); +++ const __m128 norms = _mm_magnitudesquared_ps_sse3(diff0, diff1); +++ return _mm_mul_ps(norms, scalar); ++ } ++ ++ #endif /* INCLUDE_VOLK_VOLK_SSE3_INTRINSICS_H_ */ ++diff --git a/include/volk/volk_sse_intrinsics.h b/include/volk/volk_sse_intrinsics.h ++index 57318e2..24fe7c1 100644 ++--- a/include/volk/volk_sse_intrinsics.h +++++ b/include/volk/volk_sse_intrinsics.h ++@@ -1,19 +1,19 @@ ++ /* -*- c++ -*- */ ++-/* +++/* ++ * Copyright 2015 Free Software Foundation, Inc. ++- * +++ * ++ * This file is part of GNU Radio ++- * +++ * ++ * GNU Radio is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 3, or (at your option) ++ * any later version. ++- * +++ * ++ * GNU Radio is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++- * +++ * ++ * You should have received a copy of the GNU General Public License ++ * along with GNU Radio; see the file COPYING. 
If not, write to ++ * the Free Software Foundation, Inc., 51 Franklin Street, ++@@ -29,31 +29,34 @@ ++ #define INCLUDE_VOLK_VOLK_SSE_INTRINSICS_H_ ++ #include ++ ++-static inline __m128 ++-_mm_magnitudesquared_ps(__m128 cplxValue1, __m128 cplxValue2){ ++- __m128 iValue, qValue; ++- // Arrange in i1i2i3i4 format ++- iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); ++- // Arrange in q1q2q3q4 format ++- qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1)); ++- iValue = _mm_mul_ps(iValue, iValue); // Square the I values ++- qValue = _mm_mul_ps(qValue, qValue); // Square the Q Values ++- return _mm_add_ps(iValue, qValue); // Add the I2 and Q2 values +++static inline __m128 _mm_magnitudesquared_ps(__m128 cplxValue1, __m128 cplxValue2) +++{ +++ __m128 iValue, qValue; +++ // Arrange in i1i2i3i4 format +++ iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2, 0, 2, 0)); +++ // Arrange in q1q2q3q4 format +++ qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3, 1, 3, 1)); +++ iValue = _mm_mul_ps(iValue, iValue); // Square the I values +++ qValue = _mm_mul_ps(qValue, qValue); // Square the Q Values +++ return _mm_add_ps(iValue, qValue); // Add the I2 and Q2 values ++ } ++ ++-static inline __m128 ++-_mm_magnitude_ps(__m128 cplxValue1, __m128 cplxValue2){ ++- return _mm_sqrt_ps(_mm_magnitudesquared_ps(cplxValue1, cplxValue2)); +++static inline __m128 _mm_magnitude_ps(__m128 cplxValue1, __m128 cplxValue2) +++{ +++ return _mm_sqrt_ps(_mm_magnitudesquared_ps(cplxValue1, cplxValue2)); ++ } ++ ++-static inline __m128 ++-_mm_scaled_norm_dist_ps_sse(const __m128 symbols0, const __m128 symbols1, const __m128 points0, const __m128 points1, const __m128 scalar) +++static inline __m128 _mm_scaled_norm_dist_ps_sse(const __m128 symbols0, +++ const __m128 symbols1, +++ const __m128 points0, +++ const __m128 points1, +++ const __m128 scalar) ++ { ++- // calculate scalar * |x - y|^2 ++- const __m128 diff0 = _mm_sub_ps(symbols0, points0); ++- const __m128 diff1 = _mm_sub_ps(symbols1, points1); ++- const __m128 norms = _mm_magnitudesquared_ps(diff0, diff1); ++- return _mm_mul_ps(norms, scalar); +++ // calculate scalar * |x - y|^2 +++ const __m128 diff0 = _mm_sub_ps(symbols0, points0); +++ const __m128 diff1 = _mm_sub_ps(symbols1, points1); +++ const __m128 norms = _mm_magnitudesquared_ps(diff0, diff1); +++ return _mm_mul_ps(norms, scalar); ++ } ++ ++ #endif /* INCLUDE_VOLK_VOLK_SSE_INTRINSICS_H_ */ ++diff --git a/kernels/volk/volk_16i_32fc_dot_prod_32fc.h b/kernels/volk/volk_16i_32fc_dot_prod_32fc.h ++index f250340..2635649 100644 ++--- a/kernels/volk/volk_16i_32fc_dot_prod_32fc.h +++++ b/kernels/volk/volk_16i_32fc_dot_prod_32fc.h ++@@ -33,8 +33,8 @@ ++ * ++ * Dispatcher Prototype ++ * \code ++- * void volk_16i_32fc_dot_prod_32fc(lv_32fc_t* result, const short* input, const lv_32fc_t * taps, unsigned int num_points) ++- * \endcode +++ * void volk_16i_32fc_dot_prod_32fc(lv_32fc_t* result, const short* input, const lv_32fc_t +++ * * taps, unsigned int num_points) \endcode ++ * ++ * \b Inputs ++ * \li input: vector of shorts. 
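
For context, a minimal usage sketch of the volk_16i_32fc_dot_prod_32fc dispatcher documented in the hunk above. The volk_malloc()/volk_get_alignment() buffer handling and the test data are illustrative assumptions, not part of this patch; only the dispatcher prototype itself comes from the kernel header.

    /* Usage sketch: call the 16i/32fc dot-product dispatcher on aligned buffers.
     * The dispatcher selects the fastest available kernel at run time. */
    #include <volk/volk.h>
    #include <stdio.h>

    int main(void)
    {
        unsigned int num_points = 1024;
        size_t alignment = volk_get_alignment();

        short* input = (short*)volk_malloc(sizeof(short) * num_points, alignment);
        lv_32fc_t* taps = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t) * num_points, alignment);
        lv_32fc_t result;

        for (unsigned int i = 0; i < num_points; i++) {
            input[i] = (short)(i & 0xff);   /* arbitrary test data */
            taps[i] = lv_cmake(1.0f, 0.0f); /* unit taps: result is the sum of inputs */
        }

        volk_16i_32fc_dot_prod_32fc(&result, input, taps, num_points);

        printf("dot product = %f + %fi\n", lv_creal(result), lv_cimag(result));

        volk_free(input);
        volk_free(taps);
        return 0;
    }
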
++@@ -58,165 +58,178 @@ ++ #ifndef INCLUDED_volk_16i_32fc_dot_prod_32fc_H ++ #define INCLUDED_volk_16i_32fc_dot_prod_32fc_H ++ ++-#include ++ #include +++#include ++ ++ ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void volk_16i_32fc_dot_prod_32fc_generic(lv_32fc_t* result, const short* input, const lv_32fc_t * taps, unsigned int num_points) { +++static inline void volk_16i_32fc_dot_prod_32fc_generic(lv_32fc_t* result, +++ const short* input, +++ const lv_32fc_t* taps, +++ unsigned int num_points) +++{ ++ ++- static const int N_UNROLL = 4; +++ static const int N_UNROLL = 4; ++ ++- lv_32fc_t acc0 = 0; ++- lv_32fc_t acc1 = 0; ++- lv_32fc_t acc2 = 0; ++- lv_32fc_t acc3 = 0; +++ lv_32fc_t acc0 = 0; +++ lv_32fc_t acc1 = 0; +++ lv_32fc_t acc2 = 0; +++ lv_32fc_t acc3 = 0; ++ ++- unsigned i = 0; ++- unsigned n = (num_points / N_UNROLL) * N_UNROLL; +++ unsigned i = 0; +++ unsigned n = (num_points / N_UNROLL) * N_UNROLL; ++ ++- for(i = 0; i < n; i += N_UNROLL) { ++- acc0 += taps[i + 0] * (float)input[i + 0]; ++- acc1 += taps[i + 1] * (float)input[i + 1]; ++- acc2 += taps[i + 2] * (float)input[i + 2]; ++- acc3 += taps[i + 3] * (float)input[i + 3]; ++- } +++ for (i = 0; i < n; i += N_UNROLL) { +++ acc0 += taps[i + 0] * (float)input[i + 0]; +++ acc1 += taps[i + 1] * (float)input[i + 1]; +++ acc2 += taps[i + 2] * (float)input[i + 2]; +++ acc3 += taps[i + 3] * (float)input[i + 3]; +++ } ++ ++- for(; i < num_points; i++) { ++- acc0 += taps[i] * (float)input[i]; ++- } +++ for (; i < num_points; i++) { +++ acc0 += taps[i] * (float)input[i]; +++ } ++ ++- *result = acc0 + acc1 + acc2 + acc3; +++ *result = acc0 + acc1 + acc2 + acc3; ++ } ++ ++ #endif /*LV_HAVE_GENERIC*/ ++ ++ #ifdef LV_HAVE_NEON ++ #include ++-static inline void volk_16i_32fc_dot_prod_32fc_neon(lv_32fc_t* result, const short* input, const lv_32fc_t * taps, unsigned int num_points) { ++- ++- unsigned ii; ++- unsigned quarter_points = num_points / 4; ++- lv_32fc_t* tapsPtr = (lv_32fc_t*) taps; ++- short* inputPtr = (short*) input; ++- lv_32fc_t accumulator_vec[4]; ++- ++- float32x4x2_t tapsVal, accumulator_val; ++- int16x4_t input16; ++- int32x4_t input32; ++- float32x4_t input_float, prod_re, prod_im; ++- ++- accumulator_val.val[0] = vdupq_n_f32(0.0); ++- accumulator_val.val[1] = vdupq_n_f32(0.0); ++- ++- for(ii = 0; ii < quarter_points; ++ii) { ++- tapsVal = vld2q_f32((float*)tapsPtr); ++- input16 = vld1_s16(inputPtr); ++- // widen 16-bit int to 32-bit int ++- input32 = vmovl_s16(input16); ++- // convert 32-bit int to float with scale ++- input_float = vcvtq_f32_s32(input32); ++- ++- prod_re = vmulq_f32(input_float, tapsVal.val[0]); ++- prod_im = vmulq_f32(input_float, tapsVal.val[1]); ++- ++- accumulator_val.val[0] = vaddq_f32(prod_re, accumulator_val.val[0]); ++- accumulator_val.val[1] = vaddq_f32(prod_im, accumulator_val.val[1]); ++- ++- tapsPtr += 4; ++- inputPtr += 4; ++- } ++- vst2q_f32((float*)accumulator_vec, accumulator_val); ++- accumulator_vec[0] += accumulator_vec[1]; ++- accumulator_vec[2] += accumulator_vec[3]; ++- accumulator_vec[0] += accumulator_vec[2]; ++- ++- for(ii = quarter_points * 4; ii < num_points; ++ii) { ++- accumulator_vec[0] += *(tapsPtr++) * (float)(*(inputPtr++)); ++- } ++- ++- *result = accumulator_vec[0]; +++static inline void volk_16i_32fc_dot_prod_32fc_neon(lv_32fc_t* result, +++ const short* input, +++ const lv_32fc_t* taps, +++ unsigned int num_points) +++{ +++ +++ unsigned ii; +++ unsigned quarter_points = num_points / 4; +++ lv_32fc_t* tapsPtr = (lv_32fc_t*)taps; +++ short* inputPtr = (short*)input; +++ 
lv_32fc_t accumulator_vec[4]; +++ +++ float32x4x2_t tapsVal, accumulator_val; +++ int16x4_t input16; +++ int32x4_t input32; +++ float32x4_t input_float, prod_re, prod_im; +++ +++ accumulator_val.val[0] = vdupq_n_f32(0.0); +++ accumulator_val.val[1] = vdupq_n_f32(0.0); +++ +++ for (ii = 0; ii < quarter_points; ++ii) { +++ tapsVal = vld2q_f32((float*)tapsPtr); +++ input16 = vld1_s16(inputPtr); +++ // widen 16-bit int to 32-bit int +++ input32 = vmovl_s16(input16); +++ // convert 32-bit int to float with scale +++ input_float = vcvtq_f32_s32(input32); +++ +++ prod_re = vmulq_f32(input_float, tapsVal.val[0]); +++ prod_im = vmulq_f32(input_float, tapsVal.val[1]); +++ +++ accumulator_val.val[0] = vaddq_f32(prod_re, accumulator_val.val[0]); +++ accumulator_val.val[1] = vaddq_f32(prod_im, accumulator_val.val[1]); +++ +++ tapsPtr += 4; +++ inputPtr += 4; +++ } +++ vst2q_f32((float*)accumulator_vec, accumulator_val); +++ accumulator_vec[0] += accumulator_vec[1]; +++ accumulator_vec[2] += accumulator_vec[3]; +++ accumulator_vec[0] += accumulator_vec[2]; +++ +++ for (ii = quarter_points * 4; ii < num_points; ++ii) { +++ accumulator_vec[0] += *(tapsPtr++) * (float)(*(inputPtr++)); +++ } +++ +++ *result = accumulator_vec[0]; ++ } ++ ++ #endif /*LV_HAVE_NEON*/ ++ ++ #if LV_HAVE_SSE && LV_HAVE_MMX ++ ++-static inline void volk_16i_32fc_dot_prod_32fc_u_sse( lv_32fc_t* result, const short* input, const lv_32fc_t* taps, unsigned int num_points) { ++- ++- unsigned int number = 0; ++- const unsigned int sixteenthPoints = num_points / 8; ++- ++- float res[2]; ++- float *realpt = &res[0], *imagpt = &res[1]; ++- const short* aPtr = input; ++- const float* bPtr = (float*)taps; ++- ++- __m64 m0, m1; ++- __m128 f0, f1, f2, f3; ++- __m128 a0Val, a1Val, a2Val, a3Val; ++- __m128 b0Val, b1Val, b2Val, b3Val; ++- __m128 c0Val, c1Val, c2Val, c3Val; ++- ++- __m128 dotProdVal0 = _mm_setzero_ps(); ++- __m128 dotProdVal1 = _mm_setzero_ps(); ++- __m128 dotProdVal2 = _mm_setzero_ps(); ++- __m128 dotProdVal3 = _mm_setzero_ps(); ++- ++- for(;number < sixteenthPoints; number++){ ++- ++- m0 = _mm_set_pi16(*(aPtr+3), *(aPtr+2), *(aPtr+1), *(aPtr+0)); ++- m1 = _mm_set_pi16(*(aPtr+7), *(aPtr+6), *(aPtr+5), *(aPtr+4)); ++- f0 = _mm_cvtpi16_ps(m0); ++- f1 = _mm_cvtpi16_ps(m0); ++- f2 = _mm_cvtpi16_ps(m1); ++- f3 = _mm_cvtpi16_ps(m1); ++- ++- a0Val = _mm_unpacklo_ps(f0, f1); ++- a1Val = _mm_unpackhi_ps(f0, f1); ++- a2Val = _mm_unpacklo_ps(f2, f3); ++- a3Val = _mm_unpackhi_ps(f2, f3); ++- ++- b0Val = _mm_loadu_ps(bPtr); ++- b1Val = _mm_loadu_ps(bPtr+4); ++- b2Val = _mm_loadu_ps(bPtr+8); ++- b3Val = _mm_loadu_ps(bPtr+12); ++- ++- c0Val = _mm_mul_ps(a0Val, b0Val); ++- c1Val = _mm_mul_ps(a1Val, b1Val); ++- c2Val = _mm_mul_ps(a2Val, b2Val); ++- c3Val = _mm_mul_ps(a3Val, b3Val); ++- ++- dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0); ++- dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1); ++- dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2); ++- dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3); ++- ++- aPtr += 8; ++- bPtr += 16; ++- } ++- ++- dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1); ++- dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2); ++- dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3); ++- ++- __VOLK_ATTR_ALIGNED(16) float dotProductVector[4]; ++- ++- _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector ++- ++- *realpt = dotProductVector[0]; ++- *imagpt = dotProductVector[1]; ++- *realpt += dotProductVector[2]; ++- *imagpt += dotProductVector[3]; ++- ++- number = sixteenthPoints*8; ++- for(;number 
< num_points; number++){ ++- *realpt += ((*aPtr) * (*bPtr++)); ++- *imagpt += ((*aPtr++) * (*bPtr++)); ++- } ++- ++- *result = *(lv_32fc_t*)(&res[0]); +++static inline void volk_16i_32fc_dot_prod_32fc_u_sse(lv_32fc_t* result, +++ const short* input, +++ const lv_32fc_t* taps, +++ unsigned int num_points) +++{ +++ +++ unsigned int number = 0; +++ const unsigned int sixteenthPoints = num_points / 8; +++ +++ float res[2]; +++ float *realpt = &res[0], *imagpt = &res[1]; +++ const short* aPtr = input; +++ const float* bPtr = (float*)taps; +++ +++ __m64 m0, m1; +++ __m128 f0, f1, f2, f3; +++ __m128 a0Val, a1Val, a2Val, a3Val; +++ __m128 b0Val, b1Val, b2Val, b3Val; +++ __m128 c0Val, c1Val, c2Val, c3Val; +++ +++ __m128 dotProdVal0 = _mm_setzero_ps(); +++ __m128 dotProdVal1 = _mm_setzero_ps(); +++ __m128 dotProdVal2 = _mm_setzero_ps(); +++ __m128 dotProdVal3 = _mm_setzero_ps(); +++ +++ for (; number < sixteenthPoints; number++) { +++ +++ m0 = _mm_set_pi16(*(aPtr + 3), *(aPtr + 2), *(aPtr + 1), *(aPtr + 0)); +++ m1 = _mm_set_pi16(*(aPtr + 7), *(aPtr + 6), *(aPtr + 5), *(aPtr + 4)); +++ f0 = _mm_cvtpi16_ps(m0); +++ f1 = _mm_cvtpi16_ps(m0); +++ f2 = _mm_cvtpi16_ps(m1); +++ f3 = _mm_cvtpi16_ps(m1); +++ +++ a0Val = _mm_unpacklo_ps(f0, f1); +++ a1Val = _mm_unpackhi_ps(f0, f1); +++ a2Val = _mm_unpacklo_ps(f2, f3); +++ a3Val = _mm_unpackhi_ps(f2, f3); +++ +++ b0Val = _mm_loadu_ps(bPtr); +++ b1Val = _mm_loadu_ps(bPtr + 4); +++ b2Val = _mm_loadu_ps(bPtr + 8); +++ b3Val = _mm_loadu_ps(bPtr + 12); +++ +++ c0Val = _mm_mul_ps(a0Val, b0Val); +++ c1Val = _mm_mul_ps(a1Val, b1Val); +++ c2Val = _mm_mul_ps(a2Val, b2Val); +++ c3Val = _mm_mul_ps(a3Val, b3Val); +++ +++ dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0); +++ dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1); +++ dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2); +++ dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3); +++ +++ aPtr += 8; +++ bPtr += 16; +++ } +++ +++ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1); +++ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2); +++ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3); +++ +++ __VOLK_ATTR_ALIGNED(16) float dotProductVector[4]; +++ +++ _mm_store_ps(dotProductVector, +++ dotProdVal0); // Store the results back into the dot product vector +++ +++ *realpt = dotProductVector[0]; +++ *imagpt = dotProductVector[1]; +++ *realpt += dotProductVector[2]; +++ *imagpt += dotProductVector[3]; +++ +++ number = sixteenthPoints * 8; +++ for (; number < num_points; number++) { +++ *realpt += ((*aPtr) * (*bPtr++)); +++ *imagpt += ((*aPtr++) * (*bPtr++)); +++ } +++ +++ *result = *(lv_32fc_t*)(&res[0]); ++ } ++ ++ #endif /*LV_HAVE_SSE && LV_HAVE_MMX*/ ++@@ -224,85 +237,90 @@ static inline void volk_16i_32fc_dot_prod_32fc_u_sse( lv_32fc_t* result, const ++ ++ #if LV_HAVE_AVX2 && LV_HAVE_FMA ++ ++-static inline void volk_16i_32fc_dot_prod_32fc_u_avx2_fma( lv_32fc_t* result, const short* input, const lv_32fc_t* taps, unsigned int num_points) { ++- ++- unsigned int number = 0; ++- const unsigned int sixteenthPoints = num_points / 16; ++- ++- float res[2]; ++- float *realpt = &res[0], *imagpt = &res[1]; ++- const short* aPtr = input; ++- const float* bPtr = (float*)taps; ++- ++- __m128i m0, m1; ++- __m256i f0, f1; ++- __m256 g0, g1, h0, h1, h2, h3; ++- __m256 a0Val, a1Val, a2Val, a3Val; ++- __m256 b0Val, b1Val, b2Val, b3Val; ++- ++- __m256 dotProdVal0 = _mm256_setzero_ps(); ++- __m256 dotProdVal1 = _mm256_setzero_ps(); ++- __m256 dotProdVal2 = _mm256_setzero_ps(); ++- __m256 dotProdVal3 = _mm256_setzero_ps(); ++- ++- for(;number < 
sixteenthPoints; number++){ ++- ++- m0 = _mm_loadu_si128((__m128i const*) aPtr); ++- m1 = _mm_loadu_si128((__m128i const*)(aPtr+8)); ++- ++- f0 = _mm256_cvtepi16_epi32(m0); ++- g0 = _mm256_cvtepi32_ps(f0); ++- f1 = _mm256_cvtepi16_epi32(m1); ++- g1 = _mm256_cvtepi32_ps(f1); ++- ++- h0 = _mm256_unpacklo_ps(g0, g0); ++- h1 = _mm256_unpackhi_ps(g0, g0); ++- h2 = _mm256_unpacklo_ps(g1, g1); ++- h3 = _mm256_unpackhi_ps(g1, g1); ++- ++- a0Val = _mm256_permute2f128_ps(h0, h1, 0x20); ++- a1Val = _mm256_permute2f128_ps(h0, h1, 0x31); ++- a2Val = _mm256_permute2f128_ps(h2, h3, 0x20); ++- a3Val = _mm256_permute2f128_ps(h2, h3, 0x31); ++- ++- b0Val = _mm256_loadu_ps(bPtr); ++- b1Val = _mm256_loadu_ps(bPtr+8); ++- b2Val = _mm256_loadu_ps(bPtr+16); ++- b3Val = _mm256_loadu_ps(bPtr+24); ++- ++- dotProdVal0 = _mm256_fmadd_ps(a0Val,b0Val,dotProdVal0); ++- dotProdVal1 = _mm256_fmadd_ps(a1Val,b1Val,dotProdVal1); ++- dotProdVal2 = _mm256_fmadd_ps(a2Val,b2Val,dotProdVal2); ++- dotProdVal3 = _mm256_fmadd_ps(a3Val,b3Val,dotProdVal3); ++- ++- aPtr += 16; ++- bPtr += 32; ++- } ++- ++- dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1); ++- dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2); ++- dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3); ++- ++- __VOLK_ATTR_ALIGNED(32) float dotProductVector[8]; ++- ++- _mm256_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector ++- ++- *realpt = dotProductVector[0]; ++- *imagpt = dotProductVector[1]; ++- *realpt += dotProductVector[2]; ++- *imagpt += dotProductVector[3]; ++- *realpt += dotProductVector[4]; ++- *imagpt += dotProductVector[5]; ++- *realpt += dotProductVector[6]; ++- *imagpt += dotProductVector[7]; ++- ++- number = sixteenthPoints*16; ++- for(;number < num_points; number++){ ++- *realpt += ((*aPtr) * (*bPtr++)); ++- *imagpt += ((*aPtr++) * (*bPtr++)); ++- } ++- ++- *result = *(lv_32fc_t*)(&res[0]); +++static inline void volk_16i_32fc_dot_prod_32fc_u_avx2_fma(lv_32fc_t* result, +++ const short* input, +++ const lv_32fc_t* taps, +++ unsigned int num_points) +++{ +++ +++ unsigned int number = 0; +++ const unsigned int sixteenthPoints = num_points / 16; +++ +++ float res[2]; +++ float *realpt = &res[0], *imagpt = &res[1]; +++ const short* aPtr = input; +++ const float* bPtr = (float*)taps; +++ +++ __m128i m0, m1; +++ __m256i f0, f1; +++ __m256 g0, g1, h0, h1, h2, h3; +++ __m256 a0Val, a1Val, a2Val, a3Val; +++ __m256 b0Val, b1Val, b2Val, b3Val; +++ +++ __m256 dotProdVal0 = _mm256_setzero_ps(); +++ __m256 dotProdVal1 = _mm256_setzero_ps(); +++ __m256 dotProdVal2 = _mm256_setzero_ps(); +++ __m256 dotProdVal3 = _mm256_setzero_ps(); +++ +++ for (; number < sixteenthPoints; number++) { +++ +++ m0 = _mm_loadu_si128((__m128i const*)aPtr); +++ m1 = _mm_loadu_si128((__m128i const*)(aPtr + 8)); +++ +++ f0 = _mm256_cvtepi16_epi32(m0); +++ g0 = _mm256_cvtepi32_ps(f0); +++ f1 = _mm256_cvtepi16_epi32(m1); +++ g1 = _mm256_cvtepi32_ps(f1); +++ +++ h0 = _mm256_unpacklo_ps(g0, g0); +++ h1 = _mm256_unpackhi_ps(g0, g0); +++ h2 = _mm256_unpacklo_ps(g1, g1); +++ h3 = _mm256_unpackhi_ps(g1, g1); +++ +++ a0Val = _mm256_permute2f128_ps(h0, h1, 0x20); +++ a1Val = _mm256_permute2f128_ps(h0, h1, 0x31); +++ a2Val = _mm256_permute2f128_ps(h2, h3, 0x20); +++ a3Val = _mm256_permute2f128_ps(h2, h3, 0x31); +++ +++ b0Val = _mm256_loadu_ps(bPtr); +++ b1Val = _mm256_loadu_ps(bPtr + 8); +++ b2Val = _mm256_loadu_ps(bPtr + 16); +++ b3Val = _mm256_loadu_ps(bPtr + 24); +++ +++ dotProdVal0 = _mm256_fmadd_ps(a0Val, b0Val, dotProdVal0); +++ dotProdVal1 = 
_mm256_fmadd_ps(a1Val, b1Val, dotProdVal1); +++ dotProdVal2 = _mm256_fmadd_ps(a2Val, b2Val, dotProdVal2); +++ dotProdVal3 = _mm256_fmadd_ps(a3Val, b3Val, dotProdVal3); +++ +++ aPtr += 16; +++ bPtr += 32; +++ } +++ +++ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1); +++ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2); +++ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3); +++ +++ __VOLK_ATTR_ALIGNED(32) float dotProductVector[8]; +++ +++ _mm256_store_ps(dotProductVector, +++ dotProdVal0); // Store the results back into the dot product vector +++ +++ *realpt = dotProductVector[0]; +++ *imagpt = dotProductVector[1]; +++ *realpt += dotProductVector[2]; +++ *imagpt += dotProductVector[3]; +++ *realpt += dotProductVector[4]; +++ *imagpt += dotProductVector[5]; +++ *realpt += dotProductVector[6]; +++ *imagpt += dotProductVector[7]; +++ +++ number = sixteenthPoints * 16; +++ for (; number < num_points; number++) { +++ *realpt += ((*aPtr) * (*bPtr++)); +++ *imagpt += ((*aPtr++) * (*bPtr++)); +++ } +++ +++ *result = *(lv_32fc_t*)(&res[0]); ++ } ++ ++ #endif /*LV_HAVE_AVX2 && lV_HAVE_FMA*/ ++@@ -310,91 +328,96 @@ static inline void volk_16i_32fc_dot_prod_32fc_u_avx2_fma( lv_32fc_t* result, co ++ ++ #ifdef LV_HAVE_AVX2 ++ ++-static inline void volk_16i_32fc_dot_prod_32fc_u_avx2( lv_32fc_t* result, const short* input, const lv_32fc_t* taps, unsigned int num_points) { ++- ++- unsigned int number = 0; ++- const unsigned int sixteenthPoints = num_points / 16; ++- ++- float res[2]; ++- float *realpt = &res[0], *imagpt = &res[1]; ++- const short* aPtr = input; ++- const float* bPtr = (float*)taps; ++- ++- __m128i m0, m1; ++- __m256i f0, f1; ++- __m256 g0, g1, h0, h1, h2, h3; ++- __m256 a0Val, a1Val, a2Val, a3Val; ++- __m256 b0Val, b1Val, b2Val, b3Val; ++- __m256 c0Val, c1Val, c2Val, c3Val; ++- ++- __m256 dotProdVal0 = _mm256_setzero_ps(); ++- __m256 dotProdVal1 = _mm256_setzero_ps(); ++- __m256 dotProdVal2 = _mm256_setzero_ps(); ++- __m256 dotProdVal3 = _mm256_setzero_ps(); ++- ++- for(;number < sixteenthPoints; number++){ ++- ++- m0 = _mm_loadu_si128((__m128i const*) aPtr); ++- m1 = _mm_loadu_si128((__m128i const*)(aPtr+8)); ++- ++- f0 = _mm256_cvtepi16_epi32(m0); ++- g0 = _mm256_cvtepi32_ps(f0); ++- f1 = _mm256_cvtepi16_epi32(m1); ++- g1 = _mm256_cvtepi32_ps(f1); ++- ++- h0 = _mm256_unpacklo_ps(g0, g0); ++- h1 = _mm256_unpackhi_ps(g0, g0); ++- h2 = _mm256_unpacklo_ps(g1, g1); ++- h3 = _mm256_unpackhi_ps(g1, g1); ++- ++- a0Val = _mm256_permute2f128_ps(h0, h1, 0x20); ++- a1Val = _mm256_permute2f128_ps(h0, h1, 0x31); ++- a2Val = _mm256_permute2f128_ps(h2, h3, 0x20); ++- a3Val = _mm256_permute2f128_ps(h2, h3, 0x31); ++- ++- b0Val = _mm256_loadu_ps(bPtr); ++- b1Val = _mm256_loadu_ps(bPtr+8); ++- b2Val = _mm256_loadu_ps(bPtr+16); ++- b3Val = _mm256_loadu_ps(bPtr+24); ++- ++- c0Val = _mm256_mul_ps(a0Val, b0Val); ++- c1Val = _mm256_mul_ps(a1Val, b1Val); ++- c2Val = _mm256_mul_ps(a2Val, b2Val); ++- c3Val = _mm256_mul_ps(a3Val, b3Val); ++- ++- dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0); ++- dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1); ++- dotProdVal2 = _mm256_add_ps(c2Val, dotProdVal2); ++- dotProdVal3 = _mm256_add_ps(c3Val, dotProdVal3); ++- ++- aPtr += 16; ++- bPtr += 32; ++- } ++- ++- dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1); ++- dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2); ++- dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3); ++- ++- __VOLK_ATTR_ALIGNED(32) float dotProductVector[8]; ++- ++- _mm256_store_ps(dotProductVector,dotProdVal0); // Store the results 
back into the dot product vector ++- ++- *realpt = dotProductVector[0]; ++- *imagpt = dotProductVector[1]; ++- *realpt += dotProductVector[2]; ++- *imagpt += dotProductVector[3]; ++- *realpt += dotProductVector[4]; ++- *imagpt += dotProductVector[5]; ++- *realpt += dotProductVector[6]; ++- *imagpt += dotProductVector[7]; ++- ++- number = sixteenthPoints*16; ++- for(;number < num_points; number++){ ++- *realpt += ((*aPtr) * (*bPtr++)); ++- *imagpt += ((*aPtr++) * (*bPtr++)); ++- } ++- ++- *result = *(lv_32fc_t*)(&res[0]); +++static inline void volk_16i_32fc_dot_prod_32fc_u_avx2(lv_32fc_t* result, +++ const short* input, +++ const lv_32fc_t* taps, +++ unsigned int num_points) +++{ +++ +++ unsigned int number = 0; +++ const unsigned int sixteenthPoints = num_points / 16; +++ +++ float res[2]; +++ float *realpt = &res[0], *imagpt = &res[1]; +++ const short* aPtr = input; +++ const float* bPtr = (float*)taps; +++ +++ __m128i m0, m1; +++ __m256i f0, f1; +++ __m256 g0, g1, h0, h1, h2, h3; +++ __m256 a0Val, a1Val, a2Val, a3Val; +++ __m256 b0Val, b1Val, b2Val, b3Val; +++ __m256 c0Val, c1Val, c2Val, c3Val; +++ +++ __m256 dotProdVal0 = _mm256_setzero_ps(); +++ __m256 dotProdVal1 = _mm256_setzero_ps(); +++ __m256 dotProdVal2 = _mm256_setzero_ps(); +++ __m256 dotProdVal3 = _mm256_setzero_ps(); +++ +++ for (; number < sixteenthPoints; number++) { +++ +++ m0 = _mm_loadu_si128((__m128i const*)aPtr); +++ m1 = _mm_loadu_si128((__m128i const*)(aPtr + 8)); +++ +++ f0 = _mm256_cvtepi16_epi32(m0); +++ g0 = _mm256_cvtepi32_ps(f0); +++ f1 = _mm256_cvtepi16_epi32(m1); +++ g1 = _mm256_cvtepi32_ps(f1); +++ +++ h0 = _mm256_unpacklo_ps(g0, g0); +++ h1 = _mm256_unpackhi_ps(g0, g0); +++ h2 = _mm256_unpacklo_ps(g1, g1); +++ h3 = _mm256_unpackhi_ps(g1, g1); +++ +++ a0Val = _mm256_permute2f128_ps(h0, h1, 0x20); +++ a1Val = _mm256_permute2f128_ps(h0, h1, 0x31); +++ a2Val = _mm256_permute2f128_ps(h2, h3, 0x20); +++ a3Val = _mm256_permute2f128_ps(h2, h3, 0x31); +++ +++ b0Val = _mm256_loadu_ps(bPtr); +++ b1Val = _mm256_loadu_ps(bPtr + 8); +++ b2Val = _mm256_loadu_ps(bPtr + 16); +++ b3Val = _mm256_loadu_ps(bPtr + 24); +++ +++ c0Val = _mm256_mul_ps(a0Val, b0Val); +++ c1Val = _mm256_mul_ps(a1Val, b1Val); +++ c2Val = _mm256_mul_ps(a2Val, b2Val); +++ c3Val = _mm256_mul_ps(a3Val, b3Val); +++ +++ dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0); +++ dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1); +++ dotProdVal2 = _mm256_add_ps(c2Val, dotProdVal2); +++ dotProdVal3 = _mm256_add_ps(c3Val, dotProdVal3); +++ +++ aPtr += 16; +++ bPtr += 32; +++ } +++ +++ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1); +++ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2); +++ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3); +++ +++ __VOLK_ATTR_ALIGNED(32) float dotProductVector[8]; +++ +++ _mm256_store_ps(dotProductVector, +++ dotProdVal0); // Store the results back into the dot product vector +++ +++ *realpt = dotProductVector[0]; +++ *imagpt = dotProductVector[1]; +++ *realpt += dotProductVector[2]; +++ *imagpt += dotProductVector[3]; +++ *realpt += dotProductVector[4]; +++ *imagpt += dotProductVector[5]; +++ *realpt += dotProductVector[6]; +++ *imagpt += dotProductVector[7]; +++ +++ number = sixteenthPoints * 16; +++ for (; number < num_points; number++) { +++ *realpt += ((*aPtr) * (*bPtr++)); +++ *imagpt += ((*aPtr++) * (*bPtr++)); +++ } +++ +++ *result = *(lv_32fc_t*)(&res[0]); ++ } ++ ++ #endif /*LV_HAVE_AVX2*/ ++@@ -403,171 +426,181 @@ static inline void volk_16i_32fc_dot_prod_32fc_u_avx2( lv_32fc_t* result, const ++ #if 
LV_HAVE_SSE && LV_HAVE_MMX ++ ++ ++-static inline void volk_16i_32fc_dot_prod_32fc_a_sse( lv_32fc_t* result, const short* input, const lv_32fc_t* taps, unsigned int num_points) { ++- ++- unsigned int number = 0; ++- const unsigned int sixteenthPoints = num_points / 8; ++- ++- float res[2]; ++- float *realpt = &res[0], *imagpt = &res[1]; ++- const short* aPtr = input; ++- const float* bPtr = (float*)taps; ++- ++- __m64 m0, m1; ++- __m128 f0, f1, f2, f3; ++- __m128 a0Val, a1Val, a2Val, a3Val; ++- __m128 b0Val, b1Val, b2Val, b3Val; ++- __m128 c0Val, c1Val, c2Val, c3Val; ++- ++- __m128 dotProdVal0 = _mm_setzero_ps(); ++- __m128 dotProdVal1 = _mm_setzero_ps(); ++- __m128 dotProdVal2 = _mm_setzero_ps(); ++- __m128 dotProdVal3 = _mm_setzero_ps(); ++- ++- for(;number < sixteenthPoints; number++){ ++- ++- m0 = _mm_set_pi16(*(aPtr+3), *(aPtr+2), *(aPtr+1), *(aPtr+0)); ++- m1 = _mm_set_pi16(*(aPtr+7), *(aPtr+6), *(aPtr+5), *(aPtr+4)); ++- f0 = _mm_cvtpi16_ps(m0); ++- f1 = _mm_cvtpi16_ps(m0); ++- f2 = _mm_cvtpi16_ps(m1); ++- f3 = _mm_cvtpi16_ps(m1); ++- ++- a0Val = _mm_unpacklo_ps(f0, f1); ++- a1Val = _mm_unpackhi_ps(f0, f1); ++- a2Val = _mm_unpacklo_ps(f2, f3); ++- a3Val = _mm_unpackhi_ps(f2, f3); ++- ++- b0Val = _mm_load_ps(bPtr); ++- b1Val = _mm_load_ps(bPtr+4); ++- b2Val = _mm_load_ps(bPtr+8); ++- b3Val = _mm_load_ps(bPtr+12); ++- ++- c0Val = _mm_mul_ps(a0Val, b0Val); ++- c1Val = _mm_mul_ps(a1Val, b1Val); ++- c2Val = _mm_mul_ps(a2Val, b2Val); ++- c3Val = _mm_mul_ps(a3Val, b3Val); ++- ++- dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0); ++- dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1); ++- dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2); ++- dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3); ++- ++- aPtr += 8; ++- bPtr += 16; ++- } ++- ++- dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1); ++- dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2); ++- dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3); ++- ++- __VOLK_ATTR_ALIGNED(16) float dotProductVector[4]; ++- ++- _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector ++- ++- *realpt = dotProductVector[0]; ++- *imagpt = dotProductVector[1]; ++- *realpt += dotProductVector[2]; ++- *imagpt += dotProductVector[3]; ++- ++- number = sixteenthPoints*8; ++- for(;number < num_points; number++){ ++- *realpt += ((*aPtr) * (*bPtr++)); ++- *imagpt += ((*aPtr++) * (*bPtr++)); ++- } ++- ++- *result = *(lv_32fc_t*)(&res[0]); +++static inline void volk_16i_32fc_dot_prod_32fc_a_sse(lv_32fc_t* result, +++ const short* input, +++ const lv_32fc_t* taps, +++ unsigned int num_points) +++{ +++ +++ unsigned int number = 0; +++ const unsigned int sixteenthPoints = num_points / 8; +++ +++ float res[2]; +++ float *realpt = &res[0], *imagpt = &res[1]; +++ const short* aPtr = input; +++ const float* bPtr = (float*)taps; +++ +++ __m64 m0, m1; +++ __m128 f0, f1, f2, f3; +++ __m128 a0Val, a1Val, a2Val, a3Val; +++ __m128 b0Val, b1Val, b2Val, b3Val; +++ __m128 c0Val, c1Val, c2Val, c3Val; +++ +++ __m128 dotProdVal0 = _mm_setzero_ps(); +++ __m128 dotProdVal1 = _mm_setzero_ps(); +++ __m128 dotProdVal2 = _mm_setzero_ps(); +++ __m128 dotProdVal3 = _mm_setzero_ps(); +++ +++ for (; number < sixteenthPoints; number++) { +++ +++ m0 = _mm_set_pi16(*(aPtr + 3), *(aPtr + 2), *(aPtr + 1), *(aPtr + 0)); +++ m1 = _mm_set_pi16(*(aPtr + 7), *(aPtr + 6), *(aPtr + 5), *(aPtr + 4)); +++ f0 = _mm_cvtpi16_ps(m0); +++ f1 = _mm_cvtpi16_ps(m0); +++ f2 = _mm_cvtpi16_ps(m1); +++ f3 = _mm_cvtpi16_ps(m1); +++ +++ a0Val = _mm_unpacklo_ps(f0, f1); +++ a1Val = _mm_unpackhi_ps(f0, 
f1); +++ a2Val = _mm_unpacklo_ps(f2, f3); +++ a3Val = _mm_unpackhi_ps(f2, f3); +++ +++ b0Val = _mm_load_ps(bPtr); +++ b1Val = _mm_load_ps(bPtr + 4); +++ b2Val = _mm_load_ps(bPtr + 8); +++ b3Val = _mm_load_ps(bPtr + 12); +++ +++ c0Val = _mm_mul_ps(a0Val, b0Val); +++ c1Val = _mm_mul_ps(a1Val, b1Val); +++ c2Val = _mm_mul_ps(a2Val, b2Val); +++ c3Val = _mm_mul_ps(a3Val, b3Val); +++ +++ dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0); +++ dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1); +++ dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2); +++ dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3); +++ +++ aPtr += 8; +++ bPtr += 16; +++ } +++ +++ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1); +++ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2); +++ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3); +++ +++ __VOLK_ATTR_ALIGNED(16) float dotProductVector[4]; +++ +++ _mm_store_ps(dotProductVector, +++ dotProdVal0); // Store the results back into the dot product vector +++ +++ *realpt = dotProductVector[0]; +++ *imagpt = dotProductVector[1]; +++ *realpt += dotProductVector[2]; +++ *imagpt += dotProductVector[3]; +++ +++ number = sixteenthPoints * 8; +++ for (; number < num_points; number++) { +++ *realpt += ((*aPtr) * (*bPtr++)); +++ *imagpt += ((*aPtr++) * (*bPtr++)); +++ } +++ +++ *result = *(lv_32fc_t*)(&res[0]); ++ } ++ ++ #endif /*LV_HAVE_SSE && LV_HAVE_MMX*/ ++ ++ #ifdef LV_HAVE_AVX2 ++ ++-static inline void volk_16i_32fc_dot_prod_32fc_a_avx2( lv_32fc_t* result, const short* input, const lv_32fc_t* taps, unsigned int num_points) { ++- ++- unsigned int number = 0; ++- const unsigned int sixteenthPoints = num_points / 16; ++- ++- float res[2]; ++- float *realpt = &res[0], *imagpt = &res[1]; ++- const short* aPtr = input; ++- const float* bPtr = (float*)taps; ++- ++- __m128i m0, m1; ++- __m256i f0, f1; ++- __m256 g0, g1, h0, h1, h2, h3; ++- __m256 a0Val, a1Val, a2Val, a3Val; ++- __m256 b0Val, b1Val, b2Val, b3Val; ++- __m256 c0Val, c1Val, c2Val, c3Val; ++- ++- __m256 dotProdVal0 = _mm256_setzero_ps(); ++- __m256 dotProdVal1 = _mm256_setzero_ps(); ++- __m256 dotProdVal2 = _mm256_setzero_ps(); ++- __m256 dotProdVal3 = _mm256_setzero_ps(); ++- ++- for(;number < sixteenthPoints; number++){ ++- ++- m0 = _mm_load_si128((__m128i const*) aPtr); ++- m1 = _mm_load_si128((__m128i const*)(aPtr+8)); ++- ++- f0 = _mm256_cvtepi16_epi32(m0); ++- g0 = _mm256_cvtepi32_ps(f0); ++- f1 = _mm256_cvtepi16_epi32(m1); ++- g1 = _mm256_cvtepi32_ps(f1); ++- ++- h0 = _mm256_unpacklo_ps(g0, g0); ++- h1 = _mm256_unpackhi_ps(g0, g0); ++- h2 = _mm256_unpacklo_ps(g1, g1); ++- h3 = _mm256_unpackhi_ps(g1, g1); ++- ++- a0Val = _mm256_permute2f128_ps(h0, h1, 0x20); ++- a1Val = _mm256_permute2f128_ps(h0, h1, 0x31); ++- a2Val = _mm256_permute2f128_ps(h2, h3, 0x20); ++- a3Val = _mm256_permute2f128_ps(h2, h3, 0x31); ++- ++- b0Val = _mm256_load_ps(bPtr); ++- b1Val = _mm256_load_ps(bPtr+8); ++- b2Val = _mm256_load_ps(bPtr+16); ++- b3Val = _mm256_load_ps(bPtr+24); ++- ++- c0Val = _mm256_mul_ps(a0Val, b0Val); ++- c1Val = _mm256_mul_ps(a1Val, b1Val); ++- c2Val = _mm256_mul_ps(a2Val, b2Val); ++- c3Val = _mm256_mul_ps(a3Val, b3Val); ++- ++- dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0); ++- dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1); ++- dotProdVal2 = _mm256_add_ps(c2Val, dotProdVal2); ++- dotProdVal3 = _mm256_add_ps(c3Val, dotProdVal3); ++- ++- aPtr += 16; ++- bPtr += 32; ++- } ++- ++- dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1); ++- dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2); ++- dotProdVal0 = _mm256_add_ps(dotProdVal0, 
dotProdVal3); ++- ++- __VOLK_ATTR_ALIGNED(32) float dotProductVector[8]; ++- ++- _mm256_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector ++- ++- *realpt = dotProductVector[0]; ++- *imagpt = dotProductVector[1]; ++- *realpt += dotProductVector[2]; ++- *imagpt += dotProductVector[3]; ++- *realpt += dotProductVector[4]; ++- *imagpt += dotProductVector[5]; ++- *realpt += dotProductVector[6]; ++- *imagpt += dotProductVector[7]; ++- ++- number = sixteenthPoints*16; ++- for(;number < num_points; number++){ ++- *realpt += ((*aPtr) * (*bPtr++)); ++- *imagpt += ((*aPtr++) * (*bPtr++)); ++- } ++- ++- *result = *(lv_32fc_t*)(&res[0]); +++static inline void volk_16i_32fc_dot_prod_32fc_a_avx2(lv_32fc_t* result, +++ const short* input, +++ const lv_32fc_t* taps, +++ unsigned int num_points) +++{ +++ +++ unsigned int number = 0; +++ const unsigned int sixteenthPoints = num_points / 16; +++ +++ float res[2]; +++ float *realpt = &res[0], *imagpt = &res[1]; +++ const short* aPtr = input; +++ const float* bPtr = (float*)taps; +++ +++ __m128i m0, m1; +++ __m256i f0, f1; +++ __m256 g0, g1, h0, h1, h2, h3; +++ __m256 a0Val, a1Val, a2Val, a3Val; +++ __m256 b0Val, b1Val, b2Val, b3Val; +++ __m256 c0Val, c1Val, c2Val, c3Val; +++ +++ __m256 dotProdVal0 = _mm256_setzero_ps(); +++ __m256 dotProdVal1 = _mm256_setzero_ps(); +++ __m256 dotProdVal2 = _mm256_setzero_ps(); +++ __m256 dotProdVal3 = _mm256_setzero_ps(); +++ +++ for (; number < sixteenthPoints; number++) { +++ +++ m0 = _mm_load_si128((__m128i const*)aPtr); +++ m1 = _mm_load_si128((__m128i const*)(aPtr + 8)); +++ +++ f0 = _mm256_cvtepi16_epi32(m0); +++ g0 = _mm256_cvtepi32_ps(f0); +++ f1 = _mm256_cvtepi16_epi32(m1); +++ g1 = _mm256_cvtepi32_ps(f1); +++ +++ h0 = _mm256_unpacklo_ps(g0, g0); +++ h1 = _mm256_unpackhi_ps(g0, g0); +++ h2 = _mm256_unpacklo_ps(g1, g1); +++ h3 = _mm256_unpackhi_ps(g1, g1); +++ +++ a0Val = _mm256_permute2f128_ps(h0, h1, 0x20); +++ a1Val = _mm256_permute2f128_ps(h0, h1, 0x31); +++ a2Val = _mm256_permute2f128_ps(h2, h3, 0x20); +++ a3Val = _mm256_permute2f128_ps(h2, h3, 0x31); +++ +++ b0Val = _mm256_load_ps(bPtr); +++ b1Val = _mm256_load_ps(bPtr + 8); +++ b2Val = _mm256_load_ps(bPtr + 16); +++ b3Val = _mm256_load_ps(bPtr + 24); +++ +++ c0Val = _mm256_mul_ps(a0Val, b0Val); +++ c1Val = _mm256_mul_ps(a1Val, b1Val); +++ c2Val = _mm256_mul_ps(a2Val, b2Val); +++ c3Val = _mm256_mul_ps(a3Val, b3Val); +++ +++ dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0); +++ dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1); +++ dotProdVal2 = _mm256_add_ps(c2Val, dotProdVal2); +++ dotProdVal3 = _mm256_add_ps(c3Val, dotProdVal3); +++ +++ aPtr += 16; +++ bPtr += 32; +++ } +++ +++ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1); +++ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2); +++ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3); +++ +++ __VOLK_ATTR_ALIGNED(32) float dotProductVector[8]; +++ +++ _mm256_store_ps(dotProductVector, +++ dotProdVal0); // Store the results back into the dot product vector +++ +++ *realpt = dotProductVector[0]; +++ *imagpt = dotProductVector[1]; +++ *realpt += dotProductVector[2]; +++ *imagpt += dotProductVector[3]; +++ *realpt += dotProductVector[4]; +++ *imagpt += dotProductVector[5]; +++ *realpt += dotProductVector[6]; +++ *imagpt += dotProductVector[7]; +++ +++ number = sixteenthPoints * 16; +++ for (; number < num_points; number++) { +++ *realpt += ((*aPtr) * (*bPtr++)); +++ *imagpt += ((*aPtr++) * (*bPtr++)); +++ } +++ +++ *result = *(lv_32fc_t*)(&res[0]); ++ } ++ ++ ++@@ 
-575,85 +608,90 @@ static inline void volk_16i_32fc_dot_prod_32fc_a_avx2( lv_32fc_t* result, const ++ ++ #if LV_HAVE_AVX2 && LV_HAVE_FMA ++ ++-static inline void volk_16i_32fc_dot_prod_32fc_a_avx2_fma( lv_32fc_t* result, const short* input, const lv_32fc_t* taps, unsigned int num_points) { ++- ++- unsigned int number = 0; ++- const unsigned int sixteenthPoints = num_points / 16; ++- ++- float res[2]; ++- float *realpt = &res[0], *imagpt = &res[1]; ++- const short* aPtr = input; ++- const float* bPtr = (float*)taps; ++- ++- __m128i m0, m1; ++- __m256i f0, f1; ++- __m256 g0, g1, h0, h1, h2, h3; ++- __m256 a0Val, a1Val, a2Val, a3Val; ++- __m256 b0Val, b1Val, b2Val, b3Val; ++- ++- __m256 dotProdVal0 = _mm256_setzero_ps(); ++- __m256 dotProdVal1 = _mm256_setzero_ps(); ++- __m256 dotProdVal2 = _mm256_setzero_ps(); ++- __m256 dotProdVal3 = _mm256_setzero_ps(); ++- ++- for(;number < sixteenthPoints; number++){ ++- ++- m0 = _mm_load_si128((__m128i const*) aPtr); ++- m1 = _mm_load_si128((__m128i const*)(aPtr+8)); ++- ++- f0 = _mm256_cvtepi16_epi32(m0); ++- g0 = _mm256_cvtepi32_ps(f0); ++- f1 = _mm256_cvtepi16_epi32(m1); ++- g1 = _mm256_cvtepi32_ps(f1); ++- ++- h0 = _mm256_unpacklo_ps(g0, g0); ++- h1 = _mm256_unpackhi_ps(g0, g0); ++- h2 = _mm256_unpacklo_ps(g1, g1); ++- h3 = _mm256_unpackhi_ps(g1, g1); ++- ++- a0Val = _mm256_permute2f128_ps(h0, h1, 0x20); ++- a1Val = _mm256_permute2f128_ps(h0, h1, 0x31); ++- a2Val = _mm256_permute2f128_ps(h2, h3, 0x20); ++- a3Val = _mm256_permute2f128_ps(h2, h3, 0x31); ++- ++- b0Val = _mm256_load_ps(bPtr); ++- b1Val = _mm256_load_ps(bPtr+8); ++- b2Val = _mm256_load_ps(bPtr+16); ++- b3Val = _mm256_load_ps(bPtr+24); ++- ++- dotProdVal0 = _mm256_fmadd_ps(a0Val,b0Val,dotProdVal0); ++- dotProdVal1 = _mm256_fmadd_ps(a1Val,b1Val,dotProdVal1); ++- dotProdVal2 = _mm256_fmadd_ps(a2Val,b2Val,dotProdVal2); ++- dotProdVal3 = _mm256_fmadd_ps(a3Val,b3Val,dotProdVal3); ++- ++- aPtr += 16; ++- bPtr += 32; ++- } ++- ++- dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1); ++- dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2); ++- dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3); ++- ++- __VOLK_ATTR_ALIGNED(32) float dotProductVector[8]; ++- ++- _mm256_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector ++- ++- *realpt = dotProductVector[0]; ++- *imagpt = dotProductVector[1]; ++- *realpt += dotProductVector[2]; ++- *imagpt += dotProductVector[3]; ++- *realpt += dotProductVector[4]; ++- *imagpt += dotProductVector[5]; ++- *realpt += dotProductVector[6]; ++- *imagpt += dotProductVector[7]; ++- ++- number = sixteenthPoints*16; ++- for(;number < num_points; number++){ ++- *realpt += ((*aPtr) * (*bPtr++)); ++- *imagpt += ((*aPtr++) * (*bPtr++)); ++- } ++- ++- *result = *(lv_32fc_t*)(&res[0]); +++static inline void volk_16i_32fc_dot_prod_32fc_a_avx2_fma(lv_32fc_t* result, +++ const short* input, +++ const lv_32fc_t* taps, +++ unsigned int num_points) +++{ +++ +++ unsigned int number = 0; +++ const unsigned int sixteenthPoints = num_points / 16; +++ +++ float res[2]; +++ float *realpt = &res[0], *imagpt = &res[1]; +++ const short* aPtr = input; +++ const float* bPtr = (float*)taps; +++ +++ __m128i m0, m1; +++ __m256i f0, f1; +++ __m256 g0, g1, h0, h1, h2, h3; +++ __m256 a0Val, a1Val, a2Val, a3Val; +++ __m256 b0Val, b1Val, b2Val, b3Val; +++ +++ __m256 dotProdVal0 = _mm256_setzero_ps(); +++ __m256 dotProdVal1 = _mm256_setzero_ps(); +++ __m256 dotProdVal2 = _mm256_setzero_ps(); +++ __m256 dotProdVal3 = _mm256_setzero_ps(); +++ +++ for (; 
number < sixteenthPoints; number++) { +++ +++ m0 = _mm_load_si128((__m128i const*)aPtr); +++ m1 = _mm_load_si128((__m128i const*)(aPtr + 8)); +++ +++ f0 = _mm256_cvtepi16_epi32(m0); +++ g0 = _mm256_cvtepi32_ps(f0); +++ f1 = _mm256_cvtepi16_epi32(m1); +++ g1 = _mm256_cvtepi32_ps(f1); +++ +++ h0 = _mm256_unpacklo_ps(g0, g0); +++ h1 = _mm256_unpackhi_ps(g0, g0); +++ h2 = _mm256_unpacklo_ps(g1, g1); +++ h3 = _mm256_unpackhi_ps(g1, g1); +++ +++ a0Val = _mm256_permute2f128_ps(h0, h1, 0x20); +++ a1Val = _mm256_permute2f128_ps(h0, h1, 0x31); +++ a2Val = _mm256_permute2f128_ps(h2, h3, 0x20); +++ a3Val = _mm256_permute2f128_ps(h2, h3, 0x31); +++ +++ b0Val = _mm256_load_ps(bPtr); +++ b1Val = _mm256_load_ps(bPtr + 8); +++ b2Val = _mm256_load_ps(bPtr + 16); +++ b3Val = _mm256_load_ps(bPtr + 24); +++ +++ dotProdVal0 = _mm256_fmadd_ps(a0Val, b0Val, dotProdVal0); +++ dotProdVal1 = _mm256_fmadd_ps(a1Val, b1Val, dotProdVal1); +++ dotProdVal2 = _mm256_fmadd_ps(a2Val, b2Val, dotProdVal2); +++ dotProdVal3 = _mm256_fmadd_ps(a3Val, b3Val, dotProdVal3); +++ +++ aPtr += 16; +++ bPtr += 32; +++ } +++ +++ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1); +++ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2); +++ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3); +++ +++ __VOLK_ATTR_ALIGNED(32) float dotProductVector[8]; +++ +++ _mm256_store_ps(dotProductVector, +++ dotProdVal0); // Store the results back into the dot product vector +++ +++ *realpt = dotProductVector[0]; +++ *imagpt = dotProductVector[1]; +++ *realpt += dotProductVector[2]; +++ *imagpt += dotProductVector[3]; +++ *realpt += dotProductVector[4]; +++ *imagpt += dotProductVector[5]; +++ *realpt += dotProductVector[6]; +++ *imagpt += dotProductVector[7]; +++ +++ number = sixteenthPoints * 16; +++ for (; number < num_points; number++) { +++ *realpt += ((*aPtr) * (*bPtr++)); +++ *imagpt += ((*aPtr++) * (*bPtr++)); +++ } +++ +++ *result = *(lv_32fc_t*)(&res[0]); ++ } ++ ++ ++diff --git a/kernels/volk/volk_16i_branch_4_state_8.h b/kernels/volk/volk_16i_branch_4_state_8.h ++index 31b66cc..4d00b6b 100644 ++--- a/kernels/volk/volk_16i_branch_4_state_8.h +++++ b/kernels/volk/volk_16i_branch_4_state_8.h ++@@ -29,8 +29,8 @@ ++ * ++ * Dispatcher Prototype ++ * \code ++- * void volk_16i_branch_4_state_8(short* target, short* src0, char** permuters, short* cntl2, short* cntl3, short* scalars) ++- * \endcode +++ * void volk_16i_branch_4_state_8(short* target, short* src0, char** permuters, short* +++ * cntl2, short* cntl3, short* scalars) \endcode ++ * ++ * \b Inputs ++ * \li src0: ++@@ -61,155 +61,154 @@ ++ ++ #ifdef LV_HAVE_SSSE3 ++ ++-#include ++ #include ++ #include +++#include ++ ++-static inline void ++-volk_16i_branch_4_state_8_a_ssse3(short* target, short* src0, char** permuters, short* cntl2, short* cntl3, short* scalars) +++static inline void volk_16i_branch_4_state_8_a_ssse3(short* target, +++ short* src0, +++ char** permuters, +++ short* cntl2, +++ short* cntl3, +++ short* scalars) ++ { ++- __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11; ++- __m128i *p_target, *p_src0, *p_cntl2, *p_cntl3, *p_scalars; +++ __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11; +++ __m128i *p_target, *p_src0, *p_cntl2, *p_cntl3, *p_scalars; ++ ++- p_target = (__m128i*)target; ++- p_src0 = (__m128i*)src0; ++- p_cntl2 = (__m128i*)cntl2; ++- p_cntl3 = (__m128i*)cntl3; ++- p_scalars = (__m128i*)scalars; +++ p_target = (__m128i*)target; +++ p_src0 = (__m128i*)src0; +++ p_cntl2 = (__m128i*)cntl2; +++ p_cntl3 
= (__m128i*)cntl3; +++ p_scalars = (__m128i*)scalars; ++ ++- xmm0 = _mm_load_si128(p_scalars); +++ xmm0 = _mm_load_si128(p_scalars); ++ ++- xmm1 = _mm_shufflelo_epi16(xmm0, 0); ++- xmm2 = _mm_shufflelo_epi16(xmm0, 0x55); ++- xmm3 = _mm_shufflelo_epi16(xmm0, 0xaa); ++- xmm4 = _mm_shufflelo_epi16(xmm0, 0xff); +++ xmm1 = _mm_shufflelo_epi16(xmm0, 0); +++ xmm2 = _mm_shufflelo_epi16(xmm0, 0x55); +++ xmm3 = _mm_shufflelo_epi16(xmm0, 0xaa); +++ xmm4 = _mm_shufflelo_epi16(xmm0, 0xff); ++ ++- xmm1 = _mm_shuffle_epi32(xmm1, 0x00); ++- xmm2 = _mm_shuffle_epi32(xmm2, 0x00); ++- xmm3 = _mm_shuffle_epi32(xmm3, 0x00); ++- xmm4 = _mm_shuffle_epi32(xmm4, 0x00); +++ xmm1 = _mm_shuffle_epi32(xmm1, 0x00); +++ xmm2 = _mm_shuffle_epi32(xmm2, 0x00); +++ xmm3 = _mm_shuffle_epi32(xmm3, 0x00); +++ xmm4 = _mm_shuffle_epi32(xmm4, 0x00); ++ ++- xmm0 = _mm_load_si128((__m128i*)permuters[0]); ++- xmm6 = _mm_load_si128((__m128i*)permuters[1]); ++- xmm8 = _mm_load_si128((__m128i*)permuters[2]); ++- xmm10 = _mm_load_si128((__m128i*)permuters[3]); +++ xmm0 = _mm_load_si128((__m128i*)permuters[0]); +++ xmm6 = _mm_load_si128((__m128i*)permuters[1]); +++ xmm8 = _mm_load_si128((__m128i*)permuters[2]); +++ xmm10 = _mm_load_si128((__m128i*)permuters[3]); ++ ++- xmm5 = _mm_load_si128(p_src0); ++- xmm0 = _mm_shuffle_epi8(xmm5, xmm0); ++- xmm6 = _mm_shuffle_epi8(xmm5, xmm6); ++- xmm8 = _mm_shuffle_epi8(xmm5, xmm8); ++- xmm10 = _mm_shuffle_epi8(xmm5, xmm10); +++ xmm5 = _mm_load_si128(p_src0); +++ xmm0 = _mm_shuffle_epi8(xmm5, xmm0); +++ xmm6 = _mm_shuffle_epi8(xmm5, xmm6); +++ xmm8 = _mm_shuffle_epi8(xmm5, xmm8); +++ xmm10 = _mm_shuffle_epi8(xmm5, xmm10); ++ ++- xmm5 = _mm_add_epi16(xmm1, xmm2); +++ xmm5 = _mm_add_epi16(xmm1, xmm2); ++ ++- xmm6 = _mm_add_epi16(xmm2, xmm6); ++- xmm8 = _mm_add_epi16(xmm1, xmm8); +++ xmm6 = _mm_add_epi16(xmm2, xmm6); +++ xmm8 = _mm_add_epi16(xmm1, xmm8); ++ ++- xmm7 = _mm_load_si128(p_cntl2); ++- xmm9 = _mm_load_si128(p_cntl3); +++ xmm7 = _mm_load_si128(p_cntl2); +++ xmm9 = _mm_load_si128(p_cntl3); ++ ++- xmm0 = _mm_add_epi16(xmm5, xmm0); +++ xmm0 = _mm_add_epi16(xmm5, xmm0); ++ ++- xmm7 = _mm_and_si128(xmm7, xmm3); ++- xmm9 = _mm_and_si128(xmm9, xmm4); +++ xmm7 = _mm_and_si128(xmm7, xmm3); +++ xmm9 = _mm_and_si128(xmm9, xmm4); ++ ++- xmm5 = _mm_load_si128(&p_cntl2[1]); ++- xmm11 = _mm_load_si128(&p_cntl3[1]); +++ xmm5 = _mm_load_si128(&p_cntl2[1]); +++ xmm11 = _mm_load_si128(&p_cntl3[1]); ++ ++- xmm7 = _mm_add_epi16(xmm7, xmm9); +++ xmm7 = _mm_add_epi16(xmm7, xmm9); ++ ++- xmm5 = _mm_and_si128(xmm5, xmm3); ++- xmm11 = _mm_and_si128(xmm11, xmm4); +++ xmm5 = _mm_and_si128(xmm5, xmm3); +++ xmm11 = _mm_and_si128(xmm11, xmm4); ++ ++- xmm0 = _mm_add_epi16(xmm0, xmm7); +++ xmm0 = _mm_add_epi16(xmm0, xmm7); ++ ++ ++- xmm7 = _mm_load_si128(&p_cntl2[2]); ++- xmm9 = _mm_load_si128(&p_cntl3[2]); +++ xmm7 = _mm_load_si128(&p_cntl2[2]); +++ xmm9 = _mm_load_si128(&p_cntl3[2]); ++ ++- xmm5 = _mm_add_epi16(xmm5, xmm11); +++ xmm5 = _mm_add_epi16(xmm5, xmm11); ++ ++- xmm7 = _mm_and_si128(xmm7, xmm3); ++- xmm9 = _mm_and_si128(xmm9, xmm4); +++ xmm7 = _mm_and_si128(xmm7, xmm3); +++ xmm9 = _mm_and_si128(xmm9, xmm4); ++ ++- xmm6 = _mm_add_epi16(xmm6, xmm5); +++ xmm6 = _mm_add_epi16(xmm6, xmm5); ++ ++ ++- xmm5 = _mm_load_si128(&p_cntl2[3]); ++- xmm11 = _mm_load_si128(&p_cntl3[3]); +++ xmm5 = _mm_load_si128(&p_cntl2[3]); +++ xmm11 = _mm_load_si128(&p_cntl3[3]); ++ ++- xmm7 = _mm_add_epi16(xmm7, xmm9); +++ xmm7 = _mm_add_epi16(xmm7, xmm9); ++ ++- xmm5 = _mm_and_si128(xmm5, xmm3); ++- xmm11 = _mm_and_si128(xmm11, xmm4); +++ xmm5 
= _mm_and_si128(xmm5, xmm3); +++ xmm11 = _mm_and_si128(xmm11, xmm4); ++ ++- xmm8 = _mm_add_epi16(xmm8, xmm7); +++ xmm8 = _mm_add_epi16(xmm8, xmm7); ++ ++- xmm5 = _mm_add_epi16(xmm5, xmm11); +++ xmm5 = _mm_add_epi16(xmm5, xmm11); ++ ++- _mm_store_si128(p_target, xmm0); ++- _mm_store_si128(&p_target[1], xmm6); +++ _mm_store_si128(p_target, xmm0); +++ _mm_store_si128(&p_target[1], xmm6); ++ ++- xmm10 = _mm_add_epi16(xmm5, xmm10); +++ xmm10 = _mm_add_epi16(xmm5, xmm10); ++ ++- _mm_store_si128(&p_target[2], xmm8); +++ _mm_store_si128(&p_target[2], xmm8); ++ ++- _mm_store_si128(&p_target[3], xmm10); +++ _mm_store_si128(&p_target[3], xmm10); ++ } ++ ++ ++ #endif /*LV_HAVE_SSEs*/ ++ ++ #ifdef LV_HAVE_GENERIC ++-static inline void ++-volk_16i_branch_4_state_8_generic(short* target, short* src0, char** permuters, short* cntl2, short* cntl3, short* scalars) +++static inline void volk_16i_branch_4_state_8_generic(short* target, +++ short* src0, +++ char** permuters, +++ short* cntl2, +++ short* cntl3, +++ short* scalars) ++ { ++- int i = 0; ++- ++- int bound = 4; ++- ++- for(; i < bound; ++i) { ++- target[i* 8] = src0[((char)permuters[i][0])/2] ++- + ((i + 1)%2 * scalars[0]) ++- + (((i >> 1)^1) * scalars[1]) ++- + (cntl2[i * 8] & scalars[2]) ++- + (cntl3[i * 8] & scalars[3]); ++- target[i* 8 + 1] = src0[((char)permuters[i][1 * 2])/2] ++- + ((i + 1)%2 * scalars[0]) ++- + (((i >> 1)^1) * scalars[1]) ++- + (cntl2[i * 8 + 1] & scalars[2]) ++- + (cntl3[i * 8 + 1] & scalars[3]); ++- target[i* 8 + 2] = src0[((char)permuters[i][2 * 2])/2] ++- + ((i + 1)%2 * scalars[0]) ++- + (((i >> 1)^1) * scalars[1]) ++- + (cntl2[i * 8 + 2] & scalars[2]) ++- + (cntl3[i * 8 + 2] & scalars[3]); ++- target[i* 8 + 3] = src0[((char)permuters[i][3 * 2])/2] ++- + ((i + 1)%2 * scalars[0]) ++- + (((i >> 1)^1) * scalars[1]) ++- + (cntl2[i * 8 + 3] & scalars[2]) ++- + (cntl3[i * 8 + 3] & scalars[3]); ++- target[i* 8 + 4] = src0[((char)permuters[i][4 * 2])/2] ++- + ((i + 1)%2 * scalars[0]) ++- + (((i >> 1)^1) * scalars[1]) ++- + (cntl2[i * 8 + 4] & scalars[2]) ++- + (cntl3[i * 8 + 4] & scalars[3]); ++- target[i* 8 + 5] = src0[((char)permuters[i][5 * 2])/2] ++- + ((i + 1)%2 * scalars[0]) ++- + (((i >> 1)^1) * scalars[1]) ++- + (cntl2[i * 8 + 5] & scalars[2]) ++- + (cntl3[i * 8 + 5] & scalars[3]); ++- target[i* 8 + 6] = src0[((char)permuters[i][6 * 2])/2] ++- + ((i + 1)%2 * scalars[0]) ++- + (((i >> 1)^1) * scalars[1]) ++- + (cntl2[i * 8 + 6] & scalars[2]) ++- + (cntl3[i * 8 + 6] & scalars[3]); ++- target[i* 8 + 7] = src0[((char)permuters[i][7 * 2])/2] ++- + ((i + 1)%2 * scalars[0]) ++- + (((i >> 1)^1) * scalars[1]) ++- + (cntl2[i * 8 + 7] & scalars[2]) ++- + (cntl3[i * 8 + 7] & scalars[3]); ++- } +++ int i = 0; +++ +++ int bound = 4; +++ +++ for (; i < bound; ++i) { +++ target[i * 8] = src0[((char)permuters[i][0]) / 2] + ((i + 1) % 2 * scalars[0]) + +++ (((i >> 1) ^ 1) * scalars[1]) + (cntl2[i * 8] & scalars[2]) + +++ (cntl3[i * 8] & scalars[3]); +++ target[i * 8 + 1] = src0[((char)permuters[i][1 * 2]) / 2] + +++ ((i + 1) % 2 * scalars[0]) + (((i >> 1) ^ 1) * scalars[1]) + +++ (cntl2[i * 8 + 1] & scalars[2]) + +++ (cntl3[i * 8 + 1] & scalars[3]); +++ target[i * 8 + 2] = src0[((char)permuters[i][2 * 2]) / 2] + +++ ((i + 1) % 2 * scalars[0]) + (((i >> 1) ^ 1) * scalars[1]) + +++ (cntl2[i * 8 + 2] & scalars[2]) + +++ (cntl3[i * 8 + 2] & scalars[3]); +++ target[i * 8 + 3] = src0[((char)permuters[i][3 * 2]) / 2] + +++ ((i + 1) % 2 * scalars[0]) + (((i >> 1) ^ 1) * scalars[1]) + +++ (cntl2[i * 8 + 3] & scalars[2]) + +++ (cntl3[i * 8 + 3] & 
scalars[3]); +++ target[i * 8 + 4] = src0[((char)permuters[i][4 * 2]) / 2] + +++ ((i + 1) % 2 * scalars[0]) + (((i >> 1) ^ 1) * scalars[1]) + +++ (cntl2[i * 8 + 4] & scalars[2]) + +++ (cntl3[i * 8 + 4] & scalars[3]); +++ target[i * 8 + 5] = src0[((char)permuters[i][5 * 2]) / 2] + +++ ((i + 1) % 2 * scalars[0]) + (((i >> 1) ^ 1) * scalars[1]) + +++ (cntl2[i * 8 + 5] & scalars[2]) + +++ (cntl3[i * 8 + 5] & scalars[3]); +++ target[i * 8 + 6] = src0[((char)permuters[i][6 * 2]) / 2] + +++ ((i + 1) % 2 * scalars[0]) + (((i >> 1) ^ 1) * scalars[1]) + +++ (cntl2[i * 8 + 6] & scalars[2]) + +++ (cntl3[i * 8 + 6] & scalars[3]); +++ target[i * 8 + 7] = src0[((char)permuters[i][7 * 2]) / 2] + +++ ((i + 1) % 2 * scalars[0]) + (((i >> 1) ^ 1) * scalars[1]) + +++ (cntl2[i * 8 + 7] & scalars[2]) + +++ (cntl3[i * 8 + 7] & scalars[3]); +++ } ++ } ++ ++ #endif /*LV_HAVE_GENERIC*/ ++diff --git a/kernels/volk/volk_16i_convert_8i.h b/kernels/volk/volk_16i_convert_8i.h ++index e2f953b..f09515d 100644 ++--- a/kernels/volk/volk_16i_convert_8i.h +++++ b/kernels/volk/volk_16i_convert_8i.h ++@@ -29,8 +29,8 @@ ++ * ++ * Dispatcher Prototype ++ * \code ++- * void volk_16i_convert_8i(int8_t* outputVector, const int16_t* inputVector, unsigned int num_points) ++- * \endcode +++ * void volk_16i_convert_8i(int8_t* outputVector, const int16_t* inputVector, unsigned int +++ * num_points) \endcode ++ * ++ * \b Inputs ++ * \li inputVector: The input vector of 16-bit shorts. ++@@ -59,39 +59,42 @@ ++ #ifdef LV_HAVE_AVX2 ++ #include ++ ++-static inline void ++-volk_16i_convert_8i_u_avx2(int8_t* outputVector, const int16_t* inputVector, unsigned int num_points) +++static inline void volk_16i_convert_8i_u_avx2(int8_t* outputVector, +++ const int16_t* inputVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int thirtysecondPoints = num_points / 32; +++ unsigned int number = 0; +++ const unsigned int thirtysecondPoints = num_points / 32; ++ ++- int8_t* outputVectorPtr = outputVector; ++- int16_t* inputPtr = (int16_t*)inputVector; ++- __m256i inputVal1; ++- __m256i inputVal2; ++- __m256i ret; +++ int8_t* outputVectorPtr = outputVector; +++ int16_t* inputPtr = (int16_t*)inputVector; +++ __m256i inputVal1; +++ __m256i inputVal2; +++ __m256i ret; ++ ++- for(;number < thirtysecondPoints; number++){ +++ for (; number < thirtysecondPoints; number++) { ++ ++- // Load the 16 values ++- inputVal1 = _mm256_loadu_si256((__m256i*)inputPtr); inputPtr += 16; ++- inputVal2 = _mm256_loadu_si256((__m256i*)inputPtr); inputPtr += 16; +++ // Load the 16 values +++ inputVal1 = _mm256_loadu_si256((__m256i*)inputPtr); +++ inputPtr += 16; +++ inputVal2 = _mm256_loadu_si256((__m256i*)inputPtr); +++ inputPtr += 16; ++ ++- inputVal1 = _mm256_srai_epi16(inputVal1, 8); ++- inputVal2 = _mm256_srai_epi16(inputVal2, 8); +++ inputVal1 = _mm256_srai_epi16(inputVal1, 8); +++ inputVal2 = _mm256_srai_epi16(inputVal2, 8); ++ ++- ret = _mm256_packs_epi16(inputVal1, inputVal2); ++- ret = _mm256_permute4x64_epi64(ret, 0b11011000); +++ ret = _mm256_packs_epi16(inputVal1, inputVal2); +++ ret = _mm256_permute4x64_epi64(ret, 0b11011000); ++ ++- _mm256_storeu_si256((__m256i*)outputVectorPtr, ret); +++ _mm256_storeu_si256((__m256i*)outputVectorPtr, ret); ++ ++- outputVectorPtr += 32; ++- } +++ outputVectorPtr += 32; +++ } ++ ++- number = thirtysecondPoints * 32; ++- for(; number < num_points; number++){ ++- outputVector[number] =(int8_t)(inputVector[number] >> 8); ++- } +++ number = thirtysecondPoints * 32; +++ for (; number < num_points; 
number++) { +++ outputVector[number] = (int8_t)(inputVector[number] >> 8); +++ } ++ } ++ #endif /* LV_HAVE_AVX2 */ ++ ++@@ -99,60 +102,62 @@ volk_16i_convert_8i_u_avx2(int8_t* outputVector, const int16_t* inputVector, uns ++ #ifdef LV_HAVE_SSE2 ++ #include ++ ++-static inline void ++-volk_16i_convert_8i_u_sse2(int8_t* outputVector, const int16_t* inputVector, unsigned int num_points) +++static inline void volk_16i_convert_8i_u_sse2(int8_t* outputVector, +++ const int16_t* inputVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int sixteenthPoints = num_points / 16; +++ unsigned int number = 0; +++ const unsigned int sixteenthPoints = num_points / 16; ++ ++- int8_t* outputVectorPtr = outputVector; ++- int16_t* inputPtr = (int16_t*)inputVector; ++- __m128i inputVal1; ++- __m128i inputVal2; ++- __m128i ret; +++ int8_t* outputVectorPtr = outputVector; +++ int16_t* inputPtr = (int16_t*)inputVector; +++ __m128i inputVal1; +++ __m128i inputVal2; +++ __m128i ret; ++ ++- for(;number < sixteenthPoints; number++){ +++ for (; number < sixteenthPoints; number++) { ++ ++- // Load the 16 values ++- inputVal1 = _mm_loadu_si128((__m128i*)inputPtr); inputPtr += 8; ++- inputVal2 = _mm_loadu_si128((__m128i*)inputPtr); inputPtr += 8; +++ // Load the 16 values +++ inputVal1 = _mm_loadu_si128((__m128i*)inputPtr); +++ inputPtr += 8; +++ inputVal2 = _mm_loadu_si128((__m128i*)inputPtr); +++ inputPtr += 8; ++ ++- inputVal1 = _mm_srai_epi16(inputVal1, 8); ++- inputVal2 = _mm_srai_epi16(inputVal2, 8); +++ inputVal1 = _mm_srai_epi16(inputVal1, 8); +++ inputVal2 = _mm_srai_epi16(inputVal2, 8); ++ ++- ret = _mm_packs_epi16(inputVal1, inputVal2); +++ ret = _mm_packs_epi16(inputVal1, inputVal2); ++ ++- _mm_storeu_si128((__m128i*)outputVectorPtr, ret); +++ _mm_storeu_si128((__m128i*)outputVectorPtr, ret); ++ ++- outputVectorPtr += 16; ++- } +++ outputVectorPtr += 16; +++ } ++ ++- number = sixteenthPoints * 16; ++- for(; number < num_points; number++){ ++- outputVector[number] =(int8_t)(inputVector[number] >> 8); ++- } +++ number = sixteenthPoints * 16; +++ for (; number < num_points; number++) { +++ outputVector[number] = (int8_t)(inputVector[number] >> 8); +++ } ++ } ++ #endif /* LV_HAVE_SSE2 */ ++ ++ ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void ++-volk_16i_convert_8i_generic(int8_t* outputVector, const int16_t* inputVector, unsigned int num_points) +++static inline void volk_16i_convert_8i_generic(int8_t* outputVector, +++ const int16_t* inputVector, +++ unsigned int num_points) ++ { ++- int8_t* outputVectorPtr = outputVector; ++- const int16_t* inputVectorPtr = inputVector; ++- unsigned int number = 0; +++ int8_t* outputVectorPtr = outputVector; +++ const int16_t* inputVectorPtr = inputVector; +++ unsigned int number = 0; ++ ++- for(number = 0; number < num_points; number++){ ++- *outputVectorPtr++ = ((int8_t)(*inputVectorPtr++ >> 8)); ++- } +++ for (number = 0; number < num_points; number++) { +++ *outputVectorPtr++ = ((int8_t)(*inputVectorPtr++ >> 8)); +++ } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++ ++- ++- ++ #endif /* INCLUDED_volk_16i_convert_8i_u_H */ ++ #ifndef INCLUDED_volk_16i_convert_8i_a_H ++ #define INCLUDED_volk_16i_convert_8i_a_H ++@@ -163,39 +168,42 @@ volk_16i_convert_8i_generic(int8_t* outputVector, const int16_t* inputVector, un ++ #ifdef LV_HAVE_AVX2 ++ #include ++ ++-static inline void ++-volk_16i_convert_8i_a_avx2(int8_t* outputVector, const int16_t* inputVector, unsigned int num_points) +++static inline void volk_16i_convert_8i_a_avx2(int8_t* 
outputVector, +++ const int16_t* inputVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int thirtysecondPoints = num_points / 32; +++ unsigned int number = 0; +++ const unsigned int thirtysecondPoints = num_points / 32; ++ ++- int8_t* outputVectorPtr = outputVector; ++- int16_t* inputPtr = (int16_t*)inputVector; ++- __m256i inputVal1; ++- __m256i inputVal2; ++- __m256i ret; +++ int8_t* outputVectorPtr = outputVector; +++ int16_t* inputPtr = (int16_t*)inputVector; +++ __m256i inputVal1; +++ __m256i inputVal2; +++ __m256i ret; ++ ++- for(;number < thirtysecondPoints; number++){ +++ for (; number < thirtysecondPoints; number++) { ++ ++- // Load the 16 values ++- inputVal1 = _mm256_load_si256((__m256i*)inputPtr); inputPtr += 16; ++- inputVal2 = _mm256_load_si256((__m256i*)inputPtr); inputPtr += 16; +++ // Load the 16 values +++ inputVal1 = _mm256_load_si256((__m256i*)inputPtr); +++ inputPtr += 16; +++ inputVal2 = _mm256_load_si256((__m256i*)inputPtr); +++ inputPtr += 16; ++ ++- inputVal1 = _mm256_srai_epi16(inputVal1, 8); ++- inputVal2 = _mm256_srai_epi16(inputVal2, 8); +++ inputVal1 = _mm256_srai_epi16(inputVal1, 8); +++ inputVal2 = _mm256_srai_epi16(inputVal2, 8); ++ ++- ret = _mm256_packs_epi16(inputVal1, inputVal2); ++- ret = _mm256_permute4x64_epi64(ret, 0b11011000); +++ ret = _mm256_packs_epi16(inputVal1, inputVal2); +++ ret = _mm256_permute4x64_epi64(ret, 0b11011000); ++ ++- _mm256_store_si256((__m256i*)outputVectorPtr, ret); +++ _mm256_store_si256((__m256i*)outputVectorPtr, ret); ++ ++- outputVectorPtr += 32; ++- } +++ outputVectorPtr += 32; +++ } ++ ++- number = thirtysecondPoints * 32; ++- for(; number < num_points; number++){ ++- outputVector[number] =(int8_t)(inputVector[number] >> 8); ++- } +++ number = thirtysecondPoints * 32; +++ for (; number < num_points; number++) { +++ outputVector[number] = (int8_t)(inputVector[number] >> 8); +++ } ++ } ++ #endif /* LV_HAVE_AVX2 */ ++ ++@@ -203,38 +211,41 @@ volk_16i_convert_8i_a_avx2(int8_t* outputVector, const int16_t* inputVector, uns ++ #ifdef LV_HAVE_SSE2 ++ #include ++ ++-static inline void ++-volk_16i_convert_8i_a_sse2(int8_t* outputVector, const int16_t* inputVector, unsigned int num_points) +++static inline void volk_16i_convert_8i_a_sse2(int8_t* outputVector, +++ const int16_t* inputVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int sixteenthPoints = num_points / 16; +++ unsigned int number = 0; +++ const unsigned int sixteenthPoints = num_points / 16; ++ ++- int8_t* outputVectorPtr = outputVector; ++- int16_t* inputPtr = (int16_t*)inputVector; ++- __m128i inputVal1; ++- __m128i inputVal2; ++- __m128i ret; +++ int8_t* outputVectorPtr = outputVector; +++ int16_t* inputPtr = (int16_t*)inputVector; +++ __m128i inputVal1; +++ __m128i inputVal2; +++ __m128i ret; ++ ++- for(;number < sixteenthPoints; number++){ +++ for (; number < sixteenthPoints; number++) { ++ ++- // Load the 16 values ++- inputVal1 = _mm_load_si128((__m128i*)inputPtr); inputPtr += 8; ++- inputVal2 = _mm_load_si128((__m128i*)inputPtr); inputPtr += 8; +++ // Load the 16 values +++ inputVal1 = _mm_load_si128((__m128i*)inputPtr); +++ inputPtr += 8; +++ inputVal2 = _mm_load_si128((__m128i*)inputPtr); +++ inputPtr += 8; ++ ++- inputVal1 = _mm_srai_epi16(inputVal1, 8); ++- inputVal2 = _mm_srai_epi16(inputVal2, 8); +++ inputVal1 = _mm_srai_epi16(inputVal1, 8); +++ inputVal2 = _mm_srai_epi16(inputVal2, 8); ++ ++- ret = _mm_packs_epi16(inputVal1, inputVal2); +++ ret = _mm_packs_epi16(inputVal1, 
inputVal2); ++ ++- _mm_store_si128((__m128i*)outputVectorPtr, ret); +++ _mm_store_si128((__m128i*)outputVectorPtr, ret); ++ ++- outputVectorPtr += 16; ++- } +++ outputVectorPtr += 16; +++ } ++ ++- number = sixteenthPoints * 16; ++- for(; number < num_points; number++){ ++- outputVector[number] =(int8_t)(inputVector[number] >> 8); ++- } +++ number = sixteenthPoints * 16; +++ for (; number < num_points; number++) { +++ outputVector[number] = (int8_t)(inputVector[number] >> 8); +++ } ++ } ++ #endif /* LV_HAVE_SSE2 */ ++ ++@@ -242,53 +253,55 @@ volk_16i_convert_8i_a_sse2(int8_t* outputVector, const int16_t* inputVector, uns ++ #ifdef LV_HAVE_NEON ++ #include ++ ++-static inline void ++-volk_16i_convert_8i_neon(int8_t* outputVector, const int16_t* inputVector, unsigned int num_points) +++static inline void volk_16i_convert_8i_neon(int8_t* outputVector, +++ const int16_t* inputVector, +++ unsigned int num_points) ++ { ++- int8_t* outputVectorPtr = outputVector; ++- const int16_t* inputVectorPtr = inputVector; ++- unsigned int number = 0; ++- unsigned int sixteenth_points = num_points / 16; ++- ++- int16x8_t inputVal0; ++- int16x8_t inputVal1; ++- int8x8_t outputVal0; ++- int8x8_t outputVal1; ++- int8x16_t outputVal; ++- ++- for(number = 0; number < sixteenth_points; number++){ ++- // load two input vectors ++- inputVal0 = vld1q_s16(inputVectorPtr); ++- inputVal1 = vld1q_s16(inputVectorPtr+8); ++- // shift right ++- outputVal0 = vshrn_n_s16(inputVal0, 8); ++- outputVal1 = vshrn_n_s16(inputVal1, 8); ++- // squash two vectors and write output ++- outputVal = vcombine_s8(outputVal0, outputVal1); ++- vst1q_s8(outputVectorPtr, outputVal); ++- inputVectorPtr += 16; ++- outputVectorPtr += 16; ++- } ++- ++- for(number = sixteenth_points * 16; number < num_points; number++){ ++- *outputVectorPtr++ = ((int8_t)(*inputVectorPtr++ >> 8)); ++- } +++ int8_t* outputVectorPtr = outputVector; +++ const int16_t* inputVectorPtr = inputVector; +++ unsigned int number = 0; +++ unsigned int sixteenth_points = num_points / 16; +++ +++ int16x8_t inputVal0; +++ int16x8_t inputVal1; +++ int8x8_t outputVal0; +++ int8x8_t outputVal1; +++ int8x16_t outputVal; +++ +++ for (number = 0; number < sixteenth_points; number++) { +++ // load two input vectors +++ inputVal0 = vld1q_s16(inputVectorPtr); +++ inputVal1 = vld1q_s16(inputVectorPtr + 8); +++ // shift right +++ outputVal0 = vshrn_n_s16(inputVal0, 8); +++ outputVal1 = vshrn_n_s16(inputVal1, 8); +++ // squash two vectors and write output +++ outputVal = vcombine_s8(outputVal0, outputVal1); +++ vst1q_s8(outputVectorPtr, outputVal); +++ inputVectorPtr += 16; +++ outputVectorPtr += 16; +++ } +++ +++ for (number = sixteenth_points * 16; number < num_points; number++) { +++ *outputVectorPtr++ = ((int8_t)(*inputVectorPtr++ >> 8)); +++ } ++ } ++ #endif /* LV_HAVE_NEON */ ++ ++ ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void ++-volk_16i_convert_8i_a_generic(int8_t* outputVector, const int16_t* inputVector, unsigned int num_points) +++static inline void volk_16i_convert_8i_a_generic(int8_t* outputVector, +++ const int16_t* inputVector, +++ unsigned int num_points) ++ { ++- int8_t* outputVectorPtr = outputVector; ++- const int16_t* inputVectorPtr = inputVector; ++- unsigned int number = 0; +++ int8_t* outputVectorPtr = outputVector; +++ const int16_t* inputVectorPtr = inputVector; +++ unsigned int number = 0; ++ ++- for(number = 0; number < num_points; number++){ ++- *outputVectorPtr++ = ((int8_t)(*inputVectorPtr++ >> 8)); ++- } +++ for (number = 0; number < num_points; number++) { 
+++ *outputVectorPtr++ = ((int8_t)(*inputVectorPtr++ >> 8)); +++ } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++diff --git a/kernels/volk/volk_16i_max_star_16i.h b/kernels/volk/volk_16i_max_star_16i.h ++index 78fd911..d5dad18 100644 ++--- a/kernels/volk/volk_16i_max_star_16i.h +++++ b/kernels/volk/volk_16i_max_star_16i.h ++@@ -53,67 +53,69 @@ ++ #ifndef INCLUDED_volk_16i_max_star_16i_a_H ++ #define INCLUDED_volk_16i_max_star_16i_a_H ++ ++-#include ++-#include +++#include +++#include ++ ++ #ifdef LV_HAVE_SSSE3 ++ ++-#include ++-#include ++-#include +++#include +++#include +++#include ++ ++ static inline void ++ volk_16i_max_star_16i_a_ssse3(short* target, short* src0, unsigned int num_points) ++ { ++- const unsigned int num_bytes = num_points*2; +++ const unsigned int num_bytes = num_points * 2; ++ ++- short candidate = src0[0]; ++- short cands[8]; ++- __m128i xmm0, xmm1, xmm3, xmm4, xmm5, xmm6; +++ short candidate = src0[0]; +++ short cands[8]; +++ __m128i xmm0, xmm1, xmm3, xmm4, xmm5, xmm6; ++ ++- __m128i *p_src0; +++ __m128i* p_src0; ++ ++- p_src0 = (__m128i*)src0; +++ p_src0 = (__m128i*)src0; ++ ++- int bound = num_bytes >> 4; ++- int leftovers = (num_bytes >> 1) & 7; +++ int bound = num_bytes >> 4; +++ int leftovers = (num_bytes >> 1) & 7; ++ ++- int i = 0; +++ int i = 0; ++ ++- xmm1 = _mm_setzero_si128(); ++- xmm0 = _mm_setzero_si128(); ++- //_mm_insert_epi16(xmm0, candidate, 0); +++ xmm1 = _mm_setzero_si128(); +++ xmm0 = _mm_setzero_si128(); +++ //_mm_insert_epi16(xmm0, candidate, 0); ++ ++- xmm0 = _mm_shuffle_epi8(xmm0, xmm1); +++ xmm0 = _mm_shuffle_epi8(xmm0, xmm1); ++ ++- for(i = 0; i < bound; ++i) { ++- xmm1 = _mm_load_si128(p_src0); ++- p_src0 += 1; ++- //xmm2 = _mm_sub_epi16(xmm1, xmm0); +++ for (i = 0; i < bound; ++i) { +++ xmm1 = _mm_load_si128(p_src0); +++ p_src0 += 1; +++ // xmm2 = _mm_sub_epi16(xmm1, xmm0); ++ ++- xmm3 = _mm_cmpgt_epi16(xmm0, xmm1); ++- xmm4 = _mm_cmpeq_epi16(xmm0, xmm1); ++- xmm5 = _mm_cmpgt_epi16(xmm1, xmm0); +++ xmm3 = _mm_cmpgt_epi16(xmm0, xmm1); +++ xmm4 = _mm_cmpeq_epi16(xmm0, xmm1); +++ xmm5 = _mm_cmpgt_epi16(xmm1, xmm0); ++ ++- xmm6 = _mm_xor_si128(xmm4, xmm5); +++ xmm6 = _mm_xor_si128(xmm4, xmm5); ++ ++- xmm3 = _mm_and_si128(xmm3, xmm0); ++- xmm4 = _mm_and_si128(xmm6, xmm1); +++ xmm3 = _mm_and_si128(xmm3, xmm0); +++ xmm4 = _mm_and_si128(xmm6, xmm1); ++ ++- xmm0 = _mm_add_epi16(xmm3, xmm4); ++- } +++ xmm0 = _mm_add_epi16(xmm3, xmm4); +++ } ++ ++- _mm_store_si128((__m128i*)cands, xmm0); +++ _mm_store_si128((__m128i*)cands, xmm0); ++ ++- for(i = 0; i < 8; ++i) { ++- candidate = ((short)(candidate - cands[i]) > 0) ? candidate : cands[i]; ++- } +++ for (i = 0; i < 8; ++i) { +++ candidate = ((short)(candidate - cands[i]) > 0) ? candidate : cands[i]; +++ } ++ ++- for(i = 0; i < leftovers; ++i) { ++- candidate = ((short)(candidate - src0[(bound << 3) + i]) > 0) ? candidate : src0[(bound << 3) + i]; ++- } +++ for (i = 0; i < leftovers; ++i) { +++ candidate = ((short)(candidate - src0[(bound << 3) + i]) > 0) +++ ? 
candidate +++ : src0[(bound << 3) + i]; +++ } ++ ++- target[0] = candidate; +++ target[0] = candidate; ++ } ++ ++ #endif /*LV_HAVE_SSSE3*/ ++@@ -124,38 +126,38 @@ volk_16i_max_star_16i_a_ssse3(short* target, short* src0, unsigned int num_point ++ static inline void ++ volk_16i_max_star_16i_neon(short* target, short* src0, unsigned int num_points) ++ { ++- const unsigned int eighth_points = num_points / 8; ++- unsigned number; ++- int16x8_t input_vec; ++- int16x8_t diff, zeros; ++- uint16x8_t comp1, comp2; ++- zeros = vdupq_n_s16(0); ++- ++- int16x8x2_t tmpvec; ++- ++- int16x8_t candidate_vec = vld1q_dup_s16(src0 ); ++- short candidate; ++- ++src0; ++- ++- for(number=0; number < eighth_points; ++number) { ++- input_vec = vld1q_s16(src0); ++- __VOLK_PREFETCH(src0+16); ++- diff = vsubq_s16(candidate_vec, input_vec); ++- comp1 = vcgeq_s16(diff, zeros); ++- comp2 = vcltq_s16(diff, zeros); ++- ++- tmpvec.val[0] = vandq_s16(candidate_vec, (int16x8_t)comp1); ++- tmpvec.val[1] = vandq_s16(input_vec, (int16x8_t)comp2); ++- ++- candidate_vec = vaddq_s16(tmpvec.val[0], tmpvec.val[1]); ++- src0 += 8; ++- } ++- vst1q_s16(&candidate, candidate_vec); ++- ++- for(number=0; number < num_points%8; number++) { ++- candidate = ((int16_t)(candidate - src0[number]) > 0) ? candidate : src0[number]; ++- } ++- target[0] = candidate; +++ const unsigned int eighth_points = num_points / 8; +++ unsigned number; +++ int16x8_t input_vec; +++ int16x8_t diff, zeros; +++ uint16x8_t comp1, comp2; +++ zeros = vdupq_n_s16(0); +++ +++ int16x8x2_t tmpvec; +++ +++ int16x8_t candidate_vec = vld1q_dup_s16(src0); +++ short candidate; +++ ++src0; +++ +++ for (number = 0; number < eighth_points; ++number) { +++ input_vec = vld1q_s16(src0); +++ __VOLK_PREFETCH(src0 + 16); +++ diff = vsubq_s16(candidate_vec, input_vec); +++ comp1 = vcgeq_s16(diff, zeros); +++ comp2 = vcltq_s16(diff, zeros); +++ +++ tmpvec.val[0] = vandq_s16(candidate_vec, (int16x8_t)comp1); +++ tmpvec.val[1] = vandq_s16(input_vec, (int16x8_t)comp2); +++ +++ candidate_vec = vaddq_s16(tmpvec.val[0], tmpvec.val[1]); +++ src0 += 8; +++ } +++ vst1q_s16(&candidate, candidate_vec); +++ +++ for (number = 0; number < num_points % 8; number++) { +++ candidate = ((int16_t)(candidate - src0[number]) > 0) ? candidate : src0[number]; +++ } +++ target[0] = candidate; ++ } ++ #endif /*LV_HAVE_NEON*/ ++ ++@@ -164,17 +166,17 @@ volk_16i_max_star_16i_neon(short* target, short* src0, unsigned int num_points) ++ static inline void ++ volk_16i_max_star_16i_generic(short* target, short* src0, unsigned int num_points) ++ { ++- const unsigned int num_bytes = num_points*2; +++ const unsigned int num_bytes = num_points * 2; ++ ++- int i = 0; +++ int i = 0; ++ ++- int bound = num_bytes >> 1; +++ int bound = num_bytes >> 1; ++ ++- short candidate = src0[0]; ++- for(i = 1; i < bound; ++i) { ++- candidate = ((short)(candidate - src0[i]) > 0) ? candidate : src0[i]; ++- } ++- target[0] = candidate; +++ short candidate = src0[0]; +++ for (i = 1; i < bound; ++i) { +++ candidate = ((short)(candidate - src0[i]) > 0) ? 
candidate : src0[i]; +++ } +++ target[0] = candidate; ++ } ++ ++ #endif /*LV_HAVE_GENERIC*/ ++diff --git a/kernels/volk/volk_16i_max_star_horizontal_16i.h b/kernels/volk/volk_16i_max_star_horizontal_16i.h ++index 4ffe264..2e1f52b 100644 ++--- a/kernels/volk/volk_16i_max_star_horizontal_16i.h +++++ b/kernels/volk/volk_16i_max_star_horizontal_16i.h ++@@ -29,8 +29,8 @@ ++ * ++ * Dispatcher Prototype ++ * \code ++- * void volk_16i_max_star_horizontal_16i(short* target, short* src0, unsigned int num_points); ++- * \endcode +++ * void volk_16i_max_star_horizontal_16i(short* target, short* src0, unsigned int +++ * num_points); \endcode ++ * ++ * \b Inputs ++ * \li src0: The input vector. ++@@ -55,102 +55,113 @@ ++ ++ #include ++ ++-#include ++-#include +++#include +++#include ++ ++ ++ #ifdef LV_HAVE_SSSE3 ++ ++-#include ++-#include ++-#include +++#include +++#include +++#include ++ ++-static inline void ++-volk_16i_max_star_horizontal_16i_a_ssse3(int16_t* target, int16_t* src0, unsigned int num_points) +++static inline void volk_16i_max_star_horizontal_16i_a_ssse3(int16_t* target, +++ int16_t* src0, +++ unsigned int num_points) ++ { ++- const unsigned int num_bytes = num_points*2; +++ const unsigned int num_bytes = num_points * 2; ++ ++- static const uint8_t shufmask0[16] = {0x00, 0x01, 0x04, 0x05, 0x08, 0x09, 0x0c, 0x0d, 0xff, ++- 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}; ++- static const uint8_t shufmask1[16] = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, ++- 0x01, 0x04, 0x05, 0x08, 0x09, 0x0c, 0x0d}; ++- static const uint8_t andmask0[16] = {0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,0x02, 0x00, ++- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}; ++- static const uint8_t andmask1[16] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, ++- 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02}; +++ static const uint8_t shufmask0[16] = { +++ 0x00, 0x01, 0x04, 0x05, 0x08, 0x09, 0x0c, 0x0d, +++ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff +++ }; +++ static const uint8_t shufmask1[16] = { +++ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, +++ 0x00, 0x01, 0x04, 0x05, 0x08, 0x09, 0x0c, 0x0d +++ }; +++ static const uint8_t andmask0[16] = { +++ 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, +++ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 +++ }; +++ static const uint8_t andmask1[16] = { +++ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, +++ 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02 +++ }; ++ ++- __m128i xmm0, xmm1, xmm2, xmm3, xmm4; ++- __m128i xmm5, xmm6, xmm7, xmm8; +++ __m128i xmm0, xmm1, xmm2, xmm3, xmm4; +++ __m128i xmm5, xmm6, xmm7, xmm8; ++ ++- xmm4 = _mm_load_si128((__m128i*)shufmask0); ++- xmm5 = _mm_load_si128((__m128i*)shufmask1); ++- xmm6 = _mm_load_si128((__m128i*)andmask0); ++- xmm7 = _mm_load_si128((__m128i*)andmask1); +++ xmm4 = _mm_load_si128((__m128i*)shufmask0); +++ xmm5 = _mm_load_si128((__m128i*)shufmask1); +++ xmm6 = _mm_load_si128((__m128i*)andmask0); +++ xmm7 = _mm_load_si128((__m128i*)andmask1); ++ ++- __m128i *p_target, *p_src0; +++ __m128i *p_target, *p_src0; ++ ++- p_target = (__m128i*)target; ++- p_src0 = (__m128i*)src0; +++ p_target = (__m128i*)target; +++ p_src0 = (__m128i*)src0; ++ ++- int bound = num_bytes >> 5; ++- int intermediate = (num_bytes >> 4) & 1; ++- int leftovers = (num_bytes >> 1) & 7; +++ int bound = num_bytes >> 5; +++ int intermediate = (num_bytes >> 4) & 1; +++ int leftovers = (num_bytes >> 1) & 7; ++ ++- int i = 0; +++ int i = 0; ++ ++- for(i = 0; i < bound; ++i) { ++- xmm0 = _mm_load_si128(p_src0); ++- xmm1 = _mm_load_si128(&p_src0[1]); +++ for 
(i = 0; i < bound; ++i) { +++ xmm0 = _mm_load_si128(p_src0); +++ xmm1 = _mm_load_si128(&p_src0[1]); ++ ++- xmm2 = _mm_xor_si128(xmm2, xmm2); ++- p_src0 += 2; +++ xmm2 = _mm_xor_si128(xmm2, xmm2); +++ p_src0 += 2; ++ ++- xmm3 = _mm_hsub_epi16(xmm0, xmm1); +++ xmm3 = _mm_hsub_epi16(xmm0, xmm1); ++ ++- xmm2 = _mm_cmpgt_epi16(xmm2, xmm3); +++ xmm2 = _mm_cmpgt_epi16(xmm2, xmm3); ++ ++- xmm8 = _mm_and_si128(xmm2, xmm6); ++- xmm3 = _mm_and_si128(xmm2, xmm7); +++ xmm8 = _mm_and_si128(xmm2, xmm6); +++ xmm3 = _mm_and_si128(xmm2, xmm7); ++ ++ ++- xmm8 = _mm_add_epi8(xmm8, xmm4); ++- xmm3 = _mm_add_epi8(xmm3, xmm5); +++ xmm8 = _mm_add_epi8(xmm8, xmm4); +++ xmm3 = _mm_add_epi8(xmm3, xmm5); ++ ++- xmm0 = _mm_shuffle_epi8(xmm0, xmm8); ++- xmm1 = _mm_shuffle_epi8(xmm1, xmm3); +++ xmm0 = _mm_shuffle_epi8(xmm0, xmm8); +++ xmm1 = _mm_shuffle_epi8(xmm1, xmm3); ++ ++ ++- xmm3 = _mm_add_epi16(xmm0, xmm1); +++ xmm3 = _mm_add_epi16(xmm0, xmm1); ++ ++ ++- _mm_store_si128(p_target, xmm3); +++ _mm_store_si128(p_target, xmm3); ++ ++- p_target += 1; ++- } +++ p_target += 1; +++ } ++ ++- if (intermediate) { ++- xmm0 = _mm_load_si128(p_src0); +++ if (intermediate) { +++ xmm0 = _mm_load_si128(p_src0); ++ ++- xmm2 = _mm_xor_si128(xmm2, xmm2); ++- p_src0 += 1; +++ xmm2 = _mm_xor_si128(xmm2, xmm2); +++ p_src0 += 1; ++ ++- xmm3 = _mm_hsub_epi16(xmm0, xmm1); ++- xmm2 = _mm_cmpgt_epi16(xmm2, xmm3); +++ xmm3 = _mm_hsub_epi16(xmm0, xmm1); +++ xmm2 = _mm_cmpgt_epi16(xmm2, xmm3); ++ ++- xmm8 = _mm_and_si128(xmm2, xmm6); +++ xmm8 = _mm_and_si128(xmm2, xmm6); ++ ++- xmm3 = _mm_add_epi8(xmm8, xmm4); +++ xmm3 = _mm_add_epi8(xmm8, xmm4); ++ ++- xmm0 = _mm_shuffle_epi8(xmm0, xmm3); +++ xmm0 = _mm_shuffle_epi8(xmm0, xmm3); ++ ++- _mm_storel_pd((double*)p_target, bit128_p(&xmm0)->double_vec); +++ _mm_storel_pd((double*)p_target, bit128_p(&xmm0)->double_vec); ++ ++- p_target = (__m128i*)((int8_t*)p_target + 8); ++- } +++ p_target = (__m128i*)((int8_t*)p_target + 8); +++ } ++ ++- for(i = (bound << 4) + (intermediate << 3); i < (bound << 4) + (intermediate << 3) + leftovers ; i += 2) { ++- target[i>>1] = ((int16_t)(src0[i] - src0[i + 1]) > 0) ? src0[i] : src0[i + 1]; ++- } +++ for (i = (bound << 4) + (intermediate << 3); +++ i < (bound << 4) + (intermediate << 3) + leftovers; +++ i += 2) { +++ target[i >> 1] = ((int16_t)(src0[i] - src0[i + 1]) > 0) ? 
src0[i] : src0[i + 1]; +++ } ++ } ++ ++ #endif /*LV_HAVE_SSSE3*/ ++@@ -158,54 +169,59 @@ volk_16i_max_star_horizontal_16i_a_ssse3(int16_t* target, int16_t* src0, unsigne ++ #ifdef LV_HAVE_NEON ++ ++ #include ++-static inline void ++-volk_16i_max_star_horizontal_16i_neon(int16_t* target, int16_t* src0, unsigned int num_points) +++static inline void volk_16i_max_star_horizontal_16i_neon(int16_t* target, +++ int16_t* src0, +++ unsigned int num_points) ++ { ++- const unsigned int eighth_points = num_points / 16; ++- unsigned number; ++- int16x8x2_t input_vec; ++- int16x8_t diff, max_vec, zeros; ++- uint16x8_t comp1, comp2; ++- zeros = vdupq_n_s16(0); ++- for(number=0; number < eighth_points; ++number) { ++- input_vec = vld2q_s16(src0); ++- //__VOLK_PREFETCH(src0+16); ++- diff = vsubq_s16(input_vec.val[0], input_vec.val[1]); ++- comp1 = vcgeq_s16(diff, zeros); ++- comp2 = vcltq_s16(diff, zeros); ++- ++- input_vec.val[0] = vandq_s16(input_vec.val[0], (int16x8_t)comp1); ++- input_vec.val[1] = vandq_s16(input_vec.val[1], (int16x8_t)comp2); ++- ++- max_vec = vaddq_s16(input_vec.val[0], input_vec.val[1]); ++- vst1q_s16(target, max_vec); ++- src0 += 16; ++- target += 8; ++- } ++- for(number=0; number < num_points%16; number+=2) { ++- target[number >> 1] = ((int16_t)(src0[number] - src0[number + 1]) > 0) ? src0[number] : src0[number+1]; ++- } ++- +++ const unsigned int eighth_points = num_points / 16; +++ unsigned number; +++ int16x8x2_t input_vec; +++ int16x8_t diff, max_vec, zeros; +++ uint16x8_t comp1, comp2; +++ zeros = vdupq_n_s16(0); +++ for (number = 0; number < eighth_points; ++number) { +++ input_vec = vld2q_s16(src0); +++ //__VOLK_PREFETCH(src0+16); +++ diff = vsubq_s16(input_vec.val[0], input_vec.val[1]); +++ comp1 = vcgeq_s16(diff, zeros); +++ comp2 = vcltq_s16(diff, zeros); +++ +++ input_vec.val[0] = vandq_s16(input_vec.val[0], (int16x8_t)comp1); +++ input_vec.val[1] = vandq_s16(input_vec.val[1], (int16x8_t)comp2); +++ +++ max_vec = vaddq_s16(input_vec.val[0], input_vec.val[1]); +++ vst1q_s16(target, max_vec); +++ src0 += 16; +++ target += 8; +++ } +++ for (number = 0; number < num_points % 16; number += 2) { +++ target[number >> 1] = ((int16_t)(src0[number] - src0[number + 1]) > 0) +++ ? src0[number] +++ : src0[number + 1]; +++ } ++ } ++ #endif /* LV_HAVE_NEON */ ++ ++ #ifdef LV_HAVE_NEONV7 ++-extern void volk_16i_max_star_horizontal_16i_a_neonasm(int16_t* target, int16_t* src0, unsigned int num_points); +++extern void volk_16i_max_star_horizontal_16i_a_neonasm(int16_t* target, +++ int16_t* src0, +++ unsigned int num_points); ++ #endif /* LV_HAVE_NEONV7 */ ++ ++ #ifdef LV_HAVE_GENERIC ++-static inline void ++-volk_16i_max_star_horizontal_16i_generic(int16_t* target, int16_t* src0, unsigned int num_points) +++static inline void volk_16i_max_star_horizontal_16i_generic(int16_t* target, +++ int16_t* src0, +++ unsigned int num_points) ++ { ++- const unsigned int num_bytes = num_points*2; +++ const unsigned int num_bytes = num_points * 2; ++ ++- int i = 0; +++ int i = 0; ++ ++- int bound = num_bytes >> 1; +++ int bound = num_bytes >> 1; ++ ++- for(i = 0; i < bound; i += 2) { ++- target[i >> 1] = ((int16_t) (src0[i] - src0[i + 1]) > 0) ? src0[i] : src0[i+1]; ++- } +++ for (i = 0; i < bound; i += 2) { +++ target[i >> 1] = ((int16_t)(src0[i] - src0[i + 1]) > 0) ? 
src0[i] : src0[i + 1]; +++ } ++ } ++ ++ #endif /*LV_HAVE_GENERIC*/ ++diff --git a/kernels/volk/volk_16i_permute_and_scalar_add.h b/kernels/volk/volk_16i_permute_and_scalar_add.h ++index 7fcdad3..0563f07 100644 ++--- a/kernels/volk/volk_16i_permute_and_scalar_add.h +++++ b/kernels/volk/volk_16i_permute_and_scalar_add.h ++@@ -29,8 +29,9 @@ ++ * ++ * Dispatcher Prototype ++ * \code ++- * void volk_16i_permute_and_scalar_add(short* target, short* src0, short* permute_indexes, short* cntl0, short* cntl1, short* cntl2, short* cntl3, short* scalars, unsigned int num_points) ++- * \endcode +++ * void volk_16i_permute_and_scalar_add(short* target, short* src0, short* +++ * permute_indexes, short* cntl0, short* cntl1, short* cntl2, short* cntl3, short* +++ * scalars, unsigned int num_points) \endcode ++ * ++ * \b Inputs ++ * \li src0: The input vector. ++@@ -58,137 +59,143 @@ ++ #ifndef INCLUDED_volk_16i_permute_and_scalar_add_a_H ++ #define INCLUDED_volk_16i_permute_and_scalar_add_a_H ++ ++-#include ++-#include +++#include +++#include ++ ++ #ifdef LV_HAVE_SSE2 ++ ++-#include ++-#include ++- ++-static inline void ++-volk_16i_permute_and_scalar_add_a_sse2(short* target, short* src0, short* permute_indexes, ++- short* cntl0, short* cntl1, short* cntl2, short* cntl3, ++- short* scalars, unsigned int num_points) +++#include +++#include +++ +++static inline void volk_16i_permute_and_scalar_add_a_sse2(short* target, +++ short* src0, +++ short* permute_indexes, +++ short* cntl0, +++ short* cntl1, +++ short* cntl2, +++ short* cntl3, +++ short* scalars, +++ unsigned int num_points) ++ { ++ ++- const unsigned int num_bytes = num_points*2; +++ const unsigned int num_bytes = num_points * 2; ++ ++- __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; +++ __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; ++ ++- __m128i *p_target, *p_cntl0, *p_cntl1, *p_cntl2, *p_cntl3, *p_scalars; +++ __m128i *p_target, *p_cntl0, *p_cntl1, *p_cntl2, *p_cntl3, *p_scalars; ++ ++- short* p_permute_indexes = permute_indexes; +++ short* p_permute_indexes = permute_indexes; ++ ++- p_target = (__m128i*)target; ++- p_cntl0 = (__m128i*)cntl0; ++- p_cntl1 = (__m128i*)cntl1; ++- p_cntl2 = (__m128i*)cntl2; ++- p_cntl3 = (__m128i*)cntl3; ++- p_scalars = (__m128i*)scalars; +++ p_target = (__m128i*)target; +++ p_cntl0 = (__m128i*)cntl0; +++ p_cntl1 = (__m128i*)cntl1; +++ p_cntl2 = (__m128i*)cntl2; +++ p_cntl3 = (__m128i*)cntl3; +++ p_scalars = (__m128i*)scalars; ++ ++- int i = 0; +++ int i = 0; ++ ++- int bound = (num_bytes >> 4); ++- int leftovers = (num_bytes >> 1) & 7; +++ int bound = (num_bytes >> 4); +++ int leftovers = (num_bytes >> 1) & 7; ++ ++- xmm0 = _mm_load_si128(p_scalars); +++ xmm0 = _mm_load_si128(p_scalars); ++ ++- xmm1 = _mm_shufflelo_epi16(xmm0, 0); ++- xmm2 = _mm_shufflelo_epi16(xmm0, 0x55); ++- xmm3 = _mm_shufflelo_epi16(xmm0, 0xaa); ++- xmm4 = _mm_shufflelo_epi16(xmm0, 0xff); +++ xmm1 = _mm_shufflelo_epi16(xmm0, 0); +++ xmm2 = _mm_shufflelo_epi16(xmm0, 0x55); +++ xmm3 = _mm_shufflelo_epi16(xmm0, 0xaa); +++ xmm4 = _mm_shufflelo_epi16(xmm0, 0xff); ++ ++- xmm1 = _mm_shuffle_epi32(xmm1, 0x00); ++- xmm2 = _mm_shuffle_epi32(xmm2, 0x00); ++- xmm3 = _mm_shuffle_epi32(xmm3, 0x00); ++- xmm4 = _mm_shuffle_epi32(xmm4, 0x00); +++ xmm1 = _mm_shuffle_epi32(xmm1, 0x00); +++ xmm2 = _mm_shuffle_epi32(xmm2, 0x00); +++ xmm3 = _mm_shuffle_epi32(xmm3, 0x00); +++ xmm4 = _mm_shuffle_epi32(xmm4, 0x00); ++ ++ ++- for(; i < bound; ++i) { ++- xmm0 = _mm_setzero_si128(); ++- xmm5 = _mm_setzero_si128(); ++- xmm6 = _mm_setzero_si128(); ++- xmm7 
= _mm_setzero_si128(); +++ for (; i < bound; ++i) { +++ xmm0 = _mm_setzero_si128(); +++ xmm5 = _mm_setzero_si128(); +++ xmm6 = _mm_setzero_si128(); +++ xmm7 = _mm_setzero_si128(); ++ ++- xmm0 = _mm_insert_epi16(xmm0, src0[p_permute_indexes[0]], 0); ++- xmm5 = _mm_insert_epi16(xmm5, src0[p_permute_indexes[1]], 1); ++- xmm6 = _mm_insert_epi16(xmm6, src0[p_permute_indexes[2]], 2); ++- xmm7 = _mm_insert_epi16(xmm7, src0[p_permute_indexes[3]], 3); ++- xmm0 = _mm_insert_epi16(xmm0, src0[p_permute_indexes[4]], 4); ++- xmm5 = _mm_insert_epi16(xmm5, src0[p_permute_indexes[5]], 5); ++- xmm6 = _mm_insert_epi16(xmm6, src0[p_permute_indexes[6]], 6); ++- xmm7 = _mm_insert_epi16(xmm7, src0[p_permute_indexes[7]], 7); +++ xmm0 = _mm_insert_epi16(xmm0, src0[p_permute_indexes[0]], 0); +++ xmm5 = _mm_insert_epi16(xmm5, src0[p_permute_indexes[1]], 1); +++ xmm6 = _mm_insert_epi16(xmm6, src0[p_permute_indexes[2]], 2); +++ xmm7 = _mm_insert_epi16(xmm7, src0[p_permute_indexes[3]], 3); +++ xmm0 = _mm_insert_epi16(xmm0, src0[p_permute_indexes[4]], 4); +++ xmm5 = _mm_insert_epi16(xmm5, src0[p_permute_indexes[5]], 5); +++ xmm6 = _mm_insert_epi16(xmm6, src0[p_permute_indexes[6]], 6); +++ xmm7 = _mm_insert_epi16(xmm7, src0[p_permute_indexes[7]], 7); ++ ++- xmm0 = _mm_add_epi16(xmm0, xmm5); ++- xmm6 = _mm_add_epi16(xmm6, xmm7); +++ xmm0 = _mm_add_epi16(xmm0, xmm5); +++ xmm6 = _mm_add_epi16(xmm6, xmm7); ++ ++- p_permute_indexes += 8; +++ p_permute_indexes += 8; ++ ++- xmm0 = _mm_add_epi16(xmm0, xmm6); +++ xmm0 = _mm_add_epi16(xmm0, xmm6); ++ ++- xmm5 = _mm_load_si128(p_cntl0); ++- xmm6 = _mm_load_si128(p_cntl1); ++- xmm7 = _mm_load_si128(p_cntl2); +++ xmm5 = _mm_load_si128(p_cntl0); +++ xmm6 = _mm_load_si128(p_cntl1); +++ xmm7 = _mm_load_si128(p_cntl2); ++ ++- xmm5 = _mm_and_si128(xmm5, xmm1); ++- xmm6 = _mm_and_si128(xmm6, xmm2); ++- xmm7 = _mm_and_si128(xmm7, xmm3); +++ xmm5 = _mm_and_si128(xmm5, xmm1); +++ xmm6 = _mm_and_si128(xmm6, xmm2); +++ xmm7 = _mm_and_si128(xmm7, xmm3); ++ ++- xmm0 = _mm_add_epi16(xmm0, xmm5); +++ xmm0 = _mm_add_epi16(xmm0, xmm5); ++ ++- xmm5 = _mm_load_si128(p_cntl3); +++ xmm5 = _mm_load_si128(p_cntl3); ++ ++- xmm6 = _mm_add_epi16(xmm6, xmm7); +++ xmm6 = _mm_add_epi16(xmm6, xmm7); ++ ++- p_cntl0 += 1; +++ p_cntl0 += 1; ++ ++- xmm5 = _mm_and_si128(xmm5, xmm4); +++ xmm5 = _mm_and_si128(xmm5, xmm4); ++ ++- xmm0 = _mm_add_epi16(xmm0, xmm6); +++ xmm0 = _mm_add_epi16(xmm0, xmm6); ++ ++- p_cntl1 += 1; ++- p_cntl2 += 1; +++ p_cntl1 += 1; +++ p_cntl2 += 1; ++ ++- xmm0 = _mm_add_epi16(xmm0, xmm5); +++ xmm0 = _mm_add_epi16(xmm0, xmm5); ++ ++- p_cntl3 += 1; +++ p_cntl3 += 1; ++ ++- _mm_store_si128(p_target, xmm0); +++ _mm_store_si128(p_target, xmm0); ++ ++- p_target += 1; ++- } +++ p_target += 1; +++ } ++ ++- for(i = bound * 8; i < (bound * 8) + leftovers; ++i) { ++- target[i] = src0[permute_indexes[i]] ++- + (cntl0[i] & scalars[0]) ++- + (cntl1[i] & scalars[1]) ++- + (cntl2[i] & scalars[2]) ++- + (cntl3[i] & scalars[3]); ++- } +++ for (i = bound * 8; i < (bound * 8) + leftovers; ++i) { +++ target[i] = src0[permute_indexes[i]] + (cntl0[i] & scalars[0]) + +++ (cntl1[i] & scalars[1]) + (cntl2[i] & scalars[2]) + +++ (cntl3[i] & scalars[3]); +++ } ++ } ++ #endif /*LV_HAVE_SSE*/ ++ ++ ++ #ifdef LV_HAVE_GENERIC ++-static inline void ++-volk_16i_permute_and_scalar_add_generic(short* target, short* src0, short* permute_indexes, ++- short* cntl0, short* cntl1, short* cntl2, short* cntl3, ++- short* scalars, unsigned int num_points) +++static inline void volk_16i_permute_and_scalar_add_generic(short* target, +++ 
short* src0, +++ short* permute_indexes, +++ short* cntl0, +++ short* cntl1, +++ short* cntl2, +++ short* cntl3, +++ short* scalars, +++ unsigned int num_points) ++ { ++- const unsigned int num_bytes = num_points*2; +++ const unsigned int num_bytes = num_points * 2; ++ ++- int i = 0; +++ int i = 0; ++ ++- int bound = num_bytes >> 1; +++ int bound = num_bytes >> 1; ++ ++- for(i = 0; i < bound; ++i) { ++- target[i] = src0[permute_indexes[i]] ++- + (cntl0[i] & scalars[0]) ++- + (cntl1[i] & scalars[1]) ++- + (cntl2[i] & scalars[2]) ++- + (cntl3[i] & scalars[3]); ++- } +++ for (i = 0; i < bound; ++i) { +++ target[i] = src0[permute_indexes[i]] + (cntl0[i] & scalars[0]) + +++ (cntl1[i] & scalars[1]) + (cntl2[i] & scalars[2]) + +++ (cntl3[i] & scalars[3]); +++ } ++ } ++ ++ #endif /*LV_HAVE_GENERIC*/ ++diff --git a/kernels/volk/volk_16i_s32f_convert_32f.h b/kernels/volk/volk_16i_s32f_convert_32f.h ++index 38ea6f5..3fd3a77 100644 ++--- a/kernels/volk/volk_16i_s32f_convert_32f.h +++++ b/kernels/volk/volk_16i_s32f_convert_32f.h ++@@ -29,8 +29,8 @@ ++ * ++ * Dispatcher Prototype ++ * \code ++- * void volk_16i_s32f_convert_32f(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points); ++- * \endcode +++ * void volk_16i_s32f_convert_32f(float* outputVector, const int16_t* inputVector, const +++ * float scalar, unsigned int num_points); \endcode ++ * ++ * \b Inputs ++ * \li inputVector: The input vector of 16-bit shorts. ++@@ -60,238 +60,247 @@ ++ #ifdef LV_HAVE_AVX2 ++ #include ++ ++-static inline void ++-volk_16i_s32f_convert_32f_u_avx2(float* outputVector, const int16_t* inputVector, ++- const float scalar, unsigned int num_points) +++static inline void volk_16i_s32f_convert_32f_u_avx2(float* outputVector, +++ const int16_t* inputVector, +++ const float scalar, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int eighthPoints = num_points / 8; +++ unsigned int number = 0; +++ const unsigned int eighthPoints = num_points / 8; ++ ++- float* outputVectorPtr = outputVector; ++- __m256 invScalar = _mm256_set1_ps(1.0/scalar); ++- int16_t* inputPtr = (int16_t*)inputVector; ++- __m128i inputVal; ++- __m256i inputVal2; ++- __m256 ret; +++ float* outputVectorPtr = outputVector; +++ __m256 invScalar = _mm256_set1_ps(1.0 / scalar); +++ int16_t* inputPtr = (int16_t*)inputVector; +++ __m128i inputVal; +++ __m256i inputVal2; +++ __m256 ret; ++ ++- for(;number < eighthPoints; number++){ +++ for (; number < eighthPoints; number++) { ++ ++- // Load the 8 values ++- inputVal = _mm_loadu_si128((__m128i*)inputPtr); +++ // Load the 8 values +++ inputVal = _mm_loadu_si128((__m128i*)inputPtr); ++ ++- // Convert ++- inputVal2 = _mm256_cvtepi16_epi32(inputVal); +++ // Convert +++ inputVal2 = _mm256_cvtepi16_epi32(inputVal); ++ ++- ret = _mm256_cvtepi32_ps(inputVal2); ++- ret = _mm256_mul_ps(ret, invScalar); +++ ret = _mm256_cvtepi32_ps(inputVal2); +++ ret = _mm256_mul_ps(ret, invScalar); ++ ++- _mm256_storeu_ps(outputVectorPtr, ret); +++ _mm256_storeu_ps(outputVectorPtr, ret); ++ ++- outputVectorPtr += 8; +++ outputVectorPtr += 8; ++ ++- inputPtr += 8; ++- } +++ inputPtr += 8; +++ } ++ ++- number = eighthPoints * 8; ++- for(; number < num_points; number++){ ++- outputVector[number] =((float)(inputVector[number])) / scalar; ++- } +++ number = eighthPoints * 8; +++ for (; number < num_points; number++) { +++ outputVector[number] = ((float)(inputVector[number])) / scalar; +++ } ++ } ++ #endif /* LV_HAVE_AVX2 */ ++ ++ #ifdef LV_HAVE_AVX ++ #include ++ ++-static 
inline void ++-volk_16i_s32f_convert_32f_u_avx(float* outputVector, const int16_t* inputVector, ++- const float scalar, unsigned int num_points) +++static inline void volk_16i_s32f_convert_32f_u_avx(float* outputVector, +++ const int16_t* inputVector, +++ const float scalar, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int eighthPoints = num_points / 8; +++ unsigned int number = 0; +++ const unsigned int eighthPoints = num_points / 8; ++ ++- float* outputVectorPtr = outputVector; ++- __m128 invScalar = _mm_set_ps1(1.0/scalar); ++- int16_t* inputPtr = (int16_t*)inputVector; ++- __m128i inputVal, inputVal2; ++- __m128 ret; ++- __m256 output; ++- __m256 dummy = _mm256_setzero_ps(); +++ float* outputVectorPtr = outputVector; +++ __m128 invScalar = _mm_set_ps1(1.0 / scalar); +++ int16_t* inputPtr = (int16_t*)inputVector; +++ __m128i inputVal, inputVal2; +++ __m128 ret; +++ __m256 output; +++ __m256 dummy = _mm256_setzero_ps(); ++ ++- for(;number < eighthPoints; number++){ +++ for (; number < eighthPoints; number++) { ++ ++- // Load the 8 values ++- //inputVal = _mm_loadu_si128((__m128i*)inputPtr); ++- inputVal = _mm_loadu_si128((__m128i*)inputPtr); +++ // Load the 8 values +++ // inputVal = _mm_loadu_si128((__m128i*)inputPtr); +++ inputVal = _mm_loadu_si128((__m128i*)inputPtr); ++ ++- // Shift the input data to the right by 64 bits ( 8 bytes ) ++- inputVal2 = _mm_srli_si128(inputVal, 8); +++ // Shift the input data to the right by 64 bits ( 8 bytes ) +++ inputVal2 = _mm_srli_si128(inputVal, 8); ++ ++- // Convert the lower 4 values into 32 bit words ++- inputVal = _mm_cvtepi16_epi32(inputVal); ++- inputVal2 = _mm_cvtepi16_epi32(inputVal2); +++ // Convert the lower 4 values into 32 bit words +++ inputVal = _mm_cvtepi16_epi32(inputVal); +++ inputVal2 = _mm_cvtepi16_epi32(inputVal2); ++ ++- ret = _mm_cvtepi32_ps(inputVal); ++- ret = _mm_mul_ps(ret, invScalar); ++- output = _mm256_insertf128_ps(dummy, ret, 0); +++ ret = _mm_cvtepi32_ps(inputVal); +++ ret = _mm_mul_ps(ret, invScalar); +++ output = _mm256_insertf128_ps(dummy, ret, 0); ++ ++- ret = _mm_cvtepi32_ps(inputVal2); ++- ret = _mm_mul_ps(ret, invScalar); ++- output = _mm256_insertf128_ps(output, ret, 1); +++ ret = _mm_cvtepi32_ps(inputVal2); +++ ret = _mm_mul_ps(ret, invScalar); +++ output = _mm256_insertf128_ps(output, ret, 1); ++ ++- _mm256_storeu_ps(outputVectorPtr, output); +++ _mm256_storeu_ps(outputVectorPtr, output); ++ ++- outputVectorPtr += 8; +++ outputVectorPtr += 8; ++ ++- inputPtr += 8; ++- } +++ inputPtr += 8; +++ } ++ ++- number = eighthPoints * 8; ++- for(; number < num_points; number++){ ++- outputVector[number] =((float)(inputVector[number])) / scalar; ++- } +++ number = eighthPoints * 8; +++ for (; number < num_points; number++) { +++ outputVector[number] = ((float)(inputVector[number])) / scalar; +++ } ++ } ++ #endif /* LV_HAVE_AVX */ ++ ++ #ifdef LV_HAVE_SSE4_1 ++ #include ++ ++-static inline void ++-volk_16i_s32f_convert_32f_u_sse4_1(float* outputVector, const int16_t* inputVector, ++- const float scalar, unsigned int num_points) +++static inline void volk_16i_s32f_convert_32f_u_sse4_1(float* outputVector, +++ const int16_t* inputVector, +++ const float scalar, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int eighthPoints = num_points / 8; +++ unsigned int number = 0; +++ const unsigned int eighthPoints = num_points / 8; ++ ++- float* outputVectorPtr = outputVector; ++- __m128 invScalar = _mm_set_ps1(1.0/scalar); ++- int16_t* inputPtr = 
(int16_t*)inputVector; ++- __m128i inputVal; ++- __m128i inputVal2; ++- __m128 ret; +++ float* outputVectorPtr = outputVector; +++ __m128 invScalar = _mm_set_ps1(1.0 / scalar); +++ int16_t* inputPtr = (int16_t*)inputVector; +++ __m128i inputVal; +++ __m128i inputVal2; +++ __m128 ret; ++ ++- for(;number < eighthPoints; number++){ +++ for (; number < eighthPoints; number++) { ++ ++- // Load the 8 values ++- inputVal = _mm_loadu_si128((__m128i*)inputPtr); +++ // Load the 8 values +++ inputVal = _mm_loadu_si128((__m128i*)inputPtr); ++ ++- // Shift the input data to the right by 64 bits ( 8 bytes ) ++- inputVal2 = _mm_srli_si128(inputVal, 8); +++ // Shift the input data to the right by 64 bits ( 8 bytes ) +++ inputVal2 = _mm_srli_si128(inputVal, 8); ++ ++- // Convert the lower 4 values into 32 bit words ++- inputVal = _mm_cvtepi16_epi32(inputVal); ++- inputVal2 = _mm_cvtepi16_epi32(inputVal2); +++ // Convert the lower 4 values into 32 bit words +++ inputVal = _mm_cvtepi16_epi32(inputVal); +++ inputVal2 = _mm_cvtepi16_epi32(inputVal2); ++ ++- ret = _mm_cvtepi32_ps(inputVal); ++- ret = _mm_mul_ps(ret, invScalar); ++- _mm_storeu_ps(outputVectorPtr, ret); ++- outputVectorPtr += 4; +++ ret = _mm_cvtepi32_ps(inputVal); +++ ret = _mm_mul_ps(ret, invScalar); +++ _mm_storeu_ps(outputVectorPtr, ret); +++ outputVectorPtr += 4; ++ ++- ret = _mm_cvtepi32_ps(inputVal2); ++- ret = _mm_mul_ps(ret, invScalar); ++- _mm_storeu_ps(outputVectorPtr, ret); +++ ret = _mm_cvtepi32_ps(inputVal2); +++ ret = _mm_mul_ps(ret, invScalar); +++ _mm_storeu_ps(outputVectorPtr, ret); ++ ++- outputVectorPtr += 4; +++ outputVectorPtr += 4; ++ ++- inputPtr += 8; ++- } +++ inputPtr += 8; +++ } ++ ++- number = eighthPoints * 8; ++- for(; number < num_points; number++){ ++- outputVector[number] =((float)(inputVector[number])) / scalar; ++- } +++ number = eighthPoints * 8; +++ for (; number < num_points; number++) { +++ outputVector[number] = ((float)(inputVector[number])) / scalar; +++ } ++ } ++ #endif /* LV_HAVE_SSE4_1 */ ++ ++ #ifdef LV_HAVE_SSE ++ #include ++ ++-static inline void ++-volk_16i_s32f_convert_32f_u_sse(float* outputVector, const int16_t* inputVector, ++- const float scalar, unsigned int num_points) +++static inline void volk_16i_s32f_convert_32f_u_sse(float* outputVector, +++ const int16_t* inputVector, +++ const float scalar, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int quarterPoints = num_points / 4; ++- ++- float* outputVectorPtr = outputVector; ++- __m128 invScalar = _mm_set_ps1(1.0/scalar); ++- int16_t* inputPtr = (int16_t*)inputVector; ++- __m128 ret; ++- ++- for(;number < quarterPoints; number++){ ++- ret = _mm_set_ps((float)(inputPtr[3]), (float)(inputPtr[2]), (float)(inputPtr[1]), (float)(inputPtr[0])); ++- ++- ret = _mm_mul_ps(ret, invScalar); ++- _mm_storeu_ps(outputVectorPtr, ret); ++- ++- inputPtr += 4; ++- outputVectorPtr += 4; ++- } ++- ++- number = quarterPoints * 4; ++- for(; number < num_points; number++){ ++- outputVector[number] = (float)(inputVector[number]) / scalar; ++- } +++ unsigned int number = 0; +++ const unsigned int quarterPoints = num_points / 4; +++ +++ float* outputVectorPtr = outputVector; +++ __m128 invScalar = _mm_set_ps1(1.0 / scalar); +++ int16_t* inputPtr = (int16_t*)inputVector; +++ __m128 ret; +++ +++ for (; number < quarterPoints; number++) { +++ ret = _mm_set_ps((float)(inputPtr[3]), +++ (float)(inputPtr[2]), +++ (float)(inputPtr[1]), +++ (float)(inputPtr[0])); +++ +++ ret = _mm_mul_ps(ret, invScalar); +++ _mm_storeu_ps(outputVectorPtr, 
ret); +++ +++ inputPtr += 4; +++ outputVectorPtr += 4; +++ } +++ +++ number = quarterPoints * 4; +++ for (; number < num_points; number++) { +++ outputVector[number] = (float)(inputVector[number]) / scalar; +++ } ++ } ++ #endif /* LV_HAVE_SSE */ ++ ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void ++-volk_16i_s32f_convert_32f_generic(float* outputVector, const int16_t* inputVector, ++- const float scalar, unsigned int num_points) +++static inline void volk_16i_s32f_convert_32f_generic(float* outputVector, +++ const int16_t* inputVector, +++ const float scalar, +++ unsigned int num_points) ++ { ++- float* outputVectorPtr = outputVector; ++- const int16_t* inputVectorPtr = inputVector; ++- unsigned int number = 0; +++ float* outputVectorPtr = outputVector; +++ const int16_t* inputVectorPtr = inputVector; +++ unsigned int number = 0; ++ ++- for(number = 0; number < num_points; number++){ ++- *outputVectorPtr++ = ((float)(*inputVectorPtr++)) / scalar; ++- } +++ for (number = 0; number < num_points; number++) { +++ *outputVectorPtr++ = ((float)(*inputVectorPtr++)) / scalar; +++ } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++ #ifdef LV_HAVE_NEON ++ #include ++ ++-static inline void ++-volk_16i_s32f_convert_32f_neon(float* outputVector, const int16_t* inputVector, ++- const float scalar, unsigned int num_points) +++static inline void volk_16i_s32f_convert_32f_neon(float* outputVector, +++ const int16_t* inputVector, +++ const float scalar, +++ unsigned int num_points) ++ { ++- float* outputPtr = outputVector; ++- const int16_t* inputPtr = inputVector; ++- unsigned int number = 0; ++- unsigned int eighth_points = num_points / 8; ++- ++- int16x4x2_t input16; ++- int32x4_t input32_0, input32_1; ++- float32x4_t input_float_0, input_float_1; ++- float32x4x2_t output_float; ++- float32x4_t inv_scale; ++- ++- inv_scale = vdupq_n_f32(1.0/scalar); ++- ++- // the generic disassembles to a 128-bit load ++- // and duplicates every instruction to operate on 64-bits ++- // at a time. This is only possible with lanes, which is faster ++- // than just doing a vld1_s16, but still slower. ++- for(number = 0; number < eighth_points; number++){ ++- input16 = vld2_s16(inputPtr); ++- // widen 16-bit int to 32-bit int ++- input32_0 = vmovl_s16(input16.val[0]); ++- input32_1 = vmovl_s16(input16.val[1]); ++- // convert 32-bit int to float with scale ++- input_float_0 = vcvtq_f32_s32(input32_0); ++- input_float_1 = vcvtq_f32_s32(input32_1); ++- output_float.val[0] = vmulq_f32(input_float_0, inv_scale); ++- output_float.val[1] = vmulq_f32(input_float_1, inv_scale); ++- vst2q_f32(outputPtr, output_float); ++- inputPtr += 8; ++- outputPtr += 8; ++- } ++- ++- for(number = eighth_points*8; number < num_points; number++){ ++- *outputPtr++ = ((float)(*inputPtr++)) / scalar; ++- } +++ float* outputPtr = outputVector; +++ const int16_t* inputPtr = inputVector; +++ unsigned int number = 0; +++ unsigned int eighth_points = num_points / 8; +++ +++ int16x4x2_t input16; +++ int32x4_t input32_0, input32_1; +++ float32x4_t input_float_0, input_float_1; +++ float32x4x2_t output_float; +++ float32x4_t inv_scale; +++ +++ inv_scale = vdupq_n_f32(1.0 / scalar); +++ +++ // the generic disassembles to a 128-bit load +++ // and duplicates every instruction to operate on 64-bits +++ // at a time. This is only possible with lanes, which is faster +++ // than just doing a vld1_s16, but still slower. 
+++ for (number = 0; number < eighth_points; number++) { +++ input16 = vld2_s16(inputPtr); +++ // widen 16-bit int to 32-bit int +++ input32_0 = vmovl_s16(input16.val[0]); +++ input32_1 = vmovl_s16(input16.val[1]); +++ // convert 32-bit int to float with scale +++ input_float_0 = vcvtq_f32_s32(input32_0); +++ input_float_1 = vcvtq_f32_s32(input32_1); +++ output_float.val[0] = vmulq_f32(input_float_0, inv_scale); +++ output_float.val[1] = vmulq_f32(input_float_1, inv_scale); +++ vst2q_f32(outputPtr, output_float); +++ inputPtr += 8; +++ outputPtr += 8; +++ } +++ +++ for (number = eighth_points * 8; number < num_points; number++) { +++ *outputPtr++ = ((float)(*inputPtr++)) / scalar; +++ } ++ } ++ #endif /* LV_HAVE_NEON */ ++ ++@@ -306,193 +315,201 @@ volk_16i_s32f_convert_32f_neon(float* outputVector, const int16_t* inputVector, ++ #ifdef LV_HAVE_AVX2 ++ #include ++ ++-static inline void ++-volk_16i_s32f_convert_32f_a_avx2(float* outputVector, const int16_t* inputVector, ++- const float scalar, unsigned int num_points) +++static inline void volk_16i_s32f_convert_32f_a_avx2(float* outputVector, +++ const int16_t* inputVector, +++ const float scalar, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int eighthPoints = num_points / 8; +++ unsigned int number = 0; +++ const unsigned int eighthPoints = num_points / 8; ++ ++- float* outputVectorPtr = outputVector; ++- __m256 invScalar = _mm256_set1_ps(1.0/scalar); ++- int16_t* inputPtr = (int16_t*)inputVector; ++- __m128i inputVal; ++- __m256i inputVal2; ++- __m256 ret; +++ float* outputVectorPtr = outputVector; +++ __m256 invScalar = _mm256_set1_ps(1.0 / scalar); +++ int16_t* inputPtr = (int16_t*)inputVector; +++ __m128i inputVal; +++ __m256i inputVal2; +++ __m256 ret; ++ ++- for(;number < eighthPoints; number++){ +++ for (; number < eighthPoints; number++) { ++ ++- // Load the 8 values ++- inputVal = _mm_load_si128((__m128i*)inputPtr); +++ // Load the 8 values +++ inputVal = _mm_load_si128((__m128i*)inputPtr); ++ ++- // Convert ++- inputVal2 = _mm256_cvtepi16_epi32(inputVal); +++ // Convert +++ inputVal2 = _mm256_cvtepi16_epi32(inputVal); ++ ++- ret = _mm256_cvtepi32_ps(inputVal2); ++- ret = _mm256_mul_ps(ret, invScalar); +++ ret = _mm256_cvtepi32_ps(inputVal2); +++ ret = _mm256_mul_ps(ret, invScalar); ++ ++- _mm256_store_ps(outputVectorPtr, ret); +++ _mm256_store_ps(outputVectorPtr, ret); ++ ++- outputVectorPtr += 8; +++ outputVectorPtr += 8; ++ ++- inputPtr += 8; ++- } +++ inputPtr += 8; +++ } ++ ++- number = eighthPoints * 8; ++- for(; number < num_points; number++){ ++- outputVector[number] =((float)(inputVector[number])) / scalar; ++- } +++ number = eighthPoints * 8; +++ for (; number < num_points; number++) { +++ outputVector[number] = ((float)(inputVector[number])) / scalar; +++ } ++ } ++ #endif /* LV_HAVE_AVX2 */ ++ ++ #ifdef LV_HAVE_AVX ++ #include ++ ++-static inline void ++-volk_16i_s32f_convert_32f_a_avx(float* outputVector, const int16_t* inputVector, ++- const float scalar, unsigned int num_points) +++static inline void volk_16i_s32f_convert_32f_a_avx(float* outputVector, +++ const int16_t* inputVector, +++ const float scalar, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int eighthPoints = num_points / 8; +++ unsigned int number = 0; +++ const unsigned int eighthPoints = num_points / 8; ++ ++- float* outputVectorPtr = outputVector; ++- __m128 invScalar = _mm_set_ps1(1.0/scalar); ++- int16_t* inputPtr = (int16_t*)inputVector; ++- __m128i inputVal, inputVal2; ++- 
__m128 ret; ++- __m256 output; ++- __m256 dummy = _mm256_setzero_ps(); +++ float* outputVectorPtr = outputVector; +++ __m128 invScalar = _mm_set_ps1(1.0 / scalar); +++ int16_t* inputPtr = (int16_t*)inputVector; +++ __m128i inputVal, inputVal2; +++ __m128 ret; +++ __m256 output; +++ __m256 dummy = _mm256_setzero_ps(); ++ ++- for(;number < eighthPoints; number++){ +++ for (; number < eighthPoints; number++) { ++ ++- // Load the 8 values ++- //inputVal = _mm_loadu_si128((__m128i*)inputPtr); ++- inputVal = _mm_load_si128((__m128i*)inputPtr); +++ // Load the 8 values +++ // inputVal = _mm_loadu_si128((__m128i*)inputPtr); +++ inputVal = _mm_load_si128((__m128i*)inputPtr); ++ ++- // Shift the input data to the right by 64 bits ( 8 bytes ) ++- inputVal2 = _mm_srli_si128(inputVal, 8); +++ // Shift the input data to the right by 64 bits ( 8 bytes ) +++ inputVal2 = _mm_srli_si128(inputVal, 8); ++ ++- // Convert the lower 4 values into 32 bit words ++- inputVal = _mm_cvtepi16_epi32(inputVal); ++- inputVal2 = _mm_cvtepi16_epi32(inputVal2); +++ // Convert the lower 4 values into 32 bit words +++ inputVal = _mm_cvtepi16_epi32(inputVal); +++ inputVal2 = _mm_cvtepi16_epi32(inputVal2); ++ ++- ret = _mm_cvtepi32_ps(inputVal); ++- ret = _mm_mul_ps(ret, invScalar); ++- output = _mm256_insertf128_ps(dummy, ret, 0); +++ ret = _mm_cvtepi32_ps(inputVal); +++ ret = _mm_mul_ps(ret, invScalar); +++ output = _mm256_insertf128_ps(dummy, ret, 0); ++ ++- ret = _mm_cvtepi32_ps(inputVal2); ++- ret = _mm_mul_ps(ret, invScalar); ++- output = _mm256_insertf128_ps(output, ret, 1); +++ ret = _mm_cvtepi32_ps(inputVal2); +++ ret = _mm_mul_ps(ret, invScalar); +++ output = _mm256_insertf128_ps(output, ret, 1); ++ ++- _mm256_store_ps(outputVectorPtr, output); +++ _mm256_store_ps(outputVectorPtr, output); ++ ++- outputVectorPtr += 8; +++ outputVectorPtr += 8; ++ ++- inputPtr += 8; ++- } +++ inputPtr += 8; +++ } ++ ++- number = eighthPoints * 8; ++- for(; number < num_points; number++){ ++- outputVector[number] =((float)(inputVector[number])) / scalar; ++- } +++ number = eighthPoints * 8; +++ for (; number < num_points; number++) { +++ outputVector[number] = ((float)(inputVector[number])) / scalar; +++ } ++ } ++ #endif /* LV_HAVE_AVX */ ++ ++ #ifdef LV_HAVE_SSE4_1 ++ #include ++ ++-static inline void ++-volk_16i_s32f_convert_32f_a_sse4_1(float* outputVector, const int16_t* inputVector, ++- const float scalar, unsigned int num_points) +++static inline void volk_16i_s32f_convert_32f_a_sse4_1(float* outputVector, +++ const int16_t* inputVector, +++ const float scalar, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int eighthPoints = num_points / 8; +++ unsigned int number = 0; +++ const unsigned int eighthPoints = num_points / 8; ++ ++- float* outputVectorPtr = outputVector; ++- __m128 invScalar = _mm_set_ps1(1.0/scalar); ++- int16_t* inputPtr = (int16_t*)inputVector; ++- __m128i inputVal; ++- __m128i inputVal2; ++- __m128 ret; +++ float* outputVectorPtr = outputVector; +++ __m128 invScalar = _mm_set_ps1(1.0 / scalar); +++ int16_t* inputPtr = (int16_t*)inputVector; +++ __m128i inputVal; +++ __m128i inputVal2; +++ __m128 ret; ++ ++- for(;number < eighthPoints; number++){ +++ for (; number < eighthPoints; number++) { ++ ++- // Load the 8 values ++- inputVal = _mm_loadu_si128((__m128i*)inputPtr); +++ // Load the 8 values +++ inputVal = _mm_loadu_si128((__m128i*)inputPtr); ++ ++- // Shift the input data to the right by 64 bits ( 8 bytes ) ++- inputVal2 = _mm_srli_si128(inputVal, 8); +++ // Shift the input 
data to the right by 64 bits ( 8 bytes ) +++ inputVal2 = _mm_srli_si128(inputVal, 8); ++ ++- // Convert the lower 4 values into 32 bit words ++- inputVal = _mm_cvtepi16_epi32(inputVal); ++- inputVal2 = _mm_cvtepi16_epi32(inputVal2); +++ // Convert the lower 4 values into 32 bit words +++ inputVal = _mm_cvtepi16_epi32(inputVal); +++ inputVal2 = _mm_cvtepi16_epi32(inputVal2); ++ ++- ret = _mm_cvtepi32_ps(inputVal); ++- ret = _mm_mul_ps(ret, invScalar); ++- _mm_storeu_ps(outputVectorPtr, ret); ++- outputVectorPtr += 4; +++ ret = _mm_cvtepi32_ps(inputVal); +++ ret = _mm_mul_ps(ret, invScalar); +++ _mm_storeu_ps(outputVectorPtr, ret); +++ outputVectorPtr += 4; ++ ++- ret = _mm_cvtepi32_ps(inputVal2); ++- ret = _mm_mul_ps(ret, invScalar); ++- _mm_storeu_ps(outputVectorPtr, ret); +++ ret = _mm_cvtepi32_ps(inputVal2); +++ ret = _mm_mul_ps(ret, invScalar); +++ _mm_storeu_ps(outputVectorPtr, ret); ++ ++- outputVectorPtr += 4; +++ outputVectorPtr += 4; ++ ++- inputPtr += 8; ++- } +++ inputPtr += 8; +++ } ++ ++- number = eighthPoints * 8; ++- for(; number < num_points; number++){ ++- outputVector[number] =((float)(inputVector[number])) / scalar; ++- } +++ number = eighthPoints * 8; +++ for (; number < num_points; number++) { +++ outputVector[number] = ((float)(inputVector[number])) / scalar; +++ } ++ } ++ #endif /* LV_HAVE_SSE4_1 */ ++ ++ #ifdef LV_HAVE_SSE ++ #include ++ ++-static inline void ++-volk_16i_s32f_convert_32f_a_sse(float* outputVector, const int16_t* inputVector, ++- const float scalar, unsigned int num_points) +++static inline void volk_16i_s32f_convert_32f_a_sse(float* outputVector, +++ const int16_t* inputVector, +++ const float scalar, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int quarterPoints = num_points / 4; ++- ++- float* outputVectorPtr = outputVector; ++- __m128 invScalar = _mm_set_ps1(1.0/scalar); ++- int16_t* inputPtr = (int16_t*)inputVector; ++- __m128 ret; ++- ++- for(;number < quarterPoints; number++){ ++- ret = _mm_set_ps((float)(inputPtr[3]), (float)(inputPtr[2]), (float)(inputPtr[1]), (float)(inputPtr[0])); ++- ++- ret = _mm_mul_ps(ret, invScalar); ++- _mm_storeu_ps(outputVectorPtr, ret); ++- ++- inputPtr += 4; ++- outputVectorPtr += 4; ++- } ++- ++- number = quarterPoints * 4; ++- for(; number < num_points; number++){ ++- outputVector[number] = (float)(inputVector[number]) / scalar; ++- } +++ unsigned int number = 0; +++ const unsigned int quarterPoints = num_points / 4; +++ +++ float* outputVectorPtr = outputVector; +++ __m128 invScalar = _mm_set_ps1(1.0 / scalar); +++ int16_t* inputPtr = (int16_t*)inputVector; +++ __m128 ret; +++ +++ for (; number < quarterPoints; number++) { +++ ret = _mm_set_ps((float)(inputPtr[3]), +++ (float)(inputPtr[2]), +++ (float)(inputPtr[1]), +++ (float)(inputPtr[0])); +++ +++ ret = _mm_mul_ps(ret, invScalar); +++ _mm_storeu_ps(outputVectorPtr, ret); +++ +++ inputPtr += 4; +++ outputVectorPtr += 4; +++ } +++ +++ number = quarterPoints * 4; +++ for (; number < num_points; number++) { +++ outputVector[number] = (float)(inputVector[number]) / scalar; +++ } ++ } ++ #endif /* LV_HAVE_SSE */ ++ ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void ++-volk_16i_s32f_convert_32f_a_generic(float* outputVector, const int16_t* inputVector, ++- const float scalar, unsigned int num_points) +++static inline void volk_16i_s32f_convert_32f_a_generic(float* outputVector, +++ const int16_t* inputVector, +++ const float scalar, +++ unsigned int num_points) ++ { ++- float* outputVectorPtr = outputVector; ++- const int16_t* 
inputVectorPtr = inputVector; ++- unsigned int number = 0; +++ float* outputVectorPtr = outputVector; +++ const int16_t* inputVectorPtr = inputVector; +++ unsigned int number = 0; ++ ++- for(number = 0; number < num_points; number++){ ++- *outputVectorPtr++ = ((float)(*inputVectorPtr++)) / scalar; ++- } +++ for (number = 0; number < num_points; number++) { +++ *outputVectorPtr++ = ((float)(*inputVectorPtr++)) / scalar; +++ } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++diff --git a/kernels/volk/volk_16i_x4_quad_max_star_16i.h b/kernels/volk/volk_16i_x4_quad_max_star_16i.h ++index 6aa74c7..619cc90 100644 ++--- a/kernels/volk/volk_16i_x4_quad_max_star_16i.h +++++ b/kernels/volk/volk_16i_x4_quad_max_star_16i.h ++@@ -29,8 +29,8 @@ ++ * ++ * Dispatcher Prototype ++ * \code ++- * void volk_16i_x4_quad_max_star_16i(short* target, short* src0, short* src1, short* src2, short* src3, unsigned int num_points) ++- * \endcode +++ * void volk_16i_x4_quad_max_star_16i(short* target, short* src0, short* src1, short* +++ * src2, short* src3, unsigned int num_points) \endcode ++ * ++ * \b Inputs ++ * \li src0: The input vector 0. ++@@ -55,149 +55,152 @@ ++ #ifndef INCLUDED_volk_16i_x4_quad_max_star_16i_a_H ++ #define INCLUDED_volk_16i_x4_quad_max_star_16i_a_H ++ ++-#include ++-#include +++#include +++#include ++ ++ #ifdef LV_HAVE_SSE2 ++ ++-#include +++#include ++ ++-static inline void ++-volk_16i_x4_quad_max_star_16i_a_sse2(short* target, short* src0, short* src1, ++- short* src2, short* src3, unsigned int num_points) +++static inline void volk_16i_x4_quad_max_star_16i_a_sse2(short* target, +++ short* src0, +++ short* src1, +++ short* src2, +++ short* src3, +++ unsigned int num_points) ++ { ++- const unsigned int num_bytes = num_points*2; ++- ++- int i = 0; +++ const unsigned int num_bytes = num_points * 2; ++ ++- int bound = (num_bytes >> 4); ++- int bound_copy = bound; ++- int leftovers = (num_bytes >> 1) & 7; +++ int i = 0; ++ ++- __m128i *p_target, *p_src0, *p_src1, *p_src2, *p_src3; ++- p_target = (__m128i*) target; ++- p_src0 = (__m128i*)src0; ++- p_src1 = (__m128i*)src1; ++- p_src2 = (__m128i*)src2; ++- p_src3 = (__m128i*)src3; +++ int bound = (num_bytes >> 4); +++ int bound_copy = bound; +++ int leftovers = (num_bytes >> 1) & 7; ++ ++- __m128i xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8; +++ __m128i *p_target, *p_src0, *p_src1, *p_src2, *p_src3; +++ p_target = (__m128i*)target; +++ p_src0 = (__m128i*)src0; +++ p_src1 = (__m128i*)src1; +++ p_src2 = (__m128i*)src2; +++ p_src3 = (__m128i*)src3; ++ ++- while(bound_copy > 0) { ++- xmm1 = _mm_load_si128(p_src0); ++- xmm2 = _mm_load_si128(p_src1); ++- xmm3 = _mm_load_si128(p_src2); ++- xmm4 = _mm_load_si128(p_src3); +++ __m128i xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8; ++ ++- xmm5 = _mm_setzero_si128(); ++- xmm6 = _mm_setzero_si128(); ++- xmm7 = xmm1; ++- xmm8 = xmm3; +++ while (bound_copy > 0) { +++ xmm1 = _mm_load_si128(p_src0); +++ xmm2 = _mm_load_si128(p_src1); +++ xmm3 = _mm_load_si128(p_src2); +++ xmm4 = _mm_load_si128(p_src3); ++ ++- xmm1 = _mm_sub_epi16(xmm2, xmm1); +++ xmm5 = _mm_setzero_si128(); +++ xmm6 = _mm_setzero_si128(); +++ xmm7 = xmm1; +++ xmm8 = xmm3; ++ ++- xmm3 = _mm_sub_epi16(xmm4, xmm3); +++ xmm1 = _mm_sub_epi16(xmm2, xmm1); ++ ++- xmm5 = _mm_cmpgt_epi16(xmm1, xmm5); ++- xmm6 = _mm_cmpgt_epi16(xmm3, xmm6); +++ xmm3 = _mm_sub_epi16(xmm4, xmm3); ++ ++- xmm2 = _mm_and_si128(xmm5, xmm2); ++- xmm4 = _mm_and_si128(xmm6, xmm4); ++- xmm5 = _mm_andnot_si128(xmm5, xmm7); ++- xmm6 = _mm_andnot_si128(xmm6, xmm8); +++ xmm5 = 
_mm_cmpgt_epi16(xmm1, xmm5); +++ xmm6 = _mm_cmpgt_epi16(xmm3, xmm6); ++ ++- xmm5 = _mm_add_epi16(xmm2, xmm5); ++- xmm6 = _mm_add_epi16(xmm4, xmm6); +++ xmm2 = _mm_and_si128(xmm5, xmm2); +++ xmm4 = _mm_and_si128(xmm6, xmm4); +++ xmm5 = _mm_andnot_si128(xmm5, xmm7); +++ xmm6 = _mm_andnot_si128(xmm6, xmm8); ++ ++- xmm1 = _mm_xor_si128(xmm1, xmm1); ++- xmm2 = xmm5; ++- xmm5 = _mm_sub_epi16(xmm6, xmm5); ++- p_src0 += 1; ++- bound_copy -= 1; +++ xmm5 = _mm_add_epi16(xmm2, xmm5); +++ xmm6 = _mm_add_epi16(xmm4, xmm6); ++ ++- xmm1 = _mm_cmpgt_epi16(xmm5, xmm1); ++- p_src1 += 1; +++ xmm1 = _mm_xor_si128(xmm1, xmm1); +++ xmm2 = xmm5; +++ xmm5 = _mm_sub_epi16(xmm6, xmm5); +++ p_src0 += 1; +++ bound_copy -= 1; ++ ++- xmm6 = _mm_and_si128(xmm1, xmm6); +++ xmm1 = _mm_cmpgt_epi16(xmm5, xmm1); +++ p_src1 += 1; ++ ++- xmm1 = _mm_andnot_si128(xmm1, xmm2); ++- p_src2 += 1; +++ xmm6 = _mm_and_si128(xmm1, xmm6); ++ ++- xmm1 = _mm_add_epi16(xmm6, xmm1); ++- p_src3 += 1; +++ xmm1 = _mm_andnot_si128(xmm1, xmm2); +++ p_src2 += 1; ++ ++- _mm_store_si128(p_target, xmm1); ++- p_target += 1; +++ xmm1 = _mm_add_epi16(xmm6, xmm1); +++ p_src3 += 1; ++ ++- } +++ _mm_store_si128(p_target, xmm1); +++ p_target += 1; +++ } ++ ++ ++- /*__VOLK_ASM __VOLK_VOLATILE ++- ( ++- "volk_16i_x4_quad_max_star_16i_a_sse2_L1:\n\t" ++- "cmp $0, %[bound]\n\t" ++- "je volk_16i_x4_quad_max_star_16i_a_sse2_END\n\t" +++ /*__VOLK_ASM __VOLK_VOLATILE +++ ( +++ "volk_16i_x4_quad_max_star_16i_a_sse2_L1:\n\t" +++ "cmp $0, %[bound]\n\t" +++ "je volk_16i_x4_quad_max_star_16i_a_sse2_END\n\t" ++ ++- "movaps (%[src0]), %%xmm1\n\t" ++- "movaps (%[src1]), %%xmm2\n\t" ++- "movaps (%[src2]), %%xmm3\n\t" ++- "movaps (%[src3]), %%xmm4\n\t" +++ "movaps (%[src0]), %%xmm1\n\t" +++ "movaps (%[src1]), %%xmm2\n\t" +++ "movaps (%[src2]), %%xmm3\n\t" +++ "movaps (%[src3]), %%xmm4\n\t" ++ ++- "pxor %%xmm5, %%xmm5\n\t" ++- "pxor %%xmm6, %%xmm6\n\t" ++- "movaps %%xmm1, %%xmm7\n\t" ++- "movaps %%xmm3, %%xmm8\n\t" ++- "psubw %%xmm2, %%xmm1\n\t" ++- "psubw %%xmm4, %%xmm3\n\t" +++ "pxor %%xmm5, %%xmm5\n\t" +++ "pxor %%xmm6, %%xmm6\n\t" +++ "movaps %%xmm1, %%xmm7\n\t" +++ "movaps %%xmm3, %%xmm8\n\t" +++ "psubw %%xmm2, %%xmm1\n\t" +++ "psubw %%xmm4, %%xmm3\n\t" ++ ++- "pcmpgtw %%xmm1, %%xmm5\n\t" ++- "pcmpgtw %%xmm3, %%xmm6\n\t" +++ "pcmpgtw %%xmm1, %%xmm5\n\t" +++ "pcmpgtw %%xmm3, %%xmm6\n\t" ++ ++- "pand %%xmm5, %%xmm2\n\t" ++- "pand %%xmm6, %%xmm4\n\t" ++- "pandn %%xmm7, %%xmm5\n\t" ++- "pandn %%xmm8, %%xmm6\n\t" +++ "pand %%xmm5, %%xmm2\n\t" +++ "pand %%xmm6, %%xmm4\n\t" +++ "pandn %%xmm7, %%xmm5\n\t" +++ "pandn %%xmm8, %%xmm6\n\t" ++ ++- "paddw %%xmm2, %%xmm5\n\t" ++- "paddw %%xmm4, %%xmm6\n\t" +++ "paddw %%xmm2, %%xmm5\n\t" +++ "paddw %%xmm4, %%xmm6\n\t" ++ ++- "pxor %%xmm1, %%xmm1\n\t" ++- "movaps %%xmm5, %%xmm2\n\t" +++ "pxor %%xmm1, %%xmm1\n\t" +++ "movaps %%xmm5, %%xmm2\n\t" ++ ++- "psubw %%xmm6, %%xmm5\n\t" ++- "add $16, %[src0]\n\t" ++- "add $-1, %[bound]\n\t" +++ "psubw %%xmm6, %%xmm5\n\t" +++ "add $16, %[src0]\n\t" +++ "add $-1, %[bound]\n\t" ++ ++- "pcmpgtw %%xmm5, %%xmm1\n\t" ++- "add $16, %[src1]\n\t" +++ "pcmpgtw %%xmm5, %%xmm1\n\t" +++ "add $16, %[src1]\n\t" ++ ++- "pand %%xmm1, %%xmm6\n\t" +++ "pand %%xmm1, %%xmm6\n\t" ++ ++- "pandn %%xmm2, %%xmm1\n\t" ++- "add $16, %[src2]\n\t" +++ "pandn %%xmm2, %%xmm1\n\t" +++ "add $16, %[src2]\n\t" ++ ++- "paddw %%xmm6, %%xmm1\n\t" ++- "add $16, %[src3]\n\t" +++ "paddw %%xmm6, %%xmm1\n\t" +++ "add $16, %[src3]\n\t" ++ ++- "movaps %%xmm1, (%[target])\n\t" ++- "addw $16, %[target]\n\t" ++- "jmp 
volk_16i_x4_quad_max_star_16i_a_sse2_L1\n\t" +++ "movaps %%xmm1, (%[target])\n\t" +++ "addw $16, %[target]\n\t" +++ "jmp volk_16i_x4_quad_max_star_16i_a_sse2_L1\n\t" ++ ++- "volk_16i_x4_quad_max_star_16i_a_sse2_END:\n\t" ++- : ++- :[bound]"r"(bound), [src0]"r"(src0), [src1]"r"(src1), [src2]"r"(src2), [src3]"r"(src3), [target]"r"(target) ++- : ++- ); ++- */ +++ "volk_16i_x4_quad_max_star_16i_a_sse2_END:\n\t" +++ : +++ :[bound]"r"(bound), [src0]"r"(src0), [src1]"r"(src1), [src2]"r"(src2), +++ [src3]"r"(src3), [target]"r"(target) +++ : +++ ); +++ */ ++ ++- short temp0 = 0; ++- short temp1 = 0; ++- for(i = bound * 8; i < (bound * 8) + leftovers; ++i) { ++- temp0 = ((short)(src0[i] - src1[i]) > 0) ? src0[i] : src1[i]; ++- temp1 = ((short)(src2[i] - src3[i])>0) ? src2[i] : src3[i]; ++- target[i] = ((short)(temp0 - temp1)>0) ? temp0 : temp1; ++- } ++- return; +++ short temp0 = 0; +++ short temp1 = 0; +++ for (i = bound * 8; i < (bound * 8) + leftovers; ++i) { +++ temp0 = ((short)(src0[i] - src1[i]) > 0) ? src0[i] : src1[i]; +++ temp1 = ((short)(src2[i] - src3[i]) > 0) ? src2[i] : src3[i]; +++ target[i] = ((short)(temp0 - temp1) > 0) ? temp0 : temp1; +++ } +++ return; ++ } ++ ++ #endif /*LV_HAVE_SSE2*/ ++@@ -206,85 +209,91 @@ volk_16i_x4_quad_max_star_16i_a_sse2(short* target, short* src0, short* src1, ++ ++ #include ++ ++-static inline void ++-volk_16i_x4_quad_max_star_16i_neon(short* target, short* src0, short* src1, ++- short* src2, short* src3, unsigned int num_points) +++static inline void volk_16i_x4_quad_max_star_16i_neon(short* target, +++ short* src0, +++ short* src1, +++ short* src2, +++ short* src3, +++ unsigned int num_points) ++ { ++- const unsigned int eighth_points = num_points / 8; ++- unsigned i; ++- ++- int16x8_t src0_vec, src1_vec, src2_vec, src3_vec; ++- int16x8_t diff12, diff34; ++- int16x8_t comp0, comp1, comp2, comp3; ++- int16x8_t result1_vec, result2_vec; ++- int16x8_t zeros; ++- zeros = vdupq_n_s16(0); ++- for(i=0; i < eighth_points; ++i) { ++- src0_vec = vld1q_s16(src0); ++- src1_vec = vld1q_s16(src1); ++- src2_vec = vld1q_s16(src2); ++- src3_vec = vld1q_s16(src3); ++- diff12 = vsubq_s16(src0_vec, src1_vec); ++- diff34 = vsubq_s16(src2_vec, src3_vec); ++- comp0 = (int16x8_t)vcgeq_s16(diff12, zeros); ++- comp1 = (int16x8_t)vcltq_s16(diff12, zeros); ++- comp2 = (int16x8_t)vcgeq_s16(diff34, zeros); ++- comp3 = (int16x8_t)vcltq_s16(diff34, zeros); ++- comp0 = vandq_s16(src0_vec, comp0); ++- comp1 = vandq_s16(src1_vec, comp1); ++- comp2 = vandq_s16(src2_vec, comp2); ++- comp3 = vandq_s16(src3_vec, comp3); ++- ++- result1_vec = vaddq_s16(comp0, comp1); ++- result2_vec = vaddq_s16(comp2, comp3); ++- ++- diff12 = vsubq_s16(result1_vec, result2_vec); ++- comp0 = (int16x8_t)vcgeq_s16(diff12, zeros); ++- comp1 = (int16x8_t)vcltq_s16(diff12, zeros); ++- comp0 = vandq_s16(result1_vec, comp0); ++- comp1 = vandq_s16(result2_vec, comp1); ++- result1_vec = vaddq_s16(comp0, comp1); ++- vst1q_s16(target, result1_vec); ++- src0 += 8; ++- src1 += 8; ++- src2 += 8; ++- src3 += 8; ++- target += 8; +++ const unsigned int eighth_points = num_points / 8; +++ unsigned i; +++ +++ int16x8_t src0_vec, src1_vec, src2_vec, src3_vec; +++ int16x8_t diff12, diff34; +++ int16x8_t comp0, comp1, comp2, comp3; +++ int16x8_t result1_vec, result2_vec; +++ int16x8_t zeros; +++ zeros = vdupq_n_s16(0); +++ for (i = 0; i < eighth_points; ++i) { +++ src0_vec = vld1q_s16(src0); +++ src1_vec = vld1q_s16(src1); +++ src2_vec = vld1q_s16(src2); +++ src3_vec = vld1q_s16(src3); +++ diff12 = vsubq_s16(src0_vec, src1_vec); 
+++ diff34 = vsubq_s16(src2_vec, src3_vec); +++ comp0 = (int16x8_t)vcgeq_s16(diff12, zeros); +++ comp1 = (int16x8_t)vcltq_s16(diff12, zeros); +++ comp2 = (int16x8_t)vcgeq_s16(diff34, zeros); +++ comp3 = (int16x8_t)vcltq_s16(diff34, zeros); +++ comp0 = vandq_s16(src0_vec, comp0); +++ comp1 = vandq_s16(src1_vec, comp1); +++ comp2 = vandq_s16(src2_vec, comp2); +++ comp3 = vandq_s16(src3_vec, comp3); +++ +++ result1_vec = vaddq_s16(comp0, comp1); +++ result2_vec = vaddq_s16(comp2, comp3); +++ +++ diff12 = vsubq_s16(result1_vec, result2_vec); +++ comp0 = (int16x8_t)vcgeq_s16(diff12, zeros); +++ comp1 = (int16x8_t)vcltq_s16(diff12, zeros); +++ comp0 = vandq_s16(result1_vec, comp0); +++ comp1 = vandq_s16(result2_vec, comp1); +++ result1_vec = vaddq_s16(comp0, comp1); +++ vst1q_s16(target, result1_vec); +++ src0 += 8; +++ src1 += 8; +++ src2 += 8; +++ src3 += 8; +++ target += 8; ++ } ++ ++- short temp0 = 0; ++- short temp1 = 0; ++- for(i=eighth_points*8; i < num_points; ++i) { ++- temp0 = ((short)(*src0 - *src1) > 0) ? *src0 : *src1; ++- temp1 = ((short)(*src2 - *src3) > 0) ? *src2 : *src3; ++- *target++ = ((short)(temp0 - temp1)>0) ? temp0 : temp1; ++- src0++; ++- src1++; ++- src2++; ++- src3++; ++- } +++ short temp0 = 0; +++ short temp1 = 0; +++ for (i = eighth_points * 8; i < num_points; ++i) { +++ temp0 = ((short)(*src0 - *src1) > 0) ? *src0 : *src1; +++ temp1 = ((short)(*src2 - *src3) > 0) ? *src2 : *src3; +++ *target++ = ((short)(temp0 - temp1) > 0) ? temp0 : temp1; +++ src0++; +++ src1++; +++ src2++; +++ src3++; +++ } ++ } ++ #endif /* LV_HAVE_NEON */ ++ ++ ++ #ifdef LV_HAVE_GENERIC ++-static inline void ++-volk_16i_x4_quad_max_star_16i_generic(short* target, short* src0, short* src1, ++- short* src2, short* src3, unsigned int num_points) +++static inline void volk_16i_x4_quad_max_star_16i_generic(short* target, +++ short* src0, +++ short* src1, +++ short* src2, +++ short* src3, +++ unsigned int num_points) ++ { ++- const unsigned int num_bytes = num_points*2; +++ const unsigned int num_bytes = num_points * 2; ++ ++- int i = 0; +++ int i = 0; ++ ++- int bound = num_bytes >> 1; +++ int bound = num_bytes >> 1; ++ ++- short temp0 = 0; ++- short temp1 = 0; ++- for(i = 0; i < bound; ++i) { ++- temp0 = ((short)(src0[i] - src1[i]) > 0) ? src0[i] : src1[i]; ++- temp1 = ((short)(src2[i] - src3[i])>0) ? src2[i] : src3[i]; ++- target[i] = ((short)(temp0 - temp1)>0) ? temp0 : temp1; ++- } +++ short temp0 = 0; +++ short temp1 = 0; +++ for (i = 0; i < bound; ++i) { +++ temp0 = ((short)(src0[i] - src1[i]) > 0) ? src0[i] : src1[i]; +++ temp1 = ((short)(src2[i] - src3[i]) > 0) ? src2[i] : src3[i]; +++ target[i] = ((short)(temp0 - temp1) > 0) ? temp0 : temp1; +++ } ++ } ++ ++ #endif /*LV_HAVE_GENERIC*/ ++diff --git a/kernels/volk/volk_16i_x5_add_quad_16i_x4.h b/kernels/volk/volk_16i_x5_add_quad_16i_x4.h ++index 30417de..f735f11 100644 ++--- a/kernels/volk/volk_16i_x5_add_quad_16i_x4.h +++++ b/kernels/volk/volk_16i_x5_add_quad_16i_x4.h ++@@ -29,8 +29,9 @@ ++ * ++ * Dispatcher Prototype ++ * \code ++- * void volk_16i_x5_add_quad_16i_x4(short* target0, short* target1, short* target2, short* target3, short* src0, short* src1, short* src2, short* src3, short* src4, unsigned int num_points); ++- * \endcode +++ * void volk_16i_x5_add_quad_16i_x4(short* target0, short* target1, short* target2, short* +++ * target3, short* src0, short* src1, short* src2, short* src3, short* src4, unsigned int +++ * num_points); \endcode ++ * ++ * \b Inputs ++ * \li src0: The input vector 0. 
++@@ -59,182 +60,203 @@ ++ #ifndef INCLUDED_volk_16i_x5_add_quad_16i_x4_a_H ++ #define INCLUDED_volk_16i_x5_add_quad_16i_x4_a_H ++ ++-#include ++-#include +++#include +++#include ++ ++ #ifdef LV_HAVE_SSE2 ++-#include ++-#include +++#include +++#include ++ ++-static inline void ++-volk_16i_x5_add_quad_16i_x4_a_sse2(short* target0, short* target1, short* target2, short* target3, ++- short* src0, short* src1, short* src2, short* src3, short* src4, ++- unsigned int num_points) +++static inline void volk_16i_x5_add_quad_16i_x4_a_sse2(short* target0, +++ short* target1, +++ short* target2, +++ short* target3, +++ short* src0, +++ short* src1, +++ short* src2, +++ short* src3, +++ short* src4, +++ unsigned int num_points) ++ { ++- const unsigned int num_bytes = num_points*2; ++- ++- __m128i xmm0, xmm1, xmm2, xmm3, xmm4; ++- __m128i *p_target0, *p_target1, *p_target2, *p_target3, *p_src0, *p_src1, *p_src2, *p_src3, *p_src4; ++- p_target0 = (__m128i*)target0; ++- p_target1 = (__m128i*)target1; ++- p_target2 = (__m128i*)target2; ++- p_target3 = (__m128i*)target3; ++- ++- p_src0 = (__m128i*)src0; ++- p_src1 = (__m128i*)src1; ++- p_src2 = (__m128i*)src2; ++- p_src3 = (__m128i*)src3; ++- p_src4 = (__m128i*)src4; ++- ++- int i = 0; ++- ++- int bound = (num_bytes >> 4); ++- int leftovers = (num_bytes >> 1) & 7; ++- ++- for(; i < bound; ++i) { ++- xmm0 = _mm_load_si128(p_src0); ++- xmm1 = _mm_load_si128(p_src1); ++- xmm2 = _mm_load_si128(p_src2); ++- xmm3 = _mm_load_si128(p_src3); ++- xmm4 = _mm_load_si128(p_src4); ++- ++- p_src0 += 1; ++- p_src1 += 1; ++- ++- xmm1 = _mm_add_epi16(xmm0, xmm1); ++- xmm2 = _mm_add_epi16(xmm0, xmm2); ++- xmm3 = _mm_add_epi16(xmm0, xmm3); ++- xmm4 = _mm_add_epi16(xmm0, xmm4); ++- ++- ++- p_src2 += 1; ++- p_src3 += 1; ++- p_src4 += 1; ++- ++- _mm_store_si128(p_target0, xmm1); ++- _mm_store_si128(p_target1, xmm2); ++- _mm_store_si128(p_target2, xmm3); ++- _mm_store_si128(p_target3, xmm4); ++- ++- p_target0 += 1; ++- p_target1 += 1; ++- p_target2 += 1; ++- p_target3 += 1; ++- } ++- /*__VOLK_ASM __VOLK_VOLATILE ++- ( ++- ".%=volk_16i_x5_add_quad_16i_x4_a_sse2_L1:\n\t" ++- "cmp $0, %[bound]\n\t" ++- "je .%=volk_16i_x5_add_quad_16i_x4_a_sse2_END\n\t" ++- "movaps (%[src0]), %%xmm1\n\t" ++- "movaps (%[src1]), %%xmm2\n\t" ++- "movaps (%[src2]), %%xmm3\n\t" ++- "movaps (%[src3]), %%xmm4\n\t" ++- "movaps (%[src4]), %%xmm5\n\t" ++- "add $16, %[src0]\n\t" ++- "add $16, %[src1]\n\t" ++- "add $16, %[src2]\n\t" ++- "add $16, %[src3]\n\t" ++- "add $16, %[src4]\n\t" ++- "paddw %%xmm1, %%xmm2\n\t" ++- "paddw %%xmm1, %%xmm3\n\t" ++- "paddw %%xmm1, %%xmm4\n\t" ++- "paddw %%xmm1, %%xmm5\n\t" ++- "add $-1, %[bound]\n\t" ++- "movaps %%xmm2, (%[target0])\n\t" ++- "movaps %%xmm3, (%[target1])\n\t" ++- "movaps %%xmm4, (%[target2])\n\t" ++- "movaps %%xmm5, (%[target3])\n\t" ++- "add $16, %[target0]\n\t" ++- "add $16, %[target1]\n\t" ++- "add $16, %[target2]\n\t" ++- "add $16, %[target3]\n\t" ++- "jmp .%=volk_16i_x5_add_quad_16i_x4_a_sse2_L1\n\t" ++- ".%=volk_16i_x5_add_quad_16i_x4_a_sse2_END:\n\t" ++- : ++- :[bound]"r"(bound), [src0]"r"(src0), [src1]"r"(src1), [src2]"r"(src2), [src3]"r"(src3), [src4]"r"(src4), [target0]"r"(target0), [target1]"r"(target1), [target2]"r"(target2), [target3]"r"(target3) ++- :"xmm1", "xmm2", "xmm3", "xmm4", "xmm5" ++- ); ++- */ ++- ++- for(i = bound * 8; i < (bound * 8) + leftovers; ++i) { ++- target0[i] = src0[i] + src1[i]; ++- target1[i] = src0[i] + src2[i]; ++- target2[i] = src0[i] + src3[i]; ++- target3[i] = src0[i] + src4[i]; ++- } +++ const unsigned int num_bytes = 
num_points * 2; +++ +++ __m128i xmm0, xmm1, xmm2, xmm3, xmm4; +++ __m128i *p_target0, *p_target1, *p_target2, *p_target3, *p_src0, *p_src1, *p_src2, +++ *p_src3, *p_src4; +++ p_target0 = (__m128i*)target0; +++ p_target1 = (__m128i*)target1; +++ p_target2 = (__m128i*)target2; +++ p_target3 = (__m128i*)target3; +++ +++ p_src0 = (__m128i*)src0; +++ p_src1 = (__m128i*)src1; +++ p_src2 = (__m128i*)src2; +++ p_src3 = (__m128i*)src3; +++ p_src4 = (__m128i*)src4; +++ +++ int i = 0; +++ +++ int bound = (num_bytes >> 4); +++ int leftovers = (num_bytes >> 1) & 7; +++ +++ for (; i < bound; ++i) { +++ xmm0 = _mm_load_si128(p_src0); +++ xmm1 = _mm_load_si128(p_src1); +++ xmm2 = _mm_load_si128(p_src2); +++ xmm3 = _mm_load_si128(p_src3); +++ xmm4 = _mm_load_si128(p_src4); +++ +++ p_src0 += 1; +++ p_src1 += 1; +++ +++ xmm1 = _mm_add_epi16(xmm0, xmm1); +++ xmm2 = _mm_add_epi16(xmm0, xmm2); +++ xmm3 = _mm_add_epi16(xmm0, xmm3); +++ xmm4 = _mm_add_epi16(xmm0, xmm4); +++ +++ +++ p_src2 += 1; +++ p_src3 += 1; +++ p_src4 += 1; +++ +++ _mm_store_si128(p_target0, xmm1); +++ _mm_store_si128(p_target1, xmm2); +++ _mm_store_si128(p_target2, xmm3); +++ _mm_store_si128(p_target3, xmm4); +++ +++ p_target0 += 1; +++ p_target1 += 1; +++ p_target2 += 1; +++ p_target3 += 1; +++ } +++ /*__VOLK_ASM __VOLK_VOLATILE +++ ( +++ ".%=volk_16i_x5_add_quad_16i_x4_a_sse2_L1:\n\t" +++ "cmp $0, %[bound]\n\t" +++ "je .%=volk_16i_x5_add_quad_16i_x4_a_sse2_END\n\t" +++ "movaps (%[src0]), %%xmm1\n\t" +++ "movaps (%[src1]), %%xmm2\n\t" +++ "movaps (%[src2]), %%xmm3\n\t" +++ "movaps (%[src3]), %%xmm4\n\t" +++ "movaps (%[src4]), %%xmm5\n\t" +++ "add $16, %[src0]\n\t" +++ "add $16, %[src1]\n\t" +++ "add $16, %[src2]\n\t" +++ "add $16, %[src3]\n\t" +++ "add $16, %[src4]\n\t" +++ "paddw %%xmm1, %%xmm2\n\t" +++ "paddw %%xmm1, %%xmm3\n\t" +++ "paddw %%xmm1, %%xmm4\n\t" +++ "paddw %%xmm1, %%xmm5\n\t" +++ "add $-1, %[bound]\n\t" +++ "movaps %%xmm2, (%[target0])\n\t" +++ "movaps %%xmm3, (%[target1])\n\t" +++ "movaps %%xmm4, (%[target2])\n\t" +++ "movaps %%xmm5, (%[target3])\n\t" +++ "add $16, %[target0]\n\t" +++ "add $16, %[target1]\n\t" +++ "add $16, %[target2]\n\t" +++ "add $16, %[target3]\n\t" +++ "jmp .%=volk_16i_x5_add_quad_16i_x4_a_sse2_L1\n\t" +++ ".%=volk_16i_x5_add_quad_16i_x4_a_sse2_END:\n\t" +++ : +++ :[bound]"r"(bound), [src0]"r"(src0), [src1]"r"(src1), [src2]"r"(src2), +++ [src3]"r"(src3), [src4]"r"(src4), [target0]"r"(target0), [target1]"r"(target1), +++ [target2]"r"(target2), [target3]"r"(target3) +++ :"xmm1", "xmm2", "xmm3", "xmm4", "xmm5" +++ ); +++ */ +++ +++ for (i = bound * 8; i < (bound * 8) + leftovers; ++i) { +++ target0[i] = src0[i] + src1[i]; +++ target1[i] = src0[i] + src2[i]; +++ target2[i] = src0[i] + src3[i]; +++ target3[i] = src0[i] + src4[i]; +++ } ++ } ++ #endif /*LV_HAVE_SSE2*/ ++ ++ #ifdef LV_HAVE_NEON ++ #include ++ ++-static inline void ++-volk_16i_x5_add_quad_16i_x4_neon(short* target0, short* target1, short* target2, short* target3, ++- short* src0, short* src1, short* src2, short* src3, short* src4, ++- unsigned int num_points) +++static inline void volk_16i_x5_add_quad_16i_x4_neon(short* target0, +++ short* target1, +++ short* target2, +++ short* target3, +++ short* src0, +++ short* src1, +++ short* src2, +++ short* src3, +++ short* src4, +++ unsigned int num_points) ++ { ++- const unsigned int eighth_points = num_points / 8; ++- unsigned int number = 0; ++- ++- int16x8_t src0_vec, src1_vec, src2_vec, src3_vec, src4_vec; ++- int16x8_t target0_vec, target1_vec, target2_vec, target3_vec; ++- for(number = 0; number 
< eighth_points; ++number) { ++- src0_vec = vld1q_s16(src0); ++- src1_vec = vld1q_s16(src1); ++- src2_vec = vld1q_s16(src2); ++- src3_vec = vld1q_s16(src3); ++- src4_vec = vld1q_s16(src4); ++- ++- target0_vec = vaddq_s16(src0_vec , src1_vec); ++- target1_vec = vaddq_s16(src0_vec , src2_vec); ++- target2_vec = vaddq_s16(src0_vec , src3_vec); ++- target3_vec = vaddq_s16(src0_vec , src4_vec); ++- ++- vst1q_s16(target0, target0_vec); ++- vst1q_s16(target1, target1_vec); ++- vst1q_s16(target2, target2_vec); ++- vst1q_s16(target3, target3_vec); ++- src0 += 8; ++- src1 += 8; ++- src2 += 8; ++- src3 += 8; ++- src4 += 8; ++- target0 += 8; ++- target1 += 8; ++- target2 += 8; ++- target3 += 8; ++- } ++- ++- for(number = eighth_points * 8; number < num_points; ++number) { ++- *target0++ = *src0 + *src1++; ++- *target1++ = *src0 + *src2++; ++- *target2++ = *src0 + *src3++; ++- *target3++ = *src0++ + *src4++; ++- } +++ const unsigned int eighth_points = num_points / 8; +++ unsigned int number = 0; +++ +++ int16x8_t src0_vec, src1_vec, src2_vec, src3_vec, src4_vec; +++ int16x8_t target0_vec, target1_vec, target2_vec, target3_vec; +++ for (number = 0; number < eighth_points; ++number) { +++ src0_vec = vld1q_s16(src0); +++ src1_vec = vld1q_s16(src1); +++ src2_vec = vld1q_s16(src2); +++ src3_vec = vld1q_s16(src3); +++ src4_vec = vld1q_s16(src4); +++ +++ target0_vec = vaddq_s16(src0_vec, src1_vec); +++ target1_vec = vaddq_s16(src0_vec, src2_vec); +++ target2_vec = vaddq_s16(src0_vec, src3_vec); +++ target3_vec = vaddq_s16(src0_vec, src4_vec); +++ +++ vst1q_s16(target0, target0_vec); +++ vst1q_s16(target1, target1_vec); +++ vst1q_s16(target2, target2_vec); +++ vst1q_s16(target3, target3_vec); +++ src0 += 8; +++ src1 += 8; +++ src2 += 8; +++ src3 += 8; +++ src4 += 8; +++ target0 += 8; +++ target1 += 8; +++ target2 += 8; +++ target3 += 8; +++ } +++ +++ for (number = eighth_points * 8; number < num_points; ++number) { +++ *target0++ = *src0 + *src1++; +++ *target1++ = *src0 + *src2++; +++ *target2++ = *src0 + *src3++; +++ *target3++ = *src0++ + *src4++; +++ } ++ } ++ ++ #endif /* LV_HAVE_NEON */ ++ ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void ++-volk_16i_x5_add_quad_16i_x4_generic(short* target0, short* target1, short* target2, short* target3, ++- short* src0, short* src1, short* src2, short* src3, short* src4, ++- unsigned int num_points) +++static inline void volk_16i_x5_add_quad_16i_x4_generic(short* target0, +++ short* target1, +++ short* target2, +++ short* target3, +++ short* src0, +++ short* src1, +++ short* src2, +++ short* src3, +++ short* src4, +++ unsigned int num_points) ++ { ++- const unsigned int num_bytes = num_points*2; +++ const unsigned int num_bytes = num_points * 2; ++ ++- int i = 0; +++ int i = 0; ++ ++- int bound = num_bytes >> 1; +++ int bound = num_bytes >> 1; ++ ++- for(i = 0; i < bound; ++i) { ++- target0[i] = src0[i] + src1[i]; ++- target1[i] = src0[i] + src2[i]; ++- target2[i] = src0[i] + src3[i]; ++- target3[i] = src0[i] + src4[i]; ++- } +++ for (i = 0; i < bound; ++i) { +++ target0[i] = src0[i] + src1[i]; +++ target1[i] = src0[i] + src2[i]; +++ target2[i] = src0[i] + src3[i]; +++ target3[i] = src0[i] + src4[i]; +++ } ++ } ++ ++ #endif /* LV_HAVE_GENERIC */ ++diff --git a/kernels/volk/volk_16ic_convert_32fc.h b/kernels/volk/volk_16ic_convert_32fc.h ++index 84f067c..1453724 100644 ++--- a/kernels/volk/volk_16ic_convert_32fc.h +++++ b/kernels/volk/volk_16ic_convert_32fc.h ++@@ -30,8 +30,8 @@ ++ * ++ * Dispatcher Prototype ++ * \code ++- * void volk_16ic_convert_32fc(lv_32fc_t* 
outputVector, const lv_16sc_t* inputVector, unsigned int num_points) ++- * \endcode +++ * void volk_16ic_convert_32fc(lv_32fc_t* outputVector, const lv_16sc_t* inputVector, +++ * unsigned int num_points) \endcode ++ * ++ * \b Inputs ++ * \li inputVector: The complex 16-bit integer input data buffer. ++@@ -51,7 +51,9 @@ ++ #ifdef LV_HAVE_AVX2 ++ #include ++ ++-static inline void volk_16ic_convert_32fc_a_avx2(lv_32fc_t* outputVector, const lv_16sc_t* inputVector, unsigned int num_points) +++static inline void volk_16ic_convert_32fc_a_avx2(lv_32fc_t* outputVector, +++ const lv_16sc_t* inputVector, +++ unsigned int num_points) ++ { ++ const unsigned int avx_iters = num_points / 8; ++ unsigned int number = 0; ++@@ -61,36 +63,36 @@ static inline void volk_16ic_convert_32fc_a_avx2(lv_32fc_t* outputVector, const ++ __m256i outValInt; ++ __m128i cplxValue; ++ ++- for(number = 0; number < avx_iters; number++) ++- { ++- cplxValue = _mm_load_si128((__m128i*)complexVectorPtr); ++- complexVectorPtr += 8; ++- ++- outValInt = _mm256_cvtepi16_epi32(cplxValue); ++- outVal = _mm256_cvtepi32_ps(outValInt); ++- _mm256_store_ps((float*)outputVectorPtr, outVal); +++ for (number = 0; number < avx_iters; number++) { +++ cplxValue = _mm_load_si128((__m128i*)complexVectorPtr); +++ complexVectorPtr += 8; ++ ++- outputVectorPtr += 8; ++- } +++ outValInt = _mm256_cvtepi16_epi32(cplxValue); +++ outVal = _mm256_cvtepi32_ps(outValInt); +++ _mm256_store_ps((float*)outputVectorPtr, outVal); +++ +++ outputVectorPtr += 8; +++ } ++ ++ number = avx_iters * 8; ++- for(; number < num_points*2; number++) ++- { ++- *outputVectorPtr++ = (float)*complexVectorPtr++; ++- } +++ for (; number < num_points * 2; number++) { +++ *outputVectorPtr++ = (float)*complexVectorPtr++; +++ } ++ } ++ ++ #endif /* LV_HAVE_AVX2 */ ++ ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void volk_16ic_convert_32fc_generic(lv_32fc_t* outputVector, const lv_16sc_t* inputVector, unsigned int num_points) +++static inline void volk_16ic_convert_32fc_generic(lv_32fc_t* outputVector, +++ const lv_16sc_t* inputVector, +++ unsigned int num_points) ++ { ++ unsigned int i; ++- for(i = 0; i < num_points; i++) ++- { ++- outputVector[i] = lv_cmake((float)lv_creal(inputVector[i]), (float)lv_cimag(inputVector[i])); ++- } +++ for (i = 0; i < num_points; i++) { +++ outputVector[i] = +++ lv_cmake((float)lv_creal(inputVector[i]), (float)lv_cimag(inputVector[i])); +++ } ++ } ++ ++ #endif /* LV_HAVE_GENERIC */ ++@@ -99,7 +101,9 @@ static inline void volk_16ic_convert_32fc_generic(lv_32fc_t* outputVector, const ++ #ifdef LV_HAVE_SSE2 ++ #include ++ ++-static inline void volk_16ic_convert_32fc_a_sse2(lv_32fc_t* outputVector, const lv_16sc_t* inputVector, unsigned int num_points) +++static inline void volk_16ic_convert_32fc_a_sse2(lv_32fc_t* outputVector, +++ const lv_16sc_t* inputVector, +++ unsigned int num_points) ++ { ++ const unsigned int sse_iters = num_points / 2; ++ ++@@ -108,18 +112,21 @@ static inline void volk_16ic_convert_32fc_a_sse2(lv_32fc_t* outputVector, const ++ __m128 a; ++ unsigned int number; ++ ++- for(number = 0; number < sse_iters; number++) ++- { ++- a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg ++- _mm_store_ps((float*)_out, a); ++- _in += 2; ++- _out += 2; ++- } ++- if (num_points & 1) ++- { ++- *_out++ = lv_cmake((float)lv_creal(*_in), (float)lv_cimag(*_in)); ++- _in++; ++- } +++ for (number = 0; number < sse_iters; 
number++) { +++ a = _mm_set_ps( +++ (float)(lv_cimag(_in[1])), +++ (float)(lv_creal(_in[1])), +++ (float)(lv_cimag(_in[0])), +++ (float)(lv_creal( +++ _in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg +++ _mm_store_ps((float*)_out, a); +++ _in += 2; +++ _out += 2; +++ } +++ if (num_points & 1) { +++ *_out++ = lv_cmake((float)lv_creal(*_in), (float)lv_cimag(*_in)); +++ _in++; +++ } ++ } ++ ++ #endif /* LV_HAVE_SSE2 */ ++@@ -127,7 +134,9 @@ static inline void volk_16ic_convert_32fc_a_sse2(lv_32fc_t* outputVector, const ++ #ifdef LV_HAVE_AVX ++ #include ++ ++-static inline void volk_16ic_convert_32fc_a_avx(lv_32fc_t* outputVector, const lv_16sc_t* inputVector, unsigned int num_points) +++static inline void volk_16ic_convert_32fc_a_avx(lv_32fc_t* outputVector, +++ const lv_16sc_t* inputVector, +++ unsigned int num_points) ++ { ++ const unsigned int sse_iters = num_points / 4; ++ ++@@ -136,19 +145,26 @@ static inline void volk_16ic_convert_32fc_a_avx(lv_32fc_t* outputVector, const l ++ __m256 a; ++ unsigned int i, number; ++ ++- for(number = 0; number < sse_iters; number++) ++- { ++- a = _mm256_set_ps((float)(lv_cimag(_in[3])), (float)(lv_creal(_in[3])), (float)(lv_cimag(_in[2])), (float)(lv_creal(_in[2])), (float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg ++- _mm256_store_ps((float*)_out, a); ++- _in += 4; ++- _out += 4; ++- } +++ for (number = 0; number < sse_iters; number++) { +++ a = _mm256_set_ps( +++ (float)(lv_cimag(_in[3])), +++ (float)(lv_creal(_in[3])), +++ (float)(lv_cimag(_in[2])), +++ (float)(lv_creal(_in[2])), +++ (float)(lv_cimag(_in[1])), +++ (float)(lv_creal(_in[1])), +++ (float)(lv_cimag(_in[0])), +++ (float)(lv_creal( +++ _in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg +++ _mm256_store_ps((float*)_out, a); +++ _in += 4; +++ _out += 4; +++ } ++ _mm256_zeroupper(); ++- for (i = 0; i < (num_points % 4); ++i) ++- { ++- *_out++ = lv_cmake((float)lv_creal(*_in), (float)lv_cimag(*_in)); ++- _in++; ++- } +++ for (i = 0; i < (num_points % 4); ++i) { +++ *_out++ = lv_cmake((float)lv_creal(*_in), (float)lv_cimag(*_in)); +++ _in++; +++ } ++ } ++ ++ #endif /* LV_HAVE_AVX */ ++@@ -157,7 +173,9 @@ static inline void volk_16ic_convert_32fc_a_avx(lv_32fc_t* outputVector, const l ++ #ifdef LV_HAVE_NEON ++ #include ++ ++-static inline void volk_16ic_convert_32fc_neon(lv_32fc_t* outputVector, const lv_16sc_t* inputVector, unsigned int num_points) +++static inline void volk_16ic_convert_32fc_neon(lv_32fc_t* outputVector, +++ const lv_16sc_t* inputVector, +++ unsigned int num_points) ++ { ++ const unsigned int sse_iters = num_points / 2; ++ ++@@ -169,21 +187,19 @@ static inline void volk_16ic_convert_32fc_neon(lv_32fc_t* outputVector, const lv ++ float32x4_t f32x4; ++ unsigned int i, number; ++ ++- for(number = 0; number < sse_iters; number++) ++- { ++- a16x4 = vld1_s16((const int16_t*)_in); ++- __VOLK_PREFETCH(_in + 4); ++- a32x4 = vmovl_s16(a16x4); ++- f32x4 = vcvtq_f32_s32(a32x4); ++- vst1q_f32((float32_t*)_out, f32x4); ++- _in += 2; ++- _out += 2; ++- } ++- for (i = 0; i < (num_points % 2); ++i) ++- { ++- *_out++ = lv_cmake((float)lv_creal(*_in), (float)lv_cimag(*_in)); ++- _in++; ++- } +++ for (number = 0; number < sse_iters; number++) { +++ a16x4 = vld1_s16((const int16_t*)_in); +++ __VOLK_PREFETCH(_in + 4); +++ a32x4 = vmovl_s16(a16x4); +++ f32x4 = vcvtq_f32_s32(a32x4); +++ vst1q_f32((float32_t*)_out, f32x4); +++ _in += 2; +++ _out 
+= 2; +++ } +++ for (i = 0; i < (num_points % 2); ++i) { +++ *_out++ = lv_cmake((float)lv_creal(*_in), (float)lv_cimag(*_in)); +++ _in++; +++ } ++ } ++ #endif /* LV_HAVE_NEON */ ++ ++@@ -198,7 +214,9 @@ static inline void volk_16ic_convert_32fc_neon(lv_32fc_t* outputVector, const lv ++ #ifdef LV_HAVE_AVX2 ++ #include ++ ++-static inline void volk_16ic_convert_32fc_u_avx2(lv_32fc_t* outputVector, const lv_16sc_t* inputVector, unsigned int num_points) +++static inline void volk_16ic_convert_32fc_u_avx2(lv_32fc_t* outputVector, +++ const lv_16sc_t* inputVector, +++ unsigned int num_points) ++ { ++ const unsigned int avx_iters = num_points / 8; ++ unsigned int number = 0; ++@@ -208,23 +226,21 @@ static inline void volk_16ic_convert_32fc_u_avx2(lv_32fc_t* outputVector, const ++ __m256i outValInt; ++ __m128i cplxValue; ++ ++- for(number = 0; number < avx_iters; number++) ++- { ++- cplxValue = _mm_loadu_si128((__m128i*)complexVectorPtr); ++- complexVectorPtr += 8; ++- ++- outValInt = _mm256_cvtepi16_epi32(cplxValue); ++- outVal = _mm256_cvtepi32_ps(outValInt); ++- _mm256_storeu_ps((float*)outputVectorPtr, outVal); +++ for (number = 0; number < avx_iters; number++) { +++ cplxValue = _mm_loadu_si128((__m128i*)complexVectorPtr); +++ complexVectorPtr += 8; +++ +++ outValInt = _mm256_cvtepi16_epi32(cplxValue); +++ outVal = _mm256_cvtepi32_ps(outValInt); +++ _mm256_storeu_ps((float*)outputVectorPtr, outVal); ++ ++- outputVectorPtr += 8; ++- } +++ outputVectorPtr += 8; +++ } ++ ++ number = avx_iters * 8; ++- for(; number < num_points*2; number++) ++- { ++- *outputVectorPtr++ = (float)*complexVectorPtr++; ++- } +++ for (; number < num_points * 2; number++) { +++ *outputVectorPtr++ = (float)*complexVectorPtr++; +++ } ++ } ++ ++ #endif /* LV_HAVE_AVX2 */ ++@@ -232,7 +248,9 @@ static inline void volk_16ic_convert_32fc_u_avx2(lv_32fc_t* outputVector, const ++ #ifdef LV_HAVE_SSE2 ++ #include ++ ++-static inline void volk_16ic_convert_32fc_u_sse2(lv_32fc_t* outputVector, const lv_16sc_t* inputVector, unsigned int num_points) +++static inline void volk_16ic_convert_32fc_u_sse2(lv_32fc_t* outputVector, +++ const lv_16sc_t* inputVector, +++ unsigned int num_points) ++ { ++ const unsigned int sse_iters = num_points / 2; ++ ++@@ -241,18 +259,21 @@ static inline void volk_16ic_convert_32fc_u_sse2(lv_32fc_t* outputVector, const ++ __m128 a; ++ unsigned int number; ++ ++- for(number = 0; number < sse_iters; number++) ++- { ++- a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg ++- _mm_storeu_ps((float*)_out, a); ++- _in += 2; ++- _out += 2; ++- } ++- if (num_points & 1) ++- { ++- *_out++ = lv_cmake((float)lv_creal(*_in), (float)lv_cimag(*_in)); ++- _in++; ++- } +++ for (number = 0; number < sse_iters; number++) { +++ a = _mm_set_ps( +++ (float)(lv_cimag(_in[1])), +++ (float)(lv_creal(_in[1])), +++ (float)(lv_cimag(_in[0])), +++ (float)(lv_creal( +++ _in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg +++ _mm_storeu_ps((float*)_out, a); +++ _in += 2; +++ _out += 2; +++ } +++ if (num_points & 1) { +++ *_out++ = lv_cmake((float)lv_creal(*_in), (float)lv_cimag(*_in)); +++ _in++; +++ } ++ } ++ ++ #endif /* LV_HAVE_SSE2 */ ++@@ -261,7 +282,9 @@ static inline void volk_16ic_convert_32fc_u_sse2(lv_32fc_t* outputVector, const ++ #ifdef LV_HAVE_AVX ++ #include ++ ++-static inline void volk_16ic_convert_32fc_u_avx(lv_32fc_t* outputVector, const lv_16sc_t* inputVector, unsigned int 
num_points) +++static inline void volk_16ic_convert_32fc_u_avx(lv_32fc_t* outputVector, +++ const lv_16sc_t* inputVector, +++ unsigned int num_points) ++ { ++ const unsigned int sse_iters = num_points / 4; ++ ++@@ -270,21 +293,27 @@ static inline void volk_16ic_convert_32fc_u_avx(lv_32fc_t* outputVector, const l ++ __m256 a; ++ unsigned int i, number; ++ ++- for(number = 0; number < sse_iters; number++) ++- { ++- a = _mm256_set_ps((float)(lv_cimag(_in[3])), (float)(lv_creal(_in[3])), (float)(lv_cimag(_in[2])), (float)(lv_creal(_in[2])), (float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg ++- _mm256_storeu_ps((float*)_out, a); ++- _in += 4; ++- _out += 4; ++- } +++ for (number = 0; number < sse_iters; number++) { +++ a = _mm256_set_ps( +++ (float)(lv_cimag(_in[3])), +++ (float)(lv_creal(_in[3])), +++ (float)(lv_cimag(_in[2])), +++ (float)(lv_creal(_in[2])), +++ (float)(lv_cimag(_in[1])), +++ (float)(lv_creal(_in[1])), +++ (float)(lv_cimag(_in[0])), +++ (float)(lv_creal( +++ _in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg +++ _mm256_storeu_ps((float*)_out, a); +++ _in += 4; +++ _out += 4; +++ } ++ _mm256_zeroupper(); ++- for (i = 0; i < (num_points % 4); ++i) ++- { ++- *_out++ = lv_cmake((float)lv_creal(*_in), (float)lv_cimag(*_in)); ++- _in++; ++- } +++ for (i = 0; i < (num_points % 4); ++i) { +++ *_out++ = lv_cmake((float)lv_creal(*_in), (float)lv_cimag(*_in)); +++ _in++; +++ } ++ } ++ ++ #endif /* LV_HAVE_AVX */ ++ #endif /* INCLUDED_volk_32fc_convert_16ic_u_H */ ++- ++diff --git a/kernels/volk/volk_16ic_deinterleave_16i_x2.h b/kernels/volk/volk_16ic_deinterleave_16i_x2.h ++index 40d10b4..9e784a6 100644 ++--- a/kernels/volk/volk_16ic_deinterleave_16i_x2.h +++++ b/kernels/volk/volk_16ic_deinterleave_16i_x2.h ++@@ -29,8 +29,8 @@ ++ * ++ * Dispatcher Prototype ++ * \code ++- * void volk_16ic_deinterleave_16i_x2(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points) ++- * \endcode +++ * void volk_16ic_deinterleave_16i_x2(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* +++ * complexVector, unsigned int num_points) \endcode ++ * ++ * \b Inputs ++ * \li complexVector: The complex input vector. 
++@@ -59,179 +59,241 @@ ++ #ifdef LV_HAVE_AVX2 ++ #include ++ ++-static inline void ++-volk_16ic_deinterleave_16i_x2_a_avx2(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points) +++static inline void volk_16ic_deinterleave_16i_x2_a_avx2(int16_t* iBuffer, +++ int16_t* qBuffer, +++ const lv_16sc_t* complexVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const int8_t* complexVectorPtr = (int8_t*)complexVector; ++- int16_t* iBufferPtr = iBuffer; ++- int16_t* qBufferPtr = qBuffer; ++- ++- __m256i MoveMask = _mm256_set_epi8(15,14,11,10,7,6,3,2,13,12,9,8,5,4,1,0, 15,14,11,10,7,6,3,2,13,12,9,8,5,4,1,0); ++- ++- __m256i iMove2, iMove1; ++- __m256i complexVal1, complexVal2, iOutputVal, qOutputVal; ++- ++- unsigned int sixteenthPoints = num_points / 16; ++- ++- for(number = 0; number < sixteenthPoints; number++){ ++- complexVal1 = _mm256_load_si256((__m256i*)complexVectorPtr); complexVectorPtr += 32; ++- complexVal2 = _mm256_load_si256((__m256i*)complexVectorPtr); complexVectorPtr += 32; ++- ++- iMove2 = _mm256_shuffle_epi8(complexVal2, MoveMask); ++- iMove1 = _mm256_shuffle_epi8(complexVal1, MoveMask); ++- ++- iOutputVal = _mm256_permute2x128_si256(_mm256_permute4x64_epi64(iMove1,0x08),_mm256_permute4x64_epi64(iMove2,0x80),0x30); ++- qOutputVal = _mm256_permute2x128_si256(_mm256_permute4x64_epi64(iMove1,0x0d),_mm256_permute4x64_epi64(iMove2,0xd0),0x30); ++- ++- _mm256_store_si256((__m256i*)iBufferPtr, iOutputVal); ++- _mm256_store_si256((__m256i*)qBufferPtr, qOutputVal); ++- ++- iBufferPtr += 16; ++- qBufferPtr += 16; ++- } ++- ++- number = sixteenthPoints * 16; ++- int16_t* int16ComplexVectorPtr = (int16_t*)complexVectorPtr; ++- for(; number < num_points; number++){ ++- *iBufferPtr++ = *int16ComplexVectorPtr++; ++- *qBufferPtr++ = *int16ComplexVectorPtr++; ++- } +++ unsigned int number = 0; +++ const int8_t* complexVectorPtr = (int8_t*)complexVector; +++ int16_t* iBufferPtr = iBuffer; +++ int16_t* qBufferPtr = qBuffer; +++ +++ __m256i MoveMask = _mm256_set_epi8(15, +++ 14, +++ 11, +++ 10, +++ 7, +++ 6, +++ 3, +++ 2, +++ 13, +++ 12, +++ 9, +++ 8, +++ 5, +++ 4, +++ 1, +++ 0, +++ 15, +++ 14, +++ 11, +++ 10, +++ 7, +++ 6, +++ 3, +++ 2, +++ 13, +++ 12, +++ 9, +++ 8, +++ 5, +++ 4, +++ 1, +++ 0); +++ +++ __m256i iMove2, iMove1; +++ __m256i complexVal1, complexVal2, iOutputVal, qOutputVal; +++ +++ unsigned int sixteenthPoints = num_points / 16; +++ +++ for (number = 0; number < sixteenthPoints; number++) { +++ complexVal1 = _mm256_load_si256((__m256i*)complexVectorPtr); +++ complexVectorPtr += 32; +++ complexVal2 = _mm256_load_si256((__m256i*)complexVectorPtr); +++ complexVectorPtr += 32; +++ +++ iMove2 = _mm256_shuffle_epi8(complexVal2, MoveMask); +++ iMove1 = _mm256_shuffle_epi8(complexVal1, MoveMask); +++ +++ iOutputVal = _mm256_permute2x128_si256(_mm256_permute4x64_epi64(iMove1, 0x08), +++ _mm256_permute4x64_epi64(iMove2, 0x80), +++ 0x30); +++ qOutputVal = _mm256_permute2x128_si256(_mm256_permute4x64_epi64(iMove1, 0x0d), +++ _mm256_permute4x64_epi64(iMove2, 0xd0), +++ 0x30); +++ +++ _mm256_store_si256((__m256i*)iBufferPtr, iOutputVal); +++ _mm256_store_si256((__m256i*)qBufferPtr, qOutputVal); +++ +++ iBufferPtr += 16; +++ qBufferPtr += 16; +++ } +++ +++ number = sixteenthPoints * 16; +++ int16_t* int16ComplexVectorPtr = (int16_t*)complexVectorPtr; +++ for (; number < num_points; number++) { +++ *iBufferPtr++ = *int16ComplexVectorPtr++; +++ *qBufferPtr++ = *int16ComplexVectorPtr++; +++ } ++ } ++ #endif /* LV_HAVE_AVX2 */ ++ ++ #ifdef 
LV_HAVE_SSSE3 ++ #include ++ ++-static inline void ++-volk_16ic_deinterleave_16i_x2_a_ssse3(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points) +++static inline void volk_16ic_deinterleave_16i_x2_a_ssse3(int16_t* iBuffer, +++ int16_t* qBuffer, +++ const lv_16sc_t* complexVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const int8_t* complexVectorPtr = (int8_t*)complexVector; ++- int16_t* iBufferPtr = iBuffer; ++- int16_t* qBufferPtr = qBuffer; ++- ++- __m128i iMoveMask1 = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0); ++- __m128i iMoveMask2 = _mm_set_epi8(13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80); ++- ++- __m128i qMoveMask1 = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 15, 14, 11, 10, 7, 6, 3, 2); ++- __m128i qMoveMask2 = _mm_set_epi8(15, 14, 11, 10, 7, 6, 3, 2, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80); ++- ++- __m128i complexVal1, complexVal2, iOutputVal, qOutputVal; ++- ++- unsigned int eighthPoints = num_points / 8; ++- ++- for(number = 0; number < eighthPoints; number++){ ++- complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; ++- complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; ++- ++- iOutputVal = _mm_or_si128( _mm_shuffle_epi8(complexVal1, iMoveMask1) , _mm_shuffle_epi8(complexVal2, iMoveMask2)); ++- qOutputVal = _mm_or_si128( _mm_shuffle_epi8(complexVal1, qMoveMask1) , _mm_shuffle_epi8(complexVal2, qMoveMask2)); ++- ++- _mm_store_si128((__m128i*)iBufferPtr, iOutputVal); ++- _mm_store_si128((__m128i*)qBufferPtr, qOutputVal); ++- ++- iBufferPtr += 8; ++- qBufferPtr += 8; ++- } ++- ++- number = eighthPoints * 8; ++- int16_t* int16ComplexVectorPtr = (int16_t*)complexVectorPtr; ++- for(; number < num_points; number++){ ++- *iBufferPtr++ = *int16ComplexVectorPtr++; ++- *qBufferPtr++ = *int16ComplexVectorPtr++; ++- } +++ unsigned int number = 0; +++ const int8_t* complexVectorPtr = (int8_t*)complexVector; +++ int16_t* iBufferPtr = iBuffer; +++ int16_t* qBufferPtr = qBuffer; +++ +++ __m128i iMoveMask1 = _mm_set_epi8( +++ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0); +++ __m128i iMoveMask2 = _mm_set_epi8( +++ 13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80); +++ +++ __m128i qMoveMask1 = _mm_set_epi8( +++ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 15, 14, 11, 10, 7, 6, 3, 2); +++ __m128i qMoveMask2 = _mm_set_epi8( +++ 15, 14, 11, 10, 7, 6, 3, 2, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80); +++ +++ __m128i complexVal1, complexVal2, iOutputVal, qOutputVal; +++ +++ unsigned int eighthPoints = num_points / 8; +++ +++ for (number = 0; number < eighthPoints; number++) { +++ complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr); +++ complexVectorPtr += 16; +++ complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr); +++ complexVectorPtr += 16; +++ +++ iOutputVal = _mm_or_si128(_mm_shuffle_epi8(complexVal1, iMoveMask1), +++ _mm_shuffle_epi8(complexVal2, iMoveMask2)); +++ qOutputVal = _mm_or_si128(_mm_shuffle_epi8(complexVal1, qMoveMask1), +++ _mm_shuffle_epi8(complexVal2, qMoveMask2)); +++ +++ _mm_store_si128((__m128i*)iBufferPtr, iOutputVal); +++ _mm_store_si128((__m128i*)qBufferPtr, qOutputVal); +++ +++ iBufferPtr += 8; +++ qBufferPtr += 8; +++ } +++ +++ number = eighthPoints * 8; +++ int16_t* int16ComplexVectorPtr = (int16_t*)complexVectorPtr; +++ for (; number < num_points; number++) { +++ *iBufferPtr++ = 
*int16ComplexVectorPtr++; +++ *qBufferPtr++ = *int16ComplexVectorPtr++; +++ } ++ } ++ #endif /* LV_HAVE_SSSE3 */ ++ ++ #ifdef LV_HAVE_SSE2 ++ #include ++ ++-static inline void ++-volk_16ic_deinterleave_16i_x2_a_sse2(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points) +++static inline void volk_16ic_deinterleave_16i_x2_a_sse2(int16_t* iBuffer, +++ int16_t* qBuffer, +++ const lv_16sc_t* complexVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const int16_t* complexVectorPtr = (int16_t*)complexVector; ++- int16_t* iBufferPtr = iBuffer; ++- int16_t* qBufferPtr = qBuffer; ++- __m128i complexVal1, complexVal2, iComplexVal1, iComplexVal2, qComplexVal1, qComplexVal2, iOutputVal, qOutputVal; ++- __m128i lowMask = _mm_set_epi32(0x0, 0x0, 0xFFFFFFFF, 0xFFFFFFFF); ++- __m128i highMask = _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0x0, 0x0); +++ unsigned int number = 0; +++ const int16_t* complexVectorPtr = (int16_t*)complexVector; +++ int16_t* iBufferPtr = iBuffer; +++ int16_t* qBufferPtr = qBuffer; +++ __m128i complexVal1, complexVal2, iComplexVal1, iComplexVal2, qComplexVal1, +++ qComplexVal2, iOutputVal, qOutputVal; +++ __m128i lowMask = _mm_set_epi32(0x0, 0x0, 0xFFFFFFFF, 0xFFFFFFFF); +++ __m128i highMask = _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0x0, 0x0); ++ ++- unsigned int eighthPoints = num_points / 8; +++ unsigned int eighthPoints = num_points / 8; ++ ++- for(number = 0; number < eighthPoints; number++){ ++- complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 8; ++- complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 8; +++ for (number = 0; number < eighthPoints; number++) { +++ complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr); +++ complexVectorPtr += 8; +++ complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr); +++ complexVectorPtr += 8; ++ ++- iComplexVal1 = _mm_shufflelo_epi16(complexVal1, _MM_SHUFFLE(3,1,2,0)); +++ iComplexVal1 = _mm_shufflelo_epi16(complexVal1, _MM_SHUFFLE(3, 1, 2, 0)); ++ ++- iComplexVal1 = _mm_shufflehi_epi16(iComplexVal1, _MM_SHUFFLE(3,1,2,0)); +++ iComplexVal1 = _mm_shufflehi_epi16(iComplexVal1, _MM_SHUFFLE(3, 1, 2, 0)); ++ ++- iComplexVal1 = _mm_shuffle_epi32(iComplexVal1, _MM_SHUFFLE(3,1,2,0)); +++ iComplexVal1 = _mm_shuffle_epi32(iComplexVal1, _MM_SHUFFLE(3, 1, 2, 0)); ++ ++- iComplexVal2 = _mm_shufflelo_epi16(complexVal2, _MM_SHUFFLE(3,1,2,0)); +++ iComplexVal2 = _mm_shufflelo_epi16(complexVal2, _MM_SHUFFLE(3, 1, 2, 0)); ++ ++- iComplexVal2 = _mm_shufflehi_epi16(iComplexVal2, _MM_SHUFFLE(3,1,2,0)); +++ iComplexVal2 = _mm_shufflehi_epi16(iComplexVal2, _MM_SHUFFLE(3, 1, 2, 0)); ++ ++- iComplexVal2 = _mm_shuffle_epi32(iComplexVal2, _MM_SHUFFLE(2,0,3,1)); +++ iComplexVal2 = _mm_shuffle_epi32(iComplexVal2, _MM_SHUFFLE(2, 0, 3, 1)); ++ ++- iOutputVal = _mm_or_si128(_mm_and_si128(iComplexVal1, lowMask), _mm_and_si128(iComplexVal2, highMask)); +++ iOutputVal = _mm_or_si128(_mm_and_si128(iComplexVal1, lowMask), +++ _mm_and_si128(iComplexVal2, highMask)); ++ ++- _mm_store_si128((__m128i*)iBufferPtr, iOutputVal); +++ _mm_store_si128((__m128i*)iBufferPtr, iOutputVal); ++ ++- qComplexVal1 = _mm_shufflelo_epi16(complexVal1, _MM_SHUFFLE(2,0,3,1)); +++ qComplexVal1 = _mm_shufflelo_epi16(complexVal1, _MM_SHUFFLE(2, 0, 3, 1)); ++ ++- qComplexVal1 = _mm_shufflehi_epi16(qComplexVal1, _MM_SHUFFLE(2,0,3,1)); +++ qComplexVal1 = _mm_shufflehi_epi16(qComplexVal1, _MM_SHUFFLE(2, 0, 3, 1)); ++ ++- qComplexVal1 = _mm_shuffle_epi32(qComplexVal1, _MM_SHUFFLE(3,1,2,0)); +++ 
qComplexVal1 = _mm_shuffle_epi32(qComplexVal1, _MM_SHUFFLE(3, 1, 2, 0)); ++ ++- qComplexVal2 = _mm_shufflelo_epi16(complexVal2, _MM_SHUFFLE(2,0,3,1)); +++ qComplexVal2 = _mm_shufflelo_epi16(complexVal2, _MM_SHUFFLE(2, 0, 3, 1)); ++ ++- qComplexVal2 = _mm_shufflehi_epi16(qComplexVal2, _MM_SHUFFLE(2,0,3,1)); +++ qComplexVal2 = _mm_shufflehi_epi16(qComplexVal2, _MM_SHUFFLE(2, 0, 3, 1)); ++ ++- qComplexVal2 = _mm_shuffle_epi32(qComplexVal2, _MM_SHUFFLE(2,0,3,1)); +++ qComplexVal2 = _mm_shuffle_epi32(qComplexVal2, _MM_SHUFFLE(2, 0, 3, 1)); ++ ++- qOutputVal = _mm_or_si128(_mm_and_si128(qComplexVal1, lowMask), _mm_and_si128(qComplexVal2, highMask)); +++ qOutputVal = _mm_or_si128(_mm_and_si128(qComplexVal1, lowMask), +++ _mm_and_si128(qComplexVal2, highMask)); ++ ++- _mm_store_si128((__m128i*)qBufferPtr, qOutputVal); +++ _mm_store_si128((__m128i*)qBufferPtr, qOutputVal); ++ ++- iBufferPtr += 8; ++- qBufferPtr += 8; ++- } +++ iBufferPtr += 8; +++ qBufferPtr += 8; +++ } ++ ++- number = eighthPoints * 8; ++- for(; number < num_points; number++){ ++- *iBufferPtr++ = *complexVectorPtr++; ++- *qBufferPtr++ = *complexVectorPtr++; ++- } +++ number = eighthPoints * 8; +++ for (; number < num_points; number++) { +++ *iBufferPtr++ = *complexVectorPtr++; +++ *qBufferPtr++ = *complexVectorPtr++; +++ } ++ } ++ #endif /* LV_HAVE_SSE2 */ ++ ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void ++-volk_16ic_deinterleave_16i_x2_generic(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points) +++static inline void volk_16ic_deinterleave_16i_x2_generic(int16_t* iBuffer, +++ int16_t* qBuffer, +++ const lv_16sc_t* complexVector, +++ unsigned int num_points) ++ { ++- const int16_t* complexVectorPtr = (const int16_t*)complexVector; ++- int16_t* iBufferPtr = iBuffer; ++- int16_t* qBufferPtr = qBuffer; ++- unsigned int number; ++- for(number = 0; number < num_points; number++){ ++- *iBufferPtr++ = *complexVectorPtr++; ++- *qBufferPtr++ = *complexVectorPtr++; ++- } +++ const int16_t* complexVectorPtr = (const int16_t*)complexVector; +++ int16_t* iBufferPtr = iBuffer; +++ int16_t* qBufferPtr = qBuffer; +++ unsigned int number; +++ for (number = 0; number < num_points; number++) { +++ *iBufferPtr++ = *complexVectorPtr++; +++ *qBufferPtr++ = *complexVectorPtr++; +++ } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++ #ifdef LV_HAVE_ORC ++ ++-extern void ++-volk_16ic_deinterleave_16i_x2_a_orc_impl(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points); ++-static inline void ++-volk_16ic_deinterleave_16i_x2_u_orc(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points) +++extern void volk_16ic_deinterleave_16i_x2_a_orc_impl(int16_t* iBuffer, +++ int16_t* qBuffer, +++ const lv_16sc_t* complexVector, +++ unsigned int num_points); +++static inline void volk_16ic_deinterleave_16i_x2_u_orc(int16_t* iBuffer, +++ int16_t* qBuffer, +++ const lv_16sc_t* complexVector, +++ unsigned int num_points) ++ { ++- volk_16ic_deinterleave_16i_x2_a_orc_impl(iBuffer, qBuffer, complexVector, num_points); +++ volk_16ic_deinterleave_16i_x2_a_orc_impl(iBuffer, qBuffer, complexVector, num_points); ++ } ++ #endif /* LV_HAVE_ORC */ ++ ++@@ -246,44 +308,83 @@ volk_16ic_deinterleave_16i_x2_u_orc(int16_t* iBuffer, int16_t* qBuffer, const lv ++ #ifdef LV_HAVE_AVX2 ++ #include ++ ++-static inline void ++-volk_16ic_deinterleave_16i_x2_u_avx2(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points) +++static inline void 
volk_16ic_deinterleave_16i_x2_u_avx2(int16_t* iBuffer, +++ int16_t* qBuffer, +++ const lv_16sc_t* complexVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const int8_t* complexVectorPtr = (int8_t*)complexVector; ++- int16_t* iBufferPtr = iBuffer; ++- int16_t* qBufferPtr = qBuffer; ++- ++- __m256i MoveMask = _mm256_set_epi8(15,14,11,10,7,6,3,2,13,12,9,8,5,4,1,0, 15,14,11,10,7,6,3,2,13,12,9,8,5,4,1,0); ++- ++- __m256i iMove2, iMove1; ++- __m256i complexVal1, complexVal2, iOutputVal, qOutputVal; ++- ++- unsigned int sixteenthPoints = num_points / 16; ++- ++- for(number = 0; number < sixteenthPoints; number++){ ++- complexVal1 = _mm256_loadu_si256((__m256i*)complexVectorPtr); complexVectorPtr += 32; ++- complexVal2 = _mm256_loadu_si256((__m256i*)complexVectorPtr); complexVectorPtr += 32; ++- ++- iMove2 = _mm256_shuffle_epi8(complexVal2, MoveMask); ++- iMove1 = _mm256_shuffle_epi8(complexVal1, MoveMask); ++- ++- iOutputVal = _mm256_permute2x128_si256(_mm256_permute4x64_epi64(iMove1,0x08),_mm256_permute4x64_epi64(iMove2,0x80),0x30); ++- qOutputVal = _mm256_permute2x128_si256(_mm256_permute4x64_epi64(iMove1,0x0d),_mm256_permute4x64_epi64(iMove2,0xd0),0x30); ++- ++- _mm256_storeu_si256((__m256i*)iBufferPtr, iOutputVal); ++- _mm256_storeu_si256((__m256i*)qBufferPtr, qOutputVal); ++- ++- iBufferPtr += 16; ++- qBufferPtr += 16; ++- } ++- ++- number = sixteenthPoints * 16; ++- int16_t* int16ComplexVectorPtr = (int16_t*)complexVectorPtr; ++- for(; number < num_points; number++){ ++- *iBufferPtr++ = *int16ComplexVectorPtr++; ++- *qBufferPtr++ = *int16ComplexVectorPtr++; ++- } +++ unsigned int number = 0; +++ const int8_t* complexVectorPtr = (int8_t*)complexVector; +++ int16_t* iBufferPtr = iBuffer; +++ int16_t* qBufferPtr = qBuffer; +++ +++ __m256i MoveMask = _mm256_set_epi8(15, +++ 14, +++ 11, +++ 10, +++ 7, +++ 6, +++ 3, +++ 2, +++ 13, +++ 12, +++ 9, +++ 8, +++ 5, +++ 4, +++ 1, +++ 0, +++ 15, +++ 14, +++ 11, +++ 10, +++ 7, +++ 6, +++ 3, +++ 2, +++ 13, +++ 12, +++ 9, +++ 8, +++ 5, +++ 4, +++ 1, +++ 0); +++ +++ __m256i iMove2, iMove1; +++ __m256i complexVal1, complexVal2, iOutputVal, qOutputVal; +++ +++ unsigned int sixteenthPoints = num_points / 16; +++ +++ for (number = 0; number < sixteenthPoints; number++) { +++ complexVal1 = _mm256_loadu_si256((__m256i*)complexVectorPtr); +++ complexVectorPtr += 32; +++ complexVal2 = _mm256_loadu_si256((__m256i*)complexVectorPtr); +++ complexVectorPtr += 32; +++ +++ iMove2 = _mm256_shuffle_epi8(complexVal2, MoveMask); +++ iMove1 = _mm256_shuffle_epi8(complexVal1, MoveMask); +++ +++ iOutputVal = _mm256_permute2x128_si256(_mm256_permute4x64_epi64(iMove1, 0x08), +++ _mm256_permute4x64_epi64(iMove2, 0x80), +++ 0x30); +++ qOutputVal = _mm256_permute2x128_si256(_mm256_permute4x64_epi64(iMove1, 0x0d), +++ _mm256_permute4x64_epi64(iMove2, 0xd0), +++ 0x30); +++ +++ _mm256_storeu_si256((__m256i*)iBufferPtr, iOutputVal); +++ _mm256_storeu_si256((__m256i*)qBufferPtr, qOutputVal); +++ +++ iBufferPtr += 16; +++ qBufferPtr += 16; +++ } +++ +++ number = sixteenthPoints * 16; +++ int16_t* int16ComplexVectorPtr = (int16_t*)complexVectorPtr; +++ for (; number < num_points; number++) { +++ *iBufferPtr++ = *int16ComplexVectorPtr++; +++ *qBufferPtr++ = *int16ComplexVectorPtr++; +++ } ++ } ++ #endif /* LV_HAVE_AVX2 */ ++ ++diff --git a/kernels/volk/volk_16ic_deinterleave_real_16i.h b/kernels/volk/volk_16ic_deinterleave_real_16i.h ++index c1de553..45fcd99 100644 ++--- a/kernels/volk/volk_16ic_deinterleave_real_16i.h +++++ 
b/kernels/volk/volk_16ic_deinterleave_real_16i.h ++@@ -25,12 +25,13 @@ ++ * ++ * \b Overview ++ * ++- * Deinterleaves the complex 16 bit vector and returns the real (inphase) part of the signal. +++ * Deinterleaves the complex 16 bit vector and returns the real (inphase) part of the +++ * signal. ++ * ++ * Dispatcher Prototype ++ * \code ++- * void volk_16ic_deinterleave_real_16i(int16_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points) ++- * \endcode +++ * void volk_16ic_deinterleave_real_16i(int16_t* iBuffer, const lv_16sc_t* complexVector, +++ * unsigned int num_points) \endcode ++ * ++ * \b Inputs ++ * \li complexVector: The complex input vector. ++@@ -60,79 +61,149 @@ ++ #ifdef LV_HAVE_AVX2 ++ #include ++ ++-static inline void ++-volk_16ic_deinterleave_real_16i_a_avx2(int16_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points) +++static inline void volk_16ic_deinterleave_real_16i_a_avx2(int16_t* iBuffer, +++ const lv_16sc_t* complexVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const int16_t* complexVectorPtr = (int16_t*)complexVector; ++- int16_t* iBufferPtr = iBuffer; ++- ++- __m256i iMoveMask1 = _mm256_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0); ++- __m256i iMoveMask2 = _mm256_set_epi8(13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80); ++- ++- __m256i complexVal1, complexVal2, iOutputVal; ++- ++- unsigned int sixteenthPoints = num_points / 16; ++- ++- for(number = 0; number < sixteenthPoints; number++){ ++- complexVal1 = _mm256_load_si256((__m256i*)complexVectorPtr); complexVectorPtr += 16; ++- complexVal2 = _mm256_load_si256((__m256i*)complexVectorPtr); complexVectorPtr += 16; ++- ++- complexVal1 = _mm256_shuffle_epi8(complexVal1, iMoveMask1); ++- complexVal2 = _mm256_shuffle_epi8(complexVal2, iMoveMask2); ++- ++- iOutputVal = _mm256_or_si256(complexVal1, complexVal2); ++- iOutputVal = _mm256_permute4x64_epi64(iOutputVal, 0xd8); ++- ++- _mm256_store_si256((__m256i*)iBufferPtr, iOutputVal); ++- ++- iBufferPtr += 16; ++- } ++- ++- number = sixteenthPoints * 16; ++- for(; number < num_points; number++){ ++- *iBufferPtr++ = *complexVectorPtr++; ++- complexVectorPtr++; ++- } +++ unsigned int number = 0; +++ const int16_t* complexVectorPtr = (int16_t*)complexVector; +++ int16_t* iBufferPtr = iBuffer; +++ +++ __m256i iMoveMask1 = _mm256_set_epi8(0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 13, +++ 12, +++ 9, +++ 8, +++ 5, +++ 4, +++ 1, +++ 0, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 13, +++ 12, +++ 9, +++ 8, +++ 5, +++ 4, +++ 1, +++ 0); +++ __m256i iMoveMask2 = _mm256_set_epi8(13, +++ 12, +++ 9, +++ 8, +++ 5, +++ 4, +++ 1, +++ 0, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 13, +++ 12, +++ 9, +++ 8, +++ 5, +++ 4, +++ 1, +++ 0, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80); +++ +++ __m256i complexVal1, complexVal2, iOutputVal; +++ +++ unsigned int sixteenthPoints = num_points / 16; +++ +++ for (number = 0; number < sixteenthPoints; number++) { +++ complexVal1 = _mm256_load_si256((__m256i*)complexVectorPtr); +++ complexVectorPtr += 16; +++ complexVal2 = _mm256_load_si256((__m256i*)complexVectorPtr); +++ complexVectorPtr += 16; +++ +++ complexVal1 = 
_mm256_shuffle_epi8(complexVal1, iMoveMask1); +++ complexVal2 = _mm256_shuffle_epi8(complexVal2, iMoveMask2); +++ +++ iOutputVal = _mm256_or_si256(complexVal1, complexVal2); +++ iOutputVal = _mm256_permute4x64_epi64(iOutputVal, 0xd8); +++ +++ _mm256_store_si256((__m256i*)iBufferPtr, iOutputVal); +++ +++ iBufferPtr += 16; +++ } +++ +++ number = sixteenthPoints * 16; +++ for (; number < num_points; number++) { +++ *iBufferPtr++ = *complexVectorPtr++; +++ complexVectorPtr++; +++ } ++ } ++ #endif /* LV_HAVE_AVX2 */ ++ ++ #ifdef LV_HAVE_SSSE3 ++ #include ++ ++-static inline void ++-volk_16ic_deinterleave_real_16i_a_ssse3(int16_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points) +++static inline void volk_16ic_deinterleave_real_16i_a_ssse3(int16_t* iBuffer, +++ const lv_16sc_t* complexVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const int16_t* complexVectorPtr = (int16_t*)complexVector; ++- int16_t* iBufferPtr = iBuffer; +++ unsigned int number = 0; +++ const int16_t* complexVectorPtr = (int16_t*)complexVector; +++ int16_t* iBufferPtr = iBuffer; ++ ++- __m128i iMoveMask1 = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0); ++- __m128i iMoveMask2 = _mm_set_epi8(13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80); +++ __m128i iMoveMask1 = _mm_set_epi8( +++ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0); +++ __m128i iMoveMask2 = _mm_set_epi8( +++ 13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80); ++ ++- __m128i complexVal1, complexVal2, iOutputVal; +++ __m128i complexVal1, complexVal2, iOutputVal; ++ ++- unsigned int eighthPoints = num_points / 8; +++ unsigned int eighthPoints = num_points / 8; ++ ++- for(number = 0; number < eighthPoints; number++){ ++- complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 8; ++- complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 8; +++ for (number = 0; number < eighthPoints; number++) { +++ complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr); +++ complexVectorPtr += 8; +++ complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr); +++ complexVectorPtr += 8; ++ ++- complexVal1 = _mm_shuffle_epi8(complexVal1, iMoveMask1); ++- complexVal2 = _mm_shuffle_epi8(complexVal2, iMoveMask2); +++ complexVal1 = _mm_shuffle_epi8(complexVal1, iMoveMask1); +++ complexVal2 = _mm_shuffle_epi8(complexVal2, iMoveMask2); ++ ++- iOutputVal = _mm_or_si128(complexVal1, complexVal2); +++ iOutputVal = _mm_or_si128(complexVal1, complexVal2); ++ ++- _mm_store_si128((__m128i*)iBufferPtr, iOutputVal); +++ _mm_store_si128((__m128i*)iBufferPtr, iOutputVal); ++ ++- iBufferPtr += 8; ++- } +++ iBufferPtr += 8; +++ } ++ ++- number = eighthPoints * 8; ++- for(; number < num_points; number++){ ++- *iBufferPtr++ = *complexVectorPtr++; ++- complexVectorPtr++; ++- } +++ number = eighthPoints * 8; +++ for (; number < num_points; number++) { +++ *iBufferPtr++ = *complexVectorPtr++; +++ complexVectorPtr++; +++ } ++ } ++ #endif /* LV_HAVE_SSSE3 */ ++ ++@@ -140,61 +211,66 @@ volk_16ic_deinterleave_real_16i_a_ssse3(int16_t* iBuffer, const lv_16sc_t* compl ++ #ifdef LV_HAVE_SSE2 ++ #include ++ ++-static inline void ++-volk_16ic_deinterleave_real_16i_a_sse2(int16_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points) +++static inline void volk_16ic_deinterleave_real_16i_a_sse2(int16_t* iBuffer, +++ const lv_16sc_t* complexVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 
0; ++- const int16_t* complexVectorPtr = (int16_t*)complexVector; ++- int16_t* iBufferPtr = iBuffer; ++- __m128i complexVal1, complexVal2, iOutputVal; ++- __m128i lowMask = _mm_set_epi32(0x0, 0x0, 0xFFFFFFFF, 0xFFFFFFFF); ++- __m128i highMask = _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0x0, 0x0); +++ unsigned int number = 0; +++ const int16_t* complexVectorPtr = (int16_t*)complexVector; +++ int16_t* iBufferPtr = iBuffer; +++ __m128i complexVal1, complexVal2, iOutputVal; +++ __m128i lowMask = _mm_set_epi32(0x0, 0x0, 0xFFFFFFFF, 0xFFFFFFFF); +++ __m128i highMask = _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0x0, 0x0); ++ ++- unsigned int eighthPoints = num_points / 8; +++ unsigned int eighthPoints = num_points / 8; ++ ++- for(number = 0; number < eighthPoints; number++){ ++- complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 8; ++- complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 8; +++ for (number = 0; number < eighthPoints; number++) { +++ complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr); +++ complexVectorPtr += 8; +++ complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr); +++ complexVectorPtr += 8; ++ ++- complexVal1 = _mm_shufflelo_epi16(complexVal1, _MM_SHUFFLE(3,1,2,0)); +++ complexVal1 = _mm_shufflelo_epi16(complexVal1, _MM_SHUFFLE(3, 1, 2, 0)); ++ ++- complexVal1 = _mm_shufflehi_epi16(complexVal1, _MM_SHUFFLE(3,1,2,0)); +++ complexVal1 = _mm_shufflehi_epi16(complexVal1, _MM_SHUFFLE(3, 1, 2, 0)); ++ ++- complexVal1 = _mm_shuffle_epi32(complexVal1, _MM_SHUFFLE(3,1,2,0)); +++ complexVal1 = _mm_shuffle_epi32(complexVal1, _MM_SHUFFLE(3, 1, 2, 0)); ++ ++- complexVal2 = _mm_shufflelo_epi16(complexVal2, _MM_SHUFFLE(3,1,2,0)); +++ complexVal2 = _mm_shufflelo_epi16(complexVal2, _MM_SHUFFLE(3, 1, 2, 0)); ++ ++- complexVal2 = _mm_shufflehi_epi16(complexVal2, _MM_SHUFFLE(3,1,2,0)); +++ complexVal2 = _mm_shufflehi_epi16(complexVal2, _MM_SHUFFLE(3, 1, 2, 0)); ++ ++- complexVal2 = _mm_shuffle_epi32(complexVal2, _MM_SHUFFLE(2,0,3,1)); +++ complexVal2 = _mm_shuffle_epi32(complexVal2, _MM_SHUFFLE(2, 0, 3, 1)); ++ ++- iOutputVal = _mm_or_si128(_mm_and_si128(complexVal1, lowMask), _mm_and_si128(complexVal2, highMask)); +++ iOutputVal = _mm_or_si128(_mm_and_si128(complexVal1, lowMask), +++ _mm_and_si128(complexVal2, highMask)); ++ ++- _mm_store_si128((__m128i*)iBufferPtr, iOutputVal); +++ _mm_store_si128((__m128i*)iBufferPtr, iOutputVal); ++ ++- iBufferPtr += 8; ++- } +++ iBufferPtr += 8; +++ } ++ ++- number = eighthPoints * 8; ++- for(; number < num_points; number++){ ++- *iBufferPtr++ = *complexVectorPtr++; ++- complexVectorPtr++; ++- } +++ number = eighthPoints * 8; +++ for (; number < num_points; number++) { +++ *iBufferPtr++ = *complexVectorPtr++; +++ complexVectorPtr++; +++ } ++ } ++ #endif /* LV_HAVE_SSE2 */ ++ ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void ++-volk_16ic_deinterleave_real_16i_generic(int16_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points) +++static inline void volk_16ic_deinterleave_real_16i_generic(int16_t* iBuffer, +++ const lv_16sc_t* complexVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const int16_t* complexVectorPtr = (int16_t*)complexVector; ++- int16_t* iBufferPtr = iBuffer; ++- for(number = 0; number < num_points; number++){ ++- *iBufferPtr++ = *complexVectorPtr++; ++- complexVectorPtr++; ++- } +++ unsigned int number = 0; +++ const int16_t* complexVectorPtr = (int16_t*)complexVector; +++ int16_t* iBufferPtr = iBuffer; +++ for (number = 0; number < num_points; 
number++) { +++ *iBufferPtr++ = *complexVectorPtr++; +++ complexVectorPtr++; +++ } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++@@ -212,40 +288,105 @@ volk_16ic_deinterleave_real_16i_generic(int16_t* iBuffer, const lv_16sc_t* compl ++ #ifdef LV_HAVE_AVX2 ++ #include ++ ++-static inline void ++-volk_16ic_deinterleave_real_16i_u_avx2(int16_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points) +++static inline void volk_16ic_deinterleave_real_16i_u_avx2(int16_t* iBuffer, +++ const lv_16sc_t* complexVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const int16_t* complexVectorPtr = (int16_t*)complexVector; ++- int16_t* iBufferPtr = iBuffer; ++- ++- __m256i iMoveMask1 = _mm256_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0); ++- __m256i iMoveMask2 = _mm256_set_epi8(13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80); ++- ++- __m256i complexVal1, complexVal2, iOutputVal; ++- ++- unsigned int sixteenthPoints = num_points / 16; ++- ++- for(number = 0; number < sixteenthPoints; number++){ ++- complexVal1 = _mm256_loadu_si256((__m256i*)complexVectorPtr); complexVectorPtr += 16; ++- complexVal2 = _mm256_loadu_si256((__m256i*)complexVectorPtr); complexVectorPtr += 16; ++- ++- complexVal1 = _mm256_shuffle_epi8(complexVal1, iMoveMask1); ++- complexVal2 = _mm256_shuffle_epi8(complexVal2, iMoveMask2); ++- ++- iOutputVal = _mm256_or_si256(complexVal1, complexVal2); ++- iOutputVal = _mm256_permute4x64_epi64(iOutputVal, 0xd8); ++- ++- _mm256_storeu_si256((__m256i*)iBufferPtr, iOutputVal); ++- ++- iBufferPtr += 16; ++- } ++- ++- number = sixteenthPoints * 16; ++- for(; number < num_points; number++){ ++- *iBufferPtr++ = *complexVectorPtr++; ++- complexVectorPtr++; ++- } +++ unsigned int number = 0; +++ const int16_t* complexVectorPtr = (int16_t*)complexVector; +++ int16_t* iBufferPtr = iBuffer; +++ +++ __m256i iMoveMask1 = _mm256_set_epi8(0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 13, +++ 12, +++ 9, +++ 8, +++ 5, +++ 4, +++ 1, +++ 0, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 13, +++ 12, +++ 9, +++ 8, +++ 5, +++ 4, +++ 1, +++ 0); +++ __m256i iMoveMask2 = _mm256_set_epi8(13, +++ 12, +++ 9, +++ 8, +++ 5, +++ 4, +++ 1, +++ 0, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 13, +++ 12, +++ 9, +++ 8, +++ 5, +++ 4, +++ 1, +++ 0, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80); +++ +++ __m256i complexVal1, complexVal2, iOutputVal; +++ +++ unsigned int sixteenthPoints = num_points / 16; +++ +++ for (number = 0; number < sixteenthPoints; number++) { +++ complexVal1 = _mm256_loadu_si256((__m256i*)complexVectorPtr); +++ complexVectorPtr += 16; +++ complexVal2 = _mm256_loadu_si256((__m256i*)complexVectorPtr); +++ complexVectorPtr += 16; +++ +++ complexVal1 = _mm256_shuffle_epi8(complexVal1, iMoveMask1); +++ complexVal2 = _mm256_shuffle_epi8(complexVal2, iMoveMask2); +++ +++ iOutputVal = _mm256_or_si256(complexVal1, complexVal2); +++ iOutputVal = _mm256_permute4x64_epi64(iOutputVal, 0xd8); +++ +++ _mm256_storeu_si256((__m256i*)iBufferPtr, iOutputVal); +++ +++ iBufferPtr += 16; +++ } +++ +++ number = sixteenthPoints * 16; +++ for (; number < num_points; number++) { +++ *iBufferPtr++ = *complexVectorPtr++; +++ complexVectorPtr++; +++ } ++ } ++ 
#endif /* LV_HAVE_AVX2 */ ++ ++diff --git a/kernels/volk/volk_16ic_deinterleave_real_8i.h b/kernels/volk/volk_16ic_deinterleave_real_8i.h ++index 1022688..3d8e4ea 100644 ++--- a/kernels/volk/volk_16ic_deinterleave_real_8i.h +++++ b/kernels/volk/volk_16ic_deinterleave_real_8i.h ++@@ -30,8 +30,8 @@ ++ * ++ * Dispatcher Prototype ++ * \code ++- * void volk_16ic_deinterleave_real_8i(int8_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points) ++- * \endcode +++ * void volk_16ic_deinterleave_real_8i(int8_t* iBuffer, const lv_16sc_t* complexVector, +++ * unsigned int num_points) \endcode ++ * ++ * \b Inputs ++ * \li complexVector: The complex input vector. ++@@ -61,54 +61,121 @@ ++ #ifdef LV_HAVE_AVX2 ++ #include ++ ++-static inline void ++-volk_16ic_deinterleave_real_8i_a_avx2(int8_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points) +++static inline void volk_16ic_deinterleave_real_8i_a_avx2(int8_t* iBuffer, +++ const lv_16sc_t* complexVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const int8_t* complexVectorPtr = (int8_t*)complexVector; ++- int8_t* iBufferPtr = iBuffer; ++- __m256i iMoveMask1 = _mm256_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0); ++- __m256i iMoveMask2 = _mm256_set_epi8(13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80); ++- __m256i complexVal1, complexVal2, complexVal3, complexVal4, iOutputVal; ++- ++- unsigned int thirtysecondPoints = num_points / 32; ++- ++- for(number = 0; number < thirtysecondPoints; number++){ ++- complexVal1 = _mm256_load_si256((__m256i*)complexVectorPtr); complexVectorPtr += 32; ++- complexVal2 = _mm256_load_si256((__m256i*)complexVectorPtr); complexVectorPtr += 32; ++- ++- complexVal3 = _mm256_load_si256((__m256i*)complexVectorPtr); complexVectorPtr += 32; ++- complexVal4 = _mm256_load_si256((__m256i*)complexVectorPtr); complexVectorPtr += 32; ++- ++- complexVal1 = _mm256_shuffle_epi8(complexVal1, iMoveMask1); ++- complexVal2 = _mm256_shuffle_epi8(complexVal2, iMoveMask2); ++- ++- complexVal1 = _mm256_or_si256(complexVal1, complexVal2); ++- complexVal1 = _mm256_permute4x64_epi64(complexVal1, 0xd8); ++- ++- complexVal3 = _mm256_shuffle_epi8(complexVal3, iMoveMask1); ++- complexVal4 = _mm256_shuffle_epi8(complexVal4, iMoveMask2); ++- ++- complexVal3 = _mm256_or_si256(complexVal3, complexVal4); ++- complexVal3 = _mm256_permute4x64_epi64(complexVal3, 0xd8); ++- ++- complexVal1 = _mm256_srai_epi16(complexVal1, 8); ++- complexVal3 = _mm256_srai_epi16(complexVal3, 8); ++- ++- iOutputVal = _mm256_packs_epi16(complexVal1, complexVal3); ++- iOutputVal = _mm256_permute4x64_epi64(iOutputVal, 0xd8); ++- ++- _mm256_store_si256((__m256i*)iBufferPtr, iOutputVal); ++- ++- iBufferPtr += 32; ++- } ++- ++- number = thirtysecondPoints * 32; ++- int16_t* int16ComplexVectorPtr = (int16_t*)complexVectorPtr; ++- for(; number < num_points; number++){ ++- *iBufferPtr++ = ((int8_t)(*int16ComplexVectorPtr++ >> 8)); ++- int16ComplexVectorPtr++; ++- } +++ unsigned int number = 0; +++ const int8_t* complexVectorPtr = (int8_t*)complexVector; +++ int8_t* iBufferPtr = iBuffer; +++ __m256i iMoveMask1 = _mm256_set_epi8(0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 13, +++ 12, +++ 9, +++ 8, +++ 5, +++ 4, +++ 1, +++ 0, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 
0x80, +++ 13, +++ 12, +++ 9, +++ 8, +++ 5, +++ 4, +++ 1, +++ 0); +++ __m256i iMoveMask2 = _mm256_set_epi8(13, +++ 12, +++ 9, +++ 8, +++ 5, +++ 4, +++ 1, +++ 0, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 13, +++ 12, +++ 9, +++ 8, +++ 5, +++ 4, +++ 1, +++ 0, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80); +++ __m256i complexVal1, complexVal2, complexVal3, complexVal4, iOutputVal; +++ +++ unsigned int thirtysecondPoints = num_points / 32; +++ +++ for (number = 0; number < thirtysecondPoints; number++) { +++ complexVal1 = _mm256_load_si256((__m256i*)complexVectorPtr); +++ complexVectorPtr += 32; +++ complexVal2 = _mm256_load_si256((__m256i*)complexVectorPtr); +++ complexVectorPtr += 32; +++ +++ complexVal3 = _mm256_load_si256((__m256i*)complexVectorPtr); +++ complexVectorPtr += 32; +++ complexVal4 = _mm256_load_si256((__m256i*)complexVectorPtr); +++ complexVectorPtr += 32; +++ +++ complexVal1 = _mm256_shuffle_epi8(complexVal1, iMoveMask1); +++ complexVal2 = _mm256_shuffle_epi8(complexVal2, iMoveMask2); +++ +++ complexVal1 = _mm256_or_si256(complexVal1, complexVal2); +++ complexVal1 = _mm256_permute4x64_epi64(complexVal1, 0xd8); +++ +++ complexVal3 = _mm256_shuffle_epi8(complexVal3, iMoveMask1); +++ complexVal4 = _mm256_shuffle_epi8(complexVal4, iMoveMask2); +++ +++ complexVal3 = _mm256_or_si256(complexVal3, complexVal4); +++ complexVal3 = _mm256_permute4x64_epi64(complexVal3, 0xd8); +++ +++ complexVal1 = _mm256_srai_epi16(complexVal1, 8); +++ complexVal3 = _mm256_srai_epi16(complexVal3, 8); +++ +++ iOutputVal = _mm256_packs_epi16(complexVal1, complexVal3); +++ iOutputVal = _mm256_permute4x64_epi64(iOutputVal, 0xd8); +++ +++ _mm256_store_si256((__m256i*)iBufferPtr, iOutputVal); +++ +++ iBufferPtr += 32; +++ } +++ +++ number = thirtysecondPoints * 32; +++ int16_t* int16ComplexVectorPtr = (int16_t*)complexVectorPtr; +++ for (; number < num_points; number++) { +++ *iBufferPtr++ = ((int8_t)(*int16ComplexVectorPtr++ >> 8)); +++ int16ComplexVectorPtr++; +++ } ++ } ++ #endif /* LV_HAVE_AVX2 */ ++ ++@@ -116,105 +183,116 @@ volk_16ic_deinterleave_real_8i_a_avx2(int8_t* iBuffer, const lv_16sc_t* complexV ++ #ifdef LV_HAVE_SSSE3 ++ #include ++ ++-static inline void ++-volk_16ic_deinterleave_real_8i_a_ssse3(int8_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points) +++static inline void volk_16ic_deinterleave_real_8i_a_ssse3(int8_t* iBuffer, +++ const lv_16sc_t* complexVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const int8_t* complexVectorPtr = (int8_t*)complexVector; ++- int8_t* iBufferPtr = iBuffer; ++- __m128i iMoveMask1 = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0); ++- __m128i iMoveMask2 = _mm_set_epi8(13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80); ++- __m128i complexVal1, complexVal2, complexVal3, complexVal4, iOutputVal; +++ unsigned int number = 0; +++ const int8_t* complexVectorPtr = (int8_t*)complexVector; +++ int8_t* iBufferPtr = iBuffer; +++ __m128i iMoveMask1 = _mm_set_epi8( +++ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0); +++ __m128i iMoveMask2 = _mm_set_epi8( +++ 13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80); +++ __m128i complexVal1, complexVal2, complexVal3, complexVal4, iOutputVal; ++ ++- unsigned int sixteenthPoints = num_points / 16; +++ unsigned int sixteenthPoints = num_points / 16; ++ ++- for(number = 0; number < sixteenthPoints; number++){ 
++- complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; ++- complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; +++ for (number = 0; number < sixteenthPoints; number++) { +++ complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr); +++ complexVectorPtr += 16; +++ complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr); +++ complexVectorPtr += 16; ++ ++- complexVal3 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; ++- complexVal4 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; +++ complexVal3 = _mm_load_si128((__m128i*)complexVectorPtr); +++ complexVectorPtr += 16; +++ complexVal4 = _mm_load_si128((__m128i*)complexVectorPtr); +++ complexVectorPtr += 16; ++ ++- complexVal1 = _mm_shuffle_epi8(complexVal1, iMoveMask1); ++- complexVal2 = _mm_shuffle_epi8(complexVal2, iMoveMask2); +++ complexVal1 = _mm_shuffle_epi8(complexVal1, iMoveMask1); +++ complexVal2 = _mm_shuffle_epi8(complexVal2, iMoveMask2); ++ ++- complexVal1 = _mm_or_si128(complexVal1, complexVal2); +++ complexVal1 = _mm_or_si128(complexVal1, complexVal2); ++ ++- complexVal3 = _mm_shuffle_epi8(complexVal3, iMoveMask1); ++- complexVal4 = _mm_shuffle_epi8(complexVal4, iMoveMask2); +++ complexVal3 = _mm_shuffle_epi8(complexVal3, iMoveMask1); +++ complexVal4 = _mm_shuffle_epi8(complexVal4, iMoveMask2); ++ ++- complexVal3 = _mm_or_si128(complexVal3, complexVal4); +++ complexVal3 = _mm_or_si128(complexVal3, complexVal4); ++ ++ ++- complexVal1 = _mm_srai_epi16(complexVal1, 8); ++- complexVal3 = _mm_srai_epi16(complexVal3, 8); +++ complexVal1 = _mm_srai_epi16(complexVal1, 8); +++ complexVal3 = _mm_srai_epi16(complexVal3, 8); ++ ++- iOutputVal = _mm_packs_epi16(complexVal1, complexVal3); +++ iOutputVal = _mm_packs_epi16(complexVal1, complexVal3); ++ ++- _mm_store_si128((__m128i*)iBufferPtr, iOutputVal); +++ _mm_store_si128((__m128i*)iBufferPtr, iOutputVal); ++ ++- iBufferPtr += 16; ++- } +++ iBufferPtr += 16; +++ } ++ ++- number = sixteenthPoints * 16; ++- int16_t* int16ComplexVectorPtr = (int16_t*)complexVectorPtr; ++- for(; number < num_points; number++){ ++- *iBufferPtr++ = ((int8_t)(*int16ComplexVectorPtr++ >> 8)); ++- int16ComplexVectorPtr++; ++- } +++ number = sixteenthPoints * 16; +++ int16_t* int16ComplexVectorPtr = (int16_t*)complexVectorPtr; +++ for (; number < num_points; number++) { +++ *iBufferPtr++ = ((int8_t)(*int16ComplexVectorPtr++ >> 8)); +++ int16ComplexVectorPtr++; +++ } ++ } ++ #endif /* LV_HAVE_SSSE3 */ ++ ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void ++-volk_16ic_deinterleave_real_8i_generic(int8_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points) +++static inline void volk_16ic_deinterleave_real_8i_generic(int8_t* iBuffer, +++ const lv_16sc_t* complexVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- int16_t* complexVectorPtr = (int16_t*)complexVector; ++- int8_t* iBufferPtr = iBuffer; ++- for(number = 0; number < num_points; number++){ ++- *iBufferPtr++ = ((int8_t)(*complexVectorPtr++ >> 8)); ++- complexVectorPtr++; ++- } +++ unsigned int number = 0; +++ int16_t* complexVectorPtr = (int16_t*)complexVector; +++ int8_t* iBufferPtr = iBuffer; +++ for (number = 0; number < num_points; number++) { +++ *iBufferPtr++ = ((int8_t)(*complexVectorPtr++ >> 8)); +++ complexVectorPtr++; +++ } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++ #ifdef LV_HAVE_NEON ++ #include ++ ++-static inline void ++-volk_16ic_deinterleave_real_8i_neon(int8_t* iBuffer, const lv_16sc_t* complexVector, 
unsigned int num_points) +++static inline void volk_16ic_deinterleave_real_8i_neon(int8_t* iBuffer, +++ const lv_16sc_t* complexVector, +++ unsigned int num_points) ++ { ++- const int16_t* complexVectorPtr = (const int16_t*)complexVector; ++- int8_t* iBufferPtr = iBuffer; ++- unsigned int eighth_points = num_points / 8; ++- unsigned int number; ++- ++- int16x8x2_t complexInput; ++- int8x8_t realOutput; ++- for(number = 0; number < eighth_points; number++){ ++- complexInput = vld2q_s16(complexVectorPtr); ++- realOutput = vshrn_n_s16(complexInput.val[0], 8); ++- vst1_s8(iBufferPtr, realOutput); ++- complexVectorPtr += 16; ++- iBufferPtr += 8; ++- } ++- ++- for(number = eighth_points*8; number < num_points; number++){ ++- *iBufferPtr++ = ((int8_t)(*complexVectorPtr++ >> 8)); ++- complexVectorPtr++; ++- } +++ const int16_t* complexVectorPtr = (const int16_t*)complexVector; +++ int8_t* iBufferPtr = iBuffer; +++ unsigned int eighth_points = num_points / 8; +++ unsigned int number; +++ +++ int16x8x2_t complexInput; +++ int8x8_t realOutput; +++ for (number = 0; number < eighth_points; number++) { +++ complexInput = vld2q_s16(complexVectorPtr); +++ realOutput = vshrn_n_s16(complexInput.val[0], 8); +++ vst1_s8(iBufferPtr, realOutput); +++ complexVectorPtr += 16; +++ iBufferPtr += 8; +++ } +++ +++ for (number = eighth_points * 8; number < num_points; number++) { +++ *iBufferPtr++ = ((int8_t)(*complexVectorPtr++ >> 8)); +++ complexVectorPtr++; +++ } ++ } ++ #endif ++ ++ #ifdef LV_HAVE_ORC ++ ++-extern void ++-volk_16ic_deinterleave_real_8i_a_orc_impl(int8_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points); +++extern void volk_16ic_deinterleave_real_8i_a_orc_impl(int8_t* iBuffer, +++ const lv_16sc_t* complexVector, +++ unsigned int num_points); ++ ++-static inline void ++-volk_16ic_deinterleave_real_8i_u_orc(int8_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points) +++static inline void volk_16ic_deinterleave_real_8i_u_orc(int8_t* iBuffer, +++ const lv_16sc_t* complexVector, +++ unsigned int num_points) ++ { ++ volk_16ic_deinterleave_real_8i_a_orc_impl(iBuffer, complexVector, num_points); ++ } ++@@ -233,54 +311,121 @@ volk_16ic_deinterleave_real_8i_u_orc(int8_t* iBuffer, const lv_16sc_t* complexVe ++ #ifdef LV_HAVE_AVX2 ++ #include ++ ++-static inline void ++-volk_16ic_deinterleave_real_8i_u_avx2(int8_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points) +++static inline void volk_16ic_deinterleave_real_8i_u_avx2(int8_t* iBuffer, +++ const lv_16sc_t* complexVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const int8_t* complexVectorPtr = (int8_t*)complexVector; ++- int8_t* iBufferPtr = iBuffer; ++- __m256i iMoveMask1 = _mm256_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0); ++- __m256i iMoveMask2 = _mm256_set_epi8(13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80); ++- __m256i complexVal1, complexVal2, complexVal3, complexVal4, iOutputVal; ++- ++- unsigned int thirtysecondPoints = num_points / 32; ++- ++- for(number = 0; number < thirtysecondPoints; number++){ ++- complexVal1 = _mm256_loadu_si256((__m256i*)complexVectorPtr); complexVectorPtr += 32; ++- complexVal2 = _mm256_loadu_si256((__m256i*)complexVectorPtr); complexVectorPtr += 32; ++- ++- complexVal3 = _mm256_loadu_si256((__m256i*)complexVectorPtr); complexVectorPtr += 
32; ++- complexVal4 = _mm256_loadu_si256((__m256i*)complexVectorPtr); complexVectorPtr += 32; ++- ++- complexVal1 = _mm256_shuffle_epi8(complexVal1, iMoveMask1); ++- complexVal2 = _mm256_shuffle_epi8(complexVal2, iMoveMask2); ++- ++- complexVal1 = _mm256_or_si256(complexVal1, complexVal2); ++- complexVal1 = _mm256_permute4x64_epi64(complexVal1, 0xd8); ++- ++- complexVal3 = _mm256_shuffle_epi8(complexVal3, iMoveMask1); ++- complexVal4 = _mm256_shuffle_epi8(complexVal4, iMoveMask2); ++- ++- complexVal3 = _mm256_or_si256(complexVal3, complexVal4); ++- complexVal3 = _mm256_permute4x64_epi64(complexVal3, 0xd8); ++- ++- complexVal1 = _mm256_srai_epi16(complexVal1, 8); ++- complexVal3 = _mm256_srai_epi16(complexVal3, 8); ++- ++- iOutputVal = _mm256_packs_epi16(complexVal1, complexVal3); ++- iOutputVal = _mm256_permute4x64_epi64(iOutputVal, 0xd8); ++- ++- _mm256_storeu_si256((__m256i*)iBufferPtr, iOutputVal); ++- ++- iBufferPtr += 32; ++- } ++- ++- number = thirtysecondPoints * 32; ++- int16_t* int16ComplexVectorPtr = (int16_t*)complexVectorPtr; ++- for(; number < num_points; number++){ ++- *iBufferPtr++ = ((int8_t)(*int16ComplexVectorPtr++ >> 8)); ++- int16ComplexVectorPtr++; ++- } +++ unsigned int number = 0; +++ const int8_t* complexVectorPtr = (int8_t*)complexVector; +++ int8_t* iBufferPtr = iBuffer; +++ __m256i iMoveMask1 = _mm256_set_epi8(0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 13, +++ 12, +++ 9, +++ 8, +++ 5, +++ 4, +++ 1, +++ 0, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 13, +++ 12, +++ 9, +++ 8, +++ 5, +++ 4, +++ 1, +++ 0); +++ __m256i iMoveMask2 = _mm256_set_epi8(13, +++ 12, +++ 9, +++ 8, +++ 5, +++ 4, +++ 1, +++ 0, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 13, +++ 12, +++ 9, +++ 8, +++ 5, +++ 4, +++ 1, +++ 0, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80); +++ __m256i complexVal1, complexVal2, complexVal3, complexVal4, iOutputVal; +++ +++ unsigned int thirtysecondPoints = num_points / 32; +++ +++ for (number = 0; number < thirtysecondPoints; number++) { +++ complexVal1 = _mm256_loadu_si256((__m256i*)complexVectorPtr); +++ complexVectorPtr += 32; +++ complexVal2 = _mm256_loadu_si256((__m256i*)complexVectorPtr); +++ complexVectorPtr += 32; +++ +++ complexVal3 = _mm256_loadu_si256((__m256i*)complexVectorPtr); +++ complexVectorPtr += 32; +++ complexVal4 = _mm256_loadu_si256((__m256i*)complexVectorPtr); +++ complexVectorPtr += 32; +++ +++ complexVal1 = _mm256_shuffle_epi8(complexVal1, iMoveMask1); +++ complexVal2 = _mm256_shuffle_epi8(complexVal2, iMoveMask2); +++ +++ complexVal1 = _mm256_or_si256(complexVal1, complexVal2); +++ complexVal1 = _mm256_permute4x64_epi64(complexVal1, 0xd8); +++ +++ complexVal3 = _mm256_shuffle_epi8(complexVal3, iMoveMask1); +++ complexVal4 = _mm256_shuffle_epi8(complexVal4, iMoveMask2); +++ +++ complexVal3 = _mm256_or_si256(complexVal3, complexVal4); +++ complexVal3 = _mm256_permute4x64_epi64(complexVal3, 0xd8); +++ +++ complexVal1 = _mm256_srai_epi16(complexVal1, 8); +++ complexVal3 = _mm256_srai_epi16(complexVal3, 8); +++ +++ iOutputVal = _mm256_packs_epi16(complexVal1, complexVal3); +++ iOutputVal = _mm256_permute4x64_epi64(iOutputVal, 0xd8); +++ +++ _mm256_storeu_si256((__m256i*)iBufferPtr, iOutputVal); +++ +++ iBufferPtr += 32; +++ } +++ +++ number = thirtysecondPoints * 32; +++ int16_t* int16ComplexVectorPtr = (int16_t*)complexVectorPtr; +++ for (; number < num_points; number++) { +++ *iBufferPtr++ = 
((int8_t)(*int16ComplexVectorPtr++ >> 8)); +++ int16ComplexVectorPtr++; +++ } ++ } ++ #endif /* LV_HAVE_AVX2 */ ++ #endif /* INCLUDED_volk_16ic_deinterleave_real_8i_u_H */ ++diff --git a/kernels/volk/volk_16ic_magnitude_16i.h b/kernels/volk/volk_16ic_magnitude_16i.h ++index bbe72a8..35b40cb 100644 ++--- a/kernels/volk/volk_16ic_magnitude_16i.h +++++ b/kernels/volk/volk_16ic_magnitude_16i.h ++@@ -30,8 +30,8 @@ ++ * ++ * Dispatcher Prototype ++ * \code ++- * void volk_16ic_magnitude_16i(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points) ++- * \endcode +++ * void volk_16ic_magnitude_16i(int16_t* magnitudeVector, const lv_16sc_t* complexVector, +++ * unsigned int num_points) \endcode ++ * ++ * \b Inputs ++ * \li complexVector: The complex input vector. ++@@ -54,242 +54,255 @@ ++ #ifndef INCLUDED_volk_16ic_magnitude_16i_a_H ++ #define INCLUDED_volk_16ic_magnitude_16i_a_H ++ ++-#include ++ #include ++-#include ++-#include ++ #include +++#include +++#include +++#include ++ ++ #ifdef LV_HAVE_AVX2 ++ #include ++ ++-static inline void ++-volk_16ic_magnitude_16i_a_avx2(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points) +++static inline void volk_16ic_magnitude_16i_a_avx2(int16_t* magnitudeVector, +++ const lv_16sc_t* complexVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int eighthPoints = num_points / 8; ++- ++- const int16_t* complexVectorPtr = (const int16_t*)complexVector; ++- int16_t* magnitudeVectorPtr = magnitudeVector; ++- ++- __m256 vScalar = _mm256_set1_ps(SHRT_MAX); ++- __m256 invScalar = _mm256_set1_ps(1.0f/SHRT_MAX); ++- __m256i int1, int2; ++- __m128i short1, short2; ++- __m256 cplxValue1, cplxValue2, result; ++- __m256i idx = _mm256_set_epi32(0,0,0,0,5,1,4,0); ++- ++- for(;number < eighthPoints; number++){ ++- ++- int1 = _mm256_load_si256((__m256i*)complexVectorPtr); ++- complexVectorPtr += 16; ++- short1 = _mm256_extracti128_si256(int1,0); ++- short2 = _mm256_extracti128_si256(int1,1); ++- ++- int1 = _mm256_cvtepi16_epi32(short1); ++- int2 = _mm256_cvtepi16_epi32(short2); ++- cplxValue1 = _mm256_cvtepi32_ps(int1); ++- cplxValue2 = _mm256_cvtepi32_ps(int2); ++- ++- cplxValue1 = _mm256_mul_ps(cplxValue1, invScalar); ++- cplxValue2 = _mm256_mul_ps(cplxValue2, invScalar); ++- ++- cplxValue1 = _mm256_mul_ps(cplxValue1, cplxValue1); // Square the values ++- cplxValue2 = _mm256_mul_ps(cplxValue2, cplxValue2); // Square the Values ++- ++- result = _mm256_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values ++- ++- result = _mm256_sqrt_ps(result); // Square root the values ++- ++- result = _mm256_mul_ps(result, vScalar); // Scale the results ++- ++- int1 = _mm256_cvtps_epi32(result); ++- int1 = _mm256_packs_epi32(int1, int1); ++- int1 = _mm256_permutevar8x32_epi32(int1, idx); //permute to compensate for shuffling in hadd and packs ++- short1 = _mm256_extracti128_si256(int1, 0); ++- _mm_store_si128((__m128i*)magnitudeVectorPtr,short1); ++- magnitudeVectorPtr += 8; ++- } ++- ++- number = eighthPoints * 8; ++- magnitudeVectorPtr = &magnitudeVector[number]; ++- complexVectorPtr = (const int16_t*)&complexVector[number]; ++- for(; number < num_points; number++){ ++- const float val1Real = (float)(*complexVectorPtr++) / SHRT_MAX; ++- const float val1Imag = (float)(*complexVectorPtr++) / SHRT_MAX; ++- const float val1Result = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)) * SHRT_MAX; ++- *magnitudeVectorPtr++ = (int16_t)rintf(val1Result); ++- } +++ unsigned int number = 0; +++ const 
unsigned int eighthPoints = num_points / 8; +++ +++ const int16_t* complexVectorPtr = (const int16_t*)complexVector; +++ int16_t* magnitudeVectorPtr = magnitudeVector; +++ +++ __m256 vScalar = _mm256_set1_ps(SHRT_MAX); +++ __m256 invScalar = _mm256_set1_ps(1.0f / SHRT_MAX); +++ __m256i int1, int2; +++ __m128i short1, short2; +++ __m256 cplxValue1, cplxValue2, result; +++ __m256i idx = _mm256_set_epi32(0, 0, 0, 0, 5, 1, 4, 0); +++ +++ for (; number < eighthPoints; number++) { +++ +++ int1 = _mm256_load_si256((__m256i*)complexVectorPtr); +++ complexVectorPtr += 16; +++ short1 = _mm256_extracti128_si256(int1, 0); +++ short2 = _mm256_extracti128_si256(int1, 1); +++ +++ int1 = _mm256_cvtepi16_epi32(short1); +++ int2 = _mm256_cvtepi16_epi32(short2); +++ cplxValue1 = _mm256_cvtepi32_ps(int1); +++ cplxValue2 = _mm256_cvtepi32_ps(int2); +++ +++ cplxValue1 = _mm256_mul_ps(cplxValue1, invScalar); +++ cplxValue2 = _mm256_mul_ps(cplxValue2, invScalar); +++ +++ cplxValue1 = _mm256_mul_ps(cplxValue1, cplxValue1); // Square the values +++ cplxValue2 = _mm256_mul_ps(cplxValue2, cplxValue2); // Square the Values +++ +++ result = _mm256_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values +++ +++ result = _mm256_sqrt_ps(result); // Square root the values +++ +++ result = _mm256_mul_ps(result, vScalar); // Scale the results +++ +++ int1 = _mm256_cvtps_epi32(result); +++ int1 = _mm256_packs_epi32(int1, int1); +++ int1 = _mm256_permutevar8x32_epi32( +++ int1, idx); // permute to compensate for shuffling in hadd and packs +++ short1 = _mm256_extracti128_si256(int1, 0); +++ _mm_store_si128((__m128i*)magnitudeVectorPtr, short1); +++ magnitudeVectorPtr += 8; +++ } +++ +++ number = eighthPoints * 8; +++ magnitudeVectorPtr = &magnitudeVector[number]; +++ complexVectorPtr = (const int16_t*)&complexVector[number]; +++ for (; number < num_points; number++) { +++ const float val1Real = (float)(*complexVectorPtr++) / SHRT_MAX; +++ const float val1Imag = (float)(*complexVectorPtr++) / SHRT_MAX; +++ const float val1Result = +++ sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)) * SHRT_MAX; +++ *magnitudeVectorPtr++ = (int16_t)rintf(val1Result); +++ } ++ } ++ #endif /* LV_HAVE_AVX2 */ ++ ++ #ifdef LV_HAVE_SSE3 ++ #include ++ ++-static inline void ++-volk_16ic_magnitude_16i_a_sse3(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points) +++static inline void volk_16ic_magnitude_16i_a_sse3(int16_t* magnitudeVector, +++ const lv_16sc_t* complexVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int quarterPoints = num_points / 4; +++ unsigned int number = 0; +++ const unsigned int quarterPoints = num_points / 4; ++ ++- const int16_t* complexVectorPtr = (const int16_t*)complexVector; ++- int16_t* magnitudeVectorPtr = magnitudeVector; +++ const int16_t* complexVectorPtr = (const int16_t*)complexVector; +++ int16_t* magnitudeVectorPtr = magnitudeVector; ++ ++- __m128 vScalar = _mm_set_ps1(SHRT_MAX); ++- __m128 invScalar = _mm_set_ps1(1.0f/SHRT_MAX); +++ __m128 vScalar = _mm_set_ps1(SHRT_MAX); +++ __m128 invScalar = _mm_set_ps1(1.0f / SHRT_MAX); ++ ++- __m128 cplxValue1, cplxValue2, result; +++ __m128 cplxValue1, cplxValue2, result; ++ ++- __VOLK_ATTR_ALIGNED(16) float inputFloatBuffer[8]; ++- __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4]; +++ __VOLK_ATTR_ALIGNED(16) float inputFloatBuffer[8]; +++ __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4]; ++ ++- for(;number < quarterPoints; number++){ +++ for (; number < quarterPoints; number++) { ++ ++- 
inputFloatBuffer[0] = (float)(complexVectorPtr[0]); ++- inputFloatBuffer[1] = (float)(complexVectorPtr[1]); ++- inputFloatBuffer[2] = (float)(complexVectorPtr[2]); ++- inputFloatBuffer[3] = (float)(complexVectorPtr[3]); +++ inputFloatBuffer[0] = (float)(complexVectorPtr[0]); +++ inputFloatBuffer[1] = (float)(complexVectorPtr[1]); +++ inputFloatBuffer[2] = (float)(complexVectorPtr[2]); +++ inputFloatBuffer[3] = (float)(complexVectorPtr[3]); ++ ++- inputFloatBuffer[4] = (float)(complexVectorPtr[4]); ++- inputFloatBuffer[5] = (float)(complexVectorPtr[5]); ++- inputFloatBuffer[6] = (float)(complexVectorPtr[6]); ++- inputFloatBuffer[7] = (float)(complexVectorPtr[7]); +++ inputFloatBuffer[4] = (float)(complexVectorPtr[4]); +++ inputFloatBuffer[5] = (float)(complexVectorPtr[5]); +++ inputFloatBuffer[6] = (float)(complexVectorPtr[6]); +++ inputFloatBuffer[7] = (float)(complexVectorPtr[7]); ++ ++- cplxValue1 = _mm_load_ps(&inputFloatBuffer[0]); ++- cplxValue2 = _mm_load_ps(&inputFloatBuffer[4]); +++ cplxValue1 = _mm_load_ps(&inputFloatBuffer[0]); +++ cplxValue2 = _mm_load_ps(&inputFloatBuffer[4]); ++ ++- complexVectorPtr += 8; +++ complexVectorPtr += 8; ++ ++- cplxValue1 = _mm_mul_ps(cplxValue1, invScalar); ++- cplxValue2 = _mm_mul_ps(cplxValue2, invScalar); +++ cplxValue1 = _mm_mul_ps(cplxValue1, invScalar); +++ cplxValue2 = _mm_mul_ps(cplxValue2, invScalar); ++ ++- cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values ++- cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values +++ cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values +++ cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values ++ ++- result = _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values +++ result = _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values ++ ++- result = _mm_sqrt_ps(result); // Square root the values +++ result = _mm_sqrt_ps(result); // Square root the values ++ ++- result = _mm_mul_ps(result, vScalar); // Scale the results +++ result = _mm_mul_ps(result, vScalar); // Scale the results ++ ++- _mm_store_ps(outputFloatBuffer, result); ++- *magnitudeVectorPtr++ = (int16_t)rintf(outputFloatBuffer[0]); ++- *magnitudeVectorPtr++ = (int16_t)rintf(outputFloatBuffer[1]); ++- *magnitudeVectorPtr++ = (int16_t)rintf(outputFloatBuffer[2]); ++- *magnitudeVectorPtr++ = (int16_t)rintf(outputFloatBuffer[3]); ++- } +++ _mm_store_ps(outputFloatBuffer, result); +++ *magnitudeVectorPtr++ = (int16_t)rintf(outputFloatBuffer[0]); +++ *magnitudeVectorPtr++ = (int16_t)rintf(outputFloatBuffer[1]); +++ *magnitudeVectorPtr++ = (int16_t)rintf(outputFloatBuffer[2]); +++ *magnitudeVectorPtr++ = (int16_t)rintf(outputFloatBuffer[3]); +++ } ++ ++- number = quarterPoints * 4; ++- magnitudeVectorPtr = &magnitudeVector[number]; ++- complexVectorPtr = (const int16_t*)&complexVector[number]; ++- for(; number < num_points; number++){ ++- const float val1Real = (float)(*complexVectorPtr++) / SHRT_MAX; ++- const float val1Imag = (float)(*complexVectorPtr++) / SHRT_MAX; ++- const float val1Result = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)) * SHRT_MAX; ++- *magnitudeVectorPtr++ = (int16_t)rintf(val1Result); ++- } +++ number = quarterPoints * 4; +++ magnitudeVectorPtr = &magnitudeVector[number]; +++ complexVectorPtr = (const int16_t*)&complexVector[number]; +++ for (; number < num_points; number++) { +++ const float val1Real = (float)(*complexVectorPtr++) / SHRT_MAX; +++ const float val1Imag = (float)(*complexVectorPtr++) / SHRT_MAX; +++ const float val1Result = 
+++ sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)) * SHRT_MAX; +++ *magnitudeVectorPtr++ = (int16_t)rintf(val1Result); +++ } ++ } ++ #endif /* LV_HAVE_SSE3 */ ++ ++ #ifdef LV_HAVE_SSE ++ #include ++ ++-static inline void ++-volk_16ic_magnitude_16i_a_sse(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points) +++static inline void volk_16ic_magnitude_16i_a_sse(int16_t* magnitudeVector, +++ const lv_16sc_t* complexVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int quarterPoints = num_points / 4; +++ unsigned int number = 0; +++ const unsigned int quarterPoints = num_points / 4; ++ ++- const int16_t* complexVectorPtr = (const int16_t*)complexVector; ++- int16_t* magnitudeVectorPtr = magnitudeVector; +++ const int16_t* complexVectorPtr = (const int16_t*)complexVector; +++ int16_t* magnitudeVectorPtr = magnitudeVector; ++ ++- __m128 vScalar = _mm_set_ps1(SHRT_MAX); ++- __m128 invScalar = _mm_set_ps1(1.0f/SHRT_MAX); +++ __m128 vScalar = _mm_set_ps1(SHRT_MAX); +++ __m128 invScalar = _mm_set_ps1(1.0f / SHRT_MAX); ++ ++- __m128 cplxValue1, cplxValue2, iValue, qValue, result; +++ __m128 cplxValue1, cplxValue2, iValue, qValue, result; ++ ++- __VOLK_ATTR_ALIGNED(16) float inputFloatBuffer[4]; ++- __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4]; +++ __VOLK_ATTR_ALIGNED(16) float inputFloatBuffer[4]; +++ __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4]; ++ ++- for(;number < quarterPoints; number++){ +++ for (; number < quarterPoints; number++) { ++ ++- inputFloatBuffer[0] = (float)(complexVectorPtr[0]); ++- inputFloatBuffer[1] = (float)(complexVectorPtr[1]); ++- inputFloatBuffer[2] = (float)(complexVectorPtr[2]); ++- inputFloatBuffer[3] = (float)(complexVectorPtr[3]); +++ inputFloatBuffer[0] = (float)(complexVectorPtr[0]); +++ inputFloatBuffer[1] = (float)(complexVectorPtr[1]); +++ inputFloatBuffer[2] = (float)(complexVectorPtr[2]); +++ inputFloatBuffer[3] = (float)(complexVectorPtr[3]); ++ ++- cplxValue1 = _mm_load_ps(inputFloatBuffer); ++- complexVectorPtr += 4; +++ cplxValue1 = _mm_load_ps(inputFloatBuffer); +++ complexVectorPtr += 4; ++ ++- inputFloatBuffer[0] = (float)(complexVectorPtr[0]); ++- inputFloatBuffer[1] = (float)(complexVectorPtr[1]); ++- inputFloatBuffer[2] = (float)(complexVectorPtr[2]); ++- inputFloatBuffer[3] = (float)(complexVectorPtr[3]); +++ inputFloatBuffer[0] = (float)(complexVectorPtr[0]); +++ inputFloatBuffer[1] = (float)(complexVectorPtr[1]); +++ inputFloatBuffer[2] = (float)(complexVectorPtr[2]); +++ inputFloatBuffer[3] = (float)(complexVectorPtr[3]); ++ ++- cplxValue2 = _mm_load_ps(inputFloatBuffer); ++- complexVectorPtr += 4; +++ cplxValue2 = _mm_load_ps(inputFloatBuffer); +++ complexVectorPtr += 4; ++ ++- cplxValue1 = _mm_mul_ps(cplxValue1, invScalar); ++- cplxValue2 = _mm_mul_ps(cplxValue2, invScalar); +++ cplxValue1 = _mm_mul_ps(cplxValue1, invScalar); +++ cplxValue2 = _mm_mul_ps(cplxValue2, invScalar); ++ ++- // Arrange in i1i2i3i4 format ++- iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); ++- // Arrange in q1q2q3q4 format ++- qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1)); +++ // Arrange in i1i2i3i4 format +++ iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2, 0, 2, 0)); +++ // Arrange in q1q2q3q4 format +++ qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3, 1, 3, 1)); ++ ++- iValue = _mm_mul_ps(iValue, iValue); // Square the I values ++- qValue = _mm_mul_ps(qValue, qValue); // Square the Q Values +++ iValue = 
_mm_mul_ps(iValue, iValue); // Square the I values +++ qValue = _mm_mul_ps(qValue, qValue); // Square the Q Values ++ ++- result = _mm_add_ps(iValue, qValue); // Add the I2 and Q2 values +++ result = _mm_add_ps(iValue, qValue); // Add the I2 and Q2 values ++ ++- result = _mm_sqrt_ps(result); // Square root the values +++ result = _mm_sqrt_ps(result); // Square root the values ++ ++- result = _mm_mul_ps(result, vScalar); // Scale the results +++ result = _mm_mul_ps(result, vScalar); // Scale the results ++ ++- _mm_store_ps(outputFloatBuffer, result); ++- *magnitudeVectorPtr++ = (int16_t)rintf(outputFloatBuffer[0]); ++- *magnitudeVectorPtr++ = (int16_t)rintf(outputFloatBuffer[1]); ++- *magnitudeVectorPtr++ = (int16_t)rintf(outputFloatBuffer[2]); ++- *magnitudeVectorPtr++ = (int16_t)rintf(outputFloatBuffer[3]); ++- } +++ _mm_store_ps(outputFloatBuffer, result); +++ *magnitudeVectorPtr++ = (int16_t)rintf(outputFloatBuffer[0]); +++ *magnitudeVectorPtr++ = (int16_t)rintf(outputFloatBuffer[1]); +++ *magnitudeVectorPtr++ = (int16_t)rintf(outputFloatBuffer[2]); +++ *magnitudeVectorPtr++ = (int16_t)rintf(outputFloatBuffer[3]); +++ } ++ ++- number = quarterPoints * 4; ++- magnitudeVectorPtr = &magnitudeVector[number]; ++- complexVectorPtr = (const int16_t*)&complexVector[number]; ++- for(; number < num_points; number++){ ++- const float val1Real = (float)(*complexVectorPtr++) / SHRT_MAX; ++- const float val1Imag = (float)(*complexVectorPtr++) / SHRT_MAX; ++- const float val1Result = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)) * SHRT_MAX; ++- *magnitudeVectorPtr++ = (int16_t)rintf(val1Result); ++- } +++ number = quarterPoints * 4; +++ magnitudeVectorPtr = &magnitudeVector[number]; +++ complexVectorPtr = (const int16_t*)&complexVector[number]; +++ for (; number < num_points; number++) { +++ const float val1Real = (float)(*complexVectorPtr++) / SHRT_MAX; +++ const float val1Imag = (float)(*complexVectorPtr++) / SHRT_MAX; +++ const float val1Result = +++ sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)) * SHRT_MAX; +++ *magnitudeVectorPtr++ = (int16_t)rintf(val1Result); +++ } ++ } ++ #endif /* LV_HAVE_SSE */ ++ ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void ++-volk_16ic_magnitude_16i_generic(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points) +++static inline void volk_16ic_magnitude_16i_generic(int16_t* magnitudeVector, +++ const lv_16sc_t* complexVector, +++ unsigned int num_points) ++ { ++- const int16_t* complexVectorPtr = (const int16_t*)complexVector; ++- int16_t* magnitudeVectorPtr = magnitudeVector; ++- unsigned int number = 0; ++- const float scalar = SHRT_MAX; ++- for(number = 0; number < num_points; number++){ ++- float real = ((float)(*complexVectorPtr++)) / scalar; ++- float imag = ((float)(*complexVectorPtr++)) / scalar; ++- *magnitudeVectorPtr++ = (int16_t)rintf(sqrtf((real*real) + (imag*imag)) * scalar); ++- } +++ const int16_t* complexVectorPtr = (const int16_t*)complexVector; +++ int16_t* magnitudeVectorPtr = magnitudeVector; +++ unsigned int number = 0; +++ const float scalar = SHRT_MAX; +++ for (number = 0; number < num_points; number++) { +++ float real = ((float)(*complexVectorPtr++)) / scalar; +++ float imag = ((float)(*complexVectorPtr++)) / scalar; +++ *magnitudeVectorPtr++ = +++ (int16_t)rintf(sqrtf((real * real) + (imag * imag)) * scalar); +++ } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++ #ifdef LV_HAVE_ORC_DISABLED ++-extern void ++-volk_16ic_magnitude_16i_a_orc_impl(int16_t* magnitudeVector, const lv_16sc_t* complexVector, float 
scalar, unsigned int num_points); ++- ++-static inline void ++-volk_16ic_magnitude_16i_u_orc(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points) +++extern void volk_16ic_magnitude_16i_a_orc_impl(int16_t* magnitudeVector, +++ const lv_16sc_t* complexVector, +++ float scalar, +++ unsigned int num_points); +++ +++static inline void volk_16ic_magnitude_16i_u_orc(int16_t* magnitudeVector, +++ const lv_16sc_t* complexVector, +++ unsigned int num_points) ++ { ++- volk_16ic_magnitude_16i_a_orc_impl(magnitudeVector, complexVector, SHRT_MAX, num_points); +++ volk_16ic_magnitude_16i_a_orc_impl( +++ magnitudeVector, complexVector, SHRT_MAX, num_points); ++ } ++ #endif /* LV_HAVE_ORC */ ++ ++@@ -300,71 +313,74 @@ volk_16ic_magnitude_16i_u_orc(int16_t* magnitudeVector, const lv_16sc_t* complex ++ #ifndef INCLUDED_volk_16ic_magnitude_16i_u_H ++ #define INCLUDED_volk_16ic_magnitude_16i_u_H ++ ++-#include ++ #include ++-#include ++ #include +++#include +++#include ++ ++ #ifdef LV_HAVE_AVX2 ++ #include ++ ++-static inline void ++-volk_16ic_magnitude_16i_u_avx2(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points) +++static inline void volk_16ic_magnitude_16i_u_avx2(int16_t* magnitudeVector, +++ const lv_16sc_t* complexVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int eighthPoints = num_points / 8; ++- ++- const int16_t* complexVectorPtr = (const int16_t*)complexVector; ++- int16_t* magnitudeVectorPtr = magnitudeVector; ++- ++- __m256 vScalar = _mm256_set1_ps(SHRT_MAX); ++- __m256 invScalar = _mm256_set1_ps(1.0f/SHRT_MAX); ++- __m256i int1, int2; ++- __m128i short1, short2; ++- __m256 cplxValue1, cplxValue2, result; ++- __m256i idx = _mm256_set_epi32(0,0,0,0,5,1,4,0); ++- ++- for(;number < eighthPoints; number++){ ++- ++- int1 = _mm256_loadu_si256((__m256i*)complexVectorPtr); ++- complexVectorPtr += 16; ++- short1 = _mm256_extracti128_si256(int1,0); ++- short2 = _mm256_extracti128_si256(int1,1); ++- ++- int1 = _mm256_cvtepi16_epi32(short1); ++- int2 = _mm256_cvtepi16_epi32(short2); ++- cplxValue1 = _mm256_cvtepi32_ps(int1); ++- cplxValue2 = _mm256_cvtepi32_ps(int2); ++- ++- cplxValue1 = _mm256_mul_ps(cplxValue1, invScalar); ++- cplxValue2 = _mm256_mul_ps(cplxValue2, invScalar); ++- ++- cplxValue1 = _mm256_mul_ps(cplxValue1, cplxValue1); // Square the values ++- cplxValue2 = _mm256_mul_ps(cplxValue2, cplxValue2); // Square the Values ++- ++- result = _mm256_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values ++- ++- result = _mm256_sqrt_ps(result); // Square root the values ++- ++- result = _mm256_mul_ps(result, vScalar); // Scale the results ++- ++- int1 = _mm256_cvtps_epi32(result); ++- int1 = _mm256_packs_epi32(int1, int1); ++- int1 = _mm256_permutevar8x32_epi32(int1, idx); //permute to compensate for shuffling in hadd and packs ++- short1 = _mm256_extracti128_si256(int1, 0); ++- _mm_storeu_si128((__m128i*)magnitudeVectorPtr,short1); ++- magnitudeVectorPtr += 8; ++- } ++- ++- number = eighthPoints * 8; ++- magnitudeVectorPtr = &magnitudeVector[number]; ++- complexVectorPtr = (const int16_t*)&complexVector[number]; ++- for(; number < num_points; number++){ ++- const float val1Real = (float)(*complexVectorPtr++) / SHRT_MAX; ++- const float val1Imag = (float)(*complexVectorPtr++) / SHRT_MAX; ++- const float val1Result = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)) * SHRT_MAX; ++- *magnitudeVectorPtr++ = (int16_t)rintf(val1Result); ++- } +++ unsigned int number = 0; +++ const unsigned int 
eighthPoints = num_points / 8; +++ +++ const int16_t* complexVectorPtr = (const int16_t*)complexVector; +++ int16_t* magnitudeVectorPtr = magnitudeVector; +++ +++ __m256 vScalar = _mm256_set1_ps(SHRT_MAX); +++ __m256 invScalar = _mm256_set1_ps(1.0f / SHRT_MAX); +++ __m256i int1, int2; +++ __m128i short1, short2; +++ __m256 cplxValue1, cplxValue2, result; +++ __m256i idx = _mm256_set_epi32(0, 0, 0, 0, 5, 1, 4, 0); +++ +++ for (; number < eighthPoints; number++) { +++ +++ int1 = _mm256_loadu_si256((__m256i*)complexVectorPtr); +++ complexVectorPtr += 16; +++ short1 = _mm256_extracti128_si256(int1, 0); +++ short2 = _mm256_extracti128_si256(int1, 1); +++ +++ int1 = _mm256_cvtepi16_epi32(short1); +++ int2 = _mm256_cvtepi16_epi32(short2); +++ cplxValue1 = _mm256_cvtepi32_ps(int1); +++ cplxValue2 = _mm256_cvtepi32_ps(int2); +++ +++ cplxValue1 = _mm256_mul_ps(cplxValue1, invScalar); +++ cplxValue2 = _mm256_mul_ps(cplxValue2, invScalar); +++ +++ cplxValue1 = _mm256_mul_ps(cplxValue1, cplxValue1); // Square the values +++ cplxValue2 = _mm256_mul_ps(cplxValue2, cplxValue2); // Square the Values +++ +++ result = _mm256_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values +++ +++ result = _mm256_sqrt_ps(result); // Square root the values +++ +++ result = _mm256_mul_ps(result, vScalar); // Scale the results +++ +++ int1 = _mm256_cvtps_epi32(result); +++ int1 = _mm256_packs_epi32(int1, int1); +++ int1 = _mm256_permutevar8x32_epi32( +++ int1, idx); // permute to compensate for shuffling in hadd and packs +++ short1 = _mm256_extracti128_si256(int1, 0); +++ _mm_storeu_si128((__m128i*)magnitudeVectorPtr, short1); +++ magnitudeVectorPtr += 8; +++ } +++ +++ number = eighthPoints * 8; +++ magnitudeVectorPtr = &magnitudeVector[number]; +++ complexVectorPtr = (const int16_t*)&complexVector[number]; +++ for (; number < num_points; number++) { +++ const float val1Real = (float)(*complexVectorPtr++) / SHRT_MAX; +++ const float val1Imag = (float)(*complexVectorPtr++) / SHRT_MAX; +++ const float val1Result = +++ sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)) * SHRT_MAX; +++ *magnitudeVectorPtr++ = (int16_t)rintf(val1Result); +++ } ++ } ++ #endif /* LV_HAVE_AVX2 */ ++ ++@@ -372,24 +388,25 @@ volk_16ic_magnitude_16i_u_avx2(int16_t* magnitudeVector, const lv_16sc_t* comple ++ #include ++ #include ++ ++-static inline void ++-volk_16ic_magnitude_16i_neonv7(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points) +++static inline void volk_16ic_magnitude_16i_neonv7(int16_t* magnitudeVector, +++ const lv_16sc_t* complexVector, +++ unsigned int num_points) ++ { ++ unsigned int number = 0; ++ unsigned int quarter_points = num_points / 4; ++- +++ ++ const float scalar = SHRT_MAX; ++ const float inv_scalar = 1.0f / scalar; ++- +++ ++ int16_t* magnitudeVectorPtr = magnitudeVector; ++ const lv_16sc_t* complexVectorPtr = complexVector; ++- +++ ++ float32x4_t mag_vec; ++ float32x4x2_t c_vec; ++- ++- for(number = 0; number < quarter_points; number++) { +++ +++ for (number = 0; number < quarter_points; number++) { ++ const int16x4x2_t c16_vec = vld2_s16((int16_t*)complexVectorPtr); ++- __VOLK_PREFETCH(complexVectorPtr+4); +++ __VOLK_PREFETCH(complexVectorPtr + 4); ++ c_vec.val[0] = vcvtq_f32_s32(vmovl_s16(c16_vec.val[0])); ++ c_vec.val[1] = vcvtq_f32_s32(vmovl_s16(c16_vec.val[1])); ++ // Scale to close to 0-1 ++@@ -406,15 +423,16 @@ volk_16ic_magnitude_16i_neonv7(int16_t* magnitudeVector, const lv_16sc_t* comple ++ const int16x4_t mag16_vec = vmovn_s32(vcvtq_s32_f32(mag_vec)); ++ 
vst1_s16(magnitudeVectorPtr, mag16_vec); ++ // Advance pointers ++- magnitudeVectorPtr+=4; ++- complexVectorPtr+=4; +++ magnitudeVectorPtr += 4; +++ complexVectorPtr += 4; ++ } ++- +++ ++ // Deal with the rest ++- for(number = quarter_points * 4; number < num_points; number++) { +++ for (number = quarter_points * 4; number < num_points; number++) { ++ const float real = lv_creal(*complexVectorPtr) * inv_scalar; ++ const float imag = lv_cimag(*complexVectorPtr) * inv_scalar; ++- *magnitudeVectorPtr = (int16_t)rintf(sqrtf((real*real) + (imag*imag)) * scalar); +++ *magnitudeVectorPtr = +++ (int16_t)rintf(sqrtf((real * real) + (imag * imag)) * scalar); ++ complexVectorPtr++; ++ magnitudeVectorPtr++; ++ } ++diff --git a/kernels/volk/volk_16ic_s32f_deinterleave_32f_x2.h b/kernels/volk/volk_16ic_s32f_deinterleave_32f_x2.h ++index 50d9341..7425ec6 100644 ++--- a/kernels/volk/volk_16ic_s32f_deinterleave_32f_x2.h +++++ b/kernels/volk/volk_16ic_s32f_deinterleave_32f_x2.h ++@@ -30,8 +30,8 @@ ++ * ++ * Dispatcher Prototype ++ * \code ++- * void volk_16ic_s32f_deinterleave_32f_x2(float* iBuffer, float* qBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ ++- * \endcode +++ * void volk_16ic_s32f_deinterleave_32f_x2(float* iBuffer, float* qBuffer, const +++ * lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ \endcode ++ * ++ * \b Inputs ++ * \li complexVector: The complex input vector of 16-bit shorts. ++@@ -56,197 +56,214 @@ ++ #ifndef INCLUDED_volk_16ic_s32f_deinterleave_32f_x2_a_H ++ #define INCLUDED_volk_16ic_s32f_deinterleave_32f_x2_a_H ++ ++-#include ++ #include ++ #include +++#include ++ ++ #ifdef LV_HAVE_AVX2 ++ #include ++ ++-static inline ++-void volk_16ic_s32f_deinterleave_32f_x2_a_avx2(float* iBuffer, float* qBuffer, const lv_16sc_t* complexVector, ++- const float scalar, unsigned int num_points) +++static inline void +++volk_16ic_s32f_deinterleave_32f_x2_a_avx2(float* iBuffer, +++ float* qBuffer, +++ const lv_16sc_t* complexVector, +++ const float scalar, +++ unsigned int num_points) ++ { ++- float* iBufferPtr = iBuffer; ++- float* qBufferPtr = qBuffer; ++- ++- uint64_t number = 0; ++- const uint64_t eighthPoints = num_points / 8; ++- __m256 cplxValue1, cplxValue2, iValue, qValue; ++- __m256i cplxValueA, cplxValueB; ++- __m128i cplxValue128; ++- ++- __m256 invScalar = _mm256_set1_ps(1.0/scalar); ++- int16_t* complexVectorPtr = (int16_t*)complexVector; ++- __m256i idx = _mm256_set_epi32(7,6,3,2,5,4,1,0); ++- ++- for(;number < eighthPoints; number++){ ++- ++- cplxValueA = _mm256_load_si256((__m256i*) complexVectorPtr); ++- complexVectorPtr += 16; ++- ++- //cvt ++- cplxValue128 = _mm256_extracti128_si256(cplxValueA, 0); ++- cplxValueB = _mm256_cvtepi16_epi32(cplxValue128); ++- cplxValue1 = _mm256_cvtepi32_ps(cplxValueB); ++- cplxValue128 = _mm256_extracti128_si256(cplxValueA, 1); ++- cplxValueB = _mm256_cvtepi16_epi32(cplxValue128); ++- cplxValue2 = _mm256_cvtepi32_ps(cplxValueB); ++- ++- cplxValue1 = _mm256_mul_ps(cplxValue1, invScalar); ++- cplxValue2 = _mm256_mul_ps(cplxValue2, invScalar); ++- ++- // Arrange in i1i2i3i4 format ++- iValue = _mm256_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); ++- iValue = _mm256_permutevar8x32_ps(iValue,idx); ++- // Arrange in q1q2q3q4 format ++- qValue = _mm256_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1)); ++- qValue = _mm256_permutevar8x32_ps(qValue,idx); ++- ++- _mm256_store_ps(iBufferPtr, iValue); ++- _mm256_store_ps(qBufferPtr, qValue); ++- ++- iBufferPtr += 8; ++- qBufferPtr 
+= 8; ++- } ++- ++- number = eighthPoints * 8; ++- complexVectorPtr = (int16_t*)&complexVector[number]; ++- for(; number < num_points; number++){ ++- *iBufferPtr++ = (float)(*complexVectorPtr++) / scalar; ++- *qBufferPtr++ = (float)(*complexVectorPtr++) / scalar; ++- } +++ float* iBufferPtr = iBuffer; +++ float* qBufferPtr = qBuffer; +++ +++ uint64_t number = 0; +++ const uint64_t eighthPoints = num_points / 8; +++ __m256 cplxValue1, cplxValue2, iValue, qValue; +++ __m256i cplxValueA, cplxValueB; +++ __m128i cplxValue128; +++ +++ __m256 invScalar = _mm256_set1_ps(1.0 / scalar); +++ int16_t* complexVectorPtr = (int16_t*)complexVector; +++ __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0); +++ +++ for (; number < eighthPoints; number++) { +++ +++ cplxValueA = _mm256_load_si256((__m256i*)complexVectorPtr); +++ complexVectorPtr += 16; +++ +++ // cvt +++ cplxValue128 = _mm256_extracti128_si256(cplxValueA, 0); +++ cplxValueB = _mm256_cvtepi16_epi32(cplxValue128); +++ cplxValue1 = _mm256_cvtepi32_ps(cplxValueB); +++ cplxValue128 = _mm256_extracti128_si256(cplxValueA, 1); +++ cplxValueB = _mm256_cvtepi16_epi32(cplxValue128); +++ cplxValue2 = _mm256_cvtepi32_ps(cplxValueB); +++ +++ cplxValue1 = _mm256_mul_ps(cplxValue1, invScalar); +++ cplxValue2 = _mm256_mul_ps(cplxValue2, invScalar); +++ +++ // Arrange in i1i2i3i4 format +++ iValue = _mm256_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2, 0, 2, 0)); +++ iValue = _mm256_permutevar8x32_ps(iValue, idx); +++ // Arrange in q1q2q3q4 format +++ qValue = _mm256_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3, 1, 3, 1)); +++ qValue = _mm256_permutevar8x32_ps(qValue, idx); +++ +++ _mm256_store_ps(iBufferPtr, iValue); +++ _mm256_store_ps(qBufferPtr, qValue); +++ +++ iBufferPtr += 8; +++ qBufferPtr += 8; +++ } +++ +++ number = eighthPoints * 8; +++ complexVectorPtr = (int16_t*)&complexVector[number]; +++ for (; number < num_points; number++) { +++ *iBufferPtr++ = (float)(*complexVectorPtr++) / scalar; +++ *qBufferPtr++ = (float)(*complexVectorPtr++) / scalar; +++ } ++ } ++ #endif /* LV_HAVE_AVX2 */ ++ ++ #ifdef LV_HAVE_SSE ++ #include ++ ++-static inline ++-void volk_16ic_s32f_deinterleave_32f_x2_a_sse(float* iBuffer, float* qBuffer, const lv_16sc_t* complexVector, ++- const float scalar, unsigned int num_points) +++static inline void +++volk_16ic_s32f_deinterleave_32f_x2_a_sse(float* iBuffer, +++ float* qBuffer, +++ const lv_16sc_t* complexVector, +++ const float scalar, +++ unsigned int num_points) ++ { ++- float* iBufferPtr = iBuffer; ++- float* qBufferPtr = qBuffer; +++ float* iBufferPtr = iBuffer; +++ float* qBufferPtr = qBuffer; ++ ++- uint64_t number = 0; ++- const uint64_t quarterPoints = num_points / 4; ++- __m128 cplxValue1, cplxValue2, iValue, qValue; +++ uint64_t number = 0; +++ const uint64_t quarterPoints = num_points / 4; +++ __m128 cplxValue1, cplxValue2, iValue, qValue; ++ ++- __m128 invScalar = _mm_set_ps1(1.0/scalar); ++- int16_t* complexVectorPtr = (int16_t*)complexVector; +++ __m128 invScalar = _mm_set_ps1(1.0 / scalar); +++ int16_t* complexVectorPtr = (int16_t*)complexVector; ++ ++- __VOLK_ATTR_ALIGNED(16) float floatBuffer[8]; +++ __VOLK_ATTR_ALIGNED(16) float floatBuffer[8]; ++ ++- for(;number < quarterPoints; number++){ +++ for (; number < quarterPoints; number++) { ++ ++- floatBuffer[0] = (float)(complexVectorPtr[0]); ++- floatBuffer[1] = (float)(complexVectorPtr[1]); ++- floatBuffer[2] = (float)(complexVectorPtr[2]); ++- floatBuffer[3] = (float)(complexVectorPtr[3]); +++ floatBuffer[0] = (float)(complexVectorPtr[0]); +++ 
floatBuffer[1] = (float)(complexVectorPtr[1]); +++ floatBuffer[2] = (float)(complexVectorPtr[2]); +++ floatBuffer[3] = (float)(complexVectorPtr[3]); ++ ++- floatBuffer[4] = (float)(complexVectorPtr[4]); ++- floatBuffer[5] = (float)(complexVectorPtr[5]); ++- floatBuffer[6] = (float)(complexVectorPtr[6]); ++- floatBuffer[7] = (float)(complexVectorPtr[7]); +++ floatBuffer[4] = (float)(complexVectorPtr[4]); +++ floatBuffer[5] = (float)(complexVectorPtr[5]); +++ floatBuffer[6] = (float)(complexVectorPtr[6]); +++ floatBuffer[7] = (float)(complexVectorPtr[7]); ++ ++- cplxValue1 = _mm_load_ps(&floatBuffer[0]); ++- cplxValue2 = _mm_load_ps(&floatBuffer[4]); +++ cplxValue1 = _mm_load_ps(&floatBuffer[0]); +++ cplxValue2 = _mm_load_ps(&floatBuffer[4]); ++ ++- complexVectorPtr += 8; +++ complexVectorPtr += 8; ++ ++- cplxValue1 = _mm_mul_ps(cplxValue1, invScalar); ++- cplxValue2 = _mm_mul_ps(cplxValue2, invScalar); +++ cplxValue1 = _mm_mul_ps(cplxValue1, invScalar); +++ cplxValue2 = _mm_mul_ps(cplxValue2, invScalar); ++ ++- // Arrange in i1i2i3i4 format ++- iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); ++- // Arrange in q1q2q3q4 format ++- qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1)); +++ // Arrange in i1i2i3i4 format +++ iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2, 0, 2, 0)); +++ // Arrange in q1q2q3q4 format +++ qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3, 1, 3, 1)); ++ ++- _mm_store_ps(iBufferPtr, iValue); ++- _mm_store_ps(qBufferPtr, qValue); +++ _mm_store_ps(iBufferPtr, iValue); +++ _mm_store_ps(qBufferPtr, qValue); ++ ++- iBufferPtr += 4; ++- qBufferPtr += 4; ++- } +++ iBufferPtr += 4; +++ qBufferPtr += 4; +++ } ++ ++- number = quarterPoints * 4; ++- complexVectorPtr = (int16_t*)&complexVector[number]; ++- for(; number < num_points; number++){ ++- *iBufferPtr++ = (float)(*complexVectorPtr++) / scalar; ++- *qBufferPtr++ = (float)(*complexVectorPtr++) / scalar; ++- } +++ number = quarterPoints * 4; +++ complexVectorPtr = (int16_t*)&complexVector[number]; +++ for (; number < num_points; number++) { +++ *iBufferPtr++ = (float)(*complexVectorPtr++) / scalar; +++ *qBufferPtr++ = (float)(*complexVectorPtr++) / scalar; +++ } ++ } ++ #endif /* LV_HAVE_SSE */ ++ ++ #ifdef LV_HAVE_GENERIC ++ ++ static inline void ++-volk_16ic_s32f_deinterleave_32f_x2_generic(float* iBuffer, float* qBuffer, const lv_16sc_t* complexVector, ++- const float scalar, unsigned int num_points) +++volk_16ic_s32f_deinterleave_32f_x2_generic(float* iBuffer, +++ float* qBuffer, +++ const lv_16sc_t* complexVector, +++ const float scalar, +++ unsigned int num_points) ++ { ++- const int16_t* complexVectorPtr = (const int16_t*)complexVector; ++- float* iBufferPtr = iBuffer; ++- float* qBufferPtr = qBuffer; ++- unsigned int number; ++- for(number = 0; number < num_points; number++){ ++- *iBufferPtr++ = (float)(*complexVectorPtr++) / scalar; ++- *qBufferPtr++ = (float)(*complexVectorPtr++) / scalar; ++- } +++ const int16_t* complexVectorPtr = (const int16_t*)complexVector; +++ float* iBufferPtr = iBuffer; +++ float* qBufferPtr = qBuffer; +++ unsigned int number; +++ for (number = 0; number < num_points; number++) { +++ *iBufferPtr++ = (float)(*complexVectorPtr++) / scalar; +++ *qBufferPtr++ = (float)(*complexVectorPtr++) / scalar; +++ } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++ #ifdef LV_HAVE_NEON ++ #include ++-static inline void ++-volk_16ic_s32f_deinterleave_32f_x2_neon(float* iBuffer, float* qBuffer, const lv_16sc_t* complexVector, ++- const float 
scalar, unsigned int num_points) +++static inline void volk_16ic_s32f_deinterleave_32f_x2_neon(float* iBuffer, +++ float* qBuffer, +++ const lv_16sc_t* complexVector, +++ const float scalar, +++ unsigned int num_points) ++ { ++- const int16_t* complexVectorPtr = (const int16_t*)complexVector; ++- float* iBufferPtr = iBuffer; ++- float* qBufferPtr = qBuffer; ++- unsigned int eighth_points = num_points / 4; ++- unsigned int number; ++- float iScalar = 1.f/scalar; ++- float32x4_t invScalar; ++- invScalar = vld1q_dup_f32(&iScalar); ++- ++- int16x4x2_t complexInput_s16; ++- int32x4x2_t complexInput_s32; ++- float32x4x2_t complexFloat; ++- ++- for(number = 0; number < eighth_points; number++){ ++- complexInput_s16 = vld2_s16(complexVectorPtr); ++- complexInput_s32.val[0] = vmovl_s16(complexInput_s16.val[0]); ++- complexInput_s32.val[1] = vmovl_s16(complexInput_s16.val[1]); ++- complexFloat.val[0] = vcvtq_f32_s32(complexInput_s32.val[0]); ++- complexFloat.val[1] = vcvtq_f32_s32(complexInput_s32.val[1]); ++- complexFloat.val[0] = vmulq_f32(complexFloat.val[0], invScalar); ++- complexFloat.val[1] = vmulq_f32(complexFloat.val[1], invScalar); ++- vst1q_f32(iBufferPtr, complexFloat.val[0]); ++- vst1q_f32(qBufferPtr, complexFloat.val[1]); ++- complexVectorPtr += 8; ++- iBufferPtr += 4; ++- qBufferPtr += 4; ++- } ++- ++- for(number = eighth_points*4; number < num_points; number++){ ++- *iBufferPtr++ = (float)(*complexVectorPtr++) / scalar; ++- *qBufferPtr++ = (float)(*complexVectorPtr++) / scalar; ++- } +++ const int16_t* complexVectorPtr = (const int16_t*)complexVector; +++ float* iBufferPtr = iBuffer; +++ float* qBufferPtr = qBuffer; +++ unsigned int eighth_points = num_points / 4; +++ unsigned int number; +++ float iScalar = 1.f / scalar; +++ float32x4_t invScalar; +++ invScalar = vld1q_dup_f32(&iScalar); +++ +++ int16x4x2_t complexInput_s16; +++ int32x4x2_t complexInput_s32; +++ float32x4x2_t complexFloat; +++ +++ for (number = 0; number < eighth_points; number++) { +++ complexInput_s16 = vld2_s16(complexVectorPtr); +++ complexInput_s32.val[0] = vmovl_s16(complexInput_s16.val[0]); +++ complexInput_s32.val[1] = vmovl_s16(complexInput_s16.val[1]); +++ complexFloat.val[0] = vcvtq_f32_s32(complexInput_s32.val[0]); +++ complexFloat.val[1] = vcvtq_f32_s32(complexInput_s32.val[1]); +++ complexFloat.val[0] = vmulq_f32(complexFloat.val[0], invScalar); +++ complexFloat.val[1] = vmulq_f32(complexFloat.val[1], invScalar); +++ vst1q_f32(iBufferPtr, complexFloat.val[0]); +++ vst1q_f32(qBufferPtr, complexFloat.val[1]); +++ complexVectorPtr += 8; +++ iBufferPtr += 4; +++ qBufferPtr += 4; +++ } +++ +++ for (number = eighth_points * 4; number < num_points; number++) { +++ *iBufferPtr++ = (float)(*complexVectorPtr++) / scalar; +++ *qBufferPtr++ = (float)(*complexVectorPtr++) / scalar; +++ } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++ #ifdef LV_HAVE_ORC ++-extern void ++-volk_16ic_s32f_deinterleave_32f_x2_a_orc_impl(float* iBuffer, float* qBuffer, const lv_16sc_t* complexVector, ++- const float scalar, unsigned int num_points); +++extern void volk_16ic_s32f_deinterleave_32f_x2_a_orc_impl(float* iBuffer, +++ float* qBuffer, +++ const lv_16sc_t* complexVector, +++ const float scalar, +++ unsigned int num_points); ++ ++ static inline void ++-volk_16ic_s32f_deinterleave_32f_x2_u_orc(float* iBuffer, float* qBuffer, const lv_16sc_t* complexVector, ++- const float scalar, unsigned int num_points) +++volk_16ic_s32f_deinterleave_32f_x2_u_orc(float* iBuffer, +++ float* qBuffer, +++ const lv_16sc_t* complexVector, +++ const float 
scalar, +++ unsigned int num_points) ++ { ++- volk_16ic_s32f_deinterleave_32f_x2_a_orc_impl(iBuffer, qBuffer, complexVector, scalar, num_points); +++ volk_16ic_s32f_deinterleave_32f_x2_a_orc_impl( +++ iBuffer, qBuffer, complexVector, scalar, num_points); ++ } ++ #endif /* LV_HAVE_ORC */ ++ ++@@ -257,66 +274,69 @@ volk_16ic_s32f_deinterleave_32f_x2_u_orc(float* iBuffer, float* qBuffer, const l ++ #ifndef INCLUDED_volk_16ic_s32f_deinterleave_32f_x2_u_H ++ #define INCLUDED_volk_16ic_s32f_deinterleave_32f_x2_u_H ++ ++-#include ++ #include ++ #include +++#include ++ ++ #ifdef LV_HAVE_AVX2 ++ #include ++ ++-static inline ++-void volk_16ic_s32f_deinterleave_32f_x2_u_avx2(float* iBuffer, float* qBuffer, const lv_16sc_t* complexVector, ++- const float scalar, unsigned int num_points) +++static inline void +++volk_16ic_s32f_deinterleave_32f_x2_u_avx2(float* iBuffer, +++ float* qBuffer, +++ const lv_16sc_t* complexVector, +++ const float scalar, +++ unsigned int num_points) ++ { ++- float* iBufferPtr = iBuffer; ++- float* qBufferPtr = qBuffer; ++- ++- uint64_t number = 0; ++- const uint64_t eighthPoints = num_points / 8; ++- __m256 cplxValue1, cplxValue2, iValue, qValue; ++- __m256i cplxValueA, cplxValueB; ++- __m128i cplxValue128; ++- ++- __m256 invScalar = _mm256_set1_ps(1.0/scalar); ++- int16_t* complexVectorPtr = (int16_t*)complexVector; ++- __m256i idx = _mm256_set_epi32(7,6,3,2,5,4,1,0); ++- ++- for(;number < eighthPoints; number++){ ++- ++- cplxValueA = _mm256_loadu_si256((__m256i*) complexVectorPtr); ++- complexVectorPtr += 16; ++- ++- //cvt ++- cplxValue128 = _mm256_extracti128_si256(cplxValueA, 0); ++- cplxValueB = _mm256_cvtepi16_epi32(cplxValue128); ++- cplxValue1 = _mm256_cvtepi32_ps(cplxValueB); ++- cplxValue128 = _mm256_extracti128_si256(cplxValueA, 1); ++- cplxValueB = _mm256_cvtepi16_epi32(cplxValue128); ++- cplxValue2 = _mm256_cvtepi32_ps(cplxValueB); ++- ++- cplxValue1 = _mm256_mul_ps(cplxValue1, invScalar); ++- cplxValue2 = _mm256_mul_ps(cplxValue2, invScalar); ++- ++- // Arrange in i1i2i3i4 format ++- iValue = _mm256_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); ++- iValue = _mm256_permutevar8x32_ps(iValue,idx); ++- // Arrange in q1q2q3q4 format ++- qValue = _mm256_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1)); ++- qValue = _mm256_permutevar8x32_ps(qValue,idx); ++- ++- _mm256_storeu_ps(iBufferPtr, iValue); ++- _mm256_storeu_ps(qBufferPtr, qValue); ++- ++- iBufferPtr += 8; ++- qBufferPtr += 8; ++- } ++- ++- number = eighthPoints * 8; ++- complexVectorPtr = (int16_t*)&complexVector[number]; ++- for(; number < num_points; number++){ ++- *iBufferPtr++ = (float)(*complexVectorPtr++) / scalar; ++- *qBufferPtr++ = (float)(*complexVectorPtr++) / scalar; ++- } +++ float* iBufferPtr = iBuffer; +++ float* qBufferPtr = qBuffer; +++ +++ uint64_t number = 0; +++ const uint64_t eighthPoints = num_points / 8; +++ __m256 cplxValue1, cplxValue2, iValue, qValue; +++ __m256i cplxValueA, cplxValueB; +++ __m128i cplxValue128; +++ +++ __m256 invScalar = _mm256_set1_ps(1.0 / scalar); +++ int16_t* complexVectorPtr = (int16_t*)complexVector; +++ __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0); +++ +++ for (; number < eighthPoints; number++) { +++ +++ cplxValueA = _mm256_loadu_si256((__m256i*)complexVectorPtr); +++ complexVectorPtr += 16; +++ +++ // cvt +++ cplxValue128 = _mm256_extracti128_si256(cplxValueA, 0); +++ cplxValueB = _mm256_cvtepi16_epi32(cplxValue128); +++ cplxValue1 = _mm256_cvtepi32_ps(cplxValueB); +++ cplxValue128 = _mm256_extracti128_si256(cplxValueA, 1); +++ 
cplxValueB = _mm256_cvtepi16_epi32(cplxValue128); +++ cplxValue2 = _mm256_cvtepi32_ps(cplxValueB); +++ +++ cplxValue1 = _mm256_mul_ps(cplxValue1, invScalar); +++ cplxValue2 = _mm256_mul_ps(cplxValue2, invScalar); +++ +++ // Arrange in i1i2i3i4 format +++ iValue = _mm256_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2, 0, 2, 0)); +++ iValue = _mm256_permutevar8x32_ps(iValue, idx); +++ // Arrange in q1q2q3q4 format +++ qValue = _mm256_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3, 1, 3, 1)); +++ qValue = _mm256_permutevar8x32_ps(qValue, idx); +++ +++ _mm256_storeu_ps(iBufferPtr, iValue); +++ _mm256_storeu_ps(qBufferPtr, qValue); +++ +++ iBufferPtr += 8; +++ qBufferPtr += 8; +++ } +++ +++ number = eighthPoints * 8; +++ complexVectorPtr = (int16_t*)&complexVector[number]; +++ for (; number < num_points; number++) { +++ *iBufferPtr++ = (float)(*complexVectorPtr++) / scalar; +++ *qBufferPtr++ = (float)(*complexVectorPtr++) / scalar; +++ } ++ } ++ #endif /* LV_HAVE_AVX2 */ ++ ++diff --git a/kernels/volk/volk_16ic_s32f_deinterleave_real_32f.h b/kernels/volk/volk_16ic_s32f_deinterleave_real_32f.h ++index 713e6a1..8b72d1c 100644 ++--- a/kernels/volk/volk_16ic_s32f_deinterleave_real_32f.h +++++ b/kernels/volk/volk_16ic_s32f_deinterleave_real_32f.h ++@@ -31,8 +31,8 @@ ++ * ++ * Dispatcher Prototype ++ * \code ++- * void volk_16ic_s32f_deinterleave_real_32f(float* iBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ ++- * \endcode +++ * void volk_16ic_s32f_deinterleave_real_32f(float* iBuffer, const lv_16sc_t* +++ * complexVector, const float scalar, unsigned int num_points){ \endcode ++ * ++ * \b Inputs ++ * \li complexVector: The complex input vector of 16-bit shorts. ++@@ -56,55 +56,88 @@ ++ #ifndef INCLUDED_volk_16ic_s32f_deinterleave_real_32f_a_H ++ #define INCLUDED_volk_16ic_s32f_deinterleave_real_32f_a_H ++ ++-#include ++ #include ++ #include +++#include ++ ++ #ifdef LV_HAVE_AVX2 ++ #include ++ ++ static inline void ++-volk_16ic_s32f_deinterleave_real_32f_a_avx2(float* iBuffer, const lv_16sc_t* complexVector, ++- const float scalar, unsigned int num_points) +++volk_16ic_s32f_deinterleave_real_32f_a_avx2(float* iBuffer, +++ const lv_16sc_t* complexVector, +++ const float scalar, +++ unsigned int num_points) ++ { ++- float* iBufferPtr = iBuffer; ++- ++- unsigned int number = 0; ++- const unsigned int eighthPoints = num_points / 8; ++- ++- __m256 iFloatValue; ++- ++- const float iScalar= 1.0 / scalar; ++- __m256 invScalar = _mm256_set1_ps(iScalar); ++- __m256i complexVal, iIntVal; ++- __m128i complexVal128; ++- int8_t* complexVectorPtr = (int8_t*)complexVector; ++- ++- __m256i moveMask = _mm256_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0); ++- ++- for(;number < eighthPoints; number++){ ++- complexVal = _mm256_load_si256((__m256i*)complexVectorPtr); complexVectorPtr += 32; ++- complexVal = _mm256_shuffle_epi8(complexVal, moveMask); ++- complexVal = _mm256_permute4x64_epi64(complexVal, 0xd8); ++- complexVal128 = _mm256_extracti128_si256(complexVal, 0); ++- ++- iIntVal = _mm256_cvtepi16_epi32(complexVal128); ++- iFloatValue = _mm256_cvtepi32_ps(iIntVal); ++- ++- iFloatValue = _mm256_mul_ps(iFloatValue, invScalar); ++- ++- _mm256_store_ps(iBufferPtr, iFloatValue); ++- ++- iBufferPtr += 8; ++- } ++- ++- number = eighthPoints * 8; ++- int16_t* sixteenTComplexVectorPtr = (int16_t*)&complexVector[number]; ++- for(; number < num_points; number++){ ++- 
*iBufferPtr++ = ((float)(*sixteenTComplexVectorPtr++)) * iScalar; ++- sixteenTComplexVectorPtr++; ++- } ++- +++ float* iBufferPtr = iBuffer; +++ +++ unsigned int number = 0; +++ const unsigned int eighthPoints = num_points / 8; +++ +++ __m256 iFloatValue; +++ +++ const float iScalar = 1.0 / scalar; +++ __m256 invScalar = _mm256_set1_ps(iScalar); +++ __m256i complexVal, iIntVal; +++ __m128i complexVal128; +++ int8_t* complexVectorPtr = (int8_t*)complexVector; +++ +++ __m256i moveMask = _mm256_set_epi8(0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 13, +++ 12, +++ 9, +++ 8, +++ 5, +++ 4, +++ 1, +++ 0, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 13, +++ 12, +++ 9, +++ 8, +++ 5, +++ 4, +++ 1, +++ 0); +++ +++ for (; number < eighthPoints; number++) { +++ complexVal = _mm256_load_si256((__m256i*)complexVectorPtr); +++ complexVectorPtr += 32; +++ complexVal = _mm256_shuffle_epi8(complexVal, moveMask); +++ complexVal = _mm256_permute4x64_epi64(complexVal, 0xd8); +++ complexVal128 = _mm256_extracti128_si256(complexVal, 0); +++ +++ iIntVal = _mm256_cvtepi16_epi32(complexVal128); +++ iFloatValue = _mm256_cvtepi32_ps(iIntVal); +++ +++ iFloatValue = _mm256_mul_ps(iFloatValue, invScalar); +++ +++ _mm256_store_ps(iBufferPtr, iFloatValue); +++ +++ iBufferPtr += 8; +++ } +++ +++ number = eighthPoints * 8; +++ int16_t* sixteenTComplexVectorPtr = (int16_t*)&complexVector[number]; +++ for (; number < num_points; number++) { +++ *iBufferPtr++ = ((float)(*sixteenTComplexVectorPtr++)) * iScalar; +++ sixteenTComplexVectorPtr++; +++ } ++ } ++ #endif /* LV_HAVE_AVX2 */ ++ ++@@ -112,44 +145,47 @@ volk_16ic_s32f_deinterleave_real_32f_a_avx2(float* iBuffer, const lv_16sc_t* com ++ #include ++ ++ static inline void ++-volk_16ic_s32f_deinterleave_real_32f_a_sse4_1(float* iBuffer, const lv_16sc_t* complexVector, ++- const float scalar, unsigned int num_points) +++volk_16ic_s32f_deinterleave_real_32f_a_sse4_1(float* iBuffer, +++ const lv_16sc_t* complexVector, +++ const float scalar, +++ unsigned int num_points) ++ { ++- float* iBufferPtr = iBuffer; ++- ++- unsigned int number = 0; ++- const unsigned int quarterPoints = num_points / 4; +++ float* iBufferPtr = iBuffer; ++ ++- __m128 iFloatValue; +++ unsigned int number = 0; +++ const unsigned int quarterPoints = num_points / 4; ++ ++- const float iScalar= 1.0 / scalar; ++- __m128 invScalar = _mm_set_ps1(iScalar); ++- __m128i complexVal, iIntVal; ++- int8_t* complexVectorPtr = (int8_t*)complexVector; +++ __m128 iFloatValue; ++ ++- __m128i moveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0); +++ const float iScalar = 1.0 / scalar; +++ __m128 invScalar = _mm_set_ps1(iScalar); +++ __m128i complexVal, iIntVal; +++ int8_t* complexVectorPtr = (int8_t*)complexVector; ++ ++- for(;number < quarterPoints; number++){ ++- complexVal = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; ++- complexVal = _mm_shuffle_epi8(complexVal, moveMask); +++ __m128i moveMask = _mm_set_epi8( +++ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0); ++ ++- iIntVal = _mm_cvtepi16_epi32(complexVal); ++- iFloatValue = _mm_cvtepi32_ps(iIntVal); +++ for (; number < quarterPoints; number++) { +++ complexVal = _mm_load_si128((__m128i*)complexVectorPtr); +++ complexVectorPtr += 16; +++ complexVal = _mm_shuffle_epi8(complexVal, moveMask); ++ ++- iFloatValue = _mm_mul_ps(iFloatValue, invScalar); +++ iIntVal = _mm_cvtepi16_epi32(complexVal); +++ iFloatValue 
= _mm_cvtepi32_ps(iIntVal); ++ ++- _mm_store_ps(iBufferPtr, iFloatValue); +++ iFloatValue = _mm_mul_ps(iFloatValue, invScalar); ++ ++- iBufferPtr += 4; ++- } +++ _mm_store_ps(iBufferPtr, iFloatValue); ++ ++- number = quarterPoints * 4; ++- int16_t* sixteenTComplexVectorPtr = (int16_t*)&complexVector[number]; ++- for(; number < num_points; number++){ ++- *iBufferPtr++ = ((float)(*sixteenTComplexVectorPtr++)) * iScalar; ++- sixteenTComplexVectorPtr++; ++- } +++ iBufferPtr += 4; +++ } ++ +++ number = quarterPoints * 4; +++ int16_t* sixteenTComplexVectorPtr = (int16_t*)&complexVector[number]; +++ for (; number < num_points; number++) { +++ *iBufferPtr++ = ((float)(*sixteenTComplexVectorPtr++)) * iScalar; +++ sixteenTComplexVectorPtr++; +++ } ++ } ++ #endif /* LV_HAVE_SSE4_1 */ ++ ++@@ -157,59 +193,66 @@ volk_16ic_s32f_deinterleave_real_32f_a_sse4_1(float* iBuffer, const lv_16sc_t* c ++ #include ++ ++ static inline void ++-volk_16ic_s32f_deinterleave_real_32f_a_sse(float* iBuffer, const lv_16sc_t* complexVector, ++- const float scalar, unsigned int num_points) +++volk_16ic_s32f_deinterleave_real_32f_a_sse(float* iBuffer, +++ const lv_16sc_t* complexVector, +++ const float scalar, +++ unsigned int num_points) ++ { ++- float* iBufferPtr = iBuffer; +++ float* iBufferPtr = iBuffer; ++ ++- unsigned int number = 0; ++- const unsigned int quarterPoints = num_points / 4; ++- __m128 iValue; +++ unsigned int number = 0; +++ const unsigned int quarterPoints = num_points / 4; +++ __m128 iValue; ++ ++- const float iScalar = 1.0/scalar; ++- __m128 invScalar = _mm_set_ps1(iScalar); ++- int16_t* complexVectorPtr = (int16_t*)complexVector; +++ const float iScalar = 1.0 / scalar; +++ __m128 invScalar = _mm_set_ps1(iScalar); +++ int16_t* complexVectorPtr = (int16_t*)complexVector; ++ ++- __VOLK_ATTR_ALIGNED(16) float floatBuffer[4]; +++ __VOLK_ATTR_ALIGNED(16) float floatBuffer[4]; ++ ++- for(;number < quarterPoints; number++){ ++- floatBuffer[0] = (float)(*complexVectorPtr); complexVectorPtr += 2; ++- floatBuffer[1] = (float)(*complexVectorPtr); complexVectorPtr += 2; ++- floatBuffer[2] = (float)(*complexVectorPtr); complexVectorPtr += 2; ++- floatBuffer[3] = (float)(*complexVectorPtr); complexVectorPtr += 2; +++ for (; number < quarterPoints; number++) { +++ floatBuffer[0] = (float)(*complexVectorPtr); +++ complexVectorPtr += 2; +++ floatBuffer[1] = (float)(*complexVectorPtr); +++ complexVectorPtr += 2; +++ floatBuffer[2] = (float)(*complexVectorPtr); +++ complexVectorPtr += 2; +++ floatBuffer[3] = (float)(*complexVectorPtr); +++ complexVectorPtr += 2; ++ ++- iValue = _mm_load_ps(floatBuffer); +++ iValue = _mm_load_ps(floatBuffer); ++ ++- iValue = _mm_mul_ps(iValue, invScalar); +++ iValue = _mm_mul_ps(iValue, invScalar); ++ ++- _mm_store_ps(iBufferPtr, iValue); +++ _mm_store_ps(iBufferPtr, iValue); ++ ++- iBufferPtr += 4; ++- } ++- ++- number = quarterPoints * 4; ++- complexVectorPtr = (int16_t*)&complexVector[number]; ++- for(; number < num_points; number++){ ++- *iBufferPtr++ = ((float)(*complexVectorPtr++)) * iScalar; ++- complexVectorPtr++; ++- } +++ iBufferPtr += 4; +++ } ++ +++ number = quarterPoints * 4; +++ complexVectorPtr = (int16_t*)&complexVector[number]; +++ for (; number < num_points; number++) { +++ *iBufferPtr++ = ((float)(*complexVectorPtr++)) * iScalar; +++ complexVectorPtr++; +++ } ++ } ++ #endif /* LV_HAVE_SSE */ ++ ++ #ifdef LV_HAVE_GENERIC ++ static inline void ++-volk_16ic_s32f_deinterleave_real_32f_generic(float* iBuffer, const lv_16sc_t* complexVector, ++- const float scalar, unsigned 
int num_points) +++volk_16ic_s32f_deinterleave_real_32f_generic(float* iBuffer, +++ const lv_16sc_t* complexVector, +++ const float scalar, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const int16_t* complexVectorPtr = (const int16_t*)complexVector; ++- float* iBufferPtr = iBuffer; ++- const float invScalar = 1.0 / scalar; ++- for(number = 0; number < num_points; number++){ ++- *iBufferPtr++ = ((float)(*complexVectorPtr++)) * invScalar; ++- complexVectorPtr++; ++- } +++ unsigned int number = 0; +++ const int16_t* complexVectorPtr = (const int16_t*)complexVector; +++ float* iBufferPtr = iBuffer; +++ const float invScalar = 1.0 / scalar; +++ for (number = 0; number < num_points; number++) { +++ *iBufferPtr++ = ((float)(*complexVectorPtr++)) * invScalar; +++ complexVectorPtr++; +++ } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++@@ -219,55 +262,88 @@ volk_16ic_s32f_deinterleave_real_32f_generic(float* iBuffer, const lv_16sc_t* co ++ #ifndef INCLUDED_volk_16ic_s32f_deinterleave_real_32f_u_H ++ #define INCLUDED_volk_16ic_s32f_deinterleave_real_32f_u_H ++ ++-#include ++ #include ++ #include +++#include ++ ++ #ifdef LV_HAVE_AVX2 ++ #include ++ ++ static inline void ++-volk_16ic_s32f_deinterleave_real_32f_u_avx2(float* iBuffer, const lv_16sc_t* complexVector, ++- const float scalar, unsigned int num_points) +++volk_16ic_s32f_deinterleave_real_32f_u_avx2(float* iBuffer, +++ const lv_16sc_t* complexVector, +++ const float scalar, +++ unsigned int num_points) ++ { ++- float* iBufferPtr = iBuffer; ++- ++- unsigned int number = 0; ++- const unsigned int eighthPoints = num_points / 8; ++- ++- __m256 iFloatValue; ++- ++- const float iScalar= 1.0 / scalar; ++- __m256 invScalar = _mm256_set1_ps(iScalar); ++- __m256i complexVal, iIntVal; ++- __m128i complexVal128; ++- int8_t* complexVectorPtr = (int8_t*)complexVector; ++- ++- __m256i moveMask = _mm256_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0); ++- ++- for(;number < eighthPoints; number++){ ++- complexVal = _mm256_loadu_si256((__m256i*)complexVectorPtr); complexVectorPtr += 32; ++- complexVal = _mm256_shuffle_epi8(complexVal, moveMask); ++- complexVal = _mm256_permute4x64_epi64(complexVal, 0xd8); ++- complexVal128 = _mm256_extracti128_si256(complexVal, 0); ++- ++- iIntVal = _mm256_cvtepi16_epi32(complexVal128); ++- iFloatValue = _mm256_cvtepi32_ps(iIntVal); ++- ++- iFloatValue = _mm256_mul_ps(iFloatValue, invScalar); ++- ++- _mm256_storeu_ps(iBufferPtr, iFloatValue); ++- ++- iBufferPtr += 8; ++- } ++- ++- number = eighthPoints * 8; ++- int16_t* sixteenTComplexVectorPtr = (int16_t*)&complexVector[number]; ++- for(; number < num_points; number++){ ++- *iBufferPtr++ = ((float)(*sixteenTComplexVectorPtr++)) * iScalar; ++- sixteenTComplexVectorPtr++; ++- } ++- +++ float* iBufferPtr = iBuffer; +++ +++ unsigned int number = 0; +++ const unsigned int eighthPoints = num_points / 8; +++ +++ __m256 iFloatValue; +++ +++ const float iScalar = 1.0 / scalar; +++ __m256 invScalar = _mm256_set1_ps(iScalar); +++ __m256i complexVal, iIntVal; +++ __m128i complexVal128; +++ int8_t* complexVectorPtr = (int8_t*)complexVector; +++ +++ __m256i moveMask = _mm256_set_epi8(0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 13, +++ 12, +++ 9, +++ 8, +++ 5, +++ 4, +++ 1, +++ 0, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 13, +++ 12, +++ 9, +++ 8, +++ 5, +++ 4, +++ 1, +++ 0); +++ +++ for (; 
number < eighthPoints; number++) { +++ complexVal = _mm256_loadu_si256((__m256i*)complexVectorPtr); +++ complexVectorPtr += 32; +++ complexVal = _mm256_shuffle_epi8(complexVal, moveMask); +++ complexVal = _mm256_permute4x64_epi64(complexVal, 0xd8); +++ complexVal128 = _mm256_extracti128_si256(complexVal, 0); +++ +++ iIntVal = _mm256_cvtepi16_epi32(complexVal128); +++ iFloatValue = _mm256_cvtepi32_ps(iIntVal); +++ +++ iFloatValue = _mm256_mul_ps(iFloatValue, invScalar); +++ +++ _mm256_storeu_ps(iBufferPtr, iFloatValue); +++ +++ iBufferPtr += 8; +++ } +++ +++ number = eighthPoints * 8; +++ int16_t* sixteenTComplexVectorPtr = (int16_t*)&complexVector[number]; +++ for (; number < num_points; number++) { +++ *iBufferPtr++ = ((float)(*sixteenTComplexVectorPtr++)) * iScalar; +++ sixteenTComplexVectorPtr++; +++ } ++ } ++ #endif /* LV_HAVE_AVX2 */ ++ ++diff --git a/kernels/volk/volk_16ic_s32f_magnitude_32f.h b/kernels/volk/volk_16ic_s32f_magnitude_32f.h ++index bb0459c..c3e3605 100644 ++--- a/kernels/volk/volk_16ic_s32f_magnitude_32f.h +++++ b/kernels/volk/volk_16ic_s32f_magnitude_32f.h ++@@ -30,8 +30,8 @@ ++ * ++ * Dispatcher Prototype ++ * \code ++- * void volk_16ic_s32f_magnitude_32f(float* magnitudeVector, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points) ++- * \endcode +++ * void volk_16ic_s32f_magnitude_32f(float* magnitudeVector, const lv_16sc_t* +++ * complexVector, const float scalar, unsigned int num_points) \endcode ++ * ++ * \b Inputs ++ * \li complexVector: The complex input vector of complex 16-bit shorts. ++@@ -55,67 +55,68 @@ ++ #ifndef INCLUDED_volk_16ic_s32f_magnitude_32f_a_H ++ #define INCLUDED_volk_16ic_s32f_magnitude_32f_a_H ++ ++-#include ++ #include ++-#include ++ #include +++#include +++#include ++ ++ #ifdef LV_HAVE_AVX2 ++ #include ++ ++-static inline void ++-volk_16ic_s32f_magnitude_32f_a_avx2(float* magnitudeVector, const lv_16sc_t* complexVector, ++- const float scalar, unsigned int num_points) +++static inline void volk_16ic_s32f_magnitude_32f_a_avx2(float* magnitudeVector, +++ const lv_16sc_t* complexVector, +++ const float scalar, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int eighthPoints = num_points / 8; +++ unsigned int number = 0; +++ const unsigned int eighthPoints = num_points / 8; +++ +++ const int16_t* complexVectorPtr = (const int16_t*)complexVector; +++ float* magnitudeVectorPtr = magnitudeVector; ++ ++- const int16_t* complexVectorPtr = (const int16_t*)complexVector; ++- float* magnitudeVectorPtr = magnitudeVector; +++ __m256 invScalar = _mm256_set1_ps(1.0 / scalar); ++ ++- __m256 invScalar = _mm256_set1_ps(1.0/scalar); +++ __m256 cplxValue1, cplxValue2, result; +++ __m256i int1, int2; +++ __m128i short1, short2; +++ __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0); ++ ++- __m256 cplxValue1, cplxValue2, result; ++- __m256i int1, int2; ++- __m128i short1, short2; ++- __m256i idx = _mm256_set_epi32(7,6,3,2,5,4,1,0); +++ for (; number < eighthPoints; number++) { ++ ++- for(;number < eighthPoints; number++){ ++- ++- int1 = _mm256_loadu_si256((__m256i*)complexVectorPtr); ++- complexVectorPtr += 16; ++- short1 = _mm256_extracti128_si256(int1,0); ++- short2 = _mm256_extracti128_si256(int1,1); +++ int1 = _mm256_loadu_si256((__m256i*)complexVectorPtr); +++ complexVectorPtr += 16; +++ short1 = _mm256_extracti128_si256(int1, 0); +++ short2 = _mm256_extracti128_si256(int1, 1); ++ ++- int1 = _mm256_cvtepi16_epi32(short1); ++- int2 = _mm256_cvtepi16_epi32(short2); ++- cplxValue1 = 
_mm256_cvtepi32_ps(int1); ++- cplxValue2 = _mm256_cvtepi32_ps(int2); +++ int1 = _mm256_cvtepi16_epi32(short1); +++ int2 = _mm256_cvtepi16_epi32(short2); +++ cplxValue1 = _mm256_cvtepi32_ps(int1); +++ cplxValue2 = _mm256_cvtepi32_ps(int2); ++ ++- cplxValue1 = _mm256_mul_ps(cplxValue1, invScalar); ++- cplxValue2 = _mm256_mul_ps(cplxValue2, invScalar); +++ cplxValue1 = _mm256_mul_ps(cplxValue1, invScalar); +++ cplxValue2 = _mm256_mul_ps(cplxValue2, invScalar); ++ ++- cplxValue1 = _mm256_mul_ps(cplxValue1, cplxValue1); // Square the values ++- cplxValue2 = _mm256_mul_ps(cplxValue2, cplxValue2); // Square the Values +++ cplxValue1 = _mm256_mul_ps(cplxValue1, cplxValue1); // Square the values +++ cplxValue2 = _mm256_mul_ps(cplxValue2, cplxValue2); // Square the Values ++ ++- result = _mm256_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values ++- result = _mm256_permutevar8x32_ps(result, idx); +++ result = _mm256_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values +++ result = _mm256_permutevar8x32_ps(result, idx); ++ ++- result = _mm256_sqrt_ps(result); // Square root the values +++ result = _mm256_sqrt_ps(result); // Square root the values ++ ++- _mm256_store_ps(magnitudeVectorPtr, result); +++ _mm256_store_ps(magnitudeVectorPtr, result); ++ ++- magnitudeVectorPtr += 8; ++- } +++ magnitudeVectorPtr += 8; +++ } ++ ++- number = eighthPoints * 8; ++- magnitudeVectorPtr = &magnitudeVector[number]; ++- complexVectorPtr = (const int16_t*)&complexVector[number]; ++- for(; number < num_points; number++){ ++- float val1Real = (float)(*complexVectorPtr++) / scalar; ++- float val1Imag = (float)(*complexVectorPtr++) / scalar; ++- *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)); ++- } +++ number = eighthPoints * 8; +++ magnitudeVectorPtr = &magnitudeVector[number]; +++ complexVectorPtr = (const int16_t*)&complexVector[number]; +++ for (; number < num_points; number++) { +++ float val1Real = (float)(*complexVectorPtr++) / scalar; +++ float val1Imag = (float)(*complexVectorPtr++) / scalar; +++ *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)); +++ } ++ } ++ #endif /* LV_HAVE_AVX2 */ ++ ++@@ -123,127 +124,129 @@ volk_16ic_s32f_magnitude_32f_a_avx2(float* magnitudeVector, const lv_16sc_t* com ++ #ifdef LV_HAVE_SSE3 ++ #include ++ ++-static inline void ++-volk_16ic_s32f_magnitude_32f_a_sse3(float* magnitudeVector, const lv_16sc_t* complexVector, ++- const float scalar, unsigned int num_points) +++static inline void volk_16ic_s32f_magnitude_32f_a_sse3(float* magnitudeVector, +++ const lv_16sc_t* complexVector, +++ const float scalar, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int quarterPoints = num_points / 4; +++ unsigned int number = 0; +++ const unsigned int quarterPoints = num_points / 4; ++ ++- const int16_t* complexVectorPtr = (const int16_t*)complexVector; ++- float* magnitudeVectorPtr = magnitudeVector; +++ const int16_t* complexVectorPtr = (const int16_t*)complexVector; +++ float* magnitudeVectorPtr = magnitudeVector; ++ ++- __m128 invScalar = _mm_set_ps1(1.0/scalar); +++ __m128 invScalar = _mm_set_ps1(1.0 / scalar); ++ ++- __m128 cplxValue1, cplxValue2, result; +++ __m128 cplxValue1, cplxValue2, result; ++ ++- __VOLK_ATTR_ALIGNED(16) float inputFloatBuffer[8]; +++ __VOLK_ATTR_ALIGNED(16) float inputFloatBuffer[8]; ++ ++- for(;number < quarterPoints; number++){ +++ for (; number < quarterPoints; number++) { ++ ++- inputFloatBuffer[0] = (float)(complexVectorPtr[0]); ++- inputFloatBuffer[1] = 
(float)(complexVectorPtr[1]); ++- inputFloatBuffer[2] = (float)(complexVectorPtr[2]); ++- inputFloatBuffer[3] = (float)(complexVectorPtr[3]); +++ inputFloatBuffer[0] = (float)(complexVectorPtr[0]); +++ inputFloatBuffer[1] = (float)(complexVectorPtr[1]); +++ inputFloatBuffer[2] = (float)(complexVectorPtr[2]); +++ inputFloatBuffer[3] = (float)(complexVectorPtr[3]); ++ ++- inputFloatBuffer[4] = (float)(complexVectorPtr[4]); ++- inputFloatBuffer[5] = (float)(complexVectorPtr[5]); ++- inputFloatBuffer[6] = (float)(complexVectorPtr[6]); ++- inputFloatBuffer[7] = (float)(complexVectorPtr[7]); +++ inputFloatBuffer[4] = (float)(complexVectorPtr[4]); +++ inputFloatBuffer[5] = (float)(complexVectorPtr[5]); +++ inputFloatBuffer[6] = (float)(complexVectorPtr[6]); +++ inputFloatBuffer[7] = (float)(complexVectorPtr[7]); ++ ++- cplxValue1 = _mm_load_ps(&inputFloatBuffer[0]); ++- cplxValue2 = _mm_load_ps(&inputFloatBuffer[4]); +++ cplxValue1 = _mm_load_ps(&inputFloatBuffer[0]); +++ cplxValue2 = _mm_load_ps(&inputFloatBuffer[4]); ++ ++- complexVectorPtr += 8; +++ complexVectorPtr += 8; ++ ++- cplxValue1 = _mm_mul_ps(cplxValue1, invScalar); ++- cplxValue2 = _mm_mul_ps(cplxValue2, invScalar); +++ cplxValue1 = _mm_mul_ps(cplxValue1, invScalar); +++ cplxValue2 = _mm_mul_ps(cplxValue2, invScalar); ++ ++- cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values ++- cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values +++ cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values +++ cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values ++ ++- result = _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values +++ result = _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values ++ ++- result = _mm_sqrt_ps(result); // Square root the values +++ result = _mm_sqrt_ps(result); // Square root the values ++ ++- _mm_store_ps(magnitudeVectorPtr, result); +++ _mm_store_ps(magnitudeVectorPtr, result); ++ ++- magnitudeVectorPtr += 4; ++- } +++ magnitudeVectorPtr += 4; +++ } ++ ++- number = quarterPoints * 4; ++- magnitudeVectorPtr = &magnitudeVector[number]; ++- complexVectorPtr = (const int16_t*)&complexVector[number]; ++- for(; number < num_points; number++){ ++- float val1Real = (float)(*complexVectorPtr++) / scalar; ++- float val1Imag = (float)(*complexVectorPtr++) / scalar; ++- *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)); ++- } +++ number = quarterPoints * 4; +++ magnitudeVectorPtr = &magnitudeVector[number]; +++ complexVectorPtr = (const int16_t*)&complexVector[number]; +++ for (; number < num_points; number++) { +++ float val1Real = (float)(*complexVectorPtr++) / scalar; +++ float val1Imag = (float)(*complexVectorPtr++) / scalar; +++ *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)); +++ } ++ } ++ #endif /* LV_HAVE_SSE3 */ ++ ++ #ifdef LV_HAVE_SSE ++ #include ++ ++-static inline void ++-volk_16ic_s32f_magnitude_32f_a_sse(float* magnitudeVector, const lv_16sc_t* complexVector, ++- const float scalar, unsigned int num_points) +++static inline void volk_16ic_s32f_magnitude_32f_a_sse(float* magnitudeVector, +++ const lv_16sc_t* complexVector, +++ const float scalar, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int quarterPoints = num_points / 4; +++ unsigned int number = 0; +++ const unsigned int quarterPoints = num_points / 4; ++ ++- const int16_t* complexVectorPtr = (const int16_t*)complexVector; ++- float* magnitudeVectorPtr = magnitudeVector; +++ const 
int16_t* complexVectorPtr = (const int16_t*)complexVector; +++ float* magnitudeVectorPtr = magnitudeVector; ++ ++- const float iScalar = 1.0 / scalar; ++- __m128 invScalar = _mm_set_ps1(iScalar); +++ const float iScalar = 1.0 / scalar; +++ __m128 invScalar = _mm_set_ps1(iScalar); ++ ++- __m128 cplxValue1, cplxValue2, result, re, im; +++ __m128 cplxValue1, cplxValue2, result, re, im; ++ ++- __VOLK_ATTR_ALIGNED(16) float inputFloatBuffer[8]; +++ __VOLK_ATTR_ALIGNED(16) float inputFloatBuffer[8]; ++ ++- for(;number < quarterPoints; number++){ ++- inputFloatBuffer[0] = (float)(complexVectorPtr[0]); ++- inputFloatBuffer[1] = (float)(complexVectorPtr[1]); ++- inputFloatBuffer[2] = (float)(complexVectorPtr[2]); ++- inputFloatBuffer[3] = (float)(complexVectorPtr[3]); +++ for (; number < quarterPoints; number++) { +++ inputFloatBuffer[0] = (float)(complexVectorPtr[0]); +++ inputFloatBuffer[1] = (float)(complexVectorPtr[1]); +++ inputFloatBuffer[2] = (float)(complexVectorPtr[2]); +++ inputFloatBuffer[3] = (float)(complexVectorPtr[3]); ++ ++- inputFloatBuffer[4] = (float)(complexVectorPtr[4]); ++- inputFloatBuffer[5] = (float)(complexVectorPtr[5]); ++- inputFloatBuffer[6] = (float)(complexVectorPtr[6]); ++- inputFloatBuffer[7] = (float)(complexVectorPtr[7]); +++ inputFloatBuffer[4] = (float)(complexVectorPtr[4]); +++ inputFloatBuffer[5] = (float)(complexVectorPtr[5]); +++ inputFloatBuffer[6] = (float)(complexVectorPtr[6]); +++ inputFloatBuffer[7] = (float)(complexVectorPtr[7]); ++ ++- cplxValue1 = _mm_load_ps(&inputFloatBuffer[0]); ++- cplxValue2 = _mm_load_ps(&inputFloatBuffer[4]); +++ cplxValue1 = _mm_load_ps(&inputFloatBuffer[0]); +++ cplxValue2 = _mm_load_ps(&inputFloatBuffer[4]); ++ ++- re = _mm_shuffle_ps(cplxValue1, cplxValue2, 0x88); ++- im = _mm_shuffle_ps(cplxValue1, cplxValue2, 0xdd); +++ re = _mm_shuffle_ps(cplxValue1, cplxValue2, 0x88); +++ im = _mm_shuffle_ps(cplxValue1, cplxValue2, 0xdd); ++ ++- complexVectorPtr += 8; +++ complexVectorPtr += 8; ++ ++- cplxValue1 = _mm_mul_ps(re, invScalar); ++- cplxValue2 = _mm_mul_ps(im, invScalar); +++ cplxValue1 = _mm_mul_ps(re, invScalar); +++ cplxValue2 = _mm_mul_ps(im, invScalar); ++ ++- cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values ++- cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values +++ cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values +++ cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values ++ ++- result = _mm_add_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values +++ result = _mm_add_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values ++ ++- result = _mm_sqrt_ps(result); // Square root the values +++ result = _mm_sqrt_ps(result); // Square root the values ++ ++- _mm_store_ps(magnitudeVectorPtr, result); +++ _mm_store_ps(magnitudeVectorPtr, result); ++ ++- magnitudeVectorPtr += 4; ++- } +++ magnitudeVectorPtr += 4; +++ } ++ ++- number = quarterPoints * 4; ++- magnitudeVectorPtr = &magnitudeVector[number]; ++- complexVectorPtr = (const int16_t*)&complexVector[number]; ++- for(; number < num_points; number++){ ++- float val1Real = (float)(*complexVectorPtr++) * iScalar; ++- float val1Imag = (float)(*complexVectorPtr++) * iScalar; ++- *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)); ++- } +++ number = quarterPoints * 4; +++ magnitudeVectorPtr = &magnitudeVector[number]; +++ complexVectorPtr = (const int16_t*)&complexVector[number]; +++ for (; number < num_points; number++) { +++ float val1Real = (float)(*complexVectorPtr++) * 
iScalar; +++ float val1Imag = (float)(*complexVectorPtr++) * iScalar; +++ *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)); +++ } ++ } ++ ++ ++@@ -251,33 +254,37 @@ volk_16ic_s32f_magnitude_32f_a_sse(float* magnitudeVector, const lv_16sc_t* comp ++ ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void ++-volk_16ic_s32f_magnitude_32f_generic(float* magnitudeVector, const lv_16sc_t* complexVector, ++- const float scalar, unsigned int num_points) +++static inline void volk_16ic_s32f_magnitude_32f_generic(float* magnitudeVector, +++ const lv_16sc_t* complexVector, +++ const float scalar, +++ unsigned int num_points) ++ { ++- const int16_t* complexVectorPtr = (const int16_t*)complexVector; ++- float* magnitudeVectorPtr = magnitudeVector; ++- unsigned int number = 0; ++- const float invScalar = 1.0 / scalar; ++- for(number = 0; number < num_points; number++){ ++- float real = ( (float) (*complexVectorPtr++)) * invScalar; ++- float imag = ( (float) (*complexVectorPtr++)) * invScalar; ++- *magnitudeVectorPtr++ = sqrtf((real*real) + (imag*imag)); ++- } +++ const int16_t* complexVectorPtr = (const int16_t*)complexVector; +++ float* magnitudeVectorPtr = magnitudeVector; +++ unsigned int number = 0; +++ const float invScalar = 1.0 / scalar; +++ for (number = 0; number < num_points; number++) { +++ float real = ((float)(*complexVectorPtr++)) * invScalar; +++ float imag = ((float)(*complexVectorPtr++)) * invScalar; +++ *magnitudeVectorPtr++ = sqrtf((real * real) + (imag * imag)); +++ } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++ #ifdef LV_HAVE_ORC_DISABLED ++ ++-extern void ++-volk_16ic_s32f_magnitude_32f_a_orc_impl(float* magnitudeVector, const lv_16sc_t* complexVector, ++- const float scalar, unsigned int num_points); +++extern void volk_16ic_s32f_magnitude_32f_a_orc_impl(float* magnitudeVector, +++ const lv_16sc_t* complexVector, +++ const float scalar, +++ unsigned int num_points); ++ ++-static inline void ++-volk_16ic_s32f_magnitude_32f_u_orc(float* magnitudeVector, const lv_16sc_t* complexVector, ++- const float scalar, unsigned int num_points) +++static inline void volk_16ic_s32f_magnitude_32f_u_orc(float* magnitudeVector, +++ const lv_16sc_t* complexVector, +++ const float scalar, +++ unsigned int num_points) ++ { ++- volk_16ic_s32f_magnitude_32f_a_orc_impl(magnitudeVector, complexVector, scalar, num_points); +++ volk_16ic_s32f_magnitude_32f_a_orc_impl( +++ magnitudeVector, complexVector, scalar, num_points); ++ } ++ #endif /* LV_HAVE_ORC */ ++ ++@@ -287,69 +294,69 @@ volk_16ic_s32f_magnitude_32f_u_orc(float* magnitudeVector, const lv_16sc_t* comp ++ #ifndef INCLUDED_volk_16ic_s32f_magnitude_32f_u_H ++ #define INCLUDED_volk_16ic_s32f_magnitude_32f_u_H ++ ++-#include ++ #include ++-#include ++ #include +++#include +++#include ++ ++ #ifdef LV_HAVE_AVX2 ++ #include ++ ++-static inline void ++-volk_16ic_s32f_magnitude_32f_u_avx2(float* magnitudeVector, const lv_16sc_t* complexVector, ++- const float scalar, unsigned int num_points) +++static inline void volk_16ic_s32f_magnitude_32f_u_avx2(float* magnitudeVector, +++ const lv_16sc_t* complexVector, +++ const float scalar, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int eighthPoints = num_points / 8; +++ unsigned int number = 0; +++ const unsigned int eighthPoints = num_points / 8; +++ +++ const int16_t* complexVectorPtr = (const int16_t*)complexVector; +++ float* magnitudeVectorPtr = magnitudeVector; ++ ++- const int16_t* complexVectorPtr = (const int16_t*)complexVector; ++- float* 
magnitudeVectorPtr = magnitudeVector; +++ __m256 invScalar = _mm256_set1_ps(1.0 / scalar); ++ ++- __m256 invScalar = _mm256_set1_ps(1.0/scalar); +++ __m256 cplxValue1, cplxValue2, result; +++ __m256i int1, int2; +++ __m128i short1, short2; +++ __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0); ++ ++- __m256 cplxValue1, cplxValue2, result; ++- __m256i int1, int2; ++- __m128i short1, short2; ++- __m256i idx = _mm256_set_epi32(7,6,3,2,5,4,1,0); +++ for (; number < eighthPoints; number++) { ++ ++- for(;number < eighthPoints; number++){ ++- ++- int1 = _mm256_loadu_si256((__m256i*)complexVectorPtr); ++- complexVectorPtr += 16; ++- short1 = _mm256_extracti128_si256(int1,0); ++- short2 = _mm256_extracti128_si256(int1,1); +++ int1 = _mm256_loadu_si256((__m256i*)complexVectorPtr); +++ complexVectorPtr += 16; +++ short1 = _mm256_extracti128_si256(int1, 0); +++ short2 = _mm256_extracti128_si256(int1, 1); ++ ++- int1 = _mm256_cvtepi16_epi32(short1); ++- int2 = _mm256_cvtepi16_epi32(short2); ++- cplxValue1 = _mm256_cvtepi32_ps(int1); ++- cplxValue2 = _mm256_cvtepi32_ps(int2); +++ int1 = _mm256_cvtepi16_epi32(short1); +++ int2 = _mm256_cvtepi16_epi32(short2); +++ cplxValue1 = _mm256_cvtepi32_ps(int1); +++ cplxValue2 = _mm256_cvtepi32_ps(int2); ++ ++- cplxValue1 = _mm256_mul_ps(cplxValue1, invScalar); ++- cplxValue2 = _mm256_mul_ps(cplxValue2, invScalar); +++ cplxValue1 = _mm256_mul_ps(cplxValue1, invScalar); +++ cplxValue2 = _mm256_mul_ps(cplxValue2, invScalar); ++ ++- cplxValue1 = _mm256_mul_ps(cplxValue1, cplxValue1); // Square the values ++- cplxValue2 = _mm256_mul_ps(cplxValue2, cplxValue2); // Square the Values +++ cplxValue1 = _mm256_mul_ps(cplxValue1, cplxValue1); // Square the values +++ cplxValue2 = _mm256_mul_ps(cplxValue2, cplxValue2); // Square the Values ++ ++- result = _mm256_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values ++- result = _mm256_permutevar8x32_ps(result, idx); +++ result = _mm256_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values +++ result = _mm256_permutevar8x32_ps(result, idx); ++ ++- result = _mm256_sqrt_ps(result); // Square root the values +++ result = _mm256_sqrt_ps(result); // Square root the values ++ ++- _mm256_storeu_ps(magnitudeVectorPtr, result); +++ _mm256_storeu_ps(magnitudeVectorPtr, result); ++ ++- magnitudeVectorPtr += 8; ++- } +++ magnitudeVectorPtr += 8; +++ } ++ ++- number = eighthPoints * 8; ++- magnitudeVectorPtr = &magnitudeVector[number]; ++- complexVectorPtr = (const int16_t*)&complexVector[number]; ++- for(; number < num_points; number++){ ++- float val1Real = (float)(*complexVectorPtr++) / scalar; ++- float val1Imag = (float)(*complexVectorPtr++) / scalar; ++- *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)); ++- } +++ number = eighthPoints * 8; +++ magnitudeVectorPtr = &magnitudeVector[number]; +++ complexVectorPtr = (const int16_t*)&complexVector[number]; +++ for (; number < num_points; number++) { +++ float val1Real = (float)(*complexVectorPtr++) / scalar; +++ float val1Imag = (float)(*complexVectorPtr++) / scalar; +++ *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)); +++ } ++ } ++ #endif /* LV_HAVE_AVX2 */ ++ ++ #endif /* INCLUDED_volk_16ic_s32f_magnitude_32f_u_H */ ++- ++diff --git a/kernels/volk/volk_16ic_x2_dot_prod_16ic.h b/kernels/volk/volk_16ic_x2_dot_prod_16ic.h ++index ae10cff..a1a0e8c 100644 ++--- a/kernels/volk/volk_16ic_x2_dot_prod_16ic.h +++++ b/kernels/volk/volk_16ic_x2_dot_prod_16ic.h ++@@ -25,18 +25,20 @@ ++ * ++ * \b Overview ++ * ++- * Multiplies 
two input complex vectors (16-bit integer each component) and accumulates them, ++- * storing the result. Results are saturated so never go beyond the limits of the data type. +++ * Multiplies two input complex vectors (16-bit integer each component) and accumulates +++ * them, storing the result. Results are saturated so never go beyond the limits of the +++ * data type. ++ * ++ * Dispatcher Prototype ++ * \code ++- * void volk_16ic_x2_dot_prod_16ic(lv_16sc_t* result, const lv_16sc_t* in_a, const lv_16sc_t* in_b, unsigned int num_points); ++- * \endcode +++ * void volk_16ic_x2_dot_prod_16ic(lv_16sc_t* result, const lv_16sc_t* in_a, const +++ * lv_16sc_t* in_b, unsigned int num_points); \endcode ++ * ++ * \b Inputs ++ * \li in_a: One of the vectors to be multiplied and accumulated. ++ * \li in_b: The other vector to be multiplied and accumulated. ++- * \li num_points: Number of complex values to be multiplied together, accumulated and stored into \p result +++ * \li num_points: Number of complex values to be multiplied together, accumulated and +++ * stored into \p result ++ * ++ * \b Outputs ++ * \li result: Value of the accumulated result. ++@@ -46,22 +48,25 @@ ++ #ifndef INCLUDED_volk_16ic_x2_dot_prod_16ic_H ++ #define INCLUDED_volk_16ic_x2_dot_prod_16ic_H ++ +++#include ++ #include ++ #include ++-#include ++ ++ ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void volk_16ic_x2_dot_prod_16ic_generic(lv_16sc_t* result, const lv_16sc_t* in_a, const lv_16sc_t* in_b, unsigned int num_points) +++static inline void volk_16ic_x2_dot_prod_16ic_generic(lv_16sc_t* result, +++ const lv_16sc_t* in_a, +++ const lv_16sc_t* in_b, +++ unsigned int num_points) ++ { ++ result[0] = lv_cmake((int16_t)0, (int16_t)0); ++ unsigned int n; ++- for (n = 0; n < num_points; n++) ++- { ++- lv_16sc_t tmp = in_a[n] * in_b[n]; ++- result[0] = lv_cmake(sat_adds16i(lv_creal(result[0]), lv_creal(tmp)), sat_adds16i(lv_cimag(result[0]), lv_cimag(tmp) )); ++- } +++ for (n = 0; n < num_points; n++) { +++ lv_16sc_t tmp = in_a[n] * in_b[n]; +++ result[0] = lv_cmake(sat_adds16i(lv_creal(result[0]), lv_creal(tmp)), +++ sat_adds16i(lv_cimag(result[0]), lv_cimag(tmp))); +++ } ++ } ++ ++ #endif /*LV_HAVE_GENERIC*/ ++@@ -70,7 +75,10 @@ static inline void volk_16ic_x2_dot_prod_16ic_generic(lv_16sc_t* result, const l ++ #ifdef LV_HAVE_SSE2 ++ #include ++ ++-static inline void volk_16ic_x2_dot_prod_16ic_a_sse2(lv_16sc_t* out, const lv_16sc_t* in_a, const lv_16sc_t* in_b, unsigned int num_points) +++static inline void volk_16ic_x2_dot_prod_16ic_a_sse2(lv_16sc_t* out, +++ const lv_16sc_t* in_a, +++ const lv_16sc_t* in_b, +++ unsigned int num_points) ++ { ++ lv_16sc_t dotProduct = lv_cmake((int16_t)0, (int16_t)0); ++ ++@@ -81,62 +89,67 @@ static inline void volk_16ic_x2_dot_prod_16ic_a_sse2(lv_16sc_t* out, const lv_16 ++ const lv_16sc_t* _in_b = in_b; ++ lv_16sc_t* _out = out; ++ ++- if (sse_iters > 0) ++- { ++- __m128i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl, a_sl, realcacc, imagcacc; ++- __VOLK_ATTR_ALIGNED(16) lv_16sc_t dotProductVector[4]; +++ if (sse_iters > 0) { +++ __m128i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl, a_sl, +++ realcacc, imagcacc; +++ __VOLK_ATTR_ALIGNED(16) lv_16sc_t dotProductVector[4]; ++ ++- realcacc = _mm_setzero_si128(); ++- imagcacc = _mm_setzero_si128(); +++ realcacc = _mm_setzero_si128(); +++ imagcacc = _mm_setzero_si128(); ++ ++- mask_imag = _mm_set_epi8(0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0); ++- mask_real = _mm_set_epi8(0, 
0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF); +++ mask_imag = _mm_set_epi8( +++ 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0); +++ mask_real = _mm_set_epi8( +++ 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF); ++ ++- for(number = 0; number < sse_iters; number++) ++- { ++- // a[127:0]=[a3.i,a3.r,a2.i,a2.r,a1.i,a1.r,a0.i,a0.r] ++- a = _mm_load_si128((__m128i*)_in_a); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg ++- __VOLK_PREFETCH(_in_a + 8); ++- b = _mm_load_si128((__m128i*)_in_b); ++- __VOLK_PREFETCH(_in_b + 8); ++- c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, .... +++ for (number = 0; number < sse_iters; number++) { +++ // a[127:0]=[a3.i,a3.r,a2.i,a2.r,a1.i,a1.r,a0.i,a0.r] +++ a = _mm_load_si128( +++ (__m128i*)_in_a); // load (2 byte imag, 2 byte real) x 4 into 128 bits reg +++ __VOLK_PREFETCH(_in_a + 8); +++ b = _mm_load_si128((__m128i*)_in_b); +++ __VOLK_PREFETCH(_in_b + 8); +++ c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, .... ++ ++- c_sr = _mm_srli_si128(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst. ++- real = _mm_subs_epi16(c, c_sr); +++ c_sr = _mm_srli_si128(c, 2); // Shift a right by imm8 bytes while shifting in +++ // zeros, and store the results in dst. +++ real = _mm_subs_epi16(c, c_sr); ++ ++- b_sl = _mm_slli_si128(b, 2); // b3.r, b2.i .... ++- a_sl = _mm_slli_si128(a, 2); // a3.r, a2.i .... +++ b_sl = _mm_slli_si128(b, 2); // b3.r, b2.i .... +++ a_sl = _mm_slli_si128(a, 2); // a3.r, a2.i .... ++ ++- imag1 = _mm_mullo_epi16(a, b_sl); // a3.i*b3.r, .... ++- imag2 = _mm_mullo_epi16(b, a_sl); // b3.i*a3.r, .... +++ imag1 = _mm_mullo_epi16(a, b_sl); // a3.i*b3.r, .... +++ imag2 = _mm_mullo_epi16(b, a_sl); // b3.i*a3.r, .... ++ ++- imag = _mm_adds_epi16(imag1, imag2); //with saturation arithmetic! +++ imag = _mm_adds_epi16(imag1, imag2); // with saturation arithmetic! 
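/*
 * Editorial aside (not part of the upstream patch): the SSE2 loop above is a
 * vectorized form of a saturating complex multiply-accumulate over interleaved
 * 16-bit (re, im) samples.  A minimal scalar sketch of the same arithmetic
 * follows; `sat_add16` is a hypothetical stand-in for VOLK's sat_adds16i(),
 * and raw interleaved int16_t pairs stand in for lv_16sc_t.
 */
#include <stdint.h>

/* Hypothetical saturating 16-bit add (stand-in for VOLK's sat_adds16i()). */
static int16_t sat_add16(int16_t x, int16_t y)
{
    int32_t s = (int32_t)x + (int32_t)y;
    if (s > INT16_MAX)
        return INT16_MAX;
    if (s < INT16_MIN)
        return INT16_MIN;
    return (int16_t)s;
}

/* Scalar reference: accumulate a[n] * b[n] over interleaved (re, im) pairs.
 * The intermediate products wrap (as in the generic kernel); only the running
 * sums saturate, which is the role of the saturating accumulations above. */
static void dot_prod_16ic_scalar(int16_t* result_re,
                                 int16_t* result_im,
                                 const int16_t* a,
                                 const int16_t* b,
                                 unsigned int num_points)
{
    int16_t acc_re = 0, acc_im = 0;
    unsigned int n;
    for (n = 0; n < num_points; n++) {
        const int16_t ar = a[2 * n], ai = a[2 * n + 1];
        const int16_t br = b[2 * n], bi = b[2 * n + 1];
        acc_re = sat_add16(acc_re, (int16_t)(ar * br - ai * bi));
        acc_im = sat_add16(acc_im, (int16_t)(ar * bi + ai * br));
    }
    *result_re = acc_re;
    *result_im = acc_im;
}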
++ ++- realcacc = _mm_adds_epi16(realcacc, real); ++- imagcacc = _mm_adds_epi16(imagcacc, imag); +++ realcacc = _mm_adds_epi16(realcacc, real); +++ imagcacc = _mm_adds_epi16(imagcacc, imag); ++ ++- _in_a += 4; ++- _in_b += 4; ++- } +++ _in_a += 4; +++ _in_b += 4; +++ } ++ ++- realcacc = _mm_and_si128(realcacc, mask_real); ++- imagcacc = _mm_and_si128(imagcacc, mask_imag); +++ realcacc = _mm_and_si128(realcacc, mask_real); +++ imagcacc = _mm_and_si128(imagcacc, mask_imag); ++ ++- a = _mm_or_si128(realcacc, imagcacc); +++ a = _mm_or_si128(realcacc, imagcacc); ++ ++- _mm_store_si128((__m128i*)dotProductVector, a); // Store the results back into the dot product vector +++ _mm_store_si128((__m128i*)dotProductVector, +++ a); // Store the results back into the dot product vector ++ ++- for (number = 0; number < 4; ++number) ++- { ++- dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[number])), sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[number]))); ++- } +++ for (number = 0; number < 4; ++number) { +++ dotProduct = lv_cmake( +++ sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[number])), +++ sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[number]))); ++ } +++ } ++ ++- for (number = 0; number < (num_points % 4); ++number) ++- { ++- lv_16sc_t tmp = (*_in_a++) * (*_in_b++); ++- dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(tmp)), sat_adds16i(lv_cimag(dotProduct), lv_cimag(tmp))); ++- } +++ for (number = 0; number < (num_points % 4); ++number) { +++ lv_16sc_t tmp = (*_in_a++) * (*_in_b++); +++ dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(tmp)), +++ sat_adds16i(lv_cimag(dotProduct), lv_cimag(tmp))); +++ } ++ ++ *_out = dotProduct; ++ } ++@@ -147,7 +160,10 @@ static inline void volk_16ic_x2_dot_prod_16ic_a_sse2(lv_16sc_t* out, const lv_16 ++ #ifdef LV_HAVE_SSE2 ++ #include ++ ++-static inline void volk_16ic_x2_dot_prod_16ic_u_sse2(lv_16sc_t* out, const lv_16sc_t* in_a, const lv_16sc_t* in_b, unsigned int num_points) +++static inline void volk_16ic_x2_dot_prod_16ic_u_sse2(lv_16sc_t* out, +++ const lv_16sc_t* in_a, +++ const lv_16sc_t* in_b, +++ unsigned int num_points) ++ { ++ lv_16sc_t dotProduct = lv_cmake((int16_t)0, (int16_t)0); ++ ++@@ -158,62 +174,67 @@ static inline void volk_16ic_x2_dot_prod_16ic_u_sse2(lv_16sc_t* out, const lv_16 ++ lv_16sc_t* _out = out; ++ unsigned int number; ++ ++- if (sse_iters > 0) ++- { ++- __m128i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl, a_sl, realcacc, imagcacc, result; ++- __VOLK_ATTR_ALIGNED(16) lv_16sc_t dotProductVector[4]; +++ if (sse_iters > 0) { +++ __m128i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl, a_sl, +++ realcacc, imagcacc, result; +++ __VOLK_ATTR_ALIGNED(16) lv_16sc_t dotProductVector[4]; ++ ++- realcacc = _mm_setzero_si128(); ++- imagcacc = _mm_setzero_si128(); +++ realcacc = _mm_setzero_si128(); +++ imagcacc = _mm_setzero_si128(); ++ ++- mask_imag = _mm_set_epi8(0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0); ++- mask_real = _mm_set_epi8(0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF); +++ mask_imag = _mm_set_epi8( +++ 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0); +++ mask_real = _mm_set_epi8( +++ 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF); ++ ++- for(number = 0; number < sse_iters; number++) ++- { ++- // a[127:0]=[a3.i,a3.r,a2.i,a2.r,a1.i,a1.r,a0.i,a0.r] ++- a = 
_mm_loadu_si128((__m128i*)_in_a); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg ++- __VOLK_PREFETCH(_in_a + 8); ++- b = _mm_loadu_si128((__m128i*)_in_b); ++- __VOLK_PREFETCH(_in_b + 8); ++- c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, .... +++ for (number = 0; number < sse_iters; number++) { +++ // a[127:0]=[a3.i,a3.r,a2.i,a2.r,a1.i,a1.r,a0.i,a0.r] +++ a = _mm_loadu_si128( +++ (__m128i*)_in_a); // load (2 byte imag, 2 byte real) x 4 into 128 bits reg +++ __VOLK_PREFETCH(_in_a + 8); +++ b = _mm_loadu_si128((__m128i*)_in_b); +++ __VOLK_PREFETCH(_in_b + 8); +++ c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, .... ++ ++- c_sr = _mm_srli_si128(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst. ++- real = _mm_subs_epi16(c, c_sr); +++ c_sr = _mm_srli_si128(c, 2); // Shift a right by imm8 bytes while shifting in +++ // zeros, and store the results in dst. +++ real = _mm_subs_epi16(c, c_sr); ++ ++- b_sl = _mm_slli_si128(b, 2); // b3.r, b2.i .... ++- a_sl = _mm_slli_si128(a, 2); // a3.r, a2.i .... +++ b_sl = _mm_slli_si128(b, 2); // b3.r, b2.i .... +++ a_sl = _mm_slli_si128(a, 2); // a3.r, a2.i .... ++ ++- imag1 = _mm_mullo_epi16(a, b_sl); // a3.i*b3.r, .... ++- imag2 = _mm_mullo_epi16(b, a_sl); // b3.i*a3.r, .... +++ imag1 = _mm_mullo_epi16(a, b_sl); // a3.i*b3.r, .... +++ imag2 = _mm_mullo_epi16(b, a_sl); // b3.i*a3.r, .... ++ ++- imag = _mm_adds_epi16(imag1, imag2); //with saturation arithmetic! +++ imag = _mm_adds_epi16(imag1, imag2); // with saturation arithmetic! ++ ++- realcacc = _mm_adds_epi16(realcacc, real); ++- imagcacc = _mm_adds_epi16(imagcacc, imag); +++ realcacc = _mm_adds_epi16(realcacc, real); +++ imagcacc = _mm_adds_epi16(imagcacc, imag); ++ ++- _in_a += 4; ++- _in_b += 4; ++- } +++ _in_a += 4; +++ _in_b += 4; +++ } ++ ++- realcacc = _mm_and_si128(realcacc, mask_real); ++- imagcacc = _mm_and_si128(imagcacc, mask_imag); +++ realcacc = _mm_and_si128(realcacc, mask_real); +++ imagcacc = _mm_and_si128(imagcacc, mask_imag); ++ ++- result = _mm_or_si128(realcacc, imagcacc); +++ result = _mm_or_si128(realcacc, imagcacc); ++ ++- _mm_storeu_si128((__m128i*)dotProductVector, result); // Store the results back into the dot product vector +++ _mm_storeu_si128((__m128i*)dotProductVector, +++ result); // Store the results back into the dot product vector ++ ++- for (number = 0; number < 4; ++number) ++- { ++- dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[number])), sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[number]))); ++- } +++ for (number = 0; number < 4; ++number) { +++ dotProduct = lv_cmake( +++ sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[number])), +++ sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[number]))); ++ } +++ } ++ ++- for (number = 0; number < (num_points % 4); ++number) ++- { ++- lv_16sc_t tmp = (*_in_a++) * (*_in_b++); ++- dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(tmp)), sat_adds16i(lv_cimag(dotProduct), lv_cimag(tmp))); ++- } +++ for (number = 0; number < (num_points % 4); ++number) { +++ lv_16sc_t tmp = (*_in_a++) * (*_in_b++); +++ dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(tmp)), +++ sat_adds16i(lv_cimag(dotProduct), lv_cimag(tmp))); +++ } ++ ++ *_out = dotProduct; ++ } ++@@ -223,7 +244,10 @@ static inline void volk_16ic_x2_dot_prod_16ic_u_sse2(lv_16sc_t* out, const lv_16 ++ #ifdef LV_HAVE_AVX2 ++ #include ++ ++-static inline void volk_16ic_x2_dot_prod_16ic_u_axv2(lv_16sc_t* out, const 
lv_16sc_t* in_a, const lv_16sc_t* in_b, unsigned int num_points) +++static inline void volk_16ic_x2_dot_prod_16ic_u_axv2(lv_16sc_t* out, +++ const lv_16sc_t* in_a, +++ const lv_16sc_t* in_b, +++ unsigned int num_points) ++ { ++ lv_16sc_t dotProduct = lv_cmake((int16_t)0, (int16_t)0); ++ ++@@ -234,62 +258,126 @@ static inline void volk_16ic_x2_dot_prod_16ic_u_axv2(lv_16sc_t* out, const lv_16 ++ lv_16sc_t* _out = out; ++ unsigned int number; ++ ++- if (avx_iters > 0) ++- { ++- __m256i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl, a_sl, realcacc, imagcacc, result; ++- __VOLK_ATTR_ALIGNED(32) lv_16sc_t dotProductVector[8]; ++- ++- realcacc = _mm256_setzero_si256(); ++- imagcacc = _mm256_setzero_si256(); ++- ++- mask_imag = _mm256_set_epi8(0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0); ++- mask_real = _mm256_set_epi8(0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF); ++- ++- for(number = 0; number < avx_iters; number++) ++- { ++- a = _mm256_loadu_si256((__m256i*)_in_a); ++- __VOLK_PREFETCH(_in_a + 16); ++- b = _mm256_loadu_si256((__m256i*)_in_b); ++- __VOLK_PREFETCH(_in_b + 16); ++- c = _mm256_mullo_epi16(a, b); ++- ++- c_sr = _mm256_srli_si256(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst. ++- real = _mm256_subs_epi16(c, c_sr); ++- ++- b_sl = _mm256_slli_si256(b, 2); ++- a_sl = _mm256_slli_si256(a, 2); ++- ++- imag1 = _mm256_mullo_epi16(a, b_sl); ++- imag2 = _mm256_mullo_epi16(b, a_sl); ++- ++- imag = _mm256_adds_epi16(imag1, imag2); //with saturation arithmetic! ++- ++- realcacc = _mm256_adds_epi16(realcacc, real); ++- imagcacc = _mm256_adds_epi16(imagcacc, imag); ++- ++- _in_a += 8; ++- _in_b += 8; ++- } +++ if (avx_iters > 0) { +++ __m256i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl, a_sl, +++ realcacc, imagcacc, result; +++ __VOLK_ATTR_ALIGNED(32) lv_16sc_t dotProductVector[8]; +++ +++ realcacc = _mm256_setzero_si256(); +++ imagcacc = _mm256_setzero_si256(); +++ +++ mask_imag = _mm256_set_epi8(0xFF, +++ 0xFF, +++ 0, +++ 0, +++ 0xFF, +++ 0xFF, +++ 0, +++ 0, +++ 0xFF, +++ 0xFF, +++ 0, +++ 0, +++ 0xFF, +++ 0xFF, +++ 0, +++ 0, +++ 0xFF, +++ 0xFF, +++ 0, +++ 0, +++ 0xFF, +++ 0xFF, +++ 0, +++ 0, +++ 0xFF, +++ 0xFF, +++ 0, +++ 0, +++ 0xFF, +++ 0xFF, +++ 0, +++ 0); +++ mask_real = _mm256_set_epi8(0, +++ 0, +++ 0xFF, +++ 0xFF, +++ 0, +++ 0, +++ 0xFF, +++ 0xFF, +++ 0, +++ 0, +++ 0xFF, +++ 0xFF, +++ 0, +++ 0, +++ 0xFF, +++ 0xFF, +++ 0, +++ 0, +++ 0xFF, +++ 0xFF, +++ 0, +++ 0, +++ 0xFF, +++ 0xFF, +++ 0, +++ 0, +++ 0xFF, +++ 0xFF, +++ 0, +++ 0, +++ 0xFF, +++ 0xFF); +++ +++ for (number = 0; number < avx_iters; number++) { +++ a = _mm256_loadu_si256((__m256i*)_in_a); +++ __VOLK_PREFETCH(_in_a + 16); +++ b = _mm256_loadu_si256((__m256i*)_in_b); +++ __VOLK_PREFETCH(_in_b + 16); +++ c = _mm256_mullo_epi16(a, b); +++ +++ c_sr = _mm256_srli_si256(c, 2); // Shift a right by imm8 bytes while shifting +++ // in zeros, and store the results in dst. +++ real = _mm256_subs_epi16(c, c_sr); +++ +++ b_sl = _mm256_slli_si256(b, 2); +++ a_sl = _mm256_slli_si256(a, 2); +++ +++ imag1 = _mm256_mullo_epi16(a, b_sl); +++ imag2 = _mm256_mullo_epi16(b, a_sl); +++ +++ imag = _mm256_adds_epi16(imag1, imag2); // with saturation arithmetic! 
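/*
 * Editorial aside (not part of the upstream patch): the doxygen block earlier
 * in this file gives the dispatcher prototype
 *   void volk_16ic_x2_dot_prod_16ic(lv_16sc_t* result, const lv_16sc_t* in_a,
 *                                   const lv_16sc_t* in_b, unsigned int num_points);
 * A minimal usage sketch follows.  It assumes <volk/volk.h> declares that
 * dispatcher together with lv_16sc_t and the lv_cmake()/lv_creal()/lv_cimag()
 * helpers used throughout these kernels; the dispatcher selects the generic,
 * SSE2, AVX2 or NEON implementation at run time.
 */
#include <stdio.h>
#include <volk/volk.h>

int main(void)
{
    enum { N = 4 };
    lv_16sc_t a[N], b[N], result;
    unsigned int n;

    /* Interleaved 16-bit complex inputs: a[n] = (n+1) + 1i, b[n] = 2 + 0i. */
    for (n = 0; n < N; n++) {
        a[n] = lv_cmake((int16_t)(n + 1), (int16_t)1);
        b[n] = lv_cmake((int16_t)2, (int16_t)0);
    }

    volk_16ic_x2_dot_prod_16ic(&result, a, b, N);

    /* Expected: sum of 2*(n+1) + 2i for n = 0..3, i.e. 20 + 8i. */
    printf("dot product = %d%+di\n", (int)lv_creal(result), (int)lv_cimag(result));
    return 0;
}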
+++ +++ realcacc = _mm256_adds_epi16(realcacc, real); +++ imagcacc = _mm256_adds_epi16(imagcacc, imag); +++ +++ _in_a += 8; +++ _in_b += 8; +++ } ++ ++- realcacc = _mm256_and_si256(realcacc, mask_real); ++- imagcacc = _mm256_and_si256(imagcacc, mask_imag); +++ realcacc = _mm256_and_si256(realcacc, mask_real); +++ imagcacc = _mm256_and_si256(imagcacc, mask_imag); ++ ++- result = _mm256_or_si256(realcacc, imagcacc); +++ result = _mm256_or_si256(realcacc, imagcacc); ++ ++- _mm256_storeu_si256((__m256i*)dotProductVector, result); // Store the results back into the dot product vector ++- _mm256_zeroupper(); +++ _mm256_storeu_si256((__m256i*)dotProductVector, +++ result); // Store the results back into the dot product vector +++ _mm256_zeroupper(); ++ ++- for (number = 0; number < 8; ++number) ++- { ++- dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[number])), sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[number]))); ++- } +++ for (number = 0; number < 8; ++number) { +++ dotProduct = lv_cmake( +++ sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[number])), +++ sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[number]))); ++ } +++ } ++ ++- for (number = 0; number < (num_points % 8); ++number) ++- { ++- lv_16sc_t tmp = (*_in_a++) * (*_in_b++); ++- dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(tmp)), sat_adds16i(lv_cimag(dotProduct), lv_cimag(tmp))); ++- } +++ for (number = 0; number < (num_points % 8); ++number) { +++ lv_16sc_t tmp = (*_in_a++) * (*_in_b++); +++ dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(tmp)), +++ sat_adds16i(lv_cimag(dotProduct), lv_cimag(tmp))); +++ } ++ ++ *_out = dotProduct; ++ } ++@@ -299,7 +387,10 @@ static inline void volk_16ic_x2_dot_prod_16ic_u_axv2(lv_16sc_t* out, const lv_16 ++ #ifdef LV_HAVE_AVX2 ++ #include ++ ++-static inline void volk_16ic_x2_dot_prod_16ic_a_axv2(lv_16sc_t* out, const lv_16sc_t* in_a, const lv_16sc_t* in_b, unsigned int num_points) +++static inline void volk_16ic_x2_dot_prod_16ic_a_axv2(lv_16sc_t* out, +++ const lv_16sc_t* in_a, +++ const lv_16sc_t* in_b, +++ unsigned int num_points) ++ { ++ lv_16sc_t dotProduct = lv_cmake((int16_t)0, (int16_t)0); ++ ++@@ -310,62 +401,126 @@ static inline void volk_16ic_x2_dot_prod_16ic_a_axv2(lv_16sc_t* out, const lv_16 ++ lv_16sc_t* _out = out; ++ unsigned int number; ++ ++- if (avx_iters > 0) ++- { ++- __m256i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl, a_sl, realcacc, imagcacc, result; ++- __VOLK_ATTR_ALIGNED(32) lv_16sc_t dotProductVector[8]; ++- ++- realcacc = _mm256_setzero_si256(); ++- imagcacc = _mm256_setzero_si256(); ++- ++- mask_imag = _mm256_set_epi8(0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0); ++- mask_real = _mm256_set_epi8(0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF); ++- ++- for(number = 0; number < avx_iters; number++) ++- { ++- a = _mm256_load_si256((__m256i*)_in_a); ++- __VOLK_PREFETCH(_in_a + 16); ++- b = _mm256_load_si256((__m256i*)_in_b); ++- __VOLK_PREFETCH(_in_b + 16); ++- c = _mm256_mullo_epi16(a, b); ++- ++- c_sr = _mm256_srli_si256(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst. 
++- real = _mm256_subs_epi16(c, c_sr); ++- ++- b_sl = _mm256_slli_si256(b, 2); ++- a_sl = _mm256_slli_si256(a, 2); ++- ++- imag1 = _mm256_mullo_epi16(a, b_sl); ++- imag2 = _mm256_mullo_epi16(b, a_sl); ++- ++- imag = _mm256_adds_epi16(imag1, imag2); //with saturation arithmetic! ++- ++- realcacc = _mm256_adds_epi16(realcacc, real); ++- imagcacc = _mm256_adds_epi16(imagcacc, imag); ++- ++- _in_a += 8; ++- _in_b += 8; ++- } +++ if (avx_iters > 0) { +++ __m256i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl, a_sl, +++ realcacc, imagcacc, result; +++ __VOLK_ATTR_ALIGNED(32) lv_16sc_t dotProductVector[8]; +++ +++ realcacc = _mm256_setzero_si256(); +++ imagcacc = _mm256_setzero_si256(); +++ +++ mask_imag = _mm256_set_epi8(0xFF, +++ 0xFF, +++ 0, +++ 0, +++ 0xFF, +++ 0xFF, +++ 0, +++ 0, +++ 0xFF, +++ 0xFF, +++ 0, +++ 0, +++ 0xFF, +++ 0xFF, +++ 0, +++ 0, +++ 0xFF, +++ 0xFF, +++ 0, +++ 0, +++ 0xFF, +++ 0xFF, +++ 0, +++ 0, +++ 0xFF, +++ 0xFF, +++ 0, +++ 0, +++ 0xFF, +++ 0xFF, +++ 0, +++ 0); +++ mask_real = _mm256_set_epi8(0, +++ 0, +++ 0xFF, +++ 0xFF, +++ 0, +++ 0, +++ 0xFF, +++ 0xFF, +++ 0, +++ 0, +++ 0xFF, +++ 0xFF, +++ 0, +++ 0, +++ 0xFF, +++ 0xFF, +++ 0, +++ 0, +++ 0xFF, +++ 0xFF, +++ 0, +++ 0, +++ 0xFF, +++ 0xFF, +++ 0, +++ 0, +++ 0xFF, +++ 0xFF, +++ 0, +++ 0, +++ 0xFF, +++ 0xFF); +++ +++ for (number = 0; number < avx_iters; number++) { +++ a = _mm256_load_si256((__m256i*)_in_a); +++ __VOLK_PREFETCH(_in_a + 16); +++ b = _mm256_load_si256((__m256i*)_in_b); +++ __VOLK_PREFETCH(_in_b + 16); +++ c = _mm256_mullo_epi16(a, b); +++ +++ c_sr = _mm256_srli_si256(c, 2); // Shift a right by imm8 bytes while shifting +++ // in zeros, and store the results in dst. +++ real = _mm256_subs_epi16(c, c_sr); +++ +++ b_sl = _mm256_slli_si256(b, 2); +++ a_sl = _mm256_slli_si256(a, 2); +++ +++ imag1 = _mm256_mullo_epi16(a, b_sl); +++ imag2 = _mm256_mullo_epi16(b, a_sl); +++ +++ imag = _mm256_adds_epi16(imag1, imag2); // with saturation arithmetic! 
+++ +++ realcacc = _mm256_adds_epi16(realcacc, real); +++ imagcacc = _mm256_adds_epi16(imagcacc, imag); +++ +++ _in_a += 8; +++ _in_b += 8; +++ } ++ ++- realcacc = _mm256_and_si256(realcacc, mask_real); ++- imagcacc = _mm256_and_si256(imagcacc, mask_imag); +++ realcacc = _mm256_and_si256(realcacc, mask_real); +++ imagcacc = _mm256_and_si256(imagcacc, mask_imag); ++ ++- result = _mm256_or_si256(realcacc, imagcacc); +++ result = _mm256_or_si256(realcacc, imagcacc); ++ ++- _mm256_store_si256((__m256i*)dotProductVector, result); // Store the results back into the dot product vector ++- _mm256_zeroupper(); +++ _mm256_store_si256((__m256i*)dotProductVector, +++ result); // Store the results back into the dot product vector +++ _mm256_zeroupper(); ++ ++- for (number = 0; number < 8; ++number) ++- { ++- dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[number])), sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[number]))); ++- } +++ for (number = 0; number < 8; ++number) { +++ dotProduct = lv_cmake( +++ sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[number])), +++ sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[number]))); ++ } +++ } ++ ++- for (number = 0; number < (num_points % 8); ++number) ++- { ++- lv_16sc_t tmp = (*_in_a++) * (*_in_b++); ++- dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(tmp)), sat_adds16i(lv_cimag(dotProduct), lv_cimag(tmp))); ++- } +++ for (number = 0; number < (num_points % 8); ++number) { +++ lv_16sc_t tmp = (*_in_a++) * (*_in_b++); +++ dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(tmp)), +++ sat_adds16i(lv_cimag(dotProduct), lv_cimag(tmp))); +++ } ++ ++ *_out = dotProduct; ++ } ++@@ -375,69 +530,70 @@ static inline void volk_16ic_x2_dot_prod_16ic_a_axv2(lv_16sc_t* out, const lv_16 ++ #ifdef LV_HAVE_NEON ++ #include ++ ++-static inline void volk_16ic_x2_dot_prod_16ic_neon(lv_16sc_t* out, const lv_16sc_t* in_a, const lv_16sc_t* in_b, unsigned int num_points) +++static inline void volk_16ic_x2_dot_prod_16ic_neon(lv_16sc_t* out, +++ const lv_16sc_t* in_a, +++ const lv_16sc_t* in_b, +++ unsigned int num_points) ++ { ++ unsigned int quarter_points = num_points / 4; ++ unsigned int number; ++ ++- lv_16sc_t* a_ptr = (lv_16sc_t*) in_a; ++- lv_16sc_t* b_ptr = (lv_16sc_t*) in_b; +++ lv_16sc_t* a_ptr = (lv_16sc_t*)in_a; +++ lv_16sc_t* b_ptr = (lv_16sc_t*)in_b; ++ *out = lv_cmake((int16_t)0, (int16_t)0); ++ ++- if (quarter_points > 0) ++- { ++- // for 2-lane vectors, 1st lane holds the real part, ++- // 2nd lane holds the imaginary part ++- int16x4x2_t a_val, b_val, c_val, accumulator; ++- int16x4x2_t tmp_real, tmp_imag; ++- __VOLK_ATTR_ALIGNED(16) lv_16sc_t accum_result[4]; ++- accumulator.val[0] = vdup_n_s16(0); ++- accumulator.val[1] = vdup_n_s16(0); ++- lv_16sc_t dotProduct = lv_cmake((int16_t)0, (int16_t)0); ++- ++- for(number = 0; number < quarter_points; ++number) ++- { ++- a_val = vld2_s16((int16_t*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i ++- b_val = vld2_s16((int16_t*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i ++- __VOLK_PREFETCH(a_ptr + 8); ++- __VOLK_PREFETCH(b_ptr + 8); ++- ++- // multiply the real*real and imag*imag to get real result ++- // a0r*b0r|a1r*b1r|a2r*b2r|a3r*b3r ++- tmp_real.val[0] = vmul_s16(a_val.val[0], b_val.val[0]); ++- // a0i*b0i|a1i*b1i|a2i*b2i|a3i*b3i ++- tmp_real.val[1] = vmul_s16(a_val.val[1], b_val.val[1]); ++- ++- // Multiply cross terms to get the imaginary result ++- // a0r*b0i|a1r*b1i|a2r*b2i|a3r*b3i ++- tmp_imag.val[0] = 
vmul_s16(a_val.val[0], b_val.val[1]); ++- // a0i*b0r|a1i*b1r|a2i*b2r|a3i*b3r ++- tmp_imag.val[1] = vmul_s16(a_val.val[1], b_val.val[0]); ++- ++- c_val.val[0] = vqsub_s16(tmp_real.val[0], tmp_real.val[1]); ++- c_val.val[1] = vqadd_s16(tmp_imag.val[0], tmp_imag.val[1]); ++- ++- accumulator.val[0] = vqadd_s16(accumulator.val[0], c_val.val[0]); ++- accumulator.val[1] = vqadd_s16(accumulator.val[1], c_val.val[1]); ++- ++- a_ptr += 4; ++- b_ptr += 4; ++- } ++- ++- vst2_s16((int16_t*)accum_result, accumulator); ++- for (number = 0; number < 4; ++number) ++- { ++- dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(accum_result[number])), sat_adds16i(lv_cimag(dotProduct), lv_cimag(accum_result[number]))); ++- } ++- ++- *out = dotProduct; +++ if (quarter_points > 0) { +++ // for 2-lane vectors, 1st lane holds the real part, +++ // 2nd lane holds the imaginary part +++ int16x4x2_t a_val, b_val, c_val, accumulator; +++ int16x4x2_t tmp_real, tmp_imag; +++ __VOLK_ATTR_ALIGNED(16) lv_16sc_t accum_result[4]; +++ accumulator.val[0] = vdup_n_s16(0); +++ accumulator.val[1] = vdup_n_s16(0); +++ lv_16sc_t dotProduct = lv_cmake((int16_t)0, (int16_t)0); +++ +++ for (number = 0; number < quarter_points; ++number) { +++ a_val = vld2_s16((int16_t*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i +++ b_val = vld2_s16((int16_t*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i +++ __VOLK_PREFETCH(a_ptr + 8); +++ __VOLK_PREFETCH(b_ptr + 8); +++ +++ // multiply the real*real and imag*imag to get real result +++ // a0r*b0r|a1r*b1r|a2r*b2r|a3r*b3r +++ tmp_real.val[0] = vmul_s16(a_val.val[0], b_val.val[0]); +++ // a0i*b0i|a1i*b1i|a2i*b2i|a3i*b3i +++ tmp_real.val[1] = vmul_s16(a_val.val[1], b_val.val[1]); +++ +++ // Multiply cross terms to get the imaginary result +++ // a0r*b0i|a1r*b1i|a2r*b2i|a3r*b3i +++ tmp_imag.val[0] = vmul_s16(a_val.val[0], b_val.val[1]); +++ // a0i*b0r|a1i*b1r|a2i*b2r|a3i*b3r +++ tmp_imag.val[1] = vmul_s16(a_val.val[1], b_val.val[0]); +++ +++ c_val.val[0] = vqsub_s16(tmp_real.val[0], tmp_real.val[1]); +++ c_val.val[1] = vqadd_s16(tmp_imag.val[0], tmp_imag.val[1]); +++ +++ accumulator.val[0] = vqadd_s16(accumulator.val[0], c_val.val[0]); +++ accumulator.val[1] = vqadd_s16(accumulator.val[1], c_val.val[1]); +++ +++ a_ptr += 4; +++ b_ptr += 4; ++ } ++ ++- // tail case ++- for(number = quarter_points * 4; number < num_points; ++number) ++- { ++- *out += (*a_ptr++) * (*b_ptr++); +++ vst2_s16((int16_t*)accum_result, accumulator); +++ for (number = 0; number < 4; ++number) { +++ dotProduct = lv_cmake( +++ sat_adds16i(lv_creal(dotProduct), lv_creal(accum_result[number])), +++ sat_adds16i(lv_cimag(dotProduct), lv_cimag(accum_result[number]))); ++ } +++ +++ *out = dotProduct; +++ } +++ +++ // tail case +++ for (number = quarter_points * 4; number < num_points; ++number) { +++ *out += (*a_ptr++) * (*b_ptr++); +++ } ++ } ++ ++ #endif /* LV_HAVE_NEON */ ++@@ -446,13 +602,16 @@ static inline void volk_16ic_x2_dot_prod_16ic_neon(lv_16sc_t* out, const lv_16sc ++ #ifdef LV_HAVE_NEON ++ #include ++ ++-static inline void volk_16ic_x2_dot_prod_16ic_neon_vma(lv_16sc_t* out, const lv_16sc_t* in_a, const lv_16sc_t* in_b, unsigned int num_points) +++static inline void volk_16ic_x2_dot_prod_16ic_neon_vma(lv_16sc_t* out, +++ const lv_16sc_t* in_a, +++ const lv_16sc_t* in_b, +++ unsigned int num_points) ++ { ++ unsigned int quarter_points = num_points / 4; ++ unsigned int number; ++ ++- lv_16sc_t* a_ptr = (lv_16sc_t*) in_a; ++- lv_16sc_t* b_ptr = (lv_16sc_t*) in_b; +++ lv_16sc_t* a_ptr = (lv_16sc_t*)in_a; +++ 
lv_16sc_t* b_ptr = (lv_16sc_t*)in_b; ++ // for 2-lane vectors, 1st lane holds the real part, ++ // 2nd lane holds the imaginary part ++ int16x4x2_t a_val, b_val, accumulator; ++@@ -461,35 +620,33 @@ static inline void volk_16ic_x2_dot_prod_16ic_neon_vma(lv_16sc_t* out, const lv_ ++ accumulator.val[0] = vdup_n_s16(0); ++ accumulator.val[1] = vdup_n_s16(0); ++ ++- for(number = 0; number < quarter_points; ++number) ++- { ++- a_val = vld2_s16((int16_t*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i ++- b_val = vld2_s16((int16_t*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i ++- __VOLK_PREFETCH(a_ptr + 8); ++- __VOLK_PREFETCH(b_ptr + 8); +++ for (number = 0; number < quarter_points; ++number) { +++ a_val = vld2_s16((int16_t*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i +++ b_val = vld2_s16((int16_t*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i +++ __VOLK_PREFETCH(a_ptr + 8); +++ __VOLK_PREFETCH(b_ptr + 8); ++ ++- tmp.val[0] = vmul_s16(a_val.val[0], b_val.val[0]); ++- tmp.val[1] = vmul_s16(a_val.val[1], b_val.val[0]); +++ tmp.val[0] = vmul_s16(a_val.val[0], b_val.val[0]); +++ tmp.val[1] = vmul_s16(a_val.val[1], b_val.val[0]); ++ ++- // use multiply accumulate/subtract to get result ++- tmp.val[0] = vmls_s16(tmp.val[0], a_val.val[1], b_val.val[1]); ++- tmp.val[1] = vmla_s16(tmp.val[1], a_val.val[0], b_val.val[1]); +++ // use multiply accumulate/subtract to get result +++ tmp.val[0] = vmls_s16(tmp.val[0], a_val.val[1], b_val.val[1]); +++ tmp.val[1] = vmla_s16(tmp.val[1], a_val.val[0], b_val.val[1]); ++ ++- accumulator.val[0] = vqadd_s16(accumulator.val[0], tmp.val[0]); ++- accumulator.val[1] = vqadd_s16(accumulator.val[1], tmp.val[1]); +++ accumulator.val[0] = vqadd_s16(accumulator.val[0], tmp.val[0]); +++ accumulator.val[1] = vqadd_s16(accumulator.val[1], tmp.val[1]); ++ ++- a_ptr += 4; ++- b_ptr += 4; ++- } +++ a_ptr += 4; +++ b_ptr += 4; +++ } ++ ++ vst2_s16((int16_t*)accum_result, accumulator); ++ *out = accum_result[0] + accum_result[1] + accum_result[2] + accum_result[3]; ++ ++ // tail case ++- for(number = quarter_points * 4; number < num_points; ++number) ++- { ++- *out += (*a_ptr++) * (*b_ptr++); ++- } +++ for (number = quarter_points * 4; number < num_points; ++number) { +++ *out += (*a_ptr++) * (*b_ptr++); +++ } ++ } ++ ++ #endif /* LV_HAVE_NEON */ ++@@ -498,13 +655,16 @@ static inline void volk_16ic_x2_dot_prod_16ic_neon_vma(lv_16sc_t* out, const lv_ ++ #ifdef LV_HAVE_NEON ++ #include ++ ++-static inline void volk_16ic_x2_dot_prod_16ic_neon_optvma(lv_16sc_t* out, const lv_16sc_t* in_a, const lv_16sc_t* in_b, unsigned int num_points) +++static inline void volk_16ic_x2_dot_prod_16ic_neon_optvma(lv_16sc_t* out, +++ const lv_16sc_t* in_a, +++ const lv_16sc_t* in_b, +++ unsigned int num_points) ++ { ++ unsigned int quarter_points = num_points / 4; ++ unsigned int number; ++ ++- lv_16sc_t* a_ptr = (lv_16sc_t*) in_a; ++- lv_16sc_t* b_ptr = (lv_16sc_t*) in_b; +++ lv_16sc_t* a_ptr = (lv_16sc_t*)in_a; +++ lv_16sc_t* b_ptr = (lv_16sc_t*)in_b; ++ // for 2-lane vectors, 1st lane holds the real part, ++ // 2nd lane holds the imaginary part ++ int16x4x2_t a_val, b_val, accumulator1, accumulator2; ++@@ -515,22 +675,21 @@ static inline void volk_16ic_x2_dot_prod_16ic_neon_optvma(lv_16sc_t* out, const ++ accumulator2.val[0] = vdup_n_s16(0); ++ accumulator2.val[1] = vdup_n_s16(0); ++ ++- for(number = 0; number < quarter_points; ++number) ++- { ++- a_val = vld2_s16((int16_t*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i ++- b_val = vld2_s16((int16_t*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i ++- 
__VOLK_PREFETCH(a_ptr + 8); ++- __VOLK_PREFETCH(b_ptr + 8); +++ for (number = 0; number < quarter_points; ++number) { +++ a_val = vld2_s16((int16_t*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i +++ b_val = vld2_s16((int16_t*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i +++ __VOLK_PREFETCH(a_ptr + 8); +++ __VOLK_PREFETCH(b_ptr + 8); ++ ++- // use 2 accumulators to remove inter-instruction data dependencies ++- accumulator1.val[0] = vmla_s16(accumulator1.val[0], a_val.val[0], b_val.val[0]); ++- accumulator2.val[0] = vmls_s16(accumulator2.val[0], a_val.val[1], b_val.val[1]); ++- accumulator1.val[1] = vmla_s16(accumulator1.val[1], a_val.val[0], b_val.val[1]); ++- accumulator2.val[1] = vmla_s16(accumulator2.val[1], a_val.val[1], b_val.val[0]); +++ // use 2 accumulators to remove inter-instruction data dependencies +++ accumulator1.val[0] = vmla_s16(accumulator1.val[0], a_val.val[0], b_val.val[0]); +++ accumulator2.val[0] = vmls_s16(accumulator2.val[0], a_val.val[1], b_val.val[1]); +++ accumulator1.val[1] = vmla_s16(accumulator1.val[1], a_val.val[0], b_val.val[1]); +++ accumulator2.val[1] = vmla_s16(accumulator2.val[1], a_val.val[1], b_val.val[0]); ++ ++- a_ptr += 4; ++- b_ptr += 4; ++- } +++ a_ptr += 4; +++ b_ptr += 4; +++ } ++ ++ accumulator1.val[0] = vqadd_s16(accumulator1.val[0], accumulator2.val[0]); ++ accumulator1.val[1] = vqadd_s16(accumulator1.val[1], accumulator2.val[1]); ++@@ -539,10 +698,9 @@ static inline void volk_16ic_x2_dot_prod_16ic_neon_optvma(lv_16sc_t* out, const ++ *out = accum_result[0] + accum_result[1] + accum_result[2] + accum_result[3]; ++ ++ // tail case ++- for(number = quarter_points * 4; number < num_points; ++number) ++- { ++- *out += (*a_ptr++) * (*b_ptr++); ++- } +++ for (number = quarter_points * 4; number < num_points; ++number) { +++ *out += (*a_ptr++) * (*b_ptr++); +++ } ++ } ++ ++ #endif /* LV_HAVE_NEON */ ++diff --git a/kernels/volk/volk_16ic_x2_multiply_16ic.h b/kernels/volk/volk_16ic_x2_multiply_16ic.h ++index 20d6a7f..2bf835d 100644 ++--- a/kernels/volk/volk_16ic_x2_multiply_16ic.h +++++ b/kernels/volk/volk_16ic_x2_multiply_16ic.h ++@@ -25,18 +25,19 @@ ++ * ++ * \b Overview ++ * ++- * Multiplies two input complex vectors, point-by-point, storing the result in the third vector. ++- * WARNING: Saturation is not checked. +++ * Multiplies two input complex vectors, point-by-point, storing the result in the third +++ * vector. WARNING: Saturation is not checked. ++ * ++ * Dispatcher Prototype ++ * \code ++- * void volk_16ic_x2_multiply_16ic(lv_16sc_t* result, const lv_16sc_t* in_a, const lv_16sc_t* in_b, unsigned int num_points); ++- * \endcode +++ * void volk_16ic_x2_multiply_16ic(lv_16sc_t* result, const lv_16sc_t* in_a, const +++ * lv_16sc_t* in_b, unsigned int num_points); \endcode ++ * ++ * \b Inputs ++ * \li in_a: One of the vectors to be multiplied. ++ * \li in_b: The other vector to be multiplied. ++- * \li num_points: The number of complex data points to be multiplied from both input vectors. +++ * \li num_points: The number of complex data points to be multiplied from both input +++ * vectors. ++ * ++ * \b Outputs ++ * \li result: The vector where the results will be stored. 
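As a point of reference for the documentation above, a minimal caller of the volk_16ic_x2_multiply_16ic dispatcher could look like the sketch below. It assumes only the public volk.h entry points (volk_get_alignment, volk_malloc, volk_free) and the lv_16sc_t / lv_cmake / lv_creal / lv_cimag helpers; the dispatcher prototype is the one given in the header comment, and the dispatcher selects the best available implementation (generic, SSE2, AVX2 or NEON) at run time.

    #include <stdio.h>
    #include <volk/volk.h>

    int main(void)
    {
        const unsigned int num_points = 16;
        const size_t alignment = volk_get_alignment();

        /* volk_malloc returns buffers aligned for the _a (aligned) kernels. */
        lv_16sc_t* in_a = (lv_16sc_t*)volk_malloc(num_points * sizeof(lv_16sc_t), alignment);
        lv_16sc_t* in_b = (lv_16sc_t*)volk_malloc(num_points * sizeof(lv_16sc_t), alignment);
        lv_16sc_t* result = (lv_16sc_t*)volk_malloc(num_points * sizeof(lv_16sc_t), alignment);

        for (unsigned int n = 0; n < num_points; n++) {
            in_a[n] = lv_cmake((int16_t)(n + 1), (int16_t)1);
            in_b[n] = lv_cmake((int16_t)2, (int16_t)-1);
        }

        /* Point-by-point complex multiply; saturation is not checked. */
        volk_16ic_x2_multiply_16ic(result, in_a, in_b, num_points);

        printf("result[0] = %d%+di\n", lv_creal(result[0]), lv_cimag(result[0]));

        volk_free(in_a);
        volk_free(in_b);
        volk_free(result);
        return 0;
    }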
++@@ -51,13 +52,15 @@ ++ ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void volk_16ic_x2_multiply_16ic_generic(lv_16sc_t* result, const lv_16sc_t* in_a, const lv_16sc_t* in_b, unsigned int num_points) +++static inline void volk_16ic_x2_multiply_16ic_generic(lv_16sc_t* result, +++ const lv_16sc_t* in_a, +++ const lv_16sc_t* in_b, +++ unsigned int num_points) ++ { ++ unsigned int n; ++- for (n = 0; n < num_points; n++) ++- { ++- result[n] = in_a[n] * in_b[n]; ++- } +++ for (n = 0; n < num_points; n++) { +++ result[n] = in_a[n] * in_b[n]; +++ } ++ } ++ ++ #endif /*LV_HAVE_GENERIC*/ ++@@ -66,51 +69,58 @@ static inline void volk_16ic_x2_multiply_16ic_generic(lv_16sc_t* result, const l ++ #ifdef LV_HAVE_SSE2 ++ #include ++ ++-static inline void volk_16ic_x2_multiply_16ic_a_sse2(lv_16sc_t* out, const lv_16sc_t* in_a, const lv_16sc_t* in_b, unsigned int num_points) +++static inline void volk_16ic_x2_multiply_16ic_a_sse2(lv_16sc_t* out, +++ const lv_16sc_t* in_a, +++ const lv_16sc_t* in_b, +++ unsigned int num_points) ++ { ++ const unsigned int sse_iters = num_points / 4; ++- __m128i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl, a_sl, result; +++ __m128i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl, a_sl, +++ result; ++ ++- mask_imag = _mm_set_epi8(0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0); ++- mask_real = _mm_set_epi8(0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF); +++ mask_imag = _mm_set_epi8( +++ 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0); +++ mask_real = _mm_set_epi8( +++ 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF); ++ ++ const lv_16sc_t* _in_a = in_a; ++ const lv_16sc_t* _in_b = in_b; ++ lv_16sc_t* _out = out; ++ unsigned int number; ++ ++- for(number = 0; number < sse_iters; number++) ++- { ++- a = _mm_load_si128((__m128i*)_in_a); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg ++- b = _mm_load_si128((__m128i*)_in_b); ++- c = _mm_mullo_epi16 (a, b); // a3.i*b3.i, a3.r*b3.r, .... +++ for (number = 0; number < sse_iters; number++) { +++ a = _mm_load_si128( +++ (__m128i*)_in_a); // load (2 byte imag, 2 byte real) x 4 into 128 bits reg +++ b = _mm_load_si128((__m128i*)_in_b); +++ c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, .... ++ ++- c_sr = _mm_srli_si128 (c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst. ++- real = _mm_subs_epi16 (c, c_sr); ++- real = _mm_and_si128 (real, mask_real); // a3.r*b3.r-a3.i*b3.i , 0, a3.r*b3.r- a3.i*b3.i +++ c_sr = _mm_srli_si128(c, 2); // Shift a right by imm8 bytes while shifting in +++ // zeros, and store the results in dst. +++ real = _mm_subs_epi16(c, c_sr); +++ real = _mm_and_si128(real, +++ mask_real); // a3.r*b3.r-a3.i*b3.i , 0, a3.r*b3.r- a3.i*b3.i ++ ++- b_sl = _mm_slli_si128(b, 2); // b3.r, b2.i .... ++- a_sl = _mm_slli_si128(a, 2); // a3.r, a2.i .... +++ b_sl = _mm_slli_si128(b, 2); // b3.r, b2.i .... +++ a_sl = _mm_slli_si128(a, 2); // a3.r, a2.i .... ++ ++- imag1 = _mm_mullo_epi16(a, b_sl); // a3.i*b3.r, .... ++- imag2 = _mm_mullo_epi16(b, a_sl); // b3.i*a3.r, .... +++ imag1 = _mm_mullo_epi16(a, b_sl); // a3.i*b3.r, .... +++ imag2 = _mm_mullo_epi16(b, a_sl); // b3.i*a3.r, .... ++ ++- imag = _mm_adds_epi16(imag1, imag2); ++- imag = _mm_and_si128 (imag, mask_imag); // a3.i*b3.r+b3.i*a3.r, 0, ... +++ imag = _mm_adds_epi16(imag1, imag2); +++ imag = _mm_and_si128(imag, mask_imag); // a3.i*b3.r+b3.i*a3.r, 0, ... 
++ ++- result = _mm_or_si128 (real, imag); +++ result = _mm_or_si128(real, imag); ++ ++- _mm_store_si128((__m128i*)_out, result); +++ _mm_store_si128((__m128i*)_out, result); ++ ++- _in_a += 4; ++- _in_b += 4; ++- _out += 4; ++- } +++ _in_a += 4; +++ _in_b += 4; +++ _out += 4; +++ } ++ ++- for (number = sse_iters * 4; number < num_points; ++number) ++- { ++- *_out++ = (*_in_a++) * (*_in_b++); ++- } +++ for (number = sse_iters * 4; number < num_points; ++number) { +++ *_out++ = (*_in_a++) * (*_in_b++); +++ } ++ } ++ #endif /* LV_HAVE_SSE2 */ ++ ++@@ -118,51 +128,58 @@ static inline void volk_16ic_x2_multiply_16ic_a_sse2(lv_16sc_t* out, const lv_16 ++ #ifdef LV_HAVE_SSE2 ++ #include ++ ++-static inline void volk_16ic_x2_multiply_16ic_u_sse2(lv_16sc_t* out, const lv_16sc_t* in_a, const lv_16sc_t* in_b, unsigned int num_points) +++static inline void volk_16ic_x2_multiply_16ic_u_sse2(lv_16sc_t* out, +++ const lv_16sc_t* in_a, +++ const lv_16sc_t* in_b, +++ unsigned int num_points) ++ { ++ const unsigned int sse_iters = num_points / 4; ++- __m128i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1,imag2, b_sl, a_sl, result; +++ __m128i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl, a_sl, +++ result; ++ ++- mask_imag = _mm_set_epi8(0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0); ++- mask_real = _mm_set_epi8(0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF); +++ mask_imag = _mm_set_epi8( +++ 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0); +++ mask_real = _mm_set_epi8( +++ 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF); ++ ++ const lv_16sc_t* _in_a = in_a; ++ const lv_16sc_t* _in_b = in_b; ++ lv_16sc_t* _out = out; ++ unsigned int number; ++ ++- for(number = 0; number < sse_iters; number++) ++- { ++- a = _mm_loadu_si128((__m128i*)_in_a); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg ++- b = _mm_loadu_si128((__m128i*)_in_b); ++- c = _mm_mullo_epi16 (a, b); // a3.i*b3.i, a3.r*b3.r, .... +++ for (number = 0; number < sse_iters; number++) { +++ a = _mm_loadu_si128( +++ (__m128i*)_in_a); // load (2 byte imag, 2 byte real) x 4 into 128 bits reg +++ b = _mm_loadu_si128((__m128i*)_in_b); +++ c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, .... ++ ++- c_sr = _mm_srli_si128 (c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst. ++- real = _mm_subs_epi16 (c, c_sr); ++- real = _mm_and_si128 (real, mask_real); // a3.r*b3.r-a3.i*b3.i , 0, a3.r*b3.r- a3.i*b3.i +++ c_sr = _mm_srli_si128(c, 2); // Shift a right by imm8 bytes while shifting in +++ // zeros, and store the results in dst. +++ real = _mm_subs_epi16(c, c_sr); +++ real = _mm_and_si128(real, +++ mask_real); // a3.r*b3.r-a3.i*b3.i , 0, a3.r*b3.r- a3.i*b3.i ++ ++- b_sl = _mm_slli_si128(b, 2); // b3.r, b2.i .... ++- a_sl = _mm_slli_si128(a, 2); // a3.r, a2.i .... +++ b_sl = _mm_slli_si128(b, 2); // b3.r, b2.i .... +++ a_sl = _mm_slli_si128(a, 2); // a3.r, a2.i .... ++ ++- imag1 = _mm_mullo_epi16(a, b_sl); // a3.i*b3.r, .... ++- imag2 = _mm_mullo_epi16(b, a_sl); // b3.i*a3.r, .... +++ imag1 = _mm_mullo_epi16(a, b_sl); // a3.i*b3.r, .... +++ imag2 = _mm_mullo_epi16(b, a_sl); // b3.i*a3.r, .... ++ ++- imag = _mm_adds_epi16(imag1, imag2); ++- imag = _mm_and_si128 (imag, mask_imag); // a3.i*b3.r+b3.i*a3.r, 0, ... +++ imag = _mm_adds_epi16(imag1, imag2); +++ imag = _mm_and_si128(imag, mask_imag); // a3.i*b3.r+b3.i*a3.r, 0, ... 
++ ++- result = _mm_or_si128 (real, imag); +++ result = _mm_or_si128(real, imag); ++ ++- _mm_storeu_si128((__m128i*)_out, result); +++ _mm_storeu_si128((__m128i*)_out, result); ++ ++- _in_a += 4; ++- _in_b += 4; ++- _out += 4; ++- } +++ _in_a += 4; +++ _in_b += 4; +++ _out += 4; +++ } ++ ++- for (number = sse_iters * 4; number < num_points; ++number) ++- { ++- *_out++ = (*_in_a++) * (*_in_b++); ++- } +++ for (number = sse_iters * 4; number < num_points; ++number) { +++ *_out++ = (*_in_a++) * (*_in_b++); +++ } ++ } ++ #endif /* LV_HAVE_SSE2 */ ++ ++@@ -170,7 +187,10 @@ static inline void volk_16ic_x2_multiply_16ic_u_sse2(lv_16sc_t* out, const lv_16 ++ #ifdef LV_HAVE_AVX2 ++ #include ++ ++-static inline void volk_16ic_x2_multiply_16ic_u_avx2(lv_16sc_t* out, const lv_16sc_t* in_a, const lv_16sc_t* in_b, unsigned int num_points) +++static inline void volk_16ic_x2_multiply_16ic_u_avx2(lv_16sc_t* out, +++ const lv_16sc_t* in_a, +++ const lv_16sc_t* in_b, +++ unsigned int num_points) ++ { ++ unsigned int number = 0; ++ const unsigned int avx2_points = num_points / 8; ++@@ -179,44 +199,108 @@ static inline void volk_16ic_x2_multiply_16ic_u_avx2(lv_16sc_t* out, const lv_16 ++ const lv_16sc_t* _in_b = in_b; ++ lv_16sc_t* _out = out; ++ ++- __m256i a, b, c, c_sr, real, imag, imag1, imag2, b_sl, a_sl, result; ++- ++- const __m256i mask_imag = _mm256_set_epi8(0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0); ++- const __m256i mask_real = _mm256_set_epi8(0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF); ++- ++- for(;number < avx2_points; number++) ++- { ++- a = _mm256_loadu_si256((__m256i*)_in_a); // Load the ar + ai, br + bi as ar,ai,br,bi ++- b = _mm256_loadu_si256((__m256i*)_in_b); // Load the cr + ci, dr + di as cr,ci,dr,di ++- c = _mm256_mullo_epi16(a, b); ++- ++- c_sr = _mm256_srli_si256(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst. ++- real = _mm256_subs_epi16(c, c_sr); ++- real = _mm256_and_si256(real, mask_real); // a3.r*b3.r-a3.i*b3.i , 0, a3.r*b3.r- a3.i*b3.i ++- ++- b_sl = _mm256_slli_si256(b, 2); // b3.r, b2.i .... ++- a_sl = _mm256_slli_si256(a, 2); // a3.r, a2.i .... ++- ++- imag1 = _mm256_mullo_epi16(a, b_sl); // a3.i*b3.r, .... ++- imag2 = _mm256_mullo_epi16(b, a_sl); // b3.i*a3.r, .... ++- ++- imag = _mm256_adds_epi16(imag1, imag2); ++- imag = _mm256_and_si256(imag, mask_imag); // a3.i*b3.r+b3.i*a3.r, 0, ... 
++- ++- result = _mm256_or_si256(real, imag); ++- ++- _mm256_storeu_si256((__m256i*)_out, result); ++- ++- _in_a += 8; ++- _in_b += 8; ++- _out += 8; ++- } +++ __m256i a, b, c, c_sr, real, imag, imag1, imag2, b_sl, a_sl, result; +++ +++ const __m256i mask_imag = _mm256_set_epi8(0xFF, +++ 0xFF, +++ 0, +++ 0, +++ 0xFF, +++ 0xFF, +++ 0, +++ 0, +++ 0xFF, +++ 0xFF, +++ 0, +++ 0, +++ 0xFF, +++ 0xFF, +++ 0, +++ 0, +++ 0xFF, +++ 0xFF, +++ 0, +++ 0, +++ 0xFF, +++ 0xFF, +++ 0, +++ 0, +++ 0xFF, +++ 0xFF, +++ 0, +++ 0, +++ 0xFF, +++ 0xFF, +++ 0, +++ 0); +++ const __m256i mask_real = _mm256_set_epi8(0, +++ 0, +++ 0xFF, +++ 0xFF, +++ 0, +++ 0, +++ 0xFF, +++ 0xFF, +++ 0, +++ 0, +++ 0xFF, +++ 0xFF, +++ 0, +++ 0, +++ 0xFF, +++ 0xFF, +++ 0, +++ 0, +++ 0xFF, +++ 0xFF, +++ 0, +++ 0, +++ 0xFF, +++ 0xFF, +++ 0, +++ 0, +++ 0xFF, +++ 0xFF, +++ 0, +++ 0, +++ 0xFF, +++ 0xFF); +++ +++ for (; number < avx2_points; number++) { +++ a = _mm256_loadu_si256( +++ (__m256i*)_in_a); // Load the ar + ai, br + bi as ar,ai,br,bi +++ b = _mm256_loadu_si256( +++ (__m256i*)_in_b); // Load the cr + ci, dr + di as cr,ci,dr,di +++ c = _mm256_mullo_epi16(a, b); +++ +++ c_sr = _mm256_srli_si256(c, 2); // Shift a right by imm8 bytes while shifting in +++ // zeros, and store the results in dst. +++ real = _mm256_subs_epi16(c, c_sr); +++ real = _mm256_and_si256( +++ real, mask_real); // a3.r*b3.r-a3.i*b3.i , 0, a3.r*b3.r- a3.i*b3.i +++ +++ b_sl = _mm256_slli_si256(b, 2); // b3.r, b2.i .... +++ a_sl = _mm256_slli_si256(a, 2); // a3.r, a2.i .... +++ +++ imag1 = _mm256_mullo_epi16(a, b_sl); // a3.i*b3.r, .... +++ imag2 = _mm256_mullo_epi16(b, a_sl); // b3.i*a3.r, .... +++ +++ imag = _mm256_adds_epi16(imag1, imag2); +++ imag = _mm256_and_si256(imag, mask_imag); // a3.i*b3.r+b3.i*a3.r, 0, ... +++ +++ result = _mm256_or_si256(real, imag); +++ +++ _mm256_storeu_si256((__m256i*)_out, result); +++ +++ _in_a += 8; +++ _in_b += 8; +++ _out += 8; +++ } ++ _mm256_zeroupper(); ++ number = avx2_points * 8; ++- for(;number < num_points; number++) ++- { ++- *_out++ = (*_in_a++) * (*_in_b++); ++- } +++ for (; number < num_points; number++) { +++ *_out++ = (*_in_a++) * (*_in_b++); +++ } ++ } ++ #endif /* LV_HAVE_AVX2 */ ++ ++@@ -224,7 +308,10 @@ static inline void volk_16ic_x2_multiply_16ic_u_avx2(lv_16sc_t* out, const lv_16 ++ #ifdef LV_HAVE_AVX2 ++ #include ++ ++-static inline void volk_16ic_x2_multiply_16ic_a_avx2(lv_16sc_t* out, const lv_16sc_t* in_a, const lv_16sc_t* in_b, unsigned int num_points) +++static inline void volk_16ic_x2_multiply_16ic_a_avx2(lv_16sc_t* out, +++ const lv_16sc_t* in_a, +++ const lv_16sc_t* in_b, +++ unsigned int num_points) ++ { ++ unsigned int number = 0; ++ const unsigned int avx2_points = num_points / 8; ++@@ -233,44 +320,108 @@ static inline void volk_16ic_x2_multiply_16ic_a_avx2(lv_16sc_t* out, const lv_16 ++ const lv_16sc_t* _in_b = in_b; ++ lv_16sc_t* _out = out; ++ ++- __m256i a, b, c, c_sr, real, imag, imag1, imag2, b_sl, a_sl, result; ++- ++- const __m256i mask_imag = _mm256_set_epi8(0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0); ++- const __m256i mask_real = _mm256_set_epi8(0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF); ++- ++- for(;number < avx2_points; number++) ++- { ++- a = _mm256_load_si256((__m256i*)_in_a); // Load the ar + ai, br + bi as ar,ai,br,bi ++- b = _mm256_load_si256((__m256i*)_in_b); // Load the cr + ci, dr 
+ di as cr,ci,dr,di ++- c = _mm256_mullo_epi16(a, b); ++- ++- c_sr = _mm256_srli_si256(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst. ++- real = _mm256_subs_epi16(c, c_sr); ++- real = _mm256_and_si256(real, mask_real); // a3.r*b3.r-a3.i*b3.i , 0, a3.r*b3.r- a3.i*b3.i ++- ++- b_sl = _mm256_slli_si256(b, 2); // b3.r, b2.i .... ++- a_sl = _mm256_slli_si256(a, 2); // a3.r, a2.i .... ++- ++- imag1 = _mm256_mullo_epi16(a, b_sl); // a3.i*b3.r, .... ++- imag2 = _mm256_mullo_epi16(b, a_sl); // b3.i*a3.r, .... ++- ++- imag = _mm256_adds_epi16(imag1, imag2); ++- imag = _mm256_and_si256(imag, mask_imag); // a3.i*b3.r+b3.i*a3.r, 0, ... ++- ++- result = _mm256_or_si256(real, imag); ++- ++- _mm256_store_si256((__m256i*)_out, result); ++- ++- _in_a += 8; ++- _in_b += 8; ++- _out += 8; ++- } +++ __m256i a, b, c, c_sr, real, imag, imag1, imag2, b_sl, a_sl, result; +++ +++ const __m256i mask_imag = _mm256_set_epi8(0xFF, +++ 0xFF, +++ 0, +++ 0, +++ 0xFF, +++ 0xFF, +++ 0, +++ 0, +++ 0xFF, +++ 0xFF, +++ 0, +++ 0, +++ 0xFF, +++ 0xFF, +++ 0, +++ 0, +++ 0xFF, +++ 0xFF, +++ 0, +++ 0, +++ 0xFF, +++ 0xFF, +++ 0, +++ 0, +++ 0xFF, +++ 0xFF, +++ 0, +++ 0, +++ 0xFF, +++ 0xFF, +++ 0, +++ 0); +++ const __m256i mask_real = _mm256_set_epi8(0, +++ 0, +++ 0xFF, +++ 0xFF, +++ 0, +++ 0, +++ 0xFF, +++ 0xFF, +++ 0, +++ 0, +++ 0xFF, +++ 0xFF, +++ 0, +++ 0, +++ 0xFF, +++ 0xFF, +++ 0, +++ 0, +++ 0xFF, +++ 0xFF, +++ 0, +++ 0, +++ 0xFF, +++ 0xFF, +++ 0, +++ 0, +++ 0xFF, +++ 0xFF, +++ 0, +++ 0, +++ 0xFF, +++ 0xFF); +++ +++ for (; number < avx2_points; number++) { +++ a = _mm256_load_si256( +++ (__m256i*)_in_a); // Load the ar + ai, br + bi as ar,ai,br,bi +++ b = _mm256_load_si256( +++ (__m256i*)_in_b); // Load the cr + ci, dr + di as cr,ci,dr,di +++ c = _mm256_mullo_epi16(a, b); +++ +++ c_sr = _mm256_srli_si256(c, 2); // Shift a right by imm8 bytes while shifting in +++ // zeros, and store the results in dst. +++ real = _mm256_subs_epi16(c, c_sr); +++ real = _mm256_and_si256( +++ real, mask_real); // a3.r*b3.r-a3.i*b3.i , 0, a3.r*b3.r- a3.i*b3.i +++ +++ b_sl = _mm256_slli_si256(b, 2); // b3.r, b2.i .... +++ a_sl = _mm256_slli_si256(a, 2); // a3.r, a2.i .... +++ +++ imag1 = _mm256_mullo_epi16(a, b_sl); // a3.i*b3.r, .... +++ imag2 = _mm256_mullo_epi16(b, a_sl); // b3.i*a3.r, .... +++ +++ imag = _mm256_adds_epi16(imag1, imag2); +++ imag = _mm256_and_si256(imag, mask_imag); // a3.i*b3.r+b3.i*a3.r, 0, ... 
+++ +++ result = _mm256_or_si256(real, imag); +++ +++ _mm256_store_si256((__m256i*)_out, result); +++ +++ _in_a += 8; +++ _in_b += 8; +++ _out += 8; +++ } ++ _mm256_zeroupper(); ++ number = avx2_points * 8; ++- for(;number < num_points; number++) ++- { ++- *_out++ = (*_in_a++) * (*_in_b++); ++- } +++ for (; number < num_points; number++) { +++ *_out++ = (*_in_a++) * (*_in_b++); +++ } ++ } ++ #endif /* LV_HAVE_AVX2 */ ++ ++@@ -278,48 +429,49 @@ static inline void volk_16ic_x2_multiply_16ic_a_avx2(lv_16sc_t* out, const lv_16 ++ #ifdef LV_HAVE_NEON ++ #include ++ ++-static inline void volk_16ic_x2_multiply_16ic_neon(lv_16sc_t* out, const lv_16sc_t* in_a, const lv_16sc_t* in_b, unsigned int num_points) +++static inline void volk_16ic_x2_multiply_16ic_neon(lv_16sc_t* out, +++ const lv_16sc_t* in_a, +++ const lv_16sc_t* in_b, +++ unsigned int num_points) ++ { ++- lv_16sc_t *a_ptr = (lv_16sc_t*) in_a; ++- lv_16sc_t *b_ptr = (lv_16sc_t*) in_b; +++ lv_16sc_t* a_ptr = (lv_16sc_t*)in_a; +++ lv_16sc_t* b_ptr = (lv_16sc_t*)in_b; ++ unsigned int quarter_points = num_points / 4; ++ int16x4x2_t a_val, b_val, c_val; ++ int16x4x2_t tmp_real, tmp_imag; ++ unsigned int number = 0; ++ ++- for(number = 0; number < quarter_points; ++number) ++- { ++- a_val = vld2_s16((int16_t*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i ++- b_val = vld2_s16((int16_t*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i ++- __VOLK_PREFETCH(a_ptr + 4); ++- __VOLK_PREFETCH(b_ptr + 4); ++- ++- // multiply the real*real and imag*imag to get real result ++- // a0r*b0r|a1r*b1r|a2r*b2r|a3r*b3r ++- tmp_real.val[0] = vmul_s16(a_val.val[0], b_val.val[0]); ++- // a0i*b0i|a1i*b1i|a2i*b2i|a3i*b3i ++- tmp_real.val[1] = vmul_s16(a_val.val[1], b_val.val[1]); ++- ++- // Multiply cross terms to get the imaginary result ++- // a0r*b0i|a1r*b1i|a2r*b2i|a3r*b3i ++- tmp_imag.val[0] = vmul_s16(a_val.val[0], b_val.val[1]); ++- // a0i*b0r|a1i*b1r|a2i*b2r|a3i*b3r ++- tmp_imag.val[1] = vmul_s16(a_val.val[1], b_val.val[0]); ++- ++- // store the results ++- c_val.val[0] = vsub_s16(tmp_real.val[0], tmp_real.val[1]); ++- c_val.val[1] = vadd_s16(tmp_imag.val[0], tmp_imag.val[1]); ++- vst2_s16((int16_t*)out, c_val); ++- ++- a_ptr += 4; ++- b_ptr += 4; ++- out += 4; ++- } ++- ++- for(number = quarter_points * 4; number < num_points; number++) ++- { ++- *out++ = (*a_ptr++) * (*b_ptr++); ++- } +++ for (number = 0; number < quarter_points; ++number) { +++ a_val = vld2_s16((int16_t*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i +++ b_val = vld2_s16((int16_t*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i +++ __VOLK_PREFETCH(a_ptr + 4); +++ __VOLK_PREFETCH(b_ptr + 4); +++ +++ // multiply the real*real and imag*imag to get real result +++ // a0r*b0r|a1r*b1r|a2r*b2r|a3r*b3r +++ tmp_real.val[0] = vmul_s16(a_val.val[0], b_val.val[0]); +++ // a0i*b0i|a1i*b1i|a2i*b2i|a3i*b3i +++ tmp_real.val[1] = vmul_s16(a_val.val[1], b_val.val[1]); +++ +++ // Multiply cross terms to get the imaginary result +++ // a0r*b0i|a1r*b1i|a2r*b2i|a3r*b3i +++ tmp_imag.val[0] = vmul_s16(a_val.val[0], b_val.val[1]); +++ // a0i*b0r|a1i*b1r|a2i*b2r|a3i*b3r +++ tmp_imag.val[1] = vmul_s16(a_val.val[1], b_val.val[0]); +++ +++ // store the results +++ c_val.val[0] = vsub_s16(tmp_real.val[0], tmp_real.val[1]); +++ c_val.val[1] = vadd_s16(tmp_imag.val[0], tmp_imag.val[1]); +++ vst2_s16((int16_t*)out, c_val); +++ +++ a_ptr += 4; +++ b_ptr += 4; +++ out += 4; +++ } +++ +++ for (number = quarter_points * 4; number < num_points; number++) { +++ *out++ = (*a_ptr++) * (*b_ptr++); +++ } ++ } ++ #endif /* LV_HAVE_NEON */ 
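Every SIMD path in the two kernels above (the dot product and the point-wise multiply) relies on the same decomposition of a 16-bit complex product into its real and imaginary lanes; the byte shifts and the mask_real / mask_imag constants only serve to line those terms up inside one packed register. Spelled out per point, the arithmetic is the following illustrative scalar sketch (not part of the library; the generic kernels simply evaluate in_a[n] * in_b[n] on lv_16sc_t):

    #include <stdint.h>

    /* (ar + i*ai) * (br + i*bi) for one 16-bit complex point.          */
    /* real lane:  ar*br - ai*bi  -> what survives the mask_real step   */
    /* imag lane:  ar*bi + ai*br  -> what survives the mask_imag step   */
    static inline void complex16_multiply_scalar(int16_t* out_re, int16_t* out_im,
                                                 int16_t ar, int16_t ai,
                                                 int16_t br, int16_t bi)
    {
        *out_re = (int16_t)(ar * br - ai * bi);
        *out_im = (int16_t)(ar * bi + ai * br);
    }

Only the dot-product kernels saturate while accumulating these terms across points (sat_adds16i, _mm256_adds_epi16, vqadd_s16); the multiply kernels store each product directly, and their header comment explicitly warns that saturation is not checked.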
++ ++diff --git a/kernels/volk/volk_16u_byteswap.h b/kernels/volk/volk_16u_byteswap.h ++index eaa972f..221dcdb 100644 ++--- a/kernels/volk/volk_16u_byteswap.h +++++ b/kernels/volk/volk_16u_byteswap.h ++@@ -58,74 +58,80 @@ ++ ++ #if LV_HAVE_AVX2 ++ #include ++-static inline void volk_16u_byteswap_a_avx2(uint16_t* intsToSwap, unsigned int num_points){ ++- unsigned int number; +++static inline void volk_16u_byteswap_a_avx2(uint16_t* intsToSwap, unsigned int num_points) +++{ +++ unsigned int number; ++ ++- const unsigned int nPerSet = 16; ++- const uint64_t nSets = num_points / nPerSet; +++ const unsigned int nPerSet = 16; +++ const uint64_t nSets = num_points / nPerSet; ++ ++- uint16_t* inputPtr = (uint16_t*) intsToSwap; +++ uint16_t* inputPtr = (uint16_t*)intsToSwap; ++ ++- const uint8_t shuffleVector[32] = { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 17, 16, 19, 18, 21, 20, 23, 22, 25, 24, 27, 26, 29, 28, 31, 30}; +++ const uint8_t shuffleVector[32] = { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, +++ 10, 13, 12, 15, 14, 17, 16, 19, 18, 21, 20, +++ 23, 22, 25, 24, 27, 26, 29, 28, 31, 30 }; ++ ++- const __m256i myShuffle = _mm256_loadu_si256((__m256i*) &shuffleVector[0]); +++ const __m256i myShuffle = _mm256_loadu_si256((__m256i*)&shuffleVector[0]); ++ ++- for(number = 0; number < nSets; number++) { ++- // Load the 32t values, increment inputPtr later since we're doing it in-place. ++- const __m256i input = _mm256_load_si256((__m256i*)inputPtr); ++- const __m256i output = _mm256_shuffle_epi8(input, myShuffle); +++ for (number = 0; number < nSets; number++) { +++ // Load the 32t values, increment inputPtr later since we're doing it in-place. +++ const __m256i input = _mm256_load_si256((__m256i*)inputPtr); +++ const __m256i output = _mm256_shuffle_epi8(input, myShuffle); ++ ++- // Store the results ++- _mm256_store_si256((__m256i*)inputPtr, output); ++- inputPtr += nPerSet; ++- } +++ // Store the results +++ _mm256_store_si256((__m256i*)inputPtr, output); +++ inputPtr += nPerSet; +++ } ++ ++- _mm256_zeroupper(); +++ _mm256_zeroupper(); ++ ++- // Byteswap any remaining points: ++- for(number = nPerSet * nSets; number < num_points; number++) { ++- uint16_t outputVal = *inputPtr; ++- outputVal = (((outputVal >> 8) & 0xff) | ((outputVal << 8) & 0xff00)); ++- *inputPtr = outputVal; ++- inputPtr++; ++- } +++ // Byteswap any remaining points: +++ for (number = nPerSet * nSets; number < num_points; number++) { +++ uint16_t outputVal = *inputPtr; +++ outputVal = (((outputVal >> 8) & 0xff) | ((outputVal << 8) & 0xff00)); +++ *inputPtr = outputVal; +++ inputPtr++; +++ } ++ } ++ #endif /* LV_HAVE_AVX2 */ ++ ++ ++ #if LV_HAVE_AVX2 ++ #include ++-static inline void volk_16u_byteswap_u_avx2(uint16_t* intsToSwap, unsigned int num_points){ ++- unsigned int number; +++static inline void volk_16u_byteswap_u_avx2(uint16_t* intsToSwap, unsigned int num_points) +++{ +++ unsigned int number; ++ ++- const unsigned int nPerSet = 16; ++- const uint64_t nSets = num_points / nPerSet; +++ const unsigned int nPerSet = 16; +++ const uint64_t nSets = num_points / nPerSet; ++ ++- uint16_t* inputPtr = (uint16_t*) intsToSwap; +++ uint16_t* inputPtr = (uint16_t*)intsToSwap; ++ ++- const uint8_t shuffleVector[32] = { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 17, 16, 19, 18, 21, 20, 23, 22, 25, 24, 27, 26, 29, 28, 31, 30}; +++ const uint8_t shuffleVector[32] = { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, +++ 10, 13, 12, 15, 14, 17, 16, 19, 18, 21, 20, +++ 23, 22, 25, 24, 27, 26, 29, 28, 31, 30 }; ++ ++- const __m256i myShuffle 
= _mm256_loadu_si256((__m256i*) &shuffleVector[0]); +++ const __m256i myShuffle = _mm256_loadu_si256((__m256i*)&shuffleVector[0]); ++ ++- for (number = 0; number < nSets; number++) { ++- // Load the 32t values, increment inputPtr later since we're doing it in-place. ++- const __m256i input = _mm256_loadu_si256((__m256i*)inputPtr); ++- const __m256i output = _mm256_shuffle_epi8(input,myShuffle); +++ for (number = 0; number < nSets; number++) { +++ // Load the 32t values, increment inputPtr later since we're doing it in-place. +++ const __m256i input = _mm256_loadu_si256((__m256i*)inputPtr); +++ const __m256i output = _mm256_shuffle_epi8(input, myShuffle); ++ ++- // Store the results ++- _mm256_storeu_si256((__m256i*)inputPtr, output); ++- inputPtr += nPerSet; ++- } +++ // Store the results +++ _mm256_storeu_si256((__m256i*)inputPtr, output); +++ inputPtr += nPerSet; +++ } ++ ++- _mm256_zeroupper(); +++ _mm256_zeroupper(); ++ ++- // Byteswap any remaining points: ++- for(number = nPerSet * nSets; number < num_points; number++) { ++- uint16_t outputVal = *inputPtr; ++- outputVal = (((outputVal >> 8) & 0xff) | ((outputVal << 8) & 0xff00)); ++- *inputPtr = outputVal; ++- inputPtr++; ++- } +++ // Byteswap any remaining points: +++ for (number = nPerSet * nSets; number < num_points; number++) { +++ uint16_t outputVal = *inputPtr; +++ outputVal = (((outputVal >> 8) & 0xff) | ((outputVal << 8) & 0xff00)); +++ *inputPtr = outputVal; +++ inputPtr++; +++ } ++ } ++ #endif /* LV_HAVE_AVX2 */ ++ ++@@ -133,47 +139,50 @@ static inline void volk_16u_byteswap_u_avx2(uint16_t* intsToSwap, unsigned int n ++ #ifdef LV_HAVE_SSE2 ++ #include ++ ++-static inline void volk_16u_byteswap_u_sse2(uint16_t* intsToSwap, unsigned int num_points){ ++- unsigned int number = 0; ++- uint16_t* inputPtr = intsToSwap; ++- __m128i input, left, right, output; ++- ++- const unsigned int eighthPoints = num_points / 8; ++- for(;number < eighthPoints; number++){ ++- // Load the 16t values, increment inputPtr later since we're doing it in-place. ++- input = _mm_loadu_si128((__m128i*)inputPtr); ++- // Do the two shifts ++- left = _mm_slli_epi16(input, 8); ++- right = _mm_srli_epi16(input, 8); ++- // Or the left and right halves together ++- output = _mm_or_si128(left, right); ++- // Store the results ++- _mm_storeu_si128((__m128i*)inputPtr, output); ++- inputPtr += 8; ++- } ++- ++- // Byteswap any remaining points: ++- number = eighthPoints*8; ++- for(; number < num_points; number++){ ++- uint16_t outputVal = *inputPtr; ++- outputVal = (((outputVal >> 8) & 0xff) | ((outputVal << 8) & 0xff00)); ++- *inputPtr = outputVal; ++- inputPtr++; ++- } +++static inline void volk_16u_byteswap_u_sse2(uint16_t* intsToSwap, unsigned int num_points) +++{ +++ unsigned int number = 0; +++ uint16_t* inputPtr = intsToSwap; +++ __m128i input, left, right, output; +++ +++ const unsigned int eighthPoints = num_points / 8; +++ for (; number < eighthPoints; number++) { +++ // Load the 16t values, increment inputPtr later since we're doing it in-place. 
+++ input = _mm_loadu_si128((__m128i*)inputPtr); +++ // Do the two shifts +++ left = _mm_slli_epi16(input, 8); +++ right = _mm_srli_epi16(input, 8); +++ // Or the left and right halves together +++ output = _mm_or_si128(left, right); +++ // Store the results +++ _mm_storeu_si128((__m128i*)inputPtr, output); +++ inputPtr += 8; +++ } +++ +++ // Byteswap any remaining points: +++ number = eighthPoints * 8; +++ for (; number < num_points; number++) { +++ uint16_t outputVal = *inputPtr; +++ outputVal = (((outputVal >> 8) & 0xff) | ((outputVal << 8) & 0xff00)); +++ *inputPtr = outputVal; +++ inputPtr++; +++ } ++ } ++ #endif /* LV_HAVE_SSE2 */ ++ ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void volk_16u_byteswap_generic(uint16_t* intsToSwap, unsigned int num_points){ ++- unsigned int point; ++- uint16_t* inputPtr = intsToSwap; ++- for(point = 0; point < num_points; point++){ ++- uint16_t output = *inputPtr; ++- output = (((output >> 8) & 0xff) | ((output << 8) & 0xff00)); ++- *inputPtr = output; ++- inputPtr++; ++- } +++static inline void volk_16u_byteswap_generic(uint16_t* intsToSwap, +++ unsigned int num_points) +++{ +++ unsigned int point; +++ uint16_t* inputPtr = intsToSwap; +++ for (point = 0; point < num_points; point++) { +++ uint16_t output = *inputPtr; +++ output = (((output >> 8) & 0xff) | ((output << 8) & 0xff00)); +++ *inputPtr = output; +++ inputPtr++; +++ } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++@@ -187,129 +196,136 @@ static inline void volk_16u_byteswap_generic(uint16_t* intsToSwap, unsigned int ++ #ifdef LV_HAVE_SSE2 ++ #include ++ ++-static inline void volk_16u_byteswap_a_sse2(uint16_t* intsToSwap, unsigned int num_points){ ++- unsigned int number = 0; ++- uint16_t* inputPtr = intsToSwap; ++- __m128i input, left, right, output; ++- ++- const unsigned int eighthPoints = num_points / 8; ++- for(;number < eighthPoints; number++){ ++- // Load the 16t values, increment inputPtr later since we're doing it in-place. ++- input = _mm_load_si128((__m128i*)inputPtr); ++- // Do the two shifts ++- left = _mm_slli_epi16(input, 8); ++- right = _mm_srli_epi16(input, 8); ++- // Or the left and right halves together ++- output = _mm_or_si128(left, right); ++- // Store the results ++- _mm_store_si128((__m128i*)inputPtr, output); ++- inputPtr += 8; ++- } ++- ++- ++- // Byteswap any remaining points: ++- number = eighthPoints*8; ++- for(; number < num_points; number++){ ++- uint16_t outputVal = *inputPtr; ++- outputVal = (((outputVal >> 8) & 0xff) | ((outputVal << 8) & 0xff00)); ++- *inputPtr = outputVal; ++- inputPtr++; ++- } +++static inline void volk_16u_byteswap_a_sse2(uint16_t* intsToSwap, unsigned int num_points) +++{ +++ unsigned int number = 0; +++ uint16_t* inputPtr = intsToSwap; +++ __m128i input, left, right, output; +++ +++ const unsigned int eighthPoints = num_points / 8; +++ for (; number < eighthPoints; number++) { +++ // Load the 16t values, increment inputPtr later since we're doing it in-place. 
+++ input = _mm_load_si128((__m128i*)inputPtr); +++ // Do the two shifts +++ left = _mm_slli_epi16(input, 8); +++ right = _mm_srli_epi16(input, 8); +++ // Or the left and right halves together +++ output = _mm_or_si128(left, right); +++ // Store the results +++ _mm_store_si128((__m128i*)inputPtr, output); +++ inputPtr += 8; +++ } +++ +++ +++ // Byteswap any remaining points: +++ number = eighthPoints * 8; +++ for (; number < num_points; number++) { +++ uint16_t outputVal = *inputPtr; +++ outputVal = (((outputVal >> 8) & 0xff) | ((outputVal << 8) & 0xff00)); +++ *inputPtr = outputVal; +++ inputPtr++; +++ } ++ } ++ #endif /* LV_HAVE_SSE2 */ ++ ++ #ifdef LV_HAVE_NEON ++ #include ++ ++-static inline void volk_16u_byteswap_neon(uint16_t* intsToSwap, unsigned int num_points){ ++- unsigned int number; ++- unsigned int eighth_points = num_points / 8; ++- uint16x8_t input, output; ++- uint16_t* inputPtr = intsToSwap; ++- ++- for(number = 0; number < eighth_points; number++) { ++- input = vld1q_u16(inputPtr); ++- output = vsriq_n_u16(output, input, 8); ++- output = vsliq_n_u16(output, input, 8); ++- vst1q_u16(inputPtr, output); ++- inputPtr += 8; ++- } ++- ++- for(number = eighth_points * 8; number < num_points; number++){ ++- uint16_t output = *inputPtr; ++- output = (((output >> 8) & 0xff) | ((output << 8) & 0xff00)); ++- *inputPtr = output; ++- inputPtr++; ++- } +++static inline void volk_16u_byteswap_neon(uint16_t* intsToSwap, unsigned int num_points) +++{ +++ unsigned int number; +++ unsigned int eighth_points = num_points / 8; +++ uint16x8_t input, output; +++ uint16_t* inputPtr = intsToSwap; +++ +++ for (number = 0; number < eighth_points; number++) { +++ input = vld1q_u16(inputPtr); +++ output = vsriq_n_u16(output, input, 8); +++ output = vsliq_n_u16(output, input, 8); +++ vst1q_u16(inputPtr, output); +++ inputPtr += 8; +++ } +++ +++ for (number = eighth_points * 8; number < num_points; number++) { +++ uint16_t output = *inputPtr; +++ output = (((output >> 8) & 0xff) | ((output << 8) & 0xff00)); +++ *inputPtr = output; +++ inputPtr++; +++ } ++ } ++ #endif /* LV_HAVE_NEON */ ++ ++ #ifdef LV_HAVE_NEON ++ #include ++ ++-static inline void volk_16u_byteswap_neon_table(uint16_t* intsToSwap, unsigned int num_points){ ++- uint16_t* inputPtr = intsToSwap; ++- unsigned int number = 0; ++- unsigned int n16points = num_points / 16; ++- ++- uint8x8x4_t input_table; ++- uint8x8_t int_lookup01, int_lookup23, int_lookup45, int_lookup67; ++- uint8x8_t swapped_int01, swapped_int23, swapped_int45, swapped_int67; ++- ++- /* these magic numbers are used as byte-indices in the LUT. ++- they are pre-computed to save time. A simple C program ++- can calculate them; for example for lookup01: ++- uint8_t chars[8] = {24, 16, 8, 0, 25, 17, 9, 1}; ++- for(ii=0; ii < 8; ++ii) { ++- index += ((uint64_t)(*(chars+ii))) << (ii*8); +++static inline void volk_16u_byteswap_neon_table(uint16_t* intsToSwap, +++ unsigned int num_points) +++{ +++ uint16_t* inputPtr = intsToSwap; +++ unsigned int number = 0; +++ unsigned int n16points = num_points / 16; +++ +++ uint8x8x4_t input_table; +++ uint8x8_t int_lookup01, int_lookup23, int_lookup45, int_lookup67; +++ uint8x8_t swapped_int01, swapped_int23, swapped_int45, swapped_int67; +++ +++ /* these magic numbers are used as byte-indices in the LUT. +++ they are pre-computed to save time. 
A simple C program +++ can calculate them; for example for lookup01: +++ uint8_t chars[8] = {24, 16, 8, 0, 25, 17, 9, 1}; +++ for(ii=0; ii < 8; ++ii) { +++ index += ((uint64_t)(*(chars+ii))) << (ii*8); +++ } +++ */ +++ int_lookup01 = vcreate_u8(1232017111498883080); +++ int_lookup23 = vcreate_u8(1376697457175036426); +++ int_lookup45 = vcreate_u8(1521377802851189772); +++ int_lookup67 = vcreate_u8(1666058148527343118); +++ +++ for (number = 0; number < n16points; ++number) { +++ input_table = vld4_u8((uint8_t*)inputPtr); +++ swapped_int01 = vtbl4_u8(input_table, int_lookup01); +++ swapped_int23 = vtbl4_u8(input_table, int_lookup23); +++ swapped_int45 = vtbl4_u8(input_table, int_lookup45); +++ swapped_int67 = vtbl4_u8(input_table, int_lookup67); +++ vst1_u8((uint8_t*)inputPtr, swapped_int01); +++ vst1_u8((uint8_t*)(inputPtr + 4), swapped_int23); +++ vst1_u8((uint8_t*)(inputPtr + 8), swapped_int45); +++ vst1_u8((uint8_t*)(inputPtr + 12), swapped_int67); +++ +++ inputPtr += 16; +++ } +++ +++ for (number = n16points * 16; number < num_points; ++number) { +++ uint16_t output = *inputPtr; +++ output = (((output >> 8) & 0xff) | ((output << 8) & 0xff00)); +++ *inputPtr = output; +++ inputPtr++; ++ } ++- */ ++- int_lookup01 = vcreate_u8(1232017111498883080); ++- int_lookup23 = vcreate_u8(1376697457175036426); ++- int_lookup45 = vcreate_u8(1521377802851189772); ++- int_lookup67 = vcreate_u8(1666058148527343118); ++- ++- for(number = 0; number < n16points; ++number){ ++- input_table = vld4_u8((uint8_t*) inputPtr); ++- swapped_int01 = vtbl4_u8(input_table, int_lookup01); ++- swapped_int23 = vtbl4_u8(input_table, int_lookup23); ++- swapped_int45 = vtbl4_u8(input_table, int_lookup45); ++- swapped_int67 = vtbl4_u8(input_table, int_lookup67); ++- vst1_u8((uint8_t*)inputPtr, swapped_int01); ++- vst1_u8((uint8_t*)(inputPtr+4), swapped_int23); ++- vst1_u8((uint8_t*)(inputPtr+8), swapped_int45); ++- vst1_u8((uint8_t*)(inputPtr+12), swapped_int67); ++- ++- inputPtr += 16; ++- } ++- ++- for(number = n16points * 16; number < num_points; ++number){ ++- uint16_t output = *inputPtr; ++- output = (((output >> 8) & 0xff) | ((output << 8) & 0xff00)); ++- *inputPtr = output; ++- inputPtr++; ++- } ++ } ++ #endif /* LV_HAVE_NEON */ ++ ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void volk_16u_byteswap_a_generic(uint16_t* intsToSwap, unsigned int num_points){ ++- unsigned int point; ++- uint16_t* inputPtr = intsToSwap; ++- for(point = 0; point < num_points; point++){ ++- uint16_t output = *inputPtr; ++- output = (((output >> 8) & 0xff) | ((output << 8) & 0xff00)); ++- *inputPtr = output; ++- inputPtr++; ++- } +++static inline void volk_16u_byteswap_a_generic(uint16_t* intsToSwap, +++ unsigned int num_points) +++{ +++ unsigned int point; +++ uint16_t* inputPtr = intsToSwap; +++ for (point = 0; point < num_points; point++) { +++ uint16_t output = *inputPtr; +++ output = (((output >> 8) & 0xff) | ((output << 8) & 0xff00)); +++ *inputPtr = output; +++ inputPtr++; +++ } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++ #ifdef LV_HAVE_ORC ++ ++ extern void volk_16u_byteswap_a_orc_impl(uint16_t* intsToSwap, unsigned int num_points); ++-static inline void volk_16u_byteswap_u_orc(uint16_t* intsToSwap, unsigned int num_points){ +++static inline void volk_16u_byteswap_u_orc(uint16_t* intsToSwap, unsigned int num_points) +++{ ++ volk_16u_byteswap_a_orc_impl(intsToSwap, num_points); ++ } ++ #endif /* LV_HAVE_ORC */ ++diff --git a/kernels/volk/volk_16u_byteswappuppet_16u.h b/kernels/volk/volk_16u_byteswappuppet_16u.h ++index d3c8c5d..8cb1318 
100644 ++--- a/kernels/volk/volk_16u_byteswappuppet_16u.h +++++ b/kernels/volk/volk_16u_byteswappuppet_16u.h ++@@ -3,69 +3,83 @@ ++ ++ ++ #include ++-#include ++ #include +++#include ++ ++ #ifdef LV_HAVE_GENERIC ++-static inline void volk_16u_byteswappuppet_16u_generic(uint16_t*output, uint16_t* intsToSwap, unsigned int num_points){ +++static inline void volk_16u_byteswappuppet_16u_generic(uint16_t* output, +++ uint16_t* intsToSwap, +++ unsigned int num_points) +++{ ++ ++ volk_16u_byteswap_generic((uint16_t*)intsToSwap, num_points); ++ memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint16_t)); ++- ++ } ++ #endif ++ ++ #ifdef LV_HAVE_NEON ++-static inline void volk_16u_byteswappuppet_16u_neon(uint16_t*output, uint16_t* intsToSwap, unsigned int num_points){ +++static inline void volk_16u_byteswappuppet_16u_neon(uint16_t* output, +++ uint16_t* intsToSwap, +++ unsigned int num_points) +++{ ++ ++ volk_16u_byteswap_neon((uint16_t*)intsToSwap, num_points); ++ memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint16_t)); ++- ++ } ++ #endif ++ ++ #ifdef LV_HAVE_NEON ++-static inline void volk_16u_byteswappuppet_16u_neon_table(uint16_t*output, uint16_t* intsToSwap, unsigned int num_points){ +++static inline void volk_16u_byteswappuppet_16u_neon_table(uint16_t* output, +++ uint16_t* intsToSwap, +++ unsigned int num_points) +++{ ++ ++ volk_16u_byteswap_neon_table((uint16_t*)intsToSwap, num_points); ++ memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint16_t)); ++- ++ } ++ #endif ++ ++ #ifdef LV_HAVE_SSE2 ++-static inline void volk_16u_byteswappuppet_16u_u_sse2(uint16_t *output, uint16_t* intsToSwap, unsigned int num_points){ +++static inline void volk_16u_byteswappuppet_16u_u_sse2(uint16_t* output, +++ uint16_t* intsToSwap, +++ unsigned int num_points) +++{ ++ ++ volk_16u_byteswap_u_sse2((uint16_t*)intsToSwap, num_points); ++ memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint16_t)); ++- ++ } ++ #endif ++ ++ #ifdef LV_HAVE_SSE2 ++-static inline void volk_16u_byteswappuppet_16u_a_sse2(uint16_t *output, uint16_t* intsToSwap, unsigned int num_points){ +++static inline void volk_16u_byteswappuppet_16u_a_sse2(uint16_t* output, +++ uint16_t* intsToSwap, +++ unsigned int num_points) +++{ ++ ++ volk_16u_byteswap_a_sse2((uint16_t*)intsToSwap, num_points); ++ memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint16_t)); ++- ++ } ++ #endif ++ ++ #ifdef LV_HAVE_AVX2 ++-static inline void volk_16u_byteswappuppet_16u_u_avx2(uint16_t *output, uint16_t* intsToSwap, unsigned int num_points){ +++static inline void volk_16u_byteswappuppet_16u_u_avx2(uint16_t* output, +++ uint16_t* intsToSwap, +++ unsigned int num_points) +++{ ++ ++ volk_16u_byteswap_u_avx2((uint16_t*)intsToSwap, num_points); ++ memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint16_t)); ++- ++ } ++ #endif ++ ++ #ifdef LV_HAVE_AVX2 ++-static inline void volk_16u_byteswappuppet_16u_a_avx2(uint16_t *output, uint16_t* intsToSwap, unsigned int num_points){ +++static inline void volk_16u_byteswappuppet_16u_a_avx2(uint16_t* output, +++ uint16_t* intsToSwap, +++ unsigned int num_points) +++{ ++ ++ volk_16u_byteswap_a_avx2((uint16_t*)intsToSwap, num_points); ++ memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint16_t)); ++- ++ } ++ #endif ++ ++diff --git a/kernels/volk/volk_32f_64f_add_64f.h b/kernels/volk/volk_32f_64f_add_64f.h ++index 770c27e..d00ada5 100644 ++--- a/kernels/volk/volk_32f_64f_add_64f.h +++++ b/kernels/volk/volk_32f_64f_add_64f.h ++@@ -77,18 +77,19 @@ ++ ++ 
#ifdef LV_HAVE_GENERIC ++ ++-static inline void volk_32f_64f_add_64f_generic(double *cVector, ++- const float *aVector, ++- const double *bVector, ++- unsigned int num_points) { ++- double *cPtr = cVector; ++- const float *aPtr = aVector; ++- const double *bPtr = bVector; ++- unsigned int number = 0; ++- ++- for (number = 0; number < num_points; number++) { ++- *cPtr++ = ((double)(*aPtr++)) + (*bPtr++); ++- } +++static inline void volk_32f_64f_add_64f_generic(double* cVector, +++ const float* aVector, +++ const double* bVector, +++ unsigned int num_points) +++{ +++ double* cPtr = cVector; +++ const float* aPtr = aVector; +++ const double* bPtr = bVector; +++ unsigned int number = 0; +++ +++ for (number = 0; number < num_points; number++) { +++ *cPtr++ = ((double)(*aPtr++)) + (*bPtr++); +++ } ++ } ++ ++ #endif /* LV_HAVE_GENERIC */ ++@@ -96,42 +97,43 @@ static inline void volk_32f_64f_add_64f_generic(double *cVector, ++ #ifdef LV_HAVE_NEONV8 ++ #include ++ ++-static inline void volk_32f_64f_add_64f_neon(double *cVector, ++- const float *aVector, ++- const double *bVector, ++- unsigned int num_points) { ++- unsigned int number = 0; ++- const unsigned int half_points = num_points / 2; ++- ++- double *cPtr = cVector; ++- const float *aPtr = aVector; ++- const double *bPtr = bVector; ++- ++- float64x2_t aVal, bVal, cVal; ++- float32x2_t aVal1; ++- for (number = 0; number < half_points; number++) { ++- // Load in to NEON registers ++- aVal1 = vld1_f32(aPtr); ++- bVal = vld1q_f64(bPtr); ++- __VOLK_PREFETCH(aPtr + 2); ++- __VOLK_PREFETCH(bPtr + 2); ++- aPtr += 2; // q uses quadwords, 4 floats per vadd ++- bPtr += 2; ++- ++- // Vector conversion ++- aVal = vcvt_f64_f32(aVal1); ++- // vector add ++- cVal = vaddq_f64(aVal, bVal); ++- // Store the results back into the C container ++- vst1q_f64(cPtr, cVal); ++- ++- cPtr += 2; ++- } ++- ++- number = half_points * 2; // should be = num_points ++- for (; number < num_points; number++) { ++- *cPtr++ = ((double)(*aPtr++)) + (*bPtr++); ++- } +++static inline void volk_32f_64f_add_64f_neon(double* cVector, +++ const float* aVector, +++ const double* bVector, +++ unsigned int num_points) +++{ +++ unsigned int number = 0; +++ const unsigned int half_points = num_points / 2; +++ +++ double* cPtr = cVector; +++ const float* aPtr = aVector; +++ const double* bPtr = bVector; +++ +++ float64x2_t aVal, bVal, cVal; +++ float32x2_t aVal1; +++ for (number = 0; number < half_points; number++) { +++ // Load in to NEON registers +++ aVal1 = vld1_f32(aPtr); +++ bVal = vld1q_f64(bPtr); +++ __VOLK_PREFETCH(aPtr + 2); +++ __VOLK_PREFETCH(bPtr + 2); +++ aPtr += 2; // q uses quadwords, 4 floats per vadd +++ bPtr += 2; +++ +++ // Vector conversion +++ aVal = vcvt_f64_f32(aVal1); +++ // vector add +++ cVal = vaddq_f64(aVal, bVal); +++ // Store the results back into the C container +++ vst1q_f64(cPtr, cVal); +++ +++ cPtr += 2; +++ } +++ +++ number = half_points * 2; // should be = num_points +++ for (; number < num_points; number++) { +++ *cPtr++ = ((double)(*aPtr++)) + (*bPtr++); +++ } ++ } ++ ++ #endif /* LV_HAVE_NEONV8 */ ++@@ -141,49 +143,50 @@ static inline void volk_32f_64f_add_64f_neon(double *cVector, ++ #include ++ #include ++ ++-static inline void volk_32f_64f_add_64f_u_avx(double *cVector, ++- const float *aVector, ++- const double *bVector, ++- unsigned int num_points) { ++- unsigned int number = 0; ++- const unsigned int eighth_points = num_points / 8; ++- ++- double *cPtr = cVector; ++- const float *aPtr = aVector; ++- const double *bPtr = bVector; ++- ++- __m256 
aVal; ++- __m128 aVal1, aVal2; ++- __m256d aDbl1, aDbl2, bVal1, bVal2, cVal1, cVal2; ++- for (; number < eighth_points; number++) { ++- ++- aVal = _mm256_loadu_ps(aPtr); ++- bVal1 = _mm256_loadu_pd(bPtr); ++- bVal2 = _mm256_loadu_pd(bPtr + 4); ++- ++- aVal1 = _mm256_extractf128_ps(aVal, 0); ++- aVal2 = _mm256_extractf128_ps(aVal, 1); ++- ++- aDbl1 = _mm256_cvtps_pd(aVal1); ++- aDbl2 = _mm256_cvtps_pd(aVal2); ++- ++- cVal1 = _mm256_add_pd(aDbl1, bVal1); ++- cVal2 = _mm256_add_pd(aDbl2, bVal2); ++- ++- _mm256_storeu_pd(cPtr, ++- cVal1); // Store the results back into the C container ++- _mm256_storeu_pd(cPtr + 4, ++- cVal2); // Store the results back into the C container ++- ++- aPtr += 8; ++- bPtr += 8; ++- cPtr += 8; ++- } ++- ++- number = eighth_points * 8; ++- for (; number < num_points; number++) { ++- *cPtr++ = ((double)(*aPtr++)) + (*bPtr++); ++- } +++static inline void volk_32f_64f_add_64f_u_avx(double* cVector, +++ const float* aVector, +++ const double* bVector, +++ unsigned int num_points) +++{ +++ unsigned int number = 0; +++ const unsigned int eighth_points = num_points / 8; +++ +++ double* cPtr = cVector; +++ const float* aPtr = aVector; +++ const double* bPtr = bVector; +++ +++ __m256 aVal; +++ __m128 aVal1, aVal2; +++ __m256d aDbl1, aDbl2, bVal1, bVal2, cVal1, cVal2; +++ for (; number < eighth_points; number++) { +++ +++ aVal = _mm256_loadu_ps(aPtr); +++ bVal1 = _mm256_loadu_pd(bPtr); +++ bVal2 = _mm256_loadu_pd(bPtr + 4); +++ +++ aVal1 = _mm256_extractf128_ps(aVal, 0); +++ aVal2 = _mm256_extractf128_ps(aVal, 1); +++ +++ aDbl1 = _mm256_cvtps_pd(aVal1); +++ aDbl2 = _mm256_cvtps_pd(aVal2); +++ +++ cVal1 = _mm256_add_pd(aDbl1, bVal1); +++ cVal2 = _mm256_add_pd(aDbl2, bVal2); +++ +++ _mm256_storeu_pd(cPtr, +++ cVal1); // Store the results back into the C container +++ _mm256_storeu_pd(cPtr + 4, +++ cVal2); // Store the results back into the C container +++ +++ aPtr += 8; +++ bPtr += 8; +++ cPtr += 8; +++ } +++ +++ number = eighth_points * 8; +++ for (; number < num_points; number++) { +++ *cPtr++ = ((double)(*aPtr++)) + (*bPtr++); +++ } ++ } ++ ++ #endif /* LV_HAVE_AVX */ ++@@ -193,48 +196,49 @@ static inline void volk_32f_64f_add_64f_u_avx(double *cVector, ++ #include ++ #include ++ ++-static inline void volk_32f_64f_add_64f_a_avx(double *cVector, ++- const float *aVector, ++- const double *bVector, ++- unsigned int num_points) { ++- unsigned int number = 0; ++- const unsigned int eighth_points = num_points / 8; ++- ++- double *cPtr = cVector; ++- const float *aPtr = aVector; ++- const double *bPtr = bVector; ++- ++- __m256 aVal; ++- __m128 aVal1, aVal2; ++- __m256d aDbl1, aDbl2, bVal1, bVal2, cVal1, cVal2; ++- for (; number < eighth_points; number++) { ++- ++- aVal = _mm256_load_ps(aPtr); ++- bVal1 = _mm256_load_pd(bPtr); ++- bVal2 = _mm256_load_pd(bPtr + 4); ++- ++- aVal1 = _mm256_extractf128_ps(aVal, 0); ++- aVal2 = _mm256_extractf128_ps(aVal, 1); ++- ++- aDbl1 = _mm256_cvtps_pd(aVal1); ++- aDbl2 = _mm256_cvtps_pd(aVal2); ++- ++- cVal1 = _mm256_add_pd(aDbl1, bVal1); ++- cVal2 = _mm256_add_pd(aDbl2, bVal2); ++- ++- _mm256_store_pd(cPtr, cVal1); // Store the results back into the C container ++- _mm256_store_pd(cPtr + 4, ++- cVal2); // Store the results back into the C container ++- ++- aPtr += 8; ++- bPtr += 8; ++- cPtr += 8; ++- } ++- ++- number = eighth_points * 8; ++- for (; number < num_points; number++) { ++- *cPtr++ = ((double)(*aPtr++)) + (*bPtr++); ++- } +++static inline void volk_32f_64f_add_64f_a_avx(double* cVector, +++ const float* aVector, +++ const double* 
bVector, +++ unsigned int num_points) +++{ +++ unsigned int number = 0; +++ const unsigned int eighth_points = num_points / 8; +++ +++ double* cPtr = cVector; +++ const float* aPtr = aVector; +++ const double* bPtr = bVector; +++ +++ __m256 aVal; +++ __m128 aVal1, aVal2; +++ __m256d aDbl1, aDbl2, bVal1, bVal2, cVal1, cVal2; +++ for (; number < eighth_points; number++) { +++ +++ aVal = _mm256_load_ps(aPtr); +++ bVal1 = _mm256_load_pd(bPtr); +++ bVal2 = _mm256_load_pd(bPtr + 4); +++ +++ aVal1 = _mm256_extractf128_ps(aVal, 0); +++ aVal2 = _mm256_extractf128_ps(aVal, 1); +++ +++ aDbl1 = _mm256_cvtps_pd(aVal1); +++ aDbl2 = _mm256_cvtps_pd(aVal2); +++ +++ cVal1 = _mm256_add_pd(aDbl1, bVal1); +++ cVal2 = _mm256_add_pd(aDbl2, bVal2); +++ +++ _mm256_store_pd(cPtr, cVal1); // Store the results back into the C container +++ _mm256_store_pd(cPtr + 4, +++ cVal2); // Store the results back into the C container +++ +++ aPtr += 8; +++ bPtr += 8; +++ cPtr += 8; +++ } +++ +++ number = eighth_points * 8; +++ for (; number < num_points; number++) { +++ *cPtr++ = ((double)(*aPtr++)) + (*bPtr++); +++ } ++ } ++ ++ #endif /* LV_HAVE_AVX */ ++diff --git a/kernels/volk/volk_32f_64f_multiply_64f.h b/kernels/volk/volk_32f_64f_multiply_64f.h ++index 50f08a1..1039850 100644 ++--- a/kernels/volk/volk_32f_64f_multiply_64f.h +++++ b/kernels/volk/volk_32f_64f_multiply_64f.h ++@@ -31,8 +31,8 @@ ++ * ++ * Dispatcher Prototype ++ * \code ++- * void volk_32f_64f_multiply_64f(double* cVector, const double* aVector, const double* bVector, unsigned int num_points) ++- * \endcode +++ * void volk_32f_64f_multiply_64f(double* cVector, const double* aVector, const double* +++ * bVector, unsigned int num_points) \endcode ++ * ++ * \b Inputs ++ * \li aVector: First input vector. ++@@ -76,18 +76,19 @@ ++ ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void ++-volk_32f_64f_multiply_64f_generic(double *cVector, const float *aVector, ++- const double *bVector, unsigned int num_points) +++static inline void volk_32f_64f_multiply_64f_generic(double* cVector, +++ const float* aVector, +++ const double* bVector, +++ unsigned int num_points) ++ { ++- double *cPtr = cVector; ++- const float *aPtr = aVector; ++- const double *bPtr = bVector; ++- unsigned int number = 0; ++- ++- for (number = 0; number < num_points; number++) { ++- *cPtr++ = ((double)(*aPtr++)) * (*bPtr++); ++- } +++ double* cPtr = cVector; +++ const float* aPtr = aVector; +++ const double* bPtr = bVector; +++ unsigned int number = 0; +++ +++ for (number = 0; number < num_points; number++) { +++ *cPtr++ = ((double)(*aPtr++)) * (*bPtr++); +++ } ++ } ++ ++ #endif /* LV_HAVE_GENERIC */ ++@@ -102,47 +103,48 @@ volk_32f_64f_multiply_64f_generic(double *cVector, const float *aVector, ++ #include ++ #include ++ ++-static inline void ++-volk_32f_64f_multiply_64f_u_avx(double *cVector, const float *aVector, ++- const double *bVector, unsigned int num_points) +++static inline void volk_32f_64f_multiply_64f_u_avx(double* cVector, +++ const float* aVector, +++ const double* bVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int eighth_points = num_points / 8; +++ unsigned int number = 0; +++ const unsigned int eighth_points = num_points / 8; ++ ++- double *cPtr = cVector; ++- const float *aPtr = aVector; ++- const double *bPtr = bVector; +++ double* cPtr = cVector; +++ const float* aPtr = aVector; +++ const double* bPtr = bVector; ++ ++- __m256 aVal; ++- __m128 aVal1, aVal2; ++- __m256d aDbl1, aDbl2, bVal1, bVal2, cVal1, cVal2; ++- for (; number < 
eighth_points; number++) { +++ __m256 aVal; +++ __m128 aVal1, aVal2; +++ __m256d aDbl1, aDbl2, bVal1, bVal2, cVal1, cVal2; +++ for (; number < eighth_points; number++) { ++ ++- aVal = _mm256_loadu_ps(aPtr); ++- bVal1 = _mm256_loadu_pd(bPtr); ++- bVal2 = _mm256_loadu_pd(bPtr+4); +++ aVal = _mm256_loadu_ps(aPtr); +++ bVal1 = _mm256_loadu_pd(bPtr); +++ bVal2 = _mm256_loadu_pd(bPtr + 4); ++ ++- aVal1 = _mm256_extractf128_ps(aVal, 0); ++- aVal2 = _mm256_extractf128_ps(aVal, 1); +++ aVal1 = _mm256_extractf128_ps(aVal, 0); +++ aVal2 = _mm256_extractf128_ps(aVal, 1); ++ ++- aDbl1 = _mm256_cvtps_pd(aVal1); ++- aDbl2 = _mm256_cvtps_pd(aVal2); +++ aDbl1 = _mm256_cvtps_pd(aVal1); +++ aDbl2 = _mm256_cvtps_pd(aVal2); ++ ++- cVal1 = _mm256_mul_pd(aDbl1, bVal1); ++- cVal2 = _mm256_mul_pd(aDbl2, bVal2); +++ cVal1 = _mm256_mul_pd(aDbl1, bVal1); +++ cVal2 = _mm256_mul_pd(aDbl2, bVal2); ++ ++- _mm256_storeu_pd(cPtr, cVal1); // Store the results back into the C container ++- _mm256_storeu_pd(cPtr+4, cVal2); // Store the results back into the C container +++ _mm256_storeu_pd(cPtr, cVal1); // Store the results back into the C container +++ _mm256_storeu_pd(cPtr + 4, cVal2); // Store the results back into the C container ++ ++- aPtr += 8; ++- bPtr += 8; ++- cPtr += 8; ++- } +++ aPtr += 8; +++ bPtr += 8; +++ cPtr += 8; +++ } ++ ++- number = eighth_points * 8; ++- for (; number < num_points; number++) { ++- *cPtr++ = ((double)(*aPtr++)) * (*bPtr++); ++- } +++ number = eighth_points * 8; +++ for (; number < num_points; number++) { +++ *cPtr++ = ((double)(*aPtr++)) * (*bPtr++); +++ } ++ } ++ ++ #endif /* LV_HAVE_AVX */ ++@@ -153,51 +155,51 @@ volk_32f_64f_multiply_64f_u_avx(double *cVector, const float *aVector, ++ #include ++ #include ++ ++-static inline void ++-volk_32f_64f_multiply_64f_a_avx(double *cVector, const float *aVector, ++- const double *bVector, unsigned int num_points) +++static inline void volk_32f_64f_multiply_64f_a_avx(double* cVector, +++ const float* aVector, +++ const double* bVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int eighth_points = num_points / 8; +++ unsigned int number = 0; +++ const unsigned int eighth_points = num_points / 8; ++ ++- double *cPtr = cVector; ++- const float *aPtr = aVector; ++- const double *bPtr = bVector; +++ double* cPtr = cVector; +++ const float* aPtr = aVector; +++ const double* bPtr = bVector; ++ ++- __m256 aVal; ++- __m128 aVal1, aVal2; ++- __m256d aDbl1, aDbl2, bVal1, bVal2, cVal1, cVal2; ++- for (; number < eighth_points; number++) { +++ __m256 aVal; +++ __m128 aVal1, aVal2; +++ __m256d aDbl1, aDbl2, bVal1, bVal2, cVal1, cVal2; +++ for (; number < eighth_points; number++) { ++ ++- aVal = _mm256_load_ps(aPtr); ++- bVal1 = _mm256_load_pd(bPtr); ++- bVal2 = _mm256_load_pd(bPtr+4); +++ aVal = _mm256_load_ps(aPtr); +++ bVal1 = _mm256_load_pd(bPtr); +++ bVal2 = _mm256_load_pd(bPtr + 4); ++ ++- aVal1 = _mm256_extractf128_ps(aVal, 0); ++- aVal2 = _mm256_extractf128_ps(aVal, 1); +++ aVal1 = _mm256_extractf128_ps(aVal, 0); +++ aVal2 = _mm256_extractf128_ps(aVal, 1); ++ ++- aDbl1 = _mm256_cvtps_pd(aVal1); ++- aDbl2 = _mm256_cvtps_pd(aVal2); +++ aDbl1 = _mm256_cvtps_pd(aVal1); +++ aDbl2 = _mm256_cvtps_pd(aVal2); ++ ++- cVal1 = _mm256_mul_pd(aDbl1, bVal1); ++- cVal2 = _mm256_mul_pd(aDbl2, bVal2); +++ cVal1 = _mm256_mul_pd(aDbl1, bVal1); +++ cVal2 = _mm256_mul_pd(aDbl2, bVal2); ++ ++- _mm256_store_pd(cPtr, cVal1); // Store the results back into the C container ++- _mm256_store_pd(cPtr+4, cVal2); // Store the results back into 
the C container +++ _mm256_store_pd(cPtr, cVal1); // Store the results back into the C container +++ _mm256_store_pd(cPtr + 4, cVal2); // Store the results back into the C container ++ ++- aPtr += 8; ++- bPtr += 8; ++- cPtr += 8; ++- } +++ aPtr += 8; +++ bPtr += 8; +++ cPtr += 8; +++ } ++ ++- number = eighth_points * 8; ++- for (; number < num_points; number++) { ++- *cPtr++ = ((double)(*aPtr++)) * (*bPtr++); ++- } +++ number = eighth_points * 8; +++ for (; number < num_points; number++) { +++ *cPtr++ = ((double)(*aPtr++)) * (*bPtr++); +++ } ++ } ++ ++ #endif /* LV_HAVE_AVX */ ++ ++ ++- ++ #endif /* INCLUDED_volk_32f_64f_multiply_64f_u_H */ ++diff --git a/kernels/volk/volk_32f_8u_polarbutterfly_32f.h b/kernels/volk/volk_32f_8u_polarbutterfly_32f.h ++index 4aba6c4..2198b33 100644 ++--- a/kernels/volk/volk_32f_8u_polarbutterfly_32f.h +++++ b/kernels/volk/volk_32f_8u_polarbutterfly_32f.h ++@@ -51,14 +51,17 @@ ++ * int frame_exp = 10; ++ * int frame_size = 0x01 << frame_exp; ++ * ++- * float* llrs = (float*) volk_malloc(sizeof(float) * frame_size * (frame_exp + 1), volk_get_alignment()); ++- * unsigned char* u = (unsigned char) volk_malloc(sizeof(unsigned char) * frame_size * (frame_exp + 1), volk_get_alignment()); +++ * float* llrs = (float*) volk_malloc(sizeof(float) * frame_size * (frame_exp + 1), +++ * volk_get_alignment()); unsigned char* u = (unsigned char) volk_malloc(sizeof(unsigned +++ * char) * frame_size * (frame_exp + 1), volk_get_alignment()); ++ * ++- * {some_function_to_write_encoded_bits_to_float_llrs(llrs + frame_size * frame_exp, data)}; +++ * {some_function_to_write_encoded_bits_to_float_llrs(llrs + frame_size * frame_exp, +++ * data)}; ++ * ++ * unsigned int u_num; ++ * for(u_num = 0; u_num < frame_size; u_num++){ ++- * volk_32f_8u_polarbutterfly_32f_u_avx(llrs, u, frame_size, frame_exp, 0, u_num, u_num); +++ * volk_32f_8u_polarbutterfly_32f_u_avx(llrs, u, frame_size, frame_exp, 0, u_num, +++ * u_num); ++ * // next line could first search for frozen bit value and then do bit decision. ++ * u[u_num] = llrs[u_num] > 0 ? 0 : 1; ++ * } ++@@ -73,130 +76,131 @@ ++ #include ++ #include ++ ++-static inline float ++-llr_odd(const float la, const float lb) +++static inline float llr_odd(const float la, const float lb) ++ { ++- const float ala = fabsf(la); ++- const float alb = fabsf(lb); ++- return copysignf(1.0f, la) * copysignf(1.0f, lb) * (ala > alb ? alb : ala); +++ const float ala = fabsf(la); +++ const float alb = fabsf(lb); +++ return copysignf(1.0f, la) * copysignf(1.0f, lb) * (ala > alb ? 
alb : ala); ++ } ++ ++-static inline void ++-llr_odd_stages(float* llrs, int min_stage, const int depth, const int frame_size, const int row) +++static inline void llr_odd_stages( +++ float* llrs, int min_stage, const int depth, const int frame_size, const int row) ++ { ++- int loop_stage = depth - 1; ++- float* dst_llr_ptr; ++- float* src_llr_ptr; ++- int stage_size = 0x01 << loop_stage; ++- ++- int el; ++- while(min_stage <= loop_stage){ ++- dst_llr_ptr = llrs + loop_stage * frame_size + row; ++- src_llr_ptr = dst_llr_ptr + frame_size; ++- for(el = 0; el < stage_size; el++){ ++- *dst_llr_ptr++ = llr_odd(*src_llr_ptr, *(src_llr_ptr + 1)); ++- src_llr_ptr += 2; +++ int loop_stage = depth - 1; +++ float* dst_llr_ptr; +++ float* src_llr_ptr; +++ int stage_size = 0x01 << loop_stage; +++ +++ int el; +++ while (min_stage <= loop_stage) { +++ dst_llr_ptr = llrs + loop_stage * frame_size + row; +++ src_llr_ptr = dst_llr_ptr + frame_size; +++ for (el = 0; el < stage_size; el++) { +++ *dst_llr_ptr++ = llr_odd(*src_llr_ptr, *(src_llr_ptr + 1)); +++ src_llr_ptr += 2; +++ } +++ +++ --loop_stage; +++ stage_size >>= 1; ++ } ++- ++- --loop_stage; ++- stage_size >>= 1; ++- } ++ } ++ ++-static inline float ++-llr_even(const float la, const float lb, const unsigned char f) +++static inline float llr_even(const float la, const float lb, const unsigned char f) ++ { ++- switch(f){ +++ switch (f) { ++ case 0: ++- return lb + la; +++ return lb + la; ++ default: ++- return lb - la; ++- } +++ return lb - la; +++ } ++ } ++ ++ static inline void ++ even_u_values(unsigned char* u_even, const unsigned char* u, const int u_num) ++ { ++- u++; ++- int i; ++- for(i = 1; i < u_num; i += 2){ ++- *u_even++ = *u; ++- u += 2; ++- } +++ u++; +++ int i; +++ for (i = 1; i < u_num; i += 2) { +++ *u_even++ = *u; +++ u += 2; +++ } ++ } ++ ++ static inline void ++ odd_xor_even_values(unsigned char* u_xor, const unsigned char* u, const int u_num) ++ { ++- int i; ++- for(i = 1; i < u_num; i += 2){ ++- *u_xor++ = *u ^ *(u + 1); ++- u += 2; ++- } +++ int i; +++ for (i = 1; i < u_num; i += 2) { +++ *u_xor++ = *u ^ *(u + 1); +++ u += 2; +++ } ++ } ++ ++-static inline int ++-calculate_max_stage_depth_for_row(const int frame_exp, const int row) +++static inline int calculate_max_stage_depth_for_row(const int frame_exp, const int row) ++ { ++- int max_stage_depth = 0; ++- int half_stage_size = 0x01; ++- int stage_size = half_stage_size << 1; ++- while(max_stage_depth < (frame_exp - 1)){ // last stage holds received values. ++- if(!(row % stage_size < half_stage_size)){ ++- break; +++ int max_stage_depth = 0; +++ int half_stage_size = 0x01; +++ int stage_size = half_stage_size << 1; +++ while (max_stage_depth < (frame_exp - 1)) { // last stage holds received values. 
+++ if (!(row % stage_size < half_stage_size)) { +++ break; +++ } +++ half_stage_size <<= 1; +++ stage_size <<= 1; +++ max_stage_depth++; ++ } ++- half_stage_size <<= 1; ++- stage_size <<= 1; ++- max_stage_depth++; ++- } ++- return max_stage_depth; +++ return max_stage_depth; ++ } ++ ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void ++-volk_32f_8u_polarbutterfly_32f_generic(float* llrs, unsigned char* u, ++- const int frame_exp, ++- const int stage, const int u_num, const int row) +++static inline void volk_32f_8u_polarbutterfly_32f_generic(float* llrs, +++ unsigned char* u, +++ const int frame_exp, +++ const int stage, +++ const int u_num, +++ const int row) ++ { ++- const int frame_size = 0x01 << frame_exp; ++- const int next_stage = stage + 1; +++ const int frame_size = 0x01 << frame_exp; +++ const int next_stage = stage + 1; ++ ++- const int half_stage_size = 0x01 << stage; ++- const int stage_size = half_stage_size << 1; +++ const int half_stage_size = 0x01 << stage; +++ const int stage_size = half_stage_size << 1; ++ ++- const bool is_upper_stage_half = row % stage_size < half_stage_size; +++ const bool is_upper_stage_half = row % stage_size < half_stage_size; ++ ++-// // this is a natural bit order impl ++- float* next_llrs = llrs + frame_size;// LLRs are stored in a consecutive array. ++- float* call_row_llr = llrs + row; +++ // // this is a natural bit order impl +++ float* next_llrs = llrs + frame_size; // LLRs are stored in a consecutive array. +++ float* call_row_llr = llrs + row; ++ ++- const int section = row - (row % stage_size); ++- const int jump_size = ((row % half_stage_size) << 1) % stage_size; +++ const int section = row - (row % stage_size); +++ const int jump_size = ((row % half_stage_size) << 1) % stage_size; ++ ++- const int next_upper_row = section + jump_size; ++- const int next_lower_row = next_upper_row + 1; +++ const int next_upper_row = section + jump_size; +++ const int next_lower_row = next_upper_row + 1; ++ ++- const float* upper_right_llr_ptr = next_llrs + next_upper_row; ++- const float* lower_right_llr_ptr = next_llrs + next_lower_row; +++ const float* upper_right_llr_ptr = next_llrs + next_upper_row; +++ const float* lower_right_llr_ptr = next_llrs + next_lower_row; ++ ++- if(!is_upper_stage_half){ ++- const int u_pos = u_num >> stage; ++- const unsigned char f = u[u_pos - 1]; ++- *call_row_llr = llr_even(*upper_right_llr_ptr, *lower_right_llr_ptr, f); ++- return; ++- } +++ if (!is_upper_stage_half) { +++ const int u_pos = u_num >> stage; +++ const unsigned char f = u[u_pos - 1]; +++ *call_row_llr = llr_even(*upper_right_llr_ptr, *lower_right_llr_ptr, f); +++ return; +++ } ++ ++- if(frame_exp > next_stage){ ++- unsigned char* u_half = u + frame_size; ++- odd_xor_even_values(u_half, u, u_num); ++- volk_32f_8u_polarbutterfly_32f_generic(next_llrs, u_half, frame_exp, next_stage, u_num, next_upper_row); +++ if (frame_exp > next_stage) { +++ unsigned char* u_half = u + frame_size; +++ odd_xor_even_values(u_half, u, u_num); +++ volk_32f_8u_polarbutterfly_32f_generic( +++ next_llrs, u_half, frame_exp, next_stage, u_num, next_upper_row); ++ ++- even_u_values(u_half, u, u_num); ++- volk_32f_8u_polarbutterfly_32f_generic(next_llrs, u_half, frame_exp, next_stage, u_num, next_lower_row); ++- } +++ even_u_values(u_half, u, u_num); +++ volk_32f_8u_polarbutterfly_32f_generic( +++ next_llrs, u_half, frame_exp, next_stage, u_num, next_lower_row); +++ } ++ ++- *call_row_llr = llr_odd(*upper_right_llr_ptr, *lower_right_llr_ptr); +++ *call_row_llr = 
llr_odd(*upper_right_llr_ptr, *lower_right_llr_ptr); ++ } ++ ++ #endif /* LV_HAVE_GENERIC */ ++@@ -206,99 +210,99 @@ volk_32f_8u_polarbutterfly_32f_generic(float* llrs, unsigned char* u, ++ #include ++ #include ++ ++-static inline void ++-volk_32f_8u_polarbutterfly_32f_u_avx(float* llrs, unsigned char* u, ++- const int frame_exp, ++- const int stage, const int u_num, const int row) +++static inline void volk_32f_8u_polarbutterfly_32f_u_avx(float* llrs, +++ unsigned char* u, +++ const int frame_exp, +++ const int stage, +++ const int u_num, +++ const int row) ++ { ++- const int frame_size = 0x01 << frame_exp; ++- if(row % 2){ // for odd rows just do the only necessary calculation and return. ++- const float* next_llrs = llrs + frame_size + row; ++- *(llrs + row) = llr_even(*(next_llrs - 1), *next_llrs, u[u_num - 1]); ++- return; ++- } ++- ++- const int max_stage_depth = calculate_max_stage_depth_for_row(frame_exp, row); ++- if(max_stage_depth < 3){ // vectorized version needs larger vectors. ++- volk_32f_8u_polarbutterfly_32f_generic(llrs, u, frame_exp, stage, u_num, row); ++- return; ++- } ++- ++- int loop_stage = max_stage_depth; ++- int stage_size = 0x01 << loop_stage; ++- ++- float* src_llr_ptr; ++- float* dst_llr_ptr; ++- ++- __m256 src0, src1, dst; ++- ++- if(row){ // not necessary for ZERO row. == first bit to be decoded. ++- // first do bit combination for all stages ++- // effectively encode some decoded bits again. ++- unsigned char* u_target = u + frame_size; ++- unsigned char* u_temp = u + 2* frame_size; ++- memcpy(u_temp, u + u_num - stage_size, sizeof(unsigned char) * stage_size); ++- ++- if(stage_size > 15){ ++- _mm256_zeroupper(); ++- volk_8u_x2_encodeframepolar_8u_u_ssse3(u_target, u_temp, stage_size); +++ const int frame_size = 0x01 << frame_exp; +++ if (row % 2) { // for odd rows just do the only necessary calculation and return. +++ const float* next_llrs = llrs + frame_size + row; +++ *(llrs + row) = llr_even(*(next_llrs - 1), *next_llrs, u[u_num - 1]); +++ return; ++ } ++- else{ ++- volk_8u_x2_encodeframepolar_8u_generic(u_target, u_temp, stage_size); +++ +++ const int max_stage_depth = calculate_max_stage_depth_for_row(frame_exp, row); +++ if (max_stage_depth < 3) { // vectorized version needs larger vectors. +++ volk_32f_8u_polarbutterfly_32f_generic(llrs, u, frame_exp, stage, u_num, row); +++ return; ++ } ++ ++- src_llr_ptr = llrs + (max_stage_depth + 1) * frame_size + row - stage_size; ++- dst_llr_ptr = llrs + max_stage_depth * frame_size + row; +++ int loop_stage = max_stage_depth; +++ int stage_size = 0x01 << loop_stage; ++ ++- __m128i fbits; +++ float* src_llr_ptr; +++ float* dst_llr_ptr; ++ ++- int p; ++- for(p = 0; p < stage_size; p += 8){ ++- _mm256_zeroupper(); ++- fbits = _mm_loadu_si128((__m128i*) u_target); ++- u_target += 8; +++ __m256 src0, src1, dst; ++ ++- src0 = _mm256_loadu_ps(src_llr_ptr); ++- src1 = _mm256_loadu_ps(src_llr_ptr + 8); ++- src_llr_ptr += 16; +++ if (row) { // not necessary for ZERO row. == first bit to be decoded. +++ // first do bit combination for all stages +++ // effectively encode some decoded bits again. 
+++ unsigned char* u_target = u + frame_size; +++ unsigned char* u_temp = u + 2 * frame_size; +++ memcpy(u_temp, u + u_num - stage_size, sizeof(unsigned char) * stage_size); ++ ++- dst = _mm256_polar_fsign_add_llrs(src0, src1, fbits); +++ if (stage_size > 15) { +++ _mm256_zeroupper(); +++ volk_8u_x2_encodeframepolar_8u_u_ssse3(u_target, u_temp, stage_size); +++ } else { +++ volk_8u_x2_encodeframepolar_8u_generic(u_target, u_temp, stage_size); +++ } ++ ++- _mm256_storeu_ps(dst_llr_ptr, dst); ++- dst_llr_ptr += 8; ++- } +++ src_llr_ptr = llrs + (max_stage_depth + 1) * frame_size + row - stage_size; +++ dst_llr_ptr = llrs + max_stage_depth * frame_size + row; ++ ++- --loop_stage; ++- stage_size >>= 1; ++- } +++ __m128i fbits; ++ ++- const int min_stage = stage > 2 ? stage : 2; +++ int p; +++ for (p = 0; p < stage_size; p += 8) { +++ _mm256_zeroupper(); +++ fbits = _mm_loadu_si128((__m128i*)u_target); +++ u_target += 8; ++ ++- _mm256_zeroall(); // Important to clear cache! +++ src0 = _mm256_loadu_ps(src_llr_ptr); +++ src1 = _mm256_loadu_ps(src_llr_ptr + 8); +++ src_llr_ptr += 16; ++ ++- int el; ++- while(min_stage < loop_stage){ ++- dst_llr_ptr = llrs + loop_stage * frame_size + row; ++- src_llr_ptr = dst_llr_ptr + frame_size; ++- for(el = 0; el < stage_size; el += 8){ ++- src0 = _mm256_loadu_ps(src_llr_ptr); ++- src_llr_ptr += 8; ++- src1 = _mm256_loadu_ps(src_llr_ptr); ++- src_llr_ptr += 8; +++ dst = _mm256_polar_fsign_add_llrs(src0, src1, fbits); ++ ++- dst = _mm256_polar_minsum_llrs(src0, src1); +++ _mm256_storeu_ps(dst_llr_ptr, dst); +++ dst_llr_ptr += 8; +++ } ++ ++- _mm256_storeu_ps(dst_llr_ptr, dst); ++- dst_llr_ptr += 8; +++ --loop_stage; +++ stage_size >>= 1; ++ } ++ ++- --loop_stage; ++- stage_size >>= 1; +++ const int min_stage = stage > 2 ? stage : 2; +++ +++ _mm256_zeroall(); // Important to clear cache! ++ ++- } +++ int el; +++ while (min_stage < loop_stage) { +++ dst_llr_ptr = llrs + loop_stage * frame_size + row; +++ src_llr_ptr = dst_llr_ptr + frame_size; +++ for (el = 0; el < stage_size; el += 8) { +++ src0 = _mm256_loadu_ps(src_llr_ptr); +++ src_llr_ptr += 8; +++ src1 = _mm256_loadu_ps(src_llr_ptr); +++ src_llr_ptr += 8; ++ ++- // for stages < 3 vectors are too small!. ++- llr_odd_stages(llrs, stage, loop_stage + 1,frame_size, row); +++ dst = _mm256_polar_minsum_llrs(src0, src1); +++ +++ _mm256_storeu_ps(dst_llr_ptr, dst); +++ dst_llr_ptr += 8; +++ } +++ +++ --loop_stage; +++ stage_size >>= 1; +++ } +++ +++ // for stages < 3 vectors are too small!. +++ llr_odd_stages(llrs, stage, loop_stage + 1, frame_size, row); ++ } ++ ++ #endif /* LV_HAVE_AVX */ ++@@ -307,99 +311,99 @@ volk_32f_8u_polarbutterfly_32f_u_avx(float* llrs, unsigned char* u, ++ #include ++ #include ++ ++-static inline void ++-volk_32f_8u_polarbutterfly_32f_u_avx2(float* llrs, unsigned char* u, ++- const int frame_exp, ++- const int stage, const int u_num, const int row) +++static inline void volk_32f_8u_polarbutterfly_32f_u_avx2(float* llrs, +++ unsigned char* u, +++ const int frame_exp, +++ const int stage, +++ const int u_num, +++ const int row) ++ { ++- const int frame_size = 0x01 << frame_exp; ++- if(row % 2){ // for odd rows just do the only necessary calculation and return. ++- const float* next_llrs = llrs + frame_size + row; ++- *(llrs + row) = llr_even(*(next_llrs - 1), *next_llrs, u[u_num - 1]); ++- return; ++- } ++- ++- const int max_stage_depth = calculate_max_stage_depth_for_row(frame_exp, row); ++- if(max_stage_depth < 3){ // vectorized version needs larger vectors. 
++- volk_32f_8u_polarbutterfly_32f_generic(llrs, u, frame_exp, stage, u_num, row); ++- return; ++- } ++- ++- int loop_stage = max_stage_depth; ++- int stage_size = 0x01 << loop_stage; ++- ++- float* src_llr_ptr; ++- float* dst_llr_ptr; ++- ++- __m256 src0, src1, dst; ++- ++- if(row){ // not necessary for ZERO row. == first bit to be decoded. ++- // first do bit combination for all stages ++- // effectively encode some decoded bits again. ++- unsigned char* u_target = u + frame_size; ++- unsigned char* u_temp = u + 2* frame_size; ++- memcpy(u_temp, u + u_num - stage_size, sizeof(unsigned char) * stage_size); ++- ++- if(stage_size > 15){ ++- _mm256_zeroupper(); ++- volk_8u_x2_encodeframepolar_8u_u_ssse3(u_target, u_temp, stage_size); +++ const int frame_size = 0x01 << frame_exp; +++ if (row % 2) { // for odd rows just do the only necessary calculation and return. +++ const float* next_llrs = llrs + frame_size + row; +++ *(llrs + row) = llr_even(*(next_llrs - 1), *next_llrs, u[u_num - 1]); +++ return; ++ } ++- else{ ++- volk_8u_x2_encodeframepolar_8u_generic(u_target, u_temp, stage_size); +++ +++ const int max_stage_depth = calculate_max_stage_depth_for_row(frame_exp, row); +++ if (max_stage_depth < 3) { // vectorized version needs larger vectors. +++ volk_32f_8u_polarbutterfly_32f_generic(llrs, u, frame_exp, stage, u_num, row); +++ return; ++ } ++ ++- src_llr_ptr = llrs + (max_stage_depth + 1) * frame_size + row - stage_size; ++- dst_llr_ptr = llrs + max_stage_depth * frame_size + row; +++ int loop_stage = max_stage_depth; +++ int stage_size = 0x01 << loop_stage; ++ ++- __m128i fbits; +++ float* src_llr_ptr; +++ float* dst_llr_ptr; ++ ++- int p; ++- for(p = 0; p < stage_size; p += 8){ ++- _mm256_zeroupper(); ++- fbits = _mm_loadu_si128((__m128i*) u_target); ++- u_target += 8; +++ __m256 src0, src1, dst; ++ ++- src0 = _mm256_loadu_ps(src_llr_ptr); ++- src1 = _mm256_loadu_ps(src_llr_ptr + 8); ++- src_llr_ptr += 16; +++ if (row) { // not necessary for ZERO row. == first bit to be decoded. +++ // first do bit combination for all stages +++ // effectively encode some decoded bits again. +++ unsigned char* u_target = u + frame_size; +++ unsigned char* u_temp = u + 2 * frame_size; +++ memcpy(u_temp, u + u_num - stage_size, sizeof(unsigned char) * stage_size); ++ ++- dst = _mm256_polar_fsign_add_llrs_avx2(src0, src1, fbits); +++ if (stage_size > 15) { +++ _mm256_zeroupper(); +++ volk_8u_x2_encodeframepolar_8u_u_ssse3(u_target, u_temp, stage_size); +++ } else { +++ volk_8u_x2_encodeframepolar_8u_generic(u_target, u_temp, stage_size); +++ } ++ ++- _mm256_storeu_ps(dst_llr_ptr, dst); ++- dst_llr_ptr += 8; ++- } +++ src_llr_ptr = llrs + (max_stage_depth + 1) * frame_size + row - stage_size; +++ dst_llr_ptr = llrs + max_stage_depth * frame_size + row; ++ ++- --loop_stage; ++- stage_size >>= 1; ++- } +++ __m128i fbits; ++ ++- const int min_stage = stage > 2 ? stage : 2; +++ int p; +++ for (p = 0; p < stage_size; p += 8) { +++ _mm256_zeroupper(); +++ fbits = _mm_loadu_si128((__m128i*)u_target); +++ u_target += 8; ++ ++- _mm256_zeroall(); // Important to clear cache! 
+++ src0 = _mm256_loadu_ps(src_llr_ptr); +++ src1 = _mm256_loadu_ps(src_llr_ptr + 8); +++ src_llr_ptr += 16; ++ ++- int el; ++- while(min_stage < loop_stage){ ++- dst_llr_ptr = llrs + loop_stage * frame_size + row; ++- src_llr_ptr = dst_llr_ptr + frame_size; ++- for(el = 0; el < stage_size; el += 8){ ++- src0 = _mm256_loadu_ps(src_llr_ptr); ++- src_llr_ptr += 8; ++- src1 = _mm256_loadu_ps(src_llr_ptr); ++- src_llr_ptr += 8; +++ dst = _mm256_polar_fsign_add_llrs_avx2(src0, src1, fbits); ++ ++- dst = _mm256_polar_minsum_llrs(src0, src1); +++ _mm256_storeu_ps(dst_llr_ptr, dst); +++ dst_llr_ptr += 8; +++ } ++ ++- _mm256_storeu_ps(dst_llr_ptr, dst); ++- dst_llr_ptr += 8; +++ --loop_stage; +++ stage_size >>= 1; ++ } ++ ++- --loop_stage; ++- stage_size >>= 1; +++ const int min_stage = stage > 2 ? stage : 2; +++ +++ _mm256_zeroall(); // Important to clear cache! +++ +++ int el; +++ while (min_stage < loop_stage) { +++ dst_llr_ptr = llrs + loop_stage * frame_size + row; +++ src_llr_ptr = dst_llr_ptr + frame_size; +++ for (el = 0; el < stage_size; el += 8) { +++ src0 = _mm256_loadu_ps(src_llr_ptr); +++ src_llr_ptr += 8; +++ src1 = _mm256_loadu_ps(src_llr_ptr); +++ src_llr_ptr += 8; ++ ++- } +++ dst = _mm256_polar_minsum_llrs(src0, src1); +++ +++ _mm256_storeu_ps(dst_llr_ptr, dst); +++ dst_llr_ptr += 8; +++ } +++ +++ --loop_stage; +++ stage_size >>= 1; +++ } ++ ++- // for stages < 3 vectors are too small!. ++- llr_odd_stages(llrs, stage, loop_stage + 1,frame_size, row); +++ // for stages < 3 vectors are too small!. +++ llr_odd_stages(llrs, stage, loop_stage + 1, frame_size, row); ++ } ++ ++ #endif /* LV_HAVE_AVX2 */ ++diff --git a/kernels/volk/volk_32f_8u_polarbutterflypuppet_32f.h b/kernels/volk/volk_32f_8u_polarbutterflypuppet_32f.h ++index fa40a86..6f97dd1 100644 ++--- a/kernels/volk/volk_32f_8u_polarbutterflypuppet_32f.h +++++ b/kernels/volk/volk_32f_8u_polarbutterflypuppet_32f.h ++@@ -33,124 +33,129 @@ ++ #include ++ ++ ++-static inline void ++-sanitize_bytes(unsigned char* u, const int elements) +++static inline void sanitize_bytes(unsigned char* u, const int elements) ++ { ++- int i; ++- unsigned char* u_ptr = u; ++- for(i = 0; i < elements; i++){ ++- *u_ptr = (*u_ptr & 0x01); ++- u_ptr++; ++- } +++ int i; +++ unsigned char* u_ptr = u; +++ for (i = 0; i < elements; i++) { +++ *u_ptr = (*u_ptr & 0x01); +++ u_ptr++; +++ } ++ } ++ ++-static inline void ++-clean_up_intermediate_values(float* llrs, unsigned char* u, const int frame_size, const int elements) +++static inline void clean_up_intermediate_values(float* llrs, +++ unsigned char* u, +++ const int frame_size, +++ const int elements) ++ { ++- memset(u + frame_size, 0, sizeof(unsigned char) * (elements - frame_size)); ++- memset(llrs + frame_size, 0, sizeof(float) * (elements - frame_size)); +++ memset(u + frame_size, 0, sizeof(unsigned char) * (elements - frame_size)); +++ memset(llrs + frame_size, 0, sizeof(float) * (elements - frame_size)); ++ } ++ ++ static inline void ++ generate_error_free_input_vector(float* llrs, unsigned char* u, const int frame_size) ++ { ++- memset(u, 0, frame_size); ++- unsigned char* target = u + frame_size; ++- volk_8u_x2_encodeframepolar_8u_generic(target, u + 2 * frame_size, frame_size); ++- float* ft = llrs; ++- int i; ++- for(i = 0; i < frame_size; i++){ ++- *ft = (-2 * ((float) *target++)) + 1.0f; ++- ft++; ++- } +++ memset(u, 0, frame_size); +++ unsigned char* target = u + frame_size; +++ volk_8u_x2_encodeframepolar_8u_generic(target, u + 2 * frame_size, frame_size); +++ float* ft = llrs; +++ int i; +++ 
for (i = 0; i < frame_size; i++) { +++ *ft = (-2 * ((float)*target++)) + 1.0f; +++ ft++; +++ } ++ } ++ ++ static inline void ++ print_llr_tree(const float* llrs, const int frame_size, const int frame_exp) ++ { ++- int s, e; ++- for(s = 0; s < frame_size; s++){ ++- for(e = 0; e < frame_exp + 1; e++){ ++- printf("%+4.2f ", llrs[e * frame_size + s]); ++- } ++- printf("\n"); ++- if((s + 1) % 8 == 0){ ++- printf("\n"); +++ int s, e; +++ for (s = 0; s < frame_size; s++) { +++ for (e = 0; e < frame_exp + 1; e++) { +++ printf("%+4.2f ", llrs[e * frame_size + s]); +++ } +++ printf("\n"); +++ if ((s + 1) % 8 == 0) { +++ printf("\n"); +++ } ++ } ++- } ++ } ++ ++-static inline int ++-maximum_frame_size(const int elements) +++static inline int maximum_frame_size(const int elements) ++ { ++- unsigned int frame_size = next_lower_power_of_two(elements); ++- unsigned int frame_exp = log2_of_power_of_2(frame_size); ++- return next_lower_power_of_two(frame_size / frame_exp); +++ unsigned int frame_size = next_lower_power_of_two(elements); +++ unsigned int frame_exp = log2_of_power_of_2(frame_size); +++ return next_lower_power_of_two(frame_size / frame_exp); ++ } ++ ++ #ifdef LV_HAVE_GENERIC ++-static inline void ++-volk_32f_8u_polarbutterflypuppet_32f_generic(float* llrs, const float* input, unsigned char* u, const int elements) +++static inline void volk_32f_8u_polarbutterflypuppet_32f_generic(float* llrs, +++ const float* input, +++ unsigned char* u, +++ const int elements) ++ { ++- unsigned int frame_size = maximum_frame_size(elements); ++- unsigned int frame_exp = log2_of_power_of_2(frame_size); +++ unsigned int frame_size = maximum_frame_size(elements); +++ unsigned int frame_exp = log2_of_power_of_2(frame_size); ++ ++- sanitize_bytes(u, elements); ++- clean_up_intermediate_values(llrs, u, frame_size, elements); ++- generate_error_free_input_vector(llrs + frame_exp * frame_size, u, frame_size); +++ sanitize_bytes(u, elements); +++ clean_up_intermediate_values(llrs, u, frame_size, elements); +++ generate_error_free_input_vector(llrs + frame_exp * frame_size, u, frame_size); ++ ++- unsigned int u_num = 0; ++- for(; u_num < frame_size; u_num++){ ++- volk_32f_8u_polarbutterfly_32f_generic(llrs, u, frame_exp, 0, u_num, u_num); ++- u[u_num] = llrs[u_num] > 0 ? 0 : 1; ++- } +++ unsigned int u_num = 0; +++ for (; u_num < frame_size; u_num++) { +++ volk_32f_8u_polarbutterfly_32f_generic(llrs, u, frame_exp, 0, u_num, u_num); +++ u[u_num] = llrs[u_num] > 0 ? 
0 : 1; +++ } ++ ++- clean_up_intermediate_values(llrs, u, frame_size, elements); +++ clean_up_intermediate_values(llrs, u, frame_size, elements); ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++ #ifdef LV_HAVE_AVX ++-static inline void ++-volk_32f_8u_polarbutterflypuppet_32f_u_avx(float* llrs, const float* input, unsigned char* u, const int elements) +++static inline void volk_32f_8u_polarbutterflypuppet_32f_u_avx(float* llrs, +++ const float* input, +++ unsigned char* u, +++ const int elements) ++ { ++- unsigned int frame_size = maximum_frame_size(elements); ++- unsigned int frame_exp = log2_of_power_of_2(frame_size); +++ unsigned int frame_size = maximum_frame_size(elements); +++ unsigned int frame_exp = log2_of_power_of_2(frame_size); ++ ++- sanitize_bytes(u, elements); ++- clean_up_intermediate_values(llrs, u, frame_size, elements); ++- generate_error_free_input_vector(llrs + frame_exp * frame_size, u, frame_size); +++ sanitize_bytes(u, elements); +++ clean_up_intermediate_values(llrs, u, frame_size, elements); +++ generate_error_free_input_vector(llrs + frame_exp * frame_size, u, frame_size); ++ ++- unsigned int u_num = 0; ++- for(; u_num < frame_size; u_num++){ ++- volk_32f_8u_polarbutterfly_32f_u_avx(llrs, u, frame_exp, 0, u_num, u_num); ++- u[u_num] = llrs[u_num] > 0 ? 0 : 1; ++- } +++ unsigned int u_num = 0; +++ for (; u_num < frame_size; u_num++) { +++ volk_32f_8u_polarbutterfly_32f_u_avx(llrs, u, frame_exp, 0, u_num, u_num); +++ u[u_num] = llrs[u_num] > 0 ? 0 : 1; +++ } ++ ++- clean_up_intermediate_values(llrs, u, frame_size, elements); +++ clean_up_intermediate_values(llrs, u, frame_size, elements); ++ } ++ #endif /* LV_HAVE_AVX */ ++ ++ #ifdef LV_HAVE_AVX2 ++-static inline void ++-volk_32f_8u_polarbutterflypuppet_32f_u_avx2(float* llrs, const float* input, unsigned char* u, const int elements) +++static inline void volk_32f_8u_polarbutterflypuppet_32f_u_avx2(float* llrs, +++ const float* input, +++ unsigned char* u, +++ const int elements) ++ { ++- unsigned int frame_size = maximum_frame_size(elements); ++- unsigned int frame_exp = log2_of_power_of_2(frame_size); +++ unsigned int frame_size = maximum_frame_size(elements); +++ unsigned int frame_exp = log2_of_power_of_2(frame_size); ++ ++- sanitize_bytes(u, elements); ++- clean_up_intermediate_values(llrs, u, frame_size, elements); ++- generate_error_free_input_vector(llrs + frame_exp * frame_size, u, frame_size); +++ sanitize_bytes(u, elements); +++ clean_up_intermediate_values(llrs, u, frame_size, elements); +++ generate_error_free_input_vector(llrs + frame_exp * frame_size, u, frame_size); ++ ++- unsigned int u_num = 0; ++- for(; u_num < frame_size; u_num++){ ++- volk_32f_8u_polarbutterfly_32f_u_avx2(llrs, u, frame_exp, 0, u_num, u_num); ++- u[u_num] = llrs[u_num] > 0 ? 0 : 1; ++- } +++ unsigned int u_num = 0; +++ for (; u_num < frame_size; u_num++) { +++ volk_32f_8u_polarbutterfly_32f_u_avx2(llrs, u, frame_exp, 0, u_num, u_num); +++ u[u_num] = llrs[u_num] > 0 ? 
0 : 1; +++ } ++ ++- clean_up_intermediate_values(llrs, u, frame_size, elements); +++ clean_up_intermediate_values(llrs, u, frame_size, elements); ++ } ++ #endif /* LV_HAVE_AVX2 */ ++ ++ ++- ++ #endif /* VOLK_KERNELS_VOLK_VOLK_32F_8U_POLARBUTTERFLYPUPPET_32F_H_ */ ++diff --git a/kernels/volk/volk_32f_accumulator_s32f.h b/kernels/volk/volk_32f_accumulator_s32f.h ++index f6219c8..9a78f58 100644 ++--- a/kernels/volk/volk_32f_accumulator_s32f.h +++++ b/kernels/volk/volk_32f_accumulator_s32f.h ++@@ -29,8 +29,8 @@ ++ * ++ * Dispatcher Prototype ++ * \code ++- * void volk_32f_accumulator_s32f(float* result, const float* inputBuffer, unsigned int num_points) ++- * \endcode +++ * void volk_32f_accumulator_s32f(float* result, const float* inputBuffer, unsigned int +++ * num_points) \endcode ++ * ++ * \b Inputs ++ * \li inputBuffer The buffer of data to be accumulated ++@@ -63,47 +63,48 @@ ++ #ifndef INCLUDED_volk_32f_accumulator_s32f_a_H ++ #define INCLUDED_volk_32f_accumulator_s32f_a_H ++ ++-#include ++ #include +++#include ++ ++ #ifdef LV_HAVE_AVX ++ #include ++ ++-static inline void ++-volk_32f_accumulator_s32f_a_avx(float* result, const float* inputBuffer, unsigned int num_points) +++static inline void volk_32f_accumulator_s32f_a_avx(float* result, +++ const float* inputBuffer, +++ unsigned int num_points) ++ { ++- float returnValue = 0; ++- unsigned int number = 0; ++- const unsigned int eighthPoints = num_points / 8; ++- ++- const float* aPtr = inputBuffer; ++- __VOLK_ATTR_ALIGNED(32) float tempBuffer[8]; ++- ++- __m256 accumulator = _mm256_setzero_ps(); ++- __m256 aVal = _mm256_setzero_ps(); ++- ++- for(;number < eighthPoints; number++){ ++- aVal = _mm256_load_ps(aPtr); ++- accumulator = _mm256_add_ps(accumulator, aVal); ++- aPtr += 8; ++- } ++- ++- _mm256_store_ps(tempBuffer, accumulator); ++- ++- returnValue = tempBuffer[0]; ++- returnValue += tempBuffer[1]; ++- returnValue += tempBuffer[2]; ++- returnValue += tempBuffer[3]; ++- returnValue += tempBuffer[4]; ++- returnValue += tempBuffer[5]; ++- returnValue += tempBuffer[6]; ++- returnValue += tempBuffer[7]; ++- ++- number = eighthPoints * 8; ++- for(;number < num_points; number++){ ++- returnValue += (*aPtr++); ++- } ++- *result = returnValue; +++ float returnValue = 0; +++ unsigned int number = 0; +++ const unsigned int eighthPoints = num_points / 8; +++ +++ const float* aPtr = inputBuffer; +++ __VOLK_ATTR_ALIGNED(32) float tempBuffer[8]; +++ +++ __m256 accumulator = _mm256_setzero_ps(); +++ __m256 aVal = _mm256_setzero_ps(); +++ +++ for (; number < eighthPoints; number++) { +++ aVal = _mm256_load_ps(aPtr); +++ accumulator = _mm256_add_ps(accumulator, aVal); +++ aPtr += 8; +++ } +++ +++ _mm256_store_ps(tempBuffer, accumulator); +++ +++ returnValue = tempBuffer[0]; +++ returnValue += tempBuffer[1]; +++ returnValue += tempBuffer[2]; +++ returnValue += tempBuffer[3]; +++ returnValue += tempBuffer[4]; +++ returnValue += tempBuffer[5]; +++ returnValue += tempBuffer[6]; +++ returnValue += tempBuffer[7]; +++ +++ number = eighthPoints * 8; +++ for (; number < num_points; number++) { +++ returnValue += (*aPtr++); +++ } +++ *result = returnValue; ++ } ++ #endif /* LV_HAVE_AVX */ ++ ++@@ -111,41 +112,42 @@ volk_32f_accumulator_s32f_a_avx(float* result, const float* inputBuffer, unsigne ++ #ifdef LV_HAVE_AVX ++ #include ++ ++-static inline void ++-volk_32f_accumulator_s32f_u_avx(float* result, const float* inputBuffer, unsigned int num_points) +++static inline void volk_32f_accumulator_s32f_u_avx(float* result, +++ const float* inputBuffer, +++ 
unsigned int num_points) ++ { ++- float returnValue = 0; ++- unsigned int number = 0; ++- const unsigned int eighthPoints = num_points / 8; ++- ++- const float* aPtr = inputBuffer; ++- __VOLK_ATTR_ALIGNED(32) float tempBuffer[8]; ++- ++- __m256 accumulator = _mm256_setzero_ps(); ++- __m256 aVal = _mm256_setzero_ps(); ++- ++- for(;number < eighthPoints; number++){ ++- aVal = _mm256_loadu_ps(aPtr); ++- accumulator = _mm256_add_ps(accumulator, aVal); ++- aPtr += 8; ++- } ++- ++- _mm256_store_ps(tempBuffer, accumulator); ++- ++- returnValue = tempBuffer[0]; ++- returnValue += tempBuffer[1]; ++- returnValue += tempBuffer[2]; ++- returnValue += tempBuffer[3]; ++- returnValue += tempBuffer[4]; ++- returnValue += tempBuffer[5]; ++- returnValue += tempBuffer[6]; ++- returnValue += tempBuffer[7]; ++- ++- number = eighthPoints * 8; ++- for(;number < num_points; number++){ ++- returnValue += (*aPtr++); ++- } ++- *result = returnValue; +++ float returnValue = 0; +++ unsigned int number = 0; +++ const unsigned int eighthPoints = num_points / 8; +++ +++ const float* aPtr = inputBuffer; +++ __VOLK_ATTR_ALIGNED(32) float tempBuffer[8]; +++ +++ __m256 accumulator = _mm256_setzero_ps(); +++ __m256 aVal = _mm256_setzero_ps(); +++ +++ for (; number < eighthPoints; number++) { +++ aVal = _mm256_loadu_ps(aPtr); +++ accumulator = _mm256_add_ps(accumulator, aVal); +++ aPtr += 8; +++ } +++ +++ _mm256_store_ps(tempBuffer, accumulator); +++ +++ returnValue = tempBuffer[0]; +++ returnValue += tempBuffer[1]; +++ returnValue += tempBuffer[2]; +++ returnValue += tempBuffer[3]; +++ returnValue += tempBuffer[4]; +++ returnValue += tempBuffer[5]; +++ returnValue += tempBuffer[6]; +++ returnValue += tempBuffer[7]; +++ +++ number = eighthPoints * 8; +++ for (; number < num_points; number++) { +++ returnValue += (*aPtr++); +++ } +++ *result = returnValue; ++ } ++ #endif /* LV_HAVE_AVX */ ++ ++@@ -153,37 +155,38 @@ volk_32f_accumulator_s32f_u_avx(float* result, const float* inputBuffer, unsigne ++ #ifdef LV_HAVE_SSE ++ #include ++ ++-static inline void ++-volk_32f_accumulator_s32f_a_sse(float* result, const float* inputBuffer, unsigned int num_points) +++static inline void volk_32f_accumulator_s32f_a_sse(float* result, +++ const float* inputBuffer, +++ unsigned int num_points) ++ { ++- float returnValue = 0; ++- unsigned int number = 0; ++- const unsigned int quarterPoints = num_points / 4; ++- ++- const float* aPtr = inputBuffer; ++- __VOLK_ATTR_ALIGNED(16) float tempBuffer[4]; ++- ++- __m128 accumulator = _mm_setzero_ps(); ++- __m128 aVal = _mm_setzero_ps(); ++- ++- for(;number < quarterPoints; number++){ ++- aVal = _mm_load_ps(aPtr); ++- accumulator = _mm_add_ps(accumulator, aVal); ++- aPtr += 4; ++- } ++- ++- _mm_store_ps(tempBuffer,accumulator); ++- ++- returnValue = tempBuffer[0]; ++- returnValue += tempBuffer[1]; ++- returnValue += tempBuffer[2]; ++- returnValue += tempBuffer[3]; ++- ++- number = quarterPoints * 4; ++- for(;number < num_points; number++){ ++- returnValue += (*aPtr++); ++- } ++- *result = returnValue; +++ float returnValue = 0; +++ unsigned int number = 0; +++ const unsigned int quarterPoints = num_points / 4; +++ +++ const float* aPtr = inputBuffer; +++ __VOLK_ATTR_ALIGNED(16) float tempBuffer[4]; +++ +++ __m128 accumulator = _mm_setzero_ps(); +++ __m128 aVal = _mm_setzero_ps(); +++ +++ for (; number < quarterPoints; number++) { +++ aVal = _mm_load_ps(aPtr); +++ accumulator = _mm_add_ps(accumulator, aVal); +++ aPtr += 4; +++ } +++ +++ _mm_store_ps(tempBuffer, accumulator); +++ +++ returnValue = 
tempBuffer[0]; +++ returnValue += tempBuffer[1]; +++ returnValue += tempBuffer[2]; +++ returnValue += tempBuffer[3]; +++ +++ number = quarterPoints * 4; +++ for (; number < num_points; number++) { +++ returnValue += (*aPtr++); +++ } +++ *result = returnValue; ++ } ++ #endif /* LV_HAVE_SSE */ ++ ++@@ -191,52 +194,54 @@ volk_32f_accumulator_s32f_a_sse(float* result, const float* inputBuffer, unsigne ++ #ifdef LV_HAVE_SSE ++ #include ++ ++-static inline void ++-volk_32f_accumulator_s32f_u_sse(float* result, const float* inputBuffer, unsigned int num_points) +++static inline void volk_32f_accumulator_s32f_u_sse(float* result, +++ const float* inputBuffer, +++ unsigned int num_points) ++ { ++- float returnValue = 0; ++- unsigned int number = 0; ++- const unsigned int quarterPoints = num_points / 4; ++- ++- const float* aPtr = inputBuffer; ++- __VOLK_ATTR_ALIGNED(16) float tempBuffer[4]; ++- ++- __m128 accumulator = _mm_setzero_ps(); ++- __m128 aVal = _mm_setzero_ps(); ++- ++- for(;number < quarterPoints; number++){ ++- aVal = _mm_load_ps(aPtr); ++- accumulator = _mm_add_ps(accumulator, aVal); ++- aPtr += 4; ++- } ++- ++- _mm_store_ps(tempBuffer,accumulator); ++- ++- returnValue = tempBuffer[0]; ++- returnValue += tempBuffer[1]; ++- returnValue += tempBuffer[2]; ++- returnValue += tempBuffer[3]; ++- ++- number = quarterPoints * 4; ++- for(;number < num_points; number++){ ++- returnValue += (*aPtr++); ++- } ++- *result = returnValue; +++ float returnValue = 0; +++ unsigned int number = 0; +++ const unsigned int quarterPoints = num_points / 4; +++ +++ const float* aPtr = inputBuffer; +++ __VOLK_ATTR_ALIGNED(16) float tempBuffer[4]; +++ +++ __m128 accumulator = _mm_setzero_ps(); +++ __m128 aVal = _mm_setzero_ps(); +++ +++ for (; number < quarterPoints; number++) { +++ aVal = _mm_load_ps(aPtr); +++ accumulator = _mm_add_ps(accumulator, aVal); +++ aPtr += 4; +++ } +++ +++ _mm_store_ps(tempBuffer, accumulator); +++ +++ returnValue = tempBuffer[0]; +++ returnValue += tempBuffer[1]; +++ returnValue += tempBuffer[2]; +++ returnValue += tempBuffer[3]; +++ +++ number = quarterPoints * 4; +++ for (; number < num_points; number++) { +++ returnValue += (*aPtr++); +++ } +++ *result = returnValue; ++ } ++ #endif /* LV_HAVE_SSE */ ++ ++ #ifdef LV_HAVE_GENERIC ++-static inline void ++-volk_32f_accumulator_s32f_generic(float* result, const float* inputBuffer, unsigned int num_points) +++static inline void volk_32f_accumulator_s32f_generic(float* result, +++ const float* inputBuffer, +++ unsigned int num_points) ++ { ++- const float* aPtr = inputBuffer; ++- unsigned int number = 0; ++- float returnValue = 0; ++- ++- for(;number < num_points; number++){ ++- returnValue += (*aPtr++); ++- } ++- *result = returnValue; +++ const float* aPtr = inputBuffer; +++ unsigned int number = 0; +++ float returnValue = 0; +++ +++ for (; number < num_points; number++) { +++ returnValue += (*aPtr++); +++ } +++ *result = returnValue; ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++diff --git a/kernels/volk/volk_32f_acos_32f.h b/kernels/volk/volk_32f_acos_32f.h ++index 5c14c2f..92918ca 100644 ++--- a/kernels/volk/volk_32f_acos_32f.h +++++ b/kernels/volk/volk_32f_acos_32f.h ++@@ -67,11 +67,12 @@ ++ * \endcode ++ */ ++ ++-#include ++-#include ++ #include +++#include +++#include ++ ++-/* This is the number of terms of Taylor series to evaluate, increase this for more accuracy*/ +++/* This is the number of terms of Taylor series to evaluate, increase this for more +++ * accuracy*/ ++ #define ACOS_TERMS 2 ++ ++ #ifndef 
INCLUDED_volk_32f_acos_32f_a_H ++@@ -80,62 +81,68 @@ ++ #if LV_HAVE_AVX2 && LV_HAVE_FMA ++ #include ++ ++-static inline void ++-volk_32f_acos_32f_a_avx2_fma(float* bVector, const float* aVector, unsigned int num_points) +++static inline void volk_32f_acos_32f_a_avx2_fma(float* bVector, +++ const float* aVector, +++ unsigned int num_points) ++ { ++- float* bPtr = bVector; ++- const float* aPtr = aVector; ++- ++- unsigned int number = 0; ++- unsigned int eighthPoints = num_points / 8; ++- int i, j; ++- ++- __m256 aVal, d, pi, pio2, x, y, z, arccosine; ++- __m256 fzeroes, fones, ftwos, ffours, condition; ++- ++- pi = _mm256_set1_ps(3.14159265358979323846); ++- pio2 = _mm256_set1_ps(3.14159265358979323846/2); ++- fzeroes = _mm256_setzero_ps(); ++- fones = _mm256_set1_ps(1.0); ++- ftwos = _mm256_set1_ps(2.0); ++- ffours = _mm256_set1_ps(4.0); ++- ++- for(;number < eighthPoints; number++){ ++- aVal = _mm256_load_ps(aPtr); ++- d = aVal; ++- aVal = _mm256_div_ps(_mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal), _mm256_sub_ps(fones, aVal))), aVal); ++- z = aVal; ++- condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS); ++- z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition)); ++- condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS); ++- x = _mm256_add_ps(z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition)); ++- ++- for(i = 0; i < 2; i++) ++- x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_fmadd_ps(x, x,fones))); ++- x = _mm256_div_ps(fones, x); ++- y = fzeroes; ++- for(j = ACOS_TERMS - 1; j >=0 ; j--) ++- y = _mm256_fmadd_ps(y, _mm256_mul_ps(x, x), _mm256_set1_ps(pow(-1,j)/(2*j+1))); ++- ++- y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours)); ++- condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS); ++- ++- y = _mm256_add_ps(y, _mm256_and_ps(_mm256_fnmadd_ps(y, ftwos, pio2), condition)); ++- arccosine = y; ++- condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS); ++- arccosine = _mm256_sub_ps(arccosine, _mm256_and_ps(_mm256_mul_ps(arccosine, ftwos), condition)); ++- condition = _mm256_cmp_ps(d, fzeroes, _CMP_LT_OS); ++- arccosine = _mm256_add_ps(arccosine, _mm256_and_ps(pi, condition)); ++- ++- _mm256_store_ps(bPtr, arccosine); ++- aPtr += 8; ++- bPtr += 8; ++- } ++- ++- number = eighthPoints * 8; ++- for(;number < num_points; number++){ ++- *bPtr++ = acos(*aPtr++); ++- } +++ float* bPtr = bVector; +++ const float* aPtr = aVector; +++ +++ unsigned int number = 0; +++ unsigned int eighthPoints = num_points / 8; +++ int i, j; +++ +++ __m256 aVal, d, pi, pio2, x, y, z, arccosine; +++ __m256 fzeroes, fones, ftwos, ffours, condition; +++ +++ pi = _mm256_set1_ps(3.14159265358979323846); +++ pio2 = _mm256_set1_ps(3.14159265358979323846 / 2); +++ fzeroes = _mm256_setzero_ps(); +++ fones = _mm256_set1_ps(1.0); +++ ftwos = _mm256_set1_ps(2.0); +++ ffours = _mm256_set1_ps(4.0); +++ +++ for (; number < eighthPoints; number++) { +++ aVal = _mm256_load_ps(aPtr); +++ d = aVal; +++ aVal = _mm256_div_ps(_mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal), +++ _mm256_sub_ps(fones, aVal))), +++ aVal); +++ z = aVal; +++ condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS); +++ z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition)); +++ condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS); +++ x = _mm256_add_ps( +++ z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition)); +++ +++ for (i = 0; i < 2; i++) +++ x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_fmadd_ps(x, x, fones))); +++ x = _mm256_div_ps(fones, x); +++ y = fzeroes; +++ for (j = ACOS_TERMS - 1; j >= 0; j--) +++ 
y = _mm256_fmadd_ps( +++ y, _mm256_mul_ps(x, x), _mm256_set1_ps(pow(-1, j) / (2 * j + 1))); +++ +++ y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours)); +++ condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS); +++ +++ y = _mm256_add_ps(y, _mm256_and_ps(_mm256_fnmadd_ps(y, ftwos, pio2), condition)); +++ arccosine = y; +++ condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS); +++ arccosine = _mm256_sub_ps( +++ arccosine, _mm256_and_ps(_mm256_mul_ps(arccosine, ftwos), condition)); +++ condition = _mm256_cmp_ps(d, fzeroes, _CMP_LT_OS); +++ arccosine = _mm256_add_ps(arccosine, _mm256_and_ps(pi, condition)); +++ +++ _mm256_store_ps(bPtr, arccosine); +++ aPtr += 8; +++ bPtr += 8; +++ } +++ +++ number = eighthPoints * 8; +++ for (; number < num_points; number++) { +++ *bPtr++ = acos(*aPtr++); +++ } ++ } ++ ++ #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for aligned */ ++@@ -147,59 +154,66 @@ volk_32f_acos_32f_a_avx2_fma(float* bVector, const float* aVector, unsigned int ++ static inline void ++ volk_32f_acos_32f_a_avx(float* bVector, const float* aVector, unsigned int num_points) ++ { ++- float* bPtr = bVector; ++- const float* aPtr = aVector; ++- ++- unsigned int number = 0; ++- unsigned int eighthPoints = num_points / 8; ++- int i, j; ++- ++- __m256 aVal, d, pi, pio2, x, y, z, arccosine; ++- __m256 fzeroes, fones, ftwos, ffours, condition; ++- ++- pi = _mm256_set1_ps(3.14159265358979323846); ++- pio2 = _mm256_set1_ps(3.14159265358979323846/2); ++- fzeroes = _mm256_setzero_ps(); ++- fones = _mm256_set1_ps(1.0); ++- ftwos = _mm256_set1_ps(2.0); ++- ffours = _mm256_set1_ps(4.0); ++- ++- for(;number < eighthPoints; number++){ ++- aVal = _mm256_load_ps(aPtr); ++- d = aVal; ++- aVal = _mm256_div_ps(_mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal), _mm256_sub_ps(fones, aVal))), aVal); ++- z = aVal; ++- condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS); ++- z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition)); ++- condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS); ++- x = _mm256_add_ps(z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition)); ++- ++- for(i = 0; i < 2; i++) ++- x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_add_ps(fones, _mm256_mul_ps(x, x)))); ++- x = _mm256_div_ps(fones, x); ++- y = fzeroes; ++- for(j = ACOS_TERMS - 1; j >=0 ; j--) ++- y = _mm256_add_ps(_mm256_mul_ps(y, _mm256_mul_ps(x, x)), _mm256_set1_ps(pow(-1,j)/(2*j+1))); ++- ++- y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours)); ++- condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS); ++- ++- y = _mm256_add_ps(y, _mm256_and_ps(_mm256_sub_ps(pio2, _mm256_mul_ps(y, ftwos)), condition)); ++- arccosine = y; ++- condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS); ++- arccosine = _mm256_sub_ps(arccosine, _mm256_and_ps(_mm256_mul_ps(arccosine, ftwos), condition)); ++- condition = _mm256_cmp_ps(d, fzeroes, _CMP_LT_OS); ++- arccosine = _mm256_add_ps(arccosine, _mm256_and_ps(pi, condition)); ++- ++- _mm256_store_ps(bPtr, arccosine); ++- aPtr += 8; ++- bPtr += 8; ++- } ++- ++- number = eighthPoints * 8; ++- for(;number < num_points; number++){ ++- *bPtr++ = acos(*aPtr++); ++- } +++ float* bPtr = bVector; +++ const float* aPtr = aVector; +++ +++ unsigned int number = 0; +++ unsigned int eighthPoints = num_points / 8; +++ int i, j; +++ +++ __m256 aVal, d, pi, pio2, x, y, z, arccosine; +++ __m256 fzeroes, fones, ftwos, ffours, condition; +++ +++ pi = _mm256_set1_ps(3.14159265358979323846); +++ pio2 = _mm256_set1_ps(3.14159265358979323846 / 2); +++ fzeroes = _mm256_setzero_ps(); +++ fones = _mm256_set1_ps(1.0); +++ ftwos = 
_mm256_set1_ps(2.0); +++ ffours = _mm256_set1_ps(4.0); +++ +++ for (; number < eighthPoints; number++) { +++ aVal = _mm256_load_ps(aPtr); +++ d = aVal; +++ aVal = _mm256_div_ps(_mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal), +++ _mm256_sub_ps(fones, aVal))), +++ aVal); +++ z = aVal; +++ condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS); +++ z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition)); +++ condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS); +++ x = _mm256_add_ps( +++ z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition)); +++ +++ for (i = 0; i < 2; i++) +++ x = _mm256_add_ps(x, +++ _mm256_sqrt_ps(_mm256_add_ps(fones, _mm256_mul_ps(x, x)))); +++ x = _mm256_div_ps(fones, x); +++ y = fzeroes; +++ for (j = ACOS_TERMS - 1; j >= 0; j--) +++ y = _mm256_add_ps(_mm256_mul_ps(y, _mm256_mul_ps(x, x)), +++ _mm256_set1_ps(pow(-1, j) / (2 * j + 1))); +++ +++ y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours)); +++ condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS); +++ +++ y = _mm256_add_ps( +++ y, _mm256_and_ps(_mm256_sub_ps(pio2, _mm256_mul_ps(y, ftwos)), condition)); +++ arccosine = y; +++ condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS); +++ arccosine = _mm256_sub_ps( +++ arccosine, _mm256_and_ps(_mm256_mul_ps(arccosine, ftwos), condition)); +++ condition = _mm256_cmp_ps(d, fzeroes, _CMP_LT_OS); +++ arccosine = _mm256_add_ps(arccosine, _mm256_and_ps(pi, condition)); +++ +++ _mm256_store_ps(bPtr, arccosine); +++ aPtr += 8; +++ bPtr += 8; +++ } +++ +++ number = eighthPoints * 8; +++ for (; number < num_points; number++) { +++ *bPtr++ = acos(*aPtr++); +++ } ++ } ++ ++ #endif /* LV_HAVE_AVX2 for aligned */ ++@@ -210,59 +224,63 @@ volk_32f_acos_32f_a_avx(float* bVector, const float* aVector, unsigned int num_p ++ static inline void ++ volk_32f_acos_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int num_points) ++ { ++- float* bPtr = bVector; ++- const float* aPtr = aVector; ++- ++- unsigned int number = 0; ++- unsigned int quarterPoints = num_points / 4; ++- int i, j; ++- ++- __m128 aVal, d, pi, pio2, x, y, z, arccosine; ++- __m128 fzeroes, fones, ftwos, ffours, condition; ++- ++- pi = _mm_set1_ps(3.14159265358979323846); ++- pio2 = _mm_set1_ps(3.14159265358979323846/2); ++- fzeroes = _mm_setzero_ps(); ++- fones = _mm_set1_ps(1.0); ++- ftwos = _mm_set1_ps(2.0); ++- ffours = _mm_set1_ps(4.0); ++- ++- for(;number < quarterPoints; number++){ ++- aVal = _mm_load_ps(aPtr); ++- d = aVal; ++- aVal = _mm_div_ps(_mm_sqrt_ps(_mm_mul_ps(_mm_add_ps(fones, aVal), _mm_sub_ps(fones, aVal))), aVal); ++- z = aVal; ++- condition = _mm_cmplt_ps(z, fzeroes); ++- z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition)); ++- condition = _mm_cmplt_ps(z, fones); ++- x = _mm_add_ps(z, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z), condition)); ++- ++- for(i = 0; i < 2; i++) ++- x = _mm_add_ps(x, _mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x)))); ++- x = _mm_div_ps(fones, x); ++- y = fzeroes; ++- for(j = ACOS_TERMS - 1; j >=0 ; j--) ++- y = _mm_add_ps(_mm_mul_ps(y, _mm_mul_ps(x, x)), _mm_set1_ps(pow(-1,j)/(2*j+1))); ++- ++- y = _mm_mul_ps(y, _mm_mul_ps(x, ffours)); ++- condition = _mm_cmpgt_ps(z, fones); ++- ++- y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, ftwos)), condition)); ++- arccosine = y; ++- condition = _mm_cmplt_ps(aVal, fzeroes); ++- arccosine = _mm_sub_ps(arccosine, _mm_and_ps(_mm_mul_ps(arccosine, ftwos), condition)); ++- condition = _mm_cmplt_ps(d, fzeroes); ++- arccosine = _mm_add_ps(arccosine, _mm_and_ps(pi, condition)); ++- ++- 
_mm_store_ps(bPtr, arccosine); ++- aPtr += 4; ++- bPtr += 4; ++- } ++- ++- number = quarterPoints * 4; ++- for(;number < num_points; number++){ ++- *bPtr++ = acosf(*aPtr++); ++- } +++ float* bPtr = bVector; +++ const float* aPtr = aVector; +++ +++ unsigned int number = 0; +++ unsigned int quarterPoints = num_points / 4; +++ int i, j; +++ +++ __m128 aVal, d, pi, pio2, x, y, z, arccosine; +++ __m128 fzeroes, fones, ftwos, ffours, condition; +++ +++ pi = _mm_set1_ps(3.14159265358979323846); +++ pio2 = _mm_set1_ps(3.14159265358979323846 / 2); +++ fzeroes = _mm_setzero_ps(); +++ fones = _mm_set1_ps(1.0); +++ ftwos = _mm_set1_ps(2.0); +++ ffours = _mm_set1_ps(4.0); +++ +++ for (; number < quarterPoints; number++) { +++ aVal = _mm_load_ps(aPtr); +++ d = aVal; +++ aVal = _mm_div_ps( +++ _mm_sqrt_ps(_mm_mul_ps(_mm_add_ps(fones, aVal), _mm_sub_ps(fones, aVal))), +++ aVal); +++ z = aVal; +++ condition = _mm_cmplt_ps(z, fzeroes); +++ z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition)); +++ condition = _mm_cmplt_ps(z, fones); +++ x = _mm_add_ps(z, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z), condition)); +++ +++ for (i = 0; i < 2; i++) +++ x = _mm_add_ps(x, _mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x)))); +++ x = _mm_div_ps(fones, x); +++ y = fzeroes; +++ for (j = ACOS_TERMS - 1; j >= 0; j--) +++ y = _mm_add_ps(_mm_mul_ps(y, _mm_mul_ps(x, x)), +++ _mm_set1_ps(pow(-1, j) / (2 * j + 1))); +++ +++ y = _mm_mul_ps(y, _mm_mul_ps(x, ffours)); +++ condition = _mm_cmpgt_ps(z, fones); +++ +++ y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, ftwos)), condition)); +++ arccosine = y; +++ condition = _mm_cmplt_ps(aVal, fzeroes); +++ arccosine = +++ _mm_sub_ps(arccosine, _mm_and_ps(_mm_mul_ps(arccosine, ftwos), condition)); +++ condition = _mm_cmplt_ps(d, fzeroes); +++ arccosine = _mm_add_ps(arccosine, _mm_and_ps(pi, condition)); +++ +++ _mm_store_ps(bPtr, arccosine); +++ aPtr += 4; +++ bPtr += 4; +++ } +++ +++ number = quarterPoints * 4; +++ for (; number < num_points; number++) { +++ *bPtr++ = acosf(*aPtr++); +++ } ++ } ++ ++ #endif /* LV_HAVE_SSE4_1 for aligned */ ++@@ -276,62 +294,68 @@ volk_32f_acos_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int nu ++ #if LV_HAVE_AVX2 && LV_HAVE_FMA ++ #include ++ ++-static inline void ++-volk_32f_acos_32f_u_avx2_fma(float* bVector, const float* aVector, unsigned int num_points) +++static inline void volk_32f_acos_32f_u_avx2_fma(float* bVector, +++ const float* aVector, +++ unsigned int num_points) ++ { ++- float* bPtr = bVector; ++- const float* aPtr = aVector; ++- ++- unsigned int number = 0; ++- unsigned int eighthPoints = num_points / 8; ++- int i, j; ++- ++- __m256 aVal, d, pi, pio2, x, y, z, arccosine; ++- __m256 fzeroes, fones, ftwos, ffours, condition; ++- ++- pi = _mm256_set1_ps(3.14159265358979323846); ++- pio2 = _mm256_set1_ps(3.14159265358979323846/2); ++- fzeroes = _mm256_setzero_ps(); ++- fones = _mm256_set1_ps(1.0); ++- ftwos = _mm256_set1_ps(2.0); ++- ffours = _mm256_set1_ps(4.0); ++- ++- for(;number < eighthPoints; number++){ ++- aVal = _mm256_loadu_ps(aPtr); ++- d = aVal; ++- aVal = _mm256_div_ps(_mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal), _mm256_sub_ps(fones, aVal))), aVal); ++- z = aVal; ++- condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS); ++- z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition)); ++- condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS); ++- x = _mm256_add_ps(z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition)); ++- ++- for(i = 0; i < 2; i++) ++- x = 
_mm256_add_ps(x, _mm256_sqrt_ps(_mm256_fmadd_ps(x, x,fones))); ++- x = _mm256_div_ps(fones, x); ++- y = fzeroes; ++- for(j = ACOS_TERMS - 1; j >=0 ; j--) ++- y = _mm256_fmadd_ps(y, _mm256_mul_ps(x, x), _mm256_set1_ps(pow(-1,j)/(2*j+1))); ++- ++- y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours)); ++- condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS); ++- ++- y = _mm256_add_ps(y, _mm256_and_ps(_mm256_fnmadd_ps(y, ftwos, pio2), condition)); ++- arccosine = y; ++- condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS); ++- arccosine = _mm256_sub_ps(arccosine, _mm256_and_ps(_mm256_mul_ps(arccosine, ftwos), condition)); ++- condition = _mm256_cmp_ps(d, fzeroes, _CMP_LT_OS); ++- arccosine = _mm256_add_ps(arccosine, _mm256_and_ps(pi, condition)); ++- ++- _mm256_storeu_ps(bPtr, arccosine); ++- aPtr += 8; ++- bPtr += 8; ++- } ++- ++- number = eighthPoints * 8; ++- for(;number < num_points; number++){ ++- *bPtr++ = acos(*aPtr++); ++- } +++ float* bPtr = bVector; +++ const float* aPtr = aVector; +++ +++ unsigned int number = 0; +++ unsigned int eighthPoints = num_points / 8; +++ int i, j; +++ +++ __m256 aVal, d, pi, pio2, x, y, z, arccosine; +++ __m256 fzeroes, fones, ftwos, ffours, condition; +++ +++ pi = _mm256_set1_ps(3.14159265358979323846); +++ pio2 = _mm256_set1_ps(3.14159265358979323846 / 2); +++ fzeroes = _mm256_setzero_ps(); +++ fones = _mm256_set1_ps(1.0); +++ ftwos = _mm256_set1_ps(2.0); +++ ffours = _mm256_set1_ps(4.0); +++ +++ for (; number < eighthPoints; number++) { +++ aVal = _mm256_loadu_ps(aPtr); +++ d = aVal; +++ aVal = _mm256_div_ps(_mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal), +++ _mm256_sub_ps(fones, aVal))), +++ aVal); +++ z = aVal; +++ condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS); +++ z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition)); +++ condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS); +++ x = _mm256_add_ps( +++ z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition)); +++ +++ for (i = 0; i < 2; i++) +++ x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_fmadd_ps(x, x, fones))); +++ x = _mm256_div_ps(fones, x); +++ y = fzeroes; +++ for (j = ACOS_TERMS - 1; j >= 0; j--) +++ y = _mm256_fmadd_ps( +++ y, _mm256_mul_ps(x, x), _mm256_set1_ps(pow(-1, j) / (2 * j + 1))); +++ +++ y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours)); +++ condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS); +++ +++ y = _mm256_add_ps(y, _mm256_and_ps(_mm256_fnmadd_ps(y, ftwos, pio2), condition)); +++ arccosine = y; +++ condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS); +++ arccosine = _mm256_sub_ps( +++ arccosine, _mm256_and_ps(_mm256_mul_ps(arccosine, ftwos), condition)); +++ condition = _mm256_cmp_ps(d, fzeroes, _CMP_LT_OS); +++ arccosine = _mm256_add_ps(arccosine, _mm256_and_ps(pi, condition)); +++ +++ _mm256_storeu_ps(bPtr, arccosine); +++ aPtr += 8; +++ bPtr += 8; +++ } +++ +++ number = eighthPoints * 8; +++ for (; number < num_points; number++) { +++ *bPtr++ = acos(*aPtr++); +++ } ++ } ++ ++ #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for unaligned */ ++@@ -343,59 +367,66 @@ volk_32f_acos_32f_u_avx2_fma(float* bVector, const float* aVector, unsigned int ++ static inline void ++ volk_32f_acos_32f_u_avx(float* bVector, const float* aVector, unsigned int num_points) ++ { ++- float* bPtr = bVector; ++- const float* aPtr = aVector; ++- ++- unsigned int number = 0; ++- unsigned int eighthPoints = num_points / 8; ++- int i, j; ++- ++- __m256 aVal, d, pi, pio2, x, y, z, arccosine; ++- __m256 fzeroes, fones, ftwos, ffours, condition; ++- ++- pi = 
_mm256_set1_ps(3.14159265358979323846); ++- pio2 = _mm256_set1_ps(3.14159265358979323846/2); ++- fzeroes = _mm256_setzero_ps(); ++- fones = _mm256_set1_ps(1.0); ++- ftwos = _mm256_set1_ps(2.0); ++- ffours = _mm256_set1_ps(4.0); ++- ++- for(;number < eighthPoints; number++){ ++- aVal = _mm256_loadu_ps(aPtr); ++- d = aVal; ++- aVal = _mm256_div_ps(_mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal), _mm256_sub_ps(fones, aVal))), aVal); ++- z = aVal; ++- condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS); ++- z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition)); ++- condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS); ++- x = _mm256_add_ps(z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition)); ++- ++- for(i = 0; i < 2; i++) ++- x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_add_ps(fones, _mm256_mul_ps(x, x)))); ++- x = _mm256_div_ps(fones, x); ++- y = fzeroes; ++- for(j = ACOS_TERMS - 1; j >=0 ; j--) ++- y = _mm256_add_ps(_mm256_mul_ps(y, _mm256_mul_ps(x, x)), _mm256_set1_ps(pow(-1,j)/(2*j+1))); ++- ++- y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours)); ++- condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS); ++- ++- y = _mm256_add_ps(y, _mm256_and_ps(_mm256_sub_ps(pio2, _mm256_mul_ps(y, ftwos)), condition)); ++- arccosine = y; ++- condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS); ++- arccosine = _mm256_sub_ps(arccosine, _mm256_and_ps(_mm256_mul_ps(arccosine, ftwos), condition)); ++- condition = _mm256_cmp_ps(d, fzeroes, _CMP_LT_OS); ++- arccosine = _mm256_add_ps(arccosine, _mm256_and_ps(pi, condition)); ++- ++- _mm256_storeu_ps(bPtr, arccosine); ++- aPtr += 8; ++- bPtr += 8; ++- } ++- ++- number = eighthPoints * 8; ++- for(;number < num_points; number++){ ++- *bPtr++ = acos(*aPtr++); ++- } +++ float* bPtr = bVector; +++ const float* aPtr = aVector; +++ +++ unsigned int number = 0; +++ unsigned int eighthPoints = num_points / 8; +++ int i, j; +++ +++ __m256 aVal, d, pi, pio2, x, y, z, arccosine; +++ __m256 fzeroes, fones, ftwos, ffours, condition; +++ +++ pi = _mm256_set1_ps(3.14159265358979323846); +++ pio2 = _mm256_set1_ps(3.14159265358979323846 / 2); +++ fzeroes = _mm256_setzero_ps(); +++ fones = _mm256_set1_ps(1.0); +++ ftwos = _mm256_set1_ps(2.0); +++ ffours = _mm256_set1_ps(4.0); +++ +++ for (; number < eighthPoints; number++) { +++ aVal = _mm256_loadu_ps(aPtr); +++ d = aVal; +++ aVal = _mm256_div_ps(_mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal), +++ _mm256_sub_ps(fones, aVal))), +++ aVal); +++ z = aVal; +++ condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS); +++ z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition)); +++ condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS); +++ x = _mm256_add_ps( +++ z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition)); +++ +++ for (i = 0; i < 2; i++) +++ x = _mm256_add_ps(x, +++ _mm256_sqrt_ps(_mm256_add_ps(fones, _mm256_mul_ps(x, x)))); +++ x = _mm256_div_ps(fones, x); +++ y = fzeroes; +++ for (j = ACOS_TERMS - 1; j >= 0; j--) +++ y = _mm256_add_ps(_mm256_mul_ps(y, _mm256_mul_ps(x, x)), +++ _mm256_set1_ps(pow(-1, j) / (2 * j + 1))); +++ +++ y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours)); +++ condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS); +++ +++ y = _mm256_add_ps( +++ y, _mm256_and_ps(_mm256_sub_ps(pio2, _mm256_mul_ps(y, ftwos)), condition)); +++ arccosine = y; +++ condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS); +++ arccosine = _mm256_sub_ps( +++ arccosine, _mm256_and_ps(_mm256_mul_ps(arccosine, ftwos), condition)); +++ condition = _mm256_cmp_ps(d, fzeroes, _CMP_LT_OS); +++ 
arccosine = _mm256_add_ps(arccosine, _mm256_and_ps(pi, condition)); +++ +++ _mm256_storeu_ps(bPtr, arccosine); +++ aPtr += 8; +++ bPtr += 8; +++ } +++ +++ number = eighthPoints * 8; +++ for (; number < num_points; number++) { +++ *bPtr++ = acos(*aPtr++); +++ } ++ } ++ ++ #endif /* LV_HAVE_AVX2 for unaligned */ ++@@ -406,60 +437,64 @@ volk_32f_acos_32f_u_avx(float* bVector, const float* aVector, unsigned int num_p ++ static inline void ++ volk_32f_acos_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int num_points) ++ { ++- float* bPtr = bVector; ++- const float* aPtr = aVector; ++- ++- unsigned int number = 0; ++- unsigned int quarterPoints = num_points / 4; ++- int i, j; ++- ++- __m128 aVal, d, pi, pio2, x, y, z, arccosine; ++- __m128 fzeroes, fones, ftwos, ffours, condition; ++- ++- pi = _mm_set1_ps(3.14159265358979323846); ++- pio2 = _mm_set1_ps(3.14159265358979323846/2); ++- fzeroes = _mm_setzero_ps(); ++- fones = _mm_set1_ps(1.0); ++- ftwos = _mm_set1_ps(2.0); ++- ffours = _mm_set1_ps(4.0); ++- ++- for(;number < quarterPoints; number++){ ++- aVal = _mm_loadu_ps(aPtr); ++- d = aVal; ++- aVal = _mm_div_ps(_mm_sqrt_ps(_mm_mul_ps(_mm_add_ps(fones, aVal), _mm_sub_ps(fones, aVal))), aVal); ++- z = aVal; ++- condition = _mm_cmplt_ps(z, fzeroes); ++- z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition)); ++- condition = _mm_cmplt_ps(z, fones); ++- x = _mm_add_ps(z, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z), condition)); ++- ++- for(i = 0; i < 2; i++) ++- x = _mm_add_ps(x, _mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x)))); ++- x = _mm_div_ps(fones, x); ++- y = fzeroes; ++- ++- for(j = ACOS_TERMS - 1; j >=0 ; j--) ++- y = _mm_add_ps(_mm_mul_ps(y, _mm_mul_ps(x, x)), _mm_set1_ps(pow(-1,j)/(2*j+1))); ++- ++- y = _mm_mul_ps(y, _mm_mul_ps(x, ffours)); ++- condition = _mm_cmpgt_ps(z, fones); ++- ++- y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, ftwos)), condition)); ++- arccosine = y; ++- condition = _mm_cmplt_ps(aVal, fzeroes); ++- arccosine = _mm_sub_ps(arccosine, _mm_and_ps(_mm_mul_ps(arccosine, ftwos), condition)); ++- condition = _mm_cmplt_ps(d, fzeroes); ++- arccosine = _mm_add_ps(arccosine, _mm_and_ps(pi, condition)); ++- ++- _mm_storeu_ps(bPtr, arccosine); ++- aPtr += 4; ++- bPtr += 4; ++- } ++- ++- number = quarterPoints * 4; ++- for(;number < num_points; number++){ ++- *bPtr++ = acosf(*aPtr++); ++- } +++ float* bPtr = bVector; +++ const float* aPtr = aVector; +++ +++ unsigned int number = 0; +++ unsigned int quarterPoints = num_points / 4; +++ int i, j; +++ +++ __m128 aVal, d, pi, pio2, x, y, z, arccosine; +++ __m128 fzeroes, fones, ftwos, ffours, condition; +++ +++ pi = _mm_set1_ps(3.14159265358979323846); +++ pio2 = _mm_set1_ps(3.14159265358979323846 / 2); +++ fzeroes = _mm_setzero_ps(); +++ fones = _mm_set1_ps(1.0); +++ ftwos = _mm_set1_ps(2.0); +++ ffours = _mm_set1_ps(4.0); +++ +++ for (; number < quarterPoints; number++) { +++ aVal = _mm_loadu_ps(aPtr); +++ d = aVal; +++ aVal = _mm_div_ps( +++ _mm_sqrt_ps(_mm_mul_ps(_mm_add_ps(fones, aVal), _mm_sub_ps(fones, aVal))), +++ aVal); +++ z = aVal; +++ condition = _mm_cmplt_ps(z, fzeroes); +++ z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition)); +++ condition = _mm_cmplt_ps(z, fones); +++ x = _mm_add_ps(z, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z), condition)); +++ +++ for (i = 0; i < 2; i++) +++ x = _mm_add_ps(x, _mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x)))); +++ x = _mm_div_ps(fones, x); +++ y = fzeroes; +++ +++ for (j = ACOS_TERMS - 1; j >= 0; j--) +++ y = 
_mm_add_ps(_mm_mul_ps(y, _mm_mul_ps(x, x)), +++ _mm_set1_ps(pow(-1, j) / (2 * j + 1))); +++ +++ y = _mm_mul_ps(y, _mm_mul_ps(x, ffours)); +++ condition = _mm_cmpgt_ps(z, fones); +++ +++ y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, ftwos)), condition)); +++ arccosine = y; +++ condition = _mm_cmplt_ps(aVal, fzeroes); +++ arccosine = +++ _mm_sub_ps(arccosine, _mm_and_ps(_mm_mul_ps(arccosine, ftwos), condition)); +++ condition = _mm_cmplt_ps(d, fzeroes); +++ arccosine = _mm_add_ps(arccosine, _mm_and_ps(pi, condition)); +++ +++ _mm_storeu_ps(bPtr, arccosine); +++ aPtr += 4; +++ bPtr += 4; +++ } +++ +++ number = quarterPoints * 4; +++ for (; number < num_points; number++) { +++ *bPtr++ = acosf(*aPtr++); +++ } ++ } ++ ++ #endif /* LV_HAVE_SSE4_1 for aligned */ ++@@ -469,14 +504,13 @@ volk_32f_acos_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int nu ++ static inline void ++ volk_32f_acos_32f_generic(float* bVector, const float* aVector, unsigned int num_points) ++ { ++- float* bPtr = bVector; ++- const float* aPtr = aVector; ++- unsigned int number = 0; ++- ++- for(number = 0; number < num_points; number++){ ++- *bPtr++ = acosf(*aPtr++); ++- } +++ float* bPtr = bVector; +++ const float* aPtr = aVector; +++ unsigned int number = 0; ++ +++ for (number = 0; number < num_points; number++) { +++ *bPtr++ = acosf(*aPtr++); +++ } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++diff --git a/kernels/volk/volk_32f_asin_32f.h b/kernels/volk/volk_32f_asin_32f.h ++index 864cfcf..946d382 100644 ++--- a/kernels/volk/volk_32f_asin_32f.h +++++ b/kernels/volk/volk_32f_asin_32f.h ++@@ -67,11 +67,12 @@ ++ * \endcode ++ */ ++ ++-#include ++-#include ++ #include +++#include +++#include ++ ++-/* This is the number of terms of Taylor series to evaluate, increase this for more accuracy*/ +++/* This is the number of terms of Taylor series to evaluate, increase this for more +++ * accuracy*/ ++ #define ASIN_TERMS 2 ++ ++ #ifndef INCLUDED_volk_32f_asin_32f_a_H ++@@ -80,60 +81,66 @@ ++ #if LV_HAVE_AVX2 && LV_HAVE_FMA ++ #include ++ ++-static inline void ++-volk_32f_asin_32f_a_avx2_fma(float* bVector, const float* aVector, unsigned int num_points) +++static inline void volk_32f_asin_32f_a_avx2_fma(float* bVector, +++ const float* aVector, +++ unsigned int num_points) ++ { ++- float* bPtr = bVector; ++- const float* aPtr = aVector; ++- ++- unsigned int number = 0; ++- unsigned int eighthPoints = num_points / 8; ++- int i, j; ++- ++- __m256 aVal, pio2, x, y, z, arcsine; ++- __m256 fzeroes, fones, ftwos, ffours, condition; ++- ++- pio2 = _mm256_set1_ps(3.14159265358979323846/2); ++- fzeroes = _mm256_setzero_ps(); ++- fones = _mm256_set1_ps(1.0); ++- ftwos = _mm256_set1_ps(2.0); ++- ffours = _mm256_set1_ps(4.0); ++- ++- for(;number < eighthPoints; number++){ ++- aVal = _mm256_load_ps(aPtr); ++- aVal = _mm256_div_ps(aVal, _mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal), _mm256_sub_ps(fones, aVal)))); ++- z = aVal; ++- condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS); ++- z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition)); ++- condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS); ++- x = _mm256_add_ps(z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition)); ++- ++- for(i = 0; i < 2; i++){ ++- x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_fmadd_ps(x,x,fones))); +++ float* bPtr = bVector; +++ const float* aPtr = aVector; +++ +++ unsigned int number = 0; +++ unsigned int eighthPoints = num_points / 8; +++ int i, j; +++ +++ __m256 aVal, pio2, x, y, z, arcsine; +++ __m256 
fzeroes, fones, ftwos, ffours, condition; +++ +++ pio2 = _mm256_set1_ps(3.14159265358979323846 / 2); +++ fzeroes = _mm256_setzero_ps(); +++ fones = _mm256_set1_ps(1.0); +++ ftwos = _mm256_set1_ps(2.0); +++ ffours = _mm256_set1_ps(4.0); +++ +++ for (; number < eighthPoints; number++) { +++ aVal = _mm256_load_ps(aPtr); +++ aVal = _mm256_div_ps(aVal, +++ _mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal), +++ _mm256_sub_ps(fones, aVal)))); +++ z = aVal; +++ condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS); +++ z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition)); +++ condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS); +++ x = _mm256_add_ps( +++ z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition)); +++ +++ for (i = 0; i < 2; i++) { +++ x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_fmadd_ps(x, x, fones))); +++ } +++ x = _mm256_div_ps(fones, x); +++ y = fzeroes; +++ for (j = ASIN_TERMS - 1; j >= 0; j--) { +++ y = _mm256_fmadd_ps( +++ y, _mm256_mul_ps(x, x), _mm256_set1_ps(pow(-1, j) / (2 * j + 1))); +++ } +++ +++ y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours)); +++ condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS); +++ +++ y = _mm256_add_ps(y, _mm256_and_ps(_mm256_fnmadd_ps(y, ftwos, pio2), condition)); +++ arcsine = y; +++ condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS); +++ arcsine = _mm256_sub_ps(arcsine, +++ _mm256_and_ps(_mm256_mul_ps(arcsine, ftwos), condition)); +++ +++ _mm256_store_ps(bPtr, arcsine); +++ aPtr += 8; +++ bPtr += 8; ++ } ++- x = _mm256_div_ps(fones, x); ++- y = fzeroes; ++- for(j = ASIN_TERMS - 1; j >=0 ; j--){ ++- y = _mm256_fmadd_ps(y, _mm256_mul_ps(x, x), _mm256_set1_ps(pow(-1,j)/(2*j+1))); ++- } ++- ++- y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours)); ++- condition = _mm256_cmp_ps(z, fones,_CMP_GT_OS); ++- ++- y = _mm256_add_ps(y, _mm256_and_ps(_mm256_fnmadd_ps(y,ftwos,pio2), condition)); ++- arcsine = y; ++- condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS); ++- arcsine = _mm256_sub_ps(arcsine, _mm256_and_ps(_mm256_mul_ps(arcsine, ftwos), condition)); ++ ++- _mm256_store_ps(bPtr, arcsine); ++- aPtr += 8; ++- bPtr += 8; ++- } ++- ++- number = eighthPoints * 8; ++- for(;number < num_points; number++){ ++- *bPtr++ = asin(*aPtr++); ++- } +++ number = eighthPoints * 8; +++ for (; number < num_points; number++) { +++ *bPtr++ = asin(*aPtr++); +++ } ++ } ++ ++ #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for aligned */ ++@@ -145,57 +152,64 @@ volk_32f_asin_32f_a_avx2_fma(float* bVector, const float* aVector, unsigned int ++ static inline void ++ volk_32f_asin_32f_a_avx(float* bVector, const float* aVector, unsigned int num_points) ++ { ++- float* bPtr = bVector; ++- const float* aPtr = aVector; ++- ++- unsigned int number = 0; ++- unsigned int eighthPoints = num_points / 8; ++- int i, j; ++- ++- __m256 aVal, pio2, x, y, z, arcsine; ++- __m256 fzeroes, fones, ftwos, ffours, condition; ++- ++- pio2 = _mm256_set1_ps(3.14159265358979323846/2); ++- fzeroes = _mm256_setzero_ps(); ++- fones = _mm256_set1_ps(1.0); ++- ftwos = _mm256_set1_ps(2.0); ++- ffours = _mm256_set1_ps(4.0); ++- ++- for(;number < eighthPoints; number++){ ++- aVal = _mm256_load_ps(aPtr); ++- aVal = _mm256_div_ps(aVal, _mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal), _mm256_sub_ps(fones, aVal)))); ++- z = aVal; ++- condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS); ++- z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition)); ++- condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS); ++- x = _mm256_add_ps(z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), 
condition)); ++- ++- for(i = 0; i < 2; i++){ ++- x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_add_ps(fones, _mm256_mul_ps(x, x)))); +++ float* bPtr = bVector; +++ const float* aPtr = aVector; +++ +++ unsigned int number = 0; +++ unsigned int eighthPoints = num_points / 8; +++ int i, j; +++ +++ __m256 aVal, pio2, x, y, z, arcsine; +++ __m256 fzeroes, fones, ftwos, ffours, condition; +++ +++ pio2 = _mm256_set1_ps(3.14159265358979323846 / 2); +++ fzeroes = _mm256_setzero_ps(); +++ fones = _mm256_set1_ps(1.0); +++ ftwos = _mm256_set1_ps(2.0); +++ ffours = _mm256_set1_ps(4.0); +++ +++ for (; number < eighthPoints; number++) { +++ aVal = _mm256_load_ps(aPtr); +++ aVal = _mm256_div_ps(aVal, +++ _mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal), +++ _mm256_sub_ps(fones, aVal)))); +++ z = aVal; +++ condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS); +++ z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition)); +++ condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS); +++ x = _mm256_add_ps( +++ z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition)); +++ +++ for (i = 0; i < 2; i++) { +++ x = _mm256_add_ps(x, +++ _mm256_sqrt_ps(_mm256_add_ps(fones, _mm256_mul_ps(x, x)))); +++ } +++ x = _mm256_div_ps(fones, x); +++ y = fzeroes; +++ for (j = ASIN_TERMS - 1; j >= 0; j--) { +++ y = _mm256_add_ps(_mm256_mul_ps(y, _mm256_mul_ps(x, x)), +++ _mm256_set1_ps(pow(-1, j) / (2 * j + 1))); +++ } +++ +++ y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours)); +++ condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS); +++ +++ y = _mm256_add_ps( +++ y, _mm256_and_ps(_mm256_sub_ps(pio2, _mm256_mul_ps(y, ftwos)), condition)); +++ arcsine = y; +++ condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS); +++ arcsine = _mm256_sub_ps(arcsine, +++ _mm256_and_ps(_mm256_mul_ps(arcsine, ftwos), condition)); +++ +++ _mm256_store_ps(bPtr, arcsine); +++ aPtr += 8; +++ bPtr += 8; ++ } ++- x = _mm256_div_ps(fones, x); ++- y = fzeroes; ++- for(j = ASIN_TERMS - 1; j >=0 ; j--){ ++- y = _mm256_add_ps(_mm256_mul_ps(y, _mm256_mul_ps(x, x)), _mm256_set1_ps(pow(-1,j)/(2*j+1))); ++- } ++- ++- y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours)); ++- condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS); ++- ++- y = _mm256_add_ps(y, _mm256_and_ps(_mm256_sub_ps(pio2, _mm256_mul_ps(y, ftwos)), condition)); ++- arcsine = y; ++- condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS); ++- arcsine = _mm256_sub_ps(arcsine, _mm256_and_ps(_mm256_mul_ps(arcsine, ftwos), condition)); ++ ++- _mm256_store_ps(bPtr, arcsine); ++- aPtr += 8; ++- bPtr += 8; ++- } ++- ++- number = eighthPoints * 8; ++- for(;number < num_points; number++){ ++- *bPtr++ = asin(*aPtr++); ++- } +++ number = eighthPoints * 8; +++ for (; number < num_points; number++) { +++ *bPtr++ = asin(*aPtr++); +++ } ++ } ++ ++ #endif /* LV_HAVE_AVX for aligned */ ++@@ -206,57 +220,60 @@ volk_32f_asin_32f_a_avx(float* bVector, const float* aVector, unsigned int num_p ++ static inline void ++ volk_32f_asin_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int num_points) ++ { ++- float* bPtr = bVector; ++- const float* aPtr = aVector; ++- ++- unsigned int number = 0; ++- unsigned int quarterPoints = num_points / 4; ++- int i, j; ++- ++- __m128 aVal, pio2, x, y, z, arcsine; ++- __m128 fzeroes, fones, ftwos, ffours, condition; ++- ++- pio2 = _mm_set1_ps(3.14159265358979323846/2); ++- fzeroes = _mm_setzero_ps(); ++- fones = _mm_set1_ps(1.0); ++- ftwos = _mm_set1_ps(2.0); ++- ffours = _mm_set1_ps(4.0); ++- ++- for(;number < quarterPoints; number++){ ++- aVal = _mm_load_ps(aPtr); ++- aVal = 
_mm_div_ps(aVal, _mm_sqrt_ps(_mm_mul_ps(_mm_add_ps(fones, aVal), _mm_sub_ps(fones, aVal)))); ++- z = aVal; ++- condition = _mm_cmplt_ps(z, fzeroes); ++- z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition)); ++- condition = _mm_cmplt_ps(z, fones); ++- x = _mm_add_ps(z, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z), condition)); ++- ++- for(i = 0; i < 2; i++){ ++- x = _mm_add_ps(x, _mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x)))); +++ float* bPtr = bVector; +++ const float* aPtr = aVector; +++ +++ unsigned int number = 0; +++ unsigned int quarterPoints = num_points / 4; +++ int i, j; +++ +++ __m128 aVal, pio2, x, y, z, arcsine; +++ __m128 fzeroes, fones, ftwos, ffours, condition; +++ +++ pio2 = _mm_set1_ps(3.14159265358979323846 / 2); +++ fzeroes = _mm_setzero_ps(); +++ fones = _mm_set1_ps(1.0); +++ ftwos = _mm_set1_ps(2.0); +++ ffours = _mm_set1_ps(4.0); +++ +++ for (; number < quarterPoints; number++) { +++ aVal = _mm_load_ps(aPtr); +++ aVal = _mm_div_ps( +++ aVal, +++ _mm_sqrt_ps(_mm_mul_ps(_mm_add_ps(fones, aVal), _mm_sub_ps(fones, aVal)))); +++ z = aVal; +++ condition = _mm_cmplt_ps(z, fzeroes); +++ z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition)); +++ condition = _mm_cmplt_ps(z, fones); +++ x = _mm_add_ps(z, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z), condition)); +++ +++ for (i = 0; i < 2; i++) { +++ x = _mm_add_ps(x, _mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x)))); +++ } +++ x = _mm_div_ps(fones, x); +++ y = fzeroes; +++ for (j = ASIN_TERMS - 1; j >= 0; j--) { +++ y = _mm_add_ps(_mm_mul_ps(y, _mm_mul_ps(x, x)), +++ _mm_set1_ps(pow(-1, j) / (2 * j + 1))); +++ } +++ +++ y = _mm_mul_ps(y, _mm_mul_ps(x, ffours)); +++ condition = _mm_cmpgt_ps(z, fones); +++ +++ y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, ftwos)), condition)); +++ arcsine = y; +++ condition = _mm_cmplt_ps(aVal, fzeroes); +++ arcsine = _mm_sub_ps(arcsine, _mm_and_ps(_mm_mul_ps(arcsine, ftwos), condition)); +++ +++ _mm_store_ps(bPtr, arcsine); +++ aPtr += 4; +++ bPtr += 4; ++ } ++- x = _mm_div_ps(fones, x); ++- y = fzeroes; ++- for(j = ASIN_TERMS - 1; j >=0 ; j--){ ++- y = _mm_add_ps(_mm_mul_ps(y, _mm_mul_ps(x, x)), _mm_set1_ps(pow(-1,j)/(2*j+1))); ++- } ++- ++- y = _mm_mul_ps(y, _mm_mul_ps(x, ffours)); ++- condition = _mm_cmpgt_ps(z, fones); ++- ++- y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, ftwos)), condition)); ++- arcsine = y; ++- condition = _mm_cmplt_ps(aVal, fzeroes); ++- arcsine = _mm_sub_ps(arcsine, _mm_and_ps(_mm_mul_ps(arcsine, ftwos), condition)); ++- ++- _mm_store_ps(bPtr, arcsine); ++- aPtr += 4; ++- bPtr += 4; ++- } ++ ++- number = quarterPoints * 4; ++- for(;number < num_points; number++){ ++- *bPtr++ = asinf(*aPtr++); ++- } +++ number = quarterPoints * 4; +++ for (; number < num_points; number++) { +++ *bPtr++ = asinf(*aPtr++); +++ } ++ } ++ ++ #endif /* LV_HAVE_SSE4_1 for aligned */ ++@@ -269,60 +286,66 @@ volk_32f_asin_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int nu ++ #if LV_HAVE_AVX2 && LV_HAVE_FMA ++ #include ++ ++-static inline void ++-volk_32f_asin_32f_u_avx2_fma(float* bVector, const float* aVector, unsigned int num_points) +++static inline void volk_32f_asin_32f_u_avx2_fma(float* bVector, +++ const float* aVector, +++ unsigned int num_points) ++ { ++- float* bPtr = bVector; ++- const float* aPtr = aVector; ++- ++- unsigned int number = 0; ++- unsigned int eighthPoints = num_points / 8; ++- int i, j; ++- ++- __m256 aVal, pio2, x, y, z, arcsine; ++- __m256 fzeroes, fones, ftwos, ffours, condition; ++- ++- pio2 = 
_mm256_set1_ps(3.14159265358979323846/2); ++- fzeroes = _mm256_setzero_ps(); ++- fones = _mm256_set1_ps(1.0); ++- ftwos = _mm256_set1_ps(2.0); ++- ffours = _mm256_set1_ps(4.0); ++- ++- for(;number < eighthPoints; number++){ ++- aVal = _mm256_loadu_ps(aPtr); ++- aVal = _mm256_div_ps(aVal, _mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal), _mm256_sub_ps(fones, aVal)))); ++- z = aVal; ++- condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS); ++- z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition)); ++- condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS); ++- x = _mm256_add_ps(z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition)); ++- ++- for(i = 0; i < 2; i++){ ++- x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_fmadd_ps(x,x,fones))); ++- } ++- x = _mm256_div_ps(fones, x); ++- y = fzeroes; ++- for(j = ASIN_TERMS - 1; j >=0 ; j--){ ++- y = _mm256_fmadd_ps(y, _mm256_mul_ps(x, x), _mm256_set1_ps(pow(-1,j)/(2*j+1))); +++ float* bPtr = bVector; +++ const float* aPtr = aVector; +++ +++ unsigned int number = 0; +++ unsigned int eighthPoints = num_points / 8; +++ int i, j; +++ +++ __m256 aVal, pio2, x, y, z, arcsine; +++ __m256 fzeroes, fones, ftwos, ffours, condition; +++ +++ pio2 = _mm256_set1_ps(3.14159265358979323846 / 2); +++ fzeroes = _mm256_setzero_ps(); +++ fones = _mm256_set1_ps(1.0); +++ ftwos = _mm256_set1_ps(2.0); +++ ffours = _mm256_set1_ps(4.0); +++ +++ for (; number < eighthPoints; number++) { +++ aVal = _mm256_loadu_ps(aPtr); +++ aVal = _mm256_div_ps(aVal, +++ _mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal), +++ _mm256_sub_ps(fones, aVal)))); +++ z = aVal; +++ condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS); +++ z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition)); +++ condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS); +++ x = _mm256_add_ps( +++ z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition)); +++ +++ for (i = 0; i < 2; i++) { +++ x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_fmadd_ps(x, x, fones))); +++ } +++ x = _mm256_div_ps(fones, x); +++ y = fzeroes; +++ for (j = ASIN_TERMS - 1; j >= 0; j--) { +++ y = _mm256_fmadd_ps( +++ y, _mm256_mul_ps(x, x), _mm256_set1_ps(pow(-1, j) / (2 * j + 1))); +++ } +++ +++ y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours)); +++ condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS); +++ +++ y = _mm256_add_ps(y, _mm256_and_ps(_mm256_fnmadd_ps(y, ftwos, pio2), condition)); +++ arcsine = y; +++ condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS); +++ arcsine = _mm256_sub_ps(arcsine, +++ _mm256_and_ps(_mm256_mul_ps(arcsine, ftwos), condition)); +++ +++ _mm256_storeu_ps(bPtr, arcsine); +++ aPtr += 8; +++ bPtr += 8; ++ } ++ ++- y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours)); ++- condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS); ++- ++- y = _mm256_add_ps(y, _mm256_and_ps(_mm256_fnmadd_ps(y,ftwos,pio2), condition)); ++- arcsine = y; ++- condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS); ++- arcsine = _mm256_sub_ps(arcsine, _mm256_and_ps(_mm256_mul_ps(arcsine, ftwos), condition)); ++- ++- _mm256_storeu_ps(bPtr, arcsine); ++- aPtr += 8; ++- bPtr += 8; ++- } ++- ++- number = eighthPoints * 8; ++- for(;number < num_points; number++){ ++- *bPtr++ = asin(*aPtr++); ++- } +++ number = eighthPoints * 8; +++ for (; number < num_points; number++) { +++ *bPtr++ = asin(*aPtr++); +++ } ++ } ++ ++ #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for unaligned */ ++@@ -334,57 +357,64 @@ volk_32f_asin_32f_u_avx2_fma(float* bVector, const float* aVector, unsigned int ++ static inline void ++ 
volk_32f_asin_32f_u_avx(float* bVector, const float* aVector, unsigned int num_points) ++ { ++- float* bPtr = bVector; ++- const float* aPtr = aVector; ++- ++- unsigned int number = 0; ++- unsigned int eighthPoints = num_points / 8; ++- int i, j; ++- ++- __m256 aVal, pio2, x, y, z, arcsine; ++- __m256 fzeroes, fones, ftwos, ffours, condition; ++- ++- pio2 = _mm256_set1_ps(3.14159265358979323846/2); ++- fzeroes = _mm256_setzero_ps(); ++- fones = _mm256_set1_ps(1.0); ++- ftwos = _mm256_set1_ps(2.0); ++- ffours = _mm256_set1_ps(4.0); ++- ++- for(;number < eighthPoints; number++){ ++- aVal = _mm256_loadu_ps(aPtr); ++- aVal = _mm256_div_ps(aVal, _mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal), _mm256_sub_ps(fones, aVal)))); ++- z = aVal; ++- condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS); ++- z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition)); ++- condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS); ++- x = _mm256_add_ps(z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition)); ++- ++- for(i = 0; i < 2; i++){ ++- x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_add_ps(fones, _mm256_mul_ps(x, x)))); +++ float* bPtr = bVector; +++ const float* aPtr = aVector; +++ +++ unsigned int number = 0; +++ unsigned int eighthPoints = num_points / 8; +++ int i, j; +++ +++ __m256 aVal, pio2, x, y, z, arcsine; +++ __m256 fzeroes, fones, ftwos, ffours, condition; +++ +++ pio2 = _mm256_set1_ps(3.14159265358979323846 / 2); +++ fzeroes = _mm256_setzero_ps(); +++ fones = _mm256_set1_ps(1.0); +++ ftwos = _mm256_set1_ps(2.0); +++ ffours = _mm256_set1_ps(4.0); +++ +++ for (; number < eighthPoints; number++) { +++ aVal = _mm256_loadu_ps(aPtr); +++ aVal = _mm256_div_ps(aVal, +++ _mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal), +++ _mm256_sub_ps(fones, aVal)))); +++ z = aVal; +++ condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS); +++ z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition)); +++ condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS); +++ x = _mm256_add_ps( +++ z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition)); +++ +++ for (i = 0; i < 2; i++) { +++ x = _mm256_add_ps(x, +++ _mm256_sqrt_ps(_mm256_add_ps(fones, _mm256_mul_ps(x, x)))); +++ } +++ x = _mm256_div_ps(fones, x); +++ y = fzeroes; +++ for (j = ASIN_TERMS - 1; j >= 0; j--) { +++ y = _mm256_add_ps(_mm256_mul_ps(y, _mm256_mul_ps(x, x)), +++ _mm256_set1_ps(pow(-1, j) / (2 * j + 1))); +++ } +++ +++ y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours)); +++ condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS); +++ +++ y = _mm256_add_ps( +++ y, _mm256_and_ps(_mm256_sub_ps(pio2, _mm256_mul_ps(y, ftwos)), condition)); +++ arcsine = y; +++ condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS); +++ arcsine = _mm256_sub_ps(arcsine, +++ _mm256_and_ps(_mm256_mul_ps(arcsine, ftwos), condition)); +++ +++ _mm256_storeu_ps(bPtr, arcsine); +++ aPtr += 8; +++ bPtr += 8; ++ } ++- x = _mm256_div_ps(fones, x); ++- y = fzeroes; ++- for(j = ASIN_TERMS - 1; j >=0 ; j--){ ++- y = _mm256_add_ps(_mm256_mul_ps(y, _mm256_mul_ps(x, x)), _mm256_set1_ps(pow(-1,j)/(2*j+1))); ++- } ++- ++- y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours)); ++- condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS); ++ ++- y = _mm256_add_ps(y, _mm256_and_ps(_mm256_sub_ps(pio2, _mm256_mul_ps(y, ftwos)), condition)); ++- arcsine = y; ++- condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS); ++- arcsine = _mm256_sub_ps(arcsine, _mm256_and_ps(_mm256_mul_ps(arcsine, ftwos), condition)); ++- ++- _mm256_storeu_ps(bPtr, arcsine); ++- aPtr += 8; ++- bPtr += 8; 
++- } ++- ++- number = eighthPoints * 8; ++- for(;number < num_points; number++){ ++- *bPtr++ = asin(*aPtr++); ++- } +++ number = eighthPoints * 8; +++ for (; number < num_points; number++) { +++ *bPtr++ = asin(*aPtr++); +++ } ++ } ++ ++ #endif /* LV_HAVE_AVX for unaligned */ ++@@ -396,57 +426,60 @@ volk_32f_asin_32f_u_avx(float* bVector, const float* aVector, unsigned int num_p ++ static inline void ++ volk_32f_asin_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int num_points) ++ { ++- float* bPtr = bVector; ++- const float* aPtr = aVector; ++- ++- unsigned int number = 0; ++- unsigned int quarterPoints = num_points / 4; ++- int i, j; ++- ++- __m128 aVal, pio2, x, y, z, arcsine; ++- __m128 fzeroes, fones, ftwos, ffours, condition; ++- ++- pio2 = _mm_set1_ps(3.14159265358979323846/2); ++- fzeroes = _mm_setzero_ps(); ++- fones = _mm_set1_ps(1.0); ++- ftwos = _mm_set1_ps(2.0); ++- ffours = _mm_set1_ps(4.0); ++- ++- for(;number < quarterPoints; number++){ ++- aVal = _mm_loadu_ps(aPtr); ++- aVal = _mm_div_ps(aVal, _mm_sqrt_ps(_mm_mul_ps(_mm_add_ps(fones, aVal), _mm_sub_ps(fones, aVal)))); ++- z = aVal; ++- condition = _mm_cmplt_ps(z, fzeroes); ++- z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition)); ++- condition = _mm_cmplt_ps(z, fones); ++- x = _mm_add_ps(z, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z), condition)); ++- ++- for(i = 0; i < 2; i++){ ++- x = _mm_add_ps(x, _mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x)))); +++ float* bPtr = bVector; +++ const float* aPtr = aVector; +++ +++ unsigned int number = 0; +++ unsigned int quarterPoints = num_points / 4; +++ int i, j; +++ +++ __m128 aVal, pio2, x, y, z, arcsine; +++ __m128 fzeroes, fones, ftwos, ffours, condition; +++ +++ pio2 = _mm_set1_ps(3.14159265358979323846 / 2); +++ fzeroes = _mm_setzero_ps(); +++ fones = _mm_set1_ps(1.0); +++ ftwos = _mm_set1_ps(2.0); +++ ffours = _mm_set1_ps(4.0); +++ +++ for (; number < quarterPoints; number++) { +++ aVal = _mm_loadu_ps(aPtr); +++ aVal = _mm_div_ps( +++ aVal, +++ _mm_sqrt_ps(_mm_mul_ps(_mm_add_ps(fones, aVal), _mm_sub_ps(fones, aVal)))); +++ z = aVal; +++ condition = _mm_cmplt_ps(z, fzeroes); +++ z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition)); +++ condition = _mm_cmplt_ps(z, fones); +++ x = _mm_add_ps(z, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z), condition)); +++ +++ for (i = 0; i < 2; i++) { +++ x = _mm_add_ps(x, _mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x)))); +++ } +++ x = _mm_div_ps(fones, x); +++ y = fzeroes; +++ for (j = ASIN_TERMS - 1; j >= 0; j--) { +++ y = _mm_add_ps(_mm_mul_ps(y, _mm_mul_ps(x, x)), +++ _mm_set1_ps(pow(-1, j) / (2 * j + 1))); +++ } +++ +++ y = _mm_mul_ps(y, _mm_mul_ps(x, ffours)); +++ condition = _mm_cmpgt_ps(z, fones); +++ +++ y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, ftwos)), condition)); +++ arcsine = y; +++ condition = _mm_cmplt_ps(aVal, fzeroes); +++ arcsine = _mm_sub_ps(arcsine, _mm_and_ps(_mm_mul_ps(arcsine, ftwos), condition)); +++ +++ _mm_storeu_ps(bPtr, arcsine); +++ aPtr += 4; +++ bPtr += 4; ++ } ++- x = _mm_div_ps(fones, x); ++- y = fzeroes; ++- for(j = ASIN_TERMS - 1; j >=0 ; j--){ ++- y = _mm_add_ps(_mm_mul_ps(y, _mm_mul_ps(x, x)), _mm_set1_ps(pow(-1,j)/(2*j+1))); ++- } ++- ++- y = _mm_mul_ps(y, _mm_mul_ps(x, ffours)); ++- condition = _mm_cmpgt_ps(z, fones); ++ ++- y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, ftwos)), condition)); ++- arcsine = y; ++- condition = _mm_cmplt_ps(aVal, fzeroes); ++- arcsine = _mm_sub_ps(arcsine, _mm_and_ps(_mm_mul_ps(arcsine, ftwos), 
condition)); ++- ++- _mm_storeu_ps(bPtr, arcsine); ++- aPtr += 4; ++- bPtr += 4; ++- } ++- ++- number = quarterPoints * 4; ++- for(;number < num_points; number++){ ++- *bPtr++ = asinf(*aPtr++); ++- } +++ number = quarterPoints * 4; +++ for (; number < num_points; number++) { +++ *bPtr++ = asinf(*aPtr++); +++ } ++ } ++ ++ #endif /* LV_HAVE_SSE4_1 for unaligned */ ++@@ -456,13 +489,13 @@ volk_32f_asin_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int nu ++ static inline void ++ volk_32f_asin_32f_u_generic(float* bVector, const float* aVector, unsigned int num_points) ++ { ++- float* bPtr = bVector; ++- const float* aPtr = aVector; ++- unsigned int number = 0; +++ float* bPtr = bVector; +++ const float* aPtr = aVector; +++ unsigned int number = 0; ++ ++- for(number = 0; number < num_points; number++){ ++- *bPtr++ = asinf(*aPtr++); ++- } +++ for (number = 0; number < num_points; number++) { +++ *bPtr++ = asinf(*aPtr++); +++ } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++diff --git a/kernels/volk/volk_32f_atan_32f.h b/kernels/volk/volk_32f_atan_32f.h ++index 3496f0e..6652ee8 100644 ++--- a/kernels/volk/volk_32f_atan_32f.h +++++ b/kernels/volk/volk_32f_atan_32f.h ++@@ -67,11 +67,12 @@ ++ * \endcode ++ */ ++ ++-#include ++-#include ++ #include +++#include +++#include ++ ++-/* This is the number of terms of Taylor series to evaluate, increase this for more accuracy*/ +++/* This is the number of terms of Taylor series to evaluate, increase this for more +++ * accuracy*/ ++ #define TERMS 2 ++ ++ #ifndef INCLUDED_volk_32f_atan_32f_a_H ++@@ -80,59 +81,63 @@ ++ #if LV_HAVE_AVX2 && LV_HAVE_FMA ++ #include ++ ++-static inline void ++-volk_32f_atan_32f_a_avx2_fma(float* bVector, const float* aVector, unsigned int num_points) +++static inline void volk_32f_atan_32f_a_avx2_fma(float* bVector, +++ const float* aVector, +++ unsigned int num_points) ++ { ++- float* bPtr = bVector; ++- const float* aPtr = aVector; ++- ++- unsigned int number = 0; ++- unsigned int eighthPoints = num_points / 8; ++- int i, j; ++- ++- __m256 aVal, pio2, x, y, z, arctangent; ++- __m256 fzeroes, fones, ftwos, ffours, condition; ++- ++- pio2 = _mm256_set1_ps(3.14159265358979323846/2); ++- fzeroes = _mm256_setzero_ps(); ++- fones = _mm256_set1_ps(1.0); ++- ftwos = _mm256_set1_ps(2.0); ++- ffours = _mm256_set1_ps(4.0); ++- ++- for(;number < eighthPoints; number++){ ++- aVal = _mm256_load_ps(aPtr); ++- z = aVal; ++- condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS); ++- z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition)); ++- condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS); ++- x = _mm256_add_ps(z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition)); ++- ++- for(i = 0; i < 2; i++){ ++- x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_fmadd_ps(x,x,fones))); +++ float* bPtr = bVector; +++ const float* aPtr = aVector; +++ +++ unsigned int number = 0; +++ unsigned int eighthPoints = num_points / 8; +++ int i, j; +++ +++ __m256 aVal, pio2, x, y, z, arctangent; +++ __m256 fzeroes, fones, ftwos, ffours, condition; +++ +++ pio2 = _mm256_set1_ps(3.14159265358979323846 / 2); +++ fzeroes = _mm256_setzero_ps(); +++ fones = _mm256_set1_ps(1.0); +++ ftwos = _mm256_set1_ps(2.0); +++ ffours = _mm256_set1_ps(4.0); +++ +++ for (; number < eighthPoints; number++) { +++ aVal = _mm256_load_ps(aPtr); +++ z = aVal; +++ condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS); +++ z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition)); +++ condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS); +++ x = 
_mm256_add_ps( +++ z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition)); +++ +++ for (i = 0; i < 2; i++) { +++ x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_fmadd_ps(x, x, fones))); +++ } +++ x = _mm256_div_ps(fones, x); +++ y = fzeroes; +++ for (j = TERMS - 1; j >= 0; j--) { +++ y = _mm256_fmadd_ps( +++ y, _mm256_mul_ps(x, x), _mm256_set1_ps(pow(-1, j) / (2 * j + 1))); +++ } +++ +++ y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours)); +++ condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS); +++ +++ y = _mm256_add_ps(y, _mm256_and_ps(_mm256_fnmadd_ps(y, ftwos, pio2), condition)); +++ arctangent = y; +++ condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS); +++ arctangent = _mm256_sub_ps( +++ arctangent, _mm256_and_ps(_mm256_mul_ps(arctangent, ftwos), condition)); +++ +++ _mm256_store_ps(bPtr, arctangent); +++ aPtr += 8; +++ bPtr += 8; ++ } ++- x = _mm256_div_ps(fones, x); ++- y = fzeroes; ++- for(j = TERMS - 1; j >=0 ; j--){ ++- y = _mm256_fmadd_ps(y, _mm256_mul_ps(x, x), _mm256_set1_ps(pow(-1,j)/(2*j+1))); ++- } ++- ++- y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours)); ++- condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS); ++- ++- y = _mm256_add_ps(y, _mm256_and_ps(_mm256_fnmadd_ps(y,ftwos,pio2), condition)); ++- arctangent = y; ++- condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS); ++- arctangent = _mm256_sub_ps(arctangent, _mm256_and_ps(_mm256_mul_ps(arctangent, ftwos), condition)); ++- ++- _mm256_store_ps(bPtr, arctangent); ++- aPtr += 8; ++- bPtr += 8; ++- } ++ ++- number = eighthPoints * 8; ++- for(;number < num_points; number++){ ++- *bPtr++ = atan(*aPtr++); ++- } +++ number = eighthPoints * 8; +++ for (; number < num_points; number++) { +++ *bPtr++ = atan(*aPtr++); +++ } ++ } ++ ++ #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for aligned */ ++@@ -144,56 +149,61 @@ volk_32f_atan_32f_a_avx2_fma(float* bVector, const float* aVector, unsigned int ++ static inline void ++ volk_32f_atan_32f_a_avx(float* bVector, const float* aVector, unsigned int num_points) ++ { ++- float* bPtr = bVector; ++- const float* aPtr = aVector; ++- ++- unsigned int number = 0; ++- unsigned int eighthPoints = num_points / 8; ++- int i, j; ++- ++- __m256 aVal, pio2, x, y, z, arctangent; ++- __m256 fzeroes, fones, ftwos, ffours, condition; ++- ++- pio2 = _mm256_set1_ps(3.14159265358979323846/2); ++- fzeroes = _mm256_setzero_ps(); ++- fones = _mm256_set1_ps(1.0); ++- ftwos = _mm256_set1_ps(2.0); ++- ffours = _mm256_set1_ps(4.0); ++- ++- for(;number < eighthPoints; number++){ ++- aVal = _mm256_load_ps(aPtr); ++- z = aVal; ++- condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS); ++- z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition)); ++- condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS); ++- x = _mm256_add_ps(z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition)); ++- ++- for(i = 0; i < 2; i++){ ++- x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_add_ps(fones, _mm256_mul_ps(x, x)))); ++- } ++- x = _mm256_div_ps(fones, x); ++- y = fzeroes; ++- for(j = TERMS - 1; j >=0 ; j--){ ++- y = _mm256_add_ps(_mm256_mul_ps(y, _mm256_mul_ps(x, x)), _mm256_set1_ps(pow(-1,j)/(2*j+1))); +++ float* bPtr = bVector; +++ const float* aPtr = aVector; +++ +++ unsigned int number = 0; +++ unsigned int eighthPoints = num_points / 8; +++ int i, j; +++ +++ __m256 aVal, pio2, x, y, z, arctangent; +++ __m256 fzeroes, fones, ftwos, ffours, condition; +++ +++ pio2 = _mm256_set1_ps(3.14159265358979323846 / 2); +++ fzeroes = _mm256_setzero_ps(); +++ fones = _mm256_set1_ps(1.0); +++ ftwos = _mm256_set1_ps(2.0); +++ ffours 
= _mm256_set1_ps(4.0); +++ +++ for (; number < eighthPoints; number++) { +++ aVal = _mm256_load_ps(aPtr); +++ z = aVal; +++ condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS); +++ z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition)); +++ condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS); +++ x = _mm256_add_ps( +++ z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition)); +++ +++ for (i = 0; i < 2; i++) { +++ x = _mm256_add_ps(x, +++ _mm256_sqrt_ps(_mm256_add_ps(fones, _mm256_mul_ps(x, x)))); +++ } +++ x = _mm256_div_ps(fones, x); +++ y = fzeroes; +++ for (j = TERMS - 1; j >= 0; j--) { +++ y = _mm256_add_ps(_mm256_mul_ps(y, _mm256_mul_ps(x, x)), +++ _mm256_set1_ps(pow(-1, j) / (2 * j + 1))); +++ } +++ +++ y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours)); +++ condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS); +++ +++ y = _mm256_add_ps( +++ y, _mm256_and_ps(_mm256_sub_ps(pio2, _mm256_mul_ps(y, ftwos)), condition)); +++ arctangent = y; +++ condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS); +++ arctangent = _mm256_sub_ps( +++ arctangent, _mm256_and_ps(_mm256_mul_ps(arctangent, ftwos), condition)); +++ +++ _mm256_store_ps(bPtr, arctangent); +++ aPtr += 8; +++ bPtr += 8; ++ } ++ ++- y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours)); ++- condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS); ++- ++- y = _mm256_add_ps(y, _mm256_and_ps(_mm256_sub_ps(pio2, _mm256_mul_ps(y, ftwos)), condition)); ++- arctangent = y; ++- condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS); ++- arctangent = _mm256_sub_ps(arctangent, _mm256_and_ps(_mm256_mul_ps(arctangent, ftwos), condition)); ++- ++- _mm256_store_ps(bPtr, arctangent); ++- aPtr += 8; ++- bPtr += 8; ++- } ++- ++- number = eighthPoints * 8; ++- for(;number < num_points; number++){ ++- *bPtr++ = atan(*aPtr++); ++- } +++ number = eighthPoints * 8; +++ for (; number < num_points; number++) { +++ *bPtr++ = atan(*aPtr++); +++ } ++ } ++ ++ #endif /* LV_HAVE_AVX for aligned */ ++@@ -204,56 +214,58 @@ volk_32f_atan_32f_a_avx(float* bVector, const float* aVector, unsigned int num_p ++ static inline void ++ volk_32f_atan_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int num_points) ++ { ++- float* bPtr = bVector; ++- const float* aPtr = aVector; ++- ++- unsigned int number = 0; ++- unsigned int quarterPoints = num_points / 4; ++- int i, j; ++- ++- __m128 aVal, pio2, x, y, z, arctangent; ++- __m128 fzeroes, fones, ftwos, ffours, condition; ++- ++- pio2 = _mm_set1_ps(3.14159265358979323846/2); ++- fzeroes = _mm_setzero_ps(); ++- fones = _mm_set1_ps(1.0); ++- ftwos = _mm_set1_ps(2.0); ++- ffours = _mm_set1_ps(4.0); ++- ++- for(;number < quarterPoints; number++){ ++- aVal = _mm_load_ps(aPtr); ++- z = aVal; ++- condition = _mm_cmplt_ps(z, fzeroes); ++- z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition)); ++- condition = _mm_cmplt_ps(z, fones); ++- x = _mm_add_ps(z, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z), condition)); ++- ++- for(i = 0; i < 2; i++){ ++- x = _mm_add_ps(x, _mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x)))); ++- } ++- x = _mm_div_ps(fones, x); ++- y = fzeroes; ++- for(j = TERMS - 1; j >=0 ; j--){ ++- y = _mm_add_ps(_mm_mul_ps(y, _mm_mul_ps(x, x)), _mm_set1_ps(pow(-1,j)/(2*j+1))); +++ float* bPtr = bVector; +++ const float* aPtr = aVector; +++ +++ unsigned int number = 0; +++ unsigned int quarterPoints = num_points / 4; +++ int i, j; +++ +++ __m128 aVal, pio2, x, y, z, arctangent; +++ __m128 fzeroes, fones, ftwos, ffours, condition; +++ +++ pio2 = _mm_set1_ps(3.14159265358979323846 / 2); +++ fzeroes = 
_mm_setzero_ps(); +++ fones = _mm_set1_ps(1.0); +++ ftwos = _mm_set1_ps(2.0); +++ ffours = _mm_set1_ps(4.0); +++ +++ for (; number < quarterPoints; number++) { +++ aVal = _mm_load_ps(aPtr); +++ z = aVal; +++ condition = _mm_cmplt_ps(z, fzeroes); +++ z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition)); +++ condition = _mm_cmplt_ps(z, fones); +++ x = _mm_add_ps(z, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z), condition)); +++ +++ for (i = 0; i < 2; i++) { +++ x = _mm_add_ps(x, _mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x)))); +++ } +++ x = _mm_div_ps(fones, x); +++ y = fzeroes; +++ for (j = TERMS - 1; j >= 0; j--) { +++ y = _mm_add_ps(_mm_mul_ps(y, _mm_mul_ps(x, x)), +++ _mm_set1_ps(pow(-1, j) / (2 * j + 1))); +++ } +++ +++ y = _mm_mul_ps(y, _mm_mul_ps(x, ffours)); +++ condition = _mm_cmpgt_ps(z, fones); +++ +++ y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, ftwos)), condition)); +++ arctangent = y; +++ condition = _mm_cmplt_ps(aVal, fzeroes); +++ arctangent = +++ _mm_sub_ps(arctangent, _mm_and_ps(_mm_mul_ps(arctangent, ftwos), condition)); +++ +++ _mm_store_ps(bPtr, arctangent); +++ aPtr += 4; +++ bPtr += 4; ++ } ++ ++- y = _mm_mul_ps(y, _mm_mul_ps(x, ffours)); ++- condition = _mm_cmpgt_ps(z, fones); ++- ++- y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, ftwos)), condition)); ++- arctangent = y; ++- condition = _mm_cmplt_ps(aVal, fzeroes); ++- arctangent = _mm_sub_ps(arctangent, _mm_and_ps(_mm_mul_ps(arctangent, ftwos), condition)); ++- ++- _mm_store_ps(bPtr, arctangent); ++- aPtr += 4; ++- bPtr += 4; ++- } ++- ++- number = quarterPoints * 4; ++- for(;number < num_points; number++){ ++- *bPtr++ = atanf(*aPtr++); ++- } +++ number = quarterPoints * 4; +++ for (; number < num_points; number++) { +++ *bPtr++ = atanf(*aPtr++); +++ } ++ } ++ ++ #endif /* LV_HAVE_SSE4_1 for aligned */ ++@@ -266,59 +278,63 @@ volk_32f_atan_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int nu ++ #if LV_HAVE_AVX2 && LV_HAVE_FMA ++ #include ++ ++-static inline void ++-volk_32f_atan_32f_u_avx2_fma(float* bVector, const float* aVector, unsigned int num_points) +++static inline void volk_32f_atan_32f_u_avx2_fma(float* bVector, +++ const float* aVector, +++ unsigned int num_points) ++ { ++- float* bPtr = bVector; ++- const float* aPtr = aVector; ++- ++- unsigned int number = 0; ++- unsigned int eighthPoints = num_points / 8; ++- int i, j; ++- ++- __m256 aVal, pio2, x, y, z, arctangent; ++- __m256 fzeroes, fones, ftwos, ffours, condition; ++- ++- pio2 = _mm256_set1_ps(3.14159265358979323846/2); ++- fzeroes = _mm256_setzero_ps(); ++- fones = _mm256_set1_ps(1.0); ++- ftwos = _mm256_set1_ps(2.0); ++- ffours = _mm256_set1_ps(4.0); ++- ++- for(;number < eighthPoints; number++){ ++- aVal = _mm256_loadu_ps(aPtr); ++- z = aVal; ++- condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS); ++- z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition)); ++- condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS); ++- x = _mm256_add_ps(z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition)); ++- ++- for(i = 0; i < 2; i++){ ++- x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_fmadd_ps(x,x,fones))); +++ float* bPtr = bVector; +++ const float* aPtr = aVector; +++ +++ unsigned int number = 0; +++ unsigned int eighthPoints = num_points / 8; +++ int i, j; +++ +++ __m256 aVal, pio2, x, y, z, arctangent; +++ __m256 fzeroes, fones, ftwos, ffours, condition; +++ +++ pio2 = _mm256_set1_ps(3.14159265358979323846 / 2); +++ fzeroes = _mm256_setzero_ps(); +++ fones = _mm256_set1_ps(1.0); 
+++ ftwos = _mm256_set1_ps(2.0); +++ ffours = _mm256_set1_ps(4.0); +++ +++ for (; number < eighthPoints; number++) { +++ aVal = _mm256_loadu_ps(aPtr); +++ z = aVal; +++ condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS); +++ z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition)); +++ condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS); +++ x = _mm256_add_ps( +++ z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition)); +++ +++ for (i = 0; i < 2; i++) { +++ x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_fmadd_ps(x, x, fones))); +++ } +++ x = _mm256_div_ps(fones, x); +++ y = fzeroes; +++ for (j = TERMS - 1; j >= 0; j--) { +++ y = _mm256_fmadd_ps( +++ y, _mm256_mul_ps(x, x), _mm256_set1_ps(pow(-1, j) / (2 * j + 1))); +++ } +++ +++ y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours)); +++ condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS); +++ +++ y = _mm256_add_ps(y, _mm256_and_ps(_mm256_fnmadd_ps(y, ftwos, pio2), condition)); +++ arctangent = y; +++ condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS); +++ arctangent = _mm256_sub_ps( +++ arctangent, _mm256_and_ps(_mm256_mul_ps(arctangent, ftwos), condition)); +++ +++ _mm256_storeu_ps(bPtr, arctangent); +++ aPtr += 8; +++ bPtr += 8; ++ } ++- x = _mm256_div_ps(fones, x); ++- y = fzeroes; ++- for(j = TERMS - 1; j >=0 ; j--){ ++- y = _mm256_fmadd_ps(y, _mm256_mul_ps(x, x), _mm256_set1_ps(pow(-1,j)/(2*j+1))); ++- } ++- ++- y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours)); ++- condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS); ++ ++- y = _mm256_add_ps(y, _mm256_and_ps(_mm256_fnmadd_ps(y,ftwos,pio2), condition)); ++- arctangent = y; ++- condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS); ++- arctangent = _mm256_sub_ps(arctangent, _mm256_and_ps(_mm256_mul_ps(arctangent, ftwos), condition)); ++- ++- _mm256_storeu_ps(bPtr, arctangent); ++- aPtr += 8; ++- bPtr += 8; ++- } ++- ++- number = eighthPoints * 8; ++- for(;number < num_points; number++){ ++- *bPtr++ = atan(*aPtr++); ++- } +++ number = eighthPoints * 8; +++ for (; number < num_points; number++) { +++ *bPtr++ = atan(*aPtr++); +++ } ++ } ++ ++ #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for unaligned */ ++@@ -330,56 +346,61 @@ volk_32f_atan_32f_u_avx2_fma(float* bVector, const float* aVector, unsigned int ++ static inline void ++ volk_32f_atan_32f_u_avx(float* bVector, const float* aVector, unsigned int num_points) ++ { ++- float* bPtr = bVector; ++- const float* aPtr = aVector; ++- ++- unsigned int number = 0; ++- unsigned int eighthPoints = num_points / 8; ++- int i, j; ++- ++- __m256 aVal, pio2, x, y, z, arctangent; ++- __m256 fzeroes, fones, ftwos, ffours, condition; ++- ++- pio2 = _mm256_set1_ps(3.14159265358979323846/2); ++- fzeroes = _mm256_setzero_ps(); ++- fones = _mm256_set1_ps(1.0); ++- ftwos = _mm256_set1_ps(2.0); ++- ffours = _mm256_set1_ps(4.0); ++- ++- for(;number < eighthPoints; number++){ ++- aVal = _mm256_loadu_ps(aPtr); ++- z = aVal; ++- condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS); ++- z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition)); ++- condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS); ++- x = _mm256_add_ps(z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition)); ++- ++- for(i = 0; i < 2; i++){ ++- x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_add_ps(fones, _mm256_mul_ps(x, x)))); ++- } ++- x = _mm256_div_ps(fones, x); ++- y = fzeroes; ++- for(j = TERMS - 1; j >=0 ; j--){ ++- y = _mm256_add_ps(_mm256_mul_ps(y, _mm256_mul_ps(x, x)), _mm256_set1_ps(pow(-1,j)/(2*j+1))); +++ float* bPtr = bVector; +++ const float* aPtr = aVector; +++ 
+++ unsigned int number = 0; +++ unsigned int eighthPoints = num_points / 8; +++ int i, j; +++ +++ __m256 aVal, pio2, x, y, z, arctangent; +++ __m256 fzeroes, fones, ftwos, ffours, condition; +++ +++ pio2 = _mm256_set1_ps(3.14159265358979323846 / 2); +++ fzeroes = _mm256_setzero_ps(); +++ fones = _mm256_set1_ps(1.0); +++ ftwos = _mm256_set1_ps(2.0); +++ ffours = _mm256_set1_ps(4.0); +++ +++ for (; number < eighthPoints; number++) { +++ aVal = _mm256_loadu_ps(aPtr); +++ z = aVal; +++ condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS); +++ z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition)); +++ condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS); +++ x = _mm256_add_ps( +++ z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition)); +++ +++ for (i = 0; i < 2; i++) { +++ x = _mm256_add_ps(x, +++ _mm256_sqrt_ps(_mm256_add_ps(fones, _mm256_mul_ps(x, x)))); +++ } +++ x = _mm256_div_ps(fones, x); +++ y = fzeroes; +++ for (j = TERMS - 1; j >= 0; j--) { +++ y = _mm256_add_ps(_mm256_mul_ps(y, _mm256_mul_ps(x, x)), +++ _mm256_set1_ps(pow(-1, j) / (2 * j + 1))); +++ } +++ +++ y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours)); +++ condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS); +++ +++ y = _mm256_add_ps( +++ y, _mm256_and_ps(_mm256_sub_ps(pio2, _mm256_mul_ps(y, ftwos)), condition)); +++ arctangent = y; +++ condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS); +++ arctangent = _mm256_sub_ps( +++ arctangent, _mm256_and_ps(_mm256_mul_ps(arctangent, ftwos), condition)); +++ +++ _mm256_storeu_ps(bPtr, arctangent); +++ aPtr += 8; +++ bPtr += 8; ++ } ++ ++- y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours)); ++- condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS); ++- ++- y = _mm256_add_ps(y, _mm256_and_ps(_mm256_sub_ps(pio2, _mm256_mul_ps(y, ftwos)), condition)); ++- arctangent = y; ++- condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS); ++- arctangent = _mm256_sub_ps(arctangent, _mm256_and_ps(_mm256_mul_ps(arctangent, ftwos), condition)); ++- ++- _mm256_storeu_ps(bPtr, arctangent); ++- aPtr += 8; ++- bPtr += 8; ++- } ++- ++- number = eighthPoints * 8; ++- for(;number < num_points; number++){ ++- *bPtr++ = atan(*aPtr++); ++- } +++ number = eighthPoints * 8; +++ for (; number < num_points; number++) { +++ *bPtr++ = atan(*aPtr++); +++ } ++ } ++ ++ #endif /* LV_HAVE_AVX for unaligned */ ++@@ -390,54 +411,56 @@ volk_32f_atan_32f_u_avx(float* bVector, const float* aVector, unsigned int num_p ++ static inline void ++ volk_32f_atan_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int num_points) ++ { ++- float* bPtr = bVector; ++- const float* aPtr = aVector; ++- ++- unsigned int number = 0; ++- unsigned int quarterPoints = num_points / 4; ++- int i, j; ++- ++- __m128 aVal, pio2, x, y, z, arctangent; ++- __m128 fzeroes, fones, ftwos, ffours, condition; ++- ++- pio2 = _mm_set1_ps(3.14159265358979323846/2); ++- fzeroes = _mm_setzero_ps(); ++- fones = _mm_set1_ps(1.0); ++- ftwos = _mm_set1_ps(2.0); ++- ffours = _mm_set1_ps(4.0); ++- ++- for(;number < quarterPoints; number++){ ++- aVal = _mm_loadu_ps(aPtr); ++- z = aVal; ++- condition = _mm_cmplt_ps(z, fzeroes); ++- z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition)); ++- condition = _mm_cmplt_ps(z, fones); ++- x = _mm_add_ps(z, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z), condition)); ++- ++- for(i = 0; i < 2; i++) ++- x = _mm_add_ps(x, _mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x)))); ++- x = _mm_div_ps(fones, x); ++- y = fzeroes; ++- for(j = TERMS - 1; j >= 0; j--) ++- y = _mm_add_ps(_mm_mul_ps(y, _mm_mul_ps(x, x)), 
_mm_set1_ps(pow(-1,j)/(2*j+1))); ++- ++- y = _mm_mul_ps(y, _mm_mul_ps(x, ffours)); ++- condition = _mm_cmpgt_ps(z, fones); ++- ++- y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, ftwos)), condition)); ++- arctangent = y; ++- condition = _mm_cmplt_ps(aVal, fzeroes); ++- arctangent = _mm_sub_ps(arctangent, _mm_and_ps(_mm_mul_ps(arctangent, ftwos), condition)); ++- ++- _mm_storeu_ps(bPtr, arctangent); ++- aPtr += 4; ++- bPtr += 4; ++- } ++- ++- number = quarterPoints * 4; ++- for(;number < num_points; number++){ ++- *bPtr++ = atanf(*aPtr++); ++- } +++ float* bPtr = bVector; +++ const float* aPtr = aVector; +++ +++ unsigned int number = 0; +++ unsigned int quarterPoints = num_points / 4; +++ int i, j; +++ +++ __m128 aVal, pio2, x, y, z, arctangent; +++ __m128 fzeroes, fones, ftwos, ffours, condition; +++ +++ pio2 = _mm_set1_ps(3.14159265358979323846 / 2); +++ fzeroes = _mm_setzero_ps(); +++ fones = _mm_set1_ps(1.0); +++ ftwos = _mm_set1_ps(2.0); +++ ffours = _mm_set1_ps(4.0); +++ +++ for (; number < quarterPoints; number++) { +++ aVal = _mm_loadu_ps(aPtr); +++ z = aVal; +++ condition = _mm_cmplt_ps(z, fzeroes); +++ z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition)); +++ condition = _mm_cmplt_ps(z, fones); +++ x = _mm_add_ps(z, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z), condition)); +++ +++ for (i = 0; i < 2; i++) +++ x = _mm_add_ps(x, _mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x)))); +++ x = _mm_div_ps(fones, x); +++ y = fzeroes; +++ for (j = TERMS - 1; j >= 0; j--) +++ y = _mm_add_ps(_mm_mul_ps(y, _mm_mul_ps(x, x)), +++ _mm_set1_ps(pow(-1, j) / (2 * j + 1))); +++ +++ y = _mm_mul_ps(y, _mm_mul_ps(x, ffours)); +++ condition = _mm_cmpgt_ps(z, fones); +++ +++ y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, ftwos)), condition)); +++ arctangent = y; +++ condition = _mm_cmplt_ps(aVal, fzeroes); +++ arctangent = +++ _mm_sub_ps(arctangent, _mm_and_ps(_mm_mul_ps(arctangent, ftwos), condition)); +++ +++ _mm_storeu_ps(bPtr, arctangent); +++ aPtr += 4; +++ bPtr += 4; +++ } +++ +++ number = quarterPoints * 4; +++ for (; number < num_points; number++) { +++ *bPtr++ = atanf(*aPtr++); +++ } ++ } ++ ++ #endif /* LV_HAVE_SSE4_1 for unaligned */ ++@@ -447,13 +470,13 @@ volk_32f_atan_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int nu ++ static inline void ++ volk_32f_atan_32f_generic(float* bVector, const float* aVector, unsigned int num_points) ++ { ++- float* bPtr = bVector; ++- const float* aPtr = aVector; ++- unsigned int number = 0; +++ float* bPtr = bVector; +++ const float* aPtr = aVector; +++ unsigned int number = 0; ++ ++- for(number = 0; number < num_points; number++){ ++- *bPtr++ = atanf(*aPtr++); ++- } +++ for (number = 0; number < num_points; number++) { +++ *bPtr++ = atanf(*aPtr++); +++ } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++diff --git a/kernels/volk/volk_32f_binary_slicer_32i.h b/kernels/volk/volk_32f_binary_slicer_32i.h ++index c56ff8f..635d0c3 100644 ++--- a/kernels/volk/volk_32f_binary_slicer_32i.h +++++ b/kernels/volk/volk_32f_binary_slicer_32i.h ++@@ -30,8 +30,8 @@ ++ * ++ * Dispatcher Prototype ++ * \code ++- * void volk_32f_binary_slicer_32i(int* cVector, const float* aVector, unsigned int num_points) ++- * \endcode +++ * void volk_32f_binary_slicer_32i(int* cVector, const float* aVector, unsigned int +++ * num_points) \endcode ++ * ++ * \b Inputs ++ * \li aVector: The input vector of floats. 
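
Illustrative sketch (not part of the patch): the hunks in volk_32f_binary_slicer_32i.h that follow only re-wrap the code, so the dispatcher prototype quoted in the Doxygen block keeps its behaviour. Assuming the usual VOLK allocation helpers from volk/volk.h (volk_get_alignment, volk_malloc, volk_free), a caller would exercise the kernel roughly like this:

    #include <volk/volk.h>
    #include <stdio.h>

    int main(void)
    {
        unsigned int num_points = 8;
        size_t alignment = volk_get_alignment();
        float* in = (float*)volk_malloc(sizeof(float) * num_points, alignment);
        int* out = (int*)volk_malloc(sizeof(int) * num_points, alignment);

        for (unsigned int i = 0; i < num_points; i++)
            in[i] = (float)i - 3.5f; /* mix of negative and non-negative inputs */

        /* out[i] becomes 1 where in[i] >= 0, else 0 (see the generic kernel below). */
        volk_32f_binary_slicer_32i(out, in, num_points);

        for (unsigned int i = 0; i < num_points; i++)
            printf("%d ", out[i]);
        printf("\n");

        volk_free(in);
        volk_free(out);
        return 0;
    }
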
++@@ -73,37 +73,38 @@ ++ ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void ++-volk_32f_binary_slicer_32i_generic(int* cVector, const float* aVector, unsigned int num_points) +++static inline void volk_32f_binary_slicer_32i_generic(int* cVector, +++ const float* aVector, +++ unsigned int num_points) ++ { ++- int* cPtr = cVector; ++- const float* aPtr = aVector; ++- unsigned int number = 0; ++- ++- for(number = 0; number < num_points; number++){ ++- if( *aPtr++ >= 0) { ++- *cPtr++ = 1; ++- } ++- else { ++- *cPtr++ = 0; +++ int* cPtr = cVector; +++ const float* aPtr = aVector; +++ unsigned int number = 0; +++ +++ for (number = 0; number < num_points; number++) { +++ if (*aPtr++ >= 0) { +++ *cPtr++ = 1; +++ } else { +++ *cPtr++ = 0; +++ } ++ } ++- } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++ ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void ++-volk_32f_binary_slicer_32i_generic_branchless(int* cVector, const float* aVector, unsigned int num_points) +++static inline void volk_32f_binary_slicer_32i_generic_branchless(int* cVector, +++ const float* aVector, +++ unsigned int num_points) ++ { ++- int* cPtr = cVector; ++- const float* aPtr = aVector; ++- unsigned int number = 0; +++ int* cPtr = cVector; +++ const float* aPtr = aVector; +++ unsigned int number = 0; ++ ++- for(number = 0; number < num_points; number++){ ++- *cPtr++ = (*aPtr++ >= 0); ++- } +++ for (number = 0; number < num_points; number++) { +++ *cPtr++ = (*aPtr++ >= 0); +++ } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++@@ -111,40 +112,40 @@ volk_32f_binary_slicer_32i_generic_branchless(int* cVector, const float* aVector ++ #ifdef LV_HAVE_SSE2 ++ #include ++ ++-static inline void ++-volk_32f_binary_slicer_32i_a_sse2(int* cVector, const float* aVector, unsigned int num_points) +++static inline void volk_32f_binary_slicer_32i_a_sse2(int* cVector, +++ const float* aVector, +++ unsigned int num_points) ++ { ++- int* cPtr = cVector; ++- const float* aPtr = aVector; ++- unsigned int number = 0; ++- ++- unsigned int quarter_points = num_points / 4; ++- __m128 a_val, res_f; ++- __m128i res_i, binary_i; ++- __m128 zero_val; ++- zero_val = _mm_set1_ps (0.0f); +++ int* cPtr = cVector; +++ const float* aPtr = aVector; +++ unsigned int number = 0; ++ ++- for(number = 0; number < quarter_points; number++){ ++- a_val = _mm_load_ps(aPtr); +++ unsigned int quarter_points = num_points / 4; +++ __m128 a_val, res_f; +++ __m128i res_i, binary_i; +++ __m128 zero_val; +++ zero_val = _mm_set1_ps(0.0f); ++ ++- res_f = _mm_cmpge_ps (a_val, zero_val); ++- res_i = _mm_cvtps_epi32 (res_f); ++- binary_i = _mm_srli_epi32 (res_i, 31); +++ for (number = 0; number < quarter_points; number++) { +++ a_val = _mm_load_ps(aPtr); ++ ++- _mm_store_si128((__m128i*)cPtr, binary_i); +++ res_f = _mm_cmpge_ps(a_val, zero_val); +++ res_i = _mm_cvtps_epi32(res_f); +++ binary_i = _mm_srli_epi32(res_i, 31); ++ ++- cPtr += 4; ++- aPtr += 4; ++- } +++ _mm_store_si128((__m128i*)cPtr, binary_i); ++ ++- for(number = quarter_points * 4; number < num_points; number++){ ++- if( *aPtr++ >= 0) { ++- *cPtr++ = 1; +++ cPtr += 4; +++ aPtr += 4; ++ } ++- else { ++- *cPtr++ = 0; +++ +++ for (number = quarter_points * 4; number < num_points; number++) { +++ if (*aPtr++ >= 0) { +++ *cPtr++ = 1; +++ } else { +++ *cPtr++ = 0; +++ } ++ } ++- } ++ } ++ #endif /* LV_HAVE_SSE2 */ ++ ++@@ -152,41 +153,41 @@ volk_32f_binary_slicer_32i_a_sse2(int* cVector, const float* aVector, unsigned i ++ #ifdef LV_HAVE_AVX ++ #include ++ ++-static inline void ++-volk_32f_binary_slicer_32i_a_avx(int* cVector, const float* 
aVector, unsigned int num_points) +++static inline void volk_32f_binary_slicer_32i_a_avx(int* cVector, +++ const float* aVector, +++ unsigned int num_points) ++ { ++- int* cPtr = cVector; ++- const float* aPtr = aVector; ++- unsigned int number = 0; +++ int* cPtr = cVector; +++ const float* aPtr = aVector; +++ unsigned int number = 0; ++ ++- unsigned int quarter_points = num_points / 8; ++- __m256 a_val, res_f, binary_f; ++- __m256i binary_i; ++- __m256 zero_val, one_val; ++- zero_val = _mm256_set1_ps (0.0f); ++- one_val = _mm256_set1_ps (1.0f); +++ unsigned int quarter_points = num_points / 8; +++ __m256 a_val, res_f, binary_f; +++ __m256i binary_i; +++ __m256 zero_val, one_val; +++ zero_val = _mm256_set1_ps(0.0f); +++ one_val = _mm256_set1_ps(1.0f); ++ ++- for(number = 0; number < quarter_points; number++){ ++- a_val = _mm256_load_ps(aPtr); +++ for (number = 0; number < quarter_points; number++) { +++ a_val = _mm256_load_ps(aPtr); ++ ++- res_f = _mm256_cmp_ps (a_val, zero_val, _CMP_GE_OS); ++- binary_f = _mm256_and_ps (res_f, one_val); ++- binary_i = _mm256_cvtps_epi32(binary_f); +++ res_f = _mm256_cmp_ps(a_val, zero_val, _CMP_GE_OS); +++ binary_f = _mm256_and_ps(res_f, one_val); +++ binary_i = _mm256_cvtps_epi32(binary_f); ++ ++- _mm256_store_si256((__m256i *)cPtr, binary_i); +++ _mm256_store_si256((__m256i*)cPtr, binary_i); ++ ++- cPtr += 8; ++- aPtr += 8; ++- } ++- ++- for(number = quarter_points * 8; number < num_points; number++){ ++- if( *aPtr++ >= 0) { ++- *cPtr++ = 1; +++ cPtr += 8; +++ aPtr += 8; ++ } ++- else { ++- *cPtr++ = 0; +++ +++ for (number = quarter_points * 8; number < num_points; number++) { +++ if (*aPtr++ >= 0) { +++ *cPtr++ = 1; +++ } else { +++ *cPtr++ = 0; +++ } ++ } ++- } ++ } ++ #endif /* LV_HAVE_AVX */ ++ ++@@ -194,40 +195,40 @@ volk_32f_binary_slicer_32i_a_avx(int* cVector, const float* aVector, unsigned in ++ #ifdef LV_HAVE_SSE2 ++ #include ++ ++-static inline void ++-volk_32f_binary_slicer_32i_u_sse2(int* cVector, const float* aVector, unsigned int num_points) +++static inline void volk_32f_binary_slicer_32i_u_sse2(int* cVector, +++ const float* aVector, +++ unsigned int num_points) ++ { ++- int* cPtr = cVector; ++- const float* aPtr = aVector; ++- unsigned int number = 0; ++- ++- unsigned int quarter_points = num_points / 4; ++- __m128 a_val, res_f; ++- __m128i res_i, binary_i; ++- __m128 zero_val; ++- zero_val = _mm_set1_ps (0.0f); +++ int* cPtr = cVector; +++ const float* aPtr = aVector; +++ unsigned int number = 0; ++ ++- for(number = 0; number < quarter_points; number++){ ++- a_val = _mm_loadu_ps(aPtr); +++ unsigned int quarter_points = num_points / 4; +++ __m128 a_val, res_f; +++ __m128i res_i, binary_i; +++ __m128 zero_val; +++ zero_val = _mm_set1_ps(0.0f); ++ ++- res_f = _mm_cmpge_ps (a_val, zero_val); ++- res_i = _mm_cvtps_epi32 (res_f); ++- binary_i = _mm_srli_epi32 (res_i, 31); +++ for (number = 0; number < quarter_points; number++) { +++ a_val = _mm_loadu_ps(aPtr); ++ ++- _mm_storeu_si128((__m128i*)cPtr, binary_i); +++ res_f = _mm_cmpge_ps(a_val, zero_val); +++ res_i = _mm_cvtps_epi32(res_f); +++ binary_i = _mm_srli_epi32(res_i, 31); ++ ++- cPtr += 4; ++- aPtr += 4; ++- } +++ _mm_storeu_si128((__m128i*)cPtr, binary_i); ++ ++- for(number = quarter_points * 4; number < num_points; number++){ ++- if( *aPtr++ >= 0) { ++- *cPtr++ = 1; +++ cPtr += 4; +++ aPtr += 4; ++ } ++- else { ++- *cPtr++ = 0; +++ +++ for (number = quarter_points * 4; number < num_points; number++) { +++ if (*aPtr++ >= 0) { +++ *cPtr++ = 1; +++ } else { +++ *cPtr++ = 0; +++ } ++ } 
++- } ++ } ++ #endif /* LV_HAVE_SSE2 */ ++ ++@@ -235,41 +236,41 @@ volk_32f_binary_slicer_32i_u_sse2(int* cVector, const float* aVector, unsigned i ++ #ifdef LV_HAVE_AVX ++ #include ++ ++-static inline void ++-volk_32f_binary_slicer_32i_u_avx(int* cVector, const float* aVector, unsigned int num_points) +++static inline void volk_32f_binary_slicer_32i_u_avx(int* cVector, +++ const float* aVector, +++ unsigned int num_points) ++ { ++- int* cPtr = cVector; ++- const float* aPtr = aVector; ++- unsigned int number = 0; ++- ++- unsigned int quarter_points = num_points / 8; ++- __m256 a_val, res_f, binary_f; ++- __m256i binary_i; ++- __m256 zero_val, one_val; ++- zero_val = _mm256_set1_ps (0.0f); ++- one_val = _mm256_set1_ps (1.0f); +++ int* cPtr = cVector; +++ const float* aPtr = aVector; +++ unsigned int number = 0; ++ ++- for(number = 0; number < quarter_points; number++){ ++- a_val = _mm256_loadu_ps(aPtr); +++ unsigned int quarter_points = num_points / 8; +++ __m256 a_val, res_f, binary_f; +++ __m256i binary_i; +++ __m256 zero_val, one_val; +++ zero_val = _mm256_set1_ps(0.0f); +++ one_val = _mm256_set1_ps(1.0f); ++ ++- res_f = _mm256_cmp_ps (a_val, zero_val, _CMP_GE_OS); ++- binary_f = _mm256_and_ps (res_f, one_val); ++- binary_i = _mm256_cvtps_epi32(binary_f); +++ for (number = 0; number < quarter_points; number++) { +++ a_val = _mm256_loadu_ps(aPtr); ++ ++- _mm256_storeu_si256((__m256i*)cPtr, binary_i); +++ res_f = _mm256_cmp_ps(a_val, zero_val, _CMP_GE_OS); +++ binary_f = _mm256_and_ps(res_f, one_val); +++ binary_i = _mm256_cvtps_epi32(binary_f); ++ ++- cPtr += 8; ++- aPtr += 8; ++- } +++ _mm256_storeu_si256((__m256i*)cPtr, binary_i); ++ ++- for(number = quarter_points * 8; number < num_points; number++){ ++- if( *aPtr++ >= 0) { ++- *cPtr++ = 1; +++ cPtr += 8; +++ aPtr += 8; ++ } ++- else { ++- *cPtr++ = 0; +++ +++ for (number = quarter_points * 8; number < num_points; number++) { +++ if (*aPtr++ >= 0) { +++ *cPtr++ = 1; +++ } else { +++ *cPtr++ = 0; +++ } ++ } ++- } ++ } ++ #endif /* LV_HAVE_AVX */ ++ ++diff --git a/kernels/volk/volk_32f_binary_slicer_8i.h b/kernels/volk/volk_32f_binary_slicer_8i.h ++index 5920621..3eddb5c 100644 ++--- a/kernels/volk/volk_32f_binary_slicer_8i.h +++++ b/kernels/volk/volk_32f_binary_slicer_8i.h ++@@ -30,7 +30,8 @@ ++ * ++ * Dispatcher Prototype ++ * \code ++- * void volk_32f_binary_slicer_8i(int8_t* cVector, const float* aVector, unsigned int num_points) +++ * void volk_32f_binary_slicer_8i(int8_t* cVector, const float* aVector, unsigned int +++ num_points) ++ * \endcode ++ * ++ * \b Inputs ++@@ -74,39 +75,38 @@ ++ ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void ++-volk_32f_binary_slicer_8i_generic(int8_t* cVector, const float* aVector, ++- unsigned int num_points) +++static inline void volk_32f_binary_slicer_8i_generic(int8_t* cVector, +++ const float* aVector, +++ unsigned int num_points) ++ { ++- int8_t* cPtr = cVector; ++- const float* aPtr = aVector; ++- unsigned int number = 0; ++- ++- for(number = 0; number < num_points; number++) { ++- if(*aPtr++ >= 0) { ++- *cPtr++ = 1; ++- } ++- else { ++- *cPtr++ = 0; +++ int8_t* cPtr = cVector; +++ const float* aPtr = aVector; +++ unsigned int number = 0; +++ +++ for (number = 0; number < num_points; number++) { +++ if (*aPtr++ >= 0) { +++ *cPtr++ = 1; +++ } else { +++ *cPtr++ = 0; +++ } ++ } ++- } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++ ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void ++-volk_32f_binary_slicer_8i_generic_branchless(int8_t* cVector, const float* aVector, ++- unsigned int num_points) 
+++static inline void volk_32f_binary_slicer_8i_generic_branchless(int8_t* cVector, +++ const float* aVector, +++ unsigned int num_points) ++ { ++- int8_t* cPtr = cVector; ++- const float* aPtr = aVector; ++- unsigned int number = 0; +++ int8_t* cPtr = cVector; +++ const float* aPtr = aVector; +++ unsigned int number = 0; ++ ++- for(number = 0; number < num_points; number++){ ++- *cPtr++ = (*aPtr++ >= 0); ++- } +++ for (number = 0; number < num_points; number++) { +++ *cPtr++ = (*aPtr++ >= 0); +++ } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++@@ -114,279 +114,329 @@ volk_32f_binary_slicer_8i_generic_branchless(int8_t* cVector, const float* aVect ++ #ifdef LV_HAVE_AVX2 ++ #include ++ ++-static inline void ++-volk_32f_binary_slicer_8i_a_avx2(int8_t* cVector, const float* aVector, ++- unsigned int num_points) +++static inline void volk_32f_binary_slicer_8i_a_avx2(int8_t* cVector, +++ const float* aVector, +++ unsigned int num_points) ++ { ++- int8_t* cPtr = cVector; ++- const float* aPtr = aVector; ++- unsigned int number = 0; ++- unsigned int n32points = num_points / 32; ++- ++- const __m256 zero_val = _mm256_set1_ps(0.0f); ++- __m256 a0_val, a1_val, a2_val, a3_val; ++- __m256 res0_f, res1_f, res2_f, res3_f; ++- __m256i res0_i, res1_i, res2_i, res3_i; ++- __m256i byte_shuffle = _mm256_set_epi8( 15, 14, 13, 12, 7, 6, 5, 4, ++- 11, 10, 9, 8, 3, 2, 1, 0, ++- 15, 14, 13, 12, 7, 6, 5, 4, ++- 11, 10, 9, 8, 3, 2, 1, 0); ++- ++- for(number = 0; number < n32points; number++) { ++- a0_val = _mm256_load_ps(aPtr); ++- a1_val = _mm256_load_ps(aPtr+8); ++- a2_val = _mm256_load_ps(aPtr+16); ++- a3_val = _mm256_load_ps(aPtr+24); ++- ++- // compare >= 0; return float ++- res0_f = _mm256_cmp_ps(a0_val, zero_val, _CMP_GE_OS); ++- res1_f = _mm256_cmp_ps(a1_val, zero_val, _CMP_GE_OS); ++- res2_f = _mm256_cmp_ps(a2_val, zero_val, _CMP_GE_OS); ++- res3_f = _mm256_cmp_ps(a3_val, zero_val, _CMP_GE_OS); ++- ++- // convert to 32i and >> 31 ++- res0_i = _mm256_srli_epi32(_mm256_cvtps_epi32(res0_f), 31); ++- res1_i = _mm256_srli_epi32(_mm256_cvtps_epi32(res1_f), 31); ++- res2_i = _mm256_srli_epi32(_mm256_cvtps_epi32(res2_f), 31); ++- res3_i = _mm256_srli_epi32(_mm256_cvtps_epi32(res3_f), 31); ++- ++- // pack in to 16-bit results ++- res0_i = _mm256_packs_epi32(res0_i, res1_i); ++- res2_i = _mm256_packs_epi32(res2_i, res3_i); ++- // pack in to 8-bit results ++- // res0: (after packs_epi32) ++- // a0, a1, a2, a3, b0, b1, b2, b3, a4, a5, a6, a7, b4, b5, b6, b7 ++- // res2: ++- // c0, c1, c2, c3, d0, d1, d2, d3, c4, c5, c6, c7, d4, d5, d6, d7 ++- res0_i = _mm256_packs_epi16(res0_i, res2_i); ++- // shuffle the lanes ++- // res0: (after packs_epi16) ++- // a0, a1, a2, a3, b0, b1, b2, b3, c0, c1, c2, c3, d0, d1, d2, d3 ++- // a4, a5, a6, a7, b4, b5, b6, b7, c4, c5, c6, c7, d4, d5, d6, d7 ++- // 0, 2, 1, 3 -> 11 01 10 00 (0xd8) ++- res0_i = _mm256_permute4x64_epi64(res0_i, 0xd8); ++- ++- // shuffle bytes within lanes ++- // res0: (after shuffle_epi8) ++- // a0, a1, a2, a3, b0, b1, b2, b3, a4, a5, a6, a7, b4, b5, b6, b7 ++- // c0, c1, c2, c3, d0, d1, d2, d3, c4, c5, c6, c7, d4, d5, d6, d7 ++- res0_i = _mm256_shuffle_epi8(res0_i, byte_shuffle); ++- ++- _mm256_store_si256((__m256i*)cPtr, res0_i); ++- aPtr += 32; ++- cPtr += 32; ++- } ++- ++- for(number = n32points * 32; number < num_points; number++) { ++- if( *aPtr++ >= 0) { ++- *cPtr++ = 1; +++ int8_t* cPtr = cVector; +++ const float* aPtr = aVector; +++ unsigned int number = 0; +++ unsigned int n32points = num_points / 32; +++ +++ const __m256 zero_val = _mm256_set1_ps(0.0f); +++ 
__m256 a0_val, a1_val, a2_val, a3_val; +++ __m256 res0_f, res1_f, res2_f, res3_f; +++ __m256i res0_i, res1_i, res2_i, res3_i; +++ __m256i byte_shuffle = _mm256_set_epi8(15, +++ 14, +++ 13, +++ 12, +++ 7, +++ 6, +++ 5, +++ 4, +++ 11, +++ 10, +++ 9, +++ 8, +++ 3, +++ 2, +++ 1, +++ 0, +++ 15, +++ 14, +++ 13, +++ 12, +++ 7, +++ 6, +++ 5, +++ 4, +++ 11, +++ 10, +++ 9, +++ 8, +++ 3, +++ 2, +++ 1, +++ 0); +++ +++ for (number = 0; number < n32points; number++) { +++ a0_val = _mm256_load_ps(aPtr); +++ a1_val = _mm256_load_ps(aPtr + 8); +++ a2_val = _mm256_load_ps(aPtr + 16); +++ a3_val = _mm256_load_ps(aPtr + 24); +++ +++ // compare >= 0; return float +++ res0_f = _mm256_cmp_ps(a0_val, zero_val, _CMP_GE_OS); +++ res1_f = _mm256_cmp_ps(a1_val, zero_val, _CMP_GE_OS); +++ res2_f = _mm256_cmp_ps(a2_val, zero_val, _CMP_GE_OS); +++ res3_f = _mm256_cmp_ps(a3_val, zero_val, _CMP_GE_OS); +++ +++ // convert to 32i and >> 31 +++ res0_i = _mm256_srli_epi32(_mm256_cvtps_epi32(res0_f), 31); +++ res1_i = _mm256_srli_epi32(_mm256_cvtps_epi32(res1_f), 31); +++ res2_i = _mm256_srli_epi32(_mm256_cvtps_epi32(res2_f), 31); +++ res3_i = _mm256_srli_epi32(_mm256_cvtps_epi32(res3_f), 31); +++ +++ // pack in to 16-bit results +++ res0_i = _mm256_packs_epi32(res0_i, res1_i); +++ res2_i = _mm256_packs_epi32(res2_i, res3_i); +++ // pack in to 8-bit results +++ // res0: (after packs_epi32) +++ // a0, a1, a2, a3, b0, b1, b2, b3, a4, a5, a6, a7, b4, b5, b6, b7 +++ // res2: +++ // c0, c1, c2, c3, d0, d1, d2, d3, c4, c5, c6, c7, d4, d5, d6, d7 +++ res0_i = _mm256_packs_epi16(res0_i, res2_i); +++ // shuffle the lanes +++ // res0: (after packs_epi16) +++ // a0, a1, a2, a3, b0, b1, b2, b3, c0, c1, c2, c3, d0, d1, d2, d3 +++ // a4, a5, a6, a7, b4, b5, b6, b7, c4, c5, c6, c7, d4, d5, d6, d7 +++ // 0, 2, 1, 3 -> 11 01 10 00 (0xd8) +++ res0_i = _mm256_permute4x64_epi64(res0_i, 0xd8); +++ +++ // shuffle bytes within lanes +++ // res0: (after shuffle_epi8) +++ // a0, a1, a2, a3, b0, b1, b2, b3, a4, a5, a6, a7, b4, b5, b6, b7 +++ // c0, c1, c2, c3, d0, d1, d2, d3, c4, c5, c6, c7, d4, d5, d6, d7 +++ res0_i = _mm256_shuffle_epi8(res0_i, byte_shuffle); +++ +++ _mm256_store_si256((__m256i*)cPtr, res0_i); +++ aPtr += 32; +++ cPtr += 32; ++ } ++- else { ++- *cPtr++ = 0; +++ +++ for (number = n32points * 32; number < num_points; number++) { +++ if (*aPtr++ >= 0) { +++ *cPtr++ = 1; +++ } else { +++ *cPtr++ = 0; +++ } ++ } ++- } ++ } ++ #endif ++ ++ #ifdef LV_HAVE_AVX2 ++ #include ++ ++-static inline void ++-volk_32f_binary_slicer_8i_u_avx2(int8_t* cVector, const float* aVector, ++- unsigned int num_points) +++static inline void volk_32f_binary_slicer_8i_u_avx2(int8_t* cVector, +++ const float* aVector, +++ unsigned int num_points) ++ { ++- int8_t* cPtr = cVector; ++- const float* aPtr = aVector; ++- unsigned int number = 0; ++- unsigned int n32points = num_points / 32; ++- ++- const __m256 zero_val = _mm256_set1_ps(0.0f); ++- __m256 a0_val, a1_val, a2_val, a3_val; ++- __m256 res0_f, res1_f, res2_f, res3_f; ++- __m256i res0_i, res1_i, res2_i, res3_i; ++- __m256i byte_shuffle = _mm256_set_epi8( 15, 14, 13, 12, 7, 6, 5, 4, ++- 11, 10, 9, 8, 3, 2, 1, 0, ++- 15, 14, 13, 12, 7, 6, 5, 4, ++- 11, 10, 9, 8, 3, 2, 1, 0); ++- ++- for(number = 0; number < n32points; number++) { ++- a0_val = _mm256_loadu_ps(aPtr); ++- a1_val = _mm256_loadu_ps(aPtr+8); ++- a2_val = _mm256_loadu_ps(aPtr+16); ++- a3_val = _mm256_loadu_ps(aPtr+24); ++- ++- // compare >= 0; return float ++- res0_f = _mm256_cmp_ps(a0_val, zero_val, _CMP_GE_OS); ++- res1_f = _mm256_cmp_ps(a1_val, 
zero_val, _CMP_GE_OS); ++- res2_f = _mm256_cmp_ps(a2_val, zero_val, _CMP_GE_OS); ++- res3_f = _mm256_cmp_ps(a3_val, zero_val, _CMP_GE_OS); ++- ++- // convert to 32i and >> 31 ++- res0_i = _mm256_srli_epi32(_mm256_cvtps_epi32(res0_f), 31); ++- res1_i = _mm256_srli_epi32(_mm256_cvtps_epi32(res1_f), 31); ++- res2_i = _mm256_srli_epi32(_mm256_cvtps_epi32(res2_f), 31); ++- res3_i = _mm256_srli_epi32(_mm256_cvtps_epi32(res3_f), 31); ++- ++- // pack in to 16-bit results ++- res0_i = _mm256_packs_epi32(res0_i, res1_i); ++- res2_i = _mm256_packs_epi32(res2_i, res3_i); ++- // pack in to 8-bit results ++- // res0: (after packs_epi32) ++- // a0, a1, a2, a3, b0, b1, b2, b3, a4, a5, a6, a7, b4, b5, b6, b7 ++- // res2: ++- // c0, c1, c2, c3, d0, d1, d2, d3, c4, c5, c6, c7, d4, d5, d6, d7 ++- res0_i = _mm256_packs_epi16(res0_i, res2_i); ++- // shuffle the lanes ++- // res0: (after packs_epi16) ++- // a0, a1, a2, a3, b0, b1, b2, b3, c0, c1, c2, c3, d0, d1, d2, d3 ++- // a4, a5, a6, a7, b4, b5, b6, b7, c4, c5, c6, c7, d4, d5, d6, d7 ++- // 0, 2, 1, 3 -> 11 01 10 00 (0xd8) ++- res0_i = _mm256_permute4x64_epi64(res0_i, 0xd8); ++- ++- // shuffle bytes within lanes ++- // res0: (after shuffle_epi8) ++- // a0, a1, a2, a3, b0, b1, b2, b3, a4, a5, a6, a7, b4, b5, b6, b7 ++- // c0, c1, c2, c3, d0, d1, d2, d3, c4, c5, c6, c7, d4, d5, d6, d7 ++- res0_i = _mm256_shuffle_epi8(res0_i, byte_shuffle); ++- ++- _mm256_storeu_si256((__m256i*)cPtr, res0_i); ++- aPtr += 32; ++- cPtr += 32; ++- } ++- ++- for(number = n32points * 32; number < num_points; number++) { ++- if( *aPtr++ >= 0) { ++- *cPtr++ = 1; +++ int8_t* cPtr = cVector; +++ const float* aPtr = aVector; +++ unsigned int number = 0; +++ unsigned int n32points = num_points / 32; +++ +++ const __m256 zero_val = _mm256_set1_ps(0.0f); +++ __m256 a0_val, a1_val, a2_val, a3_val; +++ __m256 res0_f, res1_f, res2_f, res3_f; +++ __m256i res0_i, res1_i, res2_i, res3_i; +++ __m256i byte_shuffle = _mm256_set_epi8(15, +++ 14, +++ 13, +++ 12, +++ 7, +++ 6, +++ 5, +++ 4, +++ 11, +++ 10, +++ 9, +++ 8, +++ 3, +++ 2, +++ 1, +++ 0, +++ 15, +++ 14, +++ 13, +++ 12, +++ 7, +++ 6, +++ 5, +++ 4, +++ 11, +++ 10, +++ 9, +++ 8, +++ 3, +++ 2, +++ 1, +++ 0); +++ +++ for (number = 0; number < n32points; number++) { +++ a0_val = _mm256_loadu_ps(aPtr); +++ a1_val = _mm256_loadu_ps(aPtr + 8); +++ a2_val = _mm256_loadu_ps(aPtr + 16); +++ a3_val = _mm256_loadu_ps(aPtr + 24); +++ +++ // compare >= 0; return float +++ res0_f = _mm256_cmp_ps(a0_val, zero_val, _CMP_GE_OS); +++ res1_f = _mm256_cmp_ps(a1_val, zero_val, _CMP_GE_OS); +++ res2_f = _mm256_cmp_ps(a2_val, zero_val, _CMP_GE_OS); +++ res3_f = _mm256_cmp_ps(a3_val, zero_val, _CMP_GE_OS); +++ +++ // convert to 32i and >> 31 +++ res0_i = _mm256_srli_epi32(_mm256_cvtps_epi32(res0_f), 31); +++ res1_i = _mm256_srli_epi32(_mm256_cvtps_epi32(res1_f), 31); +++ res2_i = _mm256_srli_epi32(_mm256_cvtps_epi32(res2_f), 31); +++ res3_i = _mm256_srli_epi32(_mm256_cvtps_epi32(res3_f), 31); +++ +++ // pack in to 16-bit results +++ res0_i = _mm256_packs_epi32(res0_i, res1_i); +++ res2_i = _mm256_packs_epi32(res2_i, res3_i); +++ // pack in to 8-bit results +++ // res0: (after packs_epi32) +++ // a0, a1, a2, a3, b0, b1, b2, b3, a4, a5, a6, a7, b4, b5, b6, b7 +++ // res2: +++ // c0, c1, c2, c3, d0, d1, d2, d3, c4, c5, c6, c7, d4, d5, d6, d7 +++ res0_i = _mm256_packs_epi16(res0_i, res2_i); +++ // shuffle the lanes +++ // res0: (after packs_epi16) +++ // a0, a1, a2, a3, b0, b1, b2, b3, c0, c1, c2, c3, d0, d1, d2, d3 +++ // a4, a5, a6, a7, b4, b5, b6, b7, c4, c5, c6, c7, 
d4, d5, d6, d7 +++ // 0, 2, 1, 3 -> 11 01 10 00 (0xd8) +++ res0_i = _mm256_permute4x64_epi64(res0_i, 0xd8); +++ +++ // shuffle bytes within lanes +++ // res0: (after shuffle_epi8) +++ // a0, a1, a2, a3, b0, b1, b2, b3, a4, a5, a6, a7, b4, b5, b6, b7 +++ // c0, c1, c2, c3, d0, d1, d2, d3, c4, c5, c6, c7, d4, d5, d6, d7 +++ res0_i = _mm256_shuffle_epi8(res0_i, byte_shuffle); +++ +++ _mm256_storeu_si256((__m256i*)cPtr, res0_i); +++ aPtr += 32; +++ cPtr += 32; ++ } ++- else { ++- *cPtr++ = 0; +++ +++ for (number = n32points * 32; number < num_points; number++) { +++ if (*aPtr++ >= 0) { +++ *cPtr++ = 1; +++ } else { +++ *cPtr++ = 0; +++ } ++ } ++- } ++ } ++ #endif ++ ++ ++- ++ #ifdef LV_HAVE_SSE2 ++ ++ #include ++ ++-static inline void ++-volk_32f_binary_slicer_8i_a_sse2(int8_t* cVector, const float* aVector, ++- unsigned int num_points) +++static inline void volk_32f_binary_slicer_8i_a_sse2(int8_t* cVector, +++ const float* aVector, +++ unsigned int num_points) ++ { ++- int8_t* cPtr = cVector; ++- const float* aPtr = aVector; ++- unsigned int number = 0; ++- ++- unsigned int n16points = num_points / 16; ++- __m128 a0_val, a1_val, a2_val, a3_val; ++- __m128 res0_f, res1_f, res2_f, res3_f; ++- __m128i res0_i, res1_i, res2_i, res3_i; ++- __m128 zero_val; ++- zero_val = _mm_set1_ps(0.0f); ++- ++- for(number = 0; number < n16points; number++) { ++- a0_val = _mm_load_ps(aPtr); ++- a1_val = _mm_load_ps(aPtr+4); ++- a2_val = _mm_load_ps(aPtr+8); ++- a3_val = _mm_load_ps(aPtr+12); ++- ++- // compare >= 0; return float ++- res0_f = _mm_cmpge_ps(a0_val, zero_val); ++- res1_f = _mm_cmpge_ps(a1_val, zero_val); ++- res2_f = _mm_cmpge_ps(a2_val, zero_val); ++- res3_f = _mm_cmpge_ps(a3_val, zero_val); ++- ++- // convert to 32i and >> 31 ++- res0_i = _mm_srli_epi32(_mm_cvtps_epi32(res0_f), 31); ++- res1_i = _mm_srli_epi32(_mm_cvtps_epi32(res1_f), 31); ++- res2_i = _mm_srli_epi32(_mm_cvtps_epi32(res2_f), 31); ++- res3_i = _mm_srli_epi32(_mm_cvtps_epi32(res3_f), 31); ++- ++- // pack into 16-bit results ++- res0_i = _mm_packs_epi32(res0_i, res1_i); ++- res2_i = _mm_packs_epi32(res2_i, res3_i); ++- ++- // pack into 8-bit results ++- res0_i = _mm_packs_epi16(res0_i, res2_i); ++- ++- _mm_store_si128((__m128i*)cPtr, res0_i); ++- ++- cPtr += 16; ++- aPtr += 16; ++- } ++- ++- for(number = n16points * 16; number < num_points; number++) { ++- if( *aPtr++ >= 0) { ++- *cPtr++ = 1; +++ int8_t* cPtr = cVector; +++ const float* aPtr = aVector; +++ unsigned int number = 0; +++ +++ unsigned int n16points = num_points / 16; +++ __m128 a0_val, a1_val, a2_val, a3_val; +++ __m128 res0_f, res1_f, res2_f, res3_f; +++ __m128i res0_i, res1_i, res2_i, res3_i; +++ __m128 zero_val; +++ zero_val = _mm_set1_ps(0.0f); +++ +++ for (number = 0; number < n16points; number++) { +++ a0_val = _mm_load_ps(aPtr); +++ a1_val = _mm_load_ps(aPtr + 4); +++ a2_val = _mm_load_ps(aPtr + 8); +++ a3_val = _mm_load_ps(aPtr + 12); +++ +++ // compare >= 0; return float +++ res0_f = _mm_cmpge_ps(a0_val, zero_val); +++ res1_f = _mm_cmpge_ps(a1_val, zero_val); +++ res2_f = _mm_cmpge_ps(a2_val, zero_val); +++ res3_f = _mm_cmpge_ps(a3_val, zero_val); +++ +++ // convert to 32i and >> 31 +++ res0_i = _mm_srli_epi32(_mm_cvtps_epi32(res0_f), 31); +++ res1_i = _mm_srli_epi32(_mm_cvtps_epi32(res1_f), 31); +++ res2_i = _mm_srli_epi32(_mm_cvtps_epi32(res2_f), 31); +++ res3_i = _mm_srli_epi32(_mm_cvtps_epi32(res3_f), 31); +++ +++ // pack into 16-bit results +++ res0_i = _mm_packs_epi32(res0_i, res1_i); +++ res2_i = _mm_packs_epi32(res2_i, res3_i); +++ +++ // pack into 8-bit 
results +++ res0_i = _mm_packs_epi16(res0_i, res2_i); +++ +++ _mm_store_si128((__m128i*)cPtr, res0_i); +++ +++ cPtr += 16; +++ aPtr += 16; ++ } ++- else { ++- *cPtr++ = 0; +++ +++ for (number = n16points * 16; number < num_points; number++) { +++ if (*aPtr++ >= 0) { +++ *cPtr++ = 1; +++ } else { +++ *cPtr++ = 0; +++ } ++ } ++- } ++ } ++ #endif /* LV_HAVE_SSE2 */ ++ ++ ++- ++ #ifdef LV_HAVE_SSE2 ++ #include ++ ++-static inline void ++-volk_32f_binary_slicer_8i_u_sse2(int8_t* cVector, const float* aVector, ++- unsigned int num_points) +++static inline void volk_32f_binary_slicer_8i_u_sse2(int8_t* cVector, +++ const float* aVector, +++ unsigned int num_points) ++ { ++- int8_t* cPtr = cVector; ++- const float* aPtr = aVector; ++- unsigned int number = 0; ++- ++- unsigned int n16points = num_points / 16; ++- __m128 a0_val, a1_val, a2_val, a3_val; ++- __m128 res0_f, res1_f, res2_f, res3_f; ++- __m128i res0_i, res1_i, res2_i, res3_i; ++- __m128 zero_val; ++- zero_val = _mm_set1_ps (0.0f); ++- ++- for(number = 0; number < n16points; number++) { ++- a0_val = _mm_loadu_ps(aPtr); ++- a1_val = _mm_loadu_ps(aPtr+4); ++- a2_val = _mm_loadu_ps(aPtr+8); ++- a3_val = _mm_loadu_ps(aPtr+12); ++- ++- // compare >= 0; return float ++- res0_f = _mm_cmpge_ps(a0_val, zero_val); ++- res1_f = _mm_cmpge_ps(a1_val, zero_val); ++- res2_f = _mm_cmpge_ps(a2_val, zero_val); ++- res3_f = _mm_cmpge_ps(a3_val, zero_val); ++- ++- // convert to 32i and >> 31 ++- res0_i = _mm_srli_epi32(_mm_cvtps_epi32(res0_f), 31); ++- res1_i = _mm_srli_epi32(_mm_cvtps_epi32(res1_f), 31); ++- res2_i = _mm_srli_epi32(_mm_cvtps_epi32(res2_f), 31); ++- res3_i = _mm_srli_epi32(_mm_cvtps_epi32(res3_f), 31); ++- ++- // pack into 16-bit results ++- res0_i = _mm_packs_epi32(res0_i, res1_i); ++- res2_i = _mm_packs_epi32(res2_i, res3_i); ++- ++- // pack into 8-bit results ++- res0_i = _mm_packs_epi16(res0_i, res2_i); ++- ++- _mm_storeu_si128((__m128i*)cPtr, res0_i); ++- ++- cPtr += 16; ++- aPtr += 16; ++- } ++- ++- for(number = n16points * 16; number < num_points; number++) { ++- if( *aPtr++ >= 0) { ++- *cPtr++ = 1; +++ int8_t* cPtr = cVector; +++ const float* aPtr = aVector; +++ unsigned int number = 0; +++ +++ unsigned int n16points = num_points / 16; +++ __m128 a0_val, a1_val, a2_val, a3_val; +++ __m128 res0_f, res1_f, res2_f, res3_f; +++ __m128i res0_i, res1_i, res2_i, res3_i; +++ __m128 zero_val; +++ zero_val = _mm_set1_ps(0.0f); +++ +++ for (number = 0; number < n16points; number++) { +++ a0_val = _mm_loadu_ps(aPtr); +++ a1_val = _mm_loadu_ps(aPtr + 4); +++ a2_val = _mm_loadu_ps(aPtr + 8); +++ a3_val = _mm_loadu_ps(aPtr + 12); +++ +++ // compare >= 0; return float +++ res0_f = _mm_cmpge_ps(a0_val, zero_val); +++ res1_f = _mm_cmpge_ps(a1_val, zero_val); +++ res2_f = _mm_cmpge_ps(a2_val, zero_val); +++ res3_f = _mm_cmpge_ps(a3_val, zero_val); +++ +++ // convert to 32i and >> 31 +++ res0_i = _mm_srli_epi32(_mm_cvtps_epi32(res0_f), 31); +++ res1_i = _mm_srli_epi32(_mm_cvtps_epi32(res1_f), 31); +++ res2_i = _mm_srli_epi32(_mm_cvtps_epi32(res2_f), 31); +++ res3_i = _mm_srli_epi32(_mm_cvtps_epi32(res3_f), 31); +++ +++ // pack into 16-bit results +++ res0_i = _mm_packs_epi32(res0_i, res1_i); +++ res2_i = _mm_packs_epi32(res2_i, res3_i); +++ +++ // pack into 8-bit results +++ res0_i = _mm_packs_epi16(res0_i, res2_i); +++ +++ _mm_storeu_si128((__m128i*)cPtr, res0_i); +++ +++ cPtr += 16; +++ aPtr += 16; ++ } ++- else { ++- *cPtr++ = 0; +++ +++ for (number = n16points * 16; number < num_points; number++) { +++ if (*aPtr++ >= 0) { +++ *cPtr++ = 1; +++ } else 
{ +++ *cPtr++ = 0; +++ } ++ } ++- } ++ } ++ #endif /* LV_HAVE_SSE2 */ ++ ++@@ -394,74 +444,72 @@ volk_32f_binary_slicer_8i_u_sse2(int8_t* cVector, const float* aVector, ++ #ifdef LV_HAVE_NEON ++ #include ++ ++-static inline void ++-volk_32f_binary_slicer_8i_neon(int8_t* cVector, const float* aVector, ++- unsigned int num_points) +++static inline void volk_32f_binary_slicer_8i_neon(int8_t* cVector, +++ const float* aVector, +++ unsigned int num_points) ++ { ++- int8_t* cPtr = cVector; ++- const float* aPtr = aVector; ++- unsigned int number = 0; ++- unsigned int n16points = num_points / 16; ++- ++- float32x4x2_t input_val0, input_val1; ++- float32x4_t zero_val; ++- uint32x4x2_t res0_u32, res1_u32; ++- uint16x4x2_t res0_u16x4, res1_u16x4; ++- uint16x8x2_t res_u16x8; ++- uint8x8x2_t res_u8; ++- uint8x8_t one; ++- ++- zero_val = vdupq_n_f32(0.0); ++- one = vdup_n_u8(0x01); ++- ++- // TODO: this is a good candidate for asm because the vcombines ++- // can be eliminated simply by picking dst registers that are ++- // adjacent. ++- for(number = 0; number < n16points; number++) { ++- input_val0 = vld2q_f32(aPtr); ++- input_val1 = vld2q_f32(aPtr+8); ++- ++- // test against 0; return uint32 ++- res0_u32.val[0] = vcgeq_f32(input_val0.val[0], zero_val); ++- res0_u32.val[1] = vcgeq_f32(input_val0.val[1], zero_val); ++- res1_u32.val[0] = vcgeq_f32(input_val1.val[0], zero_val); ++- res1_u32.val[1] = vcgeq_f32(input_val1.val[1], zero_val); ++- ++- // narrow uint32 -> uint16 followed by combine to 8-element vectors ++- res0_u16x4.val[0] = vmovn_u32(res0_u32.val[0]); ++- res0_u16x4.val[1] = vmovn_u32(res0_u32.val[1]); ++- res1_u16x4.val[0] = vmovn_u32(res1_u32.val[0]); ++- res1_u16x4.val[1] = vmovn_u32(res1_u32.val[1]); ++- ++- res_u16x8.val[0] = vcombine_u16(res0_u16x4.val[0], res1_u16x4.val[0]); ++- res_u16x8.val[1] = vcombine_u16(res0_u16x4.val[1], res1_u16x4.val[1]); ++- ++- // narrow uint16x8 -> uint8x8 ++- res_u8.val[0] = vmovn_u16(res_u16x8.val[0]); ++- res_u8.val[1] = vmovn_u16(res_u16x8.val[1]); ++- // we *could* load twice as much data and do another vcombine here ++- // to get a uint8x16x2 vector, still only do 2 vandqs and a single store ++- // but that turns out to be ~16% slower than this version on zc702 ++- // it's possible register contention in GCC scheduler slows it down ++- // and a hand-written asm with quad-word u8 registers is much faster. ++- ++- res_u8.val[0] = vand_u8(one, res_u8.val[0]); ++- res_u8.val[1] = vand_u8(one, res_u8.val[1]); ++- ++- vst2_u8((unsigned char*)cPtr, res_u8); ++- cPtr += 16; ++- aPtr += 16; ++- ++- } ++- ++- for(number = n16points * 16; number < num_points; number++) { ++- if(*aPtr++ >= 0) { ++- *cPtr++ = 1; +++ int8_t* cPtr = cVector; +++ const float* aPtr = aVector; +++ unsigned int number = 0; +++ unsigned int n16points = num_points / 16; +++ +++ float32x4x2_t input_val0, input_val1; +++ float32x4_t zero_val; +++ uint32x4x2_t res0_u32, res1_u32; +++ uint16x4x2_t res0_u16x4, res1_u16x4; +++ uint16x8x2_t res_u16x8; +++ uint8x8x2_t res_u8; +++ uint8x8_t one; +++ +++ zero_val = vdupq_n_f32(0.0); +++ one = vdup_n_u8(0x01); +++ +++ // TODO: this is a good candidate for asm because the vcombines +++ // can be eliminated simply by picking dst registers that are +++ // adjacent. 
+++ for (number = 0; number < n16points; number++) { +++ input_val0 = vld2q_f32(aPtr); +++ input_val1 = vld2q_f32(aPtr + 8); +++ +++ // test against 0; return uint32 +++ res0_u32.val[0] = vcgeq_f32(input_val0.val[0], zero_val); +++ res0_u32.val[1] = vcgeq_f32(input_val0.val[1], zero_val); +++ res1_u32.val[0] = vcgeq_f32(input_val1.val[0], zero_val); +++ res1_u32.val[1] = vcgeq_f32(input_val1.val[1], zero_val); +++ +++ // narrow uint32 -> uint16 followed by combine to 8-element vectors +++ res0_u16x4.val[0] = vmovn_u32(res0_u32.val[0]); +++ res0_u16x4.val[1] = vmovn_u32(res0_u32.val[1]); +++ res1_u16x4.val[0] = vmovn_u32(res1_u32.val[0]); +++ res1_u16x4.val[1] = vmovn_u32(res1_u32.val[1]); +++ +++ res_u16x8.val[0] = vcombine_u16(res0_u16x4.val[0], res1_u16x4.val[0]); +++ res_u16x8.val[1] = vcombine_u16(res0_u16x4.val[1], res1_u16x4.val[1]); +++ +++ // narrow uint16x8 -> uint8x8 +++ res_u8.val[0] = vmovn_u16(res_u16x8.val[0]); +++ res_u8.val[1] = vmovn_u16(res_u16x8.val[1]); +++ // we *could* load twice as much data and do another vcombine here +++ // to get a uint8x16x2 vector, still only do 2 vandqs and a single store +++ // but that turns out to be ~16% slower than this version on zc702 +++ // it's possible register contention in GCC scheduler slows it down +++ // and a hand-written asm with quad-word u8 registers is much faster. +++ +++ res_u8.val[0] = vand_u8(one, res_u8.val[0]); +++ res_u8.val[1] = vand_u8(one, res_u8.val[1]); +++ +++ vst2_u8((unsigned char*)cPtr, res_u8); +++ cPtr += 16; +++ aPtr += 16; ++ } ++- else { ++- *cPtr++ = 0; +++ +++ for (number = n16points * 16; number < num_points; number++) { +++ if (*aPtr++ >= 0) { +++ *cPtr++ = 1; +++ } else { +++ *cPtr++ = 0; +++ } ++ } ++- } ++ } ++ #endif /* LV_HAVE_NEON */ ++ ++diff --git a/kernels/volk/volk_32f_convert_64f.h b/kernels/volk/volk_32f_convert_64f.h ++index bf57e3a..d2e3f8a 100644 ++--- a/kernels/volk/volk_32f_convert_64f.h +++++ b/kernels/volk/volk_32f_convert_64f.h ++@@ -29,8 +29,8 @@ ++ * ++ * Dispatcher Prototype ++ * \code ++- * void volk_32f_convert_64f(double* outputVector, const float* inputVector, unsigned int num_points) ++- * \endcode +++ * void volk_32f_convert_64f(double* outputVector, const float* inputVector, unsigned int +++ * num_points) \endcode ++ * ++ * \b Inputs ++ * \li inputVector: The vector of floats to convert to doubles. 
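
Illustrative sketch (not part of the patch): the volk_32f_convert_64f hunks below are likewise formatting-only. For orientation, this minimal scalar reference states the behaviour every branch (generic, SSE2, AVX) reproduces; it assumes nothing beyond the dispatcher prototype quoted above.

    /* Reference semantics of volk_32f_convert_64f: widen each float to a double. */
    static void convert_32f_to_64f_reference(double* out, const float* in,
                                             unsigned int num_points)
    {
        unsigned int n;
        for (n = 0; n < num_points; n++) {
            out[n] = (double)in[n]; /* same result the SIMD branches store 4 at a time */
        }
    }
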
++@@ -72,29 +72,33 @@ ++ #ifdef LV_HAVE_AVX ++ #include ++ ++-static inline void volk_32f_convert_64f_u_avx(double* outputVector, const float* inputVector, unsigned int num_points){ ++- unsigned int number = 0; +++static inline void volk_32f_convert_64f_u_avx(double* outputVector, +++ const float* inputVector, +++ unsigned int num_points) +++{ +++ unsigned int number = 0; ++ ++- const unsigned int quarterPoints = num_points / 4; +++ const unsigned int quarterPoints = num_points / 4; ++ ++- const float* inputVectorPtr = (const float*)inputVector; ++- double* outputVectorPtr = outputVector; ++- __m256d ret; ++- __m128 inputVal; +++ const float* inputVectorPtr = (const float*)inputVector; +++ double* outputVectorPtr = outputVector; +++ __m256d ret; +++ __m128 inputVal; ++ ++- for(;number < quarterPoints; number++){ ++- inputVal = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4; +++ for (; number < quarterPoints; number++) { +++ inputVal = _mm_loadu_ps(inputVectorPtr); +++ inputVectorPtr += 4; ++ ++- ret = _mm256_cvtps_pd(inputVal); ++- _mm256_storeu_pd(outputVectorPtr, ret); +++ ret = _mm256_cvtps_pd(inputVal); +++ _mm256_storeu_pd(outputVectorPtr, ret); ++ ++- outputVectorPtr += 4; ++- } +++ outputVectorPtr += 4; +++ } ++ ++- number = quarterPoints * 4; ++- for(; number < num_points; number++){ ++- outputVector[number] = (double)(inputVector[number]); ++- } +++ number = quarterPoints * 4; +++ for (; number < num_points; number++) { +++ outputVector[number] = (double)(inputVector[number]); +++ } ++ } ++ ++ #endif /* LV_HAVE_AVX */ ++@@ -102,56 +106,61 @@ static inline void volk_32f_convert_64f_u_avx(double* outputVector, const float* ++ #ifdef LV_HAVE_SSE2 ++ #include ++ ++-static inline void volk_32f_convert_64f_u_sse2(double* outputVector, const float* inputVector, unsigned int num_points){ ++- unsigned int number = 0; +++static inline void volk_32f_convert_64f_u_sse2(double* outputVector, +++ const float* inputVector, +++ unsigned int num_points) +++{ +++ unsigned int number = 0; ++ ++- const unsigned int quarterPoints = num_points / 4; +++ const unsigned int quarterPoints = num_points / 4; ++ ++- const float* inputVectorPtr = (const float*)inputVector; ++- double* outputVectorPtr = outputVector; ++- __m128d ret; ++- __m128 inputVal; +++ const float* inputVectorPtr = (const float*)inputVector; +++ double* outputVectorPtr = outputVector; +++ __m128d ret; +++ __m128 inputVal; ++ ++- for(;number < quarterPoints; number++){ ++- inputVal = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4; +++ for (; number < quarterPoints; number++) { +++ inputVal = _mm_loadu_ps(inputVectorPtr); +++ inputVectorPtr += 4; ++ ++- ret = _mm_cvtps_pd(inputVal); +++ ret = _mm_cvtps_pd(inputVal); ++ ++- _mm_storeu_pd(outputVectorPtr, ret); ++- outputVectorPtr += 2; +++ _mm_storeu_pd(outputVectorPtr, ret); +++ outputVectorPtr += 2; ++ ++- inputVal = _mm_movehl_ps(inputVal, inputVal); +++ inputVal = _mm_movehl_ps(inputVal, inputVal); ++ ++- ret = _mm_cvtps_pd(inputVal); +++ ret = _mm_cvtps_pd(inputVal); ++ ++- _mm_storeu_pd(outputVectorPtr, ret); ++- outputVectorPtr += 2; ++- } +++ _mm_storeu_pd(outputVectorPtr, ret); +++ outputVectorPtr += 2; +++ } ++ ++- number = quarterPoints * 4; ++- for(; number < num_points; number++){ ++- outputVector[number] = (double)(inputVector[number]); ++- } +++ number = quarterPoints * 4; +++ for (; number < num_points; number++) { +++ outputVector[number] = (double)(inputVector[number]); +++ } ++ } ++ #endif /* LV_HAVE_SSE2 */ ++ ++ ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void 
volk_32f_convert_64f_generic(double* outputVector, const float* inputVector, unsigned int num_points){ ++- double* outputVectorPtr = outputVector; ++- const float* inputVectorPtr = inputVector; ++- unsigned int number = 0; ++- ++- for(number = 0; number < num_points; number++){ ++- *outputVectorPtr++ = ((double)(*inputVectorPtr++)); ++- } +++static inline void volk_32f_convert_64f_generic(double* outputVector, +++ const float* inputVector, +++ unsigned int num_points) +++{ +++ double* outputVectorPtr = outputVector; +++ const float* inputVectorPtr = inputVector; +++ unsigned int number = 0; +++ +++ for (number = 0; number < num_points; number++) { +++ *outputVectorPtr++ = ((double)(*inputVectorPtr++)); +++ } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++ ++- ++- ++ #endif /* INCLUDED_volk_32f_convert_64f_u_H */ ++ ++ ++@@ -164,83 +173,92 @@ static inline void volk_32f_convert_64f_generic(double* outputVector, const floa ++ #ifdef LV_HAVE_AVX ++ #include ++ ++-static inline void volk_32f_convert_64f_a_avx(double* outputVector, const float* inputVector, unsigned int num_points){ ++- unsigned int number = 0; +++static inline void volk_32f_convert_64f_a_avx(double* outputVector, +++ const float* inputVector, +++ unsigned int num_points) +++{ +++ unsigned int number = 0; ++ ++- const unsigned int quarterPoints = num_points / 4; +++ const unsigned int quarterPoints = num_points / 4; ++ ++- const float* inputVectorPtr = (const float*)inputVector; ++- double* outputVectorPtr = outputVector; ++- __m256d ret; ++- __m128 inputVal; +++ const float* inputVectorPtr = (const float*)inputVector; +++ double* outputVectorPtr = outputVector; +++ __m256d ret; +++ __m128 inputVal; ++ ++- for(;number < quarterPoints; number++){ ++- inputVal = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; +++ for (; number < quarterPoints; number++) { +++ inputVal = _mm_load_ps(inputVectorPtr); +++ inputVectorPtr += 4; ++ ++- ret = _mm256_cvtps_pd(inputVal); ++- _mm256_store_pd(outputVectorPtr, ret); +++ ret = _mm256_cvtps_pd(inputVal); +++ _mm256_store_pd(outputVectorPtr, ret); ++ ++- outputVectorPtr += 4; ++- } +++ outputVectorPtr += 4; +++ } ++ ++- number = quarterPoints * 4; ++- for(; number < num_points; number++){ ++- outputVector[number] = (double)(inputVector[number]); ++- } +++ number = quarterPoints * 4; +++ for (; number < num_points; number++) { +++ outputVector[number] = (double)(inputVector[number]); +++ } ++ } ++ #endif /* LV_HAVE_AVX */ ++ ++ #ifdef LV_HAVE_SSE2 ++ #include ++ ++-static inline void volk_32f_convert_64f_a_sse2(double* outputVector, const float* inputVector, unsigned int num_points){ ++- unsigned int number = 0; +++static inline void volk_32f_convert_64f_a_sse2(double* outputVector, +++ const float* inputVector, +++ unsigned int num_points) +++{ +++ unsigned int number = 0; ++ ++- const unsigned int quarterPoints = num_points / 4; +++ const unsigned int quarterPoints = num_points / 4; ++ ++- const float* inputVectorPtr = (const float*)inputVector; ++- double* outputVectorPtr = outputVector; ++- __m128d ret; ++- __m128 inputVal; +++ const float* inputVectorPtr = (const float*)inputVector; +++ double* outputVectorPtr = outputVector; +++ __m128d ret; +++ __m128 inputVal; ++ ++- for(;number < quarterPoints; number++){ ++- inputVal = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; +++ for (; number < quarterPoints; number++) { +++ inputVal = _mm_load_ps(inputVectorPtr); +++ inputVectorPtr += 4; ++ ++- ret = _mm_cvtps_pd(inputVal); +++ ret = _mm_cvtps_pd(inputVal); ++ ++- _mm_store_pd(outputVectorPtr, 
ret); ++- outputVectorPtr += 2; +++ _mm_store_pd(outputVectorPtr, ret); +++ outputVectorPtr += 2; ++ ++- inputVal = _mm_movehl_ps(inputVal, inputVal); +++ inputVal = _mm_movehl_ps(inputVal, inputVal); ++ ++- ret = _mm_cvtps_pd(inputVal); +++ ret = _mm_cvtps_pd(inputVal); ++ ++- _mm_store_pd(outputVectorPtr, ret); ++- outputVectorPtr += 2; ++- } +++ _mm_store_pd(outputVectorPtr, ret); +++ outputVectorPtr += 2; +++ } ++ ++- number = quarterPoints * 4; ++- for(; number < num_points; number++){ ++- outputVector[number] = (double)(inputVector[number]); ++- } +++ number = quarterPoints * 4; +++ for (; number < num_points; number++) { +++ outputVector[number] = (double)(inputVector[number]); +++ } ++ } ++ #endif /* LV_HAVE_SSE2 */ ++ ++ ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void volk_32f_convert_64f_a_generic(double* outputVector, const float* inputVector, unsigned int num_points){ ++- double* outputVectorPtr = outputVector; ++- const float* inputVectorPtr = inputVector; ++- unsigned int number = 0; ++- ++- for(number = 0; number < num_points; number++){ ++- *outputVectorPtr++ = ((double)(*inputVectorPtr++)); ++- } +++static inline void volk_32f_convert_64f_a_generic(double* outputVector, +++ const float* inputVector, +++ unsigned int num_points) +++{ +++ double* outputVectorPtr = outputVector; +++ const float* inputVectorPtr = inputVector; +++ unsigned int number = 0; +++ +++ for (number = 0; number < num_points; number++) { +++ *outputVectorPtr++ = ((double)(*inputVectorPtr++)); +++ } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++ ++- ++- ++ #endif /* INCLUDED_volk_32f_convert_64f_a_H */ ++diff --git a/kernels/volk/volk_32f_cos_32f.h b/kernels/volk/volk_32f_cos_32f.h ++index 39c2008..b493764 100644 ++--- a/kernels/volk/volk_32f_cos_32f.h +++++ b/kernels/volk/volk_32f_cos_32f.h ++@@ -69,9 +69,9 @@ ++ * \endcode ++ */ ++ ++-#include ++-#include ++ #include +++#include +++#include ++ ++ #ifndef INCLUDED_volk_32f_cos_32f_a_H ++ #define INCLUDED_volk_32f_cos_32f_a_H ++@@ -80,86 +80,102 @@ ++ #include ++ ++ static inline void ++- volk_32f_cos_32f_a_avx2_fma(float* bVector, const float* aVector, unsigned int num_points) +++volk_32f_cos_32f_a_avx2_fma(float* bVector, const float* aVector, unsigned int num_points) ++ { ++- float* bPtr = bVector; ++- const float* aPtr = aVector; ++- ++- unsigned int number = 0; ++- unsigned int eighthPoints = num_points / 8; ++- unsigned int i = 0; ++- ++- __m256 aVal, s, r, m4pi, pio4A, pio4B, pio4C, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, fzeroes; ++- __m256 sine, cosine; ++- __m256i q, ones, twos, fours; ++- ++- m4pi = _mm256_set1_ps(1.273239544735162542821171882678754627704620361328125); ++- pio4A = _mm256_set1_ps(0.7853981554508209228515625); ++- pio4B = _mm256_set1_ps(0.794662735614792836713604629039764404296875e-8); ++- pio4C = _mm256_set1_ps(0.306161699786838294306516483068750264552437361480769e-16); ++- ffours = _mm256_set1_ps(4.0); ++- ftwos = _mm256_set1_ps(2.0); ++- fones = _mm256_set1_ps(1.0); ++- fzeroes = _mm256_setzero_ps(); ++- __m256i zeroes = _mm256_set1_epi32(0); ++- ones = _mm256_set1_epi32(1); ++- __m256i allones = _mm256_set1_epi32(0xffffffff); ++- twos = _mm256_set1_epi32(2); ++- fours = _mm256_set1_epi32(4); ++- ++- cp1 = _mm256_set1_ps(1.0); ++- cp2 = _mm256_set1_ps(0.08333333333333333); ++- cp3 = _mm256_set1_ps(0.002777777777777778); ++- cp4 = _mm256_set1_ps(4.96031746031746e-05); ++- cp5 = _mm256_set1_ps(5.511463844797178e-07); ++- union bit256 condition1; ++- union bit256 condition3; ++- ++- for(;number < eighthPoints; number++){ 
++- ++- aVal = _mm256_load_ps(aPtr); ++- // s = fabs(aVal) ++- s = _mm256_sub_ps(aVal, _mm256_and_ps(_mm256_mul_ps(aVal, ftwos), _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS))); ++- // q = (int) (s * (4/pi)), floor(aVal / (pi/4)) ++- q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi))); ++- // r = q + q&1, q indicates quadrant, r gives ++- r = _mm256_cvtepi32_ps(_mm256_add_epi32(q, _mm256_and_si256(q, ones))); ++- ++- s = _mm256_fnmadd_ps(r,pio4A,s); ++- s = _mm256_fnmadd_ps(r,pio4B,s); ++- s = _mm256_fnmadd_ps(r,pio4C,s); ++- ++- s = _mm256_div_ps(s, _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction ++- s = _mm256_mul_ps(s, s); ++- // Evaluate Taylor series ++- s = _mm256_mul_ps(_mm256_fmadd_ps(_mm256_fmsub_ps(_mm256_fmadd_ps(_mm256_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2), s, cp1), s); ++- ++- for(i = 0; i < 3; i++) ++- s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s)); ++- s = _mm256_div_ps(s, ftwos); ++- ++- sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s)); ++- cosine = _mm256_sub_ps(fones, s); ++- ++- // if(((q+1)&2) != 0) { cosine=sine;} ++- condition1.int_vec = _mm256_cmpeq_epi32(_mm256_and_si256(_mm256_add_epi32(q, ones), twos), zeroes); ++- condition1.int_vec = _mm256_xor_si256(allones, condition1.int_vec); ++- ++- // if(((q+2)&4) != 0) { cosine = -cosine;} ++- condition3.int_vec = _mm256_cmpeq_epi32(_mm256_and_si256(_mm256_add_epi32(q, twos), fours), zeroes); ++- condition3.int_vec = _mm256_xor_si256(allones, condition3.int_vec); ++- ++- cosine = _mm256_add_ps(cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1.float_vec)); ++- cosine = _mm256_sub_ps(cosine, _mm256_and_ps(_mm256_mul_ps(cosine, _mm256_set1_ps(2.0f)), condition3.float_vec)); ++- _mm256_store_ps(bPtr, cosine); ++- aPtr += 8; ++- bPtr += 8; ++- } ++- ++- number = eighthPoints * 8; ++- for(;number < num_points; number++){ ++- *bPtr++ = cos(*aPtr++); ++- } +++ float* bPtr = bVector; +++ const float* aPtr = aVector; +++ +++ unsigned int number = 0; +++ unsigned int eighthPoints = num_points / 8; +++ unsigned int i = 0; +++ +++ __m256 aVal, s, r, m4pi, pio4A, pio4B, pio4C, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, +++ fones, fzeroes; +++ __m256 sine, cosine; +++ __m256i q, ones, twos, fours; +++ +++ m4pi = _mm256_set1_ps(1.273239544735162542821171882678754627704620361328125); +++ pio4A = _mm256_set1_ps(0.7853981554508209228515625); +++ pio4B = _mm256_set1_ps(0.794662735614792836713604629039764404296875e-8); +++ pio4C = _mm256_set1_ps(0.306161699786838294306516483068750264552437361480769e-16); +++ ffours = _mm256_set1_ps(4.0); +++ ftwos = _mm256_set1_ps(2.0); +++ fones = _mm256_set1_ps(1.0); +++ fzeroes = _mm256_setzero_ps(); +++ __m256i zeroes = _mm256_set1_epi32(0); +++ ones = _mm256_set1_epi32(1); +++ __m256i allones = _mm256_set1_epi32(0xffffffff); +++ twos = _mm256_set1_epi32(2); +++ fours = _mm256_set1_epi32(4); +++ +++ cp1 = _mm256_set1_ps(1.0); +++ cp2 = _mm256_set1_ps(0.08333333333333333); +++ cp3 = _mm256_set1_ps(0.002777777777777778); +++ cp4 = _mm256_set1_ps(4.96031746031746e-05); +++ cp5 = _mm256_set1_ps(5.511463844797178e-07); +++ union bit256 condition1; +++ union bit256 condition3; +++ +++ for (; number < eighthPoints; number++) { +++ +++ aVal = _mm256_load_ps(aPtr); +++ // s = fabs(aVal) +++ s = _mm256_sub_ps(aVal, +++ _mm256_and_ps(_mm256_mul_ps(aVal, ftwos), +++ _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS))); +++ // q = (int) (s * (4/pi)), floor(aVal / (pi/4)) +++ q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi))); +++ // r = q + q&1, q 
indicates quadrant, r gives +++ r = _mm256_cvtepi32_ps(_mm256_add_epi32(q, _mm256_and_si256(q, ones))); +++ +++ s = _mm256_fnmadd_ps(r, pio4A, s); +++ s = _mm256_fnmadd_ps(r, pio4B, s); +++ s = _mm256_fnmadd_ps(r, pio4C, s); +++ +++ s = _mm256_div_ps( +++ s, +++ _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction +++ s = _mm256_mul_ps(s, s); +++ // Evaluate Taylor series +++ s = _mm256_mul_ps( +++ _mm256_fmadd_ps( +++ _mm256_fmsub_ps( +++ _mm256_fmadd_ps(_mm256_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2), +++ s, +++ cp1), +++ s); +++ +++ for (i = 0; i < 3; i++) +++ s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s)); +++ s = _mm256_div_ps(s, ftwos); +++ +++ sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s)); +++ cosine = _mm256_sub_ps(fones, s); +++ +++ // if(((q+1)&2) != 0) { cosine=sine;} +++ condition1.int_vec = +++ _mm256_cmpeq_epi32(_mm256_and_si256(_mm256_add_epi32(q, ones), twos), zeroes); +++ condition1.int_vec = _mm256_xor_si256(allones, condition1.int_vec); +++ +++ // if(((q+2)&4) != 0) { cosine = -cosine;} +++ condition3.int_vec = _mm256_cmpeq_epi32( +++ _mm256_and_si256(_mm256_add_epi32(q, twos), fours), zeroes); +++ condition3.int_vec = _mm256_xor_si256(allones, condition3.int_vec); +++ +++ cosine = _mm256_add_ps( +++ cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1.float_vec)); +++ cosine = _mm256_sub_ps(cosine, +++ _mm256_and_ps(_mm256_mul_ps(cosine, _mm256_set1_ps(2.0f)), +++ condition3.float_vec)); +++ _mm256_store_ps(bPtr, cosine); +++ aPtr += 8; +++ bPtr += 8; +++ } +++ +++ number = eighthPoints * 8; +++ for (; number < num_points; number++) { +++ *bPtr++ = cos(*aPtr++); +++ } ++ } ++ ++ #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for aligned */ ++@@ -168,86 +184,109 @@ static inline void ++ #include ++ ++ static inline void ++- volk_32f_cos_32f_a_avx2(float* bVector, const float* aVector, unsigned int num_points) +++volk_32f_cos_32f_a_avx2(float* bVector, const float* aVector, unsigned int num_points) ++ { ++- float* bPtr = bVector; ++- const float* aPtr = aVector; ++- ++- unsigned int number = 0; ++- unsigned int eighthPoints = num_points / 8; ++- unsigned int i = 0; ++- ++- __m256 aVal, s, r, m4pi, pio4A, pio4B, pio4C, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, fzeroes; ++- __m256 sine, cosine; ++- __m256i q, ones, twos, fours; ++- ++- m4pi = _mm256_set1_ps(1.273239544735162542821171882678754627704620361328125); ++- pio4A = _mm256_set1_ps(0.7853981554508209228515625); ++- pio4B = _mm256_set1_ps(0.794662735614792836713604629039764404296875e-8); ++- pio4C = _mm256_set1_ps(0.306161699786838294306516483068750264552437361480769e-16); ++- ffours = _mm256_set1_ps(4.0); ++- ftwos = _mm256_set1_ps(2.0); ++- fones = _mm256_set1_ps(1.0); ++- fzeroes = _mm256_setzero_ps(); ++- __m256i zeroes = _mm256_set1_epi32(0); ++- ones = _mm256_set1_epi32(1); ++- __m256i allones = _mm256_set1_epi32(0xffffffff); ++- twos = _mm256_set1_epi32(2); ++- fours = _mm256_set1_epi32(4); ++- ++- cp1 = _mm256_set1_ps(1.0); ++- cp2 = _mm256_set1_ps(0.08333333333333333); ++- cp3 = _mm256_set1_ps(0.002777777777777778); ++- cp4 = _mm256_set1_ps(4.96031746031746e-05); ++- cp5 = _mm256_set1_ps(5.511463844797178e-07); ++- union bit256 condition1; ++- union bit256 condition3; ++- ++- for(;number < eighthPoints; number++){ ++- ++- aVal = _mm256_load_ps(aPtr); ++- // s = fabs(aVal) ++- s = _mm256_sub_ps(aVal, _mm256_and_ps(_mm256_mul_ps(aVal, ftwos), _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS))); ++- // q = (int) (s * (4/pi)), floor(aVal / (pi/4)) ++- q = 
_mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi))); ++- // r = q + q&1, q indicates quadrant, r gives ++- r = _mm256_cvtepi32_ps(_mm256_add_epi32(q, _mm256_and_si256(q, ones))); ++- ++- s = _mm256_sub_ps(s, _mm256_mul_ps(r,pio4A)); ++- s = _mm256_sub_ps(s, _mm256_mul_ps(r,pio4B)); ++- s = _mm256_sub_ps(s, _mm256_mul_ps(r,pio4C)); ++- ++- s = _mm256_div_ps(s, _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction ++- s = _mm256_mul_ps(s, s); ++- // Evaluate Taylor series ++- s = _mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(s, cp5), cp4), s), cp3), s), cp2), s), cp1), s); ++- ++- for(i = 0; i < 3; i++) ++- s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s)); ++- s = _mm256_div_ps(s, ftwos); ++- ++- sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s)); ++- cosine = _mm256_sub_ps(fones, s); ++- ++- // if(((q+1)&2) != 0) { cosine=sine;} ++- condition1.int_vec = _mm256_cmpeq_epi32(_mm256_and_si256(_mm256_add_epi32(q, ones), twos), zeroes); ++- condition1.int_vec = _mm256_xor_si256(allones, condition1.int_vec); ++- ++- // if(((q+2)&4) != 0) { cosine = -cosine;} ++- condition3.int_vec = _mm256_cmpeq_epi32(_mm256_and_si256(_mm256_add_epi32(q, twos), fours), zeroes); ++- condition3.int_vec = _mm256_xor_si256(allones, condition3.int_vec); ++- ++- cosine = _mm256_add_ps(cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1.float_vec)); ++- cosine = _mm256_sub_ps(cosine, _mm256_and_ps(_mm256_mul_ps(cosine, _mm256_set1_ps(2.0f)), condition3.float_vec)); ++- _mm256_store_ps(bPtr, cosine); ++- aPtr += 8; ++- bPtr += 8; ++- } ++- ++- number = eighthPoints * 8; ++- for(;number < num_points; number++){ ++- *bPtr++ = cos(*aPtr++); ++- } +++ float* bPtr = bVector; +++ const float* aPtr = aVector; +++ +++ unsigned int number = 0; +++ unsigned int eighthPoints = num_points / 8; +++ unsigned int i = 0; +++ +++ __m256 aVal, s, r, m4pi, pio4A, pio4B, pio4C, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, +++ fones, fzeroes; +++ __m256 sine, cosine; +++ __m256i q, ones, twos, fours; +++ +++ m4pi = _mm256_set1_ps(1.273239544735162542821171882678754627704620361328125); +++ pio4A = _mm256_set1_ps(0.7853981554508209228515625); +++ pio4B = _mm256_set1_ps(0.794662735614792836713604629039764404296875e-8); +++ pio4C = _mm256_set1_ps(0.306161699786838294306516483068750264552437361480769e-16); +++ ffours = _mm256_set1_ps(4.0); +++ ftwos = _mm256_set1_ps(2.0); +++ fones = _mm256_set1_ps(1.0); +++ fzeroes = _mm256_setzero_ps(); +++ __m256i zeroes = _mm256_set1_epi32(0); +++ ones = _mm256_set1_epi32(1); +++ __m256i allones = _mm256_set1_epi32(0xffffffff); +++ twos = _mm256_set1_epi32(2); +++ fours = _mm256_set1_epi32(4); +++ +++ cp1 = _mm256_set1_ps(1.0); +++ cp2 = _mm256_set1_ps(0.08333333333333333); +++ cp3 = _mm256_set1_ps(0.002777777777777778); +++ cp4 = _mm256_set1_ps(4.96031746031746e-05); +++ cp5 = _mm256_set1_ps(5.511463844797178e-07); +++ union bit256 condition1; +++ union bit256 condition3; +++ +++ for (; number < eighthPoints; number++) { +++ +++ aVal = _mm256_load_ps(aPtr); +++ // s = fabs(aVal) +++ s = _mm256_sub_ps(aVal, +++ _mm256_and_ps(_mm256_mul_ps(aVal, ftwos), +++ _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS))); +++ // q = (int) (s * (4/pi)), floor(aVal / (pi/4)) +++ q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi))); +++ // r = q + q&1, q indicates quadrant, r gives +++ r = _mm256_cvtepi32_ps(_mm256_add_epi32(q, _mm256_and_si256(q, ones))); +++ +++ s = _mm256_sub_ps(s, 
_mm256_mul_ps(r, pio4A)); +++ s = _mm256_sub_ps(s, _mm256_mul_ps(r, pio4B)); +++ s = _mm256_sub_ps(s, _mm256_mul_ps(r, pio4C)); +++ +++ s = _mm256_div_ps( +++ s, +++ _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction +++ s = _mm256_mul_ps(s, s); +++ // Evaluate Taylor series +++ s = _mm256_mul_ps( +++ _mm256_add_ps( +++ _mm256_mul_ps( +++ _mm256_sub_ps( +++ _mm256_mul_ps( +++ _mm256_add_ps( +++ _mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(s, cp5), cp4), +++ s), +++ cp3), +++ s), +++ cp2), +++ s), +++ cp1), +++ s); +++ +++ for (i = 0; i < 3; i++) +++ s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s)); +++ s = _mm256_div_ps(s, ftwos); +++ +++ sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s)); +++ cosine = _mm256_sub_ps(fones, s); +++ +++ // if(((q+1)&2) != 0) { cosine=sine;} +++ condition1.int_vec = +++ _mm256_cmpeq_epi32(_mm256_and_si256(_mm256_add_epi32(q, ones), twos), zeroes); +++ condition1.int_vec = _mm256_xor_si256(allones, condition1.int_vec); +++ +++ // if(((q+2)&4) != 0) { cosine = -cosine;} +++ condition3.int_vec = _mm256_cmpeq_epi32( +++ _mm256_and_si256(_mm256_add_epi32(q, twos), fours), zeroes); +++ condition3.int_vec = _mm256_xor_si256(allones, condition3.int_vec); +++ +++ cosine = _mm256_add_ps( +++ cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1.float_vec)); +++ cosine = _mm256_sub_ps(cosine, +++ _mm256_and_ps(_mm256_mul_ps(cosine, _mm256_set1_ps(2.0f)), +++ condition3.float_vec)); +++ _mm256_store_ps(bPtr, cosine); +++ aPtr += 8; +++ bPtr += 8; +++ } +++ +++ number = eighthPoints * 8; +++ for (; number < num_points; number++) { +++ *bPtr++ = cos(*aPtr++); +++ } ++ } ++ ++ #endif /* LV_HAVE_AVX2 for aligned */ ++@@ -256,86 +295,105 @@ static inline void ++ #include ++ ++ static inline void ++- volk_32f_cos_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int num_points) +++volk_32f_cos_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int num_points) ++ { ++- float* bPtr = bVector; ++- const float* aPtr = aVector; ++- ++- unsigned int number = 0; ++- unsigned int quarterPoints = num_points / 4; ++- unsigned int i = 0; ++- ++- __m128 aVal, s, r, m4pi, pio4A, pio4B, pio4C, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, fzeroes; ++- __m128 sine, cosine; ++- __m128i q, ones, twos, fours; ++- ++- m4pi = _mm_set1_ps(1.273239544735162542821171882678754627704620361328125); ++- pio4A = _mm_set1_ps(0.7853981554508209228515625); ++- pio4B = _mm_set1_ps(0.794662735614792836713604629039764404296875e-8); ++- pio4C = _mm_set1_ps(0.306161699786838294306516483068750264552437361480769e-16); ++- ffours = _mm_set1_ps(4.0); ++- ftwos = _mm_set1_ps(2.0); ++- fones = _mm_set1_ps(1.0); ++- fzeroes = _mm_setzero_ps(); ++- __m128i zeroes = _mm_set1_epi32(0); ++- ones = _mm_set1_epi32(1); ++- __m128i allones = _mm_set1_epi32(0xffffffff); ++- twos = _mm_set1_epi32(2); ++- fours = _mm_set1_epi32(4); ++- ++- cp1 = _mm_set1_ps(1.0); ++- cp2 = _mm_set1_ps(0.08333333333333333); ++- cp3 = _mm_set1_ps(0.002777777777777778); ++- cp4 = _mm_set1_ps(4.96031746031746e-05); ++- cp5 = _mm_set1_ps(5.511463844797178e-07); ++- union bit128 condition1; ++- union bit128 condition3; ++- ++- for(;number < quarterPoints; number++){ ++- ++- aVal = _mm_load_ps(aPtr); ++- // s = fabs(aVal) ++- s = _mm_sub_ps(aVal, _mm_and_ps(_mm_mul_ps(aVal, ftwos), _mm_cmplt_ps(aVal, fzeroes))); ++- // q = (int) (s * (4/pi)), floor(aVal / (pi/4)) ++- q = _mm_cvtps_epi32(_mm_floor_ps(_mm_mul_ps(s, m4pi))); ++- // r = q + q&1, q indicates quadrant, r gives ++- r = 
_mm_cvtepi32_ps(_mm_add_epi32(q, _mm_and_si128(q, ones))); ++- ++- s = _mm_sub_ps(s, _mm_mul_ps(r, pio4A)); ++- s = _mm_sub_ps(s, _mm_mul_ps(r, pio4B)); ++- s = _mm_sub_ps(s, _mm_mul_ps(r, pio4C)); ++- ++- s = _mm_div_ps(s, _mm_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction ++- s = _mm_mul_ps(s, s); ++- // Evaluate Taylor series ++- s = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s, cp5), cp4), s), cp3), s), cp2), s), cp1), s); ++- ++- for(i = 0; i < 3; i++) ++- s = _mm_mul_ps(s, _mm_sub_ps(ffours, s)); ++- s = _mm_div_ps(s, ftwos); ++- ++- sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s)); ++- cosine = _mm_sub_ps(fones, s); ++- ++- // if(((q+1)&2) != 0) { cosine=sine;} ++- condition1.int_vec = _mm_cmpeq_epi32(_mm_and_si128(_mm_add_epi32(q, ones), twos), zeroes); ++- condition1.int_vec = _mm_xor_si128(allones, condition1.int_vec); ++- ++- // if(((q+2)&4) != 0) { cosine = -cosine;} ++- condition3.int_vec = _mm_cmpeq_epi32(_mm_and_si128(_mm_add_epi32(q, twos), fours), zeroes); ++- condition3.int_vec = _mm_xor_si128(allones, condition3.int_vec); ++- ++- cosine = _mm_add_ps(cosine, _mm_and_ps(_mm_sub_ps(sine, cosine), condition1.float_vec)); ++- cosine = _mm_sub_ps(cosine, _mm_and_ps(_mm_mul_ps(cosine, _mm_set1_ps(2.0f)), condition3.float_vec)); ++- _mm_store_ps(bPtr, cosine); ++- aPtr += 4; ++- bPtr += 4; ++- } ++- ++- number = quarterPoints * 4; ++- for(;number < num_points; number++){ ++- *bPtr++ = cosf(*aPtr++); ++- } +++ float* bPtr = bVector; +++ const float* aPtr = aVector; +++ +++ unsigned int number = 0; +++ unsigned int quarterPoints = num_points / 4; +++ unsigned int i = 0; +++ +++ __m128 aVal, s, r, m4pi, pio4A, pio4B, pio4C, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, +++ fones, fzeroes; +++ __m128 sine, cosine; +++ __m128i q, ones, twos, fours; +++ +++ m4pi = _mm_set1_ps(1.273239544735162542821171882678754627704620361328125); +++ pio4A = _mm_set1_ps(0.7853981554508209228515625); +++ pio4B = _mm_set1_ps(0.794662735614792836713604629039764404296875e-8); +++ pio4C = _mm_set1_ps(0.306161699786838294306516483068750264552437361480769e-16); +++ ffours = _mm_set1_ps(4.0); +++ ftwos = _mm_set1_ps(2.0); +++ fones = _mm_set1_ps(1.0); +++ fzeroes = _mm_setzero_ps(); +++ __m128i zeroes = _mm_set1_epi32(0); +++ ones = _mm_set1_epi32(1); +++ __m128i allones = _mm_set1_epi32(0xffffffff); +++ twos = _mm_set1_epi32(2); +++ fours = _mm_set1_epi32(4); +++ +++ cp1 = _mm_set1_ps(1.0); +++ cp2 = _mm_set1_ps(0.08333333333333333); +++ cp3 = _mm_set1_ps(0.002777777777777778); +++ cp4 = _mm_set1_ps(4.96031746031746e-05); +++ cp5 = _mm_set1_ps(5.511463844797178e-07); +++ union bit128 condition1; +++ union bit128 condition3; +++ +++ for (; number < quarterPoints; number++) { +++ +++ aVal = _mm_load_ps(aPtr); +++ // s = fabs(aVal) +++ s = _mm_sub_ps(aVal, +++ _mm_and_ps(_mm_mul_ps(aVal, ftwos), _mm_cmplt_ps(aVal, fzeroes))); +++ // q = (int) (s * (4/pi)), floor(aVal / (pi/4)) +++ q = _mm_cvtps_epi32(_mm_floor_ps(_mm_mul_ps(s, m4pi))); +++ // r = q + q&1, q indicates quadrant, r gives +++ r = _mm_cvtepi32_ps(_mm_add_epi32(q, _mm_and_si128(q, ones))); +++ +++ s = _mm_sub_ps(s, _mm_mul_ps(r, pio4A)); +++ s = _mm_sub_ps(s, _mm_mul_ps(r, pio4B)); +++ s = _mm_sub_ps(s, _mm_mul_ps(r, pio4C)); +++ +++ s = _mm_div_ps( +++ s, _mm_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction +++ s = _mm_mul_ps(s, s); +++ // Evaluate Taylor series +++ s = _mm_mul_ps( +++ _mm_add_ps( +++ _mm_mul_ps( +++ _mm_sub_ps( +++ _mm_mul_ps( +++ 
_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s, cp5), cp4), s), +++ cp3), +++ s), +++ cp2), +++ s), +++ cp1), +++ s); +++ +++ for (i = 0; i < 3; i++) +++ s = _mm_mul_ps(s, _mm_sub_ps(ffours, s)); +++ s = _mm_div_ps(s, ftwos); +++ +++ sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s)); +++ cosine = _mm_sub_ps(fones, s); +++ +++ // if(((q+1)&2) != 0) { cosine=sine;} +++ condition1.int_vec = +++ _mm_cmpeq_epi32(_mm_and_si128(_mm_add_epi32(q, ones), twos), zeroes); +++ condition1.int_vec = _mm_xor_si128(allones, condition1.int_vec); +++ +++ // if(((q+2)&4) != 0) { cosine = -cosine;} +++ condition3.int_vec = +++ _mm_cmpeq_epi32(_mm_and_si128(_mm_add_epi32(q, twos), fours), zeroes); +++ condition3.int_vec = _mm_xor_si128(allones, condition3.int_vec); +++ +++ cosine = _mm_add_ps(cosine, +++ _mm_and_ps(_mm_sub_ps(sine, cosine), condition1.float_vec)); +++ cosine = _mm_sub_ps( +++ cosine, +++ _mm_and_ps(_mm_mul_ps(cosine, _mm_set1_ps(2.0f)), condition3.float_vec)); +++ _mm_store_ps(bPtr, cosine); +++ aPtr += 4; +++ bPtr += 4; +++ } +++ +++ number = quarterPoints * 4; +++ for (; number < num_points; number++) { +++ *bPtr++ = cosf(*aPtr++); +++ } ++ } ++ ++ #endif /* LV_HAVE_SSE4_1 for aligned */ ++@@ -343,7 +401,6 @@ static inline void ++ #endif /* INCLUDED_volk_32f_cos_32f_a_H */ ++ ++ ++- ++ #ifndef INCLUDED_volk_32f_cos_32f_u_H ++ #define INCLUDED_volk_32f_cos_32f_u_H ++ ++@@ -351,86 +408,102 @@ static inline void ++ #include ++ ++ static inline void ++- volk_32f_cos_32f_u_avx2_fma(float* bVector, const float* aVector, unsigned int num_points) +++volk_32f_cos_32f_u_avx2_fma(float* bVector, const float* aVector, unsigned int num_points) ++ { ++- float* bPtr = bVector; ++- const float* aPtr = aVector; ++- ++- unsigned int number = 0; ++- unsigned int eighthPoints = num_points / 8; ++- unsigned int i = 0; ++- ++- __m256 aVal, s, r, m4pi, pio4A, pio4B, pio4C, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, fzeroes; ++- __m256 sine, cosine; ++- __m256i q, ones, twos, fours; ++- ++- m4pi = _mm256_set1_ps(1.273239544735162542821171882678754627704620361328125); ++- pio4A = _mm256_set1_ps(0.7853981554508209228515625); ++- pio4B = _mm256_set1_ps(0.794662735614792836713604629039764404296875e-8); ++- pio4C = _mm256_set1_ps(0.306161699786838294306516483068750264552437361480769e-16); ++- ffours = _mm256_set1_ps(4.0); ++- ftwos = _mm256_set1_ps(2.0); ++- fones = _mm256_set1_ps(1.0); ++- fzeroes = _mm256_setzero_ps(); ++- __m256i zeroes = _mm256_set1_epi32(0); ++- ones = _mm256_set1_epi32(1); ++- __m256i allones = _mm256_set1_epi32(0xffffffff); ++- twos = _mm256_set1_epi32(2); ++- fours = _mm256_set1_epi32(4); ++- ++- cp1 = _mm256_set1_ps(1.0); ++- cp2 = _mm256_set1_ps(0.08333333333333333); ++- cp3 = _mm256_set1_ps(0.002777777777777778); ++- cp4 = _mm256_set1_ps(4.96031746031746e-05); ++- cp5 = _mm256_set1_ps(5.511463844797178e-07); ++- union bit256 condition1; ++- union bit256 condition3; ++- ++- for(;number < eighthPoints; number++){ ++- ++- aVal = _mm256_loadu_ps(aPtr); ++- // s = fabs(aVal) ++- s = _mm256_sub_ps(aVal, _mm256_and_ps(_mm256_mul_ps(aVal, ftwos), _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS))); ++- // q = (int) (s * (4/pi)), floor(aVal / (pi/4)) ++- q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi))); ++- // r = q + q&1, q indicates quadrant, r gives ++- r = _mm256_cvtepi32_ps(_mm256_add_epi32(q, _mm256_and_si256(q, ones))); ++- ++- s = _mm256_fnmadd_ps(r,pio4A,s); ++- s = _mm256_fnmadd_ps(r,pio4B,s); ++- s = _mm256_fnmadd_ps(r,pio4C,s); ++- ++- s = _mm256_div_ps(s, 
_mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction ++- s = _mm256_mul_ps(s, s); ++- // Evaluate Taylor series ++- s = _mm256_mul_ps(_mm256_fmadd_ps(_mm256_fmsub_ps(_mm256_fmadd_ps(_mm256_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2), s, cp1), s); ++- ++- for(i = 0; i < 3; i++) ++- s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s)); ++- s = _mm256_div_ps(s, ftwos); ++- ++- sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s)); ++- cosine = _mm256_sub_ps(fones, s); ++- ++- // if(((q+1)&2) != 0) { cosine=sine;} ++- condition1.int_vec = _mm256_cmpeq_epi32(_mm256_and_si256(_mm256_add_epi32(q, ones), twos), zeroes); ++- condition1.int_vec = _mm256_xor_si256(allones, condition1.int_vec); ++- ++- // if(((q+2)&4) != 0) { cosine = -cosine;} ++- condition3.int_vec = _mm256_cmpeq_epi32(_mm256_and_si256(_mm256_add_epi32(q, twos), fours), zeroes); ++- condition3.int_vec = _mm256_xor_si256(allones, condition3.int_vec); ++- ++- cosine = _mm256_add_ps(cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1.float_vec)); ++- cosine = _mm256_sub_ps(cosine, _mm256_and_ps(_mm256_mul_ps(cosine, _mm256_set1_ps(2.0f)), condition3.float_vec)); ++- _mm256_storeu_ps(bPtr, cosine); ++- aPtr += 8; ++- bPtr += 8; ++- } ++- ++- number = eighthPoints * 8; ++- for(;number < num_points; number++){ ++- *bPtr++ = cos(*aPtr++); ++- } +++ float* bPtr = bVector; +++ const float* aPtr = aVector; +++ +++ unsigned int number = 0; +++ unsigned int eighthPoints = num_points / 8; +++ unsigned int i = 0; +++ +++ __m256 aVal, s, r, m4pi, pio4A, pio4B, pio4C, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, +++ fones, fzeroes; +++ __m256 sine, cosine; +++ __m256i q, ones, twos, fours; +++ +++ m4pi = _mm256_set1_ps(1.273239544735162542821171882678754627704620361328125); +++ pio4A = _mm256_set1_ps(0.7853981554508209228515625); +++ pio4B = _mm256_set1_ps(0.794662735614792836713604629039764404296875e-8); +++ pio4C = _mm256_set1_ps(0.306161699786838294306516483068750264552437361480769e-16); +++ ffours = _mm256_set1_ps(4.0); +++ ftwos = _mm256_set1_ps(2.0); +++ fones = _mm256_set1_ps(1.0); +++ fzeroes = _mm256_setzero_ps(); +++ __m256i zeroes = _mm256_set1_epi32(0); +++ ones = _mm256_set1_epi32(1); +++ __m256i allones = _mm256_set1_epi32(0xffffffff); +++ twos = _mm256_set1_epi32(2); +++ fours = _mm256_set1_epi32(4); +++ +++ cp1 = _mm256_set1_ps(1.0); +++ cp2 = _mm256_set1_ps(0.08333333333333333); +++ cp3 = _mm256_set1_ps(0.002777777777777778); +++ cp4 = _mm256_set1_ps(4.96031746031746e-05); +++ cp5 = _mm256_set1_ps(5.511463844797178e-07); +++ union bit256 condition1; +++ union bit256 condition3; +++ +++ for (; number < eighthPoints; number++) { +++ +++ aVal = _mm256_loadu_ps(aPtr); +++ // s = fabs(aVal) +++ s = _mm256_sub_ps(aVal, +++ _mm256_and_ps(_mm256_mul_ps(aVal, ftwos), +++ _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS))); +++ // q = (int) (s * (4/pi)), floor(aVal / (pi/4)) +++ q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi))); +++ // r = q + q&1, q indicates quadrant, r gives +++ r = _mm256_cvtepi32_ps(_mm256_add_epi32(q, _mm256_and_si256(q, ones))); +++ +++ s = _mm256_fnmadd_ps(r, pio4A, s); +++ s = _mm256_fnmadd_ps(r, pio4B, s); +++ s = _mm256_fnmadd_ps(r, pio4C, s); +++ +++ s = _mm256_div_ps( +++ s, +++ _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction +++ s = _mm256_mul_ps(s, s); +++ // Evaluate Taylor series +++ s = _mm256_mul_ps( +++ _mm256_fmadd_ps( +++ _mm256_fmsub_ps( +++ _mm256_fmadd_ps(_mm256_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2), +++ s, +++ cp1), +++ s); +++ +++ 
for (i = 0; i < 3; i++) +++ s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s)); +++ s = _mm256_div_ps(s, ftwos); +++ +++ sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s)); +++ cosine = _mm256_sub_ps(fones, s); +++ +++ // if(((q+1)&2) != 0) { cosine=sine;} +++ condition1.int_vec = +++ _mm256_cmpeq_epi32(_mm256_and_si256(_mm256_add_epi32(q, ones), twos), zeroes); +++ condition1.int_vec = _mm256_xor_si256(allones, condition1.int_vec); +++ +++ // if(((q+2)&4) != 0) { cosine = -cosine;} +++ condition3.int_vec = _mm256_cmpeq_epi32( +++ _mm256_and_si256(_mm256_add_epi32(q, twos), fours), zeroes); +++ condition3.int_vec = _mm256_xor_si256(allones, condition3.int_vec); +++ +++ cosine = _mm256_add_ps( +++ cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1.float_vec)); +++ cosine = _mm256_sub_ps(cosine, +++ _mm256_and_ps(_mm256_mul_ps(cosine, _mm256_set1_ps(2.0f)), +++ condition3.float_vec)); +++ _mm256_storeu_ps(bPtr, cosine); +++ aPtr += 8; +++ bPtr += 8; +++ } +++ +++ number = eighthPoints * 8; +++ for (; number < num_points; number++) { +++ *bPtr++ = cos(*aPtr++); +++ } ++ } ++ ++ #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for unaligned */ ++@@ -439,86 +512,109 @@ static inline void ++ #include ++ ++ static inline void ++- volk_32f_cos_32f_u_avx2(float* bVector, const float* aVector, unsigned int num_points) +++volk_32f_cos_32f_u_avx2(float* bVector, const float* aVector, unsigned int num_points) ++ { ++- float* bPtr = bVector; ++- const float* aPtr = aVector; ++- ++- unsigned int number = 0; ++- unsigned int eighthPoints = num_points / 8; ++- unsigned int i = 0; ++- ++- __m256 aVal, s, r, m4pi, pio4A, pio4B, pio4C, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, fzeroes; ++- __m256 sine, cosine; ++- __m256i q, ones, twos, fours; ++- ++- m4pi = _mm256_set1_ps(1.273239544735162542821171882678754627704620361328125); ++- pio4A = _mm256_set1_ps(0.7853981554508209228515625); ++- pio4B = _mm256_set1_ps(0.794662735614792836713604629039764404296875e-8); ++- pio4C = _mm256_set1_ps(0.306161699786838294306516483068750264552437361480769e-16); ++- ffours = _mm256_set1_ps(4.0); ++- ftwos = _mm256_set1_ps(2.0); ++- fones = _mm256_set1_ps(1.0); ++- fzeroes = _mm256_setzero_ps(); ++- __m256i zeroes = _mm256_set1_epi32(0); ++- ones = _mm256_set1_epi32(1); ++- __m256i allones = _mm256_set1_epi32(0xffffffff); ++- twos = _mm256_set1_epi32(2); ++- fours = _mm256_set1_epi32(4); ++- ++- cp1 = _mm256_set1_ps(1.0); ++- cp2 = _mm256_set1_ps(0.08333333333333333); ++- cp3 = _mm256_set1_ps(0.002777777777777778); ++- cp4 = _mm256_set1_ps(4.96031746031746e-05); ++- cp5 = _mm256_set1_ps(5.511463844797178e-07); ++- union bit256 condition1; ++- union bit256 condition3; ++- ++- for(;number < eighthPoints; number++){ ++- ++- aVal = _mm256_loadu_ps(aPtr); ++- // s = fabs(aVal) ++- s = _mm256_sub_ps(aVal, _mm256_and_ps(_mm256_mul_ps(aVal, ftwos), _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS))); ++- // q = (int) (s * (4/pi)), floor(aVal / (pi/4)) ++- q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi))); ++- // r = q + q&1, q indicates quadrant, r gives ++- r = _mm256_cvtepi32_ps(_mm256_add_epi32(q, _mm256_and_si256(q, ones))); ++- ++- s = _mm256_sub_ps(s, _mm256_mul_ps(r,pio4A)); ++- s = _mm256_sub_ps(s, _mm256_mul_ps(r,pio4B)); ++- s = _mm256_sub_ps(s, _mm256_mul_ps(r,pio4C)); ++- ++- s = _mm256_div_ps(s, _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction ++- s = _mm256_mul_ps(s, s); ++- // Evaluate Taylor series ++- s = 
_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(s, cp5), cp4), s), cp3), s), cp2), s), cp1), s); ++- ++- for(i = 0; i < 3; i++) ++- s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s)); ++- s = _mm256_div_ps(s, ftwos); ++- ++- sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s)); ++- cosine = _mm256_sub_ps(fones, s); ++- ++- // if(((q+1)&2) != 0) { cosine=sine;} ++- condition1.int_vec = _mm256_cmpeq_epi32(_mm256_and_si256(_mm256_add_epi32(q, ones), twos), zeroes); ++- condition1.int_vec = _mm256_xor_si256(allones, condition1.int_vec); ++- ++- // if(((q+2)&4) != 0) { cosine = -cosine;} ++- condition3.int_vec = _mm256_cmpeq_epi32(_mm256_and_si256(_mm256_add_epi32(q, twos), fours), zeroes); ++- condition3.int_vec = _mm256_xor_si256(allones, condition3.int_vec); ++- ++- cosine = _mm256_add_ps(cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1.float_vec)); ++- cosine = _mm256_sub_ps(cosine, _mm256_and_ps(_mm256_mul_ps(cosine, _mm256_set1_ps(2.0f)), condition3.float_vec)); ++- _mm256_storeu_ps(bPtr, cosine); ++- aPtr += 8; ++- bPtr += 8; ++- } ++- ++- number = eighthPoints * 8; ++- for(;number < num_points; number++){ ++- *bPtr++ = cos(*aPtr++); ++- } +++ float* bPtr = bVector; +++ const float* aPtr = aVector; +++ +++ unsigned int number = 0; +++ unsigned int eighthPoints = num_points / 8; +++ unsigned int i = 0; +++ +++ __m256 aVal, s, r, m4pi, pio4A, pio4B, pio4C, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, +++ fones, fzeroes; +++ __m256 sine, cosine; +++ __m256i q, ones, twos, fours; +++ +++ m4pi = _mm256_set1_ps(1.273239544735162542821171882678754627704620361328125); +++ pio4A = _mm256_set1_ps(0.7853981554508209228515625); +++ pio4B = _mm256_set1_ps(0.794662735614792836713604629039764404296875e-8); +++ pio4C = _mm256_set1_ps(0.306161699786838294306516483068750264552437361480769e-16); +++ ffours = _mm256_set1_ps(4.0); +++ ftwos = _mm256_set1_ps(2.0); +++ fones = _mm256_set1_ps(1.0); +++ fzeroes = _mm256_setzero_ps(); +++ __m256i zeroes = _mm256_set1_epi32(0); +++ ones = _mm256_set1_epi32(1); +++ __m256i allones = _mm256_set1_epi32(0xffffffff); +++ twos = _mm256_set1_epi32(2); +++ fours = _mm256_set1_epi32(4); +++ +++ cp1 = _mm256_set1_ps(1.0); +++ cp2 = _mm256_set1_ps(0.08333333333333333); +++ cp3 = _mm256_set1_ps(0.002777777777777778); +++ cp4 = _mm256_set1_ps(4.96031746031746e-05); +++ cp5 = _mm256_set1_ps(5.511463844797178e-07); +++ union bit256 condition1; +++ union bit256 condition3; +++ +++ for (; number < eighthPoints; number++) { +++ +++ aVal = _mm256_loadu_ps(aPtr); +++ // s = fabs(aVal) +++ s = _mm256_sub_ps(aVal, +++ _mm256_and_ps(_mm256_mul_ps(aVal, ftwos), +++ _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS))); +++ // q = (int) (s * (4/pi)), floor(aVal / (pi/4)) +++ q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi))); +++ // r = q + q&1, q indicates quadrant, r gives +++ r = _mm256_cvtepi32_ps(_mm256_add_epi32(q, _mm256_and_si256(q, ones))); +++ +++ s = _mm256_sub_ps(s, _mm256_mul_ps(r, pio4A)); +++ s = _mm256_sub_ps(s, _mm256_mul_ps(r, pio4B)); +++ s = _mm256_sub_ps(s, _mm256_mul_ps(r, pio4C)); +++ +++ s = _mm256_div_ps( +++ s, +++ _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction +++ s = _mm256_mul_ps(s, s); +++ // Evaluate Taylor series +++ s = _mm256_mul_ps( +++ _mm256_add_ps( +++ _mm256_mul_ps( +++ _mm256_sub_ps( +++ _mm256_mul_ps( +++ _mm256_add_ps( +++ _mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(s, cp5), cp4), +++ s), +++ cp3), +++ s), +++ cp2), +++ s), 
+++ cp1), +++ s); +++ +++ for (i = 0; i < 3; i++) +++ s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s)); +++ s = _mm256_div_ps(s, ftwos); +++ +++ sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s)); +++ cosine = _mm256_sub_ps(fones, s); +++ +++ // if(((q+1)&2) != 0) { cosine=sine;} +++ condition1.int_vec = +++ _mm256_cmpeq_epi32(_mm256_and_si256(_mm256_add_epi32(q, ones), twos), zeroes); +++ condition1.int_vec = _mm256_xor_si256(allones, condition1.int_vec); +++ +++ // if(((q+2)&4) != 0) { cosine = -cosine;} +++ condition3.int_vec = _mm256_cmpeq_epi32( +++ _mm256_and_si256(_mm256_add_epi32(q, twos), fours), zeroes); +++ condition3.int_vec = _mm256_xor_si256(allones, condition3.int_vec); +++ +++ cosine = _mm256_add_ps( +++ cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1.float_vec)); +++ cosine = _mm256_sub_ps(cosine, +++ _mm256_and_ps(_mm256_mul_ps(cosine, _mm256_set1_ps(2.0f)), +++ condition3.float_vec)); +++ _mm256_storeu_ps(bPtr, cosine); +++ aPtr += 8; +++ bPtr += 8; +++ } +++ +++ number = eighthPoints * 8; +++ for (; number < num_points; number++) { +++ *bPtr++ = cos(*aPtr++); +++ } ++ } ++ ++ #endif /* LV_HAVE_AVX2 for unaligned */ ++@@ -529,71 +625,88 @@ static inline void ++ static inline void ++ volk_32f_cos_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int num_points) ++ { ++- float* bPtr = bVector; ++- const float* aPtr = aVector; ++- ++- unsigned int number = 0; ++- unsigned int quarterPoints = num_points / 4; ++- unsigned int i = 0; ++- ++- __m128 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, fzeroes; ++- __m128 sine, cosine, condition1, condition3; ++- __m128i q, r, ones, twos, fours; ++- ++- m4pi = _mm_set1_ps(1.273239545); ++- pio4A = _mm_set1_ps(0.78515625); ++- pio4B = _mm_set1_ps(0.241876e-3); ++- ffours = _mm_set1_ps(4.0); ++- ftwos = _mm_set1_ps(2.0); ++- fones = _mm_set1_ps(1.0); ++- fzeroes = _mm_setzero_ps(); ++- ones = _mm_set1_epi32(1); ++- twos = _mm_set1_epi32(2); ++- fours = _mm_set1_epi32(4); ++- ++- cp1 = _mm_set1_ps(1.0); ++- cp2 = _mm_set1_ps(0.83333333e-1); ++- cp3 = _mm_set1_ps(0.2777778e-2); ++- cp4 = _mm_set1_ps(0.49603e-4); ++- cp5 = _mm_set1_ps(0.551e-6); ++- ++- for(;number < quarterPoints; number++){ ++- aVal = _mm_loadu_ps(aPtr); ++- s = _mm_sub_ps(aVal, _mm_and_ps(_mm_mul_ps(aVal, ftwos), _mm_cmplt_ps(aVal, fzeroes))); ++- q = _mm_cvtps_epi32(_mm_floor_ps(_mm_mul_ps(s, m4pi))); ++- r = _mm_add_epi32(q, _mm_and_si128(q, ones)); ++- ++- s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4A)); ++- s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4B)); ++- ++- s = _mm_div_ps(s, _mm_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction ++- s = _mm_mul_ps(s, s); ++- // Evaluate Taylor series ++- s = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s, cp5), cp4), s), cp3), s), cp2), s), cp1), s); ++- ++- for(i = 0; i < 3; i++){ ++- s = _mm_mul_ps(s, _mm_sub_ps(ffours, s)); ++- } ++- s = _mm_div_ps(s, ftwos); ++- ++- sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s)); ++- cosine = _mm_sub_ps(fones, s); ++- ++- condition1 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, ones), twos)), fzeroes); +++ float* bPtr = bVector; +++ const float* aPtr = aVector; ++ ++- condition3 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, twos), fours)), fzeroes); ++- ++- cosine = _mm_add_ps(cosine, _mm_and_ps(_mm_sub_ps(sine, cosine), condition1)); ++- cosine = _mm_sub_ps(cosine, 
_mm_and_ps(_mm_mul_ps(cosine, _mm_set1_ps(2.0f)), condition3)); ++- _mm_storeu_ps(bPtr, cosine); ++- aPtr += 4; ++- bPtr += 4; ++- } +++ unsigned int number = 0; +++ unsigned int quarterPoints = num_points / 4; +++ unsigned int i = 0; +++ +++ __m128 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, +++ fzeroes; +++ __m128 sine, cosine, condition1, condition3; +++ __m128i q, r, ones, twos, fours; +++ +++ m4pi = _mm_set1_ps(1.273239545); +++ pio4A = _mm_set1_ps(0.78515625); +++ pio4B = _mm_set1_ps(0.241876e-3); +++ ffours = _mm_set1_ps(4.0); +++ ftwos = _mm_set1_ps(2.0); +++ fones = _mm_set1_ps(1.0); +++ fzeroes = _mm_setzero_ps(); +++ ones = _mm_set1_epi32(1); +++ twos = _mm_set1_epi32(2); +++ fours = _mm_set1_epi32(4); +++ +++ cp1 = _mm_set1_ps(1.0); +++ cp2 = _mm_set1_ps(0.83333333e-1); +++ cp3 = _mm_set1_ps(0.2777778e-2); +++ cp4 = _mm_set1_ps(0.49603e-4); +++ cp5 = _mm_set1_ps(0.551e-6); +++ +++ for (; number < quarterPoints; number++) { +++ aVal = _mm_loadu_ps(aPtr); +++ s = _mm_sub_ps(aVal, +++ _mm_and_ps(_mm_mul_ps(aVal, ftwos), _mm_cmplt_ps(aVal, fzeroes))); +++ q = _mm_cvtps_epi32(_mm_floor_ps(_mm_mul_ps(s, m4pi))); +++ r = _mm_add_epi32(q, _mm_and_si128(q, ones)); +++ +++ s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4A)); +++ s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4B)); +++ +++ s = _mm_div_ps( +++ s, _mm_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction +++ s = _mm_mul_ps(s, s); +++ // Evaluate Taylor series +++ s = _mm_mul_ps( +++ _mm_add_ps( +++ _mm_mul_ps( +++ _mm_sub_ps( +++ _mm_mul_ps( +++ _mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s, cp5), cp4), s), +++ cp3), +++ s), +++ cp2), +++ s), +++ cp1), +++ s); +++ +++ for (i = 0; i < 3; i++) { +++ s = _mm_mul_ps(s, _mm_sub_ps(ffours, s)); +++ } +++ s = _mm_div_ps(s, ftwos); +++ +++ sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s)); +++ cosine = _mm_sub_ps(fones, s); +++ +++ condition1 = _mm_cmpneq_ps( +++ _mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, ones), twos)), fzeroes); +++ +++ condition3 = _mm_cmpneq_ps( +++ _mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, twos), fours)), fzeroes); +++ +++ cosine = _mm_add_ps(cosine, _mm_and_ps(_mm_sub_ps(sine, cosine), condition1)); +++ cosine = _mm_sub_ps( +++ cosine, _mm_and_ps(_mm_mul_ps(cosine, _mm_set1_ps(2.0f)), condition3)); +++ _mm_storeu_ps(bPtr, cosine); +++ aPtr += 4; +++ bPtr += 4; +++ } ++ ++- number = quarterPoints * 4; ++- for(;number < num_points; number++){ ++- *bPtr++ = cosf(*aPtr++); ++- } +++ number = quarterPoints * 4; +++ for (; number < num_points; number++) { +++ *bPtr++ = cosf(*aPtr++); +++ } ++ } ++ ++ #endif /* LV_HAVE_SSE4_1 for unaligned */ ++@@ -606,52 +719,55 @@ volk_32f_cos_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int num ++ * Shibata, Naoki, "Efficient evaluation methods of elementary functions ++ * suitable for SIMD computation," in Springer-Verlag 2010 ++ */ ++-static inline void ++-volk_32f_cos_32f_generic_fast(float* bVector, const float* aVector, unsigned int num_points) +++static inline void volk_32f_cos_32f_generic_fast(float* bVector, +++ const float* aVector, +++ unsigned int num_points) ++ { ++- float* bPtr = bVector; ++- const float* aPtr = aVector; ++- ++- float m4pi = 1.273239544735162542821171882678754627704620361328125; ++- float pio4A = 0.7853981554508209228515625; ++- float pio4B = 0.794662735614792836713604629039764404296875e-8; ++- float pio4C = 0.306161699786838294306516483068750264552437361480769e-16; ++- int N = 3; // order of argument reduction ++- 
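Note for readers of this hunk: the old and new bodies above, and the generic_fast kernel that follows, all implement the same scheme — Cody-Waite style reduction of the argument against pi/4 split into three floats (pio4A/B/C), a short Taylor series on the reduced angle scaled down by 2^N (N = 3, hence the divide by 8), then N doubling steps. The identities involved, reconstructed from the code in this hunk (the notation is mine, not upstream's):

\[
\begin{aligned}
s &= |x|, \qquad q = \lfloor 4s/\pi \rfloor, \qquad r = q + (q \,\&\, 1), \qquad \theta = s - r\,\tfrac{\pi}{4},\\
u_0 &= 2\bigl(1 - \cos(\theta/2^{N})\bigr) \approx t - \tfrac{t^{2}}{12} + \tfrac{t^{3}}{360} - \tfrac{t^{4}}{20160} + \tfrac{t^{5}}{1814400}, \qquad t = (\theta/2^{N})^{2},\\
u_{k+1} &= (4 - u_{k})\,u_{k} \qquad \text{because } 2(1-\cos 2\varphi) = \bigl(4 - 2(1-\cos\varphi)\bigr)\,2(1-\cos\varphi),\\
\cos\theta &= 1 - \tfrac{u_{N}}{2}, \qquad |\sin\theta| = \sqrt{\bigl(2 - \tfrac{u_{N}}{2}\bigr)\,\tfrac{u_{N}}{2}},
\end{aligned}
\]

after which the low bits of q select the quadrant: ((q+1) & 2) swaps sine and cosine, and ((q+2) & 4) flips the sign, exactly as the code comments in these kernels state.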
++- unsigned int number; ++- for(number = 0; number < num_points; number++){ ++- float s = fabs(*aPtr); ++- int q = (int)(s * m4pi); ++- int r = q + (q&1); ++- s -= r * pio4A; ++- s -= r * pio4B; ++- s -= r * pio4C; ++- ++- s = s * 0.125; // 2^-N (<--3) ++- s = s*s; ++- s = ((((s/1814400. - 1.0/20160.0)*s + 1.0/360.0)*s - 1.0/12.0)*s + 1.0)*s; ++- ++- int i; ++- for(i=0; i < N; ++i) { ++- s = (4.0-s)*s; ++- } ++- s = s/2.0; ++- ++- float sine = sqrt((2.0-s)*s); ++- float cosine = 1-s; ++- ++- if (((q+1) & 2) != 0) { ++- s = cosine; ++- cosine = sine; ++- sine = s; ++- } ++- if (((q+2) & 4) != 0) { ++- cosine = -cosine; ++- } ++- *bPtr = cosine; ++- bPtr++; ++- aPtr++; ++- } +++ float* bPtr = bVector; +++ const float* aPtr = aVector; +++ +++ float m4pi = 1.273239544735162542821171882678754627704620361328125; +++ float pio4A = 0.7853981554508209228515625; +++ float pio4B = 0.794662735614792836713604629039764404296875e-8; +++ float pio4C = 0.306161699786838294306516483068750264552437361480769e-16; +++ int N = 3; // order of argument reduction +++ +++ unsigned int number; +++ for (number = 0; number < num_points; number++) { +++ float s = fabs(*aPtr); +++ int q = (int)(s * m4pi); +++ int r = q + (q & 1); +++ s -= r * pio4A; +++ s -= r * pio4B; +++ s -= r * pio4C; +++ +++ s = s * 0.125; // 2^-N (<--3) +++ s = s * s; +++ s = ((((s / 1814400. - 1.0 / 20160.0) * s + 1.0 / 360.0) * s - 1.0 / 12.0) * s + +++ 1.0) * +++ s; +++ +++ int i; +++ for (i = 0; i < N; ++i) { +++ s = (4.0 - s) * s; +++ } +++ s = s / 2.0; +++ +++ float sine = sqrt((2.0 - s) * s); +++ float cosine = 1 - s; +++ +++ if (((q + 1) & 2) != 0) { +++ s = cosine; +++ cosine = sine; +++ sine = s; +++ } +++ if (((q + 2) & 4) != 0) { +++ cosine = -cosine; +++ } +++ *bPtr = cosine; +++ bPtr++; +++ aPtr++; +++ } ++ } ++ ++ #endif /* LV_HAVE_GENERIC */ ++@@ -662,13 +778,13 @@ volk_32f_cos_32f_generic_fast(float* bVector, const float* aVector, unsigned int ++ static inline void ++ volk_32f_cos_32f_generic(float* bVector, const float* aVector, unsigned int num_points) ++ { ++- float* bPtr = bVector; ++- const float* aPtr = aVector; ++- unsigned int number = 0; +++ float* bPtr = bVector; +++ const float* aPtr = aVector; +++ unsigned int number = 0; ++ ++- for(; number < num_points; number++){ ++- *bPtr++ = cosf(*aPtr++); ++- } +++ for (; number < num_points; number++) { +++ *bPtr++ = cosf(*aPtr++); +++ } ++ } ++ ++ #endif /* LV_HAVE_GENERIC */ ++@@ -679,30 +795,29 @@ volk_32f_cos_32f_generic(float* bVector, const float* aVector, unsigned int num_ ++ #include ++ ++ static inline void ++-volk_32f_cos_32f_neon(float* bVector, const float* aVector, ++- unsigned int num_points) +++volk_32f_cos_32f_neon(float* bVector, const float* aVector, unsigned int num_points) ++ { ++ unsigned int number = 0; ++ unsigned int quarter_points = num_points / 4; ++ float* bVectorPtr = bVector; ++ const float* aVectorPtr = aVector; ++- +++ ++ float32x4_t b_vec; ++ float32x4_t a_vec; ++- ++- for(number = 0; number < quarter_points; number++) { +++ +++ for (number = 0; number < quarter_points; number++) { ++ a_vec = vld1q_f32(aVectorPtr); ++ // Prefetch next one, speeds things up ++- __VOLK_PREFETCH(aVectorPtr+4); +++ __VOLK_PREFETCH(aVectorPtr + 4); ++ b_vec = _vcosq_f32(a_vec); ++ vst1q_f32(bVectorPtr, b_vec); ++ // move pointers ahead ++- bVectorPtr+=4; ++- aVectorPtr+=4; +++ bVectorPtr += 4; +++ aVectorPtr += 4; ++ } ++- +++ ++ // Deal with the rest ++- for(number = quarter_points * 4; number < num_points; number++) { +++ for (number = quarter_points * 4; number < 
num_points; number++) { ++ *bVectorPtr++ = cosf(*aVectorPtr++); ++ } ++ } ++diff --git a/kernels/volk/volk_32f_expfast_32f.h b/kernels/volk/volk_32f_expfast_32f.h ++index ecb4914..45de3f9 100644 ++--- a/kernels/volk/volk_32f_expfast_32f.h +++++ b/kernels/volk/volk_32f_expfast_32f.h ++@@ -30,8 +30,8 @@ ++ * ++ * Dispatcher Prototype ++ * \code ++- * void volk_32f_expfast_32f(float* bVector, const float* aVector, unsigned int num_points) ++- * \endcode +++ * void volk_32f_expfast_32f(float* bVector, const float* aVector, unsigned int +++ * num_points) \endcode ++ * ++ * \b Inputs ++ * \li aVector: Input vector of floats. ++@@ -62,9 +62,9 @@ ++ * \endcode ++ */ ++ ++-#include ++-#include ++ #include +++#include +++#include ++ ++ #define Mln2 0.6931471805f ++ #define A 8388608.0f ++@@ -79,34 +79,35 @@ ++ ++ #include ++ ++-static inline void ++- volk_32f_expfast_32f_a_avx_fma(float* bVector, const float* aVector, unsigned int num_points) +++static inline void volk_32f_expfast_32f_a_avx_fma(float* bVector, +++ const float* aVector, +++ unsigned int num_points) ++ { ++- float* bPtr = bVector; ++- const float* aPtr = aVector; ++- ++- unsigned int number = 0; ++- const unsigned int eighthPoints = num_points / 8; ++- ++- __m256 aVal, bVal, a, b; ++- __m256i exp; ++- a = _mm256_set1_ps(A/Mln2); ++- b = _mm256_set1_ps(B-C); ++- ++- for(;number < eighthPoints; number++){ ++- aVal = _mm256_load_ps(aPtr); ++- exp = _mm256_cvtps_epi32(_mm256_fmadd_ps(a,aVal, b)); ++- bVal = _mm256_castsi256_ps(exp); ++- ++- _mm256_store_ps(bPtr, bVal); ++- aPtr += 8; ++- bPtr += 8; ++- } ++- ++- number = eighthPoints * 8; ++- for(;number < num_points; number++){ ++- *bPtr++ = expf(*aPtr++); ++- } +++ float* bPtr = bVector; +++ const float* aPtr = aVector; +++ +++ unsigned int number = 0; +++ const unsigned int eighthPoints = num_points / 8; +++ +++ __m256 aVal, bVal, a, b; +++ __m256i exp; +++ a = _mm256_set1_ps(A / Mln2); +++ b = _mm256_set1_ps(B - C); +++ +++ for (; number < eighthPoints; number++) { +++ aVal = _mm256_load_ps(aPtr); +++ exp = _mm256_cvtps_epi32(_mm256_fmadd_ps(a, aVal, b)); +++ bVal = _mm256_castsi256_ps(exp); +++ +++ _mm256_store_ps(bPtr, bVal); +++ aPtr += 8; +++ bPtr += 8; +++ } +++ +++ number = eighthPoints * 8; +++ for (; number < num_points; number++) { +++ *bPtr++ = expf(*aPtr++); +++ } ++ } ++ ++ #endif /* LV_HAVE_AVX && LV_HAVE_FMA for aligned */ ++@@ -116,33 +117,33 @@ static inline void ++ #include ++ ++ static inline void ++- volk_32f_expfast_32f_a_avx(float* bVector, const float* aVector, unsigned int num_points) +++volk_32f_expfast_32f_a_avx(float* bVector, const float* aVector, unsigned int num_points) ++ { ++- float* bPtr = bVector; ++- const float* aPtr = aVector; ++- ++- unsigned int number = 0; ++- const unsigned int eighthPoints = num_points / 8; ++- ++- __m256 aVal, bVal, a, b; ++- __m256i exp; ++- a = _mm256_set1_ps(A/Mln2); ++- b = _mm256_set1_ps(B-C); ++- ++- for(;number < eighthPoints; number++){ ++- aVal = _mm256_load_ps(aPtr); ++- exp = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_mul_ps(a,aVal), b)); ++- bVal = _mm256_castsi256_ps(exp); ++- ++- _mm256_store_ps(bPtr, bVal); ++- aPtr += 8; ++- bPtr += 8; ++- } ++- ++- number = eighthPoints * 8; ++- for(;number < num_points; number++){ ++- *bPtr++ = expf(*aPtr++); ++- } +++ float* bPtr = bVector; +++ const float* aPtr = aVector; +++ +++ unsigned int number = 0; +++ const unsigned int eighthPoints = num_points / 8; +++ +++ __m256 aVal, bVal, a, b; +++ __m256i exp; +++ a = _mm256_set1_ps(A / Mln2); +++ b = _mm256_set1_ps(B - C); +++ 
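Note for readers: every expfast variant in this file avoids expf() by synthesizing the IEEE-754 bit pattern of the result. Since exp(x) = 2^(x / ln 2) and a float 2^w is stored with a bit pattern of roughly (w + 127) * 2^23, computing (int)(x * A / Mln2 + (B - C)) and reinterpreting the integer as a float approximates exp(x) (a Schraudolph-style construction). A minimal scalar sketch, assuming only the A and Mln2 values #defined earlier in this header and folding the exponent bias into one constant — it omits the tuned correction C, so it is rougher than these kernels and is an illustration, not upstream code:

#include <stdint.h>
#include <string.h>

static inline float expfast_scalar_sketch(float x)
{
    const float scale = 8388608.0f / 0.6931471805f; /* A / Mln2 = 2^23 / ln 2 */
    const float bias  = 127.0f * 8388608.0f;        /* IEEE-754 exponent bias, shifted by 23 bits */
    /* Build the integer whose float interpretation is approximately exp(x). */
    int32_t i = (int32_t)(scale * x + bias);
    float y;
    memcpy(&y, &i, sizeof(y)); /* type-pun the bits without breaking aliasing rules */
    return y;                  /* usable roughly for |x| < 87; larger inputs overflow the exponent */
}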
+++ for (; number < eighthPoints; number++) { +++ aVal = _mm256_load_ps(aPtr); +++ exp = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_mul_ps(a, aVal), b)); +++ bVal = _mm256_castsi256_ps(exp); +++ +++ _mm256_store_ps(bPtr, bVal); +++ aPtr += 8; +++ bPtr += 8; +++ } +++ +++ number = eighthPoints * 8; +++ for (; number < num_points; number++) { +++ *bPtr++ = expf(*aPtr++); +++ } ++ } ++ ++ #endif /* LV_HAVE_AVX for aligned */ ++@@ -150,34 +151,35 @@ static inline void ++ #ifdef LV_HAVE_SSE4_1 ++ #include ++ ++-static inline void ++-volk_32f_expfast_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int num_points) +++static inline void volk_32f_expfast_32f_a_sse4_1(float* bVector, +++ const float* aVector, +++ unsigned int num_points) ++ { ++- float* bPtr = bVector; ++- const float* aPtr = aVector; ++- ++- unsigned int number = 0; ++- const unsigned int quarterPoints = num_points / 4; ++- ++- __m128 aVal, bVal, a, b; ++- __m128i exp; ++- a = _mm_set1_ps(A/Mln2); ++- b = _mm_set1_ps(B-C); ++- ++- for(;number < quarterPoints; number++){ ++- aVal = _mm_load_ps(aPtr); ++- exp = _mm_cvtps_epi32(_mm_add_ps(_mm_mul_ps(a,aVal), b)); ++- bVal = _mm_castsi128_ps(exp); ++- ++- _mm_store_ps(bPtr, bVal); ++- aPtr += 4; ++- bPtr += 4; ++- } ++- ++- number = quarterPoints * 4; ++- for(;number < num_points; number++){ ++- *bPtr++ = expf(*aPtr++); ++- } +++ float* bPtr = bVector; +++ const float* aPtr = aVector; +++ +++ unsigned int number = 0; +++ const unsigned int quarterPoints = num_points / 4; +++ +++ __m128 aVal, bVal, a, b; +++ __m128i exp; +++ a = _mm_set1_ps(A / Mln2); +++ b = _mm_set1_ps(B - C); +++ +++ for (; number < quarterPoints; number++) { +++ aVal = _mm_load_ps(aPtr); +++ exp = _mm_cvtps_epi32(_mm_add_ps(_mm_mul_ps(a, aVal), b)); +++ bVal = _mm_castsi128_ps(exp); +++ +++ _mm_store_ps(bPtr, bVal); +++ aPtr += 4; +++ bPtr += 4; +++ } +++ +++ number = quarterPoints * 4; +++ for (; number < num_points; number++) { +++ *bPtr++ = expf(*aPtr++); +++ } ++ } ++ ++ #endif /* LV_HAVE_SSE4_1 for aligned */ ++@@ -190,34 +192,35 @@ volk_32f_expfast_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int ++ #if LV_HAVE_AVX && LV_HAVE_FMA ++ #include ++ ++-static inline void ++-volk_32f_expfast_32f_u_avx_fma(float* bVector, const float* aVector, unsigned int num_points) +++static inline void volk_32f_expfast_32f_u_avx_fma(float* bVector, +++ const float* aVector, +++ unsigned int num_points) ++ { ++- float* bPtr = bVector; ++- const float* aPtr = aVector; ++- ++- unsigned int number = 0; ++- const unsigned int eighthPoints = num_points / 8; ++- ++- __m256 aVal, bVal, a, b; ++- __m256i exp; ++- a = _mm256_set1_ps(A/Mln2); ++- b = _mm256_set1_ps(B-C); ++- ++- for(;number < eighthPoints; number++){ ++- aVal = _mm256_loadu_ps(aPtr); ++- exp = _mm256_cvtps_epi32(_mm256_fmadd_ps(a,aVal, b)); ++- bVal = _mm256_castsi256_ps(exp); ++- ++- _mm256_storeu_ps(bPtr, bVal); ++- aPtr += 8; ++- bPtr += 8; ++- } ++- ++- number = eighthPoints * 8; ++- for(;number < num_points; number++){ ++- *bPtr++ = expf(*aPtr++); ++- } +++ float* bPtr = bVector; +++ const float* aPtr = aVector; +++ +++ unsigned int number = 0; +++ const unsigned int eighthPoints = num_points / 8; +++ +++ __m256 aVal, bVal, a, b; +++ __m256i exp; +++ a = _mm256_set1_ps(A / Mln2); +++ b = _mm256_set1_ps(B - C); +++ +++ for (; number < eighthPoints; number++) { +++ aVal = _mm256_loadu_ps(aPtr); +++ exp = _mm256_cvtps_epi32(_mm256_fmadd_ps(a, aVal, b)); +++ bVal = _mm256_castsi256_ps(exp); +++ +++ _mm256_storeu_ps(bPtr, bVal); +++ aPtr += 8; +++ 
bPtr += 8; +++ } +++ +++ number = eighthPoints * 8; +++ for (; number < num_points; number++) { +++ *bPtr++ = expf(*aPtr++); +++ } ++ } ++ ++ #endif /* LV_HAVE_AVX && LV_HAVE_FMA for unaligned */ ++@@ -228,31 +231,31 @@ volk_32f_expfast_32f_u_avx_fma(float* bVector, const float* aVector, unsigned in ++ static inline void ++ volk_32f_expfast_32f_u_avx(float* bVector, const float* aVector, unsigned int num_points) ++ { ++- float* bPtr = bVector; ++- const float* aPtr = aVector; ++- ++- unsigned int number = 0; ++- const unsigned int eighthPoints = num_points / 8; ++- ++- __m256 aVal, bVal, a, b; ++- __m256i exp; ++- a = _mm256_set1_ps(A/Mln2); ++- b = _mm256_set1_ps(B-C); ++- ++- for(;number < eighthPoints; number++){ ++- aVal = _mm256_loadu_ps(aPtr); ++- exp = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_mul_ps(a,aVal), b)); ++- bVal = _mm256_castsi256_ps(exp); ++- ++- _mm256_storeu_ps(bPtr, bVal); ++- aPtr += 8; ++- bPtr += 8; ++- } ++- ++- number = eighthPoints * 8; ++- for(;number < num_points; number++){ ++- *bPtr++ = expf(*aPtr++); ++- } +++ float* bPtr = bVector; +++ const float* aPtr = aVector; +++ +++ unsigned int number = 0; +++ const unsigned int eighthPoints = num_points / 8; +++ +++ __m256 aVal, bVal, a, b; +++ __m256i exp; +++ a = _mm256_set1_ps(A / Mln2); +++ b = _mm256_set1_ps(B - C); +++ +++ for (; number < eighthPoints; number++) { +++ aVal = _mm256_loadu_ps(aPtr); +++ exp = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_mul_ps(a, aVal), b)); +++ bVal = _mm256_castsi256_ps(exp); +++ +++ _mm256_storeu_ps(bPtr, bVal); +++ aPtr += 8; +++ bPtr += 8; +++ } +++ +++ number = eighthPoints * 8; +++ for (; number < num_points; number++) { +++ *bPtr++ = expf(*aPtr++); +++ } ++ } ++ ++ #endif /* LV_HAVE_AVX for unaligned */ ++@@ -261,34 +264,35 @@ volk_32f_expfast_32f_u_avx(float* bVector, const float* aVector, unsigned int nu ++ #ifdef LV_HAVE_SSE4_1 ++ #include ++ ++-static inline void ++-volk_32f_expfast_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int num_points) +++static inline void volk_32f_expfast_32f_u_sse4_1(float* bVector, +++ const float* aVector, +++ unsigned int num_points) ++ { ++- float* bPtr = bVector; ++- const float* aPtr = aVector; ++- ++- unsigned int number = 0; ++- const unsigned int quarterPoints = num_points / 4; ++- ++- __m128 aVal, bVal, a, b; ++- __m128i exp; ++- a = _mm_set1_ps(A/Mln2); ++- b = _mm_set1_ps(B-C); ++- ++- for(;number < quarterPoints; number++){ ++- aVal = _mm_loadu_ps(aPtr); ++- exp = _mm_cvtps_epi32(_mm_add_ps(_mm_mul_ps(a,aVal), b)); ++- bVal = _mm_castsi128_ps(exp); ++- ++- _mm_storeu_ps(bPtr, bVal); ++- aPtr += 4; ++- bPtr += 4; ++- } ++- ++- number = quarterPoints * 4; ++- for(;number < num_points; number++){ ++- *bPtr++ = expf(*aPtr++); ++- } +++ float* bPtr = bVector; +++ const float* aPtr = aVector; +++ +++ unsigned int number = 0; +++ const unsigned int quarterPoints = num_points / 4; +++ +++ __m128 aVal, bVal, a, b; +++ __m128i exp; +++ a = _mm_set1_ps(A / Mln2); +++ b = _mm_set1_ps(B - C); +++ +++ for (; number < quarterPoints; number++) { +++ aVal = _mm_loadu_ps(aPtr); +++ exp = _mm_cvtps_epi32(_mm_add_ps(_mm_mul_ps(a, aVal), b)); +++ bVal = _mm_castsi128_ps(exp); +++ +++ _mm_storeu_ps(bPtr, bVal); +++ aPtr += 4; +++ bPtr += 4; +++ } +++ +++ number = quarterPoints * 4; +++ for (; number < num_points; number++) { +++ *bPtr++ = expf(*aPtr++); +++ } ++ } ++ ++ #endif /* LV_HAVE_SSE4_1 for unaligned */ ++@@ -296,16 +300,17 @@ volk_32f_expfast_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int ++ ++ #ifdef 
LV_HAVE_GENERIC ++ ++-static inline void ++-volk_32f_expfast_32f_generic(float* bVector, const float* aVector, unsigned int num_points) +++static inline void volk_32f_expfast_32f_generic(float* bVector, +++ const float* aVector, +++ unsigned int num_points) ++ { ++- float* bPtr = bVector; ++- const float* aPtr = aVector; ++- unsigned int number = 0; +++ float* bPtr = bVector; +++ const float* aPtr = aVector; +++ unsigned int number = 0; ++ ++- for(number = 0; number < num_points; number++){ ++- *bPtr++ = expf(*aPtr++); ++- } +++ for (number = 0; number < num_points; number++) { +++ *bPtr++ = expf(*aPtr++); +++ } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++diff --git a/kernels/volk/volk_32f_index_max_16u.h b/kernels/volk/volk_32f_index_max_16u.h ++index 7ca6928..3ee10f4 100644 ++--- a/kernels/volk/volk_32f_index_max_16u.h +++++ b/kernels/volk/volk_32f_index_max_16u.h ++@@ -71,72 +71,71 @@ ++ #ifndef INCLUDED_volk_32f_index_max_16u_a_H ++ #define INCLUDED_volk_32f_index_max_16u_a_H ++ ++-#include ++-#include ++ #include ++ #include ++ #include +++#include ++ ++ #ifdef LV_HAVE_AVX ++ #include ++ ++ static inline void ++-volk_32f_index_max_16u_a_avx(uint16_t* target, const float* src0, ++- uint32_t num_points) +++volk_32f_index_max_16u_a_avx(uint16_t* target, const float* src0, uint32_t num_points) ++ { ++- num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points; ++- ++- uint32_t number = 0; ++- const uint32_t eighthPoints = num_points / 8; +++ num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points; ++ ++- float* inputPtr = (float*)src0; +++ uint32_t number = 0; +++ const uint32_t eighthPoints = num_points / 8; ++ ++- __m256 indexIncrementValues = _mm256_set1_ps(8); ++- __m256 currentIndexes = _mm256_set_ps(-1,-2,-3,-4,-5,-6,-7,-8); +++ float* inputPtr = (float*)src0; ++ ++- float max = src0[0]; ++- float index = 0; ++- __m256 maxValues = _mm256_set1_ps(max); ++- __m256 maxValuesIndex = _mm256_setzero_ps(); ++- __m256 compareResults; ++- __m256 currentValues; +++ __m256 indexIncrementValues = _mm256_set1_ps(8); +++ __m256 currentIndexes = _mm256_set_ps(-1, -2, -3, -4, -5, -6, -7, -8); ++ ++- __VOLK_ATTR_ALIGNED(32) float maxValuesBuffer[8]; ++- __VOLK_ATTR_ALIGNED(32) float maxIndexesBuffer[8]; +++ float max = src0[0]; +++ float index = 0; +++ __m256 maxValues = _mm256_set1_ps(max); +++ __m256 maxValuesIndex = _mm256_setzero_ps(); +++ __m256 compareResults; +++ __m256 currentValues; ++ ++- for(;number < eighthPoints; number++){ +++ __VOLK_ATTR_ALIGNED(32) float maxValuesBuffer[8]; +++ __VOLK_ATTR_ALIGNED(32) float maxIndexesBuffer[8]; ++ ++- currentValues = _mm256_load_ps(inputPtr); inputPtr += 8; ++- currentIndexes = _mm256_add_ps(currentIndexes, indexIncrementValues); +++ for (; number < eighthPoints; number++) { ++ ++- compareResults = _mm256_cmp_ps(currentValues, maxValues, _CMP_GT_OS); +++ currentValues = _mm256_load_ps(inputPtr); +++ inputPtr += 8; +++ currentIndexes = _mm256_add_ps(currentIndexes, indexIncrementValues); ++ ++- maxValuesIndex = _mm256_blendv_ps(maxValuesIndex, currentIndexes, compareResults); ++- maxValues = _mm256_blendv_ps(maxValues, currentValues, compareResults); ++- } +++ compareResults = _mm256_cmp_ps(currentValues, maxValues, _CMP_GT_OS); ++ ++- // Calculate the largest value from the remaining 4 points ++- _mm256_store_ps(maxValuesBuffer, maxValues); ++- _mm256_store_ps(maxIndexesBuffer, maxValuesIndex); +++ maxValuesIndex = _mm256_blendv_ps(maxValuesIndex, currentIndexes, compareResults); +++ maxValues = _mm256_blendv_ps(maxValues, currentValues, 
compareResults); +++ } ++ ++- for(number = 0; number < 8; number++){ ++- if(maxValuesBuffer[number] > max){ ++- index = maxIndexesBuffer[number]; ++- max = maxValuesBuffer[number]; ++- } else if(maxValuesBuffer[number] == max){ ++- if (index > maxIndexesBuffer[number]) ++- index = maxIndexesBuffer[number]; +++ // Calculate the largest value from the remaining 4 points +++ _mm256_store_ps(maxValuesBuffer, maxValues); +++ _mm256_store_ps(maxIndexesBuffer, maxValuesIndex); +++ +++ for (number = 0; number < 8; number++) { +++ if (maxValuesBuffer[number] > max) { +++ index = maxIndexesBuffer[number]; +++ max = maxValuesBuffer[number]; +++ } else if (maxValuesBuffer[number] == max) { +++ if (index > maxIndexesBuffer[number]) +++ index = maxIndexesBuffer[number]; +++ } ++ } ++- } ++ ++- number = eighthPoints * 8; ++- for(;number < num_points; number++){ ++- if(src0[number] > max){ ++- index = number; ++- max = src0[number]; +++ number = eighthPoints * 8; +++ for (; number < num_points; number++) { +++ if (src0[number] > max) { +++ index = number; +++ max = src0[number]; +++ } ++ } ++- } ++- target[0] = (uint16_t)index; +++ target[0] = (uint16_t)index; ++ } ++ ++ #endif /*LV_HAVE_AVX*/ ++@@ -145,62 +144,62 @@ volk_32f_index_max_16u_a_avx(uint16_t* target, const float* src0, ++ #include ++ ++ static inline void ++-volk_32f_index_max_16u_a_sse4_1(uint16_t* target, const float* src0, ++- uint32_t num_points) +++volk_32f_index_max_16u_a_sse4_1(uint16_t* target, const float* src0, uint32_t num_points) ++ { ++- num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points; ++- ++- uint32_t number = 0; ++- const uint32_t quarterPoints = num_points / 4; +++ num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points; ++ ++- float* inputPtr = (float*)src0; +++ uint32_t number = 0; +++ const uint32_t quarterPoints = num_points / 4; ++ ++- __m128 indexIncrementValues = _mm_set1_ps(4); ++- __m128 currentIndexes = _mm_set_ps(-1,-2,-3,-4); +++ float* inputPtr = (float*)src0; ++ ++- float max = src0[0]; ++- float index = 0; ++- __m128 maxValues = _mm_set1_ps(max); ++- __m128 maxValuesIndex = _mm_setzero_ps(); ++- __m128 compareResults; ++- __m128 currentValues; +++ __m128 indexIncrementValues = _mm_set1_ps(4); +++ __m128 currentIndexes = _mm_set_ps(-1, -2, -3, -4); ++ ++- __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4]; ++- __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4]; +++ float max = src0[0]; +++ float index = 0; +++ __m128 maxValues = _mm_set1_ps(max); +++ __m128 maxValuesIndex = _mm_setzero_ps(); +++ __m128 compareResults; +++ __m128 currentValues; ++ ++- for(;number < quarterPoints; number++){ +++ __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4]; +++ __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4]; ++ ++- currentValues = _mm_load_ps(inputPtr); inputPtr += 4; ++- currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues); +++ for (; number < quarterPoints; number++) { ++ ++- compareResults = _mm_cmpgt_ps(currentValues, maxValues); +++ currentValues = _mm_load_ps(inputPtr); +++ inputPtr += 4; +++ currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues); ++ ++- maxValuesIndex = _mm_blendv_ps(maxValuesIndex, currentIndexes, compareResults); ++- maxValues = _mm_blendv_ps(maxValues, currentValues, compareResults); ++- } +++ compareResults = _mm_cmpgt_ps(currentValues, maxValues); ++ ++- // Calculate the largest value from the remaining 4 points ++- _mm_store_ps(maxValuesBuffer, maxValues); ++- _mm_store_ps(maxIndexesBuffer, maxValuesIndex); +++ maxValuesIndex = 
_mm_blendv_ps(maxValuesIndex, currentIndexes, compareResults); +++ maxValues = _mm_blendv_ps(maxValues, currentValues, compareResults); +++ } ++ ++- for(number = 0; number < 4; number++){ ++- if(maxValuesBuffer[number] > max){ ++- index = maxIndexesBuffer[number]; ++- max = maxValuesBuffer[number]; ++- } else if(maxValuesBuffer[number] == max){ ++- if (index > maxIndexesBuffer[number]) ++- index = maxIndexesBuffer[number]; +++ // Calculate the largest value from the remaining 4 points +++ _mm_store_ps(maxValuesBuffer, maxValues); +++ _mm_store_ps(maxIndexesBuffer, maxValuesIndex); +++ +++ for (number = 0; number < 4; number++) { +++ if (maxValuesBuffer[number] > max) { +++ index = maxIndexesBuffer[number]; +++ max = maxValuesBuffer[number]; +++ } else if (maxValuesBuffer[number] == max) { +++ if (index > maxIndexesBuffer[number]) +++ index = maxIndexesBuffer[number]; +++ } ++ } ++- } ++ ++- number = quarterPoints * 4; ++- for(;number < num_points; number++){ ++- if(src0[number] > max){ ++- index = number; ++- max = src0[number]; +++ number = quarterPoints * 4; +++ for (; number < num_points; number++) { +++ if (src0[number] > max) { +++ index = number; +++ max = src0[number]; +++ } ++ } ++- } ++- target[0] = (uint16_t)index; +++ target[0] = (uint16_t)index; ++ } ++ ++ #endif /*LV_HAVE_SSE4_1*/ ++@@ -211,64 +210,64 @@ volk_32f_index_max_16u_a_sse4_1(uint16_t* target, const float* src0, ++ #include ++ ++ static inline void ++-volk_32f_index_max_16u_a_sse(uint16_t* target, const float* src0, ++- uint32_t num_points) +++volk_32f_index_max_16u_a_sse(uint16_t* target, const float* src0, uint32_t num_points) ++ { ++- num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points; ++- ++- uint32_t number = 0; ++- const uint32_t quarterPoints = num_points / 4; +++ num_points = (num_points > USHRT_MAX) ? 
USHRT_MAX : num_points; ++ ++- float* inputPtr = (float*)src0; +++ uint32_t number = 0; +++ const uint32_t quarterPoints = num_points / 4; ++ ++- __m128 indexIncrementValues = _mm_set1_ps(4); ++- __m128 currentIndexes = _mm_set_ps(-1,-2,-3,-4); +++ float* inputPtr = (float*)src0; ++ ++- float max = src0[0]; ++- float index = 0; ++- __m128 maxValues = _mm_set1_ps(max); ++- __m128 maxValuesIndex = _mm_setzero_ps(); ++- __m128 compareResults; ++- __m128 currentValues; +++ __m128 indexIncrementValues = _mm_set1_ps(4); +++ __m128 currentIndexes = _mm_set_ps(-1, -2, -3, -4); ++ ++- __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4]; ++- __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4]; +++ float max = src0[0]; +++ float index = 0; +++ __m128 maxValues = _mm_set1_ps(max); +++ __m128 maxValuesIndex = _mm_setzero_ps(); +++ __m128 compareResults; +++ __m128 currentValues; ++ ++- for(;number < quarterPoints; number++){ +++ __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4]; +++ __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4]; ++ ++- currentValues = _mm_load_ps(inputPtr); inputPtr += 4; ++- currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues); +++ for (; number < quarterPoints; number++) { ++ ++- compareResults = _mm_cmpgt_ps(currentValues, maxValues); +++ currentValues = _mm_load_ps(inputPtr); +++ inputPtr += 4; +++ currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues); ++ ++- maxValuesIndex = _mm_or_ps(_mm_and_ps(compareResults, currentIndexes), ++- _mm_andnot_ps(compareResults, maxValuesIndex)); ++- maxValues = _mm_or_ps(_mm_and_ps(compareResults, currentValues), ++- _mm_andnot_ps(compareResults, maxValues)); ++- } +++ compareResults = _mm_cmpgt_ps(currentValues, maxValues); ++ ++- // Calculate the largest value from the remaining 4 points ++- _mm_store_ps(maxValuesBuffer, maxValues); ++- _mm_store_ps(maxIndexesBuffer, maxValuesIndex); +++ maxValuesIndex = _mm_or_ps(_mm_and_ps(compareResults, currentIndexes), +++ _mm_andnot_ps(compareResults, maxValuesIndex)); +++ maxValues = _mm_or_ps(_mm_and_ps(compareResults, currentValues), +++ _mm_andnot_ps(compareResults, maxValues)); +++ } ++ ++- for(number = 0; number < 4; number++){ ++- if(maxValuesBuffer[number] > max){ ++- index = maxIndexesBuffer[number]; ++- max = maxValuesBuffer[number]; ++- } else if(maxValuesBuffer[number] == max){ ++- if (index > maxIndexesBuffer[number]) ++- index = maxIndexesBuffer[number]; +++ // Calculate the largest value from the remaining 4 points +++ _mm_store_ps(maxValuesBuffer, maxValues); +++ _mm_store_ps(maxIndexesBuffer, maxValuesIndex); +++ +++ for (number = 0; number < 4; number++) { +++ if (maxValuesBuffer[number] > max) { +++ index = maxIndexesBuffer[number]; +++ max = maxValuesBuffer[number]; +++ } else if (maxValuesBuffer[number] == max) { +++ if (index > maxIndexesBuffer[number]) +++ index = maxIndexesBuffer[number]; +++ } ++ } ++- } ++ ++- number = quarterPoints * 4; ++- for(;number < num_points; number++){ ++- if(src0[number] > max){ ++- index = number; ++- max = src0[number]; +++ number = quarterPoints * 4; +++ for (; number < num_points; number++) { +++ if (src0[number] > max) { +++ index = number; +++ max = src0[number]; +++ } ++ } ++- } ++- target[0] = (uint16_t)index; +++ target[0] = (uint16_t)index; ++ } ++ ++ #endif /*LV_HAVE_SSE*/ ++@@ -277,23 +276,22 @@ volk_32f_index_max_16u_a_sse(uint16_t* target, const float* src0, ++ #ifdef LV_HAVE_GENERIC ++ ++ static inline void ++-volk_32f_index_max_16u_generic(uint16_t* target, const float* src0, ++- uint32_t num_points) 
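Two details of these index-max kernels worth spelling out: the 16u variants clamp num_points to USHRT_MAX up front because the winning index is returned through a uint16_t, and the horizontal pass after the vector loop breaks ties toward the smaller index, which keeps the result consistent with the generic kernel's first-maximum behaviour. A scalar sketch of that horizontal step (the function name and signature are illustrative, not upstream API):

static inline void reduce_max_lanes_sketch(const float* lane_vals,
                                           const float* lane_idx,
                                           int width,
                                           float* max,
                                           float* index)
{
    int lane;
    for (lane = 0; lane < width; lane++) {
        if (lane_vals[lane] > *max) {
            *max = lane_vals[lane];   /* strictly greater: later duplicates never win */
            *index = lane_idx[lane];
        } else if (lane_vals[lane] == *max && lane_idx[lane] < *index) {
            *index = lane_idx[lane];  /* tie: keep the earliest position */
        }
    }
}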
+++volk_32f_index_max_16u_generic(uint16_t* target, const float* src0, uint32_t num_points) ++ { ++- num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points; +++ num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points; ++ ++- float max = src0[0]; ++- uint16_t index = 0; +++ float max = src0[0]; +++ uint16_t index = 0; ++ ++- uint32_t i = 1; +++ uint32_t i = 1; ++ ++- for(; i < num_points; ++i) { ++- if(src0[i] > max) { ++- index = i; ++- max = src0[i]; +++ for (; i < num_points; ++i) { +++ if (src0[i] > max) { +++ index = i; +++ max = src0[i]; +++ } ++ } ++- } ++- target[0] = index; +++ target[0] = index; ++ } ++ ++ #endif /*LV_HAVE_GENERIC*/ ++@@ -302,76 +300,74 @@ volk_32f_index_max_16u_generic(uint16_t* target, const float* src0, ++ #endif /*INCLUDED_volk_32f_index_max_16u_a_H*/ ++ ++ ++- ++ #ifndef INCLUDED_volk_32f_index_max_16u_u_H ++ #define INCLUDED_volk_32f_index_max_16u_u_H ++ ++-#include ++-#include ++ #include ++ #include ++ #include +++#include ++ ++ #ifdef LV_HAVE_AVX ++ #include ++ ++ static inline void ++-volk_32f_index_max_16u_u_avx(uint16_t* target, const float* src0, ++- uint32_t num_points) +++volk_32f_index_max_16u_u_avx(uint16_t* target, const float* src0, uint32_t num_points) ++ { ++- num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points; ++- ++- uint32_t number = 0; ++- const uint32_t eighthPoints = num_points / 8; +++ num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points; ++ ++- float* inputPtr = (float*)src0; +++ uint32_t number = 0; +++ const uint32_t eighthPoints = num_points / 8; ++ ++- __m256 indexIncrementValues = _mm256_set1_ps(8); ++- __m256 currentIndexes = _mm256_set_ps(-1,-2,-3,-4,-5,-6,-7,-8); +++ float* inputPtr = (float*)src0; ++ ++- float max = src0[0]; ++- float index = 0; ++- __m256 maxValues = _mm256_set1_ps(max); ++- __m256 maxValuesIndex = _mm256_setzero_ps(); ++- __m256 compareResults; ++- __m256 currentValues; +++ __m256 indexIncrementValues = _mm256_set1_ps(8); +++ __m256 currentIndexes = _mm256_set_ps(-1, -2, -3, -4, -5, -6, -7, -8); ++ ++- __VOLK_ATTR_ALIGNED(32) float maxValuesBuffer[8]; ++- __VOLK_ATTR_ALIGNED(32) float maxIndexesBuffer[8]; +++ float max = src0[0]; +++ float index = 0; +++ __m256 maxValues = _mm256_set1_ps(max); +++ __m256 maxValuesIndex = _mm256_setzero_ps(); +++ __m256 compareResults; +++ __m256 currentValues; ++ ++- for(;number < eighthPoints; number++){ +++ __VOLK_ATTR_ALIGNED(32) float maxValuesBuffer[8]; +++ __VOLK_ATTR_ALIGNED(32) float maxIndexesBuffer[8]; ++ ++- currentValues = _mm256_loadu_ps(inputPtr); inputPtr += 8; ++- currentIndexes = _mm256_add_ps(currentIndexes, indexIncrementValues); +++ for (; number < eighthPoints; number++) { ++ ++- compareResults = _mm256_cmp_ps(currentValues, maxValues, _CMP_GT_OS); +++ currentValues = _mm256_loadu_ps(inputPtr); +++ inputPtr += 8; +++ currentIndexes = _mm256_add_ps(currentIndexes, indexIncrementValues); ++ ++- maxValuesIndex = _mm256_blendv_ps(maxValuesIndex, currentIndexes, compareResults); ++- maxValues = _mm256_blendv_ps(maxValues, currentValues, compareResults); ++- } +++ compareResults = _mm256_cmp_ps(currentValues, maxValues, _CMP_GT_OS); ++ ++- // Calculate the largest value from the remaining 4 points ++- _mm256_storeu_ps(maxValuesBuffer, maxValues); ++- _mm256_storeu_ps(maxIndexesBuffer, maxValuesIndex); +++ maxValuesIndex = _mm256_blendv_ps(maxValuesIndex, currentIndexes, compareResults); +++ maxValues = _mm256_blendv_ps(maxValues, currentValues, compareResults); +++ } ++ ++- for(number = 0; number < 8; number++){ ++- 
if(maxValuesBuffer[number] > max){ ++- index = maxIndexesBuffer[number]; ++- max = maxValuesBuffer[number]; ++- } else if(maxValuesBuffer[number] == max){ ++- if (index > maxIndexesBuffer[number]) ++- index = maxIndexesBuffer[number]; +++ // Calculate the largest value from the remaining 4 points +++ _mm256_storeu_ps(maxValuesBuffer, maxValues); +++ _mm256_storeu_ps(maxIndexesBuffer, maxValuesIndex); +++ +++ for (number = 0; number < 8; number++) { +++ if (maxValuesBuffer[number] > max) { +++ index = maxIndexesBuffer[number]; +++ max = maxValuesBuffer[number]; +++ } else if (maxValuesBuffer[number] == max) { +++ if (index > maxIndexesBuffer[number]) +++ index = maxIndexesBuffer[number]; +++ } ++ } ++- } ++ ++- number = eighthPoints * 8; ++- for(;number < num_points; number++){ ++- if(src0[number] > max){ ++- index = number; ++- max = src0[number]; +++ number = eighthPoints * 8; +++ for (; number < num_points; number++) { +++ if (src0[number] > max) { +++ index = number; +++ max = src0[number]; +++ } ++ } ++- } ++- target[0] = (uint16_t)index; +++ target[0] = (uint16_t)index; ++ } ++ ++ #endif /*LV_HAVE_AVX*/ ++diff --git a/kernels/volk/volk_32f_index_max_32u.h b/kernels/volk/volk_32f_index_max_32u.h ++index 318c8e4..315531d 100644 ++--- a/kernels/volk/volk_32f_index_max_32u.h +++++ b/kernels/volk/volk_32f_index_max_32u.h ++@@ -25,7 +25,8 @@ ++ * ++ * \b Overview ++ * ++- * Returns Argmax_i x[i]. Finds and returns the index which contains the first maximum value in the given vector. +++ * Returns Argmax_i x[i]. Finds and returns the index which contains the first maximum +++ * value in the given vector. ++ * ++ * Dispatcher Prototype ++ * \code ++@@ -64,70 +65,71 @@ ++ #ifndef INCLUDED_volk_32f_index_max_32u_a_H ++ #define INCLUDED_volk_32f_index_max_32u_a_H ++ ++-#include ++-#include ++ #include ++ #include +++#include ++ ++ #ifdef LV_HAVE_SSE4_1 ++-#include +++#include ++ ++ static inline void ++ volk_32f_index_max_32u_a_sse4_1(uint32_t* target, const float* src0, uint32_t num_points) ++ { ++- if(num_points > 0){ ++- uint32_t number = 0; ++- const uint32_t quarterPoints = num_points / 4; +++ if (num_points > 0) { +++ uint32_t number = 0; +++ const uint32_t quarterPoints = num_points / 4; ++ ++- float* inputPtr = (float*)src0; +++ float* inputPtr = (float*)src0; ++ ++- __m128 indexIncrementValues = _mm_set1_ps(4); ++- __m128 currentIndexes = _mm_set_ps(-1,-2,-3,-4); +++ __m128 indexIncrementValues = _mm_set1_ps(4); +++ __m128 currentIndexes = _mm_set_ps(-1, -2, -3, -4); ++ ++- float max = src0[0]; ++- float index = 0; ++- __m128 maxValues = _mm_set1_ps(max); ++- __m128 maxValuesIndex = _mm_setzero_ps(); ++- __m128 compareResults; ++- __m128 currentValues; +++ float max = src0[0]; +++ float index = 0; +++ __m128 maxValues = _mm_set1_ps(max); +++ __m128 maxValuesIndex = _mm_setzero_ps(); +++ __m128 compareResults; +++ __m128 currentValues; ++ ++- __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4]; ++- __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4]; +++ __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4]; +++ __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4]; ++ ++- for(;number < quarterPoints; number++){ +++ for (; number < quarterPoints; number++) { ++ ++- currentValues = _mm_load_ps(inputPtr); inputPtr += 4; ++- currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues); +++ currentValues = _mm_load_ps(inputPtr); +++ inputPtr += 4; +++ currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues); ++ ++- compareResults = _mm_cmpgt_ps(currentValues, maxValues); +++ compareResults = 
_mm_cmpgt_ps(currentValues, maxValues); ++ ++- maxValuesIndex = _mm_blendv_ps(maxValuesIndex, currentIndexes, compareResults); ++- maxValues = _mm_blendv_ps(maxValues, currentValues, compareResults); ++- } +++ maxValuesIndex = +++ _mm_blendv_ps(maxValuesIndex, currentIndexes, compareResults); +++ maxValues = _mm_blendv_ps(maxValues, currentValues, compareResults); +++ } ++ ++- // Calculate the largest value from the remaining 4 points ++- _mm_store_ps(maxValuesBuffer, maxValues); ++- _mm_store_ps(maxIndexesBuffer, maxValuesIndex); ++- ++- for(number = 0; number < 4; number++){ ++- if(maxValuesBuffer[number] > max){ ++- index = maxIndexesBuffer[number]; ++- max = maxValuesBuffer[number]; ++- } else if(maxValuesBuffer[number] == max){ ++- if (index > maxIndexesBuffer[number]) ++- index = maxIndexesBuffer[number]; ++- } ++- } +++ // Calculate the largest value from the remaining 4 points +++ _mm_store_ps(maxValuesBuffer, maxValues); +++ _mm_store_ps(maxIndexesBuffer, maxValuesIndex); +++ +++ for (number = 0; number < 4; number++) { +++ if (maxValuesBuffer[number] > max) { +++ index = maxIndexesBuffer[number]; +++ max = maxValuesBuffer[number]; +++ } else if (maxValuesBuffer[number] == max) { +++ if (index > maxIndexesBuffer[number]) +++ index = maxIndexesBuffer[number]; +++ } +++ } ++ ++- number = quarterPoints * 4; ++- for(;number < num_points; number++){ ++- if(src0[number] > max){ ++- index = number; ++- max = src0[number]; ++- } +++ number = quarterPoints * 4; +++ for (; number < num_points; number++) { +++ if (src0[number] > max) { +++ index = number; +++ max = src0[number]; +++ } +++ } +++ target[0] = (uint32_t)index; ++ } ++- target[0] = (uint32_t)index; ++- } ++ } ++ ++ #endif /*LV_HAVE_SSE4_1*/ ++@@ -135,67 +137,68 @@ volk_32f_index_max_32u_a_sse4_1(uint32_t* target, const float* src0, uint32_t nu ++ ++ #ifdef LV_HAVE_SSE ++ ++-#include +++#include ++ ++ static inline void ++ volk_32f_index_max_32u_a_sse(uint32_t* target, const float* src0, uint32_t num_points) ++ { ++- if(num_points > 0){ ++- uint32_t number = 0; ++- const uint32_t quarterPoints = num_points / 4; +++ if (num_points > 0) { +++ uint32_t number = 0; +++ const uint32_t quarterPoints = num_points / 4; ++ ++- float* inputPtr = (float*)src0; +++ float* inputPtr = (float*)src0; ++ ++- __m128 indexIncrementValues = _mm_set1_ps(4); ++- __m128 currentIndexes = _mm_set_ps(-1,-2,-3,-4); +++ __m128 indexIncrementValues = _mm_set1_ps(4); +++ __m128 currentIndexes = _mm_set_ps(-1, -2, -3, -4); ++ ++- float max = src0[0]; ++- float index = 0; ++- __m128 maxValues = _mm_set1_ps(max); ++- __m128 maxValuesIndex = _mm_setzero_ps(); ++- __m128 compareResults; ++- __m128 currentValues; +++ float max = src0[0]; +++ float index = 0; +++ __m128 maxValues = _mm_set1_ps(max); +++ __m128 maxValuesIndex = _mm_setzero_ps(); +++ __m128 compareResults; +++ __m128 currentValues; ++ ++- __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4]; ++- __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4]; +++ __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4]; +++ __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4]; ++ ++- for(;number < quarterPoints; number++){ +++ for (; number < quarterPoints; number++) { ++ ++- currentValues = _mm_load_ps(inputPtr); inputPtr += 4; ++- currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues); +++ currentValues = _mm_load_ps(inputPtr); +++ inputPtr += 4; +++ currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues); ++ ++- compareResults = _mm_cmpgt_ps(currentValues, maxValues); +++ compareResults = 
_mm_cmpgt_ps(currentValues, maxValues); ++ ++- maxValuesIndex = _mm_or_ps(_mm_and_ps(compareResults, currentIndexes), ++- _mm_andnot_ps(compareResults, maxValuesIndex)); +++ maxValuesIndex = _mm_or_ps(_mm_and_ps(compareResults, currentIndexes), +++ _mm_andnot_ps(compareResults, maxValuesIndex)); ++ ++- maxValues = _mm_or_ps(_mm_and_ps(compareResults, currentValues), ++- _mm_andnot_ps(compareResults, maxValues)); ++- } +++ maxValues = _mm_or_ps(_mm_and_ps(compareResults, currentValues), +++ _mm_andnot_ps(compareResults, maxValues)); +++ } ++ ++- // Calculate the largest value from the remaining 4 points ++- _mm_store_ps(maxValuesBuffer, maxValues); ++- _mm_store_ps(maxIndexesBuffer, maxValuesIndex); ++- ++- for(number = 0; number < 4; number++){ ++- if(maxValuesBuffer[number] > max){ ++- index = maxIndexesBuffer[number]; ++- max = maxValuesBuffer[number]; ++- } else if(maxValuesBuffer[number] == max){ ++- if (index > maxIndexesBuffer[number]) ++- index = maxIndexesBuffer[number]; ++- } ++- } +++ // Calculate the largest value from the remaining 4 points +++ _mm_store_ps(maxValuesBuffer, maxValues); +++ _mm_store_ps(maxIndexesBuffer, maxValuesIndex); +++ +++ for (number = 0; number < 4; number++) { +++ if (maxValuesBuffer[number] > max) { +++ index = maxIndexesBuffer[number]; +++ max = maxValuesBuffer[number]; +++ } else if (maxValuesBuffer[number] == max) { +++ if (index > maxIndexesBuffer[number]) +++ index = maxIndexesBuffer[number]; +++ } +++ } ++ ++- number = quarterPoints * 4; ++- for(;number < num_points; number++){ ++- if(src0[number] > max){ ++- index = number; ++- max = src0[number]; ++- } +++ number = quarterPoints * 4; +++ for (; number < num_points; number++) { +++ if (src0[number] > max) { +++ index = number; +++ max = src0[number]; +++ } +++ } +++ target[0] = (uint32_t)index; ++ } ++- target[0] = (uint32_t)index; ++- } ++ } ++ ++ #endif /*LV_HAVE_SSE*/ ++@@ -204,65 +207,61 @@ volk_32f_index_max_32u_a_sse(uint32_t* target, const float* src0, uint32_t num_p ++ #ifdef LV_HAVE_AVX ++ #include ++ ++-static inline void volk_32f_index_max_32u_a_avx(uint32_t* target, const float* src0, uint32_t num_points) +++static inline void +++volk_32f_index_max_32u_a_avx(uint32_t* target, const float* src0, uint32_t num_points) ++ { ++- if(num_points > 0) ++- { ++- uint32_t number = 0; ++- const uint32_t quarterPoints = num_points / 8; ++- ++- float* inputPtr = (float*)src0; ++- ++- __m256 indexIncrementValues = _mm256_set1_ps(8); ++- __m256 currentIndexes = _mm256_set_ps(-1,-2,-3,-4,-5,-6,-7,-8); ++- ++- float max = src0[0]; ++- float index = 0; ++- __m256 maxValues = _mm256_set1_ps(max); ++- __m256 maxValuesIndex = _mm256_setzero_ps(); ++- __m256 compareResults; ++- __m256 currentValues; ++- ++- __VOLK_ATTR_ALIGNED(32) float maxValuesBuffer[8]; ++- __VOLK_ATTR_ALIGNED(32) float maxIndexesBuffer[8]; ++- ++- for(;number < quarterPoints; number++) ++- { ++- currentValues = _mm256_load_ps(inputPtr); inputPtr += 8; ++- currentIndexes = _mm256_add_ps(currentIndexes, indexIncrementValues); ++- compareResults = _mm256_cmp_ps(currentValues, maxValues, _CMP_GT_OS); ++- maxValuesIndex = _mm256_blendv_ps(maxValuesIndex, currentIndexes, compareResults); ++- maxValues = _mm256_blendv_ps(maxValues, currentValues, compareResults); ++- } ++- ++- // Calculate the largest value from the remaining 8 points ++- _mm256_store_ps(maxValuesBuffer, maxValues); ++- _mm256_store_ps(maxIndexesBuffer, maxValuesIndex); ++- ++- for(number = 0; number < 8; number++) ++- { ++- if(maxValuesBuffer[number] > max) ++- { ++- index = 
maxIndexesBuffer[number]; ++- max = maxValuesBuffer[number]; ++- } ++- else if(maxValuesBuffer[number] == max){ ++- if (index > maxIndexesBuffer[number]) ++- index = maxIndexesBuffer[number]; ++- } ++- } ++- ++- number = quarterPoints * 8; ++- for(;number < num_points; number++) ++- { ++- if(src0[number] > max) ++- { ++- index = number; ++- max = src0[number]; ++- } ++- } ++- target[0] = (uint32_t)index; +++ if (num_points > 0) { +++ uint32_t number = 0; +++ const uint32_t quarterPoints = num_points / 8; +++ +++ float* inputPtr = (float*)src0; +++ +++ __m256 indexIncrementValues = _mm256_set1_ps(8); +++ __m256 currentIndexes = _mm256_set_ps(-1, -2, -3, -4, -5, -6, -7, -8); +++ +++ float max = src0[0]; +++ float index = 0; +++ __m256 maxValues = _mm256_set1_ps(max); +++ __m256 maxValuesIndex = _mm256_setzero_ps(); +++ __m256 compareResults; +++ __m256 currentValues; +++ +++ __VOLK_ATTR_ALIGNED(32) float maxValuesBuffer[8]; +++ __VOLK_ATTR_ALIGNED(32) float maxIndexesBuffer[8]; +++ +++ for (; number < quarterPoints; number++) { +++ currentValues = _mm256_load_ps(inputPtr); +++ inputPtr += 8; +++ currentIndexes = _mm256_add_ps(currentIndexes, indexIncrementValues); +++ compareResults = _mm256_cmp_ps(currentValues, maxValues, _CMP_GT_OS); +++ maxValuesIndex = +++ _mm256_blendv_ps(maxValuesIndex, currentIndexes, compareResults); +++ maxValues = _mm256_blendv_ps(maxValues, currentValues, compareResults); +++ } +++ +++ // Calculate the largest value from the remaining 8 points +++ _mm256_store_ps(maxValuesBuffer, maxValues); +++ _mm256_store_ps(maxIndexesBuffer, maxValuesIndex); +++ +++ for (number = 0; number < 8; number++) { +++ if (maxValuesBuffer[number] > max) { +++ index = maxIndexesBuffer[number]; +++ max = maxValuesBuffer[number]; +++ } else if (maxValuesBuffer[number] == max) { +++ if (index > maxIndexesBuffer[number]) +++ index = maxIndexesBuffer[number]; +++ } +++ } +++ +++ number = quarterPoints * 8; +++ for (; number < num_points; number++) { +++ if (src0[number] > max) { +++ index = number; +++ max = src0[number]; +++ } ++ } +++ target[0] = (uint32_t)index; +++ } ++ } ++ ++ #endif /*LV_HAVE_AVX*/ ++@@ -271,66 +270,63 @@ static inline void volk_32f_index_max_32u_a_avx(uint32_t* target, const float* s ++ #ifdef LV_HAVE_NEON ++ #include ++ ++-static inline void volk_32f_index_max_32u_neon(uint32_t* target, const float* src0, uint32_t num_points) +++static inline void +++volk_32f_index_max_32u_neon(uint32_t* target, const float* src0, uint32_t num_points) ++ { ++- if(num_points > 0) ++- { ++- uint32_t number = 0; ++- const uint32_t quarterPoints = num_points / 4; ++- ++- float* inputPtr = (float*)src0; ++- float32x4_t indexIncrementValues = vdupq_n_f32(4); ++- __VOLK_ATTR_ALIGNED(16) float currentIndexes_float[4] = { -4.0f, -3.0f, -2.0f, -1.0f }; ++- float32x4_t currentIndexes = vld1q_f32(currentIndexes_float); ++- ++- float max = src0[0]; ++- float index = 0; ++- float32x4_t maxValues = vdupq_n_f32(max); ++- uint32x4_t maxValuesIndex = vmovq_n_u32(0); ++- uint32x4_t compareResults; ++- uint32x4_t currentIndexes_u; ++- float32x4_t currentValues; ++- ++- __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4]; ++- __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4]; ++- ++- for(;number < quarterPoints; number++) ++- { ++- currentValues = vld1q_f32(inputPtr); inputPtr += 4; ++- currentIndexes = vaddq_f32(currentIndexes, indexIncrementValues); ++- currentIndexes_u = vcvtq_u32_f32(currentIndexes); ++- compareResults = vcleq_f32(currentValues, maxValues); ++- maxValuesIndex = vorrq_u32( vandq_u32( 
compareResults, maxValuesIndex ), vbicq_u32(currentIndexes_u, compareResults) ); ++- maxValues = vmaxq_f32(currentValues, maxValues); ++- } ++- ++- // Calculate the largest value from the remaining 4 points ++- vst1q_f32(maxValuesBuffer, maxValues); ++- vst1q_f32(maxIndexesBuffer, vcvtq_f32_u32(maxValuesIndex)); ++- for(number = 0; number < 4; number++) ++- { ++- if(maxValuesBuffer[number] > max) ++- { ++- index = maxIndexesBuffer[number]; ++- max = maxValuesBuffer[number]; ++- } ++- else if(maxValues[number] == max){ ++- if (index > maxIndexesBuffer[number]) ++- index = maxIndexesBuffer[number]; ++- } ++- } ++- ++- number = quarterPoints * 4; ++- for(;number < num_points; number++) ++- { ++- if(src0[number] > max) ++- { ++- index = number; ++- max = src0[number]; ++- } ++- } ++- target[0] = (uint32_t)index; +++ if (num_points > 0) { +++ uint32_t number = 0; +++ const uint32_t quarterPoints = num_points / 4; +++ +++ float* inputPtr = (float*)src0; +++ float32x4_t indexIncrementValues = vdupq_n_f32(4); +++ __VOLK_ATTR_ALIGNED(16) +++ float currentIndexes_float[4] = { -4.0f, -3.0f, -2.0f, -1.0f }; +++ float32x4_t currentIndexes = vld1q_f32(currentIndexes_float); +++ +++ float max = src0[0]; +++ float index = 0; +++ float32x4_t maxValues = vdupq_n_f32(max); +++ uint32x4_t maxValuesIndex = vmovq_n_u32(0); +++ uint32x4_t compareResults; +++ uint32x4_t currentIndexes_u; +++ float32x4_t currentValues; +++ +++ __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4]; +++ __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4]; +++ +++ for (; number < quarterPoints; number++) { +++ currentValues = vld1q_f32(inputPtr); +++ inputPtr += 4; +++ currentIndexes = vaddq_f32(currentIndexes, indexIncrementValues); +++ currentIndexes_u = vcvtq_u32_f32(currentIndexes); +++ compareResults = vcleq_f32(currentValues, maxValues); +++ maxValuesIndex = vorrq_u32(vandq_u32(compareResults, maxValuesIndex), +++ vbicq_u32(currentIndexes_u, compareResults)); +++ maxValues = vmaxq_f32(currentValues, maxValues); +++ } +++ +++ // Calculate the largest value from the remaining 4 points +++ vst1q_f32(maxValuesBuffer, maxValues); +++ vst1q_f32(maxIndexesBuffer, vcvtq_f32_u32(maxValuesIndex)); +++ for (number = 0; number < 4; number++) { +++ if (maxValuesBuffer[number] > max) { +++ index = maxIndexesBuffer[number]; +++ max = maxValuesBuffer[number]; +++ } else if (maxValues[number] == max) { +++ if (index > maxIndexesBuffer[number]) +++ index = maxIndexesBuffer[number]; +++ } +++ } +++ +++ number = quarterPoints * 4; +++ for (; number < num_points; number++) { +++ if (src0[number] > max) { +++ index = number; +++ max = src0[number]; +++ } ++ } +++ target[0] = (uint32_t)index; +++ } ++ } ++ ++ #endif /*LV_HAVE_NEON*/ ++@@ -341,20 +337,20 @@ static inline void volk_32f_index_max_32u_neon(uint32_t* target, const float* sr ++ static inline void ++ volk_32f_index_max_32u_generic(uint32_t* target, const float* src0, uint32_t num_points) ++ { ++- if(num_points > 0){ ++- float max = src0[0]; ++- uint32_t index = 0; +++ if (num_points > 0) { +++ float max = src0[0]; +++ uint32_t index = 0; ++ ++- uint32_t i = 1; +++ uint32_t i = 1; ++ ++- for(; i < num_points; ++i) { ++- if(src0[i] > max){ ++- index = i; ++- max = src0[i]; ++- } +++ for (; i < num_points; ++i) { +++ if (src0[i] > max) { +++ index = i; +++ max = src0[i]; +++ } +++ } +++ target[0] = index; ++ } ++- target[0] = index; ++- } ++ } ++ ++ #endif /*LV_HAVE_GENERIC*/ ++@@ -366,209 +362,195 @@ volk_32f_index_max_32u_generic(uint32_t* target, const float* src0, uint32_t num ++ #ifndef 
INCLUDED_volk_32f_index_max_32u_u_H ++ #define INCLUDED_volk_32f_index_max_32u_u_H ++ ++-#include ++-#include ++ #include ++ #include +++#include ++ ++ ++ #ifdef LV_HAVE_AVX ++ #include ++ ++-static inline void volk_32f_index_max_32u_u_avx(uint32_t* target, const float* src0, uint32_t num_points) +++static inline void +++volk_32f_index_max_32u_u_avx(uint32_t* target, const float* src0, uint32_t num_points) ++ { ++- if(num_points > 0) ++- { ++- uint32_t number = 0; ++- const uint32_t quarterPoints = num_points / 8; ++- ++- float* inputPtr = (float*)src0; ++- ++- __m256 indexIncrementValues = _mm256_set1_ps(8); ++- __m256 currentIndexes = _mm256_set_ps(-1,-2,-3,-4,-5,-6,-7,-8); ++- ++- float max = src0[0]; ++- float index = 0; ++- __m256 maxValues = _mm256_set1_ps(max); ++- __m256 maxValuesIndex = _mm256_setzero_ps(); ++- __m256 compareResults; ++- __m256 currentValues; ++- ++- __VOLK_ATTR_ALIGNED(32) float maxValuesBuffer[8]; ++- __VOLK_ATTR_ALIGNED(32) float maxIndexesBuffer[8]; ++- ++- for(;number < quarterPoints; number++) ++- { ++- currentValues = _mm256_loadu_ps(inputPtr); inputPtr += 8; ++- currentIndexes = _mm256_add_ps(currentIndexes, indexIncrementValues); ++- compareResults = _mm256_cmp_ps(currentValues, maxValues, _CMP_GT_OS); ++- maxValuesIndex = _mm256_blendv_ps(maxValuesIndex, currentIndexes, compareResults); ++- maxValues = _mm256_blendv_ps(maxValues, currentValues, compareResults); ++- } ++- ++- // Calculate the largest value from the remaining 8 points ++- _mm256_store_ps(maxValuesBuffer, maxValues); ++- _mm256_store_ps(maxIndexesBuffer, maxValuesIndex); ++- ++- for(number = 0; number < 8; number++) ++- { ++- if(maxValuesBuffer[number] > max) ++- { ++- index = maxIndexesBuffer[number]; ++- max = maxValuesBuffer[number]; ++- } ++- else if(maxValuesBuffer[number] == max){ ++- if (index > maxIndexesBuffer[number]) ++- index = maxIndexesBuffer[number]; ++- } ++- } ++- ++- number = quarterPoints * 8; ++- for(;number < num_points; number++) ++- { ++- if(src0[number] > max) ++- { ++- index = number; ++- max = src0[number]; ++- } ++- } ++- target[0] = (uint32_t)index; +++ if (num_points > 0) { +++ uint32_t number = 0; +++ const uint32_t quarterPoints = num_points / 8; +++ +++ float* inputPtr = (float*)src0; +++ +++ __m256 indexIncrementValues = _mm256_set1_ps(8); +++ __m256 currentIndexes = _mm256_set_ps(-1, -2, -3, -4, -5, -6, -7, -8); +++ +++ float max = src0[0]; +++ float index = 0; +++ __m256 maxValues = _mm256_set1_ps(max); +++ __m256 maxValuesIndex = _mm256_setzero_ps(); +++ __m256 compareResults; +++ __m256 currentValues; +++ +++ __VOLK_ATTR_ALIGNED(32) float maxValuesBuffer[8]; +++ __VOLK_ATTR_ALIGNED(32) float maxIndexesBuffer[8]; +++ +++ for (; number < quarterPoints; number++) { +++ currentValues = _mm256_loadu_ps(inputPtr); +++ inputPtr += 8; +++ currentIndexes = _mm256_add_ps(currentIndexes, indexIncrementValues); +++ compareResults = _mm256_cmp_ps(currentValues, maxValues, _CMP_GT_OS); +++ maxValuesIndex = +++ _mm256_blendv_ps(maxValuesIndex, currentIndexes, compareResults); +++ maxValues = _mm256_blendv_ps(maxValues, currentValues, compareResults); ++ } +++ +++ // Calculate the largest value from the remaining 8 points +++ _mm256_store_ps(maxValuesBuffer, maxValues); +++ _mm256_store_ps(maxIndexesBuffer, maxValuesIndex); +++ +++ for (number = 0; number < 8; number++) { +++ if (maxValuesBuffer[number] > max) { +++ index = maxIndexesBuffer[number]; +++ max = maxValuesBuffer[number]; +++ } else if (maxValuesBuffer[number] == max) { +++ if (index > 
maxIndexesBuffer[number]) +++ index = maxIndexesBuffer[number]; +++ } +++ } +++ +++ number = quarterPoints * 8; +++ for (; number < num_points; number++) { +++ if (src0[number] > max) { +++ index = number; +++ max = src0[number]; +++ } +++ } +++ target[0] = (uint32_t)index; +++ } ++ } ++ ++ #endif /*LV_HAVE_AVX*/ ++ ++ ++ #ifdef LV_HAVE_SSE4_1 ++-#include +++#include ++ ++-static inline void volk_32f_index_max_32u_u_sse4_1(uint32_t* target, const float* src0, uint32_t num_points) +++static inline void +++volk_32f_index_max_32u_u_sse4_1(uint32_t* target, const float* src0, uint32_t num_points) ++ { ++- if(num_points > 0) ++- { ++- uint32_t number = 0; ++- const uint32_t quarterPoints = num_points / 4; ++- ++- float* inputPtr = (float*)src0; ++- ++- __m128 indexIncrementValues = _mm_set1_ps(4); ++- __m128 currentIndexes = _mm_set_ps(-1,-2,-3,-4); ++- ++- float max = src0[0]; ++- float index = 0; ++- __m128 maxValues = _mm_set1_ps(max); ++- __m128 maxValuesIndex = _mm_setzero_ps(); ++- __m128 compareResults; ++- __m128 currentValues; ++- ++- __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4]; ++- __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4]; ++- ++- for(;number < quarterPoints; number++) ++- { ++- currentValues = _mm_loadu_ps(inputPtr); inputPtr += 4; ++- currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues); ++- compareResults = _mm_cmpgt_ps(currentValues, maxValues); ++- maxValuesIndex = _mm_blendv_ps(maxValuesIndex, currentIndexes, compareResults); ++- maxValues = _mm_blendv_ps(maxValues, currentValues, compareResults); ++- } ++- ++- // Calculate the largest value from the remaining 4 points ++- _mm_store_ps(maxValuesBuffer, maxValues); ++- _mm_store_ps(maxIndexesBuffer, maxValuesIndex); ++- ++- for(number = 0; number < 4; number++) ++- { ++- if(maxValuesBuffer[number] > max) ++- { ++- index = maxIndexesBuffer[number]; ++- max = maxValuesBuffer[number]; ++- } ++- else if(maxValuesBuffer[number] == max){ ++- if (index > maxIndexesBuffer[number]) ++- index = maxIndexesBuffer[number]; ++- } ++- } ++- ++- number = quarterPoints * 4; ++- for(;number < num_points; number++) ++- { ++- if(src0[number] > max) ++- { ++- index = number; ++- max = src0[number]; ++- } ++- } ++- target[0] = (uint32_t)index; +++ if (num_points > 0) { +++ uint32_t number = 0; +++ const uint32_t quarterPoints = num_points / 4; +++ +++ float* inputPtr = (float*)src0; +++ +++ __m128 indexIncrementValues = _mm_set1_ps(4); +++ __m128 currentIndexes = _mm_set_ps(-1, -2, -3, -4); +++ +++ float max = src0[0]; +++ float index = 0; +++ __m128 maxValues = _mm_set1_ps(max); +++ __m128 maxValuesIndex = _mm_setzero_ps(); +++ __m128 compareResults; +++ __m128 currentValues; +++ +++ __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4]; +++ __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4]; +++ +++ for (; number < quarterPoints; number++) { +++ currentValues = _mm_loadu_ps(inputPtr); +++ inputPtr += 4; +++ currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues); +++ compareResults = _mm_cmpgt_ps(currentValues, maxValues); +++ maxValuesIndex = +++ _mm_blendv_ps(maxValuesIndex, currentIndexes, compareResults); +++ maxValues = _mm_blendv_ps(maxValues, currentValues, compareResults); ++ } +++ +++ // Calculate the largest value from the remaining 4 points +++ _mm_store_ps(maxValuesBuffer, maxValues); +++ _mm_store_ps(maxIndexesBuffer, maxValuesIndex); +++ +++ for (number = 0; number < 4; number++) { +++ if (maxValuesBuffer[number] > max) { +++ index = maxIndexesBuffer[number]; +++ max = maxValuesBuffer[number]; +++ } else 
if (maxValuesBuffer[number] == max) { +++ if (index > maxIndexesBuffer[number]) +++ index = maxIndexesBuffer[number]; +++ } +++ } +++ +++ number = quarterPoints * 4; +++ for (; number < num_points; number++) { +++ if (src0[number] > max) { +++ index = number; +++ max = src0[number]; +++ } +++ } +++ target[0] = (uint32_t)index; +++ } ++ } ++ ++ #endif /*LV_HAVE_SSE4_1*/ ++ ++ #ifdef LV_HAVE_SSE ++-#include +++#include ++ ++-static inline void volk_32f_index_max_32u_u_sse(uint32_t* target, const float* src0, uint32_t num_points) +++static inline void +++volk_32f_index_max_32u_u_sse(uint32_t* target, const float* src0, uint32_t num_points) ++ { ++- if(num_points > 0) ++- { ++- uint32_t number = 0; ++- const uint32_t quarterPoints = num_points / 4; ++- ++- float* inputPtr = (float*)src0; ++- ++- __m128 indexIncrementValues = _mm_set1_ps(4); ++- __m128 currentIndexes = _mm_set_ps(-1,-2,-3,-4); ++- ++- float max = src0[0]; ++- float index = 0; ++- __m128 maxValues = _mm_set1_ps(max); ++- __m128 maxValuesIndex = _mm_setzero_ps(); ++- __m128 compareResults; ++- __m128 currentValues; ++- ++- __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4]; ++- __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4]; ++- ++- for(;number < quarterPoints; number++) ++- { ++- currentValues = _mm_loadu_ps(inputPtr); inputPtr += 4; ++- currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues); ++- compareResults = _mm_cmpgt_ps(currentValues, maxValues); ++- maxValuesIndex = _mm_or_ps(_mm_and_ps(compareResults, currentIndexes), ++- _mm_andnot_ps(compareResults, maxValuesIndex)); ++- maxValues = _mm_or_ps(_mm_and_ps(compareResults, currentValues), ++- _mm_andnot_ps(compareResults, maxValues)); ++- } ++- ++- // Calculate the largest value from the remaining 4 points ++- _mm_store_ps(maxValuesBuffer, maxValues); ++- _mm_store_ps(maxIndexesBuffer, maxValuesIndex); ++- ++- for(number = 0; number < 4; number++) ++- { ++- if(maxValuesBuffer[number] > max) ++- { ++- index = maxIndexesBuffer[number]; ++- max = maxValuesBuffer[number]; ++- } ++- else if(maxValuesBuffer[number] == max){ ++- if (index > maxIndexesBuffer[number]) ++- index = maxIndexesBuffer[number]; ++- } ++- } ++- ++- number = quarterPoints * 4; ++- for(;number < num_points; number++) ++- { ++- if(src0[number] > max) ++- { ++- index = number; ++- max = src0[number]; ++- } ++- } ++- target[0] = (uint32_t)index; +++ if (num_points > 0) { +++ uint32_t number = 0; +++ const uint32_t quarterPoints = num_points / 4; +++ +++ float* inputPtr = (float*)src0; +++ +++ __m128 indexIncrementValues = _mm_set1_ps(4); +++ __m128 currentIndexes = _mm_set_ps(-1, -2, -3, -4); +++ +++ float max = src0[0]; +++ float index = 0; +++ __m128 maxValues = _mm_set1_ps(max); +++ __m128 maxValuesIndex = _mm_setzero_ps(); +++ __m128 compareResults; +++ __m128 currentValues; +++ +++ __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4]; +++ __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4]; +++ +++ for (; number < quarterPoints; number++) { +++ currentValues = _mm_loadu_ps(inputPtr); +++ inputPtr += 4; +++ currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues); +++ compareResults = _mm_cmpgt_ps(currentValues, maxValues); +++ maxValuesIndex = _mm_or_ps(_mm_and_ps(compareResults, currentIndexes), +++ _mm_andnot_ps(compareResults, maxValuesIndex)); +++ maxValues = _mm_or_ps(_mm_and_ps(compareResults, currentValues), +++ _mm_andnot_ps(compareResults, maxValues)); ++ } +++ +++ // Calculate the largest value from the remaining 4 points +++ _mm_store_ps(maxValuesBuffer, maxValues); +++ 
_mm_store_ps(maxIndexesBuffer, maxValuesIndex); +++ +++ for (number = 0; number < 4; number++) { +++ if (maxValuesBuffer[number] > max) { +++ index = maxIndexesBuffer[number]; +++ max = maxValuesBuffer[number]; +++ } else if (maxValuesBuffer[number] == max) { +++ if (index > maxIndexesBuffer[number]) +++ index = maxIndexesBuffer[number]; +++ } +++ } +++ +++ number = quarterPoints * 4; +++ for (; number < num_points; number++) { +++ if (src0[number] > max) { +++ index = number; +++ max = src0[number]; +++ } +++ } +++ target[0] = (uint32_t)index; +++ } ++ } ++ ++ #endif /*LV_HAVE_SSE*/ ++diff --git a/kernels/volk/volk_32f_invsqrt_32f.h b/kernels/volk/volk_32f_invsqrt_32f.h ++index e416321..e545515 100644 ++--- a/kernels/volk/volk_32f_invsqrt_32f.h +++++ b/kernels/volk/volk_32f_invsqrt_32f.h ++@@ -30,8 +30,8 @@ ++ * ++ * Dispatcher Prototype ++ * \code ++- * void volk_32f_invsqrt_32f(float* cVector, const float* aVector, unsigned int num_points) ++- * \endcode +++ * void volk_32f_invsqrt_32f(float* cVector, const float* aVector, unsigned int +++ * num_points) \endcode ++ * ++ * \b Inputs ++ * \li aVector: the input vector of floats. ++@@ -66,27 +66,27 @@ ++ #define INCLUDED_volk_32f_invsqrt_32f_a_H ++ ++ #include ++-#include ++ #include +++#include ++ #include ++ ++-static inline float ++-Q_rsqrt(float number) +++static inline float Q_rsqrt(float number) ++ { ++- float x2; ++- const float threehalfs = 1.5F; ++- union f32_to_i32 { ++- int32_t i; ++- float f; ++- } u; ++- ++- x2 = number * 0.5F; ++- u.f = number; ++- u.i = 0x5f3759df - ( u.i >> 1 ); // what the fuck? ++- u.f = u.f * ( threehalfs - ( x2 * u.f * u.f ) ); // 1st iteration ++- //u.f = u.f * ( threehalfs - ( x2 * u.f * u.f ) ); // 2nd iteration, this can be removed ++- ++- return u.f; +++ float x2; +++ const float threehalfs = 1.5F; +++ union f32_to_i32 { +++ int32_t i; +++ float f; +++ } u; +++ +++ x2 = number * 0.5F; +++ u.f = number; +++ u.i = 0x5f3759df - (u.i >> 1); // what the fuck? 
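/* Aside (illustrative, not from the patch): Q_rsqrt above is the classic Quake III
 * fast inverse square root.  Reinterpreting the float bits as an integer makes the
 * bit pattern roughly proportional to log2(x), so 0x5f3759df - (i >> 1) gives an
 * approximation of the bits of x^(-1/2); the multiply on the following line is one
 * Newton-Raphson refinement, y' = y * (3/2 - x/2 * y * y).  A quick accuracy check
 * against libm (assuming Q_rsqrt is in scope) could look like: */
#include <math.h>
#include <stdio.h>

static void rsqrt_check(float x)
{
    float approx = Q_rsqrt(x);    /* bit hack plus one Newton step */
    float exact = 1.0f / sqrtf(x);
    printf("x=%g approx=%g exact=%g rel err=%g\n",
           x, approx, exact, fabsf(approx - exact) / exact);
}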
+++ u.f = u.f * (threehalfs - (x2 * u.f * u.f)); // 1st iteration +++ // u.f = u.f * ( threehalfs - ( x2 * u.f * u.f ) ); // 2nd iteration, this can be +++ // removed +++ +++ return u.f; ++ } ++ ++ #ifdef LV_HAVE_AVX ++@@ -95,24 +95,23 @@ Q_rsqrt(float number) ++ static inline void ++ volk_32f_invsqrt_32f_a_avx(float* cVector, const float* aVector, unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int eighthPoints = num_points / 8; ++- ++- float* cPtr = cVector; ++- const float* aPtr = aVector; ++- __m256 aVal, cVal; ++- for (; number < eighthPoints; number++) { ++- aVal = _mm256_load_ps(aPtr); ++- cVal = _mm256_rsqrt_ps(aVal); ++- _mm256_store_ps(cPtr, cVal); ++- aPtr += 8; ++- cPtr += 8; ++- } ++- ++- number = eighthPoints * 8; ++- for(;number < num_points; number++) ++- *cPtr++ = Q_rsqrt(*aPtr++); ++- +++ unsigned int number = 0; +++ const unsigned int eighthPoints = num_points / 8; +++ +++ float* cPtr = cVector; +++ const float* aPtr = aVector; +++ __m256 aVal, cVal; +++ for (; number < eighthPoints; number++) { +++ aVal = _mm256_load_ps(aPtr); +++ cVal = _mm256_rsqrt_ps(aVal); +++ _mm256_store_ps(cPtr, cVal); +++ aPtr += 8; +++ cPtr += 8; +++ } +++ +++ number = eighthPoints * 8; +++ for (; number < num_points; number++) +++ *cPtr++ = Q_rsqrt(*aPtr++); ++ } ++ #endif /* LV_HAVE_AVX */ ++ ++@@ -123,29 +122,29 @@ volk_32f_invsqrt_32f_a_avx(float* cVector, const float* aVector, unsigned int nu ++ static inline void ++ volk_32f_invsqrt_32f_a_sse(float* cVector, const float* aVector, unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int quarterPoints = num_points / 4; +++ unsigned int number = 0; +++ const unsigned int quarterPoints = num_points / 4; ++ ++- float* cPtr = cVector; ++- const float* aPtr = aVector; +++ float* cPtr = cVector; +++ const float* aPtr = aVector; ++ ++- __m128 aVal, cVal; ++- for(;number < quarterPoints; number++){ +++ __m128 aVal, cVal; +++ for (; number < quarterPoints; number++) { ++ ++- aVal = _mm_load_ps(aPtr); +++ aVal = _mm_load_ps(aPtr); ++ ++- cVal = _mm_rsqrt_ps(aVal); +++ cVal = _mm_rsqrt_ps(aVal); ++ ++- _mm_store_ps(cPtr,cVal); // Store the results back into the C container +++ _mm_store_ps(cPtr, cVal); // Store the results back into the C container ++ ++- aPtr += 4; ++- cPtr += 4; ++- } +++ aPtr += 4; +++ cPtr += 4; +++ } ++ ++- number = quarterPoints * 4; ++- for(;number < num_points; number++) { ++- *cPtr++ = Q_rsqrt(*aPtr++); ++- } +++ number = quarterPoints * 4; +++ for (; number < num_points; number++) { +++ *cPtr++ = Q_rsqrt(*aPtr++); +++ } ++ } ++ #endif /* LV_HAVE_SSE */ ++ ++@@ -156,37 +155,38 @@ volk_32f_invsqrt_32f_a_sse(float* cVector, const float* aVector, unsigned int nu ++ static inline void ++ volk_32f_invsqrt_32f_neon(float* cVector, const float* aVector, unsigned int num_points) ++ { ++- unsigned int number; ++- const unsigned int quarter_points = num_points / 4; ++- ++- float* cPtr = cVector; ++- const float* aPtr = aVector; ++- float32x4_t a_val, c_val; ++- for (number = 0; number < quarter_points; ++number) { ++- a_val = vld1q_f32(aPtr); ++- c_val = vrsqrteq_f32(a_val); ++- vst1q_f32(cPtr, c_val); ++- aPtr += 4; ++- cPtr += 4; ++- } ++- ++- for(number=quarter_points * 4;number < num_points; number++) ++- *cPtr++ = Q_rsqrt(*aPtr++); +++ unsigned int number; +++ const unsigned int quarter_points = num_points / 4; +++ +++ float* cPtr = cVector; +++ const float* aPtr = aVector; +++ float32x4_t a_val, c_val; +++ for (number = 0; number < quarter_points; ++number) { +++ a_val = 
vld1q_f32(aPtr); +++ c_val = vrsqrteq_f32(a_val); +++ vst1q_f32(cPtr, c_val); +++ aPtr += 4; +++ cPtr += 4; +++ } +++ +++ for (number = quarter_points * 4; number < num_points; number++) +++ *cPtr++ = Q_rsqrt(*aPtr++); ++ } ++ #endif /* LV_HAVE_NEON */ ++ ++ ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void ++-volk_32f_invsqrt_32f_generic(float* cVector, const float* aVector, unsigned int num_points) +++static inline void volk_32f_invsqrt_32f_generic(float* cVector, +++ const float* aVector, +++ unsigned int num_points) ++ { ++- float* cPtr = cVector; ++- const float* aPtr = aVector; ++- unsigned int number = 0; ++- for(number = 0; number < num_points; number++) { ++- *cPtr++ = Q_rsqrt(*aPtr++); ++- } +++ float* cPtr = cVector; +++ const float* aPtr = aVector; +++ unsigned int number = 0; +++ for (number = 0; number < num_points; number++) { +++ *cPtr++ = Q_rsqrt(*aPtr++); +++ } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++@@ -196,24 +196,23 @@ volk_32f_invsqrt_32f_generic(float* cVector, const float* aVector, unsigned int ++ static inline void ++ volk_32f_invsqrt_32f_u_avx(float* cVector, const float* aVector, unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int eighthPoints = num_points / 8; ++- ++- float* cPtr = cVector; ++- const float* aPtr = aVector; ++- __m256 aVal, cVal; ++- for (; number < eighthPoints; number++) { ++- aVal = _mm256_loadu_ps(aPtr); ++- cVal = _mm256_rsqrt_ps(aVal); ++- _mm256_storeu_ps(cPtr, cVal); ++- aPtr += 8; ++- cPtr += 8; ++- } ++- ++- number = eighthPoints * 8; ++- for(;number < num_points; number++) ++- *cPtr++ = Q_rsqrt(*aPtr++); ++- +++ unsigned int number = 0; +++ const unsigned int eighthPoints = num_points / 8; +++ +++ float* cPtr = cVector; +++ const float* aPtr = aVector; +++ __m256 aVal, cVal; +++ for (; number < eighthPoints; number++) { +++ aVal = _mm256_loadu_ps(aPtr); +++ cVal = _mm256_rsqrt_ps(aVal); +++ _mm256_storeu_ps(cPtr, cVal); +++ aPtr += 8; +++ cPtr += 8; +++ } +++ +++ number = eighthPoints * 8; +++ for (; number < num_points; number++) +++ *cPtr++ = Q_rsqrt(*aPtr++); ++ } ++ #endif /* LV_HAVE_AVX */ ++ ++diff --git a/kernels/volk/volk_32f_log2_32f.h b/kernels/volk/volk_32f_log2_32f.h ++index 740f89d..47276d4 100644 ++--- a/kernels/volk/volk_32f_log2_32f.h +++++ b/kernels/volk/volk_32f_log2_32f.h ++@@ -92,17 +92,18 @@ ++ #ifndef INCLUDED_volk_32f_log2_32f_a_H ++ #define INCLUDED_volk_32f_log2_32f_a_H ++ ++-#include ++-#include ++ #include ++ #include +++#include +++#include ++ ++ #define LOG_POLY_DEGREE 6 ++ ++ // +-Inf -> +-127.0f in order to match the behaviour of the SIMD kernels ++-static inline float log2f_non_ieee(float f) { ++- float const result = log2f(f); ++- return isinf(result) ? copysignf(127.0f, result) : result; +++static inline float log2f_non_ieee(float f) +++{ +++ float const result = log2f(f); +++ return isinf(result) ? 
copysignf(127.0f, result) : result; ++ } ++ ++ #ifdef LV_HAVE_GENERIC ++@@ -110,12 +111,12 @@ static inline float log2f_non_ieee(float f) { ++ static inline void ++ volk_32f_log2_32f_generic(float* bVector, const float* aVector, unsigned int num_points) ++ { ++- float* bPtr = bVector; ++- const float* aPtr = aVector; ++- unsigned int number = 0; +++ float* bPtr = bVector; +++ const float* aPtr = aVector; +++ unsigned int number = 0; ++ ++- for(number = 0; number < num_points; number++) ++- *bPtr++ = log2f_non_ieee(*aPtr++); +++ for (number = 0; number < num_points; number++) +++ *bPtr++ = log2f_non_ieee(*aPtr++); ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++@@ -123,56 +124,86 @@ volk_32f_log2_32f_generic(float* bVector, const float* aVector, unsigned int num ++ #include ++ ++ #define POLY0_FMAAVX2(x, c0) _mm256_set1_ps(c0) ++-#define POLY1_FMAAVX2(x, c0, c1) _mm256_fmadd_ps(POLY0_FMAAVX2(x, c1), x, _mm256_set1_ps(c0)) ++-#define POLY2_FMAAVX2(x, c0, c1, c2) _mm256_fmadd_ps(POLY1_FMAAVX2(x, c1, c2), x, _mm256_set1_ps(c0)) ++-#define POLY3_FMAAVX2(x, c0, c1, c2, c3) _mm256_fmadd_ps(POLY2_FMAAVX2(x, c1, c2, c3), x, _mm256_set1_ps(c0)) ++-#define POLY4_FMAAVX2(x, c0, c1, c2, c3, c4) _mm256_fmadd_ps(POLY3_FMAAVX2(x, c1, c2, c3, c4), x, _mm256_set1_ps(c0)) ++-#define POLY5_FMAAVX2(x, c0, c1, c2, c3, c4, c5) _mm256_fmadd_ps(POLY4_FMAAVX2(x, c1, c2, c3, c4, c5), x, _mm256_set1_ps(c0)) ++- ++-static inline void ++-volk_32f_log2_32f_a_avx2_fma(float* bVector, const float* aVector, unsigned int num_points) +++#define POLY1_FMAAVX2(x, c0, c1) \ +++ _mm256_fmadd_ps(POLY0_FMAAVX2(x, c1), x, _mm256_set1_ps(c0)) +++#define POLY2_FMAAVX2(x, c0, c1, c2) \ +++ _mm256_fmadd_ps(POLY1_FMAAVX2(x, c1, c2), x, _mm256_set1_ps(c0)) +++#define POLY3_FMAAVX2(x, c0, c1, c2, c3) \ +++ _mm256_fmadd_ps(POLY2_FMAAVX2(x, c1, c2, c3), x, _mm256_set1_ps(c0)) +++#define POLY4_FMAAVX2(x, c0, c1, c2, c3, c4) \ +++ _mm256_fmadd_ps(POLY3_FMAAVX2(x, c1, c2, c3, c4), x, _mm256_set1_ps(c0)) +++#define POLY5_FMAAVX2(x, c0, c1, c2, c3, c4, c5) \ +++ _mm256_fmadd_ps(POLY4_FMAAVX2(x, c1, c2, c3, c4, c5), x, _mm256_set1_ps(c0)) +++ +++static inline void volk_32f_log2_32f_a_avx2_fma(float* bVector, +++ const float* aVector, +++ unsigned int num_points) ++ { ++- float* bPtr = bVector; ++- const float* aPtr = aVector; +++ float* bPtr = bVector; +++ const float* aPtr = aVector; ++ ++- unsigned int number = 0; ++- const unsigned int eighthPoints = num_points / 8; +++ unsigned int number = 0; +++ const unsigned int eighthPoints = num_points / 8; ++ ++- __m256 aVal, bVal, mantissa, frac, leadingOne; ++- __m256i bias, exp; +++ __m256 aVal, bVal, mantissa, frac, leadingOne; +++ __m256i bias, exp; ++ ++- for(;number < eighthPoints; number++){ +++ for (; number < eighthPoints; number++) { ++ ++- aVal = _mm256_load_ps(aPtr); ++- bias = _mm256_set1_epi32(127); ++- leadingOne = _mm256_set1_ps(1.0f); ++- exp = _mm256_sub_epi32(_mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal), _mm256_set1_epi32(0x7f800000)), 23), bias); ++- bVal = _mm256_cvtepi32_ps(exp); +++ aVal = _mm256_load_ps(aPtr); +++ bias = _mm256_set1_epi32(127); +++ leadingOne = _mm256_set1_ps(1.0f); +++ exp = _mm256_sub_epi32( +++ _mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal), +++ _mm256_set1_epi32(0x7f800000)), +++ 23), +++ bias); +++ bVal = _mm256_cvtepi32_ps(exp); ++ ++- // Now to extract mantissa ++- frac = _mm256_or_ps(leadingOne, _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff)))); +++ // Now to extract mantissa +++ frac = _mm256_or_ps( +++ 
leadingOne, +++ _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff)))); ++ ++ #if LOG_POLY_DEGREE == 6 ++- mantissa = POLY5_FMAAVX2( frac, 3.1157899f, -3.3241990f, 2.5988452f, -1.2315303f, 3.1821337e-1f, -3.4436006e-2f); +++ mantissa = POLY5_FMAAVX2(frac, +++ 3.1157899f, +++ -3.3241990f, +++ 2.5988452f, +++ -1.2315303f, +++ 3.1821337e-1f, +++ -3.4436006e-2f); ++ #elif LOG_POLY_DEGREE == 5 ++- mantissa = POLY4_FMAAVX2( frac, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f); +++ mantissa = POLY4_FMAAVX2(frac, +++ 2.8882704548164776201f, +++ -2.52074962577807006663f, +++ 1.48116647521213171641f, +++ -0.465725644288844778798f, +++ 0.0596515482674574969533f); ++ #elif LOG_POLY_DEGREE == 4 ++- mantissa = POLY3_FMAAVX2( frac, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f); +++ mantissa = POLY3_FMAAVX2(frac, +++ 2.61761038894603480148f, +++ -1.75647175389045657003f, +++ 0.688243882994381274313f, +++ -0.107254423828329604454f); ++ #elif LOG_POLY_DEGREE == 3 ++- mantissa = POLY2_FMAAVX2( frac, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f); +++ mantissa = POLY2_FMAAVX2(frac, +++ 2.28330284476918490682f, +++ -1.04913055217340124191f, +++ 0.204446009836232697516f); ++ #else ++ #error ++ #endif ++ ++- bVal = _mm256_fmadd_ps(mantissa, _mm256_sub_ps(frac, leadingOne), bVal); ++- _mm256_store_ps(bPtr, bVal); +++ bVal = _mm256_fmadd_ps(mantissa, _mm256_sub_ps(frac, leadingOne), bVal); +++ _mm256_store_ps(bPtr, bVal); ++ ++- aPtr += 8; ++- bPtr += 8; ++- } +++ aPtr += 8; +++ bPtr += 8; +++ } ++ ++- number = eighthPoints * 8; ++- volk_32f_log2_32f_generic(bPtr, aPtr, num_points-number); +++ number = eighthPoints * 8; +++ volk_32f_log2_32f_generic(bPtr, aPtr, num_points - number); ++ } ++ ++ #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for aligned */ ++@@ -181,56 +212,86 @@ volk_32f_log2_32f_a_avx2_fma(float* bVector, const float* aVector, unsigned int ++ #include ++ ++ #define POLY0_AVX2(x, c0) _mm256_set1_ps(c0) ++-#define POLY1_AVX2(x, c0, c1) _mm256_add_ps(_mm256_mul_ps(POLY0_AVX2(x, c1), x), _mm256_set1_ps(c0)) ++-#define POLY2_AVX2(x, c0, c1, c2) _mm256_add_ps(_mm256_mul_ps(POLY1_AVX2(x, c1, c2), x), _mm256_set1_ps(c0)) ++-#define POLY3_AVX2(x, c0, c1, c2, c3) _mm256_add_ps(_mm256_mul_ps(POLY2_AVX2(x, c1, c2, c3), x), _mm256_set1_ps(c0)) ++-#define POLY4_AVX2(x, c0, c1, c2, c3, c4) _mm256_add_ps(_mm256_mul_ps(POLY3_AVX2(x, c1, c2, c3, c4), x), _mm256_set1_ps(c0)) ++-#define POLY5_AVX2(x, c0, c1, c2, c3, c4, c5) _mm256_add_ps(_mm256_mul_ps(POLY4_AVX2(x, c1, c2, c3, c4, c5), x), _mm256_set1_ps(c0)) +++#define POLY1_AVX2(x, c0, c1) \ +++ _mm256_add_ps(_mm256_mul_ps(POLY0_AVX2(x, c1), x), _mm256_set1_ps(c0)) +++#define POLY2_AVX2(x, c0, c1, c2) \ +++ _mm256_add_ps(_mm256_mul_ps(POLY1_AVX2(x, c1, c2), x), _mm256_set1_ps(c0)) +++#define POLY3_AVX2(x, c0, c1, c2, c3) \ +++ _mm256_add_ps(_mm256_mul_ps(POLY2_AVX2(x, c1, c2, c3), x), _mm256_set1_ps(c0)) +++#define POLY4_AVX2(x, c0, c1, c2, c3, c4) \ +++ _mm256_add_ps(_mm256_mul_ps(POLY3_AVX2(x, c1, c2, c3, c4), x), _mm256_set1_ps(c0)) +++#define POLY5_AVX2(x, c0, c1, c2, c3, c4, c5) \ +++ _mm256_add_ps(_mm256_mul_ps(POLY4_AVX2(x, c1, c2, c3, c4, c5), x), _mm256_set1_ps(c0)) ++ ++ static inline void ++ volk_32f_log2_32f_a_avx2(float* bVector, const float* aVector, unsigned int num_points) ++ { ++- float* bPtr = bVector; ++- const float* aPtr = aVector; +++ float* bPtr = bVector; +++ const 
float* aPtr = aVector; ++ ++- unsigned int number = 0; ++- const unsigned int eighthPoints = num_points / 8; +++ unsigned int number = 0; +++ const unsigned int eighthPoints = num_points / 8; ++ ++- __m256 aVal, bVal, mantissa, frac, leadingOne; ++- __m256i bias, exp; +++ __m256 aVal, bVal, mantissa, frac, leadingOne; +++ __m256i bias, exp; ++ ++- for(;number < eighthPoints; number++){ +++ for (; number < eighthPoints; number++) { ++ ++- aVal = _mm256_load_ps(aPtr); ++- bias = _mm256_set1_epi32(127); ++- leadingOne = _mm256_set1_ps(1.0f); ++- exp = _mm256_sub_epi32(_mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal), _mm256_set1_epi32(0x7f800000)), 23), bias); ++- bVal = _mm256_cvtepi32_ps(exp); +++ aVal = _mm256_load_ps(aPtr); +++ bias = _mm256_set1_epi32(127); +++ leadingOne = _mm256_set1_ps(1.0f); +++ exp = _mm256_sub_epi32( +++ _mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal), +++ _mm256_set1_epi32(0x7f800000)), +++ 23), +++ bias); +++ bVal = _mm256_cvtepi32_ps(exp); ++ ++- // Now to extract mantissa ++- frac = _mm256_or_ps(leadingOne, _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff)))); +++ // Now to extract mantissa +++ frac = _mm256_or_ps( +++ leadingOne, +++ _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff)))); ++ ++ #if LOG_POLY_DEGREE == 6 ++- mantissa = POLY5_AVX2( frac, 3.1157899f, -3.3241990f, 2.5988452f, -1.2315303f, 3.1821337e-1f, -3.4436006e-2f); +++ mantissa = POLY5_AVX2(frac, +++ 3.1157899f, +++ -3.3241990f, +++ 2.5988452f, +++ -1.2315303f, +++ 3.1821337e-1f, +++ -3.4436006e-2f); ++ #elif LOG_POLY_DEGREE == 5 ++- mantissa = POLY4_AVX2( frac, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f); +++ mantissa = POLY4_AVX2(frac, +++ 2.8882704548164776201f, +++ -2.52074962577807006663f, +++ 1.48116647521213171641f, +++ -0.465725644288844778798f, +++ 0.0596515482674574969533f); ++ #elif LOG_POLY_DEGREE == 4 ++- mantissa = POLY3_AVX2( frac, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f); +++ mantissa = POLY3_AVX2(frac, +++ 2.61761038894603480148f, +++ -1.75647175389045657003f, +++ 0.688243882994381274313f, +++ -0.107254423828329604454f); ++ #elif LOG_POLY_DEGREE == 3 ++- mantissa = POLY2_AVX2( frac, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f); +++ mantissa = POLY2_AVX2(frac, +++ 2.28330284476918490682f, +++ -1.04913055217340124191f, +++ 0.204446009836232697516f); ++ #else ++ #error ++ #endif ++ ++- bVal = _mm256_add_ps(_mm256_mul_ps(mantissa, _mm256_sub_ps(frac, leadingOne)), bVal); ++- _mm256_store_ps(bPtr, bVal); +++ bVal = +++ _mm256_add_ps(_mm256_mul_ps(mantissa, _mm256_sub_ps(frac, leadingOne)), bVal); +++ _mm256_store_ps(bPtr, bVal); ++ ++- aPtr += 8; ++- bPtr += 8; ++- } +++ aPtr += 8; +++ bPtr += 8; +++ } ++ ++- number = eighthPoints * 8; ++- volk_32f_log2_32f_generic(bPtr, aPtr, num_points-number); +++ number = eighthPoints * 8; +++ volk_32f_log2_32f_generic(bPtr, aPtr, num_points - number); ++ } ++ ++ #endif /* LV_HAVE_AVX2 for aligned */ ++@@ -241,54 +302,79 @@ volk_32f_log2_32f_a_avx2(float* bVector, const float* aVector, unsigned int num_ ++ #define POLY0(x, c0) _mm_set1_ps(c0) ++ #define POLY1(x, c0, c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), _mm_set1_ps(c0)) ++ #define POLY2(x, c0, c1, c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), _mm_set1_ps(c0)) ++-#define POLY3(x, c0, c1, c2, c3) _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), 
_mm_set1_ps(c0)) ++-#define POLY4(x, c0, c1, c2, c3, c4) _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0)) ++-#define POLY5(x, c0, c1, c2, c3, c4, c5) _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0)) +++#define POLY3(x, c0, c1, c2, c3) \ +++ _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0)) +++#define POLY4(x, c0, c1, c2, c3, c4) \ +++ _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0)) +++#define POLY5(x, c0, c1, c2, c3, c4, c5) \ +++ _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0)) ++ ++ static inline void ++ volk_32f_log2_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int num_points) ++ { ++- float* bPtr = bVector; ++- const float* aPtr = aVector; +++ float* bPtr = bVector; +++ const float* aPtr = aVector; ++ ++- unsigned int number = 0; ++- const unsigned int quarterPoints = num_points / 4; +++ unsigned int number = 0; +++ const unsigned int quarterPoints = num_points / 4; ++ ++- __m128 aVal, bVal, mantissa, frac, leadingOne; ++- __m128i bias, exp; +++ __m128 aVal, bVal, mantissa, frac, leadingOne; +++ __m128i bias, exp; ++ ++- for(;number < quarterPoints; number++){ +++ for (; number < quarterPoints; number++) { ++ ++- aVal = _mm_load_ps(aPtr); ++- bias = _mm_set1_epi32(127); ++- leadingOne = _mm_set1_ps(1.0f); ++- exp = _mm_sub_epi32(_mm_srli_epi32(_mm_and_si128(_mm_castps_si128(aVal), _mm_set1_epi32(0x7f800000)), 23), bias); ++- bVal = _mm_cvtepi32_ps(exp); +++ aVal = _mm_load_ps(aPtr); +++ bias = _mm_set1_epi32(127); +++ leadingOne = _mm_set1_ps(1.0f); +++ exp = _mm_sub_epi32( +++ _mm_srli_epi32( +++ _mm_and_si128(_mm_castps_si128(aVal), _mm_set1_epi32(0x7f800000)), 23), +++ bias); +++ bVal = _mm_cvtepi32_ps(exp); ++ ++- // Now to extract mantissa ++- frac = _mm_or_ps(leadingOne, _mm_and_ps(aVal, _mm_castsi128_ps(_mm_set1_epi32(0x7fffff)))); +++ // Now to extract mantissa +++ frac = _mm_or_ps(leadingOne, +++ _mm_and_ps(aVal, _mm_castsi128_ps(_mm_set1_epi32(0x7fffff)))); ++ ++ #if LOG_POLY_DEGREE == 6 ++- mantissa = POLY5( frac, 3.1157899f, -3.3241990f, 2.5988452f, -1.2315303f, 3.1821337e-1f, -3.4436006e-2f); +++ mantissa = POLY5(frac, +++ 3.1157899f, +++ -3.3241990f, +++ 2.5988452f, +++ -1.2315303f, +++ 3.1821337e-1f, +++ -3.4436006e-2f); ++ #elif LOG_POLY_DEGREE == 5 ++- mantissa = POLY4( frac, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f); +++ mantissa = POLY4(frac, +++ 2.8882704548164776201f, +++ -2.52074962577807006663f, +++ 1.48116647521213171641f, +++ -0.465725644288844778798f, +++ 0.0596515482674574969533f); ++ #elif LOG_POLY_DEGREE == 4 ++- mantissa = POLY3( frac, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f); +++ mantissa = POLY3(frac, +++ 2.61761038894603480148f, +++ -1.75647175389045657003f, +++ 0.688243882994381274313f, +++ -0.107254423828329604454f); ++ #elif LOG_POLY_DEGREE == 3 ++- mantissa = POLY2( frac, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f); +++ mantissa = POLY2(frac, +++ 2.28330284476918490682f, +++ -1.04913055217340124191f, +++ 0.204446009836232697516f); ++ #else ++ #error ++ #endif ++ ++- bVal = _mm_add_ps(bVal, _mm_mul_ps(mantissa, _mm_sub_ps(frac, leadingOne))); ++- _mm_store_ps(bPtr, bVal); +++ bVal = _mm_add_ps(bVal, _mm_mul_ps(mantissa, _mm_sub_ps(frac, leadingOne))); +++ _mm_store_ps(bPtr, bVal); ++ ++- aPtr += 4; ++- bPtr += 4; ++- } +++ aPtr += 4; +++ bPtr += 4; +++ } 
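/* Aside (illustrative scalar sketch, not from the patch): per element, the SIMD
 * loop above splits the IEEE-754 bits into the unbiased exponent and a mantissa in
 * [1,2), then adds a polynomial approximation of log2(mantissa); the POLY* macros
 * are Horner's rule written with intrinsics.  Using the LOG_POLY_DEGREE == 3
 * coefficients from this kernel: */
#include <stdint.h>
#include <string.h>

static float log2_scalar_sketch(float x) /* assumes x is positive and finite */
{
    uint32_t bits;
    memcpy(&bits, &x, sizeof bits);
    float exponent = (float)((int)((bits >> 23) & 0xff) - 127);
    uint32_t mbits = (bits & 0x007fffffu) | 0x3f800000u; /* mantissa in [1,2) */
    float m;
    memcpy(&m, &mbits, sizeof m);
    float poly = 2.28330284476918490682f +
                 m * (-1.04913055217340124191f + m * 0.204446009836232697516f);
    return exponent + poly * (m - 1.0f);
}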
++ ++- number = quarterPoints * 4; ++- volk_32f_log2_32f_generic(bPtr, aPtr, num_points-number); +++ number = quarterPoints * 4; +++ volk_32f_log2_32f_generic(bPtr, aPtr, num_points - number); ++ } ++ ++ #endif /* LV_HAVE_SSE4_1 for aligned */ ++@@ -297,91 +383,91 @@ volk_32f_log2_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int nu ++ #include ++ ++ /* these macros allow us to embed logs in other kernels */ ++-#define VLOG2Q_NEON_PREAMBLE() \ ++- int32x4_t one = vdupq_n_s32(0x000800000); \ ++- /* minimax polynomial */ \ ++- float32x4_t p0 = vdupq_n_f32(-3.0400402727048585); \ ++- float32x4_t p1 = vdupq_n_f32(6.1129631282966113); \ ++- float32x4_t p2 = vdupq_n_f32(-5.3419892024633207); \ ++- float32x4_t p3 = vdupq_n_f32(3.2865287703753912); \ ++- float32x4_t p4 = vdupq_n_f32(-1.2669182593441635); \ ++- float32x4_t p5 = vdupq_n_f32(0.2751487703421256); \ ++- float32x4_t p6 = vdupq_n_f32(-0.0256910888150985); \ ++- int32x4_t exp_mask = vdupq_n_s32(0x7f800000); \ ++- int32x4_t sig_mask = vdupq_n_s32(0x007fffff); \ ++- int32x4_t exp_bias = vdupq_n_s32(127); ++- ++- ++-#define VLOG2Q_NEON_F32(log2_approx, aval) \ ++- int32x4_t exponent_i = vandq_s32(aval, exp_mask); \ ++- int32x4_t significand_i = vandq_s32(aval, sig_mask); \ ++- exponent_i = vshrq_n_s32(exponent_i, 23); \ ++- \ ++- /* extract the exponent and significand \ ++- we can treat this as fixed point to save ~9% on the \ ++- conversion + float add */ \ ++- significand_i = vorrq_s32(one, significand_i); \ ++- float32x4_t significand_f = vcvtq_n_f32_s32(significand_i,23); \ ++- /* debias the exponent and convert to float */ \ ++- exponent_i = vsubq_s32(exponent_i, exp_bias); \ ++- float32x4_t exponent_f = vcvtq_f32_s32(exponent_i); \ ++- \ ++- /* put the significand through a polynomial fit of log2(x) [1,2] \ ++- add the result to the exponent */ \ ++- log2_approx = vaddq_f32(exponent_f, p0); /* p0 */ \ ++- float32x4_t tmp1 = vmulq_f32(significand_f, p1); /* p1 * x */ \ ++- log2_approx = vaddq_f32(log2_approx, tmp1); \ ++- float32x4_t sig_2 = vmulq_f32(significand_f, significand_f); /* x^2 */ \ ++- tmp1 = vmulq_f32(sig_2, p2); /* p2 * x^2 */ \ ++- log2_approx = vaddq_f32(log2_approx, tmp1); \ ++- \ ++- float32x4_t sig_3 = vmulq_f32(sig_2, significand_f); /* x^3 */ \ ++- tmp1 = vmulq_f32(sig_3, p3); /* p3 * x^3 */ \ ++- log2_approx = vaddq_f32(log2_approx, tmp1); \ ++- float32x4_t sig_4 = vmulq_f32(sig_2, sig_2); /* x^4 */ \ ++- tmp1 = vmulq_f32(sig_4, p4); /* p4 * x^4 */ \ ++- log2_approx = vaddq_f32(log2_approx, tmp1); \ ++- float32x4_t sig_5 = vmulq_f32(sig_3, sig_2); /* x^5 */ \ ++- tmp1 = vmulq_f32(sig_5, p5); /* p5 * x^5 */ \ ++- log2_approx = vaddq_f32(log2_approx, tmp1); \ ++- float32x4_t sig_6 = vmulq_f32(sig_3, sig_3); /* x^6 */ \ ++- tmp1 = vmulq_f32(sig_6, p6); /* p6 * x^6 */ \ ++- log2_approx = vaddq_f32(log2_approx, tmp1); +++#define VLOG2Q_NEON_PREAMBLE() \ +++ int32x4_t one = vdupq_n_s32(0x000800000); \ +++ /* minimax polynomial */ \ +++ float32x4_t p0 = vdupq_n_f32(-3.0400402727048585); \ +++ float32x4_t p1 = vdupq_n_f32(6.1129631282966113); \ +++ float32x4_t p2 = vdupq_n_f32(-5.3419892024633207); \ +++ float32x4_t p3 = vdupq_n_f32(3.2865287703753912); \ +++ float32x4_t p4 = vdupq_n_f32(-1.2669182593441635); \ +++ float32x4_t p5 = vdupq_n_f32(0.2751487703421256); \ +++ float32x4_t p6 = vdupq_n_f32(-0.0256910888150985); \ +++ int32x4_t exp_mask = vdupq_n_s32(0x7f800000); \ +++ int32x4_t sig_mask = vdupq_n_s32(0x007fffff); \ +++ int32x4_t exp_bias = vdupq_n_s32(127); +++ +++ +++#define 
VLOG2Q_NEON_F32(log2_approx, aval) \ +++ int32x4_t exponent_i = vandq_s32(aval, exp_mask); \ +++ int32x4_t significand_i = vandq_s32(aval, sig_mask); \ +++ exponent_i = vshrq_n_s32(exponent_i, 23); \ +++ \ +++ /* extract the exponent and significand \ +++ we can treat this as fixed point to save ~9% on the \ +++ conversion + float add */ \ +++ significand_i = vorrq_s32(one, significand_i); \ +++ float32x4_t significand_f = vcvtq_n_f32_s32(significand_i, 23); \ +++ /* debias the exponent and convert to float */ \ +++ exponent_i = vsubq_s32(exponent_i, exp_bias); \ +++ float32x4_t exponent_f = vcvtq_f32_s32(exponent_i); \ +++ \ +++ /* put the significand through a polynomial fit of log2(x) [1,2] \ +++ add the result to the exponent */ \ +++ log2_approx = vaddq_f32(exponent_f, p0); /* p0 */ \ +++ float32x4_t tmp1 = vmulq_f32(significand_f, p1); /* p1 * x */ \ +++ log2_approx = vaddq_f32(log2_approx, tmp1); \ +++ float32x4_t sig_2 = vmulq_f32(significand_f, significand_f); /* x^2 */ \ +++ tmp1 = vmulq_f32(sig_2, p2); /* p2 * x^2 */ \ +++ log2_approx = vaddq_f32(log2_approx, tmp1); \ +++ \ +++ float32x4_t sig_3 = vmulq_f32(sig_2, significand_f); /* x^3 */ \ +++ tmp1 = vmulq_f32(sig_3, p3); /* p3 * x^3 */ \ +++ log2_approx = vaddq_f32(log2_approx, tmp1); \ +++ float32x4_t sig_4 = vmulq_f32(sig_2, sig_2); /* x^4 */ \ +++ tmp1 = vmulq_f32(sig_4, p4); /* p4 * x^4 */ \ +++ log2_approx = vaddq_f32(log2_approx, tmp1); \ +++ float32x4_t sig_5 = vmulq_f32(sig_3, sig_2); /* x^5 */ \ +++ tmp1 = vmulq_f32(sig_5, p5); /* p5 * x^5 */ \ +++ log2_approx = vaddq_f32(log2_approx, tmp1); \ +++ float32x4_t sig_6 = vmulq_f32(sig_3, sig_3); /* x^6 */ \ +++ tmp1 = vmulq_f32(sig_6, p6); /* p6 * x^6 */ \ +++ log2_approx = vaddq_f32(log2_approx, tmp1); ++ ++ static inline void ++ volk_32f_log2_32f_neon(float* bVector, const float* aVector, unsigned int num_points) ++ { ++- float* bPtr = bVector; ++- const float* aPtr = aVector; ++- unsigned int number; ++- const unsigned int quarterPoints = num_points / 4; +++ float* bPtr = bVector; +++ const float* aPtr = aVector; +++ unsigned int number; +++ const unsigned int quarterPoints = num_points / 4; ++ ++- int32x4_t aval; ++- float32x4_t log2_approx; +++ int32x4_t aval; +++ float32x4_t log2_approx; ++ ++- VLOG2Q_NEON_PREAMBLE() ++- // lms ++- //p0 = vdupq_n_f32(-1.649132280361871); ++- //p1 = vdupq_n_f32(1.995047138579499); ++- //p2 = vdupq_n_f32(-0.336914839219728); +++ VLOG2Q_NEON_PREAMBLE() +++ // lms +++ // p0 = vdupq_n_f32(-1.649132280361871); +++ // p1 = vdupq_n_f32(1.995047138579499); +++ // p2 = vdupq_n_f32(-0.336914839219728); ++ ++- // keep in mind a single precision float is represented as ++- // (-1)^sign * 2^exp * 1.significand, so the log2 is ++- // log2(2^exp * sig) = exponent + log2(1 + significand/(1<<23) ++- for(number = 0; number < quarterPoints; ++number){ ++- // load float in to an int register without conversion ++- aval = vld1q_s32((int*)aPtr); +++ // keep in mind a single precision float is represented as +++ // (-1)^sign * 2^exp * 1.significand, so the log2 is +++ // log2(2^exp * sig) = exponent + log2(1 + significand/(1<<23) +++ for (number = 0; number < quarterPoints; ++number) { +++ // load float in to an int register without conversion +++ aval = vld1q_s32((int*)aPtr); ++ ++- VLOG2Q_NEON_F32(log2_approx, aval) +++ VLOG2Q_NEON_F32(log2_approx, aval) ++ ++- vst1q_f32(bPtr, log2_approx); +++ vst1q_f32(bPtr, log2_approx); ++ ++- aPtr += 4; ++- bPtr += 4; ++- } +++ aPtr += 4; +++ bPtr += 4; +++ } ++ ++- number = quarterPoints * 4; ++- 
volk_32f_log2_32f_generic(bPtr, aPtr, num_points-number); +++ number = quarterPoints * 4; +++ volk_32f_log2_32f_generic(bPtr, aPtr, num_points - number); ++ } ++ ++ #endif /* LV_HAVE_NEON */ ++@@ -398,14 +484,14 @@ volk_32f_log2_32f_neon(float* bVector, const float* aVector, unsigned int num_po ++ static inline void ++ volk_32f_log2_32f_u_generic(float* bVector, const float* aVector, unsigned int num_points) ++ { ++- float* bPtr = bVector; ++- const float* aPtr = aVector; ++- unsigned int number = 0; ++- ++- for(number = 0; number < num_points; number++){ ++- float const result = log2f(*aPtr++); ++- *bPtr++ = isinf(result) ? -127.0f : result; ++- } +++ float* bPtr = bVector; +++ const float* aPtr = aVector; +++ unsigned int number = 0; +++ +++ for (number = 0; number < num_points; number++) { +++ float const result = log2f(*aPtr++); +++ *bPtr++ = isinf(result) ? -127.0f : result; +++ } ++ } ++ ++ #endif /* LV_HAVE_GENERIC */ ++@@ -417,54 +503,79 @@ volk_32f_log2_32f_u_generic(float* bVector, const float* aVector, unsigned int n ++ #define POLY0(x, c0) _mm_set1_ps(c0) ++ #define POLY1(x, c0, c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), _mm_set1_ps(c0)) ++ #define POLY2(x, c0, c1, c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), _mm_set1_ps(c0)) ++-#define POLY3(x, c0, c1, c2, c3) _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0)) ++-#define POLY4(x, c0, c1, c2, c3, c4) _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0)) ++-#define POLY5(x, c0, c1, c2, c3, c4, c5) _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0)) +++#define POLY3(x, c0, c1, c2, c3) \ +++ _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0)) +++#define POLY4(x, c0, c1, c2, c3, c4) \ +++ _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0)) +++#define POLY5(x, c0, c1, c2, c3, c4, c5) \ +++ _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0)) ++ ++ static inline void ++ volk_32f_log2_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int num_points) ++ { ++- float* bPtr = bVector; ++- const float* aPtr = aVector; +++ float* bPtr = bVector; +++ const float* aPtr = aVector; ++ ++- unsigned int number = 0; ++- const unsigned int quarterPoints = num_points / 4; +++ unsigned int number = 0; +++ const unsigned int quarterPoints = num_points / 4; ++ ++- __m128 aVal, bVal, mantissa, frac, leadingOne; ++- __m128i bias, exp; +++ __m128 aVal, bVal, mantissa, frac, leadingOne; +++ __m128i bias, exp; ++ ++- for(;number < quarterPoints; number++){ +++ for (; number < quarterPoints; number++) { ++ ++- aVal = _mm_loadu_ps(aPtr); ++- bias = _mm_set1_epi32(127); ++- leadingOne = _mm_set1_ps(1.0f); ++- exp = _mm_sub_epi32(_mm_srli_epi32(_mm_and_si128(_mm_castps_si128(aVal), _mm_set1_epi32(0x7f800000)), 23), bias); ++- bVal = _mm_cvtepi32_ps(exp); +++ aVal = _mm_loadu_ps(aPtr); +++ bias = _mm_set1_epi32(127); +++ leadingOne = _mm_set1_ps(1.0f); +++ exp = _mm_sub_epi32( +++ _mm_srli_epi32( +++ _mm_and_si128(_mm_castps_si128(aVal), _mm_set1_epi32(0x7f800000)), 23), +++ bias); +++ bVal = _mm_cvtepi32_ps(exp); ++ ++- // Now to extract mantissa ++- frac = _mm_or_ps(leadingOne, _mm_and_ps(aVal, _mm_castsi128_ps(_mm_set1_epi32(0x7fffff)))); +++ // Now to extract mantissa +++ frac = _mm_or_ps(leadingOne, +++ _mm_and_ps(aVal, _mm_castsi128_ps(_mm_set1_epi32(0x7fffff)))); ++ ++ #if LOG_POLY_DEGREE == 6 ++- mantissa = POLY5( frac, 3.1157899f, -3.3241990f, 2.5988452f, -1.2315303f, 3.1821337e-1f, -3.4436006e-2f); +++ mantissa = POLY5(frac, +++ 3.1157899f, 
+++ -3.3241990f, +++ 2.5988452f, +++ -1.2315303f, +++ 3.1821337e-1f, +++ -3.4436006e-2f); ++ #elif LOG_POLY_DEGREE == 5 ++- mantissa = POLY4( frac, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f); +++ mantissa = POLY4(frac, +++ 2.8882704548164776201f, +++ -2.52074962577807006663f, +++ 1.48116647521213171641f, +++ -0.465725644288844778798f, +++ 0.0596515482674574969533f); ++ #elif LOG_POLY_DEGREE == 4 ++- mantissa = POLY3( frac, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f); +++ mantissa = POLY3(frac, +++ 2.61761038894603480148f, +++ -1.75647175389045657003f, +++ 0.688243882994381274313f, +++ -0.107254423828329604454f); ++ #elif LOG_POLY_DEGREE == 3 ++- mantissa = POLY2( frac, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f); +++ mantissa = POLY2(frac, +++ 2.28330284476918490682f, +++ -1.04913055217340124191f, +++ 0.204446009836232697516f); ++ #else ++ #error ++ #endif ++ ++- bVal = _mm_add_ps(bVal, _mm_mul_ps(mantissa, _mm_sub_ps(frac, leadingOne))); ++- _mm_storeu_ps(bPtr, bVal); +++ bVal = _mm_add_ps(bVal, _mm_mul_ps(mantissa, _mm_sub_ps(frac, leadingOne))); +++ _mm_storeu_ps(bPtr, bVal); ++ ++- aPtr += 4; ++- bPtr += 4; ++- } +++ aPtr += 4; +++ bPtr += 4; +++ } ++ ++- number = quarterPoints * 4; ++- volk_32f_log2_32f_u_generic(bPtr, aPtr, num_points-number); +++ number = quarterPoints * 4; +++ volk_32f_log2_32f_u_generic(bPtr, aPtr, num_points - number); ++ } ++ ++ #endif /* LV_HAVE_SSE4_1 for unaligned */ ++@@ -473,56 +584,86 @@ volk_32f_log2_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int nu ++ #include ++ ++ #define POLY0_FMAAVX2(x, c0) _mm256_set1_ps(c0) ++-#define POLY1_FMAAVX2(x, c0, c1) _mm256_fmadd_ps(POLY0_FMAAVX2(x, c1), x, _mm256_set1_ps(c0)) ++-#define POLY2_FMAAVX2(x, c0, c1, c2) _mm256_fmadd_ps(POLY1_FMAAVX2(x, c1, c2), x, _mm256_set1_ps(c0)) ++-#define POLY3_FMAAVX2(x, c0, c1, c2, c3) _mm256_fmadd_ps(POLY2_FMAAVX2(x, c1, c2, c3), x, _mm256_set1_ps(c0)) ++-#define POLY4_FMAAVX2(x, c0, c1, c2, c3, c4) _mm256_fmadd_ps(POLY3_FMAAVX2(x, c1, c2, c3, c4), x, _mm256_set1_ps(c0)) ++-#define POLY5_FMAAVX2(x, c0, c1, c2, c3, c4, c5) _mm256_fmadd_ps(POLY4_FMAAVX2(x, c1, c2, c3, c4, c5), x, _mm256_set1_ps(c0)) ++- ++-static inline void ++-volk_32f_log2_32f_u_avx2_fma(float* bVector, const float* aVector, unsigned int num_points) +++#define POLY1_FMAAVX2(x, c0, c1) \ +++ _mm256_fmadd_ps(POLY0_FMAAVX2(x, c1), x, _mm256_set1_ps(c0)) +++#define POLY2_FMAAVX2(x, c0, c1, c2) \ +++ _mm256_fmadd_ps(POLY1_FMAAVX2(x, c1, c2), x, _mm256_set1_ps(c0)) +++#define POLY3_FMAAVX2(x, c0, c1, c2, c3) \ +++ _mm256_fmadd_ps(POLY2_FMAAVX2(x, c1, c2, c3), x, _mm256_set1_ps(c0)) +++#define POLY4_FMAAVX2(x, c0, c1, c2, c3, c4) \ +++ _mm256_fmadd_ps(POLY3_FMAAVX2(x, c1, c2, c3, c4), x, _mm256_set1_ps(c0)) +++#define POLY5_FMAAVX2(x, c0, c1, c2, c3, c4, c5) \ +++ _mm256_fmadd_ps(POLY4_FMAAVX2(x, c1, c2, c3, c4, c5), x, _mm256_set1_ps(c0)) +++ +++static inline void volk_32f_log2_32f_u_avx2_fma(float* bVector, +++ const float* aVector, +++ unsigned int num_points) ++ { ++- float* bPtr = bVector; ++- const float* aPtr = aVector; +++ float* bPtr = bVector; +++ const float* aPtr = aVector; ++ ++- unsigned int number = 0; ++- const unsigned int eighthPoints = num_points / 8; +++ unsigned int number = 0; +++ const unsigned int eighthPoints = num_points / 8; ++ ++- __m256 aVal, bVal, mantissa, frac, leadingOne; ++- __m256i bias, exp; +++ 
__m256 aVal, bVal, mantissa, frac, leadingOne; +++ __m256i bias, exp; ++ ++- for(;number < eighthPoints; number++){ +++ for (; number < eighthPoints; number++) { ++ ++- aVal = _mm256_loadu_ps(aPtr); ++- bias = _mm256_set1_epi32(127); ++- leadingOne = _mm256_set1_ps(1.0f); ++- exp = _mm256_sub_epi32(_mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal), _mm256_set1_epi32(0x7f800000)), 23), bias); ++- bVal = _mm256_cvtepi32_ps(exp); +++ aVal = _mm256_loadu_ps(aPtr); +++ bias = _mm256_set1_epi32(127); +++ leadingOne = _mm256_set1_ps(1.0f); +++ exp = _mm256_sub_epi32( +++ _mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal), +++ _mm256_set1_epi32(0x7f800000)), +++ 23), +++ bias); +++ bVal = _mm256_cvtepi32_ps(exp); ++ ++- // Now to extract mantissa ++- frac = _mm256_or_ps(leadingOne, _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff)))); +++ // Now to extract mantissa +++ frac = _mm256_or_ps( +++ leadingOne, +++ _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff)))); ++ ++ #if LOG_POLY_DEGREE == 6 ++- mantissa = POLY5_FMAAVX2( frac, 3.1157899f, -3.3241990f, 2.5988452f, -1.2315303f, 3.1821337e-1f, -3.4436006e-2f); +++ mantissa = POLY5_FMAAVX2(frac, +++ 3.1157899f, +++ -3.3241990f, +++ 2.5988452f, +++ -1.2315303f, +++ 3.1821337e-1f, +++ -3.4436006e-2f); ++ #elif LOG_POLY_DEGREE == 5 ++- mantissa = POLY4_FMAAVX2( frac, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f); +++ mantissa = POLY4_FMAAVX2(frac, +++ 2.8882704548164776201f, +++ -2.52074962577807006663f, +++ 1.48116647521213171641f, +++ -0.465725644288844778798f, +++ 0.0596515482674574969533f); ++ #elif LOG_POLY_DEGREE == 4 ++- mantissa = POLY3_FMAAVX2( frac, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f); +++ mantissa = POLY3_FMAAVX2(frac, +++ 2.61761038894603480148f, +++ -1.75647175389045657003f, +++ 0.688243882994381274313f, +++ -0.107254423828329604454f); ++ #elif LOG_POLY_DEGREE == 3 ++- mantissa = POLY2_FMAAVX2( frac, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f); +++ mantissa = POLY2_FMAAVX2(frac, +++ 2.28330284476918490682f, +++ -1.04913055217340124191f, +++ 0.204446009836232697516f); ++ #else ++ #error ++ #endif ++ ++- bVal = _mm256_fmadd_ps(mantissa, _mm256_sub_ps(frac, leadingOne), bVal); ++- _mm256_storeu_ps(bPtr, bVal); +++ bVal = _mm256_fmadd_ps(mantissa, _mm256_sub_ps(frac, leadingOne), bVal); +++ _mm256_storeu_ps(bPtr, bVal); ++ ++- aPtr += 8; ++- bPtr += 8; ++- } +++ aPtr += 8; +++ bPtr += 8; +++ } ++ ++- number = eighthPoints * 8; ++- volk_32f_log2_32f_u_generic(bPtr, aPtr, num_points-number); +++ number = eighthPoints * 8; +++ volk_32f_log2_32f_u_generic(bPtr, aPtr, num_points - number); ++ } ++ ++ #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for unaligned */ ++@@ -531,56 +672,86 @@ volk_32f_log2_32f_u_avx2_fma(float* bVector, const float* aVector, unsigned int ++ #include ++ ++ #define POLY0_AVX2(x, c0) _mm256_set1_ps(c0) ++-#define POLY1_AVX2(x, c0, c1) _mm256_add_ps(_mm256_mul_ps(POLY0_AVX2(x, c1), x), _mm256_set1_ps(c0)) ++-#define POLY2_AVX2(x, c0, c1, c2) _mm256_add_ps(_mm256_mul_ps(POLY1_AVX2(x, c1, c2), x), _mm256_set1_ps(c0)) ++-#define POLY3_AVX2(x, c0, c1, c2, c3) _mm256_add_ps(_mm256_mul_ps(POLY2_AVX2(x, c1, c2, c3), x), _mm256_set1_ps(c0)) ++-#define POLY4_AVX2(x, c0, c1, c2, c3, c4) _mm256_add_ps(_mm256_mul_ps(POLY3_AVX2(x, c1, c2, c3, c4), x), _mm256_set1_ps(c0)) ++-#define POLY5_AVX2(x, c0, c1, c2, c3, 
c4, c5) _mm256_add_ps(_mm256_mul_ps(POLY4_AVX2(x, c1, c2, c3, c4, c5), x), _mm256_set1_ps(c0)) +++#define POLY1_AVX2(x, c0, c1) \ +++ _mm256_add_ps(_mm256_mul_ps(POLY0_AVX2(x, c1), x), _mm256_set1_ps(c0)) +++#define POLY2_AVX2(x, c0, c1, c2) \ +++ _mm256_add_ps(_mm256_mul_ps(POLY1_AVX2(x, c1, c2), x), _mm256_set1_ps(c0)) +++#define POLY3_AVX2(x, c0, c1, c2, c3) \ +++ _mm256_add_ps(_mm256_mul_ps(POLY2_AVX2(x, c1, c2, c3), x), _mm256_set1_ps(c0)) +++#define POLY4_AVX2(x, c0, c1, c2, c3, c4) \ +++ _mm256_add_ps(_mm256_mul_ps(POLY3_AVX2(x, c1, c2, c3, c4), x), _mm256_set1_ps(c0)) +++#define POLY5_AVX2(x, c0, c1, c2, c3, c4, c5) \ +++ _mm256_add_ps(_mm256_mul_ps(POLY4_AVX2(x, c1, c2, c3, c4, c5), x), _mm256_set1_ps(c0)) ++ ++ static inline void ++ volk_32f_log2_32f_u_avx2(float* bVector, const float* aVector, unsigned int num_points) ++ { ++- float* bPtr = bVector; ++- const float* aPtr = aVector; +++ float* bPtr = bVector; +++ const float* aPtr = aVector; ++ ++- unsigned int number = 0; ++- const unsigned int eighthPoints = num_points / 8; +++ unsigned int number = 0; +++ const unsigned int eighthPoints = num_points / 8; ++ ++- __m256 aVal, bVal, mantissa, frac, leadingOne; ++- __m256i bias, exp; +++ __m256 aVal, bVal, mantissa, frac, leadingOne; +++ __m256i bias, exp; ++ ++- for(;number < eighthPoints; number++){ +++ for (; number < eighthPoints; number++) { ++ ++- aVal = _mm256_loadu_ps(aPtr); ++- bias = _mm256_set1_epi32(127); ++- leadingOne = _mm256_set1_ps(1.0f); ++- exp = _mm256_sub_epi32(_mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal), _mm256_set1_epi32(0x7f800000)), 23), bias); ++- bVal = _mm256_cvtepi32_ps(exp); +++ aVal = _mm256_loadu_ps(aPtr); +++ bias = _mm256_set1_epi32(127); +++ leadingOne = _mm256_set1_ps(1.0f); +++ exp = _mm256_sub_epi32( +++ _mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal), +++ _mm256_set1_epi32(0x7f800000)), +++ 23), +++ bias); +++ bVal = _mm256_cvtepi32_ps(exp); ++ ++- // Now to extract mantissa ++- frac = _mm256_or_ps(leadingOne, _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff)))); +++ // Now to extract mantissa +++ frac = _mm256_or_ps( +++ leadingOne, +++ _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff)))); ++ ++ #if LOG_POLY_DEGREE == 6 ++- mantissa = POLY5_AVX2( frac, 3.1157899f, -3.3241990f, 2.5988452f, -1.2315303f, 3.1821337e-1f, -3.4436006e-2f); +++ mantissa = POLY5_AVX2(frac, +++ 3.1157899f, +++ -3.3241990f, +++ 2.5988452f, +++ -1.2315303f, +++ 3.1821337e-1f, +++ -3.4436006e-2f); ++ #elif LOG_POLY_DEGREE == 5 ++- mantissa = POLY4_AVX2( frac, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f); +++ mantissa = POLY4_AVX2(frac, +++ 2.8882704548164776201f, +++ -2.52074962577807006663f, +++ 1.48116647521213171641f, +++ -0.465725644288844778798f, +++ 0.0596515482674574969533f); ++ #elif LOG_POLY_DEGREE == 4 ++- mantissa = POLY3_AVX2( frac, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f); +++ mantissa = POLY3_AVX2(frac, +++ 2.61761038894603480148f, +++ -1.75647175389045657003f, +++ 0.688243882994381274313f, +++ -0.107254423828329604454f); ++ #elif LOG_POLY_DEGREE == 3 ++- mantissa = POLY2_AVX2( frac, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f); +++ mantissa = POLY2_AVX2(frac, +++ 2.28330284476918490682f, +++ -1.04913055217340124191f, +++ 0.204446009836232697516f); ++ #else ++ #error ++ #endif ++ ++- bVal = 
_mm256_add_ps(_mm256_mul_ps(mantissa, _mm256_sub_ps(frac, leadingOne)), bVal); ++- _mm256_storeu_ps(bPtr, bVal); +++ bVal = +++ _mm256_add_ps(_mm256_mul_ps(mantissa, _mm256_sub_ps(frac, leadingOne)), bVal); +++ _mm256_storeu_ps(bPtr, bVal); ++ ++- aPtr += 8; ++- bPtr += 8; ++- } +++ aPtr += 8; +++ bPtr += 8; +++ } ++ ++- number = eighthPoints * 8; ++- volk_32f_log2_32f_u_generic(bPtr, aPtr, num_points-number); +++ number = eighthPoints * 8; +++ volk_32f_log2_32f_u_generic(bPtr, aPtr, num_points - number); ++ } ++ ++ #endif /* LV_HAVE_AVX2 for unaligned */ ++diff --git a/kernels/volk/volk_32f_null_32f.h b/kernels/volk/volk_32f_null_32f.h ++index 95e8d1a..cbed229 100644 ++--- a/kernels/volk/volk_32f_null_32f.h +++++ b/kernels/volk/volk_32f_null_32f.h ++@@ -20,9 +20,9 @@ ++ * Boston, MA 02110-1301, USA. ++ */ ++ ++-#include ++-#include ++ #include +++#include +++#include ++ ++ #ifndef INCLUDED_volk_32f_null_32f_a_H ++ #define INCLUDED_volk_32f_null_32f_a_H ++@@ -32,13 +32,13 @@ ++ static inline void ++ volk_32f_null_32f_generic(float* bVector, const float* aVector, unsigned int num_points) ++ { ++- float* bPtr = bVector; ++- const float* aPtr = aVector; ++- unsigned int number; +++ float* bPtr = bVector; +++ const float* aPtr = aVector; +++ unsigned int number; ++ ++- for(number = 0; number < num_points; number++){ ++- *bPtr++ = *aPtr++; ++- } +++ for (number = 0; number < num_points; number++) { +++ *bPtr++ = *aPtr++; +++ } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++diff --git a/kernels/volk/volk_32f_s32f_32f_fm_detect_32f.h b/kernels/volk/volk_32f_s32f_32f_fm_detect_32f.h ++index 9879959..3bf7aea 100644 ++--- a/kernels/volk/volk_32f_s32f_32f_fm_detect_32f.h +++++ b/kernels/volk/volk_32f_s32f_32f_fm_detect_32f.h ++@@ -30,14 +30,15 @@ ++ * ++ * Dispatcher Prototype ++ * \code ++- * void volk_32f_s32f_32f_fm_detect_32f(float* outputVector, const float* inputVector, const float bound, float* saveValue, unsigned int num_points) ++- * \endcode +++ * void volk_32f_s32f_32f_fm_detect_32f(float* outputVector, const float* inputVector, +++ * const float bound, float* saveValue, unsigned int num_points) \endcode ++ * ++ * \b Inputs ++- * \li inputVector: The input vector containing phase data (must be on the interval (-bound, bound]). ++- * \li bound: The interval that the input phase data is in, which is used to modulo the differentiation. ++- * \li saveValue: A pointer to a float which contains the phase value of the sample before the first input sample. ++- * \li num_points The number of data points. +++ * \li inputVector: The input vector containing phase data (must be on the interval +++ * (-bound, bound]). \li bound: The interval that the input phase data is in, which is +++ * used to modulo the differentiation. \li saveValue: A pointer to a float which contains +++ * the phase value of the sample before the first input sample. \li num_points The number +++ * of data points. ++ * ++ * \b Outputs ++ * \li outputVector: The vector where the results will be stored. ++@@ -62,67 +63,79 @@ ++ #ifdef LV_HAVE_AVX ++ #include ++ ++-static inline void volk_32f_s32f_32f_fm_detect_32f_a_avx(float* outputVector, const float* inputVector, const float bound, float* saveValue, unsigned int num_points){ ++- if (num_points < 1) { ++- return; ++- } ++- unsigned int number = 1; ++- unsigned int j = 0; ++- // num_points-1 keeps Fedora 7's gcc from crashing... ++- // num_points won't work. 
:( ++- const unsigned int eighthPoints = (num_points-1) / 8; ++- ++- float* outPtr = outputVector; ++- const float* inPtr = inputVector; ++- __m256 upperBound = _mm256_set1_ps(bound); ++- __m256 lowerBound = _mm256_set1_ps(-bound); ++- __m256 next3old1; ++- __m256 next4; ++- __m256 boundAdjust; ++- __m256 posBoundAdjust = _mm256_set1_ps(-2*bound); // Subtract when we're above. ++- __m256 negBoundAdjust = _mm256_set1_ps(2*bound); // Add when we're below. ++- // Do the first 8 by hand since we're going in from the saveValue: ++- *outPtr = *inPtr - *saveValue; ++- if (*outPtr > bound) *outPtr -= 2*bound; ++- if (*outPtr < -bound) *outPtr += 2*bound; ++- inPtr++; ++- outPtr++; ++- for (j = 1; j < ( (8 < num_points) ? 8 : num_points); j++) { ++- *outPtr = *(inPtr) - *(inPtr-1); ++- if (*outPtr > bound) *outPtr -= 2*bound; ++- if (*outPtr < -bound) *outPtr += 2*bound; ++- inPtr++; ++- outPtr++; ++- } ++- ++- for (; number < eighthPoints; number++) { ++- // Load data ++- next3old1 = _mm256_loadu_ps((float*) (inPtr-1)); ++- next4 = _mm256_load_ps(inPtr); ++- inPtr += 8; ++- // Subtract and store: ++- next3old1 = _mm256_sub_ps(next4, next3old1); ++- // Bound: ++- boundAdjust = _mm256_cmp_ps(next3old1, upperBound, _CMP_GT_OS); ++- boundAdjust = _mm256_and_ps(boundAdjust, posBoundAdjust); ++- next4 = _mm256_cmp_ps(next3old1, lowerBound, _CMP_LT_OS); ++- next4 = _mm256_and_ps(next4, negBoundAdjust); ++- boundAdjust = _mm256_or_ps(next4, boundAdjust); ++- // Make sure we're in the bounding interval: ++- next3old1 = _mm256_add_ps(next3old1, boundAdjust); ++- _mm256_store_ps(outPtr,next3old1); // Store the results back into the output ++- outPtr += 8; ++- } ++- ++- for (number = (8 > (eighthPoints*8) ? 8 : (8 * eighthPoints)); number < num_points; number++) { ++- *outPtr = *(inPtr) - *(inPtr-1); ++- if (*outPtr > bound) *outPtr -= 2*bound; ++- if (*outPtr < -bound) *outPtr += 2*bound; +++static inline void volk_32f_s32f_32f_fm_detect_32f_a_avx(float* outputVector, +++ const float* inputVector, +++ const float bound, +++ float* saveValue, +++ unsigned int num_points) +++{ +++ if (num_points < 1) { +++ return; +++ } +++ unsigned int number = 1; +++ unsigned int j = 0; +++ // num_points-1 keeps Fedora 7's gcc from crashing... +++ // num_points won't work. :( +++ const unsigned int eighthPoints = (num_points - 1) / 8; +++ +++ float* outPtr = outputVector; +++ const float* inPtr = inputVector; +++ __m256 upperBound = _mm256_set1_ps(bound); +++ __m256 lowerBound = _mm256_set1_ps(-bound); +++ __m256 next3old1; +++ __m256 next4; +++ __m256 boundAdjust; +++ __m256 posBoundAdjust = _mm256_set1_ps(-2 * bound); // Subtract when we're above. +++ __m256 negBoundAdjust = _mm256_set1_ps(2 * bound); // Add when we're below. +++ // Do the first 8 by hand since we're going in from the saveValue: +++ *outPtr = *inPtr - *saveValue; +++ if (*outPtr > bound) +++ *outPtr -= 2 * bound; +++ if (*outPtr < -bound) +++ *outPtr += 2 * bound; ++ inPtr++; ++ outPtr++; ++- } ++- ++- *saveValue = inputVector[num_points-1]; +++ for (j = 1; j < ((8 < num_points) ? 
8 : num_points); j++) { +++ *outPtr = *(inPtr) - *(inPtr - 1); +++ if (*outPtr > bound) +++ *outPtr -= 2 * bound; +++ if (*outPtr < -bound) +++ *outPtr += 2 * bound; +++ inPtr++; +++ outPtr++; +++ } +++ +++ for (; number < eighthPoints; number++) { +++ // Load data +++ next3old1 = _mm256_loadu_ps((float*)(inPtr - 1)); +++ next4 = _mm256_load_ps(inPtr); +++ inPtr += 8; +++ // Subtract and store: +++ next3old1 = _mm256_sub_ps(next4, next3old1); +++ // Bound: +++ boundAdjust = _mm256_cmp_ps(next3old1, upperBound, _CMP_GT_OS); +++ boundAdjust = _mm256_and_ps(boundAdjust, posBoundAdjust); +++ next4 = _mm256_cmp_ps(next3old1, lowerBound, _CMP_LT_OS); +++ next4 = _mm256_and_ps(next4, negBoundAdjust); +++ boundAdjust = _mm256_or_ps(next4, boundAdjust); +++ // Make sure we're in the bounding interval: +++ next3old1 = _mm256_add_ps(next3old1, boundAdjust); +++ _mm256_store_ps(outPtr, next3old1); // Store the results back into the output +++ outPtr += 8; +++ } +++ +++ for (number = (8 > (eighthPoints * 8) ? 8 : (8 * eighthPoints)); number < num_points; +++ number++) { +++ *outPtr = *(inPtr) - *(inPtr - 1); +++ if (*outPtr > bound) +++ *outPtr -= 2 * bound; +++ if (*outPtr < -bound) +++ *outPtr += 2 * bound; +++ inPtr++; +++ outPtr++; +++ } +++ +++ *saveValue = inputVector[num_points - 1]; ++ } ++ #endif /* LV_HAVE_AVX */ ++ ++@@ -130,102 +143,122 @@ static inline void volk_32f_s32f_32f_fm_detect_32f_a_avx(float* outputVector, co ++ #ifdef LV_HAVE_SSE ++ #include ++ ++-static inline void volk_32f_s32f_32f_fm_detect_32f_a_sse(float* outputVector, const float* inputVector, const float bound, float* saveValue, unsigned int num_points){ ++- if (num_points < 1) { ++- return; ++- } ++- unsigned int number = 1; ++- unsigned int j = 0; ++- // num_points-1 keeps Fedora 7's gcc from crashing... ++- // num_points won't work. :( ++- const unsigned int quarterPoints = (num_points-1) / 4; ++- ++- float* outPtr = outputVector; ++- const float* inPtr = inputVector; ++- __m128 upperBound = _mm_set_ps1(bound); ++- __m128 lowerBound = _mm_set_ps1(-bound); ++- __m128 next3old1; ++- __m128 next4; ++- __m128 boundAdjust; ++- __m128 posBoundAdjust = _mm_set_ps1(-2*bound); // Subtract when we're above. ++- __m128 negBoundAdjust = _mm_set_ps1(2*bound); // Add when we're below. ++- // Do the first 4 by hand since we're going in from the saveValue: ++- *outPtr = *inPtr - *saveValue; ++- if (*outPtr > bound) *outPtr -= 2*bound; ++- if (*outPtr < -bound) *outPtr += 2*bound; ++- inPtr++; ++- outPtr++; ++- for (j = 1; j < ( (4 < num_points) ? 4 : num_points); j++) { ++- *outPtr = *(inPtr) - *(inPtr-1); ++- if (*outPtr > bound) *outPtr -= 2*bound; ++- if (*outPtr < -bound) *outPtr += 2*bound; ++- inPtr++; ++- outPtr++; ++- } ++- ++- for (; number < quarterPoints; number++) { ++- // Load data ++- next3old1 = _mm_loadu_ps((float*) (inPtr-1)); ++- next4 = _mm_load_ps(inPtr); ++- inPtr += 4; ++- // Subtract and store: ++- next3old1 = _mm_sub_ps(next4, next3old1); ++- // Bound: ++- boundAdjust = _mm_cmpgt_ps(next3old1, upperBound); ++- boundAdjust = _mm_and_ps(boundAdjust, posBoundAdjust); ++- next4 = _mm_cmplt_ps(next3old1, lowerBound); ++- next4 = _mm_and_ps(next4, negBoundAdjust); ++- boundAdjust = _mm_or_ps(next4, boundAdjust); ++- // Make sure we're in the bounding interval: ++- next3old1 = _mm_add_ps(next3old1, boundAdjust); ++- _mm_store_ps(outPtr,next3old1); // Store the results back into the output ++- outPtr += 4; ++- } ++- ++- for (number = (4 > (quarterPoints*4) ? 
4 : (4 * quarterPoints)); number < num_points; number++) { ++- *outPtr = *(inPtr) - *(inPtr-1); ++- if (*outPtr > bound) *outPtr -= 2*bound; ++- if (*outPtr < -bound) *outPtr += 2*bound; +++static inline void volk_32f_s32f_32f_fm_detect_32f_a_sse(float* outputVector, +++ const float* inputVector, +++ const float bound, +++ float* saveValue, +++ unsigned int num_points) +++{ +++ if (num_points < 1) { +++ return; +++ } +++ unsigned int number = 1; +++ unsigned int j = 0; +++ // num_points-1 keeps Fedora 7's gcc from crashing... +++ // num_points won't work. :( +++ const unsigned int quarterPoints = (num_points - 1) / 4; +++ +++ float* outPtr = outputVector; +++ const float* inPtr = inputVector; +++ __m128 upperBound = _mm_set_ps1(bound); +++ __m128 lowerBound = _mm_set_ps1(-bound); +++ __m128 next3old1; +++ __m128 next4; +++ __m128 boundAdjust; +++ __m128 posBoundAdjust = _mm_set_ps1(-2 * bound); // Subtract when we're above. +++ __m128 negBoundAdjust = _mm_set_ps1(2 * bound); // Add when we're below. +++ // Do the first 4 by hand since we're going in from the saveValue: +++ *outPtr = *inPtr - *saveValue; +++ if (*outPtr > bound) +++ *outPtr -= 2 * bound; +++ if (*outPtr < -bound) +++ *outPtr += 2 * bound; ++ inPtr++; ++ outPtr++; ++- } ++- ++- *saveValue = inputVector[num_points-1]; +++ for (j = 1; j < ((4 < num_points) ? 4 : num_points); j++) { +++ *outPtr = *(inPtr) - *(inPtr - 1); +++ if (*outPtr > bound) +++ *outPtr -= 2 * bound; +++ if (*outPtr < -bound) +++ *outPtr += 2 * bound; +++ inPtr++; +++ outPtr++; +++ } +++ +++ for (; number < quarterPoints; number++) { +++ // Load data +++ next3old1 = _mm_loadu_ps((float*)(inPtr - 1)); +++ next4 = _mm_load_ps(inPtr); +++ inPtr += 4; +++ // Subtract and store: +++ next3old1 = _mm_sub_ps(next4, next3old1); +++ // Bound: +++ boundAdjust = _mm_cmpgt_ps(next3old1, upperBound); +++ boundAdjust = _mm_and_ps(boundAdjust, posBoundAdjust); +++ next4 = _mm_cmplt_ps(next3old1, lowerBound); +++ next4 = _mm_and_ps(next4, negBoundAdjust); +++ boundAdjust = _mm_or_ps(next4, boundAdjust); +++ // Make sure we're in the bounding interval: +++ next3old1 = _mm_add_ps(next3old1, boundAdjust); +++ _mm_store_ps(outPtr, next3old1); // Store the results back into the output +++ outPtr += 4; +++ } +++ +++ for (number = (4 > (quarterPoints * 4) ? 
4 : (4 * quarterPoints)); +++ number < num_points; +++ number++) { +++ *outPtr = *(inPtr) - *(inPtr - 1); +++ if (*outPtr > bound) +++ *outPtr -= 2 * bound; +++ if (*outPtr < -bound) +++ *outPtr += 2 * bound; +++ inPtr++; +++ outPtr++; +++ } +++ +++ *saveValue = inputVector[num_points - 1]; ++ } ++ #endif /* LV_HAVE_SSE */ ++ ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void volk_32f_s32f_32f_fm_detect_32f_generic(float* outputVector, const float* inputVector, const float bound, float* saveValue, unsigned int num_points){ ++- if (num_points < 1) { ++- return; ++- } ++- unsigned int number = 0; ++- float* outPtr = outputVector; ++- const float* inPtr = inputVector; ++- ++- // Do the first 1 by hand since we're going in from the saveValue: ++- *outPtr = *inPtr - *saveValue; ++- if (*outPtr > bound) *outPtr -= 2*bound; ++- if (*outPtr < -bound) *outPtr += 2*bound; ++- inPtr++; ++- outPtr++; ++- ++- for (number = 1; number < num_points; number++) { ++- *outPtr = *(inPtr) - *(inPtr-1); ++- if (*outPtr > bound) *outPtr -= 2*bound; ++- if (*outPtr < -bound) *outPtr += 2*bound; +++static inline void volk_32f_s32f_32f_fm_detect_32f_generic(float* outputVector, +++ const float* inputVector, +++ const float bound, +++ float* saveValue, +++ unsigned int num_points) +++{ +++ if (num_points < 1) { +++ return; +++ } +++ unsigned int number = 0; +++ float* outPtr = outputVector; +++ const float* inPtr = inputVector; +++ +++ // Do the first 1 by hand since we're going in from the saveValue: +++ *outPtr = *inPtr - *saveValue; +++ if (*outPtr > bound) +++ *outPtr -= 2 * bound; +++ if (*outPtr < -bound) +++ *outPtr += 2 * bound; ++ inPtr++; ++ outPtr++; ++- } ++ ++- *saveValue = inputVector[num_points-1]; +++ for (number = 1; number < num_points; number++) { +++ *outPtr = *(inPtr) - *(inPtr - 1); +++ if (*outPtr > bound) +++ *outPtr -= 2 * bound; +++ if (*outPtr < -bound) +++ *outPtr += 2 * bound; +++ inPtr++; +++ outPtr++; +++ } +++ +++ *saveValue = inputVector[num_points - 1]; ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++ ++- ++- ++ #endif /* INCLUDED_volk_32f_s32f_32f_fm_detect_32f_a_H */ ++ ++ ++@@ -238,67 +271,79 @@ static inline void volk_32f_s32f_32f_fm_detect_32f_generic(float* outputVector, ++ #ifdef LV_HAVE_AVX ++ #include ++ ++-static inline void volk_32f_s32f_32f_fm_detect_32f_u_avx(float* outputVector, const float* inputVector, const float bound, float* saveValue, unsigned int num_points){ ++- if (num_points < 1) { ++- return; ++- } ++- unsigned int number = 1; ++- unsigned int j = 0; ++- // num_points-1 keeps Fedora 7's gcc from crashing... ++- // num_points won't work. :( ++- const unsigned int eighthPoints = (num_points-1) / 8; ++- ++- float* outPtr = outputVector; ++- const float* inPtr = inputVector; ++- __m256 upperBound = _mm256_set1_ps(bound); ++- __m256 lowerBound = _mm256_set1_ps(-bound); ++- __m256 next3old1; ++- __m256 next4; ++- __m256 boundAdjust; ++- __m256 posBoundAdjust = _mm256_set1_ps(-2*bound); // Subtract when we're above. ++- __m256 negBoundAdjust = _mm256_set1_ps(2*bound); // Add when we're below. ++- // Do the first 8 by hand since we're going in from the saveValue: ++- *outPtr = *inPtr - *saveValue; ++- if (*outPtr > bound) *outPtr -= 2*bound; ++- if (*outPtr < -bound) *outPtr += 2*bound; ++- inPtr++; ++- outPtr++; ++- for (j = 1; j < ( (8 < num_points) ? 
8 : num_points); j++) { ++- *outPtr = *(inPtr) - *(inPtr-1); ++- if (*outPtr > bound) *outPtr -= 2*bound; ++- if (*outPtr < -bound) *outPtr += 2*bound; +++static inline void volk_32f_s32f_32f_fm_detect_32f_u_avx(float* outputVector, +++ const float* inputVector, +++ const float bound, +++ float* saveValue, +++ unsigned int num_points) +++{ +++ if (num_points < 1) { +++ return; +++ } +++ unsigned int number = 1; +++ unsigned int j = 0; +++ // num_points-1 keeps Fedora 7's gcc from crashing... +++ // num_points won't work. :( +++ const unsigned int eighthPoints = (num_points - 1) / 8; +++ +++ float* outPtr = outputVector; +++ const float* inPtr = inputVector; +++ __m256 upperBound = _mm256_set1_ps(bound); +++ __m256 lowerBound = _mm256_set1_ps(-bound); +++ __m256 next3old1; +++ __m256 next4; +++ __m256 boundAdjust; +++ __m256 posBoundAdjust = _mm256_set1_ps(-2 * bound); // Subtract when we're above. +++ __m256 negBoundAdjust = _mm256_set1_ps(2 * bound); // Add when we're below. +++ // Do the first 8 by hand since we're going in from the saveValue: +++ *outPtr = *inPtr - *saveValue; +++ if (*outPtr > bound) +++ *outPtr -= 2 * bound; +++ if (*outPtr < -bound) +++ *outPtr += 2 * bound; ++ inPtr++; ++ outPtr++; ++- } ++- ++- for (; number < eighthPoints; number++) { ++- // Load data ++- next3old1 = _mm256_loadu_ps((float*) (inPtr-1)); ++- next4 = _mm256_loadu_ps(inPtr); ++- inPtr += 8; ++- // Subtract and store: ++- next3old1 = _mm256_sub_ps(next4, next3old1); ++- // Bound: ++- boundAdjust = _mm256_cmp_ps(next3old1, upperBound, _CMP_GT_OS); ++- boundAdjust = _mm256_and_ps(boundAdjust, posBoundAdjust); ++- next4 = _mm256_cmp_ps(next3old1, lowerBound, _CMP_LT_OS); ++- next4 = _mm256_and_ps(next4, negBoundAdjust); ++- boundAdjust = _mm256_or_ps(next4, boundAdjust); ++- // Make sure we're in the bounding interval: ++- next3old1 = _mm256_add_ps(next3old1, boundAdjust); ++- _mm256_storeu_ps(outPtr,next3old1); // Store the results back into the output ++- outPtr += 8; ++- } ++- ++- for (number = (8 > (eighthPoints*8) ? 8 : (8 * eighthPoints)); number < num_points; number++) { ++- *outPtr = *(inPtr) - *(inPtr-1); ++- if (*outPtr > bound) *outPtr -= 2*bound; ++- if (*outPtr < -bound) *outPtr += 2*bound; ++- inPtr++; ++- outPtr++; ++- } ++- ++- *saveValue = inputVector[num_points-1]; +++ for (j = 1; j < ((8 < num_points) ? 8 : num_points); j++) { +++ *outPtr = *(inPtr) - *(inPtr - 1); +++ if (*outPtr > bound) +++ *outPtr -= 2 * bound; +++ if (*outPtr < -bound) +++ *outPtr += 2 * bound; +++ inPtr++; +++ outPtr++; +++ } +++ +++ for (; number < eighthPoints; number++) { +++ // Load data +++ next3old1 = _mm256_loadu_ps((float*)(inPtr - 1)); +++ next4 = _mm256_loadu_ps(inPtr); +++ inPtr += 8; +++ // Subtract and store: +++ next3old1 = _mm256_sub_ps(next4, next3old1); +++ // Bound: +++ boundAdjust = _mm256_cmp_ps(next3old1, upperBound, _CMP_GT_OS); +++ boundAdjust = _mm256_and_ps(boundAdjust, posBoundAdjust); +++ next4 = _mm256_cmp_ps(next3old1, lowerBound, _CMP_LT_OS); +++ next4 = _mm256_and_ps(next4, negBoundAdjust); +++ boundAdjust = _mm256_or_ps(next4, boundAdjust); +++ // Make sure we're in the bounding interval: +++ next3old1 = _mm256_add_ps(next3old1, boundAdjust); +++ _mm256_storeu_ps(outPtr, next3old1); // Store the results back into the output +++ outPtr += 8; +++ } +++ +++ for (number = (8 > (eighthPoints * 8) ? 
8 : (8 * eighthPoints)); number < num_points; +++ number++) { +++ *outPtr = *(inPtr) - *(inPtr - 1); +++ if (*outPtr > bound) +++ *outPtr -= 2 * bound; +++ if (*outPtr < -bound) +++ *outPtr += 2 * bound; +++ inPtr++; +++ outPtr++; +++ } +++ +++ *saveValue = inputVector[num_points - 1]; ++ } ++ #endif /* LV_HAVE_AVX */ ++ ++diff --git a/kernels/volk/volk_32f_s32f_calc_spectral_noise_floor_32f.h b/kernels/volk/volk_32f_s32f_calc_spectral_noise_floor_32f.h ++index ae371a2..e7e581f 100644 ++--- a/kernels/volk/volk_32f_s32f_calc_spectral_noise_floor_32f.h +++++ b/kernels/volk/volk_32f_s32f_calc_spectral_noise_floor_32f.h ++@@ -35,13 +35,15 @@ ++ * ++ * Dispatcher Prototype ++ * \code ++- * void volk_32f_s32f_calc_spectral_noise_floor_32f(float* noiseFloorAmplitude, const float* realDataPoints, const float spectralExclusionValue, const unsigned int num_points) ++- * \endcode +++ * void volk_32f_s32f_calc_spectral_noise_floor_32f(float* noiseFloorAmplitude, const +++ * float* realDataPoints, const float spectralExclusionValue, const unsigned int +++ * num_points) \endcode ++ * ++ * \b Inputs ++ * \li realDataPoints: The input power spectrum. ++- * \li spectralExclusionValue: The number of dB above the noise floor that a data point must be to be excluded from the noise floor calculation - default value is 20. ++- * \li num_points: The number of data points. +++ * \li spectralExclusionValue: The number of dB above the noise floor that a data point +++ * must be to be excluded from the noise floor calculation - default value is 20. \li +++ * num_points: The number of data points. ++ * ++ * \b Outputs ++ * \li noiseFloorAmplitude: The noise floor of the input spectrum, in dB. ++@@ -59,9 +61,9 @@ ++ #ifndef INCLUDED_volk_32f_s32f_calc_spectral_noise_floor_32f_a_H ++ #define INCLUDED_volk_32f_s32f_calc_spectral_noise_floor_32f_a_H ++ ++-#include ++ #include ++ #include +++#include ++ ++ #ifdef LV_HAVE_AVX ++ #include ++@@ -72,114 +74,117 @@ volk_32f_s32f_calc_spectral_noise_floor_32f_a_avx(float* noiseFloorAmplitude, ++ const float spectralExclusionValue, ++ const unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int eighthPoints = num_points / 8; ++- ++- const float* dataPointsPtr = realDataPoints; ++- __VOLK_ATTR_ALIGNED(32) float avgPointsVector[8]; ++- ++- __m256 dataPointsVal; ++- __m256 avgPointsVal = _mm256_setzero_ps(); ++- // Calculate the sum (for mean) for all points ++- for(; number < eighthPoints; number++){ ++- ++- dataPointsVal = _mm256_load_ps(dataPointsPtr); ++- ++- dataPointsPtr += 8; ++- ++- avgPointsVal = _mm256_add_ps(avgPointsVal, dataPointsVal); ++- } ++- ++- _mm256_store_ps(avgPointsVector, avgPointsVal); ++- ++- float sumMean = 0.0; ++- sumMean += avgPointsVector[0]; ++- sumMean += avgPointsVector[1]; ++- sumMean += avgPointsVector[2]; ++- sumMean += avgPointsVector[3]; ++- sumMean += avgPointsVector[4]; ++- sumMean += avgPointsVector[5]; ++- sumMean += avgPointsVector[6]; ++- sumMean += avgPointsVector[7]; ++- ++- number = eighthPoints * 8; ++- for(;number < num_points; number++){ ++- sumMean += realDataPoints[number]; ++- } ++- ++- // calculate the spectral mean ++- // +20 because for the comparison below we only want to throw out bins ++- // that are significantly higher (and would, thus, affect the mean more ++- const float meanAmplitude = (sumMean / ((float)num_points)) + spectralExclusionValue; ++- ++- dataPointsPtr = realDataPoints; // Reset the dataPointsPtr ++- __m256 vMeanAmplitudeVector = _mm256_set1_ps(meanAmplitude); ++- __m256 
vOnesVector = _mm256_set1_ps(1.0); ++- __m256 vValidBinCount = _mm256_setzero_ps(); ++- avgPointsVal = _mm256_setzero_ps(); ++- __m256 compareMask; ++- number = 0; ++- // Calculate the sum (for mean) for any points which do NOT exceed the mean amplitude ++- for(; number < eighthPoints; number++){ ++- ++- dataPointsVal = _mm256_load_ps(dataPointsPtr); ++- ++- dataPointsPtr += 8; ++- ++- // Identify which items do not exceed the mean amplitude ++- compareMask = _mm256_cmp_ps(dataPointsVal, vMeanAmplitudeVector, _CMP_LE_OQ); ++- ++- // Mask off the items that exceed the mean amplitude and add the avg Points that do not exceed the mean amplitude ++- avgPointsVal = _mm256_add_ps(avgPointsVal, _mm256_and_ps(compareMask, dataPointsVal)); ++- ++- // Count the number of bins which do not exceed the mean amplitude ++- vValidBinCount = _mm256_add_ps(vValidBinCount, _mm256_and_ps(compareMask, vOnesVector)); ++- } ++- ++- // Calculate the mean from the remaining data points ++- _mm256_store_ps(avgPointsVector, avgPointsVal); ++- ++- sumMean = 0.0; ++- sumMean += avgPointsVector[0]; ++- sumMean += avgPointsVector[1]; ++- sumMean += avgPointsVector[2]; ++- sumMean += avgPointsVector[3]; ++- sumMean += avgPointsVector[4]; ++- sumMean += avgPointsVector[5]; ++- sumMean += avgPointsVector[6]; ++- sumMean += avgPointsVector[7]; ++- ++- // Calculate the number of valid bins from the remaining count ++- __VOLK_ATTR_ALIGNED(32) float validBinCountVector[8]; ++- _mm256_store_ps(validBinCountVector, vValidBinCount); ++- ++- float validBinCount = 0; ++- validBinCount += validBinCountVector[0]; ++- validBinCount += validBinCountVector[1]; ++- validBinCount += validBinCountVector[2]; ++- validBinCount += validBinCountVector[3]; ++- validBinCount += validBinCountVector[4]; ++- validBinCount += validBinCountVector[5]; ++- validBinCount += validBinCountVector[6]; ++- validBinCount += validBinCountVector[7]; ++- ++- number = eighthPoints * 8; ++- for(;number < num_points; number++){ ++- if(realDataPoints[number] <= meanAmplitude){ ++- sumMean += realDataPoints[number]; ++- validBinCount += 1.0; +++ unsigned int number = 0; +++ const unsigned int eighthPoints = num_points / 8; +++ +++ const float* dataPointsPtr = realDataPoints; +++ __VOLK_ATTR_ALIGNED(32) float avgPointsVector[8]; +++ +++ __m256 dataPointsVal; +++ __m256 avgPointsVal = _mm256_setzero_ps(); +++ // Calculate the sum (for mean) for all points +++ for (; number < eighthPoints; number++) { +++ +++ dataPointsVal = _mm256_load_ps(dataPointsPtr); +++ +++ dataPointsPtr += 8; +++ +++ avgPointsVal = _mm256_add_ps(avgPointsVal, dataPointsVal); ++ } ++- } ++ ++- float localNoiseFloorAmplitude = 0; ++- if(validBinCount > 0.0){ ++- localNoiseFloorAmplitude = sumMean / validBinCount; ++- } ++- else{ ++- localNoiseFloorAmplitude = meanAmplitude; // For the odd case that all the amplitudes are equal... 
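(Editorial aside, not part of the patch: the aligned and unaligned AVX/SSE variants in this hunk all vectorize the same estimate as the generic kernel further down — mean of all bins, then the mean of only those bins that do not exceed mean + exclusion. A minimal plain-C sketch of that estimate follows; the function name is hypothetical.)

#include <stddef.h>

/* Editor's sketch (not part of the patch): the noise-floor estimate that the
 * AVX/SSE kernels in this file vectorize. The name noise_floor_sketch is
 * hypothetical. */
static float noise_floor_sketch(const float* bins, size_t n, float exclusion_db)
{
    if (n == 0)
        return 0.0f;

    float sum = 0.0f;
    for (size_t i = 0; i < n; i++)
        sum += bins[i]; /* sum for the mean over all bins */
    const float threshold = sum / (float)n + exclusion_db;

    float kept_sum = 0.0f;
    size_t kept = 0;
    for (size_t i = 0; i < n; i++) { /* keep only bins at or below the threshold */
        if (bins[i] <= threshold) {
            kept_sum += bins[i];
            kept++;
        }
    }
    /* Fall back to the threshold if nothing was kept (all amplitudes equal). */
    return kept ? kept_sum / (float)kept : threshold;
}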
++- } +++ _mm256_store_ps(avgPointsVector, avgPointsVal); +++ +++ float sumMean = 0.0; +++ sumMean += avgPointsVector[0]; +++ sumMean += avgPointsVector[1]; +++ sumMean += avgPointsVector[2]; +++ sumMean += avgPointsVector[3]; +++ sumMean += avgPointsVector[4]; +++ sumMean += avgPointsVector[5]; +++ sumMean += avgPointsVector[6]; +++ sumMean += avgPointsVector[7]; +++ +++ number = eighthPoints * 8; +++ for (; number < num_points; number++) { +++ sumMean += realDataPoints[number]; +++ } +++ +++ // calculate the spectral mean +++ // +20 because for the comparison below we only want to throw out bins +++ // that are significantly higher (and would, thus, affect the mean more +++ const float meanAmplitude = (sumMean / ((float)num_points)) + spectralExclusionValue; +++ +++ dataPointsPtr = realDataPoints; // Reset the dataPointsPtr +++ __m256 vMeanAmplitudeVector = _mm256_set1_ps(meanAmplitude); +++ __m256 vOnesVector = _mm256_set1_ps(1.0); +++ __m256 vValidBinCount = _mm256_setzero_ps(); +++ avgPointsVal = _mm256_setzero_ps(); +++ __m256 compareMask; +++ number = 0; +++ // Calculate the sum (for mean) for any points which do NOT exceed the mean amplitude +++ for (; number < eighthPoints; number++) { +++ +++ dataPointsVal = _mm256_load_ps(dataPointsPtr); +++ +++ dataPointsPtr += 8; +++ +++ // Identify which items do not exceed the mean amplitude +++ compareMask = _mm256_cmp_ps(dataPointsVal, vMeanAmplitudeVector, _CMP_LE_OQ); +++ +++ // Mask off the items that exceed the mean amplitude and add the avg Points that +++ // do not exceed the mean amplitude +++ avgPointsVal = +++ _mm256_add_ps(avgPointsVal, _mm256_and_ps(compareMask, dataPointsVal)); +++ +++ // Count the number of bins which do not exceed the mean amplitude +++ vValidBinCount = +++ _mm256_add_ps(vValidBinCount, _mm256_and_ps(compareMask, vOnesVector)); +++ } ++ ++- *noiseFloorAmplitude = localNoiseFloorAmplitude; +++ // Calculate the mean from the remaining data points +++ _mm256_store_ps(avgPointsVector, avgPointsVal); +++ +++ sumMean = 0.0; +++ sumMean += avgPointsVector[0]; +++ sumMean += avgPointsVector[1]; +++ sumMean += avgPointsVector[2]; +++ sumMean += avgPointsVector[3]; +++ sumMean += avgPointsVector[4]; +++ sumMean += avgPointsVector[5]; +++ sumMean += avgPointsVector[6]; +++ sumMean += avgPointsVector[7]; +++ +++ // Calculate the number of valid bins from the remaining count +++ __VOLK_ATTR_ALIGNED(32) float validBinCountVector[8]; +++ _mm256_store_ps(validBinCountVector, vValidBinCount); +++ +++ float validBinCount = 0; +++ validBinCount += validBinCountVector[0]; +++ validBinCount += validBinCountVector[1]; +++ validBinCount += validBinCountVector[2]; +++ validBinCount += validBinCountVector[3]; +++ validBinCount += validBinCountVector[4]; +++ validBinCount += validBinCountVector[5]; +++ validBinCount += validBinCountVector[6]; +++ validBinCount += validBinCountVector[7]; +++ +++ number = eighthPoints * 8; +++ for (; number < num_points; number++) { +++ if (realDataPoints[number] <= meanAmplitude) { +++ sumMean += realDataPoints[number]; +++ validBinCount += 1.0; +++ } +++ } +++ +++ float localNoiseFloorAmplitude = 0; +++ if (validBinCount > 0.0) { +++ localNoiseFloorAmplitude = sumMean / validBinCount; +++ } else { +++ localNoiseFloorAmplitude = +++ meanAmplitude; // For the odd case that all the amplitudes are equal... 
+++ } +++ +++ *noiseFloorAmplitude = localNoiseFloorAmplitude; ++ } ++ #endif /* LV_HAVE_AVX */ ++ ++@@ -192,102 +197,103 @@ volk_32f_s32f_calc_spectral_noise_floor_32f_a_sse(float* noiseFloorAmplitude, ++ const float spectralExclusionValue, ++ const unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int quarterPoints = num_points / 4; ++- ++- const float* dataPointsPtr = realDataPoints; ++- __VOLK_ATTR_ALIGNED(16) float avgPointsVector[4]; ++- ++- __m128 dataPointsVal; ++- __m128 avgPointsVal = _mm_setzero_ps(); ++- // Calculate the sum (for mean) for all points ++- for(; number < quarterPoints; number++){ ++- ++- dataPointsVal = _mm_load_ps(dataPointsPtr); ++- ++- dataPointsPtr += 4; ++- ++- avgPointsVal = _mm_add_ps(avgPointsVal, dataPointsVal); ++- } ++- ++- _mm_store_ps(avgPointsVector, avgPointsVal); ++- ++- float sumMean = 0.0; ++- sumMean += avgPointsVector[0]; ++- sumMean += avgPointsVector[1]; ++- sumMean += avgPointsVector[2]; ++- sumMean += avgPointsVector[3]; ++- ++- number = quarterPoints * 4; ++- for(;number < num_points; number++){ ++- sumMean += realDataPoints[number]; ++- } ++- ++- // calculate the spectral mean ++- // +20 because for the comparison below we only want to throw out bins ++- // that are significantly higher (and would, thus, affect the mean more ++- const float meanAmplitude = (sumMean / ((float)num_points)) + spectralExclusionValue; ++- ++- dataPointsPtr = realDataPoints; // Reset the dataPointsPtr ++- __m128 vMeanAmplitudeVector = _mm_set_ps1(meanAmplitude); ++- __m128 vOnesVector = _mm_set_ps1(1.0); ++- __m128 vValidBinCount = _mm_setzero_ps(); ++- avgPointsVal = _mm_setzero_ps(); ++- __m128 compareMask; ++- number = 0; ++- // Calculate the sum (for mean) for any points which do NOT exceed the mean amplitude ++- for(; number < quarterPoints; number++){ ++- ++- dataPointsVal = _mm_load_ps(dataPointsPtr); ++- ++- dataPointsPtr += 4; ++- ++- // Identify which items do not exceed the mean amplitude ++- compareMask = _mm_cmple_ps(dataPointsVal, vMeanAmplitudeVector); ++- ++- // Mask off the items that exceed the mean amplitude and add the avg Points that do not exceed the mean amplitude ++- avgPointsVal = _mm_add_ps(avgPointsVal, _mm_and_ps(compareMask, dataPointsVal)); ++- ++- // Count the number of bins which do not exceed the mean amplitude ++- vValidBinCount = _mm_add_ps(vValidBinCount, _mm_and_ps(compareMask, vOnesVector)); ++- } ++- ++- // Calculate the mean from the remaining data points ++- _mm_store_ps(avgPointsVector, avgPointsVal); ++- ++- sumMean = 0.0; ++- sumMean += avgPointsVector[0]; ++- sumMean += avgPointsVector[1]; ++- sumMean += avgPointsVector[2]; ++- sumMean += avgPointsVector[3]; ++- ++- // Calculate the number of valid bins from the remaining count ++- __VOLK_ATTR_ALIGNED(16) float validBinCountVector[4]; ++- _mm_store_ps(validBinCountVector, vValidBinCount); ++- ++- float validBinCount = 0; ++- validBinCount += validBinCountVector[0]; ++- validBinCount += validBinCountVector[1]; ++- validBinCount += validBinCountVector[2]; ++- validBinCount += validBinCountVector[3]; ++- ++- number = quarterPoints * 4; ++- for(;number < num_points; number++){ ++- if(realDataPoints[number] <= meanAmplitude){ ++- sumMean += realDataPoints[number]; ++- validBinCount += 1.0; +++ unsigned int number = 0; +++ const unsigned int quarterPoints = num_points / 4; +++ +++ const float* dataPointsPtr = realDataPoints; +++ __VOLK_ATTR_ALIGNED(16) float avgPointsVector[4]; +++ +++ __m128 dataPointsVal; +++ __m128 avgPointsVal = 
_mm_setzero_ps(); +++ // Calculate the sum (for mean) for all points +++ for (; number < quarterPoints; number++) { +++ +++ dataPointsVal = _mm_load_ps(dataPointsPtr); +++ +++ dataPointsPtr += 4; +++ +++ avgPointsVal = _mm_add_ps(avgPointsVal, dataPointsVal); +++ } +++ +++ _mm_store_ps(avgPointsVector, avgPointsVal); +++ +++ float sumMean = 0.0; +++ sumMean += avgPointsVector[0]; +++ sumMean += avgPointsVector[1]; +++ sumMean += avgPointsVector[2]; +++ sumMean += avgPointsVector[3]; +++ +++ number = quarterPoints * 4; +++ for (; number < num_points; number++) { +++ sumMean += realDataPoints[number]; +++ } +++ +++ // calculate the spectral mean +++ // +20 because for the comparison below we only want to throw out bins +++ // that are significantly higher (and would, thus, affect the mean more +++ const float meanAmplitude = (sumMean / ((float)num_points)) + spectralExclusionValue; +++ +++ dataPointsPtr = realDataPoints; // Reset the dataPointsPtr +++ __m128 vMeanAmplitudeVector = _mm_set_ps1(meanAmplitude); +++ __m128 vOnesVector = _mm_set_ps1(1.0); +++ __m128 vValidBinCount = _mm_setzero_ps(); +++ avgPointsVal = _mm_setzero_ps(); +++ __m128 compareMask; +++ number = 0; +++ // Calculate the sum (for mean) for any points which do NOT exceed the mean amplitude +++ for (; number < quarterPoints; number++) { +++ +++ dataPointsVal = _mm_load_ps(dataPointsPtr); +++ +++ dataPointsPtr += 4; +++ +++ // Identify which items do not exceed the mean amplitude +++ compareMask = _mm_cmple_ps(dataPointsVal, vMeanAmplitudeVector); +++ +++ // Mask off the items that exceed the mean amplitude and add the avg Points that +++ // do not exceed the mean amplitude +++ avgPointsVal = _mm_add_ps(avgPointsVal, _mm_and_ps(compareMask, dataPointsVal)); +++ +++ // Count the number of bins which do not exceed the mean amplitude +++ vValidBinCount = _mm_add_ps(vValidBinCount, _mm_and_ps(compareMask, vOnesVector)); ++ } ++- } ++ ++- float localNoiseFloorAmplitude = 0; ++- if(validBinCount > 0.0){ ++- localNoiseFloorAmplitude = sumMean / validBinCount; ++- } ++- else{ ++- localNoiseFloorAmplitude = meanAmplitude; // For the odd case that all the amplitudes are equal... ++- } +++ // Calculate the mean from the remaining data points +++ _mm_store_ps(avgPointsVector, avgPointsVal); +++ +++ sumMean = 0.0; +++ sumMean += avgPointsVector[0]; +++ sumMean += avgPointsVector[1]; +++ sumMean += avgPointsVector[2]; +++ sumMean += avgPointsVector[3]; +++ +++ // Calculate the number of valid bins from the remaining count +++ __VOLK_ATTR_ALIGNED(16) float validBinCountVector[4]; +++ _mm_store_ps(validBinCountVector, vValidBinCount); +++ +++ float validBinCount = 0; +++ validBinCount += validBinCountVector[0]; +++ validBinCount += validBinCountVector[1]; +++ validBinCount += validBinCountVector[2]; +++ validBinCount += validBinCountVector[3]; +++ +++ number = quarterPoints * 4; +++ for (; number < num_points; number++) { +++ if (realDataPoints[number] <= meanAmplitude) { +++ sumMean += realDataPoints[number]; +++ validBinCount += 1.0; +++ } +++ } +++ +++ float localNoiseFloorAmplitude = 0; +++ if (validBinCount > 0.0) { +++ localNoiseFloorAmplitude = sumMean / validBinCount; +++ } else { +++ localNoiseFloorAmplitude = +++ meanAmplitude; // For the odd case that all the amplitudes are equal... 
+++ } ++ ++- *noiseFloorAmplitude = localNoiseFloorAmplitude; +++ *noiseFloorAmplitude = localNoiseFloorAmplitude; ++ } ++ #endif /* LV_HAVE_SSE */ ++ ++@@ -300,36 +306,36 @@ volk_32f_s32f_calc_spectral_noise_floor_32f_generic(float* noiseFloorAmplitude, ++ const float spectralExclusionValue, ++ const unsigned int num_points) ++ { ++- float sumMean = 0.0; ++- unsigned int number; ++- // find the sum (for mean), etc ++- for(number = 0; number < num_points; number++){ ++- // sum (for mean) ++- sumMean += realDataPoints[number]; ++- } ++- ++- // calculate the spectral mean ++- // +20 because for the comparison below we only want to throw out bins ++- // that are significantly higher (and would, thus, affect the mean more) ++- const float meanAmplitude = (sumMean / num_points) + spectralExclusionValue; ++- ++- // now throw out any bins higher than the mean ++- sumMean = 0.0; ++- unsigned int newNumDataPoints = num_points; ++- for(number = 0; number < num_points; number++){ ++- if (realDataPoints[number] <= meanAmplitude) ++- sumMean += realDataPoints[number]; ++- else ++- newNumDataPoints--; ++- } +++ float sumMean = 0.0; +++ unsigned int number; +++ // find the sum (for mean), etc +++ for (number = 0; number < num_points; number++) { +++ // sum (for mean) +++ sumMean += realDataPoints[number]; +++ } +++ +++ // calculate the spectral mean +++ // +20 because for the comparison below we only want to throw out bins +++ // that are significantly higher (and would, thus, affect the mean more) +++ const float meanAmplitude = (sumMean / num_points) + spectralExclusionValue; +++ +++ // now throw out any bins higher than the mean +++ sumMean = 0.0; +++ unsigned int newNumDataPoints = num_points; +++ for (number = 0; number < num_points; number++) { +++ if (realDataPoints[number] <= meanAmplitude) +++ sumMean += realDataPoints[number]; +++ else +++ newNumDataPoints--; +++ } ++ ++- float localNoiseFloorAmplitude = 0.0; ++- if (newNumDataPoints == 0) // in the odd case that all ++- localNoiseFloorAmplitude = meanAmplitude; // amplitudes are equal! ++- else ++- localNoiseFloorAmplitude = sumMean / ((float)newNumDataPoints); +++ float localNoiseFloorAmplitude = 0.0; +++ if (newNumDataPoints == 0) // in the odd case that all +++ localNoiseFloorAmplitude = meanAmplitude; // amplitudes are equal! 
+++ else +++ localNoiseFloorAmplitude = sumMean / ((float)newNumDataPoints); ++ ++- *noiseFloorAmplitude = localNoiseFloorAmplitude; +++ *noiseFloorAmplitude = localNoiseFloorAmplitude; ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++@@ -339,9 +345,9 @@ volk_32f_s32f_calc_spectral_noise_floor_32f_generic(float* noiseFloorAmplitude, ++ #ifndef INCLUDED_volk_32f_s32f_calc_spectral_noise_floor_32f_u_H ++ #define INCLUDED_volk_32f_s32f_calc_spectral_noise_floor_32f_u_H ++ ++-#include ++ #include ++ #include +++#include ++ ++ #ifdef LV_HAVE_AVX ++ #include ++@@ -352,114 +358,117 @@ volk_32f_s32f_calc_spectral_noise_floor_32f_u_avx(float* noiseFloorAmplitude, ++ const float spectralExclusionValue, ++ const unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int eighthPoints = num_points / 8; ++- ++- const float* dataPointsPtr = realDataPoints; ++- __VOLK_ATTR_ALIGNED(16) float avgPointsVector[8]; ++- ++- __m256 dataPointsVal; ++- __m256 avgPointsVal = _mm256_setzero_ps(); ++- // Calculate the sum (for mean) for all points ++- for(; number < eighthPoints; number++){ ++- ++- dataPointsVal = _mm256_loadu_ps(dataPointsPtr); ++- ++- dataPointsPtr += 8; ++- ++- avgPointsVal = _mm256_add_ps(avgPointsVal, dataPointsVal); ++- } ++- ++- _mm256_storeu_ps(avgPointsVector, avgPointsVal); ++- ++- float sumMean = 0.0; ++- sumMean += avgPointsVector[0]; ++- sumMean += avgPointsVector[1]; ++- sumMean += avgPointsVector[2]; ++- sumMean += avgPointsVector[3]; ++- sumMean += avgPointsVector[4]; ++- sumMean += avgPointsVector[5]; ++- sumMean += avgPointsVector[6]; ++- sumMean += avgPointsVector[7]; ++- ++- number = eighthPoints * 8; ++- for(;number < num_points; number++){ ++- sumMean += realDataPoints[number]; ++- } ++- ++- // calculate the spectral mean ++- // +20 because for the comparison below we only want to throw out bins ++- // that are significantly higher (and would, thus, affect the mean more ++- const float meanAmplitude = (sumMean / ((float)num_points)) + spectralExclusionValue; ++- ++- dataPointsPtr = realDataPoints; // Reset the dataPointsPtr ++- __m256 vMeanAmplitudeVector = _mm256_set1_ps(meanAmplitude); ++- __m256 vOnesVector = _mm256_set1_ps(1.0); ++- __m256 vValidBinCount = _mm256_setzero_ps(); ++- avgPointsVal = _mm256_setzero_ps(); ++- __m256 compareMask; ++- number = 0; ++- // Calculate the sum (for mean) for any points which do NOT exceed the mean amplitude ++- for(; number < eighthPoints; number++){ ++- ++- dataPointsVal = _mm256_loadu_ps(dataPointsPtr); ++- ++- dataPointsPtr += 8; ++- ++- // Identify which items do not exceed the mean amplitude ++- compareMask = _mm256_cmp_ps(dataPointsVal, vMeanAmplitudeVector, _CMP_LE_OQ); ++- ++- // Mask off the items that exceed the mean amplitude and add the avg Points that do not exceed the mean amplitude ++- avgPointsVal = _mm256_add_ps(avgPointsVal, _mm256_and_ps(compareMask, dataPointsVal)); ++- ++- // Count the number of bins which do not exceed the mean amplitude ++- vValidBinCount = _mm256_add_ps(vValidBinCount, _mm256_and_ps(compareMask, vOnesVector)); ++- } ++- ++- // Calculate the mean from the remaining data points ++- _mm256_storeu_ps(avgPointsVector, avgPointsVal); ++- ++- sumMean = 0.0; ++- sumMean += avgPointsVector[0]; ++- sumMean += avgPointsVector[1]; ++- sumMean += avgPointsVector[2]; ++- sumMean += avgPointsVector[3]; ++- sumMean += avgPointsVector[4]; ++- sumMean += avgPointsVector[5]; ++- sumMean += avgPointsVector[6]; ++- sumMean += avgPointsVector[7]; ++- ++- // Calculate the number of valid bins from the 
remaining count ++- __VOLK_ATTR_ALIGNED(16) float validBinCountVector[8]; ++- _mm256_storeu_ps(validBinCountVector, vValidBinCount); ++- ++- float validBinCount = 0; ++- validBinCount += validBinCountVector[0]; ++- validBinCount += validBinCountVector[1]; ++- validBinCount += validBinCountVector[2]; ++- validBinCount += validBinCountVector[3]; ++- validBinCount += validBinCountVector[4]; ++- validBinCount += validBinCountVector[5]; ++- validBinCount += validBinCountVector[6]; ++- validBinCount += validBinCountVector[7]; ++- ++- number = eighthPoints * 8; ++- for(;number < num_points; number++){ ++- if(realDataPoints[number] <= meanAmplitude){ ++- sumMean += realDataPoints[number]; ++- validBinCount += 1.0; +++ unsigned int number = 0; +++ const unsigned int eighthPoints = num_points / 8; +++ +++ const float* dataPointsPtr = realDataPoints; +++ __VOLK_ATTR_ALIGNED(16) float avgPointsVector[8]; +++ +++ __m256 dataPointsVal; +++ __m256 avgPointsVal = _mm256_setzero_ps(); +++ // Calculate the sum (for mean) for all points +++ for (; number < eighthPoints; number++) { +++ +++ dataPointsVal = _mm256_loadu_ps(dataPointsPtr); +++ +++ dataPointsPtr += 8; +++ +++ avgPointsVal = _mm256_add_ps(avgPointsVal, dataPointsVal); +++ } +++ +++ _mm256_storeu_ps(avgPointsVector, avgPointsVal); +++ +++ float sumMean = 0.0; +++ sumMean += avgPointsVector[0]; +++ sumMean += avgPointsVector[1]; +++ sumMean += avgPointsVector[2]; +++ sumMean += avgPointsVector[3]; +++ sumMean += avgPointsVector[4]; +++ sumMean += avgPointsVector[5]; +++ sumMean += avgPointsVector[6]; +++ sumMean += avgPointsVector[7]; +++ +++ number = eighthPoints * 8; +++ for (; number < num_points; number++) { +++ sumMean += realDataPoints[number]; +++ } +++ +++ // calculate the spectral mean +++ // +20 because for the comparison below we only want to throw out bins +++ // that are significantly higher (and would, thus, affect the mean more +++ const float meanAmplitude = (sumMean / ((float)num_points)) + spectralExclusionValue; +++ +++ dataPointsPtr = realDataPoints; // Reset the dataPointsPtr +++ __m256 vMeanAmplitudeVector = _mm256_set1_ps(meanAmplitude); +++ __m256 vOnesVector = _mm256_set1_ps(1.0); +++ __m256 vValidBinCount = _mm256_setzero_ps(); +++ avgPointsVal = _mm256_setzero_ps(); +++ __m256 compareMask; +++ number = 0; +++ // Calculate the sum (for mean) for any points which do NOT exceed the mean amplitude +++ for (; number < eighthPoints; number++) { +++ +++ dataPointsVal = _mm256_loadu_ps(dataPointsPtr); +++ +++ dataPointsPtr += 8; +++ +++ // Identify which items do not exceed the mean amplitude +++ compareMask = _mm256_cmp_ps(dataPointsVal, vMeanAmplitudeVector, _CMP_LE_OQ); +++ +++ // Mask off the items that exceed the mean amplitude and add the avg Points that +++ // do not exceed the mean amplitude +++ avgPointsVal = +++ _mm256_add_ps(avgPointsVal, _mm256_and_ps(compareMask, dataPointsVal)); +++ +++ // Count the number of bins which do not exceed the mean amplitude +++ vValidBinCount = +++ _mm256_add_ps(vValidBinCount, _mm256_and_ps(compareMask, vOnesVector)); +++ } +++ +++ // Calculate the mean from the remaining data points +++ _mm256_storeu_ps(avgPointsVector, avgPointsVal); +++ +++ sumMean = 0.0; +++ sumMean += avgPointsVector[0]; +++ sumMean += avgPointsVector[1]; +++ sumMean += avgPointsVector[2]; +++ sumMean += avgPointsVector[3]; +++ sumMean += avgPointsVector[4]; +++ sumMean += avgPointsVector[5]; +++ sumMean += avgPointsVector[6]; +++ sumMean += avgPointsVector[7]; +++ +++ // Calculate the number of valid bins from the 
remaining count +++ __VOLK_ATTR_ALIGNED(16) float validBinCountVector[8]; +++ _mm256_storeu_ps(validBinCountVector, vValidBinCount); +++ +++ float validBinCount = 0; +++ validBinCount += validBinCountVector[0]; +++ validBinCount += validBinCountVector[1]; +++ validBinCount += validBinCountVector[2]; +++ validBinCount += validBinCountVector[3]; +++ validBinCount += validBinCountVector[4]; +++ validBinCount += validBinCountVector[5]; +++ validBinCount += validBinCountVector[6]; +++ validBinCount += validBinCountVector[7]; +++ +++ number = eighthPoints * 8; +++ for (; number < num_points; number++) { +++ if (realDataPoints[number] <= meanAmplitude) { +++ sumMean += realDataPoints[number]; +++ validBinCount += 1.0; +++ } ++ } ++- } ++ ++- float localNoiseFloorAmplitude = 0; ++- if(validBinCount > 0.0){ ++- localNoiseFloorAmplitude = sumMean / validBinCount; ++- } ++- else{ ++- localNoiseFloorAmplitude = meanAmplitude; // For the odd case that all the amplitudes are equal... ++- } +++ float localNoiseFloorAmplitude = 0; +++ if (validBinCount > 0.0) { +++ localNoiseFloorAmplitude = sumMean / validBinCount; +++ } else { +++ localNoiseFloorAmplitude = +++ meanAmplitude; // For the odd case that all the amplitudes are equal... +++ } ++ ++- *noiseFloorAmplitude = localNoiseFloorAmplitude; +++ *noiseFloorAmplitude = localNoiseFloorAmplitude; ++ } ++ #endif /* LV_HAVE_AVX */ ++ #endif /* INCLUDED_volk_32f_s32f_calc_spectral_noise_floor_32f_u_H */ ++diff --git a/kernels/volk/volk_32f_s32f_convert_16i.h b/kernels/volk/volk_32f_s32f_convert_16i.h ++index 27ef4d9..c9469b7 100644 ++--- a/kernels/volk/volk_32f_s32f_convert_16i.h +++++ b/kernels/volk/volk_32f_s32f_convert_16i.h ++@@ -30,8 +30,8 @@ ++ * ++ * Dispatcher Prototype ++ * \code ++- * void volk_32f_s32f_convert_16i(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points) ++- * \endcode +++ * void volk_32f_s32f_convert_16i(int16_t* outputVector, const float* inputVector, const +++ * float scalar, unsigned int num_points) \endcode ++ * ++ * \b Inputs ++ * \li inputVector: the input vector of floats. ++@@ -42,11 +42,10 @@ ++ * \li outputVector: The output vector. 
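[Editorial note, not part of the upstream diff] The volk_32f_s32f_calc_spectral_noise_floor_32f AVX hunk above implements a two-pass estimate: sum every bin to get the spectral mean, add the exclusion offset, then average only the bins that do not exceed that threshold, falling back to the threshold itself when no bin qualifies. A minimal scalar sketch of the same computation, using hypothetical names:

static float noise_floor_sketch(const float* bins, float exclusion, unsigned int n)
{
    float sum = 0.f;
    for (unsigned int i = 0; i < n; i++)
        sum += bins[i];
    /* Bins far above the mean would drag the estimate up, so exclude them. */
    const float threshold = sum / (float)n + exclusion;

    float kept_sum = 0.f;
    float kept_count = 0.f;
    for (unsigned int i = 0; i < n; i++) {
        if (bins[i] <= threshold) {
            kept_sum += bins[i];
            kept_count += 1.f;
        }
    }
    /* Degenerate case: no bin passed the test, so fall back to the threshold. */
    return (kept_count > 0.f) ? kept_sum / kept_count : threshold;
}
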
++ * ++ * \b Example ++- * Convert floats from [-1,1] to 16-bit integers with a scale of 5 to maintain smallest delta ++- * int N = 10; ++- * unsigned int alignment = volk_get_alignment(); ++- * float* increasing = (float*)volk_malloc(sizeof(float)*N, alignment); ++- * int16_t* out = (int16_t*)volk_malloc(sizeof(int16_t)*N, alignment); +++ * Convert floats from [-1,1] to 16-bit integers with a scale of 5 to maintain smallest +++ * delta int N = 10; unsigned int alignment = volk_get_alignment(); float* increasing = +++ * (float*)volk_malloc(sizeof(float)*N, alignment); int16_t* out = +++ * (int16_t*)volk_malloc(sizeof(int16_t)*N, alignment); ++ * ++ * for(unsigned int ii = 0; ii < N; ++ii){ ++ * increasing[ii] = 2.f * ((float)ii / (float)N) - 1.f; ++@@ -76,55 +75,60 @@ ++ #ifdef LV_HAVE_AVX2 ++ #include ++ ++-static inline void ++-volk_32f_s32f_convert_16i_u_avx2(int16_t* outputVector, const float* inputVector, ++- const float scalar, unsigned int num_points) +++static inline void volk_32f_s32f_convert_16i_u_avx2(int16_t* outputVector, +++ const float* inputVector, +++ const float scalar, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- ++- const unsigned int sixteenthPoints = num_points / 16; ++- ++- const float* inputVectorPtr = (const float*)inputVector; ++- int16_t* outputVectorPtr = outputVector; ++- ++- float min_val = SHRT_MIN; ++- float max_val = SHRT_MAX; ++- float r; ++- ++- __m256 vScalar = _mm256_set1_ps(scalar); ++- __m256 inputVal1, inputVal2; ++- __m256i intInputVal1, intInputVal2; ++- __m256 ret1, ret2; ++- __m256 vmin_val = _mm256_set1_ps(min_val); ++- __m256 vmax_val = _mm256_set1_ps(max_val); ++- ++- for(;number < sixteenthPoints; number++){ ++- inputVal1 = _mm256_loadu_ps(inputVectorPtr); inputVectorPtr += 8; ++- inputVal2 = _mm256_loadu_ps(inputVectorPtr); inputVectorPtr += 8; ++- ++- // Scale and clip ++- ret1 = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal1, vScalar), vmax_val), vmin_val); ++- ret2 = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal2, vScalar), vmax_val), vmin_val); ++- ++- intInputVal1 = _mm256_cvtps_epi32(ret1); ++- intInputVal2 = _mm256_cvtps_epi32(ret2); ++- ++- intInputVal1 = _mm256_packs_epi32(intInputVal1, intInputVal2); ++- intInputVal1 = _mm256_permute4x64_epi64(intInputVal1, 0b11011000); ++- ++- _mm256_storeu_si256((__m256i*)outputVectorPtr, intInputVal1); ++- outputVectorPtr += 16; ++- } ++- ++- number = sixteenthPoints * 16; ++- for(; number < num_points; number++){ ++- r = inputVector[number] * scalar; ++- if(r > max_val) ++- r = max_val; ++- else if(r < min_val) ++- r = min_val; ++- outputVector[number] = (int16_t)rintf(r); ++- } +++ unsigned int number = 0; +++ +++ const unsigned int sixteenthPoints = num_points / 16; +++ +++ const float* inputVectorPtr = (const float*)inputVector; +++ int16_t* outputVectorPtr = outputVector; +++ +++ float min_val = SHRT_MIN; +++ float max_val = SHRT_MAX; +++ float r; +++ +++ __m256 vScalar = _mm256_set1_ps(scalar); +++ __m256 inputVal1, inputVal2; +++ __m256i intInputVal1, intInputVal2; +++ __m256 ret1, ret2; +++ __m256 vmin_val = _mm256_set1_ps(min_val); +++ __m256 vmax_val = _mm256_set1_ps(max_val); +++ +++ for (; number < sixteenthPoints; number++) { +++ inputVal1 = _mm256_loadu_ps(inputVectorPtr); +++ inputVectorPtr += 8; +++ inputVal2 = _mm256_loadu_ps(inputVectorPtr); +++ inputVectorPtr += 8; +++ +++ // Scale and clip +++ ret1 = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal1, vScalar), vmax_val), +++ vmin_val); +++ ret2 = 
_mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal2, vScalar), vmax_val), +++ vmin_val); +++ +++ intInputVal1 = _mm256_cvtps_epi32(ret1); +++ intInputVal2 = _mm256_cvtps_epi32(ret2); +++ +++ intInputVal1 = _mm256_packs_epi32(intInputVal1, intInputVal2); +++ intInputVal1 = _mm256_permute4x64_epi64(intInputVal1, 0b11011000); +++ +++ _mm256_storeu_si256((__m256i*)outputVectorPtr, intInputVal1); +++ outputVectorPtr += 16; +++ } +++ +++ number = sixteenthPoints * 16; +++ for (; number < num_points; number++) { +++ r = inputVector[number] * scalar; +++ if (r > max_val) +++ r = max_val; +++ else if (r < min_val) +++ r = min_val; +++ outputVector[number] = (int16_t)rintf(r); +++ } ++ } ++ #endif /* LV_HAVE_AVX2 */ ++ ++@@ -132,54 +136,57 @@ volk_32f_s32f_convert_16i_u_avx2(int16_t* outputVector, const float* inputVector ++ #ifdef LV_HAVE_AVX ++ #include ++ ++-static inline void ++-volk_32f_s32f_convert_16i_u_avx(int16_t* outputVector, const float* inputVector, ++- const float scalar, unsigned int num_points) +++static inline void volk_32f_s32f_convert_16i_u_avx(int16_t* outputVector, +++ const float* inputVector, +++ const float scalar, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; +++ unsigned int number = 0; ++ ++- const unsigned int eighthPoints = num_points / 8; +++ const unsigned int eighthPoints = num_points / 8; ++ ++- const float* inputVectorPtr = (const float*)inputVector; ++- int16_t* outputVectorPtr = outputVector; +++ const float* inputVectorPtr = (const float*)inputVector; +++ int16_t* outputVectorPtr = outputVector; ++ ++- float min_val = SHRT_MIN; ++- float max_val = SHRT_MAX; ++- float r; +++ float min_val = SHRT_MIN; +++ float max_val = SHRT_MAX; +++ float r; ++ ++- __m256 vScalar = _mm256_set1_ps(scalar); ++- __m256 inputVal, ret; ++- __m256i intInputVal; ++- __m128i intInputVal1, intInputVal2; ++- __m256 vmin_val = _mm256_set1_ps(min_val); ++- __m256 vmax_val = _mm256_set1_ps(max_val); +++ __m256 vScalar = _mm256_set1_ps(scalar); +++ __m256 inputVal, ret; +++ __m256i intInputVal; +++ __m128i intInputVal1, intInputVal2; +++ __m256 vmin_val = _mm256_set1_ps(min_val); +++ __m256 vmax_val = _mm256_set1_ps(max_val); ++ ++- for(;number < eighthPoints; number++){ ++- inputVal = _mm256_loadu_ps(inputVectorPtr); inputVectorPtr += 8; +++ for (; number < eighthPoints; number++) { +++ inputVal = _mm256_loadu_ps(inputVectorPtr); +++ inputVectorPtr += 8; ++ ++- // Scale and clip ++- ret = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal, vScalar), vmax_val), vmin_val); +++ // Scale and clip +++ ret = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal, vScalar), vmax_val), +++ vmin_val); ++ ++- intInputVal = _mm256_cvtps_epi32(ret); +++ intInputVal = _mm256_cvtps_epi32(ret); ++ ++- intInputVal1 = _mm256_extractf128_si256(intInputVal, 0); ++- intInputVal2 = _mm256_extractf128_si256(intInputVal, 1); +++ intInputVal1 = _mm256_extractf128_si256(intInputVal, 0); +++ intInputVal2 = _mm256_extractf128_si256(intInputVal, 1); ++ ++- intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); +++ intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); ++ ++- _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1); ++- outputVectorPtr += 8; ++- } +++ _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1); +++ outputVectorPtr += 8; +++ } ++ ++- number = eighthPoints * 8; ++- for(; number < num_points; number++){ ++- r = inputVector[number] * scalar; ++- if(r > max_val) ++- r = max_val; ++- else if(r < min_val) ++- r = min_val; ++- outputVector[number] = (int16_t)rintf(r); ++- } 
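[Editorial note, not part of the upstream diff] The AVX2 variants above follow _mm256_packs_epi32 with _mm256_permute4x64_epi64(x, 0b11011000) because the pack operates independently per 128-bit lane, so the sixteen int16 results come out lane-interleaved. A small self-contained check of that reordering (illustrative only; compile with -mavx2):

#include <immintrin.h>
#include <stdio.h>

int main(void)
{
    __m256i lo = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
    __m256i hi = _mm256_setr_epi32(8, 9, 10, 11, 12, 13, 14, 15);

    __m256i packed = _mm256_packs_epi32(lo, hi); /* packs within each 128-bit lane */
    __m256i fixed = _mm256_permute4x64_epi64(packed, 0b11011000);

    short a[16], b[16];
    _mm256_storeu_si256((__m256i*)a, packed);
    _mm256_storeu_si256((__m256i*)b, fixed);

    for (int i = 0; i < 16; i++)
        printf("%d ", a[i]); /* prints 0..3 8..11 4..7 12..15 (lane-interleaved) */
    printf("\n");
    for (int i = 0; i < 16; i++)
        printf("%d ", b[i]); /* prints 0..15 in order */
    printf("\n");
    return 0;
}
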
+++ number = eighthPoints * 8; +++ for (; number < num_points; number++) { +++ r = inputVector[number] * scalar; +++ if (r > max_val) +++ r = max_val; +++ else if (r < min_val) +++ r = min_val; +++ outputVector[number] = (int16_t)rintf(r); +++ } ++ } ++ #endif /* LV_HAVE_AVX */ ++ ++@@ -187,54 +194,57 @@ volk_32f_s32f_convert_16i_u_avx(int16_t* outputVector, const float* inputVector, ++ #ifdef LV_HAVE_SSE2 ++ #include ++ ++-static inline void ++-volk_32f_s32f_convert_16i_u_sse2(int16_t* outputVector, const float* inputVector, ++- const float scalar, unsigned int num_points) +++static inline void volk_32f_s32f_convert_16i_u_sse2(int16_t* outputVector, +++ const float* inputVector, +++ const float scalar, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- ++- const unsigned int eighthPoints = num_points / 8; ++- ++- const float* inputVectorPtr = (const float*)inputVector; ++- int16_t* outputVectorPtr = outputVector; ++- ++- float min_val = SHRT_MIN; ++- float max_val = SHRT_MAX; ++- float r; ++- ++- __m128 vScalar = _mm_set_ps1(scalar); ++- __m128 inputVal1, inputVal2; ++- __m128i intInputVal1, intInputVal2; ++- __m128 ret1, ret2; ++- __m128 vmin_val = _mm_set_ps1(min_val); ++- __m128 vmax_val = _mm_set_ps1(max_val); ++- ++- for(;number < eighthPoints; number++){ ++- inputVal1 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4; ++- inputVal2 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4; ++- ++- // Scale and clip ++- ret1 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val); ++- ret2 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal2, vScalar), vmax_val), vmin_val); ++- ++- intInputVal1 = _mm_cvtps_epi32(ret1); ++- intInputVal2 = _mm_cvtps_epi32(ret2); ++- ++- intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); ++- ++- _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1); ++- outputVectorPtr += 8; ++- } ++- ++- number = eighthPoints * 8; ++- for(; number < num_points; number++){ ++- r = inputVector[number] * scalar; ++- if(r > max_val) ++- r = max_val; ++- else if(r < min_val) ++- r = min_val; ++- outputVector[number] = (int16_t)rintf(r); ++- } +++ unsigned int number = 0; +++ +++ const unsigned int eighthPoints = num_points / 8; +++ +++ const float* inputVectorPtr = (const float*)inputVector; +++ int16_t* outputVectorPtr = outputVector; +++ +++ float min_val = SHRT_MIN; +++ float max_val = SHRT_MAX; +++ float r; +++ +++ __m128 vScalar = _mm_set_ps1(scalar); +++ __m128 inputVal1, inputVal2; +++ __m128i intInputVal1, intInputVal2; +++ __m128 ret1, ret2; +++ __m128 vmin_val = _mm_set_ps1(min_val); +++ __m128 vmax_val = _mm_set_ps1(max_val); +++ +++ for (; number < eighthPoints; number++) { +++ inputVal1 = _mm_loadu_ps(inputVectorPtr); +++ inputVectorPtr += 4; +++ inputVal2 = _mm_loadu_ps(inputVectorPtr); +++ inputVectorPtr += 4; +++ +++ // Scale and clip +++ ret1 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val); +++ ret2 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal2, vScalar), vmax_val), vmin_val); +++ +++ intInputVal1 = _mm_cvtps_epi32(ret1); +++ intInputVal2 = _mm_cvtps_epi32(ret2); +++ +++ intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); +++ +++ _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1); +++ outputVectorPtr += 8; +++ } +++ +++ number = eighthPoints * 8; +++ for (; number < num_points; number++) { +++ r = inputVector[number] * scalar; +++ if (r > max_val) +++ r = max_val; +++ else if (r < min_val) +++ r = min_val; +++ outputVector[number] = (int16_t)rintf(r); +++ } ++ } ++ #endif /* 
LV_HAVE_SSE2 */ ++ ++@@ -242,76 +252,78 @@ volk_32f_s32f_convert_16i_u_sse2(int16_t* outputVector, const float* inputVector ++ #ifdef LV_HAVE_SSE ++ #include ++ ++-static inline void ++-volk_32f_s32f_convert_16i_u_sse(int16_t* outputVector, const float* inputVector, ++- const float scalar, unsigned int num_points) +++static inline void volk_32f_s32f_convert_16i_u_sse(int16_t* outputVector, +++ const float* inputVector, +++ const float scalar, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- ++- const unsigned int quarterPoints = num_points / 4; ++- ++- const float* inputVectorPtr = (const float*)inputVector; ++- int16_t* outputVectorPtr = outputVector; ++- ++- float min_val = SHRT_MIN; ++- float max_val = SHRT_MAX; ++- float r; ++- ++- __m128 vScalar = _mm_set_ps1(scalar); ++- __m128 ret; ++- __m128 vmin_val = _mm_set_ps1(min_val); ++- __m128 vmax_val = _mm_set_ps1(max_val); ++- ++- __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4]; ++- ++- for(;number < quarterPoints; number++){ ++- ret = _mm_loadu_ps(inputVectorPtr); ++- inputVectorPtr += 4; ++- ++- // Scale and clip ++- ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val); ++- ++- _mm_store_ps(outputFloatBuffer, ret); ++- *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[0]); ++- *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[1]); ++- *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[2]); ++- *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[3]); ++- } ++- ++- number = quarterPoints * 4; ++- for(; number < num_points; number++){ ++- r = inputVector[number] * scalar; ++- if(r > max_val) ++- r = max_val; ++- else if(r < min_val) ++- r = min_val; ++- outputVector[number] = (int16_t)rintf(r); ++- } +++ unsigned int number = 0; +++ +++ const unsigned int quarterPoints = num_points / 4; +++ +++ const float* inputVectorPtr = (const float*)inputVector; +++ int16_t* outputVectorPtr = outputVector; +++ +++ float min_val = SHRT_MIN; +++ float max_val = SHRT_MAX; +++ float r; +++ +++ __m128 vScalar = _mm_set_ps1(scalar); +++ __m128 ret; +++ __m128 vmin_val = _mm_set_ps1(min_val); +++ __m128 vmax_val = _mm_set_ps1(max_val); +++ +++ __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4]; +++ +++ for (; number < quarterPoints; number++) { +++ ret = _mm_loadu_ps(inputVectorPtr); +++ inputVectorPtr += 4; +++ +++ // Scale and clip +++ ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val); +++ +++ _mm_store_ps(outputFloatBuffer, ret); +++ *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[0]); +++ *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[1]); +++ *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[2]); +++ *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[3]); +++ } +++ +++ number = quarterPoints * 4; +++ for (; number < num_points; number++) { +++ r = inputVector[number] * scalar; +++ if (r > max_val) +++ r = max_val; +++ else if (r < min_val) +++ r = min_val; +++ outputVector[number] = (int16_t)rintf(r); +++ } ++ } ++ #endif /* LV_HAVE_SSE */ ++ ++ ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void ++-volk_32f_s32f_convert_16i_generic(int16_t* outputVector, const float* inputVector, ++- const float scalar, unsigned int num_points) +++static inline void volk_32f_s32f_convert_16i_generic(int16_t* outputVector, +++ const float* inputVector, +++ const float scalar, +++ unsigned int num_points) ++ { ++- int16_t* outputVectorPtr = outputVector; ++- const float* inputVectorPtr = inputVector; ++- unsigned int number = 0; ++- float min_val = SHRT_MIN; ++- float 
max_val = SHRT_MAX; ++- float r; ++- ++- for(number = 0; number < num_points; number++){ ++- r = *inputVectorPtr++ * scalar; ++- if(r > max_val) ++- r = max_val; ++- else if(r < min_val) ++- r = min_val; ++- *outputVectorPtr++ = (int16_t)rintf(r); ++- } +++ int16_t* outputVectorPtr = outputVector; +++ const float* inputVectorPtr = inputVector; +++ unsigned int number = 0; +++ float min_val = SHRT_MIN; +++ float max_val = SHRT_MAX; +++ float r; +++ +++ for (number = 0; number < num_points; number++) { +++ r = *inputVectorPtr++ * scalar; +++ if (r > max_val) +++ r = max_val; +++ else if (r < min_val) +++ r = min_val; +++ *outputVectorPtr++ = (int16_t)rintf(r); +++ } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++@@ -320,63 +332,68 @@ volk_32f_s32f_convert_16i_generic(int16_t* outputVector, const float* inputVecto ++ #ifndef INCLUDED_volk_32f_s32f_convert_16i_a_H ++ #define INCLUDED_volk_32f_s32f_convert_16i_a_H ++ ++-#include ++ #include ++-#include ++ #include +++#include +++#include ++ ++ #ifdef LV_HAVE_AVX2 ++ #include ++ ++-static inline void ++-volk_32f_s32f_convert_16i_a_avx2(int16_t* outputVector, const float* inputVector, ++- const float scalar, unsigned int num_points) +++static inline void volk_32f_s32f_convert_16i_a_avx2(int16_t* outputVector, +++ const float* inputVector, +++ const float scalar, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- ++- const unsigned int sixteenthPoints = num_points / 16; ++- ++- const float* inputVectorPtr = (const float*)inputVector; ++- int16_t* outputVectorPtr = outputVector; ++- ++- float min_val = SHRT_MIN; ++- float max_val = SHRT_MAX; ++- float r; ++- ++- __m256 vScalar = _mm256_set1_ps(scalar); ++- __m256 inputVal1, inputVal2; ++- __m256i intInputVal1, intInputVal2; ++- __m256 ret1, ret2; ++- __m256 vmin_val = _mm256_set1_ps(min_val); ++- __m256 vmax_val = _mm256_set1_ps(max_val); ++- ++- for(;number < sixteenthPoints; number++){ ++- inputVal1 = _mm256_load_ps(inputVectorPtr); inputVectorPtr += 8; ++- inputVal2 = _mm256_load_ps(inputVectorPtr); inputVectorPtr += 8; ++- ++- // Scale and clip ++- ret1 = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal1, vScalar), vmax_val), vmin_val); ++- ret2 = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal2, vScalar), vmax_val), vmin_val); ++- ++- intInputVal1 = _mm256_cvtps_epi32(ret1); ++- intInputVal2 = _mm256_cvtps_epi32(ret2); ++- ++- intInputVal1 = _mm256_packs_epi32(intInputVal1, intInputVal2); ++- intInputVal1 = _mm256_permute4x64_epi64(intInputVal1, 0b11011000); ++- ++- _mm256_store_si256((__m256i*)outputVectorPtr, intInputVal1); ++- outputVectorPtr += 16; ++- } ++- ++- number = sixteenthPoints * 16; ++- for(; number < num_points; number++){ ++- r = inputVector[number] * scalar; ++- if(r > max_val) ++- r = max_val; ++- else if(r < min_val) ++- r = min_val; ++- outputVector[number] = (int16_t)rintf(r); ++- } +++ unsigned int number = 0; +++ +++ const unsigned int sixteenthPoints = num_points / 16; +++ +++ const float* inputVectorPtr = (const float*)inputVector; +++ int16_t* outputVectorPtr = outputVector; +++ +++ float min_val = SHRT_MIN; +++ float max_val = SHRT_MAX; +++ float r; +++ +++ __m256 vScalar = _mm256_set1_ps(scalar); +++ __m256 inputVal1, inputVal2; +++ __m256i intInputVal1, intInputVal2; +++ __m256 ret1, ret2; +++ __m256 vmin_val = _mm256_set1_ps(min_val); +++ __m256 vmax_val = _mm256_set1_ps(max_val); +++ +++ for (; number < sixteenthPoints; number++) { +++ inputVal1 = _mm256_load_ps(inputVectorPtr); +++ inputVectorPtr += 8; +++ inputVal2 = 
_mm256_load_ps(inputVectorPtr); +++ inputVectorPtr += 8; +++ +++ // Scale and clip +++ ret1 = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal1, vScalar), vmax_val), +++ vmin_val); +++ ret2 = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal2, vScalar), vmax_val), +++ vmin_val); +++ +++ intInputVal1 = _mm256_cvtps_epi32(ret1); +++ intInputVal2 = _mm256_cvtps_epi32(ret2); +++ +++ intInputVal1 = _mm256_packs_epi32(intInputVal1, intInputVal2); +++ intInputVal1 = _mm256_permute4x64_epi64(intInputVal1, 0b11011000); +++ +++ _mm256_store_si256((__m256i*)outputVectorPtr, intInputVal1); +++ outputVectorPtr += 16; +++ } +++ +++ number = sixteenthPoints * 16; +++ for (; number < num_points; number++) { +++ r = inputVector[number] * scalar; +++ if (r > max_val) +++ r = max_val; +++ else if (r < min_val) +++ r = min_val; +++ outputVector[number] = (int16_t)rintf(r); +++ } ++ } ++ #endif /* LV_HAVE_AVX2 */ ++ ++@@ -384,108 +401,114 @@ volk_32f_s32f_convert_16i_a_avx2(int16_t* outputVector, const float* inputVector ++ #ifdef LV_HAVE_AVX ++ #include ++ ++-static inline void ++-volk_32f_s32f_convert_16i_a_avx(int16_t* outputVector, const float* inputVector, ++- const float scalar, unsigned int num_points) +++static inline void volk_32f_s32f_convert_16i_a_avx(int16_t* outputVector, +++ const float* inputVector, +++ const float scalar, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; +++ unsigned int number = 0; ++ ++- const unsigned int eighthPoints = num_points / 8; +++ const unsigned int eighthPoints = num_points / 8; ++ ++- const float* inputVectorPtr = (const float*)inputVector; ++- int16_t* outputVectorPtr = outputVector; +++ const float* inputVectorPtr = (const float*)inputVector; +++ int16_t* outputVectorPtr = outputVector; ++ ++- float min_val = SHRT_MIN; ++- float max_val = SHRT_MAX; ++- float r; +++ float min_val = SHRT_MIN; +++ float max_val = SHRT_MAX; +++ float r; ++ ++- __m256 vScalar = _mm256_set1_ps(scalar); ++- __m256 inputVal, ret; ++- __m256i intInputVal; ++- __m128i intInputVal1, intInputVal2; ++- __m256 vmin_val = _mm256_set1_ps(min_val); ++- __m256 vmax_val = _mm256_set1_ps(max_val); +++ __m256 vScalar = _mm256_set1_ps(scalar); +++ __m256 inputVal, ret; +++ __m256i intInputVal; +++ __m128i intInputVal1, intInputVal2; +++ __m256 vmin_val = _mm256_set1_ps(min_val); +++ __m256 vmax_val = _mm256_set1_ps(max_val); ++ ++- for(;number < eighthPoints; number++){ ++- inputVal = _mm256_load_ps(inputVectorPtr); inputVectorPtr += 8; +++ for (; number < eighthPoints; number++) { +++ inputVal = _mm256_load_ps(inputVectorPtr); +++ inputVectorPtr += 8; ++ ++- // Scale and clip ++- ret = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal, vScalar), vmax_val), vmin_val); +++ // Scale and clip +++ ret = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal, vScalar), vmax_val), +++ vmin_val); ++ ++- intInputVal = _mm256_cvtps_epi32(ret); +++ intInputVal = _mm256_cvtps_epi32(ret); ++ ++- intInputVal1 = _mm256_extractf128_si256(intInputVal, 0); ++- intInputVal2 = _mm256_extractf128_si256(intInputVal, 1); +++ intInputVal1 = _mm256_extractf128_si256(intInputVal, 0); +++ intInputVal2 = _mm256_extractf128_si256(intInputVal, 1); ++ ++- intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); +++ intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); ++ ++- _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1); ++- outputVectorPtr += 8; ++- } +++ _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1); +++ outputVectorPtr += 8; +++ } ++ ++- number = eighthPoints * 8; ++- for(; number < 
num_points; number++){ ++- r = inputVector[number] * scalar; ++- if(r > max_val) ++- r = max_val; ++- else if(r < min_val) ++- r = min_val; ++- outputVector[number] = (int16_t)rintf(r); ++- } +++ number = eighthPoints * 8; +++ for (; number < num_points; number++) { +++ r = inputVector[number] * scalar; +++ if (r > max_val) +++ r = max_val; +++ else if (r < min_val) +++ r = min_val; +++ outputVector[number] = (int16_t)rintf(r); +++ } ++ } ++ #endif /* LV_HAVE_AVX */ ++ ++ #ifdef LV_HAVE_SSE2 ++ #include ++ ++-static inline void ++-volk_32f_s32f_convert_16i_a_sse2(int16_t* outputVector, const float* inputVector, ++- const float scalar, unsigned int num_points) +++static inline void volk_32f_s32f_convert_16i_a_sse2(int16_t* outputVector, +++ const float* inputVector, +++ const float scalar, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- ++- const unsigned int eighthPoints = num_points / 8; ++- ++- const float* inputVectorPtr = (const float*)inputVector; ++- int16_t* outputVectorPtr = outputVector; ++- ++- float min_val = SHRT_MIN; ++- float max_val = SHRT_MAX; ++- float r; ++- ++- __m128 vScalar = _mm_set_ps1(scalar); ++- __m128 inputVal1, inputVal2; ++- __m128i intInputVal1, intInputVal2; ++- __m128 ret1, ret2; ++- __m128 vmin_val = _mm_set_ps1(min_val); ++- __m128 vmax_val = _mm_set_ps1(max_val); ++- ++- for(;number < eighthPoints; number++){ ++- inputVal1 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; ++- inputVal2 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; ++- ++- // Scale and clip ++- ret1 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val); ++- ret2 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal2, vScalar), vmax_val), vmin_val); ++- ++- intInputVal1 = _mm_cvtps_epi32(ret1); ++- intInputVal2 = _mm_cvtps_epi32(ret2); ++- ++- intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); ++- ++- _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1); ++- outputVectorPtr += 8; ++- } ++- ++- number = eighthPoints * 8; ++- for(; number < num_points; number++){ ++- r = inputVector[number] * scalar; ++- if(r > max_val) ++- r = max_val; ++- else if(r < min_val) ++- r = min_val; ++- outputVector[number] = (int16_t)rintf(r); ++- } +++ unsigned int number = 0; +++ +++ const unsigned int eighthPoints = num_points / 8; +++ +++ const float* inputVectorPtr = (const float*)inputVector; +++ int16_t* outputVectorPtr = outputVector; +++ +++ float min_val = SHRT_MIN; +++ float max_val = SHRT_MAX; +++ float r; +++ +++ __m128 vScalar = _mm_set_ps1(scalar); +++ __m128 inputVal1, inputVal2; +++ __m128i intInputVal1, intInputVal2; +++ __m128 ret1, ret2; +++ __m128 vmin_val = _mm_set_ps1(min_val); +++ __m128 vmax_val = _mm_set_ps1(max_val); +++ +++ for (; number < eighthPoints; number++) { +++ inputVal1 = _mm_load_ps(inputVectorPtr); +++ inputVectorPtr += 4; +++ inputVal2 = _mm_load_ps(inputVectorPtr); +++ inputVectorPtr += 4; +++ +++ // Scale and clip +++ ret1 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val); +++ ret2 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal2, vScalar), vmax_val), vmin_val); +++ +++ intInputVal1 = _mm_cvtps_epi32(ret1); +++ intInputVal2 = _mm_cvtps_epi32(ret2); +++ +++ intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); +++ +++ _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1); +++ outputVectorPtr += 8; +++ } +++ +++ number = eighthPoints * 8; +++ for (; number < num_points; number++) { +++ r = inputVector[number] * scalar; +++ if (r > max_val) +++ r = max_val; +++ else if (r < min_val) +++ r = 
min_val; +++ outputVector[number] = (int16_t)rintf(r); +++ } ++ } ++ #endif /* LV_HAVE_SSE2 */ ++ ++@@ -493,76 +516,78 @@ volk_32f_s32f_convert_16i_a_sse2(int16_t* outputVector, const float* inputVector ++ #ifdef LV_HAVE_SSE ++ #include ++ ++-static inline void ++-volk_32f_s32f_convert_16i_a_sse(int16_t* outputVector, const float* inputVector, ++- const float scalar, unsigned int num_points) +++static inline void volk_32f_s32f_convert_16i_a_sse(int16_t* outputVector, +++ const float* inputVector, +++ const float scalar, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- ++- const unsigned int quarterPoints = num_points / 4; ++- ++- const float* inputVectorPtr = (const float*)inputVector; ++- int16_t* outputVectorPtr = outputVector; ++- ++- float min_val = SHRT_MIN; ++- float max_val = SHRT_MAX; ++- float r; ++- ++- __m128 vScalar = _mm_set_ps1(scalar); ++- __m128 ret; ++- __m128 vmin_val = _mm_set_ps1(min_val); ++- __m128 vmax_val = _mm_set_ps1(max_val); ++- ++- __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4]; ++- ++- for(;number < quarterPoints; number++){ ++- ret = _mm_load_ps(inputVectorPtr); ++- inputVectorPtr += 4; ++- ++- // Scale and clip ++- ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val); ++- ++- _mm_store_ps(outputFloatBuffer, ret); ++- *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[0]); ++- *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[1]); ++- *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[2]); ++- *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[3]); ++- } ++- ++- number = quarterPoints * 4; ++- for(; number < num_points; number++){ ++- r = inputVector[number] * scalar; ++- if(r > max_val) ++- r = max_val; ++- else if(r < min_val) ++- r = min_val; ++- outputVector[number] = (int16_t)rintf(r); ++- } +++ unsigned int number = 0; +++ +++ const unsigned int quarterPoints = num_points / 4; +++ +++ const float* inputVectorPtr = (const float*)inputVector; +++ int16_t* outputVectorPtr = outputVector; +++ +++ float min_val = SHRT_MIN; +++ float max_val = SHRT_MAX; +++ float r; +++ +++ __m128 vScalar = _mm_set_ps1(scalar); +++ __m128 ret; +++ __m128 vmin_val = _mm_set_ps1(min_val); +++ __m128 vmax_val = _mm_set_ps1(max_val); +++ +++ __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4]; +++ +++ for (; number < quarterPoints; number++) { +++ ret = _mm_load_ps(inputVectorPtr); +++ inputVectorPtr += 4; +++ +++ // Scale and clip +++ ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val); +++ +++ _mm_store_ps(outputFloatBuffer, ret); +++ *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[0]); +++ *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[1]); +++ *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[2]); +++ *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[3]); +++ } +++ +++ number = quarterPoints * 4; +++ for (; number < num_points; number++) { +++ r = inputVector[number] * scalar; +++ if (r > max_val) +++ r = max_val; +++ else if (r < min_val) +++ r = min_val; +++ outputVector[number] = (int16_t)rintf(r); +++ } ++ } ++ #endif /* LV_HAVE_SSE */ ++ ++ ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void ++-volk_32f_s32f_convert_16i_a_generic(int16_t* outputVector, const float* inputVector, ++- const float scalar, unsigned int num_points) +++static inline void volk_32f_s32f_convert_16i_a_generic(int16_t* outputVector, +++ const float* inputVector, +++ const float scalar, +++ unsigned int num_points) ++ { ++- int16_t* outputVectorPtr = outputVector; ++- const float* inputVectorPtr = 
inputVector; ++- unsigned int number = 0; ++- float min_val = SHRT_MIN; ++- float max_val = SHRT_MAX; ++- float r; ++- ++- for(number = 0; number < num_points; number++){ ++- r = *inputVectorPtr++ * scalar; ++- if(r < min_val) ++- r = min_val; ++- else if(r > max_val) ++- r = max_val; ++- *outputVectorPtr++ = (int16_t)rintf(r); ++- } +++ int16_t* outputVectorPtr = outputVector; +++ const float* inputVectorPtr = inputVector; +++ unsigned int number = 0; +++ float min_val = SHRT_MIN; +++ float max_val = SHRT_MAX; +++ float r; +++ +++ for (number = 0; number < num_points; number++) { +++ r = *inputVectorPtr++ * scalar; +++ if (r < min_val) +++ r = min_val; +++ else if (r > max_val) +++ r = max_val; +++ *outputVectorPtr++ = (int16_t)rintf(r); +++ } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++diff --git a/kernels/volk/volk_32f_s32f_convert_32i.h b/kernels/volk/volk_32f_s32f_convert_32i.h ++index d2a65a0..d5f7cd4 100644 ++--- a/kernels/volk/volk_32f_s32f_convert_32i.h +++++ b/kernels/volk/volk_32f_s32f_convert_32i.h ++@@ -30,8 +30,8 @@ ++ * ++ * Dispatcher Prototype ++ * \code ++- * void volk_32f_s32f_convert_32i(int32_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points) ++- * \endcode +++ * void volk_32f_s32f_convert_32i(int32_t* outputVector, const float* inputVector, const +++ * float scalar, unsigned int num_points) \endcode ++ * ++ * \b Inputs ++ * \li inputVector: the input vector of floats. ++@@ -77,46 +77,49 @@ ++ #ifdef LV_HAVE_AVX ++ #include ++ ++-static inline void ++-volk_32f_s32f_convert_32i_u_avx(int32_t* outputVector, const float* inputVector, ++- const float scalar, unsigned int num_points) +++static inline void volk_32f_s32f_convert_32i_u_avx(int32_t* outputVector, +++ const float* inputVector, +++ const float scalar, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- ++- const unsigned int eighthPoints = num_points / 8; ++- ++- const float* inputVectorPtr = (const float*)inputVector; ++- int32_t* outputVectorPtr = outputVector; ++- ++- float min_val = INT_MIN; ++- float max_val = INT_MAX; ++- float r; ++- ++- __m256 vScalar = _mm256_set1_ps(scalar); ++- __m256 inputVal1; ++- __m256i intInputVal1; ++- __m256 vmin_val = _mm256_set1_ps(min_val); ++- __m256 vmax_val = _mm256_set1_ps(max_val); ++- ++- for(;number < eighthPoints; number++){ ++- inputVal1 = _mm256_loadu_ps(inputVectorPtr); inputVectorPtr += 8; ++- ++- inputVal1 = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal1, vScalar), vmax_val), vmin_val); ++- intInputVal1 = _mm256_cvtps_epi32(inputVal1); ++- ++- _mm256_storeu_si256((__m256i*)outputVectorPtr, intInputVal1); ++- outputVectorPtr += 8; ++- } ++- ++- number = eighthPoints * 8; ++- for(; number < num_points; number++){ ++- r = inputVector[number] * scalar; ++- if(r > max_val) ++- r = max_val; ++- else if(r < min_val) ++- r = min_val; ++- outputVector[number] = (int32_t)rintf(r); ++- } +++ unsigned int number = 0; +++ +++ const unsigned int eighthPoints = num_points / 8; +++ +++ const float* inputVectorPtr = (const float*)inputVector; +++ int32_t* outputVectorPtr = outputVector; +++ +++ float min_val = INT_MIN; +++ float max_val = INT_MAX; +++ float r; +++ +++ __m256 vScalar = _mm256_set1_ps(scalar); +++ __m256 inputVal1; +++ __m256i intInputVal1; +++ __m256 vmin_val = _mm256_set1_ps(min_val); +++ __m256 vmax_val = _mm256_set1_ps(max_val); +++ +++ for (; number < eighthPoints; number++) { +++ inputVal1 = _mm256_loadu_ps(inputVectorPtr); +++ inputVectorPtr += 8; +++ +++ inputVal1 = _mm256_max_ps( +++ 
_mm256_min_ps(_mm256_mul_ps(inputVal1, vScalar), vmax_val), vmin_val); +++ intInputVal1 = _mm256_cvtps_epi32(inputVal1); +++ +++ _mm256_storeu_si256((__m256i*)outputVectorPtr, intInputVal1); +++ outputVectorPtr += 8; +++ } +++ +++ number = eighthPoints * 8; +++ for (; number < num_points; number++) { +++ r = inputVector[number] * scalar; +++ if (r > max_val) +++ r = max_val; +++ else if (r < min_val) +++ r = min_val; +++ outputVector[number] = (int32_t)rintf(r); +++ } ++ } ++ ++ #endif /* LV_HAVE_AVX */ ++@@ -124,46 +127,49 @@ volk_32f_s32f_convert_32i_u_avx(int32_t* outputVector, const float* inputVector, ++ #ifdef LV_HAVE_SSE2 ++ #include ++ ++-static inline void ++-volk_32f_s32f_convert_32i_u_sse2(int32_t* outputVector, const float* inputVector, ++- const float scalar, unsigned int num_points) +++static inline void volk_32f_s32f_convert_32i_u_sse2(int32_t* outputVector, +++ const float* inputVector, +++ const float scalar, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- ++- const unsigned int quarterPoints = num_points / 4; ++- ++- const float* inputVectorPtr = (const float*)inputVector; ++- int32_t* outputVectorPtr = outputVector; ++- ++- float min_val = INT_MIN; ++- float max_val = INT_MAX; ++- float r; ++- ++- __m128 vScalar = _mm_set_ps1(scalar); ++- __m128 inputVal1; ++- __m128i intInputVal1; ++- __m128 vmin_val = _mm_set_ps1(min_val); ++- __m128 vmax_val = _mm_set_ps1(max_val); ++- ++- for(;number < quarterPoints; number++){ ++- inputVal1 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4; ++- ++- inputVal1 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val); ++- intInputVal1 = _mm_cvtps_epi32(inputVal1); ++- ++- _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1); ++- outputVectorPtr += 4; ++- } ++- ++- number = quarterPoints * 4; ++- for(; number < num_points; number++){ ++- r = inputVector[number] * scalar; ++- if(r > max_val) ++- r = max_val; ++- else if(r < min_val) ++- r = min_val; ++- outputVector[number] = (int32_t)rintf(r); ++- } +++ unsigned int number = 0; +++ +++ const unsigned int quarterPoints = num_points / 4; +++ +++ const float* inputVectorPtr = (const float*)inputVector; +++ int32_t* outputVectorPtr = outputVector; +++ +++ float min_val = INT_MIN; +++ float max_val = INT_MAX; +++ float r; +++ +++ __m128 vScalar = _mm_set_ps1(scalar); +++ __m128 inputVal1; +++ __m128i intInputVal1; +++ __m128 vmin_val = _mm_set_ps1(min_val); +++ __m128 vmax_val = _mm_set_ps1(max_val); +++ +++ for (; number < quarterPoints; number++) { +++ inputVal1 = _mm_loadu_ps(inputVectorPtr); +++ inputVectorPtr += 4; +++ +++ inputVal1 = +++ _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val); +++ intInputVal1 = _mm_cvtps_epi32(inputVal1); +++ +++ _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1); +++ outputVectorPtr += 4; +++ } +++ +++ number = quarterPoints * 4; +++ for (; number < num_points; number++) { +++ r = inputVector[number] * scalar; +++ if (r > max_val) +++ r = max_val; +++ else if (r < min_val) +++ r = min_val; +++ outputVector[number] = (int32_t)rintf(r); +++ } ++ } ++ ++ #endif /* LV_HAVE_SSE2 */ ++@@ -172,50 +178,51 @@ volk_32f_s32f_convert_32i_u_sse2(int32_t* outputVector, const float* inputVector ++ #ifdef LV_HAVE_SSE ++ #include ++ ++-static inline void ++-volk_32f_s32f_convert_32i_u_sse(int32_t* outputVector, const float* inputVector, ++- const float scalar, unsigned int num_points) +++static inline void volk_32f_s32f_convert_32i_u_sse(int32_t* outputVector, +++ const float* inputVector, +++ 
const float scalar, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- ++- const unsigned int quarterPoints = num_points / 4; ++- ++- const float* inputVectorPtr = (const float*)inputVector; ++- int32_t* outputVectorPtr = outputVector; ++- ++- float min_val = INT_MIN; ++- float max_val = INT_MAX; ++- float r; ++- ++- __m128 vScalar = _mm_set_ps1(scalar); ++- __m128 ret; ++- __m128 vmin_val = _mm_set_ps1(min_val); ++- __m128 vmax_val = _mm_set_ps1(max_val); ++- ++- __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4]; ++- ++- for(;number < quarterPoints; number++){ ++- ret = _mm_loadu_ps(inputVectorPtr); ++- inputVectorPtr += 4; ++- ++- ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val); ++- ++- _mm_store_ps(outputFloatBuffer, ret); ++- *outputVectorPtr++ = (int32_t)rintf(outputFloatBuffer[0]); ++- *outputVectorPtr++ = (int32_t)rintf(outputFloatBuffer[1]); ++- *outputVectorPtr++ = (int32_t)rintf(outputFloatBuffer[2]); ++- *outputVectorPtr++ = (int32_t)rintf(outputFloatBuffer[3]); ++- } ++- ++- number = quarterPoints * 4; ++- for(; number < num_points; number++){ ++- r = inputVector[number] * scalar; ++- if(r > max_val) ++- r = max_val; ++- else if(r < min_val) ++- r = min_val; ++- outputVector[number] = (int32_t)rintf(r); ++- } +++ unsigned int number = 0; +++ +++ const unsigned int quarterPoints = num_points / 4; +++ +++ const float* inputVectorPtr = (const float*)inputVector; +++ int32_t* outputVectorPtr = outputVector; +++ +++ float min_val = INT_MIN; +++ float max_val = INT_MAX; +++ float r; +++ +++ __m128 vScalar = _mm_set_ps1(scalar); +++ __m128 ret; +++ __m128 vmin_val = _mm_set_ps1(min_val); +++ __m128 vmax_val = _mm_set_ps1(max_val); +++ +++ __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4]; +++ +++ for (; number < quarterPoints; number++) { +++ ret = _mm_loadu_ps(inputVectorPtr); +++ inputVectorPtr += 4; +++ +++ ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val); +++ +++ _mm_store_ps(outputFloatBuffer, ret); +++ *outputVectorPtr++ = (int32_t)rintf(outputFloatBuffer[0]); +++ *outputVectorPtr++ = (int32_t)rintf(outputFloatBuffer[1]); +++ *outputVectorPtr++ = (int32_t)rintf(outputFloatBuffer[2]); +++ *outputVectorPtr++ = (int32_t)rintf(outputFloatBuffer[3]); +++ } +++ +++ number = quarterPoints * 4; +++ for (; number < num_points; number++) { +++ r = inputVector[number] * scalar; +++ if (r > max_val) +++ r = max_val; +++ else if (r < min_val) +++ r = min_val; +++ outputVector[number] = (int32_t)rintf(r); +++ } ++ } ++ ++ #endif /* LV_HAVE_SSE */ ++@@ -223,82 +230,85 @@ volk_32f_s32f_convert_32i_u_sse(int32_t* outputVector, const float* inputVector, ++ ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void ++-volk_32f_s32f_convert_32i_generic(int32_t* outputVector, const float* inputVector, ++- const float scalar, unsigned int num_points) +++static inline void volk_32f_s32f_convert_32i_generic(int32_t* outputVector, +++ const float* inputVector, +++ const float scalar, +++ unsigned int num_points) ++ { ++- int32_t* outputVectorPtr = outputVector; ++- const float* inputVectorPtr = inputVector; ++- unsigned int number = 0; ++- float min_val = INT_MIN; ++- float max_val = INT_MAX; ++- float r; ++- ++- for(number = 0; number < num_points; number++){ ++- r = *inputVectorPtr++ * scalar; ++- if(r > max_val) ++- r = max_val; ++- else if(r < min_val) ++- r = min_val; ++- *outputVectorPtr++ = (int32_t)rintf(r); ++- } +++ int32_t* outputVectorPtr = outputVector; +++ const float* inputVectorPtr = inputVector; +++ unsigned int number = 0; 
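[Editorial note, not part of the upstream diff] For reference, a hypothetical usage sketch of the volk_32f_s32f_convert_32i dispatcher whose kernels this file reformats, following the volk_get_alignment() / volk_malloc() pattern already used by the doc-comment examples:

#include <volk/volk.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
    const unsigned int N = 8;
    unsigned int alignment = volk_get_alignment();
    float* in = (float*)volk_malloc(sizeof(float) * N, alignment);
    int32_t* out = (int32_t*)volk_malloc(sizeof(int32_t) * N, alignment);

    for (unsigned int i = 0; i < N; i++)
        in[i] = 2.f * ((float)i / (float)N) - 1.f; /* ramp from -1 toward 1 */

    /* Scale by 2^31 - 1 so full scale maps near the int32 range limits. */
    volk_32f_s32f_convert_32i(out, in, 2147483647.f, N);

    for (unsigned int i = 0; i < N; i++)
        printf("%+f -> %d\n", in[i], out[i]);

    volk_free(in);
    volk_free(out);
    return 0;
}
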
+++ float min_val = INT_MIN; +++ float max_val = INT_MAX; +++ float r; +++ +++ for (number = 0; number < num_points; number++) { +++ r = *inputVectorPtr++ * scalar; +++ if (r > max_val) +++ r = max_val; +++ else if (r < min_val) +++ r = min_val; +++ *outputVectorPtr++ = (int32_t)rintf(r); +++ } ++ } ++ ++ #endif /* LV_HAVE_GENERIC */ ++ ++ ++- ++ #endif /* INCLUDED_volk_32f_s32f_convert_32i_u_H */ ++ #ifndef INCLUDED_volk_32f_s32f_convert_32i_a_H ++ #define INCLUDED_volk_32f_s32f_convert_32i_a_H ++ ++-#include ++ #include ++ #include +++#include ++ ++ #ifdef LV_HAVE_AVX ++ #include ++ ++-static inline void ++-volk_32f_s32f_convert_32i_a_avx(int32_t* outputVector, const float* inputVector, ++- const float scalar, unsigned int num_points) +++static inline void volk_32f_s32f_convert_32i_a_avx(int32_t* outputVector, +++ const float* inputVector, +++ const float scalar, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- ++- const unsigned int eighthPoints = num_points / 8; ++- ++- const float* inputVectorPtr = (const float*)inputVector; ++- int32_t* outputVectorPtr = outputVector; ++- ++- float min_val = INT_MIN; ++- float max_val = INT_MAX; ++- float r; ++- ++- __m256 vScalar = _mm256_set1_ps(scalar); ++- __m256 inputVal1; ++- __m256i intInputVal1; ++- __m256 vmin_val = _mm256_set1_ps(min_val); ++- __m256 vmax_val = _mm256_set1_ps(max_val); ++- ++- for(;number < eighthPoints; number++){ ++- inputVal1 = _mm256_load_ps(inputVectorPtr); inputVectorPtr += 8; ++- ++- inputVal1 = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal1, vScalar), vmax_val), vmin_val); ++- intInputVal1 = _mm256_cvtps_epi32(inputVal1); ++- ++- _mm256_store_si256((__m256i*)outputVectorPtr, intInputVal1); ++- outputVectorPtr += 8; ++- } ++- ++- number = eighthPoints * 8; ++- for(; number < num_points; number++){ ++- r = inputVector[number] * scalar; ++- if(r > max_val) ++- r = max_val; ++- else if(r < min_val) ++- r = min_val; ++- outputVector[number] = (int32_t)rintf(r); ++- } +++ unsigned int number = 0; +++ +++ const unsigned int eighthPoints = num_points / 8; +++ +++ const float* inputVectorPtr = (const float*)inputVector; +++ int32_t* outputVectorPtr = outputVector; +++ +++ float min_val = INT_MIN; +++ float max_val = INT_MAX; +++ float r; +++ +++ __m256 vScalar = _mm256_set1_ps(scalar); +++ __m256 inputVal1; +++ __m256i intInputVal1; +++ __m256 vmin_val = _mm256_set1_ps(min_val); +++ __m256 vmax_val = _mm256_set1_ps(max_val); +++ +++ for (; number < eighthPoints; number++) { +++ inputVal1 = _mm256_load_ps(inputVectorPtr); +++ inputVectorPtr += 8; +++ +++ inputVal1 = _mm256_max_ps( +++ _mm256_min_ps(_mm256_mul_ps(inputVal1, vScalar), vmax_val), vmin_val); +++ intInputVal1 = _mm256_cvtps_epi32(inputVal1); +++ +++ _mm256_store_si256((__m256i*)outputVectorPtr, intInputVal1); +++ outputVectorPtr += 8; +++ } +++ +++ number = eighthPoints * 8; +++ for (; number < num_points; number++) { +++ r = inputVector[number] * scalar; +++ if (r > max_val) +++ r = max_val; +++ else if (r < min_val) +++ r = min_val; +++ outputVector[number] = (int32_t)rintf(r); +++ } ++ } ++ ++ #endif /* LV_HAVE_AVX */ ++@@ -307,46 +317,49 @@ volk_32f_s32f_convert_32i_a_avx(int32_t* outputVector, const float* inputVector, ++ #ifdef LV_HAVE_SSE2 ++ #include ++ ++-static inline void ++-volk_32f_s32f_convert_32i_a_sse2(int32_t* outputVector, const float* inputVector, ++- const float scalar, unsigned int num_points) +++static inline void volk_32f_s32f_convert_32i_a_sse2(int32_t* outputVector, +++ const float* inputVector, +++ const float 
scalar, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- ++- const unsigned int quarterPoints = num_points / 4; ++- ++- const float* inputVectorPtr = (const float*)inputVector; ++- int32_t* outputVectorPtr = outputVector; ++- ++- float min_val = INT_MIN; ++- float max_val = INT_MAX; ++- float r; ++- ++- __m128 vScalar = _mm_set_ps1(scalar); ++- __m128 inputVal1; ++- __m128i intInputVal1; ++- __m128 vmin_val = _mm_set_ps1(min_val); ++- __m128 vmax_val = _mm_set_ps1(max_val); ++- ++- for(;number < quarterPoints; number++){ ++- inputVal1 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; ++- ++- inputVal1 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val); ++- intInputVal1 = _mm_cvtps_epi32(inputVal1); ++- ++- _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1); ++- outputVectorPtr += 4; ++- } ++- ++- number = quarterPoints * 4; ++- for(; number < num_points; number++){ ++- r = inputVector[number] * scalar; ++- if(r > max_val) ++- r = max_val; ++- else if(r < min_val) ++- r = min_val; ++- outputVector[number] = (int32_t)rintf(r); ++- } +++ unsigned int number = 0; +++ +++ const unsigned int quarterPoints = num_points / 4; +++ +++ const float* inputVectorPtr = (const float*)inputVector; +++ int32_t* outputVectorPtr = outputVector; +++ +++ float min_val = INT_MIN; +++ float max_val = INT_MAX; +++ float r; +++ +++ __m128 vScalar = _mm_set_ps1(scalar); +++ __m128 inputVal1; +++ __m128i intInputVal1; +++ __m128 vmin_val = _mm_set_ps1(min_val); +++ __m128 vmax_val = _mm_set_ps1(max_val); +++ +++ for (; number < quarterPoints; number++) { +++ inputVal1 = _mm_load_ps(inputVectorPtr); +++ inputVectorPtr += 4; +++ +++ inputVal1 = +++ _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val); +++ intInputVal1 = _mm_cvtps_epi32(inputVal1); +++ +++ _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1); +++ outputVectorPtr += 4; +++ } +++ +++ number = quarterPoints * 4; +++ for (; number < num_points; number++) { +++ r = inputVector[number] * scalar; +++ if (r > max_val) +++ r = max_val; +++ else if (r < min_val) +++ r = min_val; +++ outputVector[number] = (int32_t)rintf(r); +++ } ++ } ++ ++ #endif /* LV_HAVE_SSE2 */ ++@@ -355,50 +368,51 @@ volk_32f_s32f_convert_32i_a_sse2(int32_t* outputVector, const float* inputVector ++ #ifdef LV_HAVE_SSE ++ #include ++ ++-static inline void ++-volk_32f_s32f_convert_32i_a_sse(int32_t* outputVector, const float* inputVector, ++- const float scalar, unsigned int num_points) +++static inline void volk_32f_s32f_convert_32i_a_sse(int32_t* outputVector, +++ const float* inputVector, +++ const float scalar, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- ++- const unsigned int quarterPoints = num_points / 4; ++- ++- const float* inputVectorPtr = (const float*)inputVector; ++- int32_t* outputVectorPtr = outputVector; ++- ++- float min_val = INT_MIN; ++- float max_val = INT_MAX; ++- float r; ++- ++- __m128 vScalar = _mm_set_ps1(scalar); ++- __m128 ret; ++- __m128 vmin_val = _mm_set_ps1(min_val); ++- __m128 vmax_val = _mm_set_ps1(max_val); ++- ++- __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4]; ++- ++- for(;number < quarterPoints; number++){ ++- ret = _mm_load_ps(inputVectorPtr); ++- inputVectorPtr += 4; ++- ++- ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val); ++- ++- _mm_store_ps(outputFloatBuffer, ret); ++- *outputVectorPtr++ = (int32_t)rintf(outputFloatBuffer[0]); ++- *outputVectorPtr++ = (int32_t)rintf(outputFloatBuffer[1]); ++- *outputVectorPtr++ = 
(int32_t)rintf(outputFloatBuffer[2]); ++- *outputVectorPtr++ = (int32_t)rintf(outputFloatBuffer[3]); ++- } ++- ++- number = quarterPoints * 4; ++- for(; number < num_points; number++){ ++- r = inputVector[number] * scalar; ++- if(r > max_val) ++- r = max_val; ++- else if(r < min_val) ++- r = min_val; ++- outputVector[number] = (int32_t)rintf(r); ++- } +++ unsigned int number = 0; +++ +++ const unsigned int quarterPoints = num_points / 4; +++ +++ const float* inputVectorPtr = (const float*)inputVector; +++ int32_t* outputVectorPtr = outputVector; +++ +++ float min_val = INT_MIN; +++ float max_val = INT_MAX; +++ float r; +++ +++ __m128 vScalar = _mm_set_ps1(scalar); +++ __m128 ret; +++ __m128 vmin_val = _mm_set_ps1(min_val); +++ __m128 vmax_val = _mm_set_ps1(max_val); +++ +++ __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4]; +++ +++ for (; number < quarterPoints; number++) { +++ ret = _mm_load_ps(inputVectorPtr); +++ inputVectorPtr += 4; +++ +++ ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val); +++ +++ _mm_store_ps(outputFloatBuffer, ret); +++ *outputVectorPtr++ = (int32_t)rintf(outputFloatBuffer[0]); +++ *outputVectorPtr++ = (int32_t)rintf(outputFloatBuffer[1]); +++ *outputVectorPtr++ = (int32_t)rintf(outputFloatBuffer[2]); +++ *outputVectorPtr++ = (int32_t)rintf(outputFloatBuffer[3]); +++ } +++ +++ number = quarterPoints * 4; +++ for (; number < num_points; number++) { +++ r = inputVector[number] * scalar; +++ if (r > max_val) +++ r = max_val; +++ else if (r < min_val) +++ r = min_val; +++ outputVector[number] = (int32_t)rintf(r); +++ } ++ } ++ ++ #endif /* LV_HAVE_SSE */ ++@@ -406,25 +420,26 @@ volk_32f_s32f_convert_32i_a_sse(int32_t* outputVector, const float* inputVector, ++ ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void ++-volk_32f_s32f_convert_32i_a_generic(int32_t* outputVector, const float* inputVector, ++- const float scalar, unsigned int num_points) +++static inline void volk_32f_s32f_convert_32i_a_generic(int32_t* outputVector, +++ const float* inputVector, +++ const float scalar, +++ unsigned int num_points) ++ { ++- int32_t* outputVectorPtr = outputVector; ++- const float* inputVectorPtr = inputVector; ++- unsigned int number = 0; ++- float min_val = INT_MIN; ++- float max_val = INT_MAX; ++- float r; ++- ++- for(number = 0; number < num_points; number++){ ++- r = *inputVectorPtr++ * scalar; ++- if(r > max_val) ++- r = max_val; ++- else if(r < min_val) ++- r = min_val; ++- *outputVectorPtr++ = (int32_t)rintf(r); ++- } +++ int32_t* outputVectorPtr = outputVector; +++ const float* inputVectorPtr = inputVector; +++ unsigned int number = 0; +++ float min_val = INT_MIN; +++ float max_val = INT_MAX; +++ float r; +++ +++ for (number = 0; number < num_points; number++) { +++ r = *inputVectorPtr++ * scalar; +++ if (r > max_val) +++ r = max_val; +++ else if (r < min_val) +++ r = min_val; +++ *outputVectorPtr++ = (int32_t)rintf(r); +++ } ++ } ++ ++ #endif /* LV_HAVE_GENERIC */ ++diff --git a/kernels/volk/volk_32f_s32f_convert_8i.h b/kernels/volk/volk_32f_s32f_convert_8i.h ++index 2a1669c..242c3bd 100644 ++--- a/kernels/volk/volk_32f_s32f_convert_8i.h +++++ b/kernels/volk/volk_32f_s32f_convert_8i.h ++@@ -30,7 +30,8 @@ ++ * ++ * Dispatcher Prototype ++ * \code ++- * void volk_32f_s32f_convert_8i(int8_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points) +++ * void volk_32f_s32f_convert_8i(int8_t* outputVector, const float* inputVector, const +++ float scalar, unsigned int num_points) ++ * \endcode ++ * ++ * \b Inputs ++@@ 
-42,7 +43,8 @@ ++ * \li outputVector: The output vector. ++ * ++ * \b Example ++- * Convert floats from [-1,1] to 16-bit integers with a scale of 5 to maintain smallest delta +++ * Convert floats from [-1,1] to 16-bit integers with a scale of 5 to maintain smallest +++ delta ++ * int N = 10; ++ * unsigned int alignment = volk_get_alignment(); ++ * float* increasing = (float*)volk_malloc(sizeof(float)*N, alignment); ++@@ -74,77 +76,86 @@ ++ #include ++ #include ++ ++-static inline void ++-volk_32f_s32f_convert_8i_single(int8_t* out, const float in){ ++- float min_val = CHAR_MIN; ++- float max_val = CHAR_MAX; ++- if(in > max_val){ ++- *out = (int8_t)(max_val); ++- }else if(in < min_val){ ++- *out = (int8_t)(min_val); ++- }else{ ++- *out = (int8_t)(rintf(in)); ++- } +++static inline void volk_32f_s32f_convert_8i_single(int8_t* out, const float in) +++{ +++ float min_val = CHAR_MIN; +++ float max_val = CHAR_MAX; +++ if (in > max_val) { +++ *out = (int8_t)(max_val); +++ } else if (in < min_val) { +++ *out = (int8_t)(min_val); +++ } else { +++ *out = (int8_t)(rintf(in)); +++ } ++ } ++ ++ #ifdef LV_HAVE_AVX2 ++ #include ++ ++-static inline void ++-volk_32f_s32f_convert_8i_u_avx2(int8_t* outputVector, const float* inputVector, ++- const float scalar, unsigned int num_points) +++static inline void volk_32f_s32f_convert_8i_u_avx2(int8_t* outputVector, +++ const float* inputVector, +++ const float scalar, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- ++- const unsigned int thirtysecondPoints = num_points / 32; ++- ++- const float* inputVectorPtr = (const float*)inputVector; ++- int8_t* outputVectorPtr = outputVector; ++- ++- float min_val = CHAR_MIN; ++- float max_val = CHAR_MAX; ++- float r; ++- ++- __m256 vScalar = _mm256_set1_ps(scalar); ++- __m256 inputVal1, inputVal2, inputVal3, inputVal4; ++- __m256i intInputVal1, intInputVal2, intInputVal3, intInputVal4; ++- __m256 vmin_val = _mm256_set1_ps(min_val); ++- __m256 vmax_val = _mm256_set1_ps(max_val); ++- __m256i intInputVal; ++- ++- for(;number < thirtysecondPoints; number++){ ++- inputVal1 = _mm256_loadu_ps(inputVectorPtr); inputVectorPtr += 8; ++- inputVal2 = _mm256_loadu_ps(inputVectorPtr); inputVectorPtr += 8; ++- inputVal3 = _mm256_loadu_ps(inputVectorPtr); inputVectorPtr += 8; ++- inputVal4 = _mm256_loadu_ps(inputVectorPtr); inputVectorPtr += 8; ++- ++- inputVal1 = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal1, vScalar), vmax_val), vmin_val); ++- inputVal2 = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal2, vScalar), vmax_val), vmin_val); ++- inputVal3 = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal3, vScalar), vmax_val), vmin_val); ++- inputVal4 = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal4, vScalar), vmax_val), vmin_val); ++- ++- intInputVal1 = _mm256_cvtps_epi32(inputVal1); ++- intInputVal2 = _mm256_cvtps_epi32(inputVal2); ++- intInputVal3 = _mm256_cvtps_epi32(inputVal3); ++- intInputVal4 = _mm256_cvtps_epi32(inputVal4); ++- ++- intInputVal1 = _mm256_packs_epi32(intInputVal1, intInputVal2); ++- intInputVal1 = _mm256_permute4x64_epi64(intInputVal1, 0b11011000); ++- intInputVal3 = _mm256_packs_epi32(intInputVal3, intInputVal4); ++- intInputVal3 = _mm256_permute4x64_epi64(intInputVal3, 0b11011000); ++- ++- intInputVal1 = _mm256_packs_epi16(intInputVal1, intInputVal3); ++- intInputVal = _mm256_permute4x64_epi64(intInputVal1, 0b11011000); ++- ++- _mm256_storeu_si256((__m256i*)outputVectorPtr, intInputVal); ++- outputVectorPtr += 32; ++- } ++- ++- number = thirtysecondPoints * 32; ++- for(; number < 
num_points; number++){ ++- r = inputVector[number] * scalar; ++- volk_32f_s32f_convert_8i_single(&outputVector[number], r); ++- } +++ unsigned int number = 0; +++ +++ const unsigned int thirtysecondPoints = num_points / 32; +++ +++ const float* inputVectorPtr = (const float*)inputVector; +++ int8_t* outputVectorPtr = outputVector; +++ +++ float min_val = CHAR_MIN; +++ float max_val = CHAR_MAX; +++ float r; +++ +++ __m256 vScalar = _mm256_set1_ps(scalar); +++ __m256 inputVal1, inputVal2, inputVal3, inputVal4; +++ __m256i intInputVal1, intInputVal2, intInputVal3, intInputVal4; +++ __m256 vmin_val = _mm256_set1_ps(min_val); +++ __m256 vmax_val = _mm256_set1_ps(max_val); +++ __m256i intInputVal; +++ +++ for (; number < thirtysecondPoints; number++) { +++ inputVal1 = _mm256_loadu_ps(inputVectorPtr); +++ inputVectorPtr += 8; +++ inputVal2 = _mm256_loadu_ps(inputVectorPtr); +++ inputVectorPtr += 8; +++ inputVal3 = _mm256_loadu_ps(inputVectorPtr); +++ inputVectorPtr += 8; +++ inputVal4 = _mm256_loadu_ps(inputVectorPtr); +++ inputVectorPtr += 8; +++ +++ inputVal1 = _mm256_max_ps( +++ _mm256_min_ps(_mm256_mul_ps(inputVal1, vScalar), vmax_val), vmin_val); +++ inputVal2 = _mm256_max_ps( +++ _mm256_min_ps(_mm256_mul_ps(inputVal2, vScalar), vmax_val), vmin_val); +++ inputVal3 = _mm256_max_ps( +++ _mm256_min_ps(_mm256_mul_ps(inputVal3, vScalar), vmax_val), vmin_val); +++ inputVal4 = _mm256_max_ps( +++ _mm256_min_ps(_mm256_mul_ps(inputVal4, vScalar), vmax_val), vmin_val); +++ +++ intInputVal1 = _mm256_cvtps_epi32(inputVal1); +++ intInputVal2 = _mm256_cvtps_epi32(inputVal2); +++ intInputVal3 = _mm256_cvtps_epi32(inputVal3); +++ intInputVal4 = _mm256_cvtps_epi32(inputVal4); +++ +++ intInputVal1 = _mm256_packs_epi32(intInputVal1, intInputVal2); +++ intInputVal1 = _mm256_permute4x64_epi64(intInputVal1, 0b11011000); +++ intInputVal3 = _mm256_packs_epi32(intInputVal3, intInputVal4); +++ intInputVal3 = _mm256_permute4x64_epi64(intInputVal3, 0b11011000); +++ +++ intInputVal1 = _mm256_packs_epi16(intInputVal1, intInputVal3); +++ intInputVal = _mm256_permute4x64_epi64(intInputVal1, 0b11011000); +++ +++ _mm256_storeu_si256((__m256i*)outputVectorPtr, intInputVal); +++ outputVectorPtr += 32; +++ } +++ +++ number = thirtysecondPoints * 32; +++ for (; number < num_points; number++) { +++ r = inputVector[number] * scalar; +++ volk_32f_s32f_convert_8i_single(&outputVector[number], r); +++ } ++ } ++ ++ #endif /* LV_HAVE_AVX2 */ ++@@ -153,57 +164,66 @@ volk_32f_s32f_convert_8i_u_avx2(int8_t* outputVector, const float* inputVector, ++ #ifdef LV_HAVE_SSE2 ++ #include ++ ++-static inline void ++-volk_32f_s32f_convert_8i_u_sse2(int8_t* outputVector, const float* inputVector, ++- const float scalar, unsigned int num_points) +++static inline void volk_32f_s32f_convert_8i_u_sse2(int8_t* outputVector, +++ const float* inputVector, +++ const float scalar, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- ++- const unsigned int sixteenthPoints = num_points / 16; ++- ++- const float* inputVectorPtr = (const float*)inputVector; ++- int8_t* outputVectorPtr = outputVector; ++- ++- float min_val = CHAR_MIN; ++- float max_val = CHAR_MAX; ++- float r; ++- ++- __m128 vScalar = _mm_set_ps1(scalar); ++- __m128 inputVal1, inputVal2, inputVal3, inputVal4; ++- __m128i intInputVal1, intInputVal2, intInputVal3, intInputVal4; ++- __m128 vmin_val = _mm_set_ps1(min_val); ++- __m128 vmax_val = _mm_set_ps1(max_val); ++- ++- for(;number < sixteenthPoints; number++){ ++- inputVal1 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4; ++- 
inputVal2 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4; ++- inputVal3 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4; ++- inputVal4 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4; ++- ++- inputVal1 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val); ++- inputVal2 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal2, vScalar), vmax_val), vmin_val); ++- inputVal3 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal3, vScalar), vmax_val), vmin_val); ++- inputVal4 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal4, vScalar), vmax_val), vmin_val); ++- ++- intInputVal1 = _mm_cvtps_epi32(inputVal1); ++- intInputVal2 = _mm_cvtps_epi32(inputVal2); ++- intInputVal3 = _mm_cvtps_epi32(inputVal3); ++- intInputVal4 = _mm_cvtps_epi32(inputVal4); ++- ++- intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); ++- intInputVal3 = _mm_packs_epi32(intInputVal3, intInputVal4); ++- ++- intInputVal1 = _mm_packs_epi16(intInputVal1, intInputVal3); ++- ++- _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1); ++- outputVectorPtr += 16; ++- } +++ unsigned int number = 0; +++ +++ const unsigned int sixteenthPoints = num_points / 16; +++ +++ const float* inputVectorPtr = (const float*)inputVector; +++ int8_t* outputVectorPtr = outputVector; +++ +++ float min_val = CHAR_MIN; +++ float max_val = CHAR_MAX; +++ float r; +++ +++ __m128 vScalar = _mm_set_ps1(scalar); +++ __m128 inputVal1, inputVal2, inputVal3, inputVal4; +++ __m128i intInputVal1, intInputVal2, intInputVal3, intInputVal4; +++ __m128 vmin_val = _mm_set_ps1(min_val); +++ __m128 vmax_val = _mm_set_ps1(max_val); +++ +++ for (; number < sixteenthPoints; number++) { +++ inputVal1 = _mm_loadu_ps(inputVectorPtr); +++ inputVectorPtr += 4; +++ inputVal2 = _mm_loadu_ps(inputVectorPtr); +++ inputVectorPtr += 4; +++ inputVal3 = _mm_loadu_ps(inputVectorPtr); +++ inputVectorPtr += 4; +++ inputVal4 = _mm_loadu_ps(inputVectorPtr); +++ inputVectorPtr += 4; +++ +++ inputVal1 = +++ _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val); +++ inputVal2 = +++ _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal2, vScalar), vmax_val), vmin_val); +++ inputVal3 = +++ _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal3, vScalar), vmax_val), vmin_val); +++ inputVal4 = +++ _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal4, vScalar), vmax_val), vmin_val); +++ +++ intInputVal1 = _mm_cvtps_epi32(inputVal1); +++ intInputVal2 = _mm_cvtps_epi32(inputVal2); +++ intInputVal3 = _mm_cvtps_epi32(inputVal3); +++ intInputVal4 = _mm_cvtps_epi32(inputVal4); +++ +++ intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); +++ intInputVal3 = _mm_packs_epi32(intInputVal3, intInputVal4); +++ +++ intInputVal1 = _mm_packs_epi16(intInputVal1, intInputVal3); +++ +++ _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1); +++ outputVectorPtr += 16; +++ } ++ ++- number = sixteenthPoints * 16; ++- for(; number < num_points; number++){ ++- r = inputVector[number] * scalar; ++- volk_32f_s32f_convert_8i_single(&outputVector[number], r); ++- } +++ number = sixteenthPoints * 16; +++ for (; number < num_points; number++) { +++ r = inputVector[number] * scalar; +++ volk_32f_s32f_convert_8i_single(&outputVector[number], r); +++ } ++ } ++ ++ #endif /* LV_HAVE_SSE2 */ ++@@ -212,46 +232,47 @@ volk_32f_s32f_convert_8i_u_sse2(int8_t* outputVector, const float* inputVector, ++ #ifdef LV_HAVE_SSE ++ #include ++ ++-static inline void ++-volk_32f_s32f_convert_8i_u_sse(int8_t* outputVector, const float* inputVector, ++- const float scalar, unsigned int num_points) +++static inline void 
volk_32f_s32f_convert_8i_u_sse(int8_t* outputVector, +++ const float* inputVector, +++ const float scalar, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- size_t inner_loop; +++ unsigned int number = 0; +++ size_t inner_loop; ++ ++- const unsigned int quarterPoints = num_points / 4; +++ const unsigned int quarterPoints = num_points / 4; ++ ++- const float* inputVectorPtr = (const float*)inputVector; ++- int8_t* outputVectorPtr = outputVector; +++ const float* inputVectorPtr = (const float*)inputVector; +++ int8_t* outputVectorPtr = outputVector; ++ ++- float min_val = CHAR_MIN; ++- float max_val = CHAR_MAX; ++- float r; +++ float min_val = CHAR_MIN; +++ float max_val = CHAR_MAX; +++ float r; ++ ++- __m128 vScalar = _mm_set_ps1(scalar); ++- __m128 ret; ++- __m128 vmin_val = _mm_set_ps1(min_val); ++- __m128 vmax_val = _mm_set_ps1(max_val); +++ __m128 vScalar = _mm_set_ps1(scalar); +++ __m128 ret; +++ __m128 vmin_val = _mm_set_ps1(min_val); +++ __m128 vmax_val = _mm_set_ps1(max_val); ++ ++- __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4]; +++ __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4]; ++ ++- for(;number < quarterPoints; number++){ ++- ret = _mm_loadu_ps(inputVectorPtr); ++- inputVectorPtr += 4; +++ for (; number < quarterPoints; number++) { +++ ret = _mm_loadu_ps(inputVectorPtr); +++ inputVectorPtr += 4; ++ ++- ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val); +++ ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val); ++ ++- _mm_store_ps(outputFloatBuffer, ret); ++- for (inner_loop = 0; inner_loop < 4; inner_loop++){ ++- *outputVectorPtr++ = (int8_t)(rintf(outputFloatBuffer[inner_loop])); +++ _mm_store_ps(outputFloatBuffer, ret); +++ for (inner_loop = 0; inner_loop < 4; inner_loop++) { +++ *outputVectorPtr++ = (int8_t)(rintf(outputFloatBuffer[inner_loop])); +++ } ++ } ++- } ++ ++- number = quarterPoints * 4; ++- for(; number < num_points; number++){ ++- r = inputVector[number] * scalar; ++- volk_32f_s32f_convert_8i_single(&outputVector[number], r); ++- } +++ number = quarterPoints * 4; +++ for (; number < num_points; number++) { +++ r = inputVector[number] * scalar; +++ volk_32f_s32f_convert_8i_single(&outputVector[number], r); +++ } ++ } ++ ++ #endif /* LV_HAVE_SSE */ ++@@ -259,18 +280,19 @@ volk_32f_s32f_convert_8i_u_sse(int8_t* outputVector, const float* inputVector, ++ ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void ++-volk_32f_s32f_convert_8i_generic(int8_t* outputVector, const float* inputVector, ++- const float scalar, unsigned int num_points) +++static inline void volk_32f_s32f_convert_8i_generic(int8_t* outputVector, +++ const float* inputVector, +++ const float scalar, +++ unsigned int num_points) ++ { ++- const float* inputVectorPtr = inputVector; ++- unsigned int number = 0; ++- float r; ++- ++- for(number = 0; number < num_points; number++){ ++- r = *inputVectorPtr++ * scalar; ++- volk_32f_s32f_convert_8i_single(&outputVector[number], r); ++- } +++ const float* inputVectorPtr = inputVector; +++ unsigned int number = 0; +++ float r; +++ +++ for (number = 0; number < num_points; number++) { +++ r = *inputVectorPtr++ * scalar; +++ volk_32f_s32f_convert_8i_single(&outputVector[number], r); +++ } ++ } ++ ++ #endif /* LV_HAVE_GENERIC */ ++@@ -280,68 +302,77 @@ volk_32f_s32f_convert_8i_generic(int8_t* outputVector, const float* inputVector, ++ #ifndef INCLUDED_volk_32f_s32f_convert_8i_a_H ++ #define INCLUDED_volk_32f_s32f_convert_8i_a_H ++ ++-#include ++ #include ++ #include +++#include ++ ++ #ifdef 
LV_HAVE_AVX2 ++ #include ++ ++-static inline void ++-volk_32f_s32f_convert_8i_a_avx2(int8_t* outputVector, const float* inputVector, ++- const float scalar, unsigned int num_points) +++static inline void volk_32f_s32f_convert_8i_a_avx2(int8_t* outputVector, +++ const float* inputVector, +++ const float scalar, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- ++- const unsigned int thirtysecondPoints = num_points / 32; ++- ++- const float* inputVectorPtr = (const float*)inputVector; ++- int8_t* outputVectorPtr = outputVector; ++- ++- float min_val = CHAR_MIN; ++- float max_val = CHAR_MAX; ++- float r; ++- ++- __m256 vScalar = _mm256_set1_ps(scalar); ++- __m256 inputVal1, inputVal2, inputVal3, inputVal4; ++- __m256i intInputVal1, intInputVal2, intInputVal3, intInputVal4; ++- __m256 vmin_val = _mm256_set1_ps(min_val); ++- __m256 vmax_val = _mm256_set1_ps(max_val); ++- __m256i intInputVal; ++- ++- for(;number < thirtysecondPoints; number++){ ++- inputVal1 = _mm256_load_ps(inputVectorPtr); inputVectorPtr += 8; ++- inputVal2 = _mm256_load_ps(inputVectorPtr); inputVectorPtr += 8; ++- inputVal3 = _mm256_load_ps(inputVectorPtr); inputVectorPtr += 8; ++- inputVal4 = _mm256_load_ps(inputVectorPtr); inputVectorPtr += 8; ++- ++- inputVal1 = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal1, vScalar), vmax_val), vmin_val); ++- inputVal2 = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal2, vScalar), vmax_val), vmin_val); ++- inputVal3 = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal3, vScalar), vmax_val), vmin_val); ++- inputVal4 = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal4, vScalar), vmax_val), vmin_val); ++- ++- intInputVal1 = _mm256_cvtps_epi32(inputVal1); ++- intInputVal2 = _mm256_cvtps_epi32(inputVal2); ++- intInputVal3 = _mm256_cvtps_epi32(inputVal3); ++- intInputVal4 = _mm256_cvtps_epi32(inputVal4); ++- ++- intInputVal1 = _mm256_packs_epi32(intInputVal1, intInputVal2); ++- intInputVal1 = _mm256_permute4x64_epi64(intInputVal1, 0b11011000); ++- intInputVal3 = _mm256_packs_epi32(intInputVal3, intInputVal4); ++- intInputVal3 = _mm256_permute4x64_epi64(intInputVal3, 0b11011000); ++- ++- intInputVal1 = _mm256_packs_epi16(intInputVal1, intInputVal3); ++- intInputVal = _mm256_permute4x64_epi64(intInputVal1, 0b11011000); ++- ++- _mm256_store_si256((__m256i*)outputVectorPtr, intInputVal); ++- outputVectorPtr += 32; ++- } ++- ++- number = thirtysecondPoints * 32; ++- for(; number < num_points; number++){ ++- r = inputVector[number] * scalar; ++- volk_32f_s32f_convert_8i_single(&outputVector[number], r); ++- } +++ unsigned int number = 0; +++ +++ const unsigned int thirtysecondPoints = num_points / 32; +++ +++ const float* inputVectorPtr = (const float*)inputVector; +++ int8_t* outputVectorPtr = outputVector; +++ +++ float min_val = CHAR_MIN; +++ float max_val = CHAR_MAX; +++ float r; +++ +++ __m256 vScalar = _mm256_set1_ps(scalar); +++ __m256 inputVal1, inputVal2, inputVal3, inputVal4; +++ __m256i intInputVal1, intInputVal2, intInputVal3, intInputVal4; +++ __m256 vmin_val = _mm256_set1_ps(min_val); +++ __m256 vmax_val = _mm256_set1_ps(max_val); +++ __m256i intInputVal; +++ +++ for (; number < thirtysecondPoints; number++) { +++ inputVal1 = _mm256_load_ps(inputVectorPtr); +++ inputVectorPtr += 8; +++ inputVal2 = _mm256_load_ps(inputVectorPtr); +++ inputVectorPtr += 8; +++ inputVal3 = _mm256_load_ps(inputVectorPtr); +++ inputVectorPtr += 8; +++ inputVal4 = _mm256_load_ps(inputVectorPtr); +++ inputVectorPtr += 8; +++ +++ inputVal1 = _mm256_max_ps( +++ 
_mm256_min_ps(_mm256_mul_ps(inputVal1, vScalar), vmax_val), vmin_val); +++ inputVal2 = _mm256_max_ps( +++ _mm256_min_ps(_mm256_mul_ps(inputVal2, vScalar), vmax_val), vmin_val); +++ inputVal3 = _mm256_max_ps( +++ _mm256_min_ps(_mm256_mul_ps(inputVal3, vScalar), vmax_val), vmin_val); +++ inputVal4 = _mm256_max_ps( +++ _mm256_min_ps(_mm256_mul_ps(inputVal4, vScalar), vmax_val), vmin_val); +++ +++ intInputVal1 = _mm256_cvtps_epi32(inputVal1); +++ intInputVal2 = _mm256_cvtps_epi32(inputVal2); +++ intInputVal3 = _mm256_cvtps_epi32(inputVal3); +++ intInputVal4 = _mm256_cvtps_epi32(inputVal4); +++ +++ intInputVal1 = _mm256_packs_epi32(intInputVal1, intInputVal2); +++ intInputVal1 = _mm256_permute4x64_epi64(intInputVal1, 0b11011000); +++ intInputVal3 = _mm256_packs_epi32(intInputVal3, intInputVal4); +++ intInputVal3 = _mm256_permute4x64_epi64(intInputVal3, 0b11011000); +++ +++ intInputVal1 = _mm256_packs_epi16(intInputVal1, intInputVal3); +++ intInputVal = _mm256_permute4x64_epi64(intInputVal1, 0b11011000); +++ +++ _mm256_store_si256((__m256i*)outputVectorPtr, intInputVal); +++ outputVectorPtr += 32; +++ } +++ +++ number = thirtysecondPoints * 32; +++ for (; number < num_points; number++) { +++ r = inputVector[number] * scalar; +++ volk_32f_s32f_convert_8i_single(&outputVector[number], r); +++ } ++ } ++ ++ #endif /* LV_HAVE_AVX2 */ ++@@ -350,57 +381,66 @@ volk_32f_s32f_convert_8i_a_avx2(int8_t* outputVector, const float* inputVector, ++ #ifdef LV_HAVE_SSE2 ++ #include ++ ++-static inline void ++-volk_32f_s32f_convert_8i_a_sse2(int8_t* outputVector, const float* inputVector, ++- const float scalar, unsigned int num_points) +++static inline void volk_32f_s32f_convert_8i_a_sse2(int8_t* outputVector, +++ const float* inputVector, +++ const float scalar, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- ++- const unsigned int sixteenthPoints = num_points / 16; ++- ++- const float* inputVectorPtr = (const float*)inputVector; ++- int8_t* outputVectorPtr = outputVector; ++- ++- float min_val = CHAR_MIN; ++- float max_val = CHAR_MAX; ++- float r; ++- ++- __m128 vScalar = _mm_set_ps1(scalar); ++- __m128 inputVal1, inputVal2, inputVal3, inputVal4; ++- __m128i intInputVal1, intInputVal2, intInputVal3, intInputVal4; ++- __m128 vmin_val = _mm_set_ps1(min_val); ++- __m128 vmax_val = _mm_set_ps1(max_val); ++- ++- for(;number < sixteenthPoints; number++){ ++- inputVal1 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; ++- inputVal2 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; ++- inputVal3 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; ++- inputVal4 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; ++- ++- inputVal1 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val); ++- inputVal2 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal2, vScalar), vmax_val), vmin_val); ++- inputVal3 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal3, vScalar), vmax_val), vmin_val); ++- inputVal4 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal4, vScalar), vmax_val), vmin_val); ++- ++- intInputVal1 = _mm_cvtps_epi32(inputVal1); ++- intInputVal2 = _mm_cvtps_epi32(inputVal2); ++- intInputVal3 = _mm_cvtps_epi32(inputVal3); ++- intInputVal4 = _mm_cvtps_epi32(inputVal4); ++- ++- intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); ++- intInputVal3 = _mm_packs_epi32(intInputVal3, intInputVal4); ++- ++- intInputVal1 = _mm_packs_epi16(intInputVal1, intInputVal3); ++- ++- _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1); ++- outputVectorPtr += 16; ++- } +++ unsigned int number = 0; +++ +++ 
const unsigned int sixteenthPoints = num_points / 16; +++ +++ const float* inputVectorPtr = (const float*)inputVector; +++ int8_t* outputVectorPtr = outputVector; +++ +++ float min_val = CHAR_MIN; +++ float max_val = CHAR_MAX; +++ float r; +++ +++ __m128 vScalar = _mm_set_ps1(scalar); +++ __m128 inputVal1, inputVal2, inputVal3, inputVal4; +++ __m128i intInputVal1, intInputVal2, intInputVal3, intInputVal4; +++ __m128 vmin_val = _mm_set_ps1(min_val); +++ __m128 vmax_val = _mm_set_ps1(max_val); +++ +++ for (; number < sixteenthPoints; number++) { +++ inputVal1 = _mm_load_ps(inputVectorPtr); +++ inputVectorPtr += 4; +++ inputVal2 = _mm_load_ps(inputVectorPtr); +++ inputVectorPtr += 4; +++ inputVal3 = _mm_load_ps(inputVectorPtr); +++ inputVectorPtr += 4; +++ inputVal4 = _mm_load_ps(inputVectorPtr); +++ inputVectorPtr += 4; +++ +++ inputVal1 = +++ _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val); +++ inputVal2 = +++ _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal2, vScalar), vmax_val), vmin_val); +++ inputVal3 = +++ _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal3, vScalar), vmax_val), vmin_val); +++ inputVal4 = +++ _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal4, vScalar), vmax_val), vmin_val); +++ +++ intInputVal1 = _mm_cvtps_epi32(inputVal1); +++ intInputVal2 = _mm_cvtps_epi32(inputVal2); +++ intInputVal3 = _mm_cvtps_epi32(inputVal3); +++ intInputVal4 = _mm_cvtps_epi32(inputVal4); +++ +++ intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); +++ intInputVal3 = _mm_packs_epi32(intInputVal3, intInputVal4); +++ +++ intInputVal1 = _mm_packs_epi16(intInputVal1, intInputVal3); +++ +++ _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1); +++ outputVectorPtr += 16; +++ } ++ ++- number = sixteenthPoints * 16; ++- for(; number < num_points; number++){ ++- r = inputVector[number] * scalar; ++- volk_32f_s32f_convert_8i_single(&outputVector[number], r); ++- } +++ number = sixteenthPoints * 16; +++ for (; number < num_points; number++) { +++ r = inputVector[number] * scalar; +++ volk_32f_s32f_convert_8i_single(&outputVector[number], r); +++ } ++ } ++ #endif /* LV_HAVE_SSE2 */ ++ ++@@ -408,46 +448,47 @@ volk_32f_s32f_convert_8i_a_sse2(int8_t* outputVector, const float* inputVector, ++ #ifdef LV_HAVE_SSE ++ #include ++ ++-static inline void ++-volk_32f_s32f_convert_8i_a_sse(int8_t* outputVector, const float* inputVector, ++- const float scalar, unsigned int num_points) +++static inline void volk_32f_s32f_convert_8i_a_sse(int8_t* outputVector, +++ const float* inputVector, +++ const float scalar, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- size_t inner_loop; +++ unsigned int number = 0; +++ size_t inner_loop; ++ ++- const unsigned int quarterPoints = num_points / 4; +++ const unsigned int quarterPoints = num_points / 4; ++ ++- const float* inputVectorPtr = (const float*)inputVector; +++ const float* inputVectorPtr = (const float*)inputVector; ++ ++- float min_val = CHAR_MIN; ++- float max_val = CHAR_MAX; ++- float r; +++ float min_val = CHAR_MIN; +++ float max_val = CHAR_MAX; +++ float r; ++ ++- int8_t* outputVectorPtr = outputVector; ++- __m128 vScalar = _mm_set_ps1(scalar); ++- __m128 ret; ++- __m128 vmin_val = _mm_set_ps1(min_val); ++- __m128 vmax_val = _mm_set_ps1(max_val); +++ int8_t* outputVectorPtr = outputVector; +++ __m128 vScalar = _mm_set_ps1(scalar); +++ __m128 ret; +++ __m128 vmin_val = _mm_set_ps1(min_val); +++ __m128 vmax_val = _mm_set_ps1(max_val); ++ ++- __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4]; +++ __VOLK_ATTR_ALIGNED(16) float 
outputFloatBuffer[4]; ++ ++- for(;number < quarterPoints; number++){ ++- ret = _mm_load_ps(inputVectorPtr); ++- inputVectorPtr += 4; +++ for (; number < quarterPoints; number++) { +++ ret = _mm_load_ps(inputVectorPtr); +++ inputVectorPtr += 4; ++ ++- ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val); +++ ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val); ++ ++- _mm_store_ps(outputFloatBuffer, ret); ++- for (inner_loop = 0; inner_loop < 4; inner_loop++){ ++- *outputVectorPtr++ = (int8_t)(rintf(outputFloatBuffer[inner_loop])); +++ _mm_store_ps(outputFloatBuffer, ret); +++ for (inner_loop = 0; inner_loop < 4; inner_loop++) { +++ *outputVectorPtr++ = (int8_t)(rintf(outputFloatBuffer[inner_loop])); +++ } ++ } ++- } ++ ++- number = quarterPoints * 4; ++- for(; number < num_points; number++){ ++- r = inputVector[number] * scalar; ++- volk_32f_s32f_convert_8i_single(&outputVector[number], r); ++- } +++ number = quarterPoints * 4; +++ for (; number < num_points; number++) { +++ r = inputVector[number] * scalar; +++ volk_32f_s32f_convert_8i_single(&outputVector[number], r); +++ } ++ } ++ ++ #endif /* LV_HAVE_SSE */ ++@@ -455,18 +496,19 @@ volk_32f_s32f_convert_8i_a_sse(int8_t* outputVector, const float* inputVector, ++ ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void ++-volk_32f_s32f_convert_8i_a_generic(int8_t* outputVector, const float* inputVector, ++- const float scalar, unsigned int num_points) +++static inline void volk_32f_s32f_convert_8i_a_generic(int8_t* outputVector, +++ const float* inputVector, +++ const float scalar, +++ unsigned int num_points) ++ { ++- const float* inputVectorPtr = inputVector; ++- unsigned int number = 0; ++- float r; ++- ++- for(number = 0; number < num_points; number++){ ++- r = *inputVectorPtr++ * scalar; ++- volk_32f_s32f_convert_8i_single(&outputVector[number], r); ++- } +++ const float* inputVectorPtr = inputVector; +++ unsigned int number = 0; +++ float r; +++ +++ for (number = 0; number < num_points; number++) { +++ r = *inputVectorPtr++ * scalar; +++ volk_32f_s32f_convert_8i_single(&outputVector[number], r); +++ } ++ } ++ ++ #endif /* LV_HAVE_GENERIC */ ++diff --git a/kernels/volk/volk_32f_s32f_mod_rangepuppet_32f.h b/kernels/volk/volk_32f_s32f_mod_rangepuppet_32f.h ++index 6ace77b..28d7ab5 100644 ++--- a/kernels/volk/volk_32f_s32f_mod_rangepuppet_32f.h +++++ b/kernels/volk/volk_32f_s32f_mod_rangepuppet_32f.h ++@@ -4,42 +4,77 @@ ++ #include ++ ++ #ifdef LV_HAVE_GENERIC ++-static inline void volk_32f_s32f_mod_rangepuppet_32f_generic(float *output, const float *input, float bound, unsigned int num_points){ ++- volk_32f_s32f_s32f_mod_range_32f_generic(output, input, bound-3.141f, bound, num_points); +++static inline void volk_32f_s32f_mod_rangepuppet_32f_generic(float* output, +++ const float* input, +++ float bound, +++ unsigned int num_points) +++{ +++ volk_32f_s32f_s32f_mod_range_32f_generic( +++ output, input, bound - 3.141f, bound, num_points); ++ } ++ #endif ++ ++ ++ #ifdef LV_HAVE_SSE ++-static inline void volk_32f_s32f_mod_rangepuppet_32f_u_sse(float *output, const float *input, float bound, unsigned int num_points){ ++- volk_32f_s32f_s32f_mod_range_32f_u_sse(output, input, bound-3.141f, bound, num_points); +++static inline void volk_32f_s32f_mod_rangepuppet_32f_u_sse(float* output, +++ const float* input, +++ float bound, +++ unsigned int num_points) +++{ +++ volk_32f_s32f_s32f_mod_range_32f_u_sse( +++ output, input, bound - 3.141f, bound, num_points); ++ } ++ #endif ++ #ifdef LV_HAVE_SSE ++-static 
inline void volk_32f_s32f_mod_rangepuppet_32f_a_sse(float *output, const float *input, float bound, unsigned int num_points){ ++- volk_32f_s32f_s32f_mod_range_32f_a_sse(output, input, bound-3.141f, bound, num_points); +++static inline void volk_32f_s32f_mod_rangepuppet_32f_a_sse(float* output, +++ const float* input, +++ float bound, +++ unsigned int num_points) +++{ +++ volk_32f_s32f_s32f_mod_range_32f_a_sse( +++ output, input, bound - 3.141f, bound, num_points); ++ } ++ #endif ++ ++ #ifdef LV_HAVE_SSE2 ++-static inline void volk_32f_s32f_mod_rangepuppet_32f_u_sse2(float *output, const float *input, float bound, unsigned int num_points){ ++- volk_32f_s32f_s32f_mod_range_32f_u_sse2(output, input, bound-3.141f, bound, num_points); +++static inline void volk_32f_s32f_mod_rangepuppet_32f_u_sse2(float* output, +++ const float* input, +++ float bound, +++ unsigned int num_points) +++{ +++ volk_32f_s32f_s32f_mod_range_32f_u_sse2( +++ output, input, bound - 3.141f, bound, num_points); ++ } ++ #endif ++ #ifdef LV_HAVE_SSE2 ++-static inline void volk_32f_s32f_mod_rangepuppet_32f_a_sse2(float *output, const float *input, float bound, unsigned int num_points){ ++- volk_32f_s32f_s32f_mod_range_32f_a_sse2(output, input, bound-3.141f, bound, num_points); +++static inline void volk_32f_s32f_mod_rangepuppet_32f_a_sse2(float* output, +++ const float* input, +++ float bound, +++ unsigned int num_points) +++{ +++ volk_32f_s32f_s32f_mod_range_32f_a_sse2( +++ output, input, bound - 3.141f, bound, num_points); ++ } ++ #endif ++ ++ #ifdef LV_HAVE_AVX ++-static inline void volk_32f_s32f_mod_rangepuppet_32f_u_avx(float *output, const float *input, float bound, unsigned int num_points){ ++- volk_32f_s32f_s32f_mod_range_32f_u_avx(output, input, bound-3.141f, bound, num_points); +++static inline void volk_32f_s32f_mod_rangepuppet_32f_u_avx(float* output, +++ const float* input, +++ float bound, +++ unsigned int num_points) +++{ +++ volk_32f_s32f_s32f_mod_range_32f_u_avx( +++ output, input, bound - 3.141f, bound, num_points); ++ } ++ #endif ++ #ifdef LV_HAVE_AVX ++-static inline void volk_32f_s32f_mod_rangepuppet_32f_a_avx(float *output, const float *input, float bound, unsigned int num_points){ ++- volk_32f_s32f_s32f_mod_range_32f_a_avx(output, input, bound-3.141f, bound, num_points); +++static inline void volk_32f_s32f_mod_rangepuppet_32f_a_avx(float* output, +++ const float* input, +++ float bound, +++ unsigned int num_points) +++{ +++ volk_32f_s32f_s32f_mod_range_32f_a_avx( +++ output, input, bound - 3.141f, bound, num_points); ++ } ++ #endif ++ #endif ++diff --git a/kernels/volk/volk_32f_s32f_multiply_32f.h b/kernels/volk/volk_32f_s32f_multiply_32f.h ++index 97c7f69..dcc9c6b 100644 ++--- a/kernels/volk/volk_32f_s32f_multiply_32f.h +++++ b/kernels/volk/volk_32f_s32f_multiply_32f.h ++@@ -29,8 +29,8 @@ ++ * ++ * Dispatcher Prototype ++ * \code ++- * void volk_32f_s32f_multiply_32f(float* cVector, const float* aVector, const float scalar, unsigned int num_points) ++- * \endcode +++ * void volk_32f_s32f_multiply_32f(float* cVector, const float* aVector, const float +++ * scalar, unsigned int num_points) \endcode ++ * ++ * \b Inputs ++ * \li aVector: The input vector of floats. 
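A minimal caller for the reflowed volk_32f_s32f_multiply_32f dispatcher prototype above, as a sketch only: it assumes nothing beyond the public volk.h entry points (volk_get_alignment, volk_malloc, volk_free) and an arbitrary example length and scalar.

#include <volk/volk.h>
#include <stdio.h>

int main(void)
{
    unsigned int N = 16; /* arbitrary example length */
    unsigned int alignment = volk_get_alignment();
    float* in = (float*)volk_malloc(sizeof(float) * N, alignment);
    float* out = (float*)volk_malloc(sizeof(float) * N, alignment);

    for (unsigned int i = 0; i < N; i++)
        in[i] = (float)i;

    /* the dispatcher picks the best implementation available on the
     * current CPU (generic/SSE/AVX/NEON/ORC) */
    volk_32f_s32f_multiply_32f(out, in, 2.5f, N);

    for (unsigned int i = 0; i < N; i++)
        printf("%f\n", out[i]);

    volk_free(in);
    volk_free(out);
    return 0;
}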
++@@ -75,84 +75,87 @@ ++ #ifdef LV_HAVE_SSE ++ #include ++ ++-static inline void ++-volk_32f_s32f_multiply_32f_u_sse(float* cVector, const float* aVector, ++- const float scalar, unsigned int num_points) +++static inline void volk_32f_s32f_multiply_32f_u_sse(float* cVector, +++ const float* aVector, +++ const float scalar, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int quarterPoints = num_points / 4; +++ unsigned int number = 0; +++ const unsigned int quarterPoints = num_points / 4; ++ ++- float* cPtr = cVector; ++- const float* aPtr = aVector; +++ float* cPtr = cVector; +++ const float* aPtr = aVector; ++ ++- __m128 aVal, bVal, cVal; ++- bVal = _mm_set_ps1(scalar); ++- for(;number < quarterPoints; number++){ ++- aVal = _mm_loadu_ps(aPtr); +++ __m128 aVal, bVal, cVal; +++ bVal = _mm_set_ps1(scalar); +++ for (; number < quarterPoints; number++) { +++ aVal = _mm_loadu_ps(aPtr); ++ ++- cVal = _mm_mul_ps(aVal, bVal); +++ cVal = _mm_mul_ps(aVal, bVal); ++ ++- _mm_storeu_ps(cPtr,cVal); // Store the results back into the C container +++ _mm_storeu_ps(cPtr, cVal); // Store the results back into the C container ++ ++- aPtr += 4; ++- cPtr += 4; ++- } +++ aPtr += 4; +++ cPtr += 4; +++ } ++ ++- number = quarterPoints * 4; ++- for(;number < num_points; number++){ ++- *cPtr++ = (*aPtr++) * scalar; ++- } +++ number = quarterPoints * 4; +++ for (; number < num_points; number++) { +++ *cPtr++ = (*aPtr++) * scalar; +++ } ++ } ++ #endif /* LV_HAVE_SSE */ ++ ++ #ifdef LV_HAVE_AVX ++ #include ++ ++-static inline void ++-volk_32f_s32f_multiply_32f_u_avx(float* cVector, const float* aVector, ++- const float scalar, unsigned int num_points) +++static inline void volk_32f_s32f_multiply_32f_u_avx(float* cVector, +++ const float* aVector, +++ const float scalar, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int eighthPoints = num_points / 8; +++ unsigned int number = 0; +++ const unsigned int eighthPoints = num_points / 8; ++ ++- float* cPtr = cVector; ++- const float* aPtr = aVector; +++ float* cPtr = cVector; +++ const float* aPtr = aVector; ++ ++- __m256 aVal, bVal, cVal; ++- bVal = _mm256_set1_ps(scalar); ++- for(;number < eighthPoints; number++){ +++ __m256 aVal, bVal, cVal; +++ bVal = _mm256_set1_ps(scalar); +++ for (; number < eighthPoints; number++) { ++ ++- aVal = _mm256_loadu_ps(aPtr); +++ aVal = _mm256_loadu_ps(aPtr); ++ ++- cVal = _mm256_mul_ps(aVal, bVal); +++ cVal = _mm256_mul_ps(aVal, bVal); ++ ++- _mm256_storeu_ps(cPtr,cVal); // Store the results back into the C container +++ _mm256_storeu_ps(cPtr, cVal); // Store the results back into the C container ++ ++- aPtr += 8; ++- cPtr += 8; ++- } +++ aPtr += 8; +++ cPtr += 8; +++ } ++ ++- number = eighthPoints * 8; ++- for(;number < num_points; number++){ ++- *cPtr++ = (*aPtr++) * scalar; ++- } +++ number = eighthPoints * 8; +++ for (; number < num_points; number++) { +++ *cPtr++ = (*aPtr++) * scalar; +++ } ++ } ++ #endif /* LV_HAVE_AVX */ ++ ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void ++-volk_32f_s32f_multiply_32f_generic(float* cVector, const float* aVector, ++- const float scalar, unsigned int num_points) +++static inline void volk_32f_s32f_multiply_32f_generic(float* cVector, +++ const float* aVector, +++ const float scalar, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const float* inputPtr = aVector; ++- float* outputPtr = cVector; ++- for(number = 0; number < num_points; number++){ ++- *outputPtr = (*inputPtr) * scalar; ++- inputPtr++; ++- 
outputPtr++; ++- } +++ unsigned int number = 0; +++ const float* inputPtr = aVector; +++ float* outputPtr = cVector; +++ for (number = 0; number < num_points; number++) { +++ *outputPtr = (*inputPtr) * scalar; +++ inputPtr++; +++ outputPtr++; +++ } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++@@ -168,126 +171,132 @@ volk_32f_s32f_multiply_32f_generic(float* cVector, const float* aVector, ++ #ifdef LV_HAVE_SSE ++ #include ++ ++-static inline void ++-volk_32f_s32f_multiply_32f_a_sse(float* cVector, const float* aVector, ++- const float scalar, unsigned int num_points) +++static inline void volk_32f_s32f_multiply_32f_a_sse(float* cVector, +++ const float* aVector, +++ const float scalar, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int quarterPoints = num_points / 4; +++ unsigned int number = 0; +++ const unsigned int quarterPoints = num_points / 4; ++ ++- float* cPtr = cVector; ++- const float* aPtr = aVector; +++ float* cPtr = cVector; +++ const float* aPtr = aVector; ++ ++- __m128 aVal, bVal, cVal; ++- bVal = _mm_set_ps1(scalar); ++- for(;number < quarterPoints; number++){ ++- aVal = _mm_load_ps(aPtr); +++ __m128 aVal, bVal, cVal; +++ bVal = _mm_set_ps1(scalar); +++ for (; number < quarterPoints; number++) { +++ aVal = _mm_load_ps(aPtr); ++ ++- cVal = _mm_mul_ps(aVal, bVal); +++ cVal = _mm_mul_ps(aVal, bVal); ++ ++- _mm_store_ps(cPtr,cVal); // Store the results back into the C container +++ _mm_store_ps(cPtr, cVal); // Store the results back into the C container ++ ++- aPtr += 4; ++- cPtr += 4; ++- } +++ aPtr += 4; +++ cPtr += 4; +++ } ++ ++- number = quarterPoints * 4; ++- for(;number < num_points; number++){ ++- *cPtr++ = (*aPtr++) * scalar; ++- } +++ number = quarterPoints * 4; +++ for (; number < num_points; number++) { +++ *cPtr++ = (*aPtr++) * scalar; +++ } ++ } ++ #endif /* LV_HAVE_SSE */ ++ ++ #ifdef LV_HAVE_AVX ++ #include ++ ++-static inline void ++-volk_32f_s32f_multiply_32f_a_avx(float* cVector, const float* aVector, ++- const float scalar, unsigned int num_points) +++static inline void volk_32f_s32f_multiply_32f_a_avx(float* cVector, +++ const float* aVector, +++ const float scalar, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int eighthPoints = num_points / 8; +++ unsigned int number = 0; +++ const unsigned int eighthPoints = num_points / 8; ++ ++- float* cPtr = cVector; ++- const float* aPtr = aVector; +++ float* cPtr = cVector; +++ const float* aPtr = aVector; ++ ++- __m256 aVal, bVal, cVal; ++- bVal = _mm256_set1_ps(scalar); ++- for(;number < eighthPoints; number++){ ++- aVal = _mm256_load_ps(aPtr); +++ __m256 aVal, bVal, cVal; +++ bVal = _mm256_set1_ps(scalar); +++ for (; number < eighthPoints; number++) { +++ aVal = _mm256_load_ps(aPtr); ++ ++- cVal = _mm256_mul_ps(aVal, bVal); +++ cVal = _mm256_mul_ps(aVal, bVal); ++ ++- _mm256_store_ps(cPtr,cVal); // Store the results back into the C container +++ _mm256_store_ps(cPtr, cVal); // Store the results back into the C container ++ ++- aPtr += 8; ++- cPtr += 8; ++- } +++ aPtr += 8; +++ cPtr += 8; +++ } ++ ++- number = eighthPoints * 8; ++- for(;number < num_points; number++){ ++- *cPtr++ = (*aPtr++) * scalar; ++- } +++ number = eighthPoints * 8; +++ for (; number < num_points; number++) { +++ *cPtr++ = (*aPtr++) * scalar; +++ } ++ } ++ #endif /* LV_HAVE_AVX */ ++ ++ #ifdef LV_HAVE_NEON ++ #include ++ ++-static inline void ++-volk_32f_s32f_multiply_32f_u_neon(float* cVector, const float* aVector, ++- const float scalar, unsigned int num_points) +++static 
inline void volk_32f_s32f_multiply_32f_u_neon(float* cVector, +++ const float* aVector, +++ const float scalar, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const float* inputPtr = aVector; ++- float* outputPtr = cVector; ++- const unsigned int quarterPoints = num_points / 4; ++- ++- float32x4_t aVal, cVal; ++- ++- for(number = 0; number < quarterPoints; number++){ ++- aVal = vld1q_f32(inputPtr); // Load into NEON regs ++- cVal = vmulq_n_f32 (aVal, scalar); // Do the multiply ++- vst1q_f32(outputPtr, cVal); // Store results back to output ++- inputPtr += 4; ++- outputPtr += 4; ++- } ++- for(number = quarterPoints * 4; number < num_points; number++){ ++- *outputPtr++ = (*inputPtr++) * scalar; ++- } +++ unsigned int number = 0; +++ const float* inputPtr = aVector; +++ float* outputPtr = cVector; +++ const unsigned int quarterPoints = num_points / 4; +++ +++ float32x4_t aVal, cVal; +++ +++ for (number = 0; number < quarterPoints; number++) { +++ aVal = vld1q_f32(inputPtr); // Load into NEON regs +++ cVal = vmulq_n_f32(aVal, scalar); // Do the multiply +++ vst1q_f32(outputPtr, cVal); // Store results back to output +++ inputPtr += 4; +++ outputPtr += 4; +++ } +++ for (number = quarterPoints * 4; number < num_points; number++) { +++ *outputPtr++ = (*inputPtr++) * scalar; +++ } ++ } ++ #endif /* LV_HAVE_NEON */ ++ ++ ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void ++-volk_32f_s32f_multiply_32f_a_generic(float* cVector, const float* aVector, ++- const float scalar, unsigned int num_points) +++static inline void volk_32f_s32f_multiply_32f_a_generic(float* cVector, +++ const float* aVector, +++ const float scalar, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const float* inputPtr = aVector; ++- float* outputPtr = cVector; ++- for(number = 0; number < num_points; number++){ ++- *outputPtr = (*inputPtr) * scalar; ++- inputPtr++; ++- outputPtr++; ++- } +++ unsigned int number = 0; +++ const float* inputPtr = aVector; +++ float* outputPtr = cVector; +++ for (number = 0; number < num_points; number++) { +++ *outputPtr = (*inputPtr) * scalar; +++ inputPtr++; +++ outputPtr++; +++ } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++ ++ #ifdef LV_HAVE_ORC ++ ++-extern void ++-volk_32f_s32f_multiply_32f_a_orc_impl(float* dst, const float* src, ++- const float scalar, unsigned int num_points); +++extern void volk_32f_s32f_multiply_32f_a_orc_impl(float* dst, +++ const float* src, +++ const float scalar, +++ unsigned int num_points); ++ ++-static inline void ++-volk_32f_s32f_multiply_32f_u_orc(float* cVector, const float* aVector, ++- const float scalar, unsigned int num_points) +++static inline void volk_32f_s32f_multiply_32f_u_orc(float* cVector, +++ const float* aVector, +++ const float scalar, +++ unsigned int num_points) ++ { ++- volk_32f_s32f_multiply_32f_a_orc_impl(cVector, aVector, scalar, num_points); +++ volk_32f_s32f_multiply_32f_a_orc_impl(cVector, aVector, scalar, num_points); ++ } ++ ++ #endif /* LV_HAVE_GENERIC */ ++diff --git a/kernels/volk/volk_32f_s32f_normalize.h b/kernels/volk/volk_32f_s32f_normalize.h ++index 404d534..0a05492 100644 ++--- a/kernels/volk/volk_32f_s32f_normalize.h +++++ b/kernels/volk/volk_32f_s32f_normalize.h ++@@ -30,8 +30,8 @@ ++ * ++ * Dispatcher Prototype ++ * \code ++- * void volk_32f_s32f_normalize(float* vecBuffer, const float scalar, unsigned int num_points) ++- * \endcode +++ * void volk_32f_s32f_normalize(float* vecBuffer, const float scalar, unsigned int +++ * num_points) \endcode ++ * ++ * \b Inputs ++ * \li vecBuffer: The 
buffer of values to be vectorized. ++@@ -76,84 +76,99 @@ ++ #ifdef LV_HAVE_AVX ++ #include ++ ++-static inline void volk_32f_s32f_normalize_a_avx(float* vecBuffer, const float scalar, unsigned int num_points){ ++- unsigned int number = 0; ++- float* inputPtr = vecBuffer; +++static inline void volk_32f_s32f_normalize_a_avx(float* vecBuffer, +++ const float scalar, +++ unsigned int num_points) +++{ +++ unsigned int number = 0; +++ float* inputPtr = vecBuffer; ++ ++- const float invScalar = 1.0 / scalar; ++- __m256 vecScalar = _mm256_set1_ps(invScalar); +++ const float invScalar = 1.0 / scalar; +++ __m256 vecScalar = _mm256_set1_ps(invScalar); ++ ++- __m256 input1; +++ __m256 input1; ++ ++- const uint64_t eighthPoints = num_points / 8; ++- for(;number < eighthPoints; number++){ +++ const uint64_t eighthPoints = num_points / 8; +++ for (; number < eighthPoints; number++) { ++ ++- input1 = _mm256_load_ps(inputPtr); +++ input1 = _mm256_load_ps(inputPtr); ++ ++- input1 = _mm256_mul_ps(input1, vecScalar); +++ input1 = _mm256_mul_ps(input1, vecScalar); ++ ++- _mm256_store_ps(inputPtr, input1); +++ _mm256_store_ps(inputPtr, input1); ++ ++- inputPtr += 8; ++- } +++ inputPtr += 8; +++ } ++ ++- number = eighthPoints*8; ++- for(; number < num_points; number++){ ++- *inputPtr *= invScalar; ++- inputPtr++; ++- } +++ number = eighthPoints * 8; +++ for (; number < num_points; number++) { +++ *inputPtr *= invScalar; +++ inputPtr++; +++ } ++ } ++ #endif /* LV_HAVE_AVX */ ++ ++ #ifdef LV_HAVE_SSE ++ #include ++ ++-static inline void volk_32f_s32f_normalize_a_sse(float* vecBuffer, const float scalar, unsigned int num_points){ ++- unsigned int number = 0; ++- float* inputPtr = vecBuffer; +++static inline void volk_32f_s32f_normalize_a_sse(float* vecBuffer, +++ const float scalar, +++ unsigned int num_points) +++{ +++ unsigned int number = 0; +++ float* inputPtr = vecBuffer; ++ ++- const float invScalar = 1.0 / scalar; ++- __m128 vecScalar = _mm_set_ps1(invScalar); +++ const float invScalar = 1.0 / scalar; +++ __m128 vecScalar = _mm_set_ps1(invScalar); ++ ++- __m128 input1; +++ __m128 input1; ++ ++- const uint64_t quarterPoints = num_points / 4; ++- for(;number < quarterPoints; number++){ +++ const uint64_t quarterPoints = num_points / 4; +++ for (; number < quarterPoints; number++) { ++ ++- input1 = _mm_load_ps(inputPtr); +++ input1 = _mm_load_ps(inputPtr); ++ ++- input1 = _mm_mul_ps(input1, vecScalar); +++ input1 = _mm_mul_ps(input1, vecScalar); ++ ++- _mm_store_ps(inputPtr, input1); +++ _mm_store_ps(inputPtr, input1); ++ ++- inputPtr += 4; ++- } +++ inputPtr += 4; +++ } ++ ++- number = quarterPoints*4; ++- for(; number < num_points; number++){ ++- *inputPtr *= invScalar; ++- inputPtr++; ++- } +++ number = quarterPoints * 4; +++ for (; number < num_points; number++) { +++ *inputPtr *= invScalar; +++ inputPtr++; +++ } ++ } ++ #endif /* LV_HAVE_SSE */ ++ ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void volk_32f_s32f_normalize_generic(float* vecBuffer, const float scalar, unsigned int num_points){ ++- unsigned int number = 0; ++- float* inputPtr = vecBuffer; ++- const float invScalar = 1.0 / scalar; ++- for(number = 0; number < num_points; number++){ ++- *inputPtr *= invScalar; ++- inputPtr++; ++- } +++static inline void volk_32f_s32f_normalize_generic(float* vecBuffer, +++ const float scalar, +++ unsigned int num_points) +++{ +++ unsigned int number = 0; +++ float* inputPtr = vecBuffer; +++ const float invScalar = 1.0 / scalar; +++ for (number = 0; number < num_points; number++) { +++ *inputPtr *= invScalar; 
+++ inputPtr++; +++ } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++ #ifdef LV_HAVE_ORC ++ ++-extern void volk_32f_s32f_normalize_a_orc_impl(float* dst, float* src, const float scalar, unsigned int num_points); ++-static inline void volk_32f_s32f_normalize_u_orc(float* vecBuffer, const float scalar, unsigned int num_points){ +++extern void volk_32f_s32f_normalize_a_orc_impl(float* dst, +++ float* src, +++ const float scalar, +++ unsigned int num_points); +++static inline void volk_32f_s32f_normalize_u_orc(float* vecBuffer, +++ const float scalar, +++ unsigned int num_points) +++{ ++ float invscalar = 1.0 / scalar; ++ volk_32f_s32f_normalize_a_orc_impl(vecBuffer, vecBuffer, invscalar, num_points); ++ } ++@@ -169,32 +184,35 @@ static inline void volk_32f_s32f_normalize_u_orc(float* vecBuffer, const float s ++ #ifdef LV_HAVE_AVX ++ #include ++ ++-static inline void volk_32f_s32f_normalize_u_avx(float* vecBuffer, const float scalar, unsigned int num_points){ ++- unsigned int number = 0; ++- float* inputPtr = vecBuffer; +++static inline void volk_32f_s32f_normalize_u_avx(float* vecBuffer, +++ const float scalar, +++ unsigned int num_points) +++{ +++ unsigned int number = 0; +++ float* inputPtr = vecBuffer; ++ ++- const float invScalar = 1.0 / scalar; ++- __m256 vecScalar = _mm256_set1_ps(invScalar); +++ const float invScalar = 1.0 / scalar; +++ __m256 vecScalar = _mm256_set1_ps(invScalar); ++ ++- __m256 input1; +++ __m256 input1; ++ ++- const uint64_t eighthPoints = num_points / 8; ++- for(;number < eighthPoints; number++){ +++ const uint64_t eighthPoints = num_points / 8; +++ for (; number < eighthPoints; number++) { ++ ++- input1 = _mm256_loadu_ps(inputPtr); +++ input1 = _mm256_loadu_ps(inputPtr); ++ ++- input1 = _mm256_mul_ps(input1, vecScalar); +++ input1 = _mm256_mul_ps(input1, vecScalar); ++ ++- _mm256_storeu_ps(inputPtr, input1); +++ _mm256_storeu_ps(inputPtr, input1); ++ ++- inputPtr += 8; ++- } +++ inputPtr += 8; +++ } ++ ++- number = eighthPoints*8; ++- for(; number < num_points; number++){ ++- *inputPtr *= invScalar; ++- inputPtr++; ++- } +++ number = eighthPoints * 8; +++ for (; number < num_points; number++) { +++ *inputPtr *= invScalar; +++ inputPtr++; +++ } ++ } ++ #endif /* LV_HAVE_AVX */ ++ ++diff --git a/kernels/volk/volk_32f_s32f_power_32f.h b/kernels/volk/volk_32f_s32f_power_32f.h ++index 070efdc..9b6fdf4 100644 ++--- a/kernels/volk/volk_32f_s32f_power_32f.h +++++ b/kernels/volk/volk_32f_s32f_power_32f.h ++@@ -30,8 +30,8 @@ ++ * ++ * Dispatcher Prototype ++ * \code ++- * void volk_32f_s32f_power_32f(float* cVector, const float* aVector, const float power, unsigned int num_points) ++- * \endcode +++ * void volk_32f_s32f_power_32f(float* cVector, const float* aVector, const float power, +++ * unsigned int num_points) \endcode ++ * ++ * \b Inputs ++ * \li aVector: The input vector of floats. 
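When libsimdmath is available, the SSE4.1 and SSE paths in the hunks below flip negative bases to positive before calling powf4() and fold the sign back in with a precomputed powf(-1, power). A scalar sketch of the identity they rely on (powf_signed_base is a hypothetical helper name, assuming standard <math.h> powf semantics):

#include <math.h>

/* Sketch of the sign handling used in the SIMD paths below:
 * for x < 0, x^p is evaluated as (-1)^p * (-x)^p.  For non-integer p
 * both powf(-1, p) and powf(x, p) are NaN, so the result matches powf. */
static float powf_signed_base(float x, float p)
{
    if (x < 0.0f)
        return powf(-1.0f, p) * powf(-x, p);
    return powf(x, p);
}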
++@@ -72,8 +72,8 @@ ++ #define INCLUDED_volk_32f_s32f_power_32f_a_H ++ ++ #include ++-#include ++ #include +++#include ++ ++ #ifdef LV_HAVE_SSE4_1 ++ #include ++@@ -82,49 +82,51 @@ ++ #include ++ #endif /* LV_HAVE_LIB_SIMDMATH */ ++ ++-static inline void ++-volk_32f_s32f_power_32f_a_sse4_1(float* cVector, const float* aVector, ++- const float power, unsigned int num_points) +++static inline void volk_32f_s32f_power_32f_a_sse4_1(float* cVector, +++ const float* aVector, +++ const float power, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; +++ unsigned int number = 0; ++ ++- float* cPtr = cVector; ++- const float* aPtr = aVector; +++ float* cPtr = cVector; +++ const float* aPtr = aVector; ++ ++ #ifdef LV_HAVE_LIB_SIMDMATH ++- const unsigned int quarterPoints = num_points / 4; ++- __m128 vPower = _mm_set_ps1(power); ++- __m128 zeroValue = _mm_setzero_ps(); ++- __m128 signMask; ++- __m128 negatedValues; ++- __m128 negativeOneToPower = _mm_set_ps1(powf(-1, power)); ++- __m128 onesMask = _mm_set_ps1(1); +++ const unsigned int quarterPoints = num_points / 4; +++ __m128 vPower = _mm_set_ps1(power); +++ __m128 zeroValue = _mm_setzero_ps(); +++ __m128 signMask; +++ __m128 negatedValues; +++ __m128 negativeOneToPower = _mm_set_ps1(powf(-1, power)); +++ __m128 onesMask = _mm_set_ps1(1); ++ ++- __m128 aVal, cVal; ++- for(;number < quarterPoints; number++){ +++ __m128 aVal, cVal; +++ for (; number < quarterPoints; number++) { ++ ++- aVal = _mm_load_ps(aPtr); ++- signMask = _mm_cmplt_ps(aVal, zeroValue); ++- negatedValues = _mm_sub_ps(zeroValue, aVal); ++- aVal = _mm_blendv_ps(aVal, negatedValues, signMask); +++ aVal = _mm_load_ps(aPtr); +++ signMask = _mm_cmplt_ps(aVal, zeroValue); +++ negatedValues = _mm_sub_ps(zeroValue, aVal); +++ aVal = _mm_blendv_ps(aVal, negatedValues, signMask); ++ ++- // powf4 doesn't support negative values in the base, so we mask them off and then apply the negative after ++- cVal = powf4(aVal, vPower); // Takes each input value to the specified power +++ // powf4 doesn't support negative values in the base, so we mask them off and then +++ // apply the negative after +++ cVal = powf4(aVal, vPower); // Takes each input value to the specified power ++ ++- cVal = _mm_mul_ps( _mm_blendv_ps(onesMask, negativeOneToPower, signMask), cVal); +++ cVal = _mm_mul_ps(_mm_blendv_ps(onesMask, negativeOneToPower, signMask), cVal); ++ ++- _mm_store_ps(cPtr,cVal); // Store the results back into the C container +++ _mm_store_ps(cPtr, cVal); // Store the results back into the C container ++ ++- aPtr += 4; ++- cPtr += 4; ++- } +++ aPtr += 4; +++ cPtr += 4; +++ } ++ ++- number = quarterPoints * 4; +++ number = quarterPoints * 4; ++ #endif /* LV_HAVE_LIB_SIMDMATH */ ++ ++- for(;number < num_points; number++){ ++- *cPtr++ = powf((*aPtr++), power); ++- } +++ for (; number < num_points; number++) { +++ *cPtr++ = powf((*aPtr++), power); +++ } ++ } ++ ++ #endif /* LV_HAVE_SSE4_1 */ ++@@ -137,49 +139,54 @@ volk_32f_s32f_power_32f_a_sse4_1(float* cVector, const float* aVector, ++ #include ++ #endif /* LV_HAVE_LIB_SIMDMATH */ ++ ++-static inline void ++-volk_32f_s32f_power_32f_a_sse(float* cVector, const float* aVector, ++- const float power, unsigned int num_points) +++static inline void volk_32f_s32f_power_32f_a_sse(float* cVector, +++ const float* aVector, +++ const float power, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; +++ unsigned int number = 0; ++ ++- float* cPtr = cVector; ++- const float* aPtr = aVector; +++ float* cPtr = cVector; +++ const float* aPtr = 
aVector; ++ ++ #ifdef LV_HAVE_LIB_SIMDMATH ++- const unsigned int quarterPoints = num_points / 4; ++- __m128 vPower = _mm_set_ps1(power); ++- __m128 zeroValue = _mm_setzero_ps(); ++- __m128 signMask; ++- __m128 negatedValues; ++- __m128 negativeOneToPower = _mm_set_ps1(powf(-1, power)); ++- __m128 onesMask = _mm_set_ps1(1); ++- ++- __m128 aVal, cVal; ++- for(;number < quarterPoints; number++){ ++- ++- aVal = _mm_load_ps(aPtr); ++- signMask = _mm_cmplt_ps(aVal, zeroValue); ++- negatedValues = _mm_sub_ps(zeroValue, aVal); ++- aVal = _mm_or_ps(_mm_andnot_ps(signMask, aVal), _mm_and_ps(signMask, negatedValues) ); ++- ++- // powf4 doesn't support negative values in the base, so we mask them off and then apply the negative after ++- cVal = powf4(aVal, vPower); // Takes each input value to the specified power ++- ++- cVal = _mm_mul_ps( _mm_or_ps( _mm_andnot_ps(signMask, onesMask), _mm_and_ps(signMask, negativeOneToPower) ), cVal); ++- ++- _mm_store_ps(cPtr,cVal); // Store the results back into the C container ++- ++- aPtr += 4; ++- cPtr += 4; ++- } ++- ++- number = quarterPoints * 4; +++ const unsigned int quarterPoints = num_points / 4; +++ __m128 vPower = _mm_set_ps1(power); +++ __m128 zeroValue = _mm_setzero_ps(); +++ __m128 signMask; +++ __m128 negatedValues; +++ __m128 negativeOneToPower = _mm_set_ps1(powf(-1, power)); +++ __m128 onesMask = _mm_set_ps1(1); +++ +++ __m128 aVal, cVal; +++ for (; number < quarterPoints; number++) { +++ +++ aVal = _mm_load_ps(aPtr); +++ signMask = _mm_cmplt_ps(aVal, zeroValue); +++ negatedValues = _mm_sub_ps(zeroValue, aVal); +++ aVal = +++ _mm_or_ps(_mm_andnot_ps(signMask, aVal), _mm_and_ps(signMask, negatedValues)); +++ +++ // powf4 doesn't support negative values in the base, so we mask them off and then +++ // apply the negative after +++ cVal = powf4(aVal, vPower); // Takes each input value to the specified power +++ +++ cVal = _mm_mul_ps(_mm_or_ps(_mm_andnot_ps(signMask, onesMask), +++ _mm_and_ps(signMask, negativeOneToPower)), +++ cVal); +++ +++ _mm_store_ps(cPtr, cVal); // Store the results back into the C container +++ +++ aPtr += 4; +++ cPtr += 4; +++ } +++ +++ number = quarterPoints * 4; ++ #endif /* LV_HAVE_LIB_SIMDMATH */ ++ ++- for(;number < num_points; number++){ ++- *cPtr++ = powf((*aPtr++), power); ++- } +++ for (; number < num_points; number++) { +++ *cPtr++ = powf((*aPtr++), power); +++ } ++ } ++ ++ #endif /* LV_HAVE_SSE */ ++@@ -187,17 +194,18 @@ volk_32f_s32f_power_32f_a_sse(float* cVector, const float* aVector, ++ ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void ++-volk_32f_s32f_power_32f_generic(float* cVector, const float* aVector, ++- const float power, unsigned int num_points) +++static inline void volk_32f_s32f_power_32f_generic(float* cVector, +++ const float* aVector, +++ const float power, +++ unsigned int num_points) ++ { ++- float* cPtr = cVector; ++- const float* aPtr = aVector; ++- unsigned int number = 0; +++ float* cPtr = cVector; +++ const float* aPtr = aVector; +++ unsigned int number = 0; ++ ++- for(number = 0; number < num_points; number++){ ++- *cPtr++ = powf((*aPtr++), power); ++- } +++ for (number = 0; number < num_points; number++) { +++ *cPtr++ = powf((*aPtr++), power); +++ } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++diff --git a/kernels/volk/volk_32f_s32f_s32f_mod_range_32f.h b/kernels/volk/volk_32f_s32f_s32f_mod_range_32f.h ++index 53b4937..d7f23fe 100644 ++--- a/kernels/volk/volk_32f_s32f_s32f_mod_range_32f.h +++++ b/kernels/volk/volk_32f_s32f_s32f_mod_range_32f.h ++@@ -25,8 +25,8 @@ ++ * ++ * Dispatcher 
Prototype ++ * \code ++- * void volk_32f_s32f_s32f_mod_range_32f(float* outputVector, const float* inputVector, const float lower_bound, const float upper_bound, unsigned int num_points) ++- * \endcode +++ * void volk_32f_s32f_s32f_mod_range_32f(float* outputVector, const float* inputVector, +++ * const float lower_bound, const float upper_bound, unsigned int num_points) \endcode ++ * ++ * \b Inputs ++ * \li inputVector: The input vector ++@@ -46,117 +46,129 @@ ++ #ifdef LV_HAVE_AVX ++ #include ++ ++-static inline void volk_32f_s32f_s32f_mod_range_32f_u_avx(float* outputVector, const float* inputVector, const float lower_bound, const float upper_bound, unsigned int num_points){ ++- __m256 lower = _mm256_set1_ps(lower_bound); ++- __m256 upper = _mm256_set1_ps(upper_bound); ++- __m256 distance = _mm256_sub_ps(upper,lower); ++- float dist = upper_bound - lower_bound; ++- __m256 input, output; ++- __m256 is_smaller, is_bigger; ++- __m256 excess, adj; ++- ++- const float *inPtr = inputVector; ++- float *outPtr = outputVector; ++- size_t eight_points = num_points / 8; ++- size_t counter; ++- for(counter = 0; counter < eight_points; counter++) { ++- input = _mm256_loadu_ps(inPtr); ++- // calculate mask: input < lower, input > upper ++- is_smaller = _mm256_cmp_ps(input, lower, _CMP_LT_OQ); //0x11: Less than, ordered, non-signalling ++- is_bigger = _mm256_cmp_ps(input, upper, _CMP_GT_OQ); //0x1e: greater than, ordered, non-signalling ++- // find out how far we are out-of-bound – positive values! ++- excess = _mm256_and_ps(_mm256_sub_ps(lower, input), is_smaller); ++- excess = _mm256_or_ps(_mm256_and_ps(_mm256_sub_ps(input, upper), is_bigger), excess); ++- // how many do we have to add? (int(excess/distance+1)*distance) ++- excess = _mm256_div_ps(excess, distance); ++- // round down ++- excess = _mm256_cvtepi32_ps(_mm256_cvttps_epi32(excess)); ++- // plus 1 ++- adj = _mm256_set1_ps(1.0f); ++- excess = _mm256_add_ps(excess, adj); ++- // get the sign right, adj is still {1.0f,1.0f,1.0f,1.0f} ++- adj = _mm256_and_ps(adj, is_smaller); ++- adj = _mm256_or_ps(_mm256_and_ps(_mm256_set1_ps(-1.0f), is_bigger), adj); ++- // scale by distance, sign ++- excess = _mm256_mul_ps(_mm256_mul_ps(excess, adj), distance); ++- output = _mm256_add_ps(input, excess); ++- _mm256_storeu_ps(outPtr, output); ++- inPtr += 8; ++- outPtr += 8; ++- } ++- ++- size_t cnt; ++- for(cnt = eight_points * 8; cnt < num_points; cnt++){ ++- float val = inputVector[cnt]; ++- if(val < lower_bound){ ++- float excess = lower_bound - val; ++- signed int count = (int)(excess/dist); ++- outputVector[cnt] = val + (count+1)*dist; +++static inline void volk_32f_s32f_s32f_mod_range_32f_u_avx(float* outputVector, +++ const float* inputVector, +++ const float lower_bound, +++ const float upper_bound, +++ unsigned int num_points) +++{ +++ __m256 lower = _mm256_set1_ps(lower_bound); +++ __m256 upper = _mm256_set1_ps(upper_bound); +++ __m256 distance = _mm256_sub_ps(upper, lower); +++ float dist = upper_bound - lower_bound; +++ __m256 input, output; +++ __m256 is_smaller, is_bigger; +++ __m256 excess, adj; +++ +++ const float* inPtr = inputVector; +++ float* outPtr = outputVector; +++ size_t eight_points = num_points / 8; +++ size_t counter; +++ for (counter = 0; counter < eight_points; counter++) { +++ input = _mm256_loadu_ps(inPtr); +++ // calculate mask: input < lower, input > upper +++ is_smaller = _mm256_cmp_ps( +++ input, lower, _CMP_LT_OQ); // 0x11: Less than, ordered, non-signalling +++ is_bigger = _mm256_cmp_ps( +++ input, upper, _CMP_GT_OQ); // 
0x1e: greater than, ordered, non-signalling +++ // find out how far we are out-of-bound – positive values! +++ excess = _mm256_and_ps(_mm256_sub_ps(lower, input), is_smaller); +++ excess = +++ _mm256_or_ps(_mm256_and_ps(_mm256_sub_ps(input, upper), is_bigger), excess); +++ // how many do we have to add? (int(excess/distance+1)*distance) +++ excess = _mm256_div_ps(excess, distance); +++ // round down +++ excess = _mm256_cvtepi32_ps(_mm256_cvttps_epi32(excess)); +++ // plus 1 +++ adj = _mm256_set1_ps(1.0f); +++ excess = _mm256_add_ps(excess, adj); +++ // get the sign right, adj is still {1.0f,1.0f,1.0f,1.0f} +++ adj = _mm256_and_ps(adj, is_smaller); +++ adj = _mm256_or_ps(_mm256_and_ps(_mm256_set1_ps(-1.0f), is_bigger), adj); +++ // scale by distance, sign +++ excess = _mm256_mul_ps(_mm256_mul_ps(excess, adj), distance); +++ output = _mm256_add_ps(input, excess); +++ _mm256_storeu_ps(outPtr, output); +++ inPtr += 8; +++ outPtr += 8; ++ } ++- else if(val > upper_bound){ ++- float excess = val - upper_bound; ++- signed int count = (int)(excess/dist); ++- outputVector[cnt] = val - (count+1)*dist; +++ +++ size_t cnt; +++ for (cnt = eight_points * 8; cnt < num_points; cnt++) { +++ float val = inputVector[cnt]; +++ if (val < lower_bound) { +++ float excess = lower_bound - val; +++ signed int count = (int)(excess / dist); +++ outputVector[cnt] = val + (count + 1) * dist; +++ } else if (val > upper_bound) { +++ float excess = val - upper_bound; +++ signed int count = (int)(excess / dist); +++ outputVector[cnt] = val - (count + 1) * dist; +++ } else +++ outputVector[cnt] = val; ++ } ++- else ++- outputVector[cnt] = val; ++- } ++ } ++-static inline void volk_32f_s32f_s32f_mod_range_32f_a_avx(float* outputVector, const float* inputVector, const float lower_bound, const float upper_bound, unsigned int num_points){ ++- __m256 lower = _mm256_set1_ps(lower_bound); ++- __m256 upper = _mm256_set1_ps(upper_bound); ++- __m256 distance = _mm256_sub_ps(upper,lower); ++- float dist = upper_bound - lower_bound; ++- __m256 input, output; ++- __m256 is_smaller, is_bigger; ++- __m256 excess, adj; ++- ++- const float *inPtr = inputVector; ++- float *outPtr = outputVector; ++- size_t eight_points = num_points / 8; ++- size_t counter; ++- for(counter = 0; counter < eight_points; counter++) { ++- input = _mm256_load_ps(inPtr); ++- // calculate mask: input < lower, input > upper ++- is_smaller = _mm256_cmp_ps(input, lower, _CMP_LT_OQ); //0x11: Less than, ordered, non-signalling ++- is_bigger = _mm256_cmp_ps(input, upper, _CMP_GT_OQ); //0x1e: greater than, ordered, non-signalling ++- // find out how far we are out-of-bound – positive values! ++- excess = _mm256_and_ps(_mm256_sub_ps(lower, input), is_smaller); ++- excess = _mm256_or_ps(_mm256_and_ps(_mm256_sub_ps(input, upper), is_bigger), excess); ++- // how many do we have to add? 
(int(excess/distance+1)*distance) ++- excess = _mm256_div_ps(excess, distance); ++- // round down ++- excess = _mm256_cvtepi32_ps(_mm256_cvttps_epi32(excess)); ++- // plus 1 ++- adj = _mm256_set1_ps(1.0f); ++- excess = _mm256_add_ps(excess, adj); ++- // get the sign right, adj is still {1.0f,1.0f,1.0f,1.0f} ++- adj = _mm256_and_ps(adj, is_smaller); ++- adj = _mm256_or_ps(_mm256_and_ps(_mm256_set1_ps(-1.0f), is_bigger), adj); ++- // scale by distance, sign ++- excess = _mm256_mul_ps(_mm256_mul_ps(excess, adj), distance); ++- output = _mm256_add_ps(input, excess); ++- _mm256_store_ps(outPtr, output); ++- inPtr += 8; ++- outPtr += 8; ++- } ++- ++- size_t cnt; ++- for(cnt = eight_points * 8; cnt < num_points; cnt++){ ++- float val = inputVector[cnt]; ++- if(val < lower_bound){ ++- float excess = lower_bound - val; ++- signed int count = (int)(excess/dist); ++- outputVector[cnt] = val + (count+1)*dist; +++static inline void volk_32f_s32f_s32f_mod_range_32f_a_avx(float* outputVector, +++ const float* inputVector, +++ const float lower_bound, +++ const float upper_bound, +++ unsigned int num_points) +++{ +++ __m256 lower = _mm256_set1_ps(lower_bound); +++ __m256 upper = _mm256_set1_ps(upper_bound); +++ __m256 distance = _mm256_sub_ps(upper, lower); +++ float dist = upper_bound - lower_bound; +++ __m256 input, output; +++ __m256 is_smaller, is_bigger; +++ __m256 excess, adj; +++ +++ const float* inPtr = inputVector; +++ float* outPtr = outputVector; +++ size_t eight_points = num_points / 8; +++ size_t counter; +++ for (counter = 0; counter < eight_points; counter++) { +++ input = _mm256_load_ps(inPtr); +++ // calculate mask: input < lower, input > upper +++ is_smaller = _mm256_cmp_ps( +++ input, lower, _CMP_LT_OQ); // 0x11: Less than, ordered, non-signalling +++ is_bigger = _mm256_cmp_ps( +++ input, upper, _CMP_GT_OQ); // 0x1e: greater than, ordered, non-signalling +++ // find out how far we are out-of-bound – positive values! +++ excess = _mm256_and_ps(_mm256_sub_ps(lower, input), is_smaller); +++ excess = +++ _mm256_or_ps(_mm256_and_ps(_mm256_sub_ps(input, upper), is_bigger), excess); +++ // how many do we have to add? 
(int(excess/distance+1)*distance) +++ excess = _mm256_div_ps(excess, distance); +++ // round down +++ excess = _mm256_cvtepi32_ps(_mm256_cvttps_epi32(excess)); +++ // plus 1 +++ adj = _mm256_set1_ps(1.0f); +++ excess = _mm256_add_ps(excess, adj); +++ // get the sign right, adj is still {1.0f,1.0f,1.0f,1.0f} +++ adj = _mm256_and_ps(adj, is_smaller); +++ adj = _mm256_or_ps(_mm256_and_ps(_mm256_set1_ps(-1.0f), is_bigger), adj); +++ // scale by distance, sign +++ excess = _mm256_mul_ps(_mm256_mul_ps(excess, adj), distance); +++ output = _mm256_add_ps(input, excess); +++ _mm256_store_ps(outPtr, output); +++ inPtr += 8; +++ outPtr += 8; ++ } ++- else if(val > upper_bound){ ++- float excess = val - upper_bound; ++- signed int count = (int)(excess/dist); ++- outputVector[cnt] = val - (count+1)*dist; +++ +++ size_t cnt; +++ for (cnt = eight_points * 8; cnt < num_points; cnt++) { +++ float val = inputVector[cnt]; +++ if (val < lower_bound) { +++ float excess = lower_bound - val; +++ signed int count = (int)(excess / dist); +++ outputVector[cnt] = val + (count + 1) * dist; +++ } else if (val > upper_bound) { +++ float excess = val - upper_bound; +++ signed int count = (int)(excess / dist); +++ outputVector[cnt] = val - (count + 1) * dist; +++ } else +++ outputVector[cnt] = val; ++ } ++- else ++- outputVector[cnt] = val; ++- } ++ } ++ #endif /* LV_HAVE_AVX */ ++ ++@@ -164,268 +176,282 @@ static inline void volk_32f_s32f_s32f_mod_range_32f_a_avx(float* outputVector, c ++ #ifdef LV_HAVE_SSE2 ++ #include ++ ++-static inline void volk_32f_s32f_s32f_mod_range_32f_u_sse2(float* outputVector, const float* inputVector, const float lower_bound, const float upper_bound, unsigned int num_points){ ++- __m128 lower = _mm_set_ps1(lower_bound); ++- __m128 upper = _mm_set_ps1(upper_bound); ++- __m128 distance = _mm_sub_ps(upper,lower); ++- float dist = upper_bound - lower_bound; ++- __m128 input, output; ++- __m128 is_smaller, is_bigger; ++- __m128 excess, adj; ++- ++- const float *inPtr = inputVector; ++- float *outPtr = outputVector; ++- size_t quarter_points = num_points / 4; ++- size_t counter; ++- for(counter = 0; counter < quarter_points; counter++) { ++- input = _mm_load_ps(inPtr); ++- // calculate mask: input < lower, input > upper ++- is_smaller = _mm_cmplt_ps(input, lower); ++- is_bigger = _mm_cmpgt_ps(input, upper); ++- // find out how far we are out-of-bound – positive values! ++- excess = _mm_and_ps(_mm_sub_ps(lower, input), is_smaller); ++- excess = _mm_or_ps(_mm_and_ps(_mm_sub_ps(input, upper), is_bigger), excess); ++- // how many do we have to add? 
(int(excess/distance+1)*distance) ++- excess = _mm_div_ps(excess, distance); ++- // round down ++- excess = _mm_cvtepi32_ps(_mm_cvttps_epi32(excess)); ++- // plus 1 ++- adj = _mm_set_ps1(1.0f); ++- excess = _mm_add_ps(excess, adj); ++- // get the sign right, adj is still {1.0f,1.0f,1.0f,1.0f} ++- adj = _mm_and_ps(adj, is_smaller); ++- adj = _mm_or_ps(_mm_and_ps(_mm_set_ps1(-1.0f), is_bigger), adj); ++- // scale by distance, sign ++- excess = _mm_mul_ps(_mm_mul_ps(excess, adj), distance); ++- output = _mm_add_ps(input, excess); ++- _mm_store_ps(outPtr, output); ++- inPtr += 4; ++- outPtr += 4; ++- } ++- ++- size_t cnt; ++- for(cnt = quarter_points * 4; cnt < num_points; cnt++){ ++- float val = inputVector[cnt]; ++- if(val < lower_bound){ ++- float excess = lower_bound - val; ++- signed int count = (int)(excess/dist); ++- outputVector[cnt] = val + (count+1)*dist; +++static inline void volk_32f_s32f_s32f_mod_range_32f_u_sse2(float* outputVector, +++ const float* inputVector, +++ const float lower_bound, +++ const float upper_bound, +++ unsigned int num_points) +++{ +++ __m128 lower = _mm_set_ps1(lower_bound); +++ __m128 upper = _mm_set_ps1(upper_bound); +++ __m128 distance = _mm_sub_ps(upper, lower); +++ float dist = upper_bound - lower_bound; +++ __m128 input, output; +++ __m128 is_smaller, is_bigger; +++ __m128 excess, adj; +++ +++ const float* inPtr = inputVector; +++ float* outPtr = outputVector; +++ size_t quarter_points = num_points / 4; +++ size_t counter; +++ for (counter = 0; counter < quarter_points; counter++) { +++ input = _mm_load_ps(inPtr); +++ // calculate mask: input < lower, input > upper +++ is_smaller = _mm_cmplt_ps(input, lower); +++ is_bigger = _mm_cmpgt_ps(input, upper); +++ // find out how far we are out-of-bound – positive values! +++ excess = _mm_and_ps(_mm_sub_ps(lower, input), is_smaller); +++ excess = _mm_or_ps(_mm_and_ps(_mm_sub_ps(input, upper), is_bigger), excess); +++ // how many do we have to add? 
(int(excess/distance+1)*distance) +++ excess = _mm_div_ps(excess, distance); +++ // round down +++ excess = _mm_cvtepi32_ps(_mm_cvttps_epi32(excess)); +++ // plus 1 +++ adj = _mm_set_ps1(1.0f); +++ excess = _mm_add_ps(excess, adj); +++ // get the sign right, adj is still {1.0f,1.0f,1.0f,1.0f} +++ adj = _mm_and_ps(adj, is_smaller); +++ adj = _mm_or_ps(_mm_and_ps(_mm_set_ps1(-1.0f), is_bigger), adj); +++ // scale by distance, sign +++ excess = _mm_mul_ps(_mm_mul_ps(excess, adj), distance); +++ output = _mm_add_ps(input, excess); +++ _mm_store_ps(outPtr, output); +++ inPtr += 4; +++ outPtr += 4; ++ } ++- else if(val > upper_bound){ ++- float excess = val - upper_bound; ++- signed int count = (int)(excess/dist); ++- outputVector[cnt] = val - (count+1)*dist; +++ +++ size_t cnt; +++ for (cnt = quarter_points * 4; cnt < num_points; cnt++) { +++ float val = inputVector[cnt]; +++ if (val < lower_bound) { +++ float excess = lower_bound - val; +++ signed int count = (int)(excess / dist); +++ outputVector[cnt] = val + (count + 1) * dist; +++ } else if (val > upper_bound) { +++ float excess = val - upper_bound; +++ signed int count = (int)(excess / dist); +++ outputVector[cnt] = val - (count + 1) * dist; +++ } else +++ outputVector[cnt] = val; ++ } ++- else ++- outputVector[cnt] = val; ++- } ++ } ++-static inline void volk_32f_s32f_s32f_mod_range_32f_a_sse2(float* outputVector, const float* inputVector, const float lower_bound, const float upper_bound, unsigned int num_points){ ++- __m128 lower = _mm_set_ps1(lower_bound); ++- __m128 upper = _mm_set_ps1(upper_bound); ++- __m128 distance = _mm_sub_ps(upper,lower); ++- __m128 input, output; ++- __m128 is_smaller, is_bigger; ++- __m128 excess, adj; ++- ++- const float *inPtr = inputVector; ++- float *outPtr = outputVector; ++- size_t quarter_points = num_points / 4; ++- size_t counter; ++- for(counter = 0; counter < quarter_points; counter++) { ++- input = _mm_load_ps(inPtr); ++- // calculate mask: input < lower, input > upper ++- is_smaller = _mm_cmplt_ps(input, lower); ++- is_bigger = _mm_cmpgt_ps(input, upper); ++- // find out how far we are out-of-bound – positive values! ++- excess = _mm_and_ps(_mm_sub_ps(lower, input), is_smaller); ++- excess = _mm_or_ps(_mm_and_ps(_mm_sub_ps(input, upper), is_bigger), excess); ++- // how many do we have to add? (int(excess/distance+1)*distance) ++- excess = _mm_div_ps(excess, distance); ++- // round down – for some reason, SSE doesn't come with a 4x float -> 4x int32 conversion. 
++- excess = _mm_cvtepi32_ps(_mm_cvttps_epi32(excess)); ++- // plus 1 ++- adj = _mm_set_ps1(1.0f); ++- excess = _mm_add_ps(excess, adj); ++- // get the sign right, adj is still {1.0f,1.0f,1.0f,1.0f} ++- adj = _mm_and_ps(adj, is_smaller); ++- adj = _mm_or_ps(_mm_and_ps(_mm_set_ps1(-1.0f), is_bigger), adj); ++- // scale by distance, sign ++- excess = _mm_mul_ps(_mm_mul_ps(excess, adj), distance); ++- output = _mm_add_ps(input, excess); ++- _mm_store_ps(outPtr, output); ++- inPtr += 4; ++- outPtr += 4; ++- } ++- ++- float dist = upper_bound - lower_bound; ++- size_t cnt; ++- for(cnt = quarter_points * 4; cnt < num_points; cnt++){ ++- float val = inputVector[cnt]; ++- if(val < lower_bound){ ++- float excess = lower_bound - val; ++- signed int count = (int)(excess/dist); ++- outputVector[cnt] = val + (count+1)*dist; +++static inline void volk_32f_s32f_s32f_mod_range_32f_a_sse2(float* outputVector, +++ const float* inputVector, +++ const float lower_bound, +++ const float upper_bound, +++ unsigned int num_points) +++{ +++ __m128 lower = _mm_set_ps1(lower_bound); +++ __m128 upper = _mm_set_ps1(upper_bound); +++ __m128 distance = _mm_sub_ps(upper, lower); +++ __m128 input, output; +++ __m128 is_smaller, is_bigger; +++ __m128 excess, adj; +++ +++ const float* inPtr = inputVector; +++ float* outPtr = outputVector; +++ size_t quarter_points = num_points / 4; +++ size_t counter; +++ for (counter = 0; counter < quarter_points; counter++) { +++ input = _mm_load_ps(inPtr); +++ // calculate mask: input < lower, input > upper +++ is_smaller = _mm_cmplt_ps(input, lower); +++ is_bigger = _mm_cmpgt_ps(input, upper); +++ // find out how far we are out-of-bound – positive values! +++ excess = _mm_and_ps(_mm_sub_ps(lower, input), is_smaller); +++ excess = _mm_or_ps(_mm_and_ps(_mm_sub_ps(input, upper), is_bigger), excess); +++ // how many do we have to add? (int(excess/distance+1)*distance) +++ excess = _mm_div_ps(excess, distance); +++ // round down – for some reason, SSE doesn't come with a 4x float -> 4x int32 +++ // conversion. 
+++ excess = _mm_cvtepi32_ps(_mm_cvttps_epi32(excess)); +++ // plus 1 +++ adj = _mm_set_ps1(1.0f); +++ excess = _mm_add_ps(excess, adj); +++ // get the sign right, adj is still {1.0f,1.0f,1.0f,1.0f} +++ adj = _mm_and_ps(adj, is_smaller); +++ adj = _mm_or_ps(_mm_and_ps(_mm_set_ps1(-1.0f), is_bigger), adj); +++ // scale by distance, sign +++ excess = _mm_mul_ps(_mm_mul_ps(excess, adj), distance); +++ output = _mm_add_ps(input, excess); +++ _mm_store_ps(outPtr, output); +++ inPtr += 4; +++ outPtr += 4; ++ } ++- else if(val > upper_bound){ ++- float excess = val - upper_bound; ++- signed int count = (int)(excess/dist); ++- outputVector[cnt] = val - (count+1)*dist; +++ +++ float dist = upper_bound - lower_bound; +++ size_t cnt; +++ for (cnt = quarter_points * 4; cnt < num_points; cnt++) { +++ float val = inputVector[cnt]; +++ if (val < lower_bound) { +++ float excess = lower_bound - val; +++ signed int count = (int)(excess / dist); +++ outputVector[cnt] = val + (count + 1) * dist; +++ } else if (val > upper_bound) { +++ float excess = val - upper_bound; +++ signed int count = (int)(excess / dist); +++ outputVector[cnt] = val - (count + 1) * dist; +++ } else +++ outputVector[cnt] = val; ++ } ++- else ++- outputVector[cnt] = val; ++- } ++ } ++ #endif /* LV_HAVE_SSE2 */ ++ ++ #ifdef LV_HAVE_SSE ++ #include ++ ++-static inline void volk_32f_s32f_s32f_mod_range_32f_u_sse(float* outputVector, const float* inputVector, const float lower_bound, const float upper_bound, unsigned int num_points){ ++- __m128 lower = _mm_set_ps1(lower_bound); ++- __m128 upper = _mm_set_ps1(upper_bound); ++- __m128 distance = _mm_sub_ps(upper,lower); ++- float dist = upper_bound - lower_bound; ++- __m128 input, output; ++- __m128 is_smaller, is_bigger; ++- __m128 excess, adj; ++- __m128i rounddown; ++- ++- const float *inPtr = inputVector; ++- float *outPtr = outputVector; ++- size_t quarter_points = num_points / 4; ++- size_t counter; ++- for(counter = 0; counter < quarter_points; counter++) { ++- input = _mm_load_ps(inPtr); ++- // calculate mask: input < lower, input > upper ++- is_smaller = _mm_cmplt_ps(input, lower); ++- is_bigger = _mm_cmpgt_ps(input, upper); ++- // find out how far we are out-of-bound – positive values! ++- excess = _mm_and_ps(_mm_sub_ps(lower, input), is_smaller); ++- excess = _mm_or_ps(_mm_and_ps(_mm_sub_ps(input, upper), is_bigger), excess); ++- // how many do we have to add? 
(int(excess/distance+1)*distance) ++- excess = _mm_div_ps(excess, distance); ++- // round down – for some reason ++- rounddown = _mm_cvttps_epi32(excess); ++- excess = _mm_cvtepi32_ps(rounddown); ++- // plus 1 ++- adj = _mm_set_ps1(1.0f); ++- excess = _mm_add_ps(excess, adj); ++- // get the sign right, adj is still {1.0f,1.0f,1.0f,1.0f} ++- adj = _mm_and_ps(adj, is_smaller); ++- adj = _mm_or_ps(_mm_and_ps(_mm_set_ps1(-1.0f), is_bigger), adj); ++- // scale by distance, sign ++- excess = _mm_mul_ps(_mm_mul_ps(excess, adj), distance); ++- output = _mm_add_ps(input, excess); ++- _mm_store_ps(outPtr, output); ++- inPtr += 4; ++- outPtr += 4; ++- } ++- ++- size_t cnt; ++- for(cnt = quarter_points * 4; cnt < num_points; cnt++){ ++- float val = inputVector[cnt]; ++- if(val < lower_bound){ ++- float excess = lower_bound - val; ++- signed int count = (int)(excess/dist); ++- outputVector[cnt] = val + (count+1)*dist; +++static inline void volk_32f_s32f_s32f_mod_range_32f_u_sse(float* outputVector, +++ const float* inputVector, +++ const float lower_bound, +++ const float upper_bound, +++ unsigned int num_points) +++{ +++ __m128 lower = _mm_set_ps1(lower_bound); +++ __m128 upper = _mm_set_ps1(upper_bound); +++ __m128 distance = _mm_sub_ps(upper, lower); +++ float dist = upper_bound - lower_bound; +++ __m128 input, output; +++ __m128 is_smaller, is_bigger; +++ __m128 excess, adj; +++ __m128i rounddown; +++ +++ const float* inPtr = inputVector; +++ float* outPtr = outputVector; +++ size_t quarter_points = num_points / 4; +++ size_t counter; +++ for (counter = 0; counter < quarter_points; counter++) { +++ input = _mm_load_ps(inPtr); +++ // calculate mask: input < lower, input > upper +++ is_smaller = _mm_cmplt_ps(input, lower); +++ is_bigger = _mm_cmpgt_ps(input, upper); +++ // find out how far we are out-of-bound – positive values! +++ excess = _mm_and_ps(_mm_sub_ps(lower, input), is_smaller); +++ excess = _mm_or_ps(_mm_and_ps(_mm_sub_ps(input, upper), is_bigger), excess); +++ // how many do we have to add? 
(int(excess/distance+1)*distance) +++ excess = _mm_div_ps(excess, distance); +++ // round down – for some reason +++ rounddown = _mm_cvttps_epi32(excess); +++ excess = _mm_cvtepi32_ps(rounddown); +++ // plus 1 +++ adj = _mm_set_ps1(1.0f); +++ excess = _mm_add_ps(excess, adj); +++ // get the sign right, adj is still {1.0f,1.0f,1.0f,1.0f} +++ adj = _mm_and_ps(adj, is_smaller); +++ adj = _mm_or_ps(_mm_and_ps(_mm_set_ps1(-1.0f), is_bigger), adj); +++ // scale by distance, sign +++ excess = _mm_mul_ps(_mm_mul_ps(excess, adj), distance); +++ output = _mm_add_ps(input, excess); +++ _mm_store_ps(outPtr, output); +++ inPtr += 4; +++ outPtr += 4; ++ } ++- else if(val > upper_bound){ ++- float excess = val - upper_bound; ++- signed int count = (int)(excess/dist); ++- outputVector[cnt] = val - (count+1)*dist; +++ +++ size_t cnt; +++ for (cnt = quarter_points * 4; cnt < num_points; cnt++) { +++ float val = inputVector[cnt]; +++ if (val < lower_bound) { +++ float excess = lower_bound - val; +++ signed int count = (int)(excess / dist); +++ outputVector[cnt] = val + (count + 1) * dist; +++ } else if (val > upper_bound) { +++ float excess = val - upper_bound; +++ signed int count = (int)(excess / dist); +++ outputVector[cnt] = val - (count + 1) * dist; +++ } else +++ outputVector[cnt] = val; ++ } ++- else ++- outputVector[cnt] = val; ++- } ++ } ++-static inline void volk_32f_s32f_s32f_mod_range_32f_a_sse(float* outputVector, const float* inputVector, const float lower_bound, const float upper_bound, unsigned int num_points){ ++- __m128 lower = _mm_set_ps1(lower_bound); ++- __m128 upper = _mm_set_ps1(upper_bound); ++- __m128 distance = _mm_sub_ps(upper,lower); ++- __m128 input, output; ++- __m128 is_smaller, is_bigger; ++- __m128 excess, adj; ++- __m128i rounddown; ++- ++- const float *inPtr = inputVector; ++- float *outPtr = outputVector; ++- size_t quarter_points = num_points / 4; ++- size_t counter; ++- for(counter = 0; counter < quarter_points; counter++) { ++- input = _mm_load_ps(inPtr); ++- // calculate mask: input < lower, input > upper ++- is_smaller = _mm_cmplt_ps(input, lower); ++- is_bigger = _mm_cmpgt_ps(input, upper); ++- // find out how far we are out-of-bound – positive values! ++- excess = _mm_and_ps(_mm_sub_ps(lower, input), is_smaller); ++- excess = _mm_or_ps(_mm_and_ps(_mm_sub_ps(input, upper), is_bigger), excess); ++- // how many do we have to add? 
(int(excess/distance+1)*distance) ++- excess = _mm_div_ps(excess, distance); ++- // round down ++- rounddown = _mm_cvttps_epi32(excess); ++- excess = _mm_cvtepi32_ps(rounddown); ++- // plus 1 ++- adj = _mm_set_ps1(1.0f); ++- excess = _mm_add_ps(excess, adj); ++- // get the sign right, adj is still {1.0f,1.0f,1.0f,1.0f} ++- adj = _mm_and_ps(adj, is_smaller); ++- adj = _mm_or_ps(_mm_and_ps(_mm_set_ps1(-1.0f), is_bigger), adj); ++- // scale by distance, sign ++- excess = _mm_mul_ps(_mm_mul_ps(excess, adj), distance); ++- output = _mm_add_ps(input, excess); ++- _mm_store_ps(outPtr, output); ++- inPtr += 4; ++- outPtr += 4; ++- } ++- ++- float dist = upper_bound - lower_bound; ++- size_t cnt; ++- for(cnt = quarter_points * 4; cnt < num_points; cnt++){ ++- float val = inputVector[cnt]; ++- if(val < lower_bound){ ++- float excess = lower_bound - val; ++- signed int count = (int)(excess/dist); ++- outputVector[cnt] = val + (count+1)*dist; +++static inline void volk_32f_s32f_s32f_mod_range_32f_a_sse(float* outputVector, +++ const float* inputVector, +++ const float lower_bound, +++ const float upper_bound, +++ unsigned int num_points) +++{ +++ __m128 lower = _mm_set_ps1(lower_bound); +++ __m128 upper = _mm_set_ps1(upper_bound); +++ __m128 distance = _mm_sub_ps(upper, lower); +++ __m128 input, output; +++ __m128 is_smaller, is_bigger; +++ __m128 excess, adj; +++ __m128i rounddown; +++ +++ const float* inPtr = inputVector; +++ float* outPtr = outputVector; +++ size_t quarter_points = num_points / 4; +++ size_t counter; +++ for (counter = 0; counter < quarter_points; counter++) { +++ input = _mm_load_ps(inPtr); +++ // calculate mask: input < lower, input > upper +++ is_smaller = _mm_cmplt_ps(input, lower); +++ is_bigger = _mm_cmpgt_ps(input, upper); +++ // find out how far we are out-of-bound – positive values! +++ excess = _mm_and_ps(_mm_sub_ps(lower, input), is_smaller); +++ excess = _mm_or_ps(_mm_and_ps(_mm_sub_ps(input, upper), is_bigger), excess); +++ // how many do we have to add? 
(int(excess/distance+1)*distance) +++ excess = _mm_div_ps(excess, distance); +++ // round down +++ rounddown = _mm_cvttps_epi32(excess); +++ excess = _mm_cvtepi32_ps(rounddown); +++ // plus 1 +++ adj = _mm_set_ps1(1.0f); +++ excess = _mm_add_ps(excess, adj); +++ // get the sign right, adj is still {1.0f,1.0f,1.0f,1.0f} +++ adj = _mm_and_ps(adj, is_smaller); +++ adj = _mm_or_ps(_mm_and_ps(_mm_set_ps1(-1.0f), is_bigger), adj); +++ // scale by distance, sign +++ excess = _mm_mul_ps(_mm_mul_ps(excess, adj), distance); +++ output = _mm_add_ps(input, excess); +++ _mm_store_ps(outPtr, output); +++ inPtr += 4; +++ outPtr += 4; ++ } ++- else if(val > upper_bound){ ++- float excess = val - upper_bound; ++- signed int count = (int)(excess/dist); ++- outputVector[cnt] = val - (count+1)*dist; +++ +++ float dist = upper_bound - lower_bound; +++ size_t cnt; +++ for (cnt = quarter_points * 4; cnt < num_points; cnt++) { +++ float val = inputVector[cnt]; +++ if (val < lower_bound) { +++ float excess = lower_bound - val; +++ signed int count = (int)(excess / dist); +++ outputVector[cnt] = val + (count + 1) * dist; +++ } else if (val > upper_bound) { +++ float excess = val - upper_bound; +++ signed int count = (int)(excess / dist); +++ outputVector[cnt] = val - (count + 1) * dist; +++ } else +++ outputVector[cnt] = val; ++ } ++- else ++- outputVector[cnt] = val; ++- } ++ } ++ #endif /* LV_HAVE_SSE */ ++ ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void volk_32f_s32f_s32f_mod_range_32f_generic(float* outputVector, const float* inputVector, const float lower_bound, const float upper_bound, unsigned int num_points){ ++- float* outPtr = outputVector; ++- const float *inPtr; ++- float distance = upper_bound - lower_bound; ++- ++- for(inPtr = inputVector; inPtr < inputVector + num_points; inPtr++){ ++- float val = *inPtr; ++- if(val < lower_bound){ ++- float excess = lower_bound - val; ++- signed int count = (int)(excess/distance); ++- *outPtr = val + (count+1)*distance; ++- } ++- else if(val > upper_bound){ ++- float excess = val - upper_bound; ++- signed int count = (int)(excess/distance); ++- *outPtr = val - (count+1)*distance; +++static inline void volk_32f_s32f_s32f_mod_range_32f_generic(float* outputVector, +++ const float* inputVector, +++ const float lower_bound, +++ const float upper_bound, +++ unsigned int num_points) +++{ +++ float* outPtr = outputVector; +++ const float* inPtr; +++ float distance = upper_bound - lower_bound; +++ +++ for (inPtr = inputVector; inPtr < inputVector + num_points; inPtr++) { +++ float val = *inPtr; +++ if (val < lower_bound) { +++ float excess = lower_bound - val; +++ signed int count = (int)(excess / distance); +++ *outPtr = val + (count + 1) * distance; +++ } else if (val > upper_bound) { +++ float excess = val - upper_bound; +++ signed int count = (int)(excess / distance); +++ *outPtr = val - (count + 1) * distance; +++ } else +++ *outPtr = val; +++ outPtr++; ++ } ++- else ++- *outPtr = val; ++- outPtr++; ++- } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++ ++- ++- ++ #endif /* INCLUDED_VOLK_32F_S32F_S32F_MOD_RANGE_32F_A_H */ ++diff --git a/kernels/volk/volk_32f_s32f_stddev_32f.h b/kernels/volk/volk_32f_s32f_stddev_32f.h ++index 4f3dc1c..0a1c32b 100644 ++--- a/kernels/volk/volk_32f_s32f_stddev_32f.h +++++ b/kernels/volk/volk_32f_s32f_stddev_32f.h ++@@ -29,8 +29,8 @@ ++ * ++ * Dispatcher Prototype ++ * \code ++- * void volk_32f_s32f_stddev_32f(float* stddev, const float* inputBuffer, const float mean, unsigned int num_points) ++- * \endcode +++ * void 
volk_32f_s32f_stddev_32f(float* stddev, const float* inputBuffer, const float +++ * mean, unsigned int num_points) \endcode ++ * ++ * \b Inputs ++ * \li inputBuffer: The input vector of floats. ++@@ -68,65 +68,72 @@ ++ #ifndef INCLUDED_volk_32f_s32f_stddev_32f_a_H ++ #define INCLUDED_volk_32f_s32f_stddev_32f_a_H ++ ++-#include ++ #include ++-#include ++ #include +++#include +++#include ++ ++ #ifdef LV_HAVE_SSE4_1 ++ #include ++ ++-static inline void ++-volk_32f_s32f_stddev_32f_a_sse4_1(float* stddev, const float* inputBuffer, ++- const float mean, unsigned int num_points) +++static inline void volk_32f_s32f_stddev_32f_a_sse4_1(float* stddev, +++ const float* inputBuffer, +++ const float mean, +++ unsigned int num_points) ++ { ++- float returnValue = 0; ++- if(num_points > 0){ ++- unsigned int number = 0; ++- const unsigned int sixteenthPoints = num_points / 16; ++- ++- const float* aPtr = inputBuffer; ++- ++- __VOLK_ATTR_ALIGNED(16) float squareBuffer[4]; ++- ++- __m128 squareAccumulator = _mm_setzero_ps(); ++- __m128 aVal1, aVal2, aVal3, aVal4; ++- __m128 cVal1, cVal2, cVal3, cVal4; ++- for(;number < sixteenthPoints; number++) { ++- aVal1 = _mm_load_ps(aPtr); aPtr += 4; ++- cVal1 = _mm_dp_ps(aVal1, aVal1, 0xF1); ++- ++- aVal2 = _mm_load_ps(aPtr); aPtr += 4; ++- cVal2 = _mm_dp_ps(aVal2, aVal2, 0xF2); ++- ++- aVal3 = _mm_load_ps(aPtr); aPtr += 4; ++- cVal3 = _mm_dp_ps(aVal3, aVal3, 0xF4); ++- ++- aVal4 = _mm_load_ps(aPtr); aPtr += 4; ++- cVal4 = _mm_dp_ps(aVal4, aVal4, 0xF8); ++- ++- cVal1 = _mm_or_ps(cVal1, cVal2); ++- cVal3 = _mm_or_ps(cVal3, cVal4); ++- cVal1 = _mm_or_ps(cVal1, cVal3); ++- ++- squareAccumulator = _mm_add_ps(squareAccumulator, cVal1); // squareAccumulator += x^2 +++ float returnValue = 0; +++ if (num_points > 0) { +++ unsigned int number = 0; +++ const unsigned int sixteenthPoints = num_points / 16; +++ +++ const float* aPtr = inputBuffer; +++ +++ __VOLK_ATTR_ALIGNED(16) float squareBuffer[4]; +++ +++ __m128 squareAccumulator = _mm_setzero_ps(); +++ __m128 aVal1, aVal2, aVal3, aVal4; +++ __m128 cVal1, cVal2, cVal3, cVal4; +++ for (; number < sixteenthPoints; number++) { +++ aVal1 = _mm_load_ps(aPtr); +++ aPtr += 4; +++ cVal1 = _mm_dp_ps(aVal1, aVal1, 0xF1); +++ +++ aVal2 = _mm_load_ps(aPtr); +++ aPtr += 4; +++ cVal2 = _mm_dp_ps(aVal2, aVal2, 0xF2); +++ +++ aVal3 = _mm_load_ps(aPtr); +++ aPtr += 4; +++ cVal3 = _mm_dp_ps(aVal3, aVal3, 0xF4); +++ +++ aVal4 = _mm_load_ps(aPtr); +++ aPtr += 4; +++ cVal4 = _mm_dp_ps(aVal4, aVal4, 0xF8); +++ +++ cVal1 = _mm_or_ps(cVal1, cVal2); +++ cVal3 = _mm_or_ps(cVal3, cVal4); +++ cVal1 = _mm_or_ps(cVal1, cVal3); +++ +++ squareAccumulator = +++ _mm_add_ps(squareAccumulator, cVal1); // squareAccumulator += x^2 +++ } +++ _mm_store_ps(squareBuffer, +++ squareAccumulator); // Store the results back into the C container +++ returnValue = squareBuffer[0]; +++ returnValue += squareBuffer[1]; +++ returnValue += squareBuffer[2]; +++ returnValue += squareBuffer[3]; +++ +++ number = sixteenthPoints * 16; +++ for (; number < num_points; number++) { +++ returnValue += (*aPtr) * (*aPtr); +++ aPtr++; +++ } +++ returnValue /= num_points; +++ returnValue -= (mean * mean); +++ returnValue = sqrtf(returnValue); ++ } ++- _mm_store_ps(squareBuffer,squareAccumulator); // Store the results back into the C container ++- returnValue = squareBuffer[0]; ++- returnValue += squareBuffer[1]; ++- returnValue += squareBuffer[2]; ++- returnValue += squareBuffer[3]; ++- ++- number = sixteenthPoints * 16; ++- for(;number < num_points; number++){ ++- returnValue += (*aPtr) * 
(*aPtr); ++- aPtr++; ++- } ++- returnValue /= num_points; ++- returnValue -= (mean * mean); ++- returnValue = sqrtf(returnValue); ++- } ++- *stddev = returnValue; +++ *stddev = returnValue; ++ } ++ ++ #endif /* LV_HAVE_SSE4_1 */ ++@@ -134,43 +141,45 @@ volk_32f_s32f_stddev_32f_a_sse4_1(float* stddev, const float* inputBuffer, ++ #ifdef LV_HAVE_SSE ++ #include ++ ++-static inline void ++-volk_32f_s32f_stddev_32f_a_sse(float* stddev, const float* inputBuffer, ++- const float mean, unsigned int num_points) +++static inline void volk_32f_s32f_stddev_32f_a_sse(float* stddev, +++ const float* inputBuffer, +++ const float mean, +++ unsigned int num_points) ++ { ++- float returnValue = 0; ++- if(num_points > 0){ ++- unsigned int number = 0; ++- const unsigned int quarterPoints = num_points / 4; ++- ++- const float* aPtr = inputBuffer; ++- ++- __VOLK_ATTR_ALIGNED(16) float squareBuffer[4]; ++- ++- __m128 squareAccumulator = _mm_setzero_ps(); ++- __m128 aVal = _mm_setzero_ps(); ++- for(;number < quarterPoints; number++) { ++- aVal = _mm_load_ps(aPtr); // aVal = x ++- aVal = _mm_mul_ps(aVal, aVal); // squareAccumulator += x^2 ++- squareAccumulator = _mm_add_ps(squareAccumulator, aVal); ++- aPtr += 4; ++- } ++- _mm_store_ps(squareBuffer,squareAccumulator); // Store the results back into the C container ++- returnValue = squareBuffer[0]; ++- returnValue += squareBuffer[1]; ++- returnValue += squareBuffer[2]; ++- returnValue += squareBuffer[3]; ++- ++- number = quarterPoints * 4; ++- for(;number < num_points; number++){ ++- returnValue += (*aPtr) * (*aPtr); ++- aPtr++; +++ float returnValue = 0; +++ if (num_points > 0) { +++ unsigned int number = 0; +++ const unsigned int quarterPoints = num_points / 4; +++ +++ const float* aPtr = inputBuffer; +++ +++ __VOLK_ATTR_ALIGNED(16) float squareBuffer[4]; +++ +++ __m128 squareAccumulator = _mm_setzero_ps(); +++ __m128 aVal = _mm_setzero_ps(); +++ for (; number < quarterPoints; number++) { +++ aVal = _mm_load_ps(aPtr); // aVal = x +++ aVal = _mm_mul_ps(aVal, aVal); // squareAccumulator += x^2 +++ squareAccumulator = _mm_add_ps(squareAccumulator, aVal); +++ aPtr += 4; +++ } +++ _mm_store_ps(squareBuffer, +++ squareAccumulator); // Store the results back into the C container +++ returnValue = squareBuffer[0]; +++ returnValue += squareBuffer[1]; +++ returnValue += squareBuffer[2]; +++ returnValue += squareBuffer[3]; +++ +++ number = quarterPoints * 4; +++ for (; number < num_points; number++) { +++ returnValue += (*aPtr) * (*aPtr); +++ aPtr++; +++ } +++ returnValue /= num_points; +++ returnValue -= (mean * mean); +++ returnValue = sqrtf(returnValue); ++ } ++- returnValue /= num_points; ++- returnValue -= (mean * mean); ++- returnValue = sqrtf(returnValue); ++- } ++- *stddev = returnValue; +++ *stddev = returnValue; ++ } ++ #endif /* LV_HAVE_SSE */ ++ ++@@ -178,86 +187,93 @@ volk_32f_s32f_stddev_32f_a_sse(float* stddev, const float* inputBuffer, ++ #ifdef LV_HAVE_AVX ++ #include ++ ++-static inline void ++-volk_32f_s32f_stddev_32f_a_avx(float* stddev, const float* inputBuffer, ++- const float mean, unsigned int num_points) +++static inline void volk_32f_s32f_stddev_32f_a_avx(float* stddev, +++ const float* inputBuffer, +++ const float mean, +++ unsigned int num_points) ++ { ++- float stdDev = 0; ++- if(num_points > 0){ ++- unsigned int number = 0; ++- const unsigned int thirtySecondthPoints = num_points / 32; ++- ++- const float* aPtr = inputBuffer; ++- __VOLK_ATTR_ALIGNED(32) float squareBuffer[8]; ++- ++- __m256 squareAccumulator = _mm256_setzero_ps(); ++- __m256 
aVal1, aVal2, aVal3, aVal4; ++- __m256 cVal1, cVal2, cVal3, cVal4; ++- for(;number < thirtySecondthPoints; number++) { ++- aVal1 = _mm256_load_ps(aPtr); aPtr += 8; ++- cVal1 = _mm256_dp_ps(aVal1, aVal1, 0xF1); ++- ++- aVal2 = _mm256_load_ps(aPtr); aPtr += 8; ++- cVal2 = _mm256_dp_ps(aVal2, aVal2, 0xF2); ++- ++- aVal3 = _mm256_load_ps(aPtr); aPtr += 8; ++- cVal3 = _mm256_dp_ps(aVal3, aVal3, 0xF4); ++- ++- aVal4 = _mm256_load_ps(aPtr); aPtr += 8; ++- cVal4 = _mm256_dp_ps(aVal4, aVal4, 0xF8); ++- ++- cVal1 = _mm256_or_ps(cVal1, cVal2); ++- cVal3 = _mm256_or_ps(cVal3, cVal4); ++- cVal1 = _mm256_or_ps(cVal1, cVal3); ++- ++- squareAccumulator = _mm256_add_ps(squareAccumulator, cVal1); // squareAccumulator += x^2 +++ float stdDev = 0; +++ if (num_points > 0) { +++ unsigned int number = 0; +++ const unsigned int thirtySecondthPoints = num_points / 32; +++ +++ const float* aPtr = inputBuffer; +++ __VOLK_ATTR_ALIGNED(32) float squareBuffer[8]; +++ +++ __m256 squareAccumulator = _mm256_setzero_ps(); +++ __m256 aVal1, aVal2, aVal3, aVal4; +++ __m256 cVal1, cVal2, cVal3, cVal4; +++ for (; number < thirtySecondthPoints; number++) { +++ aVal1 = _mm256_load_ps(aPtr); +++ aPtr += 8; +++ cVal1 = _mm256_dp_ps(aVal1, aVal1, 0xF1); +++ +++ aVal2 = _mm256_load_ps(aPtr); +++ aPtr += 8; +++ cVal2 = _mm256_dp_ps(aVal2, aVal2, 0xF2); +++ +++ aVal3 = _mm256_load_ps(aPtr); +++ aPtr += 8; +++ cVal3 = _mm256_dp_ps(aVal3, aVal3, 0xF4); +++ +++ aVal4 = _mm256_load_ps(aPtr); +++ aPtr += 8; +++ cVal4 = _mm256_dp_ps(aVal4, aVal4, 0xF8); +++ +++ cVal1 = _mm256_or_ps(cVal1, cVal2); +++ cVal3 = _mm256_or_ps(cVal3, cVal4); +++ cVal1 = _mm256_or_ps(cVal1, cVal3); +++ +++ squareAccumulator = +++ _mm256_add_ps(squareAccumulator, cVal1); // squareAccumulator += x^2 +++ } +++ _mm256_store_ps(squareBuffer, +++ squareAccumulator); // Store the results back into the C container +++ stdDev = squareBuffer[0]; +++ stdDev += squareBuffer[1]; +++ stdDev += squareBuffer[2]; +++ stdDev += squareBuffer[3]; +++ stdDev += squareBuffer[4]; +++ stdDev += squareBuffer[5]; +++ stdDev += squareBuffer[6]; +++ stdDev += squareBuffer[7]; +++ +++ number = thirtySecondthPoints * 32; +++ for (; number < num_points; number++) { +++ stdDev += (*aPtr) * (*aPtr); +++ aPtr++; +++ } +++ stdDev /= num_points; +++ stdDev -= (mean * mean); +++ stdDev = sqrtf(stdDev); ++ } ++- _mm256_store_ps(squareBuffer,squareAccumulator); // Store the results back into the C container ++- stdDev = squareBuffer[0]; ++- stdDev += squareBuffer[1]; ++- stdDev += squareBuffer[2]; ++- stdDev += squareBuffer[3]; ++- stdDev += squareBuffer[4]; ++- stdDev += squareBuffer[5]; ++- stdDev += squareBuffer[6]; ++- stdDev += squareBuffer[7]; ++- ++- number = thirtySecondthPoints * 32; ++- for(;number < num_points; number++){ ++- stdDev += (*aPtr) * (*aPtr); ++- aPtr++; ++- } ++- stdDev /= num_points; ++- stdDev -= (mean * mean); ++- stdDev = sqrtf(stdDev); ++- } ++- *stddev = stdDev; ++- +++ *stddev = stdDev; ++ } ++ #endif /* LV_HAVE_AVX */ ++ ++ ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void ++-volk_32f_s32f_stddev_32f_generic(float* stddev, const float* inputBuffer, ++- const float mean, unsigned int num_points) +++static inline void volk_32f_s32f_stddev_32f_generic(float* stddev, +++ const float* inputBuffer, +++ const float mean, +++ unsigned int num_points) ++ { ++- float returnValue = 0; ++- if(num_points > 0){ ++- const float* aPtr = inputBuffer; ++- unsigned int number = 0; ++- ++- for(number = 0; number < num_points; number++){ ++- returnValue += (*aPtr) * (*aPtr); ++- aPtr++; +++ 
float returnValue = 0; +++ if (num_points > 0) { +++ const float* aPtr = inputBuffer; +++ unsigned int number = 0; +++ +++ for (number = 0; number < num_points; number++) { +++ returnValue += (*aPtr) * (*aPtr); +++ aPtr++; +++ } +++ +++ returnValue /= num_points; +++ returnValue -= (mean * mean); +++ returnValue = sqrtf(returnValue); ++ } ++- ++- returnValue /= num_points; ++- returnValue -= (mean * mean); ++- returnValue = sqrtf(returnValue); ++- } ++- *stddev = returnValue; +++ *stddev = returnValue; ++ } ++ ++ #endif /* LV_HAVE_GENERIC */ ++@@ -268,69 +284,76 @@ volk_32f_s32f_stddev_32f_generic(float* stddev, const float* inputBuffer, ++ #ifndef INCLUDED_volk_32f_s32f_stddev_32f_u_H ++ #define INCLUDED_volk_32f_s32f_stddev_32f_u_H ++ ++-#include ++ #include ++-#include ++ #include +++#include +++#include ++ ++ #ifdef LV_HAVE_AVX ++ #include ++ ++-static inline void ++-volk_32f_s32f_stddev_32f_u_avx(float* stddev, const float* inputBuffer, ++- const float mean, unsigned int num_points) +++static inline void volk_32f_s32f_stddev_32f_u_avx(float* stddev, +++ const float* inputBuffer, +++ const float mean, +++ unsigned int num_points) ++ { ++- float stdDev = 0; ++- if(num_points > 0){ ++- unsigned int number = 0; ++- const unsigned int thirtySecondthPoints = num_points / 32; ++- ++- const float* aPtr = inputBuffer; ++- __VOLK_ATTR_ALIGNED(32) float squareBuffer[8]; ++- ++- __m256 squareAccumulator = _mm256_setzero_ps(); ++- __m256 aVal1, aVal2, aVal3, aVal4; ++- __m256 cVal1, cVal2, cVal3, cVal4; ++- for(;number < thirtySecondthPoints; number++) { ++- aVal1 = _mm256_loadu_ps(aPtr); aPtr += 8; ++- cVal1 = _mm256_dp_ps(aVal1, aVal1, 0xF1); ++- ++- aVal2 = _mm256_loadu_ps(aPtr); aPtr += 8; ++- cVal2 = _mm256_dp_ps(aVal2, aVal2, 0xF2); ++- ++- aVal3 = _mm256_loadu_ps(aPtr); aPtr += 8; ++- cVal3 = _mm256_dp_ps(aVal3, aVal3, 0xF4); ++- ++- aVal4 = _mm256_loadu_ps(aPtr); aPtr += 8; ++- cVal4 = _mm256_dp_ps(aVal4, aVal4, 0xF8); ++- ++- cVal1 = _mm256_or_ps(cVal1, cVal2); ++- cVal3 = _mm256_or_ps(cVal3, cVal4); ++- cVal1 = _mm256_or_ps(cVal1, cVal3); ++- ++- squareAccumulator = _mm256_add_ps(squareAccumulator, cVal1); // squareAccumulator += x^2 +++ float stdDev = 0; +++ if (num_points > 0) { +++ unsigned int number = 0; +++ const unsigned int thirtySecondthPoints = num_points / 32; +++ +++ const float* aPtr = inputBuffer; +++ __VOLK_ATTR_ALIGNED(32) float squareBuffer[8]; +++ +++ __m256 squareAccumulator = _mm256_setzero_ps(); +++ __m256 aVal1, aVal2, aVal3, aVal4; +++ __m256 cVal1, cVal2, cVal3, cVal4; +++ for (; number < thirtySecondthPoints; number++) { +++ aVal1 = _mm256_loadu_ps(aPtr); +++ aPtr += 8; +++ cVal1 = _mm256_dp_ps(aVal1, aVal1, 0xF1); +++ +++ aVal2 = _mm256_loadu_ps(aPtr); +++ aPtr += 8; +++ cVal2 = _mm256_dp_ps(aVal2, aVal2, 0xF2); +++ +++ aVal3 = _mm256_loadu_ps(aPtr); +++ aPtr += 8; +++ cVal3 = _mm256_dp_ps(aVal3, aVal3, 0xF4); +++ +++ aVal4 = _mm256_loadu_ps(aPtr); +++ aPtr += 8; +++ cVal4 = _mm256_dp_ps(aVal4, aVal4, 0xF8); +++ +++ cVal1 = _mm256_or_ps(cVal1, cVal2); +++ cVal3 = _mm256_or_ps(cVal3, cVal4); +++ cVal1 = _mm256_or_ps(cVal1, cVal3); +++ +++ squareAccumulator = +++ _mm256_add_ps(squareAccumulator, cVal1); // squareAccumulator += x^2 +++ } +++ _mm256_storeu_ps( +++ squareBuffer, +++ squareAccumulator); // Store the results back into the C container +++ stdDev = squareBuffer[0]; +++ stdDev += squareBuffer[1]; +++ stdDev += squareBuffer[2]; +++ stdDev += squareBuffer[3]; +++ stdDev += squareBuffer[4]; +++ stdDev += squareBuffer[5]; +++ stdDev += squareBuffer[6]; +++ 
stdDev += squareBuffer[7]; +++ +++ number = thirtySecondthPoints * 32; +++ for (; number < num_points; number++) { +++ stdDev += (*aPtr) * (*aPtr); +++ aPtr++; +++ } +++ stdDev /= num_points; +++ stdDev -= (mean * mean); +++ stdDev = sqrtf(stdDev); ++ } ++- _mm256_storeu_ps(squareBuffer,squareAccumulator); // Store the results back into the C container ++- stdDev = squareBuffer[0]; ++- stdDev += squareBuffer[1]; ++- stdDev += squareBuffer[2]; ++- stdDev += squareBuffer[3]; ++- stdDev += squareBuffer[4]; ++- stdDev += squareBuffer[5]; ++- stdDev += squareBuffer[6]; ++- stdDev += squareBuffer[7]; ++- ++- number = thirtySecondthPoints * 32; ++- for(;number < num_points; number++){ ++- stdDev += (*aPtr) * (*aPtr); ++- aPtr++; ++- } ++- stdDev /= num_points; ++- stdDev -= (mean * mean); ++- stdDev = sqrtf(stdDev); ++- } ++- *stddev = stdDev; ++- +++ *stddev = stdDev; ++ } ++ #endif /* LV_HAVE_AVX */ ++ ++diff --git a/kernels/volk/volk_32f_sin_32f.h b/kernels/volk/volk_32f_sin_32f.h ++index 3780086..e65f25a 100644 ++--- a/kernels/volk/volk_32f_sin_32f.h +++++ b/kernels/volk/volk_32f_sin_32f.h ++@@ -69,9 +69,9 @@ ++ * \endcode ++ */ ++ ++-#include ++-#include ++ #include +++#include +++#include ++ ++ #ifndef INCLUDED_volk_32f_sin_32f_a_H ++ #define INCLUDED_volk_32f_sin_32f_a_H ++@@ -83,72 +83,93 @@ ++ static inline void ++ volk_32f_sin_32f_a_avx2_fma(float* bVector, const float* aVector, unsigned int num_points) ++ { ++- float* bPtr = bVector; ++- const float* aPtr = aVector; ++- ++- unsigned int number = 0; ++- unsigned int eighthPoints = num_points / 8; ++- unsigned int i = 0; ++- ++- __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, fzeroes; ++- __m256 sine, cosine, condition1, condition2; ++- __m256i q, r, ones, twos, fours; ++- ++- m4pi = _mm256_set1_ps(1.273239545); ++- pio4A = _mm256_set1_ps(0.78515625); ++- pio4B = _mm256_set1_ps(0.241876e-3); ++- ffours = _mm256_set1_ps(4.0); ++- ftwos = _mm256_set1_ps(2.0); ++- fones = _mm256_set1_ps(1.0); ++- fzeroes = _mm256_setzero_ps(); ++- ones = _mm256_set1_epi32(1); ++- twos = _mm256_set1_epi32(2); ++- fours = _mm256_set1_epi32(4); ++- ++- cp1 = _mm256_set1_ps(1.0); ++- cp2 = _mm256_set1_ps(0.83333333e-1); ++- cp3 = _mm256_set1_ps(0.2777778e-2); ++- cp4 = _mm256_set1_ps(0.49603e-4); ++- cp5 = _mm256_set1_ps(0.551e-6); ++- ++- for(;number < eighthPoints; number++) { ++- aVal = _mm256_load_ps(aPtr); ++- s = _mm256_sub_ps(aVal, _mm256_and_ps(_mm256_mul_ps(aVal, ftwos), _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS))); ++- q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi))); ++- r = _mm256_add_epi32(q, _mm256_and_si256(q, ones)); ++- ++- s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4A, s); ++- s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4B, s); ++- ++- s = _mm256_div_ps(s, _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction ++- s = _mm256_mul_ps(s, s); ++- // Evaluate Taylor series ++- s = _mm256_mul_ps(_mm256_fmadd_ps(_mm256_fmsub_ps(_mm256_fmadd_ps(_mm256_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2), s, cp1), s); ++- ++- for(i = 0; i < 3; i++) { ++- s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s)); +++ float* bPtr = bVector; +++ const float* aPtr = aVector; +++ +++ unsigned int number = 0; +++ unsigned int eighthPoints = num_points / 8; +++ unsigned int i = 0; +++ +++ __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, +++ fzeroes; +++ __m256 sine, cosine, condition1, condition2; +++ __m256i q, r, ones, twos, fours; +++ +++ m4pi = 
_mm256_set1_ps(1.273239545); +++ pio4A = _mm256_set1_ps(0.78515625); +++ pio4B = _mm256_set1_ps(0.241876e-3); +++ ffours = _mm256_set1_ps(4.0); +++ ftwos = _mm256_set1_ps(2.0); +++ fones = _mm256_set1_ps(1.0); +++ fzeroes = _mm256_setzero_ps(); +++ ones = _mm256_set1_epi32(1); +++ twos = _mm256_set1_epi32(2); +++ fours = _mm256_set1_epi32(4); +++ +++ cp1 = _mm256_set1_ps(1.0); +++ cp2 = _mm256_set1_ps(0.83333333e-1); +++ cp3 = _mm256_set1_ps(0.2777778e-2); +++ cp4 = _mm256_set1_ps(0.49603e-4); +++ cp5 = _mm256_set1_ps(0.551e-6); +++ +++ for (; number < eighthPoints; number++) { +++ aVal = _mm256_load_ps(aPtr); +++ s = _mm256_sub_ps(aVal, +++ _mm256_and_ps(_mm256_mul_ps(aVal, ftwos), +++ _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS))); +++ q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi))); +++ r = _mm256_add_epi32(q, _mm256_and_si256(q, ones)); +++ +++ s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4A, s); +++ s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4B, s); +++ +++ s = _mm256_div_ps( +++ s, +++ _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction +++ s = _mm256_mul_ps(s, s); +++ // Evaluate Taylor series +++ s = _mm256_mul_ps( +++ _mm256_fmadd_ps( +++ _mm256_fmsub_ps( +++ _mm256_fmadd_ps(_mm256_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2), +++ s, +++ cp1), +++ s); +++ +++ for (i = 0; i < 3; i++) { +++ s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s)); +++ } +++ s = _mm256_div_ps(s, ftwos); +++ +++ sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s)); +++ cosine = _mm256_sub_ps(fones, s); +++ +++ condition1 = _mm256_cmp_ps( +++ _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)), +++ fzeroes, +++ _CMP_NEQ_UQ); +++ condition2 = _mm256_cmp_ps( +++ _mm256_cmp_ps( +++ _mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes, _CMP_NEQ_UQ), +++ _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS), +++ _CMP_NEQ_UQ); +++ // Need this condition only for cos +++ // condition3 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, +++ // twos), fours)), fzeroes); +++ +++ sine = +++ _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(cosine, sine), condition1)); +++ sine = _mm256_sub_ps( +++ sine, _mm256_and_ps(_mm256_mul_ps(sine, _mm256_set1_ps(2.0f)), condition2)); +++ _mm256_store_ps(bPtr, sine); +++ aPtr += 8; +++ bPtr += 8; +++ } +++ +++ number = eighthPoints * 8; +++ for (; number < num_points; number++) { +++ *bPtr++ = sin(*aPtr++); ++ } ++- s = _mm256_div_ps(s, ftwos); ++- ++- sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s)); ++- cosine = _mm256_sub_ps(fones, s); ++- ++- condition1 = _mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)), fzeroes, _CMP_NEQ_UQ); ++- condition2 = _mm256_cmp_ps(_mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes, _CMP_NEQ_UQ), _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS), _CMP_NEQ_UQ); ++- // Need this condition only for cos ++- //condition3 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, twos), fours)), fzeroes); ++- ++- sine = _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(cosine, sine), condition1)); ++- sine = _mm256_sub_ps(sine, _mm256_and_ps(_mm256_mul_ps(sine, _mm256_set1_ps(2.0f)), condition2)); ++- _mm256_store_ps(bPtr, sine); ++- aPtr += 8; ++- bPtr += 8; ++- } ++- ++- number = eighthPoints * 8; ++- for(;number < num_points; number++) { ++- *bPtr++ = sin(*aPtr++); ++- } ++ } ++ ++ #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for aligned */ ++@@ -159,72 +180,100 @@ volk_32f_sin_32f_a_avx2_fma(float* bVector, const float* aVector, 
unsigned int n ++ static inline void ++ volk_32f_sin_32f_a_avx2(float* bVector, const float* aVector, unsigned int num_points) ++ { ++- float* bPtr = bVector; ++- const float* aPtr = aVector; ++- ++- unsigned int number = 0; ++- unsigned int eighthPoints = num_points / 8; ++- unsigned int i = 0; ++- ++- __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, fzeroes; ++- __m256 sine, cosine, condition1, condition2; ++- __m256i q, r, ones, twos, fours; ++- ++- m4pi = _mm256_set1_ps(1.273239545); ++- pio4A = _mm256_set1_ps(0.78515625); ++- pio4B = _mm256_set1_ps(0.241876e-3); ++- ffours = _mm256_set1_ps(4.0); ++- ftwos = _mm256_set1_ps(2.0); ++- fones = _mm256_set1_ps(1.0); ++- fzeroes = _mm256_setzero_ps(); ++- ones = _mm256_set1_epi32(1); ++- twos = _mm256_set1_epi32(2); ++- fours = _mm256_set1_epi32(4); ++- ++- cp1 = _mm256_set1_ps(1.0); ++- cp2 = _mm256_set1_ps(0.83333333e-1); ++- cp3 = _mm256_set1_ps(0.2777778e-2); ++- cp4 = _mm256_set1_ps(0.49603e-4); ++- cp5 = _mm256_set1_ps(0.551e-6); ++- ++- for(;number < eighthPoints; number++) { ++- aVal = _mm256_load_ps(aPtr); ++- s = _mm256_sub_ps(aVal, _mm256_and_ps(_mm256_mul_ps(aVal, ftwos), _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS))); ++- q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi))); ++- r = _mm256_add_epi32(q, _mm256_and_si256(q, ones)); ++- ++- s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4A)); ++- s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4B)); ++- ++- s = _mm256_div_ps(s, _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction ++- s = _mm256_mul_ps(s, s); ++- // Evaluate Taylor series ++- s = _mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(s, cp5), cp4), s), cp3), s), cp2), s), cp1), s); ++- ++- for(i = 0; i < 3; i++) { ++- s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s)); +++ float* bPtr = bVector; +++ const float* aPtr = aVector; +++ +++ unsigned int number = 0; +++ unsigned int eighthPoints = num_points / 8; +++ unsigned int i = 0; +++ +++ __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, +++ fzeroes; +++ __m256 sine, cosine, condition1, condition2; +++ __m256i q, r, ones, twos, fours; +++ +++ m4pi = _mm256_set1_ps(1.273239545); +++ pio4A = _mm256_set1_ps(0.78515625); +++ pio4B = _mm256_set1_ps(0.241876e-3); +++ ffours = _mm256_set1_ps(4.0); +++ ftwos = _mm256_set1_ps(2.0); +++ fones = _mm256_set1_ps(1.0); +++ fzeroes = _mm256_setzero_ps(); +++ ones = _mm256_set1_epi32(1); +++ twos = _mm256_set1_epi32(2); +++ fours = _mm256_set1_epi32(4); +++ +++ cp1 = _mm256_set1_ps(1.0); +++ cp2 = _mm256_set1_ps(0.83333333e-1); +++ cp3 = _mm256_set1_ps(0.2777778e-2); +++ cp4 = _mm256_set1_ps(0.49603e-4); +++ cp5 = _mm256_set1_ps(0.551e-6); +++ +++ for (; number < eighthPoints; number++) { +++ aVal = _mm256_load_ps(aPtr); +++ s = _mm256_sub_ps(aVal, +++ _mm256_and_ps(_mm256_mul_ps(aVal, ftwos), +++ _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS))); +++ q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi))); +++ r = _mm256_add_epi32(q, _mm256_and_si256(q, ones)); +++ +++ s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4A)); +++ s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4B)); +++ +++ s = _mm256_div_ps( +++ s, +++ _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction +++ s = _mm256_mul_ps(s, s); +++ // Evaluate Taylor series +++ s = _mm256_mul_ps( +++ _mm256_add_ps( +++ _mm256_mul_ps( +++ 
_mm256_sub_ps( +++ _mm256_mul_ps( +++ _mm256_add_ps( +++ _mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(s, cp5), cp4), +++ s), +++ cp3), +++ s), +++ cp2), +++ s), +++ cp1), +++ s); +++ +++ for (i = 0; i < 3; i++) { +++ s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s)); +++ } +++ s = _mm256_div_ps(s, ftwos); +++ +++ sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s)); +++ cosine = _mm256_sub_ps(fones, s); +++ +++ condition1 = _mm256_cmp_ps( +++ _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)), +++ fzeroes, +++ _CMP_NEQ_UQ); +++ condition2 = _mm256_cmp_ps( +++ _mm256_cmp_ps( +++ _mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes, _CMP_NEQ_UQ), +++ _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS), +++ _CMP_NEQ_UQ); +++ // Need this condition only for cos +++ // condition3 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, +++ // twos), fours)), fzeroes); +++ +++ sine = +++ _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(cosine, sine), condition1)); +++ sine = _mm256_sub_ps( +++ sine, _mm256_and_ps(_mm256_mul_ps(sine, _mm256_set1_ps(2.0f)), condition2)); +++ _mm256_store_ps(bPtr, sine); +++ aPtr += 8; +++ bPtr += 8; +++ } +++ +++ number = eighthPoints * 8; +++ for (; number < num_points; number++) { +++ *bPtr++ = sin(*aPtr++); ++ } ++- s = _mm256_div_ps(s, ftwos); ++- ++- sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s)); ++- cosine = _mm256_sub_ps(fones, s); ++- ++- condition1 = _mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)), fzeroes, _CMP_NEQ_UQ); ++- condition2 = _mm256_cmp_ps(_mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes, _CMP_NEQ_UQ), _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS), _CMP_NEQ_UQ); ++- // Need this condition only for cos ++- //condition3 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, twos), fours)), fzeroes); ++- ++- sine = _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(cosine, sine), condition1)); ++- sine = _mm256_sub_ps(sine, _mm256_and_ps(_mm256_mul_ps(sine, _mm256_set1_ps(2.0f)), condition2)); ++- _mm256_store_ps(bPtr, sine); ++- aPtr += 8; ++- bPtr += 8; ++- } ++- ++- number = eighthPoints * 8; ++- for(;number < num_points; number++) { ++- *bPtr++ = sin(*aPtr++); ++- } ++ } ++ ++ #endif /* LV_HAVE_AVX2 for aligned */ ++@@ -235,72 +284,91 @@ volk_32f_sin_32f_a_avx2(float* bVector, const float* aVector, unsigned int num_p ++ static inline void ++ volk_32f_sin_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int num_points) ++ { ++- float* bPtr = bVector; ++- const float* aPtr = aVector; ++- ++- unsigned int number = 0; ++- unsigned int quarterPoints = num_points / 4; ++- unsigned int i = 0; ++- ++- __m128 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, fzeroes; ++- __m128 sine, cosine, condition1, condition2; ++- __m128i q, r, ones, twos, fours; ++- ++- m4pi = _mm_set1_ps(1.273239545); ++- pio4A = _mm_set1_ps(0.78515625); ++- pio4B = _mm_set1_ps(0.241876e-3); ++- ffours = _mm_set1_ps(4.0); ++- ftwos = _mm_set1_ps(2.0); ++- fones = _mm_set1_ps(1.0); ++- fzeroes = _mm_setzero_ps(); ++- ones = _mm_set1_epi32(1); ++- twos = _mm_set1_epi32(2); ++- fours = _mm_set1_epi32(4); ++- ++- cp1 = _mm_set1_ps(1.0); ++- cp2 = _mm_set1_ps(0.83333333e-1); ++- cp3 = _mm_set1_ps(0.2777778e-2); ++- cp4 = _mm_set1_ps(0.49603e-4); ++- cp5 = _mm_set1_ps(0.551e-6); ++- ++- for(;number < quarterPoints; number++) { ++- aVal = _mm_load_ps(aPtr); ++- s = _mm_sub_ps(aVal, _mm_and_ps(_mm_mul_ps(aVal, ftwos), _mm_cmplt_ps(aVal, fzeroes))); ++- 
q = _mm_cvtps_epi32(_mm_floor_ps(_mm_mul_ps(s, m4pi))); ++- r = _mm_add_epi32(q, _mm_and_si128(q, ones)); ++- ++- s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4A)); ++- s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4B)); ++- ++- s = _mm_div_ps(s, _mm_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction ++- s = _mm_mul_ps(s, s); ++- // Evaluate Taylor series ++- s = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s, cp5), cp4), s), cp3), s), cp2), s), cp1), s); ++- ++- for(i = 0; i < 3; i++) { ++- s = _mm_mul_ps(s, _mm_sub_ps(ffours, s)); +++ float* bPtr = bVector; +++ const float* aPtr = aVector; +++ +++ unsigned int number = 0; +++ unsigned int quarterPoints = num_points / 4; +++ unsigned int i = 0; +++ +++ __m128 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, +++ fzeroes; +++ __m128 sine, cosine, condition1, condition2; +++ __m128i q, r, ones, twos, fours; +++ +++ m4pi = _mm_set1_ps(1.273239545); +++ pio4A = _mm_set1_ps(0.78515625); +++ pio4B = _mm_set1_ps(0.241876e-3); +++ ffours = _mm_set1_ps(4.0); +++ ftwos = _mm_set1_ps(2.0); +++ fones = _mm_set1_ps(1.0); +++ fzeroes = _mm_setzero_ps(); +++ ones = _mm_set1_epi32(1); +++ twos = _mm_set1_epi32(2); +++ fours = _mm_set1_epi32(4); +++ +++ cp1 = _mm_set1_ps(1.0); +++ cp2 = _mm_set1_ps(0.83333333e-1); +++ cp3 = _mm_set1_ps(0.2777778e-2); +++ cp4 = _mm_set1_ps(0.49603e-4); +++ cp5 = _mm_set1_ps(0.551e-6); +++ +++ for (; number < quarterPoints; number++) { +++ aVal = _mm_load_ps(aPtr); +++ s = _mm_sub_ps(aVal, +++ _mm_and_ps(_mm_mul_ps(aVal, ftwos), _mm_cmplt_ps(aVal, fzeroes))); +++ q = _mm_cvtps_epi32(_mm_floor_ps(_mm_mul_ps(s, m4pi))); +++ r = _mm_add_epi32(q, _mm_and_si128(q, ones)); +++ +++ s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4A)); +++ s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4B)); +++ +++ s = _mm_div_ps( +++ s, _mm_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction +++ s = _mm_mul_ps(s, s); +++ // Evaluate Taylor series +++ s = _mm_mul_ps( +++ _mm_add_ps( +++ _mm_mul_ps( +++ _mm_sub_ps( +++ _mm_mul_ps( +++ _mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s, cp5), cp4), s), +++ cp3), +++ s), +++ cp2), +++ s), +++ cp1), +++ s); +++ +++ for (i = 0; i < 3; i++) { +++ s = _mm_mul_ps(s, _mm_sub_ps(ffours, s)); +++ } +++ s = _mm_div_ps(s, ftwos); +++ +++ sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s)); +++ cosine = _mm_sub_ps(fones, s); +++ +++ condition1 = _mm_cmpneq_ps( +++ _mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, ones), twos)), fzeroes); +++ condition2 = _mm_cmpneq_ps( +++ _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(q, fours)), fzeroes), +++ _mm_cmplt_ps(aVal, fzeroes)); +++ // Need this condition only for cos +++ // condition3 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, +++ // twos), fours)), fzeroes); +++ +++ sine = _mm_add_ps(sine, _mm_and_ps(_mm_sub_ps(cosine, sine), condition1)); +++ sine = +++ _mm_sub_ps(sine, _mm_and_ps(_mm_mul_ps(sine, _mm_set1_ps(2.0f)), condition2)); +++ _mm_store_ps(bPtr, sine); +++ aPtr += 4; +++ bPtr += 4; +++ } +++ +++ number = quarterPoints * 4; +++ for (; number < num_points; number++) { +++ *bPtr++ = sinf(*aPtr++); ++ } ++- s = _mm_div_ps(s, ftwos); ++- ++- sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s)); ++- cosine = _mm_sub_ps(fones, s); ++- ++- condition1 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, ones), twos)), fzeroes); ++- condition2 = 
_mm_cmpneq_ps(_mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(q, fours)), fzeroes), _mm_cmplt_ps(aVal, fzeroes)); ++- // Need this condition only for cos ++- //condition3 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, twos), fours)), fzeroes); ++- ++- sine = _mm_add_ps(sine, _mm_and_ps(_mm_sub_ps(cosine, sine), condition1)); ++- sine = _mm_sub_ps(sine, _mm_and_ps(_mm_mul_ps(sine, _mm_set1_ps(2.0f)), condition2)); ++- _mm_store_ps(bPtr, sine); ++- aPtr += 4; ++- bPtr += 4; ++- } ++- ++- number = quarterPoints * 4; ++- for(;number < num_points; number++) { ++- *bPtr++ = sinf(*aPtr++); ++- } ++ } ++ ++ #endif /* LV_HAVE_SSE4_1 for aligned */ ++@@ -317,72 +385,93 @@ volk_32f_sin_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int num ++ static inline void ++ volk_32f_sin_32f_u_avx2_fma(float* bVector, const float* aVector, unsigned int num_points) ++ { ++- float* bPtr = bVector; ++- const float* aPtr = aVector; ++- ++- unsigned int number = 0; ++- unsigned int eighthPoints = num_points / 8; ++- unsigned int i = 0; ++- ++- __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, fzeroes; ++- __m256 sine, cosine, condition1, condition2; ++- __m256i q, r, ones, twos, fours; ++- ++- m4pi = _mm256_set1_ps(1.273239545); ++- pio4A = _mm256_set1_ps(0.78515625); ++- pio4B = _mm256_set1_ps(0.241876e-3); ++- ffours = _mm256_set1_ps(4.0); ++- ftwos = _mm256_set1_ps(2.0); ++- fones = _mm256_set1_ps(1.0); ++- fzeroes = _mm256_setzero_ps(); ++- ones = _mm256_set1_epi32(1); ++- twos = _mm256_set1_epi32(2); ++- fours = _mm256_set1_epi32(4); ++- ++- cp1 = _mm256_set1_ps(1.0); ++- cp2 = _mm256_set1_ps(0.83333333e-1); ++- cp3 = _mm256_set1_ps(0.2777778e-2); ++- cp4 = _mm256_set1_ps(0.49603e-4); ++- cp5 = _mm256_set1_ps(0.551e-6); ++- ++- for(;number < eighthPoints; number++) { ++- aVal = _mm256_loadu_ps(aPtr); ++- s = _mm256_sub_ps(aVal, _mm256_and_ps(_mm256_mul_ps(aVal, ftwos), _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS))); ++- q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi))); ++- r = _mm256_add_epi32(q, _mm256_and_si256(q, ones)); ++- ++- s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4A, s); ++- s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4B, s); ++- ++- s = _mm256_div_ps(s, _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction ++- s = _mm256_mul_ps(s, s); ++- // Evaluate Taylor series ++- s = _mm256_mul_ps(_mm256_fmadd_ps(_mm256_fmsub_ps(_mm256_fmadd_ps(_mm256_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2), s, cp1), s); ++- ++- for(i = 0; i < 3; i++) { ++- s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s)); +++ float* bPtr = bVector; +++ const float* aPtr = aVector; +++ +++ unsigned int number = 0; +++ unsigned int eighthPoints = num_points / 8; +++ unsigned int i = 0; +++ +++ __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, +++ fzeroes; +++ __m256 sine, cosine, condition1, condition2; +++ __m256i q, r, ones, twos, fours; +++ +++ m4pi = _mm256_set1_ps(1.273239545); +++ pio4A = _mm256_set1_ps(0.78515625); +++ pio4B = _mm256_set1_ps(0.241876e-3); +++ ffours = _mm256_set1_ps(4.0); +++ ftwos = _mm256_set1_ps(2.0); +++ fones = _mm256_set1_ps(1.0); +++ fzeroes = _mm256_setzero_ps(); +++ ones = _mm256_set1_epi32(1); +++ twos = _mm256_set1_epi32(2); +++ fours = _mm256_set1_epi32(4); +++ +++ cp1 = _mm256_set1_ps(1.0); +++ cp2 = _mm256_set1_ps(0.83333333e-1); +++ cp3 = _mm256_set1_ps(0.2777778e-2); +++ cp4 = _mm256_set1_ps(0.49603e-4); +++ cp5 = _mm256_set1_ps(0.551e-6); +++ +++ for (; number < eighthPoints; 
number++) { +++ aVal = _mm256_loadu_ps(aPtr); +++ s = _mm256_sub_ps(aVal, +++ _mm256_and_ps(_mm256_mul_ps(aVal, ftwos), +++ _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS))); +++ q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi))); +++ r = _mm256_add_epi32(q, _mm256_and_si256(q, ones)); +++ +++ s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4A, s); +++ s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4B, s); +++ +++ s = _mm256_div_ps( +++ s, +++ _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction +++ s = _mm256_mul_ps(s, s); +++ // Evaluate Taylor series +++ s = _mm256_mul_ps( +++ _mm256_fmadd_ps( +++ _mm256_fmsub_ps( +++ _mm256_fmadd_ps(_mm256_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2), +++ s, +++ cp1), +++ s); +++ +++ for (i = 0; i < 3; i++) { +++ s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s)); +++ } +++ s = _mm256_div_ps(s, ftwos); +++ +++ sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s)); +++ cosine = _mm256_sub_ps(fones, s); +++ +++ condition1 = _mm256_cmp_ps( +++ _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)), +++ fzeroes, +++ _CMP_NEQ_UQ); +++ condition2 = _mm256_cmp_ps( +++ _mm256_cmp_ps( +++ _mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes, _CMP_NEQ_UQ), +++ _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS), +++ _CMP_NEQ_UQ); +++ // Need this condition only for cos +++ // condition3 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, +++ // twos), fours)), fzeroes); +++ +++ sine = +++ _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(cosine, sine), condition1)); +++ sine = _mm256_sub_ps( +++ sine, _mm256_and_ps(_mm256_mul_ps(sine, _mm256_set1_ps(2.0f)), condition2)); +++ _mm256_storeu_ps(bPtr, sine); +++ aPtr += 8; +++ bPtr += 8; +++ } +++ +++ number = eighthPoints * 8; +++ for (; number < num_points; number++) { +++ *bPtr++ = sin(*aPtr++); ++ } ++- s = _mm256_div_ps(s, ftwos); ++- ++- sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s)); ++- cosine = _mm256_sub_ps(fones, s); ++- ++- condition1 = _mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)), fzeroes, _CMP_NEQ_UQ); ++- condition2 = _mm256_cmp_ps(_mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes, _CMP_NEQ_UQ), _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS), _CMP_NEQ_UQ); ++- // Need this condition only for cos ++- //condition3 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, twos), fours)), fzeroes); ++- ++- sine = _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(cosine, sine), condition1)); ++- sine = _mm256_sub_ps(sine, _mm256_and_ps(_mm256_mul_ps(sine, _mm256_set1_ps(2.0f)), condition2)); ++- _mm256_storeu_ps(bPtr, sine); ++- aPtr += 8; ++- bPtr += 8; ++- } ++- ++- number = eighthPoints * 8; ++- for(;number < num_points; number++) { ++- *bPtr++ = sin(*aPtr++); ++- } ++ } ++ ++ #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for unaligned */ ++@@ -393,72 +482,100 @@ volk_32f_sin_32f_u_avx2_fma(float* bVector, const float* aVector, unsigned int n ++ static inline void ++ volk_32f_sin_32f_u_avx2(float* bVector, const float* aVector, unsigned int num_points) ++ { ++- float* bPtr = bVector; ++- const float* aPtr = aVector; ++- ++- unsigned int number = 0; ++- unsigned int eighthPoints = num_points / 8; ++- unsigned int i = 0; ++- ++- __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, fzeroes; ++- __m256 sine, cosine, condition1, condition2; ++- __m256i q, r, ones, twos, fours; ++- ++- m4pi = _mm256_set1_ps(1.273239545); ++- pio4A = _mm256_set1_ps(0.78515625); ++- 
pio4B = _mm256_set1_ps(0.241876e-3); ++- ffours = _mm256_set1_ps(4.0); ++- ftwos = _mm256_set1_ps(2.0); ++- fones = _mm256_set1_ps(1.0); ++- fzeroes = _mm256_setzero_ps(); ++- ones = _mm256_set1_epi32(1); ++- twos = _mm256_set1_epi32(2); ++- fours = _mm256_set1_epi32(4); ++- ++- cp1 = _mm256_set1_ps(1.0); ++- cp2 = _mm256_set1_ps(0.83333333e-1); ++- cp3 = _mm256_set1_ps(0.2777778e-2); ++- cp4 = _mm256_set1_ps(0.49603e-4); ++- cp5 = _mm256_set1_ps(0.551e-6); ++- ++- for(;number < eighthPoints; number++) { ++- aVal = _mm256_loadu_ps(aPtr); ++- s = _mm256_sub_ps(aVal, _mm256_and_ps(_mm256_mul_ps(aVal, ftwos), _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS))); ++- q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi))); ++- r = _mm256_add_epi32(q, _mm256_and_si256(q, ones)); ++- ++- s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4A)); ++- s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4B)); ++- ++- s = _mm256_div_ps(s, _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction ++- s = _mm256_mul_ps(s, s); ++- // Evaluate Taylor series ++- s = _mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(s, cp5), cp4), s), cp3), s), cp2), s), cp1), s); ++- ++- for(i = 0; i < 3; i++) { ++- s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s)); +++ float* bPtr = bVector; +++ const float* aPtr = aVector; +++ +++ unsigned int number = 0; +++ unsigned int eighthPoints = num_points / 8; +++ unsigned int i = 0; +++ +++ __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, +++ fzeroes; +++ __m256 sine, cosine, condition1, condition2; +++ __m256i q, r, ones, twos, fours; +++ +++ m4pi = _mm256_set1_ps(1.273239545); +++ pio4A = _mm256_set1_ps(0.78515625); +++ pio4B = _mm256_set1_ps(0.241876e-3); +++ ffours = _mm256_set1_ps(4.0); +++ ftwos = _mm256_set1_ps(2.0); +++ fones = _mm256_set1_ps(1.0); +++ fzeroes = _mm256_setzero_ps(); +++ ones = _mm256_set1_epi32(1); +++ twos = _mm256_set1_epi32(2); +++ fours = _mm256_set1_epi32(4); +++ +++ cp1 = _mm256_set1_ps(1.0); +++ cp2 = _mm256_set1_ps(0.83333333e-1); +++ cp3 = _mm256_set1_ps(0.2777778e-2); +++ cp4 = _mm256_set1_ps(0.49603e-4); +++ cp5 = _mm256_set1_ps(0.551e-6); +++ +++ for (; number < eighthPoints; number++) { +++ aVal = _mm256_loadu_ps(aPtr); +++ s = _mm256_sub_ps(aVal, +++ _mm256_and_ps(_mm256_mul_ps(aVal, ftwos), +++ _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS))); +++ q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi))); +++ r = _mm256_add_epi32(q, _mm256_and_si256(q, ones)); +++ +++ s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4A)); +++ s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4B)); +++ +++ s = _mm256_div_ps( +++ s, +++ _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction +++ s = _mm256_mul_ps(s, s); +++ // Evaluate Taylor series +++ s = _mm256_mul_ps( +++ _mm256_add_ps( +++ _mm256_mul_ps( +++ _mm256_sub_ps( +++ _mm256_mul_ps( +++ _mm256_add_ps( +++ _mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(s, cp5), cp4), +++ s), +++ cp3), +++ s), +++ cp2), +++ s), +++ cp1), +++ s); +++ +++ for (i = 0; i < 3; i++) { +++ s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s)); +++ } +++ s = _mm256_div_ps(s, ftwos); +++ +++ sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s)); +++ cosine = _mm256_sub_ps(fones, s); +++ +++ condition1 = _mm256_cmp_ps( +++ _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)), +++ fzeroes, +++ _CMP_NEQ_UQ); +++ condition2 = 
_mm256_cmp_ps( +++ _mm256_cmp_ps( +++ _mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes, _CMP_NEQ_UQ), +++ _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS), +++ _CMP_NEQ_UQ); +++ // Need this condition only for cos +++ // condition3 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, +++ // twos), fours)), fzeroes); +++ +++ sine = +++ _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(cosine, sine), condition1)); +++ sine = _mm256_sub_ps( +++ sine, _mm256_and_ps(_mm256_mul_ps(sine, _mm256_set1_ps(2.0f)), condition2)); +++ _mm256_storeu_ps(bPtr, sine); +++ aPtr += 8; +++ bPtr += 8; +++ } +++ +++ number = eighthPoints * 8; +++ for (; number < num_points; number++) { +++ *bPtr++ = sin(*aPtr++); ++ } ++- s = _mm256_div_ps(s, ftwos); ++- ++- sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s)); ++- cosine = _mm256_sub_ps(fones, s); ++- ++- condition1 = _mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)), fzeroes, _CMP_NEQ_UQ); ++- condition2 = _mm256_cmp_ps(_mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes, _CMP_NEQ_UQ), _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS), _CMP_NEQ_UQ); ++- // Need this condition only for cos ++- //condition3 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, twos), fours)), fzeroes); ++- ++- sine = _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(cosine, sine), condition1)); ++- sine = _mm256_sub_ps(sine, _mm256_and_ps(_mm256_mul_ps(sine, _mm256_set1_ps(2.0f)), condition2)); ++- _mm256_storeu_ps(bPtr, sine); ++- aPtr += 8; ++- bPtr += 8; ++- } ++- ++- number = eighthPoints * 8; ++- for(;number < num_points; number++) { ++- *bPtr++ = sin(*aPtr++); ++- } ++ } ++ ++ #endif /* LV_HAVE_AVX2 for unaligned */ ++@@ -470,70 +587,88 @@ volk_32f_sin_32f_u_avx2(float* bVector, const float* aVector, unsigned int num_p ++ static inline void ++ volk_32f_sin_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int num_points) ++ { ++- float* bPtr = bVector; ++- const float* aPtr = aVector; ++- ++- unsigned int number = 0; ++- unsigned int quarterPoints = num_points / 4; ++- unsigned int i = 0; ++- ++- __m128 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, fzeroes; ++- __m128 sine, cosine, condition1, condition2; ++- __m128i q, r, ones, twos, fours; ++- ++- m4pi = _mm_set1_ps(1.273239545); ++- pio4A = _mm_set1_ps(0.78515625); ++- pio4B = _mm_set1_ps(0.241876e-3); ++- ffours = _mm_set1_ps(4.0); ++- ftwos = _mm_set1_ps(2.0); ++- fones = _mm_set1_ps(1.0); ++- fzeroes = _mm_setzero_ps(); ++- ones = _mm_set1_epi32(1); ++- twos = _mm_set1_epi32(2); ++- fours = _mm_set1_epi32(4); ++- ++- cp1 = _mm_set1_ps(1.0); ++- cp2 = _mm_set1_ps(0.83333333e-1); ++- cp3 = _mm_set1_ps(0.2777778e-2); ++- cp4 = _mm_set1_ps(0.49603e-4); ++- cp5 = _mm_set1_ps(0.551e-6); ++- ++- for(;number < quarterPoints; number++) { ++- aVal = _mm_loadu_ps(aPtr); ++- s = _mm_sub_ps(aVal, _mm_and_ps(_mm_mul_ps(aVal, ftwos), _mm_cmplt_ps(aVal, fzeroes))); ++- q = _mm_cvtps_epi32(_mm_floor_ps(_mm_mul_ps(s, m4pi))); ++- r = _mm_add_epi32(q, _mm_and_si128(q, ones)); ++- ++- s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4A)); ++- s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4B)); ++- ++- s = _mm_div_ps(s, _mm_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction ++- s = _mm_mul_ps(s, s); ++- // Evaluate Taylor series ++- s = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s, cp5), cp4), s), cp3), s), cp2), s), cp1), s); ++- ++- for(i = 0; i 
< 3; i++) { ++- s = _mm_mul_ps(s, _mm_sub_ps(ffours, s)); ++- } ++- s = _mm_div_ps(s, ftwos); ++- ++- sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s)); ++- cosine = _mm_sub_ps(fones, s); ++- ++- condition1 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, ones), twos)), fzeroes); ++- condition2 = _mm_cmpneq_ps(_mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(q, fours)), fzeroes), _mm_cmplt_ps(aVal, fzeroes)); +++ float* bPtr = bVector; +++ const float* aPtr = aVector; ++ ++- sine = _mm_add_ps(sine, _mm_and_ps(_mm_sub_ps(cosine, sine), condition1)); ++- sine = _mm_sub_ps(sine, _mm_and_ps(_mm_mul_ps(sine, _mm_set1_ps(2.0f)), condition2)); ++- _mm_storeu_ps(bPtr, sine); ++- aPtr += 4; ++- bPtr += 4; ++- } +++ unsigned int number = 0; +++ unsigned int quarterPoints = num_points / 4; +++ unsigned int i = 0; +++ +++ __m128 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, +++ fzeroes; +++ __m128 sine, cosine, condition1, condition2; +++ __m128i q, r, ones, twos, fours; +++ +++ m4pi = _mm_set1_ps(1.273239545); +++ pio4A = _mm_set1_ps(0.78515625); +++ pio4B = _mm_set1_ps(0.241876e-3); +++ ffours = _mm_set1_ps(4.0); +++ ftwos = _mm_set1_ps(2.0); +++ fones = _mm_set1_ps(1.0); +++ fzeroes = _mm_setzero_ps(); +++ ones = _mm_set1_epi32(1); +++ twos = _mm_set1_epi32(2); +++ fours = _mm_set1_epi32(4); +++ +++ cp1 = _mm_set1_ps(1.0); +++ cp2 = _mm_set1_ps(0.83333333e-1); +++ cp3 = _mm_set1_ps(0.2777778e-2); +++ cp4 = _mm_set1_ps(0.49603e-4); +++ cp5 = _mm_set1_ps(0.551e-6); +++ +++ for (; number < quarterPoints; number++) { +++ aVal = _mm_loadu_ps(aPtr); +++ s = _mm_sub_ps(aVal, +++ _mm_and_ps(_mm_mul_ps(aVal, ftwos), _mm_cmplt_ps(aVal, fzeroes))); +++ q = _mm_cvtps_epi32(_mm_floor_ps(_mm_mul_ps(s, m4pi))); +++ r = _mm_add_epi32(q, _mm_and_si128(q, ones)); +++ +++ s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4A)); +++ s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4B)); +++ +++ s = _mm_div_ps( +++ s, _mm_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction +++ s = _mm_mul_ps(s, s); +++ // Evaluate Taylor series +++ s = _mm_mul_ps( +++ _mm_add_ps( +++ _mm_mul_ps( +++ _mm_sub_ps( +++ _mm_mul_ps( +++ _mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s, cp5), cp4), s), +++ cp3), +++ s), +++ cp2), +++ s), +++ cp1), +++ s); +++ +++ for (i = 0; i < 3; i++) { +++ s = _mm_mul_ps(s, _mm_sub_ps(ffours, s)); +++ } +++ s = _mm_div_ps(s, ftwos); +++ +++ sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s)); +++ cosine = _mm_sub_ps(fones, s); +++ +++ condition1 = _mm_cmpneq_ps( +++ _mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, ones), twos)), fzeroes); +++ condition2 = _mm_cmpneq_ps( +++ _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(q, fours)), fzeroes), +++ _mm_cmplt_ps(aVal, fzeroes)); +++ +++ sine = _mm_add_ps(sine, _mm_and_ps(_mm_sub_ps(cosine, sine), condition1)); +++ sine = +++ _mm_sub_ps(sine, _mm_and_ps(_mm_mul_ps(sine, _mm_set1_ps(2.0f)), condition2)); +++ _mm_storeu_ps(bPtr, sine); +++ aPtr += 4; +++ bPtr += 4; +++ } ++ ++- number = quarterPoints * 4; ++- for(;number < num_points; number++){ ++- *bPtr++ = sinf(*aPtr++); ++- } +++ number = quarterPoints * 4; +++ for (; number < num_points; number++) { +++ *bPtr++ = sinf(*aPtr++); +++ } ++ } ++ ++ #endif /* LV_HAVE_SSE4_1 for unaligned */ ++@@ -544,14 +679,13 @@ volk_32f_sin_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int num ++ static inline void ++ volk_32f_sin_32f_generic(float* bVector, const float* aVector, unsigned int num_points) ++ { ++- float* bPtr = bVector; ++- const 
float* aPtr = aVector; ++- unsigned int number = 0; ++- ++- for(number = 0; number < num_points; number++) { ++- *bPtr++ = sinf(*aPtr++); ++- } +++ float* bPtr = bVector; +++ const float* aPtr = aVector; +++ unsigned int number = 0; ++ +++ for (number = 0; number < num_points; number++) { +++ *bPtr++ = sinf(*aPtr++); +++ } ++ } ++ ++ #endif /* LV_HAVE_GENERIC */ ++@@ -562,30 +696,29 @@ volk_32f_sin_32f_generic(float* bVector, const float* aVector, unsigned int num_ ++ #include ++ ++ static inline void ++-volk_32f_sin_32f_neon(float* bVector, const float* aVector, ++- unsigned int num_points) +++volk_32f_sin_32f_neon(float* bVector, const float* aVector, unsigned int num_points) ++ { ++ unsigned int number = 0; ++ unsigned int quarter_points = num_points / 4; ++ float* bVectorPtr = bVector; ++ const float* aVectorPtr = aVector; ++- +++ ++ float32x4_t b_vec; ++ float32x4_t a_vec; ++- ++- for(number = 0; number < quarter_points; number++) { +++ +++ for (number = 0; number < quarter_points; number++) { ++ a_vec = vld1q_f32(aVectorPtr); ++ // Prefetch next one, speeds things up ++- __VOLK_PREFETCH(aVectorPtr+4); +++ __VOLK_PREFETCH(aVectorPtr + 4); ++ b_vec = _vsinq_f32(a_vec); ++ vst1q_f32(bVectorPtr, b_vec); ++ // move pointers ahead ++- bVectorPtr+=4; ++- aVectorPtr+=4; +++ bVectorPtr += 4; +++ aVectorPtr += 4; ++ } ++- +++ ++ // Deal with the rest ++- for(number = quarter_points * 4; number < num_points; number++) { +++ for (number = quarter_points * 4; number < num_points; number++) { ++ *bVectorPtr++ = sinf(*aVectorPtr++); ++ } ++ } ++diff --git a/kernels/volk/volk_32f_sqrt_32f.h b/kernels/volk/volk_32f_sqrt_32f.h ++index 84160af..667d356 100644 ++--- a/kernels/volk/volk_32f_sqrt_32f.h +++++ b/kernels/volk/volk_32f_sqrt_32f.h ++@@ -66,8 +66,8 @@ ++ #define INCLUDED_volk_32f_sqrt_32f_a_H ++ ++ #include ++-#include ++ #include +++#include ++ ++ #ifdef LV_HAVE_SSE ++ #include ++@@ -75,28 +75,28 @@ ++ static inline void ++ volk_32f_sqrt_32f_a_sse(float* cVector, const float* aVector, unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int quarterPoints = num_points / 4; +++ unsigned int number = 0; +++ const unsigned int quarterPoints = num_points / 4; ++ ++- float* cPtr = cVector; ++- const float* aPtr = aVector; +++ float* cPtr = cVector; +++ const float* aPtr = aVector; ++ ++- __m128 aVal, cVal; ++- for(;number < quarterPoints; number++) { ++- aVal = _mm_load_ps(aPtr); +++ __m128 aVal, cVal; +++ for (; number < quarterPoints; number++) { +++ aVal = _mm_load_ps(aPtr); ++ ++- cVal = _mm_sqrt_ps(aVal); +++ cVal = _mm_sqrt_ps(aVal); ++ ++- _mm_store_ps(cPtr,cVal); // Store the results back into the C container +++ _mm_store_ps(cPtr, cVal); // Store the results back into the C container ++ ++- aPtr += 4; ++- cPtr += 4; ++- } +++ aPtr += 4; +++ cPtr += 4; +++ } ++ ++- number = quarterPoints * 4; ++- for(;number < num_points; number++) { ++- *cPtr++ = sqrtf(*aPtr++); ++- } +++ number = quarterPoints * 4; +++ for (; number < num_points; number++) { +++ *cPtr++ = sqrtf(*aPtr++); +++ } ++ } ++ ++ #endif /* LV_HAVE_SSE */ ++@@ -107,28 +107,28 @@ volk_32f_sqrt_32f_a_sse(float* cVector, const float* aVector, unsigned int num_p ++ static inline void ++ volk_32f_sqrt_32f_a_avx(float* cVector, const float* aVector, unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int eighthPoints = num_points / 8; +++ unsigned int number = 0; +++ const unsigned int eighthPoints = num_points / 8; ++ ++- float* cPtr = cVector; ++- const float* aPtr = aVector; +++ float* 
cPtr = cVector; +++ const float* aPtr = aVector; ++ ++- __m256 aVal, cVal; ++- for(;number < eighthPoints; number++) { ++- aVal = _mm256_load_ps(aPtr); +++ __m256 aVal, cVal; +++ for (; number < eighthPoints; number++) { +++ aVal = _mm256_load_ps(aPtr); ++ ++- cVal = _mm256_sqrt_ps(aVal); +++ cVal = _mm256_sqrt_ps(aVal); ++ ++- _mm256_store_ps(cPtr,cVal); // Store the results back into the C container +++ _mm256_store_ps(cPtr, cVal); // Store the results back into the C container ++ ++- aPtr += 8; ++- cPtr += 8; ++- } +++ aPtr += 8; +++ cPtr += 8; +++ } ++ ++- number = eighthPoints * 8; ++- for(;number < num_points; number++) { ++- *cPtr++ = sqrtf(*aPtr++); ++- } +++ number = eighthPoints * 8; +++ for (; number < num_points; number++) { +++ *cPtr++ = sqrtf(*aPtr++); +++ } ++ } ++ ++ #endif /* LV_HAVE_AVX */ ++@@ -140,24 +140,24 @@ volk_32f_sqrt_32f_a_avx(float* cVector, const float* aVector, unsigned int num_p ++ static inline void ++ volk_32f_sqrt_32f_neon(float* cVector, const float* aVector, unsigned int num_points) ++ { ++- float* cPtr = cVector; ++- const float* aPtr = aVector; ++- unsigned int number = 0; ++- unsigned int quarter_points = num_points / 4; ++- float32x4_t in_vec, out_vec; ++- ++- for(number = 0; number < quarter_points; number++) { ++- in_vec = vld1q_f32(aPtr); ++- // note that armv8 has vsqrt_f32 which will be much better ++- out_vec = vrecpeq_f32(vrsqrteq_f32(in_vec) ); ++- vst1q_f32(cPtr, out_vec); ++- aPtr += 4; ++- cPtr += 4; ++- } ++- ++- for(number = quarter_points * 4; number < num_points; number++) { ++- *cPtr++ = sqrtf(*aPtr++); ++- } +++ float* cPtr = cVector; +++ const float* aPtr = aVector; +++ unsigned int number = 0; +++ unsigned int quarter_points = num_points / 4; +++ float32x4_t in_vec, out_vec; +++ +++ for (number = 0; number < quarter_points; number++) { +++ in_vec = vld1q_f32(aPtr); +++ // note that armv8 has vsqrt_f32 which will be much better +++ out_vec = vrecpeq_f32(vrsqrteq_f32(in_vec)); +++ vst1q_f32(cPtr, out_vec); +++ aPtr += 4; +++ cPtr += 4; +++ } +++ +++ for (number = quarter_points * 4; number < num_points; number++) { +++ *cPtr++ = sqrtf(*aPtr++); +++ } ++ } ++ ++ #endif /* LV_HAVE_NEON */ ++@@ -168,13 +168,13 @@ volk_32f_sqrt_32f_neon(float* cVector, const float* aVector, unsigned int num_po ++ static inline void ++ volk_32f_sqrt_32f_generic(float* cVector, const float* aVector, unsigned int num_points) ++ { ++- float* cPtr = cVector; ++- const float* aPtr = aVector; ++- unsigned int number = 0; +++ float* cPtr = cVector; +++ const float* aPtr = aVector; +++ unsigned int number = 0; ++ ++- for(number = 0; number < num_points; number++) { ++- *cPtr++ = sqrtf(*aPtr++); ++- } +++ for (number = 0; number < num_points; number++) { +++ *cPtr++ = sqrtf(*aPtr++); +++ } ++ } ++ ++ #endif /* LV_HAVE_GENERIC */ ++@@ -182,13 +182,12 @@ volk_32f_sqrt_32f_generic(float* cVector, const float* aVector, unsigned int num ++ ++ #ifdef LV_HAVE_ORC ++ ++-extern void ++-volk_32f_sqrt_32f_a_orc_impl(float *, const float*, unsigned int); +++extern void volk_32f_sqrt_32f_a_orc_impl(float*, const float*, unsigned int); ++ ++ static inline void ++ volk_32f_sqrt_32f_u_orc(float* cVector, const float* aVector, unsigned int num_points) ++ { ++- volk_32f_sqrt_32f_a_orc_impl(cVector, aVector, num_points); +++ volk_32f_sqrt_32f_a_orc_impl(cVector, aVector, num_points); ++ } ++ ++ #endif /* LV_HAVE_ORC */ ++@@ -199,36 +198,36 @@ volk_32f_sqrt_32f_u_orc(float* cVector, const float* aVector, unsigned int num_p ++ #define INCLUDED_volk_32f_sqrt_32f_u_H ++ ++ #include 
++-#include ++ #include +++#include ++ #ifdef LV_HAVE_AVX ++ #include ++ ++ static inline void ++ volk_32f_sqrt_32f_u_avx(float* cVector, const float* aVector, unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int eighthPoints = num_points / 8; +++ unsigned int number = 0; +++ const unsigned int eighthPoints = num_points / 8; ++ ++- float* cPtr = cVector; ++- const float* aPtr = aVector; +++ float* cPtr = cVector; +++ const float* aPtr = aVector; ++ ++- __m256 aVal, cVal; ++- for(;number < eighthPoints; number++) { ++- aVal = _mm256_loadu_ps(aPtr); +++ __m256 aVal, cVal; +++ for (; number < eighthPoints; number++) { +++ aVal = _mm256_loadu_ps(aPtr); ++ ++- cVal = _mm256_sqrt_ps(aVal); +++ cVal = _mm256_sqrt_ps(aVal); ++ ++- _mm256_storeu_ps(cPtr,cVal); // Store the results back into the C container +++ _mm256_storeu_ps(cPtr, cVal); // Store the results back into the C container ++ ++- aPtr += 8; ++- cPtr += 8; ++- } +++ aPtr += 8; +++ cPtr += 8; +++ } ++ ++- number = eighthPoints * 8; ++- for(;number < num_points; number++) { ++- *cPtr++ = sqrtf(*aPtr++); ++- } +++ number = eighthPoints * 8; +++ for (; number < num_points; number++) { +++ *cPtr++ = sqrtf(*aPtr++); +++ } ++ } ++ ++ #endif /* LV_HAVE_AVX */ ++diff --git a/kernels/volk/volk_32f_stddev_and_mean_32f_x2.h b/kernels/volk/volk_32f_stddev_and_mean_32f_x2.h ++index 8e996e2..6ad0f17 100644 ++--- a/kernels/volk/volk_32f_stddev_and_mean_32f_x2.h +++++ b/kernels/volk/volk_32f_stddev_and_mean_32f_x2.h ++@@ -29,8 +29,8 @@ ++ * ++ * Dispatcher Prototype ++ * \code ++- * void volk_32f_stddev_and_mean_32f_x2(float* stddev, float* mean, const float* inputBuffer, unsigned int num_points) ++- * \endcode +++ * void volk_32f_stddev_and_mean_32f_x2(float* stddev, float* mean, const float* +++ * inputBuffer, unsigned int num_points) \endcode ++ * ++ * \b Inputs ++ * \li inputBuffer: The buffer of points. ++@@ -41,10 +41,8 @@ ++ * \li mean: The mean of the input buffer. 
++ * ++ * \b Example ++- * Generate random numbers with c++11's normal distribution and estimate the mean and standard deviation ++- * \code ++- * int N = 1000; ++- * unsigned int alignment = volk_get_alignment(); +++ * Generate random numbers with c++11's normal distribution and estimate the mean and +++ * standard deviation \code int N = 1000; unsigned int alignment = volk_get_alignment(); ++ * float* rand_numbers = (float*)volk_malloc(sizeof(float)*N, alignment); ++ * float* mean = (float*)volk_malloc(sizeof(float), alignment); ++ * float* stddev = (float*)volk_malloc(sizeof(float), alignment); ++@@ -71,88 +69,94 @@ ++ #ifndef INCLUDED_volk_32f_stddev_and_mean_32f_x2_a_H ++ #define INCLUDED_volk_32f_stddev_and_mean_32f_x2_a_H ++ ++-#include ++ #include ++-#include ++ #include +++#include +++#include ++ ++ #ifdef LV_HAVE_AVX ++ #include ++ ++-static inline void ++-volk_32f_stddev_and_mean_32f_x2_a_avx(float* stddev, float* mean, ++- const float* inputBuffer, ++- unsigned int num_points) +++static inline void volk_32f_stddev_and_mean_32f_x2_a_avx(float* stddev, +++ float* mean, +++ const float* inputBuffer, +++ unsigned int num_points) ++ { ++- float stdDev = 0; ++- float newMean = 0; ++- if(num_points > 0){ ++- unsigned int number = 0; ++- const unsigned int thirtySecondthPoints = num_points / 32; ++- ++- const float* aPtr = inputBuffer; ++- __VOLK_ATTR_ALIGNED(32) float meanBuffer[8]; ++- __VOLK_ATTR_ALIGNED(32) float squareBuffer[8]; ++- ++- __m256 accumulator = _mm256_setzero_ps(); ++- __m256 squareAccumulator = _mm256_setzero_ps(); ++- __m256 aVal1, aVal2, aVal3, aVal4; ++- __m256 cVal1, cVal2, cVal3, cVal4; ++- for(;number < thirtySecondthPoints; number++) { ++- aVal1 = _mm256_load_ps(aPtr); aPtr += 8; ++- cVal1 = _mm256_dp_ps(aVal1, aVal1, 0xF1); ++- accumulator = _mm256_add_ps(accumulator, aVal1); // accumulator += x ++- ++- aVal2 = _mm256_load_ps(aPtr); aPtr += 8; ++- cVal2 = _mm256_dp_ps(aVal2, aVal2, 0xF2); ++- accumulator = _mm256_add_ps(accumulator, aVal2); // accumulator += x ++- ++- aVal3 = _mm256_load_ps(aPtr); aPtr += 8; ++- cVal3 = _mm256_dp_ps(aVal3, aVal3, 0xF4); ++- accumulator = _mm256_add_ps(accumulator, aVal3); // accumulator += x ++- ++- aVal4 = _mm256_load_ps(aPtr); aPtr += 8; ++- cVal4 = _mm256_dp_ps(aVal4, aVal4, 0xF8); ++- accumulator = _mm256_add_ps(accumulator, aVal4); // accumulator += x ++- ++- cVal1 = _mm256_or_ps(cVal1, cVal2); ++- cVal3 = _mm256_or_ps(cVal3, cVal4); ++- cVal1 = _mm256_or_ps(cVal1, cVal3); ++- ++- squareAccumulator = _mm256_add_ps(squareAccumulator, cVal1); // squareAccumulator += x^2 ++- } ++- _mm256_store_ps(meanBuffer,accumulator); // Store the results back into the C container ++- _mm256_store_ps(squareBuffer,squareAccumulator); // Store the results back into the C container ++- newMean = meanBuffer[0]; ++- newMean += meanBuffer[1]; ++- newMean += meanBuffer[2]; ++- newMean += meanBuffer[3]; ++- newMean += meanBuffer[4]; ++- newMean += meanBuffer[5]; ++- newMean += meanBuffer[6]; ++- newMean += meanBuffer[7]; ++- stdDev = squareBuffer[0]; ++- stdDev += squareBuffer[1]; ++- stdDev += squareBuffer[2]; ++- stdDev += squareBuffer[3]; ++- stdDev += squareBuffer[4]; ++- stdDev += squareBuffer[5]; ++- stdDev += squareBuffer[6]; ++- stdDev += squareBuffer[7]; ++- ++- number = thirtySecondthPoints * 32; ++- for(;number < num_points; number++){ ++- stdDev += (*aPtr) * (*aPtr); ++- newMean += *aPtr++; +++ float stdDev = 0; +++ float newMean = 0; +++ if (num_points > 0) { +++ unsigned int number = 0; +++ const unsigned int thirtySecondthPoints = 
num_points / 32; +++ +++ const float* aPtr = inputBuffer; +++ __VOLK_ATTR_ALIGNED(32) float meanBuffer[8]; +++ __VOLK_ATTR_ALIGNED(32) float squareBuffer[8]; +++ +++ __m256 accumulator = _mm256_setzero_ps(); +++ __m256 squareAccumulator = _mm256_setzero_ps(); +++ __m256 aVal1, aVal2, aVal3, aVal4; +++ __m256 cVal1, cVal2, cVal3, cVal4; +++ for (; number < thirtySecondthPoints; number++) { +++ aVal1 = _mm256_load_ps(aPtr); +++ aPtr += 8; +++ cVal1 = _mm256_dp_ps(aVal1, aVal1, 0xF1); +++ accumulator = _mm256_add_ps(accumulator, aVal1); // accumulator += x +++ +++ aVal2 = _mm256_load_ps(aPtr); +++ aPtr += 8; +++ cVal2 = _mm256_dp_ps(aVal2, aVal2, 0xF2); +++ accumulator = _mm256_add_ps(accumulator, aVal2); // accumulator += x +++ +++ aVal3 = _mm256_load_ps(aPtr); +++ aPtr += 8; +++ cVal3 = _mm256_dp_ps(aVal3, aVal3, 0xF4); +++ accumulator = _mm256_add_ps(accumulator, aVal3); // accumulator += x +++ +++ aVal4 = _mm256_load_ps(aPtr); +++ aPtr += 8; +++ cVal4 = _mm256_dp_ps(aVal4, aVal4, 0xF8); +++ accumulator = _mm256_add_ps(accumulator, aVal4); // accumulator += x +++ +++ cVal1 = _mm256_or_ps(cVal1, cVal2); +++ cVal3 = _mm256_or_ps(cVal3, cVal4); +++ cVal1 = _mm256_or_ps(cVal1, cVal3); +++ +++ squareAccumulator = +++ _mm256_add_ps(squareAccumulator, cVal1); // squareAccumulator += x^2 +++ } +++ _mm256_store_ps(meanBuffer, +++ accumulator); // Store the results back into the C container +++ _mm256_store_ps(squareBuffer, +++ squareAccumulator); // Store the results back into the C container +++ newMean = meanBuffer[0]; +++ newMean += meanBuffer[1]; +++ newMean += meanBuffer[2]; +++ newMean += meanBuffer[3]; +++ newMean += meanBuffer[4]; +++ newMean += meanBuffer[5]; +++ newMean += meanBuffer[6]; +++ newMean += meanBuffer[7]; +++ stdDev = squareBuffer[0]; +++ stdDev += squareBuffer[1]; +++ stdDev += squareBuffer[2]; +++ stdDev += squareBuffer[3]; +++ stdDev += squareBuffer[4]; +++ stdDev += squareBuffer[5]; +++ stdDev += squareBuffer[6]; +++ stdDev += squareBuffer[7]; +++ +++ number = thirtySecondthPoints * 32; +++ for (; number < num_points; number++) { +++ stdDev += (*aPtr) * (*aPtr); +++ newMean += *aPtr++; +++ } +++ newMean /= num_points; +++ stdDev /= num_points; +++ stdDev -= (newMean * newMean); +++ stdDev = sqrtf(stdDev); ++ } ++- newMean /= num_points; ++- stdDev /= num_points; ++- stdDev -= (newMean * newMean); ++- stdDev = sqrtf(stdDev); ++- } ++- *stddev = stdDev; ++- *mean = newMean; ++- +++ *stddev = stdDev; +++ *mean = newMean; ++ } ++ #endif /* LV_HAVE_AVX */ ++ ++@@ -160,151 +164,164 @@ volk_32f_stddev_and_mean_32f_x2_a_avx(float* stddev, float* mean, ++ #ifdef LV_HAVE_AVX ++ #include ++ ++-static inline void ++-volk_32f_stddev_and_mean_32f_x2_u_avx(float* stddev, float* mean, ++- const float* inputBuffer, ++- unsigned int num_points) +++static inline void volk_32f_stddev_and_mean_32f_x2_u_avx(float* stddev, +++ float* mean, +++ const float* inputBuffer, +++ unsigned int num_points) ++ { ++- float stdDev = 0; ++- float newMean = 0; ++- if(num_points > 0){ ++- unsigned int number = 0; ++- const unsigned int thirtySecondthPoints = num_points / 32; ++- ++- const float* aPtr = inputBuffer; ++- __VOLK_ATTR_ALIGNED(32) float meanBuffer[8]; ++- __VOLK_ATTR_ALIGNED(32) float squareBuffer[8]; ++- ++- __m256 accumulator = _mm256_setzero_ps(); ++- __m256 squareAccumulator = _mm256_setzero_ps(); ++- __m256 aVal1, aVal2, aVal3, aVal4; ++- __m256 cVal1, cVal2, cVal3, cVal4; ++- for(;number < thirtySecondthPoints; number++) { ++- aVal1 = _mm256_loadu_ps(aPtr); aPtr += 8; ++- cVal1 = 
_mm256_dp_ps(aVal1, aVal1, 0xF1); ++- accumulator = _mm256_add_ps(accumulator, aVal1); // accumulator += x ++- ++- aVal2 = _mm256_loadu_ps(aPtr); aPtr += 8; ++- cVal2 = _mm256_dp_ps(aVal2, aVal2, 0xF2); ++- accumulator = _mm256_add_ps(accumulator, aVal2); // accumulator += x ++- ++- aVal3 = _mm256_loadu_ps(aPtr); aPtr += 8; ++- cVal3 = _mm256_dp_ps(aVal3, aVal3, 0xF4); ++- accumulator = _mm256_add_ps(accumulator, aVal3); // accumulator += x ++- ++- aVal4 = _mm256_loadu_ps(aPtr); aPtr += 8; ++- cVal4 = _mm256_dp_ps(aVal4, aVal4, 0xF8); ++- accumulator = _mm256_add_ps(accumulator, aVal4); // accumulator += x ++- ++- cVal1 = _mm256_or_ps(cVal1, cVal2); ++- cVal3 = _mm256_or_ps(cVal3, cVal4); ++- cVal1 = _mm256_or_ps(cVal1, cVal3); ++- ++- squareAccumulator = _mm256_add_ps(squareAccumulator, cVal1); // squareAccumulator += x^2 ++- } ++- _mm256_store_ps(meanBuffer,accumulator); // Store the results back into the C container ++- _mm256_store_ps(squareBuffer,squareAccumulator); // Store the results back into the C container ++- newMean = meanBuffer[0]; ++- newMean += meanBuffer[1]; ++- newMean += meanBuffer[2]; ++- newMean += meanBuffer[3]; ++- newMean += meanBuffer[4]; ++- newMean += meanBuffer[5]; ++- newMean += meanBuffer[6]; ++- newMean += meanBuffer[7]; ++- stdDev = squareBuffer[0]; ++- stdDev += squareBuffer[1]; ++- stdDev += squareBuffer[2]; ++- stdDev += squareBuffer[3]; ++- stdDev += squareBuffer[4]; ++- stdDev += squareBuffer[5]; ++- stdDev += squareBuffer[6]; ++- stdDev += squareBuffer[7]; ++- ++- number = thirtySecondthPoints * 32; ++- for(;number < num_points; number++){ ++- stdDev += (*aPtr) * (*aPtr); ++- newMean += *aPtr++; +++ float stdDev = 0; +++ float newMean = 0; +++ if (num_points > 0) { +++ unsigned int number = 0; +++ const unsigned int thirtySecondthPoints = num_points / 32; +++ +++ const float* aPtr = inputBuffer; +++ __VOLK_ATTR_ALIGNED(32) float meanBuffer[8]; +++ __VOLK_ATTR_ALIGNED(32) float squareBuffer[8]; +++ +++ __m256 accumulator = _mm256_setzero_ps(); +++ __m256 squareAccumulator = _mm256_setzero_ps(); +++ __m256 aVal1, aVal2, aVal3, aVal4; +++ __m256 cVal1, cVal2, cVal3, cVal4; +++ for (; number < thirtySecondthPoints; number++) { +++ aVal1 = _mm256_loadu_ps(aPtr); +++ aPtr += 8; +++ cVal1 = _mm256_dp_ps(aVal1, aVal1, 0xF1); +++ accumulator = _mm256_add_ps(accumulator, aVal1); // accumulator += x +++ +++ aVal2 = _mm256_loadu_ps(aPtr); +++ aPtr += 8; +++ cVal2 = _mm256_dp_ps(aVal2, aVal2, 0xF2); +++ accumulator = _mm256_add_ps(accumulator, aVal2); // accumulator += x +++ +++ aVal3 = _mm256_loadu_ps(aPtr); +++ aPtr += 8; +++ cVal3 = _mm256_dp_ps(aVal3, aVal3, 0xF4); +++ accumulator = _mm256_add_ps(accumulator, aVal3); // accumulator += x +++ +++ aVal4 = _mm256_loadu_ps(aPtr); +++ aPtr += 8; +++ cVal4 = _mm256_dp_ps(aVal4, aVal4, 0xF8); +++ accumulator = _mm256_add_ps(accumulator, aVal4); // accumulator += x +++ +++ cVal1 = _mm256_or_ps(cVal1, cVal2); +++ cVal3 = _mm256_or_ps(cVal3, cVal4); +++ cVal1 = _mm256_or_ps(cVal1, cVal3); +++ +++ squareAccumulator = +++ _mm256_add_ps(squareAccumulator, cVal1); // squareAccumulator += x^2 +++ } +++ _mm256_store_ps(meanBuffer, +++ accumulator); // Store the results back into the C container +++ _mm256_store_ps(squareBuffer, +++ squareAccumulator); // Store the results back into the C container +++ newMean = meanBuffer[0]; +++ newMean += meanBuffer[1]; +++ newMean += meanBuffer[2]; +++ newMean += meanBuffer[3]; +++ newMean += meanBuffer[4]; +++ newMean += meanBuffer[5]; +++ newMean += meanBuffer[6]; +++ newMean += 
meanBuffer[7]; +++ stdDev = squareBuffer[0]; +++ stdDev += squareBuffer[1]; +++ stdDev += squareBuffer[2]; +++ stdDev += squareBuffer[3]; +++ stdDev += squareBuffer[4]; +++ stdDev += squareBuffer[5]; +++ stdDev += squareBuffer[6]; +++ stdDev += squareBuffer[7]; +++ +++ number = thirtySecondthPoints * 32; +++ for (; number < num_points; number++) { +++ stdDev += (*aPtr) * (*aPtr); +++ newMean += *aPtr++; +++ } +++ newMean /= num_points; +++ stdDev /= num_points; +++ stdDev -= (newMean * newMean); +++ stdDev = sqrtf(stdDev); ++ } ++- newMean /= num_points; ++- stdDev /= num_points; ++- stdDev -= (newMean * newMean); ++- stdDev = sqrtf(stdDev); ++- } ++- *stddev = stdDev; ++- *mean = newMean; ++- +++ *stddev = stdDev; +++ *mean = newMean; ++ } ++ #endif /* LV_HAVE_AVX */ ++ ++ ++ #ifdef LV_HAVE_SSE4_1 ++ #include ++-static inline void ++-volk_32f_stddev_and_mean_32f_x2_a_sse4_1(float* stddev, float* mean, ++- const float* inputBuffer, ++- unsigned int num_points) +++static inline void volk_32f_stddev_and_mean_32f_x2_a_sse4_1(float* stddev, +++ float* mean, +++ const float* inputBuffer, +++ unsigned int num_points) ++ { ++- float returnValue = 0; ++- float newMean = 0; ++- if(num_points > 0){ ++- unsigned int number = 0; ++- const unsigned int sixteenthPoints = num_points / 16; ++- ++- const float* aPtr = inputBuffer; ++- __VOLK_ATTR_ALIGNED(16) float meanBuffer[4]; ++- __VOLK_ATTR_ALIGNED(16) float squareBuffer[4]; ++- ++- __m128 accumulator = _mm_setzero_ps(); ++- __m128 squareAccumulator = _mm_setzero_ps(); ++- __m128 aVal1, aVal2, aVal3, aVal4; ++- __m128 cVal1, cVal2, cVal3, cVal4; ++- for(;number < sixteenthPoints; number++) { ++- aVal1 = _mm_load_ps(aPtr); aPtr += 4; ++- cVal1 = _mm_dp_ps(aVal1, aVal1, 0xF1); ++- accumulator = _mm_add_ps(accumulator, aVal1); // accumulator += x ++- ++- aVal2 = _mm_load_ps(aPtr); aPtr += 4; ++- cVal2 = _mm_dp_ps(aVal2, aVal2, 0xF2); ++- accumulator = _mm_add_ps(accumulator, aVal2); // accumulator += x ++- ++- aVal3 = _mm_load_ps(aPtr); aPtr += 4; ++- cVal3 = _mm_dp_ps(aVal3, aVal3, 0xF4); ++- accumulator = _mm_add_ps(accumulator, aVal3); // accumulator += x ++- ++- aVal4 = _mm_load_ps(aPtr); aPtr += 4; ++- cVal4 = _mm_dp_ps(aVal4, aVal4, 0xF8); ++- accumulator = _mm_add_ps(accumulator, aVal4); // accumulator += x ++- ++- cVal1 = _mm_or_ps(cVal1, cVal2); ++- cVal3 = _mm_or_ps(cVal3, cVal4); ++- cVal1 = _mm_or_ps(cVal1, cVal3); ++- ++- squareAccumulator = _mm_add_ps(squareAccumulator, cVal1); // squareAccumulator += x^2 ++- } ++- _mm_store_ps(meanBuffer,accumulator); // Store the results back into the C container ++- _mm_store_ps(squareBuffer,squareAccumulator); // Store the results back into the C container ++- newMean = meanBuffer[0]; ++- newMean += meanBuffer[1]; ++- newMean += meanBuffer[2]; ++- newMean += meanBuffer[3]; ++- returnValue = squareBuffer[0]; ++- returnValue += squareBuffer[1]; ++- returnValue += squareBuffer[2]; ++- returnValue += squareBuffer[3]; ++- ++- number = sixteenthPoints * 16; ++- for(;number < num_points; number++){ ++- returnValue += (*aPtr) * (*aPtr); ++- newMean += *aPtr++; +++ float returnValue = 0; +++ float newMean = 0; +++ if (num_points > 0) { +++ unsigned int number = 0; +++ const unsigned int sixteenthPoints = num_points / 16; +++ +++ const float* aPtr = inputBuffer; +++ __VOLK_ATTR_ALIGNED(16) float meanBuffer[4]; +++ __VOLK_ATTR_ALIGNED(16) float squareBuffer[4]; +++ +++ __m128 accumulator = _mm_setzero_ps(); +++ __m128 squareAccumulator = _mm_setzero_ps(); +++ __m128 aVal1, aVal2, aVal3, aVal4; +++ __m128 cVal1, 
cVal2, cVal3, cVal4; +++ for (; number < sixteenthPoints; number++) { +++ aVal1 = _mm_load_ps(aPtr); +++ aPtr += 4; +++ cVal1 = _mm_dp_ps(aVal1, aVal1, 0xF1); +++ accumulator = _mm_add_ps(accumulator, aVal1); // accumulator += x +++ +++ aVal2 = _mm_load_ps(aPtr); +++ aPtr += 4; +++ cVal2 = _mm_dp_ps(aVal2, aVal2, 0xF2); +++ accumulator = _mm_add_ps(accumulator, aVal2); // accumulator += x +++ +++ aVal3 = _mm_load_ps(aPtr); +++ aPtr += 4; +++ cVal3 = _mm_dp_ps(aVal3, aVal3, 0xF4); +++ accumulator = _mm_add_ps(accumulator, aVal3); // accumulator += x +++ +++ aVal4 = _mm_load_ps(aPtr); +++ aPtr += 4; +++ cVal4 = _mm_dp_ps(aVal4, aVal4, 0xF8); +++ accumulator = _mm_add_ps(accumulator, aVal4); // accumulator += x +++ +++ cVal1 = _mm_or_ps(cVal1, cVal2); +++ cVal3 = _mm_or_ps(cVal3, cVal4); +++ cVal1 = _mm_or_ps(cVal1, cVal3); +++ +++ squareAccumulator = +++ _mm_add_ps(squareAccumulator, cVal1); // squareAccumulator += x^2 +++ } +++ _mm_store_ps(meanBuffer, +++ accumulator); // Store the results back into the C container +++ _mm_store_ps(squareBuffer, +++ squareAccumulator); // Store the results back into the C container +++ newMean = meanBuffer[0]; +++ newMean += meanBuffer[1]; +++ newMean += meanBuffer[2]; +++ newMean += meanBuffer[3]; +++ returnValue = squareBuffer[0]; +++ returnValue += squareBuffer[1]; +++ returnValue += squareBuffer[2]; +++ returnValue += squareBuffer[3]; +++ +++ number = sixteenthPoints * 16; +++ for (; number < num_points; number++) { +++ returnValue += (*aPtr) * (*aPtr); +++ newMean += *aPtr++; +++ } +++ newMean /= num_points; +++ returnValue /= num_points; +++ returnValue -= (newMean * newMean); +++ returnValue = sqrtf(returnValue); ++ } ++- newMean /= num_points; ++- returnValue /= num_points; ++- returnValue -= (newMean * newMean); ++- returnValue = sqrtf(returnValue); ++- } ++- *stddev = returnValue; ++- *mean = newMean; +++ *stddev = returnValue; +++ *mean = newMean; ++ } ++ #endif /* LV_HAVE_SSE4_1 */ ++ ++@@ -312,86 +329,86 @@ volk_32f_stddev_and_mean_32f_x2_a_sse4_1(float* stddev, float* mean, ++ #ifdef LV_HAVE_SSE ++ #include ++ ++-static inline void ++-volk_32f_stddev_and_mean_32f_x2_a_sse(float* stddev, float* mean, ++- const float* inputBuffer, ++- unsigned int num_points) +++static inline void volk_32f_stddev_and_mean_32f_x2_a_sse(float* stddev, +++ float* mean, +++ const float* inputBuffer, +++ unsigned int num_points) ++ { ++- float returnValue = 0; ++- float newMean = 0; ++- if(num_points > 0){ ++- unsigned int number = 0; ++- const unsigned int quarterPoints = num_points / 4; ++- ++- const float* aPtr = inputBuffer; ++- __VOLK_ATTR_ALIGNED(16) float meanBuffer[4]; ++- __VOLK_ATTR_ALIGNED(16) float squareBuffer[4]; ++- ++- __m128 accumulator = _mm_setzero_ps(); ++- __m128 squareAccumulator = _mm_setzero_ps(); ++- __m128 aVal = _mm_setzero_ps(); ++- for(;number < quarterPoints; number++) { ++- aVal = _mm_load_ps(aPtr); // aVal = x ++- accumulator = _mm_add_ps(accumulator, aVal); // accumulator += x ++- aVal = _mm_mul_ps(aVal, aVal); // squareAccumulator += x^2 ++- squareAccumulator = _mm_add_ps(squareAccumulator, aVal); ++- aPtr += 4; +++ float returnValue = 0; +++ float newMean = 0; +++ if (num_points > 0) { +++ unsigned int number = 0; +++ const unsigned int quarterPoints = num_points / 4; +++ +++ const float* aPtr = inputBuffer; +++ __VOLK_ATTR_ALIGNED(16) float meanBuffer[4]; +++ __VOLK_ATTR_ALIGNED(16) float squareBuffer[4]; +++ +++ __m128 accumulator = _mm_setzero_ps(); +++ __m128 squareAccumulator = _mm_setzero_ps(); +++ __m128 aVal = 
_mm_setzero_ps(); +++ for (; number < quarterPoints; number++) { +++ aVal = _mm_load_ps(aPtr); // aVal = x +++ accumulator = _mm_add_ps(accumulator, aVal); // accumulator += x +++ aVal = _mm_mul_ps(aVal, aVal); // squareAccumulator += x^2 +++ squareAccumulator = _mm_add_ps(squareAccumulator, aVal); +++ aPtr += 4; +++ } +++ _mm_store_ps(meanBuffer, +++ accumulator); // Store the results back into the C container +++ _mm_store_ps(squareBuffer, +++ squareAccumulator); // Store the results back into the C container +++ newMean = meanBuffer[0]; +++ newMean += meanBuffer[1]; +++ newMean += meanBuffer[2]; +++ newMean += meanBuffer[3]; +++ returnValue = squareBuffer[0]; +++ returnValue += squareBuffer[1]; +++ returnValue += squareBuffer[2]; +++ returnValue += squareBuffer[3]; +++ +++ number = quarterPoints * 4; +++ for (; number < num_points; number++) { +++ returnValue += (*aPtr) * (*aPtr); +++ newMean += *aPtr++; +++ } +++ newMean /= num_points; +++ returnValue /= num_points; +++ returnValue -= (newMean * newMean); +++ returnValue = sqrtf(returnValue); ++ } ++- _mm_store_ps(meanBuffer,accumulator); // Store the results back into the C container ++- _mm_store_ps(squareBuffer,squareAccumulator); // Store the results back into the C container ++- newMean = meanBuffer[0]; ++- newMean += meanBuffer[1]; ++- newMean += meanBuffer[2]; ++- newMean += meanBuffer[3]; ++- returnValue = squareBuffer[0]; ++- returnValue += squareBuffer[1]; ++- returnValue += squareBuffer[2]; ++- returnValue += squareBuffer[3]; ++- ++- number = quarterPoints * 4; ++- for(;number < num_points; number++){ ++- returnValue += (*aPtr) * (*aPtr); ++- newMean += *aPtr++; ++- } ++- newMean /= num_points; ++- returnValue /= num_points; ++- returnValue -= (newMean * newMean); ++- returnValue = sqrtf(returnValue); ++- } ++- *stddev = returnValue; ++- *mean = newMean; +++ *stddev = returnValue; +++ *mean = newMean; ++ } ++ #endif /* LV_HAVE_SSE */ ++ ++ ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void ++-volk_32f_stddev_and_mean_32f_x2_generic(float* stddev, float* mean, ++- const float* inputBuffer, ++- unsigned int num_points) +++static inline void volk_32f_stddev_and_mean_32f_x2_generic(float* stddev, +++ float* mean, +++ const float* inputBuffer, +++ unsigned int num_points) ++ { ++- float returnValue = 0; ++- float newMean = 0; ++- if(num_points > 0){ ++- const float* aPtr = inputBuffer; ++- unsigned int number = 0; ++- ++- for(number = 0; number < num_points; number++){ ++- returnValue += (*aPtr) * (*aPtr); ++- newMean += *aPtr++; +++ float returnValue = 0; +++ float newMean = 0; +++ if (num_points > 0) { +++ const float* aPtr = inputBuffer; +++ unsigned int number = 0; +++ +++ for (number = 0; number < num_points; number++) { +++ returnValue += (*aPtr) * (*aPtr); +++ newMean += *aPtr++; +++ } +++ newMean /= num_points; +++ returnValue /= num_points; +++ returnValue -= (newMean * newMean); +++ returnValue = sqrtf(returnValue); ++ } ++- newMean /= num_points; ++- returnValue /= num_points; ++- returnValue -= (newMean * newMean); ++- returnValue = sqrtf(returnValue); ++- } ++- *stddev = returnValue; ++- *mean = newMean; +++ *stddev = returnValue; +++ *mean = newMean; ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++ ++- ++- ++ #endif /* INCLUDED_volk_32f_stddev_and_mean_32f_x2_a_H */ ++diff --git a/kernels/volk/volk_32f_tan_32f.h b/kernels/volk/volk_32f_tan_32f.h ++index 239b745..a623a66 100644 ++--- a/kernels/volk/volk_32f_tan_32f.h +++++ b/kernels/volk/volk_32f_tan_32f.h ++@@ -71,9 +71,9 @@ ++ * \endcode ++ */ ++ ++-#include ++-#include 
++ #include +++#include +++#include ++ ++ #ifndef INCLUDED_volk_32f_tan_32f_a_H ++ #define INCLUDED_volk_32f_tan_32f_a_H ++@@ -82,78 +82,102 @@ ++ #include ++ ++ static inline void ++-volk_32f_tan_32f_a_avx2_fma(float* bVector, const float* aVector, ++- unsigned int num_points) +++volk_32f_tan_32f_a_avx2_fma(float* bVector, const float* aVector, unsigned int num_points) ++ { ++- float* bPtr = bVector; ++- const float* aPtr = aVector; ++- ++- unsigned int number = 0; ++- unsigned int eighthPoints = num_points / 8; ++- unsigned int i = 0; ++- ++- __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, fzeroes; ++- __m256 sine, cosine, tangent, condition1, condition2, condition3; ++- __m256i q, r, ones, twos, fours; ++- ++- m4pi = _mm256_set1_ps(1.273239545); ++- pio4A = _mm256_set1_ps(0.78515625); ++- pio4B = _mm256_set1_ps(0.241876e-3); ++- ffours = _mm256_set1_ps(4.0); ++- ftwos = _mm256_set1_ps(2.0); ++- fones = _mm256_set1_ps(1.0); ++- fzeroes = _mm256_setzero_ps(); ++- ones = _mm256_set1_epi32(1); ++- twos = _mm256_set1_epi32(2); ++- fours = _mm256_set1_epi32(4); ++- ++- cp1 = _mm256_set1_ps(1.0); ++- cp2 = _mm256_set1_ps(0.83333333e-1); ++- cp3 = _mm256_set1_ps(0.2777778e-2); ++- cp4 = _mm256_set1_ps(0.49603e-4); ++- cp5 = _mm256_set1_ps(0.551e-6); ++- ++- for(;number < eighthPoints; number++){ ++- aVal = _mm256_load_ps(aPtr); ++- s = _mm256_sub_ps(aVal, _mm256_and_ps(_mm256_mul_ps(aVal, ftwos), _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS))); ++- q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi))); ++- r = _mm256_add_epi32(q, _mm256_and_si256(q, ones)); ++- ++- s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4A, s); ++- s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4B, s); ++- ++- s = _mm256_div_ps(s, _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction ++- s = _mm256_mul_ps(s, s); ++- // Evaluate Taylor series ++- s = _mm256_mul_ps(_mm256_fmadd_ps(_mm256_fmsub_ps(_mm256_fmadd_ps(_mm256_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2), s, cp1), s); ++- ++- for(i = 0; i < 3; i++){ ++- s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s)); +++ float* bPtr = bVector; +++ const float* aPtr = aVector; +++ +++ unsigned int number = 0; +++ unsigned int eighthPoints = num_points / 8; +++ unsigned int i = 0; +++ +++ __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, +++ fzeroes; +++ __m256 sine, cosine, tangent, condition1, condition2, condition3; +++ __m256i q, r, ones, twos, fours; +++ +++ m4pi = _mm256_set1_ps(1.273239545); +++ pio4A = _mm256_set1_ps(0.78515625); +++ pio4B = _mm256_set1_ps(0.241876e-3); +++ ffours = _mm256_set1_ps(4.0); +++ ftwos = _mm256_set1_ps(2.0); +++ fones = _mm256_set1_ps(1.0); +++ fzeroes = _mm256_setzero_ps(); +++ ones = _mm256_set1_epi32(1); +++ twos = _mm256_set1_epi32(2); +++ fours = _mm256_set1_epi32(4); +++ +++ cp1 = _mm256_set1_ps(1.0); +++ cp2 = _mm256_set1_ps(0.83333333e-1); +++ cp3 = _mm256_set1_ps(0.2777778e-2); +++ cp4 = _mm256_set1_ps(0.49603e-4); +++ cp5 = _mm256_set1_ps(0.551e-6); +++ +++ for (; number < eighthPoints; number++) { +++ aVal = _mm256_load_ps(aPtr); +++ s = _mm256_sub_ps(aVal, +++ _mm256_and_ps(_mm256_mul_ps(aVal, ftwos), +++ _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS))); +++ q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi))); +++ r = _mm256_add_epi32(q, _mm256_and_si256(q, ones)); +++ +++ s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4A, s); +++ s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4B, s); +++ +++ s = _mm256_div_ps( +++ s, +++ 
_mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction +++ s = _mm256_mul_ps(s, s); +++ // Evaluate Taylor series +++ s = _mm256_mul_ps( +++ _mm256_fmadd_ps( +++ _mm256_fmsub_ps( +++ _mm256_fmadd_ps(_mm256_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2), +++ s, +++ cp1), +++ s); +++ +++ for (i = 0; i < 3; i++) { +++ s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s)); +++ } +++ s = _mm256_div_ps(s, ftwos); +++ +++ sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s)); +++ cosine = _mm256_sub_ps(fones, s); +++ +++ condition1 = _mm256_cmp_ps( +++ _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)), +++ fzeroes, +++ _CMP_NEQ_UQ); +++ condition2 = _mm256_cmp_ps( +++ _mm256_cmp_ps( +++ _mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes, _CMP_NEQ_UQ), +++ _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS), +++ _CMP_NEQ_UQ); +++ condition3 = _mm256_cmp_ps( +++ _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, twos), fours)), +++ fzeroes, +++ _CMP_NEQ_UQ); +++ +++ __m256 temp = cosine; +++ cosine = +++ _mm256_add_ps(cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1)); +++ sine = _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(temp, sine), condition1)); +++ sine = _mm256_sub_ps( +++ sine, _mm256_and_ps(_mm256_mul_ps(sine, _mm256_set1_ps(2.0f)), condition2)); +++ cosine = _mm256_sub_ps( +++ cosine, +++ _mm256_and_ps(_mm256_mul_ps(cosine, _mm256_set1_ps(2.0f)), condition3)); +++ tangent = _mm256_div_ps(sine, cosine); +++ _mm256_store_ps(bPtr, tangent); +++ aPtr += 8; +++ bPtr += 8; +++ } +++ +++ number = eighthPoints * 8; +++ for (; number < num_points; number++) { +++ *bPtr++ = tan(*aPtr++); ++ } ++- s = _mm256_div_ps(s, ftwos); ++- ++- sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s)); ++- cosine = _mm256_sub_ps(fones, s); ++- ++- condition1 = _mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)), fzeroes, _CMP_NEQ_UQ); ++- condition2 = _mm256_cmp_ps(_mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes, _CMP_NEQ_UQ), _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS), _CMP_NEQ_UQ); ++- condition3 = _mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, twos), fours)), fzeroes, _CMP_NEQ_UQ); ++- ++- __m256 temp = cosine; ++- cosine = _mm256_add_ps(cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1)); ++- sine = _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(temp, sine), condition1)); ++- sine = _mm256_sub_ps(sine, _mm256_and_ps(_mm256_mul_ps(sine, _mm256_set1_ps(2.0f)), condition2)); ++- cosine = _mm256_sub_ps(cosine, _mm256_and_ps(_mm256_mul_ps(cosine, _mm256_set1_ps(2.0f)), condition3)); ++- tangent = _mm256_div_ps(sine, cosine); ++- _mm256_store_ps(bPtr, tangent); ++- aPtr += 8; ++- bPtr += 8; ++- } ++- ++- number = eighthPoints * 8; ++- for(;number < num_points; number++){ ++- *bPtr++ = tan(*aPtr++); ++- } ++ } ++ ++ #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for aligned */ ++@@ -162,78 +186,109 @@ volk_32f_tan_32f_a_avx2_fma(float* bVector, const float* aVector, ++ #include ++ ++ static inline void ++-volk_32f_tan_32f_a_avx2(float* bVector, const float* aVector, ++- unsigned int num_points) +++volk_32f_tan_32f_a_avx2(float* bVector, const float* aVector, unsigned int num_points) ++ { ++- float* bPtr = bVector; ++- const float* aPtr = aVector; ++- ++- unsigned int number = 0; ++- unsigned int eighthPoints = num_points / 8; ++- unsigned int i = 0; ++- ++- __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, fzeroes; ++- __m256 sine, cosine, 
tangent, condition1, condition2, condition3; ++- __m256i q, r, ones, twos, fours; ++- ++- m4pi = _mm256_set1_ps(1.273239545); ++- pio4A = _mm256_set1_ps(0.78515625); ++- pio4B = _mm256_set1_ps(0.241876e-3); ++- ffours = _mm256_set1_ps(4.0); ++- ftwos = _mm256_set1_ps(2.0); ++- fones = _mm256_set1_ps(1.0); ++- fzeroes = _mm256_setzero_ps(); ++- ones = _mm256_set1_epi32(1); ++- twos = _mm256_set1_epi32(2); ++- fours = _mm256_set1_epi32(4); ++- ++- cp1 = _mm256_set1_ps(1.0); ++- cp2 = _mm256_set1_ps(0.83333333e-1); ++- cp3 = _mm256_set1_ps(0.2777778e-2); ++- cp4 = _mm256_set1_ps(0.49603e-4); ++- cp5 = _mm256_set1_ps(0.551e-6); ++- ++- for(;number < eighthPoints; number++){ ++- aVal = _mm256_load_ps(aPtr); ++- s = _mm256_sub_ps(aVal, _mm256_and_ps(_mm256_mul_ps(aVal, ftwos), _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS))); ++- q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi))); ++- r = _mm256_add_epi32(q, _mm256_and_si256(q, ones)); ++- ++- s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4A)); ++- s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4B)); ++- ++- s = _mm256_div_ps(s, _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction ++- s = _mm256_mul_ps(s, s); ++- // Evaluate Taylor series ++- s = _mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(s, cp5), cp4), s), cp3), s), cp2), s), cp1), s); ++- ++- for(i = 0; i < 3; i++){ ++- s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s)); +++ float* bPtr = bVector; +++ const float* aPtr = aVector; +++ +++ unsigned int number = 0; +++ unsigned int eighthPoints = num_points / 8; +++ unsigned int i = 0; +++ +++ __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, +++ fzeroes; +++ __m256 sine, cosine, tangent, condition1, condition2, condition3; +++ __m256i q, r, ones, twos, fours; +++ +++ m4pi = _mm256_set1_ps(1.273239545); +++ pio4A = _mm256_set1_ps(0.78515625); +++ pio4B = _mm256_set1_ps(0.241876e-3); +++ ffours = _mm256_set1_ps(4.0); +++ ftwos = _mm256_set1_ps(2.0); +++ fones = _mm256_set1_ps(1.0); +++ fzeroes = _mm256_setzero_ps(); +++ ones = _mm256_set1_epi32(1); +++ twos = _mm256_set1_epi32(2); +++ fours = _mm256_set1_epi32(4); +++ +++ cp1 = _mm256_set1_ps(1.0); +++ cp2 = _mm256_set1_ps(0.83333333e-1); +++ cp3 = _mm256_set1_ps(0.2777778e-2); +++ cp4 = _mm256_set1_ps(0.49603e-4); +++ cp5 = _mm256_set1_ps(0.551e-6); +++ +++ for (; number < eighthPoints; number++) { +++ aVal = _mm256_load_ps(aPtr); +++ s = _mm256_sub_ps(aVal, +++ _mm256_and_ps(_mm256_mul_ps(aVal, ftwos), +++ _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS))); +++ q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi))); +++ r = _mm256_add_epi32(q, _mm256_and_si256(q, ones)); +++ +++ s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4A)); +++ s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4B)); +++ +++ s = _mm256_div_ps( +++ s, +++ _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction +++ s = _mm256_mul_ps(s, s); +++ // Evaluate Taylor series +++ s = _mm256_mul_ps( +++ _mm256_add_ps( +++ _mm256_mul_ps( +++ _mm256_sub_ps( +++ _mm256_mul_ps( +++ _mm256_add_ps( +++ _mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(s, cp5), cp4), +++ s), +++ cp3), +++ s), +++ cp2), +++ s), +++ cp1), +++ s); +++ +++ for (i = 0; i < 3; i++) { +++ s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s)); +++ } +++ s = _mm256_div_ps(s, ftwos); +++ +++ sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s)); +++ 
cosine = _mm256_sub_ps(fones, s); +++ +++ condition1 = _mm256_cmp_ps( +++ _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)), +++ fzeroes, +++ _CMP_NEQ_UQ); +++ condition2 = _mm256_cmp_ps( +++ _mm256_cmp_ps( +++ _mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes, _CMP_NEQ_UQ), +++ _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS), +++ _CMP_NEQ_UQ); +++ condition3 = _mm256_cmp_ps( +++ _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, twos), fours)), +++ fzeroes, +++ _CMP_NEQ_UQ); +++ +++ __m256 temp = cosine; +++ cosine = +++ _mm256_add_ps(cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1)); +++ sine = _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(temp, sine), condition1)); +++ sine = _mm256_sub_ps( +++ sine, _mm256_and_ps(_mm256_mul_ps(sine, _mm256_set1_ps(2.0f)), condition2)); +++ cosine = _mm256_sub_ps( +++ cosine, +++ _mm256_and_ps(_mm256_mul_ps(cosine, _mm256_set1_ps(2.0f)), condition3)); +++ tangent = _mm256_div_ps(sine, cosine); +++ _mm256_store_ps(bPtr, tangent); +++ aPtr += 8; +++ bPtr += 8; +++ } +++ +++ number = eighthPoints * 8; +++ for (; number < num_points; number++) { +++ *bPtr++ = tan(*aPtr++); ++ } ++- s = _mm256_div_ps(s, ftwos); ++- ++- sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s)); ++- cosine = _mm256_sub_ps(fones, s); ++- ++- condition1 = _mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)), fzeroes, _CMP_NEQ_UQ); ++- condition2 = _mm256_cmp_ps(_mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes, _CMP_NEQ_UQ), _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS), _CMP_NEQ_UQ); ++- condition3 = _mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, twos), fours)), fzeroes, _CMP_NEQ_UQ); ++- ++- __m256 temp = cosine; ++- cosine = _mm256_add_ps(cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1)); ++- sine = _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(temp, sine), condition1)); ++- sine = _mm256_sub_ps(sine, _mm256_and_ps(_mm256_mul_ps(sine, _mm256_set1_ps(2.0f)), condition2)); ++- cosine = _mm256_sub_ps(cosine, _mm256_and_ps(_mm256_mul_ps(cosine, _mm256_set1_ps(2.0f)), condition3)); ++- tangent = _mm256_div_ps(sine, cosine); ++- _mm256_store_ps(bPtr, tangent); ++- aPtr += 8; ++- bPtr += 8; ++- } ++- ++- number = eighthPoints * 8; ++- for(;number < num_points; number++){ ++- *bPtr++ = tan(*aPtr++); ++- } ++ } ++ ++ #endif /* LV_HAVE_AVX2 for aligned */ ++@@ -242,78 +297,97 @@ volk_32f_tan_32f_a_avx2(float* bVector, const float* aVector, ++ #include ++ ++ static inline void ++-volk_32f_tan_32f_a_sse4_1(float* bVector, const float* aVector, ++- unsigned int num_points) +++volk_32f_tan_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int num_points) ++ { ++- float* bPtr = bVector; ++- const float* aPtr = aVector; ++- ++- unsigned int number = 0; ++- unsigned int quarterPoints = num_points / 4; ++- unsigned int i = 0; ++- ++- __m128 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, fzeroes; ++- __m128 sine, cosine, tangent, condition1, condition2, condition3; ++- __m128i q, r, ones, twos, fours; ++- ++- m4pi = _mm_set1_ps(1.273239545); ++- pio4A = _mm_set1_ps(0.78515625); ++- pio4B = _mm_set1_ps(0.241876e-3); ++- ffours = _mm_set1_ps(4.0); ++- ftwos = _mm_set1_ps(2.0); ++- fones = _mm_set1_ps(1.0); ++- fzeroes = _mm_setzero_ps(); ++- ones = _mm_set1_epi32(1); ++- twos = _mm_set1_epi32(2); ++- fours = _mm_set1_epi32(4); ++- ++- cp1 = _mm_set1_ps(1.0); ++- cp2 = _mm_set1_ps(0.83333333e-1); ++- cp3 = 
_mm_set1_ps(0.2777778e-2); ++- cp4 = _mm_set1_ps(0.49603e-4); ++- cp5 = _mm_set1_ps(0.551e-6); ++- ++- for(;number < quarterPoints; number++){ ++- aVal = _mm_load_ps(aPtr); ++- s = _mm_sub_ps(aVal, _mm_and_ps(_mm_mul_ps(aVal, ftwos), _mm_cmplt_ps(aVal, fzeroes))); ++- q = _mm_cvtps_epi32(_mm_floor_ps(_mm_mul_ps(s, m4pi))); ++- r = _mm_add_epi32(q, _mm_and_si128(q, ones)); ++- ++- s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4A)); ++- s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4B)); ++- ++- s = _mm_div_ps(s, _mm_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction ++- s = _mm_mul_ps(s, s); ++- // Evaluate Taylor series ++- s = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s, cp5), cp4), s), cp3), s), cp2), s), cp1), s); ++- ++- for(i = 0; i < 3; i++){ ++- s = _mm_mul_ps(s, _mm_sub_ps(ffours, s)); +++ float* bPtr = bVector; +++ const float* aPtr = aVector; +++ +++ unsigned int number = 0; +++ unsigned int quarterPoints = num_points / 4; +++ unsigned int i = 0; +++ +++ __m128 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, +++ fzeroes; +++ __m128 sine, cosine, tangent, condition1, condition2, condition3; +++ __m128i q, r, ones, twos, fours; +++ +++ m4pi = _mm_set1_ps(1.273239545); +++ pio4A = _mm_set1_ps(0.78515625); +++ pio4B = _mm_set1_ps(0.241876e-3); +++ ffours = _mm_set1_ps(4.0); +++ ftwos = _mm_set1_ps(2.0); +++ fones = _mm_set1_ps(1.0); +++ fzeroes = _mm_setzero_ps(); +++ ones = _mm_set1_epi32(1); +++ twos = _mm_set1_epi32(2); +++ fours = _mm_set1_epi32(4); +++ +++ cp1 = _mm_set1_ps(1.0); +++ cp2 = _mm_set1_ps(0.83333333e-1); +++ cp3 = _mm_set1_ps(0.2777778e-2); +++ cp4 = _mm_set1_ps(0.49603e-4); +++ cp5 = _mm_set1_ps(0.551e-6); +++ +++ for (; number < quarterPoints; number++) { +++ aVal = _mm_load_ps(aPtr); +++ s = _mm_sub_ps(aVal, +++ _mm_and_ps(_mm_mul_ps(aVal, ftwos), _mm_cmplt_ps(aVal, fzeroes))); +++ q = _mm_cvtps_epi32(_mm_floor_ps(_mm_mul_ps(s, m4pi))); +++ r = _mm_add_epi32(q, _mm_and_si128(q, ones)); +++ +++ s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4A)); +++ s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4B)); +++ +++ s = _mm_div_ps( +++ s, _mm_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction +++ s = _mm_mul_ps(s, s); +++ // Evaluate Taylor series +++ s = _mm_mul_ps( +++ _mm_add_ps( +++ _mm_mul_ps( +++ _mm_sub_ps( +++ _mm_mul_ps( +++ _mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s, cp5), cp4), s), +++ cp3), +++ s), +++ cp2), +++ s), +++ cp1), +++ s); +++ +++ for (i = 0; i < 3; i++) { +++ s = _mm_mul_ps(s, _mm_sub_ps(ffours, s)); +++ } +++ s = _mm_div_ps(s, ftwos); +++ +++ sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s)); +++ cosine = _mm_sub_ps(fones, s); +++ +++ condition1 = _mm_cmpneq_ps( +++ _mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, ones), twos)), fzeroes); +++ condition2 = _mm_cmpneq_ps( +++ _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(q, fours)), fzeroes), +++ _mm_cmplt_ps(aVal, fzeroes)); +++ condition3 = _mm_cmpneq_ps( +++ _mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, twos), fours)), fzeroes); +++ +++ __m128 temp = cosine; +++ cosine = _mm_add_ps(cosine, _mm_and_ps(_mm_sub_ps(sine, cosine), condition1)); +++ sine = _mm_add_ps(sine, _mm_and_ps(_mm_sub_ps(temp, sine), condition1)); +++ sine = +++ _mm_sub_ps(sine, _mm_and_ps(_mm_mul_ps(sine, _mm_set1_ps(2.0f)), condition2)); +++ cosine = _mm_sub_ps( +++ cosine, _mm_and_ps(_mm_mul_ps(cosine, _mm_set1_ps(2.0f)), condition3)); +++ tangent = _mm_div_ps(sine, cosine); 
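/* Once the tangent for this block of four floats is stored and the pointers
 * advance, the vector loop exits after quarterPoints * 4 elements; the 0-3
 * leftover points are then computed with scalar tanf() in the tail loop below,
 * the same main-loop-plus-tail pattern the other SIMD kernels in this header
 * follow. */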
+++ _mm_store_ps(bPtr, tangent); +++ aPtr += 4; +++ bPtr += 4; +++ } +++ +++ number = quarterPoints * 4; +++ for (; number < num_points; number++) { +++ *bPtr++ = tanf(*aPtr++); ++ } ++- s = _mm_div_ps(s, ftwos); ++- ++- sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s)); ++- cosine = _mm_sub_ps(fones, s); ++- ++- condition1 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, ones), twos)), fzeroes); ++- condition2 = _mm_cmpneq_ps(_mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(q, fours)), fzeroes), _mm_cmplt_ps(aVal, fzeroes)); ++- condition3 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, twos), fours)), fzeroes); ++- ++- __m128 temp = cosine; ++- cosine = _mm_add_ps(cosine, _mm_and_ps(_mm_sub_ps(sine, cosine), condition1)); ++- sine = _mm_add_ps(sine, _mm_and_ps(_mm_sub_ps(temp, sine), condition1)); ++- sine = _mm_sub_ps(sine, _mm_and_ps(_mm_mul_ps(sine, _mm_set1_ps(2.0f)), condition2)); ++- cosine = _mm_sub_ps(cosine, _mm_and_ps(_mm_mul_ps(cosine, _mm_set1_ps(2.0f)), condition3)); ++- tangent = _mm_div_ps(sine, cosine); ++- _mm_store_ps(bPtr, tangent); ++- aPtr += 4; ++- bPtr += 4; ++- } ++- ++- number = quarterPoints * 4; ++- for(;number < num_points; number++){ ++- *bPtr++ = tanf(*aPtr++); ++- } ++ } ++ ++ #endif /* LV_HAVE_SSE4_1 for aligned */ ++@@ -328,78 +402,102 @@ volk_32f_tan_32f_a_sse4_1(float* bVector, const float* aVector, ++ #include ++ ++ static inline void ++-volk_32f_tan_32f_u_avx2_fma(float* bVector, const float* aVector, ++- unsigned int num_points) +++volk_32f_tan_32f_u_avx2_fma(float* bVector, const float* aVector, unsigned int num_points) ++ { ++- float* bPtr = bVector; ++- const float* aPtr = aVector; ++- ++- unsigned int number = 0; ++- unsigned int eighthPoints = num_points / 8; ++- unsigned int i = 0; ++- ++- __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, fzeroes; ++- __m256 sine, cosine, tangent, condition1, condition2, condition3; ++- __m256i q, r, ones, twos, fours; ++- ++- m4pi = _mm256_set1_ps(1.273239545); ++- pio4A = _mm256_set1_ps(0.78515625); ++- pio4B = _mm256_set1_ps(0.241876e-3); ++- ffours = _mm256_set1_ps(4.0); ++- ftwos = _mm256_set1_ps(2.0); ++- fones = _mm256_set1_ps(1.0); ++- fzeroes = _mm256_setzero_ps(); ++- ones = _mm256_set1_epi32(1); ++- twos = _mm256_set1_epi32(2); ++- fours = _mm256_set1_epi32(4); ++- ++- cp1 = _mm256_set1_ps(1.0); ++- cp2 = _mm256_set1_ps(0.83333333e-1); ++- cp3 = _mm256_set1_ps(0.2777778e-2); ++- cp4 = _mm256_set1_ps(0.49603e-4); ++- cp5 = _mm256_set1_ps(0.551e-6); ++- ++- for(;number < eighthPoints; number++){ ++- aVal = _mm256_loadu_ps(aPtr); ++- s = _mm256_sub_ps(aVal, _mm256_and_ps(_mm256_mul_ps(aVal, ftwos), _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS))); ++- q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi))); ++- r = _mm256_add_epi32(q, _mm256_and_si256(q, ones)); ++- ++- s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4A, s); ++- s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4B, s); ++- ++- s = _mm256_div_ps(s, _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction ++- s = _mm256_mul_ps(s, s); ++- // Evaluate Taylor series ++- s = _mm256_mul_ps(_mm256_fmadd_ps(_mm256_fmsub_ps(_mm256_fmadd_ps(_mm256_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2), s, cp1), s); ++- ++- for(i = 0; i < 3; i++){ ++- s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s)); +++ float* bPtr = bVector; +++ const float* aPtr = aVector; +++ +++ unsigned int number = 0; +++ unsigned int eighthPoints = num_points / 8; +++ unsigned int i = 0; +++ +++ __m256 aVal, s, 
m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, +++ fzeroes; +++ __m256 sine, cosine, tangent, condition1, condition2, condition3; +++ __m256i q, r, ones, twos, fours; +++ +++ m4pi = _mm256_set1_ps(1.273239545); +++ pio4A = _mm256_set1_ps(0.78515625); +++ pio4B = _mm256_set1_ps(0.241876e-3); +++ ffours = _mm256_set1_ps(4.0); +++ ftwos = _mm256_set1_ps(2.0); +++ fones = _mm256_set1_ps(1.0); +++ fzeroes = _mm256_setzero_ps(); +++ ones = _mm256_set1_epi32(1); +++ twos = _mm256_set1_epi32(2); +++ fours = _mm256_set1_epi32(4); +++ +++ cp1 = _mm256_set1_ps(1.0); +++ cp2 = _mm256_set1_ps(0.83333333e-1); +++ cp3 = _mm256_set1_ps(0.2777778e-2); +++ cp4 = _mm256_set1_ps(0.49603e-4); +++ cp5 = _mm256_set1_ps(0.551e-6); +++ +++ for (; number < eighthPoints; number++) { +++ aVal = _mm256_loadu_ps(aPtr); +++ s = _mm256_sub_ps(aVal, +++ _mm256_and_ps(_mm256_mul_ps(aVal, ftwos), +++ _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS))); +++ q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi))); +++ r = _mm256_add_epi32(q, _mm256_and_si256(q, ones)); +++ +++ s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4A, s); +++ s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4B, s); +++ +++ s = _mm256_div_ps( +++ s, +++ _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction +++ s = _mm256_mul_ps(s, s); +++ // Evaluate Taylor series +++ s = _mm256_mul_ps( +++ _mm256_fmadd_ps( +++ _mm256_fmsub_ps( +++ _mm256_fmadd_ps(_mm256_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2), +++ s, +++ cp1), +++ s); +++ +++ for (i = 0; i < 3; i++) { +++ s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s)); +++ } +++ s = _mm256_div_ps(s, ftwos); +++ +++ sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s)); +++ cosine = _mm256_sub_ps(fones, s); +++ +++ condition1 = _mm256_cmp_ps( +++ _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)), +++ fzeroes, +++ _CMP_NEQ_UQ); +++ condition2 = _mm256_cmp_ps( +++ _mm256_cmp_ps( +++ _mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes, _CMP_NEQ_UQ), +++ _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS), +++ _CMP_NEQ_UQ); +++ condition3 = _mm256_cmp_ps( +++ _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, twos), fours)), +++ fzeroes, +++ _CMP_NEQ_UQ); +++ +++ __m256 temp = cosine; +++ cosine = +++ _mm256_add_ps(cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1)); +++ sine = _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(temp, sine), condition1)); +++ sine = _mm256_sub_ps( +++ sine, _mm256_and_ps(_mm256_mul_ps(sine, _mm256_set1_ps(2.0f)), condition2)); +++ cosine = _mm256_sub_ps( +++ cosine, +++ _mm256_and_ps(_mm256_mul_ps(cosine, _mm256_set1_ps(2.0f)), condition3)); +++ tangent = _mm256_div_ps(sine, cosine); +++ _mm256_storeu_ps(bPtr, tangent); +++ aPtr += 8; +++ bPtr += 8; +++ } +++ +++ number = eighthPoints * 8; +++ for (; number < num_points; number++) { +++ *bPtr++ = tan(*aPtr++); ++ } ++- s = _mm256_div_ps(s, ftwos); ++- ++- sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s)); ++- cosine = _mm256_sub_ps(fones, s); ++- ++- condition1 = _mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)), fzeroes, _CMP_NEQ_UQ); ++- condition2 = _mm256_cmp_ps(_mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes, _CMP_NEQ_UQ), _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS), _CMP_NEQ_UQ); ++- condition3 = _mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, twos), fours)), fzeroes, _CMP_NEQ_UQ); ++- ++- __m256 temp = cosine; ++- cosine = _mm256_add_ps(cosine, 
_mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1)); ++- sine = _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(temp, sine), condition1)); ++- sine = _mm256_sub_ps(sine, _mm256_and_ps(_mm256_mul_ps(sine, _mm256_set1_ps(2.0f)), condition2)); ++- cosine = _mm256_sub_ps(cosine, _mm256_and_ps(_mm256_mul_ps(cosine, _mm256_set1_ps(2.0f)), condition3)); ++- tangent = _mm256_div_ps(sine, cosine); ++- _mm256_storeu_ps(bPtr, tangent); ++- aPtr += 8; ++- bPtr += 8; ++- } ++- ++- number = eighthPoints * 8; ++- for(;number < num_points; number++){ ++- *bPtr++ = tan(*aPtr++); ++- } ++ } ++ ++ #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for unaligned */ ++@@ -408,78 +506,109 @@ volk_32f_tan_32f_u_avx2_fma(float* bVector, const float* aVector, ++ #include ++ ++ static inline void ++-volk_32f_tan_32f_u_avx2(float* bVector, const float* aVector, ++- unsigned int num_points) +++volk_32f_tan_32f_u_avx2(float* bVector, const float* aVector, unsigned int num_points) ++ { ++- float* bPtr = bVector; ++- const float* aPtr = aVector; ++- ++- unsigned int number = 0; ++- unsigned int eighthPoints = num_points / 8; ++- unsigned int i = 0; ++- ++- __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, fzeroes; ++- __m256 sine, cosine, tangent, condition1, condition2, condition3; ++- __m256i q, r, ones, twos, fours; ++- ++- m4pi = _mm256_set1_ps(1.273239545); ++- pio4A = _mm256_set1_ps(0.78515625); ++- pio4B = _mm256_set1_ps(0.241876e-3); ++- ffours = _mm256_set1_ps(4.0); ++- ftwos = _mm256_set1_ps(2.0); ++- fones = _mm256_set1_ps(1.0); ++- fzeroes = _mm256_setzero_ps(); ++- ones = _mm256_set1_epi32(1); ++- twos = _mm256_set1_epi32(2); ++- fours = _mm256_set1_epi32(4); ++- ++- cp1 = _mm256_set1_ps(1.0); ++- cp2 = _mm256_set1_ps(0.83333333e-1); ++- cp3 = _mm256_set1_ps(0.2777778e-2); ++- cp4 = _mm256_set1_ps(0.49603e-4); ++- cp5 = _mm256_set1_ps(0.551e-6); ++- ++- for(;number < eighthPoints; number++){ ++- aVal = _mm256_loadu_ps(aPtr); ++- s = _mm256_sub_ps(aVal, _mm256_and_ps(_mm256_mul_ps(aVal, ftwos), _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS))); ++- q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi))); ++- r = _mm256_add_epi32(q, _mm256_and_si256(q, ones)); ++- ++- s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4A)); ++- s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4B)); ++- ++- s = _mm256_div_ps(s, _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction ++- s = _mm256_mul_ps(s, s); ++- // Evaluate Taylor series ++- s = _mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(s, cp5), cp4), s), cp3), s), cp2), s), cp1), s); ++- ++- for(i = 0; i < 3; i++){ ++- s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s)); +++ float* bPtr = bVector; +++ const float* aPtr = aVector; +++ +++ unsigned int number = 0; +++ unsigned int eighthPoints = num_points / 8; +++ unsigned int i = 0; +++ +++ __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, +++ fzeroes; +++ __m256 sine, cosine, tangent, condition1, condition2, condition3; +++ __m256i q, r, ones, twos, fours; +++ +++ m4pi = _mm256_set1_ps(1.273239545); +++ pio4A = _mm256_set1_ps(0.78515625); +++ pio4B = _mm256_set1_ps(0.241876e-3); +++ ffours = _mm256_set1_ps(4.0); +++ ftwos = _mm256_set1_ps(2.0); +++ fones = _mm256_set1_ps(1.0); +++ fzeroes = _mm256_setzero_ps(); +++ ones = _mm256_set1_epi32(1); +++ twos = _mm256_set1_epi32(2); +++ fours = _mm256_set1_epi32(4); +++ +++ cp1 = _mm256_set1_ps(1.0); 
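/* cp1..cp5 are the Taylor coefficients of 2 * (1 - cos(t)) in powers of t^2:
 * 1, 1/12, 1/360, 1/20160 and 1/1814400. The Horner chain below evaluates that
 * series at t = r/8 (the divide by 8.0 flagged as "2^N" in the loop below), and
 * the short for-loop then applies s = s * (4 - s) three times; each pass turns
 * 2 * (1 - cos(t)) into 2 * (1 - cos(2t)), undoing the /8 scaling, so that
 * s / 2 finally equals 1 - cos(r) for the reduced argument r. */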
+++ cp2 = _mm256_set1_ps(0.83333333e-1); +++ cp3 = _mm256_set1_ps(0.2777778e-2); +++ cp4 = _mm256_set1_ps(0.49603e-4); +++ cp5 = _mm256_set1_ps(0.551e-6); +++ +++ for (; number < eighthPoints; number++) { +++ aVal = _mm256_loadu_ps(aPtr); +++ s = _mm256_sub_ps(aVal, +++ _mm256_and_ps(_mm256_mul_ps(aVal, ftwos), +++ _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS))); +++ q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi))); +++ r = _mm256_add_epi32(q, _mm256_and_si256(q, ones)); +++ +++ s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4A)); +++ s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4B)); +++ +++ s = _mm256_div_ps( +++ s, +++ _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction +++ s = _mm256_mul_ps(s, s); +++ // Evaluate Taylor series +++ s = _mm256_mul_ps( +++ _mm256_add_ps( +++ _mm256_mul_ps( +++ _mm256_sub_ps( +++ _mm256_mul_ps( +++ _mm256_add_ps( +++ _mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(s, cp5), cp4), +++ s), +++ cp3), +++ s), +++ cp2), +++ s), +++ cp1), +++ s); +++ +++ for (i = 0; i < 3; i++) { +++ s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s)); +++ } +++ s = _mm256_div_ps(s, ftwos); +++ +++ sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s)); +++ cosine = _mm256_sub_ps(fones, s); +++ +++ condition1 = _mm256_cmp_ps( +++ _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)), +++ fzeroes, +++ _CMP_NEQ_UQ); +++ condition2 = _mm256_cmp_ps( +++ _mm256_cmp_ps( +++ _mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes, _CMP_NEQ_UQ), +++ _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS), +++ _CMP_NEQ_UQ); +++ condition3 = _mm256_cmp_ps( +++ _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, twos), fours)), +++ fzeroes, +++ _CMP_NEQ_UQ); +++ +++ __m256 temp = cosine; +++ cosine = +++ _mm256_add_ps(cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1)); +++ sine = _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(temp, sine), condition1)); +++ sine = _mm256_sub_ps( +++ sine, _mm256_and_ps(_mm256_mul_ps(sine, _mm256_set1_ps(2.0f)), condition2)); +++ cosine = _mm256_sub_ps( +++ cosine, +++ _mm256_and_ps(_mm256_mul_ps(cosine, _mm256_set1_ps(2.0f)), condition3)); +++ tangent = _mm256_div_ps(sine, cosine); +++ _mm256_storeu_ps(bPtr, tangent); +++ aPtr += 8; +++ bPtr += 8; +++ } +++ +++ number = eighthPoints * 8; +++ for (; number < num_points; number++) { +++ *bPtr++ = tan(*aPtr++); ++ } ++- s = _mm256_div_ps(s, ftwos); ++- ++- sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s)); ++- cosine = _mm256_sub_ps(fones, s); ++- ++- condition1 = _mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)), fzeroes, _CMP_NEQ_UQ); ++- condition2 = _mm256_cmp_ps(_mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes, _CMP_NEQ_UQ), _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS), _CMP_NEQ_UQ); ++- condition3 = _mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, twos), fours)), fzeroes, _CMP_NEQ_UQ); ++- ++- __m256 temp = cosine; ++- cosine = _mm256_add_ps(cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1)); ++- sine = _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(temp, sine), condition1)); ++- sine = _mm256_sub_ps(sine, _mm256_and_ps(_mm256_mul_ps(sine, _mm256_set1_ps(2.0f)), condition2)); ++- cosine = _mm256_sub_ps(cosine, _mm256_and_ps(_mm256_mul_ps(cosine, _mm256_set1_ps(2.0f)), condition3)); ++- tangent = _mm256_div_ps(sine, cosine); ++- _mm256_storeu_ps(bPtr, tangent); ++- aPtr += 8; ++- bPtr += 8; ++- } ++- ++- number = eighthPoints 
* 8; ++- for(;number < num_points; number++){ ++- *bPtr++ = tan(*aPtr++); ++- } ++ } ++ ++ #endif /* LV_HAVE_AVX2 for unaligned */ ++@@ -491,75 +620,95 @@ volk_32f_tan_32f_u_avx2(float* bVector, const float* aVector, ++ static inline void ++ volk_32f_tan_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int num_points) ++ { ++- float* bPtr = bVector; ++- const float* aPtr = aVector; ++- ++- unsigned int number = 0; ++- unsigned int quarterPoints = num_points / 4; ++- unsigned int i = 0; ++- ++- __m128 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, fzeroes; ++- __m128 sine, cosine, tangent, condition1, condition2, condition3; ++- __m128i q, r, ones, twos, fours; ++- ++- m4pi = _mm_set1_ps(1.273239545); ++- pio4A = _mm_set1_ps(0.78515625); ++- pio4B = _mm_set1_ps(0.241876e-3); ++- ffours = _mm_set1_ps(4.0); ++- ftwos = _mm_set1_ps(2.0); ++- fones = _mm_set1_ps(1.0); ++- fzeroes = _mm_setzero_ps(); ++- ones = _mm_set1_epi32(1); ++- twos = _mm_set1_epi32(2); ++- fours = _mm_set1_epi32(4); ++- ++- cp1 = _mm_set1_ps(1.0); ++- cp2 = _mm_set1_ps(0.83333333e-1); ++- cp3 = _mm_set1_ps(0.2777778e-2); ++- cp4 = _mm_set1_ps(0.49603e-4); ++- cp5 = _mm_set1_ps(0.551e-6); ++- ++- for(;number < quarterPoints; number++){ ++- aVal = _mm_loadu_ps(aPtr); ++- s = _mm_sub_ps(aVal, _mm_and_ps(_mm_mul_ps(aVal, ftwos), _mm_cmplt_ps(aVal, fzeroes))); ++- q = _mm_cvtps_epi32(_mm_floor_ps(_mm_mul_ps(s, m4pi))); ++- r = _mm_add_epi32(q, _mm_and_si128(q, ones)); ++- ++- s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4A)); ++- s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4B)); ++- ++- s = _mm_div_ps(s, _mm_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction ++- s = _mm_mul_ps(s, s); ++- // Evaluate Taylor series ++- s = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s, cp5), cp4), s), cp3), s), cp2), s), cp1), s); ++- ++- for(i = 0; i < 3; i++){ ++- s = _mm_mul_ps(s, _mm_sub_ps(ffours, s)); +++ float* bPtr = bVector; +++ const float* aPtr = aVector; +++ +++ unsigned int number = 0; +++ unsigned int quarterPoints = num_points / 4; +++ unsigned int i = 0; +++ +++ __m128 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, +++ fzeroes; +++ __m128 sine, cosine, tangent, condition1, condition2, condition3; +++ __m128i q, r, ones, twos, fours; +++ +++ m4pi = _mm_set1_ps(1.273239545); +++ pio4A = _mm_set1_ps(0.78515625); +++ pio4B = _mm_set1_ps(0.241876e-3); +++ ffours = _mm_set1_ps(4.0); +++ ftwos = _mm_set1_ps(2.0); +++ fones = _mm_set1_ps(1.0); +++ fzeroes = _mm_setzero_ps(); +++ ones = _mm_set1_epi32(1); +++ twos = _mm_set1_epi32(2); +++ fours = _mm_set1_epi32(4); +++ +++ cp1 = _mm_set1_ps(1.0); +++ cp2 = _mm_set1_ps(0.83333333e-1); +++ cp3 = _mm_set1_ps(0.2777778e-2); +++ cp4 = _mm_set1_ps(0.49603e-4); +++ cp5 = _mm_set1_ps(0.551e-6); +++ +++ for (; number < quarterPoints; number++) { +++ aVal = _mm_loadu_ps(aPtr); +++ s = _mm_sub_ps(aVal, +++ _mm_and_ps(_mm_mul_ps(aVal, ftwos), _mm_cmplt_ps(aVal, fzeroes))); +++ q = _mm_cvtps_epi32(_mm_floor_ps(_mm_mul_ps(s, m4pi))); +++ r = _mm_add_epi32(q, _mm_and_si128(q, ones)); +++ +++ s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4A)); +++ s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4B)); +++ +++ s = _mm_div_ps( +++ s, _mm_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction +++ s = _mm_mul_ps(s, s); +++ // Evaluate Taylor series +++ s = _mm_mul_ps( +++ _mm_add_ps( +++ _mm_mul_ps( +++ _mm_sub_ps( +++ 
_mm_mul_ps( +++ _mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s, cp5), cp4), s), +++ cp3), +++ s), +++ cp2), +++ s), +++ cp1), +++ s); +++ +++ for (i = 0; i < 3; i++) { +++ s = _mm_mul_ps(s, _mm_sub_ps(ffours, s)); +++ } +++ s = _mm_div_ps(s, ftwos); +++ +++ sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s)); +++ cosine = _mm_sub_ps(fones, s); +++ +++ condition1 = _mm_cmpneq_ps( +++ _mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, ones), twos)), fzeroes); +++ condition2 = _mm_cmpneq_ps( +++ _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(q, fours)), fzeroes), +++ _mm_cmplt_ps(aVal, fzeroes)); +++ condition3 = _mm_cmpneq_ps( +++ _mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, twos), fours)), fzeroes); +++ +++ __m128 temp = cosine; +++ cosine = _mm_add_ps(cosine, _mm_and_ps(_mm_sub_ps(sine, cosine), condition1)); +++ sine = _mm_add_ps(sine, _mm_and_ps(_mm_sub_ps(temp, sine), condition1)); +++ sine = +++ _mm_sub_ps(sine, _mm_and_ps(_mm_mul_ps(sine, _mm_set1_ps(2.0f)), condition2)); +++ cosine = _mm_sub_ps( +++ cosine, _mm_and_ps(_mm_mul_ps(cosine, _mm_set1_ps(2.0f)), condition3)); +++ tangent = _mm_div_ps(sine, cosine); +++ _mm_storeu_ps(bPtr, tangent); +++ aPtr += 4; +++ bPtr += 4; +++ } +++ +++ number = quarterPoints * 4; +++ for (; number < num_points; number++) { +++ *bPtr++ = tanf(*aPtr++); ++ } ++- s = _mm_div_ps(s, ftwos); ++- ++- sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s)); ++- cosine = _mm_sub_ps(fones, s); ++- ++- condition1 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, ones), twos)), fzeroes); ++- condition2 = _mm_cmpneq_ps(_mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(q, fours)), fzeroes), _mm_cmplt_ps(aVal, fzeroes)); ++- condition3 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, twos), fours)), fzeroes); ++- ++- __m128 temp = cosine; ++- cosine = _mm_add_ps(cosine, _mm_and_ps(_mm_sub_ps(sine, cosine), condition1)); ++- sine = _mm_add_ps(sine, _mm_and_ps(_mm_sub_ps(temp, sine), condition1)); ++- sine = _mm_sub_ps(sine, _mm_and_ps(_mm_mul_ps(sine, _mm_set1_ps(2.0f)), condition2)); ++- cosine = _mm_sub_ps(cosine, _mm_and_ps(_mm_mul_ps(cosine, _mm_set1_ps(2.0f)), condition3)); ++- tangent = _mm_div_ps(sine, cosine); ++- _mm_storeu_ps(bPtr, tangent); ++- aPtr += 4; ++- bPtr += 4; ++- } ++- ++- number = quarterPoints * 4; ++- for(;number < num_points; number++){ ++- *bPtr++ = tanf(*aPtr++); ++- } ++ } ++ ++ #endif /* LV_HAVE_SSE4_1 for unaligned */ ++@@ -568,16 +717,15 @@ volk_32f_tan_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int num ++ #ifdef LV_HAVE_GENERIC ++ ++ static inline void ++-volk_32f_tan_32f_generic(float* bVector, const float* aVector, ++- unsigned int num_points) +++volk_32f_tan_32f_generic(float* bVector, const float* aVector, unsigned int num_points) ++ { ++- float* bPtr = bVector; ++- const float* aPtr = aVector; ++- unsigned int number = 0; +++ float* bPtr = bVector; +++ const float* aPtr = aVector; +++ unsigned int number = 0; ++ ++- for(; number < num_points; number++){ ++- *bPtr++ = tanf(*aPtr++); ++- } +++ for (; number < num_points; number++) { +++ *bPtr++ = tanf(*aPtr++); +++ } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++@@ -587,30 +735,29 @@ volk_32f_tan_32f_generic(float* bVector, const float* aVector, ++ #include ++ ++ static inline void ++-volk_32f_tan_32f_neon(float* bVector, const float* aVector, ++- unsigned int num_points) +++volk_32f_tan_32f_neon(float* bVector, const float* aVector, unsigned int num_points) ++ { ++ unsigned int number = 0; ++ unsigned int quarter_points = num_points / 4; ++ float* 
bVectorPtr = bVector; ++ const float* aVectorPtr = aVector; ++- +++ ++ float32x4_t b_vec; ++ float32x4_t a_vec; ++- ++- for(number = 0; number < quarter_points; number++) { +++ +++ for (number = 0; number < quarter_points; number++) { ++ a_vec = vld1q_f32(aVectorPtr); ++ // Prefetch next one, speeds things up ++- __VOLK_PREFETCH(aVectorPtr+4); +++ __VOLK_PREFETCH(aVectorPtr + 4); ++ b_vec = _vtanq_f32(a_vec); ++ vst1q_f32(bVectorPtr, b_vec); ++ // move pointers ahead ++- bVectorPtr+=4; ++- aVectorPtr+=4; +++ bVectorPtr += 4; +++ aVectorPtr += 4; ++ } ++- +++ ++ // Deal with the rest ++- for(number = quarter_points * 4; number < num_points; number++) { +++ for (number = quarter_points * 4; number < num_points; number++) { ++ *bVectorPtr++ = tanf(*aVectorPtr++); ++ } ++ } ++diff --git a/kernels/volk/volk_32f_tanh_32f.h b/kernels/volk/volk_32f_tanh_32f.h ++index d49432d..f157d39 100644 ++--- a/kernels/volk/volk_32f_tanh_32f.h +++++ b/kernels/volk/volk_32f_tanh_32f.h ++@@ -69,22 +69,21 @@ ++ #define INCLUDED_volk_32f_tanh_32f_a_H ++ ++ #include ++-#include ++ #include +++#include ++ #include ++ ++ #ifdef LV_HAVE_GENERIC ++ ++ static inline void ++-volk_32f_tanh_32f_generic(float* cVector, const float* aVector, ++- unsigned int num_points) +++volk_32f_tanh_32f_generic(float* cVector, const float* aVector, unsigned int num_points) ++ { ++- unsigned int number = 0; ++- float* cPtr = cVector; ++- const float* aPtr = aVector; ++- for(; number < num_points; number++) { ++- *cPtr++ = tanhf(*aPtr++); ++- } +++ unsigned int number = 0; +++ float* cPtr = cVector; +++ const float* aPtr = aVector; +++ for (; number < num_points; number++) { +++ *cPtr++ = tanhf(*aPtr++); +++ } ++ } ++ ++ #endif /* LV_HAVE_GENERIC */ ++@@ -93,81 +92,88 @@ volk_32f_tanh_32f_generic(float* cVector, const float* aVector, ++ #ifdef LV_HAVE_GENERIC ++ ++ static inline void ++-volk_32f_tanh_32f_series(float* cVector, const float* aVector, ++- unsigned int num_points) +++volk_32f_tanh_32f_series(float* cVector, const float* aVector, unsigned int num_points) ++ { ++- unsigned int number = 0; ++- float* cPtr = cVector; ++- const float* aPtr = aVector; ++- for(; number < num_points; number++) { ++- if(*aPtr > 4.97) ++- *cPtr++ = 1; ++- else if(*aPtr <= -4.97) ++- *cPtr++ = -1; ++- else { ++- float x2 = (*aPtr) * (*aPtr); ++- float a = (*aPtr) * (135135.0f + x2 * (17325.0f + x2 * (378.0f + x2))); ++- float b = 135135.0f + x2 * (62370.0f + x2 * (3150.0f + x2 * 28.0f)); ++- *cPtr++ = a / b; ++- aPtr++; +++ unsigned int number = 0; +++ float* cPtr = cVector; +++ const float* aPtr = aVector; +++ for (; number < num_points; number++) { +++ if (*aPtr > 4.97) +++ *cPtr++ = 1; +++ else if (*aPtr <= -4.97) +++ *cPtr++ = -1; +++ else { +++ float x2 = (*aPtr) * (*aPtr); +++ float a = (*aPtr) * (135135.0f + x2 * (17325.0f + x2 * (378.0f + x2))); +++ float b = 135135.0f + x2 * (62370.0f + x2 * (3150.0f + x2 * 28.0f)); +++ *cPtr++ = a / b; +++ aPtr++; +++ } ++ } ++- } ++ } ++ ++ #endif /* LV_HAVE_GENERIC */ ++ ++ ++- ++ #ifdef LV_HAVE_SSE ++ #include ++ ++ static inline void ++-volk_32f_tanh_32f_a_sse(float* cVector, const float* aVector, ++- unsigned int num_points) +++volk_32f_tanh_32f_a_sse(float* cVector, const float* aVector, unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int quarterPoints = num_points / 4; ++- ++- float* cPtr = cVector; ++- const float* aPtr = aVector; ++- ++- __m128 aVal, cVal, x2, a, b; ++- __m128 const1, const2, const3, const4, const5, const6; ++- const1 = _mm_set_ps1(135135.0f); ++- const2 = 
_mm_set_ps1(17325.0f); ++- const3 = _mm_set_ps1(378.0f); ++- const4 = _mm_set_ps1(62370.0f); ++- const5 = _mm_set_ps1(3150.0f); ++- const6 = _mm_set_ps1(28.0f); ++- for(;number < quarterPoints; number++){ ++- ++- aVal = _mm_load_ps(aPtr); ++- x2 = _mm_mul_ps(aVal, aVal); ++- a = _mm_mul_ps(aVal, _mm_add_ps(const1, _mm_mul_ps(x2, _mm_add_ps(const2, _mm_mul_ps(x2, _mm_add_ps(const3, x2)))))); ++- b = _mm_add_ps(const1, _mm_mul_ps(x2, _mm_add_ps(const4, _mm_mul_ps(x2, _mm_add_ps(const5, _mm_mul_ps(x2, const6)))))); ++- ++- cVal = _mm_div_ps(a, b); ++- ++- _mm_store_ps(cPtr, cVal); // Store the results back into the C container ++- ++- aPtr += 4; ++- cPtr += 4; ++- } ++- ++- number = quarterPoints * 4; ++- for(;number < num_points; number++) { ++- if(*aPtr > 4.97) ++- *cPtr++ = 1; ++- else if(*aPtr <= -4.97) ++- *cPtr++ = -1; ++- else { ++- float x2 = (*aPtr) * (*aPtr); ++- float a = (*aPtr) * (135135.0f + x2 * (17325.0f + x2 * (378.0f + x2))); ++- float b = 135135.0f + x2 * (62370.0f + x2 * (3150.0f + x2 * 28.0f)); ++- *cPtr++ = a / b; ++- aPtr++; +++ unsigned int number = 0; +++ const unsigned int quarterPoints = num_points / 4; +++ +++ float* cPtr = cVector; +++ const float* aPtr = aVector; +++ +++ __m128 aVal, cVal, x2, a, b; +++ __m128 const1, const2, const3, const4, const5, const6; +++ const1 = _mm_set_ps1(135135.0f); +++ const2 = _mm_set_ps1(17325.0f); +++ const3 = _mm_set_ps1(378.0f); +++ const4 = _mm_set_ps1(62370.0f); +++ const5 = _mm_set_ps1(3150.0f); +++ const6 = _mm_set_ps1(28.0f); +++ for (; number < quarterPoints; number++) { +++ +++ aVal = _mm_load_ps(aPtr); +++ x2 = _mm_mul_ps(aVal, aVal); +++ a = _mm_mul_ps( +++ aVal, +++ _mm_add_ps( +++ const1, +++ _mm_mul_ps(x2, +++ _mm_add_ps(const2, _mm_mul_ps(x2, _mm_add_ps(const3, x2)))))); +++ b = _mm_add_ps( +++ const1, +++ _mm_mul_ps( +++ x2, +++ _mm_add_ps(const4, +++ _mm_mul_ps(x2, _mm_add_ps(const5, _mm_mul_ps(x2, const6)))))); +++ +++ cVal = _mm_div_ps(a, b); +++ +++ _mm_store_ps(cPtr, cVal); // Store the results back into the C container +++ +++ aPtr += 4; +++ cPtr += 4; +++ } +++ +++ number = quarterPoints * 4; +++ for (; number < num_points; number++) { +++ if (*aPtr > 4.97) +++ *cPtr++ = 1; +++ else if (*aPtr <= -4.97) +++ *cPtr++ = -1; +++ else { +++ float x2 = (*aPtr) * (*aPtr); +++ float a = (*aPtr) * (135135.0f + x2 * (17325.0f + x2 * (378.0f + x2))); +++ float b = 135135.0f + x2 * (62370.0f + x2 * (3150.0f + x2 * 28.0f)); +++ *cPtr++ = a / b; +++ aPtr++; +++ } ++ } ++- } ++ } ++ #endif /* LV_HAVE_SSE */ ++ ++@@ -176,52 +182,65 @@ volk_32f_tanh_32f_a_sse(float* cVector, const float* aVector, ++ #include ++ ++ static inline void ++-volk_32f_tanh_32f_a_avx(float* cVector, const float* aVector, ++- unsigned int num_points) +++volk_32f_tanh_32f_a_avx(float* cVector, const float* aVector, unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int eighthPoints = num_points / 8; ++- ++- float* cPtr = cVector; ++- const float* aPtr = aVector; ++- ++- __m256 aVal, cVal, x2, a, b; ++- __m256 const1, const2, const3, const4, const5, const6; ++- const1 = _mm256_set1_ps(135135.0f); ++- const2 = _mm256_set1_ps(17325.0f); ++- const3 = _mm256_set1_ps(378.0f); ++- const4 = _mm256_set1_ps(62370.0f); ++- const5 = _mm256_set1_ps(3150.0f); ++- const6 = _mm256_set1_ps(28.0f); ++- for(;number < eighthPoints; number++){ ++- ++- aVal = _mm256_load_ps(aPtr); ++- x2 = _mm256_mul_ps(aVal, aVal); ++- a = _mm256_mul_ps(aVal, _mm256_add_ps(const1, _mm256_mul_ps(x2, _mm256_add_ps(const2, _mm256_mul_ps(x2, _mm256_add_ps(const3, 
x2)))))); ++- b = _mm256_add_ps(const1, _mm256_mul_ps(x2, _mm256_add_ps(const4, _mm256_mul_ps(x2, _mm256_add_ps(const5, _mm256_mul_ps(x2, const6)))))); ++- ++- cVal = _mm256_div_ps(a, b); ++- ++- _mm256_store_ps(cPtr, cVal); // Store the results back into the C container ++- ++- aPtr += 8; ++- cPtr += 8; ++- } ++- ++- number = eighthPoints * 8; ++- for(;number < num_points; number++) { ++- if(*aPtr > 4.97) ++- *cPtr++ = 1; ++- else if(*aPtr <= -4.97) ++- *cPtr++ = -1; ++- else { ++- float x2 = (*aPtr) * (*aPtr); ++- float a = (*aPtr) * (135135.0f + x2 * (17325.0f + x2 * (378.0f + x2))); ++- float b = 135135.0f + x2 * (62370.0f + x2 * (3150.0f + x2 * 28.0f)); ++- *cPtr++ = a / b; ++- aPtr++; +++ unsigned int number = 0; +++ const unsigned int eighthPoints = num_points / 8; +++ +++ float* cPtr = cVector; +++ const float* aPtr = aVector; +++ +++ __m256 aVal, cVal, x2, a, b; +++ __m256 const1, const2, const3, const4, const5, const6; +++ const1 = _mm256_set1_ps(135135.0f); +++ const2 = _mm256_set1_ps(17325.0f); +++ const3 = _mm256_set1_ps(378.0f); +++ const4 = _mm256_set1_ps(62370.0f); +++ const5 = _mm256_set1_ps(3150.0f); +++ const6 = _mm256_set1_ps(28.0f); +++ for (; number < eighthPoints; number++) { +++ +++ aVal = _mm256_load_ps(aPtr); +++ x2 = _mm256_mul_ps(aVal, aVal); +++ a = _mm256_mul_ps( +++ aVal, +++ _mm256_add_ps( +++ const1, +++ _mm256_mul_ps( +++ x2, +++ _mm256_add_ps(const2, +++ _mm256_mul_ps(x2, _mm256_add_ps(const3, x2)))))); +++ b = _mm256_add_ps( +++ const1, +++ _mm256_mul_ps( +++ x2, +++ _mm256_add_ps( +++ const4, +++ _mm256_mul_ps(x2, +++ _mm256_add_ps(const5, _mm256_mul_ps(x2, const6)))))); +++ +++ cVal = _mm256_div_ps(a, b); +++ +++ _mm256_store_ps(cPtr, cVal); // Store the results back into the C container +++ +++ aPtr += 8; +++ cPtr += 8; +++ } +++ +++ number = eighthPoints * 8; +++ for (; number < num_points; number++) { +++ if (*aPtr > 4.97) +++ *cPtr++ = 1; +++ else if (*aPtr <= -4.97) +++ *cPtr++ = -1; +++ else { +++ float x2 = (*aPtr) * (*aPtr); +++ float a = (*aPtr) * (135135.0f + x2 * (17325.0f + x2 * (378.0f + x2))); +++ float b = 135135.0f + x2 * (62370.0f + x2 * (3150.0f + x2 * 28.0f)); +++ *cPtr++ = a / b; +++ aPtr++; +++ } ++ } ++- } ++ } ++ #endif /* LV_HAVE_AVX */ ++ ++@@ -229,52 +248,55 @@ volk_32f_tanh_32f_a_avx(float* cVector, const float* aVector, ++ #include ++ ++ static inline void ++-volk_32f_tanh_32f_a_avx_fma(float* cVector, const float* aVector, ++- unsigned int num_points) +++volk_32f_tanh_32f_a_avx_fma(float* cVector, const float* aVector, unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int eighthPoints = num_points / 8; ++- ++- float* cPtr = cVector; ++- const float* aPtr = aVector; ++- ++- __m256 aVal, cVal, x2, a, b; ++- __m256 const1, const2, const3, const4, const5, const6; ++- const1 = _mm256_set1_ps(135135.0f); ++- const2 = _mm256_set1_ps(17325.0f); ++- const3 = _mm256_set1_ps(378.0f); ++- const4 = _mm256_set1_ps(62370.0f); ++- const5 = _mm256_set1_ps(3150.0f); ++- const6 = _mm256_set1_ps(28.0f); ++- for(;number < eighthPoints; number++){ ++- ++- aVal = _mm256_load_ps(aPtr); ++- x2 = _mm256_mul_ps(aVal, aVal); ++- a = _mm256_mul_ps(aVal, _mm256_fmadd_ps(x2, _mm256_fmadd_ps(x2, _mm256_add_ps(const3, x2), const2),const1)); ++- b = _mm256_fmadd_ps(x2, _mm256_fmadd_ps(x2, _mm256_fmadd_ps(x2, const6, const5), const4), const1); ++- ++- cVal = _mm256_div_ps(a, b); ++- ++- _mm256_store_ps(cPtr, cVal); // Store the results back into the C container ++- ++- aPtr += 8; ++- cPtr += 8; ++- } ++- ++- number = eighthPoints 
* 8; ++- for(;number < num_points; number++) { ++- if(*aPtr > 4.97) ++- *cPtr++ = 1; ++- else if(*aPtr <= -4.97) ++- *cPtr++ = -1; ++- else { ++- float x2 = (*aPtr) * (*aPtr); ++- float a = (*aPtr) * (135135.0f + x2 * (17325.0f + x2 * (378.0f + x2))); ++- float b = 135135.0f + x2 * (62370.0f + x2 * (3150.0f + x2 * 28.0f)); ++- *cPtr++ = a / b; ++- aPtr++; +++ unsigned int number = 0; +++ const unsigned int eighthPoints = num_points / 8; +++ +++ float* cPtr = cVector; +++ const float* aPtr = aVector; +++ +++ __m256 aVal, cVal, x2, a, b; +++ __m256 const1, const2, const3, const4, const5, const6; +++ const1 = _mm256_set1_ps(135135.0f); +++ const2 = _mm256_set1_ps(17325.0f); +++ const3 = _mm256_set1_ps(378.0f); +++ const4 = _mm256_set1_ps(62370.0f); +++ const5 = _mm256_set1_ps(3150.0f); +++ const6 = _mm256_set1_ps(28.0f); +++ for (; number < eighthPoints; number++) { +++ +++ aVal = _mm256_load_ps(aPtr); +++ x2 = _mm256_mul_ps(aVal, aVal); +++ a = _mm256_mul_ps( +++ aVal, +++ _mm256_fmadd_ps( +++ x2, _mm256_fmadd_ps(x2, _mm256_add_ps(const3, x2), const2), const1)); +++ b = _mm256_fmadd_ps( +++ x2, _mm256_fmadd_ps(x2, _mm256_fmadd_ps(x2, const6, const5), const4), const1); +++ +++ cVal = _mm256_div_ps(a, b); +++ +++ _mm256_store_ps(cPtr, cVal); // Store the results back into the C container +++ +++ aPtr += 8; +++ cPtr += 8; +++ } +++ +++ number = eighthPoints * 8; +++ for (; number < num_points; number++) { +++ if (*aPtr > 4.97) +++ *cPtr++ = 1; +++ else if (*aPtr <= -4.97) +++ *cPtr++ = -1; +++ else { +++ float x2 = (*aPtr) * (*aPtr); +++ float a = (*aPtr) * (135135.0f + x2 * (17325.0f + x2 * (378.0f + x2))); +++ float b = 135135.0f + x2 * (62370.0f + x2 * (3150.0f + x2 * 28.0f)); +++ *cPtr++ = a / b; +++ aPtr++; +++ } ++ } ++- } ++ } ++ #endif /* LV_HAVE_AVX && LV_HAVE_FMA */ ++ ++@@ -285,8 +307,8 @@ volk_32f_tanh_32f_a_avx_fma(float* cVector, const float* aVector, ++ #define INCLUDED_volk_32f_tanh_32f_u_H ++ ++ #include ++-#include ++ #include +++#include ++ #include ++ ++ ++@@ -294,52 +316,61 @@ volk_32f_tanh_32f_a_avx_fma(float* cVector, const float* aVector, ++ #include ++ ++ static inline void ++-volk_32f_tanh_32f_u_sse(float* cVector, const float* aVector, ++- unsigned int num_points) +++volk_32f_tanh_32f_u_sse(float* cVector, const float* aVector, unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int quarterPoints = num_points / 4; ++- ++- float* cPtr = cVector; ++- const float* aPtr = aVector; ++- ++- __m128 aVal, cVal, x2, a, b; ++- __m128 const1, const2, const3, const4, const5, const6; ++- const1 = _mm_set_ps1(135135.0f); ++- const2 = _mm_set_ps1(17325.0f); ++- const3 = _mm_set_ps1(378.0f); ++- const4 = _mm_set_ps1(62370.0f); ++- const5 = _mm_set_ps1(3150.0f); ++- const6 = _mm_set_ps1(28.0f); ++- for(;number < quarterPoints; number++){ ++- ++- aVal = _mm_loadu_ps(aPtr); ++- x2 = _mm_mul_ps(aVal, aVal); ++- a = _mm_mul_ps(aVal, _mm_add_ps(const1, _mm_mul_ps(x2, _mm_add_ps(const2, _mm_mul_ps(x2, _mm_add_ps(const3, x2)))))); ++- b = _mm_add_ps(const1, _mm_mul_ps(x2, _mm_add_ps(const4, _mm_mul_ps(x2, _mm_add_ps(const5, _mm_mul_ps(x2, const6)))))); ++- ++- cVal = _mm_div_ps(a, b); ++- ++- _mm_storeu_ps(cPtr, cVal); // Store the results back into the C container ++- ++- aPtr += 4; ++- cPtr += 4; ++- } ++- ++- number = quarterPoints * 4; ++- for(;number < num_points; number++) { ++- if(*aPtr > 4.97) ++- *cPtr++ = 1; ++- else if(*aPtr <= -4.97) ++- *cPtr++ = -1; ++- else { ++- float x2 = (*aPtr) * (*aPtr); ++- float a = (*aPtr) * (135135.0f + x2 * (17325.0f + x2 * 
(378.0f + x2))); ++- float b = 135135.0f + x2 * (62370.0f + x2 * (3150.0f + x2 * 28.0f)); ++- *cPtr++ = a / b; ++- aPtr++; +++ unsigned int number = 0; +++ const unsigned int quarterPoints = num_points / 4; +++ +++ float* cPtr = cVector; +++ const float* aPtr = aVector; +++ +++ __m128 aVal, cVal, x2, a, b; +++ __m128 const1, const2, const3, const4, const5, const6; +++ const1 = _mm_set_ps1(135135.0f); +++ const2 = _mm_set_ps1(17325.0f); +++ const3 = _mm_set_ps1(378.0f); +++ const4 = _mm_set_ps1(62370.0f); +++ const5 = _mm_set_ps1(3150.0f); +++ const6 = _mm_set_ps1(28.0f); +++ for (; number < quarterPoints; number++) { +++ +++ aVal = _mm_loadu_ps(aPtr); +++ x2 = _mm_mul_ps(aVal, aVal); +++ a = _mm_mul_ps( +++ aVal, +++ _mm_add_ps( +++ const1, +++ _mm_mul_ps(x2, +++ _mm_add_ps(const2, _mm_mul_ps(x2, _mm_add_ps(const3, x2)))))); +++ b = _mm_add_ps( +++ const1, +++ _mm_mul_ps( +++ x2, +++ _mm_add_ps(const4, +++ _mm_mul_ps(x2, _mm_add_ps(const5, _mm_mul_ps(x2, const6)))))); +++ +++ cVal = _mm_div_ps(a, b); +++ +++ _mm_storeu_ps(cPtr, cVal); // Store the results back into the C container +++ +++ aPtr += 4; +++ cPtr += 4; +++ } +++ +++ number = quarterPoints * 4; +++ for (; number < num_points; number++) { +++ if (*aPtr > 4.97) +++ *cPtr++ = 1; +++ else if (*aPtr <= -4.97) +++ *cPtr++ = -1; +++ else { +++ float x2 = (*aPtr) * (*aPtr); +++ float a = (*aPtr) * (135135.0f + x2 * (17325.0f + x2 * (378.0f + x2))); +++ float b = 135135.0f + x2 * (62370.0f + x2 * (3150.0f + x2 * 28.0f)); +++ *cPtr++ = a / b; +++ aPtr++; +++ } ++ } ++- } ++ } ++ #endif /* LV_HAVE_SSE */ ++ ++@@ -348,52 +379,65 @@ volk_32f_tanh_32f_u_sse(float* cVector, const float* aVector, ++ #include ++ ++ static inline void ++-volk_32f_tanh_32f_u_avx(float* cVector, const float* aVector, ++- unsigned int num_points) +++volk_32f_tanh_32f_u_avx(float* cVector, const float* aVector, unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int eighthPoints = num_points / 8; ++- ++- float* cPtr = cVector; ++- const float* aPtr = aVector; ++- ++- __m256 aVal, cVal, x2, a, b; ++- __m256 const1, const2, const3, const4, const5, const6; ++- const1 = _mm256_set1_ps(135135.0f); ++- const2 = _mm256_set1_ps(17325.0f); ++- const3 = _mm256_set1_ps(378.0f); ++- const4 = _mm256_set1_ps(62370.0f); ++- const5 = _mm256_set1_ps(3150.0f); ++- const6 = _mm256_set1_ps(28.0f); ++- for(;number < eighthPoints; number++){ ++- ++- aVal = _mm256_loadu_ps(aPtr); ++- x2 = _mm256_mul_ps(aVal, aVal); ++- a = _mm256_mul_ps(aVal, _mm256_add_ps(const1, _mm256_mul_ps(x2, _mm256_add_ps(const2, _mm256_mul_ps(x2, _mm256_add_ps(const3, x2)))))); ++- b = _mm256_add_ps(const1, _mm256_mul_ps(x2, _mm256_add_ps(const4, _mm256_mul_ps(x2, _mm256_add_ps(const5, _mm256_mul_ps(x2, const6)))))); ++- ++- cVal = _mm256_div_ps(a, b); ++- ++- _mm256_storeu_ps(cPtr, cVal); // Store the results back into the C container ++- ++- aPtr += 8; ++- cPtr += 8; ++- } ++- ++- number = eighthPoints * 8; ++- for(;number < num_points; number++) { ++- if(*aPtr > 4.97) ++- *cPtr++ = 1; ++- else if(*aPtr <= -4.97) ++- *cPtr++ = -1; ++- else { ++- float x2 = (*aPtr) * (*aPtr); ++- float a = (*aPtr) * (135135.0f + x2 * (17325.0f + x2 * (378.0f + x2))); ++- float b = 135135.0f + x2 * (62370.0f + x2 * (3150.0f + x2 * 28.0f)); ++- *cPtr++ = a / b; ++- aPtr++; +++ unsigned int number = 0; +++ const unsigned int eighthPoints = num_points / 8; +++ +++ float* cPtr = cVector; +++ const float* aPtr = aVector; +++ +++ __m256 aVal, cVal, x2, a, b; +++ __m256 const1, const2, const3, const4, const5, 
const6; +++ const1 = _mm256_set1_ps(135135.0f); +++ const2 = _mm256_set1_ps(17325.0f); +++ const3 = _mm256_set1_ps(378.0f); +++ const4 = _mm256_set1_ps(62370.0f); +++ const5 = _mm256_set1_ps(3150.0f); +++ const6 = _mm256_set1_ps(28.0f); +++ for (; number < eighthPoints; number++) { +++ +++ aVal = _mm256_loadu_ps(aPtr); +++ x2 = _mm256_mul_ps(aVal, aVal); +++ a = _mm256_mul_ps( +++ aVal, +++ _mm256_add_ps( +++ const1, +++ _mm256_mul_ps( +++ x2, +++ _mm256_add_ps(const2, +++ _mm256_mul_ps(x2, _mm256_add_ps(const3, x2)))))); +++ b = _mm256_add_ps( +++ const1, +++ _mm256_mul_ps( +++ x2, +++ _mm256_add_ps( +++ const4, +++ _mm256_mul_ps(x2, +++ _mm256_add_ps(const5, _mm256_mul_ps(x2, const6)))))); +++ +++ cVal = _mm256_div_ps(a, b); +++ +++ _mm256_storeu_ps(cPtr, cVal); // Store the results back into the C container +++ +++ aPtr += 8; +++ cPtr += 8; +++ } +++ +++ number = eighthPoints * 8; +++ for (; number < num_points; number++) { +++ if (*aPtr > 4.97) +++ *cPtr++ = 1; +++ else if (*aPtr <= -4.97) +++ *cPtr++ = -1; +++ else { +++ float x2 = (*aPtr) * (*aPtr); +++ float a = (*aPtr) * (135135.0f + x2 * (17325.0f + x2 * (378.0f + x2))); +++ float b = 135135.0f + x2 * (62370.0f + x2 * (3150.0f + x2 * 28.0f)); +++ *cPtr++ = a / b; +++ aPtr++; +++ } ++ } ++- } ++ } ++ #endif /* LV_HAVE_AVX */ ++ ++@@ -401,52 +445,55 @@ volk_32f_tanh_32f_u_avx(float* cVector, const float* aVector, ++ #include ++ ++ static inline void ++-volk_32f_tanh_32f_u_avx_fma(float* cVector, const float* aVector, ++- unsigned int num_points) +++volk_32f_tanh_32f_u_avx_fma(float* cVector, const float* aVector, unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int eighthPoints = num_points / 8; ++- ++- float* cPtr = cVector; ++- const float* aPtr = aVector; ++- ++- __m256 aVal, cVal, x2, a, b; ++- __m256 const1, const2, const3, const4, const5, const6; ++- const1 = _mm256_set1_ps(135135.0f); ++- const2 = _mm256_set1_ps(17325.0f); ++- const3 = _mm256_set1_ps(378.0f); ++- const4 = _mm256_set1_ps(62370.0f); ++- const5 = _mm256_set1_ps(3150.0f); ++- const6 = _mm256_set1_ps(28.0f); ++- for(;number < eighthPoints; number++){ ++- ++- aVal = _mm256_loadu_ps(aPtr); ++- x2 = _mm256_mul_ps(aVal, aVal); ++- a = _mm256_mul_ps(aVal, _mm256_fmadd_ps(x2, _mm256_fmadd_ps(x2, _mm256_add_ps(const3, x2), const2),const1)); ++- b = _mm256_fmadd_ps(x2, _mm256_fmadd_ps(x2, _mm256_fmadd_ps(x2, const6, const5), const4), const1); ++- ++- cVal = _mm256_div_ps(a, b); ++- ++- _mm256_storeu_ps(cPtr, cVal); // Store the results back into the C container ++- ++- aPtr += 8; ++- cPtr += 8; ++- } ++- ++- number = eighthPoints * 8; ++- for(;number < num_points; number++) { ++- if(*aPtr > 4.97) ++- *cPtr++ = 1; ++- else if(*aPtr <= -4.97) ++- *cPtr++ = -1; ++- else { ++- float x2 = (*aPtr) * (*aPtr); ++- float a = (*aPtr) * (135135.0f + x2 * (17325.0f + x2 * (378.0f + x2))); ++- float b = 135135.0f + x2 * (62370.0f + x2 * (3150.0f + x2 * 28.0f)); ++- *cPtr++ = a / b; ++- aPtr++; +++ unsigned int number = 0; +++ const unsigned int eighthPoints = num_points / 8; +++ +++ float* cPtr = cVector; +++ const float* aPtr = aVector; +++ +++ __m256 aVal, cVal, x2, a, b; +++ __m256 const1, const2, const3, const4, const5, const6; +++ const1 = _mm256_set1_ps(135135.0f); +++ const2 = _mm256_set1_ps(17325.0f); +++ const3 = _mm256_set1_ps(378.0f); +++ const4 = _mm256_set1_ps(62370.0f); +++ const5 = _mm256_set1_ps(3150.0f); +++ const6 = _mm256_set1_ps(28.0f); +++ for (; number < eighthPoints; number++) { +++ +++ aVal = _mm256_loadu_ps(aPtr); +++ x2 = 
_mm256_mul_ps(aVal, aVal); +++ a = _mm256_mul_ps( +++ aVal, +++ _mm256_fmadd_ps( +++ x2, _mm256_fmadd_ps(x2, _mm256_add_ps(const3, x2), const2), const1)); +++ b = _mm256_fmadd_ps( +++ x2, _mm256_fmadd_ps(x2, _mm256_fmadd_ps(x2, const6, const5), const4), const1); +++ +++ cVal = _mm256_div_ps(a, b); +++ +++ _mm256_storeu_ps(cPtr, cVal); // Store the results back into the C container +++ +++ aPtr += 8; +++ cPtr += 8; +++ } +++ +++ number = eighthPoints * 8; +++ for (; number < num_points; number++) { +++ if (*aPtr > 4.97) +++ *cPtr++ = 1; +++ else if (*aPtr <= -4.97) +++ *cPtr++ = -1; +++ else { +++ float x2 = (*aPtr) * (*aPtr); +++ float a = (*aPtr) * (135135.0f + x2 * (17325.0f + x2 * (378.0f + x2))); +++ float b = 135135.0f + x2 * (62370.0f + x2 * (3150.0f + x2 * 28.0f)); +++ *cPtr++ = a / b; +++ aPtr++; +++ } ++ } ++- } ++ } ++ #endif /* LV_HAVE_AVX && LV_HAVE_FMA */ ++ ++diff --git a/kernels/volk/volk_32f_x2_add_32f.h b/kernels/volk/volk_32f_x2_add_32f.h ++index ce18092..e4b7e93 100644 ++--- a/kernels/volk/volk_32f_x2_add_32f.h +++++ b/kernels/volk/volk_32f_x2_add_32f.h ++@@ -31,8 +31,8 @@ ++ * ++ * Dispatcher Prototype ++ * \code ++- * void volk_32f_x2_add_32f(float* cVector, const float* aVector, const float* bVector, unsigned int num_points) ++- * \endcode +++ * void volk_32f_x2_add_32f(float* cVector, const float* aVector, const float* bVector, +++ * unsigned int num_points) \endcode ++ * ++ * \b Inputs ++ * \li aVector: First vector of input points. ++@@ -44,7 +44,8 @@ ++ * ++ * \b Example ++ * ++- * The follow example adds the increasing and decreasing vectors such that the result of every summation pair is 10 +++ * The follow example adds the increasing and decreasing vectors such that the result of +++ * every summation pair is 10 ++ * ++ * \code ++ * int N = 10; ++@@ -79,37 +80,38 @@ ++ #ifdef LV_HAVE_AVX512F ++ #include ++ ++-static inline void ++-volk_32f_x2_add_32f_u_avx512f(float* cVector, const float* aVector, ++- const float* bVector, unsigned int num_points) +++static inline void volk_32f_x2_add_32f_u_avx512f(float* cVector, +++ const float* aVector, +++ const float* bVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int sixteenthPoints = num_points / 16; +++ unsigned int number = 0; +++ const unsigned int sixteenthPoints = num_points / 16; ++ ++- float* cPtr = cVector; ++- const float* aPtr = aVector; ++- const float* bPtr= bVector; +++ float* cPtr = cVector; +++ const float* aPtr = aVector; +++ const float* bPtr = bVector; ++ ++- __m512 aVal, bVal, cVal; ++- for(;number < sixteenthPoints; number++){ +++ __m512 aVal, bVal, cVal; +++ for (; number < sixteenthPoints; number++) { ++ ++- aVal = _mm512_loadu_ps(aPtr); ++- bVal = _mm512_loadu_ps(bPtr); +++ aVal = _mm512_loadu_ps(aPtr); +++ bVal = _mm512_loadu_ps(bPtr); ++ ++- cVal = _mm512_add_ps(aVal, bVal); +++ cVal = _mm512_add_ps(aVal, bVal); ++ ++- _mm512_storeu_ps(cPtr,cVal); // Store the results back into the C container +++ _mm512_storeu_ps(cPtr, cVal); // Store the results back into the C container ++ ++- aPtr += 16; ++- bPtr += 16; ++- cPtr += 16; ++- } +++ aPtr += 16; +++ bPtr += 16; +++ cPtr += 16; +++ } ++ ++- number = sixteenthPoints * 16; +++ number = sixteenthPoints * 16; ++ ++- for(;number < num_points; number++){ ++- *cPtr++ = (*aPtr++) + (*bPtr++); ++- } +++ for (; number < num_points; number++) { +++ *cPtr++ = (*aPtr++) + (*bPtr++); +++ } ++ } ++ ++ #endif /* LV_HAVE_AVX512F */ ++@@ -118,35 +120,36 @@ volk_32f_x2_add_32f_u_avx512f(float* cVector, const float* aVector, 
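The \b Example paragraph in the volk_32f_x2_add_32f.h header above describes adding an increasing and a decreasing vector so that every summation pair equals 10. A minimal standalone sketch of that usage, assuming the usual volk_malloc / volk_get_alignment / volk_free helpers from <volk/volk.h> (buffer names are illustrative only), would look like this:

#include <stdio.h>
#include <volk/volk.h>

int main(void)
{
    unsigned int N = 10;
    size_t alignment = volk_get_alignment();
    float* increasing = (float*)volk_malloc(sizeof(float) * N, alignment);
    float* decreasing = (float*)volk_malloc(sizeof(float) * N, alignment);
    float* out = (float*)volk_malloc(sizeof(float) * N, alignment);

    for (unsigned int ii = 0; ii < N; ii++) {
        increasing[ii] = (float)ii;
        decreasing[ii] = 10.0f - (float)ii;
    }

    /* The dispatcher picks the fastest implementation (AVX512F, AVX, SSE,
     * NEON or generic) that the build and the running CPU support. */
    volk_32f_x2_add_32f(out, increasing, decreasing, N);

    for (unsigned int ii = 0; ii < N; ii++)
        printf("out(%u) = %1.2f\n", ii, out[ii]);

    volk_free(increasing);
    volk_free(decreasing);
    volk_free(out);
    return 0;
}

Because volk_malloc returns suitably aligned buffers, the dispatcher can select the _a_ (aligned) kernels; unaligned pointers fall back to the _u_ variants shown in this hunk.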
++ #ifdef LV_HAVE_AVX ++ #include ++ ++-static inline void ++-volk_32f_x2_add_32f_u_avx(float* cVector, const float* aVector, ++- const float* bVector, unsigned int num_points) +++static inline void volk_32f_x2_add_32f_u_avx(float* cVector, +++ const float* aVector, +++ const float* bVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int eighthPoints = num_points / 8; ++- float* cPtr = cVector; ++- const float* aPtr = aVector; ++- const float* bPtr= bVector; ++- __m256 aVal, bVal, cVal; ++- for(;number < eighthPoints; number++){ +++ unsigned int number = 0; +++ const unsigned int eighthPoints = num_points / 8; +++ float* cPtr = cVector; +++ const float* aPtr = aVector; +++ const float* bPtr = bVector; +++ __m256 aVal, bVal, cVal; +++ for (; number < eighthPoints; number++) { ++ ++- aVal = _mm256_loadu_ps(aPtr); ++- bVal = _mm256_loadu_ps(bPtr); +++ aVal = _mm256_loadu_ps(aPtr); +++ bVal = _mm256_loadu_ps(bPtr); ++ ++- cVal = _mm256_add_ps(aVal, bVal); +++ cVal = _mm256_add_ps(aVal, bVal); ++ ++- _mm256_storeu_ps(cPtr,cVal); // Store the results back into the C container +++ _mm256_storeu_ps(cPtr, cVal); // Store the results back into the C container ++ ++- aPtr += 8; ++- bPtr += 8; ++- cPtr += 8; ++- } +++ aPtr += 8; +++ bPtr += 8; +++ cPtr += 8; +++ } ++ ++- number = eighthPoints * 8; +++ number = eighthPoints * 8; ++ ++- for(;number < num_points; number++){ ++- *cPtr++ = (*aPtr++) + (*bPtr++); ++- } +++ for (; number < num_points; number++) { +++ *cPtr++ = (*aPtr++) + (*bPtr++); +++ } ++ } ++ #endif /* LV_HAVE_AVX */ ++ ++@@ -154,54 +157,56 @@ volk_32f_x2_add_32f_u_avx(float* cVector, const float* aVector, ++ #ifdef LV_HAVE_SSE ++ #include ++ ++-static inline void ++-volk_32f_x2_add_32f_u_sse(float* cVector, const float* aVector, ++- const float* bVector, unsigned int num_points) +++static inline void volk_32f_x2_add_32f_u_sse(float* cVector, +++ const float* aVector, +++ const float* bVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int quarterPoints = num_points / 4; +++ unsigned int number = 0; +++ const unsigned int quarterPoints = num_points / 4; ++ ++- float* cPtr = cVector; ++- const float* aPtr = aVector; ++- const float* bPtr= bVector; +++ float* cPtr = cVector; +++ const float* aPtr = aVector; +++ const float* bPtr = bVector; ++ ++- __m128 aVal, bVal, cVal; ++- for(;number < quarterPoints; number++){ +++ __m128 aVal, bVal, cVal; +++ for (; number < quarterPoints; number++) { ++ ++- aVal = _mm_loadu_ps(aPtr); ++- bVal = _mm_loadu_ps(bPtr); +++ aVal = _mm_loadu_ps(aPtr); +++ bVal = _mm_loadu_ps(bPtr); ++ ++- cVal = _mm_add_ps(aVal, bVal); +++ cVal = _mm_add_ps(aVal, bVal); ++ ++- _mm_storeu_ps(cPtr,cVal); // Store the results back into the C container +++ _mm_storeu_ps(cPtr, cVal); // Store the results back into the C container ++ ++- aPtr += 4; ++- bPtr += 4; ++- cPtr += 4; ++- } +++ aPtr += 4; +++ bPtr += 4; +++ cPtr += 4; +++ } ++ ++- number = quarterPoints * 4; ++- for(;number < num_points; number++){ ++- *cPtr++ = (*aPtr++) + (*bPtr++); ++- } +++ number = quarterPoints * 4; +++ for (; number < num_points; number++) { +++ *cPtr++ = (*aPtr++) + (*bPtr++); +++ } ++ } ++ #endif /* LV_HAVE_SSE */ ++ ++ ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void ++-volk_32f_x2_add_32f_generic(float* cVector, const float* aVector, ++- const float* bVector, unsigned int num_points) +++static inline void volk_32f_x2_add_32f_generic(float* cVector, +++ const float* aVector, +++ const float* bVector, +++ unsigned 
int num_points) ++ { ++- float* cPtr = cVector; ++- const float* aPtr = aVector; ++- const float* bPtr= bVector; ++- unsigned int number = 0; ++- ++- for(number = 0; number < num_points; number++){ ++- *cPtr++ = (*aPtr++) + (*bPtr++); ++- } +++ float* cPtr = cVector; +++ const float* aPtr = aVector; +++ const float* bPtr = bVector; +++ unsigned int number = 0; +++ +++ for (number = 0; number < num_points; number++) { +++ *cPtr++ = (*aPtr++) + (*bPtr++); +++ } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++@@ -216,37 +221,38 @@ volk_32f_x2_add_32f_generic(float* cVector, const float* aVector, ++ #ifdef LV_HAVE_AVX512F ++ #include ++ ++-static inline void ++-volk_32f_x2_add_32f_a_avx512f(float* cVector, const float* aVector, ++- const float* bVector, unsigned int num_points) +++static inline void volk_32f_x2_add_32f_a_avx512f(float* cVector, +++ const float* aVector, +++ const float* bVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int sixteenthPoints = num_points / 16; +++ unsigned int number = 0; +++ const unsigned int sixteenthPoints = num_points / 16; ++ ++- float* cPtr = cVector; ++- const float* aPtr = aVector; ++- const float* bPtr= bVector; +++ float* cPtr = cVector; +++ const float* aPtr = aVector; +++ const float* bPtr = bVector; ++ ++- __m512 aVal, bVal, cVal; ++- for(;number < sixteenthPoints; number++){ +++ __m512 aVal, bVal, cVal; +++ for (; number < sixteenthPoints; number++) { ++ ++- aVal = _mm512_load_ps(aPtr); ++- bVal = _mm512_load_ps(bPtr); +++ aVal = _mm512_load_ps(aPtr); +++ bVal = _mm512_load_ps(bPtr); ++ ++- cVal = _mm512_add_ps(aVal, bVal); +++ cVal = _mm512_add_ps(aVal, bVal); ++ ++- _mm512_store_ps(cPtr,cVal); // Store the results back into the C container +++ _mm512_store_ps(cPtr, cVal); // Store the results back into the C container ++ ++- aPtr += 16; ++- bPtr += 16; ++- cPtr += 16; ++- } +++ aPtr += 16; +++ bPtr += 16; +++ cPtr += 16; +++ } ++ ++- number = sixteenthPoints * 16; +++ number = sixteenthPoints * 16; ++ ++- for(;number < num_points; number++){ ++- *cPtr++ = (*aPtr++) + (*bPtr++); ++- } +++ for (; number < num_points; number++) { +++ *cPtr++ = (*aPtr++) + (*bPtr++); +++ } ++ } ++ ++ #endif /* LV_HAVE_AVX512F */ ++@@ -255,70 +261,73 @@ volk_32f_x2_add_32f_a_avx512f(float* cVector, const float* aVector, ++ #ifdef LV_HAVE_AVX ++ #include ++ ++-static inline void ++-volk_32f_x2_add_32f_a_avx(float* cVector, const float* aVector, ++- const float* bVector, unsigned int num_points) +++static inline void volk_32f_x2_add_32f_a_avx(float* cVector, +++ const float* aVector, +++ const float* bVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int eighthPoints = num_points / 8; +++ unsigned int number = 0; +++ const unsigned int eighthPoints = num_points / 8; ++ ++- float* cPtr = cVector; ++- const float* aPtr = aVector; ++- const float* bPtr= bVector; +++ float* cPtr = cVector; +++ const float* aPtr = aVector; +++ const float* bPtr = bVector; ++ ++- __m256 aVal, bVal, cVal; ++- for(;number < eighthPoints; number++){ +++ __m256 aVal, bVal, cVal; +++ for (; number < eighthPoints; number++) { ++ ++- aVal = _mm256_load_ps(aPtr); ++- bVal = _mm256_load_ps(bPtr); +++ aVal = _mm256_load_ps(aPtr); +++ bVal = _mm256_load_ps(bPtr); ++ ++- cVal = _mm256_add_ps(aVal, bVal); +++ cVal = _mm256_add_ps(aVal, bVal); ++ ++- _mm256_store_ps(cPtr,cVal); // Store the results back into the C container +++ _mm256_store_ps(cPtr, cVal); // Store the results back into the C container ++ ++- aPtr += 8; ++- bPtr += 
8; ++- cPtr += 8; ++- } +++ aPtr += 8; +++ bPtr += 8; +++ cPtr += 8; +++ } ++ ++- number = eighthPoints * 8; ++- for(;number < num_points; number++){ ++- *cPtr++ = (*aPtr++) + (*bPtr++); ++- } +++ number = eighthPoints * 8; +++ for (; number < num_points; number++) { +++ *cPtr++ = (*aPtr++) + (*bPtr++); +++ } ++ } ++ #endif /* LV_HAVE_AVX */ ++ ++ #ifdef LV_HAVE_SSE ++ #include ++ ++-static inline void ++-volk_32f_x2_add_32f_a_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points) +++static inline void volk_32f_x2_add_32f_a_sse(float* cVector, +++ const float* aVector, +++ const float* bVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int quarterPoints = num_points / 4; +++ unsigned int number = 0; +++ const unsigned int quarterPoints = num_points / 4; ++ ++- float* cPtr = cVector; ++- const float* aPtr = aVector; ++- const float* bPtr= bVector; +++ float* cPtr = cVector; +++ const float* aPtr = aVector; +++ const float* bPtr = bVector; ++ ++- __m128 aVal, bVal, cVal; ++- for(;number < quarterPoints; number++){ ++- aVal = _mm_load_ps(aPtr); ++- bVal = _mm_load_ps(bPtr); +++ __m128 aVal, bVal, cVal; +++ for (; number < quarterPoints; number++) { +++ aVal = _mm_load_ps(aPtr); +++ bVal = _mm_load_ps(bPtr); ++ ++- cVal = _mm_add_ps(aVal, bVal); +++ cVal = _mm_add_ps(aVal, bVal); ++ ++- _mm_store_ps(cPtr,cVal); // Store the results back into the C container +++ _mm_store_ps(cPtr, cVal); // Store the results back into the C container ++ ++- aPtr += 4; ++- bPtr += 4; ++- cPtr += 4; ++- } +++ aPtr += 4; +++ bPtr += 4; +++ cPtr += 4; +++ } ++ ++- number = quarterPoints * 4; ++- for(;number < num_points; number++){ ++- *cPtr++ = (*aPtr++) + (*bPtr++); ++- } +++ number = quarterPoints * 4; +++ for (; number < num_points; number++) { +++ *cPtr++ = (*aPtr++) + (*bPtr++); +++ } ++ } ++ #endif /* LV_HAVE_SSE */ ++ ++@@ -326,78 +335,89 @@ volk_32f_x2_add_32f_a_sse(float* cVector, const float* aVector, const float* bVe ++ #ifdef LV_HAVE_NEON ++ #include ++ ++-static inline void ++-volk_32f_x2_add_32f_u_neon(float* cVector, const float* aVector, ++- const float* bVector, unsigned int num_points) +++static inline void volk_32f_x2_add_32f_u_neon(float* cVector, +++ const float* aVector, +++ const float* bVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int quarterPoints = num_points / 4; ++- ++- float* cPtr = cVector; ++- const float* aPtr = aVector; ++- const float* bPtr= bVector; ++- float32x4_t aVal, bVal, cVal; ++- for(number=0; number < quarterPoints; number++){ ++- // Load in to NEON registers ++- aVal = vld1q_f32(aPtr); ++- bVal = vld1q_f32(bPtr); ++- __VOLK_PREFETCH(aPtr+4); ++- __VOLK_PREFETCH(bPtr+4); ++- ++- // vector add ++- cVal = vaddq_f32(aVal, bVal); ++- // Store the results back into the C container ++- vst1q_f32(cPtr,cVal); ++- ++- aPtr += 4; // q uses quadwords, 4 floats per vadd ++- bPtr += 4; ++- cPtr += 4; ++- } ++- ++- number = quarterPoints * 4; // should be = num_points ++- for(;number < num_points; number++){ ++- *cPtr++ = (*aPtr++) + (*bPtr++); ++- } +++ unsigned int number = 0; +++ const unsigned int quarterPoints = num_points / 4; +++ +++ float* cPtr = cVector; +++ const float* aPtr = aVector; +++ const float* bPtr = bVector; +++ float32x4_t aVal, bVal, cVal; +++ for (number = 0; number < quarterPoints; number++) { +++ // Load in to NEON registers +++ aVal = vld1q_f32(aPtr); +++ bVal = vld1q_f32(bPtr); +++ __VOLK_PREFETCH(aPtr + 4); +++ __VOLK_PREFETCH(bPtr + 4); +++ 
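/* __VOLK_PREFETCH normally expands to __builtin_prefetch on GCC/Clang (and to
 * nothing otherwise), hinting the next four floats of each input into cache
 * while the current quadword is being added. */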
+++ // vector add +++ cVal = vaddq_f32(aVal, bVal); +++ // Store the results back into the C container +++ vst1q_f32(cPtr, cVal); +++ +++ aPtr += 4; // q uses quadwords, 4 floats per vadd +++ bPtr += 4; +++ cPtr += 4; +++ } +++ +++ number = quarterPoints * 4; // should be = num_points +++ for (; number < num_points; number++) { +++ *cPtr++ = (*aPtr++) + (*bPtr++); +++ } ++ } ++ ++ #endif /* LV_HAVE_NEON */ ++ ++ #ifdef LV_HAVE_NEONV7 ++-extern void volk_32f_x2_add_32f_a_neonasm(float* cVector, const float* aVector, const float* bVector, unsigned int num_points); +++extern void volk_32f_x2_add_32f_a_neonasm(float* cVector, +++ const float* aVector, +++ const float* bVector, +++ unsigned int num_points); ++ #endif /* LV_HAVE_NEONV7 */ ++ ++ #ifdef LV_HAVE_NEONV7 ++-extern void volk_32f_x2_add_32f_a_neonpipeline(float* cVector, const float* aVector, const float* bVector, unsigned int num_points); +++extern void volk_32f_x2_add_32f_a_neonpipeline(float* cVector, +++ const float* aVector, +++ const float* bVector, +++ unsigned int num_points); ++ #endif /* LV_HAVE_NEONV7 */ ++ ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void ++-volk_32f_x2_add_32f_a_generic(float* cVector, const float* aVector, ++- const float* bVector, unsigned int num_points) +++static inline void volk_32f_x2_add_32f_a_generic(float* cVector, +++ const float* aVector, +++ const float* bVector, +++ unsigned int num_points) ++ { ++- float* cPtr = cVector; ++- const float* aPtr = aVector; ++- const float* bPtr= bVector; ++- unsigned int number = 0; ++- ++- for(number = 0; number < num_points; number++){ ++- *cPtr++ = (*aPtr++) + (*bPtr++); ++- } +++ float* cPtr = cVector; +++ const float* aPtr = aVector; +++ const float* bPtr = bVector; +++ unsigned int number = 0; +++ +++ for (number = 0; number < num_points; number++) { +++ *cPtr++ = (*aPtr++) + (*bPtr++); +++ } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++ ++ #ifdef LV_HAVE_ORC ++ ++-extern void ++-volk_32f_x2_add_32f_a_orc_impl(float* cVector, const float* aVector, ++- const float* bVector, unsigned int num_points); +++extern void volk_32f_x2_add_32f_a_orc_impl(float* cVector, +++ const float* aVector, +++ const float* bVector, +++ unsigned int num_points); ++ ++-static inline void ++-volk_32f_x2_add_32f_u_orc(float* cVector, const float* aVector, ++- const float* bVector, unsigned int num_points){ ++- volk_32f_x2_add_32f_a_orc_impl(cVector, aVector, bVector, num_points); +++static inline void volk_32f_x2_add_32f_u_orc(float* cVector, +++ const float* aVector, +++ const float* bVector, +++ unsigned int num_points) +++{ +++ volk_32f_x2_add_32f_a_orc_impl(cVector, aVector, bVector, num_points); ++ } ++ ++ #endif /* LV_HAVE_ORC */ ++diff --git a/kernels/volk/volk_32f_x2_divide_32f.h b/kernels/volk/volk_32f_x2_divide_32f.h ++index 130767f..8b80365 100644 ++--- a/kernels/volk/volk_32f_x2_divide_32f.h +++++ b/kernels/volk/volk_32f_x2_divide_32f.h ++@@ -31,8 +31,8 @@ ++ * ++ * Dispatcher Prototype ++ * \code ++- * void volk_32f_x2_divide_32f(float* cVector, const float* aVector, const float* bVector, unsigned int num_points) ++- * \endcode +++ * void volk_32f_x2_divide_32f(float* cVector, const float* aVector, const float* bVector, +++ * unsigned int num_points) \endcode ++ * ++ * \b Inputs ++ * \li aVector: First vector of input points. 
++@@ -77,35 +77,36 @@ ++ #ifdef LV_HAVE_AVX512F ++ #include ++ ++-static inline void ++-volk_32f_x2_divide_32f_a_avx512f(float* cVector, const float* aVector, ++- const float* bVector, unsigned int num_points) +++static inline void volk_32f_x2_divide_32f_a_avx512f(float* cVector, +++ const float* aVector, +++ const float* bVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int sixteenthPoints = num_points / 16; +++ unsigned int number = 0; +++ const unsigned int sixteenthPoints = num_points / 16; ++ ++- float* cPtr = cVector; ++- const float* aPtr = aVector; ++- const float* bPtr= bVector; +++ float* cPtr = cVector; +++ const float* aPtr = aVector; +++ const float* bPtr = bVector; ++ ++- __m512 aVal, bVal, cVal; ++- for(;number < sixteenthPoints; number++){ ++- aVal = _mm512_load_ps(aPtr); ++- bVal = _mm512_load_ps(bPtr); +++ __m512 aVal, bVal, cVal; +++ for (; number < sixteenthPoints; number++) { +++ aVal = _mm512_load_ps(aPtr); +++ bVal = _mm512_load_ps(bPtr); ++ ++- cVal = _mm512_div_ps(aVal, bVal); +++ cVal = _mm512_div_ps(aVal, bVal); ++ ++- _mm512_store_ps(cPtr,cVal); // Store the results back into the C container +++ _mm512_store_ps(cPtr, cVal); // Store the results back into the C container ++ ++- aPtr += 16; ++- bPtr += 16; ++- cPtr += 16; ++- } +++ aPtr += 16; +++ bPtr += 16; +++ cPtr += 16; +++ } ++ ++- number = sixteenthPoints * 16; ++- for(;number < num_points; number++){ ++- *cPtr++ = (*aPtr++) / (*bPtr++); ++- } +++ number = sixteenthPoints * 16; +++ for (; number < num_points; number++) { +++ *cPtr++ = (*aPtr++) / (*bPtr++); +++ } ++ } ++ #endif /* LV_HAVE_AVX512F */ ++ ++@@ -113,35 +114,36 @@ volk_32f_x2_divide_32f_a_avx512f(float* cVector, const float* aVector, ++ #ifdef LV_HAVE_AVX ++ #include ++ ++-static inline void ++-volk_32f_x2_divide_32f_a_avx(float* cVector, const float* aVector, ++- const float* bVector, unsigned int num_points) +++static inline void volk_32f_x2_divide_32f_a_avx(float* cVector, +++ const float* aVector, +++ const float* bVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int eighthPoints = num_points / 8; +++ unsigned int number = 0; +++ const unsigned int eighthPoints = num_points / 8; ++ ++- float* cPtr = cVector; ++- const float* aPtr = aVector; ++- const float* bPtr= bVector; +++ float* cPtr = cVector; +++ const float* aPtr = aVector; +++ const float* bPtr = bVector; ++ ++- __m256 aVal, bVal, cVal; ++- for(;number < eighthPoints; number++){ ++- aVal = _mm256_load_ps(aPtr); ++- bVal = _mm256_load_ps(bPtr); +++ __m256 aVal, bVal, cVal; +++ for (; number < eighthPoints; number++) { +++ aVal = _mm256_load_ps(aPtr); +++ bVal = _mm256_load_ps(bPtr); ++ ++- cVal = _mm256_div_ps(aVal, bVal); +++ cVal = _mm256_div_ps(aVal, bVal); ++ ++- _mm256_store_ps(cPtr,cVal); // Store the results back into the C container +++ _mm256_store_ps(cPtr, cVal); // Store the results back into the C container ++ ++- aPtr += 8; ++- bPtr += 8; ++- cPtr += 8; ++- } +++ aPtr += 8; +++ bPtr += 8; +++ cPtr += 8; +++ } ++ ++- number = eighthPoints * 8; ++- for(;number < num_points; number++){ ++- *cPtr++ = (*aPtr++) / (*bPtr++); ++- } +++ number = eighthPoints * 8; +++ for (; number < num_points; number++) { +++ *cPtr++ = (*aPtr++) / (*bPtr++); +++ } ++ } ++ #endif /* LV_HAVE_AVX */ ++ ++@@ -149,35 +151,36 @@ volk_32f_x2_divide_32f_a_avx(float* cVector, const float* aVector, ++ #ifdef LV_HAVE_SSE ++ #include ++ ++-static inline void ++-volk_32f_x2_divide_32f_a_sse(float* cVector, const float* aVector, 
++- const float* bVector, unsigned int num_points) +++static inline void volk_32f_x2_divide_32f_a_sse(float* cVector, +++ const float* aVector, +++ const float* bVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int quarterPoints = num_points / 4; +++ unsigned int number = 0; +++ const unsigned int quarterPoints = num_points / 4; ++ ++- float* cPtr = cVector; ++- const float* aPtr = aVector; ++- const float* bPtr= bVector; +++ float* cPtr = cVector; +++ const float* aPtr = aVector; +++ const float* bPtr = bVector; ++ ++- __m128 aVal, bVal, cVal; ++- for(;number < quarterPoints; number++){ ++- aVal = _mm_load_ps(aPtr); ++- bVal = _mm_load_ps(bPtr); +++ __m128 aVal, bVal, cVal; +++ for (; number < quarterPoints; number++) { +++ aVal = _mm_load_ps(aPtr); +++ bVal = _mm_load_ps(bPtr); ++ ++- cVal = _mm_div_ps(aVal, bVal); +++ cVal = _mm_div_ps(aVal, bVal); ++ ++- _mm_store_ps(cPtr,cVal); // Store the results back into the C container +++ _mm_store_ps(cPtr, cVal); // Store the results back into the C container ++ ++- aPtr += 4; ++- bPtr += 4; ++- cPtr += 4; ++- } +++ aPtr += 4; +++ bPtr += 4; +++ cPtr += 4; +++ } ++ ++- number = quarterPoints * 4; ++- for(;number < num_points; number++){ ++- *cPtr++ = (*aPtr++) / (*bPtr++); ++- } +++ number = quarterPoints * 4; +++ for (; number < num_points; number++) { +++ *cPtr++ = (*aPtr++) / (*bPtr++); +++ } ++ } ++ #endif /* LV_HAVE_SSE */ ++ ++@@ -185,54 +188,55 @@ volk_32f_x2_divide_32f_a_sse(float* cVector, const float* aVector, ++ #ifdef LV_HAVE_NEON ++ #include ++ ++-static inline void ++-volk_32f_x2_divide_32f_neon(float* cVector, const float* aVector, ++- const float* bVector, unsigned int num_points) +++static inline void volk_32f_x2_divide_32f_neon(float* cVector, +++ const float* aVector, +++ const float* bVector, +++ unsigned int num_points) ++ { ++- float* cPtr = cVector; ++- const float* aPtr = aVector; ++- const float* bPtr = bVector; ++- ++- float32x4x4_t aVal, bVal, bInv, cVal; ++- ++- const unsigned int eighthPoints = num_points / 16; ++- unsigned int number = 0; ++- for(; number < eighthPoints; number++){ ++- aVal = vld4q_f32(aPtr); ++- aPtr += 16; ++- bVal = vld4q_f32(bPtr); ++- bPtr += 16; ++- ++- __VOLK_PREFETCH(aPtr+16); ++- __VOLK_PREFETCH(bPtr+16); ++- ++- bInv.val[0] = vrecpeq_f32(bVal.val[0]); ++- bInv.val[0] = vmulq_f32(bInv.val[0], vrecpsq_f32(bInv.val[0], bVal.val[0])); ++- bInv.val[0] = vmulq_f32(bInv.val[0], vrecpsq_f32(bInv.val[0], bVal.val[0])); ++- cVal.val[0] = vmulq_f32(aVal.val[0], bInv.val[0]); ++- ++- bInv.val[1] = vrecpeq_f32(bVal.val[1]); ++- bInv.val[1] = vmulq_f32(bInv.val[1], vrecpsq_f32(bInv.val[1], bVal.val[1])); ++- bInv.val[1] = vmulq_f32(bInv.val[1], vrecpsq_f32(bInv.val[1], bVal.val[1])); ++- cVal.val[1] = vmulq_f32(aVal.val[1], bInv.val[1]); ++- ++- bInv.val[2] = vrecpeq_f32(bVal.val[2]); ++- bInv.val[2] = vmulq_f32(bInv.val[2], vrecpsq_f32(bInv.val[2], bVal.val[2])); ++- bInv.val[2] = vmulq_f32(bInv.val[2], vrecpsq_f32(bInv.val[2], bVal.val[2])); ++- cVal.val[2] = vmulq_f32(aVal.val[2], bInv.val[2]); ++- ++- bInv.val[3] = vrecpeq_f32(bVal.val[3]); ++- bInv.val[3] = vmulq_f32(bInv.val[3], vrecpsq_f32(bInv.val[3], bVal.val[3])); ++- bInv.val[3] = vmulq_f32(bInv.val[3], vrecpsq_f32(bInv.val[3], bVal.val[3])); ++- cVal.val[3] = vmulq_f32(aVal.val[3], bInv.val[3]); ++- ++- vst4q_f32(cPtr, cVal); ++- cPtr += 16; ++- } ++- ++- for(number = eighthPoints * 16; number < num_points; number++){ ++- *cPtr++ = (*aPtr++) / (*bPtr++); ++- } +++ float* cPtr = cVector; +++ const 
float* aPtr = aVector; +++ const float* bPtr = bVector; +++ +++ float32x4x4_t aVal, bVal, bInv, cVal; +++ +++ const unsigned int eighthPoints = num_points / 16; +++ unsigned int number = 0; +++ for (; number < eighthPoints; number++) { +++ aVal = vld4q_f32(aPtr); +++ aPtr += 16; +++ bVal = vld4q_f32(bPtr); +++ bPtr += 16; +++ +++ __VOLK_PREFETCH(aPtr + 16); +++ __VOLK_PREFETCH(bPtr + 16); +++ +++ bInv.val[0] = vrecpeq_f32(bVal.val[0]); +++ bInv.val[0] = vmulq_f32(bInv.val[0], vrecpsq_f32(bInv.val[0], bVal.val[0])); +++ bInv.val[0] = vmulq_f32(bInv.val[0], vrecpsq_f32(bInv.val[0], bVal.val[0])); +++ cVal.val[0] = vmulq_f32(aVal.val[0], bInv.val[0]); +++ +++ bInv.val[1] = vrecpeq_f32(bVal.val[1]); +++ bInv.val[1] = vmulq_f32(bInv.val[1], vrecpsq_f32(bInv.val[1], bVal.val[1])); +++ bInv.val[1] = vmulq_f32(bInv.val[1], vrecpsq_f32(bInv.val[1], bVal.val[1])); +++ cVal.val[1] = vmulq_f32(aVal.val[1], bInv.val[1]); +++ +++ bInv.val[2] = vrecpeq_f32(bVal.val[2]); +++ bInv.val[2] = vmulq_f32(bInv.val[2], vrecpsq_f32(bInv.val[2], bVal.val[2])); +++ bInv.val[2] = vmulq_f32(bInv.val[2], vrecpsq_f32(bInv.val[2], bVal.val[2])); +++ cVal.val[2] = vmulq_f32(aVal.val[2], bInv.val[2]); +++ +++ bInv.val[3] = vrecpeq_f32(bVal.val[3]); +++ bInv.val[3] = vmulq_f32(bInv.val[3], vrecpsq_f32(bInv.val[3], bVal.val[3])); +++ bInv.val[3] = vmulq_f32(bInv.val[3], vrecpsq_f32(bInv.val[3], bVal.val[3])); +++ cVal.val[3] = vmulq_f32(aVal.val[3], bInv.val[3]); +++ +++ vst4q_f32(cPtr, cVal); +++ cPtr += 16; +++ } +++ +++ for (number = eighthPoints * 16; number < num_points; number++) { +++ *cPtr++ = (*aPtr++) / (*bPtr++); +++ } ++ } ++ ++ #endif /* LV_HAVE_NEON */ ++@@ -240,38 +244,40 @@ volk_32f_x2_divide_32f_neon(float* cVector, const float* aVector, ++ ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void ++-volk_32f_x2_divide_32f_generic(float* cVector, const float* aVector, ++- const float* bVector, unsigned int num_points) +++static inline void volk_32f_x2_divide_32f_generic(float* cVector, +++ const float* aVector, +++ const float* bVector, +++ unsigned int num_points) ++ { ++- float* cPtr = cVector; ++- const float* aPtr = aVector; ++- const float* bPtr= bVector; ++- unsigned int number = 0; ++- ++- for(number = 0; number < num_points; number++){ ++- *cPtr++ = (*aPtr++) / (*bPtr++); ++- } +++ float* cPtr = cVector; +++ const float* aPtr = aVector; +++ const float* bPtr = bVector; +++ unsigned int number = 0; +++ +++ for (number = 0; number < num_points; number++) { +++ *cPtr++ = (*aPtr++) / (*bPtr++); +++ } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++ ++ #ifdef LV_HAVE_ORC ++ ++-extern void ++-volk_32f_x2_divide_32f_a_orc_impl(float* cVector, const float* aVector, ++- const float* bVector, unsigned int num_points); +++extern void volk_32f_x2_divide_32f_a_orc_impl(float* cVector, +++ const float* aVector, +++ const float* bVector, +++ unsigned int num_points); ++ ++-static inline void ++-volk_32f_x2_divide_32f_u_orc(float* cVector, const float* aVector, ++- const float* bVector, unsigned int num_points) +++static inline void volk_32f_x2_divide_32f_u_orc(float* cVector, +++ const float* aVector, +++ const float* bVector, +++ unsigned int num_points) ++ { ++- volk_32f_x2_divide_32f_a_orc_impl(cVector, aVector, bVector, num_points); +++ volk_32f_x2_divide_32f_a_orc_impl(cVector, aVector, bVector, num_points); ++ } ++ #endif /* LV_HAVE_ORC */ ++ ++ ++- ++ #endif /* INCLUDED_volk_32f_x2_divide_32f_a_H */ ++ ++ ++@@ -284,35 +290,36 @@ volk_32f_x2_divide_32f_u_orc(float* cVector, const float* aVector, ++ #ifdef LV_HAVE_AVX512F 
++ #include ++ ++-static inline void ++-volk_32f_x2_divide_32f_u_avx512f(float* cVector, const float* aVector, ++- const float* bVector, unsigned int num_points) +++static inline void volk_32f_x2_divide_32f_u_avx512f(float* cVector, +++ const float* aVector, +++ const float* bVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int sixteenthPoints = num_points / 16; +++ unsigned int number = 0; +++ const unsigned int sixteenthPoints = num_points / 16; ++ ++- float* cPtr = cVector; ++- const float* aPtr = aVector; ++- const float* bPtr= bVector; +++ float* cPtr = cVector; +++ const float* aPtr = aVector; +++ const float* bPtr = bVector; ++ ++- __m512 aVal, bVal, cVal; ++- for(;number < sixteenthPoints; number++){ ++- aVal = _mm512_loadu_ps(aPtr); ++- bVal = _mm512_loadu_ps(bPtr); +++ __m512 aVal, bVal, cVal; +++ for (; number < sixteenthPoints; number++) { +++ aVal = _mm512_loadu_ps(aPtr); +++ bVal = _mm512_loadu_ps(bPtr); ++ ++- cVal = _mm512_div_ps(aVal, bVal); +++ cVal = _mm512_div_ps(aVal, bVal); ++ ++- _mm512_storeu_ps(cPtr,cVal); // Store the results back into the C container +++ _mm512_storeu_ps(cPtr, cVal); // Store the results back into the C container ++ ++- aPtr += 16; ++- bPtr += 16; ++- cPtr += 16; ++- } +++ aPtr += 16; +++ bPtr += 16; +++ cPtr += 16; +++ } ++ ++- number = sixteenthPoints * 16; ++- for(;number < num_points; number++){ ++- *cPtr++ = (*aPtr++) / (*bPtr++); ++- } +++ number = sixteenthPoints * 16; +++ for (; number < num_points; number++) { +++ *cPtr++ = (*aPtr++) / (*bPtr++); +++ } ++ } ++ #endif /* LV_HAVE_AVX512F */ ++ ++@@ -320,35 +327,36 @@ volk_32f_x2_divide_32f_u_avx512f(float* cVector, const float* aVector, ++ #ifdef LV_HAVE_AVX ++ #include ++ ++-static inline void ++-volk_32f_x2_divide_32f_u_avx(float* cVector, const float* aVector, ++- const float* bVector, unsigned int num_points) +++static inline void volk_32f_x2_divide_32f_u_avx(float* cVector, +++ const float* aVector, +++ const float* bVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int eighthPoints = num_points / 8; +++ unsigned int number = 0; +++ const unsigned int eighthPoints = num_points / 8; ++ ++- float* cPtr = cVector; ++- const float* aPtr = aVector; ++- const float* bPtr= bVector; +++ float* cPtr = cVector; +++ const float* aPtr = aVector; +++ const float* bPtr = bVector; ++ ++- __m256 aVal, bVal, cVal; ++- for(;number < eighthPoints; number++){ ++- aVal = _mm256_loadu_ps(aPtr); ++- bVal = _mm256_loadu_ps(bPtr); +++ __m256 aVal, bVal, cVal; +++ for (; number < eighthPoints; number++) { +++ aVal = _mm256_loadu_ps(aPtr); +++ bVal = _mm256_loadu_ps(bPtr); ++ ++- cVal = _mm256_div_ps(aVal, bVal); +++ cVal = _mm256_div_ps(aVal, bVal); ++ ++- _mm256_storeu_ps(cPtr,cVal); // Store the results back into the C container +++ _mm256_storeu_ps(cPtr, cVal); // Store the results back into the C container ++ ++- aPtr += 8; ++- bPtr += 8; ++- cPtr += 8; ++- } +++ aPtr += 8; +++ bPtr += 8; +++ cPtr += 8; +++ } ++ ++- number = eighthPoints * 8; ++- for(;number < num_points; number++){ ++- *cPtr++ = (*aPtr++) / (*bPtr++); ++- } +++ number = eighthPoints * 8; +++ for (; number < num_points; number++) { +++ *cPtr++ = (*aPtr++) / (*bPtr++); +++ } ++ } ++ #endif /* LV_HAVE_AVX */ ++ ++diff --git a/kernels/volk/volk_32f_x2_dot_prod_16i.h b/kernels/volk/volk_32f_x2_dot_prod_16i.h ++index c1b5a82..4da7db6 100644 ++--- a/kernels/volk/volk_32f_x2_dot_prod_16i.h +++++ b/kernels/volk/volk_32f_x2_dot_prod_16i.h ++@@ -33,8 +33,8 @@ ++ * ++ * 
Dispatcher Prototype ++ * \code ++- * void volk_32f_x2_dot_prod_16i(int16_t* result, const float* input, const float* taps, unsigned int num_points) ++- * \endcode +++ * void volk_32f_x2_dot_prod_16i(int16_t* result, const float* input, const float* taps, +++ * unsigned int num_points) \endcode ++ * ++ * \b Inputs ++ * \li input: vector of floats. ++@@ -58,25 +58,29 @@ ++ #ifndef INCLUDED_volk_32f_x2_dot_prod_16i_H ++ #define INCLUDED_volk_32f_x2_dot_prod_16i_H ++ ++-#include ++ #include +++#include ++ ++ ++ #ifdef LV_HAVE_GENERIC ++ ++ ++-static inline void volk_32f_x2_dot_prod_16i_generic(int16_t* result, const float* input, const float* taps, unsigned int num_points) { +++static inline void volk_32f_x2_dot_prod_16i_generic(int16_t* result, +++ const float* input, +++ const float* taps, +++ unsigned int num_points) +++{ ++ ++- float dotProduct = 0; ++- const float* aPtr = input; ++- const float* bPtr= taps; ++- unsigned int number = 0; +++ float dotProduct = 0; +++ const float* aPtr = input; +++ const float* bPtr = taps; +++ unsigned int number = 0; ++ ++- for(number = 0; number < num_points; number++){ ++- dotProduct += ((*aPtr++) * (*bPtr++)); ++- } +++ for (number = 0; number < num_points; number++) { +++ dotProduct += ((*aPtr++) * (*bPtr++)); +++ } ++ ++- *result = (int16_t)dotProduct; +++ *result = (int16_t)dotProduct; ++ } ++ ++ #endif /*LV_HAVE_GENERIC*/ ++@@ -84,68 +88,73 @@ static inline void volk_32f_x2_dot_prod_16i_generic(int16_t* result, const float ++ ++ #ifdef LV_HAVE_SSE ++ ++-static inline void volk_32f_x2_dot_prod_16i_a_sse(int16_t* result, const float* input, const float* taps, unsigned int num_points) { ++- ++- unsigned int number = 0; ++- const unsigned int sixteenthPoints = num_points / 16; ++- ++- float dotProduct = 0; ++- const float* aPtr = input; ++- const float* bPtr = taps; ++- ++- __m128 a0Val, a1Val, a2Val, a3Val; ++- __m128 b0Val, b1Val, b2Val, b3Val; ++- __m128 c0Val, c1Val, c2Val, c3Val; ++- ++- __m128 dotProdVal0 = _mm_setzero_ps(); ++- __m128 dotProdVal1 = _mm_setzero_ps(); ++- __m128 dotProdVal2 = _mm_setzero_ps(); ++- __m128 dotProdVal3 = _mm_setzero_ps(); ++- ++- for(;number < sixteenthPoints; number++){ ++- ++- a0Val = _mm_load_ps(aPtr); ++- a1Val = _mm_load_ps(aPtr+4); ++- a2Val = _mm_load_ps(aPtr+8); ++- a3Val = _mm_load_ps(aPtr+12); ++- b0Val = _mm_load_ps(bPtr); ++- b1Val = _mm_load_ps(bPtr+4); ++- b2Val = _mm_load_ps(bPtr+8); ++- b3Val = _mm_load_ps(bPtr+12); ++- ++- c0Val = _mm_mul_ps(a0Val, b0Val); ++- c1Val = _mm_mul_ps(a1Val, b1Val); ++- c2Val = _mm_mul_ps(a2Val, b2Val); ++- c3Val = _mm_mul_ps(a3Val, b3Val); ++- ++- dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0); ++- dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1); ++- dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2); ++- dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3); ++- ++- aPtr += 16; ++- bPtr += 16; ++- } ++- ++- dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1); ++- dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2); ++- dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3); ++- ++- __VOLK_ATTR_ALIGNED(16) float dotProductVector[4]; ++- ++- _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector ++- ++- dotProduct = dotProductVector[0]; ++- dotProduct += dotProductVector[1]; ++- dotProduct += dotProductVector[2]; ++- dotProduct += dotProductVector[3]; ++- ++- number = sixteenthPoints*16; ++- for(;number < num_points; number++){ ++- dotProduct += ((*aPtr++) * (*bPtr++)); ++- } ++- ++- *result = (short)dotProduct; +++static inline void 
volk_32f_x2_dot_prod_16i_a_sse(int16_t* result, +++ const float* input, +++ const float* taps, +++ unsigned int num_points) +++{ +++ +++ unsigned int number = 0; +++ const unsigned int sixteenthPoints = num_points / 16; +++ +++ float dotProduct = 0; +++ const float* aPtr = input; +++ const float* bPtr = taps; +++ +++ __m128 a0Val, a1Val, a2Val, a3Val; +++ __m128 b0Val, b1Val, b2Val, b3Val; +++ __m128 c0Val, c1Val, c2Val, c3Val; +++ +++ __m128 dotProdVal0 = _mm_setzero_ps(); +++ __m128 dotProdVal1 = _mm_setzero_ps(); +++ __m128 dotProdVal2 = _mm_setzero_ps(); +++ __m128 dotProdVal3 = _mm_setzero_ps(); +++ +++ for (; number < sixteenthPoints; number++) { +++ +++ a0Val = _mm_load_ps(aPtr); +++ a1Val = _mm_load_ps(aPtr + 4); +++ a2Val = _mm_load_ps(aPtr + 8); +++ a3Val = _mm_load_ps(aPtr + 12); +++ b0Val = _mm_load_ps(bPtr); +++ b1Val = _mm_load_ps(bPtr + 4); +++ b2Val = _mm_load_ps(bPtr + 8); +++ b3Val = _mm_load_ps(bPtr + 12); +++ +++ c0Val = _mm_mul_ps(a0Val, b0Val); +++ c1Val = _mm_mul_ps(a1Val, b1Val); +++ c2Val = _mm_mul_ps(a2Val, b2Val); +++ c3Val = _mm_mul_ps(a3Val, b3Val); +++ +++ dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0); +++ dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1); +++ dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2); +++ dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3); +++ +++ aPtr += 16; +++ bPtr += 16; +++ } +++ +++ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1); +++ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2); +++ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3); +++ +++ __VOLK_ATTR_ALIGNED(16) float dotProductVector[4]; +++ +++ _mm_store_ps(dotProductVector, +++ dotProdVal0); // Store the results back into the dot product vector +++ +++ dotProduct = dotProductVector[0]; +++ dotProduct += dotProductVector[1]; +++ dotProduct += dotProductVector[2]; +++ dotProduct += dotProductVector[3]; +++ +++ number = sixteenthPoints * 16; +++ for (; number < num_points; number++) { +++ dotProduct += ((*aPtr++) * (*bPtr++)); +++ } +++ +++ *result = (short)dotProduct; ++ } ++ ++ #endif /*LV_HAVE_SSE*/ ++@@ -153,66 +162,71 @@ static inline void volk_32f_x2_dot_prod_16i_a_sse(int16_t* result, const float* ++ ++ #if LV_HAVE_AVX2 && LV_HAVE_FMA ++ ++-static inline void volk_32f_x2_dot_prod_16i_a_avx2_fma(int16_t* result, const float* input, const float* taps, unsigned int num_points) { ++- ++- unsigned int number = 0; ++- const unsigned int thirtysecondPoints = num_points / 32; ++- ++- float dotProduct = 0; ++- const float* aPtr = input; ++- const float* bPtr = taps; ++- ++- __m256 a0Val, a1Val, a2Val, a3Val; ++- __m256 b0Val, b1Val, b2Val, b3Val; ++- ++- __m256 dotProdVal0 = _mm256_setzero_ps(); ++- __m256 dotProdVal1 = _mm256_setzero_ps(); ++- __m256 dotProdVal2 = _mm256_setzero_ps(); ++- __m256 dotProdVal3 = _mm256_setzero_ps(); ++- ++- for(;number < thirtysecondPoints; number++){ ++- ++- a0Val = _mm256_load_ps(aPtr); ++- a1Val = _mm256_load_ps(aPtr+8); ++- a2Val = _mm256_load_ps(aPtr+16); ++- a3Val = _mm256_load_ps(aPtr+24); ++- b0Val = _mm256_load_ps(bPtr); ++- b1Val = _mm256_load_ps(bPtr+8); ++- b2Val = _mm256_load_ps(bPtr+16); ++- b3Val = _mm256_load_ps(bPtr+24); ++- ++- dotProdVal0 = _mm256_fmadd_ps(a0Val, b0Val, dotProdVal0); ++- dotProdVal1 = _mm256_fmadd_ps(a1Val, b1Val, dotProdVal1); ++- dotProdVal2 = _mm256_fmadd_ps(a2Val, b2Val, dotProdVal2); ++- dotProdVal3 = _mm256_fmadd_ps(a3Val, b3Val, dotProdVal3); ++- ++- aPtr += 32; ++- bPtr += 32; ++- } ++- ++- dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1); ++- dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2); ++- 
dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3); ++- ++- __VOLK_ATTR_ALIGNED(32) float dotProductVector[8]; ++- ++- _mm256_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector ++- ++- dotProduct = dotProductVector[0]; ++- dotProduct += dotProductVector[1]; ++- dotProduct += dotProductVector[2]; ++- dotProduct += dotProductVector[3]; ++- dotProduct += dotProductVector[4]; ++- dotProduct += dotProductVector[5]; ++- dotProduct += dotProductVector[6]; ++- dotProduct += dotProductVector[7]; ++- ++- number = thirtysecondPoints*32; ++- for(;number < num_points; number++){ ++- dotProduct += ((*aPtr++) * (*bPtr++)); ++- } ++- ++- *result = (short)dotProduct; +++static inline void volk_32f_x2_dot_prod_16i_a_avx2_fma(int16_t* result, +++ const float* input, +++ const float* taps, +++ unsigned int num_points) +++{ +++ +++ unsigned int number = 0; +++ const unsigned int thirtysecondPoints = num_points / 32; +++ +++ float dotProduct = 0; +++ const float* aPtr = input; +++ const float* bPtr = taps; +++ +++ __m256 a0Val, a1Val, a2Val, a3Val; +++ __m256 b0Val, b1Val, b2Val, b3Val; +++ +++ __m256 dotProdVal0 = _mm256_setzero_ps(); +++ __m256 dotProdVal1 = _mm256_setzero_ps(); +++ __m256 dotProdVal2 = _mm256_setzero_ps(); +++ __m256 dotProdVal3 = _mm256_setzero_ps(); +++ +++ for (; number < thirtysecondPoints; number++) { +++ +++ a0Val = _mm256_load_ps(aPtr); +++ a1Val = _mm256_load_ps(aPtr + 8); +++ a2Val = _mm256_load_ps(aPtr + 16); +++ a3Val = _mm256_load_ps(aPtr + 24); +++ b0Val = _mm256_load_ps(bPtr); +++ b1Val = _mm256_load_ps(bPtr + 8); +++ b2Val = _mm256_load_ps(bPtr + 16); +++ b3Val = _mm256_load_ps(bPtr + 24); +++ +++ dotProdVal0 = _mm256_fmadd_ps(a0Val, b0Val, dotProdVal0); +++ dotProdVal1 = _mm256_fmadd_ps(a1Val, b1Val, dotProdVal1); +++ dotProdVal2 = _mm256_fmadd_ps(a2Val, b2Val, dotProdVal2); +++ dotProdVal3 = _mm256_fmadd_ps(a3Val, b3Val, dotProdVal3); +++ +++ aPtr += 32; +++ bPtr += 32; +++ } +++ +++ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1); +++ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2); +++ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3); +++ +++ __VOLK_ATTR_ALIGNED(32) float dotProductVector[8]; +++ +++ _mm256_store_ps(dotProductVector, +++ dotProdVal0); // Store the results back into the dot product vector +++ +++ dotProduct = dotProductVector[0]; +++ dotProduct += dotProductVector[1]; +++ dotProduct += dotProductVector[2]; +++ dotProduct += dotProductVector[3]; +++ dotProduct += dotProductVector[4]; +++ dotProduct += dotProductVector[5]; +++ dotProduct += dotProductVector[6]; +++ dotProduct += dotProductVector[7]; +++ +++ number = thirtysecondPoints * 32; +++ for (; number < num_points; number++) { +++ dotProduct += ((*aPtr++) * (*bPtr++)); +++ } +++ +++ *result = (short)dotProduct; ++ } ++ ++ #endif /*LV_HAVE_AVX2 && LV_HAVE_FMA*/ ++@@ -220,146 +234,156 @@ static inline void volk_32f_x2_dot_prod_16i_a_avx2_fma(int16_t* result, const f ++ ++ #ifdef LV_HAVE_AVX ++ ++-static inline void volk_32f_x2_dot_prod_16i_a_avx(int16_t* result, const float* input, const float* taps, unsigned int num_points) { ++- ++- unsigned int number = 0; ++- const unsigned int thirtysecondPoints = num_points / 32; ++- ++- float dotProduct = 0; ++- const float* aPtr = input; ++- const float* bPtr = taps; ++- ++- __m256 a0Val, a1Val, a2Val, a3Val; ++- __m256 b0Val, b1Val, b2Val, b3Val; ++- __m256 c0Val, c1Val, c2Val, c3Val; ++- ++- __m256 dotProdVal0 = _mm256_setzero_ps(); ++- __m256 dotProdVal1 = _mm256_setzero_ps(); ++- __m256 
dotProdVal2 = _mm256_setzero_ps(); ++- __m256 dotProdVal3 = _mm256_setzero_ps(); ++- ++- for(;number < thirtysecondPoints; number++){ ++- ++- a0Val = _mm256_load_ps(aPtr); ++- a1Val = _mm256_load_ps(aPtr+8); ++- a2Val = _mm256_load_ps(aPtr+16); ++- a3Val = _mm256_load_ps(aPtr+24); ++- b0Val = _mm256_load_ps(bPtr); ++- b1Val = _mm256_load_ps(bPtr+8); ++- b2Val = _mm256_load_ps(bPtr+16); ++- b3Val = _mm256_load_ps(bPtr+24); ++- ++- c0Val = _mm256_mul_ps(a0Val, b0Val); ++- c1Val = _mm256_mul_ps(a1Val, b1Val); ++- c2Val = _mm256_mul_ps(a2Val, b2Val); ++- c3Val = _mm256_mul_ps(a3Val, b3Val); ++- ++- dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0); ++- dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1); ++- dotProdVal2 = _mm256_add_ps(c2Val, dotProdVal2); ++- dotProdVal3 = _mm256_add_ps(c3Val, dotProdVal3); ++- ++- aPtr += 32; ++- bPtr += 32; ++- } ++- ++- dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1); ++- dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2); ++- dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3); ++- ++- __VOLK_ATTR_ALIGNED(32) float dotProductVector[8]; ++- ++- _mm256_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector ++- ++- dotProduct = dotProductVector[0]; ++- dotProduct += dotProductVector[1]; ++- dotProduct += dotProductVector[2]; ++- dotProduct += dotProductVector[3]; ++- dotProduct += dotProductVector[4]; ++- dotProduct += dotProductVector[5]; ++- dotProduct += dotProductVector[6]; ++- dotProduct += dotProductVector[7]; ++- ++- number = thirtysecondPoints*32; ++- for(;number < num_points; number++){ ++- dotProduct += ((*aPtr++) * (*bPtr++)); ++- } ++- ++- *result = (short)dotProduct; +++static inline void volk_32f_x2_dot_prod_16i_a_avx(int16_t* result, +++ const float* input, +++ const float* taps, +++ unsigned int num_points) +++{ +++ +++ unsigned int number = 0; +++ const unsigned int thirtysecondPoints = num_points / 32; +++ +++ float dotProduct = 0; +++ const float* aPtr = input; +++ const float* bPtr = taps; +++ +++ __m256 a0Val, a1Val, a2Val, a3Val; +++ __m256 b0Val, b1Val, b2Val, b3Val; +++ __m256 c0Val, c1Val, c2Val, c3Val; +++ +++ __m256 dotProdVal0 = _mm256_setzero_ps(); +++ __m256 dotProdVal1 = _mm256_setzero_ps(); +++ __m256 dotProdVal2 = _mm256_setzero_ps(); +++ __m256 dotProdVal3 = _mm256_setzero_ps(); +++ +++ for (; number < thirtysecondPoints; number++) { +++ +++ a0Val = _mm256_load_ps(aPtr); +++ a1Val = _mm256_load_ps(aPtr + 8); +++ a2Val = _mm256_load_ps(aPtr + 16); +++ a3Val = _mm256_load_ps(aPtr + 24); +++ b0Val = _mm256_load_ps(bPtr); +++ b1Val = _mm256_load_ps(bPtr + 8); +++ b2Val = _mm256_load_ps(bPtr + 16); +++ b3Val = _mm256_load_ps(bPtr + 24); +++ +++ c0Val = _mm256_mul_ps(a0Val, b0Val); +++ c1Val = _mm256_mul_ps(a1Val, b1Val); +++ c2Val = _mm256_mul_ps(a2Val, b2Val); +++ c3Val = _mm256_mul_ps(a3Val, b3Val); +++ +++ dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0); +++ dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1); +++ dotProdVal2 = _mm256_add_ps(c2Val, dotProdVal2); +++ dotProdVal3 = _mm256_add_ps(c3Val, dotProdVal3); +++ +++ aPtr += 32; +++ bPtr += 32; +++ } +++ +++ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1); +++ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2); +++ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3); +++ +++ __VOLK_ATTR_ALIGNED(32) float dotProductVector[8]; +++ +++ _mm256_store_ps(dotProductVector, +++ dotProdVal0); // Store the results back into the dot product vector +++ +++ dotProduct = dotProductVector[0]; +++ dotProduct += dotProductVector[1]; +++ 
dotProduct += dotProductVector[2]; +++ dotProduct += dotProductVector[3]; +++ dotProduct += dotProductVector[4]; +++ dotProduct += dotProductVector[5]; +++ dotProduct += dotProductVector[6]; +++ dotProduct += dotProductVector[7]; +++ +++ number = thirtysecondPoints * 32; +++ for (; number < num_points; number++) { +++ dotProduct += ((*aPtr++) * (*bPtr++)); +++ } +++ +++ *result = (short)dotProduct; ++ } ++ ++ #endif /*LV_HAVE_AVX*/ ++ ++ #ifdef LV_HAVE_AVX512F ++ ++-static inline void volk_32f_x2_dot_prod_16i_a_avx512f(int16_t* result, const float* input, const float* taps, unsigned int num_points) { ++- ++- unsigned int number = 0; ++- const unsigned int sixtyfourthPoints = num_points / 64; ++- ++- float dotProduct = 0; ++- const float* aPtr = input; ++- const float* bPtr = taps; ++- ++- __m512 a0Val, a1Val, a2Val, a3Val; ++- __m512 b0Val, b1Val, b2Val, b3Val; ++- ++- __m512 dotProdVal0 = _mm512_setzero_ps(); ++- __m512 dotProdVal1 = _mm512_setzero_ps(); ++- __m512 dotProdVal2 = _mm512_setzero_ps(); ++- __m512 dotProdVal3 = _mm512_setzero_ps(); ++- ++- for(;number < sixtyfourthPoints; number++){ ++- ++- a0Val = _mm512_load_ps(aPtr); ++- a1Val = _mm512_load_ps(aPtr+16); ++- a2Val = _mm512_load_ps(aPtr+32); ++- a3Val = _mm512_load_ps(aPtr+48); ++- b0Val = _mm512_load_ps(bPtr); ++- b1Val = _mm512_load_ps(bPtr+16); ++- b2Val = _mm512_load_ps(bPtr+32); ++- b3Val = _mm512_load_ps(bPtr+48); ++- ++- dotProdVal0 = _mm512_fmadd_ps(a0Val, b0Val, dotProdVal0); ++- dotProdVal1 = _mm512_fmadd_ps(a1Val, b1Val, dotProdVal1); ++- dotProdVal2 = _mm512_fmadd_ps(a2Val, b2Val, dotProdVal2); ++- dotProdVal3 = _mm512_fmadd_ps(a3Val, b3Val, dotProdVal3); ++- ++- aPtr += 64; ++- bPtr += 64; ++- } ++- ++- dotProdVal0 = _mm512_add_ps(dotProdVal0, dotProdVal1); ++- dotProdVal0 = _mm512_add_ps(dotProdVal0, dotProdVal2); ++- dotProdVal0 = _mm512_add_ps(dotProdVal0, dotProdVal3); ++- ++- __VOLK_ATTR_ALIGNED(64) float dotProductVector[16]; ++- ++- _mm512_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector ++- ++- dotProduct = dotProductVector[0]; ++- dotProduct += dotProductVector[1]; ++- dotProduct += dotProductVector[2]; ++- dotProduct += dotProductVector[3]; ++- dotProduct += dotProductVector[4]; ++- dotProduct += dotProductVector[5]; ++- dotProduct += dotProductVector[6]; ++- dotProduct += dotProductVector[7]; ++- dotProduct += dotProductVector[8]; ++- dotProduct += dotProductVector[9]; ++- dotProduct += dotProductVector[10]; ++- dotProduct += dotProductVector[11]; ++- dotProduct += dotProductVector[12]; ++- dotProduct += dotProductVector[13]; ++- dotProduct += dotProductVector[14]; ++- dotProduct += dotProductVector[15]; ++- ++- number = sixtyfourthPoints*64; ++- for(;number < num_points; number++){ ++- dotProduct += ((*aPtr++) * (*bPtr++)); ++- } ++- ++- *result = (short)dotProduct; +++static inline void volk_32f_x2_dot_prod_16i_a_avx512f(int16_t* result, +++ const float* input, +++ const float* taps, +++ unsigned int num_points) +++{ +++ +++ unsigned int number = 0; +++ const unsigned int sixtyfourthPoints = num_points / 64; +++ +++ float dotProduct = 0; +++ const float* aPtr = input; +++ const float* bPtr = taps; +++ +++ __m512 a0Val, a1Val, a2Val, a3Val; +++ __m512 b0Val, b1Val, b2Val, b3Val; +++ +++ __m512 dotProdVal0 = _mm512_setzero_ps(); +++ __m512 dotProdVal1 = _mm512_setzero_ps(); +++ __m512 dotProdVal2 = _mm512_setzero_ps(); +++ __m512 dotProdVal3 = _mm512_setzero_ps(); +++ +++ for (; number < sixtyfourthPoints; number++) { +++ +++ a0Val = _mm512_load_ps(aPtr); 
+++ a1Val = _mm512_load_ps(aPtr + 16); +++ a2Val = _mm512_load_ps(aPtr + 32); +++ a3Val = _mm512_load_ps(aPtr + 48); +++ b0Val = _mm512_load_ps(bPtr); +++ b1Val = _mm512_load_ps(bPtr + 16); +++ b2Val = _mm512_load_ps(bPtr + 32); +++ b3Val = _mm512_load_ps(bPtr + 48); +++ +++ dotProdVal0 = _mm512_fmadd_ps(a0Val, b0Val, dotProdVal0); +++ dotProdVal1 = _mm512_fmadd_ps(a1Val, b1Val, dotProdVal1); +++ dotProdVal2 = _mm512_fmadd_ps(a2Val, b2Val, dotProdVal2); +++ dotProdVal3 = _mm512_fmadd_ps(a3Val, b3Val, dotProdVal3); +++ +++ aPtr += 64; +++ bPtr += 64; +++ } +++ +++ dotProdVal0 = _mm512_add_ps(dotProdVal0, dotProdVal1); +++ dotProdVal0 = _mm512_add_ps(dotProdVal0, dotProdVal2); +++ dotProdVal0 = _mm512_add_ps(dotProdVal0, dotProdVal3); +++ +++ __VOLK_ATTR_ALIGNED(64) float dotProductVector[16]; +++ +++ _mm512_store_ps(dotProductVector, +++ dotProdVal0); // Store the results back into the dot product vector +++ +++ dotProduct = dotProductVector[0]; +++ dotProduct += dotProductVector[1]; +++ dotProduct += dotProductVector[2]; +++ dotProduct += dotProductVector[3]; +++ dotProduct += dotProductVector[4]; +++ dotProduct += dotProductVector[5]; +++ dotProduct += dotProductVector[6]; +++ dotProduct += dotProductVector[7]; +++ dotProduct += dotProductVector[8]; +++ dotProduct += dotProductVector[9]; +++ dotProduct += dotProductVector[10]; +++ dotProduct += dotProductVector[11]; +++ dotProduct += dotProductVector[12]; +++ dotProduct += dotProductVector[13]; +++ dotProduct += dotProductVector[14]; +++ dotProduct += dotProductVector[15]; +++ +++ number = sixtyfourthPoints * 64; +++ for (; number < num_points; number++) { +++ dotProduct += ((*aPtr++) * (*bPtr++)); +++ } +++ +++ *result = (short)dotProduct; ++ } ++ ++ #endif /*LV_HAVE_AVX512F*/ ++@@ -367,68 +391,73 @@ static inline void volk_32f_x2_dot_prod_16i_a_avx512f(int16_t* result, const fl ++ ++ #ifdef LV_HAVE_SSE ++ ++-static inline void volk_32f_x2_dot_prod_16i_u_sse(int16_t* result, const float* input, const float* taps, unsigned int num_points) { ++- ++- unsigned int number = 0; ++- const unsigned int sixteenthPoints = num_points / 16; ++- ++- float dotProduct = 0; ++- const float* aPtr = input; ++- const float* bPtr = taps; ++- ++- __m128 a0Val, a1Val, a2Val, a3Val; ++- __m128 b0Val, b1Val, b2Val, b3Val; ++- __m128 c0Val, c1Val, c2Val, c3Val; ++- ++- __m128 dotProdVal0 = _mm_setzero_ps(); ++- __m128 dotProdVal1 = _mm_setzero_ps(); ++- __m128 dotProdVal2 = _mm_setzero_ps(); ++- __m128 dotProdVal3 = _mm_setzero_ps(); ++- ++- for(;number < sixteenthPoints; number++){ ++- ++- a0Val = _mm_loadu_ps(aPtr); ++- a1Val = _mm_loadu_ps(aPtr+4); ++- a2Val = _mm_loadu_ps(aPtr+8); ++- a3Val = _mm_loadu_ps(aPtr+12); ++- b0Val = _mm_loadu_ps(bPtr); ++- b1Val = _mm_loadu_ps(bPtr+4); ++- b2Val = _mm_loadu_ps(bPtr+8); ++- b3Val = _mm_loadu_ps(bPtr+12); ++- ++- c0Val = _mm_mul_ps(a0Val, b0Val); ++- c1Val = _mm_mul_ps(a1Val, b1Val); ++- c2Val = _mm_mul_ps(a2Val, b2Val); ++- c3Val = _mm_mul_ps(a3Val, b3Val); ++- ++- dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0); ++- dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1); ++- dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2); ++- dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3); ++- ++- aPtr += 16; ++- bPtr += 16; ++- } ++- ++- dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1); ++- dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2); ++- dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3); ++- ++- __VOLK_ATTR_ALIGNED(16) float dotProductVector[4]; ++- ++- _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot 
product vector ++- ++- dotProduct = dotProductVector[0]; ++- dotProduct += dotProductVector[1]; ++- dotProduct += dotProductVector[2]; ++- dotProduct += dotProductVector[3]; ++- ++- number = sixteenthPoints*16; ++- for(;number < num_points; number++){ ++- dotProduct += ((*aPtr++) * (*bPtr++)); ++- } ++- ++- *result = (short)dotProduct; +++static inline void volk_32f_x2_dot_prod_16i_u_sse(int16_t* result, +++ const float* input, +++ const float* taps, +++ unsigned int num_points) +++{ +++ +++ unsigned int number = 0; +++ const unsigned int sixteenthPoints = num_points / 16; +++ +++ float dotProduct = 0; +++ const float* aPtr = input; +++ const float* bPtr = taps; +++ +++ __m128 a0Val, a1Val, a2Val, a3Val; +++ __m128 b0Val, b1Val, b2Val, b3Val; +++ __m128 c0Val, c1Val, c2Val, c3Val; +++ +++ __m128 dotProdVal0 = _mm_setzero_ps(); +++ __m128 dotProdVal1 = _mm_setzero_ps(); +++ __m128 dotProdVal2 = _mm_setzero_ps(); +++ __m128 dotProdVal3 = _mm_setzero_ps(); +++ +++ for (; number < sixteenthPoints; number++) { +++ +++ a0Val = _mm_loadu_ps(aPtr); +++ a1Val = _mm_loadu_ps(aPtr + 4); +++ a2Val = _mm_loadu_ps(aPtr + 8); +++ a3Val = _mm_loadu_ps(aPtr + 12); +++ b0Val = _mm_loadu_ps(bPtr); +++ b1Val = _mm_loadu_ps(bPtr + 4); +++ b2Val = _mm_loadu_ps(bPtr + 8); +++ b3Val = _mm_loadu_ps(bPtr + 12); +++ +++ c0Val = _mm_mul_ps(a0Val, b0Val); +++ c1Val = _mm_mul_ps(a1Val, b1Val); +++ c2Val = _mm_mul_ps(a2Val, b2Val); +++ c3Val = _mm_mul_ps(a3Val, b3Val); +++ +++ dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0); +++ dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1); +++ dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2); +++ dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3); +++ +++ aPtr += 16; +++ bPtr += 16; +++ } +++ +++ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1); +++ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2); +++ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3); +++ +++ __VOLK_ATTR_ALIGNED(16) float dotProductVector[4]; +++ +++ _mm_store_ps(dotProductVector, +++ dotProdVal0); // Store the results back into the dot product vector +++ +++ dotProduct = dotProductVector[0]; +++ dotProduct += dotProductVector[1]; +++ dotProduct += dotProductVector[2]; +++ dotProduct += dotProductVector[3]; +++ +++ number = sixteenthPoints * 16; +++ for (; number < num_points; number++) { +++ dotProduct += ((*aPtr++) * (*bPtr++)); +++ } +++ +++ *result = (short)dotProduct; ++ } ++ ++ #endif /*LV_HAVE_SSE*/ ++@@ -436,66 +465,71 @@ static inline void volk_32f_x2_dot_prod_16i_u_sse(int16_t* result, const float* ++ ++ #if LV_HAVE_AVX2 && LV_HAVE_FMA ++ ++-static inline void volk_32f_x2_dot_prod_16i_u_avx2_fma(int16_t* result, const float* input, const float* taps, unsigned int num_points) { ++- ++- unsigned int number = 0; ++- const unsigned int thirtysecondPoints = num_points / 32; ++- ++- float dotProduct = 0; ++- const float* aPtr = input; ++- const float* bPtr = taps; ++- ++- __m256 a0Val, a1Val, a2Val, a3Val; ++- __m256 b0Val, b1Val, b2Val, b3Val; ++- ++- __m256 dotProdVal0 = _mm256_setzero_ps(); ++- __m256 dotProdVal1 = _mm256_setzero_ps(); ++- __m256 dotProdVal2 = _mm256_setzero_ps(); ++- __m256 dotProdVal3 = _mm256_setzero_ps(); ++- ++- for(;number < thirtysecondPoints; number++){ ++- ++- a0Val = _mm256_loadu_ps(aPtr); ++- a1Val = _mm256_loadu_ps(aPtr+8); ++- a2Val = _mm256_loadu_ps(aPtr+16); ++- a3Val = _mm256_loadu_ps(aPtr+24); ++- b0Val = _mm256_loadu_ps(bPtr); ++- b1Val = _mm256_loadu_ps(bPtr+8); ++- b2Val = _mm256_loadu_ps(bPtr+16); ++- b3Val = _mm256_loadu_ps(bPtr+24); ++- ++- dotProdVal0 = _mm256_fmadd_ps(a0Val, 
b0Val, dotProdVal0); ++- dotProdVal1 = _mm256_fmadd_ps(a1Val, b1Val, dotProdVal1); ++- dotProdVal2 = _mm256_fmadd_ps(a2Val, b2Val, dotProdVal2); ++- dotProdVal3 = _mm256_fmadd_ps(a3Val, b3Val, dotProdVal3); ++- ++- aPtr += 32; ++- bPtr += 32; ++- } ++- ++- dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1); ++- dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2); ++- dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3); ++- ++- __VOLK_ATTR_ALIGNED(32) float dotProductVector[8]; ++- ++- _mm256_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector ++- ++- dotProduct = dotProductVector[0]; ++- dotProduct += dotProductVector[1]; ++- dotProduct += dotProductVector[2]; ++- dotProduct += dotProductVector[3]; ++- dotProduct += dotProductVector[4]; ++- dotProduct += dotProductVector[5]; ++- dotProduct += dotProductVector[6]; ++- dotProduct += dotProductVector[7]; ++- ++- number = thirtysecondPoints*32; ++- for(;number < num_points; number++){ ++- dotProduct += ((*aPtr++) * (*bPtr++)); ++- } ++- ++- *result = (short)dotProduct; +++static inline void volk_32f_x2_dot_prod_16i_u_avx2_fma(int16_t* result, +++ const float* input, +++ const float* taps, +++ unsigned int num_points) +++{ +++ +++ unsigned int number = 0; +++ const unsigned int thirtysecondPoints = num_points / 32; +++ +++ float dotProduct = 0; +++ const float* aPtr = input; +++ const float* bPtr = taps; +++ +++ __m256 a0Val, a1Val, a2Val, a3Val; +++ __m256 b0Val, b1Val, b2Val, b3Val; +++ +++ __m256 dotProdVal0 = _mm256_setzero_ps(); +++ __m256 dotProdVal1 = _mm256_setzero_ps(); +++ __m256 dotProdVal2 = _mm256_setzero_ps(); +++ __m256 dotProdVal3 = _mm256_setzero_ps(); +++ +++ for (; number < thirtysecondPoints; number++) { +++ +++ a0Val = _mm256_loadu_ps(aPtr); +++ a1Val = _mm256_loadu_ps(aPtr + 8); +++ a2Val = _mm256_loadu_ps(aPtr + 16); +++ a3Val = _mm256_loadu_ps(aPtr + 24); +++ b0Val = _mm256_loadu_ps(bPtr); +++ b1Val = _mm256_loadu_ps(bPtr + 8); +++ b2Val = _mm256_loadu_ps(bPtr + 16); +++ b3Val = _mm256_loadu_ps(bPtr + 24); +++ +++ dotProdVal0 = _mm256_fmadd_ps(a0Val, b0Val, dotProdVal0); +++ dotProdVal1 = _mm256_fmadd_ps(a1Val, b1Val, dotProdVal1); +++ dotProdVal2 = _mm256_fmadd_ps(a2Val, b2Val, dotProdVal2); +++ dotProdVal3 = _mm256_fmadd_ps(a3Val, b3Val, dotProdVal3); +++ +++ aPtr += 32; +++ bPtr += 32; +++ } +++ +++ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1); +++ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2); +++ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3); +++ +++ __VOLK_ATTR_ALIGNED(32) float dotProductVector[8]; +++ +++ _mm256_store_ps(dotProductVector, +++ dotProdVal0); // Store the results back into the dot product vector +++ +++ dotProduct = dotProductVector[0]; +++ dotProduct += dotProductVector[1]; +++ dotProduct += dotProductVector[2]; +++ dotProduct += dotProductVector[3]; +++ dotProduct += dotProductVector[4]; +++ dotProduct += dotProductVector[5]; +++ dotProduct += dotProductVector[6]; +++ dotProduct += dotProductVector[7]; +++ +++ number = thirtysecondPoints * 32; +++ for (; number < num_points; number++) { +++ dotProduct += ((*aPtr++) * (*bPtr++)); +++ } +++ +++ *result = (short)dotProduct; ++ } ++ ++ #endif /*LV_HAVE_AVX2 && lV_HAVE_FMA*/ ++@@ -503,146 +537,156 @@ static inline void volk_32f_x2_dot_prod_16i_u_avx2_fma(int16_t* result, const f ++ ++ #ifdef LV_HAVE_AVX ++ ++-static inline void volk_32f_x2_dot_prod_16i_u_avx(int16_t* result, const float* input, const float* taps, unsigned int num_points) { ++- ++- unsigned int number = 0; ++- 
const unsigned int thirtysecondPoints = num_points / 32; ++- ++- float dotProduct = 0; ++- const float* aPtr = input; ++- const float* bPtr = taps; ++- ++- __m256 a0Val, a1Val, a2Val, a3Val; ++- __m256 b0Val, b1Val, b2Val, b3Val; ++- __m256 c0Val, c1Val, c2Val, c3Val; ++- ++- __m256 dotProdVal0 = _mm256_setzero_ps(); ++- __m256 dotProdVal1 = _mm256_setzero_ps(); ++- __m256 dotProdVal2 = _mm256_setzero_ps(); ++- __m256 dotProdVal3 = _mm256_setzero_ps(); ++- ++- for(;number < thirtysecondPoints; number++){ ++- ++- a0Val = _mm256_loadu_ps(aPtr); ++- a1Val = _mm256_loadu_ps(aPtr+8); ++- a2Val = _mm256_loadu_ps(aPtr+16); ++- a3Val = _mm256_loadu_ps(aPtr+24); ++- b0Val = _mm256_loadu_ps(bPtr); ++- b1Val = _mm256_loadu_ps(bPtr+8); ++- b2Val = _mm256_loadu_ps(bPtr+16); ++- b3Val = _mm256_loadu_ps(bPtr+24); ++- ++- c0Val = _mm256_mul_ps(a0Val, b0Val); ++- c1Val = _mm256_mul_ps(a1Val, b1Val); ++- c2Val = _mm256_mul_ps(a2Val, b2Val); ++- c3Val = _mm256_mul_ps(a3Val, b3Val); ++- ++- dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0); ++- dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1); ++- dotProdVal2 = _mm256_add_ps(c2Val, dotProdVal2); ++- dotProdVal3 = _mm256_add_ps(c3Val, dotProdVal3); ++- ++- aPtr += 32; ++- bPtr += 32; ++- } ++- ++- dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1); ++- dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2); ++- dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3); ++- ++- __VOLK_ATTR_ALIGNED(32) float dotProductVector[8]; ++- ++- _mm256_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector ++- ++- dotProduct = dotProductVector[0]; ++- dotProduct += dotProductVector[1]; ++- dotProduct += dotProductVector[2]; ++- dotProduct += dotProductVector[3]; ++- dotProduct += dotProductVector[4]; ++- dotProduct += dotProductVector[5]; ++- dotProduct += dotProductVector[6]; ++- dotProduct += dotProductVector[7]; ++- ++- number = thirtysecondPoints*32; ++- for(;number < num_points; number++){ ++- dotProduct += ((*aPtr++) * (*bPtr++)); ++- } ++- ++- *result = (short)dotProduct; +++static inline void volk_32f_x2_dot_prod_16i_u_avx(int16_t* result, +++ const float* input, +++ const float* taps, +++ unsigned int num_points) +++{ +++ +++ unsigned int number = 0; +++ const unsigned int thirtysecondPoints = num_points / 32; +++ +++ float dotProduct = 0; +++ const float* aPtr = input; +++ const float* bPtr = taps; +++ +++ __m256 a0Val, a1Val, a2Val, a3Val; +++ __m256 b0Val, b1Val, b2Val, b3Val; +++ __m256 c0Val, c1Val, c2Val, c3Val; +++ +++ __m256 dotProdVal0 = _mm256_setzero_ps(); +++ __m256 dotProdVal1 = _mm256_setzero_ps(); +++ __m256 dotProdVal2 = _mm256_setzero_ps(); +++ __m256 dotProdVal3 = _mm256_setzero_ps(); +++ +++ for (; number < thirtysecondPoints; number++) { +++ +++ a0Val = _mm256_loadu_ps(aPtr); +++ a1Val = _mm256_loadu_ps(aPtr + 8); +++ a2Val = _mm256_loadu_ps(aPtr + 16); +++ a3Val = _mm256_loadu_ps(aPtr + 24); +++ b0Val = _mm256_loadu_ps(bPtr); +++ b1Val = _mm256_loadu_ps(bPtr + 8); +++ b2Val = _mm256_loadu_ps(bPtr + 16); +++ b3Val = _mm256_loadu_ps(bPtr + 24); +++ +++ c0Val = _mm256_mul_ps(a0Val, b0Val); +++ c1Val = _mm256_mul_ps(a1Val, b1Val); +++ c2Val = _mm256_mul_ps(a2Val, b2Val); +++ c3Val = _mm256_mul_ps(a3Val, b3Val); +++ +++ dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0); +++ dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1); +++ dotProdVal2 = _mm256_add_ps(c2Val, dotProdVal2); +++ dotProdVal3 = _mm256_add_ps(c3Val, dotProdVal3); +++ +++ aPtr += 32; +++ bPtr += 32; +++ } +++ +++ dotProdVal0 = _mm256_add_ps(dotProdVal0, 
dotProdVal1); +++ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2); +++ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3); +++ +++ __VOLK_ATTR_ALIGNED(32) float dotProductVector[8]; +++ +++ _mm256_store_ps(dotProductVector, +++ dotProdVal0); // Store the results back into the dot product vector +++ +++ dotProduct = dotProductVector[0]; +++ dotProduct += dotProductVector[1]; +++ dotProduct += dotProductVector[2]; +++ dotProduct += dotProductVector[3]; +++ dotProduct += dotProductVector[4]; +++ dotProduct += dotProductVector[5]; +++ dotProduct += dotProductVector[6]; +++ dotProduct += dotProductVector[7]; +++ +++ number = thirtysecondPoints * 32; +++ for (; number < num_points; number++) { +++ dotProduct += ((*aPtr++) * (*bPtr++)); +++ } +++ +++ *result = (short)dotProduct; ++ } ++ ++ #endif /*LV_HAVE_AVX*/ ++ ++ #ifdef LV_HAVE_AVX512F ++ ++-static inline void volk_32f_x2_dot_prod_16i_u_avx512f(int16_t* result, const float* input, const float* taps, unsigned int num_points) { ++- ++- unsigned int number = 0; ++- const unsigned int sixtyfourthPoints = num_points / 64; ++- ++- float dotProduct = 0; ++- const float* aPtr = input; ++- const float* bPtr = taps; ++- ++- __m512 a0Val, a1Val, a2Val, a3Val; ++- __m512 b0Val, b1Val, b2Val, b3Val; ++- ++- __m512 dotProdVal0 = _mm512_setzero_ps(); ++- __m512 dotProdVal1 = _mm512_setzero_ps(); ++- __m512 dotProdVal2 = _mm512_setzero_ps(); ++- __m512 dotProdVal3 = _mm512_setzero_ps(); ++- ++- for(;number < sixtyfourthPoints; number++){ ++- ++- a0Val = _mm512_loadu_ps(aPtr); ++- a1Val = _mm512_loadu_ps(aPtr+16); ++- a2Val = _mm512_loadu_ps(aPtr+32); ++- a3Val = _mm512_loadu_ps(aPtr+48); ++- b0Val = _mm512_loadu_ps(bPtr); ++- b1Val = _mm512_loadu_ps(bPtr+16); ++- b2Val = _mm512_loadu_ps(bPtr+32); ++- b3Val = _mm512_loadu_ps(bPtr+48); ++- ++- dotProdVal0 = _mm512_fmadd_ps(a0Val, b0Val, dotProdVal0); ++- dotProdVal1 = _mm512_fmadd_ps(a1Val, b1Val, dotProdVal1); ++- dotProdVal2 = _mm512_fmadd_ps(a2Val, b2Val, dotProdVal2); ++- dotProdVal3 = _mm512_fmadd_ps(a3Val, b3Val, dotProdVal3); ++- ++- aPtr += 64; ++- bPtr += 64; ++- } ++- ++- dotProdVal0 = _mm512_add_ps(dotProdVal0, dotProdVal1); ++- dotProdVal0 = _mm512_add_ps(dotProdVal0, dotProdVal2); ++- dotProdVal0 = _mm512_add_ps(dotProdVal0, dotProdVal3); ++- ++- __VOLK_ATTR_ALIGNED(64) float dotProductVector[16]; ++- ++- _mm512_storeu_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector ++- ++- dotProduct = dotProductVector[0]; ++- dotProduct += dotProductVector[1]; ++- dotProduct += dotProductVector[2]; ++- dotProduct += dotProductVector[3]; ++- dotProduct += dotProductVector[4]; ++- dotProduct += dotProductVector[5]; ++- dotProduct += dotProductVector[6]; ++- dotProduct += dotProductVector[7]; ++- dotProduct += dotProductVector[8]; ++- dotProduct += dotProductVector[9]; ++- dotProduct += dotProductVector[10]; ++- dotProduct += dotProductVector[11]; ++- dotProduct += dotProductVector[12]; ++- dotProduct += dotProductVector[13]; ++- dotProduct += dotProductVector[14]; ++- dotProduct += dotProductVector[15]; ++- ++- number = sixtyfourthPoints*64; ++- for(;number < num_points; number++){ ++- dotProduct += ((*aPtr++) * (*bPtr++)); ++- } ++- ++- *result = (short)dotProduct; +++static inline void volk_32f_x2_dot_prod_16i_u_avx512f(int16_t* result, +++ const float* input, +++ const float* taps, +++ unsigned int num_points) +++{ +++ +++ unsigned int number = 0; +++ const unsigned int sixtyfourthPoints = num_points / 64; +++ +++ float dotProduct = 0; +++ const float* aPtr = 
input; +++ const float* bPtr = taps; +++ +++ __m512 a0Val, a1Val, a2Val, a3Val; +++ __m512 b0Val, b1Val, b2Val, b3Val; +++ +++ __m512 dotProdVal0 = _mm512_setzero_ps(); +++ __m512 dotProdVal1 = _mm512_setzero_ps(); +++ __m512 dotProdVal2 = _mm512_setzero_ps(); +++ __m512 dotProdVal3 = _mm512_setzero_ps(); +++ +++ for (; number < sixtyfourthPoints; number++) { +++ +++ a0Val = _mm512_loadu_ps(aPtr); +++ a1Val = _mm512_loadu_ps(aPtr + 16); +++ a2Val = _mm512_loadu_ps(aPtr + 32); +++ a3Val = _mm512_loadu_ps(aPtr + 48); +++ b0Val = _mm512_loadu_ps(bPtr); +++ b1Val = _mm512_loadu_ps(bPtr + 16); +++ b2Val = _mm512_loadu_ps(bPtr + 32); +++ b3Val = _mm512_loadu_ps(bPtr + 48); +++ +++ dotProdVal0 = _mm512_fmadd_ps(a0Val, b0Val, dotProdVal0); +++ dotProdVal1 = _mm512_fmadd_ps(a1Val, b1Val, dotProdVal1); +++ dotProdVal2 = _mm512_fmadd_ps(a2Val, b2Val, dotProdVal2); +++ dotProdVal3 = _mm512_fmadd_ps(a3Val, b3Val, dotProdVal3); +++ +++ aPtr += 64; +++ bPtr += 64; +++ } +++ +++ dotProdVal0 = _mm512_add_ps(dotProdVal0, dotProdVal1); +++ dotProdVal0 = _mm512_add_ps(dotProdVal0, dotProdVal2); +++ dotProdVal0 = _mm512_add_ps(dotProdVal0, dotProdVal3); +++ +++ __VOLK_ATTR_ALIGNED(64) float dotProductVector[16]; +++ +++ _mm512_storeu_ps(dotProductVector, +++ dotProdVal0); // Store the results back into the dot product vector +++ +++ dotProduct = dotProductVector[0]; +++ dotProduct += dotProductVector[1]; +++ dotProduct += dotProductVector[2]; +++ dotProduct += dotProductVector[3]; +++ dotProduct += dotProductVector[4]; +++ dotProduct += dotProductVector[5]; +++ dotProduct += dotProductVector[6]; +++ dotProduct += dotProductVector[7]; +++ dotProduct += dotProductVector[8]; +++ dotProduct += dotProductVector[9]; +++ dotProduct += dotProductVector[10]; +++ dotProduct += dotProductVector[11]; +++ dotProduct += dotProductVector[12]; +++ dotProduct += dotProductVector[13]; +++ dotProduct += dotProductVector[14]; +++ dotProduct += dotProductVector[15]; +++ +++ number = sixtyfourthPoints * 64; +++ for (; number < num_points; number++) { +++ dotProduct += ((*aPtr++) * (*bPtr++)); +++ } +++ +++ *result = (short)dotProduct; ++ } ++ ++ #endif /*LV_HAVE_AVX512F*/ ++diff --git a/kernels/volk/volk_32f_x2_dot_prod_32f.h b/kernels/volk/volk_32f_x2_dot_prod_32f.h ++index ea0f7ba..7854031 100644 ++--- a/kernels/volk/volk_32f_x2_dot_prod_32f.h +++++ b/kernels/volk/volk_32f_x2_dot_prod_32f.h ++@@ -33,8 +33,8 @@ ++ * ++ * Dispatcher Prototype ++ * \code ++- * void volk_32f_x2_dot_prod_32f(float* result, const float* input, const float* taps, unsigned int num_points) ++- * \endcode +++ * void volk_32f_x2_dot_prod_32f(float* result, const float* input, const float* taps, +++ * unsigned int num_points) \endcode ++ * ++ * \b Inputs ++ * \li input: vector of floats. ++@@ -45,10 +45,8 @@ ++ * \li result: pointer to a float value to hold the dot product result. ++ * ++ * \b Example ++- * Take the dot product of an increasing vector and a vector of ones. The result is the sum of integers (0,9). ++- * \code ++- * int N = 10; ++- * unsigned int alignment = volk_get_alignment(); +++ * Take the dot product of an increasing vector and a vector of ones. The result is the +++ * sum of integers (0,9). 
\code int N = 10; unsigned int alignment = volk_get_alignment(); ++ * float* increasing = (float*)volk_malloc(sizeof(float)*N, alignment); ++ * float* ones = (float*)volk_malloc(sizeof(float)*N, alignment); ++ * float* out = (float*)volk_malloc(sizeof(float)*1, alignment); ++@@ -73,25 +71,29 @@ ++ #ifndef INCLUDED_volk_32f_x2_dot_prod_32f_u_H ++ #define INCLUDED_volk_32f_x2_dot_prod_32f_u_H ++ +++#include ++ #include ++-#include ++ ++ ++ #ifdef LV_HAVE_GENERIC ++ ++ ++-static inline void volk_32f_x2_dot_prod_32f_generic(float * result, const float * input, const float * taps, unsigned int num_points) { +++static inline void volk_32f_x2_dot_prod_32f_generic(float* result, +++ const float* input, +++ const float* taps, +++ unsigned int num_points) +++{ ++ ++- float dotProduct = 0; ++- const float* aPtr = input; ++- const float* bPtr= taps; ++- unsigned int number = 0; +++ float dotProduct = 0; +++ const float* aPtr = input; +++ const float* bPtr = taps; +++ unsigned int number = 0; ++ ++- for(number = 0; number < num_points; number++){ ++- dotProduct += ((*aPtr++) * (*bPtr++)); ++- } +++ for (number = 0; number < num_points; number++) { +++ dotProduct += ((*aPtr++) * (*bPtr++)); +++ } ++ ++- *result = dotProduct; +++ *result = dotProduct; ++ } ++ ++ #endif /*LV_HAVE_GENERIC*/ ++@@ -100,69 +102,73 @@ static inline void volk_32f_x2_dot_prod_32f_generic(float * result, const float ++ #ifdef LV_HAVE_SSE ++ ++ ++-static inline void volk_32f_x2_dot_prod_32f_u_sse( float* result, const float* input, const float* taps, unsigned int num_points) { ++- ++- unsigned int number = 0; ++- const unsigned int sixteenthPoints = num_points / 16; ++- ++- float dotProduct = 0; ++- const float* aPtr = input; ++- const float* bPtr = taps; +++static inline void volk_32f_x2_dot_prod_32f_u_sse(float* result, +++ const float* input, +++ const float* taps, +++ unsigned int num_points) +++{ ++ ++- __m128 a0Val, a1Val, a2Val, a3Val; ++- __m128 b0Val, b1Val, b2Val, b3Val; ++- __m128 c0Val, c1Val, c2Val, c3Val; ++- ++- __m128 dotProdVal0 = _mm_setzero_ps(); ++- __m128 dotProdVal1 = _mm_setzero_ps(); ++- __m128 dotProdVal2 = _mm_setzero_ps(); ++- __m128 dotProdVal3 = _mm_setzero_ps(); ++- ++- for(;number < sixteenthPoints; number++){ ++- ++- a0Val = _mm_loadu_ps(aPtr); ++- a1Val = _mm_loadu_ps(aPtr+4); ++- a2Val = _mm_loadu_ps(aPtr+8); ++- a3Val = _mm_loadu_ps(aPtr+12); ++- b0Val = _mm_loadu_ps(bPtr); ++- b1Val = _mm_loadu_ps(bPtr+4); ++- b2Val = _mm_loadu_ps(bPtr+8); ++- b3Val = _mm_loadu_ps(bPtr+12); ++- ++- c0Val = _mm_mul_ps(a0Val, b0Val); ++- c1Val = _mm_mul_ps(a1Val, b1Val); ++- c2Val = _mm_mul_ps(a2Val, b2Val); ++- c3Val = _mm_mul_ps(a3Val, b3Val); ++- ++- dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0); ++- dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1); ++- dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2); ++- dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3); +++ unsigned int number = 0; +++ const unsigned int sixteenthPoints = num_points / 16; ++ ++- aPtr += 16; ++- bPtr += 16; ++- } +++ float dotProduct = 0; +++ const float* aPtr = input; +++ const float* bPtr = taps; +++ +++ __m128 a0Val, a1Val, a2Val, a3Val; +++ __m128 b0Val, b1Val, b2Val, b3Val; +++ __m128 c0Val, c1Val, c2Val, c3Val; +++ +++ __m128 dotProdVal0 = _mm_setzero_ps(); +++ __m128 dotProdVal1 = _mm_setzero_ps(); +++ __m128 dotProdVal2 = _mm_setzero_ps(); +++ __m128 dotProdVal3 = _mm_setzero_ps(); +++ +++ for (; number < sixteenthPoints; number++) { +++ +++ a0Val = _mm_loadu_ps(aPtr); +++ a1Val = _mm_loadu_ps(aPtr + 4); +++ a2Val = _mm_loadu_ps(aPtr + 8); +++ 
a3Val = _mm_loadu_ps(aPtr + 12); +++ b0Val = _mm_loadu_ps(bPtr); +++ b1Val = _mm_loadu_ps(bPtr + 4); +++ b2Val = _mm_loadu_ps(bPtr + 8); +++ b3Val = _mm_loadu_ps(bPtr + 12); +++ +++ c0Val = _mm_mul_ps(a0Val, b0Val); +++ c1Val = _mm_mul_ps(a1Val, b1Val); +++ c2Val = _mm_mul_ps(a2Val, b2Val); +++ c3Val = _mm_mul_ps(a3Val, b3Val); +++ +++ dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0); +++ dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1); +++ dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2); +++ dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3); ++ ++- dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1); ++- dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2); ++- dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3); +++ aPtr += 16; +++ bPtr += 16; +++ } ++ ++- __VOLK_ATTR_ALIGNED(16) float dotProductVector[4]; +++ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1); +++ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2); +++ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3); ++ ++- _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector +++ __VOLK_ATTR_ALIGNED(16) float dotProductVector[4]; ++ ++- dotProduct = dotProductVector[0]; ++- dotProduct += dotProductVector[1]; ++- dotProduct += dotProductVector[2]; ++- dotProduct += dotProductVector[3]; +++ _mm_store_ps(dotProductVector, +++ dotProdVal0); // Store the results back into the dot product vector ++ ++- number = sixteenthPoints*16; ++- for(;number < num_points; number++){ ++- dotProduct += ((*aPtr++) * (*bPtr++)); ++- } +++ dotProduct = dotProductVector[0]; +++ dotProduct += dotProductVector[1]; +++ dotProduct += dotProductVector[2]; +++ dotProduct += dotProductVector[3]; ++ ++- *result = dotProduct; +++ number = sixteenthPoints * 16; +++ for (; number < num_points; number++) { +++ dotProduct += ((*aPtr++) * (*bPtr++)); +++ } ++ +++ *result = dotProduct; ++ } ++ ++ #endif /*LV_HAVE_SSE*/ ++@@ -171,127 +177,145 @@ static inline void volk_32f_x2_dot_prod_32f_u_sse( float* result, const float* ++ ++ #include ++ ++-static inline void volk_32f_x2_dot_prod_32f_u_sse3(float * result, const float * input, const float * taps, unsigned int num_points) { ++- unsigned int number = 0; ++- const unsigned int sixteenthPoints = num_points / 16; ++- ++- float dotProduct = 0; ++- const float* aPtr = input; ++- const float* bPtr = taps; ++- ++- __m128 a0Val, a1Val, a2Val, a3Val; ++- __m128 b0Val, b1Val, b2Val, b3Val; ++- __m128 c0Val, c1Val, c2Val, c3Val; ++- ++- __m128 dotProdVal0 = _mm_setzero_ps(); ++- __m128 dotProdVal1 = _mm_setzero_ps(); ++- __m128 dotProdVal2 = _mm_setzero_ps(); ++- __m128 dotProdVal3 = _mm_setzero_ps(); ++- ++- for(;number < sixteenthPoints; number++){ ++- ++- a0Val = _mm_loadu_ps(aPtr); ++- a1Val = _mm_loadu_ps(aPtr+4); ++- a2Val = _mm_loadu_ps(aPtr+8); ++- a3Val = _mm_loadu_ps(aPtr+12); ++- b0Val = _mm_loadu_ps(bPtr); ++- b1Val = _mm_loadu_ps(bPtr+4); ++- b2Val = _mm_loadu_ps(bPtr+8); ++- b3Val = _mm_loadu_ps(bPtr+12); ++- ++- c0Val = _mm_mul_ps(a0Val, b0Val); ++- c1Val = _mm_mul_ps(a1Val, b1Val); ++- c2Val = _mm_mul_ps(a2Val, b2Val); ++- c3Val = _mm_mul_ps(a3Val, b3Val); ++- ++- dotProdVal0 = _mm_add_ps(dotProdVal0, c0Val); ++- dotProdVal1 = _mm_add_ps(dotProdVal1, c1Val); ++- dotProdVal2 = _mm_add_ps(dotProdVal2, c2Val); ++- dotProdVal3 = _mm_add_ps(dotProdVal3, c3Val); ++- ++- aPtr += 16; ++- bPtr += 16; ++- } ++- ++- dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1); ++- dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2); ++- dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3); ++- ++- 
__VOLK_ATTR_ALIGNED(16) float dotProductVector[4]; ++- _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector ++- ++- dotProduct = dotProductVector[0]; ++- dotProduct += dotProductVector[1]; ++- dotProduct += dotProductVector[2]; ++- dotProduct += dotProductVector[3]; ++- ++- number = sixteenthPoints*16; ++- for(;number < num_points; number++){ ++- dotProduct += ((*aPtr++) * (*bPtr++)); ++- } ++- ++- *result = dotProduct; ++-} ++- ++-#endif /*LV_HAVE_SSE3*/ +++static inline void volk_32f_x2_dot_prod_32f_u_sse3(float* result, +++ const float* input, +++ const float* taps, +++ unsigned int num_points) +++{ +++ unsigned int number = 0; +++ const unsigned int sixteenthPoints = num_points / 16; ++ ++-#ifdef LV_HAVE_SSE4_1 +++ float dotProduct = 0; +++ const float* aPtr = input; +++ const float* bPtr = taps; +++ +++ __m128 a0Val, a1Val, a2Val, a3Val; +++ __m128 b0Val, b1Val, b2Val, b3Val; +++ __m128 c0Val, c1Val, c2Val, c3Val; +++ +++ __m128 dotProdVal0 = _mm_setzero_ps(); +++ __m128 dotProdVal1 = _mm_setzero_ps(); +++ __m128 dotProdVal2 = _mm_setzero_ps(); +++ __m128 dotProdVal3 = _mm_setzero_ps(); +++ +++ for (; number < sixteenthPoints; number++) { +++ +++ a0Val = _mm_loadu_ps(aPtr); +++ a1Val = _mm_loadu_ps(aPtr + 4); +++ a2Val = _mm_loadu_ps(aPtr + 8); +++ a3Val = _mm_loadu_ps(aPtr + 12); +++ b0Val = _mm_loadu_ps(bPtr); +++ b1Val = _mm_loadu_ps(bPtr + 4); +++ b2Val = _mm_loadu_ps(bPtr + 8); +++ b3Val = _mm_loadu_ps(bPtr + 12); +++ +++ c0Val = _mm_mul_ps(a0Val, b0Val); +++ c1Val = _mm_mul_ps(a1Val, b1Val); +++ c2Val = _mm_mul_ps(a2Val, b2Val); +++ c3Val = _mm_mul_ps(a3Val, b3Val); +++ +++ dotProdVal0 = _mm_add_ps(dotProdVal0, c0Val); +++ dotProdVal1 = _mm_add_ps(dotProdVal1, c1Val); +++ dotProdVal2 = _mm_add_ps(dotProdVal2, c2Val); +++ dotProdVal3 = _mm_add_ps(dotProdVal3, c3Val); ++ ++-#include +++ aPtr += 16; +++ bPtr += 16; +++ } ++ ++-static inline void volk_32f_x2_dot_prod_32f_u_sse4_1(float * result, const float * input, const float* taps, unsigned int num_points) { ++- unsigned int number = 0; ++- const unsigned int sixteenthPoints = num_points / 16; +++ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1); +++ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2); +++ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3); ++ ++- float dotProduct = 0; ++- const float* aPtr = input; ++- const float* bPtr = taps; +++ __VOLK_ATTR_ALIGNED(16) float dotProductVector[4]; +++ _mm_store_ps(dotProductVector, +++ dotProdVal0); // Store the results back into the dot product vector ++ ++- __m128 aVal1, bVal1, cVal1; ++- __m128 aVal2, bVal2, cVal2; ++- __m128 aVal3, bVal3, cVal3; ++- __m128 aVal4, bVal4, cVal4; +++ dotProduct = dotProductVector[0]; +++ dotProduct += dotProductVector[1]; +++ dotProduct += dotProductVector[2]; +++ dotProduct += dotProductVector[3]; ++ ++- __m128 dotProdVal = _mm_setzero_ps(); +++ number = sixteenthPoints * 16; +++ for (; number < num_points; number++) { +++ dotProduct += ((*aPtr++) * (*bPtr++)); +++ } ++ ++- for(;number < sixteenthPoints; number++){ +++ *result = dotProduct; +++} ++ ++- aVal1 = _mm_loadu_ps(aPtr); aPtr += 4; ++- aVal2 = _mm_loadu_ps(aPtr); aPtr += 4; ++- aVal3 = _mm_loadu_ps(aPtr); aPtr += 4; ++- aVal4 = _mm_loadu_ps(aPtr); aPtr += 4; +++#endif /*LV_HAVE_SSE3*/ ++ ++- bVal1 = _mm_loadu_ps(bPtr); bPtr += 4; ++- bVal2 = _mm_loadu_ps(bPtr); bPtr += 4; ++- bVal3 = _mm_loadu_ps(bPtr); bPtr += 4; ++- bVal4 = _mm_loadu_ps(bPtr); bPtr += 4; +++#ifdef LV_HAVE_SSE4_1 ++ ++- cVal1 = _mm_dp_ps(aVal1, bVal1, 0xF1); ++- cVal2 = 
_mm_dp_ps(aVal2, bVal2, 0xF2); ++- cVal3 = _mm_dp_ps(aVal3, bVal3, 0xF4); ++- cVal4 = _mm_dp_ps(aVal4, bVal4, 0xF8); +++#include ++ ++- cVal1 = _mm_or_ps(cVal1, cVal2); ++- cVal3 = _mm_or_ps(cVal3, cVal4); ++- cVal1 = _mm_or_ps(cVal1, cVal3); +++static inline void volk_32f_x2_dot_prod_32f_u_sse4_1(float* result, +++ const float* input, +++ const float* taps, +++ unsigned int num_points) +++{ +++ unsigned int number = 0; +++ const unsigned int sixteenthPoints = num_points / 16; ++ ++- dotProdVal = _mm_add_ps(dotProdVal, cVal1); ++- } +++ float dotProduct = 0; +++ const float* aPtr = input; +++ const float* bPtr = taps; +++ +++ __m128 aVal1, bVal1, cVal1; +++ __m128 aVal2, bVal2, cVal2; +++ __m128 aVal3, bVal3, cVal3; +++ __m128 aVal4, bVal4, cVal4; +++ +++ __m128 dotProdVal = _mm_setzero_ps(); +++ +++ for (; number < sixteenthPoints; number++) { +++ +++ aVal1 = _mm_loadu_ps(aPtr); +++ aPtr += 4; +++ aVal2 = _mm_loadu_ps(aPtr); +++ aPtr += 4; +++ aVal3 = _mm_loadu_ps(aPtr); +++ aPtr += 4; +++ aVal4 = _mm_loadu_ps(aPtr); +++ aPtr += 4; +++ +++ bVal1 = _mm_loadu_ps(bPtr); +++ bPtr += 4; +++ bVal2 = _mm_loadu_ps(bPtr); +++ bPtr += 4; +++ bVal3 = _mm_loadu_ps(bPtr); +++ bPtr += 4; +++ bVal4 = _mm_loadu_ps(bPtr); +++ bPtr += 4; +++ +++ cVal1 = _mm_dp_ps(aVal1, bVal1, 0xF1); +++ cVal2 = _mm_dp_ps(aVal2, bVal2, 0xF2); +++ cVal3 = _mm_dp_ps(aVal3, bVal3, 0xF4); +++ cVal4 = _mm_dp_ps(aVal4, bVal4, 0xF8); +++ +++ cVal1 = _mm_or_ps(cVal1, cVal2); +++ cVal3 = _mm_or_ps(cVal3, cVal4); +++ cVal1 = _mm_or_ps(cVal1, cVal3); +++ +++ dotProdVal = _mm_add_ps(dotProdVal, cVal1); +++ } ++ ++- __VOLK_ATTR_ALIGNED(16) float dotProductVector[4]; ++- _mm_store_ps(dotProductVector, dotProdVal); // Store the results back into the dot product vector +++ __VOLK_ATTR_ALIGNED(16) float dotProductVector[4]; +++ _mm_store_ps(dotProductVector, +++ dotProdVal); // Store the results back into the dot product vector ++ ++- dotProduct = dotProductVector[0]; ++- dotProduct += dotProductVector[1]; ++- dotProduct += dotProductVector[2]; ++- dotProduct += dotProductVector[3]; +++ dotProduct = dotProductVector[0]; +++ dotProduct += dotProductVector[1]; +++ dotProduct += dotProductVector[2]; +++ dotProduct += dotProductVector[3]; ++ ++- number = sixteenthPoints * 16; ++- for(;number < num_points; number++){ ++- dotProduct += ((*aPtr++) * (*bPtr++)); ++- } +++ number = sixteenthPoints * 16; +++ for (; number < num_points; number++) { +++ dotProduct += ((*aPtr++) * (*bPtr++)); +++ } ++ ++- *result = dotProduct; +++ *result = dotProduct; ++ } ++ ++ #endif /*LV_HAVE_SSE4_1*/ ++@@ -300,147 +324,154 @@ static inline void volk_32f_x2_dot_prod_32f_u_sse4_1(float * result, const float ++ ++ #include ++ ++-static inline void volk_32f_x2_dot_prod_32f_u_avx( float* result, const float* input, const float* taps, unsigned int num_points) { +++static inline void volk_32f_x2_dot_prod_32f_u_avx(float* result, +++ const float* input, +++ const float* taps, +++ unsigned int num_points) +++{ ++ ++- unsigned int number = 0; ++- const unsigned int sixteenthPoints = num_points / 16; ++- ++- float dotProduct = 0; ++- const float* aPtr = input; ++- const float* bPtr = taps; +++ unsigned int number = 0; +++ const unsigned int sixteenthPoints = num_points / 16; ++ ++- __m256 a0Val, a1Val; ++- __m256 b0Val, b1Val; ++- __m256 c0Val, c1Val; +++ float dotProduct = 0; +++ const float* aPtr = input; +++ const float* bPtr = taps; ++ ++- __m256 dotProdVal0 = _mm256_setzero_ps(); ++- __m256 dotProdVal1 = _mm256_setzero_ps(); +++ __m256 a0Val, a1Val; +++ __m256 b0Val, 
b1Val; +++ __m256 c0Val, c1Val; ++ ++- for(;number < sixteenthPoints; number++){ +++ __m256 dotProdVal0 = _mm256_setzero_ps(); +++ __m256 dotProdVal1 = _mm256_setzero_ps(); ++ ++- a0Val = _mm256_loadu_ps(aPtr); ++- a1Val = _mm256_loadu_ps(aPtr+8); ++- b0Val = _mm256_loadu_ps(bPtr); ++- b1Val = _mm256_loadu_ps(bPtr+8); +++ for (; number < sixteenthPoints; number++) { ++ ++- c0Val = _mm256_mul_ps(a0Val, b0Val); ++- c1Val = _mm256_mul_ps(a1Val, b1Val); +++ a0Val = _mm256_loadu_ps(aPtr); +++ a1Val = _mm256_loadu_ps(aPtr + 8); +++ b0Val = _mm256_loadu_ps(bPtr); +++ b1Val = _mm256_loadu_ps(bPtr + 8); ++ ++- dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0); ++- dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1); +++ c0Val = _mm256_mul_ps(a0Val, b0Val); +++ c1Val = _mm256_mul_ps(a1Val, b1Val); ++ ++- aPtr += 16; ++- bPtr += 16; ++- } +++ dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0); +++ dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1); ++ ++- dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1); +++ aPtr += 16; +++ bPtr += 16; +++ } ++ ++- __VOLK_ATTR_ALIGNED(32) float dotProductVector[8]; +++ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1); ++ ++- _mm256_storeu_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector +++ __VOLK_ATTR_ALIGNED(32) float dotProductVector[8]; ++ ++- dotProduct = dotProductVector[0]; ++- dotProduct += dotProductVector[1]; ++- dotProduct += dotProductVector[2]; ++- dotProduct += dotProductVector[3]; ++- dotProduct += dotProductVector[4]; ++- dotProduct += dotProductVector[5]; ++- dotProduct += dotProductVector[6]; ++- dotProduct += dotProductVector[7]; +++ _mm256_storeu_ps(dotProductVector, +++ dotProdVal0); // Store the results back into the dot product vector ++ ++- number = sixteenthPoints*16; ++- for(;number < num_points; number++){ ++- dotProduct += ((*aPtr++) * (*bPtr++)); ++- } +++ dotProduct = dotProductVector[0]; +++ dotProduct += dotProductVector[1]; +++ dotProduct += dotProductVector[2]; +++ dotProduct += dotProductVector[3]; +++ dotProduct += dotProductVector[4]; +++ dotProduct += dotProductVector[5]; +++ dotProduct += dotProductVector[6]; +++ dotProduct += dotProductVector[7]; ++ ++- *result = dotProduct; +++ number = sixteenthPoints * 16; +++ for (; number < num_points; number++) { +++ dotProduct += ((*aPtr++) * (*bPtr++)); +++ } ++ +++ *result = dotProduct; ++ } ++ ++ #endif /*LV_HAVE_AVX*/ ++ ++ #if LV_HAVE_AVX2 && LV_HAVE_FMA ++ #include ++-static inline void volk_32f_x2_dot_prod_32f_u_avx2_fma(float * result, const float * input, const float* taps, unsigned int num_points){ ++- unsigned int number; ++- const unsigned int eighthPoints = num_points / 8; +++static inline void volk_32f_x2_dot_prod_32f_u_avx2_fma(float* result, +++ const float* input, +++ const float* taps, +++ unsigned int num_points) +++{ +++ unsigned int number; +++ const unsigned int eighthPoints = num_points / 8; ++ ++- const float* aPtr = input; ++- const float* bPtr = taps; ++- ++- __m256 dotProdVal = _mm256_setzero_ps(); ++- __m256 aVal1, bVal1; +++ const float* aPtr = input; +++ const float* bPtr = taps; ++ ++- for (number = 0; number < eighthPoints; number++ ) { +++ __m256 dotProdVal = _mm256_setzero_ps(); +++ __m256 aVal1, bVal1; ++ ++- aVal1 = _mm256_loadu_ps(aPtr); ++- bVal1 = _mm256_loadu_ps(bPtr); ++- aPtr += 8; ++- bPtr += 8; +++ for (number = 0; number < eighthPoints; number++) { ++ ++- dotProdVal = _mm256_fmadd_ps(aVal1, bVal1, dotProdVal); ++- } +++ aVal1 = _mm256_loadu_ps(aPtr); +++ bVal1 = _mm256_loadu_ps(bPtr); +++ aPtr += 8; +++ 
bPtr += 8; ++ ++- __VOLK_ATTR_ALIGNED(32) float dotProductVector[8]; ++- _mm256_storeu_ps(dotProductVector, dotProdVal); // Store the results back into the dot product vector ++- _mm256_zeroupper(); +++ dotProdVal = _mm256_fmadd_ps(aVal1, bVal1, dotProdVal); +++ } ++ ++- float dotProduct = ++- dotProductVector[0] + dotProductVector[1] + ++- dotProductVector[2] + dotProductVector[3] + ++- dotProductVector[4] + dotProductVector[5] + ++- dotProductVector[6] + dotProductVector[7]; +++ __VOLK_ATTR_ALIGNED(32) float dotProductVector[8]; +++ _mm256_storeu_ps(dotProductVector, +++ dotProdVal); // Store the results back into the dot product vector +++ _mm256_zeroupper(); ++ ++- for(number = eighthPoints * 8; number < num_points; number++){ ++- dotProduct += ((*aPtr++) * (*bPtr++)); ++- } +++ float dotProduct = dotProductVector[0] + dotProductVector[1] + dotProductVector[2] + +++ dotProductVector[3] + dotProductVector[4] + dotProductVector[5] + +++ dotProductVector[6] + dotProductVector[7]; ++ ++- *result = dotProduct; +++ for (number = eighthPoints * 8; number < num_points; number++) { +++ dotProduct += ((*aPtr++) * (*bPtr++)); +++ } ++ +++ *result = dotProduct; ++ } ++ #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA */ ++ ++ #if LV_HAVE_AVX512F ++ #include ++-static inline void volk_32f_x2_dot_prod_32f_u_avx512f(float * result, const float * input, const float* taps, unsigned int num_points){ ++- unsigned int number; ++- const unsigned int sixteenthPoints = num_points / 16; ++- ++- const float* aPtr = input; ++- const float* bPtr = taps; +++static inline void volk_32f_x2_dot_prod_32f_u_avx512f(float* result, +++ const float* input, +++ const float* taps, +++ unsigned int num_points) +++{ +++ unsigned int number; +++ const unsigned int sixteenthPoints = num_points / 16; ++ ++- __m512 dotProdVal = _mm512_setzero_ps(); ++- __m512 aVal1, bVal1; +++ const float* aPtr = input; +++ const float* bPtr = taps; ++ ++- for (number = 0; number < sixteenthPoints; number++ ) { +++ __m512 dotProdVal = _mm512_setzero_ps(); +++ __m512 aVal1, bVal1; ++ ++- aVal1 = _mm512_loadu_ps(aPtr); ++- bVal1 = _mm512_loadu_ps(bPtr); ++- aPtr += 16; ++- bPtr += 16; +++ for (number = 0; number < sixteenthPoints; number++) { ++ ++- dotProdVal = _mm512_fmadd_ps(aVal1, bVal1, dotProdVal); ++- } +++ aVal1 = _mm512_loadu_ps(aPtr); +++ bVal1 = _mm512_loadu_ps(bPtr); +++ aPtr += 16; +++ bPtr += 16; ++ ++- __VOLK_ATTR_ALIGNED(64) float dotProductVector[16]; ++- _mm512_storeu_ps(dotProductVector, dotProdVal); // Store the results back into the dot product vector +++ dotProdVal = _mm512_fmadd_ps(aVal1, bVal1, dotProdVal); +++ } ++ ++- float dotProduct = ++- dotProductVector[0] + dotProductVector[1] + ++- dotProductVector[2] + dotProductVector[3] + ++- dotProductVector[4] + dotProductVector[5] + ++- dotProductVector[6] + dotProductVector[7] + ++- dotProductVector[8] + dotProductVector[9] + ++- dotProductVector[10] + dotProductVector[11] + ++- dotProductVector[12] + dotProductVector[13] + ++- dotProductVector[14] + dotProductVector[15]; +++ __VOLK_ATTR_ALIGNED(64) float dotProductVector[16]; +++ _mm512_storeu_ps(dotProductVector, +++ dotProdVal); // Store the results back into the dot product vector ++ ++- for(number = sixteenthPoints * 16; number < num_points; number++){ ++- dotProduct += ((*aPtr++) * (*bPtr++)); ++- } +++ float dotProduct = dotProductVector[0] + dotProductVector[1] + dotProductVector[2] + +++ dotProductVector[3] + dotProductVector[4] + dotProductVector[5] + +++ dotProductVector[6] + dotProductVector[7] + dotProductVector[8] + +++ 
dotProductVector[9] + dotProductVector[10] + dotProductVector[11] + +++ dotProductVector[12] + dotProductVector[13] + +++ dotProductVector[14] + dotProductVector[15]; ++ ++- *result = dotProduct; +++ for (number = sixteenthPoints * 16; number < num_points; number++) { +++ dotProduct += ((*aPtr++) * (*bPtr++)); +++ } ++ +++ *result = dotProduct; ++ } ++ #endif /* LV_HAVE_AVX512F */ ++ ++@@ -449,25 +480,29 @@ static inline void volk_32f_x2_dot_prod_32f_u_avx512f(float * result, const floa ++ #ifndef INCLUDED_volk_32f_x2_dot_prod_32f_a_H ++ #define INCLUDED_volk_32f_x2_dot_prod_32f_a_H ++ +++#include ++ #include ++-#include ++ ++ ++ #ifdef LV_HAVE_GENERIC ++ ++ ++-static inline void volk_32f_x2_dot_prod_32f_a_generic(float * result, const float * input, const float * taps, unsigned int num_points) { +++static inline void volk_32f_x2_dot_prod_32f_a_generic(float* result, +++ const float* input, +++ const float* taps, +++ unsigned int num_points) +++{ ++ ++- float dotProduct = 0; ++- const float* aPtr = input; ++- const float* bPtr= taps; ++- unsigned int number = 0; +++ float dotProduct = 0; +++ const float* aPtr = input; +++ const float* bPtr = taps; +++ unsigned int number = 0; ++ ++- for(number = 0; number < num_points; number++){ ++- dotProduct += ((*aPtr++) * (*bPtr++)); ++- } +++ for (number = 0; number < num_points; number++) { +++ dotProduct += ((*aPtr++) * (*bPtr++)); +++ } ++ ++- *result = dotProduct; +++ *result = dotProduct; ++ } ++ ++ #endif /*LV_HAVE_GENERIC*/ ++@@ -476,69 +511,73 @@ static inline void volk_32f_x2_dot_prod_32f_a_generic(float * result, const floa ++ #ifdef LV_HAVE_SSE ++ ++ ++-static inline void volk_32f_x2_dot_prod_32f_a_sse( float* result, const float* input, const float* taps, unsigned int num_points) { ++- ++- unsigned int number = 0; ++- const unsigned int sixteenthPoints = num_points / 16; ++- ++- float dotProduct = 0; ++- const float* aPtr = input; ++- const float* bPtr = taps; ++- ++- __m128 a0Val, a1Val, a2Val, a3Val; ++- __m128 b0Val, b1Val, b2Val, b3Val; ++- __m128 c0Val, c1Val, c2Val, c3Val; ++- ++- __m128 dotProdVal0 = _mm_setzero_ps(); ++- __m128 dotProdVal1 = _mm_setzero_ps(); ++- __m128 dotProdVal2 = _mm_setzero_ps(); ++- __m128 dotProdVal3 = _mm_setzero_ps(); +++static inline void volk_32f_x2_dot_prod_32f_a_sse(float* result, +++ const float* input, +++ const float* taps, +++ unsigned int num_points) +++{ ++ ++- for(;number < sixteenthPoints; number++){ ++- ++- a0Val = _mm_load_ps(aPtr); ++- a1Val = _mm_load_ps(aPtr+4); ++- a2Val = _mm_load_ps(aPtr+8); ++- a3Val = _mm_load_ps(aPtr+12); ++- b0Val = _mm_load_ps(bPtr); ++- b1Val = _mm_load_ps(bPtr+4); ++- b2Val = _mm_load_ps(bPtr+8); ++- b3Val = _mm_load_ps(bPtr+12); ++- ++- c0Val = _mm_mul_ps(a0Val, b0Val); ++- c1Val = _mm_mul_ps(a1Val, b1Val); ++- c2Val = _mm_mul_ps(a2Val, b2Val); ++- c3Val = _mm_mul_ps(a3Val, b3Val); ++- ++- dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0); ++- dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1); ++- dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2); ++- dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3); +++ unsigned int number = 0; +++ const unsigned int sixteenthPoints = num_points / 16; ++ ++- aPtr += 16; ++- bPtr += 16; ++- } +++ float dotProduct = 0; +++ const float* aPtr = input; +++ const float* bPtr = taps; +++ +++ __m128 a0Val, a1Val, a2Val, a3Val; +++ __m128 b0Val, b1Val, b2Val, b3Val; +++ __m128 c0Val, c1Val, c2Val, c3Val; +++ +++ __m128 dotProdVal0 = _mm_setzero_ps(); +++ __m128 dotProdVal1 = _mm_setzero_ps(); +++ __m128 dotProdVal2 = _mm_setzero_ps(); +++ __m128 
dotProdVal3 = _mm_setzero_ps(); +++ +++ for (; number < sixteenthPoints; number++) { +++ +++ a0Val = _mm_load_ps(aPtr); +++ a1Val = _mm_load_ps(aPtr + 4); +++ a2Val = _mm_load_ps(aPtr + 8); +++ a3Val = _mm_load_ps(aPtr + 12); +++ b0Val = _mm_load_ps(bPtr); +++ b1Val = _mm_load_ps(bPtr + 4); +++ b2Val = _mm_load_ps(bPtr + 8); +++ b3Val = _mm_load_ps(bPtr + 12); +++ +++ c0Val = _mm_mul_ps(a0Val, b0Val); +++ c1Val = _mm_mul_ps(a1Val, b1Val); +++ c2Val = _mm_mul_ps(a2Val, b2Val); +++ c3Val = _mm_mul_ps(a3Val, b3Val); +++ +++ dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0); +++ dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1); +++ dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2); +++ dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3); ++ ++- dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1); ++- dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2); ++- dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3); +++ aPtr += 16; +++ bPtr += 16; +++ } ++ ++- __VOLK_ATTR_ALIGNED(16) float dotProductVector[4]; +++ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1); +++ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2); +++ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3); ++ ++- _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector +++ __VOLK_ATTR_ALIGNED(16) float dotProductVector[4]; ++ ++- dotProduct = dotProductVector[0]; ++- dotProduct += dotProductVector[1]; ++- dotProduct += dotProductVector[2]; ++- dotProduct += dotProductVector[3]; +++ _mm_store_ps(dotProductVector, +++ dotProdVal0); // Store the results back into the dot product vector ++ ++- number = sixteenthPoints*16; ++- for(;number < num_points; number++){ ++- dotProduct += ((*aPtr++) * (*bPtr++)); ++- } +++ dotProduct = dotProductVector[0]; +++ dotProduct += dotProductVector[1]; +++ dotProduct += dotProductVector[2]; +++ dotProduct += dotProductVector[3]; ++ ++- *result = dotProduct; +++ number = sixteenthPoints * 16; +++ for (; number < num_points; number++) { +++ dotProduct += ((*aPtr++) * (*bPtr++)); +++ } ++ +++ *result = dotProduct; ++ } ++ ++ #endif /*LV_HAVE_SSE*/ ++@@ -547,127 +586,145 @@ static inline void volk_32f_x2_dot_prod_32f_a_sse( float* result, const float* ++ ++ #include ++ ++-static inline void volk_32f_x2_dot_prod_32f_a_sse3(float * result, const float * input, const float * taps, unsigned int num_points) { ++- unsigned int number = 0; ++- const unsigned int sixteenthPoints = num_points / 16; ++- ++- float dotProduct = 0; ++- const float* aPtr = input; ++- const float* bPtr = taps; ++- ++- __m128 a0Val, a1Val, a2Val, a3Val; ++- __m128 b0Val, b1Val, b2Val, b3Val; ++- __m128 c0Val, c1Val, c2Val, c3Val; ++- ++- __m128 dotProdVal0 = _mm_setzero_ps(); ++- __m128 dotProdVal1 = _mm_setzero_ps(); ++- __m128 dotProdVal2 = _mm_setzero_ps(); ++- __m128 dotProdVal3 = _mm_setzero_ps(); ++- ++- for(;number < sixteenthPoints; number++){ ++- ++- a0Val = _mm_load_ps(aPtr); ++- a1Val = _mm_load_ps(aPtr+4); ++- a2Val = _mm_load_ps(aPtr+8); ++- a3Val = _mm_load_ps(aPtr+12); ++- b0Val = _mm_load_ps(bPtr); ++- b1Val = _mm_load_ps(bPtr+4); ++- b2Val = _mm_load_ps(bPtr+8); ++- b3Val = _mm_load_ps(bPtr+12); ++- ++- c0Val = _mm_mul_ps(a0Val, b0Val); ++- c1Val = _mm_mul_ps(a1Val, b1Val); ++- c2Val = _mm_mul_ps(a2Val, b2Val); ++- c3Val = _mm_mul_ps(a3Val, b3Val); ++- ++- dotProdVal0 = _mm_add_ps(dotProdVal0, c0Val); ++- dotProdVal1 = _mm_add_ps(dotProdVal1, c1Val); ++- dotProdVal2 = _mm_add_ps(dotProdVal2, c2Val); ++- dotProdVal3 = _mm_add_ps(dotProdVal3, c3Val); ++- ++- aPtr += 16; ++- bPtr += 16; ++- } ++- ++- 
dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1); ++- dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2); ++- dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3); ++- ++- __VOLK_ATTR_ALIGNED(16) float dotProductVector[4]; ++- _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector ++- ++- dotProduct = dotProductVector[0]; ++- dotProduct += dotProductVector[1]; ++- dotProduct += dotProductVector[2]; ++- dotProduct += dotProductVector[3]; ++- ++- number = sixteenthPoints*16; ++- for(;number < num_points; number++){ ++- dotProduct += ((*aPtr++) * (*bPtr++)); ++- } ++- ++- *result = dotProduct; ++-} ++- ++-#endif /*LV_HAVE_SSE3*/ +++static inline void volk_32f_x2_dot_prod_32f_a_sse3(float* result, +++ const float* input, +++ const float* taps, +++ unsigned int num_points) +++{ +++ unsigned int number = 0; +++ const unsigned int sixteenthPoints = num_points / 16; ++ ++-#ifdef LV_HAVE_SSE4_1 +++ float dotProduct = 0; +++ const float* aPtr = input; +++ const float* bPtr = taps; +++ +++ __m128 a0Val, a1Val, a2Val, a3Val; +++ __m128 b0Val, b1Val, b2Val, b3Val; +++ __m128 c0Val, c1Val, c2Val, c3Val; +++ +++ __m128 dotProdVal0 = _mm_setzero_ps(); +++ __m128 dotProdVal1 = _mm_setzero_ps(); +++ __m128 dotProdVal2 = _mm_setzero_ps(); +++ __m128 dotProdVal3 = _mm_setzero_ps(); +++ +++ for (; number < sixteenthPoints; number++) { +++ +++ a0Val = _mm_load_ps(aPtr); +++ a1Val = _mm_load_ps(aPtr + 4); +++ a2Val = _mm_load_ps(aPtr + 8); +++ a3Val = _mm_load_ps(aPtr + 12); +++ b0Val = _mm_load_ps(bPtr); +++ b1Val = _mm_load_ps(bPtr + 4); +++ b2Val = _mm_load_ps(bPtr + 8); +++ b3Val = _mm_load_ps(bPtr + 12); +++ +++ c0Val = _mm_mul_ps(a0Val, b0Val); +++ c1Val = _mm_mul_ps(a1Val, b1Val); +++ c2Val = _mm_mul_ps(a2Val, b2Val); +++ c3Val = _mm_mul_ps(a3Val, b3Val); +++ +++ dotProdVal0 = _mm_add_ps(dotProdVal0, c0Val); +++ dotProdVal1 = _mm_add_ps(dotProdVal1, c1Val); +++ dotProdVal2 = _mm_add_ps(dotProdVal2, c2Val); +++ dotProdVal3 = _mm_add_ps(dotProdVal3, c3Val); ++ ++-#include +++ aPtr += 16; +++ bPtr += 16; +++ } ++ ++-static inline void volk_32f_x2_dot_prod_32f_a_sse4_1(float * result, const float * input, const float* taps, unsigned int num_points) { ++- unsigned int number = 0; ++- const unsigned int sixteenthPoints = num_points / 16; +++ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1); +++ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2); +++ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3); ++ ++- float dotProduct = 0; ++- const float* aPtr = input; ++- const float* bPtr = taps; +++ __VOLK_ATTR_ALIGNED(16) float dotProductVector[4]; +++ _mm_store_ps(dotProductVector, +++ dotProdVal0); // Store the results back into the dot product vector ++ ++- __m128 aVal1, bVal1, cVal1; ++- __m128 aVal2, bVal2, cVal2; ++- __m128 aVal3, bVal3, cVal3; ++- __m128 aVal4, bVal4, cVal4; +++ dotProduct = dotProductVector[0]; +++ dotProduct += dotProductVector[1]; +++ dotProduct += dotProductVector[2]; +++ dotProduct += dotProductVector[3]; ++ ++- __m128 dotProdVal = _mm_setzero_ps(); +++ number = sixteenthPoints * 16; +++ for (; number < num_points; number++) { +++ dotProduct += ((*aPtr++) * (*bPtr++)); +++ } ++ ++- for(;number < sixteenthPoints; number++){ +++ *result = dotProduct; +++} ++ ++- aVal1 = _mm_load_ps(aPtr); aPtr += 4; ++- aVal2 = _mm_load_ps(aPtr); aPtr += 4; ++- aVal3 = _mm_load_ps(aPtr); aPtr += 4; ++- aVal4 = _mm_load_ps(aPtr); aPtr += 4; +++#endif /*LV_HAVE_SSE3*/ ++ ++- bVal1 = _mm_load_ps(bPtr); bPtr += 4; ++- bVal2 = _mm_load_ps(bPtr); bPtr += 4; ++- bVal3 = 
_mm_load_ps(bPtr); bPtr += 4; ++- bVal4 = _mm_load_ps(bPtr); bPtr += 4; +++#ifdef LV_HAVE_SSE4_1 ++ ++- cVal1 = _mm_dp_ps(aVal1, bVal1, 0xF1); ++- cVal2 = _mm_dp_ps(aVal2, bVal2, 0xF2); ++- cVal3 = _mm_dp_ps(aVal3, bVal3, 0xF4); ++- cVal4 = _mm_dp_ps(aVal4, bVal4, 0xF8); +++#include ++ ++- cVal1 = _mm_or_ps(cVal1, cVal2); ++- cVal3 = _mm_or_ps(cVal3, cVal4); ++- cVal1 = _mm_or_ps(cVal1, cVal3); +++static inline void volk_32f_x2_dot_prod_32f_a_sse4_1(float* result, +++ const float* input, +++ const float* taps, +++ unsigned int num_points) +++{ +++ unsigned int number = 0; +++ const unsigned int sixteenthPoints = num_points / 16; ++ ++- dotProdVal = _mm_add_ps(dotProdVal, cVal1); ++- } +++ float dotProduct = 0; +++ const float* aPtr = input; +++ const float* bPtr = taps; +++ +++ __m128 aVal1, bVal1, cVal1; +++ __m128 aVal2, bVal2, cVal2; +++ __m128 aVal3, bVal3, cVal3; +++ __m128 aVal4, bVal4, cVal4; +++ +++ __m128 dotProdVal = _mm_setzero_ps(); +++ +++ for (; number < sixteenthPoints; number++) { +++ +++ aVal1 = _mm_load_ps(aPtr); +++ aPtr += 4; +++ aVal2 = _mm_load_ps(aPtr); +++ aPtr += 4; +++ aVal3 = _mm_load_ps(aPtr); +++ aPtr += 4; +++ aVal4 = _mm_load_ps(aPtr); +++ aPtr += 4; +++ +++ bVal1 = _mm_load_ps(bPtr); +++ bPtr += 4; +++ bVal2 = _mm_load_ps(bPtr); +++ bPtr += 4; +++ bVal3 = _mm_load_ps(bPtr); +++ bPtr += 4; +++ bVal4 = _mm_load_ps(bPtr); +++ bPtr += 4; +++ +++ cVal1 = _mm_dp_ps(aVal1, bVal1, 0xF1); +++ cVal2 = _mm_dp_ps(aVal2, bVal2, 0xF2); +++ cVal3 = _mm_dp_ps(aVal3, bVal3, 0xF4); +++ cVal4 = _mm_dp_ps(aVal4, bVal4, 0xF8); +++ +++ cVal1 = _mm_or_ps(cVal1, cVal2); +++ cVal3 = _mm_or_ps(cVal3, cVal4); +++ cVal1 = _mm_or_ps(cVal1, cVal3); +++ +++ dotProdVal = _mm_add_ps(dotProdVal, cVal1); +++ } ++ ++- __VOLK_ATTR_ALIGNED(16) float dotProductVector[4]; ++- _mm_store_ps(dotProductVector, dotProdVal); // Store the results back into the dot product vector +++ __VOLK_ATTR_ALIGNED(16) float dotProductVector[4]; +++ _mm_store_ps(dotProductVector, +++ dotProdVal); // Store the results back into the dot product vector ++ ++- dotProduct = dotProductVector[0]; ++- dotProduct += dotProductVector[1]; ++- dotProduct += dotProductVector[2]; ++- dotProduct += dotProductVector[3]; +++ dotProduct = dotProductVector[0]; +++ dotProduct += dotProductVector[1]; +++ dotProduct += dotProductVector[2]; +++ dotProduct += dotProductVector[3]; ++ ++- number = sixteenthPoints * 16; ++- for(;number < num_points; number++){ ++- dotProduct += ((*aPtr++) * (*bPtr++)); ++- } +++ number = sixteenthPoints * 16; +++ for (; number < num_points; number++) { +++ dotProduct += ((*aPtr++) * (*bPtr++)); +++ } ++ ++- *result = dotProduct; +++ *result = dotProduct; ++ } ++ ++ #endif /*LV_HAVE_SSE4_1*/ ++@@ -676,159 +733,170 @@ static inline void volk_32f_x2_dot_prod_32f_a_sse4_1(float * result, const float ++ ++ #include ++ ++-static inline void volk_32f_x2_dot_prod_32f_a_avx( float* result, const float* input, const float* taps, unsigned int num_points) { +++static inline void volk_32f_x2_dot_prod_32f_a_avx(float* result, +++ const float* input, +++ const float* taps, +++ unsigned int num_points) +++{ ++ ++- unsigned int number = 0; ++- const unsigned int sixteenthPoints = num_points / 16; ++- ++- float dotProduct = 0; ++- const float* aPtr = input; ++- const float* bPtr = taps; +++ unsigned int number = 0; +++ const unsigned int sixteenthPoints = num_points / 16; ++ ++- __m256 a0Val, a1Val; ++- __m256 b0Val, b1Val; ++- __m256 c0Val, c1Val; +++ float dotProduct = 0; +++ const float* aPtr = input; +++ const float* bPtr = 
taps; ++ ++- __m256 dotProdVal0 = _mm256_setzero_ps(); ++- __m256 dotProdVal1 = _mm256_setzero_ps(); +++ __m256 a0Val, a1Val; +++ __m256 b0Val, b1Val; +++ __m256 c0Val, c1Val; ++ ++- for(;number < sixteenthPoints; number++){ +++ __m256 dotProdVal0 = _mm256_setzero_ps(); +++ __m256 dotProdVal1 = _mm256_setzero_ps(); ++ ++- a0Val = _mm256_load_ps(aPtr); ++- a1Val = _mm256_load_ps(aPtr+8); ++- b0Val = _mm256_load_ps(bPtr); ++- b1Val = _mm256_load_ps(bPtr+8); +++ for (; number < sixteenthPoints; number++) { ++ ++- c0Val = _mm256_mul_ps(a0Val, b0Val); ++- c1Val = _mm256_mul_ps(a1Val, b1Val); +++ a0Val = _mm256_load_ps(aPtr); +++ a1Val = _mm256_load_ps(aPtr + 8); +++ b0Val = _mm256_load_ps(bPtr); +++ b1Val = _mm256_load_ps(bPtr + 8); ++ ++- dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0); ++- dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1); +++ c0Val = _mm256_mul_ps(a0Val, b0Val); +++ c1Val = _mm256_mul_ps(a1Val, b1Val); ++ ++- aPtr += 16; ++- bPtr += 16; ++- } +++ dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0); +++ dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1); ++ ++- dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1); +++ aPtr += 16; +++ bPtr += 16; +++ } ++ ++- __VOLK_ATTR_ALIGNED(32) float dotProductVector[8]; +++ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1); ++ ++- _mm256_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector +++ __VOLK_ATTR_ALIGNED(32) float dotProductVector[8]; ++ ++- dotProduct = dotProductVector[0]; ++- dotProduct += dotProductVector[1]; ++- dotProduct += dotProductVector[2]; ++- dotProduct += dotProductVector[3]; ++- dotProduct += dotProductVector[4]; ++- dotProduct += dotProductVector[5]; ++- dotProduct += dotProductVector[6]; ++- dotProduct += dotProductVector[7]; +++ _mm256_store_ps(dotProductVector, +++ dotProdVal0); // Store the results back into the dot product vector ++ ++- number = sixteenthPoints*16; ++- for(;number < num_points; number++){ ++- dotProduct += ((*aPtr++) * (*bPtr++)); ++- } +++ dotProduct = dotProductVector[0]; +++ dotProduct += dotProductVector[1]; +++ dotProduct += dotProductVector[2]; +++ dotProduct += dotProductVector[3]; +++ dotProduct += dotProductVector[4]; +++ dotProduct += dotProductVector[5]; +++ dotProduct += dotProductVector[6]; +++ dotProduct += dotProductVector[7]; ++ ++- *result = dotProduct; +++ number = sixteenthPoints * 16; +++ for (; number < num_points; number++) { +++ dotProduct += ((*aPtr++) * (*bPtr++)); +++ } ++ +++ *result = dotProduct; ++ } ++ #endif /*LV_HAVE_AVX*/ ++ ++ ++ #if LV_HAVE_AVX2 && LV_HAVE_FMA ++ #include ++-static inline void volk_32f_x2_dot_prod_32f_a_avx2_fma(float * result, const float * input, const float* taps, unsigned int num_points){ ++- unsigned int number; ++- const unsigned int eighthPoints = num_points / 8; ++- ++- const float* aPtr = input; ++- const float* bPtr = taps; +++static inline void volk_32f_x2_dot_prod_32f_a_avx2_fma(float* result, +++ const float* input, +++ const float* taps, +++ unsigned int num_points) +++{ +++ unsigned int number; +++ const unsigned int eighthPoints = num_points / 8; ++ ++- __m256 dotProdVal = _mm256_setzero_ps(); ++- __m256 aVal1, bVal1; +++ const float* aPtr = input; +++ const float* bPtr = taps; ++ ++- for (number = 0; number < eighthPoints; number++ ) { +++ __m256 dotProdVal = _mm256_setzero_ps(); +++ __m256 aVal1, bVal1; ++ ++- aVal1 = _mm256_load_ps(aPtr); ++- bVal1 = _mm256_load_ps(bPtr); ++- aPtr += 8; ++- bPtr += 8; +++ for (number = 0; number < eighthPoints; number++) { ++ ++- dotProdVal = 
_mm256_fmadd_ps(aVal1, bVal1, dotProdVal); ++- } +++ aVal1 = _mm256_load_ps(aPtr); +++ bVal1 = _mm256_load_ps(bPtr); +++ aPtr += 8; +++ bPtr += 8; ++ ++- __VOLK_ATTR_ALIGNED(32) float dotProductVector[8]; ++- _mm256_store_ps(dotProductVector, dotProdVal); // Store the results back into the dot product vector ++- _mm256_zeroupper(); +++ dotProdVal = _mm256_fmadd_ps(aVal1, bVal1, dotProdVal); +++ } ++ ++- float dotProduct = ++- dotProductVector[0] + dotProductVector[1] + ++- dotProductVector[2] + dotProductVector[3] + ++- dotProductVector[4] + dotProductVector[5] + ++- dotProductVector[6] + dotProductVector[7]; +++ __VOLK_ATTR_ALIGNED(32) float dotProductVector[8]; +++ _mm256_store_ps(dotProductVector, +++ dotProdVal); // Store the results back into the dot product vector +++ _mm256_zeroupper(); ++ ++- for(number = eighthPoints * 8; number < num_points; number++){ ++- dotProduct += ((*aPtr++) * (*bPtr++)); ++- } +++ float dotProduct = dotProductVector[0] + dotProductVector[1] + dotProductVector[2] + +++ dotProductVector[3] + dotProductVector[4] + dotProductVector[5] + +++ dotProductVector[6] + dotProductVector[7]; ++ ++- *result = dotProduct; +++ for (number = eighthPoints * 8; number < num_points; number++) { +++ dotProduct += ((*aPtr++) * (*bPtr++)); +++ } ++ +++ *result = dotProduct; ++ } ++ #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA */ ++ ++ #if LV_HAVE_AVX512F ++ #include ++-static inline void volk_32f_x2_dot_prod_32f_a_avx512f(float * result, const float * input, const float* taps, unsigned int num_points){ ++- unsigned int number; ++- const unsigned int sixteenthPoints = num_points / 16; ++- ++- const float* aPtr = input; ++- const float* bPtr = taps; +++static inline void volk_32f_x2_dot_prod_32f_a_avx512f(float* result, +++ const float* input, +++ const float* taps, +++ unsigned int num_points) +++{ +++ unsigned int number; +++ const unsigned int sixteenthPoints = num_points / 16; ++ ++- __m512 dotProdVal = _mm512_setzero_ps(); ++- __m512 aVal1, bVal1; +++ const float* aPtr = input; +++ const float* bPtr = taps; ++ ++- for (number = 0; number < sixteenthPoints; number++ ) { +++ __m512 dotProdVal = _mm512_setzero_ps(); +++ __m512 aVal1, bVal1; ++ ++- aVal1 = _mm512_load_ps(aPtr); ++- bVal1 = _mm512_load_ps(bPtr); ++- aPtr += 16; ++- bPtr += 16; +++ for (number = 0; number < sixteenthPoints; number++) { ++ ++- dotProdVal = _mm512_fmadd_ps(aVal1, bVal1, dotProdVal); ++- } +++ aVal1 = _mm512_load_ps(aPtr); +++ bVal1 = _mm512_load_ps(bPtr); +++ aPtr += 16; +++ bPtr += 16; ++ ++- __VOLK_ATTR_ALIGNED(64) float dotProductVector[16]; ++- _mm512_store_ps(dotProductVector, dotProdVal); // Store the results back into the dot product vector +++ dotProdVal = _mm512_fmadd_ps(aVal1, bVal1, dotProdVal); +++ } ++ ++- float dotProduct = ++- dotProductVector[0] + dotProductVector[1] + ++- dotProductVector[2] + dotProductVector[3] + ++- dotProductVector[4] + dotProductVector[5] + ++- dotProductVector[6] + dotProductVector[7] + ++- dotProductVector[8] + dotProductVector[9] + ++- dotProductVector[10] + dotProductVector[11] + ++- dotProductVector[12] + dotProductVector[13] + ++- dotProductVector[14] + dotProductVector[15]; +++ __VOLK_ATTR_ALIGNED(64) float dotProductVector[16]; +++ _mm512_store_ps(dotProductVector, +++ dotProdVal); // Store the results back into the dot product vector ++ ++- for(number = sixteenthPoints * 16; number < num_points; number++){ ++- dotProduct += ((*aPtr++) * (*bPtr++)); ++- } +++ float dotProduct = dotProductVector[0] + dotProductVector[1] + dotProductVector[2] + +++ 
dotProductVector[3] + dotProductVector[4] + dotProductVector[5] + +++ dotProductVector[6] + dotProductVector[7] + dotProductVector[8] + +++ dotProductVector[9] + dotProductVector[10] + dotProductVector[11] + +++ dotProductVector[12] + dotProductVector[13] + +++ dotProductVector[14] + dotProductVector[15]; ++ ++- *result = dotProduct; +++ for (number = sixteenthPoints * 16; number < num_points; number++) { +++ dotProduct += ((*aPtr++) * (*bPtr++)); +++ } ++ +++ *result = dotProduct; ++ } ++ #endif /* LV_HAVE_AVX512F */ ++ ++ #ifdef LV_HAVE_NEON ++ #include ++ ++-static inline void volk_32f_x2_dot_prod_32f_neonopts(float * result, const float * input, const float * taps, unsigned int num_points) { +++static inline void volk_32f_x2_dot_prod_32f_neonopts(float* result, +++ const float* input, +++ const float* taps, +++ unsigned int num_points) +++{ ++ ++ unsigned int quarter_points = num_points / 16; ++ float dotProduct = 0; ++ const float* aPtr = input; ++- const float* bPtr= taps; +++ const float* bPtr = taps; ++ unsigned int number = 0; ++ ++ float32x4x4_t a_val, b_val, accumulator0; ++@@ -838,7 +906,7 @@ static inline void volk_32f_x2_dot_prod_32f_neonopts(float * result, const float ++ accumulator0.val[3] = vdupq_n_f32(0); ++ // factor of 4 loop unroll with independent accumulators ++ // uses 12 out of 16 neon q registers ++- for( number = 0; number < quarter_points; ++number) { +++ for (number = 0; number < quarter_points; ++number) { ++ a_val = vld4q_f32(aPtr); ++ b_val = vld4q_f32(bPtr); ++ accumulator0.val[0] = vmlaq_f32(accumulator0.val[0], a_val.val[0], b_val.val[0]); ++@@ -855,8 +923,8 @@ static inline void volk_32f_x2_dot_prod_32f_neonopts(float * result, const float ++ vst1q_f32(accumulator, accumulator0.val[0]); ++ dotProduct = accumulator[0] + accumulator[1] + accumulator[2] + accumulator[3]; ++ ++- for(number = quarter_points*16; number < num_points; number++){ ++- dotProduct += ((*aPtr++) * (*bPtr++)); +++ for (number = quarter_points * 16; number < num_points; number++) { +++ dotProduct += ((*aPtr++) * (*bPtr++)); ++ } ++ ++ *result = dotProduct; ++@@ -865,26 +933,30 @@ static inline void volk_32f_x2_dot_prod_32f_neonopts(float * result, const float ++ #endif ++ ++ ++- ++- ++ #ifdef LV_HAVE_NEON ++-static inline void volk_32f_x2_dot_prod_32f_neon(float * result, const float * input, const float * taps, unsigned int num_points) { +++static inline void volk_32f_x2_dot_prod_32f_neon(float* result, +++ const float* input, +++ const float* taps, +++ unsigned int num_points) +++{ ++ ++ unsigned int quarter_points = num_points / 8; ++ float dotProduct = 0; ++ const float* aPtr = input; ++- const float* bPtr= taps; +++ const float* bPtr = taps; ++ unsigned int number = 0; ++ ++ float32x4x2_t a_val, b_val, accumulator_val; ++ accumulator_val.val[0] = vdupq_n_f32(0); ++ accumulator_val.val[1] = vdupq_n_f32(0); ++ // factor of 2 loop unroll with independent accumulators ++- for( number = 0; number < quarter_points; ++number) { +++ for (number = 0; number < quarter_points; ++number) { ++ a_val = vld2q_f32(aPtr); ++ b_val = vld2q_f32(bPtr); ++- accumulator_val.val[0] = vmlaq_f32(accumulator_val.val[0], a_val.val[0], b_val.val[0]); ++- accumulator_val.val[1] = vmlaq_f32(accumulator_val.val[1], a_val.val[1], b_val.val[1]); +++ accumulator_val.val[0] = +++ vmlaq_f32(accumulator_val.val[0], a_val.val[0], b_val.val[0]); +++ accumulator_val.val[1] = +++ vmlaq_f32(accumulator_val.val[1], a_val.val[1], b_val.val[1]); ++ aPtr += 8; ++ bPtr += 8; ++ } ++@@ -893,8 +965,8 @@ static inline void 
volk_32f_x2_dot_prod_32f_neon(float * result, const float * i ++ vst1q_f32(accumulator, accumulator_val.val[0]); ++ dotProduct = accumulator[0] + accumulator[1] + accumulator[2] + accumulator[3]; ++ ++- for(number = quarter_points*8; number < num_points; number++){ ++- dotProduct += ((*aPtr++) * (*bPtr++)); +++ for (number = quarter_points * 8; number < num_points; number++) { +++ dotProduct += ((*aPtr++) * (*bPtr++)); ++ } ++ ++ *result = dotProduct; ++@@ -903,11 +975,17 @@ static inline void volk_32f_x2_dot_prod_32f_neon(float * result, const float * i ++ #endif /* LV_HAVE_NEON */ ++ ++ #ifdef LV_HAVE_NEONV7 ++-extern void volk_32f_x2_dot_prod_32f_a_neonasm(float* cVector, const float* aVector, const float* bVector, unsigned int num_points); +++extern void volk_32f_x2_dot_prod_32f_a_neonasm(float* cVector, +++ const float* aVector, +++ const float* bVector, +++ unsigned int num_points); ++ #endif /* LV_HAVE_NEONV7 */ ++ ++ #ifdef LV_HAVE_NEONV7 ++-extern void volk_32f_x2_dot_prod_32f_a_neonasm_opts(float* cVector, const float* aVector, const float* bVector, unsigned int num_points); +++extern void volk_32f_x2_dot_prod_32f_a_neonasm_opts(float* cVector, +++ const float* aVector, +++ const float* bVector, +++ unsigned int num_points); ++ #endif /* LV_HAVE_NEONV7 */ ++ ++ #endif /*INCLUDED_volk_32f_x2_dot_prod_32f_a_H*/ ++diff --git a/kernels/volk/volk_32f_x2_fm_detectpuppet_32f.h b/kernels/volk/volk_32f_x2_fm_detectpuppet_32f.h ++index e1da185..3a3caca 100644 ++--- a/kernels/volk/volk_32f_x2_fm_detectpuppet_32f.h +++++ b/kernels/volk/volk_32f_x2_fm_detectpuppet_32f.h ++@@ -28,32 +28,44 @@ ++ #ifdef LV_HAVE_AVX ++ #include ++ ++-static inline void volk_32f_x2_fm_detectpuppet_32f_a_avx(float* outputVector, const float* inputVector, float* saveValue, unsigned int num_points) +++static inline void volk_32f_x2_fm_detectpuppet_32f_a_avx(float* outputVector, +++ const float* inputVector, +++ float* saveValue, +++ unsigned int num_points) ++ { ++- const float bound = 1.0f; +++ const float bound = 1.0f; ++ ++- volk_32f_s32f_32f_fm_detect_32f_a_avx(outputVector, inputVector, bound, saveValue, num_points); +++ volk_32f_s32f_32f_fm_detect_32f_a_avx( +++ outputVector, inputVector, bound, saveValue, num_points); ++ } ++ #endif /* LV_HAVE_AVX */ ++ ++ #ifdef LV_HAVE_SSE ++ #include ++ ++-static inline void volk_32f_x2_fm_detectpuppet_32f_a_sse(float* outputVector, const float* inputVector, float* saveValue, unsigned int num_points) +++static inline void volk_32f_x2_fm_detectpuppet_32f_a_sse(float* outputVector, +++ const float* inputVector, +++ float* saveValue, +++ unsigned int num_points) ++ { ++- const float bound = 1.0f; +++ const float bound = 1.0f; ++ ++- volk_32f_s32f_32f_fm_detect_32f_a_sse(outputVector, inputVector, bound, saveValue, num_points); +++ volk_32f_s32f_32f_fm_detect_32f_a_sse( +++ outputVector, inputVector, bound, saveValue, num_points); ++ } ++ #endif /* LV_HAVE_SSE */ ++ ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void volk_32f_x2_fm_detectpuppet_32f_generic(float* outputVector, const float* inputVector, float* saveValue, unsigned int num_points) +++static inline void volk_32f_x2_fm_detectpuppet_32f_generic(float* outputVector, +++ const float* inputVector, +++ float* saveValue, +++ unsigned int num_points) ++ { ++- const float bound = 1.0f; +++ const float bound = 1.0f; ++ ++- volk_32f_s32f_32f_fm_detect_32f_generic(outputVector, inputVector, bound, saveValue, num_points); +++ volk_32f_s32f_32f_fm_detect_32f_generic( +++ outputVector, inputVector, bound, saveValue, num_points); 
++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++@@ -69,11 +81,15 @@ static inline void volk_32f_x2_fm_detectpuppet_32f_generic(float* outputVector, ++ #ifdef LV_HAVE_AVX ++ #include ++ ++-static inline void volk_32f_x2_fm_detectpuppet_32f_u_avx(float* outputVector, const float* inputVector, float* saveValue, unsigned int num_points) +++static inline void volk_32f_x2_fm_detectpuppet_32f_u_avx(float* outputVector, +++ const float* inputVector, +++ float* saveValue, +++ unsigned int num_points) ++ { ++- const float bound = 1.0f; +++ const float bound = 1.0f; ++ ++- volk_32f_s32f_32f_fm_detect_32f_u_avx(outputVector, inputVector, bound, saveValue, num_points); +++ volk_32f_s32f_32f_fm_detect_32f_u_avx( +++ outputVector, inputVector, bound, saveValue, num_points); ++ } ++ #endif /* LV_HAVE_AVX */ ++ #endif /* INCLUDED_volk_32f_x2_fm_detectpuppet_32f_u_H */ ++diff --git a/kernels/volk/volk_32f_x2_interleave_32fc.h b/kernels/volk/volk_32f_x2_interleave_32fc.h ++index ef8ada2..d0cc6dd 100644 ++--- a/kernels/volk/volk_32f_x2_interleave_32fc.h +++++ b/kernels/volk/volk_32f_x2_interleave_32fc.h ++@@ -33,8 +33,8 @@ ++ * ++ * Dispatcher Prototype ++ * \code ++- * void volk_32f_x2_interleave_32fc(lv_32fc_t* complexVector, const float* iBuffer, const float* qBuffer, unsigned int num_points) ++- * \endcode +++ * void volk_32f_x2_interleave_32fc(lv_32fc_t* complexVector, const float* iBuffer, const +++ * float* qBuffer, unsigned int num_points) \endcode ++ * ++ * \b Inputs ++ * \li iBuffer: Input vector of samples for the real part. ++@@ -79,44 +79,45 @@ ++ #ifdef LV_HAVE_AVX ++ #include ++ ++-static inline void ++-volk_32f_x2_interleave_32fc_a_avx(lv_32fc_t* complexVector, const float* iBuffer, ++- const float* qBuffer, unsigned int num_points) +++static inline void volk_32f_x2_interleave_32fc_a_avx(lv_32fc_t* complexVector, +++ const float* iBuffer, +++ const float* qBuffer, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- float* complexVectorPtr = (float*)complexVector; ++- const float* iBufferPtr = iBuffer; ++- const float* qBufferPtr = qBuffer; ++- ++- const uint64_t eighthPoints = num_points / 8; ++- ++- __m256 iValue, qValue, cplxValue1, cplxValue2, cplxValue; ++- for(;number < eighthPoints; number++){ ++- iValue = _mm256_load_ps(iBufferPtr); ++- qValue = _mm256_load_ps(qBufferPtr); ++- ++- // Interleaves the lower two values in the i and q variables into one buffer ++- cplxValue1 = _mm256_unpacklo_ps(iValue, qValue); ++- // Interleaves the upper two values in the i and q variables into one buffer ++- cplxValue2 = _mm256_unpackhi_ps(iValue, qValue); ++- ++- cplxValue = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x20); ++- _mm256_store_ps(complexVectorPtr, cplxValue); ++- complexVectorPtr += 8; ++- ++- cplxValue = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x31); ++- _mm256_store_ps(complexVectorPtr, cplxValue); ++- complexVectorPtr += 8; ++- ++- iBufferPtr += 8; ++- qBufferPtr += 8; ++- } ++- ++- number = eighthPoints * 8; ++- for(; number < num_points; number++){ ++- *complexVectorPtr++ = *iBufferPtr++; ++- *complexVectorPtr++ = *qBufferPtr++; ++- } +++ unsigned int number = 0; +++ float* complexVectorPtr = (float*)complexVector; +++ const float* iBufferPtr = iBuffer; +++ const float* qBufferPtr = qBuffer; +++ +++ const uint64_t eighthPoints = num_points / 8; +++ +++ __m256 iValue, qValue, cplxValue1, cplxValue2, cplxValue; +++ for (; number < eighthPoints; number++) { +++ iValue = _mm256_load_ps(iBufferPtr); +++ qValue = _mm256_load_ps(qBufferPtr); +++ +++ // Interleaves the 
lower two values in the i and q variables into one buffer +++ cplxValue1 = _mm256_unpacklo_ps(iValue, qValue); +++ // Interleaves the upper two values in the i and q variables into one buffer +++ cplxValue2 = _mm256_unpackhi_ps(iValue, qValue); +++ +++ cplxValue = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x20); +++ _mm256_store_ps(complexVectorPtr, cplxValue); +++ complexVectorPtr += 8; +++ +++ cplxValue = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x31); +++ _mm256_store_ps(complexVectorPtr, cplxValue); +++ complexVectorPtr += 8; +++ +++ iBufferPtr += 8; +++ qBufferPtr += 8; +++ } +++ +++ number = eighthPoints * 8; +++ for (; number < num_points; number++) { +++ *complexVectorPtr++ = *iBufferPtr++; +++ *complexVectorPtr++ = *qBufferPtr++; +++ } ++ } ++ ++ #endif /* LV_HAV_AVX */ ++@@ -124,41 +125,42 @@ volk_32f_x2_interleave_32fc_a_avx(lv_32fc_t* complexVector, const float* iBuffer ++ #ifdef LV_HAVE_SSE ++ #include ++ ++-static inline void ++-volk_32f_x2_interleave_32fc_a_sse(lv_32fc_t* complexVector, const float* iBuffer, ++- const float* qBuffer, unsigned int num_points) +++static inline void volk_32f_x2_interleave_32fc_a_sse(lv_32fc_t* complexVector, +++ const float* iBuffer, +++ const float* qBuffer, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- float* complexVectorPtr = (float*)complexVector; ++- const float* iBufferPtr = iBuffer; ++- const float* qBufferPtr = qBuffer; ++- ++- const uint64_t quarterPoints = num_points / 4; ++- ++- __m128 iValue, qValue, cplxValue; ++- for(;number < quarterPoints; number++){ ++- iValue = _mm_load_ps(iBufferPtr); ++- qValue = _mm_load_ps(qBufferPtr); ++- ++- // Interleaves the lower two values in the i and q variables into one buffer ++- cplxValue = _mm_unpacklo_ps(iValue, qValue); ++- _mm_store_ps(complexVectorPtr, cplxValue); ++- complexVectorPtr += 4; ++- ++- // Interleaves the upper two values in the i and q variables into one buffer ++- cplxValue = _mm_unpackhi_ps(iValue, qValue); ++- _mm_store_ps(complexVectorPtr, cplxValue); ++- complexVectorPtr += 4; ++- ++- iBufferPtr += 4; ++- qBufferPtr += 4; ++- } ++- ++- number = quarterPoints * 4; ++- for(; number < num_points; number++){ ++- *complexVectorPtr++ = *iBufferPtr++; ++- *complexVectorPtr++ = *qBufferPtr++; ++- } +++ unsigned int number = 0; +++ float* complexVectorPtr = (float*)complexVector; +++ const float* iBufferPtr = iBuffer; +++ const float* qBufferPtr = qBuffer; +++ +++ const uint64_t quarterPoints = num_points / 4; +++ +++ __m128 iValue, qValue, cplxValue; +++ for (; number < quarterPoints; number++) { +++ iValue = _mm_load_ps(iBufferPtr); +++ qValue = _mm_load_ps(qBufferPtr); +++ +++ // Interleaves the lower two values in the i and q variables into one buffer +++ cplxValue = _mm_unpacklo_ps(iValue, qValue); +++ _mm_store_ps(complexVectorPtr, cplxValue); +++ complexVectorPtr += 4; +++ +++ // Interleaves the upper two values in the i and q variables into one buffer +++ cplxValue = _mm_unpackhi_ps(iValue, qValue); +++ _mm_store_ps(complexVectorPtr, cplxValue); +++ complexVectorPtr += 4; +++ +++ iBufferPtr += 4; +++ qBufferPtr += 4; +++ } +++ +++ number = quarterPoints * 4; +++ for (; number < num_points; number++) { +++ *complexVectorPtr++ = *iBufferPtr++; +++ *complexVectorPtr++ = *qBufferPtr++; +++ } ++ } ++ #endif /* LV_HAVE_SSE */ ++ ++@@ -166,52 +168,53 @@ volk_32f_x2_interleave_32fc_a_sse(lv_32fc_t* complexVector, const float* iBuffer ++ #ifdef LV_HAVE_NEON ++ #include ++ ++-static inline void ++-volk_32f_x2_interleave_32fc_neon(lv_32fc_t* 
complexVector, const float* iBuffer, ++- const float* qBuffer, unsigned int num_points) +++static inline void volk_32f_x2_interleave_32fc_neon(lv_32fc_t* complexVector, +++ const float* iBuffer, +++ const float* qBuffer, +++ unsigned int num_points) ++ { ++- unsigned int quarter_points = num_points / 4; ++- unsigned int number; ++- float* complexVectorPtr = (float*) complexVector; ++- ++- float32x4x2_t complex_vec; ++- for(number=0; number < quarter_points; ++number) { ++- complex_vec.val[0] = vld1q_f32(iBuffer); ++- complex_vec.val[1] = vld1q_f32(qBuffer); ++- vst2q_f32(complexVectorPtr, complex_vec); ++- iBuffer += 4; ++- qBuffer += 4; ++- complexVectorPtr += 8; ++- } ++- ++- for(number=quarter_points * 4; number < num_points; ++number) { ++- *complexVectorPtr++ = *iBuffer++; ++- *complexVectorPtr++ = *qBuffer++; ++- } +++ unsigned int quarter_points = num_points / 4; +++ unsigned int number; +++ float* complexVectorPtr = (float*)complexVector; +++ +++ float32x4x2_t complex_vec; +++ for (number = 0; number < quarter_points; ++number) { +++ complex_vec.val[0] = vld1q_f32(iBuffer); +++ complex_vec.val[1] = vld1q_f32(qBuffer); +++ vst2q_f32(complexVectorPtr, complex_vec); +++ iBuffer += 4; +++ qBuffer += 4; +++ complexVectorPtr += 8; +++ } +++ +++ for (number = quarter_points * 4; number < num_points; ++number) { +++ *complexVectorPtr++ = *iBuffer++; +++ *complexVectorPtr++ = *qBuffer++; +++ } ++ } ++ #endif /* LV_HAVE_NEON */ ++ ++ ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void ++-volk_32f_x2_interleave_32fc_generic(lv_32fc_t* complexVector, const float* iBuffer, ++- const float* qBuffer, unsigned int num_points) +++static inline void volk_32f_x2_interleave_32fc_generic(lv_32fc_t* complexVector, +++ const float* iBuffer, +++ const float* qBuffer, +++ unsigned int num_points) ++ { ++- float* complexVectorPtr = (float*)complexVector; ++- const float* iBufferPtr = iBuffer; ++- const float* qBufferPtr = qBuffer; ++- unsigned int number; ++- ++- for(number = 0; number < num_points; number++){ ++- *complexVectorPtr++ = *iBufferPtr++; ++- *complexVectorPtr++ = *qBufferPtr++; ++- } +++ float* complexVectorPtr = (float*)complexVector; +++ const float* iBufferPtr = iBuffer; +++ const float* qBufferPtr = qBuffer; +++ unsigned int number; +++ +++ for (number = 0; number < num_points; number++) { +++ *complexVectorPtr++ = *iBufferPtr++; +++ *complexVectorPtr++ = *qBufferPtr++; +++ } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++ ++- ++ #endif /* INCLUDED_volk_32f_x2_interleave_32fc_a_H */ ++ ++ #ifndef INCLUDED_volk_32f_x2_interleave_32fc_u_H ++@@ -223,44 +226,45 @@ volk_32f_x2_interleave_32fc_generic(lv_32fc_t* complexVector, const float* iBuff ++ #ifdef LV_HAVE_AVX ++ #include ++ ++-static inline void ++-volk_32f_x2_interleave_32fc_u_avx(lv_32fc_t* complexVector, const float* iBuffer, ++- const float* qBuffer, unsigned int num_points) +++static inline void volk_32f_x2_interleave_32fc_u_avx(lv_32fc_t* complexVector, +++ const float* iBuffer, +++ const float* qBuffer, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- float* complexVectorPtr = (float*)complexVector; ++- const float* iBufferPtr = iBuffer; ++- const float* qBufferPtr = qBuffer; ++- ++- const uint64_t eighthPoints = num_points / 8; ++- ++- __m256 iValue, qValue, cplxValue1, cplxValue2, cplxValue; ++- for(;number < eighthPoints; number++){ ++- iValue = _mm256_loadu_ps(iBufferPtr); ++- qValue = _mm256_loadu_ps(qBufferPtr); ++- ++- // Interleaves the lower two values in the i and q variables into one buffer ++- cplxValue1 = 
_mm256_unpacklo_ps(iValue, qValue); ++- // Interleaves the upper two values in the i and q variables into one buffer ++- cplxValue2 = _mm256_unpackhi_ps(iValue, qValue); ++- ++- cplxValue = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x20); ++- _mm256_storeu_ps(complexVectorPtr, cplxValue); ++- complexVectorPtr += 8; ++- ++- cplxValue = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x31); ++- _mm256_storeu_ps(complexVectorPtr, cplxValue); ++- complexVectorPtr += 8; ++- ++- iBufferPtr += 8; ++- qBufferPtr += 8; ++- } ++- ++- number = eighthPoints * 8; ++- for(; number < num_points; number++){ ++- *complexVectorPtr++ = *iBufferPtr++; ++- *complexVectorPtr++ = *qBufferPtr++; ++- } +++ unsigned int number = 0; +++ float* complexVectorPtr = (float*)complexVector; +++ const float* iBufferPtr = iBuffer; +++ const float* qBufferPtr = qBuffer; +++ +++ const uint64_t eighthPoints = num_points / 8; +++ +++ __m256 iValue, qValue, cplxValue1, cplxValue2, cplxValue; +++ for (; number < eighthPoints; number++) { +++ iValue = _mm256_loadu_ps(iBufferPtr); +++ qValue = _mm256_loadu_ps(qBufferPtr); +++ +++ // Interleaves the lower two values in the i and q variables into one buffer +++ cplxValue1 = _mm256_unpacklo_ps(iValue, qValue); +++ // Interleaves the upper two values in the i and q variables into one buffer +++ cplxValue2 = _mm256_unpackhi_ps(iValue, qValue); +++ +++ cplxValue = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x20); +++ _mm256_storeu_ps(complexVectorPtr, cplxValue); +++ complexVectorPtr += 8; +++ +++ cplxValue = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x31); +++ _mm256_storeu_ps(complexVectorPtr, cplxValue); +++ complexVectorPtr += 8; +++ +++ iBufferPtr += 8; +++ qBufferPtr += 8; +++ } +++ +++ number = eighthPoints * 8; +++ for (; number < num_points; number++) { +++ *complexVectorPtr++ = *iBufferPtr++; +++ *complexVectorPtr++ = *qBufferPtr++; +++ } ++ } ++ #endif /* LV_HAVE_AVX */ ++ ++diff --git a/kernels/volk/volk_32f_x2_max_32f.h b/kernels/volk/volk_32f_x2_max_32f.h ++index 82086a6..c7eb67f 100644 ++--- a/kernels/volk/volk_32f_x2_max_32f.h +++++ b/kernels/volk/volk_32f_x2_max_32f.h ++@@ -32,8 +32,8 @@ ++ * ++ * Dispatcher Prototype ++ * \code ++- * void volk_32f_x2_max_32f(float* cVector, const float* aVector, const float* bVector, unsigned int num_points) ++- * \endcode +++ * void volk_32f_x2_max_32f(float* cVector, const float* aVector, const float* bVector, +++ * unsigned int num_points) \endcode ++ * ++ * \b Inputs ++ * \li aVector: First input vector. 
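The dispatcher prototype documented above is the symbol user code calls; the per-ISA kernels in the hunks that follow are selected at runtime. A minimal usage sketch, not taken from the diff itself: buffer size and fill values are purely illustrative, and it assumes VOLK's volk_get_alignment / volk_malloc / volk_free allocation helpers, the first two of which appear in the dot-product example earlier in this diff.

\code
#include <volk/volk.h>

unsigned int N = 8;
unsigned int i;
unsigned int alignment = volk_get_alignment();

/* allocate VOLK-aligned buffers so the aligned kernels can be dispatched */
float* a = (float*)volk_malloc(sizeof(float) * N, alignment);
float* b = (float*)volk_malloc(sizeof(float) * N, alignment);
float* c = (float*)volk_malloc(sizeof(float) * N, alignment);

for (i = 0; i < N; i++) {
    a[i] = (float)i;       /* 0, 1, 2, ... */
    b[i] = (float)(N - i); /* 8, 7, 6, ... */
}

/* element-wise maximum; the dispatcher picks the best kernel for the host CPU */
volk_32f_x2_max_32f(c, a, b, N);

volk_free(a);
volk_free(b);
volk_free(c);
\endcode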
++@@ -77,176 +77,183 @@ ++ #ifdef LV_HAVE_AVX512F ++ #include ++ ++-static inline void ++-volk_32f_x2_max_32f_a_avx512f(float* cVector, const float* aVector, ++- const float* bVector, unsigned int num_points) +++static inline void volk_32f_x2_max_32f_a_avx512f(float* cVector, +++ const float* aVector, +++ const float* bVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int sixteenthPoints = num_points / 16; +++ unsigned int number = 0; +++ const unsigned int sixteenthPoints = num_points / 16; ++ ++- float* cPtr = cVector; ++- const float* aPtr = aVector; ++- const float* bPtr= bVector; +++ float* cPtr = cVector; +++ const float* aPtr = aVector; +++ const float* bPtr = bVector; ++ ++- __m512 aVal, bVal, cVal; ++- for(;number < sixteenthPoints; number++){ ++- aVal = _mm512_load_ps(aPtr); ++- bVal = _mm512_load_ps(bPtr); +++ __m512 aVal, bVal, cVal; +++ for (; number < sixteenthPoints; number++) { +++ aVal = _mm512_load_ps(aPtr); +++ bVal = _mm512_load_ps(bPtr); ++ ++- cVal = _mm512_max_ps(aVal, bVal); +++ cVal = _mm512_max_ps(aVal, bVal); ++ ++- _mm512_store_ps(cPtr,cVal); // Store the results back into the C container +++ _mm512_store_ps(cPtr, cVal); // Store the results back into the C container ++ ++- aPtr += 16; ++- bPtr += 16; ++- cPtr += 16; ++- } +++ aPtr += 16; +++ bPtr += 16; +++ cPtr += 16; +++ } ++ ++- number = sixteenthPoints * 16; ++- for(;number < num_points; number++){ ++- const float a = *aPtr++; ++- const float b = *bPtr++; ++- *cPtr++ = ( a > b ? a : b); ++- } +++ number = sixteenthPoints * 16; +++ for (; number < num_points; number++) { +++ const float a = *aPtr++; +++ const float b = *bPtr++; +++ *cPtr++ = (a > b ? a : b); +++ } ++ } ++ #endif /* LV_HAVE_AVX512F */ ++ ++ #ifdef LV_HAVE_SSE ++ #include ++ ++-static inline void ++-volk_32f_x2_max_32f_a_sse(float* cVector, const float* aVector, ++- const float* bVector, unsigned int num_points) +++static inline void volk_32f_x2_max_32f_a_sse(float* cVector, +++ const float* aVector, +++ const float* bVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int quarterPoints = num_points / 4; +++ unsigned int number = 0; +++ const unsigned int quarterPoints = num_points / 4; ++ ++- float* cPtr = cVector; ++- const float* aPtr = aVector; ++- const float* bPtr= bVector; +++ float* cPtr = cVector; +++ const float* aPtr = aVector; +++ const float* bPtr = bVector; ++ ++- __m128 aVal, bVal, cVal; ++- for(;number < quarterPoints; number++){ ++- aVal = _mm_load_ps(aPtr); ++- bVal = _mm_load_ps(bPtr); +++ __m128 aVal, bVal, cVal; +++ for (; number < quarterPoints; number++) { +++ aVal = _mm_load_ps(aPtr); +++ bVal = _mm_load_ps(bPtr); ++ ++- cVal = _mm_max_ps(aVal, bVal); +++ cVal = _mm_max_ps(aVal, bVal); ++ ++- _mm_store_ps(cPtr,cVal); // Store the results back into the C container +++ _mm_store_ps(cPtr, cVal); // Store the results back into the C container ++ ++- aPtr += 4; ++- bPtr += 4; ++- cPtr += 4; ++- } +++ aPtr += 4; +++ bPtr += 4; +++ cPtr += 4; +++ } ++ ++- number = quarterPoints * 4; ++- for(;number < num_points; number++){ ++- const float a = *aPtr++; ++- const float b = *bPtr++; ++- *cPtr++ = ( a > b ? a : b); ++- } +++ number = quarterPoints * 4; +++ for (; number < num_points; number++) { +++ const float a = *aPtr++; +++ const float b = *bPtr++; +++ *cPtr++ = (a > b ? 
a : b); +++ } ++ } ++ #endif /* LV_HAVE_SSE */ ++ ++ #ifdef LV_HAVE_AVX ++ #include ++ ++-static inline void ++-volk_32f_x2_max_32f_a_avx(float* cVector, const float* aVector, ++- const float* bVector, unsigned int num_points) +++static inline void volk_32f_x2_max_32f_a_avx(float* cVector, +++ const float* aVector, +++ const float* bVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int eighthPoints = num_points / 8; +++ unsigned int number = 0; +++ const unsigned int eighthPoints = num_points / 8; ++ ++- float* cPtr = cVector; ++- const float* aPtr = aVector; ++- const float* bPtr= bVector; +++ float* cPtr = cVector; +++ const float* aPtr = aVector; +++ const float* bPtr = bVector; ++ ++- __m256 aVal, bVal, cVal; ++- for(;number < eighthPoints; number++){ ++- aVal = _mm256_load_ps(aPtr); ++- bVal = _mm256_load_ps(bPtr); +++ __m256 aVal, bVal, cVal; +++ for (; number < eighthPoints; number++) { +++ aVal = _mm256_load_ps(aPtr); +++ bVal = _mm256_load_ps(bPtr); ++ ++- cVal = _mm256_max_ps(aVal, bVal); +++ cVal = _mm256_max_ps(aVal, bVal); ++ ++- _mm256_store_ps(cPtr,cVal); // Store the results back into the C container +++ _mm256_store_ps(cPtr, cVal); // Store the results back into the C container ++ ++- aPtr += 8; ++- bPtr += 8; ++- cPtr += 8; ++- } +++ aPtr += 8; +++ bPtr += 8; +++ cPtr += 8; +++ } ++ ++- number = eighthPoints * 8; ++- for(;number < num_points; number++){ ++- const float a = *aPtr++; ++- const float b = *bPtr++; ++- *cPtr++ = ( a > b ? a : b); ++- } +++ number = eighthPoints * 8; +++ for (; number < num_points; number++) { +++ const float a = *aPtr++; +++ const float b = *bPtr++; +++ *cPtr++ = (a > b ? a : b); +++ } ++ } ++ #endif /* LV_HAVE_AVX */ ++ ++ #ifdef LV_HAVE_NEON ++ #include ++ ++-static inline void ++-volk_32f_x2_max_32f_neon(float* cVector, const float* aVector, ++- const float* bVector, unsigned int num_points) +++static inline void volk_32f_x2_max_32f_neon(float* cVector, +++ const float* aVector, +++ const float* bVector, +++ unsigned int num_points) ++ { ++- unsigned int quarter_points = num_points / 4; ++- float* cPtr = cVector; ++- const float* aPtr = aVector; ++- const float* bPtr= bVector; ++- unsigned int number = 0; ++- ++- float32x4_t a_vec, b_vec, c_vec; ++- for(number = 0; number < quarter_points; number++){ ++- a_vec = vld1q_f32(aPtr); ++- b_vec = vld1q_f32(bPtr); ++- c_vec = vmaxq_f32(a_vec, b_vec); ++- vst1q_f32(cPtr, c_vec); ++- aPtr += 4; ++- bPtr += 4; ++- cPtr += 4; ++- } ++- ++- for(number = quarter_points*4; number < num_points; number++){ ++- const float a = *aPtr++; ++- const float b = *bPtr++; ++- *cPtr++ = ( a > b ? a : b); ++- } +++ unsigned int quarter_points = num_points / 4; +++ float* cPtr = cVector; +++ const float* aPtr = aVector; +++ const float* bPtr = bVector; +++ unsigned int number = 0; +++ +++ float32x4_t a_vec, b_vec, c_vec; +++ for (number = 0; number < quarter_points; number++) { +++ a_vec = vld1q_f32(aPtr); +++ b_vec = vld1q_f32(bPtr); +++ c_vec = vmaxq_f32(a_vec, b_vec); +++ vst1q_f32(cPtr, c_vec); +++ aPtr += 4; +++ bPtr += 4; +++ cPtr += 4; +++ } +++ +++ for (number = quarter_points * 4; number < num_points; number++) { +++ const float a = *aPtr++; +++ const float b = *bPtr++; +++ *cPtr++ = (a > b ? 
a : b); +++ } ++ } ++ #endif /* LV_HAVE_NEON */ ++ ++ ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void ++-volk_32f_x2_max_32f_generic(float* cVector, const float* aVector, ++- const float* bVector, unsigned int num_points) +++static inline void volk_32f_x2_max_32f_generic(float* cVector, +++ const float* aVector, +++ const float* bVector, +++ unsigned int num_points) ++ { ++- float* cPtr = cVector; ++- const float* aPtr = aVector; ++- const float* bPtr= bVector; ++- unsigned int number = 0; ++- ++- for(number = 0; number < num_points; number++){ ++- const float a = *aPtr++; ++- const float b = *bPtr++; ++- *cPtr++ = ( a > b ? a : b); ++- } +++ float* cPtr = cVector; +++ const float* aPtr = aVector; +++ const float* bPtr = bVector; +++ unsigned int number = 0; +++ +++ for (number = 0; number < num_points; number++) { +++ const float a = *aPtr++; +++ const float b = *bPtr++; +++ *cPtr++ = (a > b ? a : b); +++ } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++ #ifdef LV_HAVE_ORC ++-extern void ++-volk_32f_x2_max_32f_a_orc_impl(float* cVector, const float* aVector, ++- const float* bVector, unsigned int num_points); ++- ++-static inline void ++-volk_32f_x2_max_32f_u_orc(float* cVector, const float* aVector, ++- const float* bVector, unsigned int num_points) +++extern void volk_32f_x2_max_32f_a_orc_impl(float* cVector, +++ const float* aVector, +++ const float* bVector, +++ unsigned int num_points); +++ +++static inline void volk_32f_x2_max_32f_u_orc(float* cVector, +++ const float* aVector, +++ const float* bVector, +++ unsigned int num_points) ++ { ++- volk_32f_x2_max_32f_a_orc_impl(cVector, aVector, bVector, num_points); +++ volk_32f_x2_max_32f_a_orc_impl(cVector, aVector, bVector, num_points); ++ } ++ #endif /* LV_HAVE_ORC */ ++ ++@@ -263,74 +270,76 @@ volk_32f_x2_max_32f_u_orc(float* cVector, const float* aVector, ++ #ifdef LV_HAVE_AVX512F ++ #include ++ ++-static inline void ++-volk_32f_x2_max_32f_u_avx512f(float* cVector, const float* aVector, ++- const float* bVector, unsigned int num_points) +++static inline void volk_32f_x2_max_32f_u_avx512f(float* cVector, +++ const float* aVector, +++ const float* bVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int sixteenthPoints = num_points / 16; +++ unsigned int number = 0; +++ const unsigned int sixteenthPoints = num_points / 16; ++ ++- float* cPtr = cVector; ++- const float* aPtr = aVector; ++- const float* bPtr= bVector; +++ float* cPtr = cVector; +++ const float* aPtr = aVector; +++ const float* bPtr = bVector; ++ ++- __m512 aVal, bVal, cVal; ++- for(;number < sixteenthPoints; number++){ ++- aVal = _mm512_loadu_ps(aPtr); ++- bVal = _mm512_loadu_ps(bPtr); +++ __m512 aVal, bVal, cVal; +++ for (; number < sixteenthPoints; number++) { +++ aVal = _mm512_loadu_ps(aPtr); +++ bVal = _mm512_loadu_ps(bPtr); ++ ++- cVal = _mm512_max_ps(aVal, bVal); +++ cVal = _mm512_max_ps(aVal, bVal); ++ ++- _mm512_storeu_ps(cPtr,cVal); // Store the results back into the C container +++ _mm512_storeu_ps(cPtr, cVal); // Store the results back into the C container ++ ++- aPtr += 16; ++- bPtr += 16; ++- cPtr += 16; ++- } +++ aPtr += 16; +++ bPtr += 16; +++ cPtr += 16; +++ } ++ ++- number = sixteenthPoints * 16; ++- for(;number < num_points; number++){ ++- const float a = *aPtr++; ++- const float b = *bPtr++; ++- *cPtr++ = ( a > b ? a : b); ++- } +++ number = sixteenthPoints * 16; +++ for (; number < num_points; number++) { +++ const float a = *aPtr++; +++ const float b = *bPtr++; +++ *cPtr++ = (a > b ? 
a : b); +++ } ++ } ++ #endif /* LV_HAVE_AVX512F */ ++ ++ #ifdef LV_HAVE_AVX ++ #include ++ ++-static inline void ++-volk_32f_x2_max_32f_u_avx(float* cVector, const float* aVector, ++- const float* bVector, unsigned int num_points) +++static inline void volk_32f_x2_max_32f_u_avx(float* cVector, +++ const float* aVector, +++ const float* bVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int eighthPoints = num_points / 8; +++ unsigned int number = 0; +++ const unsigned int eighthPoints = num_points / 8; ++ ++- float* cPtr = cVector; ++- const float* aPtr = aVector; ++- const float* bPtr= bVector; +++ float* cPtr = cVector; +++ const float* aPtr = aVector; +++ const float* bPtr = bVector; ++ ++- __m256 aVal, bVal, cVal; ++- for(;number < eighthPoints; number++){ ++- aVal = _mm256_loadu_ps(aPtr); ++- bVal = _mm256_loadu_ps(bPtr); +++ __m256 aVal, bVal, cVal; +++ for (; number < eighthPoints; number++) { +++ aVal = _mm256_loadu_ps(aPtr); +++ bVal = _mm256_loadu_ps(bPtr); ++ ++- cVal = _mm256_max_ps(aVal, bVal); +++ cVal = _mm256_max_ps(aVal, bVal); ++ ++- _mm256_storeu_ps(cPtr,cVal); // Store the results back into the C container +++ _mm256_storeu_ps(cPtr, cVal); // Store the results back into the C container ++ ++- aPtr += 8; ++- bPtr += 8; ++- cPtr += 8; ++- } +++ aPtr += 8; +++ bPtr += 8; +++ cPtr += 8; +++ } ++ ++- number = eighthPoints * 8; ++- for(;number < num_points; number++){ ++- const float a = *aPtr++; ++- const float b = *bPtr++; ++- *cPtr++ = ( a > b ? a : b); ++- } +++ number = eighthPoints * 8; +++ for (; number < num_points; number++) { +++ const float a = *aPtr++; +++ const float b = *bPtr++; +++ *cPtr++ = (a > b ? a : b); +++ } ++ } ++ #endif /* LV_HAVE_AVX */ ++ ++diff --git a/kernels/volk/volk_32f_x2_min_32f.h b/kernels/volk/volk_32f_x2_min_32f.h ++index 454eb76..aecd11a 100644 ++--- a/kernels/volk/volk_32f_x2_min_32f.h +++++ b/kernels/volk/volk_32f_x2_min_32f.h ++@@ -32,8 +32,8 @@ ++ * ++ * Dispatcher Prototype ++ * \code ++- * void volk_32f_x2_min_32f(float* cVector, const float* aVector, const float* bVector, unsigned int num_points) ++- * \endcode +++ * void volk_32f_x2_min_32f(float* cVector, const float* aVector, const float* bVector, +++ * unsigned int num_points) \endcode ++ * ++ * \b Inputs ++ * \li aVector: First input vector. 
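The min kernels that follow, like the max kernels above, share the same two-phase shape: a vectorized body over full blocks of W elements (W = 4 for SSE/NEON, 8 for AVX, 16 for AVX-512), then a scalar tail for the remaining num_points % W elements. A plain-C sketch of that structure, with W fixed at 4 purely for illustration:

static inline void min_blocked_sketch(float* cVector,
                                      const float* aVector,
                                      const float* bVector,
                                      unsigned int num_points)
{
    const unsigned int W = 4;   /* block width; ISA dependent in the real kernels */
    const unsigned int full_blocks = num_points / W;
    unsigned int n = 0;

    /* Vector body: in the real kernels this loop is one SIMD min per block. */
    for (; n < full_blocks * W; n++) {
        cVector[n] = (aVector[n] < bVector[n]) ? aVector[n] : bVector[n];
    }

    /* Scalar tail, exactly as in the generic fallback. */
    for (; n < num_points; n++) {
        cVector[n] = (aVector[n] < bVector[n]) ? aVector[n] : bVector[n];
    }
}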
++@@ -77,37 +77,38 @@ ++ #ifdef LV_HAVE_SSE ++ #include ++ ++-static inline void ++-volk_32f_x2_min_32f_a_sse(float* cVector, const float* aVector, ++- const float* bVector, unsigned int num_points) +++static inline void volk_32f_x2_min_32f_a_sse(float* cVector, +++ const float* aVector, +++ const float* bVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int quarterPoints = num_points / 4; +++ unsigned int number = 0; +++ const unsigned int quarterPoints = num_points / 4; ++ ++- float* cPtr = cVector; ++- const float* aPtr = aVector; ++- const float* bPtr= bVector; +++ float* cPtr = cVector; +++ const float* aPtr = aVector; +++ const float* bPtr = bVector; ++ ++- __m128 aVal, bVal, cVal; ++- for(;number < quarterPoints; number++){ ++- aVal = _mm_load_ps(aPtr); ++- bVal = _mm_load_ps(bPtr); +++ __m128 aVal, bVal, cVal; +++ for (; number < quarterPoints; number++) { +++ aVal = _mm_load_ps(aPtr); +++ bVal = _mm_load_ps(bPtr); ++ ++- cVal = _mm_min_ps(aVal, bVal); +++ cVal = _mm_min_ps(aVal, bVal); ++ ++- _mm_store_ps(cPtr,cVal); // Store the results back into the C container +++ _mm_store_ps(cPtr, cVal); // Store the results back into the C container ++ ++- aPtr += 4; ++- bPtr += 4; ++- cPtr += 4; ++- } +++ aPtr += 4; +++ bPtr += 4; +++ cPtr += 4; +++ } ++ ++- number = quarterPoints * 4; ++- for(;number < num_points; number++){ ++- const float a = *aPtr++; ++- const float b = *bPtr++; ++- *cPtr++ = ( a < b ? a : b); ++- } +++ number = quarterPoints * 4; +++ for (; number < num_points; number++) { +++ const float a = *aPtr++; +++ const float b = *bPtr++; +++ *cPtr++ = (a < b ? a : b); +++ } ++ } ++ #endif /* LV_HAVE_SSE */ ++ ++@@ -115,143 +116,149 @@ volk_32f_x2_min_32f_a_sse(float* cVector, const float* aVector, ++ #ifdef LV_HAVE_NEON ++ #include ++ ++-static inline void ++-volk_32f_x2_min_32f_neon(float* cVector, const float* aVector, ++- const float* bVector, unsigned int num_points) +++static inline void volk_32f_x2_min_32f_neon(float* cVector, +++ const float* aVector, +++ const float* bVector, +++ unsigned int num_points) ++ { ++- float* cPtr = cVector; ++- const float* aPtr = aVector; ++- const float* bPtr= bVector; ++- unsigned int number = 0; ++- unsigned int quarter_points = num_points / 4; ++- ++- float32x4_t a_vec, b_vec, c_vec; ++- for(number = 0; number < quarter_points; number++){ ++- a_vec = vld1q_f32(aPtr); ++- b_vec = vld1q_f32(bPtr); ++- ++- c_vec = vminq_f32(a_vec, b_vec); ++- ++- vst1q_f32(cPtr, c_vec); ++- aPtr += 4; ++- bPtr += 4; ++- cPtr += 4; ++- } ++- ++- for(number = quarter_points*4; number < num_points; number++){ ++- const float a = *aPtr++; ++- const float b = *bPtr++; ++- *cPtr++ = ( a < b ? a : b); ++- } +++ float* cPtr = cVector; +++ const float* aPtr = aVector; +++ const float* bPtr = bVector; +++ unsigned int number = 0; +++ unsigned int quarter_points = num_points / 4; +++ +++ float32x4_t a_vec, b_vec, c_vec; +++ for (number = 0; number < quarter_points; number++) { +++ a_vec = vld1q_f32(aPtr); +++ b_vec = vld1q_f32(bPtr); +++ +++ c_vec = vminq_f32(a_vec, b_vec); +++ +++ vst1q_f32(cPtr, c_vec); +++ aPtr += 4; +++ bPtr += 4; +++ cPtr += 4; +++ } +++ +++ for (number = quarter_points * 4; number < num_points; number++) { +++ const float a = *aPtr++; +++ const float b = *bPtr++; +++ *cPtr++ = (a < b ? 
a : b); +++ } ++ } ++ #endif /* LV_HAVE_NEON */ ++ ++ ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void ++-volk_32f_x2_min_32f_generic(float* cVector, const float* aVector, ++- const float* bVector, unsigned int num_points) +++static inline void volk_32f_x2_min_32f_generic(float* cVector, +++ const float* aVector, +++ const float* bVector, +++ unsigned int num_points) ++ { ++- float* cPtr = cVector; ++- const float* aPtr = aVector; ++- const float* bPtr= bVector; ++- unsigned int number = 0; ++- ++- for(number = 0; number < num_points; number++){ ++- const float a = *aPtr++; ++- const float b = *bPtr++; ++- *cPtr++ = ( a < b ? a : b); ++- } +++ float* cPtr = cVector; +++ const float* aPtr = aVector; +++ const float* bPtr = bVector; +++ unsigned int number = 0; +++ +++ for (number = 0; number < num_points; number++) { +++ const float a = *aPtr++; +++ const float b = *bPtr++; +++ *cPtr++ = (a < b ? a : b); +++ } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++ ++ #ifdef LV_HAVE_ORC ++ ++-extern void ++-volk_32f_x2_min_32f_a_orc_impl(float* cVector, const float* aVector, ++- const float* bVector, unsigned int num_points); +++extern void volk_32f_x2_min_32f_a_orc_impl(float* cVector, +++ const float* aVector, +++ const float* bVector, +++ unsigned int num_points); ++ ++-static inline void ++-volk_32f_x2_min_32f_u_orc(float* cVector, const float* aVector, ++- const float* bVector, unsigned int num_points) +++static inline void volk_32f_x2_min_32f_u_orc(float* cVector, +++ const float* aVector, +++ const float* bVector, +++ unsigned int num_points) ++ { ++- volk_32f_x2_min_32f_a_orc_impl(cVector, aVector, bVector, num_points); +++ volk_32f_x2_min_32f_a_orc_impl(cVector, aVector, bVector, num_points); ++ } ++ #endif /* LV_HAVE_ORC */ ++ ++ #ifdef LV_HAVE_AVX ++ #include ++ ++-static inline void ++-volk_32f_x2_min_32f_a_avx(float* cVector, const float* aVector, ++- const float* bVector, unsigned int num_points) +++static inline void volk_32f_x2_min_32f_a_avx(float* cVector, +++ const float* aVector, +++ const float* bVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int eighthPoints = num_points / 8; +++ unsigned int number = 0; +++ const unsigned int eighthPoints = num_points / 8; ++ ++- float* cPtr = cVector; ++- const float* aPtr = aVector; ++- const float* bPtr= bVector; +++ float* cPtr = cVector; +++ const float* aPtr = aVector; +++ const float* bPtr = bVector; ++ ++- __m256 aVal, bVal, cVal; ++- for(;number < eighthPoints; number++){ ++- aVal = _mm256_load_ps(aPtr); ++- bVal = _mm256_load_ps(bPtr); +++ __m256 aVal, bVal, cVal; +++ for (; number < eighthPoints; number++) { +++ aVal = _mm256_load_ps(aPtr); +++ bVal = _mm256_load_ps(bPtr); ++ ++- cVal = _mm256_min_ps(aVal, bVal); +++ cVal = _mm256_min_ps(aVal, bVal); ++ ++- _mm256_store_ps(cPtr,cVal); // Store the results back into the C container +++ _mm256_store_ps(cPtr, cVal); // Store the results back into the C container ++ ++- aPtr += 8; ++- bPtr += 8; ++- cPtr += 8; ++- } +++ aPtr += 8; +++ bPtr += 8; +++ cPtr += 8; +++ } ++ ++- number = eighthPoints * 8; ++- for(;number < num_points; number++){ ++- const float a = *aPtr++; ++- const float b = *bPtr++; ++- *cPtr++ = ( a < b ? a : b); ++- } +++ number = eighthPoints * 8; +++ for (; number < num_points; number++) { +++ const float a = *aPtr++; +++ const float b = *bPtr++; +++ *cPtr++ = (a < b ? 
a : b); +++ } ++ } ++ #endif /* LV_HAVE_AVX */ ++ ++ #ifdef LV_HAVE_AVX512F ++ #include ++ ++-static inline void ++-volk_32f_x2_min_32f_a_avx512f(float* cVector, const float* aVector, ++- const float* bVector, unsigned int num_points) +++static inline void volk_32f_x2_min_32f_a_avx512f(float* cVector, +++ const float* aVector, +++ const float* bVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int sixteenthPoints = num_points / 16; +++ unsigned int number = 0; +++ const unsigned int sixteenthPoints = num_points / 16; ++ ++- float* cPtr = cVector; ++- const float* aPtr = aVector; ++- const float* bPtr= bVector; +++ float* cPtr = cVector; +++ const float* aPtr = aVector; +++ const float* bPtr = bVector; ++ ++- __m512 aVal, bVal, cVal; ++- for(;number < sixteenthPoints; number++){ ++- aVal = _mm512_load_ps(aPtr); ++- bVal = _mm512_load_ps(bPtr); +++ __m512 aVal, bVal, cVal; +++ for (; number < sixteenthPoints; number++) { +++ aVal = _mm512_load_ps(aPtr); +++ bVal = _mm512_load_ps(bPtr); ++ ++- cVal = _mm512_min_ps(aVal, bVal); +++ cVal = _mm512_min_ps(aVal, bVal); ++ ++- _mm512_store_ps(cPtr,cVal); // Store the results back into the C container +++ _mm512_store_ps(cPtr, cVal); // Store the results back into the C container ++ ++- aPtr += 16; ++- bPtr += 16; ++- cPtr += 16; ++- } +++ aPtr += 16; +++ bPtr += 16; +++ cPtr += 16; +++ } ++ ++- number = sixteenthPoints * 16; ++- for(;number < num_points; number++){ ++- const float a = *aPtr++; ++- const float b = *bPtr++; ++- *cPtr++ = ( a < b ? a : b); ++- } +++ number = sixteenthPoints * 16; +++ for (; number < num_points; number++) { +++ const float a = *aPtr++; +++ const float b = *bPtr++; +++ *cPtr++ = (a < b ? a : b); +++ } ++ } ++ #endif /* LV_HAVE_AVX512F */ ++ ++@@ -267,74 +274,76 @@ volk_32f_x2_min_32f_a_avx512f(float* cVector, const float* aVector, ++ #ifdef LV_HAVE_AVX512F ++ #include ++ ++-static inline void ++-volk_32f_x2_min_32f_u_avx512f(float* cVector, const float* aVector, ++- const float* bVector, unsigned int num_points) +++static inline void volk_32f_x2_min_32f_u_avx512f(float* cVector, +++ const float* aVector, +++ const float* bVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int sixteenthPoints = num_points / 16; +++ unsigned int number = 0; +++ const unsigned int sixteenthPoints = num_points / 16; ++ ++- float* cPtr = cVector; ++- const float* aPtr = aVector; ++- const float* bPtr= bVector; +++ float* cPtr = cVector; +++ const float* aPtr = aVector; +++ const float* bPtr = bVector; ++ ++- __m512 aVal, bVal, cVal; ++- for(;number < sixteenthPoints; number++){ ++- aVal = _mm512_loadu_ps(aPtr); ++- bVal = _mm512_loadu_ps(bPtr); +++ __m512 aVal, bVal, cVal; +++ for (; number < sixteenthPoints; number++) { +++ aVal = _mm512_loadu_ps(aPtr); +++ bVal = _mm512_loadu_ps(bPtr); ++ ++- cVal = _mm512_min_ps(aVal, bVal); +++ cVal = _mm512_min_ps(aVal, bVal); ++ ++- _mm512_storeu_ps(cPtr,cVal); // Store the results back into the C container +++ _mm512_storeu_ps(cPtr, cVal); // Store the results back into the C container ++ ++- aPtr += 16; ++- bPtr += 16; ++- cPtr += 16; ++- } +++ aPtr += 16; +++ bPtr += 16; +++ cPtr += 16; +++ } ++ ++- number = sixteenthPoints * 16; ++- for(;number < num_points; number++){ ++- const float a = *aPtr++; ++- const float b = *bPtr++; ++- *cPtr++ = ( a < b ? a : b); ++- } +++ number = sixteenthPoints * 16; +++ for (; number < num_points; number++) { +++ const float a = *aPtr++; +++ const float b = *bPtr++; +++ *cPtr++ = (a < b ? 
a : b); +++ } ++ } ++ #endif /* LV_HAVE_AVX512F */ ++ ++ #ifdef LV_HAVE_AVX ++ #include ++ ++-static inline void ++-volk_32f_x2_min_32f_u_avx(float* cVector, const float* aVector, ++- const float* bVector, unsigned int num_points) +++static inline void volk_32f_x2_min_32f_u_avx(float* cVector, +++ const float* aVector, +++ const float* bVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int eighthPoints = num_points / 8; +++ unsigned int number = 0; +++ const unsigned int eighthPoints = num_points / 8; ++ ++- float* cPtr = cVector; ++- const float* aPtr = aVector; ++- const float* bPtr= bVector; +++ float* cPtr = cVector; +++ const float* aPtr = aVector; +++ const float* bPtr = bVector; ++ ++- __m256 aVal, bVal, cVal; ++- for(;number < eighthPoints; number++){ ++- aVal = _mm256_loadu_ps(aPtr); ++- bVal = _mm256_loadu_ps(bPtr); +++ __m256 aVal, bVal, cVal; +++ for (; number < eighthPoints; number++) { +++ aVal = _mm256_loadu_ps(aPtr); +++ bVal = _mm256_loadu_ps(bPtr); ++ ++- cVal = _mm256_min_ps(aVal, bVal); +++ cVal = _mm256_min_ps(aVal, bVal); ++ ++- _mm256_storeu_ps(cPtr,cVal); // Store the results back into the C container +++ _mm256_storeu_ps(cPtr, cVal); // Store the results back into the C container ++ ++- aPtr += 8; ++- bPtr += 8; ++- cPtr += 8; ++- } +++ aPtr += 8; +++ bPtr += 8; +++ cPtr += 8; +++ } ++ ++- number = eighthPoints * 8; ++- for(;number < num_points; number++){ ++- const float a = *aPtr++; ++- const float b = *bPtr++; ++- *cPtr++ = ( a < b ? a : b); ++- } +++ number = eighthPoints * 8; +++ for (; number < num_points; number++) { +++ const float a = *aPtr++; +++ const float b = *bPtr++; +++ *cPtr++ = (a < b ? a : b); +++ } ++ } ++ #endif /* LV_HAVE_AVX */ ++ ++diff --git a/kernels/volk/volk_32f_x2_multiply_32f.h b/kernels/volk/volk_32f_x2_multiply_32f.h ++index deb9ae3..eebba18 100644 ++--- a/kernels/volk/volk_32f_x2_multiply_32f.h +++++ b/kernels/volk/volk_32f_x2_multiply_32f.h ++@@ -31,8 +31,8 @@ ++ * ++ * Dispatcher Prototype ++ * \code ++- * void volk_32f_x2_multiply_32f(float* cVector, const float* aVector, const float* bVector, unsigned int num_points) ++- * \endcode +++ * void volk_32f_x2_multiply_32f(float* cVector, const float* aVector, const float* +++ * bVector, unsigned int num_points) \endcode ++ * ++ * \b Inputs ++ * \li aVector: First input vector. 
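For the multiply kernels below, the _a_sse variant uses aligned loads and stores (_mm_load_ps/_mm_store_ps) and therefore requires VOLK-aligned buffers, while the _u_sse variant uses _mm_loadu_ps/_mm_storeu_ps and accepts any pointers. The sketch below makes that choice explicit; it assumes the kernel header can be included directly and that the build defines LV_HAVE_SSE, and in normal use the public volk_32f_x2_multiply_32f dispatcher makes this decision for you. The helper name multiply_pick_sse is illustrative only.

#include <stdint.h>
#include <volk/volk.h>
#include <volk/volk_32f_x2_multiply_32f.h>

static inline void multiply_pick_sse(float* cVector,
                                     const float* aVector,
                                     const float* bVector,
                                     unsigned int num_points)
{
    const size_t align = volk_get_alignment();
    const int all_aligned = ((uintptr_t)aVector % align == 0) &&
                            ((uintptr_t)bVector % align == 0) &&
                            ((uintptr_t)cVector % align == 0);

    if (all_aligned)
        volk_32f_x2_multiply_32f_a_sse(cVector, aVector, bVector, num_points);
    else
        volk_32f_x2_multiply_32f_u_sse(cVector, aVector, bVector, num_points);
}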
++@@ -77,126 +77,130 @@ ++ #ifdef LV_HAVE_SSE ++ #include ++ ++-static inline void ++-volk_32f_x2_multiply_32f_u_sse(float* cVector, const float* aVector, ++- const float* bVector, unsigned int num_points) +++static inline void volk_32f_x2_multiply_32f_u_sse(float* cVector, +++ const float* aVector, +++ const float* bVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int quarterPoints = num_points / 4; +++ unsigned int number = 0; +++ const unsigned int quarterPoints = num_points / 4; ++ ++- float* cPtr = cVector; ++- const float* aPtr = aVector; ++- const float* bPtr= bVector; +++ float* cPtr = cVector; +++ const float* aPtr = aVector; +++ const float* bPtr = bVector; ++ ++- __m128 aVal, bVal, cVal; ++- for(;number < quarterPoints; number++){ +++ __m128 aVal, bVal, cVal; +++ for (; number < quarterPoints; number++) { ++ ++- aVal = _mm_loadu_ps(aPtr); ++- bVal = _mm_loadu_ps(bPtr); +++ aVal = _mm_loadu_ps(aPtr); +++ bVal = _mm_loadu_ps(bPtr); ++ ++- cVal = _mm_mul_ps(aVal, bVal); +++ cVal = _mm_mul_ps(aVal, bVal); ++ ++- _mm_storeu_ps(cPtr,cVal); // Store the results back into the C container +++ _mm_storeu_ps(cPtr, cVal); // Store the results back into the C container ++ ++- aPtr += 4; ++- bPtr += 4; ++- cPtr += 4; ++- } +++ aPtr += 4; +++ bPtr += 4; +++ cPtr += 4; +++ } ++ ++- number = quarterPoints * 4; ++- for(;number < num_points; number++){ ++- *cPtr++ = (*aPtr++) * (*bPtr++); ++- } +++ number = quarterPoints * 4; +++ for (; number < num_points; number++) { +++ *cPtr++ = (*aPtr++) * (*bPtr++); +++ } ++ } ++ #endif /* LV_HAVE_SSE */ ++ ++ #ifdef LV_HAVE_AVX512F ++ #include ++ ++-static inline void ++-volk_32f_x2_multiply_32f_u_avx512f(float* cVector, const float* aVector, ++- const float* bVector, unsigned int num_points) +++static inline void volk_32f_x2_multiply_32f_u_avx512f(float* cVector, +++ const float* aVector, +++ const float* bVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int sixteenthPoints = num_points / 16; +++ unsigned int number = 0; +++ const unsigned int sixteenthPoints = num_points / 16; ++ ++- float* cPtr = cVector; ++- const float* aPtr = aVector; ++- const float* bPtr= bVector; +++ float* cPtr = cVector; +++ const float* aPtr = aVector; +++ const float* bPtr = bVector; ++ ++- __m512 aVal, bVal, cVal; ++- for(;number < sixteenthPoints; number++){ +++ __m512 aVal, bVal, cVal; +++ for (; number < sixteenthPoints; number++) { ++ ++- aVal = _mm512_loadu_ps(aPtr); ++- bVal = _mm512_loadu_ps(bPtr); +++ aVal = _mm512_loadu_ps(aPtr); +++ bVal = _mm512_loadu_ps(bPtr); ++ ++- cVal = _mm512_mul_ps(aVal, bVal); +++ cVal = _mm512_mul_ps(aVal, bVal); ++ ++- _mm512_storeu_ps(cPtr,cVal); // Store the results back into the C container +++ _mm512_storeu_ps(cPtr, cVal); // Store the results back into the C container ++ ++- aPtr += 16; ++- bPtr += 16; ++- cPtr += 16; ++- } +++ aPtr += 16; +++ bPtr += 16; +++ cPtr += 16; +++ } ++ ++- number = sixteenthPoints * 16; ++- for(;number < num_points; number++){ ++- *cPtr++ = (*aPtr++) * (*bPtr++); ++- } +++ number = sixteenthPoints * 16; +++ for (; number < num_points; number++) { +++ *cPtr++ = (*aPtr++) * (*bPtr++); +++ } ++ } ++ #endif /* LV_HAVE_AVX512F */ ++ ++ #ifdef LV_HAVE_AVX ++ #include ++ ++-static inline void ++-volk_32f_x2_multiply_32f_u_avx(float* cVector, const float* aVector, ++- const float* bVector, unsigned int num_points) +++static inline void volk_32f_x2_multiply_32f_u_avx(float* cVector, +++ const float* aVector, +++ const float* bVector, 
+++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int eighthPoints = num_points / 8; +++ unsigned int number = 0; +++ const unsigned int eighthPoints = num_points / 8; ++ ++- float* cPtr = cVector; ++- const float* aPtr = aVector; ++- const float* bPtr= bVector; +++ float* cPtr = cVector; +++ const float* aPtr = aVector; +++ const float* bPtr = bVector; ++ ++- __m256 aVal, bVal, cVal; ++- for(;number < eighthPoints; number++){ +++ __m256 aVal, bVal, cVal; +++ for (; number < eighthPoints; number++) { ++ ++- aVal = _mm256_loadu_ps(aPtr); ++- bVal = _mm256_loadu_ps(bPtr); +++ aVal = _mm256_loadu_ps(aPtr); +++ bVal = _mm256_loadu_ps(bPtr); ++ ++- cVal = _mm256_mul_ps(aVal, bVal); +++ cVal = _mm256_mul_ps(aVal, bVal); ++ ++- _mm256_storeu_ps(cPtr,cVal); // Store the results back into the C container +++ _mm256_storeu_ps(cPtr, cVal); // Store the results back into the C container ++ ++- aPtr += 8; ++- bPtr += 8; ++- cPtr += 8; ++- } +++ aPtr += 8; +++ bPtr += 8; +++ cPtr += 8; +++ } ++ ++- number = eighthPoints * 8; ++- for(;number < num_points; number++){ ++- *cPtr++ = (*aPtr++) * (*bPtr++); ++- } +++ number = eighthPoints * 8; +++ for (; number < num_points; number++) { +++ *cPtr++ = (*aPtr++) * (*bPtr++); +++ } ++ } ++ #endif /* LV_HAVE_AVX */ ++ ++ ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void ++-volk_32f_x2_multiply_32f_generic(float* cVector, const float* aVector, ++- const float* bVector, unsigned int num_points) +++static inline void volk_32f_x2_multiply_32f_generic(float* cVector, +++ const float* aVector, +++ const float* bVector, +++ unsigned int num_points) ++ { ++- float* cPtr = cVector; ++- const float* aPtr = aVector; ++- const float* bPtr= bVector; ++- unsigned int number = 0; ++- ++- for(number = 0; number < num_points; number++){ ++- *cPtr++ = (*aPtr++) * (*bPtr++); ++- } +++ float* cPtr = cVector; +++ const float* aPtr = aVector; +++ const float* bPtr = bVector; +++ unsigned int number = 0; +++ +++ for (number = 0; number < num_points; number++) { +++ *cPtr++ = (*aPtr++) * (*bPtr++); +++ } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++@@ -213,72 +217,74 @@ volk_32f_x2_multiply_32f_generic(float* cVector, const float* aVector, ++ #ifdef LV_HAVE_SSE ++ #include ++ ++-static inline void ++-volk_32f_x2_multiply_32f_a_sse(float* cVector, const float* aVector, ++- const float* bVector, unsigned int num_points) +++static inline void volk_32f_x2_multiply_32f_a_sse(float* cVector, +++ const float* aVector, +++ const float* bVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int quarterPoints = num_points / 4; +++ unsigned int number = 0; +++ const unsigned int quarterPoints = num_points / 4; ++ ++- float* cPtr = cVector; ++- const float* aPtr = aVector; ++- const float* bPtr= bVector; +++ float* cPtr = cVector; +++ const float* aPtr = aVector; +++ const float* bPtr = bVector; ++ ++- __m128 aVal, bVal, cVal; ++- for(;number < quarterPoints; number++){ +++ __m128 aVal, bVal, cVal; +++ for (; number < quarterPoints; number++) { ++ ++- aVal = _mm_load_ps(aPtr); ++- bVal = _mm_load_ps(bPtr); +++ aVal = _mm_load_ps(aPtr); +++ bVal = _mm_load_ps(bPtr); ++ ++- cVal = _mm_mul_ps(aVal, bVal); +++ cVal = _mm_mul_ps(aVal, bVal); ++ ++- _mm_store_ps(cPtr,cVal); // Store the results back into the C container +++ _mm_store_ps(cPtr, cVal); // Store the results back into the C container ++ ++- aPtr += 4; ++- bPtr += 4; ++- cPtr += 4; ++- } +++ aPtr += 4; +++ bPtr += 4; +++ cPtr += 4; +++ } ++ ++- number = quarterPoints * 4; ++- 
for(;number < num_points; number++){ ++- *cPtr++ = (*aPtr++) * (*bPtr++); ++- } +++ number = quarterPoints * 4; +++ for (; number < num_points; number++) { +++ *cPtr++ = (*aPtr++) * (*bPtr++); +++ } ++ } ++ #endif /* LV_HAVE_SSE */ ++ ++ #ifdef LV_HAVE_AVX512F ++ #include ++ ++-static inline void ++-volk_32f_x2_multiply_32f_a_avx512f(float* cVector, const float* aVector, ++- const float* bVector, unsigned int num_points) +++static inline void volk_32f_x2_multiply_32f_a_avx512f(float* cVector, +++ const float* aVector, +++ const float* bVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int sixteenthPoints = num_points / 16; +++ unsigned int number = 0; +++ const unsigned int sixteenthPoints = num_points / 16; ++ ++- float* cPtr = cVector; ++- const float* aPtr = aVector; ++- const float* bPtr= bVector; +++ float* cPtr = cVector; +++ const float* aPtr = aVector; +++ const float* bPtr = bVector; ++ ++- __m512 aVal, bVal, cVal; ++- for(;number < sixteenthPoints; number++){ +++ __m512 aVal, bVal, cVal; +++ for (; number < sixteenthPoints; number++) { ++ ++- aVal = _mm512_load_ps(aPtr); ++- bVal = _mm512_load_ps(bPtr); +++ aVal = _mm512_load_ps(aPtr); +++ bVal = _mm512_load_ps(bPtr); ++ ++- cVal = _mm512_mul_ps(aVal, bVal); +++ cVal = _mm512_mul_ps(aVal, bVal); ++ ++- _mm512_store_ps(cPtr,cVal); // Store the results back into the C container +++ _mm512_store_ps(cPtr, cVal); // Store the results back into the C container ++ ++- aPtr += 16; ++- bPtr += 16; ++- cPtr += 16; ++- } +++ aPtr += 16; +++ bPtr += 16; +++ cPtr += 16; +++ } ++ ++- number = sixteenthPoints * 16; ++- for(;number < num_points; number++){ ++- *cPtr++ = (*aPtr++) * (*bPtr++); ++- } +++ number = sixteenthPoints * 16; +++ for (; number < num_points; number++) { +++ *cPtr++ = (*aPtr++) * (*bPtr++); +++ } ++ } ++ #endif /* LV_HAVE_AVX512F */ ++ ++@@ -286,36 +292,37 @@ volk_32f_x2_multiply_32f_a_avx512f(float* cVector, const float* aVector, ++ #ifdef LV_HAVE_AVX ++ #include ++ ++-static inline void ++-volk_32f_x2_multiply_32f_a_avx(float* cVector, const float* aVector, ++- const float* bVector, unsigned int num_points) +++static inline void volk_32f_x2_multiply_32f_a_avx(float* cVector, +++ const float* aVector, +++ const float* bVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int eighthPoints = num_points / 8; +++ unsigned int number = 0; +++ const unsigned int eighthPoints = num_points / 8; ++ ++- float* cPtr = cVector; ++- const float* aPtr = aVector; ++- const float* bPtr= bVector; +++ float* cPtr = cVector; +++ const float* aPtr = aVector; +++ const float* bPtr = bVector; ++ ++- __m256 aVal, bVal, cVal; ++- for(;number < eighthPoints; number++){ +++ __m256 aVal, bVal, cVal; +++ for (; number < eighthPoints; number++) { ++ ++- aVal = _mm256_load_ps(aPtr); ++- bVal = _mm256_load_ps(bPtr); +++ aVal = _mm256_load_ps(aPtr); +++ bVal = _mm256_load_ps(bPtr); ++ ++- cVal = _mm256_mul_ps(aVal, bVal); +++ cVal = _mm256_mul_ps(aVal, bVal); ++ ++- _mm256_store_ps(cPtr,cVal); // Store the results back into the C container +++ _mm256_store_ps(cPtr, cVal); // Store the results back into the C container ++ ++- aPtr += 8; ++- bPtr += 8; ++- cPtr += 8; ++- } +++ aPtr += 8; +++ bPtr += 8; +++ cPtr += 8; +++ } ++ ++- number = eighthPoints * 8; ++- for(;number < num_points; number++){ ++- *cPtr++ = (*aPtr++) * (*bPtr++); ++- } +++ number = eighthPoints * 8; +++ for (; number < num_points; number++) { +++ *cPtr++ = (*aPtr++) * (*bPtr++); +++ } ++ } ++ #endif /* 
LV_HAVE_AVX */ ++ ++@@ -323,57 +330,61 @@ volk_32f_x2_multiply_32f_a_avx(float* cVector, const float* aVector, ++ #ifdef LV_HAVE_NEON ++ #include ++ ++-static inline void ++-volk_32f_x2_multiply_32f_neon(float* cVector, const float* aVector, ++- const float* bVector, unsigned int num_points) +++static inline void volk_32f_x2_multiply_32f_neon(float* cVector, +++ const float* aVector, +++ const float* bVector, +++ unsigned int num_points) ++ { ++- const unsigned int quarter_points = num_points / 4; ++- unsigned int number; ++- float32x4_t avec, bvec, cvec; ++- for(number=0; number < quarter_points; ++number) { ++- avec = vld1q_f32(aVector); ++- bvec = vld1q_f32(bVector); ++- cvec = vmulq_f32(avec, bvec); ++- vst1q_f32(cVector, cvec); ++- aVector += 4; ++- bVector += 4; ++- cVector += 4; ++- } ++- for(number=quarter_points*4; number < num_points; ++number) { ++- *cVector++ = *aVector++ * *bVector++; ++- } +++ const unsigned int quarter_points = num_points / 4; +++ unsigned int number; +++ float32x4_t avec, bvec, cvec; +++ for (number = 0; number < quarter_points; ++number) { +++ avec = vld1q_f32(aVector); +++ bvec = vld1q_f32(bVector); +++ cvec = vmulq_f32(avec, bvec); +++ vst1q_f32(cVector, cvec); +++ aVector += 4; +++ bVector += 4; +++ cVector += 4; +++ } +++ for (number = quarter_points * 4; number < num_points; ++number) { +++ *cVector++ = *aVector++ * *bVector++; +++ } ++ } ++ #endif /* LV_HAVE_NEON */ ++ ++ ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void ++-volk_32f_x2_multiply_32f_a_generic(float* cVector, const float* aVector, ++- const float* bVector, unsigned int num_points) +++static inline void volk_32f_x2_multiply_32f_a_generic(float* cVector, +++ const float* aVector, +++ const float* bVector, +++ unsigned int num_points) ++ { ++- float* cPtr = cVector; ++- const float* aPtr = aVector; ++- const float* bPtr= bVector; ++- unsigned int number = 0; ++- ++- for(number = 0; number < num_points; number++){ ++- *cPtr++ = (*aPtr++) * (*bPtr++); ++- } +++ float* cPtr = cVector; +++ const float* aPtr = aVector; +++ const float* bPtr = bVector; +++ unsigned int number = 0; +++ +++ for (number = 0; number < num_points; number++) { +++ *cPtr++ = (*aPtr++) * (*bPtr++); +++ } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++ ++ #ifdef LV_HAVE_ORC ++-extern void ++-volk_32f_x2_multiply_32f_a_orc_impl(float* cVector, const float* aVector, ++- const float* bVector, unsigned int num_points); ++- ++-static inline void ++-volk_32f_x2_multiply_32f_u_orc(float* cVector, const float* aVector, ++- const float* bVector, unsigned int num_points) +++extern void volk_32f_x2_multiply_32f_a_orc_impl(float* cVector, +++ const float* aVector, +++ const float* bVector, +++ unsigned int num_points); +++ +++static inline void volk_32f_x2_multiply_32f_u_orc(float* cVector, +++ const float* aVector, +++ const float* bVector, +++ unsigned int num_points) ++ { ++- volk_32f_x2_multiply_32f_a_orc_impl(cVector, aVector, bVector, num_points); +++ volk_32f_x2_multiply_32f_a_orc_impl(cVector, aVector, bVector, num_points); ++ } ++ #endif /* LV_HAVE_ORC */ ++ ++diff --git a/kernels/volk/volk_32f_x2_pow_32f.h b/kernels/volk/volk_32f_x2_pow_32f.h ++index daa7f4e..106c57b 100644 ++--- a/kernels/volk/volk_32f_x2_pow_32f.h +++++ b/kernels/volk/volk_32f_x2_pow_32f.h ++@@ -31,8 +31,8 @@ ++ * ++ * Dispatcher Prototype ++ * \code ++- * void volk_32f_x2_pow_32f(float* cVector, const float* bVector, const float* aVector, unsigned int num_points) ++- * \endcode +++ * void volk_32f_x2_pow_32f(float* cVector, const float* bVector, const 
float* aVector, +++ * unsigned int num_points) \endcode ++ * ++ * \b Inputs ++ * \li bVector: The input vector of indices (power values). ++@@ -71,10 +71,10 @@ ++ #ifndef INCLUDED_volk_32f_x2_pow_32f_a_H ++ #define INCLUDED_volk_32f_x2_pow_32f_a_H ++ ++-#include ++-#include ++ #include ++ #include +++#include +++#include ++ ++ #define POW_POLY_DEGREE 3 ++ ++@@ -82,99 +82,130 @@ ++ #include ++ ++ #define POLY0_AVX2_FMA(x, c0) _mm256_set1_ps(c0) ++-#define POLY1_AVX2_FMA(x, c0, c1) _mm256_fmadd_ps(POLY0_AVX2_FMA(x, c1), x, _mm256_set1_ps(c0)) ++-#define POLY2_AVX2_FMA(x, c0, c1, c2) _mm256_fmadd_ps(POLY1_AVX2_FMA(x, c1, c2), x, _mm256_set1_ps(c0)) ++-#define POLY3_AVX2_FMA(x, c0, c1, c2, c3) _mm256_fmadd_ps(POLY2_AVX2_FMA(x, c1, c2, c3), x, _mm256_set1_ps(c0)) ++-#define POLY4_AVX2_FMA(x, c0, c1, c2, c3, c4) _mm256_fmadd_ps(POLY3_AVX2_FMA(x, c1, c2, c3, c4), x, _mm256_set1_ps(c0)) ++-#define POLY5_AVX2_FMA(x, c0, c1, c2, c3, c4, c5) _mm256_fmadd_ps(POLY4_AVX2_FMA(x, c1, c2, c3, c4, c5), x, _mm256_set1_ps(c0)) ++- ++-static inline void ++-volk_32f_x2_pow_32f_a_avx2_fma(float* cVector, const float* bVector, ++- const float* aVector, unsigned int num_points) +++#define POLY1_AVX2_FMA(x, c0, c1) \ +++ _mm256_fmadd_ps(POLY0_AVX2_FMA(x, c1), x, _mm256_set1_ps(c0)) +++#define POLY2_AVX2_FMA(x, c0, c1, c2) \ +++ _mm256_fmadd_ps(POLY1_AVX2_FMA(x, c1, c2), x, _mm256_set1_ps(c0)) +++#define POLY3_AVX2_FMA(x, c0, c1, c2, c3) \ +++ _mm256_fmadd_ps(POLY2_AVX2_FMA(x, c1, c2, c3), x, _mm256_set1_ps(c0)) +++#define POLY4_AVX2_FMA(x, c0, c1, c2, c3, c4) \ +++ _mm256_fmadd_ps(POLY3_AVX2_FMA(x, c1, c2, c3, c4), x, _mm256_set1_ps(c0)) +++#define POLY5_AVX2_FMA(x, c0, c1, c2, c3, c4, c5) \ +++ _mm256_fmadd_ps(POLY4_AVX2_FMA(x, c1, c2, c3, c4, c5), x, _mm256_set1_ps(c0)) +++ +++static inline void volk_32f_x2_pow_32f_a_avx2_fma(float* cVector, +++ const float* bVector, +++ const float* aVector, +++ unsigned int num_points) ++ { ++- float* cPtr = cVector; ++- const float* bPtr = bVector; ++- const float* aPtr = aVector; ++- ++- unsigned int number = 0; ++- const unsigned int eighthPoints = num_points / 8; ++- ++- __m256 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne; ++- __m256 tmp, fx, mask, pow2n, z, y; ++- __m256 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2; ++- __m256 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5; ++- __m256i bias, exp, emm0, pi32_0x7f; ++- ++- one = _mm256_set1_ps(1.0); ++- exp_hi = _mm256_set1_ps(88.3762626647949); ++- exp_lo = _mm256_set1_ps(-88.3762626647949); ++- ln2 = _mm256_set1_ps(0.6931471805); ++- log2EF = _mm256_set1_ps(1.44269504088896341); ++- half = _mm256_set1_ps(0.5); ++- exp_C1 = _mm256_set1_ps(0.693359375); ++- exp_C2 = _mm256_set1_ps(-2.12194440e-4); ++- pi32_0x7f = _mm256_set1_epi32(0x7f); ++- ++- exp_p0 = _mm256_set1_ps(1.9875691500e-4); ++- exp_p1 = _mm256_set1_ps(1.3981999507e-3); ++- exp_p2 = _mm256_set1_ps(8.3334519073e-3); ++- exp_p3 = _mm256_set1_ps(4.1665795894e-2); ++- exp_p4 = _mm256_set1_ps(1.6666665459e-1); ++- exp_p5 = _mm256_set1_ps(5.0000001201e-1); ++- ++- for(;number < eighthPoints; number++){ ++- // First compute the logarithm ++- aVal = _mm256_load_ps(aPtr); ++- bias = _mm256_set1_epi32(127); ++- leadingOne = _mm256_set1_ps(1.0f); ++- exp = _mm256_sub_epi32(_mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal), _mm256_set1_epi32(0x7f800000)), 23), bias); ++- logarithm = _mm256_cvtepi32_ps(exp); ++- ++- frac = _mm256_or_ps(leadingOne, _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff)))); +++ float* cPtr = cVector; 
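    /* Method used below (shared by the FMA, plain AVX2 and SSE4.1 variants in
     * this file): each output is pow(a, b) = exp(b * ln(a)). ln(a) is recovered
     * from the float's exponent bits plus a polynomial in the mantissa whose
     * degree is chosen by POW_POLY_DEGREE (the POLY* macros above), and exp()
     * is evaluated with the usual log2(e) range reduction followed by a short
     * polynomial before rescaling by a power of two; leftover elements fall
     * back to pow() from math.h. */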
+++ const float* bPtr = bVector; +++ const float* aPtr = aVector; +++ +++ unsigned int number = 0; +++ const unsigned int eighthPoints = num_points / 8; +++ +++ __m256 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne; +++ __m256 tmp, fx, mask, pow2n, z, y; +++ __m256 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2; +++ __m256 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5; +++ __m256i bias, exp, emm0, pi32_0x7f; +++ +++ one = _mm256_set1_ps(1.0); +++ exp_hi = _mm256_set1_ps(88.3762626647949); +++ exp_lo = _mm256_set1_ps(-88.3762626647949); +++ ln2 = _mm256_set1_ps(0.6931471805); +++ log2EF = _mm256_set1_ps(1.44269504088896341); +++ half = _mm256_set1_ps(0.5); +++ exp_C1 = _mm256_set1_ps(0.693359375); +++ exp_C2 = _mm256_set1_ps(-2.12194440e-4); +++ pi32_0x7f = _mm256_set1_epi32(0x7f); +++ +++ exp_p0 = _mm256_set1_ps(1.9875691500e-4); +++ exp_p1 = _mm256_set1_ps(1.3981999507e-3); +++ exp_p2 = _mm256_set1_ps(8.3334519073e-3); +++ exp_p3 = _mm256_set1_ps(4.1665795894e-2); +++ exp_p4 = _mm256_set1_ps(1.6666665459e-1); +++ exp_p5 = _mm256_set1_ps(5.0000001201e-1); +++ +++ for (; number < eighthPoints; number++) { +++ // First compute the logarithm +++ aVal = _mm256_load_ps(aPtr); +++ bias = _mm256_set1_epi32(127); +++ leadingOne = _mm256_set1_ps(1.0f); +++ exp = _mm256_sub_epi32( +++ _mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal), +++ _mm256_set1_epi32(0x7f800000)), +++ 23), +++ bias); +++ logarithm = _mm256_cvtepi32_ps(exp); +++ +++ frac = _mm256_or_ps( +++ leadingOne, +++ _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff)))); ++ ++ #if POW_POLY_DEGREE == 6 ++- mantissa = POLY5_AVX2_FMA( frac, 3.1157899f, -3.3241990f, 2.5988452f, -1.2315303f, 3.1821337e-1f, -3.4436006e-2f); +++ mantissa = POLY5_AVX2_FMA(frac, +++ 3.1157899f, +++ -3.3241990f, +++ 2.5988452f, +++ -1.2315303f, +++ 3.1821337e-1f, +++ -3.4436006e-2f); ++ #elif POW_POLY_DEGREE == 5 ++- mantissa = POLY4_AVX2_FMA( frac, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f); +++ mantissa = POLY4_AVX2_FMA(frac, +++ 2.8882704548164776201f, +++ -2.52074962577807006663f, +++ 1.48116647521213171641f, +++ -0.465725644288844778798f, +++ 0.0596515482674574969533f); ++ #elif POW_POLY_DEGREE == 4 ++- mantissa = POLY3_AVX2_FMA( frac, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f); +++ mantissa = POLY3_AVX2_FMA(frac, +++ 2.61761038894603480148f, +++ -1.75647175389045657003f, +++ 0.688243882994381274313f, +++ -0.107254423828329604454f); ++ #elif POW_POLY_DEGREE == 3 ++- mantissa = POLY2_AVX2_FMA( frac, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f); +++ mantissa = POLY2_AVX2_FMA(frac, +++ 2.28330284476918490682f, +++ -1.04913055217340124191f, +++ 0.204446009836232697516f); ++ #else ++ #error ++ #endif ++ ++- logarithm = _mm256_fmadd_ps(mantissa, _mm256_sub_ps(frac, leadingOne), logarithm); ++- logarithm = _mm256_mul_ps(logarithm, ln2); +++ logarithm = _mm256_fmadd_ps(mantissa, _mm256_sub_ps(frac, leadingOne), logarithm); +++ logarithm = _mm256_mul_ps(logarithm, ln2); ++ ++- // Now calculate b*lna ++- bVal = _mm256_load_ps(bPtr); ++- bVal = _mm256_mul_ps(bVal, logarithm); +++ // Now calculate b*lna +++ bVal = _mm256_load_ps(bPtr); +++ bVal = _mm256_mul_ps(bVal, logarithm); ++ ++- // Now compute exp(b*lna) ++- bVal = _mm256_max_ps(_mm256_min_ps(bVal, exp_hi), exp_lo); +++ // Now compute exp(b*lna) +++ bVal = _mm256_max_ps(_mm256_min_ps(bVal, 
exp_hi), exp_lo); ++ ++- fx = _mm256_fmadd_ps(bVal, log2EF, half); +++ fx = _mm256_fmadd_ps(bVal, log2EF, half); ++ ++- emm0 = _mm256_cvttps_epi32(fx); ++- tmp = _mm256_cvtepi32_ps(emm0); +++ emm0 = _mm256_cvttps_epi32(fx); +++ tmp = _mm256_cvtepi32_ps(emm0); ++ ++- mask = _mm256_and_ps(_mm256_cmp_ps(tmp, fx, _CMP_GT_OS), one); ++- fx = _mm256_sub_ps(tmp, mask); +++ mask = _mm256_and_ps(_mm256_cmp_ps(tmp, fx, _CMP_GT_OS), one); +++ fx = _mm256_sub_ps(tmp, mask); ++ ++- tmp = _mm256_fnmadd_ps(fx, exp_C1, bVal); ++- bVal = _mm256_fnmadd_ps(fx, exp_C2, tmp); ++- z = _mm256_mul_ps(bVal, bVal); +++ tmp = _mm256_fnmadd_ps(fx, exp_C1, bVal); +++ bVal = _mm256_fnmadd_ps(fx, exp_C2, tmp); +++ z = _mm256_mul_ps(bVal, bVal); ++ ++- y = _mm256_fmadd_ps(exp_p0, bVal, exp_p1); ++- y = _mm256_fmadd_ps(y, bVal, exp_p2); ++- y = _mm256_fmadd_ps(y, bVal, exp_p3); ++- y = _mm256_fmadd_ps(y, bVal, exp_p4); ++- y = _mm256_fmadd_ps(y, bVal, exp_p5); ++- y = _mm256_fmadd_ps(y, z, bVal); ++- y = _mm256_add_ps(y, one); +++ y = _mm256_fmadd_ps(exp_p0, bVal, exp_p1); +++ y = _mm256_fmadd_ps(y, bVal, exp_p2); +++ y = _mm256_fmadd_ps(y, bVal, exp_p3); +++ y = _mm256_fmadd_ps(y, bVal, exp_p4); +++ y = _mm256_fmadd_ps(y, bVal, exp_p5); +++ y = _mm256_fmadd_ps(y, z, bVal); +++ y = _mm256_add_ps(y, one); ++ ++- emm0 = _mm256_slli_epi32(_mm256_add_epi32(_mm256_cvttps_epi32(fx), pi32_0x7f), 23); +++ emm0 = +++ _mm256_slli_epi32(_mm256_add_epi32(_mm256_cvttps_epi32(fx), pi32_0x7f), 23); ++ ++ pow2n = _mm256_castsi256_ps(emm0); ++ cVal = _mm256_mul_ps(y, pow2n); ++@@ -184,12 +215,12 @@ volk_32f_x2_pow_32f_a_avx2_fma(float* cVector, const float* bVector, ++ aPtr += 8; ++ bPtr += 8; ++ cPtr += 8; ++- } +++ } ++ ++- number = eighthPoints * 8; ++- for(;number < num_points; number++){ ++- *cPtr++ = pow(*aPtr++, *bPtr++); ++- } +++ number = eighthPoints * 8; +++ for (; number < num_points; number++) { +++ *cPtr++ = pow(*aPtr++, *bPtr++); +++ } ++ } ++ ++ #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for aligned */ ++@@ -198,99 +229,131 @@ volk_32f_x2_pow_32f_a_avx2_fma(float* cVector, const float* bVector, ++ #include ++ ++ #define POLY0_AVX2(x, c0) _mm256_set1_ps(c0) ++-#define POLY1_AVX2(x, c0, c1) _mm256_add_ps(_mm256_mul_ps(POLY0_AVX2(x, c1), x), _mm256_set1_ps(c0)) ++-#define POLY2_AVX2(x, c0, c1, c2) _mm256_add_ps(_mm256_mul_ps(POLY1_AVX2(x, c1, c2), x), _mm256_set1_ps(c0)) ++-#define POLY3_AVX2(x, c0, c1, c2, c3) _mm256_add_ps(_mm256_mul_ps(POLY2_AVX2(x, c1, c2, c3), x), _mm256_set1_ps(c0)) ++-#define POLY4_AVX2(x, c0, c1, c2, c3, c4) _mm256_add_ps(_mm256_mul_ps(POLY3_AVX2(x, c1, c2, c3, c4), x), _mm256_set1_ps(c0)) ++-#define POLY5_AVX2(x, c0, c1, c2, c3, c4, c5) _mm256_add_ps(_mm256_mul_ps(POLY4_AVX2(x, c1, c2, c3, c4, c5), x), _mm256_set1_ps(c0)) ++- ++-static inline void ++-volk_32f_x2_pow_32f_a_avx2(float* cVector, const float* bVector, ++- const float* aVector, unsigned int num_points) +++#define POLY1_AVX2(x, c0, c1) \ +++ _mm256_add_ps(_mm256_mul_ps(POLY0_AVX2(x, c1), x), _mm256_set1_ps(c0)) +++#define POLY2_AVX2(x, c0, c1, c2) \ +++ _mm256_add_ps(_mm256_mul_ps(POLY1_AVX2(x, c1, c2), x), _mm256_set1_ps(c0)) +++#define POLY3_AVX2(x, c0, c1, c2, c3) \ +++ _mm256_add_ps(_mm256_mul_ps(POLY2_AVX2(x, c1, c2, c3), x), _mm256_set1_ps(c0)) +++#define POLY4_AVX2(x, c0, c1, c2, c3, c4) \ +++ _mm256_add_ps(_mm256_mul_ps(POLY3_AVX2(x, c1, c2, c3, c4), x), _mm256_set1_ps(c0)) +++#define POLY5_AVX2(x, c0, c1, c2, c3, c4, c5) \ +++ _mm256_add_ps(_mm256_mul_ps(POLY4_AVX2(x, c1, c2, c3, c4, c5), x), _mm256_set1_ps(c0)) +++ +++static inline 
void volk_32f_x2_pow_32f_a_avx2(float* cVector, +++ const float* bVector, +++ const float* aVector, +++ unsigned int num_points) ++ { ++- float* cPtr = cVector; ++- const float* bPtr = bVector; ++- const float* aPtr = aVector; ++- ++- unsigned int number = 0; ++- const unsigned int eighthPoints = num_points / 8; ++- ++- __m256 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne; ++- __m256 tmp, fx, mask, pow2n, z, y; ++- __m256 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2; ++- __m256 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5; ++- __m256i bias, exp, emm0, pi32_0x7f; ++- ++- one = _mm256_set1_ps(1.0); ++- exp_hi = _mm256_set1_ps(88.3762626647949); ++- exp_lo = _mm256_set1_ps(-88.3762626647949); ++- ln2 = _mm256_set1_ps(0.6931471805); ++- log2EF = _mm256_set1_ps(1.44269504088896341); ++- half = _mm256_set1_ps(0.5); ++- exp_C1 = _mm256_set1_ps(0.693359375); ++- exp_C2 = _mm256_set1_ps(-2.12194440e-4); ++- pi32_0x7f = _mm256_set1_epi32(0x7f); ++- ++- exp_p0 = _mm256_set1_ps(1.9875691500e-4); ++- exp_p1 = _mm256_set1_ps(1.3981999507e-3); ++- exp_p2 = _mm256_set1_ps(8.3334519073e-3); ++- exp_p3 = _mm256_set1_ps(4.1665795894e-2); ++- exp_p4 = _mm256_set1_ps(1.6666665459e-1); ++- exp_p5 = _mm256_set1_ps(5.0000001201e-1); ++- ++- for(;number < eighthPoints; number++){ ++- // First compute the logarithm ++- aVal = _mm256_load_ps(aPtr); ++- bias = _mm256_set1_epi32(127); ++- leadingOne = _mm256_set1_ps(1.0f); ++- exp = _mm256_sub_epi32(_mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal), _mm256_set1_epi32(0x7f800000)), 23), bias); ++- logarithm = _mm256_cvtepi32_ps(exp); ++- ++- frac = _mm256_or_ps(leadingOne, _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff)))); +++ float* cPtr = cVector; +++ const float* bPtr = bVector; +++ const float* aPtr = aVector; +++ +++ unsigned int number = 0; +++ const unsigned int eighthPoints = num_points / 8; +++ +++ __m256 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne; +++ __m256 tmp, fx, mask, pow2n, z, y; +++ __m256 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2; +++ __m256 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5; +++ __m256i bias, exp, emm0, pi32_0x7f; +++ +++ one = _mm256_set1_ps(1.0); +++ exp_hi = _mm256_set1_ps(88.3762626647949); +++ exp_lo = _mm256_set1_ps(-88.3762626647949); +++ ln2 = _mm256_set1_ps(0.6931471805); +++ log2EF = _mm256_set1_ps(1.44269504088896341); +++ half = _mm256_set1_ps(0.5); +++ exp_C1 = _mm256_set1_ps(0.693359375); +++ exp_C2 = _mm256_set1_ps(-2.12194440e-4); +++ pi32_0x7f = _mm256_set1_epi32(0x7f); +++ +++ exp_p0 = _mm256_set1_ps(1.9875691500e-4); +++ exp_p1 = _mm256_set1_ps(1.3981999507e-3); +++ exp_p2 = _mm256_set1_ps(8.3334519073e-3); +++ exp_p3 = _mm256_set1_ps(4.1665795894e-2); +++ exp_p4 = _mm256_set1_ps(1.6666665459e-1); +++ exp_p5 = _mm256_set1_ps(5.0000001201e-1); +++ +++ for (; number < eighthPoints; number++) { +++ // First compute the logarithm +++ aVal = _mm256_load_ps(aPtr); +++ bias = _mm256_set1_epi32(127); +++ leadingOne = _mm256_set1_ps(1.0f); +++ exp = _mm256_sub_epi32( +++ _mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal), +++ _mm256_set1_epi32(0x7f800000)), +++ 23), +++ bias); +++ logarithm = _mm256_cvtepi32_ps(exp); +++ +++ frac = _mm256_or_ps( +++ leadingOne, +++ _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff)))); ++ ++ #if POW_POLY_DEGREE == 6 ++- mantissa = POLY5_AVX2( frac, 3.1157899f, -3.3241990f, 2.5988452f, -1.2315303f, 3.1821337e-1f, -3.4436006e-2f); +++ mantissa = POLY5_AVX2(frac, +++ 3.1157899f, +++ 
-3.3241990f, +++ 2.5988452f, +++ -1.2315303f, +++ 3.1821337e-1f, +++ -3.4436006e-2f); ++ #elif POW_POLY_DEGREE == 5 ++- mantissa = POLY4_AVX2( frac, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f); +++ mantissa = POLY4_AVX2(frac, +++ 2.8882704548164776201f, +++ -2.52074962577807006663f, +++ 1.48116647521213171641f, +++ -0.465725644288844778798f, +++ 0.0596515482674574969533f); ++ #elif POW_POLY_DEGREE == 4 ++- mantissa = POLY3_AVX2( frac, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f); +++ mantissa = POLY3_AVX2(frac, +++ 2.61761038894603480148f, +++ -1.75647175389045657003f, +++ 0.688243882994381274313f, +++ -0.107254423828329604454f); ++ #elif POW_POLY_DEGREE == 3 ++- mantissa = POLY2_AVX2( frac, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f); +++ mantissa = POLY2_AVX2(frac, +++ 2.28330284476918490682f, +++ -1.04913055217340124191f, +++ 0.204446009836232697516f); ++ #else ++ #error ++ #endif ++ ++- logarithm = _mm256_add_ps(_mm256_mul_ps(mantissa, _mm256_sub_ps(frac, leadingOne)), logarithm); ++- logarithm = _mm256_mul_ps(logarithm, ln2); +++ logarithm = _mm256_add_ps( +++ _mm256_mul_ps(mantissa, _mm256_sub_ps(frac, leadingOne)), logarithm); +++ logarithm = _mm256_mul_ps(logarithm, ln2); ++ ++- // Now calculate b*lna ++- bVal = _mm256_load_ps(bPtr); ++- bVal = _mm256_mul_ps(bVal, logarithm); +++ // Now calculate b*lna +++ bVal = _mm256_load_ps(bPtr); +++ bVal = _mm256_mul_ps(bVal, logarithm); ++ ++- // Now compute exp(b*lna) ++- bVal = _mm256_max_ps(_mm256_min_ps(bVal, exp_hi), exp_lo); +++ // Now compute exp(b*lna) +++ bVal = _mm256_max_ps(_mm256_min_ps(bVal, exp_hi), exp_lo); ++ ++- fx = _mm256_add_ps(_mm256_mul_ps(bVal, log2EF), half); +++ fx = _mm256_add_ps(_mm256_mul_ps(bVal, log2EF), half); ++ ++- emm0 = _mm256_cvttps_epi32(fx); ++- tmp = _mm256_cvtepi32_ps(emm0); +++ emm0 = _mm256_cvttps_epi32(fx); +++ tmp = _mm256_cvtepi32_ps(emm0); ++ ++- mask = _mm256_and_ps(_mm256_cmp_ps(tmp, fx, _CMP_GT_OS), one); ++- fx = _mm256_sub_ps(tmp, mask); +++ mask = _mm256_and_ps(_mm256_cmp_ps(tmp, fx, _CMP_GT_OS), one); +++ fx = _mm256_sub_ps(tmp, mask); ++ ++- tmp = _mm256_sub_ps(bVal, _mm256_mul_ps(fx, exp_C1)); ++- bVal = _mm256_sub_ps(tmp, _mm256_mul_ps(fx, exp_C2)); ++- z = _mm256_mul_ps(bVal, bVal); +++ tmp = _mm256_sub_ps(bVal, _mm256_mul_ps(fx, exp_C1)); +++ bVal = _mm256_sub_ps(tmp, _mm256_mul_ps(fx, exp_C2)); +++ z = _mm256_mul_ps(bVal, bVal); ++ ++- y = _mm256_add_ps(_mm256_mul_ps(exp_p0, bVal), exp_p1); ++- y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p2); ++- y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p3); ++- y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p4); ++- y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p5); ++- y = _mm256_add_ps(_mm256_mul_ps(y, z), bVal); ++- y = _mm256_add_ps(y, one); +++ y = _mm256_add_ps(_mm256_mul_ps(exp_p0, bVal), exp_p1); +++ y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p2); +++ y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p3); +++ y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p4); +++ y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p5); +++ y = _mm256_add_ps(_mm256_mul_ps(y, z), bVal); +++ y = _mm256_add_ps(y, one); ++ ++- emm0 = _mm256_slli_epi32(_mm256_add_epi32(_mm256_cvttps_epi32(fx), pi32_0x7f), 23); +++ emm0 = +++ _mm256_slli_epi32(_mm256_add_epi32(_mm256_cvttps_epi32(fx), pi32_0x7f), 23); ++ ++ pow2n = _mm256_castsi256_ps(emm0); ++ cVal = _mm256_mul_ps(y, pow2n); ++@@ -300,12 
+363,12 @@ volk_32f_x2_pow_32f_a_avx2(float* cVector, const float* bVector, ++ aPtr += 8; ++ bPtr += 8; ++ cPtr += 8; ++- } +++ } ++ ++- number = eighthPoints * 8; ++- for(;number < num_points; number++){ ++- *cPtr++ = pow(*aPtr++, *bPtr++); ++- } +++ number = eighthPoints * 8; +++ for (; number < num_points; number++) { +++ *cPtr++ = pow(*aPtr++, *bPtr++); +++ } ++ } ++ ++ #endif /* LV_HAVE_AVX2 for aligned */ ++@@ -317,97 +380,124 @@ volk_32f_x2_pow_32f_a_avx2(float* cVector, const float* bVector, ++ #define POLY0(x, c0) _mm_set1_ps(c0) ++ #define POLY1(x, c0, c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), _mm_set1_ps(c0)) ++ #define POLY2(x, c0, c1, c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), _mm_set1_ps(c0)) ++-#define POLY3(x, c0, c1, c2, c3) _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0)) ++-#define POLY4(x, c0, c1, c2, c3, c4) _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0)) ++-#define POLY5(x, c0, c1, c2, c3, c4, c5) _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0)) ++- ++-static inline void ++-volk_32f_x2_pow_32f_a_sse4_1(float* cVector, const float* bVector, ++- const float* aVector, unsigned int num_points) +++#define POLY3(x, c0, c1, c2, c3) \ +++ _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0)) +++#define POLY4(x, c0, c1, c2, c3, c4) \ +++ _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0)) +++#define POLY5(x, c0, c1, c2, c3, c4, c5) \ +++ _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0)) +++ +++static inline void volk_32f_x2_pow_32f_a_sse4_1(float* cVector, +++ const float* bVector, +++ const float* aVector, +++ unsigned int num_points) ++ { ++- float* cPtr = cVector; ++- const float* bPtr = bVector; ++- const float* aPtr = aVector; ++- ++- unsigned int number = 0; ++- const unsigned int quarterPoints = num_points / 4; ++- ++- __m128 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne; ++- __m128 tmp, fx, mask, pow2n, z, y; ++- __m128 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2; ++- __m128 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5; ++- __m128i bias, exp, emm0, pi32_0x7f; ++- ++- one = _mm_set1_ps(1.0); ++- exp_hi = _mm_set1_ps(88.3762626647949); ++- exp_lo = _mm_set1_ps(-88.3762626647949); ++- ln2 = _mm_set1_ps(0.6931471805); ++- log2EF = _mm_set1_ps(1.44269504088896341); ++- half = _mm_set1_ps(0.5); ++- exp_C1 = _mm_set1_ps(0.693359375); ++- exp_C2 = _mm_set1_ps(-2.12194440e-4); ++- pi32_0x7f = _mm_set1_epi32(0x7f); ++- ++- exp_p0 = _mm_set1_ps(1.9875691500e-4); ++- exp_p1 = _mm_set1_ps(1.3981999507e-3); ++- exp_p2 = _mm_set1_ps(8.3334519073e-3); ++- exp_p3 = _mm_set1_ps(4.1665795894e-2); ++- exp_p4 = _mm_set1_ps(1.6666665459e-1); ++- exp_p5 = _mm_set1_ps(5.0000001201e-1); ++- ++- for(;number < quarterPoints; number++){ ++- // First compute the logarithm ++- aVal = _mm_load_ps(aPtr); ++- bias = _mm_set1_epi32(127); ++- leadingOne = _mm_set1_ps(1.0f); ++- exp = _mm_sub_epi32(_mm_srli_epi32(_mm_and_si128(_mm_castps_si128(aVal), _mm_set1_epi32(0x7f800000)), 23), bias); ++- logarithm = _mm_cvtepi32_ps(exp); ++- ++- frac = _mm_or_ps(leadingOne, _mm_and_ps(aVal, _mm_castsi128_ps(_mm_set1_epi32(0x7fffff)))); +++ float* cPtr = cVector; +++ const float* bPtr = bVector; +++ const float* aPtr = aVector; +++ +++ unsigned int number = 0; +++ const unsigned int quarterPoints = num_points / 4; +++ +++ __m128 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne; +++ __m128 tmp, fx, mask, pow2n, z, y; +++ __m128 one, exp_hi, exp_lo, ln2, log2EF, half, 
exp_C1, exp_C2; +++ __m128 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5; +++ __m128i bias, exp, emm0, pi32_0x7f; +++ +++ one = _mm_set1_ps(1.0); +++ exp_hi = _mm_set1_ps(88.3762626647949); +++ exp_lo = _mm_set1_ps(-88.3762626647949); +++ ln2 = _mm_set1_ps(0.6931471805); +++ log2EF = _mm_set1_ps(1.44269504088896341); +++ half = _mm_set1_ps(0.5); +++ exp_C1 = _mm_set1_ps(0.693359375); +++ exp_C2 = _mm_set1_ps(-2.12194440e-4); +++ pi32_0x7f = _mm_set1_epi32(0x7f); +++ +++ exp_p0 = _mm_set1_ps(1.9875691500e-4); +++ exp_p1 = _mm_set1_ps(1.3981999507e-3); +++ exp_p2 = _mm_set1_ps(8.3334519073e-3); +++ exp_p3 = _mm_set1_ps(4.1665795894e-2); +++ exp_p4 = _mm_set1_ps(1.6666665459e-1); +++ exp_p5 = _mm_set1_ps(5.0000001201e-1); +++ +++ for (; number < quarterPoints; number++) { +++ // First compute the logarithm +++ aVal = _mm_load_ps(aPtr); +++ bias = _mm_set1_epi32(127); +++ leadingOne = _mm_set1_ps(1.0f); +++ exp = _mm_sub_epi32( +++ _mm_srli_epi32( +++ _mm_and_si128(_mm_castps_si128(aVal), _mm_set1_epi32(0x7f800000)), 23), +++ bias); +++ logarithm = _mm_cvtepi32_ps(exp); +++ +++ frac = _mm_or_ps(leadingOne, +++ _mm_and_ps(aVal, _mm_castsi128_ps(_mm_set1_epi32(0x7fffff)))); ++ ++ #if POW_POLY_DEGREE == 6 ++- mantissa = POLY5( frac, 3.1157899f, -3.3241990f, 2.5988452f, -1.2315303f, 3.1821337e-1f, -3.4436006e-2f); +++ mantissa = POLY5(frac, +++ 3.1157899f, +++ -3.3241990f, +++ 2.5988452f, +++ -1.2315303f, +++ 3.1821337e-1f, +++ -3.4436006e-2f); ++ #elif POW_POLY_DEGREE == 5 ++- mantissa = POLY4( frac, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f); +++ mantissa = POLY4(frac, +++ 2.8882704548164776201f, +++ -2.52074962577807006663f, +++ 1.48116647521213171641f, +++ -0.465725644288844778798f, +++ 0.0596515482674574969533f); ++ #elif POW_POLY_DEGREE == 4 ++- mantissa = POLY3( frac, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f); +++ mantissa = POLY3(frac, +++ 2.61761038894603480148f, +++ -1.75647175389045657003f, +++ 0.688243882994381274313f, +++ -0.107254423828329604454f); ++ #elif POW_POLY_DEGREE == 3 ++- mantissa = POLY2( frac, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f); +++ mantissa = POLY2(frac, +++ 2.28330284476918490682f, +++ -1.04913055217340124191f, +++ 0.204446009836232697516f); ++ #else ++ #error ++ #endif ++ ++- logarithm = _mm_add_ps(logarithm, _mm_mul_ps(mantissa, _mm_sub_ps(frac, leadingOne))); ++- logarithm = _mm_mul_ps(logarithm, ln2); +++ logarithm = +++ _mm_add_ps(logarithm, _mm_mul_ps(mantissa, _mm_sub_ps(frac, leadingOne))); +++ logarithm = _mm_mul_ps(logarithm, ln2); ++ ++ ++- // Now calculate b*lna ++- bVal = _mm_load_ps(bPtr); ++- bVal = _mm_mul_ps(bVal, logarithm); +++ // Now calculate b*lna +++ bVal = _mm_load_ps(bPtr); +++ bVal = _mm_mul_ps(bVal, logarithm); ++ ++- // Now compute exp(b*lna) ++- bVal = _mm_max_ps(_mm_min_ps(bVal, exp_hi), exp_lo); +++ // Now compute exp(b*lna) +++ bVal = _mm_max_ps(_mm_min_ps(bVal, exp_hi), exp_lo); ++ ++- fx = _mm_add_ps(_mm_mul_ps(bVal, log2EF), half); +++ fx = _mm_add_ps(_mm_mul_ps(bVal, log2EF), half); ++ ++- emm0 = _mm_cvttps_epi32(fx); ++- tmp = _mm_cvtepi32_ps(emm0); +++ emm0 = _mm_cvttps_epi32(fx); +++ tmp = _mm_cvtepi32_ps(emm0); ++ ++- mask = _mm_and_ps(_mm_cmpgt_ps(tmp, fx), one); ++- fx = _mm_sub_ps(tmp, mask); +++ mask = _mm_and_ps(_mm_cmpgt_ps(tmp, fx), one); +++ fx = _mm_sub_ps(tmp, mask); ++ ++- tmp = _mm_mul_ps(fx, exp_C1); ++- z = _mm_mul_ps(fx, 
exp_C2); ++- bVal = _mm_sub_ps(_mm_sub_ps(bVal, tmp), z); ++- z = _mm_mul_ps(bVal, bVal); +++ tmp = _mm_mul_ps(fx, exp_C1); +++ z = _mm_mul_ps(fx, exp_C2); +++ bVal = _mm_sub_ps(_mm_sub_ps(bVal, tmp), z); +++ z = _mm_mul_ps(bVal, bVal); ++ ++- y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(exp_p0, bVal), exp_p1), bVal); ++- y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p2), bVal), exp_p3); ++- y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(y, bVal), exp_p4), bVal); ++- y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p5), z), bVal); ++- y = _mm_add_ps(y, one); +++ y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(exp_p0, bVal), exp_p1), bVal); +++ y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p2), bVal), exp_p3); +++ y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(y, bVal), exp_p4), bVal); +++ y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p5), z), bVal); +++ y = _mm_add_ps(y, one); ++ ++- emm0 = _mm_slli_epi32(_mm_add_epi32(_mm_cvttps_epi32(fx), pi32_0x7f), 23); +++ emm0 = _mm_slli_epi32(_mm_add_epi32(_mm_cvttps_epi32(fx), pi32_0x7f), 23); ++ ++ pow2n = _mm_castsi128_ps(emm0); ++ cVal = _mm_mul_ps(y, pow2n); ++@@ -417,12 +507,12 @@ volk_32f_x2_pow_32f_a_sse4_1(float* cVector, const float* bVector, ++ aPtr += 4; ++ bPtr += 4; ++ cPtr += 4; ++- } +++ } ++ ++- number = quarterPoints * 4; ++- for(;number < num_points; number++){ ++- *cPtr++ = powf(*aPtr++, *bPtr++); ++- } +++ number = quarterPoints * 4; +++ for (; number < num_points; number++) { +++ *cPtr++ = powf(*aPtr++, *bPtr++); +++ } ++ } ++ ++ #endif /* LV_HAVE_SSE4_1 for aligned */ ++@@ -432,27 +522,28 @@ volk_32f_x2_pow_32f_a_sse4_1(float* cVector, const float* bVector, ++ #ifndef INCLUDED_volk_32f_x2_pow_32f_u_H ++ #define INCLUDED_volk_32f_x2_pow_32f_u_H ++ ++-#include ++-#include ++ #include ++ #include +++#include +++#include ++ ++ #define POW_POLY_DEGREE 3 ++ ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void ++-volk_32f_x2_pow_32f_generic(float* cVector, const float* bVector, ++- const float* aVector, unsigned int num_points) +++static inline void volk_32f_x2_pow_32f_generic(float* cVector, +++ const float* bVector, +++ const float* aVector, +++ unsigned int num_points) ++ { ++- float* cPtr = cVector; ++- const float* bPtr = bVector; ++- const float* aPtr = aVector; ++- unsigned int number = 0; ++- ++- for(number = 0; number < num_points; number++){ ++- *cPtr++ = powf(*aPtr++, *bPtr++); ++- } +++ float* cPtr = cVector; +++ const float* bPtr = bVector; +++ const float* aPtr = aVector; +++ unsigned int number = 0; +++ +++ for (number = 0; number < num_points; number++) { +++ *cPtr++ = powf(*aPtr++, *bPtr++); +++ } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++@@ -463,112 +554,139 @@ volk_32f_x2_pow_32f_generic(float* cVector, const float* bVector, ++ #define POLY0(x, c0) _mm_set1_ps(c0) ++ #define POLY1(x, c0, c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), _mm_set1_ps(c0)) ++ #define POLY2(x, c0, c1, c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), _mm_set1_ps(c0)) ++-#define POLY3(x, c0, c1, c2, c3) _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0)) ++-#define POLY4(x, c0, c1, c2, c3, c4) _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0)) ++-#define POLY5(x, c0, c1, c2, c3, c4, c5) _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0)) ++- ++-static inline void ++-volk_32f_x2_pow_32f_u_sse4_1(float* cVector, const float* bVector, ++- const float* aVector, unsigned int num_points) +++#define POLY3(x, c0, c1, c2, c3) \ +++ _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0)) +++#define POLY4(x, c0, c1, c2, c3, c4) \ +++ 
_mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0)) +++#define POLY5(x, c0, c1, c2, c3, c4, c5) \ +++ _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0)) +++ +++static inline void volk_32f_x2_pow_32f_u_sse4_1(float* cVector, +++ const float* bVector, +++ const float* aVector, +++ unsigned int num_points) ++ { ++- float* cPtr = cVector; ++- const float* bPtr = bVector; ++- const float* aPtr = aVector; ++- ++- unsigned int number = 0; ++- const unsigned int quarterPoints = num_points / 4; ++- ++- __m128 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne; ++- __m128 tmp, fx, mask, pow2n, z, y; ++- __m128 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2; ++- __m128 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5; ++- __m128i bias, exp, emm0, pi32_0x7f; ++- ++- one = _mm_set1_ps(1.0); ++- exp_hi = _mm_set1_ps(88.3762626647949); ++- exp_lo = _mm_set1_ps(-88.3762626647949); ++- ln2 = _mm_set1_ps(0.6931471805); ++- log2EF = _mm_set1_ps(1.44269504088896341); ++- half = _mm_set1_ps(0.5); ++- exp_C1 = _mm_set1_ps(0.693359375); ++- exp_C2 = _mm_set1_ps(-2.12194440e-4); ++- pi32_0x7f = _mm_set1_epi32(0x7f); ++- ++- exp_p0 = _mm_set1_ps(1.9875691500e-4); ++- exp_p1 = _mm_set1_ps(1.3981999507e-3); ++- exp_p2 = _mm_set1_ps(8.3334519073e-3); ++- exp_p3 = _mm_set1_ps(4.1665795894e-2); ++- exp_p4 = _mm_set1_ps(1.6666665459e-1); ++- exp_p5 = _mm_set1_ps(5.0000001201e-1); ++- ++- for(;number < quarterPoints; number++){ ++- // First compute the logarithm ++- aVal = _mm_loadu_ps(aPtr); ++- bias = _mm_set1_epi32(127); ++- leadingOne = _mm_set1_ps(1.0f); ++- exp = _mm_sub_epi32(_mm_srli_epi32(_mm_and_si128(_mm_castps_si128(aVal), _mm_set1_epi32(0x7f800000)), 23), bias); ++- logarithm = _mm_cvtepi32_ps(exp); ++- ++- frac = _mm_or_ps(leadingOne, _mm_and_ps(aVal, _mm_castsi128_ps(_mm_set1_epi32(0x7fffff)))); +++ float* cPtr = cVector; +++ const float* bPtr = bVector; +++ const float* aPtr = aVector; +++ +++ unsigned int number = 0; +++ const unsigned int quarterPoints = num_points / 4; +++ +++ __m128 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne; +++ __m128 tmp, fx, mask, pow2n, z, y; +++ __m128 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2; +++ __m128 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5; +++ __m128i bias, exp, emm0, pi32_0x7f; +++ +++ one = _mm_set1_ps(1.0); +++ exp_hi = _mm_set1_ps(88.3762626647949); +++ exp_lo = _mm_set1_ps(-88.3762626647949); +++ ln2 = _mm_set1_ps(0.6931471805); +++ log2EF = _mm_set1_ps(1.44269504088896341); +++ half = _mm_set1_ps(0.5); +++ exp_C1 = _mm_set1_ps(0.693359375); +++ exp_C2 = _mm_set1_ps(-2.12194440e-4); +++ pi32_0x7f = _mm_set1_epi32(0x7f); +++ +++ exp_p0 = _mm_set1_ps(1.9875691500e-4); +++ exp_p1 = _mm_set1_ps(1.3981999507e-3); +++ exp_p2 = _mm_set1_ps(8.3334519073e-3); +++ exp_p3 = _mm_set1_ps(4.1665795894e-2); +++ exp_p4 = _mm_set1_ps(1.6666665459e-1); +++ exp_p5 = _mm_set1_ps(5.0000001201e-1); +++ +++ for (; number < quarterPoints; number++) { +++ // First compute the logarithm +++ aVal = _mm_loadu_ps(aPtr); +++ bias = _mm_set1_epi32(127); +++ leadingOne = _mm_set1_ps(1.0f); +++ exp = _mm_sub_epi32( +++ _mm_srli_epi32( +++ _mm_and_si128(_mm_castps_si128(aVal), _mm_set1_epi32(0x7f800000)), 23), +++ bias); +++ logarithm = _mm_cvtepi32_ps(exp); +++ +++ frac = _mm_or_ps(leadingOne, +++ _mm_and_ps(aVal, _mm_castsi128_ps(_mm_set1_epi32(0x7fffff)))); ++ ++ #if POW_POLY_DEGREE == 6 ++- mantissa = POLY5( frac, 3.1157899f, -3.3241990f, 2.5988452f, -1.2315303f, 3.1821337e-1f, -3.4436006e-2f); +++ mantissa = 
POLY5(frac, +++ 3.1157899f, +++ -3.3241990f, +++ 2.5988452f, +++ -1.2315303f, +++ 3.1821337e-1f, +++ -3.4436006e-2f); ++ #elif POW_POLY_DEGREE == 5 ++- mantissa = POLY4( frac, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f); +++ mantissa = POLY4(frac, +++ 2.8882704548164776201f, +++ -2.52074962577807006663f, +++ 1.48116647521213171641f, +++ -0.465725644288844778798f, +++ 0.0596515482674574969533f); ++ #elif POW_POLY_DEGREE == 4 ++- mantissa = POLY3( frac, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f); +++ mantissa = POLY3(frac, +++ 2.61761038894603480148f, +++ -1.75647175389045657003f, +++ 0.688243882994381274313f, +++ -0.107254423828329604454f); ++ #elif POW_POLY_DEGREE == 3 ++- mantissa = POLY2( frac, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f); +++ mantissa = POLY2(frac, +++ 2.28330284476918490682f, +++ -1.04913055217340124191f, +++ 0.204446009836232697516f); ++ #else ++ #error ++ #endif ++ ++- logarithm = _mm_add_ps(logarithm, _mm_mul_ps(mantissa, _mm_sub_ps(frac, leadingOne))); ++- logarithm = _mm_mul_ps(logarithm, ln2); +++ logarithm = +++ _mm_add_ps(logarithm, _mm_mul_ps(mantissa, _mm_sub_ps(frac, leadingOne))); +++ logarithm = _mm_mul_ps(logarithm, ln2); ++ ++ ++- // Now calculate b*lna ++- bVal = _mm_loadu_ps(bPtr); ++- bVal = _mm_mul_ps(bVal, logarithm); +++ // Now calculate b*lna +++ bVal = _mm_loadu_ps(bPtr); +++ bVal = _mm_mul_ps(bVal, logarithm); ++ ++- // Now compute exp(b*lna) ++- bVal = _mm_max_ps(_mm_min_ps(bVal, exp_hi), exp_lo); +++ // Now compute exp(b*lna) +++ bVal = _mm_max_ps(_mm_min_ps(bVal, exp_hi), exp_lo); ++ ++- fx = _mm_add_ps(_mm_mul_ps(bVal, log2EF), half); +++ fx = _mm_add_ps(_mm_mul_ps(bVal, log2EF), half); ++ ++- emm0 = _mm_cvttps_epi32(fx); ++- tmp = _mm_cvtepi32_ps(emm0); +++ emm0 = _mm_cvttps_epi32(fx); +++ tmp = _mm_cvtepi32_ps(emm0); ++ ++- mask = _mm_and_ps(_mm_cmpgt_ps(tmp, fx), one); ++- fx = _mm_sub_ps(tmp, mask); +++ mask = _mm_and_ps(_mm_cmpgt_ps(tmp, fx), one); +++ fx = _mm_sub_ps(tmp, mask); ++ ++- tmp = _mm_mul_ps(fx, exp_C1); ++- z = _mm_mul_ps(fx, exp_C2); ++- bVal = _mm_sub_ps(_mm_sub_ps(bVal, tmp), z); ++- z = _mm_mul_ps(bVal, bVal); +++ tmp = _mm_mul_ps(fx, exp_C1); +++ z = _mm_mul_ps(fx, exp_C2); +++ bVal = _mm_sub_ps(_mm_sub_ps(bVal, tmp), z); +++ z = _mm_mul_ps(bVal, bVal); ++ ++- y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(exp_p0, bVal), exp_p1), bVal); ++- y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p2), bVal), exp_p3); ++- y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(y, bVal), exp_p4), bVal); ++- y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p5), z), bVal); ++- y = _mm_add_ps(y, one); +++ y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(exp_p0, bVal), exp_p1), bVal); +++ y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p2), bVal), exp_p3); +++ y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(y, bVal), exp_p4), bVal); +++ y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p5), z), bVal); +++ y = _mm_add_ps(y, one); ++ ++- emm0 = _mm_slli_epi32(_mm_add_epi32(_mm_cvttps_epi32(fx), pi32_0x7f), 23); +++ emm0 = _mm_slli_epi32(_mm_add_epi32(_mm_cvttps_epi32(fx), pi32_0x7f), 23); ++ ++- pow2n = _mm_castsi128_ps(emm0); ++- cVal = _mm_mul_ps(y, pow2n); +++ pow2n = _mm_castsi128_ps(emm0); +++ cVal = _mm_mul_ps(y, pow2n); ++ ++- _mm_storeu_ps(cPtr, cVal); +++ _mm_storeu_ps(cPtr, cVal); ++ ++- aPtr += 4; ++- bPtr += 4; ++- cPtr += 4; ++- } +++ aPtr += 4; +++ bPtr += 4; +++ cPtr += 4; +++ } ++ ++- number = quarterPoints * 4; 
++- for(;number < num_points; number++){ ++- *cPtr++ = powf(*aPtr++, *bPtr++); ++- } +++ number = quarterPoints * 4; +++ for (; number < num_points; number++) { +++ *cPtr++ = powf(*aPtr++, *bPtr++); +++ } ++ } ++ ++ #endif /* LV_HAVE_SSE4_1 for unaligned */ ++@@ -577,100 +695,131 @@ volk_32f_x2_pow_32f_u_sse4_1(float* cVector, const float* bVector, ++ #include ++ ++ #define POLY0_AVX2_FMA(x, c0) _mm256_set1_ps(c0) ++-#define POLY1_AVX2_FMA(x, c0, c1) _mm256_fmadd_ps(POLY0_AVX2_FMA(x, c1), x, _mm256_set1_ps(c0)) ++-#define POLY2_AVX2_FMA(x, c0, c1, c2) _mm256_fmadd_ps(POLY1_AVX2_FMA(x, c1, c2), x, _mm256_set1_ps(c0)) ++-#define POLY3_AVX2_FMA(x, c0, c1, c2, c3) _mm256_fmadd_ps(POLY2_AVX2_FMA(x, c1, c2, c3), x, _mm256_set1_ps(c0)) ++-#define POLY4_AVX2_FMA(x, c0, c1, c2, c3, c4) _mm256_fmadd_ps(POLY3_AVX2_FMA(x, c1, c2, c3, c4), x, _mm256_set1_ps(c0)) ++-#define POLY5_AVX2_FMA(x, c0, c1, c2, c3, c4, c5) _mm256_fmadd_ps(POLY4_AVX2_FMA(x, c1, c2, c3, c4, c5), x, _mm256_set1_ps(c0)) ++- ++-static inline void ++-volk_32f_x2_pow_32f_u_avx2_fma(float* cVector, const float* bVector, ++- const float* aVector, unsigned int num_points) +++#define POLY1_AVX2_FMA(x, c0, c1) \ +++ _mm256_fmadd_ps(POLY0_AVX2_FMA(x, c1), x, _mm256_set1_ps(c0)) +++#define POLY2_AVX2_FMA(x, c0, c1, c2) \ +++ _mm256_fmadd_ps(POLY1_AVX2_FMA(x, c1, c2), x, _mm256_set1_ps(c0)) +++#define POLY3_AVX2_FMA(x, c0, c1, c2, c3) \ +++ _mm256_fmadd_ps(POLY2_AVX2_FMA(x, c1, c2, c3), x, _mm256_set1_ps(c0)) +++#define POLY4_AVX2_FMA(x, c0, c1, c2, c3, c4) \ +++ _mm256_fmadd_ps(POLY3_AVX2_FMA(x, c1, c2, c3, c4), x, _mm256_set1_ps(c0)) +++#define POLY5_AVX2_FMA(x, c0, c1, c2, c3, c4, c5) \ +++ _mm256_fmadd_ps(POLY4_AVX2_FMA(x, c1, c2, c3, c4, c5), x, _mm256_set1_ps(c0)) +++ +++static inline void volk_32f_x2_pow_32f_u_avx2_fma(float* cVector, +++ const float* bVector, +++ const float* aVector, +++ unsigned int num_points) ++ { ++- float* cPtr = cVector; ++- const float* bPtr = bVector; ++- const float* aPtr = aVector; ++- ++- unsigned int number = 0; ++- const unsigned int eighthPoints = num_points / 8; ++- ++- __m256 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne; ++- __m256 tmp, fx, mask, pow2n, z, y; ++- __m256 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2; ++- __m256 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5; ++- __m256i bias, exp, emm0, pi32_0x7f; ++- ++- one = _mm256_set1_ps(1.0); ++- exp_hi = _mm256_set1_ps(88.3762626647949); ++- exp_lo = _mm256_set1_ps(-88.3762626647949); ++- ln2 = _mm256_set1_ps(0.6931471805); ++- log2EF = _mm256_set1_ps(1.44269504088896341); ++- half = _mm256_set1_ps(0.5); ++- exp_C1 = _mm256_set1_ps(0.693359375); ++- exp_C2 = _mm256_set1_ps(-2.12194440e-4); ++- pi32_0x7f = _mm256_set1_epi32(0x7f); ++- ++- exp_p0 = _mm256_set1_ps(1.9875691500e-4); ++- exp_p1 = _mm256_set1_ps(1.3981999507e-3); ++- exp_p2 = _mm256_set1_ps(8.3334519073e-3); ++- exp_p3 = _mm256_set1_ps(4.1665795894e-2); ++- exp_p4 = _mm256_set1_ps(1.6666665459e-1); ++- exp_p5 = _mm256_set1_ps(5.0000001201e-1); ++- ++- for(;number < eighthPoints; number++){ ++- // First compute the logarithm ++- aVal = _mm256_loadu_ps(aPtr); ++- bias = _mm256_set1_epi32(127); ++- leadingOne = _mm256_set1_ps(1.0f); ++- exp = _mm256_sub_epi32(_mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal), _mm256_set1_epi32(0x7f800000)), 23), bias); ++- logarithm = _mm256_cvtepi32_ps(exp); ++- ++- frac = _mm256_or_ps(leadingOne, _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff)))); +++ float* cPtr = cVector; +++ const float* bPtr = 
bVector; +++ const float* aPtr = aVector; +++ +++ unsigned int number = 0; +++ const unsigned int eighthPoints = num_points / 8; +++ +++ __m256 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne; +++ __m256 tmp, fx, mask, pow2n, z, y; +++ __m256 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2; +++ __m256 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5; +++ __m256i bias, exp, emm0, pi32_0x7f; +++ +++ one = _mm256_set1_ps(1.0); +++ exp_hi = _mm256_set1_ps(88.3762626647949); +++ exp_lo = _mm256_set1_ps(-88.3762626647949); +++ ln2 = _mm256_set1_ps(0.6931471805); +++ log2EF = _mm256_set1_ps(1.44269504088896341); +++ half = _mm256_set1_ps(0.5); +++ exp_C1 = _mm256_set1_ps(0.693359375); +++ exp_C2 = _mm256_set1_ps(-2.12194440e-4); +++ pi32_0x7f = _mm256_set1_epi32(0x7f); +++ +++ exp_p0 = _mm256_set1_ps(1.9875691500e-4); +++ exp_p1 = _mm256_set1_ps(1.3981999507e-3); +++ exp_p2 = _mm256_set1_ps(8.3334519073e-3); +++ exp_p3 = _mm256_set1_ps(4.1665795894e-2); +++ exp_p4 = _mm256_set1_ps(1.6666665459e-1); +++ exp_p5 = _mm256_set1_ps(5.0000001201e-1); +++ +++ for (; number < eighthPoints; number++) { +++ // First compute the logarithm +++ aVal = _mm256_loadu_ps(aPtr); +++ bias = _mm256_set1_epi32(127); +++ leadingOne = _mm256_set1_ps(1.0f); +++ exp = _mm256_sub_epi32( +++ _mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal), +++ _mm256_set1_epi32(0x7f800000)), +++ 23), +++ bias); +++ logarithm = _mm256_cvtepi32_ps(exp); +++ +++ frac = _mm256_or_ps( +++ leadingOne, +++ _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff)))); ++ ++ #if POW_POLY_DEGREE == 6 ++- mantissa = POLY5_AVX2_FMA( frac, 3.1157899f, -3.3241990f, 2.5988452f, -1.2315303f, 3.1821337e-1f, -3.4436006e-2f); +++ mantissa = POLY5_AVX2_FMA(frac, +++ 3.1157899f, +++ -3.3241990f, +++ 2.5988452f, +++ -1.2315303f, +++ 3.1821337e-1f, +++ -3.4436006e-2f); ++ #elif POW_POLY_DEGREE == 5 ++- mantissa = POLY4_AVX2_FMA( frac, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f); +++ mantissa = POLY4_AVX2_FMA(frac, +++ 2.8882704548164776201f, +++ -2.52074962577807006663f, +++ 1.48116647521213171641f, +++ -0.465725644288844778798f, +++ 0.0596515482674574969533f); ++ #elif POW_POLY_DEGREE == 4 ++- mantissa = POLY3_AVX2_FMA( frac, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f); +++ mantissa = POLY3_AVX2_FMA(frac, +++ 2.61761038894603480148f, +++ -1.75647175389045657003f, +++ 0.688243882994381274313f, +++ -0.107254423828329604454f); ++ #elif POW_POLY_DEGREE == 3 ++- mantissa = POLY2_AVX2_FMA( frac, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f); +++ mantissa = POLY2_AVX2_FMA(frac, +++ 2.28330284476918490682f, +++ -1.04913055217340124191f, +++ 0.204446009836232697516f); ++ #else ++ #error ++ #endif ++ ++- logarithm = _mm256_fmadd_ps(mantissa, _mm256_sub_ps(frac, leadingOne), logarithm); ++- logarithm = _mm256_mul_ps(logarithm, ln2); +++ logarithm = _mm256_fmadd_ps(mantissa, _mm256_sub_ps(frac, leadingOne), logarithm); +++ logarithm = _mm256_mul_ps(logarithm, ln2); ++ ++ ++- // Now calculate b*lna ++- bVal = _mm256_loadu_ps(bPtr); ++- bVal = _mm256_mul_ps(bVal, logarithm); +++ // Now calculate b*lna +++ bVal = _mm256_loadu_ps(bPtr); +++ bVal = _mm256_mul_ps(bVal, logarithm); ++ ++- // Now compute exp(b*lna) ++- bVal = _mm256_max_ps(_mm256_min_ps(bVal, exp_hi), exp_lo); +++ // Now compute exp(b*lna) +++ bVal = _mm256_max_ps(_mm256_min_ps(bVal, exp_hi), exp_lo); ++ 
++- fx = _mm256_fmadd_ps(bVal, log2EF, half); +++ fx = _mm256_fmadd_ps(bVal, log2EF, half); ++ ++- emm0 = _mm256_cvttps_epi32(fx); ++- tmp = _mm256_cvtepi32_ps(emm0); +++ emm0 = _mm256_cvttps_epi32(fx); +++ tmp = _mm256_cvtepi32_ps(emm0); ++ ++- mask = _mm256_and_ps(_mm256_cmp_ps(tmp, fx, _CMP_GT_OS), one); ++- fx = _mm256_sub_ps(tmp, mask); +++ mask = _mm256_and_ps(_mm256_cmp_ps(tmp, fx, _CMP_GT_OS), one); +++ fx = _mm256_sub_ps(tmp, mask); ++ ++- tmp = _mm256_fnmadd_ps(fx, exp_C1, bVal); ++- bVal = _mm256_fnmadd_ps(fx, exp_C2, tmp); ++- z = _mm256_mul_ps(bVal, bVal); +++ tmp = _mm256_fnmadd_ps(fx, exp_C1, bVal); +++ bVal = _mm256_fnmadd_ps(fx, exp_C2, tmp); +++ z = _mm256_mul_ps(bVal, bVal); ++ ++- y = _mm256_fmadd_ps(exp_p0, bVal, exp_p1); ++- y = _mm256_fmadd_ps(y, bVal, exp_p2); ++- y = _mm256_fmadd_ps(y, bVal, exp_p3); ++- y = _mm256_fmadd_ps(y, bVal, exp_p4); ++- y = _mm256_fmadd_ps(y, bVal, exp_p5); ++- y = _mm256_fmadd_ps(y, z, bVal); ++- y = _mm256_add_ps(y, one); +++ y = _mm256_fmadd_ps(exp_p0, bVal, exp_p1); +++ y = _mm256_fmadd_ps(y, bVal, exp_p2); +++ y = _mm256_fmadd_ps(y, bVal, exp_p3); +++ y = _mm256_fmadd_ps(y, bVal, exp_p4); +++ y = _mm256_fmadd_ps(y, bVal, exp_p5); +++ y = _mm256_fmadd_ps(y, z, bVal); +++ y = _mm256_add_ps(y, one); ++ ++- emm0 = _mm256_slli_epi32(_mm256_add_epi32(_mm256_cvttps_epi32(fx), pi32_0x7f), 23); +++ emm0 = +++ _mm256_slli_epi32(_mm256_add_epi32(_mm256_cvttps_epi32(fx), pi32_0x7f), 23); ++ ++ pow2n = _mm256_castsi256_ps(emm0); ++ cVal = _mm256_mul_ps(y, pow2n); ++@@ -680,12 +829,12 @@ volk_32f_x2_pow_32f_u_avx2_fma(float* cVector, const float* bVector, ++ aPtr += 8; ++ bPtr += 8; ++ cPtr += 8; ++- } +++ } ++ ++- number = eighthPoints * 8; ++- for(;number < num_points; number++){ ++- *cPtr++ = pow(*aPtr++, *bPtr++); ++- } +++ number = eighthPoints * 8; +++ for (; number < num_points; number++) { +++ *cPtr++ = pow(*aPtr++, *bPtr++); +++ } ++ } ++ ++ #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for unaligned */ ++@@ -694,99 +843,131 @@ volk_32f_x2_pow_32f_u_avx2_fma(float* cVector, const float* bVector, ++ #include ++ ++ #define POLY0_AVX2(x, c0) _mm256_set1_ps(c0) ++-#define POLY1_AVX2(x, c0, c1) _mm256_add_ps(_mm256_mul_ps(POLY0_AVX2(x, c1), x), _mm256_set1_ps(c0)) ++-#define POLY2_AVX2(x, c0, c1, c2) _mm256_add_ps(_mm256_mul_ps(POLY1_AVX2(x, c1, c2), x), _mm256_set1_ps(c0)) ++-#define POLY3_AVX2(x, c0, c1, c2, c3) _mm256_add_ps(_mm256_mul_ps(POLY2_AVX2(x, c1, c2, c3), x), _mm256_set1_ps(c0)) ++-#define POLY4_AVX2(x, c0, c1, c2, c3, c4) _mm256_add_ps(_mm256_mul_ps(POLY3_AVX2(x, c1, c2, c3, c4), x), _mm256_set1_ps(c0)) ++-#define POLY5_AVX2(x, c0, c1, c2, c3, c4, c5) _mm256_add_ps(_mm256_mul_ps(POLY4_AVX2(x, c1, c2, c3, c4, c5), x), _mm256_set1_ps(c0)) ++- ++-static inline void ++-volk_32f_x2_pow_32f_u_avx2(float* cVector, const float* bVector, ++- const float* aVector, unsigned int num_points) +++#define POLY1_AVX2(x, c0, c1) \ +++ _mm256_add_ps(_mm256_mul_ps(POLY0_AVX2(x, c1), x), _mm256_set1_ps(c0)) +++#define POLY2_AVX2(x, c0, c1, c2) \ +++ _mm256_add_ps(_mm256_mul_ps(POLY1_AVX2(x, c1, c2), x), _mm256_set1_ps(c0)) +++#define POLY3_AVX2(x, c0, c1, c2, c3) \ +++ _mm256_add_ps(_mm256_mul_ps(POLY2_AVX2(x, c1, c2, c3), x), _mm256_set1_ps(c0)) +++#define POLY4_AVX2(x, c0, c1, c2, c3, c4) \ +++ _mm256_add_ps(_mm256_mul_ps(POLY3_AVX2(x, c1, c2, c3, c4), x), _mm256_set1_ps(c0)) +++#define POLY5_AVX2(x, c0, c1, c2, c3, c4, c5) \ +++ _mm256_add_ps(_mm256_mul_ps(POLY4_AVX2(x, c1, c2, c3, c4, c5), x), _mm256_set1_ps(c0)) +++ +++static inline void 
volk_32f_x2_pow_32f_u_avx2(float* cVector, +++ const float* bVector, +++ const float* aVector, +++ unsigned int num_points) ++ { ++- float* cPtr = cVector; ++- const float* bPtr = bVector; ++- const float* aPtr = aVector; ++- ++- unsigned int number = 0; ++- const unsigned int eighthPoints = num_points / 8; ++- ++- __m256 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne; ++- __m256 tmp, fx, mask, pow2n, z, y; ++- __m256 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2; ++- __m256 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5; ++- __m256i bias, exp, emm0, pi32_0x7f; ++- ++- one = _mm256_set1_ps(1.0); ++- exp_hi = _mm256_set1_ps(88.3762626647949); ++- exp_lo = _mm256_set1_ps(-88.3762626647949); ++- ln2 = _mm256_set1_ps(0.6931471805); ++- log2EF = _mm256_set1_ps(1.44269504088896341); ++- half = _mm256_set1_ps(0.5); ++- exp_C1 = _mm256_set1_ps(0.693359375); ++- exp_C2 = _mm256_set1_ps(-2.12194440e-4); ++- pi32_0x7f = _mm256_set1_epi32(0x7f); ++- ++- exp_p0 = _mm256_set1_ps(1.9875691500e-4); ++- exp_p1 = _mm256_set1_ps(1.3981999507e-3); ++- exp_p2 = _mm256_set1_ps(8.3334519073e-3); ++- exp_p3 = _mm256_set1_ps(4.1665795894e-2); ++- exp_p4 = _mm256_set1_ps(1.6666665459e-1); ++- exp_p5 = _mm256_set1_ps(5.0000001201e-1); ++- ++- for(;number < eighthPoints; number++){ ++- // First compute the logarithm ++- aVal = _mm256_loadu_ps(aPtr); ++- bias = _mm256_set1_epi32(127); ++- leadingOne = _mm256_set1_ps(1.0f); ++- exp = _mm256_sub_epi32(_mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal), _mm256_set1_epi32(0x7f800000)), 23), bias); ++- logarithm = _mm256_cvtepi32_ps(exp); ++- ++- frac = _mm256_or_ps(leadingOne, _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff)))); +++ float* cPtr = cVector; +++ const float* bPtr = bVector; +++ const float* aPtr = aVector; +++ +++ unsigned int number = 0; +++ const unsigned int eighthPoints = num_points / 8; +++ +++ __m256 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne; +++ __m256 tmp, fx, mask, pow2n, z, y; +++ __m256 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2; +++ __m256 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5; +++ __m256i bias, exp, emm0, pi32_0x7f; +++ +++ one = _mm256_set1_ps(1.0); +++ exp_hi = _mm256_set1_ps(88.3762626647949); +++ exp_lo = _mm256_set1_ps(-88.3762626647949); +++ ln2 = _mm256_set1_ps(0.6931471805); +++ log2EF = _mm256_set1_ps(1.44269504088896341); +++ half = _mm256_set1_ps(0.5); +++ exp_C1 = _mm256_set1_ps(0.693359375); +++ exp_C2 = _mm256_set1_ps(-2.12194440e-4); +++ pi32_0x7f = _mm256_set1_epi32(0x7f); +++ +++ exp_p0 = _mm256_set1_ps(1.9875691500e-4); +++ exp_p1 = _mm256_set1_ps(1.3981999507e-3); +++ exp_p2 = _mm256_set1_ps(8.3334519073e-3); +++ exp_p3 = _mm256_set1_ps(4.1665795894e-2); +++ exp_p4 = _mm256_set1_ps(1.6666665459e-1); +++ exp_p5 = _mm256_set1_ps(5.0000001201e-1); +++ +++ for (; number < eighthPoints; number++) { +++ // First compute the logarithm +++ aVal = _mm256_loadu_ps(aPtr); +++ bias = _mm256_set1_epi32(127); +++ leadingOne = _mm256_set1_ps(1.0f); +++ exp = _mm256_sub_epi32( +++ _mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal), +++ _mm256_set1_epi32(0x7f800000)), +++ 23), +++ bias); +++ logarithm = _mm256_cvtepi32_ps(exp); +++ +++ frac = _mm256_or_ps( +++ leadingOne, +++ _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff)))); ++ ++ #if POW_POLY_DEGREE == 6 ++- mantissa = POLY5_AVX2( frac, 3.1157899f, -3.3241990f, 2.5988452f, -1.2315303f, 3.1821337e-1f, -3.4436006e-2f); +++ mantissa = POLY5_AVX2(frac, +++ 3.1157899f, +++ -3.3241990f, 
+++ 2.5988452f, +++ -1.2315303f, +++ 3.1821337e-1f, +++ -3.4436006e-2f); ++ #elif POW_POLY_DEGREE == 5 ++- mantissa = POLY4_AVX2( frac, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f); +++ mantissa = POLY4_AVX2(frac, +++ 2.8882704548164776201f, +++ -2.52074962577807006663f, +++ 1.48116647521213171641f, +++ -0.465725644288844778798f, +++ 0.0596515482674574969533f); ++ #elif POW_POLY_DEGREE == 4 ++- mantissa = POLY3_AVX2( frac, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f); +++ mantissa = POLY3_AVX2(frac, +++ 2.61761038894603480148f, +++ -1.75647175389045657003f, +++ 0.688243882994381274313f, +++ -0.107254423828329604454f); ++ #elif POW_POLY_DEGREE == 3 ++- mantissa = POLY2_AVX2( frac, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f); +++ mantissa = POLY2_AVX2(frac, +++ 2.28330284476918490682f, +++ -1.04913055217340124191f, +++ 0.204446009836232697516f); ++ #else ++ #error ++ #endif ++ ++- logarithm = _mm256_add_ps(_mm256_mul_ps(mantissa, _mm256_sub_ps(frac, leadingOne)), logarithm); ++- logarithm = _mm256_mul_ps(logarithm, ln2); +++ logarithm = _mm256_add_ps( +++ _mm256_mul_ps(mantissa, _mm256_sub_ps(frac, leadingOne)), logarithm); +++ logarithm = _mm256_mul_ps(logarithm, ln2); ++ ++- // Now calculate b*lna ++- bVal = _mm256_loadu_ps(bPtr); ++- bVal = _mm256_mul_ps(bVal, logarithm); +++ // Now calculate b*lna +++ bVal = _mm256_loadu_ps(bPtr); +++ bVal = _mm256_mul_ps(bVal, logarithm); ++ ++- // Now compute exp(b*lna) ++- bVal = _mm256_max_ps(_mm256_min_ps(bVal, exp_hi), exp_lo); +++ // Now compute exp(b*lna) +++ bVal = _mm256_max_ps(_mm256_min_ps(bVal, exp_hi), exp_lo); ++ ++- fx = _mm256_add_ps(_mm256_mul_ps(bVal, log2EF), half); +++ fx = _mm256_add_ps(_mm256_mul_ps(bVal, log2EF), half); ++ ++- emm0 = _mm256_cvttps_epi32(fx); ++- tmp = _mm256_cvtepi32_ps(emm0); +++ emm0 = _mm256_cvttps_epi32(fx); +++ tmp = _mm256_cvtepi32_ps(emm0); ++ ++- mask = _mm256_and_ps(_mm256_cmp_ps(tmp, fx, _CMP_GT_OS), one); ++- fx = _mm256_sub_ps(tmp, mask); +++ mask = _mm256_and_ps(_mm256_cmp_ps(tmp, fx, _CMP_GT_OS), one); +++ fx = _mm256_sub_ps(tmp, mask); ++ ++- tmp = _mm256_sub_ps(bVal, _mm256_mul_ps(fx, exp_C1)); ++- bVal = _mm256_sub_ps(tmp, _mm256_mul_ps(fx, exp_C2)); ++- z = _mm256_mul_ps(bVal, bVal); +++ tmp = _mm256_sub_ps(bVal, _mm256_mul_ps(fx, exp_C1)); +++ bVal = _mm256_sub_ps(tmp, _mm256_mul_ps(fx, exp_C2)); +++ z = _mm256_mul_ps(bVal, bVal); ++ ++- y = _mm256_add_ps(_mm256_mul_ps(exp_p0, bVal), exp_p1); ++- y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p2); ++- y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p3); ++- y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p4); ++- y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p5); ++- y = _mm256_add_ps(_mm256_mul_ps(y, z), bVal); ++- y = _mm256_add_ps(y, one); +++ y = _mm256_add_ps(_mm256_mul_ps(exp_p0, bVal), exp_p1); +++ y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p2); +++ y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p3); +++ y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p4); +++ y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p5); +++ y = _mm256_add_ps(_mm256_mul_ps(y, z), bVal); +++ y = _mm256_add_ps(y, one); ++ ++- emm0 = _mm256_slli_epi32(_mm256_add_epi32(_mm256_cvttps_epi32(fx), pi32_0x7f), 23); +++ emm0 = +++ _mm256_slli_epi32(_mm256_add_epi32(_mm256_cvttps_epi32(fx), pi32_0x7f), 23); ++ ++ pow2n = _mm256_castsi256_ps(emm0); ++ cVal = _mm256_mul_ps(y, pow2n); ++@@ -796,12 +977,12 @@ 
volk_32f_x2_pow_32f_u_avx2(float* cVector, const float* bVector, ++ aPtr += 8; ++ bPtr += 8; ++ cPtr += 8; ++- } +++ } ++ ++- number = eighthPoints * 8; ++- for(;number < num_points; number++){ ++- *cPtr++ = pow(*aPtr++, *bPtr++); ++- } +++ number = eighthPoints * 8; +++ for (; number < num_points; number++) { +++ *cPtr++ = pow(*aPtr++, *bPtr++); +++ } ++ } ++ ++ #endif /* LV_HAVE_AVX2 for unaligned */ ++diff --git a/kernels/volk/volk_32f_x2_s32f_interleave_16ic.h b/kernels/volk/volk_32f_x2_s32f_interleave_16ic.h ++index 8021faf..04e5892 100644 ++--- a/kernels/volk/volk_32f_x2_s32f_interleave_16ic.h +++++ b/kernels/volk/volk_32f_x2_s32f_interleave_16ic.h ++@@ -32,8 +32,8 @@ ++ * ++ * Dispatcher Prototype ++ * \code ++- * void volk_32f_x2_s32f_interleave_16ic(lv_16sc_t* complexVector, const float* iBuffer, const float* qBuffer, const float scalar, unsigned int num_points) ++- * \endcode +++ * void volk_32f_x2_s32f_interleave_16ic(lv_16sc_t* complexVector, const float* iBuffer, +++ * const float* qBuffer, const float scalar, unsigned int num_points) \endcode ++ * ++ * \b Inputs ++ * \li iBuffer: Input vector of samples for the real part. ++@@ -75,60 +75,62 @@ ++ #ifndef INCLUDED_volk_32f_x2_s32f_interleave_16ic_a_H ++ #define INCLUDED_volk_32f_x2_s32f_interleave_16ic_a_H ++ ++-#include ++ #include ++ #include +++#include ++ ++ #ifdef LV_HAVE_AVX2 ++ #include ++ ++-static inline void ++-volk_32f_x2_s32f_interleave_16ic_a_avx2(lv_16sc_t* complexVector, const float* iBuffer, ++- const float* qBuffer, const float scalar, unsigned int num_points) +++static inline void volk_32f_x2_s32f_interleave_16ic_a_avx2(lv_16sc_t* complexVector, +++ const float* iBuffer, +++ const float* qBuffer, +++ const float scalar, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const float* iBufferPtr = iBuffer; ++- const float* qBufferPtr = qBuffer; +++ unsigned int number = 0; +++ const float* iBufferPtr = iBuffer; +++ const float* qBufferPtr = qBuffer; ++ ++- __m256 vScalar = _mm256_set1_ps(scalar); +++ __m256 vScalar = _mm256_set1_ps(scalar); ++ ++- const unsigned int eighthPoints = num_points / 8; +++ const unsigned int eighthPoints = num_points / 8; ++ ++- __m256 iValue, qValue, cplxValue1, cplxValue2; ++- __m256i intValue1, intValue2; +++ __m256 iValue, qValue, cplxValue1, cplxValue2; +++ __m256i intValue1, intValue2; ++ ++- int16_t* complexVectorPtr = (int16_t*)complexVector; +++ int16_t* complexVectorPtr = (int16_t*)complexVector; ++ ++- for(;number < eighthPoints; number++){ ++- iValue = _mm256_load_ps(iBufferPtr); ++- qValue = _mm256_load_ps(qBufferPtr); +++ for (; number < eighthPoints; number++) { +++ iValue = _mm256_load_ps(iBufferPtr); +++ qValue = _mm256_load_ps(qBufferPtr); ++ ++- // Interleaves the lower two values in the i and q variables into one buffer ++- cplxValue1 = _mm256_unpacklo_ps(iValue, qValue); ++- cplxValue1 = _mm256_mul_ps(cplxValue1, vScalar); +++ // Interleaves the lower two values in the i and q variables into one buffer +++ cplxValue1 = _mm256_unpacklo_ps(iValue, qValue); +++ cplxValue1 = _mm256_mul_ps(cplxValue1, vScalar); ++ ++- // Interleaves the upper two values in the i and q variables into one buffer ++- cplxValue2 = _mm256_unpackhi_ps(iValue, qValue); ++- cplxValue2 = _mm256_mul_ps(cplxValue2, vScalar); +++ // Interleaves the upper two values in the i and q variables into one buffer +++ cplxValue2 = _mm256_unpackhi_ps(iValue, qValue); +++ cplxValue2 = _mm256_mul_ps(cplxValue2, vScalar); ++ ++- intValue1 = _mm256_cvtps_epi32(cplxValue1); ++- intValue2 = 
_mm256_cvtps_epi32(cplxValue2); +++ intValue1 = _mm256_cvtps_epi32(cplxValue1); +++ intValue2 = _mm256_cvtps_epi32(cplxValue2); ++ ++- intValue1 = _mm256_packs_epi32(intValue1, intValue2); +++ intValue1 = _mm256_packs_epi32(intValue1, intValue2); ++ ++- _mm256_store_si256((__m256i*)complexVectorPtr, intValue1); ++- complexVectorPtr += 16; +++ _mm256_store_si256((__m256i*)complexVectorPtr, intValue1); +++ complexVectorPtr += 16; ++ ++- iBufferPtr += 8; ++- qBufferPtr += 8; ++- } +++ iBufferPtr += 8; +++ qBufferPtr += 8; +++ } ++ ++- number = eighthPoints * 8; ++- complexVectorPtr = (int16_t*)(&complexVector[number]); ++- for(; number < num_points; number++){ ++- *complexVectorPtr++ = (int16_t)rintf(*iBufferPtr++ * scalar); ++- *complexVectorPtr++ = (int16_t)rintf(*qBufferPtr++ * scalar); ++- } +++ number = eighthPoints * 8; +++ complexVectorPtr = (int16_t*)(&complexVector[number]); +++ for (; number < num_points; number++) { +++ *complexVectorPtr++ = (int16_t)rintf(*iBufferPtr++ * scalar); +++ *complexVectorPtr++ = (int16_t)rintf(*qBufferPtr++ * scalar); +++ } ++ } ++ #endif /* LV_HAVE_AVX2 */ ++ ++@@ -136,53 +138,55 @@ volk_32f_x2_s32f_interleave_16ic_a_avx2(lv_16sc_t* complexVector, const float* i ++ #ifdef LV_HAVE_SSE2 ++ #include ++ ++-static inline void ++-volk_32f_x2_s32f_interleave_16ic_a_sse2(lv_16sc_t* complexVector, const float* iBuffer, ++- const float* qBuffer, const float scalar, unsigned int num_points) +++static inline void volk_32f_x2_s32f_interleave_16ic_a_sse2(lv_16sc_t* complexVector, +++ const float* iBuffer, +++ const float* qBuffer, +++ const float scalar, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const float* iBufferPtr = iBuffer; ++- const float* qBufferPtr = qBuffer; +++ unsigned int number = 0; +++ const float* iBufferPtr = iBuffer; +++ const float* qBufferPtr = qBuffer; ++ ++- __m128 vScalar = _mm_set_ps1(scalar); +++ __m128 vScalar = _mm_set_ps1(scalar); ++ ++- const unsigned int quarterPoints = num_points / 4; +++ const unsigned int quarterPoints = num_points / 4; ++ ++- __m128 iValue, qValue, cplxValue1, cplxValue2; ++- __m128i intValue1, intValue2; +++ __m128 iValue, qValue, cplxValue1, cplxValue2; +++ __m128i intValue1, intValue2; ++ ++- int16_t* complexVectorPtr = (int16_t*)complexVector; +++ int16_t* complexVectorPtr = (int16_t*)complexVector; ++ ++- for(;number < quarterPoints; number++){ ++- iValue = _mm_load_ps(iBufferPtr); ++- qValue = _mm_load_ps(qBufferPtr); +++ for (; number < quarterPoints; number++) { +++ iValue = _mm_load_ps(iBufferPtr); +++ qValue = _mm_load_ps(qBufferPtr); ++ ++- // Interleaves the lower two values in the i and q variables into one buffer ++- cplxValue1 = _mm_unpacklo_ps(iValue, qValue); ++- cplxValue1 = _mm_mul_ps(cplxValue1, vScalar); +++ // Interleaves the lower two values in the i and q variables into one buffer +++ cplxValue1 = _mm_unpacklo_ps(iValue, qValue); +++ cplxValue1 = _mm_mul_ps(cplxValue1, vScalar); ++ ++- // Interleaves the upper two values in the i and q variables into one buffer ++- cplxValue2 = _mm_unpackhi_ps(iValue, qValue); ++- cplxValue2 = _mm_mul_ps(cplxValue2, vScalar); +++ // Interleaves the upper two values in the i and q variables into one buffer +++ cplxValue2 = _mm_unpackhi_ps(iValue, qValue); +++ cplxValue2 = _mm_mul_ps(cplxValue2, vScalar); ++ ++- intValue1 = _mm_cvtps_epi32(cplxValue1); ++- intValue2 = _mm_cvtps_epi32(cplxValue2); +++ intValue1 = _mm_cvtps_epi32(cplxValue1); +++ intValue2 = _mm_cvtps_epi32(cplxValue2); ++ ++- intValue1 = _mm_packs_epi32(intValue1, 
intValue2); +++ intValue1 = _mm_packs_epi32(intValue1, intValue2); ++ ++- _mm_store_si128((__m128i*)complexVectorPtr, intValue1); ++- complexVectorPtr += 8; +++ _mm_store_si128((__m128i*)complexVectorPtr, intValue1); +++ complexVectorPtr += 8; ++ ++- iBufferPtr += 4; ++- qBufferPtr += 4; ++- } +++ iBufferPtr += 4; +++ qBufferPtr += 4; +++ } ++ ++- number = quarterPoints * 4; ++- complexVectorPtr = (int16_t*)(&complexVector[number]); ++- for(; number < num_points; number++){ ++- *complexVectorPtr++ = (int16_t)rintf(*iBufferPtr++ * scalar); ++- *complexVectorPtr++ = (int16_t)rintf(*qBufferPtr++ * scalar); ++- } +++ number = quarterPoints * 4; +++ complexVectorPtr = (int16_t*)(&complexVector[number]); +++ for (; number < num_points; number++) { +++ *complexVectorPtr++ = (int16_t)rintf(*iBufferPtr++ * scalar); +++ *complexVectorPtr++ = (int16_t)rintf(*qBufferPtr++ * scalar); +++ } ++ } ++ #endif /* LV_HAVE_SSE2 */ ++ ++@@ -190,79 +194,83 @@ volk_32f_x2_s32f_interleave_16ic_a_sse2(lv_16sc_t* complexVector, const float* i ++ #ifdef LV_HAVE_SSE ++ #include ++ ++-static inline void ++-volk_32f_x2_s32f_interleave_16ic_a_sse(lv_16sc_t* complexVector, const float* iBuffer, ++- const float* qBuffer, const float scalar, unsigned int num_points) +++static inline void volk_32f_x2_s32f_interleave_16ic_a_sse(lv_16sc_t* complexVector, +++ const float* iBuffer, +++ const float* qBuffer, +++ const float scalar, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const float* iBufferPtr = iBuffer; ++- const float* qBufferPtr = qBuffer; +++ unsigned int number = 0; +++ const float* iBufferPtr = iBuffer; +++ const float* qBufferPtr = qBuffer; ++ ++- __m128 vScalar = _mm_set_ps1(scalar); +++ __m128 vScalar = _mm_set_ps1(scalar); ++ ++- const unsigned int quarterPoints = num_points / 4; +++ const unsigned int quarterPoints = num_points / 4; ++ ++- __m128 iValue, qValue, cplxValue; +++ __m128 iValue, qValue, cplxValue; ++ ++- int16_t* complexVectorPtr = (int16_t*)complexVector; +++ int16_t* complexVectorPtr = (int16_t*)complexVector; ++ ++- __VOLK_ATTR_ALIGNED(16) float floatBuffer[4]; +++ __VOLK_ATTR_ALIGNED(16) float floatBuffer[4]; ++ ++- for(;number < quarterPoints; number++){ ++- iValue = _mm_load_ps(iBufferPtr); ++- qValue = _mm_load_ps(qBufferPtr); +++ for (; number < quarterPoints; number++) { +++ iValue = _mm_load_ps(iBufferPtr); +++ qValue = _mm_load_ps(qBufferPtr); ++ ++- // Interleaves the lower two values in the i and q variables into one buffer ++- cplxValue = _mm_unpacklo_ps(iValue, qValue); ++- cplxValue = _mm_mul_ps(cplxValue, vScalar); +++ // Interleaves the lower two values in the i and q variables into one buffer +++ cplxValue = _mm_unpacklo_ps(iValue, qValue); +++ cplxValue = _mm_mul_ps(cplxValue, vScalar); ++ ++- _mm_store_ps(floatBuffer, cplxValue); +++ _mm_store_ps(floatBuffer, cplxValue); ++ ++- *complexVectorPtr++ = (int16_t)rintf(floatBuffer[0]); ++- *complexVectorPtr++ = (int16_t)rintf(floatBuffer[1]); ++- *complexVectorPtr++ = (int16_t)rintf(floatBuffer[2]); ++- *complexVectorPtr++ = (int16_t)rintf(floatBuffer[3]); +++ *complexVectorPtr++ = (int16_t)rintf(floatBuffer[0]); +++ *complexVectorPtr++ = (int16_t)rintf(floatBuffer[1]); +++ *complexVectorPtr++ = (int16_t)rintf(floatBuffer[2]); +++ *complexVectorPtr++ = (int16_t)rintf(floatBuffer[3]); ++ ++- // Interleaves the upper two values in the i and q variables into one buffer ++- cplxValue = _mm_unpackhi_ps(iValue, qValue); ++- cplxValue = _mm_mul_ps(cplxValue, vScalar); +++ // Interleaves the upper two values in the i and 
q variables into one buffer +++ cplxValue = _mm_unpackhi_ps(iValue, qValue); +++ cplxValue = _mm_mul_ps(cplxValue, vScalar); ++ ++- _mm_store_ps(floatBuffer, cplxValue); +++ _mm_store_ps(floatBuffer, cplxValue); ++ ++- *complexVectorPtr++ = (int16_t)rintf(floatBuffer[0]); ++- *complexVectorPtr++ = (int16_t)rintf(floatBuffer[1]); ++- *complexVectorPtr++ = (int16_t)rintf(floatBuffer[2]); ++- *complexVectorPtr++ = (int16_t)rintf(floatBuffer[3]); +++ *complexVectorPtr++ = (int16_t)rintf(floatBuffer[0]); +++ *complexVectorPtr++ = (int16_t)rintf(floatBuffer[1]); +++ *complexVectorPtr++ = (int16_t)rintf(floatBuffer[2]); +++ *complexVectorPtr++ = (int16_t)rintf(floatBuffer[3]); ++ ++- iBufferPtr += 4; ++- qBufferPtr += 4; ++- } +++ iBufferPtr += 4; +++ qBufferPtr += 4; +++ } ++ ++- number = quarterPoints * 4; ++- complexVectorPtr = (int16_t*)(&complexVector[number]); ++- for(; number < num_points; number++){ ++- *complexVectorPtr++ = (int16_t)rintf(*iBufferPtr++ * scalar); ++- *complexVectorPtr++ = (int16_t)rintf(*qBufferPtr++ * scalar); ++- } +++ number = quarterPoints * 4; +++ complexVectorPtr = (int16_t*)(&complexVector[number]); +++ for (; number < num_points; number++) { +++ *complexVectorPtr++ = (int16_t)rintf(*iBufferPtr++ * scalar); +++ *complexVectorPtr++ = (int16_t)rintf(*qBufferPtr++ * scalar); +++ } ++ } ++ #endif /* LV_HAVE_SSE */ ++ ++ ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void ++-volk_32f_x2_s32f_interleave_16ic_generic(lv_16sc_t* complexVector, const float* iBuffer, ++- const float* qBuffer, const float scalar, unsigned int num_points) +++static inline void volk_32f_x2_s32f_interleave_16ic_generic(lv_16sc_t* complexVector, +++ const float* iBuffer, +++ const float* qBuffer, +++ const float scalar, +++ unsigned int num_points) ++ { ++- int16_t* complexVectorPtr = (int16_t*)complexVector; ++- const float* iBufferPtr = iBuffer; ++- const float* qBufferPtr = qBuffer; ++- unsigned int number = 0; ++- ++- for(number = 0; number < num_points; number++){ ++- *complexVectorPtr++ = (int16_t)rintf(*iBufferPtr++ * scalar); ++- *complexVectorPtr++ = (int16_t)rintf(*qBufferPtr++ * scalar); ++- } +++ int16_t* complexVectorPtr = (int16_t*)complexVector; +++ const float* iBufferPtr = iBuffer; +++ const float* qBufferPtr = qBuffer; +++ unsigned int number = 0; +++ +++ for (number = 0; number < num_points; number++) { +++ *complexVectorPtr++ = (int16_t)rintf(*iBufferPtr++ * scalar); +++ *complexVectorPtr++ = (int16_t)rintf(*qBufferPtr++ * scalar); +++ } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++@@ -272,60 +280,62 @@ volk_32f_x2_s32f_interleave_16ic_generic(lv_16sc_t* complexVector, const float* ++ #ifndef INCLUDED_volk_32f_x2_s32f_interleave_16ic_u_H ++ #define INCLUDED_volk_32f_x2_s32f_interleave_16ic_u_H ++ ++-#include ++ #include ++ #include +++#include ++ ++ #ifdef LV_HAVE_AVX2 ++ #include ++ ++-static inline void ++-volk_32f_x2_s32f_interleave_16ic_u_avx2(lv_16sc_t* complexVector, const float* iBuffer, ++- const float* qBuffer, const float scalar, unsigned int num_points) +++static inline void volk_32f_x2_s32f_interleave_16ic_u_avx2(lv_16sc_t* complexVector, +++ const float* iBuffer, +++ const float* qBuffer, +++ const float scalar, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const float* iBufferPtr = iBuffer; ++- const float* qBufferPtr = qBuffer; +++ unsigned int number = 0; +++ const float* iBufferPtr = iBuffer; +++ const float* qBufferPtr = qBuffer; ++ ++- __m256 vScalar = _mm256_set1_ps(scalar); +++ __m256 vScalar = _mm256_set1_ps(scalar); ++ ++- const 
unsigned int eighthPoints = num_points / 8; +++ const unsigned int eighthPoints = num_points / 8; ++ ++- __m256 iValue, qValue, cplxValue1, cplxValue2; ++- __m256i intValue1, intValue2; +++ __m256 iValue, qValue, cplxValue1, cplxValue2; +++ __m256i intValue1, intValue2; ++ ++- int16_t* complexVectorPtr = (int16_t*)complexVector; +++ int16_t* complexVectorPtr = (int16_t*)complexVector; ++ ++- for(;number < eighthPoints; number++){ ++- iValue = _mm256_loadu_ps(iBufferPtr); ++- qValue = _mm256_loadu_ps(qBufferPtr); +++ for (; number < eighthPoints; number++) { +++ iValue = _mm256_loadu_ps(iBufferPtr); +++ qValue = _mm256_loadu_ps(qBufferPtr); ++ ++- // Interleaves the lower two values in the i and q variables into one buffer ++- cplxValue1 = _mm256_unpacklo_ps(iValue, qValue); ++- cplxValue1 = _mm256_mul_ps(cplxValue1, vScalar); +++ // Interleaves the lower two values in the i and q variables into one buffer +++ cplxValue1 = _mm256_unpacklo_ps(iValue, qValue); +++ cplxValue1 = _mm256_mul_ps(cplxValue1, vScalar); ++ ++- // Interleaves the upper two values in the i and q variables into one buffer ++- cplxValue2 = _mm256_unpackhi_ps(iValue, qValue); ++- cplxValue2 = _mm256_mul_ps(cplxValue2, vScalar); +++ // Interleaves the upper two values in the i and q variables into one buffer +++ cplxValue2 = _mm256_unpackhi_ps(iValue, qValue); +++ cplxValue2 = _mm256_mul_ps(cplxValue2, vScalar); ++ ++- intValue1 = _mm256_cvtps_epi32(cplxValue1); ++- intValue2 = _mm256_cvtps_epi32(cplxValue2); +++ intValue1 = _mm256_cvtps_epi32(cplxValue1); +++ intValue2 = _mm256_cvtps_epi32(cplxValue2); ++ ++- intValue1 = _mm256_packs_epi32(intValue1, intValue2); +++ intValue1 = _mm256_packs_epi32(intValue1, intValue2); ++ ++- _mm256_storeu_si256((__m256i*)complexVectorPtr, intValue1); ++- complexVectorPtr += 16; +++ _mm256_storeu_si256((__m256i*)complexVectorPtr, intValue1); +++ complexVectorPtr += 16; ++ ++- iBufferPtr += 8; ++- qBufferPtr += 8; ++- } +++ iBufferPtr += 8; +++ qBufferPtr += 8; +++ } ++ ++- number = eighthPoints * 8; ++- complexVectorPtr = (int16_t*)(&complexVector[number]); ++- for(; number < num_points; number++){ ++- *complexVectorPtr++ = (int16_t)rintf(*iBufferPtr++ * scalar); ++- *complexVectorPtr++ = (int16_t)rintf(*qBufferPtr++ * scalar); ++- } +++ number = eighthPoints * 8; +++ complexVectorPtr = (int16_t*)(&complexVector[number]); +++ for (; number < num_points; number++) { +++ *complexVectorPtr++ = (int16_t)rintf(*iBufferPtr++ * scalar); +++ *complexVectorPtr++ = (int16_t)rintf(*qBufferPtr++ * scalar); +++ } ++ } ++ #endif /* LV_HAVE_AVX2 */ ++ ++diff --git a/kernels/volk/volk_32f_x2_subtract_32f.h b/kernels/volk/volk_32f_x2_subtract_32f.h ++index bdfa0a1..359974c 100644 ++--- a/kernels/volk/volk_32f_x2_subtract_32f.h +++++ b/kernels/volk/volk_32f_x2_subtract_32f.h ++@@ -31,8 +31,8 @@ ++ * ++ * Dispatcher Prototype ++ * \code ++- * void volk_32f_x2_subtract_32f(float* cVector, const float* aVector, const float* bVector, unsigned int num_points) ++- * \endcode +++ * void volk_32f_x2_subtract_32f(float* cVector, const float* aVector, const float* +++ * bVector, unsigned int num_points) \endcode ++ * ++ * \b Inputs ++ * \li aVector: The initial vector. 
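For reference, a minimal usage sketch of the volk_32f_x2_subtract_32f dispatcher documented above follows. It assumes only the public prototype shown in this header plus the volk_malloc / volk_get_alignment / volk_free allocation helpers that other kernel headers in this diff already use in their \code examples; the buffer size and sample values are illustrative, not taken from upstream.

#include <volk/volk.h>
#include <stdio.h>

int main(void)
{
    const unsigned int num_points = 1024;

    /* volk_malloc returns memory aligned for the widest SIMD load on this
     * machine, so the aligned (_a) kernels are eligible for dispatch. */
    float* a = (float*)volk_malloc(sizeof(float) * num_points, volk_get_alignment());
    float* b = (float*)volk_malloc(sizeof(float) * num_points, volk_get_alignment());
    float* c = (float*)volk_malloc(sizeof(float) * num_points, volk_get_alignment());

    for (unsigned int i = 0; i < num_points; i++) {
        a[i] = (float)i;        /* minuend */
        b[i] = 0.5f * (float)i; /* subtrahend */
    }

    /* c[i] = a[i] - b[i]; the dispatcher selects the best available
     * implementation (AVX512F, AVX, SSE, NEON, ORC or generic). */
    volk_32f_x2_subtract_32f(c, a, b, num_points);

    printf("c[10] = %f\n", c[10]); /* expected 5.0 */

    volk_free(a);
    volk_free(b);
    volk_free(c);
    return 0;
}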
++@@ -77,126 +77,130 @@ ++ #ifdef LV_HAVE_AVX512F ++ #include ++ ++-static inline void ++-volk_32f_x2_subtract_32f_a_avx512f(float* cVector, const float* aVector, ++- const float* bVector, unsigned int num_points) +++static inline void volk_32f_x2_subtract_32f_a_avx512f(float* cVector, +++ const float* aVector, +++ const float* bVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int sixteenthPoints = num_points / 16; +++ unsigned int number = 0; +++ const unsigned int sixteenthPoints = num_points / 16; ++ ++- float* cPtr = cVector; ++- const float* aPtr = aVector; ++- const float* bPtr = bVector; +++ float* cPtr = cVector; +++ const float* aPtr = aVector; +++ const float* bPtr = bVector; ++ ++- __m512 aVal, bVal, cVal; ++- for(;number < sixteenthPoints; number++){ +++ __m512 aVal, bVal, cVal; +++ for (; number < sixteenthPoints; number++) { ++ ++- aVal = _mm512_load_ps(aPtr); ++- bVal = _mm512_load_ps(bPtr); +++ aVal = _mm512_load_ps(aPtr); +++ bVal = _mm512_load_ps(bPtr); ++ ++- cVal = _mm512_sub_ps(aVal, bVal); +++ cVal = _mm512_sub_ps(aVal, bVal); ++ ++- _mm512_store_ps(cPtr,cVal); // Store the results back into the C container +++ _mm512_store_ps(cPtr, cVal); // Store the results back into the C container ++ ++- aPtr += 16; ++- bPtr += 16; ++- cPtr += 16; ++- } +++ aPtr += 16; +++ bPtr += 16; +++ cPtr += 16; +++ } ++ ++- number = sixteenthPoints *16; ++- for(;number < num_points; number++){ ++- *cPtr++ = (*aPtr++) - (*bPtr++); ++- } +++ number = sixteenthPoints * 16; +++ for (; number < num_points; number++) { +++ *cPtr++ = (*aPtr++) - (*bPtr++); +++ } ++ } ++ #endif /* LV_HAVE_AVX512F */ ++ ++ #ifdef LV_HAVE_AVX ++ #include ++ ++-static inline void ++-volk_32f_x2_subtract_32f_a_avx(float* cVector, const float* aVector, ++- const float* bVector, unsigned int num_points) +++static inline void volk_32f_x2_subtract_32f_a_avx(float* cVector, +++ const float* aVector, +++ const float* bVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int eighthPoints = num_points / 8; +++ unsigned int number = 0; +++ const unsigned int eighthPoints = num_points / 8; ++ ++- float* cPtr = cVector; ++- const float* aPtr = aVector; ++- const float* bPtr = bVector; +++ float* cPtr = cVector; +++ const float* aPtr = aVector; +++ const float* bPtr = bVector; ++ ++- __m256 aVal, bVal, cVal; ++- for(;number < eighthPoints; number++){ +++ __m256 aVal, bVal, cVal; +++ for (; number < eighthPoints; number++) { ++ ++- aVal = _mm256_load_ps(aPtr); ++- bVal = _mm256_load_ps(bPtr); +++ aVal = _mm256_load_ps(aPtr); +++ bVal = _mm256_load_ps(bPtr); ++ ++- cVal = _mm256_sub_ps(aVal, bVal); +++ cVal = _mm256_sub_ps(aVal, bVal); ++ ++- _mm256_store_ps(cPtr,cVal); // Store the results back into the C container +++ _mm256_store_ps(cPtr, cVal); // Store the results back into the C container ++ ++- aPtr += 8; ++- bPtr += 8; ++- cPtr += 8; ++- } +++ aPtr += 8; +++ bPtr += 8; +++ cPtr += 8; +++ } ++ ++- number = eighthPoints * 8; ++- for(;number < num_points; number++){ ++- *cPtr++ = (*aPtr++) - (*bPtr++); ++- } +++ number = eighthPoints * 8; +++ for (; number < num_points; number++) { +++ *cPtr++ = (*aPtr++) - (*bPtr++); +++ } ++ } ++ #endif /* LV_HAVE_AVX */ ++ ++ #ifdef LV_HAVE_SSE ++ #include ++ ++-static inline void ++-volk_32f_x2_subtract_32f_a_sse(float* cVector, const float* aVector, ++- const float* bVector, unsigned int num_points) +++static inline void volk_32f_x2_subtract_32f_a_sse(float* cVector, +++ const float* aVector, +++ const float* 
bVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int quarterPoints = num_points / 4; +++ unsigned int number = 0; +++ const unsigned int quarterPoints = num_points / 4; ++ ++- float* cPtr = cVector; ++- const float* aPtr = aVector; ++- const float* bPtr = bVector; +++ float* cPtr = cVector; +++ const float* aPtr = aVector; +++ const float* bPtr = bVector; ++ ++- __m128 aVal, bVal, cVal; ++- for(;number < quarterPoints; number++){ +++ __m128 aVal, bVal, cVal; +++ for (; number < quarterPoints; number++) { ++ ++- aVal = _mm_load_ps(aPtr); ++- bVal = _mm_load_ps(bPtr); +++ aVal = _mm_load_ps(aPtr); +++ bVal = _mm_load_ps(bPtr); ++ ++- cVal = _mm_sub_ps(aVal, bVal); +++ cVal = _mm_sub_ps(aVal, bVal); ++ ++- _mm_store_ps(cPtr,cVal); // Store the results back into the C container +++ _mm_store_ps(cPtr, cVal); // Store the results back into the C container ++ ++- aPtr += 4; ++- bPtr += 4; ++- cPtr += 4; ++- } +++ aPtr += 4; +++ bPtr += 4; +++ cPtr += 4; +++ } ++ ++- number = quarterPoints * 4; ++- for(;number < num_points; number++){ ++- *cPtr++ = (*aPtr++) - (*bPtr++); ++- } +++ number = quarterPoints * 4; +++ for (; number < num_points; number++) { +++ *cPtr++ = (*aPtr++) - (*bPtr++); +++ } ++ } ++ #endif /* LV_HAVE_SSE */ ++ ++ ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void ++-volk_32f_x2_subtract_32f_generic(float* cVector, const float* aVector, ++- const float* bVector, unsigned int num_points) +++static inline void volk_32f_x2_subtract_32f_generic(float* cVector, +++ const float* aVector, +++ const float* bVector, +++ unsigned int num_points) ++ { ++- float* cPtr = cVector; ++- const float* aPtr = aVector; ++- const float* bPtr = bVector; ++- unsigned int number = 0; ++- ++- for(number = 0; number < num_points; number++){ ++- *cPtr++ = (*aPtr++) - (*bPtr++); ++- } +++ float* cPtr = cVector; +++ const float* aPtr = aVector; +++ const float* bPtr = bVector; +++ unsigned int number = 0; +++ +++ for (number = 0; number < num_points; number++) { +++ *cPtr++ = (*aPtr++) - (*bPtr++); +++ } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++@@ -204,45 +208,48 @@ volk_32f_x2_subtract_32f_generic(float* cVector, const float* aVector, ++ #ifdef LV_HAVE_NEON ++ #include ++ ++-static inline void ++-volk_32f_x2_subtract_32f_neon(float* cVector, const float* aVector, ++- const float* bVector, unsigned int num_points) +++static inline void volk_32f_x2_subtract_32f_neon(float* cVector, +++ const float* aVector, +++ const float* bVector, +++ unsigned int num_points) ++ { ++- float* cPtr = cVector; ++- const float* aPtr = aVector; ++- const float* bPtr = bVector; ++- unsigned int number = 0; ++- unsigned int quarter_points = num_points / 4; ++- ++- float32x4_t a_vec, b_vec, c_vec; ++- ++- for(number = 0; number < quarter_points; number++){ ++- a_vec = vld1q_f32(aPtr); ++- b_vec = vld1q_f32(bPtr); ++- c_vec = vsubq_f32(a_vec, b_vec); ++- vst1q_f32(cPtr, c_vec); ++- aPtr += 4; ++- bPtr += 4; ++- cPtr += 4; ++- } ++- ++- for(number = quarter_points * 4; number < num_points; number++){ ++- *cPtr++ = (*aPtr++) - (*bPtr++); ++- } +++ float* cPtr = cVector; +++ const float* aPtr = aVector; +++ const float* bPtr = bVector; +++ unsigned int number = 0; +++ unsigned int quarter_points = num_points / 4; +++ +++ float32x4_t a_vec, b_vec, c_vec; +++ +++ for (number = 0; number < quarter_points; number++) { +++ a_vec = vld1q_f32(aPtr); +++ b_vec = vld1q_f32(bPtr); +++ c_vec = vsubq_f32(a_vec, b_vec); +++ vst1q_f32(cPtr, c_vec); +++ aPtr += 4; +++ bPtr += 4; +++ cPtr += 4; +++ } +++ +++ 
for (number = quarter_points * 4; number < num_points; number++) { +++ *cPtr++ = (*aPtr++) - (*bPtr++); +++ } ++ } ++ #endif /* LV_HAVE_NEON */ ++ ++ ++ #ifdef LV_HAVE_ORC ++-extern void ++-volk_32f_x2_subtract_32f_a_orc_impl(float* cVector, const float* aVector, ++- const float* bVector, unsigned int num_points); ++- ++-static inline void ++-volk_32f_x2_subtract_32f_u_orc(float* cVector, const float* aVector, ++- const float* bVector, unsigned int num_points) +++extern void volk_32f_x2_subtract_32f_a_orc_impl(float* cVector, +++ const float* aVector, +++ const float* bVector, +++ unsigned int num_points); +++ +++static inline void volk_32f_x2_subtract_32f_u_orc(float* cVector, +++ const float* aVector, +++ const float* bVector, +++ unsigned int num_points) ++ { ++- volk_32f_x2_subtract_32f_a_orc_impl(cVector, aVector, bVector, num_points); +++ volk_32f_x2_subtract_32f_a_orc_impl(cVector, aVector, bVector, num_points); ++ } ++ #endif /* LV_HAVE_ORC */ ++ ++@@ -259,36 +266,37 @@ volk_32f_x2_subtract_32f_u_orc(float* cVector, const float* aVector, ++ #ifdef LV_HAVE_AVX512F ++ #include ++ ++-static inline void ++-volk_32f_x2_subtract_32f_u_avx512f(float* cVector, const float* aVector, ++- const float* bVector, unsigned int num_points) +++static inline void volk_32f_x2_subtract_32f_u_avx512f(float* cVector, +++ const float* aVector, +++ const float* bVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int sixteenthPoints = num_points / 16; +++ unsigned int number = 0; +++ const unsigned int sixteenthPoints = num_points / 16; ++ ++- float* cPtr = cVector; ++- const float* aPtr = aVector; ++- const float* bPtr = bVector; +++ float* cPtr = cVector; +++ const float* aPtr = aVector; +++ const float* bPtr = bVector; ++ ++- __m512 aVal, bVal, cVal; ++- for(;number < sixteenthPoints; number++){ +++ __m512 aVal, bVal, cVal; +++ for (; number < sixteenthPoints; number++) { ++ ++- aVal = _mm512_loadu_ps(aPtr); ++- bVal = _mm512_loadu_ps(bPtr); +++ aVal = _mm512_loadu_ps(aPtr); +++ bVal = _mm512_loadu_ps(bPtr); ++ ++- cVal = _mm512_sub_ps(aVal, bVal); +++ cVal = _mm512_sub_ps(aVal, bVal); ++ ++- _mm512_storeu_ps(cPtr,cVal); // Store the results back into the C container +++ _mm512_storeu_ps(cPtr, cVal); // Store the results back into the C container ++ ++- aPtr += 16; ++- bPtr += 16; ++- cPtr += 16; ++- } +++ aPtr += 16; +++ bPtr += 16; +++ cPtr += 16; +++ } ++ ++- number = sixteenthPoints *16; ++- for(;number < num_points; number++){ ++- *cPtr++ = (*aPtr++) - (*bPtr++); ++- } +++ number = sixteenthPoints * 16; +++ for (; number < num_points; number++) { +++ *cPtr++ = (*aPtr++) - (*bPtr++); +++ } ++ } ++ #endif /* LV_HAVE_AVX512F */ ++ ++@@ -296,36 +304,37 @@ volk_32f_x2_subtract_32f_u_avx512f(float* cVector, const float* aVector, ++ #ifdef LV_HAVE_AVX ++ #include ++ ++-static inline void ++-volk_32f_x2_subtract_32f_u_avx(float* cVector, const float* aVector, ++- const float* bVector, unsigned int num_points) +++static inline void volk_32f_x2_subtract_32f_u_avx(float* cVector, +++ const float* aVector, +++ const float* bVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int eighthPoints = num_points / 8; +++ unsigned int number = 0; +++ const unsigned int eighthPoints = num_points / 8; ++ ++- float* cPtr = cVector; ++- const float* aPtr = aVector; ++- const float* bPtr = bVector; +++ float* cPtr = cVector; +++ const float* aPtr = aVector; +++ const float* bPtr = bVector; ++ ++- __m256 aVal, bVal, cVal; ++- for(;number < 
eighthPoints; number++){ +++ __m256 aVal, bVal, cVal; +++ for (; number < eighthPoints; number++) { ++ ++- aVal = _mm256_loadu_ps(aPtr); ++- bVal = _mm256_loadu_ps(bPtr); +++ aVal = _mm256_loadu_ps(aPtr); +++ bVal = _mm256_loadu_ps(bPtr); ++ ++- cVal = _mm256_sub_ps(aVal, bVal); +++ cVal = _mm256_sub_ps(aVal, bVal); ++ ++- _mm256_storeu_ps(cPtr,cVal); // Store the results back into the C container +++ _mm256_storeu_ps(cPtr, cVal); // Store the results back into the C container ++ ++- aPtr += 8; ++- bPtr += 8; ++- cPtr += 8; ++- } +++ aPtr += 8; +++ bPtr += 8; +++ cPtr += 8; +++ } ++ ++- number = eighthPoints * 8; ++- for(;number < num_points; number++){ ++- *cPtr++ = (*aPtr++) - (*bPtr++); ++- } +++ number = eighthPoints * 8; +++ for (; number < num_points; number++) { +++ *cPtr++ = (*aPtr++) - (*bPtr++); +++ } ++ } ++ #endif /* LV_HAVE_AVX */ ++ ++diff --git a/kernels/volk/volk_32f_x3_sum_of_poly_32f.h b/kernels/volk/volk_32f_x3_sum_of_poly_32f.h ++index e74a385..b0b1466 100644 ++--- a/kernels/volk/volk_32f_x3_sum_of_poly_32f.h +++++ b/kernels/volk/volk_32f_x3_sum_of_poly_32f.h ++@@ -30,12 +30,13 @@ ++ * multiply by the rectangle/bin width. ++ * ++ * Expressed as a formula, this function calculates ++- * \f$ \sum f(x) = \sum (c_0 + c_1 \cdot x + c_2 \cdot x^2 + c_3 \cdot x^3 + c_4 \cdot x^4)\f$ +++ * \f$ \sum f(x) = \sum (c_0 + c_1 \cdot x + c_2 \cdot x^2 + c_3 \cdot x^3 + c_4 \cdot +++ * x^4)\f$ ++ * ++ * Dispatcher Prototype ++ * \code ++- * void volk_32f_x3_sum_of_poly_32f(float* target, float* src0, float* center_point_array, float* cutoff, unsigned int num_points) ++- * \endcode +++ * void volk_32f_x3_sum_of_poly_32f(float* target, float* src0, float* center_point_array, +++ * float* cutoff, unsigned int num_points) \endcode ++ * ++ * \b Inputs ++ * \li src0: x values ++@@ -53,9 +54,10 @@ ++ * \code ++ * int npoints = 4096; ++ * float* coefficients = (float*)volk_malloc(sizeof(float) * 5, volk_get_alignment()); ++- * float* input = (float*)volk_malloc(sizeof(float) * npoints, volk_get_alignment()); ++- * float* result = (float*)volk_malloc(sizeof(float), volk_get_alignment()); ++- * float* cutoff = (float*)volk_malloc(sizeof(float), volk_get_alignment()); +++ * float* input = (float*)volk_malloc(sizeof(float) * npoints, +++ * volk_get_alignment()); float* result = (float*)volk_malloc(sizeof(float), +++ * volk_get_alignment()); float* cutoff = (float*)volk_malloc(sizeof(float), +++ * volk_get_alignment()); ++ * // load precomputed Taylor series coefficients ++ * coefficients[0] = 4.48168907033806f; // c1 ++ * coefficients[1] = coefficients[0] * 0.5f; // c2 ++@@ -82,288 +84,291 @@ ++ #ifndef INCLUDED_volk_32f_x3_sum_of_poly_32f_a_H ++ #define INCLUDED_volk_32f_x3_sum_of_poly_32f_a_H ++ ++-#include ++-#include ++-#include +++#include +++#include +++#include ++ ++ #ifndef MAX ++-#define MAX(X,Y) ((X) > (Y)?(X):(Y)) +++#define MAX(X, Y) ((X) > (Y) ? 
(X) : (Y)) ++ #endif ++ ++ #ifdef LV_HAVE_SSE3 ++-#include ++-#include ++- ++-static inline void ++-volk_32f_x3_sum_of_poly_32f_a_sse3(float* target, float* src0, float* center_point_array, ++- float* cutoff, unsigned int num_points) +++#include +++#include +++ +++static inline void volk_32f_x3_sum_of_poly_32f_a_sse3(float* target, +++ float* src0, +++ float* center_point_array, +++ float* cutoff, +++ unsigned int num_points) ++ { ++- float result = 0.0f; ++- float fst = 0.0f; ++- float sq = 0.0f; ++- float thrd = 0.0f; ++- float frth = 0.0f; ++- ++- __m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10; ++- ++- xmm9 = _mm_setzero_ps(); ++- xmm1 = _mm_setzero_ps(); ++- xmm0 = _mm_load1_ps(¢er_point_array[0]); ++- xmm6 = _mm_load1_ps(¢er_point_array[1]); ++- xmm7 = _mm_load1_ps(¢er_point_array[2]); ++- xmm8 = _mm_load1_ps(¢er_point_array[3]); ++- xmm10 = _mm_load1_ps(cutoff); ++- ++- int bound = num_points/8; ++- int leftovers = num_points - 8*bound; ++- int i = 0; ++- for(; i < bound; ++i) { ++- // 1st ++- xmm2 = _mm_load_ps(src0); ++- xmm2 = _mm_max_ps(xmm10, xmm2); ++- xmm3 = _mm_mul_ps(xmm2, xmm2); ++- xmm4 = _mm_mul_ps(xmm2, xmm3); ++- xmm5 = _mm_mul_ps(xmm3, xmm3); ++- ++- xmm2 = _mm_mul_ps(xmm2, xmm0); ++- xmm3 = _mm_mul_ps(xmm3, xmm6); ++- xmm4 = _mm_mul_ps(xmm4, xmm7); ++- xmm5 = _mm_mul_ps(xmm5, xmm8); ++- ++- xmm2 = _mm_add_ps(xmm2, xmm3); ++- xmm3 = _mm_add_ps(xmm4, xmm5); ++- ++- src0 += 4; ++- ++- xmm9 = _mm_add_ps(xmm2, xmm9); ++- xmm9 = _mm_add_ps(xmm3, xmm9); ++- ++- // 2nd ++- xmm2 = _mm_load_ps(src0); ++- xmm2 = _mm_max_ps(xmm10, xmm2); ++- xmm3 = _mm_mul_ps(xmm2, xmm2); ++- xmm4 = _mm_mul_ps(xmm2, xmm3); ++- xmm5 = _mm_mul_ps(xmm3, xmm3); ++- ++- xmm2 = _mm_mul_ps(xmm2, xmm0); ++- xmm3 = _mm_mul_ps(xmm3, xmm6); ++- xmm4 = _mm_mul_ps(xmm4, xmm7); ++- xmm5 = _mm_mul_ps(xmm5, xmm8); ++- ++- xmm2 = _mm_add_ps(xmm2, xmm3); ++- xmm3 = _mm_add_ps(xmm4, xmm5); ++- ++- src0 += 4; ++- ++- xmm1 = _mm_add_ps(xmm2, xmm1); ++- xmm1 = _mm_add_ps(xmm3, xmm1); ++- } ++- xmm2 = _mm_hadd_ps(xmm9, xmm1); ++- xmm3 = _mm_hadd_ps(xmm2, xmm2); ++- xmm4 = _mm_hadd_ps(xmm3, xmm3); ++- _mm_store_ss(&result, xmm4); ++- ++- for(i = 0; i < leftovers; ++i) { ++- fst = *src0++; ++- fst = MAX(fst, *cutoff); ++- sq = fst * fst; ++- thrd = fst * sq; ++- frth = sq * sq; ++- result += (center_point_array[0] * fst + ++- center_point_array[1] * sq + ++- center_point_array[2] * thrd + ++- center_point_array[3] * frth); ++- } ++- ++- result += (float)(num_points) * center_point_array[4]; ++- *target = result; +++ float result = 0.0f; +++ float fst = 0.0f; +++ float sq = 0.0f; +++ float thrd = 0.0f; +++ float frth = 0.0f; +++ +++ __m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10; +++ +++ xmm9 = _mm_setzero_ps(); +++ xmm1 = _mm_setzero_ps(); +++ xmm0 = _mm_load1_ps(¢er_point_array[0]); +++ xmm6 = _mm_load1_ps(¢er_point_array[1]); +++ xmm7 = _mm_load1_ps(¢er_point_array[2]); +++ xmm8 = _mm_load1_ps(¢er_point_array[3]); +++ xmm10 = _mm_load1_ps(cutoff); +++ +++ int bound = num_points / 8; +++ int leftovers = num_points - 8 * bound; +++ int i = 0; +++ for (; i < bound; ++i) { +++ // 1st +++ xmm2 = _mm_load_ps(src0); +++ xmm2 = _mm_max_ps(xmm10, xmm2); +++ xmm3 = _mm_mul_ps(xmm2, xmm2); +++ xmm4 = _mm_mul_ps(xmm2, xmm3); +++ xmm5 = _mm_mul_ps(xmm3, xmm3); +++ +++ xmm2 = _mm_mul_ps(xmm2, xmm0); +++ xmm3 = _mm_mul_ps(xmm3, xmm6); +++ xmm4 = _mm_mul_ps(xmm4, xmm7); +++ xmm5 = _mm_mul_ps(xmm5, xmm8); +++ +++ xmm2 = _mm_add_ps(xmm2, xmm3); +++ xmm3 = _mm_add_ps(xmm4, xmm5); +++ 
+++ src0 += 4; +++ +++ xmm9 = _mm_add_ps(xmm2, xmm9); +++ xmm9 = _mm_add_ps(xmm3, xmm9); +++ +++ // 2nd +++ xmm2 = _mm_load_ps(src0); +++ xmm2 = _mm_max_ps(xmm10, xmm2); +++ xmm3 = _mm_mul_ps(xmm2, xmm2); +++ xmm4 = _mm_mul_ps(xmm2, xmm3); +++ xmm5 = _mm_mul_ps(xmm3, xmm3); +++ +++ xmm2 = _mm_mul_ps(xmm2, xmm0); +++ xmm3 = _mm_mul_ps(xmm3, xmm6); +++ xmm4 = _mm_mul_ps(xmm4, xmm7); +++ xmm5 = _mm_mul_ps(xmm5, xmm8); +++ +++ xmm2 = _mm_add_ps(xmm2, xmm3); +++ xmm3 = _mm_add_ps(xmm4, xmm5); +++ +++ src0 += 4; +++ +++ xmm1 = _mm_add_ps(xmm2, xmm1); +++ xmm1 = _mm_add_ps(xmm3, xmm1); +++ } +++ xmm2 = _mm_hadd_ps(xmm9, xmm1); +++ xmm3 = _mm_hadd_ps(xmm2, xmm2); +++ xmm4 = _mm_hadd_ps(xmm3, xmm3); +++ _mm_store_ss(&result, xmm4); +++ +++ for (i = 0; i < leftovers; ++i) { +++ fst = *src0++; +++ fst = MAX(fst, *cutoff); +++ sq = fst * fst; +++ thrd = fst * sq; +++ frth = sq * sq; +++ result += (center_point_array[0] * fst + center_point_array[1] * sq + +++ center_point_array[2] * thrd + center_point_array[3] * frth); +++ } +++ +++ result += (float)(num_points)*center_point_array[4]; +++ *target = result; ++ } ++ ++ ++ #endif /*LV_HAVE_SSE3*/ ++ ++ #if LV_HAVE_AVX && LV_HAVE_FMA ++-#include +++#include ++ ++-static inline void ++-volk_32f_x3_sum_of_poly_32f_a_avx2_fma(float* target, float* src0, float* center_point_array, ++- float* cutoff, unsigned int num_points) +++static inline void volk_32f_x3_sum_of_poly_32f_a_avx2_fma(float* target, +++ float* src0, +++ float* center_point_array, +++ float* cutoff, +++ unsigned int num_points) ++ { ++- const unsigned int eighth_points = num_points / 8; ++- float fst = 0.0; ++- float sq = 0.0; ++- float thrd = 0.0; ++- float frth = 0.0; ++- ++- __m256 cpa0, cpa1, cpa2, cpa3, cutoff_vec; ++- __m256 target_vec; ++- __m256 x_to_1, x_to_2, x_to_3, x_to_4; ++- ++- cpa0 = _mm256_set1_ps(center_point_array[0]); ++- cpa1 = _mm256_set1_ps(center_point_array[1]); ++- cpa2 = _mm256_set1_ps(center_point_array[2]); ++- cpa3 = _mm256_set1_ps(center_point_array[3]); ++- cutoff_vec = _mm256_set1_ps(*cutoff); ++- target_vec = _mm256_setzero_ps(); ++- ++- unsigned int i; ++- ++- for(i = 0; i < eighth_points; ++i) { ++- x_to_1 = _mm256_load_ps(src0); ++- x_to_1 = _mm256_max_ps(x_to_1, cutoff_vec); ++- x_to_2 = _mm256_mul_ps(x_to_1, x_to_1); // x^2 ++- x_to_3 = _mm256_mul_ps(x_to_1, x_to_2); // x^3 ++- // x^1 * x^3 is slightly faster than x^2 * x^2 ++- x_to_4 = _mm256_mul_ps(x_to_1, x_to_3); // x^4 ++- ++- x_to_2 = _mm256_mul_ps(x_to_2, cpa1); // cpa[1] * x^2 ++- x_to_4 = _mm256_mul_ps(x_to_4, cpa3); // cpa[3] * x^4 ++- ++- x_to_1 = _mm256_fmadd_ps(x_to_1, cpa0, x_to_2); ++- x_to_3 = _mm256_fmadd_ps(x_to_3, cpa2, x_to_4); ++- // this is slightly faster than result += (x_to_1 + x_to_3) ++- target_vec = _mm256_add_ps(x_to_1, target_vec); ++- target_vec = _mm256_add_ps(x_to_3, target_vec); ++- ++- src0 += 8; ++- } ++- ++- // the hadd for vector reduction has very very slight impact @ 50k iters ++- __VOLK_ATTR_ALIGNED(32) float temp_results[8]; ++- target_vec = _mm256_hadd_ps(target_vec, target_vec); // x0+x1 | x2+x3 | x0+x1 | x2+x3 || x4+x5 | x6+x7 | x4+x5 | x6+x7 ++- _mm256_store_ps(temp_results, target_vec); ++- *target = temp_results[0] + temp_results[1] + temp_results[4] + temp_results[5]; ++- ++- for(i = eighth_points*8; i < num_points; ++i) { ++- fst = *src0++; ++- fst = MAX(fst, *cutoff); ++- sq = fst * fst; ++- thrd = fst * sq; ++- frth = sq * sq; ++- *target += (center_point_array[0] * fst + ++- center_point_array[1] * sq + ++- center_point_array[2] * thrd + ++- 
center_point_array[3] * frth); ++- } ++- *target += (float)(num_points) * center_point_array[4]; +++ const unsigned int eighth_points = num_points / 8; +++ float fst = 0.0; +++ float sq = 0.0; +++ float thrd = 0.0; +++ float frth = 0.0; +++ +++ __m256 cpa0, cpa1, cpa2, cpa3, cutoff_vec; +++ __m256 target_vec; +++ __m256 x_to_1, x_to_2, x_to_3, x_to_4; +++ +++ cpa0 = _mm256_set1_ps(center_point_array[0]); +++ cpa1 = _mm256_set1_ps(center_point_array[1]); +++ cpa2 = _mm256_set1_ps(center_point_array[2]); +++ cpa3 = _mm256_set1_ps(center_point_array[3]); +++ cutoff_vec = _mm256_set1_ps(*cutoff); +++ target_vec = _mm256_setzero_ps(); +++ +++ unsigned int i; +++ +++ for (i = 0; i < eighth_points; ++i) { +++ x_to_1 = _mm256_load_ps(src0); +++ x_to_1 = _mm256_max_ps(x_to_1, cutoff_vec); +++ x_to_2 = _mm256_mul_ps(x_to_1, x_to_1); // x^2 +++ x_to_3 = _mm256_mul_ps(x_to_1, x_to_2); // x^3 +++ // x^1 * x^3 is slightly faster than x^2 * x^2 +++ x_to_4 = _mm256_mul_ps(x_to_1, x_to_3); // x^4 +++ +++ x_to_2 = _mm256_mul_ps(x_to_2, cpa1); // cpa[1] * x^2 +++ x_to_4 = _mm256_mul_ps(x_to_4, cpa3); // cpa[3] * x^4 +++ +++ x_to_1 = _mm256_fmadd_ps(x_to_1, cpa0, x_to_2); +++ x_to_3 = _mm256_fmadd_ps(x_to_3, cpa2, x_to_4); +++ // this is slightly faster than result += (x_to_1 + x_to_3) +++ target_vec = _mm256_add_ps(x_to_1, target_vec); +++ target_vec = _mm256_add_ps(x_to_3, target_vec); +++ +++ src0 += 8; +++ } +++ +++ // the hadd for vector reduction has very very slight impact @ 50k iters +++ __VOLK_ATTR_ALIGNED(32) float temp_results[8]; +++ target_vec = _mm256_hadd_ps( +++ target_vec, +++ target_vec); // x0+x1 | x2+x3 | x0+x1 | x2+x3 || x4+x5 | x6+x7 | x4+x5 | x6+x7 +++ _mm256_store_ps(temp_results, target_vec); +++ *target = temp_results[0] + temp_results[1] + temp_results[4] + temp_results[5]; +++ +++ for (i = eighth_points * 8; i < num_points; ++i) { +++ fst = *src0++; +++ fst = MAX(fst, *cutoff); +++ sq = fst * fst; +++ thrd = fst * sq; +++ frth = sq * sq; +++ *target += (center_point_array[0] * fst + center_point_array[1] * sq + +++ center_point_array[2] * thrd + center_point_array[3] * frth); +++ } +++ *target += (float)(num_points)*center_point_array[4]; ++ } ++ #endif // LV_HAVE_AVX && LV_HAVE_FMA ++ ++ #ifdef LV_HAVE_AVX ++-#include +++#include ++ ++-static inline void ++-volk_32f_x3_sum_of_poly_32f_a_avx(float* target, float* src0, float* center_point_array, ++- float* cutoff, unsigned int num_points) +++static inline void volk_32f_x3_sum_of_poly_32f_a_avx(float* target, +++ float* src0, +++ float* center_point_array, +++ float* cutoff, +++ unsigned int num_points) ++ { ++- const unsigned int eighth_points = num_points / 8; ++- float fst = 0.0; ++- float sq = 0.0; ++- float thrd = 0.0; ++- float frth = 0.0; ++- ++- __m256 cpa0, cpa1, cpa2, cpa3, cutoff_vec; ++- __m256 target_vec; ++- __m256 x_to_1, x_to_2, x_to_3, x_to_4; ++- ++- cpa0 = _mm256_set1_ps(center_point_array[0]); ++- cpa1 = _mm256_set1_ps(center_point_array[1]); ++- cpa2 = _mm256_set1_ps(center_point_array[2]); ++- cpa3 = _mm256_set1_ps(center_point_array[3]); ++- cutoff_vec = _mm256_set1_ps(*cutoff); ++- target_vec = _mm256_setzero_ps(); ++- ++- unsigned int i; ++- ++- for(i = 0; i < eighth_points; ++i) { ++- x_to_1 = _mm256_load_ps(src0); ++- x_to_1 = _mm256_max_ps(x_to_1, cutoff_vec); ++- x_to_2 = _mm256_mul_ps(x_to_1, x_to_1); // x^2 ++- x_to_3 = _mm256_mul_ps(x_to_1, x_to_2); // x^3 ++- // x^1 * x^3 is slightly faster than x^2 * x^2 ++- x_to_4 = _mm256_mul_ps(x_to_1, x_to_3); // x^4 ++- ++- x_to_1 = _mm256_mul_ps(x_to_1, cpa0); 
// cpa[0] * x^1 ++- x_to_2 = _mm256_mul_ps(x_to_2, cpa1); // cpa[1] * x^2 ++- x_to_3 = _mm256_mul_ps(x_to_3, cpa2); // cpa[2] * x^3 ++- x_to_4 = _mm256_mul_ps(x_to_4, cpa3); // cpa[3] * x^4 ++- ++- x_to_1 = _mm256_add_ps(x_to_1, x_to_2); ++- x_to_3 = _mm256_add_ps(x_to_3, x_to_4); ++- // this is slightly faster than result += (x_to_1 + x_to_3) ++- target_vec = _mm256_add_ps(x_to_1, target_vec); ++- target_vec = _mm256_add_ps(x_to_3, target_vec); ++- ++- src0 += 8; ++- } ++- ++- // the hadd for vector reduction has very very slight impact @ 50k iters ++- __VOLK_ATTR_ALIGNED(32) float temp_results[8]; ++- target_vec = _mm256_hadd_ps(target_vec, target_vec); // x0+x1 | x2+x3 | x0+x1 | x2+x3 || x4+x5 | x6+x7 | x4+x5 | x6+x7 ++- _mm256_store_ps(temp_results, target_vec); ++- *target = temp_results[0] + temp_results[1] + temp_results[4] + temp_results[5]; ++- ++- for(i = eighth_points*8; i < num_points; ++i) { ++- fst = *src0++; ++- fst = MAX(fst, *cutoff); ++- sq = fst * fst; ++- thrd = fst * sq; ++- frth = sq * sq; ++- *target += (center_point_array[0] * fst + ++- center_point_array[1] * sq + ++- center_point_array[2] * thrd + ++- center_point_array[3] * frth); ++- } ++- *target += (float)(num_points) * center_point_array[4]; +++ const unsigned int eighth_points = num_points / 8; +++ float fst = 0.0; +++ float sq = 0.0; +++ float thrd = 0.0; +++ float frth = 0.0; +++ +++ __m256 cpa0, cpa1, cpa2, cpa3, cutoff_vec; +++ __m256 target_vec; +++ __m256 x_to_1, x_to_2, x_to_3, x_to_4; +++ +++ cpa0 = _mm256_set1_ps(center_point_array[0]); +++ cpa1 = _mm256_set1_ps(center_point_array[1]); +++ cpa2 = _mm256_set1_ps(center_point_array[2]); +++ cpa3 = _mm256_set1_ps(center_point_array[3]); +++ cutoff_vec = _mm256_set1_ps(*cutoff); +++ target_vec = _mm256_setzero_ps(); +++ +++ unsigned int i; +++ +++ for (i = 0; i < eighth_points; ++i) { +++ x_to_1 = _mm256_load_ps(src0); +++ x_to_1 = _mm256_max_ps(x_to_1, cutoff_vec); +++ x_to_2 = _mm256_mul_ps(x_to_1, x_to_1); // x^2 +++ x_to_3 = _mm256_mul_ps(x_to_1, x_to_2); // x^3 +++ // x^1 * x^3 is slightly faster than x^2 * x^2 +++ x_to_4 = _mm256_mul_ps(x_to_1, x_to_3); // x^4 +++ +++ x_to_1 = _mm256_mul_ps(x_to_1, cpa0); // cpa[0] * x^1 +++ x_to_2 = _mm256_mul_ps(x_to_2, cpa1); // cpa[1] * x^2 +++ x_to_3 = _mm256_mul_ps(x_to_3, cpa2); // cpa[2] * x^3 +++ x_to_4 = _mm256_mul_ps(x_to_4, cpa3); // cpa[3] * x^4 +++ +++ x_to_1 = _mm256_add_ps(x_to_1, x_to_2); +++ x_to_3 = _mm256_add_ps(x_to_3, x_to_4); +++ // this is slightly faster than result += (x_to_1 + x_to_3) +++ target_vec = _mm256_add_ps(x_to_1, target_vec); +++ target_vec = _mm256_add_ps(x_to_3, target_vec); +++ +++ src0 += 8; +++ } +++ +++ // the hadd for vector reduction has very very slight impact @ 50k iters +++ __VOLK_ATTR_ALIGNED(32) float temp_results[8]; +++ target_vec = _mm256_hadd_ps( +++ target_vec, +++ target_vec); // x0+x1 | x2+x3 | x0+x1 | x2+x3 || x4+x5 | x6+x7 | x4+x5 | x6+x7 +++ _mm256_store_ps(temp_results, target_vec); +++ *target = temp_results[0] + temp_results[1] + temp_results[4] + temp_results[5]; +++ +++ for (i = eighth_points * 8; i < num_points; ++i) { +++ fst = *src0++; +++ fst = MAX(fst, *cutoff); +++ sq = fst * fst; +++ thrd = fst * sq; +++ frth = sq * sq; +++ *target += (center_point_array[0] * fst + center_point_array[1] * sq + +++ center_point_array[2] * thrd + center_point_array[3] * frth); +++ } +++ *target += (float)(num_points)*center_point_array[4]; ++ } ++ #endif // LV_HAVE_AVX ++ ++ ++- ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void 
++-volk_32f_x3_sum_of_poly_32f_generic(float* target, float* src0, float* center_point_array, ++- float* cutoff, unsigned int num_points) +++static inline void volk_32f_x3_sum_of_poly_32f_generic(float* target, +++ float* src0, +++ float* center_point_array, +++ float* cutoff, +++ unsigned int num_points) ++ { ++- const unsigned int eighth_points = num_points / 8; ++- ++- float result[8] = {0.0f,0.0f,0.0f,0.0f, 0.0f,0.0f,0.0f,0.0f}; ++- float fst = 0.0f; ++- float sq = 0.0f; ++- float thrd = 0.0f; ++- float frth = 0.0f; ++- ++- unsigned int i = 0; ++- unsigned int k = 0; ++- for(i = 0; i < eighth_points; ++i) { ++- for(k = 0; k < 8; ++k) { ++- fst = *src0++; ++- fst = MAX(fst, *cutoff); ++- sq = fst * fst; ++- thrd = fst * sq; ++- frth = fst * thrd; ++- result[k] += center_point_array[0] * fst + center_point_array[1] * sq; ++- result[k] += center_point_array[2] * thrd + center_point_array[3] * frth; +++ const unsigned int eighth_points = num_points / 8; +++ +++ float result[8] = { 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f }; +++ float fst = 0.0f; +++ float sq = 0.0f; +++ float thrd = 0.0f; +++ float frth = 0.0f; +++ +++ unsigned int i = 0; +++ unsigned int k = 0; +++ for (i = 0; i < eighth_points; ++i) { +++ for (k = 0; k < 8; ++k) { +++ fst = *src0++; +++ fst = MAX(fst, *cutoff); +++ sq = fst * fst; +++ thrd = fst * sq; +++ frth = fst * thrd; +++ result[k] += center_point_array[0] * fst + center_point_array[1] * sq; +++ result[k] += center_point_array[2] * thrd + center_point_array[3] * frth; +++ } ++ } ++- } ++- for(k = 0; k < 8; k+=2) ++- result[k] = result[k]+result[k+1]; ++- ++- *target = result[0] + result[2] + result[4] + result[6]; ++- ++- for(i = eighth_points*8; i < num_points; ++i) { ++- fst = *src0++; ++- fst = MAX(fst, *cutoff); ++- sq = fst * fst; ++- thrd = fst * sq; ++- frth = fst * thrd; ++- *target += (center_point_array[0] * fst + ++- center_point_array[1] * sq + ++- center_point_array[2] * thrd + ++- center_point_array[3] * frth); ++- } ++- *target += (float)(num_points) * center_point_array[4]; +++ for (k = 0; k < 8; k += 2) +++ result[k] = result[k] + result[k + 1]; +++ +++ *target = result[0] + result[2] + result[4] + result[6]; +++ +++ for (i = eighth_points * 8; i < num_points; ++i) { +++ fst = *src0++; +++ fst = MAX(fst, *cutoff); +++ sq = fst * fst; +++ thrd = fst * sq; +++ frth = fst * thrd; +++ *target += (center_point_array[0] * fst + center_point_array[1] * sq + +++ center_point_array[2] * thrd + center_point_array[3] * frth); +++ } +++ *target += (float)(num_points)*center_point_array[4]; ++ } ++ ++ #endif /*LV_HAVE_GENERIC*/ ++@@ -372,51 +377,52 @@ volk_32f_x3_sum_of_poly_32f_generic(float* target, float* src0, float* center_po ++ #include ++ ++ static inline void ++-volk_32f_x3_sum_of_poly_32f_a_neon(float* __restrict target, float* __restrict src0, +++volk_32f_x3_sum_of_poly_32f_a_neon(float* __restrict target, +++ float* __restrict src0, ++ float* __restrict center_point_array, ++- float* __restrict cutoff, unsigned int num_points) +++ float* __restrict cutoff, +++ unsigned int num_points) ++ { ++- unsigned int i; ++- float zero[4] = {0.0f, 0.0f, 0.0f, 0.0f }; ++- ++- float32x2_t x_to_1, x_to_2, x_to_3, x_to_4; ++- float32x2_t cutoff_vector; ++- float32x2x2_t x_low, x_high; ++- float32x4_t x_qvector, c_qvector, cpa_qvector; ++- float accumulator; ++- float res_accumulators[4]; ++- ++- c_qvector = vld1q_f32( zero ); ++- // load the cutoff in to a vector ++- cutoff_vector = vdup_n_f32( *cutoff ); ++- // ... 
center point array ++- cpa_qvector = vld1q_f32( center_point_array ); ++- ++- for(i=0; i < num_points; ++i) { ++- // load x (src0) ++- x_to_1 = vdup_n_f32( *src0++ ); ++- ++- // Get a vector of max(src0, cutoff) ++- x_to_1 = vmax_f32(x_to_1, cutoff_vector ); // x^1 ++- x_to_2 = vmul_f32(x_to_1, x_to_1); // x^2 ++- x_to_3 = vmul_f32(x_to_2, x_to_1); // x^3 ++- x_to_4 = vmul_f32(x_to_3, x_to_1); // x^4 ++- // zip up doubles to interleave ++- x_low = vzip_f32(x_to_1, x_to_2); // [x^2 | x^1 || x^2 | x^1] ++- x_high = vzip_f32(x_to_3, x_to_4); // [x^4 | x^3 || x^4 | x^3] ++- // float32x4_t vcombine_f32(float32x2_t low, float32x2_t high); // VMOV d0,d0 ++- x_qvector = vcombine_f32(x_low.val[0], x_high.val[0]); ++- // now we finally have [x^4 | x^3 | x^2 | x] ! ++- ++- c_qvector = vmlaq_f32(c_qvector, x_qvector, cpa_qvector); ++- ++- } ++- // there should be better vector reduction techniques ++- vst1q_f32(res_accumulators, c_qvector ); ++- accumulator = res_accumulators[0] + res_accumulators[1] + ++- res_accumulators[2] + res_accumulators[3]; ++- ++- *target = accumulator + (float)num_points * center_point_array[4]; +++ unsigned int i; +++ float zero[4] = { 0.0f, 0.0f, 0.0f, 0.0f }; +++ +++ float32x2_t x_to_1, x_to_2, x_to_3, x_to_4; +++ float32x2_t cutoff_vector; +++ float32x2x2_t x_low, x_high; +++ float32x4_t x_qvector, c_qvector, cpa_qvector; +++ float accumulator; +++ float res_accumulators[4]; +++ +++ c_qvector = vld1q_f32(zero); +++ // load the cutoff in to a vector +++ cutoff_vector = vdup_n_f32(*cutoff); +++ // ... center point array +++ cpa_qvector = vld1q_f32(center_point_array); +++ +++ for (i = 0; i < num_points; ++i) { +++ // load x (src0) +++ x_to_1 = vdup_n_f32(*src0++); +++ +++ // Get a vector of max(src0, cutoff) +++ x_to_1 = vmax_f32(x_to_1, cutoff_vector); // x^1 +++ x_to_2 = vmul_f32(x_to_1, x_to_1); // x^2 +++ x_to_3 = vmul_f32(x_to_2, x_to_1); // x^3 +++ x_to_4 = vmul_f32(x_to_3, x_to_1); // x^4 +++ // zip up doubles to interleave +++ x_low = vzip_f32(x_to_1, x_to_2); // [x^2 | x^1 || x^2 | x^1] +++ x_high = vzip_f32(x_to_3, x_to_4); // [x^4 | x^3 || x^4 | x^3] +++ // float32x4_t vcombine_f32(float32x2_t low, float32x2_t high); // VMOV d0,d0 +++ x_qvector = vcombine_f32(x_low.val[0], x_high.val[0]); +++ // now we finally have [x^4 | x^3 | x^2 | x] ! 
+++ +++ c_qvector = vmlaq_f32(c_qvector, x_qvector, cpa_qvector); +++ } +++ // there should be better vector reduction techniques +++ vst1q_f32(res_accumulators, c_qvector); +++ accumulator = res_accumulators[0] + res_accumulators[1] + res_accumulators[2] + +++ res_accumulators[3]; +++ +++ *target = accumulator + (float)num_points * center_point_array[4]; ++ } ++ ++ #endif /* LV_HAVE_NEON */ ++@@ -425,82 +431,82 @@ volk_32f_x3_sum_of_poly_32f_a_neon(float* __restrict target, float* __restrict s ++ #ifdef LV_HAVE_NEON ++ ++ static inline void ++-volk_32f_x3_sum_of_poly_32f_neonvert(float* __restrict target, float* __restrict src0, +++volk_32f_x3_sum_of_poly_32f_neonvert(float* __restrict target, +++ float* __restrict src0, ++ float* __restrict center_point_array, ++- float* __restrict cutoff, unsigned int num_points) +++ float* __restrict cutoff, +++ unsigned int num_points) ++ { ++- unsigned int i; ++- float zero[4] = {0.0f, 0.0f, 0.0f, 0.0f }; ++- ++- float accumulator; ++- ++- float32x4_t accumulator1_vec, accumulator2_vec, accumulator3_vec, accumulator4_vec; ++- accumulator1_vec = vld1q_f32(zero); ++- accumulator2_vec = vld1q_f32(zero); ++- accumulator3_vec = vld1q_f32(zero); ++- accumulator4_vec = vld1q_f32(zero); ++- float32x4_t x_to_1, x_to_2, x_to_3, x_to_4; ++- float32x4_t cutoff_vector, cpa_0, cpa_1, cpa_2, cpa_3; ++- ++- // load the cutoff in to a vector ++- cutoff_vector = vdupq_n_f32( *cutoff ); ++- // ... center point array ++- cpa_0 = vdupq_n_f32(center_point_array[0]); ++- cpa_1 = vdupq_n_f32(center_point_array[1]); ++- cpa_2 = vdupq_n_f32(center_point_array[2]); ++- cpa_3 = vdupq_n_f32(center_point_array[3]); ++- ++- // nathan is not sure why this is slower *and* wrong compared to neonvertfma ++- for(i=0; i < num_points/4; ++i) { ++- // load x ++- x_to_1 = vld1q_f32( src0 ); ++- ++- // Get a vector of max(src0, cutoff) ++- x_to_1 = vmaxq_f32(x_to_1, cutoff_vector ); // x^1 ++- x_to_2 = vmulq_f32(x_to_1, x_to_1); // x^2 ++- x_to_3 = vmulq_f32(x_to_2, x_to_1); // x^3 ++- x_to_4 = vmulq_f32(x_to_3, x_to_1); // x^4 ++- x_to_1 = vmulq_f32(x_to_1, cpa_0); ++- x_to_2 = vmulq_f32(x_to_2, cpa_1); ++- x_to_3 = vmulq_f32(x_to_3, cpa_2); ++- x_to_4 = vmulq_f32(x_to_4, cpa_3); ++- accumulator1_vec = vaddq_f32(accumulator1_vec, x_to_1); ++- accumulator2_vec = vaddq_f32(accumulator2_vec, x_to_2); ++- accumulator3_vec = vaddq_f32(accumulator3_vec, x_to_3); ++- accumulator4_vec = vaddq_f32(accumulator4_vec, x_to_4); ++- ++- src0 += 4; ++- } ++- accumulator1_vec = vaddq_f32(accumulator1_vec, accumulator2_vec); ++- accumulator3_vec = vaddq_f32(accumulator3_vec, accumulator4_vec); ++- accumulator1_vec = vaddq_f32(accumulator1_vec, accumulator3_vec); ++- ++- __VOLK_ATTR_ALIGNED(32) float res_accumulators[4]; ++- vst1q_f32(res_accumulators, accumulator1_vec ); ++- accumulator = res_accumulators[0] + res_accumulators[1] + ++- res_accumulators[2] + res_accumulators[3]; ++- ++- float fst = 0.0; ++- float sq = 0.0; ++- float thrd = 0.0; ++- float frth = 0.0; ++- ++- for(i = 4*num_points/4; i < num_points; ++i) { ++- fst = src0[i]; ++- fst = MAX(fst, *cutoff); ++- ++- sq = fst * fst; ++- thrd = fst * sq; ++- frth = sq * sq; ++- //fith = sq * thrd; ++- ++- accumulator += (center_point_array[0] * fst + ++- center_point_array[1] * sq + ++- center_point_array[2] * thrd + ++- center_point_array[3] * frth); //+ ++- } ++- ++- *target = accumulator + (float)num_points * center_point_array[4]; +++ unsigned int i; +++ float zero[4] = { 0.0f, 0.0f, 0.0f, 0.0f }; +++ +++ float accumulator; +++ +++ float32x4_t 
accumulator1_vec, accumulator2_vec, accumulator3_vec, accumulator4_vec; +++ accumulator1_vec = vld1q_f32(zero); +++ accumulator2_vec = vld1q_f32(zero); +++ accumulator3_vec = vld1q_f32(zero); +++ accumulator4_vec = vld1q_f32(zero); +++ float32x4_t x_to_1, x_to_2, x_to_3, x_to_4; +++ float32x4_t cutoff_vector, cpa_0, cpa_1, cpa_2, cpa_3; +++ +++ // load the cutoff in to a vector +++ cutoff_vector = vdupq_n_f32(*cutoff); +++ // ... center point array +++ cpa_0 = vdupq_n_f32(center_point_array[0]); +++ cpa_1 = vdupq_n_f32(center_point_array[1]); +++ cpa_2 = vdupq_n_f32(center_point_array[2]); +++ cpa_3 = vdupq_n_f32(center_point_array[3]); +++ +++ // nathan is not sure why this is slower *and* wrong compared to neonvertfma +++ for (i = 0; i < num_points / 4; ++i) { +++ // load x +++ x_to_1 = vld1q_f32(src0); +++ +++ // Get a vector of max(src0, cutoff) +++ x_to_1 = vmaxq_f32(x_to_1, cutoff_vector); // x^1 +++ x_to_2 = vmulq_f32(x_to_1, x_to_1); // x^2 +++ x_to_3 = vmulq_f32(x_to_2, x_to_1); // x^3 +++ x_to_4 = vmulq_f32(x_to_3, x_to_1); // x^4 +++ x_to_1 = vmulq_f32(x_to_1, cpa_0); +++ x_to_2 = vmulq_f32(x_to_2, cpa_1); +++ x_to_3 = vmulq_f32(x_to_3, cpa_2); +++ x_to_4 = vmulq_f32(x_to_4, cpa_3); +++ accumulator1_vec = vaddq_f32(accumulator1_vec, x_to_1); +++ accumulator2_vec = vaddq_f32(accumulator2_vec, x_to_2); +++ accumulator3_vec = vaddq_f32(accumulator3_vec, x_to_3); +++ accumulator4_vec = vaddq_f32(accumulator4_vec, x_to_4); +++ +++ src0 += 4; +++ } +++ accumulator1_vec = vaddq_f32(accumulator1_vec, accumulator2_vec); +++ accumulator3_vec = vaddq_f32(accumulator3_vec, accumulator4_vec); +++ accumulator1_vec = vaddq_f32(accumulator1_vec, accumulator3_vec); +++ +++ __VOLK_ATTR_ALIGNED(32) float res_accumulators[4]; +++ vst1q_f32(res_accumulators, accumulator1_vec); +++ accumulator = res_accumulators[0] + res_accumulators[1] + res_accumulators[2] + +++ res_accumulators[3]; +++ +++ float fst = 0.0; +++ float sq = 0.0; +++ float thrd = 0.0; +++ float frth = 0.0; +++ +++ for (i = 4 * num_points / 4; i < num_points; ++i) { +++ fst = src0[i]; +++ fst = MAX(fst, *cutoff); +++ +++ sq = fst * fst; +++ thrd = fst * sq; +++ frth = sq * sq; +++ // fith = sq * thrd; +++ +++ accumulator += (center_point_array[0] * fst + center_point_array[1] * sq + +++ center_point_array[2] * thrd + center_point_array[3] * frth); //+ +++ } +++ +++ *target = accumulator + (float)num_points * center_point_array[4]; ++ } ++ ++ #endif /* LV_HAVE_NEON */ ++@@ -510,150 +516,154 @@ volk_32f_x3_sum_of_poly_32f_neonvert(float* __restrict target, float* __restrict ++ #ifndef INCLUDED_volk_32f_x3_sum_of_poly_32f_u_H ++ #define INCLUDED_volk_32f_x3_sum_of_poly_32f_u_H ++ ++-#include ++-#include ++-#include +++#include +++#include +++#include ++ ++ #ifndef MAX ++-#define MAX(X,Y) ((X) > (Y)?(X):(Y)) +++#define MAX(X, Y) ((X) > (Y) ? 
(X) : (Y)) ++ #endif ++ ++ #if LV_HAVE_AVX && LV_HAVE_FMA ++-#include +++#include ++ ++-static inline void ++-volk_32f_x3_sum_of_poly_32f_u_avx_fma(float* target, float* src0, float* center_point_array, ++- float* cutoff, unsigned int num_points) +++static inline void volk_32f_x3_sum_of_poly_32f_u_avx_fma(float* target, +++ float* src0, +++ float* center_point_array, +++ float* cutoff, +++ unsigned int num_points) ++ { ++- const unsigned int eighth_points = num_points / 8; ++- float fst = 0.0; ++- float sq = 0.0; ++- float thrd = 0.0; ++- float frth = 0.0; ++- ++- __m256 cpa0, cpa1, cpa2, cpa3, cutoff_vec; ++- __m256 target_vec; ++- __m256 x_to_1, x_to_2, x_to_3, x_to_4; ++- ++- cpa0 = _mm256_set1_ps(center_point_array[0]); ++- cpa1 = _mm256_set1_ps(center_point_array[1]); ++- cpa2 = _mm256_set1_ps(center_point_array[2]); ++- cpa3 = _mm256_set1_ps(center_point_array[3]); ++- cutoff_vec = _mm256_set1_ps(*cutoff); ++- target_vec = _mm256_setzero_ps(); ++- ++- unsigned int i; ++- ++- for(i = 0; i < eighth_points; ++i) { ++- x_to_1 = _mm256_loadu_ps(src0); ++- x_to_1 = _mm256_max_ps(x_to_1, cutoff_vec); ++- x_to_2 = _mm256_mul_ps(x_to_1, x_to_1); // x^2 ++- x_to_3 = _mm256_mul_ps(x_to_1, x_to_2); // x^3 ++- // x^1 * x^3 is slightly faster than x^2 * x^2 ++- x_to_4 = _mm256_mul_ps(x_to_1, x_to_3); // x^4 ++- ++- x_to_2 = _mm256_mul_ps(x_to_2, cpa1); // cpa[1] * x^2 ++- x_to_4 = _mm256_mul_ps(x_to_4, cpa3); // cpa[3] * x^4 ++- ++- x_to_1 = _mm256_fmadd_ps(x_to_1, cpa0, x_to_2); ++- x_to_3 = _mm256_fmadd_ps(x_to_3, cpa2, x_to_4); ++- // this is slightly faster than result += (x_to_1 + x_to_3) ++- target_vec = _mm256_add_ps(x_to_1, target_vec); ++- target_vec = _mm256_add_ps(x_to_3, target_vec); ++- ++- src0 += 8; ++- } ++- ++- // the hadd for vector reduction has very very slight impact @ 50k iters ++- __VOLK_ATTR_ALIGNED(32) float temp_results[8]; ++- target_vec = _mm256_hadd_ps(target_vec, target_vec); // x0+x1 | x2+x3 | x0+x1 | x2+x3 || x4+x5 | x6+x7 | x4+x5 | x6+x7 ++- _mm256_storeu_ps(temp_results, target_vec); ++- *target = temp_results[0] + temp_results[1] + temp_results[4] + temp_results[5]; ++- ++- for(i = eighth_points*8; i < num_points; ++i) { ++- fst = *src0++; ++- fst = MAX(fst, *cutoff); ++- sq = fst * fst; ++- thrd = fst * sq; ++- frth = sq * sq; ++- *target += (center_point_array[0] * fst + ++- center_point_array[1] * sq + ++- center_point_array[2] * thrd + ++- center_point_array[3] * frth); ++- } ++- ++- *target += (float)(num_points) * center_point_array[4]; +++ const unsigned int eighth_points = num_points / 8; +++ float fst = 0.0; +++ float sq = 0.0; +++ float thrd = 0.0; +++ float frth = 0.0; +++ +++ __m256 cpa0, cpa1, cpa2, cpa3, cutoff_vec; +++ __m256 target_vec; +++ __m256 x_to_1, x_to_2, x_to_3, x_to_4; +++ +++ cpa0 = _mm256_set1_ps(center_point_array[0]); +++ cpa1 = _mm256_set1_ps(center_point_array[1]); +++ cpa2 = _mm256_set1_ps(center_point_array[2]); +++ cpa3 = _mm256_set1_ps(center_point_array[3]); +++ cutoff_vec = _mm256_set1_ps(*cutoff); +++ target_vec = _mm256_setzero_ps(); +++ +++ unsigned int i; +++ +++ for (i = 0; i < eighth_points; ++i) { +++ x_to_1 = _mm256_loadu_ps(src0); +++ x_to_1 = _mm256_max_ps(x_to_1, cutoff_vec); +++ x_to_2 = _mm256_mul_ps(x_to_1, x_to_1); // x^2 +++ x_to_3 = _mm256_mul_ps(x_to_1, x_to_2); // x^3 +++ // x^1 * x^3 is slightly faster than x^2 * x^2 +++ x_to_4 = _mm256_mul_ps(x_to_1, x_to_3); // x^4 +++ +++ x_to_2 = _mm256_mul_ps(x_to_2, cpa1); // cpa[1] * x^2 +++ x_to_4 = _mm256_mul_ps(x_to_4, cpa3); // cpa[3] * x^4 +++ +++ x_to_1 = 
_mm256_fmadd_ps(x_to_1, cpa0, x_to_2); +++ x_to_3 = _mm256_fmadd_ps(x_to_3, cpa2, x_to_4); +++ // this is slightly faster than result += (x_to_1 + x_to_3) +++ target_vec = _mm256_add_ps(x_to_1, target_vec); +++ target_vec = _mm256_add_ps(x_to_3, target_vec); +++ +++ src0 += 8; +++ } +++ +++ // the hadd for vector reduction has very very slight impact @ 50k iters +++ __VOLK_ATTR_ALIGNED(32) float temp_results[8]; +++ target_vec = _mm256_hadd_ps( +++ target_vec, +++ target_vec); // x0+x1 | x2+x3 | x0+x1 | x2+x3 || x4+x5 | x6+x7 | x4+x5 | x6+x7 +++ _mm256_storeu_ps(temp_results, target_vec); +++ *target = temp_results[0] + temp_results[1] + temp_results[4] + temp_results[5]; +++ +++ for (i = eighth_points * 8; i < num_points; ++i) { +++ fst = *src0++; +++ fst = MAX(fst, *cutoff); +++ sq = fst * fst; +++ thrd = fst * sq; +++ frth = sq * sq; +++ *target += (center_point_array[0] * fst + center_point_array[1] * sq + +++ center_point_array[2] * thrd + center_point_array[3] * frth); +++ } +++ +++ *target += (float)(num_points)*center_point_array[4]; ++ } ++ #endif // LV_HAVE_AVX && LV_HAVE_FMA ++ ++ #ifdef LV_HAVE_AVX ++-#include +++#include ++ ++-static inline void ++-volk_32f_x3_sum_of_poly_32f_u_avx(float* target, float* src0, float* center_point_array, ++- float* cutoff, unsigned int num_points) +++static inline void volk_32f_x3_sum_of_poly_32f_u_avx(float* target, +++ float* src0, +++ float* center_point_array, +++ float* cutoff, +++ unsigned int num_points) ++ { ++- const unsigned int eighth_points = num_points / 8; ++- float fst = 0.0; ++- float sq = 0.0; ++- float thrd = 0.0; ++- float frth = 0.0; ++- ++- __m256 cpa0, cpa1, cpa2, cpa3, cutoff_vec; ++- __m256 target_vec; ++- __m256 x_to_1, x_to_2, x_to_3, x_to_4; ++- ++- cpa0 = _mm256_set1_ps(center_point_array[0]); ++- cpa1 = _mm256_set1_ps(center_point_array[1]); ++- cpa2 = _mm256_set1_ps(center_point_array[2]); ++- cpa3 = _mm256_set1_ps(center_point_array[3]); ++- cutoff_vec = _mm256_set1_ps(*cutoff); ++- target_vec = _mm256_setzero_ps(); ++- ++- unsigned int i; ++- ++- for(i = 0; i < eighth_points; ++i) { ++- x_to_1 = _mm256_loadu_ps(src0); ++- x_to_1 = _mm256_max_ps(x_to_1, cutoff_vec); ++- x_to_2 = _mm256_mul_ps(x_to_1, x_to_1); // x^2 ++- x_to_3 = _mm256_mul_ps(x_to_1, x_to_2); // x^3 ++- // x^1 * x^3 is slightly faster than x^2 * x^2 ++- x_to_4 = _mm256_mul_ps(x_to_1, x_to_3); // x^4 ++- ++- x_to_1 = _mm256_mul_ps(x_to_1, cpa0); // cpa[0] * x^1 ++- x_to_2 = _mm256_mul_ps(x_to_2, cpa1); // cpa[1] * x^2 ++- x_to_3 = _mm256_mul_ps(x_to_3, cpa2); // cpa[2] * x^3 ++- x_to_4 = _mm256_mul_ps(x_to_4, cpa3); // cpa[3] * x^4 ++- ++- x_to_1 = _mm256_add_ps(x_to_1, x_to_2); ++- x_to_3 = _mm256_add_ps(x_to_3, x_to_4); ++- // this is slightly faster than result += (x_to_1 + x_to_3) ++- target_vec = _mm256_add_ps(x_to_1, target_vec); ++- target_vec = _mm256_add_ps(x_to_3, target_vec); ++- ++- src0 += 8; ++- } ++- ++- // the hadd for vector reduction has very very slight impact @ 50k iters ++- __VOLK_ATTR_ALIGNED(32) float temp_results[8]; ++- target_vec = _mm256_hadd_ps(target_vec, target_vec); // x0+x1 | x2+x3 | x0+x1 | x2+x3 || x4+x5 | x6+x7 | x4+x5 | x6+x7 ++- _mm256_storeu_ps(temp_results, target_vec); ++- *target = temp_results[0] + temp_results[1] + temp_results[4] + temp_results[5]; ++- ++- for(i = eighth_points*8; i < num_points; ++i) { ++- fst = *src0++; ++- fst = MAX(fst, *cutoff); ++- sq = fst * fst; ++- thrd = fst * sq; ++- frth = sq * sq; ++- ++- *target += (center_point_array[0] * fst + ++- center_point_array[1] * sq + ++- 
center_point_array[2] * thrd + ++- center_point_array[3] * frth); ++- } ++- ++- *target += (float)(num_points) * center_point_array[4]; +++ const unsigned int eighth_points = num_points / 8; +++ float fst = 0.0; +++ float sq = 0.0; +++ float thrd = 0.0; +++ float frth = 0.0; +++ +++ __m256 cpa0, cpa1, cpa2, cpa3, cutoff_vec; +++ __m256 target_vec; +++ __m256 x_to_1, x_to_2, x_to_3, x_to_4; +++ +++ cpa0 = _mm256_set1_ps(center_point_array[0]); +++ cpa1 = _mm256_set1_ps(center_point_array[1]); +++ cpa2 = _mm256_set1_ps(center_point_array[2]); +++ cpa3 = _mm256_set1_ps(center_point_array[3]); +++ cutoff_vec = _mm256_set1_ps(*cutoff); +++ target_vec = _mm256_setzero_ps(); +++ +++ unsigned int i; +++ +++ for (i = 0; i < eighth_points; ++i) { +++ x_to_1 = _mm256_loadu_ps(src0); +++ x_to_1 = _mm256_max_ps(x_to_1, cutoff_vec); +++ x_to_2 = _mm256_mul_ps(x_to_1, x_to_1); // x^2 +++ x_to_3 = _mm256_mul_ps(x_to_1, x_to_2); // x^3 +++ // x^1 * x^3 is slightly faster than x^2 * x^2 +++ x_to_4 = _mm256_mul_ps(x_to_1, x_to_3); // x^4 +++ +++ x_to_1 = _mm256_mul_ps(x_to_1, cpa0); // cpa[0] * x^1 +++ x_to_2 = _mm256_mul_ps(x_to_2, cpa1); // cpa[1] * x^2 +++ x_to_3 = _mm256_mul_ps(x_to_3, cpa2); // cpa[2] * x^3 +++ x_to_4 = _mm256_mul_ps(x_to_4, cpa3); // cpa[3] * x^4 +++ +++ x_to_1 = _mm256_add_ps(x_to_1, x_to_2); +++ x_to_3 = _mm256_add_ps(x_to_3, x_to_4); +++ // this is slightly faster than result += (x_to_1 + x_to_3) +++ target_vec = _mm256_add_ps(x_to_1, target_vec); +++ target_vec = _mm256_add_ps(x_to_3, target_vec); +++ +++ src0 += 8; +++ } +++ +++ // the hadd for vector reduction has very very slight impact @ 50k iters +++ __VOLK_ATTR_ALIGNED(32) float temp_results[8]; +++ target_vec = _mm256_hadd_ps( +++ target_vec, +++ target_vec); // x0+x1 | x2+x3 | x0+x1 | x2+x3 || x4+x5 | x6+x7 | x4+x5 | x6+x7 +++ _mm256_storeu_ps(temp_results, target_vec); +++ *target = temp_results[0] + temp_results[1] + temp_results[4] + temp_results[5]; +++ +++ for (i = eighth_points * 8; i < num_points; ++i) { +++ fst = *src0++; +++ fst = MAX(fst, *cutoff); +++ sq = fst * fst; +++ thrd = fst * sq; +++ frth = sq * sq; +++ +++ *target += (center_point_array[0] * fst + center_point_array[1] * sq + +++ center_point_array[2] * thrd + center_point_array[3] * frth); +++ } +++ +++ *target += (float)(num_points)*center_point_array[4]; ++ } ++ #endif // LV_HAVE_AVX ++ ++diff --git a/kernels/volk/volk_32fc_32f_add_32fc.h b/kernels/volk/volk_32fc_32f_add_32fc.h ++index 86a3818..b25ca6a 100644 ++--- a/kernels/volk/volk_32fc_32f_add_32fc.h +++++ b/kernels/volk/volk_32fc_32f_add_32fc.h ++@@ -31,8 +31,8 @@ ++ * ++ * Dispatcher Prototype ++ * \code ++- * void volk_32fc_32f_add_32fc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const float* bVector, unsigned int num_points) ++- * \endcode +++ * void volk_32fc_32f_add_32fc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const float* +++ * bVector, unsigned int num_points) \endcode ++ * ++ * \b Inputs ++ * \li aVector: First vector of input points. 
++@@ -44,7 +44,8 @@ ++ * ++ * \b Example ++ * ++- * The follow example adds the increasing and decreasing vectors such that the result of every summation pair is 10 +++ * The follow example adds the increasing and decreasing vectors such that the result of +++ * every summation pair is 10 ++ * ++ * \code ++ * int N = 10; ++@@ -75,18 +76,19 @@ ++ ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void ++-volk_32fc_32f_add_32fc_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, ++- const float* bVector, unsigned int num_points) +++static inline void volk_32fc_32f_add_32fc_generic(lv_32fc_t* cVector, +++ const lv_32fc_t* aVector, +++ const float* bVector, +++ unsigned int num_points) ++ { ++- lv_32fc_t* cPtr = cVector; ++- const lv_32fc_t* aPtr = aVector; ++- const float* bPtr= bVector; ++- unsigned int number = 0; ++- ++- for(number = 0; number < num_points; number++){ ++- *cPtr++ = (*aPtr++) + (*bPtr++); ++- } +++ lv_32fc_t* cPtr = cVector; +++ const lv_32fc_t* aPtr = aVector; +++ const float* bPtr = bVector; +++ unsigned int number = 0; +++ +++ for (number = 0; number < num_points; number++) { +++ *cPtr++ = (*aPtr++) + (*bPtr++); +++ } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++@@ -94,143 +96,150 @@ volk_32fc_32f_add_32fc_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, ++ #ifdef LV_HAVE_AVX ++ #include ++ ++-static inline void ++-volk_32fc_32f_add_32fc_u_avx(lv_32fc_t* cVector, const lv_32fc_t* aVector, ++- const float* bVector, unsigned int num_points) +++static inline void volk_32fc_32f_add_32fc_u_avx(lv_32fc_t* cVector, +++ const lv_32fc_t* aVector, +++ const float* bVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int eighthPoints = num_points / 8; ++- ++- lv_32fc_t* cPtr = cVector; ++- const lv_32fc_t* aPtr = aVector; ++- const float* bPtr= bVector; ++- ++- __m256 aVal1, aVal2, bVal, cVal1, cVal2; ++- __m256 cpx_b1, cpx_b2; ++- __m256 zero; ++- zero = _mm256_setzero_ps(); ++- __m256 tmp1, tmp2; ++- for(;number < eighthPoints; number++){ ++- ++- aVal1 = _mm256_loadu_ps((float *) aPtr); ++- aVal2 = _mm256_loadu_ps((float *) (aPtr+4)); ++- bVal = _mm256_loadu_ps(bPtr); ++- cpx_b1 = _mm256_unpacklo_ps(bVal, zero); // b0, 0, b1, 0, b4, 0, b5, 0 ++- cpx_b2 = _mm256_unpackhi_ps(bVal, zero); // b2, 0, b3, 0, b6, 0, b7, 0 ++- ++- tmp1 = _mm256_permute2f128_ps(cpx_b1, cpx_b2, 0x0+(0x2<<4)); ++- tmp2 = _mm256_permute2f128_ps(cpx_b1, cpx_b2, 0x1+(0x3<<4)); ++- ++- cVal1 = _mm256_add_ps(aVal1, tmp1); ++- cVal2 = _mm256_add_ps(aVal2, tmp2); ++- ++- _mm256_storeu_ps((float *) cPtr, cVal1); // Store the results back into the C container ++- _mm256_storeu_ps((float *) (cPtr+4), cVal2); // Store the results back into the C container ++- ++- aPtr += 8; ++- bPtr += 8; ++- cPtr += 8; ++- } ++- ++- number = eighthPoints * 8; ++- for(;number < num_points; number++){ ++- *cPtr++ = (*aPtr++) + (*bPtr++); ++- } +++ unsigned int number = 0; +++ const unsigned int eighthPoints = num_points / 8; +++ +++ lv_32fc_t* cPtr = cVector; +++ const lv_32fc_t* aPtr = aVector; +++ const float* bPtr = bVector; +++ +++ __m256 aVal1, aVal2, bVal, cVal1, cVal2; +++ __m256 cpx_b1, cpx_b2; +++ __m256 zero; +++ zero = _mm256_setzero_ps(); +++ __m256 tmp1, tmp2; +++ for (; number < eighthPoints; number++) { +++ +++ aVal1 = _mm256_loadu_ps((float*)aPtr); +++ aVal2 = _mm256_loadu_ps((float*)(aPtr + 4)); +++ bVal = _mm256_loadu_ps(bPtr); +++ cpx_b1 = _mm256_unpacklo_ps(bVal, zero); // b0, 0, b1, 0, b4, 0, b5, 0 +++ cpx_b2 = _mm256_unpackhi_ps(bVal, zero); // b2, 0, b3, 0, b6, 0, b7, 0 +++ 
+++ tmp1 = _mm256_permute2f128_ps(cpx_b1, cpx_b2, 0x0 + (0x2 << 4)); +++ tmp2 = _mm256_permute2f128_ps(cpx_b1, cpx_b2, 0x1 + (0x3 << 4)); +++ +++ cVal1 = _mm256_add_ps(aVal1, tmp1); +++ cVal2 = _mm256_add_ps(aVal2, tmp2); +++ +++ _mm256_storeu_ps((float*)cPtr, +++ cVal1); // Store the results back into the C container +++ _mm256_storeu_ps((float*)(cPtr + 4), +++ cVal2); // Store the results back into the C container +++ +++ aPtr += 8; +++ bPtr += 8; +++ cPtr += 8; +++ } +++ +++ number = eighthPoints * 8; +++ for (; number < num_points; number++) { +++ *cPtr++ = (*aPtr++) + (*bPtr++); +++ } ++ } ++ #endif /* LV_HAVE_AVX */ ++ ++ #ifdef LV_HAVE_AVX ++ #include ++ ++-static inline void ++-volk_32fc_32f_add_32fc_a_avx(lv_32fc_t* cVector, const lv_32fc_t* aVector, ++- const float* bVector, unsigned int num_points) +++static inline void volk_32fc_32f_add_32fc_a_avx(lv_32fc_t* cVector, +++ const lv_32fc_t* aVector, +++ const float* bVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int eighthPoints = num_points / 8; ++- ++- lv_32fc_t* cPtr = cVector; ++- const lv_32fc_t* aPtr = aVector; ++- const float* bPtr= bVector; ++- ++- __m256 aVal1, aVal2, bVal, cVal1, cVal2; ++- __m256 cpx_b1, cpx_b2; ++- __m256 zero; ++- zero = _mm256_setzero_ps(); ++- __m256 tmp1, tmp2; ++- for(;number < eighthPoints; number++){ ++- ++- aVal1 = _mm256_load_ps((float *) aPtr); ++- aVal2 = _mm256_load_ps((float *) (aPtr+4)); ++- bVal = _mm256_load_ps(bPtr); ++- cpx_b1 = _mm256_unpacklo_ps(bVal, zero); // b0, 0, b1, 0, b4, 0, b5, 0 ++- cpx_b2 = _mm256_unpackhi_ps(bVal, zero); // b2, 0, b3, 0, b6, 0, b7, 0 ++- ++- tmp1 = _mm256_permute2f128_ps(cpx_b1, cpx_b2, 0x0+(0x2<<4)); ++- tmp2 = _mm256_permute2f128_ps(cpx_b1, cpx_b2, 0x1+(0x3<<4)); ++- ++- cVal1 = _mm256_add_ps(aVal1, tmp1); ++- cVal2 = _mm256_add_ps(aVal2, tmp2); ++- ++- _mm256_store_ps((float *) cPtr, cVal1); // Store the results back into the C container ++- _mm256_store_ps((float *) (cPtr+4), cVal2); // Store the results back into the C container ++- ++- aPtr += 8; ++- bPtr += 8; ++- cPtr += 8; ++- } ++- ++- number = eighthPoints * 8; ++- for(;number < num_points; number++){ ++- *cPtr++ = (*aPtr++) + (*bPtr++); ++- } +++ unsigned int number = 0; +++ const unsigned int eighthPoints = num_points / 8; +++ +++ lv_32fc_t* cPtr = cVector; +++ const lv_32fc_t* aPtr = aVector; +++ const float* bPtr = bVector; +++ +++ __m256 aVal1, aVal2, bVal, cVal1, cVal2; +++ __m256 cpx_b1, cpx_b2; +++ __m256 zero; +++ zero = _mm256_setzero_ps(); +++ __m256 tmp1, tmp2; +++ for (; number < eighthPoints; number++) { +++ +++ aVal1 = _mm256_load_ps((float*)aPtr); +++ aVal2 = _mm256_load_ps((float*)(aPtr + 4)); +++ bVal = _mm256_load_ps(bPtr); +++ cpx_b1 = _mm256_unpacklo_ps(bVal, zero); // b0, 0, b1, 0, b4, 0, b5, 0 +++ cpx_b2 = _mm256_unpackhi_ps(bVal, zero); // b2, 0, b3, 0, b6, 0, b7, 0 +++ +++ tmp1 = _mm256_permute2f128_ps(cpx_b1, cpx_b2, 0x0 + (0x2 << 4)); +++ tmp2 = _mm256_permute2f128_ps(cpx_b1, cpx_b2, 0x1 + (0x3 << 4)); +++ +++ cVal1 = _mm256_add_ps(aVal1, tmp1); +++ cVal2 = _mm256_add_ps(aVal2, tmp2); +++ +++ _mm256_store_ps((float*)cPtr, +++ cVal1); // Store the results back into the C container +++ _mm256_store_ps((float*)(cPtr + 4), +++ cVal2); // Store the results back into the C container +++ +++ aPtr += 8; +++ bPtr += 8; +++ cPtr += 8; +++ } +++ +++ number = eighthPoints * 8; +++ for (; number < num_points; number++) { +++ *cPtr++ = (*aPtr++) + (*bPtr++); +++ } ++ } ++ #endif /* LV_HAVE_AVX */ ++ ++ #ifdef LV_HAVE_NEON ++ #include ++ 
++-static inline void ++-volk_32fc_32f_add_32fc_neon(lv_32fc_t* cVector, const lv_32fc_t* aVector, ++- const float* bVector, unsigned int num_points) +++static inline void volk_32fc_32f_add_32fc_neon(lv_32fc_t* cVector, +++ const lv_32fc_t* aVector, +++ const float* bVector, +++ unsigned int num_points) ++ { ++- lv_32fc_t* cPtr = cVector; ++- const lv_32fc_t* aPtr = aVector; ++- const float* bPtr = bVector; ++- ++- float32x4x4_t aVal0, aVal1; ++- float32x4x2_t bVal0, bVal1; ++- ++- const unsigned int sixteenthPoints = num_points / 16; ++- unsigned int number = 0; ++- for(; number < sixteenthPoints; number++){ ++- aVal0 = vld4q_f32((const float*)aPtr); ++- aPtr += 8; ++- aVal1 = vld4q_f32((const float*)aPtr); ++- aPtr += 8; ++- __VOLK_PREFETCH(aPtr+16); ++- ++- bVal0 = vld2q_f32((const float*)bPtr); ++- bPtr += 8; ++- bVal1 = vld2q_f32((const float*)bPtr); ++- bPtr += 8; ++- __VOLK_PREFETCH(bPtr+16); ++- ++- aVal0.val[0] = vaddq_f32(aVal0.val[0], bVal0.val[0]); ++- aVal0.val[2] = vaddq_f32(aVal0.val[2], bVal0.val[1]); ++- ++- aVal1.val[2] = vaddq_f32(aVal1.val[2], bVal1.val[1]); ++- aVal1.val[0] = vaddq_f32(aVal1.val[0], bVal1.val[0]); ++- ++- vst4q_f32((float*)(cPtr), aVal0); ++- cPtr += 8; ++- vst4q_f32((float*)(cPtr), aVal1); ++- cPtr += 8; ++- } ++- ++- for(number = sixteenthPoints * 16; number < num_points; number++){ ++- *cPtr++ = (*aPtr++) + (*bPtr++); ++- } +++ lv_32fc_t* cPtr = cVector; +++ const lv_32fc_t* aPtr = aVector; +++ const float* bPtr = bVector; +++ +++ float32x4x4_t aVal0, aVal1; +++ float32x4x2_t bVal0, bVal1; +++ +++ const unsigned int sixteenthPoints = num_points / 16; +++ unsigned int number = 0; +++ for (; number < sixteenthPoints; number++) { +++ aVal0 = vld4q_f32((const float*)aPtr); +++ aPtr += 8; +++ aVal1 = vld4q_f32((const float*)aPtr); +++ aPtr += 8; +++ __VOLK_PREFETCH(aPtr + 16); +++ +++ bVal0 = vld2q_f32((const float*)bPtr); +++ bPtr += 8; +++ bVal1 = vld2q_f32((const float*)bPtr); +++ bPtr += 8; +++ __VOLK_PREFETCH(bPtr + 16); +++ +++ aVal0.val[0] = vaddq_f32(aVal0.val[0], bVal0.val[0]); +++ aVal0.val[2] = vaddq_f32(aVal0.val[2], bVal0.val[1]); +++ +++ aVal1.val[2] = vaddq_f32(aVal1.val[2], bVal1.val[1]); +++ aVal1.val[0] = vaddq_f32(aVal1.val[0], bVal1.val[0]); +++ +++ vst4q_f32((float*)(cPtr), aVal0); +++ cPtr += 8; +++ vst4q_f32((float*)(cPtr), aVal1); +++ cPtr += 8; +++ } +++ +++ for (number = sixteenthPoints * 16; number < num_points; number++) { +++ *cPtr++ = (*aPtr++) + (*bPtr++); +++ } ++ } ++ #endif /* LV_HAVE_NEON */ ++ ++diff --git a/kernels/volk/volk_32fc_32f_dot_prod_32fc.h b/kernels/volk/volk_32fc_32f_dot_prod_32fc.h ++index 35f7077..d905870 100644 ++--- a/kernels/volk/volk_32fc_32f_dot_prod_32fc.h +++++ b/kernels/volk/volk_32fc_32f_dot_prod_32fc.h ++@@ -33,8 +33,8 @@ ++ * ++ * Dispatcher Prototype ++ * \code ++- * void volk_32fc_32f_dot_prod_32fc(lv_32fc_t* result, const lv_32fc_t* input, const float * taps, unsigned int num_points) ++- * \endcode +++ * void volk_32fc_32f_dot_prod_32fc(lv_32fc_t* result, const lv_32fc_t* input, const float +++ * * taps, unsigned int num_points) \endcode ++ * ++ * \b Inputs ++ * \li input: vector of complex samples ++@@ -63,28 +63,32 @@ ++ #ifndef INCLUDED_volk_32fc_32f_dot_prod_32fc_a_H ++ #define INCLUDED_volk_32fc_32f_dot_prod_32fc_a_H ++ ++-#include ++ #include +++#include ++ ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void volk_32fc_32f_dot_prod_32fc_generic(lv_32fc_t* result, const lv_32fc_t* input, const float * taps, unsigned int num_points) { +++static inline void 
volk_32fc_32f_dot_prod_32fc_generic(lv_32fc_t* result, +++ const lv_32fc_t* input, +++ const float* taps, +++ unsigned int num_points) +++{ ++ ++- float res[2]; ++- float *realpt = &res[0], *imagpt = &res[1]; ++- const float* aPtr = (float*)input; ++- const float* bPtr= taps; ++- unsigned int number = 0; +++ float res[2]; +++ float *realpt = &res[0], *imagpt = &res[1]; +++ const float* aPtr = (float*)input; +++ const float* bPtr = taps; +++ unsigned int number = 0; ++ ++- *realpt = 0; ++- *imagpt = 0; +++ *realpt = 0; +++ *imagpt = 0; ++ ++- for(number = 0; number < num_points; number++){ ++- *realpt += ((*aPtr++) * (*bPtr)); ++- *imagpt += ((*aPtr++) * (*bPtr++)); ++- } +++ for (number = 0; number < num_points; number++) { +++ *realpt += ((*aPtr++) * (*bPtr)); +++ *imagpt += ((*aPtr++) * (*bPtr++)); +++ } ++ ++- *result = *(lv_32fc_t*)(&res[0]); +++ *result = *(lv_32fc_t*)(&res[0]); ++ } ++ ++ #endif /*LV_HAVE_GENERIC*/ ++@@ -93,78 +97,83 @@ static inline void volk_32fc_32f_dot_prod_32fc_generic(lv_32fc_t* result, const ++ ++ #include ++ ++-static inline void volk_32fc_32f_dot_prod_32fc_a_avx2_fma( lv_32fc_t* result, const lv_32fc_t* input, const float* taps, unsigned int num_points) { ++- ++- unsigned int number = 0; ++- const unsigned int sixteenthPoints = num_points / 16; ++- ++- float res[2]; ++- float *realpt = &res[0], *imagpt = &res[1]; ++- const float* aPtr = (float*)input; ++- const float* bPtr = taps; ++- ++- __m256 a0Val, a1Val, a2Val, a3Val; ++- __m256 b0Val, b1Val, b2Val, b3Val; ++- __m256 x0Val, x1Val, x0loVal, x0hiVal, x1loVal, x1hiVal; ++- ++- __m256 dotProdVal0 = _mm256_setzero_ps(); ++- __m256 dotProdVal1 = _mm256_setzero_ps(); ++- __m256 dotProdVal2 = _mm256_setzero_ps(); ++- __m256 dotProdVal3 = _mm256_setzero_ps(); ++- ++- for(;number < sixteenthPoints; number++){ ++- ++- a0Val = _mm256_load_ps(aPtr); ++- a1Val = _mm256_load_ps(aPtr+8); ++- a2Val = _mm256_load_ps(aPtr+16); ++- a3Val = _mm256_load_ps(aPtr+24); ++- ++- x0Val = _mm256_load_ps(bPtr); // t0|t1|t2|t3|t4|t5|t6|t7 ++- x1Val = _mm256_load_ps(bPtr+8); ++- x0loVal = _mm256_unpacklo_ps(x0Val, x0Val); // t0|t0|t1|t1|t4|t4|t5|t5 ++- x0hiVal = _mm256_unpackhi_ps(x0Val, x0Val); // t2|t2|t3|t3|t6|t6|t7|t7 ++- x1loVal = _mm256_unpacklo_ps(x1Val, x1Val); ++- x1hiVal = _mm256_unpackhi_ps(x1Val, x1Val); ++- ++- // TODO: it may be possible to rearrange swizzling to better pipeline data ++- b0Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x20); // t0|t0|t1|t1|t2|t2|t3|t3 ++- b1Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x31); // t4|t4|t5|t5|t6|t6|t7|t7 ++- b2Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x20); ++- b3Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x31); ++- ++- dotProdVal0 = _mm256_fmadd_ps(a0Val, b0Val, dotProdVal0); ++- dotProdVal1 = _mm256_fmadd_ps(a1Val, b1Val, dotProdVal1); ++- dotProdVal2 = _mm256_fmadd_ps(a2Val, b2Val, dotProdVal2); ++- dotProdVal3 = _mm256_fmadd_ps(a3Val, b3Val, dotProdVal3); ++- ++- aPtr += 32; ++- bPtr += 16; ++- } ++- ++- dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1); ++- dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2); ++- dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3); ++- ++- __VOLK_ATTR_ALIGNED(32) float dotProductVector[8]; ++- ++- _mm256_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector ++- ++- *realpt = dotProductVector[0]; ++- *imagpt = dotProductVector[1]; ++- *realpt += dotProductVector[2]; ++- *imagpt += dotProductVector[3]; ++- *realpt += dotProductVector[4]; ++- *imagpt += dotProductVector[5]; ++- 
*realpt += dotProductVector[6]; ++- *imagpt += dotProductVector[7]; ++- ++- number = sixteenthPoints*16; ++- for(;number < num_points; number++){ ++- *realpt += ((*aPtr++) * (*bPtr)); ++- *imagpt += ((*aPtr++) * (*bPtr++)); ++- } ++- ++- *result = *(lv_32fc_t*)(&res[0]); +++static inline void volk_32fc_32f_dot_prod_32fc_a_avx2_fma(lv_32fc_t* result, +++ const lv_32fc_t* input, +++ const float* taps, +++ unsigned int num_points) +++{ +++ +++ unsigned int number = 0; +++ const unsigned int sixteenthPoints = num_points / 16; +++ +++ float res[2]; +++ float *realpt = &res[0], *imagpt = &res[1]; +++ const float* aPtr = (float*)input; +++ const float* bPtr = taps; +++ +++ __m256 a0Val, a1Val, a2Val, a3Val; +++ __m256 b0Val, b1Val, b2Val, b3Val; +++ __m256 x0Val, x1Val, x0loVal, x0hiVal, x1loVal, x1hiVal; +++ +++ __m256 dotProdVal0 = _mm256_setzero_ps(); +++ __m256 dotProdVal1 = _mm256_setzero_ps(); +++ __m256 dotProdVal2 = _mm256_setzero_ps(); +++ __m256 dotProdVal3 = _mm256_setzero_ps(); +++ +++ for (; number < sixteenthPoints; number++) { +++ +++ a0Val = _mm256_load_ps(aPtr); +++ a1Val = _mm256_load_ps(aPtr + 8); +++ a2Val = _mm256_load_ps(aPtr + 16); +++ a3Val = _mm256_load_ps(aPtr + 24); +++ +++ x0Val = _mm256_load_ps(bPtr); // t0|t1|t2|t3|t4|t5|t6|t7 +++ x1Val = _mm256_load_ps(bPtr + 8); +++ x0loVal = _mm256_unpacklo_ps(x0Val, x0Val); // t0|t0|t1|t1|t4|t4|t5|t5 +++ x0hiVal = _mm256_unpackhi_ps(x0Val, x0Val); // t2|t2|t3|t3|t6|t6|t7|t7 +++ x1loVal = _mm256_unpacklo_ps(x1Val, x1Val); +++ x1hiVal = _mm256_unpackhi_ps(x1Val, x1Val); +++ +++ // TODO: it may be possible to rearrange swizzling to better pipeline data +++ b0Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x20); // t0|t0|t1|t1|t2|t2|t3|t3 +++ b1Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x31); // t4|t4|t5|t5|t6|t6|t7|t7 +++ b2Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x20); +++ b3Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x31); +++ +++ dotProdVal0 = _mm256_fmadd_ps(a0Val, b0Val, dotProdVal0); +++ dotProdVal1 = _mm256_fmadd_ps(a1Val, b1Val, dotProdVal1); +++ dotProdVal2 = _mm256_fmadd_ps(a2Val, b2Val, dotProdVal2); +++ dotProdVal3 = _mm256_fmadd_ps(a3Val, b3Val, dotProdVal3); +++ +++ aPtr += 32; +++ bPtr += 16; +++ } +++ +++ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1); +++ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2); +++ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3); +++ +++ __VOLK_ATTR_ALIGNED(32) float dotProductVector[8]; +++ +++ _mm256_store_ps(dotProductVector, +++ dotProdVal0); // Store the results back into the dot product vector +++ +++ *realpt = dotProductVector[0]; +++ *imagpt = dotProductVector[1]; +++ *realpt += dotProductVector[2]; +++ *imagpt += dotProductVector[3]; +++ *realpt += dotProductVector[4]; +++ *imagpt += dotProductVector[5]; +++ *realpt += dotProductVector[6]; +++ *imagpt += dotProductVector[7]; +++ +++ number = sixteenthPoints * 16; +++ for (; number < num_points; number++) { +++ *realpt += ((*aPtr++) * (*bPtr)); +++ *imagpt += ((*aPtr++) * (*bPtr++)); +++ } +++ +++ *result = *(lv_32fc_t*)(&res[0]); ++ } ++ ++ #endif /*LV_HAVE_AVX2 && LV_HAVE_FMA*/ ++@@ -173,164 +182,172 @@ static inline void volk_32fc_32f_dot_prod_32fc_a_avx2_fma( lv_32fc_t* result, co ++ ++ #include ++ ++-static inline void volk_32fc_32f_dot_prod_32fc_a_avx( lv_32fc_t* result, const lv_32fc_t* input, const float* taps, unsigned int num_points) { ++- ++- unsigned int number = 0; ++- const unsigned int sixteenthPoints = num_points / 16; ++- ++- float res[2]; ++- float *realpt = &res[0], *imagpt = 
&res[1]; ++- const float* aPtr = (float*)input; ++- const float* bPtr = taps; ++- ++- __m256 a0Val, a1Val, a2Val, a3Val; ++- __m256 b0Val, b1Val, b2Val, b3Val; ++- __m256 x0Val, x1Val, x0loVal, x0hiVal, x1loVal, x1hiVal; ++- __m256 c0Val, c1Val, c2Val, c3Val; ++- ++- __m256 dotProdVal0 = _mm256_setzero_ps(); ++- __m256 dotProdVal1 = _mm256_setzero_ps(); ++- __m256 dotProdVal2 = _mm256_setzero_ps(); ++- __m256 dotProdVal3 = _mm256_setzero_ps(); ++- ++- for(;number < sixteenthPoints; number++){ ++- ++- a0Val = _mm256_load_ps(aPtr); ++- a1Val = _mm256_load_ps(aPtr+8); ++- a2Val = _mm256_load_ps(aPtr+16); ++- a3Val = _mm256_load_ps(aPtr+24); ++- ++- x0Val = _mm256_load_ps(bPtr); // t0|t1|t2|t3|t4|t5|t6|t7 ++- x1Val = _mm256_load_ps(bPtr+8); ++- x0loVal = _mm256_unpacklo_ps(x0Val, x0Val); // t0|t0|t1|t1|t4|t4|t5|t5 ++- x0hiVal = _mm256_unpackhi_ps(x0Val, x0Val); // t2|t2|t3|t3|t6|t6|t7|t7 ++- x1loVal = _mm256_unpacklo_ps(x1Val, x1Val); ++- x1hiVal = _mm256_unpackhi_ps(x1Val, x1Val); ++- ++- // TODO: it may be possible to rearrange swizzling to better pipeline data ++- b0Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x20); // t0|t0|t1|t1|t2|t2|t3|t3 ++- b1Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x31); // t4|t4|t5|t5|t6|t6|t7|t7 ++- b2Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x20); ++- b3Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x31); ++- ++- c0Val = _mm256_mul_ps(a0Val, b0Val); ++- c1Val = _mm256_mul_ps(a1Val, b1Val); ++- c2Val = _mm256_mul_ps(a2Val, b2Val); ++- c3Val = _mm256_mul_ps(a3Val, b3Val); ++- ++- dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0); ++- dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1); ++- dotProdVal2 = _mm256_add_ps(c2Val, dotProdVal2); ++- dotProdVal3 = _mm256_add_ps(c3Val, dotProdVal3); ++- ++- aPtr += 32; ++- bPtr += 16; ++- } ++- ++- dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1); ++- dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2); ++- dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3); ++- ++- __VOLK_ATTR_ALIGNED(32) float dotProductVector[8]; ++- ++- _mm256_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector ++- ++- *realpt = dotProductVector[0]; ++- *imagpt = dotProductVector[1]; ++- *realpt += dotProductVector[2]; ++- *imagpt += dotProductVector[3]; ++- *realpt += dotProductVector[4]; ++- *imagpt += dotProductVector[5]; ++- *realpt += dotProductVector[6]; ++- *imagpt += dotProductVector[7]; ++- ++- number = sixteenthPoints*16; ++- for(;number < num_points; number++){ ++- *realpt += ((*aPtr++) * (*bPtr)); ++- *imagpt += ((*aPtr++) * (*bPtr++)); ++- } ++- ++- *result = *(lv_32fc_t*)(&res[0]); +++static inline void volk_32fc_32f_dot_prod_32fc_a_avx(lv_32fc_t* result, +++ const lv_32fc_t* input, +++ const float* taps, +++ unsigned int num_points) +++{ +++ +++ unsigned int number = 0; +++ const unsigned int sixteenthPoints = num_points / 16; +++ +++ float res[2]; +++ float *realpt = &res[0], *imagpt = &res[1]; +++ const float* aPtr = (float*)input; +++ const float* bPtr = taps; +++ +++ __m256 a0Val, a1Val, a2Val, a3Val; +++ __m256 b0Val, b1Val, b2Val, b3Val; +++ __m256 x0Val, x1Val, x0loVal, x0hiVal, x1loVal, x1hiVal; +++ __m256 c0Val, c1Val, c2Val, c3Val; +++ +++ __m256 dotProdVal0 = _mm256_setzero_ps(); +++ __m256 dotProdVal1 = _mm256_setzero_ps(); +++ __m256 dotProdVal2 = _mm256_setzero_ps(); +++ __m256 dotProdVal3 = _mm256_setzero_ps(); +++ +++ for (; number < sixteenthPoints; number++) { +++ +++ a0Val = _mm256_load_ps(aPtr); +++ a1Val = _mm256_load_ps(aPtr + 8); +++ a2Val = 
_mm256_load_ps(aPtr + 16); +++ a3Val = _mm256_load_ps(aPtr + 24); +++ +++ x0Val = _mm256_load_ps(bPtr); // t0|t1|t2|t3|t4|t5|t6|t7 +++ x1Val = _mm256_load_ps(bPtr + 8); +++ x0loVal = _mm256_unpacklo_ps(x0Val, x0Val); // t0|t0|t1|t1|t4|t4|t5|t5 +++ x0hiVal = _mm256_unpackhi_ps(x0Val, x0Val); // t2|t2|t3|t3|t6|t6|t7|t7 +++ x1loVal = _mm256_unpacklo_ps(x1Val, x1Val); +++ x1hiVal = _mm256_unpackhi_ps(x1Val, x1Val); +++ +++ // TODO: it may be possible to rearrange swizzling to better pipeline data +++ b0Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x20); // t0|t0|t1|t1|t2|t2|t3|t3 +++ b1Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x31); // t4|t4|t5|t5|t6|t6|t7|t7 +++ b2Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x20); +++ b3Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x31); +++ +++ c0Val = _mm256_mul_ps(a0Val, b0Val); +++ c1Val = _mm256_mul_ps(a1Val, b1Val); +++ c2Val = _mm256_mul_ps(a2Val, b2Val); +++ c3Val = _mm256_mul_ps(a3Val, b3Val); +++ +++ dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0); +++ dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1); +++ dotProdVal2 = _mm256_add_ps(c2Val, dotProdVal2); +++ dotProdVal3 = _mm256_add_ps(c3Val, dotProdVal3); +++ +++ aPtr += 32; +++ bPtr += 16; +++ } +++ +++ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1); +++ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2); +++ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3); +++ +++ __VOLK_ATTR_ALIGNED(32) float dotProductVector[8]; +++ +++ _mm256_store_ps(dotProductVector, +++ dotProdVal0); // Store the results back into the dot product vector +++ +++ *realpt = dotProductVector[0]; +++ *imagpt = dotProductVector[1]; +++ *realpt += dotProductVector[2]; +++ *imagpt += dotProductVector[3]; +++ *realpt += dotProductVector[4]; +++ *imagpt += dotProductVector[5]; +++ *realpt += dotProductVector[6]; +++ *imagpt += dotProductVector[7]; +++ +++ number = sixteenthPoints * 16; +++ for (; number < num_points; number++) { +++ *realpt += ((*aPtr++) * (*bPtr)); +++ *imagpt += ((*aPtr++) * (*bPtr++)); +++ } +++ +++ *result = *(lv_32fc_t*)(&res[0]); ++ } ++ ++ #endif /*LV_HAVE_AVX*/ ++ ++ ++- ++- ++ #ifdef LV_HAVE_SSE ++ ++ ++-static inline void volk_32fc_32f_dot_prod_32fc_a_sse( lv_32fc_t* result, const lv_32fc_t* input, const float* taps, unsigned int num_points) { ++- ++- unsigned int number = 0; ++- const unsigned int sixteenthPoints = num_points / 8; ++- ++- float res[2]; ++- float *realpt = &res[0], *imagpt = &res[1]; ++- const float* aPtr = (float*)input; ++- const float* bPtr = taps; ++- ++- __m128 a0Val, a1Val, a2Val, a3Val; ++- __m128 b0Val, b1Val, b2Val, b3Val; ++- __m128 x0Val, x1Val, x2Val, x3Val; ++- __m128 c0Val, c1Val, c2Val, c3Val; ++- ++- __m128 dotProdVal0 = _mm_setzero_ps(); ++- __m128 dotProdVal1 = _mm_setzero_ps(); ++- __m128 dotProdVal2 = _mm_setzero_ps(); ++- __m128 dotProdVal3 = _mm_setzero_ps(); ++- ++- for(;number < sixteenthPoints; number++){ ++- ++- a0Val = _mm_load_ps(aPtr); ++- a1Val = _mm_load_ps(aPtr+4); ++- a2Val = _mm_load_ps(aPtr+8); ++- a3Val = _mm_load_ps(aPtr+12); ++- ++- x0Val = _mm_load_ps(bPtr); ++- x1Val = _mm_load_ps(bPtr); ++- x2Val = _mm_load_ps(bPtr+4); ++- x3Val = _mm_load_ps(bPtr+4); ++- b0Val = _mm_unpacklo_ps(x0Val, x1Val); ++- b1Val = _mm_unpackhi_ps(x0Val, x1Val); ++- b2Val = _mm_unpacklo_ps(x2Val, x3Val); ++- b3Val = _mm_unpackhi_ps(x2Val, x3Val); ++- ++- c0Val = _mm_mul_ps(a0Val, b0Val); ++- c1Val = _mm_mul_ps(a1Val, b1Val); ++- c2Val = _mm_mul_ps(a2Val, b2Val); ++- c3Val = _mm_mul_ps(a3Val, b3Val); ++- ++- dotProdVal0 = _mm_add_ps(c0Val, 
dotProdVal0); ++- dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1); ++- dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2); ++- dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3); ++- ++- aPtr += 16; ++- bPtr += 8; ++- } ++- ++- dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1); ++- dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2); ++- dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3); ++- ++- __VOLK_ATTR_ALIGNED(16) float dotProductVector[4]; ++- ++- _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector ++- ++- *realpt = dotProductVector[0]; ++- *imagpt = dotProductVector[1]; ++- *realpt += dotProductVector[2]; ++- *imagpt += dotProductVector[3]; ++- ++- number = sixteenthPoints*8; ++- for(;number < num_points; number++){ ++- *realpt += ((*aPtr++) * (*bPtr)); ++- *imagpt += ((*aPtr++) * (*bPtr++)); ++- } ++- ++- *result = *(lv_32fc_t*)(&res[0]); +++static inline void volk_32fc_32f_dot_prod_32fc_a_sse(lv_32fc_t* result, +++ const lv_32fc_t* input, +++ const float* taps, +++ unsigned int num_points) +++{ +++ +++ unsigned int number = 0; +++ const unsigned int sixteenthPoints = num_points / 8; +++ +++ float res[2]; +++ float *realpt = &res[0], *imagpt = &res[1]; +++ const float* aPtr = (float*)input; +++ const float* bPtr = taps; +++ +++ __m128 a0Val, a1Val, a2Val, a3Val; +++ __m128 b0Val, b1Val, b2Val, b3Val; +++ __m128 x0Val, x1Val, x2Val, x3Val; +++ __m128 c0Val, c1Val, c2Val, c3Val; +++ +++ __m128 dotProdVal0 = _mm_setzero_ps(); +++ __m128 dotProdVal1 = _mm_setzero_ps(); +++ __m128 dotProdVal2 = _mm_setzero_ps(); +++ __m128 dotProdVal3 = _mm_setzero_ps(); +++ +++ for (; number < sixteenthPoints; number++) { +++ +++ a0Val = _mm_load_ps(aPtr); +++ a1Val = _mm_load_ps(aPtr + 4); +++ a2Val = _mm_load_ps(aPtr + 8); +++ a3Val = _mm_load_ps(aPtr + 12); +++ +++ x0Val = _mm_load_ps(bPtr); +++ x1Val = _mm_load_ps(bPtr); +++ x2Val = _mm_load_ps(bPtr + 4); +++ x3Val = _mm_load_ps(bPtr + 4); +++ b0Val = _mm_unpacklo_ps(x0Val, x1Val); +++ b1Val = _mm_unpackhi_ps(x0Val, x1Val); +++ b2Val = _mm_unpacklo_ps(x2Val, x3Val); +++ b3Val = _mm_unpackhi_ps(x2Val, x3Val); +++ +++ c0Val = _mm_mul_ps(a0Val, b0Val); +++ c1Val = _mm_mul_ps(a1Val, b1Val); +++ c2Val = _mm_mul_ps(a2Val, b2Val); +++ c3Val = _mm_mul_ps(a3Val, b3Val); +++ +++ dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0); +++ dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1); +++ dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2); +++ dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3); +++ +++ aPtr += 16; +++ bPtr += 8; +++ } +++ +++ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1); +++ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2); +++ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3); +++ +++ __VOLK_ATTR_ALIGNED(16) float dotProductVector[4]; +++ +++ _mm_store_ps(dotProductVector, +++ dotProdVal0); // Store the results back into the dot product vector +++ +++ *realpt = dotProductVector[0]; +++ *imagpt = dotProductVector[1]; +++ *realpt += dotProductVector[2]; +++ *imagpt += dotProductVector[3]; +++ +++ number = sixteenthPoints * 8; +++ for (; number < num_points; number++) { +++ *realpt += ((*aPtr++) * (*bPtr)); +++ *imagpt += ((*aPtr++) * (*bPtr++)); +++ } +++ +++ *result = *(lv_32fc_t*)(&res[0]); ++ } ++ ++ #endif /*LV_HAVE_SSE*/ ++@@ -339,78 +356,83 @@ static inline void volk_32fc_32f_dot_prod_32fc_a_sse( lv_32fc_t* result, const ++ ++ #include ++ ++-static inline void volk_32fc_32f_dot_prod_32fc_u_avx2_fma( lv_32fc_t* result, const lv_32fc_t* input, const float* taps, unsigned int num_points) { ++- ++- unsigned int number = 0; 
++- const unsigned int sixteenthPoints = num_points / 16; ++- ++- float res[2]; ++- float *realpt = &res[0], *imagpt = &res[1]; ++- const float* aPtr = (float*)input; ++- const float* bPtr = taps; ++- ++- __m256 a0Val, a1Val, a2Val, a3Val; ++- __m256 b0Val, b1Val, b2Val, b3Val; ++- __m256 x0Val, x1Val, x0loVal, x0hiVal, x1loVal, x1hiVal; ++- ++- __m256 dotProdVal0 = _mm256_setzero_ps(); ++- __m256 dotProdVal1 = _mm256_setzero_ps(); ++- __m256 dotProdVal2 = _mm256_setzero_ps(); ++- __m256 dotProdVal3 = _mm256_setzero_ps(); ++- ++- for(;number < sixteenthPoints; number++){ ++- ++- a0Val = _mm256_loadu_ps(aPtr); ++- a1Val = _mm256_loadu_ps(aPtr+8); ++- a2Val = _mm256_loadu_ps(aPtr+16); ++- a3Val = _mm256_loadu_ps(aPtr+24); ++- ++- x0Val = _mm256_load_ps(bPtr); // t0|t1|t2|t3|t4|t5|t6|t7 ++- x1Val = _mm256_load_ps(bPtr+8); ++- x0loVal = _mm256_unpacklo_ps(x0Val, x0Val); // t0|t0|t1|t1|t4|t4|t5|t5 ++- x0hiVal = _mm256_unpackhi_ps(x0Val, x0Val); // t2|t2|t3|t3|t6|t6|t7|t7 ++- x1loVal = _mm256_unpacklo_ps(x1Val, x1Val); ++- x1hiVal = _mm256_unpackhi_ps(x1Val, x1Val); ++- ++- // TODO: it may be possible to rearrange swizzling to better pipeline data ++- b0Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x20); // t0|t0|t1|t1|t2|t2|t3|t3 ++- b1Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x31); // t4|t4|t5|t5|t6|t6|t7|t7 ++- b2Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x20); ++- b3Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x31); ++- ++- dotProdVal0 = _mm256_fmadd_ps(a0Val, b0Val, dotProdVal0); ++- dotProdVal1 = _mm256_fmadd_ps(a1Val, b1Val, dotProdVal1); ++- dotProdVal2 = _mm256_fmadd_ps(a2Val, b2Val, dotProdVal2); ++- dotProdVal3 = _mm256_fmadd_ps(a3Val, b3Val, dotProdVal3); ++- ++- aPtr += 32; ++- bPtr += 16; ++- } ++- ++- dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1); ++- dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2); ++- dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3); ++- ++- __VOLK_ATTR_ALIGNED(32) float dotProductVector[8]; ++- ++- _mm256_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector ++- ++- *realpt = dotProductVector[0]; ++- *imagpt = dotProductVector[1]; ++- *realpt += dotProductVector[2]; ++- *imagpt += dotProductVector[3]; ++- *realpt += dotProductVector[4]; ++- *imagpt += dotProductVector[5]; ++- *realpt += dotProductVector[6]; ++- *imagpt += dotProductVector[7]; ++- ++- number = sixteenthPoints*16; ++- for(;number < num_points; number++){ ++- *realpt += ((*aPtr++) * (*bPtr)); ++- *imagpt += ((*aPtr++) * (*bPtr++)); ++- } ++- ++- *result = *(lv_32fc_t*)(&res[0]); +++static inline void volk_32fc_32f_dot_prod_32fc_u_avx2_fma(lv_32fc_t* result, +++ const lv_32fc_t* input, +++ const float* taps, +++ unsigned int num_points) +++{ +++ +++ unsigned int number = 0; +++ const unsigned int sixteenthPoints = num_points / 16; +++ +++ float res[2]; +++ float *realpt = &res[0], *imagpt = &res[1]; +++ const float* aPtr = (float*)input; +++ const float* bPtr = taps; +++ +++ __m256 a0Val, a1Val, a2Val, a3Val; +++ __m256 b0Val, b1Val, b2Val, b3Val; +++ __m256 x0Val, x1Val, x0loVal, x0hiVal, x1loVal, x1hiVal; +++ +++ __m256 dotProdVal0 = _mm256_setzero_ps(); +++ __m256 dotProdVal1 = _mm256_setzero_ps(); +++ __m256 dotProdVal2 = _mm256_setzero_ps(); +++ __m256 dotProdVal3 = _mm256_setzero_ps(); +++ +++ for (; number < sixteenthPoints; number++) { +++ +++ a0Val = _mm256_loadu_ps(aPtr); +++ a1Val = _mm256_loadu_ps(aPtr + 8); +++ a2Val = _mm256_loadu_ps(aPtr + 16); +++ a3Val = _mm256_loadu_ps(aPtr + 24); +++ +++ x0Val = 
_mm256_load_ps(bPtr); // t0|t1|t2|t3|t4|t5|t6|t7 +++ x1Val = _mm256_load_ps(bPtr + 8); +++ x0loVal = _mm256_unpacklo_ps(x0Val, x0Val); // t0|t0|t1|t1|t4|t4|t5|t5 +++ x0hiVal = _mm256_unpackhi_ps(x0Val, x0Val); // t2|t2|t3|t3|t6|t6|t7|t7 +++ x1loVal = _mm256_unpacklo_ps(x1Val, x1Val); +++ x1hiVal = _mm256_unpackhi_ps(x1Val, x1Val); +++ +++ // TODO: it may be possible to rearrange swizzling to better pipeline data +++ b0Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x20); // t0|t0|t1|t1|t2|t2|t3|t3 +++ b1Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x31); // t4|t4|t5|t5|t6|t6|t7|t7 +++ b2Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x20); +++ b3Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x31); +++ +++ dotProdVal0 = _mm256_fmadd_ps(a0Val, b0Val, dotProdVal0); +++ dotProdVal1 = _mm256_fmadd_ps(a1Val, b1Val, dotProdVal1); +++ dotProdVal2 = _mm256_fmadd_ps(a2Val, b2Val, dotProdVal2); +++ dotProdVal3 = _mm256_fmadd_ps(a3Val, b3Val, dotProdVal3); +++ +++ aPtr += 32; +++ bPtr += 16; +++ } +++ +++ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1); +++ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2); +++ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3); +++ +++ __VOLK_ATTR_ALIGNED(32) float dotProductVector[8]; +++ +++ _mm256_store_ps(dotProductVector, +++ dotProdVal0); // Store the results back into the dot product vector +++ +++ *realpt = dotProductVector[0]; +++ *imagpt = dotProductVector[1]; +++ *realpt += dotProductVector[2]; +++ *imagpt += dotProductVector[3]; +++ *realpt += dotProductVector[4]; +++ *imagpt += dotProductVector[5]; +++ *realpt += dotProductVector[6]; +++ *imagpt += dotProductVector[7]; +++ +++ number = sixteenthPoints * 16; +++ for (; number < num_points; number++) { +++ *realpt += ((*aPtr++) * (*bPtr)); +++ *imagpt += ((*aPtr++) * (*bPtr++)); +++ } +++ +++ *result = *(lv_32fc_t*)(&res[0]); ++ } ++ ++ #endif /*LV_HAVE_AVX2 && LV_HAVE_FMA*/ ++@@ -419,162 +441,172 @@ static inline void volk_32fc_32f_dot_prod_32fc_u_avx2_fma( lv_32fc_t* result, co ++ ++ #include ++ ++-static inline void volk_32fc_32f_dot_prod_32fc_u_avx( lv_32fc_t* result, const lv_32fc_t* input, const float* taps, unsigned int num_points) { ++- ++- unsigned int number = 0; ++- const unsigned int sixteenthPoints = num_points / 16; ++- ++- float res[2]; ++- float *realpt = &res[0], *imagpt = &res[1]; ++- const float* aPtr = (float*)input; ++- const float* bPtr = taps; ++- ++- __m256 a0Val, a1Val, a2Val, a3Val; ++- __m256 b0Val, b1Val, b2Val, b3Val; ++- __m256 x0Val, x1Val, x0loVal, x0hiVal, x1loVal, x1hiVal; ++- __m256 c0Val, c1Val, c2Val, c3Val; ++- ++- __m256 dotProdVal0 = _mm256_setzero_ps(); ++- __m256 dotProdVal1 = _mm256_setzero_ps(); ++- __m256 dotProdVal2 = _mm256_setzero_ps(); ++- __m256 dotProdVal3 = _mm256_setzero_ps(); ++- ++- for(;number < sixteenthPoints; number++){ ++- ++- a0Val = _mm256_loadu_ps(aPtr); ++- a1Val = _mm256_loadu_ps(aPtr+8); ++- a2Val = _mm256_loadu_ps(aPtr+16); ++- a3Val = _mm256_loadu_ps(aPtr+24); ++- ++- x0Val = _mm256_loadu_ps(bPtr); // t0|t1|t2|t3|t4|t5|t6|t7 ++- x1Val = _mm256_loadu_ps(bPtr+8); ++- x0loVal = _mm256_unpacklo_ps(x0Val, x0Val); // t0|t0|t1|t1|t4|t4|t5|t5 ++- x0hiVal = _mm256_unpackhi_ps(x0Val, x0Val); // t2|t2|t3|t3|t6|t6|t7|t7 ++- x1loVal = _mm256_unpacklo_ps(x1Val, x1Val); ++- x1hiVal = _mm256_unpackhi_ps(x1Val, x1Val); ++- ++- // TODO: it may be possible to rearrange swizzling to better pipeline data ++- b0Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x20); // t0|t0|t1|t1|t2|t2|t3|t3 ++- b1Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 
0x31); // t4|t4|t5|t5|t6|t6|t7|t7 ++- b2Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x20); ++- b3Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x31); ++- ++- c0Val = _mm256_mul_ps(a0Val, b0Val); ++- c1Val = _mm256_mul_ps(a1Val, b1Val); ++- c2Val = _mm256_mul_ps(a2Val, b2Val); ++- c3Val = _mm256_mul_ps(a3Val, b3Val); ++- ++- dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0); ++- dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1); ++- dotProdVal2 = _mm256_add_ps(c2Val, dotProdVal2); ++- dotProdVal3 = _mm256_add_ps(c3Val, dotProdVal3); ++- ++- aPtr += 32; ++- bPtr += 16; ++- } ++- ++- dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1); ++- dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2); ++- dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3); ++- ++- __VOLK_ATTR_ALIGNED(32) float dotProductVector[8]; ++- ++- _mm256_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector ++- ++- *realpt = dotProductVector[0]; ++- *imagpt = dotProductVector[1]; ++- *realpt += dotProductVector[2]; ++- *imagpt += dotProductVector[3]; ++- *realpt += dotProductVector[4]; ++- *imagpt += dotProductVector[5]; ++- *realpt += dotProductVector[6]; ++- *imagpt += dotProductVector[7]; ++- ++- number = sixteenthPoints*16; ++- for(;number < num_points; number++){ ++- *realpt += ((*aPtr++) * (*bPtr)); ++- *imagpt += ((*aPtr++) * (*bPtr++)); ++- } ++- ++- *result = *(lv_32fc_t*)(&res[0]); +++static inline void volk_32fc_32f_dot_prod_32fc_u_avx(lv_32fc_t* result, +++ const lv_32fc_t* input, +++ const float* taps, +++ unsigned int num_points) +++{ +++ +++ unsigned int number = 0; +++ const unsigned int sixteenthPoints = num_points / 16; +++ +++ float res[2]; +++ float *realpt = &res[0], *imagpt = &res[1]; +++ const float* aPtr = (float*)input; +++ const float* bPtr = taps; +++ +++ __m256 a0Val, a1Val, a2Val, a3Val; +++ __m256 b0Val, b1Val, b2Val, b3Val; +++ __m256 x0Val, x1Val, x0loVal, x0hiVal, x1loVal, x1hiVal; +++ __m256 c0Val, c1Val, c2Val, c3Val; +++ +++ __m256 dotProdVal0 = _mm256_setzero_ps(); +++ __m256 dotProdVal1 = _mm256_setzero_ps(); +++ __m256 dotProdVal2 = _mm256_setzero_ps(); +++ __m256 dotProdVal3 = _mm256_setzero_ps(); +++ +++ for (; number < sixteenthPoints; number++) { +++ +++ a0Val = _mm256_loadu_ps(aPtr); +++ a1Val = _mm256_loadu_ps(aPtr + 8); +++ a2Val = _mm256_loadu_ps(aPtr + 16); +++ a3Val = _mm256_loadu_ps(aPtr + 24); +++ +++ x0Val = _mm256_loadu_ps(bPtr); // t0|t1|t2|t3|t4|t5|t6|t7 +++ x1Val = _mm256_loadu_ps(bPtr + 8); +++ x0loVal = _mm256_unpacklo_ps(x0Val, x0Val); // t0|t0|t1|t1|t4|t4|t5|t5 +++ x0hiVal = _mm256_unpackhi_ps(x0Val, x0Val); // t2|t2|t3|t3|t6|t6|t7|t7 +++ x1loVal = _mm256_unpacklo_ps(x1Val, x1Val); +++ x1hiVal = _mm256_unpackhi_ps(x1Val, x1Val); +++ +++ // TODO: it may be possible to rearrange swizzling to better pipeline data +++ b0Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x20); // t0|t0|t1|t1|t2|t2|t3|t3 +++ b1Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x31); // t4|t4|t5|t5|t6|t6|t7|t7 +++ b2Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x20); +++ b3Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x31); +++ +++ c0Val = _mm256_mul_ps(a0Val, b0Val); +++ c1Val = _mm256_mul_ps(a1Val, b1Val); +++ c2Val = _mm256_mul_ps(a2Val, b2Val); +++ c3Val = _mm256_mul_ps(a3Val, b3Val); +++ +++ dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0); +++ dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1); +++ dotProdVal2 = _mm256_add_ps(c2Val, dotProdVal2); +++ dotProdVal3 = _mm256_add_ps(c3Val, dotProdVal3); +++ +++ aPtr += 32; +++ bPtr += 16; +++ } +++ 
+++ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1); +++ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2); +++ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3); +++ +++ __VOLK_ATTR_ALIGNED(32) float dotProductVector[8]; +++ +++ _mm256_store_ps(dotProductVector, +++ dotProdVal0); // Store the results back into the dot product vector +++ +++ *realpt = dotProductVector[0]; +++ *imagpt = dotProductVector[1]; +++ *realpt += dotProductVector[2]; +++ *imagpt += dotProductVector[3]; +++ *realpt += dotProductVector[4]; +++ *imagpt += dotProductVector[5]; +++ *realpt += dotProductVector[6]; +++ *imagpt += dotProductVector[7]; +++ +++ number = sixteenthPoints * 16; +++ for (; number < num_points; number++) { +++ *realpt += ((*aPtr++) * (*bPtr)); +++ *imagpt += ((*aPtr++) * (*bPtr++)); +++ } +++ +++ *result = *(lv_32fc_t*)(&res[0]); ++ } ++ #endif /*LV_HAVE_AVX*/ ++ ++ #ifdef LV_HAVE_NEON ++ #include ++ ++-static inline void volk_32fc_32f_dot_prod_32fc_neon_unroll ( lv_32fc_t* __restrict result, const lv_32fc_t* __restrict input, const float* __restrict taps, unsigned int num_points) { ++- ++- unsigned int number; ++- const unsigned int quarterPoints = num_points / 8; ++- ++- float res[2]; ++- float *realpt = &res[0], *imagpt = &res[1]; ++- const float* inputPtr = (float*)input; ++- const float* tapsPtr = taps; ++- float zero[4] = {0.0f, 0.0f, 0.0f, 0.0f }; ++- float accVector_real[4]; ++- float accVector_imag[4]; ++- ++- float32x4x2_t inputVector0, inputVector1; ++- float32x4_t tapsVector0, tapsVector1; ++- float32x4_t tmp_real0, tmp_imag0; ++- float32x4_t tmp_real1, tmp_imag1; ++- float32x4_t real_accumulator0, imag_accumulator0; ++- float32x4_t real_accumulator1, imag_accumulator1; ++- ++- // zero out accumulators ++- // take a *float, return float32x4_t ++- real_accumulator0 = vld1q_f32( zero ); ++- imag_accumulator0 = vld1q_f32( zero ); ++- real_accumulator1 = vld1q_f32( zero ); ++- imag_accumulator1 = vld1q_f32( zero ); ++- ++- for(number=0 ;number < quarterPoints; number++){ ++- // load doublewords and duplicate in to second lane ++- tapsVector0 = vld1q_f32(tapsPtr ); ++- tapsVector1 = vld1q_f32(tapsPtr+4 ); ++- ++- // load quadword of complex numbers in to 2 lanes. 
1st lane is real, 2dn imag ++- inputVector0 = vld2q_f32(inputPtr ); ++- inputVector1 = vld2q_f32(inputPtr+8 ); ++- // inputVector is now a struct of two vectors, 0th is real, 1st is imag ++- ++- tmp_real0 = vmulq_f32(tapsVector0, inputVector0.val[0]); ++- tmp_imag0 = vmulq_f32(tapsVector0, inputVector0.val[1]); ++- ++- tmp_real1 = vmulq_f32(tapsVector1, inputVector1.val[0]); ++- tmp_imag1 = vmulq_f32(tapsVector1, inputVector1.val[1]); ++- ++- real_accumulator0 = vaddq_f32(real_accumulator0, tmp_real0); ++- imag_accumulator0 = vaddq_f32(imag_accumulator0, tmp_imag0); ++- ++- real_accumulator1 = vaddq_f32(real_accumulator1, tmp_real1); ++- imag_accumulator1 = vaddq_f32(imag_accumulator1, tmp_imag1); ++- ++- tapsPtr += 8; ++- inputPtr += 16; ++- } ++- ++- real_accumulator0 = vaddq_f32( real_accumulator0, real_accumulator1); ++- imag_accumulator0 = vaddq_f32( imag_accumulator0, imag_accumulator1); ++- // void vst1q_f32( float32_t * ptr, float32x4_t val); ++- // store results back to a complex (array of 2 floats) ++- vst1q_f32(accVector_real, real_accumulator0); ++- vst1q_f32(accVector_imag, imag_accumulator0); ++- *realpt = accVector_real[0] + accVector_real[1] + ++- accVector_real[2] + accVector_real[3] ; ++- ++- *imagpt = accVector_imag[0] + accVector_imag[1] + ++- accVector_imag[2] + accVector_imag[3] ; ++- ++- // clean up the remainder ++- for(number=quarterPoints*8; number < num_points; number++){ ++- *realpt += ((*inputPtr++) * (*tapsPtr)); ++- *imagpt += ((*inputPtr++) * (*tapsPtr++)); ++- } ++- ++- *result = *(lv_32fc_t*)(&res[0]); +++static inline void +++volk_32fc_32f_dot_prod_32fc_neon_unroll(lv_32fc_t* __restrict result, +++ const lv_32fc_t* __restrict input, +++ const float* __restrict taps, +++ unsigned int num_points) +++{ +++ +++ unsigned int number; +++ const unsigned int quarterPoints = num_points / 8; +++ +++ float res[2]; +++ float *realpt = &res[0], *imagpt = &res[1]; +++ const float* inputPtr = (float*)input; +++ const float* tapsPtr = taps; +++ float zero[4] = { 0.0f, 0.0f, 0.0f, 0.0f }; +++ float accVector_real[4]; +++ float accVector_imag[4]; +++ +++ float32x4x2_t inputVector0, inputVector1; +++ float32x4_t tapsVector0, tapsVector1; +++ float32x4_t tmp_real0, tmp_imag0; +++ float32x4_t tmp_real1, tmp_imag1; +++ float32x4_t real_accumulator0, imag_accumulator0; +++ float32x4_t real_accumulator1, imag_accumulator1; +++ +++ // zero out accumulators +++ // take a *float, return float32x4_t +++ real_accumulator0 = vld1q_f32(zero); +++ imag_accumulator0 = vld1q_f32(zero); +++ real_accumulator1 = vld1q_f32(zero); +++ imag_accumulator1 = vld1q_f32(zero); +++ +++ for (number = 0; number < quarterPoints; number++) { +++ // load doublewords and duplicate in to second lane +++ tapsVector0 = vld1q_f32(tapsPtr); +++ tapsVector1 = vld1q_f32(tapsPtr + 4); +++ +++ // load quadword of complex numbers in to 2 lanes. 
1st lane is real, 2dn imag +++ inputVector0 = vld2q_f32(inputPtr); +++ inputVector1 = vld2q_f32(inputPtr + 8); +++ // inputVector is now a struct of two vectors, 0th is real, 1st is imag +++ +++ tmp_real0 = vmulq_f32(tapsVector0, inputVector0.val[0]); +++ tmp_imag0 = vmulq_f32(tapsVector0, inputVector0.val[1]); +++ +++ tmp_real1 = vmulq_f32(tapsVector1, inputVector1.val[0]); +++ tmp_imag1 = vmulq_f32(tapsVector1, inputVector1.val[1]); +++ +++ real_accumulator0 = vaddq_f32(real_accumulator0, tmp_real0); +++ imag_accumulator0 = vaddq_f32(imag_accumulator0, tmp_imag0); +++ +++ real_accumulator1 = vaddq_f32(real_accumulator1, tmp_real1); +++ imag_accumulator1 = vaddq_f32(imag_accumulator1, tmp_imag1); +++ +++ tapsPtr += 8; +++ inputPtr += 16; +++ } +++ +++ real_accumulator0 = vaddq_f32(real_accumulator0, real_accumulator1); +++ imag_accumulator0 = vaddq_f32(imag_accumulator0, imag_accumulator1); +++ // void vst1q_f32( float32_t * ptr, float32x4_t val); +++ // store results back to a complex (array of 2 floats) +++ vst1q_f32(accVector_real, real_accumulator0); +++ vst1q_f32(accVector_imag, imag_accumulator0); +++ *realpt = +++ accVector_real[0] + accVector_real[1] + accVector_real[2] + accVector_real[3]; +++ +++ *imagpt = +++ accVector_imag[0] + accVector_imag[1] + accVector_imag[2] + accVector_imag[3]; +++ +++ // clean up the remainder +++ for (number = quarterPoints * 8; number < num_points; number++) { +++ *realpt += ((*inputPtr++) * (*tapsPtr)); +++ *imagpt += ((*inputPtr++) * (*tapsPtr++)); +++ } +++ +++ *result = *(lv_32fc_t*)(&res[0]); ++ } ++ ++ #endif /*LV_HAVE_NEON*/ ++@@ -582,154 +614,171 @@ static inline void volk_32fc_32f_dot_prod_32fc_neon_unroll ( lv_32fc_t* __restri ++ #ifdef LV_HAVE_NEON ++ #include ++ ++-static inline void volk_32fc_32f_dot_prod_32fc_a_neon ( lv_32fc_t* __restrict result, const lv_32fc_t* __restrict input, const float* __restrict taps, unsigned int num_points) { ++- ++- unsigned int number; ++- const unsigned int quarterPoints = num_points / 4; +++static inline void volk_32fc_32f_dot_prod_32fc_a_neon(lv_32fc_t* __restrict result, +++ const lv_32fc_t* __restrict input, +++ const float* __restrict taps, +++ unsigned int num_points) +++{ ++ ++- float res[2]; ++- float *realpt = &res[0], *imagpt = &res[1]; ++- const float* inputPtr = (float*)input; ++- const float* tapsPtr = taps; ++- float zero[4] = {0.0f, 0.0f, 0.0f, 0.0f }; ++- float accVector_real[4]; ++- float accVector_imag[4]; +++ unsigned int number; +++ const unsigned int quarterPoints = num_points / 4; ++ ++- float32x4x2_t inputVector; ++- float32x4_t tapsVector; ++- float32x4_t tmp_real, tmp_imag; ++- float32x4_t real_accumulator, imag_accumulator; +++ float res[2]; +++ float *realpt = &res[0], *imagpt = &res[1]; +++ const float* inputPtr = (float*)input; +++ const float* tapsPtr = taps; +++ float zero[4] = { 0.0f, 0.0f, 0.0f, 0.0f }; +++ float accVector_real[4]; +++ float accVector_imag[4]; ++ +++ float32x4x2_t inputVector; +++ float32x4_t tapsVector; +++ float32x4_t tmp_real, tmp_imag; +++ float32x4_t real_accumulator, imag_accumulator; ++ ++- // zero out accumulators ++- // take a *float, return float32x4_t ++- real_accumulator = vld1q_f32( zero ); ++- imag_accumulator = vld1q_f32( zero ); ++ ++- for(number=0 ;number < quarterPoints; number++){ ++- // load taps ( float32x2x2_t = vld1q_f32( float32_t const * ptr) ) ++- // load doublewords and duplicate in to second lane ++- tapsVector = vld1q_f32(tapsPtr ); +++ // zero out accumulators +++ // take a *float, return float32x4_t +++ real_accumulator = 
vld1q_f32(zero); +++ imag_accumulator = vld1q_f32(zero); ++ ++- // load quadword of complex numbers in to 2 lanes. 1st lane is real, 2dn imag ++- inputVector = vld2q_f32(inputPtr ); +++ for (number = 0; number < quarterPoints; number++) { +++ // load taps ( float32x2x2_t = vld1q_f32( float32_t const * ptr) ) +++ // load doublewords and duplicate in to second lane +++ tapsVector = vld1q_f32(tapsPtr); ++ ++- tmp_real = vmulq_f32(tapsVector, inputVector.val[0]); ++- tmp_imag = vmulq_f32(tapsVector, inputVector.val[1]); +++ // load quadword of complex numbers in to 2 lanes. 1st lane is real, 2dn imag +++ inputVector = vld2q_f32(inputPtr); ++ ++- real_accumulator = vaddq_f32(real_accumulator, tmp_real); ++- imag_accumulator = vaddq_f32(imag_accumulator, tmp_imag); +++ tmp_real = vmulq_f32(tapsVector, inputVector.val[0]); +++ tmp_imag = vmulq_f32(tapsVector, inputVector.val[1]); ++ +++ real_accumulator = vaddq_f32(real_accumulator, tmp_real); +++ imag_accumulator = vaddq_f32(imag_accumulator, tmp_imag); ++ ++- tapsPtr += 4; ++- inputPtr += 8; ++ ++- } +++ tapsPtr += 4; +++ inputPtr += 8; +++ } ++ ++- // store results back to a complex (array of 2 floats) ++- vst1q_f32(accVector_real, real_accumulator); ++- vst1q_f32(accVector_imag, imag_accumulator); ++- *realpt = accVector_real[0] + accVector_real[1] + ++- accVector_real[2] + accVector_real[3] ; +++ // store results back to a complex (array of 2 floats) +++ vst1q_f32(accVector_real, real_accumulator); +++ vst1q_f32(accVector_imag, imag_accumulator); +++ *realpt = +++ accVector_real[0] + accVector_real[1] + accVector_real[2] + accVector_real[3]; ++ ++- *imagpt = accVector_imag[0] + accVector_imag[1] + ++- accVector_imag[2] + accVector_imag[3] ; +++ *imagpt = +++ accVector_imag[0] + accVector_imag[1] + accVector_imag[2] + accVector_imag[3]; ++ ++- // clean up the remainder ++- for(number=quarterPoints*4; number < num_points; number++){ ++- *realpt += ((*inputPtr++) * (*tapsPtr)); ++- *imagpt += ((*inputPtr++) * (*tapsPtr++)); ++- } +++ // clean up the remainder +++ for (number = quarterPoints * 4; number < num_points; number++) { +++ *realpt += ((*inputPtr++) * (*tapsPtr)); +++ *imagpt += ((*inputPtr++) * (*tapsPtr++)); +++ } ++ ++- *result = *(lv_32fc_t*)(&res[0]); +++ *result = *(lv_32fc_t*)(&res[0]); ++ } ++ ++ #endif /*LV_HAVE_NEON*/ ++ ++ #ifdef LV_HAVE_NEONV7 ++-extern void volk_32fc_32f_dot_prod_32fc_a_neonasm ( lv_32fc_t* result, const lv_32fc_t* input, const float* taps, unsigned int num_points); +++extern void volk_32fc_32f_dot_prod_32fc_a_neonasm(lv_32fc_t* result, +++ const lv_32fc_t* input, +++ const float* taps, +++ unsigned int num_points); ++ #endif /*LV_HAVE_NEONV7*/ ++ ++ #ifdef LV_HAVE_NEONV7 ++-extern void volk_32fc_32f_dot_prod_32fc_a_neonasmvmla ( lv_32fc_t* result, const lv_32fc_t* input, const float* taps, unsigned int num_points); +++extern void volk_32fc_32f_dot_prod_32fc_a_neonasmvmla(lv_32fc_t* result, +++ const lv_32fc_t* input, +++ const float* taps, +++ unsigned int num_points); ++ #endif /*LV_HAVE_NEONV7*/ ++ ++ #ifdef LV_HAVE_NEONV7 ++-extern void volk_32fc_32f_dot_prod_32fc_a_neonpipeline ( lv_32fc_t* result, const lv_32fc_t* input, const float* taps, unsigned int num_points); +++extern void volk_32fc_32f_dot_prod_32fc_a_neonpipeline(lv_32fc_t* result, +++ const lv_32fc_t* input, +++ const float* taps, +++ unsigned int num_points); ++ #endif /*LV_HAVE_NEONV7*/ ++ ++ #ifdef LV_HAVE_SSE ++ ++-static inline void volk_32fc_32f_dot_prod_32fc_u_sse( lv_32fc_t* result, const lv_32fc_t* input, const float* taps, unsigned 
int num_points) { ++- ++- unsigned int number = 0; ++- const unsigned int sixteenthPoints = num_points / 8; ++- ++- float res[2]; ++- float *realpt = &res[0], *imagpt = &res[1]; ++- const float* aPtr = (float*)input; ++- const float* bPtr = taps; ++- ++- __m128 a0Val, a1Val, a2Val, a3Val; ++- __m128 b0Val, b1Val, b2Val, b3Val; ++- __m128 x0Val, x1Val, x2Val, x3Val; ++- __m128 c0Val, c1Val, c2Val, c3Val; ++- ++- __m128 dotProdVal0 = _mm_setzero_ps(); ++- __m128 dotProdVal1 = _mm_setzero_ps(); ++- __m128 dotProdVal2 = _mm_setzero_ps(); ++- __m128 dotProdVal3 = _mm_setzero_ps(); ++- ++- for(;number < sixteenthPoints; number++){ ++- ++- a0Val = _mm_loadu_ps(aPtr); ++- a1Val = _mm_loadu_ps(aPtr+4); ++- a2Val = _mm_loadu_ps(aPtr+8); ++- a3Val = _mm_loadu_ps(aPtr+12); ++- ++- x0Val = _mm_loadu_ps(bPtr); ++- x1Val = _mm_loadu_ps(bPtr); ++- x2Val = _mm_loadu_ps(bPtr+4); ++- x3Val = _mm_loadu_ps(bPtr+4); ++- b0Val = _mm_unpacklo_ps(x0Val, x1Val); ++- b1Val = _mm_unpackhi_ps(x0Val, x1Val); ++- b2Val = _mm_unpacklo_ps(x2Val, x3Val); ++- b3Val = _mm_unpackhi_ps(x2Val, x3Val); ++- ++- c0Val = _mm_mul_ps(a0Val, b0Val); ++- c1Val = _mm_mul_ps(a1Val, b1Val); ++- c2Val = _mm_mul_ps(a2Val, b2Val); ++- c3Val = _mm_mul_ps(a3Val, b3Val); ++- ++- dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0); ++- dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1); ++- dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2); ++- dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3); ++- ++- aPtr += 16; ++- bPtr += 8; ++- } ++- ++- dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1); ++- dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2); ++- dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3); ++- ++- __VOLK_ATTR_ALIGNED(16) float dotProductVector[4]; ++- ++- _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector ++- ++- *realpt = dotProductVector[0]; ++- *imagpt = dotProductVector[1]; ++- *realpt += dotProductVector[2]; ++- *imagpt += dotProductVector[3]; ++- ++- number = sixteenthPoints*8; ++- for(;number < num_points; number++){ ++- *realpt += ((*aPtr++) * (*bPtr)); ++- *imagpt += ((*aPtr++) * (*bPtr++)); ++- } ++- ++- *result = *(lv_32fc_t*)(&res[0]); +++static inline void volk_32fc_32f_dot_prod_32fc_u_sse(lv_32fc_t* result, +++ const lv_32fc_t* input, +++ const float* taps, +++ unsigned int num_points) +++{ +++ +++ unsigned int number = 0; +++ const unsigned int sixteenthPoints = num_points / 8; +++ +++ float res[2]; +++ float *realpt = &res[0], *imagpt = &res[1]; +++ const float* aPtr = (float*)input; +++ const float* bPtr = taps; +++ +++ __m128 a0Val, a1Val, a2Val, a3Val; +++ __m128 b0Val, b1Val, b2Val, b3Val; +++ __m128 x0Val, x1Val, x2Val, x3Val; +++ __m128 c0Val, c1Val, c2Val, c3Val; +++ +++ __m128 dotProdVal0 = _mm_setzero_ps(); +++ __m128 dotProdVal1 = _mm_setzero_ps(); +++ __m128 dotProdVal2 = _mm_setzero_ps(); +++ __m128 dotProdVal3 = _mm_setzero_ps(); +++ +++ for (; number < sixteenthPoints; number++) { +++ +++ a0Val = _mm_loadu_ps(aPtr); +++ a1Val = _mm_loadu_ps(aPtr + 4); +++ a2Val = _mm_loadu_ps(aPtr + 8); +++ a3Val = _mm_loadu_ps(aPtr + 12); +++ +++ x0Val = _mm_loadu_ps(bPtr); +++ x1Val = _mm_loadu_ps(bPtr); +++ x2Val = _mm_loadu_ps(bPtr + 4); +++ x3Val = _mm_loadu_ps(bPtr + 4); +++ b0Val = _mm_unpacklo_ps(x0Val, x1Val); +++ b1Val = _mm_unpackhi_ps(x0Val, x1Val); +++ b2Val = _mm_unpacklo_ps(x2Val, x3Val); +++ b3Val = _mm_unpackhi_ps(x2Val, x3Val); +++ +++ c0Val = _mm_mul_ps(a0Val, b0Val); +++ c1Val = _mm_mul_ps(a1Val, b1Val); +++ c2Val = _mm_mul_ps(a2Val, b2Val); +++ c3Val = _mm_mul_ps(a3Val, 
b3Val); +++ +++ dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0); +++ dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1); +++ dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2); +++ dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3); +++ +++ aPtr += 16; +++ bPtr += 8; +++ } +++ +++ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1); +++ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2); +++ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3); +++ +++ __VOLK_ATTR_ALIGNED(16) float dotProductVector[4]; +++ +++ _mm_store_ps(dotProductVector, +++ dotProdVal0); // Store the results back into the dot product vector +++ +++ *realpt = dotProductVector[0]; +++ *imagpt = dotProductVector[1]; +++ *realpt += dotProductVector[2]; +++ *imagpt += dotProductVector[3]; +++ +++ number = sixteenthPoints * 8; +++ for (; number < num_points; number++) { +++ *realpt += ((*aPtr++) * (*bPtr)); +++ *imagpt += ((*aPtr++) * (*bPtr++)); +++ } +++ +++ *result = *(lv_32fc_t*)(&res[0]); ++ } ++ ++ #endif /*LV_HAVE_SSE*/ ++diff --git a/kernels/volk/volk_32fc_32f_multiply_32fc.h b/kernels/volk/volk_32fc_32f_multiply_32fc.h ++index b47883f..196ba9a 100644 ++--- a/kernels/volk/volk_32fc_32f_multiply_32fc.h +++++ b/kernels/volk/volk_32fc_32f_multiply_32fc.h ++@@ -30,8 +30,8 @@ ++ * ++ * Dispatcher Prototype ++ * \code ++- * void volk_32fc_32f_multiply_32fc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const float* bVector, unsigned int num_points); ++- * \endcode +++ * void volk_32fc_32f_multiply_32fc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const +++ * float* bVector, unsigned int num_points); \endcode ++ * ++ * \b Inputs ++ * \li aVector: The input vector of complex floats. ++@@ -61,52 +61,55 @@ ++ #ifdef LV_HAVE_AVX ++ #include ++ ++-static inline void ++-volk_32fc_32f_multiply_32fc_a_avx(lv_32fc_t* cVector, const lv_32fc_t* aVector, ++- const float* bVector, unsigned int num_points) +++static inline void volk_32fc_32f_multiply_32fc_a_avx(lv_32fc_t* cVector, +++ const lv_32fc_t* aVector, +++ const float* bVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int eighthPoints = num_points / 8; +++ unsigned int number = 0; +++ const unsigned int eighthPoints = num_points / 8; ++ ++- lv_32fc_t* cPtr = cVector; ++- const lv_32fc_t* aPtr = aVector; ++- const float* bPtr= bVector; +++ lv_32fc_t* cPtr = cVector; +++ const lv_32fc_t* aPtr = aVector; +++ const float* bPtr = bVector; ++ ++- __m256 aVal1, aVal2, bVal, bVal1, bVal2, cVal1, cVal2; +++ __m256 aVal1, aVal2, bVal, bVal1, bVal2, cVal1, cVal2; ++ ++- __m256i permute_mask = _mm256_set_epi32(3, 3, 2, 2, 1, 1, 0, 0); +++ __m256i permute_mask = _mm256_set_epi32(3, 3, 2, 2, 1, 1, 0, 0); ++ ++- for(;number < eighthPoints; number++){ +++ for (; number < eighthPoints; number++) { ++ ++- aVal1 = _mm256_load_ps((float *)aPtr); ++- aPtr += 4; +++ aVal1 = _mm256_load_ps((float*)aPtr); +++ aPtr += 4; ++ ++- aVal2 = _mm256_load_ps((float *)aPtr); ++- aPtr += 4; +++ aVal2 = _mm256_load_ps((float*)aPtr); +++ aPtr += 4; ++ ++- bVal = _mm256_load_ps(bPtr); // b0|b1|b2|b3|b4|b5|b6|b7 ++- bPtr += 8; +++ bVal = _mm256_load_ps(bPtr); // b0|b1|b2|b3|b4|b5|b6|b7 +++ bPtr += 8; ++ ++- bVal1 = _mm256_permute2f128_ps(bVal, bVal, 0x00); // b0|b1|b2|b3|b0|b1|b2|b3 ++- bVal2 = _mm256_permute2f128_ps(bVal, bVal, 0x11); // b4|b5|b6|b7|b4|b5|b6|b7 +++ bVal1 = _mm256_permute2f128_ps(bVal, bVal, 0x00); // b0|b1|b2|b3|b0|b1|b2|b3 +++ bVal2 = _mm256_permute2f128_ps(bVal, bVal, 0x11); // b4|b5|b6|b7|b4|b5|b6|b7 ++ ++- bVal1 = _mm256_permutevar_ps(bVal1, permute_mask); // 
b0|b0|b1|b1|b2|b2|b3|b3 ++- bVal2 = _mm256_permutevar_ps(bVal2, permute_mask); // b4|b4|b5|b5|b6|b6|b7|b7 +++ bVal1 = _mm256_permutevar_ps(bVal1, permute_mask); // b0|b0|b1|b1|b2|b2|b3|b3 +++ bVal2 = _mm256_permutevar_ps(bVal2, permute_mask); // b4|b4|b5|b5|b6|b6|b7|b7 ++ ++- cVal1 = _mm256_mul_ps(aVal1, bVal1); ++- cVal2 = _mm256_mul_ps(aVal2, bVal2); +++ cVal1 = _mm256_mul_ps(aVal1, bVal1); +++ cVal2 = _mm256_mul_ps(aVal2, bVal2); ++ ++- _mm256_store_ps((float*)cPtr,cVal1); // Store the results back into the C container ++- cPtr += 4; +++ _mm256_store_ps((float*)cPtr, +++ cVal1); // Store the results back into the C container +++ cPtr += 4; ++ ++- _mm256_store_ps((float*)cPtr,cVal2); // Store the results back into the C container ++- cPtr += 4; ++- } +++ _mm256_store_ps((float*)cPtr, +++ cVal2); // Store the results back into the C container +++ cPtr += 4; +++ } ++ ++- number = eighthPoints * 8; ++- for(;number < num_points; ++number){ ++- *cPtr++ = (*aPtr++) * (*bPtr++); ++- } +++ number = eighthPoints * 8; +++ for (; number < num_points; ++number) { +++ *cPtr++ = (*aPtr++) * (*bPtr++); +++ } ++ } ++ #endif /* LV_HAVE_AVX */ ++ ++@@ -114,67 +117,69 @@ volk_32fc_32f_multiply_32fc_a_avx(lv_32fc_t* cVector, const lv_32fc_t* aVector, ++ #ifdef LV_HAVE_SSE ++ #include ++ ++-static inline void ++-volk_32fc_32f_multiply_32fc_a_sse(lv_32fc_t* cVector, const lv_32fc_t* aVector, ++- const float* bVector, unsigned int num_points) +++static inline void volk_32fc_32f_multiply_32fc_a_sse(lv_32fc_t* cVector, +++ const lv_32fc_t* aVector, +++ const float* bVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int quarterPoints = num_points / 4; +++ unsigned int number = 0; +++ const unsigned int quarterPoints = num_points / 4; ++ ++- lv_32fc_t* cPtr = cVector; ++- const lv_32fc_t* aPtr = aVector; ++- const float* bPtr= bVector; +++ lv_32fc_t* cPtr = cVector; +++ const lv_32fc_t* aPtr = aVector; +++ const float* bPtr = bVector; ++ ++- __m128 aVal1, aVal2, bVal, bVal1, bVal2, cVal; ++- for(;number < quarterPoints; number++){ +++ __m128 aVal1, aVal2, bVal, bVal1, bVal2, cVal; +++ for (; number < quarterPoints; number++) { ++ ++- aVal1 = _mm_load_ps((const float*)aPtr); ++- aPtr += 2; +++ aVal1 = _mm_load_ps((const float*)aPtr); +++ aPtr += 2; ++ ++- aVal2 = _mm_load_ps((const float*)aPtr); ++- aPtr += 2; +++ aVal2 = _mm_load_ps((const float*)aPtr); +++ aPtr += 2; ++ ++- bVal = _mm_load_ps(bPtr); ++- bPtr += 4; +++ bVal = _mm_load_ps(bPtr); +++ bPtr += 4; ++ ++- bVal1 = _mm_shuffle_ps(bVal, bVal, _MM_SHUFFLE(1,1,0,0)); ++- bVal2 = _mm_shuffle_ps(bVal, bVal, _MM_SHUFFLE(3,3,2,2)); +++ bVal1 = _mm_shuffle_ps(bVal, bVal, _MM_SHUFFLE(1, 1, 0, 0)); +++ bVal2 = _mm_shuffle_ps(bVal, bVal, _MM_SHUFFLE(3, 3, 2, 2)); ++ ++- cVal = _mm_mul_ps(aVal1, bVal1); +++ cVal = _mm_mul_ps(aVal1, bVal1); ++ ++- _mm_store_ps((float*)cPtr,cVal); // Store the results back into the C container ++- cPtr += 2; +++ _mm_store_ps((float*)cPtr, cVal); // Store the results back into the C container +++ cPtr += 2; ++ ++- cVal = _mm_mul_ps(aVal2, bVal2); +++ cVal = _mm_mul_ps(aVal2, bVal2); ++ ++- _mm_store_ps((float*)cPtr,cVal); // Store the results back into the C container +++ _mm_store_ps((float*)cPtr, cVal); // Store the results back into the C container ++ ++- cPtr += 2; ++- } +++ cPtr += 2; +++ } ++ ++- number = quarterPoints * 4; ++- for(;number < num_points; number++){ ++- *cPtr++ = (*aPtr++) * (*bPtr); ++- bPtr++; ++- } +++ number = quarterPoints * 4; +++ for (; number < num_points; number++) { 
+++ *cPtr++ = (*aPtr++) * (*bPtr); +++ bPtr++; +++ } ++ } ++ #endif /* LV_HAVE_SSE */ ++ ++ ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void ++-volk_32fc_32f_multiply_32fc_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, ++- const float* bVector, unsigned int num_points) +++static inline void volk_32fc_32f_multiply_32fc_generic(lv_32fc_t* cVector, +++ const lv_32fc_t* aVector, +++ const float* bVector, +++ unsigned int num_points) ++ { ++- lv_32fc_t* cPtr = cVector; ++- const lv_32fc_t* aPtr = aVector; ++- const float* bPtr= bVector; ++- unsigned int number = 0; ++- ++- for(number = 0; number < num_points; number++){ ++- *cPtr++ = (*aPtr++) * (*bPtr++); ++- } +++ lv_32fc_t* cPtr = cVector; +++ const lv_32fc_t* aPtr = aVector; +++ const float* bPtr = bVector; +++ unsigned int number = 0; +++ +++ for (number = 0; number < num_points; number++) { +++ *cPtr++ = (*aPtr++) * (*bPtr++); +++ } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++@@ -182,49 +187,52 @@ volk_32fc_32f_multiply_32fc_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector ++ #ifdef LV_HAVE_NEON ++ #include ++ ++-static inline void ++-volk_32fc_32f_multiply_32fc_neon(lv_32fc_t* cVector, const lv_32fc_t* aVector, ++- const float* bVector, unsigned int num_points) +++static inline void volk_32fc_32f_multiply_32fc_neon(lv_32fc_t* cVector, +++ const lv_32fc_t* aVector, +++ const float* bVector, +++ unsigned int num_points) ++ { ++- lv_32fc_t* cPtr = cVector; ++- const lv_32fc_t* aPtr = aVector; ++- const float* bPtr= bVector; ++- unsigned int number = 0; ++- unsigned int quarter_points = num_points / 4; ++- ++- float32x4x2_t inputVector, outputVector; ++- float32x4_t tapsVector; ++- for(number = 0; number < quarter_points; number++){ ++- inputVector = vld2q_f32((float*)aPtr); ++- tapsVector = vld1q_f32(bPtr); ++- ++- outputVector.val[0] = vmulq_f32(inputVector.val[0], tapsVector); ++- outputVector.val[1] = vmulq_f32(inputVector.val[1], tapsVector); ++- ++- vst2q_f32((float*)cPtr, outputVector); ++- aPtr += 4; ++- bPtr += 4; ++- cPtr += 4; ++- } ++- ++- for(number = quarter_points * 4; number < num_points; number++){ ++- *cPtr++ = (*aPtr++) * (*bPtr++); ++- } +++ lv_32fc_t* cPtr = cVector; +++ const lv_32fc_t* aPtr = aVector; +++ const float* bPtr = bVector; +++ unsigned int number = 0; +++ unsigned int quarter_points = num_points / 4; +++ +++ float32x4x2_t inputVector, outputVector; +++ float32x4_t tapsVector; +++ for (number = 0; number < quarter_points; number++) { +++ inputVector = vld2q_f32((float*)aPtr); +++ tapsVector = vld1q_f32(bPtr); +++ +++ outputVector.val[0] = vmulq_f32(inputVector.val[0], tapsVector); +++ outputVector.val[1] = vmulq_f32(inputVector.val[1], tapsVector); +++ +++ vst2q_f32((float*)cPtr, outputVector); +++ aPtr += 4; +++ bPtr += 4; +++ cPtr += 4; +++ } +++ +++ for (number = quarter_points * 4; number < num_points; number++) { +++ *cPtr++ = (*aPtr++) * (*bPtr++); +++ } ++ } ++ #endif /* LV_HAVE_NEON */ ++ ++ ++ #ifdef LV_HAVE_ORC ++ ++-extern void ++-volk_32fc_32f_multiply_32fc_a_orc_impl(lv_32fc_t* cVector, const lv_32fc_t* aVector, ++- const float* bVector, unsigned int num_points); +++extern void volk_32fc_32f_multiply_32fc_a_orc_impl(lv_32fc_t* cVector, +++ const lv_32fc_t* aVector, +++ const float* bVector, +++ unsigned int num_points); ++ ++-static inline void ++-volk_32fc_32f_multiply_32fc_u_orc(lv_32fc_t* cVector, const lv_32fc_t* aVector, ++- const float* bVector, unsigned int num_points) +++static inline void volk_32fc_32f_multiply_32fc_u_orc(lv_32fc_t* cVector, +++ const lv_32fc_t* aVector, +++ 
const float* bVector, +++ unsigned int num_points) ++ { ++- volk_32fc_32f_multiply_32fc_a_orc_impl(cVector, aVector, bVector, num_points); +++ volk_32fc_32f_multiply_32fc_a_orc_impl(cVector, aVector, bVector, num_points); ++ } ++ ++ #endif /* LV_HAVE_GENERIC */ ++diff --git a/kernels/volk/volk_32fc_conjugate_32fc.h b/kernels/volk/volk_32fc_conjugate_32fc.h ++index 6994d0e..9195e3a 100644 ++--- a/kernels/volk/volk_32fc_conjugate_32fc.h +++++ b/kernels/volk/volk_32fc_conjugate_32fc.h ++@@ -29,8 +29,8 @@ ++ * ++ * Dispatcher Prototype ++ * \code ++- * void volk_32fc_conjugate_32fc(lv_32fc_t* cVector, const lv_32fc_t* aVector, unsigned int num_points) ++- * \endcode +++ * void volk_32fc_conjugate_32fc(lv_32fc_t* cVector, const lv_32fc_t* aVector, unsigned +++ * int num_points) \endcode ++ * ++ * \b Inputs ++ * \li aVector: The input vector of complex floats. ++@@ -68,91 +68,94 @@ ++ #ifndef INCLUDED_volk_32fc_conjugate_32fc_u_H ++ #define INCLUDED_volk_32fc_conjugate_32fc_u_H ++ +++#include ++ #include ++ #include ++ #include ++-#include ++ ++ #ifdef LV_HAVE_AVX ++ #include ++ ++-static inline void ++-volk_32fc_conjugate_32fc_u_avx(lv_32fc_t* cVector, const lv_32fc_t* aVector, unsigned int num_points) +++static inline void volk_32fc_conjugate_32fc_u_avx(lv_32fc_t* cVector, +++ const lv_32fc_t* aVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int quarterPoints = num_points / 4; +++ unsigned int number = 0; +++ const unsigned int quarterPoints = num_points / 4; ++ ++- __m256 x; ++- lv_32fc_t* c = cVector; ++- const lv_32fc_t* a = aVector; +++ __m256 x; +++ lv_32fc_t* c = cVector; +++ const lv_32fc_t* a = aVector; ++ ++- __m256 conjugator = _mm256_setr_ps(0, -0.f, 0, -0.f, 0, -0.f, 0, -0.f); +++ __m256 conjugator = _mm256_setr_ps(0, -0.f, 0, -0.f, 0, -0.f, 0, -0.f); ++ ++- for(;number < quarterPoints; number++){ +++ for (; number < quarterPoints; number++) { ++ ++- x = _mm256_loadu_ps((float*)a); // Load the complex data as ar,ai,br,bi +++ x = _mm256_loadu_ps((float*)a); // Load the complex data as ar,ai,br,bi ++ ++- x = _mm256_xor_ps(x, conjugator); // conjugate register +++ x = _mm256_xor_ps(x, conjugator); // conjugate register ++ ++- _mm256_storeu_ps((float*)c,x); // Store the results back into the C container +++ _mm256_storeu_ps((float*)c, x); // Store the results back into the C container ++ ++- a += 4; ++- c += 4; ++- } +++ a += 4; +++ c += 4; +++ } ++ ++- number = quarterPoints * 4; +++ number = quarterPoints * 4; ++ ++- for(;number < num_points; number++) { ++- *c++ = lv_conj(*a++); ++- } +++ for (; number < num_points; number++) { +++ *c++ = lv_conj(*a++); +++ } ++ } ++ #endif /* LV_HAVE_AVX */ ++ ++ #ifdef LV_HAVE_SSE3 ++ #include ++ ++-static inline void ++-volk_32fc_conjugate_32fc_u_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, unsigned int num_points) +++static inline void volk_32fc_conjugate_32fc_u_sse3(lv_32fc_t* cVector, +++ const lv_32fc_t* aVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int halfPoints = num_points / 2; +++ unsigned int number = 0; +++ const unsigned int halfPoints = num_points / 2; ++ ++- __m128 x; ++- lv_32fc_t* c = cVector; ++- const lv_32fc_t* a = aVector; +++ __m128 x; +++ lv_32fc_t* c = cVector; +++ const lv_32fc_t* a = aVector; ++ ++- __m128 conjugator = _mm_setr_ps(0, -0.f, 0, -0.f); +++ __m128 conjugator = _mm_setr_ps(0, -0.f, 0, -0.f); ++ ++- for(;number < halfPoints; number++){ +++ for (; number < halfPoints; number++) { ++ ++- x = _mm_loadu_ps((float*)a); // 
Load the complex data as ar,ai,br,bi +++ x = _mm_loadu_ps((float*)a); // Load the complex data as ar,ai,br,bi ++ ++- x = _mm_xor_ps(x, conjugator); // conjugate register +++ x = _mm_xor_ps(x, conjugator); // conjugate register ++ ++- _mm_storeu_ps((float*)c,x); // Store the results back into the C container +++ _mm_storeu_ps((float*)c, x); // Store the results back into the C container ++ ++- a += 2; ++- c += 2; ++- } +++ a += 2; +++ c += 2; +++ } ++ ++- if((num_points % 2) != 0) { ++- *c = lv_conj(*a); ++- } +++ if ((num_points % 2) != 0) { +++ *c = lv_conj(*a); +++ } ++ } ++ #endif /* LV_HAVE_SSE3 */ ++ ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void ++-volk_32fc_conjugate_32fc_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, unsigned int num_points) +++static inline void volk_32fc_conjugate_32fc_generic(lv_32fc_t* cVector, +++ const lv_32fc_t* aVector, +++ unsigned int num_points) ++ { ++- lv_32fc_t* cPtr = cVector; ++- const lv_32fc_t* aPtr = aVector; ++- unsigned int number = 0; +++ lv_32fc_t* cPtr = cVector; +++ const lv_32fc_t* aPtr = aVector; +++ unsigned int number = 0; ++ ++- for(number = 0; number < num_points; number++){ ++- *cPtr++ = lv_conj(*aPtr++); ++- } +++ for (number = 0; number < num_points; number++) { +++ *cPtr++ = lv_conj(*aPtr++); +++ } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++@@ -161,124 +164,128 @@ volk_32fc_conjugate_32fc_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, u ++ #ifndef INCLUDED_volk_32fc_conjugate_32fc_a_H ++ #define INCLUDED_volk_32fc_conjugate_32fc_a_H ++ +++#include ++ #include ++ #include ++ #include ++-#include ++ ++ #ifdef LV_HAVE_AVX ++ #include ++ ++-static inline void ++-volk_32fc_conjugate_32fc_a_avx(lv_32fc_t* cVector, const lv_32fc_t* aVector, unsigned int num_points) +++static inline void volk_32fc_conjugate_32fc_a_avx(lv_32fc_t* cVector, +++ const lv_32fc_t* aVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int quarterPoints = num_points / 4; +++ unsigned int number = 0; +++ const unsigned int quarterPoints = num_points / 4; ++ ++- __m256 x; ++- lv_32fc_t* c = cVector; ++- const lv_32fc_t* a = aVector; +++ __m256 x; +++ lv_32fc_t* c = cVector; +++ const lv_32fc_t* a = aVector; ++ ++- __m256 conjugator = _mm256_setr_ps(0, -0.f, 0, -0.f, 0, -0.f, 0, -0.f); +++ __m256 conjugator = _mm256_setr_ps(0, -0.f, 0, -0.f, 0, -0.f, 0, -0.f); ++ ++- for(;number < quarterPoints; number++){ +++ for (; number < quarterPoints; number++) { ++ ++- x = _mm256_load_ps((float*)a); // Load the complex data as ar,ai,br,bi +++ x = _mm256_load_ps((float*)a); // Load the complex data as ar,ai,br,bi ++ ++- x = _mm256_xor_ps(x, conjugator); // conjugate register +++ x = _mm256_xor_ps(x, conjugator); // conjugate register ++ ++- _mm256_store_ps((float*)c,x); // Store the results back into the C container +++ _mm256_store_ps((float*)c, x); // Store the results back into the C container ++ ++- a += 4; ++- c += 4; ++- } +++ a += 4; +++ c += 4; +++ } ++ ++- number = quarterPoints * 4; +++ number = quarterPoints * 4; ++ ++- for(;number < num_points; number++) { ++- *c++ = lv_conj(*a++); ++- } +++ for (; number < num_points; number++) { +++ *c++ = lv_conj(*a++); +++ } ++ } ++ #endif /* LV_HAVE_AVX */ ++ ++ #ifdef LV_HAVE_SSE3 ++ #include ++ ++-static inline void ++-volk_32fc_conjugate_32fc_a_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, unsigned int num_points) +++static inline void volk_32fc_conjugate_32fc_a_sse3(lv_32fc_t* cVector, +++ const lv_32fc_t* aVector, +++ unsigned int num_points) ++ { ++- unsigned int 
number = 0; ++- const unsigned int halfPoints = num_points / 2; +++ unsigned int number = 0; +++ const unsigned int halfPoints = num_points / 2; ++ ++- __m128 x; ++- lv_32fc_t* c = cVector; ++- const lv_32fc_t* a = aVector; +++ __m128 x; +++ lv_32fc_t* c = cVector; +++ const lv_32fc_t* a = aVector; ++ ++- __m128 conjugator = _mm_setr_ps(0, -0.f, 0, -0.f); +++ __m128 conjugator = _mm_setr_ps(0, -0.f, 0, -0.f); ++ ++- for(;number < halfPoints; number++){ +++ for (; number < halfPoints; number++) { ++ ++- x = _mm_load_ps((float*)a); // Load the complex data as ar,ai,br,bi +++ x = _mm_load_ps((float*)a); // Load the complex data as ar,ai,br,bi ++ ++- x = _mm_xor_ps(x, conjugator); // conjugate register +++ x = _mm_xor_ps(x, conjugator); // conjugate register ++ ++- _mm_store_ps((float*)c,x); // Store the results back into the C container +++ _mm_store_ps((float*)c, x); // Store the results back into the C container ++ ++- a += 2; ++- c += 2; ++- } +++ a += 2; +++ c += 2; +++ } ++ ++- if((num_points % 2) != 0) { ++- *c = lv_conj(*a); ++- } +++ if ((num_points % 2) != 0) { +++ *c = lv_conj(*a); +++ } ++ } ++ #endif /* LV_HAVE_SSE3 */ ++ ++ #ifdef LV_HAVE_NEON ++ #include ++ ++-static inline void ++-volk_32fc_conjugate_32fc_a_neon(lv_32fc_t* cVector, const lv_32fc_t* aVector, unsigned int num_points) +++static inline void volk_32fc_conjugate_32fc_a_neon(lv_32fc_t* cVector, +++ const lv_32fc_t* aVector, +++ unsigned int num_points) ++ { ++- unsigned int number; ++- const unsigned int quarterPoints = num_points / 4; +++ unsigned int number; +++ const unsigned int quarterPoints = num_points / 4; ++ ++- float32x4x2_t x; ++- lv_32fc_t* c = cVector; ++- const lv_32fc_t* a = aVector; +++ float32x4x2_t x; +++ lv_32fc_t* c = cVector; +++ const lv_32fc_t* a = aVector; ++ ++- for(number=0; number < quarterPoints; number++){ ++- __VOLK_PREFETCH(a+4); ++- x = vld2q_f32((float*)a); // Load the complex data as ar,br,cr,dr; ai,bi,ci,di +++ for (number = 0; number < quarterPoints; number++) { +++ __VOLK_PREFETCH(a + 4); +++ x = vld2q_f32((float*)a); // Load the complex data as ar,br,cr,dr; ai,bi,ci,di ++ ++- // xor the imaginary lane ++- x.val[1] = vnegq_f32( x.val[1]); +++ // xor the imaginary lane +++ x.val[1] = vnegq_f32(x.val[1]); ++ ++- vst2q_f32((float*)c,x); // Store the results back into the C container +++ vst2q_f32((float*)c, x); // Store the results back into the C container ++ ++- a += 4; ++- c += 4; ++- } +++ a += 4; +++ c += 4; +++ } ++ ++- for(number=quarterPoints*4; number < num_points; number++){ ++- *c++ = lv_conj(*a++); ++- } +++ for (number = quarterPoints * 4; number < num_points; number++) { +++ *c++ = lv_conj(*a++); +++ } ++ } ++ #endif /* LV_HAVE_NEON */ ++ ++ ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void ++-volk_32fc_conjugate_32fc_a_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, unsigned int num_points) +++static inline void volk_32fc_conjugate_32fc_a_generic(lv_32fc_t* cVector, +++ const lv_32fc_t* aVector, +++ unsigned int num_points) ++ { ++- lv_32fc_t* cPtr = cVector; ++- const lv_32fc_t* aPtr = aVector; ++- unsigned int number = 0; +++ lv_32fc_t* cPtr = cVector; +++ const lv_32fc_t* aPtr = aVector; +++ unsigned int number = 0; ++ ++- for(number = 0; number < num_points; number++){ ++- *cPtr++ = lv_conj(*aPtr++); ++- } +++ for (number = 0; number < num_points; number++) { +++ *cPtr++ = lv_conj(*aPtr++); +++ } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++diff --git a/kernels/volk/volk_32fc_convert_16ic.h b/kernels/volk/volk_32fc_convert_16ic.h ++index 0ba2383..5788158 100644 
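Note: the volk_32fc_conjugate_32fc variants reformatted above all compute c[n] = conj(a[n]). The SSE3/AVX paths do this by XOR-ing the sign bit of every imaginary lane (the -0.f entries of the conjugator mask), while the NEON path negates the deinterleaved imaginary lane with vnegq_f32. A minimal scalar sketch of the same operation, using the standard C float complex type in place of volk's lv_32fc_t (illustrative only, not the kernel itself):

#include <complex.h>
#include <stdio.h>

/* Reference behaviour of the conjugate kernels: flip the sign of the
 * imaginary part of every element, leave the real part untouched. */
static void conjugate_ref(float complex* c, const float complex* a, unsigned int num_points)
{
    for (unsigned int n = 0; n < num_points; n++) {
        c[n] = conjf(a[n]);
    }
}

int main(void)
{
    const float complex in[2] = { 1.0f + 2.0f * I, -3.0f - 4.0f * I };
    float complex out[2];
    conjugate_ref(out, in, 2);
    printf("%g%+gi  %g%+gi\n", crealf(out[0]), cimagf(out[0]), crealf(out[1]), cimagf(out[1]));
    /* prints: 1-2i  -3+4i */
    return 0;
}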
++--- a/kernels/volk/volk_32fc_convert_16ic.h +++++ b/kernels/volk/volk_32fc_convert_16ic.h ++@@ -31,8 +31,8 @@ ++ * ++ * Dispatcher Prototype ++ * \code ++- * void volk_32fc_convert_16ic(lv_16sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points); ++- * \endcode +++ * void volk_32fc_convert_16ic(lv_16sc_t* outputVector, const lv_32fc_t* inputVector, +++ * unsigned int num_points); \endcode ++ * ++ * \b Inputs ++ * \li inputVector: The complex 32-bit float input data buffer. ++@@ -46,14 +46,16 @@ ++ #ifndef INCLUDED_volk_32fc_convert_16ic_a_H ++ #define INCLUDED_volk_32fc_convert_16ic_a_H ++ +++#include "volk/volk_complex.h" ++ #include ++ #include ++-#include "volk/volk_complex.h" ++ ++ #ifdef LV_HAVE_AVX2 ++ #include ++ ++-static inline void volk_32fc_convert_16ic_a_avx2(lv_16sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points) +++static inline void volk_32fc_convert_16ic_a_avx2(lv_16sc_t* outputVector, +++ const lv_32fc_t* inputVector, +++ unsigned int num_points) ++ { ++ const unsigned int avx_iters = num_points / 8; ++ ++@@ -71,44 +73,44 @@ static inline void volk_32fc_convert_16ic_a_avx2(lv_16sc_t* outputVector, const ++ const __m256 vmax_val = _mm256_set1_ps(max_val); ++ unsigned int i; ++ ++- for(i = 0; i < avx_iters; i++) ++- { ++- inputVal1 = _mm256_load_ps((float*)inputVectorPtr); ++- inputVectorPtr += 8; ++- inputVal2 = _mm256_load_ps((float*)inputVectorPtr); ++- inputVectorPtr += 8; ++- __VOLK_PREFETCH(inputVectorPtr + 16); ++- ++- // Clip ++- ret1 = _mm256_max_ps(_mm256_min_ps(inputVal1, vmax_val), vmin_val); ++- ret2 = _mm256_max_ps(_mm256_min_ps(inputVal2, vmax_val), vmin_val); ++- ++- intInputVal1 = _mm256_cvtps_epi32(ret1); ++- intInputVal2 = _mm256_cvtps_epi32(ret2); ++- ++- intInputVal1 = _mm256_packs_epi32(intInputVal1, intInputVal2); ++- intInputVal1 = _mm256_permute4x64_epi64(intInputVal1, 0xd8); ++- ++- _mm256_store_si256((__m256i*)outputVectorPtr, intInputVal1); ++- outputVectorPtr += 16; ++- } ++- ++- for(i = avx_iters * 16; i < num_points * 2; i++) ++- { ++- aux = *inputVectorPtr++; ++- if(aux > max_val) ++- aux = max_val; ++- else if(aux < min_val) ++- aux = min_val; ++- *outputVectorPtr++ = (int16_t)rintf(aux); ++- } +++ for (i = 0; i < avx_iters; i++) { +++ inputVal1 = _mm256_load_ps((float*)inputVectorPtr); +++ inputVectorPtr += 8; +++ inputVal2 = _mm256_load_ps((float*)inputVectorPtr); +++ inputVectorPtr += 8; +++ __VOLK_PREFETCH(inputVectorPtr + 16); +++ +++ // Clip +++ ret1 = _mm256_max_ps(_mm256_min_ps(inputVal1, vmax_val), vmin_val); +++ ret2 = _mm256_max_ps(_mm256_min_ps(inputVal2, vmax_val), vmin_val); +++ +++ intInputVal1 = _mm256_cvtps_epi32(ret1); +++ intInputVal2 = _mm256_cvtps_epi32(ret2); +++ +++ intInputVal1 = _mm256_packs_epi32(intInputVal1, intInputVal2); +++ intInputVal1 = _mm256_permute4x64_epi64(intInputVal1, 0xd8); +++ +++ _mm256_store_si256((__m256i*)outputVectorPtr, intInputVal1); +++ outputVectorPtr += 16; +++ } +++ +++ for (i = avx_iters * 16; i < num_points * 2; i++) { +++ aux = *inputVectorPtr++; +++ if (aux > max_val) +++ aux = max_val; +++ else if (aux < min_val) +++ aux = min_val; +++ *outputVectorPtr++ = (int16_t)rintf(aux); +++ } ++ } ++ #endif /* LV_HAVE_AVX2 */ ++ ++ #ifdef LV_HAVE_SSE2 ++ #include ++ ++-static inline void volk_32fc_convert_16ic_a_sse2(lv_16sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points) +++static inline void volk_32fc_convert_16ic_a_sse2(lv_16sc_t* outputVector, +++ const lv_32fc_t* inputVector, +++ unsigned int num_points) ++ { ++ const 
unsigned int sse_iters = num_points / 4; ++ ++@@ -126,34 +128,34 @@ static inline void volk_32fc_convert_16ic_a_sse2(lv_16sc_t* outputVector, const ++ const __m128 vmax_val = _mm_set_ps1(max_val); ++ unsigned int i; ++ ++- for(i = 0; i < sse_iters; i++) ++- { ++- inputVal1 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4; ++- inputVal2 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4; ++- __VOLK_PREFETCH(inputVectorPtr + 8); ++- ++- // Clip ++- ret1 = _mm_max_ps(_mm_min_ps(inputVal1, vmax_val), vmin_val); ++- ret2 = _mm_max_ps(_mm_min_ps(inputVal2, vmax_val), vmin_val); ++- ++- intInputVal1 = _mm_cvtps_epi32(ret1); ++- intInputVal2 = _mm_cvtps_epi32(ret2); ++- ++- intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); ++- ++- _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1); ++- outputVectorPtr += 8; ++- } ++- ++- for(i = sse_iters * 8; i < num_points * 2; i++) ++- { ++- aux = *inputVectorPtr++; ++- if(aux > max_val) ++- aux = max_val; ++- else if(aux < min_val) ++- aux = min_val; ++- *outputVectorPtr++ = (int16_t)rintf(aux); ++- } +++ for (i = 0; i < sse_iters; i++) { +++ inputVal1 = _mm_load_ps((float*)inputVectorPtr); +++ inputVectorPtr += 4; +++ inputVal2 = _mm_load_ps((float*)inputVectorPtr); +++ inputVectorPtr += 4; +++ __VOLK_PREFETCH(inputVectorPtr + 8); +++ +++ // Clip +++ ret1 = _mm_max_ps(_mm_min_ps(inputVal1, vmax_val), vmin_val); +++ ret2 = _mm_max_ps(_mm_min_ps(inputVal2, vmax_val), vmin_val); +++ +++ intInputVal1 = _mm_cvtps_epi32(ret1); +++ intInputVal2 = _mm_cvtps_epi32(ret2); +++ +++ intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); +++ +++ _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1); +++ outputVectorPtr += 8; +++ } +++ +++ for (i = sse_iters * 8; i < num_points * 2; i++) { +++ aux = *inputVectorPtr++; +++ if (aux > max_val) +++ aux = max_val; +++ else if (aux < min_val) +++ aux = min_val; +++ *outputVectorPtr++ = (int16_t)rintf(aux); +++ } ++ } ++ #endif /* LV_HAVE_SSE2 */ ++ ++@@ -161,13 +163,24 @@ static inline void volk_32fc_convert_16ic_a_sse2(lv_16sc_t* outputVector, const ++ #if LV_HAVE_NEONV7 ++ #include ++ ++-#define VCVTRQ_S32_F32(res,val) \ ++- __VOLK_ASM ("VCVTR.S32.F32 %[r0], %[v0]\n\t" : [r0]"=w"(res[0]) : [v0]"w"(val[0]) : ); \ ++- __VOLK_ASM ("VCVTR.S32.F32 %[r1], %[v1]\n\t" : [r1]"=w"(res[1]) : [v1]"w"(val[1]) : ); \ ++- __VOLK_ASM ("VCVTR.S32.F32 %[r2], %[v2]\n\t" : [r2]"=w"(res[2]) : [v2]"w"(val[2]) : ); \ ++- __VOLK_ASM ("VCVTR.S32.F32 %[r3], %[v3]\n\t" : [r3]"=w"(res[3]) : [v3]"w"(val[3]) : ); ++- ++-static inline void volk_32fc_convert_16ic_neon(lv_16sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points) +++#define VCVTRQ_S32_F32(res, val) \ +++ __VOLK_ASM("VCVTR.S32.F32 %[r0], %[v0]\n\t" \ +++ : [r0] "=w"(res[0]) \ +++ : [v0] "w"(val[0]) \ +++ :); \ +++ __VOLK_ASM("VCVTR.S32.F32 %[r1], %[v1]\n\t" \ +++ : [r1] "=w"(res[1]) \ +++ : [v1] "w"(val[1]) \ +++ :); \ +++ __VOLK_ASM("VCVTR.S32.F32 %[r2], %[v2]\n\t" \ +++ : [r2] "=w"(res[2]) \ +++ : [v2] "w"(val[2]) \ +++ :); \ +++ __VOLK_ASM("VCVTR.S32.F32 %[r3], %[v3]\n\t" : [r3] "=w"(res[3]) : [v3] "w"(val[3]) :); +++ +++static inline void volk_32fc_convert_16ic_neon(lv_16sc_t* outputVector, +++ const lv_32fc_t* inputVector, +++ unsigned int num_points) ++ { ++ ++ const unsigned int neon_iters = num_points / 4; ++@@ -184,43 +197,41 @@ static inline void volk_32fc_convert_16ic_neon(lv_16sc_t* outputVector, const lv ++ const float32x4_t max_val = vmovq_n_f32(max_val_f); ++ float32x4_t ret1, ret2, a, b; ++ ++- int32x4_t 
toint_a={0,0,0,0}; ++- int32x4_t toint_b={0,0,0,0}; +++ int32x4_t toint_a = { 0, 0, 0, 0 }; +++ int32x4_t toint_b = { 0, 0, 0, 0 }; ++ int16x4_t intInputVal1, intInputVal2; ++ int16x8_t res; ++ ++- for(i = 0; i < neon_iters; i++) ++- { ++- a = vld1q_f32((const float32_t*)(inputVectorPtr)); ++- inputVectorPtr += 4; ++- b = vld1q_f32((const float32_t*)(inputVectorPtr)); ++- inputVectorPtr += 4; ++- __VOLK_PREFETCH(inputVectorPtr + 8); ++- ++- ret1 = vmaxq_f32(vminq_f32(a, max_val), min_val); ++- ret2 = vmaxq_f32(vminq_f32(b, max_val), min_val); ++- ++- // vcvtr takes into account the current rounding mode (as does rintf) ++- VCVTRQ_S32_F32(toint_a, ret1); ++- VCVTRQ_S32_F32(toint_b, ret2); ++- ++- intInputVal1 = vqmovn_s32(toint_a); ++- intInputVal2 = vqmovn_s32(toint_b); ++- ++- res = vcombine_s16(intInputVal1, intInputVal2); ++- vst1q_s16((int16_t*)outputVectorPtr, res); ++- outputVectorPtr += 8; ++- } ++- ++- for(i = neon_iters * 8; i < num_points * 2; i++) ++- { ++- aux = *inputVectorPtr++; ++- if(aux > max_val_f) ++- aux = max_val_f; ++- else if(aux < min_val_f) ++- aux = min_val_f; ++- *outputVectorPtr++ = (int16_t)rintf(aux); ++- } +++ for (i = 0; i < neon_iters; i++) { +++ a = vld1q_f32((const float32_t*)(inputVectorPtr)); +++ inputVectorPtr += 4; +++ b = vld1q_f32((const float32_t*)(inputVectorPtr)); +++ inputVectorPtr += 4; +++ __VOLK_PREFETCH(inputVectorPtr + 8); +++ +++ ret1 = vmaxq_f32(vminq_f32(a, max_val), min_val); +++ ret2 = vmaxq_f32(vminq_f32(b, max_val), min_val); +++ +++ // vcvtr takes into account the current rounding mode (as does rintf) +++ VCVTRQ_S32_F32(toint_a, ret1); +++ VCVTRQ_S32_F32(toint_b, ret2); +++ +++ intInputVal1 = vqmovn_s32(toint_a); +++ intInputVal2 = vqmovn_s32(toint_b); +++ +++ res = vcombine_s16(intInputVal1, intInputVal2); +++ vst1q_s16((int16_t*)outputVectorPtr, res); +++ outputVectorPtr += 8; +++ } +++ +++ for (i = neon_iters * 8; i < num_points * 2; i++) { +++ aux = *inputVectorPtr++; +++ if (aux > max_val_f) +++ aux = max_val_f; +++ else if (aux < min_val_f) +++ aux = min_val_f; +++ *outputVectorPtr++ = (int16_t)rintf(aux); +++ } ++ } ++ ++ #undef VCVTRQ_S32_F32 ++@@ -229,7 +240,9 @@ static inline void volk_32fc_convert_16ic_neon(lv_16sc_t* outputVector, const lv ++ #if LV_HAVE_NEONV8 ++ #include ++ ++-static inline void volk_32fc_convert_16ic_neonv8(lv_16sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points) +++static inline void volk_32fc_convert_16ic_neonv8(lv_16sc_t* outputVector, +++ const lv_32fc_t* inputVector, +++ unsigned int num_points) ++ { ++ const unsigned int neon_iters = num_points / 4; ++ ++@@ -245,50 +258,49 @@ static inline void volk_32fc_convert_16ic_neonv8(lv_16sc_t* outputVector, const ++ const float32x4_t max_val = vmovq_n_f32(max_val_f); ++ float32x4_t ret1, ret2, a, b; ++ ++- int32x4_t toint_a={0,0,0,0}, toint_b={0,0,0,0}; +++ int32x4_t toint_a = { 0, 0, 0, 0 }, toint_b = { 0, 0, 0, 0 }; ++ int16x4_t intInputVal1, intInputVal2; ++ int16x8_t res; ++ ++- for(i = 0; i < neon_iters; i++) ++- { ++- a = vld1q_f32((const float32_t*)(inputVectorPtr)); ++- inputVectorPtr += 4; ++- b = vld1q_f32((const float32_t*)(inputVectorPtr)); ++- inputVectorPtr += 4; ++- __VOLK_PREFETCH(inputVectorPtr + 8); ++- ++- ret1 = vmaxq_f32(vminq_f32(a, max_val), min_val); ++- ret2 = vmaxq_f32(vminq_f32(b, max_val), min_val); ++- ++- // vrndiq takes into account the current rounding mode (as does rintf) ++- toint_a = vcvtq_s32_f32(vrndiq_f32(ret1)); ++- toint_b = vcvtq_s32_f32(vrndiq_f32(ret2)); ++- ++- intInputVal1 = 
vqmovn_s32(toint_a); ++- intInputVal2 = vqmovn_s32(toint_b); ++- ++- res = vcombine_s16(intInputVal1, intInputVal2); ++- vst1q_s16((int16_t*)outputVectorPtr, res); ++- outputVectorPtr += 8; ++- } ++- ++- for(i = neon_iters * 8; i < num_points * 2; i++) ++- { ++- aux = *inputVectorPtr++; ++- if(aux > max_val_f) ++- aux = max_val_f; ++- else if(aux < min_val_f) ++- aux = min_val_f; ++- *outputVectorPtr++ = (int16_t)rintf(aux); ++- } +++ for (i = 0; i < neon_iters; i++) { +++ a = vld1q_f32((const float32_t*)(inputVectorPtr)); +++ inputVectorPtr += 4; +++ b = vld1q_f32((const float32_t*)(inputVectorPtr)); +++ inputVectorPtr += 4; +++ __VOLK_PREFETCH(inputVectorPtr + 8); +++ +++ ret1 = vmaxq_f32(vminq_f32(a, max_val), min_val); +++ ret2 = vmaxq_f32(vminq_f32(b, max_val), min_val); +++ +++ // vrndiq takes into account the current rounding mode (as does rintf) +++ toint_a = vcvtq_s32_f32(vrndiq_f32(ret1)); +++ toint_b = vcvtq_s32_f32(vrndiq_f32(ret2)); +++ +++ intInputVal1 = vqmovn_s32(toint_a); +++ intInputVal2 = vqmovn_s32(toint_b); +++ +++ res = vcombine_s16(intInputVal1, intInputVal2); +++ vst1q_s16((int16_t*)outputVectorPtr, res); +++ outputVectorPtr += 8; +++ } +++ +++ for (i = neon_iters * 8; i < num_points * 2; i++) { +++ aux = *inputVectorPtr++; +++ if (aux > max_val_f) +++ aux = max_val_f; +++ else if (aux < min_val_f) +++ aux = min_val_f; +++ *outputVectorPtr++ = (int16_t)rintf(aux); +++ } ++ } ++ #endif /* LV_HAVE_NEONV8 */ ++ ++ ++- ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void volk_32fc_convert_16ic_generic(lv_16sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points) +++static inline void volk_32fc_convert_16ic_generic(lv_16sc_t* outputVector, +++ const lv_32fc_t* inputVector, +++ unsigned int num_points) ++ { ++ float* inputVectorPtr = (float*)inputVector; ++ int16_t* outputVectorPtr = (int16_t*)outputVector; ++@@ -296,15 +308,14 @@ static inline void volk_32fc_convert_16ic_generic(lv_16sc_t* outputVector, const ++ const float max_val = (float)SHRT_MAX; ++ float aux; ++ unsigned int i; ++- for(i = 0; i < num_points * 2; i++) ++- { ++- aux = *inputVectorPtr++; ++- if(aux > max_val) ++- aux = max_val; ++- else if(aux < min_val) ++- aux = min_val; ++- *outputVectorPtr++ = (int16_t)rintf(aux); ++- } +++ for (i = 0; i < num_points * 2; i++) { +++ aux = *inputVectorPtr++; +++ if (aux > max_val) +++ aux = max_val; +++ else if (aux < min_val) +++ aux = min_val; +++ *outputVectorPtr++ = (int16_t)rintf(aux); +++ } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++@@ -313,15 +324,17 @@ static inline void volk_32fc_convert_16ic_generic(lv_16sc_t* outputVector, const ++ #ifndef INCLUDED_volk_32fc_convert_16ic_u_H ++ #define INCLUDED_volk_32fc_convert_16ic_u_H ++ +++#include "volk/volk_complex.h" ++ #include ++ #include ++-#include "volk/volk_complex.h" ++ ++ ++ #ifdef LV_HAVE_AVX2 ++ #include ++ ++-static inline void volk_32fc_convert_16ic_u_avx2(lv_16sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points) +++static inline void volk_32fc_convert_16ic_u_avx2(lv_16sc_t* outputVector, +++ const lv_32fc_t* inputVector, +++ unsigned int num_points) ++ { ++ const unsigned int avx_iters = num_points / 8; ++ ++@@ -339,37 +352,35 @@ static inline void volk_32fc_convert_16ic_u_avx2(lv_16sc_t* outputVector, const ++ const __m256 vmax_val = _mm256_set1_ps(max_val); ++ unsigned int i; ++ ++- for(i = 0; i < avx_iters; i++) ++- { ++- inputVal1 = _mm256_loadu_ps((float*)inputVectorPtr); ++- inputVectorPtr += 8; ++- inputVal2 = _mm256_loadu_ps((float*)inputVectorPtr); ++- 
inputVectorPtr += 8; ++- __VOLK_PREFETCH(inputVectorPtr + 16); ++- ++- // Clip ++- ret1 = _mm256_max_ps(_mm256_min_ps(inputVal1, vmax_val), vmin_val); ++- ret2 = _mm256_max_ps(_mm256_min_ps(inputVal2, vmax_val), vmin_val); ++- ++- intInputVal1 = _mm256_cvtps_epi32(ret1); ++- intInputVal2 = _mm256_cvtps_epi32(ret2); ++- ++- intInputVal1 = _mm256_packs_epi32(intInputVal1, intInputVal2); ++- intInputVal1 = _mm256_permute4x64_epi64(intInputVal1, 0xd8); ++- ++- _mm256_storeu_si256((__m256i*)outputVectorPtr, intInputVal1); ++- outputVectorPtr += 16; ++- } ++- ++- for(i = avx_iters * 16; i < num_points * 2; i++) ++- { ++- aux = *inputVectorPtr++; ++- if(aux > max_val) ++- aux = max_val; ++- else if(aux < min_val) ++- aux = min_val; ++- *outputVectorPtr++ = (int16_t)rintf(aux); ++- } +++ for (i = 0; i < avx_iters; i++) { +++ inputVal1 = _mm256_loadu_ps((float*)inputVectorPtr); +++ inputVectorPtr += 8; +++ inputVal2 = _mm256_loadu_ps((float*)inputVectorPtr); +++ inputVectorPtr += 8; +++ __VOLK_PREFETCH(inputVectorPtr + 16); +++ +++ // Clip +++ ret1 = _mm256_max_ps(_mm256_min_ps(inputVal1, vmax_val), vmin_val); +++ ret2 = _mm256_max_ps(_mm256_min_ps(inputVal2, vmax_val), vmin_val); +++ +++ intInputVal1 = _mm256_cvtps_epi32(ret1); +++ intInputVal2 = _mm256_cvtps_epi32(ret2); +++ +++ intInputVal1 = _mm256_packs_epi32(intInputVal1, intInputVal2); +++ intInputVal1 = _mm256_permute4x64_epi64(intInputVal1, 0xd8); +++ +++ _mm256_storeu_si256((__m256i*)outputVectorPtr, intInputVal1); +++ outputVectorPtr += 16; +++ } +++ +++ for (i = avx_iters * 16; i < num_points * 2; i++) { +++ aux = *inputVectorPtr++; +++ if (aux > max_val) +++ aux = max_val; +++ else if (aux < min_val) +++ aux = min_val; +++ *outputVectorPtr++ = (int16_t)rintf(aux); +++ } ++ } ++ #endif /* LV_HAVE_AVX2 */ ++ ++@@ -377,7 +388,9 @@ static inline void volk_32fc_convert_16ic_u_avx2(lv_16sc_t* outputVector, const ++ #ifdef LV_HAVE_SSE2 ++ #include ++ ++-static inline void volk_32fc_convert_16ic_u_sse2(lv_16sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points) +++static inline void volk_32fc_convert_16ic_u_sse2(lv_16sc_t* outputVector, +++ const lv_32fc_t* inputVector, +++ unsigned int num_points) ++ { ++ const unsigned int sse_iters = num_points / 4; ++ ++@@ -395,36 +408,34 @@ static inline void volk_32fc_convert_16ic_u_sse2(lv_16sc_t* outputVector, const ++ const __m128 vmax_val = _mm_set_ps1(max_val); ++ ++ unsigned int i; ++- for(i = 0; i < sse_iters; i++) ++- { ++- inputVal1 = _mm_loadu_ps((float*)inputVectorPtr); ++- inputVectorPtr += 4; ++- inputVal2 = _mm_loadu_ps((float*)inputVectorPtr); ++- inputVectorPtr += 4; ++- __VOLK_PREFETCH(inputVectorPtr + 8); ++- ++- // Clip ++- ret1 = _mm_max_ps(_mm_min_ps(inputVal1, vmax_val), vmin_val); ++- ret2 = _mm_max_ps(_mm_min_ps(inputVal2, vmax_val), vmin_val); ++- ++- intInputVal1 = _mm_cvtps_epi32(ret1); ++- intInputVal2 = _mm_cvtps_epi32(ret2); ++- ++- intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); ++- ++- _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1); ++- outputVectorPtr += 8; ++- } ++- ++- for(i = sse_iters * 8; i < num_points * 2; i++) ++- { ++- aux = *inputVectorPtr++; ++- if(aux > max_val) ++- aux = max_val; ++- else if(aux < min_val) ++- aux = min_val; ++- *outputVectorPtr++ = (int16_t)rintf(aux); ++- } +++ for (i = 0; i < sse_iters; i++) { +++ inputVal1 = _mm_loadu_ps((float*)inputVectorPtr); +++ inputVectorPtr += 4; +++ inputVal2 = _mm_loadu_ps((float*)inputVectorPtr); +++ inputVectorPtr += 4; +++ __VOLK_PREFETCH(inputVectorPtr + 8); +++ +++ 
// Clip +++ ret1 = _mm_max_ps(_mm_min_ps(inputVal1, vmax_val), vmin_val); +++ ret2 = _mm_max_ps(_mm_min_ps(inputVal2, vmax_val), vmin_val); +++ +++ intInputVal1 = _mm_cvtps_epi32(ret1); +++ intInputVal2 = _mm_cvtps_epi32(ret2); +++ +++ intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); +++ +++ _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1); +++ outputVectorPtr += 8; +++ } +++ +++ for (i = sse_iters * 8; i < num_points * 2; i++) { +++ aux = *inputVectorPtr++; +++ if (aux > max_val) +++ aux = max_val; +++ else if (aux < min_val) +++ aux = min_val; +++ *outputVectorPtr++ = (int16_t)rintf(aux); +++ } ++ } ++ #endif /* LV_HAVE_SSE2 */ ++ #endif /* INCLUDED_volk_32fc_convert_16ic_u_H */ ++diff --git a/kernels/volk/volk_32fc_deinterleave_32f_x2.h b/kernels/volk/volk_32fc_deinterleave_32f_x2.h ++index 40cd664..1a06c48 100644 ++--- a/kernels/volk/volk_32fc_deinterleave_32f_x2.h +++++ b/kernels/volk/volk_32fc_deinterleave_32f_x2.h ++@@ -30,8 +30,8 @@ ++ * ++ * Dispatcher Prototype ++ * \code ++- * void volk_32fc_deinterleave_32f_x2(float* iBuffer, float* qBuffer, const lv_32fc_t* complexVector, unsigned int num_points) ++- * \endcode +++ * void volk_32fc_deinterleave_32f_x2(float* iBuffer, float* qBuffer, const lv_32fc_t* +++ * complexVector, unsigned int num_points) \endcode ++ * ++ * \b Inputs ++ * \li complexVector: The complex input vector. ++@@ -78,86 +78,88 @@ ++ ++ #ifdef LV_HAVE_AVX ++ #include ++-static inline void ++-volk_32fc_deinterleave_32f_x2_a_avx(float* iBuffer, float* qBuffer, const lv_32fc_t* complexVector, ++- unsigned int num_points) +++static inline void volk_32fc_deinterleave_32f_x2_a_avx(float* iBuffer, +++ float* qBuffer, +++ const lv_32fc_t* complexVector, +++ unsigned int num_points) ++ { ++- const float* complexVectorPtr = (float*)complexVector; ++- float* iBufferPtr = iBuffer; ++- float* qBufferPtr = qBuffer; ++- ++- unsigned int number = 0; ++- // Mask for real and imaginary parts ++- const unsigned int eighthPoints = num_points / 8; ++- __m256 cplxValue1, cplxValue2, complex1, complex2, iValue, qValue; ++- for(;number < eighthPoints; number++){ ++- cplxValue1 = _mm256_load_ps(complexVectorPtr); ++- complexVectorPtr += 8; ++- ++- cplxValue2 = _mm256_load_ps(complexVectorPtr); ++- complexVectorPtr += 8; ++- ++- complex1 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x20); ++- complex2 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x31); ++- ++- // Arrange in i1i2i3i4 format ++- iValue = _mm256_shuffle_ps(complex1, complex2, 0x88); ++- // Arrange in q1q2q3q4 format ++- qValue = _mm256_shuffle_ps(complex1, complex2, 0xdd); ++- ++- _mm256_store_ps(iBufferPtr, iValue); ++- _mm256_store_ps(qBufferPtr, qValue); ++- ++- iBufferPtr += 8; ++- qBufferPtr += 8; ++- } ++- ++- number = eighthPoints * 8; ++- for(; number < num_points; number++){ ++- *iBufferPtr++ = *complexVectorPtr++; ++- *qBufferPtr++ = *complexVectorPtr++; ++- } +++ const float* complexVectorPtr = (float*)complexVector; +++ float* iBufferPtr = iBuffer; +++ float* qBufferPtr = qBuffer; +++ +++ unsigned int number = 0; +++ // Mask for real and imaginary parts +++ const unsigned int eighthPoints = num_points / 8; +++ __m256 cplxValue1, cplxValue2, complex1, complex2, iValue, qValue; +++ for (; number < eighthPoints; number++) { +++ cplxValue1 = _mm256_load_ps(complexVectorPtr); +++ complexVectorPtr += 8; +++ +++ cplxValue2 = _mm256_load_ps(complexVectorPtr); +++ complexVectorPtr += 8; +++ +++ complex1 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x20); +++ complex2 = 
_mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x31); +++ +++ // Arrange in i1i2i3i4 format +++ iValue = _mm256_shuffle_ps(complex1, complex2, 0x88); +++ // Arrange in q1q2q3q4 format +++ qValue = _mm256_shuffle_ps(complex1, complex2, 0xdd); +++ +++ _mm256_store_ps(iBufferPtr, iValue); +++ _mm256_store_ps(qBufferPtr, qValue); +++ +++ iBufferPtr += 8; +++ qBufferPtr += 8; +++ } +++ +++ number = eighthPoints * 8; +++ for (; number < num_points; number++) { +++ *iBufferPtr++ = *complexVectorPtr++; +++ *qBufferPtr++ = *complexVectorPtr++; +++ } ++ } ++ #endif /* LV_HAVE_AVX */ ++ ++ #ifdef LV_HAVE_SSE ++ #include ++ ++-static inline void ++-volk_32fc_deinterleave_32f_x2_a_sse(float* iBuffer, float* qBuffer, const lv_32fc_t* complexVector, ++- unsigned int num_points) +++static inline void volk_32fc_deinterleave_32f_x2_a_sse(float* iBuffer, +++ float* qBuffer, +++ const lv_32fc_t* complexVector, +++ unsigned int num_points) ++ { ++- const float* complexVectorPtr = (float*)complexVector; ++- float* iBufferPtr = iBuffer; ++- float* qBufferPtr = qBuffer; ++- ++- unsigned int number = 0; ++- const unsigned int quarterPoints = num_points / 4; ++- __m128 cplxValue1, cplxValue2, iValue, qValue; ++- for(;number < quarterPoints; number++){ ++- cplxValue1 = _mm_load_ps(complexVectorPtr); ++- complexVectorPtr += 4; ++- ++- cplxValue2 = _mm_load_ps(complexVectorPtr); ++- complexVectorPtr += 4; ++- ++- // Arrange in i1i2i3i4 format ++- iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); ++- // Arrange in q1q2q3q4 format ++- qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1)); ++- ++- _mm_store_ps(iBufferPtr, iValue); ++- _mm_store_ps(qBufferPtr, qValue); ++- ++- iBufferPtr += 4; ++- qBufferPtr += 4; ++- } ++- ++- number = quarterPoints * 4; ++- for(; number < num_points; number++){ ++- *iBufferPtr++ = *complexVectorPtr++; ++- *qBufferPtr++ = *complexVectorPtr++; ++- } +++ const float* complexVectorPtr = (float*)complexVector; +++ float* iBufferPtr = iBuffer; +++ float* qBufferPtr = qBuffer; +++ +++ unsigned int number = 0; +++ const unsigned int quarterPoints = num_points / 4; +++ __m128 cplxValue1, cplxValue2, iValue, qValue; +++ for (; number < quarterPoints; number++) { +++ cplxValue1 = _mm_load_ps(complexVectorPtr); +++ complexVectorPtr += 4; +++ +++ cplxValue2 = _mm_load_ps(complexVectorPtr); +++ complexVectorPtr += 4; +++ +++ // Arrange in i1i2i3i4 format +++ iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2, 0, 2, 0)); +++ // Arrange in q1q2q3q4 format +++ qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3, 1, 3, 1)); +++ +++ _mm_store_ps(iBufferPtr, iValue); +++ _mm_store_ps(qBufferPtr, qValue); +++ +++ iBufferPtr += 4; +++ qBufferPtr += 4; +++ } +++ +++ number = quarterPoints * 4; +++ for (; number < num_points; number++) { +++ *iBufferPtr++ = *complexVectorPtr++; +++ *qBufferPtr++ = *complexVectorPtr++; +++ } ++ } ++ #endif /* LV_HAVE_SSE */ ++ ++@@ -165,48 +167,50 @@ volk_32fc_deinterleave_32f_x2_a_sse(float* iBuffer, float* qBuffer, const lv_32f ++ #ifdef LV_HAVE_NEON ++ #include ++ ++-static inline void ++-volk_32fc_deinterleave_32f_x2_neon(float* iBuffer, float* qBuffer, const lv_32fc_t* complexVector, ++- unsigned int num_points) +++static inline void volk_32fc_deinterleave_32f_x2_neon(float* iBuffer, +++ float* qBuffer, +++ const lv_32fc_t* complexVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- unsigned int quarter_points = num_points / 4; ++- const float* complexVectorPtr = (float*)complexVector; ++- 
float* iBufferPtr = iBuffer; ++- float* qBufferPtr = qBuffer; ++- float32x4x2_t complexInput; ++- ++- for(number = 0; number < quarter_points; number++){ ++- complexInput = vld2q_f32(complexVectorPtr); ++- vst1q_f32( iBufferPtr, complexInput.val[0] ); ++- vst1q_f32( qBufferPtr, complexInput.val[1] ); ++- complexVectorPtr += 8; ++- iBufferPtr += 4; ++- qBufferPtr += 4; ++- } ++- ++- for(number = quarter_points*4; number < num_points; number++){ ++- *iBufferPtr++ = *complexVectorPtr++; ++- *qBufferPtr++ = *complexVectorPtr++; ++- } +++ unsigned int number = 0; +++ unsigned int quarter_points = num_points / 4; +++ const float* complexVectorPtr = (float*)complexVector; +++ float* iBufferPtr = iBuffer; +++ float* qBufferPtr = qBuffer; +++ float32x4x2_t complexInput; +++ +++ for (number = 0; number < quarter_points; number++) { +++ complexInput = vld2q_f32(complexVectorPtr); +++ vst1q_f32(iBufferPtr, complexInput.val[0]); +++ vst1q_f32(qBufferPtr, complexInput.val[1]); +++ complexVectorPtr += 8; +++ iBufferPtr += 4; +++ qBufferPtr += 4; +++ } +++ +++ for (number = quarter_points * 4; number < num_points; number++) { +++ *iBufferPtr++ = *complexVectorPtr++; +++ *qBufferPtr++ = *complexVectorPtr++; +++ } ++ } ++ #endif /* LV_HAVE_NEON */ ++ ++ ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void ++-volk_32fc_deinterleave_32f_x2_generic(float* iBuffer, float* qBuffer, const lv_32fc_t* complexVector, ++- unsigned int num_points) +++static inline void volk_32fc_deinterleave_32f_x2_generic(float* iBuffer, +++ float* qBuffer, +++ const lv_32fc_t* complexVector, +++ unsigned int num_points) ++ { ++- const float* complexVectorPtr = (float*)complexVector; ++- float* iBufferPtr = iBuffer; ++- float* qBufferPtr = qBuffer; ++- unsigned int number; ++- for(number = 0; number < num_points; number++){ ++- *iBufferPtr++ = *complexVectorPtr++; ++- *qBufferPtr++ = *complexVectorPtr++; ++- } +++ const float* complexVectorPtr = (float*)complexVector; +++ float* iBufferPtr = iBuffer; +++ float* qBufferPtr = qBuffer; +++ unsigned int number; +++ for (number = 0; number < num_points; number++) { +++ *iBufferPtr++ = *complexVectorPtr++; +++ *qBufferPtr++ = *complexVectorPtr++; +++ } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++@@ -221,45 +225,46 @@ volk_32fc_deinterleave_32f_x2_generic(float* iBuffer, float* qBuffer, const lv_3 ++ ++ #ifdef LV_HAVE_AVX ++ #include ++-static inline void ++-volk_32fc_deinterleave_32f_x2_u_avx(float* iBuffer, float* qBuffer, const lv_32fc_t* complexVector, ++- unsigned int num_points) +++static inline void volk_32fc_deinterleave_32f_x2_u_avx(float* iBuffer, +++ float* qBuffer, +++ const lv_32fc_t* complexVector, +++ unsigned int num_points) ++ { ++- const float* complexVectorPtr = (float*)complexVector; ++- float* iBufferPtr = iBuffer; ++- float* qBufferPtr = qBuffer; ++- ++- unsigned int number = 0; ++- // Mask for real and imaginary parts ++- const unsigned int eighthPoints = num_points / 8; ++- __m256 cplxValue1, cplxValue2, complex1, complex2, iValue, qValue; ++- for(;number < eighthPoints; number++){ ++- cplxValue1 = _mm256_loadu_ps(complexVectorPtr); ++- complexVectorPtr += 8; ++- ++- cplxValue2 = _mm256_loadu_ps(complexVectorPtr); ++- complexVectorPtr += 8; ++- ++- complex1 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x20); ++- complex2 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x31); ++- ++- // Arrange in i1i2i3i4 format ++- iValue = _mm256_shuffle_ps(complex1, complex2, 0x88); ++- // Arrange in q1q2q3q4 format ++- qValue = _mm256_shuffle_ps(complex1, complex2, 0xdd); ++- 
++- _mm256_storeu_ps(iBufferPtr, iValue); ++- _mm256_storeu_ps(qBufferPtr, qValue); ++- ++- iBufferPtr += 8; ++- qBufferPtr += 8; ++- } ++- ++- number = eighthPoints * 8; ++- for(; number < num_points; number++){ ++- *iBufferPtr++ = *complexVectorPtr++; ++- *qBufferPtr++ = *complexVectorPtr++; ++- } +++ const float* complexVectorPtr = (float*)complexVector; +++ float* iBufferPtr = iBuffer; +++ float* qBufferPtr = qBuffer; +++ +++ unsigned int number = 0; +++ // Mask for real and imaginary parts +++ const unsigned int eighthPoints = num_points / 8; +++ __m256 cplxValue1, cplxValue2, complex1, complex2, iValue, qValue; +++ for (; number < eighthPoints; number++) { +++ cplxValue1 = _mm256_loadu_ps(complexVectorPtr); +++ complexVectorPtr += 8; +++ +++ cplxValue2 = _mm256_loadu_ps(complexVectorPtr); +++ complexVectorPtr += 8; +++ +++ complex1 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x20); +++ complex2 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x31); +++ +++ // Arrange in i1i2i3i4 format +++ iValue = _mm256_shuffle_ps(complex1, complex2, 0x88); +++ // Arrange in q1q2q3q4 format +++ qValue = _mm256_shuffle_ps(complex1, complex2, 0xdd); +++ +++ _mm256_storeu_ps(iBufferPtr, iValue); +++ _mm256_storeu_ps(qBufferPtr, qValue); +++ +++ iBufferPtr += 8; +++ qBufferPtr += 8; +++ } +++ +++ number = eighthPoints * 8; +++ for (; number < num_points; number++) { +++ *iBufferPtr++ = *complexVectorPtr++; +++ *qBufferPtr++ = *complexVectorPtr++; +++ } ++ } ++ #endif /* LV_HAVE_AVX */ ++ #endif /* INCLUDED_volk_32fc_deinterleave_32f_x2_u_H */ ++diff --git a/kernels/volk/volk_32fc_deinterleave_64f_x2.h b/kernels/volk/volk_32fc_deinterleave_64f_x2.h ++index 3e799cb..3b69c3c 100644 ++--- a/kernels/volk/volk_32fc_deinterleave_64f_x2.h +++++ b/kernels/volk/volk_32fc_deinterleave_64f_x2.h ++@@ -79,110 +79,113 @@ ++ #ifdef LV_HAVE_AVX ++ #include ++ ++-static inline void ++-volk_32fc_deinterleave_64f_x2_u_avx(double *iBuffer, double *qBuffer, ++- const lv_32fc_t *complexVector, ++- unsigned int num_points) { ++- unsigned int number = 0; ++- ++- const float *complexVectorPtr = (float *)complexVector; ++- double *iBufferPtr = iBuffer; ++- double *qBufferPtr = qBuffer; ++- ++- const unsigned int quarterPoints = num_points / 4; ++- __m256 cplxValue; ++- __m128 complexH, complexL, fVal; ++- __m256d dVal; ++- ++- for (; number < quarterPoints; number++) { ++- ++- cplxValue = _mm256_loadu_ps(complexVectorPtr); ++- complexVectorPtr += 8; ++- ++- complexH = _mm256_extractf128_ps(cplxValue, 1); ++- complexL = _mm256_extractf128_ps(cplxValue, 0); ++- ++- // Arrange in i1i2i1i2 format ++- fVal = _mm_shuffle_ps(complexL, complexH, _MM_SHUFFLE(2, 0, 2, 0)); ++- dVal = _mm256_cvtps_pd(fVal); ++- _mm256_storeu_pd(iBufferPtr, dVal); ++- ++- // Arrange in q1q2q1q2 format ++- fVal = _mm_shuffle_ps(complexL, complexH, _MM_SHUFFLE(3, 1, 3, 1)); ++- dVal = _mm256_cvtps_pd(fVal); ++- _mm256_storeu_pd(qBufferPtr, dVal); ++- ++- iBufferPtr += 4; ++- qBufferPtr += 4; ++- } ++- ++- number = quarterPoints * 4; ++- for (; number < num_points; number++) { ++- *iBufferPtr++ = *complexVectorPtr++; ++- *qBufferPtr++ = *complexVectorPtr++; ++- } +++static inline void volk_32fc_deinterleave_64f_x2_u_avx(double* iBuffer, +++ double* qBuffer, +++ const lv_32fc_t* complexVector, +++ unsigned int num_points) +++{ +++ unsigned int number = 0; +++ +++ const float* complexVectorPtr = (float*)complexVector; +++ double* iBufferPtr = iBuffer; +++ double* qBufferPtr = qBuffer; +++ +++ const unsigned int quarterPoints = num_points / 4; +++ __m256 
cplxValue; +++ __m128 complexH, complexL, fVal; +++ __m256d dVal; +++ +++ for (; number < quarterPoints; number++) { +++ +++ cplxValue = _mm256_loadu_ps(complexVectorPtr); +++ complexVectorPtr += 8; +++ +++ complexH = _mm256_extractf128_ps(cplxValue, 1); +++ complexL = _mm256_extractf128_ps(cplxValue, 0); +++ +++ // Arrange in i1i2i1i2 format +++ fVal = _mm_shuffle_ps(complexL, complexH, _MM_SHUFFLE(2, 0, 2, 0)); +++ dVal = _mm256_cvtps_pd(fVal); +++ _mm256_storeu_pd(iBufferPtr, dVal); +++ +++ // Arrange in q1q2q1q2 format +++ fVal = _mm_shuffle_ps(complexL, complexH, _MM_SHUFFLE(3, 1, 3, 1)); +++ dVal = _mm256_cvtps_pd(fVal); +++ _mm256_storeu_pd(qBufferPtr, dVal); +++ +++ iBufferPtr += 4; +++ qBufferPtr += 4; +++ } +++ +++ number = quarterPoints * 4; +++ for (; number < num_points; number++) { +++ *iBufferPtr++ = *complexVectorPtr++; +++ *qBufferPtr++ = *complexVectorPtr++; +++ } ++ } ++ #endif /* LV_HAVE_AVX */ ++ ++ #ifdef LV_HAVE_SSE2 ++ #include ++ ++-static inline void ++-volk_32fc_deinterleave_64f_x2_u_sse2(double *iBuffer, double *qBuffer, ++- const lv_32fc_t *complexVector, ++- unsigned int num_points) { ++- unsigned int number = 0; ++- ++- const float *complexVectorPtr = (float *)complexVector; ++- double *iBufferPtr = iBuffer; ++- double *qBufferPtr = qBuffer; ++- ++- const unsigned int halfPoints = num_points / 2; ++- __m128 cplxValue, fVal; ++- __m128d dVal; ++- ++- for (; number < halfPoints; number++) { ++- ++- cplxValue = _mm_loadu_ps(complexVectorPtr); ++- complexVectorPtr += 4; ++- ++- // Arrange in i1i2i1i2 format ++- fVal = _mm_shuffle_ps(cplxValue, cplxValue, _MM_SHUFFLE(2, 0, 2, 0)); ++- dVal = _mm_cvtps_pd(fVal); ++- _mm_storeu_pd(iBufferPtr, dVal); ++- ++- // Arrange in q1q2q1q2 format ++- fVal = _mm_shuffle_ps(cplxValue, cplxValue, _MM_SHUFFLE(3, 1, 3, 1)); ++- dVal = _mm_cvtps_pd(fVal); ++- _mm_storeu_pd(qBufferPtr, dVal); ++- ++- iBufferPtr += 2; ++- qBufferPtr += 2; ++- } ++- ++- number = halfPoints * 2; ++- for (; number < num_points; number++) { ++- *iBufferPtr++ = *complexVectorPtr++; ++- *qBufferPtr++ = *complexVectorPtr++; ++- } +++static inline void volk_32fc_deinterleave_64f_x2_u_sse2(double* iBuffer, +++ double* qBuffer, +++ const lv_32fc_t* complexVector, +++ unsigned int num_points) +++{ +++ unsigned int number = 0; +++ +++ const float* complexVectorPtr = (float*)complexVector; +++ double* iBufferPtr = iBuffer; +++ double* qBufferPtr = qBuffer; +++ +++ const unsigned int halfPoints = num_points / 2; +++ __m128 cplxValue, fVal; +++ __m128d dVal; +++ +++ for (; number < halfPoints; number++) { +++ +++ cplxValue = _mm_loadu_ps(complexVectorPtr); +++ complexVectorPtr += 4; +++ +++ // Arrange in i1i2i1i2 format +++ fVal = _mm_shuffle_ps(cplxValue, cplxValue, _MM_SHUFFLE(2, 0, 2, 0)); +++ dVal = _mm_cvtps_pd(fVal); +++ _mm_storeu_pd(iBufferPtr, dVal); +++ +++ // Arrange in q1q2q1q2 format +++ fVal = _mm_shuffle_ps(cplxValue, cplxValue, _MM_SHUFFLE(3, 1, 3, 1)); +++ dVal = _mm_cvtps_pd(fVal); +++ _mm_storeu_pd(qBufferPtr, dVal); +++ +++ iBufferPtr += 2; +++ qBufferPtr += 2; +++ } +++ +++ number = halfPoints * 2; +++ for (; number < num_points; number++) { +++ *iBufferPtr++ = *complexVectorPtr++; +++ *qBufferPtr++ = *complexVectorPtr++; +++ } ++ } ++ #endif /* LV_HAVE_SSE */ ++ ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void ++-volk_32fc_deinterleave_64f_x2_generic(double *iBuffer, double *qBuffer, ++- const lv_32fc_t *complexVector, ++- unsigned int num_points) { ++- unsigned int number = 0; ++- const float *complexVectorPtr = (float *)complexVector; ++- 
double *iBufferPtr = iBuffer; ++- double *qBufferPtr = qBuffer; ++- ++- for (number = 0; number < num_points; number++) { ++- *iBufferPtr++ = (double)*complexVectorPtr++; ++- *qBufferPtr++ = (double)*complexVectorPtr++; ++- } +++static inline void volk_32fc_deinterleave_64f_x2_generic(double* iBuffer, +++ double* qBuffer, +++ const lv_32fc_t* complexVector, +++ unsigned int num_points) +++{ +++ unsigned int number = 0; +++ const float* complexVectorPtr = (float*)complexVector; +++ double* iBufferPtr = iBuffer; +++ double* qBufferPtr = qBuffer; +++ +++ for (number = 0; number < num_points; number++) { +++ *iBufferPtr++ = (double)*complexVectorPtr++; +++ *qBufferPtr++ = (double)*complexVectorPtr++; +++ } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++@@ -196,146 +199,150 @@ volk_32fc_deinterleave_64f_x2_generic(double *iBuffer, double *qBuffer, ++ #ifdef LV_HAVE_AVX ++ #include ++ ++-static inline void ++-volk_32fc_deinterleave_64f_x2_a_avx(double *iBuffer, double *qBuffer, ++- const lv_32fc_t *complexVector, ++- unsigned int num_points) { ++- unsigned int number = 0; ++- ++- const float *complexVectorPtr = (float *)complexVector; ++- double *iBufferPtr = iBuffer; ++- double *qBufferPtr = qBuffer; ++- ++- const unsigned int quarterPoints = num_points / 4; ++- __m256 cplxValue; ++- __m128 complexH, complexL, fVal; ++- __m256d dVal; ++- ++- for (; number < quarterPoints; number++) { ++- ++- cplxValue = _mm256_load_ps(complexVectorPtr); ++- complexVectorPtr += 8; ++- ++- complexH = _mm256_extractf128_ps(cplxValue, 1); ++- complexL = _mm256_extractf128_ps(cplxValue, 0); ++- ++- // Arrange in i1i2i1i2 format ++- fVal = _mm_shuffle_ps(complexL, complexH, _MM_SHUFFLE(2, 0, 2, 0)); ++- dVal = _mm256_cvtps_pd(fVal); ++- _mm256_store_pd(iBufferPtr, dVal); ++- ++- // Arrange in q1q2q1q2 format ++- fVal = _mm_shuffle_ps(complexL, complexH, _MM_SHUFFLE(3, 1, 3, 1)); ++- dVal = _mm256_cvtps_pd(fVal); ++- _mm256_store_pd(qBufferPtr, dVal); ++- ++- iBufferPtr += 4; ++- qBufferPtr += 4; ++- } ++- ++- number = quarterPoints * 4; ++- for (; number < num_points; number++) { ++- *iBufferPtr++ = *complexVectorPtr++; ++- *qBufferPtr++ = *complexVectorPtr++; ++- } +++static inline void volk_32fc_deinterleave_64f_x2_a_avx(double* iBuffer, +++ double* qBuffer, +++ const lv_32fc_t* complexVector, +++ unsigned int num_points) +++{ +++ unsigned int number = 0; +++ +++ const float* complexVectorPtr = (float*)complexVector; +++ double* iBufferPtr = iBuffer; +++ double* qBufferPtr = qBuffer; +++ +++ const unsigned int quarterPoints = num_points / 4; +++ __m256 cplxValue; +++ __m128 complexH, complexL, fVal; +++ __m256d dVal; +++ +++ for (; number < quarterPoints; number++) { +++ +++ cplxValue = _mm256_load_ps(complexVectorPtr); +++ complexVectorPtr += 8; +++ +++ complexH = _mm256_extractf128_ps(cplxValue, 1); +++ complexL = _mm256_extractf128_ps(cplxValue, 0); +++ +++ // Arrange in i1i2i1i2 format +++ fVal = _mm_shuffle_ps(complexL, complexH, _MM_SHUFFLE(2, 0, 2, 0)); +++ dVal = _mm256_cvtps_pd(fVal); +++ _mm256_store_pd(iBufferPtr, dVal); +++ +++ // Arrange in q1q2q1q2 format +++ fVal = _mm_shuffle_ps(complexL, complexH, _MM_SHUFFLE(3, 1, 3, 1)); +++ dVal = _mm256_cvtps_pd(fVal); +++ _mm256_store_pd(qBufferPtr, dVal); +++ +++ iBufferPtr += 4; +++ qBufferPtr += 4; +++ } +++ +++ number = quarterPoints * 4; +++ for (; number < num_points; number++) { +++ *iBufferPtr++ = *complexVectorPtr++; +++ *qBufferPtr++ = *complexVectorPtr++; +++ } ++ } ++ #endif /* LV_HAVE_AVX */ ++ ++ #ifdef LV_HAVE_SSE2 ++ #include ++ ++-static inline 
void ++-volk_32fc_deinterleave_64f_x2_a_sse2(double *iBuffer, double *qBuffer, ++- const lv_32fc_t *complexVector, ++- unsigned int num_points) { ++- unsigned int number = 0; ++- ++- const float *complexVectorPtr = (float *)complexVector; ++- double *iBufferPtr = iBuffer; ++- double *qBufferPtr = qBuffer; ++- ++- const unsigned int halfPoints = num_points / 2; ++- __m128 cplxValue, fVal; ++- __m128d dVal; ++- ++- for (; number < halfPoints; number++) { ++- ++- cplxValue = _mm_load_ps(complexVectorPtr); ++- complexVectorPtr += 4; ++- ++- // Arrange in i1i2i1i2 format ++- fVal = _mm_shuffle_ps(cplxValue, cplxValue, _MM_SHUFFLE(2, 0, 2, 0)); ++- dVal = _mm_cvtps_pd(fVal); ++- _mm_store_pd(iBufferPtr, dVal); ++- ++- // Arrange in q1q2q1q2 format ++- fVal = _mm_shuffle_ps(cplxValue, cplxValue, _MM_SHUFFLE(3, 1, 3, 1)); ++- dVal = _mm_cvtps_pd(fVal); ++- _mm_store_pd(qBufferPtr, dVal); ++- ++- iBufferPtr += 2; ++- qBufferPtr += 2; ++- } ++- ++- number = halfPoints * 2; ++- for (; number < num_points; number++) { ++- *iBufferPtr++ = *complexVectorPtr++; ++- *qBufferPtr++ = *complexVectorPtr++; ++- } +++static inline void volk_32fc_deinterleave_64f_x2_a_sse2(double* iBuffer, +++ double* qBuffer, +++ const lv_32fc_t* complexVector, +++ unsigned int num_points) +++{ +++ unsigned int number = 0; +++ +++ const float* complexVectorPtr = (float*)complexVector; +++ double* iBufferPtr = iBuffer; +++ double* qBufferPtr = qBuffer; +++ +++ const unsigned int halfPoints = num_points / 2; +++ __m128 cplxValue, fVal; +++ __m128d dVal; +++ +++ for (; number < halfPoints; number++) { +++ +++ cplxValue = _mm_load_ps(complexVectorPtr); +++ complexVectorPtr += 4; +++ +++ // Arrange in i1i2i1i2 format +++ fVal = _mm_shuffle_ps(cplxValue, cplxValue, _MM_SHUFFLE(2, 0, 2, 0)); +++ dVal = _mm_cvtps_pd(fVal); +++ _mm_store_pd(iBufferPtr, dVal); +++ +++ // Arrange in q1q2q1q2 format +++ fVal = _mm_shuffle_ps(cplxValue, cplxValue, _MM_SHUFFLE(3, 1, 3, 1)); +++ dVal = _mm_cvtps_pd(fVal); +++ _mm_store_pd(qBufferPtr, dVal); +++ +++ iBufferPtr += 2; +++ qBufferPtr += 2; +++ } +++ +++ number = halfPoints * 2; +++ for (; number < num_points; number++) { +++ *iBufferPtr++ = *complexVectorPtr++; +++ *qBufferPtr++ = *complexVectorPtr++; +++ } ++ } ++ #endif /* LV_HAVE_SSE */ ++ ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void ++-volk_32fc_deinterleave_64f_x2_a_generic(double *iBuffer, double *qBuffer, ++- const lv_32fc_t *complexVector, ++- unsigned int num_points) { ++- unsigned int number = 0; ++- const float *complexVectorPtr = (float *)complexVector; ++- double *iBufferPtr = iBuffer; ++- double *qBufferPtr = qBuffer; ++- ++- for (number = 0; number < num_points; number++) { ++- *iBufferPtr++ = (double)*complexVectorPtr++; ++- *qBufferPtr++ = (double)*complexVectorPtr++; ++- } +++static inline void volk_32fc_deinterleave_64f_x2_a_generic(double* iBuffer, +++ double* qBuffer, +++ const lv_32fc_t* complexVector, +++ unsigned int num_points) +++{ +++ unsigned int number = 0; +++ const float* complexVectorPtr = (float*)complexVector; +++ double* iBufferPtr = iBuffer; +++ double* qBufferPtr = qBuffer; +++ +++ for (number = 0; number < num_points; number++) { +++ *iBufferPtr++ = (double)*complexVectorPtr++; +++ *qBufferPtr++ = (double)*complexVectorPtr++; +++ } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++ #ifdef LV_HAVE_NEONV8 ++ #include ++ ++-static inline void ++-volk_32fc_deinterleave_64f_x2_neon(double *iBuffer, double *qBuffer, ++- const lv_32fc_t *complexVector, ++- unsigned int num_points) { ++- unsigned int number = 0; ++- 
unsigned int half_points = num_points / 2; ++- const float *complexVectorPtr = (float *)complexVector; ++- double *iBufferPtr = iBuffer; ++- double *qBufferPtr = qBuffer; ++- float32x2x2_t complexInput; ++- float64x2_t iVal, qVal; ++- ++- for (number = 0; number < half_points; number++) { ++- complexInput = vld2_f32(complexVectorPtr); ++- ++- iVal = vcvt_f64_f32(complexInput.val[0]); ++- qVal = vcvt_f64_f32(complexInput.val[1]); ++- ++- vst1q_f64(iBufferPtr, iVal); ++- vst1q_f64(qBufferPtr, qVal); ++- ++- complexVectorPtr += 4; ++- iBufferPtr += 2; ++- qBufferPtr += 2; ++- } ++- ++- for (number = half_points * 2; number < num_points; number++) { ++- *iBufferPtr++ = (double)*complexVectorPtr++; ++- *qBufferPtr++ = (double)*complexVectorPtr++; ++- } +++static inline void volk_32fc_deinterleave_64f_x2_neon(double* iBuffer, +++ double* qBuffer, +++ const lv_32fc_t* complexVector, +++ unsigned int num_points) +++{ +++ unsigned int number = 0; +++ unsigned int half_points = num_points / 2; +++ const float* complexVectorPtr = (float*)complexVector; +++ double* iBufferPtr = iBuffer; +++ double* qBufferPtr = qBuffer; +++ float32x2x2_t complexInput; +++ float64x2_t iVal, qVal; +++ +++ for (number = 0; number < half_points; number++) { +++ complexInput = vld2_f32(complexVectorPtr); +++ +++ iVal = vcvt_f64_f32(complexInput.val[0]); +++ qVal = vcvt_f64_f32(complexInput.val[1]); +++ +++ vst1q_f64(iBufferPtr, iVal); +++ vst1q_f64(qBufferPtr, qVal); +++ +++ complexVectorPtr += 4; +++ iBufferPtr += 2; +++ qBufferPtr += 2; +++ } +++ +++ for (number = half_points * 2; number < num_points; number++) { +++ *iBufferPtr++ = (double)*complexVectorPtr++; +++ *qBufferPtr++ = (double)*complexVectorPtr++; +++ } ++ } ++ #endif /* LV_HAVE_NEONV8 */ ++ ++diff --git a/kernels/volk/volk_32fc_deinterleave_imag_32f.h b/kernels/volk/volk_32fc_deinterleave_imag_32f.h ++index 13f9764..e3dfa12 100644 ++--- a/kernels/volk/volk_32fc_deinterleave_imag_32f.h +++++ b/kernels/volk/volk_32fc_deinterleave_imag_32f.h ++@@ -30,8 +30,8 @@ ++ * ++ * Dispatcher Prototype ++ * \code ++- * void volk_32fc_deinterleave_image_32f(float* qBuffer, const lv_32fc_t* complexVector, unsigned int num_points) ++- * \endcode +++ * void volk_32fc_deinterleave_image_32f(float* qBuffer, const lv_32fc_t* complexVector, +++ * unsigned int num_points) \endcode ++ * ++ * \b Inputs ++ * \li complexVector: The complex input vector. 
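Note: the deinterleave kernels in this family are normally reached through their dispatcher, which selects an aligned or unaligned SIMD variant at run time. A hedged usage sketch for volk_32fc_deinterleave_64f_x2, using volk's own aligned allocator (buffer sizes and the helper name are illustrative):

#include <volk/volk.h>

/* Split an interleaved complex-float buffer into separate double-precision
 * I and Q buffers via the run-time dispatcher. */
static void split_iq(const lv_32fc_t* samples, unsigned int num_points)
{
    const size_t alignment = volk_get_alignment();
    double* i_buffer = (double*)volk_malloc(num_points * sizeof(double), alignment);
    double* q_buffer = (double*)volk_malloc(num_points * sizeof(double), alignment);

    /* The dispatcher picks e.g. an aligned AVX variant when both buffers are aligned. */
    volk_32fc_deinterleave_64f_x2(i_buffer, q_buffer, samples, num_points);

    /* ... consume i_buffer / q_buffer ... */

    volk_free(i_buffer);
    volk_free(q_buffer);
}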
++@@ -76,121 +76,121 @@ ++ #ifdef LV_HAVE_AVX ++ #include ++ ++-static inline void ++-volk_32fc_deinterleave_imag_32f_a_avx(float* qBuffer, const lv_32fc_t* complexVector, ++- unsigned int num_points) +++static inline void volk_32fc_deinterleave_imag_32f_a_avx(float* qBuffer, +++ const lv_32fc_t* complexVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int eighthPoints = num_points / 8; ++- const float* complexVectorPtr = (const float*)complexVector; ++- float* qBufferPtr = qBuffer; +++ unsigned int number = 0; +++ const unsigned int eighthPoints = num_points / 8; +++ const float* complexVectorPtr = (const float*)complexVector; +++ float* qBufferPtr = qBuffer; ++ ++- __m256 cplxValue1, cplxValue2, complex1, complex2, qValue; ++- for(;number < eighthPoints; number++){ +++ __m256 cplxValue1, cplxValue2, complex1, complex2, qValue; +++ for (; number < eighthPoints; number++) { ++ ++- cplxValue1 = _mm256_load_ps(complexVectorPtr); ++- complexVectorPtr += 8; +++ cplxValue1 = _mm256_load_ps(complexVectorPtr); +++ complexVectorPtr += 8; ++ ++- cplxValue2 = _mm256_load_ps(complexVectorPtr); ++- complexVectorPtr += 8; +++ cplxValue2 = _mm256_load_ps(complexVectorPtr); +++ complexVectorPtr += 8; ++ ++- complex1 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x20); ++- complex2 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x31); +++ complex1 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x20); +++ complex2 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x31); ++ ++- // Arrange in q1q2q3q4 format ++- qValue = _mm256_shuffle_ps(complex1, complex2, 0xdd); +++ // Arrange in q1q2q3q4 format +++ qValue = _mm256_shuffle_ps(complex1, complex2, 0xdd); ++ ++- _mm256_store_ps(qBufferPtr, qValue); +++ _mm256_store_ps(qBufferPtr, qValue); ++ ++- qBufferPtr += 8; ++- } +++ qBufferPtr += 8; +++ } ++ ++- number = eighthPoints * 8; ++- for(; number < num_points; number++){ ++- complexVectorPtr++; ++- *qBufferPtr++ = *complexVectorPtr++; ++- } +++ number = eighthPoints * 8; +++ for (; number < num_points; number++) { +++ complexVectorPtr++; +++ *qBufferPtr++ = *complexVectorPtr++; +++ } ++ } ++ #endif /* LV_HAVE_AVX */ ++ ++ #ifdef LV_HAVE_SSE ++ #include ++ ++-static inline void ++-volk_32fc_deinterleave_imag_32f_a_sse(float* qBuffer, const lv_32fc_t* complexVector, ++- unsigned int num_points) +++static inline void volk_32fc_deinterleave_imag_32f_a_sse(float* qBuffer, +++ const lv_32fc_t* complexVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int quarterPoints = num_points / 4; +++ unsigned int number = 0; +++ const unsigned int quarterPoints = num_points / 4; ++ ++- const float* complexVectorPtr = (const float*)complexVector; ++- float* qBufferPtr = qBuffer; +++ const float* complexVectorPtr = (const float*)complexVector; +++ float* qBufferPtr = qBuffer; ++ ++- __m128 cplxValue1, cplxValue2, iValue; ++- for(;number < quarterPoints; number++){ +++ __m128 cplxValue1, cplxValue2, iValue; +++ for (; number < quarterPoints; number++) { ++ ++- cplxValue1 = _mm_load_ps(complexVectorPtr); ++- complexVectorPtr += 4; +++ cplxValue1 = _mm_load_ps(complexVectorPtr); +++ complexVectorPtr += 4; ++ ++- cplxValue2 = _mm_load_ps(complexVectorPtr); ++- complexVectorPtr += 4; +++ cplxValue2 = _mm_load_ps(complexVectorPtr); +++ complexVectorPtr += 4; ++ ++- // Arrange in q1q2q3q4 format ++- iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1)); +++ // Arrange in q1q2q3q4 format +++ iValue = _mm_shuffle_ps(cplxValue1, 
cplxValue2, _MM_SHUFFLE(3, 1, 3, 1)); ++ ++- _mm_store_ps(qBufferPtr, iValue); +++ _mm_store_ps(qBufferPtr, iValue); ++ ++- qBufferPtr += 4; ++- } +++ qBufferPtr += 4; +++ } ++ ++- number = quarterPoints * 4; ++- for(; number < num_points; number++){ ++- complexVectorPtr++; ++- *qBufferPtr++ = *complexVectorPtr++; ++- } +++ number = quarterPoints * 4; +++ for (; number < num_points; number++) { +++ complexVectorPtr++; +++ *qBufferPtr++ = *complexVectorPtr++; +++ } ++ } ++ #endif /* LV_HAVE_SSE */ ++ ++ #ifdef LV_HAVE_NEON ++ #include ++ ++-static inline void ++-volk_32fc_deinterleave_imag_32f_neon(float* qBuffer, const lv_32fc_t* complexVector, ++- unsigned int num_points) +++static inline void volk_32fc_deinterleave_imag_32f_neon(float* qBuffer, +++ const lv_32fc_t* complexVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- unsigned int quarter_points = num_points / 4; ++- const float* complexVectorPtr = (float*)complexVector; ++- float* qBufferPtr = qBuffer; ++- float32x4x2_t complexInput; ++- ++- for(number = 0; number < quarter_points; number++){ ++- complexInput = vld2q_f32(complexVectorPtr); ++- vst1q_f32( qBufferPtr, complexInput.val[1] ); ++- complexVectorPtr += 8; ++- qBufferPtr += 4; ++- } ++- ++- for(number = quarter_points*4; number < num_points; number++){ ++- complexVectorPtr++; ++- *qBufferPtr++ = *complexVectorPtr++; ++- } +++ unsigned int number = 0; +++ unsigned int quarter_points = num_points / 4; +++ const float* complexVectorPtr = (float*)complexVector; +++ float* qBufferPtr = qBuffer; +++ float32x4x2_t complexInput; +++ +++ for (number = 0; number < quarter_points; number++) { +++ complexInput = vld2q_f32(complexVectorPtr); +++ vst1q_f32(qBufferPtr, complexInput.val[1]); +++ complexVectorPtr += 8; +++ qBufferPtr += 4; +++ } +++ +++ for (number = quarter_points * 4; number < num_points; number++) { +++ complexVectorPtr++; +++ *qBufferPtr++ = *complexVectorPtr++; +++ } ++ } ++ #endif /* LV_HAVE_NEON */ ++ ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void ++-volk_32fc_deinterleave_imag_32f_generic(float* qBuffer, const lv_32fc_t* complexVector, ++- unsigned int num_points) +++static inline void volk_32fc_deinterleave_imag_32f_generic(float* qBuffer, +++ const lv_32fc_t* complexVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const float* complexVectorPtr = (float*)complexVector; ++- float* qBufferPtr = qBuffer; ++- for(number = 0; number < num_points; number++){ ++- complexVectorPtr++; ++- *qBufferPtr++ = *complexVectorPtr++; ++- } +++ unsigned int number = 0; +++ const float* complexVectorPtr = (float*)complexVector; +++ float* qBufferPtr = qBuffer; +++ for (number = 0; number < num_points; number++) { +++ complexVectorPtr++; +++ *qBufferPtr++ = *complexVectorPtr++; +++ } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++@@ -206,40 +206,40 @@ volk_32fc_deinterleave_imag_32f_generic(float* qBuffer, const lv_32fc_t* complex ++ #ifdef LV_HAVE_AVX ++ #include ++ ++-static inline void ++-volk_32fc_deinterleave_imag_32f_u_avx(float* qBuffer, const lv_32fc_t* complexVector, ++- unsigned int num_points) +++static inline void volk_32fc_deinterleave_imag_32f_u_avx(float* qBuffer, +++ const lv_32fc_t* complexVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int eighthPoints = num_points / 8; ++- const float* complexVectorPtr = (const float*)complexVector; ++- float* qBufferPtr = qBuffer; +++ unsigned int number = 0; +++ const unsigned int eighthPoints = num_points / 8; +++ const float* 
complexVectorPtr = (const float*)complexVector; +++ float* qBufferPtr = qBuffer; ++ ++- __m256 cplxValue1, cplxValue2, complex1, complex2, qValue; ++- for(;number < eighthPoints; number++){ +++ __m256 cplxValue1, cplxValue2, complex1, complex2, qValue; +++ for (; number < eighthPoints; number++) { ++ ++- cplxValue1 = _mm256_loadu_ps(complexVectorPtr); ++- complexVectorPtr += 8; +++ cplxValue1 = _mm256_loadu_ps(complexVectorPtr); +++ complexVectorPtr += 8; ++ ++- cplxValue2 = _mm256_loadu_ps(complexVectorPtr); ++- complexVectorPtr += 8; +++ cplxValue2 = _mm256_loadu_ps(complexVectorPtr); +++ complexVectorPtr += 8; ++ ++- complex1 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x20); ++- complex2 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x31); +++ complex1 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x20); +++ complex2 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x31); ++ ++- // Arrange in q1q2q3q4 format ++- qValue = _mm256_shuffle_ps(complex1, complex2, 0xdd); +++ // Arrange in q1q2q3q4 format +++ qValue = _mm256_shuffle_ps(complex1, complex2, 0xdd); ++ ++- _mm256_storeu_ps(qBufferPtr, qValue); +++ _mm256_storeu_ps(qBufferPtr, qValue); ++ ++- qBufferPtr += 8; ++- } +++ qBufferPtr += 8; +++ } ++ ++- number = eighthPoints * 8; ++- for(; number < num_points; number++){ ++- complexVectorPtr++; ++- *qBufferPtr++ = *complexVectorPtr++; ++- } +++ number = eighthPoints * 8; +++ for (; number < num_points; number++) { +++ complexVectorPtr++; +++ *qBufferPtr++ = *complexVectorPtr++; +++ } ++ } ++ #endif /* LV_HAVE_AVX */ ++ #endif /* INCLUDED_volk_32fc_deinterleave_imag_32f_u_H */ ++diff --git a/kernels/volk/volk_32fc_deinterleave_real_32f.h b/kernels/volk/volk_32fc_deinterleave_real_32f.h ++index 92a94d3..2526a16 100644 ++--- a/kernels/volk/volk_32fc_deinterleave_real_32f.h +++++ b/kernels/volk/volk_32fc_deinterleave_real_32f.h ++@@ -30,8 +30,8 @@ ++ * ++ * Dispatcher Prototype ++ * \code ++- * void volk_32fc_deinterleave_real_32f(float* iBuffer, const lv_32fc_t* complexVector, unsigned int num_points) ++- * \endcode +++ * void volk_32fc_deinterleave_real_32f(float* iBuffer, const lv_32fc_t* complexVector, +++ * unsigned int num_points) \endcode ++ * ++ * \b Inputs ++ * \li complexVector: The complex input vector. 
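Note: the imag/real deinterleave kernels above and below differ only in which half of each interleaved (re, im) pair they keep; in the SSE variants that is the _MM_SHUFFLE(3, 1, 3, 1) (odd, imaginary lanes) versus _MM_SHUFFLE(2, 0, 2, 0) (even, real lanes) selection. A scalar sketch of the same even/odd split over a plain float view of the data (names are illustrative):

/* Even indices of the interleaved stream carry the real parts, odd indices
 * the imaginary parts; the SIMD shuffles in these kernels gather the same lanes. */
static void deinterleave_ref(float* re, float* im, const float* interleaved, unsigned int num_points)
{
    for (unsigned int k = 0; k < num_points; k++) {
        re[k] = interleaved[2 * k];
        im[k] = interleaved[2 * k + 1];
    }
}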
++@@ -76,96 +76,96 @@ ++ #ifdef LV_HAVE_AVX2 ++ #include ++ ++-static inline void ++-volk_32fc_deinterleave_real_32f_a_avx2(float* iBuffer, const lv_32fc_t* complexVector, ++- unsigned int num_points) +++static inline void volk_32fc_deinterleave_real_32f_a_avx2(float* iBuffer, +++ const lv_32fc_t* complexVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int eighthPoints = num_points / 8; +++ unsigned int number = 0; +++ const unsigned int eighthPoints = num_points / 8; ++ ++- const float* complexVectorPtr = (const float*)complexVector; ++- float* iBufferPtr = iBuffer; +++ const float* complexVectorPtr = (const float*)complexVector; +++ float* iBufferPtr = iBuffer; ++ ++- __m256 cplxValue1, cplxValue2; ++- __m256 iValue; ++- __m256i idx = _mm256_set_epi32(7,6,3,2,5,4,1,0); ++- for(;number < eighthPoints; number++){ +++ __m256 cplxValue1, cplxValue2; +++ __m256 iValue; +++ __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0); +++ for (; number < eighthPoints; number++) { ++ ++- cplxValue1 = _mm256_load_ps(complexVectorPtr); ++- complexVectorPtr += 8; +++ cplxValue1 = _mm256_load_ps(complexVectorPtr); +++ complexVectorPtr += 8; ++ ++- cplxValue2 = _mm256_load_ps(complexVectorPtr); ++- complexVectorPtr += 8; +++ cplxValue2 = _mm256_load_ps(complexVectorPtr); +++ complexVectorPtr += 8; ++ ++- // Arrange in i1i2i3i4 format ++- iValue = _mm256_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); ++- iValue = _mm256_permutevar8x32_ps(iValue,idx); +++ // Arrange in i1i2i3i4 format +++ iValue = _mm256_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2, 0, 2, 0)); +++ iValue = _mm256_permutevar8x32_ps(iValue, idx); ++ ++- _mm256_store_ps(iBufferPtr, iValue); +++ _mm256_store_ps(iBufferPtr, iValue); ++ ++- iBufferPtr += 8; ++- } +++ iBufferPtr += 8; +++ } ++ ++- number = eighthPoints * 8; ++- for(; number < num_points; number++){ ++- *iBufferPtr++ = *complexVectorPtr++; ++- complexVectorPtr++; ++- } +++ number = eighthPoints * 8; +++ for (; number < num_points; number++) { +++ *iBufferPtr++ = *complexVectorPtr++; +++ complexVectorPtr++; +++ } ++ } ++ #endif /* LV_HAVE_AVX2 */ ++ ++ #ifdef LV_HAVE_SSE ++ #include ++ ++-static inline void ++-volk_32fc_deinterleave_real_32f_a_sse(float* iBuffer, const lv_32fc_t* complexVector, ++- unsigned int num_points) +++static inline void volk_32fc_deinterleave_real_32f_a_sse(float* iBuffer, +++ const lv_32fc_t* complexVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int quarterPoints = num_points / 4; +++ unsigned int number = 0; +++ const unsigned int quarterPoints = num_points / 4; ++ ++- const float* complexVectorPtr = (const float*)complexVector; ++- float* iBufferPtr = iBuffer; +++ const float* complexVectorPtr = (const float*)complexVector; +++ float* iBufferPtr = iBuffer; ++ ++- __m128 cplxValue1, cplxValue2, iValue; ++- for(;number < quarterPoints; number++){ +++ __m128 cplxValue1, cplxValue2, iValue; +++ for (; number < quarterPoints; number++) { ++ ++- cplxValue1 = _mm_load_ps(complexVectorPtr); ++- complexVectorPtr += 4; +++ cplxValue1 = _mm_load_ps(complexVectorPtr); +++ complexVectorPtr += 4; ++ ++- cplxValue2 = _mm_load_ps(complexVectorPtr); ++- complexVectorPtr += 4; +++ cplxValue2 = _mm_load_ps(complexVectorPtr); +++ complexVectorPtr += 4; ++ ++- // Arrange in i1i2i3i4 format ++- iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); +++ // Arrange in i1i2i3i4 format +++ iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2, 0, 2, 0)); ++ 
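/* (Illustrative aside on the shuffle above, not taken from the upstream sources:
 * with interleaved input re0,im0,re1,im1,..., _mm_shuffle_ps(a, b, _MM_SHUFFLE(2, 0, 2, 0))
 * selects elements {0,2} of each operand, i.e. {a0, a2, b0, b2}, so a single shuffle
 * gathers the four real parts loaded by the two _mm_load_ps calls. A scalar sketch of the
 * same step, with pointers taken at the start of the iteration:
 *     for (int k = 0; k < 4; k++)
 *         iBufferPtr[k] = complexVectorPtr[2 * k];
 * ) */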
++- _mm_store_ps(iBufferPtr, iValue); +++ _mm_store_ps(iBufferPtr, iValue); ++ ++- iBufferPtr += 4; ++- } +++ iBufferPtr += 4; +++ } ++ ++- number = quarterPoints * 4; ++- for(; number < num_points; number++){ ++- *iBufferPtr++ = *complexVectorPtr++; ++- complexVectorPtr++; ++- } +++ number = quarterPoints * 4; +++ for (; number < num_points; number++) { +++ *iBufferPtr++ = *complexVectorPtr++; +++ complexVectorPtr++; +++ } ++ } ++ #endif /* LV_HAVE_SSE */ ++ ++ ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void ++-volk_32fc_deinterleave_real_32f_generic(float* iBuffer, const lv_32fc_t* complexVector, ++- unsigned int num_points) +++static inline void volk_32fc_deinterleave_real_32f_generic(float* iBuffer, +++ const lv_32fc_t* complexVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const float* complexVectorPtr = (float*)complexVector; ++- float* iBufferPtr = iBuffer; ++- for(number = 0; number < num_points; number++){ ++- *iBufferPtr++ = *complexVectorPtr++; ++- complexVectorPtr++; ++- } +++ unsigned int number = 0; +++ const float* complexVectorPtr = (float*)complexVector; +++ float* iBufferPtr = iBuffer; +++ for (number = 0; number < num_points; number++) { +++ *iBufferPtr++ = *complexVectorPtr++; +++ complexVectorPtr++; +++ } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++@@ -173,27 +173,27 @@ volk_32fc_deinterleave_real_32f_generic(float* iBuffer, const lv_32fc_t* complex ++ #ifdef LV_HAVE_NEON ++ #include ++ ++-static inline void ++-volk_32fc_deinterleave_real_32f_neon(float* iBuffer, const lv_32fc_t* complexVector, ++- unsigned int num_points) +++static inline void volk_32fc_deinterleave_real_32f_neon(float* iBuffer, +++ const lv_32fc_t* complexVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- unsigned int quarter_points = num_points / 4; ++- const float* complexVectorPtr = (float*)complexVector; ++- float* iBufferPtr = iBuffer; ++- float32x4x2_t complexInput; ++- ++- for(number = 0; number < quarter_points; number++){ ++- complexInput = vld2q_f32(complexVectorPtr); ++- vst1q_f32( iBufferPtr, complexInput.val[0] ); ++- complexVectorPtr += 8; ++- iBufferPtr += 4; ++- } ++- ++- for(number = quarter_points*4; number < num_points; number++){ ++- *iBufferPtr++ = *complexVectorPtr++; ++- complexVectorPtr++; ++- } +++ unsigned int number = 0; +++ unsigned int quarter_points = num_points / 4; +++ const float* complexVectorPtr = (float*)complexVector; +++ float* iBufferPtr = iBuffer; +++ float32x4x2_t complexInput; +++ +++ for (number = 0; number < quarter_points; number++) { +++ complexInput = vld2q_f32(complexVectorPtr); +++ vst1q_f32(iBufferPtr, complexInput.val[0]); +++ complexVectorPtr += 8; +++ iBufferPtr += 4; +++ } +++ +++ for (number = quarter_points * 4; number < num_points; number++) { +++ *iBufferPtr++ = *complexVectorPtr++; +++ complexVectorPtr++; +++ } ++ } ++ #endif /* LV_HAVE_NEON */ ++ ++@@ -209,41 +209,41 @@ volk_32fc_deinterleave_real_32f_neon(float* iBuffer, const lv_32fc_t* complexVec ++ #ifdef LV_HAVE_AVX2 ++ #include ++ ++-static inline void ++-volk_32fc_deinterleave_real_32f_u_avx2(float* iBuffer, const lv_32fc_t* complexVector, ++- unsigned int num_points) +++static inline void volk_32fc_deinterleave_real_32f_u_avx2(float* iBuffer, +++ const lv_32fc_t* complexVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int eighthPoints = num_points / 8; +++ unsigned int number = 0; +++ const unsigned int eighthPoints = num_points / 8; ++ ++- const float* complexVectorPtr = (const 
float*)complexVector; ++- float* iBufferPtr = iBuffer; +++ const float* complexVectorPtr = (const float*)complexVector; +++ float* iBufferPtr = iBuffer; ++ ++- __m256 cplxValue1, cplxValue2; ++- __m256 iValue; ++- __m256i idx = _mm256_set_epi32(7,6,3,2,5,4,1,0); ++- for(;number < eighthPoints; number++){ +++ __m256 cplxValue1, cplxValue2; +++ __m256 iValue; +++ __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0); +++ for (; number < eighthPoints; number++) { ++ ++- cplxValue1 = _mm256_loadu_ps(complexVectorPtr); ++- complexVectorPtr += 8; +++ cplxValue1 = _mm256_loadu_ps(complexVectorPtr); +++ complexVectorPtr += 8; ++ ++- cplxValue2 = _mm256_loadu_ps(complexVectorPtr); ++- complexVectorPtr += 8; +++ cplxValue2 = _mm256_loadu_ps(complexVectorPtr); +++ complexVectorPtr += 8; ++ ++- // Arrange in i1i2i3i4 format ++- iValue = _mm256_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); ++- iValue = _mm256_permutevar8x32_ps(iValue,idx); +++ // Arrange in i1i2i3i4 format +++ iValue = _mm256_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2, 0, 2, 0)); +++ iValue = _mm256_permutevar8x32_ps(iValue, idx); ++ ++- _mm256_storeu_ps(iBufferPtr, iValue); +++ _mm256_storeu_ps(iBufferPtr, iValue); ++ ++- iBufferPtr += 8; ++- } +++ iBufferPtr += 8; +++ } ++ ++- number = eighthPoints * 8; ++- for(; number < num_points; number++){ ++- *iBufferPtr++ = *complexVectorPtr++; ++- complexVectorPtr++; ++- } +++ number = eighthPoints * 8; +++ for (; number < num_points; number++) { +++ *iBufferPtr++ = *complexVectorPtr++; +++ complexVectorPtr++; +++ } ++ } ++ #endif /* LV_HAVE_AVX2 */ ++ ++diff --git a/kernels/volk/volk_32fc_deinterleave_real_64f.h b/kernels/volk/volk_32fc_deinterleave_real_64f.h ++index 3d6e901..9ec7769 100644 ++--- a/kernels/volk/volk_32fc_deinterleave_real_64f.h +++++ b/kernels/volk/volk_32fc_deinterleave_real_64f.h ++@@ -77,124 +77,132 @@ ++ #ifdef LV_HAVE_AVX2 ++ #include ++ ++-static inline void volk_32fc_deinterleave_real_64f_a_avx2( ++- double *iBuffer, const lv_32fc_t *complexVector, unsigned int num_points) { ++- unsigned int number = 0; ++- ++- const float *complexVectorPtr = (float *)complexVector; ++- double *iBufferPtr = iBuffer; ++- ++- const unsigned int quarterPoints = num_points / 4; ++- __m256 cplxValue; ++- __m128 fVal; ++- __m256d dVal; ++- __m256i idx = _mm256_set_epi32(0, 0, 0, 0, 6, 4, 2, 0); ++- for (; number < quarterPoints; number++) { ++- ++- cplxValue = _mm256_load_ps(complexVectorPtr); ++- complexVectorPtr += 8; ++- ++- // Arrange in i1i2i1i2 format ++- cplxValue = _mm256_permutevar8x32_ps(cplxValue, idx); ++- fVal = _mm256_extractf128_ps(cplxValue, 0); ++- dVal = _mm256_cvtps_pd(fVal); ++- _mm256_store_pd(iBufferPtr, dVal); ++- ++- iBufferPtr += 4; ++- } ++- ++- number = quarterPoints * 4; ++- for (; number < num_points; number++) { ++- *iBufferPtr++ = (double)*complexVectorPtr++; ++- complexVectorPtr++; ++- } +++static inline void volk_32fc_deinterleave_real_64f_a_avx2(double* iBuffer, +++ const lv_32fc_t* complexVector, +++ unsigned int num_points) +++{ +++ unsigned int number = 0; +++ +++ const float* complexVectorPtr = (float*)complexVector; +++ double* iBufferPtr = iBuffer; +++ +++ const unsigned int quarterPoints = num_points / 4; +++ __m256 cplxValue; +++ __m128 fVal; +++ __m256d dVal; +++ __m256i idx = _mm256_set_epi32(0, 0, 0, 0, 6, 4, 2, 0); +++ for (; number < quarterPoints; number++) { +++ +++ cplxValue = _mm256_load_ps(complexVectorPtr); +++ complexVectorPtr += 8; +++ +++ // Arrange in i1i2i1i2 format +++ cplxValue = 
_mm256_permutevar8x32_ps(cplxValue, idx); +++ fVal = _mm256_extractf128_ps(cplxValue, 0); +++ dVal = _mm256_cvtps_pd(fVal); +++ _mm256_store_pd(iBufferPtr, dVal); +++ +++ iBufferPtr += 4; +++ } +++ +++ number = quarterPoints * 4; +++ for (; number < num_points; number++) { +++ *iBufferPtr++ = (double)*complexVectorPtr++; +++ complexVectorPtr++; +++ } ++ } ++ #endif /* LV_HAVE_AVX2 */ ++ ++ #ifdef LV_HAVE_SSE2 ++ #include ++ ++-static inline void volk_32fc_deinterleave_real_64f_a_sse2( ++- double *iBuffer, const lv_32fc_t *complexVector, unsigned int num_points) { ++- unsigned int number = 0; +++static inline void volk_32fc_deinterleave_real_64f_a_sse2(double* iBuffer, +++ const lv_32fc_t* complexVector, +++ unsigned int num_points) +++{ +++ unsigned int number = 0; ++ ++- const float *complexVectorPtr = (float *)complexVector; ++- double *iBufferPtr = iBuffer; +++ const float* complexVectorPtr = (float*)complexVector; +++ double* iBufferPtr = iBuffer; ++ ++- const unsigned int halfPoints = num_points / 2; ++- __m128 cplxValue, fVal; ++- __m128d dVal; ++- for (; number < halfPoints; number++) { +++ const unsigned int halfPoints = num_points / 2; +++ __m128 cplxValue, fVal; +++ __m128d dVal; +++ for (; number < halfPoints; number++) { ++ ++- cplxValue = _mm_load_ps(complexVectorPtr); ++- complexVectorPtr += 4; +++ cplxValue = _mm_load_ps(complexVectorPtr); +++ complexVectorPtr += 4; ++ ++- // Arrange in i1i2i1i2 format ++- fVal = _mm_shuffle_ps(cplxValue, cplxValue, _MM_SHUFFLE(2, 0, 2, 0)); ++- dVal = _mm_cvtps_pd(fVal); ++- _mm_store_pd(iBufferPtr, dVal); +++ // Arrange in i1i2i1i2 format +++ fVal = _mm_shuffle_ps(cplxValue, cplxValue, _MM_SHUFFLE(2, 0, 2, 0)); +++ dVal = _mm_cvtps_pd(fVal); +++ _mm_store_pd(iBufferPtr, dVal); ++ ++- iBufferPtr += 2; ++- } +++ iBufferPtr += 2; +++ } ++ ++- number = halfPoints * 2; ++- for (; number < num_points; number++) { ++- *iBufferPtr++ = (double)*complexVectorPtr++; ++- complexVectorPtr++; ++- } +++ number = halfPoints * 2; +++ for (; number < num_points; number++) { +++ *iBufferPtr++ = (double)*complexVectorPtr++; +++ complexVectorPtr++; +++ } ++ } ++ #endif /* LV_HAVE_SSE */ ++ ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void volk_32fc_deinterleave_real_64f_generic( ++- double *iBuffer, const lv_32fc_t *complexVector, unsigned int num_points) { ++- unsigned int number = 0; ++- const float *complexVectorPtr = (float *)complexVector; ++- double *iBufferPtr = iBuffer; ++- for (number = 0; number < num_points; number++) { ++- *iBufferPtr++ = (double)*complexVectorPtr++; ++- complexVectorPtr++; ++- } +++static inline void volk_32fc_deinterleave_real_64f_generic(double* iBuffer, +++ const lv_32fc_t* complexVector, +++ unsigned int num_points) +++{ +++ unsigned int number = 0; +++ const float* complexVectorPtr = (float*)complexVector; +++ double* iBufferPtr = iBuffer; +++ for (number = 0; number < num_points; number++) { +++ *iBufferPtr++ = (double)*complexVectorPtr++; +++ complexVectorPtr++; +++ } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++ #ifdef LV_HAVE_NEONV8 ++ #include ++ ++-static inline void volk_32fc_deinterleave_real_64f_neon( ++- double *iBuffer, const lv_32fc_t *complexVector, unsigned int num_points) { ++- unsigned int number = 0; ++- unsigned int quarter_points = num_points / 4; ++- const float *complexVectorPtr = (float *)complexVector; ++- double *iBufferPtr = iBuffer; ++- float32x2x4_t complexInput; ++- float64x2_t iVal1; ++- float64x2_t iVal2; ++- float64x2x2_t iVal; ++- ++- for (number = 0; number < quarter_points; number++) { ++- // 
Load data into register ++- complexInput = vld4_f32(complexVectorPtr); ++- ++- // Perform single to double precision conversion ++- iVal1 = vcvt_f64_f32(complexInput.val[0]); ++- iVal2 = vcvt_f64_f32(complexInput.val[2]); ++- iVal.val[0] = iVal1; ++- iVal.val[1] = iVal2; ++- ++- // Store results into memory buffer ++- vst2q_f64(iBufferPtr, iVal); ++- ++- // Update pointers ++- iBufferPtr += 4; ++- complexVectorPtr += 8; ++- } ++- ++- for (number = quarter_points * 4; number < num_points; number++) { ++- *iBufferPtr++ = (double)*complexVectorPtr++; ++- complexVectorPtr++; ++- } +++static inline void volk_32fc_deinterleave_real_64f_neon(double* iBuffer, +++ const lv_32fc_t* complexVector, +++ unsigned int num_points) +++{ +++ unsigned int number = 0; +++ unsigned int quarter_points = num_points / 4; +++ const float* complexVectorPtr = (float*)complexVector; +++ double* iBufferPtr = iBuffer; +++ float32x2x4_t complexInput; +++ float64x2_t iVal1; +++ float64x2_t iVal2; +++ float64x2x2_t iVal; +++ +++ for (number = 0; number < quarter_points; number++) { +++ // Load data into register +++ complexInput = vld4_f32(complexVectorPtr); +++ +++ // Perform single to double precision conversion +++ iVal1 = vcvt_f64_f32(complexInput.val[0]); +++ iVal2 = vcvt_f64_f32(complexInput.val[2]); +++ iVal.val[0] = iVal1; +++ iVal.val[1] = iVal2; +++ +++ // Store results into memory buffer +++ vst2q_f64(iBufferPtr, iVal); +++ +++ // Update pointers +++ iBufferPtr += 4; +++ complexVectorPtr += 8; +++ } +++ +++ for (number = quarter_points * 4; number < num_points; number++) { +++ *iBufferPtr++ = (double)*complexVectorPtr++; +++ complexVectorPtr++; +++ } ++ } ++ #endif /* LV_HAVE_NEON */ ++ ++@@ -209,37 +217,39 @@ static inline void volk_32fc_deinterleave_real_64f_neon( ++ #ifdef LV_HAVE_AVX2 ++ #include ++ ++-static inline void volk_32fc_deinterleave_real_64f_u_avx2( ++- double *iBuffer, const lv_32fc_t *complexVector, unsigned int num_points) { ++- unsigned int number = 0; ++- ++- const float *complexVectorPtr = (float *)complexVector; ++- double *iBufferPtr = iBuffer; ++- ++- const unsigned int quarterPoints = num_points / 4; ++- __m256 cplxValue; ++- __m128 fVal; ++- __m256d dVal; ++- __m256i idx = _mm256_set_epi32(0, 0, 0, 0, 6, 4, 2, 0); ++- for (; number < quarterPoints; number++) { ++- ++- cplxValue = _mm256_loadu_ps(complexVectorPtr); ++- complexVectorPtr += 8; ++- ++- // Arrange in i1i2i1i2 format ++- cplxValue = _mm256_permutevar8x32_ps(cplxValue, idx); ++- fVal = _mm256_extractf128_ps(cplxValue, 0); ++- dVal = _mm256_cvtps_pd(fVal); ++- _mm256_storeu_pd(iBufferPtr, dVal); ++- ++- iBufferPtr += 4; ++- } ++- ++- number = quarterPoints * 4; ++- for (; number < num_points; number++) { ++- *iBufferPtr++ = (double)*complexVectorPtr++; ++- complexVectorPtr++; ++- } +++static inline void volk_32fc_deinterleave_real_64f_u_avx2(double* iBuffer, +++ const lv_32fc_t* complexVector, +++ unsigned int num_points) +++{ +++ unsigned int number = 0; +++ +++ const float* complexVectorPtr = (float*)complexVector; +++ double* iBufferPtr = iBuffer; +++ +++ const unsigned int quarterPoints = num_points / 4; +++ __m256 cplxValue; +++ __m128 fVal; +++ __m256d dVal; +++ __m256i idx = _mm256_set_epi32(0, 0, 0, 0, 6, 4, 2, 0); +++ for (; number < quarterPoints; number++) { +++ +++ cplxValue = _mm256_loadu_ps(complexVectorPtr); +++ complexVectorPtr += 8; +++ +++ // Arrange in i1i2i1i2 format +++ cplxValue = _mm256_permutevar8x32_ps(cplxValue, idx); +++ fVal = _mm256_extractf128_ps(cplxValue, 0); +++ dVal = _mm256_cvtps_pd(fVal); 
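/* (Illustrative aside, not taken from the upstream sources: in this kernel the permute
 * gathers the four real parts (floats 0, 2, 4, 6 of the load) into the low 128 bits,
 * _mm256_extractf128_ps takes that half, and _mm256_cvtps_pd widens it to four doubles.
 * A scalar sketch of the same step, with pointers taken at the start of the iteration:
 *     for (int k = 0; k < 4; k++)
 *         iBufferPtr[k] = (double)complexVectorPtr[2 * k];
 * ) */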
+++ _mm256_storeu_pd(iBufferPtr, dVal); +++ +++ iBufferPtr += 4; +++ } +++ +++ number = quarterPoints * 4; +++ for (; number < num_points; number++) { +++ *iBufferPtr++ = (double)*complexVectorPtr++; +++ complexVectorPtr++; +++ } ++ } ++ #endif /* LV_HAVE_AVX2 */ ++ ++diff --git a/kernels/volk/volk_32fc_index_max_16u.h b/kernels/volk/volk_32fc_index_max_16u.h ++index a9f9508..b9f9cfd 100644 ++--- a/kernels/volk/volk_32fc_index_max_16u.h +++++ b/kernels/volk/volk_32fc_index_max_16u.h ++@@ -76,346 +76,353 @@ ++ #ifndef INCLUDED_volk_32fc_index_max_16u_a_H ++ #define INCLUDED_volk_32fc_index_max_16u_a_H ++ ++-#include ++ #include ++-#include ++ #include +++#include +++#include ++ #include ++ ++ #ifdef LV_HAVE_AVX2 ++ #include ++ ++ static inline void ++-volk_32fc_index_max_16u_a_avx2(uint16_t* target, lv_32fc_t* src0, ++- uint32_t num_points) +++volk_32fc_index_max_16u_a_avx2(uint16_t* target, lv_32fc_t* src0, uint32_t num_points) ++ { ++- num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points; ++- // Branchless version, if we think it'll make a difference ++- //num_points = USHRT_MAX ^ ((num_points ^ USHRT_MAX) & -(num_points < USHRT_MAX)); ++- ++- const uint32_t num_bytes = num_points*8; ++- ++- union bit256 holderf; ++- union bit256 holderi; ++- float sq_dist = 0.0; +++ num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points; +++ // Branchless version, if we think it'll make a difference +++ // num_points = USHRT_MAX ^ ((num_points ^ USHRT_MAX) & -(num_points < USHRT_MAX)); ++ ++- union bit256 xmm5, xmm4; ++- __m256 xmm1, xmm2, xmm3; ++- __m256i xmm8, xmm11, xmm12, xmmfive, xmmfour, xmm9, holder0, holder1, xmm10; +++ const uint32_t num_bytes = num_points * 8; ++ ++- xmm5.int_vec = xmmfive = _mm256_setzero_si256(); ++- xmm4.int_vec = xmmfour = _mm256_setzero_si256(); ++- holderf.int_vec = holder0 = _mm256_setzero_si256(); ++- holderi.int_vec = holder1 = _mm256_setzero_si256(); +++ union bit256 holderf; +++ union bit256 holderi; +++ float sq_dist = 0.0; ++ ++- int bound = num_bytes >> 6; ++- int i = 0; +++ union bit256 xmm5, xmm4; +++ __m256 xmm1, xmm2, xmm3; +++ __m256i xmm8, xmm11, xmm12, xmmfive, xmmfour, xmm9, holder0, holder1, xmm10; ++ ++- xmm8 = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0); ++- xmm9 = _mm256_setzero_si256(); //=xmm8 ++- xmm10 = _mm256_set1_epi32(8); ++- xmm3 = _mm256_setzero_ps(); +++ xmm5.int_vec = xmmfive = _mm256_setzero_si256(); +++ xmm4.int_vec = xmmfour = _mm256_setzero_si256(); +++ holderf.int_vec = holder0 = _mm256_setzero_si256(); +++ holderi.int_vec = holder1 = _mm256_setzero_si256(); ++ ++- __m256i idx = _mm256_set_epi32(7,6,3,2,5,4,1,0); ++- for(; i < bound; ++i) { ++- xmm1 = _mm256_load_ps((float*)src0); ++- xmm2 = _mm256_load_ps((float*)&src0[4]); +++ int bound = num_bytes >> 6; +++ int i = 0; ++ ++- src0 += 8; +++ xmm8 = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0); +++ xmm9 = _mm256_setzero_si256(); //=xmm8 +++ xmm10 = _mm256_set1_epi32(8); +++ xmm3 = _mm256_setzero_ps(); ++ ++- xmm1 = _mm256_mul_ps(xmm1, xmm1); ++- xmm2 = _mm256_mul_ps(xmm2, xmm2); +++ __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0); +++ for (; i < bound; ++i) { +++ xmm1 = _mm256_load_ps((float*)src0); +++ xmm2 = _mm256_load_ps((float*)&src0[4]); ++ ++- xmm1 = _mm256_hadd_ps(xmm1, xmm2); ++- xmm1 = _mm256_permutevar8x32_ps(xmm1, idx); +++ src0 += 8; ++ ++- xmm3 = _mm256_max_ps(xmm1, xmm3); +++ xmm1 = _mm256_mul_ps(xmm1, xmm1); +++ xmm2 = _mm256_mul_ps(xmm2, xmm2); ++ ++- xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS); ++- xmm5.float_vec = _mm256_cmp_ps(xmm1, 
xmm3, _CMP_EQ_OQ); +++ xmm1 = _mm256_hadd_ps(xmm1, xmm2); +++ xmm1 = _mm256_permutevar8x32_ps(xmm1, idx); ++ ++- xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec); ++- xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec); +++ xmm3 = _mm256_max_ps(xmm1, xmm3); ++ ++- xmm9 = _mm256_add_epi32(xmm11, xmm12); +++ xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS); +++ xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ); ++ ++- xmm8 = _mm256_add_epi32(xmm8, xmm10); ++- } ++- xmm10 = _mm256_set1_epi32(4); ++- if (num_bytes >> 5 & 1) { ++- xmm1 = _mm256_load_ps((float*)src0); +++ xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec); +++ xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec); ++ ++- src0 += 4; +++ xmm9 = _mm256_add_epi32(xmm11, xmm12); ++ ++- xmm1 = _mm256_mul_ps(xmm1, xmm1); +++ xmm8 = _mm256_add_epi32(xmm8, xmm10); +++ } +++ xmm10 = _mm256_set1_epi32(4); +++ if (num_bytes >> 5 & 1) { +++ xmm1 = _mm256_load_ps((float*)src0); ++ ++- xmm1 = _mm256_hadd_ps(xmm1, xmm1); ++- xmm1 = _mm256_permutevar8x32_ps(xmm1, idx); +++ src0 += 4; ++ ++- xmm3 = _mm256_max_ps(xmm1, xmm3); +++ xmm1 = _mm256_mul_ps(xmm1, xmm1); ++ ++- xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS); ++- xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ); +++ xmm1 = _mm256_hadd_ps(xmm1, xmm1); +++ xmm1 = _mm256_permutevar8x32_ps(xmm1, idx); ++ ++- xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec); ++- xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec); +++ xmm3 = _mm256_max_ps(xmm1, xmm3); ++ ++- xmm9 = _mm256_add_epi32(xmm11, xmm12); +++ xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS); +++ xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ); ++ ++- xmm8 = _mm256_add_epi32(xmm8, xmm10); ++- } +++ xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec); +++ xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec); ++ ++- idx = _mm256_set_epi32(1,0,1,0,1,0,1,0); ++- xmm10 = _mm256_set1_epi32(2); ++- if (num_bytes >> 4 & 1) { ++- xmm2 = _mm256_load_ps((float*)src0); +++ xmm9 = _mm256_add_epi32(xmm11, xmm12); ++ ++- xmm1 = _mm256_permutevar8x32_ps(bit256_p(&xmm8)->float_vec, idx); ++- xmm8 = bit256_p(&xmm1)->int_vec; +++ xmm8 = _mm256_add_epi32(xmm8, xmm10); +++ } ++ ++- xmm2 = _mm256_mul_ps(xmm2, xmm2); +++ idx = _mm256_set_epi32(1, 0, 1, 0, 1, 0, 1, 0); +++ xmm10 = _mm256_set1_epi32(2); +++ if (num_bytes >> 4 & 1) { +++ xmm2 = _mm256_load_ps((float*)src0); ++ ++- src0 += 2; +++ xmm1 = _mm256_permutevar8x32_ps(bit256_p(&xmm8)->float_vec, idx); +++ xmm8 = bit256_p(&xmm1)->int_vec; ++ ++- xmm1 = _mm256_hadd_ps(xmm2, xmm2); +++ xmm2 = _mm256_mul_ps(xmm2, xmm2); ++ ++- xmm3 = _mm256_max_ps(xmm1, xmm3); +++ src0 += 2; ++ ++- xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS); ++- xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ); ++- ++- xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec); ++- xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec); +++ xmm1 = _mm256_hadd_ps(xmm2, xmm2); ++ ++- xmm9 = _mm256_add_epi32(xmm11, xmm12); +++ xmm3 = _mm256_max_ps(xmm1, xmm3); ++ ++- xmm8 = _mm256_add_epi32(xmm8, xmm10); ++- } +++ xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS); +++ xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ); ++ ++- /* ++- idx = _mm256_setzero_si256(); ++- for(i = 0; i < leftovers2; ++i) { ++- //printf("%u, %u, %u, %u\n", ((uint32_t*)&xmm9)[0], ((uint32_t*)&xmm9)[1], ((uint32_t*)&xmm9)[2], ((uint32_t*)&xmm9)[3]); +++ xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec); +++ xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec); ++ ++- sq_dist = lv_creal(src0[0]) * lv_creal(src0[0]) + lv_cimag(src0[0]) * lv_cimag(src0[0]); +++ xmm9 = _mm256_add_epi32(xmm11, 
xmm12); ++ ++- //xmm = _mm_load1_ps(&sq_dist);//insert? ++- xmm2 = _mm256_set1_ps(sq_dist); ++- //xmm2 = _mm256_insertf128_ps(xmm2, xmm, 0); +++ xmm8 = _mm256_add_epi32(xmm8, xmm10); +++ } ++ ++- xmm1 = xmm3; +++ /* +++ idx = _mm256_setzero_si256(); +++ for(i = 0; i < leftovers2; ++i) { +++ //printf("%u, %u, %u, %u\n", ((uint32_t*)&xmm9)[0], ((uint32_t*)&xmm9)[1], +++ ((uint32_t*)&xmm9)[2], ((uint32_t*)&xmm9)[3]); ++ ++- xmm3 = _mm256_max_ps(xmm3, xmm2);//only lowest 32bit value ++- xmm3 = _mm256_permutevar8x32_ps(xmm3, idx); +++ sq_dist = lv_creal(src0[0]) * lv_creal(src0[0]) + lv_cimag(src0[0]) * +++ lv_cimag(src0[0]); ++ ++- xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS); ++- xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ); +++ //xmm = _mm_load1_ps(&sq_dist);//insert? +++ xmm2 = _mm256_set1_ps(sq_dist); +++ //xmm2 = _mm256_insertf128_ps(xmm2, xmm, 0); ++ ++- xmm8 = _mm256_permutevar8x32_epi32(xmm8, idx); +++ xmm1 = xmm3; ++ ++- xmm11 = _mm256_and_si256(xmm8, xmm4.int_vec); ++- xmm12 = _mm256_and_si256(xmm9, xmm5.int_vec); +++ xmm3 = _mm256_max_ps(xmm3, xmm2);//only lowest 32bit value +++ xmm3 = _mm256_permutevar8x32_ps(xmm3, idx); ++ ++- xmm9 = _mm256_add_epi32(xmm11, xmm12); ++-}*/ +++ xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS); +++ xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ); ++ ++- _mm256_store_ps((float*)&(holderf.f), xmm3); ++- _mm256_store_si256(&(holderi.int_vec), xmm9); +++ xmm8 = _mm256_permutevar8x32_epi32(xmm8, idx); ++ ++- target[0] = holderi.i[0]; ++- sq_dist = holderf.f[0]; ++- target[0] = (holderf.f[1] > sq_dist) ? holderi.i[1] : target[0]; ++- sq_dist = (holderf.f[1] > sq_dist) ? holderf.f[1] : sq_dist; ++- target[0] = (holderf.f[2] > sq_dist) ? holderi.i[2] : target[0]; ++- sq_dist = (holderf.f[2] > sq_dist) ? holderf.f[2] : sq_dist; ++- target[0] = (holderf.f[3] > sq_dist) ? holderi.i[3] : target[0]; ++- sq_dist = (holderf.f[3] > sq_dist) ? holderf.f[3] : sq_dist; ++- target[0] = (holderf.f[4] > sq_dist) ? holderi.i[4] : target[0]; ++- sq_dist = (holderf.f[4] > sq_dist) ? holderf.f[4] : sq_dist; ++- target[0] = (holderf.f[5] > sq_dist) ? holderi.i[5] : target[0]; ++- sq_dist = (holderf.f[5] > sq_dist) ? holderf.f[5] : sq_dist; ++- target[0] = (holderf.f[6] > sq_dist) ? holderi.i[6] : target[0]; ++- sq_dist = (holderf.f[6] > sq_dist) ? holderf.f[6] : sq_dist; ++- target[0] = (holderf.f[7] > sq_dist) ? holderi.i[7] : target[0]; ++- sq_dist = (holderf.f[7] > sq_dist) ? holderf.f[7] : sq_dist; +++ xmm11 = _mm256_and_si256(xmm8, xmm4.int_vec); +++ xmm12 = _mm256_and_si256(xmm9, xmm5.int_vec); ++ +++ xmm9 = _mm256_add_epi32(xmm11, xmm12); +++ }*/ +++ +++ _mm256_store_ps((float*)&(holderf.f), xmm3); +++ _mm256_store_si256(&(holderi.int_vec), xmm9); +++ +++ target[0] = holderi.i[0]; +++ sq_dist = holderf.f[0]; +++ target[0] = (holderf.f[1] > sq_dist) ? holderi.i[1] : target[0]; +++ sq_dist = (holderf.f[1] > sq_dist) ? holderf.f[1] : sq_dist; +++ target[0] = (holderf.f[2] > sq_dist) ? holderi.i[2] : target[0]; +++ sq_dist = (holderf.f[2] > sq_dist) ? holderf.f[2] : sq_dist; +++ target[0] = (holderf.f[3] > sq_dist) ? holderi.i[3] : target[0]; +++ sq_dist = (holderf.f[3] > sq_dist) ? holderf.f[3] : sq_dist; +++ target[0] = (holderf.f[4] > sq_dist) ? holderi.i[4] : target[0]; +++ sq_dist = (holderf.f[4] > sq_dist) ? holderf.f[4] : sq_dist; +++ target[0] = (holderf.f[5] > sq_dist) ? holderi.i[5] : target[0]; +++ sq_dist = (holderf.f[5] > sq_dist) ? holderf.f[5] : sq_dist; +++ target[0] = (holderf.f[6] > sq_dist) ? 
holderi.i[6] : target[0]; +++ sq_dist = (holderf.f[6] > sq_dist) ? holderf.f[6] : sq_dist; +++ target[0] = (holderf.f[7] > sq_dist) ? holderi.i[7] : target[0]; +++ sq_dist = (holderf.f[7] > sq_dist) ? holderf.f[7] : sq_dist; ++ } ++ ++ #endif /*LV_HAVE_AVX2*/ ++ ++ #ifdef LV_HAVE_SSE3 ++-#include ++ #include +++#include ++ ++ static inline void ++-volk_32fc_index_max_16u_a_sse3(uint16_t* target, lv_32fc_t* src0, ++- uint32_t num_points) +++volk_32fc_index_max_16u_a_sse3(uint16_t* target, lv_32fc_t* src0, uint32_t num_points) ++ { ++- num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points; ++- // Branchless version, if we think it'll make a difference ++- //num_points = USHRT_MAX ^ ((num_points ^ USHRT_MAX) & -(num_points < USHRT_MAX)); +++ num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points; +++ // Branchless version, if we think it'll make a difference +++ // num_points = USHRT_MAX ^ ((num_points ^ USHRT_MAX) & -(num_points < USHRT_MAX)); ++ ++- const uint32_t num_bytes = num_points*8; +++ const uint32_t num_bytes = num_points * 8; ++ ++- union bit128 holderf; ++- union bit128 holderi; ++- float sq_dist = 0.0; +++ union bit128 holderf; +++ union bit128 holderi; +++ float sq_dist = 0.0; ++ ++- union bit128 xmm5, xmm4; ++- __m128 xmm1, xmm2, xmm3; ++- __m128i xmm8, xmm11, xmm12, xmmfive, xmmfour, xmm9, holder0, holder1, xmm10; +++ union bit128 xmm5, xmm4; +++ __m128 xmm1, xmm2, xmm3; +++ __m128i xmm8, xmm11, xmm12, xmmfive, xmmfour, xmm9, holder0, holder1, xmm10; ++ ++- xmm5.int_vec = xmmfive = _mm_setzero_si128(); ++- xmm4.int_vec = xmmfour = _mm_setzero_si128(); ++- holderf.int_vec = holder0 = _mm_setzero_si128(); ++- holderi.int_vec = holder1 = _mm_setzero_si128(); +++ xmm5.int_vec = xmmfive = _mm_setzero_si128(); +++ xmm4.int_vec = xmmfour = _mm_setzero_si128(); +++ holderf.int_vec = holder0 = _mm_setzero_si128(); +++ holderi.int_vec = holder1 = _mm_setzero_si128(); ++ ++- int bound = num_bytes >> 5; ++- int i = 0; +++ int bound = num_bytes >> 5; +++ int i = 0; ++ ++- xmm8 = _mm_set_epi32(3, 2, 1, 0);//remember the crazy reverse order! ++- xmm9 = _mm_setzero_si128(); ++- xmm10 = _mm_set_epi32(4, 4, 4, 4); ++- xmm3 = _mm_setzero_ps(); ++- //printf("%f, %f, %f, %f\n", ((float*)&xmm10)[0], ((float*)&xmm10)[1], ((float*)&xmm10)[2], ((float*)&xmm10)[3]); +++ xmm8 = _mm_set_epi32(3, 2, 1, 0); // remember the crazy reverse order! 
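/* (Illustrative aside, not taken from the upstream sources: the "crazy reverse order"
 * remark refers to _mm_set_epi32 listing its arguments from the highest lane down, so the
 * call above yields lanes {0, 1, 2, 3} in memory order, i.e. the starting per-position
 * indices that the kernel accumulates. An equivalent spelling would be:
 *     __m128i index_lanes = _mm_setr_epi32(0, 1, 2, 3);  // _mm_setr_* takes lanes in memory order
 * ) */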
+++ xmm9 = _mm_setzero_si128(); +++ xmm10 = _mm_set_epi32(4, 4, 4, 4); +++ xmm3 = _mm_setzero_ps(); +++ // printf("%f, %f, %f, %f\n", ((float*)&xmm10)[0], ((float*)&xmm10)[1], +++ // ((float*)&xmm10)[2], ((float*)&xmm10)[3]); ++ ++- for(; i < bound; ++i) { ++- xmm1 = _mm_load_ps((float*)src0); ++- xmm2 = _mm_load_ps((float*)&src0[2]); +++ for (; i < bound; ++i) { +++ xmm1 = _mm_load_ps((float*)src0); +++ xmm2 = _mm_load_ps((float*)&src0[2]); ++ ++- src0 += 4; +++ src0 += 4; ++ ++- xmm1 = _mm_mul_ps(xmm1, xmm1); ++- xmm2 = _mm_mul_ps(xmm2, xmm2); +++ xmm1 = _mm_mul_ps(xmm1, xmm1); +++ xmm2 = _mm_mul_ps(xmm2, xmm2); ++ ++- xmm1 = _mm_hadd_ps(xmm1, xmm2); +++ xmm1 = _mm_hadd_ps(xmm1, xmm2); ++ ++- xmm3 = _mm_max_ps(xmm1, xmm3); +++ xmm3 = _mm_max_ps(xmm1, xmm3); ++ ++- xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3); ++- xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3); +++ xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3); +++ xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3); ++ ++- xmm11 = _mm_and_si128(xmm8, xmm5.int_vec); ++- xmm12 = _mm_and_si128(xmm9, xmm4.int_vec); +++ xmm11 = _mm_and_si128(xmm8, xmm5.int_vec); +++ xmm12 = _mm_and_si128(xmm9, xmm4.int_vec); ++ ++- xmm9 = _mm_add_epi32(xmm11, xmm12); +++ xmm9 = _mm_add_epi32(xmm11, xmm12); ++ ++- xmm8 = _mm_add_epi32(xmm8, xmm10); +++ xmm8 = _mm_add_epi32(xmm8, xmm10); ++ ++- //printf("%f, %f, %f, %f\n", ((float*)&xmm3)[0], ((float*)&xmm3)[1], ((float*)&xmm3)[2], ((float*)&xmm3)[3]); ++- //printf("%u, %u, %u, %u\n", ((uint32_t*)&xmm10)[0], ((uint32_t*)&xmm10)[1], ((uint32_t*)&xmm10)[2], ((uint32_t*)&xmm10)[3]); ++- } +++ // printf("%f, %f, %f, %f\n", ((float*)&xmm3)[0], ((float*)&xmm3)[1], +++ // ((float*)&xmm3)[2], ((float*)&xmm3)[3]); printf("%u, %u, %u, %u\n", +++ // ((uint32_t*)&xmm10)[0], ((uint32_t*)&xmm10)[1], ((uint32_t*)&xmm10)[2], +++ // ((uint32_t*)&xmm10)[3]); +++ } ++ ++ ++- if (num_bytes >> 4 & 1) { ++- xmm2 = _mm_load_ps((float*)src0); +++ if (num_bytes >> 4 & 1) { +++ xmm2 = _mm_load_ps((float*)src0); ++ ++- xmm1 = _mm_movelh_ps(bit128_p(&xmm8)->float_vec, bit128_p(&xmm8)->float_vec); ++- xmm8 = bit128_p(&xmm1)->int_vec; +++ xmm1 = _mm_movelh_ps(bit128_p(&xmm8)->float_vec, bit128_p(&xmm8)->float_vec); +++ xmm8 = bit128_p(&xmm1)->int_vec; ++ ++- xmm2 = _mm_mul_ps(xmm2, xmm2); +++ xmm2 = _mm_mul_ps(xmm2, xmm2); ++ ++- src0 += 2; +++ src0 += 2; ++ ++- xmm1 = _mm_hadd_ps(xmm2, xmm2); +++ xmm1 = _mm_hadd_ps(xmm2, xmm2); ++ ++- xmm3 = _mm_max_ps(xmm1, xmm3); +++ xmm3 = _mm_max_ps(xmm1, xmm3); ++ ++- xmm10 = _mm_set_epi32(2, 2, 2, 2);//load1_ps((float*)&init[2]); +++ xmm10 = _mm_set_epi32(2, 2, 2, 2); // load1_ps((float*)&init[2]); ++ ++- xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3); ++- xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3); +++ xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3); +++ xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3); ++ ++- xmm11 = _mm_and_si128(xmm8, xmm5.int_vec); ++- xmm12 = _mm_and_si128(xmm9, xmm4.int_vec); +++ xmm11 = _mm_and_si128(xmm8, xmm5.int_vec); +++ xmm12 = _mm_and_si128(xmm9, xmm4.int_vec); ++ ++- xmm9 = _mm_add_epi32(xmm11, xmm12); +++ xmm9 = _mm_add_epi32(xmm11, xmm12); ++ ++- xmm8 = _mm_add_epi32(xmm8, xmm10); ++- //printf("egads%u, %u, %u, %u\n", ((uint32_t*)&xmm9)[0], ((uint32_t*)&xmm9)[1], ((uint32_t*)&xmm9)[2], ((uint32_t*)&xmm9)[3]); ++- } +++ xmm8 = _mm_add_epi32(xmm8, xmm10); +++ // printf("egads%u, %u, %u, %u\n", ((uint32_t*)&xmm9)[0], ((uint32_t*)&xmm9)[1], +++ // ((uint32_t*)&xmm9)[2], ((uint32_t*)&xmm9)[3]); +++ } ++ ++- if (num_bytes >> 3 & 1) { ++- //printf("%u, %u, %u, %u\n", ((uint32_t*)&xmm9)[0], ((uint32_t*)&xmm9)[1], 
((uint32_t*)&xmm9)[2], ((uint32_t*)&xmm9)[3]); +++ if (num_bytes >> 3 & 1) { +++ // printf("%u, %u, %u, %u\n", ((uint32_t*)&xmm9)[0], ((uint32_t*)&xmm9)[1], +++ // ((uint32_t*)&xmm9)[2], ((uint32_t*)&xmm9)[3]); ++ ++- sq_dist = lv_creal(src0[0]) * lv_creal(src0[0]) + lv_cimag(src0[0]) * lv_cimag(src0[0]); +++ sq_dist = +++ lv_creal(src0[0]) * lv_creal(src0[0]) + lv_cimag(src0[0]) * lv_cimag(src0[0]); ++ ++- xmm2 = _mm_load1_ps(&sq_dist); +++ xmm2 = _mm_load1_ps(&sq_dist); ++ ++- xmm1 = xmm3; +++ xmm1 = xmm3; ++ ++- xmm3 = _mm_max_ss(xmm3, xmm2); +++ xmm3 = _mm_max_ss(xmm3, xmm2); ++ ++- xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3); ++- xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3); +++ xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3); +++ xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3); ++ ++- xmm8 = _mm_shuffle_epi32(xmm8, 0x00); +++ xmm8 = _mm_shuffle_epi32(xmm8, 0x00); ++ ++- xmm11 = _mm_and_si128(xmm8, xmm4.int_vec); ++- xmm12 = _mm_and_si128(xmm9, xmm5.int_vec); +++ xmm11 = _mm_and_si128(xmm8, xmm4.int_vec); +++ xmm12 = _mm_and_si128(xmm9, xmm5.int_vec); ++ ++- xmm9 = _mm_add_epi32(xmm11, xmm12); ++- } +++ xmm9 = _mm_add_epi32(xmm11, xmm12); +++ } ++ ++- //printf("%f, %f, %f, %f\n", ((float*)&xmm3)[0], ((float*)&xmm3)[1], ((float*)&xmm3)[2], ((float*)&xmm3)[3]); ++- //printf("%u, %u, %u, %u\n", ((uint32_t*)&xmm9)[0], ((uint32_t*)&xmm9)[1], ((uint32_t*)&xmm9)[2], ((uint32_t*)&xmm9)[3]); +++ // printf("%f, %f, %f, %f\n", ((float*)&xmm3)[0], ((float*)&xmm3)[1], +++ // ((float*)&xmm3)[2], ((float*)&xmm3)[3]); printf("%u, %u, %u, %u\n", +++ // ((uint32_t*)&xmm9)[0], ((uint32_t*)&xmm9)[1], ((uint32_t*)&xmm9)[2], +++ // ((uint32_t*)&xmm9)[3]); ++ ++- _mm_store_ps((float*)&(holderf.f), xmm3); ++- _mm_store_si128(&(holderi.int_vec), xmm9); +++ _mm_store_ps((float*)&(holderf.f), xmm3); +++ _mm_store_si128(&(holderi.int_vec), xmm9); ++ ++- target[0] = holderi.i[0]; ++- sq_dist = holderf.f[0]; ++- target[0] = (holderf.f[1] > sq_dist) ? holderi.i[1] : target[0]; ++- sq_dist = (holderf.f[1] > sq_dist) ? holderf.f[1] : sq_dist; ++- target[0] = (holderf.f[2] > sq_dist) ? holderi.i[2] : target[0]; ++- sq_dist = (holderf.f[2] > sq_dist) ? holderf.f[2] : sq_dist; ++- target[0] = (holderf.f[3] > sq_dist) ? holderi.i[3] : target[0]; ++- sq_dist = (holderf.f[3] > sq_dist) ? holderf.f[3] : sq_dist; +++ target[0] = holderi.i[0]; +++ sq_dist = holderf.f[0]; +++ target[0] = (holderf.f[1] > sq_dist) ? holderi.i[1] : target[0]; +++ sq_dist = (holderf.f[1] > sq_dist) ? holderf.f[1] : sq_dist; +++ target[0] = (holderf.f[2] > sq_dist) ? holderi.i[2] : target[0]; +++ sq_dist = (holderf.f[2] > sq_dist) ? holderf.f[2] : sq_dist; +++ target[0] = (holderf.f[3] > sq_dist) ? holderi.i[3] : target[0]; +++ sq_dist = (holderf.f[3] > sq_dist) ? 
holderf.f[3] : sq_dist;
++
++- /*
++- float placeholder = 0.0;
++- uint32_t temp0, temp1;
++- uint32_t g0 = (((float*)&xmm3)[0] > ((float*)&xmm3)[1]);
++- uint32_t l0 = g0 ^ 1;
+++ /*
+++ float placeholder = 0.0;
+++ uint32_t temp0, temp1;
+++ uint32_t g0 = (((float*)&xmm3)[0] > ((float*)&xmm3)[1]);
+++ uint32_t l0 = g0 ^ 1;
++
++- uint32_t g1 = (((float*)&xmm3)[1] > ((float*)&xmm3)[2]);
++- uint32_t l1 = g1 ^ 1;
+++ uint32_t g1 = (((float*)&xmm3)[1] > ((float*)&xmm3)[2]);
+++ uint32_t l1 = g1 ^ 1;
++
++- temp0 = g0 * ((uint32_t*)&xmm9)[0] + l0 * ((uint32_t*)&xmm9)[1];
++- temp1 = g0 * ((uint32_t*)&xmm9)[2] + l0 * ((uint32_t*)&xmm9)[3];
++- sq_dist = g0 * ((float*)&xmm3)[0] + l0 * ((float*)&xmm3)[1];
++- placeholder = g0 * ((float*)&xmm3)[2] + l0 * ((float*)&xmm3)[3];
+++ temp0 = g0 * ((uint32_t*)&xmm9)[0] + l0 * ((uint32_t*)&xmm9)[1];
+++ temp1 = g0 * ((uint32_t*)&xmm9)[2] + l0 * ((uint32_t*)&xmm9)[3];
+++ sq_dist = g0 * ((float*)&xmm3)[0] + l0 * ((float*)&xmm3)[1];
+++ placeholder = g0 * ((float*)&xmm3)[2] + l0 * ((float*)&xmm3)[3];
++
++- g0 = (sq_dist > placeholder);
++- l0 = g0 ^ 1;
++- target[0] = g0 * temp0 + l0 * temp1;
++- */
+++ g0 = (sq_dist > placeholder);
+++ l0 = g0 ^ 1;
+++ target[0] = g0 * temp0 + l0 * temp1;
+++ */
++ }
++
++ #endif /*LV_HAVE_SSE3*/
++
++ #ifdef LV_HAVE_GENERIC
++ static inline void
++- volk_32fc_index_max_16u_generic(uint16_t* target, lv_32fc_t* src0,
++- uint32_t num_points)
+++volk_32fc_index_max_16u_generic(uint16_t* target, lv_32fc_t* src0, uint32_t num_points)
++ {
++- num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
+++ num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
++
++- const uint32_t num_bytes = num_points*8;
+++ const uint32_t num_bytes = num_points * 8;
++
++- float sq_dist = 0.0;
++- float max = 0.0;
++- uint16_t index = 0;
+++ float sq_dist = 0.0;
+++ float max = 0.0;
+++ uint16_t index = 0;
++
++- uint32_t i = 0;
+++ uint32_t i = 0;
++
++- for(; i < num_bytes >> 3; ++i) {
++- sq_dist = lv_creal(src0[i]) * lv_creal(src0[i]) + lv_cimag(src0[i]) * lv_cimag(src0[i]);
+++ for (; i < num_bytes >> 3; ++i) {
+++ sq_dist =
+++ lv_creal(src0[i]) * lv_creal(src0[i]) + lv_cimag(src0[i]) * lv_cimag(src0[i]);
++
++- index = sq_dist > max ? i : index;
++- max = sq_dist > max ? sq_dist : max;
++- }
++- target[0] = index;
+++ index = sq_dist > max ? i : index;
+++ max = sq_dist > max ? sq_dist : max;
+++ }
+++ target[0] = index;
++ }
++
++ #endif /*LV_HAVE_GENERIC*/
++@@ -427,142 +434,140 @@ static inline void
++ #ifndef INCLUDED_volk_32fc_index_max_16u_u_H
++ #define INCLUDED_volk_32fc_index_max_16u_u_H
++
++-#include
++ #include
++-#include
++ #include
+++#include
+++#include
++ #include
++
++ #ifdef LV_HAVE_AVX2
++ #include <immintrin.h>
++
++ static inline void
++-volk_32fc_index_max_16u_u_avx2(uint16_t* target, lv_32fc_t* src0,
++- uint32_t num_points)
+++volk_32fc_index_max_16u_u_avx2(uint16_t* target, lv_32fc_t* src0, uint32_t num_points)
++ {
++- num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
++- // Branchless version, if we think it'll make a difference
++- //num_points = USHRT_MAX ^ ((num_points ^ USHRT_MAX) & -(num_points < USHRT_MAX));
+++ num_points = (num_points > USHRT_MAX) ?
USHRT_MAX : num_points; +++ // Branchless version, if we think it'll make a difference +++ // num_points = USHRT_MAX ^ ((num_points ^ USHRT_MAX) & -(num_points < USHRT_MAX)); ++ ++- const uint32_t num_bytes = num_points*8; +++ const uint32_t num_bytes = num_points * 8; ++ ++- union bit256 holderf; ++- union bit256 holderi; ++- float sq_dist = 0.0; +++ union bit256 holderf; +++ union bit256 holderi; +++ float sq_dist = 0.0; ++ ++- union bit256 xmm5, xmm4; ++- __m256 xmm1, xmm2, xmm3; ++- __m256i xmm8, xmm11, xmm12, xmmfive, xmmfour, xmm9, holder0, holder1, xmm10; +++ union bit256 xmm5, xmm4; +++ __m256 xmm1, xmm2, xmm3; +++ __m256i xmm8, xmm11, xmm12, xmmfive, xmmfour, xmm9, holder0, holder1, xmm10; ++ ++- xmm5.int_vec = xmmfive = _mm256_setzero_si256(); ++- xmm4.int_vec = xmmfour = _mm256_setzero_si256(); ++- holderf.int_vec = holder0 = _mm256_setzero_si256(); ++- holderi.int_vec = holder1 = _mm256_setzero_si256(); +++ xmm5.int_vec = xmmfive = _mm256_setzero_si256(); +++ xmm4.int_vec = xmmfour = _mm256_setzero_si256(); +++ holderf.int_vec = holder0 = _mm256_setzero_si256(); +++ holderi.int_vec = holder1 = _mm256_setzero_si256(); ++ ++- int bound = num_bytes >> 6; ++- int i = 0; +++ int bound = num_bytes >> 6; +++ int i = 0; ++ ++- xmm8 = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0); ++- xmm9 = _mm256_setzero_si256(); //=xmm8 ++- xmm10 = _mm256_set1_epi32(8); ++- xmm3 = _mm256_setzero_ps(); +++ xmm8 = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0); +++ xmm9 = _mm256_setzero_si256(); //=xmm8 +++ xmm10 = _mm256_set1_epi32(8); +++ xmm3 = _mm256_setzero_ps(); ++ ++- __m256i idx = _mm256_set_epi32(7,6,3,2,5,4,1,0); ++- for(; i < bound; ++i) { ++- xmm1 = _mm256_loadu_ps((float*)src0); ++- xmm2 = _mm256_loadu_ps((float*)&src0[4]); +++ __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0); +++ for (; i < bound; ++i) { +++ xmm1 = _mm256_loadu_ps((float*)src0); +++ xmm2 = _mm256_loadu_ps((float*)&src0[4]); ++ ++- src0 += 8; +++ src0 += 8; ++ ++- xmm1 = _mm256_mul_ps(xmm1, xmm1); ++- xmm2 = _mm256_mul_ps(xmm2, xmm2); +++ xmm1 = _mm256_mul_ps(xmm1, xmm1); +++ xmm2 = _mm256_mul_ps(xmm2, xmm2); ++ ++- xmm1 = _mm256_hadd_ps(xmm1, xmm2); ++- xmm1 = _mm256_permutevar8x32_ps(xmm1, idx); +++ xmm1 = _mm256_hadd_ps(xmm1, xmm2); +++ xmm1 = _mm256_permutevar8x32_ps(xmm1, idx); ++ ++- xmm3 = _mm256_max_ps(xmm1, xmm3); +++ xmm3 = _mm256_max_ps(xmm1, xmm3); ++ ++- xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS); ++- xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ); +++ xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS); +++ xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ); ++ ++- xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec); ++- xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec); +++ xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec); +++ xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec); ++ ++- xmm9 = _mm256_add_epi32(xmm11, xmm12); +++ xmm9 = _mm256_add_epi32(xmm11, xmm12); ++ ++- xmm8 = _mm256_add_epi32(xmm8, xmm10); ++- } ++- xmm10 = _mm256_set1_epi32(4); ++- if (num_bytes >> 5 & 1) { ++- xmm1 = _mm256_loadu_ps((float*)src0); +++ xmm8 = _mm256_add_epi32(xmm8, xmm10); +++ } +++ xmm10 = _mm256_set1_epi32(4); +++ if (num_bytes >> 5 & 1) { +++ xmm1 = _mm256_loadu_ps((float*)src0); ++ ++- src0 += 4; +++ src0 += 4; ++ ++- xmm1 = _mm256_mul_ps(xmm1, xmm1); +++ xmm1 = _mm256_mul_ps(xmm1, xmm1); ++ ++- xmm1 = _mm256_hadd_ps(xmm1, xmm1); ++- xmm1 = _mm256_permutevar8x32_ps(xmm1, idx); +++ xmm1 = _mm256_hadd_ps(xmm1, xmm1); +++ xmm1 = _mm256_permutevar8x32_ps(xmm1, idx); ++ ++- xmm3 = _mm256_max_ps(xmm1, xmm3); +++ xmm3 = 
_mm256_max_ps(xmm1, xmm3); ++ ++- xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS); ++- xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ); +++ xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS); +++ xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ); ++ ++- xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec); ++- xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec); +++ xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec); +++ xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec); ++ ++- xmm9 = _mm256_add_epi32(xmm11, xmm12); +++ xmm9 = _mm256_add_epi32(xmm11, xmm12); ++ ++- xmm8 = _mm256_add_epi32(xmm8, xmm10); ++- } +++ xmm8 = _mm256_add_epi32(xmm8, xmm10); +++ } ++ ++- idx = _mm256_set_epi32(1,0,1,0,1,0,1,0); ++- xmm10 = _mm256_set1_epi32(2); ++- if (num_bytes >> 4 & 1) { ++- xmm2 = _mm256_loadu_ps((float*)src0); +++ idx = _mm256_set_epi32(1, 0, 1, 0, 1, 0, 1, 0); +++ xmm10 = _mm256_set1_epi32(2); +++ if (num_bytes >> 4 & 1) { +++ xmm2 = _mm256_loadu_ps((float*)src0); ++ ++- xmm1 = _mm256_permutevar8x32_ps(bit256_p(&xmm8)->float_vec, idx); ++- xmm8 = bit256_p(&xmm1)->int_vec; +++ xmm1 = _mm256_permutevar8x32_ps(bit256_p(&xmm8)->float_vec, idx); +++ xmm8 = bit256_p(&xmm1)->int_vec; ++ ++- xmm2 = _mm256_mul_ps(xmm2, xmm2); +++ xmm2 = _mm256_mul_ps(xmm2, xmm2); ++ ++- src0 += 2; +++ src0 += 2; ++ ++- xmm1 = _mm256_hadd_ps(xmm2, xmm2); +++ xmm1 = _mm256_hadd_ps(xmm2, xmm2); ++ ++- xmm3 = _mm256_max_ps(xmm1, xmm3); +++ xmm3 = _mm256_max_ps(xmm1, xmm3); ++ ++- xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS); ++- xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ); +++ xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS); +++ xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ); ++ ++- xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec); ++- xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec); +++ xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec); +++ xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec); ++ ++- xmm9 = _mm256_add_epi32(xmm11, xmm12); +++ xmm9 = _mm256_add_epi32(xmm11, xmm12); +++ +++ xmm8 = _mm256_add_epi32(xmm8, xmm10); +++ } ++ ++- xmm8 = _mm256_add_epi32(xmm8, xmm10); ++- } ++- ++- _mm256_storeu_ps((float*)&(holderf.f), xmm3); ++- _mm256_storeu_si256(&(holderi.int_vec), xmm9); ++- ++- target[0] = holderi.i[0]; ++- sq_dist = holderf.f[0]; ++- target[0] = (holderf.f[1] > sq_dist) ? holderi.i[1] : target[0]; ++- sq_dist = (holderf.f[1] > sq_dist) ? holderf.f[1] : sq_dist; ++- target[0] = (holderf.f[2] > sq_dist) ? holderi.i[2] : target[0]; ++- sq_dist = (holderf.f[2] > sq_dist) ? holderf.f[2] : sq_dist; ++- target[0] = (holderf.f[3] > sq_dist) ? holderi.i[3] : target[0]; ++- sq_dist = (holderf.f[3] > sq_dist) ? holderf.f[3] : sq_dist; ++- target[0] = (holderf.f[4] > sq_dist) ? holderi.i[4] : target[0]; ++- sq_dist = (holderf.f[4] > sq_dist) ? holderf.f[4] : sq_dist; ++- target[0] = (holderf.f[5] > sq_dist) ? holderi.i[5] : target[0]; ++- sq_dist = (holderf.f[5] > sq_dist) ? holderf.f[5] : sq_dist; ++- target[0] = (holderf.f[6] > sq_dist) ? holderi.i[6] : target[0]; ++- sq_dist = (holderf.f[6] > sq_dist) ? holderf.f[6] : sq_dist; ++- target[0] = (holderf.f[7] > sq_dist) ? holderi.i[7] : target[0]; ++- sq_dist = (holderf.f[7] > sq_dist) ? holderf.f[7] : sq_dist; +++ _mm256_storeu_ps((float*)&(holderf.f), xmm3); +++ _mm256_storeu_si256(&(holderi.int_vec), xmm9); ++ +++ target[0] = holderi.i[0]; +++ sq_dist = holderf.f[0]; +++ target[0] = (holderf.f[1] > sq_dist) ? holderi.i[1] : target[0]; +++ sq_dist = (holderf.f[1] > sq_dist) ? holderf.f[1] : sq_dist; +++ target[0] = (holderf.f[2] > sq_dist) ? 
holderi.i[2] : target[0]; +++ sq_dist = (holderf.f[2] > sq_dist) ? holderf.f[2] : sq_dist; +++ target[0] = (holderf.f[3] > sq_dist) ? holderi.i[3] : target[0]; +++ sq_dist = (holderf.f[3] > sq_dist) ? holderf.f[3] : sq_dist; +++ target[0] = (holderf.f[4] > sq_dist) ? holderi.i[4] : target[0]; +++ sq_dist = (holderf.f[4] > sq_dist) ? holderf.f[4] : sq_dist; +++ target[0] = (holderf.f[5] > sq_dist) ? holderi.i[5] : target[0]; +++ sq_dist = (holderf.f[5] > sq_dist) ? holderf.f[5] : sq_dist; +++ target[0] = (holderf.f[6] > sq_dist) ? holderi.i[6] : target[0]; +++ sq_dist = (holderf.f[6] > sq_dist) ? holderf.f[6] : sq_dist; +++ target[0] = (holderf.f[7] > sq_dist) ? holderi.i[7] : target[0]; +++ sq_dist = (holderf.f[7] > sq_dist) ? holderf.f[7] : sq_dist; ++ } ++ ++ #endif /*LV_HAVE_AVX2*/ ++diff --git a/kernels/volk/volk_32fc_index_max_32u.h b/kernels/volk/volk_32fc_index_max_32u.h ++index 67a3faa..7756fc6 100644 ++--- a/kernels/volk/volk_32fc_index_max_32u.h +++++ b/kernels/volk/volk_32fc_index_max_32u.h ++@@ -70,309 +70,314 @@ ++ #ifndef INCLUDED_volk_32fc_index_max_32u_a_H ++ #define INCLUDED_volk_32fc_index_max_32u_a_H ++ +++#include +++#include ++ #include ++-#include ++-#include ++-#include +++#include ++ ++ #ifdef LV_HAVE_AVX2 ++-#include +++#include ++ ++ static inline void ++-volk_32fc_index_max_32u_a_avx2(uint32_t* target, lv_32fc_t* src0, ++- uint32_t num_points) +++volk_32fc_index_max_32u_a_avx2(uint32_t* target, lv_32fc_t* src0, uint32_t num_points) ++ { ++- const uint32_t num_bytes = num_points*8; +++ const uint32_t num_bytes = num_points * 8; ++ ++- union bit256 holderf; ++- union bit256 holderi; ++- float sq_dist = 0.0; +++ union bit256 holderf; +++ union bit256 holderi; +++ float sq_dist = 0.0; ++ ++- union bit256 xmm5, xmm4; ++- __m256 xmm1, xmm2, xmm3; ++- __m256i xmm8, xmm11, xmm12, xmmfive, xmmfour, xmm9, holder0, holder1, xmm10; +++ union bit256 xmm5, xmm4; +++ __m256 xmm1, xmm2, xmm3; +++ __m256i xmm8, xmm11, xmm12, xmmfive, xmmfour, xmm9, holder0, holder1, xmm10; ++ ++- xmm5.int_vec = xmmfive = _mm256_setzero_si256(); ++- xmm4.int_vec = xmmfour = _mm256_setzero_si256(); ++- holderf.int_vec = holder0 = _mm256_setzero_si256(); ++- holderi.int_vec = holder1 = _mm256_setzero_si256(); +++ xmm5.int_vec = xmmfive = _mm256_setzero_si256(); +++ xmm4.int_vec = xmmfour = _mm256_setzero_si256(); +++ holderf.int_vec = holder0 = _mm256_setzero_si256(); +++ holderi.int_vec = holder1 = _mm256_setzero_si256(); ++ ++- int bound = num_bytes >> 6; ++- int i = 0; +++ int bound = num_bytes >> 6; +++ int i = 0; ++ ++- xmm8 = _mm256_set_epi32(7,6,5,4,3, 2, 1, 0); ++- xmm9 = _mm256_setzero_si256(); ++- xmm10 = _mm256_set1_epi32(8); ++- xmm3 = _mm256_setzero_ps(); ++- __m256i idx = _mm256_set_epi32(7,6,3,2,5,4,1,0); +++ xmm8 = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0); +++ xmm9 = _mm256_setzero_si256(); +++ xmm10 = _mm256_set1_epi32(8); +++ xmm3 = _mm256_setzero_ps(); +++ __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0); ++ ++- for(; i < bound; ++i) { ++- xmm1 = _mm256_load_ps((float*)src0); ++- xmm2 = _mm256_load_ps((float*)&src0[4]); +++ for (; i < bound; ++i) { +++ xmm1 = _mm256_load_ps((float*)src0); +++ xmm2 = _mm256_load_ps((float*)&src0[4]); ++ ++- src0 += 8; +++ src0 += 8; ++ ++- xmm1 = _mm256_mul_ps(xmm1, xmm1); ++- xmm2 = _mm256_mul_ps(xmm2, xmm2); +++ xmm1 = _mm256_mul_ps(xmm1, xmm1); +++ xmm2 = _mm256_mul_ps(xmm2, xmm2); ++ ++- xmm1 = _mm256_hadd_ps(xmm1, xmm2); ++- xmm1 = _mm256_permutevar8x32_ps(xmm1, idx); +++ xmm1 = _mm256_hadd_ps(xmm1, xmm2); +++ xmm1 = 
_mm256_permutevar8x32_ps(xmm1, idx); ++ ++- xmm3 = _mm256_max_ps(xmm1, xmm3); +++ xmm3 = _mm256_max_ps(xmm1, xmm3); ++ ++- xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS); ++- xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ); +++ xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS); +++ xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ); ++ ++- xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec); ++- xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec); +++ xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec); +++ xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec); ++ ++- xmm9 = _mm256_add_epi32(xmm11, xmm12); +++ xmm9 = _mm256_add_epi32(xmm11, xmm12); ++ ++- xmm8 = _mm256_add_epi32(xmm8, xmm10); ++- } ++- ++- xmm10 = _mm256_set1_epi32(4); ++- if (num_bytes >> 5 & 1) { ++- xmm1 = _mm256_load_ps((float*)src0); ++- ++- xmm1 = _mm256_mul_ps(xmm1, xmm1); +++ xmm8 = _mm256_add_epi32(xmm8, xmm10); +++ } ++ ++- src0 += 4; +++ xmm10 = _mm256_set1_epi32(4); +++ if (num_bytes >> 4 & 1) { +++ xmm1 = _mm256_load_ps((float*)src0); ++ ++- xmm1 = _mm256_hadd_ps(xmm1, xmm1); +++ xmm1 = _mm256_mul_ps(xmm1, xmm1); ++ ++- xmm3 = _mm256_max_ps(xmm1, xmm3); +++ src0 += 4; ++ ++- xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS); ++- xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ); +++ xmm1 = _mm256_hadd_ps(xmm1, xmm1); ++ ++- xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec); ++- xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec); +++ xmm3 = _mm256_max_ps(xmm1, xmm3); ++ ++- xmm9 = _mm256_add_epi32(xmm11, xmm12); +++ xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS); +++ xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ); ++ ++- xmm8 = _mm256_add_epi32(xmm8, xmm10); ++- } +++ xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec); +++ xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec); ++ ++- idx = _mm256_set_epi32(1,0,1,0,1,0,1,0); ++- xmm10 = _mm256_set1_epi32(2); ++- if (num_bytes >> 4 & 1) { ++- xmm2 = _mm256_load_ps((float*)src0); +++ xmm9 = _mm256_add_epi32(xmm11, xmm12); ++ ++- xmm1 = _mm256_permutevar8x32_ps(bit256_p(&xmm8)->float_vec, idx); ++- xmm8 = bit256_p(&xmm1)->int_vec; +++ xmm8 = _mm256_add_epi32(xmm8, xmm10); +++ } ++ ++- xmm2 = _mm256_mul_ps(xmm2, xmm2); +++ idx = _mm256_set_epi32(1, 0, 1, 0, 1, 0, 1, 0); +++ xmm10 = _mm256_set1_epi32(2); +++ if (num_bytes >> 4 & 1) { +++ xmm2 = _mm256_load_ps((float*)src0); ++ ++- src0 += 2; +++ xmm1 = _mm256_permutevar8x32_ps(bit256_p(&xmm8)->float_vec, idx); +++ xmm8 = bit256_p(&xmm1)->int_vec; ++ ++- xmm1 = _mm256_hadd_ps(xmm2, xmm2); +++ xmm2 = _mm256_mul_ps(xmm2, xmm2); ++ ++- xmm3 = _mm256_max_ps(xmm1, xmm3); +++ src0 += 2; ++ ++- xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS); ++- xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ); +++ xmm1 = _mm256_hadd_ps(xmm2, xmm2); ++ ++- xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec); ++- xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec); +++ xmm3 = _mm256_max_ps(xmm1, xmm3); ++ ++- xmm9 = _mm256_add_epi32(xmm11, xmm12); +++ xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS); +++ xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ); ++ ++- xmm8 = _mm256_add_epi32(xmm8, xmm10); ++- } +++ xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec); +++ xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec); ++ ++- _mm256_store_ps((float*)&(holderf.f), xmm3); ++- _mm256_store_si256(&(holderi.int_vec), xmm9); +++ xmm9 = _mm256_add_epi32(xmm11, xmm12); ++ ++- target[0] = holderi.i[0]; ++- sq_dist = holderf.f[0]; ++- target[0] = (holderf.f[1] > sq_dist) ? holderi.i[1] : target[0]; ++- sq_dist = (holderf.f[1] > sq_dist) ? 
holderf.f[1] : sq_dist; ++- target[0] = (holderf.f[2] > sq_dist) ? holderi.i[2] : target[0]; ++- sq_dist = (holderf.f[2] > sq_dist) ? holderf.f[2] : sq_dist; ++- target[0] = (holderf.f[3] > sq_dist) ? holderi.i[3] : target[0]; ++- sq_dist = (holderf.f[3] > sq_dist) ? holderf.f[3] : sq_dist; ++- target[0] = (holderf.f[4] > sq_dist) ? holderi.i[4] : target[0]; ++- sq_dist = (holderf.f[4] > sq_dist) ? holderf.f[4] : sq_dist; ++- target[0] = (holderf.f[5] > sq_dist) ? holderi.i[5] : target[0]; ++- sq_dist = (holderf.f[5] > sq_dist) ? holderf.f[5] : sq_dist; ++- target[0] = (holderf.f[6] > sq_dist) ? holderi.i[6] : target[0]; ++- sq_dist = (holderf.f[6] > sq_dist) ? holderf.f[6] : sq_dist; ++- target[0] = (holderf.f[7] > sq_dist) ? holderi.i[7] : target[0]; ++- sq_dist = (holderf.f[7] > sq_dist) ? holderf.f[7] : sq_dist; +++ xmm8 = _mm256_add_epi32(xmm8, xmm10); +++ } ++ +++ _mm256_store_ps((float*)&(holderf.f), xmm3); +++ _mm256_store_si256(&(holderi.int_vec), xmm9); +++ +++ target[0] = holderi.i[0]; +++ sq_dist = holderf.f[0]; +++ target[0] = (holderf.f[1] > sq_dist) ? holderi.i[1] : target[0]; +++ sq_dist = (holderf.f[1] > sq_dist) ? holderf.f[1] : sq_dist; +++ target[0] = (holderf.f[2] > sq_dist) ? holderi.i[2] : target[0]; +++ sq_dist = (holderf.f[2] > sq_dist) ? holderf.f[2] : sq_dist; +++ target[0] = (holderf.f[3] > sq_dist) ? holderi.i[3] : target[0]; +++ sq_dist = (holderf.f[3] > sq_dist) ? holderf.f[3] : sq_dist; +++ target[0] = (holderf.f[4] > sq_dist) ? holderi.i[4] : target[0]; +++ sq_dist = (holderf.f[4] > sq_dist) ? holderf.f[4] : sq_dist; +++ target[0] = (holderf.f[5] > sq_dist) ? holderi.i[5] : target[0]; +++ sq_dist = (holderf.f[5] > sq_dist) ? holderf.f[5] : sq_dist; +++ target[0] = (holderf.f[6] > sq_dist) ? holderi.i[6] : target[0]; +++ sq_dist = (holderf.f[6] > sq_dist) ? holderf.f[6] : sq_dist; +++ target[0] = (holderf.f[7] > sq_dist) ? holderi.i[7] : target[0]; +++ sq_dist = (holderf.f[7] > sq_dist) ? holderf.f[7] : sq_dist; ++ } ++ ++ #endif /*LV_HAVE_AVX2*/ ++ ++ #ifdef LV_HAVE_SSE3 ++-#include ++-#include +++#include +++#include ++ ++ static inline void ++-volk_32fc_index_max_32u_a_sse3(uint32_t* target, lv_32fc_t* src0, ++- uint32_t num_points) +++volk_32fc_index_max_32u_a_sse3(uint32_t* target, lv_32fc_t* src0, uint32_t num_points) ++ { ++- const uint32_t num_bytes = num_points*8; ++- ++- union bit128 holderf; ++- union bit128 holderi; ++- float sq_dist = 0.0; ++- ++- union bit128 xmm5, xmm4; ++- __m128 xmm1, xmm2, xmm3; ++- __m128i xmm8, xmm11, xmm12, xmmfive, xmmfour, xmm9, holder0, holder1, xmm10; +++ const uint32_t num_bytes = num_points * 8; ++ ++- xmm5.int_vec = xmmfive = _mm_setzero_si128(); ++- xmm4.int_vec = xmmfour = _mm_setzero_si128(); ++- holderf.int_vec = holder0 = _mm_setzero_si128(); ++- holderi.int_vec = holder1 = _mm_setzero_si128(); +++ union bit128 holderf; +++ union bit128 holderi; +++ float sq_dist = 0.0; ++ ++- int bound = num_bytes >> 5; ++- int i = 0; +++ union bit128 xmm5, xmm4; +++ __m128 xmm1, xmm2, xmm3; +++ __m128i xmm8, xmm11, xmm12, xmmfive, xmmfour, xmm9, holder0, holder1, xmm10; ++ ++- xmm8 = _mm_set_epi32(3, 2, 1, 0);//remember the crazy reverse order! 
++- xmm9 = _mm_setzero_si128(); ++- xmm10 = _mm_set_epi32(4, 4, 4, 4); ++- xmm3 = _mm_setzero_ps(); +++ xmm5.int_vec = xmmfive = _mm_setzero_si128(); +++ xmm4.int_vec = xmmfour = _mm_setzero_si128(); +++ holderf.int_vec = holder0 = _mm_setzero_si128(); +++ holderi.int_vec = holder1 = _mm_setzero_si128(); ++ ++- //printf("%f, %f, %f, %f\n", ((float*)&xmm10)[0], ((float*)&xmm10)[1], ((float*)&xmm10)[2], ((float*)&xmm10)[3]); +++ int bound = num_bytes >> 5; +++ int i = 0; ++ ++- for(; i < bound; ++i) { ++- xmm1 = _mm_load_ps((float*)src0); ++- xmm2 = _mm_load_ps((float*)&src0[2]); +++ xmm8 = _mm_set_epi32(3, 2, 1, 0); // remember the crazy reverse order! +++ xmm9 = _mm_setzero_si128(); +++ xmm10 = _mm_set_epi32(4, 4, 4, 4); +++ xmm3 = _mm_setzero_ps(); ++ ++- src0 += 4; +++ // printf("%f, %f, %f, %f\n", ((float*)&xmm10)[0], ((float*)&xmm10)[1], +++ // ((float*)&xmm10)[2], ((float*)&xmm10)[3]); ++ ++- xmm1 = _mm_mul_ps(xmm1, xmm1); ++- xmm2 = _mm_mul_ps(xmm2, xmm2); +++ for (; i < bound; ++i) { +++ xmm1 = _mm_load_ps((float*)src0); +++ xmm2 = _mm_load_ps((float*)&src0[2]); ++ ++- xmm1 = _mm_hadd_ps(xmm1, xmm2); +++ src0 += 4; ++ ++- xmm3 = _mm_max_ps(xmm1, xmm3); +++ xmm1 = _mm_mul_ps(xmm1, xmm1); +++ xmm2 = _mm_mul_ps(xmm2, xmm2); ++ ++- xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3); ++- xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3); +++ xmm1 = _mm_hadd_ps(xmm1, xmm2); ++ ++- xmm11 = _mm_and_si128(xmm8, xmm5.int_vec); ++- xmm12 = _mm_and_si128(xmm9, xmm4.int_vec); +++ xmm3 = _mm_max_ps(xmm1, xmm3); ++ ++- xmm9 = _mm_add_epi32(xmm11, xmm12); +++ xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3); +++ xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3); ++ ++- xmm8 = _mm_add_epi32(xmm8, xmm10); +++ xmm11 = _mm_and_si128(xmm8, xmm5.int_vec); +++ xmm12 = _mm_and_si128(xmm9, xmm4.int_vec); ++ ++- //printf("%f, %f, %f, %f\n", ((float*)&xmm3)[0], ((float*)&xmm3)[1], ((float*)&xmm3)[2], ((float*)&xmm3)[3]); ++- //printf("%u, %u, %u, %u\n", ((uint32_t*)&xmm10)[0], ((uint32_t*)&xmm10)[1], ((uint32_t*)&xmm10)[2], ((uint32_t*)&xmm10)[3]); ++- } +++ xmm9 = _mm_add_epi32(xmm11, xmm12); ++ +++ xmm8 = _mm_add_epi32(xmm8, xmm10); ++ ++- if (num_bytes >> 4 & 1) { ++- xmm2 = _mm_load_ps((float*)src0); ++- ++- xmm1 = _mm_movelh_ps(bit128_p(&xmm8)->float_vec, bit128_p(&xmm8)->float_vec); ++- xmm8 = bit128_p(&xmm1)->int_vec; ++- ++- xmm2 = _mm_mul_ps(xmm2, xmm2); ++- ++- src0 += 2; ++- ++- xmm1 = _mm_hadd_ps(xmm2, xmm2); +++ // printf("%f, %f, %f, %f\n", ((float*)&xmm3)[0], ((float*)&xmm3)[1], +++ // ((float*)&xmm3)[2], ((float*)&xmm3)[3]); printf("%u, %u, %u, %u\n", +++ // ((uint32_t*)&xmm10)[0], ((uint32_t*)&xmm10)[1], ((uint32_t*)&xmm10)[2], +++ // ((uint32_t*)&xmm10)[3]); +++ } ++ ++- xmm3 = _mm_max_ps(xmm1, xmm3); ++ ++- xmm10 = _mm_set_epi32(2, 2, 2, 2);//load1_ps((float*)&init[2]); +++ if (num_bytes >> 4 & 1) { +++ xmm2 = _mm_load_ps((float*)src0); ++ ++- xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3); ++- xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3); +++ xmm1 = _mm_movelh_ps(bit128_p(&xmm8)->float_vec, bit128_p(&xmm8)->float_vec); +++ xmm8 = bit128_p(&xmm1)->int_vec; ++ ++- xmm11 = _mm_and_si128(xmm8, xmm5.int_vec); ++- xmm12 = _mm_and_si128(xmm9, xmm4.int_vec); +++ xmm2 = _mm_mul_ps(xmm2, xmm2); ++ ++- xmm9 = _mm_add_epi32(xmm11, xmm12); +++ src0 += 2; ++ ++- xmm8 = _mm_add_epi32(xmm8, xmm10); ++- //printf("egads%u, %u, %u, %u\n", ((uint32_t*)&xmm9)[0], ((uint32_t*)&xmm9)[1], ((uint32_t*)&xmm9)[2], ((uint32_t*)&xmm9)[3]); ++- } +++ xmm1 = _mm_hadd_ps(xmm2, xmm2); ++ ++- if (num_bytes >> 3 & 1) { ++- //printf("%u, %u, %u, %u\n", 
((uint32_t*)&xmm9)[0], ((uint32_t*)&xmm9)[1], ((uint32_t*)&xmm9)[2], ((uint32_t*)&xmm9)[3]); +++ xmm3 = _mm_max_ps(xmm1, xmm3); ++ ++- sq_dist = lv_creal(src0[0]) * lv_creal(src0[0]) + lv_cimag(src0[0]) * lv_cimag(src0[0]); +++ xmm10 = _mm_set_epi32(2, 2, 2, 2); // load1_ps((float*)&init[2]); ++ ++- xmm2 = _mm_load1_ps(&sq_dist); +++ xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3); +++ xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3); ++ ++- xmm1 = xmm3; +++ xmm11 = _mm_and_si128(xmm8, xmm5.int_vec); +++ xmm12 = _mm_and_si128(xmm9, xmm4.int_vec); ++ ++- xmm3 = _mm_max_ss(xmm3, xmm2); +++ xmm9 = _mm_add_epi32(xmm11, xmm12); ++ ++- xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3); ++- xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3); +++ xmm8 = _mm_add_epi32(xmm8, xmm10); +++ // printf("egads%u, %u, %u, %u\n", ((uint32_t*)&xmm9)[0], ((uint32_t*)&xmm9)[1], +++ // ((uint32_t*)&xmm9)[2], ((uint32_t*)&xmm9)[3]); +++ } ++ ++- xmm8 = _mm_shuffle_epi32(xmm8, 0x00); +++ if (num_bytes >> 3 & 1) { +++ // printf("%u, %u, %u, %u\n", ((uint32_t*)&xmm9)[0], ((uint32_t*)&xmm9)[1], +++ // ((uint32_t*)&xmm9)[2], ((uint32_t*)&xmm9)[3]); ++ ++- xmm11 = _mm_and_si128(xmm8, xmm4.int_vec); ++- xmm12 = _mm_and_si128(xmm9, xmm5.int_vec); +++ sq_dist = +++ lv_creal(src0[0]) * lv_creal(src0[0]) + lv_cimag(src0[0]) * lv_cimag(src0[0]); ++ ++- xmm9 = _mm_add_epi32(xmm11, xmm12); ++- } +++ xmm2 = _mm_load1_ps(&sq_dist); ++ ++- //printf("%f, %f, %f, %f\n", ((float*)&xmm3)[0], ((float*)&xmm3)[1], ((float*)&xmm3)[2], ((float*)&xmm3)[3]); ++- //printf("%u, %u, %u, %u\n", ((uint32_t*)&xmm9)[0], ((uint32_t*)&xmm9)[1], ((uint32_t*)&xmm9)[2], ((uint32_t*)&xmm9)[3]); +++ xmm1 = xmm3; ++ ++- _mm_store_ps((float*)&(holderf.f), xmm3); ++- _mm_store_si128(&(holderi.int_vec), xmm9); +++ xmm3 = _mm_max_ss(xmm3, xmm2); ++ ++- target[0] = holderi.i[0]; ++- sq_dist = holderf.f[0]; ++- target[0] = (holderf.f[1] > sq_dist) ? holderi.i[1] : target[0]; ++- sq_dist = (holderf.f[1] > sq_dist) ? holderf.f[1] : sq_dist; ++- target[0] = (holderf.f[2] > sq_dist) ? holderi.i[2] : target[0]; ++- sq_dist = (holderf.f[2] > sq_dist) ? holderf.f[2] : sq_dist; ++- target[0] = (holderf.f[3] > sq_dist) ? holderi.i[3] : target[0]; ++- sq_dist = (holderf.f[3] > sq_dist) ? 
holderf.f[3] : sq_dist; +++ xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3); +++ xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3); ++ ++- /* ++- float placeholder = 0.0; ++- uint32_t temp0, temp1; ++- uint32_t g0 = (((float*)&xmm3)[0] > ((float*)&xmm3)[1]); ++- uint32_t l0 = g0 ^ 1; +++ xmm8 = _mm_shuffle_epi32(xmm8, 0x00); ++ ++- uint32_t g1 = (((float*)&xmm3)[1] > ((float*)&xmm3)[2]); ++- uint32_t l1 = g1 ^ 1; +++ xmm11 = _mm_and_si128(xmm8, xmm4.int_vec); +++ xmm12 = _mm_and_si128(xmm9, xmm5.int_vec); ++ ++- temp0 = g0 * ((uint32_t*)&xmm9)[0] + l0 * ((uint32_t*)&xmm9)[1]; ++- temp1 = g0 * ((uint32_t*)&xmm9)[2] + l0 * ((uint32_t*)&xmm9)[3]; ++- sq_dist = g0 * ((float*)&xmm3)[0] + l0 * ((float*)&xmm3)[1]; ++- placeholder = g0 * ((float*)&xmm3)[2] + l0 * ((float*)&xmm3)[3]; +++ xmm9 = _mm_add_epi32(xmm11, xmm12); +++ } ++ ++- g0 = (sq_dist > placeholder); ++- l0 = g0 ^ 1; ++- target[0] = g0 * temp0 + l0 * temp1; ++- */ +++ // printf("%f, %f, %f, %f\n", ((float*)&xmm3)[0], ((float*)&xmm3)[1], +++ // ((float*)&xmm3)[2], ((float*)&xmm3)[3]); printf("%u, %u, %u, %u\n", +++ // ((uint32_t*)&xmm9)[0], ((uint32_t*)&xmm9)[1], ((uint32_t*)&xmm9)[2], +++ // ((uint32_t*)&xmm9)[3]); +++ +++ _mm_store_ps((float*)&(holderf.f), xmm3); +++ _mm_store_si128(&(holderi.int_vec), xmm9); +++ +++ target[0] = holderi.i[0]; +++ sq_dist = holderf.f[0]; +++ target[0] = (holderf.f[1] > sq_dist) ? holderi.i[1] : target[0]; +++ sq_dist = (holderf.f[1] > sq_dist) ? holderf.f[1] : sq_dist; +++ target[0] = (holderf.f[2] > sq_dist) ? holderi.i[2] : target[0]; +++ sq_dist = (holderf.f[2] > sq_dist) ? holderf.f[2] : sq_dist; +++ target[0] = (holderf.f[3] > sq_dist) ? holderi.i[3] : target[0]; +++ sq_dist = (holderf.f[3] > sq_dist) ? holderf.f[3] : sq_dist; +++ +++ /* +++ float placeholder = 0.0; +++ uint32_t temp0, temp1; +++ uint32_t g0 = (((float*)&xmm3)[0] > ((float*)&xmm3)[1]); +++ uint32_t l0 = g0 ^ 1; +++ +++ uint32_t g1 = (((float*)&xmm3)[1] > ((float*)&xmm3)[2]); +++ uint32_t l1 = g1 ^ 1; +++ +++ temp0 = g0 * ((uint32_t*)&xmm9)[0] + l0 * ((uint32_t*)&xmm9)[1]; +++ temp1 = g0 * ((uint32_t*)&xmm9)[2] + l0 * ((uint32_t*)&xmm9)[3]; +++ sq_dist = g0 * ((float*)&xmm3)[0] + l0 * ((float*)&xmm3)[1]; +++ placeholder = g0 * ((float*)&xmm3)[2] + l0 * ((float*)&xmm3)[3]; +++ +++ g0 = (sq_dist > placeholder); +++ l0 = g0 ^ 1; +++ target[0] = g0 * temp0 + l0 * temp1; +++ */ ++ } ++ ++ #endif /*LV_HAVE_SSE3*/ ++ ++ #ifdef LV_HAVE_GENERIC ++ static inline void ++- volk_32fc_index_max_32u_generic(uint32_t* target, lv_32fc_t* src0, ++- uint32_t num_points) +++volk_32fc_index_max_32u_generic(uint32_t* target, lv_32fc_t* src0, uint32_t num_points) ++ { ++- const uint32_t num_bytes = num_points*8; +++ const uint32_t num_bytes = num_points * 8; ++ ++- float sq_dist = 0.0; ++- float max = 0.0; ++- uint32_t index = 0; +++ float sq_dist = 0.0; +++ float max = 0.0; +++ uint32_t index = 0; ++ ++- uint32_t i = 0; +++ uint32_t i = 0; ++ ++- for(; i < num_bytes >> 3; ++i) { ++- sq_dist = lv_creal(src0[i]) * lv_creal(src0[i]) + lv_cimag(src0[i]) * lv_cimag(src0[i]); +++ for (; i> 3; ++i) { +++ sq_dist = +++ lv_creal(src0[i]) * lv_creal(src0[i]) + lv_cimag(src0[i]) * lv_cimag(src0[i]); ++ ++- index = sq_dist > max ? i : index; ++- max = sq_dist > max ? sq_dist : max; ++- } ++- target[0] = index; +++ index = sq_dist > max ? i : index; +++ max = sq_dist > max ? 
sq_dist : max; +++ } +++ target[0] = index; ++ } ++ ++ #endif /*LV_HAVE_GENERIC*/ ++@@ -384,137 +389,135 @@ static inline void ++ #ifndef INCLUDED_volk_32fc_index_max_32u_u_H ++ #define INCLUDED_volk_32fc_index_max_32u_u_H ++ +++#include +++#include ++ #include ++-#include ++-#include ++-#include +++#include ++ ++ #ifdef LV_HAVE_AVX2 ++-#include +++#include ++ ++ static inline void ++-volk_32fc_index_max_32u_u_avx2(uint32_t* target, lv_32fc_t* src0, ++- uint32_t num_points) +++volk_32fc_index_max_32u_u_avx2(uint32_t* target, lv_32fc_t* src0, uint32_t num_points) ++ { ++- const uint32_t num_bytes = num_points*8; ++- ++- union bit256 holderf; ++- union bit256 holderi; ++- float sq_dist = 0.0; +++ const uint32_t num_bytes = num_points * 8; ++ ++- union bit256 xmm5, xmm4; ++- __m256 xmm1, xmm2, xmm3; ++- __m256i xmm8, xmm11, xmm12, xmmfive, xmmfour, xmm9, holder0, holder1, xmm10; +++ union bit256 holderf; +++ union bit256 holderi; +++ float sq_dist = 0.0; ++ ++- xmm5.int_vec = xmmfive = _mm256_setzero_si256(); ++- xmm4.int_vec = xmmfour = _mm256_setzero_si256(); ++- holderf.int_vec = holder0 = _mm256_setzero_si256(); ++- holderi.int_vec = holder1 = _mm256_setzero_si256(); +++ union bit256 xmm5, xmm4; +++ __m256 xmm1, xmm2, xmm3; +++ __m256i xmm8, xmm11, xmm12, xmmfive, xmmfour, xmm9, holder0, holder1, xmm10; ++ ++- int bound = num_bytes >> 6; ++- int i = 0; +++ xmm5.int_vec = xmmfive = _mm256_setzero_si256(); +++ xmm4.int_vec = xmmfour = _mm256_setzero_si256(); +++ holderf.int_vec = holder0 = _mm256_setzero_si256(); +++ holderi.int_vec = holder1 = _mm256_setzero_si256(); ++ ++- xmm8 = _mm256_set_epi32(7,6,5,4,3, 2, 1, 0); ++- xmm9 = _mm256_setzero_si256(); ++- xmm10 = _mm256_set1_epi32(8); ++- xmm3 = _mm256_setzero_ps(); ++- __m256i idx = _mm256_set_epi32(7,6,3,2,5,4,1,0); +++ int bound = num_bytes >> 6; +++ int i = 0; ++ ++- for(; i < bound; ++i) { ++- xmm1 = _mm256_loadu_ps((float*)src0); ++- xmm2 = _mm256_loadu_ps((float*)&src0[4]); +++ xmm8 = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0); +++ xmm9 = _mm256_setzero_si256(); +++ xmm10 = _mm256_set1_epi32(8); +++ xmm3 = _mm256_setzero_ps(); +++ __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0); ++ ++- src0 += 8; +++ for (; i < bound; ++i) { +++ xmm1 = _mm256_loadu_ps((float*)src0); +++ xmm2 = _mm256_loadu_ps((float*)&src0[4]); ++ ++- xmm1 = _mm256_mul_ps(xmm1, xmm1); ++- xmm2 = _mm256_mul_ps(xmm2, xmm2); +++ src0 += 8; ++ ++- xmm1 = _mm256_hadd_ps(xmm1, xmm2); ++- xmm1 = _mm256_permutevar8x32_ps(xmm1, idx); +++ xmm1 = _mm256_mul_ps(xmm1, xmm1); +++ xmm2 = _mm256_mul_ps(xmm2, xmm2); ++ ++- xmm3 = _mm256_max_ps(xmm1, xmm3); +++ xmm1 = _mm256_hadd_ps(xmm1, xmm2); +++ xmm1 = _mm256_permutevar8x32_ps(xmm1, idx); ++ ++- xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS); ++- xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ); +++ xmm3 = _mm256_max_ps(xmm1, xmm3); ++ ++- xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec); ++- xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec); +++ xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS); +++ xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ); ++ ++- xmm9 = _mm256_add_epi32(xmm11, xmm12); +++ xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec); +++ xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec); ++ ++- xmm8 = _mm256_add_epi32(xmm8, xmm10); ++- } +++ xmm9 = _mm256_add_epi32(xmm11, xmm12); ++ ++- xmm10 = _mm256_set1_epi32(4); ++- if (num_bytes >> 5 & 1) { ++- xmm1 = _mm256_loadu_ps((float*)src0); ++- ++- xmm1 = _mm256_mul_ps(xmm1, xmm1); +++ xmm8 = _mm256_add_epi32(xmm8, xmm10); +++ } ++ ++- src0 += 4; +++ xmm10 
= _mm256_set1_epi32(4); +++ if (num_bytes >> 4 & 1) { +++ xmm1 = _mm256_loadu_ps((float*)src0); ++ ++- xmm1 = _mm256_hadd_ps(xmm1, xmm1); +++ xmm1 = _mm256_mul_ps(xmm1, xmm1); ++ ++- xmm3 = _mm256_max_ps(xmm1, xmm3); +++ src0 += 4; ++ ++- xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS); ++- xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ); +++ xmm1 = _mm256_hadd_ps(xmm1, xmm1); ++ ++- xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec); ++- xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec); +++ xmm3 = _mm256_max_ps(xmm1, xmm3); ++ ++- xmm9 = _mm256_add_epi32(xmm11, xmm12); +++ xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS); +++ xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ); ++ ++- xmm8 = _mm256_add_epi32(xmm8, xmm10); ++- } +++ xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec); +++ xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec); ++ ++- idx = _mm256_set_epi32(1,0,1,0,1,0,1,0); ++- xmm10 = _mm256_set1_epi32(2); ++- if (num_bytes >> 4 & 1) { ++- xmm2 = _mm256_loadu_ps((float*)src0); +++ xmm9 = _mm256_add_epi32(xmm11, xmm12); ++ ++- xmm1 = _mm256_permutevar8x32_ps(bit256_p(&xmm8)->float_vec, idx); ++- xmm8 = bit256_p(&xmm1)->int_vec; +++ xmm8 = _mm256_add_epi32(xmm8, xmm10); +++ } ++ ++- xmm2 = _mm256_mul_ps(xmm2, xmm2); +++ idx = _mm256_set_epi32(1, 0, 1, 0, 1, 0, 1, 0); +++ xmm10 = _mm256_set1_epi32(2); +++ if (num_bytes >> 4 & 1) { +++ xmm2 = _mm256_loadu_ps((float*)src0); ++ ++- src0 += 2; +++ xmm1 = _mm256_permutevar8x32_ps(bit256_p(&xmm8)->float_vec, idx); +++ xmm8 = bit256_p(&xmm1)->int_vec; ++ ++- xmm1 = _mm256_hadd_ps(xmm2, xmm2); +++ xmm2 = _mm256_mul_ps(xmm2, xmm2); ++ ++- xmm3 = _mm256_max_ps(xmm1, xmm3); +++ src0 += 2; ++ ++- xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS); ++- xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ); +++ xmm1 = _mm256_hadd_ps(xmm2, xmm2); ++ ++- xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec); ++- xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec); +++ xmm3 = _mm256_max_ps(xmm1, xmm3); ++ ++- xmm9 = _mm256_add_epi32(xmm11, xmm12); +++ xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS); +++ xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ); ++ ++- xmm8 = _mm256_add_epi32(xmm8, xmm10); ++- } +++ xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec); +++ xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec); ++ ++- _mm256_storeu_ps((float*)&(holderf.f), xmm3); ++- _mm256_storeu_si256(&(holderi.int_vec), xmm9); +++ xmm9 = _mm256_add_epi32(xmm11, xmm12); ++ ++- target[0] = holderi.i[0]; ++- sq_dist = holderf.f[0]; ++- target[0] = (holderf.f[1] > sq_dist) ? holderi.i[1] : target[0]; ++- sq_dist = (holderf.f[1] > sq_dist) ? holderf.f[1] : sq_dist; ++- target[0] = (holderf.f[2] > sq_dist) ? holderi.i[2] : target[0]; ++- sq_dist = (holderf.f[2] > sq_dist) ? holderf.f[2] : sq_dist; ++- target[0] = (holderf.f[3] > sq_dist) ? holderi.i[3] : target[0]; ++- sq_dist = (holderf.f[3] > sq_dist) ? holderf.f[3] : sq_dist; ++- target[0] = (holderf.f[4] > sq_dist) ? holderi.i[4] : target[0]; ++- sq_dist = (holderf.f[4] > sq_dist) ? holderf.f[4] : sq_dist; ++- target[0] = (holderf.f[5] > sq_dist) ? holderi.i[5] : target[0]; ++- sq_dist = (holderf.f[5] > sq_dist) ? holderf.f[5] : sq_dist; ++- target[0] = (holderf.f[6] > sq_dist) ? holderi.i[6] : target[0]; ++- sq_dist = (holderf.f[6] > sq_dist) ? holderf.f[6] : sq_dist; ++- target[0] = (holderf.f[7] > sq_dist) ? holderi.i[7] : target[0]; ++- sq_dist = (holderf.f[7] > sq_dist) ? 
holderf.f[7] : sq_dist; +++ xmm8 = _mm256_add_epi32(xmm8, xmm10); +++ } ++ +++ _mm256_storeu_ps((float*)&(holderf.f), xmm3); +++ _mm256_storeu_si256(&(holderi.int_vec), xmm9); +++ +++ target[0] = holderi.i[0]; +++ sq_dist = holderf.f[0]; +++ target[0] = (holderf.f[1] > sq_dist) ? holderi.i[1] : target[0]; +++ sq_dist = (holderf.f[1] > sq_dist) ? holderf.f[1] : sq_dist; +++ target[0] = (holderf.f[2] > sq_dist) ? holderi.i[2] : target[0]; +++ sq_dist = (holderf.f[2] > sq_dist) ? holderf.f[2] : sq_dist; +++ target[0] = (holderf.f[3] > sq_dist) ? holderi.i[3] : target[0]; +++ sq_dist = (holderf.f[3] > sq_dist) ? holderf.f[3] : sq_dist; +++ target[0] = (holderf.f[4] > sq_dist) ? holderi.i[4] : target[0]; +++ sq_dist = (holderf.f[4] > sq_dist) ? holderf.f[4] : sq_dist; +++ target[0] = (holderf.f[5] > sq_dist) ? holderi.i[5] : target[0]; +++ sq_dist = (holderf.f[5] > sq_dist) ? holderf.f[5] : sq_dist; +++ target[0] = (holderf.f[6] > sq_dist) ? holderi.i[6] : target[0]; +++ sq_dist = (holderf.f[6] > sq_dist) ? holderf.f[6] : sq_dist; +++ target[0] = (holderf.f[7] > sq_dist) ? holderi.i[7] : target[0]; +++ sq_dist = (holderf.f[7] > sq_dist) ? holderf.f[7] : sq_dist; ++ } ++ ++ #endif /*LV_HAVE_AVX2*/ ++@@ -523,29 +526,29 @@ volk_32fc_index_max_32u_u_avx2(uint32_t* target, lv_32fc_t* src0, ++ #include ++ #include ++ ++-static inline void volk_32fc_index_max_32u_neon(uint32_t* target, lv_32fc_t* src0, uint32_t num_points) +++static inline void +++volk_32fc_index_max_32u_neon(uint32_t* target, lv_32fc_t* src0, uint32_t num_points) ++ { ++ unsigned int number = 0; ++ const uint32_t quarter_points = num_points / 4; ++ const lv_32fc_t* src0Ptr = src0; ++- ++- uint32_t indices[4] = {0, 1, 2, 3}; +++ +++ uint32_t indices[4] = { 0, 1, 2, 3 }; ++ const uint32x4_t vec_indices_incr = vdupq_n_u32(4); ++ uint32x4_t vec_indices = vld1q_u32(indices); ++ uint32x4_t vec_max_indices = vec_indices; ++- ++- if(num_points) ++- { +++ +++ if (num_points) { ++ float max = *src0Ptr; ++ uint32_t index = 0; ++- +++ ++ float32x4_t vec_max = vdupq_n_f32(*src0Ptr); ++- ++- for(;number < quarter_points; number++) ++- { +++ +++ for (; number < quarter_points; number++) { ++ // Load complex and compute magnitude squared ++- const float32x4_t vec_mag2 = _vmagnitudesquaredq_f32(vld2q_f32((float*)src0Ptr)); ++- __VOLK_PREFETCH(src0Ptr+=4); +++ const float32x4_t vec_mag2 = +++ _vmagnitudesquaredq_f32(vld2q_f32((float*)src0Ptr)); +++ __VOLK_PREFETCH(src0Ptr += 4); ++ // a > b? 
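/*
 * Editorial note, not part of the patch: the NEON index-max variant reaches the same
 * result with a bitwise select instead of the cmp/and/add trick.  In the lines just
 * below, vcgtq_f32 produces an all-ones lane mask wherever the new magnitude squared
 * beats the running maximum, and vbslq_f32 takes the new value in exactly those lanes:
 *
 *     gt  = (mag2 > max) ? 0xFFFFFFFF : 0;    (per lane)
 *     max = gt ? mag2 : max;                  (vbslq_f32(gt_mask, vec_mag2, vec_max))
 *
 * vec_indices advances by vec_indices_incr (4) each iteration so every lane knows
 * which sample it is looking at; presumably the winning indices are selected with the
 * same mask (that statement falls outside this hunk), and the four per-lane winners
 * are reduced to one (max, index) pair in the scalar epilogue that follows.
 */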
++ const uint32x4_t gt_mask = vcgtq_f32(vec_mag2, vec_max); ++ vec_max = vbslq_f32(gt_mask, vec_mag2, vec_max); ++@@ -556,20 +559,19 @@ static inline void volk_32fc_index_max_32u_neon(uint32_t* target, lv_32fc_t* src ++ float tmp_max[4]; ++ vst1q_u32(tmp_max_indices, vec_max_indices); ++ vst1q_f32(tmp_max, vec_max); ++- +++ ++ for (int i = 0; i < 4; i++) { ++ if (tmp_max[i] > max) { ++ max = tmp_max[i]; ++ index = tmp_max_indices[i]; ++ } ++ } ++- +++ ++ // Deal with the rest ++- for(number = quarter_points * 4;number < num_points; number++) ++- { +++ for (number = quarter_points * 4; number < num_points; number++) { ++ const float re = lv_creal(*src0Ptr); ++ const float im = lv_cimag(*src0Ptr); ++- if ((re*re+im*im) > max) { +++ if ((re * re + im * im) > max) { ++ max = *src0Ptr; ++ index = number; ++ } ++diff --git a/kernels/volk/volk_32fc_magnitude_32f.h b/kernels/volk/volk_32fc_magnitude_32f.h ++index 1ba6871..6a0a7d8 100644 ++--- a/kernels/volk/volk_32fc_magnitude_32f.h +++++ b/kernels/volk/volk_32fc_magnitude_32f.h ++@@ -30,8 +30,8 @@ ++ * ++ * Dispatcher Prototype ++ * \code ++- * void volk_32fc_magnitude_32f(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points) ++- * \endcode +++ * void volk_32fc_magnitude_32f(float* magnitudeVector, const lv_32fc_t* complexVector, +++ * unsigned int num_points) \endcode ++ * ++ * \b Inputs ++ * \li complexVector: The complex input vector. ++@@ -72,41 +72,41 @@ ++ #define INCLUDED_volk_32fc_magnitude_32f_u_H ++ ++ #include ++-#include ++ #include +++#include ++ ++ #ifdef LV_HAVE_AVX ++ #include ++ #include ++ ++-static inline void ++-volk_32fc_magnitude_32f_u_avx(float* magnitudeVector, const lv_32fc_t* complexVector, ++- unsigned int num_points) +++static inline void volk_32fc_magnitude_32f_u_avx(float* magnitudeVector, +++ const lv_32fc_t* complexVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int eighthPoints = num_points / 8; ++- ++- const float* complexVectorPtr = (float*) complexVector; ++- float* magnitudeVectorPtr = magnitudeVector; ++- ++- __m256 cplxValue1, cplxValue2, result; ++- ++- for(; number < eighthPoints; number++){ ++- cplxValue1 = _mm256_loadu_ps(complexVectorPtr); ++- cplxValue2 = _mm256_loadu_ps(complexVectorPtr + 8); ++- result = _mm256_magnitude_ps(cplxValue1, cplxValue2); ++- _mm256_storeu_ps(magnitudeVectorPtr, result); ++- ++- complexVectorPtr += 16; ++- magnitudeVectorPtr += 8; ++- } ++- ++- number = eighthPoints * 8; ++- for(; number < num_points; number++){ ++- float val1Real = *complexVectorPtr++; ++- float val1Imag = *complexVectorPtr++; ++- *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)); ++- } +++ unsigned int number = 0; +++ const unsigned int eighthPoints = num_points / 8; +++ +++ const float* complexVectorPtr = (float*)complexVector; +++ float* magnitudeVectorPtr = magnitudeVector; +++ +++ __m256 cplxValue1, cplxValue2, result; +++ +++ for (; number < eighthPoints; number++) { +++ cplxValue1 = _mm256_loadu_ps(complexVectorPtr); +++ cplxValue2 = _mm256_loadu_ps(complexVectorPtr + 8); +++ result = _mm256_magnitude_ps(cplxValue1, cplxValue2); +++ _mm256_storeu_ps(magnitudeVectorPtr, result); +++ +++ complexVectorPtr += 16; +++ magnitudeVectorPtr += 8; +++ } +++ +++ number = eighthPoints * 8; +++ for (; number < num_points; number++) { +++ float val1Real = *complexVectorPtr++; +++ float val1Imag = *complexVectorPtr++; +++ *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)); +++ } ++ } ++ #endif 
/* LV_HAVE_AVX */ ++ ++@@ -114,137 +114,137 @@ volk_32fc_magnitude_32f_u_avx(float* magnitudeVector, const lv_32fc_t* complexVe ++ #include ++ #include ++ ++-static inline void ++-volk_32fc_magnitude_32f_u_sse3(float* magnitudeVector, const lv_32fc_t* complexVector, ++- unsigned int num_points) +++static inline void volk_32fc_magnitude_32f_u_sse3(float* magnitudeVector, +++ const lv_32fc_t* complexVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int quarterPoints = num_points / 4; +++ unsigned int number = 0; +++ const unsigned int quarterPoints = num_points / 4; ++ ++- const float* complexVectorPtr = (float*) complexVector; ++- float* magnitudeVectorPtr = magnitudeVector; +++ const float* complexVectorPtr = (float*)complexVector; +++ float* magnitudeVectorPtr = magnitudeVector; ++ ++- __m128 cplxValue1, cplxValue2, result; ++- for(; number < quarterPoints; number++){ ++- cplxValue1 = _mm_loadu_ps(complexVectorPtr); ++- complexVectorPtr += 4; +++ __m128 cplxValue1, cplxValue2, result; +++ for (; number < quarterPoints; number++) { +++ cplxValue1 = _mm_loadu_ps(complexVectorPtr); +++ complexVectorPtr += 4; ++ ++- cplxValue2 = _mm_loadu_ps(complexVectorPtr); ++- complexVectorPtr += 4; +++ cplxValue2 = _mm_loadu_ps(complexVectorPtr); +++ complexVectorPtr += 4; ++ ++- result = _mm_magnitude_ps_sse3(cplxValue1, cplxValue2); +++ result = _mm_magnitude_ps_sse3(cplxValue1, cplxValue2); ++ ++- _mm_storeu_ps(magnitudeVectorPtr, result); ++- magnitudeVectorPtr += 4; ++- } +++ _mm_storeu_ps(magnitudeVectorPtr, result); +++ magnitudeVectorPtr += 4; +++ } ++ ++- number = quarterPoints * 4; ++- for(; number < num_points; number++){ ++- float val1Real = *complexVectorPtr++; ++- float val1Imag = *complexVectorPtr++; ++- *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)); ++- } +++ number = quarterPoints * 4; +++ for (; number < num_points; number++) { +++ float val1Real = *complexVectorPtr++; +++ float val1Imag = *complexVectorPtr++; +++ *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)); +++ } ++ } ++ #endif /* LV_HAVE_SSE3 */ ++ ++ ++ #ifdef LV_HAVE_SSE ++-#include ++ #include +++#include ++ ++-static inline void ++-volk_32fc_magnitude_32f_u_sse(float* magnitudeVector, const lv_32fc_t* complexVector, ++- unsigned int num_points) +++static inline void volk_32fc_magnitude_32f_u_sse(float* magnitudeVector, +++ const lv_32fc_t* complexVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int quarterPoints = num_points / 4; +++ unsigned int number = 0; +++ const unsigned int quarterPoints = num_points / 4; ++ ++- const float* complexVectorPtr = (float*) complexVector; ++- float* magnitudeVectorPtr = magnitudeVector; +++ const float* complexVectorPtr = (float*)complexVector; +++ float* magnitudeVectorPtr = magnitudeVector; ++ ++- __m128 cplxValue1, cplxValue2, result; +++ __m128 cplxValue1, cplxValue2, result; ++ ++- for(; number < quarterPoints; number++){ ++- cplxValue1 = _mm_loadu_ps(complexVectorPtr); ++- complexVectorPtr += 4; +++ for (; number < quarterPoints; number++) { +++ cplxValue1 = _mm_loadu_ps(complexVectorPtr); +++ complexVectorPtr += 4; ++ ++- cplxValue2 = _mm_loadu_ps(complexVectorPtr); ++- complexVectorPtr += 4; +++ cplxValue2 = _mm_loadu_ps(complexVectorPtr); +++ complexVectorPtr += 4; ++ ++- result = _mm_magnitude_ps(cplxValue1, cplxValue2); ++- _mm_storeu_ps(magnitudeVectorPtr, result); ++- magnitudeVectorPtr += 4; ++- } +++ result = _mm_magnitude_ps(cplxValue1, 
cplxValue2); +++ _mm_storeu_ps(magnitudeVectorPtr, result); +++ magnitudeVectorPtr += 4; +++ } ++ ++- number = quarterPoints * 4; ++- for(; number < num_points; number++){ ++- float val1Real = *complexVectorPtr++; ++- float val1Imag = *complexVectorPtr++; ++- *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)); ++- } +++ number = quarterPoints * 4; +++ for (; number < num_points; number++) { +++ float val1Real = *complexVectorPtr++; +++ float val1Imag = *complexVectorPtr++; +++ *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)); +++ } ++ } ++ #endif /* LV_HAVE_SSE */ ++ ++ ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void ++-volk_32fc_magnitude_32f_generic(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points) +++static inline void volk_32fc_magnitude_32f_generic(float* magnitudeVector, +++ const lv_32fc_t* complexVector, +++ unsigned int num_points) ++ { ++- const float* complexVectorPtr = (float*)complexVector; ++- float* magnitudeVectorPtr = magnitudeVector; ++- unsigned int number = 0; ++- for(number = 0; number < num_points; number++){ ++- const float real = *complexVectorPtr++; ++- const float imag = *complexVectorPtr++; ++- *magnitudeVectorPtr++ = sqrtf((real*real) + (imag*imag)); ++- } +++ const float* complexVectorPtr = (float*)complexVector; +++ float* magnitudeVectorPtr = magnitudeVector; +++ unsigned int number = 0; +++ for (number = 0; number < num_points; number++) { +++ const float real = *complexVectorPtr++; +++ const float imag = *complexVectorPtr++; +++ *magnitudeVectorPtr++ = sqrtf((real * real) + (imag * imag)); +++ } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++ ++- ++ #endif /* INCLUDED_volk_32fc_magnitude_32f_u_H */ ++ #ifndef INCLUDED_volk_32fc_magnitude_32f_a_H ++ #define INCLUDED_volk_32fc_magnitude_32f_a_H ++ ++ #include ++-#include ++ #include +++#include ++ ++ #ifdef LV_HAVE_AVX ++ #include ++ #include ++ ++-static inline void ++-volk_32fc_magnitude_32f_a_avx(float* magnitudeVector, const lv_32fc_t* complexVector, ++- unsigned int num_points) +++static inline void volk_32fc_magnitude_32f_a_avx(float* magnitudeVector, +++ const lv_32fc_t* complexVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int eighthPoints = num_points / 8; ++- ++- const float* complexVectorPtr = (float*) complexVector; ++- float* magnitudeVectorPtr = magnitudeVector; ++- ++- __m256 cplxValue1, cplxValue2, result; ++- for(; number < eighthPoints; number++){ ++- cplxValue1 = _mm256_load_ps(complexVectorPtr); ++- complexVectorPtr += 8; ++- ++- cplxValue2 = _mm256_load_ps(complexVectorPtr); ++- complexVectorPtr += 8; ++- ++- result = _mm256_magnitude_ps(cplxValue1, cplxValue2); ++- _mm256_store_ps(magnitudeVectorPtr, result); ++- magnitudeVectorPtr += 8; ++- } ++- ++- number = eighthPoints * 8; ++- for(; number < num_points; number++){ ++- float val1Real = *complexVectorPtr++; ++- float val1Imag = *complexVectorPtr++; ++- *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)); ++- } +++ unsigned int number = 0; +++ const unsigned int eighthPoints = num_points / 8; +++ +++ const float* complexVectorPtr = (float*)complexVector; +++ float* magnitudeVectorPtr = magnitudeVector; +++ +++ __m256 cplxValue1, cplxValue2, result; +++ for (; number < eighthPoints; number++) { +++ cplxValue1 = _mm256_load_ps(complexVectorPtr); +++ complexVectorPtr += 8; +++ +++ cplxValue2 = _mm256_load_ps(complexVectorPtr); +++ complexVectorPtr += 8; +++ +++ result = 
_mm256_magnitude_ps(cplxValue1, cplxValue2); +++ _mm256_store_ps(magnitudeVectorPtr, result); +++ magnitudeVectorPtr += 8; +++ } +++ +++ number = eighthPoints * 8; +++ for (; number < num_points; number++) { +++ float val1Real = *complexVectorPtr++; +++ float val1Imag = *complexVectorPtr++; +++ *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)); +++ } ++ } ++ #endif /* LV_HAVE_AVX */ ++ ++@@ -252,89 +252,89 @@ volk_32fc_magnitude_32f_a_avx(float* magnitudeVector, const lv_32fc_t* complexVe ++ #include ++ #include ++ ++-static inline void ++-volk_32fc_magnitude_32f_a_sse3(float* magnitudeVector, const lv_32fc_t* complexVector, ++- unsigned int num_points) +++static inline void volk_32fc_magnitude_32f_a_sse3(float* magnitudeVector, +++ const lv_32fc_t* complexVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int quarterPoints = num_points / 4; ++- ++- const float* complexVectorPtr = (float*) complexVector; ++- float* magnitudeVectorPtr = magnitudeVector; ++- ++- __m128 cplxValue1, cplxValue2, result; ++- for(; number < quarterPoints; number++){ ++- cplxValue1 = _mm_load_ps(complexVectorPtr); ++- complexVectorPtr += 4; ++- ++- cplxValue2 = _mm_load_ps(complexVectorPtr); ++- complexVectorPtr += 4; ++- ++- result = _mm_magnitude_ps_sse3(cplxValue1, cplxValue2); ++- _mm_store_ps(magnitudeVectorPtr, result); ++- magnitudeVectorPtr += 4; ++- } ++- ++- number = quarterPoints * 4; ++- for(; number < num_points; number++){ ++- float val1Real = *complexVectorPtr++; ++- float val1Imag = *complexVectorPtr++; ++- *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)); ++- } +++ unsigned int number = 0; +++ const unsigned int quarterPoints = num_points / 4; +++ +++ const float* complexVectorPtr = (float*)complexVector; +++ float* magnitudeVectorPtr = magnitudeVector; +++ +++ __m128 cplxValue1, cplxValue2, result; +++ for (; number < quarterPoints; number++) { +++ cplxValue1 = _mm_load_ps(complexVectorPtr); +++ complexVectorPtr += 4; +++ +++ cplxValue2 = _mm_load_ps(complexVectorPtr); +++ complexVectorPtr += 4; +++ +++ result = _mm_magnitude_ps_sse3(cplxValue1, cplxValue2); +++ _mm_store_ps(magnitudeVectorPtr, result); +++ magnitudeVectorPtr += 4; +++ } +++ +++ number = quarterPoints * 4; +++ for (; number < num_points; number++) { +++ float val1Real = *complexVectorPtr++; +++ float val1Imag = *complexVectorPtr++; +++ *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)); +++ } ++ } ++ #endif /* LV_HAVE_SSE3 */ ++ ++ #ifdef LV_HAVE_SSE ++-#include ++ #include +++#include ++ ++-static inline void ++-volk_32fc_magnitude_32f_a_sse(float* magnitudeVector, const lv_32fc_t* complexVector, ++- unsigned int num_points) +++static inline void volk_32fc_magnitude_32f_a_sse(float* magnitudeVector, +++ const lv_32fc_t* complexVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int quarterPoints = num_points / 4; ++- ++- const float* complexVectorPtr = (float*) complexVector; ++- float* magnitudeVectorPtr = magnitudeVector; ++- ++- __m128 cplxValue1, cplxValue2, result; ++- for(; number < quarterPoints; number++){ ++- cplxValue1 = _mm_load_ps(complexVectorPtr); ++- complexVectorPtr += 4; ++- ++- cplxValue2 = _mm_load_ps(complexVectorPtr); ++- complexVectorPtr += 4; ++- ++- result = _mm_magnitude_ps(cplxValue1, cplxValue2); ++- _mm_store_ps(magnitudeVectorPtr, result); ++- magnitudeVectorPtr += 4; ++- } ++- ++- number = quarterPoints * 4; ++- for(; number < num_points; 
number++){ ++- float val1Real = *complexVectorPtr++; ++- float val1Imag = *complexVectorPtr++; ++- *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)); ++- } +++ unsigned int number = 0; +++ const unsigned int quarterPoints = num_points / 4; +++ +++ const float* complexVectorPtr = (float*)complexVector; +++ float* magnitudeVectorPtr = magnitudeVector; +++ +++ __m128 cplxValue1, cplxValue2, result; +++ for (; number < quarterPoints; number++) { +++ cplxValue1 = _mm_load_ps(complexVectorPtr); +++ complexVectorPtr += 4; +++ +++ cplxValue2 = _mm_load_ps(complexVectorPtr); +++ complexVectorPtr += 4; +++ +++ result = _mm_magnitude_ps(cplxValue1, cplxValue2); +++ _mm_store_ps(magnitudeVectorPtr, result); +++ magnitudeVectorPtr += 4; +++ } +++ +++ number = quarterPoints * 4; +++ for (; number < num_points; number++) { +++ float val1Real = *complexVectorPtr++; +++ float val1Imag = *complexVectorPtr++; +++ *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)); +++ } ++ } ++ #endif /* LV_HAVE_SSE */ ++ ++ ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void ++-volk_32fc_magnitude_32f_a_generic(float* magnitudeVector, const lv_32fc_t* complexVector, ++- unsigned int num_points) +++static inline void volk_32fc_magnitude_32f_a_generic(float* magnitudeVector, +++ const lv_32fc_t* complexVector, +++ unsigned int num_points) ++ { ++- const float* complexVectorPtr = (float*)complexVector; ++- float* magnitudeVectorPtr = magnitudeVector; ++- unsigned int number = 0; ++- for(number = 0; number < num_points; number++){ ++- const float real = *complexVectorPtr++; ++- const float imag = *complexVectorPtr++; ++- *magnitudeVectorPtr++ = sqrtf((real*real) + (imag*imag)); ++- } +++ const float* complexVectorPtr = (float*)complexVector; +++ float* magnitudeVectorPtr = magnitudeVector; +++ unsigned int number = 0; +++ for (number = 0; number < num_points; number++) { +++ const float real = *complexVectorPtr++; +++ const float imag = *complexVectorPtr++; +++ *magnitudeVectorPtr++ = sqrtf((real * real) + (imag * imag)); +++ } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++@@ -342,41 +342,43 @@ volk_32fc_magnitude_32f_a_generic(float* magnitudeVector, const lv_32fc_t* compl ++ #ifdef LV_HAVE_NEON ++ #include ++ ++-static inline void ++-volk_32fc_magnitude_32f_neon(float* magnitudeVector, const lv_32fc_t* complexVector, ++- unsigned int num_points) +++static inline void volk_32fc_magnitude_32f_neon(float* magnitudeVector, +++ const lv_32fc_t* complexVector, +++ unsigned int num_points) ++ { ++- unsigned int number; ++- unsigned int quarter_points = num_points / 4; ++- const float* complexVectorPtr = (float*)complexVector; ++- float* magnitudeVectorPtr = magnitudeVector; ++- ++- float32x4x2_t complex_vec; ++- float32x4_t magnitude_vec; ++- for(number = 0; number < quarter_points; number++){ ++- complex_vec = vld2q_f32(complexVectorPtr); ++- complex_vec.val[0] = vmulq_f32(complex_vec.val[0], complex_vec.val[0]); ++- magnitude_vec = vmlaq_f32(complex_vec.val[0], complex_vec.val[1], complex_vec.val[1]); ++- magnitude_vec = vrsqrteq_f32(magnitude_vec); ++- magnitude_vec = vrecpeq_f32( magnitude_vec ); // no plain ol' sqrt ++- vst1q_f32(magnitudeVectorPtr, magnitude_vec); ++- ++- complexVectorPtr += 8; ++- magnitudeVectorPtr += 4; ++- } ++- ++- for(number = quarter_points*4; number < num_points; number++){ ++- const float real = *complexVectorPtr++; ++- const float imag = *complexVectorPtr++; ++- *magnitudeVectorPtr++ = sqrtf((real*real) + (imag*imag)); ++- } +++ unsigned int number; +++ 
unsigned int quarter_points = num_points / 4; +++ const float* complexVectorPtr = (float*)complexVector; +++ float* magnitudeVectorPtr = magnitudeVector; +++ +++ float32x4x2_t complex_vec; +++ float32x4_t magnitude_vec; +++ for (number = 0; number < quarter_points; number++) { +++ complex_vec = vld2q_f32(complexVectorPtr); +++ complex_vec.val[0] = vmulq_f32(complex_vec.val[0], complex_vec.val[0]); +++ magnitude_vec = +++ vmlaq_f32(complex_vec.val[0], complex_vec.val[1], complex_vec.val[1]); +++ magnitude_vec = vrsqrteq_f32(magnitude_vec); +++ magnitude_vec = vrecpeq_f32(magnitude_vec); // no plain ol' sqrt +++ vst1q_f32(magnitudeVectorPtr, magnitude_vec); +++ +++ complexVectorPtr += 8; +++ magnitudeVectorPtr += 4; +++ } +++ +++ for (number = quarter_points * 4; number < num_points; number++) { +++ const float real = *complexVectorPtr++; +++ const float imag = *complexVectorPtr++; +++ *magnitudeVectorPtr++ = sqrtf((real * real) + (imag * imag)); +++ } ++ } ++ #endif /* LV_HAVE_NEON */ ++ ++ ++ #ifdef LV_HAVE_NEON ++ /*! ++- \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector +++ \brief Calculates the magnitude of the complexVector and stores the results in the +++ magnitudeVector ++ ++ This is an approximation from "Streamlining Digital Signal Processing" by ++ Richard Lyons. Apparently max error is about 1% and mean error is about 0.6%. ++@@ -387,80 +389,80 @@ volk_32fc_magnitude_32f_neon(float* magnitudeVector, const lv_32fc_t* complexVec ++ ++ \param complexVector The vector containing the complex input values ++ \param magnitudeVector The vector containing the real output values ++- \param num_points The number of complex values in complexVector to be calculated and stored into cVector +++ \param num_points The number of complex values in complexVector to be calculated and +++ stored into cVector ++ */ ++-static inline void ++-volk_32fc_magnitude_32f_neon_fancy_sweet(float* magnitudeVector, const lv_32fc_t* complexVector, ++- unsigned int num_points) +++static inline void volk_32fc_magnitude_32f_neon_fancy_sweet( +++ float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points) ++ { ++- unsigned int number; ++- unsigned int quarter_points = num_points / 4; ++- const float* complexVectorPtr = (float*)complexVector; ++- float* magnitudeVectorPtr = magnitudeVector; ++- ++- const float threshold = 0.4142135; ++- ++- float32x4_t a_vec, b_vec, a_high, a_low, b_high, b_low; ++- a_high = vdupq_n_f32( 0.84 ); ++- b_high = vdupq_n_f32( 0.561); ++- a_low = vdupq_n_f32( 0.99 ); ++- b_low = vdupq_n_f32( 0.197); ++- ++- uint32x4_t comp0, comp1; ++- ++- float32x4x2_t complex_vec; ++- float32x4_t min_vec, max_vec, magnitude_vec; ++- float32x4_t real_abs, imag_abs; ++- for(number = 0; number < quarter_points; number++){ ++- complex_vec = vld2q_f32(complexVectorPtr); ++- ++- real_abs = vabsq_f32(complex_vec.val[0]); ++- imag_abs = vabsq_f32(complex_vec.val[1]); ++- ++- min_vec = vminq_f32(real_abs, imag_abs); ++- max_vec = vmaxq_f32(real_abs, imag_abs); ++- ++- // effective branch to choose coefficient pair. 
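/*
 * Editorial note, not part of the patch: volk_32fc_magnitude_32f_neon above never
 * takes a square root directly.  It relies on the identity sqrt(x) = 1 / (1 / sqrt(x))
 * and evaluates both steps with the hardware estimate instructions only:
 *
 *     x   = re * re + im * im;
 *     |z| is approximated by vrecpeq_f32(vrsqrteq_f32(x));
 *
 * No Newton-Raphson refinement iteration is applied to either estimate, so the result
 * is fast but of limited precision, which is why the scalar tail of the same kernel
 * still calls sqrtf().
 */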
++- comp0 = vcgtq_f32(min_vec, vmulq_n_f32(max_vec, threshold)); ++- comp1 = vcleq_f32(min_vec, vmulq_n_f32(max_vec, threshold)); ++- ++- // and 0s or 1s with coefficients from previous effective branch ++- a_vec = (float32x4_t)vaddq_s32(vandq_s32((int32x4_t)comp0, (int32x4_t)a_high), ++- vandq_s32((int32x4_t)comp1, (int32x4_t)a_low)); ++- b_vec = (float32x4_t)vaddq_s32(vandq_s32((int32x4_t)comp0, (int32x4_t)b_high), ++- vandq_s32((int32x4_t)comp1, (int32x4_t)b_low)); ++- ++- // coefficients chosen, do the weighted sum ++- min_vec = vmulq_f32(min_vec, b_vec); ++- max_vec = vmulq_f32(max_vec, a_vec); ++- ++- magnitude_vec = vaddq_f32(min_vec, max_vec); ++- vst1q_f32(magnitudeVectorPtr, magnitude_vec); ++- ++- complexVectorPtr += 8; ++- magnitudeVectorPtr += 4; ++- } ++- ++- for(number = quarter_points*4; number < num_points; number++){ ++- const float real = *complexVectorPtr++; ++- const float imag = *complexVectorPtr++; ++- *magnitudeVectorPtr++ = sqrtf((real*real) + (imag*imag)); ++- } +++ unsigned int number; +++ unsigned int quarter_points = num_points / 4; +++ const float* complexVectorPtr = (float*)complexVector; +++ float* magnitudeVectorPtr = magnitudeVector; +++ +++ const float threshold = 0.4142135; +++ +++ float32x4_t a_vec, b_vec, a_high, a_low, b_high, b_low; +++ a_high = vdupq_n_f32(0.84); +++ b_high = vdupq_n_f32(0.561); +++ a_low = vdupq_n_f32(0.99); +++ b_low = vdupq_n_f32(0.197); +++ +++ uint32x4_t comp0, comp1; +++ +++ float32x4x2_t complex_vec; +++ float32x4_t min_vec, max_vec, magnitude_vec; +++ float32x4_t real_abs, imag_abs; +++ for (number = 0; number < quarter_points; number++) { +++ complex_vec = vld2q_f32(complexVectorPtr); +++ +++ real_abs = vabsq_f32(complex_vec.val[0]); +++ imag_abs = vabsq_f32(complex_vec.val[1]); +++ +++ min_vec = vminq_f32(real_abs, imag_abs); +++ max_vec = vmaxq_f32(real_abs, imag_abs); +++ +++ // effective branch to choose coefficient pair. 
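/*
 * Editorial worked example, not part of the patch, for the Lyons "alpha max plus
 * beta min" approximation implemented by this kernel.  Using the constants from the
 * code above (threshold 0.4142135, which is sqrt(2) - 1, i.e. tan(22.5 degrees);
 * "high" pair a = 0.84, b = 0.561; "low" pair a = 0.99, b = 0.197), a sample
 * z = 3 + 4j gives:
 *
 *     max = 4, min = 3;  min > 0.4142135 * max (about 1.657), so the high pair is chosen
 *     |z| is approximated by 0.84 * 4 + 0.561 * 3 = 3.36 + 1.683 = 5.043 (exact value 5.0)
 *
 * an error of roughly 0.9 %, consistent with the "max error is about 1%" quoted in
 * the kernel documentation.  The comp0/comp1 masks and the and/add combination below
 * apply the same branch-free selection idea used in the SSE kernels.
 */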
+++ comp0 = vcgtq_f32(min_vec, vmulq_n_f32(max_vec, threshold)); +++ comp1 = vcleq_f32(min_vec, vmulq_n_f32(max_vec, threshold)); +++ +++ // and 0s or 1s with coefficients from previous effective branch +++ a_vec = (float32x4_t)vaddq_s32(vandq_s32((int32x4_t)comp0, (int32x4_t)a_high), +++ vandq_s32((int32x4_t)comp1, (int32x4_t)a_low)); +++ b_vec = (float32x4_t)vaddq_s32(vandq_s32((int32x4_t)comp0, (int32x4_t)b_high), +++ vandq_s32((int32x4_t)comp1, (int32x4_t)b_low)); +++ +++ // coefficients chosen, do the weighted sum +++ min_vec = vmulq_f32(min_vec, b_vec); +++ max_vec = vmulq_f32(max_vec, a_vec); +++ +++ magnitude_vec = vaddq_f32(min_vec, max_vec); +++ vst1q_f32(magnitudeVectorPtr, magnitude_vec); +++ +++ complexVectorPtr += 8; +++ magnitudeVectorPtr += 4; +++ } +++ +++ for (number = quarter_points * 4; number < num_points; number++) { +++ const float real = *complexVectorPtr++; +++ const float imag = *complexVectorPtr++; +++ *magnitudeVectorPtr++ = sqrtf((real * real) + (imag * imag)); +++ } ++ } ++ #endif /* LV_HAVE_NEON */ ++ ++ ++ #ifdef LV_HAVE_ORC ++ ++-extern void ++-volk_32fc_magnitude_32f_a_orc_impl(float* magnitudeVector, const lv_32fc_t* complexVector, ++- unsigned int num_points); +++extern void volk_32fc_magnitude_32f_a_orc_impl(float* magnitudeVector, +++ const lv_32fc_t* complexVector, +++ unsigned int num_points); ++ ++-static inline void ++-volk_32fc_magnitude_32f_u_orc(float* magnitudeVector, const lv_32fc_t* complexVector, ++- unsigned int num_points) +++static inline void volk_32fc_magnitude_32f_u_orc(float* magnitudeVector, +++ const lv_32fc_t* complexVector, +++ unsigned int num_points) ++ { ++- volk_32fc_magnitude_32f_a_orc_impl(magnitudeVector, complexVector, num_points); +++ volk_32fc_magnitude_32f_a_orc_impl(magnitudeVector, complexVector, num_points); ++ } ++ #endif /* LV_HAVE_ORC */ ++ ++diff --git a/kernels/volk/volk_32fc_magnitude_squared_32f.h b/kernels/volk/volk_32fc_magnitude_squared_32f.h ++index 51bb4df..cb093ca 100644 ++--- a/kernels/volk/volk_32fc_magnitude_squared_32f.h +++++ b/kernels/volk/volk_32fc_magnitude_squared_32f.h ++@@ -30,8 +30,8 @@ ++ * ++ * Dispatcher Prototype ++ * \code ++- * void volk_32fc_magnitude_squared_32f(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points) ++- * \endcode +++ * void volk_32fc_magnitude_squared_32f(float* magnitudeVector, const lv_32fc_t* +++ * complexVector, unsigned int num_points) \endcode ++ * ++ * \b Inputs ++ * \li complexVector: The complex input vector. 
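/*
 * Editorial usage sketch, not part of the patch, for the dispatcher whose prototype
 * is documented above.  The buffer name and the wrapper function are illustrative;
 * volk_malloc, volk_get_alignment and volk_free are the standard VOLK allocation
 * helpers, and an aligned buffer lets the dispatcher pick an aligned (_a_)
 * implementation when one is available for the running CPU.
 */
#include <volk/volk.h>

static void magnitude_squared_example(const lv_32fc_t* samples, unsigned int num_points)
{
    /* allocate the output on a VOLK-aligned boundary */
    float* mag2 = (float*)volk_malloc(num_points * sizeof(float), volk_get_alignment());

    /* mag2[i] = re(samples[i])^2 + im(samples[i])^2 */
    volk_32fc_magnitude_squared_32f(mag2, samples, num_points);

    /* ... consume mag2 ... */
    volk_free(mag2);
}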
++@@ -72,41 +72,41 @@ ++ #define INCLUDED_volk_32fc_magnitude_squared_32f_u_H ++ ++ #include ++-#include ++ #include +++#include ++ ++ #ifdef LV_HAVE_AVX ++ #include ++ #include ++ ++-static inline void ++-volk_32fc_magnitude_squared_32f_u_avx(float* magnitudeVector, const lv_32fc_t* complexVector, ++- unsigned int num_points) +++static inline void volk_32fc_magnitude_squared_32f_u_avx(float* magnitudeVector, +++ const lv_32fc_t* complexVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int eighthPoints = num_points / 8; ++- ++- const float* complexVectorPtr = (float*) complexVector; ++- float* magnitudeVectorPtr = magnitudeVector; ++- ++- __m256 cplxValue1, cplxValue2, result; ++- ++- for(; number < eighthPoints; number++){ ++- cplxValue1 = _mm256_loadu_ps(complexVectorPtr); ++- cplxValue2 = _mm256_loadu_ps(complexVectorPtr + 8); ++- result = _mm256_magnitudesquared_ps(cplxValue1, cplxValue2); ++- _mm256_storeu_ps(magnitudeVectorPtr, result); ++- ++- complexVectorPtr += 16; ++- magnitudeVectorPtr += 8; ++- } ++- ++- number = eighthPoints * 8; ++- for(; number < num_points; number++){ ++- float val1Real = *complexVectorPtr++; ++- float val1Imag = *complexVectorPtr++; ++- *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag); ++- } +++ unsigned int number = 0; +++ const unsigned int eighthPoints = num_points / 8; +++ +++ const float* complexVectorPtr = (float*)complexVector; +++ float* magnitudeVectorPtr = magnitudeVector; +++ +++ __m256 cplxValue1, cplxValue2, result; +++ +++ for (; number < eighthPoints; number++) { +++ cplxValue1 = _mm256_loadu_ps(complexVectorPtr); +++ cplxValue2 = _mm256_loadu_ps(complexVectorPtr + 8); +++ result = _mm256_magnitudesquared_ps(cplxValue1, cplxValue2); +++ _mm256_storeu_ps(magnitudeVectorPtr, result); +++ +++ complexVectorPtr += 16; +++ magnitudeVectorPtr += 8; +++ } +++ +++ number = eighthPoints * 8; +++ for (; number < num_points; number++) { +++ float val1Real = *complexVectorPtr++; +++ float val1Imag = *complexVectorPtr++; +++ *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag); +++ } ++ } ++ #endif /* LV_HAVE_AVX */ ++ ++@@ -115,137 +115,136 @@ volk_32fc_magnitude_squared_32f_u_avx(float* magnitudeVector, const lv_32fc_t* c ++ #include ++ #include ++ ++-static inline void ++-volk_32fc_magnitude_squared_32f_u_sse3(float* magnitudeVector, const lv_32fc_t* complexVector, ++- unsigned int num_points) +++static inline void volk_32fc_magnitude_squared_32f_u_sse3(float* magnitudeVector, +++ const lv_32fc_t* complexVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int quarterPoints = num_points / 4; ++- ++- const float* complexVectorPtr = (float*) complexVector; ++- float* magnitudeVectorPtr = magnitudeVector; ++- ++- __m128 cplxValue1, cplxValue2, result; ++- for(; number < quarterPoints; number++){ ++- cplxValue1 = _mm_loadu_ps(complexVectorPtr); ++- complexVectorPtr += 4; ++- ++- cplxValue2 = _mm_loadu_ps(complexVectorPtr); ++- complexVectorPtr += 4; ++- ++- result = _mm_magnitudesquared_ps_sse3(cplxValue1, cplxValue2); ++- _mm_storeu_ps(magnitudeVectorPtr, result); ++- magnitudeVectorPtr += 4; ++- } ++- ++- number = quarterPoints * 4; ++- for(; number < num_points; number++){ ++- float val1Real = *complexVectorPtr++; ++- float val1Imag = *complexVectorPtr++; ++- *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag); ++- } +++ unsigned int number = 0; +++ const unsigned int quarterPoints = num_points / 4; +++ +++ const float* 
complexVectorPtr = (float*)complexVector; +++ float* magnitudeVectorPtr = magnitudeVector; +++ +++ __m128 cplxValue1, cplxValue2, result; +++ for (; number < quarterPoints; number++) { +++ cplxValue1 = _mm_loadu_ps(complexVectorPtr); +++ complexVectorPtr += 4; +++ +++ cplxValue2 = _mm_loadu_ps(complexVectorPtr); +++ complexVectorPtr += 4; +++ +++ result = _mm_magnitudesquared_ps_sse3(cplxValue1, cplxValue2); +++ _mm_storeu_ps(magnitudeVectorPtr, result); +++ magnitudeVectorPtr += 4; +++ } +++ +++ number = quarterPoints * 4; +++ for (; number < num_points; number++) { +++ float val1Real = *complexVectorPtr++; +++ float val1Imag = *complexVectorPtr++; +++ *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag); +++ } ++ } ++ #endif /* LV_HAVE_SSE3 */ ++ ++ ++ #ifdef LV_HAVE_SSE ++-#include ++ #include +++#include ++ ++-static inline void ++-volk_32fc_magnitude_squared_32f_u_sse(float* magnitudeVector, const lv_32fc_t* complexVector, ++- unsigned int num_points) +++static inline void volk_32fc_magnitude_squared_32f_u_sse(float* magnitudeVector, +++ const lv_32fc_t* complexVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int quarterPoints = num_points / 4; +++ unsigned int number = 0; +++ const unsigned int quarterPoints = num_points / 4; ++ ++- const float* complexVectorPtr = (float*) complexVector; ++- float* magnitudeVectorPtr = magnitudeVector; +++ const float* complexVectorPtr = (float*)complexVector; +++ float* magnitudeVectorPtr = magnitudeVector; ++ ++- __m128 cplxValue1, cplxValue2, result; +++ __m128 cplxValue1, cplxValue2, result; ++ ++- for(; number < quarterPoints; number++){ ++- cplxValue1 = _mm_loadu_ps(complexVectorPtr); ++- complexVectorPtr += 4; +++ for (; number < quarterPoints; number++) { +++ cplxValue1 = _mm_loadu_ps(complexVectorPtr); +++ complexVectorPtr += 4; ++ ++- cplxValue2 = _mm_loadu_ps(complexVectorPtr); ++- complexVectorPtr += 4; +++ cplxValue2 = _mm_loadu_ps(complexVectorPtr); +++ complexVectorPtr += 4; ++ ++- result = _mm_magnitudesquared_ps(cplxValue1, cplxValue2); ++- _mm_storeu_ps(magnitudeVectorPtr, result); ++- magnitudeVectorPtr += 4; ++- } +++ result = _mm_magnitudesquared_ps(cplxValue1, cplxValue2); +++ _mm_storeu_ps(magnitudeVectorPtr, result); +++ magnitudeVectorPtr += 4; +++ } ++ ++- number = quarterPoints * 4; ++- for(; number < num_points; number++){ ++- float val1Real = *complexVectorPtr++; ++- float val1Imag = *complexVectorPtr++; ++- *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag); ++- } +++ number = quarterPoints * 4; +++ for (; number < num_points; number++) { +++ float val1Real = *complexVectorPtr++; +++ float val1Imag = *complexVectorPtr++; +++ *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag); +++ } ++ } ++ #endif /* LV_HAVE_SSE */ ++ ++ ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void ++-volk_32fc_magnitude_squared_32f_generic(float* magnitudeVector, const lv_32fc_t* complexVector, ++- unsigned int num_points) +++static inline void volk_32fc_magnitude_squared_32f_generic(float* magnitudeVector, +++ const lv_32fc_t* complexVector, +++ unsigned int num_points) ++ { ++- const float* complexVectorPtr = (float*)complexVector; ++- float* magnitudeVectorPtr = magnitudeVector; ++- unsigned int number = 0; ++- for(number = 0; number < num_points; number++){ ++- const float real = *complexVectorPtr++; ++- const float imag = *complexVectorPtr++; ++- *magnitudeVectorPtr++ = (real*real) + (imag*imag); ++- } +++ const float* complexVectorPtr = 
(float*)complexVector; +++ float* magnitudeVectorPtr = magnitudeVector; +++ unsigned int number = 0; +++ for (number = 0; number < num_points; number++) { +++ const float real = *complexVectorPtr++; +++ const float imag = *complexVectorPtr++; +++ *magnitudeVectorPtr++ = (real * real) + (imag * imag); +++ } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++ ++- ++ #endif /* INCLUDED_volk_32fc_magnitude_32f_u_H */ ++ #ifndef INCLUDED_volk_32fc_magnitude_squared_32f_a_H ++ #define INCLUDED_volk_32fc_magnitude_squared_32f_a_H ++ ++ #include ++-#include ++ #include +++#include ++ ++ #ifdef LV_HAVE_AVX ++ #include ++ #include ++ ++-static inline void ++-volk_32fc_magnitude_squared_32f_a_avx(float* magnitudeVector, const lv_32fc_t* complexVector, ++- unsigned int num_points) +++static inline void volk_32fc_magnitude_squared_32f_a_avx(float* magnitudeVector, +++ const lv_32fc_t* complexVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int eighthPoints = num_points / 8; ++- ++- const float* complexVectorPtr = (float*) complexVector; ++- float* magnitudeVectorPtr = magnitudeVector; ++- ++- __m256 cplxValue1, cplxValue2, result; ++- for(; number < eighthPoints; number++){ ++- cplxValue1 = _mm256_load_ps(complexVectorPtr); ++- complexVectorPtr += 8; ++- ++- cplxValue2 = _mm256_load_ps(complexVectorPtr); ++- complexVectorPtr += 8; ++- ++- result = _mm256_magnitudesquared_ps(cplxValue1, cplxValue2); ++- _mm256_store_ps(magnitudeVectorPtr, result); ++- magnitudeVectorPtr += 8; ++- } ++- ++- number = eighthPoints * 8; ++- for(; number < num_points; number++){ ++- float val1Real = *complexVectorPtr++; ++- float val1Imag = *complexVectorPtr++; ++- *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag); ++- } +++ unsigned int number = 0; +++ const unsigned int eighthPoints = num_points / 8; +++ +++ const float* complexVectorPtr = (float*)complexVector; +++ float* magnitudeVectorPtr = magnitudeVector; +++ +++ __m256 cplxValue1, cplxValue2, result; +++ for (; number < eighthPoints; number++) { +++ cplxValue1 = _mm256_load_ps(complexVectorPtr); +++ complexVectorPtr += 8; +++ +++ cplxValue2 = _mm256_load_ps(complexVectorPtr); +++ complexVectorPtr += 8; +++ +++ result = _mm256_magnitudesquared_ps(cplxValue1, cplxValue2); +++ _mm256_store_ps(magnitudeVectorPtr, result); +++ magnitudeVectorPtr += 8; +++ } +++ +++ number = eighthPoints * 8; +++ for (; number < num_points; number++) { +++ float val1Real = *complexVectorPtr++; +++ float val1Imag = *complexVectorPtr++; +++ *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag); +++ } ++ } ++ #endif /* LV_HAVE_AVX */ ++ ++@@ -254,72 +253,72 @@ volk_32fc_magnitude_squared_32f_a_avx(float* magnitudeVector, const lv_32fc_t* c ++ #include ++ #include ++ ++-static inline void ++-volk_32fc_magnitude_squared_32f_a_sse3(float* magnitudeVector, const lv_32fc_t* complexVector, ++- unsigned int num_points) +++static inline void volk_32fc_magnitude_squared_32f_a_sse3(float* magnitudeVector, +++ const lv_32fc_t* complexVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int quarterPoints = num_points / 4; ++- ++- const float* complexVectorPtr = (float*) complexVector; ++- float* magnitudeVectorPtr = magnitudeVector; ++- ++- __m128 cplxValue1, cplxValue2, result; ++- for(; number < quarterPoints; number++){ ++- cplxValue1 = _mm_load_ps(complexVectorPtr); ++- complexVectorPtr += 4; ++- ++- cplxValue2 = _mm_load_ps(complexVectorPtr); ++- complexVectorPtr += 4; ++- ++- result = 
_mm_magnitudesquared_ps_sse3(cplxValue1, cplxValue2); ++- _mm_store_ps(magnitudeVectorPtr, result); ++- magnitudeVectorPtr += 4; ++- } ++- ++- number = quarterPoints * 4; ++- for(; number < num_points; number++){ ++- float val1Real = *complexVectorPtr++; ++- float val1Imag = *complexVectorPtr++; ++- *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag); ++- } +++ unsigned int number = 0; +++ const unsigned int quarterPoints = num_points / 4; +++ +++ const float* complexVectorPtr = (float*)complexVector; +++ float* magnitudeVectorPtr = magnitudeVector; +++ +++ __m128 cplxValue1, cplxValue2, result; +++ for (; number < quarterPoints; number++) { +++ cplxValue1 = _mm_load_ps(complexVectorPtr); +++ complexVectorPtr += 4; +++ +++ cplxValue2 = _mm_load_ps(complexVectorPtr); +++ complexVectorPtr += 4; +++ +++ result = _mm_magnitudesquared_ps_sse3(cplxValue1, cplxValue2); +++ _mm_store_ps(magnitudeVectorPtr, result); +++ magnitudeVectorPtr += 4; +++ } +++ +++ number = quarterPoints * 4; +++ for (; number < num_points; number++) { +++ float val1Real = *complexVectorPtr++; +++ float val1Imag = *complexVectorPtr++; +++ *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag); +++ } ++ } ++ #endif /* LV_HAVE_SSE3 */ ++ ++ ++ #ifdef LV_HAVE_SSE ++-#include ++ #include +++#include ++ ++-static inline void ++-volk_32fc_magnitude_squared_32f_a_sse(float* magnitudeVector, const lv_32fc_t* complexVector, ++- unsigned int num_points) +++static inline void volk_32fc_magnitude_squared_32f_a_sse(float* magnitudeVector, +++ const lv_32fc_t* complexVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int quarterPoints = num_points / 4; ++- ++- const float* complexVectorPtr = (float*)complexVector; ++- float* magnitudeVectorPtr = magnitudeVector; ++- ++- __m128 cplxValue1, cplxValue2, result; ++- for(;number < quarterPoints; number++){ ++- cplxValue1 = _mm_load_ps(complexVectorPtr); ++- complexVectorPtr += 4; ++- ++- cplxValue2 = _mm_load_ps(complexVectorPtr); ++- complexVectorPtr += 4; ++- ++- result = _mm_magnitudesquared_ps(cplxValue1, cplxValue2); ++- _mm_store_ps(magnitudeVectorPtr, result); ++- magnitudeVectorPtr += 4; ++- } ++- ++- number = quarterPoints * 4; ++- for(; number < num_points; number++){ ++- float val1Real = *complexVectorPtr++; ++- float val1Imag = *complexVectorPtr++; ++- *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag); ++- } +++ unsigned int number = 0; +++ const unsigned int quarterPoints = num_points / 4; +++ +++ const float* complexVectorPtr = (float*)complexVector; +++ float* magnitudeVectorPtr = magnitudeVector; +++ +++ __m128 cplxValue1, cplxValue2, result; +++ for (; number < quarterPoints; number++) { +++ cplxValue1 = _mm_load_ps(complexVectorPtr); +++ complexVectorPtr += 4; +++ +++ cplxValue2 = _mm_load_ps(complexVectorPtr); +++ complexVectorPtr += 4; +++ +++ result = _mm_magnitudesquared_ps(cplxValue1, cplxValue2); +++ _mm_store_ps(magnitudeVectorPtr, result); +++ magnitudeVectorPtr += 4; +++ } +++ +++ number = quarterPoints * 4; +++ for (; number < num_points; number++) { +++ float val1Real = *complexVectorPtr++; +++ float val1Imag = *complexVectorPtr++; +++ *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag); +++ } ++ } ++ #endif /* LV_HAVE_SSE */ ++ ++@@ -327,55 +326,57 @@ volk_32fc_magnitude_squared_32f_a_sse(float* magnitudeVector, const lv_32fc_t* c ++ #ifdef LV_HAVE_NEON ++ #include ++ ++-static inline void ++-volk_32fc_magnitude_squared_32f_neon(float* magnitudeVector, 
const lv_32fc_t* complexVector, ++- unsigned int num_points) +++static inline void volk_32fc_magnitude_squared_32f_neon(float* magnitudeVector, +++ const lv_32fc_t* complexVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int quarterPoints = num_points / 4; ++- ++- const float* complexVectorPtr = (float*)complexVector; ++- float* magnitudeVectorPtr = magnitudeVector; ++- ++- float32x4x2_t cmplx_val; ++- float32x4_t result; ++- for(;number < quarterPoints; number++){ ++- cmplx_val = vld2q_f32(complexVectorPtr); ++- complexVectorPtr += 8; ++- ++- cmplx_val.val[0] = vmulq_f32(cmplx_val.val[0], cmplx_val.val[0]); // Square the values ++- cmplx_val.val[1] = vmulq_f32(cmplx_val.val[1], cmplx_val.val[1]); // Square the values ++- ++- result = vaddq_f32(cmplx_val.val[0], cmplx_val.val[1]); // Add the I2 and Q2 values ++- ++- vst1q_f32(magnitudeVectorPtr, result); ++- magnitudeVectorPtr += 4; ++- } ++- ++- number = quarterPoints * 4; ++- for(; number < num_points; number++){ ++- float val1Real = *complexVectorPtr++; ++- float val1Imag = *complexVectorPtr++; ++- *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag); ++- } +++ unsigned int number = 0; +++ const unsigned int quarterPoints = num_points / 4; +++ +++ const float* complexVectorPtr = (float*)complexVector; +++ float* magnitudeVectorPtr = magnitudeVector; +++ +++ float32x4x2_t cmplx_val; +++ float32x4_t result; +++ for (; number < quarterPoints; number++) { +++ cmplx_val = vld2q_f32(complexVectorPtr); +++ complexVectorPtr += 8; +++ +++ cmplx_val.val[0] = +++ vmulq_f32(cmplx_val.val[0], cmplx_val.val[0]); // Square the values +++ cmplx_val.val[1] = +++ vmulq_f32(cmplx_val.val[1], cmplx_val.val[1]); // Square the values +++ +++ result = +++ vaddq_f32(cmplx_val.val[0], cmplx_val.val[1]); // Add the I2 and Q2 values +++ +++ vst1q_f32(magnitudeVectorPtr, result); +++ magnitudeVectorPtr += 4; +++ } +++ +++ number = quarterPoints * 4; +++ for (; number < num_points; number++) { +++ float val1Real = *complexVectorPtr++; +++ float val1Imag = *complexVectorPtr++; +++ *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag); +++ } ++ } ++ #endif /* LV_HAVE_NEON */ ++ ++ ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void ++-volk_32fc_magnitude_squared_32f_a_generic(float* magnitudeVector, const lv_32fc_t* complexVector, ++- unsigned int num_points) +++static inline void volk_32fc_magnitude_squared_32f_a_generic( +++ float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points) ++ { ++- const float* complexVectorPtr = (float*)complexVector; ++- float* magnitudeVectorPtr = magnitudeVector; ++- unsigned int number = 0; ++- for(number = 0; number < num_points; number++){ ++- const float real = *complexVectorPtr++; ++- const float imag = *complexVectorPtr++; ++- *magnitudeVectorPtr++ = (real*real) + (imag*imag); ++- } +++ const float* complexVectorPtr = (float*)complexVector; +++ float* magnitudeVectorPtr = magnitudeVector; +++ unsigned int number = 0; +++ for (number = 0; number < num_points; number++) { +++ const float real = *complexVectorPtr++; +++ const float imag = *complexVectorPtr++; +++ *magnitudeVectorPtr++ = (real * real) + (imag * imag); +++ } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++diff --git a/kernels/volk/volk_32fc_s32f_atan2_32f.h b/kernels/volk/volk_32fc_s32f_atan2_32f.h ++index c169336..f08f793 100644 ++--- a/kernels/volk/volk_32fc_s32f_atan2_32f.h +++++ b/kernels/volk/volk_32fc_s32f_atan2_32f.h ++@@ -30,13 +30,13 @@ ++ * ++ * Dispatcher Prototype ++ 
* \code ++- * void volk_32fc_s32f_atan2_32f(float* outputVector, const lv_32fc_t* complexVector, const float normalizeFactor, unsigned int num_points) ++- * \endcode +++ * void volk_32fc_s32f_atan2_32f(float* outputVector, const lv_32fc_t* complexVector, +++ * const float normalizeFactor, unsigned int num_points) \endcode ++ * ++ * \b Inputs ++- * \li inputVector: The byte-aligned input vector containing interleaved IQ data (I = cos, Q = sin). ++- * \li normalizeFactor: The atan results are divided by this normalization factor. ++- * \li num_points: The number of complex values in \p inputVector. +++ * \li inputVector: The byte-aligned input vector containing interleaved IQ data (I = cos, +++ * Q = sin). \li normalizeFactor: The atan results are divided by this normalization +++ * factor. \li num_points: The number of complex values in \p inputVector. ++ * ++ * \b Outputs ++ * \li outputVector: The vector where the results will be stored. ++@@ -75,8 +75,8 @@ ++ #define INCLUDED_volk_32fc_s32f_atan2_32f_a_H ++ ++ #include ++-#include ++ #include +++#include ++ ++ #ifdef LV_HAVE_SSE4_1 ++ #include ++@@ -85,50 +85,54 @@ ++ #include ++ #endif /* LV_HAVE_LIB_SIMDMATH */ ++ ++-static inline void volk_32fc_s32f_atan2_32f_a_sse4_1(float* outputVector, const lv_32fc_t* complexVector, const float normalizeFactor, unsigned int num_points){ ++- const float* complexVectorPtr = (float*)complexVector; ++- float* outPtr = outputVector; +++static inline void volk_32fc_s32f_atan2_32f_a_sse4_1(float* outputVector, +++ const lv_32fc_t* complexVector, +++ const float normalizeFactor, +++ unsigned int num_points) +++{ +++ const float* complexVectorPtr = (float*)complexVector; +++ float* outPtr = outputVector; ++ ++- unsigned int number = 0; ++- const float invNormalizeFactor = 1.0 / normalizeFactor; +++ unsigned int number = 0; +++ const float invNormalizeFactor = 1.0 / normalizeFactor; ++ ++ #ifdef LV_HAVE_LIB_SIMDMATH ++- const unsigned int quarterPoints = num_points / 4; ++- __m128 testVector = _mm_set_ps1(2*M_PI); ++- __m128 correctVector = _mm_set_ps1(M_PI); ++- __m128 vNormalizeFactor = _mm_set_ps1(invNormalizeFactor); ++- __m128 phase; ++- __m128 complex1, complex2, iValue, qValue; ++- __m128 keepMask; ++- ++- for (; number < quarterPoints; number++) { ++- // Load IQ data: ++- complex1 = _mm_load_ps(complexVectorPtr); ++- complexVectorPtr += 4; ++- complex2 = _mm_load_ps(complexVectorPtr); ++- complexVectorPtr += 4; ++- // Deinterleave IQ data: ++- iValue = _mm_shuffle_ps(complex1, complex2, _MM_SHUFFLE(2,0,2,0)); ++- qValue = _mm_shuffle_ps(complex1, complex2, _MM_SHUFFLE(3,1,3,1)); ++- // Arctan to get phase: ++- phase = atan2f4(qValue, iValue); ++- // When Q = 0 and I < 0, atan2f4 sucks and returns 2pi vice pi. ++- // Compare to 2pi: ++- keepMask = _mm_cmpneq_ps(phase,testVector); ++- phase = _mm_blendv_ps(correctVector, phase, keepMask); ++- // done with above correction. 
++- phase = _mm_mul_ps(phase, vNormalizeFactor); ++- _mm_store_ps((float*)outPtr, phase); ++- outPtr += 4; ++- } ++- number = quarterPoints * 4; +++ const unsigned int quarterPoints = num_points / 4; +++ __m128 testVector = _mm_set_ps1(2 * M_PI); +++ __m128 correctVector = _mm_set_ps1(M_PI); +++ __m128 vNormalizeFactor = _mm_set_ps1(invNormalizeFactor); +++ __m128 phase; +++ __m128 complex1, complex2, iValue, qValue; +++ __m128 keepMask; +++ +++ for (; number < quarterPoints; number++) { +++ // Load IQ data: +++ complex1 = _mm_load_ps(complexVectorPtr); +++ complexVectorPtr += 4; +++ complex2 = _mm_load_ps(complexVectorPtr); +++ complexVectorPtr += 4; +++ // Deinterleave IQ data: +++ iValue = _mm_shuffle_ps(complex1, complex2, _MM_SHUFFLE(2, 0, 2, 0)); +++ qValue = _mm_shuffle_ps(complex1, complex2, _MM_SHUFFLE(3, 1, 3, 1)); +++ // Arctan to get phase: +++ phase = atan2f4(qValue, iValue); +++ // When Q = 0 and I < 0, atan2f4 sucks and returns 2pi vice pi. +++ // Compare to 2pi: +++ keepMask = _mm_cmpneq_ps(phase, testVector); +++ phase = _mm_blendv_ps(correctVector, phase, keepMask); +++ // done with above correction. +++ phase = _mm_mul_ps(phase, vNormalizeFactor); +++ _mm_store_ps((float*)outPtr, phase); +++ outPtr += 4; +++ } +++ number = quarterPoints * 4; ++ #endif /* LV_HAVE_SIMDMATH_H */ ++ ++- for (; number < num_points; number++) { ++- const float real = *complexVectorPtr++; ++- const float imag = *complexVectorPtr++; ++- *outPtr++ = atan2f(imag, real) * invNormalizeFactor; ++- } +++ for (; number < num_points; number++) { +++ const float real = *complexVectorPtr++; +++ const float imag = *complexVectorPtr++; +++ *outPtr++ = atan2f(imag, real) * invNormalizeFactor; +++ } ++ } ++ #endif /* LV_HAVE_SSE4_1 */ ++ ++@@ -140,72 +144,78 @@ static inline void volk_32fc_s32f_atan2_32f_a_sse4_1(float* outputVector, const ++ #include ++ #endif /* LV_HAVE_LIB_SIMDMATH */ ++ ++-static inline void volk_32fc_s32f_atan2_32f_a_sse(float* outputVector, const lv_32fc_t* complexVector, const float normalizeFactor, unsigned int num_points){ ++- const float* complexVectorPtr = (float*)complexVector; ++- float* outPtr = outputVector; +++static inline void volk_32fc_s32f_atan2_32f_a_sse(float* outputVector, +++ const lv_32fc_t* complexVector, +++ const float normalizeFactor, +++ unsigned int num_points) +++{ +++ const float* complexVectorPtr = (float*)complexVector; +++ float* outPtr = outputVector; ++ ++- unsigned int number = 0; ++- const float invNormalizeFactor = 1.0 / normalizeFactor; +++ unsigned int number = 0; +++ const float invNormalizeFactor = 1.0 / normalizeFactor; ++ ++ #ifdef LV_HAVE_LIB_SIMDMATH ++- const unsigned int quarterPoints = num_points / 4; ++- __m128 testVector = _mm_set_ps1(2*M_PI); ++- __m128 correctVector = _mm_set_ps1(M_PI); ++- __m128 vNormalizeFactor = _mm_set_ps1(invNormalizeFactor); ++- __m128 phase; ++- __m128 complex1, complex2, iValue, qValue; ++- __m128 mask; ++- __m128 keepMask; ++- ++- for (; number < quarterPoints; number++) { ++- // Load IQ data: ++- complex1 = _mm_load_ps(complexVectorPtr); ++- complexVectorPtr += 4; ++- complex2 = _mm_load_ps(complexVectorPtr); ++- complexVectorPtr += 4; ++- // Deinterleave IQ data: ++- iValue = _mm_shuffle_ps(complex1, complex2, _MM_SHUFFLE(2,0,2,0)); ++- qValue = _mm_shuffle_ps(complex1, complex2, _MM_SHUFFLE(3,1,3,1)); ++- // Arctan to get phase: ++- phase = atan2f4(qValue, iValue); ++- // When Q = 0 and I < 0, atan2f4 sucks and returns 2pi vice pi. 
++- // Compare to 2pi: ++- keepMask = _mm_cmpneq_ps(phase,testVector); ++- phase = _mm_and_ps(phase, keepMask); ++- mask = _mm_andnot_ps(keepMask, correctVector); ++- phase = _mm_or_ps(phase, mask); ++- // done with above correction. ++- phase = _mm_mul_ps(phase, vNormalizeFactor); ++- _mm_store_ps((float*)outPtr, phase); ++- outPtr += 4; ++- } ++- number = quarterPoints * 4; +++ const unsigned int quarterPoints = num_points / 4; +++ __m128 testVector = _mm_set_ps1(2 * M_PI); +++ __m128 correctVector = _mm_set_ps1(M_PI); +++ __m128 vNormalizeFactor = _mm_set_ps1(invNormalizeFactor); +++ __m128 phase; +++ __m128 complex1, complex2, iValue, qValue; +++ __m128 mask; +++ __m128 keepMask; +++ +++ for (; number < quarterPoints; number++) { +++ // Load IQ data: +++ complex1 = _mm_load_ps(complexVectorPtr); +++ complexVectorPtr += 4; +++ complex2 = _mm_load_ps(complexVectorPtr); +++ complexVectorPtr += 4; +++ // Deinterleave IQ data: +++ iValue = _mm_shuffle_ps(complex1, complex2, _MM_SHUFFLE(2, 0, 2, 0)); +++ qValue = _mm_shuffle_ps(complex1, complex2, _MM_SHUFFLE(3, 1, 3, 1)); +++ // Arctan to get phase: +++ phase = atan2f4(qValue, iValue); +++ // When Q = 0 and I < 0, atan2f4 sucks and returns 2pi vice pi. +++ // Compare to 2pi: +++ keepMask = _mm_cmpneq_ps(phase, testVector); +++ phase = _mm_and_ps(phase, keepMask); +++ mask = _mm_andnot_ps(keepMask, correctVector); +++ phase = _mm_or_ps(phase, mask); +++ // done with above correction. +++ phase = _mm_mul_ps(phase, vNormalizeFactor); +++ _mm_store_ps((float*)outPtr, phase); +++ outPtr += 4; +++ } +++ number = quarterPoints * 4; ++ #endif /* LV_HAVE_SIMDMATH_H */ ++ ++- for (; number < num_points; number++) { ++- const float real = *complexVectorPtr++; ++- const float imag = *complexVectorPtr++; ++- *outPtr++ = atan2f(imag, real) * invNormalizeFactor; ++- } +++ for (; number < num_points; number++) { +++ const float real = *complexVectorPtr++; +++ const float imag = *complexVectorPtr++; +++ *outPtr++ = atan2f(imag, real) * invNormalizeFactor; +++ } ++ } ++ #endif /* LV_HAVE_SSE */ ++ ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void volk_32fc_s32f_atan2_32f_generic(float* outputVector, const lv_32fc_t* inputVector, const float normalizeFactor, unsigned int num_points){ ++- float* outPtr = outputVector; ++- const float* inPtr = (float*)inputVector; ++- const float invNormalizeFactor = 1.0 / normalizeFactor; ++- unsigned int number; ++- for ( number = 0; number < num_points; number++) { ++- const float real = *inPtr++; ++- const float imag = *inPtr++; ++- *outPtr++ = atan2f(imag, real) * invNormalizeFactor; ++- } +++static inline void volk_32fc_s32f_atan2_32f_generic(float* outputVector, +++ const lv_32fc_t* inputVector, +++ const float normalizeFactor, +++ unsigned int num_points) +++{ +++ float* outPtr = outputVector; +++ const float* inPtr = (float*)inputVector; +++ const float invNormalizeFactor = 1.0 / normalizeFactor; +++ unsigned int number; +++ for (number = 0; number < num_points; number++) { +++ const float real = *inPtr++; +++ const float imag = *inPtr++; +++ *outPtr++ = atan2f(imag, real) * invNormalizeFactor; +++ } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++ ++- ++- ++ #endif /* INCLUDED_volk_32fc_s32f_atan2_32f_a_H */ ++diff --git a/kernels/volk/volk_32fc_s32f_deinterleave_real_16i.h b/kernels/volk/volk_32fc_s32f_deinterleave_real_16i.h ++index 64c6a8b..f70f494 100644 ++--- a/kernels/volk/volk_32fc_s32f_deinterleave_real_16i.h +++++ b/kernels/volk/volk_32fc_s32f_deinterleave_real_16i.h ++@@ -30,8 +30,8 @@ ++ * ++ * Dispatcher 
Prototype ++ * \code ++- * void volk_32fc_s32f_deinterleave_real_16i(int16_t* iBuffer, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points) ++- * \endcode +++ * void volk_32fc_s32f_deinterleave_real_16i(int16_t* iBuffer, const lv_32fc_t* +++ * complexVector, const float scalar, unsigned int num_points) \endcode ++ * ++ * \b Inputs ++ * \li complexVector: The complex input vector. ++@@ -73,61 +73,62 @@ ++ #ifndef INCLUDED_volk_32fc_s32f_deinterleave_real_16i_a_H ++ #define INCLUDED_volk_32fc_s32f_deinterleave_real_16i_a_H ++ ++-#include ++ #include ++ #include +++#include ++ ++ ++ #ifdef LV_HAVE_AVX2 ++ #include ++ ++ static inline void ++-volk_32fc_s32f_deinterleave_real_16i_a_avx2(int16_t* iBuffer, const lv_32fc_t* complexVector, ++- const float scalar, unsigned int num_points) +++volk_32fc_s32f_deinterleave_real_16i_a_avx2(int16_t* iBuffer, +++ const lv_32fc_t* complexVector, +++ const float scalar, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int eighthPoints = num_points / 8; ++- ++- const float* complexVectorPtr = (float*)complexVector; ++- int16_t* iBufferPtr = iBuffer; +++ unsigned int number = 0; +++ const unsigned int eighthPoints = num_points / 8; ++ ++- __m256 vScalar = _mm256_set1_ps(scalar); +++ const float* complexVectorPtr = (float*)complexVector; +++ int16_t* iBufferPtr = iBuffer; ++ ++- __m256 cplxValue1, cplxValue2, iValue; ++- __m256i a; ++- __m128i b; +++ __m256 vScalar = _mm256_set1_ps(scalar); ++ ++- __m256i idx = _mm256_set_epi32(3,3,3,3,5,1,4,0); +++ __m256 cplxValue1, cplxValue2, iValue; +++ __m256i a; +++ __m128i b; ++ ++- for(;number < eighthPoints; number++){ ++- cplxValue1 = _mm256_load_ps(complexVectorPtr); ++- complexVectorPtr += 8; +++ __m256i idx = _mm256_set_epi32(3, 3, 3, 3, 5, 1, 4, 0); ++ ++- cplxValue2 = _mm256_load_ps(complexVectorPtr); ++- complexVectorPtr += 8; +++ for (; number < eighthPoints; number++) { +++ cplxValue1 = _mm256_load_ps(complexVectorPtr); +++ complexVectorPtr += 8; ++ ++- // Arrange in i1i2i3i4 format ++- iValue = _mm256_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); +++ cplxValue2 = _mm256_load_ps(complexVectorPtr); +++ complexVectorPtr += 8; ++ ++- iValue = _mm256_mul_ps(iValue, vScalar); +++ // Arrange in i1i2i3i4 format +++ iValue = _mm256_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2, 0, 2, 0)); ++ ++- iValue = _mm256_round_ps(iValue, _MM_FROUND_TO_ZERO); ++- a = _mm256_cvtps_epi32(iValue); ++- a = _mm256_packs_epi32(a,a); ++- a = _mm256_permutevar8x32_epi32(a,idx); ++- b = _mm256_extracti128_si256(a,0); +++ iValue = _mm256_mul_ps(iValue, vScalar); ++ ++- _mm_store_si128((__m128i*)iBufferPtr,b); ++- iBufferPtr += 8; +++ iValue = _mm256_round_ps(iValue, _MM_FROUND_TO_ZERO); +++ a = _mm256_cvtps_epi32(iValue); +++ a = _mm256_packs_epi32(a, a); +++ a = _mm256_permutevar8x32_epi32(a, idx); +++ b = _mm256_extracti128_si256(a, 0); ++ ++- } +++ _mm_store_si128((__m128i*)iBufferPtr, b); +++ iBufferPtr += 8; +++ } ++ ++- number = eighthPoints * 8; ++- iBufferPtr = &iBuffer[number]; ++- for(; number < num_points; number++){ ++- *iBufferPtr++ = (int16_t)(*complexVectorPtr++ * scalar); ++- complexVectorPtr++; ++- } +++ number = eighthPoints * 8; +++ iBufferPtr = &iBuffer[number]; +++ for (; number < num_points; number++) { +++ *iBufferPtr++ = (int16_t)(*complexVectorPtr++ * scalar); +++ complexVectorPtr++; +++ } ++ } ++ ++ ++@@ -137,46 +138,48 @@ volk_32fc_s32f_deinterleave_real_16i_a_avx2(int16_t* iBuffer, const lv_32fc_t* c ++ #include ++ ++ static inline void 
++-volk_32fc_s32f_deinterleave_real_16i_a_sse(int16_t* iBuffer, const lv_32fc_t* complexVector, ++- const float scalar, unsigned int num_points) +++volk_32fc_s32f_deinterleave_real_16i_a_sse(int16_t* iBuffer, +++ const lv_32fc_t* complexVector, +++ const float scalar, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int quarterPoints = num_points / 4; +++ unsigned int number = 0; +++ const unsigned int quarterPoints = num_points / 4; ++ ++- const float* complexVectorPtr = (float*)complexVector; ++- int16_t* iBufferPtr = iBuffer; +++ const float* complexVectorPtr = (float*)complexVector; +++ int16_t* iBufferPtr = iBuffer; ++ ++- __m128 vScalar = _mm_set_ps1(scalar); +++ __m128 vScalar = _mm_set_ps1(scalar); ++ ++- __m128 cplxValue1, cplxValue2, iValue; +++ __m128 cplxValue1, cplxValue2, iValue; ++ ++- __VOLK_ATTR_ALIGNED(16) float floatBuffer[4]; +++ __VOLK_ATTR_ALIGNED(16) float floatBuffer[4]; ++ ++- for(;number < quarterPoints; number++){ ++- cplxValue1 = _mm_load_ps(complexVectorPtr); ++- complexVectorPtr += 4; +++ for (; number < quarterPoints; number++) { +++ cplxValue1 = _mm_load_ps(complexVectorPtr); +++ complexVectorPtr += 4; ++ ++- cplxValue2 = _mm_load_ps(complexVectorPtr); ++- complexVectorPtr += 4; +++ cplxValue2 = _mm_load_ps(complexVectorPtr); +++ complexVectorPtr += 4; ++ ++- // Arrange in i1i2i3i4 format ++- iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); +++ // Arrange in i1i2i3i4 format +++ iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2, 0, 2, 0)); ++ ++- iValue = _mm_mul_ps(iValue, vScalar); +++ iValue = _mm_mul_ps(iValue, vScalar); ++ ++- _mm_store_ps(floatBuffer, iValue); ++- *iBufferPtr++ = (int16_t)(floatBuffer[0]); ++- *iBufferPtr++ = (int16_t)(floatBuffer[1]); ++- *iBufferPtr++ = (int16_t)(floatBuffer[2]); ++- *iBufferPtr++ = (int16_t)(floatBuffer[3]); ++- } +++ _mm_store_ps(floatBuffer, iValue); +++ *iBufferPtr++ = (int16_t)(floatBuffer[0]); +++ *iBufferPtr++ = (int16_t)(floatBuffer[1]); +++ *iBufferPtr++ = (int16_t)(floatBuffer[2]); +++ *iBufferPtr++ = (int16_t)(floatBuffer[3]); +++ } ++ ++- number = quarterPoints * 4; ++- iBufferPtr = &iBuffer[number]; ++- for(; number < num_points; number++){ ++- *iBufferPtr++ = (int16_t)(*complexVectorPtr++ * scalar); ++- complexVectorPtr++; ++- } +++ number = quarterPoints * 4; +++ iBufferPtr = &iBuffer[number]; +++ for (; number < num_points; number++) { +++ *iBufferPtr++ = (int16_t)(*complexVectorPtr++ * scalar); +++ complexVectorPtr++; +++ } ++ } ++ ++ #endif /* LV_HAVE_SSE */ ++@@ -185,16 +188,18 @@ volk_32fc_s32f_deinterleave_real_16i_a_sse(int16_t* iBuffer, const lv_32fc_t* co ++ #ifdef LV_HAVE_GENERIC ++ ++ static inline void ++-volk_32fc_s32f_deinterleave_real_16i_generic(int16_t* iBuffer, const lv_32fc_t* complexVector, ++- const float scalar, unsigned int num_points) +++volk_32fc_s32f_deinterleave_real_16i_generic(int16_t* iBuffer, +++ const lv_32fc_t* complexVector, +++ const float scalar, +++ unsigned int num_points) ++ { ++- const float* complexVectorPtr = (float*)complexVector; ++- int16_t* iBufferPtr = iBuffer; ++- unsigned int number = 0; ++- for(number = 0; number < num_points; number++){ ++- *iBufferPtr++ = (int16_t)(*complexVectorPtr++ * scalar); ++- complexVectorPtr++; ++- } +++ const float* complexVectorPtr = (float*)complexVector; +++ int16_t* iBufferPtr = iBuffer; +++ unsigned int number = 0; +++ for (number = 0; number < num_points; number++) { +++ *iBufferPtr++ = (int16_t)(*complexVectorPtr++ * scalar); +++ complexVectorPtr++; +++ 
} ++ } ++ ++ #endif /* LV_HAVE_GENERIC */ ++@@ -204,60 +209,61 @@ volk_32fc_s32f_deinterleave_real_16i_generic(int16_t* iBuffer, const lv_32fc_t* ++ #ifndef INCLUDED_volk_32fc_s32f_deinterleave_real_16i_u_H ++ #define INCLUDED_volk_32fc_s32f_deinterleave_real_16i_u_H ++ ++-#include ++ #include ++ #include +++#include ++ ++ #ifdef LV_HAVE_AVX2 ++ #include ++ ++ static inline void ++-volk_32fc_s32f_deinterleave_real_16i_u_avx2(int16_t* iBuffer, const lv_32fc_t* complexVector, ++- const float scalar, unsigned int num_points) +++volk_32fc_s32f_deinterleave_real_16i_u_avx2(int16_t* iBuffer, +++ const lv_32fc_t* complexVector, +++ const float scalar, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int eighthPoints = num_points / 8; ++- ++- const float* complexVectorPtr = (float*)complexVector; ++- int16_t* iBufferPtr = iBuffer; +++ unsigned int number = 0; +++ const unsigned int eighthPoints = num_points / 8; ++ ++- __m256 vScalar = _mm256_set1_ps(scalar); +++ const float* complexVectorPtr = (float*)complexVector; +++ int16_t* iBufferPtr = iBuffer; ++ ++- __m256 cplxValue1, cplxValue2, iValue; ++- __m256i a; ++- __m128i b; +++ __m256 vScalar = _mm256_set1_ps(scalar); ++ ++- __m256i idx = _mm256_set_epi32(3,3,3,3,5,1,4,0); +++ __m256 cplxValue1, cplxValue2, iValue; +++ __m256i a; +++ __m128i b; ++ ++- for(;number < eighthPoints; number++){ ++- cplxValue1 = _mm256_loadu_ps(complexVectorPtr); ++- complexVectorPtr += 8; +++ __m256i idx = _mm256_set_epi32(3, 3, 3, 3, 5, 1, 4, 0); ++ ++- cplxValue2 = _mm256_loadu_ps(complexVectorPtr); ++- complexVectorPtr += 8; +++ for (; number < eighthPoints; number++) { +++ cplxValue1 = _mm256_loadu_ps(complexVectorPtr); +++ complexVectorPtr += 8; ++ ++- // Arrange in i1i2i3i4 format ++- iValue = _mm256_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); +++ cplxValue2 = _mm256_loadu_ps(complexVectorPtr); +++ complexVectorPtr += 8; ++ ++- iValue = _mm256_mul_ps(iValue, vScalar); +++ // Arrange in i1i2i3i4 format +++ iValue = _mm256_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2, 0, 2, 0)); ++ ++- iValue = _mm256_round_ps(iValue, _MM_FROUND_TO_ZERO); ++- a = _mm256_cvtps_epi32(iValue); ++- a = _mm256_packs_epi32(a,a); ++- a = _mm256_permutevar8x32_epi32(a,idx); ++- b = _mm256_extracti128_si256(a,0); +++ iValue = _mm256_mul_ps(iValue, vScalar); ++ ++- _mm_storeu_si128((__m128i*)iBufferPtr,b); ++- iBufferPtr += 8; +++ iValue = _mm256_round_ps(iValue, _MM_FROUND_TO_ZERO); +++ a = _mm256_cvtps_epi32(iValue); +++ a = _mm256_packs_epi32(a, a); +++ a = _mm256_permutevar8x32_epi32(a, idx); +++ b = _mm256_extracti128_si256(a, 0); ++ ++- } +++ _mm_storeu_si128((__m128i*)iBufferPtr, b); +++ iBufferPtr += 8; +++ } ++ ++- number = eighthPoints * 8; ++- iBufferPtr = &iBuffer[number]; ++- for(; number < num_points; number++){ ++- *iBufferPtr++ = (int16_t)(*complexVectorPtr++ * scalar); ++- complexVectorPtr++; ++- } +++ number = eighthPoints * 8; +++ iBufferPtr = &iBuffer[number]; +++ for (; number < num_points; number++) { +++ *iBufferPtr++ = (int16_t)(*complexVectorPtr++ * scalar); +++ complexVectorPtr++; +++ } ++ } ++ ++ #endif /* LV_HAVE_AVX2 */ ++diff --git a/kernels/volk/volk_32fc_s32f_magnitude_16i.h b/kernels/volk/volk_32fc_s32f_magnitude_16i.h ++index 6e7e7cb..91a5b8e 100644 ++--- a/kernels/volk/volk_32fc_s32f_magnitude_16i.h +++++ b/kernels/volk/volk_32fc_s32f_magnitude_16i.h ++@@ -31,8 +31,8 @@ ++ * ++ * Dispatcher Prototype ++ * \code ++- * void volk_32fc_s32f_magnitude_16i(int16_t* magnitudeVector, const lv_32fc_t* 
complexVector, unsigned int num_points) ++- * \endcode +++ * void volk_32fc_s32f_magnitude_16i(int16_t* magnitudeVector, const lv_32fc_t* +++ * complexVector, unsigned int num_points) \endcode ++ * ++ * \b Inputs ++ * \li complexVector: The complex input vector. ++@@ -73,123 +73,129 @@ ++ #ifdef LV_HAVE_GENERIC ++ #include ++ ++-static inline void ++-volk_32fc_s32f_magnitude_16i_generic(int16_t* magnitudeVector, const lv_32fc_t* complexVector, ++- const float scalar, unsigned int num_points) +++static inline void volk_32fc_s32f_magnitude_16i_generic(int16_t* magnitudeVector, +++ const lv_32fc_t* complexVector, +++ const float scalar, +++ unsigned int num_points) ++ { ++- const float* complexVectorPtr = (float*)complexVector; ++- int16_t* magnitudeVectorPtr = magnitudeVector; ++- unsigned int number = 0; ++- for(number = 0; number < num_points; number++){ ++- __VOLK_VOLATILE float real = *complexVectorPtr++; ++- __VOLK_VOLATILE float imag = *complexVectorPtr++; ++- real *= real; ++- imag *= imag; ++- *magnitudeVectorPtr++ = (int16_t)rintf(scalar*sqrtf(real + imag)); ++- } +++ const float* complexVectorPtr = (float*)complexVector; +++ int16_t* magnitudeVectorPtr = magnitudeVector; +++ unsigned int number = 0; +++ for (number = 0; number < num_points; number++) { +++ __VOLK_VOLATILE float real = *complexVectorPtr++; +++ __VOLK_VOLATILE float imag = *complexVectorPtr++; +++ real *= real; +++ imag *= imag; +++ *magnitudeVectorPtr++ = (int16_t)rintf(scalar * sqrtf(real + imag)); +++ } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++ #ifndef INCLUDED_volk_32fc_s32f_magnitude_16i_a_H ++ #define INCLUDED_volk_32fc_s32f_magnitude_16i_a_H ++ ++-#include ++ #include ++-#include ++ #include +++#include +++#include ++ ++ #ifdef LV_HAVE_AVX2 ++ #include ++ ++-static inline void ++-volk_32fc_s32f_magnitude_16i_a_avx2(int16_t* magnitudeVector, const lv_32fc_t* complexVector, ++- const float scalar, unsigned int num_points) +++static inline void volk_32fc_s32f_magnitude_16i_a_avx2(int16_t* magnitudeVector, +++ const lv_32fc_t* complexVector, +++ const float scalar, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int eighthPoints = num_points / 8; +++ unsigned int number = 0; +++ const unsigned int eighthPoints = num_points / 8; ++ ++- const float* complexVectorPtr = (const float*)complexVector; ++- int16_t* magnitudeVectorPtr = magnitudeVector; +++ const float* complexVectorPtr = (const float*)complexVector; +++ int16_t* magnitudeVectorPtr = magnitudeVector; ++ ++- __m256 vScalar = _mm256_set1_ps(scalar); ++- __m256i idx = _mm256_set_epi32(0,0,0,0,5,1,4,0); ++- __m256 cplxValue1, cplxValue2, result; ++- __m256i resultInt; ++- __m128i resultShort; +++ __m256 vScalar = _mm256_set1_ps(scalar); +++ __m256i idx = _mm256_set_epi32(0, 0, 0, 0, 5, 1, 4, 0); +++ __m256 cplxValue1, cplxValue2, result; +++ __m256i resultInt; +++ __m128i resultShort; ++ ++- for(;number < eighthPoints; number++){ ++- cplxValue1 = _mm256_load_ps(complexVectorPtr); ++- complexVectorPtr += 8; +++ for (; number < eighthPoints; number++) { +++ cplxValue1 = _mm256_load_ps(complexVectorPtr); +++ complexVectorPtr += 8; ++ ++- cplxValue2 = _mm256_load_ps(complexVectorPtr); ++- complexVectorPtr += 8; +++ cplxValue2 = _mm256_load_ps(complexVectorPtr); +++ complexVectorPtr += 8; ++ ++- cplxValue1 = _mm256_mul_ps(cplxValue1, cplxValue1); // Square the values ++- cplxValue2 = _mm256_mul_ps(cplxValue2, cplxValue2); // Square the Values +++ cplxValue1 = _mm256_mul_ps(cplxValue1, cplxValue1); // Square the values +++ 
cplxValue2 = _mm256_mul_ps(cplxValue2, cplxValue2); // Square the Values ++ ++- result = _mm256_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values +++ result = _mm256_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values ++ ++- result = _mm256_sqrt_ps(result); +++ result = _mm256_sqrt_ps(result); ++ ++- result = _mm256_mul_ps(result, vScalar); +++ result = _mm256_mul_ps(result, vScalar); ++ ++- resultInt = _mm256_cvtps_epi32(result); ++- resultInt = _mm256_packs_epi32(resultInt, resultInt); ++- resultInt = _mm256_permutevar8x32_epi32(resultInt, idx); //permute to compensate for shuffling in hadd and packs ++- resultShort = _mm256_extracti128_si256(resultInt,0); ++- _mm_store_si128((__m128i*)magnitudeVectorPtr,resultShort); ++- magnitudeVectorPtr += 8; ++- } +++ resultInt = _mm256_cvtps_epi32(result); +++ resultInt = _mm256_packs_epi32(resultInt, resultInt); +++ resultInt = _mm256_permutevar8x32_epi32( +++ resultInt, idx); // permute to compensate for shuffling in hadd and packs +++ resultShort = _mm256_extracti128_si256(resultInt, 0); +++ _mm_store_si128((__m128i*)magnitudeVectorPtr, resultShort); +++ magnitudeVectorPtr += 8; +++ } ++ ++- number = eighthPoints * 8; ++- volk_32fc_s32f_magnitude_16i_generic(magnitudeVector+number, complexVector+number, scalar, num_points-number); +++ number = eighthPoints * 8; +++ volk_32fc_s32f_magnitude_16i_generic( +++ magnitudeVector + number, complexVector + number, scalar, num_points - number); ++ } ++ #endif /* LV_HAVE_AVX2 */ ++ ++ #ifdef LV_HAVE_SSE3 ++ #include ++ ++-static inline void ++-volk_32fc_s32f_magnitude_16i_a_sse3(int16_t* magnitudeVector, const lv_32fc_t* complexVector, ++- const float scalar, unsigned int num_points) +++static inline void volk_32fc_s32f_magnitude_16i_a_sse3(int16_t* magnitudeVector, +++ const lv_32fc_t* complexVector, +++ const float scalar, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int quarterPoints = num_points / 4; +++ unsigned int number = 0; +++ const unsigned int quarterPoints = num_points / 4; ++ ++- const float* complexVectorPtr = (const float*)complexVector; ++- int16_t* magnitudeVectorPtr = magnitudeVector; +++ const float* complexVectorPtr = (const float*)complexVector; +++ int16_t* magnitudeVectorPtr = magnitudeVector; ++ ++- __m128 vScalar = _mm_set_ps1(scalar); +++ __m128 vScalar = _mm_set_ps1(scalar); ++ ++- __m128 cplxValue1, cplxValue2, result; +++ __m128 cplxValue1, cplxValue2, result; ++ ++- __VOLK_ATTR_ALIGNED(16) float floatBuffer[4]; +++ __VOLK_ATTR_ALIGNED(16) float floatBuffer[4]; ++ ++- for(;number < quarterPoints; number++){ ++- cplxValue1 = _mm_load_ps(complexVectorPtr); ++- complexVectorPtr += 4; +++ for (; number < quarterPoints; number++) { +++ cplxValue1 = _mm_load_ps(complexVectorPtr); +++ complexVectorPtr += 4; ++ ++- cplxValue2 = _mm_load_ps(complexVectorPtr); ++- complexVectorPtr += 4; +++ cplxValue2 = _mm_load_ps(complexVectorPtr); +++ complexVectorPtr += 4; ++ ++- cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values ++- cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values +++ cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values +++ cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values ++ ++- result = _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values +++ result = _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values ++ ++- result = _mm_sqrt_ps(result); +++ result = _mm_sqrt_ps(result); ++ ++- result = _mm_mul_ps(result, vScalar); +++ 
result = _mm_mul_ps(result, vScalar); ++ ++- _mm_store_ps(floatBuffer, result); ++- *magnitudeVectorPtr++ = (int16_t)rintf(floatBuffer[0]); ++- *magnitudeVectorPtr++ = (int16_t)rintf(floatBuffer[1]); ++- *magnitudeVectorPtr++ = (int16_t)rintf(floatBuffer[2]); ++- *magnitudeVectorPtr++ = (int16_t)rintf(floatBuffer[3]); ++- } +++ _mm_store_ps(floatBuffer, result); +++ *magnitudeVectorPtr++ = (int16_t)rintf(floatBuffer[0]); +++ *magnitudeVectorPtr++ = (int16_t)rintf(floatBuffer[1]); +++ *magnitudeVectorPtr++ = (int16_t)rintf(floatBuffer[2]); +++ *magnitudeVectorPtr++ = (int16_t)rintf(floatBuffer[3]); +++ } ++ ++- number = quarterPoints * 4; ++- volk_32fc_s32f_magnitude_16i_generic(magnitudeVector+number, complexVector+number, scalar, num_points-number); +++ number = quarterPoints * 4; +++ volk_32fc_s32f_magnitude_16i_generic( +++ magnitudeVector + number, complexVector + number, scalar, num_points - number); ++ } ++ #endif /* LV_HAVE_SSE3 */ ++ ++@@ -197,53 +203,57 @@ volk_32fc_s32f_magnitude_16i_a_sse3(int16_t* magnitudeVector, const lv_32fc_t* c ++ #ifdef LV_HAVE_SSE ++ #include ++ ++-static inline void ++-volk_32fc_s32f_magnitude_16i_a_sse(int16_t* magnitudeVector, const lv_32fc_t* complexVector, ++- const float scalar, unsigned int num_points) +++static inline void volk_32fc_s32f_magnitude_16i_a_sse(int16_t* magnitudeVector, +++ const lv_32fc_t* complexVector, +++ const float scalar, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int quarterPoints = num_points / 4; +++ unsigned int number = 0; +++ const unsigned int quarterPoints = num_points / 4; ++ ++- const float* complexVectorPtr = (const float*)complexVector; ++- int16_t* magnitudeVectorPtr = magnitudeVector; +++ const float* complexVectorPtr = (const float*)complexVector; +++ int16_t* magnitudeVectorPtr = magnitudeVector; ++ ++- __m128 vScalar = _mm_set_ps1(scalar); +++ __m128 vScalar = _mm_set_ps1(scalar); ++ ++- __m128 cplxValue1, cplxValue2, result; ++- __m128 iValue, qValue; +++ __m128 cplxValue1, cplxValue2, result; +++ __m128 iValue, qValue; ++ ++- __VOLK_ATTR_ALIGNED(16) float floatBuffer[4]; +++ __VOLK_ATTR_ALIGNED(16) float floatBuffer[4]; ++ ++- for(;number < quarterPoints; number++){ ++- cplxValue1 = _mm_load_ps(complexVectorPtr); ++- complexVectorPtr += 4; +++ for (; number < quarterPoints; number++) { +++ cplxValue1 = _mm_load_ps(complexVectorPtr); +++ complexVectorPtr += 4; ++ ++- cplxValue2 = _mm_load_ps(complexVectorPtr); ++- complexVectorPtr += 4; +++ cplxValue2 = _mm_load_ps(complexVectorPtr); +++ complexVectorPtr += 4; ++ ++- // Arrange in i1i2i3i4 format ++- iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); ++- // Arrange in q1q2q3q4 format ++- qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1)); +++ // Arrange in i1i2i3i4 format +++ iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2, 0, 2, 0)); +++ // Arrange in q1q2q3q4 format +++ qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3, 1, 3, 1)); ++ ++- __VOLK_VOLATILE __m128 iValue2 = _mm_mul_ps(iValue, iValue); // Square the I values ++- __VOLK_VOLATILE __m128 qValue2 = _mm_mul_ps(qValue, qValue); // Square the Q Values +++ __VOLK_VOLATILE __m128 iValue2 = +++ _mm_mul_ps(iValue, iValue); // Square the I values +++ __VOLK_VOLATILE __m128 qValue2 = +++ _mm_mul_ps(qValue, qValue); // Square the Q Values ++ ++- result = _mm_add_ps(iValue2, qValue2); // Add the I2 and Q2 values +++ result = _mm_add_ps(iValue2, qValue2); // Add the I2 and Q2 values ++ ++- result = 
_mm_sqrt_ps(result); +++ result = _mm_sqrt_ps(result); ++ ++- result = _mm_mul_ps(result, vScalar); +++ result = _mm_mul_ps(result, vScalar); ++ ++- _mm_store_ps(floatBuffer, result); ++- *magnitudeVectorPtr++ = (int16_t)rintf(floatBuffer[0]); ++- *magnitudeVectorPtr++ = (int16_t)rintf(floatBuffer[1]); ++- *magnitudeVectorPtr++ = (int16_t)rintf(floatBuffer[2]); ++- *magnitudeVectorPtr++ = (int16_t)rintf(floatBuffer[3]); ++- } +++ _mm_store_ps(floatBuffer, result); +++ *magnitudeVectorPtr++ = (int16_t)rintf(floatBuffer[0]); +++ *magnitudeVectorPtr++ = (int16_t)rintf(floatBuffer[1]); +++ *magnitudeVectorPtr++ = (int16_t)rintf(floatBuffer[2]); +++ *magnitudeVectorPtr++ = (int16_t)rintf(floatBuffer[3]); +++ } ++ ++- number = quarterPoints * 4; ++- volk_32fc_s32f_magnitude_16i_generic(magnitudeVector+number, complexVector+number, scalar, num_points-number); +++ number = quarterPoints * 4; +++ volk_32fc_s32f_magnitude_16i_generic( +++ magnitudeVector + number, complexVector + number, scalar, num_points - number); ++ } ++ #endif /* LV_HAVE_SSE */ ++ ++@@ -253,56 +263,59 @@ volk_32fc_s32f_magnitude_16i_a_sse(int16_t* magnitudeVector, const lv_32fc_t* co ++ #ifndef INCLUDED_volk_32fc_s32f_magnitude_16i_u_H ++ #define INCLUDED_volk_32fc_s32f_magnitude_16i_u_H ++ ++-#include ++ #include ++-#include ++ #include +++#include +++#include ++ ++ #ifdef LV_HAVE_AVX2 ++ #include ++ ++-static inline void ++-volk_32fc_s32f_magnitude_16i_u_avx2(int16_t* magnitudeVector, const lv_32fc_t* complexVector, ++- const float scalar, unsigned int num_points) +++static inline void volk_32fc_s32f_magnitude_16i_u_avx2(int16_t* magnitudeVector, +++ const lv_32fc_t* complexVector, +++ const float scalar, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int eighthPoints = num_points / 8; +++ unsigned int number = 0; +++ const unsigned int eighthPoints = num_points / 8; ++ ++- const float* complexVectorPtr = (const float*)complexVector; ++- int16_t* magnitudeVectorPtr = magnitudeVector; +++ const float* complexVectorPtr = (const float*)complexVector; +++ int16_t* magnitudeVectorPtr = magnitudeVector; ++ ++- __m256 vScalar = _mm256_set1_ps(scalar); ++- __m256i idx = _mm256_set_epi32(0,0,0,0,5,1,4,0); ++- __m256 cplxValue1, cplxValue2, result; ++- __m256i resultInt; ++- __m128i resultShort; +++ __m256 vScalar = _mm256_set1_ps(scalar); +++ __m256i idx = _mm256_set_epi32(0, 0, 0, 0, 5, 1, 4, 0); +++ __m256 cplxValue1, cplxValue2, result; +++ __m256i resultInt; +++ __m128i resultShort; ++ ++- for(;number < eighthPoints; number++){ ++- cplxValue1 = _mm256_loadu_ps(complexVectorPtr); ++- complexVectorPtr += 8; +++ for (; number < eighthPoints; number++) { +++ cplxValue1 = _mm256_loadu_ps(complexVectorPtr); +++ complexVectorPtr += 8; ++ ++- cplxValue2 = _mm256_loadu_ps(complexVectorPtr); ++- complexVectorPtr += 8; +++ cplxValue2 = _mm256_loadu_ps(complexVectorPtr); +++ complexVectorPtr += 8; ++ ++- cplxValue1 = _mm256_mul_ps(cplxValue1, cplxValue1); // Square the values ++- cplxValue2 = _mm256_mul_ps(cplxValue2, cplxValue2); // Square the Values +++ cplxValue1 = _mm256_mul_ps(cplxValue1, cplxValue1); // Square the values +++ cplxValue2 = _mm256_mul_ps(cplxValue2, cplxValue2); // Square the Values ++ ++- result = _mm256_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values +++ result = _mm256_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values ++ ++- result = _mm256_sqrt_ps(result); +++ result = _mm256_sqrt_ps(result); ++ ++- result = _mm256_mul_ps(result, vScalar); +++ result = 
_mm256_mul_ps(result, vScalar); ++ ++- resultInt = _mm256_cvtps_epi32(result); ++- resultInt = _mm256_packs_epi32(resultInt, resultInt); ++- resultInt = _mm256_permutevar8x32_epi32(resultInt, idx); //permute to compensate for shuffling in hadd and packs ++- resultShort = _mm256_extracti128_si256(resultInt,0); ++- _mm_storeu_si128((__m128i*)magnitudeVectorPtr,resultShort); ++- magnitudeVectorPtr += 8; ++- } +++ resultInt = _mm256_cvtps_epi32(result); +++ resultInt = _mm256_packs_epi32(resultInt, resultInt); +++ resultInt = _mm256_permutevar8x32_epi32( +++ resultInt, idx); // permute to compensate for shuffling in hadd and packs +++ resultShort = _mm256_extracti128_si256(resultInt, 0); +++ _mm_storeu_si128((__m128i*)magnitudeVectorPtr, resultShort); +++ magnitudeVectorPtr += 8; +++ } ++ ++- number = eighthPoints * 8; ++- volk_32fc_s32f_magnitude_16i_generic(magnitudeVector+number, complexVector+number, scalar, num_points-number); +++ number = eighthPoints * 8; +++ volk_32fc_s32f_magnitude_16i_generic( +++ magnitudeVector + number, complexVector + number, scalar, num_points - number); ++ } ++ #endif /* LV_HAVE_AVX2 */ ++ ++diff --git a/kernels/volk/volk_32fc_s32f_power_32fc.h b/kernels/volk/volk_32fc_s32f_power_32fc.h ++index d2803f2..b31179c 100644 ++--- a/kernels/volk/volk_32fc_s32f_power_32fc.h +++++ b/kernels/volk/volk_32fc_s32f_power_32fc.h ++@@ -31,8 +31,8 @@ ++ * ++ * Dispatcher Prototype ++ * \code ++- * void volk_32fc_s32f_power_32fc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const float power, unsigned int num_points) ++- * \endcode +++ * void volk_32fc_s32f_power_32fc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const +++ * float power, unsigned int num_points) \endcode ++ * ++ * \b Inputs ++ * \li aVector: The complex input vector. ++@@ -56,15 +56,17 @@ ++ #define INCLUDED_volk_32fc_s32f_power_32fc_a_H ++ ++ #include ++-#include ++ #include +++#include ++ ++ //! 
raise a complex float to a real float power ++-static inline lv_32fc_t __volk_s32fc_s32f_power_s32fc_a(const lv_32fc_t exp, const float power) +++static inline lv_32fc_t __volk_s32fc_s32f_power_s32fc_a(const lv_32fc_t exp, +++ const float power) ++ { ++- const float arg = power*atan2f(lv_creal(exp), lv_cimag(exp)); ++- const float mag = powf(lv_creal(exp)*lv_creal(exp) + lv_cimag(exp)*lv_cimag(exp), power/2); ++- return mag*lv_cmake(-cosf(arg), sinf(arg)); +++ const float arg = power * atan2f(lv_creal(exp), lv_cimag(exp)); +++ const float mag = +++ powf(lv_creal(exp) * lv_creal(exp) + lv_cimag(exp) * lv_cimag(exp), power / 2); +++ return mag * lv_cmake(-cosf(arg), sinf(arg)); ++ } ++ ++ #ifdef LV_HAVE_SSE ++@@ -74,83 +76,94 @@ static inline lv_32fc_t __volk_s32fc_s32f_power_s32fc_a(const lv_32fc_t exp, con ++ #include ++ #endif /* LV_HAVE_LIB_SIMDMATH */ ++ ++-static inline void ++-volk_32fc_s32f_power_32fc_a_sse(lv_32fc_t* cVector, const lv_32fc_t* aVector, ++- const float power, unsigned int num_points) +++static inline void volk_32fc_s32f_power_32fc_a_sse(lv_32fc_t* cVector, +++ const lv_32fc_t* aVector, +++ const float power, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; +++ unsigned int number = 0; ++ ++- lv_32fc_t* cPtr = cVector; ++- const lv_32fc_t* aPtr = aVector; +++ lv_32fc_t* cPtr = cVector; +++ const lv_32fc_t* aPtr = aVector; ++ ++ #ifdef LV_HAVE_LIB_SIMDMATH ++- const unsigned int quarterPoints = num_points / 4; ++- __m128 vPower = _mm_set_ps1(power); +++ const unsigned int quarterPoints = num_points / 4; +++ __m128 vPower = _mm_set_ps1(power); ++ ++- __m128 cplxValue1, cplxValue2, magnitude, phase, iValue, qValue; ++- for(;number < quarterPoints; number++){ +++ __m128 cplxValue1, cplxValue2, magnitude, phase, iValue, qValue; +++ for (; number < quarterPoints; number++) { ++ ++- cplxValue1 = _mm_load_ps((float*)aPtr); ++- aPtr += 2; +++ cplxValue1 = _mm_load_ps((float*)aPtr); +++ aPtr += 2; ++ ++- cplxValue2 = _mm_load_ps((float*)aPtr); ++- aPtr += 2; +++ cplxValue2 = _mm_load_ps((float*)aPtr); +++ aPtr += 2; ++ ++- // Convert to polar coordinates +++ // Convert to polar coordinates ++ ++- // Arrange in i1i2i3i4 format ++- iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); ++- // Arrange in q1q2q3q4 format ++- qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1)); +++ // Arrange in i1i2i3i4 format +++ iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2, 0, 2, 0)); +++ // Arrange in q1q2q3q4 format +++ qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3, 1, 3, 1)); ++ ++- phase = atan2f4(qValue, iValue); // Calculate the Phase +++ phase = atan2f4(qValue, iValue); // Calculate the Phase ++ ++- magnitude = _mm_sqrt_ps(_mm_add_ps(_mm_mul_ps(iValue, iValue), _mm_mul_ps(qValue, qValue))); // Calculate the magnitude by square rooting the added I2 and Q2 values +++ magnitude = _mm_sqrt_ps( +++ _mm_add_ps(_mm_mul_ps(iValue, iValue), +++ _mm_mul_ps(qValue, qValue))); // Calculate the magnitude by square +++ // rooting the added I2 and Q2 values ++ ++- // Now calculate the power of the polar coordinate data ++- magnitude = powf4(magnitude, vPower); // Take the magnitude to the specified power +++ // Now calculate the power of the polar coordinate data +++ magnitude = powf4(magnitude, vPower); // Take the magnitude to the specified power ++ ++- phase = _mm_mul_ps(phase, vPower); // Multiply the phase by the specified power +++ phase = _mm_mul_ps(phase, vPower); // Multiply the phase by the specified power ++ ++- // 
Convert back to cartesian coordinates ++- iValue = _mm_mul_ps( cosf4(phase), magnitude); // Multiply the cos of the phase by the magnitude ++- qValue = _mm_mul_ps( sinf4(phase), magnitude); // Multiply the sin of the phase by the magnitude +++ // Convert back to cartesian coordinates +++ iValue = _mm_mul_ps(cosf4(phase), +++ magnitude); // Multiply the cos of the phase by the magnitude +++ qValue = _mm_mul_ps(sinf4(phase), +++ magnitude); // Multiply the sin of the phase by the magnitude ++ ++- cplxValue1 = _mm_unpacklo_ps(iValue, qValue); // Interleave the lower two i & q values ++- cplxValue2 = _mm_unpackhi_ps(iValue, qValue); // Interleave the upper two i & q values +++ cplxValue1 = +++ _mm_unpacklo_ps(iValue, qValue); // Interleave the lower two i & q values +++ cplxValue2 = +++ _mm_unpackhi_ps(iValue, qValue); // Interleave the upper two i & q values ++ ++- _mm_store_ps((float*)cPtr,cplxValue1); // Store the results back into the C container +++ _mm_store_ps((float*)cPtr, +++ cplxValue1); // Store the results back into the C container ++ ++- cPtr += 2; +++ cPtr += 2; ++ ++- _mm_store_ps((float*)cPtr,cplxValue2); // Store the results back into the C container +++ _mm_store_ps((float*)cPtr, +++ cplxValue2); // Store the results back into the C container ++ ++- cPtr += 2; ++- } +++ cPtr += 2; +++ } ++ ++- number = quarterPoints * 4; +++ number = quarterPoints * 4; ++ #endif /* LV_HAVE_LIB_SIMDMATH */ ++ ++- for(;number < num_points; number++){ ++- *cPtr++ = __volk_s32fc_s32f_power_s32fc_a((*aPtr++), power); ++- } +++ for (; number < num_points; number++) { +++ *cPtr++ = __volk_s32fc_s32f_power_s32fc_a((*aPtr++), power); +++ } ++ } ++ #endif /* LV_HAVE_SSE */ ++ ++ ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void ++-volk_32fc_s32f_power_32fc_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, ++- const float power, unsigned int num_points) +++static inline void volk_32fc_s32f_power_32fc_generic(lv_32fc_t* cVector, +++ const lv_32fc_t* aVector, +++ const float power, +++ unsigned int num_points) ++ { ++- lv_32fc_t* cPtr = cVector; ++- const lv_32fc_t* aPtr = aVector; ++- unsigned int number = 0; +++ lv_32fc_t* cPtr = cVector; +++ const lv_32fc_t* aPtr = aVector; +++ unsigned int number = 0; ++ ++- for(number = 0; number < num_points; number++){ ++- *cPtr++ = __volk_s32fc_s32f_power_s32fc_a((*aPtr++), power); ++- } +++ for (number = 0; number < num_points; number++) { +++ *cPtr++ = __volk_s32fc_s32f_power_s32fc_a((*aPtr++), power); +++ } ++ } ++ ++ #endif /* LV_HAVE_GENERIC */ ++diff --git a/kernels/volk/volk_32fc_s32f_power_spectrum_32f.h b/kernels/volk/volk_32fc_s32f_power_spectrum_32f.h ++index abe4662..a1a036d 100644 ++--- a/kernels/volk/volk_32fc_s32f_power_spectrum_32f.h +++++ b/kernels/volk/volk_32fc_s32f_power_spectrum_32f.h ++@@ -29,13 +29,13 @@ ++ * ++ * Dispatcher Prototype ++ * \code ++- * void volk_32fc_s32f_power_spectrum_32f(float* logPowerOutput, const lv_32fc_t* complexFFTInput, const float normalizationFactor, unsigned int num_points) ++- * \endcode +++ * void volk_32fc_s32f_power_spectrum_32f(float* logPowerOutput, const lv_32fc_t* +++ * complexFFTInput, const float normalizationFactor, unsigned int num_points) \endcode ++ * ++ * \b Inputs ++ * \li complexFFTInput The complex data output from the FFT point. ++- * \li normalizationFactor: This value is divided against all the input values before the power is calculated. ++- * \li num_points: The number of fft data points. 
+++ * \li normalizationFactor: This value is divided against all the input values before the +++ * power is calculated. \li num_points: The number of fft data points. ++ * ++ * \b Outputs ++ * \li logPowerOutput: The 10.0 * log10(r*r + i*i) for each data point. ++@@ -54,8 +54,8 @@ ++ #define INCLUDED_volk_32fc_s32f_power_spectrum_32f_a_H ++ ++ #include ++-#include ++ #include +++#include ++ ++ #ifdef LV_HAVE_SSE3 ++ #include ++@@ -65,74 +65,75 @@ ++ #endif /* LV_HAVE_LIB_SIMDMATH */ ++ ++ static inline void ++-volk_32fc_s32f_power_spectrum_32f_a_sse3(float* logPowerOutput, const lv_32fc_t* complexFFTInput, ++- const float normalizationFactor, unsigned int num_points) +++volk_32fc_s32f_power_spectrum_32f_a_sse3(float* logPowerOutput, +++ const lv_32fc_t* complexFFTInput, +++ const float normalizationFactor, +++ unsigned int num_points) ++ { ++- const float* inputPtr = (const float*)complexFFTInput; ++- float* destPtr = logPowerOutput; ++- uint64_t number = 0; ++- const float iNormalizationFactor = 1.0 / normalizationFactor; +++ const float* inputPtr = (const float*)complexFFTInput; +++ float* destPtr = logPowerOutput; +++ uint64_t number = 0; +++ const float iNormalizationFactor = 1.0 / normalizationFactor; ++ #ifdef LV_HAVE_LIB_SIMDMATH ++- __m128 magScalar = _mm_set_ps1(10.0); ++- magScalar = _mm_div_ps(magScalar, logf4(magScalar)); +++ __m128 magScalar = _mm_set_ps1(10.0); +++ magScalar = _mm_div_ps(magScalar, logf4(magScalar)); ++ ++- __m128 invNormalizationFactor = _mm_set_ps1(iNormalizationFactor); +++ __m128 invNormalizationFactor = _mm_set_ps1(iNormalizationFactor); ++ ++- __m128 power; ++- __m128 input1, input2; ++- const uint64_t quarterPoints = num_points / 4; ++- for(;number < quarterPoints; number++){ ++- // Load the complex values ++- input1 =_mm_load_ps(inputPtr); ++- inputPtr += 4; ++- input2 =_mm_load_ps(inputPtr); ++- inputPtr += 4; +++ __m128 power; +++ __m128 input1, input2; +++ const uint64_t quarterPoints = num_points / 4; +++ for (; number < quarterPoints; number++) { +++ // Load the complex values +++ input1 = _mm_load_ps(inputPtr); +++ inputPtr += 4; +++ input2 = _mm_load_ps(inputPtr); +++ inputPtr += 4; ++ ++- // Apply the normalization factor ++- input1 = _mm_mul_ps(input1, invNormalizationFactor); ++- input2 = _mm_mul_ps(input2, invNormalizationFactor); +++ // Apply the normalization factor +++ input1 = _mm_mul_ps(input1, invNormalizationFactor); +++ input2 = _mm_mul_ps(input2, invNormalizationFactor); ++ ++- // Multiply each value by itself ++- // (r1*r1), (i1*i1), (r2*r2), (i2*i2) ++- input1 = _mm_mul_ps(input1, input1); ++- // (r3*r3), (i3*i3), (r4*r4), (i4*i4) ++- input2 = _mm_mul_ps(input2, input2); +++ // Multiply each value by itself +++ // (r1*r1), (i1*i1), (r2*r2), (i2*i2) +++ input1 = _mm_mul_ps(input1, input1); +++ // (r3*r3), (i3*i3), (r4*r4), (i4*i4) +++ input2 = _mm_mul_ps(input2, input2); ++ ++- // Horizontal add, to add (r*r) + (i*i) for each complex value ++- // (r1*r1)+(i1*i1), (r2*r2) + (i2*i2), (r3*r3)+(i3*i3), (r4*r4)+(i4*i4) ++- power = _mm_hadd_ps(input1, input2); +++ // Horizontal add, to add (r*r) + (i*i) for each complex value +++ // (r1*r1)+(i1*i1), (r2*r2) + (i2*i2), (r3*r3)+(i3*i3), (r4*r4)+(i4*i4) +++ power = _mm_hadd_ps(input1, input2); ++ ++- // Calculate the natural log power ++- power = logf4(power); +++ // Calculate the natural log power +++ power = logf4(power); ++ ++- // Convert to log10 and multiply by 10.0 ++- power = _mm_mul_ps(power, magScalar); +++ // Convert to log10 and multiply by 10.0 +++ power = _mm_mul_ps(power, 
magScalar); ++ ++- // Store the floating point results ++- _mm_store_ps(destPtr, power); +++ // Store the floating point results +++ _mm_store_ps(destPtr, power); ++ ++- destPtr += 4; ++- } +++ destPtr += 4; +++ } ++ ++- number = quarterPoints*4; +++ number = quarterPoints * 4; ++ #endif /* LV_HAVE_LIB_SIMDMATH */ ++- // Calculate the FFT for any remaining points ++- ++- for(; number < num_points; number++){ ++- // Calculate dBm ++- // 50 ohm load assumption ++- // 10 * log10 (v^2 / (2 * 50.0 * .001)) = 10 * log10( v^2 * 10) ++- // 75 ohm load assumption ++- // 10 * log10 (v^2 / (2 * 75.0 * .001)) = 10 * log10( v^2 * 15) +++ // Calculate the FFT for any remaining points ++ ++- const float real = *inputPtr++ * iNormalizationFactor; ++- const float imag = *inputPtr++ * iNormalizationFactor; +++ for (; number < num_points; number++) { +++ // Calculate dBm +++ // 50 ohm load assumption +++ // 10 * log10 (v^2 / (2 * 50.0 * .001)) = 10 * log10( v^2 * 10) +++ // 75 ohm load assumption +++ // 10 * log10 (v^2 / (2 * 75.0 * .001)) = 10 * log10( v^2 * 15) ++ ++- *destPtr = 10.0*log10f(((real * real) + (imag * imag)) + 1e-20); +++ const float real = *inputPtr++ * iNormalizationFactor; +++ const float imag = *inputPtr++ * iNormalizationFactor; ++ ++- destPtr++; ++- } +++ *destPtr = 10.0 * log10f(((real * real) + (imag * imag)) + 1e-20); ++ +++ destPtr++; +++ } ++ } ++ #endif /* LV_HAVE_SSE3 */ ++ ++@@ -141,7 +142,10 @@ volk_32fc_s32f_power_spectrum_32f_a_sse3(float* logPowerOutput, const lv_32fc_t* ++ #include ++ ++ static inline void ++-volk_32fc_s32f_power_spectrum_32f_neon(float* logPowerOutput, const lv_32fc_t* complexFFTInput, const float normalizationFactor, unsigned int num_points) +++volk_32fc_s32f_power_spectrum_32f_neon(float* logPowerOutput, +++ const lv_32fc_t* complexFFTInput, +++ const float normalizationFactor, +++ unsigned int num_points) ++ { ++ float* logPowerOutputPtr = logPowerOutput; ++ const lv_32fc_t* complexFFTInputPtr = complexFFTInput; ++@@ -151,14 +155,14 @@ volk_32fc_s32f_power_spectrum_32f_neon(float* logPowerOutput, const lv_32fc_t* c ++ float32x4x2_t fft_vec; ++ float32x4_t log_pwr_vec; ++ float32x4_t mag_squared_vec; ++- +++ ++ const float inv_ln10_10 = 4.34294481903f; // 10.0/ln(10.) 
++- ++- for(number = 0; number < quarter_points; number++) { +++ +++ for (number = 0; number < quarter_points; number++) { ++ // Load ++ fft_vec = vld2q_f32((float*)complexFFTInputPtr); ++ // Prefetch next 4 ++- __VOLK_PREFETCH(complexFFTInputPtr+4); +++ __VOLK_PREFETCH(complexFFTInputPtr + 4); ++ // Normalize ++ fft_vec.val[0] = vmulq_n_f32(fft_vec.val[0], iNormalizationFactor); ++ fft_vec.val[1] = vmulq_n_f32(fft_vec.val[1], iNormalizationFactor); ++@@ -167,12 +171,12 @@ volk_32fc_s32f_power_spectrum_32f_neon(float* logPowerOutput, const lv_32fc_t* c ++ // Store ++ vst1q_f32(logPowerOutputPtr, log_pwr_vec); ++ // Move pointers ahead ++- complexFFTInputPtr+=4; ++- logPowerOutputPtr+=4; +++ complexFFTInputPtr += 4; +++ logPowerOutputPtr += 4; ++ } ++- +++ ++ // deal with the rest ++- for(number = quarter_points * 4; number < num_points; number++) { +++ for (number = quarter_points * 4; number < num_points; number++) { ++ const float real = lv_creal(*complexFFTInputPtr) * iNormalizationFactor; ++ const float imag = lv_cimag(*complexFFTInputPtr) * iNormalizationFactor; ++ *logPowerOutputPtr = 10.0 * log10f(((real * real) + (imag * imag)) + 1e-20); ++@@ -186,27 +190,29 @@ volk_32fc_s32f_power_spectrum_32f_neon(float* logPowerOutput, const lv_32fc_t* c ++ #ifdef LV_HAVE_GENERIC ++ ++ static inline void ++-volk_32fc_s32f_power_spectrum_32f_generic(float* logPowerOutput, const lv_32fc_t* complexFFTInput, ++- const float normalizationFactor, unsigned int num_points) +++volk_32fc_s32f_power_spectrum_32f_generic(float* logPowerOutput, +++ const lv_32fc_t* complexFFTInput, +++ const float normalizationFactor, +++ unsigned int num_points) ++ { ++- // Calculate the Power of the complex point ++- const float* inputPtr = (float*)complexFFTInput; ++- float* realFFTDataPointsPtr = logPowerOutput; ++- const float iNormalizationFactor = 1.0 / normalizationFactor; ++- unsigned int point; ++- for(point = 0; point < num_points; point++){ ++- // Calculate dBm ++- // 50 ohm load assumption ++- // 10 * log10 (v^2 / (2 * 50.0 * .001)) = 10 * log10( v^2 * 10) ++- // 75 ohm load assumption ++- // 10 * log10 (v^2 / (2 * 75.0 * .001)) = 10 * log10( v^2 * 15) ++- ++- const float real = *inputPtr++ * iNormalizationFactor; ++- const float imag = *inputPtr++ * iNormalizationFactor; ++- ++- *realFFTDataPointsPtr = 10.0*log10f(((real * real) + (imag * imag)) + 1e-20); ++- realFFTDataPointsPtr++; ++- } +++ // Calculate the Power of the complex point +++ const float* inputPtr = (float*)complexFFTInput; +++ float* realFFTDataPointsPtr = logPowerOutput; +++ const float iNormalizationFactor = 1.0 / normalizationFactor; +++ unsigned int point; +++ for (point = 0; point < num_points; point++) { +++ // Calculate dBm +++ // 50 ohm load assumption +++ // 10 * log10 (v^2 / (2 * 50.0 * .001)) = 10 * log10( v^2 * 10) +++ // 75 ohm load assumption +++ // 10 * log10 (v^2 / (2 * 75.0 * .001)) = 10 * log10( v^2 * 15) +++ +++ const float real = *inputPtr++ * iNormalizationFactor; +++ const float imag = *inputPtr++ * iNormalizationFactor; +++ +++ *realFFTDataPointsPtr = 10.0 * log10f(((real * real) + (imag * imag)) + 1e-20); +++ realFFTDataPointsPtr++; +++ } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++diff --git a/kernels/volk/volk_32fc_s32f_x2_power_spectral_density_32f.h b/kernels/volk/volk_32fc_s32f_x2_power_spectral_density_32f.h ++index 3260b08..37ca43c 100644 ++--- a/kernels/volk/volk_32fc_s32f_x2_power_spectral_density_32f.h +++++ b/kernels/volk/volk_32fc_s32f_x2_power_spectral_density_32f.h ++@@ -29,14 +29,15 @@ ++ * ++ * Dispatcher 
Prototype ++ * \code ++- * void volk_32fc_s32f_x2_power_spectral_density_32f(float* logPowerOutput, const lv_32fc_t* complexFFTInput, const float normalizationFactor, const float rbw, unsigned int num_points) ++- * \endcode +++ * void volk_32fc_s32f_x2_power_spectral_density_32f(float* logPowerOutput, const +++ * lv_32fc_t* complexFFTInput, const float normalizationFactor, const float rbw, unsigned +++ * int num_points) \endcode ++ * ++ * \b Inputs ++ * \li complexFFTInput The complex data output from the FFT point. ++- * \li normalizationFactor: This value is divided against all the input values before the power is calculated. ++- * \li rbw: The resolution bandwidth of the fft spectrum ++- * \li num_points: The number of fft data points. +++ * \li normalizationFactor: This value is divided against all the input values before the +++ * power is calculated. \li rbw: The resolution bandwidth of the fft spectrum \li +++ * num_points: The number of fft data points. ++ * ++ * \b Outputs ++ * \li logPowerOutput: The 10.0 * log10((r*r + i*i)/RBW) for each data point. ++@@ -55,8 +56,8 @@ ++ #define INCLUDED_volk_32fc_s32f_x2_power_spectral_density_32f_a_H ++ ++ #include ++-#include ++ #include +++#include ++ ++ #ifdef LV_HAVE_AVX ++ #include ++@@ -66,83 +67,84 @@ ++ #endif /* LV_HAVE_LIB_SIMDMATH */ ++ ++ static inline void ++-volk_32fc_s32f_x2_power_spectral_density_32f_a_avx(float* logPowerOutput, const lv_32fc_t* complexFFTInput, ++- const float normalizationFactor, const float rbw, +++volk_32fc_s32f_x2_power_spectral_density_32f_a_avx(float* logPowerOutput, +++ const lv_32fc_t* complexFFTInput, +++ const float normalizationFactor, +++ const float rbw, ++ unsigned int num_points) ++ { ++- const float* inputPtr = (const float*)complexFFTInput; ++- float* destPtr = logPowerOutput; ++- uint64_t number = 0; ++- const float iRBW = 1.0 / rbw; ++- const float iNormalizationFactor = 1.0 / normalizationFactor; +++ const float* inputPtr = (const float*)complexFFTInput; +++ float* destPtr = logPowerOutput; +++ uint64_t number = 0; +++ const float iRBW = 1.0 / rbw; +++ const float iNormalizationFactor = 1.0 / normalizationFactor; ++ ++ #ifdef LV_HAVE_LIB_SIMDMATH ++- __m256 magScalar = _mm256_set1_ps(10.0); ++- magScalar = _mm256_div_ps(magScalar, logf4(magScalar)); +++ __m256 magScalar = _mm256_set1_ps(10.0); +++ magScalar = _mm256_div_ps(magScalar, logf4(magScalar)); ++ ++- __m256 invRBW = _mm256_set1_ps(iRBW); +++ __m256 invRBW = _mm256_set1_ps(iRBW); ++ ++- __m256 invNormalizationFactor = _mm256_set1_ps(iNormalizationFactor); +++ __m256 invNormalizationFactor = _mm256_set1_ps(iNormalizationFactor); ++ ++- __m256 power; ++- __m256 input1, input2; ++- const uint64_t eighthPoints = num_points / 8; ++- for(;number < eighthPoints; number++){ ++- // Load the complex values ++- input1 =_mm256_load_ps(inputPtr); ++- inputPtr += 8; ++- input2 =_mm256_load_ps(inputPtr); ++- inputPtr += 8; +++ __m256 power; +++ __m256 input1, input2; +++ const uint64_t eighthPoints = num_points / 8; +++ for (; number < eighthPoints; number++) { +++ // Load the complex values +++ input1 = _mm256_load_ps(inputPtr); +++ inputPtr += 8; +++ input2 = _mm256_load_ps(inputPtr); +++ inputPtr += 8; ++ ++- // Apply the normalization factor ++- input1 = _mm256_mul_ps(input1, invNormalizationFactor); ++- input2 = _mm256_mul_ps(input2, invNormalizationFactor); +++ // Apply the normalization factor +++ input1 = _mm256_mul_ps(input1, invNormalizationFactor); +++ input2 = _mm256_mul_ps(input2, invNormalizationFactor); ++ ++- // Multiply each 
value by itself ++- // (r1*r1), (i1*i1), (r2*r2), (i2*i2) ++- input1 = _mm256_mul_ps(input1, input1); ++- // (r3*r3), (i3*i3), (r4*r4), (i4*i4) ++- input2 = _mm256_mul_ps(input2, input2); +++ // Multiply each value by itself +++ // (r1*r1), (i1*i1), (r2*r2), (i2*i2) +++ input1 = _mm256_mul_ps(input1, input1); +++ // (r3*r3), (i3*i3), (r4*r4), (i4*i4) +++ input2 = _mm256_mul_ps(input2, input2); ++ ++- // Horizontal add, to add (r*r) + (i*i) for each complex value ++- // (r1*r1)+(i1*i1), (r2*r2) + (i2*i2), (r3*r3)+(i3*i3), (r4*r4)+(i4*i4) ++- inputVal1 = _mm256_permute2f128_ps(input1, input2, 0x20); ++- inputVal2 = _mm256_permute2f128_ps(input1, input2, 0x31); +++ // Horizontal add, to add (r*r) + (i*i) for each complex value +++ // (r1*r1)+(i1*i1), (r2*r2) + (i2*i2), (r3*r3)+(i3*i3), (r4*r4)+(i4*i4) +++ inputVal1 = _mm256_permute2f128_ps(input1, input2, 0x20); +++ inputVal2 = _mm256_permute2f128_ps(input1, input2, 0x31); ++ ++- power = _mm256_hadd_ps(inputVal1, inputVal2); +++ power = _mm256_hadd_ps(inputVal1, inputVal2); ++ ++- // Divide by the rbw ++- power = _mm256_mul_ps(power, invRBW); +++ // Divide by the rbw +++ power = _mm256_mul_ps(power, invRBW); ++ ++- // Calculate the natural log power ++- power = logf4(power); +++ // Calculate the natural log power +++ power = logf4(power); ++ ++- // Convert to log10 and multiply by 10.0 ++- power = _mm256_mul_ps(power, magScalar); +++ // Convert to log10 and multiply by 10.0 +++ power = _mm256_mul_ps(power, magScalar); ++ ++- // Store the floating point results ++- _mm256_store_ps(destPtr, power); +++ // Store the floating point results +++ _mm256_store_ps(destPtr, power); ++ ++- destPtr += 8; ++- } +++ destPtr += 8; +++ } ++ ++- number = eighthPoints*8; +++ number = eighthPoints * 8; ++ #endif /* LV_HAVE_LIB_SIMDMATH */ ++- // Calculate the FFT for any remaining points ++- for(; number < num_points; number++){ ++- // Calculate dBm ++- // 50 ohm load assumption ++- // 10 * log10 (v^2 / (2 * 50.0 * .001)) = 10 * log10( v^2 * 10) ++- // 75 ohm load assumption ++- // 10 * log10 (v^2 / (2 * 75.0 * .001)) = 10 * log10( v^2 * 15) ++- ++- const float real = *inputPtr++ * iNormalizationFactor; ++- const float imag = *inputPtr++ * iNormalizationFactor; ++- ++- *destPtr = 10.0*log10f((((real * real) + (imag * imag)) + 1e-20) * iRBW); ++- destPtr++; ++- } ++- +++ // Calculate the FFT for any remaining points +++ for (; number < num_points; number++) { +++ // Calculate dBm +++ // 50 ohm load assumption +++ // 10 * log10 (v^2 / (2 * 50.0 * .001)) = 10 * log10( v^2 * 10) +++ // 75 ohm load assumption +++ // 10 * log10 (v^2 / (2 * 75.0 * .001)) = 10 * log10( v^2 * 15) +++ +++ const float real = *inputPtr++ * iNormalizationFactor; +++ const float imag = *inputPtr++ * iNormalizationFactor; +++ +++ *destPtr = 10.0 * log10f((((real * real) + (imag * imag)) + 1e-20) * iRBW); +++ destPtr++; +++ } ++ } ++ #endif /* LV_HAVE_AVX */ ++ ++@@ -150,86 +152,86 @@ volk_32fc_s32f_x2_power_spectral_density_32f_a_avx(float* logPowerOutput, const ++ #include ++ ++ ++- ++ #ifdef LV_HAVE_LIB_SIMDMATH ++ #include ++ #endif /* LV_HAVE_LIB_SIMDMATH */ ++ ++ static inline void ++-volk_32fc_s32f_x2_power_spectral_density_32f_a_sse3(float* logPowerOutput, const lv_32fc_t* complexFFTInput, ++- const float normalizationFactor, const float rbw, +++volk_32fc_s32f_x2_power_spectral_density_32f_a_sse3(float* logPowerOutput, +++ const lv_32fc_t* complexFFTInput, +++ const float normalizationFactor, +++ const float rbw, ++ unsigned int num_points) ++ { ++- const float* inputPtr = (const 
float*)complexFFTInput; ++- float* destPtr = logPowerOutput; ++- uint64_t number = 0; ++- const float iRBW = 1.0 / rbw; ++- const float iNormalizationFactor = 1.0 / normalizationFactor; +++ const float* inputPtr = (const float*)complexFFTInput; +++ float* destPtr = logPowerOutput; +++ uint64_t number = 0; +++ const float iRBW = 1.0 / rbw; +++ const float iNormalizationFactor = 1.0 / normalizationFactor; ++ ++ #ifdef LV_HAVE_LIB_SIMDMATH ++- __m128 magScalar = _mm_set_ps1(10.0); ++- magScalar = _mm_div_ps(magScalar, logf4(magScalar)); +++ __m128 magScalar = _mm_set_ps1(10.0); +++ magScalar = _mm_div_ps(magScalar, logf4(magScalar)); ++ ++- __m128 invRBW = _mm_set_ps1(iRBW); +++ __m128 invRBW = _mm_set_ps1(iRBW); ++ ++- __m128 invNormalizationFactor = _mm_set_ps1(iNormalizationFactor); +++ __m128 invNormalizationFactor = _mm_set_ps1(iNormalizationFactor); ++ ++- __m128 power; ++- __m128 input1, input2; ++- const uint64_t quarterPoints = num_points / 4; ++- for(;number < quarterPoints; number++){ ++- // Load the complex values ++- input1 =_mm_load_ps(inputPtr); ++- inputPtr += 4; ++- input2 =_mm_load_ps(inputPtr); ++- inputPtr += 4; +++ __m128 power; +++ __m128 input1, input2; +++ const uint64_t quarterPoints = num_points / 4; +++ for (; number < quarterPoints; number++) { +++ // Load the complex values +++ input1 = _mm_load_ps(inputPtr); +++ inputPtr += 4; +++ input2 = _mm_load_ps(inputPtr); +++ inputPtr += 4; ++ ++- // Apply the normalization factor ++- input1 = _mm_mul_ps(input1, invNormalizationFactor); ++- input2 = _mm_mul_ps(input2, invNormalizationFactor); +++ // Apply the normalization factor +++ input1 = _mm_mul_ps(input1, invNormalizationFactor); +++ input2 = _mm_mul_ps(input2, invNormalizationFactor); ++ ++- // Multiply each value by itself ++- // (r1*r1), (i1*i1), (r2*r2), (i2*i2) ++- input1 = _mm_mul_ps(input1, input1); ++- // (r3*r3), (i3*i3), (r4*r4), (i4*i4) ++- input2 = _mm_mul_ps(input2, input2); +++ // Multiply each value by itself +++ // (r1*r1), (i1*i1), (r2*r2), (i2*i2) +++ input1 = _mm_mul_ps(input1, input1); +++ // (r3*r3), (i3*i3), (r4*r4), (i4*i4) +++ input2 = _mm_mul_ps(input2, input2); ++ ++- // Horizontal add, to add (r*r) + (i*i) for each complex value ++- // (r1*r1)+(i1*i1), (r2*r2) + (i2*i2), (r3*r3)+(i3*i3), (r4*r4)+(i4*i4) ++- power = _mm_hadd_ps(input1, input2); +++ // Horizontal add, to add (r*r) + (i*i) for each complex value +++ // (r1*r1)+(i1*i1), (r2*r2) + (i2*i2), (r3*r3)+(i3*i3), (r4*r4)+(i4*i4) +++ power = _mm_hadd_ps(input1, input2); ++ ++- // Divide by the rbw ++- power = _mm_mul_ps(power, invRBW); +++ // Divide by the rbw +++ power = _mm_mul_ps(power, invRBW); ++ ++- // Calculate the natural log power ++- power = logf4(power); +++ // Calculate the natural log power +++ power = logf4(power); ++ ++- // Convert to log10 and multiply by 10.0 ++- power = _mm_mul_ps(power, magScalar); +++ // Convert to log10 and multiply by 10.0 +++ power = _mm_mul_ps(power, magScalar); ++ ++- // Store the floating point results ++- _mm_store_ps(destPtr, power); +++ // Store the floating point results +++ _mm_store_ps(destPtr, power); ++ ++- destPtr += 4; ++- } +++ destPtr += 4; +++ } ++ ++- number = quarterPoints*4; +++ number = quarterPoints * 4; ++ #endif /* LV_HAVE_LIB_SIMDMATH */ ++- // Calculate the FFT for any remaining points ++- for(; number < num_points; number++){ ++- // Calculate dBm ++- // 50 ohm load assumption ++- // 10 * log10 (v^2 / (2 * 50.0 * .001)) = 10 * log10( v^2 * 10) ++- // 75 ohm load assumption ++- // 10 * log10 (v^2 / (2 * 75.0 * .001)) = 10 * 
log10( v^2 * 15) ++- ++- const float real = *inputPtr++ * iNormalizationFactor; ++- const float imag = *inputPtr++ * iNormalizationFactor; ++- ++- *destPtr = 10.0*log10f((((real * real) + (imag * imag)) + 1e-20) * iRBW); ++- destPtr++; ++- } ++- +++ // Calculate the FFT for any remaining points +++ for (; number < num_points; number++) { +++ // Calculate dBm +++ // 50 ohm load assumption +++ // 10 * log10 (v^2 / (2 * 50.0 * .001)) = 10 * log10( v^2 * 10) +++ // 75 ohm load assumption +++ // 10 * log10 (v^2 / (2 * 75.0 * .001)) = 10 * log10( v^2 * 15) +++ +++ const float real = *inputPtr++ * iNormalizationFactor; +++ const float imag = *inputPtr++ * iNormalizationFactor; +++ +++ *destPtr = 10.0 * log10f((((real * real) + (imag * imag)) + 1e-20) * iRBW); +++ destPtr++; +++ } ++ } ++ #endif /* LV_HAVE_SSE3 */ ++ ++@@ -237,31 +239,34 @@ volk_32fc_s32f_x2_power_spectral_density_32f_a_sse3(float* logPowerOutput, const ++ #ifdef LV_HAVE_GENERIC ++ ++ static inline void ++-volk_32fc_s32f_x2_power_spectral_density_32f_generic(float* logPowerOutput, const lv_32fc_t* complexFFTInput, ++- const float normalizationFactor, const float rbw, +++volk_32fc_s32f_x2_power_spectral_density_32f_generic(float* logPowerOutput, +++ const lv_32fc_t* complexFFTInput, +++ const float normalizationFactor, +++ const float rbw, ++ unsigned int num_points) ++ { ++- // Calculate the Power of the complex point ++- const float* inputPtr = (float*)complexFFTInput; ++- float* realFFTDataPointsPtr = logPowerOutput; ++- unsigned int point; ++- const float invRBW = 1.0 / rbw; ++- const float iNormalizationFactor = 1.0 / normalizationFactor; ++- ++- for(point = 0; point < num_points; point++){ ++- // Calculate dBm ++- // 50 ohm load assumption ++- // 10 * log10 (v^2 / (2 * 50.0 * .001)) = 10 * log10( v^2 * 10) ++- // 75 ohm load assumption ++- // 10 * log10 (v^2 / (2 * 75.0 * .001)) = 10 * log10( v^2 * 15) ++- ++- const float real = *inputPtr++ * iNormalizationFactor; ++- const float imag = *inputPtr++ * iNormalizationFactor; ++- ++- *realFFTDataPointsPtr = 10.0*log10f((((real * real) + (imag * imag)) + 1e-20) * invRBW); ++- ++- realFFTDataPointsPtr++; ++- } +++ // Calculate the Power of the complex point +++ const float* inputPtr = (float*)complexFFTInput; +++ float* realFFTDataPointsPtr = logPowerOutput; +++ unsigned int point; +++ const float invRBW = 1.0 / rbw; +++ const float iNormalizationFactor = 1.0 / normalizationFactor; +++ +++ for (point = 0; point < num_points; point++) { +++ // Calculate dBm +++ // 50 ohm load assumption +++ // 10 * log10 (v^2 / (2 * 50.0 * .001)) = 10 * log10( v^2 * 10) +++ // 75 ohm load assumption +++ // 10 * log10 (v^2 / (2 * 75.0 * .001)) = 10 * log10( v^2 * 15) +++ +++ const float real = *inputPtr++ * iNormalizationFactor; +++ const float imag = *inputPtr++ * iNormalizationFactor; +++ +++ *realFFTDataPointsPtr = +++ 10.0 * log10f((((real * real) + (imag * imag)) + 1e-20) * invRBW); +++ +++ realFFTDataPointsPtr++; +++ } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++diff --git a/kernels/volk/volk_32fc_s32fc_multiply_32fc.h b/kernels/volk/volk_32fc_s32fc_multiply_32fc.h ++index fe416b4..840008a 100644 ++--- a/kernels/volk/volk_32fc_s32fc_multiply_32fc.h +++++ b/kernels/volk/volk_32fc_s32fc_multiply_32fc.h ++@@ -30,8 +30,8 @@ ++ * ++ * Dispatcher Prototype ++ * \code ++- * void volk_32fc_s32fc_multiply_32fc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points); ++- * \endcode +++ * void volk_32fc_s32fc_multiply_32fc(lv_32fc_t* cVector, const lv_32fc_t* aVector, 
const +++ * lv_32fc_t scalar, unsigned int num_points); \endcode ++ * ++ * \b Inputs ++ * \li aVector: The input vector to be multiplied. ++@@ -76,15 +76,19 @@ ++ #ifndef INCLUDED_volk_32fc_s32fc_multiply_32fc_u_H ++ #define INCLUDED_volk_32fc_s32fc_multiply_32fc_u_H ++ +++#include ++ #include ++ #include ++ #include ++-#include ++ ++ #if LV_HAVE_AVX && LV_HAVE_FMA ++ #include ++ ++-static inline void volk_32fc_s32fc_multiply_32fc_u_avx_fma(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){ +++static inline void volk_32fc_s32fc_multiply_32fc_u_avx_fma(lv_32fc_t* cVector, +++ const lv_32fc_t* aVector, +++ const lv_32fc_t scalar, +++ unsigned int num_points) +++{ ++ unsigned int number = 0; ++ unsigned int i = 0; ++ const unsigned int quarterPoints = num_points / 4; ++@@ -97,34 +101,38 @@ static inline void volk_32fc_s32fc_multiply_32fc_u_avx_fma(lv_32fc_t* cVector, c ++ yl = _mm256_set1_ps(lv_creal(scalar)); ++ yh = _mm256_set1_ps(lv_cimag(scalar)); ++ ++- for(;number < quarterPoints; number++){ ++- x = _mm256_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi +++ for (; number < quarterPoints; number++) { +++ x = _mm256_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi ++ ++- tmp1 = x; +++ tmp1 = x; ++ ++- x = _mm256_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br +++ x = _mm256_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br ++ ++- tmp2 = _mm256_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di +++ tmp2 = _mm256_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di ++ ++- z = _mm256_fmaddsub_ps(tmp1, yl,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di +++ z = _mm256_fmaddsub_ps( +++ tmp1, yl, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di ++ ++- _mm256_storeu_ps((float*)c,z); // Store the results back into the C container +++ _mm256_storeu_ps((float*)c, z); // Store the results back into the C container ++ ++- a += 4; ++- c += 4; +++ a += 4; +++ c += 4; ++ } ++ ++- for(i = num_points-isodd; i < num_points; i++) { +++ for (i = num_points - isodd; i < num_points; i++) { ++ *c++ = (*a++) * scalar; ++ } ++- ++ } ++ #endif /* LV_HAVE_AVX && LV_HAVE_FMA */ ++ ++ #ifdef LV_HAVE_AVX ++ #include ++ ++-static inline void volk_32fc_s32fc_multiply_32fc_u_avx(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){ +++static inline void volk_32fc_s32fc_multiply_32fc_u_avx(lv_32fc_t* cVector, +++ const lv_32fc_t* aVector, +++ const lv_32fc_t scalar, +++ unsigned int num_points) +++{ ++ unsigned int number = 0; ++ unsigned int i = 0; ++ const unsigned int quarterPoints = num_points / 4; ++@@ -137,35 +145,39 @@ static inline void volk_32fc_s32fc_multiply_32fc_u_avx(lv_32fc_t* cVector, const ++ yl = _mm256_set1_ps(lv_creal(scalar)); ++ yh = _mm256_set1_ps(lv_cimag(scalar)); ++ ++- for(;number < quarterPoints; number++){ ++- x = _mm256_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi +++ for (; number < quarterPoints; number++) { +++ x = _mm256_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi ++ ++- tmp1 = _mm256_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr +++ tmp1 = _mm256_mul_ps(x, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr ++ ++- x = _mm256_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br +++ x = _mm256_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br ++ ++- tmp2 = _mm256_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di +++ tmp2 = _mm256_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di ++ ++- z = 
_mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di +++ z = _mm256_addsub_ps(tmp1, +++ tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di ++ ++- _mm256_storeu_ps((float*)c,z); // Store the results back into the C container +++ _mm256_storeu_ps((float*)c, z); // Store the results back into the C container ++ ++- a += 4; ++- c += 4; +++ a += 4; +++ c += 4; ++ } ++ ++- for(i = num_points-isodd; i < num_points; i++) { +++ for (i = num_points - isodd; i < num_points; i++) { ++ *c++ = (*a++) * scalar; ++ } ++- ++ } ++ #endif /* LV_HAVE_AVX */ ++ ++ #ifdef LV_HAVE_SSE3 ++ #include ++ ++-static inline void volk_32fc_s32fc_multiply_32fc_u_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){ ++- unsigned int number = 0; +++static inline void volk_32fc_s32fc_multiply_32fc_u_sse3(lv_32fc_t* cVector, +++ const lv_32fc_t* aVector, +++ const lv_32fc_t scalar, +++ unsigned int num_points) +++{ +++ unsigned int number = 0; ++ const unsigned int halfPoints = num_points / 2; ++ ++ __m128 x, yl, yh, z, tmp1, tmp2; ++@@ -176,53 +188,58 @@ static inline void volk_32fc_s32fc_multiply_32fc_u_sse3(lv_32fc_t* cVector, cons ++ yl = _mm_set_ps1(lv_creal(scalar)); ++ yh = _mm_set_ps1(lv_cimag(scalar)); ++ ++- for(;number < halfPoints; number++){ +++ for (; number < halfPoints; number++) { ++ ++- x = _mm_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi +++ x = _mm_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi ++ ++- tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr +++ tmp1 = _mm_mul_ps(x, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr ++ ++- x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br +++ x = _mm_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br ++ ++- tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di +++ tmp2 = _mm_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di ++ ++- z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di +++ z = _mm_addsub_ps(tmp1, +++ tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di ++ ++- _mm_storeu_ps((float*)c,z); // Store the results back into the C container +++ _mm_storeu_ps((float*)c, z); // Store the results back into the C container ++ ++- a += 2; ++- c += 2; +++ a += 2; +++ c += 2; ++ } ++ ++- if((num_points % 2) != 0) { ++- *c = (*a) * scalar; +++ if ((num_points % 2) != 0) { +++ *c = (*a) * scalar; ++ } ++ } ++ #endif /* LV_HAVE_SSE */ ++ ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void volk_32fc_s32fc_multiply_32fc_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){ +++static inline void volk_32fc_s32fc_multiply_32fc_generic(lv_32fc_t* cVector, +++ const lv_32fc_t* aVector, +++ const lv_32fc_t scalar, +++ unsigned int num_points) +++{ ++ lv_32fc_t* cPtr = cVector; ++ const lv_32fc_t* aPtr = aVector; ++ unsigned int number = num_points; ++ ++ // unwrap loop ++- while (number >= 8){ ++- *cPtr++ = (*aPtr++) * scalar; ++- *cPtr++ = (*aPtr++) * scalar; ++- *cPtr++ = (*aPtr++) * scalar; ++- *cPtr++ = (*aPtr++) * scalar; ++- *cPtr++ = (*aPtr++) * scalar; ++- *cPtr++ = (*aPtr++) * scalar; ++- *cPtr++ = (*aPtr++) * scalar; ++- *cPtr++ = (*aPtr++) * scalar; ++- number -= 8; +++ while (number >= 8) { +++ *cPtr++ = (*aPtr++) * scalar; +++ *cPtr++ = (*aPtr++) * scalar; +++ *cPtr++ = (*aPtr++) * scalar; +++ *cPtr++ = (*aPtr++) * scalar; +++ *cPtr++ = (*aPtr++) * scalar; +++ *cPtr++ = (*aPtr++) * scalar; +++ *cPtr++ = (*aPtr++) * scalar; +++ 
*cPtr++ = (*aPtr++) * scalar; +++ number -= 8; ++ } ++ ++ // clean up any remaining ++ while (number-- > 0) ++- *cPtr++ = *aPtr++ * scalar; +++ *cPtr++ = *aPtr++ * scalar; ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++@@ -231,15 +248,19 @@ static inline void volk_32fc_s32fc_multiply_32fc_generic(lv_32fc_t* cVector, con ++ #ifndef INCLUDED_volk_32fc_s32fc_multiply_32fc_a_H ++ #define INCLUDED_volk_32fc_s32fc_multiply_32fc_a_H ++ +++#include ++ #include ++ #include ++ #include ++-#include ++ ++ #if LV_HAVE_AVX && LV_HAVE_FMA ++ #include ++ ++-static inline void volk_32fc_s32fc_multiply_32fc_a_avx_fma(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){ +++static inline void volk_32fc_s32fc_multiply_32fc_a_avx_fma(lv_32fc_t* cVector, +++ const lv_32fc_t* aVector, +++ const lv_32fc_t scalar, +++ unsigned int num_points) +++{ ++ unsigned int number = 0; ++ unsigned int i = 0; ++ const unsigned int quarterPoints = num_points / 4; ++@@ -252,27 +273,27 @@ static inline void volk_32fc_s32fc_multiply_32fc_a_avx_fma(lv_32fc_t* cVector, c ++ yl = _mm256_set1_ps(lv_creal(scalar)); ++ yh = _mm256_set1_ps(lv_cimag(scalar)); ++ ++- for(;number < quarterPoints; number++){ ++- x = _mm256_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi +++ for (; number < quarterPoints; number++) { +++ x = _mm256_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi ++ ++- tmp1 = x; +++ tmp1 = x; ++ ++- x = _mm256_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br +++ x = _mm256_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br ++ ++- tmp2 = _mm256_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di +++ tmp2 = _mm256_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di ++ ++- z = _mm256_fmaddsub_ps(tmp1, yl,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di +++ z = _mm256_fmaddsub_ps( +++ tmp1, yl, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di ++ ++- _mm256_store_ps((float*)c,z); // Store the results back into the C container +++ _mm256_store_ps((float*)c, z); // Store the results back into the C container ++ ++- a += 4; ++- c += 4; +++ a += 4; +++ c += 4; ++ } ++ ++- for(i = num_points-isodd; i < num_points; i++) { +++ for (i = num_points - isodd; i < num_points; i++) { ++ *c++ = (*a++) * scalar; ++ } ++- ++ } ++ #endif /* LV_HAVE_AVX && LV_HAVE_FMA */ ++ ++@@ -280,7 +301,11 @@ static inline void volk_32fc_s32fc_multiply_32fc_a_avx_fma(lv_32fc_t* cVector, c ++ #ifdef LV_HAVE_AVX ++ #include ++ ++-static inline void volk_32fc_s32fc_multiply_32fc_a_avx(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){ +++static inline void volk_32fc_s32fc_multiply_32fc_a_avx(lv_32fc_t* cVector, +++ const lv_32fc_t* aVector, +++ const lv_32fc_t scalar, +++ unsigned int num_points) +++{ ++ unsigned int number = 0; ++ unsigned int i = 0; ++ const unsigned int quarterPoints = num_points / 4; ++@@ -293,35 +318,39 @@ static inline void volk_32fc_s32fc_multiply_32fc_a_avx(lv_32fc_t* cVector, const ++ yl = _mm256_set1_ps(lv_creal(scalar)); ++ yh = _mm256_set1_ps(lv_cimag(scalar)); ++ ++- for(;number < quarterPoints; number++){ ++- x = _mm256_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi +++ for (; number < quarterPoints; number++) { +++ x = _mm256_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi ++ ++- tmp1 = _mm256_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr +++ tmp1 = _mm256_mul_ps(x, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr ++ ++- x = _mm256_shuffle_ps(x,x,0xB1); // 
Re-arrange x to be ai,ar,bi,br +++ x = _mm256_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br ++ ++- tmp2 = _mm256_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di +++ tmp2 = _mm256_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di ++ ++- z = _mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di +++ z = _mm256_addsub_ps(tmp1, +++ tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di ++ ++- _mm256_store_ps((float*)c,z); // Store the results back into the C container +++ _mm256_store_ps((float*)c, z); // Store the results back into the C container ++ ++- a += 4; ++- c += 4; +++ a += 4; +++ c += 4; ++ } ++ ++- for(i = num_points-isodd; i < num_points; i++) { +++ for (i = num_points - isodd; i < num_points; i++) { ++ *c++ = (*a++) * scalar; ++ } ++- ++ } ++ #endif /* LV_HAVE_AVX */ ++ ++ #ifdef LV_HAVE_SSE3 ++ #include ++ ++-static inline void volk_32fc_s32fc_multiply_32fc_a_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){ ++- unsigned int number = 0; +++static inline void volk_32fc_s32fc_multiply_32fc_a_sse3(lv_32fc_t* cVector, +++ const lv_32fc_t* aVector, +++ const lv_32fc_t scalar, +++ unsigned int num_points) +++{ +++ unsigned int number = 0; ++ const unsigned int halfPoints = num_points / 2; ++ ++ __m128 x, yl, yh, z, tmp1, tmp2; ++@@ -332,26 +361,27 @@ static inline void volk_32fc_s32fc_multiply_32fc_a_sse3(lv_32fc_t* cVector, cons ++ yl = _mm_set_ps1(lv_creal(scalar)); ++ yh = _mm_set_ps1(lv_cimag(scalar)); ++ ++- for(;number < halfPoints; number++){ +++ for (; number < halfPoints; number++) { ++ ++- x = _mm_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi +++ x = _mm_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi ++ ++- tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr +++ tmp1 = _mm_mul_ps(x, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr ++ ++- x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br +++ x = _mm_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br ++ ++- tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di +++ tmp2 = _mm_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di ++ ++- z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di +++ z = _mm_addsub_ps(tmp1, +++ tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di ++ ++- _mm_store_ps((float*)c,z); // Store the results back into the C container +++ _mm_store_ps((float*)c, z); // Store the results back into the C container ++ ++- a += 2; ++- c += 2; +++ a += 2; +++ c += 2; ++ } ++ ++- if((num_points % 2) != 0) { ++- *c = (*a) * scalar; +++ if ((num_points % 2) != 0) { +++ *c = (*a) * scalar; ++ } ++ } ++ #endif /* LV_HAVE_SSE */ ++@@ -359,7 +389,11 @@ static inline void volk_32fc_s32fc_multiply_32fc_a_sse3(lv_32fc_t* cVector, cons ++ #ifdef LV_HAVE_NEON ++ #include ++ ++-static inline void volk_32fc_s32fc_multiply_32fc_neon(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){ +++static inline void volk_32fc_s32fc_multiply_32fc_neon(lv_32fc_t* cVector, +++ const lv_32fc_t* aVector, +++ const lv_32fc_t scalar, +++ unsigned int num_points) +++{ ++ lv_32fc_t* cPtr = cVector; ++ const lv_32fc_t* aPtr = aVector; ++ unsigned int number = num_points; ++@@ -370,7 +404,7 @@ static inline void volk_32fc_s32fc_multiply_32fc_neon(lv_32fc_t* cVector, const ++ ++ scalar_val.val[0] = vld1q_dup_f32((const float*)&scalar); ++ scalar_val.val[1] = vld1q_dup_f32(((const float*)&scalar) + 1); ++- for(number = 0; 
number < quarter_points; ++number) { +++ for (number = 0; number < quarter_points; ++number) { ++ a_val = vld2q_f32((float*)aPtr); ++ tmp_imag.val[1] = vmulq_f32(a_val.val[1], scalar_val.val[0]); ++ tmp_imag.val[0] = vmulq_f32(a_val.val[0], scalar_val.val[0]); ++@@ -383,35 +417,39 @@ static inline void volk_32fc_s32fc_multiply_32fc_neon(lv_32fc_t* cVector, const ++ cPtr += 4; ++ } ++ ++- for(number = quarter_points*4; number < num_points; number++){ ++- *cPtr++ = *aPtr++ * scalar; +++ for (number = quarter_points * 4; number < num_points; number++) { +++ *cPtr++ = *aPtr++ * scalar; ++ } ++ } ++ #endif /* LV_HAVE_NEON */ ++ ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void volk_32fc_s32fc_multiply_32fc_a_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){ +++static inline void volk_32fc_s32fc_multiply_32fc_a_generic(lv_32fc_t* cVector, +++ const lv_32fc_t* aVector, +++ const lv_32fc_t scalar, +++ unsigned int num_points) +++{ ++ lv_32fc_t* cPtr = cVector; ++ const lv_32fc_t* aPtr = aVector; ++ unsigned int number = num_points; ++ ++ // unwrap loop ++- while (number >= 8){ ++- *cPtr++ = (*aPtr++) * scalar; ++- *cPtr++ = (*aPtr++) * scalar; ++- *cPtr++ = (*aPtr++) * scalar; ++- *cPtr++ = (*aPtr++) * scalar; ++- *cPtr++ = (*aPtr++) * scalar; ++- *cPtr++ = (*aPtr++) * scalar; ++- *cPtr++ = (*aPtr++) * scalar; ++- *cPtr++ = (*aPtr++) * scalar; ++- number -= 8; +++ while (number >= 8) { +++ *cPtr++ = (*aPtr++) * scalar; +++ *cPtr++ = (*aPtr++) * scalar; +++ *cPtr++ = (*aPtr++) * scalar; +++ *cPtr++ = (*aPtr++) * scalar; +++ *cPtr++ = (*aPtr++) * scalar; +++ *cPtr++ = (*aPtr++) * scalar; +++ *cPtr++ = (*aPtr++) * scalar; +++ *cPtr++ = (*aPtr++) * scalar; +++ number -= 8; ++ } ++ ++ // clean up any remaining ++ while (number-- > 0) ++- *cPtr++ = *aPtr++ * scalar; +++ *cPtr++ = *aPtr++ * scalar; ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++diff --git a/kernels/volk/volk_32fc_s32fc_rotatorpuppet_32fc.h b/kernels/volk/volk_32fc_s32fc_rotatorpuppet_32fc.h ++index 181abc5..eba98fe 100644 ++--- a/kernels/volk/volk_32fc_s32fc_rotatorpuppet_32fc.h +++++ b/kernels/volk/volk_32fc_s32fc_rotatorpuppet_32fc.h ++@@ -25,19 +25,24 @@ ++ #define INCLUDED_volk_32fc_s32fc_rotatorpuppet_32fc_a_H ++ ++ ++-#include ++ #include ++ #include +++#include ++ ++ ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void volk_32fc_s32fc_rotatorpuppet_32fc_generic(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, unsigned int num_points){ ++- lv_32fc_t phase[1] = {lv_cmake(.3, 0.95393)}; +++static inline void volk_32fc_s32fc_rotatorpuppet_32fc_generic(lv_32fc_t* outVector, +++ const lv_32fc_t* inVector, +++ const lv_32fc_t phase_inc, +++ unsigned int num_points) +++{ +++ lv_32fc_t phase[1] = { lv_cmake(.3, 0.95393) }; ++ (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase)); ++- const lv_32fc_t phase_inc_n = phase_inc / hypotf(lv_creal(phase_inc), lv_cimag(phase_inc)); ++- volk_32fc_s32fc_x2_rotator_32fc_generic(outVector, inVector, phase_inc_n, phase, num_points); ++- +++ const lv_32fc_t phase_inc_n = +++ phase_inc / hypotf(lv_creal(phase_inc), lv_cimag(phase_inc)); +++ volk_32fc_s32fc_x2_rotator_32fc_generic( +++ outVector, inVector, phase_inc_n, phase, num_points); ++ } ++ ++ #endif /* LV_HAVE_GENERIC */ ++@@ -47,12 +52,17 @@ static inline void volk_32fc_s32fc_rotatorpuppet_32fc_generic(lv_32fc_t* outVect ++ #include ++ #include ++ ++-static inline void volk_32fc_s32fc_rotatorpuppet_32fc_neon(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t 
phase_inc, unsigned int num_points){ ++- lv_32fc_t phase[1] = {lv_cmake(.3, 0.95393)}; +++static inline void volk_32fc_s32fc_rotatorpuppet_32fc_neon(lv_32fc_t* outVector, +++ const lv_32fc_t* inVector, +++ const lv_32fc_t phase_inc, +++ unsigned int num_points) +++{ +++ lv_32fc_t phase[1] = { lv_cmake(.3, 0.95393) }; ++ (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase)); ++- const lv_32fc_t phase_inc_n = phase_inc / hypotf(lv_creal(phase_inc), lv_cimag(phase_inc)); ++- volk_32fc_s32fc_x2_rotator_32fc_neon(outVector, inVector, phase_inc_n, phase, num_points); ++- +++ const lv_32fc_t phase_inc_n = +++ phase_inc / hypotf(lv_creal(phase_inc), lv_cimag(phase_inc)); +++ volk_32fc_s32fc_x2_rotator_32fc_neon( +++ outVector, inVector, phase_inc_n, phase, num_points); ++ } ++ ++ #endif /* LV_HAVE_NEON */ ++@@ -61,12 +71,17 @@ static inline void volk_32fc_s32fc_rotatorpuppet_32fc_neon(lv_32fc_t* outVector, ++ #ifdef LV_HAVE_SSE4_1 ++ #include ++ ++-static inline void volk_32fc_s32fc_rotatorpuppet_32fc_a_sse4_1(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, unsigned int num_points){ ++- lv_32fc_t phase[1] = {lv_cmake(.3, .95393)}; +++static inline void volk_32fc_s32fc_rotatorpuppet_32fc_a_sse4_1(lv_32fc_t* outVector, +++ const lv_32fc_t* inVector, +++ const lv_32fc_t phase_inc, +++ unsigned int num_points) +++{ +++ lv_32fc_t phase[1] = { lv_cmake(.3, .95393) }; ++ (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase)); ++- const lv_32fc_t phase_inc_n = phase_inc / hypotf(lv_creal(phase_inc), lv_cimag(phase_inc)); ++- volk_32fc_s32fc_x2_rotator_32fc_a_sse4_1(outVector, inVector, phase_inc_n, phase, num_points); ++- +++ const lv_32fc_t phase_inc_n = +++ phase_inc / hypotf(lv_creal(phase_inc), lv_cimag(phase_inc)); +++ volk_32fc_s32fc_x2_rotator_32fc_a_sse4_1( +++ outVector, inVector, phase_inc_n, phase, num_points); ++ } ++ ++ #endif /* LV_HAVE_SSE4_1 */ ++@@ -74,12 +89,17 @@ static inline void volk_32fc_s32fc_rotatorpuppet_32fc_a_sse4_1(lv_32fc_t* outVec ++ ++ #ifdef LV_HAVE_SSE4_1 ++ #include ++-static inline void volk_32fc_s32fc_rotatorpuppet_32fc_u_sse4_1(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, unsigned int num_points){ ++- lv_32fc_t phase[1] = {lv_cmake(.3, .95393)}; +++static inline void volk_32fc_s32fc_rotatorpuppet_32fc_u_sse4_1(lv_32fc_t* outVector, +++ const lv_32fc_t* inVector, +++ const lv_32fc_t phase_inc, +++ unsigned int num_points) +++{ +++ lv_32fc_t phase[1] = { lv_cmake(.3, .95393) }; ++ (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase)); ++- const lv_32fc_t phase_inc_n = phase_inc / hypotf(lv_creal(phase_inc), lv_cimag(phase_inc)); ++- volk_32fc_s32fc_x2_rotator_32fc_u_sse4_1(outVector, inVector, phase_inc_n, phase, num_points); ++- +++ const lv_32fc_t phase_inc_n = +++ phase_inc / hypotf(lv_creal(phase_inc), lv_cimag(phase_inc)); +++ volk_32fc_s32fc_x2_rotator_32fc_u_sse4_1( +++ outVector, inVector, phase_inc_n, phase, num_points); ++ } ++ ++ #endif /* LV_HAVE_SSE4_1 */ ++@@ -88,11 +108,17 @@ static inline void volk_32fc_s32fc_rotatorpuppet_32fc_u_sse4_1(lv_32fc_t* outVec ++ #ifdef LV_HAVE_AVX ++ #include ++ ++-static inline void volk_32fc_s32fc_rotatorpuppet_32fc_a_avx(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, unsigned int num_points){ ++- lv_32fc_t phase[1] = {lv_cmake(.3, .95393)}; +++static inline void volk_32fc_s32fc_rotatorpuppet_32fc_a_avx(lv_32fc_t* outVector, +++ const lv_32fc_t* inVector, +++ const lv_32fc_t phase_inc, +++ unsigned int num_points) +++{ +++ lv_32fc_t 
phase[1] = { lv_cmake(.3, .95393) }; ++ (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase)); ++- const lv_32fc_t phase_inc_n = phase_inc / hypotf(lv_creal(phase_inc), lv_cimag(phase_inc)); ++- volk_32fc_s32fc_x2_rotator_32fc_a_avx(outVector, inVector, phase_inc_n, phase, num_points); +++ const lv_32fc_t phase_inc_n = +++ phase_inc / hypotf(lv_creal(phase_inc), lv_cimag(phase_inc)); +++ volk_32fc_s32fc_x2_rotator_32fc_a_avx( +++ outVector, inVector, phase_inc_n, phase, num_points); ++ } ++ ++ #endif /* LV_HAVE_AVX */ ++@@ -101,11 +127,17 @@ static inline void volk_32fc_s32fc_rotatorpuppet_32fc_a_avx(lv_32fc_t* outVector ++ #ifdef LV_HAVE_AVX ++ #include ++ ++-static inline void volk_32fc_s32fc_rotatorpuppet_32fc_u_avx(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, unsigned int num_points){ ++- lv_32fc_t phase[1] = {lv_cmake(.3, .95393)}; +++static inline void volk_32fc_s32fc_rotatorpuppet_32fc_u_avx(lv_32fc_t* outVector, +++ const lv_32fc_t* inVector, +++ const lv_32fc_t phase_inc, +++ unsigned int num_points) +++{ +++ lv_32fc_t phase[1] = { lv_cmake(.3, .95393) }; ++ (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase)); ++- const lv_32fc_t phase_inc_n = phase_inc / hypotf(lv_creal(phase_inc), lv_cimag(phase_inc)); ++- volk_32fc_s32fc_x2_rotator_32fc_u_avx(outVector, inVector, phase_inc_n, phase, num_points); +++ const lv_32fc_t phase_inc_n = +++ phase_inc / hypotf(lv_creal(phase_inc), lv_cimag(phase_inc)); +++ volk_32fc_s32fc_x2_rotator_32fc_u_avx( +++ outVector, inVector, phase_inc_n, phase, num_points); ++ } ++ ++ #endif /* LV_HAVE_AVX */ ++@@ -113,11 +145,17 @@ static inline void volk_32fc_s32fc_rotatorpuppet_32fc_u_avx(lv_32fc_t* outVector ++ #if LV_HAVE_AVX && LV_HAVE_FMA ++ #include ++ ++-static inline void volk_32fc_s32fc_rotatorpuppet_32fc_a_avx_fma(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, unsigned int num_points){ ++- lv_32fc_t phase[1] = {lv_cmake(.3, .95393)}; +++static inline void volk_32fc_s32fc_rotatorpuppet_32fc_a_avx_fma(lv_32fc_t* outVector, +++ const lv_32fc_t* inVector, +++ const lv_32fc_t phase_inc, +++ unsigned int num_points) +++{ +++ lv_32fc_t phase[1] = { lv_cmake(.3, .95393) }; ++ (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase)); ++- const lv_32fc_t phase_inc_n = phase_inc / hypotf(lv_creal(phase_inc), lv_cimag(phase_inc)); ++- volk_32fc_s32fc_x2_rotator_32fc_a_avx_fma(outVector, inVector, phase_inc_n, phase, num_points); +++ const lv_32fc_t phase_inc_n = +++ phase_inc / hypotf(lv_creal(phase_inc), lv_cimag(phase_inc)); +++ volk_32fc_s32fc_x2_rotator_32fc_a_avx_fma( +++ outVector, inVector, phase_inc_n, phase, num_points); ++ } ++ ++ #endif /* LV_HAVE_AVX && LV_HAVE_FMA*/ ++@@ -126,11 +164,17 @@ static inline void volk_32fc_s32fc_rotatorpuppet_32fc_a_avx_fma(lv_32fc_t* outVe ++ #if LV_HAVE_AVX && LV_HAVE_FMA ++ #include ++ ++-static inline void volk_32fc_s32fc_rotatorpuppet_32fc_u_avx_fma(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, unsigned int num_points){ ++- lv_32fc_t phase[1] = {lv_cmake(.3, .95393)}; +++static inline void volk_32fc_s32fc_rotatorpuppet_32fc_u_avx_fma(lv_32fc_t* outVector, +++ const lv_32fc_t* inVector, +++ const lv_32fc_t phase_inc, +++ unsigned int num_points) +++{ +++ lv_32fc_t phase[1] = { lv_cmake(.3, .95393) }; ++ (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase)); ++- const lv_32fc_t phase_inc_n = phase_inc / hypotf(lv_creal(phase_inc), lv_cimag(phase_inc)); ++- volk_32fc_s32fc_x2_rotator_32fc_u_avx_fma(outVector, inVector, 
phase_inc_n, phase, num_points); +++ const lv_32fc_t phase_inc_n = +++ phase_inc / hypotf(lv_creal(phase_inc), lv_cimag(phase_inc)); +++ volk_32fc_s32fc_x2_rotator_32fc_u_avx_fma( +++ outVector, inVector, phase_inc_n, phase, num_points); ++ } ++ ++ #endif /* LV_HAVE_AVX && LV_HAVE_FMA*/ ++diff --git a/kernels/volk/volk_32fc_s32fc_x2_rotator_32fc.h b/kernels/volk/volk_32fc_s32fc_x2_rotator_32fc.h ++index a886458..c97b8cb 100644 ++--- a/kernels/volk/volk_32fc_s32fc_x2_rotator_32fc.h +++++ b/kernels/volk/volk_32fc_s32fc_x2_rotator_32fc.h ++@@ -30,14 +30,15 @@ ++ * ++ * Dispatcher Prototype ++ * \code ++- * void volk_32fc_s32fc_x2_rotator_32fc(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, lv_32fc_t* phase, unsigned int num_points) ++- * \endcode +++ * void volk_32fc_s32fc_x2_rotator_32fc(lv_32fc_t* outVector, const lv_32fc_t* inVector, +++ * const lv_32fc_t phase_inc, lv_32fc_t* phase, unsigned int num_points) \endcode ++ * ++ * \b Inputs ++ * \li inVector: Vector to be rotated. ++ * \li phase_inc: rotational velocity. ++ * \li phase: initial phase offset. ++- * \li num_points: The number of values in inVector to be rotated and stored into outVector. +++ * \li num_points: The number of values in inVector to be rotated and stored into +++ * outVector. ++ * ++ * \b Outputs ++ * \li outVector: The vector where the results will be stored. ++@@ -81,31 +82,36 @@ ++ #define INCLUDED_volk_32fc_s32fc_rotator_32fc_a_H ++ ++ ++-#include +++#include ++ #include ++ #include ++-#include +++#include ++ #define ROTATOR_RELOAD 512 ++ ++ ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void volk_32fc_s32fc_x2_rotator_32fc_generic(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, lv_32fc_t* phase, unsigned int num_points){ +++static inline void volk_32fc_s32fc_x2_rotator_32fc_generic(lv_32fc_t* outVector, +++ const lv_32fc_t* inVector, +++ const lv_32fc_t phase_inc, +++ lv_32fc_t* phase, +++ unsigned int num_points) +++{ ++ unsigned int i = 0; ++ int j = 0; ++- for(i = 0; i < (unsigned int)(num_points/ROTATOR_RELOAD); ++i) { ++- for(j = 0; j < ROTATOR_RELOAD; ++j) { +++ for (i = 0; i < (unsigned int)(num_points / ROTATOR_RELOAD); ++i) { +++ for (j = 0; j < ROTATOR_RELOAD; ++j) { ++ *outVector++ = *inVector++ * (*phase); ++ (*phase) *= phase_inc; ++ } ++ ++ (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase)); ++ } ++- for(i = 0; i < num_points%ROTATOR_RELOAD; ++i) { +++ for (i = 0; i < num_points % ROTATOR_RELOAD; ++i) { ++ *outVector++ = *inVector++ * (*phase); ++ (*phase) *= phase_inc; ++ } ++- if(i){ +++ if (i) { ++ // Make sure, we normalize phase on every call! 
++ (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase)); ++ } ++@@ -118,43 +124,47 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_generic(lv_32fc_t* outVector, ++ #include ++ #include ++ ++-static inline void volk_32fc_s32fc_x2_rotator_32fc_neon(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, lv_32fc_t* phase, unsigned int num_points) +++static inline void volk_32fc_s32fc_x2_rotator_32fc_neon(lv_32fc_t* outVector, +++ const lv_32fc_t* inVector, +++ const lv_32fc_t phase_inc, +++ lv_32fc_t* phase, +++ unsigned int num_points) ++ ++ { ++ lv_32fc_t* outputVectorPtr = outVector; ++ const lv_32fc_t* inputVectorPtr = inVector; ++ lv_32fc_t incr = 1; ++- lv_32fc_t phasePtr[4] = {(*phase), (*phase), (*phase), (*phase)}; +++ lv_32fc_t phasePtr[4] = { (*phase), (*phase), (*phase), (*phase) }; ++ float32x4x2_t input_vec; ++ float32x4x2_t output_vec; ++- +++ ++ unsigned int i = 0, j = 0; ++ const unsigned int quarter_points = num_points / 4; ++- ++- for(i = 0; i < 4; ++i) { +++ +++ for (i = 0; i < 4; ++i) { ++ phasePtr[i] *= incr; ++ incr *= (phase_inc); ++ } ++- +++ ++ // Notice that incr has be incremented in the previous loop ++- const lv_32fc_t incrPtr[4] = {incr, incr, incr, incr}; ++- const float32x4x2_t incr_vec = vld2q_f32((float*) incrPtr); ++- float32x4x2_t phase_vec = vld2q_f32((float*) phasePtr); ++- ++- for(i = 0; i < (unsigned int)(quarter_points/ROTATOR_RELOAD); i++) { ++- for(j = 0; j < ROTATOR_RELOAD; j++) { ++- input_vec = vld2q_f32((float*) inputVectorPtr); +++ const lv_32fc_t incrPtr[4] = { incr, incr, incr, incr }; +++ const float32x4x2_t incr_vec = vld2q_f32((float*)incrPtr); +++ float32x4x2_t phase_vec = vld2q_f32((float*)phasePtr); +++ +++ for (i = 0; i < (unsigned int)(quarter_points / ROTATOR_RELOAD); i++) { +++ for (j = 0; j < ROTATOR_RELOAD; j++) { +++ input_vec = vld2q_f32((float*)inputVectorPtr); ++ // Prefetch next one, speeds things up ++- __VOLK_PREFETCH(inputVectorPtr+4); +++ __VOLK_PREFETCH(inputVectorPtr + 4); ++ // Rotate ++ output_vec = _vmultiply_complexq_f32(input_vec, phase_vec); ++ // Increase phase ++ phase_vec = _vmultiply_complexq_f32(phase_vec, incr_vec); ++ // Store output ++ vst2q_f32((float*)outputVectorPtr, output_vec); ++- ++- outputVectorPtr+=4; ++- inputVectorPtr+=4; +++ +++ outputVectorPtr += 4; +++ inputVectorPtr += 4; ++ } ++ // normalize phase so magnitude doesn't grow because of ++ // floating point rounding error ++@@ -164,20 +174,20 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_neon(lv_32fc_t* outVector, co ++ phase_vec.val[0] = vmulq_f32(phase_vec.val[0], inv_mag); ++ phase_vec.val[1] = vmulq_f32(phase_vec.val[1], inv_mag); ++ } ++- ++- for(i = 0; i < quarter_points % ROTATOR_RELOAD; i++) { ++- input_vec = vld2q_f32((float*) inputVectorPtr); +++ +++ for (i = 0; i < quarter_points % ROTATOR_RELOAD; i++) { +++ input_vec = vld2q_f32((float*)inputVectorPtr); ++ // Prefetch next one, speeds things up ++- __VOLK_PREFETCH(inputVectorPtr+4); +++ __VOLK_PREFETCH(inputVectorPtr + 4); ++ // Rotate ++ output_vec = _vmultiply_complexq_f32(input_vec, phase_vec); ++ // Increase phase ++ phase_vec = _vmultiply_complexq_f32(phase_vec, incr_vec); ++ // Store output ++ vst2q_f32((float*)outputVectorPtr, output_vec); ++- ++- outputVectorPtr+=4; ++- inputVectorPtr+=4; +++ +++ outputVectorPtr += 4; +++ inputVectorPtr += 4; ++ } ++ // if(i) == true means we looped above ++ if (i) { ++@@ -191,13 +201,13 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_neon(lv_32fc_t* outVector, co ++ } ++ // Store current phase ++ 
vst2q_f32((float*)phasePtr, phase_vec); ++- +++ ++ // Deal with the rest ++- for(i = 0; i < num_points % 4; i++) { +++ for (i = 0; i < num_points % 4; i++) { ++ *outputVectorPtr++ = *inputVectorPtr++ * phasePtr[0]; ++ phasePtr[0] *= (phase_inc); ++ } ++- +++ ++ // For continious phase next time we need to call this function ++ (*phase) = phasePtr[0]; ++ } ++@@ -208,15 +218,20 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_neon(lv_32fc_t* outVector, co ++ #ifdef LV_HAVE_SSE4_1 ++ #include ++ ++-static inline void volk_32fc_s32fc_x2_rotator_32fc_a_sse4_1(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, lv_32fc_t* phase, unsigned int num_points){ +++static inline void volk_32fc_s32fc_x2_rotator_32fc_a_sse4_1(lv_32fc_t* outVector, +++ const lv_32fc_t* inVector, +++ const lv_32fc_t phase_inc, +++ lv_32fc_t* phase, +++ unsigned int num_points) +++{ ++ lv_32fc_t* cPtr = outVector; ++ const lv_32fc_t* aPtr = inVector; ++ lv_32fc_t incr = 1; ++- lv_32fc_t phase_Ptr[2] = {(*phase), (*phase)}; +++ lv_32fc_t phase_Ptr[2] = { (*phase), (*phase) }; ++ ++ unsigned int i, j = 0; ++ ++- for(i = 0; i < 2; ++i) { +++ for (i = 0; i < 2; ++i) { ++ phase_Ptr[i] *= incr; ++ incr *= (phase_inc); ++ } ++@@ -227,13 +242,13 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_a_sse4_1(lv_32fc_t* outVector ++ __m128 aVal, phase_Val, inc_Val, yl, yh, tmp1, tmp2, z, ylp, yhp, tmp1p, tmp2p; ++ ++ phase_Val = _mm_loadu_ps((float*)phase_Ptr); ++- inc_Val = _mm_set_ps(lv_cimag(incr), lv_creal(incr),lv_cimag(incr), lv_creal(incr)); +++ inc_Val = _mm_set_ps(lv_cimag(incr), lv_creal(incr), lv_cimag(incr), lv_creal(incr)); ++ ++ const unsigned int halfPoints = num_points / 2; ++ ++ ++- for(i = 0; i < (unsigned int)(halfPoints/ROTATOR_RELOAD); i++) { ++- for(j = 0; j < ROTATOR_RELOAD; ++j) { +++ for (i = 0; i < (unsigned int)(halfPoints / ROTATOR_RELOAD); i++) { +++ for (j = 0; j < ROTATOR_RELOAD; ++j) { ++ ++ aVal = _mm_load_ps((float*)aPtr); ++ ++@@ -264,7 +279,7 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_a_sse4_1(lv_32fc_t* outVector ++ tmp2 = _mm_sqrt_ps(tmp1); ++ phase_Val = _mm_div_ps(phase_Val, tmp2); ++ } ++- for(i = 0; i < halfPoints%ROTATOR_RELOAD; ++i) { +++ for (i = 0; i < halfPoints % ROTATOR_RELOAD; ++i) { ++ aVal = _mm_load_ps((float*)aPtr); ++ ++ yl = _mm_moveldup_ps(phase_Val); ++@@ -304,7 +319,6 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_a_sse4_1(lv_32fc_t* outVector ++ } ++ ++ (*phase) = phase_Ptr[0]; ++- ++ } ++ ++ #endif /* LV_HAVE_SSE4_1 for aligned */ ++@@ -313,15 +327,20 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_a_sse4_1(lv_32fc_t* outVector ++ #ifdef LV_HAVE_SSE4_1 ++ #include ++ ++-static inline void volk_32fc_s32fc_x2_rotator_32fc_u_sse4_1(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, lv_32fc_t* phase, unsigned int num_points){ +++static inline void volk_32fc_s32fc_x2_rotator_32fc_u_sse4_1(lv_32fc_t* outVector, +++ const lv_32fc_t* inVector, +++ const lv_32fc_t phase_inc, +++ lv_32fc_t* phase, +++ unsigned int num_points) +++{ ++ lv_32fc_t* cPtr = outVector; ++ const lv_32fc_t* aPtr = inVector; ++ lv_32fc_t incr = 1; ++- lv_32fc_t phase_Ptr[2] = {(*phase), (*phase)}; +++ lv_32fc_t phase_Ptr[2] = { (*phase), (*phase) }; ++ ++ unsigned int i, j = 0; ++ ++- for(i = 0; i < 2; ++i) { +++ for (i = 0; i < 2; ++i) { ++ phase_Ptr[i] *= incr; ++ incr *= (phase_inc); ++ } ++@@ -332,13 +351,13 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_u_sse4_1(lv_32fc_t* outVector ++ __m128 aVal, phase_Val, inc_Val, yl, 
yh, tmp1, tmp2, z, ylp, yhp, tmp1p, tmp2p; ++ ++ phase_Val = _mm_loadu_ps((float*)phase_Ptr); ++- inc_Val = _mm_set_ps(lv_cimag(incr), lv_creal(incr),lv_cimag(incr), lv_creal(incr)); +++ inc_Val = _mm_set_ps(lv_cimag(incr), lv_creal(incr), lv_cimag(incr), lv_creal(incr)); ++ ++ const unsigned int halfPoints = num_points / 2; ++ ++ ++- for(i = 0; i < (unsigned int)(halfPoints/ROTATOR_RELOAD); i++) { ++- for(j = 0; j < ROTATOR_RELOAD; ++j) { +++ for (i = 0; i < (unsigned int)(halfPoints / ROTATOR_RELOAD); i++) { +++ for (j = 0; j < ROTATOR_RELOAD; ++j) { ++ ++ aVal = _mm_loadu_ps((float*)aPtr); ++ ++@@ -369,7 +388,7 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_u_sse4_1(lv_32fc_t* outVector ++ tmp2 = _mm_sqrt_ps(tmp1); ++ phase_Val = _mm_div_ps(phase_Val, tmp2); ++ } ++- for(i = 0; i < halfPoints%ROTATOR_RELOAD; ++i) { +++ for (i = 0; i < halfPoints % ROTATOR_RELOAD; ++i) { ++ aVal = _mm_loadu_ps((float*)aPtr); ++ ++ yl = _mm_moveldup_ps(phase_Val); ++@@ -409,7 +428,6 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_u_sse4_1(lv_32fc_t* outVector ++ } ++ ++ (*phase) = phase_Ptr[0]; ++- ++ } ++ ++ #endif /* LV_HAVE_SSE4_1 */ ++@@ -419,15 +437,20 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_u_sse4_1(lv_32fc_t* outVector ++ #include ++ #include ++ ++-static inline void volk_32fc_s32fc_x2_rotator_32fc_a_avx(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, lv_32fc_t* phase, unsigned int num_points){ +++static inline void volk_32fc_s32fc_x2_rotator_32fc_a_avx(lv_32fc_t* outVector, +++ const lv_32fc_t* inVector, +++ const lv_32fc_t phase_inc, +++ lv_32fc_t* phase, +++ unsigned int num_points) +++{ ++ lv_32fc_t* cPtr = outVector; ++ const lv_32fc_t* aPtr = inVector; ++ lv_32fc_t incr = lv_cmake(1.0, 0.0); ++- lv_32fc_t phase_Ptr[4] = {(*phase), (*phase), (*phase), (*phase)}; +++ lv_32fc_t phase_Ptr[4] = { (*phase), (*phase), (*phase), (*phase) }; ++ ++ unsigned int i, j = 0; ++ ++- for(i = 0; i < 4; ++i) { +++ for (i = 0; i < 4; ++i) { ++ phase_Ptr[i] *= incr; ++ incr *= (phase_inc); ++ } ++@@ -435,16 +458,20 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_a_avx(lv_32fc_t* outVector, c ++ __m256 aVal, phase_Val, z; ++ ++ phase_Val = _mm256_loadu_ps((float*)phase_Ptr); ++- ++- const __m256 inc_Val = _mm256_set_ps(lv_cimag(incr), lv_creal(incr), ++- lv_cimag(incr), lv_creal(incr), ++- lv_cimag(incr), lv_creal(incr), ++- lv_cimag(incr), lv_creal(incr)); +++ +++ const __m256 inc_Val = _mm256_set_ps(lv_cimag(incr), +++ lv_creal(incr), +++ lv_cimag(incr), +++ lv_creal(incr), +++ lv_cimag(incr), +++ lv_creal(incr), +++ lv_cimag(incr), +++ lv_creal(incr)); ++ ++ const unsigned int fourthPoints = num_points / 4; ++ ++- for(i = 0; i < (unsigned int)(fourthPoints/ROTATOR_RELOAD); i++) { ++- for(j = 0; j < ROTATOR_RELOAD; ++j) { +++ for (i = 0; i < (unsigned int)(fourthPoints / ROTATOR_RELOAD); i++) { +++ for (j = 0; j < ROTATOR_RELOAD; ++j) { ++ ++ aVal = _mm256_load_ps((float*)aPtr); ++ ++@@ -458,8 +485,8 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_a_avx(lv_32fc_t* outVector, c ++ } ++ phase_Val = _mm256_normalize_ps(phase_Val); ++ } ++- ++- for(i = 0; i < fourthPoints%ROTATOR_RELOAD; ++i) { +++ +++ for (i = 0; i < fourthPoints % ROTATOR_RELOAD; ++i) { ++ aVal = _mm256_load_ps((float*)aPtr); ++ ++ z = _mm256_complexmul_ps(aVal, phase_Val); ++@@ -473,10 +500,10 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_a_avx(lv_32fc_t* outVector, c ++ if (i) { ++ phase_Val = _mm256_normalize_ps(phase_Val); ++ } ++- +++ ++ 
_mm256_storeu_ps((float*)phase_Ptr, phase_Val); ++ (*phase) = phase_Ptr[0]; ++- volk_32fc_s32fc_x2_rotator_32fc_generic(cPtr, aPtr, phase_inc, phase, num_points%4); +++ volk_32fc_s32fc_x2_rotator_32fc_generic(cPtr, aPtr, phase_inc, phase, num_points % 4); ++ } ++ ++ #endif /* LV_HAVE_AVX for aligned */ ++@@ -486,15 +513,20 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_a_avx(lv_32fc_t* outVector, c ++ #include ++ #include ++ ++-static inline void volk_32fc_s32fc_x2_rotator_32fc_u_avx(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, lv_32fc_t* phase, unsigned int num_points){ +++static inline void volk_32fc_s32fc_x2_rotator_32fc_u_avx(lv_32fc_t* outVector, +++ const lv_32fc_t* inVector, +++ const lv_32fc_t phase_inc, +++ lv_32fc_t* phase, +++ unsigned int num_points) +++{ ++ lv_32fc_t* cPtr = outVector; ++ const lv_32fc_t* aPtr = inVector; ++ lv_32fc_t incr = lv_cmake(1.0, 0.0); ++- lv_32fc_t phase_Ptr[4] = {(*phase), (*phase), (*phase), (*phase)}; +++ lv_32fc_t phase_Ptr[4] = { (*phase), (*phase), (*phase), (*phase) }; ++ ++ unsigned int i, j = 0; ++ ++- for(i = 0; i < 4; ++i) { +++ for (i = 0; i < 4; ++i) { ++ phase_Ptr[i] *= incr; ++ incr *= (phase_inc); ++ } ++@@ -502,19 +534,23 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_u_avx(lv_32fc_t* outVector, c ++ __m256 aVal, phase_Val, z; ++ ++ phase_Val = _mm256_loadu_ps((float*)phase_Ptr); ++- ++- const __m256 inc_Val = _mm256_set_ps(lv_cimag(incr), lv_creal(incr), ++- lv_cimag(incr), lv_creal(incr), ++- lv_cimag(incr), lv_creal(incr), ++- lv_cimag(incr), lv_creal(incr)); ++- +++ +++ const __m256 inc_Val = _mm256_set_ps(lv_cimag(incr), +++ lv_creal(incr), +++ lv_cimag(incr), +++ lv_creal(incr), +++ lv_cimag(incr), +++ lv_creal(incr), +++ lv_cimag(incr), +++ lv_creal(incr)); +++ ++ const unsigned int fourthPoints = num_points / 4; ++ ++- for(i = 0; i < (unsigned int)(fourthPoints/ROTATOR_RELOAD); ++i) { ++- for(j = 0; j < ROTATOR_RELOAD; ++j) { +++ for (i = 0; i < (unsigned int)(fourthPoints / ROTATOR_RELOAD); ++i) { +++ for (j = 0; j < ROTATOR_RELOAD; ++j) { ++ ++ aVal = _mm256_loadu_ps((float*)aPtr); ++- +++ ++ z = _mm256_complexmul_ps(aVal, phase_Val); ++ phase_Val = _mm256_complexmul_ps(phase_Val, inc_Val); ++ ++@@ -524,10 +560,9 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_u_avx(lv_32fc_t* outVector, c ++ cPtr += 4; ++ } ++ phase_Val = _mm256_normalize_ps(phase_Val); ++- ++ } ++- ++- for(i = 0; i < fourthPoints%ROTATOR_RELOAD; ++i) { +++ +++ for (i = 0; i < num_points % ROTATOR_RELOAD; ++i) { ++ aVal = _mm256_loadu_ps((float*)aPtr); ++ ++ z = _mm256_complexmul_ps(aVal, phase_Val); ++@@ -544,7 +579,7 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_u_avx(lv_32fc_t* outVector, c ++ ++ _mm256_storeu_ps((float*)phase_Ptr, phase_Val); ++ (*phase) = phase_Ptr[0]; ++- volk_32fc_s32fc_x2_rotator_32fc_generic(cPtr, aPtr, phase_inc, phase, num_points%4); +++ volk_32fc_s32fc_x2_rotator_32fc_generic(cPtr, aPtr, phase_inc, phase, num_points % 4); ++ } ++ ++ #endif /* LV_HAVE_AVX */ ++@@ -552,15 +587,21 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_u_avx(lv_32fc_t* outVector, c ++ #if LV_HAVE_AVX && LV_HAVE_FMA ++ #include ++ ++-static inline void volk_32fc_s32fc_x2_rotator_32fc_a_avx_fma(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, lv_32fc_t* phase, unsigned int num_points){ +++static inline void volk_32fc_s32fc_x2_rotator_32fc_a_avx_fma(lv_32fc_t* outVector, +++ const lv_32fc_t* inVector, +++ const lv_32fc_t phase_inc, +++ lv_32fc_t* phase, +++ unsigned 
int num_points) +++{ ++ lv_32fc_t* cPtr = outVector; ++ const lv_32fc_t* aPtr = inVector; ++ lv_32fc_t incr = 1; ++- __VOLK_ATTR_ALIGNED(32) lv_32fc_t phase_Ptr[4] = {(*phase), (*phase), (*phase), (*phase)}; +++ __VOLK_ATTR_ALIGNED(32) +++ lv_32fc_t phase_Ptr[4] = { (*phase), (*phase), (*phase), (*phase) }; ++ ++ unsigned int i, j = 0; ++ ++- for(i = 0; i < 4; ++i) { +++ for (i = 0; i < 4; ++i) { ++ phase_Ptr[i] *= incr; ++ incr *= (phase_inc); ++ } ++@@ -568,11 +609,18 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_a_avx_fma(lv_32fc_t* outVecto ++ __m256 aVal, phase_Val, inc_Val, yl, yh, tmp1, tmp2, z, ylp, yhp, tmp1p, tmp2p; ++ ++ phase_Val = _mm256_load_ps((float*)phase_Ptr); ++- inc_Val = _mm256_set_ps(lv_cimag(incr), lv_creal(incr),lv_cimag(incr), lv_creal(incr),lv_cimag(incr), lv_creal(incr),lv_cimag(incr), lv_creal(incr)); +++ inc_Val = _mm256_set_ps(lv_cimag(incr), +++ lv_creal(incr), +++ lv_cimag(incr), +++ lv_creal(incr), +++ lv_cimag(incr), +++ lv_creal(incr), +++ lv_cimag(incr), +++ lv_creal(incr)); ++ const unsigned int fourthPoints = num_points / 4; ++ ++- for(i = 0; i < (unsigned int)(fourthPoints/ROTATOR_RELOAD); i++) { ++- for(j = 0; j < ROTATOR_RELOAD; ++j) { +++ for (i = 0; i < (unsigned int)(fourthPoints / ROTATOR_RELOAD); i++) { +++ for (j = 0; j < ROTATOR_RELOAD; ++j) { ++ ++ aVal = _mm256_load_ps((float*)aPtr); ++ ++@@ -603,7 +651,7 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_a_avx_fma(lv_32fc_t* outVecto ++ tmp2 = _mm256_sqrt_ps(tmp1); ++ phase_Val = _mm256_div_ps(phase_Val, tmp2); ++ } ++- for(i = 0; i < fourthPoints%ROTATOR_RELOAD; ++i) { +++ for (i = 0; i < fourthPoints % ROTATOR_RELOAD; ++i) { ++ aVal = _mm256_load_ps((float*)aPtr); ++ ++ yl = _mm256_moveldup_ps(phase_Val); ++@@ -636,13 +684,12 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_a_avx_fma(lv_32fc_t* outVecto ++ } ++ ++ _mm256_store_ps((float*)phase_Ptr, phase_Val); ++- for(i = 0; i < num_points%4; ++i) { +++ for (i = 0; i < num_points % 4; ++i) { ++ *cPtr++ = *aPtr++ * phase_Ptr[0]; ++ phase_Ptr[0] *= (phase_inc); ++ } ++ ++ (*phase) = phase_Ptr[0]; ++- ++ } ++ ++ #endif /* LV_HAVE_AVX && LV_HAVE_FMA for aligned*/ ++@@ -650,15 +697,20 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_a_avx_fma(lv_32fc_t* outVecto ++ #if LV_HAVE_AVX && LV_HAVE_FMA ++ #include ++ ++-static inline void volk_32fc_s32fc_x2_rotator_32fc_u_avx_fma(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, lv_32fc_t* phase, unsigned int num_points){ +++static inline void volk_32fc_s32fc_x2_rotator_32fc_u_avx_fma(lv_32fc_t* outVector, +++ const lv_32fc_t* inVector, +++ const lv_32fc_t phase_inc, +++ lv_32fc_t* phase, +++ unsigned int num_points) +++{ ++ lv_32fc_t* cPtr = outVector; ++ const lv_32fc_t* aPtr = inVector; ++ lv_32fc_t incr = 1; ++- lv_32fc_t phase_Ptr[4] = {(*phase), (*phase), (*phase), (*phase)}; +++ lv_32fc_t phase_Ptr[4] = { (*phase), (*phase), (*phase), (*phase) }; ++ ++ unsigned int i, j = 0; ++ ++- for(i = 0; i < 4; ++i) { +++ for (i = 0; i < 4; ++i) { ++ phase_Ptr[i] *= incr; ++ incr *= (phase_inc); ++ } ++@@ -666,11 +718,18 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_u_avx_fma(lv_32fc_t* outVecto ++ __m256 aVal, phase_Val, inc_Val, yl, yh, tmp1, tmp2, z, ylp, yhp, tmp1p, tmp2p; ++ ++ phase_Val = _mm256_loadu_ps((float*)phase_Ptr); ++- inc_Val = _mm256_set_ps(lv_cimag(incr), lv_creal(incr),lv_cimag(incr), lv_creal(incr),lv_cimag(incr), lv_creal(incr),lv_cimag(incr), lv_creal(incr)); +++ inc_Val = _mm256_set_ps(lv_cimag(incr), +++ lv_creal(incr), +++ 
lv_cimag(incr), +++ lv_creal(incr), +++ lv_cimag(incr), +++ lv_creal(incr), +++ lv_cimag(incr), +++ lv_creal(incr)); ++ const unsigned int fourthPoints = num_points / 4; ++ ++- for(i = 0; i < (unsigned int)(fourthPoints/ROTATOR_RELOAD); i++) { ++- for(j = 0; j < ROTATOR_RELOAD; ++j) { +++ for (i = 0; i < (unsigned int)(fourthPoints / ROTATOR_RELOAD); i++) { +++ for (j = 0; j < ROTATOR_RELOAD; ++j) { ++ ++ aVal = _mm256_loadu_ps((float*)aPtr); ++ ++@@ -701,7 +760,7 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_u_avx_fma(lv_32fc_t* outVecto ++ tmp2 = _mm256_sqrt_ps(tmp1); ++ phase_Val = _mm256_div_ps(phase_Val, tmp2); ++ } ++- for(i = 0; i < fourthPoints%ROTATOR_RELOAD; ++i) { +++ for (i = 0; i < fourthPoints % ROTATOR_RELOAD; ++i) { ++ aVal = _mm256_loadu_ps((float*)aPtr); ++ ++ yl = _mm256_moveldup_ps(phase_Val); ++@@ -734,13 +793,12 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_u_avx_fma(lv_32fc_t* outVecto ++ } ++ ++ _mm256_storeu_ps((float*)phase_Ptr, phase_Val); ++- for(i = 0; i < num_points%4; ++i) { +++ for (i = 0; i < num_points % 4; ++i) { ++ *cPtr++ = *aPtr++ * phase_Ptr[0]; ++ phase_Ptr[0] *= (phase_inc); ++ } ++ ++ (*phase) = phase_Ptr[0]; ++- ++ } ++ ++ #endif /* LV_HAVE_AVX && LV_HAVE_FMA*/ ++diff --git a/kernels/volk/volk_32fc_x2_add_32fc.h b/kernels/volk/volk_32fc_x2_add_32fc.h ++index 90ff787..e7356c3 100644 ++--- a/kernels/volk/volk_32fc_x2_add_32fc.h +++++ b/kernels/volk/volk_32fc_x2_add_32fc.h ++@@ -31,8 +31,8 @@ ++ * ++ * Dispatcher Prototype ++ * \code ++- * void volk_32fc_x2_add_32fc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points) ++- * \endcode +++ * void volk_32fc_x2_add_32fc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const +++ * lv_32fc_t* bVector, unsigned int num_points) \endcode ++ * ++ * \b Inputs ++ * \li aVector: First vector of input points. 
++@@ -44,7 +44,8 @@ ++ * ++ * \b Example ++ * ++- * The follow example adds the increasing and decreasing vectors such that the result of every summation pair is 10 +++ * The follow example adds the increasing and decreasing vectors such that the result of +++ * every summation pair is 10 ++ * ++ * \code ++ * int N = 10; ++@@ -76,36 +77,38 @@ ++ #ifdef LV_HAVE_AVX ++ #include ++ ++-static inline void ++-volk_32fc_x2_add_32fc_u_avx(lv_32fc_t* cVector, const lv_32fc_t* aVector, ++- const lv_32fc_t* bVector, unsigned int num_points) +++static inline void volk_32fc_x2_add_32fc_u_avx(lv_32fc_t* cVector, +++ const lv_32fc_t* aVector, +++ const lv_32fc_t* bVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int quarterPoints = num_points / 4; +++ unsigned int number = 0; +++ const unsigned int quarterPoints = num_points / 4; ++ ++- lv_32fc_t* cPtr = cVector; ++- const lv_32fc_t* aPtr = aVector; ++- const lv_32fc_t* bPtr= bVector; +++ lv_32fc_t* cPtr = cVector; +++ const lv_32fc_t* aPtr = aVector; +++ const lv_32fc_t* bPtr = bVector; ++ ++- __m256 aVal, bVal, cVal; ++- for(;number < quarterPoints; number++){ +++ __m256 aVal, bVal, cVal; +++ for (; number < quarterPoints; number++) { ++ ++- aVal = _mm256_loadu_ps((float *) aPtr); ++- bVal = _mm256_loadu_ps((float *) bPtr); +++ aVal = _mm256_loadu_ps((float*)aPtr); +++ bVal = _mm256_loadu_ps((float*)bPtr); ++ ++- cVal = _mm256_add_ps(aVal, bVal); +++ cVal = _mm256_add_ps(aVal, bVal); ++ ++- _mm256_storeu_ps((float *) cPtr,cVal); // Store the results back into the C container +++ _mm256_storeu_ps((float*)cPtr, +++ cVal); // Store the results back into the C container ++ ++- aPtr += 4; ++- bPtr += 4; ++- cPtr += 4; ++- } +++ aPtr += 4; +++ bPtr += 4; +++ cPtr += 4; +++ } ++ ++- number = quarterPoints * 4; ++- for(;number < num_points; number++){ ++- *cPtr++ = (*aPtr++) + (*bPtr++); ++- } +++ number = quarterPoints * 4; +++ for (; number < num_points; number++) { +++ *cPtr++ = (*aPtr++) + (*bPtr++); +++ } ++ } ++ #endif /* LV_HAVE_AVX */ ++ ++@@ -113,36 +116,38 @@ volk_32fc_x2_add_32fc_u_avx(lv_32fc_t* cVector, const lv_32fc_t* aVector, ++ #ifdef LV_HAVE_AVX ++ #include ++ ++-static inline void ++-volk_32fc_x2_add_32fc_a_avx(lv_32fc_t* cVector, const lv_32fc_t* aVector, ++- const lv_32fc_t* bVector, unsigned int num_points) +++static inline void volk_32fc_x2_add_32fc_a_avx(lv_32fc_t* cVector, +++ const lv_32fc_t* aVector, +++ const lv_32fc_t* bVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int quarterPoints = num_points / 4; +++ unsigned int number = 0; +++ const unsigned int quarterPoints = num_points / 4; ++ ++- lv_32fc_t* cPtr = cVector; ++- const lv_32fc_t* aPtr = aVector; ++- const lv_32fc_t* bPtr= bVector; +++ lv_32fc_t* cPtr = cVector; +++ const lv_32fc_t* aPtr = aVector; +++ const lv_32fc_t* bPtr = bVector; ++ ++- __m256 aVal, bVal, cVal; ++- for(;number < quarterPoints; number++){ +++ __m256 aVal, bVal, cVal; +++ for (; number < quarterPoints; number++) { ++ ++- aVal = _mm256_load_ps((float*) aPtr); ++- bVal = _mm256_load_ps((float*) bPtr); +++ aVal = _mm256_load_ps((float*)aPtr); +++ bVal = _mm256_load_ps((float*)bPtr); ++ ++- cVal = _mm256_add_ps(aVal, bVal); +++ cVal = _mm256_add_ps(aVal, bVal); ++ ++- _mm256_store_ps((float*) cPtr,cVal); // Store the results back into the C container +++ _mm256_store_ps((float*)cPtr, +++ cVal); // Store the results back into the C container ++ ++- aPtr += 4; ++- bPtr += 4; ++- cPtr += 4; ++- } +++ aPtr += 4; +++ bPtr += 4; +++ 
cPtr += 4; +++ } ++ ++- number = quarterPoints * 4; ++- for(;number < num_points; number++){ ++- *cPtr++ = (*aPtr++) + (*bPtr++); ++- } +++ number = quarterPoints * 4; +++ for (; number < num_points; number++) { +++ *cPtr++ = (*aPtr++) + (*bPtr++); +++ } ++ } ++ #endif /* LV_HAVE_AVX */ ++ ++@@ -150,54 +155,56 @@ volk_32fc_x2_add_32fc_a_avx(lv_32fc_t* cVector, const lv_32fc_t* aVector, ++ #ifdef LV_HAVE_SSE ++ #include ++ ++-static inline void ++-volk_32fc_x2_add_32fc_u_sse(lv_32fc_t* cVector, const lv_32fc_t* aVector, ++- const lv_32fc_t* bVector, unsigned int num_points) +++static inline void volk_32fc_x2_add_32fc_u_sse(lv_32fc_t* cVector, +++ const lv_32fc_t* aVector, +++ const lv_32fc_t* bVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int halfPoints = num_points / 2; +++ unsigned int number = 0; +++ const unsigned int halfPoints = num_points / 2; ++ ++- lv_32fc_t* cPtr = cVector; ++- const lv_32fc_t* aPtr = aVector; ++- const lv_32fc_t* bPtr= bVector; +++ lv_32fc_t* cPtr = cVector; +++ const lv_32fc_t* aPtr = aVector; +++ const lv_32fc_t* bPtr = bVector; ++ ++- __m128 aVal, bVal, cVal; ++- for(;number < halfPoints; number++){ +++ __m128 aVal, bVal, cVal; +++ for (; number < halfPoints; number++) { ++ ++- aVal = _mm_loadu_ps((float *) aPtr); ++- bVal = _mm_loadu_ps((float *) bPtr); +++ aVal = _mm_loadu_ps((float*)aPtr); +++ bVal = _mm_loadu_ps((float*)bPtr); ++ ++- cVal = _mm_add_ps(aVal, bVal); +++ cVal = _mm_add_ps(aVal, bVal); ++ ++- _mm_storeu_ps((float*) cPtr, cVal); // Store the results back into the C container +++ _mm_storeu_ps((float*)cPtr, cVal); // Store the results back into the C container ++ ++- aPtr += 2; ++- bPtr += 2; ++- cPtr += 2; ++- } +++ aPtr += 2; +++ bPtr += 2; +++ cPtr += 2; +++ } ++ ++- number = halfPoints * 2; ++- for(;number < num_points; number++){ ++- *cPtr++ = (*aPtr++) + (*bPtr++); ++- } +++ number = halfPoints * 2; +++ for (; number < num_points; number++) { +++ *cPtr++ = (*aPtr++) + (*bPtr++); +++ } ++ } ++ #endif /* LV_HAVE_SSE */ ++ ++ ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void ++-volk_32fc_x2_add_32fc_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, ++- const lv_32fc_t* bVector, unsigned int num_points) +++static inline void volk_32fc_x2_add_32fc_generic(lv_32fc_t* cVector, +++ const lv_32fc_t* aVector, +++ const lv_32fc_t* bVector, +++ unsigned int num_points) ++ { ++- lv_32fc_t* cPtr = cVector; ++- const lv_32fc_t* aPtr = aVector; ++- const lv_32fc_t* bPtr= bVector; ++- unsigned int number = 0; ++- ++- for(number = 0; number < num_points; number++){ ++- *cPtr++ = (*aPtr++) + (*bPtr++); ++- } +++ lv_32fc_t* cPtr = cVector; +++ const lv_32fc_t* aPtr = aVector; +++ const lv_32fc_t* bPtr = bVector; +++ unsigned int number = 0; +++ +++ for (number = 0; number < num_points; number++) { +++ *cPtr++ = (*aPtr++) + (*bPtr++); +++ } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++@@ -205,34 +212,36 @@ volk_32fc_x2_add_32fc_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, ++ #ifdef LV_HAVE_SSE ++ #include ++ ++-static inline void ++-volk_32fc_x2_add_32fc_a_sse(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points) +++static inline void volk_32fc_x2_add_32fc_a_sse(lv_32fc_t* cVector, +++ const lv_32fc_t* aVector, +++ const lv_32fc_t* bVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int halfPoints = num_points / 2; +++ unsigned int number = 0; +++ const unsigned int halfPoints = num_points / 2; ++ ++- lv_32fc_t* cPtr = 
cVector; ++- const lv_32fc_t* aPtr = aVector; ++- const lv_32fc_t* bPtr= bVector; +++ lv_32fc_t* cPtr = cVector; +++ const lv_32fc_t* aPtr = aVector; +++ const lv_32fc_t* bPtr = bVector; ++ ++- __m128 aVal, bVal, cVal; ++- for(;number < halfPoints; number++){ ++- aVal = _mm_load_ps((float *) aPtr); ++- bVal = _mm_load_ps((float *) bPtr); +++ __m128 aVal, bVal, cVal; +++ for (; number < halfPoints; number++) { +++ aVal = _mm_load_ps((float*)aPtr); +++ bVal = _mm_load_ps((float*)bPtr); ++ ++- cVal = _mm_add_ps(aVal, bVal); +++ cVal = _mm_add_ps(aVal, bVal); ++ ++- _mm_store_ps((float *) cPtr,cVal); // Store the results back into the C container +++ _mm_store_ps((float*)cPtr, cVal); // Store the results back into the C container ++ ++- aPtr += 2; ++- bPtr += 2; ++- cPtr += 2; ++- } +++ aPtr += 2; +++ bPtr += 2; +++ cPtr += 2; +++ } ++ ++- number = halfPoints * 2; ++- for(;number < num_points; number++){ ++- *cPtr++ = (*aPtr++) + (*bPtr++); ++- } +++ number = halfPoints * 2; +++ for (; number < num_points; number++) { +++ *cPtr++ = (*aPtr++) + (*bPtr++); +++ } ++ } ++ #endif /* LV_HAVE_SSE */ ++ ++@@ -240,38 +249,39 @@ volk_32fc_x2_add_32fc_a_sse(lv_32fc_t* cVector, const lv_32fc_t* aVector, const ++ #ifdef LV_HAVE_NEON ++ #include ++ ++-static inline void ++-volk_32fc_x2_add_32fc_u_neon(lv_32fc_t* cVector, const lv_32fc_t* aVector, ++- const lv_32fc_t* bVector, unsigned int num_points) +++static inline void volk_32fc_x2_add_32fc_u_neon(lv_32fc_t* cVector, +++ const lv_32fc_t* aVector, +++ const lv_32fc_t* bVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int halfPoints = num_points / 2; ++- ++- lv_32fc_t* cPtr = cVector; ++- const lv_32fc_t* aPtr = aVector; ++- const lv_32fc_t* bPtr= bVector; ++- float32x4_t aVal, bVal, cVal; ++- for(number=0; number < halfPoints; number++){ ++- // Load in to NEON registers ++- aVal = vld1q_f32((const float32_t*)(aPtr)); ++- bVal = vld1q_f32((const float32_t*)(bPtr)); ++- __VOLK_PREFETCH(aPtr+2); ++- __VOLK_PREFETCH(bPtr+2); ++- ++- // vector add ++- cVal = vaddq_f32(aVal, bVal); ++- // Store the results back into the C container ++- vst1q_f32((float*)(cPtr),cVal); ++- ++- aPtr += 2; // q uses quadwords, 4 lv_32fc_ts per vadd ++- bPtr += 2; ++- cPtr += 2; ++- } ++- ++- number = halfPoints * 2; // should be = num_points ++- for(;number < num_points; number++){ ++- *cPtr++ = (*aPtr++) + (*bPtr++); ++- } +++ unsigned int number = 0; +++ const unsigned int halfPoints = num_points / 2; +++ +++ lv_32fc_t* cPtr = cVector; +++ const lv_32fc_t* aPtr = aVector; +++ const lv_32fc_t* bPtr = bVector; +++ float32x4_t aVal, bVal, cVal; +++ for (number = 0; number < halfPoints; number++) { +++ // Load in to NEON registers +++ aVal = vld1q_f32((const float32_t*)(aPtr)); +++ bVal = vld1q_f32((const float32_t*)(bPtr)); +++ __VOLK_PREFETCH(aPtr + 2); +++ __VOLK_PREFETCH(bPtr + 2); +++ +++ // vector add +++ cVal = vaddq_f32(aVal, bVal); +++ // Store the results back into the C container +++ vst1q_f32((float*)(cPtr), cVal); +++ +++ aPtr += 2; // q uses quadwords, 4 lv_32fc_ts per vadd +++ bPtr += 2; +++ cPtr += 2; +++ } +++ +++ number = halfPoints * 2; // should be = num_points +++ for (; number < num_points; number++) { +++ *cPtr++ = (*aPtr++) + (*bPtr++); +++ } ++ } ++ ++ #endif /* LV_HAVE_NEON */ ++diff --git a/kernels/volk/volk_32fc_x2_conjugate_dot_prod_32fc.h b/kernels/volk/volk_32fc_x2_conjugate_dot_prod_32fc.h ++index 77432ec..0f69499 100644 ++--- a/kernels/volk/volk_32fc_x2_conjugate_dot_prod_32fc.h +++++ 
b/kernels/volk/volk_32fc_x2_conjugate_dot_prod_32fc.h ++@@ -34,8 +34,8 @@ ++ * ++ * Dispatcher Prototype ++ * \code ++- * void volk_32fc_x2_conjugate_dot_prod_32fc(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) ++- * \endcode +++ * void volk_32fc_x2_conjugate_dot_prod_32fc(lv_32fc_t* result, const lv_32fc_t* input, +++ * const lv_32fc_t* taps, unsigned int num_points) \endcode ++ * ++ * \b Inputs ++ * \li input: vector of complex floats. ++@@ -60,40 +60,44 @@ ++ #define INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_u_H ++ ++ ++-#include +++#include ++ ++ ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void volk_32fc_x2_conjugate_dot_prod_32fc_generic(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) { +++static inline void volk_32fc_x2_conjugate_dot_prod_32fc_generic(lv_32fc_t* result, +++ const lv_32fc_t* input, +++ const lv_32fc_t* taps, +++ unsigned int num_points) +++{ ++ ++- const unsigned int num_bytes = num_points*8; +++ const unsigned int num_bytes = num_points * 8; ++ ++- float * res = (float*) result; ++- float * in = (float*) input; ++- float * tp = (float*) taps; ++- unsigned int n_2_ccomplex_blocks = num_bytes >> 4; +++ float* res = (float*)result; +++ float* in = (float*)input; +++ float* tp = (float*)taps; +++ unsigned int n_2_ccomplex_blocks = num_bytes >> 4; ++ ++- float sum0[2] = {0,0}; ++- float sum1[2] = {0,0}; ++- unsigned int i = 0; +++ float sum0[2] = { 0, 0 }; +++ float sum1[2] = { 0, 0 }; +++ unsigned int i = 0; ++ ++- for(i = 0; i < n_2_ccomplex_blocks; ++i) { ++- sum0[0] += in[0] * tp[0] + in[1] * tp[1]; ++- sum0[1] += (-in[0] * tp[1]) + in[1] * tp[0]; ++- sum1[0] += in[2] * tp[2] + in[3] * tp[3]; ++- sum1[1] += (-in[2] * tp[3]) + in[3] * tp[2]; +++ for (i = 0; i < n_2_ccomplex_blocks; ++i) { +++ sum0[0] += in[0] * tp[0] + in[1] * tp[1]; +++ sum0[1] += (-in[0] * tp[1]) + in[1] * tp[0]; +++ sum1[0] += in[2] * tp[2] + in[3] * tp[3]; +++ sum1[1] += (-in[2] * tp[3]) + in[3] * tp[2]; ++ ++- in += 4; ++- tp += 4; ++- } +++ in += 4; +++ tp += 4; +++ } ++ ++- res[0] = sum0[0] + sum1[0]; ++- res[1] = sum0[1] + sum1[1]; +++ res[0] = sum0[0] + sum1[0]; +++ res[1] = sum0[1] + sum1[1]; ++ ++- if (num_bytes >> 3 & 1) { ++- *result += input[(num_bytes >> 3) - 1] * lv_conj(taps[(num_bytes >> 3) - 1]); ++- } +++ if (num_bytes >> 3 & 1) { +++ *result += input[(num_bytes >> 3) - 1] * lv_conj(taps[(num_bytes >> 3) - 1]); +++ } ++ } ++ ++ #endif /*LV_HAVE_GENERIC*/ ++@@ -103,125 +107,134 @@ static inline void volk_32fc_x2_conjugate_dot_prod_32fc_generic(lv_32fc_t* resul ++ #include ++ ++ static inline void volk_32fc_x2_conjugate_dot_prod_32fc_u_avx(lv_32fc_t* result, ++- const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) +++ const lv_32fc_t* input, +++ const lv_32fc_t* taps, +++ unsigned int num_points) ++ { ++- // Partial sums for indices i, i+1, i+2 and i+3. ++- __m256 sum_a_mult_b_real = _mm256_setzero_ps(); ++- __m256 sum_a_mult_b_imag = _mm256_setzero_ps(); ++- ++- for (long unsigned i = 0; i < (num_points & ~3u); i += 4) { ++- /* Four complex elements a time are processed. ++- * (ar + j⋅ai)*conj(br + j⋅bi) = ++- * ar⋅br + ai⋅bi + j⋅(ai⋅br − ar⋅bi) ++- */ +++ // Partial sums for indices i, i+1, i+2 and i+3. +++ __m256 sum_a_mult_b_real = _mm256_setzero_ps(); +++ __m256 sum_a_mult_b_imag = _mm256_setzero_ps(); +++ +++ for (long unsigned i = 0; i < (num_points & ~3u); i += 4) { +++ /* Four complex elements a time are processed. 
+++ * (ar + j⋅ai)*conj(br + j⋅bi) = +++ * ar⋅br + ai⋅bi + j⋅(ai⋅br − ar⋅bi) +++ */ +++ +++ /* Load input and taps, split and duplicate real und imaginary parts of taps. +++ * a: | ai,i+3 | ar,i+3 | … | ai,i+1 | ar,i+1 | ai,i+0 | ar,i+0 | +++ * b: | bi,i+3 | br,i+3 | … | bi,i+1 | br,i+1 | bi,i+0 | br,i+0 | +++ * b_real: | br,i+3 | br,i+3 | … | br,i+1 | br,i+1 | br,i+0 | br,i+0 | +++ * b_imag: | bi,i+3 | bi,i+3 | … | bi,i+1 | bi,i+1 | bi,i+0 | bi,i+0 | +++ */ +++ __m256 a = _mm256_loadu_ps((const float*)&input[i]); +++ __m256 b = _mm256_loadu_ps((const float*)&taps[i]); +++ __m256 b_real = _mm256_moveldup_ps(b); +++ __m256 b_imag = _mm256_movehdup_ps(b); +++ +++ // Add | ai⋅br,i+3 | ar⋅br,i+3 | … | ai⋅br,i+0 | ar⋅br,i+0 | to partial sum. +++ sum_a_mult_b_real = _mm256_add_ps(sum_a_mult_b_real, _mm256_mul_ps(a, b_real)); +++ // Add | ai⋅bi,i+3 | −ar⋅bi,i+3 | … | ai⋅bi,i+0 | −ar⋅bi,i+0 | to partial sum. +++ sum_a_mult_b_imag = _mm256_addsub_ps(sum_a_mult_b_imag, _mm256_mul_ps(a, b_imag)); +++ } ++ ++- /* Load input and taps, split and duplicate real und imaginary parts of taps. ++- * a: | ai,i+3 | ar,i+3 | … | ai,i+1 | ar,i+1 | ai,i+0 | ar,i+0 | ++- * b: | bi,i+3 | br,i+3 | … | bi,i+1 | br,i+1 | bi,i+0 | br,i+0 | ++- * b_real: | br,i+3 | br,i+3 | … | br,i+1 | br,i+1 | br,i+0 | br,i+0 | ++- * b_imag: | bi,i+3 | bi,i+3 | … | bi,i+1 | bi,i+1 | bi,i+0 | bi,i+0 | +++ // Swap position of −ar⋅bi and ai⋅bi. +++ sum_a_mult_b_imag = _mm256_permute_ps(sum_a_mult_b_imag, _MM_SHUFFLE(2, 3, 0, 1)); +++ // | ai⋅br + ai⋅bi | ai⋅br − ar⋅bi |, sum contains four such partial sums. +++ __m256 sum = _mm256_add_ps(sum_a_mult_b_real, sum_a_mult_b_imag); +++ /* Sum the four partial sums: Add high half of vector sum to the low one, i.e. +++ * s1 + s3 and s0 + s2 … ++ */ ++- __m256 a = _mm256_loadu_ps((const float *) &input[i]); ++- __m256 b = _mm256_loadu_ps((const float *) &taps[i]); ++- __m256 b_real = _mm256_moveldup_ps(b); ++- __m256 b_imag = _mm256_movehdup_ps(b); ++- ++- // Add | ai⋅br,i+3 | ar⋅br,i+3 | … | ai⋅br,i+0 | ar⋅br,i+0 | to partial sum. ++- sum_a_mult_b_real = _mm256_add_ps(sum_a_mult_b_real, _mm256_mul_ps(a, b_real)); ++- // Add | ai⋅bi,i+3 | −ar⋅bi,i+3 | … | ai⋅bi,i+0 | −ar⋅bi,i+0 | to partial sum. ++- sum_a_mult_b_imag = _mm256_addsub_ps(sum_a_mult_b_imag, _mm256_mul_ps(a, b_imag)); ++- } ++- ++- // Swap position of −ar⋅bi and ai⋅bi. ++- sum_a_mult_b_imag = _mm256_permute_ps(sum_a_mult_b_imag, _MM_SHUFFLE(2, 3, 0, 1)); ++- // | ai⋅br + ai⋅bi | ai⋅br − ar⋅bi |, sum contains four such partial sums. ++- __m256 sum = _mm256_add_ps(sum_a_mult_b_real, sum_a_mult_b_imag); ++- /* Sum the four partial sums: Add high half of vector sum to the low one, i.e. ++- * s1 + s3 and s0 + s2 … ++- */ ++- sum = _mm256_add_ps(sum, _mm256_permute2f128_ps(sum, sum, 0x01)); ++- // … and now (s0 + s2) + (s1 + s3) ++- sum = _mm256_add_ps(sum, _mm256_permute_ps(sum, _MM_SHUFFLE(1, 0, 3, 2))); ++- // Store result. ++- __m128 lower = _mm256_extractf128_ps(sum, 0); ++- _mm_storel_pi((__m64 *) result, lower); ++- ++- // Handle the last elements if num_points mod 4 is bigger than 0. 
++- for (long unsigned i = num_points & ~3u; i < num_points; ++i) { ++- *result += lv_cmake( ++- lv_creal(input[i]) * lv_creal(taps[i]) + lv_cimag(input[i]) * lv_cimag(taps[i]), ++- lv_cimag(input[i]) * lv_creal(taps[i]) - lv_creal(input[i]) * lv_cimag(taps[i])); ++- } +++ sum = _mm256_add_ps(sum, _mm256_permute2f128_ps(sum, sum, 0x01)); +++ // … and now (s0 + s2) + (s1 + s3) +++ sum = _mm256_add_ps(sum, _mm256_permute_ps(sum, _MM_SHUFFLE(1, 0, 3, 2))); +++ // Store result. +++ __m128 lower = _mm256_extractf128_ps(sum, 0); +++ _mm_storel_pi((__m64*)result, lower); +++ +++ // Handle the last elements if num_points mod 4 is bigger than 0. +++ for (long unsigned i = num_points & ~3u; i < num_points; ++i) { +++ *result += lv_cmake(lv_creal(input[i]) * lv_creal(taps[i]) + +++ lv_cimag(input[i]) * lv_cimag(taps[i]), +++ lv_cimag(input[i]) * lv_creal(taps[i]) - +++ lv_creal(input[i]) * lv_cimag(taps[i])); +++ } ++ } ++ ++ #endif /* LV_HAVE_AVX */ ++ ++ #ifdef LV_HAVE_SSE3 ++ ++-#include ++ #include +++#include ++ ++ static inline void volk_32fc_x2_conjugate_dot_prod_32fc_u_sse3(lv_32fc_t* result, ++- const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) +++ const lv_32fc_t* input, +++ const lv_32fc_t* taps, +++ unsigned int num_points) ++ { ++- // Partial sums for indices i and i+1. ++- __m128 sum_a_mult_b_real = _mm_setzero_ps(); ++- __m128 sum_a_mult_b_imag = _mm_setzero_ps(); ++- ++- for (long unsigned i = 0; i < (num_points & ~1u); i += 2) { ++- /* Two complex elements a time are processed. ++- * (ar + j⋅ai)*conj(br + j⋅bi) = ++- * ar⋅br + ai⋅bi + j⋅(ai⋅br − ar⋅bi) ++- */ +++ // Partial sums for indices i and i+1. +++ __m128 sum_a_mult_b_real = _mm_setzero_ps(); +++ __m128 sum_a_mult_b_imag = _mm_setzero_ps(); +++ +++ for (long unsigned i = 0; i < (num_points & ~1u); i += 2) { +++ /* Two complex elements a time are processed. +++ * (ar + j⋅ai)*conj(br + j⋅bi) = +++ * ar⋅br + ai⋅bi + j⋅(ai⋅br − ar⋅bi) +++ */ +++ +++ /* Load input and taps, split and duplicate real und imaginary parts of taps. +++ * a: | ai,i+1 | ar,i+1 | ai,i+0 | ar,i+0 | +++ * b: | bi,i+1 | br,i+1 | bi,i+0 | br,i+0 | +++ * b_real: | br,i+1 | br,i+1 | br,i+0 | br,i+0 | +++ * b_imag: | bi,i+1 | bi,i+1 | bi,i+0 | bi,i+0 | +++ */ +++ __m128 a = _mm_loadu_ps((const float*)&input[i]); +++ __m128 b = _mm_loadu_ps((const float*)&taps[i]); +++ __m128 b_real = _mm_moveldup_ps(b); +++ __m128 b_imag = _mm_movehdup_ps(b); +++ +++ // Add | ai⋅br,i+1 | ar⋅br,i+1 | ai⋅br,i+0 | ar⋅br,i+0 | to partial sum. +++ sum_a_mult_b_real = _mm_add_ps(sum_a_mult_b_real, _mm_mul_ps(a, b_real)); +++ // Add | ai⋅bi,i+1 | −ar⋅bi,i+1 | ai⋅bi,i+0 | −ar⋅bi,i+0 | to partial sum. +++ sum_a_mult_b_imag = _mm_addsub_ps(sum_a_mult_b_imag, _mm_mul_ps(a, b_imag)); +++ } ++ ++- /* Load input and taps, split and duplicate real und imaginary parts of taps. ++- * a: | ai,i+1 | ar,i+1 | ai,i+0 | ar,i+0 | ++- * b: | bi,i+1 | br,i+1 | bi,i+0 | br,i+0 | ++- * b_real: | br,i+1 | br,i+1 | br,i+0 | br,i+0 | ++- * b_imag: | bi,i+1 | bi,i+1 | bi,i+0 | bi,i+0 | ++- */ ++- __m128 a = _mm_loadu_ps((const float *) &input[i]); ++- __m128 b = _mm_loadu_ps((const float *) &taps[i]); ++- __m128 b_real = _mm_moveldup_ps(b); ++- __m128 b_imag = _mm_movehdup_ps(b); ++- ++- // Add | ai⋅br,i+1 | ar⋅br,i+1 | ai⋅br,i+0 | ar⋅br,i+0 | to partial sum. ++- sum_a_mult_b_real = _mm_add_ps(sum_a_mult_b_real, _mm_mul_ps(a, b_real)); ++- // Add | ai⋅bi,i+1 | −ar⋅bi,i+1 | ai⋅bi,i+0 | −ar⋅bi,i+0 | to partial sum. 
++- sum_a_mult_b_imag = _mm_addsub_ps(sum_a_mult_b_imag, _mm_mul_ps(a, b_imag)); ++- } ++- ++- // Swap position of −ar⋅bi and ai⋅bi. ++- sum_a_mult_b_imag = _mm_shuffle_ps(sum_a_mult_b_imag, sum_a_mult_b_imag, ++- _MM_SHUFFLE(2, 3, 0, 1)); ++- // | ai⋅br + ai⋅bi | ai⋅br − ar⋅bi |, sum contains two such partial sums. ++- __m128 sum = _mm_add_ps(sum_a_mult_b_real, sum_a_mult_b_imag); ++- // Sum the two partial sums. ++- sum = _mm_add_ps(sum, _mm_shuffle_ps(sum, sum, _MM_SHUFFLE(1, 0, 3, 2))); ++- // Store result. ++- _mm_storel_pi((__m64 *) result, sum); ++- ++- // Handle the last element if num_points mod 2 is 1. ++- if (num_points & 1u) { ++- *result += lv_cmake( ++- lv_creal(input[num_points - 1]) * lv_creal(taps[num_points - 1]) + ++- lv_cimag(input[num_points - 1]) * lv_cimag(taps[num_points - 1]), ++- lv_cimag(input[num_points - 1]) * lv_creal(taps[num_points - 1]) - ++- lv_creal(input[num_points - 1]) * lv_cimag(taps[num_points - 1])); ++- } +++ // Swap position of −ar⋅bi and ai⋅bi. +++ sum_a_mult_b_imag = +++ _mm_shuffle_ps(sum_a_mult_b_imag, sum_a_mult_b_imag, _MM_SHUFFLE(2, 3, 0, 1)); +++ // | ai⋅br + ai⋅bi | ai⋅br − ar⋅bi |, sum contains two such partial sums. +++ __m128 sum = _mm_add_ps(sum_a_mult_b_real, sum_a_mult_b_imag); +++ // Sum the two partial sums. +++ sum = _mm_add_ps(sum, _mm_shuffle_ps(sum, sum, _MM_SHUFFLE(1, 0, 3, 2))); +++ // Store result. +++ _mm_storel_pi((__m64*)result, sum); +++ +++ // Handle the last element if num_points mod 2 is 1. +++ if (num_points & 1u) { +++ *result += lv_cmake( +++ lv_creal(input[num_points - 1]) * lv_creal(taps[num_points - 1]) + +++ lv_cimag(input[num_points - 1]) * lv_cimag(taps[num_points - 1]), +++ lv_cimag(input[num_points - 1]) * lv_creal(taps[num_points - 1]) - +++ lv_creal(input[num_points - 1]) * lv_cimag(taps[num_points - 1])); +++ } ++ } ++ ++ #endif /*LV_HAVE_SSE3*/ ++ ++ #ifdef LV_HAVE_NEON ++ #include ++-static inline void volk_32fc_x2_conjugate_dot_prod_32fc_neon(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) { +++static inline void volk_32fc_x2_conjugate_dot_prod_32fc_neon(lv_32fc_t* result, +++ const lv_32fc_t* input, +++ const lv_32fc_t* taps, +++ unsigned int num_points) +++{ ++ ++ unsigned int quarter_points = num_points / 4; ++ unsigned int number; ++ ++- lv_32fc_t* a_ptr = (lv_32fc_t*) taps; ++- lv_32fc_t* b_ptr = (lv_32fc_t*) input; +++ lv_32fc_t* a_ptr = (lv_32fc_t*)taps; +++ lv_32fc_t* b_ptr = (lv_32fc_t*)input; ++ // for 2-lane vectors, 1st lane holds the real part, ++ // 2nd lane holds the imaginary part ++ float32x4x2_t a_val, b_val, accumulator; ++@@ -229,11 +242,11 @@ static inline void volk_32fc_x2_conjugate_dot_prod_32fc_neon(lv_32fc_t* result, ++ accumulator.val[0] = vdupq_n_f32(0); ++ accumulator.val[1] = vdupq_n_f32(0); ++ ++- for(number = 0; number < quarter_points; ++number) { +++ for (number = 0; number < quarter_points; ++number) { ++ a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i ++ b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i ++- __VOLK_PREFETCH(a_ptr+8); ++- __VOLK_PREFETCH(b_ptr+8); +++ __VOLK_PREFETCH(a_ptr + 8); +++ __VOLK_PREFETCH(b_ptr + 8); ++ ++ // do the first multiply ++ tmp_imag.val[1] = vmulq_f32(a_val.val[1], b_val.val[0]); ++@@ -255,11 +268,10 @@ static inline void volk_32fc_x2_conjugate_dot_prod_32fc_neon(lv_32fc_t* result, ++ *result = accum_result[0] + accum_result[1] + accum_result[2] + accum_result[3]; ++ ++ // tail case ++- for(number = quarter_points*4; number < num_points; 
++number) { ++- *result += (*a_ptr++) * lv_conj(*b_ptr++); +++ for (number = quarter_points * 4; number < num_points; ++number) { +++ *result += (*a_ptr++) * lv_conj(*b_ptr++); ++ } ++ *result = lv_conj(*result); ++- ++ } ++ #endif /*LV_HAVE_NEON*/ ++ ++@@ -268,120 +280,125 @@ static inline void volk_32fc_x2_conjugate_dot_prod_32fc_neon(lv_32fc_t* result, ++ #ifndef INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_a_H ++ #define INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_a_H ++ +++#include ++ #include ++-#include ++-#include +++#include ++ ++ ++ #ifdef LV_HAVE_AVX ++ #include ++ ++ static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a_avx(lv_32fc_t* result, ++- const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) +++ const lv_32fc_t* input, +++ const lv_32fc_t* taps, +++ unsigned int num_points) ++ { ++- // Partial sums for indices i, i+1, i+2 and i+3. ++- __m256 sum_a_mult_b_real = _mm256_setzero_ps(); ++- __m256 sum_a_mult_b_imag = _mm256_setzero_ps(); ++- ++- for (long unsigned i = 0; i < (num_points & ~3u); i += 4) { ++- /* Four complex elements a time are processed. ++- * (ar + j⋅ai)*conj(br + j⋅bi) = ++- * ar⋅br + ai⋅bi + j⋅(ai⋅br − ar⋅bi) ++- */ +++ // Partial sums for indices i, i+1, i+2 and i+3. +++ __m256 sum_a_mult_b_real = _mm256_setzero_ps(); +++ __m256 sum_a_mult_b_imag = _mm256_setzero_ps(); +++ +++ for (long unsigned i = 0; i < (num_points & ~3u); i += 4) { +++ /* Four complex elements a time are processed. +++ * (ar + j⋅ai)*conj(br + j⋅bi) = +++ * ar⋅br + ai⋅bi + j⋅(ai⋅br − ar⋅bi) +++ */ +++ +++ /* Load input and taps, split and duplicate real und imaginary parts of taps. +++ * a: | ai,i+3 | ar,i+3 | … | ai,i+1 | ar,i+1 | ai,i+0 | ar,i+0 | +++ * b: | bi,i+3 | br,i+3 | … | bi,i+1 | br,i+1 | bi,i+0 | br,i+0 | +++ * b_real: | br,i+3 | br,i+3 | … | br,i+1 | br,i+1 | br,i+0 | br,i+0 | +++ * b_imag: | bi,i+3 | bi,i+3 | … | bi,i+1 | bi,i+1 | bi,i+0 | bi,i+0 | +++ */ +++ __m256 a = _mm256_load_ps((const float*)&input[i]); +++ __m256 b = _mm256_load_ps((const float*)&taps[i]); +++ __m256 b_real = _mm256_moveldup_ps(b); +++ __m256 b_imag = _mm256_movehdup_ps(b); +++ +++ // Add | ai⋅br,i+3 | ar⋅br,i+3 | … | ai⋅br,i+0 | ar⋅br,i+0 | to partial sum. +++ sum_a_mult_b_real = _mm256_add_ps(sum_a_mult_b_real, _mm256_mul_ps(a, b_real)); +++ // Add | ai⋅bi,i+3 | −ar⋅bi,i+3 | … | ai⋅bi,i+0 | −ar⋅bi,i+0 | to partial sum. +++ sum_a_mult_b_imag = _mm256_addsub_ps(sum_a_mult_b_imag, _mm256_mul_ps(a, b_imag)); +++ } ++ ++- /* Load input and taps, split and duplicate real und imaginary parts of taps. ++- * a: | ai,i+3 | ar,i+3 | … | ai,i+1 | ar,i+1 | ai,i+0 | ar,i+0 | ++- * b: | bi,i+3 | br,i+3 | … | bi,i+1 | br,i+1 | bi,i+0 | br,i+0 | ++- * b_real: | br,i+3 | br,i+3 | … | br,i+1 | br,i+1 | br,i+0 | br,i+0 | ++- * b_imag: | bi,i+3 | bi,i+3 | … | bi,i+1 | bi,i+1 | bi,i+0 | bi,i+0 | +++ // Swap position of −ar⋅bi and ai⋅bi. +++ sum_a_mult_b_imag = _mm256_permute_ps(sum_a_mult_b_imag, _MM_SHUFFLE(2, 3, 0, 1)); +++ // | ai⋅br + ai⋅bi | ai⋅br − ar⋅bi |, sum contains four such partial sums. +++ __m256 sum = _mm256_add_ps(sum_a_mult_b_real, sum_a_mult_b_imag); +++ /* Sum the four partial sums: Add high half of vector sum to the low one, i.e. +++ * s1 + s3 and s0 + s2 … ++ */ ++- __m256 a = _mm256_load_ps((const float *) &input[i]); ++- __m256 b = _mm256_load_ps((const float *) &taps[i]); ++- __m256 b_real = _mm256_moveldup_ps(b); ++- __m256 b_imag = _mm256_movehdup_ps(b); ++- ++- // Add | ai⋅br,i+3 | ar⋅br,i+3 | … | ai⋅br,i+0 | ar⋅br,i+0 | to partial sum. 
++- sum_a_mult_b_real = _mm256_add_ps(sum_a_mult_b_real, _mm256_mul_ps(a, b_real)); ++- // Add | ai⋅bi,i+3 | −ar⋅bi,i+3 | … | ai⋅bi,i+0 | −ar⋅bi,i+0 | to partial sum. ++- sum_a_mult_b_imag = _mm256_addsub_ps(sum_a_mult_b_imag, _mm256_mul_ps(a, b_imag)); ++- } ++- ++- // Swap position of −ar⋅bi and ai⋅bi. ++- sum_a_mult_b_imag = _mm256_permute_ps(sum_a_mult_b_imag, _MM_SHUFFLE(2, 3, 0, 1)); ++- // | ai⋅br + ai⋅bi | ai⋅br − ar⋅bi |, sum contains four such partial sums. ++- __m256 sum = _mm256_add_ps(sum_a_mult_b_real, sum_a_mult_b_imag); ++- /* Sum the four partial sums: Add high half of vector sum to the low one, i.e. ++- * s1 + s3 and s0 + s2 … ++- */ ++- sum = _mm256_add_ps(sum, _mm256_permute2f128_ps(sum, sum, 0x01)); ++- // … and now (s0 + s2) + (s1 + s3) ++- sum = _mm256_add_ps(sum, _mm256_permute_ps(sum, _MM_SHUFFLE(1, 0, 3, 2))); ++- // Store result. ++- __m128 lower = _mm256_extractf128_ps(sum, 0); ++- _mm_storel_pi((__m64 *) result, lower); ++- ++- // Handle the last elements if num_points mod 4 is bigger than 0. ++- for (long unsigned i = num_points & ~3u; i < num_points; ++i) { ++- *result += lv_cmake( ++- lv_creal(input[i]) * lv_creal(taps[i]) + lv_cimag(input[i]) * lv_cimag(taps[i]), ++- lv_cimag(input[i]) * lv_creal(taps[i]) - lv_creal(input[i]) * lv_cimag(taps[i])); ++- } +++ sum = _mm256_add_ps(sum, _mm256_permute2f128_ps(sum, sum, 0x01)); +++ // … and now (s0 + s2) + (s1 + s3) +++ sum = _mm256_add_ps(sum, _mm256_permute_ps(sum, _MM_SHUFFLE(1, 0, 3, 2))); +++ // Store result. +++ __m128 lower = _mm256_extractf128_ps(sum, 0); +++ _mm_storel_pi((__m64*)result, lower); +++ +++ // Handle the last elements if num_points mod 4 is bigger than 0. +++ for (long unsigned i = num_points & ~3u; i < num_points; ++i) { +++ *result += lv_cmake(lv_creal(input[i]) * lv_creal(taps[i]) + +++ lv_cimag(input[i]) * lv_cimag(taps[i]), +++ lv_cimag(input[i]) * lv_creal(taps[i]) - +++ lv_creal(input[i]) * lv_cimag(taps[i])); +++ } ++ } ++ #endif /* LV_HAVE_AVX */ ++ ++ #ifdef LV_HAVE_SSE3 ++ ++-#include ++ #include +++#include ++ ++ static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a_sse3(lv_32fc_t* result, ++- const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) +++ const lv_32fc_t* input, +++ const lv_32fc_t* taps, +++ unsigned int num_points) ++ { ++- // Partial sums for indices i and i+1. ++- __m128 sum_a_mult_b_real = _mm_setzero_ps(); ++- __m128 sum_a_mult_b_imag = _mm_setzero_ps(); ++- ++- for (long unsigned i = 0; i < (num_points & ~1u); i += 2) { ++- /* Two complex elements a time are processed. ++- * (ar + j⋅ai)*conj(br + j⋅bi) = ++- * ar⋅br + ai⋅bi + j⋅(ai⋅br − ar⋅bi) ++- */ +++ // Partial sums for indices i and i+1. +++ __m128 sum_a_mult_b_real = _mm_setzero_ps(); +++ __m128 sum_a_mult_b_imag = _mm_setzero_ps(); +++ +++ for (long unsigned i = 0; i < (num_points & ~1u); i += 2) { +++ /* Two complex elements a time are processed. +++ * (ar + j⋅ai)*conj(br + j⋅bi) = +++ * ar⋅br + ai⋅bi + j⋅(ai⋅br − ar⋅bi) +++ */ +++ +++ /* Load input and taps, split and duplicate real und imaginary parts of taps. +++ * a: | ai,i+1 | ar,i+1 | ai,i+0 | ar,i+0 | +++ * b: | bi,i+1 | br,i+1 | bi,i+0 | br,i+0 | +++ * b_real: | br,i+1 | br,i+1 | br,i+0 | br,i+0 | +++ * b_imag: | bi,i+1 | bi,i+1 | bi,i+0 | bi,i+0 | +++ */ +++ __m128 a = _mm_load_ps((const float*)&input[i]); +++ __m128 b = _mm_load_ps((const float*)&taps[i]); +++ __m128 b_real = _mm_moveldup_ps(b); +++ __m128 b_imag = _mm_movehdup_ps(b); +++ +++ // Add | ai⋅br,i+1 | ar⋅br,i+1 | ai⋅br,i+0 | ar⋅br,i+0 | to partial sum. 
+++ sum_a_mult_b_real = _mm_add_ps(sum_a_mult_b_real, _mm_mul_ps(a, b_real)); +++ // Add | ai⋅bi,i+1 | −ar⋅bi,i+1 | ai⋅bi,i+0 | −ar⋅bi,i+0 | to partial sum. +++ sum_a_mult_b_imag = _mm_addsub_ps(sum_a_mult_b_imag, _mm_mul_ps(a, b_imag)); +++ } ++ ++- /* Load input and taps, split and duplicate real und imaginary parts of taps. ++- * a: | ai,i+1 | ar,i+1 | ai,i+0 | ar,i+0 | ++- * b: | bi,i+1 | br,i+1 | bi,i+0 | br,i+0 | ++- * b_real: | br,i+1 | br,i+1 | br,i+0 | br,i+0 | ++- * b_imag: | bi,i+1 | bi,i+1 | bi,i+0 | bi,i+0 | ++- */ ++- __m128 a = _mm_load_ps((const float *) &input[i]); ++- __m128 b = _mm_load_ps((const float *) &taps[i]); ++- __m128 b_real = _mm_moveldup_ps(b); ++- __m128 b_imag = _mm_movehdup_ps(b); ++- ++- // Add | ai⋅br,i+1 | ar⋅br,i+1 | ai⋅br,i+0 | ar⋅br,i+0 | to partial sum. ++- sum_a_mult_b_real = _mm_add_ps(sum_a_mult_b_real, _mm_mul_ps(a, b_real)); ++- // Add | ai⋅bi,i+1 | −ar⋅bi,i+1 | ai⋅bi,i+0 | −ar⋅bi,i+0 | to partial sum. ++- sum_a_mult_b_imag = _mm_addsub_ps(sum_a_mult_b_imag, _mm_mul_ps(a, b_imag)); ++- } ++- ++- // Swap position of −ar⋅bi and ai⋅bi. ++- sum_a_mult_b_imag = _mm_shuffle_ps(sum_a_mult_b_imag, sum_a_mult_b_imag, ++- _MM_SHUFFLE(2, 3, 0, 1)); ++- // | ai⋅br + ai⋅bi | ai⋅br − ar⋅bi |, sum contains two such partial sums. ++- __m128 sum = _mm_add_ps(sum_a_mult_b_real, sum_a_mult_b_imag); ++- // Sum the two partial sums. ++- sum = _mm_add_ps(sum, _mm_shuffle_ps(sum, sum, _MM_SHUFFLE(1, 0, 3, 2))); ++- // Store result. ++- _mm_storel_pi((__m64 *) result, sum); ++- ++- // Handle the last element if num_points mod 2 is 1. ++- if (num_points & 1u) { ++- *result += lv_cmake( ++- lv_creal(input[num_points - 1]) * lv_creal(taps[num_points - 1]) + ++- lv_cimag(input[num_points - 1]) * lv_cimag(taps[num_points - 1]), ++- lv_cimag(input[num_points - 1]) * lv_creal(taps[num_points - 1]) - ++- lv_creal(input[num_points - 1]) * lv_cimag(taps[num_points - 1])); ++- } +++ // Swap position of −ar⋅bi and ai⋅bi. +++ sum_a_mult_b_imag = +++ _mm_shuffle_ps(sum_a_mult_b_imag, sum_a_mult_b_imag, _MM_SHUFFLE(2, 3, 0, 1)); +++ // | ai⋅br + ai⋅bi | ai⋅br − ar⋅bi |, sum contains two such partial sums. +++ __m128 sum = _mm_add_ps(sum_a_mult_b_real, sum_a_mult_b_imag); +++ // Sum the two partial sums. +++ sum = _mm_add_ps(sum, _mm_shuffle_ps(sum, sum, _MM_SHUFFLE(1, 0, 3, 2))); +++ // Store result. +++ _mm_storel_pi((__m64*)result, sum); +++ +++ // Handle the last element if num_points mod 2 is 1. 
+++ if (num_points & 1u) { +++ *result += lv_cmake( +++ lv_creal(input[num_points - 1]) * lv_creal(taps[num_points - 1]) + +++ lv_cimag(input[num_points - 1]) * lv_cimag(taps[num_points - 1]), +++ lv_cimag(input[num_points - 1]) * lv_creal(taps[num_points - 1]) - +++ lv_creal(input[num_points - 1]) * lv_cimag(taps[num_points - 1])); +++ } ++ } ++ ++ #endif /*LV_HAVE_SSE3*/ ++@@ -390,35 +407,39 @@ static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a_sse3(lv_32fc_t* result ++ #ifdef LV_HAVE_GENERIC ++ ++ ++-static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a_generic(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) { +++static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a_generic(lv_32fc_t* result, +++ const lv_32fc_t* input, +++ const lv_32fc_t* taps, +++ unsigned int num_points) +++{ ++ ++- const unsigned int num_bytes = num_points*8; +++ const unsigned int num_bytes = num_points * 8; ++ ++- float * res = (float*) result; ++- float * in = (float*) input; ++- float * tp = (float*) taps; ++- unsigned int n_2_ccomplex_blocks = num_bytes >> 4; +++ float* res = (float*)result; +++ float* in = (float*)input; +++ float* tp = (float*)taps; +++ unsigned int n_2_ccomplex_blocks = num_bytes >> 4; ++ ++- float sum0[2] = {0,0}; ++- float sum1[2] = {0,0}; ++- unsigned int i = 0; +++ float sum0[2] = { 0, 0 }; +++ float sum1[2] = { 0, 0 }; +++ unsigned int i = 0; ++ ++- for(i = 0; i < n_2_ccomplex_blocks; ++i) { ++- sum0[0] += in[0] * tp[0] + in[1] * tp[1]; ++- sum0[1] += (-in[0] * tp[1]) + in[1] * tp[0]; ++- sum1[0] += in[2] * tp[2] + in[3] * tp[3]; ++- sum1[1] += (-in[2] * tp[3]) + in[3] * tp[2]; +++ for (i = 0; i < n_2_ccomplex_blocks; ++i) { +++ sum0[0] += in[0] * tp[0] + in[1] * tp[1]; +++ sum0[1] += (-in[0] * tp[1]) + in[1] * tp[0]; +++ sum1[0] += in[2] * tp[2] + in[3] * tp[3]; +++ sum1[1] += (-in[2] * tp[3]) + in[3] * tp[2]; ++ ++- in += 4; ++- tp += 4; ++- } +++ in += 4; +++ tp += 4; +++ } ++ ++- res[0] = sum0[0] + sum1[0]; ++- res[1] = sum0[1] + sum1[1]; +++ res[0] = sum0[0] + sum1[0]; +++ res[1] = sum0[1] + sum1[1]; ++ ++- if (num_bytes >> 3 & 1) { ++- *result += input[(num_bytes >> 3) - 1] * lv_conj(taps[(num_bytes >> 3) - 1]); ++- } +++ if (num_bytes >> 3 & 1) { +++ *result += input[(num_bytes >> 3) - 1] * lv_conj(taps[(num_bytes >> 3) - 1]); +++ } ++ } ++ ++ #endif /*LV_HAVE_GENERIC*/ ++@@ -426,256 +447,276 @@ static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a_generic(lv_32fc_t* res ++ ++ #if LV_HAVE_SSE && LV_HAVE_64 ++ ++-static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a_sse(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) { ++- ++- const unsigned int num_bytes = num_points*8; ++- ++- __VOLK_ATTR_ALIGNED(16) static const uint32_t conjugator[4]= {0x00000000, 0x80000000, 0x00000000, 0x80000000}; ++- ++- __VOLK_ASM __VOLK_VOLATILE ++- ( ++- "# ccomplex_conjugate_dotprod_generic (float* result, const float *input,\n\t" ++- "# const float *taps, unsigned num_bytes)\n\t" ++- "# float sum0 = 0;\n\t" ++- "# float sum1 = 0;\n\t" ++- "# float sum2 = 0;\n\t" ++- "# float sum3 = 0;\n\t" ++- "# do {\n\t" ++- "# sum0 += input[0] * taps[0] - input[1] * taps[1];\n\t" ++- "# sum1 += input[0] * taps[1] + input[1] * taps[0];\n\t" ++- "# sum2 += input[2] * taps[2] - input[3] * taps[3];\n\t" ++- "# sum3 += input[2] * taps[3] + input[3] * taps[2];\n\t" ++- "# input += 4;\n\t" ++- "# taps += 4; \n\t" ++- "# } while (--n_2_ccomplex_blocks != 0);\n\t" ++- "# result[0] = sum0 + sum2;\n\t" ++- "# 
result[1] = sum1 + sum3;\n\t" ++- "# TODO: prefetch and better scheduling\n\t" ++- " xor %%r9, %%r9\n\t" ++- " xor %%r10, %%r10\n\t" ++- " movq %[conjugator], %%r9\n\t" ++- " movq %%rcx, %%rax\n\t" ++- " movaps 0(%%r9), %%xmm8\n\t" ++- " movq %%rcx, %%r8\n\t" ++- " movq %[rsi], %%r9\n\t" ++- " movq %[rdx], %%r10\n\t" ++- " xorps %%xmm6, %%xmm6 # zero accumulators\n\t" ++- " movaps 0(%%r9), %%xmm0\n\t" ++- " xorps %%xmm7, %%xmm7 # zero accumulators\n\t" ++- " movups 0(%%r10), %%xmm2\n\t" ++- " shr $5, %%rax # rax = n_2_ccomplex_blocks / 2\n\t" ++- " shr $4, %%r8\n\t" ++- " xorps %%xmm8, %%xmm2\n\t" ++- " jmp .%=L1_test\n\t" ++- " # 4 taps / loop\n\t" ++- " # something like ?? cycles / loop\n\t" ++- ".%=Loop1: \n\t" ++- "# complex prod: C += A * B, w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t" ++- "# movaps (%%r9), %%xmmA\n\t" ++- "# movaps (%%r10), %%xmmB\n\t" ++- "# movaps %%xmmA, %%xmmZ\n\t" ++- "# shufps $0xb1, %%xmmZ, %%xmmZ # swap internals\n\t" ++- "# mulps %%xmmB, %%xmmA\n\t" ++- "# mulps %%xmmZ, %%xmmB\n\t" ++- "# # SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t" ++- "# xorps %%xmmPN, %%xmmA\n\t" ++- "# movaps %%xmmA, %%xmmZ\n\t" ++- "# unpcklps %%xmmB, %%xmmA\n\t" ++- "# unpckhps %%xmmB, %%xmmZ\n\t" ++- "# movaps %%xmmZ, %%xmmY\n\t" ++- "# shufps $0x44, %%xmmA, %%xmmZ # b01000100\n\t" ++- "# shufps $0xee, %%xmmY, %%xmmA # b11101110\n\t" ++- "# addps %%xmmZ, %%xmmA\n\t" ++- "# addps %%xmmA, %%xmmC\n\t" ++- "# A=xmm0, B=xmm2, Z=xmm4\n\t" ++- "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t" ++- " movaps 16(%%r9), %%xmm1\n\t" ++- " movaps %%xmm0, %%xmm4\n\t" ++- " mulps %%xmm2, %%xmm0\n\t" ++- " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" ++- " movaps 16(%%r10), %%xmm3\n\t" ++- " movaps %%xmm1, %%xmm5\n\t" ++- " xorps %%xmm8, %%xmm3\n\t" ++- " addps %%xmm0, %%xmm6\n\t" ++- " mulps %%xmm3, %%xmm1\n\t" ++- " shufps $0xb1, %%xmm5, %%xmm5 # swap internals\n\t" ++- " addps %%xmm1, %%xmm6\n\t" ++- " mulps %%xmm4, %%xmm2\n\t" ++- " movaps 32(%%r9), %%xmm0\n\t" ++- " addps %%xmm2, %%xmm7\n\t" ++- " mulps %%xmm5, %%xmm3\n\t" ++- " add $32, %%r9\n\t" ++- " movaps 32(%%r10), %%xmm2\n\t" ++- " addps %%xmm3, %%xmm7\n\t" ++- " add $32, %%r10\n\t" ++- " xorps %%xmm8, %%xmm2\n\t" ++- ".%=L1_test:\n\t" ++- " dec %%rax\n\t" ++- " jge .%=Loop1\n\t" ++- " # We've handled the bulk of multiplies up to here.\n\t" ++- " # Let's sse if original n_2_ccomplex_blocks was odd.\n\t" ++- " # If so, we've got 2 more taps to do.\n\t" ++- " and $1, %%r8\n\t" ++- " je .%=Leven\n\t" ++- " # The count was odd, do 2 more taps.\n\t" ++- " # Note that we've already got mm0/mm2 preloaded\n\t" ++- " # from the main loop.\n\t" ++- " movaps %%xmm0, %%xmm4\n\t" ++- " mulps %%xmm2, %%xmm0\n\t" ++- " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" ++- " addps %%xmm0, %%xmm6\n\t" ++- " mulps %%xmm4, %%xmm2\n\t" ++- " addps %%xmm2, %%xmm7\n\t" ++- ".%=Leven:\n\t" ++- " # neg inversor\n\t" ++- " xorps %%xmm1, %%xmm1\n\t" ++- " mov $0x80000000, %%r9\n\t" ++- " movd %%r9, %%xmm1\n\t" ++- " shufps $0x11, %%xmm1, %%xmm1 # b00010001 # 0 -0 0 -0\n\t" ++- " # pfpnacc\n\t" ++- " xorps %%xmm1, %%xmm6\n\t" ++- " movaps %%xmm6, %%xmm2\n\t" ++- " unpcklps %%xmm7, %%xmm6\n\t" ++- " unpckhps %%xmm7, %%xmm2\n\t" ++- " movaps %%xmm2, %%xmm3\n\t" ++- " shufps $0x44, %%xmm6, %%xmm2 # b01000100\n\t" ++- " shufps $0xee, %%xmm3, %%xmm6 # b11101110\n\t" ++- " addps %%xmm2, %%xmm6\n\t" ++- " # xmm6 = r1 i2 r3 i4\n\t" ++- " movhlps %%xmm6, %%xmm4 # xmm4 = r3 i4 ?? ??\n\t" ++- " addps %%xmm4, %%xmm6 # xmm6 = r1+r3 i2+i4 ?? 
??\n\t" ++- " movlps %%xmm6, (%[rdi]) # store low 2x32 bits (complex) to memory\n\t" ++- : ++- :[rsi] "r" (input), [rdx] "r" (taps), "c" (num_bytes), [rdi] "r" (result), [conjugator] "r" (conjugator) ++- :"rax", "r8", "r9", "r10" ++- ); ++- ++- int getem = num_bytes % 16; ++- ++- for(; getem > 0; getem -= 8) { ++- *result += (input[(num_bytes >> 3) - 1] * lv_conj(taps[(num_bytes >> 3) - 1])); ++- } +++static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a_sse(lv_32fc_t* result, +++ const lv_32fc_t* input, +++ const lv_32fc_t* taps, +++ unsigned int num_points) +++{ +++ +++ const unsigned int num_bytes = num_points * 8; +++ +++ __VOLK_ATTR_ALIGNED(16) +++ static const uint32_t conjugator[4] = { +++ 0x00000000, 0x80000000, 0x00000000, 0x80000000 +++ }; +++ +++ __VOLK_ASM __VOLK_VOLATILE( +++ "# ccomplex_conjugate_dotprod_generic (float* result, const float *input,\n\t" +++ "# const float *taps, unsigned num_bytes)\n\t" +++ "# float sum0 = 0;\n\t" +++ "# float sum1 = 0;\n\t" +++ "# float sum2 = 0;\n\t" +++ "# float sum3 = 0;\n\t" +++ "# do {\n\t" +++ "# sum0 += input[0] * taps[0] - input[1] * taps[1];\n\t" +++ "# sum1 += input[0] * taps[1] + input[1] * taps[0];\n\t" +++ "# sum2 += input[2] * taps[2] - input[3] * taps[3];\n\t" +++ "# sum3 += input[2] * taps[3] + input[3] * taps[2];\n\t" +++ "# input += 4;\n\t" +++ "# taps += 4; \n\t" +++ "# } while (--n_2_ccomplex_blocks != 0);\n\t" +++ "# result[0] = sum0 + sum2;\n\t" +++ "# result[1] = sum1 + sum3;\n\t" +++ "# TODO: prefetch and better scheduling\n\t" +++ " xor %%r9, %%r9\n\t" +++ " xor %%r10, %%r10\n\t" +++ " movq %[conjugator], %%r9\n\t" +++ " movq %%rcx, %%rax\n\t" +++ " movaps 0(%%r9), %%xmm8\n\t" +++ " movq %%rcx, %%r8\n\t" +++ " movq %[rsi], %%r9\n\t" +++ " movq %[rdx], %%r10\n\t" +++ " xorps %%xmm6, %%xmm6 # zero accumulators\n\t" +++ " movaps 0(%%r9), %%xmm0\n\t" +++ " xorps %%xmm7, %%xmm7 # zero accumulators\n\t" +++ " movups 0(%%r10), %%xmm2\n\t" +++ " shr $5, %%rax # rax = n_2_ccomplex_blocks / 2\n\t" +++ " shr $4, %%r8\n\t" +++ " xorps %%xmm8, %%xmm2\n\t" +++ " jmp .%=L1_test\n\t" +++ " # 4 taps / loop\n\t" +++ " # something like ?? 
cycles / loop\n\t" +++ ".%=Loop1: \n\t" +++ "# complex prod: C += A * B, w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t" +++ "# movaps (%%r9), %%xmmA\n\t" +++ "# movaps (%%r10), %%xmmB\n\t" +++ "# movaps %%xmmA, %%xmmZ\n\t" +++ "# shufps $0xb1, %%xmmZ, %%xmmZ # swap internals\n\t" +++ "# mulps %%xmmB, %%xmmA\n\t" +++ "# mulps %%xmmZ, %%xmmB\n\t" +++ "# # SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t" +++ "# xorps %%xmmPN, %%xmmA\n\t" +++ "# movaps %%xmmA, %%xmmZ\n\t" +++ "# unpcklps %%xmmB, %%xmmA\n\t" +++ "# unpckhps %%xmmB, %%xmmZ\n\t" +++ "# movaps %%xmmZ, %%xmmY\n\t" +++ "# shufps $0x44, %%xmmA, %%xmmZ # b01000100\n\t" +++ "# shufps $0xee, %%xmmY, %%xmmA # b11101110\n\t" +++ "# addps %%xmmZ, %%xmmA\n\t" +++ "# addps %%xmmA, %%xmmC\n\t" +++ "# A=xmm0, B=xmm2, Z=xmm4\n\t" +++ "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t" +++ " movaps 16(%%r9), %%xmm1\n\t" +++ " movaps %%xmm0, %%xmm4\n\t" +++ " mulps %%xmm2, %%xmm0\n\t" +++ " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" +++ " movaps 16(%%r10), %%xmm3\n\t" +++ " movaps %%xmm1, %%xmm5\n\t" +++ " xorps %%xmm8, %%xmm3\n\t" +++ " addps %%xmm0, %%xmm6\n\t" +++ " mulps %%xmm3, %%xmm1\n\t" +++ " shufps $0xb1, %%xmm5, %%xmm5 # swap internals\n\t" +++ " addps %%xmm1, %%xmm6\n\t" +++ " mulps %%xmm4, %%xmm2\n\t" +++ " movaps 32(%%r9), %%xmm0\n\t" +++ " addps %%xmm2, %%xmm7\n\t" +++ " mulps %%xmm5, %%xmm3\n\t" +++ " add $32, %%r9\n\t" +++ " movaps 32(%%r10), %%xmm2\n\t" +++ " addps %%xmm3, %%xmm7\n\t" +++ " add $32, %%r10\n\t" +++ " xorps %%xmm8, %%xmm2\n\t" +++ ".%=L1_test:\n\t" +++ " dec %%rax\n\t" +++ " jge .%=Loop1\n\t" +++ " # We've handled the bulk of multiplies up to here.\n\t" +++ " # Let's sse if original n_2_ccomplex_blocks was odd.\n\t" +++ " # If so, we've got 2 more taps to do.\n\t" +++ " and $1, %%r8\n\t" +++ " je .%=Leven\n\t" +++ " # The count was odd, do 2 more taps.\n\t" +++ " # Note that we've already got mm0/mm2 preloaded\n\t" +++ " # from the main loop.\n\t" +++ " movaps %%xmm0, %%xmm4\n\t" +++ " mulps %%xmm2, %%xmm0\n\t" +++ " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" +++ " addps %%xmm0, %%xmm6\n\t" +++ " mulps %%xmm4, %%xmm2\n\t" +++ " addps %%xmm2, %%xmm7\n\t" +++ ".%=Leven:\n\t" +++ " # neg inversor\n\t" +++ " xorps %%xmm1, %%xmm1\n\t" +++ " mov $0x80000000, %%r9\n\t" +++ " movd %%r9, %%xmm1\n\t" +++ " shufps $0x11, %%xmm1, %%xmm1 # b00010001 # 0 -0 0 -0\n\t" +++ " # pfpnacc\n\t" +++ " xorps %%xmm1, %%xmm6\n\t" +++ " movaps %%xmm6, %%xmm2\n\t" +++ " unpcklps %%xmm7, %%xmm6\n\t" +++ " unpckhps %%xmm7, %%xmm2\n\t" +++ " movaps %%xmm2, %%xmm3\n\t" +++ " shufps $0x44, %%xmm6, %%xmm2 # b01000100\n\t" +++ " shufps $0xee, %%xmm3, %%xmm6 # b11101110\n\t" +++ " addps %%xmm2, %%xmm6\n\t" +++ " # xmm6 = r1 i2 r3 i4\n\t" +++ " movhlps %%xmm6, %%xmm4 # xmm4 = r3 i4 ?? ??\n\t" +++ " addps %%xmm4, %%xmm6 # xmm6 = r1+r3 i2+i4 ?? 
??\n\t" +++ " movlps %%xmm6, (%[rdi]) # store low 2x32 bits (complex) " +++ "to memory\n\t" +++ : +++ : [rsi] "r"(input), +++ [rdx] "r"(taps), +++ "c"(num_bytes), +++ [rdi] "r"(result), +++ [conjugator] "r"(conjugator) +++ : "rax", "r8", "r9", "r10"); +++ +++ int getem = num_bytes % 16; +++ +++ for (; getem > 0; getem -= 8) { +++ *result += (input[(num_bytes >> 3) - 1] * lv_conj(taps[(num_bytes >> 3) - 1])); +++ } ++ } ++ #endif ++ ++ #if LV_HAVE_SSE && LV_HAVE_32 ++-static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a_sse_32(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) { ++- ++- const unsigned int num_bytes = num_points*8; ++- ++- __VOLK_ATTR_ALIGNED(16) static const uint32_t conjugator[4]= {0x00000000, 0x80000000, 0x00000000, 0x80000000}; ++- ++- int bound = num_bytes >> 4; ++- int leftovers = num_bytes % 16; ++- ++- __VOLK_ASM __VOLK_VOLATILE ++- ( ++- " #pushl %%ebp\n\t" ++- " #movl %%esp, %%ebp\n\t" ++- " #movl 12(%%ebp), %%eax # input\n\t" ++- " #movl 16(%%ebp), %%edx # taps\n\t" ++- " #movl 20(%%ebp), %%ecx # n_bytes\n\t" ++- " movaps 0(%[conjugator]), %%xmm1\n\t" ++- " xorps %%xmm6, %%xmm6 # zero accumulators\n\t" ++- " movaps 0(%[eax]), %%xmm0\n\t" ++- " xorps %%xmm7, %%xmm7 # zero accumulators\n\t" ++- " movaps 0(%[edx]), %%xmm2\n\t" ++- " movl %[ecx], (%[out])\n\t" ++- " shrl $5, %[ecx] # ecx = n_2_ccomplex_blocks / 2\n\t" ++- ++- " xorps %%xmm1, %%xmm2\n\t" ++- " jmp .%=L1_test\n\t" ++- " # 4 taps / loop\n\t" ++- " # something like ?? cycles / loop\n\t" ++- ".%=Loop1: \n\t" ++- "# complex prod: C += A * B, w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t" ++- "# movaps (%[eax]), %%xmmA\n\t" ++- "# movaps (%[edx]), %%xmmB\n\t" ++- "# movaps %%xmmA, %%xmmZ\n\t" ++- "# shufps $0xb1, %%xmmZ, %%xmmZ # swap internals\n\t" ++- "# mulps %%xmmB, %%xmmA\n\t" ++- "# mulps %%xmmZ, %%xmmB\n\t" ++- "# # SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t" ++- "# xorps %%xmmPN, %%xmmA\n\t" ++- "# movaps %%xmmA, %%xmmZ\n\t" ++- "# unpcklps %%xmmB, %%xmmA\n\t" ++- "# unpckhps %%xmmB, %%xmmZ\n\t" ++- "# movaps %%xmmZ, %%xmmY\n\t" ++- "# shufps $0x44, %%xmmA, %%xmmZ # b01000100\n\t" ++- "# shufps $0xee, %%xmmY, %%xmmA # b11101110\n\t" ++- "# addps %%xmmZ, %%xmmA\n\t" ++- "# addps %%xmmA, %%xmmC\n\t" ++- "# A=xmm0, B=xmm2, Z=xmm4\n\t" ++- "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t" ++- " movaps 16(%[edx]), %%xmm3\n\t" ++- " movaps %%xmm0, %%xmm4\n\t" ++- " xorps %%xmm1, %%xmm3\n\t" ++- " mulps %%xmm2, %%xmm0\n\t" ++- " movaps 16(%[eax]), %%xmm1\n\t" ++- " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" ++- " movaps %%xmm1, %%xmm5\n\t" ++- " addps %%xmm0, %%xmm6\n\t" ++- " mulps %%xmm3, %%xmm1\n\t" ++- " shufps $0xb1, %%xmm5, %%xmm5 # swap internals\n\t" ++- " addps %%xmm1, %%xmm6\n\t" ++- " movaps 0(%[conjugator]), %%xmm1\n\t" ++- " mulps %%xmm4, %%xmm2\n\t" ++- " movaps 32(%[eax]), %%xmm0\n\t" ++- " addps %%xmm2, %%xmm7\n\t" ++- " mulps %%xmm5, %%xmm3\n\t" ++- " addl $32, %[eax]\n\t" ++- " movaps 32(%[edx]), %%xmm2\n\t" ++- " addps %%xmm3, %%xmm7\n\t" ++- " xorps %%xmm1, %%xmm2\n\t" ++- " addl $32, %[edx]\n\t" ++- ".%=L1_test:\n\t" ++- " decl %[ecx]\n\t" ++- " jge .%=Loop1\n\t" ++- " # We've handled the bulk of multiplies up to here.\n\t" ++- " # Let's sse if original n_2_ccomplex_blocks was odd.\n\t" ++- " # If so, we've got 2 more taps to do.\n\t" ++- " movl 0(%[out]), %[ecx] # n_2_ccomplex_blocks\n\t" ++- " shrl $4, %[ecx]\n\t" ++- " andl $1, %[ecx]\n\t" ++- " je .%=Leven\n\t" ++- " # The count was odd, do 2 more taps.\n\t" ++- " # Note that we've 
already got mm0/mm2 preloaded\n\t" ++- " # from the main loop.\n\t" ++- " movaps %%xmm0, %%xmm4\n\t" ++- " mulps %%xmm2, %%xmm0\n\t" ++- " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" ++- " addps %%xmm0, %%xmm6\n\t" ++- " mulps %%xmm4, %%xmm2\n\t" ++- " addps %%xmm2, %%xmm7\n\t" ++- ".%=Leven:\n\t" ++- " # neg inversor\n\t" ++- " #movl 8(%%ebp), %[eax] \n\t" ++- " xorps %%xmm1, %%xmm1\n\t" ++- " movl $0x80000000, (%[out])\n\t" ++- " movss (%[out]), %%xmm1\n\t" ++- " shufps $0x11, %%xmm1, %%xmm1 # b00010001 # 0 -0 0 -0\n\t" ++- " # pfpnacc\n\t" ++- " xorps %%xmm1, %%xmm6\n\t" ++- " movaps %%xmm6, %%xmm2\n\t" ++- " unpcklps %%xmm7, %%xmm6\n\t" ++- " unpckhps %%xmm7, %%xmm2\n\t" ++- " movaps %%xmm2, %%xmm3\n\t" ++- " shufps $0x44, %%xmm6, %%xmm2 # b01000100\n\t" ++- " shufps $0xee, %%xmm3, %%xmm6 # b11101110\n\t" ++- " addps %%xmm2, %%xmm6\n\t" ++- " # xmm6 = r1 i2 r3 i4\n\t" ++- " #movl 8(%%ebp), %[eax] # @result\n\t" ++- " movhlps %%xmm6, %%xmm4 # xmm4 = r3 i4 ?? ??\n\t" ++- " addps %%xmm4, %%xmm6 # xmm6 = r1+r3 i2+i4 ?? ??\n\t" ++- " movlps %%xmm6, (%[out]) # store low 2x32 bits (complex) to memory\n\t" ++- " #popl %%ebp\n\t" ++- : ++- : [eax] "r" (input), [edx] "r" (taps), [ecx] "r" (num_bytes), [out] "r" (result), [conjugator] "r" (conjugator) ++- ); ++- ++- for(; leftovers > 0; leftovers -= 8) { ++- *result += (input[(bound << 1)] * lv_conj(taps[(bound << 1)])); ++- } +++static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a_sse_32(lv_32fc_t* result, +++ const lv_32fc_t* input, +++ const lv_32fc_t* taps, +++ unsigned int num_points) +++{ +++ +++ const unsigned int num_bytes = num_points * 8; +++ +++ __VOLK_ATTR_ALIGNED(16) +++ static const uint32_t conjugator[4] = { +++ 0x00000000, 0x80000000, 0x00000000, 0x80000000 +++ }; +++ +++ int bound = num_bytes >> 4; +++ int leftovers = num_bytes % 16; +++ +++ __VOLK_ASM __VOLK_VOLATILE( +++ " #pushl %%ebp\n\t" +++ " #movl %%esp, %%ebp\n\t" +++ " #movl 12(%%ebp), %%eax # input\n\t" +++ " #movl 16(%%ebp), %%edx # taps\n\t" +++ " #movl 20(%%ebp), %%ecx # n_bytes\n\t" +++ " movaps 0(%[conjugator]), %%xmm1\n\t" +++ " xorps %%xmm6, %%xmm6 # zero accumulators\n\t" +++ " movaps 0(%[eax]), %%xmm0\n\t" +++ " xorps %%xmm7, %%xmm7 # zero accumulators\n\t" +++ " movaps 0(%[edx]), %%xmm2\n\t" +++ " movl %[ecx], (%[out])\n\t" +++ " shrl $5, %[ecx] # ecx = n_2_ccomplex_blocks / 2\n\t" +++ +++ " xorps %%xmm1, %%xmm2\n\t" +++ " jmp .%=L1_test\n\t" +++ " # 4 taps / loop\n\t" +++ " # something like ?? 
cycles / loop\n\t" +++ ".%=Loop1: \n\t" +++ "# complex prod: C += A * B, w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t" +++ "# movaps (%[eax]), %%xmmA\n\t" +++ "# movaps (%[edx]), %%xmmB\n\t" +++ "# movaps %%xmmA, %%xmmZ\n\t" +++ "# shufps $0xb1, %%xmmZ, %%xmmZ # swap internals\n\t" +++ "# mulps %%xmmB, %%xmmA\n\t" +++ "# mulps %%xmmZ, %%xmmB\n\t" +++ "# # SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t" +++ "# xorps %%xmmPN, %%xmmA\n\t" +++ "# movaps %%xmmA, %%xmmZ\n\t" +++ "# unpcklps %%xmmB, %%xmmA\n\t" +++ "# unpckhps %%xmmB, %%xmmZ\n\t" +++ "# movaps %%xmmZ, %%xmmY\n\t" +++ "# shufps $0x44, %%xmmA, %%xmmZ # b01000100\n\t" +++ "# shufps $0xee, %%xmmY, %%xmmA # b11101110\n\t" +++ "# addps %%xmmZ, %%xmmA\n\t" +++ "# addps %%xmmA, %%xmmC\n\t" +++ "# A=xmm0, B=xmm2, Z=xmm4\n\t" +++ "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t" +++ " movaps 16(%[edx]), %%xmm3\n\t" +++ " movaps %%xmm0, %%xmm4\n\t" +++ " xorps %%xmm1, %%xmm3\n\t" +++ " mulps %%xmm2, %%xmm0\n\t" +++ " movaps 16(%[eax]), %%xmm1\n\t" +++ " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" +++ " movaps %%xmm1, %%xmm5\n\t" +++ " addps %%xmm0, %%xmm6\n\t" +++ " mulps %%xmm3, %%xmm1\n\t" +++ " shufps $0xb1, %%xmm5, %%xmm5 # swap internals\n\t" +++ " addps %%xmm1, %%xmm6\n\t" +++ " movaps 0(%[conjugator]), %%xmm1\n\t" +++ " mulps %%xmm4, %%xmm2\n\t" +++ " movaps 32(%[eax]), %%xmm0\n\t" +++ " addps %%xmm2, %%xmm7\n\t" +++ " mulps %%xmm5, %%xmm3\n\t" +++ " addl $32, %[eax]\n\t" +++ " movaps 32(%[edx]), %%xmm2\n\t" +++ " addps %%xmm3, %%xmm7\n\t" +++ " xorps %%xmm1, %%xmm2\n\t" +++ " addl $32, %[edx]\n\t" +++ ".%=L1_test:\n\t" +++ " decl %[ecx]\n\t" +++ " jge .%=Loop1\n\t" +++ " # We've handled the bulk of multiplies up to here.\n\t" +++ " # Let's sse if original n_2_ccomplex_blocks was odd.\n\t" +++ " # If so, we've got 2 more taps to do.\n\t" +++ " movl 0(%[out]), %[ecx] # n_2_ccomplex_blocks\n\t" +++ " shrl $4, %[ecx]\n\t" +++ " andl $1, %[ecx]\n\t" +++ " je .%=Leven\n\t" +++ " # The count was odd, do 2 more taps.\n\t" +++ " # Note that we've already got mm0/mm2 preloaded\n\t" +++ " # from the main loop.\n\t" +++ " movaps %%xmm0, %%xmm4\n\t" +++ " mulps %%xmm2, %%xmm0\n\t" +++ " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" +++ " addps %%xmm0, %%xmm6\n\t" +++ " mulps %%xmm4, %%xmm2\n\t" +++ " addps %%xmm2, %%xmm7\n\t" +++ ".%=Leven:\n\t" +++ " # neg inversor\n\t" +++ " #movl 8(%%ebp), %[eax] \n\t" +++ " xorps %%xmm1, %%xmm1\n\t" +++ " movl $0x80000000, (%[out])\n\t" +++ " movss (%[out]), %%xmm1\n\t" +++ " shufps $0x11, %%xmm1, %%xmm1 # b00010001 # 0 -0 0 -0\n\t" +++ " # pfpnacc\n\t" +++ " xorps %%xmm1, %%xmm6\n\t" +++ " movaps %%xmm6, %%xmm2\n\t" +++ " unpcklps %%xmm7, %%xmm6\n\t" +++ " unpckhps %%xmm7, %%xmm2\n\t" +++ " movaps %%xmm2, %%xmm3\n\t" +++ " shufps $0x44, %%xmm6, %%xmm2 # b01000100\n\t" +++ " shufps $0xee, %%xmm3, %%xmm6 # b11101110\n\t" +++ " addps %%xmm2, %%xmm6\n\t" +++ " # xmm6 = r1 i2 r3 i4\n\t" +++ " #movl 8(%%ebp), %[eax] # @result\n\t" +++ " movhlps %%xmm6, %%xmm4 # xmm4 = r3 i4 ?? ??\n\t" +++ " addps %%xmm4, %%xmm6 # xmm6 = r1+r3 i2+i4 ?? 
??\n\t" +++ " movlps %%xmm6, (%[out]) # store low 2x32 bits (complex) " +++ "to memory\n\t" +++ " #popl %%ebp\n\t" +++ : +++ : [eax] "r"(input), +++ [edx] "r"(taps), +++ [ecx] "r"(num_bytes), +++ [out] "r"(result), +++ [conjugator] "r"(conjugator)); +++ +++ for (; leftovers > 0; leftovers -= 8) { +++ *result += (input[(bound << 1)] * lv_conj(taps[(bound << 1)])); +++ } ++ } ++ #endif /*LV_HAVE_SSE*/ ++ ++diff --git a/kernels/volk/volk_32fc_x2_divide_32fc.h b/kernels/volk/volk_32fc_x2_divide_32fc.h ++index 3ce6ede..78c245a 100644 ++--- a/kernels/volk/volk_32fc_x2_divide_32fc.h +++++ b/kernels/volk/volk_32fc_x2_divide_32fc.h ++@@ -29,8 +29,8 @@ ++ * ++ * Dispatcher Prototype ++ * \code ++- * void volk_32fc_x2_divide_32fc(lv_32fc_t* cVector, const lv_32fc_t* numeratorVector, const lv_32fc_t* denumeratorVector, unsigned int num_points); ++- * \endcode +++ * void volk_32fc_x2_divide_32fc(lv_32fc_t* cVector, const lv_32fc_t* numeratorVector, +++ * const lv_32fc_t* denumeratorVector, unsigned int num_points); \endcode ++ * ++ * \b Inputs ++ * \li numeratorVector: The numerator complex values. ++@@ -41,7 +41,8 @@ ++ * \li outputVector: The output vector complex floats. ++ * ++ * \b Example ++- * divide a complex vector by itself, demonstrating the result should be pretty close to 1+0j. +++ * divide a complex vector by itself, demonstrating the result should be pretty close to +++ * 1+0j. ++ * ++ * \code ++ * int N = 10; ++@@ -71,17 +72,18 @@ ++ #ifndef INCLUDED_volk_32fc_x2_divide_32fc_u_H ++ #define INCLUDED_volk_32fc_x2_divide_32fc_u_H ++ +++#include ++ #include ++ #include ++-#include ++ ++ #ifdef LV_HAVE_SSE3 ++ #include ++ #include ++ ++-static inline void ++-volk_32fc_x2_divide_32fc_u_sse3(lv_32fc_t* cVector, const lv_32fc_t* numeratorVector, ++- const lv_32fc_t* denumeratorVector, unsigned int num_points) +++static inline void volk_32fc_x2_divide_32fc_u_sse3(lv_32fc_t* cVector, +++ const lv_32fc_t* numeratorVector, +++ const lv_32fc_t* denumeratorVector, +++ unsigned int num_points) ++ { ++ /* ++ * we'll do the "classical" ++@@ -89,44 +91,46 @@ volk_32fc_x2_divide_32fc_u_sse3(lv_32fc_t* cVector, const lv_32fc_t* numeratorVe ++ * --- = ------- ++ * b |b|^2 ++ * */ ++- unsigned int number = 0; ++- const unsigned int quarterPoints = num_points / 4; ++- ++- __m128 num01, num23, den01, den23, norm, result; ++- lv_32fc_t* c = cVector; ++- const lv_32fc_t* a = numeratorVector; ++- const lv_32fc_t* b = denumeratorVector; ++- ++- for(; number < quarterPoints; number++){ ++- num01 = _mm_loadu_ps((float*) a); // first pair ++- den01 = _mm_loadu_ps((float*) b); // first pair ++- num01 = _mm_complexconjugatemul_ps(num01, den01); // a conj(b) ++- a += 2; ++- b += 2; ++- ++- num23 = _mm_loadu_ps((float*) a); // second pair ++- den23 = _mm_loadu_ps((float*) b); // second pair ++- num23 = _mm_complexconjugatemul_ps(num23, den23); // a conj(b) ++- a += 2; ++- b += 2; ++- ++- norm = _mm_magnitudesquared_ps_sse3(den01, den23); ++- den01 = _mm_unpacklo_ps(norm,norm); ++- den23 = _mm_unpackhi_ps(norm,norm); ++- ++- result = _mm_div_ps(num01, den01); ++- _mm_storeu_ps((float*) c, result); // Store the results back into the C container ++- c += 2; ++- result = _mm_div_ps(num23, den23); ++- _mm_storeu_ps((float*) c, result); // Store the results back into the C container ++- c += 2; ++- } ++- ++- number *= 4; ++- for(;number < num_points; number++){ ++- *c = (*a) / (*b); ++- a++; b++; c++; ++- } +++ unsigned int number = 0; +++ const unsigned int quarterPoints = num_points / 4; +++ +++ __m128 num01, num23, den01, 
den23, norm, result; +++ lv_32fc_t* c = cVector; +++ const lv_32fc_t* a = numeratorVector; +++ const lv_32fc_t* b = denumeratorVector; +++ +++ for (; number < quarterPoints; number++) { +++ num01 = _mm_loadu_ps((float*)a); // first pair +++ den01 = _mm_loadu_ps((float*)b); // first pair +++ num01 = _mm_complexconjugatemul_ps(num01, den01); // a conj(b) +++ a += 2; +++ b += 2; +++ +++ num23 = _mm_loadu_ps((float*)a); // second pair +++ den23 = _mm_loadu_ps((float*)b); // second pair +++ num23 = _mm_complexconjugatemul_ps(num23, den23); // a conj(b) +++ a += 2; +++ b += 2; +++ +++ norm = _mm_magnitudesquared_ps_sse3(den01, den23); +++ den01 = _mm_unpacklo_ps(norm, norm); +++ den23 = _mm_unpackhi_ps(norm, norm); +++ +++ result = _mm_div_ps(num01, den01); +++ _mm_storeu_ps((float*)c, result); // Store the results back into the C container +++ c += 2; +++ result = _mm_div_ps(num23, den23); +++ _mm_storeu_ps((float*)c, result); // Store the results back into the C container +++ c += 2; +++ } +++ +++ number *= 4; +++ for (; number < num_points; number++) { +++ *c = (*a) / (*b); +++ a++; +++ b++; +++ c++; +++ } ++ } ++ #endif /* LV_HAVE_SSE3 */ ++ ++@@ -135,9 +139,10 @@ volk_32fc_x2_divide_32fc_u_sse3(lv_32fc_t* cVector, const lv_32fc_t* numeratorVe ++ #include ++ #include ++ ++-static inline void ++-volk_32fc_x2_divide_32fc_u_avx(lv_32fc_t* cVector, const lv_32fc_t* numeratorVector, ++- const lv_32fc_t* denumeratorVector, unsigned int num_points) +++static inline void volk_32fc_x2_divide_32fc_u_avx(lv_32fc_t* cVector, +++ const lv_32fc_t* numeratorVector, +++ const lv_32fc_t* denumeratorVector, +++ unsigned int num_points) ++ { ++ /* ++ * we'll do the "classical" ++@@ -153,17 +158,21 @@ volk_32fc_x2_divide_32fc_u_avx(lv_32fc_t* cVector, const lv_32fc_t* numeratorVec ++ const lv_32fc_t* a = numeratorVector; ++ const lv_32fc_t* b = denumeratorVector; ++ ++- for(; number < quarterPoints; number++){ ++- num = _mm256_loadu_ps((float*) a); // Load the ar + ai, br + bi ... as ar,ai,br,bi ... ++- denum = _mm256_loadu_ps((float*) b); // Load the cr + ci, dr + di ... as cr,ci,dr,di ... +++ for (; number < quarterPoints; number++) { +++ num = _mm256_loadu_ps( +++ (float*)a); // Load the ar + ai, br + bi ... as ar,ai,br,bi ... +++ denum = _mm256_loadu_ps( +++ (float*)b); // Load the cr + ci, dr + di ... as cr,ci,dr,di ... 
++ mul_conj = _mm256_complexconjugatemul_ps(num, denum); ++ sq = _mm256_mul_ps(denum, denum); // Square the values ++- mag_sq_un = _mm256_hadd_ps(sq,sq); // obtain the actual squared magnitude, although out of order +++ mag_sq_un = _mm256_hadd_ps( +++ sq, sq); // obtain the actual squared magnitude, although out of order ++ mag_sq = _mm256_permute_ps(mag_sq_un, 0xd8); // I order them ++- // best guide I found on using these functions: https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=2738,2059,2738,2738,3875,3874,3875,2738,3870 ++- div = _mm256_div_ps(mul_conj,mag_sq); +++ // best guide I found on using these functions: +++ // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=2738,2059,2738,2738,3875,3874,3875,2738,3870 +++ div = _mm256_div_ps(mul_conj, mag_sq); ++ ++- _mm256_storeu_ps((float*) c, div); // Store the results back into the C container +++ _mm256_storeu_ps((float*)c, div); // Store the results back into the C container ++ ++ a += 4; ++ b += 4; ++@@ -172,51 +181,51 @@ volk_32fc_x2_divide_32fc_u_avx(lv_32fc_t* cVector, const lv_32fc_t* numeratorVec ++ ++ number = quarterPoints * 4; ++ ++- for(; number < num_points; number++){ +++ for (; number < num_points; number++) { ++ *c++ = (*a++) / (*b++); ++ } ++- ++ } ++ #endif /* LV_HAVE_AVX */ ++ ++ ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void ++-volk_32fc_x2_divide_32fc_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, ++- const lv_32fc_t* bVector, unsigned int num_points) +++static inline void volk_32fc_x2_divide_32fc_generic(lv_32fc_t* cVector, +++ const lv_32fc_t* aVector, +++ const lv_32fc_t* bVector, +++ unsigned int num_points) ++ { ++- lv_32fc_t* cPtr = cVector; ++- const lv_32fc_t* aPtr = aVector; ++- const lv_32fc_t* bPtr= bVector; ++- unsigned int number = 0; ++- ++- for(number = 0; number < num_points; number++){ ++- *cPtr++ = (*aPtr++) / (*bPtr++); ++- } +++ lv_32fc_t* cPtr = cVector; +++ const lv_32fc_t* aPtr = aVector; +++ const lv_32fc_t* bPtr = bVector; +++ unsigned int number = 0; +++ +++ for (number = 0; number < num_points; number++) { +++ *cPtr++ = (*aPtr++) / (*bPtr++); +++ } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++ ++- ++ #endif /* INCLUDED_volk_32fc_x2_divide_32fc_u_H */ ++ ++ ++ #ifndef INCLUDED_volk_32fc_x2_divide_32fc_a_H ++ #define INCLUDED_volk_32fc_x2_divide_32fc_a_H ++ +++#include ++ #include ++ #include ++ #include ++-#include ++ ++ #ifdef LV_HAVE_SSE3 ++ #include ++ #include ++ ++-static inline void ++-volk_32fc_x2_divide_32fc_a_sse3(lv_32fc_t* cVector, const lv_32fc_t* numeratorVector, ++- const lv_32fc_t* denumeratorVector, unsigned int num_points) +++static inline void volk_32fc_x2_divide_32fc_a_sse3(lv_32fc_t* cVector, +++ const lv_32fc_t* numeratorVector, +++ const lv_32fc_t* denumeratorVector, +++ unsigned int num_points) ++ { ++ /* ++ * we'll do the "classical" ++@@ -224,45 +233,47 @@ volk_32fc_x2_divide_32fc_a_sse3(lv_32fc_t* cVector, const lv_32fc_t* numeratorVe ++ * --- = ------- ++ * b |b|^2 ++ * */ ++- unsigned int number = 0; ++- const unsigned int quarterPoints = num_points / 4; ++- ++- __m128 num01, num23, den01, den23, norm, result; ++- lv_32fc_t* c = cVector; ++- const lv_32fc_t* a = numeratorVector; ++- const lv_32fc_t* b = denumeratorVector; ++- ++- for(; number < quarterPoints; number++){ ++- num01 = _mm_load_ps((float*) a); // first pair ++- den01 = _mm_load_ps((float*) b); // first pair ++- num01 = _mm_complexconjugatemul_ps(num01, den01); // a conj(b) ++- a += 2; ++- b += 2; ++- ++- num23 = _mm_load_ps((float*) a); // second pair 
++- den23 = _mm_load_ps((float*) b); // second pair ++- num23 = _mm_complexconjugatemul_ps(num23, den23); // a conj(b) ++- a += 2; ++- b += 2; ++- ++- norm = _mm_magnitudesquared_ps_sse3(den01, den23); ++- ++- den01 = _mm_unpacklo_ps(norm,norm); // select the lower floats twice ++- den23 = _mm_unpackhi_ps(norm,norm); // select the upper floats twice ++- ++- result = _mm_div_ps(num01, den01); ++- _mm_store_ps((float*) c, result); // Store the results back into the C container ++- c += 2; ++- result = _mm_div_ps(num23, den23); ++- _mm_store_ps((float*) c, result); // Store the results back into the C container ++- c += 2; ++- } ++- ++- number *= 4; ++- for(;number < num_points; number++){ ++- *c = (*a) / (*b); ++- a++; b++; c++; ++- } +++ unsigned int number = 0; +++ const unsigned int quarterPoints = num_points / 4; +++ +++ __m128 num01, num23, den01, den23, norm, result; +++ lv_32fc_t* c = cVector; +++ const lv_32fc_t* a = numeratorVector; +++ const lv_32fc_t* b = denumeratorVector; +++ +++ for (; number < quarterPoints; number++) { +++ num01 = _mm_load_ps((float*)a); // first pair +++ den01 = _mm_load_ps((float*)b); // first pair +++ num01 = _mm_complexconjugatemul_ps(num01, den01); // a conj(b) +++ a += 2; +++ b += 2; +++ +++ num23 = _mm_load_ps((float*)a); // second pair +++ den23 = _mm_load_ps((float*)b); // second pair +++ num23 = _mm_complexconjugatemul_ps(num23, den23); // a conj(b) +++ a += 2; +++ b += 2; +++ +++ norm = _mm_magnitudesquared_ps_sse3(den01, den23); +++ +++ den01 = _mm_unpacklo_ps(norm, norm); // select the lower floats twice +++ den23 = _mm_unpackhi_ps(norm, norm); // select the upper floats twice +++ +++ result = _mm_div_ps(num01, den01); +++ _mm_store_ps((float*)c, result); // Store the results back into the C container +++ c += 2; +++ result = _mm_div_ps(num23, den23); +++ _mm_store_ps((float*)c, result); // Store the results back into the C container +++ c += 2; +++ } +++ +++ number *= 4; +++ for (; number < num_points; number++) { +++ *c = (*a) / (*b); +++ a++; +++ b++; +++ c++; +++ } ++ } ++ #endif /* LV_HAVE_SSE */ ++ ++@@ -270,9 +281,10 @@ volk_32fc_x2_divide_32fc_a_sse3(lv_32fc_t* cVector, const lv_32fc_t* numeratorVe ++ #include ++ #include ++ ++-static inline void ++-volk_32fc_x2_divide_32fc_a_avx(lv_32fc_t* cVector, const lv_32fc_t* numeratorVector, ++- const lv_32fc_t* denumeratorVector, unsigned int num_points) +++static inline void volk_32fc_x2_divide_32fc_a_avx(lv_32fc_t* cVector, +++ const lv_32fc_t* numeratorVector, +++ const lv_32fc_t* denumeratorVector, +++ unsigned int num_points) ++ { ++ /* ++ * we'll do the "classical" ++@@ -288,17 +300,21 @@ volk_32fc_x2_divide_32fc_a_avx(lv_32fc_t* cVector, const lv_32fc_t* numeratorVec ++ const lv_32fc_t* a = numeratorVector; ++ const lv_32fc_t* b = denumeratorVector; ++ ++- for(; number < quarterPoints; number++){ ++- num = _mm256_load_ps((float*) a); // Load the ar + ai, br + bi ... as ar,ai,br,bi ... ++- denum = _mm256_load_ps((float*) b); // Load the cr + ci, dr + di ... as cr,ci,dr,di ... +++ for (; number < quarterPoints; number++) { +++ num = +++ _mm256_load_ps((float*)a); // Load the ar + ai, br + bi ... as ar,ai,br,bi ... +++ denum = +++ _mm256_load_ps((float*)b); // Load the cr + ci, dr + di ... as cr,ci,dr,di ... 
++ mul_conj = _mm256_complexconjugatemul_ps(num, denum); ++ sq = _mm256_mul_ps(denum, denum); // Square the values ++- mag_sq_un = _mm256_hadd_ps(sq,sq); // obtain the actual squared magnitude, although out of order +++ mag_sq_un = _mm256_hadd_ps( +++ sq, sq); // obtain the actual squared magnitude, although out of order ++ mag_sq = _mm256_permute_ps(mag_sq_un, 0xd8); // I order them ++- // best guide I found on using these functions: https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=2738,2059,2738,2738,3875,3874,3875,2738,3870 ++- div = _mm256_div_ps(mul_conj,mag_sq); +++ // best guide I found on using these functions: +++ // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=2738,2059,2738,2738,3875,3874,3875,2738,3870 +++ div = _mm256_div_ps(mul_conj, mag_sq); ++ ++- _mm256_store_ps((float*) c, div); // Store the results back into the C container +++ _mm256_store_ps((float*)c, div); // Store the results back into the C container ++ ++ a += 4; ++ b += 4; ++@@ -307,78 +323,78 @@ volk_32fc_x2_divide_32fc_a_avx(lv_32fc_t* cVector, const lv_32fc_t* numeratorVec ++ ++ number = quarterPoints * 4; ++ ++- for(; number < num_points; number++){ +++ for (; number < num_points; number++) { ++ *c++ = (*a++) / (*b++); ++ } ++- ++- ++ } ++ #endif /* LV_HAVE_AVX */ ++ ++ #ifdef LV_HAVE_NEON ++ #include ++ ++-static inline void ++-volk_32fc_x2_divide_32fc_neon(lv_32fc_t* cVector, const lv_32fc_t* aVector, ++- const lv_32fc_t* bVector, unsigned int num_points) +++static inline void volk_32fc_x2_divide_32fc_neon(lv_32fc_t* cVector, +++ const lv_32fc_t* aVector, +++ const lv_32fc_t* bVector, +++ unsigned int num_points) ++ { ++- lv_32fc_t* cPtr = cVector; ++- const lv_32fc_t* aPtr = aVector; ++- const lv_32fc_t* bPtr = bVector; ++- ++- float32x4x2_t aVal, bVal, cVal; ++- float32x4_t bAbs, bAbsInv; ++- ++- const unsigned int quarterPoints = num_points / 4; ++- unsigned int number = 0; ++- for(; number < quarterPoints; number++){ ++- aVal = vld2q_f32((const float*)(aPtr)); ++- bVal = vld2q_f32((const float*)(bPtr)); ++- aPtr += 4; ++- bPtr += 4; ++- __VOLK_PREFETCH(aPtr+4); ++- __VOLK_PREFETCH(bPtr+4); ++- ++- bAbs = vmulq_f32( bVal.val[0], bVal.val[0]); ++- bAbs = vmlaq_f32(bAbs, bVal.val[1], bVal.val[1]); ++- ++- bAbsInv = vrecpeq_f32(bAbs); ++- bAbsInv = vmulq_f32(bAbsInv, vrecpsq_f32(bAbsInv, bAbs)); ++- bAbsInv = vmulq_f32(bAbsInv, vrecpsq_f32(bAbsInv, bAbs)); ++- ++- cVal.val[0] = vmulq_f32( aVal.val[0], bVal.val[0]); ++- cVal.val[0] = vmlaq_f32(cVal.val[0], aVal.val[1], bVal.val[1]); ++- cVal.val[0] = vmulq_f32(cVal.val[0], bAbsInv); ++- ++- cVal.val[1] = vmulq_f32( aVal.val[1], bVal.val[0]); ++- cVal.val[1] = vmlsq_f32(cVal.val[1], aVal.val[0], bVal.val[1]); ++- cVal.val[1] = vmulq_f32(cVal.val[1], bAbsInv); ++- ++- vst2q_f32((float*)(cPtr), cVal); ++- cPtr += 4; ++- } ++- ++- for(number = quarterPoints * 4; number < num_points; number++){ ++- *cPtr++ = (*aPtr++) / (*bPtr++); ++- } +++ lv_32fc_t* cPtr = cVector; +++ const lv_32fc_t* aPtr = aVector; +++ const lv_32fc_t* bPtr = bVector; +++ +++ float32x4x2_t aVal, bVal, cVal; +++ float32x4_t bAbs, bAbsInv; +++ +++ const unsigned int quarterPoints = num_points / 4; +++ unsigned int number = 0; +++ for (; number < quarterPoints; number++) { +++ aVal = vld2q_f32((const float*)(aPtr)); +++ bVal = vld2q_f32((const float*)(bPtr)); +++ aPtr += 4; +++ bPtr += 4; +++ __VOLK_PREFETCH(aPtr + 4); +++ __VOLK_PREFETCH(bPtr + 4); +++ +++ bAbs = vmulq_f32(bVal.val[0], bVal.val[0]); +++ bAbs = vmlaq_f32(bAbs, bVal.val[1], 
bVal.val[1]); +++ +++ bAbsInv = vrecpeq_f32(bAbs); +++ bAbsInv = vmulq_f32(bAbsInv, vrecpsq_f32(bAbsInv, bAbs)); +++ bAbsInv = vmulq_f32(bAbsInv, vrecpsq_f32(bAbsInv, bAbs)); +++ +++ cVal.val[0] = vmulq_f32(aVal.val[0], bVal.val[0]); +++ cVal.val[0] = vmlaq_f32(cVal.val[0], aVal.val[1], bVal.val[1]); +++ cVal.val[0] = vmulq_f32(cVal.val[0], bAbsInv); +++ +++ cVal.val[1] = vmulq_f32(aVal.val[1], bVal.val[0]); +++ cVal.val[1] = vmlsq_f32(cVal.val[1], aVal.val[0], bVal.val[1]); +++ cVal.val[1] = vmulq_f32(cVal.val[1], bAbsInv); +++ +++ vst2q_f32((float*)(cPtr), cVal); +++ cPtr += 4; +++ } +++ +++ for (number = quarterPoints * 4; number < num_points; number++) { +++ *cPtr++ = (*aPtr++) / (*bPtr++); +++ } ++ } ++ #endif /* LV_HAVE_NEON */ ++ ++ ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void ++-volk_32fc_x2_divide_32fc_a_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, ++- const lv_32fc_t* bVector, unsigned int num_points) +++static inline void volk_32fc_x2_divide_32fc_a_generic(lv_32fc_t* cVector, +++ const lv_32fc_t* aVector, +++ const lv_32fc_t* bVector, +++ unsigned int num_points) ++ { ++- lv_32fc_t* cPtr = cVector; ++- const lv_32fc_t* aPtr = aVector; ++- const lv_32fc_t* bPtr= bVector; ++- unsigned int number = 0; ++- ++- for(number = 0; number < num_points; number++){ ++- *cPtr++ = (*aPtr++) / (*bPtr++); ++- } +++ lv_32fc_t* cPtr = cVector; +++ const lv_32fc_t* aPtr = aVector; +++ const lv_32fc_t* bPtr = bVector; +++ unsigned int number = 0; +++ +++ for (number = 0; number < num_points; number++) { +++ *cPtr++ = (*aPtr++) / (*bPtr++); +++ } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++diff --git a/kernels/volk/volk_32fc_x2_dot_prod_32fc.h b/kernels/volk/volk_32fc_x2_dot_prod_32fc.h ++index f4a4469..b0b7fee 100644 ++--- a/kernels/volk/volk_32fc_x2_dot_prod_32fc.h +++++ b/kernels/volk/volk_32fc_x2_dot_prod_32fc.h ++@@ -33,8 +33,8 @@ ++ * ++ * Dispatcher Prototype ++ * \code ++- * void volk_32fc_x2_dot_prod_32fc(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) ++- * \endcode +++ * void volk_32fc_x2_dot_prod_32fc(lv_32fc_t* result, const lv_32fc_t* input, const +++ * lv_32fc_t* taps, unsigned int num_points) \endcode ++ * ++ * \b Inputs ++ * \li input: vector of complex floats. 
++@@ -58,236 +58,246 @@ ++ #ifndef INCLUDED_volk_32fc_x2_dot_prod_32fc_u_H ++ #define INCLUDED_volk_32fc_x2_dot_prod_32fc_u_H ++ ++-#include ++-#include ++ #include ++ #include +++#include +++#include ++ ++ ++ #ifdef LV_HAVE_GENERIC ++ ++ ++-static inline void volk_32fc_x2_dot_prod_32fc_generic(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) { +++static inline void volk_32fc_x2_dot_prod_32fc_generic(lv_32fc_t* result, +++ const lv_32fc_t* input, +++ const lv_32fc_t* taps, +++ unsigned int num_points) +++{ ++ ++- float * res = (float*) result; ++- float * in = (float*) input; ++- float * tp = (float*) taps; ++- unsigned int n_2_ccomplex_blocks = num_points/2; +++ float* res = (float*)result; +++ float* in = (float*)input; +++ float* tp = (float*)taps; +++ unsigned int n_2_ccomplex_blocks = num_points / 2; ++ ++- float sum0[2] = {0,0}; ++- float sum1[2] = {0,0}; ++- unsigned int i = 0; +++ float sum0[2] = { 0, 0 }; +++ float sum1[2] = { 0, 0 }; +++ unsigned int i = 0; ++ ++- for(i = 0; i < n_2_ccomplex_blocks; ++i) { ++- sum0[0] += in[0] * tp[0] - in[1] * tp[1]; ++- sum0[1] += in[0] * tp[1] + in[1] * tp[0]; ++- sum1[0] += in[2] * tp[2] - in[3] * tp[3]; ++- sum1[1] += in[2] * tp[3] + in[3] * tp[2]; +++ for (i = 0; i < n_2_ccomplex_blocks; ++i) { +++ sum0[0] += in[0] * tp[0] - in[1] * tp[1]; +++ sum0[1] += in[0] * tp[1] + in[1] * tp[0]; +++ sum1[0] += in[2] * tp[2] - in[3] * tp[3]; +++ sum1[1] += in[2] * tp[3] + in[3] * tp[2]; ++ ++- in += 4; ++- tp += 4; ++- } +++ in += 4; +++ tp += 4; +++ } ++ ++- res[0] = sum0[0] + sum1[0]; ++- res[1] = sum0[1] + sum1[1]; +++ res[0] = sum0[0] + sum1[0]; +++ res[1] = sum0[1] + sum1[1]; ++ ++- // Cleanup if we had an odd number of points ++- if (num_points & 1) { ++- *result += input[num_points - 1] * taps[num_points - 1]; ++- } +++ // Cleanup if we had an odd number of points +++ if (num_points & 1) { +++ *result += input[num_points - 1] * taps[num_points - 1]; +++ } ++ } ++ ++ #endif /*LV_HAVE_GENERIC*/ ++ ++ ++- ++ #if LV_HAVE_SSE && LV_HAVE_64 ++ ++-static inline void volk_32fc_x2_dot_prod_32fc_u_sse_64(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) { ++- ++- const unsigned int num_bytes = num_points*8; ++- unsigned int isodd = num_points & 1; ++- ++- __VOLK_ASM ++- ( ++- "# ccomplex_dotprod_generic (float* result, const float *input,\n\t" ++- "# const float *taps, unsigned num_bytes)\n\t" ++- "# float sum0 = 0;\n\t" ++- "# float sum1 = 0;\n\t" ++- "# float sum2 = 0;\n\t" ++- "# float sum3 = 0;\n\t" ++- "# do {\n\t" ++- "# sum0 += input[0] * taps[0] - input[1] * taps[1];\n\t" ++- "# sum1 += input[0] * taps[1] + input[1] * taps[0];\n\t" ++- "# sum2 += input[2] * taps[2] - input[3] * taps[3];\n\t" ++- "# sum3 += input[2] * taps[3] + input[3] * taps[2];\n\t" ++- "# input += 4;\n\t" ++- "# taps += 4; \n\t" ++- "# } while (--n_2_ccomplex_blocks != 0);\n\t" ++- "# result[0] = sum0 + sum2;\n\t" ++- "# result[1] = sum1 + sum3;\n\t" ++- "# TODO: prefetch and better scheduling\n\t" ++- " xor %%r9, %%r9\n\t" ++- " xor %%r10, %%r10\n\t" ++- " movq %%rcx, %%rax\n\t" ++- " movq %%rcx, %%r8\n\t" ++- " movq %[rsi], %%r9\n\t" ++- " movq %[rdx], %%r10\n\t" ++- " xorps %%xmm6, %%xmm6 # zero accumulators\n\t" ++- " movups 0(%%r9), %%xmm0\n\t" ++- " xorps %%xmm7, %%xmm7 # zero accumulators\n\t" ++- " movups 0(%%r10), %%xmm2\n\t" ++- " shr $5, %%rax # rax = n_2_ccomplex_blocks / 2\n\t" ++- " shr $4, %%r8\n\t" ++- " jmp .%=L1_test\n\t" ++- " # 4 taps / loop\n\t" ++- " # something like ?? 
cycles / loop\n\t" ++- ".%=Loop1: \n\t" ++- "# complex prod: C += A * B, w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t" ++- "# movups (%%r9), %%xmmA\n\t" ++- "# movups (%%r10), %%xmmB\n\t" ++- "# movups %%xmmA, %%xmmZ\n\t" ++- "# shufps $0xb1, %%xmmZ, %%xmmZ # swap internals\n\t" ++- "# mulps %%xmmB, %%xmmA\n\t" ++- "# mulps %%xmmZ, %%xmmB\n\t" ++- "# # SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t" ++- "# xorps %%xmmPN, %%xmmA\n\t" ++- "# movups %%xmmA, %%xmmZ\n\t" ++- "# unpcklps %%xmmB, %%xmmA\n\t" ++- "# unpckhps %%xmmB, %%xmmZ\n\t" ++- "# movups %%xmmZ, %%xmmY\n\t" ++- "# shufps $0x44, %%xmmA, %%xmmZ # b01000100\n\t" ++- "# shufps $0xee, %%xmmY, %%xmmA # b11101110\n\t" ++- "# addps %%xmmZ, %%xmmA\n\t" ++- "# addps %%xmmA, %%xmmC\n\t" ++- "# A=xmm0, B=xmm2, Z=xmm4\n\t" ++- "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t" ++- " movups 16(%%r9), %%xmm1\n\t" ++- " movups %%xmm0, %%xmm4\n\t" ++- " mulps %%xmm2, %%xmm0\n\t" ++- " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" ++- " movups 16(%%r10), %%xmm3\n\t" ++- " movups %%xmm1, %%xmm5\n\t" ++- " addps %%xmm0, %%xmm6\n\t" ++- " mulps %%xmm3, %%xmm1\n\t" ++- " shufps $0xb1, %%xmm5, %%xmm5 # swap internals\n\t" ++- " addps %%xmm1, %%xmm6\n\t" ++- " mulps %%xmm4, %%xmm2\n\t" ++- " movups 32(%%r9), %%xmm0\n\t" ++- " addps %%xmm2, %%xmm7\n\t" ++- " mulps %%xmm5, %%xmm3\n\t" ++- " add $32, %%r9\n\t" ++- " movups 32(%%r10), %%xmm2\n\t" ++- " addps %%xmm3, %%xmm7\n\t" ++- " add $32, %%r10\n\t" ++- ".%=L1_test:\n\t" ++- " dec %%rax\n\t" ++- " jge .%=Loop1\n\t" ++- " # We've handled the bulk of multiplies up to here.\n\t" ++- " # Let's sse if original n_2_ccomplex_blocks was odd.\n\t" ++- " # If so, we've got 2 more taps to do.\n\t" ++- " and $1, %%r8\n\t" ++- " je .%=Leven\n\t" ++- " # The count was odd, do 2 more taps.\n\t" ++- " # Note that we've already got mm0/mm2 preloaded\n\t" ++- " # from the main loop.\n\t" ++- " movups %%xmm0, %%xmm4\n\t" ++- " mulps %%xmm2, %%xmm0\n\t" ++- " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" ++- " addps %%xmm0, %%xmm6\n\t" ++- " mulps %%xmm4, %%xmm2\n\t" ++- " addps %%xmm2, %%xmm7\n\t" ++- ".%=Leven:\n\t" ++- " # neg inversor\n\t" ++- " xorps %%xmm1, %%xmm1\n\t" ++- " mov $0x80000000, %%r9\n\t" ++- " movd %%r9, %%xmm1\n\t" ++- " shufps $0x11, %%xmm1, %%xmm1 # b00010001 # 0 -0 0 -0\n\t" ++- " # pfpnacc\n\t" ++- " xorps %%xmm1, %%xmm6\n\t" ++- " movups %%xmm6, %%xmm2\n\t" ++- " unpcklps %%xmm7, %%xmm6\n\t" ++- " unpckhps %%xmm7, %%xmm2\n\t" ++- " movups %%xmm2, %%xmm3\n\t" ++- " shufps $0x44, %%xmm6, %%xmm2 # b01000100\n\t" ++- " shufps $0xee, %%xmm3, %%xmm6 # b11101110\n\t" ++- " addps %%xmm2, %%xmm6\n\t" ++- " # xmm6 = r1 i2 r3 i4\n\t" ++- " movhlps %%xmm6, %%xmm4 # xmm4 = r3 i4 ?? ??\n\t" ++- " addps %%xmm4, %%xmm6 # xmm6 = r1+r3 i2+i4 ?? 
??\n\t" ++- " movlps %%xmm6, (%[rdi]) # store low 2x32 bits (complex) to memory\n\t" ++- : ++- :[rsi] "r" (input), [rdx] "r" (taps), "c" (num_bytes), [rdi] "r" (result) ++- :"rax", "r8", "r9", "r10" ++- ); ++- ++- ++- if(isodd) { ++- *result += input[num_points - 1] * taps[num_points - 1]; ++- } ++- ++- return; +++static inline void volk_32fc_x2_dot_prod_32fc_u_sse_64(lv_32fc_t* result, +++ const lv_32fc_t* input, +++ const lv_32fc_t* taps, +++ unsigned int num_points) +++{ +++ +++ const unsigned int num_bytes = num_points * 8; +++ unsigned int isodd = num_points & 1; +++ +++ __VOLK_ASM( +++ "# ccomplex_dotprod_generic (float* result, const float *input,\n\t" +++ "# const float *taps, unsigned num_bytes)\n\t" +++ "# float sum0 = 0;\n\t" +++ "# float sum1 = 0;\n\t" +++ "# float sum2 = 0;\n\t" +++ "# float sum3 = 0;\n\t" +++ "# do {\n\t" +++ "# sum0 += input[0] * taps[0] - input[1] * taps[1];\n\t" +++ "# sum1 += input[0] * taps[1] + input[1] * taps[0];\n\t" +++ "# sum2 += input[2] * taps[2] - input[3] * taps[3];\n\t" +++ "# sum3 += input[2] * taps[3] + input[3] * taps[2];\n\t" +++ "# input += 4;\n\t" +++ "# taps += 4; \n\t" +++ "# } while (--n_2_ccomplex_blocks != 0);\n\t" +++ "# result[0] = sum0 + sum2;\n\t" +++ "# result[1] = sum1 + sum3;\n\t" +++ "# TODO: prefetch and better scheduling\n\t" +++ " xor %%r9, %%r9\n\t" +++ " xor %%r10, %%r10\n\t" +++ " movq %%rcx, %%rax\n\t" +++ " movq %%rcx, %%r8\n\t" +++ " movq %[rsi], %%r9\n\t" +++ " movq %[rdx], %%r10\n\t" +++ " xorps %%xmm6, %%xmm6 # zero accumulators\n\t" +++ " movups 0(%%r9), %%xmm0\n\t" +++ " xorps %%xmm7, %%xmm7 # zero accumulators\n\t" +++ " movups 0(%%r10), %%xmm2\n\t" +++ " shr $5, %%rax # rax = n_2_ccomplex_blocks / 2\n\t" +++ " shr $4, %%r8\n\t" +++ " jmp .%=L1_test\n\t" +++ " # 4 taps / loop\n\t" +++ " # something like ?? 
cycles / loop\n\t" +++ ".%=Loop1: \n\t" +++ "# complex prod: C += A * B, w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t" +++ "# movups (%%r9), %%xmmA\n\t" +++ "# movups (%%r10), %%xmmB\n\t" +++ "# movups %%xmmA, %%xmmZ\n\t" +++ "# shufps $0xb1, %%xmmZ, %%xmmZ # swap internals\n\t" +++ "# mulps %%xmmB, %%xmmA\n\t" +++ "# mulps %%xmmZ, %%xmmB\n\t" +++ "# # SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t" +++ "# xorps %%xmmPN, %%xmmA\n\t" +++ "# movups %%xmmA, %%xmmZ\n\t" +++ "# unpcklps %%xmmB, %%xmmA\n\t" +++ "# unpckhps %%xmmB, %%xmmZ\n\t" +++ "# movups %%xmmZ, %%xmmY\n\t" +++ "# shufps $0x44, %%xmmA, %%xmmZ # b01000100\n\t" +++ "# shufps $0xee, %%xmmY, %%xmmA # b11101110\n\t" +++ "# addps %%xmmZ, %%xmmA\n\t" +++ "# addps %%xmmA, %%xmmC\n\t" +++ "# A=xmm0, B=xmm2, Z=xmm4\n\t" +++ "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t" +++ " movups 16(%%r9), %%xmm1\n\t" +++ " movups %%xmm0, %%xmm4\n\t" +++ " mulps %%xmm2, %%xmm0\n\t" +++ " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" +++ " movups 16(%%r10), %%xmm3\n\t" +++ " movups %%xmm1, %%xmm5\n\t" +++ " addps %%xmm0, %%xmm6\n\t" +++ " mulps %%xmm3, %%xmm1\n\t" +++ " shufps $0xb1, %%xmm5, %%xmm5 # swap internals\n\t" +++ " addps %%xmm1, %%xmm6\n\t" +++ " mulps %%xmm4, %%xmm2\n\t" +++ " movups 32(%%r9), %%xmm0\n\t" +++ " addps %%xmm2, %%xmm7\n\t" +++ " mulps %%xmm5, %%xmm3\n\t" +++ " add $32, %%r9\n\t" +++ " movups 32(%%r10), %%xmm2\n\t" +++ " addps %%xmm3, %%xmm7\n\t" +++ " add $32, %%r10\n\t" +++ ".%=L1_test:\n\t" +++ " dec %%rax\n\t" +++ " jge .%=Loop1\n\t" +++ " # We've handled the bulk of multiplies up to here.\n\t" +++ " # Let's sse if original n_2_ccomplex_blocks was odd.\n\t" +++ " # If so, we've got 2 more taps to do.\n\t" +++ " and $1, %%r8\n\t" +++ " je .%=Leven\n\t" +++ " # The count was odd, do 2 more taps.\n\t" +++ " # Note that we've already got mm0/mm2 preloaded\n\t" +++ " # from the main loop.\n\t" +++ " movups %%xmm0, %%xmm4\n\t" +++ " mulps %%xmm2, %%xmm0\n\t" +++ " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" +++ " addps %%xmm0, %%xmm6\n\t" +++ " mulps %%xmm4, %%xmm2\n\t" +++ " addps %%xmm2, %%xmm7\n\t" +++ ".%=Leven:\n\t" +++ " # neg inversor\n\t" +++ " xorps %%xmm1, %%xmm1\n\t" +++ " mov $0x80000000, %%r9\n\t" +++ " movd %%r9, %%xmm1\n\t" +++ " shufps $0x11, %%xmm1, %%xmm1 # b00010001 # 0 -0 0 -0\n\t" +++ " # pfpnacc\n\t" +++ " xorps %%xmm1, %%xmm6\n\t" +++ " movups %%xmm6, %%xmm2\n\t" +++ " unpcklps %%xmm7, %%xmm6\n\t" +++ " unpckhps %%xmm7, %%xmm2\n\t" +++ " movups %%xmm2, %%xmm3\n\t" +++ " shufps $0x44, %%xmm6, %%xmm2 # b01000100\n\t" +++ " shufps $0xee, %%xmm3, %%xmm6 # b11101110\n\t" +++ " addps %%xmm2, %%xmm6\n\t" +++ " # xmm6 = r1 i2 r3 i4\n\t" +++ " movhlps %%xmm6, %%xmm4 # xmm4 = r3 i4 ?? ??\n\t" +++ " addps %%xmm4, %%xmm6 # xmm6 = r1+r3 i2+i4 ?? 
??\n\t" +++ " movlps %%xmm6, (%[rdi]) # store low 2x32 bits (complex) " +++ "to memory\n\t" +++ : +++ : [rsi] "r"(input), [rdx] "r"(taps), "c"(num_bytes), [rdi] "r"(result) +++ : "rax", "r8", "r9", "r10"); +++ +++ +++ if (isodd) { +++ *result += input[num_points - 1] * taps[num_points - 1]; +++ } ++ +++ return; ++ } ++ ++ #endif /* LV_HAVE_SSE && LV_HAVE_64 */ ++ ++ ++- ++- ++ #ifdef LV_HAVE_SSE3 ++ ++ #include ++ ++-static inline void volk_32fc_x2_dot_prod_32fc_u_sse3(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) { +++static inline void volk_32fc_x2_dot_prod_32fc_u_sse3(lv_32fc_t* result, +++ const lv_32fc_t* input, +++ const lv_32fc_t* taps, +++ unsigned int num_points) +++{ ++ ++- lv_32fc_t dotProduct; ++- memset(&dotProduct, 0x0, 2*sizeof(float)); +++ lv_32fc_t dotProduct; +++ memset(&dotProduct, 0x0, 2 * sizeof(float)); ++ ++- unsigned int number = 0; ++- const unsigned int halfPoints = num_points/2; ++- unsigned int isodd = num_points & 1; +++ unsigned int number = 0; +++ const unsigned int halfPoints = num_points / 2; +++ unsigned int isodd = num_points & 1; ++ ++- __m128 x, y, yl, yh, z, tmp1, tmp2, dotProdVal; +++ __m128 x, y, yl, yh, z, tmp1, tmp2, dotProdVal; ++ ++- const lv_32fc_t* a = input; ++- const lv_32fc_t* b = taps; +++ const lv_32fc_t* a = input; +++ const lv_32fc_t* b = taps; ++ ++- dotProdVal = _mm_setzero_ps(); +++ dotProdVal = _mm_setzero_ps(); ++ ++- for(;number < halfPoints; number++){ +++ for (; number < halfPoints; number++) { ++ ++- x = _mm_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi ++- y = _mm_loadu_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di +++ x = _mm_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi +++ y = _mm_loadu_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di ++ ++- yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr ++- yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di +++ yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr +++ yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di ++ ++- tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr +++ tmp1 = _mm_mul_ps(x, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr ++ ++- x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br +++ x = _mm_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br ++ ++- tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di +++ tmp2 = _mm_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di ++ ++- z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di +++ z = _mm_addsub_ps(tmp1, +++ tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di ++ ++- dotProdVal = _mm_add_ps(dotProdVal, z); // Add the complex multiplication results together +++ dotProdVal = +++ _mm_add_ps(dotProdVal, z); // Add the complex multiplication results together ++ ++- a += 2; ++- b += 2; ++- } +++ a += 2; +++ b += 2; +++ } ++ ++- __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector[2]; +++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector[2]; ++ ++- _mm_storeu_ps((float*)dotProductVector,dotProdVal); // Store the results back into the dot product vector +++ _mm_storeu_ps((float*)dotProductVector, +++ dotProdVal); // Store the results back into the dot product vector ++ ++- dotProduct += ( dotProductVector[0] + dotProductVector[1] ); +++ dotProduct += (dotProductVector[0] + dotProductVector[1]); ++ ++- if(isodd) { ++- dotProduct += input[num_points - 1] * taps[num_points - 1]; ++- } +++ if (isodd) { +++ dotProduct += input[num_points - 1] * taps[num_points 
- 1]; +++ } ++ ++- *result = dotProduct; +++ *result = dotProduct; ++ } ++ ++ #endif /*LV_HAVE_SSE3*/ ++@@ -296,78 +306,82 @@ static inline void volk_32fc_x2_dot_prod_32fc_u_sse3(lv_32fc_t* result, const lv ++ ++ #include ++ ++-static inline void volk_32fc_x2_dot_prod_32fc_u_sse4_1(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) { +++static inline void volk_32fc_x2_dot_prod_32fc_u_sse4_1(lv_32fc_t* result, +++ const lv_32fc_t* input, +++ const lv_32fc_t* taps, +++ unsigned int num_points) +++{ ++ ++- unsigned int i = 0; ++- const unsigned int qtr_points = num_points/4; ++- const unsigned int isodd = num_points & 3; +++ unsigned int i = 0; +++ const unsigned int qtr_points = num_points / 4; +++ const unsigned int isodd = num_points & 3; ++ ++- __m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, real0, real1, im0, im1; ++- float *p_input, *p_taps; ++- __m64 *p_result; +++ __m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, real0, real1, im0, im1; +++ float *p_input, *p_taps; +++ __m64* p_result; ++ ++- p_result = (__m64*)result; ++- p_input = (float*)input; ++- p_taps = (float*)taps; +++ p_result = (__m64*)result; +++ p_input = (float*)input; +++ p_taps = (float*)taps; ++ ++- static const __m128i neg = {0x000000000000000080000000}; +++ static const __m128i neg = { 0x000000000000000080000000 }; ++ ++- real0 = _mm_setzero_ps(); ++- real1 = _mm_setzero_ps(); ++- im0 = _mm_setzero_ps(); ++- im1 = _mm_setzero_ps(); +++ real0 = _mm_setzero_ps(); +++ real1 = _mm_setzero_ps(); +++ im0 = _mm_setzero_ps(); +++ im1 = _mm_setzero_ps(); ++ ++- for(; i < qtr_points; ++i) { ++- xmm0 = _mm_loadu_ps(p_input); ++- xmm1 = _mm_loadu_ps(p_taps); +++ for (; i < qtr_points; ++i) { +++ xmm0 = _mm_loadu_ps(p_input); +++ xmm1 = _mm_loadu_ps(p_taps); ++ ++- p_input += 4; ++- p_taps += 4; +++ p_input += 4; +++ p_taps += 4; ++ ++- xmm2 = _mm_loadu_ps(p_input); ++- xmm3 = _mm_loadu_ps(p_taps); +++ xmm2 = _mm_loadu_ps(p_input); +++ xmm3 = _mm_loadu_ps(p_taps); ++ ++- p_input += 4; ++- p_taps += 4; +++ p_input += 4; +++ p_taps += 4; ++ ++- xmm4 = _mm_unpackhi_ps(xmm0, xmm2); ++- xmm5 = _mm_unpackhi_ps(xmm1, xmm3); ++- xmm0 = _mm_unpacklo_ps(xmm0, xmm2); ++- xmm2 = _mm_unpacklo_ps(xmm1, xmm3); +++ xmm4 = _mm_unpackhi_ps(xmm0, xmm2); +++ xmm5 = _mm_unpackhi_ps(xmm1, xmm3); +++ xmm0 = _mm_unpacklo_ps(xmm0, xmm2); +++ xmm2 = _mm_unpacklo_ps(xmm1, xmm3); ++ ++- //imaginary vector from input ++- xmm1 = _mm_unpackhi_ps(xmm0, xmm4); ++- //real vector from input ++- xmm3 = _mm_unpacklo_ps(xmm0, xmm4); ++- //imaginary vector from taps ++- xmm0 = _mm_unpackhi_ps(xmm2, xmm5); ++- //real vector from taps ++- xmm2 = _mm_unpacklo_ps(xmm2, xmm5); +++ // imaginary vector from input +++ xmm1 = _mm_unpackhi_ps(xmm0, xmm4); +++ // real vector from input +++ xmm3 = _mm_unpacklo_ps(xmm0, xmm4); +++ // imaginary vector from taps +++ xmm0 = _mm_unpackhi_ps(xmm2, xmm5); +++ // real vector from taps +++ xmm2 = _mm_unpacklo_ps(xmm2, xmm5); ++ ++- xmm4 = _mm_dp_ps(xmm3, xmm2, 0xf1); ++- xmm5 = _mm_dp_ps(xmm1, xmm0, 0xf1); +++ xmm4 = _mm_dp_ps(xmm3, xmm2, 0xf1); +++ xmm5 = _mm_dp_ps(xmm1, xmm0, 0xf1); ++ ++- xmm6 = _mm_dp_ps(xmm3, xmm0, 0xf2); ++- xmm7 = _mm_dp_ps(xmm1, xmm2, 0xf2); +++ xmm6 = _mm_dp_ps(xmm3, xmm0, 0xf2); +++ xmm7 = _mm_dp_ps(xmm1, xmm2, 0xf2); ++ ++- real0 = _mm_add_ps(xmm4, real0); ++- real1 = _mm_add_ps(xmm5, real1); ++- im0 = _mm_add_ps(xmm6, im0); ++- im1 = _mm_add_ps(xmm7, im1); ++- } +++ real0 = _mm_add_ps(xmm4, real0); +++ real1 = _mm_add_ps(xmm5, real1); +++ im0 = 
_mm_add_ps(xmm6, im0); +++ im1 = _mm_add_ps(xmm7, im1); +++ } ++ ++- real1 = _mm_xor_ps(real1, bit128_p(&neg)->float_vec); +++ real1 = _mm_xor_ps(real1, bit128_p(&neg)->float_vec); ++ ++- im0 = _mm_add_ps(im0, im1); ++- real0 = _mm_add_ps(real0, real1); +++ im0 = _mm_add_ps(im0, im1); +++ real0 = _mm_add_ps(real0, real1); ++ ++- im0 = _mm_add_ps(im0, real0); +++ im0 = _mm_add_ps(im0, real0); ++ ++- _mm_storel_pi(p_result, im0); +++ _mm_storel_pi(p_result, im0); ++ ++- for(i = num_points-isodd; i < num_points; i++) { ++- *result += input[i] * taps[i]; ++- } +++ for (i = num_points - isodd; i < num_points; i++) { +++ *result += input[i] * taps[i]; +++ } ++ } ++ ++ #endif /*LV_HAVE_SSE4_1*/ ++@@ -376,55 +390,63 @@ static inline void volk_32fc_x2_dot_prod_32fc_u_sse4_1(lv_32fc_t* result, const ++ ++ #include ++ ++-static inline void volk_32fc_x2_dot_prod_32fc_u_avx(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) { +++static inline void volk_32fc_x2_dot_prod_32fc_u_avx(lv_32fc_t* result, +++ const lv_32fc_t* input, +++ const lv_32fc_t* taps, +++ unsigned int num_points) +++{ ++ ++- unsigned int isodd = num_points & 3; ++- unsigned int i = 0; ++- lv_32fc_t dotProduct; ++- memset(&dotProduct, 0x0, 2*sizeof(float)); +++ unsigned int isodd = num_points & 3; +++ unsigned int i = 0; +++ lv_32fc_t dotProduct; +++ memset(&dotProduct, 0x0, 2 * sizeof(float)); ++ ++- unsigned int number = 0; ++- const unsigned int quarterPoints = num_points / 4; +++ unsigned int number = 0; +++ const unsigned int quarterPoints = num_points / 4; ++ ++- __m256 x, y, yl, yh, z, tmp1, tmp2, dotProdVal; +++ __m256 x, y, yl, yh, z, tmp1, tmp2, dotProdVal; ++ ++- const lv_32fc_t* a = input; ++- const lv_32fc_t* b = taps; +++ const lv_32fc_t* a = input; +++ const lv_32fc_t* b = taps; ++ ++- dotProdVal = _mm256_setzero_ps(); +++ dotProdVal = _mm256_setzero_ps(); ++ ++- for(;number < quarterPoints; number++){ ++- x = _mm256_loadu_ps((float*)a); // Load a,b,e,f as ar,ai,br,bi,er,ei,fr,fi ++- y = _mm256_loadu_ps((float*)b); // Load c,d,g,h as cr,ci,dr,di,gr,gi,hr,hi +++ for (; number < quarterPoints; number++) { +++ x = _mm256_loadu_ps((float*)a); // Load a,b,e,f as ar,ai,br,bi,er,ei,fr,fi +++ y = _mm256_loadu_ps((float*)b); // Load c,d,g,h as cr,ci,dr,di,gr,gi,hr,hi ++ ++- yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr,gr,gr,hr,hr ++- yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di,gi,gi,hi,hi +++ yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr,gr,gr,hr,hr +++ yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di,gi,gi,hi,hi ++ ++- tmp1 = _mm256_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr ... +++ tmp1 = _mm256_mul_ps(x, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr ... ++ ++- x = _mm256_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br,ei,er,fi,fr +++ x = _mm256_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br,ei,er,fi,fr ++ ++- tmp2 = _mm256_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di ... +++ tmp2 = _mm256_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di ... 
++ ++- z = _mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di +++ z = _mm256_addsub_ps(tmp1, +++ tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di ++ ++- dotProdVal = _mm256_add_ps(dotProdVal, z); // Add the complex multiplication results together +++ dotProdVal = _mm256_add_ps(dotProdVal, +++ z); // Add the complex multiplication results together ++ ++- a += 4; ++- b += 4; ++- } +++ a += 4; +++ b += 4; +++ } ++ ++- __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector[4]; +++ __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector[4]; ++ ++- _mm256_storeu_ps((float*)dotProductVector,dotProdVal); // Store the results back into the dot product vector +++ _mm256_storeu_ps((float*)dotProductVector, +++ dotProdVal); // Store the results back into the dot product vector ++ ++- dotProduct += ( dotProductVector[0] + dotProductVector[1] + dotProductVector[2] + dotProductVector[3]); +++ dotProduct += (dotProductVector[0] + dotProductVector[1] + dotProductVector[2] + +++ dotProductVector[3]); ++ ++- for(i = num_points-isodd; i < num_points; i++) { ++- dotProduct += input[i] * taps[i]; ++- } +++ for (i = num_points - isodd; i < num_points; i++) { +++ dotProduct += input[i] * taps[i]; +++ } ++ ++- *result = dotProduct; +++ *result = dotProduct; ++ } ++ ++ #endif /*LV_HAVE_AVX*/ ++@@ -432,56 +454,64 @@ static inline void volk_32fc_x2_dot_prod_32fc_u_avx(lv_32fc_t* result, const lv_ ++ #if LV_HAVE_AVX && LV_HAVE_FMA ++ #include ++ ++-static inline void volk_32fc_x2_dot_prod_32fc_u_avx_fma(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) { +++static inline void volk_32fc_x2_dot_prod_32fc_u_avx_fma(lv_32fc_t* result, +++ const lv_32fc_t* input, +++ const lv_32fc_t* taps, +++ unsigned int num_points) +++{ ++ ++- unsigned int isodd = num_points & 3; ++- unsigned int i = 0; ++- lv_32fc_t dotProduct; ++- memset(&dotProduct, 0x0, 2*sizeof(float)); +++ unsigned int isodd = num_points & 3; +++ unsigned int i = 0; +++ lv_32fc_t dotProduct; +++ memset(&dotProduct, 0x0, 2 * sizeof(float)); ++ ++- unsigned int number = 0; ++- const unsigned int quarterPoints = num_points / 4; +++ unsigned int number = 0; +++ const unsigned int quarterPoints = num_points / 4; ++ ++- __m256 x, y, yl, yh, z, tmp1, tmp2, dotProdVal; +++ __m256 x, y, yl, yh, z, tmp1, tmp2, dotProdVal; ++ ++- const lv_32fc_t* a = input; ++- const lv_32fc_t* b = taps; +++ const lv_32fc_t* a = input; +++ const lv_32fc_t* b = taps; ++ ++- dotProdVal = _mm256_setzero_ps(); +++ dotProdVal = _mm256_setzero_ps(); ++ ++- for(;number < quarterPoints; number++){ +++ for (; number < quarterPoints; number++) { ++ ++- x = _mm256_loadu_ps((float*)a); // Load a,b,e,f as ar,ai,br,bi,er,ei,fr,fi ++- y = _mm256_loadu_ps((float*)b); // Load c,d,g,h as cr,ci,dr,di,gr,gi,hr,hi +++ x = _mm256_loadu_ps((float*)a); // Load a,b,e,f as ar,ai,br,bi,er,ei,fr,fi +++ y = _mm256_loadu_ps((float*)b); // Load c,d,g,h as cr,ci,dr,di,gr,gi,hr,hi ++ ++- yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr,gr,gr,hr,hr ++- yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di,gi,gi,hi,hi +++ yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr,gr,gr,hr,hr +++ yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di,gi,gi,hi,hi ++ ++- tmp1 = x; +++ tmp1 = x; ++ ++- x = _mm256_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br,ei,er,fi,fr +++ x = _mm256_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br,ei,er,fi,fr ++ ++- tmp2 = _mm256_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di ... 
+++ tmp2 = _mm256_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di ... ++ ++- z = _mm256_fmaddsub_ps(tmp1, yl,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di +++ z = _mm256_fmaddsub_ps( +++ tmp1, yl, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di ++ ++- dotProdVal = _mm256_add_ps(dotProdVal, z); // Add the complex multiplication results together +++ dotProdVal = _mm256_add_ps(dotProdVal, +++ z); // Add the complex multiplication results together ++ ++- a += 4; ++- b += 4; ++- } +++ a += 4; +++ b += 4; +++ } ++ ++- __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector[4]; +++ __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector[4]; ++ ++- _mm256_storeu_ps((float*)dotProductVector,dotProdVal); // Store the results back into the dot product vector +++ _mm256_storeu_ps((float*)dotProductVector, +++ dotProdVal); // Store the results back into the dot product vector ++ ++- dotProduct += ( dotProductVector[0] + dotProductVector[1] + dotProductVector[2] + dotProductVector[3]); +++ dotProduct += (dotProductVector[0] + dotProductVector[1] + dotProductVector[2] + +++ dotProductVector[3]); ++ ++- for(i = num_points-isodd; i < num_points; i++) { ++- dotProduct += input[i] * taps[i]; ++- } +++ for (i = num_points - isodd; i < num_points; i++) { +++ dotProduct += input[i] * taps[i]; +++ } ++ ++- *result = dotProduct; +++ *result = dotProduct; ++ } ++ ++ #endif /*LV_HAVE_AVX && LV_HAVE_FMA*/ ++@@ -491,44 +521,48 @@ static inline void volk_32fc_x2_dot_prod_32fc_u_avx_fma(lv_32fc_t* result, const ++ #ifndef INCLUDED_volk_32fc_x2_dot_prod_32fc_a_H ++ #define INCLUDED_volk_32fc_x2_dot_prod_32fc_a_H ++ ++-#include ++-#include ++ #include ++ #include +++#include +++#include ++ ++ ++ #ifdef LV_HAVE_GENERIC ++ ++ ++-static inline void volk_32fc_x2_dot_prod_32fc_a_generic(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) { +++static inline void volk_32fc_x2_dot_prod_32fc_a_generic(lv_32fc_t* result, +++ const lv_32fc_t* input, +++ const lv_32fc_t* taps, +++ unsigned int num_points) +++{ ++ ++- const unsigned int num_bytes = num_points*8; +++ const unsigned int num_bytes = num_points * 8; ++ ++- float * res = (float*) result; ++- float * in = (float*) input; ++- float * tp = (float*) taps; ++- unsigned int n_2_ccomplex_blocks = num_bytes >> 4; +++ float* res = (float*)result; +++ float* in = (float*)input; +++ float* tp = (float*)taps; +++ unsigned int n_2_ccomplex_blocks = num_bytes >> 4; ++ ++- float sum0[2] = {0,0}; ++- float sum1[2] = {0,0}; ++- unsigned int i = 0; +++ float sum0[2] = { 0, 0 }; +++ float sum1[2] = { 0, 0 }; +++ unsigned int i = 0; ++ ++- for(i = 0; i < n_2_ccomplex_blocks; ++i) { ++- sum0[0] += in[0] * tp[0] - in[1] * tp[1]; ++- sum0[1] += in[0] * tp[1] + in[1] * tp[0]; ++- sum1[0] += in[2] * tp[2] - in[3] * tp[3]; ++- sum1[1] += in[2] * tp[3] + in[3] * tp[2]; +++ for (i = 0; i < n_2_ccomplex_blocks; ++i) { +++ sum0[0] += in[0] * tp[0] - in[1] * tp[1]; +++ sum0[1] += in[0] * tp[1] + in[1] * tp[0]; +++ sum1[0] += in[2] * tp[2] - in[3] * tp[3]; +++ sum1[1] += in[2] * tp[3] + in[3] * tp[2]; ++ ++- in += 4; ++- tp += 4; ++- } +++ in += 4; +++ tp += 4; +++ } ++ ++- res[0] = sum0[0] + sum1[0]; ++- res[1] = sum0[1] + sum1[1]; +++ res[0] = sum0[0] + sum1[0]; +++ res[1] = sum0[1] + sum1[1]; ++ ++- if (num_points & 1) { ++- *result += input[num_points - 1] * taps[num_points - 1]; ++- } +++ if (num_points & 1) { +++ *result += input[num_points - 1] * taps[num_points - 1]; +++ } ++ } ++ ++ #endif /*LV_HAVE_GENERIC*/ ++@@ -537,140 +571,146 @@ 
static inline void volk_32fc_x2_dot_prod_32fc_a_generic(lv_32fc_t* result, const ++ #if LV_HAVE_SSE && LV_HAVE_64 ++ ++ ++-static inline void volk_32fc_x2_dot_prod_32fc_a_sse_64(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) { ++- ++- const unsigned int num_bytes = num_points*8; ++- unsigned int isodd = num_points & 1; ++- ++- __VOLK_ASM ++- ( ++- "# ccomplex_dotprod_generic (float* result, const float *input,\n\t" ++- "# const float *taps, unsigned num_bytes)\n\t" ++- "# float sum0 = 0;\n\t" ++- "# float sum1 = 0;\n\t" ++- "# float sum2 = 0;\n\t" ++- "# float sum3 = 0;\n\t" ++- "# do {\n\t" ++- "# sum0 += input[0] * taps[0] - input[1] * taps[1];\n\t" ++- "# sum1 += input[0] * taps[1] + input[1] * taps[0];\n\t" ++- "# sum2 += input[2] * taps[2] - input[3] * taps[3];\n\t" ++- "# sum3 += input[2] * taps[3] + input[3] * taps[2];\n\t" ++- "# input += 4;\n\t" ++- "# taps += 4; \n\t" ++- "# } while (--n_2_ccomplex_blocks != 0);\n\t" ++- "# result[0] = sum0 + sum2;\n\t" ++- "# result[1] = sum1 + sum3;\n\t" ++- "# TODO: prefetch and better scheduling\n\t" ++- " xor %%r9, %%r9\n\t" ++- " xor %%r10, %%r10\n\t" ++- " movq %%rcx, %%rax\n\t" ++- " movq %%rcx, %%r8\n\t" ++- " movq %[rsi], %%r9\n\t" ++- " movq %[rdx], %%r10\n\t" ++- " xorps %%xmm6, %%xmm6 # zero accumulators\n\t" ++- " movaps 0(%%r9), %%xmm0\n\t" ++- " xorps %%xmm7, %%xmm7 # zero accumulators\n\t" ++- " movaps 0(%%r10), %%xmm2\n\t" ++- " shr $5, %%rax # rax = n_2_ccomplex_blocks / 2\n\t" ++- " shr $4, %%r8\n\t" ++- " jmp .%=L1_test\n\t" ++- " # 4 taps / loop\n\t" ++- " # something like ?? cycles / loop\n\t" ++- ".%=Loop1: \n\t" ++- "# complex prod: C += A * B, w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t" ++- "# movaps (%%r9), %%xmmA\n\t" ++- "# movaps (%%r10), %%xmmB\n\t" ++- "# movaps %%xmmA, %%xmmZ\n\t" ++- "# shufps $0xb1, %%xmmZ, %%xmmZ # swap internals\n\t" ++- "# mulps %%xmmB, %%xmmA\n\t" ++- "# mulps %%xmmZ, %%xmmB\n\t" ++- "# # SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t" ++- "# xorps %%xmmPN, %%xmmA\n\t" ++- "# movaps %%xmmA, %%xmmZ\n\t" ++- "# unpcklps %%xmmB, %%xmmA\n\t" ++- "# unpckhps %%xmmB, %%xmmZ\n\t" ++- "# movaps %%xmmZ, %%xmmY\n\t" ++- "# shufps $0x44, %%xmmA, %%xmmZ # b01000100\n\t" ++- "# shufps $0xee, %%xmmY, %%xmmA # b11101110\n\t" ++- "# addps %%xmmZ, %%xmmA\n\t" ++- "# addps %%xmmA, %%xmmC\n\t" ++- "# A=xmm0, B=xmm2, Z=xmm4\n\t" ++- "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t" ++- " movaps 16(%%r9), %%xmm1\n\t" ++- " movaps %%xmm0, %%xmm4\n\t" ++- " mulps %%xmm2, %%xmm0\n\t" ++- " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" ++- " movaps 16(%%r10), %%xmm3\n\t" ++- " movaps %%xmm1, %%xmm5\n\t" ++- " addps %%xmm0, %%xmm6\n\t" ++- " mulps %%xmm3, %%xmm1\n\t" ++- " shufps $0xb1, %%xmm5, %%xmm5 # swap internals\n\t" ++- " addps %%xmm1, %%xmm6\n\t" ++- " mulps %%xmm4, %%xmm2\n\t" ++- " movaps 32(%%r9), %%xmm0\n\t" ++- " addps %%xmm2, %%xmm7\n\t" ++- " mulps %%xmm5, %%xmm3\n\t" ++- " add $32, %%r9\n\t" ++- " movaps 32(%%r10), %%xmm2\n\t" ++- " addps %%xmm3, %%xmm7\n\t" ++- " add $32, %%r10\n\t" ++- ".%=L1_test:\n\t" ++- " dec %%rax\n\t" ++- " jge .%=Loop1\n\t" ++- " # We've handled the bulk of multiplies up to here.\n\t" ++- " # Let's sse if original n_2_ccomplex_blocks was odd.\n\t" ++- " # If so, we've got 2 more taps to do.\n\t" ++- " and $1, %%r8\n\t" ++- " je .%=Leven\n\t" ++- " # The count was odd, do 2 more taps.\n\t" ++- " # Note that we've already got mm0/mm2 preloaded\n\t" ++- " # from the main loop.\n\t" ++- " movaps %%xmm0, %%xmm4\n\t" ++- " mulps %%xmm2, 
%%xmm0\n\t" ++- " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" ++- " addps %%xmm0, %%xmm6\n\t" ++- " mulps %%xmm4, %%xmm2\n\t" ++- " addps %%xmm2, %%xmm7\n\t" ++- ".%=Leven:\n\t" ++- " # neg inversor\n\t" ++- " xorps %%xmm1, %%xmm1\n\t" ++- " mov $0x80000000, %%r9\n\t" ++- " movd %%r9, %%xmm1\n\t" ++- " shufps $0x11, %%xmm1, %%xmm1 # b00010001 # 0 -0 0 -0\n\t" ++- " # pfpnacc\n\t" ++- " xorps %%xmm1, %%xmm6\n\t" ++- " movaps %%xmm6, %%xmm2\n\t" ++- " unpcklps %%xmm7, %%xmm6\n\t" ++- " unpckhps %%xmm7, %%xmm2\n\t" ++- " movaps %%xmm2, %%xmm3\n\t" ++- " shufps $0x44, %%xmm6, %%xmm2 # b01000100\n\t" ++- " shufps $0xee, %%xmm3, %%xmm6 # b11101110\n\t" ++- " addps %%xmm2, %%xmm6\n\t" ++- " # xmm6 = r1 i2 r3 i4\n\t" ++- " movhlps %%xmm6, %%xmm4 # xmm4 = r3 i4 ?? ??\n\t" ++- " addps %%xmm4, %%xmm6 # xmm6 = r1+r3 i2+i4 ?? ??\n\t" ++- " movlps %%xmm6, (%[rdi]) # store low 2x32 bits (complex) to memory\n\t" ++- : ++- :[rsi] "r" (input), [rdx] "r" (taps), "c" (num_bytes), [rdi] "r" (result) ++- :"rax", "r8", "r9", "r10" ++- ); ++- ++- ++- if(isodd) { ++- *result += input[num_points - 1] * taps[num_points - 1]; ++- } ++- ++- return; +++static inline void volk_32fc_x2_dot_prod_32fc_a_sse_64(lv_32fc_t* result, +++ const lv_32fc_t* input, +++ const lv_32fc_t* taps, +++ unsigned int num_points) +++{ +++ +++ const unsigned int num_bytes = num_points * 8; +++ unsigned int isodd = num_points & 1; +++ +++ __VOLK_ASM( +++ "# ccomplex_dotprod_generic (float* result, const float *input,\n\t" +++ "# const float *taps, unsigned num_bytes)\n\t" +++ "# float sum0 = 0;\n\t" +++ "# float sum1 = 0;\n\t" +++ "# float sum2 = 0;\n\t" +++ "# float sum3 = 0;\n\t" +++ "# do {\n\t" +++ "# sum0 += input[0] * taps[0] - input[1] * taps[1];\n\t" +++ "# sum1 += input[0] * taps[1] + input[1] * taps[0];\n\t" +++ "# sum2 += input[2] * taps[2] - input[3] * taps[3];\n\t" +++ "# sum3 += input[2] * taps[3] + input[3] * taps[2];\n\t" +++ "# input += 4;\n\t" +++ "# taps += 4; \n\t" +++ "# } while (--n_2_ccomplex_blocks != 0);\n\t" +++ "# result[0] = sum0 + sum2;\n\t" +++ "# result[1] = sum1 + sum3;\n\t" +++ "# TODO: prefetch and better scheduling\n\t" +++ " xor %%r9, %%r9\n\t" +++ " xor %%r10, %%r10\n\t" +++ " movq %%rcx, %%rax\n\t" +++ " movq %%rcx, %%r8\n\t" +++ " movq %[rsi], %%r9\n\t" +++ " movq %[rdx], %%r10\n\t" +++ " xorps %%xmm6, %%xmm6 # zero accumulators\n\t" +++ " movaps 0(%%r9), %%xmm0\n\t" +++ " xorps %%xmm7, %%xmm7 # zero accumulators\n\t" +++ " movaps 0(%%r10), %%xmm2\n\t" +++ " shr $5, %%rax # rax = n_2_ccomplex_blocks / 2\n\t" +++ " shr $4, %%r8\n\t" +++ " jmp .%=L1_test\n\t" +++ " # 4 taps / loop\n\t" +++ " # something like ?? 
cycles / loop\n\t" +++ ".%=Loop1: \n\t" +++ "# complex prod: C += A * B, w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t" +++ "# movaps (%%r9), %%xmmA\n\t" +++ "# movaps (%%r10), %%xmmB\n\t" +++ "# movaps %%xmmA, %%xmmZ\n\t" +++ "# shufps $0xb1, %%xmmZ, %%xmmZ # swap internals\n\t" +++ "# mulps %%xmmB, %%xmmA\n\t" +++ "# mulps %%xmmZ, %%xmmB\n\t" +++ "# # SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t" +++ "# xorps %%xmmPN, %%xmmA\n\t" +++ "# movaps %%xmmA, %%xmmZ\n\t" +++ "# unpcklps %%xmmB, %%xmmA\n\t" +++ "# unpckhps %%xmmB, %%xmmZ\n\t" +++ "# movaps %%xmmZ, %%xmmY\n\t" +++ "# shufps $0x44, %%xmmA, %%xmmZ # b01000100\n\t" +++ "# shufps $0xee, %%xmmY, %%xmmA # b11101110\n\t" +++ "# addps %%xmmZ, %%xmmA\n\t" +++ "# addps %%xmmA, %%xmmC\n\t" +++ "# A=xmm0, B=xmm2, Z=xmm4\n\t" +++ "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t" +++ " movaps 16(%%r9), %%xmm1\n\t" +++ " movaps %%xmm0, %%xmm4\n\t" +++ " mulps %%xmm2, %%xmm0\n\t" +++ " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" +++ " movaps 16(%%r10), %%xmm3\n\t" +++ " movaps %%xmm1, %%xmm5\n\t" +++ " addps %%xmm0, %%xmm6\n\t" +++ " mulps %%xmm3, %%xmm1\n\t" +++ " shufps $0xb1, %%xmm5, %%xmm5 # swap internals\n\t" +++ " addps %%xmm1, %%xmm6\n\t" +++ " mulps %%xmm4, %%xmm2\n\t" +++ " movaps 32(%%r9), %%xmm0\n\t" +++ " addps %%xmm2, %%xmm7\n\t" +++ " mulps %%xmm5, %%xmm3\n\t" +++ " add $32, %%r9\n\t" +++ " movaps 32(%%r10), %%xmm2\n\t" +++ " addps %%xmm3, %%xmm7\n\t" +++ " add $32, %%r10\n\t" +++ ".%=L1_test:\n\t" +++ " dec %%rax\n\t" +++ " jge .%=Loop1\n\t" +++ " # We've handled the bulk of multiplies up to here.\n\t" +++ " # Let's sse if original n_2_ccomplex_blocks was odd.\n\t" +++ " # If so, we've got 2 more taps to do.\n\t" +++ " and $1, %%r8\n\t" +++ " je .%=Leven\n\t" +++ " # The count was odd, do 2 more taps.\n\t" +++ " # Note that we've already got mm0/mm2 preloaded\n\t" +++ " # from the main loop.\n\t" +++ " movaps %%xmm0, %%xmm4\n\t" +++ " mulps %%xmm2, %%xmm0\n\t" +++ " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" +++ " addps %%xmm0, %%xmm6\n\t" +++ " mulps %%xmm4, %%xmm2\n\t" +++ " addps %%xmm2, %%xmm7\n\t" +++ ".%=Leven:\n\t" +++ " # neg inversor\n\t" +++ " xorps %%xmm1, %%xmm1\n\t" +++ " mov $0x80000000, %%r9\n\t" +++ " movd %%r9, %%xmm1\n\t" +++ " shufps $0x11, %%xmm1, %%xmm1 # b00010001 # 0 -0 0 -0\n\t" +++ " # pfpnacc\n\t" +++ " xorps %%xmm1, %%xmm6\n\t" +++ " movaps %%xmm6, %%xmm2\n\t" +++ " unpcklps %%xmm7, %%xmm6\n\t" +++ " unpckhps %%xmm7, %%xmm2\n\t" +++ " movaps %%xmm2, %%xmm3\n\t" +++ " shufps $0x44, %%xmm6, %%xmm2 # b01000100\n\t" +++ " shufps $0xee, %%xmm3, %%xmm6 # b11101110\n\t" +++ " addps %%xmm2, %%xmm6\n\t" +++ " # xmm6 = r1 i2 r3 i4\n\t" +++ " movhlps %%xmm6, %%xmm4 # xmm4 = r3 i4 ?? ??\n\t" +++ " addps %%xmm4, %%xmm6 # xmm6 = r1+r3 i2+i4 ?? 
??\n\t" +++ " movlps %%xmm6, (%[rdi]) # store low 2x32 bits (complex) " +++ "to memory\n\t" +++ : +++ : [rsi] "r"(input), [rdx] "r"(taps), "c"(num_bytes), [rdi] "r"(result) +++ : "rax", "r8", "r9", "r10"); +++ +++ +++ if (isodd) { +++ *result += input[num_points - 1] * taps[num_points - 1]; +++ } ++ +++ return; ++ } ++ ++ #endif ++ ++ #if LV_HAVE_SSE && LV_HAVE_32 ++ ++-static inline void volk_32fc_x2_dot_prod_32fc_a_sse_32(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) { +++static inline void volk_32fc_x2_dot_prod_32fc_a_sse_32(lv_32fc_t* result, +++ const lv_32fc_t* input, +++ const lv_32fc_t* taps, +++ unsigned int num_points) +++{ ++ ++- volk_32fc_x2_dot_prod_32fc_a_generic(result, input, taps, num_points); +++ volk_32fc_x2_dot_prod_32fc_a_generic(result, input, taps, num_points); ++ ++ #if 0 ++ const unsigned int num_bytes = num_points*8; ++@@ -792,57 +832,64 @@ static inline void volk_32fc_x2_dot_prod_32fc_a_sse_32(lv_32fc_t* result, const ++ ++ #include ++ ++-static inline void volk_32fc_x2_dot_prod_32fc_a_sse3(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) { +++static inline void volk_32fc_x2_dot_prod_32fc_a_sse3(lv_32fc_t* result, +++ const lv_32fc_t* input, +++ const lv_32fc_t* taps, +++ unsigned int num_points) +++{ ++ ++- const unsigned int num_bytes = num_points*8; ++- unsigned int isodd = num_points & 1; +++ const unsigned int num_bytes = num_points * 8; +++ unsigned int isodd = num_points & 1; ++ ++- lv_32fc_t dotProduct; ++- memset(&dotProduct, 0x0, 2*sizeof(float)); +++ lv_32fc_t dotProduct; +++ memset(&dotProduct, 0x0, 2 * sizeof(float)); ++ ++- unsigned int number = 0; ++- const unsigned int halfPoints = num_bytes >> 4; +++ unsigned int number = 0; +++ const unsigned int halfPoints = num_bytes >> 4; ++ ++- __m128 x, y, yl, yh, z, tmp1, tmp2, dotProdVal; +++ __m128 x, y, yl, yh, z, tmp1, tmp2, dotProdVal; ++ ++- const lv_32fc_t* a = input; ++- const lv_32fc_t* b = taps; +++ const lv_32fc_t* a = input; +++ const lv_32fc_t* b = taps; ++ ++- dotProdVal = _mm_setzero_ps(); +++ dotProdVal = _mm_setzero_ps(); ++ ++- for(;number < halfPoints; number++){ +++ for (; number < halfPoints; number++) { ++ ++- x = _mm_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi ++- y = _mm_load_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di +++ x = _mm_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi +++ y = _mm_load_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di ++ ++- yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr ++- yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di +++ yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr +++ yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di ++ ++- tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr +++ tmp1 = _mm_mul_ps(x, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr ++ ++- x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br +++ x = _mm_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br ++ ++- tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di +++ tmp2 = _mm_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di ++ ++- z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di +++ z = _mm_addsub_ps(tmp1, +++ tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di ++ ++- dotProdVal = _mm_add_ps(dotProdVal, z); // Add the complex multiplication results together +++ dotProdVal = +++ _mm_add_ps(dotProdVal, z); // Add the complex multiplication results together 
++ ++- a += 2; ++- b += 2; ++- } +++ a += 2; +++ b += 2; +++ } ++ ++- __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector[2]; +++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector[2]; ++ ++- _mm_store_ps((float*)dotProductVector,dotProdVal); // Store the results back into the dot product vector +++ _mm_store_ps((float*)dotProductVector, +++ dotProdVal); // Store the results back into the dot product vector ++ ++- dotProduct += ( dotProductVector[0] + dotProductVector[1] ); +++ dotProduct += (dotProductVector[0] + dotProductVector[1]); ++ ++- if(isodd) { ++- dotProduct += input[num_points - 1] * taps[num_points - 1]; ++- } +++ if (isodd) { +++ dotProduct += input[num_points - 1] * taps[num_points - 1]; +++ } ++ ++- *result = dotProduct; +++ *result = dotProduct; ++ } ++ ++ #endif /*LV_HAVE_SSE3*/ ++@@ -852,78 +899,82 @@ static inline void volk_32fc_x2_dot_prod_32fc_a_sse3(lv_32fc_t* result, const lv ++ ++ #include ++ ++-static inline void volk_32fc_x2_dot_prod_32fc_a_sse4_1(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) { +++static inline void volk_32fc_x2_dot_prod_32fc_a_sse4_1(lv_32fc_t* result, +++ const lv_32fc_t* input, +++ const lv_32fc_t* taps, +++ unsigned int num_points) +++{ ++ ++- unsigned int i = 0; ++- const unsigned int qtr_points = num_points/4; ++- const unsigned int isodd = num_points & 3; +++ unsigned int i = 0; +++ const unsigned int qtr_points = num_points / 4; +++ const unsigned int isodd = num_points & 3; ++ ++- __m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, real0, real1, im0, im1; ++- float *p_input, *p_taps; ++- __m64 *p_result; +++ __m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, real0, real1, im0, im1; +++ float *p_input, *p_taps; +++ __m64* p_result; ++ ++- static const __m128i neg = {0x000000000000000080000000}; +++ static const __m128i neg = { 0x000000000000000080000000 }; ++ ++- p_result = (__m64*)result; ++- p_input = (float*)input; ++- p_taps = (float*)taps; +++ p_result = (__m64*)result; +++ p_input = (float*)input; +++ p_taps = (float*)taps; ++ ++- real0 = _mm_setzero_ps(); ++- real1 = _mm_setzero_ps(); ++- im0 = _mm_setzero_ps(); ++- im1 = _mm_setzero_ps(); +++ real0 = _mm_setzero_ps(); +++ real1 = _mm_setzero_ps(); +++ im0 = _mm_setzero_ps(); +++ im1 = _mm_setzero_ps(); ++ ++- for(; i < qtr_points; ++i) { ++- xmm0 = _mm_load_ps(p_input); ++- xmm1 = _mm_load_ps(p_taps); +++ for (; i < qtr_points; ++i) { +++ xmm0 = _mm_load_ps(p_input); +++ xmm1 = _mm_load_ps(p_taps); ++ ++- p_input += 4; ++- p_taps += 4; +++ p_input += 4; +++ p_taps += 4; ++ ++- xmm2 = _mm_load_ps(p_input); ++- xmm3 = _mm_load_ps(p_taps); +++ xmm2 = _mm_load_ps(p_input); +++ xmm3 = _mm_load_ps(p_taps); ++ ++- p_input += 4; ++- p_taps += 4; +++ p_input += 4; +++ p_taps += 4; ++ ++- xmm4 = _mm_unpackhi_ps(xmm0, xmm2); ++- xmm5 = _mm_unpackhi_ps(xmm1, xmm3); ++- xmm0 = _mm_unpacklo_ps(xmm0, xmm2); ++- xmm2 = _mm_unpacklo_ps(xmm1, xmm3); +++ xmm4 = _mm_unpackhi_ps(xmm0, xmm2); +++ xmm5 = _mm_unpackhi_ps(xmm1, xmm3); +++ xmm0 = _mm_unpacklo_ps(xmm0, xmm2); +++ xmm2 = _mm_unpacklo_ps(xmm1, xmm3); ++ ++- //imaginary vector from input ++- xmm1 = _mm_unpackhi_ps(xmm0, xmm4); ++- //real vector from input ++- xmm3 = _mm_unpacklo_ps(xmm0, xmm4); ++- //imaginary vector from taps ++- xmm0 = _mm_unpackhi_ps(xmm2, xmm5); ++- //real vector from taps ++- xmm2 = _mm_unpacklo_ps(xmm2, xmm5); +++ // imaginary vector from input +++ xmm1 = _mm_unpackhi_ps(xmm0, xmm4); +++ // real vector from input +++ xmm3 = _mm_unpacklo_ps(xmm0, xmm4); +++ // imaginary 
vector from taps +++ xmm0 = _mm_unpackhi_ps(xmm2, xmm5); +++ // real vector from taps +++ xmm2 = _mm_unpacklo_ps(xmm2, xmm5); ++ ++- xmm4 = _mm_dp_ps(xmm3, xmm2, 0xf1); ++- xmm5 = _mm_dp_ps(xmm1, xmm0, 0xf1); +++ xmm4 = _mm_dp_ps(xmm3, xmm2, 0xf1); +++ xmm5 = _mm_dp_ps(xmm1, xmm0, 0xf1); ++ ++- xmm6 = _mm_dp_ps(xmm3, xmm0, 0xf2); ++- xmm7 = _mm_dp_ps(xmm1, xmm2, 0xf2); +++ xmm6 = _mm_dp_ps(xmm3, xmm0, 0xf2); +++ xmm7 = _mm_dp_ps(xmm1, xmm2, 0xf2); ++ ++- real0 = _mm_add_ps(xmm4, real0); ++- real1 = _mm_add_ps(xmm5, real1); ++- im0 = _mm_add_ps(xmm6, im0); ++- im1 = _mm_add_ps(xmm7, im1); ++- } +++ real0 = _mm_add_ps(xmm4, real0); +++ real1 = _mm_add_ps(xmm5, real1); +++ im0 = _mm_add_ps(xmm6, im0); +++ im1 = _mm_add_ps(xmm7, im1); +++ } ++ ++- real1 = _mm_xor_ps(real1, bit128_p(&neg)->float_vec); +++ real1 = _mm_xor_ps(real1, bit128_p(&neg)->float_vec); ++ ++- im0 = _mm_add_ps(im0, im1); ++- real0 = _mm_add_ps(real0, real1); +++ im0 = _mm_add_ps(im0, im1); +++ real0 = _mm_add_ps(real0, real1); ++ ++- im0 = _mm_add_ps(im0, real0); +++ im0 = _mm_add_ps(im0, real0); ++ ++- _mm_storel_pi(p_result, im0); +++ _mm_storel_pi(p_result, im0); ++ ++- for(i = num_points-isodd; i < num_points; i++) { ++- *result += input[i] * taps[i]; ++- } +++ for (i = num_points - isodd; i < num_points; i++) { +++ *result += input[i] * taps[i]; +++ } ++ } ++ ++ #endif /*LV_HAVE_SSE4_1*/ ++@@ -931,13 +982,17 @@ static inline void volk_32fc_x2_dot_prod_32fc_a_sse4_1(lv_32fc_t* result, const ++ #ifdef LV_HAVE_NEON ++ #include ++ ++-static inline void volk_32fc_x2_dot_prod_32fc_neon(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) { +++static inline void volk_32fc_x2_dot_prod_32fc_neon(lv_32fc_t* result, +++ const lv_32fc_t* input, +++ const lv_32fc_t* taps, +++ unsigned int num_points) +++{ ++ ++ unsigned int quarter_points = num_points / 4; ++ unsigned int number; ++ ++- lv_32fc_t* a_ptr = (lv_32fc_t*) taps; ++- lv_32fc_t* b_ptr = (lv_32fc_t*) input; +++ lv_32fc_t* a_ptr = (lv_32fc_t*)taps; +++ lv_32fc_t* b_ptr = (lv_32fc_t*)input; ++ // for 2-lane vectors, 1st lane holds the real part, ++ // 2nd lane holds the imaginary part ++ float32x4x2_t a_val, b_val, c_val, accumulator; ++@@ -945,11 +1000,11 @@ static inline void volk_32fc_x2_dot_prod_32fc_neon(lv_32fc_t* result, const lv_3 ++ accumulator.val[0] = vdupq_n_f32(0); ++ accumulator.val[1] = vdupq_n_f32(0); ++ ++- for(number = 0; number < quarter_points; ++number) { +++ for (number = 0; number < quarter_points; ++number) { ++ a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i ++ b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i ++- __VOLK_PREFETCH(a_ptr+8); ++- __VOLK_PREFETCH(b_ptr+8); +++ __VOLK_PREFETCH(a_ptr + 8); +++ __VOLK_PREFETCH(b_ptr + 8); ++ ++ // multiply the real*real and imag*imag to get real result ++ // a0r*b0r|a1r*b1r|a2r*b2r|a3r*b3r ++@@ -977,22 +1032,25 @@ static inline void volk_32fc_x2_dot_prod_32fc_neon(lv_32fc_t* result, const lv_3 ++ *result = accum_result[0] + accum_result[1] + accum_result[2] + accum_result[3]; ++ ++ // tail case ++- for(number = quarter_points*4; number < num_points; ++number) { +++ for (number = quarter_points * 4; number < num_points; ++number) { ++ *result += (*a_ptr++) * (*b_ptr++); ++ } ++- ++ } ++ #endif /*LV_HAVE_NEON*/ ++ ++ #ifdef LV_HAVE_NEON ++ #include ++-static inline void volk_32fc_x2_dot_prod_32fc_neon_opttests(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) { +++static inline void 
volk_32fc_x2_dot_prod_32fc_neon_opttests(lv_32fc_t* result, +++ const lv_32fc_t* input, +++ const lv_32fc_t* taps, +++ unsigned int num_points) +++{ ++ ++ unsigned int quarter_points = num_points / 4; ++ unsigned int number; ++ ++- lv_32fc_t* a_ptr = (lv_32fc_t*) taps; ++- lv_32fc_t* b_ptr = (lv_32fc_t*) input; +++ lv_32fc_t* a_ptr = (lv_32fc_t*)taps; +++ lv_32fc_t* b_ptr = (lv_32fc_t*)input; ++ // for 2-lane vectors, 1st lane holds the real part, ++ // 2nd lane holds the imaginary part ++ float32x4x2_t a_val, b_val, accumulator; ++@@ -1000,11 +1058,11 @@ static inline void volk_32fc_x2_dot_prod_32fc_neon_opttests(lv_32fc_t* result, c ++ accumulator.val[0] = vdupq_n_f32(0); ++ accumulator.val[1] = vdupq_n_f32(0); ++ ++- for(number = 0; number < quarter_points; ++number) { +++ for (number = 0; number < quarter_points; ++number) { ++ a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i ++ b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i ++- __VOLK_PREFETCH(a_ptr+8); ++- __VOLK_PREFETCH(b_ptr+8); +++ __VOLK_PREFETCH(a_ptr + 8); +++ __VOLK_PREFETCH(b_ptr + 8); ++ ++ // do the first multiply ++ tmp_imag.val[1] = vmulq_f32(a_val.val[1], b_val.val[0]); ++@@ -1026,21 +1084,24 @@ static inline void volk_32fc_x2_dot_prod_32fc_neon_opttests(lv_32fc_t* result, c ++ *result = accum_result[0] + accum_result[1] + accum_result[2] + accum_result[3]; ++ ++ // tail case ++- for(number = quarter_points*4; number < num_points; ++number) { +++ for (number = quarter_points * 4; number < num_points; ++number) { ++ *result += (*a_ptr++) * (*b_ptr++); ++ } ++- ++ } ++ #endif /*LV_HAVE_NEON*/ ++ ++ #ifdef LV_HAVE_NEON ++-static inline void volk_32fc_x2_dot_prod_32fc_neon_optfma(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) { +++static inline void volk_32fc_x2_dot_prod_32fc_neon_optfma(lv_32fc_t* result, +++ const lv_32fc_t* input, +++ const lv_32fc_t* taps, +++ unsigned int num_points) +++{ ++ ++ unsigned int quarter_points = num_points / 4; ++ unsigned int number; ++ ++- lv_32fc_t* a_ptr = (lv_32fc_t*) taps; ++- lv_32fc_t* b_ptr = (lv_32fc_t*) input; +++ lv_32fc_t* a_ptr = (lv_32fc_t*)taps; +++ lv_32fc_t* b_ptr = (lv_32fc_t*)input; ++ // for 2-lane vectors, 1st lane holds the real part, ++ // 2nd lane holds the imaginary part ++ float32x4x2_t a_val, b_val, accumulator1, accumulator2; ++@@ -1049,11 +1110,11 @@ static inline void volk_32fc_x2_dot_prod_32fc_neon_optfma(lv_32fc_t* result, con ++ accumulator2.val[0] = vdupq_n_f32(0); ++ accumulator2.val[1] = vdupq_n_f32(0); ++ ++- for(number = 0; number < quarter_points; ++number) { +++ for (number = 0; number < quarter_points; ++number) { ++ a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i ++ b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i ++- __VOLK_PREFETCH(a_ptr+8); ++- __VOLK_PREFETCH(b_ptr+8); +++ __VOLK_PREFETCH(a_ptr + 8); +++ __VOLK_PREFETCH(b_ptr + 8); ++ ++ // use 2 accumulators to remove inter-instruction data dependencies ++ accumulator1.val[0] = vmlaq_f32(accumulator1.val[0], a_val.val[0], b_val.val[0]); ++@@ -1071,22 +1132,26 @@ static inline void volk_32fc_x2_dot_prod_32fc_neon_optfma(lv_32fc_t* result, con ++ *result = accum_result[0] + accum_result[1] + accum_result[2] + accum_result[3]; ++ ++ // tail case ++- for(number = quarter_points*4; number < num_points; ++number) { +++ for (number = quarter_points * 4; number < num_points; ++number) { ++ *result += (*a_ptr++) * (*b_ptr++); ++ } ++- ++ } ++ #endif /*LV_HAVE_NEON*/ 
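
Aside, for readers of this hunk: every volk_32fc_x2_dot_prod_32fc_* variant touched above and below (SSE3, SSE4.1, NEON, AVX, AVX+FMA) computes the same quantity, the complex dot product sum over i of input[i] * taps[i]; the variants differ only in how the real/imaginary lanes are shuffled per iteration and in how the odd tail elements are folded in. A minimal plain-C sketch of that arithmetic, illustrative only and not part of the patch (VOLK's lv_32fc_t corresponds to C99 float complex, so the sketch uses <complex.h> directly):

/*
 * Reference sketch (assumed names, not from the diff): accumulate
 * input[i] * taps[i] over complex floats. The SIMD kernels in this
 * file reach the same result four or eight points at a time and then
 * reduce their per-lane partial sums, which is why each one ends with
 * a scalar tail loop equivalent to the loop below.
 */
#include <complex.h>

static inline void dot_prod_32fc_ref(float complex* result,
                                     const float complex* input,
                                     const float complex* taps,
                                     unsigned int num_points)
{
    float complex acc = 0.0f + 0.0f * I;
    for (unsigned int i = 0; i < num_points; i++) {
        acc += input[i] * taps[i]; /* complex multiply-accumulate */
    }
    *result = acc;
}
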
++ ++ #ifdef LV_HAVE_NEON ++-static inline void volk_32fc_x2_dot_prod_32fc_neon_optfmaunroll(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) { ++-// NOTE: GCC does a poor job with this kernel, but the equivalent ASM code is very fast +++static inline void volk_32fc_x2_dot_prod_32fc_neon_optfmaunroll(lv_32fc_t* result, +++ const lv_32fc_t* input, +++ const lv_32fc_t* taps, +++ unsigned int num_points) +++{ +++ // NOTE: GCC does a poor job with this kernel, but the equivalent ASM code is very +++ // fast ++ ++ unsigned int quarter_points = num_points / 8; ++ unsigned int number; ++ ++- lv_32fc_t* a_ptr = (lv_32fc_t*) taps; ++- lv_32fc_t* b_ptr = (lv_32fc_t*) input; +++ lv_32fc_t* a_ptr = (lv_32fc_t*)taps; +++ lv_32fc_t* b_ptr = (lv_32fc_t*)input; ++ // for 2-lane vectors, 1st lane holds the real part, ++ // 2nd lane holds the imaginary part ++ float32x4x4_t a_val, b_val, accumulator1, accumulator2; ++@@ -1101,11 +1166,11 @@ static inline void volk_32fc_x2_dot_prod_32fc_neon_optfmaunroll(lv_32fc_t* resul ++ accumulator2.val[3] = vdupq_n_f32(0); ++ ++ // 8 input regs, 8 accumulators -> 16/16 neon regs are used ++- for(number = 0; number < quarter_points; ++number) { +++ for (number = 0; number < quarter_points; ++number) { ++ a_val = vld4q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i ++ b_val = vld4q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i ++- __VOLK_PREFETCH(a_ptr+8); ++- __VOLK_PREFETCH(b_ptr+8); +++ __VOLK_PREFETCH(a_ptr + 8); +++ __VOLK_PREFETCH(b_ptr + 8); ++ ++ // use 2 accumulators to remove inter-instruction data dependencies ++ accumulator1.val[0] = vmlaq_f32(accumulator1.val[0], a_val.val[0], b_val.val[0]); ++@@ -1136,10 +1201,9 @@ static inline void volk_32fc_x2_dot_prod_32fc_neon_optfmaunroll(lv_32fc_t* resul ++ *result = accum_result[0] + accum_result[1] + accum_result[2] + accum_result[3]; ++ ++ // tail case ++- for(number = quarter_points*8; number < num_points; ++number) { +++ for (number = quarter_points * 8; number < num_points; ++number) { ++ *result += (*a_ptr++) * (*b_ptr++); ++ } ++- ++ } ++ #endif /*LV_HAVE_NEON*/ ++ ++@@ -1148,56 +1212,64 @@ static inline void volk_32fc_x2_dot_prod_32fc_neon_optfmaunroll(lv_32fc_t* resul ++ ++ #include ++ ++-static inline void volk_32fc_x2_dot_prod_32fc_a_avx(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) { +++static inline void volk_32fc_x2_dot_prod_32fc_a_avx(lv_32fc_t* result, +++ const lv_32fc_t* input, +++ const lv_32fc_t* taps, +++ unsigned int num_points) +++{ ++ ++- unsigned int isodd = num_points & 3; ++- unsigned int i = 0; ++- lv_32fc_t dotProduct; ++- memset(&dotProduct, 0x0, 2*sizeof(float)); +++ unsigned int isodd = num_points & 3; +++ unsigned int i = 0; +++ lv_32fc_t dotProduct; +++ memset(&dotProduct, 0x0, 2 * sizeof(float)); ++ ++- unsigned int number = 0; ++- const unsigned int quarterPoints = num_points / 4; +++ unsigned int number = 0; +++ const unsigned int quarterPoints = num_points / 4; ++ ++- __m256 x, y, yl, yh, z, tmp1, tmp2, dotProdVal; +++ __m256 x, y, yl, yh, z, tmp1, tmp2, dotProdVal; ++ ++- const lv_32fc_t* a = input; ++- const lv_32fc_t* b = taps; +++ const lv_32fc_t* a = input; +++ const lv_32fc_t* b = taps; ++ ++- dotProdVal = _mm256_setzero_ps(); +++ dotProdVal = _mm256_setzero_ps(); ++ ++- for(;number < quarterPoints; number++){ +++ for (; number < quarterPoints; number++) { ++ ++- x = _mm256_load_ps((float*)a); // Load a,b,e,f as ar,ai,br,bi,er,ei,fr,fi ++- y = 
_mm256_load_ps((float*)b); // Load c,d,g,h as cr,ci,dr,di,gr,gi,hr,hi +++ x = _mm256_load_ps((float*)a); // Load a,b,e,f as ar,ai,br,bi,er,ei,fr,fi +++ y = _mm256_load_ps((float*)b); // Load c,d,g,h as cr,ci,dr,di,gr,gi,hr,hi ++ ++- yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr,gr,gr,hr,hr ++- yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di,gi,gi,hi,hi +++ yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr,gr,gr,hr,hr +++ yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di,gi,gi,hi,hi ++ ++- tmp1 = _mm256_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr ... +++ tmp1 = _mm256_mul_ps(x, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr ... ++ ++- x = _mm256_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br,ei,er,fi,fr +++ x = _mm256_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br,ei,er,fi,fr ++ ++- tmp2 = _mm256_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di ... +++ tmp2 = _mm256_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di ... ++ ++- z = _mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di +++ z = _mm256_addsub_ps(tmp1, +++ tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di ++ ++- dotProdVal = _mm256_add_ps(dotProdVal, z); // Add the complex multiplication results together +++ dotProdVal = _mm256_add_ps(dotProdVal, +++ z); // Add the complex multiplication results together ++ ++- a += 4; ++- b += 4; ++- } +++ a += 4; +++ b += 4; +++ } ++ ++- __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector[4]; +++ __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector[4]; ++ ++- _mm256_store_ps((float*)dotProductVector,dotProdVal); // Store the results back into the dot product vector +++ _mm256_store_ps((float*)dotProductVector, +++ dotProdVal); // Store the results back into the dot product vector ++ ++- dotProduct += ( dotProductVector[0] + dotProductVector[1] + dotProductVector[2] + dotProductVector[3]); +++ dotProduct += (dotProductVector[0] + dotProductVector[1] + dotProductVector[2] + +++ dotProductVector[3]); ++ ++- for(i = num_points-isodd; i < num_points; i++) { ++- dotProduct += input[i] * taps[i]; ++- } +++ for (i = num_points - isodd; i < num_points; i++) { +++ dotProduct += input[i] * taps[i]; +++ } ++ ++- *result = dotProduct; +++ *result = dotProduct; ++ } ++ ++ #endif /*LV_HAVE_AVX*/ ++@@ -1205,56 +1277,64 @@ static inline void volk_32fc_x2_dot_prod_32fc_a_avx(lv_32fc_t* result, const lv_ ++ #if LV_HAVE_AVX && LV_HAVE_FMA ++ #include ++ ++-static inline void volk_32fc_x2_dot_prod_32fc_a_avx_fma(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) { +++static inline void volk_32fc_x2_dot_prod_32fc_a_avx_fma(lv_32fc_t* result, +++ const lv_32fc_t* input, +++ const lv_32fc_t* taps, +++ unsigned int num_points) +++{ ++ ++- unsigned int isodd = num_points & 3; ++- unsigned int i = 0; ++- lv_32fc_t dotProduct; ++- memset(&dotProduct, 0x0, 2*sizeof(float)); +++ unsigned int isodd = num_points & 3; +++ unsigned int i = 0; +++ lv_32fc_t dotProduct; +++ memset(&dotProduct, 0x0, 2 * sizeof(float)); ++ ++- unsigned int number = 0; ++- const unsigned int quarterPoints = num_points / 4; +++ unsigned int number = 0; +++ const unsigned int quarterPoints = num_points / 4; ++ ++- __m256 x, y, yl, yh, z, tmp1, tmp2, dotProdVal; +++ __m256 x, y, yl, yh, z, tmp1, tmp2, dotProdVal; ++ ++- const lv_32fc_t* a = input; ++- const lv_32fc_t* b = taps; +++ const lv_32fc_t* a = input; +++ const lv_32fc_t* b = taps; ++ ++- dotProdVal = _mm256_setzero_ps(); +++ dotProdVal = _mm256_setzero_ps(); ++ ++- 
for(;number < quarterPoints; number++){ +++ for (; number < quarterPoints; number++) { ++ ++- x = _mm256_load_ps((float*)a); // Load a,b,e,f as ar,ai,br,bi,er,ei,fr,fi ++- y = _mm256_load_ps((float*)b); // Load c,d,g,h as cr,ci,dr,di,gr,gi,hr,hi +++ x = _mm256_load_ps((float*)a); // Load a,b,e,f as ar,ai,br,bi,er,ei,fr,fi +++ y = _mm256_load_ps((float*)b); // Load c,d,g,h as cr,ci,dr,di,gr,gi,hr,hi ++ ++- yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr,gr,gr,hr,hr ++- yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di,gi,gi,hi,hi +++ yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr,gr,gr,hr,hr +++ yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di,gi,gi,hi,hi ++ ++- tmp1 = x; +++ tmp1 = x; ++ ++- x = _mm256_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br,ei,er,fi,fr +++ x = _mm256_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br,ei,er,fi,fr ++ ++- tmp2 = _mm256_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di ... +++ tmp2 = _mm256_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di ... ++ ++- z = _mm256_fmaddsub_ps(tmp1, yl,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di +++ z = _mm256_fmaddsub_ps( +++ tmp1, yl, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di ++ ++- dotProdVal = _mm256_add_ps(dotProdVal, z); // Add the complex multiplication results together +++ dotProdVal = _mm256_add_ps(dotProdVal, +++ z); // Add the complex multiplication results together ++ ++- a += 4; ++- b += 4; ++- } +++ a += 4; +++ b += 4; +++ } ++ ++- __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector[4]; +++ __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector[4]; ++ ++- _mm256_store_ps((float*)dotProductVector,dotProdVal); // Store the results back into the dot product vector +++ _mm256_store_ps((float*)dotProductVector, +++ dotProdVal); // Store the results back into the dot product vector ++ ++- dotProduct += ( dotProductVector[0] + dotProductVector[1] + dotProductVector[2] + dotProductVector[3]); +++ dotProduct += (dotProductVector[0] + dotProductVector[1] + dotProductVector[2] + +++ dotProductVector[3]); ++ ++- for(i = num_points-isodd; i < num_points; i++) { ++- dotProduct += input[i] * taps[i]; ++- } +++ for (i = num_points - isodd; i < num_points; i++) { +++ dotProduct += input[i] * taps[i]; +++ } ++ ++- *result = dotProduct; +++ *result = dotProduct; ++ } ++ ++ #endif /*LV_HAVE_AVX && LV_HAVE_FMA*/ ++diff --git a/kernels/volk/volk_32fc_x2_multiply_32fc.h b/kernels/volk/volk_32fc_x2_multiply_32fc.h ++index 6bf428b..6cb6907 100644 ++--- a/kernels/volk/volk_32fc_x2_multiply_32fc.h +++++ b/kernels/volk/volk_32fc_x2_multiply_32fc.h ++@@ -29,8 +29,8 @@ ++ * ++ * Dispatcher Prototype ++ * \code ++- * void volk_32fc_x2_multiply_32fc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points); ++- * \endcode +++ * void volk_32fc_x2_multiply_32fc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const +++ * lv_32fc_t* bVector, unsigned int num_points); \endcode ++ * ++ * \b Inputs ++ * \li aVector: The first input vector of complex floats. ++@@ -70,55 +70,62 @@ ++ #ifndef INCLUDED_volk_32fc_x2_multiply_32fc_u_H ++ #define INCLUDED_volk_32fc_x2_multiply_32fc_u_H ++ +++#include ++ #include ++ #include ++ #include ++-#include ++ ++ #if LV_HAVE_AVX2 && LV_HAVE_FMA ++ #include ++ /*! 
++- \brief Multiplies the two input complex vectors and stores their results in the third vector ++- \param cVector The vector where the results will be stored ++- \param aVector One of the vectors to be multiplied ++- \param bVector One of the vectors to be multiplied ++- \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector +++ \brief Multiplies the two input complex vectors and stores their results in the third +++ vector \param cVector The vector where the results will be stored \param aVector One of +++ the vectors to be multiplied \param bVector One of the vectors to be multiplied \param +++ num_points The number of complex values in aVector and bVector to be multiplied together +++ and stored into cVector ++ */ ++-static inline void volk_32fc_x2_multiply_32fc_u_avx2_fma(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){ ++- unsigned int number = 0; ++- const unsigned int quarterPoints = num_points / 4; +++static inline void volk_32fc_x2_multiply_32fc_u_avx2_fma(lv_32fc_t* cVector, +++ const lv_32fc_t* aVector, +++ const lv_32fc_t* bVector, +++ unsigned int num_points) +++{ +++ unsigned int number = 0; +++ const unsigned int quarterPoints = num_points / 4; ++ ++- lv_32fc_t* c = cVector; ++- const lv_32fc_t* a = aVector; ++- const lv_32fc_t* b = bVector; +++ lv_32fc_t* c = cVector; +++ const lv_32fc_t* a = aVector; +++ const lv_32fc_t* b = bVector; ++ ++- for(;number < quarterPoints; number++){ +++ for (; number < quarterPoints; number++) { ++ ++- const __m256 x = _mm256_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi ++- const __m256 y = _mm256_loadu_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di +++ const __m256 x = +++ _mm256_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi +++ const __m256 y = +++ _mm256_loadu_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di ++ ++- const __m256 yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr ++- const __m256 yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di +++ const __m256 yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr +++ const __m256 yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di ++ ++- const __m256 tmp2x = _mm256_permute_ps(x,0xB1); // Re-arrange x to be ai,ar,bi,br +++ const __m256 tmp2x = _mm256_permute_ps(x, 0xB1); // Re-arrange x to be ai,ar,bi,br ++ ++- const __m256 tmp2 = _mm256_mul_ps(tmp2x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di +++ const __m256 tmp2 = _mm256_mul_ps(tmp2x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di ++ ++- const __m256 z = _mm256_fmaddsub_ps(x, yl, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di +++ const __m256 z = _mm256_fmaddsub_ps( +++ x, yl, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di ++ ++- _mm256_storeu_ps((float*)c,z); // Store the results back into the C container +++ _mm256_storeu_ps((float*)c, z); // Store the results back into the C container ++ ++- a += 4; ++- b += 4; ++- c += 4; ++- } +++ a += 4; +++ b += 4; +++ c += 4; +++ } ++ ++- _mm256_zeroupper(); +++ _mm256_zeroupper(); ++ ++- number = quarterPoints * 4; ++- for(;number < num_points; number++){ ++- *c++ = (*a++) * (*b++); ++- } +++ number = quarterPoints * 4; +++ for (; number < num_points; number++) { +++ *c++ = (*a++) * (*b++); +++ } ++ } ++ #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA */ ++ ++@@ -127,34 +134,37 @@ static inline void volk_32fc_x2_multiply_32fc_u_avx2_fma(lv_32fc_t* cVector, con ++ #include ++ #include ++ ++-static 
inline void ++-volk_32fc_x2_multiply_32fc_u_avx(lv_32fc_t* cVector, const lv_32fc_t* aVector, ++- const lv_32fc_t* bVector, unsigned int num_points) +++static inline void volk_32fc_x2_multiply_32fc_u_avx(lv_32fc_t* cVector, +++ const lv_32fc_t* aVector, +++ const lv_32fc_t* bVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int quarterPoints = num_points / 4; ++- ++- __m256 x, y, z; ++- lv_32fc_t* c = cVector; ++- const lv_32fc_t* a = aVector; ++- const lv_32fc_t* b = bVector; ++- ++- for(; number < quarterPoints; number++){ ++- x = _mm256_loadu_ps((float*) a); // Load the ar + ai, br + bi ... as ar,ai,br,bi ... ++- y = _mm256_loadu_ps((float*) b); // Load the cr + ci, dr + di ... as cr,ci,dr,di ... ++- z = _mm256_complexmul_ps(x, y); ++- _mm256_storeu_ps((float*) c, z); // Store the results back into the C container ++- ++- a += 4; ++- b += 4; ++- c += 4; ++- } ++- ++- number = quarterPoints * 4; ++- ++- for(; number < num_points; number++){ ++- *c++ = (*a++) * (*b++); ++- } +++ unsigned int number = 0; +++ const unsigned int quarterPoints = num_points / 4; +++ +++ __m256 x, y, z; +++ lv_32fc_t* c = cVector; +++ const lv_32fc_t* a = aVector; +++ const lv_32fc_t* b = bVector; +++ +++ for (; number < quarterPoints; number++) { +++ x = _mm256_loadu_ps( +++ (float*)a); // Load the ar + ai, br + bi ... as ar,ai,br,bi ... +++ y = _mm256_loadu_ps( +++ (float*)b); // Load the cr + ci, dr + di ... as cr,ci,dr,di ... +++ z = _mm256_complexmul_ps(x, y); +++ _mm256_storeu_ps((float*)c, z); // Store the results back into the C container +++ +++ a += 4; +++ b += 4; +++ c += 4; +++ } +++ +++ number = quarterPoints * 4; +++ +++ for (; number < num_points; number++) { +++ *c++ = (*a++) * (*b++); +++ } ++ } ++ #endif /* LV_HAVE_AVX */ ++ ++@@ -163,50 +173,52 @@ volk_32fc_x2_multiply_32fc_u_avx(lv_32fc_t* cVector, const lv_32fc_t* aVector, ++ #include ++ #include ++ ++-static inline void ++-volk_32fc_x2_multiply_32fc_u_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, ++- const lv_32fc_t* bVector, unsigned int num_points) +++static inline void volk_32fc_x2_multiply_32fc_u_sse3(lv_32fc_t* cVector, +++ const lv_32fc_t* aVector, +++ const lv_32fc_t* bVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int halfPoints = num_points / 2; ++- ++- __m128 x, y, z; ++- lv_32fc_t* c = cVector; ++- const lv_32fc_t* a = aVector; ++- const lv_32fc_t* b = bVector; ++- ++- for(; number < halfPoints; number++){ ++- x = _mm_loadu_ps((float*) a); // Load the ar + ai, br + bi as ar,ai,br,bi ++- y = _mm_loadu_ps((float*) b); // Load the cr + ci, dr + di as cr,ci,dr,di ++- z = _mm_complexmul_ps(x, y); ++- _mm_storeu_ps((float*) c, z); // Store the results back into the C container ++- ++- a += 2; ++- b += 2; ++- c += 2; ++- } ++- ++- if((num_points % 2) != 0){ ++- *c = (*a) * (*b); ++- } +++ unsigned int number = 0; +++ const unsigned int halfPoints = num_points / 2; +++ +++ __m128 x, y, z; +++ lv_32fc_t* c = cVector; +++ const lv_32fc_t* a = aVector; +++ const lv_32fc_t* b = bVector; +++ +++ for (; number < halfPoints; number++) { +++ x = _mm_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi +++ y = _mm_loadu_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di +++ z = _mm_complexmul_ps(x, y); +++ _mm_storeu_ps((float*)c, z); // Store the results back into the C container +++ +++ a += 2; +++ b += 2; +++ c += 2; +++ } +++ +++ if ((num_points % 2) != 0) { +++ *c = (*a) * (*b); +++ } ++ } ++ #endif /* LV_HAVE_SSE */ ++ ++ ++ #ifdef 
LV_HAVE_GENERIC ++ ++-static inline void ++-volk_32fc_x2_multiply_32fc_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, ++- const lv_32fc_t* bVector, unsigned int num_points) +++static inline void volk_32fc_x2_multiply_32fc_generic(lv_32fc_t* cVector, +++ const lv_32fc_t* aVector, +++ const lv_32fc_t* bVector, +++ unsigned int num_points) ++ { ++- lv_32fc_t* cPtr = cVector; ++- const lv_32fc_t* aPtr = aVector; ++- const lv_32fc_t* bPtr= bVector; ++- unsigned int number = 0; ++- ++- for(number = 0; number < num_points; number++){ ++- *cPtr++ = (*aPtr++) * (*bPtr++); ++- } +++ lv_32fc_t* cPtr = cVector; +++ const lv_32fc_t* aPtr = aVector; +++ const lv_32fc_t* bPtr = bVector; +++ unsigned int number = 0; +++ +++ for (number = 0; number < num_points; number++) { +++ *cPtr++ = (*aPtr++) * (*bPtr++); +++ } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++@@ -215,55 +227,62 @@ volk_32fc_x2_multiply_32fc_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, ++ #ifndef INCLUDED_volk_32fc_x2_multiply_32fc_a_H ++ #define INCLUDED_volk_32fc_x2_multiply_32fc_a_H ++ +++#include ++ #include ++ #include ++ #include ++-#include ++ ++ #if LV_HAVE_AVX2 && LV_HAVE_FMA ++ #include ++ /*! ++- \brief Multiplies the two input complex vectors and stores their results in the third vector ++- \param cVector The vector where the results will be stored ++- \param aVector One of the vectors to be multiplied ++- \param bVector One of the vectors to be multiplied ++- \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector +++ \brief Multiplies the two input complex vectors and stores their results in the third +++ vector \param cVector The vector where the results will be stored \param aVector One of +++ the vectors to be multiplied \param bVector One of the vectors to be multiplied \param +++ num_points The number of complex values in aVector and bVector to be multiplied together +++ and stored into cVector ++ */ ++-static inline void volk_32fc_x2_multiply_32fc_a_avx2_fma(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){ ++- unsigned int number = 0; ++- const unsigned int quarterPoints = num_points / 4; +++static inline void volk_32fc_x2_multiply_32fc_a_avx2_fma(lv_32fc_t* cVector, +++ const lv_32fc_t* aVector, +++ const lv_32fc_t* bVector, +++ unsigned int num_points) +++{ +++ unsigned int number = 0; +++ const unsigned int quarterPoints = num_points / 4; ++ ++- lv_32fc_t* c = cVector; ++- const lv_32fc_t* a = aVector; ++- const lv_32fc_t* b = bVector; +++ lv_32fc_t* c = cVector; +++ const lv_32fc_t* a = aVector; +++ const lv_32fc_t* b = bVector; ++ ++- for(;number < quarterPoints; number++){ +++ for (; number < quarterPoints; number++) { ++ ++- const __m256 x = _mm256_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi ++- const __m256 y = _mm256_load_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di +++ const __m256 x = +++ _mm256_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi +++ const __m256 y = +++ _mm256_load_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di ++ ++- const __m256 yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr ++- const __m256 yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di +++ const __m256 yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr +++ const __m256 yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di ++ ++- const __m256 tmp2x = _mm256_permute_ps(x,0xB1); // Re-arrange x to be ai,ar,bi,br +++ const __m256 tmp2x 
= _mm256_permute_ps(x, 0xB1); // Re-arrange x to be ai,ar,bi,br ++ ++- const __m256 tmp2 = _mm256_mul_ps(tmp2x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di +++ const __m256 tmp2 = _mm256_mul_ps(tmp2x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di ++ ++- const __m256 z = _mm256_fmaddsub_ps(x, yl, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di +++ const __m256 z = _mm256_fmaddsub_ps( +++ x, yl, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di ++ ++- _mm256_store_ps((float*)c,z); // Store the results back into the C container +++ _mm256_store_ps((float*)c, z); // Store the results back into the C container ++ ++- a += 4; ++- b += 4; ++- c += 4; ++- } +++ a += 4; +++ b += 4; +++ c += 4; +++ } ++ ++- _mm256_zeroupper(); +++ _mm256_zeroupper(); ++ ++- number = quarterPoints * 4; ++- for(;number < num_points; number++){ ++- *c++ = (*a++) * (*b++); ++- } +++ number = quarterPoints * 4; +++ for (; number < num_points; number++) { +++ *c++ = (*a++) * (*b++); +++ } ++ } ++ #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA */ ++ ++@@ -272,34 +291,35 @@ static inline void volk_32fc_x2_multiply_32fc_a_avx2_fma(lv_32fc_t* cVector, con ++ #include ++ #include ++ ++-static inline void ++-volk_32fc_x2_multiply_32fc_a_avx(lv_32fc_t* cVector, const lv_32fc_t* aVector, ++- const lv_32fc_t* bVector, unsigned int num_points) +++static inline void volk_32fc_x2_multiply_32fc_a_avx(lv_32fc_t* cVector, +++ const lv_32fc_t* aVector, +++ const lv_32fc_t* bVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int quarterPoints = num_points / 4; ++- ++- __m256 x, y, z; ++- lv_32fc_t* c = cVector; ++- const lv_32fc_t* a = aVector; ++- const lv_32fc_t* b = bVector; ++- ++- for(; number < quarterPoints; number++){ ++- x = _mm256_load_ps((float*) a); // Load the ar + ai, br + bi ... as ar,ai,br,bi ... ++- y = _mm256_load_ps((float*) b); // Load the cr + ci, dr + di ... as cr,ci,dr,di ... ++- z = _mm256_complexmul_ps(x, y); ++- _mm256_store_ps((float*) c, z); // Store the results back into the C container ++- ++- a += 4; ++- b += 4; ++- c += 4; ++- } ++- ++- number = quarterPoints * 4; ++- ++- for(; number < num_points; number++){ ++- *c++ = (*a++) * (*b++); ++- } +++ unsigned int number = 0; +++ const unsigned int quarterPoints = num_points / 4; +++ +++ __m256 x, y, z; +++ lv_32fc_t* c = cVector; +++ const lv_32fc_t* a = aVector; +++ const lv_32fc_t* b = bVector; +++ +++ for (; number < quarterPoints; number++) { +++ x = _mm256_load_ps((float*)a); // Load the ar + ai, br + bi ... as ar,ai,br,bi ... +++ y = _mm256_load_ps((float*)b); // Load the cr + ci, dr + di ... as cr,ci,dr,di ... 
+++ z = _mm256_complexmul_ps(x, y); +++ _mm256_store_ps((float*)c, z); // Store the results back into the C container +++ +++ a += 4; +++ b += 4; +++ c += 4; +++ } +++ +++ number = quarterPoints * 4; +++ +++ for (; number < num_points; number++) { +++ *c++ = (*a++) * (*b++); +++ } ++ } ++ #endif /* LV_HAVE_AVX */ ++ ++@@ -307,50 +327,52 @@ volk_32fc_x2_multiply_32fc_a_avx(lv_32fc_t* cVector, const lv_32fc_t* aVector, ++ #include ++ #include ++ ++-static inline void ++-volk_32fc_x2_multiply_32fc_a_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, ++- const lv_32fc_t* bVector, unsigned int num_points) +++static inline void volk_32fc_x2_multiply_32fc_a_sse3(lv_32fc_t* cVector, +++ const lv_32fc_t* aVector, +++ const lv_32fc_t* bVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int halfPoints = num_points / 2; ++- ++- __m128 x, y, z; ++- lv_32fc_t* c = cVector; ++- const lv_32fc_t* a = aVector; ++- const lv_32fc_t* b = bVector; ++- ++- for(; number < halfPoints; number++){ ++- x = _mm_load_ps((float*) a); // Load the ar + ai, br + bi as ar,ai,br,bi ++- y = _mm_load_ps((float*) b); // Load the cr + ci, dr + di as cr,ci,dr,di ++- z = _mm_complexmul_ps(x, y); ++- _mm_store_ps((float*) c, z); // Store the results back into the C container ++- ++- a += 2; ++- b += 2; ++- c += 2; ++- } ++- ++- if((num_points % 2) != 0){ ++- *c = (*a) * (*b); ++- } +++ unsigned int number = 0; +++ const unsigned int halfPoints = num_points / 2; +++ +++ __m128 x, y, z; +++ lv_32fc_t* c = cVector; +++ const lv_32fc_t* a = aVector; +++ const lv_32fc_t* b = bVector; +++ +++ for (; number < halfPoints; number++) { +++ x = _mm_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi +++ y = _mm_load_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di +++ z = _mm_complexmul_ps(x, y); +++ _mm_store_ps((float*)c, z); // Store the results back into the C container +++ +++ a += 2; +++ b += 2; +++ c += 2; +++ } +++ +++ if ((num_points % 2) != 0) { +++ *c = (*a) * (*b); +++ } ++ } ++ #endif /* LV_HAVE_SSE */ ++ ++ ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void ++-volk_32fc_x2_multiply_32fc_a_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, ++- const lv_32fc_t* bVector, unsigned int num_points) +++static inline void volk_32fc_x2_multiply_32fc_a_generic(lv_32fc_t* cVector, +++ const lv_32fc_t* aVector, +++ const lv_32fc_t* bVector, +++ unsigned int num_points) ++ { ++- lv_32fc_t* cPtr = cVector; ++- const lv_32fc_t* aPtr = aVector; ++- const lv_32fc_t* bPtr= bVector; ++- unsigned int number = 0; ++- ++- for(number = 0; number < num_points; number++){ ++- *cPtr++ = (*aPtr++) * (*bPtr++); ++- } +++ lv_32fc_t* cPtr = cVector; +++ const lv_32fc_t* aPtr = aVector; +++ const lv_32fc_t* bPtr = bVector; +++ unsigned int number = 0; +++ +++ for (number = 0; number < num_points; number++) { +++ *cPtr++ = (*aPtr++) * (*bPtr++); +++ } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++@@ -358,113 +380,118 @@ volk_32fc_x2_multiply_32fc_a_generic(lv_32fc_t* cVector, const lv_32fc_t* aVecto ++ #ifdef LV_HAVE_NEON ++ #include ++ ++-static inline void ++-volk_32fc_x2_multiply_32fc_neon(lv_32fc_t* cVector, const lv_32fc_t* aVector, ++- const lv_32fc_t* bVector, unsigned int num_points) +++static inline void volk_32fc_x2_multiply_32fc_neon(lv_32fc_t* cVector, +++ const lv_32fc_t* aVector, +++ const lv_32fc_t* bVector, +++ unsigned int num_points) ++ { ++- lv_32fc_t *a_ptr = (lv_32fc_t*) aVector; ++- lv_32fc_t *b_ptr = (lv_32fc_t*) bVector; ++- unsigned int quarter_points = num_points / 4; ++- 
float32x4x2_t a_val, b_val, c_val; ++- float32x4x2_t tmp_real, tmp_imag; ++- unsigned int number = 0; ++- ++- for(number = 0; number < quarter_points; ++number) { ++- a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i ++- b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i ++- __VOLK_PREFETCH(a_ptr+4); ++- __VOLK_PREFETCH(b_ptr+4); ++- ++- // multiply the real*real and imag*imag to get real result ++- // a0r*b0r|a1r*b1r|a2r*b2r|a3r*b3r ++- tmp_real.val[0] = vmulq_f32(a_val.val[0], b_val.val[0]); ++- // a0i*b0i|a1i*b1i|a2i*b2i|a3i*b3i ++- tmp_real.val[1] = vmulq_f32(a_val.val[1], b_val.val[1]); ++- ++- // Multiply cross terms to get the imaginary result ++- // a0r*b0i|a1r*b1i|a2r*b2i|a3r*b3i ++- tmp_imag.val[0] = vmulq_f32(a_val.val[0], b_val.val[1]); ++- // a0i*b0r|a1i*b1r|a2i*b2r|a3i*b3r ++- tmp_imag.val[1] = vmulq_f32(a_val.val[1], b_val.val[0]); ++- ++- // store the results ++- c_val.val[0] = vsubq_f32(tmp_real.val[0], tmp_real.val[1]); ++- c_val.val[1] = vaddq_f32(tmp_imag.val[0], tmp_imag.val[1]); ++- vst2q_f32((float*)cVector, c_val); ++- ++- a_ptr += 4; ++- b_ptr += 4; ++- cVector += 4; ++- } ++- ++- for(number = quarter_points*4; number < num_points; number++){ ++- *cVector++ = (*a_ptr++) * (*b_ptr++); ++- } +++ lv_32fc_t* a_ptr = (lv_32fc_t*)aVector; +++ lv_32fc_t* b_ptr = (lv_32fc_t*)bVector; +++ unsigned int quarter_points = num_points / 4; +++ float32x4x2_t a_val, b_val, c_val; +++ float32x4x2_t tmp_real, tmp_imag; +++ unsigned int number = 0; +++ +++ for (number = 0; number < quarter_points; ++number) { +++ a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i +++ b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i +++ __VOLK_PREFETCH(a_ptr + 4); +++ __VOLK_PREFETCH(b_ptr + 4); +++ +++ // multiply the real*real and imag*imag to get real result +++ // a0r*b0r|a1r*b1r|a2r*b2r|a3r*b3r +++ tmp_real.val[0] = vmulq_f32(a_val.val[0], b_val.val[0]); +++ // a0i*b0i|a1i*b1i|a2i*b2i|a3i*b3i +++ tmp_real.val[1] = vmulq_f32(a_val.val[1], b_val.val[1]); +++ +++ // Multiply cross terms to get the imaginary result +++ // a0r*b0i|a1r*b1i|a2r*b2i|a3r*b3i +++ tmp_imag.val[0] = vmulq_f32(a_val.val[0], b_val.val[1]); +++ // a0i*b0r|a1i*b1r|a2i*b2r|a3i*b3r +++ tmp_imag.val[1] = vmulq_f32(a_val.val[1], b_val.val[0]); +++ +++ // store the results +++ c_val.val[0] = vsubq_f32(tmp_real.val[0], tmp_real.val[1]); +++ c_val.val[1] = vaddq_f32(tmp_imag.val[0], tmp_imag.val[1]); +++ vst2q_f32((float*)cVector, c_val); +++ +++ a_ptr += 4; +++ b_ptr += 4; +++ cVector += 4; +++ } +++ +++ for (number = quarter_points * 4; number < num_points; number++) { +++ *cVector++ = (*a_ptr++) * (*b_ptr++); +++ } ++ } ++ #endif /* LV_HAVE_NEON */ ++ ++ ++ #ifdef LV_HAVE_NEON ++ ++-static inline void ++-volk_32fc_x2_multiply_32fc_neon_opttests(lv_32fc_t* cVector, const lv_32fc_t* aVector, ++- const lv_32fc_t* bVector, unsigned int num_points) +++static inline void volk_32fc_x2_multiply_32fc_neon_opttests(lv_32fc_t* cVector, +++ const lv_32fc_t* aVector, +++ const lv_32fc_t* bVector, +++ unsigned int num_points) ++ { ++- lv_32fc_t *a_ptr = (lv_32fc_t*) aVector; ++- lv_32fc_t *b_ptr = (lv_32fc_t*) bVector; ++- unsigned int quarter_points = num_points / 4; ++- float32x4x2_t a_val, b_val; ++- float32x4x2_t tmp_imag; ++- unsigned int number = 0; ++- ++- for(number = 0; number < quarter_points; ++number) { ++- a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i ++- b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || 
b0i|b1i|b2i|b3i ++- __VOLK_PREFETCH(a_ptr+4); ++- __VOLK_PREFETCH(b_ptr+4); ++- ++- // do the first multiply ++- tmp_imag.val[1] = vmulq_f32(a_val.val[1], b_val.val[0]); ++- tmp_imag.val[0] = vmulq_f32(a_val.val[0], b_val.val[0]); ++- ++- // use multiply accumulate/subtract to get result ++- tmp_imag.val[1] = vmlaq_f32(tmp_imag.val[1], a_val.val[0], b_val.val[1]); ++- tmp_imag.val[0] = vmlsq_f32(tmp_imag.val[0], a_val.val[1], b_val.val[1]); ++- ++- // store ++- vst2q_f32((float*)cVector, tmp_imag); ++- // increment pointers ++- a_ptr += 4; ++- b_ptr += 4; ++- cVector += 4; ++- } ++- ++- for(number = quarter_points*4; number < num_points; number++){ ++- *cVector++ = (*a_ptr++) * (*b_ptr++); ++- } +++ lv_32fc_t* a_ptr = (lv_32fc_t*)aVector; +++ lv_32fc_t* b_ptr = (lv_32fc_t*)bVector; +++ unsigned int quarter_points = num_points / 4; +++ float32x4x2_t a_val, b_val; +++ float32x4x2_t tmp_imag; +++ unsigned int number = 0; +++ +++ for (number = 0; number < quarter_points; ++number) { +++ a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i +++ b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i +++ __VOLK_PREFETCH(a_ptr + 4); +++ __VOLK_PREFETCH(b_ptr + 4); +++ +++ // do the first multiply +++ tmp_imag.val[1] = vmulq_f32(a_val.val[1], b_val.val[0]); +++ tmp_imag.val[0] = vmulq_f32(a_val.val[0], b_val.val[0]); +++ +++ // use multiply accumulate/subtract to get result +++ tmp_imag.val[1] = vmlaq_f32(tmp_imag.val[1], a_val.val[0], b_val.val[1]); +++ tmp_imag.val[0] = vmlsq_f32(tmp_imag.val[0], a_val.val[1], b_val.val[1]); +++ +++ // store +++ vst2q_f32((float*)cVector, tmp_imag); +++ // increment pointers +++ a_ptr += 4; +++ b_ptr += 4; +++ cVector += 4; +++ } +++ +++ for (number = quarter_points * 4; number < num_points; number++) { +++ *cVector++ = (*a_ptr++) * (*b_ptr++); +++ } ++ } ++ #endif /* LV_HAVE_NEON */ ++ ++ ++ #ifdef LV_HAVE_NEONV7 ++ ++-extern void ++-volk_32fc_x2_multiply_32fc_a_neonasm(lv_32fc_t* cVector, const lv_32fc_t* aVector, ++- const lv_32fc_t* bVector, unsigned int num_points); +++extern void volk_32fc_x2_multiply_32fc_a_neonasm(lv_32fc_t* cVector, +++ const lv_32fc_t* aVector, +++ const lv_32fc_t* bVector, +++ unsigned int num_points); ++ #endif /* LV_HAVE_NEONV7 */ ++ ++ ++ #ifdef LV_HAVE_ORC ++ ++-extern void ++-volk_32fc_x2_multiply_32fc_a_orc_impl(lv_32fc_t* cVector, const lv_32fc_t* aVector, ++- const lv_32fc_t* bVector, unsigned int num_points); +++extern void volk_32fc_x2_multiply_32fc_a_orc_impl(lv_32fc_t* cVector, +++ const lv_32fc_t* aVector, +++ const lv_32fc_t* bVector, +++ unsigned int num_points); ++ ++-static inline void ++-volk_32fc_x2_multiply_32fc_u_orc(lv_32fc_t* cVector, const lv_32fc_t* aVector, ++- const lv_32fc_t* bVector, unsigned int num_points) +++static inline void volk_32fc_x2_multiply_32fc_u_orc(lv_32fc_t* cVector, +++ const lv_32fc_t* aVector, +++ const lv_32fc_t* bVector, +++ unsigned int num_points) ++ { ++- volk_32fc_x2_multiply_32fc_a_orc_impl(cVector, aVector, bVector, num_points); +++ volk_32fc_x2_multiply_32fc_a_orc_impl(cVector, aVector, bVector, num_points); ++ } ++ ++ #endif /* LV_HAVE_ORC */ ++diff --git a/kernels/volk/volk_32fc_x2_multiply_conjugate_32fc.h b/kernels/volk/volk_32fc_x2_multiply_conjugate_32fc.h ++index 1b1a8b3..4f834c2 100644 ++--- a/kernels/volk/volk_32fc_x2_multiply_conjugate_32fc.h +++++ b/kernels/volk/volk_32fc_x2_multiply_conjugate_32fc.h ++@@ -30,8 +30,8 @@ ++ * ++ * Dispatcher Prototype ++ * \code ++- * void volk_32fc_x2_multiply_conjugate_32fc(lv_32fc_t* cVector, const 
lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points); ++- * \endcode +++ * void volk_32fc_x2_multiply_conjugate_32fc(lv_32fc_t* cVector, const lv_32fc_t* aVector, +++ * const lv_32fc_t* bVector, unsigned int num_points); \endcode ++ * ++ * \b Inputs ++ * \li aVector: The first input vector of complex floats. ++@@ -71,43 +71,46 @@ ++ #ifndef INCLUDED_volk_32fc_x2_multiply_conjugate_32fc_u_H ++ #define INCLUDED_volk_32fc_x2_multiply_conjugate_32fc_u_H ++ +++#include ++ #include ++ #include ++ #include ++-#include ++ ++ #ifdef LV_HAVE_AVX ++ #include ++ #include ++ ++-static inline void ++-volk_32fc_x2_multiply_conjugate_32fc_u_avx(lv_32fc_t* cVector, const lv_32fc_t* aVector, ++- const lv_32fc_t* bVector, unsigned int num_points) +++static inline void volk_32fc_x2_multiply_conjugate_32fc_u_avx(lv_32fc_t* cVector, +++ const lv_32fc_t* aVector, +++ const lv_32fc_t* bVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int quarterPoints = num_points / 4; ++- ++- __m256 x, y, z; ++- lv_32fc_t* c = cVector; ++- const lv_32fc_t* a = aVector; ++- const lv_32fc_t* b = bVector; ++- ++- for(; number < quarterPoints; number++){ ++- x = _mm256_loadu_ps((float*) a); // Load the ar + ai, br + bi ... as ar,ai,br,bi ... ++- y = _mm256_loadu_ps((float*) b); // Load the cr + ci, dr + di ... as cr,ci,dr,di ... ++- z = _mm256_complexconjugatemul_ps(x, y); ++- _mm256_storeu_ps((float*) c, z); // Store the results back into the C container ++- ++- a += 4; ++- b += 4; ++- c += 4; ++- } ++- ++- number = quarterPoints * 4; ++- ++- for(; number < num_points; number++){ ++- *c++ = (*a++) * lv_conj(*b++); ++- } +++ unsigned int number = 0; +++ const unsigned int quarterPoints = num_points / 4; +++ +++ __m256 x, y, z; +++ lv_32fc_t* c = cVector; +++ const lv_32fc_t* a = aVector; +++ const lv_32fc_t* b = bVector; +++ +++ for (; number < quarterPoints; number++) { +++ x = _mm256_loadu_ps( +++ (float*)a); // Load the ar + ai, br + bi ... as ar,ai,br,bi ... +++ y = _mm256_loadu_ps( +++ (float*)b); // Load the cr + ci, dr + di ... as cr,ci,dr,di ... 
+++ z = _mm256_complexconjugatemul_ps(x, y); +++ _mm256_storeu_ps((float*)c, z); // Store the results back into the C container +++ +++ a += 4; +++ b += 4; +++ c += 4; +++ } +++ +++ number = quarterPoints * 4; +++ +++ for (; number < num_points; number++) { +++ *c++ = (*a++) * lv_conj(*b++); +++ } ++ } ++ #endif /* LV_HAVE_AVX */ ++ ++@@ -116,96 +119,98 @@ volk_32fc_x2_multiply_conjugate_32fc_u_avx(lv_32fc_t* cVector, const lv_32fc_t* ++ #include ++ #include ++ ++-static inline void ++-volk_32fc_x2_multiply_conjugate_32fc_u_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, ++- const lv_32fc_t* bVector, unsigned int num_points) +++static inline void volk_32fc_x2_multiply_conjugate_32fc_u_sse3(lv_32fc_t* cVector, +++ const lv_32fc_t* aVector, +++ const lv_32fc_t* bVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int halfPoints = num_points / 2; ++- ++- __m128 x, y, z; ++- lv_32fc_t* c = cVector; ++- const lv_32fc_t* a = aVector; ++- const lv_32fc_t* b = bVector; ++- ++- for(; number < halfPoints; number++){ ++- x = _mm_loadu_ps((float*) a); // Load the ar + ai, br + bi as ar,ai,br,bi ++- y = _mm_loadu_ps((float*) b); // Load the cr + ci, dr + di as cr,ci,dr,di ++- z = _mm_complexconjugatemul_ps(x, y); ++- _mm_storeu_ps((float*) c, z); // Store the results back into the C container ++- ++- a += 2; ++- b += 2; ++- c += 2; ++- } ++- ++- if((num_points % 2) != 0){ ++- *c = (*a) * lv_conj(*b); ++- } +++ unsigned int number = 0; +++ const unsigned int halfPoints = num_points / 2; +++ +++ __m128 x, y, z; +++ lv_32fc_t* c = cVector; +++ const lv_32fc_t* a = aVector; +++ const lv_32fc_t* b = bVector; +++ +++ for (; number < halfPoints; number++) { +++ x = _mm_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi +++ y = _mm_loadu_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di +++ z = _mm_complexconjugatemul_ps(x, y); +++ _mm_storeu_ps((float*)c, z); // Store the results back into the C container +++ +++ a += 2; +++ b += 2; +++ c += 2; +++ } +++ +++ if ((num_points % 2) != 0) { +++ *c = (*a) * lv_conj(*b); +++ } ++ } ++ #endif /* LV_HAVE_SSE */ ++ ++ ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void ++-volk_32fc_x2_multiply_conjugate_32fc_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, ++- const lv_32fc_t* bVector, unsigned int num_points) +++static inline void volk_32fc_x2_multiply_conjugate_32fc_generic(lv_32fc_t* cVector, +++ const lv_32fc_t* aVector, +++ const lv_32fc_t* bVector, +++ unsigned int num_points) ++ { ++- lv_32fc_t* cPtr = cVector; ++- const lv_32fc_t* aPtr = aVector; ++- const lv_32fc_t* bPtr= bVector; ++- unsigned int number = 0; ++- ++- for(number = 0; number < num_points; number++){ ++- *cPtr++ = (*aPtr++) * lv_conj(*bPtr++); ++- } +++ lv_32fc_t* cPtr = cVector; +++ const lv_32fc_t* aPtr = aVector; +++ const lv_32fc_t* bPtr = bVector; +++ unsigned int number = 0; +++ +++ for (number = 0; number < num_points; number++) { +++ *cPtr++ = (*aPtr++) * lv_conj(*bPtr++); +++ } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++ ++- ++ #endif /* INCLUDED_volk_32fc_x2_multiply_conjugate_32fc_u_H */ ++ #ifndef INCLUDED_volk_32fc_x2_multiply_conjugate_32fc_a_H ++ #define INCLUDED_volk_32fc_x2_multiply_conjugate_32fc_a_H ++ +++#include ++ #include ++ #include ++ #include ++-#include ++ ++ #ifdef LV_HAVE_AVX ++ #include ++ #include ++ ++-static inline void ++-volk_32fc_x2_multiply_conjugate_32fc_a_avx(lv_32fc_t* cVector, const lv_32fc_t* aVector, ++- const lv_32fc_t* bVector, unsigned int num_points) +++static inline void 
volk_32fc_x2_multiply_conjugate_32fc_a_avx(lv_32fc_t* cVector, +++ const lv_32fc_t* aVector, +++ const lv_32fc_t* bVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int quarterPoints = num_points / 4; ++- ++- __m256 x, y, z; ++- lv_32fc_t* c = cVector; ++- const lv_32fc_t* a = aVector; ++- const lv_32fc_t* b = bVector; ++- ++- for(; number < quarterPoints; number++){ ++- x = _mm256_load_ps((float*) a); // Load the ar + ai, br + bi ... as ar,ai,br,bi ... ++- y = _mm256_load_ps((float*) b); // Load the cr + ci, dr + di ... as cr,ci,dr,di ... ++- z = _mm256_complexconjugatemul_ps(x, y); ++- _mm256_store_ps((float*) c, z); // Store the results back into the C container ++- ++- a += 4; ++- b += 4; ++- c += 4; ++- } ++- ++- number = quarterPoints * 4; ++- ++- for(; number < num_points; number++){ ++- *c++ = (*a++) * lv_conj(*b++); ++- } +++ unsigned int number = 0; +++ const unsigned int quarterPoints = num_points / 4; +++ +++ __m256 x, y, z; +++ lv_32fc_t* c = cVector; +++ const lv_32fc_t* a = aVector; +++ const lv_32fc_t* b = bVector; +++ +++ for (; number < quarterPoints; number++) { +++ x = _mm256_load_ps((float*)a); // Load the ar + ai, br + bi ... as ar,ai,br,bi ... +++ y = _mm256_load_ps((float*)b); // Load the cr + ci, dr + di ... as cr,ci,dr,di ... +++ z = _mm256_complexconjugatemul_ps(x, y); +++ _mm256_store_ps((float*)c, z); // Store the results back into the C container +++ +++ a += 4; +++ b += 4; +++ c += 4; +++ } +++ +++ number = quarterPoints * 4; +++ +++ for (; number < num_points; number++) { +++ *c++ = (*a++) * lv_conj(*b++); +++ } ++ } ++ #endif /* LV_HAVE_AVX */ ++ ++@@ -214,32 +219,33 @@ volk_32fc_x2_multiply_conjugate_32fc_a_avx(lv_32fc_t* cVector, const lv_32fc_t* ++ #include ++ #include ++ ++-static inline void ++-volk_32fc_x2_multiply_conjugate_32fc_a_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, ++- const lv_32fc_t* bVector, unsigned int num_points) +++static inline void volk_32fc_x2_multiply_conjugate_32fc_a_sse3(lv_32fc_t* cVector, +++ const lv_32fc_t* aVector, +++ const lv_32fc_t* bVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int halfPoints = num_points / 2; ++- ++- __m128 x, y, z; ++- lv_32fc_t* c = cVector; ++- const lv_32fc_t* a = aVector; ++- const lv_32fc_t* b = bVector; ++- ++- for(; number < halfPoints; number++){ ++- x = _mm_load_ps((float*) a); // Load the ar + ai, br + bi as ar,ai,br,bi ++- y = _mm_load_ps((float*) b); // Load the cr + ci, dr + di as cr,ci,dr,di ++- z = _mm_complexconjugatemul_ps(x, y); ++- _mm_store_ps((float*) c, z); // Store the results back into the C container ++- ++- a += 2; ++- b += 2; ++- c += 2; ++- } ++- ++- if((num_points % 2) != 0){ ++- *c = (*a) * lv_conj(*b); ++- } +++ unsigned int number = 0; +++ const unsigned int halfPoints = num_points / 2; +++ +++ __m128 x, y, z; +++ lv_32fc_t* c = cVector; +++ const lv_32fc_t* a = aVector; +++ const lv_32fc_t* b = bVector; +++ +++ for (; number < halfPoints; number++) { +++ x = _mm_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi +++ y = _mm_load_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di +++ z = _mm_complexconjugatemul_ps(x, y); +++ _mm_store_ps((float*)c, z); // Store the results back into the C container +++ +++ a += 2; +++ b += 2; +++ c += 2; +++ } +++ +++ if ((num_points % 2) != 0) { +++ *c = (*a) * lv_conj(*b); +++ } ++ } ++ #endif /* LV_HAVE_SSE */ ++ ++@@ -247,49 +253,50 @@ volk_32fc_x2_multiply_conjugate_32fc_a_sse3(lv_32fc_t* cVector, const lv_32fc_t* ++ 
#ifdef LV_HAVE_NEON ++ #include ++ ++-static inline void ++-volk_32fc_x2_multiply_conjugate_32fc_neon(lv_32fc_t* cVector, const lv_32fc_t* aVector, ++- const lv_32fc_t* bVector, unsigned int num_points) +++static inline void volk_32fc_x2_multiply_conjugate_32fc_neon(lv_32fc_t* cVector, +++ const lv_32fc_t* aVector, +++ const lv_32fc_t* bVector, +++ unsigned int num_points) ++ { ++- lv_32fc_t *a_ptr = (lv_32fc_t*) aVector; ++- lv_32fc_t *b_ptr = (lv_32fc_t*) bVector; ++- unsigned int quarter_points = num_points / 4; ++- float32x4x2_t a_val, b_val, c_val; ++- float32x4x2_t tmp_real, tmp_imag; ++- unsigned int number = 0; ++- ++- for(number = 0; number < quarter_points; ++number) { ++- a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i ++- b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i ++- b_val.val[1] = vnegq_f32(b_val.val[1]); ++- __VOLK_PREFETCH(a_ptr+4); ++- __VOLK_PREFETCH(b_ptr+4); ++- ++- // multiply the real*real and imag*imag to get real result ++- // a0r*b0r|a1r*b1r|a2r*b2r|a3r*b3r ++- tmp_real.val[0] = vmulq_f32(a_val.val[0], b_val.val[0]); ++- // a0i*b0i|a1i*b1i|a2i*b2i|a3i*b3i ++- tmp_real.val[1] = vmulq_f32(a_val.val[1], b_val.val[1]); ++- ++- // Multiply cross terms to get the imaginary result +++ lv_32fc_t* a_ptr = (lv_32fc_t*)aVector; +++ lv_32fc_t* b_ptr = (lv_32fc_t*)bVector; +++ unsigned int quarter_points = num_points / 4; +++ float32x4x2_t a_val, b_val, c_val; +++ float32x4x2_t tmp_real, tmp_imag; +++ unsigned int number = 0; +++ +++ for (number = 0; number < quarter_points; ++number) { +++ a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i +++ b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i +++ b_val.val[1] = vnegq_f32(b_val.val[1]); +++ __VOLK_PREFETCH(a_ptr + 4); +++ __VOLK_PREFETCH(b_ptr + 4); +++ +++ // multiply the real*real and imag*imag to get real result +++ // a0r*b0r|a1r*b1r|a2r*b2r|a3r*b3r +++ tmp_real.val[0] = vmulq_f32(a_val.val[0], b_val.val[0]); +++ // a0i*b0i|a1i*b1i|a2i*b2i|a3i*b3i +++ tmp_real.val[1] = vmulq_f32(a_val.val[1], b_val.val[1]); +++ +++ // Multiply cross terms to get the imaginary result ++ // a0r*b0i|a1r*b1i|a2r*b2i|a3r*b3i ++- tmp_imag.val[0] = vmulq_f32(a_val.val[0], b_val.val[1]); ++- // a0i*b0r|a1i*b1r|a2i*b2r|a3i*b3r ++- tmp_imag.val[1] = vmulq_f32(a_val.val[1], b_val.val[0]); ++- ++- // store the results ++- c_val.val[0] = vsubq_f32(tmp_real.val[0], tmp_real.val[1]); ++- c_val.val[1] = vaddq_f32(tmp_imag.val[0], tmp_imag.val[1]); ++- vst2q_f32((float*)cVector, c_val); ++- ++- a_ptr += 4; ++- b_ptr += 4; ++- cVector += 4; +++ tmp_imag.val[0] = vmulq_f32(a_val.val[0], b_val.val[1]); +++ // a0i*b0r|a1i*b1r|a2i*b2r|a3i*b3r +++ tmp_imag.val[1] = vmulq_f32(a_val.val[1], b_val.val[0]); +++ +++ // store the results +++ c_val.val[0] = vsubq_f32(tmp_real.val[0], tmp_real.val[1]); +++ c_val.val[1] = vaddq_f32(tmp_imag.val[0], tmp_imag.val[1]); +++ vst2q_f32((float*)cVector, c_val); +++ +++ a_ptr += 4; +++ b_ptr += 4; +++ cVector += 4; ++ } ++ ++- for(number = quarter_points*4; number < num_points; number++){ ++- *cVector++ = (*a_ptr++) * conj(*b_ptr++); ++- } +++ for (number = quarter_points * 4; number < num_points; number++) { +++ *cVector++ = (*a_ptr++) * conj(*b_ptr++); +++ } ++ } ++ #endif /* LV_HAVE_NEON */ ++ ++@@ -297,17 +304,19 @@ volk_32fc_x2_multiply_conjugate_32fc_neon(lv_32fc_t* cVector, const lv_32fc_t* a ++ #ifdef LV_HAVE_GENERIC ++ ++ static inline void ++-volk_32fc_x2_multiply_conjugate_32fc_a_generic(lv_32fc_t* cVector, const lv_32fc_t* 
aVector, ++- const lv_32fc_t* bVector, unsigned int num_points) +++volk_32fc_x2_multiply_conjugate_32fc_a_generic(lv_32fc_t* cVector, +++ const lv_32fc_t* aVector, +++ const lv_32fc_t* bVector, +++ unsigned int num_points) ++ { ++- lv_32fc_t* cPtr = cVector; ++- const lv_32fc_t* aPtr = aVector; ++- const lv_32fc_t* bPtr= bVector; ++- unsigned int number = 0; ++- ++- for(number = 0; number < num_points; number++){ ++- *cPtr++ = (*aPtr++) * lv_conj(*bPtr++); ++- } +++ lv_32fc_t* cPtr = cVector; +++ const lv_32fc_t* aPtr = aVector; +++ const lv_32fc_t* bPtr = bVector; +++ unsigned int number = 0; +++ +++ for (number = 0; number < num_points; number++) { +++ *cPtr++ = (*aPtr++) * lv_conj(*bPtr++); +++ } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++diff --git a/kernels/volk/volk_32fc_x2_s32f_square_dist_scalar_mult_32f.h b/kernels/volk/volk_32fc_x2_s32f_square_dist_scalar_mult_32f.h ++index 1c65f23..1d10561 100644 ++--- a/kernels/volk/volk_32fc_x2_s32f_square_dist_scalar_mult_32f.h +++++ b/kernels/volk/volk_32fc_x2_s32f_square_dist_scalar_mult_32f.h ++@@ -30,8 +30,8 @@ ++ * ++ * Dispatcher Prototype ++ * \code ++- * void volk_32fc_x2_s32f_square_dist_scalar_mult_32f(float* target, lv_32fc_t* src0, lv_32fc_t* points, float scalar, unsigned int num_points) ++- * \endcode +++ * void volk_32fc_x2_s32f_square_dist_scalar_mult_32f(float* target, lv_32fc_t* src0, +++ * lv_32fc_t* points, float scalar, unsigned int num_points) \endcode ++ * ++ * \b Inputs ++ * \li src0: The complex input. Only the first point is used. ++@@ -79,103 +79,107 @@ ++ #ifndef INCLUDED_volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_H ++ #define INCLUDED_volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_H ++ ++-#include +++#include ++ ++ ++-static inline void ++-calculate_scaled_distances(float* target, const lv_32fc_t symbol, const lv_32fc_t* points, ++- const float scalar, const unsigned int num_points) +++static inline void calculate_scaled_distances(float* target, +++ const lv_32fc_t symbol, +++ const lv_32fc_t* points, +++ const float scalar, +++ const unsigned int num_points) ++ { ++- lv_32fc_t diff; ++- for(unsigned int i = 0; i < num_points; ++i) { ++- /* ++- * Calculate: |y - x|^2 * SNR_lin ++- * Compare C++: *target++ = scalar * std::norm(symbol - *constellation++); ++- */ ++- diff = symbol - *points++; ++- *target++ = scalar * (lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff)); ++- } +++ lv_32fc_t diff; +++ for (unsigned int i = 0; i < num_points; ++i) { +++ /* +++ * Calculate: |y - x|^2 * SNR_lin +++ * Compare C++: *target++ = scalar * std::norm(symbol - *constellation++); +++ */ +++ diff = symbol - *points++; +++ *target++ = +++ scalar * (lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff)); +++ } ++ } ++ ++ ++ #ifdef LV_HAVE_AVX2 ++-#include ++-#include +++#include +++#include ++ ++ static inline void ++-volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_avx2(float* target, lv_32fc_t* src0, ++- lv_32fc_t* points, float scalar, +++volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_avx2(float* target, +++ lv_32fc_t* src0, +++ lv_32fc_t* points, +++ float scalar, ++ unsigned int num_points) ++ { ++- const unsigned int num_bytes = num_points*8; ++- __m128 xmm9, xmm10; ++- __m256 xmm4, xmm6; ++- __m256 xmm_points0, xmm_points1, xmm_result; +++ const unsigned int num_bytes = num_points * 8; +++ __m128 xmm9, xmm10; +++ __m256 xmm4, xmm6; +++ __m256 xmm_points0, xmm_points1, xmm_result; ++ ++- const unsigned int bound = num_bytes >> 6; ++- ++- // load complex value into all parts of the register. 
++- const __m256 xmm_symbol = _mm256_castpd_ps(_mm256_broadcast_sd((const double*)src0)); ++- const __m128 xmm128_symbol = _mm256_extractf128_ps(xmm_symbol, 1); ++- ++- // Load scalar into all 8 parts of the register ++- const __m256 xmm_scalar = _mm256_broadcast_ss(&scalar); ++- const __m128 xmm128_scalar = _mm256_extractf128_ps(xmm_scalar, 1); +++ const unsigned int bound = num_bytes >> 6; ++ ++- // Set permutation constant ++- const __m256i idx = _mm256_set_epi32(7,6,3,2,5,4,1,0); ++- ++- for(unsigned int i = 0; i < bound; ++i) { ++- xmm_points0 = _mm256_load_ps((float*)points); ++- xmm_points1 = _mm256_load_ps((float*)(points + 4)); ++- points += 8; ++- __VOLK_PREFETCH(points); +++ // load complex value into all parts of the register. +++ const __m256 xmm_symbol = _mm256_castpd_ps(_mm256_broadcast_sd((const double*)src0)); +++ const __m128 xmm128_symbol = _mm256_extractf128_ps(xmm_symbol, 1); ++ ++- xmm_result = _mm256_scaled_norm_dist_ps_avx2(xmm_symbol, xmm_symbol, ++- xmm_points0, xmm_points1, ++- xmm_scalar); ++- ++- _mm256_store_ps(target, xmm_result); ++- target += 8; ++- } +++ // Load scalar into all 8 parts of the register +++ const __m256 xmm_scalar = _mm256_broadcast_ss(&scalar); +++ const __m128 xmm128_scalar = _mm256_extractf128_ps(xmm_scalar, 1); ++ ++- if (num_bytes >> 5 & 1) { ++- xmm_points0 = _mm256_load_ps((float*)points); +++ // Set permutation constant +++ const __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0); ++ ++- xmm4 = _mm256_sub_ps(xmm_symbol, xmm_points0); +++ for (unsigned int i = 0; i < bound; ++i) { +++ xmm_points0 = _mm256_load_ps((float*)points); +++ xmm_points1 = _mm256_load_ps((float*)(points + 4)); +++ points += 8; +++ __VOLK_PREFETCH(points); ++ ++- points += 4; +++ xmm_result = _mm256_scaled_norm_dist_ps_avx2( +++ xmm_symbol, xmm_symbol, xmm_points0, xmm_points1, xmm_scalar); ++ ++- xmm6 = _mm256_mul_ps(xmm4, xmm4); +++ _mm256_store_ps(target, xmm_result); +++ target += 8; +++ } ++ ++- xmm4 = _mm256_hadd_ps(xmm6, xmm6); ++- xmm4 = _mm256_permutevar8x32_ps(xmm4, idx); +++ if (num_bytes >> 5 & 1) { +++ xmm_points0 = _mm256_load_ps((float*)points); ++ ++- xmm_result = _mm256_mul_ps(xmm4, xmm_scalar); +++ xmm4 = _mm256_sub_ps(xmm_symbol, xmm_points0); ++ ++- xmm9 = _mm256_extractf128_ps(xmm_result, 1); ++- _mm_store_ps(target,xmm9); ++- target += 4; ++- } +++ points += 4; ++ ++- if (num_bytes >> 4 & 1) { ++- xmm9 = _mm_load_ps((float*)points); +++ xmm6 = _mm256_mul_ps(xmm4, xmm4); ++ ++- xmm10 = _mm_sub_ps(xmm128_symbol, xmm9); +++ xmm4 = _mm256_hadd_ps(xmm6, xmm6); +++ xmm4 = _mm256_permutevar8x32_ps(xmm4, idx); ++ ++- points += 2; +++ xmm_result = _mm256_mul_ps(xmm4, xmm_scalar); ++ ++- xmm9 = _mm_mul_ps(xmm10, xmm10); +++ xmm9 = _mm256_extractf128_ps(xmm_result, 1); +++ _mm_store_ps(target, xmm9); +++ target += 4; +++ } ++ ++- xmm10 = _mm_hadd_ps(xmm9, xmm9); +++ if (num_bytes >> 4 & 1) { +++ xmm9 = _mm_load_ps((float*)points); ++ ++- xmm10 = _mm_mul_ps(xmm10, xmm128_scalar); +++ xmm10 = _mm_sub_ps(xmm128_symbol, xmm9); ++ ++- _mm_storeh_pi((__m64*)target, xmm10); ++- target += 2; ++- } +++ points += 2; ++ ++- calculate_scaled_distances(target, src0[0], points, scalar, (num_bytes >> 3) & 1); +++ xmm9 = _mm_mul_ps(xmm10, xmm10); +++ +++ xmm10 = _mm_hadd_ps(xmm9, xmm9); +++ +++ xmm10 = _mm_mul_ps(xmm10, xmm128_scalar); +++ +++ _mm_storeh_pi((__m64*)target, xmm10); +++ target += 2; +++ } +++ +++ calculate_scaled_distances(target, src0[0], points, scalar, (num_bytes >> 3) & 1); ++ } ++ ++ #endif /*LV_HAVE_AVX2*/ ++@@ -186,131 +190,139 @@ 
volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_avx2(float* target, lv_32fc_t* s ++ #include ++ ++ static inline void ++-volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_avx(float *target, lv_32fc_t *src0, ++- lv_32fc_t *points, float scalar, ++- unsigned int num_points) { ++- const int eightsPoints = num_points / 8; ++- const int remainder = num_points - 8 * eightsPoints; ++- ++- __m256 xmm_points0, xmm_points1, xmm_result; ++- ++- // load complex value into all parts of the register. ++- const __m256 xmm_symbol = _mm256_castpd_ps(_mm256_broadcast_sd((const double*)src0)); ++- ++- // Load scalar into all 8 parts of the register ++- const __m256 xmm_scalar = _mm256_broadcast_ss(&scalar); ++- ++- for(int i = 0; i < eightsPoints; ++i){ ++- xmm_points0 = _mm256_load_ps((float*)points); ++- xmm_points1 = _mm256_load_ps((float*)(points + 4)); ++- points += 8; ++- ++- xmm_result = _mm256_scaled_norm_dist_ps(xmm_symbol, xmm_symbol, xmm_points0, ++- xmm_points1, xmm_scalar); ++- ++- _mm256_store_ps(target, xmm_result); ++- target += 8; ++- } ++- ++- const lv_32fc_t symbol = *src0; ++- calculate_scaled_distances(target, symbol, points, scalar, remainder); +++volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_avx(float* target, +++ lv_32fc_t* src0, +++ lv_32fc_t* points, +++ float scalar, +++ unsigned int num_points) +++{ +++ const int eightsPoints = num_points / 8; +++ const int remainder = num_points - 8 * eightsPoints; +++ +++ __m256 xmm_points0, xmm_points1, xmm_result; +++ +++ // load complex value into all parts of the register. +++ const __m256 xmm_symbol = _mm256_castpd_ps(_mm256_broadcast_sd((const double*)src0)); +++ +++ // Load scalar into all 8 parts of the register +++ const __m256 xmm_scalar = _mm256_broadcast_ss(&scalar); +++ +++ for (int i = 0; i < eightsPoints; ++i) { +++ xmm_points0 = _mm256_load_ps((float*)points); +++ xmm_points1 = _mm256_load_ps((float*)(points + 4)); +++ points += 8; +++ +++ xmm_result = _mm256_scaled_norm_dist_ps( +++ xmm_symbol, xmm_symbol, xmm_points0, xmm_points1, xmm_scalar); +++ +++ _mm256_store_ps(target, xmm_result); +++ target += 8; +++ } +++ +++ const lv_32fc_t symbol = *src0; +++ calculate_scaled_distances(target, symbol, points, scalar, remainder); ++ } ++ ++ #endif /* LV_HAVE_AVX */ ++ ++ ++ #ifdef LV_HAVE_SSE3 ++-#include ++-#include +++#include +++#include ++ ++ static inline void ++-volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_sse3(float* target, lv_32fc_t* src0, ++- lv_32fc_t* points, float scalar, +++volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_sse3(float* target, +++ lv_32fc_t* src0, +++ lv_32fc_t* points, +++ float scalar, ++ unsigned int num_points) ++ { ++- __m128 xmm_points0, xmm_points1, xmm_result; ++- ++- /* ++- * First do 4 values in every loop iteration. ++- * There may be up to 3 values left. ++- * leftovers0 indicates if at least 2 more are available for SSE execution. ++- * leftovers1 indicates if there is a single element left. ++- */ ++- const int quarterPoints = num_points / 4; ++- const int leftovers0 = (num_points / 2) - 2 * quarterPoints; ++- const int leftovers1 = num_points % 2; ++- ++- // load complex value into both parts of the register. 
++- const __m128 xmm_symbol = _mm_castpd_ps(_mm_load1_pd((const double*)src0)); ++- ++- // Load scalar into all 4 parts of the register ++- const __m128 xmm_scalar = _mm_load1_ps(&scalar); ++- ++- for(int i = 0; i < quarterPoints; ++i) { ++- xmm_points0 = _mm_load_ps((float*)points); ++- xmm_points1 = _mm_load_ps((float*)(points + 2)); ++- points += 4; ++- __VOLK_PREFETCH(points); ++- // calculate distances ++- xmm_result = _mm_scaled_norm_dist_ps_sse3(xmm_symbol, xmm_symbol, xmm_points0, ++- xmm_points1, xmm_scalar); ++- ++- _mm_store_ps(target, xmm_result); ++- target += 4; ++- } ++- ++- for(int i = 0; i < leftovers0; ++i) { ++- xmm_points0 = _mm_load_ps((float*)points); ++- points += 2; ++- ++- xmm_points0 = _mm_sub_ps(xmm_symbol, xmm_points0); ++- xmm_points0 = _mm_mul_ps(xmm_points0, xmm_points0); ++- xmm_points0 = _mm_hadd_ps(xmm_points0, xmm_points0); ++- xmm_result = _mm_mul_ps(xmm_points0, xmm_scalar); ++- ++- _mm_storeh_pi((__m64*)target, xmm_result); ++- target += 2; ++- } ++- ++- calculate_scaled_distances(target, src0[0], points, scalar, leftovers1); +++ __m128 xmm_points0, xmm_points1, xmm_result; +++ +++ /* +++ * First do 4 values in every loop iteration. +++ * There may be up to 3 values left. +++ * leftovers0 indicates if at least 2 more are available for SSE execution. +++ * leftovers1 indicates if there is a single element left. +++ */ +++ const int quarterPoints = num_points / 4; +++ const int leftovers0 = (num_points / 2) - 2 * quarterPoints; +++ const int leftovers1 = num_points % 2; +++ +++ // load complex value into both parts of the register. +++ const __m128 xmm_symbol = _mm_castpd_ps(_mm_load1_pd((const double*)src0)); +++ +++ // Load scalar into all 4 parts of the register +++ const __m128 xmm_scalar = _mm_load1_ps(&scalar); +++ +++ for (int i = 0; i < quarterPoints; ++i) { +++ xmm_points0 = _mm_load_ps((float*)points); +++ xmm_points1 = _mm_load_ps((float*)(points + 2)); +++ points += 4; +++ __VOLK_PREFETCH(points); +++ // calculate distances +++ xmm_result = _mm_scaled_norm_dist_ps_sse3( +++ xmm_symbol, xmm_symbol, xmm_points0, xmm_points1, xmm_scalar); +++ +++ _mm_store_ps(target, xmm_result); +++ target += 4; +++ } +++ +++ for (int i = 0; i < leftovers0; ++i) { +++ xmm_points0 = _mm_load_ps((float*)points); +++ points += 2; +++ +++ xmm_points0 = _mm_sub_ps(xmm_symbol, xmm_points0); +++ xmm_points0 = _mm_mul_ps(xmm_points0, xmm_points0); +++ xmm_points0 = _mm_hadd_ps(xmm_points0, xmm_points0); +++ xmm_result = _mm_mul_ps(xmm_points0, xmm_scalar); +++ +++ _mm_storeh_pi((__m64*)target, xmm_result); +++ target += 2; +++ } +++ +++ calculate_scaled_distances(target, src0[0], points, scalar, leftovers1); ++ } ++ ++ #endif /*LV_HAVE_SSE3*/ ++ ++ #ifdef LV_HAVE_SSE ++-#include ++ #include +++#include ++ static inline void ++-volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_sse(float* target, lv_32fc_t* src0, ++- lv_32fc_t* points, float scalar, +++volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_sse(float* target, +++ lv_32fc_t* src0, +++ lv_32fc_t* points, +++ float scalar, ++ unsigned int num_points) ++ { ++- const __m128 xmm_scalar = _mm_set1_ps(scalar); ++- const __m128 xmm_symbol = _mm_castpd_ps(_mm_load1_pd((const double*)src0)); ++- ++- for (unsigned i = 0; i < num_points / 4; ++i) { ++- __m128 xmm_points0 = _mm_load_ps((float *) points); ++- __m128 xmm_points1 = _mm_load_ps((float *) (points + 2)); ++- points += 4; ++- __m128 xmm_result = _mm_scaled_norm_dist_ps_sse(xmm_symbol, xmm_symbol, ++- xmm_points0, xmm_points1, ++- xmm_scalar); ++- _mm_store_ps((float 
*) target, xmm_result); ++- target += 4; ++- } ++- ++- calculate_scaled_distances(target, src0[0], points, scalar, num_points % 4); +++ const __m128 xmm_scalar = _mm_set1_ps(scalar); +++ const __m128 xmm_symbol = _mm_castpd_ps(_mm_load1_pd((const double*)src0)); +++ +++ for (unsigned i = 0; i < num_points / 4; ++i) { +++ __m128 xmm_points0 = _mm_load_ps((float*)points); +++ __m128 xmm_points1 = _mm_load_ps((float*)(points + 2)); +++ points += 4; +++ __m128 xmm_result = _mm_scaled_norm_dist_ps_sse( +++ xmm_symbol, xmm_symbol, xmm_points0, xmm_points1, xmm_scalar); +++ _mm_store_ps((float*)target, xmm_result); +++ target += 4; +++ } +++ +++ calculate_scaled_distances(target, src0[0], points, scalar, num_points % 4); ++ } ++ #endif // LV_HAVE_SSE ++ ++ #ifdef LV_HAVE_GENERIC ++ static inline void ++-volk_32fc_x2_s32f_square_dist_scalar_mult_32f_generic(float* target, lv_32fc_t* src0, ++- lv_32fc_t* points, float scalar, +++volk_32fc_x2_s32f_square_dist_scalar_mult_32f_generic(float* target, +++ lv_32fc_t* src0, +++ lv_32fc_t* points, +++ float scalar, ++ unsigned int num_points) ++ { ++- const lv_32fc_t symbol = *src0; ++- calculate_scaled_distances(target, symbol, points, scalar, num_points); +++ const lv_32fc_t symbol = *src0; +++ calculate_scaled_distances(target, symbol, points, scalar, num_points); ++ } ++ ++ #endif /*LV_HAVE_GENERIC*/ ++@@ -321,87 +333,88 @@ volk_32fc_x2_s32f_square_dist_scalar_mult_32f_generic(float* target, lv_32fc_t* ++ #ifndef INCLUDED_volk_32fc_x2_s32f_square_dist_scalar_mult_32f_u_H ++ #define INCLUDED_volk_32fc_x2_s32f_square_dist_scalar_mult_32f_u_H ++ ++-#include +++#include ++ ++ ++ #ifdef LV_HAVE_AVX2 ++-#include +++#include ++ #include ++ ++ static inline void ++-volk_32fc_x2_s32f_square_dist_scalar_mult_32f_u_avx2(float* target, lv_32fc_t* src0, ++- lv_32fc_t* points, float scalar, +++volk_32fc_x2_s32f_square_dist_scalar_mult_32f_u_avx2(float* target, +++ lv_32fc_t* src0, +++ lv_32fc_t* points, +++ float scalar, ++ unsigned int num_points) ++ { ++- const unsigned int num_bytes = num_points*8; ++- __m128 xmm9, xmm10; ++- __m256 xmm4, xmm6; ++- __m256 xmm_points0, xmm_points1, xmm_result; +++ const unsigned int num_bytes = num_points * 8; +++ __m128 xmm9, xmm10; +++ __m256 xmm4, xmm6; +++ __m256 xmm_points0, xmm_points1, xmm_result; +++ +++ const unsigned int bound = num_bytes >> 6; +++ +++ // load complex value into all parts of the register. +++ const __m256 xmm_symbol = _mm256_castpd_ps(_mm256_broadcast_sd((const double*)src0)); +++ const __m128 xmm128_symbol = _mm256_extractf128_ps(xmm_symbol, 1); +++ +++ // Load scalar into all 8 parts of the register +++ const __m256 xmm_scalar = _mm256_broadcast_ss(&scalar); +++ const __m128 xmm128_scalar = _mm256_extractf128_ps(xmm_scalar, 1); ++ ++- const unsigned int bound = num_bytes >> 6; ++- ++- // load complex value into all parts of the register. 
++- const __m256 xmm_symbol = _mm256_castpd_ps(_mm256_broadcast_sd((const double*)src0)); ++- const __m128 xmm128_symbol = _mm256_extractf128_ps(xmm_symbol, 1); ++- ++- // Load scalar into all 8 parts of the register ++- const __m256 xmm_scalar = _mm256_broadcast_ss(&scalar); ++- const __m128 xmm128_scalar = _mm256_extractf128_ps(xmm_scalar, 1); +++ // Set permutation constant +++ const __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0); ++ ++- // Set permutation constant ++- const __m256i idx = _mm256_set_epi32(7,6,3,2,5,4,1,0); ++- ++- for(unsigned int i = 0; i < bound; ++i) { ++- xmm_points0 = _mm256_loadu_ps((float*)points); ++- xmm_points1 = _mm256_loadu_ps((float*)(points + 4)); ++- points += 8; ++- __VOLK_PREFETCH(points); +++ for (unsigned int i = 0; i < bound; ++i) { +++ xmm_points0 = _mm256_loadu_ps((float*)points); +++ xmm_points1 = _mm256_loadu_ps((float*)(points + 4)); +++ points += 8; +++ __VOLK_PREFETCH(points); ++ ++- xmm_result = _mm256_scaled_norm_dist_ps_avx2(xmm_symbol, xmm_symbol, ++- xmm_points0, xmm_points1, ++- xmm_scalar); ++- ++- _mm256_storeu_ps(target, xmm_result); ++- target += 8; ++- } +++ xmm_result = _mm256_scaled_norm_dist_ps_avx2( +++ xmm_symbol, xmm_symbol, xmm_points0, xmm_points1, xmm_scalar); ++ ++- if (num_bytes >> 5 & 1) { ++- xmm_points0 = _mm256_loadu_ps((float*)points); +++ _mm256_storeu_ps(target, xmm_result); +++ target += 8; +++ } ++ ++- xmm4 = _mm256_sub_ps(xmm_symbol, xmm_points0); +++ if (num_bytes >> 5 & 1) { +++ xmm_points0 = _mm256_loadu_ps((float*)points); ++ ++- points += 4; +++ xmm4 = _mm256_sub_ps(xmm_symbol, xmm_points0); ++ ++- xmm6 = _mm256_mul_ps(xmm4, xmm4); +++ points += 4; ++ ++- xmm4 = _mm256_hadd_ps(xmm6, xmm6); ++- xmm4 = _mm256_permutevar8x32_ps(xmm4, idx); +++ xmm6 = _mm256_mul_ps(xmm4, xmm4); ++ ++- xmm_result = _mm256_mul_ps(xmm4, xmm_scalar); +++ xmm4 = _mm256_hadd_ps(xmm6, xmm6); +++ xmm4 = _mm256_permutevar8x32_ps(xmm4, idx); ++ ++- xmm9 = _mm256_extractf128_ps(xmm_result, 1); ++- _mm_storeu_ps(target,xmm9); ++- target += 4; ++- } +++ xmm_result = _mm256_mul_ps(xmm4, xmm_scalar); ++ ++- if (num_bytes >> 4 & 1) { ++- xmm9 = _mm_loadu_ps((float*)points); +++ xmm9 = _mm256_extractf128_ps(xmm_result, 1); +++ _mm_storeu_ps(target, xmm9); +++ target += 4; +++ } ++ ++- xmm10 = _mm_sub_ps(xmm128_symbol, xmm9); +++ if (num_bytes >> 4 & 1) { +++ xmm9 = _mm_loadu_ps((float*)points); ++ ++- points += 2; +++ xmm10 = _mm_sub_ps(xmm128_symbol, xmm9); ++ ++- xmm9 = _mm_mul_ps(xmm10, xmm10); +++ points += 2; ++ ++- xmm10 = _mm_hadd_ps(xmm9, xmm9); +++ xmm9 = _mm_mul_ps(xmm10, xmm10); ++ ++- xmm10 = _mm_mul_ps(xmm10, xmm128_scalar); +++ xmm10 = _mm_hadd_ps(xmm9, xmm9); ++ ++- _mm_storeh_pi((__m64*)target, xmm10); ++- target += 2; ++- } +++ xmm10 = _mm_mul_ps(xmm10, xmm128_scalar); ++ ++- calculate_scaled_distances(target, src0[0], points, scalar, (num_bytes >> 3) & 1); +++ _mm_storeh_pi((__m64*)target, xmm10); +++ target += 2; +++ } +++ +++ calculate_scaled_distances(target, src0[0], points, scalar, (num_bytes >> 3) & 1); ++ } ++ ++ #endif /*LV_HAVE_AVX2*/ ++@@ -412,120 +425,126 @@ volk_32fc_x2_s32f_square_dist_scalar_mult_32f_u_avx2(float* target, lv_32fc_t* s ++ #include ++ ++ static inline void ++-volk_32fc_x2_s32f_square_dist_scalar_mult_32f_u_avx(float *target, lv_32fc_t *src0, ++- lv_32fc_t *points, float scalar, ++- unsigned int num_points) { ++- const int eightsPoints = num_points / 8; ++- const int remainder = num_points - 8 * eightsPoints; ++- ++- __m256 xmm_points0, xmm_points1, xmm_result; ++- ++- // load complex value 
into all parts of the register. ++- const __m256 xmm_symbol = _mm256_castpd_ps(_mm256_broadcast_sd((const double*)src0)); ++- ++- // Load scalar into all 8 parts of the register ++- const __m256 xmm_scalar = _mm256_broadcast_ss(&scalar); ++- ++- for(int i = 0; i < eightsPoints; ++i){ ++- xmm_points0 = _mm256_loadu_ps((float*)points); ++- xmm_points1 = _mm256_loadu_ps((float*)(points + 4)); ++- points += 8; ++- ++- xmm_result = _mm256_scaled_norm_dist_ps(xmm_symbol, xmm_symbol, xmm_points0, ++- xmm_points1, xmm_scalar); ++- ++- _mm256_storeu_ps(target, xmm_result); ++- target += 8; ++- } ++- ++- const lv_32fc_t symbol = *src0; ++- calculate_scaled_distances(target, symbol, points, scalar, remainder); +++volk_32fc_x2_s32f_square_dist_scalar_mult_32f_u_avx(float* target, +++ lv_32fc_t* src0, +++ lv_32fc_t* points, +++ float scalar, +++ unsigned int num_points) +++{ +++ const int eightsPoints = num_points / 8; +++ const int remainder = num_points - 8 * eightsPoints; +++ +++ __m256 xmm_points0, xmm_points1, xmm_result; +++ +++ // load complex value into all parts of the register. +++ const __m256 xmm_symbol = _mm256_castpd_ps(_mm256_broadcast_sd((const double*)src0)); +++ +++ // Load scalar into all 8 parts of the register +++ const __m256 xmm_scalar = _mm256_broadcast_ss(&scalar); +++ +++ for (int i = 0; i < eightsPoints; ++i) { +++ xmm_points0 = _mm256_loadu_ps((float*)points); +++ xmm_points1 = _mm256_loadu_ps((float*)(points + 4)); +++ points += 8; +++ +++ xmm_result = _mm256_scaled_norm_dist_ps( +++ xmm_symbol, xmm_symbol, xmm_points0, xmm_points1, xmm_scalar); +++ +++ _mm256_storeu_ps(target, xmm_result); +++ target += 8; +++ } +++ +++ const lv_32fc_t symbol = *src0; +++ calculate_scaled_distances(target, symbol, points, scalar, remainder); ++ } ++ ++ #endif /* LV_HAVE_AVX */ ++ ++ ++ #ifdef LV_HAVE_SSE3 ++-#include ++-#include +++#include +++#include ++ ++ static inline void ++-volk_32fc_x2_s32f_square_dist_scalar_mult_32f_u_sse3(float* target, lv_32fc_t* src0, ++- lv_32fc_t* points, float scalar, +++volk_32fc_x2_s32f_square_dist_scalar_mult_32f_u_sse3(float* target, +++ lv_32fc_t* src0, +++ lv_32fc_t* points, +++ float scalar, ++ unsigned int num_points) ++ { ++- __m128 xmm_points0, xmm_points1, xmm_result; ++- ++- /* ++- * First do 4 values in every loop iteration. ++- * There may be up to 3 values left. ++- * leftovers0 indicates if at least 2 more are available for SSE execution. ++- * leftovers1 indicates if there is a single element left. ++- */ ++- const int quarterPoints = num_points / 4; ++- const int leftovers0 = (num_points / 2) - 2 * quarterPoints; ++- const int leftovers1 = num_points % 2; ++- ++- // load complex value into both parts of the register. 
++- const __m128 xmm_symbol = _mm_castpd_ps(_mm_load1_pd((const double*)src0)); ++- ++- // Load scalar into all 4 parts of the register ++- const __m128 xmm_scalar = _mm_load1_ps(&scalar); ++- ++- for(int i = 0; i < quarterPoints; ++i) { ++- xmm_points0 = _mm_loadu_ps((float*)points); ++- xmm_points1 = _mm_loadu_ps((float*)(points + 2)); ++- points += 4; ++- __VOLK_PREFETCH(points); ++- // calculate distances ++- xmm_result = _mm_scaled_norm_dist_ps_sse3(xmm_symbol, xmm_symbol, xmm_points0, ++- xmm_points1, xmm_scalar); ++- ++- _mm_storeu_ps(target, xmm_result); ++- target += 4; ++- } ++- ++- for(int i = 0; i < leftovers0; ++i) { ++- xmm_points0 = _mm_loadu_ps((float*)points); ++- points += 2; ++- ++- xmm_points0 = _mm_sub_ps(xmm_symbol, xmm_points0); ++- xmm_points0 = _mm_mul_ps(xmm_points0, xmm_points0); ++- xmm_points0 = _mm_hadd_ps(xmm_points0, xmm_points0); ++- xmm_result = _mm_mul_ps(xmm_points0, xmm_scalar); ++- ++- _mm_storeh_pi((__m64*)target, xmm_result); ++- target += 2; ++- } ++- ++- calculate_scaled_distances(target, src0[0], points, scalar, leftovers1); +++ __m128 xmm_points0, xmm_points1, xmm_result; +++ +++ /* +++ * First do 4 values in every loop iteration. +++ * There may be up to 3 values left. +++ * leftovers0 indicates if at least 2 more are available for SSE execution. +++ * leftovers1 indicates if there is a single element left. +++ */ +++ const int quarterPoints = num_points / 4; +++ const int leftovers0 = (num_points / 2) - 2 * quarterPoints; +++ const int leftovers1 = num_points % 2; +++ +++ // load complex value into both parts of the register. +++ const __m128 xmm_symbol = _mm_castpd_ps(_mm_load1_pd((const double*)src0)); +++ +++ // Load scalar into all 4 parts of the register +++ const __m128 xmm_scalar = _mm_load1_ps(&scalar); +++ +++ for (int i = 0; i < quarterPoints; ++i) { +++ xmm_points0 = _mm_loadu_ps((float*)points); +++ xmm_points1 = _mm_loadu_ps((float*)(points + 2)); +++ points += 4; +++ __VOLK_PREFETCH(points); +++ // calculate distances +++ xmm_result = _mm_scaled_norm_dist_ps_sse3( +++ xmm_symbol, xmm_symbol, xmm_points0, xmm_points1, xmm_scalar); +++ +++ _mm_storeu_ps(target, xmm_result); +++ target += 4; +++ } +++ +++ for (int i = 0; i < leftovers0; ++i) { +++ xmm_points0 = _mm_loadu_ps((float*)points); +++ points += 2; +++ +++ xmm_points0 = _mm_sub_ps(xmm_symbol, xmm_points0); +++ xmm_points0 = _mm_mul_ps(xmm_points0, xmm_points0); +++ xmm_points0 = _mm_hadd_ps(xmm_points0, xmm_points0); +++ xmm_result = _mm_mul_ps(xmm_points0, xmm_scalar); +++ +++ _mm_storeh_pi((__m64*)target, xmm_result); +++ target += 2; +++ } +++ +++ calculate_scaled_distances(target, src0[0], points, scalar, leftovers1); ++ } ++ ++ #endif /*LV_HAVE_SSE3*/ ++ ++ #ifdef LV_HAVE_SSE ++-#include ++ #include +++#include ++ static inline void ++-volk_32fc_x2_s32f_square_dist_scalar_mult_32f_u_sse(float* target, lv_32fc_t* src0, ++- lv_32fc_t* points, float scalar, +++volk_32fc_x2_s32f_square_dist_scalar_mult_32f_u_sse(float* target, +++ lv_32fc_t* src0, +++ lv_32fc_t* points, +++ float scalar, ++ unsigned int num_points) ++ { ++- const __m128 xmm_scalar = _mm_set1_ps(scalar); ++- const __m128 xmm_symbol = _mm_castpd_ps(_mm_load1_pd((const double*)src0)); ++- ++- for (unsigned i = 0; i < num_points / 4; ++i) { ++- __m128 xmm_points0 = _mm_loadu_ps((float *) points); ++- __m128 xmm_points1 = _mm_loadu_ps((float *) (points + 2)); ++- points += 4; ++- __m128 xmm_result = _mm_scaled_norm_dist_ps_sse(xmm_symbol, xmm_symbol, ++- xmm_points0, xmm_points1, ++- xmm_scalar); ++- 
_mm_storeu_ps((float *) target, xmm_result); ++- target += 4; ++- } ++- ++- calculate_scaled_distances(target, src0[0], points, scalar, num_points % 4); +++ const __m128 xmm_scalar = _mm_set1_ps(scalar); +++ const __m128 xmm_symbol = _mm_castpd_ps(_mm_load1_pd((const double*)src0)); +++ +++ for (unsigned i = 0; i < num_points / 4; ++i) { +++ __m128 xmm_points0 = _mm_loadu_ps((float*)points); +++ __m128 xmm_points1 = _mm_loadu_ps((float*)(points + 2)); +++ points += 4; +++ __m128 xmm_result = _mm_scaled_norm_dist_ps_sse( +++ xmm_symbol, xmm_symbol, xmm_points0, xmm_points1, xmm_scalar); +++ _mm_storeu_ps((float*)target, xmm_result); +++ target += 4; +++ } +++ +++ calculate_scaled_distances(target, src0[0], points, scalar, num_points % 4); ++ } ++ #endif // LV_HAVE_SSE ++ ++diff --git a/kernels/volk/volk_32fc_x2_s32fc_multiply_conjugate_add_32fc.h b/kernels/volk/volk_32fc_x2_s32fc_multiply_conjugate_add_32fc.h ++index 6c7f4d3..1fb9b68 100644 ++--- a/kernels/volk/volk_32fc_x2_s32fc_multiply_conjugate_add_32fc.h +++++ b/kernels/volk/volk_32fc_x2_s32fc_multiply_conjugate_add_32fc.h ++@@ -32,14 +32,16 @@ ++ * ++ * Dispatcher Prototype ++ * \code ++- * void volk_32fc_x2_s32fc_multiply_conjugate_add_32fc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, const lv_32fc_t scalar, unsigned int num_points); ++- * \endcode +++ * void volk_32fc_x2_s32fc_multiply_conjugate_add_32fc(lv_32fc_t* cVector, const +++ * lv_32fc_t* aVector, const lv_32fc_t* bVector, const lv_32fc_t scalar, unsigned int +++ * num_points); \endcode ++ * ++ * \b Inputs ++ * \li aVector: The input vector to be added. ++ * \li bVector: The input vector to be conjugate and multiplied. ++ * \li scalar: The complex scalar to multiply against conjugated bVector. ++- * \li num_points: The number of complex values in aVector and bVector to be conjugate, multiplied and stored into cVector. +++ * \li num_points: The number of complex values in aVector and bVector to be conjugate, +++ * multiplied and stored into cVector. ++ * ++ * \b Outputs ++ * \li cVector: The vector where the results will be stored. 
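
For context on volk_32fc_x2_s32fc_multiply_conjugate_add_32fc, whose dispatcher prototype and parameters are documented above: a minimal usage sketch follows. The buffer contents are illustrative, and volk_malloc()/volk_get_alignment() are assumed here only to obtain suitably aligned storage; the call itself computes c[i] = a[i] + lv_conj(b[i]) * scalar for every point.

    #include <volk/volk.h>

    /* Illustrative call of the dispatcher; VOLK picks the best available
     * implementation (generic, SSE3, AVX or NEON) at runtime. */
    void example_multiply_conjugate_add(unsigned int num_points)
    {
        const size_t alignment = volk_get_alignment();
        lv_32fc_t* a = (lv_32fc_t*)volk_malloc(num_points * sizeof(lv_32fc_t), alignment);
        lv_32fc_t* b = (lv_32fc_t*)volk_malloc(num_points * sizeof(lv_32fc_t), alignment);
        lv_32fc_t* c = (lv_32fc_t*)volk_malloc(num_points * sizeof(lv_32fc_t), alignment);
        const lv_32fc_t scalar = lv_cmake(0.5f, -0.5f);

        for (unsigned int i = 0; i < num_points; ++i) {
            a[i] = lv_cmake((float)i, 0.0f); /* vector to be added */
            b[i] = lv_cmake(1.0f, (float)i); /* vector to be conjugated and scaled */
        }

        /* c[i] = a[i] + lv_conj(b[i]) * scalar for all num_points elements */
        volk_32fc_x2_s32fc_multiply_conjugate_add_32fc(c, a, b, scalar, num_points);

        volk_free(a);
        volk_free(b);
        volk_free(c);
    }
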
++@@ -84,15 +86,21 @@ ++ #ifndef INCLUDED_volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_H ++ #define INCLUDED_volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_H ++ +++#include ++ #include ++ #include ++ #include ++-#include ++ ++ ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, const lv_32fc_t scalar, unsigned int num_points){ +++static inline void +++volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_generic(lv_32fc_t* cVector, +++ const lv_32fc_t* aVector, +++ const lv_32fc_t* bVector, +++ const lv_32fc_t scalar, +++ unsigned int num_points) +++{ ++ const lv_32fc_t* aPtr = aVector; ++ const lv_32fc_t* bPtr = bVector; ++ lv_32fc_t* cPtr = cVector; ++@@ -123,14 +131,20 @@ static inline void volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_generic(lv_32f ++ #include ++ #include ++ ++-static inline void volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_u_avx(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, const lv_32fc_t scalar, unsigned int num_points) { +++static inline void +++volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_u_avx(lv_32fc_t* cVector, +++ const lv_32fc_t* aVector, +++ const lv_32fc_t* bVector, +++ const lv_32fc_t scalar, +++ unsigned int num_points) +++{ ++ unsigned int number = 0; ++ unsigned int i = 0; ++ const unsigned int quarterPoints = num_points / 4; ++ unsigned int isodd = num_points & 3; ++ ++ __m256 x, y, s, z; ++- lv_32fc_t v_scalar[4] = {scalar, scalar, scalar, scalar}; +++ lv_32fc_t v_scalar[4] = { scalar, scalar, scalar, scalar }; ++ ++ const lv_32fc_t* a = aVector; ++ const lv_32fc_t* b = bVector; ++@@ -139,19 +153,19 @@ static inline void volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_u_avx(lv_32fc_ ++ // Set up constant scalar vector ++ s = _mm256_loadu_ps((float*)v_scalar); ++ ++- for(;number < quarterPoints; number++) { +++ for (; number < quarterPoints; number++) { ++ x = _mm256_loadu_ps((float*)b); ++ y = _mm256_loadu_ps((float*)a); ++ z = _mm256_complexconjugatemul_ps(s, x); ++ z = _mm256_add_ps(y, z); ++- _mm256_storeu_ps((float*)c,z); +++ _mm256_storeu_ps((float*)c, z); ++ ++ a += 4; ++ b += 4; ++ c += 4; ++ } ++ ++- for(i = num_points-isodd; i < num_points; i++) { +++ for (i = num_points - isodd; i < num_points; i++) { ++ *c++ = (*a++) + lv_conj(*b++) * scalar; ++ } ++ } ++@@ -162,12 +176,18 @@ static inline void volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_u_avx(lv_32fc_ ++ #include ++ #include ++ ++-static inline void volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_u_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, const lv_32fc_t scalar, unsigned int num_points) { +++static inline void +++volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_u_sse3(lv_32fc_t* cVector, +++ const lv_32fc_t* aVector, +++ const lv_32fc_t* bVector, +++ const lv_32fc_t scalar, +++ unsigned int num_points) +++{ ++ unsigned int number = 0; ++ const unsigned int halfPoints = num_points / 2; ++ ++ __m128 x, y, s, z; ++- lv_32fc_t v_scalar[2] = {scalar, scalar}; +++ lv_32fc_t v_scalar[2] = { scalar, scalar }; ++ ++ const lv_32fc_t* a = aVector; ++ const lv_32fc_t* b = bVector; ++@@ -176,19 +196,19 @@ static inline void volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_u_sse3(lv_32fc ++ // Set up constant scalar vector ++ s = _mm_loadu_ps((float*)v_scalar); ++ ++- for(;number < halfPoints; number++){ +++ for (; number < halfPoints; number++) { ++ x = _mm_loadu_ps((float*)b); ++ y = _mm_loadu_ps((float*)a); ++ z = 
_mm_complexconjugatemul_ps(s, x); ++ z = _mm_add_ps(y, z); ++- _mm_storeu_ps((float*)c,z); +++ _mm_storeu_ps((float*)c, z); ++ ++ a += 2; ++ b += 2; ++ c += 2; ++ } ++ ++- if((num_points % 2) != 0) { +++ if ((num_points % 2) != 0) { ++ *c = *a + lv_conj(*b) * scalar; ++ } ++ } ++@@ -199,14 +219,20 @@ static inline void volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_u_sse3(lv_32fc ++ #include ++ #include ++ ++-static inline void volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_a_avx(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, const lv_32fc_t scalar, unsigned int num_points) { +++static inline void +++volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_a_avx(lv_32fc_t* cVector, +++ const lv_32fc_t* aVector, +++ const lv_32fc_t* bVector, +++ const lv_32fc_t scalar, +++ unsigned int num_points) +++{ ++ unsigned int number = 0; ++ unsigned int i = 0; ++ const unsigned int quarterPoints = num_points / 4; ++ unsigned int isodd = num_points & 3; ++ ++ __m256 x, y, s, z; ++- lv_32fc_t v_scalar[4] = {scalar, scalar, scalar, scalar}; +++ lv_32fc_t v_scalar[4] = { scalar, scalar, scalar, scalar }; ++ ++ const lv_32fc_t* a = aVector; ++ const lv_32fc_t* b = bVector; ++@@ -215,19 +241,19 @@ static inline void volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_a_avx(lv_32fc_ ++ // Set up constant scalar vector ++ s = _mm256_load_ps((float*)v_scalar); ++ ++- for(;number < quarterPoints; number++) { +++ for (; number < quarterPoints; number++) { ++ x = _mm256_load_ps((float*)b); ++ y = _mm256_load_ps((float*)a); ++ z = _mm256_complexconjugatemul_ps(s, x); ++ z = _mm256_add_ps(y, z); ++- _mm256_store_ps((float*)c,z); +++ _mm256_store_ps((float*)c, z); ++ ++ a += 4; ++ b += 4; ++ c += 4; ++ } ++ ++- for(i = num_points-isodd; i < num_points; i++) { +++ for (i = num_points - isodd; i < num_points; i++) { ++ *c++ = (*a++) + lv_conj(*b++) * scalar; ++ } ++ } ++@@ -238,12 +264,18 @@ static inline void volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_a_avx(lv_32fc_ ++ #include ++ #include ++ ++-static inline void volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_a_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, const lv_32fc_t scalar, unsigned int num_points) { +++static inline void +++volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_a_sse3(lv_32fc_t* cVector, +++ const lv_32fc_t* aVector, +++ const lv_32fc_t* bVector, +++ const lv_32fc_t scalar, +++ unsigned int num_points) +++{ ++ unsigned int number = 0; ++ const unsigned int halfPoints = num_points / 2; ++ ++ __m128 x, y, s, z; ++- lv_32fc_t v_scalar[2] = {scalar, scalar}; +++ lv_32fc_t v_scalar[2] = { scalar, scalar }; ++ ++ const lv_32fc_t* a = aVector; ++ const lv_32fc_t* b = bVector; ++@@ -252,19 +284,19 @@ static inline void volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_a_sse3(lv_32fc ++ // Set up constant scalar vector ++ s = _mm_load_ps((float*)v_scalar); ++ ++- for(;number < halfPoints; number++){ +++ for (; number < halfPoints; number++) { ++ x = _mm_load_ps((float*)b); ++ y = _mm_load_ps((float*)a); ++ z = _mm_complexconjugatemul_ps(s, x); ++ z = _mm_add_ps(y, z); ++- _mm_store_ps((float*)c,z); +++ _mm_store_ps((float*)c, z); ++ ++ a += 2; ++ b += 2; ++ c += 2; ++ } ++ ++- if((num_points % 2) != 0) { +++ if ((num_points % 2) != 0) { ++ *c = *a + lv_conj(*b) * scalar; ++ } ++ } ++@@ -272,9 +304,15 @@ static inline void volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_a_sse3(lv_32fc ++ ++ ++ #ifdef LV_HAVE_NEON ++-#include ++- ++-static inline void volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_neon(lv_32fc_t* 
cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, const lv_32fc_t scalar, unsigned int num_points){ +++#include +++ +++static inline void +++volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_neon(lv_32fc_t* cVector, +++ const lv_32fc_t* aVector, +++ const lv_32fc_t* bVector, +++ const lv_32fc_t scalar, +++ unsigned int num_points) +++{ ++ const lv_32fc_t* bPtr = bVector; ++ const lv_32fc_t* aPtr = aVector; ++ lv_32fc_t* cPtr = cVector; ++@@ -287,7 +325,7 @@ static inline void volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_neon(lv_32fc_t ++ scalar_val.val[0] = vld1q_dup_f32((const float*)&scalar); ++ scalar_val.val[1] = vld1q_dup_f32(((const float*)&scalar) + 1); ++ ++- for(number = 0; number < quarter_points; ++number) { +++ for (number = 0; number < quarter_points; ++number) { ++ a_val = vld2q_f32((float*)aPtr); ++ b_val = vld2q_f32((float*)bPtr); ++ b_val.val[1] = vnegq_f32(b_val.val[1]); ++@@ -310,7 +348,7 @@ static inline void volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_neon(lv_32fc_t ++ cPtr += 4; ++ } ++ ++- for(number = quarter_points*4; number < num_points; number++){ +++ for (number = quarter_points * 4; number < num_points; number++) { ++ *cPtr++ = (*aPtr++) + lv_conj(*bPtr++) * scalar; ++ } ++ } ++diff --git a/kernels/volk/volk_32fc_x2_square_dist_32f.h b/kernels/volk/volk_32fc_x2_square_dist_32f.h ++index d6c6dff..75f4072 100644 ++--- a/kernels/volk/volk_32fc_x2_square_dist_32f.h +++++ b/kernels/volk/volk_32fc_x2_square_dist_32f.h ++@@ -30,8 +30,8 @@ ++ * ++ * Dispatcher Prototype ++ * \code ++- * void volk_32fc_x2_square_dist_32f(float* target, lv_32fc_t* src0, lv_32fc_t* points, unsigned int num_points) { ++- * \endcode +++ * void volk_32fc_x2_square_dist_32f(float* target, lv_32fc_t* src0, lv_32fc_t* points, +++ * unsigned int num_points) { \endcode ++ * ++ * \b Inputs ++ * \li src0: The complex input. Only the first point is used. 
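
For context on the volk_32fc_x2_square_dist_32f hunks that follow: every variant (AVX2, SSE3, NEON, generic) computes, for each input point, the squared Euclidean distance to the single symbol src0[0]. A minimal reference loop mirroring the _generic path (the names here are illustrative) is:

    /* target[i] = |src0[0] - points[i]|^2 for i = 0 .. num_points - 1 */
    static void square_dist_reference(float* target,
                                      const lv_32fc_t* src0,
                                      const lv_32fc_t* points,
                                      unsigned int num_points)
    {
        for (unsigned int i = 0; i < num_points; ++i) {
            const lv_32fc_t diff = src0[0] - points[i];
            target[i] =
                lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff);
        }
    }
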
++@@ -78,183 +78,185 @@ ++ #ifndef INCLUDED_volk_32fc_x2_square_dist_32f_a_H ++ #define INCLUDED_volk_32fc_x2_square_dist_32f_a_H ++ ++-#include ++-#include ++-#include +++#include +++#include +++#include ++ ++ #ifdef LV_HAVE_AVX2 ++-#include +++#include ++ ++-static inline void ++-volk_32fc_x2_square_dist_32f_a_avx2(float* target, lv_32fc_t* src0, lv_32fc_t* points, ++- unsigned int num_points) +++static inline void volk_32fc_x2_square_dist_32f_a_avx2(float* target, +++ lv_32fc_t* src0, +++ lv_32fc_t* points, +++ unsigned int num_points) ++ { ++- const unsigned int num_bytes = num_points*8; ++- __m128 xmm0, xmm9, xmm10; ++- __m256 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; ++- ++- lv_32fc_t diff; ++- float sq_dist; ++- int bound = num_bytes >> 6; ++- int leftovers0 = (num_bytes >> 5) & 1; ++- int leftovers1 = (num_bytes >> 4) & 1; ++- int leftovers2 = (num_bytes >> 3) & 1; ++- int i = 0; ++- ++- __m256i idx = _mm256_set_epi32(7,6,3,2,5,4,1,0); ++- xmm1 = _mm256_setzero_ps(); ++- xmm2 = _mm256_load_ps((float*)&points[0]); ++- xmm0 = _mm_load_ps((float*)src0); ++- xmm0 = _mm_permute_ps(xmm0, 0b01000100); ++- xmm1 = _mm256_insertf128_ps(xmm1, xmm0, 0); ++- xmm1 = _mm256_insertf128_ps(xmm1, xmm0, 1); ++- xmm3 = _mm256_load_ps((float*)&points[4]); ++- ++- for(; i < bound; ++i) { ++- xmm4 = _mm256_sub_ps(xmm1, xmm2); ++- xmm5 = _mm256_sub_ps(xmm1, xmm3); ++- points += 8; ++- xmm6 = _mm256_mul_ps(xmm4, xmm4); ++- xmm7 = _mm256_mul_ps(xmm5, xmm5); ++- +++ const unsigned int num_bytes = num_points * 8; +++ __m128 xmm0, xmm9, xmm10; +++ __m256 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; +++ +++ lv_32fc_t diff; +++ float sq_dist; +++ int bound = num_bytes >> 6; +++ int leftovers0 = (num_bytes >> 5) & 1; +++ int leftovers1 = (num_bytes >> 4) & 1; +++ int leftovers2 = (num_bytes >> 3) & 1; +++ int i = 0; +++ +++ __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0); +++ xmm1 = _mm256_setzero_ps(); ++ xmm2 = _mm256_load_ps((float*)&points[0]); +++ xmm0 = _mm_load_ps((float*)src0); +++ xmm0 = _mm_permute_ps(xmm0, 0b01000100); +++ xmm1 = _mm256_insertf128_ps(xmm1, xmm0, 0); +++ xmm1 = _mm256_insertf128_ps(xmm1, xmm0, 1); +++ xmm3 = _mm256_load_ps((float*)&points[4]); ++ ++- xmm4 = _mm256_hadd_ps(xmm6, xmm7); ++- xmm4 = _mm256_permutevar8x32_ps(xmm4, idx); +++ for (; i < bound; ++i) { +++ xmm4 = _mm256_sub_ps(xmm1, xmm2); +++ xmm5 = _mm256_sub_ps(xmm1, xmm3); +++ points += 8; +++ xmm6 = _mm256_mul_ps(xmm4, xmm4); +++ xmm7 = _mm256_mul_ps(xmm5, xmm5); ++ ++- xmm3 = _mm256_load_ps((float*)&points[4]); +++ xmm2 = _mm256_load_ps((float*)&points[0]); ++ ++- _mm256_store_ps(target, xmm4); +++ xmm4 = _mm256_hadd_ps(xmm6, xmm7); +++ xmm4 = _mm256_permutevar8x32_ps(xmm4, idx); ++ ++- target += 8; ++- } +++ xmm3 = _mm256_load_ps((float*)&points[4]); ++ ++- for(i = 0; i < leftovers0; ++i) { +++ _mm256_store_ps(target, xmm4); ++ ++- xmm2 = _mm256_load_ps((float*)&points[0]); +++ target += 8; +++ } ++ ++- xmm4 = _mm256_sub_ps(xmm1, xmm2); +++ for (i = 0; i < leftovers0; ++i) { ++ ++- points += 4; +++ xmm2 = _mm256_load_ps((float*)&points[0]); ++ ++- xmm6 = _mm256_mul_ps(xmm4, xmm4); +++ xmm4 = _mm256_sub_ps(xmm1, xmm2); ++ ++- xmm4 = _mm256_hadd_ps(xmm6, xmm6); ++- xmm4 = _mm256_permutevar8x32_ps(xmm4, idx); +++ points += 4; ++ ++- xmm9 = _mm256_extractf128_ps(xmm4, 1); ++- _mm_store_ps(target,xmm9); +++ xmm6 = _mm256_mul_ps(xmm4, xmm4); ++ ++- target += 4; ++- } +++ xmm4 = _mm256_hadd_ps(xmm6, xmm6); +++ xmm4 = _mm256_permutevar8x32_ps(xmm4, idx); +++ +++ xmm9 = _mm256_extractf128_ps(xmm4, 1); +++ _mm_store_ps(target, 
xmm9); ++ ++- for(i = 0; i < leftovers1; ++i) { ++- xmm9 = _mm_load_ps((float*)&points[0]); +++ target += 4; +++ } ++ ++- xmm10 = _mm_sub_ps(xmm0, xmm9); +++ for (i = 0; i < leftovers1; ++i) { +++ xmm9 = _mm_load_ps((float*)&points[0]); ++ ++- points += 2; +++ xmm10 = _mm_sub_ps(xmm0, xmm9); ++ ++- xmm9 = _mm_mul_ps(xmm10, xmm10); +++ points += 2; ++ ++- xmm10 = _mm_hadd_ps(xmm9, xmm9); +++ xmm9 = _mm_mul_ps(xmm10, xmm10); ++ ++- _mm_storeh_pi((__m64*)target, xmm10); +++ xmm10 = _mm_hadd_ps(xmm9, xmm9); ++ ++- target += 2; ++- } +++ _mm_storeh_pi((__m64*)target, xmm10); ++ ++- for(i = 0; i < leftovers2; ++i) { +++ target += 2; +++ } ++ ++- diff = src0[0] - points[0]; +++ for (i = 0; i < leftovers2; ++i) { ++ ++- sq_dist = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff); +++ diff = src0[0] - points[0]; ++ ++- target[0] = sq_dist; ++- } +++ sq_dist = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff); +++ +++ target[0] = sq_dist; +++ } ++ } ++ ++ #endif /*LV_HAVE_AVX2*/ ++ ++ #ifdef LV_HAVE_SSE3 ++-#include ++-#include +++#include +++#include ++ ++-static inline void ++-volk_32fc_x2_square_dist_32f_a_sse3(float* target, lv_32fc_t* src0, lv_32fc_t* points, ++- unsigned int num_points) +++static inline void volk_32fc_x2_square_dist_32f_a_sse3(float* target, +++ lv_32fc_t* src0, +++ lv_32fc_t* points, +++ unsigned int num_points) ++ { ++- const unsigned int num_bytes = num_points*8; +++ const unsigned int num_bytes = num_points * 8; ++ ++- __m128 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; +++ __m128 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; ++ ++- lv_32fc_t diff; ++- float sq_dist; ++- int bound = num_bytes >> 5; ++- int i = 0; +++ lv_32fc_t diff; +++ float sq_dist; +++ int bound = num_bytes >> 5; +++ int i = 0; ++ ++- xmm1 = _mm_setzero_ps(); ++- xmm1 = _mm_loadl_pi(xmm1, (__m64*)src0); ++- xmm2 = _mm_load_ps((float*)&points[0]); ++- xmm1 = _mm_movelh_ps(xmm1, xmm1); ++- xmm3 = _mm_load_ps((float*)&points[2]); +++ xmm1 = _mm_setzero_ps(); +++ xmm1 = _mm_loadl_pi(xmm1, (__m64*)src0); +++ xmm2 = _mm_load_ps((float*)&points[0]); +++ xmm1 = _mm_movelh_ps(xmm1, xmm1); +++ xmm3 = _mm_load_ps((float*)&points[2]); +++ +++ for (; i < bound - 1; ++i) { +++ xmm4 = _mm_sub_ps(xmm1, xmm2); +++ xmm5 = _mm_sub_ps(xmm1, xmm3); +++ points += 4; +++ xmm6 = _mm_mul_ps(xmm4, xmm4); +++ xmm7 = _mm_mul_ps(xmm5, xmm5); +++ +++ xmm2 = _mm_load_ps((float*)&points[0]); +++ +++ xmm4 = _mm_hadd_ps(xmm6, xmm7); +++ +++ xmm3 = _mm_load_ps((float*)&points[2]); +++ +++ _mm_store_ps(target, xmm4); +++ +++ target += 4; +++ } ++ ++- for(; i < bound - 1; ++i) { ++ xmm4 = _mm_sub_ps(xmm1, xmm2); ++ xmm5 = _mm_sub_ps(xmm1, xmm3); +++ ++ points += 4; ++ xmm6 = _mm_mul_ps(xmm4, xmm4); ++ xmm7 = _mm_mul_ps(xmm5, xmm5); ++ ++- xmm2 = _mm_load_ps((float*)&points[0]); ++- ++ xmm4 = _mm_hadd_ps(xmm6, xmm7); ++ ++- xmm3 = _mm_load_ps((float*)&points[2]); ++- ++ _mm_store_ps(target, xmm4); ++ ++ target += 4; ++- } ++- ++- xmm4 = _mm_sub_ps(xmm1, xmm2); ++- xmm5 = _mm_sub_ps(xmm1, xmm3); ++- ++- points += 4; ++- xmm6 = _mm_mul_ps(xmm4, xmm4); ++- xmm7 = _mm_mul_ps(xmm5, xmm5); ++ ++- xmm4 = _mm_hadd_ps(xmm6, xmm7); +++ if (num_bytes >> 4 & 1) { ++ ++- _mm_store_ps(target, xmm4); +++ xmm2 = _mm_load_ps((float*)&points[0]); ++ ++- target += 4; +++ xmm4 = _mm_sub_ps(xmm1, xmm2); ++ ++- if (num_bytes >> 4 & 1) { +++ points += 2; ++ ++- xmm2 = _mm_load_ps((float*)&points[0]); ++- ++- xmm4 = _mm_sub_ps(xmm1, xmm2); +++ xmm6 = _mm_mul_ps(xmm4, xmm4); ++ ++- points += 2; ++- ++- xmm6 = _mm_mul_ps(xmm4, xmm4); +++ 
xmm4 = _mm_hadd_ps(xmm6, xmm6); ++ ++- xmm4 = _mm_hadd_ps(xmm6, xmm6); +++ _mm_storeh_pi((__m64*)target, xmm4); ++ ++- _mm_storeh_pi((__m64*)target, xmm4); +++ target += 2; +++ } ++ ++- target += 2; ++- } +++ if (num_bytes >> 3 & 1) { ++ ++- if (num_bytes >> 3 & 1) { +++ diff = src0[0] - points[0]; ++ ++- diff = src0[0] - points[0]; +++ sq_dist = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff); ++ ++- sq_dist = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff); ++- ++- target[0] = sq_dist; ++- } +++ target[0] = sq_dist; +++ } ++ } ++ ++ #endif /*LV_HAVE_SSE3*/ ++@@ -262,55 +264,58 @@ volk_32fc_x2_square_dist_32f_a_sse3(float* target, lv_32fc_t* src0, lv_32fc_t* p ++ ++ #ifdef LV_HAVE_NEON ++ #include ++-static inline void ++-volk_32fc_x2_square_dist_32f_neon(float* target, lv_32fc_t* src0, lv_32fc_t* points, unsigned int num_points) +++static inline void volk_32fc_x2_square_dist_32f_neon(float* target, +++ lv_32fc_t* src0, +++ lv_32fc_t* points, +++ unsigned int num_points) ++ { ++- const unsigned int quarter_points = num_points / 4; ++- unsigned int number; ++- ++- float32x4x2_t a_vec, b_vec; ++- float32x4x2_t diff_vec; ++- float32x4_t tmp, tmp1, dist_sq; ++- a_vec.val[0] = vdupq_n_f32( lv_creal(src0[0]) ); ++- a_vec.val[1] = vdupq_n_f32( lv_cimag(src0[0]) ); ++- for(number=0; number < quarter_points; ++number) { ++- b_vec = vld2q_f32((float*)points); ++- diff_vec.val[0] = vsubq_f32(a_vec.val[0], b_vec.val[0]); ++- diff_vec.val[1] = vsubq_f32(a_vec.val[1], b_vec.val[1]); ++- tmp = vmulq_f32(diff_vec.val[0], diff_vec.val[0]); ++- tmp1 = vmulq_f32(diff_vec.val[1], diff_vec.val[1]); ++- ++- dist_sq = vaddq_f32(tmp, tmp1); ++- vst1q_f32(target, dist_sq); ++- points += 4; ++- target += 4; ++- } ++- for(number=quarter_points*4; number < num_points; ++number) { ++- lv_32fc_t diff = src0[0] - *points++; ++- *target++ = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff); ++- } +++ const unsigned int quarter_points = num_points / 4; +++ unsigned int number; +++ +++ float32x4x2_t a_vec, b_vec; +++ float32x4x2_t diff_vec; +++ float32x4_t tmp, tmp1, dist_sq; +++ a_vec.val[0] = vdupq_n_f32(lv_creal(src0[0])); +++ a_vec.val[1] = vdupq_n_f32(lv_cimag(src0[0])); +++ for (number = 0; number < quarter_points; ++number) { +++ b_vec = vld2q_f32((float*)points); +++ diff_vec.val[0] = vsubq_f32(a_vec.val[0], b_vec.val[0]); +++ diff_vec.val[1] = vsubq_f32(a_vec.val[1], b_vec.val[1]); +++ tmp = vmulq_f32(diff_vec.val[0], diff_vec.val[0]); +++ tmp1 = vmulq_f32(diff_vec.val[1], diff_vec.val[1]); +++ +++ dist_sq = vaddq_f32(tmp, tmp1); +++ vst1q_f32(target, dist_sq); +++ points += 4; +++ target += 4; +++ } +++ for (number = quarter_points * 4; number < num_points; ++number) { +++ lv_32fc_t diff = src0[0] - *points++; +++ *target++ = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff); +++ } ++ } ++ #endif /* LV_HAVE_NEON */ ++ ++ ++ #ifdef LV_HAVE_GENERIC ++-static inline void ++-volk_32fc_x2_square_dist_32f_generic(float* target, lv_32fc_t* src0, lv_32fc_t* points, ++- unsigned int num_points) +++static inline void volk_32fc_x2_square_dist_32f_generic(float* target, +++ lv_32fc_t* src0, +++ lv_32fc_t* points, +++ unsigned int num_points) ++ { ++- const unsigned int num_bytes = num_points*8; +++ const unsigned int num_bytes = num_points * 8; ++ ++- lv_32fc_t diff; ++- float sq_dist; ++- unsigned int i = 0; +++ lv_32fc_t diff; +++ float sq_dist; +++ unsigned int i = 0; ++ ++- for(; i < num_bytes >> 3; ++i) { ++- diff = src0[0] - points[i]; +++ for (; i> 
3; ++i) { +++ diff = src0[0] - points[i]; ++ ++- sq_dist = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff); +++ sq_dist = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff); ++ ++- target[i] = sq_dist; ++- } +++ target[i] = sq_dist; +++ } ++ } ++ ++ #endif /*LV_HAVE_GENERIC*/ ++@@ -321,80 +326,85 @@ volk_32fc_x2_square_dist_32f_generic(float* target, lv_32fc_t* src0, lv_32fc_t* ++ #ifndef INCLUDED_volk_32fc_x2_square_dist_32f_u_H ++ #define INCLUDED_volk_32fc_x2_square_dist_32f_u_H ++ ++-#include ++-#include ++-#include +++#include +++#include +++#include ++ ++ #ifdef LV_HAVE_AVX2 ++-#include +++#include ++ ++-static inline void ++-volk_32fc_x2_square_dist_32f_u_avx2(float* target, lv_32fc_t* src0, lv_32fc_t* points, ++- unsigned int num_points) +++static inline void volk_32fc_x2_square_dist_32f_u_avx2(float* target, +++ lv_32fc_t* src0, +++ lv_32fc_t* points, +++ unsigned int num_points) ++ { ++- const unsigned int num_bytes = num_points*8; ++- __m128 xmm0, xmm9; ++- __m256 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; ++- ++- lv_32fc_t diff; ++- float sq_dist; ++- int bound = num_bytes >> 6; ++- int leftovers1 = (num_bytes >> 3) & 0b11; ++- int i = 0; ++- ++- __m256i idx = _mm256_set_epi32(7,6,3,2,5,4,1,0); ++- xmm1 = _mm256_setzero_ps(); ++- xmm0 = _mm_loadu_ps((float*)src0); ++- xmm0 = _mm_permute_ps(xmm0, 0b01000100); ++- xmm1 = _mm256_insertf128_ps(xmm1, xmm0, 0); ++- xmm1 = _mm256_insertf128_ps(xmm1, xmm0, 1); ++- ++- for(; i < bound; ++i) { +++ const unsigned int num_bytes = num_points * 8; +++ __m128 xmm0, xmm9; +++ __m256 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; +++ +++ lv_32fc_t diff; +++ float sq_dist; +++ int bound = num_bytes >> 6; +++ int leftovers1 = (num_bytes >> 3) & 0b11; +++ int i = 0; +++ +++ __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0); +++ xmm1 = _mm256_setzero_ps(); ++ xmm2 = _mm256_loadu_ps((float*)&points[0]); +++ xmm0 = _mm_loadu_ps((float*)src0); +++ xmm0 = _mm_permute_ps(xmm0, 0b01000100); +++ xmm1 = _mm256_insertf128_ps(xmm1, xmm0, 0); +++ xmm1 = _mm256_insertf128_ps(xmm1, xmm0, 1); ++ xmm3 = _mm256_loadu_ps((float*)&points[4]); ++- xmm4 = _mm256_sub_ps(xmm1, xmm2); ++- xmm5 = _mm256_sub_ps(xmm1, xmm3); ++- points += 8; ++- xmm6 = _mm256_mul_ps(xmm4, xmm4); ++- xmm7 = _mm256_mul_ps(xmm5, xmm5); ++ ++- xmm4 = _mm256_hadd_ps(xmm6, xmm7); ++- xmm4 = _mm256_permutevar8x32_ps(xmm4, idx); +++ for (; i < bound; ++i) { +++ xmm4 = _mm256_sub_ps(xmm1, xmm2); +++ xmm5 = _mm256_sub_ps(xmm1, xmm3); +++ points += 8; +++ xmm6 = _mm256_mul_ps(xmm4, xmm4); +++ xmm7 = _mm256_mul_ps(xmm5, xmm5); ++ ++- _mm256_storeu_ps(target, xmm4); +++ xmm2 = _mm256_loadu_ps((float*)&points[0]); ++ ++- target += 8; ++- } +++ xmm4 = _mm256_hadd_ps(xmm6, xmm7); +++ xmm4 = _mm256_permutevar8x32_ps(xmm4, idx); ++ ++- if (num_bytes >> 5 & 1) { +++ xmm3 = _mm256_loadu_ps((float*)&points[4]); ++ ++- xmm2 = _mm256_loadu_ps((float*)&points[0]); +++ _mm256_storeu_ps(target, xmm4); ++ ++- xmm4 = _mm256_sub_ps(xmm1, xmm2); +++ target += 8; +++ } ++ ++- points += 4; +++ if (num_bytes >> 5 & 1) { ++ ++- xmm6 = _mm256_mul_ps(xmm4, xmm4); +++ xmm2 = _mm256_loadu_ps((float*)&points[0]); ++ ++- xmm4 = _mm256_hadd_ps(xmm6, xmm6); ++- xmm4 = _mm256_permutevar8x32_ps(xmm4, idx); +++ xmm4 = _mm256_sub_ps(xmm1, xmm2); ++ ++- xmm9 = _mm256_extractf128_ps(xmm4, 1); ++- _mm_storeu_ps(target,xmm9); +++ points += 4; ++ ++- target += 4; ++- } +++ xmm6 = _mm256_mul_ps(xmm4, xmm4); +++ +++ xmm4 = _mm256_hadd_ps(xmm6, xmm6); +++ xmm4 = _mm256_permutevar8x32_ps(xmm4, idx); +++ +++ xmm9 
= _mm256_extractf128_ps(xmm4, 1); +++ _mm_storeu_ps(target, xmm9); +++ +++ target += 4; +++ } ++ ++- for(i = 0; i < leftovers1; ++i) { +++ for (i = 0; i < leftovers1; ++i) { ++ ++- diff = src0[0] - points[0]; ++- points += 1; +++ diff = src0[0] - points[0]; +++ points += 1; ++ ++- sq_dist = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff); +++ sq_dist = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff); ++ ++- target[0] = sq_dist; ++- target += 1; ++- } +++ target[0] = sq_dist; +++ target += 1; +++ } ++ } ++ ++ #endif /*LV_HAVE_AVX2*/ ++diff --git a/kernels/volk/volk_32i_s32f_convert_32f.h b/kernels/volk/volk_32i_s32f_convert_32f.h ++index 87d94f9..6b67cdb 100644 ++--- a/kernels/volk/volk_32i_s32f_convert_32f.h +++++ b/kernels/volk/volk_32i_s32f_convert_32f.h ++@@ -30,8 +30,8 @@ ++ * ++ * Dispatcher Prototype ++ * \code ++- * void volk_32i_s32f_convert_32f(float* outputVector, const int32_t* inputVector, const float scalar, unsigned int num_points) ++- * \endcode +++ * void volk_32i_s32f_convert_32f(float* outputVector, const int32_t* inputVector, const +++ * float scalar, unsigned int num_points) \endcode ++ * ++ * \b Inputs ++ * \li inputVector: The vector of 32-bit integers. ++@@ -70,37 +70,38 @@ ++ #ifdef LV_HAVE_AVX512F ++ #include ++ ++-static inline void ++-volk_32i_s32f_convert_32f_u_avx512f(float* outputVector, const int32_t* inputVector, ++- const float scalar, unsigned int num_points) +++static inline void volk_32i_s32f_convert_32f_u_avx512f(float* outputVector, +++ const int32_t* inputVector, +++ const float scalar, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int onesixteenthPoints = num_points / 16; +++ unsigned int number = 0; +++ const unsigned int onesixteenthPoints = num_points / 16; ++ ++- float* outputVectorPtr = outputVector; ++- const float iScalar = 1.0 / scalar; ++- __m512 invScalar = _mm512_set1_ps(iScalar); ++- int32_t* inputPtr = (int32_t*)inputVector; ++- __m512i inputVal; ++- __m512 ret; +++ float* outputVectorPtr = outputVector; +++ const float iScalar = 1.0 / scalar; +++ __m512 invScalar = _mm512_set1_ps(iScalar); +++ int32_t* inputPtr = (int32_t*)inputVector; +++ __m512i inputVal; +++ __m512 ret; ++ ++- for(;number < onesixteenthPoints; number++){ ++- // Load the values ++- inputVal = _mm512_loadu_si512((__m512i*)inputPtr); +++ for (; number < onesixteenthPoints; number++) { +++ // Load the values +++ inputVal = _mm512_loadu_si512((__m512i*)inputPtr); ++ ++- ret = _mm512_cvtepi32_ps(inputVal); ++- ret = _mm512_mul_ps(ret, invScalar); +++ ret = _mm512_cvtepi32_ps(inputVal); +++ ret = _mm512_mul_ps(ret, invScalar); ++ ++- _mm512_storeu_ps(outputVectorPtr, ret); +++ _mm512_storeu_ps(outputVectorPtr, ret); ++ ++- outputVectorPtr += 16; ++- inputPtr += 16; ++- } +++ outputVectorPtr += 16; +++ inputPtr += 16; +++ } ++ ++- number = onesixteenthPoints * 16; ++- for(; number < num_points; number++){ ++- outputVector[number] =((float)(inputVector[number])) * iScalar; ++- } +++ number = onesixteenthPoints * 16; +++ for (; number < num_points; number++) { +++ outputVector[number] = ((float)(inputVector[number])) * iScalar; +++ } ++ } ++ #endif /* LV_HAVE_AVX512F */ ++ ++@@ -108,37 +109,38 @@ volk_32i_s32f_convert_32f_u_avx512f(float* outputVector, const int32_t* inputVec ++ #ifdef LV_HAVE_AVX2 ++ #include ++ ++-static inline void ++-volk_32i_s32f_convert_32f_u_avx2(float* outputVector, const int32_t* inputVector, ++- const float scalar, unsigned int num_points) +++static inline void 
volk_32i_s32f_convert_32f_u_avx2(float* outputVector, +++ const int32_t* inputVector, +++ const float scalar, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int oneEightPoints = num_points / 8; +++ unsigned int number = 0; +++ const unsigned int oneEightPoints = num_points / 8; ++ ++- float* outputVectorPtr = outputVector; ++- const float iScalar = 1.0 / scalar; ++- __m256 invScalar = _mm256_set1_ps(iScalar); ++- int32_t* inputPtr = (int32_t*)inputVector; ++- __m256i inputVal; ++- __m256 ret; +++ float* outputVectorPtr = outputVector; +++ const float iScalar = 1.0 / scalar; +++ __m256 invScalar = _mm256_set1_ps(iScalar); +++ int32_t* inputPtr = (int32_t*)inputVector; +++ __m256i inputVal; +++ __m256 ret; ++ ++- for(;number < oneEightPoints; number++){ ++- // Load the 4 values ++- inputVal = _mm256_loadu_si256((__m256i*)inputPtr); +++ for (; number < oneEightPoints; number++) { +++ // Load the 4 values +++ inputVal = _mm256_loadu_si256((__m256i*)inputPtr); ++ ++- ret = _mm256_cvtepi32_ps(inputVal); ++- ret = _mm256_mul_ps(ret, invScalar); +++ ret = _mm256_cvtepi32_ps(inputVal); +++ ret = _mm256_mul_ps(ret, invScalar); ++ ++- _mm256_storeu_ps(outputVectorPtr, ret); +++ _mm256_storeu_ps(outputVectorPtr, ret); ++ ++- outputVectorPtr += 8; ++- inputPtr += 8; ++- } +++ outputVectorPtr += 8; +++ inputPtr += 8; +++ } ++ ++- number = oneEightPoints * 8; ++- for(; number < num_points; number++){ ++- outputVector[number] =((float)(inputVector[number])) * iScalar; ++- } +++ number = oneEightPoints * 8; +++ for (; number < num_points; number++) { +++ outputVector[number] = ((float)(inputVector[number])) * iScalar; +++ } ++ } ++ #endif /* LV_HAVE_AVX2 */ ++ ++@@ -146,62 +148,63 @@ volk_32i_s32f_convert_32f_u_avx2(float* outputVector, const int32_t* inputVector ++ #ifdef LV_HAVE_SSE2 ++ #include ++ ++-static inline void ++-volk_32i_s32f_convert_32f_u_sse2(float* outputVector, const int32_t* inputVector, ++- const float scalar, unsigned int num_points) +++static inline void volk_32i_s32f_convert_32f_u_sse2(float* outputVector, +++ const int32_t* inputVector, +++ const float scalar, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int quarterPoints = num_points / 4; +++ unsigned int number = 0; +++ const unsigned int quarterPoints = num_points / 4; ++ ++- float* outputVectorPtr = outputVector; ++- const float iScalar = 1.0 / scalar; ++- __m128 invScalar = _mm_set_ps1(iScalar); ++- int32_t* inputPtr = (int32_t*)inputVector; ++- __m128i inputVal; ++- __m128 ret; +++ float* outputVectorPtr = outputVector; +++ const float iScalar = 1.0 / scalar; +++ __m128 invScalar = _mm_set_ps1(iScalar); +++ int32_t* inputPtr = (int32_t*)inputVector; +++ __m128i inputVal; +++ __m128 ret; ++ ++- for(;number < quarterPoints; number++){ ++- // Load the 4 values ++- inputVal = _mm_loadu_si128((__m128i*)inputPtr); +++ for (; number < quarterPoints; number++) { +++ // Load the 4 values +++ inputVal = _mm_loadu_si128((__m128i*)inputPtr); ++ ++- ret = _mm_cvtepi32_ps(inputVal); ++- ret = _mm_mul_ps(ret, invScalar); +++ ret = _mm_cvtepi32_ps(inputVal); +++ ret = _mm_mul_ps(ret, invScalar); ++ ++- _mm_storeu_ps(outputVectorPtr, ret); +++ _mm_storeu_ps(outputVectorPtr, ret); ++ ++- outputVectorPtr += 4; ++- inputPtr += 4; ++- } +++ outputVectorPtr += 4; +++ inputPtr += 4; +++ } ++ ++- number = quarterPoints * 4; ++- for(; number < num_points; number++){ ++- outputVector[number] =((float)(inputVector[number])) * iScalar; ++- } +++ number = quarterPoints * 4; +++ for (; 
number < num_points; number++) { +++ outputVector[number] = ((float)(inputVector[number])) * iScalar; +++ } ++ } ++ #endif /* LV_HAVE_SSE2 */ ++ ++ ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void ++-volk_32i_s32f_convert_32f_generic(float* outputVector, const int32_t* inputVector, ++- const float scalar, unsigned int num_points) +++static inline void volk_32i_s32f_convert_32f_generic(float* outputVector, +++ const int32_t* inputVector, +++ const float scalar, +++ unsigned int num_points) ++ { ++- float* outputVectorPtr = outputVector; ++- const int32_t* inputVectorPtr = inputVector; ++- unsigned int number = 0; ++- const float iScalar = 1.0 / scalar; ++- ++- for(number = 0; number < num_points; number++){ ++- *outputVectorPtr++ = ((float)(*inputVectorPtr++)) * iScalar; ++- } +++ float* outputVectorPtr = outputVector; +++ const int32_t* inputVectorPtr = inputVector; +++ unsigned int number = 0; +++ const float iScalar = 1.0 / scalar; +++ +++ for (number = 0; number < num_points; number++) { +++ *outputVectorPtr++ = ((float)(*inputVectorPtr++)) * iScalar; +++ } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++ #endif /* INCLUDED_volk_32i_s32f_convert_32f_u_H */ ++ ++ ++- ++ #ifndef INCLUDED_volk_32i_s32f_convert_32f_a_H ++ #define INCLUDED_volk_32i_s32f_convert_32f_a_H ++ ++@@ -211,74 +214,76 @@ volk_32i_s32f_convert_32f_generic(float* outputVector, const int32_t* inputVecto ++ #ifdef LV_HAVE_AVX512F ++ #include ++ ++-static inline void ++-volk_32i_s32f_convert_32f_a_avx512f(float* outputVector, const int32_t* inputVector, ++- const float scalar, unsigned int num_points) +++static inline void volk_32i_s32f_convert_32f_a_avx512f(float* outputVector, +++ const int32_t* inputVector, +++ const float scalar, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int onesixteenthPoints = num_points / 16; +++ unsigned int number = 0; +++ const unsigned int onesixteenthPoints = num_points / 16; ++ ++- float* outputVectorPtr = outputVector; ++- const float iScalar = 1.0 / scalar; ++- __m512 invScalar = _mm512_set1_ps(iScalar); ++- int32_t* inputPtr = (int32_t*)inputVector; ++- __m512i inputVal; ++- __m512 ret; +++ float* outputVectorPtr = outputVector; +++ const float iScalar = 1.0 / scalar; +++ __m512 invScalar = _mm512_set1_ps(iScalar); +++ int32_t* inputPtr = (int32_t*)inputVector; +++ __m512i inputVal; +++ __m512 ret; ++ ++- for(;number < onesixteenthPoints; number++){ ++- // Load the values ++- inputVal = _mm512_load_si512((__m512i*)inputPtr); +++ for (; number < onesixteenthPoints; number++) { +++ // Load the values +++ inputVal = _mm512_load_si512((__m512i*)inputPtr); ++ ++- ret = _mm512_cvtepi32_ps(inputVal); ++- ret = _mm512_mul_ps(ret, invScalar); +++ ret = _mm512_cvtepi32_ps(inputVal); +++ ret = _mm512_mul_ps(ret, invScalar); ++ ++- _mm512_store_ps(outputVectorPtr, ret); +++ _mm512_store_ps(outputVectorPtr, ret); ++ ++- outputVectorPtr += 16; ++- inputPtr += 16; ++- } +++ outputVectorPtr += 16; +++ inputPtr += 16; +++ } ++ ++- number = onesixteenthPoints * 16; ++- for(; number < num_points; number++){ ++- outputVector[number] =((float)(inputVector[number])) * iScalar; ++- } +++ number = onesixteenthPoints * 16; +++ for (; number < num_points; number++) { +++ outputVector[number] = ((float)(inputVector[number])) * iScalar; +++ } ++ } ++ #endif /* LV_HAVE_AVX512F */ ++ ++ #ifdef LV_HAVE_AVX2 ++ #include ++ ++-static inline void ++-volk_32i_s32f_convert_32f_a_avx2(float* outputVector, const int32_t* inputVector, ++- const float scalar, unsigned int num_points) +++static 
inline void volk_32i_s32f_convert_32f_a_avx2(float* outputVector, +++ const int32_t* inputVector, +++ const float scalar, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int oneEightPoints = num_points / 8; +++ unsigned int number = 0; +++ const unsigned int oneEightPoints = num_points / 8; ++ ++- float* outputVectorPtr = outputVector; ++- const float iScalar = 1.0 / scalar; ++- __m256 invScalar = _mm256_set1_ps(iScalar); ++- int32_t* inputPtr = (int32_t*)inputVector; ++- __m256i inputVal; ++- __m256 ret; +++ float* outputVectorPtr = outputVector; +++ const float iScalar = 1.0 / scalar; +++ __m256 invScalar = _mm256_set1_ps(iScalar); +++ int32_t* inputPtr = (int32_t*)inputVector; +++ __m256i inputVal; +++ __m256 ret; ++ ++- for(;number < oneEightPoints; number++){ ++- // Load the 4 values ++- inputVal = _mm256_load_si256((__m256i*)inputPtr); +++ for (; number < oneEightPoints; number++) { +++ // Load the 4 values +++ inputVal = _mm256_load_si256((__m256i*)inputPtr); ++ ++- ret = _mm256_cvtepi32_ps(inputVal); ++- ret = _mm256_mul_ps(ret, invScalar); +++ ret = _mm256_cvtepi32_ps(inputVal); +++ ret = _mm256_mul_ps(ret, invScalar); ++ ++- _mm256_store_ps(outputVectorPtr, ret); +++ _mm256_store_ps(outputVectorPtr, ret); ++ ++- outputVectorPtr += 8; ++- inputPtr += 8; ++- } +++ outputVectorPtr += 8; +++ inputPtr += 8; +++ } ++ ++- number = oneEightPoints * 8; ++- for(; number < num_points; number++){ ++- outputVector[number] =((float)(inputVector[number])) * iScalar; ++- } +++ number = oneEightPoints * 8; +++ for (; number < num_points; number++) { +++ outputVector[number] = ((float)(inputVector[number])) * iScalar; +++ } ++ } ++ #endif /* LV_HAVE_AVX2 */ ++ ++@@ -286,59 +291,59 @@ volk_32i_s32f_convert_32f_a_avx2(float* outputVector, const int32_t* inputVector ++ #ifdef LV_HAVE_SSE2 ++ #include ++ ++-static inline void ++-volk_32i_s32f_convert_32f_a_sse2(float* outputVector, const int32_t* inputVector, ++- const float scalar, unsigned int num_points) +++static inline void volk_32i_s32f_convert_32f_a_sse2(float* outputVector, +++ const int32_t* inputVector, +++ const float scalar, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int quarterPoints = num_points / 4; +++ unsigned int number = 0; +++ const unsigned int quarterPoints = num_points / 4; ++ ++- float* outputVectorPtr = outputVector; ++- const float iScalar = 1.0 / scalar; ++- __m128 invScalar = _mm_set_ps1(iScalar); ++- int32_t* inputPtr = (int32_t*)inputVector; ++- __m128i inputVal; ++- __m128 ret; +++ float* outputVectorPtr = outputVector; +++ const float iScalar = 1.0 / scalar; +++ __m128 invScalar = _mm_set_ps1(iScalar); +++ int32_t* inputPtr = (int32_t*)inputVector; +++ __m128i inputVal; +++ __m128 ret; ++ ++- for(;number < quarterPoints; number++){ ++- // Load the 4 values ++- inputVal = _mm_load_si128((__m128i*)inputPtr); +++ for (; number < quarterPoints; number++) { +++ // Load the 4 values +++ inputVal = _mm_load_si128((__m128i*)inputPtr); ++ ++- ret = _mm_cvtepi32_ps(inputVal); ++- ret = _mm_mul_ps(ret, invScalar); +++ ret = _mm_cvtepi32_ps(inputVal); +++ ret = _mm_mul_ps(ret, invScalar); ++ ++- _mm_store_ps(outputVectorPtr, ret); +++ _mm_store_ps(outputVectorPtr, ret); ++ ++- outputVectorPtr += 4; ++- inputPtr += 4; ++- } +++ outputVectorPtr += 4; +++ inputPtr += 4; +++ } ++ ++- number = quarterPoints * 4; ++- for(; number < num_points; number++){ ++- outputVector[number] =((float)(inputVector[number])) * iScalar; ++- } +++ number = quarterPoints * 4; +++ for 
(; number < num_points; number++) { +++ outputVector[number] = ((float)(inputVector[number])) * iScalar; +++ } ++ } ++ #endif /* LV_HAVE_SSE2 */ ++ ++ ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void ++-volk_32i_s32f_convert_32f_a_generic(float* outputVector, const int32_t* inputVector, ++- const float scalar, unsigned int num_points) +++static inline void volk_32i_s32f_convert_32f_a_generic(float* outputVector, +++ const int32_t* inputVector, +++ const float scalar, +++ unsigned int num_points) ++ { ++- float* outputVectorPtr = outputVector; ++- const int32_t* inputVectorPtr = inputVector; ++- unsigned int number = 0; ++- const float iScalar = 1.0 / scalar; ++- ++- for(number = 0; number < num_points; number++){ ++- *outputVectorPtr++ = ((float)(*inputVectorPtr++)) * iScalar; ++- } +++ float* outputVectorPtr = outputVector; +++ const int32_t* inputVectorPtr = inputVector; +++ unsigned int number = 0; +++ const float iScalar = 1.0 / scalar; +++ +++ for (number = 0; number < num_points; number++) { +++ *outputVectorPtr++ = ((float)(*inputVectorPtr++)) * iScalar; +++ } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++ ++- ++- ++ #endif /* INCLUDED_volk_32i_s32f_convert_32f_a_H */ ++diff --git a/kernels/volk/volk_32i_x2_and_32i.h b/kernels/volk/volk_32i_x2_and_32i.h ++index 76f0175..755cfdc 100644 ++--- a/kernels/volk/volk_32i_x2_and_32i.h +++++ b/kernels/volk/volk_32i_x2_and_32i.h ++@@ -29,8 +29,8 @@ ++ * ++ * Dispatcher Prototype ++ * \code ++- * void volk_32i_x2_and_32i(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points) ++- * \endcode +++ * void volk_32i_x2_and_32i(int32_t* cVector, const int32_t* aVector, const int32_t* +++ * bVector, unsigned int num_points) \endcode ++ * ++ * \b Inputs ++ * \li aVector: Input vector of samples. 
++@@ -87,72 +87,75 @@ ++ #ifdef LV_HAVE_AVX512F ++ #include ++ ++-static inline void ++-volk_32i_x2_and_32i_a_avx512f(int32_t* cVector, const int32_t* aVector, ++- const int32_t* bVector, unsigned int num_points) +++static inline void volk_32i_x2_and_32i_a_avx512f(int32_t* cVector, +++ const int32_t* aVector, +++ const int32_t* bVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int sixteenthPoints = num_points / 16; +++ unsigned int number = 0; +++ const unsigned int sixteenthPoints = num_points / 16; ++ ++- int32_t* cPtr = (int32_t*)cVector; ++- const int32_t* aPtr = (int32_t*)aVector; ++- const int32_t* bPtr = (int32_t*)bVector; +++ int32_t* cPtr = (int32_t*)cVector; +++ const int32_t* aPtr = (int32_t*)aVector; +++ const int32_t* bPtr = (int32_t*)bVector; ++ ++- __m512i aVal, bVal, cVal; ++- for(;number < sixteenthPoints; number++){ +++ __m512i aVal, bVal, cVal; +++ for (; number < sixteenthPoints; number++) { ++ ++- aVal = _mm512_load_si512(aPtr); ++- bVal = _mm512_load_si512(bPtr); +++ aVal = _mm512_load_si512(aPtr); +++ bVal = _mm512_load_si512(bPtr); ++ ++- cVal = _mm512_and_si512(aVal, bVal); +++ cVal = _mm512_and_si512(aVal, bVal); ++ ++- _mm512_store_si512(cPtr,cVal); // Store the results back into the C container +++ _mm512_store_si512(cPtr, cVal); // Store the results back into the C container ++ ++- aPtr += 16; ++- bPtr += 16; ++- cPtr += 16; ++- } +++ aPtr += 16; +++ bPtr += 16; +++ cPtr += 16; +++ } ++ ++- number = sixteenthPoints * 16; ++- for(;number < num_points; number++){ ++- cVector[number] = aVector[number] & bVector[number]; ++- } +++ number = sixteenthPoints * 16; +++ for (; number < num_points; number++) { +++ cVector[number] = aVector[number] & bVector[number]; +++ } ++ } ++ #endif /* LV_HAVE_AVX512F */ ++ ++ #ifdef LV_HAVE_AVX2 ++ #include ++ ++-static inline void ++-volk_32i_x2_and_32i_a_avx2(int32_t* cVector, const int32_t* aVector, ++- const int32_t* bVector, unsigned int num_points) +++static inline void volk_32i_x2_and_32i_a_avx2(int32_t* cVector, +++ const int32_t* aVector, +++ const int32_t* bVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int oneEightPoints = num_points / 8; +++ unsigned int number = 0; +++ const unsigned int oneEightPoints = num_points / 8; ++ ++- int32_t* cPtr = cVector; ++- const int32_t* aPtr = aVector; ++- const int32_t* bPtr = bVector; +++ int32_t* cPtr = cVector; +++ const int32_t* aPtr = aVector; +++ const int32_t* bPtr = bVector; ++ ++- __m256i aVal, bVal, cVal; ++- for(;number < oneEightPoints; number++){ +++ __m256i aVal, bVal, cVal; +++ for (; number < oneEightPoints; number++) { ++ ++- aVal = _mm256_load_si256((__m256i*)aPtr); ++- bVal = _mm256_load_si256((__m256i*)bPtr); +++ aVal = _mm256_load_si256((__m256i*)aPtr); +++ bVal = _mm256_load_si256((__m256i*)bPtr); ++ ++- cVal = _mm256_and_si256(aVal, bVal); +++ cVal = _mm256_and_si256(aVal, bVal); ++ ++- _mm256_store_si256((__m256i*)cPtr,cVal); // Store the results back into the C container +++ _mm256_store_si256((__m256i*)cPtr, +++ cVal); // Store the results back into the C container ++ ++- aPtr += 8; ++- bPtr += 8; ++- cPtr += 8; ++- } +++ aPtr += 8; +++ bPtr += 8; +++ cPtr += 8; +++ } ++ ++- number = oneEightPoints * 8; ++- for(;number < num_points; number++){ ++- cVector[number] = aVector[number] & bVector[number]; ++- } +++ number = oneEightPoints * 8; +++ for (; number < num_points; number++) { +++ cVector[number] = aVector[number] & bVector[number]; +++ } ++ } ++ #endif /* LV_HAVE_AVX2 */ 
++ ++@@ -160,36 +163,37 @@ volk_32i_x2_and_32i_a_avx2(int32_t* cVector, const int32_t* aVector, ++ #ifdef LV_HAVE_SSE ++ #include ++ ++-static inline void ++-volk_32i_x2_and_32i_a_sse(int32_t* cVector, const int32_t* aVector, ++- const int32_t* bVector, unsigned int num_points) +++static inline void volk_32i_x2_and_32i_a_sse(int32_t* cVector, +++ const int32_t* aVector, +++ const int32_t* bVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int quarterPoints = num_points / 4; +++ unsigned int number = 0; +++ const unsigned int quarterPoints = num_points / 4; ++ ++- float* cPtr = (float*)cVector; ++- const float* aPtr = (float*)aVector; ++- const float* bPtr = (float*)bVector; +++ float* cPtr = (float*)cVector; +++ const float* aPtr = (float*)aVector; +++ const float* bPtr = (float*)bVector; ++ ++- __m128 aVal, bVal, cVal; ++- for(;number < quarterPoints; number++){ +++ __m128 aVal, bVal, cVal; +++ for (; number < quarterPoints; number++) { ++ ++- aVal = _mm_load_ps(aPtr); ++- bVal = _mm_load_ps(bPtr); +++ aVal = _mm_load_ps(aPtr); +++ bVal = _mm_load_ps(bPtr); ++ ++- cVal = _mm_and_ps(aVal, bVal); +++ cVal = _mm_and_ps(aVal, bVal); ++ ++- _mm_store_ps(cPtr,cVal); // Store the results back into the C container +++ _mm_store_ps(cPtr, cVal); // Store the results back into the C container ++ ++- aPtr += 4; ++- bPtr += 4; ++- cPtr += 4; ++- } +++ aPtr += 4; +++ bPtr += 4; +++ cPtr += 4; +++ } ++ ++- number = quarterPoints * 4; ++- for(;number < num_points; number++){ ++- cVector[number] = aVector[number] & bVector[number]; ++- } +++ number = quarterPoints * 4; +++ for (; number < num_points; number++) { +++ cVector[number] = aVector[number] & bVector[number]; +++ } ++ } ++ #endif /* LV_HAVE_SSE */ ++ ++@@ -197,62 +201,67 @@ volk_32i_x2_and_32i_a_sse(int32_t* cVector, const int32_t* aVector, ++ #ifdef LV_HAVE_NEON ++ #include ++ ++-static inline void ++-volk_32i_x2_and_32i_neon(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points) +++static inline void volk_32i_x2_and_32i_neon(int32_t* cVector, +++ const int32_t* aVector, +++ const int32_t* bVector, +++ unsigned int num_points) ++ { ++- int32_t* cPtr = cVector; ++- const int32_t* aPtr = aVector; ++- const int32_t* bPtr= bVector; ++- unsigned int number = 0; ++- unsigned int quarter_points = num_points / 4; ++- ++- int32x4_t a_val, b_val, c_val; ++- ++- for(number = 0; number < quarter_points; number++){ ++- a_val = vld1q_s32(aPtr); ++- b_val = vld1q_s32(bPtr); ++- c_val = vandq_s32(a_val, b_val); ++- vst1q_s32(cPtr, c_val); ++- aPtr += 4; ++- bPtr += 4; ++- cPtr += 4; ++- } ++- ++- for(number = quarter_points * 4; number < num_points; number++){ ++- *cPtr++ = (*aPtr++) & (*bPtr++); ++- } +++ int32_t* cPtr = cVector; +++ const int32_t* aPtr = aVector; +++ const int32_t* bPtr = bVector; +++ unsigned int number = 0; +++ unsigned int quarter_points = num_points / 4; +++ +++ int32x4_t a_val, b_val, c_val; +++ +++ for (number = 0; number < quarter_points; number++) { +++ a_val = vld1q_s32(aPtr); +++ b_val = vld1q_s32(bPtr); +++ c_val = vandq_s32(a_val, b_val); +++ vst1q_s32(cPtr, c_val); +++ aPtr += 4; +++ bPtr += 4; +++ cPtr += 4; +++ } +++ +++ for (number = quarter_points * 4; number < num_points; number++) { +++ *cPtr++ = (*aPtr++) & (*bPtr++); +++ } ++ } ++ #endif /* LV_HAVE_NEON */ ++ ++ ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void ++-volk_32i_x2_and_32i_generic(int32_t* cVector, const int32_t* aVector, ++- const int32_t* bVector, unsigned int num_points) +++static 
inline void volk_32i_x2_and_32i_generic(int32_t* cVector, +++ const int32_t* aVector, +++ const int32_t* bVector, +++ unsigned int num_points) ++ { ++- int32_t* cPtr = cVector; ++- const int32_t* aPtr = aVector; ++- const int32_t* bPtr= bVector; ++- unsigned int number = 0; ++- ++- for(number = 0; number < num_points; number++){ ++- *cPtr++ = (*aPtr++) & (*bPtr++); ++- } +++ int32_t* cPtr = cVector; +++ const int32_t* aPtr = aVector; +++ const int32_t* bPtr = bVector; +++ unsigned int number = 0; +++ +++ for (number = 0; number < num_points; number++) { +++ *cPtr++ = (*aPtr++) & (*bPtr++); +++ } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++ ++ #ifdef LV_HAVE_ORC ++-extern void ++-volk_32i_x2_and_32i_a_orc_impl(int32_t* cVector, const int32_t* aVector, ++- const int32_t* bVector, unsigned int num_points); ++- ++-static inline void ++-volk_32i_x2_and_32i_u_orc(int32_t* cVector, const int32_t* aVector, ++- const int32_t* bVector, unsigned int num_points) +++extern void volk_32i_x2_and_32i_a_orc_impl(int32_t* cVector, +++ const int32_t* aVector, +++ const int32_t* bVector, +++ unsigned int num_points); +++ +++static inline void volk_32i_x2_and_32i_u_orc(int32_t* cVector, +++ const int32_t* aVector, +++ const int32_t* bVector, +++ unsigned int num_points) ++ { ++- volk_32i_x2_and_32i_a_orc_impl(cVector, aVector, bVector, num_points); +++ volk_32i_x2_and_32i_a_orc_impl(cVector, aVector, bVector, num_points); ++ } ++ #endif /* LV_HAVE_ORC */ ++ ++@@ -269,72 +278,75 @@ volk_32i_x2_and_32i_u_orc(int32_t* cVector, const int32_t* aVector, ++ #ifdef LV_HAVE_AVX512F ++ #include ++ ++-static inline void ++-volk_32i_x2_and_32i_u_avx512f(int32_t* cVector, const int32_t* aVector, ++- const int32_t* bVector, unsigned int num_points) +++static inline void volk_32i_x2_and_32i_u_avx512f(int32_t* cVector, +++ const int32_t* aVector, +++ const int32_t* bVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int sixteenthPoints = num_points / 16; +++ unsigned int number = 0; +++ const unsigned int sixteenthPoints = num_points / 16; ++ ++- int32_t* cPtr = (int32_t*)cVector; ++- const int32_t* aPtr = (int32_t*)aVector; ++- const int32_t* bPtr = (int32_t*)bVector; +++ int32_t* cPtr = (int32_t*)cVector; +++ const int32_t* aPtr = (int32_t*)aVector; +++ const int32_t* bPtr = (int32_t*)bVector; ++ ++- __m512i aVal, bVal, cVal; ++- for(;number < sixteenthPoints; number++){ +++ __m512i aVal, bVal, cVal; +++ for (; number < sixteenthPoints; number++) { ++ ++- aVal = _mm512_loadu_si512(aPtr); ++- bVal = _mm512_loadu_si512(bPtr); +++ aVal = _mm512_loadu_si512(aPtr); +++ bVal = _mm512_loadu_si512(bPtr); ++ ++- cVal = _mm512_and_si512(aVal, bVal); +++ cVal = _mm512_and_si512(aVal, bVal); ++ ++- _mm512_storeu_si512(cPtr,cVal); // Store the results back into the C container +++ _mm512_storeu_si512(cPtr, cVal); // Store the results back into the C container ++ ++- aPtr += 16; ++- bPtr += 16; ++- cPtr += 16; ++- } +++ aPtr += 16; +++ bPtr += 16; +++ cPtr += 16; +++ } ++ ++- number = sixteenthPoints * 16; ++- for(;number < num_points; number++){ ++- cVector[number] = aVector[number] & bVector[number]; ++- } +++ number = sixteenthPoints * 16; +++ for (; number < num_points; number++) { +++ cVector[number] = aVector[number] & bVector[number]; +++ } ++ } ++ #endif /* LV_HAVE_AVX512F */ ++ ++ #ifdef LV_HAVE_AVX2 ++ #include ++ ++-static inline void ++-volk_32i_x2_and_32i_u_avx2(int32_t* cVector, const int32_t* aVector, ++- const int32_t* bVector, unsigned int num_points) +++static inline void 
volk_32i_x2_and_32i_u_avx2(int32_t* cVector, +++ const int32_t* aVector, +++ const int32_t* bVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int oneEightPoints = num_points / 8; +++ unsigned int number = 0; +++ const unsigned int oneEightPoints = num_points / 8; ++ ++- int32_t* cPtr = cVector; ++- const int32_t* aPtr = aVector; ++- const int32_t* bPtr = bVector; +++ int32_t* cPtr = cVector; +++ const int32_t* aPtr = aVector; +++ const int32_t* bPtr = bVector; ++ ++- __m256i aVal, bVal, cVal; ++- for(;number < oneEightPoints; number++){ +++ __m256i aVal, bVal, cVal; +++ for (; number < oneEightPoints; number++) { ++ ++- aVal = _mm256_loadu_si256((__m256i*)aPtr); ++- bVal = _mm256_loadu_si256((__m256i*)bPtr); +++ aVal = _mm256_loadu_si256((__m256i*)aPtr); +++ bVal = _mm256_loadu_si256((__m256i*)bPtr); ++ ++- cVal = _mm256_and_si256(aVal, bVal); +++ cVal = _mm256_and_si256(aVal, bVal); ++ ++- _mm256_storeu_si256((__m256i*)cPtr,cVal); // Store the results back into the C container +++ _mm256_storeu_si256((__m256i*)cPtr, +++ cVal); // Store the results back into the C container ++ ++- aPtr += 8; ++- bPtr += 8; ++- cPtr += 8; ++- } +++ aPtr += 8; +++ bPtr += 8; +++ cPtr += 8; +++ } ++ ++- number = oneEightPoints * 8; ++- for(;number < num_points; number++){ ++- cVector[number] = aVector[number] & bVector[number]; ++- } +++ number = oneEightPoints * 8; +++ for (; number < num_points; number++) { +++ cVector[number] = aVector[number] & bVector[number]; +++ } ++ } ++ #endif /* LV_HAVE_AVX2 */ ++ ++diff --git a/kernels/volk/volk_32i_x2_or_32i.h b/kernels/volk/volk_32i_x2_or_32i.h ++index be4c086..b03db89 100644 ++--- a/kernels/volk/volk_32i_x2_or_32i.h +++++ b/kernels/volk/volk_32i_x2_or_32i.h ++@@ -29,8 +29,8 @@ ++ * ++ * Dispatcher Prototype ++ * \code ++- * void volk_32i_x2_or_32i(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points) ++- * \endcode +++ * void volk_32i_x2_or_32i(int32_t* cVector, const int32_t* aVector, const int32_t* +++ * bVector, unsigned int num_points) \endcode ++ * ++ * \b Inputs ++ * \li aVector: Input vector of samples. 
++@@ -87,72 +87,75 @@ ++ #ifdef LV_HAVE_AVX512F ++ #include ++ ++-static inline void ++-volk_32i_x2_or_32i_a_avx512f(int32_t* cVector, const int32_t* aVector, ++- const int32_t* bVector, unsigned int num_points) +++static inline void volk_32i_x2_or_32i_a_avx512f(int32_t* cVector, +++ const int32_t* aVector, +++ const int32_t* bVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int sixteenthPoints = num_points / 16; +++ unsigned int number = 0; +++ const unsigned int sixteenthPoints = num_points / 16; ++ ++- int32_t* cPtr = (int32_t*)cVector; ++- const int32_t* aPtr = (int32_t*)aVector; ++- const int32_t* bPtr = (int32_t*)bVector; +++ int32_t* cPtr = (int32_t*)cVector; +++ const int32_t* aPtr = (int32_t*)aVector; +++ const int32_t* bPtr = (int32_t*)bVector; ++ ++- __m512i aVal, bVal, cVal; ++- for(;number < sixteenthPoints; number++){ +++ __m512i aVal, bVal, cVal; +++ for (; number < sixteenthPoints; number++) { ++ ++- aVal = _mm512_load_si512(aPtr); ++- bVal = _mm512_load_si512(bPtr); +++ aVal = _mm512_load_si512(aPtr); +++ bVal = _mm512_load_si512(bPtr); ++ ++- cVal = _mm512_or_si512(aVal, bVal); +++ cVal = _mm512_or_si512(aVal, bVal); ++ ++- _mm512_store_si512(cPtr,cVal); // Store the results back into the C container +++ _mm512_store_si512(cPtr, cVal); // Store the results back into the C container ++ ++- aPtr += 16; ++- bPtr += 16; ++- cPtr += 16; ++- } +++ aPtr += 16; +++ bPtr += 16; +++ cPtr += 16; +++ } ++ ++- number = sixteenthPoints * 16; ++- for(;number < num_points; number++){ ++- cVector[number] = aVector[number] | bVector[number]; ++- } +++ number = sixteenthPoints * 16; +++ for (; number < num_points; number++) { +++ cVector[number] = aVector[number] | bVector[number]; +++ } ++ } ++ #endif /* LV_HAVE_AVX512F */ ++ ++ #ifdef LV_HAVE_AVX2 ++ #include ++ ++-static inline void ++-volk_32i_x2_or_32i_a_avx2(int32_t* cVector, const int32_t* aVector, ++- const int32_t* bVector, unsigned int num_points) +++static inline void volk_32i_x2_or_32i_a_avx2(int32_t* cVector, +++ const int32_t* aVector, +++ const int32_t* bVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int oneEightPoints = num_points / 8; +++ unsigned int number = 0; +++ const unsigned int oneEightPoints = num_points / 8; ++ ++- int32_t* cPtr = cVector; ++- const int32_t* aPtr = aVector; ++- const int32_t* bPtr = bVector; +++ int32_t* cPtr = cVector; +++ const int32_t* aPtr = aVector; +++ const int32_t* bPtr = bVector; ++ ++- __m256i aVal, bVal, cVal; ++- for(;number < oneEightPoints; number++){ +++ __m256i aVal, bVal, cVal; +++ for (; number < oneEightPoints; number++) { ++ ++- aVal = _mm256_load_si256((__m256i*)aPtr); ++- bVal = _mm256_load_si256((__m256i*)bPtr); +++ aVal = _mm256_load_si256((__m256i*)aPtr); +++ bVal = _mm256_load_si256((__m256i*)bPtr); ++ ++- cVal = _mm256_or_si256(aVal, bVal); +++ cVal = _mm256_or_si256(aVal, bVal); ++ ++- _mm256_store_si256((__m256i*)cPtr,cVal); // Store the results back into the C container +++ _mm256_store_si256((__m256i*)cPtr, +++ cVal); // Store the results back into the C container ++ ++- aPtr += 8; ++- bPtr += 8; ++- cPtr += 8; ++- } +++ aPtr += 8; +++ bPtr += 8; +++ cPtr += 8; +++ } ++ ++- number = oneEightPoints * 8; ++- for(;number < num_points; number++){ ++- cVector[number] = aVector[number] | bVector[number]; ++- } +++ number = oneEightPoints * 8; +++ for (; number < num_points; number++) { +++ cVector[number] = aVector[number] | bVector[number]; +++ } ++ } ++ #endif /* LV_HAVE_AVX2 */ ++ ++@@ 
-160,35 +163,36 @@ volk_32i_x2_or_32i_a_avx2(int32_t* cVector, const int32_t* aVector, ++ #ifdef LV_HAVE_SSE ++ #include ++ ++-static inline void ++-volk_32i_x2_or_32i_a_sse(int32_t* cVector, const int32_t* aVector, ++- const int32_t* bVector, unsigned int num_points) +++static inline void volk_32i_x2_or_32i_a_sse(int32_t* cVector, +++ const int32_t* aVector, +++ const int32_t* bVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int quarterPoints = num_points / 4; +++ unsigned int number = 0; +++ const unsigned int quarterPoints = num_points / 4; ++ ++- float* cPtr = (float*)cVector; ++- const float* aPtr = (float*)aVector; ++- const float* bPtr = (float*)bVector; +++ float* cPtr = (float*)cVector; +++ const float* aPtr = (float*)aVector; +++ const float* bPtr = (float*)bVector; ++ ++- __m128 aVal, bVal, cVal; ++- for(;number < quarterPoints; number++){ ++- aVal = _mm_load_ps(aPtr); ++- bVal = _mm_load_ps(bPtr); +++ __m128 aVal, bVal, cVal; +++ for (; number < quarterPoints; number++) { +++ aVal = _mm_load_ps(aPtr); +++ bVal = _mm_load_ps(bPtr); ++ ++- cVal = _mm_or_ps(aVal, bVal); +++ cVal = _mm_or_ps(aVal, bVal); ++ ++- _mm_store_ps(cPtr,cVal); // Store the results back into the C container +++ _mm_store_ps(cPtr, cVal); // Store the results back into the C container ++ ++- aPtr += 4; ++- bPtr += 4; ++- cPtr += 4; ++- } +++ aPtr += 4; +++ bPtr += 4; +++ cPtr += 4; +++ } ++ ++- number = quarterPoints * 4; ++- for(;number < num_points; number++){ ++- cVector[number] = aVector[number] | bVector[number]; ++- } +++ number = quarterPoints * 4; +++ for (; number < num_points; number++) { +++ cVector[number] = aVector[number] | bVector[number]; +++ } ++ } ++ #endif /* LV_HAVE_SSE */ ++ ++@@ -196,63 +200,67 @@ volk_32i_x2_or_32i_a_sse(int32_t* cVector, const int32_t* aVector, ++ #ifdef LV_HAVE_NEON ++ #include ++ ++-static inline void ++-volk_32i_x2_or_32i_neon(int32_t* cVector, const int32_t* aVector, ++- const int32_t* bVector, unsigned int num_points) +++static inline void volk_32i_x2_or_32i_neon(int32_t* cVector, +++ const int32_t* aVector, +++ const int32_t* bVector, +++ unsigned int num_points) ++ { ++- int32_t* cPtr = cVector; ++- const int32_t* aPtr = aVector; ++- const int32_t* bPtr= bVector; ++- unsigned int number = 0; ++- unsigned int quarter_points = num_points / 4; ++- ++- int32x4_t a_val, b_val, c_val; ++- ++- for(number = 0; number < quarter_points; number++){ ++- a_val = vld1q_s32(aPtr); ++- b_val = vld1q_s32(bPtr); ++- c_val = vorrq_s32(a_val, b_val); ++- vst1q_s32(cPtr, c_val); ++- aPtr += 4; ++- bPtr += 4; ++- cPtr += 4; ++- } ++- ++- for(number = quarter_points * 4; number < num_points; number++){ ++- *cPtr++ = (*aPtr++) | (*bPtr++); ++- } +++ int32_t* cPtr = cVector; +++ const int32_t* aPtr = aVector; +++ const int32_t* bPtr = bVector; +++ unsigned int number = 0; +++ unsigned int quarter_points = num_points / 4; +++ +++ int32x4_t a_val, b_val, c_val; +++ +++ for (number = 0; number < quarter_points; number++) { +++ a_val = vld1q_s32(aPtr); +++ b_val = vld1q_s32(bPtr); +++ c_val = vorrq_s32(a_val, b_val); +++ vst1q_s32(cPtr, c_val); +++ aPtr += 4; +++ bPtr += 4; +++ cPtr += 4; +++ } +++ +++ for (number = quarter_points * 4; number < num_points; number++) { +++ *cPtr++ = (*aPtr++) | (*bPtr++); +++ } ++ } ++ #endif /* LV_HAVE_NEON */ ++ ++ ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void ++-volk_32i_x2_or_32i_generic(int32_t* cVector, const int32_t* aVector, ++- const int32_t* bVector, unsigned int num_points) +++static inline void 
volk_32i_x2_or_32i_generic(int32_t* cVector, +++ const int32_t* aVector, +++ const int32_t* bVector, +++ unsigned int num_points) ++ { ++- int32_t* cPtr = cVector; ++- const int32_t* aPtr = aVector; ++- const int32_t* bPtr= bVector; ++- unsigned int number = 0; ++- ++- for(number = 0; number < num_points; number++){ ++- *cPtr++ = (*aPtr++) | (*bPtr++); ++- } +++ int32_t* cPtr = cVector; +++ const int32_t* aPtr = aVector; +++ const int32_t* bPtr = bVector; +++ unsigned int number = 0; +++ +++ for (number = 0; number < num_points; number++) { +++ *cPtr++ = (*aPtr++) | (*bPtr++); +++ } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++ ++ #ifdef LV_HAVE_ORC ++-extern void ++-volk_32i_x2_or_32i_a_orc_impl(int32_t* cVector, const int32_t* aVector, ++- const int32_t* bVector, unsigned int num_points); ++- ++-static inline void ++-volk_32i_x2_or_32i_u_orc(int32_t* cVector, const int32_t* aVector, ++- const int32_t* bVector, unsigned int num_points) +++extern void volk_32i_x2_or_32i_a_orc_impl(int32_t* cVector, +++ const int32_t* aVector, +++ const int32_t* bVector, +++ unsigned int num_points); +++ +++static inline void volk_32i_x2_or_32i_u_orc(int32_t* cVector, +++ const int32_t* aVector, +++ const int32_t* bVector, +++ unsigned int num_points) ++ { ++- volk_32i_x2_or_32i_a_orc_impl(cVector, aVector, bVector, num_points); +++ volk_32i_x2_or_32i_a_orc_impl(cVector, aVector, bVector, num_points); ++ } ++ #endif /* LV_HAVE_ORC */ ++ ++@@ -269,72 +277,75 @@ volk_32i_x2_or_32i_u_orc(int32_t* cVector, const int32_t* aVector, ++ #ifdef LV_HAVE_AVX512F ++ #include ++ ++-static inline void ++-volk_32i_x2_or_32i_u_avx512f(int32_t* cVector, const int32_t* aVector, ++- const int32_t* bVector, unsigned int num_points) +++static inline void volk_32i_x2_or_32i_u_avx512f(int32_t* cVector, +++ const int32_t* aVector, +++ const int32_t* bVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int sixteenthPoints = num_points / 16; +++ unsigned int number = 0; +++ const unsigned int sixteenthPoints = num_points / 16; ++ ++- int32_t* cPtr = (int32_t*)cVector; ++- const int32_t* aPtr = (int32_t*)aVector; ++- const int32_t* bPtr = (int32_t*)bVector; +++ int32_t* cPtr = (int32_t*)cVector; +++ const int32_t* aPtr = (int32_t*)aVector; +++ const int32_t* bPtr = (int32_t*)bVector; ++ ++- __m512i aVal, bVal, cVal; ++- for(;number < sixteenthPoints; number++){ +++ __m512i aVal, bVal, cVal; +++ for (; number < sixteenthPoints; number++) { ++ ++- aVal = _mm512_loadu_si512(aPtr); ++- bVal = _mm512_loadu_si512(bPtr); +++ aVal = _mm512_loadu_si512(aPtr); +++ bVal = _mm512_loadu_si512(bPtr); ++ ++- cVal = _mm512_or_si512(aVal, bVal); +++ cVal = _mm512_or_si512(aVal, bVal); ++ ++- _mm512_storeu_si512(cPtr,cVal); // Store the results back into the C container +++ _mm512_storeu_si512(cPtr, cVal); // Store the results back into the C container ++ ++- aPtr += 16; ++- bPtr += 16; ++- cPtr += 16; ++- } +++ aPtr += 16; +++ bPtr += 16; +++ cPtr += 16; +++ } ++ ++- number = sixteenthPoints * 16; ++- for(;number < num_points; number++){ ++- cVector[number] = aVector[number] | bVector[number]; ++- } +++ number = sixteenthPoints * 16; +++ for (; number < num_points; number++) { +++ cVector[number] = aVector[number] | bVector[number]; +++ } ++ } ++ #endif /* LV_HAVE_AVX512F */ ++ ++ #ifdef LV_HAVE_AVX2 ++ #include ++ ++-static inline void ++-volk_32i_x2_or_32i_u_avx2(int32_t* cVector, const int32_t* aVector, ++- const int32_t* bVector, unsigned int num_points) +++static inline void volk_32i_x2_or_32i_u_avx2(int32_t* 
cVector, +++ const int32_t* aVector, +++ const int32_t* bVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int oneEightPoints = num_points / 8; +++ unsigned int number = 0; +++ const unsigned int oneEightPoints = num_points / 8; ++ ++- int32_t* cPtr = cVector; ++- const int32_t* aPtr = aVector; ++- const int32_t* bPtr = bVector; +++ int32_t* cPtr = cVector; +++ const int32_t* aPtr = aVector; +++ const int32_t* bPtr = bVector; ++ ++- __m256i aVal, bVal, cVal; ++- for(;number < oneEightPoints; number++){ +++ __m256i aVal, bVal, cVal; +++ for (; number < oneEightPoints; number++) { ++ ++- aVal = _mm256_loadu_si256((__m256i*)aPtr); ++- bVal = _mm256_loadu_si256((__m256i*)bPtr); +++ aVal = _mm256_loadu_si256((__m256i*)aPtr); +++ bVal = _mm256_loadu_si256((__m256i*)bPtr); ++ ++- cVal = _mm256_or_si256(aVal, bVal); +++ cVal = _mm256_or_si256(aVal, bVal); ++ ++- _mm256_storeu_si256((__m256i*)cPtr,cVal); // Store the results back into the C container +++ _mm256_storeu_si256((__m256i*)cPtr, +++ cVal); // Store the results back into the C container ++ ++- aPtr += 8; ++- bPtr += 8; ++- cPtr += 8; ++- } +++ aPtr += 8; +++ bPtr += 8; +++ cPtr += 8; +++ } ++ ++- number = oneEightPoints * 8; ++- for(;number < num_points; number++){ ++- cVector[number] = aVector[number] | bVector[number]; ++- } +++ number = oneEightPoints * 8; +++ for (; number < num_points; number++) { +++ cVector[number] = aVector[number] | bVector[number]; +++ } ++ } ++ #endif /* LV_HAVE_AVX2 */ ++ ++diff --git a/kernels/volk/volk_32u_byteswap.h b/kernels/volk/volk_32u_byteswap.h ++index f5e6f11..185047c 100644 ++--- a/kernels/volk/volk_32u_byteswap.h +++++ b/kernels/volk/volk_32u_byteswap.h ++@@ -71,38 +71,42 @@ ++ ++ #if LV_HAVE_AVX2 ++ #include ++-static inline void volk_32u_byteswap_u_avx2(uint32_t* intsToSwap, unsigned int num_points){ +++static inline void volk_32u_byteswap_u_avx2(uint32_t* intsToSwap, unsigned int num_points) +++{ ++ ++- unsigned int number; +++ unsigned int number; ++ ++- const unsigned int nPerSet = 8; ++- const uint64_t nSets = num_points / nPerSet; +++ const unsigned int nPerSet = 8; +++ const uint64_t nSets = num_points / nPerSet; ++ ++- uint32_t* inputPtr = intsToSwap; +++ uint32_t* inputPtr = intsToSwap; ++ ++- const uint8_t shuffleVector[32] = { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12, 19, 18, 17, 16, 23, 22, 21, 20, 27, 26, 25, 24, 31, 30, 29, 28 }; +++ const uint8_t shuffleVector[32] = { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, +++ 8, 15, 14, 13, 12, 19, 18, 17, 16, 23, 22, +++ 21, 20, 27, 26, 25, 24, 31, 30, 29, 28 }; ++ ++- const __m256i myShuffle = _mm256_loadu_si256((__m256i*) &shuffleVector); +++ const __m256i myShuffle = _mm256_loadu_si256((__m256i*)&shuffleVector); ++ ++- for (number = 0 ;number < nSets; number++) { +++ for (number = 0; number < nSets; number++) { ++ ++- // Load the 32t values, increment inputPtr later since we're doing it in-place. ++- const __m256i input = _mm256_loadu_si256((__m256i*)inputPtr); ++- const __m256i output = _mm256_shuffle_epi8(input,myShuffle); +++ // Load the 32t values, increment inputPtr later since we're doing it in-place. 
+++ const __m256i input = _mm256_loadu_si256((__m256i*)inputPtr); +++ const __m256i output = _mm256_shuffle_epi8(input, myShuffle); ++ ++- // Store the results ++- _mm256_storeu_si256((__m256i*)inputPtr, output); ++- inputPtr += nPerSet; ++- } ++- _mm256_zeroupper(); ++- ++- // Byteswap any remaining points: ++- for(number = nSets * nPerSet; number < num_points; number++){ ++- uint32_t outputVal = *inputPtr; ++- outputVal = (((outputVal >> 24) & 0xff) | ((outputVal >> 8) & 0x0000ff00) | ((outputVal << 8) & 0x00ff0000) | ((outputVal << 24) & 0xff000000)); ++- *inputPtr = outputVal; ++- inputPtr++; ++- } +++ // Store the results +++ _mm256_storeu_si256((__m256i*)inputPtr, output); +++ inputPtr += nPerSet; +++ } +++ _mm256_zeroupper(); +++ +++ // Byteswap any remaining points: +++ for (number = nSets * nPerSet; number < num_points; number++) { +++ uint32_t outputVal = *inputPtr; +++ outputVal = (((outputVal >> 24) & 0xff) | ((outputVal >> 8) & 0x0000ff00) | +++ ((outputVal << 8) & 0x00ff0000) | ((outputVal << 24) & 0xff000000)); +++ *inputPtr = outputVal; +++ inputPtr++; +++ } ++ } ++ #endif /* LV_HAVE_AVX2 */ ++ ++@@ -110,42 +114,44 @@ static inline void volk_32u_byteswap_u_avx2(uint32_t* intsToSwap, unsigned int n ++ #ifdef LV_HAVE_SSE2 ++ #include ++ ++-static inline void volk_32u_byteswap_u_sse2(uint32_t* intsToSwap, unsigned int num_points){ ++- unsigned int number = 0; ++- ++- uint32_t* inputPtr = intsToSwap; ++- __m128i input, byte1, byte2, byte3, byte4, output; ++- __m128i byte2mask = _mm_set1_epi32(0x00FF0000); ++- __m128i byte3mask = _mm_set1_epi32(0x0000FF00); ++- ++- const uint64_t quarterPoints = num_points / 4; ++- for(;number < quarterPoints; number++){ ++- // Load the 32t values, increment inputPtr later since we're doing it in-place. ++- input = _mm_loadu_si128((__m128i*)inputPtr); ++- // Do the four shifts ++- byte1 = _mm_slli_epi32(input, 24); ++- byte2 = _mm_slli_epi32(input, 8); ++- byte3 = _mm_srli_epi32(input, 8); ++- byte4 = _mm_srli_epi32(input, 24); ++- // Or bytes together ++- output = _mm_or_si128(byte1, byte4); ++- byte2 = _mm_and_si128(byte2, byte2mask); ++- output = _mm_or_si128(output, byte2); ++- byte3 = _mm_and_si128(byte3, byte3mask); ++- output = _mm_or_si128(output, byte3); ++- // Store the results ++- _mm_storeu_si128((__m128i*)inputPtr, output); ++- inputPtr += 4; ++- } ++- ++- // Byteswap any remaining points: ++- number = quarterPoints*4; ++- for(; number < num_points; number++){ ++- uint32_t outputVal = *inputPtr; ++- outputVal = (((outputVal >> 24) & 0xff) | ((outputVal >> 8) & 0x0000ff00) | ((outputVal << 8) & 0x00ff0000) | ((outputVal << 24) & 0xff000000)); ++- *inputPtr = outputVal; ++- inputPtr++; ++- } +++static inline void volk_32u_byteswap_u_sse2(uint32_t* intsToSwap, unsigned int num_points) +++{ +++ unsigned int number = 0; +++ +++ uint32_t* inputPtr = intsToSwap; +++ __m128i input, byte1, byte2, byte3, byte4, output; +++ __m128i byte2mask = _mm_set1_epi32(0x00FF0000); +++ __m128i byte3mask = _mm_set1_epi32(0x0000FF00); +++ +++ const uint64_t quarterPoints = num_points / 4; +++ for (; number < quarterPoints; number++) { +++ // Load the 32t values, increment inputPtr later since we're doing it in-place. 
+++ input = _mm_loadu_si128((__m128i*)inputPtr); +++ // Do the four shifts +++ byte1 = _mm_slli_epi32(input, 24); +++ byte2 = _mm_slli_epi32(input, 8); +++ byte3 = _mm_srli_epi32(input, 8); +++ byte4 = _mm_srli_epi32(input, 24); +++ // Or bytes together +++ output = _mm_or_si128(byte1, byte4); +++ byte2 = _mm_and_si128(byte2, byte2mask); +++ output = _mm_or_si128(output, byte2); +++ byte3 = _mm_and_si128(byte3, byte3mask); +++ output = _mm_or_si128(output, byte3); +++ // Store the results +++ _mm_storeu_si128((__m128i*)inputPtr, output); +++ inputPtr += 4; +++ } +++ +++ // Byteswap any remaining points: +++ number = quarterPoints * 4; +++ for (; number < num_points; number++) { +++ uint32_t outputVal = *inputPtr; +++ outputVal = (((outputVal >> 24) & 0xff) | ((outputVal >> 8) & 0x0000ff00) | +++ ((outputVal << 8) & 0x00ff0000) | ((outputVal << 24) & 0xff000000)); +++ *inputPtr = outputVal; +++ inputPtr++; +++ } ++ } ++ #endif /* LV_HAVE_SSE2 */ ++ ++@@ -153,100 +159,106 @@ static inline void volk_32u_byteswap_u_sse2(uint32_t* intsToSwap, unsigned int n ++ #ifdef LV_HAVE_NEON ++ #include ++ ++-static inline void volk_32u_byteswap_neon(uint32_t* intsToSwap, unsigned int num_points){ ++- uint32_t* inputPtr = intsToSwap; ++- unsigned int number = 0; ++- unsigned int n8points = num_points / 8; ++- ++- uint8x8x4_t input_table; ++- uint8x8_t int_lookup01, int_lookup23, int_lookup45, int_lookup67; ++- uint8x8_t swapped_int01, swapped_int23, swapped_int45, swapped_int67; ++- ++- /* these magic numbers are used as byte-indices in the LUT. ++- they are pre-computed to save time. A simple C program ++- can calculate them; for example for lookup01: ++- uint8_t chars[8] = {24, 16, 8, 0, 25, 17, 9, 1}; ++- for(ii=0; ii < 8; ++ii) { ++- index += ((uint64_t)(*(chars+ii))) << (ii*8); +++static inline void volk_32u_byteswap_neon(uint32_t* intsToSwap, unsigned int num_points) +++{ +++ uint32_t* inputPtr = intsToSwap; +++ unsigned int number = 0; +++ unsigned int n8points = num_points / 8; +++ +++ uint8x8x4_t input_table; +++ uint8x8_t int_lookup01, int_lookup23, int_lookup45, int_lookup67; +++ uint8x8_t swapped_int01, swapped_int23, swapped_int45, swapped_int67; +++ +++ /* these magic numbers are used as byte-indices in the LUT. +++ they are pre-computed to save time. 
A simple C program +++ can calculate them; for example for lookup01: +++ uint8_t chars[8] = {24, 16, 8, 0, 25, 17, 9, 1}; +++ for(ii=0; ii < 8; ++ii) { +++ index += ((uint64_t)(*(chars+ii))) << (ii*8); +++ } +++ */ +++ int_lookup01 = vcreate_u8(74609667900706840); +++ int_lookup23 = vcreate_u8(219290013576860186); +++ int_lookup45 = vcreate_u8(363970359253013532); +++ int_lookup67 = vcreate_u8(508650704929166878); +++ +++ for (number = 0; number < n8points; ++number) { +++ input_table = vld4_u8((uint8_t*)inputPtr); +++ swapped_int01 = vtbl4_u8(input_table, int_lookup01); +++ swapped_int23 = vtbl4_u8(input_table, int_lookup23); +++ swapped_int45 = vtbl4_u8(input_table, int_lookup45); +++ swapped_int67 = vtbl4_u8(input_table, int_lookup67); +++ vst1_u8((uint8_t*)inputPtr, swapped_int01); +++ vst1_u8((uint8_t*)(inputPtr + 2), swapped_int23); +++ vst1_u8((uint8_t*)(inputPtr + 4), swapped_int45); +++ vst1_u8((uint8_t*)(inputPtr + 6), swapped_int67); +++ +++ inputPtr += 8; +++ } +++ +++ for (number = n8points * 8; number < num_points; ++number) { +++ uint32_t output = *inputPtr; +++ output = (((output >> 24) & 0xff) | ((output >> 8) & 0x0000ff00) | +++ ((output << 8) & 0x00ff0000) | ((output << 24) & 0xff000000)); +++ +++ *inputPtr = output; +++ inputPtr++; ++ } ++- */ ++- int_lookup01 = vcreate_u8(74609667900706840); ++- int_lookup23 = vcreate_u8(219290013576860186); ++- int_lookup45 = vcreate_u8(363970359253013532); ++- int_lookup67 = vcreate_u8(508650704929166878); ++- ++- for(number = 0; number < n8points; ++number){ ++- input_table = vld4_u8((uint8_t*) inputPtr); ++- swapped_int01 = vtbl4_u8(input_table, int_lookup01); ++- swapped_int23 = vtbl4_u8(input_table, int_lookup23); ++- swapped_int45 = vtbl4_u8(input_table, int_lookup45); ++- swapped_int67 = vtbl4_u8(input_table, int_lookup67); ++- vst1_u8((uint8_t*) inputPtr, swapped_int01); ++- vst1_u8((uint8_t*) (inputPtr+2), swapped_int23); ++- vst1_u8((uint8_t*) (inputPtr+4), swapped_int45); ++- vst1_u8((uint8_t*) (inputPtr+6), swapped_int67); ++- ++- inputPtr += 8; ++- } ++- ++- for(number = n8points * 8; number < num_points; ++number){ ++- uint32_t output = *inputPtr; ++- output = (((output >> 24) & 0xff) | ((output >> 8) & 0x0000ff00) | ((output << 8) & 0x00ff0000) | ((output << 24) & 0xff000000)); ++- ++- *inputPtr = output; ++- inputPtr++; ++- } ++ } ++ #endif /* LV_HAVE_NEON */ ++ ++ #ifdef LV_HAVE_NEONV8 ++ #include ++ ++-static inline void volk_32u_byteswap_neonv8(uint32_t* intsToSwap, unsigned int num_points){ ++- uint32_t* inputPtr = (uint32_t*)intsToSwap; ++- const unsigned int n8points = num_points / 8; ++- uint8x16_t input; ++- uint8x16_t idx = { 3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12 }; ++- ++- unsigned int number = 0; ++- for(number = 0; number < n8points; ++number){ ++- __VOLK_PREFETCH(inputPtr+8); ++- input = vld1q_u8((uint8_t*) inputPtr); ++- input = vqtbl1q_u8(input, idx); ++- vst1q_u8((uint8_t*) inputPtr, input); ++- inputPtr += 4; ++- ++- input = vld1q_u8((uint8_t*) inputPtr); ++- input = vqtbl1q_u8(input, idx); ++- vst1q_u8((uint8_t*) inputPtr, input); ++- inputPtr += 4; ++- } ++- ++- for(number = n8points * 8; number < num_points; ++number){ ++- uint32_t output = *inputPtr; +++static inline void volk_32u_byteswap_neonv8(uint32_t* intsToSwap, unsigned int num_points) +++{ +++ uint32_t* inputPtr = (uint32_t*)intsToSwap; +++ const unsigned int n8points = num_points / 8; +++ uint8x16_t input; +++ uint8x16_t idx = { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 }; +++ +++ unsigned int number = 0; +++ for (number = 0; 
number < n8points; ++number) { +++ __VOLK_PREFETCH(inputPtr + 8); +++ input = vld1q_u8((uint8_t*)inputPtr); +++ input = vqtbl1q_u8(input, idx); +++ vst1q_u8((uint8_t*)inputPtr, input); +++ inputPtr += 4; +++ +++ input = vld1q_u8((uint8_t*)inputPtr); +++ input = vqtbl1q_u8(input, idx); +++ vst1q_u8((uint8_t*)inputPtr, input); +++ inputPtr += 4; +++ } ++ ++- output = (((output >> 24) & 0xff) | ((output >> 8) & 0x0000ff00) | ((output << 8) & 0x00ff0000) | ((output << 24) & 0xff000000)); +++ for (number = n8points * 8; number < num_points; ++number) { +++ uint32_t output = *inputPtr; ++ ++- *inputPtr++ = output; ++- } +++ output = (((output >> 24) & 0xff) | ((output >> 8) & 0x0000ff00) | +++ ((output << 8) & 0x00ff0000) | ((output << 24) & 0xff000000)); ++ +++ *inputPtr++ = output; +++ } ++ } ++ #endif /* LV_HAVE_NEONV8 */ ++ ++ ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void volk_32u_byteswap_generic(uint32_t* intsToSwap, unsigned int num_points){ ++- uint32_t* inputPtr = intsToSwap; +++static inline void volk_32u_byteswap_generic(uint32_t* intsToSwap, +++ unsigned int num_points) +++{ +++ uint32_t* inputPtr = intsToSwap; ++ ++- unsigned int point; ++- for(point = 0; point < num_points; point++){ ++- uint32_t output = *inputPtr; ++- output = (((output >> 24) & 0xff) | ((output >> 8) & 0x0000ff00) | ((output << 8) & 0x00ff0000) | ((output << 24) & 0xff000000)); +++ unsigned int point; +++ for (point = 0; point < num_points; point++) { +++ uint32_t output = *inputPtr; +++ output = (((output >> 24) & 0xff) | ((output >> 8) & 0x0000ff00) | +++ ((output << 8) & 0x00ff0000) | ((output << 24) & 0xff000000)); ++ ++- *inputPtr = output; ++- inputPtr++; ++- } +++ *inputPtr = output; +++ inputPtr++; +++ } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++@@ -261,38 +273,42 @@ static inline void volk_32u_byteswap_generic(uint32_t* intsToSwap, unsigned int ++ ++ #if LV_HAVE_AVX2 ++ #include ++-static inline void volk_32u_byteswap_a_avx2(uint32_t* intsToSwap, unsigned int num_points){ +++static inline void volk_32u_byteswap_a_avx2(uint32_t* intsToSwap, unsigned int num_points) +++{ ++ ++- unsigned int number; +++ unsigned int number; ++ ++- const unsigned int nPerSet = 8; ++- const uint64_t nSets = num_points / nPerSet; +++ const unsigned int nPerSet = 8; +++ const uint64_t nSets = num_points / nPerSet; ++ ++- uint32_t* inputPtr = intsToSwap; +++ uint32_t* inputPtr = intsToSwap; ++ ++- const uint8_t shuffleVector[32] = { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12, 19, 18, 17, 16, 23, 22, 21, 20, 27, 26, 25, 24, 31, 30, 29, 28 }; +++ const uint8_t shuffleVector[32] = { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, +++ 8, 15, 14, 13, 12, 19, 18, 17, 16, 23, 22, +++ 21, 20, 27, 26, 25, 24, 31, 30, 29, 28 }; ++ ++- const __m256i myShuffle = _mm256_loadu_si256((__m256i*) &shuffleVector); +++ const __m256i myShuffle = _mm256_loadu_si256((__m256i*)&shuffleVector); ++ ++- for (number = 0 ;number < nSets; number++) { +++ for (number = 0; number < nSets; number++) { ++ ++- // Load the 32t values, increment inputPtr later since we're doing it in-place. ++- const __m256i input = _mm256_load_si256((__m256i*)inputPtr); ++- const __m256i output = _mm256_shuffle_epi8(input,myShuffle); +++ // Load the 32t values, increment inputPtr later since we're doing it in-place. 
+++ const __m256i input = _mm256_load_si256((__m256i*)inputPtr); +++ const __m256i output = _mm256_shuffle_epi8(input, myShuffle); ++ ++- // Store the results ++- _mm256_store_si256((__m256i*)inputPtr, output); ++- inputPtr += nPerSet; ++- } ++- _mm256_zeroupper(); ++- ++- // Byteswap any remaining points: ++- for(number = nSets * nPerSet; number < num_points; number++){ ++- uint32_t outputVal = *inputPtr; ++- outputVal = (((outputVal >> 24) & 0xff) | ((outputVal >> 8) & 0x0000ff00) | ((outputVal << 8) & 0x00ff0000) | ((outputVal << 24) & 0xff000000)); ++- *inputPtr = outputVal; ++- inputPtr++; ++- } +++ // Store the results +++ _mm256_store_si256((__m256i*)inputPtr, output); +++ inputPtr += nPerSet; +++ } +++ _mm256_zeroupper(); +++ +++ // Byteswap any remaining points: +++ for (number = nSets * nPerSet; number < num_points; number++) { +++ uint32_t outputVal = *inputPtr; +++ outputVal = (((outputVal >> 24) & 0xff) | ((outputVal >> 8) & 0x0000ff00) | +++ ((outputVal << 8) & 0x00ff0000) | ((outputVal << 24) & 0xff000000)); +++ *inputPtr = outputVal; +++ inputPtr++; +++ } ++ } ++ #endif /* LV_HAVE_AVX2 */ ++ ++@@ -301,63 +317,66 @@ static inline void volk_32u_byteswap_a_avx2(uint32_t* intsToSwap, unsigned int n ++ #include ++ ++ ++-static inline void volk_32u_byteswap_a_sse2(uint32_t* intsToSwap, unsigned int num_points){ ++- unsigned int number = 0; ++- ++- uint32_t* inputPtr = intsToSwap; ++- __m128i input, byte1, byte2, byte3, byte4, output; ++- __m128i byte2mask = _mm_set1_epi32(0x00FF0000); ++- __m128i byte3mask = _mm_set1_epi32(0x0000FF00); ++- ++- const uint64_t quarterPoints = num_points / 4; ++- for(;number < quarterPoints; number++){ ++- // Load the 32t values, increment inputPtr later since we're doing it in-place. ++- input = _mm_load_si128((__m128i*)inputPtr); ++- // Do the four shifts ++- byte1 = _mm_slli_epi32(input, 24); ++- byte2 = _mm_slli_epi32(input, 8); ++- byte3 = _mm_srli_epi32(input, 8); ++- byte4 = _mm_srli_epi32(input, 24); ++- // Or bytes together ++- output = _mm_or_si128(byte1, byte4); ++- byte2 = _mm_and_si128(byte2, byte2mask); ++- output = _mm_or_si128(output, byte2); ++- byte3 = _mm_and_si128(byte3, byte3mask); ++- output = _mm_or_si128(output, byte3); ++- // Store the results ++- _mm_store_si128((__m128i*)inputPtr, output); ++- inputPtr += 4; ++- } ++- ++- // Byteswap any remaining points: ++- number = quarterPoints*4; ++- for(; number < num_points; number++){ ++- uint32_t outputVal = *inputPtr; ++- outputVal = (((outputVal >> 24) & 0xff) | ((outputVal >> 8) & 0x0000ff00) | ((outputVal << 8) & 0x00ff0000) | ((outputVal << 24) & 0xff000000)); ++- *inputPtr = outputVal; ++- inputPtr++; ++- } +++static inline void volk_32u_byteswap_a_sse2(uint32_t* intsToSwap, unsigned int num_points) +++{ +++ unsigned int number = 0; +++ +++ uint32_t* inputPtr = intsToSwap; +++ __m128i input, byte1, byte2, byte3, byte4, output; +++ __m128i byte2mask = _mm_set1_epi32(0x00FF0000); +++ __m128i byte3mask = _mm_set1_epi32(0x0000FF00); +++ +++ const uint64_t quarterPoints = num_points / 4; +++ for (; number < quarterPoints; number++) { +++ // Load the 32t values, increment inputPtr later since we're doing it in-place. 
+++ input = _mm_load_si128((__m128i*)inputPtr); +++ // Do the four shifts +++ byte1 = _mm_slli_epi32(input, 24); +++ byte2 = _mm_slli_epi32(input, 8); +++ byte3 = _mm_srli_epi32(input, 8); +++ byte4 = _mm_srli_epi32(input, 24); +++ // Or bytes together +++ output = _mm_or_si128(byte1, byte4); +++ byte2 = _mm_and_si128(byte2, byte2mask); +++ output = _mm_or_si128(output, byte2); +++ byte3 = _mm_and_si128(byte3, byte3mask); +++ output = _mm_or_si128(output, byte3); +++ // Store the results +++ _mm_store_si128((__m128i*)inputPtr, output); +++ inputPtr += 4; +++ } +++ +++ // Byteswap any remaining points: +++ number = quarterPoints * 4; +++ for (; number < num_points; number++) { +++ uint32_t outputVal = *inputPtr; +++ outputVal = (((outputVal >> 24) & 0xff) | ((outputVal >> 8) & 0x0000ff00) | +++ ((outputVal << 8) & 0x00ff0000) | ((outputVal << 24) & 0xff000000)); +++ *inputPtr = outputVal; +++ inputPtr++; +++ } ++ } ++ #endif /* LV_HAVE_SSE2 */ ++ ++ ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void volk_32u_byteswap_a_generic(uint32_t* intsToSwap, unsigned int num_points){ ++- uint32_t* inputPtr = intsToSwap; +++static inline void volk_32u_byteswap_a_generic(uint32_t* intsToSwap, +++ unsigned int num_points) +++{ +++ uint32_t* inputPtr = intsToSwap; ++ ++- unsigned int point; ++- for(point = 0; point < num_points; point++){ ++- uint32_t output = *inputPtr; ++- output = (((output >> 24) & 0xff) | ((output >> 8) & 0x0000ff00) | ((output << 8) & 0x00ff0000) | ((output << 24) & 0xff000000)); +++ unsigned int point; +++ for (point = 0; point < num_points; point++) { +++ uint32_t output = *inputPtr; +++ output = (((output >> 24) & 0xff) | ((output >> 8) & 0x0000ff00) | +++ ((output << 8) & 0x00ff0000) | ((output << 24) & 0xff000000)); ++ ++- *inputPtr = output; ++- inputPtr++; ++- } +++ *inputPtr = output; +++ inputPtr++; +++ } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++ ++- ++- ++ #endif /* INCLUDED_volk_32u_byteswap_a_H */ ++diff --git a/kernels/volk/volk_32u_byteswappuppet_32u.h b/kernels/volk/volk_32u_byteswappuppet_32u.h ++index c33a5fc..ca5ca17 100644 ++--- a/kernels/volk/volk_32u_byteswappuppet_32u.h +++++ b/kernels/volk/volk_32u_byteswappuppet_32u.h ++@@ -1,70 +1,84 @@ ++ #ifndef INCLUDED_volk_32u_byteswappuppet_32u_H ++ #define INCLUDED_volk_32u_byteswappuppet_32u_H ++ ++-#include ++ #include ++ #include +++#include ++ ++ #ifdef LV_HAVE_GENERIC ++-static inline void volk_32u_byteswappuppet_32u_generic(uint32_t*output, uint32_t* intsToSwap, unsigned int num_points){ +++static inline void volk_32u_byteswappuppet_32u_generic(uint32_t* output, +++ uint32_t* intsToSwap, +++ unsigned int num_points) +++{ ++ ++ volk_32u_byteswap_generic((uint32_t*)intsToSwap, num_points); ++ memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint32_t)); ++- ++ } ++ #endif ++ ++ #ifdef LV_HAVE_NEON ++-static inline void volk_32u_byteswappuppet_32u_neon(uint32_t*output, uint32_t* intsToSwap, unsigned int num_points){ +++static inline void volk_32u_byteswappuppet_32u_neon(uint32_t* output, +++ uint32_t* intsToSwap, +++ unsigned int num_points) +++{ ++ ++ volk_32u_byteswap_neon((uint32_t*)intsToSwap, num_points); ++ memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint32_t)); ++- ++ } ++ #endif ++ ++ #ifdef LV_HAVE_NEONV8 ++-static inline void volk_32u_byteswappuppet_32u_neonv8(uint32_t*output, uint32_t* intsToSwap, unsigned int num_points){ +++static inline void volk_32u_byteswappuppet_32u_neonv8(uint32_t* output, +++ uint32_t* intsToSwap, +++ unsigned int num_points) +++{ ++ ++ 
volk_32u_byteswap_neonv8((uint32_t*)intsToSwap, num_points); ++ memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint32_t)); ++- ++ } ++ #endif ++ ++ #ifdef LV_HAVE_SSE2 ++-static inline void volk_32u_byteswappuppet_32u_u_sse2(uint32_t *output, uint32_t* intsToSwap, unsigned int num_points){ +++static inline void volk_32u_byteswappuppet_32u_u_sse2(uint32_t* output, +++ uint32_t* intsToSwap, +++ unsigned int num_points) +++{ ++ ++ volk_32u_byteswap_u_sse2((uint32_t*)intsToSwap, num_points); ++ memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint32_t)); ++- ++ } ++ #endif ++ ++ #ifdef LV_HAVE_SSE2 ++-static inline void volk_32u_byteswappuppet_32u_a_sse2(uint32_t* output, uint32_t* intsToSwap, unsigned int num_points){ +++static inline void volk_32u_byteswappuppet_32u_a_sse2(uint32_t* output, +++ uint32_t* intsToSwap, +++ unsigned int num_points) +++{ ++ ++ volk_32u_byteswap_a_sse2((uint32_t*)intsToSwap, num_points); ++ memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint32_t)); ++- ++ } ++ #endif ++ ++ #ifdef LV_HAVE_AVX2 ++-static inline void volk_32u_byteswappuppet_32u_u_avx2(uint32_t* output, uint32_t* intsToSwap, unsigned int num_points){ +++static inline void volk_32u_byteswappuppet_32u_u_avx2(uint32_t* output, +++ uint32_t* intsToSwap, +++ unsigned int num_points) +++{ ++ ++ volk_32u_byteswap_u_avx2((uint32_t*)intsToSwap, num_points); ++ memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint32_t)); ++- ++ } ++ #endif ++ ++ #ifdef LV_HAVE_AVX2 ++-static inline void volk_32u_byteswappuppet_32u_a_avx2(uint32_t* output, uint32_t* intsToSwap, unsigned int num_points){ +++static inline void volk_32u_byteswappuppet_32u_a_avx2(uint32_t* output, +++ uint32_t* intsToSwap, +++ unsigned int num_points) +++{ ++ ++ volk_32u_byteswap_a_avx2((uint32_t*)intsToSwap, num_points); ++ memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint32_t)); ++- ++ } ++ #endif ++ ++diff --git a/kernels/volk/volk_32u_popcnt.h b/kernels/volk/volk_32u_popcnt.h ++index 7aa4d43..f6f0c10 100644 ++--- a/kernels/volk/volk_32u_popcnt.h +++++ b/kernels/volk/volk_32u_popcnt.h ++@@ -56,24 +56,23 @@ ++ #ifndef INCLUDED_VOLK_32u_POPCNT_A16_H ++ #define INCLUDED_VOLK_32u_POPCNT_A16_H ++ ++-#include ++ #include +++#include ++ ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void ++-volk_32u_popcnt_generic(uint32_t* ret, const uint32_t value) +++static inline void volk_32u_popcnt_generic(uint32_t* ret, const uint32_t value) ++ { ++- // This is faster than a lookup table ++- uint32_t retVal = value; +++ // This is faster than a lookup table +++ uint32_t retVal = value; ++ ++- retVal = (retVal & 0x55555555) + (retVal >> 1 & 0x55555555); ++- retVal = (retVal & 0x33333333) + (retVal >> 2 & 0x33333333); ++- retVal = (retVal + (retVal >> 4)) & 0x0F0F0F0F; ++- retVal = (retVal + (retVal >> 8)); ++- retVal = (retVal + (retVal >> 16)) & 0x0000003F; +++ retVal = (retVal & 0x55555555) + (retVal >> 1 & 0x55555555); +++ retVal = (retVal & 0x33333333) + (retVal >> 2 & 0x33333333); +++ retVal = (retVal + (retVal >> 4)) & 0x0F0F0F0F; +++ retVal = (retVal + (retVal >> 8)); +++ retVal = (retVal + (retVal >> 16)) & 0x0000003F; ++ ++- *ret = retVal; +++ *ret = retVal; ++ } ++ ++ #endif /*LV_HAVE_GENERIC*/ ++@@ -83,10 +82,9 @@ volk_32u_popcnt_generic(uint32_t* ret, const uint32_t value) ++ ++ #include ++ ++-static inline void ++-volk_32u_popcnt_a_sse4_2(uint32_t* ret, const uint32_t value) +++static inline void volk_32u_popcnt_a_sse4_2(uint32_t* ret, const uint32_t value) ++ { ++- *ret = 
_mm_popcnt_u32(value); +++ *ret = _mm_popcnt_u32(value); ++ } ++ ++ #endif /*LV_HAVE_SSE4_2*/ ++diff --git a/kernels/volk/volk_32u_popcntpuppet_32u.h b/kernels/volk/volk_32u_popcntpuppet_32u.h ++index d5edd35..c0389cc 100644 ++--- a/kernels/volk/volk_32u_popcntpuppet_32u.h +++++ b/kernels/volk/volk_32u_popcntpuppet_32u.h ++@@ -27,19 +27,25 @@ ++ #include ++ ++ #ifdef LV_HAVE_GENERIC ++-static inline void volk_32u_popcntpuppet_32u_generic(uint32_t* outVector, const uint32_t* inVector, unsigned int num_points){ +++static inline void volk_32u_popcntpuppet_32u_generic(uint32_t* outVector, +++ const uint32_t* inVector, +++ unsigned int num_points) +++{ ++ unsigned int ii; ++- for(ii=0; ii < num_points; ++ii) { ++- volk_32u_popcnt_generic(outVector+ii, *(inVector+ii) ); +++ for (ii = 0; ii < num_points; ++ii) { +++ volk_32u_popcnt_generic(outVector + ii, *(inVector + ii)); ++ } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++ #ifdef LV_HAVE_SSE4_2 ++-static inline void volk_32u_popcntpuppet_32u_a_sse4_2(uint32_t* outVector, const uint32_t* inVector, unsigned int num_points){ +++static inline void volk_32u_popcntpuppet_32u_a_sse4_2(uint32_t* outVector, +++ const uint32_t* inVector, +++ unsigned int num_points) +++{ ++ unsigned int ii; ++- for(ii=0; ii < num_points; ++ii) { ++- volk_32u_popcnt_a_sse4_2(outVector+ii, *(inVector+ii) ); +++ for (ii = 0; ii < num_points; ++ii) { +++ volk_32u_popcnt_a_sse4_2(outVector + ii, *(inVector + ii)); ++ } ++ } ++ #endif /* LV_HAVE_SSE4_2 */ ++diff --git a/kernels/volk/volk_32u_reverse_32u.h b/kernels/volk/volk_32u_reverse_32u.h ++index b670b13..aff0a9e 100644 ++--- a/kernels/volk/volk_32u_reverse_32u.h +++++ b/kernels/volk/volk_32u_reverse_32u.h ++@@ -24,7 +24,8 @@ ++ * \b bit reversal of the input 32 bit word ++ ++ * Dispatcher Prototype ++- * \code volk_32u_reverse_32u(uint32_t *outputVector, uint32_t *inputVector; unsigned int num_points); +++ * \code volk_32u_reverse_32u(uint32_t *outputVector, uint32_t *inputVector; unsigned int +++ num_points); ++ * \endcode ++ * ++ * \b Inputs ++@@ -32,338 +33,344 @@ ++ * \li num_points The number of data points. 
++ * ++ * \b Outputs ++- * \li outputVector: The vector where the results will be stored, which is the bit-reversed input +++ * \li outputVector: The vector where the results will be stored, which is the +++ bit-reversed input ++ * ++ * \endcode ++ */ ++ #ifndef INCLUDED_VOLK_32u_REVERSE_32u_U_H ++ struct dword_split { ++- int b00: 1; ++- int b01: 1; ++- int b02: 1; ++- int b03: 1; ++- int b04: 1; ++- int b05: 1; ++- int b06: 1; ++- int b07: 1; ++- int b08: 1; ++- int b09: 1; ++- int b10: 1; ++- int b11: 1; ++- int b12: 1; ++- int b13: 1; ++- int b14: 1; ++- int b15: 1; ++- int b16: 1; ++- int b17: 1; ++- int b18: 1; ++- int b19: 1; ++- int b20: 1; ++- int b21: 1; ++- int b22: 1; ++- int b23: 1; ++- int b24: 1; ++- int b25: 1; ++- int b26: 1; ++- int b27: 1; ++- int b28: 1; ++- int b29: 1; ++- int b30: 1; ++- int b31: 1; +++ int b00 : 1; +++ int b01 : 1; +++ int b02 : 1; +++ int b03 : 1; +++ int b04 : 1; +++ int b05 : 1; +++ int b06 : 1; +++ int b07 : 1; +++ int b08 : 1; +++ int b09 : 1; +++ int b10 : 1; +++ int b11 : 1; +++ int b12 : 1; +++ int b13 : 1; +++ int b14 : 1; +++ int b15 : 1; +++ int b16 : 1; +++ int b17 : 1; +++ int b18 : 1; +++ int b19 : 1; +++ int b20 : 1; +++ int b21 : 1; +++ int b22 : 1; +++ int b23 : 1; +++ int b24 : 1; +++ int b25 : 1; +++ int b26 : 1; +++ int b27 : 1; +++ int b28 : 1; +++ int b29 : 1; +++ int b30 : 1; +++ int b31 : 1; ++ }; ++ struct char_split { ++- uint8_t b00: 1; ++- uint8_t b01: 1; ++- uint8_t b02: 1; ++- uint8_t b03: 1; ++- uint8_t b04: 1; ++- uint8_t b05: 1; ++- uint8_t b06: 1; ++- uint8_t b07: 1; +++ uint8_t b00 : 1; +++ uint8_t b01 : 1; +++ uint8_t b02 : 1; +++ uint8_t b03 : 1; +++ uint8_t b04 : 1; +++ uint8_t b05 : 1; +++ uint8_t b06 : 1; +++ uint8_t b07 : 1; ++ }; ++ ++-//Idea from "Bit Twiddling Hacks", which dedicates this method to public domain ++-//http://graphics.stanford.edu/~seander/bithacks.html#BitReverseTable +++// Idea from "Bit Twiddling Hacks", which dedicates this method to public domain +++// http://graphics.stanford.edu/~seander/bithacks.html#BitReverseTable ++ static const unsigned char BitReverseTable256[] = { ++- 0x00, 0x80, 0x40, 0xC0, 0x20, 0xA0, 0x60, 0xE0, 0x10, 0x90, 0x50, 0xD0, 0x30, ++- 0xB0, 0x70, 0xF0, 0x08, 0x88, 0x48, 0xC8, 0x28, 0xA8, 0x68, 0xE8, 0x18, 0x98, ++- 0x58, 0xD8, 0x38, 0xB8, 0x78, 0xF8, 0x04, 0x84, 0x44, 0xC4, 0x24, 0xA4, 0x64, ++- 0xE4, 0x14, 0x94, 0x54, 0xD4, 0x34, 0xB4, 0x74, 0xF4, 0x0C, 0x8C, 0x4C, 0xCC, ++- 0x2C, 0xAC, 0x6C, 0xEC, 0x1C, 0x9C, 0x5C, 0xDC, 0x3C, 0xBC, 0x7C, 0xFC, 0x02, ++- 0x82, 0x42, 0xC2, 0x22, 0xA2, 0x62, 0xE2, 0x12, 0x92, 0x52, 0xD2, 0x32, 0xB2, ++- 0x72, 0xF2, 0x0A, 0x8A, 0x4A, 0xCA, 0x2A, 0xAA, 0x6A, 0xEA, 0x1A, 0x9A, 0x5A, ++- 0xDA, 0x3A, 0xBA, 0x7A, 0xFA, 0x06, 0x86, 0x46, 0xC6, 0x26, 0xA6, 0x66, 0xE6, ++- 0x16, 0x96, 0x56, 0xD6, 0x36, 0xB6, 0x76, 0xF6, 0x0E, 0x8E, 0x4E, 0xCE, 0x2E, ++- 0xAE, 0x6E, 0xEE, 0x1E, 0x9E, 0x5E, 0xDE, 0x3E, 0xBE, 0x7E, 0xFE, 0x01, 0x81, ++- 0x41, 0xC1, 0x21, 0xA1, 0x61, 0xE1, 0x11, 0x91, 0x51, 0xD1, 0x31, 0xB1, 0x71, ++- 0xF1, 0x09, 0x89, 0x49, 0xC9, 0x29, 0xA9, 0x69, 0xE9, 0x19, 0x99, 0x59, 0xD9, ++- 0x39, 0xB9, 0x79, 0xF9, 0x05, 0x85, 0x45, 0xC5, 0x25, 0xA5, 0x65, 0xE5, 0x15, ++- 0x95, 0x55, 0xD5, 0x35, 0xB5, 0x75, 0xF5, 0x0D, 0x8D, 0x4D, 0xCD, 0x2D, 0xAD, ++- 0x6D, 0xED, 0x1D, 0x9D, 0x5D, 0xDD, 0x3D, 0xBD, 0x7D, 0xFD, 0x03, 0x83, 0x43, ++- 0xC3, 0x23, 0xA3, 0x63, 0xE3, 0x13, 0x93, 0x53, 0xD3, 0x33, 0xB3, 0x73, 0xF3, ++- 0x0B, 0x8B, 0x4B, 0xCB, 0x2B, 0xAB, 0x6B, 0xEB, 0x1B, 0x9B, 0x5B, 0xDB, 0x3B, ++- 0xBB, 0x7B, 0xFB, 0x07, 0x87, 0x47, 0xC7, 
0x27, 0xA7, 0x67, 0xE7, 0x17, 0x97, ++- 0x57, 0xD7, 0x37, 0xB7, 0x77, 0xF7, 0x0F, 0x8F, 0x4F, 0xCF, 0x2F, 0xAF, 0x6F, ++- 0xEF, 0x1F, 0x9F, 0x5F, 0xDF, 0x3F, 0xBF, 0x7F, 0xFF +++ 0x00, 0x80, 0x40, 0xC0, 0x20, 0xA0, 0x60, 0xE0, 0x10, 0x90, 0x50, 0xD0, 0x30, 0xB0, +++ 0x70, 0xF0, 0x08, 0x88, 0x48, 0xC8, 0x28, 0xA8, 0x68, 0xE8, 0x18, 0x98, 0x58, 0xD8, +++ 0x38, 0xB8, 0x78, 0xF8, 0x04, 0x84, 0x44, 0xC4, 0x24, 0xA4, 0x64, 0xE4, 0x14, 0x94, +++ 0x54, 0xD4, 0x34, 0xB4, 0x74, 0xF4, 0x0C, 0x8C, 0x4C, 0xCC, 0x2C, 0xAC, 0x6C, 0xEC, +++ 0x1C, 0x9C, 0x5C, 0xDC, 0x3C, 0xBC, 0x7C, 0xFC, 0x02, 0x82, 0x42, 0xC2, 0x22, 0xA2, +++ 0x62, 0xE2, 0x12, 0x92, 0x52, 0xD2, 0x32, 0xB2, 0x72, 0xF2, 0x0A, 0x8A, 0x4A, 0xCA, +++ 0x2A, 0xAA, 0x6A, 0xEA, 0x1A, 0x9A, 0x5A, 0xDA, 0x3A, 0xBA, 0x7A, 0xFA, 0x06, 0x86, +++ 0x46, 0xC6, 0x26, 0xA6, 0x66, 0xE6, 0x16, 0x96, 0x56, 0xD6, 0x36, 0xB6, 0x76, 0xF6, +++ 0x0E, 0x8E, 0x4E, 0xCE, 0x2E, 0xAE, 0x6E, 0xEE, 0x1E, 0x9E, 0x5E, 0xDE, 0x3E, 0xBE, +++ 0x7E, 0xFE, 0x01, 0x81, 0x41, 0xC1, 0x21, 0xA1, 0x61, 0xE1, 0x11, 0x91, 0x51, 0xD1, +++ 0x31, 0xB1, 0x71, 0xF1, 0x09, 0x89, 0x49, 0xC9, 0x29, 0xA9, 0x69, 0xE9, 0x19, 0x99, +++ 0x59, 0xD9, 0x39, 0xB9, 0x79, 0xF9, 0x05, 0x85, 0x45, 0xC5, 0x25, 0xA5, 0x65, 0xE5, +++ 0x15, 0x95, 0x55, 0xD5, 0x35, 0xB5, 0x75, 0xF5, 0x0D, 0x8D, 0x4D, 0xCD, 0x2D, 0xAD, +++ 0x6D, 0xED, 0x1D, 0x9D, 0x5D, 0xDD, 0x3D, 0xBD, 0x7D, 0xFD, 0x03, 0x83, 0x43, 0xC3, +++ 0x23, 0xA3, 0x63, 0xE3, 0x13, 0x93, 0x53, 0xD3, 0x33, 0xB3, 0x73, 0xF3, 0x0B, 0x8B, +++ 0x4B, 0xCB, 0x2B, 0xAB, 0x6B, 0xEB, 0x1B, 0x9B, 0x5B, 0xDB, 0x3B, 0xBB, 0x7B, 0xFB, +++ 0x07, 0x87, 0x47, 0xC7, 0x27, 0xA7, 0x67, 0xE7, 0x17, 0x97, 0x57, 0xD7, 0x37, 0xB7, +++ 0x77, 0xF7, 0x0F, 0x8F, 0x4F, 0xCF, 0x2F, 0xAF, 0x6F, 0xEF, 0x1F, 0x9F, 0x5F, 0xDF, +++ 0x3F, 0xBF, 0x7F, 0xFF ++ }; ++ #ifdef LV_HAVE_GENERIC ++-static inline void volk_32u_reverse_32u_dword_shuffle(uint32_t* out, const uint32_t* in, ++- unsigned int num_points) +++static inline void volk_32u_reverse_32u_dword_shuffle(uint32_t* out, +++ const uint32_t* in, +++ unsigned int num_points) ++ { ++- const struct dword_split *in_ptr = (const struct dword_split*)in; ++- struct dword_split * out_ptr = (struct dword_split*)out; ++- unsigned int number = 0; ++- for(; number < num_points; ++number){ ++- out_ptr->b00 = in_ptr->b31; ++- out_ptr->b01 = in_ptr->b30; ++- out_ptr->b02 = in_ptr->b29; ++- out_ptr->b03 = in_ptr->b28; ++- out_ptr->b04 = in_ptr->b27; ++- out_ptr->b05 = in_ptr->b26; ++- out_ptr->b06 = in_ptr->b25; ++- out_ptr->b07 = in_ptr->b24; ++- out_ptr->b08 = in_ptr->b23; ++- out_ptr->b09 = in_ptr->b22; ++- out_ptr->b10 = in_ptr->b21; ++- out_ptr->b11 = in_ptr->b20; ++- out_ptr->b12 = in_ptr->b19; ++- out_ptr->b13 = in_ptr->b18; ++- out_ptr->b14 = in_ptr->b17; ++- out_ptr->b15 = in_ptr->b16; ++- out_ptr->b16 = in_ptr->b15; ++- out_ptr->b17 = in_ptr->b14; ++- out_ptr->b18 = in_ptr->b13; ++- out_ptr->b19 = in_ptr->b12; ++- out_ptr->b20 = in_ptr->b11; ++- out_ptr->b21 = in_ptr->b10; ++- out_ptr->b22 = in_ptr->b09; ++- out_ptr->b23 = in_ptr->b08; ++- out_ptr->b24 = in_ptr->b07; ++- out_ptr->b25 = in_ptr->b06; ++- out_ptr->b26 = in_ptr->b05; ++- out_ptr->b27 = in_ptr->b04; ++- out_ptr->b28 = in_ptr->b03; ++- out_ptr->b29 = in_ptr->b02; ++- out_ptr->b30 = in_ptr->b01; ++- out_ptr->b31 = in_ptr->b00; ++- ++in_ptr; ++- ++out_ptr; ++- } +++ const struct dword_split* in_ptr = (const struct dword_split*)in; +++ struct dword_split* out_ptr = (struct dword_split*)out; +++ unsigned int number = 0; +++ for (; number < num_points; ++number) { +++ out_ptr->b00 = 
in_ptr->b31; +++ out_ptr->b01 = in_ptr->b30; +++ out_ptr->b02 = in_ptr->b29; +++ out_ptr->b03 = in_ptr->b28; +++ out_ptr->b04 = in_ptr->b27; +++ out_ptr->b05 = in_ptr->b26; +++ out_ptr->b06 = in_ptr->b25; +++ out_ptr->b07 = in_ptr->b24; +++ out_ptr->b08 = in_ptr->b23; +++ out_ptr->b09 = in_ptr->b22; +++ out_ptr->b10 = in_ptr->b21; +++ out_ptr->b11 = in_ptr->b20; +++ out_ptr->b12 = in_ptr->b19; +++ out_ptr->b13 = in_ptr->b18; +++ out_ptr->b14 = in_ptr->b17; +++ out_ptr->b15 = in_ptr->b16; +++ out_ptr->b16 = in_ptr->b15; +++ out_ptr->b17 = in_ptr->b14; +++ out_ptr->b18 = in_ptr->b13; +++ out_ptr->b19 = in_ptr->b12; +++ out_ptr->b20 = in_ptr->b11; +++ out_ptr->b21 = in_ptr->b10; +++ out_ptr->b22 = in_ptr->b09; +++ out_ptr->b23 = in_ptr->b08; +++ out_ptr->b24 = in_ptr->b07; +++ out_ptr->b25 = in_ptr->b06; +++ out_ptr->b26 = in_ptr->b05; +++ out_ptr->b27 = in_ptr->b04; +++ out_ptr->b28 = in_ptr->b03; +++ out_ptr->b29 = in_ptr->b02; +++ out_ptr->b30 = in_ptr->b01; +++ out_ptr->b31 = in_ptr->b00; +++ ++in_ptr; +++ ++out_ptr; +++ } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++ #ifdef LV_HAVE_GENERIC ++-static inline void volk_32u_reverse_32u_byte_shuffle(uint32_t* out, const uint32_t* in, ++- unsigned int num_points) +++static inline void volk_32u_reverse_32u_byte_shuffle(uint32_t* out, +++ const uint32_t* in, +++ unsigned int num_points) ++ { ++- const uint32_t *in_ptr = in; ++- uint32_t *out_ptr = out; ++- unsigned int number = 0; ++- for(; number < num_points; ++number){ ++- const struct char_split *in8 = (const struct char_split*)in_ptr; ++- struct char_split *out8 = (struct char_split*)out_ptr; +++ const uint32_t* in_ptr = in; +++ uint32_t* out_ptr = out; +++ unsigned int number = 0; +++ for (; number < num_points; ++number) { +++ const struct char_split* in8 = (const struct char_split*)in_ptr; +++ struct char_split* out8 = (struct char_split*)out_ptr; ++ ++- out8[3].b00 = in8[0].b07; ++- out8[3].b01 = in8[0].b06; ++- out8[3].b02 = in8[0].b05; ++- out8[3].b03 = in8[0].b04; ++- out8[3].b04 = in8[0].b03; ++- out8[3].b05 = in8[0].b02; ++- out8[3].b06 = in8[0].b01; ++- out8[3].b07 = in8[0].b00; +++ out8[3].b00 = in8[0].b07; +++ out8[3].b01 = in8[0].b06; +++ out8[3].b02 = in8[0].b05; +++ out8[3].b03 = in8[0].b04; +++ out8[3].b04 = in8[0].b03; +++ out8[3].b05 = in8[0].b02; +++ out8[3].b06 = in8[0].b01; +++ out8[3].b07 = in8[0].b00; ++ ++- out8[2].b00 = in8[1].b07; ++- out8[2].b01 = in8[1].b06; ++- out8[2].b02 = in8[1].b05; ++- out8[2].b03 = in8[1].b04; ++- out8[2].b04 = in8[1].b03; ++- out8[2].b05 = in8[1].b02; ++- out8[2].b06 = in8[1].b01; ++- out8[2].b07 = in8[1].b00; +++ out8[2].b00 = in8[1].b07; +++ out8[2].b01 = in8[1].b06; +++ out8[2].b02 = in8[1].b05; +++ out8[2].b03 = in8[1].b04; +++ out8[2].b04 = in8[1].b03; +++ out8[2].b05 = in8[1].b02; +++ out8[2].b06 = in8[1].b01; +++ out8[2].b07 = in8[1].b00; ++ ++- out8[1].b00 = in8[2].b07; ++- out8[1].b01 = in8[2].b06; ++- out8[1].b02 = in8[2].b05; ++- out8[1].b03 = in8[2].b04; ++- out8[1].b04 = in8[2].b03; ++- out8[1].b05 = in8[2].b02; ++- out8[1].b06 = in8[2].b01; ++- out8[1].b07 = in8[2].b00; +++ out8[1].b00 = in8[2].b07; +++ out8[1].b01 = in8[2].b06; +++ out8[1].b02 = in8[2].b05; +++ out8[1].b03 = in8[2].b04; +++ out8[1].b04 = in8[2].b03; +++ out8[1].b05 = in8[2].b02; +++ out8[1].b06 = in8[2].b01; +++ out8[1].b07 = in8[2].b00; ++ ++- out8[0].b00 = in8[3].b07; ++- out8[0].b01 = in8[3].b06; ++- out8[0].b02 = in8[3].b05; ++- out8[0].b03 = in8[3].b04; ++- out8[0].b04 = in8[3].b03; ++- out8[0].b05 = in8[3].b02; ++- out8[0].b06 = in8[3].b01; ++- 
out8[0].b07 = in8[3].b00; ++- ++in_ptr; ++- ++out_ptr; ++- } +++ out8[0].b00 = in8[3].b07; +++ out8[0].b01 = in8[3].b06; +++ out8[0].b02 = in8[3].b05; +++ out8[0].b03 = in8[3].b04; +++ out8[0].b04 = in8[3].b03; +++ out8[0].b05 = in8[3].b02; +++ out8[0].b06 = in8[3].b01; +++ out8[0].b07 = in8[3].b00; +++ ++in_ptr; +++ ++out_ptr; +++ } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++-//Idea from "Bit Twiddling Hacks", which dedicates this method to public domain ++-//http://graphics.stanford.edu/~seander/bithacks.html#BitReverseTable +++// Idea from "Bit Twiddling Hacks", which dedicates this method to public domain +++// http://graphics.stanford.edu/~seander/bithacks.html#BitReverseTable ++ #ifdef LV_HAVE_GENERIC ++-static inline void volk_32u_reverse_32u_lut(uint32_t* out, const uint32_t* in, ++- unsigned int num_points) +++static inline void +++volk_32u_reverse_32u_lut(uint32_t* out, const uint32_t* in, unsigned int num_points) ++ { ++- const uint32_t *in_ptr = in; ++- uint32_t *out_ptr = out; ++- unsigned int number = 0; ++- for(; number < num_points; ++number){ ++- *out_ptr = ++- (BitReverseTable256[*in_ptr & 0xff] << 24) | ++- (BitReverseTable256[(*in_ptr >> 8) & 0xff] << 16) | ++- (BitReverseTable256[(*in_ptr >> 16) & 0xff] << 8) | ++- (BitReverseTable256[(*in_ptr >> 24) & 0xff]); ++- ++in_ptr; ++- ++out_ptr; ++- } +++ const uint32_t* in_ptr = in; +++ uint32_t* out_ptr = out; +++ unsigned int number = 0; +++ for (; number < num_points; ++number) { +++ *out_ptr = (BitReverseTable256[*in_ptr & 0xff] << 24) | +++ (BitReverseTable256[(*in_ptr >> 8) & 0xff] << 16) | +++ (BitReverseTable256[(*in_ptr >> 16) & 0xff] << 8) | +++ (BitReverseTable256[(*in_ptr >> 24) & 0xff]); +++ ++in_ptr; +++ ++out_ptr; +++ } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++-//Single-Byte code from "Bit Twiddling Hacks", which dedicates this method to public domain ++-//http://graphics.stanford.edu/~seander/bithacks.html#ReverseByteWith64Bits +++// Single-Byte code from "Bit Twiddling Hacks", which dedicates this method to public +++// domain http://graphics.stanford.edu/~seander/bithacks.html#ReverseByteWith64Bits ++ #ifdef LV_HAVE_GENERIC ++-static inline void volk_32u_reverse_32u_2001magic(uint32_t* out, const uint32_t* in, ++- unsigned int num_points) +++static inline void +++volk_32u_reverse_32u_2001magic(uint32_t* out, const uint32_t* in, unsigned int num_points) ++ { ++- const uint32_t *in_ptr = in; ++- uint32_t *out_ptr = out; ++- const uint8_t *in8; ++- uint8_t *out8; ++- unsigned int number = 0; ++- for(; number < num_points; ++number){ ++- in8 = (const uint8_t*)in_ptr; ++- out8 = (uint8_t*)out_ptr; ++- out8[3] = ((in8[0] * 0x80200802ULL) & 0x0884422110ULL) * 0x0101010101ULL >> 32; ++- out8[2] = ((in8[1] * 0x80200802ULL) & 0x0884422110ULL) * 0x0101010101ULL >> 32; ++- out8[1] = ((in8[2] * 0x80200802ULL) & 0x0884422110ULL) * 0x0101010101ULL >> 32; ++- out8[0] = ((in8[3] * 0x80200802ULL) & 0x0884422110ULL) * 0x0101010101ULL >> 32; ++- ++in_ptr; ++- ++out_ptr; ++- } +++ const uint32_t* in_ptr = in; +++ uint32_t* out_ptr = out; +++ const uint8_t* in8; +++ uint8_t* out8; +++ unsigned int number = 0; +++ for (; number < num_points; ++number) { +++ in8 = (const uint8_t*)in_ptr; +++ out8 = (uint8_t*)out_ptr; +++ out8[3] = ((in8[0] * 0x80200802ULL) & 0x0884422110ULL) * 0x0101010101ULL >> 32; +++ out8[2] = ((in8[1] * 0x80200802ULL) & 0x0884422110ULL) * 0x0101010101ULL >> 32; +++ out8[1] = ((in8[2] * 0x80200802ULL) & 0x0884422110ULL) * 0x0101010101ULL >> 32; +++ out8[0] = ((in8[3] * 0x80200802ULL) & 0x0884422110ULL) * 
0x0101010101ULL >> 32; +++ ++in_ptr; +++ ++out_ptr; +++ } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++ #ifdef LV_HAVE_GENERIC ++ // Current gr-pager implementation ++-static inline void volk_32u_reverse_32u_1972magic(uint32_t* out, const uint32_t* in, ++- unsigned int num_points) +++static inline void +++volk_32u_reverse_32u_1972magic(uint32_t* out, const uint32_t* in, unsigned int num_points) ++ { ++- const uint32_t *in_ptr = in; ++- uint32_t *out_ptr = out; ++- const uint8_t *in8; ++- uint8_t *out8; ++- unsigned int number = 0; ++- for(; number < num_points; ++number){ ++- in8 = (const uint8_t*)in_ptr; ++- out8 = (uint8_t*)out_ptr; ++- out8[3] = (in8[0] * 0x0202020202ULL & 0x010884422010ULL) % 1023; ++- out8[2] = (in8[1] * 0x0202020202ULL & 0x010884422010ULL) % 1023; ++- out8[1] = (in8[2] * 0x0202020202ULL & 0x010884422010ULL) % 1023; ++- out8[0] = (in8[3] * 0x0202020202ULL & 0x010884422010ULL) % 1023; ++- ++in_ptr; ++- ++out_ptr; ++- } +++ const uint32_t* in_ptr = in; +++ uint32_t* out_ptr = out; +++ const uint8_t* in8; +++ uint8_t* out8; +++ unsigned int number = 0; +++ for (; number < num_points; ++number) { +++ in8 = (const uint8_t*)in_ptr; +++ out8 = (uint8_t*)out_ptr; +++ out8[3] = (in8[0] * 0x0202020202ULL & 0x010884422010ULL) % 1023; +++ out8[2] = (in8[1] * 0x0202020202ULL & 0x010884422010ULL) % 1023; +++ out8[1] = (in8[2] * 0x0202020202ULL & 0x010884422010ULL) % 1023; +++ out8[0] = (in8[3] * 0x0202020202ULL & 0x010884422010ULL) % 1023; +++ ++in_ptr; +++ ++out_ptr; +++ } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++-//After lengthy thought and quite a bit of whiteboarding: +++// After lengthy thought and quite a bit of whiteboarding: ++ #ifdef LV_HAVE_GENERIC ++-static inline void volk_32u_reverse_32u_bintree_permute_top_down(uint32_t* out, const uint32_t* in, ++- unsigned int num_points) +++static inline void volk_32u_reverse_32u_bintree_permute_top_down(uint32_t* out, +++ const uint32_t* in, +++ unsigned int num_points) ++ { ++- const uint32_t *in_ptr = in; ++- uint32_t *out_ptr = out; ++- unsigned int number = 0; ++- for(; number < num_points; ++number){ ++- uint32_t tmp = *in_ptr; ++- /* permute uint16: ++- The idea is to simply shift the lower 16 bit up, and the upper 16 bit down. ++- */ ++- tmp = ( tmp << 16 ) | ( tmp >> 16 ); ++- /* permute bytes: ++- shift up by 1 B first, then only consider even bytes, and OR with the unshifted even bytes ++- */ ++- tmp = ((tmp & (0xFF | 0xFF << 16)) << 8) | ((tmp >> 8) & (0xFF | 0xFF << 16)); ++- /* permute 4bit tuples: ++- Same idea, but the "consideration" mask expression becomes unwieldy ++- */ ++- tmp = ((tmp & (0xF | 0xF << 8 | 0xF << 16 | 0xF << 24)) << 4) | ((tmp >> 4) & (0xF | 0xF << 8 | 0xF << 16 | 0xF << 24)); ++- /* permute 2bit tuples: ++- Here, we collapsed the "consideration" mask to a simple hexmask: 0b0011 = ++- 3; we need those every 4b, which coincides with a hex digit! ++- */ ++- tmp = ((tmp & (0x33333333)) << 2) | ((tmp >> 2) & (0x33333333)); ++- /* permute odd/even: ++- 0x01 = 0x1; we need these every 2b, which works out: 0x01 | (0x01 << 2) = 0x05! ++- */ ++- tmp = ((tmp & (0x55555555)) << 1) | ((tmp >> 1) & (0x55555555)); +++ const uint32_t* in_ptr = in; +++ uint32_t* out_ptr = out; +++ unsigned int number = 0; +++ for (; number < num_points; ++number) { +++ uint32_t tmp = *in_ptr; +++ /* permute uint16: +++ The idea is to simply shift the lower 16 bit up, and the upper 16 bit down. 
+++ */ +++ tmp = (tmp << 16) | (tmp >> 16); +++ /* permute bytes: +++ shift up by 1 B first, then only consider even bytes, and OR with the unshifted +++ even bytes +++ */ +++ tmp = ((tmp & (0xFF | 0xFF << 16)) << 8) | ((tmp >> 8) & (0xFF | 0xFF << 16)); +++ /* permute 4bit tuples: +++ Same idea, but the "consideration" mask expression becomes unwieldy +++ */ +++ tmp = ((tmp & (0xF | 0xF << 8 | 0xF << 16 | 0xF << 24)) << 4) | +++ ((tmp >> 4) & (0xF | 0xF << 8 | 0xF << 16 | 0xF << 24)); +++ /* permute 2bit tuples: +++ Here, we collapsed the "consideration" mask to a simple hexmask: 0b0011 = +++ 3; we need those every 4b, which coincides with a hex digit! +++ */ +++ tmp = ((tmp & (0x33333333)) << 2) | ((tmp >> 2) & (0x33333333)); +++ /* permute odd/even: +++ 0x01 = 0x1; we need these every 2b, which works out: 0x01 | (0x01 << 2) = +++ 0x05! +++ */ +++ tmp = ((tmp & (0x55555555)) << 1) | ((tmp >> 1) & (0x55555555)); ++ ++- *out_ptr = tmp; ++- ++in_ptr; ++- ++out_ptr; ++- } +++ *out_ptr = tmp; +++ ++in_ptr; +++ ++out_ptr; +++ } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ #ifdef LV_HAVE_GENERIC ++-static inline void volk_32u_reverse_32u_bintree_permute_bottom_up(uint32_t* out, const uint32_t* in, ++- unsigned int num_points) +++static inline void volk_32u_reverse_32u_bintree_permute_bottom_up(uint32_t* out, +++ const uint32_t* in, +++ unsigned int num_points) ++ { ++- //same stuff as top_down, inverted order (permutation matrices don't care, you know!) ++- const uint32_t *in_ptr = in; ++- uint32_t *out_ptr = out; ++- unsigned int number = 0; ++- for(; number < num_points; ++number){ ++- uint32_t tmp = *in_ptr; ++- tmp = ((tmp & (0x55555555)) << 1) | ((tmp >> 1) & (0x55555555)); ++- tmp = ((tmp & (0x33333333)) << 2) | ((tmp >> 2) & (0x33333333)); ++- tmp = ((tmp & (0xF | 0xF << 8 | 0xF << 16 | 0xF << 24)) << 4) | ((tmp >> 4) & (0xF | 0xF << 8 | 0xF << 16 | 0xF << 24)); ++- tmp = ((tmp & (0xFF | 0xFF << 16)) << 8) | ((tmp >> 8) & (0xFF | 0xFF << 16)); ++- tmp = ( tmp << 16 ) | ( tmp >> 16 ); +++ // same stuff as top_down, inverted order (permutation matrices don't care, you know!) 
+++ const uint32_t* in_ptr = in; +++ uint32_t* out_ptr = out; +++ unsigned int number = 0; +++ for (; number < num_points; ++number) { +++ uint32_t tmp = *in_ptr; +++ tmp = ((tmp & (0x55555555)) << 1) | ((tmp >> 1) & (0x55555555)); +++ tmp = ((tmp & (0x33333333)) << 2) | ((tmp >> 2) & (0x33333333)); +++ tmp = ((tmp & (0xF | 0xF << 8 | 0xF << 16 | 0xF << 24)) << 4) | +++ ((tmp >> 4) & (0xF | 0xF << 8 | 0xF << 16 | 0xF << 24)); +++ tmp = ((tmp & (0xFF | 0xFF << 16)) << 8) | ((tmp >> 8) & (0xFF | 0xFF << 16)); +++ tmp = (tmp << 16) | (tmp >> 16); ++ ++- *out_ptr = tmp; ++- ++in_ptr; ++- ++out_ptr; ++- } +++ *out_ptr = tmp; +++ ++in_ptr; +++ ++out_ptr; +++ } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++ #ifdef LV_HAVE_NEONV8 ++ #include ++ ++-static inline void volk_32u_reverse_32u_neonv8(uint32_t* out, const uint32_t* in, ++- unsigned int num_points) ++-{ ++- const uint32_t *in_ptr = in; ++- uint32_t *out_ptr = out; +++static inline void +++volk_32u_reverse_32u_neonv8(uint32_t* out, const uint32_t* in, unsigned int num_points) +++{ +++ const uint32_t* in_ptr = in; +++ uint32_t* out_ptr = out; ++ ++- const uint8x16_t idx = { 3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12 }; +++ const uint8x16_t idx = { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 }; ++ ++- const unsigned int quarterPoints = num_points/4; +++ const unsigned int quarterPoints = num_points / 4; ++ unsigned int number = 0; ++- for(; number < quarterPoints; ++number){ ++- __VOLK_PREFETCH(in_ptr+4); ++- uint32x4_t x = vld1q_u32(in_ptr); ++- uint32x4_t z = vreinterpretq_u32_u8(vqtbl1q_u8(vrbitq_u8(vreinterpretq_u8_u32 (x)), ++- idx)); ++- vst1q_u32 (out_ptr, z); ++- in_ptr += 4; ++- out_ptr += 4; +++ for (; number < quarterPoints; ++number) { +++ __VOLK_PREFETCH(in_ptr + 4); +++ uint32x4_t x = vld1q_u32(in_ptr); +++ uint32x4_t z = +++ vreinterpretq_u32_u8(vqtbl1q_u8(vrbitq_u8(vreinterpretq_u8_u32(x)), idx)); +++ vst1q_u32(out_ptr, z); +++ in_ptr += 4; +++ out_ptr += 4; ++ } ++- number = quarterPoints*4; ++- for(; number < num_points; ++number){ ++- *out_ptr = ++- (BitReverseTable256[*in_ptr & 0xff] << 24) | ++- (BitReverseTable256[(*in_ptr >> 8) & 0xff] << 16) | ++- (BitReverseTable256[(*in_ptr >> 16) & 0xff] << 8) | ++- (BitReverseTable256[(*in_ptr >> 24) & 0xff]); ++- ++in_ptr; ++- ++out_ptr; +++ number = quarterPoints * 4; +++ for (; number < num_points; ++number) { +++ *out_ptr = (BitReverseTable256[*in_ptr & 0xff] << 24) | +++ (BitReverseTable256[(*in_ptr >> 8) & 0xff] << 16) | +++ (BitReverseTable256[(*in_ptr >> 16) & 0xff] << 8) | +++ (BitReverseTable256[(*in_ptr >> 24) & 0xff]); +++ ++in_ptr; +++ ++out_ptr; ++ } ++ } ++ ++@@ -371,29 +378,35 @@ static inline void volk_32u_reverse_32u_neonv8(uint32_t* out, const uint32_t* in ++ #ifdef LV_HAVE_NEON ++ #include ++ ++-#define DO_RBIT \ ++- __VOLK_ASM("rbit %[result], %[value]" \ ++- : [result]"=r" (*out_ptr) \ ++- : [value] "r" (*in_ptr) \ ++- : ); \ ++- in_ptr++; \ ++- out_ptr++; +++#define DO_RBIT \ +++ __VOLK_ASM("rbit %[result], %[value]" \ +++ : [result] "=r"(*out_ptr) \ +++ : [value] "r"(*in_ptr) \ +++ :); \ +++ in_ptr++; \ +++ out_ptr++; ++ ++-static inline void volk_32u_reverse_32u_arm(uint32_t* out, const uint32_t* in, ++- unsigned int num_points) +++static inline void +++volk_32u_reverse_32u_arm(uint32_t* out, const uint32_t* in, unsigned int num_points) ++ { ++ ++- const uint32_t *in_ptr = in; ++- uint32_t *out_ptr = out; ++- const unsigned int eighthPoints = num_points/8; +++ const uint32_t* in_ptr = in; +++ uint32_t* out_ptr = out; +++ const unsigned int eighthPoints = 
num_points / 8; ++ unsigned int number = 0; ++- for(; number < eighthPoints; ++number){ ++- __VOLK_PREFETCH(in_ptr+8); ++- DO_RBIT; DO_RBIT; DO_RBIT; DO_RBIT; ++- DO_RBIT; DO_RBIT; DO_RBIT; DO_RBIT; +++ for (; number < eighthPoints; ++number) { +++ __VOLK_PREFETCH(in_ptr + 8); +++ DO_RBIT; +++ DO_RBIT; +++ DO_RBIT; +++ DO_RBIT; +++ DO_RBIT; +++ DO_RBIT; +++ DO_RBIT; +++ DO_RBIT; ++ } ++- number = eighthPoints*8; ++- for(; number < num_points; ++number){ +++ number = eighthPoints * 8; +++ for (; number < num_points; ++number) { ++ DO_RBIT; ++ } ++ } ++@@ -403,4 +416,3 @@ static inline void volk_32u_reverse_32u_arm(uint32_t* out, const uint32_t* in, ++ ++ ++ #endif /* INCLUDED_volk_32u_reverse_32u_u_H */ ++- ++diff --git a/kernels/volk/volk_64f_convert_32f.h b/kernels/volk/volk_64f_convert_32f.h ++index 20422cf..4ebccc0 100644 ++--- a/kernels/volk/volk_64f_convert_32f.h +++++ b/kernels/volk/volk_64f_convert_32f.h ++@@ -29,8 +29,8 @@ ++ * ++ * Dispatcher Prototype ++ * \code ++- * void volk_64f_convert_32f(float* outputVector, const double* inputVector, unsigned int num_points) ++- * \endcode +++ * void volk_64f_convert_32f(float* outputVector, const double* inputVector, unsigned int +++ * num_points) \endcode ++ * ++ * \b Inputs ++ * \li inputVector: The vector of doubles to convert to floats. ++@@ -70,34 +70,39 @@ ++ #ifdef LV_HAVE_AVX512F ++ #include ++ ++-static inline void volk_64f_convert_32f_u_avx512f(float* outputVector, const double* inputVector, unsigned int num_points){ ++- unsigned int number = 0; +++static inline void volk_64f_convert_32f_u_avx512f(float* outputVector, +++ const double* inputVector, +++ unsigned int num_points) +++{ +++ unsigned int number = 0; ++ ++- const unsigned int oneSixteenthPoints = num_points / 16; +++ const unsigned int oneSixteenthPoints = num_points / 16; ++ ++- const double* inputVectorPtr = (const double*)inputVector; ++- float* outputVectorPtr = outputVector; ++- __m256 ret1, ret2; ++- __m512d inputVal1, inputVal2; +++ const double* inputVectorPtr = (const double*)inputVector; +++ float* outputVectorPtr = outputVector; +++ __m256 ret1, ret2; +++ __m512d inputVal1, inputVal2; ++ ++- for(;number < oneSixteenthPoints; number++){ ++- inputVal1 = _mm512_loadu_pd(inputVectorPtr); inputVectorPtr += 8; ++- inputVal2 = _mm512_loadu_pd(inputVectorPtr); inputVectorPtr += 8; +++ for (; number < oneSixteenthPoints; number++) { +++ inputVal1 = _mm512_loadu_pd(inputVectorPtr); +++ inputVectorPtr += 8; +++ inputVal2 = _mm512_loadu_pd(inputVectorPtr); +++ inputVectorPtr += 8; ++ ++- ret1 = _mm512_cvtpd_ps(inputVal1); ++- ret2 = _mm512_cvtpd_ps(inputVal2); +++ ret1 = _mm512_cvtpd_ps(inputVal1); +++ ret2 = _mm512_cvtpd_ps(inputVal2); ++ ++- _mm256_storeu_ps(outputVectorPtr, ret1); ++- outputVectorPtr += 8; +++ _mm256_storeu_ps(outputVectorPtr, ret1); +++ outputVectorPtr += 8; ++ ++- _mm256_storeu_ps(outputVectorPtr, ret2); ++- outputVectorPtr += 8; ++- } +++ _mm256_storeu_ps(outputVectorPtr, ret2); +++ outputVectorPtr += 8; +++ } ++ ++- number = oneSixteenthPoints * 16; ++- for(; number < num_points; number++){ ++- outputVector[number] = (float)(inputVector[number]); ++- } +++ number = oneSixteenthPoints * 16; +++ for (; number < num_points; number++) { +++ outputVector[number] = (float)(inputVector[number]); +++ } ++ } ++ #endif /* LV_HAVE_AVX512F */ ++ ++@@ -105,34 +110,39 @@ static inline void volk_64f_convert_32f_u_avx512f(float* outputVector, const dou ++ #ifdef LV_HAVE_AVX ++ #include ++ ++-static inline void volk_64f_convert_32f_u_avx(float* outputVector, 
const double* inputVector, unsigned int num_points){ ++- unsigned int number = 0; +++static inline void volk_64f_convert_32f_u_avx(float* outputVector, +++ const double* inputVector, +++ unsigned int num_points) +++{ +++ unsigned int number = 0; ++ ++- const unsigned int oneEightPoints = num_points / 8; +++ const unsigned int oneEightPoints = num_points / 8; ++ ++- const double* inputVectorPtr = (const double*)inputVector; ++- float* outputVectorPtr = outputVector; ++- __m128 ret1, ret2; ++- __m256d inputVal1, inputVal2; +++ const double* inputVectorPtr = (const double*)inputVector; +++ float* outputVectorPtr = outputVector; +++ __m128 ret1, ret2; +++ __m256d inputVal1, inputVal2; ++ ++- for(;number < oneEightPoints; number++){ ++- inputVal1 = _mm256_loadu_pd(inputVectorPtr); inputVectorPtr += 4; ++- inputVal2 = _mm256_loadu_pd(inputVectorPtr); inputVectorPtr += 4; +++ for (; number < oneEightPoints; number++) { +++ inputVal1 = _mm256_loadu_pd(inputVectorPtr); +++ inputVectorPtr += 4; +++ inputVal2 = _mm256_loadu_pd(inputVectorPtr); +++ inputVectorPtr += 4; ++ ++- ret1 = _mm256_cvtpd_ps(inputVal1); ++- ret2 = _mm256_cvtpd_ps(inputVal2); +++ ret1 = _mm256_cvtpd_ps(inputVal1); +++ ret2 = _mm256_cvtpd_ps(inputVal2); ++ ++- _mm_storeu_ps(outputVectorPtr, ret1); ++- outputVectorPtr += 4; +++ _mm_storeu_ps(outputVectorPtr, ret1); +++ outputVectorPtr += 4; ++ ++- _mm_storeu_ps(outputVectorPtr, ret2); ++- outputVectorPtr += 4; ++- } +++ _mm_storeu_ps(outputVectorPtr, ret2); +++ outputVectorPtr += 4; +++ } ++ ++- number = oneEightPoints * 8; ++- for(; number < num_points; number++){ ++- outputVector[number] = (float)(inputVector[number]); ++- } +++ number = oneEightPoints * 8; +++ for (; number < num_points; number++) { +++ outputVector[number] = (float)(inputVector[number]); +++ } ++ } ++ #endif /* LV_HAVE_AVX */ ++ ++@@ -140,53 +150,59 @@ static inline void volk_64f_convert_32f_u_avx(float* outputVector, const double* ++ #ifdef LV_HAVE_SSE2 ++ #include ++ ++-static inline void volk_64f_convert_32f_u_sse2(float* outputVector, const double* inputVector, unsigned int num_points){ ++- unsigned int number = 0; +++static inline void volk_64f_convert_32f_u_sse2(float* outputVector, +++ const double* inputVector, +++ unsigned int num_points) +++{ +++ unsigned int number = 0; ++ ++- const unsigned int quarterPoints = num_points / 4; +++ const unsigned int quarterPoints = num_points / 4; ++ ++- const double* inputVectorPtr = (const double*)inputVector; ++- float* outputVectorPtr = outputVector; ++- __m128 ret, ret2; ++- __m128d inputVal1, inputVal2; +++ const double* inputVectorPtr = (const double*)inputVector; +++ float* outputVectorPtr = outputVector; +++ __m128 ret, ret2; +++ __m128d inputVal1, inputVal2; ++ ++- for(;number < quarterPoints; number++){ ++- inputVal1 = _mm_loadu_pd(inputVectorPtr); inputVectorPtr += 2; ++- inputVal2 = _mm_loadu_pd(inputVectorPtr); inputVectorPtr += 2; +++ for (; number < quarterPoints; number++) { +++ inputVal1 = _mm_loadu_pd(inputVectorPtr); +++ inputVectorPtr += 2; +++ inputVal2 = _mm_loadu_pd(inputVectorPtr); +++ inputVectorPtr += 2; ++ ++- ret = _mm_cvtpd_ps(inputVal1); ++- ret2 = _mm_cvtpd_ps(inputVal2); +++ ret = _mm_cvtpd_ps(inputVal1); +++ ret2 = _mm_cvtpd_ps(inputVal2); ++ ++- ret = _mm_movelh_ps(ret, ret2); +++ ret = _mm_movelh_ps(ret, ret2); ++ ++- _mm_storeu_ps(outputVectorPtr, ret); ++- outputVectorPtr += 4; ++- } +++ _mm_storeu_ps(outputVectorPtr, ret); +++ outputVectorPtr += 4; +++ } ++ ++- number = quarterPoints * 4; ++- for(; number < num_points; 
number++){ ++- outputVector[number] = (float)(inputVector[number]); ++- } +++ number = quarterPoints * 4; +++ for (; number < num_points; number++) { +++ outputVector[number] = (float)(inputVector[number]); +++ } ++ } ++ #endif /* LV_HAVE_SSE2 */ ++ ++ ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void volk_64f_convert_32f_generic(float* outputVector, const double* inputVector, unsigned int num_points){ ++- float* outputVectorPtr = outputVector; ++- const double* inputVectorPtr = inputVector; ++- unsigned int number = 0; ++- ++- for(number = 0; number < num_points; number++){ ++- *outputVectorPtr++ = ((float)(*inputVectorPtr++)); ++- } +++static inline void volk_64f_convert_32f_generic(float* outputVector, +++ const double* inputVector, +++ unsigned int num_points) +++{ +++ float* outputVectorPtr = outputVector; +++ const double* inputVectorPtr = inputVector; +++ unsigned int number = 0; +++ +++ for (number = 0; number < num_points; number++) { +++ *outputVectorPtr++ = ((float)(*inputVectorPtr++)); +++ } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++ ++- ++- ++ #endif /* INCLUDED_volk_64f_convert_32f_u_H */ ++ #ifndef INCLUDED_volk_64f_convert_32f_a_H ++ #define INCLUDED_volk_64f_convert_32f_a_H ++@@ -197,34 +213,39 @@ static inline void volk_64f_convert_32f_generic(float* outputVector, const doubl ++ #ifdef LV_HAVE_AVX512F ++ #include ++ ++-static inline void volk_64f_convert_32f_a_avx512f(float* outputVector, const double* inputVector, unsigned int num_points){ ++- unsigned int number = 0; +++static inline void volk_64f_convert_32f_a_avx512f(float* outputVector, +++ const double* inputVector, +++ unsigned int num_points) +++{ +++ unsigned int number = 0; ++ ++- const unsigned int oneSixteenthPoints = num_points / 16; +++ const unsigned int oneSixteenthPoints = num_points / 16; ++ ++- const double* inputVectorPtr = (const double*)inputVector; ++- float* outputVectorPtr = outputVector; ++- __m256 ret1, ret2; ++- __m512d inputVal1, inputVal2; +++ const double* inputVectorPtr = (const double*)inputVector; +++ float* outputVectorPtr = outputVector; +++ __m256 ret1, ret2; +++ __m512d inputVal1, inputVal2; ++ ++- for(;number < oneSixteenthPoints; number++){ ++- inputVal1 = _mm512_load_pd(inputVectorPtr); inputVectorPtr += 8; ++- inputVal2 = _mm512_load_pd(inputVectorPtr); inputVectorPtr += 8; +++ for (; number < oneSixteenthPoints; number++) { +++ inputVal1 = _mm512_load_pd(inputVectorPtr); +++ inputVectorPtr += 8; +++ inputVal2 = _mm512_load_pd(inputVectorPtr); +++ inputVectorPtr += 8; ++ ++- ret1 = _mm512_cvtpd_ps(inputVal1); ++- ret2 = _mm512_cvtpd_ps(inputVal2); +++ ret1 = _mm512_cvtpd_ps(inputVal1); +++ ret2 = _mm512_cvtpd_ps(inputVal2); ++ ++- _mm256_store_ps(outputVectorPtr, ret1); ++- outputVectorPtr += 8; +++ _mm256_store_ps(outputVectorPtr, ret1); +++ outputVectorPtr += 8; ++ ++- _mm256_store_ps(outputVectorPtr, ret2); ++- outputVectorPtr += 8; ++- } +++ _mm256_store_ps(outputVectorPtr, ret2); +++ outputVectorPtr += 8; +++ } ++ ++- number = oneSixteenthPoints * 16; ++- for(; number < num_points; number++){ ++- outputVector[number] = (float)(inputVector[number]); ++- } +++ number = oneSixteenthPoints * 16; +++ for (; number < num_points; number++) { +++ outputVector[number] = (float)(inputVector[number]); +++ } ++ } ++ #endif /* LV_HAVE_AVX512F */ ++ ++@@ -232,34 +253,39 @@ static inline void volk_64f_convert_32f_a_avx512f(float* outputVector, const dou ++ #ifdef LV_HAVE_AVX ++ #include ++ ++-static inline void volk_64f_convert_32f_a_avx(float* outputVector, const double* inputVector, 
unsigned int num_points){ ++- unsigned int number = 0; +++static inline void volk_64f_convert_32f_a_avx(float* outputVector, +++ const double* inputVector, +++ unsigned int num_points) +++{ +++ unsigned int number = 0; ++ ++- const unsigned int oneEightPoints = num_points / 8; +++ const unsigned int oneEightPoints = num_points / 8; ++ ++- const double* inputVectorPtr = (const double*)inputVector; ++- float* outputVectorPtr = outputVector; ++- __m128 ret1, ret2; ++- __m256d inputVal1, inputVal2; +++ const double* inputVectorPtr = (const double*)inputVector; +++ float* outputVectorPtr = outputVector; +++ __m128 ret1, ret2; +++ __m256d inputVal1, inputVal2; ++ ++- for(;number < oneEightPoints; number++){ ++- inputVal1 = _mm256_load_pd(inputVectorPtr); inputVectorPtr += 4; ++- inputVal2 = _mm256_load_pd(inputVectorPtr); inputVectorPtr += 4; +++ for (; number < oneEightPoints; number++) { +++ inputVal1 = _mm256_load_pd(inputVectorPtr); +++ inputVectorPtr += 4; +++ inputVal2 = _mm256_load_pd(inputVectorPtr); +++ inputVectorPtr += 4; ++ ++- ret1 = _mm256_cvtpd_ps(inputVal1); ++- ret2 = _mm256_cvtpd_ps(inputVal2); +++ ret1 = _mm256_cvtpd_ps(inputVal1); +++ ret2 = _mm256_cvtpd_ps(inputVal2); ++ ++- _mm_store_ps(outputVectorPtr, ret1); ++- outputVectorPtr += 4; +++ _mm_store_ps(outputVectorPtr, ret1); +++ outputVectorPtr += 4; ++ ++- _mm_store_ps(outputVectorPtr, ret2); ++- outputVectorPtr += 4; ++- } +++ _mm_store_ps(outputVectorPtr, ret2); +++ outputVectorPtr += 4; +++ } ++ ++- number = oneEightPoints * 8; ++- for(; number < num_points; number++){ ++- outputVector[number] = (float)(inputVector[number]); ++- } +++ number = oneEightPoints * 8; +++ for (; number < num_points; number++) { +++ outputVector[number] = (float)(inputVector[number]); +++ } ++ } ++ #endif /* LV_HAVE_AVX */ ++ ++@@ -267,51 +293,57 @@ static inline void volk_64f_convert_32f_a_avx(float* outputVector, const double* ++ #ifdef LV_HAVE_SSE2 ++ #include ++ ++-static inline void volk_64f_convert_32f_a_sse2(float* outputVector, const double* inputVector, unsigned int num_points){ ++- unsigned int number = 0; +++static inline void volk_64f_convert_32f_a_sse2(float* outputVector, +++ const double* inputVector, +++ unsigned int num_points) +++{ +++ unsigned int number = 0; ++ ++- const unsigned int quarterPoints = num_points / 4; +++ const unsigned int quarterPoints = num_points / 4; ++ ++- const double* inputVectorPtr = (const double*)inputVector; ++- float* outputVectorPtr = outputVector; ++- __m128 ret, ret2; ++- __m128d inputVal1, inputVal2; +++ const double* inputVectorPtr = (const double*)inputVector; +++ float* outputVectorPtr = outputVector; +++ __m128 ret, ret2; +++ __m128d inputVal1, inputVal2; ++ ++- for(;number < quarterPoints; number++){ ++- inputVal1 = _mm_load_pd(inputVectorPtr); inputVectorPtr += 2; ++- inputVal2 = _mm_load_pd(inputVectorPtr); inputVectorPtr += 2; +++ for (; number < quarterPoints; number++) { +++ inputVal1 = _mm_load_pd(inputVectorPtr); +++ inputVectorPtr += 2; +++ inputVal2 = _mm_load_pd(inputVectorPtr); +++ inputVectorPtr += 2; ++ ++- ret = _mm_cvtpd_ps(inputVal1); ++- ret2 = _mm_cvtpd_ps(inputVal2); +++ ret = _mm_cvtpd_ps(inputVal1); +++ ret2 = _mm_cvtpd_ps(inputVal2); ++ ++- ret = _mm_movelh_ps(ret, ret2); +++ ret = _mm_movelh_ps(ret, ret2); ++ ++- _mm_store_ps(outputVectorPtr, ret); ++- outputVectorPtr += 4; ++- } +++ _mm_store_ps(outputVectorPtr, ret); +++ outputVectorPtr += 4; +++ } ++ ++- number = quarterPoints * 4; ++- for(; number < num_points; number++){ ++- outputVector[number] = 
(float)(inputVector[number]); ++- } +++ number = quarterPoints * 4; +++ for (; number < num_points; number++) { +++ outputVector[number] = (float)(inputVector[number]); +++ } ++ } ++ #endif /* LV_HAVE_SSE2 */ ++ ++ ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void volk_64f_convert_32f_a_generic(float* outputVector, const double* inputVector, unsigned int num_points){ ++- float* outputVectorPtr = outputVector; ++- const double* inputVectorPtr = inputVector; ++- unsigned int number = 0; ++- ++- for(number = 0; number < num_points; number++){ ++- *outputVectorPtr++ = ((float)(*inputVectorPtr++)); ++- } +++static inline void volk_64f_convert_32f_a_generic(float* outputVector, +++ const double* inputVector, +++ unsigned int num_points) +++{ +++ float* outputVectorPtr = outputVector; +++ const double* inputVectorPtr = inputVector; +++ unsigned int number = 0; +++ +++ for (number = 0; number < num_points; number++) { +++ *outputVectorPtr++ = ((float)(*inputVectorPtr++)); +++ } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++ ++- ++- ++ #endif /* INCLUDED_volk_64f_convert_32f_a_H */ ++diff --git a/kernels/volk/volk_64f_x2_add_64f.h b/kernels/volk/volk_64f_x2_add_64f.h ++index 03b8e4c..5c512cc 100644 ++--- a/kernels/volk/volk_64f_x2_add_64f.h +++++ b/kernels/volk/volk_64f_x2_add_64f.h ++@@ -31,8 +31,8 @@ ++ * ++ * Dispatcher Prototype ++ * \code ++- * void volk_64f_x2_add_64f(float* cVector, const float* aVector, const float* bVector, unsigned int num_points) ++- * \endcode +++ * void volk_64f_x2_add_64f(float* cVector, const float* aVector, const float* bVector, +++ * unsigned int num_points) \endcode ++ * ++ * \b Inputs ++ * \li aVector: First input vector. ++@@ -76,18 +76,19 @@ ++ ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void ++-volk_64f_x2_add_64f_generic(double *cVector, const double *aVector, ++- const double *bVector, unsigned int num_points) +++static inline void volk_64f_x2_add_64f_generic(double* cVector, +++ const double* aVector, +++ const double* bVector, +++ unsigned int num_points) ++ { ++- double *cPtr = cVector; ++- const double *aPtr = aVector; ++- const double *bPtr = bVector; ++- unsigned int number = 0; ++- ++- for (number = 0; number < num_points; number++) { ++- *cPtr++ = (*aPtr++) + (*bPtr++); ++- } +++ double* cPtr = cVector; +++ const double* aPtr = aVector; +++ const double* bPtr = bVector; +++ unsigned int number = 0; +++ +++ for (number = 0; number < num_points; number++) { +++ *cPtr++ = (*aPtr++) + (*bPtr++); +++ } ++ } ++ ++ #endif /* LV_HAVE_GENERIC */ ++@@ -100,35 +101,36 @@ volk_64f_x2_add_64f_generic(double *cVector, const double *aVector, ++ ++ #include ++ ++-static inline void ++-volk_64f_x2_add_64f_u_sse2(double *cVector, const double *aVector, ++- const double *bVector, unsigned int num_points) +++static inline void volk_64f_x2_add_64f_u_sse2(double* cVector, +++ const double* aVector, +++ const double* bVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int half_points = num_points / 2; +++ unsigned int number = 0; +++ const unsigned int half_points = num_points / 2; ++ ++- double *cPtr = cVector; ++- const double *aPtr = aVector; ++- const double *bPtr = bVector; +++ double* cPtr = cVector; +++ const double* aPtr = aVector; +++ const double* bPtr = bVector; ++ ++- __m128d aVal, bVal, cVal; ++- for (; number < half_points; number++) { ++- aVal = _mm_loadu_pd(aPtr); ++- bVal = _mm_loadu_pd(bPtr); +++ __m128d aVal, bVal, cVal; +++ for (; number < half_points; number++) { +++ aVal = _mm_loadu_pd(aPtr); +++ bVal = 
_mm_loadu_pd(bPtr); ++ ++- cVal = _mm_add_pd(aVal, bVal); +++ cVal = _mm_add_pd(aVal, bVal); ++ ++- _mm_storeu_pd(cPtr, cVal); // Store the results back into the C container +++ _mm_storeu_pd(cPtr, cVal); // Store the results back into the C container ++ ++- aPtr += 2; ++- bPtr += 2; ++- cPtr += 2; ++- } +++ aPtr += 2; +++ bPtr += 2; +++ cPtr += 2; +++ } ++ ++- number = half_points * 2; ++- for (; number < num_points; number++) { ++- *cPtr++ = (*aPtr++) + (*bPtr++); ++- } +++ number = half_points * 2; +++ for (; number < num_points; number++) { +++ *cPtr++ = (*aPtr++) + (*bPtr++); +++ } ++ } ++ ++ #endif /* LV_HAVE_SSE2 */ ++@@ -138,36 +140,37 @@ volk_64f_x2_add_64f_u_sse2(double *cVector, const double *aVector, ++ ++ #include ++ ++-static inline void ++-volk_64f_x2_add_64f_u_avx(double *cVector, const double *aVector, ++- const double *bVector, unsigned int num_points) +++static inline void volk_64f_x2_add_64f_u_avx(double* cVector, +++ const double* aVector, +++ const double* bVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int quarter_points = num_points / 4; +++ unsigned int number = 0; +++ const unsigned int quarter_points = num_points / 4; ++ ++- double *cPtr = cVector; ++- const double *aPtr = aVector; ++- const double *bPtr = bVector; +++ double* cPtr = cVector; +++ const double* aPtr = aVector; +++ const double* bPtr = bVector; ++ ++- __m256d aVal, bVal, cVal; ++- for (; number < quarter_points; number++) { +++ __m256d aVal, bVal, cVal; +++ for (; number < quarter_points; number++) { ++ ++- aVal = _mm256_loadu_pd(aPtr); ++- bVal = _mm256_loadu_pd(bPtr); +++ aVal = _mm256_loadu_pd(aPtr); +++ bVal = _mm256_loadu_pd(bPtr); ++ ++- cVal = _mm256_add_pd(aVal, bVal); +++ cVal = _mm256_add_pd(aVal, bVal); ++ ++- _mm256_storeu_pd(cPtr, cVal); // Store the results back into the C container +++ _mm256_storeu_pd(cPtr, cVal); // Store the results back into the C container ++ ++- aPtr += 4; ++- bPtr += 4; ++- cPtr += 4; ++- } +++ aPtr += 4; +++ bPtr += 4; +++ cPtr += 4; +++ } ++ ++- number = quarter_points * 4; ++- for (; number < num_points; number++) { ++- *cPtr++ = (*aPtr++) + (*bPtr++); ++- } +++ number = quarter_points * 4; +++ for (; number < num_points; number++) { +++ *cPtr++ = (*aPtr++) + (*bPtr++); +++ } ++ } ++ ++ #endif /* LV_HAVE_AVX */ ++@@ -180,35 +183,36 @@ volk_64f_x2_add_64f_u_avx(double *cVector, const double *aVector, ++ ++ #include ++ ++-static inline void ++-volk_64f_x2_add_64f_a_sse2(double *cVector, const double *aVector, ++- const double *bVector, unsigned int num_points) +++static inline void volk_64f_x2_add_64f_a_sse2(double* cVector, +++ const double* aVector, +++ const double* bVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int half_points = num_points / 2; +++ unsigned int number = 0; +++ const unsigned int half_points = num_points / 2; ++ ++- double *cPtr = cVector; ++- const double *aPtr = aVector; ++- const double *bPtr = bVector; +++ double* cPtr = cVector; +++ const double* aPtr = aVector; +++ const double* bPtr = bVector; ++ ++- __m128d aVal, bVal, cVal; ++- for (; number < half_points; number++) { ++- aVal = _mm_load_pd(aPtr); ++- bVal = _mm_load_pd(bPtr); +++ __m128d aVal, bVal, cVal; +++ for (; number < half_points; number++) { +++ aVal = _mm_load_pd(aPtr); +++ bVal = _mm_load_pd(bPtr); ++ ++- cVal = _mm_add_pd(aVal, bVal); +++ cVal = _mm_add_pd(aVal, bVal); ++ ++- _mm_store_pd(cPtr, cVal); // Store the results back into the C container +++ _mm_store_pd(cPtr, cVal); // 
Store the results back into the C container ++ ++- aPtr += 2; ++- bPtr += 2; ++- cPtr += 2; ++- } +++ aPtr += 2; +++ bPtr += 2; +++ cPtr += 2; +++ } ++ ++- number = half_points * 2; ++- for (; number < num_points; number++) { ++- *cPtr++ = (*aPtr++) + (*bPtr++); ++- } +++ number = half_points * 2; +++ for (; number < num_points; number++) { +++ *cPtr++ = (*aPtr++) + (*bPtr++); +++ } ++ } ++ ++ #endif /* LV_HAVE_SSE2 */ ++@@ -218,36 +222,37 @@ volk_64f_x2_add_64f_a_sse2(double *cVector, const double *aVector, ++ ++ #include ++ ++-static inline void ++-volk_64f_x2_add_64f_a_avx(double *cVector, const double *aVector, ++- const double *bVector, unsigned int num_points) +++static inline void volk_64f_x2_add_64f_a_avx(double* cVector, +++ const double* aVector, +++ const double* bVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int quarter_points = num_points / 4; +++ unsigned int number = 0; +++ const unsigned int quarter_points = num_points / 4; ++ ++- double *cPtr = cVector; ++- const double *aPtr = aVector; ++- const double *bPtr = bVector; +++ double* cPtr = cVector; +++ const double* aPtr = aVector; +++ const double* bPtr = bVector; ++ ++- __m256d aVal, bVal, cVal; ++- for (; number < quarter_points; number++) { +++ __m256d aVal, bVal, cVal; +++ for (; number < quarter_points; number++) { ++ ++- aVal = _mm256_load_pd(aPtr); ++- bVal = _mm256_load_pd(bPtr); +++ aVal = _mm256_load_pd(aPtr); +++ bVal = _mm256_load_pd(bPtr); ++ ++- cVal = _mm256_add_pd(aVal, bVal); +++ cVal = _mm256_add_pd(aVal, bVal); ++ ++- _mm256_store_pd(cPtr, cVal); // Store the results back into the C container +++ _mm256_store_pd(cPtr, cVal); // Store the results back into the C container ++ ++- aPtr += 4; ++- bPtr += 4; ++- cPtr += 4; ++- } +++ aPtr += 4; +++ bPtr += 4; +++ cPtr += 4; +++ } ++ ++- number = quarter_points * 4; ++- for (; number < num_points; number++) { ++- *cPtr++ = (*aPtr++) + (*bPtr++); ++- } +++ number = quarter_points * 4; +++ for (; number < num_points; number++) { +++ *cPtr++ = (*aPtr++) + (*bPtr++); +++ } ++ } ++ ++ #endif /* LV_HAVE_AVX */ ++diff --git a/kernels/volk/volk_64f_x2_max_64f.h b/kernels/volk/volk_64f_x2_max_64f.h ++index d4464b7..8f7f743 100644 ++--- a/kernels/volk/volk_64f_x2_max_64f.h +++++ b/kernels/volk/volk_64f_x2_max_64f.h ++@@ -32,8 +32,8 @@ ++ * ++ * Dispatcher Prototype ++ * \code ++- * void volk_64f_x2_max_64f(double* cVector, const double* aVector, const double* bVector, unsigned int num_points) ++- * \endcode +++ * void volk_64f_x2_max_64f(double* cVector, const double* aVector, const double* bVector, +++ * unsigned int num_points) \endcode ++ * ++ * \b Inputs ++ * \li aVector: First input vector. 
++@@ -77,38 +77,39 @@ ++ #ifdef LV_HAVE_AVX512F ++ #include ++ ++-static inline void ++-volk_64f_x2_max_64f_a_avx512f(double* cVector, const double* aVector, ++- const double* bVector, unsigned int num_points) +++static inline void volk_64f_x2_max_64f_a_avx512f(double* cVector, +++ const double* aVector, +++ const double* bVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int eigthPoints = num_points / 8; +++ unsigned int number = 0; +++ const unsigned int eigthPoints = num_points / 8; ++ ++- double* cPtr = cVector; ++- const double* aPtr = aVector; ++- const double* bPtr= bVector; +++ double* cPtr = cVector; +++ const double* aPtr = aVector; +++ const double* bPtr = bVector; ++ ++- __m512d aVal, bVal, cVal; ++- for(;number < eigthPoints; number++){ +++ __m512d aVal, bVal, cVal; +++ for (; number < eigthPoints; number++) { ++ ++- aVal = _mm512_load_pd(aPtr); ++- bVal = _mm512_load_pd(bPtr); +++ aVal = _mm512_load_pd(aPtr); +++ bVal = _mm512_load_pd(bPtr); ++ ++- cVal = _mm512_max_pd(aVal, bVal); +++ cVal = _mm512_max_pd(aVal, bVal); ++ ++- _mm512_store_pd(cPtr,cVal); // Store the results back into the C container +++ _mm512_store_pd(cPtr, cVal); // Store the results back into the C container ++ ++- aPtr += 8; ++- bPtr += 8; ++- cPtr += 8; ++- } +++ aPtr += 8; +++ bPtr += 8; +++ cPtr += 8; +++ } ++ ++- number = eigthPoints * 8; ++- for(;number < num_points; number++){ ++- const double a = *aPtr++; ++- const double b = *bPtr++; ++- *cPtr++ = ( a > b ? a : b); ++- } +++ number = eigthPoints * 8; +++ for (; number < num_points; number++) { +++ const double a = *aPtr++; +++ const double b = *bPtr++; +++ *cPtr++ = (a > b ? a : b); +++ } ++ } ++ #endif /* LV_HAVE_AVX512F */ ++ ++@@ -116,38 +117,39 @@ volk_64f_x2_max_64f_a_avx512f(double* cVector, const double* aVector, ++ #ifdef LV_HAVE_AVX ++ #include ++ ++-static inline void ++-volk_64f_x2_max_64f_a_avx(double* cVector, const double* aVector, ++- const double* bVector, unsigned int num_points) +++static inline void volk_64f_x2_max_64f_a_avx(double* cVector, +++ const double* aVector, +++ const double* bVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int quarterPoints = num_points / 4; +++ unsigned int number = 0; +++ const unsigned int quarterPoints = num_points / 4; ++ ++- double* cPtr = cVector; ++- const double* aPtr = aVector; ++- const double* bPtr= bVector; +++ double* cPtr = cVector; +++ const double* aPtr = aVector; +++ const double* bPtr = bVector; ++ ++- __m256d aVal, bVal, cVal; ++- for(;number < quarterPoints; number++){ +++ __m256d aVal, bVal, cVal; +++ for (; number < quarterPoints; number++) { ++ ++- aVal = _mm256_load_pd(aPtr); ++- bVal = _mm256_load_pd(bPtr); +++ aVal = _mm256_load_pd(aPtr); +++ bVal = _mm256_load_pd(bPtr); ++ ++- cVal = _mm256_max_pd(aVal, bVal); +++ cVal = _mm256_max_pd(aVal, bVal); ++ ++- _mm256_store_pd(cPtr,cVal); // Store the results back into the C container +++ _mm256_store_pd(cPtr, cVal); // Store the results back into the C container ++ ++- aPtr += 4; ++- bPtr += 4; ++- cPtr += 4; ++- } +++ aPtr += 4; +++ bPtr += 4; +++ cPtr += 4; +++ } ++ ++- number = quarterPoints * 4; ++- for(;number < num_points; number++){ ++- const double a = *aPtr++; ++- const double b = *bPtr++; ++- *cPtr++ = ( a > b ? a : b); ++- } +++ number = quarterPoints * 4; +++ for (; number < num_points; number++) { +++ const double a = *aPtr++; +++ const double b = *bPtr++; +++ *cPtr++ = (a > b ? 
a : b); +++ } ++ } ++ #endif /* LV_HAVE_AVX */ ++ ++@@ -155,58 +157,60 @@ volk_64f_x2_max_64f_a_avx(double* cVector, const double* aVector, ++ #ifdef LV_HAVE_SSE2 ++ #include ++ ++-static inline void ++-volk_64f_x2_max_64f_a_sse2(double* cVector, const double* aVector, ++- const double* bVector, unsigned int num_points) +++static inline void volk_64f_x2_max_64f_a_sse2(double* cVector, +++ const double* aVector, +++ const double* bVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int halfPoints = num_points / 2; +++ unsigned int number = 0; +++ const unsigned int halfPoints = num_points / 2; ++ ++- double* cPtr = cVector; ++- const double* aPtr = aVector; ++- const double* bPtr= bVector; +++ double* cPtr = cVector; +++ const double* aPtr = aVector; +++ const double* bPtr = bVector; ++ ++- __m128d aVal, bVal, cVal; ++- for(;number < halfPoints; number++){ +++ __m128d aVal, bVal, cVal; +++ for (; number < halfPoints; number++) { ++ ++- aVal = _mm_load_pd(aPtr); ++- bVal = _mm_load_pd(bPtr); +++ aVal = _mm_load_pd(aPtr); +++ bVal = _mm_load_pd(bPtr); ++ ++- cVal = _mm_max_pd(aVal, bVal); +++ cVal = _mm_max_pd(aVal, bVal); ++ ++- _mm_store_pd(cPtr,cVal); // Store the results back into the C container +++ _mm_store_pd(cPtr, cVal); // Store the results back into the C container ++ ++- aPtr += 2; ++- bPtr += 2; ++- cPtr += 2; ++- } +++ aPtr += 2; +++ bPtr += 2; +++ cPtr += 2; +++ } ++ ++- number = halfPoints * 2; ++- for(;number < num_points; number++){ ++- const double a = *aPtr++; ++- const double b = *bPtr++; ++- *cPtr++ = ( a > b ? a : b); ++- } +++ number = halfPoints * 2; +++ for (; number < num_points; number++) { +++ const double a = *aPtr++; +++ const double b = *bPtr++; +++ *cPtr++ = (a > b ? a : b); +++ } ++ } ++ #endif /* LV_HAVE_SSE2 */ ++ ++ ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void ++-volk_64f_x2_max_64f_generic(double* cVector, const double* aVector, ++- const double* bVector, unsigned int num_points) +++static inline void volk_64f_x2_max_64f_generic(double* cVector, +++ const double* aVector, +++ const double* bVector, +++ unsigned int num_points) ++ { ++- double* cPtr = cVector; ++- const double* aPtr = aVector; ++- const double* bPtr= bVector; ++- unsigned int number = 0; ++- ++- for(number = 0; number < num_points; number++){ ++- const double a = *aPtr++; ++- const double b = *bPtr++; ++- *cPtr++ = ( a > b ? a : b); ++- } +++ double* cPtr = cVector; +++ const double* aPtr = aVector; +++ const double* bPtr = bVector; +++ unsigned int number = 0; +++ +++ for (number = 0; number < num_points; number++) { +++ const double a = *aPtr++; +++ const double b = *bPtr++; +++ *cPtr++ = (a > b ? 
a : b); +++ } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++@@ -223,38 +227,39 @@ volk_64f_x2_max_64f_generic(double* cVector, const double* aVector, ++ #ifdef LV_HAVE_AVX512F ++ #include ++ ++-static inline void ++-volk_64f_x2_max_64f_u_avx512f(double* cVector, const double* aVector, ++- const double* bVector, unsigned int num_points) +++static inline void volk_64f_x2_max_64f_u_avx512f(double* cVector, +++ const double* aVector, +++ const double* bVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int eigthPoints = num_points / 8; +++ unsigned int number = 0; +++ const unsigned int eigthPoints = num_points / 8; ++ ++- double* cPtr = cVector; ++- const double* aPtr = aVector; ++- const double* bPtr= bVector; +++ double* cPtr = cVector; +++ const double* aPtr = aVector; +++ const double* bPtr = bVector; ++ ++- __m512d aVal, bVal, cVal; ++- for(;number < eigthPoints; number++){ +++ __m512d aVal, bVal, cVal; +++ for (; number < eigthPoints; number++) { ++ ++- aVal = _mm512_loadu_pd(aPtr); ++- bVal = _mm512_loadu_pd(bPtr); +++ aVal = _mm512_loadu_pd(aPtr); +++ bVal = _mm512_loadu_pd(bPtr); ++ ++- cVal = _mm512_max_pd(aVal, bVal); +++ cVal = _mm512_max_pd(aVal, bVal); ++ ++- _mm512_storeu_pd(cPtr,cVal); // Store the results back into the C container +++ _mm512_storeu_pd(cPtr, cVal); // Store the results back into the C container ++ ++- aPtr += 8; ++- bPtr += 8; ++- cPtr += 8; ++- } +++ aPtr += 8; +++ bPtr += 8; +++ cPtr += 8; +++ } ++ ++- number = eigthPoints * 8; ++- for(;number < num_points; number++){ ++- const double a = *aPtr++; ++- const double b = *bPtr++; ++- *cPtr++ = ( a > b ? a : b); ++- } +++ number = eigthPoints * 8; +++ for (; number < num_points; number++) { +++ const double a = *aPtr++; +++ const double b = *bPtr++; +++ *cPtr++ = (a > b ? a : b); +++ } ++ } ++ #endif /* LV_HAVE_AVX512F */ ++ ++@@ -262,38 +267,39 @@ volk_64f_x2_max_64f_u_avx512f(double* cVector, const double* aVector, ++ #ifdef LV_HAVE_AVX ++ #include ++ ++-static inline void ++-volk_64f_x2_max_64f_u_avx(double* cVector, const double* aVector, ++- const double* bVector, unsigned int num_points) +++static inline void volk_64f_x2_max_64f_u_avx(double* cVector, +++ const double* aVector, +++ const double* bVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int quarterPoints = num_points / 4; +++ unsigned int number = 0; +++ const unsigned int quarterPoints = num_points / 4; ++ ++- double* cPtr = cVector; ++- const double* aPtr = aVector; ++- const double* bPtr= bVector; +++ double* cPtr = cVector; +++ const double* aPtr = aVector; +++ const double* bPtr = bVector; ++ ++- __m256d aVal, bVal, cVal; ++- for(;number < quarterPoints; number++){ +++ __m256d aVal, bVal, cVal; +++ for (; number < quarterPoints; number++) { ++ ++- aVal = _mm256_loadu_pd(aPtr); ++- bVal = _mm256_loadu_pd(bPtr); +++ aVal = _mm256_loadu_pd(aPtr); +++ bVal = _mm256_loadu_pd(bPtr); ++ ++- cVal = _mm256_max_pd(aVal, bVal); +++ cVal = _mm256_max_pd(aVal, bVal); ++ ++- _mm256_storeu_pd(cPtr,cVal); // Store the results back into the C container +++ _mm256_storeu_pd(cPtr, cVal); // Store the results back into the C container ++ ++- aPtr += 4; ++- bPtr += 4; ++- cPtr += 4; ++- } +++ aPtr += 4; +++ bPtr += 4; +++ cPtr += 4; +++ } ++ ++- number = quarterPoints * 4; ++- for(;number < num_points; number++){ ++- const double a = *aPtr++; ++- const double b = *bPtr++; ++- *cPtr++ = ( a > b ? 
a : b); ++- } +++ number = quarterPoints * 4; +++ for (; number < num_points; number++) { +++ const double a = *aPtr++; +++ const double b = *bPtr++; +++ *cPtr++ = (a > b ? a : b); +++ } ++ } ++ #endif /* LV_HAVE_AVX */ ++ ++diff --git a/kernels/volk/volk_64f_x2_min_64f.h b/kernels/volk/volk_64f_x2_min_64f.h ++index 0ffa305..7dc4d59 100644 ++--- a/kernels/volk/volk_64f_x2_min_64f.h +++++ b/kernels/volk/volk_64f_x2_min_64f.h ++@@ -32,7 +32,8 @@ ++ * ++ * Dispatcher Prototype ++ * \code ++- * void volk_64f_x2_min_64f(double* cVector, const double* aVector, const double* bVector, unsigned int num_points) +++ * void volk_64f_x2_min_64f(double* cVector, const double* aVector, const double* bVector, +++ unsigned int num_points) ++ * \endcode ++ * ++ * \b Inputs ++@@ -77,38 +78,39 @@ ++ #ifdef LV_HAVE_AVX512F ++ #include ++ ++-static inline void ++-volk_64f_x2_min_64f_a_avx512f(double* cVector, const double* aVector, ++- const double* bVector, unsigned int num_points) +++static inline void volk_64f_x2_min_64f_a_avx512f(double* cVector, +++ const double* aVector, +++ const double* bVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int eigthPoints = num_points / 8; +++ unsigned int number = 0; +++ const unsigned int eigthPoints = num_points / 8; ++ ++- double* cPtr = cVector; ++- const double* aPtr = aVector; ++- const double* bPtr= bVector; +++ double* cPtr = cVector; +++ const double* aPtr = aVector; +++ const double* bPtr = bVector; ++ ++- __m512d aVal, bVal, cVal; ++- for(;number < eigthPoints; number++){ +++ __m512d aVal, bVal, cVal; +++ for (; number < eigthPoints; number++) { ++ ++- aVal = _mm512_load_pd(aPtr); ++- bVal = _mm512_load_pd(bPtr); +++ aVal = _mm512_load_pd(aPtr); +++ bVal = _mm512_load_pd(bPtr); ++ ++- cVal = _mm512_min_pd(aVal, bVal); +++ cVal = _mm512_min_pd(aVal, bVal); ++ ++- _mm512_store_pd(cPtr,cVal); // Store the results back into the C container +++ _mm512_store_pd(cPtr, cVal); // Store the results back into the C container ++ ++- aPtr += 8; ++- bPtr += 8; ++- cPtr += 8; ++- } +++ aPtr += 8; +++ bPtr += 8; +++ cPtr += 8; +++ } ++ ++- number = eigthPoints * 8; ++- for(;number < num_points; number++){ ++- const double a = *aPtr++; ++- const double b = *bPtr++; ++- *cPtr++ = ( a < b ? a : b); ++- } +++ number = eigthPoints * 8; +++ for (; number < num_points; number++) { +++ const double a = *aPtr++; +++ const double b = *bPtr++; +++ *cPtr++ = (a < b ? 
a : b); +++ } ++ } ++ #endif /* LV_HAVE_AVX512F */ ++ ++@@ -116,38 +118,39 @@ volk_64f_x2_min_64f_a_avx512f(double* cVector, const double* aVector, ++ #ifdef LV_HAVE_AVX ++ #include ++ ++-static inline void ++-volk_64f_x2_min_64f_a_avx(double* cVector, const double* aVector, ++- const double* bVector, unsigned int num_points) +++static inline void volk_64f_x2_min_64f_a_avx(double* cVector, +++ const double* aVector, +++ const double* bVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int quarterPoints = num_points / 4; +++ unsigned int number = 0; +++ const unsigned int quarterPoints = num_points / 4; ++ ++- double* cPtr = cVector; ++- const double* aPtr = aVector; ++- const double* bPtr= bVector; +++ double* cPtr = cVector; +++ const double* aPtr = aVector; +++ const double* bPtr = bVector; ++ ++- __m256d aVal, bVal, cVal; ++- for(;number < quarterPoints; number++){ +++ __m256d aVal, bVal, cVal; +++ for (; number < quarterPoints; number++) { ++ ++- aVal = _mm256_load_pd(aPtr); ++- bVal = _mm256_load_pd(bPtr); +++ aVal = _mm256_load_pd(aPtr); +++ bVal = _mm256_load_pd(bPtr); ++ ++- cVal = _mm256_min_pd(aVal, bVal); +++ cVal = _mm256_min_pd(aVal, bVal); ++ ++- _mm256_store_pd(cPtr,cVal); // Store the results back into the C container +++ _mm256_store_pd(cPtr, cVal); // Store the results back into the C container ++ ++- aPtr += 4; ++- bPtr += 4; ++- cPtr += 4; ++- } +++ aPtr += 4; +++ bPtr += 4; +++ cPtr += 4; +++ } ++ ++- number = quarterPoints * 4; ++- for(;number < num_points; number++){ ++- const double a = *aPtr++; ++- const double b = *bPtr++; ++- *cPtr++ = ( a < b ? a : b); ++- } +++ number = quarterPoints * 4; +++ for (; number < num_points; number++) { +++ const double a = *aPtr++; +++ const double b = *bPtr++; +++ *cPtr++ = (a < b ? a : b); +++ } ++ } ++ #endif /* LV_HAVE_AVX */ ++ ++@@ -155,58 +158,60 @@ volk_64f_x2_min_64f_a_avx(double* cVector, const double* aVector, ++ #ifdef LV_HAVE_SSE2 ++ #include ++ ++-static inline void ++-volk_64f_x2_min_64f_a_sse2(double* cVector, const double* aVector, ++- const double* bVector, unsigned int num_points) +++static inline void volk_64f_x2_min_64f_a_sse2(double* cVector, +++ const double* aVector, +++ const double* bVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int halfPoints = num_points / 2; +++ unsigned int number = 0; +++ const unsigned int halfPoints = num_points / 2; ++ ++- double* cPtr = cVector; ++- const double* aPtr = aVector; ++- const double* bPtr= bVector; +++ double* cPtr = cVector; +++ const double* aPtr = aVector; +++ const double* bPtr = bVector; ++ ++- __m128d aVal, bVal, cVal; ++- for(;number < halfPoints; number++){ +++ __m128d aVal, bVal, cVal; +++ for (; number < halfPoints; number++) { ++ ++- aVal = _mm_load_pd(aPtr); ++- bVal = _mm_load_pd(bPtr); +++ aVal = _mm_load_pd(aPtr); +++ bVal = _mm_load_pd(bPtr); ++ ++- cVal = _mm_min_pd(aVal, bVal); +++ cVal = _mm_min_pd(aVal, bVal); ++ ++- _mm_store_pd(cPtr,cVal); // Store the results back into the C container +++ _mm_store_pd(cPtr, cVal); // Store the results back into the C container ++ ++- aPtr += 2; ++- bPtr += 2; ++- cPtr += 2; ++- } +++ aPtr += 2; +++ bPtr += 2; +++ cPtr += 2; +++ } ++ ++- number = halfPoints * 2; ++- for(;number < num_points; number++){ ++- const double a = *aPtr++; ++- const double b = *bPtr++; ++- *cPtr++ = ( a < b ? 
a : b); ++- } +++ number = halfPoints * 2; +++ for (; number < num_points; number++) { +++ const double a = *aPtr++; +++ const double b = *bPtr++; +++ *cPtr++ = (a < b ? a : b); +++ } ++ } ++ #endif /* LV_HAVE_SSE2 */ ++ ++ ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void ++-volk_64f_x2_min_64f_generic(double* cVector, const double* aVector, ++- const double* bVector, unsigned int num_points) +++static inline void volk_64f_x2_min_64f_generic(double* cVector, +++ const double* aVector, +++ const double* bVector, +++ unsigned int num_points) ++ { ++- double* cPtr = cVector; ++- const double* aPtr = aVector; ++- const double* bPtr= bVector; ++- unsigned int number = 0; ++- ++- for(number = 0; number < num_points; number++){ ++- const double a = *aPtr++; ++- const double b = *bPtr++; ++- *cPtr++ = ( a < b ? a : b); ++- } +++ double* cPtr = cVector; +++ const double* aPtr = aVector; +++ const double* bPtr = bVector; +++ unsigned int number = 0; +++ +++ for (number = 0; number < num_points; number++) { +++ const double a = *aPtr++; +++ const double b = *bPtr++; +++ *cPtr++ = (a < b ? a : b); +++ } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++@@ -222,38 +227,39 @@ volk_64f_x2_min_64f_generic(double* cVector, const double* aVector, ++ #ifdef LV_HAVE_AVX512F ++ #include ++ ++-static inline void ++-volk_64f_x2_min_64f_u_avx512f(double* cVector, const double* aVector, ++- const double* bVector, unsigned int num_points) +++static inline void volk_64f_x2_min_64f_u_avx512f(double* cVector, +++ const double* aVector, +++ const double* bVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int eigthPoints = num_points / 8; +++ unsigned int number = 0; +++ const unsigned int eigthPoints = num_points / 8; ++ ++- double* cPtr = cVector; ++- const double* aPtr = aVector; ++- const double* bPtr= bVector; +++ double* cPtr = cVector; +++ const double* aPtr = aVector; +++ const double* bPtr = bVector; ++ ++- __m512d aVal, bVal, cVal; ++- for(;number < eigthPoints; number++){ +++ __m512d aVal, bVal, cVal; +++ for (; number < eigthPoints; number++) { ++ ++- aVal = _mm512_loadu_pd(aPtr); ++- bVal = _mm512_loadu_pd(bPtr); +++ aVal = _mm512_loadu_pd(aPtr); +++ bVal = _mm512_loadu_pd(bPtr); ++ ++- cVal = _mm512_min_pd(aVal, bVal); +++ cVal = _mm512_min_pd(aVal, bVal); ++ ++- _mm512_storeu_pd(cPtr,cVal); // Store the results back into the C container +++ _mm512_storeu_pd(cPtr, cVal); // Store the results back into the C container ++ ++- aPtr += 8; ++- bPtr += 8; ++- cPtr += 8; ++- } +++ aPtr += 8; +++ bPtr += 8; +++ cPtr += 8; +++ } ++ ++- number = eigthPoints * 8; ++- for(;number < num_points; number++){ ++- const double a = *aPtr++; ++- const double b = *bPtr++; ++- *cPtr++ = ( a < b ? a : b); ++- } +++ number = eigthPoints * 8; +++ for (; number < num_points; number++) { +++ const double a = *aPtr++; +++ const double b = *bPtr++; +++ *cPtr++ = (a < b ? 
a : b); +++ } ++ } ++ #endif /* LV_HAVE_AVX512F */ ++ ++@@ -261,38 +267,39 @@ volk_64f_x2_min_64f_u_avx512f(double* cVector, const double* aVector, ++ #ifdef LV_HAVE_AVX ++ #include ++ ++-static inline void ++-volk_64f_x2_min_64f_u_avx(double* cVector, const double* aVector, ++- const double* bVector, unsigned int num_points) +++static inline void volk_64f_x2_min_64f_u_avx(double* cVector, +++ const double* aVector, +++ const double* bVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int quarterPoints = num_points / 4; +++ unsigned int number = 0; +++ const unsigned int quarterPoints = num_points / 4; ++ ++- double* cPtr = cVector; ++- const double* aPtr = aVector; ++- const double* bPtr= bVector; +++ double* cPtr = cVector; +++ const double* aPtr = aVector; +++ const double* bPtr = bVector; ++ ++- __m256d aVal, bVal, cVal; ++- for(;number < quarterPoints; number++){ +++ __m256d aVal, bVal, cVal; +++ for (; number < quarterPoints; number++) { ++ ++- aVal = _mm256_loadu_pd(aPtr); ++- bVal = _mm256_loadu_pd(bPtr); +++ aVal = _mm256_loadu_pd(aPtr); +++ bVal = _mm256_loadu_pd(bPtr); ++ ++- cVal = _mm256_min_pd(aVal, bVal); +++ cVal = _mm256_min_pd(aVal, bVal); ++ ++- _mm256_storeu_pd(cPtr,cVal); // Store the results back into the C container +++ _mm256_storeu_pd(cPtr, cVal); // Store the results back into the C container ++ ++- aPtr += 4; ++- bPtr += 4; ++- cPtr += 4; ++- } +++ aPtr += 4; +++ bPtr += 4; +++ cPtr += 4; +++ } ++ ++- number = quarterPoints * 4; ++- for(;number < num_points; number++){ ++- const double a = *aPtr++; ++- const double b = *bPtr++; ++- *cPtr++ = ( a < b ? a : b); ++- } +++ number = quarterPoints * 4; +++ for (; number < num_points; number++) { +++ const double a = *aPtr++; +++ const double b = *bPtr++; +++ *cPtr++ = (a < b ? a : b); +++ } ++ } ++ #endif /* LV_HAVE_AVX */ ++ ++diff --git a/kernels/volk/volk_64f_x2_multiply_64f.h b/kernels/volk/volk_64f_x2_multiply_64f.h ++index 6fa9e8e..39a155d 100644 ++--- a/kernels/volk/volk_64f_x2_multiply_64f.h +++++ b/kernels/volk/volk_64f_x2_multiply_64f.h ++@@ -31,8 +31,8 @@ ++ * ++ * Dispatcher Prototype ++ * \code ++- * void volk_64f_x2_multiply_64f(float* cVector, const float* aVector, const float* bVector, unsigned int num_points) ++- * \endcode +++ * void volk_64f_x2_multiply_64f(float* cVector, const float* aVector, const float* +++ * bVector, unsigned int num_points) \endcode ++ * ++ * \b Inputs ++ * \li aVector: First input vector. 
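(Aside, not part of the patch: the hunk above documents the public dispatcher for this kernel. A minimal, illustrative caller is sketched below; it assumes only the standard VOLK entry points from <volk/volk.h> — volk_get_alignment, volk_malloc, volk_free — and uses double pointers, matching the 64f implementations shown in the following hunks even though the Doxygen prototype above still reads float*.)

/* Illustrative sketch only: element-wise c[i] = a[i] * b[i] through the
 * VOLK dispatcher, which selects the best SIMD implementation at runtime. */
#include <stdio.h>
#include <volk/volk.h>

int main(void)
{
    const unsigned int num_points = 1000;
    const size_t alignment = volk_get_alignment();

    /* volk_malloc returns buffers aligned for the widest enabled SIMD ISA,
     * so the aligned (_a) kernels can be dispatched. */
    double* a = (double*)volk_malloc(num_points * sizeof(double), alignment);
    double* b = (double*)volk_malloc(num_points * sizeof(double), alignment);
    double* c = (double*)volk_malloc(num_points * sizeof(double), alignment);

    for (unsigned int i = 0; i < num_points; i++) {
        a[i] = 0.5 * i;
        b[i] = 2.0;
    }

    volk_64f_x2_multiply_64f(c, a, b, num_points);
    printf("c[10] = %f\n", c[10]); /* expect 10.0 */

    volk_free(a);
    volk_free(b);
    volk_free(c);
    return 0;
}

(End of aside; the reformatting diff continues below.)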
++@@ -76,18 +76,19 @@ ++ ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void ++-volk_64f_x2_multiply_64f_generic(double *cVector, const double *aVector, ++- const double *bVector, unsigned int num_points) +++static inline void volk_64f_x2_multiply_64f_generic(double* cVector, +++ const double* aVector, +++ const double* bVector, +++ unsigned int num_points) ++ { ++- double *cPtr = cVector; ++- const double *aPtr = aVector; ++- const double *bPtr = bVector; ++- unsigned int number = 0; ++- ++- for (number = 0; number < num_points; number++) { ++- *cPtr++ = (*aPtr++) * (*bPtr++); ++- } +++ double* cPtr = cVector; +++ const double* aPtr = aVector; +++ const double* bPtr = bVector; +++ unsigned int number = 0; +++ +++ for (number = 0; number < num_points; number++) { +++ *cPtr++ = (*aPtr++) * (*bPtr++); +++ } ++ } ++ ++ #endif /* LV_HAVE_GENERIC */ ++@@ -100,35 +101,36 @@ volk_64f_x2_multiply_64f_generic(double *cVector, const double *aVector, ++ ++ #include ++ ++-static inline void ++-volk_64f_x2_multiply_64f_u_sse2(double *cVector, const double *aVector, ++- const double *bVector, unsigned int num_points) +++static inline void volk_64f_x2_multiply_64f_u_sse2(double* cVector, +++ const double* aVector, +++ const double* bVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int half_points = num_points / 2; +++ unsigned int number = 0; +++ const unsigned int half_points = num_points / 2; ++ ++- double *cPtr = cVector; ++- const double *aPtr = aVector; ++- const double *bPtr = bVector; +++ double* cPtr = cVector; +++ const double* aPtr = aVector; +++ const double* bPtr = bVector; ++ ++- __m128d aVal, bVal, cVal; ++- for (; number < half_points; number++) { ++- aVal = _mm_loadu_pd(aPtr); ++- bVal = _mm_loadu_pd(bPtr); +++ __m128d aVal, bVal, cVal; +++ for (; number < half_points; number++) { +++ aVal = _mm_loadu_pd(aPtr); +++ bVal = _mm_loadu_pd(bPtr); ++ ++- cVal = _mm_mul_pd(aVal, bVal); +++ cVal = _mm_mul_pd(aVal, bVal); ++ ++- _mm_storeu_pd(cPtr, cVal); // Store the results back into the C container +++ _mm_storeu_pd(cPtr, cVal); // Store the results back into the C container ++ ++- aPtr += 2; ++- bPtr += 2; ++- cPtr += 2; ++- } +++ aPtr += 2; +++ bPtr += 2; +++ cPtr += 2; +++ } ++ ++- number = half_points * 2; ++- for (; number < num_points; number++) { ++- *cPtr++ = (*aPtr++) * (*bPtr++); ++- } +++ number = half_points * 2; +++ for (; number < num_points; number++) { +++ *cPtr++ = (*aPtr++) * (*bPtr++); +++ } ++ } ++ ++ #endif /* LV_HAVE_SSE2 */ ++@@ -138,36 +140,37 @@ volk_64f_x2_multiply_64f_u_sse2(double *cVector, const double *aVector, ++ ++ #include ++ ++-static inline void ++-volk_64f_x2_multiply_64f_u_avx(double *cVector, const double *aVector, ++- const double *bVector, unsigned int num_points) +++static inline void volk_64f_x2_multiply_64f_u_avx(double* cVector, +++ const double* aVector, +++ const double* bVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int quarter_points = num_points / 4; +++ unsigned int number = 0; +++ const unsigned int quarter_points = num_points / 4; ++ ++- double *cPtr = cVector; ++- const double *aPtr = aVector; ++- const double *bPtr = bVector; +++ double* cPtr = cVector; +++ const double* aPtr = aVector; +++ const double* bPtr = bVector; ++ ++- __m256d aVal, bVal, cVal; ++- for (; number < quarter_points; number++) { +++ __m256d aVal, bVal, cVal; +++ for (; number < quarter_points; number++) { ++ ++- aVal = _mm256_loadu_pd(aPtr); ++- bVal = _mm256_loadu_pd(bPtr); +++ aVal = 
_mm256_loadu_pd(aPtr); +++ bVal = _mm256_loadu_pd(bPtr); ++ ++- cVal = _mm256_mul_pd(aVal, bVal); +++ cVal = _mm256_mul_pd(aVal, bVal); ++ ++- _mm256_storeu_pd(cPtr, cVal); // Store the results back into the C container +++ _mm256_storeu_pd(cPtr, cVal); // Store the results back into the C container ++ ++- aPtr += 4; ++- bPtr += 4; ++- cPtr += 4; ++- } +++ aPtr += 4; +++ bPtr += 4; +++ cPtr += 4; +++ } ++ ++- number = quarter_points * 4; ++- for (; number < num_points; number++) { ++- *cPtr++ = (*aPtr++) * (*bPtr++); ++- } +++ number = quarter_points * 4; +++ for (; number < num_points; number++) { +++ *cPtr++ = (*aPtr++) * (*bPtr++); +++ } ++ } ++ ++ #endif /* LV_HAVE_AVX */ ++@@ -180,35 +183,36 @@ volk_64f_x2_multiply_64f_u_avx(double *cVector, const double *aVector, ++ ++ #include ++ ++-static inline void ++-volk_64f_x2_multiply_64f_a_sse2(double *cVector, const double *aVector, ++- const double *bVector, unsigned int num_points) +++static inline void volk_64f_x2_multiply_64f_a_sse2(double* cVector, +++ const double* aVector, +++ const double* bVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int half_points = num_points / 2; +++ unsigned int number = 0; +++ const unsigned int half_points = num_points / 2; ++ ++- double *cPtr = cVector; ++- const double *aPtr = aVector; ++- const double *bPtr = bVector; +++ double* cPtr = cVector; +++ const double* aPtr = aVector; +++ const double* bPtr = bVector; ++ ++- __m128d aVal, bVal, cVal; ++- for (; number < half_points; number++) { ++- aVal = _mm_load_pd(aPtr); ++- bVal = _mm_load_pd(bPtr); +++ __m128d aVal, bVal, cVal; +++ for (; number < half_points; number++) { +++ aVal = _mm_load_pd(aPtr); +++ bVal = _mm_load_pd(bPtr); ++ ++- cVal = _mm_mul_pd(aVal, bVal); +++ cVal = _mm_mul_pd(aVal, bVal); ++ ++- _mm_store_pd(cPtr, cVal); // Store the results back into the C container +++ _mm_store_pd(cPtr, cVal); // Store the results back into the C container ++ ++- aPtr += 2; ++- bPtr += 2; ++- cPtr += 2; ++- } +++ aPtr += 2; +++ bPtr += 2; +++ cPtr += 2; +++ } ++ ++- number = half_points * 2; ++- for (; number < num_points; number++) { ++- *cPtr++ = (*aPtr++) * (*bPtr++); ++- } +++ number = half_points * 2; +++ for (; number < num_points; number++) { +++ *cPtr++ = (*aPtr++) * (*bPtr++); +++ } ++ } ++ ++ #endif /* LV_HAVE_SSE2 */ ++@@ -218,36 +222,37 @@ volk_64f_x2_multiply_64f_a_sse2(double *cVector, const double *aVector, ++ ++ #include ++ ++-static inline void ++-volk_64f_x2_multiply_64f_a_avx(double *cVector, const double *aVector, ++- const double *bVector, unsigned int num_points) +++static inline void volk_64f_x2_multiply_64f_a_avx(double* cVector, +++ const double* aVector, +++ const double* bVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int quarter_points = num_points / 4; +++ unsigned int number = 0; +++ const unsigned int quarter_points = num_points / 4; ++ ++- double *cPtr = cVector; ++- const double *aPtr = aVector; ++- const double *bPtr = bVector; +++ double* cPtr = cVector; +++ const double* aPtr = aVector; +++ const double* bPtr = bVector; ++ ++- __m256d aVal, bVal, cVal; ++- for (; number < quarter_points; number++) { +++ __m256d aVal, bVal, cVal; +++ for (; number < quarter_points; number++) { ++ ++- aVal = _mm256_load_pd(aPtr); ++- bVal = _mm256_load_pd(bPtr); +++ aVal = _mm256_load_pd(aPtr); +++ bVal = _mm256_load_pd(bPtr); ++ ++- cVal = _mm256_mul_pd(aVal, bVal); +++ cVal = _mm256_mul_pd(aVal, bVal); ++ ++- _mm256_store_pd(cPtr, cVal); // Store the 
results back into the C container +++ _mm256_store_pd(cPtr, cVal); // Store the results back into the C container ++ ++- aPtr += 4; ++- bPtr += 4; ++- cPtr += 4; ++- } +++ aPtr += 4; +++ bPtr += 4; +++ cPtr += 4; +++ } ++ ++- number = quarter_points * 4; ++- for (; number < num_points; number++) { ++- *cPtr++ = (*aPtr++) * (*bPtr++); ++- } +++ number = quarter_points * 4; +++ for (; number < num_points; number++) { +++ *cPtr++ = (*aPtr++) * (*bPtr++); +++ } ++ } ++ ++ #endif /* LV_HAVE_AVX */ ++diff --git a/kernels/volk/volk_64u_byteswap.h b/kernels/volk/volk_64u_byteswap.h ++index 96e0661..38621a4 100644 ++--- a/kernels/volk/volk_64u_byteswap.h +++++ b/kernels/volk/volk_64u_byteswap.h ++@@ -72,71 +72,77 @@ ++ #ifdef LV_HAVE_SSE2 ++ #include ++ ++-static inline void volk_64u_byteswap_u_sse2(uint64_t* intsToSwap, unsigned int num_points){ +++static inline void volk_64u_byteswap_u_sse2(uint64_t* intsToSwap, unsigned int num_points) +++{ ++ uint32_t* inputPtr = (uint32_t*)intsToSwap; ++ __m128i input, byte1, byte2, byte3, byte4, output; ++ __m128i byte2mask = _mm_set1_epi32(0x00FF0000); ++ __m128i byte3mask = _mm_set1_epi32(0x0000FF00); ++ uint64_t number = 0; ++ const unsigned int halfPoints = num_points / 2; ++- for(;number < halfPoints; number++){ ++- // Load the 32t values, increment inputPtr later since we're doing it in-place. ++- input = _mm_loadu_si128((__m128i*)inputPtr); ++- ++- // Do the four shifts ++- byte1 = _mm_slli_epi32(input, 24); ++- byte2 = _mm_slli_epi32(input, 8); ++- byte3 = _mm_srli_epi32(input, 8); ++- byte4 = _mm_srli_epi32(input, 24); ++- // Or bytes together ++- output = _mm_or_si128(byte1, byte4); ++- byte2 = _mm_and_si128(byte2, byte2mask); ++- output = _mm_or_si128(output, byte2); ++- byte3 = _mm_and_si128(byte3, byte3mask); ++- output = _mm_or_si128(output, byte3); ++- ++- // Reorder the two words ++- output = _mm_shuffle_epi32(output, _MM_SHUFFLE(2, 3, 0, 1)); ++- ++- // Store the results ++- _mm_storeu_si128((__m128i*)inputPtr, output); ++- inputPtr += 4; +++ for (; number < halfPoints; number++) { +++ // Load the 32t values, increment inputPtr later since we're doing it in-place. 
+++ input = _mm_loadu_si128((__m128i*)inputPtr); +++ +++ // Do the four shifts +++ byte1 = _mm_slli_epi32(input, 24); +++ byte2 = _mm_slli_epi32(input, 8); +++ byte3 = _mm_srli_epi32(input, 8); +++ byte4 = _mm_srli_epi32(input, 24); +++ // Or bytes together +++ output = _mm_or_si128(byte1, byte4); +++ byte2 = _mm_and_si128(byte2, byte2mask); +++ output = _mm_or_si128(output, byte2); +++ byte3 = _mm_and_si128(byte3, byte3mask); +++ output = _mm_or_si128(output, byte3); +++ +++ // Reorder the two words +++ output = _mm_shuffle_epi32(output, _MM_SHUFFLE(2, 3, 0, 1)); +++ +++ // Store the results +++ _mm_storeu_si128((__m128i*)inputPtr, output); +++ inputPtr += 4; ++ } ++ ++ // Byteswap any remaining points: ++- number = halfPoints*2; ++- for(; number < num_points; number++){ ++- uint32_t output1 = *inputPtr; ++- uint32_t output2 = inputPtr[1]; +++ number = halfPoints * 2; +++ for (; number < num_points; number++) { +++ uint32_t output1 = *inputPtr; +++ uint32_t output2 = inputPtr[1]; ++ ++- output1 = (((output1 >> 24) & 0xff) | ((output1 >> 8) & 0x0000ff00) | ((output1 << 8) & 0x00ff0000) | ((output1 << 24) & 0xff000000)); +++ output1 = (((output1 >> 24) & 0xff) | ((output1 >> 8) & 0x0000ff00) | +++ ((output1 << 8) & 0x00ff0000) | ((output1 << 24) & 0xff000000)); ++ ++- output2 = (((output2 >> 24) & 0xff) | ((output2 >> 8) & 0x0000ff00) | ((output2 << 8) & 0x00ff0000) | ((output2 << 24) & 0xff000000)); +++ output2 = (((output2 >> 24) & 0xff) | ((output2 >> 8) & 0x0000ff00) | +++ ((output2 << 8) & 0x00ff0000) | ((output2 << 24) & 0xff000000)); ++ ++- *inputPtr++ = output2; ++- *inputPtr++ = output1; +++ *inputPtr++ = output2; +++ *inputPtr++ = output1; ++ } ++ } ++ #endif /* LV_HAVE_SSE2 */ ++ ++ ++- ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void volk_64u_byteswap_generic(uint64_t* intsToSwap, unsigned int num_points){ ++- uint32_t* inputPtr = (uint32_t*)intsToSwap; ++- unsigned int point; ++- for(point = 0; point < num_points; point++){ ++- uint32_t output1 = *inputPtr; ++- uint32_t output2 = inputPtr[1]; +++static inline void volk_64u_byteswap_generic(uint64_t* intsToSwap, +++ unsigned int num_points) +++{ +++ uint32_t* inputPtr = (uint32_t*)intsToSwap; +++ unsigned int point; +++ for (point = 0; point < num_points; point++) { +++ uint32_t output1 = *inputPtr; +++ uint32_t output2 = inputPtr[1]; ++ ++- output1 = (((output1 >> 24) & 0xff) | ((output1 >> 8) & 0x0000ff00) | ((output1 << 8) & 0x00ff0000) | ((output1 << 24) & 0xff000000)); +++ output1 = (((output1 >> 24) & 0xff) | ((output1 >> 8) & 0x0000ff00) | +++ ((output1 << 8) & 0x00ff0000) | ((output1 << 24) & 0xff000000)); ++ ++- output2 = (((output2 >> 24) & 0xff) | ((output2 >> 8) & 0x0000ff00) | ((output2 << 8) & 0x00ff0000) | ((output2 << 24) & 0xff000000)); +++ output2 = (((output2 >> 24) & 0xff) | ((output2 >> 8) & 0x0000ff00) | +++ ((output2 << 8) & 0x00ff0000) | ((output2 << 24) & 0xff000000)); ++ ++- *inputPtr++ = output2; ++- *inputPtr++ = output1; ++- } +++ *inputPtr++ = output2; +++ *inputPtr++ = output1; +++ } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++@@ -144,47 +150,47 @@ static inline void volk_64u_byteswap_generic(uint64_t* intsToSwap, unsigned int ++ #include ++ static inline void volk_64u_byteswap_a_avx2(uint64_t* intsToSwap, unsigned int num_points) ++ { ++- unsigned int number = 0; ++- ++- const unsigned int nPerSet = 4; ++- const uint64_t nSets = num_points / nPerSet; +++ unsigned int number = 0; ++ ++- uint32_t* inputPtr = (uint32_t*)intsToSwap; +++ const unsigned int nPerSet = 4; +++ const uint64_t nSets = 
num_points / nPerSet; ++ ++- const uint8_t shuffleVector[32] = { 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8, 23, 22, 21, 20, 19, 18, 17, 16, 31, 30, 29, 28, 27, 26, 25, 24 }; +++ uint32_t* inputPtr = (uint32_t*)intsToSwap; ++ ++- const __m256i myShuffle = _mm256_loadu_si256((__m256i*) &shuffleVector[0]); +++ const uint8_t shuffleVector[32] = { 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, +++ 12, 11, 10, 9, 8, 23, 22, 21, 20, 19, 18, +++ 17, 16, 31, 30, 29, 28, 27, 26, 25, 24 }; ++ ++- for ( ;number < nSets; number++ ) { +++ const __m256i myShuffle = _mm256_loadu_si256((__m256i*)&shuffleVector[0]); ++ ++- // Load the 32t values, increment inputPtr later since we're doing it in-place. ++- const __m256i input = _mm256_load_si256((__m256i*)inputPtr); ++- const __m256i output = _mm256_shuffle_epi8(input, myShuffle); +++ for (; number < nSets; number++) { ++ ++- // Store the results ++- _mm256_store_si256((__m256i*)inputPtr, output); +++ // Load the 32t values, increment inputPtr later since we're doing it in-place. +++ const __m256i input = _mm256_load_si256((__m256i*)inputPtr); +++ const __m256i output = _mm256_shuffle_epi8(input, myShuffle); ++ ++- /* inputPtr is 32bit so increment twice */ ++- inputPtr += 2 * nPerSet; ++- } ++- _mm256_zeroupper(); +++ // Store the results +++ _mm256_store_si256((__m256i*)inputPtr, output); ++ ++- // Byteswap any remaining points: ++- for(number = nSets * nPerSet; number < num_points; ++number ) { ++- uint32_t output1 = *inputPtr; ++- uint32_t output2 = inputPtr[1]; ++- uint32_t out1 = ((((output1) >> 24) & 0x000000ff) | ++- (((output1) >> 8) & 0x0000ff00) | ++- (((output1) << 8) & 0x00ff0000) | ++- (((output1) << 24) & 0xff000000) ); +++ /* inputPtr is 32bit so increment twice */ +++ inputPtr += 2 * nPerSet; +++ } +++ _mm256_zeroupper(); ++ ++- uint32_t out2 = ((((output2) >> 24) & 0x000000ff) | ++- (((output2) >> 8) & 0x0000ff00) | ++- (((output2) << 8) & 0x00ff0000) | ++- (((output2) << 24) & 0xff000000) ); ++- *inputPtr++ = out2; ++- *inputPtr++ = out1; ++- } +++ // Byteswap any remaining points: +++ for (number = nSets * nPerSet; number < num_points; ++number) { +++ uint32_t output1 = *inputPtr; +++ uint32_t output2 = inputPtr[1]; +++ uint32_t out1 = +++ ((((output1) >> 24) & 0x000000ff) | (((output1) >> 8) & 0x0000ff00) | +++ (((output1) << 8) & 0x00ff0000) | (((output1) << 24) & 0xff000000)); +++ +++ uint32_t out2 = +++ ((((output2) >> 24) & 0x000000ff) | (((output2) >> 8) & 0x0000ff00) | +++ (((output2) << 8) & 0x00ff0000) | (((output2) << 24) & 0xff000000)); +++ *inputPtr++ = out2; +++ *inputPtr++ = out1; +++ } ++ } ++ ++ #endif /* LV_HAVE_AVX2 */ ++@@ -192,48 +198,47 @@ static inline void volk_64u_byteswap_a_avx2(uint64_t* intsToSwap, unsigned int n ++ ++ #if LV_HAVE_SSSE3 ++ #include ++-static inline void volk_64u_byteswap_a_ssse3(uint64_t* intsToSwap, unsigned int num_points) +++static inline void volk_64u_byteswap_a_ssse3(uint64_t* intsToSwap, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; +++ unsigned int number = 0; ++ ++- const unsigned int nPerSet = 2; ++- const uint64_t nSets = num_points / nPerSet; +++ const unsigned int nPerSet = 2; +++ const uint64_t nSets = num_points / nPerSet; ++ ++- uint32_t* inputPtr = (uint32_t*)intsToSwap; ++- ++- uint8_t shuffleVector[16] = { 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 }; +++ uint32_t* inputPtr = (uint32_t*)intsToSwap; ++ ++- const __m128i myShuffle = _mm_loadu_si128((__m128i*) &shuffleVector); +++ uint8_t shuffleVector[16] = { 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 
10, 9, 8 }; ++ ++- for ( ;number < nSets; number++ ) { +++ const __m128i myShuffle = _mm_loadu_si128((__m128i*)&shuffleVector); ++ ++- // Load the 32t values, increment inputPtr later since we're doing it in-place. ++- const __m128i input = _mm_load_si128((__m128i*)inputPtr); ++- const __m128i output = _mm_shuffle_epi8(input,myShuffle); +++ for (; number < nSets; number++) { ++ ++- // Store the results ++- _mm_store_si128((__m128i*)inputPtr, output); +++ // Load the 32t values, increment inputPtr later since we're doing it in-place. +++ const __m128i input = _mm_load_si128((__m128i*)inputPtr); +++ const __m128i output = _mm_shuffle_epi8(input, myShuffle); ++ ++- /* inputPtr is 32bit so increment twice */ ++- inputPtr += 2 * nPerSet; ++- } +++ // Store the results +++ _mm_store_si128((__m128i*)inputPtr, output); ++ ++- // Byteswap any remaining points: ++- for(number = nSets * nPerSet; number < num_points; ++number ) { ++- uint32_t output1 = *inputPtr; ++- uint32_t output2 = inputPtr[1]; ++- uint32_t out1 = ((((output1) >> 24) & 0x000000ff) | ++- (((output1) >> 8) & 0x0000ff00) | ++- (((output1) << 8) & 0x00ff0000) | ++- (((output1) << 24) & 0xff000000) ); +++ /* inputPtr is 32bit so increment twice */ +++ inputPtr += 2 * nPerSet; +++ } ++ ++- uint32_t out2 = ((((output2) >> 24) & 0x000000ff) | ++- (((output2) >> 8) & 0x0000ff00) | ++- (((output2) << 8) & 0x00ff0000) | ++- (((output2) << 24) & 0xff000000) ); ++- *inputPtr++ = out2; ++- *inputPtr++ = out1; ++- } +++ // Byteswap any remaining points: +++ for (number = nSets * nPerSet; number < num_points; ++number) { +++ uint32_t output1 = *inputPtr; +++ uint32_t output2 = inputPtr[1]; +++ uint32_t out1 = +++ ((((output1) >> 24) & 0x000000ff) | (((output1) >> 8) & 0x0000ff00) | +++ (((output1) << 8) & 0x00ff0000) | (((output1) << 24) & 0xff000000)); +++ +++ uint32_t out2 = +++ ((((output2) >> 24) & 0x000000ff) | (((output2) >> 8) & 0x0000ff00) | +++ (((output2) << 8) & 0x00ff0000) | (((output2) << 24) & 0xff000000)); +++ *inputPtr++ = out2; +++ *inputPtr++ = out1; +++ } ++ } ++ #endif /* LV_HAVE_SSSE3 */ ++ ++@@ -241,86 +246,90 @@ static inline void volk_64u_byteswap_a_ssse3(uint64_t* intsToSwap, unsigned int ++ #ifdef LV_HAVE_NEONV8 ++ #include ++ ++-static inline void volk_64u_byteswap_neonv8(uint64_t* intsToSwap, unsigned int num_points){ ++- uint32_t* inputPtr = (uint32_t*)intsToSwap; ++- const unsigned int n4points = num_points / 4; ++- uint8x16x2_t input; ++- uint8x16_t idx = { 7,6,5,4, 3,2,1,0, 15,14,13,12, 11,10,9,8 }; ++- ++- unsigned int number = 0; ++- for(number = 0; number < n4points; ++number){ ++- __VOLK_PREFETCH(inputPtr+8); ++- input = vld2q_u8((uint8_t*) inputPtr); ++- input.val[0] = vqtbl1q_u8(input.val[0], idx); ++- input.val[1] = vqtbl1q_u8(input.val[1], idx); ++- vst2q_u8((uint8_t*) inputPtr, input); ++- ++- inputPtr += 8; ++- } ++- ++- for(number = n4points * 4; number < num_points; ++number){ ++- uint32_t output1 = *inputPtr; ++- uint32_t output2 = inputPtr[1]; +++static inline void volk_64u_byteswap_neonv8(uint64_t* intsToSwap, unsigned int num_points) +++{ +++ uint32_t* inputPtr = (uint32_t*)intsToSwap; +++ const unsigned int n4points = num_points / 4; +++ uint8x16x2_t input; +++ uint8x16_t idx = { 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 }; +++ +++ unsigned int number = 0; +++ for (number = 0; number < n4points; ++number) { +++ __VOLK_PREFETCH(inputPtr + 8); +++ input = vld2q_u8((uint8_t*)inputPtr); +++ input.val[0] = vqtbl1q_u8(input.val[0], idx); +++ input.val[1] = vqtbl1q_u8(input.val[1], idx); +++ 
vst2q_u8((uint8_t*)inputPtr, input); +++ +++ inputPtr += 8; +++ } ++ ++- output1 = (((output1 >> 24) & 0xff) | ((output1 >> 8) & 0x0000ff00) | ((output1 << 8) & 0x00ff0000) | ((output1 << 24) & 0xff000000)); ++- output2 = (((output2 >> 24) & 0xff) | ((output2 >> 8) & 0x0000ff00) | ((output2 << 8) & 0x00ff0000) | ((output2 << 24) & 0xff000000)); +++ for (number = n4points * 4; number < num_points; ++number) { +++ uint32_t output1 = *inputPtr; +++ uint32_t output2 = inputPtr[1]; ++ ++- *inputPtr++ = output2; ++- *inputPtr++ = output1; ++- } +++ output1 = (((output1 >> 24) & 0xff) | ((output1 >> 8) & 0x0000ff00) | +++ ((output1 << 8) & 0x00ff0000) | ((output1 << 24) & 0xff000000)); +++ output2 = (((output2 >> 24) & 0xff) | ((output2 >> 8) & 0x0000ff00) | +++ ((output2 << 8) & 0x00ff0000) | ((output2 << 24) & 0xff000000)); ++ +++ *inputPtr++ = output2; +++ *inputPtr++ = output1; +++ } ++ } ++ #else ++ #ifdef LV_HAVE_NEON ++ #include ++ ++-static inline void volk_64u_byteswap_neon(uint64_t* intsToSwap, unsigned int num_points){ ++- uint32_t* inputPtr = (uint32_t*)intsToSwap; ++- unsigned int number = 0; ++- unsigned int n8points = num_points / 4; ++- ++- uint8x8x4_t input_table; ++- uint8x8_t int_lookup01, int_lookup23, int_lookup45, int_lookup67; ++- uint8x8_t swapped_int01, swapped_int23, swapped_int45, swapped_int67; ++- ++- /* these magic numbers are used as byte-indices in the LUT. ++- they are pre-computed to save time. A simple C program ++- can calculate them; for example for lookup01: ++- uint8_t chars[8] = {24, 16, 8, 0, 25, 17, 9, 1}; ++- for(ii=0; ii < 8; ++ii) { ++- index += ((uint64_t)(*(chars+ii))) << (ii*8); +++static inline void volk_64u_byteswap_neon(uint64_t* intsToSwap, unsigned int num_points) +++{ +++ uint32_t* inputPtr = (uint32_t*)intsToSwap; +++ unsigned int number = 0; +++ unsigned int n8points = num_points / 4; +++ +++ uint8x8x4_t input_table; +++ uint8x8_t int_lookup01, int_lookup23, int_lookup45, int_lookup67; +++ uint8x8_t swapped_int01, swapped_int23, swapped_int45, swapped_int67; +++ +++ /* these magic numbers are used as byte-indices in the LUT. +++ they are pre-computed to save time. 
A simple C program +++ can calculate them; for example for lookup01: +++ uint8_t chars[8] = {24, 16, 8, 0, 25, 17, 9, 1}; +++ for(ii=0; ii < 8; ++ii) { +++ index += ((uint64_t)(*(chars+ii))) << (ii*8); +++ } +++ */ +++ int_lookup01 = vcreate_u8(2269495096316185); +++ int_lookup23 = vcreate_u8(146949840772469531); +++ int_lookup45 = vcreate_u8(291630186448622877); +++ int_lookup67 = vcreate_u8(436310532124776223); +++ +++ for (number = 0; number < n8points; ++number) { +++ input_table = vld4_u8((uint8_t*)inputPtr); +++ swapped_int01 = vtbl4_u8(input_table, int_lookup01); +++ swapped_int23 = vtbl4_u8(input_table, int_lookup23); +++ swapped_int45 = vtbl4_u8(input_table, int_lookup45); +++ swapped_int67 = vtbl4_u8(input_table, int_lookup67); +++ vst1_u8((uint8_t*)inputPtr, swapped_int01); +++ vst1_u8((uint8_t*)(inputPtr + 2), swapped_int23); +++ vst1_u8((uint8_t*)(inputPtr + 4), swapped_int45); +++ vst1_u8((uint8_t*)(inputPtr + 6), swapped_int67); +++ +++ inputPtr += 4; ++ } ++- */ ++- int_lookup01 = vcreate_u8(2269495096316185); ++- int_lookup23 = vcreate_u8(146949840772469531); ++- int_lookup45 = vcreate_u8(291630186448622877); ++- int_lookup67 = vcreate_u8(436310532124776223); ++- ++- for(number = 0; number < n8points; ++number){ ++- input_table = vld4_u8((uint8_t*) inputPtr); ++- swapped_int01 = vtbl4_u8(input_table, int_lookup01); ++- swapped_int23 = vtbl4_u8(input_table, int_lookup23); ++- swapped_int45 = vtbl4_u8(input_table, int_lookup45); ++- swapped_int67 = vtbl4_u8(input_table, int_lookup67); ++- vst1_u8((uint8_t*) inputPtr, swapped_int01); ++- vst1_u8((uint8_t*) (inputPtr+2), swapped_int23); ++- vst1_u8((uint8_t*) (inputPtr+4), swapped_int45); ++- vst1_u8((uint8_t*) (inputPtr+6), swapped_int67); ++- ++- inputPtr += 4; ++- } ++- ++- for(number = n8points * 4; number < num_points; ++number){ ++- uint32_t output1 = *inputPtr; ++- uint32_t output2 = inputPtr[1]; ++- ++- output1 = (((output1 >> 24) & 0xff) | ((output1 >> 8) & 0x0000ff00) | ((output1 << 8) & 0x00ff0000) | ((output1 << 24) & 0xff000000)); ++- output2 = (((output2 >> 24) & 0xff) | ((output2 >> 8) & 0x0000ff00) | ((output2 << 8) & 0x00ff0000) | ((output2 << 24) & 0xff000000)); ++- ++- *inputPtr++ = output2; ++- *inputPtr++ = output1; ++- } ++ +++ for (number = n8points * 4; number < num_points; ++number) { +++ uint32_t output1 = *inputPtr; +++ uint32_t output2 = inputPtr[1]; +++ +++ output1 = (((output1 >> 24) & 0xff) | ((output1 >> 8) & 0x0000ff00) | +++ ((output1 << 8) & 0x00ff0000) | ((output1 << 24) & 0xff000000)); +++ output2 = (((output2 >> 24) & 0xff) | ((output2 >> 8) & 0x0000ff00) | +++ ((output2 << 8) & 0x00ff0000) | ((output2 << 24) & 0xff000000)); +++ +++ *inputPtr++ = output2; +++ *inputPtr++ = output1; +++ } ++ } ++ #endif /* LV_HAVE_NEON */ ++ #endif ++@@ -336,49 +345,52 @@ static inline void volk_64u_byteswap_neon(uint64_t* intsToSwap, unsigned int num ++ #ifdef LV_HAVE_SSE2 ++ #include ++ ++-static inline void volk_64u_byteswap_a_sse2(uint64_t* intsToSwap, unsigned int num_points){ +++static inline void volk_64u_byteswap_a_sse2(uint64_t* intsToSwap, unsigned int num_points) +++{ ++ uint32_t* inputPtr = (uint32_t*)intsToSwap; ++ __m128i input, byte1, byte2, byte3, byte4, output; ++ __m128i byte2mask = _mm_set1_epi32(0x00FF0000); ++ __m128i byte3mask = _mm_set1_epi32(0x0000FF00); ++ uint64_t number = 0; ++ const unsigned int halfPoints = num_points / 2; ++- for(;number < halfPoints; number++){ ++- // Load the 32t values, increment inputPtr later since we're doing it in-place. 
++- input = _mm_load_si128((__m128i*)inputPtr); ++- ++- // Do the four shifts ++- byte1 = _mm_slli_epi32(input, 24); ++- byte2 = _mm_slli_epi32(input, 8); ++- byte3 = _mm_srli_epi32(input, 8); ++- byte4 = _mm_srli_epi32(input, 24); ++- // Or bytes together ++- output = _mm_or_si128(byte1, byte4); ++- byte2 = _mm_and_si128(byte2, byte2mask); ++- output = _mm_or_si128(output, byte2); ++- byte3 = _mm_and_si128(byte3, byte3mask); ++- output = _mm_or_si128(output, byte3); ++- ++- // Reorder the two words ++- output = _mm_shuffle_epi32(output, _MM_SHUFFLE(2, 3, 0, 1)); ++- ++- // Store the results ++- _mm_store_si128((__m128i*)inputPtr, output); ++- inputPtr += 4; +++ for (; number < halfPoints; number++) { +++ // Load the 32t values, increment inputPtr later since we're doing it in-place. +++ input = _mm_load_si128((__m128i*)inputPtr); +++ +++ // Do the four shifts +++ byte1 = _mm_slli_epi32(input, 24); +++ byte2 = _mm_slli_epi32(input, 8); +++ byte3 = _mm_srli_epi32(input, 8); +++ byte4 = _mm_srli_epi32(input, 24); +++ // Or bytes together +++ output = _mm_or_si128(byte1, byte4); +++ byte2 = _mm_and_si128(byte2, byte2mask); +++ output = _mm_or_si128(output, byte2); +++ byte3 = _mm_and_si128(byte3, byte3mask); +++ output = _mm_or_si128(output, byte3); +++ +++ // Reorder the two words +++ output = _mm_shuffle_epi32(output, _MM_SHUFFLE(2, 3, 0, 1)); +++ +++ // Store the results +++ _mm_store_si128((__m128i*)inputPtr, output); +++ inputPtr += 4; ++ } ++ ++ // Byteswap any remaining points: ++- number = halfPoints*2; ++- for(; number < num_points; number++){ ++- uint32_t output1 = *inputPtr; ++- uint32_t output2 = inputPtr[1]; +++ number = halfPoints * 2; +++ for (; number < num_points; number++) { +++ uint32_t output1 = *inputPtr; +++ uint32_t output2 = inputPtr[1]; ++ ++- output1 = (((output1 >> 24) & 0xff) | ((output1 >> 8) & 0x0000ff00) | ((output1 << 8) & 0x00ff0000) | ((output1 << 24) & 0xff000000)); +++ output1 = (((output1 >> 24) & 0xff) | ((output1 >> 8) & 0x0000ff00) | +++ ((output1 << 8) & 0x00ff0000) | ((output1 << 24) & 0xff000000)); ++ ++- output2 = (((output2 >> 24) & 0xff) | ((output2 >> 8) & 0x0000ff00) | ((output2 << 8) & 0x00ff0000) | ((output2 << 24) & 0xff000000)); +++ output2 = (((output2 >> 24) & 0xff) | ((output2 >> 8) & 0x0000ff00) | +++ ((output2 << 8) & 0x00ff0000) | ((output2 << 24) & 0xff000000)); ++ ++- *inputPtr++ = output2; ++- *inputPtr++ = output1; +++ *inputPtr++ = output2; +++ *inputPtr++ = output1; ++ } ++ } ++ #endif /* LV_HAVE_SSE2 */ ++@@ -387,46 +399,46 @@ static inline void volk_64u_byteswap_a_sse2(uint64_t* intsToSwap, unsigned int n ++ #include ++ static inline void volk_64u_byteswap_u_avx2(uint64_t* intsToSwap, unsigned int num_points) ++ { ++- unsigned int number = 0; ++- ++- const unsigned int nPerSet = 4; ++- const uint64_t nSets = num_points / nPerSet; ++- ++- uint32_t* inputPtr = (uint32_t*)intsToSwap; ++- ++- const uint8_t shuffleVector[32] = { 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8, 23, 22, 21, 20, 19, 18, 17, 16, 31, 30, 29, 28, 27, 26, 25, 24 }; ++- ++- const __m256i myShuffle = _mm256_loadu_si256((__m256i*) &shuffleVector[0]); ++- ++- for ( ;number < nSets; number++ ) { ++- // Load the 32t values, increment inputPtr later since we're doing it in-place. 
++- const __m256i input = _mm256_loadu_si256((__m256i*)inputPtr); ++- const __m256i output = _mm256_shuffle_epi8(input,myShuffle); ++- ++- // Store the results ++- _mm256_storeu_si256((__m256i*)inputPtr, output); ++- ++- /* inputPtr is 32bit so increment twice */ ++- inputPtr += 2 * nPerSet; ++- } ++- _mm256_zeroupper(); ++- ++- // Byteswap any remaining points: ++- for(number = nSets * nPerSet; number < num_points; ++number ) { ++- uint32_t output1 = *inputPtr; ++- uint32_t output2 = inputPtr[1]; ++- uint32_t out1 = ((((output1) >> 24) & 0x000000ff) | ++- (((output1) >> 8) & 0x0000ff00) | ++- (((output1) << 8) & 0x00ff0000) | ++- (((output1) << 24) & 0xff000000) ); ++- ++- uint32_t out2 = ((((output2) >> 24) & 0x000000ff) | ++- (((output2) >> 8) & 0x0000ff00) | ++- (((output2) << 8) & 0x00ff0000) | ++- (((output2) << 24) & 0xff000000) ); ++- *inputPtr++ = out2; ++- *inputPtr++ = out1; ++- } +++ unsigned int number = 0; +++ +++ const unsigned int nPerSet = 4; +++ const uint64_t nSets = num_points / nPerSet; +++ +++ uint32_t* inputPtr = (uint32_t*)intsToSwap; +++ +++ const uint8_t shuffleVector[32] = { 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, +++ 12, 11, 10, 9, 8, 23, 22, 21, 20, 19, 18, +++ 17, 16, 31, 30, 29, 28, 27, 26, 25, 24 }; +++ +++ const __m256i myShuffle = _mm256_loadu_si256((__m256i*)&shuffleVector[0]); +++ +++ for (; number < nSets; number++) { +++ // Load the 32t values, increment inputPtr later since we're doing it in-place. +++ const __m256i input = _mm256_loadu_si256((__m256i*)inputPtr); +++ const __m256i output = _mm256_shuffle_epi8(input, myShuffle); +++ +++ // Store the results +++ _mm256_storeu_si256((__m256i*)inputPtr, output); +++ +++ /* inputPtr is 32bit so increment twice */ +++ inputPtr += 2 * nPerSet; +++ } +++ _mm256_zeroupper(); +++ +++ // Byteswap any remaining points: +++ for (number = nSets * nPerSet; number < num_points; ++number) { +++ uint32_t output1 = *inputPtr; +++ uint32_t output2 = inputPtr[1]; +++ uint32_t out1 = +++ ((((output1) >> 24) & 0x000000ff) | (((output1) >> 8) & 0x0000ff00) | +++ (((output1) << 8) & 0x00ff0000) | (((output1) << 24) & 0xff000000)); +++ +++ uint32_t out2 = +++ ((((output2) >> 24) & 0x000000ff) | (((output2) >> 8) & 0x0000ff00) | +++ (((output2) << 8) & 0x00ff0000) | (((output2) << 24) & 0xff000000)); +++ *inputPtr++ = out2; +++ *inputPtr++ = out1; +++ } ++ } ++ ++ #endif /* LV_HAVE_AVX2 */ ++@@ -434,70 +446,71 @@ static inline void volk_64u_byteswap_u_avx2(uint64_t* intsToSwap, unsigned int n ++ ++ #if LV_HAVE_SSSE3 ++ #include ++-static inline void volk_64u_byteswap_u_ssse3(uint64_t* intsToSwap, unsigned int num_points) +++static inline void volk_64u_byteswap_u_ssse3(uint64_t* intsToSwap, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- ++- const unsigned int nPerSet = 2; ++- const uint64_t nSets = num_points / nPerSet; +++ unsigned int number = 0; ++ ++- uint32_t* inputPtr = (uint32_t*)intsToSwap; +++ const unsigned int nPerSet = 2; +++ const uint64_t nSets = num_points / nPerSet; ++ ++- uint8_t shuffleVector[16] = { 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 }; +++ uint32_t* inputPtr = (uint32_t*)intsToSwap; ++ ++- const __m128i myShuffle = _mm_loadu_si128((__m128i*) &shuffleVector); +++ uint8_t shuffleVector[16] = { 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 }; ++ ++- for ( ;number < nSets; number++ ) { ++- // Load the 32t values, increment inputPtr later since we're doing it in-place. 
++- const __m128i input = _mm_loadu_si128((__m128i*)inputPtr); ++- const __m128i output = _mm_shuffle_epi8(input,myShuffle); +++ const __m128i myShuffle = _mm_loadu_si128((__m128i*)&shuffleVector); ++ ++- // Store the results ++- _mm_storeu_si128((__m128i*)inputPtr, output); +++ for (; number < nSets; number++) { +++ // Load the 32t values, increment inputPtr later since we're doing it in-place. +++ const __m128i input = _mm_loadu_si128((__m128i*)inputPtr); +++ const __m128i output = _mm_shuffle_epi8(input, myShuffle); ++ ++- /* inputPtr is 32bit so increment twice */ ++- inputPtr += 2 * nPerSet; ++- } +++ // Store the results +++ _mm_storeu_si128((__m128i*)inputPtr, output); ++ ++- // Byteswap any remaining points: ++- for(number = nSets * nPerSet; number < num_points; ++number ) { ++- uint32_t output1 = *inputPtr; ++- uint32_t output2 = inputPtr[1]; ++- uint32_t out1 = ((((output1) >> 24) & 0x000000ff) | ++- (((output1) >> 8) & 0x0000ff00) | ++- (((output1) << 8) & 0x00ff0000) | ++- (((output1) << 24) & 0xff000000) ); +++ /* inputPtr is 32bit so increment twice */ +++ inputPtr += 2 * nPerSet; +++ } ++ ++- uint32_t out2 = ((((output2) >> 24) & 0x000000ff) | ++- (((output2) >> 8) & 0x0000ff00) | ++- (((output2) << 8) & 0x00ff0000) | ++- (((output2) << 24) & 0xff000000) ); ++- *inputPtr++ = out2; ++- *inputPtr++ = out1; ++- } +++ // Byteswap any remaining points: +++ for (number = nSets * nPerSet; number < num_points; ++number) { +++ uint32_t output1 = *inputPtr; +++ uint32_t output2 = inputPtr[1]; +++ uint32_t out1 = +++ ((((output1) >> 24) & 0x000000ff) | (((output1) >> 8) & 0x0000ff00) | +++ (((output1) << 8) & 0x00ff0000) | (((output1) << 24) & 0xff000000)); +++ +++ uint32_t out2 = +++ ((((output2) >> 24) & 0x000000ff) | (((output2) >> 8) & 0x0000ff00) | +++ (((output2) << 8) & 0x00ff0000) | (((output2) << 24) & 0xff000000)); +++ *inputPtr++ = out2; +++ *inputPtr++ = out1; +++ } ++ } ++ #endif /* LV_HAVE_SSSE3 */ ++ ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void volk_64u_byteswap_a_generic(uint64_t* intsToSwap, unsigned int num_points){ ++- uint32_t* inputPtr = (uint32_t*)intsToSwap; ++- unsigned int point; ++- for(point = 0; point < num_points; point++){ ++- uint32_t output1 = *inputPtr; ++- uint32_t output2 = inputPtr[1]; +++static inline void volk_64u_byteswap_a_generic(uint64_t* intsToSwap, +++ unsigned int num_points) +++{ +++ uint32_t* inputPtr = (uint32_t*)intsToSwap; +++ unsigned int point; +++ for (point = 0; point < num_points; point++) { +++ uint32_t output1 = *inputPtr; +++ uint32_t output2 = inputPtr[1]; ++ ++- output1 = (((output1 >> 24) & 0xff) | ((output1 >> 8) & 0x0000ff00) | ((output1 << 8) & 0x00ff0000) | ((output1 << 24) & 0xff000000)); +++ output1 = (((output1 >> 24) & 0xff) | ((output1 >> 8) & 0x0000ff00) | +++ ((output1 << 8) & 0x00ff0000) | ((output1 << 24) & 0xff000000)); ++ ++- output2 = (((output2 >> 24) & 0xff) | ((output2 >> 8) & 0x0000ff00) | ((output2 << 8) & 0x00ff0000) | ((output2 << 24) & 0xff000000)); +++ output2 = (((output2 >> 24) & 0xff) | ((output2 >> 8) & 0x0000ff00) | +++ ((output2 << 8) & 0x00ff0000) | ((output2 << 24) & 0xff000000)); ++ ++- *inputPtr++ = output2; ++- *inputPtr++ = output1; ++- } +++ *inputPtr++ = output2; +++ *inputPtr++ = output1; +++ } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++ ++- ++- ++ #endif /* INCLUDED_volk_64u_byteswap_a_H */ ++diff --git a/kernels/volk/volk_64u_byteswappuppet_64u.h b/kernels/volk/volk_64u_byteswappuppet_64u.h ++index 2db0171..ded54ee 100644 ++--- a/kernels/volk/volk_64u_byteswappuppet_64u.h +++++ 
b/kernels/volk/volk_64u_byteswappuppet_64u.h ++@@ -3,87 +3,105 @@ ++ ++ ++ #include ++-#include ++ #include +++#include ++ ++ #ifdef LV_HAVE_GENERIC ++-static inline void volk_64u_byteswappuppet_64u_generic(uint64_t*output, uint64_t* intsToSwap, unsigned int num_points){ +++static inline void volk_64u_byteswappuppet_64u_generic(uint64_t* output, +++ uint64_t* intsToSwap, +++ unsigned int num_points) +++{ ++ ++ volk_64u_byteswap_generic((uint64_t*)intsToSwap, num_points); ++ memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint64_t)); ++- ++ } ++ #endif ++ ++ #ifdef LV_HAVE_NEONV8 ++-static inline void volk_64u_byteswappuppet_64u_neonv8(uint64_t*output, uint64_t* intsToSwap, unsigned int num_points){ +++static inline void volk_64u_byteswappuppet_64u_neonv8(uint64_t* output, +++ uint64_t* intsToSwap, +++ unsigned int num_points) +++{ ++ ++ volk_64u_byteswap_neonv8((uint64_t*)intsToSwap, num_points); ++ memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint64_t)); ++- ++ } ++ #else ++ #ifdef LV_HAVE_NEON ++-static inline void volk_64u_byteswappuppet_64u_neon(uint64_t*output, uint64_t* intsToSwap, unsigned int num_points){ +++static inline void volk_64u_byteswappuppet_64u_neon(uint64_t* output, +++ uint64_t* intsToSwap, +++ unsigned int num_points) +++{ ++ ++ volk_64u_byteswap_neon((uint64_t*)intsToSwap, num_points); ++ memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint64_t)); ++- ++ } ++ #endif ++ #endif ++ ++ #ifdef LV_HAVE_SSE2 ++-static inline void volk_64u_byteswappuppet_64u_u_sse2(uint64_t* output, uint64_t* intsToSwap, unsigned int num_points){ +++static inline void volk_64u_byteswappuppet_64u_u_sse2(uint64_t* output, +++ uint64_t* intsToSwap, +++ unsigned int num_points) +++{ ++ ++ volk_64u_byteswap_u_sse2((uint64_t*)intsToSwap, num_points); ++ memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint64_t)); ++- ++ } ++ #endif ++ ++ #ifdef LV_HAVE_SSE2 ++-static inline void volk_64u_byteswappuppet_64u_a_sse2(uint64_t* output, uint64_t* intsToSwap, unsigned int num_points){ +++static inline void volk_64u_byteswappuppet_64u_a_sse2(uint64_t* output, +++ uint64_t* intsToSwap, +++ unsigned int num_points) +++{ ++ ++ volk_64u_byteswap_a_sse2((uint64_t*)intsToSwap, num_points); ++ memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint64_t)); ++- ++ } ++ #endif ++ ++ #ifdef LV_HAVE_SSSE3 ++-static inline void volk_64u_byteswappuppet_64u_u_ssse3(uint64_t* output, uint64_t* intsToSwap, unsigned int num_points){ +++static inline void volk_64u_byteswappuppet_64u_u_ssse3(uint64_t* output, +++ uint64_t* intsToSwap, +++ unsigned int num_points) +++{ ++ ++ volk_64u_byteswap_u_ssse3((uint64_t*)intsToSwap, num_points); ++ memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint64_t)); ++- ++ } ++ #endif ++ ++ #ifdef LV_HAVE_SSSE3 ++-static inline void volk_64u_byteswappuppet_64u_a_ssse3(uint64_t* output, uint64_t* intsToSwap, unsigned int num_points){ +++static inline void volk_64u_byteswappuppet_64u_a_ssse3(uint64_t* output, +++ uint64_t* intsToSwap, +++ unsigned int num_points) +++{ ++ ++ volk_64u_byteswap_a_ssse3((uint64_t*)intsToSwap, num_points); ++ memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint64_t)); ++- ++ } ++ #endif ++ ++ #ifdef LV_HAVE_AVX2 ++-static inline void volk_64u_byteswappuppet_64u_u_avx2(uint64_t* output, uint64_t* intsToSwap, unsigned int num_points){ +++static inline void volk_64u_byteswappuppet_64u_u_avx2(uint64_t* output, +++ uint64_t* intsToSwap, +++ unsigned int num_points) +++{ ++ ++ 
volk_64u_byteswap_u_avx2((uint64_t*)intsToSwap, num_points); ++ memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint64_t)); ++- ++ } ++ #endif ++ ++ #ifdef LV_HAVE_AVX2 ++-static inline void volk_64u_byteswappuppet_64u_a_avx2(uint64_t* output, uint64_t* intsToSwap, unsigned int num_points){ +++static inline void volk_64u_byteswappuppet_64u_a_avx2(uint64_t* output, +++ uint64_t* intsToSwap, +++ unsigned int num_points) +++{ ++ ++ volk_64u_byteswap_a_avx2((uint64_t*)intsToSwap, num_points); ++ memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint64_t)); ++- ++ } ++ #endif ++ ++diff --git a/kernels/volk/volk_64u_popcnt.h b/kernels/volk/volk_64u_popcnt.h ++index cbce2ec..43c2ae0 100644 ++--- a/kernels/volk/volk_64u_popcnt.h +++++ b/kernels/volk/volk_64u_popcnt.h ++@@ -60,39 +60,38 @@ ++ #ifndef INCLUDED_volk_64u_popcnt_a_H ++ #define INCLUDED_volk_64u_popcnt_a_H ++ ++-#include ++ #include +++#include ++ ++ ++ #ifdef LV_HAVE_GENERIC ++ ++ ++-static inline void ++-volk_64u_popcnt_generic(uint64_t* ret, const uint64_t value) +++static inline void volk_64u_popcnt_generic(uint64_t* ret, const uint64_t value) ++ { ++- //const uint32_t* valueVector = (const uint32_t*)&value; ++- ++- // This is faster than a lookup table ++- //uint32_t retVal = valueVector[0]; ++- uint32_t retVal = (uint32_t)(value & 0x00000000FFFFFFFFull); ++- ++- retVal = (retVal & 0x55555555) + (retVal >> 1 & 0x55555555); ++- retVal = (retVal & 0x33333333) + (retVal >> 2 & 0x33333333); ++- retVal = (retVal + (retVal >> 4)) & 0x0F0F0F0F; ++- retVal = (retVal + (retVal >> 8)); ++- retVal = (retVal + (retVal >> 16)) & 0x0000003F; ++- uint64_t retVal64 = retVal; ++- ++- //retVal = valueVector[1]; ++- retVal = (uint32_t)((value & 0xFFFFFFFF00000000ull) >> 32); ++- retVal = (retVal & 0x55555555) + (retVal >> 1 & 0x55555555); ++- retVal = (retVal & 0x33333333) + (retVal >> 2 & 0x33333333); ++- retVal = (retVal + (retVal >> 4)) & 0x0F0F0F0F; ++- retVal = (retVal + (retVal >> 8)); ++- retVal = (retVal + (retVal >> 16)) & 0x0000003F; ++- retVal64 += retVal; ++- ++- *ret = retVal64; +++ // const uint32_t* valueVector = (const uint32_t*)&value; +++ +++ // This is faster than a lookup table +++ // uint32_t retVal = valueVector[0]; +++ uint32_t retVal = (uint32_t)(value & 0x00000000FFFFFFFFull); +++ +++ retVal = (retVal & 0x55555555) + (retVal >> 1 & 0x55555555); +++ retVal = (retVal & 0x33333333) + (retVal >> 2 & 0x33333333); +++ retVal = (retVal + (retVal >> 4)) & 0x0F0F0F0F; +++ retVal = (retVal + (retVal >> 8)); +++ retVal = (retVal + (retVal >> 16)) & 0x0000003F; +++ uint64_t retVal64 = retVal; +++ +++ // retVal = valueVector[1]; +++ retVal = (uint32_t)((value & 0xFFFFFFFF00000000ull) >> 32); +++ retVal = (retVal & 0x55555555) + (retVal >> 1 & 0x55555555); +++ retVal = (retVal & 0x33333333) + (retVal >> 2 & 0x33333333); +++ retVal = (retVal + (retVal >> 4)) & 0x0F0F0F0F; +++ retVal = (retVal + (retVal >> 8)); +++ retVal = (retVal + (retVal >> 16)) & 0x0000003F; +++ retVal64 += retVal; +++ +++ *ret = retVal64; ++ } ++ ++ #endif /*LV_HAVE_GENERIC*/ ++@@ -104,7 +103,7 @@ volk_64u_popcnt_generic(uint64_t* ret, const uint64_t value) ++ ++ static inline void volk_64u_popcnt_a_sse4_2(uint64_t* ret, const uint64_t value) ++ { ++- *ret = _mm_popcnt_u64(value); +++ *ret = _mm_popcnt_u64(value); ++ } ++ ++ #endif /*LV_HAVE_SSE4_2*/ ++@@ -114,19 +113,19 @@ static inline void volk_64u_popcnt_a_sse4_2(uint64_t* ret, const uint64_t value) ++ #include ++ static inline void volk_64u_popcnt_neon(uint64_t* ret, const uint64_t value) 
++ { ++- uint8x8_t input_val, count8x8_val; ++- uint16x4_t count16x4_val; ++- uint32x2_t count32x2_val; ++- uint64x1_t count64x1_val; ++- ++- input_val = vld1_u8((unsigned char *) &value); ++- count8x8_val = vcnt_u8(input_val); ++- count16x4_val = vpaddl_u8(count8x8_val); ++- count32x2_val = vpaddl_u16(count16x4_val); ++- count64x1_val = vpaddl_u32(count32x2_val); ++- vst1_u64(ret, count64x1_val); ++- ++- //*ret = _mm_popcnt_u64(value); +++ uint8x8_t input_val, count8x8_val; +++ uint16x4_t count16x4_val; +++ uint32x2_t count32x2_val; +++ uint64x1_t count64x1_val; +++ +++ input_val = vld1_u8((unsigned char*)&value); +++ count8x8_val = vcnt_u8(input_val); +++ count16x4_val = vpaddl_u8(count8x8_val); +++ count32x2_val = vpaddl_u16(count16x4_val); +++ count64x1_val = vpaddl_u32(count32x2_val); +++ vst1_u64(ret, count64x1_val); +++ +++ //*ret = _mm_popcnt_u64(value); ++ } ++ #endif /*LV_HAVE_NEON*/ ++ ++diff --git a/kernels/volk/volk_64u_popcntpuppet_64u.h b/kernels/volk/volk_64u_popcntpuppet_64u.h ++index e38ebb3..688281a 100644 ++--- a/kernels/volk/volk_64u_popcntpuppet_64u.h +++++ b/kernels/volk/volk_64u_popcntpuppet_64u.h ++@@ -23,35 +23,44 @@ ++ #ifndef INCLUDED_volk_64u_popcntpuppet_64u_H ++ #define INCLUDED_volk_64u_popcntpuppet_64u_H ++ ++-#include ++ #include ++ #include +++#include ++ ++ #ifdef LV_HAVE_GENERIC ++-static inline void volk_64u_popcntpuppet_64u_generic(uint64_t* outVector, const uint64_t* inVector, unsigned int num_points){ +++static inline void volk_64u_popcntpuppet_64u_generic(uint64_t* outVector, +++ const uint64_t* inVector, +++ unsigned int num_points) +++{ ++ unsigned int ii; ++- for(ii=0; ii < num_points; ++ii) { ++- volk_64u_popcnt_generic(outVector+ii, num_points ); +++ for (ii = 0; ii < num_points; ++ii) { +++ volk_64u_popcnt_generic(outVector + ii, num_points); ++ } ++ memcpy((void*)outVector, (void*)inVector, num_points * sizeof(uint64_t)); ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++ #if LV_HAVE_SSE4_2 && LV_HAVE_64 ++-static inline void volk_64u_popcntpuppet_64u_a_sse4_2(uint64_t* outVector, const uint64_t* inVector, unsigned int num_points){ +++static inline void volk_64u_popcntpuppet_64u_a_sse4_2(uint64_t* outVector, +++ const uint64_t* inVector, +++ unsigned int num_points) +++{ ++ unsigned int ii; ++- for(ii=0; ii < num_points; ++ii) { ++- volk_64u_popcnt_a_sse4_2(outVector+ii, num_points ); +++ for (ii = 0; ii < num_points; ++ii) { +++ volk_64u_popcnt_a_sse4_2(outVector + ii, num_points); ++ } ++ memcpy((void*)outVector, (void*)inVector, num_points * sizeof(uint64_t)); ++ } ++ #endif /* LV_HAVE_SSE4_2 */ ++ ++ #ifdef LV_HAVE_NEON ++-static inline void volk_64u_popcntpuppet_64u_neon(uint64_t* outVector, const uint64_t* inVector, unsigned int num_points){ +++static inline void volk_64u_popcntpuppet_64u_neon(uint64_t* outVector, +++ const uint64_t* inVector, +++ unsigned int num_points) +++{ ++ unsigned int ii; ++- for(ii=0; ii < num_points; ++ii) { ++- volk_64u_popcnt_neon(outVector+ii, num_points ); +++ for (ii = 0; ii < num_points; ++ii) { +++ volk_64u_popcnt_neon(outVector + ii, num_points); ++ } ++ memcpy((void*)outVector, (void*)inVector, num_points * sizeof(uint64_t)); ++ } ++diff --git a/kernels/volk/volk_8i_convert_16i.h b/kernels/volk/volk_8i_convert_16i.h ++index 40400c3..69d8f6a 100644 ++--- a/kernels/volk/volk_8i_convert_16i.h +++++ b/kernels/volk/volk_8i_convert_16i.h ++@@ -30,8 +30,8 @@ ++ * ++ * Dispatcher Prototype ++ * \code ++- * void volk_8i_convert_16i(int16_t* outputVector, const int8_t* inputVector, unsigned int num_points) ++- * 
\endcode +++ * void volk_8i_convert_16i(int16_t* outputVector, const int8_t* inputVector, unsigned int +++ * num_points) \endcode ++ * ++ * \b Inputs ++ * \li inputVector: The input vector of 8-bit chars. ++@@ -59,32 +59,32 @@ ++ #ifdef LV_HAVE_AVX2 ++ #include ++ ++-static inline void ++-volk_8i_convert_16i_u_avx2(int16_t* outputVector, const int8_t* inputVector, ++- unsigned int num_points) +++static inline void volk_8i_convert_16i_u_avx2(int16_t* outputVector, +++ const int8_t* inputVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int sixteenthPoints = num_points / 16; ++- ++- const __m128i* inputVectorPtr = (const __m128i*)inputVector; ++- __m256i* outputVectorPtr = (__m256i*)outputVector; ++- __m128i inputVal; ++- __m256i ret; ++- ++- for(;number < sixteenthPoints; number++){ ++- inputVal = _mm_loadu_si128(inputVectorPtr); ++- ret = _mm256_cvtepi8_epi16(inputVal); ++- ret = _mm256_slli_epi16(ret, 8); // Multiply by 256 ++- _mm256_storeu_si256(outputVectorPtr, ret); ++- ++- outputVectorPtr++; ++- inputVectorPtr++; ++- } ++- ++- number = sixteenthPoints * 16; ++- for(; number < num_points; number++){ ++- outputVector[number] = (int16_t)(inputVector[number])*256; ++- } +++ unsigned int number = 0; +++ const unsigned int sixteenthPoints = num_points / 16; +++ +++ const __m128i* inputVectorPtr = (const __m128i*)inputVector; +++ __m256i* outputVectorPtr = (__m256i*)outputVector; +++ __m128i inputVal; +++ __m256i ret; +++ +++ for (; number < sixteenthPoints; number++) { +++ inputVal = _mm_loadu_si128(inputVectorPtr); +++ ret = _mm256_cvtepi8_epi16(inputVal); +++ ret = _mm256_slli_epi16(ret, 8); // Multiply by 256 +++ _mm256_storeu_si256(outputVectorPtr, ret); +++ +++ outputVectorPtr++; +++ inputVectorPtr++; +++ } +++ +++ number = sixteenthPoints * 16; +++ for (; number < num_points; number++) { +++ outputVector[number] = (int16_t)(inputVector[number]) * 256; +++ } ++ } ++ #endif /* LV_HAVE_AVX2 */ ++ ++@@ -92,57 +92,57 @@ volk_8i_convert_16i_u_avx2(int16_t* outputVector, const int8_t* inputVector, ++ #ifdef LV_HAVE_SSE4_1 ++ #include ++ ++-static inline void ++-volk_8i_convert_16i_u_sse4_1(int16_t* outputVector, const int8_t* inputVector, ++- unsigned int num_points) +++static inline void volk_8i_convert_16i_u_sse4_1(int16_t* outputVector, +++ const int8_t* inputVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int sixteenthPoints = num_points / 16; +++ unsigned int number = 0; +++ const unsigned int sixteenthPoints = num_points / 16; ++ ++- const __m128i* inputVectorPtr = (const __m128i*)inputVector; ++- __m128i* outputVectorPtr = (__m128i*)outputVector; ++- __m128i inputVal; ++- __m128i ret; +++ const __m128i* inputVectorPtr = (const __m128i*)inputVector; +++ __m128i* outputVectorPtr = (__m128i*)outputVector; +++ __m128i inputVal; +++ __m128i ret; ++ ++- for(;number < sixteenthPoints; number++){ ++- inputVal = _mm_loadu_si128(inputVectorPtr); ++- ret = _mm_cvtepi8_epi16(inputVal); ++- ret = _mm_slli_epi16(ret, 8); // Multiply by 256 ++- _mm_storeu_si128(outputVectorPtr, ret); +++ for (; number < sixteenthPoints; number++) { +++ inputVal = _mm_loadu_si128(inputVectorPtr); +++ ret = _mm_cvtepi8_epi16(inputVal); +++ ret = _mm_slli_epi16(ret, 8); // Multiply by 256 +++ _mm_storeu_si128(outputVectorPtr, ret); ++ ++- outputVectorPtr++; +++ outputVectorPtr++; ++ ++- inputVal = _mm_srli_si128(inputVal, 8); ++- ret = _mm_cvtepi8_epi16(inputVal); ++- ret = _mm_slli_epi16(ret, 8); // Multiply by 256 ++- 
_mm_storeu_si128(outputVectorPtr, ret); +++ inputVal = _mm_srli_si128(inputVal, 8); +++ ret = _mm_cvtepi8_epi16(inputVal); +++ ret = _mm_slli_epi16(ret, 8); // Multiply by 256 +++ _mm_storeu_si128(outputVectorPtr, ret); ++ ++- outputVectorPtr++; +++ outputVectorPtr++; ++ ++- inputVectorPtr++; ++- } +++ inputVectorPtr++; +++ } ++ ++- number = sixteenthPoints * 16; ++- for(; number < num_points; number++){ ++- outputVector[number] = (int16_t)(inputVector[number])*256; ++- } +++ number = sixteenthPoints * 16; +++ for (; number < num_points; number++) { +++ outputVector[number] = (int16_t)(inputVector[number]) * 256; +++ } ++ } ++ #endif /* LV_HAVE_SSE4_1 */ ++ ++ ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void ++-volk_8i_convert_16i_generic(int16_t* outputVector, const int8_t* inputVector, ++- unsigned int num_points) +++static inline void volk_8i_convert_16i_generic(int16_t* outputVector, +++ const int8_t* inputVector, +++ unsigned int num_points) ++ { ++- int16_t* outputVectorPtr = outputVector; ++- const int8_t* inputVectorPtr = inputVector; ++- unsigned int number = 0; +++ int16_t* outputVectorPtr = outputVector; +++ const int8_t* inputVectorPtr = inputVector; +++ unsigned int number = 0; ++ ++- for(number = 0; number < num_points; number++){ ++- *outputVectorPtr++ = ((int16_t)(*inputVectorPtr++)) * 256; ++- } +++ for (number = 0; number < num_points; number++) { +++ *outputVectorPtr++ = ((int16_t)(*inputVectorPtr++)) * 256; +++ } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++@@ -150,7 +150,6 @@ volk_8i_convert_16i_generic(int16_t* outputVector, const int8_t* inputVector, ++ #endif /* INCLUDED_VOLK_8s_CONVERT_16s_UNALIGNED8_H */ ++ ++ ++- ++ #ifndef INCLUDED_volk_8i_convert_16i_a_H ++ #define INCLUDED_volk_8i_convert_16i_a_H ++ ++@@ -160,32 +159,32 @@ volk_8i_convert_16i_generic(int16_t* outputVector, const int8_t* inputVector, ++ #ifdef LV_HAVE_AVX2 ++ #include ++ ++-static inline void ++-volk_8i_convert_16i_a_avx2(int16_t* outputVector, const int8_t* inputVector, ++- unsigned int num_points) +++static inline void volk_8i_convert_16i_a_avx2(int16_t* outputVector, +++ const int8_t* inputVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int sixteenthPoints = num_points / 16; ++- ++- const __m128i* inputVectorPtr = (const __m128i*)inputVector; ++- __m256i* outputVectorPtr = (__m256i*)outputVector; ++- __m128i inputVal; ++- __m256i ret; ++- ++- for(;number < sixteenthPoints; number++){ ++- inputVal = _mm_load_si128(inputVectorPtr); ++- ret = _mm256_cvtepi8_epi16(inputVal); ++- ret = _mm256_slli_epi16(ret, 8); // Multiply by 256 ++- _mm256_store_si256(outputVectorPtr, ret); ++- ++- outputVectorPtr++; ++- inputVectorPtr++; ++- } ++- ++- number = sixteenthPoints * 16; ++- for(; number < num_points; number++){ ++- outputVector[number] = (int16_t)(inputVector[number])*256; ++- } +++ unsigned int number = 0; +++ const unsigned int sixteenthPoints = num_points / 16; +++ +++ const __m128i* inputVectorPtr = (const __m128i*)inputVector; +++ __m256i* outputVectorPtr = (__m256i*)outputVector; +++ __m128i inputVal; +++ __m256i ret; +++ +++ for (; number < sixteenthPoints; number++) { +++ inputVal = _mm_load_si128(inputVectorPtr); +++ ret = _mm256_cvtepi8_epi16(inputVal); +++ ret = _mm256_slli_epi16(ret, 8); // Multiply by 256 +++ _mm256_store_si256(outputVectorPtr, ret); +++ +++ outputVectorPtr++; +++ inputVectorPtr++; +++ } +++ +++ number = sixteenthPoints * 16; +++ for (; number < num_points; number++) { +++ outputVector[number] = (int16_t)(inputVector[number]) * 
256; +++ } ++ } ++ #endif /* LV_HAVE_AVX2 */ ++ ++@@ -193,57 +192,57 @@ volk_8i_convert_16i_a_avx2(int16_t* outputVector, const int8_t* inputVector, ++ #ifdef LV_HAVE_SSE4_1 ++ #include ++ ++-static inline void ++-volk_8i_convert_16i_a_sse4_1(int16_t* outputVector, const int8_t* inputVector, ++- unsigned int num_points) +++static inline void volk_8i_convert_16i_a_sse4_1(int16_t* outputVector, +++ const int8_t* inputVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int sixteenthPoints = num_points / 16; +++ unsigned int number = 0; +++ const unsigned int sixteenthPoints = num_points / 16; ++ ++- const __m128i* inputVectorPtr = (const __m128i*)inputVector; ++- __m128i* outputVectorPtr = (__m128i*)outputVector; ++- __m128i inputVal; ++- __m128i ret; +++ const __m128i* inputVectorPtr = (const __m128i*)inputVector; +++ __m128i* outputVectorPtr = (__m128i*)outputVector; +++ __m128i inputVal; +++ __m128i ret; ++ ++- for(;number < sixteenthPoints; number++){ ++- inputVal = _mm_load_si128(inputVectorPtr); ++- ret = _mm_cvtepi8_epi16(inputVal); ++- ret = _mm_slli_epi16(ret, 8); // Multiply by 256 ++- _mm_store_si128(outputVectorPtr, ret); +++ for (; number < sixteenthPoints; number++) { +++ inputVal = _mm_load_si128(inputVectorPtr); +++ ret = _mm_cvtepi8_epi16(inputVal); +++ ret = _mm_slli_epi16(ret, 8); // Multiply by 256 +++ _mm_store_si128(outputVectorPtr, ret); ++ ++- outputVectorPtr++; +++ outputVectorPtr++; ++ ++- inputVal = _mm_srli_si128(inputVal, 8); ++- ret = _mm_cvtepi8_epi16(inputVal); ++- ret = _mm_slli_epi16(ret, 8); // Multiply by 256 ++- _mm_store_si128(outputVectorPtr, ret); +++ inputVal = _mm_srli_si128(inputVal, 8); +++ ret = _mm_cvtepi8_epi16(inputVal); +++ ret = _mm_slli_epi16(ret, 8); // Multiply by 256 +++ _mm_store_si128(outputVectorPtr, ret); ++ ++- outputVectorPtr++; +++ outputVectorPtr++; ++ ++- inputVectorPtr++; ++- } +++ inputVectorPtr++; +++ } ++ ++- number = sixteenthPoints * 16; ++- for(; number < num_points; number++){ ++- outputVector[number] = (int16_t)(inputVector[number])*256; ++- } +++ number = sixteenthPoints * 16; +++ for (; number < num_points; number++) { +++ outputVector[number] = (int16_t)(inputVector[number]) * 256; +++ } ++ } ++ #endif /* LV_HAVE_SSE4_1 */ ++ ++ ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void ++-volk_8i_convert_16i_a_generic(int16_t* outputVector, const int8_t* inputVector, ++- unsigned int num_points) +++static inline void volk_8i_convert_16i_a_generic(int16_t* outputVector, +++ const int8_t* inputVector, +++ unsigned int num_points) ++ { ++- int16_t* outputVectorPtr = outputVector; ++- const int8_t* inputVectorPtr = inputVector; ++- unsigned int number = 0; +++ int16_t* outputVectorPtr = outputVector; +++ const int8_t* inputVectorPtr = inputVector; +++ unsigned int number = 0; ++ ++- for(number = 0; number < num_points; number++){ ++- *outputVectorPtr++ = ((int16_t)(*inputVectorPtr++)) * 256; ++- } +++ for (number = 0; number < num_points; number++) { +++ *outputVectorPtr++ = ((int16_t)(*inputVectorPtr++)) * 256; +++ } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++@@ -251,51 +250,51 @@ volk_8i_convert_16i_a_generic(int16_t* outputVector, const int8_t* inputVector, ++ #ifdef LV_HAVE_NEON ++ #include ++ ++-static inline void ++-volk_8i_convert_16i_neon(int16_t* outputVector, const int8_t* inputVector, unsigned int num_points) +++static inline void volk_8i_convert_16i_neon(int16_t* outputVector, +++ const int8_t* inputVector, +++ unsigned int num_points) ++ { ++- int16_t* outputVectorPtr = 
outputVector; ++- const int8_t* inputVectorPtr = inputVector; ++- unsigned int number; ++- const unsigned int eighth_points = num_points / 8; ++- ++- int8x8_t input_vec ; ++- int16x8_t converted_vec; ++- ++- // NEON doesn't have a concept of 8 bit registers, so we are really ++- // dealing with the low half of 16-bit registers. Since this requires ++- // a move instruction we likely do better with ASM here. ++- for(number = 0; number < eighth_points; ++number) { ++- input_vec = vld1_s8(inputVectorPtr); ++- converted_vec = vmovl_s8(input_vec); ++- //converted_vec = vmulq_s16(converted_vec, scale_factor); ++- converted_vec = vshlq_n_s16(converted_vec, 8); ++- vst1q_s16( outputVectorPtr, converted_vec); ++- ++- inputVectorPtr += 8; ++- outputVectorPtr += 8; ++- } ++- ++- for(number = eighth_points * 8; number < num_points; number++){ ++- *outputVectorPtr++ = ((int16_t)(*inputVectorPtr++)) * 256; ++- } +++ int16_t* outputVectorPtr = outputVector; +++ const int8_t* inputVectorPtr = inputVector; +++ unsigned int number; +++ const unsigned int eighth_points = num_points / 8; +++ +++ int8x8_t input_vec; +++ int16x8_t converted_vec; +++ +++ // NEON doesn't have a concept of 8 bit registers, so we are really +++ // dealing with the low half of 16-bit registers. Since this requires +++ // a move instruction we likely do better with ASM here. +++ for (number = 0; number < eighth_points; ++number) { +++ input_vec = vld1_s8(inputVectorPtr); +++ converted_vec = vmovl_s8(input_vec); +++ // converted_vec = vmulq_s16(converted_vec, scale_factor); +++ converted_vec = vshlq_n_s16(converted_vec, 8); +++ vst1q_s16(outputVectorPtr, converted_vec); +++ +++ inputVectorPtr += 8; +++ outputVectorPtr += 8; +++ } +++ +++ for (number = eighth_points * 8; number < num_points; number++) { +++ *outputVectorPtr++ = ((int16_t)(*inputVectorPtr++)) * 256; +++ } ++ } ++ #endif /* LV_HAVE_NEON */ ++ ++ ++ #ifdef LV_HAVE_ORC ++-extern void ++-volk_8i_convert_16i_a_orc_impl(int16_t* outputVector, const int8_t* inputVector, ++- unsigned int num_points); +++extern void volk_8i_convert_16i_a_orc_impl(int16_t* outputVector, +++ const int8_t* inputVector, +++ unsigned int num_points); ++ ++-static inline void ++-volk_8i_convert_16i_u_orc(int16_t* outputVector, const int8_t* inputVector, ++- unsigned int num_points) +++static inline void volk_8i_convert_16i_u_orc(int16_t* outputVector, +++ const int8_t* inputVector, +++ unsigned int num_points) ++ { ++- volk_8i_convert_16i_a_orc_impl(outputVector, inputVector, num_points); +++ volk_8i_convert_16i_a_orc_impl(outputVector, inputVector, num_points); ++ } ++ #endif /* LV_HAVE_ORC */ ++ ++ ++- ++ #endif /* INCLUDED_VOLK_8s_CONVERT_16s_ALIGNED8_H */ ++diff --git a/kernels/volk/volk_8i_s32f_convert_32f.h b/kernels/volk/volk_8i_s32f_convert_32f.h ++index 97d160b..c3d5666 100644 ++--- a/kernels/volk/volk_8i_s32f_convert_32f.h +++++ b/kernels/volk/volk_8i_s32f_convert_32f.h ++@@ -30,8 +30,8 @@ ++ * ++ * Dispatcher Prototype ++ * \code ++- * void volk_8i_s32f_convert_32f(float* outputVector, const int8_t* inputVector, const float scalar, unsigned int num_points) ++- * \endcode +++ * void volk_8i_s32f_convert_32f(float* outputVector, const int8_t* inputVector, const +++ * float scalar, unsigned int num_points) \endcode ++ * ++ * \b Inputs ++ * \li inputVector: The input vector of 8-bit chars. 
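For reference, a minimal usage sketch of the volk_8i_s32f_convert_32f dispatcher whose prototype is quoted in the hunk above; the helper name, buffer size, sample data, and the scale value 128.0f are illustrative assumptions, not part of this patch. As the kernel bodies in the next hunk show, the kernels compute out[i] = in[i] * (1.0 / scalar), so scalar acts as a divisor.

#include <volk/volk.h>

static void example_8i_s32f_convert_32f(void)
{
    const unsigned int num_points = 1024;
    const size_t alignment = volk_get_alignment();

    int8_t* in = (int8_t*)volk_malloc(num_points * sizeof(int8_t), alignment);
    float* out = (float*)volk_malloc(num_points * sizeof(float), alignment);

    for (unsigned int i = 0; i < num_points; i++) {
        in[i] = (int8_t)(i & 0x7f); /* illustrative sample data */
    }

    /* scalar is the divisor: out[i] = in[i] / 128.0f, mapping int8 into roughly [-1, 1) */
    volk_8i_s32f_convert_32f(out, in, 128.0f, num_points);

    volk_free(in);
    volk_free(out);
}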
++@@ -60,44 +60,45 @@ ++ #ifdef LV_HAVE_AVX2 ++ #include ++ ++-static inline void ++-volk_8i_s32f_convert_32f_u_avx2(float* outputVector, const int8_t* inputVector, ++- const float scalar, unsigned int num_points) +++static inline void volk_8i_s32f_convert_32f_u_avx2(float* outputVector, +++ const int8_t* inputVector, +++ const float scalar, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int sixteenthPoints = num_points / 16; ++- ++- float* outputVectorPtr = outputVector; ++- const float iScalar = 1.0 / scalar; ++- __m256 invScalar = _mm256_set1_ps( iScalar ); ++- const int8_t* inputVectorPtr = inputVector; ++- __m256 ret; ++- __m128i inputVal128; ++- __m256i interimVal; ++- ++- for(;number < sixteenthPoints; number++){ ++- inputVal128 = _mm_loadu_si128((__m128i*)inputVectorPtr); ++- ++- interimVal = _mm256_cvtepi8_epi32(inputVal128); ++- ret = _mm256_cvtepi32_ps(interimVal); ++- ret = _mm256_mul_ps(ret, invScalar); ++- _mm256_storeu_ps(outputVectorPtr, ret); ++- outputVectorPtr += 8; ++- ++- inputVal128 = _mm_srli_si128(inputVal128, 8); ++- interimVal = _mm256_cvtepi8_epi32(inputVal128); ++- ret = _mm256_cvtepi32_ps(interimVal); ++- ret = _mm256_mul_ps(ret, invScalar); ++- _mm256_storeu_ps(outputVectorPtr, ret); ++- outputVectorPtr += 8; ++- ++- inputVectorPtr += 16; ++- } ++- ++- number = sixteenthPoints * 16; ++- for(; number < num_points; number++){ ++- outputVector[number] = (float)(inputVector[number]) * iScalar; ++- } +++ unsigned int number = 0; +++ const unsigned int sixteenthPoints = num_points / 16; +++ +++ float* outputVectorPtr = outputVector; +++ const float iScalar = 1.0 / scalar; +++ __m256 invScalar = _mm256_set1_ps(iScalar); +++ const int8_t* inputVectorPtr = inputVector; +++ __m256 ret; +++ __m128i inputVal128; +++ __m256i interimVal; +++ +++ for (; number < sixteenthPoints; number++) { +++ inputVal128 = _mm_loadu_si128((__m128i*)inputVectorPtr); +++ +++ interimVal = _mm256_cvtepi8_epi32(inputVal128); +++ ret = _mm256_cvtepi32_ps(interimVal); +++ ret = _mm256_mul_ps(ret, invScalar); +++ _mm256_storeu_ps(outputVectorPtr, ret); +++ outputVectorPtr += 8; +++ +++ inputVal128 = _mm_srli_si128(inputVal128, 8); +++ interimVal = _mm256_cvtepi8_epi32(inputVal128); +++ ret = _mm256_cvtepi32_ps(interimVal); +++ ret = _mm256_mul_ps(ret, invScalar); +++ _mm256_storeu_ps(outputVectorPtr, ret); +++ outputVectorPtr += 8; +++ +++ inputVectorPtr += 16; +++ } +++ +++ number = sixteenthPoints * 16; +++ for (; number < num_points; number++) { +++ outputVector[number] = (float)(inputVector[number]) * iScalar; +++ } ++ } ++ #endif /* LV_HAVE_AVX2 */ ++ ++@@ -105,80 +106,81 @@ volk_8i_s32f_convert_32f_u_avx2(float* outputVector, const int8_t* inputVector, ++ #ifdef LV_HAVE_SSE4_1 ++ #include ++ ++-static inline void ++-volk_8i_s32f_convert_32f_u_sse4_1(float* outputVector, const int8_t* inputVector, ++- const float scalar, unsigned int num_points) +++static inline void volk_8i_s32f_convert_32f_u_sse4_1(float* outputVector, +++ const int8_t* inputVector, +++ const float scalar, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int sixteenthPoints = num_points / 16; ++- ++- float* outputVectorPtr = outputVector; ++- const float iScalar = 1.0 / scalar; ++- __m128 invScalar = _mm_set_ps1( iScalar ); ++- const int8_t* inputVectorPtr = inputVector; ++- __m128 ret; ++- __m128i inputVal; ++- __m128i interimVal; ++- ++- for(;number < sixteenthPoints; number++){ ++- inputVal = _mm_loadu_si128((__m128i*)inputVectorPtr); ++- ++- interimVal 
= _mm_cvtepi8_epi32(inputVal); ++- ret = _mm_cvtepi32_ps(interimVal); ++- ret = _mm_mul_ps(ret, invScalar); ++- _mm_storeu_ps(outputVectorPtr, ret); ++- outputVectorPtr += 4; ++- ++- inputVal = _mm_srli_si128(inputVal, 4); ++- interimVal = _mm_cvtepi8_epi32(inputVal); ++- ret = _mm_cvtepi32_ps(interimVal); ++- ret = _mm_mul_ps(ret, invScalar); ++- _mm_storeu_ps(outputVectorPtr, ret); ++- outputVectorPtr += 4; ++- ++- inputVal = _mm_srli_si128(inputVal, 4); ++- interimVal = _mm_cvtepi8_epi32(inputVal); ++- ret = _mm_cvtepi32_ps(interimVal); ++- ret = _mm_mul_ps(ret, invScalar); ++- _mm_storeu_ps(outputVectorPtr, ret); ++- outputVectorPtr += 4; ++- ++- inputVal = _mm_srli_si128(inputVal, 4); ++- interimVal = _mm_cvtepi8_epi32(inputVal); ++- ret = _mm_cvtepi32_ps(interimVal); ++- ret = _mm_mul_ps(ret, invScalar); ++- _mm_storeu_ps(outputVectorPtr, ret); ++- outputVectorPtr += 4; ++- ++- inputVectorPtr += 16; ++- } ++- ++- number = sixteenthPoints * 16; ++- for(; number < num_points; number++){ ++- outputVector[number] = (float)(inputVector[number]) * iScalar; ++- } +++ unsigned int number = 0; +++ const unsigned int sixteenthPoints = num_points / 16; +++ +++ float* outputVectorPtr = outputVector; +++ const float iScalar = 1.0 / scalar; +++ __m128 invScalar = _mm_set_ps1(iScalar); +++ const int8_t* inputVectorPtr = inputVector; +++ __m128 ret; +++ __m128i inputVal; +++ __m128i interimVal; +++ +++ for (; number < sixteenthPoints; number++) { +++ inputVal = _mm_loadu_si128((__m128i*)inputVectorPtr); +++ +++ interimVal = _mm_cvtepi8_epi32(inputVal); +++ ret = _mm_cvtepi32_ps(interimVal); +++ ret = _mm_mul_ps(ret, invScalar); +++ _mm_storeu_ps(outputVectorPtr, ret); +++ outputVectorPtr += 4; +++ +++ inputVal = _mm_srli_si128(inputVal, 4); +++ interimVal = _mm_cvtepi8_epi32(inputVal); +++ ret = _mm_cvtepi32_ps(interimVal); +++ ret = _mm_mul_ps(ret, invScalar); +++ _mm_storeu_ps(outputVectorPtr, ret); +++ outputVectorPtr += 4; +++ +++ inputVal = _mm_srli_si128(inputVal, 4); +++ interimVal = _mm_cvtepi8_epi32(inputVal); +++ ret = _mm_cvtepi32_ps(interimVal); +++ ret = _mm_mul_ps(ret, invScalar); +++ _mm_storeu_ps(outputVectorPtr, ret); +++ outputVectorPtr += 4; +++ +++ inputVal = _mm_srli_si128(inputVal, 4); +++ interimVal = _mm_cvtepi8_epi32(inputVal); +++ ret = _mm_cvtepi32_ps(interimVal); +++ ret = _mm_mul_ps(ret, invScalar); +++ _mm_storeu_ps(outputVectorPtr, ret); +++ outputVectorPtr += 4; +++ +++ inputVectorPtr += 16; +++ } +++ +++ number = sixteenthPoints * 16; +++ for (; number < num_points; number++) { +++ outputVector[number] = (float)(inputVector[number]) * iScalar; +++ } ++ } ++ #endif /* LV_HAVE_SSE4_1 */ ++ ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void ++-volk_8i_s32f_convert_32f_generic(float* outputVector, const int8_t* inputVector, ++- const float scalar, unsigned int num_points) +++static inline void volk_8i_s32f_convert_32f_generic(float* outputVector, +++ const int8_t* inputVector, +++ const float scalar, +++ unsigned int num_points) ++ { ++- float* outputVectorPtr = outputVector; ++- const int8_t* inputVectorPtr = inputVector; ++- unsigned int number = 0; ++- const float iScalar = 1.0 / scalar; ++- ++- for(number = 0; number < num_points; number++){ ++- *outputVectorPtr++ = ((float)(*inputVectorPtr++)) * iScalar; ++- } +++ float* outputVectorPtr = outputVector; +++ const int8_t* inputVectorPtr = inputVector; +++ unsigned int number = 0; +++ const float iScalar = 1.0 / scalar; +++ +++ for (number = 0; number < num_points; number++) { +++ *outputVectorPtr++ = 
((float)(*inputVectorPtr++)) * iScalar; +++ } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++ ++- ++ #endif /* INCLUDED_VOLK_8s_CONVERT_32f_UNALIGNED8_H */ ++ ++ #ifndef INCLUDED_volk_8i_s32f_convert_32f_a_H ++@@ -190,195 +192,199 @@ volk_8i_s32f_convert_32f_generic(float* outputVector, const int8_t* inputVector, ++ #ifdef LV_HAVE_AVX2 ++ #include ++ ++-static inline void ++-volk_8i_s32f_convert_32f_a_avx2(float* outputVector, const int8_t* inputVector, ++- const float scalar, unsigned int num_points) +++static inline void volk_8i_s32f_convert_32f_a_avx2(float* outputVector, +++ const int8_t* inputVector, +++ const float scalar, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int sixteenthPoints = num_points / 16; ++- ++- float* outputVectorPtr = outputVector; ++- const float iScalar = 1.0 / scalar; ++- __m256 invScalar = _mm256_set1_ps( iScalar ); ++- const int8_t* inputVectorPtr = inputVector; ++- __m256 ret; ++- __m128i inputVal128; ++- __m256i interimVal; ++- ++- for(;number < sixteenthPoints; number++){ ++- inputVal128 = _mm_load_si128((__m128i*)inputVectorPtr); ++- ++- interimVal = _mm256_cvtepi8_epi32(inputVal128); ++- ret = _mm256_cvtepi32_ps(interimVal); ++- ret = _mm256_mul_ps(ret, invScalar); ++- _mm256_store_ps(outputVectorPtr, ret); ++- outputVectorPtr += 8; ++- ++- inputVal128 = _mm_srli_si128(inputVal128, 8); ++- interimVal = _mm256_cvtepi8_epi32(inputVal128); ++- ret = _mm256_cvtepi32_ps(interimVal); ++- ret = _mm256_mul_ps(ret, invScalar); ++- _mm256_store_ps(outputVectorPtr, ret); ++- outputVectorPtr += 8; ++- ++- inputVectorPtr += 16; ++- } ++- ++- number = sixteenthPoints * 16; ++- for(; number < num_points; number++){ ++- outputVector[number] = (float)(inputVector[number]) * iScalar; ++- } +++ unsigned int number = 0; +++ const unsigned int sixteenthPoints = num_points / 16; +++ +++ float* outputVectorPtr = outputVector; +++ const float iScalar = 1.0 / scalar; +++ __m256 invScalar = _mm256_set1_ps(iScalar); +++ const int8_t* inputVectorPtr = inputVector; +++ __m256 ret; +++ __m128i inputVal128; +++ __m256i interimVal; +++ +++ for (; number < sixteenthPoints; number++) { +++ inputVal128 = _mm_load_si128((__m128i*)inputVectorPtr); +++ +++ interimVal = _mm256_cvtepi8_epi32(inputVal128); +++ ret = _mm256_cvtepi32_ps(interimVal); +++ ret = _mm256_mul_ps(ret, invScalar); +++ _mm256_store_ps(outputVectorPtr, ret); +++ outputVectorPtr += 8; +++ +++ inputVal128 = _mm_srli_si128(inputVal128, 8); +++ interimVal = _mm256_cvtepi8_epi32(inputVal128); +++ ret = _mm256_cvtepi32_ps(interimVal); +++ ret = _mm256_mul_ps(ret, invScalar); +++ _mm256_store_ps(outputVectorPtr, ret); +++ outputVectorPtr += 8; +++ +++ inputVectorPtr += 16; +++ } +++ +++ number = sixteenthPoints * 16; +++ for (; number < num_points; number++) { +++ outputVector[number] = (float)(inputVector[number]) * iScalar; +++ } ++ } ++ #endif /* LV_HAVE_AVX2 */ ++ ++ #ifdef LV_HAVE_SSE4_1 ++ #include ++ ++-static inline void ++-volk_8i_s32f_convert_32f_a_sse4_1(float* outputVector, const int8_t* inputVector, ++- const float scalar, unsigned int num_points) +++static inline void volk_8i_s32f_convert_32f_a_sse4_1(float* outputVector, +++ const int8_t* inputVector, +++ const float scalar, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int sixteenthPoints = num_points / 16; ++- ++- float* outputVectorPtr = outputVector; ++- const float iScalar = 1.0 / scalar; ++- __m128 invScalar = _mm_set_ps1(iScalar); ++- const int8_t* inputVectorPtr = inputVector; ++- __m128 
ret; ++- __m128i inputVal; ++- __m128i interimVal; ++- ++- for(;number < sixteenthPoints; number++){ ++- inputVal = _mm_load_si128((__m128i*)inputVectorPtr); ++- ++- interimVal = _mm_cvtepi8_epi32(inputVal); ++- ret = _mm_cvtepi32_ps(interimVal); ++- ret = _mm_mul_ps(ret, invScalar); ++- _mm_store_ps(outputVectorPtr, ret); ++- outputVectorPtr += 4; ++- ++- inputVal = _mm_srli_si128(inputVal, 4); ++- interimVal = _mm_cvtepi8_epi32(inputVal); ++- ret = _mm_cvtepi32_ps(interimVal); ++- ret = _mm_mul_ps(ret, invScalar); ++- _mm_store_ps(outputVectorPtr, ret); ++- outputVectorPtr += 4; ++- ++- inputVal = _mm_srli_si128(inputVal, 4); ++- interimVal = _mm_cvtepi8_epi32(inputVal); ++- ret = _mm_cvtepi32_ps(interimVal); ++- ret = _mm_mul_ps(ret, invScalar); ++- _mm_store_ps(outputVectorPtr, ret); ++- outputVectorPtr += 4; ++- ++- inputVal = _mm_srli_si128(inputVal, 4); ++- interimVal = _mm_cvtepi8_epi32(inputVal); ++- ret = _mm_cvtepi32_ps(interimVal); ++- ret = _mm_mul_ps(ret, invScalar); ++- _mm_store_ps(outputVectorPtr, ret); ++- outputVectorPtr += 4; ++- ++- inputVectorPtr += 16; ++- } ++- ++- number = sixteenthPoints * 16; ++- for(; number < num_points; number++){ ++- outputVector[number] = (float)(inputVector[number]) * iScalar; ++- } +++ unsigned int number = 0; +++ const unsigned int sixteenthPoints = num_points / 16; +++ +++ float* outputVectorPtr = outputVector; +++ const float iScalar = 1.0 / scalar; +++ __m128 invScalar = _mm_set_ps1(iScalar); +++ const int8_t* inputVectorPtr = inputVector; +++ __m128 ret; +++ __m128i inputVal; +++ __m128i interimVal; +++ +++ for (; number < sixteenthPoints; number++) { +++ inputVal = _mm_load_si128((__m128i*)inputVectorPtr); +++ +++ interimVal = _mm_cvtepi8_epi32(inputVal); +++ ret = _mm_cvtepi32_ps(interimVal); +++ ret = _mm_mul_ps(ret, invScalar); +++ _mm_store_ps(outputVectorPtr, ret); +++ outputVectorPtr += 4; +++ +++ inputVal = _mm_srli_si128(inputVal, 4); +++ interimVal = _mm_cvtepi8_epi32(inputVal); +++ ret = _mm_cvtepi32_ps(interimVal); +++ ret = _mm_mul_ps(ret, invScalar); +++ _mm_store_ps(outputVectorPtr, ret); +++ outputVectorPtr += 4; +++ +++ inputVal = _mm_srli_si128(inputVal, 4); +++ interimVal = _mm_cvtepi8_epi32(inputVal); +++ ret = _mm_cvtepi32_ps(interimVal); +++ ret = _mm_mul_ps(ret, invScalar); +++ _mm_store_ps(outputVectorPtr, ret); +++ outputVectorPtr += 4; +++ +++ inputVal = _mm_srli_si128(inputVal, 4); +++ interimVal = _mm_cvtepi8_epi32(inputVal); +++ ret = _mm_cvtepi32_ps(interimVal); +++ ret = _mm_mul_ps(ret, invScalar); +++ _mm_store_ps(outputVectorPtr, ret); +++ outputVectorPtr += 4; +++ +++ inputVectorPtr += 16; +++ } +++ +++ number = sixteenthPoints * 16; +++ for (; number < num_points; number++) { +++ outputVector[number] = (float)(inputVector[number]) * iScalar; +++ } ++ } ++ #endif /* LV_HAVE_SSE4_1 */ ++ ++ #ifdef LV_HAVE_NEON ++ #include ++ ++-static inline void ++-volk_8i_s32f_convert_32f_neon(float* outputVector, const int8_t* inputVector, ++- const float scalar, unsigned int num_points) +++static inline void volk_8i_s32f_convert_32f_neon(float* outputVector, +++ const int8_t* inputVector, +++ const float scalar, +++ unsigned int num_points) ++ { ++- float* outputVectorPtr = outputVector; ++- const int8_t* inputVectorPtr = inputVector; ++- ++- const float iScalar = 1.0 / scalar; ++- const float32x4_t qiScalar = vdupq_n_f32(iScalar); ++- ++- int8x8x2_t inputVal; ++- float32x4x2_t outputFloat; ++- int16x8_t tmp; ++- ++- unsigned int number = 0; ++- const unsigned int sixteenthPoints = num_points / 16; ++- for(;number 
< sixteenthPoints; number++){ ++- __VOLK_PREFETCH(inputVectorPtr+16); ++- ++- inputVal = vld2_s8(inputVectorPtr); ++- inputVal = vzip_s8(inputVal.val[0], inputVal.val[1]); ++- inputVectorPtr += 16; ++- ++- tmp = vmovl_s8(inputVal.val[0]); ++- ++- outputFloat.val[0] = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp))); ++- outputFloat.val[0] = vmulq_f32(outputFloat.val[0], qiScalar); ++- vst1q_f32(outputVectorPtr, outputFloat.val[0]); ++- outputVectorPtr += 4; ++- ++- outputFloat.val[1] = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp))); ++- outputFloat.val[1] = vmulq_f32(outputFloat.val[1], qiScalar); ++- vst1q_f32(outputVectorPtr, outputFloat.val[1]); ++- outputVectorPtr += 4; ++- ++- tmp = vmovl_s8(inputVal.val[1]); ++- ++- outputFloat.val[0] = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp))); ++- outputFloat.val[0] = vmulq_f32(outputFloat.val[0], qiScalar); ++- vst1q_f32(outputVectorPtr, outputFloat.val[0]); ++- outputVectorPtr += 4; ++- ++- outputFloat.val[1] = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp))); ++- outputFloat.val[1] = vmulq_f32(outputFloat.val[1], qiScalar); ++- vst1q_f32(outputVectorPtr, outputFloat.val[1]); ++- outputVectorPtr += 4; ++- } ++- for(number = sixteenthPoints * 16; number < num_points; number++){ ++- *outputVectorPtr++ = ((float)(*inputVectorPtr++)) * iScalar; ++- } +++ float* outputVectorPtr = outputVector; +++ const int8_t* inputVectorPtr = inputVector; +++ +++ const float iScalar = 1.0 / scalar; +++ const float32x4_t qiScalar = vdupq_n_f32(iScalar); +++ +++ int8x8x2_t inputVal; +++ float32x4x2_t outputFloat; +++ int16x8_t tmp; +++ +++ unsigned int number = 0; +++ const unsigned int sixteenthPoints = num_points / 16; +++ for (; number < sixteenthPoints; number++) { +++ __VOLK_PREFETCH(inputVectorPtr + 16); +++ +++ inputVal = vld2_s8(inputVectorPtr); +++ inputVal = vzip_s8(inputVal.val[0], inputVal.val[1]); +++ inputVectorPtr += 16; +++ +++ tmp = vmovl_s8(inputVal.val[0]); +++ +++ outputFloat.val[0] = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp))); +++ outputFloat.val[0] = vmulq_f32(outputFloat.val[0], qiScalar); +++ vst1q_f32(outputVectorPtr, outputFloat.val[0]); +++ outputVectorPtr += 4; +++ +++ outputFloat.val[1] = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp))); +++ outputFloat.val[1] = vmulq_f32(outputFloat.val[1], qiScalar); +++ vst1q_f32(outputVectorPtr, outputFloat.val[1]); +++ outputVectorPtr += 4; +++ +++ tmp = vmovl_s8(inputVal.val[1]); +++ +++ outputFloat.val[0] = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp))); +++ outputFloat.val[0] = vmulq_f32(outputFloat.val[0], qiScalar); +++ vst1q_f32(outputVectorPtr, outputFloat.val[0]); +++ outputVectorPtr += 4; +++ +++ outputFloat.val[1] = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp))); +++ outputFloat.val[1] = vmulq_f32(outputFloat.val[1], qiScalar); +++ vst1q_f32(outputVectorPtr, outputFloat.val[1]); +++ outputVectorPtr += 4; +++ } +++ for (number = sixteenthPoints * 16; number < num_points; number++) { +++ *outputVectorPtr++ = ((float)(*inputVectorPtr++)) * iScalar; +++ } ++ } ++ ++ #endif /* LV_HAVE_NEON */ ++ ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void ++-volk_8i_s32f_convert_32f_a_generic(float* outputVector, const int8_t* inputVector, ++- const float scalar, unsigned int num_points) +++static inline void volk_8i_s32f_convert_32f_a_generic(float* outputVector, +++ const int8_t* inputVector, +++ const float scalar, +++ unsigned int num_points) ++ { ++- float* outputVectorPtr = outputVector; ++- const int8_t* inputVectorPtr = inputVector; ++- unsigned int number = 0; ++- const float iScalar = 1.0 / scalar; ++- ++- 
for(number = 0; number < num_points; number++){ ++- *outputVectorPtr++ = ((float)(*inputVectorPtr++)) * iScalar; ++- } +++ float* outputVectorPtr = outputVector; +++ const int8_t* inputVectorPtr = inputVector; +++ unsigned int number = 0; +++ const float iScalar = 1.0 / scalar; +++ +++ for (number = 0; number < num_points; number++) { +++ *outputVectorPtr++ = ((float)(*inputVectorPtr++)) * iScalar; +++ } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++ ++ #ifdef LV_HAVE_ORC ++-extern void ++-volk_8i_s32f_convert_32f_a_orc_impl(float* outputVector, const int8_t* inputVector, ++- const float scalar, unsigned int num_points); ++- ++-static inline void ++-volk_8i_s32f_convert_32f_u_orc(float* outputVector, const int8_t* inputVector, ++- const float scalar, unsigned int num_points) +++extern void volk_8i_s32f_convert_32f_a_orc_impl(float* outputVector, +++ const int8_t* inputVector, +++ const float scalar, +++ unsigned int num_points); +++ +++static inline void volk_8i_s32f_convert_32f_u_orc(float* outputVector, +++ const int8_t* inputVector, +++ const float scalar, +++ unsigned int num_points) ++ { ++- float invscalar = 1.0 / scalar; ++- volk_8i_s32f_convert_32f_a_orc_impl(outputVector, inputVector, invscalar, num_points); +++ float invscalar = 1.0 / scalar; +++ volk_8i_s32f_convert_32f_a_orc_impl(outputVector, inputVector, invscalar, num_points); ++ } ++ #endif /* LV_HAVE_ORC */ ++ ++ ++- ++ #endif /* INCLUDED_VOLK_8s_CONVERT_32f_ALIGNED8_H */ ++- ++diff --git a/kernels/volk/volk_8ic_deinterleave_16i_x2.h b/kernels/volk/volk_8ic_deinterleave_16i_x2.h ++index b4cf251..fa998a0 100644 ++--- a/kernels/volk/volk_8ic_deinterleave_16i_x2.h +++++ b/kernels/volk/volk_8ic_deinterleave_16i_x2.h ++@@ -30,8 +30,8 @@ ++ * ++ * Dispatcher Prototype ++ * \code ++- * void volk_8ic_deinterleave_16i_x2(int16_t* iBuffer, int16_t* qBuffer, const lv_8sc_t* complexVector, unsigned int num_points) ++- * \endcode +++ * void volk_8ic_deinterleave_16i_x2(int16_t* iBuffer, int16_t* qBuffer, const lv_8sc_t* +++ * complexVector, unsigned int num_points) \endcode ++ * ++ * \b Inputs ++ * \li complexVector: The complex input vector. 
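For reference, a minimal usage sketch of the volk_8ic_deinterleave_16i_x2 dispatcher quoted above; the helper name and buffer handling are illustrative assumptions, not part of this patch. Per the scalar tail loops in the following hunk, each 8-bit I/Q component is widened to 16 bits and scaled by 256 (a left shift by 8).

#include <volk/volk.h>

static void example_8ic_deinterleave_16i_x2(const lv_8sc_t* complex_in,
                                            unsigned int num_points)
{
    const size_t alignment = volk_get_alignment();
    int16_t* i_buf = (int16_t*)volk_malloc(num_points * sizeof(int16_t), alignment);
    int16_t* q_buf = (int16_t*)volk_malloc(num_points * sizeof(int16_t), alignment);

    /* Splits interleaved 8-bit I/Q into two 16-bit buffers, each sample scaled by 256 */
    volk_8ic_deinterleave_16i_x2(i_buf, q_buf, complex_in, num_points);

    volk_free(i_buf);
    volk_free(q_buf);
}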
++@@ -60,91 +60,150 @@ ++ #ifdef LV_HAVE_AVX2 ++ #include ++ ++-static inline void ++-volk_8ic_deinterleave_16i_x2_a_avx2(int16_t* iBuffer, int16_t* qBuffer, ++- const lv_8sc_t* complexVector, unsigned int num_points) +++static inline void volk_8ic_deinterleave_16i_x2_a_avx2(int16_t* iBuffer, +++ int16_t* qBuffer, +++ const lv_8sc_t* complexVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const int8_t* complexVectorPtr = (int8_t*)complexVector; ++- int16_t* iBufferPtr = iBuffer; ++- int16_t* qBufferPtr = qBuffer; ++- __m256i MoveMask = _mm256_set_epi8(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0, 15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0); ++- __m256i complexVal, iOutputVal, qOutputVal; ++- __m128i iOutputVal0, qOutputVal0; ++- ++- unsigned int sixteenthPoints = num_points / 16; ++- ++- for(number = 0; number < sixteenthPoints; number++){ ++- complexVal = _mm256_load_si256((__m256i*)complexVectorPtr); complexVectorPtr += 32; ++- ++- complexVal = _mm256_shuffle_epi8(complexVal, MoveMask); ++- complexVal = _mm256_permute4x64_epi64(complexVal, 0xd8); ++- ++- iOutputVal0 = _mm256_extracti128_si256(complexVal, 0); ++- qOutputVal0 = _mm256_extracti128_si256(complexVal, 1); ++- ++- iOutputVal = _mm256_cvtepi8_epi16(iOutputVal0); ++- iOutputVal = _mm256_slli_epi16(iOutputVal, 8); ++- ++- qOutputVal = _mm256_cvtepi8_epi16(qOutputVal0); ++- qOutputVal = _mm256_slli_epi16(qOutputVal, 8); ++- ++- _mm256_store_si256((__m256i*)iBufferPtr, iOutputVal); ++- _mm256_store_si256((__m256i*)qBufferPtr, qOutputVal); ++- ++- iBufferPtr += 16; ++- qBufferPtr += 16; ++- } ++- ++- number = sixteenthPoints * 16; ++- for(; number < num_points; number++){ ++- *iBufferPtr++ = ((int16_t)*complexVectorPtr++) * 256; // load 8 bit Complexvector into 16 bit, shift left by 8 bits and store ++- *qBufferPtr++ = ((int16_t)*complexVectorPtr++) * 256; ++- } +++ unsigned int number = 0; +++ const int8_t* complexVectorPtr = (int8_t*)complexVector; +++ int16_t* iBufferPtr = iBuffer; +++ int16_t* qBufferPtr = qBuffer; +++ __m256i MoveMask = _mm256_set_epi8(15, +++ 13, +++ 11, +++ 9, +++ 7, +++ 5, +++ 3, +++ 1, +++ 14, +++ 12, +++ 10, +++ 8, +++ 6, +++ 4, +++ 2, +++ 0, +++ 15, +++ 13, +++ 11, +++ 9, +++ 7, +++ 5, +++ 3, +++ 1, +++ 14, +++ 12, +++ 10, +++ 8, +++ 6, +++ 4, +++ 2, +++ 0); +++ __m256i complexVal, iOutputVal, qOutputVal; +++ __m128i iOutputVal0, qOutputVal0; +++ +++ unsigned int sixteenthPoints = num_points / 16; +++ +++ for (number = 0; number < sixteenthPoints; number++) { +++ complexVal = _mm256_load_si256((__m256i*)complexVectorPtr); +++ complexVectorPtr += 32; +++ +++ complexVal = _mm256_shuffle_epi8(complexVal, MoveMask); +++ complexVal = _mm256_permute4x64_epi64(complexVal, 0xd8); +++ +++ iOutputVal0 = _mm256_extracti128_si256(complexVal, 0); +++ qOutputVal0 = _mm256_extracti128_si256(complexVal, 1); +++ +++ iOutputVal = _mm256_cvtepi8_epi16(iOutputVal0); +++ iOutputVal = _mm256_slli_epi16(iOutputVal, 8); +++ +++ qOutputVal = _mm256_cvtepi8_epi16(qOutputVal0); +++ qOutputVal = _mm256_slli_epi16(qOutputVal, 8); +++ +++ _mm256_store_si256((__m256i*)iBufferPtr, iOutputVal); +++ _mm256_store_si256((__m256i*)qBufferPtr, qOutputVal); +++ +++ iBufferPtr += 16; +++ qBufferPtr += 16; +++ } +++ +++ number = sixteenthPoints * 16; +++ for (; number < num_points; number++) { +++ *iBufferPtr++ = +++ ((int16_t)*complexVectorPtr++) * +++ 256; // load 8 bit Complexvector into 16 bit, shift left by 8 bits and store +++ *qBufferPtr++ = ((int16_t)*complexVectorPtr++) * 256; +++ } ++ } ++ 
#endif /* LV_HAVE_AVX2 */ ++ ++ #ifdef LV_HAVE_SSE4_1 ++ #include ++ ++-static inline void ++-volk_8ic_deinterleave_16i_x2_a_sse4_1(int16_t* iBuffer, int16_t* qBuffer, ++- const lv_8sc_t* complexVector, unsigned int num_points) +++static inline void volk_8ic_deinterleave_16i_x2_a_sse4_1(int16_t* iBuffer, +++ int16_t* qBuffer, +++ const lv_8sc_t* complexVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const int8_t* complexVectorPtr = (int8_t*)complexVector; ++- int16_t* iBufferPtr = iBuffer; ++- int16_t* qBufferPtr = qBuffer; ++- __m128i iMoveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0); // set 16 byte values ++- __m128i qMoveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 15, 13, 11, 9, 7, 5, 3, 1); ++- __m128i complexVal, iOutputVal, qOutputVal; ++- ++- unsigned int eighthPoints = num_points / 8; ++- ++- for(number = 0; number < eighthPoints; number++){ ++- complexVal = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; // aligned load ++- ++- iOutputVal = _mm_shuffle_epi8(complexVal, iMoveMask); // shuffle 16 bytes of 128bit complexVal ++- qOutputVal = _mm_shuffle_epi8(complexVal, qMoveMask); ++- ++- iOutputVal = _mm_cvtepi8_epi16(iOutputVal); // fills 2-byte sign extended versions of lower 8 bytes of input to output ++- iOutputVal = _mm_slli_epi16(iOutputVal, 8); // shift in left by 8 bits, each of the 8 16-bit integers, shift in with zeros ++- ++- qOutputVal = _mm_cvtepi8_epi16(qOutputVal); ++- qOutputVal = _mm_slli_epi16(qOutputVal, 8); ++- ++- _mm_store_si128((__m128i*)iBufferPtr, iOutputVal); // aligned store ++- _mm_store_si128((__m128i*)qBufferPtr, qOutputVal); ++- ++- iBufferPtr += 8; ++- qBufferPtr += 8; ++- } ++- ++- number = eighthPoints * 8; ++- for(; number < num_points; number++){ ++- *iBufferPtr++ = ((int16_t)*complexVectorPtr++) * 256; // load 8 bit Complexvector into 16 bit, shift left by 8 bits and store ++- *qBufferPtr++ = ((int16_t)*complexVectorPtr++) * 256; ++- } +++ unsigned int number = 0; +++ const int8_t* complexVectorPtr = (int8_t*)complexVector; +++ int16_t* iBufferPtr = iBuffer; +++ int16_t* qBufferPtr = qBuffer; +++ __m128i iMoveMask = _mm_set_epi8(0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 14, +++ 12, +++ 10, +++ 8, +++ 6, +++ 4, +++ 2, +++ 0); // set 16 byte values +++ __m128i qMoveMask = _mm_set_epi8( +++ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 15, 13, 11, 9, 7, 5, 3, 1); +++ __m128i complexVal, iOutputVal, qOutputVal; +++ +++ unsigned int eighthPoints = num_points / 8; +++ +++ for (number = 0; number < eighthPoints; number++) { +++ complexVal = _mm_load_si128((__m128i*)complexVectorPtr); +++ complexVectorPtr += 16; // aligned load +++ +++ iOutputVal = _mm_shuffle_epi8(complexVal, +++ iMoveMask); // shuffle 16 bytes of 128bit complexVal +++ qOutputVal = _mm_shuffle_epi8(complexVal, qMoveMask); +++ +++ iOutputVal = _mm_cvtepi8_epi16(iOutputVal); // fills 2-byte sign extended versions +++ // of lower 8 bytes of input to output +++ iOutputVal = +++ _mm_slli_epi16(iOutputVal, 8); // shift in left by 8 bits, each of the 8 +++ // 16-bit integers, shift in with zeros +++ +++ qOutputVal = _mm_cvtepi8_epi16(qOutputVal); +++ qOutputVal = _mm_slli_epi16(qOutputVal, 8); +++ +++ _mm_store_si128((__m128i*)iBufferPtr, iOutputVal); // aligned store +++ _mm_store_si128((__m128i*)qBufferPtr, qOutputVal); +++ +++ iBufferPtr += 8; +++ qBufferPtr += 8; +++ } +++ +++ number = eighthPoints * 8; +++ for (; number < 
num_points; number++) { +++ *iBufferPtr++ = +++ ((int16_t)*complexVectorPtr++) * +++ 256; // load 8 bit Complexvector into 16 bit, shift left by 8 bits and store +++ *qBufferPtr++ = ((int16_t)*complexVectorPtr++) * 256; +++ } ++ } ++ #endif /* LV_HAVE_SSE4_1 */ ++ ++@@ -152,86 +211,111 @@ volk_8ic_deinterleave_16i_x2_a_sse4_1(int16_t* iBuffer, int16_t* qBuffer, ++ #ifdef LV_HAVE_AVX ++ #include ++ ++-static inline void ++-volk_8ic_deinterleave_16i_x2_a_avx(int16_t* iBuffer, int16_t* qBuffer, ++- const lv_8sc_t* complexVector, unsigned int num_points) +++static inline void volk_8ic_deinterleave_16i_x2_a_avx(int16_t* iBuffer, +++ int16_t* qBuffer, +++ const lv_8sc_t* complexVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const int8_t* complexVectorPtr = (int8_t*)complexVector; ++- int16_t* iBufferPtr = iBuffer; ++- int16_t* qBufferPtr = qBuffer; ++- __m128i iMoveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0); // set 16 byte values ++- __m128i qMoveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 15, 13, 11, 9, 7, 5, 3, 1); ++- __m256i complexVal, iOutputVal, qOutputVal; ++- __m128i complexVal1, complexVal0; ++- __m128i iOutputVal1, iOutputVal0, qOutputVal1, qOutputVal0; ++- ++- unsigned int sixteenthPoints = num_points / 16; ++- ++- for(number = 0; number < sixteenthPoints; number++){ ++- complexVal = _mm256_load_si256((__m256i*)complexVectorPtr); complexVectorPtr += 32; // aligned load ++- ++- // Extract from complexVal to iOutputVal and qOutputVal ++- complexVal1 = _mm256_extractf128_si256(complexVal, 1); ++- complexVal0 = _mm256_extractf128_si256(complexVal, 0); ++- ++- iOutputVal1 = _mm_shuffle_epi8(complexVal1, iMoveMask); // shuffle 16 bytes of 128bit complexVal ++- iOutputVal0 = _mm_shuffle_epi8(complexVal0, iMoveMask); ++- qOutputVal1 = _mm_shuffle_epi8(complexVal1, qMoveMask); ++- qOutputVal0 = _mm_shuffle_epi8(complexVal0, qMoveMask); ++- ++- iOutputVal1 = _mm_cvtepi8_epi16(iOutputVal1); // fills 2-byte sign extended versions of lower 8 bytes of input to output ++- iOutputVal1 = _mm_slli_epi16(iOutputVal1, 8); // shift in left by 8 bits, each of the 8 16-bit integers, shift in with zeros ++- iOutputVal0 = _mm_cvtepi8_epi16(iOutputVal0); ++- iOutputVal0 = _mm_slli_epi16(iOutputVal0, 8); ++- ++- qOutputVal1 = _mm_cvtepi8_epi16(qOutputVal1); ++- qOutputVal1 = _mm_slli_epi16(qOutputVal1, 8); ++- qOutputVal0 = _mm_cvtepi8_epi16(qOutputVal0); ++- qOutputVal0 = _mm_slli_epi16(qOutputVal0, 8); ++- ++- // Pack iOutputVal0,1 to iOutputVal ++- __m256i dummy = _mm256_setzero_si256(); ++- iOutputVal = _mm256_insertf128_si256(dummy, iOutputVal0, 0); ++- iOutputVal = _mm256_insertf128_si256(iOutputVal, iOutputVal1, 1); ++- qOutputVal = _mm256_insertf128_si256(dummy, qOutputVal0, 0); ++- qOutputVal = _mm256_insertf128_si256(qOutputVal, qOutputVal1, 1); ++- ++- _mm256_store_si256((__m256i*)iBufferPtr, iOutputVal); // aligned store ++- _mm256_store_si256((__m256i*)qBufferPtr, qOutputVal); ++- ++- iBufferPtr += 16; ++- qBufferPtr += 16; ++- } ++- ++- number = sixteenthPoints * 16; ++- for(; number < num_points; number++){ ++- *iBufferPtr++ = ((int16_t)*complexVectorPtr++) * 256; // load 8 bit Complexvector into 16 bit, shift left by 8 bits and store ++- *qBufferPtr++ = ((int16_t)*complexVectorPtr++) * 256; ++- } +++ unsigned int number = 0; +++ const int8_t* complexVectorPtr = (int8_t*)complexVector; +++ int16_t* iBufferPtr = iBuffer; +++ int16_t* qBufferPtr = qBuffer; +++ __m128i iMoveMask = 
_mm_set_epi8(0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 14, +++ 12, +++ 10, +++ 8, +++ 6, +++ 4, +++ 2, +++ 0); // set 16 byte values +++ __m128i qMoveMask = _mm_set_epi8( +++ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 15, 13, 11, 9, 7, 5, 3, 1); +++ __m256i complexVal, iOutputVal, qOutputVal; +++ __m128i complexVal1, complexVal0; +++ __m128i iOutputVal1, iOutputVal0, qOutputVal1, qOutputVal0; +++ +++ unsigned int sixteenthPoints = num_points / 16; +++ +++ for (number = 0; number < sixteenthPoints; number++) { +++ complexVal = _mm256_load_si256((__m256i*)complexVectorPtr); +++ complexVectorPtr += 32; // aligned load +++ +++ // Extract from complexVal to iOutputVal and qOutputVal +++ complexVal1 = _mm256_extractf128_si256(complexVal, 1); +++ complexVal0 = _mm256_extractf128_si256(complexVal, 0); +++ +++ iOutputVal1 = _mm_shuffle_epi8( +++ complexVal1, iMoveMask); // shuffle 16 bytes of 128bit complexVal +++ iOutputVal0 = _mm_shuffle_epi8(complexVal0, iMoveMask); +++ qOutputVal1 = _mm_shuffle_epi8(complexVal1, qMoveMask); +++ qOutputVal0 = _mm_shuffle_epi8(complexVal0, qMoveMask); +++ +++ iOutputVal1 = +++ _mm_cvtepi8_epi16(iOutputVal1); // fills 2-byte sign extended versions of +++ // lower 8 bytes of input to output +++ iOutputVal1 = +++ _mm_slli_epi16(iOutputVal1, 8); // shift in left by 8 bits, each of the 8 +++ // 16-bit integers, shift in with zeros +++ iOutputVal0 = _mm_cvtepi8_epi16(iOutputVal0); +++ iOutputVal0 = _mm_slli_epi16(iOutputVal0, 8); +++ +++ qOutputVal1 = _mm_cvtepi8_epi16(qOutputVal1); +++ qOutputVal1 = _mm_slli_epi16(qOutputVal1, 8); +++ qOutputVal0 = _mm_cvtepi8_epi16(qOutputVal0); +++ qOutputVal0 = _mm_slli_epi16(qOutputVal0, 8); +++ +++ // Pack iOutputVal0,1 to iOutputVal +++ __m256i dummy = _mm256_setzero_si256(); +++ iOutputVal = _mm256_insertf128_si256(dummy, iOutputVal0, 0); +++ iOutputVal = _mm256_insertf128_si256(iOutputVal, iOutputVal1, 1); +++ qOutputVal = _mm256_insertf128_si256(dummy, qOutputVal0, 0); +++ qOutputVal = _mm256_insertf128_si256(qOutputVal, qOutputVal1, 1); +++ +++ _mm256_store_si256((__m256i*)iBufferPtr, iOutputVal); // aligned store +++ _mm256_store_si256((__m256i*)qBufferPtr, qOutputVal); +++ +++ iBufferPtr += 16; +++ qBufferPtr += 16; +++ } +++ +++ number = sixteenthPoints * 16; +++ for (; number < num_points; number++) { +++ *iBufferPtr++ = +++ ((int16_t)*complexVectorPtr++) * +++ 256; // load 8 bit Complexvector into 16 bit, shift left by 8 bits and store +++ *qBufferPtr++ = ((int16_t)*complexVectorPtr++) * 256; +++ } ++ } ++ #endif /* LV_HAVE_AVX */ ++ ++ ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void ++-volk_8ic_deinterleave_16i_x2_generic(int16_t* iBuffer, int16_t* qBuffer, ++- const lv_8sc_t* complexVector, unsigned int num_points) +++static inline void volk_8ic_deinterleave_16i_x2_generic(int16_t* iBuffer, +++ int16_t* qBuffer, +++ const lv_8sc_t* complexVector, +++ unsigned int num_points) ++ { ++- const int8_t* complexVectorPtr = (const int8_t*)complexVector; ++- int16_t* iBufferPtr = iBuffer; ++- int16_t* qBufferPtr = qBuffer; ++- unsigned int number; ++- for(number = 0; number < num_points; number++){ ++- *iBufferPtr++ = (int16_t)(*complexVectorPtr++)*256; ++- *qBufferPtr++ = (int16_t)(*complexVectorPtr++)*256; ++- } +++ const int8_t* complexVectorPtr = (const int8_t*)complexVector; +++ int16_t* iBufferPtr = iBuffer; +++ int16_t* qBufferPtr = qBuffer; +++ unsigned int number; +++ for (number = 0; number < num_points; number++) { +++ *iBufferPtr++ = (int16_t)(*complexVectorPtr++) * 
256; +++ *qBufferPtr++ = (int16_t)(*complexVectorPtr++) * 256; +++ } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++ ++- ++ #endif /* INCLUDED_volk_8ic_deinterleave_16i_x2_a_H */ ++ ++ #ifndef INCLUDED_volk_8ic_deinterleave_16i_x2_u_H ++@@ -243,47 +327,82 @@ volk_8ic_deinterleave_16i_x2_generic(int16_t* iBuffer, int16_t* qBuffer, ++ #ifdef LV_HAVE_AVX2 ++ #include ++ ++-static inline void ++-volk_8ic_deinterleave_16i_x2_u_avx2(int16_t* iBuffer, int16_t* qBuffer, ++- const lv_8sc_t* complexVector, unsigned int num_points) +++static inline void volk_8ic_deinterleave_16i_x2_u_avx2(int16_t* iBuffer, +++ int16_t* qBuffer, +++ const lv_8sc_t* complexVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const int8_t* complexVectorPtr = (int8_t*)complexVector; ++- int16_t* iBufferPtr = iBuffer; ++- int16_t* qBufferPtr = qBuffer; ++- __m256i MoveMask = _mm256_set_epi8(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0, 15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0); ++- __m256i complexVal, iOutputVal, qOutputVal; ++- __m128i iOutputVal0, qOutputVal0; ++- ++- unsigned int sixteenthPoints = num_points / 16; ++- ++- for(number = 0; number < sixteenthPoints; number++){ ++- complexVal = _mm256_loadu_si256((__m256i*)complexVectorPtr); complexVectorPtr += 32; ++- ++- complexVal = _mm256_shuffle_epi8(complexVal, MoveMask); ++- complexVal = _mm256_permute4x64_epi64(complexVal, 0xd8); ++- ++- iOutputVal0 = _mm256_extracti128_si256(complexVal, 0); ++- qOutputVal0 = _mm256_extracti128_si256(complexVal, 1); ++- ++- iOutputVal = _mm256_cvtepi8_epi16(iOutputVal0); ++- iOutputVal = _mm256_slli_epi16(iOutputVal, 8); ++- ++- qOutputVal = _mm256_cvtepi8_epi16(qOutputVal0); ++- qOutputVal = _mm256_slli_epi16(qOutputVal, 8); ++- ++- _mm256_storeu_si256((__m256i*)iBufferPtr, iOutputVal); ++- _mm256_storeu_si256((__m256i*)qBufferPtr, qOutputVal); ++- ++- iBufferPtr += 16; ++- qBufferPtr += 16; ++- } ++- ++- number = sixteenthPoints * 16; ++- for(; number < num_points; number++){ ++- *iBufferPtr++ = ((int16_t)*complexVectorPtr++) * 256; // load 8 bit Complexvector into 16 bit, shift left by 8 bits and store ++- *qBufferPtr++ = ((int16_t)*complexVectorPtr++) * 256; ++- } +++ unsigned int number = 0; +++ const int8_t* complexVectorPtr = (int8_t*)complexVector; +++ int16_t* iBufferPtr = iBuffer; +++ int16_t* qBufferPtr = qBuffer; +++ __m256i MoveMask = _mm256_set_epi8(15, +++ 13, +++ 11, +++ 9, +++ 7, +++ 5, +++ 3, +++ 1, +++ 14, +++ 12, +++ 10, +++ 8, +++ 6, +++ 4, +++ 2, +++ 0, +++ 15, +++ 13, +++ 11, +++ 9, +++ 7, +++ 5, +++ 3, +++ 1, +++ 14, +++ 12, +++ 10, +++ 8, +++ 6, +++ 4, +++ 2, +++ 0); +++ __m256i complexVal, iOutputVal, qOutputVal; +++ __m128i iOutputVal0, qOutputVal0; +++ +++ unsigned int sixteenthPoints = num_points / 16; +++ +++ for (number = 0; number < sixteenthPoints; number++) { +++ complexVal = _mm256_loadu_si256((__m256i*)complexVectorPtr); +++ complexVectorPtr += 32; +++ +++ complexVal = _mm256_shuffle_epi8(complexVal, MoveMask); +++ complexVal = _mm256_permute4x64_epi64(complexVal, 0xd8); +++ +++ iOutputVal0 = _mm256_extracti128_si256(complexVal, 0); +++ qOutputVal0 = _mm256_extracti128_si256(complexVal, 1); +++ +++ iOutputVal = _mm256_cvtepi8_epi16(iOutputVal0); +++ iOutputVal = _mm256_slli_epi16(iOutputVal, 8); +++ +++ qOutputVal = _mm256_cvtepi8_epi16(qOutputVal0); +++ qOutputVal = _mm256_slli_epi16(qOutputVal, 8); +++ +++ _mm256_storeu_si256((__m256i*)iBufferPtr, iOutputVal); +++ _mm256_storeu_si256((__m256i*)qBufferPtr, qOutputVal); +++ +++ iBufferPtr += 16; +++ qBufferPtr 
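As above, a minimal usage sketch for volk_8ic_deinterleave_real_16i based on the prototype quoted in the preceding hunk; the helper name and buffer size are illustrative assumptions. The kernels in the following hunk keep only the real (I) component and scale it by 128 (a left shift by 7).

#include <volk/volk.h>

static void example_8ic_deinterleave_real_16i(const lv_8sc_t* complex_in,
                                              unsigned int num_points)
{
    int16_t* i_buf =
        (int16_t*)volk_malloc(num_points * sizeof(int16_t), volk_get_alignment());

    /* Extracts the real part of each 8-bit complex sample as int16, scaled by 128 */
    volk_8ic_deinterleave_real_16i(i_buf, complex_in, num_points);

    volk_free(i_buf);
}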
+= 16; +++ } +++ +++ number = sixteenthPoints * 16; +++ for (; number < num_points; number++) { +++ *iBufferPtr++ = +++ ((int16_t)*complexVectorPtr++) * +++ 256; // load 8 bit Complexvector into 16 bit, shift left by 8 bits and store +++ *qBufferPtr++ = ((int16_t)*complexVectorPtr++) * 256; +++ } ++ } ++ #endif /* LV_HAVE_AVX2 */ ++ #endif /* INCLUDED_volk_8ic_deinterleave_16i_x2_u_H */ ++diff --git a/kernels/volk/volk_8ic_deinterleave_real_16i.h b/kernels/volk/volk_8ic_deinterleave_real_16i.h ++index f15879a..aaebb47 100644 ++--- a/kernels/volk/volk_8ic_deinterleave_real_16i.h +++++ b/kernels/volk/volk_8ic_deinterleave_real_16i.h ++@@ -30,8 +30,8 @@ ++ * ++ * Dispatcher Prototype ++ * \code ++- * void volk_8ic_deinterleave_real_16i(int16_t* iBuffer, const lv_8sc_t* complexVector, unsigned int num_points) ++- * \endcode +++ * void volk_8ic_deinterleave_real_16i(int16_t* iBuffer, const lv_8sc_t* complexVector, +++ * unsigned int num_points) \endcode ++ * ++ * \b Inputs ++ * \li complexVector: The complex input vector. ++@@ -60,75 +60,109 @@ ++ #ifdef LV_HAVE_AVX2 ++ #include ++ ++-static inline void ++-volk_8ic_deinterleave_real_16i_a_avx2(int16_t* iBuffer, const lv_8sc_t* complexVector, ++- unsigned int num_points) +++static inline void volk_8ic_deinterleave_real_16i_a_avx2(int16_t* iBuffer, +++ const lv_8sc_t* complexVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const int8_t* complexVectorPtr = (int8_t*)complexVector; ++- int16_t* iBufferPtr = iBuffer; ++- __m256i moveMask = _mm256_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0); ++- __m256i complexVal, outputVal; ++- __m128i outputVal0; ++- ++- unsigned int sixteenthPoints = num_points / 16; ++- ++- for(number = 0; number < sixteenthPoints; number++){ ++- complexVal = _mm256_load_si256((__m256i*)complexVectorPtr); complexVectorPtr += 32; ++- ++- complexVal = _mm256_shuffle_epi8(complexVal, moveMask); ++- complexVal = _mm256_permute4x64_epi64(complexVal, 0xd8); ++- ++- outputVal0 = _mm256_extractf128_si256(complexVal, 0); ++- ++- outputVal = _mm256_cvtepi8_epi16(outputVal0); ++- outputVal = _mm256_slli_epi16(outputVal, 7); ++- ++- _mm256_store_si256((__m256i*)iBufferPtr, outputVal); ++- ++- iBufferPtr += 16; ++- } ++- ++- number = sixteenthPoints * 16; ++- for(; number < num_points; number++){ ++- *iBufferPtr++ = ((int16_t)*complexVectorPtr++) * 128; ++- complexVectorPtr++; ++- } +++ unsigned int number = 0; +++ const int8_t* complexVectorPtr = (int8_t*)complexVector; +++ int16_t* iBufferPtr = iBuffer; +++ __m256i moveMask = _mm256_set_epi8(0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 14, +++ 12, +++ 10, +++ 8, +++ 6, +++ 4, +++ 2, +++ 0, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 14, +++ 12, +++ 10, +++ 8, +++ 6, +++ 4, +++ 2, +++ 0); +++ __m256i complexVal, outputVal; +++ __m128i outputVal0; +++ +++ unsigned int sixteenthPoints = num_points / 16; +++ +++ for (number = 0; number < sixteenthPoints; number++) { +++ complexVal = _mm256_load_si256((__m256i*)complexVectorPtr); +++ complexVectorPtr += 32; +++ +++ complexVal = _mm256_shuffle_epi8(complexVal, moveMask); +++ complexVal = _mm256_permute4x64_epi64(complexVal, 0xd8); +++ +++ outputVal0 = _mm256_extractf128_si256(complexVal, 0); +++ +++ outputVal = _mm256_cvtepi8_epi16(outputVal0); +++ outputVal = _mm256_slli_epi16(outputVal, 7); +++ +++ 
_mm256_store_si256((__m256i*)iBufferPtr, outputVal); +++ +++ iBufferPtr += 16; +++ } +++ +++ number = sixteenthPoints * 16; +++ for (; number < num_points; number++) { +++ *iBufferPtr++ = ((int16_t)*complexVectorPtr++) * 128; +++ complexVectorPtr++; +++ } ++ } ++ #endif /* LV_HAVE_AVX2 */ ++ ++ #ifdef LV_HAVE_SSE4_1 ++ #include ++ ++-static inline void ++-volk_8ic_deinterleave_real_16i_a_sse4_1(int16_t* iBuffer, const lv_8sc_t* complexVector, ++- unsigned int num_points) +++static inline void volk_8ic_deinterleave_real_16i_a_sse4_1(int16_t* iBuffer, +++ const lv_8sc_t* complexVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const int8_t* complexVectorPtr = (int8_t*)complexVector; ++- int16_t* iBufferPtr = iBuffer; ++- __m128i moveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0); ++- __m128i complexVal, outputVal; +++ unsigned int number = 0; +++ const int8_t* complexVectorPtr = (int8_t*)complexVector; +++ int16_t* iBufferPtr = iBuffer; +++ __m128i moveMask = _mm_set_epi8( +++ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0); +++ __m128i complexVal, outputVal; ++ ++- unsigned int eighthPoints = num_points / 8; +++ unsigned int eighthPoints = num_points / 8; ++ ++- for(number = 0; number < eighthPoints; number++){ ++- complexVal = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; +++ for (number = 0; number < eighthPoints; number++) { +++ complexVal = _mm_load_si128((__m128i*)complexVectorPtr); +++ complexVectorPtr += 16; ++ ++- complexVal = _mm_shuffle_epi8(complexVal, moveMask); +++ complexVal = _mm_shuffle_epi8(complexVal, moveMask); ++ ++- outputVal = _mm_cvtepi8_epi16(complexVal); ++- outputVal = _mm_slli_epi16(outputVal, 7); +++ outputVal = _mm_cvtepi8_epi16(complexVal); +++ outputVal = _mm_slli_epi16(outputVal, 7); ++ ++- _mm_store_si128((__m128i*)iBufferPtr, outputVal); ++- iBufferPtr += 8; ++- } +++ _mm_store_si128((__m128i*)iBufferPtr, outputVal); +++ iBufferPtr += 8; +++ } ++ ++- number = eighthPoints * 8; ++- for(; number < num_points; number++){ ++- *iBufferPtr++ = ((int16_t)*complexVectorPtr++) * 128; ++- complexVectorPtr++; ++- } +++ number = eighthPoints * 8; +++ for (; number < num_points; number++) { +++ *iBufferPtr++ = ((int16_t)*complexVectorPtr++) * 128; +++ complexVectorPtr++; +++ } ++ } ++ #endif /* LV_HAVE_SSE4_1 */ ++ ++@@ -136,63 +170,65 @@ volk_8ic_deinterleave_real_16i_a_sse4_1(int16_t* iBuffer, const lv_8sc_t* comple ++ #ifdef LV_HAVE_AVX ++ #include ++ ++-static inline void ++-volk_8ic_deinterleave_real_16i_a_avx(int16_t* iBuffer, const lv_8sc_t* complexVector, ++- unsigned int num_points) +++static inline void volk_8ic_deinterleave_real_16i_a_avx(int16_t* iBuffer, +++ const lv_8sc_t* complexVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const int8_t* complexVectorPtr = (int8_t*)complexVector; ++- int16_t* iBufferPtr = iBuffer; ++- __m128i moveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0); ++- __m256i complexVal, outputVal; ++- __m128i complexVal1, complexVal0, outputVal1, outputVal0; ++- ++- unsigned int sixteenthPoints = num_points / 16; ++- ++- for(number = 0; number < sixteenthPoints; number++){ ++- complexVal = _mm256_load_si256((__m256i*)complexVectorPtr); complexVectorPtr += 32; ++- ++- complexVal1 = _mm256_extractf128_si256(complexVal, 1); ++- complexVal0 = _mm256_extractf128_si256(complexVal, 0); ++- ++- outputVal1 = _mm_shuffle_epi8(complexVal1, moveMask); ++- 
outputVal0 = _mm_shuffle_epi8(complexVal0, moveMask); ++- ++- outputVal1 = _mm_cvtepi8_epi16(outputVal1); ++- outputVal1 = _mm_slli_epi16(outputVal1, 7); ++- outputVal0 = _mm_cvtepi8_epi16(outputVal0); ++- outputVal0 = _mm_slli_epi16(outputVal0, 7); ++- ++- __m256i dummy = _mm256_setzero_si256(); ++- outputVal = _mm256_insertf128_si256(dummy, outputVal0, 0); ++- outputVal = _mm256_insertf128_si256(outputVal, outputVal1, 1); ++- _mm256_store_si256((__m256i*)iBufferPtr, outputVal); ++- ++- iBufferPtr += 16; ++- } ++- ++- number = sixteenthPoints * 16; ++- for(; number < num_points; number++){ ++- *iBufferPtr++ = ((int16_t)*complexVectorPtr++) * 128; ++- complexVectorPtr++; ++- } +++ unsigned int number = 0; +++ const int8_t* complexVectorPtr = (int8_t*)complexVector; +++ int16_t* iBufferPtr = iBuffer; +++ __m128i moveMask = _mm_set_epi8( +++ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0); +++ __m256i complexVal, outputVal; +++ __m128i complexVal1, complexVal0, outputVal1, outputVal0; +++ +++ unsigned int sixteenthPoints = num_points / 16; +++ +++ for (number = 0; number < sixteenthPoints; number++) { +++ complexVal = _mm256_load_si256((__m256i*)complexVectorPtr); +++ complexVectorPtr += 32; +++ +++ complexVal1 = _mm256_extractf128_si256(complexVal, 1); +++ complexVal0 = _mm256_extractf128_si256(complexVal, 0); +++ +++ outputVal1 = _mm_shuffle_epi8(complexVal1, moveMask); +++ outputVal0 = _mm_shuffle_epi8(complexVal0, moveMask); +++ +++ outputVal1 = _mm_cvtepi8_epi16(outputVal1); +++ outputVal1 = _mm_slli_epi16(outputVal1, 7); +++ outputVal0 = _mm_cvtepi8_epi16(outputVal0); +++ outputVal0 = _mm_slli_epi16(outputVal0, 7); +++ +++ __m256i dummy = _mm256_setzero_si256(); +++ outputVal = _mm256_insertf128_si256(dummy, outputVal0, 0); +++ outputVal = _mm256_insertf128_si256(outputVal, outputVal1, 1); +++ _mm256_store_si256((__m256i*)iBufferPtr, outputVal); +++ +++ iBufferPtr += 16; +++ } +++ +++ number = sixteenthPoints * 16; +++ for (; number < num_points; number++) { +++ *iBufferPtr++ = ((int16_t)*complexVectorPtr++) * 128; +++ complexVectorPtr++; +++ } ++ } ++ #endif /* LV_HAVE_AVX */ ++ ++ ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void ++-volk_8ic_deinterleave_real_16i_generic(int16_t* iBuffer, const lv_8sc_t* complexVector, ++- unsigned int num_points) +++static inline void volk_8ic_deinterleave_real_16i_generic(int16_t* iBuffer, +++ const lv_8sc_t* complexVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const int8_t* complexVectorPtr = (const int8_t*)complexVector; ++- int16_t* iBufferPtr = iBuffer; ++- for(number = 0; number < num_points; number++){ ++- *iBufferPtr++ = ((int16_t)(*complexVectorPtr++)) * 128; ++- complexVectorPtr++; ++- } +++ unsigned int number = 0; +++ const int8_t* complexVectorPtr = (const int8_t*)complexVector; +++ int16_t* iBufferPtr = iBuffer; +++ for (number = 0; number < num_points; number++) { +++ *iBufferPtr++ = ((int16_t)(*complexVectorPtr++)) * 128; +++ complexVectorPtr++; +++ } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++@@ -209,40 +245,72 @@ volk_8ic_deinterleave_real_16i_generic(int16_t* iBuffer, const lv_8sc_t* complex ++ #ifdef LV_HAVE_AVX2 ++ #include ++ ++-static inline void ++-volk_8ic_deinterleave_real_16i_u_avx2(int16_t* iBuffer, const lv_8sc_t* complexVector, ++- unsigned int num_points) +++static inline void volk_8ic_deinterleave_real_16i_u_avx2(int16_t* iBuffer, +++ const lv_8sc_t* complexVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const int8_t* complexVectorPtr = 
(int8_t*)complexVector; ++- int16_t* iBufferPtr = iBuffer; ++- __m256i moveMask = _mm256_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0); ++- __m256i complexVal, outputVal; ++- __m128i outputVal0; ++- ++- unsigned int sixteenthPoints = num_points / 16; ++- ++- for(number = 0; number < sixteenthPoints; number++){ ++- complexVal = _mm256_loadu_si256((__m256i*)complexVectorPtr); complexVectorPtr += 32; ++- ++- complexVal = _mm256_shuffle_epi8(complexVal, moveMask); ++- complexVal = _mm256_permute4x64_epi64(complexVal, 0xd8); ++- ++- outputVal0 = _mm256_extractf128_si256(complexVal, 0); ++- ++- outputVal = _mm256_cvtepi8_epi16(outputVal0); ++- outputVal = _mm256_slli_epi16(outputVal, 7); ++- ++- _mm256_storeu_si256((__m256i*)iBufferPtr, outputVal); ++- ++- iBufferPtr += 16; ++- } ++- ++- number = sixteenthPoints * 16; ++- for(; number < num_points; number++){ ++- *iBufferPtr++ = ((int16_t)*complexVectorPtr++) * 128; ++- complexVectorPtr++; ++- } +++ unsigned int number = 0; +++ const int8_t* complexVectorPtr = (int8_t*)complexVector; +++ int16_t* iBufferPtr = iBuffer; +++ __m256i moveMask = _mm256_set_epi8(0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 14, +++ 12, +++ 10, +++ 8, +++ 6, +++ 4, +++ 2, +++ 0, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 14, +++ 12, +++ 10, +++ 8, +++ 6, +++ 4, +++ 2, +++ 0); +++ __m256i complexVal, outputVal; +++ __m128i outputVal0; +++ +++ unsigned int sixteenthPoints = num_points / 16; +++ +++ for (number = 0; number < sixteenthPoints; number++) { +++ complexVal = _mm256_loadu_si256((__m256i*)complexVectorPtr); +++ complexVectorPtr += 32; +++ +++ complexVal = _mm256_shuffle_epi8(complexVal, moveMask); +++ complexVal = _mm256_permute4x64_epi64(complexVal, 0xd8); +++ +++ outputVal0 = _mm256_extractf128_si256(complexVal, 0); +++ +++ outputVal = _mm256_cvtepi8_epi16(outputVal0); +++ outputVal = _mm256_slli_epi16(outputVal, 7); +++ +++ _mm256_storeu_si256((__m256i*)iBufferPtr, outputVal); +++ +++ iBufferPtr += 16; +++ } +++ +++ number = sixteenthPoints * 16; +++ for (; number < num_points; number++) { +++ *iBufferPtr++ = ((int16_t)*complexVectorPtr++) * 128; +++ complexVectorPtr++; +++ } ++ } ++ #endif /* LV_HAVE_AVX2 */ ++ #endif /* INCLUDED_volk_8ic_deinterleave_real_16i_u_H */ ++diff --git a/kernels/volk/volk_8ic_deinterleave_real_8i.h b/kernels/volk/volk_8ic_deinterleave_real_8i.h ++index 6cc3f15..a1a835d 100644 ++--- a/kernels/volk/volk_8ic_deinterleave_real_8i.h +++++ b/kernels/volk/volk_8ic_deinterleave_real_8i.h ++@@ -30,8 +30,8 @@ ++ * ++ * Dispatcher Prototype ++ * \code ++- * void volk_8ic_deinterleave_real_8i(int8_t* iBuffer, const lv_8sc_t* complexVector, unsigned int num_points) ++- * \endcode +++ * void volk_8ic_deinterleave_real_8i(int8_t* iBuffer, const lv_8sc_t* complexVector, +++ * unsigned int num_points) \endcode ++ * ++ * \b Inputs ++ * \li complexVector: The complex input vector. 
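Likewise, a minimal usage sketch for volk_8ic_deinterleave_real_8i from the prototype quoted above; the helper name is an illustrative assumption. Unlike the 16i variant, the scalar tail loops in the following hunk copy the real byte through unscaled.

#include <volk/volk.h>

static void example_8ic_deinterleave_real_8i(const lv_8sc_t* complex_in,
                                             unsigned int num_points)
{
    int8_t* i_buf =
        (int8_t*)volk_malloc(num_points * sizeof(int8_t), volk_get_alignment());

    /* Copies the real (I) byte of each 8-bit complex sample, no scaling */
    volk_8ic_deinterleave_real_8i(i_buf, complex_in, num_points);

    volk_free(i_buf);
}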
++@@ -59,40 +59,102 @@ ++ #ifdef LV_HAVE_AVX2 ++ #include ++ ++-static inline void ++-volk_8ic_deinterleave_real_8i_a_avx2(int8_t* iBuffer, const lv_8sc_t* complexVector, ++- unsigned int num_points) +++static inline void volk_8ic_deinterleave_real_8i_a_avx2(int8_t* iBuffer, +++ const lv_8sc_t* complexVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const int8_t* complexVectorPtr = (int8_t*)complexVector; ++- int8_t* iBufferPtr = iBuffer; ++- __m256i moveMask1 = _mm256_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0); ++- __m256i moveMask2 = _mm256_set_epi8(14, 12, 10, 8, 6, 4, 2, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80); ++- __m256i complexVal1, complexVal2, outputVal; ++- ++- unsigned int thirtysecondPoints = num_points / 32; ++- ++- for(number = 0; number < thirtysecondPoints; number++){ ++- ++- complexVal1 = _mm256_load_si256((__m256i*)complexVectorPtr); ++- complexVectorPtr += 32; ++- complexVal2 = _mm256_load_si256((__m256i*)complexVectorPtr); ++- complexVectorPtr += 32; ++- ++- complexVal1 = _mm256_shuffle_epi8(complexVal1, moveMask1); ++- complexVal2 = _mm256_shuffle_epi8(complexVal2, moveMask2); ++- outputVal = _mm256_or_si256(complexVal1, complexVal2); ++- outputVal = _mm256_permute4x64_epi64(outputVal, 0xd8); ++- ++- _mm256_store_si256((__m256i*)iBufferPtr, outputVal); ++- iBufferPtr += 32; ++- } ++- ++- number = thirtysecondPoints * 32; ++- for(; number < num_points; number++){ ++- *iBufferPtr++ = *complexVectorPtr++; ++- complexVectorPtr++; ++- } +++ unsigned int number = 0; +++ const int8_t* complexVectorPtr = (int8_t*)complexVector; +++ int8_t* iBufferPtr = iBuffer; +++ __m256i moveMask1 = _mm256_set_epi8(0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 14, +++ 12, +++ 10, +++ 8, +++ 6, +++ 4, +++ 2, +++ 0, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 14, +++ 12, +++ 10, +++ 8, +++ 6, +++ 4, +++ 2, +++ 0); +++ __m256i moveMask2 = _mm256_set_epi8(14, +++ 12, +++ 10, +++ 8, +++ 6, +++ 4, +++ 2, +++ 0, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 14, +++ 12, +++ 10, +++ 8, +++ 6, +++ 4, +++ 2, +++ 0, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80); +++ __m256i complexVal1, complexVal2, outputVal; +++ +++ unsigned int thirtysecondPoints = num_points / 32; +++ +++ for (number = 0; number < thirtysecondPoints; number++) { +++ +++ complexVal1 = _mm256_load_si256((__m256i*)complexVectorPtr); +++ complexVectorPtr += 32; +++ complexVal2 = _mm256_load_si256((__m256i*)complexVectorPtr); +++ complexVectorPtr += 32; +++ +++ complexVal1 = _mm256_shuffle_epi8(complexVal1, moveMask1); +++ complexVal2 = _mm256_shuffle_epi8(complexVal2, moveMask2); +++ outputVal = _mm256_or_si256(complexVal1, complexVal2); +++ outputVal = _mm256_permute4x64_epi64(outputVal, 0xd8); +++ +++ _mm256_store_si256((__m256i*)iBufferPtr, outputVal); +++ iBufferPtr += 32; +++ } +++ +++ number = thirtysecondPoints * 32; +++ for (; number < num_points; number++) { +++ *iBufferPtr++ = *complexVectorPtr++; +++ complexVectorPtr++; +++ } ++ } ++ #endif /* LV_HAVE_AVX2 */ ++ ++@@ -100,37 +162,41 @@ volk_8ic_deinterleave_real_8i_a_avx2(int8_t* iBuffer, const lv_8sc_t* complexVec ++ #ifdef LV_HAVE_SSSE3 ++ #include ++ ++-static inline void 
++-volk_8ic_deinterleave_real_8i_a_ssse3(int8_t* iBuffer, const lv_8sc_t* complexVector, ++- unsigned int num_points) +++static inline void volk_8ic_deinterleave_real_8i_a_ssse3(int8_t* iBuffer, +++ const lv_8sc_t* complexVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const int8_t* complexVectorPtr = (int8_t*)complexVector; ++- int8_t* iBufferPtr = iBuffer; ++- __m128i moveMask1 = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0); ++- __m128i moveMask2 = _mm_set_epi8(14, 12, 10, 8, 6, 4, 2, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80); ++- __m128i complexVal1, complexVal2, outputVal; ++- ++- unsigned int sixteenthPoints = num_points / 16; ++- ++- for(number = 0; number < sixteenthPoints; number++){ ++- complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; ++- complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; ++- ++- complexVal1 = _mm_shuffle_epi8(complexVal1, moveMask1); ++- complexVal2 = _mm_shuffle_epi8(complexVal2, moveMask2); ++- ++- outputVal = _mm_or_si128(complexVal1, complexVal2); ++- ++- _mm_store_si128((__m128i*)iBufferPtr, outputVal); ++- iBufferPtr += 16; ++- } ++- ++- number = sixteenthPoints * 16; ++- for(; number < num_points; number++){ ++- *iBufferPtr++ = *complexVectorPtr++; ++- complexVectorPtr++; ++- } +++ unsigned int number = 0; +++ const int8_t* complexVectorPtr = (int8_t*)complexVector; +++ int8_t* iBufferPtr = iBuffer; +++ __m128i moveMask1 = _mm_set_epi8( +++ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0); +++ __m128i moveMask2 = _mm_set_epi8( +++ 14, 12, 10, 8, 6, 4, 2, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80); +++ __m128i complexVal1, complexVal2, outputVal; +++ +++ unsigned int sixteenthPoints = num_points / 16; +++ +++ for (number = 0; number < sixteenthPoints; number++) { +++ complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr); +++ complexVectorPtr += 16; +++ complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr); +++ complexVectorPtr += 16; +++ +++ complexVal1 = _mm_shuffle_epi8(complexVal1, moveMask1); +++ complexVal2 = _mm_shuffle_epi8(complexVal2, moveMask2); +++ +++ outputVal = _mm_or_si128(complexVal1, complexVal2); +++ +++ _mm_store_si128((__m128i*)iBufferPtr, outputVal); +++ iBufferPtr += 16; +++ } +++ +++ number = sixteenthPoints * 16; +++ for (; number < num_points; number++) { +++ *iBufferPtr++ = *complexVectorPtr++; +++ complexVectorPtr++; +++ } ++ } ++ #endif /* LV_HAVE_SSSE3 */ ++ ++@@ -138,72 +204,75 @@ volk_8ic_deinterleave_real_8i_a_ssse3(int8_t* iBuffer, const lv_8sc_t* complexVe ++ #ifdef LV_HAVE_AVX ++ #include ++ ++-static inline void ++-volk_8ic_deinterleave_real_8i_a_avx(int8_t* iBuffer, const lv_8sc_t* complexVector, ++- unsigned int num_points) +++static inline void volk_8ic_deinterleave_real_8i_a_avx(int8_t* iBuffer, +++ const lv_8sc_t* complexVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const int8_t* complexVectorPtr = (int8_t*)complexVector; ++- int8_t* iBufferPtr = iBuffer; ++- __m128i moveMaskL = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0); ++- __m128i moveMaskH = _mm_set_epi8(14, 12, 10, 8, 6, 4, 2, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80); ++- __m256i complexVal1, complexVal2, outputVal; ++- __m128i complexVal1H, complexVal1L, complexVal2H, complexVal2L, outputVal1, outputVal2; ++- ++- unsigned int thirtysecondPoints = num_points / 32; ++- ++- for(number = 0; number < 
thirtysecondPoints; number++){ ++- ++- complexVal1 = _mm256_load_si256((__m256i*)complexVectorPtr); ++- complexVectorPtr += 32; ++- complexVal2 = _mm256_load_si256((__m256i*)complexVectorPtr); ++- complexVectorPtr += 32; ++- ++- complexVal1H = _mm256_extractf128_si256(complexVal1, 1); ++- complexVal1L = _mm256_extractf128_si256(complexVal1, 0); ++- complexVal2H = _mm256_extractf128_si256(complexVal2, 1); ++- complexVal2L = _mm256_extractf128_si256(complexVal2, 0); ++- ++- complexVal1H = _mm_shuffle_epi8(complexVal1H, moveMaskH); ++- complexVal1L = _mm_shuffle_epi8(complexVal1L, moveMaskL); ++- outputVal1 = _mm_or_si128(complexVal1H, complexVal1L); ++- ++- ++- complexVal2H = _mm_shuffle_epi8(complexVal2H, moveMaskH); ++- complexVal2L = _mm_shuffle_epi8(complexVal2L, moveMaskL); ++- outputVal2 = _mm_or_si128(complexVal2H, complexVal2L); ++- ++- __m256i dummy = _mm256_setzero_si256(); ++- outputVal = _mm256_insertf128_si256(dummy, outputVal1, 0); ++- outputVal = _mm256_insertf128_si256(outputVal, outputVal2, 1); ++- ++- ++- _mm256_store_si256((__m256i*)iBufferPtr, outputVal); ++- iBufferPtr += 32; ++- } ++- ++- number = thirtysecondPoints * 32; ++- for(; number < num_points; number++){ ++- *iBufferPtr++ = *complexVectorPtr++; ++- complexVectorPtr++; ++- } +++ unsigned int number = 0; +++ const int8_t* complexVectorPtr = (int8_t*)complexVector; +++ int8_t* iBufferPtr = iBuffer; +++ __m128i moveMaskL = _mm_set_epi8( +++ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0); +++ __m128i moveMaskH = _mm_set_epi8( +++ 14, 12, 10, 8, 6, 4, 2, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80); +++ __m256i complexVal1, complexVal2, outputVal; +++ __m128i complexVal1H, complexVal1L, complexVal2H, complexVal2L, outputVal1, +++ outputVal2; +++ +++ unsigned int thirtysecondPoints = num_points / 32; +++ +++ for (number = 0; number < thirtysecondPoints; number++) { +++ +++ complexVal1 = _mm256_load_si256((__m256i*)complexVectorPtr); +++ complexVectorPtr += 32; +++ complexVal2 = _mm256_load_si256((__m256i*)complexVectorPtr); +++ complexVectorPtr += 32; +++ +++ complexVal1H = _mm256_extractf128_si256(complexVal1, 1); +++ complexVal1L = _mm256_extractf128_si256(complexVal1, 0); +++ complexVal2H = _mm256_extractf128_si256(complexVal2, 1); +++ complexVal2L = _mm256_extractf128_si256(complexVal2, 0); +++ +++ complexVal1H = _mm_shuffle_epi8(complexVal1H, moveMaskH); +++ complexVal1L = _mm_shuffle_epi8(complexVal1L, moveMaskL); +++ outputVal1 = _mm_or_si128(complexVal1H, complexVal1L); +++ +++ +++ complexVal2H = _mm_shuffle_epi8(complexVal2H, moveMaskH); +++ complexVal2L = _mm_shuffle_epi8(complexVal2L, moveMaskL); +++ outputVal2 = _mm_or_si128(complexVal2H, complexVal2L); +++ +++ __m256i dummy = _mm256_setzero_si256(); +++ outputVal = _mm256_insertf128_si256(dummy, outputVal1, 0); +++ outputVal = _mm256_insertf128_si256(outputVal, outputVal2, 1); +++ +++ +++ _mm256_store_si256((__m256i*)iBufferPtr, outputVal); +++ iBufferPtr += 32; +++ } +++ +++ number = thirtysecondPoints * 32; +++ for (; number < num_points; number++) { +++ *iBufferPtr++ = *complexVectorPtr++; +++ complexVectorPtr++; +++ } ++ } ++ #endif /* LV_HAVE_AVX */ ++ ++ ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void ++-volk_8ic_deinterleave_real_8i_generic(int8_t* iBuffer, const lv_8sc_t* complexVector, ++- unsigned int num_points) +++static inline void volk_8ic_deinterleave_real_8i_generic(int8_t* iBuffer, +++ const lv_8sc_t* complexVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const int8_t* 
complexVectorPtr = (int8_t*)complexVector; ++- int8_t* iBufferPtr = iBuffer; ++- for(number = 0; number < num_points; number++){ ++- *iBufferPtr++ = *complexVectorPtr++; ++- complexVectorPtr++; ++- } +++ unsigned int number = 0; +++ const int8_t* complexVectorPtr = (int8_t*)complexVector; +++ int8_t* iBufferPtr = iBuffer; +++ for (number = 0; number < num_points; number++) { +++ *iBufferPtr++ = *complexVectorPtr++; +++ complexVectorPtr++; +++ } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++@@ -211,26 +280,27 @@ volk_8ic_deinterleave_real_8i_generic(int8_t* iBuffer, const lv_8sc_t* complexVe ++ #ifdef LV_HAVE_NEON ++ #include ++ ++-static inline void ++-volk_8ic_deinterleave_real_8i_neon(int8_t* iBuffer, const lv_8sc_t* complexVector, unsigned int num_points) +++static inline void volk_8ic_deinterleave_real_8i_neon(int8_t* iBuffer, +++ const lv_8sc_t* complexVector, +++ unsigned int num_points) ++ { ++- unsigned int number; ++- unsigned int sixteenth_points = num_points / 16; ++- ++- int8x16x2_t input_vector; ++- for(number=0; number < sixteenth_points; ++number) { ++- input_vector = vld2q_s8((int8_t*) complexVector ); ++- vst1q_s8(iBuffer, input_vector.val[0]); ++- iBuffer += 16; ++- complexVector += 16; ++- } ++- ++- const int8_t* complexVectorPtr = (int8_t*)complexVector; ++- int8_t* iBufferPtr = iBuffer; ++- for(number = sixteenth_points*16; number < num_points; number++){ ++- *iBufferPtr++ = *complexVectorPtr++; ++- complexVectorPtr++; ++- } +++ unsigned int number; +++ unsigned int sixteenth_points = num_points / 16; +++ +++ int8x16x2_t input_vector; +++ for (number = 0; number < sixteenth_points; ++number) { +++ input_vector = vld2q_s8((int8_t*)complexVector); +++ vst1q_s8(iBuffer, input_vector.val[0]); +++ iBuffer += 16; +++ complexVector += 16; +++ } +++ +++ const int8_t* complexVectorPtr = (int8_t*)complexVector; +++ int8_t* iBufferPtr = iBuffer; +++ for (number = sixteenth_points * 16; number < num_points; number++) { +++ *iBufferPtr++ = *complexVectorPtr++; +++ complexVectorPtr++; +++ } ++ } ++ #endif /* LV_HAVE_NEON */ ++ ++@@ -246,40 +316,102 @@ volk_8ic_deinterleave_real_8i_neon(int8_t* iBuffer, const lv_8sc_t* complexVecto ++ #ifdef LV_HAVE_AVX2 ++ #include ++ ++-static inline void ++-volk_8ic_deinterleave_real_8i_u_avx2(int8_t* iBuffer, const lv_8sc_t* complexVector, ++- unsigned int num_points) +++static inline void volk_8ic_deinterleave_real_8i_u_avx2(int8_t* iBuffer, +++ const lv_8sc_t* complexVector, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const int8_t* complexVectorPtr = (int8_t*)complexVector; ++- int8_t* iBufferPtr = iBuffer; ++- __m256i moveMask1 = _mm256_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0); ++- __m256i moveMask2 = _mm256_set_epi8(14, 12, 10, 8, 6, 4, 2, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80); ++- __m256i complexVal1, complexVal2, outputVal; ++- ++- unsigned int thirtysecondPoints = num_points / 32; ++- ++- for(number = 0; number < thirtysecondPoints; number++){ ++- ++- complexVal1 = _mm256_loadu_si256((__m256i*)complexVectorPtr); ++- complexVectorPtr += 32; ++- complexVal2 = _mm256_loadu_si256((__m256i*)complexVectorPtr); ++- complexVectorPtr += 32; ++- ++- complexVal1 = _mm256_shuffle_epi8(complexVal1, moveMask1); ++- complexVal2 = _mm256_shuffle_epi8(complexVal2, moveMask2); ++- outputVal = _mm256_or_si256(complexVal1, complexVal2); 
++- outputVal = _mm256_permute4x64_epi64(outputVal, 0xd8); ++- ++- _mm256_storeu_si256((__m256i*)iBufferPtr, outputVal); ++- iBufferPtr += 32; ++- } ++- ++- number = thirtysecondPoints * 32; ++- for(; number < num_points; number++){ ++- *iBufferPtr++ = *complexVectorPtr++; ++- complexVectorPtr++; ++- } +++ unsigned int number = 0; +++ const int8_t* complexVectorPtr = (int8_t*)complexVector; +++ int8_t* iBufferPtr = iBuffer; +++ __m256i moveMask1 = _mm256_set_epi8(0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 14, +++ 12, +++ 10, +++ 8, +++ 6, +++ 4, +++ 2, +++ 0, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 14, +++ 12, +++ 10, +++ 8, +++ 6, +++ 4, +++ 2, +++ 0); +++ __m256i moveMask2 = _mm256_set_epi8(14, +++ 12, +++ 10, +++ 8, +++ 6, +++ 4, +++ 2, +++ 0, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 14, +++ 12, +++ 10, +++ 8, +++ 6, +++ 4, +++ 2, +++ 0, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80); +++ __m256i complexVal1, complexVal2, outputVal; +++ +++ unsigned int thirtysecondPoints = num_points / 32; +++ +++ for (number = 0; number < thirtysecondPoints; number++) { +++ +++ complexVal1 = _mm256_loadu_si256((__m256i*)complexVectorPtr); +++ complexVectorPtr += 32; +++ complexVal2 = _mm256_loadu_si256((__m256i*)complexVectorPtr); +++ complexVectorPtr += 32; +++ +++ complexVal1 = _mm256_shuffle_epi8(complexVal1, moveMask1); +++ complexVal2 = _mm256_shuffle_epi8(complexVal2, moveMask2); +++ outputVal = _mm256_or_si256(complexVal1, complexVal2); +++ outputVal = _mm256_permute4x64_epi64(outputVal, 0xd8); +++ +++ _mm256_storeu_si256((__m256i*)iBufferPtr, outputVal); +++ iBufferPtr += 32; +++ } +++ +++ number = thirtysecondPoints * 32; +++ for (; number < num_points; number++) { +++ *iBufferPtr++ = *complexVectorPtr++; +++ complexVectorPtr++; +++ } ++ } ++ #endif /* LV_HAVE_AVX2 */ ++ ++diff --git a/kernels/volk/volk_8ic_s32f_deinterleave_32f_x2.h b/kernels/volk/volk_8ic_s32f_deinterleave_32f_x2.h ++index 736f7c0..f622752 100644 ++--- a/kernels/volk/volk_8ic_s32f_deinterleave_32f_x2.h +++++ b/kernels/volk/volk_8ic_s32f_deinterleave_32f_x2.h ++@@ -31,8 +31,8 @@ ++ * ++ * Dispatcher Prototype ++ * \code ++- * void volk_8ic_s32f_deinterleave_32f_x2(float* iBuffer, float* qBuffer, const lv_8sc_t* complexVector, const float scalar, unsigned int num_points) ++- * \endcode +++ * void volk_8ic_s32f_deinterleave_32f_x2(float* iBuffer, float* qBuffer, const lv_8sc_t* +++ * complexVector, const float scalar, unsigned int num_points) \endcode ++ * ++ * \b Inputs ++ * \li complexVector: The complex input vector. 
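Before the reformatted SIMD bodies below, it may help to restate what every implementation of volk_8ic_s32f_deinterleave_32f_x2 computes; the arithmetic is exactly that of the generic kernel that appears further down in this file. A minimal scalar sketch follows (the function name is illustrative, not part of the library):

    #include <volk/volk_complex.h> /* lv_8sc_t */

    /* Split interleaved 8-bit complex samples into float I and Q buffers,
     * scaling each component by 1/scalar. */
    static void deinterleave_32f_x2_reference(float* iBuffer,
                                              float* qBuffer,
                                              const lv_8sc_t* complexVector,
                                              const float scalar,
                                              unsigned int num_points)
    {
        const int8_t* in = (const int8_t*)complexVector;
        const float invScalar = 1.0f / scalar;
        for (unsigned int n = 0; n < num_points; n++) {
            *iBuffer++ = (float)(*in++) * invScalar; /* real part */
            *qBuffer++ = (float)(*in++) * invScalar; /* imaginary part */
        }
    }
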
++@@ -56,74 +56,79 @@ ++ #ifndef INCLUDED_volk_8ic_s32f_deinterleave_32f_x2_a_H ++ #define INCLUDED_volk_8ic_s32f_deinterleave_32f_x2_a_H ++ ++-#include ++ #include ++ #include +++#include ++ ++ ++ #ifdef LV_HAVE_SSE4_1 ++ #include ++ ++ static inline void ++-volk_8ic_s32f_deinterleave_32f_x2_a_sse4_1(float* iBuffer, float* qBuffer, const lv_8sc_t* complexVector, ++- const float scalar, unsigned int num_points) +++volk_8ic_s32f_deinterleave_32f_x2_a_sse4_1(float* iBuffer, +++ float* qBuffer, +++ const lv_8sc_t* complexVector, +++ const float scalar, +++ unsigned int num_points) ++ { ++- float* iBufferPtr = iBuffer; ++- float* qBufferPtr = qBuffer; ++- ++- unsigned int number = 0; ++- const unsigned int eighthPoints = num_points / 8; ++- __m128 iFloatValue, qFloatValue; ++- ++- const float iScalar= 1.0 / scalar; ++- __m128 invScalar = _mm_set_ps1(iScalar); ++- __m128i complexVal, iIntVal, qIntVal, iComplexVal, qComplexVal; ++- int8_t* complexVectorPtr = (int8_t*)complexVector; ++- ++- __m128i iMoveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0); ++- __m128i qMoveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 15, 13, 11, 9, 7, 5, 3, 1); ++- ++- for(;number < eighthPoints; number++){ ++- complexVal = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; ++- iComplexVal = _mm_shuffle_epi8(complexVal, iMoveMask); ++- qComplexVal = _mm_shuffle_epi8(complexVal, qMoveMask); ++- ++- iIntVal = _mm_cvtepi8_epi32(iComplexVal); ++- iFloatValue = _mm_cvtepi32_ps(iIntVal); ++- iFloatValue = _mm_mul_ps(iFloatValue, invScalar); ++- _mm_store_ps(iBufferPtr, iFloatValue); ++- iBufferPtr += 4; ++- ++- iComplexVal = _mm_srli_si128(iComplexVal, 4); ++- ++- iIntVal = _mm_cvtepi8_epi32(iComplexVal); ++- iFloatValue = _mm_cvtepi32_ps(iIntVal); ++- iFloatValue = _mm_mul_ps(iFloatValue, invScalar); ++- _mm_store_ps(iBufferPtr, iFloatValue); ++- iBufferPtr += 4; ++- ++- qIntVal = _mm_cvtepi8_epi32(qComplexVal); ++- qFloatValue = _mm_cvtepi32_ps(qIntVal); ++- qFloatValue = _mm_mul_ps(qFloatValue, invScalar); ++- _mm_store_ps(qBufferPtr, qFloatValue); ++- qBufferPtr += 4; ++- ++- qComplexVal = _mm_srli_si128(qComplexVal, 4); ++- ++- qIntVal = _mm_cvtepi8_epi32(qComplexVal); ++- qFloatValue = _mm_cvtepi32_ps(qIntVal); ++- qFloatValue = _mm_mul_ps(qFloatValue, invScalar); ++- _mm_store_ps(qBufferPtr, qFloatValue); ++- ++- qBufferPtr += 4; ++- } ++- ++- number = eighthPoints * 8; ++- for(; number < num_points; number++){ ++- *iBufferPtr++ = (float)(*complexVectorPtr++) * iScalar; ++- *qBufferPtr++ = (float)(*complexVectorPtr++) * iScalar; ++- } ++- +++ float* iBufferPtr = iBuffer; +++ float* qBufferPtr = qBuffer; +++ +++ unsigned int number = 0; +++ const unsigned int eighthPoints = num_points / 8; +++ __m128 iFloatValue, qFloatValue; +++ +++ const float iScalar = 1.0 / scalar; +++ __m128 invScalar = _mm_set_ps1(iScalar); +++ __m128i complexVal, iIntVal, qIntVal, iComplexVal, qComplexVal; +++ int8_t* complexVectorPtr = (int8_t*)complexVector; +++ +++ __m128i iMoveMask = _mm_set_epi8( +++ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0); +++ __m128i qMoveMask = _mm_set_epi8( +++ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 15, 13, 11, 9, 7, 5, 3, 1); +++ +++ for (; number < eighthPoints; number++) { +++ complexVal = _mm_load_si128((__m128i*)complexVectorPtr); +++ complexVectorPtr += 16; +++ iComplexVal = _mm_shuffle_epi8(complexVal, iMoveMask); +++ qComplexVal = _mm_shuffle_epi8(complexVal, qMoveMask); +++ +++ 
iIntVal = _mm_cvtepi8_epi32(iComplexVal); +++ iFloatValue = _mm_cvtepi32_ps(iIntVal); +++ iFloatValue = _mm_mul_ps(iFloatValue, invScalar); +++ _mm_store_ps(iBufferPtr, iFloatValue); +++ iBufferPtr += 4; +++ +++ iComplexVal = _mm_srli_si128(iComplexVal, 4); +++ +++ iIntVal = _mm_cvtepi8_epi32(iComplexVal); +++ iFloatValue = _mm_cvtepi32_ps(iIntVal); +++ iFloatValue = _mm_mul_ps(iFloatValue, invScalar); +++ _mm_store_ps(iBufferPtr, iFloatValue); +++ iBufferPtr += 4; +++ +++ qIntVal = _mm_cvtepi8_epi32(qComplexVal); +++ qFloatValue = _mm_cvtepi32_ps(qIntVal); +++ qFloatValue = _mm_mul_ps(qFloatValue, invScalar); +++ _mm_store_ps(qBufferPtr, qFloatValue); +++ qBufferPtr += 4; +++ +++ qComplexVal = _mm_srli_si128(qComplexVal, 4); +++ +++ qIntVal = _mm_cvtepi8_epi32(qComplexVal); +++ qFloatValue = _mm_cvtepi32_ps(qIntVal); +++ qFloatValue = _mm_mul_ps(qFloatValue, invScalar); +++ _mm_store_ps(qBufferPtr, qFloatValue); +++ +++ qBufferPtr += 4; +++ } +++ +++ number = eighthPoints * 8; +++ for (; number < num_points; number++) { +++ *iBufferPtr++ = (float)(*complexVectorPtr++) * iScalar; +++ *qBufferPtr++ = (float)(*complexVectorPtr++) * iScalar; +++ } ++ } ++ #endif /* LV_HAVE_SSE4_1 */ ++ ++@@ -131,59 +136,60 @@ volk_8ic_s32f_deinterleave_32f_x2_a_sse4_1(float* iBuffer, float* qBuffer, const ++ #ifdef LV_HAVE_SSE ++ #include ++ ++-static inline void ++-volk_8ic_s32f_deinterleave_32f_x2_a_sse(float* iBuffer, float* qBuffer, ++- const lv_8sc_t* complexVector, ++- const float scalar, unsigned int num_points) +++static inline void volk_8ic_s32f_deinterleave_32f_x2_a_sse(float* iBuffer, +++ float* qBuffer, +++ const lv_8sc_t* complexVector, +++ const float scalar, +++ unsigned int num_points) ++ { ++- float* iBufferPtr = iBuffer; ++- float* qBufferPtr = qBuffer; +++ float* iBufferPtr = iBuffer; +++ float* qBufferPtr = qBuffer; ++ ++- unsigned int number = 0; ++- const unsigned int quarterPoints = num_points / 4; ++- __m128 cplxValue1, cplxValue2, iValue, qValue; +++ unsigned int number = 0; +++ const unsigned int quarterPoints = num_points / 4; +++ __m128 cplxValue1, cplxValue2, iValue, qValue; ++ ++- __m128 invScalar = _mm_set_ps1(1.0/scalar); ++- int8_t* complexVectorPtr = (int8_t*)complexVector; +++ __m128 invScalar = _mm_set_ps1(1.0 / scalar); +++ int8_t* complexVectorPtr = (int8_t*)complexVector; ++ ++- __VOLK_ATTR_ALIGNED(16) float floatBuffer[8]; +++ __VOLK_ATTR_ALIGNED(16) float floatBuffer[8]; ++ ++- for(;number < quarterPoints; number++){ ++- floatBuffer[0] = (float)(complexVectorPtr[0]); ++- floatBuffer[1] = (float)(complexVectorPtr[1]); ++- floatBuffer[2] = (float)(complexVectorPtr[2]); ++- floatBuffer[3] = (float)(complexVectorPtr[3]); +++ for (; number < quarterPoints; number++) { +++ floatBuffer[0] = (float)(complexVectorPtr[0]); +++ floatBuffer[1] = (float)(complexVectorPtr[1]); +++ floatBuffer[2] = (float)(complexVectorPtr[2]); +++ floatBuffer[3] = (float)(complexVectorPtr[3]); ++ ++- floatBuffer[4] = (float)(complexVectorPtr[4]); ++- floatBuffer[5] = (float)(complexVectorPtr[5]); ++- floatBuffer[6] = (float)(complexVectorPtr[6]); ++- floatBuffer[7] = (float)(complexVectorPtr[7]); +++ floatBuffer[4] = (float)(complexVectorPtr[4]); +++ floatBuffer[5] = (float)(complexVectorPtr[5]); +++ floatBuffer[6] = (float)(complexVectorPtr[6]); +++ floatBuffer[7] = (float)(complexVectorPtr[7]); ++ ++- cplxValue1 = _mm_load_ps(&floatBuffer[0]); ++- cplxValue2 = _mm_load_ps(&floatBuffer[4]); +++ cplxValue1 = _mm_load_ps(&floatBuffer[0]); +++ cplxValue2 = _mm_load_ps(&floatBuffer[4]); ++ ++- 
complexVectorPtr += 8; +++ complexVectorPtr += 8; ++ ++- cplxValue1 = _mm_mul_ps(cplxValue1, invScalar); ++- cplxValue2 = _mm_mul_ps(cplxValue2, invScalar); +++ cplxValue1 = _mm_mul_ps(cplxValue1, invScalar); +++ cplxValue2 = _mm_mul_ps(cplxValue2, invScalar); ++ ++- // Arrange in i1i2i3i4 format ++- iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); ++- qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1)); +++ // Arrange in i1i2i3i4 format +++ iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2, 0, 2, 0)); +++ qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3, 1, 3, 1)); ++ ++- _mm_store_ps(iBufferPtr, iValue); ++- _mm_store_ps(qBufferPtr, qValue); +++ _mm_store_ps(iBufferPtr, iValue); +++ _mm_store_ps(qBufferPtr, qValue); ++ ++- iBufferPtr += 4; ++- qBufferPtr += 4; ++- } +++ iBufferPtr += 4; +++ qBufferPtr += 4; +++ } ++ ++- number = quarterPoints * 4; ++- complexVectorPtr = (int8_t*)&complexVector[number]; ++- for(; number < num_points; number++){ ++- *iBufferPtr++ = (float)(*complexVectorPtr++) / scalar; ++- *qBufferPtr++ = (float)(*complexVectorPtr++) / scalar; ++- } +++ number = quarterPoints * 4; +++ complexVectorPtr = (int8_t*)&complexVector[number]; +++ for (; number < num_points; number++) { +++ *iBufferPtr++ = (float)(*complexVectorPtr++) / scalar; +++ *qBufferPtr++ = (float)(*complexVectorPtr++) / scalar; +++ } ++ } ++ #endif /* LV_HAVE_SSE */ ++ ++@@ -191,70 +197,127 @@ volk_8ic_s32f_deinterleave_32f_x2_a_sse(float* iBuffer, float* qBuffer, ++ #ifdef LV_HAVE_AVX2 ++ #include ++ ++-static inline void ++-volk_8ic_s32f_deinterleave_32f_x2_a_avx2(float* iBuffer, float* qBuffer, const lv_8sc_t* complexVector, ++- const float scalar, unsigned int num_points) +++static inline void volk_8ic_s32f_deinterleave_32f_x2_a_avx2(float* iBuffer, +++ float* qBuffer, +++ const lv_8sc_t* complexVector, +++ const float scalar, +++ unsigned int num_points) ++ { ++- float* iBufferPtr = iBuffer; ++- float* qBufferPtr = qBuffer; ++- ++- unsigned int number = 0; ++- const unsigned int sixteenthPoints = num_points / 16; ++- __m256 iFloatValue, qFloatValue; ++- ++- const float iScalar= 1.0 / scalar; ++- __m256 invScalar = _mm256_set1_ps(iScalar); ++- __m256i complexVal, iIntVal, qIntVal, iComplexVal, qComplexVal; ++- int8_t* complexVectorPtr = (int8_t*)complexVector; ++- ++- __m256i iMoveMask = _mm256_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, ++- 14, 12, 10, 8, 6, 4, 2, 0, ++- 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, ++- 14, 12, 10, 8, 6, 4, 2, 0); ++- __m256i qMoveMask = _mm256_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, ++- 15, 13, 11, 9, 7, 5, 3, 1, ++- 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, ++- 15, 13, 11, 9, 7, 5, 3, 1); ++- ++- for(;number < sixteenthPoints; number++){ ++- complexVal = _mm256_load_si256((__m256i*)complexVectorPtr); ++- complexVectorPtr += 32; ++- iComplexVal = _mm256_shuffle_epi8(complexVal, iMoveMask); ++- qComplexVal = _mm256_shuffle_epi8(complexVal, qMoveMask); ++- ++- iIntVal = _mm256_cvtepi8_epi32(_mm256_castsi256_si128(iComplexVal)); ++- iFloatValue = _mm256_cvtepi32_ps(iIntVal); ++- iFloatValue = _mm256_mul_ps(iFloatValue, invScalar); ++- _mm256_store_ps(iBufferPtr, iFloatValue); ++- iBufferPtr += 8; ++- ++- iComplexVal = _mm256_permute4x64_epi64(iComplexVal, 0b11000110); ++- iIntVal = _mm256_cvtepi8_epi32(_mm256_castsi256_si128(iComplexVal)); ++- iFloatValue = _mm256_cvtepi32_ps(iIntVal); ++- iFloatValue = _mm256_mul_ps(iFloatValue, invScalar); ++- 
_mm256_store_ps(iBufferPtr, iFloatValue); ++- iBufferPtr += 8; ++- ++- qIntVal = _mm256_cvtepi8_epi32(_mm256_castsi256_si128(qComplexVal)); ++- qFloatValue = _mm256_cvtepi32_ps(qIntVal); ++- qFloatValue = _mm256_mul_ps(qFloatValue, invScalar); ++- _mm256_store_ps(qBufferPtr, qFloatValue); ++- qBufferPtr += 8; ++- ++- qComplexVal = _mm256_permute4x64_epi64(qComplexVal, 0b11000110); ++- qIntVal = _mm256_cvtepi8_epi32(_mm256_castsi256_si128(qComplexVal)); ++- qFloatValue = _mm256_cvtepi32_ps(qIntVal); ++- qFloatValue = _mm256_mul_ps(qFloatValue, invScalar); ++- _mm256_store_ps(qBufferPtr, qFloatValue); ++- qBufferPtr += 8; ++- } ++- ++- number = sixteenthPoints * 16; ++- for(; number < num_points; number++){ ++- *iBufferPtr++ = (float)(*complexVectorPtr++) * iScalar; ++- *qBufferPtr++ = (float)(*complexVectorPtr++) * iScalar; ++- } ++- +++ float* iBufferPtr = iBuffer; +++ float* qBufferPtr = qBuffer; +++ +++ unsigned int number = 0; +++ const unsigned int sixteenthPoints = num_points / 16; +++ __m256 iFloatValue, qFloatValue; +++ +++ const float iScalar = 1.0 / scalar; +++ __m256 invScalar = _mm256_set1_ps(iScalar); +++ __m256i complexVal, iIntVal, qIntVal, iComplexVal, qComplexVal; +++ int8_t* complexVectorPtr = (int8_t*)complexVector; +++ +++ __m256i iMoveMask = _mm256_set_epi8(0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 14, +++ 12, +++ 10, +++ 8, +++ 6, +++ 4, +++ 2, +++ 0, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 14, +++ 12, +++ 10, +++ 8, +++ 6, +++ 4, +++ 2, +++ 0); +++ __m256i qMoveMask = _mm256_set_epi8(0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 15, +++ 13, +++ 11, +++ 9, +++ 7, +++ 5, +++ 3, +++ 1, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 15, +++ 13, +++ 11, +++ 9, +++ 7, +++ 5, +++ 3, +++ 1); +++ +++ for (; number < sixteenthPoints; number++) { +++ complexVal = _mm256_load_si256((__m256i*)complexVectorPtr); +++ complexVectorPtr += 32; +++ iComplexVal = _mm256_shuffle_epi8(complexVal, iMoveMask); +++ qComplexVal = _mm256_shuffle_epi8(complexVal, qMoveMask); +++ +++ iIntVal = _mm256_cvtepi8_epi32(_mm256_castsi256_si128(iComplexVal)); +++ iFloatValue = _mm256_cvtepi32_ps(iIntVal); +++ iFloatValue = _mm256_mul_ps(iFloatValue, invScalar); +++ _mm256_store_ps(iBufferPtr, iFloatValue); +++ iBufferPtr += 8; +++ +++ iComplexVal = _mm256_permute4x64_epi64(iComplexVal, 0b11000110); +++ iIntVal = _mm256_cvtepi8_epi32(_mm256_castsi256_si128(iComplexVal)); +++ iFloatValue = _mm256_cvtepi32_ps(iIntVal); +++ iFloatValue = _mm256_mul_ps(iFloatValue, invScalar); +++ _mm256_store_ps(iBufferPtr, iFloatValue); +++ iBufferPtr += 8; +++ +++ qIntVal = _mm256_cvtepi8_epi32(_mm256_castsi256_si128(qComplexVal)); +++ qFloatValue = _mm256_cvtepi32_ps(qIntVal); +++ qFloatValue = _mm256_mul_ps(qFloatValue, invScalar); +++ _mm256_store_ps(qBufferPtr, qFloatValue); +++ qBufferPtr += 8; +++ +++ qComplexVal = _mm256_permute4x64_epi64(qComplexVal, 0b11000110); +++ qIntVal = _mm256_cvtepi8_epi32(_mm256_castsi256_si128(qComplexVal)); +++ qFloatValue = _mm256_cvtepi32_ps(qIntVal); +++ qFloatValue = _mm256_mul_ps(qFloatValue, invScalar); +++ _mm256_store_ps(qBufferPtr, qFloatValue); +++ qBufferPtr += 8; +++ } +++ +++ number = sixteenthPoints * 16; +++ for (; number < num_points; number++) { +++ *iBufferPtr++ = (float)(*complexVectorPtr++) * iScalar; +++ *qBufferPtr++ = (float)(*complexVectorPtr++) * iScalar; +++ } ++ } ++ #endif /* LV_HAVE_AVX2 */ ++ ++@@ -262,19 
+325,21 @@ volk_8ic_s32f_deinterleave_32f_x2_a_avx2(float* iBuffer, float* qBuffer, const l ++ #ifdef LV_HAVE_GENERIC ++ ++ static inline void ++-volk_8ic_s32f_deinterleave_32f_x2_generic(float* iBuffer, float* qBuffer, +++volk_8ic_s32f_deinterleave_32f_x2_generic(float* iBuffer, +++ float* qBuffer, ++ const lv_8sc_t* complexVector, ++- const float scalar, unsigned int num_points) +++ const float scalar, +++ unsigned int num_points) ++ { ++- const int8_t* complexVectorPtr = (const int8_t*)complexVector; ++- float* iBufferPtr = iBuffer; ++- float* qBufferPtr = qBuffer; ++- unsigned int number; ++- const float invScalar = 1.0 / scalar; ++- for(number = 0; number < num_points; number++){ ++- *iBufferPtr++ = (float)(*complexVectorPtr++)*invScalar; ++- *qBufferPtr++ = (float)(*complexVectorPtr++)*invScalar; ++- } +++ const int8_t* complexVectorPtr = (const int8_t*)complexVector; +++ float* iBufferPtr = iBuffer; +++ float* qBufferPtr = qBuffer; +++ unsigned int number; +++ const float invScalar = 1.0 / scalar; +++ for (number = 0; number < num_points; number++) { +++ *iBufferPtr++ = (float)(*complexVectorPtr++) * invScalar; +++ *qBufferPtr++ = (float)(*complexVectorPtr++) * invScalar; +++ } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++@@ -285,75 +350,107 @@ volk_8ic_s32f_deinterleave_32f_x2_generic(float* iBuffer, float* qBuffer, ++ #ifndef INCLUDED_volk_8ic_s32f_deinterleave_32f_x2_u_H ++ #define INCLUDED_volk_8ic_s32f_deinterleave_32f_x2_u_H ++ ++-#include ++ #include ++ #include +++#include ++ ++ #ifdef LV_HAVE_AVX2 ++ #include ++ ++-static inline void ++-volk_8ic_s32f_deinterleave_32f_x2_u_avx2(float* iBuffer, float* qBuffer, const lv_8sc_t* complexVector, ++- const float scalar, unsigned int num_points) +++static inline void volk_8ic_s32f_deinterleave_32f_x2_u_avx2(float* iBuffer, +++ float* qBuffer, +++ const lv_8sc_t* complexVector, +++ const float scalar, +++ unsigned int num_points) ++ { ++- float* iBufferPtr = iBuffer; ++- float* qBufferPtr = qBuffer; ++- ++- unsigned int number = 0; ++- const unsigned int sixteenthPoints = num_points / 16; ++- __m256 iFloatValue, qFloatValue; ++- ++- const float iScalar= 1.0 / scalar; ++- __m256 invScalar = _mm256_set1_ps(iScalar); ++- __m256i complexVal, iIntVal, qIntVal; ++- __m128i iComplexVal, qComplexVal; ++- int8_t* complexVectorPtr = (int8_t*)complexVector; ++- ++- __m256i MoveMask = _mm256_set_epi8(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, ++- 6, 4, 2, 0,15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0); ++- ++- for(;number < sixteenthPoints; number++){ ++- complexVal = _mm256_loadu_si256((__m256i*)complexVectorPtr); complexVectorPtr += 32; ++- complexVal = _mm256_shuffle_epi8(complexVal, MoveMask); ++- complexVal = _mm256_permute4x64_epi64(complexVal,0xd8); ++- iComplexVal = _mm256_extractf128_si256(complexVal,0); ++- qComplexVal = _mm256_extractf128_si256(complexVal,1); ++- ++- iIntVal = _mm256_cvtepi8_epi32(iComplexVal); ++- iFloatValue = _mm256_cvtepi32_ps(iIntVal); ++- iFloatValue = _mm256_mul_ps(iFloatValue, invScalar); ++- _mm256_storeu_ps(iBufferPtr, iFloatValue); ++- iBufferPtr += 8; ++- ++- qIntVal = _mm256_cvtepi8_epi32(qComplexVal); ++- qFloatValue = _mm256_cvtepi32_ps(qIntVal); ++- qFloatValue = _mm256_mul_ps(qFloatValue, invScalar); ++- _mm256_storeu_ps(qBufferPtr, qFloatValue); ++- qBufferPtr += 8; ++- ++- complexVal = _mm256_srli_si256(complexVal, 8); ++- iComplexVal = _mm256_extractf128_si256(complexVal,0); ++- qComplexVal = _mm256_extractf128_si256(complexVal,1); ++- ++- iIntVal = _mm256_cvtepi8_epi32(iComplexVal); ++- 
iFloatValue = _mm256_cvtepi32_ps(iIntVal); ++- iFloatValue = _mm256_mul_ps(iFloatValue, invScalar); ++- _mm256_storeu_ps(iBufferPtr, iFloatValue); ++- iBufferPtr += 8; ++- ++- qIntVal = _mm256_cvtepi8_epi32(qComplexVal); ++- qFloatValue = _mm256_cvtepi32_ps(qIntVal); ++- qFloatValue = _mm256_mul_ps(qFloatValue, invScalar); ++- _mm256_storeu_ps(qBufferPtr, qFloatValue); ++- qBufferPtr += 8; ++- } ++- ++- number = sixteenthPoints * 16; ++- for(; number < num_points; number++){ ++- *iBufferPtr++ = (float)(*complexVectorPtr++) * iScalar; ++- *qBufferPtr++ = (float)(*complexVectorPtr++) * iScalar; ++- } ++- +++ float* iBufferPtr = iBuffer; +++ float* qBufferPtr = qBuffer; +++ +++ unsigned int number = 0; +++ const unsigned int sixteenthPoints = num_points / 16; +++ __m256 iFloatValue, qFloatValue; +++ +++ const float iScalar = 1.0 / scalar; +++ __m256 invScalar = _mm256_set1_ps(iScalar); +++ __m256i complexVal, iIntVal, qIntVal; +++ __m128i iComplexVal, qComplexVal; +++ int8_t* complexVectorPtr = (int8_t*)complexVector; +++ +++ __m256i MoveMask = _mm256_set_epi8(15, +++ 13, +++ 11, +++ 9, +++ 7, +++ 5, +++ 3, +++ 1, +++ 14, +++ 12, +++ 10, +++ 8, +++ 6, +++ 4, +++ 2, +++ 0, +++ 15, +++ 13, +++ 11, +++ 9, +++ 7, +++ 5, +++ 3, +++ 1, +++ 14, +++ 12, +++ 10, +++ 8, +++ 6, +++ 4, +++ 2, +++ 0); +++ +++ for (; number < sixteenthPoints; number++) { +++ complexVal = _mm256_loadu_si256((__m256i*)complexVectorPtr); +++ complexVectorPtr += 32; +++ complexVal = _mm256_shuffle_epi8(complexVal, MoveMask); +++ complexVal = _mm256_permute4x64_epi64(complexVal, 0xd8); +++ iComplexVal = _mm256_extractf128_si256(complexVal, 0); +++ qComplexVal = _mm256_extractf128_si256(complexVal, 1); +++ +++ iIntVal = _mm256_cvtepi8_epi32(iComplexVal); +++ iFloatValue = _mm256_cvtepi32_ps(iIntVal); +++ iFloatValue = _mm256_mul_ps(iFloatValue, invScalar); +++ _mm256_storeu_ps(iBufferPtr, iFloatValue); +++ iBufferPtr += 8; +++ +++ qIntVal = _mm256_cvtepi8_epi32(qComplexVal); +++ qFloatValue = _mm256_cvtepi32_ps(qIntVal); +++ qFloatValue = _mm256_mul_ps(qFloatValue, invScalar); +++ _mm256_storeu_ps(qBufferPtr, qFloatValue); +++ qBufferPtr += 8; +++ +++ complexVal = _mm256_srli_si256(complexVal, 8); +++ iComplexVal = _mm256_extractf128_si256(complexVal, 0); +++ qComplexVal = _mm256_extractf128_si256(complexVal, 1); +++ +++ iIntVal = _mm256_cvtepi8_epi32(iComplexVal); +++ iFloatValue = _mm256_cvtepi32_ps(iIntVal); +++ iFloatValue = _mm256_mul_ps(iFloatValue, invScalar); +++ _mm256_storeu_ps(iBufferPtr, iFloatValue); +++ iBufferPtr += 8; +++ +++ qIntVal = _mm256_cvtepi8_epi32(qComplexVal); +++ qFloatValue = _mm256_cvtepi32_ps(qIntVal); +++ qFloatValue = _mm256_mul_ps(qFloatValue, invScalar); +++ _mm256_storeu_ps(qBufferPtr, qFloatValue); +++ qBufferPtr += 8; +++ } +++ +++ number = sixteenthPoints * 16; +++ for (; number < num_points; number++) { +++ *iBufferPtr++ = (float)(*complexVectorPtr++) * iScalar; +++ *qBufferPtr++ = (float)(*complexVectorPtr++) * iScalar; +++ } ++ } ++ #endif /* LV_HAVE_AVX2 */ ++ ++diff --git a/kernels/volk/volk_8ic_s32f_deinterleave_real_32f.h b/kernels/volk/volk_8ic_s32f_deinterleave_real_32f.h ++index 0c85ee9..4c1afe7 100644 ++--- a/kernels/volk/volk_8ic_s32f_deinterleave_real_32f.h +++++ b/kernels/volk/volk_8ic_s32f_deinterleave_real_32f.h ++@@ -31,8 +31,8 @@ ++ * ++ * Dispatcher Prototype ++ * \code ++- * void volk_8ic_s32f_deinterleave_real_32f(float* iBuffer, const lv_8sc_t* complexVector, const float scalar, unsigned int num_points) ++- * \endcode +++ * void 
volk_8ic_s32f_deinterleave_real_32f(float* iBuffer, const lv_8sc_t* complexVector, +++ * const float scalar, unsigned int num_points) \endcode ++ * ++ * \b Inputs ++ * \li complexVector: The complex input vector. ++@@ -55,57 +55,86 @@ ++ #ifndef INCLUDED_volk_8ic_s32f_deinterleave_real_32f_a_H ++ #define INCLUDED_volk_8ic_s32f_deinterleave_real_32f_a_H ++ ++-#include ++ #include ++ #include +++#include ++ ++ #ifdef LV_HAVE_AVX2 ++ #include ++ ++ static inline void ++-volk_8ic_s32f_deinterleave_real_32f_a_avx2(float* iBuffer, const lv_8sc_t* complexVector, ++- const float scalar, unsigned int num_points) +++volk_8ic_s32f_deinterleave_real_32f_a_avx2(float* iBuffer, +++ const lv_8sc_t* complexVector, +++ const float scalar, +++ unsigned int num_points) ++ { ++- float* iBufferPtr = iBuffer; ++- ++- unsigned int number = 0; ++- const unsigned int sixteenthPoints = num_points / 16; ++- __m256 iFloatValue; ++- ++- const float iScalar= 1.0 / scalar; ++- __m256 invScalar = _mm256_set1_ps(iScalar); ++- __m256i complexVal, iIntVal; ++- int8_t* complexVectorPtr = (int8_t*)complexVector; ++- ++- __m256i moveMask = _mm256_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, ++- 14, 12, 10, 8, 6, 4, 2, 0, ++- 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, ++- 14, 12, 10, 8, 6, 4, 2, 0); ++- for(;number < sixteenthPoints; number++){ ++- complexVal = _mm256_load_si256((__m256i*)complexVectorPtr); ++- complexVectorPtr += 32; ++- complexVal = _mm256_shuffle_epi8(complexVal, moveMask); ++- ++- iIntVal = _mm256_cvtepi8_epi32(_mm256_castsi256_si128(complexVal)); ++- iFloatValue = _mm256_cvtepi32_ps(iIntVal); ++- iFloatValue = _mm256_mul_ps(iFloatValue, invScalar); ++- _mm256_store_ps(iBufferPtr, iFloatValue); ++- iBufferPtr += 8; ++- ++- complexVal = _mm256_permute4x64_epi64(complexVal, 0b11000110); ++- iIntVal = _mm256_cvtepi8_epi32(_mm256_castsi256_si128(complexVal)); ++- iFloatValue = _mm256_cvtepi32_ps(iIntVal); ++- iFloatValue = _mm256_mul_ps(iFloatValue, invScalar); ++- _mm256_store_ps(iBufferPtr, iFloatValue); ++- iBufferPtr += 8; ++- } ++- ++- number = sixteenthPoints * 16; ++- for(; number < num_points; number++){ ++- *iBufferPtr++ = (float)(*complexVectorPtr++) * iScalar; ++- complexVectorPtr++; ++- } ++- +++ float* iBufferPtr = iBuffer; +++ +++ unsigned int number = 0; +++ const unsigned int sixteenthPoints = num_points / 16; +++ __m256 iFloatValue; +++ +++ const float iScalar = 1.0 / scalar; +++ __m256 invScalar = _mm256_set1_ps(iScalar); +++ __m256i complexVal, iIntVal; +++ int8_t* complexVectorPtr = (int8_t*)complexVector; +++ +++ __m256i moveMask = _mm256_set_epi8(0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 14, +++ 12, +++ 10, +++ 8, +++ 6, +++ 4, +++ 2, +++ 0, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 14, +++ 12, +++ 10, +++ 8, +++ 6, +++ 4, +++ 2, +++ 0); +++ for (; number < sixteenthPoints; number++) { +++ complexVal = _mm256_load_si256((__m256i*)complexVectorPtr); +++ complexVectorPtr += 32; +++ complexVal = _mm256_shuffle_epi8(complexVal, moveMask); +++ +++ iIntVal = _mm256_cvtepi8_epi32(_mm256_castsi256_si128(complexVal)); +++ iFloatValue = _mm256_cvtepi32_ps(iIntVal); +++ iFloatValue = _mm256_mul_ps(iFloatValue, invScalar); +++ _mm256_store_ps(iBufferPtr, iFloatValue); +++ iBufferPtr += 8; +++ +++ complexVal = _mm256_permute4x64_epi64(complexVal, 0b11000110); +++ iIntVal = _mm256_cvtepi8_epi32(_mm256_castsi256_si128(complexVal)); +++ iFloatValue = _mm256_cvtepi32_ps(iIntVal); +++ iFloatValue = 
_mm256_mul_ps(iFloatValue, invScalar); +++ _mm256_store_ps(iBufferPtr, iFloatValue); +++ iBufferPtr += 8; +++ } +++ +++ number = sixteenthPoints * 16; +++ for (; number < num_points; number++) { +++ *iBufferPtr++ = (float)(*complexVectorPtr++) * iScalar; +++ complexVectorPtr++; +++ } ++ } ++ #endif /* LV_HAVE_AVX2 */ ++ ++@@ -114,52 +143,55 @@ volk_8ic_s32f_deinterleave_real_32f_a_avx2(float* iBuffer, const lv_8sc_t* compl ++ #include ++ ++ static inline void ++-volk_8ic_s32f_deinterleave_real_32f_a_sse4_1(float* iBuffer, const lv_8sc_t* complexVector, ++- const float scalar, unsigned int num_points) +++volk_8ic_s32f_deinterleave_real_32f_a_sse4_1(float* iBuffer, +++ const lv_8sc_t* complexVector, +++ const float scalar, +++ unsigned int num_points) ++ { ++- float* iBufferPtr = iBuffer; ++- ++- unsigned int number = 0; ++- const unsigned int eighthPoints = num_points / 8; ++- __m128 iFloatValue; +++ float* iBufferPtr = iBuffer; ++ ++- const float iScalar= 1.0 / scalar; ++- __m128 invScalar = _mm_set_ps1(iScalar); ++- __m128i complexVal, iIntVal; ++- int8_t* complexVectorPtr = (int8_t*)complexVector; +++ unsigned int number = 0; +++ const unsigned int eighthPoints = num_points / 8; +++ __m128 iFloatValue; ++ ++- __m128i moveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0); +++ const float iScalar = 1.0 / scalar; +++ __m128 invScalar = _mm_set_ps1(iScalar); +++ __m128i complexVal, iIntVal; +++ int8_t* complexVectorPtr = (int8_t*)complexVector; ++ ++- for(;number < eighthPoints; number++){ ++- complexVal = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; ++- complexVal = _mm_shuffle_epi8(complexVal, moveMask); +++ __m128i moveMask = _mm_set_epi8( +++ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0); ++ ++- iIntVal = _mm_cvtepi8_epi32(complexVal); ++- iFloatValue = _mm_cvtepi32_ps(iIntVal); +++ for (; number < eighthPoints; number++) { +++ complexVal = _mm_load_si128((__m128i*)complexVectorPtr); +++ complexVectorPtr += 16; +++ complexVal = _mm_shuffle_epi8(complexVal, moveMask); ++ ++- iFloatValue = _mm_mul_ps(iFloatValue, invScalar); +++ iIntVal = _mm_cvtepi8_epi32(complexVal); +++ iFloatValue = _mm_cvtepi32_ps(iIntVal); ++ ++- _mm_store_ps(iBufferPtr, iFloatValue); +++ iFloatValue = _mm_mul_ps(iFloatValue, invScalar); ++ ++- iBufferPtr += 4; +++ _mm_store_ps(iBufferPtr, iFloatValue); ++ ++- complexVal = _mm_srli_si128(complexVal, 4); ++- iIntVal = _mm_cvtepi8_epi32(complexVal); ++- iFloatValue = _mm_cvtepi32_ps(iIntVal); +++ iBufferPtr += 4; ++ ++- iFloatValue = _mm_mul_ps(iFloatValue, invScalar); +++ complexVal = _mm_srli_si128(complexVal, 4); +++ iIntVal = _mm_cvtepi8_epi32(complexVal); +++ iFloatValue = _mm_cvtepi32_ps(iIntVal); ++ ++- _mm_store_ps(iBufferPtr, iFloatValue); +++ iFloatValue = _mm_mul_ps(iFloatValue, invScalar); ++ ++- iBufferPtr += 4; ++- } +++ _mm_store_ps(iBufferPtr, iFloatValue); ++ ++- number = eighthPoints * 8; ++- for(; number < num_points; number++){ ++- *iBufferPtr++ = (float)(*complexVectorPtr++) * iScalar; ++- complexVectorPtr++; ++- } +++ iBufferPtr += 4; +++ } ++ +++ number = eighthPoints * 8; +++ for (; number < num_points; number++) { +++ *iBufferPtr++ = (float)(*complexVectorPtr++) * iScalar; +++ complexVectorPtr++; +++ } ++ } ++ #endif /* LV_HAVE_SSE4_1 */ ++ ++@@ -168,42 +200,47 @@ volk_8ic_s32f_deinterleave_real_32f_a_sse4_1(float* iBuffer, const lv_8sc_t* com ++ #include ++ ++ static inline void ++-volk_8ic_s32f_deinterleave_real_32f_a_sse(float* iBuffer, const 
lv_8sc_t* complexVector, ++- const float scalar, unsigned int num_points) +++volk_8ic_s32f_deinterleave_real_32f_a_sse(float* iBuffer, +++ const lv_8sc_t* complexVector, +++ const float scalar, +++ unsigned int num_points) ++ { ++- float* iBufferPtr = iBuffer; ++- ++- unsigned int number = 0; ++- const unsigned int quarterPoints = num_points / 4; ++- __m128 iValue; +++ float* iBufferPtr = iBuffer; ++ ++- const float iScalar= 1.0 / scalar; ++- __m128 invScalar = _mm_set_ps1(iScalar); ++- int8_t* complexVectorPtr = (int8_t*)complexVector; +++ unsigned int number = 0; +++ const unsigned int quarterPoints = num_points / 4; +++ __m128 iValue; ++ ++- __VOLK_ATTR_ALIGNED(16) float floatBuffer[4]; +++ const float iScalar = 1.0 / scalar; +++ __m128 invScalar = _mm_set_ps1(iScalar); +++ int8_t* complexVectorPtr = (int8_t*)complexVector; ++ ++- for(;number < quarterPoints; number++){ ++- floatBuffer[0] = (float)(*complexVectorPtr); complexVectorPtr += 2; ++- floatBuffer[1] = (float)(*complexVectorPtr); complexVectorPtr += 2; ++- floatBuffer[2] = (float)(*complexVectorPtr); complexVectorPtr += 2; ++- floatBuffer[3] = (float)(*complexVectorPtr); complexVectorPtr += 2; +++ __VOLK_ATTR_ALIGNED(16) float floatBuffer[4]; ++ ++- iValue = _mm_load_ps(floatBuffer); +++ for (; number < quarterPoints; number++) { +++ floatBuffer[0] = (float)(*complexVectorPtr); +++ complexVectorPtr += 2; +++ floatBuffer[1] = (float)(*complexVectorPtr); +++ complexVectorPtr += 2; +++ floatBuffer[2] = (float)(*complexVectorPtr); +++ complexVectorPtr += 2; +++ floatBuffer[3] = (float)(*complexVectorPtr); +++ complexVectorPtr += 2; ++ ++- iValue = _mm_mul_ps(iValue, invScalar); +++ iValue = _mm_load_ps(floatBuffer); ++ ++- _mm_store_ps(iBufferPtr, iValue); +++ iValue = _mm_mul_ps(iValue, invScalar); ++ ++- iBufferPtr += 4; ++- } +++ _mm_store_ps(iBufferPtr, iValue); ++ ++- number = quarterPoints * 4; ++- for(; number < num_points; number++){ ++- *iBufferPtr++ = (float)(*complexVectorPtr++) * iScalar; ++- complexVectorPtr++; ++- } +++ iBufferPtr += 4; +++ } ++ +++ number = quarterPoints * 4; +++ for (; number < num_points; number++) { +++ *iBufferPtr++ = (float)(*complexVectorPtr++) * iScalar; +++ complexVectorPtr++; +++ } ++ } ++ #endif /* LV_HAVE_SSE */ ++ ++@@ -211,83 +248,117 @@ volk_8ic_s32f_deinterleave_real_32f_a_sse(float* iBuffer, const lv_8sc_t* comple ++ #ifdef LV_HAVE_GENERIC ++ ++ static inline void ++-volk_8ic_s32f_deinterleave_real_32f_generic(float* iBuffer, const lv_8sc_t* complexVector, ++- const float scalar, unsigned int num_points) +++volk_8ic_s32f_deinterleave_real_32f_generic(float* iBuffer, +++ const lv_8sc_t* complexVector, +++ const float scalar, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const int8_t* complexVectorPtr = (const int8_t*)complexVector; ++- float* iBufferPtr = iBuffer; ++- const float invScalar = 1.0 / scalar; ++- for(number = 0; number < num_points; number++){ ++- *iBufferPtr++ = ((float)(*complexVectorPtr++)) * invScalar; ++- complexVectorPtr++; ++- } +++ unsigned int number = 0; +++ const int8_t* complexVectorPtr = (const int8_t*)complexVector; +++ float* iBufferPtr = iBuffer; +++ const float invScalar = 1.0 / scalar; +++ for (number = 0; number < num_points; number++) { +++ *iBufferPtr++ = ((float)(*complexVectorPtr++)) * invScalar; +++ complexVectorPtr++; +++ } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++ ++- ++ #endif /* INCLUDED_volk_8ic_s32f_deinterleave_real_32f_a_H */ ++ ++ #ifndef INCLUDED_volk_8ic_s32f_deinterleave_real_32f_u_H ++ #define 
INCLUDED_volk_8ic_s32f_deinterleave_real_32f_u_H ++ ++-#include ++ #include ++ #include +++#include ++ ++ #ifdef LV_HAVE_AVX2 ++ #include ++ ++ static inline void ++-volk_8ic_s32f_deinterleave_real_32f_u_avx2(float* iBuffer, const lv_8sc_t* complexVector, ++- const float scalar, unsigned int num_points) +++volk_8ic_s32f_deinterleave_real_32f_u_avx2(float* iBuffer, +++ const lv_8sc_t* complexVector, +++ const float scalar, +++ unsigned int num_points) ++ { ++- float* iBufferPtr = iBuffer; ++- ++- unsigned int number = 0; ++- const unsigned int sixteenthPoints = num_points / 16; ++- __m256 iFloatValue; ++- ++- const float iScalar= 1.0 / scalar; ++- __m256 invScalar = _mm256_set1_ps(iScalar); ++- __m256i complexVal, iIntVal; ++- __m128i hcomplexVal; ++- int8_t* complexVectorPtr = (int8_t*)complexVector; ++- ++- __m256i moveMask = _mm256_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0); ++- ++- for(;number < sixteenthPoints; number++){ ++- complexVal = _mm256_loadu_si256((__m256i*)complexVectorPtr); complexVectorPtr += 32; ++- complexVal = _mm256_shuffle_epi8(complexVal, moveMask); ++- ++- hcomplexVal = _mm256_extracti128_si256(complexVal,0); ++- iIntVal = _mm256_cvtepi8_epi32(hcomplexVal); ++- iFloatValue = _mm256_cvtepi32_ps(iIntVal); ++- ++- iFloatValue = _mm256_mul_ps(iFloatValue, invScalar); ++- ++- _mm256_storeu_ps(iBufferPtr, iFloatValue); ++- ++- iBufferPtr += 8; ++- ++- hcomplexVal = _mm256_extracti128_si256(complexVal,1); ++- iIntVal = _mm256_cvtepi8_epi32(hcomplexVal); ++- iFloatValue = _mm256_cvtepi32_ps(iIntVal); ++- ++- iFloatValue = _mm256_mul_ps(iFloatValue, invScalar); ++- ++- _mm256_storeu_ps(iBufferPtr, iFloatValue); ++- ++- iBufferPtr += 8; ++- } ++- ++- number = sixteenthPoints * 16; ++- for(; number < num_points; number++){ ++- *iBufferPtr++ = (float)(*complexVectorPtr++) * iScalar; ++- complexVectorPtr++; ++- } ++- +++ float* iBufferPtr = iBuffer; +++ +++ unsigned int number = 0; +++ const unsigned int sixteenthPoints = num_points / 16; +++ __m256 iFloatValue; +++ +++ const float iScalar = 1.0 / scalar; +++ __m256 invScalar = _mm256_set1_ps(iScalar); +++ __m256i complexVal, iIntVal; +++ __m128i hcomplexVal; +++ int8_t* complexVectorPtr = (int8_t*)complexVector; +++ +++ __m256i moveMask = _mm256_set_epi8(0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 14, +++ 12, +++ 10, +++ 8, +++ 6, +++ 4, +++ 2, +++ 0, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 0x80, +++ 14, +++ 12, +++ 10, +++ 8, +++ 6, +++ 4, +++ 2, +++ 0); +++ +++ for (; number < sixteenthPoints; number++) { +++ complexVal = _mm256_loadu_si256((__m256i*)complexVectorPtr); +++ complexVectorPtr += 32; +++ complexVal = _mm256_shuffle_epi8(complexVal, moveMask); +++ +++ hcomplexVal = _mm256_extracti128_si256(complexVal, 0); +++ iIntVal = _mm256_cvtepi8_epi32(hcomplexVal); +++ iFloatValue = _mm256_cvtepi32_ps(iIntVal); +++ +++ iFloatValue = _mm256_mul_ps(iFloatValue, invScalar); +++ +++ _mm256_storeu_ps(iBufferPtr, iFloatValue); +++ +++ iBufferPtr += 8; +++ +++ hcomplexVal = _mm256_extracti128_si256(complexVal, 1); +++ iIntVal = _mm256_cvtepi8_epi32(hcomplexVal); +++ iFloatValue = _mm256_cvtepi32_ps(iIntVal); +++ +++ iFloatValue = _mm256_mul_ps(iFloatValue, invScalar); +++ +++ _mm256_storeu_ps(iBufferPtr, iFloatValue); +++ +++ iBufferPtr += 8; +++ } +++ +++ number = sixteenthPoints * 16; +++ for (; number < num_points; number++) { +++ *iBufferPtr++ = 
(float)(*complexVectorPtr++) * iScalar; +++ complexVectorPtr++; +++ } ++ } ++ #endif /* LV_HAVE_AVX2 */ ++ ++diff --git a/kernels/volk/volk_8ic_x2_multiply_conjugate_16ic.h b/kernels/volk/volk_8ic_x2_multiply_conjugate_16ic.h ++index 6762658..7f9fd96 100644 ++--- a/kernels/volk/volk_8ic_x2_multiply_conjugate_16ic.h +++++ b/kernels/volk/volk_8ic_x2_multiply_conjugate_16ic.h ++@@ -30,64 +30,73 @@ ++ #ifdef LV_HAVE_AVX2 ++ #include ++ /*! ++- \brief Multiplys the one complex vector with the complex conjugate of the second complex vector and stores their results in the third vector ++- \param cVector The complex vector where the results will be stored ++- \param aVector One of the complex vectors to be multiplied ++- \param bVector The complex vector which will be converted to complex conjugate and multiplied ++- \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector +++ \brief Multiplys the one complex vector with the complex conjugate of the second complex +++ vector and stores their results in the third vector \param cVector The complex vector +++ where the results will be stored \param aVector One of the complex vectors to be +++ multiplied \param bVector The complex vector which will be converted to complex +++ conjugate and multiplied \param num_points The number of complex values in aVector and +++ bVector to be multiplied together and stored into cVector ++ */ ++-static inline void volk_8ic_x2_multiply_conjugate_16ic_a_avx2(lv_16sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, unsigned int num_points){ ++- unsigned int number = 0; ++- const unsigned int quarterPoints = num_points / 8; ++- ++- __m256i x, y, realz, imagz; ++- lv_16sc_t* c = cVector; ++- const lv_8sc_t* a = aVector; ++- const lv_8sc_t* b = bVector; ++- __m256i conjugateSign = _mm256_set_epi16(-1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1); ++- ++- for(;number < quarterPoints; number++){ ++- // Convert 8 bit values into 16 bit values ++- x = _mm256_cvtepi8_epi16(_mm_load_si128((__m128i*)a)); ++- y = _mm256_cvtepi8_epi16(_mm_load_si128((__m128i*)b)); ++- ++- // Calculate the ar*cr - ai*(-ci) portions ++- realz = _mm256_madd_epi16(x,y); ++- ++- // Calculate the complex conjugate of the cr + ci j values ++- y = _mm256_sign_epi16(y, conjugateSign); ++- ++- // Shift the order of the cr and ci values ++- y = _mm256_shufflehi_epi16(_mm256_shufflelo_epi16(y, _MM_SHUFFLE(2,3,0,1) ), _MM_SHUFFLE(2,3,0,1)); ++- ++- // Calculate the ar*(-ci) + cr*(ai) ++- imagz = _mm256_madd_epi16(x,y); ++- ++- // Perform the addition of products ++- ++- _mm256_store_si256((__m256i*)c, _mm256_packs_epi32(_mm256_unpacklo_epi32(realz, imagz), _mm256_unpackhi_epi32(realz, imagz))); ++- ++- a += 8; ++- b += 8; ++- c += 8; ++- } ++- ++- number = quarterPoints * 8; ++- int16_t* c16Ptr = (int16_t*)&cVector[number]; ++- int8_t* a8Ptr = (int8_t*)&aVector[number]; ++- int8_t* b8Ptr = (int8_t*)&bVector[number]; ++- for(; number < num_points; number++){ ++- float aReal = (float)*a8Ptr++; ++- float aImag = (float)*a8Ptr++; ++- lv_32fc_t aVal = lv_cmake(aReal, aImag ); ++- float bReal = (float)*b8Ptr++; ++- float bImag = (float)*b8Ptr++; ++- lv_32fc_t bVal = lv_cmake( bReal, -bImag ); ++- lv_32fc_t temp = aVal * bVal; ++- ++- *c16Ptr++ = (int16_t)lv_creal(temp); ++- *c16Ptr++ = (int16_t)lv_cimag(temp); ++- } +++static inline void volk_8ic_x2_multiply_conjugate_16ic_a_avx2(lv_16sc_t* cVector, +++ const lv_8sc_t* aVector, +++ const lv_8sc_t* bVector, +++ unsigned int num_points) 
+++{ +++ unsigned int number = 0; +++ const unsigned int quarterPoints = num_points / 8; +++ +++ __m256i x, y, realz, imagz; +++ lv_16sc_t* c = cVector; +++ const lv_8sc_t* a = aVector; +++ const lv_8sc_t* b = bVector; +++ __m256i conjugateSign = +++ _mm256_set_epi16(-1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1); +++ +++ for (; number < quarterPoints; number++) { +++ // Convert 8 bit values into 16 bit values +++ x = _mm256_cvtepi8_epi16(_mm_load_si128((__m128i*)a)); +++ y = _mm256_cvtepi8_epi16(_mm_load_si128((__m128i*)b)); +++ +++ // Calculate the ar*cr - ai*(-ci) portions +++ realz = _mm256_madd_epi16(x, y); +++ +++ // Calculate the complex conjugate of the cr + ci j values +++ y = _mm256_sign_epi16(y, conjugateSign); +++ +++ // Shift the order of the cr and ci values +++ y = _mm256_shufflehi_epi16(_mm256_shufflelo_epi16(y, _MM_SHUFFLE(2, 3, 0, 1)), +++ _MM_SHUFFLE(2, 3, 0, 1)); +++ +++ // Calculate the ar*(-ci) + cr*(ai) +++ imagz = _mm256_madd_epi16(x, y); +++ +++ // Perform the addition of products +++ +++ _mm256_store_si256((__m256i*)c, +++ _mm256_packs_epi32(_mm256_unpacklo_epi32(realz, imagz), +++ _mm256_unpackhi_epi32(realz, imagz))); +++ +++ a += 8; +++ b += 8; +++ c += 8; +++ } +++ +++ number = quarterPoints * 8; +++ int16_t* c16Ptr = (int16_t*)&cVector[number]; +++ int8_t* a8Ptr = (int8_t*)&aVector[number]; +++ int8_t* b8Ptr = (int8_t*)&bVector[number]; +++ for (; number < num_points; number++) { +++ float aReal = (float)*a8Ptr++; +++ float aImag = (float)*a8Ptr++; +++ lv_32fc_t aVal = lv_cmake(aReal, aImag); +++ float bReal = (float)*b8Ptr++; +++ float bImag = (float)*b8Ptr++; +++ lv_32fc_t bVal = lv_cmake(bReal, -bImag); +++ lv_32fc_t temp = aVal * bVal; +++ +++ *c16Ptr++ = (int16_t)lv_creal(temp); +++ *c16Ptr++ = (int16_t)lv_cimag(temp); +++ } ++ } ++ #endif /* LV_HAVE_AVX2 */ ++ ++@@ -95,90 +104,103 @@ static inline void volk_8ic_x2_multiply_conjugate_16ic_a_avx2(lv_16sc_t* cVector ++ #ifdef LV_HAVE_SSE4_1 ++ #include ++ /*! 
++- \brief Multiplys the one complex vector with the complex conjugate of the second complex vector and stores their results in the third vector ++- \param cVector The complex vector where the results will be stored ++- \param aVector One of the complex vectors to be multiplied ++- \param bVector The complex vector which will be converted to complex conjugate and multiplied ++- \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector +++ \brief Multiplys the one complex vector with the complex conjugate of the second complex +++ vector and stores their results in the third vector \param cVector The complex vector +++ where the results will be stored \param aVector One of the complex vectors to be +++ multiplied \param bVector The complex vector which will be converted to complex +++ conjugate and multiplied \param num_points The number of complex values in aVector and +++ bVector to be multiplied together and stored into cVector ++ */ ++-static inline void volk_8ic_x2_multiply_conjugate_16ic_a_sse4_1(lv_16sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, unsigned int num_points){ ++- unsigned int number = 0; ++- const unsigned int quarterPoints = num_points / 4; ++- ++- __m128i x, y, realz, imagz; ++- lv_16sc_t* c = cVector; ++- const lv_8sc_t* a = aVector; ++- const lv_8sc_t* b = bVector; ++- __m128i conjugateSign = _mm_set_epi16(-1, 1, -1, 1, -1, 1, -1, 1); ++- ++- for(;number < quarterPoints; number++){ ++- // Convert into 8 bit values into 16 bit values ++- x = _mm_cvtepi8_epi16(_mm_loadl_epi64((__m128i*)a)); ++- y = _mm_cvtepi8_epi16(_mm_loadl_epi64((__m128i*)b)); ++- ++- // Calculate the ar*cr - ai*(-ci) portions ++- realz = _mm_madd_epi16(x,y); ++- ++- // Calculate the complex conjugate of the cr + ci j values ++- y = _mm_sign_epi16(y, conjugateSign); ++- ++- // Shift the order of the cr and ci values ++- y = _mm_shufflehi_epi16(_mm_shufflelo_epi16(y, _MM_SHUFFLE(2,3,0,1) ), _MM_SHUFFLE(2,3,0,1)); ++- ++- // Calculate the ar*(-ci) + cr*(ai) ++- imagz = _mm_madd_epi16(x,y); ++- ++- _mm_store_si128((__m128i*)c, _mm_packs_epi32(_mm_unpacklo_epi32(realz, imagz), _mm_unpackhi_epi32(realz, imagz))); ++- ++- a += 4; ++- b += 4; ++- c += 4; ++- } ++- ++- number = quarterPoints * 4; ++- int16_t* c16Ptr = (int16_t*)&cVector[number]; ++- int8_t* a8Ptr = (int8_t*)&aVector[number]; ++- int8_t* b8Ptr = (int8_t*)&bVector[number]; ++- for(; number < num_points; number++){ ++- float aReal = (float)*a8Ptr++; ++- float aImag = (float)*a8Ptr++; ++- lv_32fc_t aVal = lv_cmake(aReal, aImag ); ++- float bReal = (float)*b8Ptr++; ++- float bImag = (float)*b8Ptr++; ++- lv_32fc_t bVal = lv_cmake( bReal, -bImag ); ++- lv_32fc_t temp = aVal * bVal; ++- ++- *c16Ptr++ = (int16_t)lv_creal(temp); ++- *c16Ptr++ = (int16_t)lv_cimag(temp); ++- } +++static inline void volk_8ic_x2_multiply_conjugate_16ic_a_sse4_1(lv_16sc_t* cVector, +++ const lv_8sc_t* aVector, +++ const lv_8sc_t* bVector, +++ unsigned int num_points) +++{ +++ unsigned int number = 0; +++ const unsigned int quarterPoints = num_points / 4; +++ +++ __m128i x, y, realz, imagz; +++ lv_16sc_t* c = cVector; +++ const lv_8sc_t* a = aVector; +++ const lv_8sc_t* b = bVector; +++ __m128i conjugateSign = _mm_set_epi16(-1, 1, -1, 1, -1, 1, -1, 1); +++ +++ for (; number < quarterPoints; number++) { +++ // Convert into 8 bit values into 16 bit values +++ x = _mm_cvtepi8_epi16(_mm_loadl_epi64((__m128i*)a)); +++ y = _mm_cvtepi8_epi16(_mm_loadl_epi64((__m128i*)b)); +++ +++ // Calculate the 
ar*cr - ai*(-ci) portions +++ realz = _mm_madd_epi16(x, y); +++ +++ // Calculate the complex conjugate of the cr + ci j values +++ y = _mm_sign_epi16(y, conjugateSign); +++ +++ // Shift the order of the cr and ci values +++ y = _mm_shufflehi_epi16(_mm_shufflelo_epi16(y, _MM_SHUFFLE(2, 3, 0, 1)), +++ _MM_SHUFFLE(2, 3, 0, 1)); +++ +++ // Calculate the ar*(-ci) + cr*(ai) +++ imagz = _mm_madd_epi16(x, y); +++ +++ _mm_store_si128((__m128i*)c, +++ _mm_packs_epi32(_mm_unpacklo_epi32(realz, imagz), +++ _mm_unpackhi_epi32(realz, imagz))); +++ +++ a += 4; +++ b += 4; +++ c += 4; +++ } +++ +++ number = quarterPoints * 4; +++ int16_t* c16Ptr = (int16_t*)&cVector[number]; +++ int8_t* a8Ptr = (int8_t*)&aVector[number]; +++ int8_t* b8Ptr = (int8_t*)&bVector[number]; +++ for (; number < num_points; number++) { +++ float aReal = (float)*a8Ptr++; +++ float aImag = (float)*a8Ptr++; +++ lv_32fc_t aVal = lv_cmake(aReal, aImag); +++ float bReal = (float)*b8Ptr++; +++ float bImag = (float)*b8Ptr++; +++ lv_32fc_t bVal = lv_cmake(bReal, -bImag); +++ lv_32fc_t temp = aVal * bVal; +++ +++ *c16Ptr++ = (int16_t)lv_creal(temp); +++ *c16Ptr++ = (int16_t)lv_cimag(temp); +++ } ++ } ++ #endif /* LV_HAVE_SSE4_1 */ ++ ++ #ifdef LV_HAVE_GENERIC ++ /*! ++- \brief Multiplys the one complex vector with the complex conjugate of the second complex vector and stores their results in the third vector ++- \param cVector The complex vector where the results will be stored ++- \param aVector One of the complex vectors to be multiplied ++- \param bVector The complex vector which will be converted to complex conjugate and multiplied ++- \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector +++ \brief Multiplys the one complex vector with the complex conjugate of the second complex +++ vector and stores their results in the third vector \param cVector The complex vector +++ where the results will be stored \param aVector One of the complex vectors to be +++ multiplied \param bVector The complex vector which will be converted to complex +++ conjugate and multiplied \param num_points The number of complex values in aVector and +++ bVector to be multiplied together and stored into cVector ++ */ ++-static inline void volk_8ic_x2_multiply_conjugate_16ic_generic(lv_16sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, unsigned int num_points){ ++- unsigned int number = 0; ++- int16_t* c16Ptr = (int16_t*)cVector; ++- int8_t* a8Ptr = (int8_t*)aVector; ++- int8_t* b8Ptr = (int8_t*)bVector; ++- for(number =0; number < num_points; number++){ ++- float aReal = (float)*a8Ptr++; ++- float aImag = (float)*a8Ptr++; ++- lv_32fc_t aVal = lv_cmake(aReal, aImag ); ++- float bReal = (float)*b8Ptr++; ++- float bImag = (float)*b8Ptr++; ++- lv_32fc_t bVal = lv_cmake( bReal, -bImag ); ++- lv_32fc_t temp = aVal * bVal; ++- ++- *c16Ptr++ = (int16_t)lv_creal(temp); ++- *c16Ptr++ = (int16_t)lv_cimag(temp); ++- } +++static inline void volk_8ic_x2_multiply_conjugate_16ic_generic(lv_16sc_t* cVector, +++ const lv_8sc_t* aVector, +++ const lv_8sc_t* bVector, +++ unsigned int num_points) +++{ +++ unsigned int number = 0; +++ int16_t* c16Ptr = (int16_t*)cVector; +++ int8_t* a8Ptr = (int8_t*)aVector; +++ int8_t* b8Ptr = (int8_t*)bVector; +++ for (number = 0; number < num_points; number++) { +++ float aReal = (float)*a8Ptr++; +++ float aImag = (float)*a8Ptr++; +++ lv_32fc_t aVal = lv_cmake(aReal, aImag); +++ float bReal = (float)*b8Ptr++; +++ float bImag = (float)*b8Ptr++; +++ lv_32fc_t bVal = 
lv_cmake(bReal, -bImag); +++ lv_32fc_t temp = aVal * bVal; +++ +++ *c16Ptr++ = (int16_t)lv_creal(temp); +++ *c16Ptr++ = (int16_t)lv_cimag(temp); +++ } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++@@ -194,64 +216,73 @@ static inline void volk_8ic_x2_multiply_conjugate_16ic_generic(lv_16sc_t* cVecto ++ #ifdef LV_HAVE_AVX2 ++ #include ++ /*! ++- \brief Multiplys the one complex vector with the complex conjugate of the second complex vector and stores their results in the third vector ++- \param cVector The complex vector where the results will be stored ++- \param aVector One of the complex vectors to be multiplied ++- \param bVector The complex vector which will be converted to complex conjugate and multiplied ++- \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector +++ \brief Multiplys the one complex vector with the complex conjugate of the second complex +++ vector and stores their results in the third vector \param cVector The complex vector +++ where the results will be stored \param aVector One of the complex vectors to be +++ multiplied \param bVector The complex vector which will be converted to complex +++ conjugate and multiplied \param num_points The number of complex values in aVector and +++ bVector to be multiplied together and stored into cVector ++ */ ++-static inline void volk_8ic_x2_multiply_conjugate_16ic_u_avx2(lv_16sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, unsigned int num_points){ ++- unsigned int number = 0; ++- const unsigned int oneEigthPoints = num_points / 8; ++- ++- __m256i x, y, realz, imagz; ++- lv_16sc_t* c = cVector; ++- const lv_8sc_t* a = aVector; ++- const lv_8sc_t* b = bVector; ++- __m256i conjugateSign = _mm256_set_epi16(-1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1); ++- ++- for(;number < oneEigthPoints; number++){ ++- // Convert 8 bit values into 16 bit values ++- x = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)a)); ++- y = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)b)); ++- ++- // Calculate the ar*cr - ai*(-ci) portions ++- realz = _mm256_madd_epi16(x,y); ++- ++- // Calculate the complex conjugate of the cr + ci j values ++- y = _mm256_sign_epi16(y, conjugateSign); ++- ++- // Shift the order of the cr and ci values ++- y = _mm256_shufflehi_epi16(_mm256_shufflelo_epi16(y, _MM_SHUFFLE(2,3,0,1) ), _MM_SHUFFLE(2,3,0,1)); ++- ++- // Calculate the ar*(-ci) + cr*(ai) ++- imagz = _mm256_madd_epi16(x,y); ++- ++- // Perform the addition of products ++- ++- _mm256_storeu_si256((__m256i*)c, _mm256_packs_epi32(_mm256_unpacklo_epi32(realz, imagz), _mm256_unpackhi_epi32(realz, imagz))); ++- ++- a += 8; ++- b += 8; ++- c += 8; ++- } ++- ++- number = oneEigthPoints * 8; ++- int16_t* c16Ptr = (int16_t*)&cVector[number]; ++- int8_t* a8Ptr = (int8_t*)&aVector[number]; ++- int8_t* b8Ptr = (int8_t*)&bVector[number]; ++- for(; number < num_points; number++){ ++- float aReal = (float)*a8Ptr++; ++- float aImag = (float)*a8Ptr++; ++- lv_32fc_t aVal = lv_cmake(aReal, aImag ); ++- float bReal = (float)*b8Ptr++; ++- float bImag = (float)*b8Ptr++; ++- lv_32fc_t bVal = lv_cmake( bReal, -bImag ); ++- lv_32fc_t temp = aVal * bVal; ++- ++- *c16Ptr++ = (int16_t)lv_creal(temp); ++- *c16Ptr++ = (int16_t)lv_cimag(temp); ++- } +++static inline void volk_8ic_x2_multiply_conjugate_16ic_u_avx2(lv_16sc_t* cVector, +++ const lv_8sc_t* aVector, +++ const lv_8sc_t* bVector, +++ unsigned int num_points) +++{ +++ unsigned int number = 0; +++ const unsigned int oneEigthPoints = num_points / 8; +++ 
+++ __m256i x, y, realz, imagz; +++ lv_16sc_t* c = cVector; +++ const lv_8sc_t* a = aVector; +++ const lv_8sc_t* b = bVector; +++ __m256i conjugateSign = +++ _mm256_set_epi16(-1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1); +++ +++ for (; number < oneEigthPoints; number++) { +++ // Convert 8 bit values into 16 bit values +++ x = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)a)); +++ y = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)b)); +++ +++ // Calculate the ar*cr - ai*(-ci) portions +++ realz = _mm256_madd_epi16(x, y); +++ +++ // Calculate the complex conjugate of the cr + ci j values +++ y = _mm256_sign_epi16(y, conjugateSign); +++ +++ // Shift the order of the cr and ci values +++ y = _mm256_shufflehi_epi16(_mm256_shufflelo_epi16(y, _MM_SHUFFLE(2, 3, 0, 1)), +++ _MM_SHUFFLE(2, 3, 0, 1)); +++ +++ // Calculate the ar*(-ci) + cr*(ai) +++ imagz = _mm256_madd_epi16(x, y); +++ +++ // Perform the addition of products +++ +++ _mm256_storeu_si256((__m256i*)c, +++ _mm256_packs_epi32(_mm256_unpacklo_epi32(realz, imagz), +++ _mm256_unpackhi_epi32(realz, imagz))); +++ +++ a += 8; +++ b += 8; +++ c += 8; +++ } +++ +++ number = oneEigthPoints * 8; +++ int16_t* c16Ptr = (int16_t*)&cVector[number]; +++ int8_t* a8Ptr = (int8_t*)&aVector[number]; +++ int8_t* b8Ptr = (int8_t*)&bVector[number]; +++ for (; number < num_points; number++) { +++ float aReal = (float)*a8Ptr++; +++ float aImag = (float)*a8Ptr++; +++ lv_32fc_t aVal = lv_cmake(aReal, aImag); +++ float bReal = (float)*b8Ptr++; +++ float bImag = (float)*b8Ptr++; +++ lv_32fc_t bVal = lv_cmake(bReal, -bImag); +++ lv_32fc_t temp = aVal * bVal; +++ +++ *c16Ptr++ = (int16_t)lv_creal(temp); +++ *c16Ptr++ = (int16_t)lv_cimag(temp); +++ } ++ } ++ #endif /* LV_HAVE_AVX2 */ ++ ++diff --git a/kernels/volk/volk_8ic_x2_s32f_multiply_conjugate_32fc.h b/kernels/volk/volk_8ic_x2_s32f_multiply_conjugate_32fc.h ++index 82e40c8..db6bd7a 100644 ++--- a/kernels/volk/volk_8ic_x2_s32f_multiply_conjugate_32fc.h +++++ b/kernels/volk/volk_8ic_x2_s32f_multiply_conjugate_32fc.h ++@@ -30,14 +30,15 @@ ++ * ++ * Dispatcher Prototype ++ * \code ++- * void volk_8ic_x2_s32f_multiply_conjugate_32fc(lv_32fc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, const float scalar, unsigned int num_points) ++- * \endcode +++ * void volk_8ic_x2_s32f_multiply_conjugate_32fc(lv_32fc_t* cVector, const lv_8sc_t* +++ * aVector, const lv_8sc_t* bVector, const float scalar, unsigned int num_points) \endcode ++ * ++ * \b Inputs ++ * \li aVector: One of the complex vectors to be multiplied. ++- * \li bVector: The complex vector which will be converted to complex conjugate and multiplied. ++- * \li scalar: each output value is scaled by 1/scalar. ++- * \li num_points: The number of complex values in aVector and bVector to be multiplied together and stored into cVector. +++ * \li bVector: The complex vector which will be converted to complex conjugate and +++ * multiplied. \li scalar: each output value is scaled by 1/scalar. \li num_points: The +++ * number of complex values in aVector and bVector to be multiplied together and stored +++ * into cVector. ++ * ++ * \b Outputs ++ * \li cVector: The complex vector where the results will be stored. 
++@@ -64,160 +65,167 @@ ++ #include ++ ++ static inline void ++-volk_8ic_x2_s32f_multiply_conjugate_32fc_a_avx2(lv_32fc_t* cVector, const lv_8sc_t* aVector, ++- const lv_8sc_t* bVector, const float scalar, ++- unsigned int num_points) +++volk_8ic_x2_s32f_multiply_conjugate_32fc_a_avx2(lv_32fc_t* cVector, +++ const lv_8sc_t* aVector, +++ const lv_8sc_t* bVector, +++ const float scalar, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int oneEigthPoints = num_points / 8; ++- ++- __m256i x, y, realz, imagz; ++- __m256 ret, retlo, rethi; ++- lv_32fc_t* c = cVector; ++- const lv_8sc_t* a = aVector; ++- const lv_8sc_t* b = bVector; ++- __m256i conjugateSign = _mm256_set_epi16(-1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1); ++- ++- __m256 invScalar = _mm256_set1_ps(1.0/scalar); ++- ++- for(;number < oneEigthPoints; number++){ ++- // Convert 8 bit values into 16 bit values ++- x = _mm256_cvtepi8_epi16(_mm_load_si128((__m128i*)a)); ++- y = _mm256_cvtepi8_epi16(_mm_load_si128((__m128i*)b)); ++- ++- // Calculate the ar*cr - ai*(-ci) portions ++- realz = _mm256_madd_epi16(x,y); ++- ++- // Calculate the complex conjugate of the cr + ci j values ++- y = _mm256_sign_epi16(y, conjugateSign); ++- ++- // Shift the order of the cr and ci values ++- y = _mm256_shufflehi_epi16(_mm256_shufflelo_epi16(y, _MM_SHUFFLE(2,3,0,1) ), _MM_SHUFFLE(2,3,0,1)); ++- ++- // Calculate the ar*(-ci) + cr*(ai) ++- imagz = _mm256_madd_epi16(x,y); ++- ++- // Interleave real and imaginary and then convert to float values ++- retlo = _mm256_cvtepi32_ps(_mm256_unpacklo_epi32(realz, imagz)); ++- ++- // Normalize the floating point values ++- retlo = _mm256_mul_ps(retlo, invScalar); ++- ++- // Interleave real and imaginary and then convert to float values ++- rethi = _mm256_cvtepi32_ps(_mm256_unpackhi_epi32(realz, imagz)); ++- ++- // Normalize the floating point values ++- rethi = _mm256_mul_ps(rethi, invScalar); ++- ++- ret = _mm256_permute2f128_ps(retlo, rethi, 0b00100000); ++- _mm256_store_ps((float*)c, ret); ++- c += 4; ++- ++- ret = _mm256_permute2f128_ps(retlo, rethi, 0b00110001); ++- _mm256_store_ps((float*)c, ret); ++- c += 4; ++- ++- a += 8; ++- b += 8; ++- } ++- ++- number = oneEigthPoints * 8; ++- float* cFloatPtr = (float*)&cVector[number]; ++- int8_t* a8Ptr = (int8_t*)&aVector[number]; ++- int8_t* b8Ptr = (int8_t*)&bVector[number]; ++- for(; number < num_points; number++){ ++- float aReal = (float)*a8Ptr++; ++- float aImag = (float)*a8Ptr++; ++- lv_32fc_t aVal = lv_cmake(aReal, aImag ); ++- float bReal = (float)*b8Ptr++; ++- float bImag = (float)*b8Ptr++; ++- lv_32fc_t bVal = lv_cmake( bReal, -bImag ); ++- lv_32fc_t temp = aVal * bVal; ++- ++- *cFloatPtr++ = lv_creal(temp) / scalar; ++- *cFloatPtr++ = lv_cimag(temp) / scalar; ++- } +++ unsigned int number = 0; +++ const unsigned int oneEigthPoints = num_points / 8; +++ +++ __m256i x, y, realz, imagz; +++ __m256 ret, retlo, rethi; +++ lv_32fc_t* c = cVector; +++ const lv_8sc_t* a = aVector; +++ const lv_8sc_t* b = bVector; +++ __m256i conjugateSign = +++ _mm256_set_epi16(-1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1); +++ +++ __m256 invScalar = _mm256_set1_ps(1.0 / scalar); +++ +++ for (; number < oneEigthPoints; number++) { +++ // Convert 8 bit values into 16 bit values +++ x = _mm256_cvtepi8_epi16(_mm_load_si128((__m128i*)a)); +++ y = _mm256_cvtepi8_epi16(_mm_load_si128((__m128i*)b)); +++ +++ // Calculate the ar*cr - ai*(-ci) portions +++ realz = _mm256_madd_epi16(x, y); +++ +++ // Calculate the complex conjugate of the 
cr + ci j values +++ y = _mm256_sign_epi16(y, conjugateSign); +++ +++ // Shift the order of the cr and ci values +++ y = _mm256_shufflehi_epi16(_mm256_shufflelo_epi16(y, _MM_SHUFFLE(2, 3, 0, 1)), +++ _MM_SHUFFLE(2, 3, 0, 1)); +++ +++ // Calculate the ar*(-ci) + cr*(ai) +++ imagz = _mm256_madd_epi16(x, y); +++ +++ // Interleave real and imaginary and then convert to float values +++ retlo = _mm256_cvtepi32_ps(_mm256_unpacklo_epi32(realz, imagz)); +++ +++ // Normalize the floating point values +++ retlo = _mm256_mul_ps(retlo, invScalar); +++ +++ // Interleave real and imaginary and then convert to float values +++ rethi = _mm256_cvtepi32_ps(_mm256_unpackhi_epi32(realz, imagz)); +++ +++ // Normalize the floating point values +++ rethi = _mm256_mul_ps(rethi, invScalar); +++ +++ ret = _mm256_permute2f128_ps(retlo, rethi, 0b00100000); +++ _mm256_store_ps((float*)c, ret); +++ c += 4; +++ +++ ret = _mm256_permute2f128_ps(retlo, rethi, 0b00110001); +++ _mm256_store_ps((float*)c, ret); +++ c += 4; +++ +++ a += 8; +++ b += 8; +++ } +++ +++ number = oneEigthPoints * 8; +++ float* cFloatPtr = (float*)&cVector[number]; +++ int8_t* a8Ptr = (int8_t*)&aVector[number]; +++ int8_t* b8Ptr = (int8_t*)&bVector[number]; +++ for (; number < num_points; number++) { +++ float aReal = (float)*a8Ptr++; +++ float aImag = (float)*a8Ptr++; +++ lv_32fc_t aVal = lv_cmake(aReal, aImag); +++ float bReal = (float)*b8Ptr++; +++ float bImag = (float)*b8Ptr++; +++ lv_32fc_t bVal = lv_cmake(bReal, -bImag); +++ lv_32fc_t temp = aVal * bVal; +++ +++ *cFloatPtr++ = lv_creal(temp) / scalar; +++ *cFloatPtr++ = lv_cimag(temp) / scalar; +++ } ++ } ++-#endif /* LV_HAVE_AVX2*/ +++#endif /* LV_HAVE_AVX2*/ ++ ++ ++ #ifdef LV_HAVE_SSE4_1 ++ #include ++ ++ static inline void ++-volk_8ic_x2_s32f_multiply_conjugate_32fc_a_sse4_1(lv_32fc_t* cVector, const lv_8sc_t* aVector, ++- const lv_8sc_t* bVector, const float scalar, +++volk_8ic_x2_s32f_multiply_conjugate_32fc_a_sse4_1(lv_32fc_t* cVector, +++ const lv_8sc_t* aVector, +++ const lv_8sc_t* bVector, +++ const float scalar, ++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int quarterPoints = num_points / 4; ++- ++- __m128i x, y, realz, imagz; ++- __m128 ret; ++- lv_32fc_t* c = cVector; ++- const lv_8sc_t* a = aVector; ++- const lv_8sc_t* b = bVector; ++- __m128i conjugateSign = _mm_set_epi16(-1, 1, -1, 1, -1, 1, -1, 1); ++- ++- __m128 invScalar = _mm_set_ps1(1.0/scalar); ++- ++- for(;number < quarterPoints; number++){ ++- // Convert into 8 bit values into 16 bit values ++- x = _mm_cvtepi8_epi16(_mm_loadl_epi64((__m128i*)a)); ++- y = _mm_cvtepi8_epi16(_mm_loadl_epi64((__m128i*)b)); ++- ++- // Calculate the ar*cr - ai*(-ci) portions ++- realz = _mm_madd_epi16(x,y); ++- ++- // Calculate the complex conjugate of the cr + ci j values ++- y = _mm_sign_epi16(y, conjugateSign); ++- ++- // Shift the order of the cr and ci values ++- y = _mm_shufflehi_epi16(_mm_shufflelo_epi16(y, _MM_SHUFFLE(2,3,0,1) ), _MM_SHUFFLE(2,3,0,1)); ++- ++- // Calculate the ar*(-ci) + cr*(ai) ++- imagz = _mm_madd_epi16(x,y); ++- ++- // Interleave real and imaginary and then convert to float values ++- ret = _mm_cvtepi32_ps(_mm_unpacklo_epi32(realz, imagz)); ++- ++- // Normalize the floating point values ++- ret = _mm_mul_ps(ret, invScalar); ++- ++- // Store the floating point values ++- _mm_store_ps((float*)c, ret); ++- c += 2; ++- ++- // Interleave real and imaginary and then convert to float values ++- ret = _mm_cvtepi32_ps(_mm_unpackhi_epi32(realz, imagz)); ++- ++- // Normalize the floating 
point values ++- ret = _mm_mul_ps(ret, invScalar); ++- ++- // Store the floating point values ++- _mm_store_ps((float*)c, ret); ++- c += 2; ++- ++- a += 4; ++- b += 4; ++- } ++- ++- number = quarterPoints * 4; ++- float* cFloatPtr = (float*)&cVector[number]; ++- int8_t* a8Ptr = (int8_t*)&aVector[number]; ++- int8_t* b8Ptr = (int8_t*)&bVector[number]; ++- for(; number < num_points; number++){ ++- float aReal = (float)*a8Ptr++; ++- float aImag = (float)*a8Ptr++; ++- lv_32fc_t aVal = lv_cmake(aReal, aImag ); ++- float bReal = (float)*b8Ptr++; ++- float bImag = (float)*b8Ptr++; ++- lv_32fc_t bVal = lv_cmake( bReal, -bImag ); ++- lv_32fc_t temp = aVal * bVal; ++- ++- *cFloatPtr++ = lv_creal(temp) / scalar; ++- *cFloatPtr++ = lv_cimag(temp) / scalar; ++- } +++ unsigned int number = 0; +++ const unsigned int quarterPoints = num_points / 4; +++ +++ __m128i x, y, realz, imagz; +++ __m128 ret; +++ lv_32fc_t* c = cVector; +++ const lv_8sc_t* a = aVector; +++ const lv_8sc_t* b = bVector; +++ __m128i conjugateSign = _mm_set_epi16(-1, 1, -1, 1, -1, 1, -1, 1); +++ +++ __m128 invScalar = _mm_set_ps1(1.0 / scalar); +++ +++ for (; number < quarterPoints; number++) { +++ // Convert into 8 bit values into 16 bit values +++ x = _mm_cvtepi8_epi16(_mm_loadl_epi64((__m128i*)a)); +++ y = _mm_cvtepi8_epi16(_mm_loadl_epi64((__m128i*)b)); +++ +++ // Calculate the ar*cr - ai*(-ci) portions +++ realz = _mm_madd_epi16(x, y); +++ +++ // Calculate the complex conjugate of the cr + ci j values +++ y = _mm_sign_epi16(y, conjugateSign); +++ +++ // Shift the order of the cr and ci values +++ y = _mm_shufflehi_epi16(_mm_shufflelo_epi16(y, _MM_SHUFFLE(2, 3, 0, 1)), +++ _MM_SHUFFLE(2, 3, 0, 1)); +++ +++ // Calculate the ar*(-ci) + cr*(ai) +++ imagz = _mm_madd_epi16(x, y); +++ +++ // Interleave real and imaginary and then convert to float values +++ ret = _mm_cvtepi32_ps(_mm_unpacklo_epi32(realz, imagz)); +++ +++ // Normalize the floating point values +++ ret = _mm_mul_ps(ret, invScalar); +++ +++ // Store the floating point values +++ _mm_store_ps((float*)c, ret); +++ c += 2; +++ +++ // Interleave real and imaginary and then convert to float values +++ ret = _mm_cvtepi32_ps(_mm_unpackhi_epi32(realz, imagz)); +++ +++ // Normalize the floating point values +++ ret = _mm_mul_ps(ret, invScalar); +++ +++ // Store the floating point values +++ _mm_store_ps((float*)c, ret); +++ c += 2; +++ +++ a += 4; +++ b += 4; +++ } +++ +++ number = quarterPoints * 4; +++ float* cFloatPtr = (float*)&cVector[number]; +++ int8_t* a8Ptr = (int8_t*)&aVector[number]; +++ int8_t* b8Ptr = (int8_t*)&bVector[number]; +++ for (; number < num_points; number++) { +++ float aReal = (float)*a8Ptr++; +++ float aImag = (float)*a8Ptr++; +++ lv_32fc_t aVal = lv_cmake(aReal, aImag); +++ float bReal = (float)*b8Ptr++; +++ float bImag = (float)*b8Ptr++; +++ lv_32fc_t bVal = lv_cmake(bReal, -bImag); +++ lv_32fc_t temp = aVal * bVal; +++ +++ *cFloatPtr++ = lv_creal(temp) / scalar; +++ *cFloatPtr++ = lv_cimag(temp) / scalar; +++ } ++ } ++ #endif /* LV_HAVE_SSE4_1 */ ++ ++@@ -225,27 +233,29 @@ volk_8ic_x2_s32f_multiply_conjugate_32fc_a_sse4_1(lv_32fc_t* cVector, const lv_8 ++ #ifdef LV_HAVE_GENERIC ++ ++ static inline void ++-volk_8ic_x2_s32f_multiply_conjugate_32fc_generic(lv_32fc_t* cVector, const lv_8sc_t* aVector, ++- const lv_8sc_t* bVector, const float scalar, +++volk_8ic_x2_s32f_multiply_conjugate_32fc_generic(lv_32fc_t* cVector, +++ const lv_8sc_t* aVector, +++ const lv_8sc_t* bVector, +++ const float scalar, ++ unsigned int num_points) ++ { ++- unsigned int number = 
0; ++- float* cPtr = (float*)cVector; ++- const float invScalar = 1.0 / scalar; ++- int8_t* a8Ptr = (int8_t*)aVector; ++- int8_t* b8Ptr = (int8_t*)bVector; ++- for(number = 0; number < num_points; number++){ ++- float aReal = (float)*a8Ptr++; ++- float aImag = (float)*a8Ptr++; ++- lv_32fc_t aVal = lv_cmake(aReal, aImag ); ++- float bReal = (float)*b8Ptr++; ++- float bImag = (float)*b8Ptr++; ++- lv_32fc_t bVal = lv_cmake( bReal, -bImag ); ++- lv_32fc_t temp = aVal * bVal; ++- ++- *cPtr++ = (lv_creal(temp) * invScalar); ++- *cPtr++ = (lv_cimag(temp) * invScalar); ++- } +++ unsigned int number = 0; +++ float* cPtr = (float*)cVector; +++ const float invScalar = 1.0 / scalar; +++ int8_t* a8Ptr = (int8_t*)aVector; +++ int8_t* b8Ptr = (int8_t*)bVector; +++ for (number = 0; number < num_points; number++) { +++ float aReal = (float)*a8Ptr++; +++ float aImag = (float)*a8Ptr++; +++ lv_32fc_t aVal = lv_cmake(aReal, aImag); +++ float bReal = (float)*b8Ptr++; +++ float bImag = (float)*b8Ptr++; +++ lv_32fc_t bVal = lv_cmake(bReal, -bImag); +++ lv_32fc_t temp = aVal * bVal; +++ +++ *cPtr++ = (lv_creal(temp) * invScalar); +++ *cPtr++ = (lv_cimag(temp) * invScalar); +++ } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++@@ -263,81 +273,85 @@ volk_8ic_x2_s32f_multiply_conjugate_32fc_generic(lv_32fc_t* cVector, const lv_8s ++ #include ++ ++ static inline void ++-volk_8ic_x2_s32f_multiply_conjugate_32fc_u_avx2(lv_32fc_t* cVector, const lv_8sc_t* aVector, ++- const lv_8sc_t* bVector, const float scalar, ++- unsigned int num_points) +++volk_8ic_x2_s32f_multiply_conjugate_32fc_u_avx2(lv_32fc_t* cVector, +++ const lv_8sc_t* aVector, +++ const lv_8sc_t* bVector, +++ const float scalar, +++ unsigned int num_points) ++ { ++- unsigned int number = 0; ++- const unsigned int oneEigthPoints = num_points / 8; ++- ++- __m256i x, y, realz, imagz; ++- __m256 ret, retlo, rethi; ++- lv_32fc_t* c = cVector; ++- const lv_8sc_t* a = aVector; ++- const lv_8sc_t* b = bVector; ++- __m256i conjugateSign = _mm256_set_epi16(-1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1); ++- ++- __m256 invScalar = _mm256_set1_ps(1.0/scalar); ++- ++- for(;number < oneEigthPoints; number++){ ++- // Convert 8 bit values into 16 bit values ++- x = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)a)); ++- y = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)b)); ++- ++- // Calculate the ar*cr - ai*(-ci) portions ++- realz = _mm256_madd_epi16(x,y); ++- ++- // Calculate the complex conjugate of the cr + ci j values ++- y = _mm256_sign_epi16(y, conjugateSign); ++- ++- // Shift the order of the cr and ci values ++- y = _mm256_shufflehi_epi16(_mm256_shufflelo_epi16(y, _MM_SHUFFLE(2,3,0,1) ), _MM_SHUFFLE(2,3,0,1)); ++- ++- // Calculate the ar*(-ci) + cr*(ai) ++- imagz = _mm256_madd_epi16(x,y); ++- ++- // Interleave real and imaginary and then convert to float values ++- retlo = _mm256_cvtepi32_ps(_mm256_unpacklo_epi32(realz, imagz)); ++- ++- // Normalize the floating point values ++- retlo = _mm256_mul_ps(retlo, invScalar); ++- ++- // Interleave real and imaginary and then convert to float values ++- rethi = _mm256_cvtepi32_ps(_mm256_unpackhi_epi32(realz, imagz)); ++- ++- // Normalize the floating point values ++- rethi = _mm256_mul_ps(rethi, invScalar); ++- ++- ret = _mm256_permute2f128_ps(retlo, rethi, 0b00100000); ++- _mm256_storeu_ps((float*)c, ret); ++- c += 4; ++- ++- ret = _mm256_permute2f128_ps(retlo, rethi, 0b00110001); ++- _mm256_storeu_ps((float*)c, ret); ++- c += 4; ++- ++- a += 8; ++- b += 8; ++- } ++- ++- number = oneEigthPoints * 8; ++- float* 
cFloatPtr = (float*)&cVector[number]; ++- int8_t* a8Ptr = (int8_t*)&aVector[number]; ++- int8_t* b8Ptr = (int8_t*)&bVector[number]; ++- for(; number < num_points; number++){ ++- float aReal = (float)*a8Ptr++; ++- float aImag = (float)*a8Ptr++; ++- lv_32fc_t aVal = lv_cmake(aReal, aImag ); ++- float bReal = (float)*b8Ptr++; ++- float bImag = (float)*b8Ptr++; ++- lv_32fc_t bVal = lv_cmake( bReal, -bImag ); ++- lv_32fc_t temp = aVal * bVal; ++- ++- *cFloatPtr++ = lv_creal(temp) / scalar; ++- *cFloatPtr++ = lv_cimag(temp) / scalar; ++- } +++ unsigned int number = 0; +++ const unsigned int oneEigthPoints = num_points / 8; +++ +++ __m256i x, y, realz, imagz; +++ __m256 ret, retlo, rethi; +++ lv_32fc_t* c = cVector; +++ const lv_8sc_t* a = aVector; +++ const lv_8sc_t* b = bVector; +++ __m256i conjugateSign = +++ _mm256_set_epi16(-1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1); +++ +++ __m256 invScalar = _mm256_set1_ps(1.0 / scalar); +++ +++ for (; number < oneEigthPoints; number++) { +++ // Convert 8 bit values into 16 bit values +++ x = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)a)); +++ y = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)b)); +++ +++ // Calculate the ar*cr - ai*(-ci) portions +++ realz = _mm256_madd_epi16(x, y); +++ +++ // Calculate the complex conjugate of the cr + ci j values +++ y = _mm256_sign_epi16(y, conjugateSign); +++ +++ // Shift the order of the cr and ci values +++ y = _mm256_shufflehi_epi16(_mm256_shufflelo_epi16(y, _MM_SHUFFLE(2, 3, 0, 1)), +++ _MM_SHUFFLE(2, 3, 0, 1)); +++ +++ // Calculate the ar*(-ci) + cr*(ai) +++ imagz = _mm256_madd_epi16(x, y); +++ +++ // Interleave real and imaginary and then convert to float values +++ retlo = _mm256_cvtepi32_ps(_mm256_unpacklo_epi32(realz, imagz)); +++ +++ // Normalize the floating point values +++ retlo = _mm256_mul_ps(retlo, invScalar); +++ +++ // Interleave real and imaginary and then convert to float values +++ rethi = _mm256_cvtepi32_ps(_mm256_unpackhi_epi32(realz, imagz)); +++ +++ // Normalize the floating point values +++ rethi = _mm256_mul_ps(rethi, invScalar); +++ +++ ret = _mm256_permute2f128_ps(retlo, rethi, 0b00100000); +++ _mm256_storeu_ps((float*)c, ret); +++ c += 4; +++ +++ ret = _mm256_permute2f128_ps(retlo, rethi, 0b00110001); +++ _mm256_storeu_ps((float*)c, ret); +++ c += 4; +++ +++ a += 8; +++ b += 8; +++ } +++ +++ number = oneEigthPoints * 8; +++ float* cFloatPtr = (float*)&cVector[number]; +++ int8_t* a8Ptr = (int8_t*)&aVector[number]; +++ int8_t* b8Ptr = (int8_t*)&bVector[number]; +++ for (; number < num_points; number++) { +++ float aReal = (float)*a8Ptr++; +++ float aImag = (float)*a8Ptr++; +++ lv_32fc_t aVal = lv_cmake(aReal, aImag); +++ float bReal = (float)*b8Ptr++; +++ float bImag = (float)*b8Ptr++; +++ lv_32fc_t bVal = lv_cmake(bReal, -bImag); +++ lv_32fc_t temp = aVal * bVal; +++ +++ *cFloatPtr++ = lv_creal(temp) / scalar; +++ *cFloatPtr++ = lv_cimag(temp) / scalar; +++ } ++ } ++-#endif /* LV_HAVE_AVX2*/ +++#endif /* LV_HAVE_AVX2*/ ++ ++ ++ #endif /* INCLUDED_volk_8ic_x2_s32f_multiply_conjugate_32fc_u_H */ ++diff --git a/kernels/volk/volk_8u_conv_k7_r2puppet_8u.h b/kernels/volk/volk_8u_conv_k7_r2puppet_8u.h ++index 00f83de..69287cd 100644 ++--- a/kernels/volk/volk_8u_conv_k7_r2puppet_8u.h +++++ b/kernels/volk/volk_8u_conv_k7_r2puppet_8u.h ++@@ -23,21 +23,21 @@ ++ #ifndef INCLUDED_volk_8u_conv_k7_r2puppet_8u_H ++ #define INCLUDED_volk_8u_conv_k7_r2puppet_8u_H ++ +++#include ++ #include ++ #include ++-#include ++ ++ typedef union { ++- //decision_t is a BIT vector ++- unsigned char* t; 
++- unsigned int* w; +++ // decision_t is a BIT vector +++ unsigned char* t; +++ unsigned int* w; ++ } p_decision_t; ++ ++ static inline int parity(int x, unsigned char* Partab) ++ { ++- x ^= (x >> 16); ++- x ^= (x >> 8); ++- return Partab[x]; +++ x ^= (x >> 16); +++ x ^= (x >> 8); +++ return Partab[x]; ++ } ++ ++ static inline int chainback_viterbi(unsigned char* data, ++@@ -46,135 +46,143 @@ static inline int chainback_viterbi(unsigned char* data, ++ unsigned int tailsize, ++ unsigned char* decisions) ++ { ++- unsigned char* d; ++- int d_ADDSHIFT = 0; ++- int d_numstates = (1 << 6); ++- int d_decision_t_size = d_numstates/8; ++- unsigned int d_k = 7; ++- int d_framebits = nbits; ++- /* ADDSHIFT and SUBSHIFT make sure that the thing returned is a byte. */ ++- d = decisions; ++- /* Make room beyond the end of the encoder register so we can ++- * accumulate a full byte of decoded data ++- */ ++- ++- endstate = (endstate%d_numstates) << d_ADDSHIFT; ++- ++- /* The store into data[] only needs to be done every 8 bits. ++- * But this avoids a conditional branch, and the writes will ++- * combine in the cache anyway ++- */ ++- ++- d += tailsize * d_decision_t_size ; /* Look past tail */ ++- int retval; ++- int dif = tailsize - (d_k - 1); ++- //printf("break, %d, %d\n", dif, (nbits+dif)%d_framebits); ++- p_decision_t dec; ++- while(nbits-- > d_framebits - (d_k - 1)) { ++- int k; ++- dec.t = &d[nbits * d_decision_t_size]; ++- k = (dec.w[(endstate>>d_ADDSHIFT)/32] >> ((endstate>>d_ADDSHIFT)%32)) & 1; ++- ++- endstate = (endstate >> 1) | (k << (d_k-2+d_ADDSHIFT)); ++- //data[((nbits+dif)%nbits)>>3] = endstate>>d_SUBSHIFT; ++- //printf("%d, %d\n", k, (nbits+dif)%d_framebits); ++- data[((nbits+dif)%d_framebits)] = k; ++- ++- retval = endstate; ++- } ++- nbits += 1; ++- ++- while(nbits-- != 0) { ++- int k; ++- ++- dec.t = &d[nbits * d_decision_t_size]; ++- ++- k = (dec.w[(endstate>>d_ADDSHIFT)/32] >> ((endstate>>d_ADDSHIFT)%32)) & 1; ++- ++- endstate = (endstate >> 1) | (k << (d_k-2+d_ADDSHIFT)); ++- data[((nbits+dif)%d_framebits)] = k; ++- } ++- //printf("%d, %d, %d, %d, %d, %d, %d, %d\n", data[4095],data[4094],data[4093],data[4092],data[4091],data[4090],data[4089],data[4088]); ++- ++- ++- return retval >> d_ADDSHIFT; +++ unsigned char* d; +++ int d_ADDSHIFT = 0; +++ int d_numstates = (1 << 6); +++ int d_decision_t_size = d_numstates / 8; +++ unsigned int d_k = 7; +++ int d_framebits = nbits; +++ /* ADDSHIFT and SUBSHIFT make sure that the thing returned is a byte. */ +++ d = decisions; +++ /* Make room beyond the end of the encoder register so we can +++ * accumulate a full byte of decoded data +++ */ +++ +++ endstate = (endstate % d_numstates) << d_ADDSHIFT; +++ +++ /* The store into data[] only needs to be done every 8 bits. 
+++ * But this avoids a conditional branch, and the writes will +++ * combine in the cache anyway +++ */ +++ +++ d += tailsize * d_decision_t_size; /* Look past tail */ +++ int retval; +++ int dif = tailsize - (d_k - 1); +++ // printf("break, %d, %d\n", dif, (nbits+dif)%d_framebits); +++ p_decision_t dec; +++ while (nbits-- > d_framebits - (d_k - 1)) { +++ int k; +++ dec.t = &d[nbits * d_decision_t_size]; +++ k = (dec.w[(endstate >> d_ADDSHIFT) / 32] >> ((endstate >> d_ADDSHIFT) % 32)) & 1; +++ +++ endstate = (endstate >> 1) | (k << (d_k - 2 + d_ADDSHIFT)); +++ // data[((nbits+dif)%nbits)>>3] = endstate>>d_SUBSHIFT; +++ // printf("%d, %d\n", k, (nbits+dif)%d_framebits); +++ data[((nbits + dif) % d_framebits)] = k; +++ +++ retval = endstate; +++ } +++ nbits += 1; +++ +++ while (nbits-- != 0) { +++ int k; +++ +++ dec.t = &d[nbits * d_decision_t_size]; +++ +++ k = (dec.w[(endstate >> d_ADDSHIFT) / 32] >> ((endstate >> d_ADDSHIFT) % 32)) & 1; +++ +++ endstate = (endstate >> 1) | (k << (d_k - 2 + d_ADDSHIFT)); +++ data[((nbits + dif) % d_framebits)] = k; +++ } +++ // printf("%d, %d, %d, %d, %d, %d, %d, %d\n", +++ // data[4095],data[4094],data[4093],data[4092],data[4091],data[4090],data[4089],data[4088]); +++ +++ +++ return retval >> d_ADDSHIFT; ++ } ++ ++ ++ #if LV_HAVE_SSE3 ++ ++-#include ++ #include ++-#include ++ #include +++#include ++ #include +++#include ++ ++-static inline void volk_8u_conv_k7_r2puppet_8u_spiral(unsigned char* syms, unsigned char* dec, unsigned int framebits) { ++- ++- ++- static int once = 1; ++- int d_numstates = (1 << 6); ++- int rate = 2; ++- static unsigned char* D; ++- static unsigned char* Y; ++- static unsigned char* X; ++- static unsigned int excess = 6; ++- static unsigned char* Branchtab; ++- static unsigned char Partab[256]; ++- ++- int d_polys[2] = {79, 109}; ++- ++- ++- if(once) { ++- ++- X = (unsigned char*)volk_malloc(2*d_numstates, volk_get_alignment()); ++- Y = X + d_numstates; ++- Branchtab = (unsigned char*)volk_malloc(d_numstates/2*rate, volk_get_alignment()); ++- D = (unsigned char*)volk_malloc((d_numstates/8) * (framebits + 6), volk_get_alignment()); ++- int state, i; ++- int cnt,ti; ++- ++- /* Initialize parity lookup table */ ++- for(i=0;i<256;i++){ ++- cnt = 0; ++- ti = i; ++- while(ti){ ++- if(ti & 1) ++- cnt++; ++- ti >>= 1; ++- } ++- Partab[i] = cnt & 1; ++- } ++- /* Initialize the branch table */ ++- for(state=0;state < d_numstates/2;state++){ ++- for(i=0; i>= 1; +++ } +++ Partab[i] = cnt & 1; +++ } +++ /* Initialize the branch table */ +++ for (state = 0; state < d_numstates / 2; state++) { +++ for (i = 0; i < rate; i++) { +++ Branchtab[i * d_numstates / 2 + state] = +++ parity((2 * state) & d_polys[i], Partab) ? 
255 : 0; +++ } +++ } +++ +++ once = 0; +++ } +++ +++ // unbias the old_metrics +++ memset(X, 31, d_numstates); ++ ++- // initialize decisions ++- memset(D, 0, (d_numstates/8) * (framebits + 6)); +++ // initialize decisions +++ memset(D, 0, (d_numstates / 8) * (framebits + 6)); ++ ++- volk_8u_x4_conv_k7_r2_8u_spiral(Y, X, syms, D, framebits/2 - excess, excess, Branchtab); +++ volk_8u_x4_conv_k7_r2_8u_spiral( +++ Y, X, syms, D, framebits / 2 - excess, excess, Branchtab); ++ ++- unsigned int min = X[0]; ++- int i = 0, state = 0; ++- for(i = 0; i < (d_numstates); ++i) { ++- if(X[i] < min) { ++- min = X[i]; ++- state = i; +++ unsigned int min = X[0]; +++ int i = 0, state = 0; +++ for (i = 0; i < (d_numstates); ++i) { +++ if (X[i] < min) { +++ min = X[i]; +++ state = i; +++ } ++ } ++- } ++ ++- chainback_viterbi(dec, framebits/2 -excess, state, excess, D); +++ chainback_viterbi(dec, framebits / 2 - excess, state, excess, D); ++ ++- return; +++ return; ++ } ++ ++ #endif /*LV_HAVE_SSE3*/ ++@@ -185,151 +193,161 @@ static inline void volk_8u_conv_k7_r2puppet_8u_spiral(unsigned char* syms, unsig ++ #include ++ #include ++ ++-static inline void volk_8u_conv_k7_r2puppet_8u_avx2(unsigned char* syms, unsigned char* dec, unsigned int framebits) { ++- ++- ++- static int once = 1; ++- int d_numstates = (1 << 6); ++- int rate = 2; ++- static unsigned char* D; ++- static unsigned char* Y; ++- static unsigned char* X; ++- static unsigned int excess = 6; ++- static unsigned char* Branchtab; ++- static unsigned char Partab[256]; ++- ++- int d_polys[2] = {79, 109}; ++- ++- ++- if(once) { ++- ++- X = (unsigned char*)volk_malloc(2*d_numstates, volk_get_alignment()); ++- Y = X + d_numstates; ++- Branchtab = (unsigned char*)volk_malloc(d_numstates/2*rate, volk_get_alignment()); ++- D = (unsigned char*)volk_malloc((d_numstates/8) * (framebits + 6), volk_get_alignment()); ++- int state, i; ++- int cnt,ti; ++- ++- /* Initialize parity lookup table */ ++- for(i=0;i<256;i++){ ++- cnt = 0; ++- ti = i; ++- while(ti){ ++- if(ti & 1) ++- cnt++; ++- ti >>= 1; ++- } ++- Partab[i] = cnt & 1; ++- } ++- /* Initialize the branch table */ ++- for(state=0;state < d_numstates/2;state++){ ++- for(i=0; i>= 1; +++ } +++ Partab[i] = cnt & 1; +++ } +++ /* Initialize the branch table */ +++ for (state = 0; state < d_numstates / 2; state++) { +++ for (i = 0; i < rate; i++) { +++ Branchtab[i * d_numstates / 2 + state] = +++ parity((2 * state) & d_polys[i], Partab) ? 
255 : 0; +++ } +++ } +++ +++ once = 0; +++ } +++ +++ // unbias the old_metrics +++ memset(X, 31, d_numstates); ++ ++- // initialize decisions ++- memset(D, 0, (d_numstates/8) * (framebits + 6)); +++ // initialize decisions +++ memset(D, 0, (d_numstates / 8) * (framebits + 6)); ++ ++- volk_8u_x4_conv_k7_r2_8u_avx2(Y, X, syms, D, framebits/2 - excess, excess, Branchtab); +++ volk_8u_x4_conv_k7_r2_8u_avx2( +++ Y, X, syms, D, framebits / 2 - excess, excess, Branchtab); ++ ++- unsigned int min = X[0]; ++- int i = 0, state = 0; ++- for(i = 0; i < (d_numstates); ++i) { ++- if(X[i] < min) { ++- min = X[i]; ++- state = i; +++ unsigned int min = X[0]; +++ int i = 0, state = 0; +++ for (i = 0; i < (d_numstates); ++i) { +++ if (X[i] < min) { +++ min = X[i]; +++ state = i; +++ } ++ } ++- } ++ ++- chainback_viterbi(dec, framebits/2 -excess, state, excess, D); +++ chainback_viterbi(dec, framebits / 2 - excess, state, excess, D); ++ ++- return; +++ return; ++ } ++ ++ #endif /*LV_HAVE_AVX2*/ ++ ++ ++- ++ #if LV_HAVE_GENERIC ++ ++ ++-static inline void volk_8u_conv_k7_r2puppet_8u_generic(unsigned char* syms, unsigned char* dec, unsigned int framebits) { ++- ++- ++- ++- static int once = 1; ++- int d_numstates = (1 << 6); ++- int rate = 2; ++- static unsigned char* Y; ++- static unsigned char* X; ++- static unsigned char* D; ++- static unsigned int excess = 6; ++- static unsigned char* Branchtab; ++- static unsigned char Partab[256]; ++- ++- int d_polys[2] = {79, 109}; ++- ++- ++- if(once) { ++- ++- X = (unsigned char*)volk_malloc(2*d_numstates, volk_get_alignment()); ++- Y = X + d_numstates; ++- Branchtab = (unsigned char*)volk_malloc(d_numstates/2*rate, volk_get_alignment()); ++- D = (unsigned char*)volk_malloc((d_numstates/8) * (framebits + 6), volk_get_alignment()); +++static inline void volk_8u_conv_k7_r2puppet_8u_generic(unsigned char* syms, +++ unsigned char* dec, +++ unsigned int framebits) +++{ ++ ++- int state, i; ++- int cnt,ti; ++ ++- /* Initialize parity lookup table */ ++- for(i=0;i<256;i++){ ++- cnt = 0; ++- ti = i; ++- while(ti){ ++- if(ti & 1) ++- cnt++; ++- ti >>= 1; ++- } ++- Partab[i] = cnt & 1; +++ static int once = 1; +++ int d_numstates = (1 << 6); +++ int rate = 2; +++ static unsigned char* Y; +++ static unsigned char* X; +++ static unsigned char* D; +++ static unsigned int excess = 6; +++ static unsigned char* Branchtab; +++ static unsigned char Partab[256]; +++ +++ int d_polys[2] = { 79, 109 }; +++ +++ +++ if (once) { +++ +++ X = (unsigned char*)volk_malloc(2 * d_numstates, volk_get_alignment()); +++ Y = X + d_numstates; +++ Branchtab = +++ (unsigned char*)volk_malloc(d_numstates / 2 * rate, volk_get_alignment()); +++ D = (unsigned char*)volk_malloc((d_numstates / 8) * (framebits + 6), +++ volk_get_alignment()); +++ +++ int state, i; +++ int cnt, ti; +++ +++ /* Initialize parity lookup table */ +++ for (i = 0; i < 256; i++) { +++ cnt = 0; +++ ti = i; +++ while (ti) { +++ if (ti & 1) +++ cnt++; +++ ti >>= 1; +++ } +++ Partab[i] = cnt & 1; +++ } +++ /* Initialize the branch table */ +++ for (state = 0; state < d_numstates / 2; state++) { +++ for (i = 0; i < rate; i++) { +++ Branchtab[i * d_numstates / 2 + state] = +++ parity((2 * state) & d_polys[i], Partab) ? 
255 : 0; +++ } +++ } +++ +++ once = 0; ++ } ++- /* Initialize the branch table */ ++- for(state=0;state < d_numstates/2;state++){ ++- for(i=0; i ++ ++-static inline unsigned int ++-log2_of_power_of_2(unsigned int val){ ++- // algorithm from: http://graphics.stanford.edu/~seander/bithacks.html#IntegerLog ++- static const unsigned int b[] = {0xAAAAAAAA, 0xCCCCCCCC, 0xF0F0F0F0, ++- 0xFF00FF00, 0xFFFF0000}; ++- ++- unsigned int res = (val & b[0]) != 0; ++- res |= ((val & b[4]) != 0) << 4; ++- res |= ((val & b[3]) != 0) << 3; ++- res |= ((val & b[2]) != 0) << 2; ++- res |= ((val & b[1]) != 0) << 1; ++- return res; +++static inline unsigned int log2_of_power_of_2(unsigned int val) +++{ +++ // algorithm from: http://graphics.stanford.edu/~seander/bithacks.html#IntegerLog +++ static const unsigned int b[] = { +++ 0xAAAAAAAA, 0xCCCCCCCC, 0xF0F0F0F0, 0xFF00FF00, 0xFFFF0000 +++ }; +++ +++ unsigned int res = (val & b[0]) != 0; +++ res |= ((val & b[4]) != 0) << 4; +++ res |= ((val & b[3]) != 0) << 3; +++ res |= ((val & b[2]) != 0) << 2; +++ res |= ((val & b[1]) != 0) << 1; +++ return res; ++ } ++ ++-static inline void ++-encodepolar_single_stage(unsigned char* frame_ptr, const unsigned char* temp_ptr, ++- const unsigned int num_branches, const unsigned int frame_half) +++static inline void encodepolar_single_stage(unsigned char* frame_ptr, +++ const unsigned char* temp_ptr, +++ const unsigned int num_branches, +++ const unsigned int frame_half) ++ { ++- unsigned int branch, bit; ++- for(branch = 0; branch < num_branches; ++branch){ ++- for(bit = 0; bit < frame_half; ++bit){ ++- *frame_ptr = *temp_ptr ^ *(temp_ptr + 1); ++- *(frame_ptr + frame_half) = *(temp_ptr + 1); ++- ++frame_ptr; ++- temp_ptr += 2; +++ unsigned int branch, bit; +++ for (branch = 0; branch < num_branches; ++branch) { +++ for (bit = 0; bit < frame_half; ++bit) { +++ *frame_ptr = *temp_ptr ^ *(temp_ptr + 1); +++ *(frame_ptr + frame_half) = *(temp_ptr + 1); +++ ++frame_ptr; +++ temp_ptr += 2; +++ } +++ frame_ptr += frame_half; ++ } ++- frame_ptr += frame_half; ++- } ++ } ++ ++ #ifdef LV_HAVE_GENERIC ++ ++-static inline void ++-volk_8u_x2_encodeframepolar_8u_generic(unsigned char* frame, unsigned char* temp, ++- unsigned int frame_size) +++static inline void volk_8u_x2_encodeframepolar_8u_generic(unsigned char* frame, +++ unsigned char* temp, +++ unsigned int frame_size) ++ { ++- unsigned int stage = log2_of_power_of_2(frame_size); ++- unsigned int frame_half = frame_size >> 1; ++- unsigned int num_branches = 1; ++- ++- while(stage){ ++- // encode stage ++- encodepolar_single_stage(frame, temp, num_branches, frame_half); ++- memcpy(temp, frame, sizeof(unsigned char) * frame_size); ++- ++- // update all the parameters. ++- num_branches = num_branches << 1; ++- frame_half = frame_half >> 1; ++- --stage; ++- } +++ unsigned int stage = log2_of_power_of_2(frame_size); +++ unsigned int frame_half = frame_size >> 1; +++ unsigned int num_branches = 1; +++ +++ while (stage) { +++ // encode stage +++ encodepolar_single_stage(frame, temp, num_branches, frame_half); +++ memcpy(temp, frame, sizeof(unsigned char) * frame_size); +++ +++ // update all the parameters. 
+++ num_branches = num_branches << 1; +++ frame_half = frame_half >> 1; +++ --stage; +++ } ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++ #ifdef LV_HAVE_SSSE3 ++ #include ++ ++-static inline void ++-volk_8u_x2_encodeframepolar_8u_u_ssse3(unsigned char* frame, unsigned char* temp, ++- unsigned int frame_size) +++static inline void volk_8u_x2_encodeframepolar_8u_u_ssse3(unsigned char* frame, +++ unsigned char* temp, +++ unsigned int frame_size) ++ { ++- const unsigned int po2 = log2_of_power_of_2(frame_size); ++- ++- unsigned int stage = po2; ++- unsigned char* frame_ptr = frame; ++- unsigned char* temp_ptr = temp; ++- ++- unsigned int frame_half = frame_size >> 1; ++- unsigned int num_branches = 1; ++- unsigned int branch; ++- unsigned int bit; ++- ++- // prepare constants ++- const __m128i mask_stage1 = _mm_set_epi8(0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF); ++- ++- // get some SIMD registers to play with. ++- __m128i r_frame0, r_temp0, shifted; ++- ++- { ++- __m128i r_frame1, r_temp1; ++- const __m128i shuffle_separate = _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15); ++- ++- while(stage > 4){ ++- frame_ptr = frame; ++- temp_ptr = temp; ++- ++- // for stage = 5 a branch has 32 elements. So upper stages are even bigger. ++- for(branch = 0; branch < num_branches; ++branch){ ++- for(bit = 0; bit < frame_half; bit += 16){ ++- r_temp0 = _mm_loadu_si128((__m128i *) temp_ptr); ++- temp_ptr += 16; ++- r_temp1 = _mm_loadu_si128((__m128i *) temp_ptr); ++- temp_ptr += 16; ++- ++- shifted = _mm_srli_si128(r_temp0, 1); ++- shifted = _mm_and_si128(shifted, mask_stage1); ++- r_temp0 = _mm_xor_si128(shifted, r_temp0); ++- r_temp0 = _mm_shuffle_epi8(r_temp0, shuffle_separate); ++- ++- shifted = _mm_srli_si128(r_temp1, 1); ++- shifted = _mm_and_si128(shifted, mask_stage1); ++- r_temp1 = _mm_xor_si128(shifted, r_temp1); ++- r_temp1 = _mm_shuffle_epi8(r_temp1, shuffle_separate); ++- ++- r_frame0 = _mm_unpacklo_epi64(r_temp0, r_temp1); ++- _mm_storeu_si128((__m128i*) frame_ptr, r_frame0); ++- ++- r_frame1 = _mm_unpackhi_epi64(r_temp0, r_temp1); ++- _mm_storeu_si128((__m128i*) (frame_ptr + frame_half), r_frame1); ++- frame_ptr += 16; +++ const unsigned int po2 = log2_of_power_of_2(frame_size); +++ +++ unsigned int stage = po2; +++ unsigned char* frame_ptr = frame; +++ unsigned char* temp_ptr = temp; +++ +++ unsigned int frame_half = frame_size >> 1; +++ unsigned int num_branches = 1; +++ unsigned int branch; +++ unsigned int bit; +++ +++ // prepare constants +++ const __m128i mask_stage1 = _mm_set_epi8(0x0, +++ 0xFF, +++ 0x0, +++ 0xFF, +++ 0x0, +++ 0xFF, +++ 0x0, +++ 0xFF, +++ 0x0, +++ 0xFF, +++ 0x0, +++ 0xFF, +++ 0x0, +++ 0xFF, +++ 0x0, +++ 0xFF); +++ +++ // get some SIMD registers to play with. +++ __m128i r_frame0, r_temp0, shifted; +++ +++ { +++ __m128i r_frame1, r_temp1; +++ const __m128i shuffle_separate = +++ _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15); +++ +++ while (stage > 4) { +++ frame_ptr = frame; +++ temp_ptr = temp; +++ +++ // for stage = 5 a branch has 32 elements. So upper stages are even bigger. 
+++ for (branch = 0; branch < num_branches; ++branch) { +++ for (bit = 0; bit < frame_half; bit += 16) { +++ r_temp0 = _mm_loadu_si128((__m128i*)temp_ptr); +++ temp_ptr += 16; +++ r_temp1 = _mm_loadu_si128((__m128i*)temp_ptr); +++ temp_ptr += 16; +++ +++ shifted = _mm_srli_si128(r_temp0, 1); +++ shifted = _mm_and_si128(shifted, mask_stage1); +++ r_temp0 = _mm_xor_si128(shifted, r_temp0); +++ r_temp0 = _mm_shuffle_epi8(r_temp0, shuffle_separate); +++ +++ shifted = _mm_srli_si128(r_temp1, 1); +++ shifted = _mm_and_si128(shifted, mask_stage1); +++ r_temp1 = _mm_xor_si128(shifted, r_temp1); +++ r_temp1 = _mm_shuffle_epi8(r_temp1, shuffle_separate); +++ +++ r_frame0 = _mm_unpacklo_epi64(r_temp0, r_temp1); +++ _mm_storeu_si128((__m128i*)frame_ptr, r_frame0); +++ +++ r_frame1 = _mm_unpackhi_epi64(r_temp0, r_temp1); +++ _mm_storeu_si128((__m128i*)(frame_ptr + frame_half), r_frame1); +++ frame_ptr += 16; +++ } +++ +++ frame_ptr += frame_half; +++ } +++ memcpy(temp, frame, sizeof(unsigned char) * frame_size); +++ +++ num_branches = num_branches << 1; +++ frame_half = frame_half >> 1; +++ stage--; ++ } ++- ++- frame_ptr += frame_half; ++- } ++- memcpy(temp, frame, sizeof(unsigned char) * frame_size); ++- ++- num_branches = num_branches << 1; ++- frame_half = frame_half >> 1; ++- stage--; ++ } ++- } ++ ++- // This last part requires at least 16-bit frames. ++- // Smaller frames are useless for SIMD optimization anyways. Just choose GENERIC! +++ // This last part requires at least 16-bit frames. +++ // Smaller frames are useless for SIMD optimization anyways. Just choose GENERIC! ++ ++- // reset pointers to correct positions. ++- frame_ptr = frame; ++- temp_ptr = temp; +++ // reset pointers to correct positions. +++ frame_ptr = frame; +++ temp_ptr = temp; ++ ++- // prefetch first chunk ++- __VOLK_PREFETCH(temp_ptr); ++- ++- const __m128i shuffle_stage4 = _mm_setr_epi8(0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15); ++- const __m128i mask_stage4 = _mm_set_epi8(0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF); ++- const __m128i mask_stage3 = _mm_set_epi8(0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF); ++- const __m128i mask_stage2 = _mm_set_epi8(0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF); ++- ++- for(branch = 0; branch < num_branches; ++branch){ ++- r_temp0 = _mm_loadu_si128((__m128i*) temp_ptr); ++- ++- // prefetch next chunk ++- temp_ptr += 16; +++ // prefetch first chunk ++ __VOLK_PREFETCH(temp_ptr); ++ ++- // shuffle once for bit-reversal. ++- r_temp0 = _mm_shuffle_epi8(r_temp0, shuffle_stage4); ++- ++- shifted = _mm_srli_si128(r_temp0, 8); ++- shifted = _mm_and_si128(shifted, mask_stage4); ++- r_frame0 = _mm_xor_si128(shifted, r_temp0); ++- ++- shifted = _mm_srli_si128(r_frame0, 4); ++- shifted = _mm_and_si128(shifted, mask_stage3); ++- r_frame0 = _mm_xor_si128(shifted, r_frame0); ++- ++- shifted = _mm_srli_si128(r_frame0, 2); ++- shifted = _mm_and_si128(shifted, mask_stage2); ++- r_frame0 = _mm_xor_si128(shifted, r_frame0); ++- ++- shifted = _mm_srli_si128(r_frame0, 1); ++- shifted = _mm_and_si128(shifted, mask_stage1); ++- r_frame0 = _mm_xor_si128(shifted, r_frame0); ++- ++- // store result of chunk. 
++- _mm_storeu_si128((__m128i*)frame_ptr, r_frame0); ++- frame_ptr += 16; ++- } +++ const __m128i shuffle_stage4 = +++ _mm_setr_epi8(0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15); +++ const __m128i mask_stage4 = _mm_set_epi8(0x0, +++ 0x0, +++ 0x0, +++ 0x0, +++ 0x0, +++ 0x0, +++ 0x0, +++ 0x0, +++ 0xFF, +++ 0xFF, +++ 0xFF, +++ 0xFF, +++ 0xFF, +++ 0xFF, +++ 0xFF, +++ 0xFF); +++ const __m128i mask_stage3 = _mm_set_epi8(0x0, +++ 0x0, +++ 0x0, +++ 0x0, +++ 0xFF, +++ 0xFF, +++ 0xFF, +++ 0xFF, +++ 0x0, +++ 0x0, +++ 0x0, +++ 0x0, +++ 0xFF, +++ 0xFF, +++ 0xFF, +++ 0xFF); +++ const __m128i mask_stage2 = _mm_set_epi8(0x0, +++ 0x0, +++ 0xFF, +++ 0xFF, +++ 0x0, +++ 0x0, +++ 0xFF, +++ 0xFF, +++ 0x0, +++ 0x0, +++ 0xFF, +++ 0xFF, +++ 0x0, +++ 0x0, +++ 0xFF, +++ 0xFF); +++ +++ for (branch = 0; branch < num_branches; ++branch) { +++ r_temp0 = _mm_loadu_si128((__m128i*)temp_ptr); +++ +++ // prefetch next chunk +++ temp_ptr += 16; +++ __VOLK_PREFETCH(temp_ptr); +++ +++ // shuffle once for bit-reversal. +++ r_temp0 = _mm_shuffle_epi8(r_temp0, shuffle_stage4); +++ +++ shifted = _mm_srli_si128(r_temp0, 8); +++ shifted = _mm_and_si128(shifted, mask_stage4); +++ r_frame0 = _mm_xor_si128(shifted, r_temp0); +++ +++ shifted = _mm_srli_si128(r_frame0, 4); +++ shifted = _mm_and_si128(shifted, mask_stage3); +++ r_frame0 = _mm_xor_si128(shifted, r_frame0); +++ +++ shifted = _mm_srli_si128(r_frame0, 2); +++ shifted = _mm_and_si128(shifted, mask_stage2); +++ r_frame0 = _mm_xor_si128(shifted, r_frame0); +++ +++ shifted = _mm_srli_si128(r_frame0, 1); +++ shifted = _mm_and_si128(shifted, mask_stage1); +++ r_frame0 = _mm_xor_si128(shifted, r_frame0); +++ +++ // store result of chunk. +++ _mm_storeu_si128((__m128i*)frame_ptr, r_frame0); +++ frame_ptr += 16; +++ } ++ } ++ ++ #endif /* LV_HAVE_SSSE3 */ ++@@ -201,154 +265,351 @@ volk_8u_x2_encodeframepolar_8u_u_ssse3(unsigned char* frame, unsigned char* temp ++ #ifdef LV_HAVE_AVX2 ++ #include ++ ++-static inline void ++-volk_8u_x2_encodeframepolar_8u_u_avx2(unsigned char* frame, unsigned char* temp, ++- unsigned int frame_size) +++static inline void volk_8u_x2_encodeframepolar_8u_u_avx2(unsigned char* frame, +++ unsigned char* temp, +++ unsigned int frame_size) ++ { ++- const unsigned int po2 = log2_of_power_of_2(frame_size); ++- ++- unsigned int stage = po2; ++- unsigned char* frame_ptr = frame; ++- unsigned char* temp_ptr = temp; ++- ++- unsigned int frame_half = frame_size >> 1; ++- unsigned int num_branches = 1; ++- unsigned int branch; ++- unsigned int bit; ++- ++- // prepare constants ++- const __m256i mask_stage1 = _mm256_set_epi8(0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, ++- 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF); ++- ++- const __m128i mask_stage0 = _mm_set_epi8(0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF); ++- // get some SIMD registers to play with. ++- __m256i r_frame0, r_temp0, shifted; ++- __m128i r_temp2, r_frame2, shifted2; ++- { ++- __m256i r_frame1, r_temp1; ++- __m128i r_frame3, r_temp3; ++- const __m256i shuffle_separate = _mm256_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15, ++- 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15); ++- const __m128i shuffle_separate128 = _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15); ++- ++- while(stage > 4){ ++- frame_ptr = frame; ++- temp_ptr = temp; ++- ++- // for stage = 5 a branch has 32 elements. So upper stages are even bigger. 
++- for(branch = 0; branch < num_branches; ++branch){ ++- for(bit = 0; bit < frame_half; bit += 32){ ++- if ((frame_half-bit)<32) //if only 16 bits remaining in frame, not 32 ++- { ++- r_temp2 = _mm_loadu_si128((__m128i *) temp_ptr); ++- temp_ptr += 16; ++- r_temp3 = _mm_loadu_si128((__m128i *) temp_ptr); ++- temp_ptr += 16; ++- ++- shifted2 = _mm_srli_si128(r_temp2, 1); ++- shifted2 = _mm_and_si128(shifted2, mask_stage0); ++- r_temp2 = _mm_xor_si128(shifted2, r_temp2); ++- r_temp2 = _mm_shuffle_epi8(r_temp2, shuffle_separate128); ++- ++- shifted2 = _mm_srli_si128(r_temp3, 1); ++- shifted2 = _mm_and_si128(shifted2, mask_stage0); ++- r_temp3 = _mm_xor_si128(shifted2, r_temp3); ++- r_temp3 = _mm_shuffle_epi8(r_temp3, shuffle_separate128); ++- ++- r_frame2 = _mm_unpacklo_epi64(r_temp2, r_temp3); ++- _mm_storeu_si128((__m128i*) frame_ptr, r_frame2); ++- ++- r_frame3 = _mm_unpackhi_epi64(r_temp2, r_temp3); ++- _mm_storeu_si128((__m128i*) (frame_ptr + frame_half), r_frame3); ++- frame_ptr += 16; ++- break; ++- } ++- r_temp0 = _mm256_loadu_si256((__m256i *) temp_ptr); ++- temp_ptr += 32; ++- r_temp1 = _mm256_loadu_si256((__m256i *) temp_ptr); ++- temp_ptr += 32; ++- ++- shifted = _mm256_srli_si256(r_temp0, 1);//operate on 128 bit lanes ++- shifted = _mm256_and_si256(shifted, mask_stage1); ++- r_temp0 = _mm256_xor_si256(shifted, r_temp0); ++- r_temp0 = _mm256_shuffle_epi8(r_temp0, shuffle_separate); ++- ++- shifted = _mm256_srli_si256(r_temp1, 1); ++- shifted = _mm256_and_si256(shifted, mask_stage1); ++- r_temp1 = _mm256_xor_si256(shifted, r_temp1); ++- r_temp1 = _mm256_shuffle_epi8(r_temp1, shuffle_separate); ++- ++- r_frame0 = _mm256_unpacklo_epi64(r_temp0, r_temp1); ++- r_temp1 = _mm256_unpackhi_epi64(r_temp0, r_temp1); ++- r_frame0 = _mm256_permute4x64_epi64(r_frame0, 0xd8); ++- r_frame1 = _mm256_permute4x64_epi64(r_temp1, 0xd8); ++- ++- _mm256_storeu_si256((__m256i*) frame_ptr, r_frame0); ++- ++- _mm256_storeu_si256((__m256i*) (frame_ptr + frame_half), r_frame1); ++- frame_ptr += 32; +++ const unsigned int po2 = log2_of_power_of_2(frame_size); +++ +++ unsigned int stage = po2; +++ unsigned char* frame_ptr = frame; +++ unsigned char* temp_ptr = temp; +++ +++ unsigned int frame_half = frame_size >> 1; +++ unsigned int num_branches = 1; +++ unsigned int branch; +++ unsigned int bit; +++ +++ // prepare constants +++ const __m256i mask_stage1 = _mm256_set_epi8(0x0, +++ 0xFF, +++ 0x0, +++ 0xFF, +++ 0x0, +++ 0xFF, +++ 0x0, +++ 0xFF, +++ 0x0, +++ 0xFF, +++ 0x0, +++ 0xFF, +++ 0x0, +++ 0xFF, +++ 0x0, +++ 0xFF, +++ 0x0, +++ 0xFF, +++ 0x0, +++ 0xFF, +++ 0x0, +++ 0xFF, +++ 0x0, +++ 0xFF, +++ 0x0, +++ 0xFF, +++ 0x0, +++ 0xFF, +++ 0x0, +++ 0xFF, +++ 0x0, +++ 0xFF); +++ +++ const __m128i mask_stage0 = _mm_set_epi8(0x0, +++ 0xFF, +++ 0x0, +++ 0xFF, +++ 0x0, +++ 0xFF, +++ 0x0, +++ 0xFF, +++ 0x0, +++ 0xFF, +++ 0x0, +++ 0xFF, +++ 0x0, +++ 0xFF, +++ 0x0, +++ 0xFF); +++ // get some SIMD registers to play with. 
+++ __m256i r_frame0, r_temp0, shifted; +++ __m128i r_temp2, r_frame2, shifted2; +++ { +++ __m256i r_frame1, r_temp1; +++ __m128i r_frame3, r_temp3; +++ const __m256i shuffle_separate = _mm256_setr_epi8(0, +++ 2, +++ 4, +++ 6, +++ 8, +++ 10, +++ 12, +++ 14, +++ 1, +++ 3, +++ 5, +++ 7, +++ 9, +++ 11, +++ 13, +++ 15, +++ 0, +++ 2, +++ 4, +++ 6, +++ 8, +++ 10, +++ 12, +++ 14, +++ 1, +++ 3, +++ 5, +++ 7, +++ 9, +++ 11, +++ 13, +++ 15); +++ const __m128i shuffle_separate128 = +++ _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15); +++ +++ while (stage > 4) { +++ frame_ptr = frame; +++ temp_ptr = temp; +++ +++ // for stage = 5 a branch has 32 elements. So upper stages are even bigger. +++ for (branch = 0; branch < num_branches; ++branch) { +++ for (bit = 0; bit < frame_half; bit += 32) { +++ if ((frame_half - bit) < +++ 32) // if only 16 bits remaining in frame, not 32 +++ { +++ r_temp2 = _mm_loadu_si128((__m128i*)temp_ptr); +++ temp_ptr += 16; +++ r_temp3 = _mm_loadu_si128((__m128i*)temp_ptr); +++ temp_ptr += 16; +++ +++ shifted2 = _mm_srli_si128(r_temp2, 1); +++ shifted2 = _mm_and_si128(shifted2, mask_stage0); +++ r_temp2 = _mm_xor_si128(shifted2, r_temp2); +++ r_temp2 = _mm_shuffle_epi8(r_temp2, shuffle_separate128); +++ +++ shifted2 = _mm_srli_si128(r_temp3, 1); +++ shifted2 = _mm_and_si128(shifted2, mask_stage0); +++ r_temp3 = _mm_xor_si128(shifted2, r_temp3); +++ r_temp3 = _mm_shuffle_epi8(r_temp3, shuffle_separate128); +++ +++ r_frame2 = _mm_unpacklo_epi64(r_temp2, r_temp3); +++ _mm_storeu_si128((__m128i*)frame_ptr, r_frame2); +++ +++ r_frame3 = _mm_unpackhi_epi64(r_temp2, r_temp3); +++ _mm_storeu_si128((__m128i*)(frame_ptr + frame_half), r_frame3); +++ frame_ptr += 16; +++ break; +++ } +++ r_temp0 = _mm256_loadu_si256((__m256i*)temp_ptr); +++ temp_ptr += 32; +++ r_temp1 = _mm256_loadu_si256((__m256i*)temp_ptr); +++ temp_ptr += 32; +++ +++ shifted = _mm256_srli_si256(r_temp0, 1); // operate on 128 bit lanes +++ shifted = _mm256_and_si256(shifted, mask_stage1); +++ r_temp0 = _mm256_xor_si256(shifted, r_temp0); +++ r_temp0 = _mm256_shuffle_epi8(r_temp0, shuffle_separate); +++ +++ shifted = _mm256_srli_si256(r_temp1, 1); +++ shifted = _mm256_and_si256(shifted, mask_stage1); +++ r_temp1 = _mm256_xor_si256(shifted, r_temp1); +++ r_temp1 = _mm256_shuffle_epi8(r_temp1, shuffle_separate); +++ +++ r_frame0 = _mm256_unpacklo_epi64(r_temp0, r_temp1); +++ r_temp1 = _mm256_unpackhi_epi64(r_temp0, r_temp1); +++ r_frame0 = _mm256_permute4x64_epi64(r_frame0, 0xd8); +++ r_frame1 = _mm256_permute4x64_epi64(r_temp1, 0xd8); +++ +++ _mm256_storeu_si256((__m256i*)frame_ptr, r_frame0); +++ +++ _mm256_storeu_si256((__m256i*)(frame_ptr + frame_half), r_frame1); +++ frame_ptr += 32; +++ } +++ +++ frame_ptr += frame_half; +++ } +++ memcpy(temp, frame, sizeof(unsigned char) * frame_size); +++ +++ num_branches = num_branches << 1; +++ frame_half = frame_half >> 1; +++ stage--; ++ } ++- ++- frame_ptr += frame_half; ++- } ++- memcpy(temp, frame, sizeof(unsigned char) * frame_size); ++- ++- num_branches = num_branches << 1; ++- frame_half = frame_half >> 1; ++- stage--; ++ } ++- } ++- ++- // This last part requires at least 32-bit frames. ++- // Smaller frames are useless for SIMD optimization anyways. Just choose GENERIC! ++- ++- // reset pointers to correct positions. ++- frame_ptr = frame; ++- temp_ptr = temp; ++ ++- // prefetch first chunk ++- __VOLK_PREFETCH(temp_ptr); +++ // This last part requires at least 32-bit frames. +++ // Smaller frames are useless for SIMD optimization anyways. 
Just choose GENERIC! ++ ++- const __m256i shuffle_stage4 = _mm256_setr_epi8(0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15, ++- 0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15); ++- const __m256i mask_stage4 = _mm256_set_epi8(0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, ++- 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF); ++- const __m256i mask_stage3 = _mm256_set_epi8(0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF, ++- 0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF); ++- const __m256i mask_stage2 = _mm256_set_epi8(0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF, ++- 0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF); +++ // reset pointers to correct positions. +++ frame_ptr = frame; +++ temp_ptr = temp; ++ ++- for(branch = 0; branch < num_branches/2; ++branch){ ++- r_temp0 = _mm256_loadu_si256((__m256i*) temp_ptr); ++- ++- // prefetch next chunk ++- temp_ptr += 32; +++ // prefetch first chunk ++ __VOLK_PREFETCH(temp_ptr); ++ ++- // shuffle once for bit-reversal. ++- r_temp0 = _mm256_shuffle_epi8(r_temp0, shuffle_stage4); ++- ++- shifted = _mm256_srli_si256(r_temp0, 8); //128 bit lanes ++- shifted = _mm256_and_si256(shifted, mask_stage4); ++- r_frame0 = _mm256_xor_si256(shifted, r_temp0); ++- ++- ++- shifted = _mm256_srli_si256(r_frame0, 4); ++- shifted = _mm256_and_si256(shifted, mask_stage3); ++- r_frame0 = _mm256_xor_si256(shifted, r_frame0); ++- ++- shifted = _mm256_srli_si256(r_frame0, 2); ++- shifted = _mm256_and_si256(shifted, mask_stage2); ++- r_frame0 = _mm256_xor_si256(shifted, r_frame0); ++- ++- shifted = _mm256_srli_si256(r_frame0, 1); ++- shifted = _mm256_and_si256(shifted, mask_stage1); ++- r_frame0 = _mm256_xor_si256(shifted, r_frame0); ++- ++- // store result of chunk. 
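The AVX2 paths differ from the SSSE3 ones mostly because byte shifts, shuffles and unpacks act independently on each 128-bit lane of a __m256i (hence the "128 bit lanes" comments); the _mm256_permute2x128_si256 calls with immediates 0x20 and 0x31 are what stitch the two lanes back into the right order. A rough scalar model of those two permutes, given only as an illustration, is:

/* Rough model of the two lane-recombining permutes used in the AVX2 paths
 * (illustrative only): a 256-bit register is treated as two 16-byte lanes. */
#include <string.h>

typedef struct { unsigned char lo[16]; unsigned char hi[16]; } v256_model;

/* _mm256_permute2x128_si256(a, b, 0x20): low lane of a, then low lane of b */
static v256_model permute_0x20(v256_model a, v256_model b)
{
    v256_model r;
    memcpy(r.lo, a.lo, 16);
    memcpy(r.hi, b.lo, 16);
    return r;
}

/* _mm256_permute2x128_si256(a, b, 0x31): high lane of a, then high lane of b */
static v256_model permute_0x31(v256_model a, v256_model b)
{
    v256_model r;
    memcpy(r.lo, a.hi, 16);
    memcpy(r.hi, b.hi, 16);
    return r;
}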
++- _mm256_storeu_si256((__m256i*)frame_ptr, r_frame0); ++- frame_ptr += 32; ++- } +++ const __m256i shuffle_stage4 = _mm256_setr_epi8(0, +++ 8, +++ 4, +++ 12, +++ 2, +++ 10, +++ 6, +++ 14, +++ 1, +++ 9, +++ 5, +++ 13, +++ 3, +++ 11, +++ 7, +++ 15, +++ 0, +++ 8, +++ 4, +++ 12, +++ 2, +++ 10, +++ 6, +++ 14, +++ 1, +++ 9, +++ 5, +++ 13, +++ 3, +++ 11, +++ 7, +++ 15); +++ const __m256i mask_stage4 = _mm256_set_epi8(0x0, +++ 0x0, +++ 0x0, +++ 0x0, +++ 0x0, +++ 0x0, +++ 0x0, +++ 0x0, +++ 0xFF, +++ 0xFF, +++ 0xFF, +++ 0xFF, +++ 0xFF, +++ 0xFF, +++ 0xFF, +++ 0xFF, +++ 0x0, +++ 0x0, +++ 0x0, +++ 0x0, +++ 0x0, +++ 0x0, +++ 0x0, +++ 0x0, +++ 0xFF, +++ 0xFF, +++ 0xFF, +++ 0xFF, +++ 0xFF, +++ 0xFF, +++ 0xFF, +++ 0xFF); +++ const __m256i mask_stage3 = _mm256_set_epi8(0x0, +++ 0x0, +++ 0x0, +++ 0x0, +++ 0xFF, +++ 0xFF, +++ 0xFF, +++ 0xFF, +++ 0x0, +++ 0x0, +++ 0x0, +++ 0x0, +++ 0xFF, +++ 0xFF, +++ 0xFF, +++ 0xFF, +++ 0x0, +++ 0x0, +++ 0x0, +++ 0x0, +++ 0xFF, +++ 0xFF, +++ 0xFF, +++ 0xFF, +++ 0x0, +++ 0x0, +++ 0x0, +++ 0x0, +++ 0xFF, +++ 0xFF, +++ 0xFF, +++ 0xFF); +++ const __m256i mask_stage2 = _mm256_set_epi8(0x0, +++ 0x0, +++ 0xFF, +++ 0xFF, +++ 0x0, +++ 0x0, +++ 0xFF, +++ 0xFF, +++ 0x0, +++ 0x0, +++ 0xFF, +++ 0xFF, +++ 0x0, +++ 0x0, +++ 0xFF, +++ 0xFF, +++ 0x0, +++ 0x0, +++ 0xFF, +++ 0xFF, +++ 0x0, +++ 0x0, +++ 0xFF, +++ 0xFF, +++ 0x0, +++ 0x0, +++ 0xFF, +++ 0xFF, +++ 0x0, +++ 0x0, +++ 0xFF, +++ 0xFF); +++ +++ for (branch = 0; branch < num_branches / 2; ++branch) { +++ r_temp0 = _mm256_loadu_si256((__m256i*)temp_ptr); +++ +++ // prefetch next chunk +++ temp_ptr += 32; +++ __VOLK_PREFETCH(temp_ptr); +++ +++ // shuffle once for bit-reversal. +++ r_temp0 = _mm256_shuffle_epi8(r_temp0, shuffle_stage4); +++ +++ shifted = _mm256_srli_si256(r_temp0, 8); // 128 bit lanes +++ shifted = _mm256_and_si256(shifted, mask_stage4); +++ r_frame0 = _mm256_xor_si256(shifted, r_temp0); +++ +++ +++ shifted = _mm256_srli_si256(r_frame0, 4); +++ shifted = _mm256_and_si256(shifted, mask_stage3); +++ r_frame0 = _mm256_xor_si256(shifted, r_frame0); +++ +++ shifted = _mm256_srli_si256(r_frame0, 2); +++ shifted = _mm256_and_si256(shifted, mask_stage2); +++ r_frame0 = _mm256_xor_si256(shifted, r_frame0); +++ +++ shifted = _mm256_srli_si256(r_frame0, 1); +++ shifted = _mm256_and_si256(shifted, mask_stage1); +++ r_frame0 = _mm256_xor_si256(shifted, r_frame0); +++ +++ // store result of chunk. +++ _mm256_storeu_si256((__m256i*)frame_ptr, r_frame0); +++ frame_ptr += 32; +++ } ++ } ++ #endif /* LV_HAVE_AVX2 */ ++ ++@@ -360,272 +621,530 @@ volk_8u_x2_encodeframepolar_8u_u_avx2(unsigned char* frame, unsigned char* temp, ++ #ifdef LV_HAVE_SSSE3 ++ #include ++ ++-static inline void ++-volk_8u_x2_encodeframepolar_8u_a_ssse3(unsigned char* frame, unsigned char* temp, ++- unsigned int frame_size) +++static inline void volk_8u_x2_encodeframepolar_8u_a_ssse3(unsigned char* frame, +++ unsigned char* temp, +++ unsigned int frame_size) ++ { ++- const unsigned int po2 = log2_of_power_of_2(frame_size); ++- ++- unsigned int stage = po2; ++- unsigned char* frame_ptr = frame; ++- unsigned char* temp_ptr = temp; ++- ++- unsigned int frame_half = frame_size >> 1; ++- unsigned int num_branches = 1; ++- unsigned int branch; ++- unsigned int bit; ++- ++- // prepare constants ++- const __m128i mask_stage1 = _mm_set_epi8(0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF); ++- ++- // get some SIMD registers to play with. 
++- __m128i r_frame0, r_temp0, shifted; ++- ++- { ++- __m128i r_frame1, r_temp1; ++- const __m128i shuffle_separate = _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15); ++- ++- while(stage > 4){ ++- frame_ptr = frame; ++- temp_ptr = temp; ++- ++- // for stage = 5 a branch has 32 elements. So upper stages are even bigger. ++- for(branch = 0; branch < num_branches; ++branch){ ++- for(bit = 0; bit < frame_half; bit += 16){ ++- r_temp0 = _mm_load_si128((__m128i *) temp_ptr); ++- temp_ptr += 16; ++- r_temp1 = _mm_load_si128((__m128i *) temp_ptr); ++- temp_ptr += 16; ++- ++- shifted = _mm_srli_si128(r_temp0, 1); ++- shifted = _mm_and_si128(shifted, mask_stage1); ++- r_temp0 = _mm_xor_si128(shifted, r_temp0); ++- r_temp0 = _mm_shuffle_epi8(r_temp0, shuffle_separate); ++- ++- shifted = _mm_srli_si128(r_temp1, 1); ++- shifted = _mm_and_si128(shifted, mask_stage1); ++- r_temp1 = _mm_xor_si128(shifted, r_temp1); ++- r_temp1 = _mm_shuffle_epi8(r_temp1, shuffle_separate); ++- ++- r_frame0 = _mm_unpacklo_epi64(r_temp0, r_temp1); ++- _mm_store_si128((__m128i*) frame_ptr, r_frame0); ++- ++- r_frame1 = _mm_unpackhi_epi64(r_temp0, r_temp1); ++- _mm_store_si128((__m128i*) (frame_ptr + frame_half), r_frame1); ++- frame_ptr += 16; +++ const unsigned int po2 = log2_of_power_of_2(frame_size); +++ +++ unsigned int stage = po2; +++ unsigned char* frame_ptr = frame; +++ unsigned char* temp_ptr = temp; +++ +++ unsigned int frame_half = frame_size >> 1; +++ unsigned int num_branches = 1; +++ unsigned int branch; +++ unsigned int bit; +++ +++ // prepare constants +++ const __m128i mask_stage1 = _mm_set_epi8(0x0, +++ 0xFF, +++ 0x0, +++ 0xFF, +++ 0x0, +++ 0xFF, +++ 0x0, +++ 0xFF, +++ 0x0, +++ 0xFF, +++ 0x0, +++ 0xFF, +++ 0x0, +++ 0xFF, +++ 0x0, +++ 0xFF); +++ +++ // get some SIMD registers to play with. +++ __m128i r_frame0, r_temp0, shifted; +++ +++ { +++ __m128i r_frame1, r_temp1; +++ const __m128i shuffle_separate = +++ _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15); +++ +++ while (stage > 4) { +++ frame_ptr = frame; +++ temp_ptr = temp; +++ +++ // for stage = 5 a branch has 32 elements. So upper stages are even bigger. +++ for (branch = 0; branch < num_branches; ++branch) { +++ for (bit = 0; bit < frame_half; bit += 16) { +++ r_temp0 = _mm_load_si128((__m128i*)temp_ptr); +++ temp_ptr += 16; +++ r_temp1 = _mm_load_si128((__m128i*)temp_ptr); +++ temp_ptr += 16; +++ +++ shifted = _mm_srli_si128(r_temp0, 1); +++ shifted = _mm_and_si128(shifted, mask_stage1); +++ r_temp0 = _mm_xor_si128(shifted, r_temp0); +++ r_temp0 = _mm_shuffle_epi8(r_temp0, shuffle_separate); +++ +++ shifted = _mm_srli_si128(r_temp1, 1); +++ shifted = _mm_and_si128(shifted, mask_stage1); +++ r_temp1 = _mm_xor_si128(shifted, r_temp1); +++ r_temp1 = _mm_shuffle_epi8(r_temp1, shuffle_separate); +++ +++ r_frame0 = _mm_unpacklo_epi64(r_temp0, r_temp1); +++ _mm_store_si128((__m128i*)frame_ptr, r_frame0); +++ +++ r_frame1 = _mm_unpackhi_epi64(r_temp0, r_temp1); +++ _mm_store_si128((__m128i*)(frame_ptr + frame_half), r_frame1); +++ frame_ptr += 16; +++ } +++ +++ frame_ptr += frame_half; +++ } +++ memcpy(temp, frame, sizeof(unsigned char) * frame_size); +++ +++ num_branches = num_branches << 1; +++ frame_half = frame_half >> 1; +++ stage--; ++ } ++- ++- frame_ptr += frame_half; ++- } ++- memcpy(temp, frame, sizeof(unsigned char) * frame_size); ++- ++- num_branches = num_branches << 1; ++- frame_half = frame_half >> 1; ++- stage--; ++ } ++- } ++- ++- // This last part requires at least 16-bit frames. 
++- // Smaller frames are useless for SIMD optimization anyways. Just choose GENERIC! ++- ++- // reset pointers to correct positions. ++- frame_ptr = frame; ++- temp_ptr = temp; ++ ++- // prefetch first chunk ++- __VOLK_PREFETCH(temp_ptr); +++ // This last part requires at least 16-bit frames. +++ // Smaller frames are useless for SIMD optimization anyways. Just choose GENERIC! ++ ++- const __m128i shuffle_stage4 = _mm_setr_epi8(0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15); ++- const __m128i mask_stage4 = _mm_set_epi8(0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF); ++- const __m128i mask_stage3 = _mm_set_epi8(0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF); ++- const __m128i mask_stage2 = _mm_set_epi8(0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF); +++ // reset pointers to correct positions. +++ frame_ptr = frame; +++ temp_ptr = temp; ++ ++- for(branch = 0; branch < num_branches; ++branch){ ++- r_temp0 = _mm_load_si128((__m128i*) temp_ptr); ++- ++- // prefetch next chunk ++- temp_ptr += 16; +++ // prefetch first chunk ++ __VOLK_PREFETCH(temp_ptr); ++ ++- // shuffle once for bit-reversal. ++- r_temp0 = _mm_shuffle_epi8(r_temp0, shuffle_stage4); ++- ++- shifted = _mm_srli_si128(r_temp0, 8); ++- shifted = _mm_and_si128(shifted, mask_stage4); ++- r_frame0 = _mm_xor_si128(shifted, r_temp0); ++- ++- shifted = _mm_srli_si128(r_frame0, 4); ++- shifted = _mm_and_si128(shifted, mask_stage3); ++- r_frame0 = _mm_xor_si128(shifted, r_frame0); ++- ++- shifted = _mm_srli_si128(r_frame0, 2); ++- shifted = _mm_and_si128(shifted, mask_stage2); ++- r_frame0 = _mm_xor_si128(shifted, r_frame0); ++- ++- shifted = _mm_srli_si128(r_frame0, 1); ++- shifted = _mm_and_si128(shifted, mask_stage1); ++- r_frame0 = _mm_xor_si128(shifted, r_frame0); ++- ++- // store result of chunk. ++- _mm_store_si128((__m128i*)frame_ptr, r_frame0); ++- frame_ptr += 16; ++- } +++ const __m128i shuffle_stage4 = +++ _mm_setr_epi8(0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15); +++ const __m128i mask_stage4 = _mm_set_epi8(0x0, +++ 0x0, +++ 0x0, +++ 0x0, +++ 0x0, +++ 0x0, +++ 0x0, +++ 0x0, +++ 0xFF, +++ 0xFF, +++ 0xFF, +++ 0xFF, +++ 0xFF, +++ 0xFF, +++ 0xFF, +++ 0xFF); +++ const __m128i mask_stage3 = _mm_set_epi8(0x0, +++ 0x0, +++ 0x0, +++ 0x0, +++ 0xFF, +++ 0xFF, +++ 0xFF, +++ 0xFF, +++ 0x0, +++ 0x0, +++ 0x0, +++ 0x0, +++ 0xFF, +++ 0xFF, +++ 0xFF, +++ 0xFF); +++ const __m128i mask_stage2 = _mm_set_epi8(0x0, +++ 0x0, +++ 0xFF, +++ 0xFF, +++ 0x0, +++ 0x0, +++ 0xFF, +++ 0xFF, +++ 0x0, +++ 0x0, +++ 0xFF, +++ 0xFF, +++ 0x0, +++ 0x0, +++ 0xFF, +++ 0xFF); +++ +++ for (branch = 0; branch < num_branches; ++branch) { +++ r_temp0 = _mm_load_si128((__m128i*)temp_ptr); +++ +++ // prefetch next chunk +++ temp_ptr += 16; +++ __VOLK_PREFETCH(temp_ptr); +++ +++ // shuffle once for bit-reversal. 
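The tail that follows handles the last four stages within each 16-byte chunk: a single byte shuffle applies the bit-reversal permutation, after which every remaining stage reduces to XOR-ing the upper half of each 2d-byte block into its lower half. A scalar sketch of that tail, assuming the same shuffle table as the kernels, could be:

/* Scalar sketch of the final four stages on one 16-byte chunk (illustrative). */
static void encode_tail16_scalar(unsigned char* chunk /* 16 bytes, in place */)
{
    static const unsigned char bitrev[16] = { 0, 8, 4, 12, 2, 10, 6, 14,
                                              1, 9, 5, 13, 3, 11, 7, 15 };
    unsigned char x[16];
    for (int i = 0; i < 16; ++i)
        x[i] = chunk[bitrev[i]];          /* shuffle once for bit-reversal */

    for (int d = 8; d >= 1; d >>= 1)      /* stage widths 8, 4, 2, 1       */
        for (int i = 0; i < 16; ++i)
            if ((i & d) == 0)             /* lower half of each 2d block   */
                x[i] ^= x[i + d];

    for (int i = 0; i < 16; ++i)
        chunk[i] = x[i];
}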
+++ r_temp0 = _mm_shuffle_epi8(r_temp0, shuffle_stage4); +++ +++ shifted = _mm_srli_si128(r_temp0, 8); +++ shifted = _mm_and_si128(shifted, mask_stage4); +++ r_frame0 = _mm_xor_si128(shifted, r_temp0); +++ +++ shifted = _mm_srli_si128(r_frame0, 4); +++ shifted = _mm_and_si128(shifted, mask_stage3); +++ r_frame0 = _mm_xor_si128(shifted, r_frame0); +++ +++ shifted = _mm_srli_si128(r_frame0, 2); +++ shifted = _mm_and_si128(shifted, mask_stage2); +++ r_frame0 = _mm_xor_si128(shifted, r_frame0); +++ +++ shifted = _mm_srli_si128(r_frame0, 1); +++ shifted = _mm_and_si128(shifted, mask_stage1); +++ r_frame0 = _mm_xor_si128(shifted, r_frame0); +++ +++ // store result of chunk. +++ _mm_store_si128((__m128i*)frame_ptr, r_frame0); +++ frame_ptr += 16; +++ } ++ } ++ #endif /* LV_HAVE_SSSE3 */ ++ ++ #ifdef LV_HAVE_AVX2 ++ #include ++ ++-static inline void ++-volk_8u_x2_encodeframepolar_8u_a_avx2(unsigned char* frame, unsigned char* temp, ++- unsigned int frame_size) +++static inline void volk_8u_x2_encodeframepolar_8u_a_avx2(unsigned char* frame, +++ unsigned char* temp, +++ unsigned int frame_size) ++ { ++- const unsigned int po2 = log2_of_power_of_2(frame_size); ++- ++- unsigned int stage = po2; ++- unsigned char* frame_ptr = frame; ++- unsigned char* temp_ptr = temp; ++- ++- unsigned int frame_half = frame_size >> 1; ++- unsigned int num_branches = 1; ++- unsigned int branch; ++- unsigned int bit; ++- ++- // prepare constants ++- const __m256i mask_stage1 = _mm256_set_epi8(0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, ++- 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF); ++- ++- const __m128i mask_stage0 = _mm_set_epi8(0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF); ++- // get some SIMD registers to play with. ++- __m256i r_frame0, r_temp0, shifted; ++- __m128i r_temp2, r_frame2, shifted2; ++- { ++- __m256i r_frame1, r_temp1; ++- __m128i r_frame3, r_temp3; ++- const __m256i shuffle_separate = _mm256_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15, ++- 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15); ++- const __m128i shuffle_separate128 = _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15); ++- ++- while(stage > 4){ ++- frame_ptr = frame; ++- temp_ptr = temp; ++- ++- // for stage = 5 a branch has 32 elements. So upper stages are even bigger. 
++- for(branch = 0; branch < num_branches; ++branch){ ++- for(bit = 0; bit < frame_half; bit += 32){ ++- if ((frame_half-bit)<32) //if only 16 bits remaining in frame, not 32 ++- { ++- r_temp2 = _mm_load_si128((__m128i *) temp_ptr); ++- temp_ptr += 16; ++- r_temp3 = _mm_load_si128((__m128i *) temp_ptr); ++- temp_ptr += 16; ++- ++- shifted2 = _mm_srli_si128(r_temp2, 1); ++- shifted2 = _mm_and_si128(shifted2, mask_stage0); ++- r_temp2 = _mm_xor_si128(shifted2, r_temp2); ++- r_temp2 = _mm_shuffle_epi8(r_temp2, shuffle_separate128); ++- ++- shifted2 = _mm_srli_si128(r_temp3, 1); ++- shifted2 = _mm_and_si128(shifted2, mask_stage0); ++- r_temp3 = _mm_xor_si128(shifted2, r_temp3); ++- r_temp3 = _mm_shuffle_epi8(r_temp3, shuffle_separate128); ++- ++- r_frame2 = _mm_unpacklo_epi64(r_temp2, r_temp3); ++- _mm_store_si128((__m128i*) frame_ptr, r_frame2); ++- ++- r_frame3 = _mm_unpackhi_epi64(r_temp2, r_temp3); ++- _mm_store_si128((__m128i*) (frame_ptr + frame_half), r_frame3); ++- frame_ptr += 16; ++- break; ++- } ++- r_temp0 = _mm256_load_si256((__m256i *) temp_ptr); ++- temp_ptr += 32; ++- r_temp1 = _mm256_load_si256((__m256i *) temp_ptr); ++- temp_ptr += 32; ++- ++- shifted = _mm256_srli_si256(r_temp0, 1);//operate on 128 bit lanes ++- shifted = _mm256_and_si256(shifted, mask_stage1); ++- r_temp0 = _mm256_xor_si256(shifted, r_temp0); ++- r_temp0 = _mm256_shuffle_epi8(r_temp0, shuffle_separate); ++- ++- shifted = _mm256_srli_si256(r_temp1, 1); ++- shifted = _mm256_and_si256(shifted, mask_stage1); ++- r_temp1 = _mm256_xor_si256(shifted, r_temp1); ++- r_temp1 = _mm256_shuffle_epi8(r_temp1, shuffle_separate); ++- ++- r_frame0 = _mm256_unpacklo_epi64(r_temp0, r_temp1); ++- r_temp1 = _mm256_unpackhi_epi64(r_temp0, r_temp1); ++- r_frame0 = _mm256_permute4x64_epi64(r_frame0, 0xd8); ++- r_frame1 = _mm256_permute4x64_epi64(r_temp1, 0xd8); ++- ++- _mm256_store_si256((__m256i*) frame_ptr, r_frame0); ++- ++- _mm256_store_si256((__m256i*) (frame_ptr + frame_half), r_frame1); ++- frame_ptr += 32; +++ const unsigned int po2 = log2_of_power_of_2(frame_size); +++ +++ unsigned int stage = po2; +++ unsigned char* frame_ptr = frame; +++ unsigned char* temp_ptr = temp; +++ +++ unsigned int frame_half = frame_size >> 1; +++ unsigned int num_branches = 1; +++ unsigned int branch; +++ unsigned int bit; +++ +++ // prepare constants +++ const __m256i mask_stage1 = _mm256_set_epi8(0x0, +++ 0xFF, +++ 0x0, +++ 0xFF, +++ 0x0, +++ 0xFF, +++ 0x0, +++ 0xFF, +++ 0x0, +++ 0xFF, +++ 0x0, +++ 0xFF, +++ 0x0, +++ 0xFF, +++ 0x0, +++ 0xFF, +++ 0x0, +++ 0xFF, +++ 0x0, +++ 0xFF, +++ 0x0, +++ 0xFF, +++ 0x0, +++ 0xFF, +++ 0x0, +++ 0xFF, +++ 0x0, +++ 0xFF, +++ 0x0, +++ 0xFF, +++ 0x0, +++ 0xFF); +++ +++ const __m128i mask_stage0 = _mm_set_epi8(0x0, +++ 0xFF, +++ 0x0, +++ 0xFF, +++ 0x0, +++ 0xFF, +++ 0x0, +++ 0xFF, +++ 0x0, +++ 0xFF, +++ 0x0, +++ 0xFF, +++ 0x0, +++ 0xFF, +++ 0x0, +++ 0xFF); +++ // get some SIMD registers to play with. 
+++ __m256i r_frame0, r_temp0, shifted; +++ __m128i r_temp2, r_frame2, shifted2; +++ { +++ __m256i r_frame1, r_temp1; +++ __m128i r_frame3, r_temp3; +++ const __m256i shuffle_separate = _mm256_setr_epi8(0, +++ 2, +++ 4, +++ 6, +++ 8, +++ 10, +++ 12, +++ 14, +++ 1, +++ 3, +++ 5, +++ 7, +++ 9, +++ 11, +++ 13, +++ 15, +++ 0, +++ 2, +++ 4, +++ 6, +++ 8, +++ 10, +++ 12, +++ 14, +++ 1, +++ 3, +++ 5, +++ 7, +++ 9, +++ 11, +++ 13, +++ 15); +++ const __m128i shuffle_separate128 = +++ _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15); +++ +++ while (stage > 4) { +++ frame_ptr = frame; +++ temp_ptr = temp; +++ +++ // for stage = 5 a branch has 32 elements. So upper stages are even bigger. +++ for (branch = 0; branch < num_branches; ++branch) { +++ for (bit = 0; bit < frame_half; bit += 32) { +++ if ((frame_half - bit) < +++ 32) // if only 16 bits remaining in frame, not 32 +++ { +++ r_temp2 = _mm_load_si128((__m128i*)temp_ptr); +++ temp_ptr += 16; +++ r_temp3 = _mm_load_si128((__m128i*)temp_ptr); +++ temp_ptr += 16; +++ +++ shifted2 = _mm_srli_si128(r_temp2, 1); +++ shifted2 = _mm_and_si128(shifted2, mask_stage0); +++ r_temp2 = _mm_xor_si128(shifted2, r_temp2); +++ r_temp2 = _mm_shuffle_epi8(r_temp2, shuffle_separate128); +++ +++ shifted2 = _mm_srli_si128(r_temp3, 1); +++ shifted2 = _mm_and_si128(shifted2, mask_stage0); +++ r_temp3 = _mm_xor_si128(shifted2, r_temp3); +++ r_temp3 = _mm_shuffle_epi8(r_temp3, shuffle_separate128); +++ +++ r_frame2 = _mm_unpacklo_epi64(r_temp2, r_temp3); +++ _mm_store_si128((__m128i*)frame_ptr, r_frame2); +++ +++ r_frame3 = _mm_unpackhi_epi64(r_temp2, r_temp3); +++ _mm_store_si128((__m128i*)(frame_ptr + frame_half), r_frame3); +++ frame_ptr += 16; +++ break; +++ } +++ r_temp0 = _mm256_load_si256((__m256i*)temp_ptr); +++ temp_ptr += 32; +++ r_temp1 = _mm256_load_si256((__m256i*)temp_ptr); +++ temp_ptr += 32; +++ +++ shifted = _mm256_srli_si256(r_temp0, 1); // operate on 128 bit lanes +++ shifted = _mm256_and_si256(shifted, mask_stage1); +++ r_temp0 = _mm256_xor_si256(shifted, r_temp0); +++ r_temp0 = _mm256_shuffle_epi8(r_temp0, shuffle_separate); +++ +++ shifted = _mm256_srli_si256(r_temp1, 1); +++ shifted = _mm256_and_si256(shifted, mask_stage1); +++ r_temp1 = _mm256_xor_si256(shifted, r_temp1); +++ r_temp1 = _mm256_shuffle_epi8(r_temp1, shuffle_separate); +++ +++ r_frame0 = _mm256_unpacklo_epi64(r_temp0, r_temp1); +++ r_temp1 = _mm256_unpackhi_epi64(r_temp0, r_temp1); +++ r_frame0 = _mm256_permute4x64_epi64(r_frame0, 0xd8); +++ r_frame1 = _mm256_permute4x64_epi64(r_temp1, 0xd8); +++ +++ _mm256_store_si256((__m256i*)frame_ptr, r_frame0); +++ +++ _mm256_store_si256((__m256i*)(frame_ptr + frame_half), r_frame1); +++ frame_ptr += 32; +++ } +++ +++ frame_ptr += frame_half; +++ } +++ memcpy(temp, frame, sizeof(unsigned char) * frame_size); +++ +++ num_branches = num_branches << 1; +++ frame_half = frame_half >> 1; +++ stage--; ++ } ++- ++- frame_ptr += frame_half; ++- } ++- memcpy(temp, frame, sizeof(unsigned char) * frame_size); ++- ++- num_branches = num_branches << 1; ++- frame_half = frame_half >> 1; ++- stage--; ++ } ++- } ++- ++- // This last part requires at least 32-bit frames. ++- // Smaller frames are useless for SIMD optimization anyways. Just choose GENERIC! ++ ++- // reset pointers to correct positions. ++- frame_ptr = frame; ++- temp_ptr = temp; +++ // This last part requires at least 32-bit frames. +++ // Smaller frames are useless for SIMD optimization anyways. Just choose GENERIC! ++ ++- // prefetch first chunk. 
++- __VOLK_PREFETCH(temp_ptr); +++ // reset pointers to correct positions. +++ frame_ptr = frame; +++ temp_ptr = temp; ++ ++- const __m256i shuffle_stage4 = _mm256_setr_epi8(0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15, ++- 0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15); ++- const __m256i mask_stage4 = _mm256_set_epi8(0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, ++- 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF); ++- const __m256i mask_stage3 = _mm256_set_epi8(0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF, ++- 0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF); ++- const __m256i mask_stage2 = _mm256_set_epi8(0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF, ++- 0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF); ++- ++- for(branch = 0; branch < num_branches/2; ++branch){ ++- r_temp0 = _mm256_load_si256((__m256i*) temp_ptr); ++- ++- // prefetch next chunk ++- temp_ptr += 32; +++ // prefetch first chunk. ++ __VOLK_PREFETCH(temp_ptr); ++ ++- // shuffle once for bit-reversal. ++- r_temp0 = _mm256_shuffle_epi8(r_temp0, shuffle_stage4); ++- ++- shifted = _mm256_srli_si256(r_temp0, 8); //128 bit lanes ++- shifted = _mm256_and_si256(shifted, mask_stage4); ++- r_frame0 = _mm256_xor_si256(shifted, r_temp0); ++- ++- shifted = _mm256_srli_si256(r_frame0, 4); ++- shifted = _mm256_and_si256(shifted, mask_stage3); ++- r_frame0 = _mm256_xor_si256(shifted, r_frame0); ++- ++- shifted = _mm256_srli_si256(r_frame0, 2); ++- shifted = _mm256_and_si256(shifted, mask_stage2); ++- r_frame0 = _mm256_xor_si256(shifted, r_frame0); ++- ++- shifted = _mm256_srli_si256(r_frame0, 1); ++- shifted = _mm256_and_si256(shifted, mask_stage1); ++- r_frame0 = _mm256_xor_si256(shifted, r_frame0); ++- ++- // store result of chunk. 
++- _mm256_store_si256((__m256i*)frame_ptr, r_frame0); ++- frame_ptr += 32; ++- } +++ const __m256i shuffle_stage4 = _mm256_setr_epi8(0, +++ 8, +++ 4, +++ 12, +++ 2, +++ 10, +++ 6, +++ 14, +++ 1, +++ 9, +++ 5, +++ 13, +++ 3, +++ 11, +++ 7, +++ 15, +++ 0, +++ 8, +++ 4, +++ 12, +++ 2, +++ 10, +++ 6, +++ 14, +++ 1, +++ 9, +++ 5, +++ 13, +++ 3, +++ 11, +++ 7, +++ 15); +++ const __m256i mask_stage4 = _mm256_set_epi8(0x0, +++ 0x0, +++ 0x0, +++ 0x0, +++ 0x0, +++ 0x0, +++ 0x0, +++ 0x0, +++ 0xFF, +++ 0xFF, +++ 0xFF, +++ 0xFF, +++ 0xFF, +++ 0xFF, +++ 0xFF, +++ 0xFF, +++ 0x0, +++ 0x0, +++ 0x0, +++ 0x0, +++ 0x0, +++ 0x0, +++ 0x0, +++ 0x0, +++ 0xFF, +++ 0xFF, +++ 0xFF, +++ 0xFF, +++ 0xFF, +++ 0xFF, +++ 0xFF, +++ 0xFF); +++ const __m256i mask_stage3 = _mm256_set_epi8(0x0, +++ 0x0, +++ 0x0, +++ 0x0, +++ 0xFF, +++ 0xFF, +++ 0xFF, +++ 0xFF, +++ 0x0, +++ 0x0, +++ 0x0, +++ 0x0, +++ 0xFF, +++ 0xFF, +++ 0xFF, +++ 0xFF, +++ 0x0, +++ 0x0, +++ 0x0, +++ 0x0, +++ 0xFF, +++ 0xFF, +++ 0xFF, +++ 0xFF, +++ 0x0, +++ 0x0, +++ 0x0, +++ 0x0, +++ 0xFF, +++ 0xFF, +++ 0xFF, +++ 0xFF); +++ const __m256i mask_stage2 = _mm256_set_epi8(0x0, +++ 0x0, +++ 0xFF, +++ 0xFF, +++ 0x0, +++ 0x0, +++ 0xFF, +++ 0xFF, +++ 0x0, +++ 0x0, +++ 0xFF, +++ 0xFF, +++ 0x0, +++ 0x0, +++ 0xFF, +++ 0xFF, +++ 0x0, +++ 0x0, +++ 0xFF, +++ 0xFF, +++ 0x0, +++ 0x0, +++ 0xFF, +++ 0xFF, +++ 0x0, +++ 0x0, +++ 0xFF, +++ 0xFF, +++ 0x0, +++ 0x0, +++ 0xFF, +++ 0xFF); +++ +++ for (branch = 0; branch < num_branches / 2; ++branch) { +++ r_temp0 = _mm256_load_si256((__m256i*)temp_ptr); +++ +++ // prefetch next chunk +++ temp_ptr += 32; +++ __VOLK_PREFETCH(temp_ptr); +++ +++ // shuffle once for bit-reversal. +++ r_temp0 = _mm256_shuffle_epi8(r_temp0, shuffle_stage4); +++ +++ shifted = _mm256_srli_si256(r_temp0, 8); // 128 bit lanes +++ shifted = _mm256_and_si256(shifted, mask_stage4); +++ r_frame0 = _mm256_xor_si256(shifted, r_temp0); +++ +++ shifted = _mm256_srli_si256(r_frame0, 4); +++ shifted = _mm256_and_si256(shifted, mask_stage3); +++ r_frame0 = _mm256_xor_si256(shifted, r_frame0); +++ +++ shifted = _mm256_srli_si256(r_frame0, 2); +++ shifted = _mm256_and_si256(shifted, mask_stage2); +++ r_frame0 = _mm256_xor_si256(shifted, r_frame0); +++ +++ shifted = _mm256_srli_si256(r_frame0, 1); +++ shifted = _mm256_and_si256(shifted, mask_stage1); +++ r_frame0 = _mm256_xor_si256(shifted, r_frame0); +++ +++ // store result of chunk. 
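As a usage note, the _a_ variants in this hunk assume buffers aligned to volk_get_alignment(), while callers normally go through the dispatcher and VOLK's allocator; a rough sketch, with the frame size and the fill step left as placeholders, might look like:

#include <volk/volk.h>

/* Sketch: encode one frame through the dispatcher, which picks the best
 * (aligned or unaligned) implementation available on the running machine. */
static void encode_one_frame(unsigned int frame_size /* power of two */)
{
    unsigned char* frame =
        (unsigned char*)volk_malloc(frame_size, volk_get_alignment());
    unsigned char* temp =
        (unsigned char*)volk_malloc(frame_size, volk_get_alignment());

    /* ... fill temp with the interleaved frozen/info bits ... */

    volk_8u_x2_encodeframepolar_8u(frame, temp, frame_size);

    volk_free(temp);
    volk_free(frame);
}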
+++ _mm256_store_si256((__m256i*)frame_ptr, r_frame0); +++ frame_ptr += 32; +++ } ++ } ++ #endif /* LV_HAVE_AVX2 */ ++ ++ ++- ++ #endif /* VOLK_KERNELS_VOLK_VOLK_8U_X2_ENCODEFRAMEPOLAR_8U_A_H_ */ ++diff --git a/kernels/volk/volk_8u_x3_encodepolar_8u_x2.h b/kernels/volk/volk_8u_x3_encodepolar_8u_x2.h ++index 5bccd95..413836e 100644 ++--- a/kernels/volk/volk_8u_x3_encodepolar_8u_x2.h +++++ b/kernels/volk/volk_8u_x3_encodepolar_8u_x2.h ++@@ -29,9 +29,9 @@ ++ * ++ * Dispatcher Prototype ++ * \code ++- * void volk_8u_x3_encodepolar_8u(unsigned char* frame, const unsigned char* frozen_bit_mask, const unsigned char* frozen_bits, ++- * const unsigned char* info_bits, unsigned int frame_size, unsigned int info_bit_size) ++- * \endcode +++ * void volk_8u_x3_encodepolar_8u(unsigned char* frame, const unsigned char* +++ * frozen_bit_mask, const unsigned char* frozen_bits, const unsigned char* info_bits, +++ * unsigned int frame_size, unsigned int info_bit_size) \endcode ++ * ++ * \b Inputs ++ * \li frame: buffer for encoded frame ++@@ -55,14 +55,17 @@ ++ * unsigned char* frozen_bit_mask = get_frozen_bit_mask(frame_size, num_frozen_bits); ++ * ++ * // set elements to desired values. Typically all zero. ++- * unsigned char* frozen_bits = (unsigned char) volk_malloc(sizeof(unsigned char) * num_frozen_bits, volk_get_alignment()); +++ * unsigned char* frozen_bits = (unsigned char) volk_malloc(sizeof(unsigned char) * +++ * num_frozen_bits, volk_get_alignment()); ++ * ++- * unsigned char* frame = (unsigned char) volk_malloc(sizeof(unsigned char) * frame_size, volk_get_alignment()); ++- * unsigned char* temp = (unsigned char) volk_malloc(sizeof(unsigned char) * frame_size, volk_get_alignment()); +++ * unsigned char* frame = (unsigned char) volk_malloc(sizeof(unsigned char) * frame_size, +++ * volk_get_alignment()); unsigned char* temp = (unsigned char) +++ * volk_malloc(sizeof(unsigned char) * frame_size, volk_get_alignment()); ++ * ++ * unsigned char* info_bits = get_info_bits_to_encode(num_info_bits); ++ * ++- * volk_8u_x3_encodepolar_8u_x2_generic(frame, temp, frozen_bit_mask, frozen_bits, info_bits, frame_size); +++ * volk_8u_x3_encodepolar_8u_x2_generic(frame, temp, frozen_bit_mask, frozen_bits, +++ * info_bits, frame_size); ++ * ++ * volk_free(frozen_bit_mask); ++ * volk_free(frozen_bits); ++@@ -77,27 +80,32 @@ ++ #include ++ #include ++ ++-static inline void ++-interleave_frozen_and_info_bits(unsigned char* target, const unsigned char* frozen_bit_mask, ++- const unsigned char* frozen_bits, const unsigned char* info_bits, ++- const unsigned int frame_size) +++static inline void interleave_frozen_and_info_bits(unsigned char* target, +++ const unsigned char* frozen_bit_mask, +++ const unsigned char* frozen_bits, +++ const unsigned char* info_bits, +++ const unsigned int frame_size) ++ { ++- unsigned int bit; ++- for(bit = 0; bit < frame_size; ++bit){ ++- *target++ = *frozen_bit_mask++ ? *frozen_bits++ : *info_bits++; ++- } +++ unsigned int bit; +++ for (bit = 0; bit < frame_size; ++bit) { +++ *target++ = *frozen_bit_mask++ ? 
*frozen_bits++ : *info_bits++; +++ } ++ } ++ ++ #ifdef LV_HAVE_GENERIC ++ ++ static inline void ++-volk_8u_x3_encodepolar_8u_x2_generic(unsigned char* frame, unsigned char* temp, const unsigned char* frozen_bit_mask, ++- const unsigned char* frozen_bits, const unsigned char* info_bits, +++volk_8u_x3_encodepolar_8u_x2_generic(unsigned char* frame, +++ unsigned char* temp, +++ const unsigned char* frozen_bit_mask, +++ const unsigned char* frozen_bits, +++ const unsigned char* info_bits, ++ unsigned int frame_size) ++ { ++- // interleave ++- interleave_frozen_and_info_bits(temp, frozen_bit_mask, frozen_bits, info_bits, frame_size); ++- volk_8u_x2_encodeframepolar_8u_generic(frame, temp, frame_size); +++ // interleave +++ interleave_frozen_and_info_bits( +++ temp, frozen_bit_mask, frozen_bits, info_bits, frame_size); +++ volk_8u_x2_encodeframepolar_8u_generic(frame, temp, frame_size); ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++@@ -106,14 +114,17 @@ volk_8u_x3_encodepolar_8u_x2_generic(unsigned char* frame, unsigned char* temp, ++ #include ++ ++ static inline void ++-volk_8u_x3_encodepolar_8u_x2_u_ssse3(unsigned char* frame, unsigned char* temp, ++- const unsigned char* frozen_bit_mask, ++- const unsigned char* frozen_bits, const unsigned char* info_bits, ++- unsigned int frame_size) +++volk_8u_x3_encodepolar_8u_x2_u_ssse3(unsigned char* frame, +++ unsigned char* temp, +++ const unsigned char* frozen_bit_mask, +++ const unsigned char* frozen_bits, +++ const unsigned char* info_bits, +++ unsigned int frame_size) ++ { ++- // interleave ++- interleave_frozen_and_info_bits(temp, frozen_bit_mask, frozen_bits, info_bits, frame_size); ++- volk_8u_x2_encodeframepolar_8u_u_ssse3(frame, temp, frame_size); +++ // interleave +++ interleave_frozen_and_info_bits( +++ temp, frozen_bit_mask, frozen_bits, info_bits, frame_size); +++ volk_8u_x2_encodeframepolar_8u_u_ssse3(frame, temp, frame_size); ++ } ++ ++ #endif /* LV_HAVE_SSSE3 */ ++@@ -121,13 +132,16 @@ volk_8u_x3_encodepolar_8u_x2_u_ssse3(unsigned char* frame, unsigned char* temp, ++ #ifdef LV_HAVE_AVX2 ++ #include ++ static inline void ++-volk_8u_x3_encodepolar_8u_x2_u_avx2(unsigned char* frame, unsigned char* temp, ++- const unsigned char* frozen_bit_mask, ++- const unsigned char* frozen_bits, const unsigned char* info_bits, ++- unsigned int frame_size) +++volk_8u_x3_encodepolar_8u_x2_u_avx2(unsigned char* frame, +++ unsigned char* temp, +++ const unsigned char* frozen_bit_mask, +++ const unsigned char* frozen_bits, +++ const unsigned char* info_bits, +++ unsigned int frame_size) ++ { ++- interleave_frozen_and_info_bits(temp, frozen_bit_mask, frozen_bits, info_bits, frame_size); ++- volk_8u_x2_encodeframepolar_8u_u_avx2(frame, temp, frame_size); +++ interleave_frozen_and_info_bits( +++ temp, frozen_bit_mask, frozen_bits, info_bits, frame_size); +++ volk_8u_x2_encodeframepolar_8u_u_avx2(frame, temp, frame_size); ++ } ++ #endif /* LV_HAVE_AVX2 */ ++ ++@@ -139,26 +153,32 @@ volk_8u_x3_encodepolar_8u_x2_u_avx2(unsigned char* frame, unsigned char* temp, ++ #ifdef LV_HAVE_SSSE3 ++ #include ++ static inline void ++-volk_8u_x3_encodepolar_8u_x2_a_ssse3(unsigned char* frame, unsigned char* temp, ++- const unsigned char* frozen_bit_mask, ++- const unsigned char* frozen_bits, const unsigned char* info_bits, ++- unsigned int frame_size) +++volk_8u_x3_encodepolar_8u_x2_a_ssse3(unsigned char* frame, +++ unsigned char* temp, +++ const unsigned char* frozen_bit_mask, +++ const unsigned char* frozen_bits, +++ const unsigned char* info_bits, +++ unsigned int frame_size) ++ 
{ ++- interleave_frozen_and_info_bits(temp, frozen_bit_mask, frozen_bits, info_bits, frame_size); ++- volk_8u_x2_encodeframepolar_8u_a_ssse3(frame, temp, frame_size); +++ interleave_frozen_and_info_bits( +++ temp, frozen_bit_mask, frozen_bits, info_bits, frame_size); +++ volk_8u_x2_encodeframepolar_8u_a_ssse3(frame, temp, frame_size); ++ } ++ #endif /* LV_HAVE_SSSE3 */ ++ ++ #ifdef LV_HAVE_AVX2 ++ #include ++ static inline void ++-volk_8u_x3_encodepolar_8u_x2_a_avx2(unsigned char* frame, unsigned char* temp, ++- const unsigned char* frozen_bit_mask, ++- const unsigned char* frozen_bits, const unsigned char* info_bits, ++- unsigned int frame_size) +++volk_8u_x3_encodepolar_8u_x2_a_avx2(unsigned char* frame, +++ unsigned char* temp, +++ const unsigned char* frozen_bit_mask, +++ const unsigned char* frozen_bits, +++ const unsigned char* info_bits, +++ unsigned int frame_size) ++ { ++- interleave_frozen_and_info_bits(temp, frozen_bit_mask, frozen_bits, info_bits, frame_size); ++- volk_8u_x2_encodeframepolar_8u_a_avx2(frame, temp, frame_size); +++ interleave_frozen_and_info_bits( +++ temp, frozen_bit_mask, frozen_bits, info_bits, frame_size); +++ volk_8u_x2_encodeframepolar_8u_a_avx2(frame, temp, frame_size); ++ } ++ #endif /* LV_HAVE_AVX2 */ ++ ++diff --git a/kernels/volk/volk_8u_x3_encodepolarpuppet_8u.h b/kernels/volk/volk_8u_x3_encodepolarpuppet_8u.h ++index 1f6be2c..1badbf1 100644 ++--- a/kernels/volk/volk_8u_x3_encodepolarpuppet_8u.h +++++ b/kernels/volk/volk_8u_x3_encodepolarpuppet_8u.h ++@@ -29,71 +29,82 @@ ++ #include ++ #include ++ ++-static inline unsigned int ++-next_lower_power_of_two(const unsigned int val) +++static inline unsigned int next_lower_power_of_two(const unsigned int val) ++ { ++- // algorithm found and adopted from: http://acius2.blogspot.de/2007/11/calculating-next-power-of-2.html ++- unsigned int res = val; ++- res = (res >> 1) | res; ++- res = (res >> 2) | res; ++- res = (res >> 4) | res; ++- res = (res >> 8) | res; ++- res = (res >> 16) | res; ++- res += 1; ++- return res >> 1; +++ // algorithm found and adopted from: +++ // http://acius2.blogspot.de/2007/11/calculating-next-power-of-2.html +++ unsigned int res = val; +++ res = (res >> 1) | res; +++ res = (res >> 2) | res; +++ res = (res >> 4) | res; +++ res = (res >> 8) | res; +++ res = (res >> 16) | res; +++ res += 1; +++ return res >> 1; ++ } ++ ++-static inline void ++-adjust_frozen_mask(unsigned char* mask, const unsigned int frame_size) +++static inline void adjust_frozen_mask(unsigned char* mask, const unsigned int frame_size) ++ { ++- // just like the rest of the puppet this function exists for test purposes only. ++- unsigned int i; ++- for(i = 0; i < frame_size; ++i){ ++- *mask = (*mask & 0x80) ? 0xFF : 0x00; ++- mask++; ++- } +++ // just like the rest of the puppet this function exists for test purposes only. +++ unsigned int i; +++ for (i = 0; i < frame_size; ++i) { +++ *mask = (*mask & 0x80) ? 
0xFF : 0x00; +++ mask++; +++ } ++ } ++ ++ #ifdef LV_HAVE_GENERIC ++ static inline void ++-volk_8u_x3_encodepolarpuppet_8u_generic(unsigned char* frame, unsigned char* frozen_bit_mask, ++- const unsigned char* frozen_bits, const unsigned char* info_bits, ++- unsigned int frame_size) +++volk_8u_x3_encodepolarpuppet_8u_generic(unsigned char* frame, +++ unsigned char* frozen_bit_mask, +++ const unsigned char* frozen_bits, +++ const unsigned char* info_bits, +++ unsigned int frame_size) ++ { ++- frame_size = next_lower_power_of_two(frame_size); ++- unsigned char* temp = (unsigned char*) volk_malloc(sizeof(unsigned char) * frame_size, volk_get_alignment()); ++- adjust_frozen_mask(frozen_bit_mask, frame_size); ++- volk_8u_x3_encodepolar_8u_x2_generic(frame, temp, frozen_bit_mask, frozen_bits, info_bits, frame_size); ++- volk_free(temp); +++ frame_size = next_lower_power_of_two(frame_size); +++ unsigned char* temp = (unsigned char*)volk_malloc(sizeof(unsigned char) * frame_size, +++ volk_get_alignment()); +++ adjust_frozen_mask(frozen_bit_mask, frame_size); +++ volk_8u_x3_encodepolar_8u_x2_generic( +++ frame, temp, frozen_bit_mask, frozen_bits, info_bits, frame_size); +++ volk_free(temp); ++ } ++ #endif /* LV_HAVE_GENERIC */ ++ ++ ++ #ifdef LV_HAVE_SSSE3 ++ static inline void ++-volk_8u_x3_encodepolarpuppet_8u_u_ssse3(unsigned char* frame, unsigned char* frozen_bit_mask, ++- const unsigned char* frozen_bits, const unsigned char* info_bits, ++- unsigned int frame_size) +++volk_8u_x3_encodepolarpuppet_8u_u_ssse3(unsigned char* frame, +++ unsigned char* frozen_bit_mask, +++ const unsigned char* frozen_bits, +++ const unsigned char* info_bits, +++ unsigned int frame_size) ++ { ++- frame_size = next_lower_power_of_two(frame_size); ++- unsigned char* temp = (unsigned char*) volk_malloc(sizeof(unsigned char) * frame_size, volk_get_alignment()); ++- adjust_frozen_mask(frozen_bit_mask, frame_size); ++- volk_8u_x3_encodepolar_8u_x2_u_ssse3(frame, temp, frozen_bit_mask, frozen_bits, info_bits, frame_size); ++- volk_free(temp); +++ frame_size = next_lower_power_of_two(frame_size); +++ unsigned char* temp = (unsigned char*)volk_malloc(sizeof(unsigned char) * frame_size, +++ volk_get_alignment()); +++ adjust_frozen_mask(frozen_bit_mask, frame_size); +++ volk_8u_x3_encodepolar_8u_x2_u_ssse3( +++ frame, temp, frozen_bit_mask, frozen_bits, info_bits, frame_size); +++ volk_free(temp); ++ } ++ #endif /* LV_HAVE_SSSE3 */ ++ ++ #ifdef LV_HAVE_AVX2 ++ static inline void ++-volk_8u_x3_encodepolarpuppet_8u_u_avx2(unsigned char* frame, unsigned char* frozen_bit_mask, ++- const unsigned char* frozen_bits, const unsigned char* info_bits, ++- unsigned int frame_size) +++volk_8u_x3_encodepolarpuppet_8u_u_avx2(unsigned char* frame, +++ unsigned char* frozen_bit_mask, +++ const unsigned char* frozen_bits, +++ const unsigned char* info_bits, +++ unsigned int frame_size) ++ { ++- frame_size = next_lower_power_of_two(frame_size); ++- unsigned char* temp = (unsigned char*) volk_malloc(sizeof(unsigned char) * frame_size, volk_get_alignment()); ++- adjust_frozen_mask(frozen_bit_mask, frame_size); ++- volk_8u_x3_encodepolar_8u_x2_u_avx2(frame, temp, frozen_bit_mask, frozen_bits, info_bits, frame_size); ++- volk_free(temp); +++ frame_size = next_lower_power_of_two(frame_size); +++ unsigned char* temp = (unsigned char*)volk_malloc(sizeof(unsigned char) * frame_size, +++ volk_get_alignment()); +++ adjust_frozen_mask(frozen_bit_mask, frame_size); +++ volk_8u_x3_encodepolar_8u_x2_u_avx2( +++ frame, temp, frozen_bit_mask, frozen_bits, 
info_bits, frame_size); +++ volk_free(temp); ++ } ++ #endif /* LV_HAVE_AVX2 */ ++ ++@@ -104,29 +115,37 @@ volk_8u_x3_encodepolarpuppet_8u_u_avx2(unsigned char* frame, unsigned char* froz ++ ++ #ifdef LV_HAVE_SSSE3 ++ static inline void ++-volk_8u_x3_encodepolarpuppet_8u_a_ssse3(unsigned char* frame, unsigned char* frozen_bit_mask, ++- const unsigned char* frozen_bits, const unsigned char* info_bits, ++- unsigned int frame_size) +++volk_8u_x3_encodepolarpuppet_8u_a_ssse3(unsigned char* frame, +++ unsigned char* frozen_bit_mask, +++ const unsigned char* frozen_bits, +++ const unsigned char* info_bits, +++ unsigned int frame_size) ++ { ++- frame_size = next_lower_power_of_two(frame_size); ++- unsigned char* temp = (unsigned char*) volk_malloc(sizeof(unsigned char) * frame_size, volk_get_alignment()); ++- adjust_frozen_mask(frozen_bit_mask, frame_size); ++- volk_8u_x3_encodepolar_8u_x2_a_ssse3(frame, temp, frozen_bit_mask, frozen_bits, info_bits, frame_size); ++- volk_free(temp); +++ frame_size = next_lower_power_of_two(frame_size); +++ unsigned char* temp = (unsigned char*)volk_malloc(sizeof(unsigned char) * frame_size, +++ volk_get_alignment()); +++ adjust_frozen_mask(frozen_bit_mask, frame_size); +++ volk_8u_x3_encodepolar_8u_x2_a_ssse3( +++ frame, temp, frozen_bit_mask, frozen_bits, info_bits, frame_size); +++ volk_free(temp); ++ } ++ #endif /* LV_HAVE_SSSE3 */ ++ ++ #ifdef LV_HAVE_AVX2 ++ static inline void ++-volk_8u_x3_encodepolarpuppet_8u_a_avx2(unsigned char* frame, unsigned char* frozen_bit_mask, ++- const unsigned char* frozen_bits, const unsigned char* info_bits, ++- unsigned int frame_size) +++volk_8u_x3_encodepolarpuppet_8u_a_avx2(unsigned char* frame, +++ unsigned char* frozen_bit_mask, +++ const unsigned char* frozen_bits, +++ const unsigned char* info_bits, +++ unsigned int frame_size) ++ { ++- frame_size = next_lower_power_of_two(frame_size); ++- unsigned char* temp = (unsigned char*) volk_malloc(sizeof(unsigned char) * frame_size, volk_get_alignment()); ++- adjust_frozen_mask(frozen_bit_mask, frame_size); ++- volk_8u_x3_encodepolar_8u_x2_a_avx2(frame, temp, frozen_bit_mask, frozen_bits, info_bits, frame_size); ++- volk_free(temp); +++ frame_size = next_lower_power_of_two(frame_size); +++ unsigned char* temp = (unsigned char*)volk_malloc(sizeof(unsigned char) * frame_size, +++ volk_get_alignment()); +++ adjust_frozen_mask(frozen_bit_mask, frame_size); +++ volk_8u_x3_encodepolar_8u_x2_a_avx2( +++ frame, temp, frozen_bit_mask, frozen_bits, info_bits, frame_size); +++ volk_free(temp); ++ } ++ #endif /* LV_HAVE_AVX2 */ ++ ++diff --git a/kernels/volk/volk_8u_x4_conv_k7_r2_8u.h b/kernels/volk/volk_8u_x4_conv_k7_r2_8u.h ++index 029ba75..89460a6 100644 ++--- a/kernels/volk/volk_8u_x4_conv_k7_r2_8u.h +++++ b/kernels/volk/volk_8u_x4_conv_k7_r2_8u.h ++@@ -30,8 +30,9 @@ ++ * ++ * Dispatcher Prototype ++ * \code ++- * void volk_8u_x4_conv_k7_r2_8u(unsigned char* Y, unsigned char* X, unsigned char* syms, unsigned char* dec, unsigned int framebits, unsigned int excess, unsigned char* Branchtab) ++- * \endcode +++ * void volk_8u_x4_conv_k7_r2_8u(unsigned char* Y, unsigned char* X, unsigned char* syms, +++ * unsigned char* dec, unsigned int framebits, unsigned int excess, unsigned char* +++ * Branchtab) \endcode ++ * ++ * \b Inputs ++ * \li X: ++@@ -58,67 +59,71 @@ ++ #define INCLUDED_volk_8u_x4_conv_k7_r2_8u_H ++ ++ typedef union { ++- unsigned char/*DECISIONTYPE*/ t[64/*NUMSTATES*//8/*DECISIONTYPE_BITSIZE*/]; ++- unsigned int w[64/*NUMSTATES*//32]; ++- unsigned short s[64/*NUMSTATES*//16]; 
++- unsigned char c[64/*NUMSTATES*//8]; +++ unsigned char /*DECISIONTYPE*/ t[64 /*NUMSTATES*/ / 8 /*DECISIONTYPE_BITSIZE*/]; +++ unsigned int w[64 /*NUMSTATES*/ / 32]; +++ unsigned short s[64 /*NUMSTATES*/ / 16]; +++ unsigned char c[64 /*NUMSTATES*/ / 8]; ++ #ifdef _MSC_VER ++ } decision_t; ++ #else ++-} decision_t __attribute__ ((aligned (16))); +++} decision_t __attribute__((aligned(16))); ++ #endif ++ ++ ++-static inline void ++-renormalize(unsigned char* X, unsigned char threshold) +++static inline void renormalize(unsigned char* X, unsigned char threshold) ++ { ++- int NUMSTATES = 64; ++- int i; ++- ++- unsigned char min=X[0]; ++- //if(min > threshold) { ++- for(i=0;iX[i]) ++- min=X[i]; ++- for(i=0;i threshold) { +++ for (i = 0; i < NUMSTATES; i++) +++ if (min > X[i]) +++ min = X[i]; +++ for (i = 0; i < NUMSTATES; i++) +++ X[i] -= min; +++ //} ++ } ++ ++ ++-//helper BFLY for GENERIC version ++-static inline void ++-BFLY(int i, int s, unsigned char * syms, unsigned char *Y, ++- unsigned char *X, decision_t * d, unsigned char* Branchtab) +++// helper BFLY for GENERIC version +++static inline void BFLY(int i, +++ int s, +++ unsigned char* syms, +++ unsigned char* Y, +++ unsigned char* X, +++ decision_t* d, +++ unsigned char* Branchtab) ++ { ++- int j, decision0, decision1; ++- unsigned char metric,m0,m1,m2,m3; +++ int j, decision0, decision1; +++ unsigned char metric, m0, m1, m2, m3; ++ ++- int NUMSTATES = 64; ++- int RATE = 2; ++- int METRICSHIFT = 1; ++- int PRECISIONSHIFT = 2; +++ int NUMSTATES = 64; +++ int RATE = 2; +++ int METRICSHIFT = 1; +++ int PRECISIONSHIFT = 2; ++ ++- metric =0; ++- for(j=0;j>METRICSHIFT; ++- metric=metric>>PRECISIONSHIFT; +++ metric = 0; +++ for (j = 0; j < RATE; j++) +++ metric += (Branchtab[i + j * NUMSTATES / 2] ^ syms[s * RATE + j]) >> METRICSHIFT; +++ metric = metric >> PRECISIONSHIFT; ++ ++- unsigned char max = ((RATE*((256 -1)>>METRICSHIFT))>>PRECISIONSHIFT); +++ unsigned char max = ((RATE * ((256 - 1) >> METRICSHIFT)) >> PRECISIONSHIFT); ++ ++- m0 = X[i] + metric; ++- m1 = X[i+NUMSTATES/2] + (max - metric); ++- m2 = X[i] + (max - metric); ++- m3 = X[i+NUMSTATES/2] + metric; +++ m0 = X[i] + metric; +++ m1 = X[i + NUMSTATES / 2] + (max - metric); +++ m2 = X[i] + (max - metric); +++ m3 = X[i + NUMSTATES / 2] + metric; ++ ++- decision0 = (signed int)(m0-m1) > 0; ++- decision1 = (signed int)(m2-m3) > 0; +++ decision0 = (signed int)(m0 - m1) > 0; +++ decision1 = (signed int)(m2 - m3) > 0; ++ ++- Y[2*i] = decision0 ? m1 : m0; ++- Y[2*i+1] = decision1 ? m3 : m2; +++ Y[2 * i] = decision0 ? m1 : m0; +++ Y[2 * i + 1] = decision1 ? 
m3 : m2; ++ ++- d->w[i/(sizeof(unsigned int)*8/2)+s*(sizeof(decision_t)/sizeof(unsigned int))] |= ++- (decision0|decision1<<1) << ((2*i)&(sizeof(unsigned int)*8-1)); +++ d->w[i / (sizeof(unsigned int) * 8 / 2) + +++ s * (sizeof(decision_t) / sizeof(unsigned int))] |= +++ (decision0 | decision1 << 1) << ((2 * i) & (sizeof(unsigned int) * 8 - 1)); ++ } ++ ++ ++@@ -127,188 +132,199 @@ BFLY(int i, int s, unsigned char * syms, unsigned char *Y, ++ #include ++ #include ++ ++-static inline void ++-volk_8u_x4_conv_k7_r2_8u_avx2(unsigned char* Y, unsigned char* X, ++- unsigned char* syms, unsigned char* dec, ++- unsigned int framebits, unsigned int excess, ++- unsigned char* Branchtab) +++static inline void volk_8u_x4_conv_k7_r2_8u_avx2(unsigned char* Y, +++ unsigned char* X, +++ unsigned char* syms, +++ unsigned char* dec, +++ unsigned int framebits, +++ unsigned int excess, +++ unsigned char* Branchtab) ++ { ++- unsigned int i9; ++- for(i9 = 0; i9 < ((framebits + excess)>>1); i9++) { ++- unsigned char a75, a81; ++- int a73, a92; ++- int s20, s21; ++- unsigned char *a80, *b6; ++- int *a110, *a91, *a93; ++- __m256i *a112, *a71, *a72, *a77, *a83, *a95; ++- __m256i a86, a87; ++- __m256i a76, a78, a79, a82, a84, a85, a88, a89 ++- , a90, d10, d9, m23, m24, m25 ++- , m26, s18, s19, s22 ++- , s23, s24, s25, t13, t14, t15; ++- a71 = ((__m256i *) X); ++- s18 = *(a71); ++- a72 = (a71 + 1); ++- s19 = *(a72); ++- s22 = _mm256_permute2x128_si256(s18,s19,0x20); ++- s19 = _mm256_permute2x128_si256(s18,s19,0x31); ++- s18 = s22; ++- a73 = (4 * i9); ++- b6 = (syms + a73); ++- a75 = *(b6); ++- a76 = _mm256_set1_epi8(a75); ++- a77 = ((__m256i *) Branchtab); ++- a78 = *(a77); ++- a79 = _mm256_xor_si256(a76, a78); ++- a80 = (b6 + 1); ++- a81 = *(a80); ++- a82 = _mm256_set1_epi8(a81); ++- a83 = (a77 + 1); ++- a84 = *(a83); ++- a85 = _mm256_xor_si256(a82, a84); ++- t13 = _mm256_avg_epu8(a79,a85); ++- a86 = ((__m256i ) t13); ++- a87 = _mm256_srli_epi16(a86, 2); ++- a88 = ((__m256i ) a87); ++- t14 = _mm256_and_si256(a88, _mm256_set1_epi8(63)); ++- t15 = _mm256_subs_epu8(_mm256_set1_epi8(63), t14); ++- m23 = _mm256_adds_epu8(s18, t14); ++- m24 = _mm256_adds_epu8(s19, t15); ++- m25 = _mm256_adds_epu8(s18, t15); ++- m26 = _mm256_adds_epu8(s19, t14); ++- a89 = _mm256_min_epu8(m24, m23); ++- d9 = _mm256_cmpeq_epi8(a89, m24); ++- a90 = _mm256_min_epu8(m26, m25); ++- d10 = _mm256_cmpeq_epi8(a90, m26); ++- s22 = _mm256_unpacklo_epi8(d9,d10); ++- s23 = _mm256_unpackhi_epi8(d9,d10); ++- s20 = _mm256_movemask_epi8(_mm256_permute2x128_si256(s22, s23, 0x20)); ++- a91 = ((int *) dec); ++- a92 = (4 * i9); ++- a93 = (a91 + a92); ++- *(a93) = s20; ++- s21 = _mm256_movemask_epi8(_mm256_permute2x128_si256(s22, s23, 0x31)); ++- a110 = (a93 + 1); ++- *(a110) = s21; ++- s22 = _mm256_unpacklo_epi8(a89, a90); ++- s23 = _mm256_unpackhi_epi8(a89, a90); ++- a95 = ((__m256i *) Y); ++- s24 = _mm256_permute2x128_si256(s22, s23, 0x20); ++- *(a95) = s24; ++- s23 = _mm256_permute2x128_si256(s22, s23, 0x31); ++- a112 = (a95 + 1); ++- *(a112) = s23; ++- if ((((unsigned char *) Y)[0]>210)) { ++- __m256i m5, m6; ++- m5 = ((__m256i *) Y)[0]; ++- m5 = _mm256_min_epu8(m5, ((__m256i *) Y)[1]); ++- __m256i m7; ++- m7 = _mm256_min_epu8(_mm256_srli_si256(m5, 8), m5); ++- m7 = ((__m256i ) _mm256_min_epu8(((__m256i ) _mm256_srli_epi64(m7, 32)), ((__m256i ) m7))); ++- m7 = ((__m256i ) _mm256_min_epu8(((__m256i ) _mm256_srli_epi64(m7, 16)), ((__m256i ) m7))); ++- m7 = ((__m256i ) _mm256_min_epu8(((__m256i ) _mm256_srli_epi64(m7, 8)), ((__m256i ) m7))); ++- m7 = 
_mm256_unpacklo_epi8(m7, m7); ++- m7 = _mm256_shufflelo_epi16(m7, 0); ++- m6 = _mm256_unpacklo_epi64(m7, m7); ++- m6 = _mm256_permute2x128_si256(m6, m6, 0); //copy lower half of m6 to upper half, since above ops operate on 128 bit lanes ++- ((__m256i *) Y)[0] = _mm256_subs_epu8(((__m256i *) Y)[0], m6); ++- ((__m256i *) Y)[1] = _mm256_subs_epu8(((__m256i *) Y)[1], m6); +++ unsigned int i9; +++ for (i9 = 0; i9 < ((framebits + excess) >> 1); i9++) { +++ unsigned char a75, a81; +++ int a73, a92; +++ int s20, s21; +++ unsigned char *a80, *b6; +++ int *a110, *a91, *a93; +++ __m256i *a112, *a71, *a72, *a77, *a83, *a95; +++ __m256i a86, a87; +++ __m256i a76, a78, a79, a82, a84, a85, a88, a89, a90, d10, d9, m23, m24, m25, m26, +++ s18, s19, s22, s23, s24, s25, t13, t14, t15; +++ a71 = ((__m256i*)X); +++ s18 = *(a71); +++ a72 = (a71 + 1); +++ s19 = *(a72); +++ s22 = _mm256_permute2x128_si256(s18, s19, 0x20); +++ s19 = _mm256_permute2x128_si256(s18, s19, 0x31); +++ s18 = s22; +++ a73 = (4 * i9); +++ b6 = (syms + a73); +++ a75 = *(b6); +++ a76 = _mm256_set1_epi8(a75); +++ a77 = ((__m256i*)Branchtab); +++ a78 = *(a77); +++ a79 = _mm256_xor_si256(a76, a78); +++ a80 = (b6 + 1); +++ a81 = *(a80); +++ a82 = _mm256_set1_epi8(a81); +++ a83 = (a77 + 1); +++ a84 = *(a83); +++ a85 = _mm256_xor_si256(a82, a84); +++ t13 = _mm256_avg_epu8(a79, a85); +++ a86 = ((__m256i)t13); +++ a87 = _mm256_srli_epi16(a86, 2); +++ a88 = ((__m256i)a87); +++ t14 = _mm256_and_si256(a88, _mm256_set1_epi8(63)); +++ t15 = _mm256_subs_epu8(_mm256_set1_epi8(63), t14); +++ m23 = _mm256_adds_epu8(s18, t14); +++ m24 = _mm256_adds_epu8(s19, t15); +++ m25 = _mm256_adds_epu8(s18, t15); +++ m26 = _mm256_adds_epu8(s19, t14); +++ a89 = _mm256_min_epu8(m24, m23); +++ d9 = _mm256_cmpeq_epi8(a89, m24); +++ a90 = _mm256_min_epu8(m26, m25); +++ d10 = _mm256_cmpeq_epi8(a90, m26); +++ s22 = _mm256_unpacklo_epi8(d9, d10); +++ s23 = _mm256_unpackhi_epi8(d9, d10); +++ s20 = _mm256_movemask_epi8(_mm256_permute2x128_si256(s22, s23, 0x20)); +++ a91 = ((int*)dec); +++ a92 = (4 * i9); +++ a93 = (a91 + a92); +++ *(a93) = s20; +++ s21 = _mm256_movemask_epi8(_mm256_permute2x128_si256(s22, s23, 0x31)); +++ a110 = (a93 + 1); +++ *(a110) = s21; +++ s22 = _mm256_unpacklo_epi8(a89, a90); +++ s23 = _mm256_unpackhi_epi8(a89, a90); +++ a95 = ((__m256i*)Y); +++ s24 = _mm256_permute2x128_si256(s22, s23, 0x20); +++ *(a95) = s24; +++ s23 = _mm256_permute2x128_si256(s22, s23, 0x31); +++ a112 = (a95 + 1); +++ *(a112) = s23; +++ if ((((unsigned char*)Y)[0] > 210)) { +++ __m256i m5, m6; +++ m5 = ((__m256i*)Y)[0]; +++ m5 = _mm256_min_epu8(m5, ((__m256i*)Y)[1]); +++ __m256i m7; +++ m7 = _mm256_min_epu8(_mm256_srli_si256(m5, 8), m5); +++ m7 = ((__m256i)_mm256_min_epu8(((__m256i)_mm256_srli_epi64(m7, 32)), +++ ((__m256i)m7))); +++ m7 = ((__m256i)_mm256_min_epu8(((__m256i)_mm256_srli_epi64(m7, 16)), +++ ((__m256i)m7))); +++ m7 = ((__m256i)_mm256_min_epu8(((__m256i)_mm256_srli_epi64(m7, 8)), +++ ((__m256i)m7))); +++ m7 = _mm256_unpacklo_epi8(m7, m7); +++ m7 = _mm256_shufflelo_epi16(m7, 0); +++ m6 = _mm256_unpacklo_epi64(m7, m7); +++ m6 = _mm256_permute2x128_si256( +++ m6, m6, 0); // copy lower half of m6 to upper half, since above ops +++ // operate on 128 bit lanes +++ ((__m256i*)Y)[0] = _mm256_subs_epu8(((__m256i*)Y)[0], m6); +++ ((__m256i*)Y)[1] = _mm256_subs_epu8(((__m256i*)Y)[1], m6); +++ } +++ unsigned char a188, a194; +++ int a205; +++ int s48, s54; +++ unsigned char *a187, *a193; +++ int *a204, *a206, *a223, *b16; +++ __m256i *a184, *a185, *a190, *a196, *a208, *a225; +++ 
__m256i a199, a200; +++ __m256i a189, a191, a192, a195, a197, a198, a201, a202, a203, d17, d18, m39, m40, +++ m41, m42, s46, s47, s50, s51, t25, t26, t27; +++ a184 = ((__m256i*)Y); +++ s46 = *(a184); +++ a185 = (a184 + 1); +++ s47 = *(a185); +++ s50 = _mm256_permute2x128_si256(s46, s47, 0x20); +++ s47 = _mm256_permute2x128_si256(s46, s47, 0x31); +++ s46 = s50; +++ a187 = (b6 + 2); +++ a188 = *(a187); +++ a189 = _mm256_set1_epi8(a188); +++ a190 = ((__m256i*)Branchtab); +++ a191 = *(a190); +++ a192 = _mm256_xor_si256(a189, a191); +++ a193 = (b6 + 3); +++ a194 = *(a193); +++ a195 = _mm256_set1_epi8(a194); +++ a196 = (a190 + 1); +++ a197 = *(a196); +++ a198 = _mm256_xor_si256(a195, a197); +++ t25 = _mm256_avg_epu8(a192, a198); +++ a199 = ((__m256i)t25); +++ a200 = _mm256_srli_epi16(a199, 2); +++ a201 = ((__m256i)a200); +++ t26 = _mm256_and_si256(a201, _mm256_set1_epi8(63)); +++ t27 = _mm256_subs_epu8(_mm256_set1_epi8(63), t26); +++ m39 = _mm256_adds_epu8(s46, t26); +++ m40 = _mm256_adds_epu8(s47, t27); +++ m41 = _mm256_adds_epu8(s46, t27); +++ m42 = _mm256_adds_epu8(s47, t26); +++ a202 = _mm256_min_epu8(m40, m39); +++ d17 = _mm256_cmpeq_epi8(a202, m40); +++ a203 = _mm256_min_epu8(m42, m41); +++ d18 = _mm256_cmpeq_epi8(a203, m42); +++ s24 = _mm256_unpacklo_epi8(d17, d18); +++ s25 = _mm256_unpackhi_epi8(d17, d18); +++ s48 = _mm256_movemask_epi8(_mm256_permute2x128_si256(s24, s25, 0x20)); +++ a204 = ((int*)dec); +++ a205 = (4 * i9); +++ b16 = (a204 + a205); +++ a206 = (b16 + 2); +++ *(a206) = s48; +++ s54 = _mm256_movemask_epi8(_mm256_permute2x128_si256(s24, s25, 0x31)); +++ a223 = (b16 + 3); +++ *(a223) = s54; +++ s50 = _mm256_unpacklo_epi8(a202, a203); +++ s51 = _mm256_unpackhi_epi8(a202, a203); +++ s25 = _mm256_permute2x128_si256(s50, s51, 0x20); +++ s51 = _mm256_permute2x128_si256(s50, s51, 0x31); +++ a208 = ((__m256i*)X); +++ *(a208) = s25; +++ a225 = (a208 + 1); +++ *(a225) = s51; +++ +++ if ((((unsigned char*)X)[0] > 210)) { +++ __m256i m12, m13; +++ m12 = ((__m256i*)X)[0]; +++ m12 = _mm256_min_epu8(m12, ((__m256i*)X)[1]); +++ __m256i m14; +++ m14 = _mm256_min_epu8(_mm256_srli_si256(m12, 8), m12); +++ m14 = ((__m256i)_mm256_min_epu8(((__m256i)_mm256_srli_epi64(m14, 32)), +++ ((__m256i)m14))); +++ m14 = ((__m256i)_mm256_min_epu8(((__m256i)_mm256_srli_epi64(m14, 16)), +++ ((__m256i)m14))); +++ m14 = ((__m256i)_mm256_min_epu8(((__m256i)_mm256_srli_epi64(m14, 8)), +++ ((__m256i)m14))); +++ m14 = _mm256_unpacklo_epi8(m14, m14); +++ m14 = _mm256_shufflelo_epi16(m14, 0); +++ m13 = _mm256_unpacklo_epi64(m14, m14); +++ m13 = _mm256_permute2x128_si256(m13, m13, 0); +++ ((__m256i*)X)[0] = _mm256_subs_epu8(((__m256i*)X)[0], m13); +++ ((__m256i*)X)[1] = _mm256_subs_epu8(((__m256i*)X)[1], m13); +++ } ++ } ++- unsigned char a188, a194; ++- int a205; ++- int s48, s54; ++- unsigned char *a187, *a193; ++- int *a204, *a206, *a223, *b16; ++- __m256i *a184, *a185, *a190, *a196, *a208, *a225; ++- __m256i a199, a200; ++- __m256i a189, a191, a192, a195, a197, a198, a201 ++- , a202, a203, d17, d18, m39, m40, m41 ++- , m42, s46, s47, s50 ++- , s51, t25, t26, t27; ++- a184 = ((__m256i *) Y); ++- s46 = *(a184); ++- a185 = (a184 + 1); ++- s47 = *(a185); ++- s50 = _mm256_permute2x128_si256(s46,s47,0x20); ++- s47 = _mm256_permute2x128_si256(s46,s47,0x31); ++- s46 = s50; ++- a187 = (b6 + 2); ++- a188 = *(a187); ++- a189 = _mm256_set1_epi8(a188); ++- a190 = ((__m256i *) Branchtab); ++- a191 = *(a190); ++- a192 = _mm256_xor_si256(a189, a191); ++- a193 = (b6 + 3); ++- a194 = *(a193); ++- a195 = _mm256_set1_epi8(a194); ++- 
a196 = (a190 + 1); ++- a197 = *(a196); ++- a198 = _mm256_xor_si256(a195, a197); ++- t25 = _mm256_avg_epu8(a192,a198); ++- a199 = ((__m256i ) t25); ++- a200 = _mm256_srli_epi16(a199, 2); ++- a201 = ((__m256i ) a200); ++- t26 = _mm256_and_si256(a201, _mm256_set1_epi8(63)); ++- t27 = _mm256_subs_epu8(_mm256_set1_epi8(63), t26); ++- m39 = _mm256_adds_epu8(s46, t26); ++- m40 = _mm256_adds_epu8(s47, t27); ++- m41 = _mm256_adds_epu8(s46, t27); ++- m42 = _mm256_adds_epu8(s47, t26); ++- a202 = _mm256_min_epu8(m40, m39); ++- d17 = _mm256_cmpeq_epi8(a202, m40); ++- a203 = _mm256_min_epu8(m42, m41); ++- d18 = _mm256_cmpeq_epi8(a203, m42); ++- s24 = _mm256_unpacklo_epi8(d17,d18); ++- s25 = _mm256_unpackhi_epi8(d17,d18); ++- s48 = _mm256_movemask_epi8(_mm256_permute2x128_si256(s24, s25, 0x20)); ++- a204 = ((int *) dec); ++- a205 = (4 * i9); ++- b16 = (a204 + a205); ++- a206 = (b16 + 2); ++- *(a206) = s48; ++- s54 = _mm256_movemask_epi8(_mm256_permute2x128_si256(s24, s25, 0x31)); ++- a223 = (b16 + 3); ++- *(a223) = s54; ++- s50 = _mm256_unpacklo_epi8(a202, a203); ++- s51 = _mm256_unpackhi_epi8(a202, a203); ++- s25 = _mm256_permute2x128_si256(s50, s51, 0x20); ++- s51 = _mm256_permute2x128_si256(s50, s51, 0x31); ++- a208 = ((__m256i *) X); ++- *(a208) = s25; ++- a225 = (a208 + 1); ++- *(a225) = s51; ++- ++- if ((((unsigned char *) X)[0]>210)) { ++- __m256i m12, m13; ++- m12 = ((__m256i *) X)[0]; ++- m12 = _mm256_min_epu8(m12, ((__m256i *) X)[1]); ++- __m256i m14; ++- m14 = _mm256_min_epu8(_mm256_srli_si256(m12, 8), m12); ++- m14 = ((__m256i ) _mm256_min_epu8(((__m256i ) _mm256_srli_epi64(m14, 32)), ((__m256i ) m14))); ++- m14 = ((__m256i ) _mm256_min_epu8(((__m256i ) _mm256_srli_epi64(m14, 16)), ((__m256i ) m14))); ++- m14 = ((__m256i ) _mm256_min_epu8(((__m256i ) _mm256_srli_epi64(m14, 8)), ((__m256i ) m14))); ++- m14 = _mm256_unpacklo_epi8(m14, m14); ++- m14 = _mm256_shufflelo_epi16(m14, 0); ++- m13 = _mm256_unpacklo_epi64(m14, m14); ++- m13 = _mm256_permute2x128_si256(m13, m13, 0); ++- ((__m256i *) X)[0] = _mm256_subs_epu8(((__m256i *) X)[0], m13); ++- ((__m256i *) X)[1] = _mm256_subs_epu8(((__m256i *) X)[1], m13); ++- } ++- } ++- ++- renormalize(X, 210); ++ ++- unsigned int j; ++- for(j=0; j < (framebits + excess) % 2; ++j) { ++- int i; ++- for(i=0;i<64/2;i++){ ++- BFLY(i, (((framebits+excess) >> 1) << 1) + j , syms, Y, X, (decision_t *)dec, Branchtab); +++ renormalize(X, 210); +++ +++ unsigned int j; +++ for (j = 0; j < (framebits + excess) % 2; ++j) { +++ int i; +++ for (i = 0; i < 64 / 2; i++) { +++ BFLY(i, +++ (((framebits + excess) >> 1) << 1) + j, +++ syms, +++ Y, +++ X, +++ (decision_t*)dec, +++ Branchtab); +++ } +++ +++ renormalize(Y, 210); ++ } ++- ++- renormalize(Y, 210); ++- ++- } ++- /*skip*/ +++ /*skip*/ ++ } ++ ++ #endif /*LV_HAVE_AVX2*/ ++@@ -316,295 +332,300 @@ volk_8u_x4_conv_k7_r2_8u_avx2(unsigned char* Y, unsigned char* X, ++ ++ #if LV_HAVE_SSE3 ++ ++-#include ++ #include ++-#include ++ #include +++#include ++ #include +++#include ++ ++-static inline void ++-volk_8u_x4_conv_k7_r2_8u_spiral(unsigned char* Y, unsigned char* X, ++- unsigned char* syms, unsigned char* dec, ++- unsigned int framebits, unsigned int excess, ++- unsigned char* Branchtab) +++static inline void volk_8u_x4_conv_k7_r2_8u_spiral(unsigned char* Y, +++ unsigned char* X, +++ unsigned char* syms, +++ unsigned char* dec, +++ unsigned int framebits, +++ unsigned int excess, +++ unsigned char* Branchtab) ++ { ++- unsigned int i9; ++- for(i9 = 0; i9 < ((framebits + excess) >> 1); i9++) { ++- unsigned char a75, a81; ++- 
int a73, a92; ++- short int s20, s21, s26, s27; ++- unsigned char *a74, *a80, *b6; ++- short int *a110, *a111, *a91, *a93, *a94; ++- __m128i *a102, *a112, *a113, *a71, *a72, *a77, *a83 ++- , *a95, *a96, *a97, *a98, *a99; ++- __m128i a105, a106, a86, a87; ++- __m128i a100, a101, a103, a104, a107, a108, a109 ++- , a76, a78, a79, a82, a84, a85, a88, a89 ++- , a90, d10, d11, d12, d9, m23, m24, m25 ++- , m26, m27, m28, m29, m30, s18, s19, s22 ++- , s23, s24, s25, s28, s29, t13, t14, t15 ++- , t16, t17, t18; ++- a71 = ((__m128i *) X); ++- s18 = *(a71); ++- a72 = (a71 + 2); ++- s19 = *(a72); ++- a73 = (4 * i9); ++- a74 = (syms + a73); ++- a75 = *(a74); ++- a76 = _mm_set1_epi8(a75); ++- a77 = ((__m128i *) Branchtab); ++- a78 = *(a77); ++- a79 = _mm_xor_si128(a76, a78); ++- b6 = (a73 + syms); ++- a80 = (b6 + 1); ++- a81 = *(a80); ++- a82 = _mm_set1_epi8(a81); ++- a83 = (a77 + 2); ++- a84 = *(a83); ++- a85 = _mm_xor_si128(a82, a84); ++- t13 = _mm_avg_epu8(a79,a85); ++- a86 = ((__m128i ) t13); ++- a87 = _mm_srli_epi16(a86, 2); ++- a88 = ((__m128i ) a87); ++- t14 = _mm_and_si128(a88, _mm_set_epi8(63, 63, 63, 63, 63, 63, 63 ++- , 63, 63, 63, 63, 63, 63, 63, 63 ++- , 63)); ++- t15 = _mm_subs_epu8(_mm_set_epi8(63, 63, 63, 63, 63, 63, 63 ++- , 63, 63, 63, 63, 63, 63, 63, 63 ++- , 63), t14); ++- m23 = _mm_adds_epu8(s18, t14); ++- m24 = _mm_adds_epu8(s19, t15); ++- m25 = _mm_adds_epu8(s18, t15); ++- m26 = _mm_adds_epu8(s19, t14); ++- a89 = _mm_min_epu8(m24, m23); ++- d9 = _mm_cmpeq_epi8(a89, m24); ++- a90 = _mm_min_epu8(m26, m25); ++- d10 = _mm_cmpeq_epi8(a90, m26); ++- s20 = _mm_movemask_epi8(_mm_unpacklo_epi8(d9,d10)); ++- a91 = ((short int *) dec); ++- a92 = (8 * i9); ++- a93 = (a91 + a92); ++- *(a93) = s20; ++- s21 = _mm_movemask_epi8(_mm_unpackhi_epi8(d9,d10)); ++- a94 = (a93 + 1); ++- *(a94) = s21; ++- s22 = _mm_unpacklo_epi8(a89, a90); ++- s23 = _mm_unpackhi_epi8(a89, a90); ++- a95 = ((__m128i *) Y); ++- *(a95) = s22; ++- a96 = (a95 + 1); ++- *(a96) = s23; ++- a97 = (a71 + 1); ++- s24 = *(a97); ++- a98 = (a71 + 3); ++- s25 = *(a98); ++- a99 = (a77 + 1); ++- a100 = *(a99); ++- a101 = _mm_xor_si128(a76, a100); ++- a102 = (a77 + 3); ++- a103 = *(a102); ++- a104 = _mm_xor_si128(a82, a103); ++- t16 = _mm_avg_epu8(a101,a104); ++- a105 = ((__m128i ) t16); ++- a106 = _mm_srli_epi16(a105, 2); ++- a107 = ((__m128i ) a106); ++- t17 = _mm_and_si128(a107, _mm_set_epi8(63, 63, 63, 63, 63, 63, 63 ++- , 63, 63, 63, 63, 63, 63, 63, 63 ++- , 63)); ++- t18 = _mm_subs_epu8(_mm_set_epi8(63, 63, 63, 63, 63, 63, 63 ++- , 63, 63, 63, 63, 63, 63, 63, 63 ++- , 63), t17); ++- m27 = _mm_adds_epu8(s24, t17); ++- m28 = _mm_adds_epu8(s25, t18); ++- m29 = _mm_adds_epu8(s24, t18); ++- m30 = _mm_adds_epu8(s25, t17); ++- a108 = _mm_min_epu8(m28, m27); ++- d11 = _mm_cmpeq_epi8(a108, m28); ++- a109 = _mm_min_epu8(m30, m29); ++- d12 = _mm_cmpeq_epi8(a109, m30); ++- s26 = _mm_movemask_epi8(_mm_unpacklo_epi8(d11,d12)); ++- a110 = (a93 + 2); ++- *(a110) = s26; ++- s27 = _mm_movemask_epi8(_mm_unpackhi_epi8(d11,d12)); ++- a111 = (a93 + 3); ++- *(a111) = s27; ++- s28 = _mm_unpacklo_epi8(a108, a109); ++- s29 = _mm_unpackhi_epi8(a108, a109); ++- a112 = (a95 + 2); ++- *(a112) = s28; ++- a113 = (a95 + 3); ++- *(a113) = s29; ++- if ((((unsigned char *) Y)[0]>210)) { ++- __m128i m5, m6; ++- m5 = ((__m128i *) Y)[0]; ++- m5 = _mm_min_epu8(m5, ((__m128i *) Y)[1]); ++- m5 = _mm_min_epu8(m5, ((__m128i *) Y)[2]); ++- m5 = _mm_min_epu8(m5, ((__m128i *) Y)[3]); ++- __m128i m7; ++- m7 = _mm_min_epu8(_mm_srli_si128(m5, 8), m5); ++- m7 = ((__m128i ) 
_mm_min_epu8(((__m128i ) _mm_srli_epi64(m7, 32)), ((__m128i ) m7))); ++- m7 = ((__m128i ) _mm_min_epu8(((__m128i ) _mm_srli_epi64(m7, 16)), ((__m128i ) m7))); ++- m7 = ((__m128i ) _mm_min_epu8(((__m128i ) _mm_srli_epi64(m7, 8)), ((__m128i ) m7))); ++- m7 = _mm_unpacklo_epi8(m7, m7); ++- m7 = _mm_shufflelo_epi16(m7, _MM_SHUFFLE(0, 0, 0, 0)); ++- m6 = _mm_unpacklo_epi64(m7, m7); ++- ((__m128i *) Y)[0] = _mm_subs_epu8(((__m128i *) Y)[0], m6); ++- ((__m128i *) Y)[1] = _mm_subs_epu8(((__m128i *) Y)[1], m6); ++- ((__m128i *) Y)[2] = _mm_subs_epu8(((__m128i *) Y)[2], m6); ++- ((__m128i *) Y)[3] = _mm_subs_epu8(((__m128i *) Y)[3], m6); ++- } ++- unsigned char a188, a194; ++- int a186, a205; ++- short int s48, s49, s54, s55; ++- unsigned char *a187, *a193, *b15; ++- short int *a204, *a206, *a207, *a223, *a224, *b16; ++- __m128i *a184, *a185, *a190, *a196, *a208, *a209, *a210 ++- , *a211, *a212, *a215, *a225, *a226; ++- __m128i a199, a200, a218, a219; ++- __m128i a189, a191, a192, a195, a197, a198, a201 ++- , a202, a203, a213, a214, a216, a217, a220, a221 ++- , a222, d17, d18, d19, d20, m39, m40, m41 ++- , m42, m43, m44, m45, m46, s46, s47, s50 ++- , s51, s52, s53, s56, s57, t25, t26, t27 ++- , t28, t29, t30; ++- a184 = ((__m128i *) Y); ++- s46 = *(a184); ++- a185 = (a184 + 2); ++- s47 = *(a185); ++- a186 = (4 * i9); ++- b15 = (a186 + syms); ++- a187 = (b15 + 2); ++- a188 = *(a187); ++- a189 = _mm_set1_epi8(a188); ++- a190 = ((__m128i *) Branchtab); ++- a191 = *(a190); ++- a192 = _mm_xor_si128(a189, a191); ++- a193 = (b15 + 3); ++- a194 = *(a193); ++- a195 = _mm_set1_epi8(a194); ++- a196 = (a190 + 2); ++- a197 = *(a196); ++- a198 = _mm_xor_si128(a195, a197); ++- t25 = _mm_avg_epu8(a192,a198); ++- a199 = ((__m128i ) t25); ++- a200 = _mm_srli_epi16(a199, 2); ++- a201 = ((__m128i ) a200); ++- t26 = _mm_and_si128(a201, _mm_set_epi8(63, 63, 63, 63, 63, 63, 63 ++- , 63, 63, 63, 63, 63, 63, 63, 63 ++- , 63)); ++- t27 = _mm_subs_epu8(_mm_set_epi8(63, 63, 63, 63, 63, 63, 63 ++- , 63, 63, 63, 63, 63, 63, 63, 63 ++- , 63), t26); ++- m39 = _mm_adds_epu8(s46, t26); ++- m40 = _mm_adds_epu8(s47, t27); ++- m41 = _mm_adds_epu8(s46, t27); ++- m42 = _mm_adds_epu8(s47, t26); ++- a202 = _mm_min_epu8(m40, m39); ++- d17 = _mm_cmpeq_epi8(a202, m40); ++- a203 = _mm_min_epu8(m42, m41); ++- d18 = _mm_cmpeq_epi8(a203, m42); ++- s48 = _mm_movemask_epi8(_mm_unpacklo_epi8(d17,d18)); ++- a204 = ((short int *) dec); ++- a205 = (8 * i9); ++- b16 = (a204 + a205); ++- a206 = (b16 + 4); ++- *(a206) = s48; ++- s49 = _mm_movemask_epi8(_mm_unpackhi_epi8(d17,d18)); ++- a207 = (b16 + 5); ++- *(a207) = s49; ++- s50 = _mm_unpacklo_epi8(a202, a203); ++- s51 = _mm_unpackhi_epi8(a202, a203); ++- a208 = ((__m128i *) X); ++- *(a208) = s50; ++- a209 = (a208 + 1); ++- *(a209) = s51; ++- a210 = (a184 + 1); ++- s52 = *(a210); ++- a211 = (a184 + 3); ++- s53 = *(a211); ++- a212 = (a190 + 1); ++- a213 = *(a212); ++- a214 = _mm_xor_si128(a189, a213); ++- a215 = (a190 + 3); ++- a216 = *(a215); ++- a217 = _mm_xor_si128(a195, a216); ++- t28 = _mm_avg_epu8(a214,a217); ++- a218 = ((__m128i ) t28); ++- a219 = _mm_srli_epi16(a218, 2); ++- a220 = ((__m128i ) a219); ++- t29 = _mm_and_si128(a220, _mm_set_epi8(63, 63, 63, 63, 63, 63, 63 ++- , 63, 63, 63, 63, 63, 63, 63, 63 ++- , 63)); ++- t30 = _mm_subs_epu8(_mm_set_epi8(63, 63, 63, 63, 63, 63, 63 ++- , 63, 63, 63, 63, 63, 63, 63, 63 ++- , 63), t29); ++- m43 = _mm_adds_epu8(s52, t29); ++- m44 = _mm_adds_epu8(s53, t30); ++- m45 = _mm_adds_epu8(s52, t30); ++- m46 = _mm_adds_epu8(s53, t29); ++- a221 = _mm_min_epu8(m44, 
m43); ++- d19 = _mm_cmpeq_epi8(a221, m44); ++- a222 = _mm_min_epu8(m46, m45); ++- d20 = _mm_cmpeq_epi8(a222, m46); ++- s54 = _mm_movemask_epi8(_mm_unpacklo_epi8(d19,d20)); ++- a223 = (b16 + 6); ++- *(a223) = s54; ++- s55 = _mm_movemask_epi8(_mm_unpackhi_epi8(d19,d20)); ++- a224 = (b16 + 7); ++- *(a224) = s55; ++- s56 = _mm_unpacklo_epi8(a221, a222); ++- s57 = _mm_unpackhi_epi8(a221, a222); ++- a225 = (a208 + 2); ++- *(a225) = s56; ++- a226 = (a208 + 3); ++- *(a226) = s57; ++- if ((((unsigned char *) X)[0]>210)) { ++- __m128i m12, m13; ++- m12 = ((__m128i *) X)[0]; ++- m12 = _mm_min_epu8(m12, ((__m128i *) X)[1]); ++- m12 = _mm_min_epu8(m12, ((__m128i *) X)[2]); ++- m12 = _mm_min_epu8(m12, ((__m128i *) X)[3]); ++- __m128i m14; ++- m14 = _mm_min_epu8(_mm_srli_si128(m12, 8), m12); ++- m14 = ((__m128i ) _mm_min_epu8(((__m128i ) _mm_srli_epi64(m14, 32)), ((__m128i ) m14))); ++- m14 = ((__m128i ) _mm_min_epu8(((__m128i ) _mm_srli_epi64(m14, 16)), ((__m128i ) m14))); ++- m14 = ((__m128i ) _mm_min_epu8(((__m128i ) _mm_srli_epi64(m14, 8)), ((__m128i ) m14))); ++- m14 = _mm_unpacklo_epi8(m14, m14); ++- m14 = _mm_shufflelo_epi16(m14, _MM_SHUFFLE(0, 0, 0, 0)); ++- m13 = _mm_unpacklo_epi64(m14, m14); ++- ((__m128i *) X)[0] = _mm_subs_epu8(((__m128i *) X)[0], m13); ++- ((__m128i *) X)[1] = _mm_subs_epu8(((__m128i *) X)[1], m13); ++- ((__m128i *) X)[2] = _mm_subs_epu8(((__m128i *) X)[2], m13); ++- ((__m128i *) X)[3] = _mm_subs_epu8(((__m128i *) X)[3], m13); +++ unsigned int i9; +++ for (i9 = 0; i9 < ((framebits + excess) >> 1); i9++) { +++ unsigned char a75, a81; +++ int a73, a92; +++ short int s20, s21, s26, s27; +++ unsigned char *a74, *a80, *b6; +++ short int *a110, *a111, *a91, *a93, *a94; +++ __m128i *a102, *a112, *a113, *a71, *a72, *a77, *a83, *a95, *a96, *a97, *a98, *a99; +++ __m128i a105, a106, a86, a87; +++ __m128i a100, a101, a103, a104, a107, a108, a109, a76, a78, a79, a82, a84, a85, +++ a88, a89, a90, d10, d11, d12, d9, m23, m24, m25, m26, m27, m28, m29, m30, s18, +++ s19, s22, s23, s24, s25, s28, s29, t13, t14, t15, t16, t17, t18; +++ a71 = ((__m128i*)X); +++ s18 = *(a71); +++ a72 = (a71 + 2); +++ s19 = *(a72); +++ a73 = (4 * i9); +++ a74 = (syms + a73); +++ a75 = *(a74); +++ a76 = _mm_set1_epi8(a75); +++ a77 = ((__m128i*)Branchtab); +++ a78 = *(a77); +++ a79 = _mm_xor_si128(a76, a78); +++ b6 = (a73 + syms); +++ a80 = (b6 + 1); +++ a81 = *(a80); +++ a82 = _mm_set1_epi8(a81); +++ a83 = (a77 + 2); +++ a84 = *(a83); +++ a85 = _mm_xor_si128(a82, a84); +++ t13 = _mm_avg_epu8(a79, a85); +++ a86 = ((__m128i)t13); +++ a87 = _mm_srli_epi16(a86, 2); +++ a88 = ((__m128i)a87); +++ t14 = _mm_and_si128( +++ a88, +++ _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63)); +++ t15 = _mm_subs_epu8( +++ _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63), +++ t14); +++ m23 = _mm_adds_epu8(s18, t14); +++ m24 = _mm_adds_epu8(s19, t15); +++ m25 = _mm_adds_epu8(s18, t15); +++ m26 = _mm_adds_epu8(s19, t14); +++ a89 = _mm_min_epu8(m24, m23); +++ d9 = _mm_cmpeq_epi8(a89, m24); +++ a90 = _mm_min_epu8(m26, m25); +++ d10 = _mm_cmpeq_epi8(a90, m26); +++ s20 = _mm_movemask_epi8(_mm_unpacklo_epi8(d9, d10)); +++ a91 = ((short int*)dec); +++ a92 = (8 * i9); +++ a93 = (a91 + a92); +++ *(a93) = s20; +++ s21 = _mm_movemask_epi8(_mm_unpackhi_epi8(d9, d10)); +++ a94 = (a93 + 1); +++ *(a94) = s21; +++ s22 = _mm_unpacklo_epi8(a89, a90); +++ s23 = _mm_unpackhi_epi8(a89, a90); +++ a95 = ((__m128i*)Y); +++ *(a95) = s22; +++ a96 = (a95 + 1); +++ *(a96) = s23; +++ a97 = (a71 + 1); +++ s24 
= *(a97); +++ a98 = (a71 + 3); +++ s25 = *(a98); +++ a99 = (a77 + 1); +++ a100 = *(a99); +++ a101 = _mm_xor_si128(a76, a100); +++ a102 = (a77 + 3); +++ a103 = *(a102); +++ a104 = _mm_xor_si128(a82, a103); +++ t16 = _mm_avg_epu8(a101, a104); +++ a105 = ((__m128i)t16); +++ a106 = _mm_srli_epi16(a105, 2); +++ a107 = ((__m128i)a106); +++ t17 = _mm_and_si128( +++ a107, +++ _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63)); +++ t18 = _mm_subs_epu8( +++ _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63), +++ t17); +++ m27 = _mm_adds_epu8(s24, t17); +++ m28 = _mm_adds_epu8(s25, t18); +++ m29 = _mm_adds_epu8(s24, t18); +++ m30 = _mm_adds_epu8(s25, t17); +++ a108 = _mm_min_epu8(m28, m27); +++ d11 = _mm_cmpeq_epi8(a108, m28); +++ a109 = _mm_min_epu8(m30, m29); +++ d12 = _mm_cmpeq_epi8(a109, m30); +++ s26 = _mm_movemask_epi8(_mm_unpacklo_epi8(d11, d12)); +++ a110 = (a93 + 2); +++ *(a110) = s26; +++ s27 = _mm_movemask_epi8(_mm_unpackhi_epi8(d11, d12)); +++ a111 = (a93 + 3); +++ *(a111) = s27; +++ s28 = _mm_unpacklo_epi8(a108, a109); +++ s29 = _mm_unpackhi_epi8(a108, a109); +++ a112 = (a95 + 2); +++ *(a112) = s28; +++ a113 = (a95 + 3); +++ *(a113) = s29; +++ if ((((unsigned char*)Y)[0] > 210)) { +++ __m128i m5, m6; +++ m5 = ((__m128i*)Y)[0]; +++ m5 = _mm_min_epu8(m5, ((__m128i*)Y)[1]); +++ m5 = _mm_min_epu8(m5, ((__m128i*)Y)[2]); +++ m5 = _mm_min_epu8(m5, ((__m128i*)Y)[3]); +++ __m128i m7; +++ m7 = _mm_min_epu8(_mm_srli_si128(m5, 8), m5); +++ m7 = +++ ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m7, 32)), ((__m128i)m7))); +++ m7 = +++ ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m7, 16)), ((__m128i)m7))); +++ m7 = ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m7, 8)), ((__m128i)m7))); +++ m7 = _mm_unpacklo_epi8(m7, m7); +++ m7 = _mm_shufflelo_epi16(m7, _MM_SHUFFLE(0, 0, 0, 0)); +++ m6 = _mm_unpacklo_epi64(m7, m7); +++ ((__m128i*)Y)[0] = _mm_subs_epu8(((__m128i*)Y)[0], m6); +++ ((__m128i*)Y)[1] = _mm_subs_epu8(((__m128i*)Y)[1], m6); +++ ((__m128i*)Y)[2] = _mm_subs_epu8(((__m128i*)Y)[2], m6); +++ ((__m128i*)Y)[3] = _mm_subs_epu8(((__m128i*)Y)[3], m6); +++ } +++ unsigned char a188, a194; +++ int a186, a205; +++ short int s48, s49, s54, s55; +++ unsigned char *a187, *a193, *b15; +++ short int *a204, *a206, *a207, *a223, *a224, *b16; +++ __m128i *a184, *a185, *a190, *a196, *a208, *a209, *a210, *a211, *a212, *a215, +++ *a225, *a226; +++ __m128i a199, a200, a218, a219; +++ __m128i a189, a191, a192, a195, a197, a198, a201, a202, a203, a213, a214, a216, +++ a217, a220, a221, a222, d17, d18, d19, d20, m39, m40, m41, m42, m43, m44, m45, +++ m46, s46, s47, s50, s51, s52, s53, s56, s57, t25, t26, t27, t28, t29, t30; +++ a184 = ((__m128i*)Y); +++ s46 = *(a184); +++ a185 = (a184 + 2); +++ s47 = *(a185); +++ a186 = (4 * i9); +++ b15 = (a186 + syms); +++ a187 = (b15 + 2); +++ a188 = *(a187); +++ a189 = _mm_set1_epi8(a188); +++ a190 = ((__m128i*)Branchtab); +++ a191 = *(a190); +++ a192 = _mm_xor_si128(a189, a191); +++ a193 = (b15 + 3); +++ a194 = *(a193); +++ a195 = _mm_set1_epi8(a194); +++ a196 = (a190 + 2); +++ a197 = *(a196); +++ a198 = _mm_xor_si128(a195, a197); +++ t25 = _mm_avg_epu8(a192, a198); +++ a199 = ((__m128i)t25); +++ a200 = _mm_srli_epi16(a199, 2); +++ a201 = ((__m128i)a200); +++ t26 = _mm_and_si128( +++ a201, +++ _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63)); +++ t27 = _mm_subs_epu8( +++ _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63), +++ t26); +++ m39 = _mm_adds_epu8(s46, t26); +++ 
m40 = _mm_adds_epu8(s47, t27); +++ m41 = _mm_adds_epu8(s46, t27); +++ m42 = _mm_adds_epu8(s47, t26); +++ a202 = _mm_min_epu8(m40, m39); +++ d17 = _mm_cmpeq_epi8(a202, m40); +++ a203 = _mm_min_epu8(m42, m41); +++ d18 = _mm_cmpeq_epi8(a203, m42); +++ s48 = _mm_movemask_epi8(_mm_unpacklo_epi8(d17, d18)); +++ a204 = ((short int*)dec); +++ a205 = (8 * i9); +++ b16 = (a204 + a205); +++ a206 = (b16 + 4); +++ *(a206) = s48; +++ s49 = _mm_movemask_epi8(_mm_unpackhi_epi8(d17, d18)); +++ a207 = (b16 + 5); +++ *(a207) = s49; +++ s50 = _mm_unpacklo_epi8(a202, a203); +++ s51 = _mm_unpackhi_epi8(a202, a203); +++ a208 = ((__m128i*)X); +++ *(a208) = s50; +++ a209 = (a208 + 1); +++ *(a209) = s51; +++ a210 = (a184 + 1); +++ s52 = *(a210); +++ a211 = (a184 + 3); +++ s53 = *(a211); +++ a212 = (a190 + 1); +++ a213 = *(a212); +++ a214 = _mm_xor_si128(a189, a213); +++ a215 = (a190 + 3); +++ a216 = *(a215); +++ a217 = _mm_xor_si128(a195, a216); +++ t28 = _mm_avg_epu8(a214, a217); +++ a218 = ((__m128i)t28); +++ a219 = _mm_srli_epi16(a218, 2); +++ a220 = ((__m128i)a219); +++ t29 = _mm_and_si128( +++ a220, +++ _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63)); +++ t30 = _mm_subs_epu8( +++ _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63), +++ t29); +++ m43 = _mm_adds_epu8(s52, t29); +++ m44 = _mm_adds_epu8(s53, t30); +++ m45 = _mm_adds_epu8(s52, t30); +++ m46 = _mm_adds_epu8(s53, t29); +++ a221 = _mm_min_epu8(m44, m43); +++ d19 = _mm_cmpeq_epi8(a221, m44); +++ a222 = _mm_min_epu8(m46, m45); +++ d20 = _mm_cmpeq_epi8(a222, m46); +++ s54 = _mm_movemask_epi8(_mm_unpacklo_epi8(d19, d20)); +++ a223 = (b16 + 6); +++ *(a223) = s54; +++ s55 = _mm_movemask_epi8(_mm_unpackhi_epi8(d19, d20)); +++ a224 = (b16 + 7); +++ *(a224) = s55; +++ s56 = _mm_unpacklo_epi8(a221, a222); +++ s57 = _mm_unpackhi_epi8(a221, a222); +++ a225 = (a208 + 2); +++ *(a225) = s56; +++ a226 = (a208 + 3); +++ *(a226) = s57; +++ if ((((unsigned char*)X)[0] > 210)) { +++ __m128i m12, m13; +++ m12 = ((__m128i*)X)[0]; +++ m12 = _mm_min_epu8(m12, ((__m128i*)X)[1]); +++ m12 = _mm_min_epu8(m12, ((__m128i*)X)[2]); +++ m12 = _mm_min_epu8(m12, ((__m128i*)X)[3]); +++ __m128i m14; +++ m14 = _mm_min_epu8(_mm_srli_si128(m12, 8), m12); +++ m14 = ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m14, 32)), +++ ((__m128i)m14))); +++ m14 = ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m14, 16)), +++ ((__m128i)m14))); +++ m14 = ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m14, 8)), +++ ((__m128i)m14))); +++ m14 = _mm_unpacklo_epi8(m14, m14); +++ m14 = _mm_shufflelo_epi16(m14, _MM_SHUFFLE(0, 0, 0, 0)); +++ m13 = _mm_unpacklo_epi64(m14, m14); +++ ((__m128i*)X)[0] = _mm_subs_epu8(((__m128i*)X)[0], m13); +++ ((__m128i*)X)[1] = _mm_subs_epu8(((__m128i*)X)[1], m13); +++ ((__m128i*)X)[2] = _mm_subs_epu8(((__m128i*)X)[2], m13); +++ ((__m128i*)X)[3] = _mm_subs_epu8(((__m128i*)X)[3], m13); +++ } ++ } ++- } ++- ++- renormalize(X, 210); ++ ++- /*int ch; ++- for(ch = 0; ch < 64; ch++) { ++- printf("%d,", X[ch]); ++- } ++- printf("\n");*/ ++- ++- unsigned int j; ++- for(j=0; j < (framebits + excess) % 2; ++j) { ++- int i; ++- for(i=0;i<64/2;i++){ ++- BFLY(i, (((framebits+excess) >> 1) << 1) + j , syms, Y, X, (decision_t *)dec, Branchtab); ++- } +++ renormalize(X, 210); ++ ++- ++- renormalize(Y, 210); ++- ++- /*printf("\n"); +++ /*int ch; ++ for(ch = 0; ch < 64; ch++) { ++- printf("%d,", Y[ch]); +++ printf("%d,", X[ch]); ++ } ++ printf("\n");*/ ++ ++- } ++- /*skip*/ +++ unsigned int j; +++ for (j = 0; j < (framebits + excess) % 2; 
++j) { +++ int i; +++ for (i = 0; i < 64 / 2; i++) { +++ BFLY(i, +++ (((framebits + excess) >> 1) << 1) + j, +++ syms, +++ Y, +++ X, +++ (decision_t*)dec, +++ Branchtab); +++ } +++ +++ +++ renormalize(Y, 210); +++ +++ /*printf("\n"); +++ for(ch = 0; ch < 64; ch++) { +++ printf("%d,", Y[ch]); +++ } +++ printf("\n");*/ +++ } +++ /*skip*/ ++ } ++ ++ #endif /*LV_HAVE_SSE3*/ ++@@ -612,30 +633,32 @@ volk_8u_x4_conv_k7_r2_8u_spiral(unsigned char* Y, unsigned char* X, ++ ++ #if LV_HAVE_GENERIC ++ ++-static inline void ++-volk_8u_x4_conv_k7_r2_8u_generic(unsigned char* Y, unsigned char* X, ++- unsigned char* syms, unsigned char* dec, ++- unsigned int framebits, unsigned int excess, ++- unsigned char* Branchtab) +++static inline void volk_8u_x4_conv_k7_r2_8u_generic(unsigned char* Y, +++ unsigned char* X, +++ unsigned char* syms, +++ unsigned char* dec, +++ unsigned int framebits, +++ unsigned int excess, +++ unsigned char* Branchtab) ++ { ++- int nbits = framebits + excess; ++- int NUMSTATES = 64; ++- int RENORMALIZE_THRESHOLD = 210; ++- ++- int s,i; ++- for (s=0;s init_test_list(volk_test_params_t test_params) ++@@ -32,127 +37,135 @@ std::vector init_test_list(volk_test_params_t test_params) ++ test_params_rotator.set_tol(1e-3); ++ ++ std::vector test_cases; ++- QA(VOLK_INIT_PUPP(volk_64u_popcntpuppet_64u, volk_64u_popcnt, test_params)) ++- QA(VOLK_INIT_PUPP(volk_64u_popcntpuppet_64u, volk_64u_popcnt, test_params)) ++- QA(VOLK_INIT_PUPP(volk_64u_popcntpuppet_64u, volk_64u_popcnt, test_params)) +++ QA(VOLK_INIT_PUPP(volk_64u_popcntpuppet_64u, volk_64u_popcnt, test_params)) +++ QA(VOLK_INIT_PUPP(volk_64u_popcntpuppet_64u, volk_64u_popcnt, test_params)) +++ QA(VOLK_INIT_PUPP(volk_64u_popcntpuppet_64u, volk_64u_popcnt, test_params)) ++ QA(VOLK_INIT_PUPP(volk_16u_byteswappuppet_16u, volk_16u_byteswap, test_params)) ++ QA(VOLK_INIT_PUPP(volk_32u_byteswappuppet_32u, volk_32u_byteswap, test_params)) ++- QA(VOLK_INIT_PUPP(volk_32u_popcntpuppet_32u, volk_32u_popcnt_32u, test_params)) +++ QA(VOLK_INIT_PUPP(volk_32u_popcntpuppet_32u, volk_32u_popcnt_32u, test_params)) ++ QA(VOLK_INIT_PUPP(volk_64u_byteswappuppet_64u, volk_64u_byteswap, test_params)) ++- QA(VOLK_INIT_PUPP(volk_32fc_s32fc_rotatorpuppet_32fc, volk_32fc_s32fc_x2_rotator_32fc, test_params_rotator)) ++- QA(VOLK_INIT_PUPP(volk_8u_conv_k7_r2puppet_8u, volk_8u_x4_conv_k7_r2_8u, test_params.make_tol(0))) ++- QA(VOLK_INIT_PUPP(volk_32f_x2_fm_detectpuppet_32f, volk_32f_s32f_32f_fm_detect_32f, test_params)) ++- QA(VOLK_INIT_TEST(volk_16ic_s32f_deinterleave_real_32f, test_params)) ++- QA(VOLK_INIT_TEST(volk_16ic_deinterleave_real_8i, test_params)) ++- QA(VOLK_INIT_TEST(volk_16ic_deinterleave_16i_x2, test_params)) ++- QA(VOLK_INIT_TEST(volk_16ic_s32f_deinterleave_32f_x2, test_params)) ++- QA(VOLK_INIT_TEST(volk_16ic_deinterleave_real_16i, test_params)) ++- QA(VOLK_INIT_TEST(volk_16ic_magnitude_16i, test_params)) ++- QA(VOLK_INIT_TEST(volk_16ic_s32f_magnitude_32f, test_params)) ++- QA(VOLK_INIT_TEST(volk_16ic_convert_32fc, test_params)) ++- QA(VOLK_INIT_TEST(volk_16ic_x2_multiply_16ic, test_params)) ++- QA(VOLK_INIT_TEST(volk_16ic_x2_dot_prod_16ic, test_params)) ++- QA(VOLK_INIT_TEST(volk_16i_s32f_convert_32f, test_params)) ++- QA(VOLK_INIT_TEST(volk_16i_convert_8i, test_params)) ++- QA(VOLK_INIT_TEST(volk_16i_32fc_dot_prod_32fc, test_params_inacc)) ++- QA(VOLK_INIT_TEST(volk_32f_accumulator_s32f, test_params_inacc)) ++- QA(VOLK_INIT_TEST(volk_32f_x2_add_32f, test_params)) ++- QA(VOLK_INIT_TEST(volk_32f_index_max_16u, test_params)) ++- 
QA(VOLK_INIT_TEST(volk_32f_index_max_32u, test_params)) ++- QA(VOLK_INIT_TEST(volk_32fc_32f_multiply_32fc, test_params)) ++- QA(VOLK_INIT_TEST(volk_32fc_32f_add_32fc, test_params)) ++- QA(VOLK_INIT_TEST(volk_32f_log2_32f, test_params.make_absolute(1e-5))) ++- QA(VOLK_INIT_TEST(volk_32f_expfast_32f, test_params_inacc_tenth)) ++- QA(VOLK_INIT_TEST(volk_32f_x2_pow_32f, test_params_inacc)) ++- QA(VOLK_INIT_TEST(volk_32f_sin_32f, test_params_inacc)) ++- QA(VOLK_INIT_TEST(volk_32f_cos_32f, test_params_inacc)) ++- QA(VOLK_INIT_TEST(volk_32f_tan_32f, test_params_inacc)) ++- QA(VOLK_INIT_TEST(volk_32f_atan_32f, test_params_inacc)) ++- QA(VOLK_INIT_TEST(volk_32f_asin_32f, test_params_inacc)) ++- QA(VOLK_INIT_TEST(volk_32f_acos_32f, test_params_inacc)) ++- QA(VOLK_INIT_TEST(volk_32fc_s32f_power_32fc, test_params_power)) ++- QA(VOLK_INIT_TEST(volk_32f_s32f_calc_spectral_noise_floor_32f, test_params_inacc)) ++- QA(VOLK_INIT_TEST(volk_32fc_s32f_atan2_32f, test_params)) ++- QA(VOLK_INIT_TEST(volk_32fc_x2_conjugate_dot_prod_32fc, test_params_inacc_tenth)) ++- QA(VOLK_INIT_TEST(volk_32fc_deinterleave_32f_x2, test_params)) ++- QA(VOLK_INIT_TEST(volk_32fc_deinterleave_64f_x2, test_params)) ++- QA(VOLK_INIT_TEST(volk_32fc_s32f_deinterleave_real_16i, test_params)) ++- QA(VOLK_INIT_TEST(volk_32fc_deinterleave_imag_32f, test_params)) ++- QA(VOLK_INIT_TEST(volk_32fc_deinterleave_real_32f, test_params)) ++- QA(VOLK_INIT_TEST(volk_32fc_deinterleave_real_64f, test_params)) ++- QA(VOLK_INIT_TEST(volk_32fc_x2_dot_prod_32fc, test_params_inacc)) ++- QA(VOLK_INIT_TEST(volk_32fc_32f_dot_prod_32fc, test_params_inacc)) ++- QA(VOLK_INIT_TEST(volk_32fc_index_max_16u, test_params)) ++- QA(VOLK_INIT_TEST(volk_32fc_index_max_32u, test_params)) ++- QA(VOLK_INIT_TEST(volk_32fc_s32f_magnitude_16i, test_params)) ++- QA(VOLK_INIT_TEST(volk_32fc_magnitude_32f, test_params_inacc_tenth)) ++- QA(VOLK_INIT_TEST(volk_32fc_magnitude_squared_32f, test_params)) ++- QA(VOLK_INIT_TEST(volk_32fc_x2_add_32fc, test_params)) ++- QA(VOLK_INIT_TEST(volk_32fc_x2_multiply_32fc, test_params)) ++- QA(VOLK_INIT_TEST(volk_32fc_x2_multiply_conjugate_32fc, test_params)) ++- QA(VOLK_INIT_TEST(volk_32fc_x2_divide_32fc, test_params)) ++- QA(VOLK_INIT_TEST(volk_32fc_conjugate_32fc, test_params)) ++- QA(VOLK_INIT_TEST(volk_32f_s32f_convert_16i, test_params)) ++- QA(VOLK_INIT_TEST(volk_32f_s32f_convert_32i, test_params)) ++- QA(VOLK_INIT_TEST(volk_32f_convert_64f, test_params)) ++- QA(VOLK_INIT_TEST(volk_32f_s32f_convert_8i, test_params)) ++- QA(VOLK_INIT_TEST(volk_32fc_convert_16ic, test_params)) ++- QA(VOLK_INIT_TEST(volk_32fc_s32f_power_spectrum_32f, test_params)) ++- QA(VOLK_INIT_TEST(volk_32fc_x2_square_dist_32f, test_params)) ++- QA(VOLK_INIT_TEST(volk_32fc_x2_s32f_square_dist_scalar_mult_32f, test_params)) ++- QA(VOLK_INIT_TEST(volk_32f_x2_divide_32f, test_params)) ++- QA(VOLK_INIT_TEST(volk_32f_x2_dot_prod_32f, test_params_inacc)) ++- QA(VOLK_INIT_TEST(volk_32f_x2_s32f_interleave_16ic, test_params)) ++- QA(VOLK_INIT_TEST(volk_32f_x2_interleave_32fc, test_params)) ++- QA(VOLK_INIT_TEST(volk_32f_x2_max_32f, test_params)) ++- QA(VOLK_INIT_TEST(volk_32f_x2_min_32f, test_params)) ++- QA(VOLK_INIT_TEST(volk_32f_x2_multiply_32f, test_params)) ++- QA(VOLK_INIT_TEST(volk_32f_64f_multiply_64f, test_params)) ++- QA(VOLK_INIT_TEST(volk_32f_64f_add_64f, test_params)) ++- QA(VOLK_INIT_TEST(volk_32f_s32f_normalize, test_params)) ++- QA(VOLK_INIT_TEST(volk_32f_s32f_power_32f, test_params)) ++- QA(VOLK_INIT_TEST(volk_32f_sqrt_32f, test_params_inacc)) ++- 
QA(VOLK_INIT_TEST(volk_32f_s32f_stddev_32f, test_params_inacc)) ++- QA(VOLK_INIT_TEST(volk_32f_stddev_and_mean_32f_x2, test_params_inacc)) ++- QA(VOLK_INIT_TEST(volk_32f_x2_subtract_32f, test_params)) ++- QA(VOLK_INIT_TEST(volk_32f_x3_sum_of_poly_32f, test_params_inacc)) ++- QA(VOLK_INIT_TEST(volk_32i_x2_and_32i, test_params)) ++- QA(VOLK_INIT_TEST(volk_32i_s32f_convert_32f, test_params)) ++- QA(VOLK_INIT_TEST(volk_32i_x2_or_32i, test_params)) ++- QA(VOLK_INIT_TEST(volk_32f_x2_dot_prod_16i, test_params)) ++- QA(VOLK_INIT_TEST(volk_64f_convert_32f, test_params)) ++- QA(VOLK_INIT_TEST(volk_64f_x2_max_64f, test_params)) ++- QA(VOLK_INIT_TEST(volk_64f_x2_min_64f, test_params)) ++- QA(VOLK_INIT_TEST(volk_64f_x2_multiply_64f, test_params)) ++- QA(VOLK_INIT_TEST(volk_64f_x2_add_64f, test_params)) ++- QA(VOLK_INIT_TEST(volk_8ic_deinterleave_16i_x2, test_params)) ++- QA(VOLK_INIT_TEST(volk_8ic_s32f_deinterleave_32f_x2, test_params)) ++- QA(VOLK_INIT_TEST(volk_8ic_deinterleave_real_16i, test_params)) ++- QA(VOLK_INIT_TEST(volk_8ic_s32f_deinterleave_real_32f, test_params)) ++- QA(VOLK_INIT_TEST(volk_8ic_deinterleave_real_8i, test_params)) ++- QA(VOLK_INIT_TEST(volk_8ic_x2_multiply_conjugate_16ic, test_params)) ++- QA(VOLK_INIT_TEST(volk_8ic_x2_s32f_multiply_conjugate_32fc, test_params)) ++- QA(VOLK_INIT_TEST(volk_8i_convert_16i, test_params)) ++- QA(VOLK_INIT_TEST(volk_8i_s32f_convert_32f, test_params)) ++- QA(VOLK_INIT_TEST(volk_32fc_s32fc_multiply_32fc, test_params)) ++- QA(VOLK_INIT_TEST(volk_32f_s32f_multiply_32f, test_params)) ++- QA(VOLK_INIT_TEST(volk_32f_binary_slicer_32i, test_params)) ++- QA(VOLK_INIT_TEST(volk_32f_binary_slicer_8i, test_params)) ++- QA(VOLK_INIT_TEST(volk_32u_reverse_32u, test_params)) ++- QA(VOLK_INIT_TEST(volk_32f_tanh_32f, test_params_inacc)) ++- QA(VOLK_INIT_TEST(volk_32f_s32f_mod_rangepuppet_32f, test_params)) +++ QA(VOLK_INIT_PUPP(volk_32fc_s32fc_rotatorpuppet_32fc, +++ volk_32fc_s32fc_x2_rotator_32fc, +++ test_params_rotator)) +++ QA(VOLK_INIT_PUPP( +++ volk_8u_conv_k7_r2puppet_8u, volk_8u_x4_conv_k7_r2_8u, test_params.make_tol(0))) +++ QA(VOLK_INIT_PUPP( +++ volk_32f_x2_fm_detectpuppet_32f, volk_32f_s32f_32f_fm_detect_32f, test_params)) +++ QA(VOLK_INIT_TEST(volk_16ic_s32f_deinterleave_real_32f, test_params)) +++ QA(VOLK_INIT_TEST(volk_16ic_deinterleave_real_8i, test_params)) +++ QA(VOLK_INIT_TEST(volk_16ic_deinterleave_16i_x2, test_params)) +++ QA(VOLK_INIT_TEST(volk_16ic_s32f_deinterleave_32f_x2, test_params)) +++ QA(VOLK_INIT_TEST(volk_16ic_deinterleave_real_16i, test_params)) +++ QA(VOLK_INIT_TEST(volk_16ic_magnitude_16i, test_params)) +++ QA(VOLK_INIT_TEST(volk_16ic_s32f_magnitude_32f, test_params)) +++ QA(VOLK_INIT_TEST(volk_16ic_convert_32fc, test_params)) +++ QA(VOLK_INIT_TEST(volk_16ic_x2_multiply_16ic, test_params)) +++ QA(VOLK_INIT_TEST(volk_16ic_x2_dot_prod_16ic, test_params)) +++ QA(VOLK_INIT_TEST(volk_16i_s32f_convert_32f, test_params)) +++ QA(VOLK_INIT_TEST(volk_16i_convert_8i, test_params)) +++ QA(VOLK_INIT_TEST(volk_16i_32fc_dot_prod_32fc, test_params_inacc)) +++ QA(VOLK_INIT_TEST(volk_32f_accumulator_s32f, test_params_inacc)) +++ QA(VOLK_INIT_TEST(volk_32f_x2_add_32f, test_params)) +++ QA(VOLK_INIT_TEST(volk_32f_index_max_16u, test_params)) +++ QA(VOLK_INIT_TEST(volk_32f_index_max_32u, test_params)) +++ QA(VOLK_INIT_TEST(volk_32fc_32f_multiply_32fc, test_params)) +++ QA(VOLK_INIT_TEST(volk_32fc_32f_add_32fc, test_params)) +++ QA(VOLK_INIT_TEST(volk_32f_log2_32f, test_params.make_absolute(1e-5))) +++ QA(VOLK_INIT_TEST(volk_32f_expfast_32f, 
test_params_inacc_tenth)) +++ QA(VOLK_INIT_TEST(volk_32f_x2_pow_32f, test_params_inacc)) +++ QA(VOLK_INIT_TEST(volk_32f_sin_32f, test_params_inacc)) +++ QA(VOLK_INIT_TEST(volk_32f_cos_32f, test_params_inacc)) +++ QA(VOLK_INIT_TEST(volk_32f_tan_32f, test_params_inacc)) +++ QA(VOLK_INIT_TEST(volk_32f_atan_32f, test_params_inacc)) +++ QA(VOLK_INIT_TEST(volk_32f_asin_32f, test_params_inacc)) +++ QA(VOLK_INIT_TEST(volk_32f_acos_32f, test_params_inacc)) +++ QA(VOLK_INIT_TEST(volk_32fc_s32f_power_32fc, test_params_power)) +++ QA(VOLK_INIT_TEST(volk_32f_s32f_calc_spectral_noise_floor_32f, test_params_inacc)) +++ QA(VOLK_INIT_TEST(volk_32fc_s32f_atan2_32f, test_params)) +++ QA(VOLK_INIT_TEST(volk_32fc_x2_conjugate_dot_prod_32fc, test_params_inacc_tenth)) +++ QA(VOLK_INIT_TEST(volk_32fc_deinterleave_32f_x2, test_params)) +++ QA(VOLK_INIT_TEST(volk_32fc_deinterleave_64f_x2, test_params)) +++ QA(VOLK_INIT_TEST(volk_32fc_s32f_deinterleave_real_16i, test_params)) +++ QA(VOLK_INIT_TEST(volk_32fc_deinterleave_imag_32f, test_params)) +++ QA(VOLK_INIT_TEST(volk_32fc_deinterleave_real_32f, test_params)) +++ QA(VOLK_INIT_TEST(volk_32fc_deinterleave_real_64f, test_params)) +++ QA(VOLK_INIT_TEST(volk_32fc_x2_dot_prod_32fc, test_params_inacc)) +++ QA(VOLK_INIT_TEST(volk_32fc_32f_dot_prod_32fc, test_params_inacc)) +++ QA(VOLK_INIT_TEST(volk_32fc_index_max_16u, test_params)) +++ QA(VOLK_INIT_TEST(volk_32fc_index_max_32u, test_params)) +++ QA(VOLK_INIT_TEST(volk_32fc_s32f_magnitude_16i, test_params)) +++ QA(VOLK_INIT_TEST(volk_32fc_magnitude_32f, test_params_inacc_tenth)) +++ QA(VOLK_INIT_TEST(volk_32fc_magnitude_squared_32f, test_params)) +++ QA(VOLK_INIT_TEST(volk_32fc_x2_add_32fc, test_params)) +++ QA(VOLK_INIT_TEST(volk_32fc_x2_multiply_32fc, test_params)) +++ QA(VOLK_INIT_TEST(volk_32fc_x2_multiply_conjugate_32fc, test_params)) +++ QA(VOLK_INIT_TEST(volk_32fc_x2_divide_32fc, test_params)) +++ QA(VOLK_INIT_TEST(volk_32fc_conjugate_32fc, test_params)) +++ QA(VOLK_INIT_TEST(volk_32f_s32f_convert_16i, test_params)) +++ QA(VOLK_INIT_TEST(volk_32f_s32f_convert_32i, test_params)) +++ QA(VOLK_INIT_TEST(volk_32f_convert_64f, test_params)) +++ QA(VOLK_INIT_TEST(volk_32f_s32f_convert_8i, test_params)) +++ QA(VOLK_INIT_TEST(volk_32fc_convert_16ic, test_params)) +++ QA(VOLK_INIT_TEST(volk_32fc_s32f_power_spectrum_32f, test_params)) +++ QA(VOLK_INIT_TEST(volk_32fc_x2_square_dist_32f, test_params)) +++ QA(VOLK_INIT_TEST(volk_32fc_x2_s32f_square_dist_scalar_mult_32f, test_params)) +++ QA(VOLK_INIT_TEST(volk_32f_x2_divide_32f, test_params)) +++ QA(VOLK_INIT_TEST(volk_32f_x2_dot_prod_32f, test_params_inacc)) +++ QA(VOLK_INIT_TEST(volk_32f_x2_s32f_interleave_16ic, test_params)) +++ QA(VOLK_INIT_TEST(volk_32f_x2_interleave_32fc, test_params)) +++ QA(VOLK_INIT_TEST(volk_32f_x2_max_32f, test_params)) +++ QA(VOLK_INIT_TEST(volk_32f_x2_min_32f, test_params)) +++ QA(VOLK_INIT_TEST(volk_32f_x2_multiply_32f, test_params)) +++ QA(VOLK_INIT_TEST(volk_32f_64f_multiply_64f, test_params)) +++ QA(VOLK_INIT_TEST(volk_32f_64f_add_64f, test_params)) +++ QA(VOLK_INIT_TEST(volk_32f_s32f_normalize, test_params)) +++ QA(VOLK_INIT_TEST(volk_32f_s32f_power_32f, test_params)) +++ QA(VOLK_INIT_TEST(volk_32f_sqrt_32f, test_params_inacc)) +++ QA(VOLK_INIT_TEST(volk_32f_s32f_stddev_32f, test_params_inacc)) +++ QA(VOLK_INIT_TEST(volk_32f_stddev_and_mean_32f_x2, test_params_inacc)) +++ QA(VOLK_INIT_TEST(volk_32f_x2_subtract_32f, test_params)) +++ QA(VOLK_INIT_TEST(volk_32f_x3_sum_of_poly_32f, test_params_inacc)) +++ QA(VOLK_INIT_TEST(volk_32i_x2_and_32i, 
test_params)) +++ QA(VOLK_INIT_TEST(volk_32i_s32f_convert_32f, test_params)) +++ QA(VOLK_INIT_TEST(volk_32i_x2_or_32i, test_params)) +++ QA(VOLK_INIT_TEST(volk_32f_x2_dot_prod_16i, test_params)) +++ QA(VOLK_INIT_TEST(volk_64f_convert_32f, test_params)) +++ QA(VOLK_INIT_TEST(volk_64f_x2_max_64f, test_params)) +++ QA(VOLK_INIT_TEST(volk_64f_x2_min_64f, test_params)) +++ QA(VOLK_INIT_TEST(volk_64f_x2_multiply_64f, test_params)) +++ QA(VOLK_INIT_TEST(volk_64f_x2_add_64f, test_params)) +++ QA(VOLK_INIT_TEST(volk_8ic_deinterleave_16i_x2, test_params)) +++ QA(VOLK_INIT_TEST(volk_8ic_s32f_deinterleave_32f_x2, test_params)) +++ QA(VOLK_INIT_TEST(volk_8ic_deinterleave_real_16i, test_params)) +++ QA(VOLK_INIT_TEST(volk_8ic_s32f_deinterleave_real_32f, test_params)) +++ QA(VOLK_INIT_TEST(volk_8ic_deinterleave_real_8i, test_params)) +++ QA(VOLK_INIT_TEST(volk_8ic_x2_multiply_conjugate_16ic, test_params)) +++ QA(VOLK_INIT_TEST(volk_8ic_x2_s32f_multiply_conjugate_32fc, test_params)) +++ QA(VOLK_INIT_TEST(volk_8i_convert_16i, test_params)) +++ QA(VOLK_INIT_TEST(volk_8i_s32f_convert_32f, test_params)) +++ QA(VOLK_INIT_TEST(volk_32fc_s32fc_multiply_32fc, test_params)) +++ QA(VOLK_INIT_TEST(volk_32f_s32f_multiply_32f, test_params)) +++ QA(VOLK_INIT_TEST(volk_32f_binary_slicer_32i, test_params)) +++ QA(VOLK_INIT_TEST(volk_32f_binary_slicer_8i, test_params)) +++ QA(VOLK_INIT_TEST(volk_32u_reverse_32u, test_params)) +++ QA(VOLK_INIT_TEST(volk_32f_tanh_32f, test_params_inacc)) +++ QA(VOLK_INIT_TEST(volk_32f_s32f_mod_rangepuppet_32f, test_params)) ++ QA(VOLK_INIT_TEST(volk_32fc_x2_s32fc_multiply_conjugate_add_32fc, test_params)) ++- QA(VOLK_INIT_PUPP(volk_8u_x3_encodepolarpuppet_8u, volk_8u_x3_encodepolar_8u_x2, test_params)) ++- QA(VOLK_INIT_PUPP(volk_32f_8u_polarbutterflypuppet_32f, volk_32f_8u_polarbutterfly_32f, test_params)) ++- QA(VOLK_INIT_TEST(volk_32f_exp_32f, test_params)) ++- +++ QA(VOLK_INIT_PUPP( +++ volk_8u_x3_encodepolarpuppet_8u, volk_8u_x3_encodepolar_8u_x2, test_params)) +++ QA(VOLK_INIT_PUPP(volk_32f_8u_polarbutterflypuppet_32f, +++ volk_32f_8u_polarbutterfly_32f, +++ test_params)) ++ // no one uses these, so don't test them ++- //VOLK_PROFILE(volk_16i_x5_add_quad_16i_x4, 1e-4, 2046, 10000, &results, benchmark_mode, kernel_regex); ++- //VOLK_PROFILE(volk_16i_branch_4_state_8, 1e-4, 2046, 10000, &results, benchmark_mode, kernel_regex); ++- //VOLK_PROFILE(volk_16i_max_star_16i, 0, 0, 204602, 10000, &results, benchmark_mode, kernel_regex); ++- //VOLK_PROFILE(volk_16i_max_star_horizontal_16i, 0, 0, 204602, 10000, &results, benchmark_mode, kernel_regex); ++- //VOLK_PROFILE(volk_16i_permute_and_scalar_add, 1e-4, 0, 2046, 10000, &results, benchmark_mode, kernel_regex); ++- //VOLK_PROFILE(volk_16i_x4_quad_max_star_16i, 1e-4, 0, 2046, 10000, &results, benchmark_mode, kernel_regex); +++ // VOLK_PROFILE(volk_16i_x5_add_quad_16i_x4, 1e-4, 2046, 10000, &results, +++ // benchmark_mode, kernel_regex); VOLK_PROFILE(volk_16i_branch_4_state_8, 1e-4, 2046, +++ // 10000, &results, benchmark_mode, kernel_regex); VOLK_PROFILE(volk_16i_max_star_16i, +++ // 0, 0, 204602, 10000, &results, benchmark_mode, kernel_regex); +++ // VOLK_PROFILE(volk_16i_max_star_horizontal_16i, 0, 0, 204602, 10000, &results, +++ // benchmark_mode, kernel_regex); VOLK_PROFILE(volk_16i_permute_and_scalar_add, 1e-4, +++ // 0, 2046, 10000, &results, benchmark_mode, kernel_regex); +++ // VOLK_PROFILE(volk_16i_x4_quad_max_star_16i, 1e-4, 0, 2046, 10000, &results, +++ // benchmark_mode, kernel_regex); ++ // we need a puppet for this one ++ 
//(VOLK_INIT_TEST(volk_32fc_s32f_x2_power_spectral_density_32f, test_params)) ++ ++diff --git a/lib/qa_utils.cc b/lib/qa_utils.cc ++index 76df069..1dcee6e 100644 ++--- a/lib/qa_utils.cc +++++ b/lib/qa_utils.cc ++@@ -1,79 +1,94 @@ ++-#include ++ #include "qa_utils.h" +++#include ++ ++-#include // for volk_func_desc_t ++-#include // for volk_free, volk_m... +++#include // for volk_func_desc_t +++#include // for volk_free, volk_m... ++ ++-#include // for assert ++-#include // for uint16_t, uint64_t ++-#include // for CLOCKS_PER_SEC ++-#include // for int16_t, int32_t +++#include // for assert +++#include // for uint16_t, uint64_t +++#include // for CLOCKS_PER_SEC +++#include // for int16_t, int32_t ++ #include ++-#include // for sqrt, fabs, abs ++-#include // for memcpy, memset ++-#include // for clock ++-#include // for operator<<, basic... ++-#include // for cout, cerr ++-#include // for numeric_limits ++-#include // for map, map<>::mappe... +++#include // for sqrt, fabs, abs +++#include // for memcpy, memset +++#include // for clock +++#include // for operator<<, basic... +++#include // for cout, cerr +++#include // for numeric_limits +++#include // for map, map<>::mappe... ++ #include ++-#include // for vector, _Bit_refe... +++#include // for vector, _Bit_refe... ++ ++ template ++-void random_floats(void *buf, unsigned int n, std::default_random_engine& rnd_engine) +++void random_floats(void* buf, unsigned int n, std::default_random_engine& rnd_engine) ++ { ++- T *array = static_cast(buf); +++ T* array = static_cast(buf); ++ std::uniform_real_distribution uniform_dist(T(-1), T(1)); ++- for(unsigned int i = 0; i < n; i++) { +++ for (unsigned int i = 0; i < n; i++) { ++ array[i] = uniform_dist(rnd_engine); ++ } ++ } ++ ++-void load_random_data(void *data, volk_type_t type, unsigned int n) { +++void load_random_data(void* data, volk_type_t type, unsigned int n) +++{ ++ std::random_device rnd_device; ++ std::default_random_engine rnd_engine(rnd_device()); ++- if(type.is_complex) n *= 2; ++- if(type.is_float) { ++- if(type.size == 8) { +++ if (type.is_complex) +++ n *= 2; +++ if (type.is_float) { +++ if (type.size == 8) { ++ random_floats(data, n, rnd_engine); ++ } else { ++- random_floats (data, n, rnd_engine); +++ random_floats(data, n, rnd_engine); ++ } ++ } else { ++- float int_max = float(uint64_t(2) << (type.size*8)); ++- if(type.is_signed) int_max /= 2.0; +++ float int_max = float(uint64_t(2) << (type.size * 8)); +++ if (type.is_signed) +++ int_max /= 2.0; ++ std::uniform_real_distribution uniform_dist(-int_max, int_max); ++- for(unsigned int i=0; i 8 or < 1"; //no shenanigans here +++ throw "load_random_data: no support for data size > 8 or < 1"; // no +++ // shenanigans +++ // here ++ } ++ } ++ } ++ } ++ ++-static std::vector get_arch_list(volk_func_desc_t desc) { +++static std::vector get_arch_list(volk_func_desc_t desc) +++{ ++ std::vector archlist; ++ ++- for(size_t i = 0; i < desc.n_impls; i++) { +++ for (size_t i = 0; i < desc.n_impls; i++) { ++ archlist.push_back(std::string(desc.impl_names[i])); ++ } ++ ++@@ -96,7 +111,8 @@ T volk_lexical_cast(const std::string& str) ++ return var; ++ } ++ ++-volk_type_t volk_type_from_string(std::string name) { +++volk_type_t volk_type_from_string(std::string name) +++{ ++ volk_type_t type; ++ type.is_float = false; ++ type.is_scalar = false; ++@@ -105,28 +121,28 @@ volk_type_t volk_type_from_string(std::string name) { ++ type.size = 0; ++ type.str = name; ++ ++- if(name.size() < 2) { +++ if (name.size() < 2) { ++ throw std::string("name too 
short to be a datatype"); ++ } ++ ++- //is it a scalar? ++- if(name[0] == 's') { +++ // is it a scalar? +++ if (name[0] == 's') { ++ type.is_scalar = true; ++- name = name.substr(1, name.size()-1); +++ name = name.substr(1, name.size() - 1); ++ } ++ ++- //get the data size +++ // get the data size ++ size_t last_size_pos = name.find_last_of("0123456789"); ++- if(last_size_pos == std::string::npos) { +++ if (last_size_pos == std::string::npos) { ++ throw std::string("no size spec in type ").append(name); ++ } ++- //will throw if malformed ++- int size = volk_lexical_cast(name.substr(0, last_size_pos+1)); +++ // will throw if malformed +++ int size = volk_lexical_cast(name.substr(0, last_size_pos + 1)); ++ ++ assert(((size % 8) == 0) && (size <= 64) && (size != 0)); ++- type.size = size/8; //in bytes +++ type.size = size / 8; // in bytes ++ ++- for(size_t i=last_size_pos+1; i < name.size(); i++) { +++ for (size_t i = last_size_pos + 1; i < name.size(); i++) { ++ switch (name[i]) { ++ case 'f': ++ type.is_float = true; ++@@ -148,7 +164,8 @@ volk_type_t volk_type_from_string(std::string name) { ++ return type; ++ } ++ ++-std::vector split_signature(const std::string &protokernel_signature) { +++std::vector split_signature(const std::string& protokernel_signature) +++{ ++ std::vector signature_tokens; ++ std::string token; ++ for (unsigned int loc = 0; loc < protokernel_signature.size(); ++loc) { ++@@ -165,16 +182,17 @@ std::vector split_signature(const std::string &protokernel_signatur ++ return signature_tokens; ++ } ++ ++-static void get_signatures_from_name(std::vector &inputsig, ++- std::vector &outputsig, ++- std::string name) { +++static void get_signatures_from_name(std::vector& inputsig, +++ std::vector& outputsig, +++ std::string name) +++{ ++ ++ std::vector toked = split_signature(name); ++ ++ assert(toked[0] == "volk"); ++ toked.erase(toked.begin()); ++ ++- //ok. we're assuming a string in the form +++ // ok. we're assuming a string in the form ++ //(sig)_(multiplier-opt)_..._(name)_(sig)_(multiplier-opt)_..._(alignment) ++ ++ enum { SIDE_INPUT, SIDE_NAME, SIDE_OUTPUT } side = SIDE_INPUT; ++@@ -184,106 +202,184 @@ static void get_signatures_from_name(std::vector &inputsig, ++ std::string token = toked[token_index]; ++ try { ++ type = volk_type_from_string(token); ++- if(side == SIDE_NAME) side = SIDE_OUTPUT; //if this is the first one after the name... 
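/* Editorial note -- illustrative sketch only, not part of the patch above.
 * The assumed naming convention documented in get_signatures_from_name()
 * ("(sig)_(multiplier-opt)_..._(name)_(sig)_..._(alignment)") can be seen with a
 * kernel that appears in this QA list, volk_32f_s32f_multiply_32f: type tokens
 * before the operation name become the input signature (a leading 's' marking a
 * scalar argument) and the type token after it becomes the output signature.
 * The self-contained demo below only mirrors that token classification; it omits
 * the "xN" multiplier handling and uses no libvolk APIs. */
#include <cctype>
#include <iostream>
#include <sstream>
#include <string>
#include <vector>

int main()
{
    // Split the kernel name on '_' the way split_signature() does.
    std::string name = "volk_32f_s32f_multiply_32f";
    std::vector<std::string> tokens;
    std::stringstream ss(name);
    for (std::string tok; std::getline(ss, tok, '_');)
        tokens.push_back(tok);

    // Type tokens seen before the operation name are inputs, type tokens seen
    // after it are outputs; anything else is treated as part of the name.
    bool seen_name = false;
    std::vector<std::string> inputs, outputs;
    for (size_t i = 1; i < tokens.size(); i++) { // skip the leading "volk"
        const std::string& t = tokens[i];
        bool is_type =
            std::isdigit((unsigned char)t[0]) ||
            (t[0] == 's' && t.size() > 1 && std::isdigit((unsigned char)t[1]));
        if (is_type)
            (seen_name ? outputs : inputs).push_back(t);
        else
            seen_name = true;
    }

    // Prints "inputs: 32f s32f | outputs: 32f" -- one input vector, one input
    // scalar (later pulled into its own vector, inputsc) and one output vector.
    std::cout << "inputs:";
    for (auto& t : inputs)
        std::cout << ' ' << t;
    std::cout << " | outputs:";
    for (auto& t : outputs)
        std::cout << ' ' << t;
    std::cout << std::endl;
    return 0;
}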
++- ++- if(side == SIDE_INPUT) inputsig.push_back(type); ++- else outputsig.push_back(type); ++- } catch (...){ ++- if(token[0] == 'x' && (token.size() > 1) && (token[1] > '0' && token[1] < '9')) { //it's a multiplier ++- if(side == SIDE_INPUT) assert(inputsig.size() > 0); ++- else assert(outputsig.size() > 0); ++- int multiplier = volk_lexical_cast(token.substr(1, token.size()-1)); //will throw if invalid ++- for(int i=1; i 1) && +++ (token[1] > '0' && token[1] < '9')) { // it's a multiplier +++ if (side == SIDE_INPUT) +++ assert(inputsig.size() > 0); +++ else +++ assert(outputsig.size() > 0); +++ int multiplier = volk_lexical_cast( +++ token.substr(1, token.size() - 1)); // will throw if invalid +++ for (int i = 1; i < multiplier; i++) { +++ if (side == SIDE_INPUT) +++ inputsig.push_back(inputsig.back()); +++ else +++ outputsig.push_back(outputsig.back()); ++ } ++- } ++- else if(side == SIDE_INPUT) { //it's the function name, at least it better be +++ } else if (side == +++ SIDE_INPUT) { // it's the function name, at least it better be ++ side = SIDE_NAME; ++ fn_name.append("_"); ++ fn_name.append(token); ++- } ++- else if(side == SIDE_OUTPUT) { ++- if(token != toked.back()) throw; //the last token in the name is the alignment +++ } else if (side == SIDE_OUTPUT) { +++ if (token != toked.back()) +++ throw; // the last token in the name is the alignment ++ } ++ } ++ } ++- //we don't need an output signature (some fn's operate on the input data, "in place"), but we do need at least one input! +++ // we don't need an output signature (some fn's operate on the input data, "in +++ // place"), but we do need at least one input! ++ assert(inputsig.size() != 0); ++- ++ } ++ ++-inline void run_cast_test1(volk_fn_1arg func, std::vector &buffs, unsigned int vlen, unsigned int iter, std::string arch) { ++- while(iter--) func(buffs[0], vlen, arch.c_str()); +++inline void run_cast_test1(volk_fn_1arg func, +++ std::vector& buffs, +++ unsigned int vlen, +++ unsigned int iter, +++ std::string arch) +++{ +++ while (iter--) +++ func(buffs[0], vlen, arch.c_str()); ++ } ++ ++-inline void run_cast_test2(volk_fn_2arg func, std::vector &buffs, unsigned int vlen, unsigned int iter, std::string arch) { ++- while(iter--) func(buffs[0], buffs[1], vlen, arch.c_str()); +++inline void run_cast_test2(volk_fn_2arg func, +++ std::vector& buffs, +++ unsigned int vlen, +++ unsigned int iter, +++ std::string arch) +++{ +++ while (iter--) +++ func(buffs[0], buffs[1], vlen, arch.c_str()); ++ } ++ ++-inline void run_cast_test3(volk_fn_3arg func, std::vector &buffs, unsigned int vlen, unsigned int iter, std::string arch) { ++- while(iter--) func(buffs[0], buffs[1], buffs[2], vlen, arch.c_str()); +++inline void run_cast_test3(volk_fn_3arg func, +++ std::vector& buffs, +++ unsigned int vlen, +++ unsigned int iter, +++ std::string arch) +++{ +++ while (iter--) +++ func(buffs[0], buffs[1], buffs[2], vlen, arch.c_str()); ++ } ++ ++-inline void run_cast_test4(volk_fn_4arg func, std::vector &buffs, unsigned int vlen, unsigned int iter, std::string arch) { ++- while(iter--) func(buffs[0], buffs[1], buffs[2], buffs[3], vlen, arch.c_str()); +++inline void run_cast_test4(volk_fn_4arg func, +++ std::vector& buffs, +++ unsigned int vlen, +++ unsigned int iter, +++ std::string arch) +++{ +++ while (iter--) +++ func(buffs[0], buffs[1], buffs[2], buffs[3], vlen, arch.c_str()); ++ } ++ ++-inline void run_cast_test1_s32f(volk_fn_1arg_s32f func, std::vector &buffs, float scalar, unsigned int vlen, unsigned int iter, std::string arch) { ++- 
while(iter--) func(buffs[0], scalar, vlen, arch.c_str()); +++inline void run_cast_test1_s32f(volk_fn_1arg_s32f func, +++ std::vector& buffs, +++ float scalar, +++ unsigned int vlen, +++ unsigned int iter, +++ std::string arch) +++{ +++ while (iter--) +++ func(buffs[0], scalar, vlen, arch.c_str()); ++ } ++ ++-inline void run_cast_test2_s32f(volk_fn_2arg_s32f func, std::vector &buffs, float scalar, unsigned int vlen, unsigned int iter, std::string arch) { ++- while(iter--) func(buffs[0], buffs[1], scalar, vlen, arch.c_str()); +++inline void run_cast_test2_s32f(volk_fn_2arg_s32f func, +++ std::vector& buffs, +++ float scalar, +++ unsigned int vlen, +++ unsigned int iter, +++ std::string arch) +++{ +++ while (iter--) +++ func(buffs[0], buffs[1], scalar, vlen, arch.c_str()); ++ } ++ ++-inline void run_cast_test3_s32f(volk_fn_3arg_s32f func, std::vector &buffs, float scalar, unsigned int vlen, unsigned int iter, std::string arch) { ++- while(iter--) func(buffs[0], buffs[1], buffs[2], scalar, vlen, arch.c_str()); +++inline void run_cast_test3_s32f(volk_fn_3arg_s32f func, +++ std::vector& buffs, +++ float scalar, +++ unsigned int vlen, +++ unsigned int iter, +++ std::string arch) +++{ +++ while (iter--) +++ func(buffs[0], buffs[1], buffs[2], scalar, vlen, arch.c_str()); ++ } ++ ++-inline void run_cast_test1_s32fc(volk_fn_1arg_s32fc func, std::vector &buffs, lv_32fc_t scalar, unsigned int vlen, unsigned int iter, std::string arch) { ++- while(iter--) func(buffs[0], scalar, vlen, arch.c_str()); +++inline void run_cast_test1_s32fc(volk_fn_1arg_s32fc func, +++ std::vector& buffs, +++ lv_32fc_t scalar, +++ unsigned int vlen, +++ unsigned int iter, +++ std::string arch) +++{ +++ while (iter--) +++ func(buffs[0], scalar, vlen, arch.c_str()); ++ } ++ ++-inline void run_cast_test2_s32fc(volk_fn_2arg_s32fc func, std::vector &buffs, lv_32fc_t scalar, unsigned int vlen, unsigned int iter, std::string arch) { ++- while(iter--) func(buffs[0], buffs[1], scalar, vlen, arch.c_str()); +++inline void run_cast_test2_s32fc(volk_fn_2arg_s32fc func, +++ std::vector& buffs, +++ lv_32fc_t scalar, +++ unsigned int vlen, +++ unsigned int iter, +++ std::string arch) +++{ +++ while (iter--) +++ func(buffs[0], buffs[1], scalar, vlen, arch.c_str()); ++ } ++ ++-inline void run_cast_test3_s32fc(volk_fn_3arg_s32fc func, std::vector &buffs, lv_32fc_t scalar, unsigned int vlen, unsigned int iter, std::string arch) { ++- while(iter--) func(buffs[0], buffs[1], buffs[2], scalar, vlen, arch.c_str()); +++inline void run_cast_test3_s32fc(volk_fn_3arg_s32fc func, +++ std::vector& buffs, +++ lv_32fc_t scalar, +++ unsigned int vlen, +++ unsigned int iter, +++ std::string arch) +++{ +++ while (iter--) +++ func(buffs[0], buffs[1], buffs[2], scalar, vlen, arch.c_str()); ++ } ++ ++ template ++-bool fcompare(t *in1, t *in2, unsigned int vlen, float tol, bool absolute_mode) { +++bool fcompare(t* in1, t* in2, unsigned int vlen, float tol, bool absolute_mode) +++{ ++ bool fail = false; ++ int print_max_errs = 10; ++- for(unsigned int i=0; i tol) { ++- fail=true; ++- if(print_max_errs-- > 0) { ++- std::cout << "offset " << i << " in1: " << t(((t *)(in1))[i]) << " in2: " << t(((t *)(in2))[i]); +++ if (fabs(((t*)(in1))[i] - ((t*)(in2))[i]) > tol) { +++ fail = true; +++ if (print_max_errs-- > 0) { +++ std::cout << "offset " << i << " in1: " << t(((t*)(in1))[i]) +++ << " in2: " << t(((t*)(in2))[i]); ++ std::cout << " tolerance was: " << tol << std::endl; ++ } ++ } ++ } else { ++ // for very small numbers we'll see round off errors due to limited ++ // 
precision. So a special test case... ++- if(fabs(((t *)(in1))[i]) < 1e-30) { ++- if( fabs( ((t *)(in2))[i] ) > tol ) ++- { ++- fail=true; ++- if(print_max_errs-- > 0) { ++- std::cout << "offset " << i << " in1: " << t(((t *)(in1))[i]) << " in2: " << t(((t *)(in2))[i]); +++ if (fabs(((t*)(in1))[i]) < 1e-30) { +++ if (fabs(((t*)(in2))[i]) > tol) { +++ fail = true; +++ if (print_max_errs-- > 0) { +++ std::cout << "offset " << i << " in1: " << t(((t*)(in1))[i]) +++ << " in2: " << t(((t*)(in2))[i]); ++ std::cout << " tolerance was: " << tol << std::endl; ++ } ++ } ++ } ++ // the primary test is the percent different greater than given tol ++- else if(fabs(((t *)(in1))[i] - ((t *)(in2))[i])/fabs(((t *)in1)[i]) > tol) { ++- fail=true; ++- if(print_max_errs-- > 0) { ++- std::cout << "offset " << i << " in1: " << t(((t *)(in1))[i]) << " in2: " << t(((t *)(in2))[i]); +++ else if (fabs(((t*)(in1))[i] - ((t*)(in2))[i]) / fabs(((t*)in1)[i]) > tol) { +++ fail = true; +++ if (print_max_errs-- > 0) { +++ std::cout << "offset " << i << " in1: " << t(((t*)(in1))[i]) +++ << " in2: " << t(((t*)(in2))[i]); ++ std::cout << " tolerance was: " << tol << std::endl; ++ } ++ } ++@@ -294,43 +390,50 @@ bool fcompare(t *in1, t *in2, unsigned int vlen, float tol, bool absolute_mode) ++ } ++ ++ template ++-bool ccompare(t *in1, t *in2, unsigned int vlen, float tol, bool absolute_mode) { +++bool ccompare(t* in1, t* in2, unsigned int vlen, float tol, bool absolute_mode) +++{ ++ if (absolute_mode) { ++- std::cout << "ccompare does not support absolute mode" << std::endl; ++- return true; +++ std::cout << "ccompare does not support absolute mode" << std::endl; +++ return true; ++ } ++ bool fail = false; ++ int print_max_errs = 10; ++- for(unsigned int i=0; i<2*vlen; i+=2) { ++- if (std::isnan(in1[i]) || std::isnan(in1[i+1]) || std::isnan(in2[i]) || std::isnan(in2[i+1]) ++- || std::isinf(in1[i]) || std::isinf(in1[i+1]) || std::isinf(in2[i]) || std::isinf(in2[i+1])) { ++- fail=true; ++- if(print_max_errs-- > 0) { ++- std::cout << "offset " << i/2 << " in1: " << in1[i] << " + " << in1[i+1] << "j in2: " << in2[i] << " + " << in2[i+1] << "j"; +++ for (unsigned int i = 0; i < 2 * vlen; i += 2) { +++ if (std::isnan(in1[i]) || std::isnan(in1[i + 1]) || std::isnan(in2[i]) || +++ std::isnan(in2[i + 1]) || std::isinf(in1[i]) || std::isinf(in1[i + 1]) || +++ std::isinf(in2[i]) || std::isinf(in2[i + 1])) { +++ fail = true; +++ if (print_max_errs-- > 0) { +++ std::cout << "offset " << i / 2 << " in1: " << in1[i] << " + " +++ << in1[i + 1] << "j in2: " << in2[i] << " + " << in2[i + 1] +++ << "j"; ++ std::cout << " tolerance was: " << tol << std::endl; ++ } ++ } ++- t diff[2] = { in1[i] - in2[i], in1[i+1] - in2[i+1] }; ++- t err = std::sqrt(diff[0] * diff[0] + diff[1] * diff[1]); ++- t norm = std::sqrt(in1[i] * in1[i] + in1[i+1] * in1[i+1]); +++ t diff[2] = { in1[i] - in2[i], in1[i + 1] - in2[i + 1] }; +++ t err = std::sqrt(diff[0] * diff[0] + diff[1] * diff[1]); +++ t norm = std::sqrt(in1[i] * in1[i] + in1[i + 1] * in1[i + 1]); ++ ++ // for very small numbers we'll see round off errors due to limited ++ // precision. So a special test case... 
++ if (norm < 1e-30) { ++- if (err > tol) ++- { ++- fail=true; ++- if(print_max_errs-- > 0) { ++- std::cout << "offset " << i/2 << " in1: " << in1[i] << " + " << in1[i+1] << "j in2: " << in2[i] << " + " << in2[i+1] << "j"; +++ if (err > tol) { +++ fail = true; +++ if (print_max_errs-- > 0) { +++ std::cout << "offset " << i / 2 << " in1: " << in1[i] << " + " +++ << in1[i + 1] << "j in2: " << in2[i] << " + " << in2[i + 1] +++ << "j"; ++ std::cout << " tolerance was: " << tol << std::endl; ++ } ++ } ++ } ++ // the primary test is the percent different greater than given tol ++- else if((err / norm) > tol) { ++- fail=true; ++- if(print_max_errs-- > 0) { ++- std::cout << "offset " << i/2 << " in1: " << in1[i] << " + " << in1[i+1] << "j in2: " << in2[i] << " + " << in2[i+1] << "j"; +++ else if ((err / norm) > tol) { +++ fail = true; +++ if (print_max_errs-- > 0) { +++ std::cout << "offset " << i / 2 << " in1: " << in1[i] << " + " +++ << in1[i + 1] << "j in2: " << in2[i] << " + " << in2[i + 1] +++ << "j"; ++ std::cout << " tolerance was: " << tol << std::endl; ++ } ++ } ++@@ -340,18 +443,21 @@ bool ccompare(t *in1, t *in2, unsigned int vlen, float tol, bool absolute_mode) ++ } ++ ++ template ++-bool icompare(t *in1, t *in2, unsigned int vlen, unsigned int tol, bool absolute_mode) { +++bool icompare(t* in1, t* in2, unsigned int vlen, unsigned int tol, bool absolute_mode) +++{ ++ if (absolute_mode) { ++- std::cout << "icompare does not support absolute mode" << std::endl; ++- return true; +++ std::cout << "icompare does not support absolute mode" << std::endl; +++ return true; ++ } ++ bool fail = false; ++ int print_max_errs = 10; ++- for(unsigned int i=0; i tol) { ++- fail=true; ++- if(print_max_errs-- > 0) { ++- std::cout << "offset " << i << " in1: " << static_cast(t(((t *)(in1))[i])) << " in2: " << static_cast(t(((t *)(in2))[i])); +++ for (unsigned int i = 0; i < vlen; i++) { +++ if (((unsigned int)abs(int(((t*)(in1))[i]) - int(((t*)(in2))[i]))) > tol) { +++ fail = true; +++ if (print_max_errs-- > 0) { +++ std::cout << "offset " << i +++ << " in1: " << static_cast(t(((t*)(in1))[i])) +++ << " in2: " << static_cast(t(((t*)(in2))[i])); ++ std::cout << " tolerance was: " << tol << std::endl; ++ } ++ } ++@@ -360,34 +466,46 @@ bool icompare(t *in1, t *in2, unsigned int vlen, unsigned int tol, bool absolute ++ return fail; ++ } ++ ++-class volk_qa_aligned_mem_pool{ +++class volk_qa_aligned_mem_pool +++{ ++ public: ++- void *get_new(size_t size){ +++ void* get_new(size_t size) +++ { ++ size_t alignment = volk_get_alignment(); ++ void* ptr = volk_malloc(size, alignment); ++ memset(ptr, 0x00, size); ++ _mems.push_back(ptr); ++ return ptr; ++ } ++- ~volk_qa_aligned_mem_pool() { ++- for(unsigned int ii = 0; ii < _mems.size(); ++ii) { +++ ~volk_qa_aligned_mem_pool() +++ { +++ for (unsigned int ii = 0; ii < _mems.size(); ++ii) { ++ volk_free(_mems[ii]); ++ } ++ } ++-private: std::vector _mems; +++ +++private: +++ std::vector _mems; ++ }; ++ ++ bool run_volk_tests(volk_func_desc_t desc, ++ void (*manual_func)(), ++ std::string name, ++ volk_test_params_t test_params, ++- std::vector *results, ++- std::string puppet_master_name ++-) +++ std::vector* results, +++ std::string puppet_master_name) ++ { ++- return run_volk_tests(desc, manual_func, name, test_params.tol(), test_params.scalar(), ++- test_params.vlen(), test_params.iter(), results, puppet_master_name, ++- test_params.absolute_mode(), test_params.benchmark_mode()); +++ return run_volk_tests(desc, +++ manual_func, +++ name, +++ test_params.tol(), +++ 
test_params.scalar(), +++ test_params.vlen(), +++ test_params.iter(), +++ results, +++ puppet_master_name, +++ test_params.absolute_mode(), +++ test_params.benchmark_mode()); ++ } ++ ++ bool run_volk_tests(volk_func_desc_t desc, ++@@ -397,17 +515,18 @@ bool run_volk_tests(volk_func_desc_t desc, ++ lv_32fc_t scalar, ++ unsigned int vlen, ++ unsigned int iter, ++- std::vector *results, +++ std::vector* results, ++ std::string puppet_master_name, ++ bool absolute_mode, ++- bool benchmark_mode ++-) { +++ bool benchmark_mode) +++{ ++ // Initialize this entry in results vector ++ results->push_back(volk_test_results_t()); ++ results->back().name = name; ++ results->back().vlen = vlen; ++ results->back().iter = iter; ++- std::cout << "RUN_VOLK_TESTS: " << name << "(" << vlen << "," << iter << ")" << std::endl; +++ std::cout << "RUN_VOLK_TESTS: " << name << "(" << vlen << "," << iter << ")" +++ << std::endl; ++ ++ // vlen_twiddle will increase vlen for malloc and data generation ++ // but kernels will still be called with the user provided vlen. ++@@ -418,57 +537,64 @@ bool run_volk_tests(volk_func_desc_t desc, ++ const float tol_f = tol; ++ const unsigned int tol_i = static_cast(tol); ++ ++- //first let's get a list of available architectures for the test +++ // first let's get a list of available architectures for the test ++ std::vector arch_list = get_arch_list(desc); ++ ++- if((!benchmark_mode) && (arch_list.size() < 2)) { +++ if ((!benchmark_mode) && (arch_list.size() < 2)) { ++ std::cout << "no architectures to test" << std::endl; ++ return false; ++ } ++ ++- //something that can hang onto memory and cleanup when this function exits +++ // something that can hang onto memory and cleanup when this function exits ++ volk_qa_aligned_mem_pool mem_pool; ++ ++- //now we have to get a function signature by parsing the name +++ // now we have to get a function signature by parsing the name ++ std::vector inputsig, outputsig; ++ try { ++ get_signatures_from_name(inputsig, outputsig, name); ++- } ++- catch (std::exception &error) { ++- std::cerr << "Error: unable to get function signature from kernel name" << std::endl; +++ } catch (std::exception& error) { +++ std::cerr << "Error: unable to get function signature from kernel name" +++ << std::endl; ++ std::cerr << " - " << name << std::endl; ++ return false; ++ } ++ ++- //pull the input scalars into their own vector +++ // pull the input scalars into their own vector ++ std::vector inputsc; ++- for(size_t i=0; i inbuffs; ++- for (unsigned int inputsig_index = 0; inputsig_index < inputsig.size(); ++ inputsig_index) { +++ std::vector inbuffs; +++ for (unsigned int inputsig_index = 0; inputsig_index < inputsig.size(); +++ ++inputsig_index) { ++ volk_type_t sig = inputsig[inputsig_index]; ++- if(!sig.is_scalar) //we don't make buffers for scalars ++- inbuffs.push_back(mem_pool.get_new(vlen*sig.size*(sig.is_complex ? 2 : 1))); +++ if (!sig.is_scalar) // we don't make buffers for scalars +++ inbuffs.push_back( +++ mem_pool.get_new(vlen * sig.size * (sig.is_complex ? 2 : 1))); ++ } ++- for(size_t i=0; i > test_data; ++- for(size_t i=0; i arch_buffs; ++- for(size_t j=0; j> test_data; +++ for (size_t i = 0; i < arch_list.size(); i++) { +++ std::vector arch_buffs; +++ for (size_t j = 0; j < outputsig.size(); j++) { +++ arch_buffs.push_back(mem_pool.get_new(vlen * outputsig[j].size * +++ (outputsig[j].is_complex ? 
2 : 1))); ++ } ++- for(size_t j=0; j start, end; ++ std::vector profile_times; ++- for(size_t i = 0; i < arch_list.size(); i++) { +++ for (size_t i = 0; i < arch_list.size(); i++) { ++ start = std::chrono::system_clock::now(); ++ ++- switch(both_sigs.size()) { ++- case 1: ++- if(inputsc.size() == 0) { ++- run_cast_test1((volk_fn_1arg)(manual_func), test_data[i], vlen, iter, arch_list[i]); ++- } else if(inputsc.size() == 1 && inputsc[0].is_float) { ++- if(inputsc[0].is_complex) { ++- run_cast_test1_s32fc((volk_fn_1arg_s32fc)(manual_func), test_data[i], scalar, vlen, iter, arch_list[i]); ++- } else { ++- run_cast_test1_s32f((volk_fn_1arg_s32f)(manual_func), test_data[i], scalar.real(), vlen, iter, arch_list[i]); ++- } ++- } else throw "unsupported 1 arg function >1 scalars"; ++- break; ++- case 2: ++- if(inputsc.size() == 0) { ++- run_cast_test2((volk_fn_2arg)(manual_func), test_data[i], vlen, iter, arch_list[i]); ++- } else if(inputsc.size() == 1 && inputsc[0].is_float) { ++- if(inputsc[0].is_complex) { ++- run_cast_test2_s32fc((volk_fn_2arg_s32fc)(manual_func), test_data[i], scalar, vlen, iter, arch_list[i]); ++- } else { ++- run_cast_test2_s32f((volk_fn_2arg_s32f)(manual_func), test_data[i], scalar.real(), vlen, iter, arch_list[i]); ++- } ++- } else throw "unsupported 2 arg function >1 scalars"; ++- break; ++- case 3: ++- if(inputsc.size() == 0) { ++- run_cast_test3((volk_fn_3arg)(manual_func), test_data[i], vlen, iter, arch_list[i]); ++- } else if(inputsc.size() == 1 && inputsc[0].is_float) { ++- if(inputsc[0].is_complex) { ++- run_cast_test3_s32fc((volk_fn_3arg_s32fc)(manual_func), test_data[i], scalar, vlen, iter, arch_list[i]); ++- } else { ++- run_cast_test3_s32f((volk_fn_3arg_s32f)(manual_func), test_data[i], scalar.real(), vlen, iter, arch_list[i]); ++- } ++- } else throw "unsupported 3 arg function >1 scalars"; ++- break; ++- case 4: ++- run_cast_test4((volk_fn_4arg)(manual_func), test_data[i], vlen, iter, arch_list[i]); ++- break; ++- default: ++- throw "no function handler for this signature"; ++- break; +++ switch (both_sigs.size()) { +++ case 1: +++ if (inputsc.size() == 0) { +++ run_cast_test1( +++ (volk_fn_1arg)(manual_func), test_data[i], vlen, iter, arch_list[i]); +++ } else if (inputsc.size() == 1 && inputsc[0].is_float) { +++ if (inputsc[0].is_complex) { +++ run_cast_test1_s32fc((volk_fn_1arg_s32fc)(manual_func), +++ test_data[i], +++ scalar, +++ vlen, +++ iter, +++ arch_list[i]); +++ } else { +++ run_cast_test1_s32f((volk_fn_1arg_s32f)(manual_func), +++ test_data[i], +++ scalar.real(), +++ vlen, +++ iter, +++ arch_list[i]); +++ } +++ } else +++ throw "unsupported 1 arg function >1 scalars"; +++ break; +++ case 2: +++ if (inputsc.size() == 0) { +++ run_cast_test2( +++ (volk_fn_2arg)(manual_func), test_data[i], vlen, iter, arch_list[i]); +++ } else if (inputsc.size() == 1 && inputsc[0].is_float) { +++ if (inputsc[0].is_complex) { +++ run_cast_test2_s32fc((volk_fn_2arg_s32fc)(manual_func), +++ test_data[i], +++ scalar, +++ vlen, +++ iter, +++ arch_list[i]); +++ } else { +++ run_cast_test2_s32f((volk_fn_2arg_s32f)(manual_func), +++ test_data[i], +++ scalar.real(), +++ vlen, +++ iter, +++ arch_list[i]); +++ } +++ } else +++ throw "unsupported 2 arg function >1 scalars"; +++ break; +++ case 3: +++ if (inputsc.size() == 0) { +++ run_cast_test3( +++ (volk_fn_3arg)(manual_func), test_data[i], vlen, iter, arch_list[i]); +++ } else if (inputsc.size() == 1 && inputsc[0].is_float) { +++ if (inputsc[0].is_complex) { +++ run_cast_test3_s32fc((volk_fn_3arg_s32fc)(manual_func), +++ 
test_data[i], +++ scalar, +++ vlen, +++ iter, +++ arch_list[i]); +++ } else { +++ run_cast_test3_s32f((volk_fn_3arg_s32f)(manual_func), +++ test_data[i], +++ scalar.real(), +++ vlen, +++ iter, +++ arch_list[i]); +++ } +++ } else +++ throw "unsupported 3 arg function >1 scalars"; +++ break; +++ case 4: +++ run_cast_test4( +++ (volk_fn_4arg)(manual_func), test_data[i], vlen, iter, arch_list[i]); +++ break; +++ default: +++ throw "no function handler for this signature"; +++ break; ++ } ++ ++ end = std::chrono::system_clock::now(); ++@@ -541,10 +704,10 @@ bool run_volk_tests(volk_func_desc_t desc, ++ profile_times.push_back(arch_time); ++ } ++ ++- //and now compare each output to the generic output ++- //first we have to know which output is the generic one, they aren't in order... ++- size_t generic_offset=0; ++- for(size_t i=0; i arch_results; ++- for(size_t i=0; iback().results[arch_list[i]]; +++ if (fail) { +++ volk_test_time_t* result = &results->back().results[arch_list[i]]; ++ result->pass = false; ++ fail_global = true; ++ std::cout << name << ": fail on arch " << arch_list[i] << std::endl; ++@@ -634,15 +851,13 @@ bool run_volk_tests(volk_func_desc_t desc, ++ double best_time_u = std::numeric_limits::max(); ++ std::string best_arch_a = "generic"; ++ std::string best_arch_u = "generic"; ++- for(size_t i=0; i < arch_list.size(); i++) ++- { ++- if((profile_times[i] < best_time_u) && arch_results[i] && desc.impl_alignment[i] == 0) ++- { +++ for (size_t i = 0; i < arch_list.size(); i++) { +++ if ((profile_times[i] < best_time_u) && arch_results[i] && +++ desc.impl_alignment[i] == 0) { ++ best_time_u = profile_times[i]; ++ best_arch_u = arch_list[i]; ++ } ++- if((profile_times[i] < best_time_a) && arch_results[i]) ++- { +++ if ((profile_times[i] < best_time_a) && arch_results[i]) { ++ best_time_a = profile_times[i]; ++ best_arch_a = arch_list[i]; ++ } ++@@ -651,7 +866,7 @@ bool run_volk_tests(volk_func_desc_t desc, ++ std::cout << "Best aligned arch: " << best_arch_a << std::endl; ++ std::cout << "Best unaligned arch: " << best_arch_u << std::endl; ++ ++- if(puppet_master_name == "NULL") { +++ if (puppet_master_name == "NULL") { ++ results->back().config_name = name; ++ } else { ++ results->back().config_name = puppet_master_name; ++diff --git a/lib/qa_utils.h b/lib/qa_utils.h ++index 2d8458b..74c3db4 100644 ++--- a/lib/qa_utils.h +++++ b/lib/qa_utils.h ++@@ -1,14 +1,14 @@ ++ #ifndef VOLK_QA_UTILS_H ++ #define VOLK_QA_UTILS_H ++ ++-#include // for bool, false ++-#include // for volk_func_desc_t ++-#include // for NULL ++-#include // for map ++-#include // for string, basic_string ++-#include // for vector +++#include // for bool, false +++#include // for volk_func_desc_t +++#include // for NULL +++#include // for map +++#include // for string, basic_string +++#include // for vector ++ ++-#include "volk/volk_complex.h" // for lv_32fc_t +++#include "volk/volk_complex.h" // for lv_32fc_t ++ ++ /************************************************ ++ * VOLK QA type definitions * ++@@ -22,93 +22,119 @@ struct volk_type_t { ++ std::string str; ++ }; ++ ++-class volk_test_time_t { ++- public: ++- std::string name; ++- double time; ++- std::string units; ++- bool pass; +++class volk_test_time_t +++{ +++public: +++ std::string name; +++ double time; +++ std::string units; +++ bool pass; ++ }; ++ ++-class volk_test_results_t { ++- public: ++- std::string name; ++- std::string config_name; ++- unsigned int vlen; ++- unsigned int iter; ++- std::map results; ++- std::string best_arch_a; ++- std::string 
best_arch_u; +++class volk_test_results_t +++{ +++public: +++ std::string name; +++ std::string config_name; +++ unsigned int vlen; +++ unsigned int iter; +++ std::map results; +++ std::string best_arch_a; +++ std::string best_arch_u; ++ }; ++ ++-class volk_test_params_t { ++- private: ++- float _tol; ++- lv_32fc_t _scalar; ++- unsigned int _vlen; ++- unsigned int _iter; ++- bool _benchmark_mode; ++- bool _absolute_mode; ++- std::string _kernel_regex; ++- public: ++- // ctor ++- volk_test_params_t(float tol, lv_32fc_t scalar, unsigned int vlen, unsigned int iter, ++- bool benchmark_mode, std::string kernel_regex) : ++- _tol(tol), _scalar(scalar), _vlen(vlen), _iter(iter), ++- _benchmark_mode(benchmark_mode), _absolute_mode(false), _kernel_regex(kernel_regex) {}; ++- // setters ++- void set_tol(float tol) {_tol=tol;}; ++- void set_scalar(lv_32fc_t scalar) {_scalar=scalar;}; ++- void set_vlen(unsigned int vlen) {_vlen=vlen;}; ++- void set_iter(unsigned int iter) {_iter=iter;}; ++- void set_benchmark(bool benchmark) {_benchmark_mode=benchmark;}; ++- void set_regex(std::string regex) {_kernel_regex=regex;}; ++- // getters ++- float tol() {return _tol;}; ++- lv_32fc_t scalar() {return _scalar;}; ++- unsigned int vlen() {return _vlen;}; ++- unsigned int iter() {return _iter;}; ++- bool benchmark_mode() {return _benchmark_mode;}; ++- bool absolute_mode() {return _absolute_mode;}; ++- std::string kernel_regex() {return _kernel_regex;}; ++- volk_test_params_t make_absolute(float tol) { ++- volk_test_params_t t(*this); ++- t._tol = tol; ++- t._absolute_mode = true; ++- return t; ++- } ++- volk_test_params_t make_tol(float tol) { ++- volk_test_params_t t(*this); ++- t._tol = tol; ++- return t; ++- } +++class volk_test_params_t +++{ +++private: +++ float _tol; +++ lv_32fc_t _scalar; +++ unsigned int _vlen; +++ unsigned int _iter; +++ bool _benchmark_mode; +++ bool _absolute_mode; +++ std::string _kernel_regex; +++ +++public: +++ // ctor +++ volk_test_params_t(float tol, +++ lv_32fc_t scalar, +++ unsigned int vlen, +++ unsigned int iter, +++ bool benchmark_mode, +++ std::string kernel_regex) +++ : _tol(tol), +++ _scalar(scalar), +++ _vlen(vlen), +++ _iter(iter), +++ _benchmark_mode(benchmark_mode), +++ _absolute_mode(false), +++ _kernel_regex(kernel_regex){}; +++ // setters +++ void set_tol(float tol) { _tol = tol; }; +++ void set_scalar(lv_32fc_t scalar) { _scalar = scalar; }; +++ void set_vlen(unsigned int vlen) { _vlen = vlen; }; +++ void set_iter(unsigned int iter) { _iter = iter; }; +++ void set_benchmark(bool benchmark) { _benchmark_mode = benchmark; }; +++ void set_regex(std::string regex) { _kernel_regex = regex; }; +++ // getters +++ float tol() { return _tol; }; +++ lv_32fc_t scalar() { return _scalar; }; +++ unsigned int vlen() { return _vlen; }; +++ unsigned int iter() { return _iter; }; +++ bool benchmark_mode() { return _benchmark_mode; }; +++ bool absolute_mode() { return _absolute_mode; }; +++ std::string kernel_regex() { return _kernel_regex; }; +++ volk_test_params_t make_absolute(float tol) +++ { +++ volk_test_params_t t(*this); +++ t._tol = tol; +++ t._absolute_mode = true; +++ return t; +++ } +++ volk_test_params_t make_tol(float tol) +++ { +++ volk_test_params_t t(*this); +++ t._tol = tol; +++ return t; +++ } ++ }; ++ ++-class volk_test_case_t { ++- private: ++- volk_func_desc_t _desc; ++- void(*_kernel_ptr)(); ++- std::string _name; ++- volk_test_params_t _test_parameters; ++- std::string _puppet_master_name; ++- public: ++- volk_func_desc_t desc() {return _desc;}; ++- void 
(*kernel_ptr()) () {return _kernel_ptr;}; ++- std::string name() {return _name;}; ++- std::string puppet_master_name() {return _puppet_master_name;}; ++- volk_test_params_t test_parameters() {return _test_parameters;}; ++- // normal ctor ++- volk_test_case_t(volk_func_desc_t desc, void(*kernel_ptr)(), std::string name, ++- volk_test_params_t test_parameters) : ++- _desc(desc), _kernel_ptr(kernel_ptr), _name(name), _test_parameters(test_parameters), ++- _puppet_master_name("NULL") ++- {}; ++- // ctor for puppets ++- volk_test_case_t(volk_func_desc_t desc, void(*kernel_ptr)(), std::string name, ++- std::string puppet_master_name, volk_test_params_t test_parameters) : ++- _desc(desc), _kernel_ptr(kernel_ptr), _name(name), _test_parameters(test_parameters), ++- _puppet_master_name(puppet_master_name) ++- {}; +++class volk_test_case_t +++{ +++private: +++ volk_func_desc_t _desc; +++ void (*_kernel_ptr)(); +++ std::string _name; +++ volk_test_params_t _test_parameters; +++ std::string _puppet_master_name; +++ +++public: +++ volk_func_desc_t desc() { return _desc; }; +++ void (*kernel_ptr())() { return _kernel_ptr; }; +++ std::string name() { return _name; }; +++ std::string puppet_master_name() { return _puppet_master_name; }; +++ volk_test_params_t test_parameters() { return _test_parameters; }; +++ // normal ctor +++ volk_test_case_t(volk_func_desc_t desc, +++ void (*kernel_ptr)(), +++ std::string name, +++ volk_test_params_t test_parameters) +++ : _desc(desc), +++ _kernel_ptr(kernel_ptr), +++ _name(name), +++ _test_parameters(test_parameters), +++ _puppet_master_name("NULL"){}; +++ // ctor for puppets +++ volk_test_case_t(volk_func_desc_t desc, +++ void (*kernel_ptr)(), +++ std::string name, +++ std::string puppet_master_name, +++ volk_test_params_t test_parameters) +++ : _desc(desc), +++ _kernel_ptr(kernel_ptr), +++ _name(name), +++ _test_parameters(test_parameters), +++ _puppet_master_name(puppet_master_name){}; ++ }; ++ ++ /************************************************ ++@@ -117,42 +143,58 @@ class volk_test_case_t { ++ volk_type_t volk_type_from_string(std::string); ++ ++ float uniform(void); ++-void random_floats(float *buf, unsigned n); +++void random_floats(float* buf, unsigned n); ++ ++-bool run_volk_tests( ++- volk_func_desc_t, ++- void(*)(), ++- std::string, ++- volk_test_params_t, ++- std::vector *results = NULL, ++- std::string puppet_master_name = "NULL" ++- ); +++bool run_volk_tests(volk_func_desc_t, +++ void (*)(), +++ std::string, +++ volk_test_params_t, +++ std::vector* results = NULL, +++ std::string puppet_master_name = "NULL"); ++ ++-bool run_volk_tests( ++- volk_func_desc_t, ++- void(*)(), ++- std::string, ++- float, ++- lv_32fc_t, ++- unsigned int, ++- unsigned int, ++- std::vector *results = NULL, ++- std::string puppet_master_name = "NULL", ++- bool absolute_mode = false, ++- bool benchmark_mode = false ++-); +++bool run_volk_tests(volk_func_desc_t, +++ void (*)(), +++ std::string, +++ float, +++ lv_32fc_t, +++ unsigned int, +++ unsigned int, +++ std::vector* results = NULL, +++ std::string puppet_master_name = "NULL", +++ bool absolute_mode = false, +++ bool benchmark_mode = false); ++ ++-#define VOLK_PROFILE(func, test_params, results) run_volk_tests(func##_get_func_desc(), (void (*)())func##_manual, std::string(#func), test_params, results, "NULL") ++-#define VOLK_PUPPET_PROFILE(func, puppet_master_func, test_params, results) run_volk_tests(func##_get_func_desc(), (void (*)())func##_manual, std::string(#func), test_params, results, 
std::string(#puppet_master_func)) ++-typedef void (*volk_fn_1arg)(void *, unsigned int, const char*); //one input, operate in place ++-typedef void (*volk_fn_2arg)(void *, void *, unsigned int, const char*); ++-typedef void (*volk_fn_3arg)(void *, void *, void *, unsigned int, const char*); ++-typedef void (*volk_fn_4arg)(void *, void *, void *, void *, unsigned int, const char*); ++-typedef void (*volk_fn_1arg_s32f)(void *, float, unsigned int, const char*); //one input vector, one scalar float input ++-typedef void (*volk_fn_2arg_s32f)(void *, void *, float, unsigned int, const char*); ++-typedef void (*volk_fn_3arg_s32f)(void *, void *, void *, float, unsigned int, const char*); ++-typedef void (*volk_fn_1arg_s32fc)(void *, lv_32fc_t, unsigned int, const char*); //one input vector, one scalar float input ++-typedef void (*volk_fn_2arg_s32fc)(void *, void *, lv_32fc_t, unsigned int, const char*); ++-typedef void (*volk_fn_3arg_s32fc)(void *, void *, void *, lv_32fc_t, unsigned int, const char*); +++#define VOLK_PROFILE(func, test_params, results) \ +++ run_volk_tests(func##_get_func_desc(), \ +++ (void (*)())func##_manual, \ +++ std::string(#func), \ +++ test_params, \ +++ results, \ +++ "NULL") +++#define VOLK_PUPPET_PROFILE(func, puppet_master_func, test_params, results) \ +++ run_volk_tests(func##_get_func_desc(), \ +++ (void (*)())func##_manual, \ +++ std::string(#func), \ +++ test_params, \ +++ results, \ +++ std::string(#puppet_master_func)) +++typedef void (*volk_fn_1arg)(void*, +++ unsigned int, +++ const char*); // one input, operate in place +++typedef void (*volk_fn_2arg)(void*, void*, unsigned int, const char*); +++typedef void (*volk_fn_3arg)(void*, void*, void*, unsigned int, const char*); +++typedef void (*volk_fn_4arg)(void*, void*, void*, void*, unsigned int, const char*); +++typedef void (*volk_fn_1arg_s32f)( +++ void*, float, unsigned int, const char*); // one input vector, one scalar float input +++typedef void (*volk_fn_2arg_s32f)(void*, void*, float, unsigned int, const char*); +++typedef void (*volk_fn_3arg_s32f)(void*, void*, void*, float, unsigned int, const char*); +++typedef void (*volk_fn_1arg_s32fc)( +++ void*, +++ lv_32fc_t, +++ unsigned int, +++ const char*); // one input vector, one scalar float input +++typedef void (*volk_fn_2arg_s32fc)(void*, void*, lv_32fc_t, unsigned int, const char*); +++typedef void (*volk_fn_3arg_s32fc)( +++ void*, void*, void*, lv_32fc_t, unsigned int, const char*); ++ ++-#endif //VOLK_QA_UTILS_H +++#endif // VOLK_QA_UTILS_H ++diff --git a/lib/testqa.cc b/lib/testqa.cc ++index 8b0f4d6..c885383 100644 ++--- a/lib/testqa.cc +++++ b/lib/testqa.cc ++@@ -20,18 +20,18 @@ ++ * Boston, MA 02110-1301, USA. ++ */ ++ ++-#include // for bool, false, true ++-#include // for operator<<, basic_ostream, endl, char... ++-#include // IWYU pragma: keep ++-#include // for map, map<>::iterator, _Rb_tree_iterator ++-#include // for string, operator<< ++-#include // for pair ++-#include // for vector ++- +++#include // for bool, false, true +++#include // IWYU pragma: keep +++#include // for operator<<, basic_ostream, endl, char... 
+++#include // for map, map<>::iterator, _Rb_tree_iterator +++#include // for string, operator<< +++#include // for pair +++#include // for vector +++ +++#include "kernel_tests.h" // for init_test_list +++#include "qa_utils.h" // for volk_test_case_t, volk_test_results_t +++#include "volk/volk_complex.h" // for lv_32fc_t ++ #include ++-#include "kernel_tests.h" // for init_test_list ++-#include "qa_utils.h" // for volk_test_case_t, volk_test_results_t ++-#include "volk/volk_complex.h" // for lv_32fc_t ++ ++ void print_qa_xml(std::vector results, unsigned int nfails); ++ ++@@ -46,45 +46,52 @@ int main(int argc, char* argv[]) ++ bool def_benchmark_mode = true; ++ std::string def_kernel_regex = ""; ++ ++- volk_test_params_t test_params(def_tol, def_scalar, def_vlen, def_iter, ++- def_benchmark_mode, def_kernel_regex); +++ volk_test_params_t test_params( +++ def_tol, def_scalar, def_vlen, def_iter, def_benchmark_mode, def_kernel_regex); ++ std::vector test_cases = init_test_list(test_params); ++ std::vector results; ++ ++- if (argc > 1){ ++- for(unsigned int ii = 0; ii < test_cases.size(); ++ii){ ++- if (std::string(argv[1]) == test_cases[ii].name()){ +++ if (argc > 1) { +++ for (unsigned int ii = 0; ii < test_cases.size(); ++ii) { +++ if (std::string(argv[1]) == test_cases[ii].name()) { ++ volk_test_case_t test_case = test_cases[ii]; ++- if (run_volk_tests(test_case.desc(), test_case.kernel_ptr(), +++ if (run_volk_tests(test_case.desc(), +++ test_case.kernel_ptr(), ++ test_case.name(), ++- test_case.test_parameters(), &results, +++ test_case.test_parameters(), +++ &results, ++ test_case.puppet_master_name())) { ++- return 1; +++ return 1; ++ } else { ++- return 0; +++ return 0; ++ } ++ } ++ } ++- std::cerr << "Did not run a test for kernel: " << std::string(argv[1]) << " !" << std::endl; +++ std::cerr << "Did not run a test for kernel: " << std::string(argv[1]) << " !" +++ << std::endl; ++ return 0; ++ ++- }else{ +++ } else { ++ std::vector qa_failures; ++ // Test every kernel reporting failures when they occur ++- for(unsigned int ii = 0; ii < test_cases.size(); ++ii) { +++ for (unsigned int ii = 0; ii < test_cases.size(); ++ii) { ++ bool qa_result = false; ++ volk_test_case_t test_case = test_cases[ii]; ++ try { ++- qa_result = run_volk_tests(test_case.desc(), test_case.kernel_ptr(), test_case.name(), ++- test_case.test_parameters(), &results, test_case.puppet_master_name()); ++- } ++- catch(...) { +++ qa_result = run_volk_tests(test_case.desc(), +++ test_case.kernel_ptr(), +++ test_case.name(), +++ test_case.test_parameters(), +++ &results, +++ test_case.puppet_master_name()); +++ } catch (...) { ++ // TODO: what exceptions might we need to catch and how do we handle them? ++- std::cerr << "Exception found on kernel: " << test_case.name() << std::endl; +++ std::cerr << "Exception found on kernel: " << test_case.name() +++ << std::endl; ++ qa_result = false; ++ } ++ ++- if(qa_result) { +++ if (qa_result) { ++ std::cerr << "Failure on " << test_case.name() << std::endl; ++ qa_failures.push_back(test_case.name()); ++ } ++@@ -96,9 +103,9 @@ int main(int argc, char* argv[]) ++ // Summarize QA results ++ std::cerr << "Kernel QA finished: " << qa_failures.size() << " failures out of " ++ << test_cases.size() << " tests." 
<< std::endl; ++- if(qa_failures.size() > 0) { +++ if (qa_failures.size() > 0) { ++ std::cerr << "The following kernels failed QA:" << std::endl; ++- for(unsigned int ii = 0; ii < qa_failures.size(); ++ii) { +++ for (unsigned int ii = 0; ii < qa_failures.size(); ++ii) { ++ std::cerr << " " << qa_failures[ii] << std::endl; ++ } ++ qa_ret_val = 1; ++@@ -118,26 +125,28 @@ void print_qa_xml(std::vector results, unsigned int nfails) ++ qa_file.open(".unittest/kernels.xml"); ++ ++ qa_file << "" << std::endl; ++- qa_file << "" << std::endl; +++ qa_file << "" << std::endl; ++ ++ // Results are in a vector by kernel. Each element has a result ++ // map containing time and arch name with test result ++- for(unsigned int ii=0; ii < results.size(); ++ii) { +++ for (unsigned int ii = 0; ii < results.size(); ++ii) { ++ volk_test_results_t result = results[ii]; ++ qa_file << " " << std::endl; ++ ++ std::map::iterator kernel_time_pair; ++- for(kernel_time_pair = result.results.begin(); kernel_time_pair != result.results.end(); ++kernel_time_pair) { +++ for (kernel_time_pair = result.results.begin(); +++ kernel_time_pair != result.results.end(); +++ ++kernel_time_pair) { ++ volk_test_time_t test_time = kernel_time_pair->second; ++- qa_file << " " << std::endl; ++- if(!test_time.pass) ++- qa_file << " " << ++- "" << std::endl; +++ qa_file << " " << std::endl; +++ if (!test_time.pass) +++ qa_file << " " +++ << "" << std::endl; ++ qa_file << " " << std::endl; ++ } ++ qa_file << " " << std::endl; ++@@ -146,5 +155,4 @@ void print_qa_xml(std::vector results, unsigned int nfails) ++ ++ qa_file << "" << std::endl; ++ qa_file.close(); ++- ++ } ++diff --git a/lib/volk_malloc.c b/lib/volk_malloc.c ++index df36240..b3779e1 100644 ++--- a/lib/volk_malloc.c +++++ b/lib/volk_malloc.c ++@@ -31,7 +31,8 @@ ++ * see: https://en.cppreference.com/w/c/memory/aligned_alloc ++ * ++ * MSVC is broken ++- * see: https://docs.microsoft.com/en-us/cpp/overview/visual-cpp-language-conformance?view=vs-2019 +++ * see: +++ * https://docs.microsoft.com/en-us/cpp/overview/visual-cpp-language-conformance?view=vs-2019 ++ * This section: ++ * C11 The Universal CRT implemented the parts of the ++ * C11 Standard Library that are required by C++17, ++@@ -46,39 +47,43 @@ ++ * We must work around this problem because MSVC is non-compliant! ++ */ ++ ++-void *volk_malloc(size_t size, size_t alignment) +++ +++void* volk_malloc(size_t size, size_t alignment) ++ { ++ #if HAVE_POSIX_MEMALIGN ++- // quoting posix_memalign() man page: ++- // "alignment must be a power of two and a multiple of sizeof(void *)" ++- // volk_get_alignment() could return 1 for some machines (e.g. generic_orc) ++- if (alignment == 1){ ++- return malloc(size); ++- } ++- void *ptr; ++- int err = posix_memalign(&ptr, alignment, size); ++- if(err != 0) { ++- ptr = NULL; ++- fprintf(stderr, ++- "VOLK: Error allocating memory " ++- "(posix_memalign: error %d: %s)\n", err, strerror(err)); ++- } +++ // quoting posix_memalign() man page: +++ // "alignment must be a power of two and a multiple of sizeof(void *)" +++ // volk_get_alignment() could return 1 for some machines (e.g. 
generic_orc) +++ if (alignment == 1) { +++ return malloc(size); +++ } +++ void* ptr; +++ int err = posix_memalign(&ptr, alignment, size); +++ if (err != 0) { +++ ptr = NULL; +++ fprintf(stderr, +++ "VOLK: Error allocating memory " +++ "(posix_memalign: error %d: %s)\n", +++ err, +++ strerror(err)); +++ } ++ #elif defined(_MSC_VER) ++- void *ptr = _aligned_malloc(size, alignment); +++ void* ptr = _aligned_malloc(size, alignment); ++ #else ++- void *ptr = aligned_alloc(alignment, size); +++ void* ptr = aligned_alloc(alignment, size); ++ #endif ++- if(ptr == NULL) { ++- fprintf(stderr, "VOLK: Error allocating memory (aligned_alloc/_aligned_malloc)\n"); ++- } ++- return ptr; +++ if (ptr == NULL) { +++ fprintf(stderr, +++ "VOLK: Error allocating memory (aligned_alloc/_aligned_malloc)\n"); +++ } +++ return ptr; ++ } ++ ++-void volk_free(void *ptr) +++void volk_free(void* ptr) ++ { ++ #if defined(_MSC_VER) ++- _aligned_free(ptr); +++ _aligned_free(ptr); ++ #else ++- free(ptr); +++ free(ptr); ++ #endif ++ } ++diff --git a/lib/volk_prefs.c b/lib/volk_prefs.c ++index 0b5fe8e..8934bf7 100644 ++--- a/lib/volk_prefs.c +++++ b/lib/volk_prefs.c ++@@ -1,6 +1,6 @@ +++#include ++ #include ++ #include ++-#include ++ #include ++ #if defined(_MSC_VER) ++ #include ++@@ -11,82 +11,84 @@ ++ #endif ++ #include ++ ++-void volk_get_config_path(char *path, bool read) +++void volk_get_config_path(char* path, bool read) ++ { ++- if (!path) return; ++- const char *suffix = "/.volk/volk_config"; ++- const char *suffix2 = "/volk/volk_config"; //non-hidden ++- char *home = NULL; +++ if (!path) +++ return; +++ const char* suffix = "/.volk/volk_config"; +++ const char* suffix2 = "/volk/volk_config"; // non-hidden +++ char* home = NULL; ++ ++- //allows config redirection via env variable +++ // allows config redirection via env variable ++ home = getenv("VOLK_CONFIGPATH"); ++- if(home!=NULL){ ++- strncpy(path,home,512); ++- strcat(path,suffix2); ++- if (!read || access(path, F_OK) != -1){ +++ if (home != NULL) { +++ strncpy(path, home, 512); +++ strcat(path, suffix2); +++ if (!read || access(path, F_OK) != -1) { ++ return; ++ } ++ } ++ ++- //check for user-local config file +++ // check for user-local config file ++ home = getenv("HOME"); ++- if (home != NULL){ +++ if (home != NULL) { ++ strncpy(path, home, 512); ++ strcat(path, suffix); ++- if (!read || (access(path, F_OK) != -1)){ +++ if (!read || (access(path, F_OK) != -1)) { ++ return; ++ } ++ } ++ ++- //check for config file in APPDATA (Windows) +++ // check for config file in APPDATA (Windows) ++ home = getenv("APPDATA"); ++- if (home != NULL){ +++ if (home != NULL) { ++ strncpy(path, home, 512); ++ strcat(path, suffix); ++- if (!read || (access(path, F_OK) != -1)){ +++ if (!read || (access(path, F_OK) != -1)) { ++ return; ++ } ++ } ++ ++- //check for system-wide config file ++- if (access("/etc/volk/volk_config", F_OK) != -1){ +++ // check for system-wide config file +++ if (access("/etc/volk/volk_config", F_OK) != -1) { ++ strncpy(path, "/etc", 512); ++ strcat(path, suffix2); ++- if (!read || (access(path, F_OK) != -1)){ +++ if (!read || (access(path, F_OK) != -1)) { ++ return; ++ } ++ } ++ ++- //If still no path was found set path[0] to '0' and fall through +++ // If still no path was found set path[0] to '0' and fall through ++ path[0] = 0; ++ return; ++ } ++ ++-size_t volk_load_preferences(volk_arch_pref_t **prefs_res) +++size_t volk_load_preferences(volk_arch_pref_t** prefs_res) ++ { ++- FILE *config_file; +++ FILE* config_file; ++ char path[512], line[512]; ++ 
size_t n_arch_prefs = 0; ++- volk_arch_pref_t *prefs = NULL; +++ volk_arch_pref_t* prefs = NULL; ++ ++- //get the config path +++ // get the config path ++ volk_get_config_path(path, true); ++- if (!path[0]) return n_arch_prefs; //no prefs found +++ if (!path[0]) +++ return n_arch_prefs; // no prefs found ++ config_file = fopen(path, "r"); ++- if(!config_file) return n_arch_prefs; //no prefs found +++ if (!config_file) +++ return n_arch_prefs; // no prefs found ++ ++- //reset the file pointer and write the prefs into volk_arch_prefs ++- while(fgets(line, sizeof(line), config_file) != NULL) ++- { ++- void *new_prefs = realloc(prefs, (n_arch_prefs + 1) * sizeof(*prefs)); +++ // reset the file pointer and write the prefs into volk_arch_prefs +++ while (fgets(line, sizeof(line), config_file) != NULL) { +++ void* new_prefs = realloc(prefs, (n_arch_prefs + 1) * sizeof(*prefs)); ++ if (!new_prefs) { ++- printf ("volk_load_preferences: bad malloc\n"); +++ printf("volk_load_preferences: bad malloc\n"); ++ break; ++ } ++- prefs = (volk_arch_pref_t *) new_prefs; ++- volk_arch_pref_t *p = prefs + n_arch_prefs; ++- if(sscanf(line, "%s %s %s", p->name, p->impl_a, p->impl_u) == 3 && !strncmp(p->name, "volk_", 5)) ++- { +++ prefs = (volk_arch_pref_t*)new_prefs; +++ volk_arch_pref_t* p = prefs + n_arch_prefs; +++ if (sscanf(line, "%s %s %s", p->name, p->impl_a, p->impl_u) == 3 && +++ !strncmp(p->name, "volk_", 5)) { ++ n_arch_prefs++; ++ } ++ } ++diff --git a/lib/volk_rank_archs.c b/lib/volk_rank_archs.c ++index 346619e..7cf3fd7 100644 ++--- a/lib/volk_rank_archs.c +++++ b/lib/volk_rank_archs.c ++@@ -24,84 +24,83 @@ ++ #include ++ #include ++ ++-#include ++ #include +++#include ++ ++-int volk_get_index( ++- const char *impl_names[], //list of implementations by name ++- const size_t n_impls, //number of implementations available ++- const char *impl_name //the implementation name to find ++-){ +++int volk_get_index(const char* impl_names[], // list of implementations by name +++ const size_t n_impls, // number of implementations available +++ const char* impl_name // the implementation name to find +++) +++{ ++ unsigned int i; ++ for (i = 0; i < n_impls; i++) { ++- if(!strncmp(impl_names[i], impl_name, 20)) { +++ if (!strncmp(impl_names[i], impl_name, 20)) { ++ return i; ++ } ++ } ++- //TODO return -1; ++- //something terrible should happen here +++ // TODO return -1; +++ // something terrible should happen here ++ fprintf(stderr, "Volk warning: no arch found, returning generic impl\n"); ++- return volk_get_index(impl_names, n_impls, "generic"); //but we'll fake it for now +++ return volk_get_index(impl_names, n_impls, "generic"); // but we'll fake it for now ++ } ++ ++-int volk_rank_archs( ++- const char *kern_name, //name of the kernel to rank ++- const char *impl_names[], //list of implementations by name ++- const int* impl_deps, //requirement mask per implementation ++- const bool* alignment, //alignment status of each implementation ++- size_t n_impls, //number of implementations available ++- const bool align //if false, filter aligned implementations +++int volk_rank_archs(const char* kern_name, // name of the kernel to rank +++ const char* impl_names[], // list of implementations by name +++ const int* impl_deps, // requirement mask per implementation +++ const bool* alignment, // alignment status of each implementation +++ size_t n_impls, // number of implementations available +++ const bool align // if false, filter aligned implementations ++ ) ++ { ++ size_t i; ++- static volk_arch_pref_t 
*volk_arch_prefs; +++ static volk_arch_pref_t* volk_arch_prefs; ++ static size_t n_arch_prefs = 0; ++ static int prefs_loaded = 0; ++- if(!prefs_loaded) { +++ if (!prefs_loaded) { ++ n_arch_prefs = volk_load_preferences(&volk_arch_prefs); ++ prefs_loaded = 1; ++ } ++ ++ // If we've defined VOLK_GENERIC to be anything, always return the ++ // 'generic' kernel. Used in GR's QA code. ++- char *gen_env = getenv("VOLK_GENERIC"); ++- if(gen_env) { ++- return volk_get_index(impl_names, n_impls, "generic"); +++ char* gen_env = getenv("VOLK_GENERIC"); +++ if (gen_env) { +++ return volk_get_index(impl_names, n_impls, "generic"); ++ } ++ ++- //now look for the function name in the prefs list ++- for(i = 0; i < n_arch_prefs; i++) ++- { ++- if(!strncmp(kern_name, volk_arch_prefs[i].name, sizeof(volk_arch_prefs[i].name))) //found it +++ // now look for the function name in the prefs list +++ for (i = 0; i < n_arch_prefs; i++) { +++ if (!strncmp(kern_name, +++ volk_arch_prefs[i].name, +++ sizeof(volk_arch_prefs[i].name))) // found it ++ { ++- const char *impl_name = align? volk_arch_prefs[i].impl_a : volk_arch_prefs[i].impl_u; +++ const char* impl_name = +++ align ? volk_arch_prefs[i].impl_a : volk_arch_prefs[i].impl_u; ++ return volk_get_index(impl_names, n_impls, impl_name); ++ } ++ } ++ ++- //return the best index with the largest deps +++ // return the best index with the largest deps ++ size_t best_index_a = 0; ++ size_t best_index_u = 0; ++ int best_value_a = -1; ++ int best_value_u = -1; ++- for(i = 0; i < n_impls; i++) ++- { +++ for (i = 0; i < n_impls; i++) { ++ const signed val = impl_deps[i]; ++- if (alignment[i] && val > best_value_a) ++- { +++ if (alignment[i] && val > best_value_a) { ++ best_index_a = i; ++ best_value_a = val; ++ } ++- if (!alignment[i] && val > best_value_u) ++- { +++ if (!alignment[i] && val > best_value_u) { ++ best_index_u = i; ++ best_value_u = val; ++ } ++ } ++ ++- //when align and we found a best aligned, use it ++- if (align && best_value_a != -1) return best_index_a; +++ // when align and we found a best aligned, use it +++ if (align && best_value_a != -1) +++ return best_index_a; ++ ++- //otherwise return the best unaligned +++ // otherwise return the best unaligned ++ return best_index_u; ++ } ++diff --git a/lib/volk_rank_archs.h b/lib/volk_rank_archs.h ++index b3bf8ff..9434778 100644 ++--- a/lib/volk_rank_archs.h +++++ b/lib/volk_rank_archs.h ++@@ -22,26 +22,24 @@ ++ #ifndef INCLUDED_VOLK_RANK_ARCHS_H ++ #define INCLUDED_VOLK_RANK_ARCHS_H ++ ++-#include ++ #include +++#include ++ ++ #ifdef __cplusplus ++ extern "C" { ++ #endif ++ ++-int volk_get_index( ++- const char *impl_names[], //list of implementations by name ++- const size_t n_impls, //number of implementations available ++- const char *impl_name //the implementation name to find +++int volk_get_index(const char* impl_names[], // list of implementations by name +++ const size_t n_impls, // number of implementations available +++ const char* impl_name // the implementation name to find ++ ); ++ ++-int volk_rank_archs( ++- const char *kern_name, //name of the kernel to rank ++- const char *impl_names[], //list of implementations by name ++- const int* impl_deps, //requirement mask per implementation ++- const bool* alignment, //alignment status of each implementation ++- size_t n_impls, //number of implementations available ++- const bool align //if false, filter aligned implementations +++int volk_rank_archs(const char* kern_name, // name of the kernel to rank +++ const char* impl_names[], // list of 
implementations by name +++ const int* impl_deps, // requirement mask per implementation +++ const bool* alignment, // alignment status of each implementation +++ size_t n_impls, // number of implementations available +++ const bool align // if false, filter aligned implementations ++ ); ++ ++ #ifdef __cplusplus ++-- ++2.20.1 ++ diff --cc debian/patches/0004-clang-format-Update-PR-with-GitHub-Action.patch index 0000000,0000000..6db7e6c new file mode 100644 --- /dev/null +++ b/debian/patches/0004-clang-format-Update-PR-with-GitHub-Action.patch @@@ -1,0 -1,0 +1,53 @@@ ++From d1a4cc1f775b73c8a14ec2a27513f1d1cc977513 Mon Sep 17 00:00:00 2001 ++From: Johannes Demel ++Date: Tue, 17 Mar 2020 21:53:08 +0100 ++Subject: [PATCH 4/7] clang-format: Update PR with GitHub Action ++ ++--- ++ .github/workflows/check-pr-formatting.yml | 19 +++++++++++++++++++ ++ include/volk/volk_common.h | 2 +- ++ 2 files changed, 20 insertions(+), 1 deletion(-) ++ create mode 100644 .github/workflows/check-pr-formatting.yml ++ ++diff --git a/.github/workflows/check-pr-formatting.yml b/.github/workflows/check-pr-formatting.yml ++new file mode 100644 ++index 0000000..b1d2d83 ++--- /dev/null +++++ b/.github/workflows/check-pr-formatting.yml ++@@ -0,0 +1,19 @@ +++name: Check PR Formatting +++ +++on: +++ push: +++ pull_request: +++ paths-ignore: +++ - 'tmpl/' +++ +++jobs: +++ build: +++ runs-on: ubuntu-latest +++ +++ steps: +++ - uses: actions/checkout@v2 +++ - uses: gnuradio/clang-format-lint-action@v0.5-4 +++ with: +++ source: '.' +++ exclude: './volk' +++ extensions: 'c,cc,cpp,cxx,h,hh' ++\ No newline at end of file ++diff --git a/include/volk/volk_common.h b/include/volk/volk_common.h ++index 8167d23..7e78358 100644 ++--- a/include/volk/volk_common.h +++++ b/include/volk/volk_common.h ++@@ -69,7 +69,7 @@ ++ //////////////////////////////////////////////////////////////////////// ++ #if defined(_MSC_VER) ++ #pragma warning(disable : 4244) //'conversion' conversion from 'type1' to 'type2', ++- //possible loss of data +++ // possible loss of data ++ #pragma warning(disable : 4305) //'identifier' : truncation from 'type1' to 'type2' ++ #endif ++ ++-- ++2.20.1 ++ diff --cc debian/patches/0005-clang-format-Rebase-onto-current-master.patch index 0000000,0000000..b83819a new file mode 100644 --- /dev/null +++ b/debian/patches/0005-clang-format-Rebase-onto-current-master.patch @@@ -1,0 -1,0 +1,409 @@@ ++From 1ed5fa23ad4b298bd2685d2891abfabf14b601e0 Mon Sep 17 00:00:00 2001 ++From: Johannes Demel ++Date: Tue, 17 Mar 2020 22:07:07 +0100 ++Subject: [PATCH 5/7] clang-format: Rebase onto current master ++ ++This commit applies clang format to the latest master branch. ++--- ++ .github/workflows/check-pr-formatting.yml | 4 +- ++ include/volk/volk_common.h | 18 +- ++ kernels/volk/volk_32f_exp_32f.h | 302 +++++++++++----------- ++ 3 files changed, 163 insertions(+), 161 deletions(-) ++ ++diff --git a/.github/workflows/check-pr-formatting.yml b/.github/workflows/check-pr-formatting.yml ++index b1d2d83..9c7a286 100644 ++--- a/.github/workflows/check-pr-formatting.yml +++++ b/.github/workflows/check-pr-formatting.yml ++@@ -2,6 +2,8 @@ name: Check PR Formatting ++ ++ on: ++ push: +++ paths-ignore: +++ - 'tmpl/' ++ pull_request: ++ paths-ignore: ++ - 'tmpl/' ++@@ -15,5 +17,5 @@ jobs: ++ - uses: gnuradio/clang-format-lint-action@v0.5-4 ++ with: ++ source: '.' 
++- exclude: './volk' +++ exclude: './tmpl' ++ extensions: 'c,cc,cpp,cxx,h,hh' ++\ No newline at end of file ++diff --git a/include/volk/volk_common.h b/include/volk/volk_common.h ++index 7e78358..4e14982 100644 ++--- a/include/volk/volk_common.h +++++ b/include/volk/volk_common.h ++@@ -5,15 +5,15 @@ ++ // Cross-platform attribute macros ++ //////////////////////////////////////////////////////////////////////// ++ #if _MSC_VER ++-# define __VOLK_ATTR_ALIGNED(x) __declspec(align(x)) ++-# define __VOLK_ATTR_UNUSED ++-# define __VOLK_ATTR_INLINE __forceinline ++-# define __VOLK_ATTR_DEPRECATED __declspec(deprecated) ++-# define __VOLK_ATTR_EXPORT __declspec(dllexport) ++-# define __VOLK_ATTR_IMPORT __declspec(dllimport) ++-# define __VOLK_PREFETCH(addr) ++-# define __VOLK_ASM __asm ++-# define __VOLK_VOLATILE +++#define __VOLK_ATTR_ALIGNED(x) __declspec(align(x)) +++#define __VOLK_ATTR_UNUSED +++#define __VOLK_ATTR_INLINE __forceinline +++#define __VOLK_ATTR_DEPRECATED __declspec(deprecated) +++#define __VOLK_ATTR_EXPORT __declspec(dllexport) +++#define __VOLK_ATTR_IMPORT __declspec(dllimport) +++#define __VOLK_PREFETCH(addr) +++#define __VOLK_ASM __asm +++#define __VOLK_VOLATILE ++ #elif defined(__clang__) ++ // AppleClang also defines __GNUC__, so do this check first. These ++ // will probably be the same as for __GNUC__, but let's keep them ++diff --git a/kernels/volk/volk_32f_exp_32f.h b/kernels/volk/volk_32f_exp_32f.h ++index 26fdf02..da4ada7 100644 ++--- a/kernels/volk/volk_32f_exp_32f.h +++++ b/kernels/volk/volk_32f_exp_32f.h ++@@ -92,9 +92,9 @@ ++ * \endcode ++ */ ++ ++-#include ++-#include ++ #include +++#include +++#include ++ ++ #ifndef INCLUDED_volk_32f_exp_32f_a_H ++ #define INCLUDED_volk_32f_exp_32f_a_H ++@@ -105,74 +105,74 @@ ++ static inline void ++ volk_32f_exp_32f_a_sse2(float* bVector, const float* aVector, unsigned int num_points) ++ { ++- float* bPtr = bVector; ++- const float* aPtr = aVector; ++- ++- unsigned int number = 0; ++- unsigned int quarterPoints = num_points / 4; ++- ++- // Declare variables and constants ++- __m128 aVal, bVal, tmp, fx, mask, pow2n, z, y; ++- __m128 one, exp_hi, exp_lo, log2EF, half, exp_C1, exp_C2; ++- __m128 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5; ++- __m128i emm0, pi32_0x7f; ++- ++- one = _mm_set1_ps(1.0); ++- exp_hi = _mm_set1_ps(88.3762626647949); ++- exp_lo = _mm_set1_ps(-88.3762626647949); ++- log2EF = _mm_set1_ps(1.44269504088896341); ++- half = _mm_set1_ps(0.5); ++- exp_C1 = _mm_set1_ps(0.693359375); ++- exp_C2 = _mm_set1_ps(-2.12194440e-4); ++- pi32_0x7f = _mm_set1_epi32(0x7f); ++- ++- exp_p0 = _mm_set1_ps(1.9875691500e-4); ++- exp_p1 = _mm_set1_ps(1.3981999507e-3); ++- exp_p2 = _mm_set1_ps(8.3334519073e-3); ++- exp_p3 = _mm_set1_ps(4.1665795894e-2); ++- exp_p4 = _mm_set1_ps(1.6666665459e-1); ++- exp_p5 = _mm_set1_ps(5.0000001201e-1); ++- ++- for(;number < quarterPoints; number++) { ++- aVal = _mm_load_ps(aPtr); ++- tmp = _mm_setzero_ps(); ++- ++- aVal = _mm_max_ps(_mm_min_ps(aVal, exp_hi), exp_lo); ++- ++- /* express exp(x) as exp(g + n*log(2)) */ ++- fx = _mm_add_ps(_mm_mul_ps(aVal, log2EF), half); ++- ++- emm0 = _mm_cvttps_epi32(fx); ++- tmp = _mm_cvtepi32_ps(emm0); ++- ++- mask = _mm_and_ps(_mm_cmpgt_ps(tmp, fx), one); ++- fx = _mm_sub_ps(tmp, mask); ++- ++- tmp = _mm_mul_ps(fx, exp_C1); ++- z = _mm_mul_ps(fx, exp_C2); ++- aVal = _mm_sub_ps(_mm_sub_ps(aVal, tmp), z); ++- z = _mm_mul_ps(aVal, aVal); ++- ++- y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(exp_p0, aVal), exp_p1), aVal); ++- y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, 
exp_p2), aVal), exp_p3); ++- y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(y, aVal), exp_p4), aVal); ++- y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p5), z), aVal); ++- y = _mm_add_ps(y, one); ++- ++- emm0 = _mm_slli_epi32(_mm_add_epi32(_mm_cvttps_epi32(fx), pi32_0x7f), 23); ++- ++- pow2n = _mm_castsi128_ps(emm0); ++- bVal = _mm_mul_ps(y, pow2n); ++- ++- _mm_store_ps(bPtr, bVal); ++- aPtr += 4; ++- bPtr += 4; ++- } ++- ++- number = quarterPoints * 4; ++- for(;number < num_points; number++) { ++- *bPtr++ = expf(*aPtr++); ++- } +++ float* bPtr = bVector; +++ const float* aPtr = aVector; +++ +++ unsigned int number = 0; +++ unsigned int quarterPoints = num_points / 4; +++ +++ // Declare variables and constants +++ __m128 aVal, bVal, tmp, fx, mask, pow2n, z, y; +++ __m128 one, exp_hi, exp_lo, log2EF, half, exp_C1, exp_C2; +++ __m128 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5; +++ __m128i emm0, pi32_0x7f; +++ +++ one = _mm_set1_ps(1.0); +++ exp_hi = _mm_set1_ps(88.3762626647949); +++ exp_lo = _mm_set1_ps(-88.3762626647949); +++ log2EF = _mm_set1_ps(1.44269504088896341); +++ half = _mm_set1_ps(0.5); +++ exp_C1 = _mm_set1_ps(0.693359375); +++ exp_C2 = _mm_set1_ps(-2.12194440e-4); +++ pi32_0x7f = _mm_set1_epi32(0x7f); +++ +++ exp_p0 = _mm_set1_ps(1.9875691500e-4); +++ exp_p1 = _mm_set1_ps(1.3981999507e-3); +++ exp_p2 = _mm_set1_ps(8.3334519073e-3); +++ exp_p3 = _mm_set1_ps(4.1665795894e-2); +++ exp_p4 = _mm_set1_ps(1.6666665459e-1); +++ exp_p5 = _mm_set1_ps(5.0000001201e-1); +++ +++ for (; number < quarterPoints; number++) { +++ aVal = _mm_load_ps(aPtr); +++ tmp = _mm_setzero_ps(); +++ +++ aVal = _mm_max_ps(_mm_min_ps(aVal, exp_hi), exp_lo); +++ +++ /* express exp(x) as exp(g + n*log(2)) */ +++ fx = _mm_add_ps(_mm_mul_ps(aVal, log2EF), half); +++ +++ emm0 = _mm_cvttps_epi32(fx); +++ tmp = _mm_cvtepi32_ps(emm0); +++ +++ mask = _mm_and_ps(_mm_cmpgt_ps(tmp, fx), one); +++ fx = _mm_sub_ps(tmp, mask); +++ +++ tmp = _mm_mul_ps(fx, exp_C1); +++ z = _mm_mul_ps(fx, exp_C2); +++ aVal = _mm_sub_ps(_mm_sub_ps(aVal, tmp), z); +++ z = _mm_mul_ps(aVal, aVal); +++ +++ y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(exp_p0, aVal), exp_p1), aVal); +++ y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p2), aVal), exp_p3); +++ y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(y, aVal), exp_p4), aVal); +++ y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p5), z), aVal); +++ y = _mm_add_ps(y, one); +++ +++ emm0 = _mm_slli_epi32(_mm_add_epi32(_mm_cvttps_epi32(fx), pi32_0x7f), 23); +++ +++ pow2n = _mm_castsi128_ps(emm0); +++ bVal = _mm_mul_ps(y, pow2n); +++ +++ _mm_store_ps(bPtr, bVal); +++ aPtr += 4; +++ bPtr += 4; +++ } +++ +++ number = quarterPoints * 4; +++ for (; number < num_points; number++) { +++ *bPtr++ = expf(*aPtr++); +++ } ++ } ++ ++ #endif /* LV_HAVE_SSE2 for aligned */ ++@@ -183,13 +183,13 @@ volk_32f_exp_32f_a_sse2(float* bVector, const float* aVector, unsigned int num_p ++ static inline void ++ volk_32f_exp_32f_a_generic(float* bVector, const float* aVector, unsigned int num_points) ++ { ++- float* bPtr = bVector; ++- const float* aPtr = aVector; ++- unsigned int number = 0; +++ float* bPtr = bVector; +++ const float* aPtr = aVector; +++ unsigned int number = 0; ++ ++- for(number = 0; number < num_points; number++) { ++- *bPtr++ = expf(*aPtr++); ++- } +++ for (number = 0; number < num_points; number++) { +++ *bPtr++ = expf(*aPtr++); +++ } ++ } ++ ++ #endif /* LV_HAVE_GENERIC */ ++@@ -205,75 +205,75 @@ volk_32f_exp_32f_a_generic(float* bVector, const float* aVector, unsigned int nu ++ static inline void ++ volk_32f_exp_32f_u_sse2(float* bVector, 
const float* aVector, unsigned int num_points) ++ { ++- float* bPtr = bVector; ++- const float* aPtr = aVector; ++- ++- unsigned int number = 0; ++- unsigned int quarterPoints = num_points / 4; ++- ++- // Declare variables and constants ++- __m128 aVal, bVal, tmp, fx, mask, pow2n, z, y; ++- __m128 one, exp_hi, exp_lo, log2EF, half, exp_C1, exp_C2; ++- __m128 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5; ++- __m128i emm0, pi32_0x7f; ++- ++- one = _mm_set1_ps(1.0); ++- exp_hi = _mm_set1_ps(88.3762626647949); ++- exp_lo = _mm_set1_ps(-88.3762626647949); ++- log2EF = _mm_set1_ps(1.44269504088896341); ++- half = _mm_set1_ps(0.5); ++- exp_C1 = _mm_set1_ps(0.693359375); ++- exp_C2 = _mm_set1_ps(-2.12194440e-4); ++- pi32_0x7f = _mm_set1_epi32(0x7f); ++- ++- exp_p0 = _mm_set1_ps(1.9875691500e-4); ++- exp_p1 = _mm_set1_ps(1.3981999507e-3); ++- exp_p2 = _mm_set1_ps(8.3334519073e-3); ++- exp_p3 = _mm_set1_ps(4.1665795894e-2); ++- exp_p4 = _mm_set1_ps(1.6666665459e-1); ++- exp_p5 = _mm_set1_ps(5.0000001201e-1); ++- ++- ++- for(;number < quarterPoints; number++) { ++- aVal = _mm_loadu_ps(aPtr); ++- tmp = _mm_setzero_ps(); ++- ++- aVal = _mm_max_ps(_mm_min_ps(aVal, exp_hi), exp_lo); ++- ++- /* express exp(x) as exp(g + n*log(2)) */ ++- fx = _mm_add_ps(_mm_mul_ps(aVal, log2EF), half); ++- ++- emm0 = _mm_cvttps_epi32(fx); ++- tmp = _mm_cvtepi32_ps(emm0); ++- ++- mask = _mm_and_ps(_mm_cmpgt_ps(tmp, fx), one); ++- fx = _mm_sub_ps(tmp, mask); ++- ++- tmp = _mm_mul_ps(fx, exp_C1); ++- z = _mm_mul_ps(fx, exp_C2); ++- aVal = _mm_sub_ps(_mm_sub_ps(aVal, tmp), z); ++- z = _mm_mul_ps(aVal, aVal); ++- ++- y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(exp_p0, aVal), exp_p1), aVal); ++- y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p2), aVal), exp_p3); ++- y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(y, aVal), exp_p4), aVal); ++- y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p5), z), aVal); ++- y = _mm_add_ps(y, one); ++- ++- emm0 = _mm_slli_epi32(_mm_add_epi32(_mm_cvttps_epi32(fx), pi32_0x7f), 23); ++- ++- pow2n = _mm_castsi128_ps(emm0); ++- bVal = _mm_mul_ps(y, pow2n); ++- ++- _mm_storeu_ps(bPtr, bVal); ++- aPtr += 4; ++- bPtr += 4; ++- } ++- ++- number = quarterPoints * 4; ++- for(;number < num_points; number++){ ++- *bPtr++ = expf(*aPtr++); ++- } +++ float* bPtr = bVector; +++ const float* aPtr = aVector; +++ +++ unsigned int number = 0; +++ unsigned int quarterPoints = num_points / 4; +++ +++ // Declare variables and constants +++ __m128 aVal, bVal, tmp, fx, mask, pow2n, z, y; +++ __m128 one, exp_hi, exp_lo, log2EF, half, exp_C1, exp_C2; +++ __m128 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5; +++ __m128i emm0, pi32_0x7f; +++ +++ one = _mm_set1_ps(1.0); +++ exp_hi = _mm_set1_ps(88.3762626647949); +++ exp_lo = _mm_set1_ps(-88.3762626647949); +++ log2EF = _mm_set1_ps(1.44269504088896341); +++ half = _mm_set1_ps(0.5); +++ exp_C1 = _mm_set1_ps(0.693359375); +++ exp_C2 = _mm_set1_ps(-2.12194440e-4); +++ pi32_0x7f = _mm_set1_epi32(0x7f); +++ +++ exp_p0 = _mm_set1_ps(1.9875691500e-4); +++ exp_p1 = _mm_set1_ps(1.3981999507e-3); +++ exp_p2 = _mm_set1_ps(8.3334519073e-3); +++ exp_p3 = _mm_set1_ps(4.1665795894e-2); +++ exp_p4 = _mm_set1_ps(1.6666665459e-1); +++ exp_p5 = _mm_set1_ps(5.0000001201e-1); +++ +++ +++ for (; number < quarterPoints; number++) { +++ aVal = _mm_loadu_ps(aPtr); +++ tmp = _mm_setzero_ps(); +++ +++ aVal = _mm_max_ps(_mm_min_ps(aVal, exp_hi), exp_lo); +++ +++ /* express exp(x) as exp(g + n*log(2)) */ +++ fx = _mm_add_ps(_mm_mul_ps(aVal, log2EF), half); +++ +++ emm0 = _mm_cvttps_epi32(fx); +++ tmp = _mm_cvtepi32_ps(emm0); 
+++ +++ mask = _mm_and_ps(_mm_cmpgt_ps(tmp, fx), one); +++ fx = _mm_sub_ps(tmp, mask); +++ +++ tmp = _mm_mul_ps(fx, exp_C1); +++ z = _mm_mul_ps(fx, exp_C2); +++ aVal = _mm_sub_ps(_mm_sub_ps(aVal, tmp), z); +++ z = _mm_mul_ps(aVal, aVal); +++ +++ y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(exp_p0, aVal), exp_p1), aVal); +++ y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p2), aVal), exp_p3); +++ y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(y, aVal), exp_p4), aVal); +++ y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p5), z), aVal); +++ y = _mm_add_ps(y, one); +++ +++ emm0 = _mm_slli_epi32(_mm_add_epi32(_mm_cvttps_epi32(fx), pi32_0x7f), 23); +++ +++ pow2n = _mm_castsi128_ps(emm0); +++ bVal = _mm_mul_ps(y, pow2n); +++ +++ _mm_storeu_ps(bPtr, bVal); +++ aPtr += 4; +++ bPtr += 4; +++ } +++ +++ number = quarterPoints * 4; +++ for (; number < num_points; number++) { +++ *bPtr++ = expf(*aPtr++); +++ } ++ } ++ ++ #endif /* LV_HAVE_SSE2 for unaligned */ ++@@ -284,13 +284,13 @@ volk_32f_exp_32f_u_sse2(float* bVector, const float* aVector, unsigned int num_p ++ static inline void ++ volk_32f_exp_32f_u_generic(float* bVector, const float* aVector, unsigned int num_points) ++ { ++- float* bPtr = bVector; ++- const float* aPtr = aVector; ++- unsigned int number = 0; +++ float* bPtr = bVector; +++ const float* aPtr = aVector; +++ unsigned int number = 0; ++ ++- for(number = 0; number < num_points; number++){ ++- *bPtr++ = expf(*aPtr++); ++- } +++ for (number = 0; number < num_points; number++) { +++ *bPtr++ = expf(*aPtr++); +++ } ++ } ++ ++ #endif /* LV_HAVE_GENERIC */ ++-- ++2.20.1 ++ diff --cc debian/patches/0006-Fix-the-broken-index-max-kernels.patch index 0000000,0000000..fdeed34 new file mode 100644 --- /dev/null +++ b/debian/patches/0006-Fix-the-broken-index-max-kernels.patch @@@ -1,0 -1,0 +1,882 @@@ ++From 67cbe6fe2aa73608a07c8c294313c42e8ff4d661 Mon Sep 17 00:00:00 2001 ++From: Clayton Smith ++Date: Sat, 21 Mar 2020 14:59:24 -0400 ++Subject: [PATCH 6/7] Fix the broken index max kernels ++ ++--- ++ kernels/volk/volk_32fc_index_max_16u.h | 299 ++++++------------------- ++ kernels/volk/volk_32fc_index_max_32u.h | 258 ++++++--------------- ++ 2 files changed, 128 insertions(+), 429 deletions(-) ++ ++diff --git a/kernels/volk/volk_32fc_index_max_16u.h b/kernels/volk/volk_32fc_index_max_16u.h ++index b9f9cfd..16e76cd 100644 ++--- a/kernels/volk/volk_32fc_index_max_16u.h +++++ b/kernels/volk/volk_32fc_index_max_16u.h ++@@ -1,6 +1,6 @@ ++ /* -*- c++ -*- */ ++ /* ++- * Copyright 2012, 2014 Free Software Foundation, Inc. +++ * Copyright 2012, 2014-2016, 2018-2020 Free Software Foundation, Inc. ++ * ++ * This file is part of GNU Radio ++ * ++@@ -36,8 +36,8 @@ ++ * ++ * Dispatcher Prototype ++ * \code ++- * void volk_32fc_index_max_16u(uint16_t* target, lv_32fc_t* src0, uint32_t num_points) ++- * \endcode +++ * void volk_32fc_index_max_16u(uint16_t* target, lv_32fc_t* src0, uint32_t +++ * num_points) \endcode ++ * ++ * \b Inputs ++ * \li src0: The complex input vector. ++@@ -89,33 +89,32 @@ static inline void ++ volk_32fc_index_max_16u_a_avx2(uint16_t* target, lv_32fc_t* src0, uint32_t num_points) ++ { ++ num_points = (num_points > USHRT_MAX) ? 
USHRT_MAX : num_points; ++- // Branchless version, if we think it'll make a difference ++- // num_points = USHRT_MAX ^ ((num_points ^ USHRT_MAX) & -(num_points < USHRT_MAX)); ++- ++ const uint32_t num_bytes = num_points * 8; ++ ++ union bit256 holderf; ++ union bit256 holderi; ++ float sq_dist = 0.0; +++ float max = 0.0; +++ uint16_t index = 0; ++ ++ union bit256 xmm5, xmm4; ++ __m256 xmm1, xmm2, xmm3; ++- __m256i xmm8, xmm11, xmm12, xmmfive, xmmfour, xmm9, holder0, holder1, xmm10; +++ __m256i xmm8, xmm11, xmm12, xmm9, xmm10; ++ ++- xmm5.int_vec = xmmfive = _mm256_setzero_si256(); ++- xmm4.int_vec = xmmfour = _mm256_setzero_si256(); ++- holderf.int_vec = holder0 = _mm256_setzero_si256(); ++- holderi.int_vec = holder1 = _mm256_setzero_si256(); +++ xmm5.int_vec = _mm256_setzero_si256(); +++ xmm4.int_vec = _mm256_setzero_si256(); +++ holderf.int_vec = _mm256_setzero_si256(); +++ holderi.int_vec = _mm256_setzero_si256(); ++ ++ int bound = num_bytes >> 6; ++ int i = 0; ++ ++- xmm8 = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0); ++- xmm9 = _mm256_setzero_si256(); //=xmm8 +++ xmm8 = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7); +++ xmm9 = _mm256_setzero_si256(); ++ xmm10 = _mm256_set1_epi32(8); ++ xmm3 = _mm256_setzero_ps(); ++ ++- __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0); +++ __m256i idx = _mm256_setr_epi32(0, 1, 4, 5, 2, 3, 6, 7); ++ for (; i < bound; ++i) { ++ xmm1 = _mm256_load_ps((float*)src0); ++ xmm2 = _mm256_load_ps((float*)&src0[4]); ++@@ -140,105 +139,27 @@ volk_32fc_index_max_16u_a_avx2(uint16_t* target, lv_32fc_t* src0, uint32_t num_p ++ ++ xmm8 = _mm256_add_epi32(xmm8, xmm10); ++ } ++- xmm10 = _mm256_set1_epi32(4); ++- if (num_bytes >> 5 & 1) { ++- xmm1 = _mm256_load_ps((float*)src0); ++- ++- src0 += 4; ++- ++- xmm1 = _mm256_mul_ps(xmm1, xmm1); ++ ++- xmm1 = _mm256_hadd_ps(xmm1, xmm1); ++- xmm1 = _mm256_permutevar8x32_ps(xmm1, idx); ++- ++- xmm3 = _mm256_max_ps(xmm1, xmm3); ++- ++- xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS); ++- xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ); ++- ++- xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec); ++- xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec); ++- ++- xmm9 = _mm256_add_epi32(xmm11, xmm12); +++ _mm256_store_ps((float*)&(holderf.f), xmm3); +++ _mm256_store_si256(&(holderi.int_vec), xmm9); ++ ++- xmm8 = _mm256_add_epi32(xmm8, xmm10); +++ for (i = 0; i < 8; i++) { +++ if (holderf.f[i] > max) { +++ index = holderi.i[i]; +++ max = holderf.f[i]; +++ } ++ } ++ ++- idx = _mm256_set_epi32(1, 0, 1, 0, 1, 0, 1, 0); ++- xmm10 = _mm256_set1_epi32(2); ++- if (num_bytes >> 4 & 1) { ++- xmm2 = _mm256_load_ps((float*)src0); ++- ++- xmm1 = _mm256_permutevar8x32_ps(bit256_p(&xmm8)->float_vec, idx); ++- xmm8 = bit256_p(&xmm1)->int_vec; ++- ++- xmm2 = _mm256_mul_ps(xmm2, xmm2); ++- ++- src0 += 2; ++- ++- xmm1 = _mm256_hadd_ps(xmm2, xmm2); ++- ++- xmm3 = _mm256_max_ps(xmm1, xmm3); ++- ++- xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS); ++- xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ); ++- ++- xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec); ++- xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec); ++- ++- xmm9 = _mm256_add_epi32(xmm11, xmm12); +++ for (i = bound * 8; i < num_points; i++, src0++) { +++ sq_dist = +++ lv_creal(src0[0]) * lv_creal(src0[0]) + lv_cimag(src0[0]) * lv_cimag(src0[0]); ++ ++- xmm8 = _mm256_add_epi32(xmm8, xmm10); +++ if (sq_dist > max) { +++ index = i; +++ max = sq_dist; +++ } ++ } ++- ++- /* ++- idx = _mm256_setzero_si256(); ++- for(i = 0; i < leftovers2; ++i) { ++- //printf("%u, %u, %u, %u\n", 
((uint32_t*)&xmm9)[0], ((uint32_t*)&xmm9)[1], ++- ((uint32_t*)&xmm9)[2], ((uint32_t*)&xmm9)[3]); ++- ++- sq_dist = lv_creal(src0[0]) * lv_creal(src0[0]) + lv_cimag(src0[0]) * ++- lv_cimag(src0[0]); ++- ++- //xmm = _mm_load1_ps(&sq_dist);//insert? ++- xmm2 = _mm256_set1_ps(sq_dist); ++- //xmm2 = _mm256_insertf128_ps(xmm2, xmm, 0); ++- ++- xmm1 = xmm3; ++- ++- xmm3 = _mm256_max_ps(xmm3, xmm2);//only lowest 32bit value ++- xmm3 = _mm256_permutevar8x32_ps(xmm3, idx); ++- ++- xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS); ++- xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ); ++- ++- xmm8 = _mm256_permutevar8x32_epi32(xmm8, idx); ++- ++- xmm11 = _mm256_and_si256(xmm8, xmm4.int_vec); ++- xmm12 = _mm256_and_si256(xmm9, xmm5.int_vec); ++- ++- xmm9 = _mm256_add_epi32(xmm11, xmm12); ++- }*/ ++- ++- _mm256_store_ps((float*)&(holderf.f), xmm3); ++- _mm256_store_si256(&(holderi.int_vec), xmm9); ++- ++- target[0] = holderi.i[0]; ++- sq_dist = holderf.f[0]; ++- target[0] = (holderf.f[1] > sq_dist) ? holderi.i[1] : target[0]; ++- sq_dist = (holderf.f[1] > sq_dist) ? holderf.f[1] : sq_dist; ++- target[0] = (holderf.f[2] > sq_dist) ? holderi.i[2] : target[0]; ++- sq_dist = (holderf.f[2] > sq_dist) ? holderf.f[2] : sq_dist; ++- target[0] = (holderf.f[3] > sq_dist) ? holderi.i[3] : target[0]; ++- sq_dist = (holderf.f[3] > sq_dist) ? holderf.f[3] : sq_dist; ++- target[0] = (holderf.f[4] > sq_dist) ? holderi.i[4] : target[0]; ++- sq_dist = (holderf.f[4] > sq_dist) ? holderf.f[4] : sq_dist; ++- target[0] = (holderf.f[5] > sq_dist) ? holderi.i[5] : target[0]; ++- sq_dist = (holderf.f[5] > sq_dist) ? holderf.f[5] : sq_dist; ++- target[0] = (holderf.f[6] > sq_dist) ? holderi.i[6] : target[0]; ++- sq_dist = (holderf.f[6] > sq_dist) ? holderf.f[6] : sq_dist; ++- target[0] = (holderf.f[7] > sq_dist) ? holderi.i[7] : target[0]; ++- sq_dist = (holderf.f[7] > sq_dist) ? holderf.f[7] : sq_dist; +++ target[0] = index; ++ } ++ ++ #endif /*LV_HAVE_AVX2*/ ++@@ -251,9 +172,6 @@ static inline void ++ volk_32fc_index_max_16u_a_sse3(uint16_t* target, lv_32fc_t* src0, uint32_t num_points) ++ { ++ num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points; ++- // Branchless version, if we think it'll make a difference ++- // num_points = USHRT_MAX ^ ((num_points ^ USHRT_MAX) & -(num_points < USHRT_MAX)); ++- ++ const uint32_t num_bytes = num_points * 8; ++ ++ union bit128 holderf; ++@@ -262,22 +180,20 @@ volk_32fc_index_max_16u_a_sse3(uint16_t* target, lv_32fc_t* src0, uint32_t num_p ++ ++ union bit128 xmm5, xmm4; ++ __m128 xmm1, xmm2, xmm3; ++- __m128i xmm8, xmm11, xmm12, xmmfive, xmmfour, xmm9, holder0, holder1, xmm10; +++ __m128i xmm8, xmm11, xmm12, xmm9, xmm10; ++ ++- xmm5.int_vec = xmmfive = _mm_setzero_si128(); ++- xmm4.int_vec = xmmfour = _mm_setzero_si128(); ++- holderf.int_vec = holder0 = _mm_setzero_si128(); ++- holderi.int_vec = holder1 = _mm_setzero_si128(); +++ xmm5.int_vec = _mm_setzero_si128(); +++ xmm4.int_vec = _mm_setzero_si128(); +++ holderf.int_vec = _mm_setzero_si128(); +++ holderi.int_vec = _mm_setzero_si128(); ++ ++ int bound = num_bytes >> 5; ++ int i = 0; ++ ++- xmm8 = _mm_set_epi32(3, 2, 1, 0); // remember the crazy reverse order! 
+++ xmm8 = _mm_setr_epi32(0, 1, 2, 3); ++ xmm9 = _mm_setzero_si128(); ++- xmm10 = _mm_set_epi32(4, 4, 4, 4); +++ xmm10 = _mm_setr_epi32(4, 4, 4, 4); ++ xmm3 = _mm_setzero_ps(); ++- // printf("%f, %f, %f, %f\n", ((float*)&xmm10)[0], ((float*)&xmm10)[1], ++- // ((float*)&xmm10)[2], ((float*)&xmm10)[3]); ++ ++ for (; i < bound; ++i) { ++ xmm1 = _mm_load_ps((float*)src0); ++@@ -301,14 +217,8 @@ volk_32fc_index_max_16u_a_sse3(uint16_t* target, lv_32fc_t* src0, uint32_t num_p ++ xmm9 = _mm_add_epi32(xmm11, xmm12); ++ ++ xmm8 = _mm_add_epi32(xmm8, xmm10); ++- ++- // printf("%f, %f, %f, %f\n", ((float*)&xmm3)[0], ((float*)&xmm3)[1], ++- // ((float*)&xmm3)[2], ((float*)&xmm3)[3]); printf("%u, %u, %u, %u\n", ++- // ((uint32_t*)&xmm10)[0], ((uint32_t*)&xmm10)[1], ((uint32_t*)&xmm10)[2], ++- // ((uint32_t*)&xmm10)[3]); ++ } ++ ++- ++ if (num_bytes >> 4 & 1) { ++ xmm2 = _mm_load_ps((float*)src0); ++ ++@@ -323,7 +233,7 @@ volk_32fc_index_max_16u_a_sse3(uint16_t* target, lv_32fc_t* src0, uint32_t num_p ++ ++ xmm3 = _mm_max_ps(xmm1, xmm3); ++ ++- xmm10 = _mm_set_epi32(2, 2, 2, 2); // load1_ps((float*)&init[2]); +++ xmm10 = _mm_setr_epi32(2, 2, 2, 2); ++ ++ xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3); ++ xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3); ++@@ -334,14 +244,9 @@ volk_32fc_index_max_16u_a_sse3(uint16_t* target, lv_32fc_t* src0, uint32_t num_p ++ xmm9 = _mm_add_epi32(xmm11, xmm12); ++ ++ xmm8 = _mm_add_epi32(xmm8, xmm10); ++- // printf("egads%u, %u, %u, %u\n", ((uint32_t*)&xmm9)[0], ((uint32_t*)&xmm9)[1], ++- // ((uint32_t*)&xmm9)[2], ((uint32_t*)&xmm9)[3]); ++ } ++ ++ if (num_bytes >> 3 & 1) { ++- // printf("%u, %u, %u, %u\n", ((uint32_t*)&xmm9)[0], ((uint32_t*)&xmm9)[1], ++- // ((uint32_t*)&xmm9)[2], ((uint32_t*)&xmm9)[3]); ++- ++ sq_dist = ++ lv_creal(src0[0]) * lv_creal(src0[0]) + lv_cimag(src0[0]) * lv_cimag(src0[0]); ++ ++@@ -362,11 +267,6 @@ volk_32fc_index_max_16u_a_sse3(uint16_t* target, lv_32fc_t* src0, uint32_t num_p ++ xmm9 = _mm_add_epi32(xmm11, xmm12); ++ } ++ ++- // printf("%f, %f, %f, %f\n", ((float*)&xmm3)[0], ((float*)&xmm3)[1], ++- // ((float*)&xmm3)[2], ((float*)&xmm3)[3]); printf("%u, %u, %u, %u\n", ++- // ((uint32_t*)&xmm9)[0], ((uint32_t*)&xmm9)[1], ((uint32_t*)&xmm9)[2], ++- // ((uint32_t*)&xmm9)[3]); ++- ++ _mm_store_ps((float*)&(holderf.f), xmm3); ++ _mm_store_si128(&(holderi.int_vec), xmm9); ++ ++@@ -378,25 +278,6 @@ volk_32fc_index_max_16u_a_sse3(uint16_t* target, lv_32fc_t* src0, uint32_t num_p ++ sq_dist = (holderf.f[2] > sq_dist) ? holderf.f[2] : sq_dist; ++ target[0] = (holderf.f[3] > sq_dist) ? holderi.i[3] : target[0]; ++ sq_dist = (holderf.f[3] > sq_dist) ? holderf.f[3] : sq_dist; ++- ++- /* ++- float placeholder = 0.0; ++- uint32_t temp0, temp1; ++- uint32_t g0 = (((float*)&xmm3)[0] > ((float*)&xmm3)[1]); ++- uint32_t l0 = g0 ^ 1; ++- ++- uint32_t g1 = (((float*)&xmm3)[1] > ((float*)&xmm3)[2]); ++- uint32_t l1 = g1 ^ 1; ++- ++- temp0 = g0 * ((uint32_t*)&xmm9)[0] + l0 * ((uint32_t*)&xmm9)[1]; ++- temp1 = g0 * ((uint32_t*)&xmm9)[2] + l0 * ((uint32_t*)&xmm9)[3]; ++- sq_dist = g0 * ((float*)&xmm3)[0] + l0 * ((float*)&xmm3)[1]; ++- placeholder = g0 * ((float*)&xmm3)[2] + l0 * ((float*)&xmm3)[3]; ++- ++- g0 = (sq_dist > placeholder); ++- l0 = g0 ^ 1; ++- target[0] = g0 * temp0 + l0 * temp1; ++- */ ++ } ++ ++ #endif /*LV_HAVE_SSE3*/ ++@@ -419,18 +300,18 @@ volk_32fc_index_max_16u_generic(uint16_t* target, lv_32fc_t* src0, uint32_t num_ ++ sq_dist = ++ lv_creal(src0[i]) * lv_creal(src0[i]) + lv_cimag(src0[i]) * lv_cimag(src0[i]); ++ ++- index = sq_dist > max ? 
i : index; ++- max = sq_dist > max ? sq_dist : max; +++ if (sq_dist > max) { +++ index = i; +++ max = sq_dist; +++ } ++ } ++ target[0] = index; ++ } ++ ++ #endif /*LV_HAVE_GENERIC*/ ++ ++- ++ #endif /*INCLUDED_volk_32fc_index_max_16u_a_H*/ ++ ++- ++ #ifndef INCLUDED_volk_32fc_index_max_16u_u_H ++ #define INCLUDED_volk_32fc_index_max_16u_u_H ++ ++@@ -447,33 +328,32 @@ static inline void ++ volk_32fc_index_max_16u_u_avx2(uint16_t* target, lv_32fc_t* src0, uint32_t num_points) ++ { ++ num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points; ++- // Branchless version, if we think it'll make a difference ++- // num_points = USHRT_MAX ^ ((num_points ^ USHRT_MAX) & -(num_points < USHRT_MAX)); ++- ++ const uint32_t num_bytes = num_points * 8; ++ ++ union bit256 holderf; ++ union bit256 holderi; ++ float sq_dist = 0.0; +++ float max = 0.0; +++ uint16_t index = 0; ++ ++ union bit256 xmm5, xmm4; ++ __m256 xmm1, xmm2, xmm3; ++- __m256i xmm8, xmm11, xmm12, xmmfive, xmmfour, xmm9, holder0, holder1, xmm10; +++ __m256i xmm8, xmm11, xmm12, xmm9, xmm10; ++ ++- xmm5.int_vec = xmmfive = _mm256_setzero_si256(); ++- xmm4.int_vec = xmmfour = _mm256_setzero_si256(); ++- holderf.int_vec = holder0 = _mm256_setzero_si256(); ++- holderi.int_vec = holder1 = _mm256_setzero_si256(); +++ xmm5.int_vec = _mm256_setzero_si256(); +++ xmm4.int_vec = _mm256_setzero_si256(); +++ holderf.int_vec = _mm256_setzero_si256(); +++ holderi.int_vec = _mm256_setzero_si256(); ++ ++ int bound = num_bytes >> 6; ++ int i = 0; ++ ++- xmm8 = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0); ++- xmm9 = _mm256_setzero_si256(); //=xmm8 +++ xmm8 = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7); +++ xmm9 = _mm256_setzero_si256(); ++ xmm10 = _mm256_set1_epi32(8); ++ xmm3 = _mm256_setzero_ps(); ++ ++- __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0); +++ __m256i idx = _mm256_setr_epi32(0, 1, 4, 5, 2, 3, 6, 7); ++ for (; i < bound; ++i) { ++ xmm1 = _mm256_loadu_ps((float*)src0); ++ xmm2 = _mm256_loadu_ps((float*)&src0[4]); ++@@ -498,76 +378,27 @@ volk_32fc_index_max_16u_u_avx2(uint16_t* target, lv_32fc_t* src0, uint32_t num_p ++ ++ xmm8 = _mm256_add_epi32(xmm8, xmm10); ++ } ++- xmm10 = _mm256_set1_epi32(4); ++- if (num_bytes >> 5 & 1) { ++- xmm1 = _mm256_loadu_ps((float*)src0); ++- ++- src0 += 4; ++ ++- xmm1 = _mm256_mul_ps(xmm1, xmm1); ++- ++- xmm1 = _mm256_hadd_ps(xmm1, xmm1); ++- xmm1 = _mm256_permutevar8x32_ps(xmm1, idx); ++- ++- xmm3 = _mm256_max_ps(xmm1, xmm3); ++- ++- xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS); ++- xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ); ++- ++- xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec); ++- xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec); ++- ++- xmm9 = _mm256_add_epi32(xmm11, xmm12); +++ _mm256_storeu_ps((float*)&(holderf.f), xmm3); +++ _mm256_storeu_si256(&(holderi.int_vec), xmm9); ++ ++- xmm8 = _mm256_add_epi32(xmm8, xmm10); +++ for (i = 0; i < 8; i++) { +++ if (holderf.f[i] > max) { +++ index = holderi.i[i]; +++ max = holderf.f[i]; +++ } ++ } ++ ++- idx = _mm256_set_epi32(1, 0, 1, 0, 1, 0, 1, 0); ++- xmm10 = _mm256_set1_epi32(2); ++- if (num_bytes >> 4 & 1) { ++- xmm2 = _mm256_loadu_ps((float*)src0); ++- ++- xmm1 = _mm256_permutevar8x32_ps(bit256_p(&xmm8)->float_vec, idx); ++- xmm8 = bit256_p(&xmm1)->int_vec; ++- ++- xmm2 = _mm256_mul_ps(xmm2, xmm2); ++- ++- src0 += 2; ++- ++- xmm1 = _mm256_hadd_ps(xmm2, xmm2); ++- ++- xmm3 = _mm256_max_ps(xmm1, xmm3); ++- ++- xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS); ++- xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ); ++- ++- xmm11 = 
_mm256_and_si256(xmm8, xmm5.int_vec); ++- xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec); ++- ++- xmm9 = _mm256_add_epi32(xmm11, xmm12); +++ for (i = bound * 8; i < num_points; i++, src0++) { +++ sq_dist = +++ lv_creal(src0[0]) * lv_creal(src0[0]) + lv_cimag(src0[0]) * lv_cimag(src0[0]); ++ ++- xmm8 = _mm256_add_epi32(xmm8, xmm10); +++ if (sq_dist > max) { +++ index = i; +++ max = sq_dist; +++ } ++ } ++- ++- _mm256_storeu_ps((float*)&(holderf.f), xmm3); ++- _mm256_storeu_si256(&(holderi.int_vec), xmm9); ++- ++- target[0] = holderi.i[0]; ++- sq_dist = holderf.f[0]; ++- target[0] = (holderf.f[1] > sq_dist) ? holderi.i[1] : target[0]; ++- sq_dist = (holderf.f[1] > sq_dist) ? holderf.f[1] : sq_dist; ++- target[0] = (holderf.f[2] > sq_dist) ? holderi.i[2] : target[0]; ++- sq_dist = (holderf.f[2] > sq_dist) ? holderf.f[2] : sq_dist; ++- target[0] = (holderf.f[3] > sq_dist) ? holderi.i[3] : target[0]; ++- sq_dist = (holderf.f[3] > sq_dist) ? holderf.f[3] : sq_dist; ++- target[0] = (holderf.f[4] > sq_dist) ? holderi.i[4] : target[0]; ++- sq_dist = (holderf.f[4] > sq_dist) ? holderf.f[4] : sq_dist; ++- target[0] = (holderf.f[5] > sq_dist) ? holderi.i[5] : target[0]; ++- sq_dist = (holderf.f[5] > sq_dist) ? holderf.f[5] : sq_dist; ++- target[0] = (holderf.f[6] > sq_dist) ? holderi.i[6] : target[0]; ++- sq_dist = (holderf.f[6] > sq_dist) ? holderf.f[6] : sq_dist; ++- target[0] = (holderf.f[7] > sq_dist) ? holderi.i[7] : target[0]; ++- sq_dist = (holderf.f[7] > sq_dist) ? holderf.f[7] : sq_dist; +++ target[0] = index; ++ } ++ ++ #endif /*LV_HAVE_AVX2*/ ++diff --git a/kernels/volk/volk_32fc_index_max_32u.h b/kernels/volk/volk_32fc_index_max_32u.h ++index 7756fc6..556b5fc 100644 ++--- a/kernels/volk/volk_32fc_index_max_32u.h +++++ b/kernels/volk/volk_32fc_index_max_32u.h ++@@ -1,6 +1,6 @@ ++ /* -*- c++ -*- */ ++ /* ++- * Copyright 2016 Free Software Foundation, Inc. +++ * Copyright 2016, 2018-2020 Free Software Foundation, Inc. ++ * ++ * This file is part of GNU Radio ++ * ++@@ -30,8 +30,8 @@ ++ * ++ * Dispatcher Prototype ++ * \code ++- * void volk_32fc_index_max_32u(uint32_t* target, lv_32fc_t* src0, uint32_t num_points) ++- * \endcode +++ * void volk_32fc_index_max_32u(uint32_t* target, lv_32fc_t* src0, uint32_t +++ * num_points) \endcode ++ * ++ * \b Inputs ++ * \li src0: The complex input vector. 
++@@ -86,24 +86,26 @@ volk_32fc_index_max_32u_a_avx2(uint32_t* target, lv_32fc_t* src0, uint32_t num_p ++ union bit256 holderf; ++ union bit256 holderi; ++ float sq_dist = 0.0; +++ float max = 0.0; +++ uint32_t index = 0; ++ ++ union bit256 xmm5, xmm4; ++ __m256 xmm1, xmm2, xmm3; ++- __m256i xmm8, xmm11, xmm12, xmmfive, xmmfour, xmm9, holder0, holder1, xmm10; +++ __m256i xmm8, xmm11, xmm12, xmm9, xmm10; ++ ++- xmm5.int_vec = xmmfive = _mm256_setzero_si256(); ++- xmm4.int_vec = xmmfour = _mm256_setzero_si256(); ++- holderf.int_vec = holder0 = _mm256_setzero_si256(); ++- holderi.int_vec = holder1 = _mm256_setzero_si256(); +++ xmm5.int_vec = _mm256_setzero_si256(); +++ xmm4.int_vec = _mm256_setzero_si256(); +++ holderf.int_vec = _mm256_setzero_si256(); +++ holderi.int_vec = _mm256_setzero_si256(); ++ ++ int bound = num_bytes >> 6; ++ int i = 0; ++ ++- xmm8 = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0); +++ xmm8 = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7); ++ xmm9 = _mm256_setzero_si256(); ++ xmm10 = _mm256_set1_epi32(8); ++ xmm3 = _mm256_setzero_ps(); ++- __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0); +++ __m256i idx = _mm256_setr_epi32(0, 1, 4, 5, 2, 3, 6, 7); ++ ++ for (; i < bound; ++i) { ++ xmm1 = _mm256_load_ps((float*)src0); ++@@ -130,75 +132,26 @@ volk_32fc_index_max_32u_a_avx2(uint32_t* target, lv_32fc_t* src0, uint32_t num_p ++ xmm8 = _mm256_add_epi32(xmm8, xmm10); ++ } ++ ++- xmm10 = _mm256_set1_epi32(4); ++- if (num_bytes >> 4 & 1) { ++- xmm1 = _mm256_load_ps((float*)src0); ++- ++- xmm1 = _mm256_mul_ps(xmm1, xmm1); ++- ++- src0 += 4; ++- ++- xmm1 = _mm256_hadd_ps(xmm1, xmm1); ++- ++- xmm3 = _mm256_max_ps(xmm1, xmm3); ++- ++- xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS); ++- xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ); ++- ++- xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec); ++- xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec); ++- ++- xmm9 = _mm256_add_epi32(xmm11, xmm12); +++ _mm256_store_ps((float*)&(holderf.f), xmm3); +++ _mm256_store_si256(&(holderi.int_vec), xmm9); ++ ++- xmm8 = _mm256_add_epi32(xmm8, xmm10); +++ for (i = 0; i < 8; i++) { +++ if (holderf.f[i] > max) { +++ index = holderi.i[i]; +++ max = holderf.f[i]; +++ } ++ } ++ ++- idx = _mm256_set_epi32(1, 0, 1, 0, 1, 0, 1, 0); ++- xmm10 = _mm256_set1_epi32(2); ++- if (num_bytes >> 4 & 1) { ++- xmm2 = _mm256_load_ps((float*)src0); ++- ++- xmm1 = _mm256_permutevar8x32_ps(bit256_p(&xmm8)->float_vec, idx); ++- xmm8 = bit256_p(&xmm1)->int_vec; ++- ++- xmm2 = _mm256_mul_ps(xmm2, xmm2); ++- ++- src0 += 2; ++- ++- xmm1 = _mm256_hadd_ps(xmm2, xmm2); ++- ++- xmm3 = _mm256_max_ps(xmm1, xmm3); ++- ++- xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS); ++- xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ); ++- ++- xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec); ++- xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec); ++- ++- xmm9 = _mm256_add_epi32(xmm11, xmm12); +++ for (i = bound * 8; i < num_points; i++, src0++) { +++ sq_dist = +++ lv_creal(src0[0]) * lv_creal(src0[0]) + lv_cimag(src0[0]) * lv_cimag(src0[0]); ++ ++- xmm8 = _mm256_add_epi32(xmm8, xmm10); +++ if (sq_dist > max) { +++ index = i; +++ max = sq_dist; +++ } ++ } ++- ++- _mm256_store_ps((float*)&(holderf.f), xmm3); ++- _mm256_store_si256(&(holderi.int_vec), xmm9); ++- ++- target[0] = holderi.i[0]; ++- sq_dist = holderf.f[0]; ++- target[0] = (holderf.f[1] > sq_dist) ? holderi.i[1] : target[0]; ++- sq_dist = (holderf.f[1] > sq_dist) ? holderf.f[1] : sq_dist; ++- target[0] = (holderf.f[2] > sq_dist) ? 
holderi.i[2] : target[0]; ++- sq_dist = (holderf.f[2] > sq_dist) ? holderf.f[2] : sq_dist; ++- target[0] = (holderf.f[3] > sq_dist) ? holderi.i[3] : target[0]; ++- sq_dist = (holderf.f[3] > sq_dist) ? holderf.f[3] : sq_dist; ++- target[0] = (holderf.f[4] > sq_dist) ? holderi.i[4] : target[0]; ++- sq_dist = (holderf.f[4] > sq_dist) ? holderf.f[4] : sq_dist; ++- target[0] = (holderf.f[5] > sq_dist) ? holderi.i[5] : target[0]; ++- sq_dist = (holderf.f[5] > sq_dist) ? holderf.f[5] : sq_dist; ++- target[0] = (holderf.f[6] > sq_dist) ? holderi.i[6] : target[0]; ++- sq_dist = (holderf.f[6] > sq_dist) ? holderf.f[6] : sq_dist; ++- target[0] = (holderf.f[7] > sq_dist) ? holderi.i[7] : target[0]; ++- sq_dist = (holderf.f[7] > sq_dist) ? holderf.f[7] : sq_dist; +++ target[0] = index; ++ } ++ ++ #endif /*LV_HAVE_AVX2*/ ++@@ -218,24 +171,21 @@ volk_32fc_index_max_32u_a_sse3(uint32_t* target, lv_32fc_t* src0, uint32_t num_p ++ ++ union bit128 xmm5, xmm4; ++ __m128 xmm1, xmm2, xmm3; ++- __m128i xmm8, xmm11, xmm12, xmmfive, xmmfour, xmm9, holder0, holder1, xmm10; +++ __m128i xmm8, xmm11, xmm12, xmm9, xmm10; ++ ++- xmm5.int_vec = xmmfive = _mm_setzero_si128(); ++- xmm4.int_vec = xmmfour = _mm_setzero_si128(); ++- holderf.int_vec = holder0 = _mm_setzero_si128(); ++- holderi.int_vec = holder1 = _mm_setzero_si128(); +++ xmm5.int_vec = _mm_setzero_si128(); +++ xmm4.int_vec = _mm_setzero_si128(); +++ holderf.int_vec = _mm_setzero_si128(); +++ holderi.int_vec = _mm_setzero_si128(); ++ ++ int bound = num_bytes >> 5; ++ int i = 0; ++ ++- xmm8 = _mm_set_epi32(3, 2, 1, 0); // remember the crazy reverse order! +++ xmm8 = _mm_setr_epi32(0, 1, 2, 3); ++ xmm9 = _mm_setzero_si128(); ++- xmm10 = _mm_set_epi32(4, 4, 4, 4); +++ xmm10 = _mm_setr_epi32(4, 4, 4, 4); ++ xmm3 = _mm_setzero_ps(); ++ ++- // printf("%f, %f, %f, %f\n", ((float*)&xmm10)[0], ((float*)&xmm10)[1], ++- // ((float*)&xmm10)[2], ((float*)&xmm10)[3]); ++- ++ for (; i < bound; ++i) { ++ xmm1 = _mm_load_ps((float*)src0); ++ xmm2 = _mm_load_ps((float*)&src0[2]); ++@@ -258,14 +208,8 @@ volk_32fc_index_max_32u_a_sse3(uint32_t* target, lv_32fc_t* src0, uint32_t num_p ++ xmm9 = _mm_add_epi32(xmm11, xmm12); ++ ++ xmm8 = _mm_add_epi32(xmm8, xmm10); ++- ++- // printf("%f, %f, %f, %f\n", ((float*)&xmm3)[0], ((float*)&xmm3)[1], ++- // ((float*)&xmm3)[2], ((float*)&xmm3)[3]); printf("%u, %u, %u, %u\n", ++- // ((uint32_t*)&xmm10)[0], ((uint32_t*)&xmm10)[1], ((uint32_t*)&xmm10)[2], ++- // ((uint32_t*)&xmm10)[3]); ++ } ++ ++- ++ if (num_bytes >> 4 & 1) { ++ xmm2 = _mm_load_ps((float*)src0); ++ ++@@ -280,7 +224,7 @@ volk_32fc_index_max_32u_a_sse3(uint32_t* target, lv_32fc_t* src0, uint32_t num_p ++ ++ xmm3 = _mm_max_ps(xmm1, xmm3); ++ ++- xmm10 = _mm_set_epi32(2, 2, 2, 2); // load1_ps((float*)&init[2]); +++ xmm10 = _mm_setr_epi32(2, 2, 2, 2); ++ ++ xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3); ++ xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3); ++@@ -291,14 +235,9 @@ volk_32fc_index_max_32u_a_sse3(uint32_t* target, lv_32fc_t* src0, uint32_t num_p ++ xmm9 = _mm_add_epi32(xmm11, xmm12); ++ ++ xmm8 = _mm_add_epi32(xmm8, xmm10); ++- // printf("egads%u, %u, %u, %u\n", ((uint32_t*)&xmm9)[0], ((uint32_t*)&xmm9)[1], ++- // ((uint32_t*)&xmm9)[2], ((uint32_t*)&xmm9)[3]); ++ } ++ ++ if (num_bytes >> 3 & 1) { ++- // printf("%u, %u, %u, %u\n", ((uint32_t*)&xmm9)[0], ((uint32_t*)&xmm9)[1], ++- // ((uint32_t*)&xmm9)[2], ((uint32_t*)&xmm9)[3]); ++- ++ sq_dist = ++ lv_creal(src0[0]) * lv_creal(src0[0]) + lv_cimag(src0[0]) * lv_cimag(src0[0]); ++ ++@@ -319,11 +258,6 @@ 
volk_32fc_index_max_32u_a_sse3(uint32_t* target, lv_32fc_t* src0, uint32_t num_p ++ xmm9 = _mm_add_epi32(xmm11, xmm12); ++ } ++ ++- // printf("%f, %f, %f, %f\n", ((float*)&xmm3)[0], ((float*)&xmm3)[1], ++- // ((float*)&xmm3)[2], ((float*)&xmm3)[3]); printf("%u, %u, %u, %u\n", ++- // ((uint32_t*)&xmm9)[0], ((uint32_t*)&xmm9)[1], ((uint32_t*)&xmm9)[2], ++- // ((uint32_t*)&xmm9)[3]); ++- ++ _mm_store_ps((float*)&(holderf.f), xmm3); ++ _mm_store_si128(&(holderi.int_vec), xmm9); ++ ++@@ -335,25 +269,6 @@ volk_32fc_index_max_32u_a_sse3(uint32_t* target, lv_32fc_t* src0, uint32_t num_p ++ sq_dist = (holderf.f[2] > sq_dist) ? holderf.f[2] : sq_dist; ++ target[0] = (holderf.f[3] > sq_dist) ? holderi.i[3] : target[0]; ++ sq_dist = (holderf.f[3] > sq_dist) ? holderf.f[3] : sq_dist; ++- ++- /* ++- float placeholder = 0.0; ++- uint32_t temp0, temp1; ++- uint32_t g0 = (((float*)&xmm3)[0] > ((float*)&xmm3)[1]); ++- uint32_t l0 = g0 ^ 1; ++- ++- uint32_t g1 = (((float*)&xmm3)[1] > ((float*)&xmm3)[2]); ++- uint32_t l1 = g1 ^ 1; ++- ++- temp0 = g0 * ((uint32_t*)&xmm9)[0] + l0 * ((uint32_t*)&xmm9)[1]; ++- temp1 = g0 * ((uint32_t*)&xmm9)[2] + l0 * ((uint32_t*)&xmm9)[3]; ++- sq_dist = g0 * ((float*)&xmm3)[0] + l0 * ((float*)&xmm3)[1]; ++- placeholder = g0 * ((float*)&xmm3)[2] + l0 * ((float*)&xmm3)[3]; ++- ++- g0 = (sq_dist > placeholder); ++- l0 = g0 ^ 1; ++- target[0] = g0 * temp0 + l0 * temp1; ++- */ ++ } ++ ++ #endif /*LV_HAVE_SSE3*/ ++@@ -374,18 +289,18 @@ volk_32fc_index_max_32u_generic(uint32_t* target, lv_32fc_t* src0, uint32_t num_ ++ sq_dist = ++ lv_creal(src0[i]) * lv_creal(src0[i]) + lv_cimag(src0[i]) * lv_cimag(src0[i]); ++ ++- index = sq_dist > max ? i : index; ++- max = sq_dist > max ? sq_dist : max; +++ if (sq_dist > max) { +++ index = i; +++ max = sq_dist; +++ } ++ } ++ target[0] = index; ++ } ++ ++ #endif /*LV_HAVE_GENERIC*/ ++ ++- ++ #endif /*INCLUDED_volk_32fc_index_max_32u_a_H*/ ++ ++- ++ #ifndef INCLUDED_volk_32fc_index_max_32u_u_H ++ #define INCLUDED_volk_32fc_index_max_32u_u_H ++ ++@@ -405,24 +320,26 @@ volk_32fc_index_max_32u_u_avx2(uint32_t* target, lv_32fc_t* src0, uint32_t num_p ++ union bit256 holderf; ++ union bit256 holderi; ++ float sq_dist = 0.0; +++ float max = 0.0; +++ uint32_t index = 0; ++ ++ union bit256 xmm5, xmm4; ++ __m256 xmm1, xmm2, xmm3; ++- __m256i xmm8, xmm11, xmm12, xmmfive, xmmfour, xmm9, holder0, holder1, xmm10; +++ __m256i xmm8, xmm11, xmm12, xmm9, xmm10; ++ ++- xmm5.int_vec = xmmfive = _mm256_setzero_si256(); ++- xmm4.int_vec = xmmfour = _mm256_setzero_si256(); ++- holderf.int_vec = holder0 = _mm256_setzero_si256(); ++- holderi.int_vec = holder1 = _mm256_setzero_si256(); +++ xmm5.int_vec = _mm256_setzero_si256(); +++ xmm4.int_vec = _mm256_setzero_si256(); +++ holderf.int_vec = _mm256_setzero_si256(); +++ holderi.int_vec = _mm256_setzero_si256(); ++ ++ int bound = num_bytes >> 6; ++ int i = 0; ++ ++- xmm8 = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0); +++ xmm8 = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7); ++ xmm9 = _mm256_setzero_si256(); ++ xmm10 = _mm256_set1_epi32(8); ++ xmm3 = _mm256_setzero_ps(); ++- __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0); +++ __m256i idx = _mm256_setr_epi32(0, 1, 4, 5, 2, 3, 6, 7); ++ ++ for (; i < bound; ++i) { ++ xmm1 = _mm256_loadu_ps((float*)src0); ++@@ -449,75 +366,26 @@ volk_32fc_index_max_32u_u_avx2(uint32_t* target, lv_32fc_t* src0, uint32_t num_p ++ xmm8 = _mm256_add_epi32(xmm8, xmm10); ++ } ++ ++- xmm10 = _mm256_set1_epi32(4); ++- if (num_bytes >> 4 & 1) { ++- xmm1 = _mm256_loadu_ps((float*)src0); ++- ++- xmm1 = 
_mm256_mul_ps(xmm1, xmm1); ++- ++- src0 += 4; ++- ++- xmm1 = _mm256_hadd_ps(xmm1, xmm1); ++- ++- xmm3 = _mm256_max_ps(xmm1, xmm3); ++- ++- xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS); ++- xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ); ++- ++- xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec); ++- xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec); ++- ++- xmm9 = _mm256_add_epi32(xmm11, xmm12); +++ _mm256_storeu_ps((float*)&(holderf.f), xmm3); +++ _mm256_storeu_si256(&(holderi.int_vec), xmm9); ++ ++- xmm8 = _mm256_add_epi32(xmm8, xmm10); +++ for (i = 0; i < 8; i++) { +++ if (holderf.f[i] > max) { +++ index = holderi.i[i]; +++ max = holderf.f[i]; +++ } ++ } ++ ++- idx = _mm256_set_epi32(1, 0, 1, 0, 1, 0, 1, 0); ++- xmm10 = _mm256_set1_epi32(2); ++- if (num_bytes >> 4 & 1) { ++- xmm2 = _mm256_loadu_ps((float*)src0); ++- ++- xmm1 = _mm256_permutevar8x32_ps(bit256_p(&xmm8)->float_vec, idx); ++- xmm8 = bit256_p(&xmm1)->int_vec; ++- ++- xmm2 = _mm256_mul_ps(xmm2, xmm2); ++- ++- src0 += 2; ++- ++- xmm1 = _mm256_hadd_ps(xmm2, xmm2); ++- ++- xmm3 = _mm256_max_ps(xmm1, xmm3); ++- ++- xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS); ++- xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ); ++- ++- xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec); ++- xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec); ++- ++- xmm9 = _mm256_add_epi32(xmm11, xmm12); +++ for (i = bound * 8; i < num_points; i++, src0++) { +++ sq_dist = +++ lv_creal(src0[0]) * lv_creal(src0[0]) + lv_cimag(src0[0]) * lv_cimag(src0[0]); ++ ++- xmm8 = _mm256_add_epi32(xmm8, xmm10); +++ if (sq_dist > max) { +++ index = i; +++ max = sq_dist; +++ } ++ } ++- ++- _mm256_storeu_ps((float*)&(holderf.f), xmm3); ++- _mm256_storeu_si256(&(holderi.int_vec), xmm9); ++- ++- target[0] = holderi.i[0]; ++- sq_dist = holderf.f[0]; ++- target[0] = (holderf.f[1] > sq_dist) ? holderi.i[1] : target[0]; ++- sq_dist = (holderf.f[1] > sq_dist) ? holderf.f[1] : sq_dist; ++- target[0] = (holderf.f[2] > sq_dist) ? holderi.i[2] : target[0]; ++- sq_dist = (holderf.f[2] > sq_dist) ? holderf.f[2] : sq_dist; ++- target[0] = (holderf.f[3] > sq_dist) ? holderi.i[3] : target[0]; ++- sq_dist = (holderf.f[3] > sq_dist) ? holderf.f[3] : sq_dist; ++- target[0] = (holderf.f[4] > sq_dist) ? holderi.i[4] : target[0]; ++- sq_dist = (holderf.f[4] > sq_dist) ? holderf.f[4] : sq_dist; ++- target[0] = (holderf.f[5] > sq_dist) ? holderi.i[5] : target[0]; ++- sq_dist = (holderf.f[5] > sq_dist) ? holderf.f[5] : sq_dist; ++- target[0] = (holderf.f[6] > sq_dist) ? holderi.i[6] : target[0]; ++- sq_dist = (holderf.f[6] > sq_dist) ? holderf.f[6] : sq_dist; ++- target[0] = (holderf.f[7] > sq_dist) ? holderi.i[7] : target[0]; ++- sq_dist = (holderf.f[7] > sq_dist) ? holderf.f[7] : sq_dist; +++ target[0] = index; ++ } ++ ++ #endif /*LV_HAVE_AVX2*/ ++-- ++2.20.1 ++ diff --cc debian/patches/0007-cmake-Remove-the-ORC-from-the-VOLK-public-link-inter.patch index 0000000,0000000..30ab9cb new file mode 100644 --- /dev/null +++ b/debian/patches/0007-cmake-Remove-the-ORC-from-the-VOLK-public-link-inter.patch @@@ -1,0 -1,0 +1,53 @@@ ++From d214a7f62554341aaee7f66ec259131b5cbe84e3 Mon Sep 17 00:00:00 2001 ++From: Vasil Velichkov ++Date: Sun, 22 Mar 2020 22:22:13 +0200 ++Subject: [PATCH 7/7] cmake: Remove the ORC from the VOLK public link interface ++ ++The ORC is an internal dependency that is used to generate SIMD ++implementations of some the kernels and no ORC types or functions are ++exposed by the VOLK library so adding it to the public link interface is ++unnecessary when linking dynamically. 
++ ++Currently the ORC is added to the INTERFACE_LINK_LIBRARIES property of ++the Volk::volk target in VolkTargets.cmake and you need to have the ORC ++development files (liborc-*-dev) installed on your system in order to ++successfully link a program or library that uses VOLK. ++--- ++ lib/CMakeLists.txt | 7 ++++++- ++ 1 file changed, 6 insertions(+), 1 deletion(-) ++ ++diff --git a/lib/CMakeLists.txt b/lib/CMakeLists.txt ++index c5c784a..02ffa40 100644 ++--- a/lib/CMakeLists.txt +++++ b/lib/CMakeLists.txt ++@@ -465,7 +465,6 @@ if(ORC_FOUND) ++ #setup orc library usage ++ include_directories(${ORC_INCLUDE_DIRS}) ++ link_directories(${ORC_LIBRARY_DIRS}) ++- list(APPEND volk_libraries ${ORC_LIBRARIES}) ++ ++ #setup orc functions ++ file(GLOB orc_files ${PROJECT_SOURCE_DIR}/kernels/volk/asm/orc/*.orc) ++@@ -572,6 +571,9 @@ target_include_directories(volk ++ ) ++ ++ #Configure target properties +++if(ORC_FOUND) +++ target_link_libraries(volk PRIVATE ${ORC_LIBRARIES}) +++endif() ++ if(NOT MSVC) ++ target_link_libraries(volk PUBLIC m) ++ endif() ++@@ -597,6 +599,9 @@ install(TARGETS volk ++ if(ENABLE_STATIC_LIBS) ++ add_library(volk_static STATIC $) ++ target_link_libraries(volk_static PUBLIC ${volk_libraries} pthread) +++ if(ORC_FOUND) +++ target_link_libraries(volk_static PUBLIC ${ORC_LIBRARIES}) +++ endif() ++ if(NOT MSVC) ++ target_link_libraries(volk_static PUBLIC m) ++ endif() ++-- ++2.20.1 ++ diff --cc debian/patches/avoid-unnecessary-soversion-bump index 0000000,0000000..63865b6 new file mode 100644 --- /dev/null +++ b/debian/patches/avoid-unnecessary-soversion-bump @@@ -1,0 -1,0 +1,11 @@@ ++--- a/CMakeLists.txt +++++ b/CMakeLists.txt ++@@ -67,7 +67,7 @@ ++ ++ set(VERSION_INFO_MAJOR_VERSION 2) ++ set(VERSION_INFO_MINOR_VERSION 2) ++-set(VERSION_INFO_MAINT_VERSION 1) +++set(VERSION_INFO_MAINT_VERSION 0) ++ include(VolkVersion) #setup version info ++ ++ macro(set_version_str VAR) diff --cc debian/patches/make-acc-happy index 0000000,0000000..7c5c767 new file mode 100644 --- /dev/null +++ b/debian/patches/make-acc-happy @@@ -1,0 -1,0 +1,60 @@@ ++From 799245ea6e9e05cc0ed0fabe783fbbe1a5054fd4 Mon Sep 17 00:00:00 2001 ++From: "A. Maitland Bottoms" ++Date: Tue, 27 Mar 2018 22:02:59 -0400 ++Subject: [PATCH 2/6] make acc happy ++ ++The abi-compliance-checker grabs all the .h files it finds ++and tries to compile them all. Even though some are not ++appropriate for the architecture being run on. Being careful ++with preprocessor protections avoids problems. 
++--- ++ include/volk/volk_neon_intrinsics.h | 2 ++ ++ kernels/volk/volk_32f_8u_polarbutterflypuppet_32f.h | 1 + ++ kernels/volk/volk_8u_x2_encodeframepolar_8u.h | 3 --- ++ 3 files changed, 3 insertions(+), 3 deletions(-) ++ ++--- a/include/volk/volk_neon_intrinsics.h +++++ b/include/volk/volk_neon_intrinsics.h ++@@ -79,6 +79,7 @@ ++ ++ #ifndef INCLUDE_VOLK_VOLK_NEON_INTRINSICS_H_ ++ #define INCLUDE_VOLK_VOLK_NEON_INTRINSICS_H_ +++#ifdef LV_HAVE_NEON ++ #include ++ ++ ++@@ -278,4 +279,5 @@ ++ } ++ ++ +++#endif /*LV_HAVE_NEON*/ ++ #endif /* INCLUDE_VOLK_VOLK_NEON_INTRINSICS_H_ */ ++--- a/kernels/volk/volk_32f_8u_polarbutterflypuppet_32f.h +++++ b/kernels/volk/volk_32f_8u_polarbutterflypuppet_32f.h ++@@ -31,6 +31,7 @@ ++ #include ++ #include ++ #include +++#include ++ ++ ++ static inline void sanitize_bytes(unsigned char* u, const int elements) ++--- a/kernels/volk/volk_8u_x2_encodeframepolar_8u.h +++++ b/kernels/volk/volk_8u_x2_encodeframepolar_8u.h ++@@ -60,8 +60,6 @@ ++ } ++ } ++ ++-#ifdef LV_HAVE_GENERIC ++- ++ static inline void volk_8u_x2_encodeframepolar_8u_generic(unsigned char* frame, ++ unsigned char* temp, ++ unsigned int frame_size) ++@@ -81,7 +79,6 @@ ++ --stage; ++ } ++ } ++-#endif /* LV_HAVE_GENERIC */ ++ ++ #ifdef LV_HAVE_SSSE3 ++ #include diff --cc debian/patches/optional-static-apps index 0000000,0000000..399ee9b new file mode 100644 --- /dev/null +++ b/debian/patches/optional-static-apps @@@ -1,0 -1,0 +1,20 @@@ ++--- a/apps/CMakeLists.txt +++++ b/apps/CMakeLists.txt ++@@ -62,7 +62,7 @@ ++ target_link_libraries(volk_profile PRIVATE std::filesystem) ++ endif() ++ ++-if(ENABLE_STATIC_LIBS) +++if(ENABLE_STATIC_LIBS AND ENABLE_STATIC_APPS) ++ target_link_libraries(volk_profile PRIVATE volk_static) ++ set_target_properties(volk_profile PROPERTIES LINK_FLAGS "-static") ++ else() ++@@ -79,7 +79,7 @@ ++ add_executable(volk-config-info volk-config-info.cc ${CMAKE_CURRENT_SOURCE_DIR}/volk_option_helpers.cc ++ ) ++ ++-if(ENABLE_STATIC_LIBS) +++if(ENABLE_STATIC_LIBS AND ENABLE_STATIC_APPS) ++ target_link_libraries(volk-config-info volk_static) ++ set_target_properties(volk-config-info PROPERTIES LINK_FLAGS "-static") ++ else() diff --cc debian/patches/remove-external-HTML-resources index 0000000,0000000..493356f new file mode 100644 --- /dev/null +++ b/debian/patches/remove-external-HTML-resources @@@ -1,0 -1,0 +1,8 @@@ ++--- a/README.md +++++ b/README.md ++@@ -1,5 +1,3 @@ ++-[![Build Status](https://travis-ci.org/gnuradio/volk.svg?branch=master)](https://travis-ci.org/gnuradio/volk) [![Build status](https://ci.appveyor.com/api/projects/status/5o56mgw0do20jlh3/branch/master?svg=true)](https://ci.appveyor.com/project/gnuradio/volk/branch/master) ++- ++ ![VOLK Logo](/docs/volk_logo.png) ++ ++ # Welcome to VOLK! 
diff --cc debian/patches/series index 0000000,0000000..1cb8e3d new file mode 100644 --- /dev/null +++ b/debian/patches/series @@@ -1,0 -1,0 +1,11 @@@ ++0001-volk-accurate-exp-kernel.patch ++0002-exp-Rename-SSE4.1-to-SSE2-kernel.patch ++0003-clang-format-Apply-clang-format.patch ++0004-clang-format-Update-PR-with-GitHub-Action.patch ++0005-clang-format-Rebase-onto-current-master.patch ++0006-Fix-the-broken-index-max-kernels.patch ++0007-cmake-Remove-the-ORC-from-the-VOLK-public-link-inter.patch ++avoid-unnecessary-soversion-bump ++make-acc-happy ++optional-static-apps ++remove-external-HTML-resources diff --cc debian/rules index 0000000,0000000..f80dbba new file mode 100755 --- /dev/null +++ b/debian/rules @@@ -1,0 -1,0 +1,23 @@@ ++#!/usr/bin/make -f ++DEB_HOST_MULTIARCH ?= $(shell dpkg-architecture -qDEB_HOST_MULTIARCH) ++export DEB_HOST_MULTIARCH ++export DH_VERBOSE=1 ++ ++%: ++ dh $@ --with python3 ++ ++override_dh_auto_configure: ++ dh_auto_configure -- -DLIB_SUFFIX="/$(DEB_HOST_MULTIARCH)" \ ++ -DENABLE_STATIC_LIBS=On, -DPYTHON_EXECUTABLE=/usr/bin/python3 \ ++ -DCMAKE_BUILD_TYPE=RelWithDebInfo ++ ++override_dh_auto_build-indep: ++ cmake --build obj-* --target volk_doc ++ ++override_dh_auto_test: ++ - dh_auto_test -- CTEST_TEST_TIMEOUT=60 ++ ++override_dh_acc: ++ - abi-compliance-checker -l libvolk2-dev -v1 2.0.0-1 -dump debian/libvolk2-dev.acc -dump-path debian/libvolk2-dev/usr/lib/x86_64-linux-gnu/dh-acc/libvolk2-dev_2.0.0-1.abi.tar.gz ++ - cat logs/libvolk2-dev/2.0.0-1/log.txt ++ - dh_acc diff --cc debian/source/format index 0000000,0000000..163aaf8 new file mode 100644 --- /dev/null +++ b/debian/source/format @@@ -1,0 -1,0 +1,1 @@@ ++3.0 (quilt) diff --cc debian/source/include-binaries index 0000000,0000000..2a77b05 new file mode 100644 --- /dev/null +++ b/debian/source/include-binaries @@@ -1,0 -1,0 +1,1 @@@ ++debian/libvolk2-dev.abi.tar.gz.amd64 diff --cc debian/upstream/signing-key.asc index 0000000,0000000..f6d7f93 new file mode 100644 --- /dev/null +++ b/debian/upstream/signing-key.asc @@@ -1,0 -1,0 +1,52 @@@ ++-----BEGIN PGP PUBLIC KEY BLOCK----- ++Version: GnuPG v1 ++ ++mQINBFcTzE0BEACWkwa+pAwjBPwUvL8E9adB6sFlH/bw/3Dj2Vr/bXDkNrZDEQzc ++C3wmoX3AZo0GSWpjlmlOGOPy6u4wZxEPfilKs+eDNnuIZN3gmLoRTThgbbrnH9bw ++kIaUMiUn8VJ0pk5ULaygG6APxl4EOVrMfzgRnxmIbUfggiBLaW/xq2a/BaVrUAuA ++oHv1GTGJkwcK0RfYigJMfZl9iHVJVopffexBt1hOeGYxiyLXSDWjOhLLVzhlfgTE ++T9YdLGyjoXFmImsCvkAA2MA52e5YGUQIBrqmiXdHFit7sve0e5Dw0aLyuTnMR0MO ++a2eIHWU6TYYv5GTJPzjBbWM1pRCgtupNilg2+RfN0tOTp27RQnUtgcCo26uBU+jV ++pyvnidpDGnuUBL3WNLZlUiqmiZs8Hc9BGNw3rKB37sUOMXz6XessnhRspXC1Mot4 ++V3I1NoKwb0wjgqlkAYIGCCSuySosC5HH2OssopBUH6U5QXjFp11QbP2e+QkvKPKA ++S9V4ouSMrIDZ4krtu6QFDYsHa0zZ54yRl3O4UpfISlz3yngO2eKM019C5n51kd62 ++Ia00rtx8ypvUxMy67PTEFdCKLJ6Ua/hEGcpxGygFMRa0pjHSrC6e9LvPudK92jsq ++qO0TjhUytig5k9YPoEa2JGn/kqP+K1HGAdJPay/HmcNTZWh0hoamhuJ6NwARAQAB ++tCZOYXRoYW4gV2VzdCA8bmF0aGFuLndlc3RAZ251cmFkaW8ub3JnPokCPgQTAQIA ++KAUCVxPMTQIbAwUJA8JnAAYLCQgHAwIGFQgCCQoLBBYCAwECHgECF4AACgkQOFMj ++7mQCCR20CA//VJfDu8W8BI/44JkucC+XBVqwOcfg/rcSHflgi0mNNz7hyJ+idwcB ++JVFSbhSpXucl6baJ0nDe8gcMuGFLyF4uLwCByX3ExDAnFL3Mu/jIyOUX8TGudZU7 ++wTEhzOLPxmXfbo8lw3TETC1Xsl8g1gU/KBJnTl3WbdGZUlKW6fP0TR5BMdYskNHm ++CCqAvXWniZwjSX/jlpWremfTU9i9DUad8ufcdJue7uiZRNq4JLaWmSbtGNzDzJIq ++6csHc3GFcd0Q/LDEDcm1AG081yLEmRnbTstZo+xW27yaRyoe1Dpm9ehsl19dVaO7 ++9ek2CEarqHjtRfO1MJMSBGiaS1lvujukYKZQRGNDKemDJwuQCVkxBMEef7SNX8XG ++2OPTARVp0hlrhMVFUk3hScekrKobq81YyCfWxBxxjRWySdInFhuT29cxxRLUxb69 ++3MKLzFJRlq+oEbWJN8QGqILQ785TZA8MdnMsGywPk43x9spgYbwPhtJYb/Aes9B9 
++NFkZ6EzVtzV7ztITuGhefRxt3eEmdFYNDHooWNFQdifcUgLoBgKOkP+oHOc+9mx7 ++6CDN9ZJTHb87W3ISw7SLI4YcMPYipEN5g51ceInDc3kXFYQ+EqU691kOuGNtx3ov ++qqvPm9PBR00GSwhLQt7s127MFpYx9+in87+UMBFXyo/VstVBPQW2GLq5Ag0EVxPM ++TQEQAK+fh+ckP728ZVRn5mr8PtsG3gktyS6LlH7EjMsHnvQR16EVAjn5G915OQUY ++Bk6yk9l0VRX0NLautc41NwVlHI4FYBBjz6mEnDocvo+BT0g5KYTyjJPOxmEzgVZW ++3Zp/jPjK5Z9YZTCIalrk2iHVQCe8fFCnaXNGNQoku1jBPRUOOTI979LWPx4d7MI0 ++7Yy+8xp5ogCrcTxea9VrMeXqnXzvy2peiceZDlvNmcEUCz222i6t2k9rUwY0+ozg ++TbsorE42h4B+a49ylY4zOX9fTPfsUj59/z/ilrxZy2qP2lBIFC+wFphKF3Qkilxd ++dnVGTsb9oKCQjuMcvh7MR27RVGLjW1pVMWGMmXBkIDu0U88Hn91XKfm1ZmWgksoU ++MC7BZocvUxIKnV+WiKy9ooP/HSzgP7ggdG+16B3yDdicB0DiBFEKZEmIWCBt5NXR ++q853WwFSH7xcrEOTXnqtkRUX4+obdwQhtqTueSC4xqX0+YVixZUC6ewqueFmPn+l ++WItCV7XU67NXTJNRC3i4kIF+hpT5YWtx56NuNcvhN25bZr1frTChOuXcCBNrOU+b ++yo2wpXAcfq+YmnaP0ZFFh7wKRi4leEPL/+JyitQbvSQU4Lejwanzvv7Ug1j4qZo1 ++A6WSxXYUWJY5rhh8nWYtJJOn5Wj4Y3gWa1taUpYw1g2lf0o5ABEBAAGJAiUEGAEC ++AA8FAlcTzE0CGwwFCQPCZwAACgkQOFMj7mQCCR2uXRAAiBsOfqp+QuQqO3OPW8OZ ++I2+JNbaaFEC1TorUhGs5XiT4wKyn1wDni4mavO4kJ8nK4Zc1qBYWeMOClj6JySJL ++yf0aVTjLyn+4Q4jt/9Dmn15wbOWZvdSICipfcLWmPLYniizsJWA4Mqoefcztmyxk ++FrJZ+Vri6MH5PxVuZjHhOUVfXIsqRhqqrpRjVnjzGvNxLgP3aLHfQPim/jbxaeRK ++oVtDNDLA+1nwdpZ8Hehe5OVfUKWuz1DXrdM0eY7pTRcms8+7y//AXpRqygH7TLx5 ++mXavdmAzgYcamQGfu/K4Mq9Bkgr1BNasgkxnPu+J0Z4jO9HsRBCJWf2BLKXmYedD ++5t0LR8bJHUTV7lsIifo0Ev47qsk1QX41KSKPAMwSzmtTLA0wzPJrkUEeVgm075N7 ++btLneqw5EyDcz3pJ7aD3HceWh+HZOREnfYXyMLxWTND7SKx0k6nmM8xasYHP0/6y ++mR8picMjbPlyoETe6B6yKi5rDjOrDwrKqBjulcUHsRhjAAUUI6IHgj4v5gCfTPS7 ++WrV98icGSHYnuxV40NT8Nt0lWNrPJhIUm1nu3UkEInznxMii1h6ga6REE/TJsStD ++C46x7fsiH4HkK1FJ+owoLhsVQo0OE4nWh8lWIBhTpR4wxThwfVHKt/H12st3tHuI ++CLIM6szb01rYgHTn9/vDgJE= ++=MlbD ++-----END PGP PUBLIC KEY BLOCK----- diff --cc debian/volk-config-info.1 index 0000000,0000000..e8d6efd new file mode 100644 --- /dev/null +++ b/debian/volk-config-info.1 @@@ -1,0 -1,0 +1,45 @@@ ++.\" DO NOT MODIFY THIS FILE! It was generated by help2man 1.40.10. ++.TH VOLK-CONFIG-INFO "1" "July 2014" "volk-config-info 0.1" "User Commands" ++.SH NAME ++volk-config-info \- pkgconfig-like tool for Vector Optimized Library of Kernels 0.1 ++.SH DESCRIPTION ++.SS "Program options: volk-config-info [options]:" ++.TP ++\fB\-h\fR [ \fB\-\-help\fR ] ++print help message ++.TP ++\fB\-\-prefix\fR ++print VOLK installation prefix ++.TP ++\fB\-\-builddate\fR ++print VOLK build date (RFC2822 format) ++.TP ++\fB\-\-cc\fR ++print VOLK C compiler version ++.TP ++\fB\-\-cflags\fR ++print VOLK CFLAGS ++.TP ++\fB\-\-all\-machines\fR ++print VOLK machines built into library ++.TP ++\fB\-\-avail\-machines\fR ++print VOLK machines the current platform can use ++.TP ++\fB\-\-machine\fR ++print the VOLK machine that will be used ++.TP ++\fB\-v\fR [ \fB\-\-version\fR ] ++print VOLK version ++.SH "SEE ALSO" ++The full documentation for ++.B volk-config-info ++is maintained as a Texinfo manual. If the ++.B info ++and ++.B volk-config-info ++programs are properly installed at your site, the command ++.IP ++.B info volk-config-info ++.PP ++should give you access to the complete manual. diff --cc debian/volk_modtool.1 index 0000000,0000000..752e7f5 new file mode 100644 --- /dev/null +++ b/debian/volk_modtool.1 @@@ -1,0 -1,0 +1,112 @@@ ++.TH GNURADIO "1" "August 2013" "volk_modtool 3.7" "User Commands" ++.SH NAME ++volk_modtool \- tailor VOLK modules ++.SH DESCRIPTION ++The volk_modtool tool is installed along with VOLK as a way of helping ++to construct, add to, and interogate the VOLK library or companion ++libraries. 
++.P ++volk_modtool is installed into $prefix/bin. ++.P ++VOLK modtool enables creating standalone (out-of-tree) VOLK modules ++and provides a few tools for sharing VOLK kernels between VOLK ++modules. If you need to design or work with VOLK kernels away from ++the canonical VOLK library, this is the tool. If you need to tailor ++your own VOLK library for whatever reason, this is the tool. ++.P ++The canonical VOLK library installs a volk.h and a libvolk.so. Your ++own library will install volk_$name.h and libvolk_$name.so. Ya Gronk? ++Good. ++.P ++There isn't a substantial difference between the canonical VOLK ++module and any other VOLK module. They're all peers. Any module ++created via VOLK modtool will come complete with a default ++volk_modtool.cfg file associating the module with the base from which ++it came, its distinctive $name and its destination (or path). These ++values (created from user input if VOLK modtool runs without a ++user-supplied config file or a default config file) serve as default ++values for some VOLK modtool actions. It's more or less intended for ++the user to change directories to the top level of a created VOLK ++module and then run volk_modtool to take advantage of the values ++stored in the default volk_modtool.cfg file. ++.P ++Apart from creating new VOLK modules, VOLK modtool allows you to list ++the names of kernels in other modules, list the names of kernels in ++the current module, add kernels from another module into the current ++module, and remove kernels from the current module. When moving ++kernels between modules, VOLK modtool does its best to keep the qa ++and profiling code for those kernels intact. If the base has a test ++or a profiling call for some kernel, those calls will follow the ++kernel when VOLK modtool adds that kernel. If QA or profiling ++requires a puppet kernel, the puppet kernel will follow the original ++kernel when VOLK modtool adds that original kernel. VOLK modtool ++respects puppets. ++.P ++====================================================================== ++.P ++.SH Installing a new VOLK Library: ++.P ++Run the command "volk_modtool -i". This will ask you three questions: ++.P ++ name: // the name to give your VOLK library: volk_ ++ destination: // directory new source tree is built under -- must exists. ++ // It will create /volk_ ++ base: // the directory containing the original VOLK source code ++.P ++This will build a new skeleton directory in the destination provided ++with the name volk_. It will contain the necessary structure to ++build: ++.P ++ mkdir build ++ cd build ++ cmake -DCMAKE_INSTALL_PREFIX=/opt/volk ../ ++ make ++ sudo make install ++.P ++Right now, the library is empty and contains no kernels. Kernels can ++be added from another VOLK library using the '-a' option. If not ++specified, the kernel will be extracted from the base VOLK ++directory. Using the '-b' allows us to specify another VOLK library to ++use for this purpose. ++.P ++ volk_modtool -a -n 32fc_x2_conjugate_dot_prod_32fc ++.P ++This will put the code for the new kernel into ++/volk_/kernels/volk_/ ++.P ++Other kernels must be added by hand. See the following webpages for ++more information about creating VOLK kernels: ++ http://gnuradio.org/doc/doxygen/volk_guide.html ++ http://gnuradio.org/redmine/projects/gnuradio/wiki/Volk ++.P ++====================================================================== ++.P ++.SH OPTIONS ++.P ++Options for Adding and Removing Kernels: ++ -a, --add_kernel ++ Add kernel from existing VOLK module. 
Uses the base VOLK module ++ unless -b is used. Use -n to specify the kernel name. ++ Requires: -n. ++ Optional: -b ++.P ++ -A, --add_all_kernels ++ Add all kernels from existing VOLK module. Uses the base VOLK ++ module unless -b is used. ++ Optional: -b ++.P ++ -x, --remove_kernel ++ Remove kernel from module. ++ Required: -n. ++ Optional: -b ++.P ++Options for Listing Kernels: ++ -l, --list ++ Lists all kernels available in the base VOLK module. ++.P ++ -k, --kernels ++ Lists all kernels in this VOLK module. ++.P ++ -r, --remote-list ++ Lists all kernels in another VOLK module that is specified ++ using the -b option. diff --cc debian/volk_profile.1 index 0000000,0000000..405facb new file mode 100644 --- /dev/null +++ b/debian/volk_profile.1 @@@ -1,0 -1,0 +1,5 @@@ ++.TH UHD_FFT "1" "March 2012" "volk_profile 3.5" "User Commands" ++.SH NAME ++volk_profile \- Quality Assurance application for libvolk functions ++.SH DESCRIPTION ++Writes profile results to a file. diff --cc debian/watch index 0000000,0000000..1339ebb new file mode 100644 --- /dev/null +++ b/debian/watch @@@ -1,0 -1,0 +1,4 @@@ ++version=4 ++ opts="pgpsigurlmangle=s%$%.asc%,filenamemangle=s%(?:.*?)?volk-?(\d[\d.]*)\.tar\.xz%volk_$1.orig.tar.xz%" \ ++ https://github.com/gnuradio/volk/releases \ ++ (?:.*?/)?volk-?(\d[\d.]*)\.tar\.xz debian uupdate