From 8e7ca9d28b7b979f26ebf54e936b032c81f16914 Mon Sep 17 00:00:00 2001
From: Johannes Demel
Date: Sun, 23 Feb 2020 15:03:47 +0100
Subject: [PATCH] [PATCH 3/7] clang-format: Apply clang-format

This commit adds `.clang-format` from GNU Radio and applies clang-format.

Run:
`find . -regex '.*\.\(c\|cc\|cpp\|cxx\|h\|hh\)' -exec clang-format \
-style=file -i {} \;`
in `.`.

Gbp-Pq: Name 0003-clang-format-Apply-clang-format.patch
---
 .clang-format | 106 ++
 apps/volk-config-info.cc | 77 +-
 apps/volk_option_helpers.cc | 268 +--
 apps/volk_option_helpers.h | 84 +-
 apps/volk_profile.cc | 205 ++-
 apps/volk_profile.h | 20 +-
 cmake/msvc/config.h | 27 +-
 cmake/msvc/sys/time.h | 77 +-
 include/volk/saturation_arithmetic.h | 16 +-
 include/volk/volk_alloc.hh | 42 +-
 include/volk/volk_avx2_intrinsics.h | 114 +-
 include/volk/volk_avx_intrinsics.h | 193 +-
 include/volk/volk_common.h | 148 +-
 include/volk/volk_complex.h | 41 +-
 include/volk/volk_malloc.h | 12 +-
 include/volk/volk_neon_intrinsics.h | 115 +-
 include/volk/volk_prefs.h | 17 +-
 include/volk/volk_sse3_intrinsics.h | 79 +-
 include/volk/volk_sse_intrinsics.h | 53 +-
 kernels/volk/volk_16i_32fc_dot_prod_32fc.h | 1118 ++++++------
 kernels/volk/volk_16i_branch_4_state_8.h | 219 ++-
 kernels/volk/volk_16i_convert_8i.h | 301 ++--
 kernels/volk/volk_16i_max_star_16i.h | 158 +-
 .../volk/volk_16i_max_star_horizontal_16i.h | 214 +--
 .../volk/volk_16i_permute_and_scalar_add.h | 187 +-
 kernels/volk/volk_16i_s32f_convert_32f.h | 609 +++----
 kernels/volk/volk_16i_x4_quad_max_star_16i.h | 357 ++--
 kernels/volk/volk_16i_x5_add_quad_16i_x4.h | 336 ++--
 kernels/volk/volk_16ic_convert_32fc.h | 241 +--
 kernels/volk/volk_16ic_deinterleave_16i_x2.h | 431 +++--
 .../volk/volk_16ic_deinterleave_real_16i.h | 397 +++--
 kernels/volk/volk_16ic_deinterleave_real_8i.h | 469 +++--
 kernels/volk/volk_16ic_magnitude_16i.h | 506 +++---
 .../volk/volk_16ic_s32f_deinterleave_32f_x2.h | 418 ++---
 .../volk_16ic_s32f_deinterleave_real_32f.h | 372 ++--
 kernels/volk/volk_16ic_s32f_magnitude_32f.h | 381 ++--
 kernels/volk/volk_16ic_x2_dot_prod_16ic.h | 750 ++++----
 kernels/volk/volk_16ic_x2_multiply_16ic.h | 504 ++++--
 kernels/volk/volk_16u_byteswap.h | 378 ++--
 kernels/volk/volk_16u_byteswappuppet_16u.h | 44 +-
 kernels/volk/volk_32f_64f_add_64f.h | 270 +--
 kernels/volk/volk_32f_64f_multiply_64f.h | 154 +-
 kernels/volk/volk_32f_8u_polarbutterfly_32f.h | 478 ++---
 .../volk_32f_8u_polarbutterflypuppet_32f.h | 155 +-
 kernels/volk/volk_32f_accumulator_s32f.h | 287 +--
 kernels/volk/volk_32f_acos_32f.h | 700 ++++----
 kernels/volk/volk_32f_asin_32f.h | 647 +++----
 kernels/volk/volk_32f_atan_32f.h | 625 +++----
 kernels/volk/volk_32f_binary_slicer_32i.h | 259 +--
 kernels/volk/volk_32f_binary_slicer_8i.h | 706 ++++----
 kernels/volk/volk_32f_convert_64f.h | 214 ++-
 kernels/volk/volk_32f_cos_32f.h | 1159 ++++++------
 kernels/volk/volk_32f_expfast_32f.h | 347 ++--
 kernels/volk/volk_32f_index_max_16u.h | 370 ++--
 kernels/volk/volk_32f_index_max_32u.h | 770 ++++----
 kernels/volk/volk_32f_invsqrt_32f.h | 189 +-
 kernels/volk/volk_32f_log2_32f.h | 719 +++++---
 kernels/volk/volk_32f_null_32f.h | 16 +-
 .../volk/volk_32f_s32f_32f_fm_detect_32f.h | 457 ++---
 ...k_32f_s32f_calc_spectral_noise_floor_32f.h | 683 +++----
 kernels/volk/volk_32f_s32f_convert_16i.h | 815 ++++-----
 kernels/volk/volk_32f_s32f_convert_32i.h | 579 +++---
 kernels/volk/volk_32f_s32f_convert_8i.h | 642 +++----
 .../volk/volk_32f_s32f_mod_rangepuppet_32f.h | 63 +-
 kernels/volk/volk_32f_s32f_multiply_32f.h | 271 +--
 kernels/volk/volk_32f_s32f_normalize.h | 150 +-
 kernels/volk/volk_32f_s32f_power_32f.h | 166 +-
 .../volk/volk_32f_s32f_s32f_mod_range_32f.h | 718 ++++----
 kernels/volk/volk_32f_s32f_stddev_32f.h | 449 ++---
 kernels/volk/volk_32f_sin_32f.h | 945 +++++-----
 kernels/volk/volk_32f_sqrt_32f.h | 153 +-
 .../volk/volk_32f_stddev_and_mean_32f_x2.h | 583 +++---
 kernels/volk/volk_32f_tan_32f.h | 1023 ++++++-----
 kernels/volk/volk_32f_tanh_32f.h | 631 ++++---
 kernels/volk/volk_32f_x2_add_32f.h | 412 +++--
 kernels/volk/volk_32f_x2_divide_32f.h | 364 ++--
 kernels/volk/volk_32f_x2_dot_prod_16i.h | 1092 ++++++------
 kernels/volk/volk_32f_x2_dot_prod_32f.h | 1186 +++++++------
 .../volk/volk_32f_x2_fm_detectpuppet_32f.h | 40 +-
 kernels/volk/volk_32f_x2_interleave_32fc.h | 292 +--
 kernels/volk/volk_32f_x2_max_32f.h | 345 ++--
 kernels/volk/volk_32f_x2_min_32f.h | 347 ++--
 kernels/volk/volk_32f_x2_multiply_32f.h | 375 ++--
 kernels/volk/volk_32f_x2_pow_32f.h | 1175 ++++++------
 .../volk/volk_32f_x2_s32f_interleave_16ic.h | 324 ++--
 kernels/volk/volk_32f_x2_subtract_32f.h | 319 ++--
 kernels/volk/volk_32f_x3_sum_of_poly_32f.h | 1026 +++++------
 kernels/volk/volk_32fc_32f_add_32fc.h | 281 +--
 kernels/volk/volk_32fc_32f_dot_prod_32fc.h | 1205 +++++++------
 kernels/volk/volk_32fc_32f_multiply_32fc.h | 226 +--
 kernels/volk/volk_32fc_conjugate_32fc.h | 233 +--
 kernels/volk/volk_32fc_convert_16ic.h | 439 ++---
 kernels/volk/volk_32fc_deinterleave_32f_x2.h | 297 ++--
 kernels/volk/volk_32fc_deinterleave_64f_x2.h | 439 ++---
 .../volk/volk_32fc_deinterleave_imag_32f.h | 210 +--
 .../volk/volk_32fc_deinterleave_real_32f.h | 214 +--
 .../volk/volk_32fc_deinterleave_real_64f.h | 262 +--
 kernels/volk/volk_32fc_index_max_16u.h | 639 +++---
 kernels/volk/volk_32fc_index_max_32u.h | 630 +++---
 kernels/volk/volk_32fc_magnitude_32f.h | 556 +++---
 .../volk/volk_32fc_magnitude_squared_32f.h | 443 ++---
 kernels/volk/volk_32fc_s32f_atan2_32f.h | 208 +--
 .../volk_32fc_s32f_deinterleave_real_16i.h | 226 +--
 kernels/volk/volk_32fc_s32f_magnitude_16i.h | 297 ++--
 kernels/volk/volk_32fc_s32f_power_32fc.h | 121 +-
 .../volk/volk_32fc_s32f_power_spectrum_32f.h | 176 +-
 ..._32fc_s32f_x2_power_spectral_density_32f.h | 297 ++--
 kernels/volk/volk_32fc_s32fc_multiply_32fc.h | 250 +--
 .../volk/volk_32fc_s32fc_rotatorpuppet_32fc.h | 118 +-
 .../volk/volk_32fc_s32fc_x2_rotator_32fc.h | 260 +--
 kernels/volk/volk_32fc_x2_add_32fc.h | 274 +--
 .../volk_32fc_x2_conjugate_dot_prod_32fc.h | 1017 ++++++-----
 kernels/volk/volk_32fc_x2_divide_32fc.h | 372 ++--
 kernels/volk/volk_32fc_x2_dot_prod_32fc.h | 1334 +++++++-------
 kernels/volk/volk_32fc_x2_multiply_32fc.h | 575 +++---
 .../volk_32fc_x2_multiply_conjugate_32fc.h | 347 ++--
 ...32fc_x2_s32f_square_dist_scalar_mult_32f.h | 657 +++----
 ...2fc_x2_s32fc_multiply_conjugate_add_32fc.h | 98 +-
 kernels/volk/volk_32fc_x2_square_dist_32f.h | 426 ++---
 kernels/volk/volk_32i_s32f_convert_32f.h | 347 ++--
 kernels/volk/volk_32i_x2_and_32i.h | 320 ++--
 kernels/volk/volk_32i_x2_or_32i.h | 321 ++--
 kernels/volk/volk_32u_byteswap.h | 433 ++---
 kernels/volk/volk_32u_byteswappuppet_32u.h | 44 +-
 kernels/volk/volk_32u_popcnt.h | 26 +-
 kernels/volk/volk_32u_popcntpuppet_32u.h | 18 +-
 kernels/volk/volk_32u_reverse_32u.h | 598 ++++---
 kernels/volk/volk_64f_convert_32f.h | 324 ++--
 kernels/volk/volk_64f_x2_add_64f.h | 207 +--
 kernels/volk/volk_64f_x2_max_64f.h | 276 +--
 kernels/volk/volk_64f_x2_min_64f.h | 275 +--
 kernels/volk/volk_64f_x2_multiply_64f.h | 207 +--
 kernels/volk/volk_64u_byteswap.h | 599 ++++---
 kernels/volk/volk_64u_byteswappuppet_64u.h | 56 +-
 kernels/volk/volk_64u_popcnt.h | 79 +-
 kernels/volk/volk_64u_popcntpuppet_64u.h | 29 +-
 kernels/volk/volk_8i_convert_16i.h | 315 ++--
 kernels/volk/volk_8i_s32f_convert_32f.h | 528 +++---
 kernels/volk/volk_8ic_deinterleave_16i_x2.h | 493 ++++--
 kernels/volk/volk_8ic_deinterleave_real_16i.h | 346 ++--
 kernels/volk/volk_8ic_deinterleave_real_8i.h | 482 +++--
 .../volk/volk_8ic_s32f_deinterleave_32f_x2.h | 571 +++---
 .../volk_8ic_s32f_deinterleave_real_32f.h | 395 +++--
 .../volk_8ic_x2_multiply_conjugate_16ic.h | 413 +++--
 ...volk_8ic_x2_s32f_multiply_conjugate_32fc.h | 496 +++---
 kernels/volk/volk_8u_conv_k7_r2puppet_8u.h | 494 +++---
 kernels/volk/volk_8u_x2_encodeframepolar_8u.h | 1569 +++++++++++------
 kernels/volk/volk_8u_x3_encodepolar_8u_x2.h | 110 +-
 .../volk/volk_8u_x3_encodepolarpuppet_8u.h | 137 +-
 kernels/volk/volk_8u_x4_conv_k7_r2_8u.h | 1067 +++++------
 lib/kernel_tests.h | 257 +--
 lib/qa_utils.cc | 751 +++++---
 lib/qa_utils.h | 288 +--
 lib/testqa.cc | 96 +-
 lib/volk_malloc.c | 55 +-
 lib/volk_prefs.c | 74 +-
 lib/volk_rank_archs.c | 73 +-
 lib/volk_rank_archs.h | 22 +-
 158 files changed, 32509 insertions(+), 27583 deletions(-)
 create mode 100644 .clang-format

diff --git a/.clang-format b/.clang-format
new file mode 100644
index 0000000..285b68d
--- /dev/null
+++ b/.clang-format
@@ -0,0 +1,106 @@
+---
+Language: Cpp
+# BasedOnStyle: LLVM
+AccessModifierOffset: -4
+AlignAfterOpenBracket: Align
+AlignConsecutiveAssignments: false
+AlignConsecutiveDeclarations: false
+AlignEscapedNewlinesLeft: true
+AlignOperands: true
+AlignTrailingComments: true
+AllowAllParametersOfDeclarationOnNextLine: true
+AllowShortBlocksOnASingleLine: false
+AllowShortCaseLabelsOnASingleLine: false
+AllowShortFunctionsOnASingleLine: All
+AllowShortIfStatementsOnASingleLine: false
+AllowShortLoopsOnASingleLine: false
+AlwaysBreakAfterDefinitionReturnType: None
+AlwaysBreakAfterReturnType: None
+AlwaysBreakBeforeMultilineStrings: false
+AlwaysBreakTemplateDeclarations: true
+BinPackArguments: false
+BinPackParameters: false
+BreakBeforeBraces: Custom
+BraceWrapping:
+  AfterClass: true
+  AfterControlStatement: false
+  AfterEnum: false
+  AfterFunction: true
+  AfterNamespace: false
+  AfterObjCDeclaration: false
+  AfterStruct: false
+  AfterUnion: false
+  BeforeCatch: false
+  BeforeElse: false
+  IndentBraces: false
+BreakBeforeBinaryOperators: None
+BreakBeforeTernaryOperators: true
+BreakConstructorInitializersBeforeComma: false
+BreakAfterJavaFieldAnnotations: false
+BreakStringLiterals: true
+ColumnLimit: 90
+CommentPragmas: '^ IWYU pragma:'
+ConstructorInitializerAllOnOneLineOrOnePerLine: true
+ConstructorInitializerIndentWidth: 4
+ContinuationIndentWidth: 4
+Cpp11BracedListStyle: false
+DerivePointerAlignment: false
+DisableFormat: false
+ExperimentalAutoDetectBinPacking: false
+ForEachMacros:
+  - foreach
+  - Q_FOREACH
+  - BOOST_FOREACH
+IncludeCategories:
+  - Regex: '^"(gnuradio)/'
+    Priority: 1
+  - Regex: '^<(gnuradio)/'
+    Priority: 2
+  - Regex: '^<(boost)/'
+    Priority: 98
+  - Regex: '^<[a-z]*>$'
+    Priority: 99
+  - Regex: '^".*"$'
+    Priority: 0
+  - Regex: '.*'
+    Priority: 10
+
+IncludeIsMainRegex: '(Test)?$'
+IndentCaseLabels: false
+IndentWidth: 4
+IndentWrappedFunctionNames: false
+JavaScriptQuotes: Leave
+JavaScriptWrapImports: true
+KeepEmptyLinesAtTheStartOfBlocks: true
+MacroBlockBegin: ''
+MacroBlockEnd: ''
+MaxEmptyLinesToKeep: 2
+NamespaceIndentation: None
+ObjCBlockIndentWidth: 2
+ObjCSpaceAfterProperty: false
+ObjCSpaceBeforeProtocolList: true
+PenaltyBreakBeforeFirstCallParameter: 19
+PenaltyBreakComment: 300 +PenaltyBreakFirstLessLess: 120 +PenaltyBreakString: 1000 +PenaltyExcessCharacter: 1000000 +PenaltyReturnTypeOnItsOwnLine: 60 +PointerAlignment: Left +ReflowComments: true +SortIncludes: true +SpaceAfterCStyleCast: false +SpaceAfterTemplateKeyword: true +SpaceBeforeAssignmentOperators: true +SpaceBeforeParens: ControlStatements +SpaceInEmptyParentheses: false +SpacesBeforeTrailingComments: 1 +SpacesInAngles: false +SpacesInContainerLiterals: true +SpacesInCStyleCastParentheses: false +SpacesInParentheses: false +SpacesInSquareBrackets: false +Standard: Cpp11 +TabWidth: 8 +UseTab: Never + + diff --git a/apps/volk-config-info.cc b/apps/volk-config-info.cc index 4eedcb7..2521993 100644 --- a/apps/volk-config-info.cc +++ b/apps/volk-config-info.cc @@ -24,52 +24,63 @@ #include #endif -#include // for volk_available_machines, volk_c_com... -#include // for operator<<, endl, cout, ostream -#include // for string +#include // for volk_available_machines, volk_c_com... +#include // for operator<<, endl, cout, ostream +#include // for string -#include "volk/volk.h" // for volk_get_alignment, volk_get_machine -#include "volk_option_helpers.h" // for option_list, option_t +#include "volk/volk.h" // for volk_get_alignment, volk_get_machine +#include "volk_option_helpers.h" // for option_list, option_t void print_alignment() { - std::cout << "Alignment in bytes: " << volk_get_alignment() << std::endl; + std::cout << "Alignment in bytes: " << volk_get_alignment() << std::endl; } void print_malloc() { - // You don't want to change the volk_malloc code, so just copy the if/else - // structure from there and give an explanation for the implementations - std::cout << "Used malloc implementation: "; - #if HAVE_POSIX_MEMALIGN - std::cout << "posix_memalign" << std::endl; - #elif defined(_MSC_VER) - std::cout << "_aligned_malloc" << std::endl; - #else - std::cout << "C11 aligned_alloc" << std::endl; - #endif + // You don't want to change the volk_malloc code, so just copy the if/else + // structure from there and give an explanation for the implementations + std::cout << "Used malloc implementation: "; +#if HAVE_POSIX_MEMALIGN + std::cout << "posix_memalign" << std::endl; +#elif defined(_MSC_VER) + std::cout << "_aligned_malloc" << std::endl; +#else + std::cout << "C11 aligned_alloc" << std::endl; +#endif } -int -main(int argc, char **argv) +int main(int argc, char** argv) { - option_list our_options("volk-config-info"); - our_options.add(option_t("prefix", "", "print the VOLK installation prefix", volk_prefix())); - our_options.add(option_t("cc", "", "print the VOLK C compiler version", volk_c_compiler())); - our_options.add(option_t("cflags", "", "print the VOLK CFLAGS", volk_compiler_flags())); - our_options.add(option_t("all-machines", "", "print VOLK machines built", volk_available_machines())); - our_options.add(option_t("avail-machines", "", "print VOLK machines on the current " - "platform", volk_list_machines)); - our_options.add(option_t("machine", "", "print the current VOLK machine that will be used", - volk_get_machine())); - our_options.add(option_t("alignment", "", "print the memory alignment", print_alignment)); - our_options.add(option_t("malloc", "", "print the malloc implementation used in volk_malloc", - print_malloc)); - our_options.add(option_t("version", "v", "print the VOLK version", volk_version())); + option_list our_options("volk-config-info"); + our_options.add( + option_t("prefix", "", "print the VOLK installation prefix", volk_prefix())); + 
our_options.add( + option_t("cc", "", "print the VOLK C compiler version", volk_c_compiler())); + our_options.add( + option_t("cflags", "", "print the VOLK CFLAGS", volk_compiler_flags())); + our_options.add(option_t( + "all-machines", "", "print VOLK machines built", volk_available_machines())); + our_options.add(option_t("avail-machines", + "", + "print VOLK machines on the current " + "platform", + volk_list_machines)); + our_options.add(option_t("machine", + "", + "print the current VOLK machine that will be used", + volk_get_machine())); + our_options.add( + option_t("alignment", "", "print the memory alignment", print_alignment)); + our_options.add(option_t("malloc", + "", + "print the malloc implementation used in volk_malloc", + print_malloc)); + our_options.add(option_t("version", "v", "print the VOLK version", volk_version())); - our_options.parse(argc, argv); + our_options.parse(argc, argv); - return 0; + return 0; } diff --git a/apps/volk_option_helpers.cc b/apps/volk_option_helpers.cc index 4299709..73d51da 100644 --- a/apps/volk_option_helpers.cc +++ b/apps/volk_option_helpers.cc @@ -4,66 +4,97 @@ #include "volk_option_helpers.h" -#include // for exception -#include // for operator<<, endl, basic_ostream, cout, ostream -#include // for pair -#include // IWYU pragma: keep -#include // IWYU pragma: keep -#include // IWYU pragma: keep +#include // IWYU pragma: keep +#include // IWYU pragma: keep +#include // IWYU pragma: keep +#include // for exception +#include // for operator<<, endl, basic_ostream, cout, ostream +#include // for pair /* * Option type */ -option_t::option_t(std::string longform, std::string shortform, std::string msg, void (*callback)()) - : longform("--" + longform), - shortform("-" + shortform), - msg(msg), - callback(callback) { option_type = VOID_CALLBACK; } - -option_t::option_t(std::string longform, std::string shortform, std::string msg, void (*callback)(int)) - : longform("--" + longform), - shortform("-" + shortform), - msg(msg), - callback((void (*)()) callback) { option_type = INT_CALLBACK; } - -option_t::option_t(std::string longform, std::string shortform, std::string msg, void (*callback)(float)) - : longform("--" + longform), - shortform("-" + shortform), - msg(msg), - callback((void (*)()) callback) { option_type = FLOAT_CALLBACK; } - -option_t::option_t(std::string longform, std::string shortform, std::string msg, void (*callback)(bool)) - : longform("--" + longform), - shortform("-" + shortform), - msg(msg), - callback((void (*)()) callback) { option_type = BOOL_CALLBACK; } - -option_t::option_t(std::string longform, std::string shortform, std::string msg, void (*callback)(std::string)) - : longform("--" + longform), - shortform("-" + shortform), - msg(msg), - callback((void (*)()) callback) { option_type = STRING_CALLBACK; } - -option_t::option_t(std::string longform, std::string shortform, std::string msg, std::string printval) - : longform("--" + longform), - shortform("-" + shortform), - msg(msg), - printval(printval) { option_type = STRING; } +option_t::option_t(std::string longform, + std::string shortform, + std::string msg, + void (*callback)()) + : longform("--" + longform), shortform("-" + shortform), msg(msg), callback(callback) +{ + option_type = VOID_CALLBACK; +} + +option_t::option_t(std::string longform, + std::string shortform, + std::string msg, + void (*callback)(int)) + : longform("--" + longform), + shortform("-" + shortform), + msg(msg), + callback((void (*)())callback) +{ + option_type = INT_CALLBACK; +} + 
+option_t::option_t(std::string longform, + std::string shortform, + std::string msg, + void (*callback)(float)) + : longform("--" + longform), + shortform("-" + shortform), + msg(msg), + callback((void (*)())callback) +{ + option_type = FLOAT_CALLBACK; +} + +option_t::option_t(std::string longform, + std::string shortform, + std::string msg, + void (*callback)(bool)) + : longform("--" + longform), + shortform("-" + shortform), + msg(msg), + callback((void (*)())callback) +{ + option_type = BOOL_CALLBACK; +} + +option_t::option_t(std::string longform, + std::string shortform, + std::string msg, + void (*callback)(std::string)) + : longform("--" + longform), + shortform("-" + shortform), + msg(msg), + callback((void (*)())callback) +{ + option_type = STRING_CALLBACK; +} + +option_t::option_t(std::string longform, + std::string shortform, + std::string msg, + std::string printval) + : longform("--" + longform), shortform("-" + shortform), msg(msg), printval(printval) +{ + option_type = STRING; +} /* * Option List */ -option_list::option_list(std::string program_name) : - program_name(program_name) { +option_list::option_list(std::string program_name) : program_name(program_name) +{ internal_list = std::vector(); } void option_list::add(option_t opt) { internal_list.push_back(opt); } -void option_list::parse(int argc, char **argv) { +void option_list::parse(int argc, char** argv) +{ for (int arg_number = 0; arg_number < argc; ++arg_number) { for (std::vector::iterator this_option = internal_list.begin(); this_option != internal_list.end(); @@ -73,74 +104,83 @@ void option_list::parse(int argc, char **argv) { this_option->shortform == std::string(argv[arg_number])) { if (present_options.count(this_option->longform) == 0) { - present_options.insert(std::pair(this_option->longform, 1)); + present_options.insert( + std::pair(this_option->longform, 1)); } else { present_options[this_option->longform] += 1; } switch (this_option->option_type) { - case VOID_CALLBACK: - this_option->callback(); - break; - case INT_CALLBACK: - try { - int_val = atoi(argv[++arg_number]); - ((void (*)(int)) this_option->callback)(int_val); - } catch (std::exception &exc) { - std::cout << "An int option can only receive a number" << std::endl; - throw std::exception(); - }; - break; - case FLOAT_CALLBACK: - try { - double double_val = atof(argv[++arg_number]); - ((void (*)(float)) this_option->callback)(double_val); - } catch (std::exception &exc) { - std::cout << "A float option can only receive a number" << std::endl; - throw std::exception(); - }; - break; - case BOOL_CALLBACK: - try { - if (arg_number == (argc - 1)) { // this is the last arg + case VOID_CALLBACK: + this_option->callback(); + break; + case INT_CALLBACK: + try { + int_val = atoi(argv[++arg_number]); + ((void (*)(int))this_option->callback)(int_val); + } catch (std::exception& exc) { + std::cout << "An int option can only receive a number" + << std::endl; + throw std::exception(); + }; + break; + case FLOAT_CALLBACK: + try { + double double_val = atof(argv[++arg_number]); + ((void (*)(float))this_option->callback)(double_val); + } catch (std::exception& exc) { + std::cout << "A float option can only receive a number" + << std::endl; + throw std::exception(); + }; + break; + case BOOL_CALLBACK: + try { + if (arg_number == (argc - 1)) { // this is the last arg + int_val = 1; + } else { // sneak a look at the next arg since it's present + char* next_arg = argv[arg_number + 1]; + if ((strncmp(next_arg, "-", 1) == 0) || + (strncmp(next_arg, "--", 2) == 0)) { 
+ // the next arg is actually an arg, the bool is just + // present, set to true + int_val = 1; + } else if (strncmp(next_arg, "true", 4) == 0) { int_val = 1; - } else { // sneak a look at the next arg since it's present - char *next_arg = argv[arg_number + 1]; - if ((strncmp(next_arg, "-", 1) == 0) || (strncmp(next_arg, "--", 2) == 0)) { - // the next arg is actually an arg, the bool is just present, set to true - int_val = 1; - } else if (strncmp(next_arg, "true", 4) == 0) { - int_val = 1; - } else if (strncmp(next_arg, "false", 5) == 0) { - int_val = 0; - } else { - // we got a number or a string. - // convert it to a number and depend on the catch to report an error condition - int_val = (bool) atoi(argv[++arg_number]); - } + } else if (strncmp(next_arg, "false", 5) == 0) { + int_val = 0; + } else { + // we got a number or a string. + // convert it to a number and depend on the catch to + // report an error condition + int_val = (bool)atoi(argv[++arg_number]); } - } catch (std::exception &e) { - int_val = INT_MIN; - }; - if (int_val == INT_MIN) { - std::cout << "option: '" << argv[arg_number - 1] << "' -> received an unknown value. Boolean " - "options should receive one of '0', '1', 'true', 'false'." << std::endl; - throw std::exception(); - } else if (int_val) { - ((void (*)(bool)) this_option->callback)(int_val); } - break; - case STRING_CALLBACK: - try { - ((void (*)(std::string)) this_option->callback)(argv[++arg_number]); - } catch (std::exception &exc) { - throw std::exception(); - }; - case STRING: - std::cout << this_option->printval << std::endl; - break; + } catch (std::exception& e) { + int_val = INT_MIN; + }; + if (int_val == INT_MIN) { + std::cout + << "option: '" << argv[arg_number - 1] + << "' -> received an unknown value. Boolean " + "options should receive one of '0', '1', 'true', 'false'." 
+ << std::endl; + throw std::exception(); + } else if (int_val) { + ((void (*)(bool))this_option->callback)(int_val); + } + break; + case STRING_CALLBACK: + try { + ((void (*)(std::string))this_option->callback)( + argv[++arg_number]); + } catch (std::exception& exc) { + throw std::exception(); + }; + case STRING: + std::cout << this_option->printval << std::endl; + break; } } - } if (std::string("--help") == std::string(argv[arg_number]) || std::string("-h") == std::string(argv[arg_number])) { @@ -150,7 +190,8 @@ void option_list::parse(int argc, char **argv) { } } -bool option_list::present(std::string option_name) { +bool option_list::present(std::string option_name) +{ if (present_options.count("--" + option_name)) { return true; } else { @@ -158,7 +199,8 @@ bool option_list::present(std::string option_name) { } } -void option_list::help() { +void option_list::help() +{ std::cout << program_name << std::endl; std::cout << " -h [ --help ] \t\tdisplay this help message" << std::endl; for (std::vector::iterator this_option = internal_list.begin(); @@ -172,14 +214,14 @@ void option_list::help() { } switch (help_line.size() / 8) { - case 0: - help_line += "\t"; - case 1: - help_line += "\t"; - case 2: - help_line += "\t"; - case 3: - help_line += "\t"; + case 0: + help_line += "\t"; + case 1: + help_line += "\t"; + case 2: + help_line += "\t"; + case 3: + help_line += "\t"; } help_line += this_option->msg; std::cout << help_line << std::endl; diff --git a/apps/volk_option_helpers.h b/apps/volk_option_helpers.h index 8a71547..0756caf 100644 --- a/apps/volk_option_helpers.h +++ b/apps/volk_option_helpers.h @@ -5,56 +5,74 @@ #ifndef VOLK_VOLK_OPTION_HELPERS_H #define VOLK_VOLK_OPTION_HELPERS_H -#include -#include #include -#include +#include #include +#include +#include -typedef enum -{ - VOID_CALLBACK, +typedef enum { + VOID_CALLBACK, INT_CALLBACK, BOOL_CALLBACK, STRING_CALLBACK, FLOAT_CALLBACK, - STRING, + STRING, } VOLK_OPTYPE; -class option_t { - public: - option_t(std::string longform, std::string shortform, std::string msg, void (*callback)()); - option_t(std::string longform, std::string shortform, std::string msg, void (*callback)(int)); - option_t(std::string longform, std::string shortform, std::string msg, void (*callback)(float)); - option_t(std::string longform, std::string shortform, std::string msg, void (*callback)(bool)); - option_t(std::string longform, std::string shortform, std::string msg, void (*callback)(std::string)); - option_t(std::string longform, std::string shortform, std::string msg, std::string printval); - - std::string longform; - std::string shortform; - std::string msg; - VOLK_OPTYPE option_type; - std::string printval; - void (*callback)(); +class option_t +{ +public: + option_t(std::string longform, + std::string shortform, + std::string msg, + void (*callback)()); + option_t(std::string longform, + std::string shortform, + std::string msg, + void (*callback)(int)); + option_t(std::string longform, + std::string shortform, + std::string msg, + void (*callback)(float)); + option_t(std::string longform, + std::string shortform, + std::string msg, + void (*callback)(bool)); + option_t(std::string longform, + std::string shortform, + std::string msg, + void (*callback)(std::string)); + option_t(std::string longform, + std::string shortform, + std::string msg, + std::string printval); + std::string longform; + std::string shortform; + std::string msg; + VOLK_OPTYPE option_type; + std::string printval; + void (*callback)(); }; class option_list { - public: - 
option_list(std::string program_name); - bool present(std::string option_name); +public: + option_list(std::string program_name); + bool present(std::string option_name); + + void add(option_t opt); - void add(option_t opt); + void parse(int argc, char** argv); - void parse(int argc, char **argv); + void help(); - void help(); - private: - std::string program_name; - std::vector internal_list; - std::map present_options; +private: + std::string program_name; + std::vector internal_list; + std::map present_options; }; -#endif //VOLK_VOLK_OPTION_HELPERS_H +#endif // VOLK_VOLK_OPTION_HELPERS_H diff --git a/apps/volk_profile.cc b/apps/volk_profile.cc index 4ef5aeb..3c2e324 100644 --- a/apps/volk_profile.cc +++ b/apps/volk_profile.cc @@ -27,23 +27,23 @@ #include #endif #else -#include // for create_directories, exists -#include // for path, operator<< -#include // for filesystem +#include // for create_directories, exists +#include // for path, operator<< +#include // for filesystem #endif -#include // for size_t -#include // for stat -#include // for volk_get_config_path -#include // for operator<<, basic_ostream -#include // IWYU pragma: keep -#include // for map, map<>::iterator -#include // for pair -#include // for vector, vector<>::const_... - -#include "kernel_tests.h" // for init_test_list -#include "qa_utils.h" // for volk_test_results_t, vol... -#include "volk/volk_complex.h" // for lv_32fc_t -#include "volk_option_helpers.h" // for option_list, option_t +#include // for size_t +#include // for stat +#include // for volk_get_config_path +#include // IWYU pragma: keep +#include // for operator<<, basic_ostream +#include // for map, map<>::iterator +#include // for pair +#include // for vector, vector<>::const_... + +#include "kernel_tests.h" // for init_test_list +#include "qa_utils.h" // for volk_test_results_t, vol... +#include "volk/volk_complex.h" // for lv_32fc_t +#include "volk_option_helpers.h" // for option_list, option_t #include "volk_profile.h" #if HAS_STD_FILESYSTEM @@ -72,45 +72,61 @@ void set_json(std::string val) { json_filename = val; } std::string volk_config_path(""); void set_volk_config(std::string val) { volk_config_path = val; } -int main(int argc, char *argv[]) { +int main(int argc, char* argv[]) +{ option_list profile_options("volk_profile"); - profile_options.add(option_t("benchmark", "b", "Run all kernels (benchmark mode)", set_benchmark)); - profile_options.add(option_t("tol", "t", "Set the default tolerance for all tests", set_tolerance)); - profile_options.add(option_t("vlen", "v", "Set the default vector length for tests", set_vlen)); - profile_options.add((option_t("iter", "i", "Set the default number of test iterations per kernel", set_iter))); - profile_options.add((option_t("tests-substr", "R", "Run tests matching substring", set_substr))); - profile_options.add((option_t("update", "u", "Run only kernels missing from config", set_update))); - profile_options.add((option_t("dry-run", "n", "Dry run. 
Respect other options, but don't write to file", set_dryrun))); - profile_options.add((option_t("json", "j", "Write results to JSON file named as argument value", set_json))); - profile_options.add((option_t("path", "p", "Specify the volk_config path", set_volk_config))); + profile_options.add( + option_t("benchmark", "b", "Run all kernels (benchmark mode)", set_benchmark)); + profile_options.add( + option_t("tol", "t", "Set the default tolerance for all tests", set_tolerance)); + profile_options.add( + option_t("vlen", "v", "Set the default vector length for tests", set_vlen)); + profile_options.add((option_t( + "iter", "i", "Set the default number of test iterations per kernel", set_iter))); + profile_options.add( + (option_t("tests-substr", "R", "Run tests matching substring", set_substr))); + profile_options.add( + (option_t("update", "u", "Run only kernels missing from config", set_update))); + profile_options.add( + (option_t("dry-run", + "n", + "Dry run. Respect other options, but don't write to file", + set_dryrun))); + profile_options.add((option_t( + "json", "j", "Write results to JSON file named as argument value", set_json))); + profile_options.add( + (option_t("path", "p", "Specify the volk_config path", set_volk_config))); profile_options.parse(argc, argv); if (profile_options.present("help")) { return 0; } - if(dry_run) { - std::cout << "Warning: this IS a dry-run. Config will not be written!" << std::endl; + if (dry_run) { + std::cout << "Warning: this IS a dry-run. Config will not be written!" + << std::endl; } // Adding program options std::ofstream json_file; std::string config_file; - if ( json_filename != "" ) { - json_file.open( json_filename.c_str() ); + if (json_filename != "") { + json_file.open(json_filename.c_str()); } - if ( volk_config_path != "" ) { + if (volk_config_path != "") { config_file = volk_config_path + "/volk_config"; } // Run tests std::vector results; - if(update_mode) { - if( config_file != "" ) read_results(&results, config_file); - else read_results(&results); + if (update_mode) { + if (config_file != "") + read_results(&results, config_file); + else + read_results(&results); } // Initialize the list of tests @@ -118,22 +134,22 @@ int main(int argc, char *argv[]) { // Iterate through list of tests running each one std::string substr_to_match(test_params.kernel_regex()); - for(unsigned int ii = 0; ii < test_cases.size(); ++ii) { + for (unsigned int ii = 0; ii < test_cases.size(); ++ii) { bool regex_match = true; volk_test_case_t test_case = test_cases[ii]; // if the kernel name matches regex then do the test std::string test_case_name = test_case.name(); - if(test_case_name.find(substr_to_match) == std::string::npos) { + if (test_case_name.find(substr_to_match) == std::string::npos) { regex_match = false; } // if we are in update mode check if we've already got results // if we have any, then no need to test that kernel bool update = true; - if(update_mode) { - for(unsigned int jj=0; jj < results.size(); ++jj) { - if(results[jj].name == test_case.name() || + if (update_mode) { + for (unsigned int jj = 0; jj < results.size(); ++jj) { + if (results[jj].name == test_case.name() || results[jj].name == test_case.puppet_master_name()) { update = false; break; @@ -141,39 +157,44 @@ int main(int argc, char *argv[]) { } } - if( regex_match && update ) { + if (regex_match && update) { try { - run_volk_tests(test_case.desc(), test_case.kernel_ptr(), test_case.name(), - test_case.test_parameters(), &results, test_case.puppet_master_name()); - } - catch 
(std::string &error) { - std::cerr << "Caught Exception in 'run_volk_tests': " << error << std::endl; + run_volk_tests(test_case.desc(), + test_case.kernel_ptr(), + test_case.name(), + test_case.test_parameters(), + &results, + test_case.puppet_master_name()); + } catch (std::string& error) { + std::cerr << "Caught Exception in 'run_volk_tests': " << error + << std::endl; } } } // Output results according to provided options - if(json_filename != "") { + if (json_filename != "") { write_json(json_file, results); json_file.close(); } - if(!dry_run) { - if(config_file != "") write_results(&results, false, config_file); - else write_results(&results, false); - } - else { + if (!dry_run) { + if (config_file != "") + write_results(&results, false, config_file); + else + write_results(&results, false); + } else { std::cout << "Warning: this was a dry-run. Config not generated" << std::endl; } return 0; } -void read_results(std::vector *results) +void read_results(std::vector* results) { char path[1024]; volk_get_config_path(path, true); - if(path[0] == 0){ + if (path[0] == 0) { std::cout << "No prior test results found ..." << std::endl; return; } @@ -181,16 +202,16 @@ void read_results(std::vector *results) read_results(results, std::string(path)); } -void read_results(std::vector *results, std::string path) +void read_results(std::vector* results, std::string path) { struct stat buffer; - bool config_status = (stat (path.c_str(), &buffer) == 0); + bool config_status = (stat(path.c_str(), &buffer) == 0); - if( config_status ) { + if (config_status) { // a config exists and we are reading results from it std::ifstream config(path.c_str()); char config_line[256]; - while(config.getline(config_line, 255)) { + while (config.getline(config_line, 255)) { // tokenize the input line by kernel_name unaligned aligned // then push back in the results vector with fields filled in @@ -198,26 +219,26 @@ void read_results(std::vector *results, std::string path) std::string config_str(config_line); std::size_t str_size = config_str.size(); std::size_t found = config_str.find(' '); - + // Split line by spaces - while(found && found < str_size) { + while (found && found < str_size) { found = config_str.find(' '); // kernel names MUST be less than 128 chars, which is // a length restricted by volk/volk_prefs.c // on the last token in the parsed string we won't find a space // so make sure we copy at most 128 chars. - if(found > 127) { + if (found > 127) { found = 127; } str_size = config_str.size(); - char buffer[128] = {'\0'}; + char buffer[128] = { '\0' }; config_str.copy(buffer, found + 1, 0); buffer[found] = '\0'; single_kernel_result.push_back(std::string(buffer)); - config_str.erase(0, found+1); + config_str.erase(0, found + 1); } - if(single_kernel_result.size() == 3) { + if (single_kernel_result.size() == 3) { volk_test_results_t kernel_result; kernel_result.name = std::string(single_kernel_result[0]); kernel_result.config_name = std::string(single_kernel_result[0]); @@ -229,45 +250,47 @@ void read_results(std::vector *results, std::string path) } } -void write_results(const std::vector *results, bool update_result) +void write_results(const std::vector* results, bool update_result) { char path[1024]; volk_get_config_path(path, false); - if(path[0] == 0){ + if (path[0] == 0) { std::cout << "Aborting 'No config save path found' ..." 
<< std::endl; return; } - write_results( results, update_result, std::string(path)); + write_results(results, update_result, std::string(path)); } -void write_results(const std::vector *results, bool update_result, const std::string path) +void write_results(const std::vector* results, + bool update_result, + const std::string path) { -// struct stat buffer; -// bool config_status = (stat (path.c_str(), &buffer) == 0); + // struct stat buffer; + // bool config_status = (stat (path.c_str(), &buffer) == 0); /* * These */ const fs::path config_path(path); - if (! fs::exists(config_path.parent_path())) - { + if (!fs::exists(config_path.parent_path())) { std::cout << "Creating " << config_path.parent_path() << "..." << std::endl; fs::create_directories(config_path.parent_path()); } std::ofstream config; - if(update_result) { + if (update_result) { std::cout << "Updating " << path << "..." << std::endl; config.open(path.c_str(), std::ofstream::app); - if (!config.is_open()) { //either we don't have write access or we don't have the dir yet + if (!config.is_open()) { // either we don't have write access or we don't have the + // dir yet std::cout << "Error opening file " << path << std::endl; } - } - else { + } else { std::cout << "Writing " << path << "..." << std::endl; config.open(path.c_str()); - if (!config.is_open()) { //either we don't have write access or we don't have the dir yet + if (!config.is_open()) { // either we don't have write access or we don't have the + // dir yet std::cout << "Error opening file " << path << std::endl; } @@ -278,43 +301,45 @@ void write_results(const std::vector *results, bool update_ } std::vector::const_iterator profile_results; - for(profile_results = results->begin(); profile_results != results->end(); ++profile_results) { - config << profile_results->config_name << " " - << profile_results->best_arch_a << " " - << profile_results->best_arch_u << std::endl; + for (profile_results = results->begin(); profile_results != results->end(); + ++profile_results) { + config << profile_results->config_name << " " << profile_results->best_arch_a + << " " << profile_results->best_arch_u << std::endl; } config.close(); } -void write_json(std::ofstream &json_file, std::vector results) +void write_json(std::ofstream& json_file, std::vector results) { json_file << "{" << std::endl; json_file << " \"volk_tests\": [" << std::endl; size_t len = results.size(); size_t i = 0; std::vector::iterator result; - for(result = results.begin(); result != results.end(); ++result) { + for (result = results.begin(); result != results.end(); ++result) { json_file << " {" << std::endl; json_file << " \"name\": \"" << result->name << "\"," << std::endl; json_file << " \"vlen\": " << (int)(result->vlen) << "," << std::endl; json_file << " \"iter\": " << result->iter << "," << std::endl; - json_file << " \"best_arch_a\": \"" << result->best_arch_a - << "\"," << std::endl; - json_file << " \"best_arch_u\": \"" << result->best_arch_u - << "\"," << std::endl; + json_file << " \"best_arch_a\": \"" << result->best_arch_a << "\"," + << std::endl; + json_file << " \"best_arch_u\": \"" << result->best_arch_u << "\"," + << std::endl; json_file << " \"results\": {" << std::endl; size_t results_len = result->results.size(); size_t ri = 0; std::map::iterator kernel_time_pair; - for(kernel_time_pair = result->results.begin(); kernel_time_pair != result->results.end(); ++kernel_time_pair) { + for (kernel_time_pair = result->results.begin(); + kernel_time_pair != result->results.end(); + 
++kernel_time_pair) { volk_test_time_t time = kernel_time_pair->second; json_file << " \"" << time.name << "\": {" << std::endl; json_file << " \"name\": \"" << time.name << "\"," << std::endl; json_file << " \"time\": " << time.time << "," << std::endl; json_file << " \"units\": \"" << time.units << "\"" << std::endl; - json_file << " }" ; - if(ri+1 != results_len) { + json_file << " }"; + if (ri + 1 != results_len) { json_file << ","; } json_file << std::endl; @@ -322,7 +347,7 @@ void write_json(std::ofstream &json_file, std::vector resul } json_file << " }" << std::endl; json_file << " }"; - if(i+1 != len) { + if (i + 1 != len) { json_file << ","; } json_file << std::endl; diff --git a/apps/volk_profile.h b/apps/volk_profile.h index 51629ab..ae3b474 100644 --- a/apps/volk_profile.h +++ b/apps/volk_profile.h @@ -1,14 +1,16 @@ -#include // for bool -#include // for ofstream -#include // for string -#include // for vector +#include // for bool +#include // for ofstream +#include // for string +#include // for vector class volk_test_results_t; -void read_results(std::vector *results); -void read_results(std::vector *results, std::string path); -void write_results(const std::vector *results, bool update_result); -void write_results(const std::vector *results, bool update_result, const std::string path); -void write_json(std::ofstream &json_file, std::vector results); +void read_results(std::vector* results); +void read_results(std::vector* results, std::string path); +void write_results(const std::vector* results, bool update_result); +void write_results(const std::vector* results, + bool update_result, + const std::string path); +void write_json(std::ofstream& json_file, std::vector results); diff --git a/cmake/msvc/config.h b/cmake/msvc/config.h index 8b12c2a..68f716e 100644 --- a/cmake/msvc/config.h +++ b/cmake/msvc/config.h @@ -9,7 +9,7 @@ // enable inline functions for C code //////////////////////////////////////////////////////////////////////// #ifndef __cplusplus -# define inline __inline +#define inline __inline #endif //////////////////////////////////////////////////////////////////////// @@ -23,12 +23,21 @@ typedef ptrdiff_t ssize_t; //////////////////////////////////////////////////////////////////////// #if _MSC_VER < 1800 #include -static inline long lrint(double x){return (long)(x > 0.0 ? x + 0.5 : x - 0.5);} -static inline long lrintf(float x){return (long)(x > 0.0f ? x + 0.5f : x - 0.5f);} -static inline long long llrint(double x){return (long long)(x > 0.0 ? x + 0.5 : x - 0.5);} -static inline long long llrintf(float x){return (long long)(x > 0.0f ? x + 0.5f : x - 0.5f);} -static inline double rint(double x){return (x > 0.0)? floor(x + 0.5) : ceil(x - 0.5);} -static inline float rintf(float x){return (x > 0.0f)? floorf(x + 0.5f) : ceilf(x - 0.5f);} +static inline long lrint(double x) { return (long)(x > 0.0 ? x + 0.5 : x - 0.5); } +static inline long lrintf(float x) { return (long)(x > 0.0f ? x + 0.5f : x - 0.5f); } +static inline long long llrint(double x) +{ + return (long long)(x > 0.0 ? x + 0.5 : x - 0.5); +} +static inline long long llrintf(float x) +{ + return (long long)(x > 0.0f ? x + 0.5f : x - 0.5f); +} +static inline double rint(double x) { return (x > 0.0) ? floor(x + 0.5) : ceil(x - 0.5); } +static inline float rintf(float x) +{ + return (x > 0.0f) ? floorf(x + 0.5f) : ceilf(x - 0.5f); +} #endif //////////////////////////////////////////////////////////////////////// @@ -43,7 +52,7 @@ static inline float rintf(float x){return (x > 0.0f)? 
floorf(x + 0.5f) : ceilf(x // random and srandom //////////////////////////////////////////////////////////////////////// #include -static inline long int random (void) { return rand(); } -static inline void srandom (unsigned int seed) { srand(seed); } +static inline long int random(void) { return rand(); } +static inline void srandom(unsigned int seed) { srand(seed); } #endif // _MSC_CONFIG_H_ ] diff --git a/cmake/msvc/sys/time.h b/cmake/msvc/sys/time.h index aa0f5dc..4bda1ba 100644 --- a/cmake/msvc/sys/time.h +++ b/cmake/msvc/sys/time.h @@ -10,67 +10,62 @@ #define NOMINMAX #endif -//http://social.msdn.microsoft.com/Forums/en/vcgeneral/thread/430449b3-f6dd-4e18-84de-eebd26a8d668 +// http://social.msdn.microsoft.com/Forums/en/vcgeneral/thread/430449b3-f6dd-4e18-84de-eebd26a8d668 #include < time.h > #include //I've omitted this line. #if defined(_MSC_VER) || defined(_MSC_EXTENSIONS) - #define DELTA_EPOCH_IN_MICROSECS 11644473600000000Ui64 +#define DELTA_EPOCH_IN_MICROSECS 11644473600000000Ui64 #else - #define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL +#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL #endif #if _MSC_VER < 1900 struct timespec { -time_t tv_sec; /* Seconds since 00:00:00 GMT, */ + time_t tv_sec; /* Seconds since 00:00:00 GMT, */ -/* 1 January 1970 */ + /* 1 January 1970 */ -long tv_nsec; /* Additional nanoseconds since */ - -/* tv_sec */ + long tv_nsec; /* Additional nanoseconds since */ + /* tv_sec */ }; #endif -struct timezone -{ - int tz_minuteswest; /* minutes W of Greenwich */ - int tz_dsttime; /* type of dst correction */ +struct timezone { + int tz_minuteswest; /* minutes W of Greenwich */ + int tz_dsttime; /* type of dst correction */ }; -static inline int gettimeofday(struct timeval *tv, struct timezone *tz) +static inline int gettimeofday(struct timeval* tv, struct timezone* tz) { - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - if (NULL != tz) - { - if (!tzflag) - { - _tzset(); - tzflag++; + FILETIME ft; + unsigned __int64 tmpres = 0; + static int tzflag; + + if (NULL != tv) { + GetSystemTimeAsFileTime(&ft); + + tmpres |= ft.dwHighDateTime; + tmpres <<= 32; + tmpres |= ft.dwLowDateTime; + + /*converting file time to unix epoch*/ + tmpres -= DELTA_EPOCH_IN_MICROSECS; + tv->tv_sec = (long)(tmpres / 1000000UL); + tv->tv_usec = (long)(tmpres % 1000000UL); + } + + if (NULL != tz) { + if (!tzflag) { + _tzset(); + tzflag++; + } + tz->tz_minuteswest = _timezone / 60; + tz->tz_dsttime = _daylight; } - tz->tz_minuteswest = _timezone / 60; - tz->tz_dsttime = _daylight; - } - return 0; + return 0; } #endif //_MSC_SYS_TIME_H_ diff --git a/include/volk/saturation_arithmetic.h b/include/volk/saturation_arithmetic.h index 0886844..7b95ba2 100644 --- a/include/volk/saturation_arithmetic.h +++ b/include/volk/saturation_arithmetic.h @@ -28,20 +28,24 @@ static inline int16_t sat_adds16i(int16_t x, int16_t y) { - int32_t res = (int32_t) x + (int32_t) y; + int32_t res = (int32_t)x + (int32_t)y; - if (res < SHRT_MIN) res = SHRT_MIN; - if (res > SHRT_MAX) res = SHRT_MAX; + if (res < SHRT_MIN) + res = SHRT_MIN; + if (res > SHRT_MAX) + res = SHRT_MAX; return res; } static inline int16_t sat_muls16i(int16_t x, int16_t y) { - int32_t res = (int32_t) x * 
(int32_t) y; + int32_t res = (int32_t)x * (int32_t)y; - if (res < SHRT_MIN) res = SHRT_MIN; - if (res > SHRT_MAX) res = SHRT_MAX; + if (res < SHRT_MIN) + res = SHRT_MIN; + if (res > SHRT_MAX) + res = SHRT_MAX; return res; } diff --git a/include/volk/volk_alloc.hh b/include/volk/volk_alloc.hh index a2975da..44bcfaf 100644 --- a/include/volk/volk_alloc.hh +++ b/include/volk/volk_alloc.hh @@ -40,30 +40,40 @@ namespace volk { */ template struct alloc { - typedef T value_type; + typedef T value_type; - alloc() = default; + alloc() = default; - template constexpr alloc(alloc const&) noexcept {} + template + constexpr alloc(alloc const&) noexcept + { + } - T* allocate(std::size_t n) { - if (n > std::numeric_limits::max() / sizeof(T)) throw std::bad_alloc(); + T* allocate(std::size_t n) + { + if (n > std::numeric_limits::max() / sizeof(T)) + throw std::bad_alloc(); - if (auto p = static_cast(volk_malloc(n*sizeof(T), volk_get_alignment()))) - return p; + if (auto p = static_cast(volk_malloc(n * sizeof(T), volk_get_alignment()))) + return p; - throw std::bad_alloc(); - } + throw std::bad_alloc(); + } - void deallocate(T* p, std::size_t) noexcept { volk_free(p); } - -} ; + void deallocate(T* p, std::size_t) noexcept { volk_free(p); } +}; template -bool operator==(alloc const&, alloc const&) { return true; } +bool operator==(alloc const&, alloc const&) +{ + return true; +} template -bool operator!=(alloc const&, alloc const&) { return false; } +bool operator!=(alloc const&, alloc const&) +{ + return false; +} /*! @@ -73,8 +83,8 @@ bool operator!=(alloc const&, alloc const&) { return false; } * example code: * volk::vector v(100); // vector using volk_malloc, volk_free */ -template -using vector = std::vector >; +template +using vector = std::vector>; } // namespace volk #endif // INCLUDED_VOLK_ALLOC_H diff --git a/include/volk/volk_avx2_intrinsics.h b/include/volk/volk_avx2_intrinsics.h index 17badc4..00f3b52 100644 --- a/include/volk/volk_avx2_intrinsics.h +++ b/include/volk/volk_avx2_intrinsics.h @@ -1,19 +1,19 @@ /* -*- c++ -*- */ -/* +/* * Copyright 2015 Free Software Foundation, Inc. - * + * * This file is part of GNU Radio - * + * * GNU Radio is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 3, or (at your option) * any later version. - * + * * GNU Radio is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. - * + * * You should have received a copy of the GNU General Public License * along with GNU Radio; see the file COPYING. 
If not, write to * the Free Software Foundation, Inc., 51 Franklin Street, @@ -27,28 +27,59 @@ #ifndef INCLUDE_VOLK_VOLK_AVX2_INTRINSICS_H_ #define INCLUDE_VOLK_VOLK_AVX2_INTRINSICS_H_ -#include #include "volk/volk_avx_intrinsics.h" +#include -static inline __m256 -_mm256_polar_sign_mask_avx2(__m128i fbits){ - const __m128i zeros = _mm_set1_epi8(0x00); - const __m128i sign_extract = _mm_set1_epi8(0x80); - const __m256i shuffle_mask = _mm256_setr_epi8(0xff, 0xff, 0xff, 0x00, 0xff, 0xff, 0xff, 0x01, 0xff, 0xff, 0xff, 0x02, 0xff, 0xff, 0xff, 0x03, - 0xff, 0xff, 0xff, 0x04, 0xff, 0xff, 0xff, 0x05, 0xff, 0xff, 0xff, 0x06, 0xff, 0xff, 0xff, 0x07); - __m256i sign_bits = _mm256_setzero_si256(); - - fbits = _mm_cmpgt_epi8(fbits, zeros); - fbits = _mm_and_si128(fbits, sign_extract); - sign_bits = _mm256_insertf128_si256(sign_bits,fbits,0); - sign_bits = _mm256_insertf128_si256(sign_bits,fbits,1); - sign_bits = _mm256_shuffle_epi8(sign_bits, shuffle_mask); +static inline __m256 _mm256_polar_sign_mask_avx2(__m128i fbits) +{ + const __m128i zeros = _mm_set1_epi8(0x00); + const __m128i sign_extract = _mm_set1_epi8(0x80); + const __m256i shuffle_mask = _mm256_setr_epi8(0xff, + 0xff, + 0xff, + 0x00, + 0xff, + 0xff, + 0xff, + 0x01, + 0xff, + 0xff, + 0xff, + 0x02, + 0xff, + 0xff, + 0xff, + 0x03, + 0xff, + 0xff, + 0xff, + 0x04, + 0xff, + 0xff, + 0xff, + 0x05, + 0xff, + 0xff, + 0xff, + 0x06, + 0xff, + 0xff, + 0xff, + 0x07); + __m256i sign_bits = _mm256_setzero_si256(); - return _mm256_castsi256_ps(sign_bits); + fbits = _mm_cmpgt_epi8(fbits, zeros); + fbits = _mm_and_si128(fbits, sign_extract); + sign_bits = _mm256_insertf128_si256(sign_bits, fbits, 0); + sign_bits = _mm256_insertf128_si256(sign_bits, fbits, 1); + sign_bits = _mm256_shuffle_epi8(sign_bits, shuffle_mask); + + return _mm256_castsi256_ps(sign_bits); } static inline __m256 -_mm256_polar_fsign_add_llrs_avx2(__m256 src0, __m256 src1, __m128i fbits){ +_mm256_polar_fsign_add_llrs_avx2(__m256 src0, __m256 src1, __m128i fbits) +{ // prepare sign mask for correct +- __m256 sign_mask = _mm256_polar_sign_mask_avx2(fbits); @@ -61,26 +92,31 @@ _mm256_polar_fsign_add_llrs_avx2(__m256 src0, __m256 src1, __m128i fbits){ return dst; } -static inline __m256 -_mm256_magnitudesquared_ps_avx2(const __m256 cplxValue0, const __m256 cplxValue1){ - const __m256i idx = _mm256_set_epi32(7,6,3,2,5,4,1,0); - const __m256 squared0 = _mm256_mul_ps(cplxValue0, cplxValue0); // Square the values - const __m256 squared1 = _mm256_mul_ps(cplxValue1, cplxValue1); // Square the Values - const __m256 complex_result = _mm256_hadd_ps(squared0, squared1); - return _mm256_permutevar8x32_ps(complex_result, idx); +static inline __m256 _mm256_magnitudesquared_ps_avx2(const __m256 cplxValue0, + const __m256 cplxValue1) +{ + const __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0); + const __m256 squared0 = _mm256_mul_ps(cplxValue0, cplxValue0); // Square the values + const __m256 squared1 = _mm256_mul_ps(cplxValue1, cplxValue1); // Square the Values + const __m256 complex_result = _mm256_hadd_ps(squared0, squared1); + return _mm256_permutevar8x32_ps(complex_result, idx); } -static inline __m256 -_mm256_scaled_norm_dist_ps_avx2(const __m256 symbols0, const __m256 symbols1, const __m256 points0, const __m256 points1, const __m256 scalar){ - /* - * Calculate: |y - x|^2 * SNR_lin - * Consider 'symbolsX' and 'pointsX' to be complex float - * 'symbolsX' are 'y' and 'pointsX' are 'x' - */ - const __m256 diff0 = _mm256_sub_ps(symbols0, points0); - const __m256 diff1 = _mm256_sub_ps(symbols1, 
points1); - const __m256 norms = _mm256_magnitudesquared_ps_avx2(diff0, diff1); - return _mm256_mul_ps(norms, scalar); +static inline __m256 _mm256_scaled_norm_dist_ps_avx2(const __m256 symbols0, + const __m256 symbols1, + const __m256 points0, + const __m256 points1, + const __m256 scalar) +{ + /* + * Calculate: |y - x|^2 * SNR_lin + * Consider 'symbolsX' and 'pointsX' to be complex float + * 'symbolsX' are 'y' and 'pointsX' are 'x' + */ + const __m256 diff0 = _mm256_sub_ps(symbols0, points0); + const __m256 diff1 = _mm256_sub_ps(symbols1, points1); + const __m256 norms = _mm256_magnitudesquared_ps_avx2(diff0, diff1); + return _mm256_mul_ps(norms, scalar); } #endif /* INCLUDE_VOLK_VOLK_AVX2_INTRINSICS_H_ */ diff --git a/include/volk/volk_avx_intrinsics.h b/include/volk/volk_avx_intrinsics.h index 808799f..bec846d 100644 --- a/include/volk/volk_avx_intrinsics.h +++ b/include/volk/volk_avx_intrinsics.h @@ -1,19 +1,19 @@ /* -*- c++ -*- */ -/* +/* * Copyright 2015 Free Software Foundation, Inc. - * + * * This file is part of GNU Radio - * + * * GNU Radio is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 3, or (at your option) * any later version. - * + * * GNU Radio is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. - * + * * You should have received a copy of the GNU General Public License * along with GNU Radio; see the file COPYING. If not, write to * the Free Software Foundation, Inc., 51 Franklin Street, @@ -29,90 +29,126 @@ #define INCLUDE_VOLK_VOLK_AVX_INTRINSICS_H_ #include -static inline __m256 -_mm256_complexmul_ps(__m256 x, __m256 y) +static inline __m256 _mm256_complexmul_ps(__m256 x, __m256 y) { - __m256 yl, yh, tmp1, tmp2; - yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr ... - yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di ... - tmp1 = _mm256_mul_ps(x, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr ... - x = _mm256_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br ... - tmp2 = _mm256_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - return _mm256_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + __m256 yl, yh, tmp1, tmp2; + yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr ... + yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di ... + tmp1 = _mm256_mul_ps(x, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr ... + x = _mm256_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br ... 
+ tmp2 = _mm256_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + return _mm256_addsub_ps(tmp1, + tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di } -static inline __m256 -_mm256_conjugate_ps(__m256 x){ - const __m256 conjugator = _mm256_setr_ps(0, -0.f, 0, -0.f, 0, -0.f, 0, -0.f); - return _mm256_xor_ps(x, conjugator); // conjugate y +static inline __m256 _mm256_conjugate_ps(__m256 x) +{ + const __m256 conjugator = _mm256_setr_ps(0, -0.f, 0, -0.f, 0, -0.f, 0, -0.f); + return _mm256_xor_ps(x, conjugator); // conjugate y } -static inline __m256 -_mm256_complexconjugatemul_ps(__m256 x, __m256 y){ - y = _mm256_conjugate_ps(y); - return _mm256_complexmul_ps(x, y); +static inline __m256 _mm256_complexconjugatemul_ps(__m256 x, __m256 y) +{ + y = _mm256_conjugate_ps(y); + return _mm256_complexmul_ps(x, y); } -static inline __m256 -_mm256_normalize_ps(__m256 val) +static inline __m256 _mm256_normalize_ps(__m256 val) { - __m256 tmp1 = _mm256_mul_ps(val, val); - tmp1 = _mm256_hadd_ps(tmp1, tmp1); - tmp1 = _mm256_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(3, 1, 2, 0)); // equals 0xD8 - tmp1 = _mm256_sqrt_ps(tmp1); - return _mm256_div_ps(val, tmp1); + __m256 tmp1 = _mm256_mul_ps(val, val); + tmp1 = _mm256_hadd_ps(tmp1, tmp1); + tmp1 = _mm256_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(3, 1, 2, 0)); // equals 0xD8 + tmp1 = _mm256_sqrt_ps(tmp1); + return _mm256_div_ps(val, tmp1); } -static inline __m256 -_mm256_magnitudesquared_ps(__m256 cplxValue1, __m256 cplxValue2){ - __m256 complex1, complex2; - cplxValue1 = _mm256_mul_ps(cplxValue1, cplxValue1); // Square the values - cplxValue2 = _mm256_mul_ps(cplxValue2, cplxValue2); // Square the Values - complex1 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x20); - complex2 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x31); - return _mm256_hadd_ps(complex1, complex2); // Add the I2 and Q2 values +static inline __m256 _mm256_magnitudesquared_ps(__m256 cplxValue1, __m256 cplxValue2) +{ + __m256 complex1, complex2; + cplxValue1 = _mm256_mul_ps(cplxValue1, cplxValue1); // Square the values + cplxValue2 = _mm256_mul_ps(cplxValue2, cplxValue2); // Square the Values + complex1 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x20); + complex2 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x31); + return _mm256_hadd_ps(complex1, complex2); // Add the I2 and Q2 values } -static inline __m256 -_mm256_magnitude_ps(__m256 cplxValue1, __m256 cplxValue2){ - return _mm256_sqrt_ps(_mm256_magnitudesquared_ps(cplxValue1, cplxValue2)); +static inline __m256 _mm256_magnitude_ps(__m256 cplxValue1, __m256 cplxValue2) +{ + return _mm256_sqrt_ps(_mm256_magnitudesquared_ps(cplxValue1, cplxValue2)); } -static inline __m256 -_mm256_scaled_norm_dist_ps(const __m256 symbols0, const __m256 symbols1, const __m256 points0, const __m256 points1, const __m256 scalar){ - /* - * Calculate: |y - x|^2 * SNR_lin - * Consider 'symbolsX' and 'pointsX' to be complex float - * 'symbolsX' are 'y' and 'pointsX' are 'x' - */ - const __m256 diff0 = _mm256_sub_ps(symbols0, points0); - const __m256 diff1 = _mm256_sub_ps(symbols1, points1); - const __m256 norms = _mm256_magnitudesquared_ps(diff0, diff1); - return _mm256_mul_ps(norms, scalar); +static inline __m256 _mm256_scaled_norm_dist_ps(const __m256 symbols0, + const __m256 symbols1, + const __m256 points0, + const __m256 points1, + const __m256 scalar) +{ + /* + * Calculate: |y - x|^2 * SNR_lin + * Consider 'symbolsX' and 'pointsX' to be complex float + * 'symbolsX' are 'y' and 'pointsX' are 'x' + */ + const __m256 diff0 = _mm256_sub_ps(symbols0, 
points0); + const __m256 diff1 = _mm256_sub_ps(symbols1, points1); + const __m256 norms = _mm256_magnitudesquared_ps(diff0, diff1); + return _mm256_mul_ps(norms, scalar); } -static inline __m256 -_mm256_polar_sign_mask(__m128i fbits){ - __m256 sign_mask_dummy = _mm256_setzero_ps(); - const __m128i zeros = _mm_set1_epi8(0x00); - const __m128i sign_extract = _mm_set1_epi8(0x80); - const __m128i shuffle_mask0 = _mm_setr_epi8(0xff, 0xff, 0xff, 0x00, 0xff, 0xff, 0xff, 0x01, 0xff, 0xff, 0xff, 0x02, 0xff, 0xff, 0xff, 0x03); - const __m128i shuffle_mask1 = _mm_setr_epi8(0xff, 0xff, 0xff, 0x04, 0xff, 0xff, 0xff, 0x05, 0xff, 0xff, 0xff, 0x06, 0xff, 0xff, 0xff, 0x07); - - fbits = _mm_cmpgt_epi8(fbits, zeros); - fbits = _mm_and_si128(fbits, sign_extract); - __m128i sign_bits0 = _mm_shuffle_epi8(fbits, shuffle_mask0); - __m128i sign_bits1 = _mm_shuffle_epi8(fbits, shuffle_mask1); - - __m256 sign_mask = _mm256_insertf128_ps(sign_mask_dummy, _mm_castsi128_ps(sign_bits0), 0x0); - return _mm256_insertf128_ps(sign_mask, _mm_castsi128_ps(sign_bits1), 0x1); -// // This is the desired function call. Though it seems to be missing in GCC. -// // Compare: https://software.intel.com/sites/landingpage/IntrinsicsGuide/# -// return _mm256_set_m128(_mm_castsi128_ps(sign_bits1), _mm_castsi128_ps(sign_bits0)); +static inline __m256 _mm256_polar_sign_mask(__m128i fbits) +{ + __m256 sign_mask_dummy = _mm256_setzero_ps(); + const __m128i zeros = _mm_set1_epi8(0x00); + const __m128i sign_extract = _mm_set1_epi8(0x80); + const __m128i shuffle_mask0 = _mm_setr_epi8(0xff, + 0xff, + 0xff, + 0x00, + 0xff, + 0xff, + 0xff, + 0x01, + 0xff, + 0xff, + 0xff, + 0x02, + 0xff, + 0xff, + 0xff, + 0x03); + const __m128i shuffle_mask1 = _mm_setr_epi8(0xff, + 0xff, + 0xff, + 0x04, + 0xff, + 0xff, + 0xff, + 0x05, + 0xff, + 0xff, + 0xff, + 0x06, + 0xff, + 0xff, + 0xff, + 0x07); + + fbits = _mm_cmpgt_epi8(fbits, zeros); + fbits = _mm_and_si128(fbits, sign_extract); + __m128i sign_bits0 = _mm_shuffle_epi8(fbits, shuffle_mask0); + __m128i sign_bits1 = _mm_shuffle_epi8(fbits, shuffle_mask1); + + __m256 sign_mask = + _mm256_insertf128_ps(sign_mask_dummy, _mm_castsi128_ps(sign_bits0), 0x0); + return _mm256_insertf128_ps(sign_mask, _mm_castsi128_ps(sign_bits1), 0x1); + // // This is the desired function call. Though it seems to be missing in GCC. 
+ // // Compare: https://software.intel.com/sites/landingpage/IntrinsicsGuide/# + // return _mm256_set_m128(_mm_castsi128_ps(sign_bits1), + // _mm_castsi128_ps(sign_bits0)); } static inline void -_mm256_polar_deinterleave(__m256 *llr0, __m256 *llr1, __m256 src0, __m256 src1){ +_mm256_polar_deinterleave(__m256* llr0, __m256* llr1, __m256 src0, __m256 src1) +{ // deinterleave values __m256 part0 = _mm256_permute2f128_ps(src0, src1, 0x20); __m256 part1 = _mm256_permute2f128_ps(src0, src1, 0x31); @@ -120,22 +156,25 @@ _mm256_polar_deinterleave(__m256 *llr0, __m256 *llr1, __m256 src0, __m256 src1){ *llr1 = _mm256_shuffle_ps(part0, part1, 0xdd); } -static inline __m256 -_mm256_polar_minsum_llrs(__m256 src0, __m256 src1){ +static inline __m256 _mm256_polar_minsum_llrs(__m256 src0, __m256 src1) +{ const __m256 sign_mask = _mm256_set1_ps(-0.0f); - const __m256 abs_mask = _mm256_andnot_ps(sign_mask, _mm256_castsi256_ps(_mm256_set1_epi8(0xff))); + const __m256 abs_mask = + _mm256_andnot_ps(sign_mask, _mm256_castsi256_ps(_mm256_set1_epi8(0xff))); __m256 llr0, llr1; _mm256_polar_deinterleave(&llr0, &llr1, src0, src1); // calculate result - __m256 sign = _mm256_xor_ps(_mm256_and_ps(llr0, sign_mask), _mm256_and_ps(llr1, sign_mask)); - __m256 dst = _mm256_min_ps(_mm256_and_ps(llr0, abs_mask), _mm256_and_ps(llr1, abs_mask)); + __m256 sign = + _mm256_xor_ps(_mm256_and_ps(llr0, sign_mask), _mm256_and_ps(llr1, sign_mask)); + __m256 dst = + _mm256_min_ps(_mm256_and_ps(llr0, abs_mask), _mm256_and_ps(llr1, abs_mask)); return _mm256_or_ps(dst, sign); } -static inline __m256 -_mm256_polar_fsign_add_llrs(__m256 src0, __m256 src1, __m128i fbits){ +static inline __m256 _mm256_polar_fsign_add_llrs(__m256 src0, __m256 src1, __m128i fbits) +{ // prepare sign mask for correct +- __m256 sign_mask = _mm256_polar_sign_mask(fbits); diff --git a/include/volk/volk_common.h b/include/volk/volk_common.h index 50ea07b..8167d23 100644 --- a/include/volk/volk_common.h +++ b/include/volk/volk_common.h @@ -18,61 +18,71 @@ // AppleClang also defines __GNUC__, so do this check first. These // will probably be the same as for __GNUC__, but let's keep them // separate just to be safe. 
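For context while reading this hunk: the __VOLK_ATTR_* macros that get re-indented below are the ones the kernels later in the patch rely on. A minimal usage sketch, illustrative only and assuming a GCC-compatible toolchain with volk_common.h on the include path:

#include <volk/volk_common.h>

/* 16-byte aligned scratch buffer, as the SSE kernels further down expect. */
__VOLK_ATTR_ALIGNED(16) static float dot_product_vector[4];

/* A parameter that some kernel variants deliberately ignore. */
static void example(int unused_flag __VOLK_ATTR_UNUSED)
{
    /* Hint the hardware prefetcher at the next block of the buffer. */
    __VOLK_PREFETCH(dot_product_vector);
}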
-# define __VOLK_ATTR_ALIGNED(x) __attribute__((aligned(x)))
-# define __VOLK_ATTR_UNUSED __attribute__((unused))
-# define __VOLK_ATTR_INLINE __attribute__((always_inline))
-# define __VOLK_ATTR_DEPRECATED __attribute__((deprecated))
-# define __VOLK_ASM __asm__
-# define __VOLK_VOLATILE __volatile__
-# define __VOLK_ATTR_EXPORT __attribute__((visibility("default")))
-# define __VOLK_ATTR_IMPORT __attribute__((visibility("default")))
-# define __VOLK_PREFETCH(addr) __builtin_prefetch(addr)
-#elif defined(__GNUC__)
-# define __VOLK_ATTR_ALIGNED(x) __attribute__((aligned(x)))
-# define __VOLK_ATTR_UNUSED __attribute__((unused))
-# define __VOLK_ATTR_INLINE __attribute__((always_inline))
-# define __VOLK_ATTR_DEPRECATED __attribute__((deprecated))
-# define __VOLK_ASM __asm__
-# define __VOLK_VOLATILE __volatile__
-# if __GNUC__ >= 4
-# define __VOLK_ATTR_EXPORT __attribute__((visibility("default")))
-# define __VOLK_ATTR_IMPORT __attribute__((visibility("default")))
-# else
-# define __VOLK_ATTR_EXPORT
-# define __VOLK_ATTR_IMPORT
-# endif
-# define __VOLK_PREFETCH(addr) __builtin_prefetch(addr)
+#define __VOLK_ATTR_ALIGNED(x) __attribute__((aligned(x)))
+#define __VOLK_ATTR_UNUSED __attribute__((unused))
+#define __VOLK_ATTR_INLINE __attribute__((always_inline))
+#define __VOLK_ATTR_DEPRECATED __attribute__((deprecated))
+#define __VOLK_ASM __asm__
+#define __VOLK_VOLATILE __volatile__
+#define __VOLK_ATTR_EXPORT __attribute__((visibility("default")))
+#define __VOLK_ATTR_IMPORT __attribute__((visibility("default")))
+#define __VOLK_PREFETCH(addr) __builtin_prefetch(addr)
+#elif defined __GNUC__
+#define __VOLK_ATTR_ALIGNED(x) __attribute__((aligned(x)))
+#define __VOLK_ATTR_UNUSED __attribute__((unused))
+#define __VOLK_ATTR_INLINE __attribute__((always_inline))
+#define __VOLK_ATTR_DEPRECATED __attribute__((deprecated))
+#define __VOLK_ASM __asm__
+#define __VOLK_VOLATILE __volatile__
+#if __GNUC__ >= 4
+#define __VOLK_ATTR_EXPORT __attribute__((visibility("default")))
+#define __VOLK_ATTR_IMPORT __attribute__((visibility("default")))
 #else
-# warning "Unknown compiler. Using default VOLK macros, which may or not work."
-# define __VOLK_ATTR_ALIGNED(x) -# define __VOLK_ATTR_UNUSED -# define __VOLK_ATTR_INLINE -# define __VOLK_ATTR_DEPRECATED -# define __VOLK_ATTR_EXPORT -# define __VOLK_ATTR_IMPORT -# define __VOLK_PREFETCH(addr) -# define __VOLK_ASM __asm__ -# define __VOLK_VOLATILE __volatile__ +#define __VOLK_ATTR_EXPORT +#define __VOLK_ATTR_IMPORT +#endif +#define __VOLK_PREFETCH(addr) __builtin_prefetch(addr) +#elif _MSC_VER +#define __VOLK_ATTR_ALIGNED(x) __declspec(align(x)) +#define __VOLK_ATTR_UNUSED +#define __VOLK_ATTR_INLINE __forceinline +#define __VOLK_ATTR_DEPRECATED __declspec(deprecated) +#define __VOLK_ATTR_EXPORT __declspec(dllexport) +#define __VOLK_ATTR_IMPORT __declspec(dllimport) +#define __VOLK_PREFETCH(addr) +#define __VOLK_ASM __asm +#define __VOLK_VOLATILE +#else +#define __VOLK_ATTR_ALIGNED(x) +#define __VOLK_ATTR_UNUSED +#define __VOLK_ATTR_INLINE +#define __VOLK_ATTR_DEPRECATED +#define __VOLK_ATTR_EXPORT +#define __VOLK_ATTR_IMPORT +#define __VOLK_PREFETCH(addr) +#define __VOLK_ASM __asm__ +#define __VOLK_VOLATILE __volatile__ #endif //////////////////////////////////////////////////////////////////////// // Ignore annoying warnings in MSVC //////////////////////////////////////////////////////////////////////// #if defined(_MSC_VER) -# pragma warning(disable: 4244) //'conversion' conversion from 'type1' to 'type2', possible loss of data -# pragma warning(disable: 4305) //'identifier' : truncation from 'type1' to 'type2' +#pragma warning(disable : 4244) //'conversion' conversion from 'type1' to 'type2', + //possible loss of data +#pragma warning(disable : 4305) //'identifier' : truncation from 'type1' to 'type2' #endif //////////////////////////////////////////////////////////////////////// // C-linkage declaration macros // FIXME: due to the usage of complex.h, require gcc for c-linkage //////////////////////////////////////////////////////////////////////// -#if defined(__cplusplus) && (defined(__GNUC__) || defined(__clang__)) -# define __VOLK_DECL_BEGIN extern "C" { -# define __VOLK_DECL_END } +#if defined(__cplusplus) && (__GNUC__) +#define __VOLK_DECL_BEGIN extern "C" { +#define __VOLK_DECL_END } #else -# define __VOLK_DECL_BEGIN -# define __VOLK_DECL_END +#define __VOLK_DECL_BEGIN +#define __VOLK_DECL_END #endif //////////////////////////////////////////////////////////////////////// @@ -80,9 +90,9 @@ // http://gcc.gnu.org/wiki/Visibility //////////////////////////////////////////////////////////////////////// #ifdef volk_EXPORTS -# define VOLK_API __VOLK_ATTR_EXPORT +#define VOLK_API __VOLK_ATTR_EXPORT #else -# define VOLK_API __VOLK_ATTR_IMPORT +#define VOLK_API __VOLK_ATTR_IMPORT #endif //////////////////////////////////////////////////////////////////////// @@ -98,38 +108,38 @@ #endif #endif -union bit128{ - uint8_t i8[16]; - uint16_t i16[8]; - uint32_t i[4]; - float f[4]; - double d[2]; +union bit128 { + uint8_t i8[16]; + uint16_t i16[8]; + uint32_t i[4]; + float f[4]; + double d[2]; - #ifdef LV_HAVE_SSE - __m128 float_vec; - #endif +#ifdef LV_HAVE_SSE + __m128 float_vec; +#endif - #ifdef LV_HAVE_SSE2 - __m128i int_vec; - __m128d double_vec; - #endif +#ifdef LV_HAVE_SSE2 + __m128i int_vec; + __m128d double_vec; +#endif }; -union bit256{ - uint8_t i8[32]; - uint16_t i16[16]; - uint32_t i[8]; - float f[8]; - double d[4]; +union bit256 { + uint8_t i8[32]; + uint16_t i16[16]; + uint32_t i[8]; + float f[8]; + double d[4]; - #ifdef LV_HAVE_AVX - __m256 float_vec; - __m256i int_vec; - __m256d double_vec; - #endif +#ifdef LV_HAVE_AVX + __m256 float_vec; + __m256i int_vec; 
+ __m256d double_vec; +#endif }; -#define bit128_p(x) ((union bit128 *)(x)) -#define bit256_p(x) ((union bit256 *)(x)) +#define bit128_p(x) ((union bit128*)(x)) +#define bit256_p(x) ((union bit256*)(x)) #endif /*INCLUDED_LIBVOLK_COMMON_H*/ diff --git a/include/volk/volk_complex.h b/include/volk/volk_complex.h index 1d61d78..ae78873 100644 --- a/include/volk/volk_complex.h +++ b/include/volk/volk_complex.h @@ -19,49 +19,58 @@ #ifdef __cplusplus -#include #include +#include -typedef std::complex lv_8sc_t; +typedef std::complex lv_8sc_t; typedef std::complex lv_16sc_t; typedef std::complex lv_32sc_t; typedef std::complex lv_64sc_t; -typedef std::complex lv_32fc_t; -typedef std::complex lv_64fc_t; +typedef std::complex lv_32fc_t; +typedef std::complex lv_64fc_t; -template inline std::complex lv_cmake(const T &r, const T &i){ +template +inline std::complex lv_cmake(const T& r, const T& i) +{ return std::complex(r, i); } -template inline typename T::value_type lv_creal(const T &x){ +template +inline typename T::value_type lv_creal(const T& x) +{ return x.real(); } -template inline typename T::value_type lv_cimag(const T &x){ +template +inline typename T::value_type lv_cimag(const T& x) +{ return x.imag(); } -template inline T lv_conj(const T &x){ +template +inline T lv_conj(const T& x) +{ return std::conj(x); } #else /* __cplusplus */ #if __STDC_VERSION__ >= 199901L /* C99 check */ -/* this allows us to conj in lv_conj without the double detour for single-precision floats */ +/* this allows us to conj in lv_conj without the double detour for single-precision floats + */ #include #endif /* C99 check */ #include -typedef char complex lv_8sc_t; -typedef short complex lv_16sc_t; -typedef long complex lv_32sc_t; -typedef long long complex lv_64sc_t; -typedef float complex lv_32fc_t; -typedef double complex lv_64fc_t; +typedef char complex lv_8sc_t; +typedef short complex lv_16sc_t; +typedef long complex lv_32sc_t; +typedef long long complex lv_64sc_t; +typedef float complex lv_32fc_t; +typedef double complex lv_64fc_t; -#define lv_cmake(r, i) ((r) + _Complex_I*(i)) +#define lv_cmake(r, i) ((r) + _Complex_I * (i)) // When GNUC is available, use the complex extensions. // The extensions always return the correct value type. diff --git a/include/volk/volk_malloc.h b/include/volk/volk_malloc.h index 3477b27..42ca2b0 100644 --- a/include/volk/volk_malloc.h +++ b/include/volk/volk_malloc.h @@ -23,8 +23,8 @@ #ifndef INCLUDED_VOLK_MALLOC_H #define INCLUDED_VOLK_MALLOC_H -#include #include +#include __VOLK_DECL_BEGIN @@ -40,7 +40,8 @@ __VOLK_DECL_BEGIN * For Apple Clang, we fall back to `posix_memalign`. * see: https://linux.die.net/man/3/aligned_alloc * For MSVC, we fall back to `_aligned_malloc`. - * see: https://docs.microsoft.com/en-us/cpp/c-runtime-library/reference/aligned-malloc?view=vs-2019 + * see: + * https://docs.microsoft.com/en-us/cpp/c-runtime-library/reference/aligned-malloc?view=vs-2019 * * Because of the ways in which volk_malloc may allocate memory, it is * important to always free volk_malloc pointers using volk_free. @@ -51,7 +52,7 @@ __VOLK_DECL_BEGIN * \param alignment The byte alignment of the allocated memory. * \return pointer to aligned memory. */ -VOLK_API void *volk_malloc(size_t size, size_t alignment); +VOLK_API void* volk_malloc(size_t size, size_t alignment); /*! * \brief Free's memory allocated by volk_malloc. @@ -62,11 +63,12 @@ VOLK_API void *volk_malloc(size_t size, size_t alignment); * Thus, in this case `volk_free` inherits the same behavior `free` exhibits. 
* see: https://en.cppreference.com/w/c/memory/free * In case `_aligned_malloc` was used, we call `_aligned_free`. - * see: https://docs.microsoft.com/en-us/cpp/c-runtime-library/reference/aligned-free?view=vs-2019 + * see: + * https://docs.microsoft.com/en-us/cpp/c-runtime-library/reference/aligned-free?view=vs-2019 * * \param aptr The aligned pointer allocated by volk_malloc. */ -VOLK_API void volk_free(void *aptr); +VOLK_API void volk_free(void* aptr); __VOLK_DECL_END diff --git a/include/volk/volk_neon_intrinsics.h b/include/volk/volk_neon_intrinsics.h index 90e7b54..302bd30 100644 --- a/include/volk/volk_neon_intrinsics.h +++ b/include/volk/volk_neon_intrinsics.h @@ -67,9 +67,9 @@ 3. This notice may not be removed or altered from any source distribution. (this is the zlib license) - + _vsincosq_f32 - + */ /* @@ -83,13 +83,12 @@ /* Magnitude squared for float32x4x2_t */ -static inline float32x4_t -_vmagnitudesquaredq_f32(float32x4x2_t cmplxValue) +static inline float32x4_t _vmagnitudesquaredq_f32(float32x4x2_t cmplxValue) { float32x4_t iValue, qValue, result; iValue = vmulq_f32(cmplxValue.val[0], cmplxValue.val[0]); // Square the values qValue = vmulq_f32(cmplxValue.val[1], cmplxValue.val[1]); // Square the values - result = vaddq_f32(iValue, qValue); // Add the I2 and Q2 values + result = vaddq_f32(iValue, qValue); // Add the I2 and Q2 values return result; } @@ -97,9 +96,11 @@ _vmagnitudesquaredq_f32(float32x4x2_t cmplxValue) static inline float32x4_t _vinvsqrtq_f32(float32x4_t x) { float32x4_t sqrt_reciprocal = vrsqrteq_f32(x); - sqrt_reciprocal = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal); - sqrt_reciprocal = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal); - + sqrt_reciprocal = vmulq_f32( + vrsqrtsq_f32(vmulq_f32(x, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal); + sqrt_reciprocal = vmulq_f32( + vrsqrtsq_f32(vmulq_f32(x, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal); + return sqrt_reciprocal; } @@ -108,19 +109,19 @@ static inline float32x4_t _vinvq_f32(float32x4_t x) { // Newton's method float32x4_t recip = vrecpeq_f32(x); - recip = vmulq_f32(vrecpsq_f32(x, recip), recip); - recip = vmulq_f32(vrecpsq_f32(x, recip), recip); + recip = vmulq_f32(vrecpsq_f32(x, recip), recip); + recip = vmulq_f32(vrecpsq_f32(x, recip), recip); return recip; } /* Complex multiplication for float32x4x2_t */ -static inline float32x4x2_t -_vmultiply_complexq_f32(float32x4x2_t a_val, float32x4x2_t b_val) +static inline float32x4x2_t _vmultiply_complexq_f32(float32x4x2_t a_val, + float32x4x2_t b_val) { float32x4x2_t tmp_real; float32x4x2_t tmp_imag; float32x4x2_t c_val; - + // multiply the real*real and imag*imag to get real result // a0r*b0r|a1r*b1r|a2r*b2r|a3r*b3r tmp_real.val[0] = vmulq_f32(a_val.val[0], b_val.val[0]); @@ -140,12 +141,12 @@ _vmultiply_complexq_f32(float32x4x2_t a_val, float32x4x2_t b_val) /* From ARM Compute Library, MIT license */ static inline float32x4_t _vtaylor_polyq_f32(float32x4_t x, const float32x4_t coeffs[8]) { - float32x4_t cA = vmlaq_f32(coeffs[0], coeffs[4], x); - float32x4_t cB = vmlaq_f32(coeffs[2], coeffs[6], x); - float32x4_t cC = vmlaq_f32(coeffs[1], coeffs[5], x); - float32x4_t cD = vmlaq_f32(coeffs[3], coeffs[7], x); - float32x4_t x2 = vmulq_f32(x, x); - float32x4_t x4 = vmulq_f32(x2, x2); + float32x4_t cA = vmlaq_f32(coeffs[0], coeffs[4], x); + float32x4_t cB = vmlaq_f32(coeffs[2], coeffs[6], x); + float32x4_t cC = vmlaq_f32(coeffs[1], coeffs[5], x); + float32x4_t cD = 
vmlaq_f32(coeffs[3], coeffs[7], x); + float32x4_t x2 = vmulq_f32(x, x); + float32x4_t x4 = vmulq_f32(x2, x2); float32x4_t res = vmlaq_f32(vmlaq_f32(cA, cB, x2), vmlaq_f32(cC, cD, x2), x4); return res; } @@ -155,121 +156,123 @@ static inline float32x4_t _vtaylor_polyq_f32(float32x4_t x, const float32x4_t co static inline float32x4_t _vlogq_f32(float32x4_t x) { const float32x4_t log_tab[8] = { - vdupq_n_f32(-2.29561495781f), - vdupq_n_f32(-2.47071170807f), - vdupq_n_f32(-5.68692588806f), - vdupq_n_f32(-0.165253549814f), - vdupq_n_f32(5.17591238022f), - vdupq_n_f32(0.844007015228f), - vdupq_n_f32(4.58445882797f), - vdupq_n_f32(0.0141278216615f), + vdupq_n_f32(-2.29561495781f), vdupq_n_f32(-2.47071170807f), + vdupq_n_f32(-5.68692588806f), vdupq_n_f32(-0.165253549814f), + vdupq_n_f32(5.17591238022f), vdupq_n_f32(0.844007015228f), + vdupq_n_f32(4.58445882797f), vdupq_n_f32(0.0141278216615f), }; - - const int32x4_t CONST_127 = vdupq_n_s32(127); // 127 + + const int32x4_t CONST_127 = vdupq_n_s32(127); // 127 const float32x4_t CONST_LN2 = vdupq_n_f32(0.6931471805f); // ln(2) - + // Extract exponent - int32x4_t m = vsubq_s32(vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_f32(x), 23)), CONST_127); - float32x4_t val = vreinterpretq_f32_s32(vsubq_s32(vreinterpretq_s32_f32(x), vshlq_n_s32(m, 23))); - + int32x4_t m = vsubq_s32( + vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_f32(x), 23)), CONST_127); + float32x4_t val = + vreinterpretq_f32_s32(vsubq_s32(vreinterpretq_s32_f32(x), vshlq_n_s32(m, 23))); + // Polynomial Approximation float32x4_t poly = _vtaylor_polyq_f32(val, log_tab); - + // Reconstruct poly = vmlaq_f32(poly, vcvtq_f32_s32(m), CONST_LN2); - + return poly; } /* Evaluation of 4 sines & cosines at once. * Optimized from here (zlib license) * http://gruntthepeon.free.fr/ssemath/ */ -static inline float32x4x2_t _vsincosq_f32(float32x4_t x) { +static inline float32x4x2_t _vsincosq_f32(float32x4_t x) +{ const float32x4_t c_minus_cephes_DP1 = vdupq_n_f32(-0.78515625); const float32x4_t c_minus_cephes_DP2 = vdupq_n_f32(-2.4187564849853515625e-4); const float32x4_t c_minus_cephes_DP3 = vdupq_n_f32(-3.77489497744594108e-8); const float32x4_t c_sincof_p0 = vdupq_n_f32(-1.9515295891e-4); - const float32x4_t c_sincof_p1 = vdupq_n_f32(8.3321608736e-3); + const float32x4_t c_sincof_p1 = vdupq_n_f32(8.3321608736e-3); const float32x4_t c_sincof_p2 = vdupq_n_f32(-1.6666654611e-1); const float32x4_t c_coscof_p0 = vdupq_n_f32(2.443315711809948e-005); const float32x4_t c_coscof_p1 = vdupq_n_f32(-1.388731625493765e-003); const float32x4_t c_coscof_p2 = vdupq_n_f32(4.166664568298827e-002); const float32x4_t c_cephes_FOPI = vdupq_n_f32(1.27323954473516); // 4 / M_PI - + const float32x4_t CONST_1 = vdupq_n_f32(1.f); const float32x4_t CONST_1_2 = vdupq_n_f32(0.5f); const float32x4_t CONST_0 = vdupq_n_f32(0.f); - const uint32x4_t CONST_2 = vdupq_n_u32(2); - const uint32x4_t CONST_4 = vdupq_n_u32(4); - + const uint32x4_t CONST_2 = vdupq_n_u32(2); + const uint32x4_t CONST_4 = vdupq_n_u32(4); + uint32x4_t emm2; - + uint32x4_t sign_mask_sin, sign_mask_cos; sign_mask_sin = vcltq_f32(x, CONST_0); x = vabsq_f32(x); // scale by 4/pi float32x4_t y = vmulq_f32(x, c_cephes_FOPI); - + // store the integer part of y in mm0 emm2 = vcvtq_u32_f32(y); /* j=(j+1) & (~1) (see the cephes sources) */ emm2 = vaddq_u32(emm2, vdupq_n_u32(1)); emm2 = vandq_u32(emm2, vdupq_n_u32(~1)); y = vcvtq_f32_u32(emm2); - + /* get the polynom selection mask there is one polynom for 0 <= x <= Pi/4 and another one for Pi/4 #include #include 
+#include __VOLK_DECL_BEGIN -typedef struct volk_arch_pref -{ - char name[128]; //name of the kernel - char impl_a[128]; //best aligned impl - char impl_u[128]; //best unaligned impl +typedef struct volk_arch_pref { + char name[128]; // name of the kernel + char impl_a[128]; // best aligned impl + char impl_u[128]; // best unaligned impl } volk_arch_pref_t; //////////////////////////////////////////////////////////////////////// @@ -19,13 +18,13 @@ typedef struct volk_arch_pref // if config file should be tested on existence for reading. // returns \0 in the argument on failure. //////////////////////////////////////////////////////////////////////// -VOLK_API void volk_get_config_path(char *, bool); +VOLK_API void volk_get_config_path(char*, bool); //////////////////////////////////////////////////////////////////////// // load prefs into global prefs struct //////////////////////////////////////////////////////////////////////// -VOLK_API size_t volk_load_preferences(volk_arch_pref_t **); +VOLK_API size_t volk_load_preferences(volk_arch_pref_t**); __VOLK_DECL_END -#endif //INCLUDED_VOLK_PREFS_H +#endif // INCLUDED_VOLK_PREFS_H diff --git a/include/volk/volk_sse3_intrinsics.h b/include/volk/volk_sse3_intrinsics.h index 6b53a2a..6bdc8d8 100644 --- a/include/volk/volk_sse3_intrinsics.h +++ b/include/volk/volk_sse3_intrinsics.h @@ -1,19 +1,19 @@ /* -*- c++ -*- */ -/* +/* * Copyright 2015 Free Software Foundation, Inc. - * + * * This file is part of GNU Radio - * + * * GNU Radio is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 3, or (at your option) * any later version. - * + * * GNU Radio is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. - * + * * You should have received a copy of the GNU General Public License * along with GNU Radio; see the file COPYING. 
If not, write to * the Free Software Foundation, Inc., 51 Franklin Street, @@ -29,49 +29,52 @@ #define INCLUDE_VOLK_VOLK_SSE3_INTRINSICS_H_ #include -static inline __m128 -_mm_complexmul_ps(__m128 x, __m128 y) +static inline __m128 _mm_complexmul_ps(__m128 x, __m128 y) { - __m128 yl, yh, tmp1, tmp2; - yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr - yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di - tmp1 = _mm_mul_ps(x, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - x = _mm_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br - tmp2 = _mm_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - return _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + __m128 yl, yh, tmp1, tmp2; + yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di + tmp1 = _mm_mul_ps(x, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + x = _mm_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br + tmp2 = _mm_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + return _mm_addsub_ps(tmp1, + tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di } -static inline __m128 -_mm_complexconjugatemul_ps(__m128 x, __m128 y) +static inline __m128 _mm_complexconjugatemul_ps(__m128 x, __m128 y) { - const __m128 conjugator = _mm_setr_ps(0, -0.f, 0, -0.f); - y = _mm_xor_ps(y, conjugator); // conjugate y - return _mm_complexmul_ps(x, y); + const __m128 conjugator = _mm_setr_ps(0, -0.f, 0, -0.f); + y = _mm_xor_ps(y, conjugator); // conjugate y + return _mm_complexmul_ps(x, y); } -static inline __m128 -_mm_magnitudesquared_ps_sse3(__m128 cplxValue1, __m128 cplxValue2){ - cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values - cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values - return _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values +static inline __m128 _mm_magnitudesquared_ps_sse3(__m128 cplxValue1, __m128 cplxValue2) +{ + cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values + cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values + return _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values } -static inline __m128 -_mm_magnitude_ps_sse3(__m128 cplxValue1, __m128 cplxValue2){ - return _mm_sqrt_ps(_mm_magnitudesquared_ps_sse3(cplxValue1, cplxValue2)); +static inline __m128 _mm_magnitude_ps_sse3(__m128 cplxValue1, __m128 cplxValue2) +{ + return _mm_sqrt_ps(_mm_magnitudesquared_ps_sse3(cplxValue1, cplxValue2)); } -static inline __m128 -_mm_scaled_norm_dist_ps_sse3(const __m128 symbols0, const __m128 symbols1, const __m128 points0, const __m128 points1, const __m128 scalar){ - /* - * Calculate: |y - x|^2 * SNR_lin - * Consider 'symbolsX' and 'pointsX' to be complex float - * 'symbolsX' are 'y' and 'pointsX' are 'x' - */ - const __m128 diff0 = _mm_sub_ps(symbols0, points0); - const __m128 diff1 = _mm_sub_ps(symbols1, points1); - const __m128 norms = _mm_magnitudesquared_ps_sse3(diff0, diff1); - return _mm_mul_ps(norms, scalar); +static inline __m128 _mm_scaled_norm_dist_ps_sse3(const __m128 symbols0, + const __m128 symbols1, + const __m128 points0, + const __m128 points1, + const __m128 scalar) +{ + /* + * Calculate: |y - x|^2 * SNR_lin + * Consider 'symbolsX' and 'pointsX' to be complex float + * 'symbolsX' are 'y' and 'pointsX' are 'x' + */ + const __m128 diff0 = _mm_sub_ps(symbols0, points0); + const __m128 diff1 = _mm_sub_ps(symbols1, points1); + const __m128 norms = _mm_magnitudesquared_ps_sse3(diff0, diff1); + return _mm_mul_ps(norms, scalar); } #endif /* 
INCLUDE_VOLK_VOLK_SSE3_INTRINSICS_H_ */ diff --git a/include/volk/volk_sse_intrinsics.h b/include/volk/volk_sse_intrinsics.h index 57318e2..24fe7c1 100644 --- a/include/volk/volk_sse_intrinsics.h +++ b/include/volk/volk_sse_intrinsics.h @@ -1,19 +1,19 @@ /* -*- c++ -*- */ -/* +/* * Copyright 2015 Free Software Foundation, Inc. - * + * * This file is part of GNU Radio - * + * * GNU Radio is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 3, or (at your option) * any later version. - * + * * GNU Radio is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. - * + * * You should have received a copy of the GNU General Public License * along with GNU Radio; see the file COPYING. If not, write to * the Free Software Foundation, Inc., 51 Franklin Street, @@ -29,31 +29,34 @@ #define INCLUDE_VOLK_VOLK_SSE_INTRINSICS_H_ #include -static inline __m128 -_mm_magnitudesquared_ps(__m128 cplxValue1, __m128 cplxValue2){ - __m128 iValue, qValue; - // Arrange in i1i2i3i4 format - iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); - // Arrange in q1q2q3q4 format - qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1)); - iValue = _mm_mul_ps(iValue, iValue); // Square the I values - qValue = _mm_mul_ps(qValue, qValue); // Square the Q Values - return _mm_add_ps(iValue, qValue); // Add the I2 and Q2 values +static inline __m128 _mm_magnitudesquared_ps(__m128 cplxValue1, __m128 cplxValue2) +{ + __m128 iValue, qValue; + // Arrange in i1i2i3i4 format + iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2, 0, 2, 0)); + // Arrange in q1q2q3q4 format + qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3, 1, 3, 1)); + iValue = _mm_mul_ps(iValue, iValue); // Square the I values + qValue = _mm_mul_ps(qValue, qValue); // Square the Q Values + return _mm_add_ps(iValue, qValue); // Add the I2 and Q2 values } -static inline __m128 -_mm_magnitude_ps(__m128 cplxValue1, __m128 cplxValue2){ - return _mm_sqrt_ps(_mm_magnitudesquared_ps(cplxValue1, cplxValue2)); +static inline __m128 _mm_magnitude_ps(__m128 cplxValue1, __m128 cplxValue2) +{ + return _mm_sqrt_ps(_mm_magnitudesquared_ps(cplxValue1, cplxValue2)); } -static inline __m128 -_mm_scaled_norm_dist_ps_sse(const __m128 symbols0, const __m128 symbols1, const __m128 points0, const __m128 points1, const __m128 scalar) +static inline __m128 _mm_scaled_norm_dist_ps_sse(const __m128 symbols0, + const __m128 symbols1, + const __m128 points0, + const __m128 points1, + const __m128 scalar) { - // calculate scalar * |x - y|^2 - const __m128 diff0 = _mm_sub_ps(symbols0, points0); - const __m128 diff1 = _mm_sub_ps(symbols1, points1); - const __m128 norms = _mm_magnitudesquared_ps(diff0, diff1); - return _mm_mul_ps(norms, scalar); + // calculate scalar * |x - y|^2 + const __m128 diff0 = _mm_sub_ps(symbols0, points0); + const __m128 diff1 = _mm_sub_ps(symbols1, points1); + const __m128 norms = _mm_magnitudesquared_ps(diff0, diff1); + return _mm_mul_ps(norms, scalar); } #endif /* INCLUDE_VOLK_VOLK_SSE_INTRINSICS_H_ */ diff --git a/kernels/volk/volk_16i_32fc_dot_prod_32fc.h b/kernels/volk/volk_16i_32fc_dot_prod_32fc.h index f250340..2635649 100644 --- a/kernels/volk/volk_16i_32fc_dot_prod_32fc.h +++ b/kernels/volk/volk_16i_32fc_dot_prod_32fc.h @@ 
-33,8 +33,8 @@ * * Dispatcher Prototype * \code - * void volk_16i_32fc_dot_prod_32fc(lv_32fc_t* result, const short* input, const lv_32fc_t * taps, unsigned int num_points) - * \endcode + * void volk_16i_32fc_dot_prod_32fc(lv_32fc_t* result, const short* input, const lv_32fc_t + * * taps, unsigned int num_points) \endcode * * \b Inputs * \li input: vector of shorts. @@ -58,165 +58,178 @@ #ifndef INCLUDED_volk_16i_32fc_dot_prod_32fc_H #define INCLUDED_volk_16i_32fc_dot_prod_32fc_H -#include #include +#include #ifdef LV_HAVE_GENERIC -static inline void volk_16i_32fc_dot_prod_32fc_generic(lv_32fc_t* result, const short* input, const lv_32fc_t * taps, unsigned int num_points) { +static inline void volk_16i_32fc_dot_prod_32fc_generic(lv_32fc_t* result, + const short* input, + const lv_32fc_t* taps, + unsigned int num_points) +{ - static const int N_UNROLL = 4; + static const int N_UNROLL = 4; - lv_32fc_t acc0 = 0; - lv_32fc_t acc1 = 0; - lv_32fc_t acc2 = 0; - lv_32fc_t acc3 = 0; + lv_32fc_t acc0 = 0; + lv_32fc_t acc1 = 0; + lv_32fc_t acc2 = 0; + lv_32fc_t acc3 = 0; - unsigned i = 0; - unsigned n = (num_points / N_UNROLL) * N_UNROLL; + unsigned i = 0; + unsigned n = (num_points / N_UNROLL) * N_UNROLL; - for(i = 0; i < n; i += N_UNROLL) { - acc0 += taps[i + 0] * (float)input[i + 0]; - acc1 += taps[i + 1] * (float)input[i + 1]; - acc2 += taps[i + 2] * (float)input[i + 2]; - acc3 += taps[i + 3] * (float)input[i + 3]; - } + for (i = 0; i < n; i += N_UNROLL) { + acc0 += taps[i + 0] * (float)input[i + 0]; + acc1 += taps[i + 1] * (float)input[i + 1]; + acc2 += taps[i + 2] * (float)input[i + 2]; + acc3 += taps[i + 3] * (float)input[i + 3]; + } - for(; i < num_points; i++) { - acc0 += taps[i] * (float)input[i]; - } + for (; i < num_points; i++) { + acc0 += taps[i] * (float)input[i]; + } - *result = acc0 + acc1 + acc2 + acc3; + *result = acc0 + acc1 + acc2 + acc3; } #endif /*LV_HAVE_GENERIC*/ #ifdef LV_HAVE_NEON #include -static inline void volk_16i_32fc_dot_prod_32fc_neon(lv_32fc_t* result, const short* input, const lv_32fc_t * taps, unsigned int num_points) { - - unsigned ii; - unsigned quarter_points = num_points / 4; - lv_32fc_t* tapsPtr = (lv_32fc_t*) taps; - short* inputPtr = (short*) input; - lv_32fc_t accumulator_vec[4]; - - float32x4x2_t tapsVal, accumulator_val; - int16x4_t input16; - int32x4_t input32; - float32x4_t input_float, prod_re, prod_im; - - accumulator_val.val[0] = vdupq_n_f32(0.0); - accumulator_val.val[1] = vdupq_n_f32(0.0); - - for(ii = 0; ii < quarter_points; ++ii) { - tapsVal = vld2q_f32((float*)tapsPtr); - input16 = vld1_s16(inputPtr); - // widen 16-bit int to 32-bit int - input32 = vmovl_s16(input16); - // convert 32-bit int to float with scale - input_float = vcvtq_f32_s32(input32); - - prod_re = vmulq_f32(input_float, tapsVal.val[0]); - prod_im = vmulq_f32(input_float, tapsVal.val[1]); - - accumulator_val.val[0] = vaddq_f32(prod_re, accumulator_val.val[0]); - accumulator_val.val[1] = vaddq_f32(prod_im, accumulator_val.val[1]); - - tapsPtr += 4; - inputPtr += 4; - } - vst2q_f32((float*)accumulator_vec, accumulator_val); - accumulator_vec[0] += accumulator_vec[1]; - accumulator_vec[2] += accumulator_vec[3]; - accumulator_vec[0] += accumulator_vec[2]; - - for(ii = quarter_points * 4; ii < num_points; ++ii) { - accumulator_vec[0] += *(tapsPtr++) * (float)(*(inputPtr++)); - } - - *result = accumulator_vec[0]; +static inline void volk_16i_32fc_dot_prod_32fc_neon(lv_32fc_t* result, + const short* input, + const lv_32fc_t* taps, + unsigned int num_points) +{ + + unsigned ii; + 
unsigned quarter_points = num_points / 4; + lv_32fc_t* tapsPtr = (lv_32fc_t*)taps; + short* inputPtr = (short*)input; + lv_32fc_t accumulator_vec[4]; + + float32x4x2_t tapsVal, accumulator_val; + int16x4_t input16; + int32x4_t input32; + float32x4_t input_float, prod_re, prod_im; + + accumulator_val.val[0] = vdupq_n_f32(0.0); + accumulator_val.val[1] = vdupq_n_f32(0.0); + + for (ii = 0; ii < quarter_points; ++ii) { + tapsVal = vld2q_f32((float*)tapsPtr); + input16 = vld1_s16(inputPtr); + // widen 16-bit int to 32-bit int + input32 = vmovl_s16(input16); + // convert 32-bit int to float with scale + input_float = vcvtq_f32_s32(input32); + + prod_re = vmulq_f32(input_float, tapsVal.val[0]); + prod_im = vmulq_f32(input_float, tapsVal.val[1]); + + accumulator_val.val[0] = vaddq_f32(prod_re, accumulator_val.val[0]); + accumulator_val.val[1] = vaddq_f32(prod_im, accumulator_val.val[1]); + + tapsPtr += 4; + inputPtr += 4; + } + vst2q_f32((float*)accumulator_vec, accumulator_val); + accumulator_vec[0] += accumulator_vec[1]; + accumulator_vec[2] += accumulator_vec[3]; + accumulator_vec[0] += accumulator_vec[2]; + + for (ii = quarter_points * 4; ii < num_points; ++ii) { + accumulator_vec[0] += *(tapsPtr++) * (float)(*(inputPtr++)); + } + + *result = accumulator_vec[0]; } #endif /*LV_HAVE_NEON*/ #if LV_HAVE_SSE && LV_HAVE_MMX -static inline void volk_16i_32fc_dot_prod_32fc_u_sse( lv_32fc_t* result, const short* input, const lv_32fc_t* taps, unsigned int num_points) { - - unsigned int number = 0; - const unsigned int sixteenthPoints = num_points / 8; - - float res[2]; - float *realpt = &res[0], *imagpt = &res[1]; - const short* aPtr = input; - const float* bPtr = (float*)taps; - - __m64 m0, m1; - __m128 f0, f1, f2, f3; - __m128 a0Val, a1Val, a2Val, a3Val; - __m128 b0Val, b1Val, b2Val, b3Val; - __m128 c0Val, c1Val, c2Val, c3Val; - - __m128 dotProdVal0 = _mm_setzero_ps(); - __m128 dotProdVal1 = _mm_setzero_ps(); - __m128 dotProdVal2 = _mm_setzero_ps(); - __m128 dotProdVal3 = _mm_setzero_ps(); - - for(;number < sixteenthPoints; number++){ - - m0 = _mm_set_pi16(*(aPtr+3), *(aPtr+2), *(aPtr+1), *(aPtr+0)); - m1 = _mm_set_pi16(*(aPtr+7), *(aPtr+6), *(aPtr+5), *(aPtr+4)); - f0 = _mm_cvtpi16_ps(m0); - f1 = _mm_cvtpi16_ps(m0); - f2 = _mm_cvtpi16_ps(m1); - f3 = _mm_cvtpi16_ps(m1); - - a0Val = _mm_unpacklo_ps(f0, f1); - a1Val = _mm_unpackhi_ps(f0, f1); - a2Val = _mm_unpacklo_ps(f2, f3); - a3Val = _mm_unpackhi_ps(f2, f3); - - b0Val = _mm_loadu_ps(bPtr); - b1Val = _mm_loadu_ps(bPtr+4); - b2Val = _mm_loadu_ps(bPtr+8); - b3Val = _mm_loadu_ps(bPtr+12); - - c0Val = _mm_mul_ps(a0Val, b0Val); - c1Val = _mm_mul_ps(a1Val, b1Val); - c2Val = _mm_mul_ps(a2Val, b2Val); - c3Val = _mm_mul_ps(a3Val, b3Val); - - dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0); - dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1); - dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2); - dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3); - - aPtr += 8; - bPtr += 16; - } - - dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1); - dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2); - dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3); - - __VOLK_ATTR_ALIGNED(16) float dotProductVector[4]; - - _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector - - *realpt = dotProductVector[0]; - *imagpt = dotProductVector[1]; - *realpt += dotProductVector[2]; - *imagpt += dotProductVector[3]; - - number = sixteenthPoints*8; - for(;number < num_points; number++){ - *realpt += ((*aPtr) * (*bPtr++)); - *imagpt += ((*aPtr++) * (*bPtr++)); - } - - 
*result = *(lv_32fc_t*)(&res[0]); +static inline void volk_16i_32fc_dot_prod_32fc_u_sse(lv_32fc_t* result, + const short* input, + const lv_32fc_t* taps, + unsigned int num_points) +{ + + unsigned int number = 0; + const unsigned int sixteenthPoints = num_points / 8; + + float res[2]; + float *realpt = &res[0], *imagpt = &res[1]; + const short* aPtr = input; + const float* bPtr = (float*)taps; + + __m64 m0, m1; + __m128 f0, f1, f2, f3; + __m128 a0Val, a1Val, a2Val, a3Val; + __m128 b0Val, b1Val, b2Val, b3Val; + __m128 c0Val, c1Val, c2Val, c3Val; + + __m128 dotProdVal0 = _mm_setzero_ps(); + __m128 dotProdVal1 = _mm_setzero_ps(); + __m128 dotProdVal2 = _mm_setzero_ps(); + __m128 dotProdVal3 = _mm_setzero_ps(); + + for (; number < sixteenthPoints; number++) { + + m0 = _mm_set_pi16(*(aPtr + 3), *(aPtr + 2), *(aPtr + 1), *(aPtr + 0)); + m1 = _mm_set_pi16(*(aPtr + 7), *(aPtr + 6), *(aPtr + 5), *(aPtr + 4)); + f0 = _mm_cvtpi16_ps(m0); + f1 = _mm_cvtpi16_ps(m0); + f2 = _mm_cvtpi16_ps(m1); + f3 = _mm_cvtpi16_ps(m1); + + a0Val = _mm_unpacklo_ps(f0, f1); + a1Val = _mm_unpackhi_ps(f0, f1); + a2Val = _mm_unpacklo_ps(f2, f3); + a3Val = _mm_unpackhi_ps(f2, f3); + + b0Val = _mm_loadu_ps(bPtr); + b1Val = _mm_loadu_ps(bPtr + 4); + b2Val = _mm_loadu_ps(bPtr + 8); + b3Val = _mm_loadu_ps(bPtr + 12); + + c0Val = _mm_mul_ps(a0Val, b0Val); + c1Val = _mm_mul_ps(a1Val, b1Val); + c2Val = _mm_mul_ps(a2Val, b2Val); + c3Val = _mm_mul_ps(a3Val, b3Val); + + dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0); + dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1); + dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2); + dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3); + + aPtr += 8; + bPtr += 16; + } + + dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1); + dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2); + dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3); + + __VOLK_ATTR_ALIGNED(16) float dotProductVector[4]; + + _mm_store_ps(dotProductVector, + dotProdVal0); // Store the results back into the dot product vector + + *realpt = dotProductVector[0]; + *imagpt = dotProductVector[1]; + *realpt += dotProductVector[2]; + *imagpt += dotProductVector[3]; + + number = sixteenthPoints * 8; + for (; number < num_points; number++) { + *realpt += ((*aPtr) * (*bPtr++)); + *imagpt += ((*aPtr++) * (*bPtr++)); + } + + *result = *(lv_32fc_t*)(&res[0]); } #endif /*LV_HAVE_SSE && LV_HAVE_MMX*/ @@ -224,85 +237,90 @@ static inline void volk_16i_32fc_dot_prod_32fc_u_sse( lv_32fc_t* result, const #if LV_HAVE_AVX2 && LV_HAVE_FMA -static inline void volk_16i_32fc_dot_prod_32fc_u_avx2_fma( lv_32fc_t* result, const short* input, const lv_32fc_t* taps, unsigned int num_points) { - - unsigned int number = 0; - const unsigned int sixteenthPoints = num_points / 16; - - float res[2]; - float *realpt = &res[0], *imagpt = &res[1]; - const short* aPtr = input; - const float* bPtr = (float*)taps; - - __m128i m0, m1; - __m256i f0, f1; - __m256 g0, g1, h0, h1, h2, h3; - __m256 a0Val, a1Val, a2Val, a3Val; - __m256 b0Val, b1Val, b2Val, b3Val; - - __m256 dotProdVal0 = _mm256_setzero_ps(); - __m256 dotProdVal1 = _mm256_setzero_ps(); - __m256 dotProdVal2 = _mm256_setzero_ps(); - __m256 dotProdVal3 = _mm256_setzero_ps(); - - for(;number < sixteenthPoints; number++){ - - m0 = _mm_loadu_si128((__m128i const*) aPtr); - m1 = _mm_loadu_si128((__m128i const*)(aPtr+8)); - - f0 = _mm256_cvtepi16_epi32(m0); - g0 = _mm256_cvtepi32_ps(f0); - f1 = _mm256_cvtepi16_epi32(m1); - g1 = _mm256_cvtepi32_ps(f1); - - h0 = _mm256_unpacklo_ps(g0, g0); - h1 = _mm256_unpackhi_ps(g0, g0); - h2 = 
_mm256_unpacklo_ps(g1, g1); - h3 = _mm256_unpackhi_ps(g1, g1); - - a0Val = _mm256_permute2f128_ps(h0, h1, 0x20); - a1Val = _mm256_permute2f128_ps(h0, h1, 0x31); - a2Val = _mm256_permute2f128_ps(h2, h3, 0x20); - a3Val = _mm256_permute2f128_ps(h2, h3, 0x31); - - b0Val = _mm256_loadu_ps(bPtr); - b1Val = _mm256_loadu_ps(bPtr+8); - b2Val = _mm256_loadu_ps(bPtr+16); - b3Val = _mm256_loadu_ps(bPtr+24); - - dotProdVal0 = _mm256_fmadd_ps(a0Val,b0Val,dotProdVal0); - dotProdVal1 = _mm256_fmadd_ps(a1Val,b1Val,dotProdVal1); - dotProdVal2 = _mm256_fmadd_ps(a2Val,b2Val,dotProdVal2); - dotProdVal3 = _mm256_fmadd_ps(a3Val,b3Val,dotProdVal3); - - aPtr += 16; - bPtr += 32; - } - - dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1); - dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2); - dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3); - - __VOLK_ATTR_ALIGNED(32) float dotProductVector[8]; - - _mm256_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector - - *realpt = dotProductVector[0]; - *imagpt = dotProductVector[1]; - *realpt += dotProductVector[2]; - *imagpt += dotProductVector[3]; - *realpt += dotProductVector[4]; - *imagpt += dotProductVector[5]; - *realpt += dotProductVector[6]; - *imagpt += dotProductVector[7]; - - number = sixteenthPoints*16; - for(;number < num_points; number++){ - *realpt += ((*aPtr) * (*bPtr++)); - *imagpt += ((*aPtr++) * (*bPtr++)); - } - - *result = *(lv_32fc_t*)(&res[0]); +static inline void volk_16i_32fc_dot_prod_32fc_u_avx2_fma(lv_32fc_t* result, + const short* input, + const lv_32fc_t* taps, + unsigned int num_points) +{ + + unsigned int number = 0; + const unsigned int sixteenthPoints = num_points / 16; + + float res[2]; + float *realpt = &res[0], *imagpt = &res[1]; + const short* aPtr = input; + const float* bPtr = (float*)taps; + + __m128i m0, m1; + __m256i f0, f1; + __m256 g0, g1, h0, h1, h2, h3; + __m256 a0Val, a1Val, a2Val, a3Val; + __m256 b0Val, b1Val, b2Val, b3Val; + + __m256 dotProdVal0 = _mm256_setzero_ps(); + __m256 dotProdVal1 = _mm256_setzero_ps(); + __m256 dotProdVal2 = _mm256_setzero_ps(); + __m256 dotProdVal3 = _mm256_setzero_ps(); + + for (; number < sixteenthPoints; number++) { + + m0 = _mm_loadu_si128((__m128i const*)aPtr); + m1 = _mm_loadu_si128((__m128i const*)(aPtr + 8)); + + f0 = _mm256_cvtepi16_epi32(m0); + g0 = _mm256_cvtepi32_ps(f0); + f1 = _mm256_cvtepi16_epi32(m1); + g1 = _mm256_cvtepi32_ps(f1); + + h0 = _mm256_unpacklo_ps(g0, g0); + h1 = _mm256_unpackhi_ps(g0, g0); + h2 = _mm256_unpacklo_ps(g1, g1); + h3 = _mm256_unpackhi_ps(g1, g1); + + a0Val = _mm256_permute2f128_ps(h0, h1, 0x20); + a1Val = _mm256_permute2f128_ps(h0, h1, 0x31); + a2Val = _mm256_permute2f128_ps(h2, h3, 0x20); + a3Val = _mm256_permute2f128_ps(h2, h3, 0x31); + + b0Val = _mm256_loadu_ps(bPtr); + b1Val = _mm256_loadu_ps(bPtr + 8); + b2Val = _mm256_loadu_ps(bPtr + 16); + b3Val = _mm256_loadu_ps(bPtr + 24); + + dotProdVal0 = _mm256_fmadd_ps(a0Val, b0Val, dotProdVal0); + dotProdVal1 = _mm256_fmadd_ps(a1Val, b1Val, dotProdVal1); + dotProdVal2 = _mm256_fmadd_ps(a2Val, b2Val, dotProdVal2); + dotProdVal3 = _mm256_fmadd_ps(a3Val, b3Val, dotProdVal3); + + aPtr += 16; + bPtr += 32; + } + + dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1); + dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2); + dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3); + + __VOLK_ATTR_ALIGNED(32) float dotProductVector[8]; + + _mm256_store_ps(dotProductVector, + dotProdVal0); // Store the results back into the dot product vector + + *realpt = 
dotProductVector[0]; + *imagpt = dotProductVector[1]; + *realpt += dotProductVector[2]; + *imagpt += dotProductVector[3]; + *realpt += dotProductVector[4]; + *imagpt += dotProductVector[5]; + *realpt += dotProductVector[6]; + *imagpt += dotProductVector[7]; + + number = sixteenthPoints * 16; + for (; number < num_points; number++) { + *realpt += ((*aPtr) * (*bPtr++)); + *imagpt += ((*aPtr++) * (*bPtr++)); + } + + *result = *(lv_32fc_t*)(&res[0]); } #endif /*LV_HAVE_AVX2 && lV_HAVE_FMA*/ @@ -310,91 +328,96 @@ static inline void volk_16i_32fc_dot_prod_32fc_u_avx2_fma( lv_32fc_t* result, co #ifdef LV_HAVE_AVX2 -static inline void volk_16i_32fc_dot_prod_32fc_u_avx2( lv_32fc_t* result, const short* input, const lv_32fc_t* taps, unsigned int num_points) { - - unsigned int number = 0; - const unsigned int sixteenthPoints = num_points / 16; - - float res[2]; - float *realpt = &res[0], *imagpt = &res[1]; - const short* aPtr = input; - const float* bPtr = (float*)taps; - - __m128i m0, m1; - __m256i f0, f1; - __m256 g0, g1, h0, h1, h2, h3; - __m256 a0Val, a1Val, a2Val, a3Val; - __m256 b0Val, b1Val, b2Val, b3Val; - __m256 c0Val, c1Val, c2Val, c3Val; - - __m256 dotProdVal0 = _mm256_setzero_ps(); - __m256 dotProdVal1 = _mm256_setzero_ps(); - __m256 dotProdVal2 = _mm256_setzero_ps(); - __m256 dotProdVal3 = _mm256_setzero_ps(); - - for(;number < sixteenthPoints; number++){ - - m0 = _mm_loadu_si128((__m128i const*) aPtr); - m1 = _mm_loadu_si128((__m128i const*)(aPtr+8)); - - f0 = _mm256_cvtepi16_epi32(m0); - g0 = _mm256_cvtepi32_ps(f0); - f1 = _mm256_cvtepi16_epi32(m1); - g1 = _mm256_cvtepi32_ps(f1); - - h0 = _mm256_unpacklo_ps(g0, g0); - h1 = _mm256_unpackhi_ps(g0, g0); - h2 = _mm256_unpacklo_ps(g1, g1); - h3 = _mm256_unpackhi_ps(g1, g1); - - a0Val = _mm256_permute2f128_ps(h0, h1, 0x20); - a1Val = _mm256_permute2f128_ps(h0, h1, 0x31); - a2Val = _mm256_permute2f128_ps(h2, h3, 0x20); - a3Val = _mm256_permute2f128_ps(h2, h3, 0x31); - - b0Val = _mm256_loadu_ps(bPtr); - b1Val = _mm256_loadu_ps(bPtr+8); - b2Val = _mm256_loadu_ps(bPtr+16); - b3Val = _mm256_loadu_ps(bPtr+24); - - c0Val = _mm256_mul_ps(a0Val, b0Val); - c1Val = _mm256_mul_ps(a1Val, b1Val); - c2Val = _mm256_mul_ps(a2Val, b2Val); - c3Val = _mm256_mul_ps(a3Val, b3Val); - - dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0); - dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1); - dotProdVal2 = _mm256_add_ps(c2Val, dotProdVal2); - dotProdVal3 = _mm256_add_ps(c3Val, dotProdVal3); - - aPtr += 16; - bPtr += 32; - } - - dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1); - dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2); - dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3); - - __VOLK_ATTR_ALIGNED(32) float dotProductVector[8]; - - _mm256_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector - - *realpt = dotProductVector[0]; - *imagpt = dotProductVector[1]; - *realpt += dotProductVector[2]; - *imagpt += dotProductVector[3]; - *realpt += dotProductVector[4]; - *imagpt += dotProductVector[5]; - *realpt += dotProductVector[6]; - *imagpt += dotProductVector[7]; - - number = sixteenthPoints*16; - for(;number < num_points; number++){ - *realpt += ((*aPtr) * (*bPtr++)); - *imagpt += ((*aPtr++) * (*bPtr++)); - } - - *result = *(lv_32fc_t*)(&res[0]); +static inline void volk_16i_32fc_dot_prod_32fc_u_avx2(lv_32fc_t* result, + const short* input, + const lv_32fc_t* taps, + unsigned int num_points) +{ + + unsigned int number = 0; + const unsigned int sixteenthPoints = num_points / 16; + + float res[2]; + float *realpt = 
&res[0], *imagpt = &res[1]; + const short* aPtr = input; + const float* bPtr = (float*)taps; + + __m128i m0, m1; + __m256i f0, f1; + __m256 g0, g1, h0, h1, h2, h3; + __m256 a0Val, a1Val, a2Val, a3Val; + __m256 b0Val, b1Val, b2Val, b3Val; + __m256 c0Val, c1Val, c2Val, c3Val; + + __m256 dotProdVal0 = _mm256_setzero_ps(); + __m256 dotProdVal1 = _mm256_setzero_ps(); + __m256 dotProdVal2 = _mm256_setzero_ps(); + __m256 dotProdVal3 = _mm256_setzero_ps(); + + for (; number < sixteenthPoints; number++) { + + m0 = _mm_loadu_si128((__m128i const*)aPtr); + m1 = _mm_loadu_si128((__m128i const*)(aPtr + 8)); + + f0 = _mm256_cvtepi16_epi32(m0); + g0 = _mm256_cvtepi32_ps(f0); + f1 = _mm256_cvtepi16_epi32(m1); + g1 = _mm256_cvtepi32_ps(f1); + + h0 = _mm256_unpacklo_ps(g0, g0); + h1 = _mm256_unpackhi_ps(g0, g0); + h2 = _mm256_unpacklo_ps(g1, g1); + h3 = _mm256_unpackhi_ps(g1, g1); + + a0Val = _mm256_permute2f128_ps(h0, h1, 0x20); + a1Val = _mm256_permute2f128_ps(h0, h1, 0x31); + a2Val = _mm256_permute2f128_ps(h2, h3, 0x20); + a3Val = _mm256_permute2f128_ps(h2, h3, 0x31); + + b0Val = _mm256_loadu_ps(bPtr); + b1Val = _mm256_loadu_ps(bPtr + 8); + b2Val = _mm256_loadu_ps(bPtr + 16); + b3Val = _mm256_loadu_ps(bPtr + 24); + + c0Val = _mm256_mul_ps(a0Val, b0Val); + c1Val = _mm256_mul_ps(a1Val, b1Val); + c2Val = _mm256_mul_ps(a2Val, b2Val); + c3Val = _mm256_mul_ps(a3Val, b3Val); + + dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0); + dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1); + dotProdVal2 = _mm256_add_ps(c2Val, dotProdVal2); + dotProdVal3 = _mm256_add_ps(c3Val, dotProdVal3); + + aPtr += 16; + bPtr += 32; + } + + dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1); + dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2); + dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3); + + __VOLK_ATTR_ALIGNED(32) float dotProductVector[8]; + + _mm256_store_ps(dotProductVector, + dotProdVal0); // Store the results back into the dot product vector + + *realpt = dotProductVector[0]; + *imagpt = dotProductVector[1]; + *realpt += dotProductVector[2]; + *imagpt += dotProductVector[3]; + *realpt += dotProductVector[4]; + *imagpt += dotProductVector[5]; + *realpt += dotProductVector[6]; + *imagpt += dotProductVector[7]; + + number = sixteenthPoints * 16; + for (; number < num_points; number++) { + *realpt += ((*aPtr) * (*bPtr++)); + *imagpt += ((*aPtr++) * (*bPtr++)); + } + + *result = *(lv_32fc_t*)(&res[0]); } #endif /*LV_HAVE_AVX2*/ @@ -403,171 +426,181 @@ static inline void volk_16i_32fc_dot_prod_32fc_u_avx2( lv_32fc_t* result, const #if LV_HAVE_SSE && LV_HAVE_MMX -static inline void volk_16i_32fc_dot_prod_32fc_a_sse( lv_32fc_t* result, const short* input, const lv_32fc_t* taps, unsigned int num_points) { - - unsigned int number = 0; - const unsigned int sixteenthPoints = num_points / 8; - - float res[2]; - float *realpt = &res[0], *imagpt = &res[1]; - const short* aPtr = input; - const float* bPtr = (float*)taps; - - __m64 m0, m1; - __m128 f0, f1, f2, f3; - __m128 a0Val, a1Val, a2Val, a3Val; - __m128 b0Val, b1Val, b2Val, b3Val; - __m128 c0Val, c1Val, c2Val, c3Val; - - __m128 dotProdVal0 = _mm_setzero_ps(); - __m128 dotProdVal1 = _mm_setzero_ps(); - __m128 dotProdVal2 = _mm_setzero_ps(); - __m128 dotProdVal3 = _mm_setzero_ps(); - - for(;number < sixteenthPoints; number++){ - - m0 = _mm_set_pi16(*(aPtr+3), *(aPtr+2), *(aPtr+1), *(aPtr+0)); - m1 = _mm_set_pi16(*(aPtr+7), *(aPtr+6), *(aPtr+5), *(aPtr+4)); - f0 = _mm_cvtpi16_ps(m0); - f1 = _mm_cvtpi16_ps(m0); - f2 = _mm_cvtpi16_ps(m1); - f3 = _mm_cvtpi16_ps(m1); - - a0Val = 
_mm_unpacklo_ps(f0, f1); - a1Val = _mm_unpackhi_ps(f0, f1); - a2Val = _mm_unpacklo_ps(f2, f3); - a3Val = _mm_unpackhi_ps(f2, f3); - - b0Val = _mm_load_ps(bPtr); - b1Val = _mm_load_ps(bPtr+4); - b2Val = _mm_load_ps(bPtr+8); - b3Val = _mm_load_ps(bPtr+12); - - c0Val = _mm_mul_ps(a0Val, b0Val); - c1Val = _mm_mul_ps(a1Val, b1Val); - c2Val = _mm_mul_ps(a2Val, b2Val); - c3Val = _mm_mul_ps(a3Val, b3Val); - - dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0); - dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1); - dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2); - dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3); - - aPtr += 8; - bPtr += 16; - } - - dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1); - dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2); - dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3); - - __VOLK_ATTR_ALIGNED(16) float dotProductVector[4]; - - _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector - - *realpt = dotProductVector[0]; - *imagpt = dotProductVector[1]; - *realpt += dotProductVector[2]; - *imagpt += dotProductVector[3]; - - number = sixteenthPoints*8; - for(;number < num_points; number++){ - *realpt += ((*aPtr) * (*bPtr++)); - *imagpt += ((*aPtr++) * (*bPtr++)); - } - - *result = *(lv_32fc_t*)(&res[0]); +static inline void volk_16i_32fc_dot_prod_32fc_a_sse(lv_32fc_t* result, + const short* input, + const lv_32fc_t* taps, + unsigned int num_points) +{ + + unsigned int number = 0; + const unsigned int sixteenthPoints = num_points / 8; + + float res[2]; + float *realpt = &res[0], *imagpt = &res[1]; + const short* aPtr = input; + const float* bPtr = (float*)taps; + + __m64 m0, m1; + __m128 f0, f1, f2, f3; + __m128 a0Val, a1Val, a2Val, a3Val; + __m128 b0Val, b1Val, b2Val, b3Val; + __m128 c0Val, c1Val, c2Val, c3Val; + + __m128 dotProdVal0 = _mm_setzero_ps(); + __m128 dotProdVal1 = _mm_setzero_ps(); + __m128 dotProdVal2 = _mm_setzero_ps(); + __m128 dotProdVal3 = _mm_setzero_ps(); + + for (; number < sixteenthPoints; number++) { + + m0 = _mm_set_pi16(*(aPtr + 3), *(aPtr + 2), *(aPtr + 1), *(aPtr + 0)); + m1 = _mm_set_pi16(*(aPtr + 7), *(aPtr + 6), *(aPtr + 5), *(aPtr + 4)); + f0 = _mm_cvtpi16_ps(m0); + f1 = _mm_cvtpi16_ps(m0); + f2 = _mm_cvtpi16_ps(m1); + f3 = _mm_cvtpi16_ps(m1); + + a0Val = _mm_unpacklo_ps(f0, f1); + a1Val = _mm_unpackhi_ps(f0, f1); + a2Val = _mm_unpacklo_ps(f2, f3); + a3Val = _mm_unpackhi_ps(f2, f3); + + b0Val = _mm_load_ps(bPtr); + b1Val = _mm_load_ps(bPtr + 4); + b2Val = _mm_load_ps(bPtr + 8); + b3Val = _mm_load_ps(bPtr + 12); + + c0Val = _mm_mul_ps(a0Val, b0Val); + c1Val = _mm_mul_ps(a1Val, b1Val); + c2Val = _mm_mul_ps(a2Val, b2Val); + c3Val = _mm_mul_ps(a3Val, b3Val); + + dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0); + dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1); + dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2); + dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3); + + aPtr += 8; + bPtr += 16; + } + + dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1); + dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2); + dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3); + + __VOLK_ATTR_ALIGNED(16) float dotProductVector[4]; + + _mm_store_ps(dotProductVector, + dotProdVal0); // Store the results back into the dot product vector + + *realpt = dotProductVector[0]; + *imagpt = dotProductVector[1]; + *realpt += dotProductVector[2]; + *imagpt += dotProductVector[3]; + + number = sixteenthPoints * 8; + for (; number < num_points; number++) { + *realpt += ((*aPtr) * (*bPtr++)); + *imagpt += ((*aPtr++) * (*bPtr++)); + } + + *result = 
*(lv_32fc_t*)(&res[0]); } #endif /*LV_HAVE_SSE && LV_HAVE_MMX*/ #ifdef LV_HAVE_AVX2 -static inline void volk_16i_32fc_dot_prod_32fc_a_avx2( lv_32fc_t* result, const short* input, const lv_32fc_t* taps, unsigned int num_points) { - - unsigned int number = 0; - const unsigned int sixteenthPoints = num_points / 16; - - float res[2]; - float *realpt = &res[0], *imagpt = &res[1]; - const short* aPtr = input; - const float* bPtr = (float*)taps; - - __m128i m0, m1; - __m256i f0, f1; - __m256 g0, g1, h0, h1, h2, h3; - __m256 a0Val, a1Val, a2Val, a3Val; - __m256 b0Val, b1Val, b2Val, b3Val; - __m256 c0Val, c1Val, c2Val, c3Val; - - __m256 dotProdVal0 = _mm256_setzero_ps(); - __m256 dotProdVal1 = _mm256_setzero_ps(); - __m256 dotProdVal2 = _mm256_setzero_ps(); - __m256 dotProdVal3 = _mm256_setzero_ps(); - - for(;number < sixteenthPoints; number++){ - - m0 = _mm_load_si128((__m128i const*) aPtr); - m1 = _mm_load_si128((__m128i const*)(aPtr+8)); - - f0 = _mm256_cvtepi16_epi32(m0); - g0 = _mm256_cvtepi32_ps(f0); - f1 = _mm256_cvtepi16_epi32(m1); - g1 = _mm256_cvtepi32_ps(f1); - - h0 = _mm256_unpacklo_ps(g0, g0); - h1 = _mm256_unpackhi_ps(g0, g0); - h2 = _mm256_unpacklo_ps(g1, g1); - h3 = _mm256_unpackhi_ps(g1, g1); - - a0Val = _mm256_permute2f128_ps(h0, h1, 0x20); - a1Val = _mm256_permute2f128_ps(h0, h1, 0x31); - a2Val = _mm256_permute2f128_ps(h2, h3, 0x20); - a3Val = _mm256_permute2f128_ps(h2, h3, 0x31); - - b0Val = _mm256_load_ps(bPtr); - b1Val = _mm256_load_ps(bPtr+8); - b2Val = _mm256_load_ps(bPtr+16); - b3Val = _mm256_load_ps(bPtr+24); - - c0Val = _mm256_mul_ps(a0Val, b0Val); - c1Val = _mm256_mul_ps(a1Val, b1Val); - c2Val = _mm256_mul_ps(a2Val, b2Val); - c3Val = _mm256_mul_ps(a3Val, b3Val); - - dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0); - dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1); - dotProdVal2 = _mm256_add_ps(c2Val, dotProdVal2); - dotProdVal3 = _mm256_add_ps(c3Val, dotProdVal3); - - aPtr += 16; - bPtr += 32; - } - - dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1); - dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2); - dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3); - - __VOLK_ATTR_ALIGNED(32) float dotProductVector[8]; - - _mm256_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector - - *realpt = dotProductVector[0]; - *imagpt = dotProductVector[1]; - *realpt += dotProductVector[2]; - *imagpt += dotProductVector[3]; - *realpt += dotProductVector[4]; - *imagpt += dotProductVector[5]; - *realpt += dotProductVector[6]; - *imagpt += dotProductVector[7]; - - number = sixteenthPoints*16; - for(;number < num_points; number++){ - *realpt += ((*aPtr) * (*bPtr++)); - *imagpt += ((*aPtr++) * (*bPtr++)); - } - - *result = *(lv_32fc_t*)(&res[0]); +static inline void volk_16i_32fc_dot_prod_32fc_a_avx2(lv_32fc_t* result, + const short* input, + const lv_32fc_t* taps, + unsigned int num_points) +{ + + unsigned int number = 0; + const unsigned int sixteenthPoints = num_points / 16; + + float res[2]; + float *realpt = &res[0], *imagpt = &res[1]; + const short* aPtr = input; + const float* bPtr = (float*)taps; + + __m128i m0, m1; + __m256i f0, f1; + __m256 g0, g1, h0, h1, h2, h3; + __m256 a0Val, a1Val, a2Val, a3Val; + __m256 b0Val, b1Val, b2Val, b3Val; + __m256 c0Val, c1Val, c2Val, c3Val; + + __m256 dotProdVal0 = _mm256_setzero_ps(); + __m256 dotProdVal1 = _mm256_setzero_ps(); + __m256 dotProdVal2 = _mm256_setzero_ps(); + __m256 dotProdVal3 = _mm256_setzero_ps(); + + for (; number < sixteenthPoints; number++) { + + m0 = _mm_load_si128((__m128i 
const*)aPtr); + m1 = _mm_load_si128((__m128i const*)(aPtr + 8)); + + f0 = _mm256_cvtepi16_epi32(m0); + g0 = _mm256_cvtepi32_ps(f0); + f1 = _mm256_cvtepi16_epi32(m1); + g1 = _mm256_cvtepi32_ps(f1); + + h0 = _mm256_unpacklo_ps(g0, g0); + h1 = _mm256_unpackhi_ps(g0, g0); + h2 = _mm256_unpacklo_ps(g1, g1); + h3 = _mm256_unpackhi_ps(g1, g1); + + a0Val = _mm256_permute2f128_ps(h0, h1, 0x20); + a1Val = _mm256_permute2f128_ps(h0, h1, 0x31); + a2Val = _mm256_permute2f128_ps(h2, h3, 0x20); + a3Val = _mm256_permute2f128_ps(h2, h3, 0x31); + + b0Val = _mm256_load_ps(bPtr); + b1Val = _mm256_load_ps(bPtr + 8); + b2Val = _mm256_load_ps(bPtr + 16); + b3Val = _mm256_load_ps(bPtr + 24); + + c0Val = _mm256_mul_ps(a0Val, b0Val); + c1Val = _mm256_mul_ps(a1Val, b1Val); + c2Val = _mm256_mul_ps(a2Val, b2Val); + c3Val = _mm256_mul_ps(a3Val, b3Val); + + dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0); + dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1); + dotProdVal2 = _mm256_add_ps(c2Val, dotProdVal2); + dotProdVal3 = _mm256_add_ps(c3Val, dotProdVal3); + + aPtr += 16; + bPtr += 32; + } + + dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1); + dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2); + dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3); + + __VOLK_ATTR_ALIGNED(32) float dotProductVector[8]; + + _mm256_store_ps(dotProductVector, + dotProdVal0); // Store the results back into the dot product vector + + *realpt = dotProductVector[0]; + *imagpt = dotProductVector[1]; + *realpt += dotProductVector[2]; + *imagpt += dotProductVector[3]; + *realpt += dotProductVector[4]; + *imagpt += dotProductVector[5]; + *realpt += dotProductVector[6]; + *imagpt += dotProductVector[7]; + + number = sixteenthPoints * 16; + for (; number < num_points; number++) { + *realpt += ((*aPtr) * (*bPtr++)); + *imagpt += ((*aPtr++) * (*bPtr++)); + } + + *result = *(lv_32fc_t*)(&res[0]); } @@ -575,85 +608,90 @@ static inline void volk_16i_32fc_dot_prod_32fc_a_avx2( lv_32fc_t* result, const #if LV_HAVE_AVX2 && LV_HAVE_FMA -static inline void volk_16i_32fc_dot_prod_32fc_a_avx2_fma( lv_32fc_t* result, const short* input, const lv_32fc_t* taps, unsigned int num_points) { - - unsigned int number = 0; - const unsigned int sixteenthPoints = num_points / 16; - - float res[2]; - float *realpt = &res[0], *imagpt = &res[1]; - const short* aPtr = input; - const float* bPtr = (float*)taps; - - __m128i m0, m1; - __m256i f0, f1; - __m256 g0, g1, h0, h1, h2, h3; - __m256 a0Val, a1Val, a2Val, a3Val; - __m256 b0Val, b1Val, b2Val, b3Val; - - __m256 dotProdVal0 = _mm256_setzero_ps(); - __m256 dotProdVal1 = _mm256_setzero_ps(); - __m256 dotProdVal2 = _mm256_setzero_ps(); - __m256 dotProdVal3 = _mm256_setzero_ps(); - - for(;number < sixteenthPoints; number++){ - - m0 = _mm_load_si128((__m128i const*) aPtr); - m1 = _mm_load_si128((__m128i const*)(aPtr+8)); - - f0 = _mm256_cvtepi16_epi32(m0); - g0 = _mm256_cvtepi32_ps(f0); - f1 = _mm256_cvtepi16_epi32(m1); - g1 = _mm256_cvtepi32_ps(f1); - - h0 = _mm256_unpacklo_ps(g0, g0); - h1 = _mm256_unpackhi_ps(g0, g0); - h2 = _mm256_unpacklo_ps(g1, g1); - h3 = _mm256_unpackhi_ps(g1, g1); - - a0Val = _mm256_permute2f128_ps(h0, h1, 0x20); - a1Val = _mm256_permute2f128_ps(h0, h1, 0x31); - a2Val = _mm256_permute2f128_ps(h2, h3, 0x20); - a3Val = _mm256_permute2f128_ps(h2, h3, 0x31); - - b0Val = _mm256_load_ps(bPtr); - b1Val = _mm256_load_ps(bPtr+8); - b2Val = _mm256_load_ps(bPtr+16); - b3Val = _mm256_load_ps(bPtr+24); - - dotProdVal0 = _mm256_fmadd_ps(a0Val,b0Val,dotProdVal0); - dotProdVal1 = 
_mm256_fmadd_ps(a1Val,b1Val,dotProdVal1); - dotProdVal2 = _mm256_fmadd_ps(a2Val,b2Val,dotProdVal2); - dotProdVal3 = _mm256_fmadd_ps(a3Val,b3Val,dotProdVal3); - - aPtr += 16; - bPtr += 32; - } - - dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1); - dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2); - dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3); - - __VOLK_ATTR_ALIGNED(32) float dotProductVector[8]; - - _mm256_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector - - *realpt = dotProductVector[0]; - *imagpt = dotProductVector[1]; - *realpt += dotProductVector[2]; - *imagpt += dotProductVector[3]; - *realpt += dotProductVector[4]; - *imagpt += dotProductVector[5]; - *realpt += dotProductVector[6]; - *imagpt += dotProductVector[7]; - - number = sixteenthPoints*16; - for(;number < num_points; number++){ - *realpt += ((*aPtr) * (*bPtr++)); - *imagpt += ((*aPtr++) * (*bPtr++)); - } - - *result = *(lv_32fc_t*)(&res[0]); +static inline void volk_16i_32fc_dot_prod_32fc_a_avx2_fma(lv_32fc_t* result, + const short* input, + const lv_32fc_t* taps, + unsigned int num_points) +{ + + unsigned int number = 0; + const unsigned int sixteenthPoints = num_points / 16; + + float res[2]; + float *realpt = &res[0], *imagpt = &res[1]; + const short* aPtr = input; + const float* bPtr = (float*)taps; + + __m128i m0, m1; + __m256i f0, f1; + __m256 g0, g1, h0, h1, h2, h3; + __m256 a0Val, a1Val, a2Val, a3Val; + __m256 b0Val, b1Val, b2Val, b3Val; + + __m256 dotProdVal0 = _mm256_setzero_ps(); + __m256 dotProdVal1 = _mm256_setzero_ps(); + __m256 dotProdVal2 = _mm256_setzero_ps(); + __m256 dotProdVal3 = _mm256_setzero_ps(); + + for (; number < sixteenthPoints; number++) { + + m0 = _mm_load_si128((__m128i const*)aPtr); + m1 = _mm_load_si128((__m128i const*)(aPtr + 8)); + + f0 = _mm256_cvtepi16_epi32(m0); + g0 = _mm256_cvtepi32_ps(f0); + f1 = _mm256_cvtepi16_epi32(m1); + g1 = _mm256_cvtepi32_ps(f1); + + h0 = _mm256_unpacklo_ps(g0, g0); + h1 = _mm256_unpackhi_ps(g0, g0); + h2 = _mm256_unpacklo_ps(g1, g1); + h3 = _mm256_unpackhi_ps(g1, g1); + + a0Val = _mm256_permute2f128_ps(h0, h1, 0x20); + a1Val = _mm256_permute2f128_ps(h0, h1, 0x31); + a2Val = _mm256_permute2f128_ps(h2, h3, 0x20); + a3Val = _mm256_permute2f128_ps(h2, h3, 0x31); + + b0Val = _mm256_load_ps(bPtr); + b1Val = _mm256_load_ps(bPtr + 8); + b2Val = _mm256_load_ps(bPtr + 16); + b3Val = _mm256_load_ps(bPtr + 24); + + dotProdVal0 = _mm256_fmadd_ps(a0Val, b0Val, dotProdVal0); + dotProdVal1 = _mm256_fmadd_ps(a1Val, b1Val, dotProdVal1); + dotProdVal2 = _mm256_fmadd_ps(a2Val, b2Val, dotProdVal2); + dotProdVal3 = _mm256_fmadd_ps(a3Val, b3Val, dotProdVal3); + + aPtr += 16; + bPtr += 32; + } + + dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1); + dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2); + dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3); + + __VOLK_ATTR_ALIGNED(32) float dotProductVector[8]; + + _mm256_store_ps(dotProductVector, + dotProdVal0); // Store the results back into the dot product vector + + *realpt = dotProductVector[0]; + *imagpt = dotProductVector[1]; + *realpt += dotProductVector[2]; + *imagpt += dotProductVector[3]; + *realpt += dotProductVector[4]; + *imagpt += dotProductVector[5]; + *realpt += dotProductVector[6]; + *imagpt += dotProductVector[7]; + + number = sixteenthPoints * 16; + for (; number < num_points; number++) { + *realpt += ((*aPtr) * (*bPtr++)); + *imagpt += ((*aPtr++) * (*bPtr++)); + } + + *result = *(lv_32fc_t*)(&res[0]); } diff --git 
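/*
 * A hedged plain-C reference for the SSE/AVX2/FMA kernels above, mirroring
 * their scalar tail loops: the real 16-bit input vector is dotted with the
 * interleaved complex float taps, so result = sum_i input[i] * taps[i].
 * The helper name ref_16i_32fc_dot_prod is illustrative, not part of VOLK.
 */
#include <volk/volk_complex.h>

static inline void ref_16i_32fc_dot_prod(lv_32fc_t* result,
                                         const short* input,
                                         const lv_32fc_t* taps,
                                         unsigned int num_points)
{
    float res[2] = { 0.0f, 0.0f };
    const float* bPtr = (const float*)taps; /* interleaved re/im, as in the kernels */
    for (unsigned int n = 0; n < num_points; n++) {
        res[0] += input[n] * (*bPtr++); /* accumulate real part      */
        res[1] += input[n] * (*bPtr++); /* accumulate imaginary part */
    }
    *result = *(lv_32fc_t*)(&res[0]);
}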
a/kernels/volk/volk_16i_branch_4_state_8.h b/kernels/volk/volk_16i_branch_4_state_8.h index 31b66cc..4d00b6b 100644 --- a/kernels/volk/volk_16i_branch_4_state_8.h +++ b/kernels/volk/volk_16i_branch_4_state_8.h @@ -29,8 +29,8 @@ * * Dispatcher Prototype * \code - * void volk_16i_branch_4_state_8(short* target, short* src0, char** permuters, short* cntl2, short* cntl3, short* scalars) - * \endcode + * void volk_16i_branch_4_state_8(short* target, short* src0, char** permuters, short* + * cntl2, short* cntl3, short* scalars) \endcode * * \b Inputs * \li src0: @@ -61,155 +61,154 @@ #ifdef LV_HAVE_SSSE3 -#include #include #include +#include -static inline void -volk_16i_branch_4_state_8_a_ssse3(short* target, short* src0, char** permuters, short* cntl2, short* cntl3, short* scalars) +static inline void volk_16i_branch_4_state_8_a_ssse3(short* target, + short* src0, + char** permuters, + short* cntl2, + short* cntl3, + short* scalars) { - __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11; - __m128i *p_target, *p_src0, *p_cntl2, *p_cntl3, *p_scalars; + __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11; + __m128i *p_target, *p_src0, *p_cntl2, *p_cntl3, *p_scalars; - p_target = (__m128i*)target; - p_src0 = (__m128i*)src0; - p_cntl2 = (__m128i*)cntl2; - p_cntl3 = (__m128i*)cntl3; - p_scalars = (__m128i*)scalars; + p_target = (__m128i*)target; + p_src0 = (__m128i*)src0; + p_cntl2 = (__m128i*)cntl2; + p_cntl3 = (__m128i*)cntl3; + p_scalars = (__m128i*)scalars; - xmm0 = _mm_load_si128(p_scalars); + xmm0 = _mm_load_si128(p_scalars); - xmm1 = _mm_shufflelo_epi16(xmm0, 0); - xmm2 = _mm_shufflelo_epi16(xmm0, 0x55); - xmm3 = _mm_shufflelo_epi16(xmm0, 0xaa); - xmm4 = _mm_shufflelo_epi16(xmm0, 0xff); + xmm1 = _mm_shufflelo_epi16(xmm0, 0); + xmm2 = _mm_shufflelo_epi16(xmm0, 0x55); + xmm3 = _mm_shufflelo_epi16(xmm0, 0xaa); + xmm4 = _mm_shufflelo_epi16(xmm0, 0xff); - xmm1 = _mm_shuffle_epi32(xmm1, 0x00); - xmm2 = _mm_shuffle_epi32(xmm2, 0x00); - xmm3 = _mm_shuffle_epi32(xmm3, 0x00); - xmm4 = _mm_shuffle_epi32(xmm4, 0x00); + xmm1 = _mm_shuffle_epi32(xmm1, 0x00); + xmm2 = _mm_shuffle_epi32(xmm2, 0x00); + xmm3 = _mm_shuffle_epi32(xmm3, 0x00); + xmm4 = _mm_shuffle_epi32(xmm4, 0x00); - xmm0 = _mm_load_si128((__m128i*)permuters[0]); - xmm6 = _mm_load_si128((__m128i*)permuters[1]); - xmm8 = _mm_load_si128((__m128i*)permuters[2]); - xmm10 = _mm_load_si128((__m128i*)permuters[3]); + xmm0 = _mm_load_si128((__m128i*)permuters[0]); + xmm6 = _mm_load_si128((__m128i*)permuters[1]); + xmm8 = _mm_load_si128((__m128i*)permuters[2]); + xmm10 = _mm_load_si128((__m128i*)permuters[3]); - xmm5 = _mm_load_si128(p_src0); - xmm0 = _mm_shuffle_epi8(xmm5, xmm0); - xmm6 = _mm_shuffle_epi8(xmm5, xmm6); - xmm8 = _mm_shuffle_epi8(xmm5, xmm8); - xmm10 = _mm_shuffle_epi8(xmm5, xmm10); + xmm5 = _mm_load_si128(p_src0); + xmm0 = _mm_shuffle_epi8(xmm5, xmm0); + xmm6 = _mm_shuffle_epi8(xmm5, xmm6); + xmm8 = _mm_shuffle_epi8(xmm5, xmm8); + xmm10 = _mm_shuffle_epi8(xmm5, xmm10); - xmm5 = _mm_add_epi16(xmm1, xmm2); + xmm5 = _mm_add_epi16(xmm1, xmm2); - xmm6 = _mm_add_epi16(xmm2, xmm6); - xmm8 = _mm_add_epi16(xmm1, xmm8); + xmm6 = _mm_add_epi16(xmm2, xmm6); + xmm8 = _mm_add_epi16(xmm1, xmm8); - xmm7 = _mm_load_si128(p_cntl2); - xmm9 = _mm_load_si128(p_cntl3); + xmm7 = _mm_load_si128(p_cntl2); + xmm9 = _mm_load_si128(p_cntl3); - xmm0 = _mm_add_epi16(xmm5, xmm0); + xmm0 = _mm_add_epi16(xmm5, xmm0); - xmm7 = _mm_and_si128(xmm7, xmm3); - xmm9 = _mm_and_si128(xmm9, xmm4); + xmm7 = 
_mm_and_si128(xmm7, xmm3); + xmm9 = _mm_and_si128(xmm9, xmm4); - xmm5 = _mm_load_si128(&p_cntl2[1]); - xmm11 = _mm_load_si128(&p_cntl3[1]); + xmm5 = _mm_load_si128(&p_cntl2[1]); + xmm11 = _mm_load_si128(&p_cntl3[1]); - xmm7 = _mm_add_epi16(xmm7, xmm9); + xmm7 = _mm_add_epi16(xmm7, xmm9); - xmm5 = _mm_and_si128(xmm5, xmm3); - xmm11 = _mm_and_si128(xmm11, xmm4); + xmm5 = _mm_and_si128(xmm5, xmm3); + xmm11 = _mm_and_si128(xmm11, xmm4); - xmm0 = _mm_add_epi16(xmm0, xmm7); + xmm0 = _mm_add_epi16(xmm0, xmm7); - xmm7 = _mm_load_si128(&p_cntl2[2]); - xmm9 = _mm_load_si128(&p_cntl3[2]); + xmm7 = _mm_load_si128(&p_cntl2[2]); + xmm9 = _mm_load_si128(&p_cntl3[2]); - xmm5 = _mm_add_epi16(xmm5, xmm11); + xmm5 = _mm_add_epi16(xmm5, xmm11); - xmm7 = _mm_and_si128(xmm7, xmm3); - xmm9 = _mm_and_si128(xmm9, xmm4); + xmm7 = _mm_and_si128(xmm7, xmm3); + xmm9 = _mm_and_si128(xmm9, xmm4); - xmm6 = _mm_add_epi16(xmm6, xmm5); + xmm6 = _mm_add_epi16(xmm6, xmm5); - xmm5 = _mm_load_si128(&p_cntl2[3]); - xmm11 = _mm_load_si128(&p_cntl3[3]); + xmm5 = _mm_load_si128(&p_cntl2[3]); + xmm11 = _mm_load_si128(&p_cntl3[3]); - xmm7 = _mm_add_epi16(xmm7, xmm9); + xmm7 = _mm_add_epi16(xmm7, xmm9); - xmm5 = _mm_and_si128(xmm5, xmm3); - xmm11 = _mm_and_si128(xmm11, xmm4); + xmm5 = _mm_and_si128(xmm5, xmm3); + xmm11 = _mm_and_si128(xmm11, xmm4); - xmm8 = _mm_add_epi16(xmm8, xmm7); + xmm8 = _mm_add_epi16(xmm8, xmm7); - xmm5 = _mm_add_epi16(xmm5, xmm11); + xmm5 = _mm_add_epi16(xmm5, xmm11); - _mm_store_si128(p_target, xmm0); - _mm_store_si128(&p_target[1], xmm6); + _mm_store_si128(p_target, xmm0); + _mm_store_si128(&p_target[1], xmm6); - xmm10 = _mm_add_epi16(xmm5, xmm10); + xmm10 = _mm_add_epi16(xmm5, xmm10); - _mm_store_si128(&p_target[2], xmm8); + _mm_store_si128(&p_target[2], xmm8); - _mm_store_si128(&p_target[3], xmm10); + _mm_store_si128(&p_target[3], xmm10); } #endif /*LV_HAVE_SSEs*/ #ifdef LV_HAVE_GENERIC -static inline void -volk_16i_branch_4_state_8_generic(short* target, short* src0, char** permuters, short* cntl2, short* cntl3, short* scalars) +static inline void volk_16i_branch_4_state_8_generic(short* target, + short* src0, + char** permuters, + short* cntl2, + short* cntl3, + short* scalars) { - int i = 0; - - int bound = 4; - - for(; i < bound; ++i) { - target[i* 8] = src0[((char)permuters[i][0])/2] - + ((i + 1)%2 * scalars[0]) - + (((i >> 1)^1) * scalars[1]) - + (cntl2[i * 8] & scalars[2]) - + (cntl3[i * 8] & scalars[3]); - target[i* 8 + 1] = src0[((char)permuters[i][1 * 2])/2] - + ((i + 1)%2 * scalars[0]) - + (((i >> 1)^1) * scalars[1]) - + (cntl2[i * 8 + 1] & scalars[2]) - + (cntl3[i * 8 + 1] & scalars[3]); - target[i* 8 + 2] = src0[((char)permuters[i][2 * 2])/2] - + ((i + 1)%2 * scalars[0]) - + (((i >> 1)^1) * scalars[1]) - + (cntl2[i * 8 + 2] & scalars[2]) - + (cntl3[i * 8 + 2] & scalars[3]); - target[i* 8 + 3] = src0[((char)permuters[i][3 * 2])/2] - + ((i + 1)%2 * scalars[0]) - + (((i >> 1)^1) * scalars[1]) - + (cntl2[i * 8 + 3] & scalars[2]) - + (cntl3[i * 8 + 3] & scalars[3]); - target[i* 8 + 4] = src0[((char)permuters[i][4 * 2])/2] - + ((i + 1)%2 * scalars[0]) - + (((i >> 1)^1) * scalars[1]) - + (cntl2[i * 8 + 4] & scalars[2]) - + (cntl3[i * 8 + 4] & scalars[3]); - target[i* 8 + 5] = src0[((char)permuters[i][5 * 2])/2] - + ((i + 1)%2 * scalars[0]) - + (((i >> 1)^1) * scalars[1]) - + (cntl2[i * 8 + 5] & scalars[2]) - + (cntl3[i * 8 + 5] & scalars[3]); - target[i* 8 + 6] = src0[((char)permuters[i][6 * 2])/2] - + ((i + 1)%2 * scalars[0]) - + (((i >> 1)^1) * scalars[1]) - + (cntl2[i * 8 + 6] & scalars[2]) - + 
(cntl3[i * 8 + 6] & scalars[3]); - target[i* 8 + 7] = src0[((char)permuters[i][7 * 2])/2] - + ((i + 1)%2 * scalars[0]) - + (((i >> 1)^1) * scalars[1]) - + (cntl2[i * 8 + 7] & scalars[2]) - + (cntl3[i * 8 + 7] & scalars[3]); - } + int i = 0; + + int bound = 4; + + for (; i < bound; ++i) { + target[i * 8] = src0[((char)permuters[i][0]) / 2] + ((i + 1) % 2 * scalars[0]) + + (((i >> 1) ^ 1) * scalars[1]) + (cntl2[i * 8] & scalars[2]) + + (cntl3[i * 8] & scalars[3]); + target[i * 8 + 1] = src0[((char)permuters[i][1 * 2]) / 2] + + ((i + 1) % 2 * scalars[0]) + (((i >> 1) ^ 1) * scalars[1]) + + (cntl2[i * 8 + 1] & scalars[2]) + + (cntl3[i * 8 + 1] & scalars[3]); + target[i * 8 + 2] = src0[((char)permuters[i][2 * 2]) / 2] + + ((i + 1) % 2 * scalars[0]) + (((i >> 1) ^ 1) * scalars[1]) + + (cntl2[i * 8 + 2] & scalars[2]) + + (cntl3[i * 8 + 2] & scalars[3]); + target[i * 8 + 3] = src0[((char)permuters[i][3 * 2]) / 2] + + ((i + 1) % 2 * scalars[0]) + (((i >> 1) ^ 1) * scalars[1]) + + (cntl2[i * 8 + 3] & scalars[2]) + + (cntl3[i * 8 + 3] & scalars[3]); + target[i * 8 + 4] = src0[((char)permuters[i][4 * 2]) / 2] + + ((i + 1) % 2 * scalars[0]) + (((i >> 1) ^ 1) * scalars[1]) + + (cntl2[i * 8 + 4] & scalars[2]) + + (cntl3[i * 8 + 4] & scalars[3]); + target[i * 8 + 5] = src0[((char)permuters[i][5 * 2]) / 2] + + ((i + 1) % 2 * scalars[0]) + (((i >> 1) ^ 1) * scalars[1]) + + (cntl2[i * 8 + 5] & scalars[2]) + + (cntl3[i * 8 + 5] & scalars[3]); + target[i * 8 + 6] = src0[((char)permuters[i][6 * 2]) / 2] + + ((i + 1) % 2 * scalars[0]) + (((i >> 1) ^ 1) * scalars[1]) + + (cntl2[i * 8 + 6] & scalars[2]) + + (cntl3[i * 8 + 6] & scalars[3]); + target[i * 8 + 7] = src0[((char)permuters[i][7 * 2]) / 2] + + ((i + 1) % 2 * scalars[0]) + (((i >> 1) ^ 1) * scalars[1]) + + (cntl2[i * 8 + 7] & scalars[2]) + + (cntl3[i * 8 + 7] & scalars[3]); + } } #endif /*LV_HAVE_GENERIC*/ diff --git a/kernels/volk/volk_16i_convert_8i.h b/kernels/volk/volk_16i_convert_8i.h index e2f953b..f09515d 100644 --- a/kernels/volk/volk_16i_convert_8i.h +++ b/kernels/volk/volk_16i_convert_8i.h @@ -29,8 +29,8 @@ * * Dispatcher Prototype * \code - * void volk_16i_convert_8i(int8_t* outputVector, const int16_t* inputVector, unsigned int num_points) - * \endcode + * void volk_16i_convert_8i(int8_t* outputVector, const int16_t* inputVector, unsigned int + * num_points) \endcode * * \b Inputs * \li inputVector: The input vector of 16-bit shorts. 
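/*
 * A minimal scalar sketch of what the volk_16i_convert_8i kernels in the
 * hunks below compute: each 16-bit sample is narrowed to its most
 * significant byte with an arithmetic shift, so the sign is preserved.
 * The helper name and sizes here are illustrative only.
 */
#include <stdint.h>

static inline void sketch_16i_to_8i(int8_t* out, const int16_t* in, unsigned int n)
{
    for (unsigned int i = 0; i < n; i++) {
        out[i] = (int8_t)(in[i] >> 8); /* same operation as the scalar tail loops */
    }
}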
@@ -59,39 +59,42 @@ #ifdef LV_HAVE_AVX2 #include -static inline void -volk_16i_convert_8i_u_avx2(int8_t* outputVector, const int16_t* inputVector, unsigned int num_points) +static inline void volk_16i_convert_8i_u_avx2(int8_t* outputVector, + const int16_t* inputVector, + unsigned int num_points) { - unsigned int number = 0; - const unsigned int thirtysecondPoints = num_points / 32; + unsigned int number = 0; + const unsigned int thirtysecondPoints = num_points / 32; - int8_t* outputVectorPtr = outputVector; - int16_t* inputPtr = (int16_t*)inputVector; - __m256i inputVal1; - __m256i inputVal2; - __m256i ret; + int8_t* outputVectorPtr = outputVector; + int16_t* inputPtr = (int16_t*)inputVector; + __m256i inputVal1; + __m256i inputVal2; + __m256i ret; - for(;number < thirtysecondPoints; number++){ + for (; number < thirtysecondPoints; number++) { - // Load the 16 values - inputVal1 = _mm256_loadu_si256((__m256i*)inputPtr); inputPtr += 16; - inputVal2 = _mm256_loadu_si256((__m256i*)inputPtr); inputPtr += 16; + // Load the 16 values + inputVal1 = _mm256_loadu_si256((__m256i*)inputPtr); + inputPtr += 16; + inputVal2 = _mm256_loadu_si256((__m256i*)inputPtr); + inputPtr += 16; - inputVal1 = _mm256_srai_epi16(inputVal1, 8); - inputVal2 = _mm256_srai_epi16(inputVal2, 8); + inputVal1 = _mm256_srai_epi16(inputVal1, 8); + inputVal2 = _mm256_srai_epi16(inputVal2, 8); - ret = _mm256_packs_epi16(inputVal1, inputVal2); - ret = _mm256_permute4x64_epi64(ret, 0b11011000); + ret = _mm256_packs_epi16(inputVal1, inputVal2); + ret = _mm256_permute4x64_epi64(ret, 0b11011000); - _mm256_storeu_si256((__m256i*)outputVectorPtr, ret); + _mm256_storeu_si256((__m256i*)outputVectorPtr, ret); - outputVectorPtr += 32; - } + outputVectorPtr += 32; + } - number = thirtysecondPoints * 32; - for(; number < num_points; number++){ - outputVector[number] =(int8_t)(inputVector[number] >> 8); - } + number = thirtysecondPoints * 32; + for (; number < num_points; number++) { + outputVector[number] = (int8_t)(inputVector[number] >> 8); + } } #endif /* LV_HAVE_AVX2 */ @@ -99,60 +102,62 @@ volk_16i_convert_8i_u_avx2(int8_t* outputVector, const int16_t* inputVector, uns #ifdef LV_HAVE_SSE2 #include -static inline void -volk_16i_convert_8i_u_sse2(int8_t* outputVector, const int16_t* inputVector, unsigned int num_points) +static inline void volk_16i_convert_8i_u_sse2(int8_t* outputVector, + const int16_t* inputVector, + unsigned int num_points) { - unsigned int number = 0; - const unsigned int sixteenthPoints = num_points / 16; + unsigned int number = 0; + const unsigned int sixteenthPoints = num_points / 16; - int8_t* outputVectorPtr = outputVector; - int16_t* inputPtr = (int16_t*)inputVector; - __m128i inputVal1; - __m128i inputVal2; - __m128i ret; + int8_t* outputVectorPtr = outputVector; + int16_t* inputPtr = (int16_t*)inputVector; + __m128i inputVal1; + __m128i inputVal2; + __m128i ret; - for(;number < sixteenthPoints; number++){ + for (; number < sixteenthPoints; number++) { - // Load the 16 values - inputVal1 = _mm_loadu_si128((__m128i*)inputPtr); inputPtr += 8; - inputVal2 = _mm_loadu_si128((__m128i*)inputPtr); inputPtr += 8; + // Load the 16 values + inputVal1 = _mm_loadu_si128((__m128i*)inputPtr); + inputPtr += 8; + inputVal2 = _mm_loadu_si128((__m128i*)inputPtr); + inputPtr += 8; - inputVal1 = _mm_srai_epi16(inputVal1, 8); - inputVal2 = _mm_srai_epi16(inputVal2, 8); + inputVal1 = _mm_srai_epi16(inputVal1, 8); + inputVal2 = _mm_srai_epi16(inputVal2, 8); - ret = _mm_packs_epi16(inputVal1, inputVal2); + ret = 
_mm_packs_epi16(inputVal1, inputVal2); - _mm_storeu_si128((__m128i*)outputVectorPtr, ret); + _mm_storeu_si128((__m128i*)outputVectorPtr, ret); - outputVectorPtr += 16; - } + outputVectorPtr += 16; + } - number = sixteenthPoints * 16; - for(; number < num_points; number++){ - outputVector[number] =(int8_t)(inputVector[number] >> 8); - } + number = sixteenthPoints * 16; + for (; number < num_points; number++) { + outputVector[number] = (int8_t)(inputVector[number] >> 8); + } } #endif /* LV_HAVE_SSE2 */ #ifdef LV_HAVE_GENERIC -static inline void -volk_16i_convert_8i_generic(int8_t* outputVector, const int16_t* inputVector, unsigned int num_points) +static inline void volk_16i_convert_8i_generic(int8_t* outputVector, + const int16_t* inputVector, + unsigned int num_points) { - int8_t* outputVectorPtr = outputVector; - const int16_t* inputVectorPtr = inputVector; - unsigned int number = 0; + int8_t* outputVectorPtr = outputVector; + const int16_t* inputVectorPtr = inputVector; + unsigned int number = 0; - for(number = 0; number < num_points; number++){ - *outputVectorPtr++ = ((int8_t)(*inputVectorPtr++ >> 8)); - } + for (number = 0; number < num_points; number++) { + *outputVectorPtr++ = ((int8_t)(*inputVectorPtr++ >> 8)); + } } #endif /* LV_HAVE_GENERIC */ - - #endif /* INCLUDED_volk_16i_convert_8i_u_H */ #ifndef INCLUDED_volk_16i_convert_8i_a_H #define INCLUDED_volk_16i_convert_8i_a_H @@ -163,39 +168,42 @@ volk_16i_convert_8i_generic(int8_t* outputVector, const int16_t* inputVector, un #ifdef LV_HAVE_AVX2 #include -static inline void -volk_16i_convert_8i_a_avx2(int8_t* outputVector, const int16_t* inputVector, unsigned int num_points) +static inline void volk_16i_convert_8i_a_avx2(int8_t* outputVector, + const int16_t* inputVector, + unsigned int num_points) { - unsigned int number = 0; - const unsigned int thirtysecondPoints = num_points / 32; + unsigned int number = 0; + const unsigned int thirtysecondPoints = num_points / 32; - int8_t* outputVectorPtr = outputVector; - int16_t* inputPtr = (int16_t*)inputVector; - __m256i inputVal1; - __m256i inputVal2; - __m256i ret; + int8_t* outputVectorPtr = outputVector; + int16_t* inputPtr = (int16_t*)inputVector; + __m256i inputVal1; + __m256i inputVal2; + __m256i ret; - for(;number < thirtysecondPoints; number++){ + for (; number < thirtysecondPoints; number++) { - // Load the 16 values - inputVal1 = _mm256_load_si256((__m256i*)inputPtr); inputPtr += 16; - inputVal2 = _mm256_load_si256((__m256i*)inputPtr); inputPtr += 16; + // Load the 16 values + inputVal1 = _mm256_load_si256((__m256i*)inputPtr); + inputPtr += 16; + inputVal2 = _mm256_load_si256((__m256i*)inputPtr); + inputPtr += 16; - inputVal1 = _mm256_srai_epi16(inputVal1, 8); - inputVal2 = _mm256_srai_epi16(inputVal2, 8); + inputVal1 = _mm256_srai_epi16(inputVal1, 8); + inputVal2 = _mm256_srai_epi16(inputVal2, 8); - ret = _mm256_packs_epi16(inputVal1, inputVal2); - ret = _mm256_permute4x64_epi64(ret, 0b11011000); + ret = _mm256_packs_epi16(inputVal1, inputVal2); + ret = _mm256_permute4x64_epi64(ret, 0b11011000); - _mm256_store_si256((__m256i*)outputVectorPtr, ret); + _mm256_store_si256((__m256i*)outputVectorPtr, ret); - outputVectorPtr += 32; - } + outputVectorPtr += 32; + } - number = thirtysecondPoints * 32; - for(; number < num_points; number++){ - outputVector[number] =(int8_t)(inputVector[number] >> 8); - } + number = thirtysecondPoints * 32; + for (; number < num_points; number++) { + outputVector[number] = (int8_t)(inputVector[number] >> 8); + } } #endif /* LV_HAVE_AVX2 */ @@ -203,38 
+211,41 @@ volk_16i_convert_8i_a_avx2(int8_t* outputVector, const int16_t* inputVector, uns #ifdef LV_HAVE_SSE2 #include -static inline void -volk_16i_convert_8i_a_sse2(int8_t* outputVector, const int16_t* inputVector, unsigned int num_points) +static inline void volk_16i_convert_8i_a_sse2(int8_t* outputVector, + const int16_t* inputVector, + unsigned int num_points) { - unsigned int number = 0; - const unsigned int sixteenthPoints = num_points / 16; + unsigned int number = 0; + const unsigned int sixteenthPoints = num_points / 16; - int8_t* outputVectorPtr = outputVector; - int16_t* inputPtr = (int16_t*)inputVector; - __m128i inputVal1; - __m128i inputVal2; - __m128i ret; + int8_t* outputVectorPtr = outputVector; + int16_t* inputPtr = (int16_t*)inputVector; + __m128i inputVal1; + __m128i inputVal2; + __m128i ret; - for(;number < sixteenthPoints; number++){ + for (; number < sixteenthPoints; number++) { - // Load the 16 values - inputVal1 = _mm_load_si128((__m128i*)inputPtr); inputPtr += 8; - inputVal2 = _mm_load_si128((__m128i*)inputPtr); inputPtr += 8; + // Load the 16 values + inputVal1 = _mm_load_si128((__m128i*)inputPtr); + inputPtr += 8; + inputVal2 = _mm_load_si128((__m128i*)inputPtr); + inputPtr += 8; - inputVal1 = _mm_srai_epi16(inputVal1, 8); - inputVal2 = _mm_srai_epi16(inputVal2, 8); + inputVal1 = _mm_srai_epi16(inputVal1, 8); + inputVal2 = _mm_srai_epi16(inputVal2, 8); - ret = _mm_packs_epi16(inputVal1, inputVal2); + ret = _mm_packs_epi16(inputVal1, inputVal2); - _mm_store_si128((__m128i*)outputVectorPtr, ret); + _mm_store_si128((__m128i*)outputVectorPtr, ret); - outputVectorPtr += 16; - } + outputVectorPtr += 16; + } - number = sixteenthPoints * 16; - for(; number < num_points; number++){ - outputVector[number] =(int8_t)(inputVector[number] >> 8); - } + number = sixteenthPoints * 16; + for (; number < num_points; number++) { + outputVector[number] = (int8_t)(inputVector[number] >> 8); + } } #endif /* LV_HAVE_SSE2 */ @@ -242,53 +253,55 @@ volk_16i_convert_8i_a_sse2(int8_t* outputVector, const int16_t* inputVector, uns #ifdef LV_HAVE_NEON #include -static inline void -volk_16i_convert_8i_neon(int8_t* outputVector, const int16_t* inputVector, unsigned int num_points) +static inline void volk_16i_convert_8i_neon(int8_t* outputVector, + const int16_t* inputVector, + unsigned int num_points) { - int8_t* outputVectorPtr = outputVector; - const int16_t* inputVectorPtr = inputVector; - unsigned int number = 0; - unsigned int sixteenth_points = num_points / 16; - - int16x8_t inputVal0; - int16x8_t inputVal1; - int8x8_t outputVal0; - int8x8_t outputVal1; - int8x16_t outputVal; - - for(number = 0; number < sixteenth_points; number++){ - // load two input vectors - inputVal0 = vld1q_s16(inputVectorPtr); - inputVal1 = vld1q_s16(inputVectorPtr+8); - // shift right - outputVal0 = vshrn_n_s16(inputVal0, 8); - outputVal1 = vshrn_n_s16(inputVal1, 8); - // squash two vectors and write output - outputVal = vcombine_s8(outputVal0, outputVal1); - vst1q_s8(outputVectorPtr, outputVal); - inputVectorPtr += 16; - outputVectorPtr += 16; - } - - for(number = sixteenth_points * 16; number < num_points; number++){ - *outputVectorPtr++ = ((int8_t)(*inputVectorPtr++ >> 8)); - } + int8_t* outputVectorPtr = outputVector; + const int16_t* inputVectorPtr = inputVector; + unsigned int number = 0; + unsigned int sixteenth_points = num_points / 16; + + int16x8_t inputVal0; + int16x8_t inputVal1; + int8x8_t outputVal0; + int8x8_t outputVal1; + int8x16_t outputVal; + + for (number = 0; number < sixteenth_points; 
number++) { + // load two input vectors + inputVal0 = vld1q_s16(inputVectorPtr); + inputVal1 = vld1q_s16(inputVectorPtr + 8); + // shift right + outputVal0 = vshrn_n_s16(inputVal0, 8); + outputVal1 = vshrn_n_s16(inputVal1, 8); + // squash two vectors and write output + outputVal = vcombine_s8(outputVal0, outputVal1); + vst1q_s8(outputVectorPtr, outputVal); + inputVectorPtr += 16; + outputVectorPtr += 16; + } + + for (number = sixteenth_points * 16; number < num_points; number++) { + *outputVectorPtr++ = ((int8_t)(*inputVectorPtr++ >> 8)); + } } #endif /* LV_HAVE_NEON */ #ifdef LV_HAVE_GENERIC -static inline void -volk_16i_convert_8i_a_generic(int8_t* outputVector, const int16_t* inputVector, unsigned int num_points) +static inline void volk_16i_convert_8i_a_generic(int8_t* outputVector, + const int16_t* inputVector, + unsigned int num_points) { - int8_t* outputVectorPtr = outputVector; - const int16_t* inputVectorPtr = inputVector; - unsigned int number = 0; + int8_t* outputVectorPtr = outputVector; + const int16_t* inputVectorPtr = inputVector; + unsigned int number = 0; - for(number = 0; number < num_points; number++){ - *outputVectorPtr++ = ((int8_t)(*inputVectorPtr++ >> 8)); - } + for (number = 0; number < num_points; number++) { + *outputVectorPtr++ = ((int8_t)(*inputVectorPtr++ >> 8)); + } } #endif /* LV_HAVE_GENERIC */ diff --git a/kernels/volk/volk_16i_max_star_16i.h b/kernels/volk/volk_16i_max_star_16i.h index 78fd911..d5dad18 100644 --- a/kernels/volk/volk_16i_max_star_16i.h +++ b/kernels/volk/volk_16i_max_star_16i.h @@ -53,67 +53,69 @@ #ifndef INCLUDED_volk_16i_max_star_16i_a_H #define INCLUDED_volk_16i_max_star_16i_a_H -#include -#include +#include +#include #ifdef LV_HAVE_SSSE3 -#include -#include -#include +#include +#include +#include static inline void volk_16i_max_star_16i_a_ssse3(short* target, short* src0, unsigned int num_points) { - const unsigned int num_bytes = num_points*2; + const unsigned int num_bytes = num_points * 2; - short candidate = src0[0]; - short cands[8]; - __m128i xmm0, xmm1, xmm3, xmm4, xmm5, xmm6; + short candidate = src0[0]; + short cands[8]; + __m128i xmm0, xmm1, xmm3, xmm4, xmm5, xmm6; - __m128i *p_src0; + __m128i* p_src0; - p_src0 = (__m128i*)src0; + p_src0 = (__m128i*)src0; - int bound = num_bytes >> 4; - int leftovers = (num_bytes >> 1) & 7; + int bound = num_bytes >> 4; + int leftovers = (num_bytes >> 1) & 7; - int i = 0; + int i = 0; - xmm1 = _mm_setzero_si128(); - xmm0 = _mm_setzero_si128(); - //_mm_insert_epi16(xmm0, candidate, 0); + xmm1 = _mm_setzero_si128(); + xmm0 = _mm_setzero_si128(); + //_mm_insert_epi16(xmm0, candidate, 0); - xmm0 = _mm_shuffle_epi8(xmm0, xmm1); + xmm0 = _mm_shuffle_epi8(xmm0, xmm1); - for(i = 0; i < bound; ++i) { - xmm1 = _mm_load_si128(p_src0); - p_src0 += 1; - //xmm2 = _mm_sub_epi16(xmm1, xmm0); + for (i = 0; i < bound; ++i) { + xmm1 = _mm_load_si128(p_src0); + p_src0 += 1; + // xmm2 = _mm_sub_epi16(xmm1, xmm0); - xmm3 = _mm_cmpgt_epi16(xmm0, xmm1); - xmm4 = _mm_cmpeq_epi16(xmm0, xmm1); - xmm5 = _mm_cmpgt_epi16(xmm1, xmm0); + xmm3 = _mm_cmpgt_epi16(xmm0, xmm1); + xmm4 = _mm_cmpeq_epi16(xmm0, xmm1); + xmm5 = _mm_cmpgt_epi16(xmm1, xmm0); - xmm6 = _mm_xor_si128(xmm4, xmm5); + xmm6 = _mm_xor_si128(xmm4, xmm5); - xmm3 = _mm_and_si128(xmm3, xmm0); - xmm4 = _mm_and_si128(xmm6, xmm1); + xmm3 = _mm_and_si128(xmm3, xmm0); + xmm4 = _mm_and_si128(xmm6, xmm1); - xmm0 = _mm_add_epi16(xmm3, xmm4); - } + xmm0 = _mm_add_epi16(xmm3, xmm4); + } - _mm_store_si128((__m128i*)cands, xmm0); + _mm_store_si128((__m128i*)cands, xmm0); - 
for(i = 0; i < 8; ++i) { - candidate = ((short)(candidate - cands[i]) > 0) ? candidate : cands[i]; - } + for (i = 0; i < 8; ++i) { + candidate = ((short)(candidate - cands[i]) > 0) ? candidate : cands[i]; + } - for(i = 0; i < leftovers; ++i) { - candidate = ((short)(candidate - src0[(bound << 3) + i]) > 0) ? candidate : src0[(bound << 3) + i]; - } + for (i = 0; i < leftovers; ++i) { + candidate = ((short)(candidate - src0[(bound << 3) + i]) > 0) + ? candidate + : src0[(bound << 3) + i]; + } - target[0] = candidate; + target[0] = candidate; } #endif /*LV_HAVE_SSSE3*/ @@ -124,38 +126,38 @@ volk_16i_max_star_16i_a_ssse3(short* target, short* src0, unsigned int num_point static inline void volk_16i_max_star_16i_neon(short* target, short* src0, unsigned int num_points) { - const unsigned int eighth_points = num_points / 8; - unsigned number; - int16x8_t input_vec; - int16x8_t diff, zeros; - uint16x8_t comp1, comp2; - zeros = vdupq_n_s16(0); - - int16x8x2_t tmpvec; - - int16x8_t candidate_vec = vld1q_dup_s16(src0 ); - short candidate; - ++src0; - - for(number=0; number < eighth_points; ++number) { - input_vec = vld1q_s16(src0); - __VOLK_PREFETCH(src0+16); - diff = vsubq_s16(candidate_vec, input_vec); - comp1 = vcgeq_s16(diff, zeros); - comp2 = vcltq_s16(diff, zeros); - - tmpvec.val[0] = vandq_s16(candidate_vec, (int16x8_t)comp1); - tmpvec.val[1] = vandq_s16(input_vec, (int16x8_t)comp2); - - candidate_vec = vaddq_s16(tmpvec.val[0], tmpvec.val[1]); - src0 += 8; - } - vst1q_s16(&candidate, candidate_vec); - - for(number=0; number < num_points%8; number++) { - candidate = ((int16_t)(candidate - src0[number]) > 0) ? candidate : src0[number]; - } - target[0] = candidate; + const unsigned int eighth_points = num_points / 8; + unsigned number; + int16x8_t input_vec; + int16x8_t diff, zeros; + uint16x8_t comp1, comp2; + zeros = vdupq_n_s16(0); + + int16x8x2_t tmpvec; + + int16x8_t candidate_vec = vld1q_dup_s16(src0); + short candidate; + ++src0; + + for (number = 0; number < eighth_points; ++number) { + input_vec = vld1q_s16(src0); + __VOLK_PREFETCH(src0 + 16); + diff = vsubq_s16(candidate_vec, input_vec); + comp1 = vcgeq_s16(diff, zeros); + comp2 = vcltq_s16(diff, zeros); + + tmpvec.val[0] = vandq_s16(candidate_vec, (int16x8_t)comp1); + tmpvec.val[1] = vandq_s16(input_vec, (int16x8_t)comp2); + + candidate_vec = vaddq_s16(tmpvec.val[0], tmpvec.val[1]); + src0 += 8; + } + vst1q_s16(&candidate, candidate_vec); + + for (number = 0; number < num_points % 8; number++) { + candidate = ((int16_t)(candidate - src0[number]) > 0) ? candidate : src0[number]; + } + target[0] = candidate; } #endif /*LV_HAVE_NEON*/ @@ -164,17 +166,17 @@ volk_16i_max_star_16i_neon(short* target, short* src0, unsigned int num_points) static inline void volk_16i_max_star_16i_generic(short* target, short* src0, unsigned int num_points) { - const unsigned int num_bytes = num_points*2; + const unsigned int num_bytes = num_points * 2; - int i = 0; + int i = 0; - int bound = num_bytes >> 1; + int bound = num_bytes >> 1; - short candidate = src0[0]; - for(i = 1; i < bound; ++i) { - candidate = ((short)(candidate - src0[i]) > 0) ? candidate : src0[i]; - } - target[0] = candidate; + short candidate = src0[0]; + for (i = 1; i < bound; ++i) { + candidate = ((short)(candidate - src0[i]) > 0) ? 
candidate : src0[i]; + } + target[0] = candidate; } #endif /*LV_HAVE_GENERIC*/ diff --git a/kernels/volk/volk_16i_max_star_horizontal_16i.h b/kernels/volk/volk_16i_max_star_horizontal_16i.h index 4ffe264..2e1f52b 100644 --- a/kernels/volk/volk_16i_max_star_horizontal_16i.h +++ b/kernels/volk/volk_16i_max_star_horizontal_16i.h @@ -29,8 +29,8 @@ * * Dispatcher Prototype * \code - * void volk_16i_max_star_horizontal_16i(short* target, short* src0, unsigned int num_points); - * \endcode + * void volk_16i_max_star_horizontal_16i(short* target, short* src0, unsigned int + * num_points); \endcode * * \b Inputs * \li src0: The input vector. @@ -55,102 +55,113 @@ #include -#include -#include +#include +#include #ifdef LV_HAVE_SSSE3 -#include -#include -#include +#include +#include +#include -static inline void -volk_16i_max_star_horizontal_16i_a_ssse3(int16_t* target, int16_t* src0, unsigned int num_points) +static inline void volk_16i_max_star_horizontal_16i_a_ssse3(int16_t* target, + int16_t* src0, + unsigned int num_points) { - const unsigned int num_bytes = num_points*2; + const unsigned int num_bytes = num_points * 2; - static const uint8_t shufmask0[16] = {0x00, 0x01, 0x04, 0x05, 0x08, 0x09, 0x0c, 0x0d, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}; - static const uint8_t shufmask1[16] = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, - 0x01, 0x04, 0x05, 0x08, 0x09, 0x0c, 0x0d}; - static const uint8_t andmask0[16] = {0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,0x02, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}; - static const uint8_t andmask1[16] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, - 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02}; + static const uint8_t shufmask0[16] = { + 0x00, 0x01, 0x04, 0x05, 0x08, 0x09, 0x0c, 0x0d, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff + }; + static const uint8_t shufmask1[16] = { + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x01, 0x04, 0x05, 0x08, 0x09, 0x0c, 0x0d + }; + static const uint8_t andmask0[16] = { + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 + }; + static const uint8_t andmask1[16] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02 + }; - __m128i xmm0, xmm1, xmm2, xmm3, xmm4; - __m128i xmm5, xmm6, xmm7, xmm8; + __m128i xmm0, xmm1, xmm2, xmm3, xmm4; + __m128i xmm5, xmm6, xmm7, xmm8; - xmm4 = _mm_load_si128((__m128i*)shufmask0); - xmm5 = _mm_load_si128((__m128i*)shufmask1); - xmm6 = _mm_load_si128((__m128i*)andmask0); - xmm7 = _mm_load_si128((__m128i*)andmask1); + xmm4 = _mm_load_si128((__m128i*)shufmask0); + xmm5 = _mm_load_si128((__m128i*)shufmask1); + xmm6 = _mm_load_si128((__m128i*)andmask0); + xmm7 = _mm_load_si128((__m128i*)andmask1); - __m128i *p_target, *p_src0; + __m128i *p_target, *p_src0; - p_target = (__m128i*)target; - p_src0 = (__m128i*)src0; + p_target = (__m128i*)target; + p_src0 = (__m128i*)src0; - int bound = num_bytes >> 5; - int intermediate = (num_bytes >> 4) & 1; - int leftovers = (num_bytes >> 1) & 7; + int bound = num_bytes >> 5; + int intermediate = (num_bytes >> 4) & 1; + int leftovers = (num_bytes >> 1) & 7; - int i = 0; + int i = 0; - for(i = 0; i < bound; ++i) { - xmm0 = _mm_load_si128(p_src0); - xmm1 = _mm_load_si128(&p_src0[1]); + for (i = 0; i < bound; ++i) { + xmm0 = _mm_load_si128(p_src0); + xmm1 = _mm_load_si128(&p_src0[1]); - xmm2 = _mm_xor_si128(xmm2, xmm2); - p_src0 += 2; + xmm2 = _mm_xor_si128(xmm2, xmm2); + p_src0 += 2; - xmm3 = _mm_hsub_epi16(xmm0, xmm1); + xmm3 = 
_mm_hsub_epi16(xmm0, xmm1); - xmm2 = _mm_cmpgt_epi16(xmm2, xmm3); + xmm2 = _mm_cmpgt_epi16(xmm2, xmm3); - xmm8 = _mm_and_si128(xmm2, xmm6); - xmm3 = _mm_and_si128(xmm2, xmm7); + xmm8 = _mm_and_si128(xmm2, xmm6); + xmm3 = _mm_and_si128(xmm2, xmm7); - xmm8 = _mm_add_epi8(xmm8, xmm4); - xmm3 = _mm_add_epi8(xmm3, xmm5); + xmm8 = _mm_add_epi8(xmm8, xmm4); + xmm3 = _mm_add_epi8(xmm3, xmm5); - xmm0 = _mm_shuffle_epi8(xmm0, xmm8); - xmm1 = _mm_shuffle_epi8(xmm1, xmm3); + xmm0 = _mm_shuffle_epi8(xmm0, xmm8); + xmm1 = _mm_shuffle_epi8(xmm1, xmm3); - xmm3 = _mm_add_epi16(xmm0, xmm1); + xmm3 = _mm_add_epi16(xmm0, xmm1); - _mm_store_si128(p_target, xmm3); + _mm_store_si128(p_target, xmm3); - p_target += 1; - } + p_target += 1; + } - if (intermediate) { - xmm0 = _mm_load_si128(p_src0); + if (intermediate) { + xmm0 = _mm_load_si128(p_src0); - xmm2 = _mm_xor_si128(xmm2, xmm2); - p_src0 += 1; + xmm2 = _mm_xor_si128(xmm2, xmm2); + p_src0 += 1; - xmm3 = _mm_hsub_epi16(xmm0, xmm1); - xmm2 = _mm_cmpgt_epi16(xmm2, xmm3); + xmm3 = _mm_hsub_epi16(xmm0, xmm1); + xmm2 = _mm_cmpgt_epi16(xmm2, xmm3); - xmm8 = _mm_and_si128(xmm2, xmm6); + xmm8 = _mm_and_si128(xmm2, xmm6); - xmm3 = _mm_add_epi8(xmm8, xmm4); + xmm3 = _mm_add_epi8(xmm8, xmm4); - xmm0 = _mm_shuffle_epi8(xmm0, xmm3); + xmm0 = _mm_shuffle_epi8(xmm0, xmm3); - _mm_storel_pd((double*)p_target, bit128_p(&xmm0)->double_vec); + _mm_storel_pd((double*)p_target, bit128_p(&xmm0)->double_vec); - p_target = (__m128i*)((int8_t*)p_target + 8); - } + p_target = (__m128i*)((int8_t*)p_target + 8); + } - for(i = (bound << 4) + (intermediate << 3); i < (bound << 4) + (intermediate << 3) + leftovers ; i += 2) { - target[i>>1] = ((int16_t)(src0[i] - src0[i + 1]) > 0) ? src0[i] : src0[i + 1]; - } + for (i = (bound << 4) + (intermediate << 3); + i < (bound << 4) + (intermediate << 3) + leftovers; + i += 2) { + target[i >> 1] = ((int16_t)(src0[i] - src0[i + 1]) > 0) ? src0[i] : src0[i + 1]; + } } #endif /*LV_HAVE_SSSE3*/ @@ -158,54 +169,59 @@ volk_16i_max_star_horizontal_16i_a_ssse3(int16_t* target, int16_t* src0, unsigne #ifdef LV_HAVE_NEON #include -static inline void -volk_16i_max_star_horizontal_16i_neon(int16_t* target, int16_t* src0, unsigned int num_points) +static inline void volk_16i_max_star_horizontal_16i_neon(int16_t* target, + int16_t* src0, + unsigned int num_points) { - const unsigned int eighth_points = num_points / 16; - unsigned number; - int16x8x2_t input_vec; - int16x8_t diff, max_vec, zeros; - uint16x8_t comp1, comp2; - zeros = vdupq_n_s16(0); - for(number=0; number < eighth_points; ++number) { - input_vec = vld2q_s16(src0); - //__VOLK_PREFETCH(src0+16); - diff = vsubq_s16(input_vec.val[0], input_vec.val[1]); - comp1 = vcgeq_s16(diff, zeros); - comp2 = vcltq_s16(diff, zeros); - - input_vec.val[0] = vandq_s16(input_vec.val[0], (int16x8_t)comp1); - input_vec.val[1] = vandq_s16(input_vec.val[1], (int16x8_t)comp2); - - max_vec = vaddq_s16(input_vec.val[0], input_vec.val[1]); - vst1q_s16(target, max_vec); - src0 += 16; - target += 8; - } - for(number=0; number < num_points%16; number+=2) { - target[number >> 1] = ((int16_t)(src0[number] - src0[number + 1]) > 0) ? 
src0[number] : src0[number+1]; - } - + const unsigned int eighth_points = num_points / 16; + unsigned number; + int16x8x2_t input_vec; + int16x8_t diff, max_vec, zeros; + uint16x8_t comp1, comp2; + zeros = vdupq_n_s16(0); + for (number = 0; number < eighth_points; ++number) { + input_vec = vld2q_s16(src0); + //__VOLK_PREFETCH(src0+16); + diff = vsubq_s16(input_vec.val[0], input_vec.val[1]); + comp1 = vcgeq_s16(diff, zeros); + comp2 = vcltq_s16(diff, zeros); + + input_vec.val[0] = vandq_s16(input_vec.val[0], (int16x8_t)comp1); + input_vec.val[1] = vandq_s16(input_vec.val[1], (int16x8_t)comp2); + + max_vec = vaddq_s16(input_vec.val[0], input_vec.val[1]); + vst1q_s16(target, max_vec); + src0 += 16; + target += 8; + } + for (number = 0; number < num_points % 16; number += 2) { + target[number >> 1] = ((int16_t)(src0[number] - src0[number + 1]) > 0) + ? src0[number] + : src0[number + 1]; + } } #endif /* LV_HAVE_NEON */ #ifdef LV_HAVE_NEONV7 -extern void volk_16i_max_star_horizontal_16i_a_neonasm(int16_t* target, int16_t* src0, unsigned int num_points); +extern void volk_16i_max_star_horizontal_16i_a_neonasm(int16_t* target, + int16_t* src0, + unsigned int num_points); #endif /* LV_HAVE_NEONV7 */ #ifdef LV_HAVE_GENERIC -static inline void -volk_16i_max_star_horizontal_16i_generic(int16_t* target, int16_t* src0, unsigned int num_points) +static inline void volk_16i_max_star_horizontal_16i_generic(int16_t* target, + int16_t* src0, + unsigned int num_points) { - const unsigned int num_bytes = num_points*2; + const unsigned int num_bytes = num_points * 2; - int i = 0; + int i = 0; - int bound = num_bytes >> 1; + int bound = num_bytes >> 1; - for(i = 0; i < bound; i += 2) { - target[i >> 1] = ((int16_t) (src0[i] - src0[i + 1]) > 0) ? src0[i] : src0[i+1]; - } + for (i = 0; i < bound; i += 2) { + target[i >> 1] = ((int16_t)(src0[i] - src0[i + 1]) > 0) ? src0[i] : src0[i + 1]; + } } #endif /*LV_HAVE_GENERIC*/ diff --git a/kernels/volk/volk_16i_permute_and_scalar_add.h b/kernels/volk/volk_16i_permute_and_scalar_add.h index 7fcdad3..0563f07 100644 --- a/kernels/volk/volk_16i_permute_and_scalar_add.h +++ b/kernels/volk/volk_16i_permute_and_scalar_add.h @@ -29,8 +29,9 @@ * * Dispatcher Prototype * \code - * void volk_16i_permute_and_scalar_add(short* target, short* src0, short* permute_indexes, short* cntl0, short* cntl1, short* cntl2, short* cntl3, short* scalars, unsigned int num_points) - * \endcode + * void volk_16i_permute_and_scalar_add(short* target, short* src0, short* + * permute_indexes, short* cntl0, short* cntl1, short* cntl2, short* cntl3, short* + * scalars, unsigned int num_points) \endcode * * \b Inputs * \li src0: The input vector. 
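/*
 * A hedged scalar picture of volk_16i_permute_and_scalar_add, matching the
 * generic kernel in the hunk below: gather src0 through a permutation table,
 * then add each of the four scalars masked by its control vector.  The
 * helper name is illustrative only.
 */
static inline void sketch_permute_and_scalar_add(short* target,
                                                 const short* src0,
                                                 const short* permute_indexes,
                                                 const short* cntl0,
                                                 const short* cntl1,
                                                 const short* cntl2,
                                                 const short* cntl3,
                                                 const short* scalars,
                                                 unsigned int num_points)
{
    for (unsigned int i = 0; i < num_points; i++) {
        target[i] = src0[permute_indexes[i]] + (cntl0[i] & scalars[0]) +
                    (cntl1[i] & scalars[1]) + (cntl2[i] & scalars[2]) +
                    (cntl3[i] & scalars[3]);
    }
}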
@@ -58,137 +59,143 @@ #ifndef INCLUDED_volk_16i_permute_and_scalar_add_a_H #define INCLUDED_volk_16i_permute_and_scalar_add_a_H -#include -#include +#include +#include #ifdef LV_HAVE_SSE2 -#include -#include - -static inline void -volk_16i_permute_and_scalar_add_a_sse2(short* target, short* src0, short* permute_indexes, - short* cntl0, short* cntl1, short* cntl2, short* cntl3, - short* scalars, unsigned int num_points) +#include +#include + +static inline void volk_16i_permute_and_scalar_add_a_sse2(short* target, + short* src0, + short* permute_indexes, + short* cntl0, + short* cntl1, + short* cntl2, + short* cntl3, + short* scalars, + unsigned int num_points) { - const unsigned int num_bytes = num_points*2; + const unsigned int num_bytes = num_points * 2; - __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; + __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; - __m128i *p_target, *p_cntl0, *p_cntl1, *p_cntl2, *p_cntl3, *p_scalars; + __m128i *p_target, *p_cntl0, *p_cntl1, *p_cntl2, *p_cntl3, *p_scalars; - short* p_permute_indexes = permute_indexes; + short* p_permute_indexes = permute_indexes; - p_target = (__m128i*)target; - p_cntl0 = (__m128i*)cntl0; - p_cntl1 = (__m128i*)cntl1; - p_cntl2 = (__m128i*)cntl2; - p_cntl3 = (__m128i*)cntl3; - p_scalars = (__m128i*)scalars; + p_target = (__m128i*)target; + p_cntl0 = (__m128i*)cntl0; + p_cntl1 = (__m128i*)cntl1; + p_cntl2 = (__m128i*)cntl2; + p_cntl3 = (__m128i*)cntl3; + p_scalars = (__m128i*)scalars; - int i = 0; + int i = 0; - int bound = (num_bytes >> 4); - int leftovers = (num_bytes >> 1) & 7; + int bound = (num_bytes >> 4); + int leftovers = (num_bytes >> 1) & 7; - xmm0 = _mm_load_si128(p_scalars); + xmm0 = _mm_load_si128(p_scalars); - xmm1 = _mm_shufflelo_epi16(xmm0, 0); - xmm2 = _mm_shufflelo_epi16(xmm0, 0x55); - xmm3 = _mm_shufflelo_epi16(xmm0, 0xaa); - xmm4 = _mm_shufflelo_epi16(xmm0, 0xff); + xmm1 = _mm_shufflelo_epi16(xmm0, 0); + xmm2 = _mm_shufflelo_epi16(xmm0, 0x55); + xmm3 = _mm_shufflelo_epi16(xmm0, 0xaa); + xmm4 = _mm_shufflelo_epi16(xmm0, 0xff); - xmm1 = _mm_shuffle_epi32(xmm1, 0x00); - xmm2 = _mm_shuffle_epi32(xmm2, 0x00); - xmm3 = _mm_shuffle_epi32(xmm3, 0x00); - xmm4 = _mm_shuffle_epi32(xmm4, 0x00); + xmm1 = _mm_shuffle_epi32(xmm1, 0x00); + xmm2 = _mm_shuffle_epi32(xmm2, 0x00); + xmm3 = _mm_shuffle_epi32(xmm3, 0x00); + xmm4 = _mm_shuffle_epi32(xmm4, 0x00); - for(; i < bound; ++i) { - xmm0 = _mm_setzero_si128(); - xmm5 = _mm_setzero_si128(); - xmm6 = _mm_setzero_si128(); - xmm7 = _mm_setzero_si128(); + for (; i < bound; ++i) { + xmm0 = _mm_setzero_si128(); + xmm5 = _mm_setzero_si128(); + xmm6 = _mm_setzero_si128(); + xmm7 = _mm_setzero_si128(); - xmm0 = _mm_insert_epi16(xmm0, src0[p_permute_indexes[0]], 0); - xmm5 = _mm_insert_epi16(xmm5, src0[p_permute_indexes[1]], 1); - xmm6 = _mm_insert_epi16(xmm6, src0[p_permute_indexes[2]], 2); - xmm7 = _mm_insert_epi16(xmm7, src0[p_permute_indexes[3]], 3); - xmm0 = _mm_insert_epi16(xmm0, src0[p_permute_indexes[4]], 4); - xmm5 = _mm_insert_epi16(xmm5, src0[p_permute_indexes[5]], 5); - xmm6 = _mm_insert_epi16(xmm6, src0[p_permute_indexes[6]], 6); - xmm7 = _mm_insert_epi16(xmm7, src0[p_permute_indexes[7]], 7); + xmm0 = _mm_insert_epi16(xmm0, src0[p_permute_indexes[0]], 0); + xmm5 = _mm_insert_epi16(xmm5, src0[p_permute_indexes[1]], 1); + xmm6 = _mm_insert_epi16(xmm6, src0[p_permute_indexes[2]], 2); + xmm7 = _mm_insert_epi16(xmm7, src0[p_permute_indexes[3]], 3); + xmm0 = _mm_insert_epi16(xmm0, src0[p_permute_indexes[4]], 4); + xmm5 = _mm_insert_epi16(xmm5, 
src0[p_permute_indexes[5]], 5); + xmm6 = _mm_insert_epi16(xmm6, src0[p_permute_indexes[6]], 6); + xmm7 = _mm_insert_epi16(xmm7, src0[p_permute_indexes[7]], 7); - xmm0 = _mm_add_epi16(xmm0, xmm5); - xmm6 = _mm_add_epi16(xmm6, xmm7); + xmm0 = _mm_add_epi16(xmm0, xmm5); + xmm6 = _mm_add_epi16(xmm6, xmm7); - p_permute_indexes += 8; + p_permute_indexes += 8; - xmm0 = _mm_add_epi16(xmm0, xmm6); + xmm0 = _mm_add_epi16(xmm0, xmm6); - xmm5 = _mm_load_si128(p_cntl0); - xmm6 = _mm_load_si128(p_cntl1); - xmm7 = _mm_load_si128(p_cntl2); + xmm5 = _mm_load_si128(p_cntl0); + xmm6 = _mm_load_si128(p_cntl1); + xmm7 = _mm_load_si128(p_cntl2); - xmm5 = _mm_and_si128(xmm5, xmm1); - xmm6 = _mm_and_si128(xmm6, xmm2); - xmm7 = _mm_and_si128(xmm7, xmm3); + xmm5 = _mm_and_si128(xmm5, xmm1); + xmm6 = _mm_and_si128(xmm6, xmm2); + xmm7 = _mm_and_si128(xmm7, xmm3); - xmm0 = _mm_add_epi16(xmm0, xmm5); + xmm0 = _mm_add_epi16(xmm0, xmm5); - xmm5 = _mm_load_si128(p_cntl3); + xmm5 = _mm_load_si128(p_cntl3); - xmm6 = _mm_add_epi16(xmm6, xmm7); + xmm6 = _mm_add_epi16(xmm6, xmm7); - p_cntl0 += 1; + p_cntl0 += 1; - xmm5 = _mm_and_si128(xmm5, xmm4); + xmm5 = _mm_and_si128(xmm5, xmm4); - xmm0 = _mm_add_epi16(xmm0, xmm6); + xmm0 = _mm_add_epi16(xmm0, xmm6); - p_cntl1 += 1; - p_cntl2 += 1; + p_cntl1 += 1; + p_cntl2 += 1; - xmm0 = _mm_add_epi16(xmm0, xmm5); + xmm0 = _mm_add_epi16(xmm0, xmm5); - p_cntl3 += 1; + p_cntl3 += 1; - _mm_store_si128(p_target, xmm0); + _mm_store_si128(p_target, xmm0); - p_target += 1; - } + p_target += 1; + } - for(i = bound * 8; i < (bound * 8) + leftovers; ++i) { - target[i] = src0[permute_indexes[i]] - + (cntl0[i] & scalars[0]) - + (cntl1[i] & scalars[1]) - + (cntl2[i] & scalars[2]) - + (cntl3[i] & scalars[3]); - } + for (i = bound * 8; i < (bound * 8) + leftovers; ++i) { + target[i] = src0[permute_indexes[i]] + (cntl0[i] & scalars[0]) + + (cntl1[i] & scalars[1]) + (cntl2[i] & scalars[2]) + + (cntl3[i] & scalars[3]); + } } #endif /*LV_HAVE_SSE*/ #ifdef LV_HAVE_GENERIC -static inline void -volk_16i_permute_and_scalar_add_generic(short* target, short* src0, short* permute_indexes, - short* cntl0, short* cntl1, short* cntl2, short* cntl3, - short* scalars, unsigned int num_points) +static inline void volk_16i_permute_and_scalar_add_generic(short* target, + short* src0, + short* permute_indexes, + short* cntl0, + short* cntl1, + short* cntl2, + short* cntl3, + short* scalars, + unsigned int num_points) { - const unsigned int num_bytes = num_points*2; + const unsigned int num_bytes = num_points * 2; - int i = 0; + int i = 0; - int bound = num_bytes >> 1; + int bound = num_bytes >> 1; - for(i = 0; i < bound; ++i) { - target[i] = src0[permute_indexes[i]] - + (cntl0[i] & scalars[0]) - + (cntl1[i] & scalars[1]) - + (cntl2[i] & scalars[2]) - + (cntl3[i] & scalars[3]); - } + for (i = 0; i < bound; ++i) { + target[i] = src0[permute_indexes[i]] + (cntl0[i] & scalars[0]) + + (cntl1[i] & scalars[1]) + (cntl2[i] & scalars[2]) + + (cntl3[i] & scalars[3]); + } } #endif /*LV_HAVE_GENERIC*/ diff --git a/kernels/volk/volk_16i_s32f_convert_32f.h b/kernels/volk/volk_16i_s32f_convert_32f.h index 38ea6f5..3fd3a77 100644 --- a/kernels/volk/volk_16i_s32f_convert_32f.h +++ b/kernels/volk/volk_16i_s32f_convert_32f.h @@ -29,8 +29,8 @@ * * Dispatcher Prototype * \code - * void volk_16i_s32f_convert_32f(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points); - * \endcode + * void volk_16i_s32f_convert_32f(float* outputVector, const int16_t* inputVector, const + * float scalar, unsigned int 
num_points); \endcode * * \b Inputs * \li inputVector: The input vector of 16-bit shorts. @@ -60,238 +60,247 @@ #ifdef LV_HAVE_AVX2 #include -static inline void -volk_16i_s32f_convert_32f_u_avx2(float* outputVector, const int16_t* inputVector, - const float scalar, unsigned int num_points) +static inline void volk_16i_s32f_convert_32f_u_avx2(float* outputVector, + const int16_t* inputVector, + const float scalar, + unsigned int num_points) { - unsigned int number = 0; - const unsigned int eighthPoints = num_points / 8; + unsigned int number = 0; + const unsigned int eighthPoints = num_points / 8; - float* outputVectorPtr = outputVector; - __m256 invScalar = _mm256_set1_ps(1.0/scalar); - int16_t* inputPtr = (int16_t*)inputVector; - __m128i inputVal; - __m256i inputVal2; - __m256 ret; + float* outputVectorPtr = outputVector; + __m256 invScalar = _mm256_set1_ps(1.0 / scalar); + int16_t* inputPtr = (int16_t*)inputVector; + __m128i inputVal; + __m256i inputVal2; + __m256 ret; - for(;number < eighthPoints; number++){ + for (; number < eighthPoints; number++) { - // Load the 8 values - inputVal = _mm_loadu_si128((__m128i*)inputPtr); + // Load the 8 values + inputVal = _mm_loadu_si128((__m128i*)inputPtr); - // Convert - inputVal2 = _mm256_cvtepi16_epi32(inputVal); + // Convert + inputVal2 = _mm256_cvtepi16_epi32(inputVal); - ret = _mm256_cvtepi32_ps(inputVal2); - ret = _mm256_mul_ps(ret, invScalar); + ret = _mm256_cvtepi32_ps(inputVal2); + ret = _mm256_mul_ps(ret, invScalar); - _mm256_storeu_ps(outputVectorPtr, ret); + _mm256_storeu_ps(outputVectorPtr, ret); - outputVectorPtr += 8; + outputVectorPtr += 8; - inputPtr += 8; - } + inputPtr += 8; + } - number = eighthPoints * 8; - for(; number < num_points; number++){ - outputVector[number] =((float)(inputVector[number])) / scalar; - } + number = eighthPoints * 8; + for (; number < num_points; number++) { + outputVector[number] = ((float)(inputVector[number])) / scalar; + } } #endif /* LV_HAVE_AVX2 */ #ifdef LV_HAVE_AVX #include -static inline void -volk_16i_s32f_convert_32f_u_avx(float* outputVector, const int16_t* inputVector, - const float scalar, unsigned int num_points) +static inline void volk_16i_s32f_convert_32f_u_avx(float* outputVector, + const int16_t* inputVector, + const float scalar, + unsigned int num_points) { - unsigned int number = 0; - const unsigned int eighthPoints = num_points / 8; + unsigned int number = 0; + const unsigned int eighthPoints = num_points / 8; - float* outputVectorPtr = outputVector; - __m128 invScalar = _mm_set_ps1(1.0/scalar); - int16_t* inputPtr = (int16_t*)inputVector; - __m128i inputVal, inputVal2; - __m128 ret; - __m256 output; - __m256 dummy = _mm256_setzero_ps(); + float* outputVectorPtr = outputVector; + __m128 invScalar = _mm_set_ps1(1.0 / scalar); + int16_t* inputPtr = (int16_t*)inputVector; + __m128i inputVal, inputVal2; + __m128 ret; + __m256 output; + __m256 dummy = _mm256_setzero_ps(); - for(;number < eighthPoints; number++){ + for (; number < eighthPoints; number++) { - // Load the 8 values - //inputVal = _mm_loadu_si128((__m128i*)inputPtr); - inputVal = _mm_loadu_si128((__m128i*)inputPtr); + // Load the 8 values + // inputVal = _mm_loadu_si128((__m128i*)inputPtr); + inputVal = _mm_loadu_si128((__m128i*)inputPtr); - // Shift the input data to the right by 64 bits ( 8 bytes ) - inputVal2 = _mm_srli_si128(inputVal, 8); + // Shift the input data to the right by 64 bits ( 8 bytes ) + inputVal2 = _mm_srli_si128(inputVal, 8); - // Convert the lower 4 values into 32 bit words - inputVal = 
_mm_cvtepi16_epi32(inputVal); - inputVal2 = _mm_cvtepi16_epi32(inputVal2); + // Convert the lower 4 values into 32 bit words + inputVal = _mm_cvtepi16_epi32(inputVal); + inputVal2 = _mm_cvtepi16_epi32(inputVal2); - ret = _mm_cvtepi32_ps(inputVal); - ret = _mm_mul_ps(ret, invScalar); - output = _mm256_insertf128_ps(dummy, ret, 0); + ret = _mm_cvtepi32_ps(inputVal); + ret = _mm_mul_ps(ret, invScalar); + output = _mm256_insertf128_ps(dummy, ret, 0); - ret = _mm_cvtepi32_ps(inputVal2); - ret = _mm_mul_ps(ret, invScalar); - output = _mm256_insertf128_ps(output, ret, 1); + ret = _mm_cvtepi32_ps(inputVal2); + ret = _mm_mul_ps(ret, invScalar); + output = _mm256_insertf128_ps(output, ret, 1); - _mm256_storeu_ps(outputVectorPtr, output); + _mm256_storeu_ps(outputVectorPtr, output); - outputVectorPtr += 8; + outputVectorPtr += 8; - inputPtr += 8; - } + inputPtr += 8; + } - number = eighthPoints * 8; - for(; number < num_points; number++){ - outputVector[number] =((float)(inputVector[number])) / scalar; - } + number = eighthPoints * 8; + for (; number < num_points; number++) { + outputVector[number] = ((float)(inputVector[number])) / scalar; + } } #endif /* LV_HAVE_AVX */ #ifdef LV_HAVE_SSE4_1 #include -static inline void -volk_16i_s32f_convert_32f_u_sse4_1(float* outputVector, const int16_t* inputVector, - const float scalar, unsigned int num_points) +static inline void volk_16i_s32f_convert_32f_u_sse4_1(float* outputVector, + const int16_t* inputVector, + const float scalar, + unsigned int num_points) { - unsigned int number = 0; - const unsigned int eighthPoints = num_points / 8; + unsigned int number = 0; + const unsigned int eighthPoints = num_points / 8; - float* outputVectorPtr = outputVector; - __m128 invScalar = _mm_set_ps1(1.0/scalar); - int16_t* inputPtr = (int16_t*)inputVector; - __m128i inputVal; - __m128i inputVal2; - __m128 ret; + float* outputVectorPtr = outputVector; + __m128 invScalar = _mm_set_ps1(1.0 / scalar); + int16_t* inputPtr = (int16_t*)inputVector; + __m128i inputVal; + __m128i inputVal2; + __m128 ret; - for(;number < eighthPoints; number++){ + for (; number < eighthPoints; number++) { - // Load the 8 values - inputVal = _mm_loadu_si128((__m128i*)inputPtr); + // Load the 8 values + inputVal = _mm_loadu_si128((__m128i*)inputPtr); - // Shift the input data to the right by 64 bits ( 8 bytes ) - inputVal2 = _mm_srli_si128(inputVal, 8); + // Shift the input data to the right by 64 bits ( 8 bytes ) + inputVal2 = _mm_srli_si128(inputVal, 8); - // Convert the lower 4 values into 32 bit words - inputVal = _mm_cvtepi16_epi32(inputVal); - inputVal2 = _mm_cvtepi16_epi32(inputVal2); + // Convert the lower 4 values into 32 bit words + inputVal = _mm_cvtepi16_epi32(inputVal); + inputVal2 = _mm_cvtepi16_epi32(inputVal2); - ret = _mm_cvtepi32_ps(inputVal); - ret = _mm_mul_ps(ret, invScalar); - _mm_storeu_ps(outputVectorPtr, ret); - outputVectorPtr += 4; + ret = _mm_cvtepi32_ps(inputVal); + ret = _mm_mul_ps(ret, invScalar); + _mm_storeu_ps(outputVectorPtr, ret); + outputVectorPtr += 4; - ret = _mm_cvtepi32_ps(inputVal2); - ret = _mm_mul_ps(ret, invScalar); - _mm_storeu_ps(outputVectorPtr, ret); + ret = _mm_cvtepi32_ps(inputVal2); + ret = _mm_mul_ps(ret, invScalar); + _mm_storeu_ps(outputVectorPtr, ret); - outputVectorPtr += 4; + outputVectorPtr += 4; - inputPtr += 8; - } + inputPtr += 8; + } - number = eighthPoints * 8; - for(; number < num_points; number++){ - outputVector[number] =((float)(inputVector[number])) / scalar; - } + number = eighthPoints * 8; + for (; number < num_points; 
number++) { + outputVector[number] = ((float)(inputVector[number])) / scalar; + } } #endif /* LV_HAVE_SSE4_1 */ #ifdef LV_HAVE_SSE #include -static inline void -volk_16i_s32f_convert_32f_u_sse(float* outputVector, const int16_t* inputVector, - const float scalar, unsigned int num_points) +static inline void volk_16i_s32f_convert_32f_u_sse(float* outputVector, + const int16_t* inputVector, + const float scalar, + unsigned int num_points) { - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - float* outputVectorPtr = outputVector; - __m128 invScalar = _mm_set_ps1(1.0/scalar); - int16_t* inputPtr = (int16_t*)inputVector; - __m128 ret; - - for(;number < quarterPoints; number++){ - ret = _mm_set_ps((float)(inputPtr[3]), (float)(inputPtr[2]), (float)(inputPtr[1]), (float)(inputPtr[0])); - - ret = _mm_mul_ps(ret, invScalar); - _mm_storeu_ps(outputVectorPtr, ret); - - inputPtr += 4; - outputVectorPtr += 4; - } - - number = quarterPoints * 4; - for(; number < num_points; number++){ - outputVector[number] = (float)(inputVector[number]) / scalar; - } + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + float* outputVectorPtr = outputVector; + __m128 invScalar = _mm_set_ps1(1.0 / scalar); + int16_t* inputPtr = (int16_t*)inputVector; + __m128 ret; + + for (; number < quarterPoints; number++) { + ret = _mm_set_ps((float)(inputPtr[3]), + (float)(inputPtr[2]), + (float)(inputPtr[1]), + (float)(inputPtr[0])); + + ret = _mm_mul_ps(ret, invScalar); + _mm_storeu_ps(outputVectorPtr, ret); + + inputPtr += 4; + outputVectorPtr += 4; + } + + number = quarterPoints * 4; + for (; number < num_points; number++) { + outputVector[number] = (float)(inputVector[number]) / scalar; + } } #endif /* LV_HAVE_SSE */ #ifdef LV_HAVE_GENERIC -static inline void -volk_16i_s32f_convert_32f_generic(float* outputVector, const int16_t* inputVector, - const float scalar, unsigned int num_points) +static inline void volk_16i_s32f_convert_32f_generic(float* outputVector, + const int16_t* inputVector, + const float scalar, + unsigned int num_points) { - float* outputVectorPtr = outputVector; - const int16_t* inputVectorPtr = inputVector; - unsigned int number = 0; + float* outputVectorPtr = outputVector; + const int16_t* inputVectorPtr = inputVector; + unsigned int number = 0; - for(number = 0; number < num_points; number++){ - *outputVectorPtr++ = ((float)(*inputVectorPtr++)) / scalar; - } + for (number = 0; number < num_points; number++) { + *outputVectorPtr++ = ((float)(*inputVectorPtr++)) / scalar; + } } #endif /* LV_HAVE_GENERIC */ #ifdef LV_HAVE_NEON #include -static inline void -volk_16i_s32f_convert_32f_neon(float* outputVector, const int16_t* inputVector, - const float scalar, unsigned int num_points) +static inline void volk_16i_s32f_convert_32f_neon(float* outputVector, + const int16_t* inputVector, + const float scalar, + unsigned int num_points) { - float* outputPtr = outputVector; - const int16_t* inputPtr = inputVector; - unsigned int number = 0; - unsigned int eighth_points = num_points / 8; - - int16x4x2_t input16; - int32x4_t input32_0, input32_1; - float32x4_t input_float_0, input_float_1; - float32x4x2_t output_float; - float32x4_t inv_scale; - - inv_scale = vdupq_n_f32(1.0/scalar); - - // the generic disassembles to a 128-bit load - // and duplicates every instruction to operate on 64-bits - // at a time. This is only possible with lanes, which is faster - // than just doing a vld1_s16, but still slower. 
- for(number = 0; number < eighth_points; number++){ - input16 = vld2_s16(inputPtr); - // widen 16-bit int to 32-bit int - input32_0 = vmovl_s16(input16.val[0]); - input32_1 = vmovl_s16(input16.val[1]); - // convert 32-bit int to float with scale - input_float_0 = vcvtq_f32_s32(input32_0); - input_float_1 = vcvtq_f32_s32(input32_1); - output_float.val[0] = vmulq_f32(input_float_0, inv_scale); - output_float.val[1] = vmulq_f32(input_float_1, inv_scale); - vst2q_f32(outputPtr, output_float); - inputPtr += 8; - outputPtr += 8; - } - - for(number = eighth_points*8; number < num_points; number++){ - *outputPtr++ = ((float)(*inputPtr++)) / scalar; - } + float* outputPtr = outputVector; + const int16_t* inputPtr = inputVector; + unsigned int number = 0; + unsigned int eighth_points = num_points / 8; + + int16x4x2_t input16; + int32x4_t input32_0, input32_1; + float32x4_t input_float_0, input_float_1; + float32x4x2_t output_float; + float32x4_t inv_scale; + + inv_scale = vdupq_n_f32(1.0 / scalar); + + // the generic disassembles to a 128-bit load + // and duplicates every instruction to operate on 64-bits + // at a time. This is only possible with lanes, which is faster + // than just doing a vld1_s16, but still slower. + for (number = 0; number < eighth_points; number++) { + input16 = vld2_s16(inputPtr); + // widen 16-bit int to 32-bit int + input32_0 = vmovl_s16(input16.val[0]); + input32_1 = vmovl_s16(input16.val[1]); + // convert 32-bit int to float with scale + input_float_0 = vcvtq_f32_s32(input32_0); + input_float_1 = vcvtq_f32_s32(input32_1); + output_float.val[0] = vmulq_f32(input_float_0, inv_scale); + output_float.val[1] = vmulq_f32(input_float_1, inv_scale); + vst2q_f32(outputPtr, output_float); + inputPtr += 8; + outputPtr += 8; + } + + for (number = eighth_points * 8; number < num_points; number++) { + *outputPtr++ = ((float)(*inputPtr++)) / scalar; + } } #endif /* LV_HAVE_NEON */ @@ -306,193 +315,201 @@ volk_16i_s32f_convert_32f_neon(float* outputVector, const int16_t* inputVector, #ifdef LV_HAVE_AVX2 #include -static inline void -volk_16i_s32f_convert_32f_a_avx2(float* outputVector, const int16_t* inputVector, - const float scalar, unsigned int num_points) +static inline void volk_16i_s32f_convert_32f_a_avx2(float* outputVector, + const int16_t* inputVector, + const float scalar, + unsigned int num_points) { - unsigned int number = 0; - const unsigned int eighthPoints = num_points / 8; + unsigned int number = 0; + const unsigned int eighthPoints = num_points / 8; - float* outputVectorPtr = outputVector; - __m256 invScalar = _mm256_set1_ps(1.0/scalar); - int16_t* inputPtr = (int16_t*)inputVector; - __m128i inputVal; - __m256i inputVal2; - __m256 ret; + float* outputVectorPtr = outputVector; + __m256 invScalar = _mm256_set1_ps(1.0 / scalar); + int16_t* inputPtr = (int16_t*)inputVector; + __m128i inputVal; + __m256i inputVal2; + __m256 ret; - for(;number < eighthPoints; number++){ + for (; number < eighthPoints; number++) { - // Load the 8 values - inputVal = _mm_load_si128((__m128i*)inputPtr); + // Load the 8 values + inputVal = _mm_load_si128((__m128i*)inputPtr); - // Convert - inputVal2 = _mm256_cvtepi16_epi32(inputVal); + // Convert + inputVal2 = _mm256_cvtepi16_epi32(inputVal); - ret = _mm256_cvtepi32_ps(inputVal2); - ret = _mm256_mul_ps(ret, invScalar); + ret = _mm256_cvtepi32_ps(inputVal2); + ret = _mm256_mul_ps(ret, invScalar); - _mm256_store_ps(outputVectorPtr, ret); + _mm256_store_ps(outputVectorPtr, ret); - outputVectorPtr += 8; + outputVectorPtr += 8; - inputPtr += 8; - } 
+ inputPtr += 8; + } - number = eighthPoints * 8; - for(; number < num_points; number++){ - outputVector[number] =((float)(inputVector[number])) / scalar; - } + number = eighthPoints * 8; + for (; number < num_points; number++) { + outputVector[number] = ((float)(inputVector[number])) / scalar; + } } #endif /* LV_HAVE_AVX2 */ #ifdef LV_HAVE_AVX #include -static inline void -volk_16i_s32f_convert_32f_a_avx(float* outputVector, const int16_t* inputVector, - const float scalar, unsigned int num_points) +static inline void volk_16i_s32f_convert_32f_a_avx(float* outputVector, + const int16_t* inputVector, + const float scalar, + unsigned int num_points) { - unsigned int number = 0; - const unsigned int eighthPoints = num_points / 8; + unsigned int number = 0; + const unsigned int eighthPoints = num_points / 8; - float* outputVectorPtr = outputVector; - __m128 invScalar = _mm_set_ps1(1.0/scalar); - int16_t* inputPtr = (int16_t*)inputVector; - __m128i inputVal, inputVal2; - __m128 ret; - __m256 output; - __m256 dummy = _mm256_setzero_ps(); + float* outputVectorPtr = outputVector; + __m128 invScalar = _mm_set_ps1(1.0 / scalar); + int16_t* inputPtr = (int16_t*)inputVector; + __m128i inputVal, inputVal2; + __m128 ret; + __m256 output; + __m256 dummy = _mm256_setzero_ps(); - for(;number < eighthPoints; number++){ + for (; number < eighthPoints; number++) { - // Load the 8 values - //inputVal = _mm_loadu_si128((__m128i*)inputPtr); - inputVal = _mm_load_si128((__m128i*)inputPtr); + // Load the 8 values + // inputVal = _mm_loadu_si128((__m128i*)inputPtr); + inputVal = _mm_load_si128((__m128i*)inputPtr); - // Shift the input data to the right by 64 bits ( 8 bytes ) - inputVal2 = _mm_srli_si128(inputVal, 8); + // Shift the input data to the right by 64 bits ( 8 bytes ) + inputVal2 = _mm_srli_si128(inputVal, 8); - // Convert the lower 4 values into 32 bit words - inputVal = _mm_cvtepi16_epi32(inputVal); - inputVal2 = _mm_cvtepi16_epi32(inputVal2); + // Convert the lower 4 values into 32 bit words + inputVal = _mm_cvtepi16_epi32(inputVal); + inputVal2 = _mm_cvtepi16_epi32(inputVal2); - ret = _mm_cvtepi32_ps(inputVal); - ret = _mm_mul_ps(ret, invScalar); - output = _mm256_insertf128_ps(dummy, ret, 0); + ret = _mm_cvtepi32_ps(inputVal); + ret = _mm_mul_ps(ret, invScalar); + output = _mm256_insertf128_ps(dummy, ret, 0); - ret = _mm_cvtepi32_ps(inputVal2); - ret = _mm_mul_ps(ret, invScalar); - output = _mm256_insertf128_ps(output, ret, 1); + ret = _mm_cvtepi32_ps(inputVal2); + ret = _mm_mul_ps(ret, invScalar); + output = _mm256_insertf128_ps(output, ret, 1); - _mm256_store_ps(outputVectorPtr, output); + _mm256_store_ps(outputVectorPtr, output); - outputVectorPtr += 8; + outputVectorPtr += 8; - inputPtr += 8; - } + inputPtr += 8; + } - number = eighthPoints * 8; - for(; number < num_points; number++){ - outputVector[number] =((float)(inputVector[number])) / scalar; - } + number = eighthPoints * 8; + for (; number < num_points; number++) { + outputVector[number] = ((float)(inputVector[number])) / scalar; + } } #endif /* LV_HAVE_AVX */ #ifdef LV_HAVE_SSE4_1 #include -static inline void -volk_16i_s32f_convert_32f_a_sse4_1(float* outputVector, const int16_t* inputVector, - const float scalar, unsigned int num_points) +static inline void volk_16i_s32f_convert_32f_a_sse4_1(float* outputVector, + const int16_t* inputVector, + const float scalar, + unsigned int num_points) { - unsigned int number = 0; - const unsigned int eighthPoints = num_points / 8; + unsigned int number = 0; + const unsigned int eighthPoints = 
num_points / 8; - float* outputVectorPtr = outputVector; - __m128 invScalar = _mm_set_ps1(1.0/scalar); - int16_t* inputPtr = (int16_t*)inputVector; - __m128i inputVal; - __m128i inputVal2; - __m128 ret; + float* outputVectorPtr = outputVector; + __m128 invScalar = _mm_set_ps1(1.0 / scalar); + int16_t* inputPtr = (int16_t*)inputVector; + __m128i inputVal; + __m128i inputVal2; + __m128 ret; - for(;number < eighthPoints; number++){ + for (; number < eighthPoints; number++) { - // Load the 8 values - inputVal = _mm_loadu_si128((__m128i*)inputPtr); + // Load the 8 values + inputVal = _mm_loadu_si128((__m128i*)inputPtr); - // Shift the input data to the right by 64 bits ( 8 bytes ) - inputVal2 = _mm_srli_si128(inputVal, 8); + // Shift the input data to the right by 64 bits ( 8 bytes ) + inputVal2 = _mm_srli_si128(inputVal, 8); - // Convert the lower 4 values into 32 bit words - inputVal = _mm_cvtepi16_epi32(inputVal); - inputVal2 = _mm_cvtepi16_epi32(inputVal2); + // Convert the lower 4 values into 32 bit words + inputVal = _mm_cvtepi16_epi32(inputVal); + inputVal2 = _mm_cvtepi16_epi32(inputVal2); - ret = _mm_cvtepi32_ps(inputVal); - ret = _mm_mul_ps(ret, invScalar); - _mm_storeu_ps(outputVectorPtr, ret); - outputVectorPtr += 4; + ret = _mm_cvtepi32_ps(inputVal); + ret = _mm_mul_ps(ret, invScalar); + _mm_storeu_ps(outputVectorPtr, ret); + outputVectorPtr += 4; - ret = _mm_cvtepi32_ps(inputVal2); - ret = _mm_mul_ps(ret, invScalar); - _mm_storeu_ps(outputVectorPtr, ret); + ret = _mm_cvtepi32_ps(inputVal2); + ret = _mm_mul_ps(ret, invScalar); + _mm_storeu_ps(outputVectorPtr, ret); - outputVectorPtr += 4; + outputVectorPtr += 4; - inputPtr += 8; - } + inputPtr += 8; + } - number = eighthPoints * 8; - for(; number < num_points; number++){ - outputVector[number] =((float)(inputVector[number])) / scalar; - } + number = eighthPoints * 8; + for (; number < num_points; number++) { + outputVector[number] = ((float)(inputVector[number])) / scalar; + } } #endif /* LV_HAVE_SSE4_1 */ #ifdef LV_HAVE_SSE #include -static inline void -volk_16i_s32f_convert_32f_a_sse(float* outputVector, const int16_t* inputVector, - const float scalar, unsigned int num_points) +static inline void volk_16i_s32f_convert_32f_a_sse(float* outputVector, + const int16_t* inputVector, + const float scalar, + unsigned int num_points) { - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - float* outputVectorPtr = outputVector; - __m128 invScalar = _mm_set_ps1(1.0/scalar); - int16_t* inputPtr = (int16_t*)inputVector; - __m128 ret; - - for(;number < quarterPoints; number++){ - ret = _mm_set_ps((float)(inputPtr[3]), (float)(inputPtr[2]), (float)(inputPtr[1]), (float)(inputPtr[0])); - - ret = _mm_mul_ps(ret, invScalar); - _mm_storeu_ps(outputVectorPtr, ret); - - inputPtr += 4; - outputVectorPtr += 4; - } - - number = quarterPoints * 4; - for(; number < num_points; number++){ - outputVector[number] = (float)(inputVector[number]) / scalar; - } + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + float* outputVectorPtr = outputVector; + __m128 invScalar = _mm_set_ps1(1.0 / scalar); + int16_t* inputPtr = (int16_t*)inputVector; + __m128 ret; + + for (; number < quarterPoints; number++) { + ret = _mm_set_ps((float)(inputPtr[3]), + (float)(inputPtr[2]), + (float)(inputPtr[1]), + (float)(inputPtr[0])); + + ret = _mm_mul_ps(ret, invScalar); + _mm_storeu_ps(outputVectorPtr, ret); + + inputPtr += 4; + outputVectorPtr += 4; + } + + number = quarterPoints * 4; + for (; number < num_points; 
number++) { + outputVector[number] = (float)(inputVector[number]) / scalar; + } } #endif /* LV_HAVE_SSE */ #ifdef LV_HAVE_GENERIC -static inline void -volk_16i_s32f_convert_32f_a_generic(float* outputVector, const int16_t* inputVector, - const float scalar, unsigned int num_points) +static inline void volk_16i_s32f_convert_32f_a_generic(float* outputVector, + const int16_t* inputVector, + const float scalar, + unsigned int num_points) { - float* outputVectorPtr = outputVector; - const int16_t* inputVectorPtr = inputVector; - unsigned int number = 0; + float* outputVectorPtr = outputVector; + const int16_t* inputVectorPtr = inputVector; + unsigned int number = 0; - for(number = 0; number < num_points; number++){ - *outputVectorPtr++ = ((float)(*inputVectorPtr++)) / scalar; - } + for (number = 0; number < num_points; number++) { + *outputVectorPtr++ = ((float)(*inputVectorPtr++)) / scalar; + } } #endif /* LV_HAVE_GENERIC */ diff --git a/kernels/volk/volk_16i_x4_quad_max_star_16i.h b/kernels/volk/volk_16i_x4_quad_max_star_16i.h index 6aa74c7..619cc90 100644 --- a/kernels/volk/volk_16i_x4_quad_max_star_16i.h +++ b/kernels/volk/volk_16i_x4_quad_max_star_16i.h @@ -29,8 +29,8 @@ * * Dispatcher Prototype * \code - * void volk_16i_x4_quad_max_star_16i(short* target, short* src0, short* src1, short* src2, short* src3, unsigned int num_points) - * \endcode + * void volk_16i_x4_quad_max_star_16i(short* target, short* src0, short* src1, short* + * src2, short* src3, unsigned int num_points) \endcode * * \b Inputs * \li src0: The input vector 0. @@ -55,149 +55,152 @@ #ifndef INCLUDED_volk_16i_x4_quad_max_star_16i_a_H #define INCLUDED_volk_16i_x4_quad_max_star_16i_a_H -#include -#include +#include +#include #ifdef LV_HAVE_SSE2 -#include +#include -static inline void -volk_16i_x4_quad_max_star_16i_a_sse2(short* target, short* src0, short* src1, - short* src2, short* src3, unsigned int num_points) +static inline void volk_16i_x4_quad_max_star_16i_a_sse2(short* target, + short* src0, + short* src1, + short* src2, + short* src3, + unsigned int num_points) { - const unsigned int num_bytes = num_points*2; - - int i = 0; + const unsigned int num_bytes = num_points * 2; - int bound = (num_bytes >> 4); - int bound_copy = bound; - int leftovers = (num_bytes >> 1) & 7; + int i = 0; - __m128i *p_target, *p_src0, *p_src1, *p_src2, *p_src3; - p_target = (__m128i*) target; - p_src0 = (__m128i*)src0; - p_src1 = (__m128i*)src1; - p_src2 = (__m128i*)src2; - p_src3 = (__m128i*)src3; + int bound = (num_bytes >> 4); + int bound_copy = bound; + int leftovers = (num_bytes >> 1) & 7; - __m128i xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8; + __m128i *p_target, *p_src0, *p_src1, *p_src2, *p_src3; + p_target = (__m128i*)target; + p_src0 = (__m128i*)src0; + p_src1 = (__m128i*)src1; + p_src2 = (__m128i*)src2; + p_src3 = (__m128i*)src3; - while(bound_copy > 0) { - xmm1 = _mm_load_si128(p_src0); - xmm2 = _mm_load_si128(p_src1); - xmm3 = _mm_load_si128(p_src2); - xmm4 = _mm_load_si128(p_src3); + __m128i xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8; - xmm5 = _mm_setzero_si128(); - xmm6 = _mm_setzero_si128(); - xmm7 = xmm1; - xmm8 = xmm3; + while (bound_copy > 0) { + xmm1 = _mm_load_si128(p_src0); + xmm2 = _mm_load_si128(p_src1); + xmm3 = _mm_load_si128(p_src2); + xmm4 = _mm_load_si128(p_src3); - xmm1 = _mm_sub_epi16(xmm2, xmm1); + xmm5 = _mm_setzero_si128(); + xmm6 = _mm_setzero_si128(); + xmm7 = xmm1; + xmm8 = xmm3; - xmm3 = _mm_sub_epi16(xmm4, xmm3); + xmm1 = _mm_sub_epi16(xmm2, xmm1); - xmm5 = _mm_cmpgt_epi16(xmm1, xmm5); - xmm6 
= _mm_cmpgt_epi16(xmm3, xmm6); + xmm3 = _mm_sub_epi16(xmm4, xmm3); - xmm2 = _mm_and_si128(xmm5, xmm2); - xmm4 = _mm_and_si128(xmm6, xmm4); - xmm5 = _mm_andnot_si128(xmm5, xmm7); - xmm6 = _mm_andnot_si128(xmm6, xmm8); + xmm5 = _mm_cmpgt_epi16(xmm1, xmm5); + xmm6 = _mm_cmpgt_epi16(xmm3, xmm6); - xmm5 = _mm_add_epi16(xmm2, xmm5); - xmm6 = _mm_add_epi16(xmm4, xmm6); + xmm2 = _mm_and_si128(xmm5, xmm2); + xmm4 = _mm_and_si128(xmm6, xmm4); + xmm5 = _mm_andnot_si128(xmm5, xmm7); + xmm6 = _mm_andnot_si128(xmm6, xmm8); - xmm1 = _mm_xor_si128(xmm1, xmm1); - xmm2 = xmm5; - xmm5 = _mm_sub_epi16(xmm6, xmm5); - p_src0 += 1; - bound_copy -= 1; + xmm5 = _mm_add_epi16(xmm2, xmm5); + xmm6 = _mm_add_epi16(xmm4, xmm6); - xmm1 = _mm_cmpgt_epi16(xmm5, xmm1); - p_src1 += 1; + xmm1 = _mm_xor_si128(xmm1, xmm1); + xmm2 = xmm5; + xmm5 = _mm_sub_epi16(xmm6, xmm5); + p_src0 += 1; + bound_copy -= 1; - xmm6 = _mm_and_si128(xmm1, xmm6); + xmm1 = _mm_cmpgt_epi16(xmm5, xmm1); + p_src1 += 1; - xmm1 = _mm_andnot_si128(xmm1, xmm2); - p_src2 += 1; + xmm6 = _mm_and_si128(xmm1, xmm6); - xmm1 = _mm_add_epi16(xmm6, xmm1); - p_src3 += 1; + xmm1 = _mm_andnot_si128(xmm1, xmm2); + p_src2 += 1; - _mm_store_si128(p_target, xmm1); - p_target += 1; + xmm1 = _mm_add_epi16(xmm6, xmm1); + p_src3 += 1; - } + _mm_store_si128(p_target, xmm1); + p_target += 1; + } - /*__VOLK_ASM __VOLK_VOLATILE - ( - "volk_16i_x4_quad_max_star_16i_a_sse2_L1:\n\t" - "cmp $0, %[bound]\n\t" - "je volk_16i_x4_quad_max_star_16i_a_sse2_END\n\t" + /*__VOLK_ASM __VOLK_VOLATILE + ( + "volk_16i_x4_quad_max_star_16i_a_sse2_L1:\n\t" + "cmp $0, %[bound]\n\t" + "je volk_16i_x4_quad_max_star_16i_a_sse2_END\n\t" - "movaps (%[src0]), %%xmm1\n\t" - "movaps (%[src1]), %%xmm2\n\t" - "movaps (%[src2]), %%xmm3\n\t" - "movaps (%[src3]), %%xmm4\n\t" + "movaps (%[src0]), %%xmm1\n\t" + "movaps (%[src1]), %%xmm2\n\t" + "movaps (%[src2]), %%xmm3\n\t" + "movaps (%[src3]), %%xmm4\n\t" - "pxor %%xmm5, %%xmm5\n\t" - "pxor %%xmm6, %%xmm6\n\t" - "movaps %%xmm1, %%xmm7\n\t" - "movaps %%xmm3, %%xmm8\n\t" - "psubw %%xmm2, %%xmm1\n\t" - "psubw %%xmm4, %%xmm3\n\t" + "pxor %%xmm5, %%xmm5\n\t" + "pxor %%xmm6, %%xmm6\n\t" + "movaps %%xmm1, %%xmm7\n\t" + "movaps %%xmm3, %%xmm8\n\t" + "psubw %%xmm2, %%xmm1\n\t" + "psubw %%xmm4, %%xmm3\n\t" - "pcmpgtw %%xmm1, %%xmm5\n\t" - "pcmpgtw %%xmm3, %%xmm6\n\t" + "pcmpgtw %%xmm1, %%xmm5\n\t" + "pcmpgtw %%xmm3, %%xmm6\n\t" - "pand %%xmm5, %%xmm2\n\t" - "pand %%xmm6, %%xmm4\n\t" - "pandn %%xmm7, %%xmm5\n\t" - "pandn %%xmm8, %%xmm6\n\t" + "pand %%xmm5, %%xmm2\n\t" + "pand %%xmm6, %%xmm4\n\t" + "pandn %%xmm7, %%xmm5\n\t" + "pandn %%xmm8, %%xmm6\n\t" - "paddw %%xmm2, %%xmm5\n\t" - "paddw %%xmm4, %%xmm6\n\t" + "paddw %%xmm2, %%xmm5\n\t" + "paddw %%xmm4, %%xmm6\n\t" - "pxor %%xmm1, %%xmm1\n\t" - "movaps %%xmm5, %%xmm2\n\t" + "pxor %%xmm1, %%xmm1\n\t" + "movaps %%xmm5, %%xmm2\n\t" - "psubw %%xmm6, %%xmm5\n\t" - "add $16, %[src0]\n\t" - "add $-1, %[bound]\n\t" + "psubw %%xmm6, %%xmm5\n\t" + "add $16, %[src0]\n\t" + "add $-1, %[bound]\n\t" - "pcmpgtw %%xmm5, %%xmm1\n\t" - "add $16, %[src1]\n\t" + "pcmpgtw %%xmm5, %%xmm1\n\t" + "add $16, %[src1]\n\t" - "pand %%xmm1, %%xmm6\n\t" + "pand %%xmm1, %%xmm6\n\t" - "pandn %%xmm2, %%xmm1\n\t" - "add $16, %[src2]\n\t" + "pandn %%xmm2, %%xmm1\n\t" + "add $16, %[src2]\n\t" - "paddw %%xmm6, %%xmm1\n\t" - "add $16, %[src3]\n\t" + "paddw %%xmm6, %%xmm1\n\t" + "add $16, %[src3]\n\t" - "movaps %%xmm1, (%[target])\n\t" - "addw $16, %[target]\n\t" - "jmp volk_16i_x4_quad_max_star_16i_a_sse2_L1\n\t" + "movaps %%xmm1, (%[target])\n\t" + "addw $16, 
%[target]\n\t" + "jmp volk_16i_x4_quad_max_star_16i_a_sse2_L1\n\t" - "volk_16i_x4_quad_max_star_16i_a_sse2_END:\n\t" - : - :[bound]"r"(bound), [src0]"r"(src0), [src1]"r"(src1), [src2]"r"(src2), [src3]"r"(src3), [target]"r"(target) - : - ); - */ + "volk_16i_x4_quad_max_star_16i_a_sse2_END:\n\t" + : + :[bound]"r"(bound), [src0]"r"(src0), [src1]"r"(src1), [src2]"r"(src2), + [src3]"r"(src3), [target]"r"(target) + : + ); + */ - short temp0 = 0; - short temp1 = 0; - for(i = bound * 8; i < (bound * 8) + leftovers; ++i) { - temp0 = ((short)(src0[i] - src1[i]) > 0) ? src0[i] : src1[i]; - temp1 = ((short)(src2[i] - src3[i])>0) ? src2[i] : src3[i]; - target[i] = ((short)(temp0 - temp1)>0) ? temp0 : temp1; - } - return; + short temp0 = 0; + short temp1 = 0; + for (i = bound * 8; i < (bound * 8) + leftovers; ++i) { + temp0 = ((short)(src0[i] - src1[i]) > 0) ? src0[i] : src1[i]; + temp1 = ((short)(src2[i] - src3[i]) > 0) ? src2[i] : src3[i]; + target[i] = ((short)(temp0 - temp1) > 0) ? temp0 : temp1; + } + return; } #endif /*LV_HAVE_SSE2*/ @@ -206,85 +209,91 @@ volk_16i_x4_quad_max_star_16i_a_sse2(short* target, short* src0, short* src1, #include -static inline void -volk_16i_x4_quad_max_star_16i_neon(short* target, short* src0, short* src1, - short* src2, short* src3, unsigned int num_points) +static inline void volk_16i_x4_quad_max_star_16i_neon(short* target, + short* src0, + short* src1, + short* src2, + short* src3, + unsigned int num_points) { - const unsigned int eighth_points = num_points / 8; - unsigned i; - - int16x8_t src0_vec, src1_vec, src2_vec, src3_vec; - int16x8_t diff12, diff34; - int16x8_t comp0, comp1, comp2, comp3; - int16x8_t result1_vec, result2_vec; - int16x8_t zeros; - zeros = vdupq_n_s16(0); - for(i=0; i < eighth_points; ++i) { - src0_vec = vld1q_s16(src0); - src1_vec = vld1q_s16(src1); - src2_vec = vld1q_s16(src2); - src3_vec = vld1q_s16(src3); - diff12 = vsubq_s16(src0_vec, src1_vec); - diff34 = vsubq_s16(src2_vec, src3_vec); - comp0 = (int16x8_t)vcgeq_s16(diff12, zeros); - comp1 = (int16x8_t)vcltq_s16(diff12, zeros); - comp2 = (int16x8_t)vcgeq_s16(diff34, zeros); - comp3 = (int16x8_t)vcltq_s16(diff34, zeros); - comp0 = vandq_s16(src0_vec, comp0); - comp1 = vandq_s16(src1_vec, comp1); - comp2 = vandq_s16(src2_vec, comp2); - comp3 = vandq_s16(src3_vec, comp3); - - result1_vec = vaddq_s16(comp0, comp1); - result2_vec = vaddq_s16(comp2, comp3); - - diff12 = vsubq_s16(result1_vec, result2_vec); - comp0 = (int16x8_t)vcgeq_s16(diff12, zeros); - comp1 = (int16x8_t)vcltq_s16(diff12, zeros); - comp0 = vandq_s16(result1_vec, comp0); - comp1 = vandq_s16(result2_vec, comp1); - result1_vec = vaddq_s16(comp0, comp1); - vst1q_s16(target, result1_vec); - src0 += 8; - src1 += 8; - src2 += 8; - src3 += 8; - target += 8; + const unsigned int eighth_points = num_points / 8; + unsigned i; + + int16x8_t src0_vec, src1_vec, src2_vec, src3_vec; + int16x8_t diff12, diff34; + int16x8_t comp0, comp1, comp2, comp3; + int16x8_t result1_vec, result2_vec; + int16x8_t zeros; + zeros = vdupq_n_s16(0); + for (i = 0; i < eighth_points; ++i) { + src0_vec = vld1q_s16(src0); + src1_vec = vld1q_s16(src1); + src2_vec = vld1q_s16(src2); + src3_vec = vld1q_s16(src3); + diff12 = vsubq_s16(src0_vec, src1_vec); + diff34 = vsubq_s16(src2_vec, src3_vec); + comp0 = (int16x8_t)vcgeq_s16(diff12, zeros); + comp1 = (int16x8_t)vcltq_s16(diff12, zeros); + comp2 = (int16x8_t)vcgeq_s16(diff34, zeros); + comp3 = (int16x8_t)vcltq_s16(diff34, zeros); + comp0 = vandq_s16(src0_vec, comp0); + comp1 = vandq_s16(src1_vec, comp1); + comp2 = 
vandq_s16(src2_vec, comp2); + comp3 = vandq_s16(src3_vec, comp3); + + result1_vec = vaddq_s16(comp0, comp1); + result2_vec = vaddq_s16(comp2, comp3); + + diff12 = vsubq_s16(result1_vec, result2_vec); + comp0 = (int16x8_t)vcgeq_s16(diff12, zeros); + comp1 = (int16x8_t)vcltq_s16(diff12, zeros); + comp0 = vandq_s16(result1_vec, comp0); + comp1 = vandq_s16(result2_vec, comp1); + result1_vec = vaddq_s16(comp0, comp1); + vst1q_s16(target, result1_vec); + src0 += 8; + src1 += 8; + src2 += 8; + src3 += 8; + target += 8; } - short temp0 = 0; - short temp1 = 0; - for(i=eighth_points*8; i < num_points; ++i) { - temp0 = ((short)(*src0 - *src1) > 0) ? *src0 : *src1; - temp1 = ((short)(*src2 - *src3) > 0) ? *src2 : *src3; - *target++ = ((short)(temp0 - temp1)>0) ? temp0 : temp1; - src0++; - src1++; - src2++; - src3++; - } + short temp0 = 0; + short temp1 = 0; + for (i = eighth_points * 8; i < num_points; ++i) { + temp0 = ((short)(*src0 - *src1) > 0) ? *src0 : *src1; + temp1 = ((short)(*src2 - *src3) > 0) ? *src2 : *src3; + *target++ = ((short)(temp0 - temp1) > 0) ? temp0 : temp1; + src0++; + src1++; + src2++; + src3++; + } } #endif /* LV_HAVE_NEON */ #ifdef LV_HAVE_GENERIC -static inline void -volk_16i_x4_quad_max_star_16i_generic(short* target, short* src0, short* src1, - short* src2, short* src3, unsigned int num_points) +static inline void volk_16i_x4_quad_max_star_16i_generic(short* target, + short* src0, + short* src1, + short* src2, + short* src3, + unsigned int num_points) { - const unsigned int num_bytes = num_points*2; + const unsigned int num_bytes = num_points * 2; - int i = 0; + int i = 0; - int bound = num_bytes >> 1; + int bound = num_bytes >> 1; - short temp0 = 0; - short temp1 = 0; - for(i = 0; i < bound; ++i) { - temp0 = ((short)(src0[i] - src1[i]) > 0) ? src0[i] : src1[i]; - temp1 = ((short)(src2[i] - src3[i])>0) ? src2[i] : src3[i]; - target[i] = ((short)(temp0 - temp1)>0) ? temp0 : temp1; - } + short temp0 = 0; + short temp1 = 0; + for (i = 0; i < bound; ++i) { + temp0 = ((short)(src0[i] - src1[i]) > 0) ? src0[i] : src1[i]; + temp1 = ((short)(src2[i] - src3[i]) > 0) ? src2[i] : src3[i]; + target[i] = ((short)(temp0 - temp1) > 0) ? temp0 : temp1; + } } #endif /*LV_HAVE_GENERIC*/ diff --git a/kernels/volk/volk_16i_x5_add_quad_16i_x4.h b/kernels/volk/volk_16i_x5_add_quad_16i_x4.h index 30417de..f735f11 100644 --- a/kernels/volk/volk_16i_x5_add_quad_16i_x4.h +++ b/kernels/volk/volk_16i_x5_add_quad_16i_x4.h @@ -29,8 +29,9 @@ * * Dispatcher Prototype * \code - * void volk_16i_x5_add_quad_16i_x4(short* target0, short* target1, short* target2, short* target3, short* src0, short* src1, short* src2, short* src3, short* src4, unsigned int num_points); - * \endcode + * void volk_16i_x5_add_quad_16i_x4(short* target0, short* target1, short* target2, short* + * target3, short* src0, short* src1, short* src2, short* src3, short* src4, unsigned int + * num_points); \endcode * * \b Inputs * \li src0: The input vector 0. 
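For context, the dispatcher prototype above adds the common vector src0 to each of src1 through src4 and writes the four sums to target0 through target3 (compare the generic proto-kernel further down in this patch). A minimal calling sketch, assuming the library's volk_malloc()/volk_get_alignment()/volk_free() helpers; the function name, buffer size, and fill values below are illustrative only and not part of the patch:

#include <volk/volk.h>

/* Illustrative sketch: one common source vector added to four others. */
static void add_quad_example(void)
{
    const unsigned int num_points = 128;          /* illustrative size */
    const size_t alignment = volk_get_alignment();

    short* src0 = (short*)volk_malloc(num_points * sizeof(short), alignment);
    short* src1 = (short*)volk_malloc(num_points * sizeof(short), alignment);
    short* src2 = (short*)volk_malloc(num_points * sizeof(short), alignment);
    short* src3 = (short*)volk_malloc(num_points * sizeof(short), alignment);
    short* src4 = (short*)volk_malloc(num_points * sizeof(short), alignment);
    short* target0 = (short*)volk_malloc(num_points * sizeof(short), alignment);
    short* target1 = (short*)volk_malloc(num_points * sizeof(short), alignment);
    short* target2 = (short*)volk_malloc(num_points * sizeof(short), alignment);
    short* target3 = (short*)volk_malloc(num_points * sizeof(short), alignment);

    unsigned int i;
    for (i = 0; i < num_points; i++) {            /* sample data */
        src0[i] = (short)i;
        src1[i] = 1;
        src2[i] = 2;
        src3[i] = 3;
        src4[i] = 4;
    }

    /* target_k[i] = src0[i] + src_{k+1}[i], for k = 0..3 */
    volk_16i_x5_add_quad_16i_x4(
        target0, target1, target2, target3, src0, src1, src2, src3, src4, num_points);

    volk_free(target3);
    volk_free(target2);
    volk_free(target1);
    volk_free(target0);
    volk_free(src4);
    volk_free(src3);
    volk_free(src2);
    volk_free(src1);
    volk_free(src0);
}

Because every buffer comes from volk_malloc, it satisfies the 16-byte alignment the SSE2 proto-kernel below expects for its _mm_load_si128/_mm_store_si128 accesses.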
@@ -59,182 +60,203 @@ #ifndef INCLUDED_volk_16i_x5_add_quad_16i_x4_a_H #define INCLUDED_volk_16i_x5_add_quad_16i_x4_a_H -#include -#include +#include +#include #ifdef LV_HAVE_SSE2 -#include -#include +#include +#include -static inline void -volk_16i_x5_add_quad_16i_x4_a_sse2(short* target0, short* target1, short* target2, short* target3, - short* src0, short* src1, short* src2, short* src3, short* src4, - unsigned int num_points) +static inline void volk_16i_x5_add_quad_16i_x4_a_sse2(short* target0, + short* target1, + short* target2, + short* target3, + short* src0, + short* src1, + short* src2, + short* src3, + short* src4, + unsigned int num_points) { - const unsigned int num_bytes = num_points*2; - - __m128i xmm0, xmm1, xmm2, xmm3, xmm4; - __m128i *p_target0, *p_target1, *p_target2, *p_target3, *p_src0, *p_src1, *p_src2, *p_src3, *p_src4; - p_target0 = (__m128i*)target0; - p_target1 = (__m128i*)target1; - p_target2 = (__m128i*)target2; - p_target3 = (__m128i*)target3; - - p_src0 = (__m128i*)src0; - p_src1 = (__m128i*)src1; - p_src2 = (__m128i*)src2; - p_src3 = (__m128i*)src3; - p_src4 = (__m128i*)src4; - - int i = 0; - - int bound = (num_bytes >> 4); - int leftovers = (num_bytes >> 1) & 7; - - for(; i < bound; ++i) { - xmm0 = _mm_load_si128(p_src0); - xmm1 = _mm_load_si128(p_src1); - xmm2 = _mm_load_si128(p_src2); - xmm3 = _mm_load_si128(p_src3); - xmm4 = _mm_load_si128(p_src4); - - p_src0 += 1; - p_src1 += 1; - - xmm1 = _mm_add_epi16(xmm0, xmm1); - xmm2 = _mm_add_epi16(xmm0, xmm2); - xmm3 = _mm_add_epi16(xmm0, xmm3); - xmm4 = _mm_add_epi16(xmm0, xmm4); - - - p_src2 += 1; - p_src3 += 1; - p_src4 += 1; - - _mm_store_si128(p_target0, xmm1); - _mm_store_si128(p_target1, xmm2); - _mm_store_si128(p_target2, xmm3); - _mm_store_si128(p_target3, xmm4); - - p_target0 += 1; - p_target1 += 1; - p_target2 += 1; - p_target3 += 1; - } - /*__VOLK_ASM __VOLK_VOLATILE - ( - ".%=volk_16i_x5_add_quad_16i_x4_a_sse2_L1:\n\t" - "cmp $0, %[bound]\n\t" - "je .%=volk_16i_x5_add_quad_16i_x4_a_sse2_END\n\t" - "movaps (%[src0]), %%xmm1\n\t" - "movaps (%[src1]), %%xmm2\n\t" - "movaps (%[src2]), %%xmm3\n\t" - "movaps (%[src3]), %%xmm4\n\t" - "movaps (%[src4]), %%xmm5\n\t" - "add $16, %[src0]\n\t" - "add $16, %[src1]\n\t" - "add $16, %[src2]\n\t" - "add $16, %[src3]\n\t" - "add $16, %[src4]\n\t" - "paddw %%xmm1, %%xmm2\n\t" - "paddw %%xmm1, %%xmm3\n\t" - "paddw %%xmm1, %%xmm4\n\t" - "paddw %%xmm1, %%xmm5\n\t" - "add $-1, %[bound]\n\t" - "movaps %%xmm2, (%[target0])\n\t" - "movaps %%xmm3, (%[target1])\n\t" - "movaps %%xmm4, (%[target2])\n\t" - "movaps %%xmm5, (%[target3])\n\t" - "add $16, %[target0]\n\t" - "add $16, %[target1]\n\t" - "add $16, %[target2]\n\t" - "add $16, %[target3]\n\t" - "jmp .%=volk_16i_x5_add_quad_16i_x4_a_sse2_L1\n\t" - ".%=volk_16i_x5_add_quad_16i_x4_a_sse2_END:\n\t" - : - :[bound]"r"(bound), [src0]"r"(src0), [src1]"r"(src1), [src2]"r"(src2), [src3]"r"(src3), [src4]"r"(src4), [target0]"r"(target0), [target1]"r"(target1), [target2]"r"(target2), [target3]"r"(target3) - :"xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); - */ - - for(i = bound * 8; i < (bound * 8) + leftovers; ++i) { - target0[i] = src0[i] + src1[i]; - target1[i] = src0[i] + src2[i]; - target2[i] = src0[i] + src3[i]; - target3[i] = src0[i] + src4[i]; - } + const unsigned int num_bytes = num_points * 2; + + __m128i xmm0, xmm1, xmm2, xmm3, xmm4; + __m128i *p_target0, *p_target1, *p_target2, *p_target3, *p_src0, *p_src1, *p_src2, + *p_src3, *p_src4; + p_target0 = (__m128i*)target0; + p_target1 = (__m128i*)target1; + p_target2 = 
(__m128i*)target2; + p_target3 = (__m128i*)target3; + + p_src0 = (__m128i*)src0; + p_src1 = (__m128i*)src1; + p_src2 = (__m128i*)src2; + p_src3 = (__m128i*)src3; + p_src4 = (__m128i*)src4; + + int i = 0; + + int bound = (num_bytes >> 4); + int leftovers = (num_bytes >> 1) & 7; + + for (; i < bound; ++i) { + xmm0 = _mm_load_si128(p_src0); + xmm1 = _mm_load_si128(p_src1); + xmm2 = _mm_load_si128(p_src2); + xmm3 = _mm_load_si128(p_src3); + xmm4 = _mm_load_si128(p_src4); + + p_src0 += 1; + p_src1 += 1; + + xmm1 = _mm_add_epi16(xmm0, xmm1); + xmm2 = _mm_add_epi16(xmm0, xmm2); + xmm3 = _mm_add_epi16(xmm0, xmm3); + xmm4 = _mm_add_epi16(xmm0, xmm4); + + + p_src2 += 1; + p_src3 += 1; + p_src4 += 1; + + _mm_store_si128(p_target0, xmm1); + _mm_store_si128(p_target1, xmm2); + _mm_store_si128(p_target2, xmm3); + _mm_store_si128(p_target3, xmm4); + + p_target0 += 1; + p_target1 += 1; + p_target2 += 1; + p_target3 += 1; + } + /*__VOLK_ASM __VOLK_VOLATILE + ( + ".%=volk_16i_x5_add_quad_16i_x4_a_sse2_L1:\n\t" + "cmp $0, %[bound]\n\t" + "je .%=volk_16i_x5_add_quad_16i_x4_a_sse2_END\n\t" + "movaps (%[src0]), %%xmm1\n\t" + "movaps (%[src1]), %%xmm2\n\t" + "movaps (%[src2]), %%xmm3\n\t" + "movaps (%[src3]), %%xmm4\n\t" + "movaps (%[src4]), %%xmm5\n\t" + "add $16, %[src0]\n\t" + "add $16, %[src1]\n\t" + "add $16, %[src2]\n\t" + "add $16, %[src3]\n\t" + "add $16, %[src4]\n\t" + "paddw %%xmm1, %%xmm2\n\t" + "paddw %%xmm1, %%xmm3\n\t" + "paddw %%xmm1, %%xmm4\n\t" + "paddw %%xmm1, %%xmm5\n\t" + "add $-1, %[bound]\n\t" + "movaps %%xmm2, (%[target0])\n\t" + "movaps %%xmm3, (%[target1])\n\t" + "movaps %%xmm4, (%[target2])\n\t" + "movaps %%xmm5, (%[target3])\n\t" + "add $16, %[target0]\n\t" + "add $16, %[target1]\n\t" + "add $16, %[target2]\n\t" + "add $16, %[target3]\n\t" + "jmp .%=volk_16i_x5_add_quad_16i_x4_a_sse2_L1\n\t" + ".%=volk_16i_x5_add_quad_16i_x4_a_sse2_END:\n\t" + : + :[bound]"r"(bound), [src0]"r"(src0), [src1]"r"(src1), [src2]"r"(src2), + [src3]"r"(src3), [src4]"r"(src4), [target0]"r"(target0), [target1]"r"(target1), + [target2]"r"(target2), [target3]"r"(target3) + :"xmm1", "xmm2", "xmm3", "xmm4", "xmm5" + ); + */ + + for (i = bound * 8; i < (bound * 8) + leftovers; ++i) { + target0[i] = src0[i] + src1[i]; + target1[i] = src0[i] + src2[i]; + target2[i] = src0[i] + src3[i]; + target3[i] = src0[i] + src4[i]; + } } #endif /*LV_HAVE_SSE2*/ #ifdef LV_HAVE_NEON #include -static inline void -volk_16i_x5_add_quad_16i_x4_neon(short* target0, short* target1, short* target2, short* target3, - short* src0, short* src1, short* src2, short* src3, short* src4, - unsigned int num_points) +static inline void volk_16i_x5_add_quad_16i_x4_neon(short* target0, + short* target1, + short* target2, + short* target3, + short* src0, + short* src1, + short* src2, + short* src3, + short* src4, + unsigned int num_points) { - const unsigned int eighth_points = num_points / 8; - unsigned int number = 0; - - int16x8_t src0_vec, src1_vec, src2_vec, src3_vec, src4_vec; - int16x8_t target0_vec, target1_vec, target2_vec, target3_vec; - for(number = 0; number < eighth_points; ++number) { - src0_vec = vld1q_s16(src0); - src1_vec = vld1q_s16(src1); - src2_vec = vld1q_s16(src2); - src3_vec = vld1q_s16(src3); - src4_vec = vld1q_s16(src4); - - target0_vec = vaddq_s16(src0_vec , src1_vec); - target1_vec = vaddq_s16(src0_vec , src2_vec); - target2_vec = vaddq_s16(src0_vec , src3_vec); - target3_vec = vaddq_s16(src0_vec , src4_vec); - - vst1q_s16(target0, target0_vec); - vst1q_s16(target1, target1_vec); - vst1q_s16(target2, target2_vec); - 
vst1q_s16(target3, target3_vec); - src0 += 8; - src1 += 8; - src2 += 8; - src3 += 8; - src4 += 8; - target0 += 8; - target1 += 8; - target2 += 8; - target3 += 8; - } - - for(number = eighth_points * 8; number < num_points; ++number) { - *target0++ = *src0 + *src1++; - *target1++ = *src0 + *src2++; - *target2++ = *src0 + *src3++; - *target3++ = *src0++ + *src4++; - } + const unsigned int eighth_points = num_points / 8; + unsigned int number = 0; + + int16x8_t src0_vec, src1_vec, src2_vec, src3_vec, src4_vec; + int16x8_t target0_vec, target1_vec, target2_vec, target3_vec; + for (number = 0; number < eighth_points; ++number) { + src0_vec = vld1q_s16(src0); + src1_vec = vld1q_s16(src1); + src2_vec = vld1q_s16(src2); + src3_vec = vld1q_s16(src3); + src4_vec = vld1q_s16(src4); + + target0_vec = vaddq_s16(src0_vec, src1_vec); + target1_vec = vaddq_s16(src0_vec, src2_vec); + target2_vec = vaddq_s16(src0_vec, src3_vec); + target3_vec = vaddq_s16(src0_vec, src4_vec); + + vst1q_s16(target0, target0_vec); + vst1q_s16(target1, target1_vec); + vst1q_s16(target2, target2_vec); + vst1q_s16(target3, target3_vec); + src0 += 8; + src1 += 8; + src2 += 8; + src3 += 8; + src4 += 8; + target0 += 8; + target1 += 8; + target2 += 8; + target3 += 8; + } + + for (number = eighth_points * 8; number < num_points; ++number) { + *target0++ = *src0 + *src1++; + *target1++ = *src0 + *src2++; + *target2++ = *src0 + *src3++; + *target3++ = *src0++ + *src4++; + } } #endif /* LV_HAVE_NEON */ #ifdef LV_HAVE_GENERIC -static inline void -volk_16i_x5_add_quad_16i_x4_generic(short* target0, short* target1, short* target2, short* target3, - short* src0, short* src1, short* src2, short* src3, short* src4, - unsigned int num_points) +static inline void volk_16i_x5_add_quad_16i_x4_generic(short* target0, + short* target1, + short* target2, + short* target3, + short* src0, + short* src1, + short* src2, + short* src3, + short* src4, + unsigned int num_points) { - const unsigned int num_bytes = num_points*2; + const unsigned int num_bytes = num_points * 2; - int i = 0; + int i = 0; - int bound = num_bytes >> 1; + int bound = num_bytes >> 1; - for(i = 0; i < bound; ++i) { - target0[i] = src0[i] + src1[i]; - target1[i] = src0[i] + src2[i]; - target2[i] = src0[i] + src3[i]; - target3[i] = src0[i] + src4[i]; - } + for (i = 0; i < bound; ++i) { + target0[i] = src0[i] + src1[i]; + target1[i] = src0[i] + src2[i]; + target2[i] = src0[i] + src3[i]; + target3[i] = src0[i] + src4[i]; + } } #endif /* LV_HAVE_GENERIC */ diff --git a/kernels/volk/volk_16ic_convert_32fc.h b/kernels/volk/volk_16ic_convert_32fc.h index 84f067c..1453724 100644 --- a/kernels/volk/volk_16ic_convert_32fc.h +++ b/kernels/volk/volk_16ic_convert_32fc.h @@ -30,8 +30,8 @@ * * Dispatcher Prototype * \code - * void volk_16ic_convert_32fc(lv_32fc_t* outputVector, const lv_16sc_t* inputVector, unsigned int num_points) - * \endcode + * void volk_16ic_convert_32fc(lv_32fc_t* outputVector, const lv_16sc_t* inputVector, + * unsigned int num_points) \endcode * * \b Inputs * \li inputVector: The complex 16-bit integer input data buffer. 
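The prototype above widens interleaved 16-bit complex samples into 32-bit float complex samples, element for element. A short calling sketch under the same assumptions as the previous example (volk_malloc()/volk_get_alignment()/volk_free() helpers, illustrative names and sizes); the input is filled through a plain int16_t view, mirroring how the proto-kernels below walk the buffer:

#include <stdint.h>
#include <volk/volk.h>

/* Illustrative sketch: convert lv_16sc_t samples to lv_32fc_t. */
static void convert_16ic_example(void)
{
    const unsigned int num_points = 1024;         /* illustrative size */
    const size_t alignment = volk_get_alignment();

    lv_16sc_t* in = (lv_16sc_t*)volk_malloc(num_points * sizeof(lv_16sc_t), alignment);
    lv_32fc_t* out = (lv_32fc_t*)volk_malloc(num_points * sizeof(lv_32fc_t), alignment);

    /* fill interleaved I/Q pairs through an int16_t view (sample data) */
    int16_t* raw = (int16_t*)in;
    unsigned int n;
    for (n = 0; n < 2 * num_points; n++) {
        raw[n] = (int16_t)(n & 0x7fff);
    }

    /* out[k] = lv_cmake((float)lv_creal(in[k]), (float)lv_cimag(in[k])) */
    volk_16ic_convert_32fc(out, in, num_points);

    volk_free(out);
    volk_free(in);
}

Aligned buffers let the runtime dispatcher select the aligned (_a_) proto-kernels shown in this hunk; buffers of unknown alignment are served by the unaligned (_u_) variants that follow.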
@@ -51,7 +51,9 @@ #ifdef LV_HAVE_AVX2 #include -static inline void volk_16ic_convert_32fc_a_avx2(lv_32fc_t* outputVector, const lv_16sc_t* inputVector, unsigned int num_points) +static inline void volk_16ic_convert_32fc_a_avx2(lv_32fc_t* outputVector, + const lv_16sc_t* inputVector, + unsigned int num_points) { const unsigned int avx_iters = num_points / 8; unsigned int number = 0; @@ -61,36 +63,36 @@ static inline void volk_16ic_convert_32fc_a_avx2(lv_32fc_t* outputVector, const __m256i outValInt; __m128i cplxValue; - for(number = 0; number < avx_iters; number++) - { - cplxValue = _mm_load_si128((__m128i*)complexVectorPtr); - complexVectorPtr += 8; - - outValInt = _mm256_cvtepi16_epi32(cplxValue); - outVal = _mm256_cvtepi32_ps(outValInt); - _mm256_store_ps((float*)outputVectorPtr, outVal); + for (number = 0; number < avx_iters; number++) { + cplxValue = _mm_load_si128((__m128i*)complexVectorPtr); + complexVectorPtr += 8; - outputVectorPtr += 8; - } + outValInt = _mm256_cvtepi16_epi32(cplxValue); + outVal = _mm256_cvtepi32_ps(outValInt); + _mm256_store_ps((float*)outputVectorPtr, outVal); + + outputVectorPtr += 8; + } number = avx_iters * 8; - for(; number < num_points*2; number++) - { - *outputVectorPtr++ = (float)*complexVectorPtr++; - } + for (; number < num_points * 2; number++) { + *outputVectorPtr++ = (float)*complexVectorPtr++; + } } #endif /* LV_HAVE_AVX2 */ #ifdef LV_HAVE_GENERIC -static inline void volk_16ic_convert_32fc_generic(lv_32fc_t* outputVector, const lv_16sc_t* inputVector, unsigned int num_points) +static inline void volk_16ic_convert_32fc_generic(lv_32fc_t* outputVector, + const lv_16sc_t* inputVector, + unsigned int num_points) { unsigned int i; - for(i = 0; i < num_points; i++) - { - outputVector[i] = lv_cmake((float)lv_creal(inputVector[i]), (float)lv_cimag(inputVector[i])); - } + for (i = 0; i < num_points; i++) { + outputVector[i] = + lv_cmake((float)lv_creal(inputVector[i]), (float)lv_cimag(inputVector[i])); + } } #endif /* LV_HAVE_GENERIC */ @@ -99,7 +101,9 @@ static inline void volk_16ic_convert_32fc_generic(lv_32fc_t* outputVector, const #ifdef LV_HAVE_SSE2 #include -static inline void volk_16ic_convert_32fc_a_sse2(lv_32fc_t* outputVector, const lv_16sc_t* inputVector, unsigned int num_points) +static inline void volk_16ic_convert_32fc_a_sse2(lv_32fc_t* outputVector, + const lv_16sc_t* inputVector, + unsigned int num_points) { const unsigned int sse_iters = num_points / 2; @@ -108,18 +112,21 @@ static inline void volk_16ic_convert_32fc_a_sse2(lv_32fc_t* outputVector, const __m128 a; unsigned int number; - for(number = 0; number < sse_iters; number++) - { - a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg - _mm_store_ps((float*)_out, a); - _in += 2; - _out += 2; - } - if (num_points & 1) - { - *_out++ = lv_cmake((float)lv_creal(*_in), (float)lv_cimag(*_in)); - _in++; - } + for (number = 0; number < sse_iters; number++) { + a = _mm_set_ps( + (float)(lv_cimag(_in[1])), + (float)(lv_creal(_in[1])), + (float)(lv_cimag(_in[0])), + (float)(lv_creal( + _in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg + _mm_store_ps((float*)_out, a); + _in += 2; + _out += 2; + } + if (num_points & 1) { + *_out++ = lv_cmake((float)lv_creal(*_in), (float)lv_cimag(*_in)); + _in++; + } } #endif /* LV_HAVE_SSE2 */ @@ -127,7 +134,9 @@ static inline void volk_16ic_convert_32fc_a_sse2(lv_32fc_t* outputVector, const #ifdef LV_HAVE_AVX 
#include -static inline void volk_16ic_convert_32fc_a_avx(lv_32fc_t* outputVector, const lv_16sc_t* inputVector, unsigned int num_points) +static inline void volk_16ic_convert_32fc_a_avx(lv_32fc_t* outputVector, + const lv_16sc_t* inputVector, + unsigned int num_points) { const unsigned int sse_iters = num_points / 4; @@ -136,19 +145,26 @@ static inline void volk_16ic_convert_32fc_a_avx(lv_32fc_t* outputVector, const l __m256 a; unsigned int i, number; - for(number = 0; number < sse_iters; number++) - { - a = _mm256_set_ps((float)(lv_cimag(_in[3])), (float)(lv_creal(_in[3])), (float)(lv_cimag(_in[2])), (float)(lv_creal(_in[2])), (float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg - _mm256_store_ps((float*)_out, a); - _in += 4; - _out += 4; - } + for (number = 0; number < sse_iters; number++) { + a = _mm256_set_ps( + (float)(lv_cimag(_in[3])), + (float)(lv_creal(_in[3])), + (float)(lv_cimag(_in[2])), + (float)(lv_creal(_in[2])), + (float)(lv_cimag(_in[1])), + (float)(lv_creal(_in[1])), + (float)(lv_cimag(_in[0])), + (float)(lv_creal( + _in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg + _mm256_store_ps((float*)_out, a); + _in += 4; + _out += 4; + } _mm256_zeroupper(); - for (i = 0; i < (num_points % 4); ++i) - { - *_out++ = lv_cmake((float)lv_creal(*_in), (float)lv_cimag(*_in)); - _in++; - } + for (i = 0; i < (num_points % 4); ++i) { + *_out++ = lv_cmake((float)lv_creal(*_in), (float)lv_cimag(*_in)); + _in++; + } } #endif /* LV_HAVE_AVX */ @@ -157,7 +173,9 @@ static inline void volk_16ic_convert_32fc_a_avx(lv_32fc_t* outputVector, const l #ifdef LV_HAVE_NEON #include -static inline void volk_16ic_convert_32fc_neon(lv_32fc_t* outputVector, const lv_16sc_t* inputVector, unsigned int num_points) +static inline void volk_16ic_convert_32fc_neon(lv_32fc_t* outputVector, + const lv_16sc_t* inputVector, + unsigned int num_points) { const unsigned int sse_iters = num_points / 2; @@ -169,21 +187,19 @@ static inline void volk_16ic_convert_32fc_neon(lv_32fc_t* outputVector, const lv float32x4_t f32x4; unsigned int i, number; - for(number = 0; number < sse_iters; number++) - { - a16x4 = vld1_s16((const int16_t*)_in); - __VOLK_PREFETCH(_in + 4); - a32x4 = vmovl_s16(a16x4); - f32x4 = vcvtq_f32_s32(a32x4); - vst1q_f32((float32_t*)_out, f32x4); - _in += 2; - _out += 2; - } - for (i = 0; i < (num_points % 2); ++i) - { - *_out++ = lv_cmake((float)lv_creal(*_in), (float)lv_cimag(*_in)); - _in++; - } + for (number = 0; number < sse_iters; number++) { + a16x4 = vld1_s16((const int16_t*)_in); + __VOLK_PREFETCH(_in + 4); + a32x4 = vmovl_s16(a16x4); + f32x4 = vcvtq_f32_s32(a32x4); + vst1q_f32((float32_t*)_out, f32x4); + _in += 2; + _out += 2; + } + for (i = 0; i < (num_points % 2); ++i) { + *_out++ = lv_cmake((float)lv_creal(*_in), (float)lv_cimag(*_in)); + _in++; + } } #endif /* LV_HAVE_NEON */ @@ -198,7 +214,9 @@ static inline void volk_16ic_convert_32fc_neon(lv_32fc_t* outputVector, const lv #ifdef LV_HAVE_AVX2 #include -static inline void volk_16ic_convert_32fc_u_avx2(lv_32fc_t* outputVector, const lv_16sc_t* inputVector, unsigned int num_points) +static inline void volk_16ic_convert_32fc_u_avx2(lv_32fc_t* outputVector, + const lv_16sc_t* inputVector, + unsigned int num_points) { const unsigned int avx_iters = num_points / 8; unsigned int number = 0; @@ -208,23 +226,21 @@ static inline void volk_16ic_convert_32fc_u_avx2(lv_32fc_t* outputVector, const __m256i outValInt; __m128i 
cplxValue; - for(number = 0; number < avx_iters; number++) - { - cplxValue = _mm_loadu_si128((__m128i*)complexVectorPtr); - complexVectorPtr += 8; - - outValInt = _mm256_cvtepi16_epi32(cplxValue); - outVal = _mm256_cvtepi32_ps(outValInt); - _mm256_storeu_ps((float*)outputVectorPtr, outVal); + for (number = 0; number < avx_iters; number++) { + cplxValue = _mm_loadu_si128((__m128i*)complexVectorPtr); + complexVectorPtr += 8; + + outValInt = _mm256_cvtepi16_epi32(cplxValue); + outVal = _mm256_cvtepi32_ps(outValInt); + _mm256_storeu_ps((float*)outputVectorPtr, outVal); - outputVectorPtr += 8; - } + outputVectorPtr += 8; + } number = avx_iters * 8; - for(; number < num_points*2; number++) - { - *outputVectorPtr++ = (float)*complexVectorPtr++; - } + for (; number < num_points * 2; number++) { + *outputVectorPtr++ = (float)*complexVectorPtr++; + } } #endif /* LV_HAVE_AVX2 */ @@ -232,7 +248,9 @@ static inline void volk_16ic_convert_32fc_u_avx2(lv_32fc_t* outputVector, const #ifdef LV_HAVE_SSE2 #include -static inline void volk_16ic_convert_32fc_u_sse2(lv_32fc_t* outputVector, const lv_16sc_t* inputVector, unsigned int num_points) +static inline void volk_16ic_convert_32fc_u_sse2(lv_32fc_t* outputVector, + const lv_16sc_t* inputVector, + unsigned int num_points) { const unsigned int sse_iters = num_points / 2; @@ -241,18 +259,21 @@ static inline void volk_16ic_convert_32fc_u_sse2(lv_32fc_t* outputVector, const __m128 a; unsigned int number; - for(number = 0; number < sse_iters; number++) - { - a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg - _mm_storeu_ps((float*)_out, a); - _in += 2; - _out += 2; - } - if (num_points & 1) - { - *_out++ = lv_cmake((float)lv_creal(*_in), (float)lv_cimag(*_in)); - _in++; - } + for (number = 0; number < sse_iters; number++) { + a = _mm_set_ps( + (float)(lv_cimag(_in[1])), + (float)(lv_creal(_in[1])), + (float)(lv_cimag(_in[0])), + (float)(lv_creal( + _in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg + _mm_storeu_ps((float*)_out, a); + _in += 2; + _out += 2; + } + if (num_points & 1) { + *_out++ = lv_cmake((float)lv_creal(*_in), (float)lv_cimag(*_in)); + _in++; + } } #endif /* LV_HAVE_SSE2 */ @@ -261,7 +282,9 @@ static inline void volk_16ic_convert_32fc_u_sse2(lv_32fc_t* outputVector, const #ifdef LV_HAVE_AVX #include -static inline void volk_16ic_convert_32fc_u_avx(lv_32fc_t* outputVector, const lv_16sc_t* inputVector, unsigned int num_points) +static inline void volk_16ic_convert_32fc_u_avx(lv_32fc_t* outputVector, + const lv_16sc_t* inputVector, + unsigned int num_points) { const unsigned int sse_iters = num_points / 4; @@ -270,21 +293,27 @@ static inline void volk_16ic_convert_32fc_u_avx(lv_32fc_t* outputVector, const l __m256 a; unsigned int i, number; - for(number = 0; number < sse_iters; number++) - { - a = _mm256_set_ps((float)(lv_cimag(_in[3])), (float)(lv_creal(_in[3])), (float)(lv_cimag(_in[2])), (float)(lv_creal(_in[2])), (float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg - _mm256_storeu_ps((float*)_out, a); - _in += 4; - _out += 4; - } + for (number = 0; number < sse_iters; number++) { + a = _mm256_set_ps( + (float)(lv_cimag(_in[3])), + (float)(lv_creal(_in[3])), + (float)(lv_cimag(_in[2])), + (float)(lv_creal(_in[2])), + (float)(lv_cimag(_in[1])), + (float)(lv_creal(_in[1])), + 
(float)(lv_cimag(_in[0])), + (float)(lv_creal( + _in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg + _mm256_storeu_ps((float*)_out, a); + _in += 4; + _out += 4; + } _mm256_zeroupper(); - for (i = 0; i < (num_points % 4); ++i) - { - *_out++ = lv_cmake((float)lv_creal(*_in), (float)lv_cimag(*_in)); - _in++; - } + for (i = 0; i < (num_points % 4); ++i) { + *_out++ = lv_cmake((float)lv_creal(*_in), (float)lv_cimag(*_in)); + _in++; + } } #endif /* LV_HAVE_AVX */ #endif /* INCLUDED_volk_32fc_convert_16ic_u_H */ - diff --git a/kernels/volk/volk_16ic_deinterleave_16i_x2.h b/kernels/volk/volk_16ic_deinterleave_16i_x2.h index 40d10b4..9e784a6 100644 --- a/kernels/volk/volk_16ic_deinterleave_16i_x2.h +++ b/kernels/volk/volk_16ic_deinterleave_16i_x2.h @@ -29,8 +29,8 @@ * * Dispatcher Prototype * \code - * void volk_16ic_deinterleave_16i_x2(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points) - * \endcode + * void volk_16ic_deinterleave_16i_x2(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* + * complexVector, unsigned int num_points) \endcode * * \b Inputs * \li complexVector: The complex input vector. @@ -59,179 +59,241 @@ #ifdef LV_HAVE_AVX2 #include -static inline void -volk_16ic_deinterleave_16i_x2_a_avx2(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points) +static inline void volk_16ic_deinterleave_16i_x2_a_avx2(int16_t* iBuffer, + int16_t* qBuffer, + const lv_16sc_t* complexVector, + unsigned int num_points) { - unsigned int number = 0; - const int8_t* complexVectorPtr = (int8_t*)complexVector; - int16_t* iBufferPtr = iBuffer; - int16_t* qBufferPtr = qBuffer; - - __m256i MoveMask = _mm256_set_epi8(15,14,11,10,7,6,3,2,13,12,9,8,5,4,1,0, 15,14,11,10,7,6,3,2,13,12,9,8,5,4,1,0); - - __m256i iMove2, iMove1; - __m256i complexVal1, complexVal2, iOutputVal, qOutputVal; - - unsigned int sixteenthPoints = num_points / 16; - - for(number = 0; number < sixteenthPoints; number++){ - complexVal1 = _mm256_load_si256((__m256i*)complexVectorPtr); complexVectorPtr += 32; - complexVal2 = _mm256_load_si256((__m256i*)complexVectorPtr); complexVectorPtr += 32; - - iMove2 = _mm256_shuffle_epi8(complexVal2, MoveMask); - iMove1 = _mm256_shuffle_epi8(complexVal1, MoveMask); - - iOutputVal = _mm256_permute2x128_si256(_mm256_permute4x64_epi64(iMove1,0x08),_mm256_permute4x64_epi64(iMove2,0x80),0x30); - qOutputVal = _mm256_permute2x128_si256(_mm256_permute4x64_epi64(iMove1,0x0d),_mm256_permute4x64_epi64(iMove2,0xd0),0x30); - - _mm256_store_si256((__m256i*)iBufferPtr, iOutputVal); - _mm256_store_si256((__m256i*)qBufferPtr, qOutputVal); - - iBufferPtr += 16; - qBufferPtr += 16; - } - - number = sixteenthPoints * 16; - int16_t* int16ComplexVectorPtr = (int16_t*)complexVectorPtr; - for(; number < num_points; number++){ - *iBufferPtr++ = *int16ComplexVectorPtr++; - *qBufferPtr++ = *int16ComplexVectorPtr++; - } + unsigned int number = 0; + const int8_t* complexVectorPtr = (int8_t*)complexVector; + int16_t* iBufferPtr = iBuffer; + int16_t* qBufferPtr = qBuffer; + + __m256i MoveMask = _mm256_set_epi8(15, + 14, + 11, + 10, + 7, + 6, + 3, + 2, + 13, + 12, + 9, + 8, + 5, + 4, + 1, + 0, + 15, + 14, + 11, + 10, + 7, + 6, + 3, + 2, + 13, + 12, + 9, + 8, + 5, + 4, + 1, + 0); + + __m256i iMove2, iMove1; + __m256i complexVal1, complexVal2, iOutputVal, qOutputVal; + + unsigned int sixteenthPoints = num_points / 16; + + for (number = 0; number < sixteenthPoints; number++) { + complexVal1 = _mm256_load_si256((__m256i*)complexVectorPtr); + 
complexVectorPtr += 32; + complexVal2 = _mm256_load_si256((__m256i*)complexVectorPtr); + complexVectorPtr += 32; + + iMove2 = _mm256_shuffle_epi8(complexVal2, MoveMask); + iMove1 = _mm256_shuffle_epi8(complexVal1, MoveMask); + + iOutputVal = _mm256_permute2x128_si256(_mm256_permute4x64_epi64(iMove1, 0x08), + _mm256_permute4x64_epi64(iMove2, 0x80), + 0x30); + qOutputVal = _mm256_permute2x128_si256(_mm256_permute4x64_epi64(iMove1, 0x0d), + _mm256_permute4x64_epi64(iMove2, 0xd0), + 0x30); + + _mm256_store_si256((__m256i*)iBufferPtr, iOutputVal); + _mm256_store_si256((__m256i*)qBufferPtr, qOutputVal); + + iBufferPtr += 16; + qBufferPtr += 16; + } + + number = sixteenthPoints * 16; + int16_t* int16ComplexVectorPtr = (int16_t*)complexVectorPtr; + for (; number < num_points; number++) { + *iBufferPtr++ = *int16ComplexVectorPtr++; + *qBufferPtr++ = *int16ComplexVectorPtr++; + } } #endif /* LV_HAVE_AVX2 */ #ifdef LV_HAVE_SSSE3 #include -static inline void -volk_16ic_deinterleave_16i_x2_a_ssse3(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points) +static inline void volk_16ic_deinterleave_16i_x2_a_ssse3(int16_t* iBuffer, + int16_t* qBuffer, + const lv_16sc_t* complexVector, + unsigned int num_points) { - unsigned int number = 0; - const int8_t* complexVectorPtr = (int8_t*)complexVector; - int16_t* iBufferPtr = iBuffer; - int16_t* qBufferPtr = qBuffer; - - __m128i iMoveMask1 = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0); - __m128i iMoveMask2 = _mm_set_epi8(13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80); - - __m128i qMoveMask1 = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 15, 14, 11, 10, 7, 6, 3, 2); - __m128i qMoveMask2 = _mm_set_epi8(15, 14, 11, 10, 7, 6, 3, 2, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80); - - __m128i complexVal1, complexVal2, iOutputVal, qOutputVal; - - unsigned int eighthPoints = num_points / 8; - - for(number = 0; number < eighthPoints; number++){ - complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; - complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; - - iOutputVal = _mm_or_si128( _mm_shuffle_epi8(complexVal1, iMoveMask1) , _mm_shuffle_epi8(complexVal2, iMoveMask2)); - qOutputVal = _mm_or_si128( _mm_shuffle_epi8(complexVal1, qMoveMask1) , _mm_shuffle_epi8(complexVal2, qMoveMask2)); - - _mm_store_si128((__m128i*)iBufferPtr, iOutputVal); - _mm_store_si128((__m128i*)qBufferPtr, qOutputVal); - - iBufferPtr += 8; - qBufferPtr += 8; - } - - number = eighthPoints * 8; - int16_t* int16ComplexVectorPtr = (int16_t*)complexVectorPtr; - for(; number < num_points; number++){ - *iBufferPtr++ = *int16ComplexVectorPtr++; - *qBufferPtr++ = *int16ComplexVectorPtr++; - } + unsigned int number = 0; + const int8_t* complexVectorPtr = (int8_t*)complexVector; + int16_t* iBufferPtr = iBuffer; + int16_t* qBufferPtr = qBuffer; + + __m128i iMoveMask1 = _mm_set_epi8( + 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0); + __m128i iMoveMask2 = _mm_set_epi8( + 13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80); + + __m128i qMoveMask1 = _mm_set_epi8( + 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 15, 14, 11, 10, 7, 6, 3, 2); + __m128i qMoveMask2 = _mm_set_epi8( + 15, 14, 11, 10, 7, 6, 3, 2, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80); + + __m128i complexVal1, complexVal2, iOutputVal, qOutputVal; + + unsigned int eighthPoints = num_points / 8; + + for (number = 0; 
number < eighthPoints; number++) { + complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr); + complexVectorPtr += 16; + complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr); + complexVectorPtr += 16; + + iOutputVal = _mm_or_si128(_mm_shuffle_epi8(complexVal1, iMoveMask1), + _mm_shuffle_epi8(complexVal2, iMoveMask2)); + qOutputVal = _mm_or_si128(_mm_shuffle_epi8(complexVal1, qMoveMask1), + _mm_shuffle_epi8(complexVal2, qMoveMask2)); + + _mm_store_si128((__m128i*)iBufferPtr, iOutputVal); + _mm_store_si128((__m128i*)qBufferPtr, qOutputVal); + + iBufferPtr += 8; + qBufferPtr += 8; + } + + number = eighthPoints * 8; + int16_t* int16ComplexVectorPtr = (int16_t*)complexVectorPtr; + for (; number < num_points; number++) { + *iBufferPtr++ = *int16ComplexVectorPtr++; + *qBufferPtr++ = *int16ComplexVectorPtr++; + } } #endif /* LV_HAVE_SSSE3 */ #ifdef LV_HAVE_SSE2 #include -static inline void -volk_16ic_deinterleave_16i_x2_a_sse2(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points) +static inline void volk_16ic_deinterleave_16i_x2_a_sse2(int16_t* iBuffer, + int16_t* qBuffer, + const lv_16sc_t* complexVector, + unsigned int num_points) { - unsigned int number = 0; - const int16_t* complexVectorPtr = (int16_t*)complexVector; - int16_t* iBufferPtr = iBuffer; - int16_t* qBufferPtr = qBuffer; - __m128i complexVal1, complexVal2, iComplexVal1, iComplexVal2, qComplexVal1, qComplexVal2, iOutputVal, qOutputVal; - __m128i lowMask = _mm_set_epi32(0x0, 0x0, 0xFFFFFFFF, 0xFFFFFFFF); - __m128i highMask = _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0x0, 0x0); + unsigned int number = 0; + const int16_t* complexVectorPtr = (int16_t*)complexVector; + int16_t* iBufferPtr = iBuffer; + int16_t* qBufferPtr = qBuffer; + __m128i complexVal1, complexVal2, iComplexVal1, iComplexVal2, qComplexVal1, + qComplexVal2, iOutputVal, qOutputVal; + __m128i lowMask = _mm_set_epi32(0x0, 0x0, 0xFFFFFFFF, 0xFFFFFFFF); + __m128i highMask = _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0x0, 0x0); - unsigned int eighthPoints = num_points / 8; + unsigned int eighthPoints = num_points / 8; - for(number = 0; number < eighthPoints; number++){ - complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 8; - complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 8; + for (number = 0; number < eighthPoints; number++) { + complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr); + complexVectorPtr += 8; + complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr); + complexVectorPtr += 8; - iComplexVal1 = _mm_shufflelo_epi16(complexVal1, _MM_SHUFFLE(3,1,2,0)); + iComplexVal1 = _mm_shufflelo_epi16(complexVal1, _MM_SHUFFLE(3, 1, 2, 0)); - iComplexVal1 = _mm_shufflehi_epi16(iComplexVal1, _MM_SHUFFLE(3,1,2,0)); + iComplexVal1 = _mm_shufflehi_epi16(iComplexVal1, _MM_SHUFFLE(3, 1, 2, 0)); - iComplexVal1 = _mm_shuffle_epi32(iComplexVal1, _MM_SHUFFLE(3,1,2,0)); + iComplexVal1 = _mm_shuffle_epi32(iComplexVal1, _MM_SHUFFLE(3, 1, 2, 0)); - iComplexVal2 = _mm_shufflelo_epi16(complexVal2, _MM_SHUFFLE(3,1,2,0)); + iComplexVal2 = _mm_shufflelo_epi16(complexVal2, _MM_SHUFFLE(3, 1, 2, 0)); - iComplexVal2 = _mm_shufflehi_epi16(iComplexVal2, _MM_SHUFFLE(3,1,2,0)); + iComplexVal2 = _mm_shufflehi_epi16(iComplexVal2, _MM_SHUFFLE(3, 1, 2, 0)); - iComplexVal2 = _mm_shuffle_epi32(iComplexVal2, _MM_SHUFFLE(2,0,3,1)); + iComplexVal2 = _mm_shuffle_epi32(iComplexVal2, _MM_SHUFFLE(2, 0, 3, 1)); - iOutputVal = _mm_or_si128(_mm_and_si128(iComplexVal1, lowMask), _mm_and_si128(iComplexVal2, highMask)); + 
iOutputVal = _mm_or_si128(_mm_and_si128(iComplexVal1, lowMask), + _mm_and_si128(iComplexVal2, highMask)); - _mm_store_si128((__m128i*)iBufferPtr, iOutputVal); + _mm_store_si128((__m128i*)iBufferPtr, iOutputVal); - qComplexVal1 = _mm_shufflelo_epi16(complexVal1, _MM_SHUFFLE(2,0,3,1)); + qComplexVal1 = _mm_shufflelo_epi16(complexVal1, _MM_SHUFFLE(2, 0, 3, 1)); - qComplexVal1 = _mm_shufflehi_epi16(qComplexVal1, _MM_SHUFFLE(2,0,3,1)); + qComplexVal1 = _mm_shufflehi_epi16(qComplexVal1, _MM_SHUFFLE(2, 0, 3, 1)); - qComplexVal1 = _mm_shuffle_epi32(qComplexVal1, _MM_SHUFFLE(3,1,2,0)); + qComplexVal1 = _mm_shuffle_epi32(qComplexVal1, _MM_SHUFFLE(3, 1, 2, 0)); - qComplexVal2 = _mm_shufflelo_epi16(complexVal2, _MM_SHUFFLE(2,0,3,1)); + qComplexVal2 = _mm_shufflelo_epi16(complexVal2, _MM_SHUFFLE(2, 0, 3, 1)); - qComplexVal2 = _mm_shufflehi_epi16(qComplexVal2, _MM_SHUFFLE(2,0,3,1)); + qComplexVal2 = _mm_shufflehi_epi16(qComplexVal2, _MM_SHUFFLE(2, 0, 3, 1)); - qComplexVal2 = _mm_shuffle_epi32(qComplexVal2, _MM_SHUFFLE(2,0,3,1)); + qComplexVal2 = _mm_shuffle_epi32(qComplexVal2, _MM_SHUFFLE(2, 0, 3, 1)); - qOutputVal = _mm_or_si128(_mm_and_si128(qComplexVal1, lowMask), _mm_and_si128(qComplexVal2, highMask)); + qOutputVal = _mm_or_si128(_mm_and_si128(qComplexVal1, lowMask), + _mm_and_si128(qComplexVal2, highMask)); - _mm_store_si128((__m128i*)qBufferPtr, qOutputVal); + _mm_store_si128((__m128i*)qBufferPtr, qOutputVal); - iBufferPtr += 8; - qBufferPtr += 8; - } + iBufferPtr += 8; + qBufferPtr += 8; + } - number = eighthPoints * 8; - for(; number < num_points; number++){ - *iBufferPtr++ = *complexVectorPtr++; - *qBufferPtr++ = *complexVectorPtr++; - } + number = eighthPoints * 8; + for (; number < num_points; number++) { + *iBufferPtr++ = *complexVectorPtr++; + *qBufferPtr++ = *complexVectorPtr++; + } } #endif /* LV_HAVE_SSE2 */ #ifdef LV_HAVE_GENERIC -static inline void -volk_16ic_deinterleave_16i_x2_generic(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points) +static inline void volk_16ic_deinterleave_16i_x2_generic(int16_t* iBuffer, + int16_t* qBuffer, + const lv_16sc_t* complexVector, + unsigned int num_points) { - const int16_t* complexVectorPtr = (const int16_t*)complexVector; - int16_t* iBufferPtr = iBuffer; - int16_t* qBufferPtr = qBuffer; - unsigned int number; - for(number = 0; number < num_points; number++){ - *iBufferPtr++ = *complexVectorPtr++; - *qBufferPtr++ = *complexVectorPtr++; - } + const int16_t* complexVectorPtr = (const int16_t*)complexVector; + int16_t* iBufferPtr = iBuffer; + int16_t* qBufferPtr = qBuffer; + unsigned int number; + for (number = 0; number < num_points; number++) { + *iBufferPtr++ = *complexVectorPtr++; + *qBufferPtr++ = *complexVectorPtr++; + } } #endif /* LV_HAVE_GENERIC */ #ifdef LV_HAVE_ORC -extern void -volk_16ic_deinterleave_16i_x2_a_orc_impl(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points); -static inline void -volk_16ic_deinterleave_16i_x2_u_orc(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points) +extern void volk_16ic_deinterleave_16i_x2_a_orc_impl(int16_t* iBuffer, + int16_t* qBuffer, + const lv_16sc_t* complexVector, + unsigned int num_points); +static inline void volk_16ic_deinterleave_16i_x2_u_orc(int16_t* iBuffer, + int16_t* qBuffer, + const lv_16sc_t* complexVector, + unsigned int num_points) { - volk_16ic_deinterleave_16i_x2_a_orc_impl(iBuffer, qBuffer, complexVector, num_points); + volk_16ic_deinterleave_16i_x2_a_orc_impl(iBuffer, 
qBuffer, complexVector, num_points); } #endif /* LV_HAVE_ORC */ @@ -246,44 +308,83 @@ volk_16ic_deinterleave_16i_x2_u_orc(int16_t* iBuffer, int16_t* qBuffer, const lv #ifdef LV_HAVE_AVX2 #include -static inline void -volk_16ic_deinterleave_16i_x2_u_avx2(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points) +static inline void volk_16ic_deinterleave_16i_x2_u_avx2(int16_t* iBuffer, + int16_t* qBuffer, + const lv_16sc_t* complexVector, + unsigned int num_points) { - unsigned int number = 0; - const int8_t* complexVectorPtr = (int8_t*)complexVector; - int16_t* iBufferPtr = iBuffer; - int16_t* qBufferPtr = qBuffer; - - __m256i MoveMask = _mm256_set_epi8(15,14,11,10,7,6,3,2,13,12,9,8,5,4,1,0, 15,14,11,10,7,6,3,2,13,12,9,8,5,4,1,0); - - __m256i iMove2, iMove1; - __m256i complexVal1, complexVal2, iOutputVal, qOutputVal; - - unsigned int sixteenthPoints = num_points / 16; - - for(number = 0; number < sixteenthPoints; number++){ - complexVal1 = _mm256_loadu_si256((__m256i*)complexVectorPtr); complexVectorPtr += 32; - complexVal2 = _mm256_loadu_si256((__m256i*)complexVectorPtr); complexVectorPtr += 32; - - iMove2 = _mm256_shuffle_epi8(complexVal2, MoveMask); - iMove1 = _mm256_shuffle_epi8(complexVal1, MoveMask); - - iOutputVal = _mm256_permute2x128_si256(_mm256_permute4x64_epi64(iMove1,0x08),_mm256_permute4x64_epi64(iMove2,0x80),0x30); - qOutputVal = _mm256_permute2x128_si256(_mm256_permute4x64_epi64(iMove1,0x0d),_mm256_permute4x64_epi64(iMove2,0xd0),0x30); - - _mm256_storeu_si256((__m256i*)iBufferPtr, iOutputVal); - _mm256_storeu_si256((__m256i*)qBufferPtr, qOutputVal); - - iBufferPtr += 16; - qBufferPtr += 16; - } - - number = sixteenthPoints * 16; - int16_t* int16ComplexVectorPtr = (int16_t*)complexVectorPtr; - for(; number < num_points; number++){ - *iBufferPtr++ = *int16ComplexVectorPtr++; - *qBufferPtr++ = *int16ComplexVectorPtr++; - } + unsigned int number = 0; + const int8_t* complexVectorPtr = (int8_t*)complexVector; + int16_t* iBufferPtr = iBuffer; + int16_t* qBufferPtr = qBuffer; + + __m256i MoveMask = _mm256_set_epi8(15, + 14, + 11, + 10, + 7, + 6, + 3, + 2, + 13, + 12, + 9, + 8, + 5, + 4, + 1, + 0, + 15, + 14, + 11, + 10, + 7, + 6, + 3, + 2, + 13, + 12, + 9, + 8, + 5, + 4, + 1, + 0); + + __m256i iMove2, iMove1; + __m256i complexVal1, complexVal2, iOutputVal, qOutputVal; + + unsigned int sixteenthPoints = num_points / 16; + + for (number = 0; number < sixteenthPoints; number++) { + complexVal1 = _mm256_loadu_si256((__m256i*)complexVectorPtr); + complexVectorPtr += 32; + complexVal2 = _mm256_loadu_si256((__m256i*)complexVectorPtr); + complexVectorPtr += 32; + + iMove2 = _mm256_shuffle_epi8(complexVal2, MoveMask); + iMove1 = _mm256_shuffle_epi8(complexVal1, MoveMask); + + iOutputVal = _mm256_permute2x128_si256(_mm256_permute4x64_epi64(iMove1, 0x08), + _mm256_permute4x64_epi64(iMove2, 0x80), + 0x30); + qOutputVal = _mm256_permute2x128_si256(_mm256_permute4x64_epi64(iMove1, 0x0d), + _mm256_permute4x64_epi64(iMove2, 0xd0), + 0x30); + + _mm256_storeu_si256((__m256i*)iBufferPtr, iOutputVal); + _mm256_storeu_si256((__m256i*)qBufferPtr, qOutputVal); + + iBufferPtr += 16; + qBufferPtr += 16; + } + + number = sixteenthPoints * 16; + int16_t* int16ComplexVectorPtr = (int16_t*)complexVectorPtr; + for (; number < num_points; number++) { + *iBufferPtr++ = *int16ComplexVectorPtr++; + *qBufferPtr++ = *int16ComplexVectorPtr++; + } } #endif /* LV_HAVE_AVX2 */ diff --git a/kernels/volk/volk_16ic_deinterleave_real_16i.h b/kernels/volk/volk_16ic_deinterleave_real_16i.h 
index c1de553..45fcd99 100644 --- a/kernels/volk/volk_16ic_deinterleave_real_16i.h +++ b/kernels/volk/volk_16ic_deinterleave_real_16i.h @@ -25,12 +25,13 @@ * * \b Overview * - * Deinterleaves the complex 16 bit vector and returns the real (inphase) part of the signal. + * Deinterleaves the complex 16 bit vector and returns the real (inphase) part of the + * signal. * * Dispatcher Prototype * \code - * void volk_16ic_deinterleave_real_16i(int16_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points) - * \endcode + * void volk_16ic_deinterleave_real_16i(int16_t* iBuffer, const lv_16sc_t* complexVector, + * unsigned int num_points) \endcode * * \b Inputs * \li complexVector: The complex input vector. @@ -60,79 +61,149 @@ #ifdef LV_HAVE_AVX2 #include -static inline void -volk_16ic_deinterleave_real_16i_a_avx2(int16_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points) +static inline void volk_16ic_deinterleave_real_16i_a_avx2(int16_t* iBuffer, + const lv_16sc_t* complexVector, + unsigned int num_points) { - unsigned int number = 0; - const int16_t* complexVectorPtr = (int16_t*)complexVector; - int16_t* iBufferPtr = iBuffer; - - __m256i iMoveMask1 = _mm256_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0); - __m256i iMoveMask2 = _mm256_set_epi8(13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80); - - __m256i complexVal1, complexVal2, iOutputVal; - - unsigned int sixteenthPoints = num_points / 16; - - for(number = 0; number < sixteenthPoints; number++){ - complexVal1 = _mm256_load_si256((__m256i*)complexVectorPtr); complexVectorPtr += 16; - complexVal2 = _mm256_load_si256((__m256i*)complexVectorPtr); complexVectorPtr += 16; - - complexVal1 = _mm256_shuffle_epi8(complexVal1, iMoveMask1); - complexVal2 = _mm256_shuffle_epi8(complexVal2, iMoveMask2); - - iOutputVal = _mm256_or_si256(complexVal1, complexVal2); - iOutputVal = _mm256_permute4x64_epi64(iOutputVal, 0xd8); - - _mm256_store_si256((__m256i*)iBufferPtr, iOutputVal); - - iBufferPtr += 16; - } - - number = sixteenthPoints * 16; - for(; number < num_points; number++){ - *iBufferPtr++ = *complexVectorPtr++; - complexVectorPtr++; - } + unsigned int number = 0; + const int16_t* complexVectorPtr = (int16_t*)complexVector; + int16_t* iBufferPtr = iBuffer; + + __m256i iMoveMask1 = _mm256_set_epi8(0x80, + 0x80, + 0x80, + 0x80, + 0x80, + 0x80, + 0x80, + 0x80, + 13, + 12, + 9, + 8, + 5, + 4, + 1, + 0, + 0x80, + 0x80, + 0x80, + 0x80, + 0x80, + 0x80, + 0x80, + 0x80, + 13, + 12, + 9, + 8, + 5, + 4, + 1, + 0); + __m256i iMoveMask2 = _mm256_set_epi8(13, + 12, + 9, + 8, + 5, + 4, + 1, + 0, + 0x80, + 0x80, + 0x80, + 0x80, + 0x80, + 0x80, + 0x80, + 0x80, + 13, + 12, + 9, + 8, + 5, + 4, + 1, + 0, + 0x80, + 0x80, + 0x80, + 0x80, + 0x80, + 0x80, + 0x80, + 0x80); + + __m256i complexVal1, complexVal2, iOutputVal; + + unsigned int sixteenthPoints = num_points / 16; + + for (number = 0; number < sixteenthPoints; number++) { + complexVal1 = _mm256_load_si256((__m256i*)complexVectorPtr); + complexVectorPtr += 16; + complexVal2 = _mm256_load_si256((__m256i*)complexVectorPtr); + complexVectorPtr += 16; + + complexVal1 = _mm256_shuffle_epi8(complexVal1, iMoveMask1); + complexVal2 = _mm256_shuffle_epi8(complexVal2, iMoveMask2); + + iOutputVal = _mm256_or_si256(complexVal1, complexVal2); + iOutputVal = _mm256_permute4x64_epi64(iOutputVal, 0xd8); + 
+ _mm256_store_si256((__m256i*)iBufferPtr, iOutputVal); + + iBufferPtr += 16; + } + + number = sixteenthPoints * 16; + for (; number < num_points; number++) { + *iBufferPtr++ = *complexVectorPtr++; + complexVectorPtr++; + } } #endif /* LV_HAVE_AVX2 */ #ifdef LV_HAVE_SSSE3 #include -static inline void -volk_16ic_deinterleave_real_16i_a_ssse3(int16_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points) +static inline void volk_16ic_deinterleave_real_16i_a_ssse3(int16_t* iBuffer, + const lv_16sc_t* complexVector, + unsigned int num_points) { - unsigned int number = 0; - const int16_t* complexVectorPtr = (int16_t*)complexVector; - int16_t* iBufferPtr = iBuffer; + unsigned int number = 0; + const int16_t* complexVectorPtr = (int16_t*)complexVector; + int16_t* iBufferPtr = iBuffer; - __m128i iMoveMask1 = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0); - __m128i iMoveMask2 = _mm_set_epi8(13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80); + __m128i iMoveMask1 = _mm_set_epi8( + 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0); + __m128i iMoveMask2 = _mm_set_epi8( + 13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80); - __m128i complexVal1, complexVal2, iOutputVal; + __m128i complexVal1, complexVal2, iOutputVal; - unsigned int eighthPoints = num_points / 8; + unsigned int eighthPoints = num_points / 8; - for(number = 0; number < eighthPoints; number++){ - complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 8; - complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 8; + for (number = 0; number < eighthPoints; number++) { + complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr); + complexVectorPtr += 8; + complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr); + complexVectorPtr += 8; - complexVal1 = _mm_shuffle_epi8(complexVal1, iMoveMask1); - complexVal2 = _mm_shuffle_epi8(complexVal2, iMoveMask2); + complexVal1 = _mm_shuffle_epi8(complexVal1, iMoveMask1); + complexVal2 = _mm_shuffle_epi8(complexVal2, iMoveMask2); - iOutputVal = _mm_or_si128(complexVal1, complexVal2); + iOutputVal = _mm_or_si128(complexVal1, complexVal2); - _mm_store_si128((__m128i*)iBufferPtr, iOutputVal); + _mm_store_si128((__m128i*)iBufferPtr, iOutputVal); - iBufferPtr += 8; - } + iBufferPtr += 8; + } - number = eighthPoints * 8; - for(; number < num_points; number++){ - *iBufferPtr++ = *complexVectorPtr++; - complexVectorPtr++; - } + number = eighthPoints * 8; + for (; number < num_points; number++) { + *iBufferPtr++ = *complexVectorPtr++; + complexVectorPtr++; + } } #endif /* LV_HAVE_SSSE3 */ @@ -140,61 +211,66 @@ volk_16ic_deinterleave_real_16i_a_ssse3(int16_t* iBuffer, const lv_16sc_t* compl #ifdef LV_HAVE_SSE2 #include -static inline void -volk_16ic_deinterleave_real_16i_a_sse2(int16_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points) +static inline void volk_16ic_deinterleave_real_16i_a_sse2(int16_t* iBuffer, + const lv_16sc_t* complexVector, + unsigned int num_points) { - unsigned int number = 0; - const int16_t* complexVectorPtr = (int16_t*)complexVector; - int16_t* iBufferPtr = iBuffer; - __m128i complexVal1, complexVal2, iOutputVal; - __m128i lowMask = _mm_set_epi32(0x0, 0x0, 0xFFFFFFFF, 0xFFFFFFFF); - __m128i highMask = _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0x0, 0x0); + unsigned int number = 0; + const int16_t* complexVectorPtr = (int16_t*)complexVector; + int16_t* iBufferPtr = iBuffer; + __m128i complexVal1, complexVal2, 
iOutputVal; + __m128i lowMask = _mm_set_epi32(0x0, 0x0, 0xFFFFFFFF, 0xFFFFFFFF); + __m128i highMask = _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0x0, 0x0); - unsigned int eighthPoints = num_points / 8; + unsigned int eighthPoints = num_points / 8; - for(number = 0; number < eighthPoints; number++){ - complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 8; - complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 8; + for (number = 0; number < eighthPoints; number++) { + complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr); + complexVectorPtr += 8; + complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr); + complexVectorPtr += 8; - complexVal1 = _mm_shufflelo_epi16(complexVal1, _MM_SHUFFLE(3,1,2,0)); + complexVal1 = _mm_shufflelo_epi16(complexVal1, _MM_SHUFFLE(3, 1, 2, 0)); - complexVal1 = _mm_shufflehi_epi16(complexVal1, _MM_SHUFFLE(3,1,2,0)); + complexVal1 = _mm_shufflehi_epi16(complexVal1, _MM_SHUFFLE(3, 1, 2, 0)); - complexVal1 = _mm_shuffle_epi32(complexVal1, _MM_SHUFFLE(3,1,2,0)); + complexVal1 = _mm_shuffle_epi32(complexVal1, _MM_SHUFFLE(3, 1, 2, 0)); - complexVal2 = _mm_shufflelo_epi16(complexVal2, _MM_SHUFFLE(3,1,2,0)); + complexVal2 = _mm_shufflelo_epi16(complexVal2, _MM_SHUFFLE(3, 1, 2, 0)); - complexVal2 = _mm_shufflehi_epi16(complexVal2, _MM_SHUFFLE(3,1,2,0)); + complexVal2 = _mm_shufflehi_epi16(complexVal2, _MM_SHUFFLE(3, 1, 2, 0)); - complexVal2 = _mm_shuffle_epi32(complexVal2, _MM_SHUFFLE(2,0,3,1)); + complexVal2 = _mm_shuffle_epi32(complexVal2, _MM_SHUFFLE(2, 0, 3, 1)); - iOutputVal = _mm_or_si128(_mm_and_si128(complexVal1, lowMask), _mm_and_si128(complexVal2, highMask)); + iOutputVal = _mm_or_si128(_mm_and_si128(complexVal1, lowMask), + _mm_and_si128(complexVal2, highMask)); - _mm_store_si128((__m128i*)iBufferPtr, iOutputVal); + _mm_store_si128((__m128i*)iBufferPtr, iOutputVal); - iBufferPtr += 8; - } + iBufferPtr += 8; + } - number = eighthPoints * 8; - for(; number < num_points; number++){ - *iBufferPtr++ = *complexVectorPtr++; - complexVectorPtr++; - } + number = eighthPoints * 8; + for (; number < num_points; number++) { + *iBufferPtr++ = *complexVectorPtr++; + complexVectorPtr++; + } } #endif /* LV_HAVE_SSE2 */ #ifdef LV_HAVE_GENERIC -static inline void -volk_16ic_deinterleave_real_16i_generic(int16_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points) +static inline void volk_16ic_deinterleave_real_16i_generic(int16_t* iBuffer, + const lv_16sc_t* complexVector, + unsigned int num_points) { - unsigned int number = 0; - const int16_t* complexVectorPtr = (int16_t*)complexVector; - int16_t* iBufferPtr = iBuffer; - for(number = 0; number < num_points; number++){ - *iBufferPtr++ = *complexVectorPtr++; - complexVectorPtr++; - } + unsigned int number = 0; + const int16_t* complexVectorPtr = (int16_t*)complexVector; + int16_t* iBufferPtr = iBuffer; + for (number = 0; number < num_points; number++) { + *iBufferPtr++ = *complexVectorPtr++; + complexVectorPtr++; + } } #endif /* LV_HAVE_GENERIC */ @@ -212,40 +288,105 @@ volk_16ic_deinterleave_real_16i_generic(int16_t* iBuffer, const lv_16sc_t* compl #ifdef LV_HAVE_AVX2 #include -static inline void -volk_16ic_deinterleave_real_16i_u_avx2(int16_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points) +static inline void volk_16ic_deinterleave_real_16i_u_avx2(int16_t* iBuffer, + const lv_16sc_t* complexVector, + unsigned int num_points) { - unsigned int number = 0; - const int16_t* complexVectorPtr = (int16_t*)complexVector; - int16_t* iBufferPtr = iBuffer; - 
- __m256i iMoveMask1 = _mm256_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0); - __m256i iMoveMask2 = _mm256_set_epi8(13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80); - - __m256i complexVal1, complexVal2, iOutputVal; - - unsigned int sixteenthPoints = num_points / 16; - - for(number = 0; number < sixteenthPoints; number++){ - complexVal1 = _mm256_loadu_si256((__m256i*)complexVectorPtr); complexVectorPtr += 16; - complexVal2 = _mm256_loadu_si256((__m256i*)complexVectorPtr); complexVectorPtr += 16; - - complexVal1 = _mm256_shuffle_epi8(complexVal1, iMoveMask1); - complexVal2 = _mm256_shuffle_epi8(complexVal2, iMoveMask2); - - iOutputVal = _mm256_or_si256(complexVal1, complexVal2); - iOutputVal = _mm256_permute4x64_epi64(iOutputVal, 0xd8); - - _mm256_storeu_si256((__m256i*)iBufferPtr, iOutputVal); - - iBufferPtr += 16; - } - - number = sixteenthPoints * 16; - for(; number < num_points; number++){ - *iBufferPtr++ = *complexVectorPtr++; - complexVectorPtr++; - } + unsigned int number = 0; + const int16_t* complexVectorPtr = (int16_t*)complexVector; + int16_t* iBufferPtr = iBuffer; + + __m256i iMoveMask1 = _mm256_set_epi8(0x80, + 0x80, + 0x80, + 0x80, + 0x80, + 0x80, + 0x80, + 0x80, + 13, + 12, + 9, + 8, + 5, + 4, + 1, + 0, + 0x80, + 0x80, + 0x80, + 0x80, + 0x80, + 0x80, + 0x80, + 0x80, + 13, + 12, + 9, + 8, + 5, + 4, + 1, + 0); + __m256i iMoveMask2 = _mm256_set_epi8(13, + 12, + 9, + 8, + 5, + 4, + 1, + 0, + 0x80, + 0x80, + 0x80, + 0x80, + 0x80, + 0x80, + 0x80, + 0x80, + 13, + 12, + 9, + 8, + 5, + 4, + 1, + 0, + 0x80, + 0x80, + 0x80, + 0x80, + 0x80, + 0x80, + 0x80, + 0x80); + + __m256i complexVal1, complexVal2, iOutputVal; + + unsigned int sixteenthPoints = num_points / 16; + + for (number = 0; number < sixteenthPoints; number++) { + complexVal1 = _mm256_loadu_si256((__m256i*)complexVectorPtr); + complexVectorPtr += 16; + complexVal2 = _mm256_loadu_si256((__m256i*)complexVectorPtr); + complexVectorPtr += 16; + + complexVal1 = _mm256_shuffle_epi8(complexVal1, iMoveMask1); + complexVal2 = _mm256_shuffle_epi8(complexVal2, iMoveMask2); + + iOutputVal = _mm256_or_si256(complexVal1, complexVal2); + iOutputVal = _mm256_permute4x64_epi64(iOutputVal, 0xd8); + + _mm256_storeu_si256((__m256i*)iBufferPtr, iOutputVal); + + iBufferPtr += 16; + } + + number = sixteenthPoints * 16; + for (; number < num_points; number++) { + *iBufferPtr++ = *complexVectorPtr++; + complexVectorPtr++; + } } #endif /* LV_HAVE_AVX2 */ diff --git a/kernels/volk/volk_16ic_deinterleave_real_8i.h b/kernels/volk/volk_16ic_deinterleave_real_8i.h index 1022688..3d8e4ea 100644 --- a/kernels/volk/volk_16ic_deinterleave_real_8i.h +++ b/kernels/volk/volk_16ic_deinterleave_real_8i.h @@ -30,8 +30,8 @@ * * Dispatcher Prototype * \code - * void volk_16ic_deinterleave_real_8i(int8_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points) - * \endcode + * void volk_16ic_deinterleave_real_8i(int8_t* iBuffer, const lv_16sc_t* complexVector, + * unsigned int num_points) \endcode * * \b Inputs * \li complexVector: The complex input vector. 
@@ -61,54 +61,121 @@ #ifdef LV_HAVE_AVX2 #include -static inline void -volk_16ic_deinterleave_real_8i_a_avx2(int8_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points) +static inline void volk_16ic_deinterleave_real_8i_a_avx2(int8_t* iBuffer, + const lv_16sc_t* complexVector, + unsigned int num_points) { - unsigned int number = 0; - const int8_t* complexVectorPtr = (int8_t*)complexVector; - int8_t* iBufferPtr = iBuffer; - __m256i iMoveMask1 = _mm256_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0); - __m256i iMoveMask2 = _mm256_set_epi8(13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80); - __m256i complexVal1, complexVal2, complexVal3, complexVal4, iOutputVal; - - unsigned int thirtysecondPoints = num_points / 32; - - for(number = 0; number < thirtysecondPoints; number++){ - complexVal1 = _mm256_load_si256((__m256i*)complexVectorPtr); complexVectorPtr += 32; - complexVal2 = _mm256_load_si256((__m256i*)complexVectorPtr); complexVectorPtr += 32; - - complexVal3 = _mm256_load_si256((__m256i*)complexVectorPtr); complexVectorPtr += 32; - complexVal4 = _mm256_load_si256((__m256i*)complexVectorPtr); complexVectorPtr += 32; - - complexVal1 = _mm256_shuffle_epi8(complexVal1, iMoveMask1); - complexVal2 = _mm256_shuffle_epi8(complexVal2, iMoveMask2); - - complexVal1 = _mm256_or_si256(complexVal1, complexVal2); - complexVal1 = _mm256_permute4x64_epi64(complexVal1, 0xd8); - - complexVal3 = _mm256_shuffle_epi8(complexVal3, iMoveMask1); - complexVal4 = _mm256_shuffle_epi8(complexVal4, iMoveMask2); - - complexVal3 = _mm256_or_si256(complexVal3, complexVal4); - complexVal3 = _mm256_permute4x64_epi64(complexVal3, 0xd8); - - complexVal1 = _mm256_srai_epi16(complexVal1, 8); - complexVal3 = _mm256_srai_epi16(complexVal3, 8); - - iOutputVal = _mm256_packs_epi16(complexVal1, complexVal3); - iOutputVal = _mm256_permute4x64_epi64(iOutputVal, 0xd8); - - _mm256_store_si256((__m256i*)iBufferPtr, iOutputVal); - - iBufferPtr += 32; - } - - number = thirtysecondPoints * 32; - int16_t* int16ComplexVectorPtr = (int16_t*)complexVectorPtr; - for(; number < num_points; number++){ - *iBufferPtr++ = ((int8_t)(*int16ComplexVectorPtr++ >> 8)); - int16ComplexVectorPtr++; - } + unsigned int number = 0; + const int8_t* complexVectorPtr = (int8_t*)complexVector; + int8_t* iBufferPtr = iBuffer; + __m256i iMoveMask1 = _mm256_set_epi8(0x80, + 0x80, + 0x80, + 0x80, + 0x80, + 0x80, + 0x80, + 0x80, + 13, + 12, + 9, + 8, + 5, + 4, + 1, + 0, + 0x80, + 0x80, + 0x80, + 0x80, + 0x80, + 0x80, + 0x80, + 0x80, + 13, + 12, + 9, + 8, + 5, + 4, + 1, + 0); + __m256i iMoveMask2 = _mm256_set_epi8(13, + 12, + 9, + 8, + 5, + 4, + 1, + 0, + 0x80, + 0x80, + 0x80, + 0x80, + 0x80, + 0x80, + 0x80, + 0x80, + 13, + 12, + 9, + 8, + 5, + 4, + 1, + 0, + 0x80, + 0x80, + 0x80, + 0x80, + 0x80, + 0x80, + 0x80, + 0x80); + __m256i complexVal1, complexVal2, complexVal3, complexVal4, iOutputVal; + + unsigned int thirtysecondPoints = num_points / 32; + + for (number = 0; number < thirtysecondPoints; number++) { + complexVal1 = _mm256_load_si256((__m256i*)complexVectorPtr); + complexVectorPtr += 32; + complexVal2 = _mm256_load_si256((__m256i*)complexVectorPtr); + complexVectorPtr += 32; + + complexVal3 = _mm256_load_si256((__m256i*)complexVectorPtr); + complexVectorPtr += 32; + complexVal4 = _mm256_load_si256((__m256i*)complexVectorPtr); + complexVectorPtr 
+= 32; + + complexVal1 = _mm256_shuffle_epi8(complexVal1, iMoveMask1); + complexVal2 = _mm256_shuffle_epi8(complexVal2, iMoveMask2); + + complexVal1 = _mm256_or_si256(complexVal1, complexVal2); + complexVal1 = _mm256_permute4x64_epi64(complexVal1, 0xd8); + + complexVal3 = _mm256_shuffle_epi8(complexVal3, iMoveMask1); + complexVal4 = _mm256_shuffle_epi8(complexVal4, iMoveMask2); + + complexVal3 = _mm256_or_si256(complexVal3, complexVal4); + complexVal3 = _mm256_permute4x64_epi64(complexVal3, 0xd8); + + complexVal1 = _mm256_srai_epi16(complexVal1, 8); + complexVal3 = _mm256_srai_epi16(complexVal3, 8); + + iOutputVal = _mm256_packs_epi16(complexVal1, complexVal3); + iOutputVal = _mm256_permute4x64_epi64(iOutputVal, 0xd8); + + _mm256_store_si256((__m256i*)iBufferPtr, iOutputVal); + + iBufferPtr += 32; + } + + number = thirtysecondPoints * 32; + int16_t* int16ComplexVectorPtr = (int16_t*)complexVectorPtr; + for (; number < num_points; number++) { + *iBufferPtr++ = ((int8_t)(*int16ComplexVectorPtr++ >> 8)); + int16ComplexVectorPtr++; + } } #endif /* LV_HAVE_AVX2 */ @@ -116,105 +183,116 @@ volk_16ic_deinterleave_real_8i_a_avx2(int8_t* iBuffer, const lv_16sc_t* complexV #ifdef LV_HAVE_SSSE3 #include -static inline void -volk_16ic_deinterleave_real_8i_a_ssse3(int8_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points) +static inline void volk_16ic_deinterleave_real_8i_a_ssse3(int8_t* iBuffer, + const lv_16sc_t* complexVector, + unsigned int num_points) { - unsigned int number = 0; - const int8_t* complexVectorPtr = (int8_t*)complexVector; - int8_t* iBufferPtr = iBuffer; - __m128i iMoveMask1 = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0); - __m128i iMoveMask2 = _mm_set_epi8(13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80); - __m128i complexVal1, complexVal2, complexVal3, complexVal4, iOutputVal; + unsigned int number = 0; + const int8_t* complexVectorPtr = (int8_t*)complexVector; + int8_t* iBufferPtr = iBuffer; + __m128i iMoveMask1 = _mm_set_epi8( + 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0); + __m128i iMoveMask2 = _mm_set_epi8( + 13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80); + __m128i complexVal1, complexVal2, complexVal3, complexVal4, iOutputVal; - unsigned int sixteenthPoints = num_points / 16; + unsigned int sixteenthPoints = num_points / 16; - for(number = 0; number < sixteenthPoints; number++){ - complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; - complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; + for (number = 0; number < sixteenthPoints; number++) { + complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr); + complexVectorPtr += 16; + complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr); + complexVectorPtr += 16; - complexVal3 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; - complexVal4 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; + complexVal3 = _mm_load_si128((__m128i*)complexVectorPtr); + complexVectorPtr += 16; + complexVal4 = _mm_load_si128((__m128i*)complexVectorPtr); + complexVectorPtr += 16; - complexVal1 = _mm_shuffle_epi8(complexVal1, iMoveMask1); - complexVal2 = _mm_shuffle_epi8(complexVal2, iMoveMask2); + complexVal1 = _mm_shuffle_epi8(complexVal1, iMoveMask1); + complexVal2 = _mm_shuffle_epi8(complexVal2, iMoveMask2); - complexVal1 = _mm_or_si128(complexVal1, complexVal2); + complexVal1 = _mm_or_si128(complexVal1, 
complexVal2); - complexVal3 = _mm_shuffle_epi8(complexVal3, iMoveMask1); - complexVal4 = _mm_shuffle_epi8(complexVal4, iMoveMask2); + complexVal3 = _mm_shuffle_epi8(complexVal3, iMoveMask1); + complexVal4 = _mm_shuffle_epi8(complexVal4, iMoveMask2); - complexVal3 = _mm_or_si128(complexVal3, complexVal4); + complexVal3 = _mm_or_si128(complexVal3, complexVal4); - complexVal1 = _mm_srai_epi16(complexVal1, 8); - complexVal3 = _mm_srai_epi16(complexVal3, 8); + complexVal1 = _mm_srai_epi16(complexVal1, 8); + complexVal3 = _mm_srai_epi16(complexVal3, 8); - iOutputVal = _mm_packs_epi16(complexVal1, complexVal3); + iOutputVal = _mm_packs_epi16(complexVal1, complexVal3); - _mm_store_si128((__m128i*)iBufferPtr, iOutputVal); + _mm_store_si128((__m128i*)iBufferPtr, iOutputVal); - iBufferPtr += 16; - } + iBufferPtr += 16; + } - number = sixteenthPoints * 16; - int16_t* int16ComplexVectorPtr = (int16_t*)complexVectorPtr; - for(; number < num_points; number++){ - *iBufferPtr++ = ((int8_t)(*int16ComplexVectorPtr++ >> 8)); - int16ComplexVectorPtr++; - } + number = sixteenthPoints * 16; + int16_t* int16ComplexVectorPtr = (int16_t*)complexVectorPtr; + for (; number < num_points; number++) { + *iBufferPtr++ = ((int8_t)(*int16ComplexVectorPtr++ >> 8)); + int16ComplexVectorPtr++; + } } #endif /* LV_HAVE_SSSE3 */ #ifdef LV_HAVE_GENERIC -static inline void -volk_16ic_deinterleave_real_8i_generic(int8_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points) +static inline void volk_16ic_deinterleave_real_8i_generic(int8_t* iBuffer, + const lv_16sc_t* complexVector, + unsigned int num_points) { - unsigned int number = 0; - int16_t* complexVectorPtr = (int16_t*)complexVector; - int8_t* iBufferPtr = iBuffer; - for(number = 0; number < num_points; number++){ - *iBufferPtr++ = ((int8_t)(*complexVectorPtr++ >> 8)); - complexVectorPtr++; - } + unsigned int number = 0; + int16_t* complexVectorPtr = (int16_t*)complexVector; + int8_t* iBufferPtr = iBuffer; + for (number = 0; number < num_points; number++) { + *iBufferPtr++ = ((int8_t)(*complexVectorPtr++ >> 8)); + complexVectorPtr++; + } } #endif /* LV_HAVE_GENERIC */ #ifdef LV_HAVE_NEON #include -static inline void -volk_16ic_deinterleave_real_8i_neon(int8_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points) +static inline void volk_16ic_deinterleave_real_8i_neon(int8_t* iBuffer, + const lv_16sc_t* complexVector, + unsigned int num_points) { - const int16_t* complexVectorPtr = (const int16_t*)complexVector; - int8_t* iBufferPtr = iBuffer; - unsigned int eighth_points = num_points / 8; - unsigned int number; - - int16x8x2_t complexInput; - int8x8_t realOutput; - for(number = 0; number < eighth_points; number++){ - complexInput = vld2q_s16(complexVectorPtr); - realOutput = vshrn_n_s16(complexInput.val[0], 8); - vst1_s8(iBufferPtr, realOutput); - complexVectorPtr += 16; - iBufferPtr += 8; - } - - for(number = eighth_points*8; number < num_points; number++){ - *iBufferPtr++ = ((int8_t)(*complexVectorPtr++ >> 8)); - complexVectorPtr++; - } + const int16_t* complexVectorPtr = (const int16_t*)complexVector; + int8_t* iBufferPtr = iBuffer; + unsigned int eighth_points = num_points / 8; + unsigned int number; + + int16x8x2_t complexInput; + int8x8_t realOutput; + for (number = 0; number < eighth_points; number++) { + complexInput = vld2q_s16(complexVectorPtr); + realOutput = vshrn_n_s16(complexInput.val[0], 8); + vst1_s8(iBufferPtr, realOutput); + complexVectorPtr += 16; + iBufferPtr += 8; + } + + for (number = eighth_points * 8; number < num_points; 
number++) { + *iBufferPtr++ = ((int8_t)(*complexVectorPtr++ >> 8)); + complexVectorPtr++; + } } #endif #ifdef LV_HAVE_ORC -extern void -volk_16ic_deinterleave_real_8i_a_orc_impl(int8_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points); +extern void volk_16ic_deinterleave_real_8i_a_orc_impl(int8_t* iBuffer, + const lv_16sc_t* complexVector, + unsigned int num_points); -static inline void -volk_16ic_deinterleave_real_8i_u_orc(int8_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points) +static inline void volk_16ic_deinterleave_real_8i_u_orc(int8_t* iBuffer, + const lv_16sc_t* complexVector, + unsigned int num_points) { volk_16ic_deinterleave_real_8i_a_orc_impl(iBuffer, complexVector, num_points); } @@ -233,54 +311,121 @@ volk_16ic_deinterleave_real_8i_u_orc(int8_t* iBuffer, const lv_16sc_t* complexVe #ifdef LV_HAVE_AVX2 #include -static inline void -volk_16ic_deinterleave_real_8i_u_avx2(int8_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points) +static inline void volk_16ic_deinterleave_real_8i_u_avx2(int8_t* iBuffer, + const lv_16sc_t* complexVector, + unsigned int num_points) { - unsigned int number = 0; - const int8_t* complexVectorPtr = (int8_t*)complexVector; - int8_t* iBufferPtr = iBuffer; - __m256i iMoveMask1 = _mm256_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0); - __m256i iMoveMask2 = _mm256_set_epi8(13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80); - __m256i complexVal1, complexVal2, complexVal3, complexVal4, iOutputVal; - - unsigned int thirtysecondPoints = num_points / 32; - - for(number = 0; number < thirtysecondPoints; number++){ - complexVal1 = _mm256_loadu_si256((__m256i*)complexVectorPtr); complexVectorPtr += 32; - complexVal2 = _mm256_loadu_si256((__m256i*)complexVectorPtr); complexVectorPtr += 32; - - complexVal3 = _mm256_loadu_si256((__m256i*)complexVectorPtr); complexVectorPtr += 32; - complexVal4 = _mm256_loadu_si256((__m256i*)complexVectorPtr); complexVectorPtr += 32; - - complexVal1 = _mm256_shuffle_epi8(complexVal1, iMoveMask1); - complexVal2 = _mm256_shuffle_epi8(complexVal2, iMoveMask2); - - complexVal1 = _mm256_or_si256(complexVal1, complexVal2); - complexVal1 = _mm256_permute4x64_epi64(complexVal1, 0xd8); - - complexVal3 = _mm256_shuffle_epi8(complexVal3, iMoveMask1); - complexVal4 = _mm256_shuffle_epi8(complexVal4, iMoveMask2); - - complexVal3 = _mm256_or_si256(complexVal3, complexVal4); - complexVal3 = _mm256_permute4x64_epi64(complexVal3, 0xd8); - - complexVal1 = _mm256_srai_epi16(complexVal1, 8); - complexVal3 = _mm256_srai_epi16(complexVal3, 8); - - iOutputVal = _mm256_packs_epi16(complexVal1, complexVal3); - iOutputVal = _mm256_permute4x64_epi64(iOutputVal, 0xd8); - - _mm256_storeu_si256((__m256i*)iBufferPtr, iOutputVal); - - iBufferPtr += 32; - } - - number = thirtysecondPoints * 32; - int16_t* int16ComplexVectorPtr = (int16_t*)complexVectorPtr; - for(; number < num_points; number++){ - *iBufferPtr++ = ((int8_t)(*int16ComplexVectorPtr++ >> 8)); - int16ComplexVectorPtr++; - } + unsigned int number = 0; + const int8_t* complexVectorPtr = (int8_t*)complexVector; + int8_t* iBufferPtr = iBuffer; + __m256i iMoveMask1 = _mm256_set_epi8(0x80, + 0x80, + 0x80, + 0x80, + 0x80, + 0x80, + 0x80, + 0x80, + 13, + 12, + 9, + 8, + 5, + 4, + 1, + 0, + 0x80, + 0x80, + 0x80, + 0x80, + 0x80, + 0x80, + 0x80, + 0x80, + 13, + 12, 
+ 9, + 8, + 5, + 4, + 1, + 0); + __m256i iMoveMask2 = _mm256_set_epi8(13, + 12, + 9, + 8, + 5, + 4, + 1, + 0, + 0x80, + 0x80, + 0x80, + 0x80, + 0x80, + 0x80, + 0x80, + 0x80, + 13, + 12, + 9, + 8, + 5, + 4, + 1, + 0, + 0x80, + 0x80, + 0x80, + 0x80, + 0x80, + 0x80, + 0x80, + 0x80); + __m256i complexVal1, complexVal2, complexVal3, complexVal4, iOutputVal; + + unsigned int thirtysecondPoints = num_points / 32; + + for (number = 0; number < thirtysecondPoints; number++) { + complexVal1 = _mm256_loadu_si256((__m256i*)complexVectorPtr); + complexVectorPtr += 32; + complexVal2 = _mm256_loadu_si256((__m256i*)complexVectorPtr); + complexVectorPtr += 32; + + complexVal3 = _mm256_loadu_si256((__m256i*)complexVectorPtr); + complexVectorPtr += 32; + complexVal4 = _mm256_loadu_si256((__m256i*)complexVectorPtr); + complexVectorPtr += 32; + + complexVal1 = _mm256_shuffle_epi8(complexVal1, iMoveMask1); + complexVal2 = _mm256_shuffle_epi8(complexVal2, iMoveMask2); + + complexVal1 = _mm256_or_si256(complexVal1, complexVal2); + complexVal1 = _mm256_permute4x64_epi64(complexVal1, 0xd8); + + complexVal3 = _mm256_shuffle_epi8(complexVal3, iMoveMask1); + complexVal4 = _mm256_shuffle_epi8(complexVal4, iMoveMask2); + + complexVal3 = _mm256_or_si256(complexVal3, complexVal4); + complexVal3 = _mm256_permute4x64_epi64(complexVal3, 0xd8); + + complexVal1 = _mm256_srai_epi16(complexVal1, 8); + complexVal3 = _mm256_srai_epi16(complexVal3, 8); + + iOutputVal = _mm256_packs_epi16(complexVal1, complexVal3); + iOutputVal = _mm256_permute4x64_epi64(iOutputVal, 0xd8); + + _mm256_storeu_si256((__m256i*)iBufferPtr, iOutputVal); + + iBufferPtr += 32; + } + + number = thirtysecondPoints * 32; + int16_t* int16ComplexVectorPtr = (int16_t*)complexVectorPtr; + for (; number < num_points; number++) { + *iBufferPtr++ = ((int8_t)(*int16ComplexVectorPtr++ >> 8)); + int16ComplexVectorPtr++; + } } #endif /* LV_HAVE_AVX2 */ #endif /* INCLUDED_volk_16ic_deinterleave_real_8i_u_H */ diff --git a/kernels/volk/volk_16ic_magnitude_16i.h b/kernels/volk/volk_16ic_magnitude_16i.h index bbe72a8..35b40cb 100644 --- a/kernels/volk/volk_16ic_magnitude_16i.h +++ b/kernels/volk/volk_16ic_magnitude_16i.h @@ -30,8 +30,8 @@ * * Dispatcher Prototype * \code - * void volk_16ic_magnitude_16i(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points) - * \endcode + * void volk_16ic_magnitude_16i(int16_t* magnitudeVector, const lv_16sc_t* complexVector, + * unsigned int num_points) \endcode * * \b Inputs * \li complexVector: The complex input vector. 
@@ -54,242 +54,255 @@ #ifndef INCLUDED_volk_16ic_magnitude_16i_a_H #define INCLUDED_volk_16ic_magnitude_16i_a_H -#include #include -#include -#include #include +#include +#include +#include #ifdef LV_HAVE_AVX2 #include -static inline void -volk_16ic_magnitude_16i_a_avx2(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points) +static inline void volk_16ic_magnitude_16i_a_avx2(int16_t* magnitudeVector, + const lv_16sc_t* complexVector, + unsigned int num_points) { - unsigned int number = 0; - const unsigned int eighthPoints = num_points / 8; - - const int16_t* complexVectorPtr = (const int16_t*)complexVector; - int16_t* magnitudeVectorPtr = magnitudeVector; - - __m256 vScalar = _mm256_set1_ps(SHRT_MAX); - __m256 invScalar = _mm256_set1_ps(1.0f/SHRT_MAX); - __m256i int1, int2; - __m128i short1, short2; - __m256 cplxValue1, cplxValue2, result; - __m256i idx = _mm256_set_epi32(0,0,0,0,5,1,4,0); - - for(;number < eighthPoints; number++){ - - int1 = _mm256_load_si256((__m256i*)complexVectorPtr); - complexVectorPtr += 16; - short1 = _mm256_extracti128_si256(int1,0); - short2 = _mm256_extracti128_si256(int1,1); - - int1 = _mm256_cvtepi16_epi32(short1); - int2 = _mm256_cvtepi16_epi32(short2); - cplxValue1 = _mm256_cvtepi32_ps(int1); - cplxValue2 = _mm256_cvtepi32_ps(int2); - - cplxValue1 = _mm256_mul_ps(cplxValue1, invScalar); - cplxValue2 = _mm256_mul_ps(cplxValue2, invScalar); - - cplxValue1 = _mm256_mul_ps(cplxValue1, cplxValue1); // Square the values - cplxValue2 = _mm256_mul_ps(cplxValue2, cplxValue2); // Square the Values - - result = _mm256_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values - - result = _mm256_sqrt_ps(result); // Square root the values - - result = _mm256_mul_ps(result, vScalar); // Scale the results - - int1 = _mm256_cvtps_epi32(result); - int1 = _mm256_packs_epi32(int1, int1); - int1 = _mm256_permutevar8x32_epi32(int1, idx); //permute to compensate for shuffling in hadd and packs - short1 = _mm256_extracti128_si256(int1, 0); - _mm_store_si128((__m128i*)magnitudeVectorPtr,short1); - magnitudeVectorPtr += 8; - } - - number = eighthPoints * 8; - magnitudeVectorPtr = &magnitudeVector[number]; - complexVectorPtr = (const int16_t*)&complexVector[number]; - for(; number < num_points; number++){ - const float val1Real = (float)(*complexVectorPtr++) / SHRT_MAX; - const float val1Imag = (float)(*complexVectorPtr++) / SHRT_MAX; - const float val1Result = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)) * SHRT_MAX; - *magnitudeVectorPtr++ = (int16_t)rintf(val1Result); - } + unsigned int number = 0; + const unsigned int eighthPoints = num_points / 8; + + const int16_t* complexVectorPtr = (const int16_t*)complexVector; + int16_t* magnitudeVectorPtr = magnitudeVector; + + __m256 vScalar = _mm256_set1_ps(SHRT_MAX); + __m256 invScalar = _mm256_set1_ps(1.0f / SHRT_MAX); + __m256i int1, int2; + __m128i short1, short2; + __m256 cplxValue1, cplxValue2, result; + __m256i idx = _mm256_set_epi32(0, 0, 0, 0, 5, 1, 4, 0); + + for (; number < eighthPoints; number++) { + + int1 = _mm256_load_si256((__m256i*)complexVectorPtr); + complexVectorPtr += 16; + short1 = _mm256_extracti128_si256(int1, 0); + short2 = _mm256_extracti128_si256(int1, 1); + + int1 = _mm256_cvtepi16_epi32(short1); + int2 = _mm256_cvtepi16_epi32(short2); + cplxValue1 = _mm256_cvtepi32_ps(int1); + cplxValue2 = _mm256_cvtepi32_ps(int2); + + cplxValue1 = _mm256_mul_ps(cplxValue1, invScalar); + cplxValue2 = _mm256_mul_ps(cplxValue2, invScalar); + + cplxValue1 = _mm256_mul_ps(cplxValue1, 
cplxValue1); // Square the values + cplxValue2 = _mm256_mul_ps(cplxValue2, cplxValue2); // Square the Values + + result = _mm256_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values + + result = _mm256_sqrt_ps(result); // Square root the values + + result = _mm256_mul_ps(result, vScalar); // Scale the results + + int1 = _mm256_cvtps_epi32(result); + int1 = _mm256_packs_epi32(int1, int1); + int1 = _mm256_permutevar8x32_epi32( + int1, idx); // permute to compensate for shuffling in hadd and packs + short1 = _mm256_extracti128_si256(int1, 0); + _mm_store_si128((__m128i*)magnitudeVectorPtr, short1); + magnitudeVectorPtr += 8; + } + + number = eighthPoints * 8; + magnitudeVectorPtr = &magnitudeVector[number]; + complexVectorPtr = (const int16_t*)&complexVector[number]; + for (; number < num_points; number++) { + const float val1Real = (float)(*complexVectorPtr++) / SHRT_MAX; + const float val1Imag = (float)(*complexVectorPtr++) / SHRT_MAX; + const float val1Result = + sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)) * SHRT_MAX; + *magnitudeVectorPtr++ = (int16_t)rintf(val1Result); + } } #endif /* LV_HAVE_AVX2 */ #ifdef LV_HAVE_SSE3 #include -static inline void -volk_16ic_magnitude_16i_a_sse3(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points) +static inline void volk_16ic_magnitude_16i_a_sse3(int16_t* magnitudeVector, + const lv_16sc_t* complexVector, + unsigned int num_points) { - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; - const int16_t* complexVectorPtr = (const int16_t*)complexVector; - int16_t* magnitudeVectorPtr = magnitudeVector; + const int16_t* complexVectorPtr = (const int16_t*)complexVector; + int16_t* magnitudeVectorPtr = magnitudeVector; - __m128 vScalar = _mm_set_ps1(SHRT_MAX); - __m128 invScalar = _mm_set_ps1(1.0f/SHRT_MAX); + __m128 vScalar = _mm_set_ps1(SHRT_MAX); + __m128 invScalar = _mm_set_ps1(1.0f / SHRT_MAX); - __m128 cplxValue1, cplxValue2, result; + __m128 cplxValue1, cplxValue2, result; - __VOLK_ATTR_ALIGNED(16) float inputFloatBuffer[8]; - __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4]; + __VOLK_ATTR_ALIGNED(16) float inputFloatBuffer[8]; + __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4]; - for(;number < quarterPoints; number++){ + for (; number < quarterPoints; number++) { - inputFloatBuffer[0] = (float)(complexVectorPtr[0]); - inputFloatBuffer[1] = (float)(complexVectorPtr[1]); - inputFloatBuffer[2] = (float)(complexVectorPtr[2]); - inputFloatBuffer[3] = (float)(complexVectorPtr[3]); + inputFloatBuffer[0] = (float)(complexVectorPtr[0]); + inputFloatBuffer[1] = (float)(complexVectorPtr[1]); + inputFloatBuffer[2] = (float)(complexVectorPtr[2]); + inputFloatBuffer[3] = (float)(complexVectorPtr[3]); - inputFloatBuffer[4] = (float)(complexVectorPtr[4]); - inputFloatBuffer[5] = (float)(complexVectorPtr[5]); - inputFloatBuffer[6] = (float)(complexVectorPtr[6]); - inputFloatBuffer[7] = (float)(complexVectorPtr[7]); + inputFloatBuffer[4] = (float)(complexVectorPtr[4]); + inputFloatBuffer[5] = (float)(complexVectorPtr[5]); + inputFloatBuffer[6] = (float)(complexVectorPtr[6]); + inputFloatBuffer[7] = (float)(complexVectorPtr[7]); - cplxValue1 = _mm_load_ps(&inputFloatBuffer[0]); - cplxValue2 = _mm_load_ps(&inputFloatBuffer[4]); + cplxValue1 = _mm_load_ps(&inputFloatBuffer[0]); + cplxValue2 = _mm_load_ps(&inputFloatBuffer[4]); - complexVectorPtr += 8; + complexVectorPtr += 8; - cplxValue1 = _mm_mul_ps(cplxValue1, 
invScalar); - cplxValue2 = _mm_mul_ps(cplxValue2, invScalar); + cplxValue1 = _mm_mul_ps(cplxValue1, invScalar); + cplxValue2 = _mm_mul_ps(cplxValue2, invScalar); - cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values - cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values + cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values + cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values - result = _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values + result = _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values - result = _mm_sqrt_ps(result); // Square root the values + result = _mm_sqrt_ps(result); // Square root the values - result = _mm_mul_ps(result, vScalar); // Scale the results + result = _mm_mul_ps(result, vScalar); // Scale the results - _mm_store_ps(outputFloatBuffer, result); - *magnitudeVectorPtr++ = (int16_t)rintf(outputFloatBuffer[0]); - *magnitudeVectorPtr++ = (int16_t)rintf(outputFloatBuffer[1]); - *magnitudeVectorPtr++ = (int16_t)rintf(outputFloatBuffer[2]); - *magnitudeVectorPtr++ = (int16_t)rintf(outputFloatBuffer[3]); - } + _mm_store_ps(outputFloatBuffer, result); + *magnitudeVectorPtr++ = (int16_t)rintf(outputFloatBuffer[0]); + *magnitudeVectorPtr++ = (int16_t)rintf(outputFloatBuffer[1]); + *magnitudeVectorPtr++ = (int16_t)rintf(outputFloatBuffer[2]); + *magnitudeVectorPtr++ = (int16_t)rintf(outputFloatBuffer[3]); + } - number = quarterPoints * 4; - magnitudeVectorPtr = &magnitudeVector[number]; - complexVectorPtr = (const int16_t*)&complexVector[number]; - for(; number < num_points; number++){ - const float val1Real = (float)(*complexVectorPtr++) / SHRT_MAX; - const float val1Imag = (float)(*complexVectorPtr++) / SHRT_MAX; - const float val1Result = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)) * SHRT_MAX; - *magnitudeVectorPtr++ = (int16_t)rintf(val1Result); - } + number = quarterPoints * 4; + magnitudeVectorPtr = &magnitudeVector[number]; + complexVectorPtr = (const int16_t*)&complexVector[number]; + for (; number < num_points; number++) { + const float val1Real = (float)(*complexVectorPtr++) / SHRT_MAX; + const float val1Imag = (float)(*complexVectorPtr++) / SHRT_MAX; + const float val1Result = + sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)) * SHRT_MAX; + *magnitudeVectorPtr++ = (int16_t)rintf(val1Result); + } } #endif /* LV_HAVE_SSE3 */ #ifdef LV_HAVE_SSE #include -static inline void -volk_16ic_magnitude_16i_a_sse(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points) +static inline void volk_16ic_magnitude_16i_a_sse(int16_t* magnitudeVector, + const lv_16sc_t* complexVector, + unsigned int num_points) { - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; - const int16_t* complexVectorPtr = (const int16_t*)complexVector; - int16_t* magnitudeVectorPtr = magnitudeVector; + const int16_t* complexVectorPtr = (const int16_t*)complexVector; + int16_t* magnitudeVectorPtr = magnitudeVector; - __m128 vScalar = _mm_set_ps1(SHRT_MAX); - __m128 invScalar = _mm_set_ps1(1.0f/SHRT_MAX); + __m128 vScalar = _mm_set_ps1(SHRT_MAX); + __m128 invScalar = _mm_set_ps1(1.0f / SHRT_MAX); - __m128 cplxValue1, cplxValue2, iValue, qValue, result; + __m128 cplxValue1, cplxValue2, iValue, qValue, result; - __VOLK_ATTR_ALIGNED(16) float inputFloatBuffer[4]; - __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4]; + __VOLK_ATTR_ALIGNED(16) float inputFloatBuffer[4]; + 
__VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4]; - for(;number < quarterPoints; number++){ + for (; number < quarterPoints; number++) { - inputFloatBuffer[0] = (float)(complexVectorPtr[0]); - inputFloatBuffer[1] = (float)(complexVectorPtr[1]); - inputFloatBuffer[2] = (float)(complexVectorPtr[2]); - inputFloatBuffer[3] = (float)(complexVectorPtr[3]); + inputFloatBuffer[0] = (float)(complexVectorPtr[0]); + inputFloatBuffer[1] = (float)(complexVectorPtr[1]); + inputFloatBuffer[2] = (float)(complexVectorPtr[2]); + inputFloatBuffer[3] = (float)(complexVectorPtr[3]); - cplxValue1 = _mm_load_ps(inputFloatBuffer); - complexVectorPtr += 4; + cplxValue1 = _mm_load_ps(inputFloatBuffer); + complexVectorPtr += 4; - inputFloatBuffer[0] = (float)(complexVectorPtr[0]); - inputFloatBuffer[1] = (float)(complexVectorPtr[1]); - inputFloatBuffer[2] = (float)(complexVectorPtr[2]); - inputFloatBuffer[3] = (float)(complexVectorPtr[3]); + inputFloatBuffer[0] = (float)(complexVectorPtr[0]); + inputFloatBuffer[1] = (float)(complexVectorPtr[1]); + inputFloatBuffer[2] = (float)(complexVectorPtr[2]); + inputFloatBuffer[3] = (float)(complexVectorPtr[3]); - cplxValue2 = _mm_load_ps(inputFloatBuffer); - complexVectorPtr += 4; + cplxValue2 = _mm_load_ps(inputFloatBuffer); + complexVectorPtr += 4; - cplxValue1 = _mm_mul_ps(cplxValue1, invScalar); - cplxValue2 = _mm_mul_ps(cplxValue2, invScalar); + cplxValue1 = _mm_mul_ps(cplxValue1, invScalar); + cplxValue2 = _mm_mul_ps(cplxValue2, invScalar); - // Arrange in i1i2i3i4 format - iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); - // Arrange in q1q2q3q4 format - qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1)); + // Arrange in i1i2i3i4 format + iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2, 0, 2, 0)); + // Arrange in q1q2q3q4 format + qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3, 1, 3, 1)); - iValue = _mm_mul_ps(iValue, iValue); // Square the I values - qValue = _mm_mul_ps(qValue, qValue); // Square the Q Values + iValue = _mm_mul_ps(iValue, iValue); // Square the I values + qValue = _mm_mul_ps(qValue, qValue); // Square the Q Values - result = _mm_add_ps(iValue, qValue); // Add the I2 and Q2 values + result = _mm_add_ps(iValue, qValue); // Add the I2 and Q2 values - result = _mm_sqrt_ps(result); // Square root the values + result = _mm_sqrt_ps(result); // Square root the values - result = _mm_mul_ps(result, vScalar); // Scale the results + result = _mm_mul_ps(result, vScalar); // Scale the results - _mm_store_ps(outputFloatBuffer, result); - *magnitudeVectorPtr++ = (int16_t)rintf(outputFloatBuffer[0]); - *magnitudeVectorPtr++ = (int16_t)rintf(outputFloatBuffer[1]); - *magnitudeVectorPtr++ = (int16_t)rintf(outputFloatBuffer[2]); - *magnitudeVectorPtr++ = (int16_t)rintf(outputFloatBuffer[3]); - } + _mm_store_ps(outputFloatBuffer, result); + *magnitudeVectorPtr++ = (int16_t)rintf(outputFloatBuffer[0]); + *magnitudeVectorPtr++ = (int16_t)rintf(outputFloatBuffer[1]); + *magnitudeVectorPtr++ = (int16_t)rintf(outputFloatBuffer[2]); + *magnitudeVectorPtr++ = (int16_t)rintf(outputFloatBuffer[3]); + } - number = quarterPoints * 4; - magnitudeVectorPtr = &magnitudeVector[number]; - complexVectorPtr = (const int16_t*)&complexVector[number]; - for(; number < num_points; number++){ - const float val1Real = (float)(*complexVectorPtr++) / SHRT_MAX; - const float val1Imag = (float)(*complexVectorPtr++) / SHRT_MAX; - const float val1Result = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)) * SHRT_MAX; - 
*magnitudeVectorPtr++ = (int16_t)rintf(val1Result); - } + number = quarterPoints * 4; + magnitudeVectorPtr = &magnitudeVector[number]; + complexVectorPtr = (const int16_t*)&complexVector[number]; + for (; number < num_points; number++) { + const float val1Real = (float)(*complexVectorPtr++) / SHRT_MAX; + const float val1Imag = (float)(*complexVectorPtr++) / SHRT_MAX; + const float val1Result = + sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)) * SHRT_MAX; + *magnitudeVectorPtr++ = (int16_t)rintf(val1Result); + } } #endif /* LV_HAVE_SSE */ #ifdef LV_HAVE_GENERIC -static inline void -volk_16ic_magnitude_16i_generic(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points) +static inline void volk_16ic_magnitude_16i_generic(int16_t* magnitudeVector, + const lv_16sc_t* complexVector, + unsigned int num_points) { - const int16_t* complexVectorPtr = (const int16_t*)complexVector; - int16_t* magnitudeVectorPtr = magnitudeVector; - unsigned int number = 0; - const float scalar = SHRT_MAX; - for(number = 0; number < num_points; number++){ - float real = ((float)(*complexVectorPtr++)) / scalar; - float imag = ((float)(*complexVectorPtr++)) / scalar; - *magnitudeVectorPtr++ = (int16_t)rintf(sqrtf((real*real) + (imag*imag)) * scalar); - } + const int16_t* complexVectorPtr = (const int16_t*)complexVector; + int16_t* magnitudeVectorPtr = magnitudeVector; + unsigned int number = 0; + const float scalar = SHRT_MAX; + for (number = 0; number < num_points; number++) { + float real = ((float)(*complexVectorPtr++)) / scalar; + float imag = ((float)(*complexVectorPtr++)) / scalar; + *magnitudeVectorPtr++ = + (int16_t)rintf(sqrtf((real * real) + (imag * imag)) * scalar); + } } #endif /* LV_HAVE_GENERIC */ #ifdef LV_HAVE_ORC_DISABLED -extern void -volk_16ic_magnitude_16i_a_orc_impl(int16_t* magnitudeVector, const lv_16sc_t* complexVector, float scalar, unsigned int num_points); - -static inline void -volk_16ic_magnitude_16i_u_orc(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points) +extern void volk_16ic_magnitude_16i_a_orc_impl(int16_t* magnitudeVector, + const lv_16sc_t* complexVector, + float scalar, + unsigned int num_points); + +static inline void volk_16ic_magnitude_16i_u_orc(int16_t* magnitudeVector, + const lv_16sc_t* complexVector, + unsigned int num_points) { - volk_16ic_magnitude_16i_a_orc_impl(magnitudeVector, complexVector, SHRT_MAX, num_points); + volk_16ic_magnitude_16i_a_orc_impl( + magnitudeVector, complexVector, SHRT_MAX, num_points); } #endif /* LV_HAVE_ORC */ @@ -300,71 +313,74 @@ volk_16ic_magnitude_16i_u_orc(int16_t* magnitudeVector, const lv_16sc_t* complex #ifndef INCLUDED_volk_16ic_magnitude_16i_u_H #define INCLUDED_volk_16ic_magnitude_16i_u_H -#include #include -#include #include +#include +#include #ifdef LV_HAVE_AVX2 #include -static inline void -volk_16ic_magnitude_16i_u_avx2(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points) +static inline void volk_16ic_magnitude_16i_u_avx2(int16_t* magnitudeVector, + const lv_16sc_t* complexVector, + unsigned int num_points) { - unsigned int number = 0; - const unsigned int eighthPoints = num_points / 8; - - const int16_t* complexVectorPtr = (const int16_t*)complexVector; - int16_t* magnitudeVectorPtr = magnitudeVector; - - __m256 vScalar = _mm256_set1_ps(SHRT_MAX); - __m256 invScalar = _mm256_set1_ps(1.0f/SHRT_MAX); - __m256i int1, int2; - __m128i short1, short2; - __m256 cplxValue1, cplxValue2, result; - __m256i idx = _mm256_set_epi32(0,0,0,0,5,1,4,0); - - 
for(;number < eighthPoints; number++){ - - int1 = _mm256_loadu_si256((__m256i*)complexVectorPtr); - complexVectorPtr += 16; - short1 = _mm256_extracti128_si256(int1,0); - short2 = _mm256_extracti128_si256(int1,1); - - int1 = _mm256_cvtepi16_epi32(short1); - int2 = _mm256_cvtepi16_epi32(short2); - cplxValue1 = _mm256_cvtepi32_ps(int1); - cplxValue2 = _mm256_cvtepi32_ps(int2); - - cplxValue1 = _mm256_mul_ps(cplxValue1, invScalar); - cplxValue2 = _mm256_mul_ps(cplxValue2, invScalar); - - cplxValue1 = _mm256_mul_ps(cplxValue1, cplxValue1); // Square the values - cplxValue2 = _mm256_mul_ps(cplxValue2, cplxValue2); // Square the Values - - result = _mm256_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values - - result = _mm256_sqrt_ps(result); // Square root the values - - result = _mm256_mul_ps(result, vScalar); // Scale the results - - int1 = _mm256_cvtps_epi32(result); - int1 = _mm256_packs_epi32(int1, int1); - int1 = _mm256_permutevar8x32_epi32(int1, idx); //permute to compensate for shuffling in hadd and packs - short1 = _mm256_extracti128_si256(int1, 0); - _mm_storeu_si128((__m128i*)magnitudeVectorPtr,short1); - magnitudeVectorPtr += 8; - } - - number = eighthPoints * 8; - magnitudeVectorPtr = &magnitudeVector[number]; - complexVectorPtr = (const int16_t*)&complexVector[number]; - for(; number < num_points; number++){ - const float val1Real = (float)(*complexVectorPtr++) / SHRT_MAX; - const float val1Imag = (float)(*complexVectorPtr++) / SHRT_MAX; - const float val1Result = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)) * SHRT_MAX; - *magnitudeVectorPtr++ = (int16_t)rintf(val1Result); - } + unsigned int number = 0; + const unsigned int eighthPoints = num_points / 8; + + const int16_t* complexVectorPtr = (const int16_t*)complexVector; + int16_t* magnitudeVectorPtr = magnitudeVector; + + __m256 vScalar = _mm256_set1_ps(SHRT_MAX); + __m256 invScalar = _mm256_set1_ps(1.0f / SHRT_MAX); + __m256i int1, int2; + __m128i short1, short2; + __m256 cplxValue1, cplxValue2, result; + __m256i idx = _mm256_set_epi32(0, 0, 0, 0, 5, 1, 4, 0); + + for (; number < eighthPoints; number++) { + + int1 = _mm256_loadu_si256((__m256i*)complexVectorPtr); + complexVectorPtr += 16; + short1 = _mm256_extracti128_si256(int1, 0); + short2 = _mm256_extracti128_si256(int1, 1); + + int1 = _mm256_cvtepi16_epi32(short1); + int2 = _mm256_cvtepi16_epi32(short2); + cplxValue1 = _mm256_cvtepi32_ps(int1); + cplxValue2 = _mm256_cvtepi32_ps(int2); + + cplxValue1 = _mm256_mul_ps(cplxValue1, invScalar); + cplxValue2 = _mm256_mul_ps(cplxValue2, invScalar); + + cplxValue1 = _mm256_mul_ps(cplxValue1, cplxValue1); // Square the values + cplxValue2 = _mm256_mul_ps(cplxValue2, cplxValue2); // Square the Values + + result = _mm256_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values + + result = _mm256_sqrt_ps(result); // Square root the values + + result = _mm256_mul_ps(result, vScalar); // Scale the results + + int1 = _mm256_cvtps_epi32(result); + int1 = _mm256_packs_epi32(int1, int1); + int1 = _mm256_permutevar8x32_epi32( + int1, idx); // permute to compensate for shuffling in hadd and packs + short1 = _mm256_extracti128_si256(int1, 0); + _mm_storeu_si128((__m128i*)magnitudeVectorPtr, short1); + magnitudeVectorPtr += 8; + } + + number = eighthPoints * 8; + magnitudeVectorPtr = &magnitudeVector[number]; + complexVectorPtr = (const int16_t*)&complexVector[number]; + for (; number < num_points; number++) { + const float val1Real = (float)(*complexVectorPtr++) / SHRT_MAX; + const float val1Imag = 
(float)(*complexVectorPtr++) / SHRT_MAX; + const float val1Result = + sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)) * SHRT_MAX; + *magnitudeVectorPtr++ = (int16_t)rintf(val1Result); + } } #endif /* LV_HAVE_AVX2 */ @@ -372,24 +388,25 @@ volk_16ic_magnitude_16i_u_avx2(int16_t* magnitudeVector, const lv_16sc_t* comple #include #include -static inline void -volk_16ic_magnitude_16i_neonv7(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points) +static inline void volk_16ic_magnitude_16i_neonv7(int16_t* magnitudeVector, + const lv_16sc_t* complexVector, + unsigned int num_points) { unsigned int number = 0; unsigned int quarter_points = num_points / 4; - + const float scalar = SHRT_MAX; const float inv_scalar = 1.0f / scalar; - + int16_t* magnitudeVectorPtr = magnitudeVector; const lv_16sc_t* complexVectorPtr = complexVector; - + float32x4_t mag_vec; float32x4x2_t c_vec; - - for(number = 0; number < quarter_points; number++) { + + for (number = 0; number < quarter_points; number++) { const int16x4x2_t c16_vec = vld2_s16((int16_t*)complexVectorPtr); - __VOLK_PREFETCH(complexVectorPtr+4); + __VOLK_PREFETCH(complexVectorPtr + 4); c_vec.val[0] = vcvtq_f32_s32(vmovl_s16(c16_vec.val[0])); c_vec.val[1] = vcvtq_f32_s32(vmovl_s16(c16_vec.val[1])); // Scale to close to 0-1 @@ -406,15 +423,16 @@ volk_16ic_magnitude_16i_neonv7(int16_t* magnitudeVector, const lv_16sc_t* comple const int16x4_t mag16_vec = vmovn_s32(vcvtq_s32_f32(mag_vec)); vst1_s16(magnitudeVectorPtr, mag16_vec); // Advance pointers - magnitudeVectorPtr+=4; - complexVectorPtr+=4; + magnitudeVectorPtr += 4; + complexVectorPtr += 4; } - + // Deal with the rest - for(number = quarter_points * 4; number < num_points; number++) { + for (number = quarter_points * 4; number < num_points; number++) { const float real = lv_creal(*complexVectorPtr) * inv_scalar; const float imag = lv_cimag(*complexVectorPtr) * inv_scalar; - *magnitudeVectorPtr = (int16_t)rintf(sqrtf((real*real) + (imag*imag)) * scalar); + *magnitudeVectorPtr = + (int16_t)rintf(sqrtf((real * real) + (imag * imag)) * scalar); complexVectorPtr++; magnitudeVectorPtr++; } diff --git a/kernels/volk/volk_16ic_s32f_deinterleave_32f_x2.h b/kernels/volk/volk_16ic_s32f_deinterleave_32f_x2.h index 50d9341..7425ec6 100644 --- a/kernels/volk/volk_16ic_s32f_deinterleave_32f_x2.h +++ b/kernels/volk/volk_16ic_s32f_deinterleave_32f_x2.h @@ -30,8 +30,8 @@ * * Dispatcher Prototype * \code - * void volk_16ic_s32f_deinterleave_32f_x2(float* iBuffer, float* qBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ - * \endcode + * void volk_16ic_s32f_deinterleave_32f_x2(float* iBuffer, float* qBuffer, const + * lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ \endcode * * \b Inputs * \li complexVector: The complex input vector of 16-bit shorts. 
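 *
 * \b Example
 * A minimal usage sketch for this dispatcher, assuming VOLK's volk_malloc /
 * volk_get_alignment / volk_free helpers and made-up sample values (the buffer
 * names here are illustrative, not taken from this kernel):
 * \code
 *   unsigned int num_points = 4;
 *   size_t alignment = volk_get_alignment();
 *   lv_16sc_t* in = (lv_16sc_t*)volk_malloc(sizeof(lv_16sc_t) * num_points, alignment);
 *   float* i_out = (float*)volk_malloc(sizeof(float) * num_points, alignment);
 *   float* q_out = (float*)volk_malloc(sizeof(float) * num_points, alignment);
 *   for (unsigned int n = 0; n < num_points; n++) {
 *       in[n] = lv_cmake((int16_t)(100 * n), (int16_t)(-100 * n)); // I = 100n, Q = -100n
 *   }
 *   // scalar = 100.0f rescales the shorts, so i_out[n] == (float)n and q_out[n] == -(float)n
 *   volk_16ic_s32f_deinterleave_32f_x2(i_out, q_out, in, 100.0f, num_points);
 *   volk_free(in);
 *   volk_free(i_out);
 *   volk_free(q_out);
 * \endcode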
@@ -56,197 +56,214 @@ #ifndef INCLUDED_volk_16ic_s32f_deinterleave_32f_x2_a_H #define INCLUDED_volk_16ic_s32f_deinterleave_32f_x2_a_H -#include #include #include +#include #ifdef LV_HAVE_AVX2 #include -static inline -void volk_16ic_s32f_deinterleave_32f_x2_a_avx2(float* iBuffer, float* qBuffer, const lv_16sc_t* complexVector, - const float scalar, unsigned int num_points) +static inline void +volk_16ic_s32f_deinterleave_32f_x2_a_avx2(float* iBuffer, + float* qBuffer, + const lv_16sc_t* complexVector, + const float scalar, + unsigned int num_points) { - float* iBufferPtr = iBuffer; - float* qBufferPtr = qBuffer; - - uint64_t number = 0; - const uint64_t eighthPoints = num_points / 8; - __m256 cplxValue1, cplxValue2, iValue, qValue; - __m256i cplxValueA, cplxValueB; - __m128i cplxValue128; - - __m256 invScalar = _mm256_set1_ps(1.0/scalar); - int16_t* complexVectorPtr = (int16_t*)complexVector; - __m256i idx = _mm256_set_epi32(7,6,3,2,5,4,1,0); - - for(;number < eighthPoints; number++){ - - cplxValueA = _mm256_load_si256((__m256i*) complexVectorPtr); - complexVectorPtr += 16; - - //cvt - cplxValue128 = _mm256_extracti128_si256(cplxValueA, 0); - cplxValueB = _mm256_cvtepi16_epi32(cplxValue128); - cplxValue1 = _mm256_cvtepi32_ps(cplxValueB); - cplxValue128 = _mm256_extracti128_si256(cplxValueA, 1); - cplxValueB = _mm256_cvtepi16_epi32(cplxValue128); - cplxValue2 = _mm256_cvtepi32_ps(cplxValueB); - - cplxValue1 = _mm256_mul_ps(cplxValue1, invScalar); - cplxValue2 = _mm256_mul_ps(cplxValue2, invScalar); - - // Arrange in i1i2i3i4 format - iValue = _mm256_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); - iValue = _mm256_permutevar8x32_ps(iValue,idx); - // Arrange in q1q2q3q4 format - qValue = _mm256_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1)); - qValue = _mm256_permutevar8x32_ps(qValue,idx); - - _mm256_store_ps(iBufferPtr, iValue); - _mm256_store_ps(qBufferPtr, qValue); - - iBufferPtr += 8; - qBufferPtr += 8; - } - - number = eighthPoints * 8; - complexVectorPtr = (int16_t*)&complexVector[number]; - for(; number < num_points; number++){ - *iBufferPtr++ = (float)(*complexVectorPtr++) / scalar; - *qBufferPtr++ = (float)(*complexVectorPtr++) / scalar; - } + float* iBufferPtr = iBuffer; + float* qBufferPtr = qBuffer; + + uint64_t number = 0; + const uint64_t eighthPoints = num_points / 8; + __m256 cplxValue1, cplxValue2, iValue, qValue; + __m256i cplxValueA, cplxValueB; + __m128i cplxValue128; + + __m256 invScalar = _mm256_set1_ps(1.0 / scalar); + int16_t* complexVectorPtr = (int16_t*)complexVector; + __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0); + + for (; number < eighthPoints; number++) { + + cplxValueA = _mm256_load_si256((__m256i*)complexVectorPtr); + complexVectorPtr += 16; + + // cvt + cplxValue128 = _mm256_extracti128_si256(cplxValueA, 0); + cplxValueB = _mm256_cvtepi16_epi32(cplxValue128); + cplxValue1 = _mm256_cvtepi32_ps(cplxValueB); + cplxValue128 = _mm256_extracti128_si256(cplxValueA, 1); + cplxValueB = _mm256_cvtepi16_epi32(cplxValue128); + cplxValue2 = _mm256_cvtepi32_ps(cplxValueB); + + cplxValue1 = _mm256_mul_ps(cplxValue1, invScalar); + cplxValue2 = _mm256_mul_ps(cplxValue2, invScalar); + + // Arrange in i1i2i3i4 format + iValue = _mm256_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2, 0, 2, 0)); + iValue = _mm256_permutevar8x32_ps(iValue, idx); + // Arrange in q1q2q3q4 format + qValue = _mm256_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3, 1, 3, 1)); + qValue = _mm256_permutevar8x32_ps(qValue, idx); + + _mm256_store_ps(iBufferPtr, iValue); + 
_mm256_store_ps(qBufferPtr, qValue); + + iBufferPtr += 8; + qBufferPtr += 8; + } + + number = eighthPoints * 8; + complexVectorPtr = (int16_t*)&complexVector[number]; + for (; number < num_points; number++) { + *iBufferPtr++ = (float)(*complexVectorPtr++) / scalar; + *qBufferPtr++ = (float)(*complexVectorPtr++) / scalar; + } } #endif /* LV_HAVE_AVX2 */ #ifdef LV_HAVE_SSE #include -static inline -void volk_16ic_s32f_deinterleave_32f_x2_a_sse(float* iBuffer, float* qBuffer, const lv_16sc_t* complexVector, - const float scalar, unsigned int num_points) +static inline void +volk_16ic_s32f_deinterleave_32f_x2_a_sse(float* iBuffer, + float* qBuffer, + const lv_16sc_t* complexVector, + const float scalar, + unsigned int num_points) { - float* iBufferPtr = iBuffer; - float* qBufferPtr = qBuffer; + float* iBufferPtr = iBuffer; + float* qBufferPtr = qBuffer; - uint64_t number = 0; - const uint64_t quarterPoints = num_points / 4; - __m128 cplxValue1, cplxValue2, iValue, qValue; + uint64_t number = 0; + const uint64_t quarterPoints = num_points / 4; + __m128 cplxValue1, cplxValue2, iValue, qValue; - __m128 invScalar = _mm_set_ps1(1.0/scalar); - int16_t* complexVectorPtr = (int16_t*)complexVector; + __m128 invScalar = _mm_set_ps1(1.0 / scalar); + int16_t* complexVectorPtr = (int16_t*)complexVector; - __VOLK_ATTR_ALIGNED(16) float floatBuffer[8]; + __VOLK_ATTR_ALIGNED(16) float floatBuffer[8]; - for(;number < quarterPoints; number++){ + for (; number < quarterPoints; number++) { - floatBuffer[0] = (float)(complexVectorPtr[0]); - floatBuffer[1] = (float)(complexVectorPtr[1]); - floatBuffer[2] = (float)(complexVectorPtr[2]); - floatBuffer[3] = (float)(complexVectorPtr[3]); + floatBuffer[0] = (float)(complexVectorPtr[0]); + floatBuffer[1] = (float)(complexVectorPtr[1]); + floatBuffer[2] = (float)(complexVectorPtr[2]); + floatBuffer[3] = (float)(complexVectorPtr[3]); - floatBuffer[4] = (float)(complexVectorPtr[4]); - floatBuffer[5] = (float)(complexVectorPtr[5]); - floatBuffer[6] = (float)(complexVectorPtr[6]); - floatBuffer[7] = (float)(complexVectorPtr[7]); + floatBuffer[4] = (float)(complexVectorPtr[4]); + floatBuffer[5] = (float)(complexVectorPtr[5]); + floatBuffer[6] = (float)(complexVectorPtr[6]); + floatBuffer[7] = (float)(complexVectorPtr[7]); - cplxValue1 = _mm_load_ps(&floatBuffer[0]); - cplxValue2 = _mm_load_ps(&floatBuffer[4]); + cplxValue1 = _mm_load_ps(&floatBuffer[0]); + cplxValue2 = _mm_load_ps(&floatBuffer[4]); - complexVectorPtr += 8; + complexVectorPtr += 8; - cplxValue1 = _mm_mul_ps(cplxValue1, invScalar); - cplxValue2 = _mm_mul_ps(cplxValue2, invScalar); + cplxValue1 = _mm_mul_ps(cplxValue1, invScalar); + cplxValue2 = _mm_mul_ps(cplxValue2, invScalar); - // Arrange in i1i2i3i4 format - iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); - // Arrange in q1q2q3q4 format - qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1)); + // Arrange in i1i2i3i4 format + iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2, 0, 2, 0)); + // Arrange in q1q2q3q4 format + qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3, 1, 3, 1)); - _mm_store_ps(iBufferPtr, iValue); - _mm_store_ps(qBufferPtr, qValue); + _mm_store_ps(iBufferPtr, iValue); + _mm_store_ps(qBufferPtr, qValue); - iBufferPtr += 4; - qBufferPtr += 4; - } + iBufferPtr += 4; + qBufferPtr += 4; + } - number = quarterPoints * 4; - complexVectorPtr = (int16_t*)&complexVector[number]; - for(; number < num_points; number++){ - *iBufferPtr++ = (float)(*complexVectorPtr++) / scalar; - 
*qBufferPtr++ = (float)(*complexVectorPtr++) / scalar; - } + number = quarterPoints * 4; + complexVectorPtr = (int16_t*)&complexVector[number]; + for (; number < num_points; number++) { + *iBufferPtr++ = (float)(*complexVectorPtr++) / scalar; + *qBufferPtr++ = (float)(*complexVectorPtr++) / scalar; + } } #endif /* LV_HAVE_SSE */ #ifdef LV_HAVE_GENERIC static inline void -volk_16ic_s32f_deinterleave_32f_x2_generic(float* iBuffer, float* qBuffer, const lv_16sc_t* complexVector, - const float scalar, unsigned int num_points) +volk_16ic_s32f_deinterleave_32f_x2_generic(float* iBuffer, + float* qBuffer, + const lv_16sc_t* complexVector, + const float scalar, + unsigned int num_points) { - const int16_t* complexVectorPtr = (const int16_t*)complexVector; - float* iBufferPtr = iBuffer; - float* qBufferPtr = qBuffer; - unsigned int number; - for(number = 0; number < num_points; number++){ - *iBufferPtr++ = (float)(*complexVectorPtr++) / scalar; - *qBufferPtr++ = (float)(*complexVectorPtr++) / scalar; - } + const int16_t* complexVectorPtr = (const int16_t*)complexVector; + float* iBufferPtr = iBuffer; + float* qBufferPtr = qBuffer; + unsigned int number; + for (number = 0; number < num_points; number++) { + *iBufferPtr++ = (float)(*complexVectorPtr++) / scalar; + *qBufferPtr++ = (float)(*complexVectorPtr++) / scalar; + } } #endif /* LV_HAVE_GENERIC */ #ifdef LV_HAVE_NEON #include -static inline void -volk_16ic_s32f_deinterleave_32f_x2_neon(float* iBuffer, float* qBuffer, const lv_16sc_t* complexVector, - const float scalar, unsigned int num_points) +static inline void volk_16ic_s32f_deinterleave_32f_x2_neon(float* iBuffer, + float* qBuffer, + const lv_16sc_t* complexVector, + const float scalar, + unsigned int num_points) { - const int16_t* complexVectorPtr = (const int16_t*)complexVector; - float* iBufferPtr = iBuffer; - float* qBufferPtr = qBuffer; - unsigned int eighth_points = num_points / 4; - unsigned int number; - float iScalar = 1.f/scalar; - float32x4_t invScalar; - invScalar = vld1q_dup_f32(&iScalar); - - int16x4x2_t complexInput_s16; - int32x4x2_t complexInput_s32; - float32x4x2_t complexFloat; - - for(number = 0; number < eighth_points; number++){ - complexInput_s16 = vld2_s16(complexVectorPtr); - complexInput_s32.val[0] = vmovl_s16(complexInput_s16.val[0]); - complexInput_s32.val[1] = vmovl_s16(complexInput_s16.val[1]); - complexFloat.val[0] = vcvtq_f32_s32(complexInput_s32.val[0]); - complexFloat.val[1] = vcvtq_f32_s32(complexInput_s32.val[1]); - complexFloat.val[0] = vmulq_f32(complexFloat.val[0], invScalar); - complexFloat.val[1] = vmulq_f32(complexFloat.val[1], invScalar); - vst1q_f32(iBufferPtr, complexFloat.val[0]); - vst1q_f32(qBufferPtr, complexFloat.val[1]); - complexVectorPtr += 8; - iBufferPtr += 4; - qBufferPtr += 4; - } - - for(number = eighth_points*4; number < num_points; number++){ - *iBufferPtr++ = (float)(*complexVectorPtr++) / scalar; - *qBufferPtr++ = (float)(*complexVectorPtr++) / scalar; - } + const int16_t* complexVectorPtr = (const int16_t*)complexVector; + float* iBufferPtr = iBuffer; + float* qBufferPtr = qBuffer; + unsigned int eighth_points = num_points / 4; + unsigned int number; + float iScalar = 1.f / scalar; + float32x4_t invScalar; + invScalar = vld1q_dup_f32(&iScalar); + + int16x4x2_t complexInput_s16; + int32x4x2_t complexInput_s32; + float32x4x2_t complexFloat; + + for (number = 0; number < eighth_points; number++) { + complexInput_s16 = vld2_s16(complexVectorPtr); + complexInput_s32.val[0] = vmovl_s16(complexInput_s16.val[0]); + 
complexInput_s32.val[1] = vmovl_s16(complexInput_s16.val[1]); + complexFloat.val[0] = vcvtq_f32_s32(complexInput_s32.val[0]); + complexFloat.val[1] = vcvtq_f32_s32(complexInput_s32.val[1]); + complexFloat.val[0] = vmulq_f32(complexFloat.val[0], invScalar); + complexFloat.val[1] = vmulq_f32(complexFloat.val[1], invScalar); + vst1q_f32(iBufferPtr, complexFloat.val[0]); + vst1q_f32(qBufferPtr, complexFloat.val[1]); + complexVectorPtr += 8; + iBufferPtr += 4; + qBufferPtr += 4; + } + + for (number = eighth_points * 4; number < num_points; number++) { + *iBufferPtr++ = (float)(*complexVectorPtr++) / scalar; + *qBufferPtr++ = (float)(*complexVectorPtr++) / scalar; + } } #endif /* LV_HAVE_GENERIC */ #ifdef LV_HAVE_ORC -extern void -volk_16ic_s32f_deinterleave_32f_x2_a_orc_impl(float* iBuffer, float* qBuffer, const lv_16sc_t* complexVector, - const float scalar, unsigned int num_points); +extern void volk_16ic_s32f_deinterleave_32f_x2_a_orc_impl(float* iBuffer, + float* qBuffer, + const lv_16sc_t* complexVector, + const float scalar, + unsigned int num_points); static inline void -volk_16ic_s32f_deinterleave_32f_x2_u_orc(float* iBuffer, float* qBuffer, const lv_16sc_t* complexVector, - const float scalar, unsigned int num_points) +volk_16ic_s32f_deinterleave_32f_x2_u_orc(float* iBuffer, + float* qBuffer, + const lv_16sc_t* complexVector, + const float scalar, + unsigned int num_points) { - volk_16ic_s32f_deinterleave_32f_x2_a_orc_impl(iBuffer, qBuffer, complexVector, scalar, num_points); + volk_16ic_s32f_deinterleave_32f_x2_a_orc_impl( + iBuffer, qBuffer, complexVector, scalar, num_points); } #endif /* LV_HAVE_ORC */ @@ -257,66 +274,69 @@ volk_16ic_s32f_deinterleave_32f_x2_u_orc(float* iBuffer, float* qBuffer, const l #ifndef INCLUDED_volk_16ic_s32f_deinterleave_32f_x2_u_H #define INCLUDED_volk_16ic_s32f_deinterleave_32f_x2_u_H -#include #include #include +#include #ifdef LV_HAVE_AVX2 #include -static inline -void volk_16ic_s32f_deinterleave_32f_x2_u_avx2(float* iBuffer, float* qBuffer, const lv_16sc_t* complexVector, - const float scalar, unsigned int num_points) +static inline void +volk_16ic_s32f_deinterleave_32f_x2_u_avx2(float* iBuffer, + float* qBuffer, + const lv_16sc_t* complexVector, + const float scalar, + unsigned int num_points) { - float* iBufferPtr = iBuffer; - float* qBufferPtr = qBuffer; - - uint64_t number = 0; - const uint64_t eighthPoints = num_points / 8; - __m256 cplxValue1, cplxValue2, iValue, qValue; - __m256i cplxValueA, cplxValueB; - __m128i cplxValue128; - - __m256 invScalar = _mm256_set1_ps(1.0/scalar); - int16_t* complexVectorPtr = (int16_t*)complexVector; - __m256i idx = _mm256_set_epi32(7,6,3,2,5,4,1,0); - - for(;number < eighthPoints; number++){ - - cplxValueA = _mm256_loadu_si256((__m256i*) complexVectorPtr); - complexVectorPtr += 16; - - //cvt - cplxValue128 = _mm256_extracti128_si256(cplxValueA, 0); - cplxValueB = _mm256_cvtepi16_epi32(cplxValue128); - cplxValue1 = _mm256_cvtepi32_ps(cplxValueB); - cplxValue128 = _mm256_extracti128_si256(cplxValueA, 1); - cplxValueB = _mm256_cvtepi16_epi32(cplxValue128); - cplxValue2 = _mm256_cvtepi32_ps(cplxValueB); - - cplxValue1 = _mm256_mul_ps(cplxValue1, invScalar); - cplxValue2 = _mm256_mul_ps(cplxValue2, invScalar); - - // Arrange in i1i2i3i4 format - iValue = _mm256_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); - iValue = _mm256_permutevar8x32_ps(iValue,idx); - // Arrange in q1q2q3q4 format - qValue = _mm256_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1)); - qValue = 
_mm256_permutevar8x32_ps(qValue,idx); - - _mm256_storeu_ps(iBufferPtr, iValue); - _mm256_storeu_ps(qBufferPtr, qValue); - - iBufferPtr += 8; - qBufferPtr += 8; - } - - number = eighthPoints * 8; - complexVectorPtr = (int16_t*)&complexVector[number]; - for(; number < num_points; number++){ - *iBufferPtr++ = (float)(*complexVectorPtr++) / scalar; - *qBufferPtr++ = (float)(*complexVectorPtr++) / scalar; - } + float* iBufferPtr = iBuffer; + float* qBufferPtr = qBuffer; + + uint64_t number = 0; + const uint64_t eighthPoints = num_points / 8; + __m256 cplxValue1, cplxValue2, iValue, qValue; + __m256i cplxValueA, cplxValueB; + __m128i cplxValue128; + + __m256 invScalar = _mm256_set1_ps(1.0 / scalar); + int16_t* complexVectorPtr = (int16_t*)complexVector; + __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0); + + for (; number < eighthPoints; number++) { + + cplxValueA = _mm256_loadu_si256((__m256i*)complexVectorPtr); + complexVectorPtr += 16; + + // cvt + cplxValue128 = _mm256_extracti128_si256(cplxValueA, 0); + cplxValueB = _mm256_cvtepi16_epi32(cplxValue128); + cplxValue1 = _mm256_cvtepi32_ps(cplxValueB); + cplxValue128 = _mm256_extracti128_si256(cplxValueA, 1); + cplxValueB = _mm256_cvtepi16_epi32(cplxValue128); + cplxValue2 = _mm256_cvtepi32_ps(cplxValueB); + + cplxValue1 = _mm256_mul_ps(cplxValue1, invScalar); + cplxValue2 = _mm256_mul_ps(cplxValue2, invScalar); + + // Arrange in i1i2i3i4 format + iValue = _mm256_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2, 0, 2, 0)); + iValue = _mm256_permutevar8x32_ps(iValue, idx); + // Arrange in q1q2q3q4 format + qValue = _mm256_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3, 1, 3, 1)); + qValue = _mm256_permutevar8x32_ps(qValue, idx); + + _mm256_storeu_ps(iBufferPtr, iValue); + _mm256_storeu_ps(qBufferPtr, qValue); + + iBufferPtr += 8; + qBufferPtr += 8; + } + + number = eighthPoints * 8; + complexVectorPtr = (int16_t*)&complexVector[number]; + for (; number < num_points; number++) { + *iBufferPtr++ = (float)(*complexVectorPtr++) / scalar; + *qBufferPtr++ = (float)(*complexVectorPtr++) / scalar; + } } #endif /* LV_HAVE_AVX2 */ diff --git a/kernels/volk/volk_16ic_s32f_deinterleave_real_32f.h b/kernels/volk/volk_16ic_s32f_deinterleave_real_32f.h index 713e6a1..8b72d1c 100644 --- a/kernels/volk/volk_16ic_s32f_deinterleave_real_32f.h +++ b/kernels/volk/volk_16ic_s32f_deinterleave_real_32f.h @@ -31,8 +31,8 @@ * * Dispatcher Prototype * \code - * void volk_16ic_s32f_deinterleave_real_32f(float* iBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ - * \endcode + * void volk_16ic_s32f_deinterleave_real_32f(float* iBuffer, const lv_16sc_t* + * complexVector, const float scalar, unsigned int num_points){ \endcode * * \b Inputs * \li complexVector: The complex input vector of 16-bit shorts. 
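 *
 * \b Example
 * A minimal usage sketch, assuming VOLK's volk_malloc / volk_get_alignment /
 * volk_free helpers and made-up sample values (names are illustrative only):
 * \code
 *   unsigned int num_points = 4;
 *   size_t alignment = volk_get_alignment();
 *   lv_16sc_t* in = (lv_16sc_t*)volk_malloc(sizeof(lv_16sc_t) * num_points, alignment);
 *   float* real_out = (float*)volk_malloc(sizeof(float) * num_points, alignment);
 *   for (unsigned int n = 0; n < num_points; n++) {
 *       in[n] = lv_cmake((int16_t)(50 * n), (int16_t)1234); // imaginary part is discarded
 *   }
 *   // scalar = 50.0f, so real_out[n] == (float)n; the Q components never reach the output
 *   volk_16ic_s32f_deinterleave_real_32f(real_out, in, 50.0f, num_points);
 *   volk_free(in);
 *   volk_free(real_out);
 * \endcode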
@@ -56,55 +56,88 @@ #ifndef INCLUDED_volk_16ic_s32f_deinterleave_real_32f_a_H #define INCLUDED_volk_16ic_s32f_deinterleave_real_32f_a_H -#include #include #include +#include #ifdef LV_HAVE_AVX2 #include static inline void -volk_16ic_s32f_deinterleave_real_32f_a_avx2(float* iBuffer, const lv_16sc_t* complexVector, - const float scalar, unsigned int num_points) +volk_16ic_s32f_deinterleave_real_32f_a_avx2(float* iBuffer, + const lv_16sc_t* complexVector, + const float scalar, + unsigned int num_points) { - float* iBufferPtr = iBuffer; - - unsigned int number = 0; - const unsigned int eighthPoints = num_points / 8; - - __m256 iFloatValue; - - const float iScalar= 1.0 / scalar; - __m256 invScalar = _mm256_set1_ps(iScalar); - __m256i complexVal, iIntVal; - __m128i complexVal128; - int8_t* complexVectorPtr = (int8_t*)complexVector; - - __m256i moveMask = _mm256_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0); - - for(;number < eighthPoints; number++){ - complexVal = _mm256_load_si256((__m256i*)complexVectorPtr); complexVectorPtr += 32; - complexVal = _mm256_shuffle_epi8(complexVal, moveMask); - complexVal = _mm256_permute4x64_epi64(complexVal, 0xd8); - complexVal128 = _mm256_extracti128_si256(complexVal, 0); - - iIntVal = _mm256_cvtepi16_epi32(complexVal128); - iFloatValue = _mm256_cvtepi32_ps(iIntVal); - - iFloatValue = _mm256_mul_ps(iFloatValue, invScalar); - - _mm256_store_ps(iBufferPtr, iFloatValue); - - iBufferPtr += 8; - } - - number = eighthPoints * 8; - int16_t* sixteenTComplexVectorPtr = (int16_t*)&complexVector[number]; - for(; number < num_points; number++){ - *iBufferPtr++ = ((float)(*sixteenTComplexVectorPtr++)) * iScalar; - sixteenTComplexVectorPtr++; - } - + float* iBufferPtr = iBuffer; + + unsigned int number = 0; + const unsigned int eighthPoints = num_points / 8; + + __m256 iFloatValue; + + const float iScalar = 1.0 / scalar; + __m256 invScalar = _mm256_set1_ps(iScalar); + __m256i complexVal, iIntVal; + __m128i complexVal128; + int8_t* complexVectorPtr = (int8_t*)complexVector; + + __m256i moveMask = _mm256_set_epi8(0x80, + 0x80, + 0x80, + 0x80, + 0x80, + 0x80, + 0x80, + 0x80, + 13, + 12, + 9, + 8, + 5, + 4, + 1, + 0, + 0x80, + 0x80, + 0x80, + 0x80, + 0x80, + 0x80, + 0x80, + 0x80, + 13, + 12, + 9, + 8, + 5, + 4, + 1, + 0); + + for (; number < eighthPoints; number++) { + complexVal = _mm256_load_si256((__m256i*)complexVectorPtr); + complexVectorPtr += 32; + complexVal = _mm256_shuffle_epi8(complexVal, moveMask); + complexVal = _mm256_permute4x64_epi64(complexVal, 0xd8); + complexVal128 = _mm256_extracti128_si256(complexVal, 0); + + iIntVal = _mm256_cvtepi16_epi32(complexVal128); + iFloatValue = _mm256_cvtepi32_ps(iIntVal); + + iFloatValue = _mm256_mul_ps(iFloatValue, invScalar); + + _mm256_store_ps(iBufferPtr, iFloatValue); + + iBufferPtr += 8; + } + + number = eighthPoints * 8; + int16_t* sixteenTComplexVectorPtr = (int16_t*)&complexVector[number]; + for (; number < num_points; number++) { + *iBufferPtr++ = ((float)(*sixteenTComplexVectorPtr++)) * iScalar; + sixteenTComplexVectorPtr++; + } } #endif /* LV_HAVE_AVX2 */ @@ -112,44 +145,47 @@ volk_16ic_s32f_deinterleave_real_32f_a_avx2(float* iBuffer, const lv_16sc_t* com #include static inline void -volk_16ic_s32f_deinterleave_real_32f_a_sse4_1(float* iBuffer, const lv_16sc_t* complexVector, - const float scalar, unsigned int num_points) +volk_16ic_s32f_deinterleave_real_32f_a_sse4_1(float* iBuffer, + const lv_16sc_t* 
complexVector, + const float scalar, + unsigned int num_points) { - float* iBufferPtr = iBuffer; - - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; + float* iBufferPtr = iBuffer; - __m128 iFloatValue; + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; - const float iScalar= 1.0 / scalar; - __m128 invScalar = _mm_set_ps1(iScalar); - __m128i complexVal, iIntVal; - int8_t* complexVectorPtr = (int8_t*)complexVector; + __m128 iFloatValue; - __m128i moveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0); + const float iScalar = 1.0 / scalar; + __m128 invScalar = _mm_set_ps1(iScalar); + __m128i complexVal, iIntVal; + int8_t* complexVectorPtr = (int8_t*)complexVector; - for(;number < quarterPoints; number++){ - complexVal = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; - complexVal = _mm_shuffle_epi8(complexVal, moveMask); + __m128i moveMask = _mm_set_epi8( + 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0); - iIntVal = _mm_cvtepi16_epi32(complexVal); - iFloatValue = _mm_cvtepi32_ps(iIntVal); + for (; number < quarterPoints; number++) { + complexVal = _mm_load_si128((__m128i*)complexVectorPtr); + complexVectorPtr += 16; + complexVal = _mm_shuffle_epi8(complexVal, moveMask); - iFloatValue = _mm_mul_ps(iFloatValue, invScalar); + iIntVal = _mm_cvtepi16_epi32(complexVal); + iFloatValue = _mm_cvtepi32_ps(iIntVal); - _mm_store_ps(iBufferPtr, iFloatValue); + iFloatValue = _mm_mul_ps(iFloatValue, invScalar); - iBufferPtr += 4; - } + _mm_store_ps(iBufferPtr, iFloatValue); - number = quarterPoints * 4; - int16_t* sixteenTComplexVectorPtr = (int16_t*)&complexVector[number]; - for(; number < num_points; number++){ - *iBufferPtr++ = ((float)(*sixteenTComplexVectorPtr++)) * iScalar; - sixteenTComplexVectorPtr++; - } + iBufferPtr += 4; + } + number = quarterPoints * 4; + int16_t* sixteenTComplexVectorPtr = (int16_t*)&complexVector[number]; + for (; number < num_points; number++) { + *iBufferPtr++ = ((float)(*sixteenTComplexVectorPtr++)) * iScalar; + sixteenTComplexVectorPtr++; + } } #endif /* LV_HAVE_SSE4_1 */ @@ -157,59 +193,66 @@ volk_16ic_s32f_deinterleave_real_32f_a_sse4_1(float* iBuffer, const lv_16sc_t* c #include static inline void -volk_16ic_s32f_deinterleave_real_32f_a_sse(float* iBuffer, const lv_16sc_t* complexVector, - const float scalar, unsigned int num_points) +volk_16ic_s32f_deinterleave_real_32f_a_sse(float* iBuffer, + const lv_16sc_t* complexVector, + const float scalar, + unsigned int num_points) { - float* iBufferPtr = iBuffer; + float* iBufferPtr = iBuffer; - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - __m128 iValue; + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + __m128 iValue; - const float iScalar = 1.0/scalar; - __m128 invScalar = _mm_set_ps1(iScalar); - int16_t* complexVectorPtr = (int16_t*)complexVector; + const float iScalar = 1.0 / scalar; + __m128 invScalar = _mm_set_ps1(iScalar); + int16_t* complexVectorPtr = (int16_t*)complexVector; - __VOLK_ATTR_ALIGNED(16) float floatBuffer[4]; + __VOLK_ATTR_ALIGNED(16) float floatBuffer[4]; - for(;number < quarterPoints; number++){ - floatBuffer[0] = (float)(*complexVectorPtr); complexVectorPtr += 2; - floatBuffer[1] = (float)(*complexVectorPtr); complexVectorPtr += 2; - floatBuffer[2] = (float)(*complexVectorPtr); complexVectorPtr += 2; - floatBuffer[3] = (float)(*complexVectorPtr); complexVectorPtr += 2; + for (; number < 
quarterPoints; number++) { + floatBuffer[0] = (float)(*complexVectorPtr); + complexVectorPtr += 2; + floatBuffer[1] = (float)(*complexVectorPtr); + complexVectorPtr += 2; + floatBuffer[2] = (float)(*complexVectorPtr); + complexVectorPtr += 2; + floatBuffer[3] = (float)(*complexVectorPtr); + complexVectorPtr += 2; - iValue = _mm_load_ps(floatBuffer); + iValue = _mm_load_ps(floatBuffer); - iValue = _mm_mul_ps(iValue, invScalar); + iValue = _mm_mul_ps(iValue, invScalar); - _mm_store_ps(iBufferPtr, iValue); + _mm_store_ps(iBufferPtr, iValue); - iBufferPtr += 4; - } - - number = quarterPoints * 4; - complexVectorPtr = (int16_t*)&complexVector[number]; - for(; number < num_points; number++){ - *iBufferPtr++ = ((float)(*complexVectorPtr++)) * iScalar; - complexVectorPtr++; - } + iBufferPtr += 4; + } + number = quarterPoints * 4; + complexVectorPtr = (int16_t*)&complexVector[number]; + for (; number < num_points; number++) { + *iBufferPtr++ = ((float)(*complexVectorPtr++)) * iScalar; + complexVectorPtr++; + } } #endif /* LV_HAVE_SSE */ #ifdef LV_HAVE_GENERIC static inline void -volk_16ic_s32f_deinterleave_real_32f_generic(float* iBuffer, const lv_16sc_t* complexVector, - const float scalar, unsigned int num_points) +volk_16ic_s32f_deinterleave_real_32f_generic(float* iBuffer, + const lv_16sc_t* complexVector, + const float scalar, + unsigned int num_points) { - unsigned int number = 0; - const int16_t* complexVectorPtr = (const int16_t*)complexVector; - float* iBufferPtr = iBuffer; - const float invScalar = 1.0 / scalar; - for(number = 0; number < num_points; number++){ - *iBufferPtr++ = ((float)(*complexVectorPtr++)) * invScalar; - complexVectorPtr++; - } + unsigned int number = 0; + const int16_t* complexVectorPtr = (const int16_t*)complexVector; + float* iBufferPtr = iBuffer; + const float invScalar = 1.0 / scalar; + for (number = 0; number < num_points; number++) { + *iBufferPtr++ = ((float)(*complexVectorPtr++)) * invScalar; + complexVectorPtr++; + } } #endif /* LV_HAVE_GENERIC */ @@ -219,55 +262,88 @@ volk_16ic_s32f_deinterleave_real_32f_generic(float* iBuffer, const lv_16sc_t* co #ifndef INCLUDED_volk_16ic_s32f_deinterleave_real_32f_u_H #define INCLUDED_volk_16ic_s32f_deinterleave_real_32f_u_H -#include #include #include +#include #ifdef LV_HAVE_AVX2 #include static inline void -volk_16ic_s32f_deinterleave_real_32f_u_avx2(float* iBuffer, const lv_16sc_t* complexVector, - const float scalar, unsigned int num_points) +volk_16ic_s32f_deinterleave_real_32f_u_avx2(float* iBuffer, + const lv_16sc_t* complexVector, + const float scalar, + unsigned int num_points) { - float* iBufferPtr = iBuffer; - - unsigned int number = 0; - const unsigned int eighthPoints = num_points / 8; - - __m256 iFloatValue; - - const float iScalar= 1.0 / scalar; - __m256 invScalar = _mm256_set1_ps(iScalar); - __m256i complexVal, iIntVal; - __m128i complexVal128; - int8_t* complexVectorPtr = (int8_t*)complexVector; - - __m256i moveMask = _mm256_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0); - - for(;number < eighthPoints; number++){ - complexVal = _mm256_loadu_si256((__m256i*)complexVectorPtr); complexVectorPtr += 32; - complexVal = _mm256_shuffle_epi8(complexVal, moveMask); - complexVal = _mm256_permute4x64_epi64(complexVal, 0xd8); - complexVal128 = _mm256_extracti128_si256(complexVal, 0); - - iIntVal = _mm256_cvtepi16_epi32(complexVal128); - iFloatValue = _mm256_cvtepi32_ps(iIntVal); - - iFloatValue = 
_mm256_mul_ps(iFloatValue, invScalar); - - _mm256_storeu_ps(iBufferPtr, iFloatValue); - - iBufferPtr += 8; - } - - number = eighthPoints * 8; - int16_t* sixteenTComplexVectorPtr = (int16_t*)&complexVector[number]; - for(; number < num_points; number++){ - *iBufferPtr++ = ((float)(*sixteenTComplexVectorPtr++)) * iScalar; - sixteenTComplexVectorPtr++; - } - + float* iBufferPtr = iBuffer; + + unsigned int number = 0; + const unsigned int eighthPoints = num_points / 8; + + __m256 iFloatValue; + + const float iScalar = 1.0 / scalar; + __m256 invScalar = _mm256_set1_ps(iScalar); + __m256i complexVal, iIntVal; + __m128i complexVal128; + int8_t* complexVectorPtr = (int8_t*)complexVector; + + __m256i moveMask = _mm256_set_epi8(0x80, + 0x80, + 0x80, + 0x80, + 0x80, + 0x80, + 0x80, + 0x80, + 13, + 12, + 9, + 8, + 5, + 4, + 1, + 0, + 0x80, + 0x80, + 0x80, + 0x80, + 0x80, + 0x80, + 0x80, + 0x80, + 13, + 12, + 9, + 8, + 5, + 4, + 1, + 0); + + for (; number < eighthPoints; number++) { + complexVal = _mm256_loadu_si256((__m256i*)complexVectorPtr); + complexVectorPtr += 32; + complexVal = _mm256_shuffle_epi8(complexVal, moveMask); + complexVal = _mm256_permute4x64_epi64(complexVal, 0xd8); + complexVal128 = _mm256_extracti128_si256(complexVal, 0); + + iIntVal = _mm256_cvtepi16_epi32(complexVal128); + iFloatValue = _mm256_cvtepi32_ps(iIntVal); + + iFloatValue = _mm256_mul_ps(iFloatValue, invScalar); + + _mm256_storeu_ps(iBufferPtr, iFloatValue); + + iBufferPtr += 8; + } + + number = eighthPoints * 8; + int16_t* sixteenTComplexVectorPtr = (int16_t*)&complexVector[number]; + for (; number < num_points; number++) { + *iBufferPtr++ = ((float)(*sixteenTComplexVectorPtr++)) * iScalar; + sixteenTComplexVectorPtr++; + } } #endif /* LV_HAVE_AVX2 */ diff --git a/kernels/volk/volk_16ic_s32f_magnitude_32f.h b/kernels/volk/volk_16ic_s32f_magnitude_32f.h index bb0459c..c3e3605 100644 --- a/kernels/volk/volk_16ic_s32f_magnitude_32f.h +++ b/kernels/volk/volk_16ic_s32f_magnitude_32f.h @@ -30,8 +30,8 @@ * * Dispatcher Prototype * \code - * void volk_16ic_s32f_magnitude_32f(float* magnitudeVector, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points) - * \endcode + * void volk_16ic_s32f_magnitude_32f(float* magnitudeVector, const lv_16sc_t* + * complexVector, const float scalar, unsigned int num_points) \endcode * * \b Inputs * \li complexVector: The complex input vector of complex 16-bit shorts. 
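 *
 * \b Example
 * A minimal usage sketch, assuming VOLK's volk_malloc / volk_get_alignment /
 * volk_free helpers; the 3-4-5 triangle inputs are made up so the output is
 * easy to check by hand:
 * \code
 *   unsigned int num_points = 4;
 *   size_t alignment = volk_get_alignment();
 *   lv_16sc_t* in = (lv_16sc_t*)volk_malloc(sizeof(lv_16sc_t) * num_points, alignment);
 *   float* mag = (float*)volk_malloc(sizeof(float) * num_points, alignment);
 *   for (unsigned int n = 0; n < num_points; n++) {
 *       in[n] = lv_cmake((int16_t)(3 * (n + 1)), (int16_t)(4 * (n + 1)));
 *   }
 *   // scalar = 1.0f keeps integer units, so mag[n] == 5.0f * (n + 1)
 *   volk_16ic_s32f_magnitude_32f(mag, in, 1.0f, num_points);
 *   volk_free(in);
 *   volk_free(mag);
 * \endcode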
@@ -55,67 +55,68 @@ #ifndef INCLUDED_volk_16ic_s32f_magnitude_32f_a_H #define INCLUDED_volk_16ic_s32f_magnitude_32f_a_H -#include #include -#include #include +#include +#include #ifdef LV_HAVE_AVX2 #include -static inline void -volk_16ic_s32f_magnitude_32f_a_avx2(float* magnitudeVector, const lv_16sc_t* complexVector, - const float scalar, unsigned int num_points) +static inline void volk_16ic_s32f_magnitude_32f_a_avx2(float* magnitudeVector, + const lv_16sc_t* complexVector, + const float scalar, + unsigned int num_points) { - unsigned int number = 0; - const unsigned int eighthPoints = num_points / 8; + unsigned int number = 0; + const unsigned int eighthPoints = num_points / 8; + + const int16_t* complexVectorPtr = (const int16_t*)complexVector; + float* magnitudeVectorPtr = magnitudeVector; - const int16_t* complexVectorPtr = (const int16_t*)complexVector; - float* magnitudeVectorPtr = magnitudeVector; + __m256 invScalar = _mm256_set1_ps(1.0 / scalar); - __m256 invScalar = _mm256_set1_ps(1.0/scalar); + __m256 cplxValue1, cplxValue2, result; + __m256i int1, int2; + __m128i short1, short2; + __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0); - __m256 cplxValue1, cplxValue2, result; - __m256i int1, int2; - __m128i short1, short2; - __m256i idx = _mm256_set_epi32(7,6,3,2,5,4,1,0); + for (; number < eighthPoints; number++) { - for(;number < eighthPoints; number++){ - - int1 = _mm256_loadu_si256((__m256i*)complexVectorPtr); - complexVectorPtr += 16; - short1 = _mm256_extracti128_si256(int1,0); - short2 = _mm256_extracti128_si256(int1,1); + int1 = _mm256_loadu_si256((__m256i*)complexVectorPtr); + complexVectorPtr += 16; + short1 = _mm256_extracti128_si256(int1, 0); + short2 = _mm256_extracti128_si256(int1, 1); - int1 = _mm256_cvtepi16_epi32(short1); - int2 = _mm256_cvtepi16_epi32(short2); - cplxValue1 = _mm256_cvtepi32_ps(int1); - cplxValue2 = _mm256_cvtepi32_ps(int2); + int1 = _mm256_cvtepi16_epi32(short1); + int2 = _mm256_cvtepi16_epi32(short2); + cplxValue1 = _mm256_cvtepi32_ps(int1); + cplxValue2 = _mm256_cvtepi32_ps(int2); - cplxValue1 = _mm256_mul_ps(cplxValue1, invScalar); - cplxValue2 = _mm256_mul_ps(cplxValue2, invScalar); + cplxValue1 = _mm256_mul_ps(cplxValue1, invScalar); + cplxValue2 = _mm256_mul_ps(cplxValue2, invScalar); - cplxValue1 = _mm256_mul_ps(cplxValue1, cplxValue1); // Square the values - cplxValue2 = _mm256_mul_ps(cplxValue2, cplxValue2); // Square the Values + cplxValue1 = _mm256_mul_ps(cplxValue1, cplxValue1); // Square the values + cplxValue2 = _mm256_mul_ps(cplxValue2, cplxValue2); // Square the Values - result = _mm256_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values - result = _mm256_permutevar8x32_ps(result, idx); + result = _mm256_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values + result = _mm256_permutevar8x32_ps(result, idx); - result = _mm256_sqrt_ps(result); // Square root the values + result = _mm256_sqrt_ps(result); // Square root the values - _mm256_store_ps(magnitudeVectorPtr, result); + _mm256_store_ps(magnitudeVectorPtr, result); - magnitudeVectorPtr += 8; - } + magnitudeVectorPtr += 8; + } - number = eighthPoints * 8; - magnitudeVectorPtr = &magnitudeVector[number]; - complexVectorPtr = (const int16_t*)&complexVector[number]; - for(; number < num_points; number++){ - float val1Real = (float)(*complexVectorPtr++) / scalar; - float val1Imag = (float)(*complexVectorPtr++) / scalar; - *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)); - } + number = eighthPoints * 8; + magnitudeVectorPtr = 
&magnitudeVector[number]; + complexVectorPtr = (const int16_t*)&complexVector[number]; + for (; number < num_points; number++) { + float val1Real = (float)(*complexVectorPtr++) / scalar; + float val1Imag = (float)(*complexVectorPtr++) / scalar; + *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)); + } } #endif /* LV_HAVE_AVX2 */ @@ -123,127 +124,129 @@ volk_16ic_s32f_magnitude_32f_a_avx2(float* magnitudeVector, const lv_16sc_t* com #ifdef LV_HAVE_SSE3 #include -static inline void -volk_16ic_s32f_magnitude_32f_a_sse3(float* magnitudeVector, const lv_16sc_t* complexVector, - const float scalar, unsigned int num_points) +static inline void volk_16ic_s32f_magnitude_32f_a_sse3(float* magnitudeVector, + const lv_16sc_t* complexVector, + const float scalar, + unsigned int num_points) { - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; - const int16_t* complexVectorPtr = (const int16_t*)complexVector; - float* magnitudeVectorPtr = magnitudeVector; + const int16_t* complexVectorPtr = (const int16_t*)complexVector; + float* magnitudeVectorPtr = magnitudeVector; - __m128 invScalar = _mm_set_ps1(1.0/scalar); + __m128 invScalar = _mm_set_ps1(1.0 / scalar); - __m128 cplxValue1, cplxValue2, result; + __m128 cplxValue1, cplxValue2, result; - __VOLK_ATTR_ALIGNED(16) float inputFloatBuffer[8]; + __VOLK_ATTR_ALIGNED(16) float inputFloatBuffer[8]; - for(;number < quarterPoints; number++){ + for (; number < quarterPoints; number++) { - inputFloatBuffer[0] = (float)(complexVectorPtr[0]); - inputFloatBuffer[1] = (float)(complexVectorPtr[1]); - inputFloatBuffer[2] = (float)(complexVectorPtr[2]); - inputFloatBuffer[3] = (float)(complexVectorPtr[3]); + inputFloatBuffer[0] = (float)(complexVectorPtr[0]); + inputFloatBuffer[1] = (float)(complexVectorPtr[1]); + inputFloatBuffer[2] = (float)(complexVectorPtr[2]); + inputFloatBuffer[3] = (float)(complexVectorPtr[3]); - inputFloatBuffer[4] = (float)(complexVectorPtr[4]); - inputFloatBuffer[5] = (float)(complexVectorPtr[5]); - inputFloatBuffer[6] = (float)(complexVectorPtr[6]); - inputFloatBuffer[7] = (float)(complexVectorPtr[7]); + inputFloatBuffer[4] = (float)(complexVectorPtr[4]); + inputFloatBuffer[5] = (float)(complexVectorPtr[5]); + inputFloatBuffer[6] = (float)(complexVectorPtr[6]); + inputFloatBuffer[7] = (float)(complexVectorPtr[7]); - cplxValue1 = _mm_load_ps(&inputFloatBuffer[0]); - cplxValue2 = _mm_load_ps(&inputFloatBuffer[4]); + cplxValue1 = _mm_load_ps(&inputFloatBuffer[0]); + cplxValue2 = _mm_load_ps(&inputFloatBuffer[4]); - complexVectorPtr += 8; + complexVectorPtr += 8; - cplxValue1 = _mm_mul_ps(cplxValue1, invScalar); - cplxValue2 = _mm_mul_ps(cplxValue2, invScalar); + cplxValue1 = _mm_mul_ps(cplxValue1, invScalar); + cplxValue2 = _mm_mul_ps(cplxValue2, invScalar); - cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values - cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values + cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values + cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values - result = _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values + result = _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values - result = _mm_sqrt_ps(result); // Square root the values + result = _mm_sqrt_ps(result); // Square root the values - _mm_store_ps(magnitudeVectorPtr, result); + _mm_store_ps(magnitudeVectorPtr, result); - magnitudeVectorPtr += 
4; - } + magnitudeVectorPtr += 4; + } - number = quarterPoints * 4; - magnitudeVectorPtr = &magnitudeVector[number]; - complexVectorPtr = (const int16_t*)&complexVector[number]; - for(; number < num_points; number++){ - float val1Real = (float)(*complexVectorPtr++) / scalar; - float val1Imag = (float)(*complexVectorPtr++) / scalar; - *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)); - } + number = quarterPoints * 4; + magnitudeVectorPtr = &magnitudeVector[number]; + complexVectorPtr = (const int16_t*)&complexVector[number]; + for (; number < num_points; number++) { + float val1Real = (float)(*complexVectorPtr++) / scalar; + float val1Imag = (float)(*complexVectorPtr++) / scalar; + *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)); + } } #endif /* LV_HAVE_SSE3 */ #ifdef LV_HAVE_SSE #include -static inline void -volk_16ic_s32f_magnitude_32f_a_sse(float* magnitudeVector, const lv_16sc_t* complexVector, - const float scalar, unsigned int num_points) +static inline void volk_16ic_s32f_magnitude_32f_a_sse(float* magnitudeVector, + const lv_16sc_t* complexVector, + const float scalar, + unsigned int num_points) { - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; - const int16_t* complexVectorPtr = (const int16_t*)complexVector; - float* magnitudeVectorPtr = magnitudeVector; + const int16_t* complexVectorPtr = (const int16_t*)complexVector; + float* magnitudeVectorPtr = magnitudeVector; - const float iScalar = 1.0 / scalar; - __m128 invScalar = _mm_set_ps1(iScalar); + const float iScalar = 1.0 / scalar; + __m128 invScalar = _mm_set_ps1(iScalar); - __m128 cplxValue1, cplxValue2, result, re, im; + __m128 cplxValue1, cplxValue2, result, re, im; - __VOLK_ATTR_ALIGNED(16) float inputFloatBuffer[8]; + __VOLK_ATTR_ALIGNED(16) float inputFloatBuffer[8]; - for(;number < quarterPoints; number++){ - inputFloatBuffer[0] = (float)(complexVectorPtr[0]); - inputFloatBuffer[1] = (float)(complexVectorPtr[1]); - inputFloatBuffer[2] = (float)(complexVectorPtr[2]); - inputFloatBuffer[3] = (float)(complexVectorPtr[3]); + for (; number < quarterPoints; number++) { + inputFloatBuffer[0] = (float)(complexVectorPtr[0]); + inputFloatBuffer[1] = (float)(complexVectorPtr[1]); + inputFloatBuffer[2] = (float)(complexVectorPtr[2]); + inputFloatBuffer[3] = (float)(complexVectorPtr[3]); - inputFloatBuffer[4] = (float)(complexVectorPtr[4]); - inputFloatBuffer[5] = (float)(complexVectorPtr[5]); - inputFloatBuffer[6] = (float)(complexVectorPtr[6]); - inputFloatBuffer[7] = (float)(complexVectorPtr[7]); + inputFloatBuffer[4] = (float)(complexVectorPtr[4]); + inputFloatBuffer[5] = (float)(complexVectorPtr[5]); + inputFloatBuffer[6] = (float)(complexVectorPtr[6]); + inputFloatBuffer[7] = (float)(complexVectorPtr[7]); - cplxValue1 = _mm_load_ps(&inputFloatBuffer[0]); - cplxValue2 = _mm_load_ps(&inputFloatBuffer[4]); + cplxValue1 = _mm_load_ps(&inputFloatBuffer[0]); + cplxValue2 = _mm_load_ps(&inputFloatBuffer[4]); - re = _mm_shuffle_ps(cplxValue1, cplxValue2, 0x88); - im = _mm_shuffle_ps(cplxValue1, cplxValue2, 0xdd); + re = _mm_shuffle_ps(cplxValue1, cplxValue2, 0x88); + im = _mm_shuffle_ps(cplxValue1, cplxValue2, 0xdd); - complexVectorPtr += 8; + complexVectorPtr += 8; - cplxValue1 = _mm_mul_ps(re, invScalar); - cplxValue2 = _mm_mul_ps(im, invScalar); + cplxValue1 = _mm_mul_ps(re, invScalar); + cplxValue2 = _mm_mul_ps(im, invScalar); - cplxValue1 = _mm_mul_ps(cplxValue1, 
cplxValue1); // Square the values - cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values + cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values + cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values - result = _mm_add_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values + result = _mm_add_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values - result = _mm_sqrt_ps(result); // Square root the values + result = _mm_sqrt_ps(result); // Square root the values - _mm_store_ps(magnitudeVectorPtr, result); + _mm_store_ps(magnitudeVectorPtr, result); - magnitudeVectorPtr += 4; - } + magnitudeVectorPtr += 4; + } - number = quarterPoints * 4; - magnitudeVectorPtr = &magnitudeVector[number]; - complexVectorPtr = (const int16_t*)&complexVector[number]; - for(; number < num_points; number++){ - float val1Real = (float)(*complexVectorPtr++) * iScalar; - float val1Imag = (float)(*complexVectorPtr++) * iScalar; - *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)); - } + number = quarterPoints * 4; + magnitudeVectorPtr = &magnitudeVector[number]; + complexVectorPtr = (const int16_t*)&complexVector[number]; + for (; number < num_points; number++) { + float val1Real = (float)(*complexVectorPtr++) * iScalar; + float val1Imag = (float)(*complexVectorPtr++) * iScalar; + *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)); + } } @@ -251,33 +254,37 @@ volk_16ic_s32f_magnitude_32f_a_sse(float* magnitudeVector, const lv_16sc_t* comp #ifdef LV_HAVE_GENERIC -static inline void -volk_16ic_s32f_magnitude_32f_generic(float* magnitudeVector, const lv_16sc_t* complexVector, - const float scalar, unsigned int num_points) +static inline void volk_16ic_s32f_magnitude_32f_generic(float* magnitudeVector, + const lv_16sc_t* complexVector, + const float scalar, + unsigned int num_points) { - const int16_t* complexVectorPtr = (const int16_t*)complexVector; - float* magnitudeVectorPtr = magnitudeVector; - unsigned int number = 0; - const float invScalar = 1.0 / scalar; - for(number = 0; number < num_points; number++){ - float real = ( (float) (*complexVectorPtr++)) * invScalar; - float imag = ( (float) (*complexVectorPtr++)) * invScalar; - *magnitudeVectorPtr++ = sqrtf((real*real) + (imag*imag)); - } + const int16_t* complexVectorPtr = (const int16_t*)complexVector; + float* magnitudeVectorPtr = magnitudeVector; + unsigned int number = 0; + const float invScalar = 1.0 / scalar; + for (number = 0; number < num_points; number++) { + float real = ((float)(*complexVectorPtr++)) * invScalar; + float imag = ((float)(*complexVectorPtr++)) * invScalar; + *magnitudeVectorPtr++ = sqrtf((real * real) + (imag * imag)); + } } #endif /* LV_HAVE_GENERIC */ #ifdef LV_HAVE_ORC_DISABLED -extern void -volk_16ic_s32f_magnitude_32f_a_orc_impl(float* magnitudeVector, const lv_16sc_t* complexVector, - const float scalar, unsigned int num_points); +extern void volk_16ic_s32f_magnitude_32f_a_orc_impl(float* magnitudeVector, + const lv_16sc_t* complexVector, + const float scalar, + unsigned int num_points); -static inline void -volk_16ic_s32f_magnitude_32f_u_orc(float* magnitudeVector, const lv_16sc_t* complexVector, - const float scalar, unsigned int num_points) +static inline void volk_16ic_s32f_magnitude_32f_u_orc(float* magnitudeVector, + const lv_16sc_t* complexVector, + const float scalar, + unsigned int num_points) { - volk_16ic_s32f_magnitude_32f_a_orc_impl(magnitudeVector, complexVector, scalar, num_points); + 
volk_16ic_s32f_magnitude_32f_a_orc_impl( + magnitudeVector, complexVector, scalar, num_points); } #endif /* LV_HAVE_ORC */ @@ -287,69 +294,69 @@ volk_16ic_s32f_magnitude_32f_u_orc(float* magnitudeVector, const lv_16sc_t* comp #ifndef INCLUDED_volk_16ic_s32f_magnitude_32f_u_H #define INCLUDED_volk_16ic_s32f_magnitude_32f_u_H -#include #include -#include #include +#include +#include #ifdef LV_HAVE_AVX2 #include -static inline void -volk_16ic_s32f_magnitude_32f_u_avx2(float* magnitudeVector, const lv_16sc_t* complexVector, - const float scalar, unsigned int num_points) +static inline void volk_16ic_s32f_magnitude_32f_u_avx2(float* magnitudeVector, + const lv_16sc_t* complexVector, + const float scalar, + unsigned int num_points) { - unsigned int number = 0; - const unsigned int eighthPoints = num_points / 8; + unsigned int number = 0; + const unsigned int eighthPoints = num_points / 8; + + const int16_t* complexVectorPtr = (const int16_t*)complexVector; + float* magnitudeVectorPtr = magnitudeVector; - const int16_t* complexVectorPtr = (const int16_t*)complexVector; - float* magnitudeVectorPtr = magnitudeVector; + __m256 invScalar = _mm256_set1_ps(1.0 / scalar); - __m256 invScalar = _mm256_set1_ps(1.0/scalar); + __m256 cplxValue1, cplxValue2, result; + __m256i int1, int2; + __m128i short1, short2; + __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0); - __m256 cplxValue1, cplxValue2, result; - __m256i int1, int2; - __m128i short1, short2; - __m256i idx = _mm256_set_epi32(7,6,3,2,5,4,1,0); + for (; number < eighthPoints; number++) { - for(;number < eighthPoints; number++){ - - int1 = _mm256_loadu_si256((__m256i*)complexVectorPtr); - complexVectorPtr += 16; - short1 = _mm256_extracti128_si256(int1,0); - short2 = _mm256_extracti128_si256(int1,1); + int1 = _mm256_loadu_si256((__m256i*)complexVectorPtr); + complexVectorPtr += 16; + short1 = _mm256_extracti128_si256(int1, 0); + short2 = _mm256_extracti128_si256(int1, 1); - int1 = _mm256_cvtepi16_epi32(short1); - int2 = _mm256_cvtepi16_epi32(short2); - cplxValue1 = _mm256_cvtepi32_ps(int1); - cplxValue2 = _mm256_cvtepi32_ps(int2); + int1 = _mm256_cvtepi16_epi32(short1); + int2 = _mm256_cvtepi16_epi32(short2); + cplxValue1 = _mm256_cvtepi32_ps(int1); + cplxValue2 = _mm256_cvtepi32_ps(int2); - cplxValue1 = _mm256_mul_ps(cplxValue1, invScalar); - cplxValue2 = _mm256_mul_ps(cplxValue2, invScalar); + cplxValue1 = _mm256_mul_ps(cplxValue1, invScalar); + cplxValue2 = _mm256_mul_ps(cplxValue2, invScalar); - cplxValue1 = _mm256_mul_ps(cplxValue1, cplxValue1); // Square the values - cplxValue2 = _mm256_mul_ps(cplxValue2, cplxValue2); // Square the Values + cplxValue1 = _mm256_mul_ps(cplxValue1, cplxValue1); // Square the values + cplxValue2 = _mm256_mul_ps(cplxValue2, cplxValue2); // Square the Values - result = _mm256_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values - result = _mm256_permutevar8x32_ps(result, idx); + result = _mm256_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values + result = _mm256_permutevar8x32_ps(result, idx); - result = _mm256_sqrt_ps(result); // Square root the values + result = _mm256_sqrt_ps(result); // Square root the values - _mm256_storeu_ps(magnitudeVectorPtr, result); + _mm256_storeu_ps(magnitudeVectorPtr, result); - magnitudeVectorPtr += 8; - } + magnitudeVectorPtr += 8; + } - number = eighthPoints * 8; - magnitudeVectorPtr = &magnitudeVector[number]; - complexVectorPtr = (const int16_t*)&complexVector[number]; - for(; number < num_points; number++){ - float val1Real = (float)(*complexVectorPtr++) / 
scalar; - float val1Imag = (float)(*complexVectorPtr++) / scalar; - *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)); - } + number = eighthPoints * 8; + magnitudeVectorPtr = &magnitudeVector[number]; + complexVectorPtr = (const int16_t*)&complexVector[number]; + for (; number < num_points; number++) { + float val1Real = (float)(*complexVectorPtr++) / scalar; + float val1Imag = (float)(*complexVectorPtr++) / scalar; + *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)); + } } #endif /* LV_HAVE_AVX2 */ #endif /* INCLUDED_volk_16ic_s32f_magnitude_32f_u_H */ - diff --git a/kernels/volk/volk_16ic_x2_dot_prod_16ic.h b/kernels/volk/volk_16ic_x2_dot_prod_16ic.h index ae10cff..a1a0e8c 100644 --- a/kernels/volk/volk_16ic_x2_dot_prod_16ic.h +++ b/kernels/volk/volk_16ic_x2_dot_prod_16ic.h @@ -25,18 +25,20 @@ * * \b Overview * - * Multiplies two input complex vectors (16-bit integer each component) and accumulates them, - * storing the result. Results are saturated so never go beyond the limits of the data type. + * Multiplies two input complex vectors (16-bit integer each component) and accumulates + * them, storing the result. Results are saturated so never go beyond the limits of the + * data type. * * Dispatcher Prototype * \code - * void volk_16ic_x2_dot_prod_16ic(lv_16sc_t* result, const lv_16sc_t* in_a, const lv_16sc_t* in_b, unsigned int num_points); - * \endcode + * void volk_16ic_x2_dot_prod_16ic(lv_16sc_t* result, const lv_16sc_t* in_a, const + * lv_16sc_t* in_b, unsigned int num_points); \endcode * * \b Inputs * \li in_a: One of the vectors to be multiplied and accumulated. * \li in_b: The other vector to be multiplied and accumulated. - * \li num_points: Number of complex values to be multiplied together, accumulated and stored into \p result + * \li num_points: Number of complex values to be multiplied together, accumulated and + * stored into \p result * * \b Outputs * \li result: Value of the accumulated result. 
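 *
 * \b Example
 * A minimal usage sketch, assuming VOLK's volk_malloc / volk_get_alignment /
 * volk_free helpers and small made-up values so the saturating adds never clip:
 * \code
 *   unsigned int num_points = 3;
 *   size_t alignment = volk_get_alignment();
 *   lv_16sc_t* a = (lv_16sc_t*)volk_malloc(sizeof(lv_16sc_t) * num_points, alignment);
 *   lv_16sc_t* b = (lv_16sc_t*)volk_malloc(sizeof(lv_16sc_t) * num_points, alignment);
 *   lv_16sc_t result;
 *   for (unsigned int n = 0; n < num_points; n++) {
 *       a[n] = lv_cmake((int16_t)1, (int16_t)1); // 1 + 1j
 *       b[n] = lv_cmake((int16_t)2, (int16_t)0); // 2 + 0j
 *   }
 *   // each product is 2 + 2j, so the accumulated result is 6 + 6j
 *   volk_16ic_x2_dot_prod_16ic(&result, a, b, num_points);
 *   volk_free(a);
 *   volk_free(b);
 * \endcode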
@@ -46,22 +48,25 @@ #ifndef INCLUDED_volk_16ic_x2_dot_prod_16ic_H #define INCLUDED_volk_16ic_x2_dot_prod_16ic_H +#include #include #include -#include #ifdef LV_HAVE_GENERIC -static inline void volk_16ic_x2_dot_prod_16ic_generic(lv_16sc_t* result, const lv_16sc_t* in_a, const lv_16sc_t* in_b, unsigned int num_points) +static inline void volk_16ic_x2_dot_prod_16ic_generic(lv_16sc_t* result, + const lv_16sc_t* in_a, + const lv_16sc_t* in_b, + unsigned int num_points) { result[0] = lv_cmake((int16_t)0, (int16_t)0); unsigned int n; - for (n = 0; n < num_points; n++) - { - lv_16sc_t tmp = in_a[n] * in_b[n]; - result[0] = lv_cmake(sat_adds16i(lv_creal(result[0]), lv_creal(tmp)), sat_adds16i(lv_cimag(result[0]), lv_cimag(tmp) )); - } + for (n = 0; n < num_points; n++) { + lv_16sc_t tmp = in_a[n] * in_b[n]; + result[0] = lv_cmake(sat_adds16i(lv_creal(result[0]), lv_creal(tmp)), + sat_adds16i(lv_cimag(result[0]), lv_cimag(tmp))); + } } #endif /*LV_HAVE_GENERIC*/ @@ -70,7 +75,10 @@ static inline void volk_16ic_x2_dot_prod_16ic_generic(lv_16sc_t* result, const l #ifdef LV_HAVE_SSE2 #include -static inline void volk_16ic_x2_dot_prod_16ic_a_sse2(lv_16sc_t* out, const lv_16sc_t* in_a, const lv_16sc_t* in_b, unsigned int num_points) +static inline void volk_16ic_x2_dot_prod_16ic_a_sse2(lv_16sc_t* out, + const lv_16sc_t* in_a, + const lv_16sc_t* in_b, + unsigned int num_points) { lv_16sc_t dotProduct = lv_cmake((int16_t)0, (int16_t)0); @@ -81,62 +89,67 @@ static inline void volk_16ic_x2_dot_prod_16ic_a_sse2(lv_16sc_t* out, const lv_16 const lv_16sc_t* _in_b = in_b; lv_16sc_t* _out = out; - if (sse_iters > 0) - { - __m128i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl, a_sl, realcacc, imagcacc; - __VOLK_ATTR_ALIGNED(16) lv_16sc_t dotProductVector[4]; + if (sse_iters > 0) { + __m128i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl, a_sl, + realcacc, imagcacc; + __VOLK_ATTR_ALIGNED(16) lv_16sc_t dotProductVector[4]; - realcacc = _mm_setzero_si128(); - imagcacc = _mm_setzero_si128(); + realcacc = _mm_setzero_si128(); + imagcacc = _mm_setzero_si128(); - mask_imag = _mm_set_epi8(0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0); - mask_real = _mm_set_epi8(0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF); + mask_imag = _mm_set_epi8( + 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0); + mask_real = _mm_set_epi8( + 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF); - for(number = 0; number < sse_iters; number++) - { - // a[127:0]=[a3.i,a3.r,a2.i,a2.r,a1.i,a1.r,a0.i,a0.r] - a = _mm_load_si128((__m128i*)_in_a); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg - __VOLK_PREFETCH(_in_a + 8); - b = _mm_load_si128((__m128i*)_in_b); - __VOLK_PREFETCH(_in_b + 8); - c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, .... + for (number = 0; number < sse_iters; number++) { + // a[127:0]=[a3.i,a3.r,a2.i,a2.r,a1.i,a1.r,a0.i,a0.r] + a = _mm_load_si128( + (__m128i*)_in_a); // load (2 byte imag, 2 byte real) x 4 into 128 bits reg + __VOLK_PREFETCH(_in_a + 8); + b = _mm_load_si128((__m128i*)_in_b); + __VOLK_PREFETCH(_in_b + 8); + c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, .... - c_sr = _mm_srli_si128(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst. - real = _mm_subs_epi16(c, c_sr); + c_sr = _mm_srli_si128(c, 2); // Shift a right by imm8 bytes while shifting in + // zeros, and store the results in dst. 
+ real = _mm_subs_epi16(c, c_sr); - b_sl = _mm_slli_si128(b, 2); // b3.r, b2.i .... - a_sl = _mm_slli_si128(a, 2); // a3.r, a2.i .... + b_sl = _mm_slli_si128(b, 2); // b3.r, b2.i .... + a_sl = _mm_slli_si128(a, 2); // a3.r, a2.i .... - imag1 = _mm_mullo_epi16(a, b_sl); // a3.i*b3.r, .... - imag2 = _mm_mullo_epi16(b, a_sl); // b3.i*a3.r, .... + imag1 = _mm_mullo_epi16(a, b_sl); // a3.i*b3.r, .... + imag2 = _mm_mullo_epi16(b, a_sl); // b3.i*a3.r, .... - imag = _mm_adds_epi16(imag1, imag2); //with saturation arithmetic! + imag = _mm_adds_epi16(imag1, imag2); // with saturation arithmetic! - realcacc = _mm_adds_epi16(realcacc, real); - imagcacc = _mm_adds_epi16(imagcacc, imag); + realcacc = _mm_adds_epi16(realcacc, real); + imagcacc = _mm_adds_epi16(imagcacc, imag); - _in_a += 4; - _in_b += 4; - } + _in_a += 4; + _in_b += 4; + } - realcacc = _mm_and_si128(realcacc, mask_real); - imagcacc = _mm_and_si128(imagcacc, mask_imag); + realcacc = _mm_and_si128(realcacc, mask_real); + imagcacc = _mm_and_si128(imagcacc, mask_imag); - a = _mm_or_si128(realcacc, imagcacc); + a = _mm_or_si128(realcacc, imagcacc); - _mm_store_si128((__m128i*)dotProductVector, a); // Store the results back into the dot product vector + _mm_store_si128((__m128i*)dotProductVector, + a); // Store the results back into the dot product vector - for (number = 0; number < 4; ++number) - { - dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[number])), sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[number]))); - } + for (number = 0; number < 4; ++number) { + dotProduct = lv_cmake( + sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[number])), + sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[number]))); } + } - for (number = 0; number < (num_points % 4); ++number) - { - lv_16sc_t tmp = (*_in_a++) * (*_in_b++); - dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(tmp)), sat_adds16i(lv_cimag(dotProduct), lv_cimag(tmp))); - } + for (number = 0; number < (num_points % 4); ++number) { + lv_16sc_t tmp = (*_in_a++) * (*_in_b++); + dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(tmp)), + sat_adds16i(lv_cimag(dotProduct), lv_cimag(tmp))); + } *_out = dotProduct; } @@ -147,7 +160,10 @@ static inline void volk_16ic_x2_dot_prod_16ic_a_sse2(lv_16sc_t* out, const lv_16 #ifdef LV_HAVE_SSE2 #include -static inline void volk_16ic_x2_dot_prod_16ic_u_sse2(lv_16sc_t* out, const lv_16sc_t* in_a, const lv_16sc_t* in_b, unsigned int num_points) +static inline void volk_16ic_x2_dot_prod_16ic_u_sse2(lv_16sc_t* out, + const lv_16sc_t* in_a, + const lv_16sc_t* in_b, + unsigned int num_points) { lv_16sc_t dotProduct = lv_cmake((int16_t)0, (int16_t)0); @@ -158,62 +174,67 @@ static inline void volk_16ic_x2_dot_prod_16ic_u_sse2(lv_16sc_t* out, const lv_16 lv_16sc_t* _out = out; unsigned int number; - if (sse_iters > 0) - { - __m128i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl, a_sl, realcacc, imagcacc, result; - __VOLK_ATTR_ALIGNED(16) lv_16sc_t dotProductVector[4]; + if (sse_iters > 0) { + __m128i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl, a_sl, + realcacc, imagcacc, result; + __VOLK_ATTR_ALIGNED(16) lv_16sc_t dotProductVector[4]; - realcacc = _mm_setzero_si128(); - imagcacc = _mm_setzero_si128(); + realcacc = _mm_setzero_si128(); + imagcacc = _mm_setzero_si128(); - mask_imag = _mm_set_epi8(0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0); - mask_real = _mm_set_epi8(0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 
0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF); + mask_imag = _mm_set_epi8( + 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0); + mask_real = _mm_set_epi8( + 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF); - for(number = 0; number < sse_iters; number++) - { - // a[127:0]=[a3.i,a3.r,a2.i,a2.r,a1.i,a1.r,a0.i,a0.r] - a = _mm_loadu_si128((__m128i*)_in_a); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg - __VOLK_PREFETCH(_in_a + 8); - b = _mm_loadu_si128((__m128i*)_in_b); - __VOLK_PREFETCH(_in_b + 8); - c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, .... + for (number = 0; number < sse_iters; number++) { + // a[127:0]=[a3.i,a3.r,a2.i,a2.r,a1.i,a1.r,a0.i,a0.r] + a = _mm_loadu_si128( + (__m128i*)_in_a); // load (2 byte imag, 2 byte real) x 4 into 128 bits reg + __VOLK_PREFETCH(_in_a + 8); + b = _mm_loadu_si128((__m128i*)_in_b); + __VOLK_PREFETCH(_in_b + 8); + c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, .... - c_sr = _mm_srli_si128(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst. - real = _mm_subs_epi16(c, c_sr); + c_sr = _mm_srli_si128(c, 2); // Shift a right by imm8 bytes while shifting in + // zeros, and store the results in dst. + real = _mm_subs_epi16(c, c_sr); - b_sl = _mm_slli_si128(b, 2); // b3.r, b2.i .... - a_sl = _mm_slli_si128(a, 2); // a3.r, a2.i .... + b_sl = _mm_slli_si128(b, 2); // b3.r, b2.i .... + a_sl = _mm_slli_si128(a, 2); // a3.r, a2.i .... - imag1 = _mm_mullo_epi16(a, b_sl); // a3.i*b3.r, .... - imag2 = _mm_mullo_epi16(b, a_sl); // b3.i*a3.r, .... + imag1 = _mm_mullo_epi16(a, b_sl); // a3.i*b3.r, .... + imag2 = _mm_mullo_epi16(b, a_sl); // b3.i*a3.r, .... - imag = _mm_adds_epi16(imag1, imag2); //with saturation arithmetic! + imag = _mm_adds_epi16(imag1, imag2); // with saturation arithmetic! 
- realcacc = _mm_adds_epi16(realcacc, real); - imagcacc = _mm_adds_epi16(imagcacc, imag); + realcacc = _mm_adds_epi16(realcacc, real); + imagcacc = _mm_adds_epi16(imagcacc, imag); - _in_a += 4; - _in_b += 4; - } + _in_a += 4; + _in_b += 4; + } - realcacc = _mm_and_si128(realcacc, mask_real); - imagcacc = _mm_and_si128(imagcacc, mask_imag); + realcacc = _mm_and_si128(realcacc, mask_real); + imagcacc = _mm_and_si128(imagcacc, mask_imag); - result = _mm_or_si128(realcacc, imagcacc); + result = _mm_or_si128(realcacc, imagcacc); - _mm_storeu_si128((__m128i*)dotProductVector, result); // Store the results back into the dot product vector + _mm_storeu_si128((__m128i*)dotProductVector, + result); // Store the results back into the dot product vector - for (number = 0; number < 4; ++number) - { - dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[number])), sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[number]))); - } + for (number = 0; number < 4; ++number) { + dotProduct = lv_cmake( + sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[number])), + sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[number]))); } + } - for (number = 0; number < (num_points % 4); ++number) - { - lv_16sc_t tmp = (*_in_a++) * (*_in_b++); - dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(tmp)), sat_adds16i(lv_cimag(dotProduct), lv_cimag(tmp))); - } + for (number = 0; number < (num_points % 4); ++number) { + lv_16sc_t tmp = (*_in_a++) * (*_in_b++); + dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(tmp)), + sat_adds16i(lv_cimag(dotProduct), lv_cimag(tmp))); + } *_out = dotProduct; } @@ -223,7 +244,10 @@ static inline void volk_16ic_x2_dot_prod_16ic_u_sse2(lv_16sc_t* out, const lv_16 #ifdef LV_HAVE_AVX2 #include -static inline void volk_16ic_x2_dot_prod_16ic_u_axv2(lv_16sc_t* out, const lv_16sc_t* in_a, const lv_16sc_t* in_b, unsigned int num_points) +static inline void volk_16ic_x2_dot_prod_16ic_u_axv2(lv_16sc_t* out, + const lv_16sc_t* in_a, + const lv_16sc_t* in_b, + unsigned int num_points) { lv_16sc_t dotProduct = lv_cmake((int16_t)0, (int16_t)0); @@ -234,62 +258,126 @@ static inline void volk_16ic_x2_dot_prod_16ic_u_axv2(lv_16sc_t* out, const lv_16 lv_16sc_t* _out = out; unsigned int number; - if (avx_iters > 0) - { - __m256i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl, a_sl, realcacc, imagcacc, result; - __VOLK_ATTR_ALIGNED(32) lv_16sc_t dotProductVector[8]; - - realcacc = _mm256_setzero_si256(); - imagcacc = _mm256_setzero_si256(); - - mask_imag = _mm256_set_epi8(0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0); - mask_real = _mm256_set_epi8(0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF); - - for(number = 0; number < avx_iters; number++) - { - a = _mm256_loadu_si256((__m256i*)_in_a); - __VOLK_PREFETCH(_in_a + 16); - b = _mm256_loadu_si256((__m256i*)_in_b); - __VOLK_PREFETCH(_in_b + 16); - c = _mm256_mullo_epi16(a, b); - - c_sr = _mm256_srli_si256(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst. - real = _mm256_subs_epi16(c, c_sr); - - b_sl = _mm256_slli_si256(b, 2); - a_sl = _mm256_slli_si256(a, 2); - - imag1 = _mm256_mullo_epi16(a, b_sl); - imag2 = _mm256_mullo_epi16(b, a_sl); - - imag = _mm256_adds_epi16(imag1, imag2); //with saturation arithmetic! 
- - realcacc = _mm256_adds_epi16(realcacc, real); - imagcacc = _mm256_adds_epi16(imagcacc, imag); - - _in_a += 8; - _in_b += 8; - } + if (avx_iters > 0) { + __m256i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl, a_sl, + realcacc, imagcacc, result; + __VOLK_ATTR_ALIGNED(32) lv_16sc_t dotProductVector[8]; + + realcacc = _mm256_setzero_si256(); + imagcacc = _mm256_setzero_si256(); + + mask_imag = _mm256_set_epi8(0xFF, + 0xFF, + 0, + 0, + 0xFF, + 0xFF, + 0, + 0, + 0xFF, + 0xFF, + 0, + 0, + 0xFF, + 0xFF, + 0, + 0, + 0xFF, + 0xFF, + 0, + 0, + 0xFF, + 0xFF, + 0, + 0, + 0xFF, + 0xFF, + 0, + 0, + 0xFF, + 0xFF, + 0, + 0); + mask_real = _mm256_set_epi8(0, + 0, + 0xFF, + 0xFF, + 0, + 0, + 0xFF, + 0xFF, + 0, + 0, + 0xFF, + 0xFF, + 0, + 0, + 0xFF, + 0xFF, + 0, + 0, + 0xFF, + 0xFF, + 0, + 0, + 0xFF, + 0xFF, + 0, + 0, + 0xFF, + 0xFF, + 0, + 0, + 0xFF, + 0xFF); + + for (number = 0; number < avx_iters; number++) { + a = _mm256_loadu_si256((__m256i*)_in_a); + __VOLK_PREFETCH(_in_a + 16); + b = _mm256_loadu_si256((__m256i*)_in_b); + __VOLK_PREFETCH(_in_b + 16); + c = _mm256_mullo_epi16(a, b); + + c_sr = _mm256_srli_si256(c, 2); // Shift a right by imm8 bytes while shifting + // in zeros, and store the results in dst. + real = _mm256_subs_epi16(c, c_sr); + + b_sl = _mm256_slli_si256(b, 2); + a_sl = _mm256_slli_si256(a, 2); + + imag1 = _mm256_mullo_epi16(a, b_sl); + imag2 = _mm256_mullo_epi16(b, a_sl); + + imag = _mm256_adds_epi16(imag1, imag2); // with saturation arithmetic! + + realcacc = _mm256_adds_epi16(realcacc, real); + imagcacc = _mm256_adds_epi16(imagcacc, imag); + + _in_a += 8; + _in_b += 8; + } - realcacc = _mm256_and_si256(realcacc, mask_real); - imagcacc = _mm256_and_si256(imagcacc, mask_imag); + realcacc = _mm256_and_si256(realcacc, mask_real); + imagcacc = _mm256_and_si256(imagcacc, mask_imag); - result = _mm256_or_si256(realcacc, imagcacc); + result = _mm256_or_si256(realcacc, imagcacc); - _mm256_storeu_si256((__m256i*)dotProductVector, result); // Store the results back into the dot product vector - _mm256_zeroupper(); + _mm256_storeu_si256((__m256i*)dotProductVector, + result); // Store the results back into the dot product vector + _mm256_zeroupper(); - for (number = 0; number < 8; ++number) - { - dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[number])), sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[number]))); - } + for (number = 0; number < 8; ++number) { + dotProduct = lv_cmake( + sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[number])), + sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[number]))); } + } - for (number = 0; number < (num_points % 8); ++number) - { - lv_16sc_t tmp = (*_in_a++) * (*_in_b++); - dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(tmp)), sat_adds16i(lv_cimag(dotProduct), lv_cimag(tmp))); - } + for (number = 0; number < (num_points % 8); ++number) { + lv_16sc_t tmp = (*_in_a++) * (*_in_b++); + dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(tmp)), + sat_adds16i(lv_cimag(dotProduct), lv_cimag(tmp))); + } *_out = dotProduct; } @@ -299,7 +387,10 @@ static inline void volk_16ic_x2_dot_prod_16ic_u_axv2(lv_16sc_t* out, const lv_16 #ifdef LV_HAVE_AVX2 #include -static inline void volk_16ic_x2_dot_prod_16ic_a_axv2(lv_16sc_t* out, const lv_16sc_t* in_a, const lv_16sc_t* in_b, unsigned int num_points) +static inline void volk_16ic_x2_dot_prod_16ic_a_axv2(lv_16sc_t* out, + const lv_16sc_t* in_a, + const lv_16sc_t* in_b, + unsigned int num_points) { 
lv_16sc_t dotProduct = lv_cmake((int16_t)0, (int16_t)0); @@ -310,62 +401,126 @@ static inline void volk_16ic_x2_dot_prod_16ic_a_axv2(lv_16sc_t* out, const lv_16 lv_16sc_t* _out = out; unsigned int number; - if (avx_iters > 0) - { - __m256i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl, a_sl, realcacc, imagcacc, result; - __VOLK_ATTR_ALIGNED(32) lv_16sc_t dotProductVector[8]; - - realcacc = _mm256_setzero_si256(); - imagcacc = _mm256_setzero_si256(); - - mask_imag = _mm256_set_epi8(0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0); - mask_real = _mm256_set_epi8(0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF); - - for(number = 0; number < avx_iters; number++) - { - a = _mm256_load_si256((__m256i*)_in_a); - __VOLK_PREFETCH(_in_a + 16); - b = _mm256_load_si256((__m256i*)_in_b); - __VOLK_PREFETCH(_in_b + 16); - c = _mm256_mullo_epi16(a, b); - - c_sr = _mm256_srli_si256(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst. - real = _mm256_subs_epi16(c, c_sr); - - b_sl = _mm256_slli_si256(b, 2); - a_sl = _mm256_slli_si256(a, 2); - - imag1 = _mm256_mullo_epi16(a, b_sl); - imag2 = _mm256_mullo_epi16(b, a_sl); - - imag = _mm256_adds_epi16(imag1, imag2); //with saturation arithmetic! - - realcacc = _mm256_adds_epi16(realcacc, real); - imagcacc = _mm256_adds_epi16(imagcacc, imag); - - _in_a += 8; - _in_b += 8; - } + if (avx_iters > 0) { + __m256i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl, a_sl, + realcacc, imagcacc, result; + __VOLK_ATTR_ALIGNED(32) lv_16sc_t dotProductVector[8]; + + realcacc = _mm256_setzero_si256(); + imagcacc = _mm256_setzero_si256(); + + mask_imag = _mm256_set_epi8(0xFF, + 0xFF, + 0, + 0, + 0xFF, + 0xFF, + 0, + 0, + 0xFF, + 0xFF, + 0, + 0, + 0xFF, + 0xFF, + 0, + 0, + 0xFF, + 0xFF, + 0, + 0, + 0xFF, + 0xFF, + 0, + 0, + 0xFF, + 0xFF, + 0, + 0, + 0xFF, + 0xFF, + 0, + 0); + mask_real = _mm256_set_epi8(0, + 0, + 0xFF, + 0xFF, + 0, + 0, + 0xFF, + 0xFF, + 0, + 0, + 0xFF, + 0xFF, + 0, + 0, + 0xFF, + 0xFF, + 0, + 0, + 0xFF, + 0xFF, + 0, + 0, + 0xFF, + 0xFF, + 0, + 0, + 0xFF, + 0xFF, + 0, + 0, + 0xFF, + 0xFF); + + for (number = 0; number < avx_iters; number++) { + a = _mm256_load_si256((__m256i*)_in_a); + __VOLK_PREFETCH(_in_a + 16); + b = _mm256_load_si256((__m256i*)_in_b); + __VOLK_PREFETCH(_in_b + 16); + c = _mm256_mullo_epi16(a, b); + + c_sr = _mm256_srli_si256(c, 2); // Shift a right by imm8 bytes while shifting + // in zeros, and store the results in dst. + real = _mm256_subs_epi16(c, c_sr); + + b_sl = _mm256_slli_si256(b, 2); + a_sl = _mm256_slli_si256(a, 2); + + imag1 = _mm256_mullo_epi16(a, b_sl); + imag2 = _mm256_mullo_epi16(b, a_sl); + + imag = _mm256_adds_epi16(imag1, imag2); // with saturation arithmetic! 
+ + realcacc = _mm256_adds_epi16(realcacc, real); + imagcacc = _mm256_adds_epi16(imagcacc, imag); + + _in_a += 8; + _in_b += 8; + } - realcacc = _mm256_and_si256(realcacc, mask_real); - imagcacc = _mm256_and_si256(imagcacc, mask_imag); + realcacc = _mm256_and_si256(realcacc, mask_real); + imagcacc = _mm256_and_si256(imagcacc, mask_imag); - result = _mm256_or_si256(realcacc, imagcacc); + result = _mm256_or_si256(realcacc, imagcacc); - _mm256_store_si256((__m256i*)dotProductVector, result); // Store the results back into the dot product vector - _mm256_zeroupper(); + _mm256_store_si256((__m256i*)dotProductVector, + result); // Store the results back into the dot product vector + _mm256_zeroupper(); - for (number = 0; number < 8; ++number) - { - dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[number])), sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[number]))); - } + for (number = 0; number < 8; ++number) { + dotProduct = lv_cmake( + sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[number])), + sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[number]))); } + } - for (number = 0; number < (num_points % 8); ++number) - { - lv_16sc_t tmp = (*_in_a++) * (*_in_b++); - dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(tmp)), sat_adds16i(lv_cimag(dotProduct), lv_cimag(tmp))); - } + for (number = 0; number < (num_points % 8); ++number) { + lv_16sc_t tmp = (*_in_a++) * (*_in_b++); + dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(tmp)), + sat_adds16i(lv_cimag(dotProduct), lv_cimag(tmp))); + } *_out = dotProduct; } @@ -375,69 +530,70 @@ static inline void volk_16ic_x2_dot_prod_16ic_a_axv2(lv_16sc_t* out, const lv_16 #ifdef LV_HAVE_NEON #include -static inline void volk_16ic_x2_dot_prod_16ic_neon(lv_16sc_t* out, const lv_16sc_t* in_a, const lv_16sc_t* in_b, unsigned int num_points) +static inline void volk_16ic_x2_dot_prod_16ic_neon(lv_16sc_t* out, + const lv_16sc_t* in_a, + const lv_16sc_t* in_b, + unsigned int num_points) { unsigned int quarter_points = num_points / 4; unsigned int number; - lv_16sc_t* a_ptr = (lv_16sc_t*) in_a; - lv_16sc_t* b_ptr = (lv_16sc_t*) in_b; + lv_16sc_t* a_ptr = (lv_16sc_t*)in_a; + lv_16sc_t* b_ptr = (lv_16sc_t*)in_b; *out = lv_cmake((int16_t)0, (int16_t)0); - if (quarter_points > 0) - { - // for 2-lane vectors, 1st lane holds the real part, - // 2nd lane holds the imaginary part - int16x4x2_t a_val, b_val, c_val, accumulator; - int16x4x2_t tmp_real, tmp_imag; - __VOLK_ATTR_ALIGNED(16) lv_16sc_t accum_result[4]; - accumulator.val[0] = vdup_n_s16(0); - accumulator.val[1] = vdup_n_s16(0); - lv_16sc_t dotProduct = lv_cmake((int16_t)0, (int16_t)0); - - for(number = 0; number < quarter_points; ++number) - { - a_val = vld2_s16((int16_t*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i - b_val = vld2_s16((int16_t*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i - __VOLK_PREFETCH(a_ptr + 8); - __VOLK_PREFETCH(b_ptr + 8); - - // multiply the real*real and imag*imag to get real result - // a0r*b0r|a1r*b1r|a2r*b2r|a3r*b3r - tmp_real.val[0] = vmul_s16(a_val.val[0], b_val.val[0]); - // a0i*b0i|a1i*b1i|a2i*b2i|a3i*b3i - tmp_real.val[1] = vmul_s16(a_val.val[1], b_val.val[1]); - - // Multiply cross terms to get the imaginary result - // a0r*b0i|a1r*b1i|a2r*b2i|a3r*b3i - tmp_imag.val[0] = vmul_s16(a_val.val[0], b_val.val[1]); - // a0i*b0r|a1i*b1r|a2i*b2r|a3i*b3r - tmp_imag.val[1] = vmul_s16(a_val.val[1], b_val.val[0]); - - c_val.val[0] = vqsub_s16(tmp_real.val[0], tmp_real.val[1]); - c_val.val[1] 
= vqadd_s16(tmp_imag.val[0], tmp_imag.val[1]); - - accumulator.val[0] = vqadd_s16(accumulator.val[0], c_val.val[0]); - accumulator.val[1] = vqadd_s16(accumulator.val[1], c_val.val[1]); - - a_ptr += 4; - b_ptr += 4; - } - - vst2_s16((int16_t*)accum_result, accumulator); - for (number = 0; number < 4; ++number) - { - dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(accum_result[number])), sat_adds16i(lv_cimag(dotProduct), lv_cimag(accum_result[number]))); - } - - *out = dotProduct; + if (quarter_points > 0) { + // for 2-lane vectors, 1st lane holds the real part, + // 2nd lane holds the imaginary part + int16x4x2_t a_val, b_val, c_val, accumulator; + int16x4x2_t tmp_real, tmp_imag; + __VOLK_ATTR_ALIGNED(16) lv_16sc_t accum_result[4]; + accumulator.val[0] = vdup_n_s16(0); + accumulator.val[1] = vdup_n_s16(0); + lv_16sc_t dotProduct = lv_cmake((int16_t)0, (int16_t)0); + + for (number = 0; number < quarter_points; ++number) { + a_val = vld2_s16((int16_t*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i + b_val = vld2_s16((int16_t*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i + __VOLK_PREFETCH(a_ptr + 8); + __VOLK_PREFETCH(b_ptr + 8); + + // multiply the real*real and imag*imag to get real result + // a0r*b0r|a1r*b1r|a2r*b2r|a3r*b3r + tmp_real.val[0] = vmul_s16(a_val.val[0], b_val.val[0]); + // a0i*b0i|a1i*b1i|a2i*b2i|a3i*b3i + tmp_real.val[1] = vmul_s16(a_val.val[1], b_val.val[1]); + + // Multiply cross terms to get the imaginary result + // a0r*b0i|a1r*b1i|a2r*b2i|a3r*b3i + tmp_imag.val[0] = vmul_s16(a_val.val[0], b_val.val[1]); + // a0i*b0r|a1i*b1r|a2i*b2r|a3i*b3r + tmp_imag.val[1] = vmul_s16(a_val.val[1], b_val.val[0]); + + c_val.val[0] = vqsub_s16(tmp_real.val[0], tmp_real.val[1]); + c_val.val[1] = vqadd_s16(tmp_imag.val[0], tmp_imag.val[1]); + + accumulator.val[0] = vqadd_s16(accumulator.val[0], c_val.val[0]); + accumulator.val[1] = vqadd_s16(accumulator.val[1], c_val.val[1]); + + a_ptr += 4; + b_ptr += 4; } - // tail case - for(number = quarter_points * 4; number < num_points; ++number) - { - *out += (*a_ptr++) * (*b_ptr++); + vst2_s16((int16_t*)accum_result, accumulator); + for (number = 0; number < 4; ++number) { + dotProduct = lv_cmake( + sat_adds16i(lv_creal(dotProduct), lv_creal(accum_result[number])), + sat_adds16i(lv_cimag(dotProduct), lv_cimag(accum_result[number]))); } + + *out = dotProduct; + } + + // tail case + for (number = quarter_points * 4; number < num_points; ++number) { + *out += (*a_ptr++) * (*b_ptr++); + } } #endif /* LV_HAVE_NEON */ @@ -446,13 +602,16 @@ static inline void volk_16ic_x2_dot_prod_16ic_neon(lv_16sc_t* out, const lv_16sc #ifdef LV_HAVE_NEON #include -static inline void volk_16ic_x2_dot_prod_16ic_neon_vma(lv_16sc_t* out, const lv_16sc_t* in_a, const lv_16sc_t* in_b, unsigned int num_points) +static inline void volk_16ic_x2_dot_prod_16ic_neon_vma(lv_16sc_t* out, + const lv_16sc_t* in_a, + const lv_16sc_t* in_b, + unsigned int num_points) { unsigned int quarter_points = num_points / 4; unsigned int number; - lv_16sc_t* a_ptr = (lv_16sc_t*) in_a; - lv_16sc_t* b_ptr = (lv_16sc_t*) in_b; + lv_16sc_t* a_ptr = (lv_16sc_t*)in_a; + lv_16sc_t* b_ptr = (lv_16sc_t*)in_b; // for 2-lane vectors, 1st lane holds the real part, // 2nd lane holds the imaginary part int16x4x2_t a_val, b_val, accumulator; @@ -461,35 +620,33 @@ static inline void volk_16ic_x2_dot_prod_16ic_neon_vma(lv_16sc_t* out, const lv_ accumulator.val[0] = vdup_n_s16(0); accumulator.val[1] = vdup_n_s16(0); - for(number = 0; number < quarter_points; ++number) - { - a_val = 
vld2_s16((int16_t*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i - b_val = vld2_s16((int16_t*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i - __VOLK_PREFETCH(a_ptr + 8); - __VOLK_PREFETCH(b_ptr + 8); + for (number = 0; number < quarter_points; ++number) { + a_val = vld2_s16((int16_t*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i + b_val = vld2_s16((int16_t*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i + __VOLK_PREFETCH(a_ptr + 8); + __VOLK_PREFETCH(b_ptr + 8); - tmp.val[0] = vmul_s16(a_val.val[0], b_val.val[0]); - tmp.val[1] = vmul_s16(a_val.val[1], b_val.val[0]); + tmp.val[0] = vmul_s16(a_val.val[0], b_val.val[0]); + tmp.val[1] = vmul_s16(a_val.val[1], b_val.val[0]); - // use multiply accumulate/subtract to get result - tmp.val[0] = vmls_s16(tmp.val[0], a_val.val[1], b_val.val[1]); - tmp.val[1] = vmla_s16(tmp.val[1], a_val.val[0], b_val.val[1]); + // use multiply accumulate/subtract to get result + tmp.val[0] = vmls_s16(tmp.val[0], a_val.val[1], b_val.val[1]); + tmp.val[1] = vmla_s16(tmp.val[1], a_val.val[0], b_val.val[1]); - accumulator.val[0] = vqadd_s16(accumulator.val[0], tmp.val[0]); - accumulator.val[1] = vqadd_s16(accumulator.val[1], tmp.val[1]); + accumulator.val[0] = vqadd_s16(accumulator.val[0], tmp.val[0]); + accumulator.val[1] = vqadd_s16(accumulator.val[1], tmp.val[1]); - a_ptr += 4; - b_ptr += 4; - } + a_ptr += 4; + b_ptr += 4; + } vst2_s16((int16_t*)accum_result, accumulator); *out = accum_result[0] + accum_result[1] + accum_result[2] + accum_result[3]; // tail case - for(number = quarter_points * 4; number < num_points; ++number) - { - *out += (*a_ptr++) * (*b_ptr++); - } + for (number = quarter_points * 4; number < num_points; ++number) { + *out += (*a_ptr++) * (*b_ptr++); + } } #endif /* LV_HAVE_NEON */ @@ -498,13 +655,16 @@ static inline void volk_16ic_x2_dot_prod_16ic_neon_vma(lv_16sc_t* out, const lv_ #ifdef LV_HAVE_NEON #include -static inline void volk_16ic_x2_dot_prod_16ic_neon_optvma(lv_16sc_t* out, const lv_16sc_t* in_a, const lv_16sc_t* in_b, unsigned int num_points) +static inline void volk_16ic_x2_dot_prod_16ic_neon_optvma(lv_16sc_t* out, + const lv_16sc_t* in_a, + const lv_16sc_t* in_b, + unsigned int num_points) { unsigned int quarter_points = num_points / 4; unsigned int number; - lv_16sc_t* a_ptr = (lv_16sc_t*) in_a; - lv_16sc_t* b_ptr = (lv_16sc_t*) in_b; + lv_16sc_t* a_ptr = (lv_16sc_t*)in_a; + lv_16sc_t* b_ptr = (lv_16sc_t*)in_b; // for 2-lane vectors, 1st lane holds the real part, // 2nd lane holds the imaginary part int16x4x2_t a_val, b_val, accumulator1, accumulator2; @@ -515,22 +675,21 @@ static inline void volk_16ic_x2_dot_prod_16ic_neon_optvma(lv_16sc_t* out, const accumulator2.val[0] = vdup_n_s16(0); accumulator2.val[1] = vdup_n_s16(0); - for(number = 0; number < quarter_points; ++number) - { - a_val = vld2_s16((int16_t*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i - b_val = vld2_s16((int16_t*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i - __VOLK_PREFETCH(a_ptr + 8); - __VOLK_PREFETCH(b_ptr + 8); + for (number = 0; number < quarter_points; ++number) { + a_val = vld2_s16((int16_t*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i + b_val = vld2_s16((int16_t*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i + __VOLK_PREFETCH(a_ptr + 8); + __VOLK_PREFETCH(b_ptr + 8); - // use 2 accumulators to remove inter-instruction data dependencies - accumulator1.val[0] = vmla_s16(accumulator1.val[0], a_val.val[0], b_val.val[0]); - accumulator2.val[0] = vmls_s16(accumulator2.val[0], a_val.val[1], b_val.val[1]); - accumulator1.val[1] = vmla_s16(accumulator1.val[1], 
a_val.val[0], b_val.val[1]); - accumulator2.val[1] = vmla_s16(accumulator2.val[1], a_val.val[1], b_val.val[0]); + // use 2 accumulators to remove inter-instruction data dependencies + accumulator1.val[0] = vmla_s16(accumulator1.val[0], a_val.val[0], b_val.val[0]); + accumulator2.val[0] = vmls_s16(accumulator2.val[0], a_val.val[1], b_val.val[1]); + accumulator1.val[1] = vmla_s16(accumulator1.val[1], a_val.val[0], b_val.val[1]); + accumulator2.val[1] = vmla_s16(accumulator2.val[1], a_val.val[1], b_val.val[0]); - a_ptr += 4; - b_ptr += 4; - } + a_ptr += 4; + b_ptr += 4; + } accumulator1.val[0] = vqadd_s16(accumulator1.val[0], accumulator2.val[0]); accumulator1.val[1] = vqadd_s16(accumulator1.val[1], accumulator2.val[1]); @@ -539,10 +698,9 @@ static inline void volk_16ic_x2_dot_prod_16ic_neon_optvma(lv_16sc_t* out, const *out = accum_result[0] + accum_result[1] + accum_result[2] + accum_result[3]; // tail case - for(number = quarter_points * 4; number < num_points; ++number) - { - *out += (*a_ptr++) * (*b_ptr++); - } + for (number = quarter_points * 4; number < num_points; ++number) { + *out += (*a_ptr++) * (*b_ptr++); + } } #endif /* LV_HAVE_NEON */ diff --git a/kernels/volk/volk_16ic_x2_multiply_16ic.h b/kernels/volk/volk_16ic_x2_multiply_16ic.h index 20d6a7f..2bf835d 100644 --- a/kernels/volk/volk_16ic_x2_multiply_16ic.h +++ b/kernels/volk/volk_16ic_x2_multiply_16ic.h @@ -25,18 +25,19 @@ * * \b Overview * - * Multiplies two input complex vectors, point-by-point, storing the result in the third vector. - * WARNING: Saturation is not checked. + * Multiplies two input complex vectors, point-by-point, storing the result in the third + * vector. WARNING: Saturation is not checked. * * Dispatcher Prototype * \code - * void volk_16ic_x2_multiply_16ic(lv_16sc_t* result, const lv_16sc_t* in_a, const lv_16sc_t* in_b, unsigned int num_points); - * \endcode + * void volk_16ic_x2_multiply_16ic(lv_16sc_t* result, const lv_16sc_t* in_a, const + * lv_16sc_t* in_b, unsigned int num_points); \endcode * * \b Inputs * \li in_a: One of the vectors to be multiplied. * \li in_b: The other vector to be multiplied. - * \li num_points: The number of complex data points to be multiplied from both input vectors. + * \li num_points: The number of complex data points to be multiplied from both input + * vectors. * * \b Outputs * \li result: The vector where the results will be stored. 
@@ -51,13 +52,15 @@ #ifdef LV_HAVE_GENERIC -static inline void volk_16ic_x2_multiply_16ic_generic(lv_16sc_t* result, const lv_16sc_t* in_a, const lv_16sc_t* in_b, unsigned int num_points) +static inline void volk_16ic_x2_multiply_16ic_generic(lv_16sc_t* result, + const lv_16sc_t* in_a, + const lv_16sc_t* in_b, + unsigned int num_points) { unsigned int n; - for (n = 0; n < num_points; n++) - { - result[n] = in_a[n] * in_b[n]; - } + for (n = 0; n < num_points; n++) { + result[n] = in_a[n] * in_b[n]; + } } #endif /*LV_HAVE_GENERIC*/ @@ -66,51 +69,58 @@ static inline void volk_16ic_x2_multiply_16ic_generic(lv_16sc_t* result, const l #ifdef LV_HAVE_SSE2 #include -static inline void volk_16ic_x2_multiply_16ic_a_sse2(lv_16sc_t* out, const lv_16sc_t* in_a, const lv_16sc_t* in_b, unsigned int num_points) +static inline void volk_16ic_x2_multiply_16ic_a_sse2(lv_16sc_t* out, + const lv_16sc_t* in_a, + const lv_16sc_t* in_b, + unsigned int num_points) { const unsigned int sse_iters = num_points / 4; - __m128i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl, a_sl, result; + __m128i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl, a_sl, + result; - mask_imag = _mm_set_epi8(0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0); - mask_real = _mm_set_epi8(0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF); + mask_imag = _mm_set_epi8( + 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0); + mask_real = _mm_set_epi8( + 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF); const lv_16sc_t* _in_a = in_a; const lv_16sc_t* _in_b = in_b; lv_16sc_t* _out = out; unsigned int number; - for(number = 0; number < sse_iters; number++) - { - a = _mm_load_si128((__m128i*)_in_a); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg - b = _mm_load_si128((__m128i*)_in_b); - c = _mm_mullo_epi16 (a, b); // a3.i*b3.i, a3.r*b3.r, .... + for (number = 0; number < sse_iters; number++) { + a = _mm_load_si128( + (__m128i*)_in_a); // load (2 byte imag, 2 byte real) x 4 into 128 bits reg + b = _mm_load_si128((__m128i*)_in_b); + c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, .... - c_sr = _mm_srli_si128 (c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst. - real = _mm_subs_epi16 (c, c_sr); - real = _mm_and_si128 (real, mask_real); // a3.r*b3.r-a3.i*b3.i , 0, a3.r*b3.r- a3.i*b3.i + c_sr = _mm_srli_si128(c, 2); // Shift a right by imm8 bytes while shifting in + // zeros, and store the results in dst. + real = _mm_subs_epi16(c, c_sr); + real = _mm_and_si128(real, + mask_real); // a3.r*b3.r-a3.i*b3.i , 0, a3.r*b3.r- a3.i*b3.i - b_sl = _mm_slli_si128(b, 2); // b3.r, b2.i .... - a_sl = _mm_slli_si128(a, 2); // a3.r, a2.i .... + b_sl = _mm_slli_si128(b, 2); // b3.r, b2.i .... + a_sl = _mm_slli_si128(a, 2); // a3.r, a2.i .... - imag1 = _mm_mullo_epi16(a, b_sl); // a3.i*b3.r, .... - imag2 = _mm_mullo_epi16(b, a_sl); // b3.i*a3.r, .... + imag1 = _mm_mullo_epi16(a, b_sl); // a3.i*b3.r, .... + imag2 = _mm_mullo_epi16(b, a_sl); // b3.i*a3.r, .... - imag = _mm_adds_epi16(imag1, imag2); - imag = _mm_and_si128 (imag, mask_imag); // a3.i*b3.r+b3.i*a3.r, 0, ... + imag = _mm_adds_epi16(imag1, imag2); + imag = _mm_and_si128(imag, mask_imag); // a3.i*b3.r+b3.i*a3.r, 0, ... 
- result = _mm_or_si128 (real, imag); + result = _mm_or_si128(real, imag); - _mm_store_si128((__m128i*)_out, result); + _mm_store_si128((__m128i*)_out, result); - _in_a += 4; - _in_b += 4; - _out += 4; - } + _in_a += 4; + _in_b += 4; + _out += 4; + } - for (number = sse_iters * 4; number < num_points; ++number) - { - *_out++ = (*_in_a++) * (*_in_b++); - } + for (number = sse_iters * 4; number < num_points; ++number) { + *_out++ = (*_in_a++) * (*_in_b++); + } } #endif /* LV_HAVE_SSE2 */ @@ -118,51 +128,58 @@ static inline void volk_16ic_x2_multiply_16ic_a_sse2(lv_16sc_t* out, const lv_16 #ifdef LV_HAVE_SSE2 #include -static inline void volk_16ic_x2_multiply_16ic_u_sse2(lv_16sc_t* out, const lv_16sc_t* in_a, const lv_16sc_t* in_b, unsigned int num_points) +static inline void volk_16ic_x2_multiply_16ic_u_sse2(lv_16sc_t* out, + const lv_16sc_t* in_a, + const lv_16sc_t* in_b, + unsigned int num_points) { const unsigned int sse_iters = num_points / 4; - __m128i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1,imag2, b_sl, a_sl, result; + __m128i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl, a_sl, + result; - mask_imag = _mm_set_epi8(0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0); - mask_real = _mm_set_epi8(0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF); + mask_imag = _mm_set_epi8( + 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0); + mask_real = _mm_set_epi8( + 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF); const lv_16sc_t* _in_a = in_a; const lv_16sc_t* _in_b = in_b; lv_16sc_t* _out = out; unsigned int number; - for(number = 0; number < sse_iters; number++) - { - a = _mm_loadu_si128((__m128i*)_in_a); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg - b = _mm_loadu_si128((__m128i*)_in_b); - c = _mm_mullo_epi16 (a, b); // a3.i*b3.i, a3.r*b3.r, .... + for (number = 0; number < sse_iters; number++) { + a = _mm_loadu_si128( + (__m128i*)_in_a); // load (2 byte imag, 2 byte real) x 4 into 128 bits reg + b = _mm_loadu_si128((__m128i*)_in_b); + c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, .... - c_sr = _mm_srli_si128 (c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst. - real = _mm_subs_epi16 (c, c_sr); - real = _mm_and_si128 (real, mask_real); // a3.r*b3.r-a3.i*b3.i , 0, a3.r*b3.r- a3.i*b3.i + c_sr = _mm_srli_si128(c, 2); // Shift a right by imm8 bytes while shifting in + // zeros, and store the results in dst. + real = _mm_subs_epi16(c, c_sr); + real = _mm_and_si128(real, + mask_real); // a3.r*b3.r-a3.i*b3.i , 0, a3.r*b3.r- a3.i*b3.i - b_sl = _mm_slli_si128(b, 2); // b3.r, b2.i .... - a_sl = _mm_slli_si128(a, 2); // a3.r, a2.i .... + b_sl = _mm_slli_si128(b, 2); // b3.r, b2.i .... + a_sl = _mm_slli_si128(a, 2); // a3.r, a2.i .... - imag1 = _mm_mullo_epi16(a, b_sl); // a3.i*b3.r, .... - imag2 = _mm_mullo_epi16(b, a_sl); // b3.i*a3.r, .... + imag1 = _mm_mullo_epi16(a, b_sl); // a3.i*b3.r, .... + imag2 = _mm_mullo_epi16(b, a_sl); // b3.i*a3.r, .... - imag = _mm_adds_epi16(imag1, imag2); - imag = _mm_and_si128 (imag, mask_imag); // a3.i*b3.r+b3.i*a3.r, 0, ... + imag = _mm_adds_epi16(imag1, imag2); + imag = _mm_and_si128(imag, mask_imag); // a3.i*b3.r+b3.i*a3.r, 0, ... 
- result = _mm_or_si128 (real, imag); + result = _mm_or_si128(real, imag); - _mm_storeu_si128((__m128i*)_out, result); + _mm_storeu_si128((__m128i*)_out, result); - _in_a += 4; - _in_b += 4; - _out += 4; - } + _in_a += 4; + _in_b += 4; + _out += 4; + } - for (number = sse_iters * 4; number < num_points; ++number) - { - *_out++ = (*_in_a++) * (*_in_b++); - } + for (number = sse_iters * 4; number < num_points; ++number) { + *_out++ = (*_in_a++) * (*_in_b++); + } } #endif /* LV_HAVE_SSE2 */ @@ -170,7 +187,10 @@ static inline void volk_16ic_x2_multiply_16ic_u_sse2(lv_16sc_t* out, const lv_16 #ifdef LV_HAVE_AVX2 #include -static inline void volk_16ic_x2_multiply_16ic_u_avx2(lv_16sc_t* out, const lv_16sc_t* in_a, const lv_16sc_t* in_b, unsigned int num_points) +static inline void volk_16ic_x2_multiply_16ic_u_avx2(lv_16sc_t* out, + const lv_16sc_t* in_a, + const lv_16sc_t* in_b, + unsigned int num_points) { unsigned int number = 0; const unsigned int avx2_points = num_points / 8; @@ -179,44 +199,108 @@ static inline void volk_16ic_x2_multiply_16ic_u_avx2(lv_16sc_t* out, const lv_16 const lv_16sc_t* _in_b = in_b; lv_16sc_t* _out = out; - __m256i a, b, c, c_sr, real, imag, imag1, imag2, b_sl, a_sl, result; - - const __m256i mask_imag = _mm256_set_epi8(0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0); - const __m256i mask_real = _mm256_set_epi8(0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF); - - for(;number < avx2_points; number++) - { - a = _mm256_loadu_si256((__m256i*)_in_a); // Load the ar + ai, br + bi as ar,ai,br,bi - b = _mm256_loadu_si256((__m256i*)_in_b); // Load the cr + ci, dr + di as cr,ci,dr,di - c = _mm256_mullo_epi16(a, b); - - c_sr = _mm256_srli_si256(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst. - real = _mm256_subs_epi16(c, c_sr); - real = _mm256_and_si256(real, mask_real); // a3.r*b3.r-a3.i*b3.i , 0, a3.r*b3.r- a3.i*b3.i - - b_sl = _mm256_slli_si256(b, 2); // b3.r, b2.i .... - a_sl = _mm256_slli_si256(a, 2); // a3.r, a2.i .... - - imag1 = _mm256_mullo_epi16(a, b_sl); // a3.i*b3.r, .... - imag2 = _mm256_mullo_epi16(b, a_sl); // b3.i*a3.r, .... - - imag = _mm256_adds_epi16(imag1, imag2); - imag = _mm256_and_si256(imag, mask_imag); // a3.i*b3.r+b3.i*a3.r, 0, ... - - result = _mm256_or_si256(real, imag); - - _mm256_storeu_si256((__m256i*)_out, result); - - _in_a += 8; - _in_b += 8; - _out += 8; - } + __m256i a, b, c, c_sr, real, imag, imag1, imag2, b_sl, a_sl, result; + + const __m256i mask_imag = _mm256_set_epi8(0xFF, + 0xFF, + 0, + 0, + 0xFF, + 0xFF, + 0, + 0, + 0xFF, + 0xFF, + 0, + 0, + 0xFF, + 0xFF, + 0, + 0, + 0xFF, + 0xFF, + 0, + 0, + 0xFF, + 0xFF, + 0, + 0, + 0xFF, + 0xFF, + 0, + 0, + 0xFF, + 0xFF, + 0, + 0); + const __m256i mask_real = _mm256_set_epi8(0, + 0, + 0xFF, + 0xFF, + 0, + 0, + 0xFF, + 0xFF, + 0, + 0, + 0xFF, + 0xFF, + 0, + 0, + 0xFF, + 0xFF, + 0, + 0, + 0xFF, + 0xFF, + 0, + 0, + 0xFF, + 0xFF, + 0, + 0, + 0xFF, + 0xFF, + 0, + 0, + 0xFF, + 0xFF); + + for (; number < avx2_points; number++) { + a = _mm256_loadu_si256( + (__m256i*)_in_a); // Load the ar + ai, br + bi as ar,ai,br,bi + b = _mm256_loadu_si256( + (__m256i*)_in_b); // Load the cr + ci, dr + di as cr,ci,dr,di + c = _mm256_mullo_epi16(a, b); + + c_sr = _mm256_srli_si256(c, 2); // Shift a right by imm8 bytes while shifting in + // zeros, and store the results in dst. 
+ real = _mm256_subs_epi16(c, c_sr); + real = _mm256_and_si256( + real, mask_real); // a3.r*b3.r-a3.i*b3.i , 0, a3.r*b3.r- a3.i*b3.i + + b_sl = _mm256_slli_si256(b, 2); // b3.r, b2.i .... + a_sl = _mm256_slli_si256(a, 2); // a3.r, a2.i .... + + imag1 = _mm256_mullo_epi16(a, b_sl); // a3.i*b3.r, .... + imag2 = _mm256_mullo_epi16(b, a_sl); // b3.i*a3.r, .... + + imag = _mm256_adds_epi16(imag1, imag2); + imag = _mm256_and_si256(imag, mask_imag); // a3.i*b3.r+b3.i*a3.r, 0, ... + + result = _mm256_or_si256(real, imag); + + _mm256_storeu_si256((__m256i*)_out, result); + + _in_a += 8; + _in_b += 8; + _out += 8; + } _mm256_zeroupper(); number = avx2_points * 8; - for(;number < num_points; number++) - { - *_out++ = (*_in_a++) * (*_in_b++); - } + for (; number < num_points; number++) { + *_out++ = (*_in_a++) * (*_in_b++); + } } #endif /* LV_HAVE_AVX2 */ @@ -224,7 +308,10 @@ static inline void volk_16ic_x2_multiply_16ic_u_avx2(lv_16sc_t* out, const lv_16 #ifdef LV_HAVE_AVX2 #include -static inline void volk_16ic_x2_multiply_16ic_a_avx2(lv_16sc_t* out, const lv_16sc_t* in_a, const lv_16sc_t* in_b, unsigned int num_points) +static inline void volk_16ic_x2_multiply_16ic_a_avx2(lv_16sc_t* out, + const lv_16sc_t* in_a, + const lv_16sc_t* in_b, + unsigned int num_points) { unsigned int number = 0; const unsigned int avx2_points = num_points / 8; @@ -233,44 +320,108 @@ static inline void volk_16ic_x2_multiply_16ic_a_avx2(lv_16sc_t* out, const lv_16 const lv_16sc_t* _in_b = in_b; lv_16sc_t* _out = out; - __m256i a, b, c, c_sr, real, imag, imag1, imag2, b_sl, a_sl, result; - - const __m256i mask_imag = _mm256_set_epi8(0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0); - const __m256i mask_real = _mm256_set_epi8(0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF); - - for(;number < avx2_points; number++) - { - a = _mm256_load_si256((__m256i*)_in_a); // Load the ar + ai, br + bi as ar,ai,br,bi - b = _mm256_load_si256((__m256i*)_in_b); // Load the cr + ci, dr + di as cr,ci,dr,di - c = _mm256_mullo_epi16(a, b); - - c_sr = _mm256_srli_si256(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst. - real = _mm256_subs_epi16(c, c_sr); - real = _mm256_and_si256(real, mask_real); // a3.r*b3.r-a3.i*b3.i , 0, a3.r*b3.r- a3.i*b3.i - - b_sl = _mm256_slli_si256(b, 2); // b3.r, b2.i .... - a_sl = _mm256_slli_si256(a, 2); // a3.r, a2.i .... - - imag1 = _mm256_mullo_epi16(a, b_sl); // a3.i*b3.r, .... - imag2 = _mm256_mullo_epi16(b, a_sl); // b3.i*a3.r, .... - - imag = _mm256_adds_epi16(imag1, imag2); - imag = _mm256_and_si256(imag, mask_imag); // a3.i*b3.r+b3.i*a3.r, 0, ... 
- - result = _mm256_or_si256(real, imag); - - _mm256_store_si256((__m256i*)_out, result); - - _in_a += 8; - _in_b += 8; - _out += 8; - } + __m256i a, b, c, c_sr, real, imag, imag1, imag2, b_sl, a_sl, result; + + const __m256i mask_imag = _mm256_set_epi8(0xFF, + 0xFF, + 0, + 0, + 0xFF, + 0xFF, + 0, + 0, + 0xFF, + 0xFF, + 0, + 0, + 0xFF, + 0xFF, + 0, + 0, + 0xFF, + 0xFF, + 0, + 0, + 0xFF, + 0xFF, + 0, + 0, + 0xFF, + 0xFF, + 0, + 0, + 0xFF, + 0xFF, + 0, + 0); + const __m256i mask_real = _mm256_set_epi8(0, + 0, + 0xFF, + 0xFF, + 0, + 0, + 0xFF, + 0xFF, + 0, + 0, + 0xFF, + 0xFF, + 0, + 0, + 0xFF, + 0xFF, + 0, + 0, + 0xFF, + 0xFF, + 0, + 0, + 0xFF, + 0xFF, + 0, + 0, + 0xFF, + 0xFF, + 0, + 0, + 0xFF, + 0xFF); + + for (; number < avx2_points; number++) { + a = _mm256_load_si256( + (__m256i*)_in_a); // Load the ar + ai, br + bi as ar,ai,br,bi + b = _mm256_load_si256( + (__m256i*)_in_b); // Load the cr + ci, dr + di as cr,ci,dr,di + c = _mm256_mullo_epi16(a, b); + + c_sr = _mm256_srli_si256(c, 2); // Shift a right by imm8 bytes while shifting in + // zeros, and store the results in dst. + real = _mm256_subs_epi16(c, c_sr); + real = _mm256_and_si256( + real, mask_real); // a3.r*b3.r-a3.i*b3.i , 0, a3.r*b3.r- a3.i*b3.i + + b_sl = _mm256_slli_si256(b, 2); // b3.r, b2.i .... + a_sl = _mm256_slli_si256(a, 2); // a3.r, a2.i .... + + imag1 = _mm256_mullo_epi16(a, b_sl); // a3.i*b3.r, .... + imag2 = _mm256_mullo_epi16(b, a_sl); // b3.i*a3.r, .... + + imag = _mm256_adds_epi16(imag1, imag2); + imag = _mm256_and_si256(imag, mask_imag); // a3.i*b3.r+b3.i*a3.r, 0, ... + + result = _mm256_or_si256(real, imag); + + _mm256_store_si256((__m256i*)_out, result); + + _in_a += 8; + _in_b += 8; + _out += 8; + } _mm256_zeroupper(); number = avx2_points * 8; - for(;number < num_points; number++) - { - *_out++ = (*_in_a++) * (*_in_b++); - } + for (; number < num_points; number++) { + *_out++ = (*_in_a++) * (*_in_b++); + } } #endif /* LV_HAVE_AVX2 */ @@ -278,48 +429,49 @@ static inline void volk_16ic_x2_multiply_16ic_a_avx2(lv_16sc_t* out, const lv_16 #ifdef LV_HAVE_NEON #include -static inline void volk_16ic_x2_multiply_16ic_neon(lv_16sc_t* out, const lv_16sc_t* in_a, const lv_16sc_t* in_b, unsigned int num_points) +static inline void volk_16ic_x2_multiply_16ic_neon(lv_16sc_t* out, + const lv_16sc_t* in_a, + const lv_16sc_t* in_b, + unsigned int num_points) { - lv_16sc_t *a_ptr = (lv_16sc_t*) in_a; - lv_16sc_t *b_ptr = (lv_16sc_t*) in_b; + lv_16sc_t* a_ptr = (lv_16sc_t*)in_a; + lv_16sc_t* b_ptr = (lv_16sc_t*)in_b; unsigned int quarter_points = num_points / 4; int16x4x2_t a_val, b_val, c_val; int16x4x2_t tmp_real, tmp_imag; unsigned int number = 0; - for(number = 0; number < quarter_points; ++number) - { - a_val = vld2_s16((int16_t*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i - b_val = vld2_s16((int16_t*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i - __VOLK_PREFETCH(a_ptr + 4); - __VOLK_PREFETCH(b_ptr + 4); - - // multiply the real*real and imag*imag to get real result - // a0r*b0r|a1r*b1r|a2r*b2r|a3r*b3r - tmp_real.val[0] = vmul_s16(a_val.val[0], b_val.val[0]); - // a0i*b0i|a1i*b1i|a2i*b2i|a3i*b3i - tmp_real.val[1] = vmul_s16(a_val.val[1], b_val.val[1]); - - // Multiply cross terms to get the imaginary result - // a0r*b0i|a1r*b1i|a2r*b2i|a3r*b3i - tmp_imag.val[0] = vmul_s16(a_val.val[0], b_val.val[1]); - // a0i*b0r|a1i*b1r|a2i*b2r|a3i*b3r - tmp_imag.val[1] = vmul_s16(a_val.val[1], b_val.val[0]); - - // store the results - c_val.val[0] = vsub_s16(tmp_real.val[0], tmp_real.val[1]); - c_val.val[1] = 
vadd_s16(tmp_imag.val[0], tmp_imag.val[1]); - vst2_s16((int16_t*)out, c_val); - - a_ptr += 4; - b_ptr += 4; - out += 4; - } - - for(number = quarter_points * 4; number < num_points; number++) - { - *out++ = (*a_ptr++) * (*b_ptr++); - } + for (number = 0; number < quarter_points; ++number) { + a_val = vld2_s16((int16_t*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i + b_val = vld2_s16((int16_t*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i + __VOLK_PREFETCH(a_ptr + 4); + __VOLK_PREFETCH(b_ptr + 4); + + // multiply the real*real and imag*imag to get real result + // a0r*b0r|a1r*b1r|a2r*b2r|a3r*b3r + tmp_real.val[0] = vmul_s16(a_val.val[0], b_val.val[0]); + // a0i*b0i|a1i*b1i|a2i*b2i|a3i*b3i + tmp_real.val[1] = vmul_s16(a_val.val[1], b_val.val[1]); + + // Multiply cross terms to get the imaginary result + // a0r*b0i|a1r*b1i|a2r*b2i|a3r*b3i + tmp_imag.val[0] = vmul_s16(a_val.val[0], b_val.val[1]); + // a0i*b0r|a1i*b1r|a2i*b2r|a3i*b3r + tmp_imag.val[1] = vmul_s16(a_val.val[1], b_val.val[0]); + + // store the results + c_val.val[0] = vsub_s16(tmp_real.val[0], tmp_real.val[1]); + c_val.val[1] = vadd_s16(tmp_imag.val[0], tmp_imag.val[1]); + vst2_s16((int16_t*)out, c_val); + + a_ptr += 4; + b_ptr += 4; + out += 4; + } + + for (number = quarter_points * 4; number < num_points; number++) { + *out++ = (*a_ptr++) * (*b_ptr++); + } } #endif /* LV_HAVE_NEON */ diff --git a/kernels/volk/volk_16u_byteswap.h b/kernels/volk/volk_16u_byteswap.h index eaa972f..221dcdb 100644 --- a/kernels/volk/volk_16u_byteswap.h +++ b/kernels/volk/volk_16u_byteswap.h @@ -58,74 +58,80 @@ #if LV_HAVE_AVX2 #include -static inline void volk_16u_byteswap_a_avx2(uint16_t* intsToSwap, unsigned int num_points){ - unsigned int number; +static inline void volk_16u_byteswap_a_avx2(uint16_t* intsToSwap, unsigned int num_points) +{ + unsigned int number; - const unsigned int nPerSet = 16; - const uint64_t nSets = num_points / nPerSet; + const unsigned int nPerSet = 16; + const uint64_t nSets = num_points / nPerSet; - uint16_t* inputPtr = (uint16_t*) intsToSwap; + uint16_t* inputPtr = (uint16_t*)intsToSwap; - const uint8_t shuffleVector[32] = { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 17, 16, 19, 18, 21, 20, 23, 22, 25, 24, 27, 26, 29, 28, 31, 30}; + const uint8_t shuffleVector[32] = { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, + 10, 13, 12, 15, 14, 17, 16, 19, 18, 21, 20, + 23, 22, 25, 24, 27, 26, 29, 28, 31, 30 }; - const __m256i myShuffle = _mm256_loadu_si256((__m256i*) &shuffleVector[0]); + const __m256i myShuffle = _mm256_loadu_si256((__m256i*)&shuffleVector[0]); - for(number = 0; number < nSets; number++) { - // Load the 32t values, increment inputPtr later since we're doing it in-place. - const __m256i input = _mm256_load_si256((__m256i*)inputPtr); - const __m256i output = _mm256_shuffle_epi8(input, myShuffle); + for (number = 0; number < nSets; number++) { + // Load the 32t values, increment inputPtr later since we're doing it in-place. 
+ const __m256i input = _mm256_load_si256((__m256i*)inputPtr); + const __m256i output = _mm256_shuffle_epi8(input, myShuffle); - // Store the results - _mm256_store_si256((__m256i*)inputPtr, output); - inputPtr += nPerSet; - } + // Store the results + _mm256_store_si256((__m256i*)inputPtr, output); + inputPtr += nPerSet; + } - _mm256_zeroupper(); + _mm256_zeroupper(); - // Byteswap any remaining points: - for(number = nPerSet * nSets; number < num_points; number++) { - uint16_t outputVal = *inputPtr; - outputVal = (((outputVal >> 8) & 0xff) | ((outputVal << 8) & 0xff00)); - *inputPtr = outputVal; - inputPtr++; - } + // Byteswap any remaining points: + for (number = nPerSet * nSets; number < num_points; number++) { + uint16_t outputVal = *inputPtr; + outputVal = (((outputVal >> 8) & 0xff) | ((outputVal << 8) & 0xff00)); + *inputPtr = outputVal; + inputPtr++; + } } #endif /* LV_HAVE_AVX2 */ #if LV_HAVE_AVX2 #include -static inline void volk_16u_byteswap_u_avx2(uint16_t* intsToSwap, unsigned int num_points){ - unsigned int number; +static inline void volk_16u_byteswap_u_avx2(uint16_t* intsToSwap, unsigned int num_points) +{ + unsigned int number; - const unsigned int nPerSet = 16; - const uint64_t nSets = num_points / nPerSet; + const unsigned int nPerSet = 16; + const uint64_t nSets = num_points / nPerSet; - uint16_t* inputPtr = (uint16_t*) intsToSwap; + uint16_t* inputPtr = (uint16_t*)intsToSwap; - const uint8_t shuffleVector[32] = { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 17, 16, 19, 18, 21, 20, 23, 22, 25, 24, 27, 26, 29, 28, 31, 30}; + const uint8_t shuffleVector[32] = { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, + 10, 13, 12, 15, 14, 17, 16, 19, 18, 21, 20, + 23, 22, 25, 24, 27, 26, 29, 28, 31, 30 }; - const __m256i myShuffle = _mm256_loadu_si256((__m256i*) &shuffleVector[0]); + const __m256i myShuffle = _mm256_loadu_si256((__m256i*)&shuffleVector[0]); - for (number = 0; number < nSets; number++) { - // Load the 32t values, increment inputPtr later since we're doing it in-place. - const __m256i input = _mm256_loadu_si256((__m256i*)inputPtr); - const __m256i output = _mm256_shuffle_epi8(input,myShuffle); + for (number = 0; number < nSets; number++) { + // Load the 32t values, increment inputPtr later since we're doing it in-place. 
+ const __m256i input = _mm256_loadu_si256((__m256i*)inputPtr); + const __m256i output = _mm256_shuffle_epi8(input, myShuffle); - // Store the results - _mm256_storeu_si256((__m256i*)inputPtr, output); - inputPtr += nPerSet; - } + // Store the results + _mm256_storeu_si256((__m256i*)inputPtr, output); + inputPtr += nPerSet; + } - _mm256_zeroupper(); + _mm256_zeroupper(); - // Byteswap any remaining points: - for(number = nPerSet * nSets; number < num_points; number++) { - uint16_t outputVal = *inputPtr; - outputVal = (((outputVal >> 8) & 0xff) | ((outputVal << 8) & 0xff00)); - *inputPtr = outputVal; - inputPtr++; - } + // Byteswap any remaining points: + for (number = nPerSet * nSets; number < num_points; number++) { + uint16_t outputVal = *inputPtr; + outputVal = (((outputVal >> 8) & 0xff) | ((outputVal << 8) & 0xff00)); + *inputPtr = outputVal; + inputPtr++; + } } #endif /* LV_HAVE_AVX2 */ @@ -133,47 +139,50 @@ static inline void volk_16u_byteswap_u_avx2(uint16_t* intsToSwap, unsigned int n #ifdef LV_HAVE_SSE2 #include -static inline void volk_16u_byteswap_u_sse2(uint16_t* intsToSwap, unsigned int num_points){ - unsigned int number = 0; - uint16_t* inputPtr = intsToSwap; - __m128i input, left, right, output; - - const unsigned int eighthPoints = num_points / 8; - for(;number < eighthPoints; number++){ - // Load the 16t values, increment inputPtr later since we're doing it in-place. - input = _mm_loadu_si128((__m128i*)inputPtr); - // Do the two shifts - left = _mm_slli_epi16(input, 8); - right = _mm_srli_epi16(input, 8); - // Or the left and right halves together - output = _mm_or_si128(left, right); - // Store the results - _mm_storeu_si128((__m128i*)inputPtr, output); - inputPtr += 8; - } - - // Byteswap any remaining points: - number = eighthPoints*8; - for(; number < num_points; number++){ - uint16_t outputVal = *inputPtr; - outputVal = (((outputVal >> 8) & 0xff) | ((outputVal << 8) & 0xff00)); - *inputPtr = outputVal; - inputPtr++; - } +static inline void volk_16u_byteswap_u_sse2(uint16_t* intsToSwap, unsigned int num_points) +{ + unsigned int number = 0; + uint16_t* inputPtr = intsToSwap; + __m128i input, left, right, output; + + const unsigned int eighthPoints = num_points / 8; + for (; number < eighthPoints; number++) { + // Load the 16t values, increment inputPtr later since we're doing it in-place. 
+ input = _mm_loadu_si128((__m128i*)inputPtr); + // Do the two shifts + left = _mm_slli_epi16(input, 8); + right = _mm_srli_epi16(input, 8); + // Or the left and right halves together + output = _mm_or_si128(left, right); + // Store the results + _mm_storeu_si128((__m128i*)inputPtr, output); + inputPtr += 8; + } + + // Byteswap any remaining points: + number = eighthPoints * 8; + for (; number < num_points; number++) { + uint16_t outputVal = *inputPtr; + outputVal = (((outputVal >> 8) & 0xff) | ((outputVal << 8) & 0xff00)); + *inputPtr = outputVal; + inputPtr++; + } } #endif /* LV_HAVE_SSE2 */ #ifdef LV_HAVE_GENERIC -static inline void volk_16u_byteswap_generic(uint16_t* intsToSwap, unsigned int num_points){ - unsigned int point; - uint16_t* inputPtr = intsToSwap; - for(point = 0; point < num_points; point++){ - uint16_t output = *inputPtr; - output = (((output >> 8) & 0xff) | ((output << 8) & 0xff00)); - *inputPtr = output; - inputPtr++; - } +static inline void volk_16u_byteswap_generic(uint16_t* intsToSwap, + unsigned int num_points) +{ + unsigned int point; + uint16_t* inputPtr = intsToSwap; + for (point = 0; point < num_points; point++) { + uint16_t output = *inputPtr; + output = (((output >> 8) & 0xff) | ((output << 8) & 0xff00)); + *inputPtr = output; + inputPtr++; + } } #endif /* LV_HAVE_GENERIC */ @@ -187,129 +196,136 @@ static inline void volk_16u_byteswap_generic(uint16_t* intsToSwap, unsigned int #ifdef LV_HAVE_SSE2 #include -static inline void volk_16u_byteswap_a_sse2(uint16_t* intsToSwap, unsigned int num_points){ - unsigned int number = 0; - uint16_t* inputPtr = intsToSwap; - __m128i input, left, right, output; - - const unsigned int eighthPoints = num_points / 8; - for(;number < eighthPoints; number++){ - // Load the 16t values, increment inputPtr later since we're doing it in-place. - input = _mm_load_si128((__m128i*)inputPtr); - // Do the two shifts - left = _mm_slli_epi16(input, 8); - right = _mm_srli_epi16(input, 8); - // Or the left and right halves together - output = _mm_or_si128(left, right); - // Store the results - _mm_store_si128((__m128i*)inputPtr, output); - inputPtr += 8; - } - - - // Byteswap any remaining points: - number = eighthPoints*8; - for(; number < num_points; number++){ - uint16_t outputVal = *inputPtr; - outputVal = (((outputVal >> 8) & 0xff) | ((outputVal << 8) & 0xff00)); - *inputPtr = outputVal; - inputPtr++; - } +static inline void volk_16u_byteswap_a_sse2(uint16_t* intsToSwap, unsigned int num_points) +{ + unsigned int number = 0; + uint16_t* inputPtr = intsToSwap; + __m128i input, left, right, output; + + const unsigned int eighthPoints = num_points / 8; + for (; number < eighthPoints; number++) { + // Load the 16t values, increment inputPtr later since we're doing it in-place. 
+ input = _mm_load_si128((__m128i*)inputPtr); + // Do the two shifts + left = _mm_slli_epi16(input, 8); + right = _mm_srli_epi16(input, 8); + // Or the left and right halves together + output = _mm_or_si128(left, right); + // Store the results + _mm_store_si128((__m128i*)inputPtr, output); + inputPtr += 8; + } + + + // Byteswap any remaining points: + number = eighthPoints * 8; + for (; number < num_points; number++) { + uint16_t outputVal = *inputPtr; + outputVal = (((outputVal >> 8) & 0xff) | ((outputVal << 8) & 0xff00)); + *inputPtr = outputVal; + inputPtr++; + } } #endif /* LV_HAVE_SSE2 */ #ifdef LV_HAVE_NEON #include -static inline void volk_16u_byteswap_neon(uint16_t* intsToSwap, unsigned int num_points){ - unsigned int number; - unsigned int eighth_points = num_points / 8; - uint16x8_t input, output; - uint16_t* inputPtr = intsToSwap; - - for(number = 0; number < eighth_points; number++) { - input = vld1q_u16(inputPtr); - output = vsriq_n_u16(output, input, 8); - output = vsliq_n_u16(output, input, 8); - vst1q_u16(inputPtr, output); - inputPtr += 8; - } - - for(number = eighth_points * 8; number < num_points; number++){ - uint16_t output = *inputPtr; - output = (((output >> 8) & 0xff) | ((output << 8) & 0xff00)); - *inputPtr = output; - inputPtr++; - } +static inline void volk_16u_byteswap_neon(uint16_t* intsToSwap, unsigned int num_points) +{ + unsigned int number; + unsigned int eighth_points = num_points / 8; + uint16x8_t input, output; + uint16_t* inputPtr = intsToSwap; + + for (number = 0; number < eighth_points; number++) { + input = vld1q_u16(inputPtr); + output = vsriq_n_u16(output, input, 8); + output = vsliq_n_u16(output, input, 8); + vst1q_u16(inputPtr, output); + inputPtr += 8; + } + + for (number = eighth_points * 8; number < num_points; number++) { + uint16_t output = *inputPtr; + output = (((output >> 8) & 0xff) | ((output << 8) & 0xff00)); + *inputPtr = output; + inputPtr++; + } } #endif /* LV_HAVE_NEON */ #ifdef LV_HAVE_NEON #include -static inline void volk_16u_byteswap_neon_table(uint16_t* intsToSwap, unsigned int num_points){ - uint16_t* inputPtr = intsToSwap; - unsigned int number = 0; - unsigned int n16points = num_points / 16; - - uint8x8x4_t input_table; - uint8x8_t int_lookup01, int_lookup23, int_lookup45, int_lookup67; - uint8x8_t swapped_int01, swapped_int23, swapped_int45, swapped_int67; - - /* these magic numbers are used as byte-indices in the LUT. - they are pre-computed to save time. A simple C program - can calculate them; for example for lookup01: - uint8_t chars[8] = {24, 16, 8, 0, 25, 17, 9, 1}; - for(ii=0; ii < 8; ++ii) { - index += ((uint64_t)(*(chars+ii))) << (ii*8); +static inline void volk_16u_byteswap_neon_table(uint16_t* intsToSwap, + unsigned int num_points) +{ + uint16_t* inputPtr = intsToSwap; + unsigned int number = 0; + unsigned int n16points = num_points / 16; + + uint8x8x4_t input_table; + uint8x8_t int_lookup01, int_lookup23, int_lookup45, int_lookup67; + uint8x8_t swapped_int01, swapped_int23, swapped_int45, swapped_int67; + + /* these magic numbers are used as byte-indices in the LUT. + they are pre-computed to save time. 
A simple C program + can calculate them; for example for lookup01: + uint8_t chars[8] = {24, 16, 8, 0, 25, 17, 9, 1}; + for(ii=0; ii < 8; ++ii) { + index += ((uint64_t)(*(chars+ii))) << (ii*8); + } + */ + int_lookup01 = vcreate_u8(1232017111498883080); + int_lookup23 = vcreate_u8(1376697457175036426); + int_lookup45 = vcreate_u8(1521377802851189772); + int_lookup67 = vcreate_u8(1666058148527343118); + + for (number = 0; number < n16points; ++number) { + input_table = vld4_u8((uint8_t*)inputPtr); + swapped_int01 = vtbl4_u8(input_table, int_lookup01); + swapped_int23 = vtbl4_u8(input_table, int_lookup23); + swapped_int45 = vtbl4_u8(input_table, int_lookup45); + swapped_int67 = vtbl4_u8(input_table, int_lookup67); + vst1_u8((uint8_t*)inputPtr, swapped_int01); + vst1_u8((uint8_t*)(inputPtr + 4), swapped_int23); + vst1_u8((uint8_t*)(inputPtr + 8), swapped_int45); + vst1_u8((uint8_t*)(inputPtr + 12), swapped_int67); + + inputPtr += 16; + } + + for (number = n16points * 16; number < num_points; ++number) { + uint16_t output = *inputPtr; + output = (((output >> 8) & 0xff) | ((output << 8) & 0xff00)); + *inputPtr = output; + inputPtr++; } - */ - int_lookup01 = vcreate_u8(1232017111498883080); - int_lookup23 = vcreate_u8(1376697457175036426); - int_lookup45 = vcreate_u8(1521377802851189772); - int_lookup67 = vcreate_u8(1666058148527343118); - - for(number = 0; number < n16points; ++number){ - input_table = vld4_u8((uint8_t*) inputPtr); - swapped_int01 = vtbl4_u8(input_table, int_lookup01); - swapped_int23 = vtbl4_u8(input_table, int_lookup23); - swapped_int45 = vtbl4_u8(input_table, int_lookup45); - swapped_int67 = vtbl4_u8(input_table, int_lookup67); - vst1_u8((uint8_t*)inputPtr, swapped_int01); - vst1_u8((uint8_t*)(inputPtr+4), swapped_int23); - vst1_u8((uint8_t*)(inputPtr+8), swapped_int45); - vst1_u8((uint8_t*)(inputPtr+12), swapped_int67); - - inputPtr += 16; - } - - for(number = n16points * 16; number < num_points; ++number){ - uint16_t output = *inputPtr; - output = (((output >> 8) & 0xff) | ((output << 8) & 0xff00)); - *inputPtr = output; - inputPtr++; - } } #endif /* LV_HAVE_NEON */ #ifdef LV_HAVE_GENERIC -static inline void volk_16u_byteswap_a_generic(uint16_t* intsToSwap, unsigned int num_points){ - unsigned int point; - uint16_t* inputPtr = intsToSwap; - for(point = 0; point < num_points; point++){ - uint16_t output = *inputPtr; - output = (((output >> 8) & 0xff) | ((output << 8) & 0xff00)); - *inputPtr = output; - inputPtr++; - } +static inline void volk_16u_byteswap_a_generic(uint16_t* intsToSwap, + unsigned int num_points) +{ + unsigned int point; + uint16_t* inputPtr = intsToSwap; + for (point = 0; point < num_points; point++) { + uint16_t output = *inputPtr; + output = (((output >> 8) & 0xff) | ((output << 8) & 0xff00)); + *inputPtr = output; + inputPtr++; + } } #endif /* LV_HAVE_GENERIC */ #ifdef LV_HAVE_ORC extern void volk_16u_byteswap_a_orc_impl(uint16_t* intsToSwap, unsigned int num_points); -static inline void volk_16u_byteswap_u_orc(uint16_t* intsToSwap, unsigned int num_points){ +static inline void volk_16u_byteswap_u_orc(uint16_t* intsToSwap, unsigned int num_points) +{ volk_16u_byteswap_a_orc_impl(intsToSwap, num_points); } #endif /* LV_HAVE_ORC */ diff --git a/kernels/volk/volk_16u_byteswappuppet_16u.h b/kernels/volk/volk_16u_byteswappuppet_16u.h index d3c8c5d..8cb1318 100644 --- a/kernels/volk/volk_16u_byteswappuppet_16u.h +++ b/kernels/volk/volk_16u_byteswappuppet_16u.h @@ -3,69 +3,83 @@ #include -#include #include +#include #ifdef LV_HAVE_GENERIC -static inline void 
volk_16u_byteswappuppet_16u_generic(uint16_t*output, uint16_t* intsToSwap, unsigned int num_points){ +static inline void volk_16u_byteswappuppet_16u_generic(uint16_t* output, + uint16_t* intsToSwap, + unsigned int num_points) +{ volk_16u_byteswap_generic((uint16_t*)intsToSwap, num_points); memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint16_t)); - } #endif #ifdef LV_HAVE_NEON -static inline void volk_16u_byteswappuppet_16u_neon(uint16_t*output, uint16_t* intsToSwap, unsigned int num_points){ +static inline void volk_16u_byteswappuppet_16u_neon(uint16_t* output, + uint16_t* intsToSwap, + unsigned int num_points) +{ volk_16u_byteswap_neon((uint16_t*)intsToSwap, num_points); memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint16_t)); - } #endif #ifdef LV_HAVE_NEON -static inline void volk_16u_byteswappuppet_16u_neon_table(uint16_t*output, uint16_t* intsToSwap, unsigned int num_points){ +static inline void volk_16u_byteswappuppet_16u_neon_table(uint16_t* output, + uint16_t* intsToSwap, + unsigned int num_points) +{ volk_16u_byteswap_neon_table((uint16_t*)intsToSwap, num_points); memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint16_t)); - } #endif #ifdef LV_HAVE_SSE2 -static inline void volk_16u_byteswappuppet_16u_u_sse2(uint16_t *output, uint16_t* intsToSwap, unsigned int num_points){ +static inline void volk_16u_byteswappuppet_16u_u_sse2(uint16_t* output, + uint16_t* intsToSwap, + unsigned int num_points) +{ volk_16u_byteswap_u_sse2((uint16_t*)intsToSwap, num_points); memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint16_t)); - } #endif #ifdef LV_HAVE_SSE2 -static inline void volk_16u_byteswappuppet_16u_a_sse2(uint16_t *output, uint16_t* intsToSwap, unsigned int num_points){ +static inline void volk_16u_byteswappuppet_16u_a_sse2(uint16_t* output, + uint16_t* intsToSwap, + unsigned int num_points) +{ volk_16u_byteswap_a_sse2((uint16_t*)intsToSwap, num_points); memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint16_t)); - } #endif #ifdef LV_HAVE_AVX2 -static inline void volk_16u_byteswappuppet_16u_u_avx2(uint16_t *output, uint16_t* intsToSwap, unsigned int num_points){ +static inline void volk_16u_byteswappuppet_16u_u_avx2(uint16_t* output, + uint16_t* intsToSwap, + unsigned int num_points) +{ volk_16u_byteswap_u_avx2((uint16_t*)intsToSwap, num_points); memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint16_t)); - } #endif #ifdef LV_HAVE_AVX2 -static inline void volk_16u_byteswappuppet_16u_a_avx2(uint16_t *output, uint16_t* intsToSwap, unsigned int num_points){ +static inline void volk_16u_byteswappuppet_16u_a_avx2(uint16_t* output, + uint16_t* intsToSwap, + unsigned int num_points) +{ volk_16u_byteswap_a_avx2((uint16_t*)intsToSwap, num_points); memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint16_t)); - } #endif diff --git a/kernels/volk/volk_32f_64f_add_64f.h b/kernels/volk/volk_32f_64f_add_64f.h index 770c27e..d00ada5 100644 --- a/kernels/volk/volk_32f_64f_add_64f.h +++ b/kernels/volk/volk_32f_64f_add_64f.h @@ -77,18 +77,19 @@ #ifdef LV_HAVE_GENERIC -static inline void volk_32f_64f_add_64f_generic(double *cVector, - const float *aVector, - const double *bVector, - unsigned int num_points) { - double *cPtr = cVector; - const float *aPtr = aVector; - const double *bPtr = bVector; - unsigned int number = 0; - - for (number = 0; number < num_points; number++) { - *cPtr++ = ((double)(*aPtr++)) + (*bPtr++); - } +static inline void volk_32f_64f_add_64f_generic(double* cVector, + const float* aVector, 
+ const double* bVector, + unsigned int num_points) +{ + double* cPtr = cVector; + const float* aPtr = aVector; + const double* bPtr = bVector; + unsigned int number = 0; + + for (number = 0; number < num_points; number++) { + *cPtr++ = ((double)(*aPtr++)) + (*bPtr++); + } } #endif /* LV_HAVE_GENERIC */ @@ -96,42 +97,43 @@ static inline void volk_32f_64f_add_64f_generic(double *cVector, #ifdef LV_HAVE_NEONV8 #include -static inline void volk_32f_64f_add_64f_neon(double *cVector, - const float *aVector, - const double *bVector, - unsigned int num_points) { - unsigned int number = 0; - const unsigned int half_points = num_points / 2; - - double *cPtr = cVector; - const float *aPtr = aVector; - const double *bPtr = bVector; - - float64x2_t aVal, bVal, cVal; - float32x2_t aVal1; - for (number = 0; number < half_points; number++) { - // Load in to NEON registers - aVal1 = vld1_f32(aPtr); - bVal = vld1q_f64(bPtr); - __VOLK_PREFETCH(aPtr + 2); - __VOLK_PREFETCH(bPtr + 2); - aPtr += 2; // q uses quadwords, 4 floats per vadd - bPtr += 2; - - // Vector conversion - aVal = vcvt_f64_f32(aVal1); - // vector add - cVal = vaddq_f64(aVal, bVal); - // Store the results back into the C container - vst1q_f64(cPtr, cVal); - - cPtr += 2; - } - - number = half_points * 2; // should be = num_points - for (; number < num_points; number++) { - *cPtr++ = ((double)(*aPtr++)) + (*bPtr++); - } +static inline void volk_32f_64f_add_64f_neon(double* cVector, + const float* aVector, + const double* bVector, + unsigned int num_points) +{ + unsigned int number = 0; + const unsigned int half_points = num_points / 2; + + double* cPtr = cVector; + const float* aPtr = aVector; + const double* bPtr = bVector; + + float64x2_t aVal, bVal, cVal; + float32x2_t aVal1; + for (number = 0; number < half_points; number++) { + // Load in to NEON registers + aVal1 = vld1_f32(aPtr); + bVal = vld1q_f64(bPtr); + __VOLK_PREFETCH(aPtr + 2); + __VOLK_PREFETCH(bPtr + 2); + aPtr += 2; // q uses quadwords, 4 floats per vadd + bPtr += 2; + + // Vector conversion + aVal = vcvt_f64_f32(aVal1); + // vector add + cVal = vaddq_f64(aVal, bVal); + // Store the results back into the C container + vst1q_f64(cPtr, cVal); + + cPtr += 2; + } + + number = half_points * 2; // should be = num_points + for (; number < num_points; number++) { + *cPtr++ = ((double)(*aPtr++)) + (*bPtr++); + } } #endif /* LV_HAVE_NEONV8 */ @@ -141,49 +143,50 @@ static inline void volk_32f_64f_add_64f_neon(double *cVector, #include #include -static inline void volk_32f_64f_add_64f_u_avx(double *cVector, - const float *aVector, - const double *bVector, - unsigned int num_points) { - unsigned int number = 0; - const unsigned int eighth_points = num_points / 8; - - double *cPtr = cVector; - const float *aPtr = aVector; - const double *bPtr = bVector; - - __m256 aVal; - __m128 aVal1, aVal2; - __m256d aDbl1, aDbl2, bVal1, bVal2, cVal1, cVal2; - for (; number < eighth_points; number++) { - - aVal = _mm256_loadu_ps(aPtr); - bVal1 = _mm256_loadu_pd(bPtr); - bVal2 = _mm256_loadu_pd(bPtr + 4); - - aVal1 = _mm256_extractf128_ps(aVal, 0); - aVal2 = _mm256_extractf128_ps(aVal, 1); - - aDbl1 = _mm256_cvtps_pd(aVal1); - aDbl2 = _mm256_cvtps_pd(aVal2); - - cVal1 = _mm256_add_pd(aDbl1, bVal1); - cVal2 = _mm256_add_pd(aDbl2, bVal2); - - _mm256_storeu_pd(cPtr, - cVal1); // Store the results back into the C container - _mm256_storeu_pd(cPtr + 4, - cVal2); // Store the results back into the C container - - aPtr += 8; - bPtr += 8; - cPtr += 8; - } - - number = eighth_points * 8; - for (; number < num_points; 
number++) { - *cPtr++ = ((double)(*aPtr++)) + (*bPtr++); - } +static inline void volk_32f_64f_add_64f_u_avx(double* cVector, + const float* aVector, + const double* bVector, + unsigned int num_points) +{ + unsigned int number = 0; + const unsigned int eighth_points = num_points / 8; + + double* cPtr = cVector; + const float* aPtr = aVector; + const double* bPtr = bVector; + + __m256 aVal; + __m128 aVal1, aVal2; + __m256d aDbl1, aDbl2, bVal1, bVal2, cVal1, cVal2; + for (; number < eighth_points; number++) { + + aVal = _mm256_loadu_ps(aPtr); + bVal1 = _mm256_loadu_pd(bPtr); + bVal2 = _mm256_loadu_pd(bPtr + 4); + + aVal1 = _mm256_extractf128_ps(aVal, 0); + aVal2 = _mm256_extractf128_ps(aVal, 1); + + aDbl1 = _mm256_cvtps_pd(aVal1); + aDbl2 = _mm256_cvtps_pd(aVal2); + + cVal1 = _mm256_add_pd(aDbl1, bVal1); + cVal2 = _mm256_add_pd(aDbl2, bVal2); + + _mm256_storeu_pd(cPtr, + cVal1); // Store the results back into the C container + _mm256_storeu_pd(cPtr + 4, + cVal2); // Store the results back into the C container + + aPtr += 8; + bPtr += 8; + cPtr += 8; + } + + number = eighth_points * 8; + for (; number < num_points; number++) { + *cPtr++ = ((double)(*aPtr++)) + (*bPtr++); + } } #endif /* LV_HAVE_AVX */ @@ -193,48 +196,49 @@ static inline void volk_32f_64f_add_64f_u_avx(double *cVector, #include #include -static inline void volk_32f_64f_add_64f_a_avx(double *cVector, - const float *aVector, - const double *bVector, - unsigned int num_points) { - unsigned int number = 0; - const unsigned int eighth_points = num_points / 8; - - double *cPtr = cVector; - const float *aPtr = aVector; - const double *bPtr = bVector; - - __m256 aVal; - __m128 aVal1, aVal2; - __m256d aDbl1, aDbl2, bVal1, bVal2, cVal1, cVal2; - for (; number < eighth_points; number++) { - - aVal = _mm256_load_ps(aPtr); - bVal1 = _mm256_load_pd(bPtr); - bVal2 = _mm256_load_pd(bPtr + 4); - - aVal1 = _mm256_extractf128_ps(aVal, 0); - aVal2 = _mm256_extractf128_ps(aVal, 1); - - aDbl1 = _mm256_cvtps_pd(aVal1); - aDbl2 = _mm256_cvtps_pd(aVal2); - - cVal1 = _mm256_add_pd(aDbl1, bVal1); - cVal2 = _mm256_add_pd(aDbl2, bVal2); - - _mm256_store_pd(cPtr, cVal1); // Store the results back into the C container - _mm256_store_pd(cPtr + 4, - cVal2); // Store the results back into the C container - - aPtr += 8; - bPtr += 8; - cPtr += 8; - } - - number = eighth_points * 8; - for (; number < num_points; number++) { - *cPtr++ = ((double)(*aPtr++)) + (*bPtr++); - } +static inline void volk_32f_64f_add_64f_a_avx(double* cVector, + const float* aVector, + const double* bVector, + unsigned int num_points) +{ + unsigned int number = 0; + const unsigned int eighth_points = num_points / 8; + + double* cPtr = cVector; + const float* aPtr = aVector; + const double* bPtr = bVector; + + __m256 aVal; + __m128 aVal1, aVal2; + __m256d aDbl1, aDbl2, bVal1, bVal2, cVal1, cVal2; + for (; number < eighth_points; number++) { + + aVal = _mm256_load_ps(aPtr); + bVal1 = _mm256_load_pd(bPtr); + bVal2 = _mm256_load_pd(bPtr + 4); + + aVal1 = _mm256_extractf128_ps(aVal, 0); + aVal2 = _mm256_extractf128_ps(aVal, 1); + + aDbl1 = _mm256_cvtps_pd(aVal1); + aDbl2 = _mm256_cvtps_pd(aVal2); + + cVal1 = _mm256_add_pd(aDbl1, bVal1); + cVal2 = _mm256_add_pd(aDbl2, bVal2); + + _mm256_store_pd(cPtr, cVal1); // Store the results back into the C container + _mm256_store_pd(cPtr + 4, + cVal2); // Store the results back into the C container + + aPtr += 8; + bPtr += 8; + cPtr += 8; + } + + number = eighth_points * 8; + for (; number < num_points; number++) { + *cPtr++ = ((double)(*aPtr++)) + 
(*bPtr++); + } } #endif /* LV_HAVE_AVX */ diff --git a/kernels/volk/volk_32f_64f_multiply_64f.h b/kernels/volk/volk_32f_64f_multiply_64f.h index 50f08a1..1039850 100644 --- a/kernels/volk/volk_32f_64f_multiply_64f.h +++ b/kernels/volk/volk_32f_64f_multiply_64f.h @@ -31,8 +31,8 @@ * * Dispatcher Prototype * \code - * void volk_32f_64f_multiply_64f(double* cVector, const double* aVector, const double* bVector, unsigned int num_points) - * \endcode + * void volk_32f_64f_multiply_64f(double* cVector, const double* aVector, const double* + * bVector, unsigned int num_points) \endcode * * \b Inputs * \li aVector: First input vector. @@ -76,18 +76,19 @@ #ifdef LV_HAVE_GENERIC -static inline void -volk_32f_64f_multiply_64f_generic(double *cVector, const float *aVector, - const double *bVector, unsigned int num_points) +static inline void volk_32f_64f_multiply_64f_generic(double* cVector, + const float* aVector, + const double* bVector, + unsigned int num_points) { - double *cPtr = cVector; - const float *aPtr = aVector; - const double *bPtr = bVector; - unsigned int number = 0; - - for (number = 0; number < num_points; number++) { - *cPtr++ = ((double)(*aPtr++)) * (*bPtr++); - } + double* cPtr = cVector; + const float* aPtr = aVector; + const double* bPtr = bVector; + unsigned int number = 0; + + for (number = 0; number < num_points; number++) { + *cPtr++ = ((double)(*aPtr++)) * (*bPtr++); + } } #endif /* LV_HAVE_GENERIC */ @@ -102,47 +103,48 @@ volk_32f_64f_multiply_64f_generic(double *cVector, const float *aVector, #include #include -static inline void -volk_32f_64f_multiply_64f_u_avx(double *cVector, const float *aVector, - const double *bVector, unsigned int num_points) +static inline void volk_32f_64f_multiply_64f_u_avx(double* cVector, + const float* aVector, + const double* bVector, + unsigned int num_points) { - unsigned int number = 0; - const unsigned int eighth_points = num_points / 8; + unsigned int number = 0; + const unsigned int eighth_points = num_points / 8; - double *cPtr = cVector; - const float *aPtr = aVector; - const double *bPtr = bVector; + double* cPtr = cVector; + const float* aPtr = aVector; + const double* bPtr = bVector; - __m256 aVal; - __m128 aVal1, aVal2; - __m256d aDbl1, aDbl2, bVal1, bVal2, cVal1, cVal2; - for (; number < eighth_points; number++) { + __m256 aVal; + __m128 aVal1, aVal2; + __m256d aDbl1, aDbl2, bVal1, bVal2, cVal1, cVal2; + for (; number < eighth_points; number++) { - aVal = _mm256_loadu_ps(aPtr); - bVal1 = _mm256_loadu_pd(bPtr); - bVal2 = _mm256_loadu_pd(bPtr+4); + aVal = _mm256_loadu_ps(aPtr); + bVal1 = _mm256_loadu_pd(bPtr); + bVal2 = _mm256_loadu_pd(bPtr + 4); - aVal1 = _mm256_extractf128_ps(aVal, 0); - aVal2 = _mm256_extractf128_ps(aVal, 1); + aVal1 = _mm256_extractf128_ps(aVal, 0); + aVal2 = _mm256_extractf128_ps(aVal, 1); - aDbl1 = _mm256_cvtps_pd(aVal1); - aDbl2 = _mm256_cvtps_pd(aVal2); + aDbl1 = _mm256_cvtps_pd(aVal1); + aDbl2 = _mm256_cvtps_pd(aVal2); - cVal1 = _mm256_mul_pd(aDbl1, bVal1); - cVal2 = _mm256_mul_pd(aDbl2, bVal2); + cVal1 = _mm256_mul_pd(aDbl1, bVal1); + cVal2 = _mm256_mul_pd(aDbl2, bVal2); - _mm256_storeu_pd(cPtr, cVal1); // Store the results back into the C container - _mm256_storeu_pd(cPtr+4, cVal2); // Store the results back into the C container + _mm256_storeu_pd(cPtr, cVal1); // Store the results back into the C container + _mm256_storeu_pd(cPtr + 4, cVal2); // Store the results back into the C container - aPtr += 8; - bPtr += 8; - cPtr += 8; - } + aPtr += 8; + bPtr += 8; + cPtr += 8; + } - number = eighth_points * 8; 
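/* For reference, a minimal usage sketch of this kernel through its dispatcher.
 * It follows the argument order of the kernels in this header (double* output,
 * const float* input, const double* input, unsigned int num_points) and the
 * volk_malloc()/volk_get_alignment()/volk_free() helpers; the buffer names and
 * sizes are illustrative only, not part of this patch:
 *
 *   #include <volk/volk.h>
 *
 *   unsigned int n = 100, i;
 *   float* a = (float*)volk_malloc(sizeof(float) * n, volk_get_alignment());
 *   double* b = (double*)volk_malloc(sizeof(double) * n, volk_get_alignment());
 *   double* c = (double*)volk_malloc(sizeof(double) * n, volk_get_alignment());
 *   for (i = 0; i < n; i++) {
 *       a[i] = (float)i;
 *       b[i] = 0.5;
 *   }
 *   volk_32f_64f_multiply_64f(c, a, b, n);  // c[i] = (double)a[i] * b[i]
 *   volk_free(a);
 *   volk_free(b);
 *   volk_free(c);
 */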
- for (; number < num_points; number++) { - *cPtr++ = ((double)(*aPtr++)) * (*bPtr++); - } + number = eighth_points * 8; + for (; number < num_points; number++) { + *cPtr++ = ((double)(*aPtr++)) * (*bPtr++); + } } #endif /* LV_HAVE_AVX */ @@ -153,51 +155,51 @@ volk_32f_64f_multiply_64f_u_avx(double *cVector, const float *aVector, #include #include -static inline void -volk_32f_64f_multiply_64f_a_avx(double *cVector, const float *aVector, - const double *bVector, unsigned int num_points) +static inline void volk_32f_64f_multiply_64f_a_avx(double* cVector, + const float* aVector, + const double* bVector, + unsigned int num_points) { - unsigned int number = 0; - const unsigned int eighth_points = num_points / 8; + unsigned int number = 0; + const unsigned int eighth_points = num_points / 8; - double *cPtr = cVector; - const float *aPtr = aVector; - const double *bPtr = bVector; + double* cPtr = cVector; + const float* aPtr = aVector; + const double* bPtr = bVector; - __m256 aVal; - __m128 aVal1, aVal2; - __m256d aDbl1, aDbl2, bVal1, bVal2, cVal1, cVal2; - for (; number < eighth_points; number++) { + __m256 aVal; + __m128 aVal1, aVal2; + __m256d aDbl1, aDbl2, bVal1, bVal2, cVal1, cVal2; + for (; number < eighth_points; number++) { - aVal = _mm256_load_ps(aPtr); - bVal1 = _mm256_load_pd(bPtr); - bVal2 = _mm256_load_pd(bPtr+4); + aVal = _mm256_load_ps(aPtr); + bVal1 = _mm256_load_pd(bPtr); + bVal2 = _mm256_load_pd(bPtr + 4); - aVal1 = _mm256_extractf128_ps(aVal, 0); - aVal2 = _mm256_extractf128_ps(aVal, 1); + aVal1 = _mm256_extractf128_ps(aVal, 0); + aVal2 = _mm256_extractf128_ps(aVal, 1); - aDbl1 = _mm256_cvtps_pd(aVal1); - aDbl2 = _mm256_cvtps_pd(aVal2); + aDbl1 = _mm256_cvtps_pd(aVal1); + aDbl2 = _mm256_cvtps_pd(aVal2); - cVal1 = _mm256_mul_pd(aDbl1, bVal1); - cVal2 = _mm256_mul_pd(aDbl2, bVal2); + cVal1 = _mm256_mul_pd(aDbl1, bVal1); + cVal2 = _mm256_mul_pd(aDbl2, bVal2); - _mm256_store_pd(cPtr, cVal1); // Store the results back into the C container - _mm256_store_pd(cPtr+4, cVal2); // Store the results back into the C container + _mm256_store_pd(cPtr, cVal1); // Store the results back into the C container + _mm256_store_pd(cPtr + 4, cVal2); // Store the results back into the C container - aPtr += 8; - bPtr += 8; - cPtr += 8; - } + aPtr += 8; + bPtr += 8; + cPtr += 8; + } - number = eighth_points * 8; - for (; number < num_points; number++) { - *cPtr++ = ((double)(*aPtr++)) * (*bPtr++); - } + number = eighth_points * 8; + for (; number < num_points; number++) { + *cPtr++ = ((double)(*aPtr++)) * (*bPtr++); + } } #endif /* LV_HAVE_AVX */ - #endif /* INCLUDED_volk_32f_64f_multiply_64f_u_H */ diff --git a/kernels/volk/volk_32f_8u_polarbutterfly_32f.h b/kernels/volk/volk_32f_8u_polarbutterfly_32f.h index 4aba6c4..2198b33 100644 --- a/kernels/volk/volk_32f_8u_polarbutterfly_32f.h +++ b/kernels/volk/volk_32f_8u_polarbutterfly_32f.h @@ -51,14 +51,17 @@ * int frame_exp = 10; * int frame_size = 0x01 << frame_exp; * - * float* llrs = (float*) volk_malloc(sizeof(float) * frame_size * (frame_exp + 1), volk_get_alignment()); - * unsigned char* u = (unsigned char) volk_malloc(sizeof(unsigned char) * frame_size * (frame_exp + 1), volk_get_alignment()); + * float* llrs = (float*) volk_malloc(sizeof(float) * frame_size * (frame_exp + 1), + * volk_get_alignment()); unsigned char* u = (unsigned char) volk_malloc(sizeof(unsigned + * char) * frame_size * (frame_exp + 1), volk_get_alignment()); * - * {some_function_to_write_encoded_bits_to_float_llrs(llrs + frame_size * frame_exp, data)}; + * 
{some_function_to_write_encoded_bits_to_float_llrs(llrs + frame_size * frame_exp, + * data)}; * * unsigned int u_num; * for(u_num = 0; u_num < frame_size; u_num++){ - * volk_32f_8u_polarbutterfly_32f_u_avx(llrs, u, frame_size, frame_exp, 0, u_num, u_num); + * volk_32f_8u_polarbutterfly_32f_u_avx(llrs, u, frame_size, frame_exp, 0, u_num, + * u_num); * // next line could first search for frozen bit value and then do bit decision. * u[u_num] = llrs[u_num] > 0 ? 0 : 1; * } @@ -73,130 +76,131 @@ #include #include -static inline float -llr_odd(const float la, const float lb) +static inline float llr_odd(const float la, const float lb) { - const float ala = fabsf(la); - const float alb = fabsf(lb); - return copysignf(1.0f, la) * copysignf(1.0f, lb) * (ala > alb ? alb : ala); + const float ala = fabsf(la); + const float alb = fabsf(lb); + return copysignf(1.0f, la) * copysignf(1.0f, lb) * (ala > alb ? alb : ala); } -static inline void -llr_odd_stages(float* llrs, int min_stage, const int depth, const int frame_size, const int row) +static inline void llr_odd_stages( + float* llrs, int min_stage, const int depth, const int frame_size, const int row) { - int loop_stage = depth - 1; - float* dst_llr_ptr; - float* src_llr_ptr; - int stage_size = 0x01 << loop_stage; - - int el; - while(min_stage <= loop_stage){ - dst_llr_ptr = llrs + loop_stage * frame_size + row; - src_llr_ptr = dst_llr_ptr + frame_size; - for(el = 0; el < stage_size; el++){ - *dst_llr_ptr++ = llr_odd(*src_llr_ptr, *(src_llr_ptr + 1)); - src_llr_ptr += 2; + int loop_stage = depth - 1; + float* dst_llr_ptr; + float* src_llr_ptr; + int stage_size = 0x01 << loop_stage; + + int el; + while (min_stage <= loop_stage) { + dst_llr_ptr = llrs + loop_stage * frame_size + row; + src_llr_ptr = dst_llr_ptr + frame_size; + for (el = 0; el < stage_size; el++) { + *dst_llr_ptr++ = llr_odd(*src_llr_ptr, *(src_llr_ptr + 1)); + src_llr_ptr += 2; + } + + --loop_stage; + stage_size >>= 1; } - - --loop_stage; - stage_size >>= 1; - } } -static inline float -llr_even(const float la, const float lb, const unsigned char f) +static inline float llr_even(const float la, const float lb, const unsigned char f) { - switch(f){ + switch (f) { case 0: - return lb + la; + return lb + la; default: - return lb - la; - } + return lb - la; + } } static inline void even_u_values(unsigned char* u_even, const unsigned char* u, const int u_num) { - u++; - int i; - for(i = 1; i < u_num; i += 2){ - *u_even++ = *u; - u += 2; - } + u++; + int i; + for (i = 1; i < u_num; i += 2) { + *u_even++ = *u; + u += 2; + } } static inline void odd_xor_even_values(unsigned char* u_xor, const unsigned char* u, const int u_num) { - int i; - for(i = 1; i < u_num; i += 2){ - *u_xor++ = *u ^ *(u + 1); - u += 2; - } + int i; + for (i = 1; i < u_num; i += 2) { + *u_xor++ = *u ^ *(u + 1); + u += 2; + } } -static inline int -calculate_max_stage_depth_for_row(const int frame_exp, const int row) +static inline int calculate_max_stage_depth_for_row(const int frame_exp, const int row) { - int max_stage_depth = 0; - int half_stage_size = 0x01; - int stage_size = half_stage_size << 1; - while(max_stage_depth < (frame_exp - 1)){ // last stage holds received values. - if(!(row % stage_size < half_stage_size)){ - break; + int max_stage_depth = 0; + int half_stage_size = 0x01; + int stage_size = half_stage_size << 1; + while (max_stage_depth < (frame_exp - 1)) { // last stage holds received values. 
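/* In effect this loop counts the trailing zero bits of `row`, capped at
 * frame_exp - 1. For example, row = 12 (binary 1100) yields max_stage_depth = 2:
 * 12 % 2 == 0 and 12 % 4 == 0 keep the loop running, while 12 % 8 == 4 is not
 * below the half size and stops it. row = 0 runs up to the frame_exp - 1 cap. */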
+ if (!(row % stage_size < half_stage_size)) { + break; + } + half_stage_size <<= 1; + stage_size <<= 1; + max_stage_depth++; } - half_stage_size <<= 1; - stage_size <<= 1; - max_stage_depth++; - } - return max_stage_depth; + return max_stage_depth; } #ifdef LV_HAVE_GENERIC -static inline void -volk_32f_8u_polarbutterfly_32f_generic(float* llrs, unsigned char* u, - const int frame_exp, - const int stage, const int u_num, const int row) +static inline void volk_32f_8u_polarbutterfly_32f_generic(float* llrs, + unsigned char* u, + const int frame_exp, + const int stage, + const int u_num, + const int row) { - const int frame_size = 0x01 << frame_exp; - const int next_stage = stage + 1; + const int frame_size = 0x01 << frame_exp; + const int next_stage = stage + 1; - const int half_stage_size = 0x01 << stage; - const int stage_size = half_stage_size << 1; + const int half_stage_size = 0x01 << stage; + const int stage_size = half_stage_size << 1; - const bool is_upper_stage_half = row % stage_size < half_stage_size; + const bool is_upper_stage_half = row % stage_size < half_stage_size; -// // this is a natural bit order impl - float* next_llrs = llrs + frame_size;// LLRs are stored in a consecutive array. - float* call_row_llr = llrs + row; + // // this is a natural bit order impl + float* next_llrs = llrs + frame_size; // LLRs are stored in a consecutive array. + float* call_row_llr = llrs + row; - const int section = row - (row % stage_size); - const int jump_size = ((row % half_stage_size) << 1) % stage_size; + const int section = row - (row % stage_size); + const int jump_size = ((row % half_stage_size) << 1) % stage_size; - const int next_upper_row = section + jump_size; - const int next_lower_row = next_upper_row + 1; + const int next_upper_row = section + jump_size; + const int next_lower_row = next_upper_row + 1; - const float* upper_right_llr_ptr = next_llrs + next_upper_row; - const float* lower_right_llr_ptr = next_llrs + next_lower_row; + const float* upper_right_llr_ptr = next_llrs + next_upper_row; + const float* lower_right_llr_ptr = next_llrs + next_lower_row; - if(!is_upper_stage_half){ - const int u_pos = u_num >> stage; - const unsigned char f = u[u_pos - 1]; - *call_row_llr = llr_even(*upper_right_llr_ptr, *lower_right_llr_ptr, f); - return; - } + if (!is_upper_stage_half) { + const int u_pos = u_num >> stage; + const unsigned char f = u[u_pos - 1]; + *call_row_llr = llr_even(*upper_right_llr_ptr, *lower_right_llr_ptr, f); + return; + } - if(frame_exp > next_stage){ - unsigned char* u_half = u + frame_size; - odd_xor_even_values(u_half, u, u_num); - volk_32f_8u_polarbutterfly_32f_generic(next_llrs, u_half, frame_exp, next_stage, u_num, next_upper_row); + if (frame_exp > next_stage) { + unsigned char* u_half = u + frame_size; + odd_xor_even_values(u_half, u, u_num); + volk_32f_8u_polarbutterfly_32f_generic( + next_llrs, u_half, frame_exp, next_stage, u_num, next_upper_row); - even_u_values(u_half, u, u_num); - volk_32f_8u_polarbutterfly_32f_generic(next_llrs, u_half, frame_exp, next_stage, u_num, next_lower_row); - } + even_u_values(u_half, u, u_num); + volk_32f_8u_polarbutterfly_32f_generic( + next_llrs, u_half, frame_exp, next_stage, u_num, next_lower_row); + } - *call_row_llr = llr_odd(*upper_right_llr_ptr, *lower_right_llr_ptr); + *call_row_llr = llr_odd(*upper_right_llr_ptr, *lower_right_llr_ptr); } #endif /* LV_HAVE_GENERIC */ @@ -206,99 +210,99 @@ volk_32f_8u_polarbutterfly_32f_generic(float* llrs, unsigned char* u, #include #include -static inline void 
-volk_32f_8u_polarbutterfly_32f_u_avx(float* llrs, unsigned char* u, - const int frame_exp, - const int stage, const int u_num, const int row) +static inline void volk_32f_8u_polarbutterfly_32f_u_avx(float* llrs, + unsigned char* u, + const int frame_exp, + const int stage, + const int u_num, + const int row) { - const int frame_size = 0x01 << frame_exp; - if(row % 2){ // for odd rows just do the only necessary calculation and return. - const float* next_llrs = llrs + frame_size + row; - *(llrs + row) = llr_even(*(next_llrs - 1), *next_llrs, u[u_num - 1]); - return; - } - - const int max_stage_depth = calculate_max_stage_depth_for_row(frame_exp, row); - if(max_stage_depth < 3){ // vectorized version needs larger vectors. - volk_32f_8u_polarbutterfly_32f_generic(llrs, u, frame_exp, stage, u_num, row); - return; - } - - int loop_stage = max_stage_depth; - int stage_size = 0x01 << loop_stage; - - float* src_llr_ptr; - float* dst_llr_ptr; - - __m256 src0, src1, dst; - - if(row){ // not necessary for ZERO row. == first bit to be decoded. - // first do bit combination for all stages - // effectively encode some decoded bits again. - unsigned char* u_target = u + frame_size; - unsigned char* u_temp = u + 2* frame_size; - memcpy(u_temp, u + u_num - stage_size, sizeof(unsigned char) * stage_size); - - if(stage_size > 15){ - _mm256_zeroupper(); - volk_8u_x2_encodeframepolar_8u_u_ssse3(u_target, u_temp, stage_size); + const int frame_size = 0x01 << frame_exp; + if (row % 2) { // for odd rows just do the only necessary calculation and return. + const float* next_llrs = llrs + frame_size + row; + *(llrs + row) = llr_even(*(next_llrs - 1), *next_llrs, u[u_num - 1]); + return; } - else{ - volk_8u_x2_encodeframepolar_8u_generic(u_target, u_temp, stage_size); + + const int max_stage_depth = calculate_max_stage_depth_for_row(frame_exp, row); + if (max_stage_depth < 3) { // vectorized version needs larger vectors. + volk_32f_8u_polarbutterfly_32f_generic(llrs, u, frame_exp, stage, u_num, row); + return; } - src_llr_ptr = llrs + (max_stage_depth + 1) * frame_size + row - stage_size; - dst_llr_ptr = llrs + max_stage_depth * frame_size + row; + int loop_stage = max_stage_depth; + int stage_size = 0x01 << loop_stage; - __m128i fbits; + float* src_llr_ptr; + float* dst_llr_ptr; - int p; - for(p = 0; p < stage_size; p += 8){ - _mm256_zeroupper(); - fbits = _mm_loadu_si128((__m128i*) u_target); - u_target += 8; + __m256 src0, src1, dst; - src0 = _mm256_loadu_ps(src_llr_ptr); - src1 = _mm256_loadu_ps(src_llr_ptr + 8); - src_llr_ptr += 16; + if (row) { // not necessary for ZERO row. == first bit to be decoded. + // first do bit combination for all stages + // effectively encode some decoded bits again. + unsigned char* u_target = u + frame_size; + unsigned char* u_temp = u + 2 * frame_size; + memcpy(u_temp, u + u_num - stage_size, sizeof(unsigned char) * stage_size); - dst = _mm256_polar_fsign_add_llrs(src0, src1, fbits); + if (stage_size > 15) { + _mm256_zeroupper(); + volk_8u_x2_encodeframepolar_8u_u_ssse3(u_target, u_temp, stage_size); + } else { + volk_8u_x2_encodeframepolar_8u_generic(u_target, u_temp, stage_size); + } - _mm256_storeu_ps(dst_llr_ptr, dst); - dst_llr_ptr += 8; - } + src_llr_ptr = llrs + (max_stage_depth + 1) * frame_size + row - stage_size; + dst_llr_ptr = llrs + max_stage_depth * frame_size + row; - --loop_stage; - stage_size >>= 1; - } + __m128i fbits; - const int min_stage = stage > 2 ? 
stage : 2; + int p; + for (p = 0; p < stage_size; p += 8) { + _mm256_zeroupper(); + fbits = _mm_loadu_si128((__m128i*)u_target); + u_target += 8; - _mm256_zeroall(); // Important to clear cache! + src0 = _mm256_loadu_ps(src_llr_ptr); + src1 = _mm256_loadu_ps(src_llr_ptr + 8); + src_llr_ptr += 16; - int el; - while(min_stage < loop_stage){ - dst_llr_ptr = llrs + loop_stage * frame_size + row; - src_llr_ptr = dst_llr_ptr + frame_size; - for(el = 0; el < stage_size; el += 8){ - src0 = _mm256_loadu_ps(src_llr_ptr); - src_llr_ptr += 8; - src1 = _mm256_loadu_ps(src_llr_ptr); - src_llr_ptr += 8; + dst = _mm256_polar_fsign_add_llrs(src0, src1, fbits); - dst = _mm256_polar_minsum_llrs(src0, src1); + _mm256_storeu_ps(dst_llr_ptr, dst); + dst_llr_ptr += 8; + } - _mm256_storeu_ps(dst_llr_ptr, dst); - dst_llr_ptr += 8; + --loop_stage; + stage_size >>= 1; } - --loop_stage; - stage_size >>= 1; + const int min_stage = stage > 2 ? stage : 2; + + _mm256_zeroall(); // Important to clear cache! - } + int el; + while (min_stage < loop_stage) { + dst_llr_ptr = llrs + loop_stage * frame_size + row; + src_llr_ptr = dst_llr_ptr + frame_size; + for (el = 0; el < stage_size; el += 8) { + src0 = _mm256_loadu_ps(src_llr_ptr); + src_llr_ptr += 8; + src1 = _mm256_loadu_ps(src_llr_ptr); + src_llr_ptr += 8; - // for stages < 3 vectors are too small!. - llr_odd_stages(llrs, stage, loop_stage + 1,frame_size, row); + dst = _mm256_polar_minsum_llrs(src0, src1); + + _mm256_storeu_ps(dst_llr_ptr, dst); + dst_llr_ptr += 8; + } + + --loop_stage; + stage_size >>= 1; + } + + // for stages < 3 vectors are too small!. + llr_odd_stages(llrs, stage, loop_stage + 1, frame_size, row); } #endif /* LV_HAVE_AVX */ @@ -307,99 +311,99 @@ volk_32f_8u_polarbutterfly_32f_u_avx(float* llrs, unsigned char* u, #include #include -static inline void -volk_32f_8u_polarbutterfly_32f_u_avx2(float* llrs, unsigned char* u, - const int frame_exp, - const int stage, const int u_num, const int row) +static inline void volk_32f_8u_polarbutterfly_32f_u_avx2(float* llrs, + unsigned char* u, + const int frame_exp, + const int stage, + const int u_num, + const int row) { - const int frame_size = 0x01 << frame_exp; - if(row % 2){ // for odd rows just do the only necessary calculation and return. - const float* next_llrs = llrs + frame_size + row; - *(llrs + row) = llr_even(*(next_llrs - 1), *next_llrs, u[u_num - 1]); - return; - } - - const int max_stage_depth = calculate_max_stage_depth_for_row(frame_exp, row); - if(max_stage_depth < 3){ // vectorized version needs larger vectors. - volk_32f_8u_polarbutterfly_32f_generic(llrs, u, frame_exp, stage, u_num, row); - return; - } - - int loop_stage = max_stage_depth; - int stage_size = 0x01 << loop_stage; - - float* src_llr_ptr; - float* dst_llr_ptr; - - __m256 src0, src1, dst; - - if(row){ // not necessary for ZERO row. == first bit to be decoded. - // first do bit combination for all stages - // effectively encode some decoded bits again. - unsigned char* u_target = u + frame_size; - unsigned char* u_temp = u + 2* frame_size; - memcpy(u_temp, u + u_num - stage_size, sizeof(unsigned char) * stage_size); - - if(stage_size > 15){ - _mm256_zeroupper(); - volk_8u_x2_encodeframepolar_8u_u_ssse3(u_target, u_temp, stage_size); + const int frame_size = 0x01 << frame_exp; + if (row % 2) { // for odd rows just do the only necessary calculation and return. 
+ const float* next_llrs = llrs + frame_size + row; + *(llrs + row) = llr_even(*(next_llrs - 1), *next_llrs, u[u_num - 1]); + return; } - else{ - volk_8u_x2_encodeframepolar_8u_generic(u_target, u_temp, stage_size); + + const int max_stage_depth = calculate_max_stage_depth_for_row(frame_exp, row); + if (max_stage_depth < 3) { // vectorized version needs larger vectors. + volk_32f_8u_polarbutterfly_32f_generic(llrs, u, frame_exp, stage, u_num, row); + return; } - src_llr_ptr = llrs + (max_stage_depth + 1) * frame_size + row - stage_size; - dst_llr_ptr = llrs + max_stage_depth * frame_size + row; + int loop_stage = max_stage_depth; + int stage_size = 0x01 << loop_stage; - __m128i fbits; + float* src_llr_ptr; + float* dst_llr_ptr; - int p; - for(p = 0; p < stage_size; p += 8){ - _mm256_zeroupper(); - fbits = _mm_loadu_si128((__m128i*) u_target); - u_target += 8; + __m256 src0, src1, dst; - src0 = _mm256_loadu_ps(src_llr_ptr); - src1 = _mm256_loadu_ps(src_llr_ptr + 8); - src_llr_ptr += 16; + if (row) { // not necessary for ZERO row. == first bit to be decoded. + // first do bit combination for all stages + // effectively encode some decoded bits again. + unsigned char* u_target = u + frame_size; + unsigned char* u_temp = u + 2 * frame_size; + memcpy(u_temp, u + u_num - stage_size, sizeof(unsigned char) * stage_size); - dst = _mm256_polar_fsign_add_llrs_avx2(src0, src1, fbits); + if (stage_size > 15) { + _mm256_zeroupper(); + volk_8u_x2_encodeframepolar_8u_u_ssse3(u_target, u_temp, stage_size); + } else { + volk_8u_x2_encodeframepolar_8u_generic(u_target, u_temp, stage_size); + } - _mm256_storeu_ps(dst_llr_ptr, dst); - dst_llr_ptr += 8; - } + src_llr_ptr = llrs + (max_stage_depth + 1) * frame_size + row - stage_size; + dst_llr_ptr = llrs + max_stage_depth * frame_size + row; - --loop_stage; - stage_size >>= 1; - } + __m128i fbits; - const int min_stage = stage > 2 ? stage : 2; + int p; + for (p = 0; p < stage_size; p += 8) { + _mm256_zeroupper(); + fbits = _mm_loadu_si128((__m128i*)u_target); + u_target += 8; - _mm256_zeroall(); // Important to clear cache! + src0 = _mm256_loadu_ps(src_llr_ptr); + src1 = _mm256_loadu_ps(src_llr_ptr + 8); + src_llr_ptr += 16; - int el; - while(min_stage < loop_stage){ - dst_llr_ptr = llrs + loop_stage * frame_size + row; - src_llr_ptr = dst_llr_ptr + frame_size; - for(el = 0; el < stage_size; el += 8){ - src0 = _mm256_loadu_ps(src_llr_ptr); - src_llr_ptr += 8; - src1 = _mm256_loadu_ps(src_llr_ptr); - src_llr_ptr += 8; + dst = _mm256_polar_fsign_add_llrs_avx2(src0, src1, fbits); - dst = _mm256_polar_minsum_llrs(src0, src1); + _mm256_storeu_ps(dst_llr_ptr, dst); + dst_llr_ptr += 8; + } - _mm256_storeu_ps(dst_llr_ptr, dst); - dst_llr_ptr += 8; + --loop_stage; + stage_size >>= 1; } - --loop_stage; - stage_size >>= 1; + const int min_stage = stage > 2 ? stage : 2; + + _mm256_zeroall(); // Important to clear cache! + + int el; + while (min_stage < loop_stage) { + dst_llr_ptr = llrs + loop_stage * frame_size + row; + src_llr_ptr = dst_llr_ptr + frame_size; + for (el = 0; el < stage_size; el += 8) { + src0 = _mm256_loadu_ps(src_llr_ptr); + src_llr_ptr += 8; + src1 = _mm256_loadu_ps(src_llr_ptr); + src_llr_ptr += 8; - } + dst = _mm256_polar_minsum_llrs(src0, src1); + + _mm256_storeu_ps(dst_llr_ptr, dst); + dst_llr_ptr += 8; + } + + --loop_stage; + stage_size >>= 1; + } - // for stages < 3 vectors are too small!. - llr_odd_stages(llrs, stage, loop_stage + 1,frame_size, row); + // for stages < 3 vectors are too small!. 
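/* The cutoff exists because each __m256 holds 8 floats: a stage at depth s
 * produces 2^s output LLRs, so for s < 3 there are fewer than 8 values and the
 * vectorized loops above cannot be filled; those final stages are finished by
 * the scalar llr_odd_stages() helper called below. */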
+ llr_odd_stages(llrs, stage, loop_stage + 1, frame_size, row); } #endif /* LV_HAVE_AVX2 */ diff --git a/kernels/volk/volk_32f_8u_polarbutterflypuppet_32f.h b/kernels/volk/volk_32f_8u_polarbutterflypuppet_32f.h index fa40a86..6f97dd1 100644 --- a/kernels/volk/volk_32f_8u_polarbutterflypuppet_32f.h +++ b/kernels/volk/volk_32f_8u_polarbutterflypuppet_32f.h @@ -33,124 +33,129 @@ #include -static inline void -sanitize_bytes(unsigned char* u, const int elements) +static inline void sanitize_bytes(unsigned char* u, const int elements) { - int i; - unsigned char* u_ptr = u; - for(i = 0; i < elements; i++){ - *u_ptr = (*u_ptr & 0x01); - u_ptr++; - } + int i; + unsigned char* u_ptr = u; + for (i = 0; i < elements; i++) { + *u_ptr = (*u_ptr & 0x01); + u_ptr++; + } } -static inline void -clean_up_intermediate_values(float* llrs, unsigned char* u, const int frame_size, const int elements) +static inline void clean_up_intermediate_values(float* llrs, + unsigned char* u, + const int frame_size, + const int elements) { - memset(u + frame_size, 0, sizeof(unsigned char) * (elements - frame_size)); - memset(llrs + frame_size, 0, sizeof(float) * (elements - frame_size)); + memset(u + frame_size, 0, sizeof(unsigned char) * (elements - frame_size)); + memset(llrs + frame_size, 0, sizeof(float) * (elements - frame_size)); } static inline void generate_error_free_input_vector(float* llrs, unsigned char* u, const int frame_size) { - memset(u, 0, frame_size); - unsigned char* target = u + frame_size; - volk_8u_x2_encodeframepolar_8u_generic(target, u + 2 * frame_size, frame_size); - float* ft = llrs; - int i; - for(i = 0; i < frame_size; i++){ - *ft = (-2 * ((float) *target++)) + 1.0f; - ft++; - } + memset(u, 0, frame_size); + unsigned char* target = u + frame_size; + volk_8u_x2_encodeframepolar_8u_generic(target, u + 2 * frame_size, frame_size); + float* ft = llrs; + int i; + for (i = 0; i < frame_size; i++) { + *ft = (-2 * ((float)*target++)) + 1.0f; + ft++; + } } static inline void print_llr_tree(const float* llrs, const int frame_size, const int frame_exp) { - int s, e; - for(s = 0; s < frame_size; s++){ - for(e = 0; e < frame_exp + 1; e++){ - printf("%+4.2f ", llrs[e * frame_size + s]); - } - printf("\n"); - if((s + 1) % 8 == 0){ - printf("\n"); + int s, e; + for (s = 0; s < frame_size; s++) { + for (e = 0; e < frame_exp + 1; e++) { + printf("%+4.2f ", llrs[e * frame_size + s]); + } + printf("\n"); + if ((s + 1) % 8 == 0) { + printf("\n"); + } } - } } -static inline int -maximum_frame_size(const int elements) +static inline int maximum_frame_size(const int elements) { - unsigned int frame_size = next_lower_power_of_two(elements); - unsigned int frame_exp = log2_of_power_of_2(frame_size); - return next_lower_power_of_two(frame_size / frame_exp); + unsigned int frame_size = next_lower_power_of_two(elements); + unsigned int frame_exp = log2_of_power_of_2(frame_size); + return next_lower_power_of_two(frame_size / frame_exp); } #ifdef LV_HAVE_GENERIC -static inline void -volk_32f_8u_polarbutterflypuppet_32f_generic(float* llrs, const float* input, unsigned char* u, const int elements) +static inline void volk_32f_8u_polarbutterflypuppet_32f_generic(float* llrs, + const float* input, + unsigned char* u, + const int elements) { - unsigned int frame_size = maximum_frame_size(elements); - unsigned int frame_exp = log2_of_power_of_2(frame_size); + unsigned int frame_size = maximum_frame_size(elements); + unsigned int frame_exp = log2_of_power_of_2(frame_size); - sanitize_bytes(u, elements); - 
clean_up_intermediate_values(llrs, u, frame_size, elements); - generate_error_free_input_vector(llrs + frame_exp * frame_size, u, frame_size); + sanitize_bytes(u, elements); + clean_up_intermediate_values(llrs, u, frame_size, elements); + generate_error_free_input_vector(llrs + frame_exp * frame_size, u, frame_size); - unsigned int u_num = 0; - for(; u_num < frame_size; u_num++){ - volk_32f_8u_polarbutterfly_32f_generic(llrs, u, frame_exp, 0, u_num, u_num); - u[u_num] = llrs[u_num] > 0 ? 0 : 1; - } + unsigned int u_num = 0; + for (; u_num < frame_size; u_num++) { + volk_32f_8u_polarbutterfly_32f_generic(llrs, u, frame_exp, 0, u_num, u_num); + u[u_num] = llrs[u_num] > 0 ? 0 : 1; + } - clean_up_intermediate_values(llrs, u, frame_size, elements); + clean_up_intermediate_values(llrs, u, frame_size, elements); } #endif /* LV_HAVE_GENERIC */ #ifdef LV_HAVE_AVX -static inline void -volk_32f_8u_polarbutterflypuppet_32f_u_avx(float* llrs, const float* input, unsigned char* u, const int elements) +static inline void volk_32f_8u_polarbutterflypuppet_32f_u_avx(float* llrs, + const float* input, + unsigned char* u, + const int elements) { - unsigned int frame_size = maximum_frame_size(elements); - unsigned int frame_exp = log2_of_power_of_2(frame_size); + unsigned int frame_size = maximum_frame_size(elements); + unsigned int frame_exp = log2_of_power_of_2(frame_size); - sanitize_bytes(u, elements); - clean_up_intermediate_values(llrs, u, frame_size, elements); - generate_error_free_input_vector(llrs + frame_exp * frame_size, u, frame_size); + sanitize_bytes(u, elements); + clean_up_intermediate_values(llrs, u, frame_size, elements); + generate_error_free_input_vector(llrs + frame_exp * frame_size, u, frame_size); - unsigned int u_num = 0; - for(; u_num < frame_size; u_num++){ - volk_32f_8u_polarbutterfly_32f_u_avx(llrs, u, frame_exp, 0, u_num, u_num); - u[u_num] = llrs[u_num] > 0 ? 0 : 1; - } + unsigned int u_num = 0; + for (; u_num < frame_size; u_num++) { + volk_32f_8u_polarbutterfly_32f_u_avx(llrs, u, frame_exp, 0, u_num, u_num); + u[u_num] = llrs[u_num] > 0 ? 0 : 1; + } - clean_up_intermediate_values(llrs, u, frame_size, elements); + clean_up_intermediate_values(llrs, u, frame_size, elements); } #endif /* LV_HAVE_AVX */ #ifdef LV_HAVE_AVX2 -static inline void -volk_32f_8u_polarbutterflypuppet_32f_u_avx2(float* llrs, const float* input, unsigned char* u, const int elements) +static inline void volk_32f_8u_polarbutterflypuppet_32f_u_avx2(float* llrs, + const float* input, + unsigned char* u, + const int elements) { - unsigned int frame_size = maximum_frame_size(elements); - unsigned int frame_exp = log2_of_power_of_2(frame_size); + unsigned int frame_size = maximum_frame_size(elements); + unsigned int frame_exp = log2_of_power_of_2(frame_size); - sanitize_bytes(u, elements); - clean_up_intermediate_values(llrs, u, frame_size, elements); - generate_error_free_input_vector(llrs + frame_exp * frame_size, u, frame_size); + sanitize_bytes(u, elements); + clean_up_intermediate_values(llrs, u, frame_size, elements); + generate_error_free_input_vector(llrs + frame_exp * frame_size, u, frame_size); - unsigned int u_num = 0; - for(; u_num < frame_size; u_num++){ - volk_32f_8u_polarbutterfly_32f_u_avx2(llrs, u, frame_exp, 0, u_num, u_num); - u[u_num] = llrs[u_num] > 0 ? 0 : 1; - } + unsigned int u_num = 0; + for (; u_num < frame_size; u_num++) { + volk_32f_8u_polarbutterfly_32f_u_avx2(llrs, u, frame_exp, 0, u_num, u_num); + u[u_num] = llrs[u_num] > 0 ? 
0 : 1; + } - clean_up_intermediate_values(llrs, u, frame_size, elements); + clean_up_intermediate_values(llrs, u, frame_size, elements); } #endif /* LV_HAVE_AVX2 */ - #endif /* VOLK_KERNELS_VOLK_VOLK_32F_8U_POLARBUTTERFLYPUPPET_32F_H_ */ diff --git a/kernels/volk/volk_32f_accumulator_s32f.h b/kernels/volk/volk_32f_accumulator_s32f.h index f6219c8..9a78f58 100644 --- a/kernels/volk/volk_32f_accumulator_s32f.h +++ b/kernels/volk/volk_32f_accumulator_s32f.h @@ -29,8 +29,8 @@ * * Dispatcher Prototype * \code - * void volk_32f_accumulator_s32f(float* result, const float* inputBuffer, unsigned int num_points) - * \endcode + * void volk_32f_accumulator_s32f(float* result, const float* inputBuffer, unsigned int + * num_points) \endcode * * \b Inputs * \li inputBuffer The buffer of data to be accumulated @@ -63,47 +63,48 @@ #ifndef INCLUDED_volk_32f_accumulator_s32f_a_H #define INCLUDED_volk_32f_accumulator_s32f_a_H -#include #include +#include #ifdef LV_HAVE_AVX #include -static inline void -volk_32f_accumulator_s32f_a_avx(float* result, const float* inputBuffer, unsigned int num_points) +static inline void volk_32f_accumulator_s32f_a_avx(float* result, + const float* inputBuffer, + unsigned int num_points) { - float returnValue = 0; - unsigned int number = 0; - const unsigned int eighthPoints = num_points / 8; - - const float* aPtr = inputBuffer; - __VOLK_ATTR_ALIGNED(32) float tempBuffer[8]; - - __m256 accumulator = _mm256_setzero_ps(); - __m256 aVal = _mm256_setzero_ps(); - - for(;number < eighthPoints; number++){ - aVal = _mm256_load_ps(aPtr); - accumulator = _mm256_add_ps(accumulator, aVal); - aPtr += 8; - } - - _mm256_store_ps(tempBuffer, accumulator); - - returnValue = tempBuffer[0]; - returnValue += tempBuffer[1]; - returnValue += tempBuffer[2]; - returnValue += tempBuffer[3]; - returnValue += tempBuffer[4]; - returnValue += tempBuffer[5]; - returnValue += tempBuffer[6]; - returnValue += tempBuffer[7]; - - number = eighthPoints * 8; - for(;number < num_points; number++){ - returnValue += (*aPtr++); - } - *result = returnValue; + float returnValue = 0; + unsigned int number = 0; + const unsigned int eighthPoints = num_points / 8; + + const float* aPtr = inputBuffer; + __VOLK_ATTR_ALIGNED(32) float tempBuffer[8]; + + __m256 accumulator = _mm256_setzero_ps(); + __m256 aVal = _mm256_setzero_ps(); + + for (; number < eighthPoints; number++) { + aVal = _mm256_load_ps(aPtr); + accumulator = _mm256_add_ps(accumulator, aVal); + aPtr += 8; + } + + _mm256_store_ps(tempBuffer, accumulator); + + returnValue = tempBuffer[0]; + returnValue += tempBuffer[1]; + returnValue += tempBuffer[2]; + returnValue += tempBuffer[3]; + returnValue += tempBuffer[4]; + returnValue += tempBuffer[5]; + returnValue += tempBuffer[6]; + returnValue += tempBuffer[7]; + + number = eighthPoints * 8; + for (; number < num_points; number++) { + returnValue += (*aPtr++); + } + *result = returnValue; } #endif /* LV_HAVE_AVX */ @@ -111,41 +112,42 @@ volk_32f_accumulator_s32f_a_avx(float* result, const float* inputBuffer, unsigne #ifdef LV_HAVE_AVX #include -static inline void -volk_32f_accumulator_s32f_u_avx(float* result, const float* inputBuffer, unsigned int num_points) +static inline void volk_32f_accumulator_s32f_u_avx(float* result, + const float* inputBuffer, + unsigned int num_points) { - float returnValue = 0; - unsigned int number = 0; - const unsigned int eighthPoints = num_points / 8; - - const float* aPtr = inputBuffer; - __VOLK_ATTR_ALIGNED(32) float tempBuffer[8]; - - __m256 accumulator = _mm256_setzero_ps(); - __m256 
aVal = _mm256_setzero_ps(); - - for(;number < eighthPoints; number++){ - aVal = _mm256_loadu_ps(aPtr); - accumulator = _mm256_add_ps(accumulator, aVal); - aPtr += 8; - } - - _mm256_store_ps(tempBuffer, accumulator); - - returnValue = tempBuffer[0]; - returnValue += tempBuffer[1]; - returnValue += tempBuffer[2]; - returnValue += tempBuffer[3]; - returnValue += tempBuffer[4]; - returnValue += tempBuffer[5]; - returnValue += tempBuffer[6]; - returnValue += tempBuffer[7]; - - number = eighthPoints * 8; - for(;number < num_points; number++){ - returnValue += (*aPtr++); - } - *result = returnValue; + float returnValue = 0; + unsigned int number = 0; + const unsigned int eighthPoints = num_points / 8; + + const float* aPtr = inputBuffer; + __VOLK_ATTR_ALIGNED(32) float tempBuffer[8]; + + __m256 accumulator = _mm256_setzero_ps(); + __m256 aVal = _mm256_setzero_ps(); + + for (; number < eighthPoints; number++) { + aVal = _mm256_loadu_ps(aPtr); + accumulator = _mm256_add_ps(accumulator, aVal); + aPtr += 8; + } + + _mm256_store_ps(tempBuffer, accumulator); + + returnValue = tempBuffer[0]; + returnValue += tempBuffer[1]; + returnValue += tempBuffer[2]; + returnValue += tempBuffer[3]; + returnValue += tempBuffer[4]; + returnValue += tempBuffer[5]; + returnValue += tempBuffer[6]; + returnValue += tempBuffer[7]; + + number = eighthPoints * 8; + for (; number < num_points; number++) { + returnValue += (*aPtr++); + } + *result = returnValue; } #endif /* LV_HAVE_AVX */ @@ -153,37 +155,38 @@ volk_32f_accumulator_s32f_u_avx(float* result, const float* inputBuffer, unsigne #ifdef LV_HAVE_SSE #include -static inline void -volk_32f_accumulator_s32f_a_sse(float* result, const float* inputBuffer, unsigned int num_points) +static inline void volk_32f_accumulator_s32f_a_sse(float* result, + const float* inputBuffer, + unsigned int num_points) { - float returnValue = 0; - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - const float* aPtr = inputBuffer; - __VOLK_ATTR_ALIGNED(16) float tempBuffer[4]; - - __m128 accumulator = _mm_setzero_ps(); - __m128 aVal = _mm_setzero_ps(); - - for(;number < quarterPoints; number++){ - aVal = _mm_load_ps(aPtr); - accumulator = _mm_add_ps(accumulator, aVal); - aPtr += 4; - } - - _mm_store_ps(tempBuffer,accumulator); - - returnValue = tempBuffer[0]; - returnValue += tempBuffer[1]; - returnValue += tempBuffer[2]; - returnValue += tempBuffer[3]; - - number = quarterPoints * 4; - for(;number < num_points; number++){ - returnValue += (*aPtr++); - } - *result = returnValue; + float returnValue = 0; + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + const float* aPtr = inputBuffer; + __VOLK_ATTR_ALIGNED(16) float tempBuffer[4]; + + __m128 accumulator = _mm_setzero_ps(); + __m128 aVal = _mm_setzero_ps(); + + for (; number < quarterPoints; number++) { + aVal = _mm_load_ps(aPtr); + accumulator = _mm_add_ps(accumulator, aVal); + aPtr += 4; + } + + _mm_store_ps(tempBuffer, accumulator); + + returnValue = tempBuffer[0]; + returnValue += tempBuffer[1]; + returnValue += tempBuffer[2]; + returnValue += tempBuffer[3]; + + number = quarterPoints * 4; + for (; number < num_points; number++) { + returnValue += (*aPtr++); + } + *result = returnValue; } #endif /* LV_HAVE_SSE */ @@ -191,52 +194,54 @@ volk_32f_accumulator_s32f_a_sse(float* result, const float* inputBuffer, unsigne #ifdef LV_HAVE_SSE #include -static inline void -volk_32f_accumulator_s32f_u_sse(float* result, const float* inputBuffer, unsigned int num_points) +static inline 
void volk_32f_accumulator_s32f_u_sse(float* result, + const float* inputBuffer, + unsigned int num_points) { - float returnValue = 0; - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - const float* aPtr = inputBuffer; - __VOLK_ATTR_ALIGNED(16) float tempBuffer[4]; - - __m128 accumulator = _mm_setzero_ps(); - __m128 aVal = _mm_setzero_ps(); - - for(;number < quarterPoints; number++){ - aVal = _mm_load_ps(aPtr); - accumulator = _mm_add_ps(accumulator, aVal); - aPtr += 4; - } - - _mm_store_ps(tempBuffer,accumulator); - - returnValue = tempBuffer[0]; - returnValue += tempBuffer[1]; - returnValue += tempBuffer[2]; - returnValue += tempBuffer[3]; - - number = quarterPoints * 4; - for(;number < num_points; number++){ - returnValue += (*aPtr++); - } - *result = returnValue; + float returnValue = 0; + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + const float* aPtr = inputBuffer; + __VOLK_ATTR_ALIGNED(16) float tempBuffer[4]; + + __m128 accumulator = _mm_setzero_ps(); + __m128 aVal = _mm_setzero_ps(); + + for (; number < quarterPoints; number++) { + aVal = _mm_load_ps(aPtr); + accumulator = _mm_add_ps(accumulator, aVal); + aPtr += 4; + } + + _mm_store_ps(tempBuffer, accumulator); + + returnValue = tempBuffer[0]; + returnValue += tempBuffer[1]; + returnValue += tempBuffer[2]; + returnValue += tempBuffer[3]; + + number = quarterPoints * 4; + for (; number < num_points; number++) { + returnValue += (*aPtr++); + } + *result = returnValue; } #endif /* LV_HAVE_SSE */ #ifdef LV_HAVE_GENERIC -static inline void -volk_32f_accumulator_s32f_generic(float* result, const float* inputBuffer, unsigned int num_points) +static inline void volk_32f_accumulator_s32f_generic(float* result, + const float* inputBuffer, + unsigned int num_points) { - const float* aPtr = inputBuffer; - unsigned int number = 0; - float returnValue = 0; - - for(;number < num_points; number++){ - returnValue += (*aPtr++); - } - *result = returnValue; + const float* aPtr = inputBuffer; + unsigned int number = 0; + float returnValue = 0; + + for (; number < num_points; number++) { + returnValue += (*aPtr++); + } + *result = returnValue; } #endif /* LV_HAVE_GENERIC */ diff --git a/kernels/volk/volk_32f_acos_32f.h b/kernels/volk/volk_32f_acos_32f.h index 5c14c2f..92918ca 100644 --- a/kernels/volk/volk_32f_acos_32f.h +++ b/kernels/volk/volk_32f_acos_32f.h @@ -67,11 +67,12 @@ * \endcode */ -#include -#include #include +#include +#include -/* This is the number of terms of Taylor series to evaluate, increase this for more accuracy*/ +/* This is the number of terms of Taylor series to evaluate, increase this for more + * accuracy*/ #define ACOS_TERMS 2 #ifndef INCLUDED_volk_32f_acos_32f_a_H @@ -80,62 +81,68 @@ #if LV_HAVE_AVX2 && LV_HAVE_FMA #include -static inline void -volk_32f_acos_32f_a_avx2_fma(float* bVector, const float* aVector, unsigned int num_points) +static inline void volk_32f_acos_32f_a_avx2_fma(float* bVector, + const float* aVector, + unsigned int num_points) { - float* bPtr = bVector; - const float* aPtr = aVector; - - unsigned int number = 0; - unsigned int eighthPoints = num_points / 8; - int i, j; - - __m256 aVal, d, pi, pio2, x, y, z, arccosine; - __m256 fzeroes, fones, ftwos, ffours, condition; - - pi = _mm256_set1_ps(3.14159265358979323846); - pio2 = _mm256_set1_ps(3.14159265358979323846/2); - fzeroes = _mm256_setzero_ps(); - fones = _mm256_set1_ps(1.0); - ftwos = _mm256_set1_ps(2.0); - ffours = _mm256_set1_ps(4.0); - - for(;number < eighthPoints; number++){ - 
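/* A note on the math these acos kernels use: for inputs in (-1, 1) they form
 * t = sqrt((1 + x)(1 - x)) / x = sqrt(1 - x^2) / x and recover acos(x) from an
 * arctangent of the range-reduced t, i.e. acos(x) = atan(sqrt(1 - x^2) / x)
 * plus the sign/quadrant corrections applied at the end of the loop. The
 * arctangent itself is approximated by a short odd polynomial whose length is
 * ACOS_TERMS, so raising ACOS_TERMS improves accuracy at the cost of a few
 * extra multiply-adds per point. */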
aVal = _mm256_load_ps(aPtr); - d = aVal; - aVal = _mm256_div_ps(_mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal), _mm256_sub_ps(fones, aVal))), aVal); - z = aVal; - condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS); - z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition)); - condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS); - x = _mm256_add_ps(z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition)); - - for(i = 0; i < 2; i++) - x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_fmadd_ps(x, x,fones))); - x = _mm256_div_ps(fones, x); - y = fzeroes; - for(j = ACOS_TERMS - 1; j >=0 ; j--) - y = _mm256_fmadd_ps(y, _mm256_mul_ps(x, x), _mm256_set1_ps(pow(-1,j)/(2*j+1))); - - y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours)); - condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS); - - y = _mm256_add_ps(y, _mm256_and_ps(_mm256_fnmadd_ps(y, ftwos, pio2), condition)); - arccosine = y; - condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS); - arccosine = _mm256_sub_ps(arccosine, _mm256_and_ps(_mm256_mul_ps(arccosine, ftwos), condition)); - condition = _mm256_cmp_ps(d, fzeroes, _CMP_LT_OS); - arccosine = _mm256_add_ps(arccosine, _mm256_and_ps(pi, condition)); - - _mm256_store_ps(bPtr, arccosine); - aPtr += 8; - bPtr += 8; - } - - number = eighthPoints * 8; - for(;number < num_points; number++){ - *bPtr++ = acos(*aPtr++); - } + float* bPtr = bVector; + const float* aPtr = aVector; + + unsigned int number = 0; + unsigned int eighthPoints = num_points / 8; + int i, j; + + __m256 aVal, d, pi, pio2, x, y, z, arccosine; + __m256 fzeroes, fones, ftwos, ffours, condition; + + pi = _mm256_set1_ps(3.14159265358979323846); + pio2 = _mm256_set1_ps(3.14159265358979323846 / 2); + fzeroes = _mm256_setzero_ps(); + fones = _mm256_set1_ps(1.0); + ftwos = _mm256_set1_ps(2.0); + ffours = _mm256_set1_ps(4.0); + + for (; number < eighthPoints; number++) { + aVal = _mm256_load_ps(aPtr); + d = aVal; + aVal = _mm256_div_ps(_mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal), + _mm256_sub_ps(fones, aVal))), + aVal); + z = aVal; + condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS); + z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition)); + condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS); + x = _mm256_add_ps( + z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition)); + + for (i = 0; i < 2; i++) + x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_fmadd_ps(x, x, fones))); + x = _mm256_div_ps(fones, x); + y = fzeroes; + for (j = ACOS_TERMS - 1; j >= 0; j--) + y = _mm256_fmadd_ps( + y, _mm256_mul_ps(x, x), _mm256_set1_ps(pow(-1, j) / (2 * j + 1))); + + y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours)); + condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS); + + y = _mm256_add_ps(y, _mm256_and_ps(_mm256_fnmadd_ps(y, ftwos, pio2), condition)); + arccosine = y; + condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS); + arccosine = _mm256_sub_ps( + arccosine, _mm256_and_ps(_mm256_mul_ps(arccosine, ftwos), condition)); + condition = _mm256_cmp_ps(d, fzeroes, _CMP_LT_OS); + arccosine = _mm256_add_ps(arccosine, _mm256_and_ps(pi, condition)); + + _mm256_store_ps(bPtr, arccosine); + aPtr += 8; + bPtr += 8; + } + + number = eighthPoints * 8; + for (; number < num_points; number++) { + *bPtr++ = acos(*aPtr++); + } } #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for aligned */ @@ -147,59 +154,66 @@ volk_32f_acos_32f_a_avx2_fma(float* bVector, const float* aVector, unsigned int static inline void volk_32f_acos_32f_a_avx(float* bVector, const float* aVector, unsigned int num_points) { - float* bPtr = bVector; - 
const float* aPtr = aVector; - - unsigned int number = 0; - unsigned int eighthPoints = num_points / 8; - int i, j; - - __m256 aVal, d, pi, pio2, x, y, z, arccosine; - __m256 fzeroes, fones, ftwos, ffours, condition; - - pi = _mm256_set1_ps(3.14159265358979323846); - pio2 = _mm256_set1_ps(3.14159265358979323846/2); - fzeroes = _mm256_setzero_ps(); - fones = _mm256_set1_ps(1.0); - ftwos = _mm256_set1_ps(2.0); - ffours = _mm256_set1_ps(4.0); - - for(;number < eighthPoints; number++){ - aVal = _mm256_load_ps(aPtr); - d = aVal; - aVal = _mm256_div_ps(_mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal), _mm256_sub_ps(fones, aVal))), aVal); - z = aVal; - condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS); - z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition)); - condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS); - x = _mm256_add_ps(z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition)); - - for(i = 0; i < 2; i++) - x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_add_ps(fones, _mm256_mul_ps(x, x)))); - x = _mm256_div_ps(fones, x); - y = fzeroes; - for(j = ACOS_TERMS - 1; j >=0 ; j--) - y = _mm256_add_ps(_mm256_mul_ps(y, _mm256_mul_ps(x, x)), _mm256_set1_ps(pow(-1,j)/(2*j+1))); - - y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours)); - condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS); - - y = _mm256_add_ps(y, _mm256_and_ps(_mm256_sub_ps(pio2, _mm256_mul_ps(y, ftwos)), condition)); - arccosine = y; - condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS); - arccosine = _mm256_sub_ps(arccosine, _mm256_and_ps(_mm256_mul_ps(arccosine, ftwos), condition)); - condition = _mm256_cmp_ps(d, fzeroes, _CMP_LT_OS); - arccosine = _mm256_add_ps(arccosine, _mm256_and_ps(pi, condition)); - - _mm256_store_ps(bPtr, arccosine); - aPtr += 8; - bPtr += 8; - } - - number = eighthPoints * 8; - for(;number < num_points; number++){ - *bPtr++ = acos(*aPtr++); - } + float* bPtr = bVector; + const float* aPtr = aVector; + + unsigned int number = 0; + unsigned int eighthPoints = num_points / 8; + int i, j; + + __m256 aVal, d, pi, pio2, x, y, z, arccosine; + __m256 fzeroes, fones, ftwos, ffours, condition; + + pi = _mm256_set1_ps(3.14159265358979323846); + pio2 = _mm256_set1_ps(3.14159265358979323846 / 2); + fzeroes = _mm256_setzero_ps(); + fones = _mm256_set1_ps(1.0); + ftwos = _mm256_set1_ps(2.0); + ffours = _mm256_set1_ps(4.0); + + for (; number < eighthPoints; number++) { + aVal = _mm256_load_ps(aPtr); + d = aVal; + aVal = _mm256_div_ps(_mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal), + _mm256_sub_ps(fones, aVal))), + aVal); + z = aVal; + condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS); + z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition)); + condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS); + x = _mm256_add_ps( + z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition)); + + for (i = 0; i < 2; i++) + x = _mm256_add_ps(x, + _mm256_sqrt_ps(_mm256_add_ps(fones, _mm256_mul_ps(x, x)))); + x = _mm256_div_ps(fones, x); + y = fzeroes; + for (j = ACOS_TERMS - 1; j >= 0; j--) + y = _mm256_add_ps(_mm256_mul_ps(y, _mm256_mul_ps(x, x)), + _mm256_set1_ps(pow(-1, j) / (2 * j + 1))); + + y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours)); + condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS); + + y = _mm256_add_ps( + y, _mm256_and_ps(_mm256_sub_ps(pio2, _mm256_mul_ps(y, ftwos)), condition)); + arccosine = y; + condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS); + arccosine = _mm256_sub_ps( + arccosine, _mm256_and_ps(_mm256_mul_ps(arccosine, ftwos), condition)); + condition = 
_mm256_cmp_ps(d, fzeroes, _CMP_LT_OS); + arccosine = _mm256_add_ps(arccosine, _mm256_and_ps(pi, condition)); + + _mm256_store_ps(bPtr, arccosine); + aPtr += 8; + bPtr += 8; + } + + number = eighthPoints * 8; + for (; number < num_points; number++) { + *bPtr++ = acos(*aPtr++); + } } #endif /* LV_HAVE_AVX2 for aligned */ @@ -210,59 +224,63 @@ volk_32f_acos_32f_a_avx(float* bVector, const float* aVector, unsigned int num_p static inline void volk_32f_acos_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int num_points) { - float* bPtr = bVector; - const float* aPtr = aVector; - - unsigned int number = 0; - unsigned int quarterPoints = num_points / 4; - int i, j; - - __m128 aVal, d, pi, pio2, x, y, z, arccosine; - __m128 fzeroes, fones, ftwos, ffours, condition; - - pi = _mm_set1_ps(3.14159265358979323846); - pio2 = _mm_set1_ps(3.14159265358979323846/2); - fzeroes = _mm_setzero_ps(); - fones = _mm_set1_ps(1.0); - ftwos = _mm_set1_ps(2.0); - ffours = _mm_set1_ps(4.0); - - for(;number < quarterPoints; number++){ - aVal = _mm_load_ps(aPtr); - d = aVal; - aVal = _mm_div_ps(_mm_sqrt_ps(_mm_mul_ps(_mm_add_ps(fones, aVal), _mm_sub_ps(fones, aVal))), aVal); - z = aVal; - condition = _mm_cmplt_ps(z, fzeroes); - z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition)); - condition = _mm_cmplt_ps(z, fones); - x = _mm_add_ps(z, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z), condition)); - - for(i = 0; i < 2; i++) - x = _mm_add_ps(x, _mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x)))); - x = _mm_div_ps(fones, x); - y = fzeroes; - for(j = ACOS_TERMS - 1; j >=0 ; j--) - y = _mm_add_ps(_mm_mul_ps(y, _mm_mul_ps(x, x)), _mm_set1_ps(pow(-1,j)/(2*j+1))); - - y = _mm_mul_ps(y, _mm_mul_ps(x, ffours)); - condition = _mm_cmpgt_ps(z, fones); - - y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, ftwos)), condition)); - arccosine = y; - condition = _mm_cmplt_ps(aVal, fzeroes); - arccosine = _mm_sub_ps(arccosine, _mm_and_ps(_mm_mul_ps(arccosine, ftwos), condition)); - condition = _mm_cmplt_ps(d, fzeroes); - arccosine = _mm_add_ps(arccosine, _mm_and_ps(pi, condition)); - - _mm_store_ps(bPtr, arccosine); - aPtr += 4; - bPtr += 4; - } - - number = quarterPoints * 4; - for(;number < num_points; number++){ - *bPtr++ = acosf(*aPtr++); - } + float* bPtr = bVector; + const float* aPtr = aVector; + + unsigned int number = 0; + unsigned int quarterPoints = num_points / 4; + int i, j; + + __m128 aVal, d, pi, pio2, x, y, z, arccosine; + __m128 fzeroes, fones, ftwos, ffours, condition; + + pi = _mm_set1_ps(3.14159265358979323846); + pio2 = _mm_set1_ps(3.14159265358979323846 / 2); + fzeroes = _mm_setzero_ps(); + fones = _mm_set1_ps(1.0); + ftwos = _mm_set1_ps(2.0); + ffours = _mm_set1_ps(4.0); + + for (; number < quarterPoints; number++) { + aVal = _mm_load_ps(aPtr); + d = aVal; + aVal = _mm_div_ps( + _mm_sqrt_ps(_mm_mul_ps(_mm_add_ps(fones, aVal), _mm_sub_ps(fones, aVal))), + aVal); + z = aVal; + condition = _mm_cmplt_ps(z, fzeroes); + z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition)); + condition = _mm_cmplt_ps(z, fones); + x = _mm_add_ps(z, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z), condition)); + + for (i = 0; i < 2; i++) + x = _mm_add_ps(x, _mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x)))); + x = _mm_div_ps(fones, x); + y = fzeroes; + for (j = ACOS_TERMS - 1; j >= 0; j--) + y = _mm_add_ps(_mm_mul_ps(y, _mm_mul_ps(x, x)), + _mm_set1_ps(pow(-1, j) / (2 * j + 1))); + + y = _mm_mul_ps(y, _mm_mul_ps(x, ffours)); + condition = _mm_cmpgt_ps(z, fones); + + y = _mm_add_ps(y, 
_mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, ftwos)), condition)); + arccosine = y; + condition = _mm_cmplt_ps(aVal, fzeroes); + arccosine = + _mm_sub_ps(arccosine, _mm_and_ps(_mm_mul_ps(arccosine, ftwos), condition)); + condition = _mm_cmplt_ps(d, fzeroes); + arccosine = _mm_add_ps(arccosine, _mm_and_ps(pi, condition)); + + _mm_store_ps(bPtr, arccosine); + aPtr += 4; + bPtr += 4; + } + + number = quarterPoints * 4; + for (; number < num_points; number++) { + *bPtr++ = acosf(*aPtr++); + } } #endif /* LV_HAVE_SSE4_1 for aligned */ @@ -276,62 +294,68 @@ volk_32f_acos_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int nu #if LV_HAVE_AVX2 && LV_HAVE_FMA #include -static inline void -volk_32f_acos_32f_u_avx2_fma(float* bVector, const float* aVector, unsigned int num_points) +static inline void volk_32f_acos_32f_u_avx2_fma(float* bVector, + const float* aVector, + unsigned int num_points) { - float* bPtr = bVector; - const float* aPtr = aVector; - - unsigned int number = 0; - unsigned int eighthPoints = num_points / 8; - int i, j; - - __m256 aVal, d, pi, pio2, x, y, z, arccosine; - __m256 fzeroes, fones, ftwos, ffours, condition; - - pi = _mm256_set1_ps(3.14159265358979323846); - pio2 = _mm256_set1_ps(3.14159265358979323846/2); - fzeroes = _mm256_setzero_ps(); - fones = _mm256_set1_ps(1.0); - ftwos = _mm256_set1_ps(2.0); - ffours = _mm256_set1_ps(4.0); - - for(;number < eighthPoints; number++){ - aVal = _mm256_loadu_ps(aPtr); - d = aVal; - aVal = _mm256_div_ps(_mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal), _mm256_sub_ps(fones, aVal))), aVal); - z = aVal; - condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS); - z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition)); - condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS); - x = _mm256_add_ps(z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition)); - - for(i = 0; i < 2; i++) - x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_fmadd_ps(x, x,fones))); - x = _mm256_div_ps(fones, x); - y = fzeroes; - for(j = ACOS_TERMS - 1; j >=0 ; j--) - y = _mm256_fmadd_ps(y, _mm256_mul_ps(x, x), _mm256_set1_ps(pow(-1,j)/(2*j+1))); - - y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours)); - condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS); - - y = _mm256_add_ps(y, _mm256_and_ps(_mm256_fnmadd_ps(y, ftwos, pio2), condition)); - arccosine = y; - condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS); - arccosine = _mm256_sub_ps(arccosine, _mm256_and_ps(_mm256_mul_ps(arccosine, ftwos), condition)); - condition = _mm256_cmp_ps(d, fzeroes, _CMP_LT_OS); - arccosine = _mm256_add_ps(arccosine, _mm256_and_ps(pi, condition)); - - _mm256_storeu_ps(bPtr, arccosine); - aPtr += 8; - bPtr += 8; - } - - number = eighthPoints * 8; - for(;number < num_points; number++){ - *bPtr++ = acos(*aPtr++); - } + float* bPtr = bVector; + const float* aPtr = aVector; + + unsigned int number = 0; + unsigned int eighthPoints = num_points / 8; + int i, j; + + __m256 aVal, d, pi, pio2, x, y, z, arccosine; + __m256 fzeroes, fones, ftwos, ffours, condition; + + pi = _mm256_set1_ps(3.14159265358979323846); + pio2 = _mm256_set1_ps(3.14159265358979323846 / 2); + fzeroes = _mm256_setzero_ps(); + fones = _mm256_set1_ps(1.0); + ftwos = _mm256_set1_ps(2.0); + ffours = _mm256_set1_ps(4.0); + + for (; number < eighthPoints; number++) { + aVal = _mm256_loadu_ps(aPtr); + d = aVal; + aVal = _mm256_div_ps(_mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal), + _mm256_sub_ps(fones, aVal))), + aVal); + z = aVal; + condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS); + z = 
_mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition)); + condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS); + x = _mm256_add_ps( + z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition)); + + for (i = 0; i < 2; i++) + x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_fmadd_ps(x, x, fones))); + x = _mm256_div_ps(fones, x); + y = fzeroes; + for (j = ACOS_TERMS - 1; j >= 0; j--) + y = _mm256_fmadd_ps( + y, _mm256_mul_ps(x, x), _mm256_set1_ps(pow(-1, j) / (2 * j + 1))); + + y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours)); + condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS); + + y = _mm256_add_ps(y, _mm256_and_ps(_mm256_fnmadd_ps(y, ftwos, pio2), condition)); + arccosine = y; + condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS); + arccosine = _mm256_sub_ps( + arccosine, _mm256_and_ps(_mm256_mul_ps(arccosine, ftwos), condition)); + condition = _mm256_cmp_ps(d, fzeroes, _CMP_LT_OS); + arccosine = _mm256_add_ps(arccosine, _mm256_and_ps(pi, condition)); + + _mm256_storeu_ps(bPtr, arccosine); + aPtr += 8; + bPtr += 8; + } + + number = eighthPoints * 8; + for (; number < num_points; number++) { + *bPtr++ = acos(*aPtr++); + } } #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for unaligned */ @@ -343,59 +367,66 @@ volk_32f_acos_32f_u_avx2_fma(float* bVector, const float* aVector, unsigned int static inline void volk_32f_acos_32f_u_avx(float* bVector, const float* aVector, unsigned int num_points) { - float* bPtr = bVector; - const float* aPtr = aVector; - - unsigned int number = 0; - unsigned int eighthPoints = num_points / 8; - int i, j; - - __m256 aVal, d, pi, pio2, x, y, z, arccosine; - __m256 fzeroes, fones, ftwos, ffours, condition; - - pi = _mm256_set1_ps(3.14159265358979323846); - pio2 = _mm256_set1_ps(3.14159265358979323846/2); - fzeroes = _mm256_setzero_ps(); - fones = _mm256_set1_ps(1.0); - ftwos = _mm256_set1_ps(2.0); - ffours = _mm256_set1_ps(4.0); - - for(;number < eighthPoints; number++){ - aVal = _mm256_loadu_ps(aPtr); - d = aVal; - aVal = _mm256_div_ps(_mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal), _mm256_sub_ps(fones, aVal))), aVal); - z = aVal; - condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS); - z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition)); - condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS); - x = _mm256_add_ps(z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition)); - - for(i = 0; i < 2; i++) - x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_add_ps(fones, _mm256_mul_ps(x, x)))); - x = _mm256_div_ps(fones, x); - y = fzeroes; - for(j = ACOS_TERMS - 1; j >=0 ; j--) - y = _mm256_add_ps(_mm256_mul_ps(y, _mm256_mul_ps(x, x)), _mm256_set1_ps(pow(-1,j)/(2*j+1))); - - y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours)); - condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS); - - y = _mm256_add_ps(y, _mm256_and_ps(_mm256_sub_ps(pio2, _mm256_mul_ps(y, ftwos)), condition)); - arccosine = y; - condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS); - arccosine = _mm256_sub_ps(arccosine, _mm256_and_ps(_mm256_mul_ps(arccosine, ftwos), condition)); - condition = _mm256_cmp_ps(d, fzeroes, _CMP_LT_OS); - arccosine = _mm256_add_ps(arccosine, _mm256_and_ps(pi, condition)); - - _mm256_storeu_ps(bPtr, arccosine); - aPtr += 8; - bPtr += 8; - } - - number = eighthPoints * 8; - for(;number < num_points; number++){ - *bPtr++ = acos(*aPtr++); - } + float* bPtr = bVector; + const float* aPtr = aVector; + + unsigned int number = 0; + unsigned int eighthPoints = num_points / 8; + int i, j; + + __m256 aVal, d, pi, pio2, x, y, z, arccosine; + __m256 fzeroes, fones, 
ftwos, ffours, condition; + + pi = _mm256_set1_ps(3.14159265358979323846); + pio2 = _mm256_set1_ps(3.14159265358979323846 / 2); + fzeroes = _mm256_setzero_ps(); + fones = _mm256_set1_ps(1.0); + ftwos = _mm256_set1_ps(2.0); + ffours = _mm256_set1_ps(4.0); + + for (; number < eighthPoints; number++) { + aVal = _mm256_loadu_ps(aPtr); + d = aVal; + aVal = _mm256_div_ps(_mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal), + _mm256_sub_ps(fones, aVal))), + aVal); + z = aVal; + condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS); + z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition)); + condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS); + x = _mm256_add_ps( + z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition)); + + for (i = 0; i < 2; i++) + x = _mm256_add_ps(x, + _mm256_sqrt_ps(_mm256_add_ps(fones, _mm256_mul_ps(x, x)))); + x = _mm256_div_ps(fones, x); + y = fzeroes; + for (j = ACOS_TERMS - 1; j >= 0; j--) + y = _mm256_add_ps(_mm256_mul_ps(y, _mm256_mul_ps(x, x)), + _mm256_set1_ps(pow(-1, j) / (2 * j + 1))); + + y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours)); + condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS); + + y = _mm256_add_ps( + y, _mm256_and_ps(_mm256_sub_ps(pio2, _mm256_mul_ps(y, ftwos)), condition)); + arccosine = y; + condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS); + arccosine = _mm256_sub_ps( + arccosine, _mm256_and_ps(_mm256_mul_ps(arccosine, ftwos), condition)); + condition = _mm256_cmp_ps(d, fzeroes, _CMP_LT_OS); + arccosine = _mm256_add_ps(arccosine, _mm256_and_ps(pi, condition)); + + _mm256_storeu_ps(bPtr, arccosine); + aPtr += 8; + bPtr += 8; + } + + number = eighthPoints * 8; + for (; number < num_points; number++) { + *bPtr++ = acos(*aPtr++); + } } #endif /* LV_HAVE_AVX2 for unaligned */ @@ -406,60 +437,64 @@ volk_32f_acos_32f_u_avx(float* bVector, const float* aVector, unsigned int num_p static inline void volk_32f_acos_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int num_points) { - float* bPtr = bVector; - const float* aPtr = aVector; - - unsigned int number = 0; - unsigned int quarterPoints = num_points / 4; - int i, j; - - __m128 aVal, d, pi, pio2, x, y, z, arccosine; - __m128 fzeroes, fones, ftwos, ffours, condition; - - pi = _mm_set1_ps(3.14159265358979323846); - pio2 = _mm_set1_ps(3.14159265358979323846/2); - fzeroes = _mm_setzero_ps(); - fones = _mm_set1_ps(1.0); - ftwos = _mm_set1_ps(2.0); - ffours = _mm_set1_ps(4.0); - - for(;number < quarterPoints; number++){ - aVal = _mm_loadu_ps(aPtr); - d = aVal; - aVal = _mm_div_ps(_mm_sqrt_ps(_mm_mul_ps(_mm_add_ps(fones, aVal), _mm_sub_ps(fones, aVal))), aVal); - z = aVal; - condition = _mm_cmplt_ps(z, fzeroes); - z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition)); - condition = _mm_cmplt_ps(z, fones); - x = _mm_add_ps(z, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z), condition)); - - for(i = 0; i < 2; i++) - x = _mm_add_ps(x, _mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x)))); - x = _mm_div_ps(fones, x); - y = fzeroes; - - for(j = ACOS_TERMS - 1; j >=0 ; j--) - y = _mm_add_ps(_mm_mul_ps(y, _mm_mul_ps(x, x)), _mm_set1_ps(pow(-1,j)/(2*j+1))); - - y = _mm_mul_ps(y, _mm_mul_ps(x, ffours)); - condition = _mm_cmpgt_ps(z, fones); - - y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, ftwos)), condition)); - arccosine = y; - condition = _mm_cmplt_ps(aVal, fzeroes); - arccosine = _mm_sub_ps(arccosine, _mm_and_ps(_mm_mul_ps(arccosine, ftwos), condition)); - condition = _mm_cmplt_ps(d, fzeroes); - arccosine = _mm_add_ps(arccosine, _mm_and_ps(pi, condition)); 
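/* Illustrative scalar sketch of what the acos kernels above compute; it is an
 * assumption-laden model, not part of the patch. It assumes ACOS_TERMS as defined
 * near the top of this header. acos(a) is evaluated as atan(sqrt(1 - a*a)/a):
 * the argument is folded to x >= 1, reduced twice with the half-angle identity,
 * approximated by a short Taylor series, then unfolded, with a pi offset for
 * negative inputs. */
#include <math.h>

#ifndef ACOS_TERMS
#define ACOS_TERMS 2 /* mirrors the macro defined earlier in this header */
#endif

static float acos_scalar_model(float a)
{
    const float pi = 3.14159265358979323846f;
    float t = sqrtf((1.0f + a) * (1.0f - a)) / a; /* tan(acos(a)), negative for a < 0 */
    float z = fabsf(t);
    float x = (z < 1.0f) ? 1.0f / z : z; /* fold so that x >= 1 */
    float y = 0.0f;
    int i, j;

    for (i = 0; i < 2; i++) /* two half-angle reductions */
        x += sqrtf(1.0f + x * x);
    x = 1.0f / x;
    for (j = ACOS_TERMS - 1; j >= 0; j--) /* truncated Taylor series of atan */
        y = y * x * x + powf(-1.0f, (float)j) / (float)(2 * j + 1);
    y *= 4.0f * x; /* undo the two half-angle steps */
    if (z > 1.0f)
        y = pi / 2 - y; /* atan(1/z) -> atan(z) */
    return (a < 0.0f) ? pi - y : y; /* sign and quadrant fix-up, as in the kernels */
}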
- - _mm_storeu_ps(bPtr, arccosine); - aPtr += 4; - bPtr += 4; - } - - number = quarterPoints * 4; - for(;number < num_points; number++){ - *bPtr++ = acosf(*aPtr++); - } + float* bPtr = bVector; + const float* aPtr = aVector; + + unsigned int number = 0; + unsigned int quarterPoints = num_points / 4; + int i, j; + + __m128 aVal, d, pi, pio2, x, y, z, arccosine; + __m128 fzeroes, fones, ftwos, ffours, condition; + + pi = _mm_set1_ps(3.14159265358979323846); + pio2 = _mm_set1_ps(3.14159265358979323846 / 2); + fzeroes = _mm_setzero_ps(); + fones = _mm_set1_ps(1.0); + ftwos = _mm_set1_ps(2.0); + ffours = _mm_set1_ps(4.0); + + for (; number < quarterPoints; number++) { + aVal = _mm_loadu_ps(aPtr); + d = aVal; + aVal = _mm_div_ps( + _mm_sqrt_ps(_mm_mul_ps(_mm_add_ps(fones, aVal), _mm_sub_ps(fones, aVal))), + aVal); + z = aVal; + condition = _mm_cmplt_ps(z, fzeroes); + z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition)); + condition = _mm_cmplt_ps(z, fones); + x = _mm_add_ps(z, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z), condition)); + + for (i = 0; i < 2; i++) + x = _mm_add_ps(x, _mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x)))); + x = _mm_div_ps(fones, x); + y = fzeroes; + + for (j = ACOS_TERMS - 1; j >= 0; j--) + y = _mm_add_ps(_mm_mul_ps(y, _mm_mul_ps(x, x)), + _mm_set1_ps(pow(-1, j) / (2 * j + 1))); + + y = _mm_mul_ps(y, _mm_mul_ps(x, ffours)); + condition = _mm_cmpgt_ps(z, fones); + + y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, ftwos)), condition)); + arccosine = y; + condition = _mm_cmplt_ps(aVal, fzeroes); + arccosine = + _mm_sub_ps(arccosine, _mm_and_ps(_mm_mul_ps(arccosine, ftwos), condition)); + condition = _mm_cmplt_ps(d, fzeroes); + arccosine = _mm_add_ps(arccosine, _mm_and_ps(pi, condition)); + + _mm_storeu_ps(bPtr, arccosine); + aPtr += 4; + bPtr += 4; + } + + number = quarterPoints * 4; + for (; number < num_points; number++) { + *bPtr++ = acosf(*aPtr++); + } } #endif /* LV_HAVE_SSE4_1 for aligned */ @@ -469,14 +504,13 @@ volk_32f_acos_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int nu static inline void volk_32f_acos_32f_generic(float* bVector, const float* aVector, unsigned int num_points) { - float* bPtr = bVector; - const float* aPtr = aVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - *bPtr++ = acosf(*aPtr++); - } + float* bPtr = bVector; + const float* aPtr = aVector; + unsigned int number = 0; + for (number = 0; number < num_points; number++) { + *bPtr++ = acosf(*aPtr++); + } } #endif /* LV_HAVE_GENERIC */ diff --git a/kernels/volk/volk_32f_asin_32f.h b/kernels/volk/volk_32f_asin_32f.h index 864cfcf..946d382 100644 --- a/kernels/volk/volk_32f_asin_32f.h +++ b/kernels/volk/volk_32f_asin_32f.h @@ -67,11 +67,12 @@ * \endcode */ -#include -#include #include +#include +#include -/* This is the number of terms of Taylor series to evaluate, increase this for more accuracy*/ +/* This is the number of terms of Taylor series to evaluate, increase this for more + * accuracy*/ #define ASIN_TERMS 2 #ifndef INCLUDED_volk_32f_asin_32f_a_H @@ -80,60 +81,66 @@ #if LV_HAVE_AVX2 && LV_HAVE_FMA #include -static inline void -volk_32f_asin_32f_a_avx2_fma(float* bVector, const float* aVector, unsigned int num_points) +static inline void volk_32f_asin_32f_a_avx2_fma(float* bVector, + const float* aVector, + unsigned int num_points) { - float* bPtr = bVector; - const float* aPtr = aVector; - - unsigned int number = 0; - unsigned int eighthPoints = num_points / 8; - int i, j; - - __m256 aVal, pio2, x, y, z, arcsine; 
- __m256 fzeroes, fones, ftwos, ffours, condition; - - pio2 = _mm256_set1_ps(3.14159265358979323846/2); - fzeroes = _mm256_setzero_ps(); - fones = _mm256_set1_ps(1.0); - ftwos = _mm256_set1_ps(2.0); - ffours = _mm256_set1_ps(4.0); - - for(;number < eighthPoints; number++){ - aVal = _mm256_load_ps(aPtr); - aVal = _mm256_div_ps(aVal, _mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal), _mm256_sub_ps(fones, aVal)))); - z = aVal; - condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS); - z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition)); - condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS); - x = _mm256_add_ps(z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition)); - - for(i = 0; i < 2; i++){ - x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_fmadd_ps(x,x,fones))); + float* bPtr = bVector; + const float* aPtr = aVector; + + unsigned int number = 0; + unsigned int eighthPoints = num_points / 8; + int i, j; + + __m256 aVal, pio2, x, y, z, arcsine; + __m256 fzeroes, fones, ftwos, ffours, condition; + + pio2 = _mm256_set1_ps(3.14159265358979323846 / 2); + fzeroes = _mm256_setzero_ps(); + fones = _mm256_set1_ps(1.0); + ftwos = _mm256_set1_ps(2.0); + ffours = _mm256_set1_ps(4.0); + + for (; number < eighthPoints; number++) { + aVal = _mm256_load_ps(aPtr); + aVal = _mm256_div_ps(aVal, + _mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal), + _mm256_sub_ps(fones, aVal)))); + z = aVal; + condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS); + z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition)); + condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS); + x = _mm256_add_ps( + z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition)); + + for (i = 0; i < 2; i++) { + x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_fmadd_ps(x, x, fones))); + } + x = _mm256_div_ps(fones, x); + y = fzeroes; + for (j = ASIN_TERMS - 1; j >= 0; j--) { + y = _mm256_fmadd_ps( + y, _mm256_mul_ps(x, x), _mm256_set1_ps(pow(-1, j) / (2 * j + 1))); + } + + y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours)); + condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS); + + y = _mm256_add_ps(y, _mm256_and_ps(_mm256_fnmadd_ps(y, ftwos, pio2), condition)); + arcsine = y; + condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS); + arcsine = _mm256_sub_ps(arcsine, + _mm256_and_ps(_mm256_mul_ps(arcsine, ftwos), condition)); + + _mm256_store_ps(bPtr, arcsine); + aPtr += 8; + bPtr += 8; } - x = _mm256_div_ps(fones, x); - y = fzeroes; - for(j = ASIN_TERMS - 1; j >=0 ; j--){ - y = _mm256_fmadd_ps(y, _mm256_mul_ps(x, x), _mm256_set1_ps(pow(-1,j)/(2*j+1))); - } - - y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours)); - condition = _mm256_cmp_ps(z, fones,_CMP_GT_OS); - - y = _mm256_add_ps(y, _mm256_and_ps(_mm256_fnmadd_ps(y,ftwos,pio2), condition)); - arcsine = y; - condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS); - arcsine = _mm256_sub_ps(arcsine, _mm256_and_ps(_mm256_mul_ps(arcsine, ftwos), condition)); - _mm256_store_ps(bPtr, arcsine); - aPtr += 8; - bPtr += 8; - } - - number = eighthPoints * 8; - for(;number < num_points; number++){ - *bPtr++ = asin(*aPtr++); - } + number = eighthPoints * 8; + for (; number < num_points; number++) { + *bPtr++ = asin(*aPtr++); + } } #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for aligned */ @@ -145,57 +152,64 @@ volk_32f_asin_32f_a_avx2_fma(float* bVector, const float* aVector, unsigned int static inline void volk_32f_asin_32f_a_avx(float* bVector, const float* aVector, unsigned int num_points) { - float* bPtr = bVector; - const float* aPtr = aVector; - - unsigned int number = 0; - 
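/* Illustrative sketch, not part of the patch: the asin kernels rely on the
 * complementary identity asin(a) = atan(a / sqrt(1 - a*a)) and then run the same
 * reduced Taylor evaluation as the acos sketch above; there is no pi offset,
 * only the sign fix-up for a < 0. Here libm's atanf stands in for the series. */
#include <math.h>

static float asin_scalar_model(float a)
{
    /* atanf handles the sign naturally, matching the kernels' fix-up for a < 0 */
    return atanf(a / sqrtf((1.0f + a) * (1.0f - a)));
}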
unsigned int eighthPoints = num_points / 8; - int i, j; - - __m256 aVal, pio2, x, y, z, arcsine; - __m256 fzeroes, fones, ftwos, ffours, condition; - - pio2 = _mm256_set1_ps(3.14159265358979323846/2); - fzeroes = _mm256_setzero_ps(); - fones = _mm256_set1_ps(1.0); - ftwos = _mm256_set1_ps(2.0); - ffours = _mm256_set1_ps(4.0); - - for(;number < eighthPoints; number++){ - aVal = _mm256_load_ps(aPtr); - aVal = _mm256_div_ps(aVal, _mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal), _mm256_sub_ps(fones, aVal)))); - z = aVal; - condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS); - z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition)); - condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS); - x = _mm256_add_ps(z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition)); - - for(i = 0; i < 2; i++){ - x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_add_ps(fones, _mm256_mul_ps(x, x)))); + float* bPtr = bVector; + const float* aPtr = aVector; + + unsigned int number = 0; + unsigned int eighthPoints = num_points / 8; + int i, j; + + __m256 aVal, pio2, x, y, z, arcsine; + __m256 fzeroes, fones, ftwos, ffours, condition; + + pio2 = _mm256_set1_ps(3.14159265358979323846 / 2); + fzeroes = _mm256_setzero_ps(); + fones = _mm256_set1_ps(1.0); + ftwos = _mm256_set1_ps(2.0); + ffours = _mm256_set1_ps(4.0); + + for (; number < eighthPoints; number++) { + aVal = _mm256_load_ps(aPtr); + aVal = _mm256_div_ps(aVal, + _mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal), + _mm256_sub_ps(fones, aVal)))); + z = aVal; + condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS); + z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition)); + condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS); + x = _mm256_add_ps( + z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition)); + + for (i = 0; i < 2; i++) { + x = _mm256_add_ps(x, + _mm256_sqrt_ps(_mm256_add_ps(fones, _mm256_mul_ps(x, x)))); + } + x = _mm256_div_ps(fones, x); + y = fzeroes; + for (j = ASIN_TERMS - 1; j >= 0; j--) { + y = _mm256_add_ps(_mm256_mul_ps(y, _mm256_mul_ps(x, x)), + _mm256_set1_ps(pow(-1, j) / (2 * j + 1))); + } + + y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours)); + condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS); + + y = _mm256_add_ps( + y, _mm256_and_ps(_mm256_sub_ps(pio2, _mm256_mul_ps(y, ftwos)), condition)); + arcsine = y; + condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS); + arcsine = _mm256_sub_ps(arcsine, + _mm256_and_ps(_mm256_mul_ps(arcsine, ftwos), condition)); + + _mm256_store_ps(bPtr, arcsine); + aPtr += 8; + bPtr += 8; } - x = _mm256_div_ps(fones, x); - y = fzeroes; - for(j = ASIN_TERMS - 1; j >=0 ; j--){ - y = _mm256_add_ps(_mm256_mul_ps(y, _mm256_mul_ps(x, x)), _mm256_set1_ps(pow(-1,j)/(2*j+1))); - } - - y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours)); - condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS); - - y = _mm256_add_ps(y, _mm256_and_ps(_mm256_sub_ps(pio2, _mm256_mul_ps(y, ftwos)), condition)); - arcsine = y; - condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS); - arcsine = _mm256_sub_ps(arcsine, _mm256_and_ps(_mm256_mul_ps(arcsine, ftwos), condition)); - _mm256_store_ps(bPtr, arcsine); - aPtr += 8; - bPtr += 8; - } - - number = eighthPoints * 8; - for(;number < num_points; number++){ - *bPtr++ = asin(*aPtr++); - } + number = eighthPoints * 8; + for (; number < num_points; number++) { + *bPtr++ = asin(*aPtr++); + } } #endif /* LV_HAVE_AVX for aligned */ @@ -206,57 +220,60 @@ volk_32f_asin_32f_a_avx(float* bVector, const float* aVector, unsigned int num_p static inline void 
volk_32f_asin_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int num_points) { - float* bPtr = bVector; - const float* aPtr = aVector; - - unsigned int number = 0; - unsigned int quarterPoints = num_points / 4; - int i, j; - - __m128 aVal, pio2, x, y, z, arcsine; - __m128 fzeroes, fones, ftwos, ffours, condition; - - pio2 = _mm_set1_ps(3.14159265358979323846/2); - fzeroes = _mm_setzero_ps(); - fones = _mm_set1_ps(1.0); - ftwos = _mm_set1_ps(2.0); - ffours = _mm_set1_ps(4.0); - - for(;number < quarterPoints; number++){ - aVal = _mm_load_ps(aPtr); - aVal = _mm_div_ps(aVal, _mm_sqrt_ps(_mm_mul_ps(_mm_add_ps(fones, aVal), _mm_sub_ps(fones, aVal)))); - z = aVal; - condition = _mm_cmplt_ps(z, fzeroes); - z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition)); - condition = _mm_cmplt_ps(z, fones); - x = _mm_add_ps(z, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z), condition)); - - for(i = 0; i < 2; i++){ - x = _mm_add_ps(x, _mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x)))); + float* bPtr = bVector; + const float* aPtr = aVector; + + unsigned int number = 0; + unsigned int quarterPoints = num_points / 4; + int i, j; + + __m128 aVal, pio2, x, y, z, arcsine; + __m128 fzeroes, fones, ftwos, ffours, condition; + + pio2 = _mm_set1_ps(3.14159265358979323846 / 2); + fzeroes = _mm_setzero_ps(); + fones = _mm_set1_ps(1.0); + ftwos = _mm_set1_ps(2.0); + ffours = _mm_set1_ps(4.0); + + for (; number < quarterPoints; number++) { + aVal = _mm_load_ps(aPtr); + aVal = _mm_div_ps( + aVal, + _mm_sqrt_ps(_mm_mul_ps(_mm_add_ps(fones, aVal), _mm_sub_ps(fones, aVal)))); + z = aVal; + condition = _mm_cmplt_ps(z, fzeroes); + z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition)); + condition = _mm_cmplt_ps(z, fones); + x = _mm_add_ps(z, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z), condition)); + + for (i = 0; i < 2; i++) { + x = _mm_add_ps(x, _mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x)))); + } + x = _mm_div_ps(fones, x); + y = fzeroes; + for (j = ASIN_TERMS - 1; j >= 0; j--) { + y = _mm_add_ps(_mm_mul_ps(y, _mm_mul_ps(x, x)), + _mm_set1_ps(pow(-1, j) / (2 * j + 1))); + } + + y = _mm_mul_ps(y, _mm_mul_ps(x, ffours)); + condition = _mm_cmpgt_ps(z, fones); + + y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, ftwos)), condition)); + arcsine = y; + condition = _mm_cmplt_ps(aVal, fzeroes); + arcsine = _mm_sub_ps(arcsine, _mm_and_ps(_mm_mul_ps(arcsine, ftwos), condition)); + + _mm_store_ps(bPtr, arcsine); + aPtr += 4; + bPtr += 4; } - x = _mm_div_ps(fones, x); - y = fzeroes; - for(j = ASIN_TERMS - 1; j >=0 ; j--){ - y = _mm_add_ps(_mm_mul_ps(y, _mm_mul_ps(x, x)), _mm_set1_ps(pow(-1,j)/(2*j+1))); - } - - y = _mm_mul_ps(y, _mm_mul_ps(x, ffours)); - condition = _mm_cmpgt_ps(z, fones); - - y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, ftwos)), condition)); - arcsine = y; - condition = _mm_cmplt_ps(aVal, fzeroes); - arcsine = _mm_sub_ps(arcsine, _mm_and_ps(_mm_mul_ps(arcsine, ftwos), condition)); - - _mm_store_ps(bPtr, arcsine); - aPtr += 4; - bPtr += 4; - } - number = quarterPoints * 4; - for(;number < num_points; number++){ - *bPtr++ = asinf(*aPtr++); - } + number = quarterPoints * 4; + for (; number < num_points; number++) { + *bPtr++ = asinf(*aPtr++); + } } #endif /* LV_HAVE_SSE4_1 for aligned */ @@ -269,60 +286,66 @@ volk_32f_asin_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int nu #if LV_HAVE_AVX2 && LV_HAVE_FMA #include -static inline void -volk_32f_asin_32f_u_avx2_fma(float* bVector, const float* aVector, unsigned int num_points) +static inline void 
volk_32f_asin_32f_u_avx2_fma(float* bVector, + const float* aVector, + unsigned int num_points) { - float* bPtr = bVector; - const float* aPtr = aVector; - - unsigned int number = 0; - unsigned int eighthPoints = num_points / 8; - int i, j; - - __m256 aVal, pio2, x, y, z, arcsine; - __m256 fzeroes, fones, ftwos, ffours, condition; - - pio2 = _mm256_set1_ps(3.14159265358979323846/2); - fzeroes = _mm256_setzero_ps(); - fones = _mm256_set1_ps(1.0); - ftwos = _mm256_set1_ps(2.0); - ffours = _mm256_set1_ps(4.0); - - for(;number < eighthPoints; number++){ - aVal = _mm256_loadu_ps(aPtr); - aVal = _mm256_div_ps(aVal, _mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal), _mm256_sub_ps(fones, aVal)))); - z = aVal; - condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS); - z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition)); - condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS); - x = _mm256_add_ps(z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition)); - - for(i = 0; i < 2; i++){ - x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_fmadd_ps(x,x,fones))); - } - x = _mm256_div_ps(fones, x); - y = fzeroes; - for(j = ASIN_TERMS - 1; j >=0 ; j--){ - y = _mm256_fmadd_ps(y, _mm256_mul_ps(x, x), _mm256_set1_ps(pow(-1,j)/(2*j+1))); + float* bPtr = bVector; + const float* aPtr = aVector; + + unsigned int number = 0; + unsigned int eighthPoints = num_points / 8; + int i, j; + + __m256 aVal, pio2, x, y, z, arcsine; + __m256 fzeroes, fones, ftwos, ffours, condition; + + pio2 = _mm256_set1_ps(3.14159265358979323846 / 2); + fzeroes = _mm256_setzero_ps(); + fones = _mm256_set1_ps(1.0); + ftwos = _mm256_set1_ps(2.0); + ffours = _mm256_set1_ps(4.0); + + for (; number < eighthPoints; number++) { + aVal = _mm256_loadu_ps(aPtr); + aVal = _mm256_div_ps(aVal, + _mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal), + _mm256_sub_ps(fones, aVal)))); + z = aVal; + condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS); + z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition)); + condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS); + x = _mm256_add_ps( + z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition)); + + for (i = 0; i < 2; i++) { + x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_fmadd_ps(x, x, fones))); + } + x = _mm256_div_ps(fones, x); + y = fzeroes; + for (j = ASIN_TERMS - 1; j >= 0; j--) { + y = _mm256_fmadd_ps( + y, _mm256_mul_ps(x, x), _mm256_set1_ps(pow(-1, j) / (2 * j + 1))); + } + + y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours)); + condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS); + + y = _mm256_add_ps(y, _mm256_and_ps(_mm256_fnmadd_ps(y, ftwos, pio2), condition)); + arcsine = y; + condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS); + arcsine = _mm256_sub_ps(arcsine, + _mm256_and_ps(_mm256_mul_ps(arcsine, ftwos), condition)); + + _mm256_storeu_ps(bPtr, arcsine); + aPtr += 8; + bPtr += 8; } - y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours)); - condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS); - - y = _mm256_add_ps(y, _mm256_and_ps(_mm256_fnmadd_ps(y,ftwos,pio2), condition)); - arcsine = y; - condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS); - arcsine = _mm256_sub_ps(arcsine, _mm256_and_ps(_mm256_mul_ps(arcsine, ftwos), condition)); - - _mm256_storeu_ps(bPtr, arcsine); - aPtr += 8; - bPtr += 8; - } - - number = eighthPoints * 8; - for(;number < num_points; number++){ - *bPtr++ = asin(*aPtr++); - } + number = eighthPoints * 8; + for (; number < num_points; number++) { + *bPtr++ = asin(*aPtr++); + } } #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for unaligned */ @@ -334,57 
+357,64 @@ volk_32f_asin_32f_u_avx2_fma(float* bVector, const float* aVector, unsigned int static inline void volk_32f_asin_32f_u_avx(float* bVector, const float* aVector, unsigned int num_points) { - float* bPtr = bVector; - const float* aPtr = aVector; - - unsigned int number = 0; - unsigned int eighthPoints = num_points / 8; - int i, j; - - __m256 aVal, pio2, x, y, z, arcsine; - __m256 fzeroes, fones, ftwos, ffours, condition; - - pio2 = _mm256_set1_ps(3.14159265358979323846/2); - fzeroes = _mm256_setzero_ps(); - fones = _mm256_set1_ps(1.0); - ftwos = _mm256_set1_ps(2.0); - ffours = _mm256_set1_ps(4.0); - - for(;number < eighthPoints; number++){ - aVal = _mm256_loadu_ps(aPtr); - aVal = _mm256_div_ps(aVal, _mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal), _mm256_sub_ps(fones, aVal)))); - z = aVal; - condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS); - z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition)); - condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS); - x = _mm256_add_ps(z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition)); - - for(i = 0; i < 2; i++){ - x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_add_ps(fones, _mm256_mul_ps(x, x)))); + float* bPtr = bVector; + const float* aPtr = aVector; + + unsigned int number = 0; + unsigned int eighthPoints = num_points / 8; + int i, j; + + __m256 aVal, pio2, x, y, z, arcsine; + __m256 fzeroes, fones, ftwos, ffours, condition; + + pio2 = _mm256_set1_ps(3.14159265358979323846 / 2); + fzeroes = _mm256_setzero_ps(); + fones = _mm256_set1_ps(1.0); + ftwos = _mm256_set1_ps(2.0); + ffours = _mm256_set1_ps(4.0); + + for (; number < eighthPoints; number++) { + aVal = _mm256_loadu_ps(aPtr); + aVal = _mm256_div_ps(aVal, + _mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal), + _mm256_sub_ps(fones, aVal)))); + z = aVal; + condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS); + z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition)); + condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS); + x = _mm256_add_ps( + z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition)); + + for (i = 0; i < 2; i++) { + x = _mm256_add_ps(x, + _mm256_sqrt_ps(_mm256_add_ps(fones, _mm256_mul_ps(x, x)))); + } + x = _mm256_div_ps(fones, x); + y = fzeroes; + for (j = ASIN_TERMS - 1; j >= 0; j--) { + y = _mm256_add_ps(_mm256_mul_ps(y, _mm256_mul_ps(x, x)), + _mm256_set1_ps(pow(-1, j) / (2 * j + 1))); + } + + y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours)); + condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS); + + y = _mm256_add_ps( + y, _mm256_and_ps(_mm256_sub_ps(pio2, _mm256_mul_ps(y, ftwos)), condition)); + arcsine = y; + condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS); + arcsine = _mm256_sub_ps(arcsine, + _mm256_and_ps(_mm256_mul_ps(arcsine, ftwos), condition)); + + _mm256_storeu_ps(bPtr, arcsine); + aPtr += 8; + bPtr += 8; } - x = _mm256_div_ps(fones, x); - y = fzeroes; - for(j = ASIN_TERMS - 1; j >=0 ; j--){ - y = _mm256_add_ps(_mm256_mul_ps(y, _mm256_mul_ps(x, x)), _mm256_set1_ps(pow(-1,j)/(2*j+1))); - } - - y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours)); - condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS); - y = _mm256_add_ps(y, _mm256_and_ps(_mm256_sub_ps(pio2, _mm256_mul_ps(y, ftwos)), condition)); - arcsine = y; - condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS); - arcsine = _mm256_sub_ps(arcsine, _mm256_and_ps(_mm256_mul_ps(arcsine, ftwos), condition)); - - _mm256_storeu_ps(bPtr, arcsine); - aPtr += 8; - bPtr += 8; - } - - number = eighthPoints * 8; - for(;number < num_points; number++){ - *bPtr++ = 
asin(*aPtr++); - } + number = eighthPoints * 8; + for (; number < num_points; number++) { + *bPtr++ = asin(*aPtr++); + } } #endif /* LV_HAVE_AVX for unaligned */ @@ -396,57 +426,60 @@ volk_32f_asin_32f_u_avx(float* bVector, const float* aVector, unsigned int num_p static inline void volk_32f_asin_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int num_points) { - float* bPtr = bVector; - const float* aPtr = aVector; - - unsigned int number = 0; - unsigned int quarterPoints = num_points / 4; - int i, j; - - __m128 aVal, pio2, x, y, z, arcsine; - __m128 fzeroes, fones, ftwos, ffours, condition; - - pio2 = _mm_set1_ps(3.14159265358979323846/2); - fzeroes = _mm_setzero_ps(); - fones = _mm_set1_ps(1.0); - ftwos = _mm_set1_ps(2.0); - ffours = _mm_set1_ps(4.0); - - for(;number < quarterPoints; number++){ - aVal = _mm_loadu_ps(aPtr); - aVal = _mm_div_ps(aVal, _mm_sqrt_ps(_mm_mul_ps(_mm_add_ps(fones, aVal), _mm_sub_ps(fones, aVal)))); - z = aVal; - condition = _mm_cmplt_ps(z, fzeroes); - z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition)); - condition = _mm_cmplt_ps(z, fones); - x = _mm_add_ps(z, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z), condition)); - - for(i = 0; i < 2; i++){ - x = _mm_add_ps(x, _mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x)))); + float* bPtr = bVector; + const float* aPtr = aVector; + + unsigned int number = 0; + unsigned int quarterPoints = num_points / 4; + int i, j; + + __m128 aVal, pio2, x, y, z, arcsine; + __m128 fzeroes, fones, ftwos, ffours, condition; + + pio2 = _mm_set1_ps(3.14159265358979323846 / 2); + fzeroes = _mm_setzero_ps(); + fones = _mm_set1_ps(1.0); + ftwos = _mm_set1_ps(2.0); + ffours = _mm_set1_ps(4.0); + + for (; number < quarterPoints; number++) { + aVal = _mm_loadu_ps(aPtr); + aVal = _mm_div_ps( + aVal, + _mm_sqrt_ps(_mm_mul_ps(_mm_add_ps(fones, aVal), _mm_sub_ps(fones, aVal)))); + z = aVal; + condition = _mm_cmplt_ps(z, fzeroes); + z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition)); + condition = _mm_cmplt_ps(z, fones); + x = _mm_add_ps(z, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z), condition)); + + for (i = 0; i < 2; i++) { + x = _mm_add_ps(x, _mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x)))); + } + x = _mm_div_ps(fones, x); + y = fzeroes; + for (j = ASIN_TERMS - 1; j >= 0; j--) { + y = _mm_add_ps(_mm_mul_ps(y, _mm_mul_ps(x, x)), + _mm_set1_ps(pow(-1, j) / (2 * j + 1))); + } + + y = _mm_mul_ps(y, _mm_mul_ps(x, ffours)); + condition = _mm_cmpgt_ps(z, fones); + + y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, ftwos)), condition)); + arcsine = y; + condition = _mm_cmplt_ps(aVal, fzeroes); + arcsine = _mm_sub_ps(arcsine, _mm_and_ps(_mm_mul_ps(arcsine, ftwos), condition)); + + _mm_storeu_ps(bPtr, arcsine); + aPtr += 4; + bPtr += 4; } - x = _mm_div_ps(fones, x); - y = fzeroes; - for(j = ASIN_TERMS - 1; j >=0 ; j--){ - y = _mm_add_ps(_mm_mul_ps(y, _mm_mul_ps(x, x)), _mm_set1_ps(pow(-1,j)/(2*j+1))); - } - - y = _mm_mul_ps(y, _mm_mul_ps(x, ffours)); - condition = _mm_cmpgt_ps(z, fones); - y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, ftwos)), condition)); - arcsine = y; - condition = _mm_cmplt_ps(aVal, fzeroes); - arcsine = _mm_sub_ps(arcsine, _mm_and_ps(_mm_mul_ps(arcsine, ftwos), condition)); - - _mm_storeu_ps(bPtr, arcsine); - aPtr += 4; - bPtr += 4; - } - - number = quarterPoints * 4; - for(;number < num_points; number++){ - *bPtr++ = asinf(*aPtr++); - } + number = quarterPoints * 4; + for (; number < num_points; number++) { + *bPtr++ = asinf(*aPtr++); + } } #endif /* LV_HAVE_SSE4_1 for 
unaligned */ @@ -456,13 +489,13 @@ volk_32f_asin_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int nu static inline void volk_32f_asin_32f_u_generic(float* bVector, const float* aVector, unsigned int num_points) { - float* bPtr = bVector; - const float* aPtr = aVector; - unsigned int number = 0; + float* bPtr = bVector; + const float* aPtr = aVector; + unsigned int number = 0; - for(number = 0; number < num_points; number++){ - *bPtr++ = asinf(*aPtr++); - } + for (number = 0; number < num_points; number++) { + *bPtr++ = asinf(*aPtr++); + } } #endif /* LV_HAVE_GENERIC */ diff --git a/kernels/volk/volk_32f_atan_32f.h b/kernels/volk/volk_32f_atan_32f.h index 3496f0e..6652ee8 100644 --- a/kernels/volk/volk_32f_atan_32f.h +++ b/kernels/volk/volk_32f_atan_32f.h @@ -67,11 +67,12 @@ * \endcode */ -#include -#include #include +#include +#include -/* This is the number of terms of Taylor series to evaluate, increase this for more accuracy*/ +/* This is the number of terms of Taylor series to evaluate, increase this for more + * accuracy*/ #define TERMS 2 #ifndef INCLUDED_volk_32f_atan_32f_a_H @@ -80,59 +81,63 @@ #if LV_HAVE_AVX2 && LV_HAVE_FMA #include -static inline void -volk_32f_atan_32f_a_avx2_fma(float* bVector, const float* aVector, unsigned int num_points) +static inline void volk_32f_atan_32f_a_avx2_fma(float* bVector, + const float* aVector, + unsigned int num_points) { - float* bPtr = bVector; - const float* aPtr = aVector; - - unsigned int number = 0; - unsigned int eighthPoints = num_points / 8; - int i, j; - - __m256 aVal, pio2, x, y, z, arctangent; - __m256 fzeroes, fones, ftwos, ffours, condition; - - pio2 = _mm256_set1_ps(3.14159265358979323846/2); - fzeroes = _mm256_setzero_ps(); - fones = _mm256_set1_ps(1.0); - ftwos = _mm256_set1_ps(2.0); - ffours = _mm256_set1_ps(4.0); - - for(;number < eighthPoints; number++){ - aVal = _mm256_load_ps(aPtr); - z = aVal; - condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS); - z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition)); - condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS); - x = _mm256_add_ps(z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition)); - - for(i = 0; i < 2; i++){ - x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_fmadd_ps(x,x,fones))); + float* bPtr = bVector; + const float* aPtr = aVector; + + unsigned int number = 0; + unsigned int eighthPoints = num_points / 8; + int i, j; + + __m256 aVal, pio2, x, y, z, arctangent; + __m256 fzeroes, fones, ftwos, ffours, condition; + + pio2 = _mm256_set1_ps(3.14159265358979323846 / 2); + fzeroes = _mm256_setzero_ps(); + fones = _mm256_set1_ps(1.0); + ftwos = _mm256_set1_ps(2.0); + ffours = _mm256_set1_ps(4.0); + + for (; number < eighthPoints; number++) { + aVal = _mm256_load_ps(aPtr); + z = aVal; + condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS); + z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition)); + condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS); + x = _mm256_add_ps( + z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition)); + + for (i = 0; i < 2; i++) { + x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_fmadd_ps(x, x, fones))); + } + x = _mm256_div_ps(fones, x); + y = fzeroes; + for (j = TERMS - 1; j >= 0; j--) { + y = _mm256_fmadd_ps( + y, _mm256_mul_ps(x, x), _mm256_set1_ps(pow(-1, j) / (2 * j + 1))); + } + + y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours)); + condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS); + + y = _mm256_add_ps(y, _mm256_and_ps(_mm256_fnmadd_ps(y, ftwos, pio2), condition)); + arctangent = y; + 
condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS); + arctangent = _mm256_sub_ps( + arctangent, _mm256_and_ps(_mm256_mul_ps(arctangent, ftwos), condition)); + + _mm256_store_ps(bPtr, arctangent); + aPtr += 8; + bPtr += 8; } - x = _mm256_div_ps(fones, x); - y = fzeroes; - for(j = TERMS - 1; j >=0 ; j--){ - y = _mm256_fmadd_ps(y, _mm256_mul_ps(x, x), _mm256_set1_ps(pow(-1,j)/(2*j+1))); - } - - y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours)); - condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS); - - y = _mm256_add_ps(y, _mm256_and_ps(_mm256_fnmadd_ps(y,ftwos,pio2), condition)); - arctangent = y; - condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS); - arctangent = _mm256_sub_ps(arctangent, _mm256_and_ps(_mm256_mul_ps(arctangent, ftwos), condition)); - - _mm256_store_ps(bPtr, arctangent); - aPtr += 8; - bPtr += 8; - } - number = eighthPoints * 8; - for(;number < num_points; number++){ - *bPtr++ = atan(*aPtr++); - } + number = eighthPoints * 8; + for (; number < num_points; number++) { + *bPtr++ = atan(*aPtr++); + } } #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for aligned */ @@ -144,56 +149,61 @@ volk_32f_atan_32f_a_avx2_fma(float* bVector, const float* aVector, unsigned int static inline void volk_32f_atan_32f_a_avx(float* bVector, const float* aVector, unsigned int num_points) { - float* bPtr = bVector; - const float* aPtr = aVector; - - unsigned int number = 0; - unsigned int eighthPoints = num_points / 8; - int i, j; - - __m256 aVal, pio2, x, y, z, arctangent; - __m256 fzeroes, fones, ftwos, ffours, condition; - - pio2 = _mm256_set1_ps(3.14159265358979323846/2); - fzeroes = _mm256_setzero_ps(); - fones = _mm256_set1_ps(1.0); - ftwos = _mm256_set1_ps(2.0); - ffours = _mm256_set1_ps(4.0); - - for(;number < eighthPoints; number++){ - aVal = _mm256_load_ps(aPtr); - z = aVal; - condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS); - z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition)); - condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS); - x = _mm256_add_ps(z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition)); - - for(i = 0; i < 2; i++){ - x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_add_ps(fones, _mm256_mul_ps(x, x)))); - } - x = _mm256_div_ps(fones, x); - y = fzeroes; - for(j = TERMS - 1; j >=0 ; j--){ - y = _mm256_add_ps(_mm256_mul_ps(y, _mm256_mul_ps(x, x)), _mm256_set1_ps(pow(-1,j)/(2*j+1))); + float* bPtr = bVector; + const float* aPtr = aVector; + + unsigned int number = 0; + unsigned int eighthPoints = num_points / 8; + int i, j; + + __m256 aVal, pio2, x, y, z, arctangent; + __m256 fzeroes, fones, ftwos, ffours, condition; + + pio2 = _mm256_set1_ps(3.14159265358979323846 / 2); + fzeroes = _mm256_setzero_ps(); + fones = _mm256_set1_ps(1.0); + ftwos = _mm256_set1_ps(2.0); + ffours = _mm256_set1_ps(4.0); + + for (; number < eighthPoints; number++) { + aVal = _mm256_load_ps(aPtr); + z = aVal; + condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS); + z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition)); + condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS); + x = _mm256_add_ps( + z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition)); + + for (i = 0; i < 2; i++) { + x = _mm256_add_ps(x, + _mm256_sqrt_ps(_mm256_add_ps(fones, _mm256_mul_ps(x, x)))); + } + x = _mm256_div_ps(fones, x); + y = fzeroes; + for (j = TERMS - 1; j >= 0; j--) { + y = _mm256_add_ps(_mm256_mul_ps(y, _mm256_mul_ps(x, x)), + _mm256_set1_ps(pow(-1, j) / (2 * j + 1))); + } + + y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours)); + condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS); 
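/* Illustrative sketch, not part of the patch: the effect of the TERMS macro.
 * The kernels approximate atan(t) for small t with the truncated series
 * t - t^3/3 + t^5/5 - ..., which is why they first shrink the argument
 * (reciprocal plus two half-angle steps). Increasing TERMS tightens the error,
 * as this small standalone check suggests. */
#include <math.h>
#include <stdio.h>

static float atan_series(float t, int terms)
{
    float y = 0.0f;
    int j;
    for (j = terms - 1; j >= 0; j--) /* Horner evaluation, as in the kernels */
        y = y * t * t + powf(-1.0f, (float)j) / (float)(2 * j + 1);
    return y * t;
}

int main(void)
{
    int terms;
    for (terms = 1; terms <= 4; terms++)
        printf("TERMS=%d  |error| at t=0.2: %g\n",
               terms,
               (double)fabsf(atan_series(0.2f, terms) - atanf(0.2f)));
    return 0;
}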
+ + y = _mm256_add_ps( + y, _mm256_and_ps(_mm256_sub_ps(pio2, _mm256_mul_ps(y, ftwos)), condition)); + arctangent = y; + condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS); + arctangent = _mm256_sub_ps( + arctangent, _mm256_and_ps(_mm256_mul_ps(arctangent, ftwos), condition)); + + _mm256_store_ps(bPtr, arctangent); + aPtr += 8; + bPtr += 8; } - y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours)); - condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS); - - y = _mm256_add_ps(y, _mm256_and_ps(_mm256_sub_ps(pio2, _mm256_mul_ps(y, ftwos)), condition)); - arctangent = y; - condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS); - arctangent = _mm256_sub_ps(arctangent, _mm256_and_ps(_mm256_mul_ps(arctangent, ftwos), condition)); - - _mm256_store_ps(bPtr, arctangent); - aPtr += 8; - bPtr += 8; - } - - number = eighthPoints * 8; - for(;number < num_points; number++){ - *bPtr++ = atan(*aPtr++); - } + number = eighthPoints * 8; + for (; number < num_points; number++) { + *bPtr++ = atan(*aPtr++); + } } #endif /* LV_HAVE_AVX for aligned */ @@ -204,56 +214,58 @@ volk_32f_atan_32f_a_avx(float* bVector, const float* aVector, unsigned int num_p static inline void volk_32f_atan_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int num_points) { - float* bPtr = bVector; - const float* aPtr = aVector; - - unsigned int number = 0; - unsigned int quarterPoints = num_points / 4; - int i, j; - - __m128 aVal, pio2, x, y, z, arctangent; - __m128 fzeroes, fones, ftwos, ffours, condition; - - pio2 = _mm_set1_ps(3.14159265358979323846/2); - fzeroes = _mm_setzero_ps(); - fones = _mm_set1_ps(1.0); - ftwos = _mm_set1_ps(2.0); - ffours = _mm_set1_ps(4.0); - - for(;number < quarterPoints; number++){ - aVal = _mm_load_ps(aPtr); - z = aVal; - condition = _mm_cmplt_ps(z, fzeroes); - z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition)); - condition = _mm_cmplt_ps(z, fones); - x = _mm_add_ps(z, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z), condition)); - - for(i = 0; i < 2; i++){ - x = _mm_add_ps(x, _mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x)))); - } - x = _mm_div_ps(fones, x); - y = fzeroes; - for(j = TERMS - 1; j >=0 ; j--){ - y = _mm_add_ps(_mm_mul_ps(y, _mm_mul_ps(x, x)), _mm_set1_ps(pow(-1,j)/(2*j+1))); + float* bPtr = bVector; + const float* aPtr = aVector; + + unsigned int number = 0; + unsigned int quarterPoints = num_points / 4; + int i, j; + + __m128 aVal, pio2, x, y, z, arctangent; + __m128 fzeroes, fones, ftwos, ffours, condition; + + pio2 = _mm_set1_ps(3.14159265358979323846 / 2); + fzeroes = _mm_setzero_ps(); + fones = _mm_set1_ps(1.0); + ftwos = _mm_set1_ps(2.0); + ffours = _mm_set1_ps(4.0); + + for (; number < quarterPoints; number++) { + aVal = _mm_load_ps(aPtr); + z = aVal; + condition = _mm_cmplt_ps(z, fzeroes); + z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition)); + condition = _mm_cmplt_ps(z, fones); + x = _mm_add_ps(z, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z), condition)); + + for (i = 0; i < 2; i++) { + x = _mm_add_ps(x, _mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x)))); + } + x = _mm_div_ps(fones, x); + y = fzeroes; + for (j = TERMS - 1; j >= 0; j--) { + y = _mm_add_ps(_mm_mul_ps(y, _mm_mul_ps(x, x)), + _mm_set1_ps(pow(-1, j) / (2 * j + 1))); + } + + y = _mm_mul_ps(y, _mm_mul_ps(x, ffours)); + condition = _mm_cmpgt_ps(z, fones); + + y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, ftwos)), condition)); + arctangent = y; + condition = _mm_cmplt_ps(aVal, fzeroes); + arctangent = + _mm_sub_ps(arctangent, _mm_and_ps(_mm_mul_ps(arctangent, ftwos), condition)); + + 
_mm_store_ps(bPtr, arctangent); + aPtr += 4; + bPtr += 4; } - y = _mm_mul_ps(y, _mm_mul_ps(x, ffours)); - condition = _mm_cmpgt_ps(z, fones); - - y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, ftwos)), condition)); - arctangent = y; - condition = _mm_cmplt_ps(aVal, fzeroes); - arctangent = _mm_sub_ps(arctangent, _mm_and_ps(_mm_mul_ps(arctangent, ftwos), condition)); - - _mm_store_ps(bPtr, arctangent); - aPtr += 4; - bPtr += 4; - } - - number = quarterPoints * 4; - for(;number < num_points; number++){ - *bPtr++ = atanf(*aPtr++); - } + number = quarterPoints * 4; + for (; number < num_points; number++) { + *bPtr++ = atanf(*aPtr++); + } } #endif /* LV_HAVE_SSE4_1 for aligned */ @@ -266,59 +278,63 @@ volk_32f_atan_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int nu #if LV_HAVE_AVX2 && LV_HAVE_FMA #include -static inline void -volk_32f_atan_32f_u_avx2_fma(float* bVector, const float* aVector, unsigned int num_points) +static inline void volk_32f_atan_32f_u_avx2_fma(float* bVector, + const float* aVector, + unsigned int num_points) { - float* bPtr = bVector; - const float* aPtr = aVector; - - unsigned int number = 0; - unsigned int eighthPoints = num_points / 8; - int i, j; - - __m256 aVal, pio2, x, y, z, arctangent; - __m256 fzeroes, fones, ftwos, ffours, condition; - - pio2 = _mm256_set1_ps(3.14159265358979323846/2); - fzeroes = _mm256_setzero_ps(); - fones = _mm256_set1_ps(1.0); - ftwos = _mm256_set1_ps(2.0); - ffours = _mm256_set1_ps(4.0); - - for(;number < eighthPoints; number++){ - aVal = _mm256_loadu_ps(aPtr); - z = aVal; - condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS); - z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition)); - condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS); - x = _mm256_add_ps(z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition)); - - for(i = 0; i < 2; i++){ - x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_fmadd_ps(x,x,fones))); + float* bPtr = bVector; + const float* aPtr = aVector; + + unsigned int number = 0; + unsigned int eighthPoints = num_points / 8; + int i, j; + + __m256 aVal, pio2, x, y, z, arctangent; + __m256 fzeroes, fones, ftwos, ffours, condition; + + pio2 = _mm256_set1_ps(3.14159265358979323846 / 2); + fzeroes = _mm256_setzero_ps(); + fones = _mm256_set1_ps(1.0); + ftwos = _mm256_set1_ps(2.0); + ffours = _mm256_set1_ps(4.0); + + for (; number < eighthPoints; number++) { + aVal = _mm256_loadu_ps(aPtr); + z = aVal; + condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS); + z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition)); + condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS); + x = _mm256_add_ps( + z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition)); + + for (i = 0; i < 2; i++) { + x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_fmadd_ps(x, x, fones))); + } + x = _mm256_div_ps(fones, x); + y = fzeroes; + for (j = TERMS - 1; j >= 0; j--) { + y = _mm256_fmadd_ps( + y, _mm256_mul_ps(x, x), _mm256_set1_ps(pow(-1, j) / (2 * j + 1))); + } + + y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours)); + condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS); + + y = _mm256_add_ps(y, _mm256_and_ps(_mm256_fnmadd_ps(y, ftwos, pio2), condition)); + arctangent = y; + condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS); + arctangent = _mm256_sub_ps( + arctangent, _mm256_and_ps(_mm256_mul_ps(arctangent, ftwos), condition)); + + _mm256_storeu_ps(bPtr, arctangent); + aPtr += 8; + bPtr += 8; } - x = _mm256_div_ps(fones, x); - y = fzeroes; - for(j = TERMS - 1; j >=0 ; j--){ - y = _mm256_fmadd_ps(y, 
_mm256_mul_ps(x, x), _mm256_set1_ps(pow(-1,j)/(2*j+1))); - } - - y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours)); - condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS); - y = _mm256_add_ps(y, _mm256_and_ps(_mm256_fnmadd_ps(y,ftwos,pio2), condition)); - arctangent = y; - condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS); - arctangent = _mm256_sub_ps(arctangent, _mm256_and_ps(_mm256_mul_ps(arctangent, ftwos), condition)); - - _mm256_storeu_ps(bPtr, arctangent); - aPtr += 8; - bPtr += 8; - } - - number = eighthPoints * 8; - for(;number < num_points; number++){ - *bPtr++ = atan(*aPtr++); - } + number = eighthPoints * 8; + for (; number < num_points; number++) { + *bPtr++ = atan(*aPtr++); + } } #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for unaligned */ @@ -330,56 +346,61 @@ volk_32f_atan_32f_u_avx2_fma(float* bVector, const float* aVector, unsigned int static inline void volk_32f_atan_32f_u_avx(float* bVector, const float* aVector, unsigned int num_points) { - float* bPtr = bVector; - const float* aPtr = aVector; - - unsigned int number = 0; - unsigned int eighthPoints = num_points / 8; - int i, j; - - __m256 aVal, pio2, x, y, z, arctangent; - __m256 fzeroes, fones, ftwos, ffours, condition; - - pio2 = _mm256_set1_ps(3.14159265358979323846/2); - fzeroes = _mm256_setzero_ps(); - fones = _mm256_set1_ps(1.0); - ftwos = _mm256_set1_ps(2.0); - ffours = _mm256_set1_ps(4.0); - - for(;number < eighthPoints; number++){ - aVal = _mm256_loadu_ps(aPtr); - z = aVal; - condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS); - z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition)); - condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS); - x = _mm256_add_ps(z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition)); - - for(i = 0; i < 2; i++){ - x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_add_ps(fones, _mm256_mul_ps(x, x)))); - } - x = _mm256_div_ps(fones, x); - y = fzeroes; - for(j = TERMS - 1; j >=0 ; j--){ - y = _mm256_add_ps(_mm256_mul_ps(y, _mm256_mul_ps(x, x)), _mm256_set1_ps(pow(-1,j)/(2*j+1))); + float* bPtr = bVector; + const float* aPtr = aVector; + + unsigned int number = 0; + unsigned int eighthPoints = num_points / 8; + int i, j; + + __m256 aVal, pio2, x, y, z, arctangent; + __m256 fzeroes, fones, ftwos, ffours, condition; + + pio2 = _mm256_set1_ps(3.14159265358979323846 / 2); + fzeroes = _mm256_setzero_ps(); + fones = _mm256_set1_ps(1.0); + ftwos = _mm256_set1_ps(2.0); + ffours = _mm256_set1_ps(4.0); + + for (; number < eighthPoints; number++) { + aVal = _mm256_loadu_ps(aPtr); + z = aVal; + condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS); + z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition)); + condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS); + x = _mm256_add_ps( + z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition)); + + for (i = 0; i < 2; i++) { + x = _mm256_add_ps(x, + _mm256_sqrt_ps(_mm256_add_ps(fones, _mm256_mul_ps(x, x)))); + } + x = _mm256_div_ps(fones, x); + y = fzeroes; + for (j = TERMS - 1; j >= 0; j--) { + y = _mm256_add_ps(_mm256_mul_ps(y, _mm256_mul_ps(x, x)), + _mm256_set1_ps(pow(-1, j) / (2 * j + 1))); + } + + y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours)); + condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS); + + y = _mm256_add_ps( + y, _mm256_and_ps(_mm256_sub_ps(pio2, _mm256_mul_ps(y, ftwos)), condition)); + arctangent = y; + condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS); + arctangent = _mm256_sub_ps( + arctangent, _mm256_and_ps(_mm256_mul_ps(arctangent, ftwos), condition)); + + _mm256_storeu_ps(bPtr, arctangent); + aPtr 
+= 8; + bPtr += 8; } - y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours)); - condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS); - - y = _mm256_add_ps(y, _mm256_and_ps(_mm256_sub_ps(pio2, _mm256_mul_ps(y, ftwos)), condition)); - arctangent = y; - condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS); - arctangent = _mm256_sub_ps(arctangent, _mm256_and_ps(_mm256_mul_ps(arctangent, ftwos), condition)); - - _mm256_storeu_ps(bPtr, arctangent); - aPtr += 8; - bPtr += 8; - } - - number = eighthPoints * 8; - for(;number < num_points; number++){ - *bPtr++ = atan(*aPtr++); - } + number = eighthPoints * 8; + for (; number < num_points; number++) { + *bPtr++ = atan(*aPtr++); + } } #endif /* LV_HAVE_AVX for unaligned */ @@ -390,54 +411,56 @@ volk_32f_atan_32f_u_avx(float* bVector, const float* aVector, unsigned int num_p static inline void volk_32f_atan_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int num_points) { - float* bPtr = bVector; - const float* aPtr = aVector; - - unsigned int number = 0; - unsigned int quarterPoints = num_points / 4; - int i, j; - - __m128 aVal, pio2, x, y, z, arctangent; - __m128 fzeroes, fones, ftwos, ffours, condition; - - pio2 = _mm_set1_ps(3.14159265358979323846/2); - fzeroes = _mm_setzero_ps(); - fones = _mm_set1_ps(1.0); - ftwos = _mm_set1_ps(2.0); - ffours = _mm_set1_ps(4.0); - - for(;number < quarterPoints; number++){ - aVal = _mm_loadu_ps(aPtr); - z = aVal; - condition = _mm_cmplt_ps(z, fzeroes); - z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition)); - condition = _mm_cmplt_ps(z, fones); - x = _mm_add_ps(z, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z), condition)); - - for(i = 0; i < 2; i++) - x = _mm_add_ps(x, _mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x)))); - x = _mm_div_ps(fones, x); - y = fzeroes; - for(j = TERMS - 1; j >= 0; j--) - y = _mm_add_ps(_mm_mul_ps(y, _mm_mul_ps(x, x)), _mm_set1_ps(pow(-1,j)/(2*j+1))); - - y = _mm_mul_ps(y, _mm_mul_ps(x, ffours)); - condition = _mm_cmpgt_ps(z, fones); - - y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, ftwos)), condition)); - arctangent = y; - condition = _mm_cmplt_ps(aVal, fzeroes); - arctangent = _mm_sub_ps(arctangent, _mm_and_ps(_mm_mul_ps(arctangent, ftwos), condition)); - - _mm_storeu_ps(bPtr, arctangent); - aPtr += 4; - bPtr += 4; - } - - number = quarterPoints * 4; - for(;number < num_points; number++){ - *bPtr++ = atanf(*aPtr++); - } + float* bPtr = bVector; + const float* aPtr = aVector; + + unsigned int number = 0; + unsigned int quarterPoints = num_points / 4; + int i, j; + + __m128 aVal, pio2, x, y, z, arctangent; + __m128 fzeroes, fones, ftwos, ffours, condition; + + pio2 = _mm_set1_ps(3.14159265358979323846 / 2); + fzeroes = _mm_setzero_ps(); + fones = _mm_set1_ps(1.0); + ftwos = _mm_set1_ps(2.0); + ffours = _mm_set1_ps(4.0); + + for (; number < quarterPoints; number++) { + aVal = _mm_loadu_ps(aPtr); + z = aVal; + condition = _mm_cmplt_ps(z, fzeroes); + z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition)); + condition = _mm_cmplt_ps(z, fones); + x = _mm_add_ps(z, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z), condition)); + + for (i = 0; i < 2; i++) + x = _mm_add_ps(x, _mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x)))); + x = _mm_div_ps(fones, x); + y = fzeroes; + for (j = TERMS - 1; j >= 0; j--) + y = _mm_add_ps(_mm_mul_ps(y, _mm_mul_ps(x, x)), + _mm_set1_ps(pow(-1, j) / (2 * j + 1))); + + y = _mm_mul_ps(y, _mm_mul_ps(x, ffours)); + condition = _mm_cmpgt_ps(z, fones); + + y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, ftwos)), condition)); + 
arctangent = y; + condition = _mm_cmplt_ps(aVal, fzeroes); + arctangent = + _mm_sub_ps(arctangent, _mm_and_ps(_mm_mul_ps(arctangent, ftwos), condition)); + + _mm_storeu_ps(bPtr, arctangent); + aPtr += 4; + bPtr += 4; + } + + number = quarterPoints * 4; + for (; number < num_points; number++) { + *bPtr++ = atanf(*aPtr++); + } } #endif /* LV_HAVE_SSE4_1 for unaligned */ @@ -447,13 +470,13 @@ volk_32f_atan_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int nu static inline void volk_32f_atan_32f_generic(float* bVector, const float* aVector, unsigned int num_points) { - float* bPtr = bVector; - const float* aPtr = aVector; - unsigned int number = 0; + float* bPtr = bVector; + const float* aPtr = aVector; + unsigned int number = 0; - for(number = 0; number < num_points; number++){ - *bPtr++ = atanf(*aPtr++); - } + for (number = 0; number < num_points; number++) { + *bPtr++ = atanf(*aPtr++); + } } #endif /* LV_HAVE_GENERIC */ diff --git a/kernels/volk/volk_32f_binary_slicer_32i.h b/kernels/volk/volk_32f_binary_slicer_32i.h index c56ff8f..635d0c3 100644 --- a/kernels/volk/volk_32f_binary_slicer_32i.h +++ b/kernels/volk/volk_32f_binary_slicer_32i.h @@ -30,8 +30,8 @@ * * Dispatcher Prototype * \code - * void volk_32f_binary_slicer_32i(int* cVector, const float* aVector, unsigned int num_points) - * \endcode + * void volk_32f_binary_slicer_32i(int* cVector, const float* aVector, unsigned int + * num_points) \endcode * * \b Inputs * \li aVector: The input vector of floats. @@ -73,37 +73,38 @@ #ifdef LV_HAVE_GENERIC -static inline void -volk_32f_binary_slicer_32i_generic(int* cVector, const float* aVector, unsigned int num_points) +static inline void volk_32f_binary_slicer_32i_generic(int* cVector, + const float* aVector, + unsigned int num_points) { - int* cPtr = cVector; - const float* aPtr = aVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - if( *aPtr++ >= 0) { - *cPtr++ = 1; - } - else { - *cPtr++ = 0; + int* cPtr = cVector; + const float* aPtr = aVector; + unsigned int number = 0; + + for (number = 0; number < num_points; number++) { + if (*aPtr++ >= 0) { + *cPtr++ = 1; + } else { + *cPtr++ = 0; + } } - } } #endif /* LV_HAVE_GENERIC */ #ifdef LV_HAVE_GENERIC -static inline void -volk_32f_binary_slicer_32i_generic_branchless(int* cVector, const float* aVector, unsigned int num_points) +static inline void volk_32f_binary_slicer_32i_generic_branchless(int* cVector, + const float* aVector, + unsigned int num_points) { - int* cPtr = cVector; - const float* aPtr = aVector; - unsigned int number = 0; + int* cPtr = cVector; + const float* aPtr = aVector; + unsigned int number = 0; - for(number = 0; number < num_points; number++){ - *cPtr++ = (*aPtr++ >= 0); - } + for (number = 0; number < num_points; number++) { + *cPtr++ = (*aPtr++ >= 0); + } } #endif /* LV_HAVE_GENERIC */ @@ -111,40 +112,40 @@ volk_32f_binary_slicer_32i_generic_branchless(int* cVector, const float* aVector #ifdef LV_HAVE_SSE2 #include -static inline void -volk_32f_binary_slicer_32i_a_sse2(int* cVector, const float* aVector, unsigned int num_points) +static inline void volk_32f_binary_slicer_32i_a_sse2(int* cVector, + const float* aVector, + unsigned int num_points) { - int* cPtr = cVector; - const float* aPtr = aVector; - unsigned int number = 0; - - unsigned int quarter_points = num_points / 4; - __m128 a_val, res_f; - __m128i res_i, binary_i; - __m128 zero_val; - zero_val = _mm_set1_ps (0.0f); + int* cPtr = cVector; + const float* aPtr = aVector; + unsigned int number = 0; - 
for(number = 0; number < quarter_points; number++){ - a_val = _mm_load_ps(aPtr); + unsigned int quarter_points = num_points / 4; + __m128 a_val, res_f; + __m128i res_i, binary_i; + __m128 zero_val; + zero_val = _mm_set1_ps(0.0f); - res_f = _mm_cmpge_ps (a_val, zero_val); - res_i = _mm_cvtps_epi32 (res_f); - binary_i = _mm_srli_epi32 (res_i, 31); + for (number = 0; number < quarter_points; number++) { + a_val = _mm_load_ps(aPtr); - _mm_store_si128((__m128i*)cPtr, binary_i); + res_f = _mm_cmpge_ps(a_val, zero_val); + res_i = _mm_cvtps_epi32(res_f); + binary_i = _mm_srli_epi32(res_i, 31); - cPtr += 4; - aPtr += 4; - } + _mm_store_si128((__m128i*)cPtr, binary_i); - for(number = quarter_points * 4; number < num_points; number++){ - if( *aPtr++ >= 0) { - *cPtr++ = 1; + cPtr += 4; + aPtr += 4; } - else { - *cPtr++ = 0; + + for (number = quarter_points * 4; number < num_points; number++) { + if (*aPtr++ >= 0) { + *cPtr++ = 1; + } else { + *cPtr++ = 0; + } } - } } #endif /* LV_HAVE_SSE2 */ @@ -152,41 +153,41 @@ volk_32f_binary_slicer_32i_a_sse2(int* cVector, const float* aVector, unsigned i #ifdef LV_HAVE_AVX #include -static inline void -volk_32f_binary_slicer_32i_a_avx(int* cVector, const float* aVector, unsigned int num_points) +static inline void volk_32f_binary_slicer_32i_a_avx(int* cVector, + const float* aVector, + unsigned int num_points) { - int* cPtr = cVector; - const float* aPtr = aVector; - unsigned int number = 0; + int* cPtr = cVector; + const float* aPtr = aVector; + unsigned int number = 0; - unsigned int quarter_points = num_points / 8; - __m256 a_val, res_f, binary_f; - __m256i binary_i; - __m256 zero_val, one_val; - zero_val = _mm256_set1_ps (0.0f); - one_val = _mm256_set1_ps (1.0f); + unsigned int quarter_points = num_points / 8; + __m256 a_val, res_f, binary_f; + __m256i binary_i; + __m256 zero_val, one_val; + zero_val = _mm256_set1_ps(0.0f); + one_val = _mm256_set1_ps(1.0f); - for(number = 0; number < quarter_points; number++){ - a_val = _mm256_load_ps(aPtr); + for (number = 0; number < quarter_points; number++) { + a_val = _mm256_load_ps(aPtr); - res_f = _mm256_cmp_ps (a_val, zero_val, _CMP_GE_OS); - binary_f = _mm256_and_ps (res_f, one_val); - binary_i = _mm256_cvtps_epi32(binary_f); + res_f = _mm256_cmp_ps(a_val, zero_val, _CMP_GE_OS); + binary_f = _mm256_and_ps(res_f, one_val); + binary_i = _mm256_cvtps_epi32(binary_f); - _mm256_store_si256((__m256i *)cPtr, binary_i); + _mm256_store_si256((__m256i*)cPtr, binary_i); - cPtr += 8; - aPtr += 8; - } - - for(number = quarter_points * 8; number < num_points; number++){ - if( *aPtr++ >= 0) { - *cPtr++ = 1; + cPtr += 8; + aPtr += 8; } - else { - *cPtr++ = 0; + + for (number = quarter_points * 8; number < num_points; number++) { + if (*aPtr++ >= 0) { + *cPtr++ = 1; + } else { + *cPtr++ = 0; + } } - } } #endif /* LV_HAVE_AVX */ @@ -194,40 +195,40 @@ volk_32f_binary_slicer_32i_a_avx(int* cVector, const float* aVector, unsigned in #ifdef LV_HAVE_SSE2 #include -static inline void -volk_32f_binary_slicer_32i_u_sse2(int* cVector, const float* aVector, unsigned int num_points) +static inline void volk_32f_binary_slicer_32i_u_sse2(int* cVector, + const float* aVector, + unsigned int num_points) { - int* cPtr = cVector; - const float* aPtr = aVector; - unsigned int number = 0; - - unsigned int quarter_points = num_points / 4; - __m128 a_val, res_f; - __m128i res_i, binary_i; - __m128 zero_val; - zero_val = _mm_set1_ps (0.0f); + int* cPtr = cVector; + const float* aPtr = aVector; + unsigned int number = 0; - for(number = 0; number < 
quarter_points; number++){ - a_val = _mm_loadu_ps(aPtr); + unsigned int quarter_points = num_points / 4; + __m128 a_val, res_f; + __m128i res_i, binary_i; + __m128 zero_val; + zero_val = _mm_set1_ps(0.0f); - res_f = _mm_cmpge_ps (a_val, zero_val); - res_i = _mm_cvtps_epi32 (res_f); - binary_i = _mm_srli_epi32 (res_i, 31); + for (number = 0; number < quarter_points; number++) { + a_val = _mm_loadu_ps(aPtr); - _mm_storeu_si128((__m128i*)cPtr, binary_i); + res_f = _mm_cmpge_ps(a_val, zero_val); + res_i = _mm_cvtps_epi32(res_f); + binary_i = _mm_srli_epi32(res_i, 31); - cPtr += 4; - aPtr += 4; - } + _mm_storeu_si128((__m128i*)cPtr, binary_i); - for(number = quarter_points * 4; number < num_points; number++){ - if( *aPtr++ >= 0) { - *cPtr++ = 1; + cPtr += 4; + aPtr += 4; } - else { - *cPtr++ = 0; + + for (number = quarter_points * 4; number < num_points; number++) { + if (*aPtr++ >= 0) { + *cPtr++ = 1; + } else { + *cPtr++ = 0; + } } - } } #endif /* LV_HAVE_SSE2 */ @@ -235,41 +236,41 @@ volk_32f_binary_slicer_32i_u_sse2(int* cVector, const float* aVector, unsigned i #ifdef LV_HAVE_AVX #include -static inline void -volk_32f_binary_slicer_32i_u_avx(int* cVector, const float* aVector, unsigned int num_points) +static inline void volk_32f_binary_slicer_32i_u_avx(int* cVector, + const float* aVector, + unsigned int num_points) { - int* cPtr = cVector; - const float* aPtr = aVector; - unsigned int number = 0; - - unsigned int quarter_points = num_points / 8; - __m256 a_val, res_f, binary_f; - __m256i binary_i; - __m256 zero_val, one_val; - zero_val = _mm256_set1_ps (0.0f); - one_val = _mm256_set1_ps (1.0f); + int* cPtr = cVector; + const float* aPtr = aVector; + unsigned int number = 0; - for(number = 0; number < quarter_points; number++){ - a_val = _mm256_loadu_ps(aPtr); + unsigned int quarter_points = num_points / 8; + __m256 a_val, res_f, binary_f; + __m256i binary_i; + __m256 zero_val, one_val; + zero_val = _mm256_set1_ps(0.0f); + one_val = _mm256_set1_ps(1.0f); - res_f = _mm256_cmp_ps (a_val, zero_val, _CMP_GE_OS); - binary_f = _mm256_and_ps (res_f, one_val); - binary_i = _mm256_cvtps_epi32(binary_f); + for (number = 0; number < quarter_points; number++) { + a_val = _mm256_loadu_ps(aPtr); - _mm256_storeu_si256((__m256i*)cPtr, binary_i); + res_f = _mm256_cmp_ps(a_val, zero_val, _CMP_GE_OS); + binary_f = _mm256_and_ps(res_f, one_val); + binary_i = _mm256_cvtps_epi32(binary_f); - cPtr += 8; - aPtr += 8; - } + _mm256_storeu_si256((__m256i*)cPtr, binary_i); - for(number = quarter_points * 8; number < num_points; number++){ - if( *aPtr++ >= 0) { - *cPtr++ = 1; + cPtr += 8; + aPtr += 8; } - else { - *cPtr++ = 0; + + for (number = quarter_points * 8; number < num_points; number++) { + if (*aPtr++ >= 0) { + *cPtr++ = 1; + } else { + *cPtr++ = 0; + } } - } } #endif /* LV_HAVE_AVX */ diff --git a/kernels/volk/volk_32f_binary_slicer_8i.h b/kernels/volk/volk_32f_binary_slicer_8i.h index 5920621..3eddb5c 100644 --- a/kernels/volk/volk_32f_binary_slicer_8i.h +++ b/kernels/volk/volk_32f_binary_slicer_8i.h @@ -30,7 +30,8 @@ * * Dispatcher Prototype * \code - * void volk_32f_binary_slicer_8i(int8_t* cVector, const float* aVector, unsigned int num_points) + * void volk_32f_binary_slicer_8i(int8_t* cVector, const float* aVector, unsigned int + num_points) * \endcode * * \b Inputs @@ -74,39 +75,38 @@ #ifdef LV_HAVE_GENERIC -static inline void -volk_32f_binary_slicer_8i_generic(int8_t* cVector, const float* aVector, - unsigned int num_points) +static inline void volk_32f_binary_slicer_8i_generic(int8_t* cVector, + 
const float* aVector, + unsigned int num_points) { - int8_t* cPtr = cVector; - const float* aPtr = aVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++) { - if(*aPtr++ >= 0) { - *cPtr++ = 1; - } - else { - *cPtr++ = 0; + int8_t* cPtr = cVector; + const float* aPtr = aVector; + unsigned int number = 0; + + for (number = 0; number < num_points; number++) { + if (*aPtr++ >= 0) { + *cPtr++ = 1; + } else { + *cPtr++ = 0; + } } - } } #endif /* LV_HAVE_GENERIC */ #ifdef LV_HAVE_GENERIC -static inline void -volk_32f_binary_slicer_8i_generic_branchless(int8_t* cVector, const float* aVector, - unsigned int num_points) +static inline void volk_32f_binary_slicer_8i_generic_branchless(int8_t* cVector, + const float* aVector, + unsigned int num_points) { - int8_t* cPtr = cVector; - const float* aPtr = aVector; - unsigned int number = 0; + int8_t* cPtr = cVector; + const float* aPtr = aVector; + unsigned int number = 0; - for(number = 0; number < num_points; number++){ - *cPtr++ = (*aPtr++ >= 0); - } + for (number = 0; number < num_points; number++) { + *cPtr++ = (*aPtr++ >= 0); + } } #endif /* LV_HAVE_GENERIC */ @@ -114,279 +114,329 @@ volk_32f_binary_slicer_8i_generic_branchless(int8_t* cVector, const float* aVect #ifdef LV_HAVE_AVX2 #include -static inline void -volk_32f_binary_slicer_8i_a_avx2(int8_t* cVector, const float* aVector, - unsigned int num_points) +static inline void volk_32f_binary_slicer_8i_a_avx2(int8_t* cVector, + const float* aVector, + unsigned int num_points) { - int8_t* cPtr = cVector; - const float* aPtr = aVector; - unsigned int number = 0; - unsigned int n32points = num_points / 32; - - const __m256 zero_val = _mm256_set1_ps(0.0f); - __m256 a0_val, a1_val, a2_val, a3_val; - __m256 res0_f, res1_f, res2_f, res3_f; - __m256i res0_i, res1_i, res2_i, res3_i; - __m256i byte_shuffle = _mm256_set_epi8( 15, 14, 13, 12, 7, 6, 5, 4, - 11, 10, 9, 8, 3, 2, 1, 0, - 15, 14, 13, 12, 7, 6, 5, 4, - 11, 10, 9, 8, 3, 2, 1, 0); - - for(number = 0; number < n32points; number++) { - a0_val = _mm256_load_ps(aPtr); - a1_val = _mm256_load_ps(aPtr+8); - a2_val = _mm256_load_ps(aPtr+16); - a3_val = _mm256_load_ps(aPtr+24); - - // compare >= 0; return float - res0_f = _mm256_cmp_ps(a0_val, zero_val, _CMP_GE_OS); - res1_f = _mm256_cmp_ps(a1_val, zero_val, _CMP_GE_OS); - res2_f = _mm256_cmp_ps(a2_val, zero_val, _CMP_GE_OS); - res3_f = _mm256_cmp_ps(a3_val, zero_val, _CMP_GE_OS); - - // convert to 32i and >> 31 - res0_i = _mm256_srli_epi32(_mm256_cvtps_epi32(res0_f), 31); - res1_i = _mm256_srli_epi32(_mm256_cvtps_epi32(res1_f), 31); - res2_i = _mm256_srli_epi32(_mm256_cvtps_epi32(res2_f), 31); - res3_i = _mm256_srli_epi32(_mm256_cvtps_epi32(res3_f), 31); - - // pack in to 16-bit results - res0_i = _mm256_packs_epi32(res0_i, res1_i); - res2_i = _mm256_packs_epi32(res2_i, res3_i); - // pack in to 8-bit results - // res0: (after packs_epi32) - // a0, a1, a2, a3, b0, b1, b2, b3, a4, a5, a6, a7, b4, b5, b6, b7 - // res2: - // c0, c1, c2, c3, d0, d1, d2, d3, c4, c5, c6, c7, d4, d5, d6, d7 - res0_i = _mm256_packs_epi16(res0_i, res2_i); - // shuffle the lanes - // res0: (after packs_epi16) - // a0, a1, a2, a3, b0, b1, b2, b3, c0, c1, c2, c3, d0, d1, d2, d3 - // a4, a5, a6, a7, b4, b5, b6, b7, c4, c5, c6, c7, d4, d5, d6, d7 - // 0, 2, 1, 3 -> 11 01 10 00 (0xd8) - res0_i = _mm256_permute4x64_epi64(res0_i, 0xd8); - - // shuffle bytes within lanes - // res0: (after shuffle_epi8) - // a0, a1, a2, a3, b0, b1, b2, b3, a4, a5, a6, a7, b4, b5, b6, b7 - // c0, c1, c2, c3, d0, d1, d2, d3, c4, c5, 
c6, c7, d4, d5, d6, d7 - res0_i = _mm256_shuffle_epi8(res0_i, byte_shuffle); - - _mm256_store_si256((__m256i*)cPtr, res0_i); - aPtr += 32; - cPtr += 32; - } - - for(number = n32points * 32; number < num_points; number++) { - if( *aPtr++ >= 0) { - *cPtr++ = 1; + int8_t* cPtr = cVector; + const float* aPtr = aVector; + unsigned int number = 0; + unsigned int n32points = num_points / 32; + + const __m256 zero_val = _mm256_set1_ps(0.0f); + __m256 a0_val, a1_val, a2_val, a3_val; + __m256 res0_f, res1_f, res2_f, res3_f; + __m256i res0_i, res1_i, res2_i, res3_i; + __m256i byte_shuffle = _mm256_set_epi8(15, + 14, + 13, + 12, + 7, + 6, + 5, + 4, + 11, + 10, + 9, + 8, + 3, + 2, + 1, + 0, + 15, + 14, + 13, + 12, + 7, + 6, + 5, + 4, + 11, + 10, + 9, + 8, + 3, + 2, + 1, + 0); + + for (number = 0; number < n32points; number++) { + a0_val = _mm256_load_ps(aPtr); + a1_val = _mm256_load_ps(aPtr + 8); + a2_val = _mm256_load_ps(aPtr + 16); + a3_val = _mm256_load_ps(aPtr + 24); + + // compare >= 0; return float + res0_f = _mm256_cmp_ps(a0_val, zero_val, _CMP_GE_OS); + res1_f = _mm256_cmp_ps(a1_val, zero_val, _CMP_GE_OS); + res2_f = _mm256_cmp_ps(a2_val, zero_val, _CMP_GE_OS); + res3_f = _mm256_cmp_ps(a3_val, zero_val, _CMP_GE_OS); + + // convert to 32i and >> 31 + res0_i = _mm256_srli_epi32(_mm256_cvtps_epi32(res0_f), 31); + res1_i = _mm256_srli_epi32(_mm256_cvtps_epi32(res1_f), 31); + res2_i = _mm256_srli_epi32(_mm256_cvtps_epi32(res2_f), 31); + res3_i = _mm256_srli_epi32(_mm256_cvtps_epi32(res3_f), 31); + + // pack in to 16-bit results + res0_i = _mm256_packs_epi32(res0_i, res1_i); + res2_i = _mm256_packs_epi32(res2_i, res3_i); + // pack in to 8-bit results + // res0: (after packs_epi32) + // a0, a1, a2, a3, b0, b1, b2, b3, a4, a5, a6, a7, b4, b5, b6, b7 + // res2: + // c0, c1, c2, c3, d0, d1, d2, d3, c4, c5, c6, c7, d4, d5, d6, d7 + res0_i = _mm256_packs_epi16(res0_i, res2_i); + // shuffle the lanes + // res0: (after packs_epi16) + // a0, a1, a2, a3, b0, b1, b2, b3, c0, c1, c2, c3, d0, d1, d2, d3 + // a4, a5, a6, a7, b4, b5, b6, b7, c4, c5, c6, c7, d4, d5, d6, d7 + // 0, 2, 1, 3 -> 11 01 10 00 (0xd8) + res0_i = _mm256_permute4x64_epi64(res0_i, 0xd8); + + // shuffle bytes within lanes + // res0: (after shuffle_epi8) + // a0, a1, a2, a3, b0, b1, b2, b3, a4, a5, a6, a7, b4, b5, b6, b7 + // c0, c1, c2, c3, d0, d1, d2, d3, c4, c5, c6, c7, d4, d5, d6, d7 + res0_i = _mm256_shuffle_epi8(res0_i, byte_shuffle); + + _mm256_store_si256((__m256i*)cPtr, res0_i); + aPtr += 32; + cPtr += 32; } - else { - *cPtr++ = 0; + + for (number = n32points * 32; number < num_points; number++) { + if (*aPtr++ >= 0) { + *cPtr++ = 1; + } else { + *cPtr++ = 0; + } } - } } #endif #ifdef LV_HAVE_AVX2 #include -static inline void -volk_32f_binary_slicer_8i_u_avx2(int8_t* cVector, const float* aVector, - unsigned int num_points) +static inline void volk_32f_binary_slicer_8i_u_avx2(int8_t* cVector, + const float* aVector, + unsigned int num_points) { - int8_t* cPtr = cVector; - const float* aPtr = aVector; - unsigned int number = 0; - unsigned int n32points = num_points / 32; - - const __m256 zero_val = _mm256_set1_ps(0.0f); - __m256 a0_val, a1_val, a2_val, a3_val; - __m256 res0_f, res1_f, res2_f, res3_f; - __m256i res0_i, res1_i, res2_i, res3_i; - __m256i byte_shuffle = _mm256_set_epi8( 15, 14, 13, 12, 7, 6, 5, 4, - 11, 10, 9, 8, 3, 2, 1, 0, - 15, 14, 13, 12, 7, 6, 5, 4, - 11, 10, 9, 8, 3, 2, 1, 0); - - for(number = 0; number < n32points; number++) { - a0_val = _mm256_loadu_ps(aPtr); - a1_val = _mm256_loadu_ps(aPtr+8); - a2_val = 
_mm256_loadu_ps(aPtr+16); - a3_val = _mm256_loadu_ps(aPtr+24); - - // compare >= 0; return float - res0_f = _mm256_cmp_ps(a0_val, zero_val, _CMP_GE_OS); - res1_f = _mm256_cmp_ps(a1_val, zero_val, _CMP_GE_OS); - res2_f = _mm256_cmp_ps(a2_val, zero_val, _CMP_GE_OS); - res3_f = _mm256_cmp_ps(a3_val, zero_val, _CMP_GE_OS); - - // convert to 32i and >> 31 - res0_i = _mm256_srli_epi32(_mm256_cvtps_epi32(res0_f), 31); - res1_i = _mm256_srli_epi32(_mm256_cvtps_epi32(res1_f), 31); - res2_i = _mm256_srli_epi32(_mm256_cvtps_epi32(res2_f), 31); - res3_i = _mm256_srli_epi32(_mm256_cvtps_epi32(res3_f), 31); - - // pack in to 16-bit results - res0_i = _mm256_packs_epi32(res0_i, res1_i); - res2_i = _mm256_packs_epi32(res2_i, res3_i); - // pack in to 8-bit results - // res0: (after packs_epi32) - // a0, a1, a2, a3, b0, b1, b2, b3, a4, a5, a6, a7, b4, b5, b6, b7 - // res2: - // c0, c1, c2, c3, d0, d1, d2, d3, c4, c5, c6, c7, d4, d5, d6, d7 - res0_i = _mm256_packs_epi16(res0_i, res2_i); - // shuffle the lanes - // res0: (after packs_epi16) - // a0, a1, a2, a3, b0, b1, b2, b3, c0, c1, c2, c3, d0, d1, d2, d3 - // a4, a5, a6, a7, b4, b5, b6, b7, c4, c5, c6, c7, d4, d5, d6, d7 - // 0, 2, 1, 3 -> 11 01 10 00 (0xd8) - res0_i = _mm256_permute4x64_epi64(res0_i, 0xd8); - - // shuffle bytes within lanes - // res0: (after shuffle_epi8) - // a0, a1, a2, a3, b0, b1, b2, b3, a4, a5, a6, a7, b4, b5, b6, b7 - // c0, c1, c2, c3, d0, d1, d2, d3, c4, c5, c6, c7, d4, d5, d6, d7 - res0_i = _mm256_shuffle_epi8(res0_i, byte_shuffle); - - _mm256_storeu_si256((__m256i*)cPtr, res0_i); - aPtr += 32; - cPtr += 32; - } - - for(number = n32points * 32; number < num_points; number++) { - if( *aPtr++ >= 0) { - *cPtr++ = 1; + int8_t* cPtr = cVector; + const float* aPtr = aVector; + unsigned int number = 0; + unsigned int n32points = num_points / 32; + + const __m256 zero_val = _mm256_set1_ps(0.0f); + __m256 a0_val, a1_val, a2_val, a3_val; + __m256 res0_f, res1_f, res2_f, res3_f; + __m256i res0_i, res1_i, res2_i, res3_i; + __m256i byte_shuffle = _mm256_set_epi8(15, + 14, + 13, + 12, + 7, + 6, + 5, + 4, + 11, + 10, + 9, + 8, + 3, + 2, + 1, + 0, + 15, + 14, + 13, + 12, + 7, + 6, + 5, + 4, + 11, + 10, + 9, + 8, + 3, + 2, + 1, + 0); + + for (number = 0; number < n32points; number++) { + a0_val = _mm256_loadu_ps(aPtr); + a1_val = _mm256_loadu_ps(aPtr + 8); + a2_val = _mm256_loadu_ps(aPtr + 16); + a3_val = _mm256_loadu_ps(aPtr + 24); + + // compare >= 0; return float + res0_f = _mm256_cmp_ps(a0_val, zero_val, _CMP_GE_OS); + res1_f = _mm256_cmp_ps(a1_val, zero_val, _CMP_GE_OS); + res2_f = _mm256_cmp_ps(a2_val, zero_val, _CMP_GE_OS); + res3_f = _mm256_cmp_ps(a3_val, zero_val, _CMP_GE_OS); + + // convert to 32i and >> 31 + res0_i = _mm256_srli_epi32(_mm256_cvtps_epi32(res0_f), 31); + res1_i = _mm256_srli_epi32(_mm256_cvtps_epi32(res1_f), 31); + res2_i = _mm256_srli_epi32(_mm256_cvtps_epi32(res2_f), 31); + res3_i = _mm256_srli_epi32(_mm256_cvtps_epi32(res3_f), 31); + + // pack in to 16-bit results + res0_i = _mm256_packs_epi32(res0_i, res1_i); + res2_i = _mm256_packs_epi32(res2_i, res3_i); + // pack in to 8-bit results + // res0: (after packs_epi32) + // a0, a1, a2, a3, b0, b1, b2, b3, a4, a5, a6, a7, b4, b5, b6, b7 + // res2: + // c0, c1, c2, c3, d0, d1, d2, d3, c4, c5, c6, c7, d4, d5, d6, d7 + res0_i = _mm256_packs_epi16(res0_i, res2_i); + // shuffle the lanes + // res0: (after packs_epi16) + // a0, a1, a2, a3, b0, b1, b2, b3, c0, c1, c2, c3, d0, d1, d2, d3 + // a4, a5, a6, a7, b4, b5, b6, b7, c4, c5, c6, c7, d4, d5, d6, d7 + // 0, 2, 1, 3 -> 11 01 
10 00 (0xd8) + res0_i = _mm256_permute4x64_epi64(res0_i, 0xd8); + + // shuffle bytes within lanes + // res0: (after shuffle_epi8) + // a0, a1, a2, a3, b0, b1, b2, b3, a4, a5, a6, a7, b4, b5, b6, b7 + // c0, c1, c2, c3, d0, d1, d2, d3, c4, c5, c6, c7, d4, d5, d6, d7 + res0_i = _mm256_shuffle_epi8(res0_i, byte_shuffle); + + _mm256_storeu_si256((__m256i*)cPtr, res0_i); + aPtr += 32; + cPtr += 32; } - else { - *cPtr++ = 0; + + for (number = n32points * 32; number < num_points; number++) { + if (*aPtr++ >= 0) { + *cPtr++ = 1; + } else { + *cPtr++ = 0; + } } - } } #endif - #ifdef LV_HAVE_SSE2 #include -static inline void -volk_32f_binary_slicer_8i_a_sse2(int8_t* cVector, const float* aVector, - unsigned int num_points) +static inline void volk_32f_binary_slicer_8i_a_sse2(int8_t* cVector, + const float* aVector, + unsigned int num_points) { - int8_t* cPtr = cVector; - const float* aPtr = aVector; - unsigned int number = 0; - - unsigned int n16points = num_points / 16; - __m128 a0_val, a1_val, a2_val, a3_val; - __m128 res0_f, res1_f, res2_f, res3_f; - __m128i res0_i, res1_i, res2_i, res3_i; - __m128 zero_val; - zero_val = _mm_set1_ps(0.0f); - - for(number = 0; number < n16points; number++) { - a0_val = _mm_load_ps(aPtr); - a1_val = _mm_load_ps(aPtr+4); - a2_val = _mm_load_ps(aPtr+8); - a3_val = _mm_load_ps(aPtr+12); - - // compare >= 0; return float - res0_f = _mm_cmpge_ps(a0_val, zero_val); - res1_f = _mm_cmpge_ps(a1_val, zero_val); - res2_f = _mm_cmpge_ps(a2_val, zero_val); - res3_f = _mm_cmpge_ps(a3_val, zero_val); - - // convert to 32i and >> 31 - res0_i = _mm_srli_epi32(_mm_cvtps_epi32(res0_f), 31); - res1_i = _mm_srli_epi32(_mm_cvtps_epi32(res1_f), 31); - res2_i = _mm_srli_epi32(_mm_cvtps_epi32(res2_f), 31); - res3_i = _mm_srli_epi32(_mm_cvtps_epi32(res3_f), 31); - - // pack into 16-bit results - res0_i = _mm_packs_epi32(res0_i, res1_i); - res2_i = _mm_packs_epi32(res2_i, res3_i); - - // pack into 8-bit results - res0_i = _mm_packs_epi16(res0_i, res2_i); - - _mm_store_si128((__m128i*)cPtr, res0_i); - - cPtr += 16; - aPtr += 16; - } - - for(number = n16points * 16; number < num_points; number++) { - if( *aPtr++ >= 0) { - *cPtr++ = 1; + int8_t* cPtr = cVector; + const float* aPtr = aVector; + unsigned int number = 0; + + unsigned int n16points = num_points / 16; + __m128 a0_val, a1_val, a2_val, a3_val; + __m128 res0_f, res1_f, res2_f, res3_f; + __m128i res0_i, res1_i, res2_i, res3_i; + __m128 zero_val; + zero_val = _mm_set1_ps(0.0f); + + for (number = 0; number < n16points; number++) { + a0_val = _mm_load_ps(aPtr); + a1_val = _mm_load_ps(aPtr + 4); + a2_val = _mm_load_ps(aPtr + 8); + a3_val = _mm_load_ps(aPtr + 12); + + // compare >= 0; return float + res0_f = _mm_cmpge_ps(a0_val, zero_val); + res1_f = _mm_cmpge_ps(a1_val, zero_val); + res2_f = _mm_cmpge_ps(a2_val, zero_val); + res3_f = _mm_cmpge_ps(a3_val, zero_val); + + // convert to 32i and >> 31 + res0_i = _mm_srli_epi32(_mm_cvtps_epi32(res0_f), 31); + res1_i = _mm_srli_epi32(_mm_cvtps_epi32(res1_f), 31); + res2_i = _mm_srli_epi32(_mm_cvtps_epi32(res2_f), 31); + res3_i = _mm_srli_epi32(_mm_cvtps_epi32(res3_f), 31); + + // pack into 16-bit results + res0_i = _mm_packs_epi32(res0_i, res1_i); + res2_i = _mm_packs_epi32(res2_i, res3_i); + + // pack into 8-bit results + res0_i = _mm_packs_epi16(res0_i, res2_i); + + _mm_store_si128((__m128i*)cPtr, res0_i); + + cPtr += 16; + aPtr += 16; } - else { - *cPtr++ = 0; + + for (number = n16points * 16; number < num_points; number++) { + if (*aPtr++ >= 0) { + *cPtr++ = 1; + } else { + *cPtr++ = 0; + } } 
- } } #endif /* LV_HAVE_SSE2 */ - #ifdef LV_HAVE_SSE2 #include -static inline void -volk_32f_binary_slicer_8i_u_sse2(int8_t* cVector, const float* aVector, - unsigned int num_points) +static inline void volk_32f_binary_slicer_8i_u_sse2(int8_t* cVector, + const float* aVector, + unsigned int num_points) { - int8_t* cPtr = cVector; - const float* aPtr = aVector; - unsigned int number = 0; - - unsigned int n16points = num_points / 16; - __m128 a0_val, a1_val, a2_val, a3_val; - __m128 res0_f, res1_f, res2_f, res3_f; - __m128i res0_i, res1_i, res2_i, res3_i; - __m128 zero_val; - zero_val = _mm_set1_ps (0.0f); - - for(number = 0; number < n16points; number++) { - a0_val = _mm_loadu_ps(aPtr); - a1_val = _mm_loadu_ps(aPtr+4); - a2_val = _mm_loadu_ps(aPtr+8); - a3_val = _mm_loadu_ps(aPtr+12); - - // compare >= 0; return float - res0_f = _mm_cmpge_ps(a0_val, zero_val); - res1_f = _mm_cmpge_ps(a1_val, zero_val); - res2_f = _mm_cmpge_ps(a2_val, zero_val); - res3_f = _mm_cmpge_ps(a3_val, zero_val); - - // convert to 32i and >> 31 - res0_i = _mm_srli_epi32(_mm_cvtps_epi32(res0_f), 31); - res1_i = _mm_srli_epi32(_mm_cvtps_epi32(res1_f), 31); - res2_i = _mm_srli_epi32(_mm_cvtps_epi32(res2_f), 31); - res3_i = _mm_srli_epi32(_mm_cvtps_epi32(res3_f), 31); - - // pack into 16-bit results - res0_i = _mm_packs_epi32(res0_i, res1_i); - res2_i = _mm_packs_epi32(res2_i, res3_i); - - // pack into 8-bit results - res0_i = _mm_packs_epi16(res0_i, res2_i); - - _mm_storeu_si128((__m128i*)cPtr, res0_i); - - cPtr += 16; - aPtr += 16; - } - - for(number = n16points * 16; number < num_points; number++) { - if( *aPtr++ >= 0) { - *cPtr++ = 1; + int8_t* cPtr = cVector; + const float* aPtr = aVector; + unsigned int number = 0; + + unsigned int n16points = num_points / 16; + __m128 a0_val, a1_val, a2_val, a3_val; + __m128 res0_f, res1_f, res2_f, res3_f; + __m128i res0_i, res1_i, res2_i, res3_i; + __m128 zero_val; + zero_val = _mm_set1_ps(0.0f); + + for (number = 0; number < n16points; number++) { + a0_val = _mm_loadu_ps(aPtr); + a1_val = _mm_loadu_ps(aPtr + 4); + a2_val = _mm_loadu_ps(aPtr + 8); + a3_val = _mm_loadu_ps(aPtr + 12); + + // compare >= 0; return float + res0_f = _mm_cmpge_ps(a0_val, zero_val); + res1_f = _mm_cmpge_ps(a1_val, zero_val); + res2_f = _mm_cmpge_ps(a2_val, zero_val); + res3_f = _mm_cmpge_ps(a3_val, zero_val); + + // convert to 32i and >> 31 + res0_i = _mm_srli_epi32(_mm_cvtps_epi32(res0_f), 31); + res1_i = _mm_srli_epi32(_mm_cvtps_epi32(res1_f), 31); + res2_i = _mm_srli_epi32(_mm_cvtps_epi32(res2_f), 31); + res3_i = _mm_srli_epi32(_mm_cvtps_epi32(res3_f), 31); + + // pack into 16-bit results + res0_i = _mm_packs_epi32(res0_i, res1_i); + res2_i = _mm_packs_epi32(res2_i, res3_i); + + // pack into 8-bit results + res0_i = _mm_packs_epi16(res0_i, res2_i); + + _mm_storeu_si128((__m128i*)cPtr, res0_i); + + cPtr += 16; + aPtr += 16; } - else { - *cPtr++ = 0; + + for (number = n16points * 16; number < num_points; number++) { + if (*aPtr++ >= 0) { + *cPtr++ = 1; + } else { + *cPtr++ = 0; + } } - } } #endif /* LV_HAVE_SSE2 */ @@ -394,74 +444,72 @@ volk_32f_binary_slicer_8i_u_sse2(int8_t* cVector, const float* aVector, #ifdef LV_HAVE_NEON #include -static inline void -volk_32f_binary_slicer_8i_neon(int8_t* cVector, const float* aVector, - unsigned int num_points) +static inline void volk_32f_binary_slicer_8i_neon(int8_t* cVector, + const float* aVector, + unsigned int num_points) { - int8_t* cPtr = cVector; - const float* aPtr = aVector; - unsigned int number = 0; - unsigned int n16points = num_points / 16; - - 
float32x4x2_t input_val0, input_val1; - float32x4_t zero_val; - uint32x4x2_t res0_u32, res1_u32; - uint16x4x2_t res0_u16x4, res1_u16x4; - uint16x8x2_t res_u16x8; - uint8x8x2_t res_u8; - uint8x8_t one; - - zero_val = vdupq_n_f32(0.0); - one = vdup_n_u8(0x01); - - // TODO: this is a good candidate for asm because the vcombines - // can be eliminated simply by picking dst registers that are - // adjacent. - for(number = 0; number < n16points; number++) { - input_val0 = vld2q_f32(aPtr); - input_val1 = vld2q_f32(aPtr+8); - - // test against 0; return uint32 - res0_u32.val[0] = vcgeq_f32(input_val0.val[0], zero_val); - res0_u32.val[1] = vcgeq_f32(input_val0.val[1], zero_val); - res1_u32.val[0] = vcgeq_f32(input_val1.val[0], zero_val); - res1_u32.val[1] = vcgeq_f32(input_val1.val[1], zero_val); - - // narrow uint32 -> uint16 followed by combine to 8-element vectors - res0_u16x4.val[0] = vmovn_u32(res0_u32.val[0]); - res0_u16x4.val[1] = vmovn_u32(res0_u32.val[1]); - res1_u16x4.val[0] = vmovn_u32(res1_u32.val[0]); - res1_u16x4.val[1] = vmovn_u32(res1_u32.val[1]); - - res_u16x8.val[0] = vcombine_u16(res0_u16x4.val[0], res1_u16x4.val[0]); - res_u16x8.val[1] = vcombine_u16(res0_u16x4.val[1], res1_u16x4.val[1]); - - // narrow uint16x8 -> uint8x8 - res_u8.val[0] = vmovn_u16(res_u16x8.val[0]); - res_u8.val[1] = vmovn_u16(res_u16x8.val[1]); - // we *could* load twice as much data and do another vcombine here - // to get a uint8x16x2 vector, still only do 2 vandqs and a single store - // but that turns out to be ~16% slower than this version on zc702 - // it's possible register contention in GCC scheduler slows it down - // and a hand-written asm with quad-word u8 registers is much faster. - - res_u8.val[0] = vand_u8(one, res_u8.val[0]); - res_u8.val[1] = vand_u8(one, res_u8.val[1]); - - vst2_u8((unsigned char*)cPtr, res_u8); - cPtr += 16; - aPtr += 16; - - } - - for(number = n16points * 16; number < num_points; number++) { - if(*aPtr++ >= 0) { - *cPtr++ = 1; + int8_t* cPtr = cVector; + const float* aPtr = aVector; + unsigned int number = 0; + unsigned int n16points = num_points / 16; + + float32x4x2_t input_val0, input_val1; + float32x4_t zero_val; + uint32x4x2_t res0_u32, res1_u32; + uint16x4x2_t res0_u16x4, res1_u16x4; + uint16x8x2_t res_u16x8; + uint8x8x2_t res_u8; + uint8x8_t one; + + zero_val = vdupq_n_f32(0.0); + one = vdup_n_u8(0x01); + + // TODO: this is a good candidate for asm because the vcombines + // can be eliminated simply by picking dst registers that are + // adjacent. 
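    // Overview of the vector loop below (descriptive note): vld2q_f32
    // de-interleaves 16 floats into even/odd float32x4 halves, vcgeq_f32
    // turns each lane into an all-ones/all-zeros 32-bit mask, the
    // vmovn_u32 / vcombine_u16 / vmovn_u16 chain narrows those masks to
    // bytes, vand_u8 with 0x01 maps each mask byte to 0 or 1, and vst2_u8
    // re-interleaves on store so the 16 int8 results land in input order.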
+ for (number = 0; number < n16points; number++) { + input_val0 = vld2q_f32(aPtr); + input_val1 = vld2q_f32(aPtr + 8); + + // test against 0; return uint32 + res0_u32.val[0] = vcgeq_f32(input_val0.val[0], zero_val); + res0_u32.val[1] = vcgeq_f32(input_val0.val[1], zero_val); + res1_u32.val[0] = vcgeq_f32(input_val1.val[0], zero_val); + res1_u32.val[1] = vcgeq_f32(input_val1.val[1], zero_val); + + // narrow uint32 -> uint16 followed by combine to 8-element vectors + res0_u16x4.val[0] = vmovn_u32(res0_u32.val[0]); + res0_u16x4.val[1] = vmovn_u32(res0_u32.val[1]); + res1_u16x4.val[0] = vmovn_u32(res1_u32.val[0]); + res1_u16x4.val[1] = vmovn_u32(res1_u32.val[1]); + + res_u16x8.val[0] = vcombine_u16(res0_u16x4.val[0], res1_u16x4.val[0]); + res_u16x8.val[1] = vcombine_u16(res0_u16x4.val[1], res1_u16x4.val[1]); + + // narrow uint16x8 -> uint8x8 + res_u8.val[0] = vmovn_u16(res_u16x8.val[0]); + res_u8.val[1] = vmovn_u16(res_u16x8.val[1]); + // we *could* load twice as much data and do another vcombine here + // to get a uint8x16x2 vector, still only do 2 vandqs and a single store + // but that turns out to be ~16% slower than this version on zc702 + // it's possible register contention in GCC scheduler slows it down + // and a hand-written asm with quad-word u8 registers is much faster. + + res_u8.val[0] = vand_u8(one, res_u8.val[0]); + res_u8.val[1] = vand_u8(one, res_u8.val[1]); + + vst2_u8((unsigned char*)cPtr, res_u8); + cPtr += 16; + aPtr += 16; } - else { - *cPtr++ = 0; + + for (number = n16points * 16; number < num_points; number++) { + if (*aPtr++ >= 0) { + *cPtr++ = 1; + } else { + *cPtr++ = 0; + } } - } } #endif /* LV_HAVE_NEON */ diff --git a/kernels/volk/volk_32f_convert_64f.h b/kernels/volk/volk_32f_convert_64f.h index bf57e3a..d2e3f8a 100644 --- a/kernels/volk/volk_32f_convert_64f.h +++ b/kernels/volk/volk_32f_convert_64f.h @@ -29,8 +29,8 @@ * * Dispatcher Prototype * \code - * void volk_32f_convert_64f(double* outputVector, const float* inputVector, unsigned int num_points) - * \endcode + * void volk_32f_convert_64f(double* outputVector, const float* inputVector, unsigned int + * num_points) \endcode * * \b Inputs * \li inputVector: The vector of floats to convert to doubles. 
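A minimal caller of this dispatcher, shown here as an illustrative sketch alongside the patch (it is not part of the patch itself). It assumes the standard VOLK helpers volk_malloc, volk_get_alignment and volk_free; aligned buffers let the dispatcher pick an aligned (a_) implementation, while unaligned pointers fall back to the u_ or generic paths. Buffer size and fill values are arbitrary example data.

    #include <stdio.h>
    #include <volk/volk.h>

    int main(void)
    {
        unsigned int num_points = 1024;
        // Aligned buffers so the dispatcher may select an aligned kernel.
        float* in = (float*)volk_malloc(num_points * sizeof(float), volk_get_alignment());
        double* out = (double*)volk_malloc(num_points * sizeof(double), volk_get_alignment());

        for (unsigned int i = 0; i < num_points; i++)
            in[i] = 0.5f * (float)i;

        // Widen the float vector to doubles.
        volk_32f_convert_64f(out, in, num_points);

        printf("out[3] = %f\n", out[3]);

        volk_free(in);
        volk_free(out);
        return 0;
    }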
@@ -72,29 +72,33 @@ #ifdef LV_HAVE_AVX #include -static inline void volk_32f_convert_64f_u_avx(double* outputVector, const float* inputVector, unsigned int num_points){ - unsigned int number = 0; +static inline void volk_32f_convert_64f_u_avx(double* outputVector, + const float* inputVector, + unsigned int num_points) +{ + unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; + const unsigned int quarterPoints = num_points / 4; - const float* inputVectorPtr = (const float*)inputVector; - double* outputVectorPtr = outputVector; - __m256d ret; - __m128 inputVal; + const float* inputVectorPtr = (const float*)inputVector; + double* outputVectorPtr = outputVector; + __m256d ret; + __m128 inputVal; - for(;number < quarterPoints; number++){ - inputVal = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4; + for (; number < quarterPoints; number++) { + inputVal = _mm_loadu_ps(inputVectorPtr); + inputVectorPtr += 4; - ret = _mm256_cvtps_pd(inputVal); - _mm256_storeu_pd(outputVectorPtr, ret); + ret = _mm256_cvtps_pd(inputVal); + _mm256_storeu_pd(outputVectorPtr, ret); - outputVectorPtr += 4; - } + outputVectorPtr += 4; + } - number = quarterPoints * 4; - for(; number < num_points; number++){ - outputVector[number] = (double)(inputVector[number]); - } + number = quarterPoints * 4; + for (; number < num_points; number++) { + outputVector[number] = (double)(inputVector[number]); + } } #endif /* LV_HAVE_AVX */ @@ -102,56 +106,61 @@ static inline void volk_32f_convert_64f_u_avx(double* outputVector, const float* #ifdef LV_HAVE_SSE2 #include -static inline void volk_32f_convert_64f_u_sse2(double* outputVector, const float* inputVector, unsigned int num_points){ - unsigned int number = 0; +static inline void volk_32f_convert_64f_u_sse2(double* outputVector, + const float* inputVector, + unsigned int num_points) +{ + unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; + const unsigned int quarterPoints = num_points / 4; - const float* inputVectorPtr = (const float*)inputVector; - double* outputVectorPtr = outputVector; - __m128d ret; - __m128 inputVal; + const float* inputVectorPtr = (const float*)inputVector; + double* outputVectorPtr = outputVector; + __m128d ret; + __m128 inputVal; - for(;number < quarterPoints; number++){ - inputVal = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4; + for (; number < quarterPoints; number++) { + inputVal = _mm_loadu_ps(inputVectorPtr); + inputVectorPtr += 4; - ret = _mm_cvtps_pd(inputVal); + ret = _mm_cvtps_pd(inputVal); - _mm_storeu_pd(outputVectorPtr, ret); - outputVectorPtr += 2; + _mm_storeu_pd(outputVectorPtr, ret); + outputVectorPtr += 2; - inputVal = _mm_movehl_ps(inputVal, inputVal); + inputVal = _mm_movehl_ps(inputVal, inputVal); - ret = _mm_cvtps_pd(inputVal); + ret = _mm_cvtps_pd(inputVal); - _mm_storeu_pd(outputVectorPtr, ret); - outputVectorPtr += 2; - } + _mm_storeu_pd(outputVectorPtr, ret); + outputVectorPtr += 2; + } - number = quarterPoints * 4; - for(; number < num_points; number++){ - outputVector[number] = (double)(inputVector[number]); - } + number = quarterPoints * 4; + for (; number < num_points; number++) { + outputVector[number] = (double)(inputVector[number]); + } } #endif /* LV_HAVE_SSE2 */ #ifdef LV_HAVE_GENERIC -static inline void volk_32f_convert_64f_generic(double* outputVector, const float* inputVector, unsigned int num_points){ - double* outputVectorPtr = outputVector; - const float* inputVectorPtr = inputVector; - unsigned int number = 0; - - for(number = 0; number < num_points; 
number++){ - *outputVectorPtr++ = ((double)(*inputVectorPtr++)); - } +static inline void volk_32f_convert_64f_generic(double* outputVector, + const float* inputVector, + unsigned int num_points) +{ + double* outputVectorPtr = outputVector; + const float* inputVectorPtr = inputVector; + unsigned int number = 0; + + for (number = 0; number < num_points; number++) { + *outputVectorPtr++ = ((double)(*inputVectorPtr++)); + } } #endif /* LV_HAVE_GENERIC */ - - #endif /* INCLUDED_volk_32f_convert_64f_u_H */ @@ -164,83 +173,92 @@ static inline void volk_32f_convert_64f_generic(double* outputVector, const floa #ifdef LV_HAVE_AVX #include -static inline void volk_32f_convert_64f_a_avx(double* outputVector, const float* inputVector, unsigned int num_points){ - unsigned int number = 0; +static inline void volk_32f_convert_64f_a_avx(double* outputVector, + const float* inputVector, + unsigned int num_points) +{ + unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; + const unsigned int quarterPoints = num_points / 4; - const float* inputVectorPtr = (const float*)inputVector; - double* outputVectorPtr = outputVector; - __m256d ret; - __m128 inputVal; + const float* inputVectorPtr = (const float*)inputVector; + double* outputVectorPtr = outputVector; + __m256d ret; + __m128 inputVal; - for(;number < quarterPoints; number++){ - inputVal = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; + for (; number < quarterPoints; number++) { + inputVal = _mm_load_ps(inputVectorPtr); + inputVectorPtr += 4; - ret = _mm256_cvtps_pd(inputVal); - _mm256_store_pd(outputVectorPtr, ret); + ret = _mm256_cvtps_pd(inputVal); + _mm256_store_pd(outputVectorPtr, ret); - outputVectorPtr += 4; - } + outputVectorPtr += 4; + } - number = quarterPoints * 4; - for(; number < num_points; number++){ - outputVector[number] = (double)(inputVector[number]); - } + number = quarterPoints * 4; + for (; number < num_points; number++) { + outputVector[number] = (double)(inputVector[number]); + } } #endif /* LV_HAVE_AVX */ #ifdef LV_HAVE_SSE2 #include -static inline void volk_32f_convert_64f_a_sse2(double* outputVector, const float* inputVector, unsigned int num_points){ - unsigned int number = 0; +static inline void volk_32f_convert_64f_a_sse2(double* outputVector, + const float* inputVector, + unsigned int num_points) +{ + unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; + const unsigned int quarterPoints = num_points / 4; - const float* inputVectorPtr = (const float*)inputVector; - double* outputVectorPtr = outputVector; - __m128d ret; - __m128 inputVal; + const float* inputVectorPtr = (const float*)inputVector; + double* outputVectorPtr = outputVector; + __m128d ret; + __m128 inputVal; - for(;number < quarterPoints; number++){ - inputVal = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; + for (; number < quarterPoints; number++) { + inputVal = _mm_load_ps(inputVectorPtr); + inputVectorPtr += 4; - ret = _mm_cvtps_pd(inputVal); + ret = _mm_cvtps_pd(inputVal); - _mm_store_pd(outputVectorPtr, ret); - outputVectorPtr += 2; + _mm_store_pd(outputVectorPtr, ret); + outputVectorPtr += 2; - inputVal = _mm_movehl_ps(inputVal, inputVal); + inputVal = _mm_movehl_ps(inputVal, inputVal); - ret = _mm_cvtps_pd(inputVal); + ret = _mm_cvtps_pd(inputVal); - _mm_store_pd(outputVectorPtr, ret); - outputVectorPtr += 2; - } + _mm_store_pd(outputVectorPtr, ret); + outputVectorPtr += 2; + } - number = quarterPoints * 4; - for(; number < num_points; number++){ - outputVector[number] = 
(double)(inputVector[number]); - } + number = quarterPoints * 4; + for (; number < num_points; number++) { + outputVector[number] = (double)(inputVector[number]); + } } #endif /* LV_HAVE_SSE2 */ #ifdef LV_HAVE_GENERIC -static inline void volk_32f_convert_64f_a_generic(double* outputVector, const float* inputVector, unsigned int num_points){ - double* outputVectorPtr = outputVector; - const float* inputVectorPtr = inputVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - *outputVectorPtr++ = ((double)(*inputVectorPtr++)); - } +static inline void volk_32f_convert_64f_a_generic(double* outputVector, + const float* inputVector, + unsigned int num_points) +{ + double* outputVectorPtr = outputVector; + const float* inputVectorPtr = inputVector; + unsigned int number = 0; + + for (number = 0; number < num_points; number++) { + *outputVectorPtr++ = ((double)(*inputVectorPtr++)); + } } #endif /* LV_HAVE_GENERIC */ - - #endif /* INCLUDED_volk_32f_convert_64f_a_H */ diff --git a/kernels/volk/volk_32f_cos_32f.h b/kernels/volk/volk_32f_cos_32f.h index 39c2008..b493764 100644 --- a/kernels/volk/volk_32f_cos_32f.h +++ b/kernels/volk/volk_32f_cos_32f.h @@ -69,9 +69,9 @@ * \endcode */ -#include -#include #include +#include +#include #ifndef INCLUDED_volk_32f_cos_32f_a_H #define INCLUDED_volk_32f_cos_32f_a_H @@ -80,86 +80,102 @@ #include static inline void - volk_32f_cos_32f_a_avx2_fma(float* bVector, const float* aVector, unsigned int num_points) +volk_32f_cos_32f_a_avx2_fma(float* bVector, const float* aVector, unsigned int num_points) { - float* bPtr = bVector; - const float* aPtr = aVector; - - unsigned int number = 0; - unsigned int eighthPoints = num_points / 8; - unsigned int i = 0; - - __m256 aVal, s, r, m4pi, pio4A, pio4B, pio4C, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, fzeroes; - __m256 sine, cosine; - __m256i q, ones, twos, fours; - - m4pi = _mm256_set1_ps(1.273239544735162542821171882678754627704620361328125); - pio4A = _mm256_set1_ps(0.7853981554508209228515625); - pio4B = _mm256_set1_ps(0.794662735614792836713604629039764404296875e-8); - pio4C = _mm256_set1_ps(0.306161699786838294306516483068750264552437361480769e-16); - ffours = _mm256_set1_ps(4.0); - ftwos = _mm256_set1_ps(2.0); - fones = _mm256_set1_ps(1.0); - fzeroes = _mm256_setzero_ps(); - __m256i zeroes = _mm256_set1_epi32(0); - ones = _mm256_set1_epi32(1); - __m256i allones = _mm256_set1_epi32(0xffffffff); - twos = _mm256_set1_epi32(2); - fours = _mm256_set1_epi32(4); - - cp1 = _mm256_set1_ps(1.0); - cp2 = _mm256_set1_ps(0.08333333333333333); - cp3 = _mm256_set1_ps(0.002777777777777778); - cp4 = _mm256_set1_ps(4.96031746031746e-05); - cp5 = _mm256_set1_ps(5.511463844797178e-07); - union bit256 condition1; - union bit256 condition3; - - for(;number < eighthPoints; number++){ - - aVal = _mm256_load_ps(aPtr); - // s = fabs(aVal) - s = _mm256_sub_ps(aVal, _mm256_and_ps(_mm256_mul_ps(aVal, ftwos), _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS))); - // q = (int) (s * (4/pi)), floor(aVal / (pi/4)) - q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi))); - // r = q + q&1, q indicates quadrant, r gives - r = _mm256_cvtepi32_ps(_mm256_add_epi32(q, _mm256_and_si256(q, ones))); - - s = _mm256_fnmadd_ps(r,pio4A,s); - s = _mm256_fnmadd_ps(r,pio4B,s); - s = _mm256_fnmadd_ps(r,pio4C,s); - - s = _mm256_div_ps(s, _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction - s = _mm256_mul_ps(s, s); - // Evaluate Taylor series - s = 
_mm256_mul_ps(_mm256_fmadd_ps(_mm256_fmsub_ps(_mm256_fmadd_ps(_mm256_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2), s, cp1), s); - - for(i = 0; i < 3; i++) - s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s)); - s = _mm256_div_ps(s, ftwos); - - sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s)); - cosine = _mm256_sub_ps(fones, s); - - // if(((q+1)&2) != 0) { cosine=sine;} - condition1.int_vec = _mm256_cmpeq_epi32(_mm256_and_si256(_mm256_add_epi32(q, ones), twos), zeroes); - condition1.int_vec = _mm256_xor_si256(allones, condition1.int_vec); - - // if(((q+2)&4) != 0) { cosine = -cosine;} - condition3.int_vec = _mm256_cmpeq_epi32(_mm256_and_si256(_mm256_add_epi32(q, twos), fours), zeroes); - condition3.int_vec = _mm256_xor_si256(allones, condition3.int_vec); - - cosine = _mm256_add_ps(cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1.float_vec)); - cosine = _mm256_sub_ps(cosine, _mm256_and_ps(_mm256_mul_ps(cosine, _mm256_set1_ps(2.0f)), condition3.float_vec)); - _mm256_store_ps(bPtr, cosine); - aPtr += 8; - bPtr += 8; - } - - number = eighthPoints * 8; - for(;number < num_points; number++){ - *bPtr++ = cos(*aPtr++); - } + float* bPtr = bVector; + const float* aPtr = aVector; + + unsigned int number = 0; + unsigned int eighthPoints = num_points / 8; + unsigned int i = 0; + + __m256 aVal, s, r, m4pi, pio4A, pio4B, pio4C, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, + fones, fzeroes; + __m256 sine, cosine; + __m256i q, ones, twos, fours; + + m4pi = _mm256_set1_ps(1.273239544735162542821171882678754627704620361328125); + pio4A = _mm256_set1_ps(0.7853981554508209228515625); + pio4B = _mm256_set1_ps(0.794662735614792836713604629039764404296875e-8); + pio4C = _mm256_set1_ps(0.306161699786838294306516483068750264552437361480769e-16); + ffours = _mm256_set1_ps(4.0); + ftwos = _mm256_set1_ps(2.0); + fones = _mm256_set1_ps(1.0); + fzeroes = _mm256_setzero_ps(); + __m256i zeroes = _mm256_set1_epi32(0); + ones = _mm256_set1_epi32(1); + __m256i allones = _mm256_set1_epi32(0xffffffff); + twos = _mm256_set1_epi32(2); + fours = _mm256_set1_epi32(4); + + cp1 = _mm256_set1_ps(1.0); + cp2 = _mm256_set1_ps(0.08333333333333333); + cp3 = _mm256_set1_ps(0.002777777777777778); + cp4 = _mm256_set1_ps(4.96031746031746e-05); + cp5 = _mm256_set1_ps(5.511463844797178e-07); + union bit256 condition1; + union bit256 condition3; + + for (; number < eighthPoints; number++) { + + aVal = _mm256_load_ps(aPtr); + // s = fabs(aVal) + s = _mm256_sub_ps(aVal, + _mm256_and_ps(_mm256_mul_ps(aVal, ftwos), + _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS))); + // q = (int) (s * (4/pi)), floor(aVal / (pi/4)) + q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi))); + // r = q + q&1, q indicates quadrant, r gives + r = _mm256_cvtepi32_ps(_mm256_add_epi32(q, _mm256_and_si256(q, ones))); + + s = _mm256_fnmadd_ps(r, pio4A, s); + s = _mm256_fnmadd_ps(r, pio4B, s); + s = _mm256_fnmadd_ps(r, pio4C, s); + + s = _mm256_div_ps( + s, + _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction + s = _mm256_mul_ps(s, s); + // Evaluate Taylor series + s = _mm256_mul_ps( + _mm256_fmadd_ps( + _mm256_fmsub_ps( + _mm256_fmadd_ps(_mm256_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2), + s, + cp1), + s); + + for (i = 0; i < 3; i++) + s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s)); + s = _mm256_div_ps(s, ftwos); + + sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s)); + cosine = _mm256_sub_ps(fones, s); + + // if(((q+1)&2) != 0) { cosine=sine;} + condition1.int_vec = + 
_mm256_cmpeq_epi32(_mm256_and_si256(_mm256_add_epi32(q, ones), twos), zeroes); + condition1.int_vec = _mm256_xor_si256(allones, condition1.int_vec); + + // if(((q+2)&4) != 0) { cosine = -cosine;} + condition3.int_vec = _mm256_cmpeq_epi32( + _mm256_and_si256(_mm256_add_epi32(q, twos), fours), zeroes); + condition3.int_vec = _mm256_xor_si256(allones, condition3.int_vec); + + cosine = _mm256_add_ps( + cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1.float_vec)); + cosine = _mm256_sub_ps(cosine, + _mm256_and_ps(_mm256_mul_ps(cosine, _mm256_set1_ps(2.0f)), + condition3.float_vec)); + _mm256_store_ps(bPtr, cosine); + aPtr += 8; + bPtr += 8; + } + + number = eighthPoints * 8; + for (; number < num_points; number++) { + *bPtr++ = cos(*aPtr++); + } } #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for aligned */ @@ -168,86 +184,109 @@ static inline void #include static inline void - volk_32f_cos_32f_a_avx2(float* bVector, const float* aVector, unsigned int num_points) +volk_32f_cos_32f_a_avx2(float* bVector, const float* aVector, unsigned int num_points) { - float* bPtr = bVector; - const float* aPtr = aVector; - - unsigned int number = 0; - unsigned int eighthPoints = num_points / 8; - unsigned int i = 0; - - __m256 aVal, s, r, m4pi, pio4A, pio4B, pio4C, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, fzeroes; - __m256 sine, cosine; - __m256i q, ones, twos, fours; - - m4pi = _mm256_set1_ps(1.273239544735162542821171882678754627704620361328125); - pio4A = _mm256_set1_ps(0.7853981554508209228515625); - pio4B = _mm256_set1_ps(0.794662735614792836713604629039764404296875e-8); - pio4C = _mm256_set1_ps(0.306161699786838294306516483068750264552437361480769e-16); - ffours = _mm256_set1_ps(4.0); - ftwos = _mm256_set1_ps(2.0); - fones = _mm256_set1_ps(1.0); - fzeroes = _mm256_setzero_ps(); - __m256i zeroes = _mm256_set1_epi32(0); - ones = _mm256_set1_epi32(1); - __m256i allones = _mm256_set1_epi32(0xffffffff); - twos = _mm256_set1_epi32(2); - fours = _mm256_set1_epi32(4); - - cp1 = _mm256_set1_ps(1.0); - cp2 = _mm256_set1_ps(0.08333333333333333); - cp3 = _mm256_set1_ps(0.002777777777777778); - cp4 = _mm256_set1_ps(4.96031746031746e-05); - cp5 = _mm256_set1_ps(5.511463844797178e-07); - union bit256 condition1; - union bit256 condition3; - - for(;number < eighthPoints; number++){ - - aVal = _mm256_load_ps(aPtr); - // s = fabs(aVal) - s = _mm256_sub_ps(aVal, _mm256_and_ps(_mm256_mul_ps(aVal, ftwos), _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS))); - // q = (int) (s * (4/pi)), floor(aVal / (pi/4)) - q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi))); - // r = q + q&1, q indicates quadrant, r gives - r = _mm256_cvtepi32_ps(_mm256_add_epi32(q, _mm256_and_si256(q, ones))); - - s = _mm256_sub_ps(s, _mm256_mul_ps(r,pio4A)); - s = _mm256_sub_ps(s, _mm256_mul_ps(r,pio4B)); - s = _mm256_sub_ps(s, _mm256_mul_ps(r,pio4C)); - - s = _mm256_div_ps(s, _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction - s = _mm256_mul_ps(s, s); - // Evaluate Taylor series - s = _mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(s, cp5), cp4), s), cp3), s), cp2), s), cp1), s); - - for(i = 0; i < 3; i++) - s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s)); - s = _mm256_div_ps(s, ftwos); - - sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s)); - cosine = _mm256_sub_ps(fones, s); - - // if(((q+1)&2) != 0) { cosine=sine;} - condition1.int_vec = _mm256_cmpeq_epi32(_mm256_and_si256(_mm256_add_epi32(q, ones), twos), zeroes); - 
condition1.int_vec = _mm256_xor_si256(allones, condition1.int_vec); - - // if(((q+2)&4) != 0) { cosine = -cosine;} - condition3.int_vec = _mm256_cmpeq_epi32(_mm256_and_si256(_mm256_add_epi32(q, twos), fours), zeroes); - condition3.int_vec = _mm256_xor_si256(allones, condition3.int_vec); - - cosine = _mm256_add_ps(cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1.float_vec)); - cosine = _mm256_sub_ps(cosine, _mm256_and_ps(_mm256_mul_ps(cosine, _mm256_set1_ps(2.0f)), condition3.float_vec)); - _mm256_store_ps(bPtr, cosine); - aPtr += 8; - bPtr += 8; - } - - number = eighthPoints * 8; - for(;number < num_points; number++){ - *bPtr++ = cos(*aPtr++); - } + float* bPtr = bVector; + const float* aPtr = aVector; + + unsigned int number = 0; + unsigned int eighthPoints = num_points / 8; + unsigned int i = 0; + + __m256 aVal, s, r, m4pi, pio4A, pio4B, pio4C, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, + fones, fzeroes; + __m256 sine, cosine; + __m256i q, ones, twos, fours; + + m4pi = _mm256_set1_ps(1.273239544735162542821171882678754627704620361328125); + pio4A = _mm256_set1_ps(0.7853981554508209228515625); + pio4B = _mm256_set1_ps(0.794662735614792836713604629039764404296875e-8); + pio4C = _mm256_set1_ps(0.306161699786838294306516483068750264552437361480769e-16); + ffours = _mm256_set1_ps(4.0); + ftwos = _mm256_set1_ps(2.0); + fones = _mm256_set1_ps(1.0); + fzeroes = _mm256_setzero_ps(); + __m256i zeroes = _mm256_set1_epi32(0); + ones = _mm256_set1_epi32(1); + __m256i allones = _mm256_set1_epi32(0xffffffff); + twos = _mm256_set1_epi32(2); + fours = _mm256_set1_epi32(4); + + cp1 = _mm256_set1_ps(1.0); + cp2 = _mm256_set1_ps(0.08333333333333333); + cp3 = _mm256_set1_ps(0.002777777777777778); + cp4 = _mm256_set1_ps(4.96031746031746e-05); + cp5 = _mm256_set1_ps(5.511463844797178e-07); + union bit256 condition1; + union bit256 condition3; + + for (; number < eighthPoints; number++) { + + aVal = _mm256_load_ps(aPtr); + // s = fabs(aVal) + s = _mm256_sub_ps(aVal, + _mm256_and_ps(_mm256_mul_ps(aVal, ftwos), + _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS))); + // q = (int) (s * (4/pi)), floor(aVal / (pi/4)) + q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi))); + // r = q + q&1, q indicates quadrant, r gives + r = _mm256_cvtepi32_ps(_mm256_add_epi32(q, _mm256_and_si256(q, ones))); + + s = _mm256_sub_ps(s, _mm256_mul_ps(r, pio4A)); + s = _mm256_sub_ps(s, _mm256_mul_ps(r, pio4B)); + s = _mm256_sub_ps(s, _mm256_mul_ps(r, pio4C)); + + s = _mm256_div_ps( + s, + _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction + s = _mm256_mul_ps(s, s); + // Evaluate Taylor series + s = _mm256_mul_ps( + _mm256_add_ps( + _mm256_mul_ps( + _mm256_sub_ps( + _mm256_mul_ps( + _mm256_add_ps( + _mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(s, cp5), cp4), + s), + cp3), + s), + cp2), + s), + cp1), + s); + + for (i = 0; i < 3; i++) + s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s)); + s = _mm256_div_ps(s, ftwos); + + sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s)); + cosine = _mm256_sub_ps(fones, s); + + // if(((q+1)&2) != 0) { cosine=sine;} + condition1.int_vec = + _mm256_cmpeq_epi32(_mm256_and_si256(_mm256_add_epi32(q, ones), twos), zeroes); + condition1.int_vec = _mm256_xor_si256(allones, condition1.int_vec); + + // if(((q+2)&4) != 0) { cosine = -cosine;} + condition3.int_vec = _mm256_cmpeq_epi32( + _mm256_and_si256(_mm256_add_epi32(q, twos), fours), zeroes); + condition3.int_vec = _mm256_xor_si256(allones, condition3.int_vec); + + cosine = _mm256_add_ps( + cosine, 
_mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1.float_vec)); + cosine = _mm256_sub_ps(cosine, + _mm256_and_ps(_mm256_mul_ps(cosine, _mm256_set1_ps(2.0f)), + condition3.float_vec)); + _mm256_store_ps(bPtr, cosine); + aPtr += 8; + bPtr += 8; + } + + number = eighthPoints * 8; + for (; number < num_points; number++) { + *bPtr++ = cos(*aPtr++); + } } #endif /* LV_HAVE_AVX2 for aligned */ @@ -256,86 +295,105 @@ static inline void #include static inline void - volk_32f_cos_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int num_points) +volk_32f_cos_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int num_points) { - float* bPtr = bVector; - const float* aPtr = aVector; - - unsigned int number = 0; - unsigned int quarterPoints = num_points / 4; - unsigned int i = 0; - - __m128 aVal, s, r, m4pi, pio4A, pio4B, pio4C, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, fzeroes; - __m128 sine, cosine; - __m128i q, ones, twos, fours; - - m4pi = _mm_set1_ps(1.273239544735162542821171882678754627704620361328125); - pio4A = _mm_set1_ps(0.7853981554508209228515625); - pio4B = _mm_set1_ps(0.794662735614792836713604629039764404296875e-8); - pio4C = _mm_set1_ps(0.306161699786838294306516483068750264552437361480769e-16); - ffours = _mm_set1_ps(4.0); - ftwos = _mm_set1_ps(2.0); - fones = _mm_set1_ps(1.0); - fzeroes = _mm_setzero_ps(); - __m128i zeroes = _mm_set1_epi32(0); - ones = _mm_set1_epi32(1); - __m128i allones = _mm_set1_epi32(0xffffffff); - twos = _mm_set1_epi32(2); - fours = _mm_set1_epi32(4); - - cp1 = _mm_set1_ps(1.0); - cp2 = _mm_set1_ps(0.08333333333333333); - cp3 = _mm_set1_ps(0.002777777777777778); - cp4 = _mm_set1_ps(4.96031746031746e-05); - cp5 = _mm_set1_ps(5.511463844797178e-07); - union bit128 condition1; - union bit128 condition3; - - for(;number < quarterPoints; number++){ - - aVal = _mm_load_ps(aPtr); - // s = fabs(aVal) - s = _mm_sub_ps(aVal, _mm_and_ps(_mm_mul_ps(aVal, ftwos), _mm_cmplt_ps(aVal, fzeroes))); - // q = (int) (s * (4/pi)), floor(aVal / (pi/4)) - q = _mm_cvtps_epi32(_mm_floor_ps(_mm_mul_ps(s, m4pi))); - // r = q + q&1, q indicates quadrant, r gives - r = _mm_cvtepi32_ps(_mm_add_epi32(q, _mm_and_si128(q, ones))); - - s = _mm_sub_ps(s, _mm_mul_ps(r, pio4A)); - s = _mm_sub_ps(s, _mm_mul_ps(r, pio4B)); - s = _mm_sub_ps(s, _mm_mul_ps(r, pio4C)); - - s = _mm_div_ps(s, _mm_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction - s = _mm_mul_ps(s, s); - // Evaluate Taylor series - s = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s, cp5), cp4), s), cp3), s), cp2), s), cp1), s); - - for(i = 0; i < 3; i++) - s = _mm_mul_ps(s, _mm_sub_ps(ffours, s)); - s = _mm_div_ps(s, ftwos); - - sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s)); - cosine = _mm_sub_ps(fones, s); - - // if(((q+1)&2) != 0) { cosine=sine;} - condition1.int_vec = _mm_cmpeq_epi32(_mm_and_si128(_mm_add_epi32(q, ones), twos), zeroes); - condition1.int_vec = _mm_xor_si128(allones, condition1.int_vec); - - // if(((q+2)&4) != 0) { cosine = -cosine;} - condition3.int_vec = _mm_cmpeq_epi32(_mm_and_si128(_mm_add_epi32(q, twos), fours), zeroes); - condition3.int_vec = _mm_xor_si128(allones, condition3.int_vec); - - cosine = _mm_add_ps(cosine, _mm_and_ps(_mm_sub_ps(sine, cosine), condition1.float_vec)); - cosine = _mm_sub_ps(cosine, _mm_and_ps(_mm_mul_ps(cosine, _mm_set1_ps(2.0f)), condition3.float_vec)); - _mm_store_ps(bPtr, cosine); - aPtr += 4; - bPtr += 4; - } - - number = quarterPoints * 4; - for(;number < num_points; number++){ 
- *bPtr++ = cosf(*aPtr++); - } + float* bPtr = bVector; + const float* aPtr = aVector; + + unsigned int number = 0; + unsigned int quarterPoints = num_points / 4; + unsigned int i = 0; + + __m128 aVal, s, r, m4pi, pio4A, pio4B, pio4C, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, + fones, fzeroes; + __m128 sine, cosine; + __m128i q, ones, twos, fours; + + m4pi = _mm_set1_ps(1.273239544735162542821171882678754627704620361328125); + pio4A = _mm_set1_ps(0.7853981554508209228515625); + pio4B = _mm_set1_ps(0.794662735614792836713604629039764404296875e-8); + pio4C = _mm_set1_ps(0.306161699786838294306516483068750264552437361480769e-16); + ffours = _mm_set1_ps(4.0); + ftwos = _mm_set1_ps(2.0); + fones = _mm_set1_ps(1.0); + fzeroes = _mm_setzero_ps(); + __m128i zeroes = _mm_set1_epi32(0); + ones = _mm_set1_epi32(1); + __m128i allones = _mm_set1_epi32(0xffffffff); + twos = _mm_set1_epi32(2); + fours = _mm_set1_epi32(4); + + cp1 = _mm_set1_ps(1.0); + cp2 = _mm_set1_ps(0.08333333333333333); + cp3 = _mm_set1_ps(0.002777777777777778); + cp4 = _mm_set1_ps(4.96031746031746e-05); + cp5 = _mm_set1_ps(5.511463844797178e-07); + union bit128 condition1; + union bit128 condition3; + + for (; number < quarterPoints; number++) { + + aVal = _mm_load_ps(aPtr); + // s = fabs(aVal) + s = _mm_sub_ps(aVal, + _mm_and_ps(_mm_mul_ps(aVal, ftwos), _mm_cmplt_ps(aVal, fzeroes))); + // q = (int) (s * (4/pi)), floor(aVal / (pi/4)) + q = _mm_cvtps_epi32(_mm_floor_ps(_mm_mul_ps(s, m4pi))); + // r = q + q&1, q indicates quadrant, r gives + r = _mm_cvtepi32_ps(_mm_add_epi32(q, _mm_and_si128(q, ones))); + + s = _mm_sub_ps(s, _mm_mul_ps(r, pio4A)); + s = _mm_sub_ps(s, _mm_mul_ps(r, pio4B)); + s = _mm_sub_ps(s, _mm_mul_ps(r, pio4C)); + + s = _mm_div_ps( + s, _mm_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction + s = _mm_mul_ps(s, s); + // Evaluate Taylor series + s = _mm_mul_ps( + _mm_add_ps( + _mm_mul_ps( + _mm_sub_ps( + _mm_mul_ps( + _mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s, cp5), cp4), s), + cp3), + s), + cp2), + s), + cp1), + s); + + for (i = 0; i < 3; i++) + s = _mm_mul_ps(s, _mm_sub_ps(ffours, s)); + s = _mm_div_ps(s, ftwos); + + sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s)); + cosine = _mm_sub_ps(fones, s); + + // if(((q+1)&2) != 0) { cosine=sine;} + condition1.int_vec = + _mm_cmpeq_epi32(_mm_and_si128(_mm_add_epi32(q, ones), twos), zeroes); + condition1.int_vec = _mm_xor_si128(allones, condition1.int_vec); + + // if(((q+2)&4) != 0) { cosine = -cosine;} + condition3.int_vec = + _mm_cmpeq_epi32(_mm_and_si128(_mm_add_epi32(q, twos), fours), zeroes); + condition3.int_vec = _mm_xor_si128(allones, condition3.int_vec); + + cosine = _mm_add_ps(cosine, + _mm_and_ps(_mm_sub_ps(sine, cosine), condition1.float_vec)); + cosine = _mm_sub_ps( + cosine, + _mm_and_ps(_mm_mul_ps(cosine, _mm_set1_ps(2.0f)), condition3.float_vec)); + _mm_store_ps(bPtr, cosine); + aPtr += 4; + bPtr += 4; + } + + number = quarterPoints * 4; + for (; number < num_points; number++) { + *bPtr++ = cosf(*aPtr++); + } } #endif /* LV_HAVE_SSE4_1 for aligned */ @@ -343,7 +401,6 @@ static inline void #endif /* INCLUDED_volk_32f_cos_32f_a_H */ - #ifndef INCLUDED_volk_32f_cos_32f_u_H #define INCLUDED_volk_32f_cos_32f_u_H @@ -351,86 +408,102 @@ static inline void #include static inline void - volk_32f_cos_32f_u_avx2_fma(float* bVector, const float* aVector, unsigned int num_points) +volk_32f_cos_32f_u_avx2_fma(float* bVector, const float* aVector, unsigned int num_points) { - float* bPtr = bVector; - const float* aPtr = aVector; - - unsigned int 
number = 0; - unsigned int eighthPoints = num_points / 8; - unsigned int i = 0; - - __m256 aVal, s, r, m4pi, pio4A, pio4B, pio4C, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, fzeroes; - __m256 sine, cosine; - __m256i q, ones, twos, fours; - - m4pi = _mm256_set1_ps(1.273239544735162542821171882678754627704620361328125); - pio4A = _mm256_set1_ps(0.7853981554508209228515625); - pio4B = _mm256_set1_ps(0.794662735614792836713604629039764404296875e-8); - pio4C = _mm256_set1_ps(0.306161699786838294306516483068750264552437361480769e-16); - ffours = _mm256_set1_ps(4.0); - ftwos = _mm256_set1_ps(2.0); - fones = _mm256_set1_ps(1.0); - fzeroes = _mm256_setzero_ps(); - __m256i zeroes = _mm256_set1_epi32(0); - ones = _mm256_set1_epi32(1); - __m256i allones = _mm256_set1_epi32(0xffffffff); - twos = _mm256_set1_epi32(2); - fours = _mm256_set1_epi32(4); - - cp1 = _mm256_set1_ps(1.0); - cp2 = _mm256_set1_ps(0.08333333333333333); - cp3 = _mm256_set1_ps(0.002777777777777778); - cp4 = _mm256_set1_ps(4.96031746031746e-05); - cp5 = _mm256_set1_ps(5.511463844797178e-07); - union bit256 condition1; - union bit256 condition3; - - for(;number < eighthPoints; number++){ - - aVal = _mm256_loadu_ps(aPtr); - // s = fabs(aVal) - s = _mm256_sub_ps(aVal, _mm256_and_ps(_mm256_mul_ps(aVal, ftwos), _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS))); - // q = (int) (s * (4/pi)), floor(aVal / (pi/4)) - q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi))); - // r = q + q&1, q indicates quadrant, r gives - r = _mm256_cvtepi32_ps(_mm256_add_epi32(q, _mm256_and_si256(q, ones))); - - s = _mm256_fnmadd_ps(r,pio4A,s); - s = _mm256_fnmadd_ps(r,pio4B,s); - s = _mm256_fnmadd_ps(r,pio4C,s); - - s = _mm256_div_ps(s, _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction - s = _mm256_mul_ps(s, s); - // Evaluate Taylor series - s = _mm256_mul_ps(_mm256_fmadd_ps(_mm256_fmsub_ps(_mm256_fmadd_ps(_mm256_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2), s, cp1), s); - - for(i = 0; i < 3; i++) - s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s)); - s = _mm256_div_ps(s, ftwos); - - sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s)); - cosine = _mm256_sub_ps(fones, s); - - // if(((q+1)&2) != 0) { cosine=sine;} - condition1.int_vec = _mm256_cmpeq_epi32(_mm256_and_si256(_mm256_add_epi32(q, ones), twos), zeroes); - condition1.int_vec = _mm256_xor_si256(allones, condition1.int_vec); - - // if(((q+2)&4) != 0) { cosine = -cosine;} - condition3.int_vec = _mm256_cmpeq_epi32(_mm256_and_si256(_mm256_add_epi32(q, twos), fours), zeroes); - condition3.int_vec = _mm256_xor_si256(allones, condition3.int_vec); - - cosine = _mm256_add_ps(cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1.float_vec)); - cosine = _mm256_sub_ps(cosine, _mm256_and_ps(_mm256_mul_ps(cosine, _mm256_set1_ps(2.0f)), condition3.float_vec)); - _mm256_storeu_ps(bPtr, cosine); - aPtr += 8; - bPtr += 8; - } - - number = eighthPoints * 8; - for(;number < num_points; number++){ - *bPtr++ = cos(*aPtr++); - } + float* bPtr = bVector; + const float* aPtr = aVector; + + unsigned int number = 0; + unsigned int eighthPoints = num_points / 8; + unsigned int i = 0; + + __m256 aVal, s, r, m4pi, pio4A, pio4B, pio4C, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, + fones, fzeroes; + __m256 sine, cosine; + __m256i q, ones, twos, fours; + + m4pi = _mm256_set1_ps(1.273239544735162542821171882678754627704620361328125); + pio4A = _mm256_set1_ps(0.7853981554508209228515625); + pio4B = _mm256_set1_ps(0.794662735614792836713604629039764404296875e-8); + pio4C = 
_mm256_set1_ps(0.306161699786838294306516483068750264552437361480769e-16); + ffours = _mm256_set1_ps(4.0); + ftwos = _mm256_set1_ps(2.0); + fones = _mm256_set1_ps(1.0); + fzeroes = _mm256_setzero_ps(); + __m256i zeroes = _mm256_set1_epi32(0); + ones = _mm256_set1_epi32(1); + __m256i allones = _mm256_set1_epi32(0xffffffff); + twos = _mm256_set1_epi32(2); + fours = _mm256_set1_epi32(4); + + cp1 = _mm256_set1_ps(1.0); + cp2 = _mm256_set1_ps(0.08333333333333333); + cp3 = _mm256_set1_ps(0.002777777777777778); + cp4 = _mm256_set1_ps(4.96031746031746e-05); + cp5 = _mm256_set1_ps(5.511463844797178e-07); + union bit256 condition1; + union bit256 condition3; + + for (; number < eighthPoints; number++) { + + aVal = _mm256_loadu_ps(aPtr); + // s = fabs(aVal) + s = _mm256_sub_ps(aVal, + _mm256_and_ps(_mm256_mul_ps(aVal, ftwos), + _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS))); + // q = (int) (s * (4/pi)), floor(aVal / (pi/4)) + q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi))); + // r = q + q&1, q indicates quadrant, r gives + r = _mm256_cvtepi32_ps(_mm256_add_epi32(q, _mm256_and_si256(q, ones))); + + s = _mm256_fnmadd_ps(r, pio4A, s); + s = _mm256_fnmadd_ps(r, pio4B, s); + s = _mm256_fnmadd_ps(r, pio4C, s); + + s = _mm256_div_ps( + s, + _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction + s = _mm256_mul_ps(s, s); + // Evaluate Taylor series + s = _mm256_mul_ps( + _mm256_fmadd_ps( + _mm256_fmsub_ps( + _mm256_fmadd_ps(_mm256_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2), + s, + cp1), + s); + + for (i = 0; i < 3; i++) + s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s)); + s = _mm256_div_ps(s, ftwos); + + sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s)); + cosine = _mm256_sub_ps(fones, s); + + // if(((q+1)&2) != 0) { cosine=sine;} + condition1.int_vec = + _mm256_cmpeq_epi32(_mm256_and_si256(_mm256_add_epi32(q, ones), twos), zeroes); + condition1.int_vec = _mm256_xor_si256(allones, condition1.int_vec); + + // if(((q+2)&4) != 0) { cosine = -cosine;} + condition3.int_vec = _mm256_cmpeq_epi32( + _mm256_and_si256(_mm256_add_epi32(q, twos), fours), zeroes); + condition3.int_vec = _mm256_xor_si256(allones, condition3.int_vec); + + cosine = _mm256_add_ps( + cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1.float_vec)); + cosine = _mm256_sub_ps(cosine, + _mm256_and_ps(_mm256_mul_ps(cosine, _mm256_set1_ps(2.0f)), + condition3.float_vec)); + _mm256_storeu_ps(bPtr, cosine); + aPtr += 8; + bPtr += 8; + } + + number = eighthPoints * 8; + for (; number < num_points; number++) { + *bPtr++ = cos(*aPtr++); + } } #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for unaligned */ @@ -439,86 +512,109 @@ static inline void #include static inline void - volk_32f_cos_32f_u_avx2(float* bVector, const float* aVector, unsigned int num_points) +volk_32f_cos_32f_u_avx2(float* bVector, const float* aVector, unsigned int num_points) { - float* bPtr = bVector; - const float* aPtr = aVector; - - unsigned int number = 0; - unsigned int eighthPoints = num_points / 8; - unsigned int i = 0; - - __m256 aVal, s, r, m4pi, pio4A, pio4B, pio4C, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, fzeroes; - __m256 sine, cosine; - __m256i q, ones, twos, fours; - - m4pi = _mm256_set1_ps(1.273239544735162542821171882678754627704620361328125); - pio4A = _mm256_set1_ps(0.7853981554508209228515625); - pio4B = _mm256_set1_ps(0.794662735614792836713604629039764404296875e-8); - pio4C = _mm256_set1_ps(0.306161699786838294306516483068750264552437361480769e-16); - ffours = _mm256_set1_ps(4.0); - ftwos = 
_mm256_set1_ps(2.0); - fones = _mm256_set1_ps(1.0); - fzeroes = _mm256_setzero_ps(); - __m256i zeroes = _mm256_set1_epi32(0); - ones = _mm256_set1_epi32(1); - __m256i allones = _mm256_set1_epi32(0xffffffff); - twos = _mm256_set1_epi32(2); - fours = _mm256_set1_epi32(4); - - cp1 = _mm256_set1_ps(1.0); - cp2 = _mm256_set1_ps(0.08333333333333333); - cp3 = _mm256_set1_ps(0.002777777777777778); - cp4 = _mm256_set1_ps(4.96031746031746e-05); - cp5 = _mm256_set1_ps(5.511463844797178e-07); - union bit256 condition1; - union bit256 condition3; - - for(;number < eighthPoints; number++){ - - aVal = _mm256_loadu_ps(aPtr); - // s = fabs(aVal) - s = _mm256_sub_ps(aVal, _mm256_and_ps(_mm256_mul_ps(aVal, ftwos), _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS))); - // q = (int) (s * (4/pi)), floor(aVal / (pi/4)) - q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi))); - // r = q + q&1, q indicates quadrant, r gives - r = _mm256_cvtepi32_ps(_mm256_add_epi32(q, _mm256_and_si256(q, ones))); - - s = _mm256_sub_ps(s, _mm256_mul_ps(r,pio4A)); - s = _mm256_sub_ps(s, _mm256_mul_ps(r,pio4B)); - s = _mm256_sub_ps(s, _mm256_mul_ps(r,pio4C)); - - s = _mm256_div_ps(s, _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction - s = _mm256_mul_ps(s, s); - // Evaluate Taylor series - s = _mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(s, cp5), cp4), s), cp3), s), cp2), s), cp1), s); - - for(i = 0; i < 3; i++) - s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s)); - s = _mm256_div_ps(s, ftwos); - - sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s)); - cosine = _mm256_sub_ps(fones, s); - - // if(((q+1)&2) != 0) { cosine=sine;} - condition1.int_vec = _mm256_cmpeq_epi32(_mm256_and_si256(_mm256_add_epi32(q, ones), twos), zeroes); - condition1.int_vec = _mm256_xor_si256(allones, condition1.int_vec); - - // if(((q+2)&4) != 0) { cosine = -cosine;} - condition3.int_vec = _mm256_cmpeq_epi32(_mm256_and_si256(_mm256_add_epi32(q, twos), fours), zeroes); - condition3.int_vec = _mm256_xor_si256(allones, condition3.int_vec); - - cosine = _mm256_add_ps(cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1.float_vec)); - cosine = _mm256_sub_ps(cosine, _mm256_and_ps(_mm256_mul_ps(cosine, _mm256_set1_ps(2.0f)), condition3.float_vec)); - _mm256_storeu_ps(bPtr, cosine); - aPtr += 8; - bPtr += 8; - } - - number = eighthPoints * 8; - for(;number < num_points; number++){ - *bPtr++ = cos(*aPtr++); - } + float* bPtr = bVector; + const float* aPtr = aVector; + + unsigned int number = 0; + unsigned int eighthPoints = num_points / 8; + unsigned int i = 0; + + __m256 aVal, s, r, m4pi, pio4A, pio4B, pio4C, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, + fones, fzeroes; + __m256 sine, cosine; + __m256i q, ones, twos, fours; + + m4pi = _mm256_set1_ps(1.273239544735162542821171882678754627704620361328125); + pio4A = _mm256_set1_ps(0.7853981554508209228515625); + pio4B = _mm256_set1_ps(0.794662735614792836713604629039764404296875e-8); + pio4C = _mm256_set1_ps(0.306161699786838294306516483068750264552437361480769e-16); + ffours = _mm256_set1_ps(4.0); + ftwos = _mm256_set1_ps(2.0); + fones = _mm256_set1_ps(1.0); + fzeroes = _mm256_setzero_ps(); + __m256i zeroes = _mm256_set1_epi32(0); + ones = _mm256_set1_epi32(1); + __m256i allones = _mm256_set1_epi32(0xffffffff); + twos = _mm256_set1_epi32(2); + fours = _mm256_set1_epi32(4); + + cp1 = _mm256_set1_ps(1.0); + cp2 = _mm256_set1_ps(0.08333333333333333); + cp3 = _mm256_set1_ps(0.002777777777777778); + 
cp4 = _mm256_set1_ps(4.96031746031746e-05); + cp5 = _mm256_set1_ps(5.511463844797178e-07); + union bit256 condition1; + union bit256 condition3; + + for (; number < eighthPoints; number++) { + + aVal = _mm256_loadu_ps(aPtr); + // s = fabs(aVal) + s = _mm256_sub_ps(aVal, + _mm256_and_ps(_mm256_mul_ps(aVal, ftwos), + _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS))); + // q = (int) (s * (4/pi)), floor(aVal / (pi/4)) + q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi))); + // r = q + q&1, q indicates quadrant, r gives + r = _mm256_cvtepi32_ps(_mm256_add_epi32(q, _mm256_and_si256(q, ones))); + + s = _mm256_sub_ps(s, _mm256_mul_ps(r, pio4A)); + s = _mm256_sub_ps(s, _mm256_mul_ps(r, pio4B)); + s = _mm256_sub_ps(s, _mm256_mul_ps(r, pio4C)); + + s = _mm256_div_ps( + s, + _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction + s = _mm256_mul_ps(s, s); + // Evaluate Taylor series + s = _mm256_mul_ps( + _mm256_add_ps( + _mm256_mul_ps( + _mm256_sub_ps( + _mm256_mul_ps( + _mm256_add_ps( + _mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(s, cp5), cp4), + s), + cp3), + s), + cp2), + s), + cp1), + s); + + for (i = 0; i < 3; i++) + s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s)); + s = _mm256_div_ps(s, ftwos); + + sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s)); + cosine = _mm256_sub_ps(fones, s); + + // if(((q+1)&2) != 0) { cosine=sine;} + condition1.int_vec = + _mm256_cmpeq_epi32(_mm256_and_si256(_mm256_add_epi32(q, ones), twos), zeroes); + condition1.int_vec = _mm256_xor_si256(allones, condition1.int_vec); + + // if(((q+2)&4) != 0) { cosine = -cosine;} + condition3.int_vec = _mm256_cmpeq_epi32( + _mm256_and_si256(_mm256_add_epi32(q, twos), fours), zeroes); + condition3.int_vec = _mm256_xor_si256(allones, condition3.int_vec); + + cosine = _mm256_add_ps( + cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1.float_vec)); + cosine = _mm256_sub_ps(cosine, + _mm256_and_ps(_mm256_mul_ps(cosine, _mm256_set1_ps(2.0f)), + condition3.float_vec)); + _mm256_storeu_ps(bPtr, cosine); + aPtr += 8; + bPtr += 8; + } + + number = eighthPoints * 8; + for (; number < num_points; number++) { + *bPtr++ = cos(*aPtr++); + } } #endif /* LV_HAVE_AVX2 for unaligned */ @@ -529,71 +625,88 @@ static inline void static inline void volk_32f_cos_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int num_points) { - float* bPtr = bVector; - const float* aPtr = aVector; - - unsigned int number = 0; - unsigned int quarterPoints = num_points / 4; - unsigned int i = 0; - - __m128 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, fzeroes; - __m128 sine, cosine, condition1, condition3; - __m128i q, r, ones, twos, fours; - - m4pi = _mm_set1_ps(1.273239545); - pio4A = _mm_set1_ps(0.78515625); - pio4B = _mm_set1_ps(0.241876e-3); - ffours = _mm_set1_ps(4.0); - ftwos = _mm_set1_ps(2.0); - fones = _mm_set1_ps(1.0); - fzeroes = _mm_setzero_ps(); - ones = _mm_set1_epi32(1); - twos = _mm_set1_epi32(2); - fours = _mm_set1_epi32(4); - - cp1 = _mm_set1_ps(1.0); - cp2 = _mm_set1_ps(0.83333333e-1); - cp3 = _mm_set1_ps(0.2777778e-2); - cp4 = _mm_set1_ps(0.49603e-4); - cp5 = _mm_set1_ps(0.551e-6); - - for(;number < quarterPoints; number++){ - aVal = _mm_loadu_ps(aPtr); - s = _mm_sub_ps(aVal, _mm_and_ps(_mm_mul_ps(aVal, ftwos), _mm_cmplt_ps(aVal, fzeroes))); - q = _mm_cvtps_epi32(_mm_floor_ps(_mm_mul_ps(s, m4pi))); - r = _mm_add_epi32(q, _mm_and_si128(q, ones)); - - s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4A)); - s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), 
pio4B)); - - s = _mm_div_ps(s, _mm_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction - s = _mm_mul_ps(s, s); - // Evaluate Taylor series - s = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s, cp5), cp4), s), cp3), s), cp2), s), cp1), s); - - for(i = 0; i < 3; i++){ - s = _mm_mul_ps(s, _mm_sub_ps(ffours, s)); - } - s = _mm_div_ps(s, ftwos); - - sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s)); - cosine = _mm_sub_ps(fones, s); - - condition1 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, ones), twos)), fzeroes); + float* bPtr = bVector; + const float* aPtr = aVector; - condition3 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, twos), fours)), fzeroes); - - cosine = _mm_add_ps(cosine, _mm_and_ps(_mm_sub_ps(sine, cosine), condition1)); - cosine = _mm_sub_ps(cosine, _mm_and_ps(_mm_mul_ps(cosine, _mm_set1_ps(2.0f)), condition3)); - _mm_storeu_ps(bPtr, cosine); - aPtr += 4; - bPtr += 4; - } + unsigned int number = 0; + unsigned int quarterPoints = num_points / 4; + unsigned int i = 0; + + __m128 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, + fzeroes; + __m128 sine, cosine, condition1, condition3; + __m128i q, r, ones, twos, fours; + + m4pi = _mm_set1_ps(1.273239545); + pio4A = _mm_set1_ps(0.78515625); + pio4B = _mm_set1_ps(0.241876e-3); + ffours = _mm_set1_ps(4.0); + ftwos = _mm_set1_ps(2.0); + fones = _mm_set1_ps(1.0); + fzeroes = _mm_setzero_ps(); + ones = _mm_set1_epi32(1); + twos = _mm_set1_epi32(2); + fours = _mm_set1_epi32(4); + + cp1 = _mm_set1_ps(1.0); + cp2 = _mm_set1_ps(0.83333333e-1); + cp3 = _mm_set1_ps(0.2777778e-2); + cp4 = _mm_set1_ps(0.49603e-4); + cp5 = _mm_set1_ps(0.551e-6); + + for (; number < quarterPoints; number++) { + aVal = _mm_loadu_ps(aPtr); + s = _mm_sub_ps(aVal, + _mm_and_ps(_mm_mul_ps(aVal, ftwos), _mm_cmplt_ps(aVal, fzeroes))); + q = _mm_cvtps_epi32(_mm_floor_ps(_mm_mul_ps(s, m4pi))); + r = _mm_add_epi32(q, _mm_and_si128(q, ones)); + + s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4A)); + s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4B)); + + s = _mm_div_ps( + s, _mm_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction + s = _mm_mul_ps(s, s); + // Evaluate Taylor series + s = _mm_mul_ps( + _mm_add_ps( + _mm_mul_ps( + _mm_sub_ps( + _mm_mul_ps( + _mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s, cp5), cp4), s), + cp3), + s), + cp2), + s), + cp1), + s); + + for (i = 0; i < 3; i++) { + s = _mm_mul_ps(s, _mm_sub_ps(ffours, s)); + } + s = _mm_div_ps(s, ftwos); + + sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s)); + cosine = _mm_sub_ps(fones, s); + + condition1 = _mm_cmpneq_ps( + _mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, ones), twos)), fzeroes); + + condition3 = _mm_cmpneq_ps( + _mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, twos), fours)), fzeroes); + + cosine = _mm_add_ps(cosine, _mm_and_ps(_mm_sub_ps(sine, cosine), condition1)); + cosine = _mm_sub_ps( + cosine, _mm_and_ps(_mm_mul_ps(cosine, _mm_set1_ps(2.0f)), condition3)); + _mm_storeu_ps(bPtr, cosine); + aPtr += 4; + bPtr += 4; + } - number = quarterPoints * 4; - for(;number < num_points; number++){ - *bPtr++ = cosf(*aPtr++); - } + number = quarterPoints * 4; + for (; number < num_points; number++) { + *bPtr++ = cosf(*aPtr++); + } } #endif /* LV_HAVE_SSE4_1 for unaligned */ @@ -606,52 +719,55 @@ volk_32f_cos_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int num * Shibata, Naoki, "Efficient evaluation methods of elementary 
functions * suitable for SIMD computation," in Springer-Verlag 2010 */ -static inline void -volk_32f_cos_32f_generic_fast(float* bVector, const float* aVector, unsigned int num_points) +static inline void volk_32f_cos_32f_generic_fast(float* bVector, + const float* aVector, + unsigned int num_points) { - float* bPtr = bVector; - const float* aPtr = aVector; - - float m4pi = 1.273239544735162542821171882678754627704620361328125; - float pio4A = 0.7853981554508209228515625; - float pio4B = 0.794662735614792836713604629039764404296875e-8; - float pio4C = 0.306161699786838294306516483068750264552437361480769e-16; - int N = 3; // order of argument reduction - - unsigned int number; - for(number = 0; number < num_points; number++){ - float s = fabs(*aPtr); - int q = (int)(s * m4pi); - int r = q + (q&1); - s -= r * pio4A; - s -= r * pio4B; - s -= r * pio4C; - - s = s * 0.125; // 2^-N (<--3) - s = s*s; - s = ((((s/1814400. - 1.0/20160.0)*s + 1.0/360.0)*s - 1.0/12.0)*s + 1.0)*s; - - int i; - for(i=0; i < N; ++i) { - s = (4.0-s)*s; - } - s = s/2.0; - - float sine = sqrt((2.0-s)*s); - float cosine = 1-s; - - if (((q+1) & 2) != 0) { - s = cosine; - cosine = sine; - sine = s; - } - if (((q+2) & 4) != 0) { - cosine = -cosine; - } - *bPtr = cosine; - bPtr++; - aPtr++; - } + float* bPtr = bVector; + const float* aPtr = aVector; + + float m4pi = 1.273239544735162542821171882678754627704620361328125; + float pio4A = 0.7853981554508209228515625; + float pio4B = 0.794662735614792836713604629039764404296875e-8; + float pio4C = 0.306161699786838294306516483068750264552437361480769e-16; + int N = 3; // order of argument reduction + + unsigned int number; + for (number = 0; number < num_points; number++) { + float s = fabs(*aPtr); + int q = (int)(s * m4pi); + int r = q + (q & 1); + s -= r * pio4A; + s -= r * pio4B; + s -= r * pio4C; + + s = s * 0.125; // 2^-N (<--3) + s = s * s; + s = ((((s / 1814400. 
- 1.0 / 20160.0) * s + 1.0 / 360.0) * s - 1.0 / 12.0) * s + + 1.0) * + s; + + int i; + for (i = 0; i < N; ++i) { + s = (4.0 - s) * s; + } + s = s / 2.0; + + float sine = sqrt((2.0 - s) * s); + float cosine = 1 - s; + + if (((q + 1) & 2) != 0) { + s = cosine; + cosine = sine; + sine = s; + } + if (((q + 2) & 4) != 0) { + cosine = -cosine; + } + *bPtr = cosine; + bPtr++; + aPtr++; + } } #endif /* LV_HAVE_GENERIC */ @@ -662,13 +778,13 @@ volk_32f_cos_32f_generic_fast(float* bVector, const float* aVector, unsigned int static inline void volk_32f_cos_32f_generic(float* bVector, const float* aVector, unsigned int num_points) { - float* bPtr = bVector; - const float* aPtr = aVector; - unsigned int number = 0; + float* bPtr = bVector; + const float* aPtr = aVector; + unsigned int number = 0; - for(; number < num_points; number++){ - *bPtr++ = cosf(*aPtr++); - } + for (; number < num_points; number++) { + *bPtr++ = cosf(*aPtr++); + } } #endif /* LV_HAVE_GENERIC */ @@ -679,30 +795,29 @@ volk_32f_cos_32f_generic(float* bVector, const float* aVector, unsigned int num_ #include static inline void -volk_32f_cos_32f_neon(float* bVector, const float* aVector, - unsigned int num_points) +volk_32f_cos_32f_neon(float* bVector, const float* aVector, unsigned int num_points) { unsigned int number = 0; unsigned int quarter_points = num_points / 4; float* bVectorPtr = bVector; const float* aVectorPtr = aVector; - + float32x4_t b_vec; float32x4_t a_vec; - - for(number = 0; number < quarter_points; number++) { + + for (number = 0; number < quarter_points; number++) { a_vec = vld1q_f32(aVectorPtr); // Prefetch next one, speeds things up - __VOLK_PREFETCH(aVectorPtr+4); + __VOLK_PREFETCH(aVectorPtr + 4); b_vec = _vcosq_f32(a_vec); vst1q_f32(bVectorPtr, b_vec); // move pointers ahead - bVectorPtr+=4; - aVectorPtr+=4; + bVectorPtr += 4; + aVectorPtr += 4; } - + // Deal with the rest - for(number = quarter_points * 4; number < num_points; number++) { + for (number = quarter_points * 4; number < num_points; number++) { *bVectorPtr++ = cosf(*aVectorPtr++); } } diff --git a/kernels/volk/volk_32f_expfast_32f.h b/kernels/volk/volk_32f_expfast_32f.h index ecb4914..45de3f9 100644 --- a/kernels/volk/volk_32f_expfast_32f.h +++ b/kernels/volk/volk_32f_expfast_32f.h @@ -30,8 +30,8 @@ * * Dispatcher Prototype * \code - * void volk_32f_expfast_32f(float* bVector, const float* aVector, unsigned int num_points) - * \endcode + * void volk_32f_expfast_32f(float* bVector, const float* aVector, unsigned int + * num_points) \endcode * * \b Inputs * \li aVector: Input vector of floats. 
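For readers of the cosine kernels above (the SIMD variants and volk_32f_cos_32f_generic_fast share the same scheme), the algebra behind the polynomial, the (4 - s) * s loop and the sqrt((2 - s) * s) reconstruction is sketched below as a reading aid; it is not part of the patch itself.

/* Let t(a) = 2 * (1 - cos(a)). Its Taylor series is
 *     t(a) = a^2 - a^4/12 + a^6/360 - a^8/20160 + a^10/1814400 - ...
 * which is the cp1..cp5 (resp. 1/1814400 ... 1/12) polynomial above,
 * evaluated on the argument scaled down by 2^-N (N = 3, the division by 8).
 * The scaling is undone with the doubling identity
 *     t(2a) = 2 - 2*cos(2a) = 4 - 4*cos(a)^2
 *           = (2 - 2*cos(a)) * (2 + 2*cos(a)) = (4 - t(a)) * t(a),
 * applied N times by the short for-loop. After s = t/2 = 1 - cos(u),
 *     cosine = 1 - s   and   sine = sqrt((2 - s) * s) = sqrt(1 - cos(u)^2),
 * and the two condition masks use q to place the result in the correct
 * quadrant with the correct sign. */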
@@ -62,9 +62,9 @@ * \endcode */ -#include -#include #include +#include +#include #define Mln2 0.6931471805f #define A 8388608.0f @@ -79,34 +79,35 @@ #include -static inline void - volk_32f_expfast_32f_a_avx_fma(float* bVector, const float* aVector, unsigned int num_points) +static inline void volk_32f_expfast_32f_a_avx_fma(float* bVector, + const float* aVector, + unsigned int num_points) { - float* bPtr = bVector; - const float* aPtr = aVector; - - unsigned int number = 0; - const unsigned int eighthPoints = num_points / 8; - - __m256 aVal, bVal, a, b; - __m256i exp; - a = _mm256_set1_ps(A/Mln2); - b = _mm256_set1_ps(B-C); - - for(;number < eighthPoints; number++){ - aVal = _mm256_load_ps(aPtr); - exp = _mm256_cvtps_epi32(_mm256_fmadd_ps(a,aVal, b)); - bVal = _mm256_castsi256_ps(exp); - - _mm256_store_ps(bPtr, bVal); - aPtr += 8; - bPtr += 8; - } - - number = eighthPoints * 8; - for(;number < num_points; number++){ - *bPtr++ = expf(*aPtr++); - } + float* bPtr = bVector; + const float* aPtr = aVector; + + unsigned int number = 0; + const unsigned int eighthPoints = num_points / 8; + + __m256 aVal, bVal, a, b; + __m256i exp; + a = _mm256_set1_ps(A / Mln2); + b = _mm256_set1_ps(B - C); + + for (; number < eighthPoints; number++) { + aVal = _mm256_load_ps(aPtr); + exp = _mm256_cvtps_epi32(_mm256_fmadd_ps(a, aVal, b)); + bVal = _mm256_castsi256_ps(exp); + + _mm256_store_ps(bPtr, bVal); + aPtr += 8; + bPtr += 8; + } + + number = eighthPoints * 8; + for (; number < num_points; number++) { + *bPtr++ = expf(*aPtr++); + } } #endif /* LV_HAVE_AVX && LV_HAVE_FMA for aligned */ @@ -116,33 +117,33 @@ static inline void #include static inline void - volk_32f_expfast_32f_a_avx(float* bVector, const float* aVector, unsigned int num_points) +volk_32f_expfast_32f_a_avx(float* bVector, const float* aVector, unsigned int num_points) { - float* bPtr = bVector; - const float* aPtr = aVector; - - unsigned int number = 0; - const unsigned int eighthPoints = num_points / 8; - - __m256 aVal, bVal, a, b; - __m256i exp; - a = _mm256_set1_ps(A/Mln2); - b = _mm256_set1_ps(B-C); - - for(;number < eighthPoints; number++){ - aVal = _mm256_load_ps(aPtr); - exp = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_mul_ps(a,aVal), b)); - bVal = _mm256_castsi256_ps(exp); - - _mm256_store_ps(bPtr, bVal); - aPtr += 8; - bPtr += 8; - } - - number = eighthPoints * 8; - for(;number < num_points; number++){ - *bPtr++ = expf(*aPtr++); - } + float* bPtr = bVector; + const float* aPtr = aVector; + + unsigned int number = 0; + const unsigned int eighthPoints = num_points / 8; + + __m256 aVal, bVal, a, b; + __m256i exp; + a = _mm256_set1_ps(A / Mln2); + b = _mm256_set1_ps(B - C); + + for (; number < eighthPoints; number++) { + aVal = _mm256_load_ps(aPtr); + exp = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_mul_ps(a, aVal), b)); + bVal = _mm256_castsi256_ps(exp); + + _mm256_store_ps(bPtr, bVal); + aPtr += 8; + bPtr += 8; + } + + number = eighthPoints * 8; + for (; number < num_points; number++) { + *bPtr++ = expf(*aPtr++); + } } #endif /* LV_HAVE_AVX for aligned */ @@ -150,34 +151,35 @@ static inline void #ifdef LV_HAVE_SSE4_1 #include -static inline void -volk_32f_expfast_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int num_points) +static inline void volk_32f_expfast_32f_a_sse4_1(float* bVector, + const float* aVector, + unsigned int num_points) { - float* bPtr = bVector; - const float* aPtr = aVector; - - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - __m128 aVal, bVal, a, b; - __m128i exp; - a = 
_mm_set1_ps(A/Mln2); - b = _mm_set1_ps(B-C); - - for(;number < quarterPoints; number++){ - aVal = _mm_load_ps(aPtr); - exp = _mm_cvtps_epi32(_mm_add_ps(_mm_mul_ps(a,aVal), b)); - bVal = _mm_castsi128_ps(exp); - - _mm_store_ps(bPtr, bVal); - aPtr += 4; - bPtr += 4; - } - - number = quarterPoints * 4; - for(;number < num_points; number++){ - *bPtr++ = expf(*aPtr++); - } + float* bPtr = bVector; + const float* aPtr = aVector; + + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + __m128 aVal, bVal, a, b; + __m128i exp; + a = _mm_set1_ps(A / Mln2); + b = _mm_set1_ps(B - C); + + for (; number < quarterPoints; number++) { + aVal = _mm_load_ps(aPtr); + exp = _mm_cvtps_epi32(_mm_add_ps(_mm_mul_ps(a, aVal), b)); + bVal = _mm_castsi128_ps(exp); + + _mm_store_ps(bPtr, bVal); + aPtr += 4; + bPtr += 4; + } + + number = quarterPoints * 4; + for (; number < num_points; number++) { + *bPtr++ = expf(*aPtr++); + } } #endif /* LV_HAVE_SSE4_1 for aligned */ @@ -190,34 +192,35 @@ volk_32f_expfast_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int #if LV_HAVE_AVX && LV_HAVE_FMA #include -static inline void -volk_32f_expfast_32f_u_avx_fma(float* bVector, const float* aVector, unsigned int num_points) +static inline void volk_32f_expfast_32f_u_avx_fma(float* bVector, + const float* aVector, + unsigned int num_points) { - float* bPtr = bVector; - const float* aPtr = aVector; - - unsigned int number = 0; - const unsigned int eighthPoints = num_points / 8; - - __m256 aVal, bVal, a, b; - __m256i exp; - a = _mm256_set1_ps(A/Mln2); - b = _mm256_set1_ps(B-C); - - for(;number < eighthPoints; number++){ - aVal = _mm256_loadu_ps(aPtr); - exp = _mm256_cvtps_epi32(_mm256_fmadd_ps(a,aVal, b)); - bVal = _mm256_castsi256_ps(exp); - - _mm256_storeu_ps(bPtr, bVal); - aPtr += 8; - bPtr += 8; - } - - number = eighthPoints * 8; - for(;number < num_points; number++){ - *bPtr++ = expf(*aPtr++); - } + float* bPtr = bVector; + const float* aPtr = aVector; + + unsigned int number = 0; + const unsigned int eighthPoints = num_points / 8; + + __m256 aVal, bVal, a, b; + __m256i exp; + a = _mm256_set1_ps(A / Mln2); + b = _mm256_set1_ps(B - C); + + for (; number < eighthPoints; number++) { + aVal = _mm256_loadu_ps(aPtr); + exp = _mm256_cvtps_epi32(_mm256_fmadd_ps(a, aVal, b)); + bVal = _mm256_castsi256_ps(exp); + + _mm256_storeu_ps(bPtr, bVal); + aPtr += 8; + bPtr += 8; + } + + number = eighthPoints * 8; + for (; number < num_points; number++) { + *bPtr++ = expf(*aPtr++); + } } #endif /* LV_HAVE_AVX && LV_HAVE_FMA for unaligned */ @@ -228,31 +231,31 @@ volk_32f_expfast_32f_u_avx_fma(float* bVector, const float* aVector, unsigned in static inline void volk_32f_expfast_32f_u_avx(float* bVector, const float* aVector, unsigned int num_points) { - float* bPtr = bVector; - const float* aPtr = aVector; - - unsigned int number = 0; - const unsigned int eighthPoints = num_points / 8; - - __m256 aVal, bVal, a, b; - __m256i exp; - a = _mm256_set1_ps(A/Mln2); - b = _mm256_set1_ps(B-C); - - for(;number < eighthPoints; number++){ - aVal = _mm256_loadu_ps(aPtr); - exp = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_mul_ps(a,aVal), b)); - bVal = _mm256_castsi256_ps(exp); - - _mm256_storeu_ps(bPtr, bVal); - aPtr += 8; - bPtr += 8; - } - - number = eighthPoints * 8; - for(;number < num_points; number++){ - *bPtr++ = expf(*aPtr++); - } + float* bPtr = bVector; + const float* aPtr = aVector; + + unsigned int number = 0; + const unsigned int eighthPoints = num_points / 8; + + __m256 aVal, bVal, a, b; + __m256i exp; + a = 
_mm256_set1_ps(A / Mln2); + b = _mm256_set1_ps(B - C); + + for (; number < eighthPoints; number++) { + aVal = _mm256_loadu_ps(aPtr); + exp = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_mul_ps(a, aVal), b)); + bVal = _mm256_castsi256_ps(exp); + + _mm256_storeu_ps(bPtr, bVal); + aPtr += 8; + bPtr += 8; + } + + number = eighthPoints * 8; + for (; number < num_points; number++) { + *bPtr++ = expf(*aPtr++); + } } #endif /* LV_HAVE_AVX for unaligned */ @@ -261,34 +264,35 @@ volk_32f_expfast_32f_u_avx(float* bVector, const float* aVector, unsigned int nu #ifdef LV_HAVE_SSE4_1 #include -static inline void -volk_32f_expfast_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int num_points) +static inline void volk_32f_expfast_32f_u_sse4_1(float* bVector, + const float* aVector, + unsigned int num_points) { - float* bPtr = bVector; - const float* aPtr = aVector; - - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - __m128 aVal, bVal, a, b; - __m128i exp; - a = _mm_set1_ps(A/Mln2); - b = _mm_set1_ps(B-C); - - for(;number < quarterPoints; number++){ - aVal = _mm_loadu_ps(aPtr); - exp = _mm_cvtps_epi32(_mm_add_ps(_mm_mul_ps(a,aVal), b)); - bVal = _mm_castsi128_ps(exp); - - _mm_storeu_ps(bPtr, bVal); - aPtr += 4; - bPtr += 4; - } - - number = quarterPoints * 4; - for(;number < num_points; number++){ - *bPtr++ = expf(*aPtr++); - } + float* bPtr = bVector; + const float* aPtr = aVector; + + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + __m128 aVal, bVal, a, b; + __m128i exp; + a = _mm_set1_ps(A / Mln2); + b = _mm_set1_ps(B - C); + + for (; number < quarterPoints; number++) { + aVal = _mm_loadu_ps(aPtr); + exp = _mm_cvtps_epi32(_mm_add_ps(_mm_mul_ps(a, aVal), b)); + bVal = _mm_castsi128_ps(exp); + + _mm_storeu_ps(bPtr, bVal); + aPtr += 4; + bPtr += 4; + } + + number = quarterPoints * 4; + for (; number < num_points; number++) { + *bPtr++ = expf(*aPtr++); + } } #endif /* LV_HAVE_SSE4_1 for unaligned */ @@ -296,16 +300,17 @@ volk_32f_expfast_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int #ifdef LV_HAVE_GENERIC -static inline void -volk_32f_expfast_32f_generic(float* bVector, const float* aVector, unsigned int num_points) +static inline void volk_32f_expfast_32f_generic(float* bVector, + const float* aVector, + unsigned int num_points) { - float* bPtr = bVector; - const float* aPtr = aVector; - unsigned int number = 0; + float* bPtr = bVector; + const float* aPtr = aVector; + unsigned int number = 0; - for(number = 0; number < num_points; number++){ - *bPtr++ = expf(*aPtr++); - } + for (number = 0; number < num_points; number++) { + *bPtr++ = expf(*aPtr++); + } } #endif /* LV_HAVE_GENERIC */ diff --git a/kernels/volk/volk_32f_index_max_16u.h b/kernels/volk/volk_32f_index_max_16u.h index 7ca6928..3ee10f4 100644 --- a/kernels/volk/volk_32f_index_max_16u.h +++ b/kernels/volk/volk_32f_index_max_16u.h @@ -71,72 +71,71 @@ #ifndef INCLUDED_volk_32f_index_max_16u_a_H #define INCLUDED_volk_32f_index_max_16u_a_H -#include -#include #include #include #include +#include #ifdef LV_HAVE_AVX #include static inline void -volk_32f_index_max_16u_a_avx(uint16_t* target, const float* src0, - uint32_t num_points) +volk_32f_index_max_16u_a_avx(uint16_t* target, const float* src0, uint32_t num_points) { - num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points; - - uint32_t number = 0; - const uint32_t eighthPoints = num_points / 8; + num_points = (num_points > USHRT_MAX) ? 
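The SIMD expfast kernels above (AVX/FMA, AVX and SSE4.1, aligned and unaligned) all compute the same per-element approximation, while the generic fallback simply calls expf. A minimal scalar sketch of that approximation follows; it assumes the A, B, C and Mln2 macros defined near the top of this header are in scope, and the helper name is illustrative only.

#include <math.h>
#include <stdint.h>
#include <string.h>

/* Sketch only: scalar equivalent of cvtps_epi32(a * x + b) followed by the
 * castsi*_ps reinterpretation, with a = A / Mln2 and b = B - C as above.
 * Building the IEEE-754 bit pattern this way makes the exponent field carry
 * roughly x / ln(2), so the reinterpreted float approximates expf(x). */
static inline float volk_expfast_scalar_sketch(float x)
{
    int32_t bits = (int32_t)lrintf((A / Mln2) * x + (B - C));
    float result;
    memcpy(&result, &bits, sizeof(result)); /* reinterpret integer bits as float */
    return result;
}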
USHRT_MAX : num_points; - float* inputPtr = (float*)src0; + uint32_t number = 0; + const uint32_t eighthPoints = num_points / 8; - __m256 indexIncrementValues = _mm256_set1_ps(8); - __m256 currentIndexes = _mm256_set_ps(-1,-2,-3,-4,-5,-6,-7,-8); + float* inputPtr = (float*)src0; - float max = src0[0]; - float index = 0; - __m256 maxValues = _mm256_set1_ps(max); - __m256 maxValuesIndex = _mm256_setzero_ps(); - __m256 compareResults; - __m256 currentValues; + __m256 indexIncrementValues = _mm256_set1_ps(8); + __m256 currentIndexes = _mm256_set_ps(-1, -2, -3, -4, -5, -6, -7, -8); - __VOLK_ATTR_ALIGNED(32) float maxValuesBuffer[8]; - __VOLK_ATTR_ALIGNED(32) float maxIndexesBuffer[8]; + float max = src0[0]; + float index = 0; + __m256 maxValues = _mm256_set1_ps(max); + __m256 maxValuesIndex = _mm256_setzero_ps(); + __m256 compareResults; + __m256 currentValues; - for(;number < eighthPoints; number++){ + __VOLK_ATTR_ALIGNED(32) float maxValuesBuffer[8]; + __VOLK_ATTR_ALIGNED(32) float maxIndexesBuffer[8]; - currentValues = _mm256_load_ps(inputPtr); inputPtr += 8; - currentIndexes = _mm256_add_ps(currentIndexes, indexIncrementValues); + for (; number < eighthPoints; number++) { - compareResults = _mm256_cmp_ps(currentValues, maxValues, _CMP_GT_OS); + currentValues = _mm256_load_ps(inputPtr); + inputPtr += 8; + currentIndexes = _mm256_add_ps(currentIndexes, indexIncrementValues); - maxValuesIndex = _mm256_blendv_ps(maxValuesIndex, currentIndexes, compareResults); - maxValues = _mm256_blendv_ps(maxValues, currentValues, compareResults); - } + compareResults = _mm256_cmp_ps(currentValues, maxValues, _CMP_GT_OS); - // Calculate the largest value from the remaining 4 points - _mm256_store_ps(maxValuesBuffer, maxValues); - _mm256_store_ps(maxIndexesBuffer, maxValuesIndex); + maxValuesIndex = _mm256_blendv_ps(maxValuesIndex, currentIndexes, compareResults); + maxValues = _mm256_blendv_ps(maxValues, currentValues, compareResults); + } - for(number = 0; number < 8; number++){ - if(maxValuesBuffer[number] > max){ - index = maxIndexesBuffer[number]; - max = maxValuesBuffer[number]; - } else if(maxValuesBuffer[number] == max){ - if (index > maxIndexesBuffer[number]) - index = maxIndexesBuffer[number]; + // Calculate the largest value from the remaining 4 points + _mm256_store_ps(maxValuesBuffer, maxValues); + _mm256_store_ps(maxIndexesBuffer, maxValuesIndex); + + for (number = 0; number < 8; number++) { + if (maxValuesBuffer[number] > max) { + index = maxIndexesBuffer[number]; + max = maxValuesBuffer[number]; + } else if (maxValuesBuffer[number] == max) { + if (index > maxIndexesBuffer[number]) + index = maxIndexesBuffer[number]; + } } - } - number = eighthPoints * 8; - for(;number < num_points; number++){ - if(src0[number] > max){ - index = number; - max = src0[number]; + number = eighthPoints * 8; + for (; number < num_points; number++) { + if (src0[number] > max) { + index = number; + max = src0[number]; + } } - } - target[0] = (uint16_t)index; + target[0] = (uint16_t)index; } #endif /*LV_HAVE_AVX*/ @@ -145,62 +144,62 @@ volk_32f_index_max_16u_a_avx(uint16_t* target, const float* src0, #include static inline void -volk_32f_index_max_16u_a_sse4_1(uint16_t* target, const float* src0, - uint32_t num_points) +volk_32f_index_max_16u_a_sse4_1(uint16_t* target, const float* src0, uint32_t num_points) { - num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points; - - uint32_t number = 0; - const uint32_t quarterPoints = num_points / 4; + num_points = (num_points > USHRT_MAX) ? 
USHRT_MAX : num_points; - float* inputPtr = (float*)src0; + uint32_t number = 0; + const uint32_t quarterPoints = num_points / 4; - __m128 indexIncrementValues = _mm_set1_ps(4); - __m128 currentIndexes = _mm_set_ps(-1,-2,-3,-4); + float* inputPtr = (float*)src0; - float max = src0[0]; - float index = 0; - __m128 maxValues = _mm_set1_ps(max); - __m128 maxValuesIndex = _mm_setzero_ps(); - __m128 compareResults; - __m128 currentValues; + __m128 indexIncrementValues = _mm_set1_ps(4); + __m128 currentIndexes = _mm_set_ps(-1, -2, -3, -4); - __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4]; - __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4]; + float max = src0[0]; + float index = 0; + __m128 maxValues = _mm_set1_ps(max); + __m128 maxValuesIndex = _mm_setzero_ps(); + __m128 compareResults; + __m128 currentValues; - for(;number < quarterPoints; number++){ + __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4]; + __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4]; - currentValues = _mm_load_ps(inputPtr); inputPtr += 4; - currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues); + for (; number < quarterPoints; number++) { - compareResults = _mm_cmpgt_ps(currentValues, maxValues); + currentValues = _mm_load_ps(inputPtr); + inputPtr += 4; + currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues); - maxValuesIndex = _mm_blendv_ps(maxValuesIndex, currentIndexes, compareResults); - maxValues = _mm_blendv_ps(maxValues, currentValues, compareResults); - } + compareResults = _mm_cmpgt_ps(currentValues, maxValues); - // Calculate the largest value from the remaining 4 points - _mm_store_ps(maxValuesBuffer, maxValues); - _mm_store_ps(maxIndexesBuffer, maxValuesIndex); + maxValuesIndex = _mm_blendv_ps(maxValuesIndex, currentIndexes, compareResults); + maxValues = _mm_blendv_ps(maxValues, currentValues, compareResults); + } - for(number = 0; number < 4; number++){ - if(maxValuesBuffer[number] > max){ - index = maxIndexesBuffer[number]; - max = maxValuesBuffer[number]; - } else if(maxValuesBuffer[number] == max){ - if (index > maxIndexesBuffer[number]) - index = maxIndexesBuffer[number]; + // Calculate the largest value from the remaining 4 points + _mm_store_ps(maxValuesBuffer, maxValues); + _mm_store_ps(maxIndexesBuffer, maxValuesIndex); + + for (number = 0; number < 4; number++) { + if (maxValuesBuffer[number] > max) { + index = maxIndexesBuffer[number]; + max = maxValuesBuffer[number]; + } else if (maxValuesBuffer[number] == max) { + if (index > maxIndexesBuffer[number]) + index = maxIndexesBuffer[number]; + } } - } - number = quarterPoints * 4; - for(;number < num_points; number++){ - if(src0[number] > max){ - index = number; - max = src0[number]; + number = quarterPoints * 4; + for (; number < num_points; number++) { + if (src0[number] > max) { + index = number; + max = src0[number]; + } } - } - target[0] = (uint16_t)index; + target[0] = (uint16_t)index; } #endif /*LV_HAVE_SSE4_1*/ @@ -211,64 +210,64 @@ volk_32f_index_max_16u_a_sse4_1(uint16_t* target, const float* src0, #include static inline void -volk_32f_index_max_16u_a_sse(uint16_t* target, const float* src0, - uint32_t num_points) +volk_32f_index_max_16u_a_sse(uint16_t* target, const float* src0, uint32_t num_points) { - num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points; - - uint32_t number = 0; - const uint32_t quarterPoints = num_points / 4; + num_points = (num_points > USHRT_MAX) ? 
USHRT_MAX : num_points; - float* inputPtr = (float*)src0; + uint32_t number = 0; + const uint32_t quarterPoints = num_points / 4; - __m128 indexIncrementValues = _mm_set1_ps(4); - __m128 currentIndexes = _mm_set_ps(-1,-2,-3,-4); + float* inputPtr = (float*)src0; - float max = src0[0]; - float index = 0; - __m128 maxValues = _mm_set1_ps(max); - __m128 maxValuesIndex = _mm_setzero_ps(); - __m128 compareResults; - __m128 currentValues; + __m128 indexIncrementValues = _mm_set1_ps(4); + __m128 currentIndexes = _mm_set_ps(-1, -2, -3, -4); - __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4]; - __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4]; + float max = src0[0]; + float index = 0; + __m128 maxValues = _mm_set1_ps(max); + __m128 maxValuesIndex = _mm_setzero_ps(); + __m128 compareResults; + __m128 currentValues; - for(;number < quarterPoints; number++){ + __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4]; + __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4]; - currentValues = _mm_load_ps(inputPtr); inputPtr += 4; - currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues); + for (; number < quarterPoints; number++) { - compareResults = _mm_cmpgt_ps(currentValues, maxValues); + currentValues = _mm_load_ps(inputPtr); + inputPtr += 4; + currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues); - maxValuesIndex = _mm_or_ps(_mm_and_ps(compareResults, currentIndexes), - _mm_andnot_ps(compareResults, maxValuesIndex)); - maxValues = _mm_or_ps(_mm_and_ps(compareResults, currentValues), - _mm_andnot_ps(compareResults, maxValues)); - } + compareResults = _mm_cmpgt_ps(currentValues, maxValues); - // Calculate the largest value from the remaining 4 points - _mm_store_ps(maxValuesBuffer, maxValues); - _mm_store_ps(maxIndexesBuffer, maxValuesIndex); + maxValuesIndex = _mm_or_ps(_mm_and_ps(compareResults, currentIndexes), + _mm_andnot_ps(compareResults, maxValuesIndex)); + maxValues = _mm_or_ps(_mm_and_ps(compareResults, currentValues), + _mm_andnot_ps(compareResults, maxValues)); + } - for(number = 0; number < 4; number++){ - if(maxValuesBuffer[number] > max){ - index = maxIndexesBuffer[number]; - max = maxValuesBuffer[number]; - } else if(maxValuesBuffer[number] == max){ - if (index > maxIndexesBuffer[number]) - index = maxIndexesBuffer[number]; + // Calculate the largest value from the remaining 4 points + _mm_store_ps(maxValuesBuffer, maxValues); + _mm_store_ps(maxIndexesBuffer, maxValuesIndex); + + for (number = 0; number < 4; number++) { + if (maxValuesBuffer[number] > max) { + index = maxIndexesBuffer[number]; + max = maxValuesBuffer[number]; + } else if (maxValuesBuffer[number] == max) { + if (index > maxIndexesBuffer[number]) + index = maxIndexesBuffer[number]; + } } - } - number = quarterPoints * 4; - for(;number < num_points; number++){ - if(src0[number] > max){ - index = number; - max = src0[number]; + number = quarterPoints * 4; + for (; number < num_points; number++) { + if (src0[number] > max) { + index = number; + max = src0[number]; + } } - } - target[0] = (uint16_t)index; + target[0] = (uint16_t)index; } #endif /*LV_HAVE_SSE*/ @@ -277,23 +276,22 @@ volk_32f_index_max_16u_a_sse(uint16_t* target, const float* src0, #ifdef LV_HAVE_GENERIC static inline void -volk_32f_index_max_16u_generic(uint16_t* target, const float* src0, - uint32_t num_points) +volk_32f_index_max_16u_generic(uint16_t* target, const float* src0, uint32_t num_points) { - num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points; + num_points = (num_points > USHRT_MAX) ? 
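The index-max kernels above keep a per-lane running maximum together with the index at which it was seen. compareResults is an all-ones/all-zeros mask per lane; the SSE variant selects with or(and(mask, new), andnot(mask, old)), and the SSE4.1/AVX variants express the same selection with blendv. A scalar sketch of that mask-based select, with illustrative names (not taken from this patch):

#include <stdint.h>
#include <string.h>

/* mask is 0xFFFFFFFF to take `candidate`, 0x00000000 to keep `current`,
 * mirroring or(and(mask, candidate), andnot(mask, current)) above. */
static inline float mask_select(float current, float candidate, uint32_t mask)
{
    uint32_t cur, cand, out;
    float result;
    memcpy(&cur, &current, sizeof(cur));
    memcpy(&cand, &candidate, sizeof(cand));
    out = (cand & mask) | (cur & ~mask);
    memcpy(&result, &out, sizeof(result));
    return result;
}

/* e.g. per lane: maxValue = mask_select(maxValue, currentValue, compare_mask);
 *                maxIndex = mask_select(maxIndex, currentIndex, compare_mask); */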
USHRT_MAX : num_points; - float max = src0[0]; - uint16_t index = 0; + float max = src0[0]; + uint16_t index = 0; - uint32_t i = 1; + uint32_t i = 1; - for(; i < num_points; ++i) { - if(src0[i] > max) { - index = i; - max = src0[i]; + for (; i < num_points; ++i) { + if (src0[i] > max) { + index = i; + max = src0[i]; + } } - } - target[0] = index; + target[0] = index; } #endif /*LV_HAVE_GENERIC*/ @@ -302,76 +300,74 @@ volk_32f_index_max_16u_generic(uint16_t* target, const float* src0, #endif /*INCLUDED_volk_32f_index_max_16u_a_H*/ - #ifndef INCLUDED_volk_32f_index_max_16u_u_H #define INCLUDED_volk_32f_index_max_16u_u_H -#include -#include #include #include #include +#include #ifdef LV_HAVE_AVX #include static inline void -volk_32f_index_max_16u_u_avx(uint16_t* target, const float* src0, - uint32_t num_points) +volk_32f_index_max_16u_u_avx(uint16_t* target, const float* src0, uint32_t num_points) { - num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points; - - uint32_t number = 0; - const uint32_t eighthPoints = num_points / 8; + num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points; - float* inputPtr = (float*)src0; + uint32_t number = 0; + const uint32_t eighthPoints = num_points / 8; - __m256 indexIncrementValues = _mm256_set1_ps(8); - __m256 currentIndexes = _mm256_set_ps(-1,-2,-3,-4,-5,-6,-7,-8); + float* inputPtr = (float*)src0; - float max = src0[0]; - float index = 0; - __m256 maxValues = _mm256_set1_ps(max); - __m256 maxValuesIndex = _mm256_setzero_ps(); - __m256 compareResults; - __m256 currentValues; + __m256 indexIncrementValues = _mm256_set1_ps(8); + __m256 currentIndexes = _mm256_set_ps(-1, -2, -3, -4, -5, -6, -7, -8); - __VOLK_ATTR_ALIGNED(32) float maxValuesBuffer[8]; - __VOLK_ATTR_ALIGNED(32) float maxIndexesBuffer[8]; + float max = src0[0]; + float index = 0; + __m256 maxValues = _mm256_set1_ps(max); + __m256 maxValuesIndex = _mm256_setzero_ps(); + __m256 compareResults; + __m256 currentValues; - for(;number < eighthPoints; number++){ + __VOLK_ATTR_ALIGNED(32) float maxValuesBuffer[8]; + __VOLK_ATTR_ALIGNED(32) float maxIndexesBuffer[8]; - currentValues = _mm256_loadu_ps(inputPtr); inputPtr += 8; - currentIndexes = _mm256_add_ps(currentIndexes, indexIncrementValues); + for (; number < eighthPoints; number++) { - compareResults = _mm256_cmp_ps(currentValues, maxValues, _CMP_GT_OS); + currentValues = _mm256_loadu_ps(inputPtr); + inputPtr += 8; + currentIndexes = _mm256_add_ps(currentIndexes, indexIncrementValues); - maxValuesIndex = _mm256_blendv_ps(maxValuesIndex, currentIndexes, compareResults); - maxValues = _mm256_blendv_ps(maxValues, currentValues, compareResults); - } + compareResults = _mm256_cmp_ps(currentValues, maxValues, _CMP_GT_OS); - // Calculate the largest value from the remaining 4 points - _mm256_storeu_ps(maxValuesBuffer, maxValues); - _mm256_storeu_ps(maxIndexesBuffer, maxValuesIndex); + maxValuesIndex = _mm256_blendv_ps(maxValuesIndex, currentIndexes, compareResults); + maxValues = _mm256_blendv_ps(maxValues, currentValues, compareResults); + } - for(number = 0; number < 8; number++){ - if(maxValuesBuffer[number] > max){ - index = maxIndexesBuffer[number]; - max = maxValuesBuffer[number]; - } else if(maxValuesBuffer[number] == max){ - if (index > maxIndexesBuffer[number]) - index = maxIndexesBuffer[number]; + // Calculate the largest value from the remaining 4 points + _mm256_storeu_ps(maxValuesBuffer, maxValues); + _mm256_storeu_ps(maxIndexesBuffer, maxValuesIndex); + + for (number = 0; number < 8; number++) { + if (maxValuesBuffer[number] 
> max) { + index = maxIndexesBuffer[number]; + max = maxValuesBuffer[number]; + } else if (maxValuesBuffer[number] == max) { + if (index > maxIndexesBuffer[number]) + index = maxIndexesBuffer[number]; + } } - } - number = eighthPoints * 8; - for(;number < num_points; number++){ - if(src0[number] > max){ - index = number; - max = src0[number]; + number = eighthPoints * 8; + for (; number < num_points; number++) { + if (src0[number] > max) { + index = number; + max = src0[number]; + } } - } - target[0] = (uint16_t)index; + target[0] = (uint16_t)index; } #endif /*LV_HAVE_AVX*/ diff --git a/kernels/volk/volk_32f_index_max_32u.h b/kernels/volk/volk_32f_index_max_32u.h index 318c8e4..315531d 100644 --- a/kernels/volk/volk_32f_index_max_32u.h +++ b/kernels/volk/volk_32f_index_max_32u.h @@ -25,7 +25,8 @@ * * \b Overview * - * Returns Argmax_i x[i]. Finds and returns the index which contains the first maximum value in the given vector. + * Returns Argmax_i x[i]. Finds and returns the index which contains the first maximum + * value in the given vector. * * Dispatcher Prototype * \code @@ -64,70 +65,71 @@ #ifndef INCLUDED_volk_32f_index_max_32u_a_H #define INCLUDED_volk_32f_index_max_32u_a_H -#include -#include #include #include +#include #ifdef LV_HAVE_SSE4_1 -#include +#include static inline void volk_32f_index_max_32u_a_sse4_1(uint32_t* target, const float* src0, uint32_t num_points) { - if(num_points > 0){ - uint32_t number = 0; - const uint32_t quarterPoints = num_points / 4; + if (num_points > 0) { + uint32_t number = 0; + const uint32_t quarterPoints = num_points / 4; - float* inputPtr = (float*)src0; + float* inputPtr = (float*)src0; - __m128 indexIncrementValues = _mm_set1_ps(4); - __m128 currentIndexes = _mm_set_ps(-1,-2,-3,-4); + __m128 indexIncrementValues = _mm_set1_ps(4); + __m128 currentIndexes = _mm_set_ps(-1, -2, -3, -4); - float max = src0[0]; - float index = 0; - __m128 maxValues = _mm_set1_ps(max); - __m128 maxValuesIndex = _mm_setzero_ps(); - __m128 compareResults; - __m128 currentValues; + float max = src0[0]; + float index = 0; + __m128 maxValues = _mm_set1_ps(max); + __m128 maxValuesIndex = _mm_setzero_ps(); + __m128 compareResults; + __m128 currentValues; - __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4]; - __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4]; + __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4]; + __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4]; - for(;number < quarterPoints; number++){ + for (; number < quarterPoints; number++) { - currentValues = _mm_load_ps(inputPtr); inputPtr += 4; - currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues); + currentValues = _mm_load_ps(inputPtr); + inputPtr += 4; + currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues); - compareResults = _mm_cmpgt_ps(currentValues, maxValues); + compareResults = _mm_cmpgt_ps(currentValues, maxValues); - maxValuesIndex = _mm_blendv_ps(maxValuesIndex, currentIndexes, compareResults); - maxValues = _mm_blendv_ps(maxValues, currentValues, compareResults); - } + maxValuesIndex = + _mm_blendv_ps(maxValuesIndex, currentIndexes, compareResults); + maxValues = _mm_blendv_ps(maxValues, currentValues, compareResults); + } - // Calculate the largest value from the remaining 4 points - _mm_store_ps(maxValuesBuffer, maxValues); - _mm_store_ps(maxIndexesBuffer, maxValuesIndex); - - for(number = 0; number < 4; number++){ - if(maxValuesBuffer[number] > max){ - index = maxIndexesBuffer[number]; - max = maxValuesBuffer[number]; - } else if(maxValuesBuffer[number] == max){ - if (index > 
maxIndexesBuffer[number]) - index = maxIndexesBuffer[number]; - } - } + // Calculate the largest value from the remaining 4 points + _mm_store_ps(maxValuesBuffer, maxValues); + _mm_store_ps(maxIndexesBuffer, maxValuesIndex); + + for (number = 0; number < 4; number++) { + if (maxValuesBuffer[number] > max) { + index = maxIndexesBuffer[number]; + max = maxValuesBuffer[number]; + } else if (maxValuesBuffer[number] == max) { + if (index > maxIndexesBuffer[number]) + index = maxIndexesBuffer[number]; + } + } - number = quarterPoints * 4; - for(;number < num_points; number++){ - if(src0[number] > max){ - index = number; - max = src0[number]; - } + number = quarterPoints * 4; + for (; number < num_points; number++) { + if (src0[number] > max) { + index = number; + max = src0[number]; + } + } + target[0] = (uint32_t)index; } - target[0] = (uint32_t)index; - } } #endif /*LV_HAVE_SSE4_1*/ @@ -135,67 +137,68 @@ volk_32f_index_max_32u_a_sse4_1(uint32_t* target, const float* src0, uint32_t nu #ifdef LV_HAVE_SSE -#include +#include static inline void volk_32f_index_max_32u_a_sse(uint32_t* target, const float* src0, uint32_t num_points) { - if(num_points > 0){ - uint32_t number = 0; - const uint32_t quarterPoints = num_points / 4; + if (num_points > 0) { + uint32_t number = 0; + const uint32_t quarterPoints = num_points / 4; - float* inputPtr = (float*)src0; + float* inputPtr = (float*)src0; - __m128 indexIncrementValues = _mm_set1_ps(4); - __m128 currentIndexes = _mm_set_ps(-1,-2,-3,-4); + __m128 indexIncrementValues = _mm_set1_ps(4); + __m128 currentIndexes = _mm_set_ps(-1, -2, -3, -4); - float max = src0[0]; - float index = 0; - __m128 maxValues = _mm_set1_ps(max); - __m128 maxValuesIndex = _mm_setzero_ps(); - __m128 compareResults; - __m128 currentValues; + float max = src0[0]; + float index = 0; + __m128 maxValues = _mm_set1_ps(max); + __m128 maxValuesIndex = _mm_setzero_ps(); + __m128 compareResults; + __m128 currentValues; - __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4]; - __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4]; + __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4]; + __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4]; - for(;number < quarterPoints; number++){ + for (; number < quarterPoints; number++) { - currentValues = _mm_load_ps(inputPtr); inputPtr += 4; - currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues); + currentValues = _mm_load_ps(inputPtr); + inputPtr += 4; + currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues); - compareResults = _mm_cmpgt_ps(currentValues, maxValues); + compareResults = _mm_cmpgt_ps(currentValues, maxValues); - maxValuesIndex = _mm_or_ps(_mm_and_ps(compareResults, currentIndexes), - _mm_andnot_ps(compareResults, maxValuesIndex)); + maxValuesIndex = _mm_or_ps(_mm_and_ps(compareResults, currentIndexes), + _mm_andnot_ps(compareResults, maxValuesIndex)); - maxValues = _mm_or_ps(_mm_and_ps(compareResults, currentValues), - _mm_andnot_ps(compareResults, maxValues)); - } + maxValues = _mm_or_ps(_mm_and_ps(compareResults, currentValues), + _mm_andnot_ps(compareResults, maxValues)); + } - // Calculate the largest value from the remaining 4 points - _mm_store_ps(maxValuesBuffer, maxValues); - _mm_store_ps(maxIndexesBuffer, maxValuesIndex); - - for(number = 0; number < 4; number++){ - if(maxValuesBuffer[number] > max){ - index = maxIndexesBuffer[number]; - max = maxValuesBuffer[number]; - } else if(maxValuesBuffer[number] == max){ - if (index > maxIndexesBuffer[number]) - index = maxIndexesBuffer[number]; - } - } + // Calculate the largest 
value from the remaining 4 points + _mm_store_ps(maxValuesBuffer, maxValues); + _mm_store_ps(maxIndexesBuffer, maxValuesIndex); + + for (number = 0; number < 4; number++) { + if (maxValuesBuffer[number] > max) { + index = maxIndexesBuffer[number]; + max = maxValuesBuffer[number]; + } else if (maxValuesBuffer[number] == max) { + if (index > maxIndexesBuffer[number]) + index = maxIndexesBuffer[number]; + } + } - number = quarterPoints * 4; - for(;number < num_points; number++){ - if(src0[number] > max){ - index = number; - max = src0[number]; - } + number = quarterPoints * 4; + for (; number < num_points; number++) { + if (src0[number] > max) { + index = number; + max = src0[number]; + } + } + target[0] = (uint32_t)index; } - target[0] = (uint32_t)index; - } } #endif /*LV_HAVE_SSE*/ @@ -204,65 +207,61 @@ volk_32f_index_max_32u_a_sse(uint32_t* target, const float* src0, uint32_t num_p #ifdef LV_HAVE_AVX #include -static inline void volk_32f_index_max_32u_a_avx(uint32_t* target, const float* src0, uint32_t num_points) +static inline void +volk_32f_index_max_32u_a_avx(uint32_t* target, const float* src0, uint32_t num_points) { - if(num_points > 0) - { - uint32_t number = 0; - const uint32_t quarterPoints = num_points / 8; - - float* inputPtr = (float*)src0; - - __m256 indexIncrementValues = _mm256_set1_ps(8); - __m256 currentIndexes = _mm256_set_ps(-1,-2,-3,-4,-5,-6,-7,-8); - - float max = src0[0]; - float index = 0; - __m256 maxValues = _mm256_set1_ps(max); - __m256 maxValuesIndex = _mm256_setzero_ps(); - __m256 compareResults; - __m256 currentValues; - - __VOLK_ATTR_ALIGNED(32) float maxValuesBuffer[8]; - __VOLK_ATTR_ALIGNED(32) float maxIndexesBuffer[8]; - - for(;number < quarterPoints; number++) - { - currentValues = _mm256_load_ps(inputPtr); inputPtr += 8; - currentIndexes = _mm256_add_ps(currentIndexes, indexIncrementValues); - compareResults = _mm256_cmp_ps(currentValues, maxValues, _CMP_GT_OS); - maxValuesIndex = _mm256_blendv_ps(maxValuesIndex, currentIndexes, compareResults); - maxValues = _mm256_blendv_ps(maxValues, currentValues, compareResults); - } - - // Calculate the largest value from the remaining 8 points - _mm256_store_ps(maxValuesBuffer, maxValues); - _mm256_store_ps(maxIndexesBuffer, maxValuesIndex); - - for(number = 0; number < 8; number++) - { - if(maxValuesBuffer[number] > max) - { - index = maxIndexesBuffer[number]; - max = maxValuesBuffer[number]; - } - else if(maxValuesBuffer[number] == max){ - if (index > maxIndexesBuffer[number]) - index = maxIndexesBuffer[number]; - } - } - - number = quarterPoints * 8; - for(;number < num_points; number++) - { - if(src0[number] > max) - { - index = number; - max = src0[number]; - } - } - target[0] = (uint32_t)index; + if (num_points > 0) { + uint32_t number = 0; + const uint32_t quarterPoints = num_points / 8; + + float* inputPtr = (float*)src0; + + __m256 indexIncrementValues = _mm256_set1_ps(8); + __m256 currentIndexes = _mm256_set_ps(-1, -2, -3, -4, -5, -6, -7, -8); + + float max = src0[0]; + float index = 0; + __m256 maxValues = _mm256_set1_ps(max); + __m256 maxValuesIndex = _mm256_setzero_ps(); + __m256 compareResults; + __m256 currentValues; + + __VOLK_ATTR_ALIGNED(32) float maxValuesBuffer[8]; + __VOLK_ATTR_ALIGNED(32) float maxIndexesBuffer[8]; + + for (; number < quarterPoints; number++) { + currentValues = _mm256_load_ps(inputPtr); + inputPtr += 8; + currentIndexes = _mm256_add_ps(currentIndexes, indexIncrementValues); + compareResults = _mm256_cmp_ps(currentValues, maxValues, _CMP_GT_OS); + maxValuesIndex = + 
_mm256_blendv_ps(maxValuesIndex, currentIndexes, compareResults); + maxValues = _mm256_blendv_ps(maxValues, currentValues, compareResults); + } + + // Calculate the largest value from the remaining 8 points + _mm256_store_ps(maxValuesBuffer, maxValues); + _mm256_store_ps(maxIndexesBuffer, maxValuesIndex); + + for (number = 0; number < 8; number++) { + if (maxValuesBuffer[number] > max) { + index = maxIndexesBuffer[number]; + max = maxValuesBuffer[number]; + } else if (maxValuesBuffer[number] == max) { + if (index > maxIndexesBuffer[number]) + index = maxIndexesBuffer[number]; + } + } + + number = quarterPoints * 8; + for (; number < num_points; number++) { + if (src0[number] > max) { + index = number; + max = src0[number]; + } } + target[0] = (uint32_t)index; + } } #endif /*LV_HAVE_AVX*/ @@ -271,66 +270,63 @@ static inline void volk_32f_index_max_32u_a_avx(uint32_t* target, const float* s #ifdef LV_HAVE_NEON #include -static inline void volk_32f_index_max_32u_neon(uint32_t* target, const float* src0, uint32_t num_points) +static inline void +volk_32f_index_max_32u_neon(uint32_t* target, const float* src0, uint32_t num_points) { - if(num_points > 0) - { - uint32_t number = 0; - const uint32_t quarterPoints = num_points / 4; - - float* inputPtr = (float*)src0; - float32x4_t indexIncrementValues = vdupq_n_f32(4); - __VOLK_ATTR_ALIGNED(16) float currentIndexes_float[4] = { -4.0f, -3.0f, -2.0f, -1.0f }; - float32x4_t currentIndexes = vld1q_f32(currentIndexes_float); - - float max = src0[0]; - float index = 0; - float32x4_t maxValues = vdupq_n_f32(max); - uint32x4_t maxValuesIndex = vmovq_n_u32(0); - uint32x4_t compareResults; - uint32x4_t currentIndexes_u; - float32x4_t currentValues; - - __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4]; - __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4]; - - for(;number < quarterPoints; number++) - { - currentValues = vld1q_f32(inputPtr); inputPtr += 4; - currentIndexes = vaddq_f32(currentIndexes, indexIncrementValues); - currentIndexes_u = vcvtq_u32_f32(currentIndexes); - compareResults = vcleq_f32(currentValues, maxValues); - maxValuesIndex = vorrq_u32( vandq_u32( compareResults, maxValuesIndex ), vbicq_u32(currentIndexes_u, compareResults) ); - maxValues = vmaxq_f32(currentValues, maxValues); - } - - // Calculate the largest value from the remaining 4 points - vst1q_f32(maxValuesBuffer, maxValues); - vst1q_f32(maxIndexesBuffer, vcvtq_f32_u32(maxValuesIndex)); - for(number = 0; number < 4; number++) - { - if(maxValuesBuffer[number] > max) - { - index = maxIndexesBuffer[number]; - max = maxValuesBuffer[number]; - } - else if(maxValues[number] == max){ - if (index > maxIndexesBuffer[number]) - index = maxIndexesBuffer[number]; - } - } - - number = quarterPoints * 4; - for(;number < num_points; number++) - { - if(src0[number] > max) - { - index = number; - max = src0[number]; - } - } - target[0] = (uint32_t)index; + if (num_points > 0) { + uint32_t number = 0; + const uint32_t quarterPoints = num_points / 4; + + float* inputPtr = (float*)src0; + float32x4_t indexIncrementValues = vdupq_n_f32(4); + __VOLK_ATTR_ALIGNED(16) + float currentIndexes_float[4] = { -4.0f, -3.0f, -2.0f, -1.0f }; + float32x4_t currentIndexes = vld1q_f32(currentIndexes_float); + + float max = src0[0]; + float index = 0; + float32x4_t maxValues = vdupq_n_f32(max); + uint32x4_t maxValuesIndex = vmovq_n_u32(0); + uint32x4_t compareResults; + uint32x4_t currentIndexes_u; + float32x4_t currentValues; + + __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4]; + __VOLK_ATTR_ALIGNED(16) float 
maxIndexesBuffer[4]; + + for (; number < quarterPoints; number++) { + currentValues = vld1q_f32(inputPtr); + inputPtr += 4; + currentIndexes = vaddq_f32(currentIndexes, indexIncrementValues); + currentIndexes_u = vcvtq_u32_f32(currentIndexes); + compareResults = vcleq_f32(currentValues, maxValues); + maxValuesIndex = vorrq_u32(vandq_u32(compareResults, maxValuesIndex), + vbicq_u32(currentIndexes_u, compareResults)); + maxValues = vmaxq_f32(currentValues, maxValues); + } + + // Calculate the largest value from the remaining 4 points + vst1q_f32(maxValuesBuffer, maxValues); + vst1q_f32(maxIndexesBuffer, vcvtq_f32_u32(maxValuesIndex)); + for (number = 0; number < 4; number++) { + if (maxValuesBuffer[number] > max) { + index = maxIndexesBuffer[number]; + max = maxValuesBuffer[number]; + } else if (maxValues[number] == max) { + if (index > maxIndexesBuffer[number]) + index = maxIndexesBuffer[number]; + } + } + + number = quarterPoints * 4; + for (; number < num_points; number++) { + if (src0[number] > max) { + index = number; + max = src0[number]; + } } + target[0] = (uint32_t)index; + } } #endif /*LV_HAVE_NEON*/ @@ -341,20 +337,20 @@ static inline void volk_32f_index_max_32u_neon(uint32_t* target, const float* sr static inline void volk_32f_index_max_32u_generic(uint32_t* target, const float* src0, uint32_t num_points) { - if(num_points > 0){ - float max = src0[0]; - uint32_t index = 0; + if (num_points > 0) { + float max = src0[0]; + uint32_t index = 0; - uint32_t i = 1; + uint32_t i = 1; - for(; i < num_points; ++i) { - if(src0[i] > max){ - index = i; - max = src0[i]; - } + for (; i < num_points; ++i) { + if (src0[i] > max) { + index = i; + max = src0[i]; + } + } + target[0] = index; } - target[0] = index; - } } #endif /*LV_HAVE_GENERIC*/ @@ -366,209 +362,195 @@ volk_32f_index_max_32u_generic(uint32_t* target, const float* src0, uint32_t num #ifndef INCLUDED_volk_32f_index_max_32u_u_H #define INCLUDED_volk_32f_index_max_32u_u_H -#include -#include #include #include +#include #ifdef LV_HAVE_AVX #include -static inline void volk_32f_index_max_32u_u_avx(uint32_t* target, const float* src0, uint32_t num_points) +static inline void +volk_32f_index_max_32u_u_avx(uint32_t* target, const float* src0, uint32_t num_points) { - if(num_points > 0) - { - uint32_t number = 0; - const uint32_t quarterPoints = num_points / 8; - - float* inputPtr = (float*)src0; - - __m256 indexIncrementValues = _mm256_set1_ps(8); - __m256 currentIndexes = _mm256_set_ps(-1,-2,-3,-4,-5,-6,-7,-8); - - float max = src0[0]; - float index = 0; - __m256 maxValues = _mm256_set1_ps(max); - __m256 maxValuesIndex = _mm256_setzero_ps(); - __m256 compareResults; - __m256 currentValues; - - __VOLK_ATTR_ALIGNED(32) float maxValuesBuffer[8]; - __VOLK_ATTR_ALIGNED(32) float maxIndexesBuffer[8]; - - for(;number < quarterPoints; number++) - { - currentValues = _mm256_loadu_ps(inputPtr); inputPtr += 8; - currentIndexes = _mm256_add_ps(currentIndexes, indexIncrementValues); - compareResults = _mm256_cmp_ps(currentValues, maxValues, _CMP_GT_OS); - maxValuesIndex = _mm256_blendv_ps(maxValuesIndex, currentIndexes, compareResults); - maxValues = _mm256_blendv_ps(maxValues, currentValues, compareResults); - } - - // Calculate the largest value from the remaining 8 points - _mm256_store_ps(maxValuesBuffer, maxValues); - _mm256_store_ps(maxIndexesBuffer, maxValuesIndex); - - for(number = 0; number < 8; number++) - { - if(maxValuesBuffer[number] > max) - { - index = maxIndexesBuffer[number]; - max = maxValuesBuffer[number]; - } - else 
if(maxValuesBuffer[number] == max){ - if (index > maxIndexesBuffer[number]) - index = maxIndexesBuffer[number]; - } - } - - number = quarterPoints * 8; - for(;number < num_points; number++) - { - if(src0[number] > max) - { - index = number; - max = src0[number]; - } - } - target[0] = (uint32_t)index; + if (num_points > 0) { + uint32_t number = 0; + const uint32_t quarterPoints = num_points / 8; + + float* inputPtr = (float*)src0; + + __m256 indexIncrementValues = _mm256_set1_ps(8); + __m256 currentIndexes = _mm256_set_ps(-1, -2, -3, -4, -5, -6, -7, -8); + + float max = src0[0]; + float index = 0; + __m256 maxValues = _mm256_set1_ps(max); + __m256 maxValuesIndex = _mm256_setzero_ps(); + __m256 compareResults; + __m256 currentValues; + + __VOLK_ATTR_ALIGNED(32) float maxValuesBuffer[8]; + __VOLK_ATTR_ALIGNED(32) float maxIndexesBuffer[8]; + + for (; number < quarterPoints; number++) { + currentValues = _mm256_loadu_ps(inputPtr); + inputPtr += 8; + currentIndexes = _mm256_add_ps(currentIndexes, indexIncrementValues); + compareResults = _mm256_cmp_ps(currentValues, maxValues, _CMP_GT_OS); + maxValuesIndex = + _mm256_blendv_ps(maxValuesIndex, currentIndexes, compareResults); + maxValues = _mm256_blendv_ps(maxValues, currentValues, compareResults); } + + // Calculate the largest value from the remaining 8 points + _mm256_store_ps(maxValuesBuffer, maxValues); + _mm256_store_ps(maxIndexesBuffer, maxValuesIndex); + + for (number = 0; number < 8; number++) { + if (maxValuesBuffer[number] > max) { + index = maxIndexesBuffer[number]; + max = maxValuesBuffer[number]; + } else if (maxValuesBuffer[number] == max) { + if (index > maxIndexesBuffer[number]) + index = maxIndexesBuffer[number]; + } + } + + number = quarterPoints * 8; + for (; number < num_points; number++) { + if (src0[number] > max) { + index = number; + max = src0[number]; + } + } + target[0] = (uint32_t)index; + } } #endif /*LV_HAVE_AVX*/ #ifdef LV_HAVE_SSE4_1 -#include +#include -static inline void volk_32f_index_max_32u_u_sse4_1(uint32_t* target, const float* src0, uint32_t num_points) +static inline void +volk_32f_index_max_32u_u_sse4_1(uint32_t* target, const float* src0, uint32_t num_points) { - if(num_points > 0) - { - uint32_t number = 0; - const uint32_t quarterPoints = num_points / 4; - - float* inputPtr = (float*)src0; - - __m128 indexIncrementValues = _mm_set1_ps(4); - __m128 currentIndexes = _mm_set_ps(-1,-2,-3,-4); - - float max = src0[0]; - float index = 0; - __m128 maxValues = _mm_set1_ps(max); - __m128 maxValuesIndex = _mm_setzero_ps(); - __m128 compareResults; - __m128 currentValues; - - __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4]; - __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4]; - - for(;number < quarterPoints; number++) - { - currentValues = _mm_loadu_ps(inputPtr); inputPtr += 4; - currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues); - compareResults = _mm_cmpgt_ps(currentValues, maxValues); - maxValuesIndex = _mm_blendv_ps(maxValuesIndex, currentIndexes, compareResults); - maxValues = _mm_blendv_ps(maxValues, currentValues, compareResults); - } - - // Calculate the largest value from the remaining 4 points - _mm_store_ps(maxValuesBuffer, maxValues); - _mm_store_ps(maxIndexesBuffer, maxValuesIndex); - - for(number = 0; number < 4; number++) - { - if(maxValuesBuffer[number] > max) - { - index = maxIndexesBuffer[number]; - max = maxValuesBuffer[number]; - } - else if(maxValuesBuffer[number] == max){ - if (index > maxIndexesBuffer[number]) - index = maxIndexesBuffer[number]; - } - } - - number = 
quarterPoints * 4; - for(;number < num_points; number++) - { - if(src0[number] > max) - { - index = number; - max = src0[number]; - } - } - target[0] = (uint32_t)index; + if (num_points > 0) { + uint32_t number = 0; + const uint32_t quarterPoints = num_points / 4; + + float* inputPtr = (float*)src0; + + __m128 indexIncrementValues = _mm_set1_ps(4); + __m128 currentIndexes = _mm_set_ps(-1, -2, -3, -4); + + float max = src0[0]; + float index = 0; + __m128 maxValues = _mm_set1_ps(max); + __m128 maxValuesIndex = _mm_setzero_ps(); + __m128 compareResults; + __m128 currentValues; + + __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4]; + __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4]; + + for (; number < quarterPoints; number++) { + currentValues = _mm_loadu_ps(inputPtr); + inputPtr += 4; + currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues); + compareResults = _mm_cmpgt_ps(currentValues, maxValues); + maxValuesIndex = + _mm_blendv_ps(maxValuesIndex, currentIndexes, compareResults); + maxValues = _mm_blendv_ps(maxValues, currentValues, compareResults); } + + // Calculate the largest value from the remaining 4 points + _mm_store_ps(maxValuesBuffer, maxValues); + _mm_store_ps(maxIndexesBuffer, maxValuesIndex); + + for (number = 0; number < 4; number++) { + if (maxValuesBuffer[number] > max) { + index = maxIndexesBuffer[number]; + max = maxValuesBuffer[number]; + } else if (maxValuesBuffer[number] == max) { + if (index > maxIndexesBuffer[number]) + index = maxIndexesBuffer[number]; + } + } + + number = quarterPoints * 4; + for (; number < num_points; number++) { + if (src0[number] > max) { + index = number; + max = src0[number]; + } + } + target[0] = (uint32_t)index; + } } #endif /*LV_HAVE_SSE4_1*/ #ifdef LV_HAVE_SSE -#include +#include -static inline void volk_32f_index_max_32u_u_sse(uint32_t* target, const float* src0, uint32_t num_points) +static inline void +volk_32f_index_max_32u_u_sse(uint32_t* target, const float* src0, uint32_t num_points) { - if(num_points > 0) - { - uint32_t number = 0; - const uint32_t quarterPoints = num_points / 4; - - float* inputPtr = (float*)src0; - - __m128 indexIncrementValues = _mm_set1_ps(4); - __m128 currentIndexes = _mm_set_ps(-1,-2,-3,-4); - - float max = src0[0]; - float index = 0; - __m128 maxValues = _mm_set1_ps(max); - __m128 maxValuesIndex = _mm_setzero_ps(); - __m128 compareResults; - __m128 currentValues; - - __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4]; - __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4]; - - for(;number < quarterPoints; number++) - { - currentValues = _mm_loadu_ps(inputPtr); inputPtr += 4; - currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues); - compareResults = _mm_cmpgt_ps(currentValues, maxValues); - maxValuesIndex = _mm_or_ps(_mm_and_ps(compareResults, currentIndexes), - _mm_andnot_ps(compareResults, maxValuesIndex)); - maxValues = _mm_or_ps(_mm_and_ps(compareResults, currentValues), - _mm_andnot_ps(compareResults, maxValues)); - } - - // Calculate the largest value from the remaining 4 points - _mm_store_ps(maxValuesBuffer, maxValues); - _mm_store_ps(maxIndexesBuffer, maxValuesIndex); - - for(number = 0; number < 4; number++) - { - if(maxValuesBuffer[number] > max) - { - index = maxIndexesBuffer[number]; - max = maxValuesBuffer[number]; - } - else if(maxValuesBuffer[number] == max){ - if (index > maxIndexesBuffer[number]) - index = maxIndexesBuffer[number]; - } - } - - number = quarterPoints * 4; - for(;number < num_points; number++) - { - if(src0[number] > max) - { - index = number; - max = 
src0[number]; - } - } - target[0] = (uint32_t)index; + if (num_points > 0) { + uint32_t number = 0; + const uint32_t quarterPoints = num_points / 4; + + float* inputPtr = (float*)src0; + + __m128 indexIncrementValues = _mm_set1_ps(4); + __m128 currentIndexes = _mm_set_ps(-1, -2, -3, -4); + + float max = src0[0]; + float index = 0; + __m128 maxValues = _mm_set1_ps(max); + __m128 maxValuesIndex = _mm_setzero_ps(); + __m128 compareResults; + __m128 currentValues; + + __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4]; + __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4]; + + for (; number < quarterPoints; number++) { + currentValues = _mm_loadu_ps(inputPtr); + inputPtr += 4; + currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues); + compareResults = _mm_cmpgt_ps(currentValues, maxValues); + maxValuesIndex = _mm_or_ps(_mm_and_ps(compareResults, currentIndexes), + _mm_andnot_ps(compareResults, maxValuesIndex)); + maxValues = _mm_or_ps(_mm_and_ps(compareResults, currentValues), + _mm_andnot_ps(compareResults, maxValues)); } + + // Calculate the largest value from the remaining 4 points + _mm_store_ps(maxValuesBuffer, maxValues); + _mm_store_ps(maxIndexesBuffer, maxValuesIndex); + + for (number = 0; number < 4; number++) { + if (maxValuesBuffer[number] > max) { + index = maxIndexesBuffer[number]; + max = maxValuesBuffer[number]; + } else if (maxValuesBuffer[number] == max) { + if (index > maxIndexesBuffer[number]) + index = maxIndexesBuffer[number]; + } + } + + number = quarterPoints * 4; + for (; number < num_points; number++) { + if (src0[number] > max) { + index = number; + max = src0[number]; + } + } + target[0] = (uint32_t)index; + } } #endif /*LV_HAVE_SSE*/ diff --git a/kernels/volk/volk_32f_invsqrt_32f.h b/kernels/volk/volk_32f_invsqrt_32f.h index e416321..e545515 100644 --- a/kernels/volk/volk_32f_invsqrt_32f.h +++ b/kernels/volk/volk_32f_invsqrt_32f.h @@ -30,8 +30,8 @@ * * Dispatcher Prototype * \code - * void volk_32f_invsqrt_32f(float* cVector, const float* aVector, unsigned int num_points) - * \endcode + * void volk_32f_invsqrt_32f(float* cVector, const float* aVector, unsigned int + * num_points) \endcode * * \b Inputs * \li aVector: the input vector of floats. @@ -66,27 +66,27 @@ #define INCLUDED_volk_32f_invsqrt_32f_a_H #include -#include #include +#include #include -static inline float -Q_rsqrt(float number) +static inline float Q_rsqrt(float number) { - float x2; - const float threehalfs = 1.5F; - union f32_to_i32 { - int32_t i; - float f; - } u; - - x2 = number * 0.5F; - u.f = number; - u.i = 0x5f3759df - ( u.i >> 1 ); // what the fuck? - u.f = u.f * ( threehalfs - ( x2 * u.f * u.f ) ); // 1st iteration - //u.f = u.f * ( threehalfs - ( x2 * u.f * u.f ) ); // 2nd iteration, this can be removed - - return u.f; + float x2; + const float threehalfs = 1.5F; + union f32_to_i32 { + int32_t i; + float f; + } u; + + x2 = number * 0.5F; + u.f = number; + u.i = 0x5f3759df - (u.i >> 1); // what the fuck? 
+ u.f = u.f * (threehalfs - (x2 * u.f * u.f)); // 1st iteration + // u.f = u.f * ( threehalfs - ( x2 * u.f * u.f ) ); // 2nd iteration, this can be + // removed + + return u.f; } #ifdef LV_HAVE_AVX @@ -95,24 +95,23 @@ Q_rsqrt(float number) static inline void volk_32f_invsqrt_32f_a_avx(float* cVector, const float* aVector, unsigned int num_points) { - unsigned int number = 0; - const unsigned int eighthPoints = num_points / 8; - - float* cPtr = cVector; - const float* aPtr = aVector; - __m256 aVal, cVal; - for (; number < eighthPoints; number++) { - aVal = _mm256_load_ps(aPtr); - cVal = _mm256_rsqrt_ps(aVal); - _mm256_store_ps(cPtr, cVal); - aPtr += 8; - cPtr += 8; - } - - number = eighthPoints * 8; - for(;number < num_points; number++) - *cPtr++ = Q_rsqrt(*aPtr++); - + unsigned int number = 0; + const unsigned int eighthPoints = num_points / 8; + + float* cPtr = cVector; + const float* aPtr = aVector; + __m256 aVal, cVal; + for (; number < eighthPoints; number++) { + aVal = _mm256_load_ps(aPtr); + cVal = _mm256_rsqrt_ps(aVal); + _mm256_store_ps(cPtr, cVal); + aPtr += 8; + cPtr += 8; + } + + number = eighthPoints * 8; + for (; number < num_points; number++) + *cPtr++ = Q_rsqrt(*aPtr++); } #endif /* LV_HAVE_AVX */ @@ -123,29 +122,29 @@ volk_32f_invsqrt_32f_a_avx(float* cVector, const float* aVector, unsigned int nu static inline void volk_32f_invsqrt_32f_a_sse(float* cVector, const float* aVector, unsigned int num_points) { - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; - float* cPtr = cVector; - const float* aPtr = aVector; + float* cPtr = cVector; + const float* aPtr = aVector; - __m128 aVal, cVal; - for(;number < quarterPoints; number++){ + __m128 aVal, cVal; + for (; number < quarterPoints; number++) { - aVal = _mm_load_ps(aPtr); + aVal = _mm_load_ps(aPtr); - cVal = _mm_rsqrt_ps(aVal); + cVal = _mm_rsqrt_ps(aVal); - _mm_store_ps(cPtr,cVal); // Store the results back into the C container + _mm_store_ps(cPtr, cVal); // Store the results back into the C container - aPtr += 4; - cPtr += 4; - } + aPtr += 4; + cPtr += 4; + } - number = quarterPoints * 4; - for(;number < num_points; number++) { - *cPtr++ = Q_rsqrt(*aPtr++); - } + number = quarterPoints * 4; + for (; number < num_points; number++) { + *cPtr++ = Q_rsqrt(*aPtr++); + } } #endif /* LV_HAVE_SSE */ @@ -156,37 +155,38 @@ volk_32f_invsqrt_32f_a_sse(float* cVector, const float* aVector, unsigned int nu static inline void volk_32f_invsqrt_32f_neon(float* cVector, const float* aVector, unsigned int num_points) { - unsigned int number; - const unsigned int quarter_points = num_points / 4; - - float* cPtr = cVector; - const float* aPtr = aVector; - float32x4_t a_val, c_val; - for (number = 0; number < quarter_points; ++number) { - a_val = vld1q_f32(aPtr); - c_val = vrsqrteq_f32(a_val); - vst1q_f32(cPtr, c_val); - aPtr += 4; - cPtr += 4; - } - - for(number=quarter_points * 4;number < num_points; number++) - *cPtr++ = Q_rsqrt(*aPtr++); + unsigned int number; + const unsigned int quarter_points = num_points / 4; + + float* cPtr = cVector; + const float* aPtr = aVector; + float32x4_t a_val, c_val; + for (number = 0; number < quarter_points; ++number) { + a_val = vld1q_f32(aPtr); + c_val = vrsqrteq_f32(a_val); + vst1q_f32(cPtr, c_val); + aPtr += 4; + cPtr += 4; + } + + for (number = quarter_points * 4; number < num_points; number++) + *cPtr++ = Q_rsqrt(*aPtr++); } #endif /* LV_HAVE_NEON */ #ifdef LV_HAVE_GENERIC -static inline void 
-volk_32f_invsqrt_32f_generic(float* cVector, const float* aVector, unsigned int num_points) +static inline void volk_32f_invsqrt_32f_generic(float* cVector, + const float* aVector, + unsigned int num_points) { - float* cPtr = cVector; - const float* aPtr = aVector; - unsigned int number = 0; - for(number = 0; number < num_points; number++) { - *cPtr++ = Q_rsqrt(*aPtr++); - } + float* cPtr = cVector; + const float* aPtr = aVector; + unsigned int number = 0; + for (number = 0; number < num_points; number++) { + *cPtr++ = Q_rsqrt(*aPtr++); + } } #endif /* LV_HAVE_GENERIC */ @@ -196,24 +196,23 @@ volk_32f_invsqrt_32f_generic(float* cVector, const float* aVector, unsigned int static inline void volk_32f_invsqrt_32f_u_avx(float* cVector, const float* aVector, unsigned int num_points) { - unsigned int number = 0; - const unsigned int eighthPoints = num_points / 8; - - float* cPtr = cVector; - const float* aPtr = aVector; - __m256 aVal, cVal; - for (; number < eighthPoints; number++) { - aVal = _mm256_loadu_ps(aPtr); - cVal = _mm256_rsqrt_ps(aVal); - _mm256_storeu_ps(cPtr, cVal); - aPtr += 8; - cPtr += 8; - } - - number = eighthPoints * 8; - for(;number < num_points; number++) - *cPtr++ = Q_rsqrt(*aPtr++); - + unsigned int number = 0; + const unsigned int eighthPoints = num_points / 8; + + float* cPtr = cVector; + const float* aPtr = aVector; + __m256 aVal, cVal; + for (; number < eighthPoints; number++) { + aVal = _mm256_loadu_ps(aPtr); + cVal = _mm256_rsqrt_ps(aVal); + _mm256_storeu_ps(cPtr, cVal); + aPtr += 8; + cPtr += 8; + } + + number = eighthPoints * 8; + for (; number < num_points; number++) + *cPtr++ = Q_rsqrt(*aPtr++); } #endif /* LV_HAVE_AVX */ diff --git a/kernels/volk/volk_32f_log2_32f.h b/kernels/volk/volk_32f_log2_32f.h index 740f89d..47276d4 100644 --- a/kernels/volk/volk_32f_log2_32f.h +++ b/kernels/volk/volk_32f_log2_32f.h @@ -92,17 +92,18 @@ #ifndef INCLUDED_volk_32f_log2_32f_a_H #define INCLUDED_volk_32f_log2_32f_a_H -#include -#include #include #include +#include +#include #define LOG_POLY_DEGREE 6 // +-Inf -> +-127.0f in order to match the behaviour of the SIMD kernels -static inline float log2f_non_ieee(float f) { - float const result = log2f(f); - return isinf(result) ? copysignf(127.0f, result) : result; +static inline float log2f_non_ieee(float f) +{ + float const result = log2f(f); + return isinf(result) ? 
copysignf(127.0f, result) : result; } #ifdef LV_HAVE_GENERIC @@ -110,12 +111,12 @@ static inline float log2f_non_ieee(float f) { static inline void volk_32f_log2_32f_generic(float* bVector, const float* aVector, unsigned int num_points) { - float* bPtr = bVector; - const float* aPtr = aVector; - unsigned int number = 0; + float* bPtr = bVector; + const float* aPtr = aVector; + unsigned int number = 0; - for(number = 0; number < num_points; number++) - *bPtr++ = log2f_non_ieee(*aPtr++); + for (number = 0; number < num_points; number++) + *bPtr++ = log2f_non_ieee(*aPtr++); } #endif /* LV_HAVE_GENERIC */ @@ -123,56 +124,86 @@ volk_32f_log2_32f_generic(float* bVector, const float* aVector, unsigned int num #include #define POLY0_FMAAVX2(x, c0) _mm256_set1_ps(c0) -#define POLY1_FMAAVX2(x, c0, c1) _mm256_fmadd_ps(POLY0_FMAAVX2(x, c1), x, _mm256_set1_ps(c0)) -#define POLY2_FMAAVX2(x, c0, c1, c2) _mm256_fmadd_ps(POLY1_FMAAVX2(x, c1, c2), x, _mm256_set1_ps(c0)) -#define POLY3_FMAAVX2(x, c0, c1, c2, c3) _mm256_fmadd_ps(POLY2_FMAAVX2(x, c1, c2, c3), x, _mm256_set1_ps(c0)) -#define POLY4_FMAAVX2(x, c0, c1, c2, c3, c4) _mm256_fmadd_ps(POLY3_FMAAVX2(x, c1, c2, c3, c4), x, _mm256_set1_ps(c0)) -#define POLY5_FMAAVX2(x, c0, c1, c2, c3, c4, c5) _mm256_fmadd_ps(POLY4_FMAAVX2(x, c1, c2, c3, c4, c5), x, _mm256_set1_ps(c0)) - -static inline void -volk_32f_log2_32f_a_avx2_fma(float* bVector, const float* aVector, unsigned int num_points) +#define POLY1_FMAAVX2(x, c0, c1) \ + _mm256_fmadd_ps(POLY0_FMAAVX2(x, c1), x, _mm256_set1_ps(c0)) +#define POLY2_FMAAVX2(x, c0, c1, c2) \ + _mm256_fmadd_ps(POLY1_FMAAVX2(x, c1, c2), x, _mm256_set1_ps(c0)) +#define POLY3_FMAAVX2(x, c0, c1, c2, c3) \ + _mm256_fmadd_ps(POLY2_FMAAVX2(x, c1, c2, c3), x, _mm256_set1_ps(c0)) +#define POLY4_FMAAVX2(x, c0, c1, c2, c3, c4) \ + _mm256_fmadd_ps(POLY3_FMAAVX2(x, c1, c2, c3, c4), x, _mm256_set1_ps(c0)) +#define POLY5_FMAAVX2(x, c0, c1, c2, c3, c4, c5) \ + _mm256_fmadd_ps(POLY4_FMAAVX2(x, c1, c2, c3, c4, c5), x, _mm256_set1_ps(c0)) + +static inline void volk_32f_log2_32f_a_avx2_fma(float* bVector, + const float* aVector, + unsigned int num_points) { - float* bPtr = bVector; - const float* aPtr = aVector; + float* bPtr = bVector; + const float* aPtr = aVector; - unsigned int number = 0; - const unsigned int eighthPoints = num_points / 8; + unsigned int number = 0; + const unsigned int eighthPoints = num_points / 8; - __m256 aVal, bVal, mantissa, frac, leadingOne; - __m256i bias, exp; + __m256 aVal, bVal, mantissa, frac, leadingOne; + __m256i bias, exp; - for(;number < eighthPoints; number++){ + for (; number < eighthPoints; number++) { - aVal = _mm256_load_ps(aPtr); - bias = _mm256_set1_epi32(127); - leadingOne = _mm256_set1_ps(1.0f); - exp = _mm256_sub_epi32(_mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal), _mm256_set1_epi32(0x7f800000)), 23), bias); - bVal = _mm256_cvtepi32_ps(exp); + aVal = _mm256_load_ps(aPtr); + bias = _mm256_set1_epi32(127); + leadingOne = _mm256_set1_ps(1.0f); + exp = _mm256_sub_epi32( + _mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal), + _mm256_set1_epi32(0x7f800000)), + 23), + bias); + bVal = _mm256_cvtepi32_ps(exp); - // Now to extract mantissa - frac = _mm256_or_ps(leadingOne, _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff)))); + // Now to extract mantissa + frac = _mm256_or_ps( + leadingOne, + _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff)))); #if LOG_POLY_DEGREE == 6 - mantissa = POLY5_FMAAVX2( frac, 3.1157899f, -3.3241990f, 2.5988452f, -1.2315303f, 
3.1821337e-1f, -3.4436006e-2f); + mantissa = POLY5_FMAAVX2(frac, + 3.1157899f, + -3.3241990f, + 2.5988452f, + -1.2315303f, + 3.1821337e-1f, + -3.4436006e-2f); #elif LOG_POLY_DEGREE == 5 - mantissa = POLY4_FMAAVX2( frac, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f); + mantissa = POLY4_FMAAVX2(frac, + 2.8882704548164776201f, + -2.52074962577807006663f, + 1.48116647521213171641f, + -0.465725644288844778798f, + 0.0596515482674574969533f); #elif LOG_POLY_DEGREE == 4 - mantissa = POLY3_FMAAVX2( frac, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f); + mantissa = POLY3_FMAAVX2(frac, + 2.61761038894603480148f, + -1.75647175389045657003f, + 0.688243882994381274313f, + -0.107254423828329604454f); #elif LOG_POLY_DEGREE == 3 - mantissa = POLY2_FMAAVX2( frac, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f); + mantissa = POLY2_FMAAVX2(frac, + 2.28330284476918490682f, + -1.04913055217340124191f, + 0.204446009836232697516f); #else #error #endif - bVal = _mm256_fmadd_ps(mantissa, _mm256_sub_ps(frac, leadingOne), bVal); - _mm256_store_ps(bPtr, bVal); + bVal = _mm256_fmadd_ps(mantissa, _mm256_sub_ps(frac, leadingOne), bVal); + _mm256_store_ps(bPtr, bVal); - aPtr += 8; - bPtr += 8; - } + aPtr += 8; + bPtr += 8; + } - number = eighthPoints * 8; - volk_32f_log2_32f_generic(bPtr, aPtr, num_points-number); + number = eighthPoints * 8; + volk_32f_log2_32f_generic(bPtr, aPtr, num_points - number); } #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for aligned */ @@ -181,56 +212,86 @@ volk_32f_log2_32f_a_avx2_fma(float* bVector, const float* aVector, unsigned int #include #define POLY0_AVX2(x, c0) _mm256_set1_ps(c0) -#define POLY1_AVX2(x, c0, c1) _mm256_add_ps(_mm256_mul_ps(POLY0_AVX2(x, c1), x), _mm256_set1_ps(c0)) -#define POLY2_AVX2(x, c0, c1, c2) _mm256_add_ps(_mm256_mul_ps(POLY1_AVX2(x, c1, c2), x), _mm256_set1_ps(c0)) -#define POLY3_AVX2(x, c0, c1, c2, c3) _mm256_add_ps(_mm256_mul_ps(POLY2_AVX2(x, c1, c2, c3), x), _mm256_set1_ps(c0)) -#define POLY4_AVX2(x, c0, c1, c2, c3, c4) _mm256_add_ps(_mm256_mul_ps(POLY3_AVX2(x, c1, c2, c3, c4), x), _mm256_set1_ps(c0)) -#define POLY5_AVX2(x, c0, c1, c2, c3, c4, c5) _mm256_add_ps(_mm256_mul_ps(POLY4_AVX2(x, c1, c2, c3, c4, c5), x), _mm256_set1_ps(c0)) +#define POLY1_AVX2(x, c0, c1) \ + _mm256_add_ps(_mm256_mul_ps(POLY0_AVX2(x, c1), x), _mm256_set1_ps(c0)) +#define POLY2_AVX2(x, c0, c1, c2) \ + _mm256_add_ps(_mm256_mul_ps(POLY1_AVX2(x, c1, c2), x), _mm256_set1_ps(c0)) +#define POLY3_AVX2(x, c0, c1, c2, c3) \ + _mm256_add_ps(_mm256_mul_ps(POLY2_AVX2(x, c1, c2, c3), x), _mm256_set1_ps(c0)) +#define POLY4_AVX2(x, c0, c1, c2, c3, c4) \ + _mm256_add_ps(_mm256_mul_ps(POLY3_AVX2(x, c1, c2, c3, c4), x), _mm256_set1_ps(c0)) +#define POLY5_AVX2(x, c0, c1, c2, c3, c4, c5) \ + _mm256_add_ps(_mm256_mul_ps(POLY4_AVX2(x, c1, c2, c3, c4, c5), x), _mm256_set1_ps(c0)) static inline void volk_32f_log2_32f_a_avx2(float* bVector, const float* aVector, unsigned int num_points) { - float* bPtr = bVector; - const float* aPtr = aVector; + float* bPtr = bVector; + const float* aPtr = aVector; - unsigned int number = 0; - const unsigned int eighthPoints = num_points / 8; + unsigned int number = 0; + const unsigned int eighthPoints = num_points / 8; - __m256 aVal, bVal, mantissa, frac, leadingOne; - __m256i bias, exp; + __m256 aVal, bVal, mantissa, frac, leadingOne; + __m256i bias, exp; - for(;number < eighthPoints; number++){ + for (; number < 
eighthPoints; number++) { - aVal = _mm256_load_ps(aPtr); - bias = _mm256_set1_epi32(127); - leadingOne = _mm256_set1_ps(1.0f); - exp = _mm256_sub_epi32(_mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal), _mm256_set1_epi32(0x7f800000)), 23), bias); - bVal = _mm256_cvtepi32_ps(exp); + aVal = _mm256_load_ps(aPtr); + bias = _mm256_set1_epi32(127); + leadingOne = _mm256_set1_ps(1.0f); + exp = _mm256_sub_epi32( + _mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal), + _mm256_set1_epi32(0x7f800000)), + 23), + bias); + bVal = _mm256_cvtepi32_ps(exp); - // Now to extract mantissa - frac = _mm256_or_ps(leadingOne, _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff)))); + // Now to extract mantissa + frac = _mm256_or_ps( + leadingOne, + _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff)))); #if LOG_POLY_DEGREE == 6 - mantissa = POLY5_AVX2( frac, 3.1157899f, -3.3241990f, 2.5988452f, -1.2315303f, 3.1821337e-1f, -3.4436006e-2f); + mantissa = POLY5_AVX2(frac, + 3.1157899f, + -3.3241990f, + 2.5988452f, + -1.2315303f, + 3.1821337e-1f, + -3.4436006e-2f); #elif LOG_POLY_DEGREE == 5 - mantissa = POLY4_AVX2( frac, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f); + mantissa = POLY4_AVX2(frac, + 2.8882704548164776201f, + -2.52074962577807006663f, + 1.48116647521213171641f, + -0.465725644288844778798f, + 0.0596515482674574969533f); #elif LOG_POLY_DEGREE == 4 - mantissa = POLY3_AVX2( frac, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f); + mantissa = POLY3_AVX2(frac, + 2.61761038894603480148f, + -1.75647175389045657003f, + 0.688243882994381274313f, + -0.107254423828329604454f); #elif LOG_POLY_DEGREE == 3 - mantissa = POLY2_AVX2( frac, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f); + mantissa = POLY2_AVX2(frac, + 2.28330284476918490682f, + -1.04913055217340124191f, + 0.204446009836232697516f); #else #error #endif - bVal = _mm256_add_ps(_mm256_mul_ps(mantissa, _mm256_sub_ps(frac, leadingOne)), bVal); - _mm256_store_ps(bPtr, bVal); + bVal = + _mm256_add_ps(_mm256_mul_ps(mantissa, _mm256_sub_ps(frac, leadingOne)), bVal); + _mm256_store_ps(bPtr, bVal); - aPtr += 8; - bPtr += 8; - } + aPtr += 8; + bPtr += 8; + } - number = eighthPoints * 8; - volk_32f_log2_32f_generic(bPtr, aPtr, num_points-number); + number = eighthPoints * 8; + volk_32f_log2_32f_generic(bPtr, aPtr, num_points - number); } #endif /* LV_HAVE_AVX2 for aligned */ @@ -241,54 +302,79 @@ volk_32f_log2_32f_a_avx2(float* bVector, const float* aVector, unsigned int num_ #define POLY0(x, c0) _mm_set1_ps(c0) #define POLY1(x, c0, c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), _mm_set1_ps(c0)) #define POLY2(x, c0, c1, c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), _mm_set1_ps(c0)) -#define POLY3(x, c0, c1, c2, c3) _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0)) -#define POLY4(x, c0, c1, c2, c3, c4) _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0)) -#define POLY5(x, c0, c1, c2, c3, c4, c5) _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0)) +#define POLY3(x, c0, c1, c2, c3) \ + _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0)) +#define POLY4(x, c0, c1, c2, c3, c4) \ + _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0)) +#define POLY5(x, c0, c1, c2, c3, c4, c5) \ + _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0)) static inline void 
volk_32f_log2_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int num_points) { - float* bPtr = bVector; - const float* aPtr = aVector; + float* bPtr = bVector; + const float* aPtr = aVector; - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; - __m128 aVal, bVal, mantissa, frac, leadingOne; - __m128i bias, exp; + __m128 aVal, bVal, mantissa, frac, leadingOne; + __m128i bias, exp; - for(;number < quarterPoints; number++){ + for (; number < quarterPoints; number++) { - aVal = _mm_load_ps(aPtr); - bias = _mm_set1_epi32(127); - leadingOne = _mm_set1_ps(1.0f); - exp = _mm_sub_epi32(_mm_srli_epi32(_mm_and_si128(_mm_castps_si128(aVal), _mm_set1_epi32(0x7f800000)), 23), bias); - bVal = _mm_cvtepi32_ps(exp); + aVal = _mm_load_ps(aPtr); + bias = _mm_set1_epi32(127); + leadingOne = _mm_set1_ps(1.0f); + exp = _mm_sub_epi32( + _mm_srli_epi32( + _mm_and_si128(_mm_castps_si128(aVal), _mm_set1_epi32(0x7f800000)), 23), + bias); + bVal = _mm_cvtepi32_ps(exp); - // Now to extract mantissa - frac = _mm_or_ps(leadingOne, _mm_and_ps(aVal, _mm_castsi128_ps(_mm_set1_epi32(0x7fffff)))); + // Now to extract mantissa + frac = _mm_or_ps(leadingOne, + _mm_and_ps(aVal, _mm_castsi128_ps(_mm_set1_epi32(0x7fffff)))); #if LOG_POLY_DEGREE == 6 - mantissa = POLY5( frac, 3.1157899f, -3.3241990f, 2.5988452f, -1.2315303f, 3.1821337e-1f, -3.4436006e-2f); + mantissa = POLY5(frac, + 3.1157899f, + -3.3241990f, + 2.5988452f, + -1.2315303f, + 3.1821337e-1f, + -3.4436006e-2f); #elif LOG_POLY_DEGREE == 5 - mantissa = POLY4( frac, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f); + mantissa = POLY4(frac, + 2.8882704548164776201f, + -2.52074962577807006663f, + 1.48116647521213171641f, + -0.465725644288844778798f, + 0.0596515482674574969533f); #elif LOG_POLY_DEGREE == 4 - mantissa = POLY3( frac, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f); + mantissa = POLY3(frac, + 2.61761038894603480148f, + -1.75647175389045657003f, + 0.688243882994381274313f, + -0.107254423828329604454f); #elif LOG_POLY_DEGREE == 3 - mantissa = POLY2( frac, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f); + mantissa = POLY2(frac, + 2.28330284476918490682f, + -1.04913055217340124191f, + 0.204446009836232697516f); #else #error #endif - bVal = _mm_add_ps(bVal, _mm_mul_ps(mantissa, _mm_sub_ps(frac, leadingOne))); - _mm_store_ps(bPtr, bVal); + bVal = _mm_add_ps(bVal, _mm_mul_ps(mantissa, _mm_sub_ps(frac, leadingOne))); + _mm_store_ps(bPtr, bVal); - aPtr += 4; - bPtr += 4; - } + aPtr += 4; + bPtr += 4; + } - number = quarterPoints * 4; - volk_32f_log2_32f_generic(bPtr, aPtr, num_points-number); + number = quarterPoints * 4; + volk_32f_log2_32f_generic(bPtr, aPtr, num_points - number); } #endif /* LV_HAVE_SSE4_1 for aligned */ @@ -297,91 +383,91 @@ volk_32f_log2_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int nu #include /* these macros allow us to embed logs in other kernels */ -#define VLOG2Q_NEON_PREAMBLE() \ - int32x4_t one = vdupq_n_s32(0x000800000); \ - /* minimax polynomial */ \ - float32x4_t p0 = vdupq_n_f32(-3.0400402727048585); \ - float32x4_t p1 = vdupq_n_f32(6.1129631282966113); \ - float32x4_t p2 = vdupq_n_f32(-5.3419892024633207); \ - float32x4_t p3 = vdupq_n_f32(3.2865287703753912); \ - float32x4_t p4 = vdupq_n_f32(-1.2669182593441635); \ - float32x4_t p5 = 
vdupq_n_f32(0.2751487703421256); \ - float32x4_t p6 = vdupq_n_f32(-0.0256910888150985); \ - int32x4_t exp_mask = vdupq_n_s32(0x7f800000); \ - int32x4_t sig_mask = vdupq_n_s32(0x007fffff); \ - int32x4_t exp_bias = vdupq_n_s32(127); - - -#define VLOG2Q_NEON_F32(log2_approx, aval) \ - int32x4_t exponent_i = vandq_s32(aval, exp_mask); \ - int32x4_t significand_i = vandq_s32(aval, sig_mask); \ - exponent_i = vshrq_n_s32(exponent_i, 23); \ - \ - /* extract the exponent and significand \ - we can treat this as fixed point to save ~9% on the \ - conversion + float add */ \ - significand_i = vorrq_s32(one, significand_i); \ - float32x4_t significand_f = vcvtq_n_f32_s32(significand_i,23); \ - /* debias the exponent and convert to float */ \ - exponent_i = vsubq_s32(exponent_i, exp_bias); \ - float32x4_t exponent_f = vcvtq_f32_s32(exponent_i); \ - \ - /* put the significand through a polynomial fit of log2(x) [1,2] \ - add the result to the exponent */ \ - log2_approx = vaddq_f32(exponent_f, p0); /* p0 */ \ - float32x4_t tmp1 = vmulq_f32(significand_f, p1); /* p1 * x */ \ - log2_approx = vaddq_f32(log2_approx, tmp1); \ - float32x4_t sig_2 = vmulq_f32(significand_f, significand_f); /* x^2 */ \ - tmp1 = vmulq_f32(sig_2, p2); /* p2 * x^2 */ \ - log2_approx = vaddq_f32(log2_approx, tmp1); \ - \ - float32x4_t sig_3 = vmulq_f32(sig_2, significand_f); /* x^3 */ \ - tmp1 = vmulq_f32(sig_3, p3); /* p3 * x^3 */ \ - log2_approx = vaddq_f32(log2_approx, tmp1); \ - float32x4_t sig_4 = vmulq_f32(sig_2, sig_2); /* x^4 */ \ - tmp1 = vmulq_f32(sig_4, p4); /* p4 * x^4 */ \ - log2_approx = vaddq_f32(log2_approx, tmp1); \ - float32x4_t sig_5 = vmulq_f32(sig_3, sig_2); /* x^5 */ \ - tmp1 = vmulq_f32(sig_5, p5); /* p5 * x^5 */ \ - log2_approx = vaddq_f32(log2_approx, tmp1); \ - float32x4_t sig_6 = vmulq_f32(sig_3, sig_3); /* x^6 */ \ - tmp1 = vmulq_f32(sig_6, p6); /* p6 * x^6 */ \ - log2_approx = vaddq_f32(log2_approx, tmp1); +#define VLOG2Q_NEON_PREAMBLE() \ + int32x4_t one = vdupq_n_s32(0x000800000); \ + /* minimax polynomial */ \ + float32x4_t p0 = vdupq_n_f32(-3.0400402727048585); \ + float32x4_t p1 = vdupq_n_f32(6.1129631282966113); \ + float32x4_t p2 = vdupq_n_f32(-5.3419892024633207); \ + float32x4_t p3 = vdupq_n_f32(3.2865287703753912); \ + float32x4_t p4 = vdupq_n_f32(-1.2669182593441635); \ + float32x4_t p5 = vdupq_n_f32(0.2751487703421256); \ + float32x4_t p6 = vdupq_n_f32(-0.0256910888150985); \ + int32x4_t exp_mask = vdupq_n_s32(0x7f800000); \ + int32x4_t sig_mask = vdupq_n_s32(0x007fffff); \ + int32x4_t exp_bias = vdupq_n_s32(127); + + +#define VLOG2Q_NEON_F32(log2_approx, aval) \ + int32x4_t exponent_i = vandq_s32(aval, exp_mask); \ + int32x4_t significand_i = vandq_s32(aval, sig_mask); \ + exponent_i = vshrq_n_s32(exponent_i, 23); \ + \ + /* extract the exponent and significand \ + we can treat this as fixed point to save ~9% on the \ + conversion + float add */ \ + significand_i = vorrq_s32(one, significand_i); \ + float32x4_t significand_f = vcvtq_n_f32_s32(significand_i, 23); \ + /* debias the exponent and convert to float */ \ + exponent_i = vsubq_s32(exponent_i, exp_bias); \ + float32x4_t exponent_f = vcvtq_f32_s32(exponent_i); \ + \ + /* put the significand through a polynomial fit of log2(x) [1,2] \ + add the result to the exponent */ \ + log2_approx = vaddq_f32(exponent_f, p0); /* p0 */ \ + float32x4_t tmp1 = vmulq_f32(significand_f, p1); /* p1 * x */ \ + log2_approx = vaddq_f32(log2_approx, tmp1); \ + float32x4_t sig_2 = vmulq_f32(significand_f, significand_f); /* x^2 */ \ + tmp1 = 
vmulq_f32(sig_2, p2); /* p2 * x^2 */ \ + log2_approx = vaddq_f32(log2_approx, tmp1); \ + \ + float32x4_t sig_3 = vmulq_f32(sig_2, significand_f); /* x^3 */ \ + tmp1 = vmulq_f32(sig_3, p3); /* p3 * x^3 */ \ + log2_approx = vaddq_f32(log2_approx, tmp1); \ + float32x4_t sig_4 = vmulq_f32(sig_2, sig_2); /* x^4 */ \ + tmp1 = vmulq_f32(sig_4, p4); /* p4 * x^4 */ \ + log2_approx = vaddq_f32(log2_approx, tmp1); \ + float32x4_t sig_5 = vmulq_f32(sig_3, sig_2); /* x^5 */ \ + tmp1 = vmulq_f32(sig_5, p5); /* p5 * x^5 */ \ + log2_approx = vaddq_f32(log2_approx, tmp1); \ + float32x4_t sig_6 = vmulq_f32(sig_3, sig_3); /* x^6 */ \ + tmp1 = vmulq_f32(sig_6, p6); /* p6 * x^6 */ \ + log2_approx = vaddq_f32(log2_approx, tmp1); static inline void volk_32f_log2_32f_neon(float* bVector, const float* aVector, unsigned int num_points) { - float* bPtr = bVector; - const float* aPtr = aVector; - unsigned int number; - const unsigned int quarterPoints = num_points / 4; + float* bPtr = bVector; + const float* aPtr = aVector; + unsigned int number; + const unsigned int quarterPoints = num_points / 4; - int32x4_t aval; - float32x4_t log2_approx; + int32x4_t aval; + float32x4_t log2_approx; - VLOG2Q_NEON_PREAMBLE() - // lms - //p0 = vdupq_n_f32(-1.649132280361871); - //p1 = vdupq_n_f32(1.995047138579499); - //p2 = vdupq_n_f32(-0.336914839219728); + VLOG2Q_NEON_PREAMBLE() + // lms + // p0 = vdupq_n_f32(-1.649132280361871); + // p1 = vdupq_n_f32(1.995047138579499); + // p2 = vdupq_n_f32(-0.336914839219728); - // keep in mind a single precision float is represented as - // (-1)^sign * 2^exp * 1.significand, so the log2 is - // log2(2^exp * sig) = exponent + log2(1 + significand/(1<<23) - for(number = 0; number < quarterPoints; ++number){ - // load float in to an int register without conversion - aval = vld1q_s32((int*)aPtr); + // keep in mind a single precision float is represented as + // (-1)^sign * 2^exp * 1.significand, so the log2 is + // log2(2^exp * sig) = exponent + log2(1 + significand/(1<<23) + for (number = 0; number < quarterPoints; ++number) { + // load float in to an int register without conversion + aval = vld1q_s32((int*)aPtr); - VLOG2Q_NEON_F32(log2_approx, aval) + VLOG2Q_NEON_F32(log2_approx, aval) - vst1q_f32(bPtr, log2_approx); + vst1q_f32(bPtr, log2_approx); - aPtr += 4; - bPtr += 4; - } + aPtr += 4; + bPtr += 4; + } - number = quarterPoints * 4; - volk_32f_log2_32f_generic(bPtr, aPtr, num_points-number); + number = quarterPoints * 4; + volk_32f_log2_32f_generic(bPtr, aPtr, num_points - number); } #endif /* LV_HAVE_NEON */ @@ -398,14 +484,14 @@ volk_32f_log2_32f_neon(float* bVector, const float* aVector, unsigned int num_po static inline void volk_32f_log2_32f_u_generic(float* bVector, const float* aVector, unsigned int num_points) { - float* bPtr = bVector; - const float* aPtr = aVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - float const result = log2f(*aPtr++); - *bPtr++ = isinf(result) ? -127.0f : result; - } + float* bPtr = bVector; + const float* aPtr = aVector; + unsigned int number = 0; + + for (number = 0; number < num_points; number++) { + float const result = log2f(*aPtr++); + *bPtr++ = isinf(result) ? 
-127.0f : result; + } } #endif /* LV_HAVE_GENERIC */ @@ -417,54 +503,79 @@ volk_32f_log2_32f_u_generic(float* bVector, const float* aVector, unsigned int n #define POLY0(x, c0) _mm_set1_ps(c0) #define POLY1(x, c0, c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), _mm_set1_ps(c0)) #define POLY2(x, c0, c1, c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), _mm_set1_ps(c0)) -#define POLY3(x, c0, c1, c2, c3) _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0)) -#define POLY4(x, c0, c1, c2, c3, c4) _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0)) -#define POLY5(x, c0, c1, c2, c3, c4, c5) _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0)) +#define POLY3(x, c0, c1, c2, c3) \ + _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0)) +#define POLY4(x, c0, c1, c2, c3, c4) \ + _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0)) +#define POLY5(x, c0, c1, c2, c3, c4, c5) \ + _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0)) static inline void volk_32f_log2_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int num_points) { - float* bPtr = bVector; - const float* aPtr = aVector; + float* bPtr = bVector; + const float* aPtr = aVector; - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; - __m128 aVal, bVal, mantissa, frac, leadingOne; - __m128i bias, exp; + __m128 aVal, bVal, mantissa, frac, leadingOne; + __m128i bias, exp; - for(;number < quarterPoints; number++){ + for (; number < quarterPoints; number++) { - aVal = _mm_loadu_ps(aPtr); - bias = _mm_set1_epi32(127); - leadingOne = _mm_set1_ps(1.0f); - exp = _mm_sub_epi32(_mm_srli_epi32(_mm_and_si128(_mm_castps_si128(aVal), _mm_set1_epi32(0x7f800000)), 23), bias); - bVal = _mm_cvtepi32_ps(exp); + aVal = _mm_loadu_ps(aPtr); + bias = _mm_set1_epi32(127); + leadingOne = _mm_set1_ps(1.0f); + exp = _mm_sub_epi32( + _mm_srli_epi32( + _mm_and_si128(_mm_castps_si128(aVal), _mm_set1_epi32(0x7f800000)), 23), + bias); + bVal = _mm_cvtepi32_ps(exp); - // Now to extract mantissa - frac = _mm_or_ps(leadingOne, _mm_and_ps(aVal, _mm_castsi128_ps(_mm_set1_epi32(0x7fffff)))); + // Now to extract mantissa + frac = _mm_or_ps(leadingOne, + _mm_and_ps(aVal, _mm_castsi128_ps(_mm_set1_epi32(0x7fffff)))); #if LOG_POLY_DEGREE == 6 - mantissa = POLY5( frac, 3.1157899f, -3.3241990f, 2.5988452f, -1.2315303f, 3.1821337e-1f, -3.4436006e-2f); + mantissa = POLY5(frac, + 3.1157899f, + -3.3241990f, + 2.5988452f, + -1.2315303f, + 3.1821337e-1f, + -3.4436006e-2f); #elif LOG_POLY_DEGREE == 5 - mantissa = POLY4( frac, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f); + mantissa = POLY4(frac, + 2.8882704548164776201f, + -2.52074962577807006663f, + 1.48116647521213171641f, + -0.465725644288844778798f, + 0.0596515482674574969533f); #elif LOG_POLY_DEGREE == 4 - mantissa = POLY3( frac, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f); + mantissa = POLY3(frac, + 2.61761038894603480148f, + -1.75647175389045657003f, + 0.688243882994381274313f, + -0.107254423828329604454f); #elif LOG_POLY_DEGREE == 3 - mantissa = POLY2( frac, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f); + mantissa = POLY2(frac, + 2.28330284476918490682f, + -1.04913055217340124191f, + 0.204446009836232697516f); #else #error #endif - bVal = _mm_add_ps(bVal, 
_mm_mul_ps(mantissa, _mm_sub_ps(frac, leadingOne))); - _mm_storeu_ps(bPtr, bVal); + bVal = _mm_add_ps(bVal, _mm_mul_ps(mantissa, _mm_sub_ps(frac, leadingOne))); + _mm_storeu_ps(bPtr, bVal); - aPtr += 4; - bPtr += 4; - } + aPtr += 4; + bPtr += 4; + } - number = quarterPoints * 4; - volk_32f_log2_32f_u_generic(bPtr, aPtr, num_points-number); + number = quarterPoints * 4; + volk_32f_log2_32f_u_generic(bPtr, aPtr, num_points - number); } #endif /* LV_HAVE_SSE4_1 for unaligned */ @@ -473,56 +584,86 @@ volk_32f_log2_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int nu #include #define POLY0_FMAAVX2(x, c0) _mm256_set1_ps(c0) -#define POLY1_FMAAVX2(x, c0, c1) _mm256_fmadd_ps(POLY0_FMAAVX2(x, c1), x, _mm256_set1_ps(c0)) -#define POLY2_FMAAVX2(x, c0, c1, c2) _mm256_fmadd_ps(POLY1_FMAAVX2(x, c1, c2), x, _mm256_set1_ps(c0)) -#define POLY3_FMAAVX2(x, c0, c1, c2, c3) _mm256_fmadd_ps(POLY2_FMAAVX2(x, c1, c2, c3), x, _mm256_set1_ps(c0)) -#define POLY4_FMAAVX2(x, c0, c1, c2, c3, c4) _mm256_fmadd_ps(POLY3_FMAAVX2(x, c1, c2, c3, c4), x, _mm256_set1_ps(c0)) -#define POLY5_FMAAVX2(x, c0, c1, c2, c3, c4, c5) _mm256_fmadd_ps(POLY4_FMAAVX2(x, c1, c2, c3, c4, c5), x, _mm256_set1_ps(c0)) - -static inline void -volk_32f_log2_32f_u_avx2_fma(float* bVector, const float* aVector, unsigned int num_points) +#define POLY1_FMAAVX2(x, c0, c1) \ + _mm256_fmadd_ps(POLY0_FMAAVX2(x, c1), x, _mm256_set1_ps(c0)) +#define POLY2_FMAAVX2(x, c0, c1, c2) \ + _mm256_fmadd_ps(POLY1_FMAAVX2(x, c1, c2), x, _mm256_set1_ps(c0)) +#define POLY3_FMAAVX2(x, c0, c1, c2, c3) \ + _mm256_fmadd_ps(POLY2_FMAAVX2(x, c1, c2, c3), x, _mm256_set1_ps(c0)) +#define POLY4_FMAAVX2(x, c0, c1, c2, c3, c4) \ + _mm256_fmadd_ps(POLY3_FMAAVX2(x, c1, c2, c3, c4), x, _mm256_set1_ps(c0)) +#define POLY5_FMAAVX2(x, c0, c1, c2, c3, c4, c5) \ + _mm256_fmadd_ps(POLY4_FMAAVX2(x, c1, c2, c3, c4, c5), x, _mm256_set1_ps(c0)) + +static inline void volk_32f_log2_32f_u_avx2_fma(float* bVector, + const float* aVector, + unsigned int num_points) { - float* bPtr = bVector; - const float* aPtr = aVector; + float* bPtr = bVector; + const float* aPtr = aVector; - unsigned int number = 0; - const unsigned int eighthPoints = num_points / 8; + unsigned int number = 0; + const unsigned int eighthPoints = num_points / 8; - __m256 aVal, bVal, mantissa, frac, leadingOne; - __m256i bias, exp; + __m256 aVal, bVal, mantissa, frac, leadingOne; + __m256i bias, exp; - for(;number < eighthPoints; number++){ + for (; number < eighthPoints; number++) { - aVal = _mm256_loadu_ps(aPtr); - bias = _mm256_set1_epi32(127); - leadingOne = _mm256_set1_ps(1.0f); - exp = _mm256_sub_epi32(_mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal), _mm256_set1_epi32(0x7f800000)), 23), bias); - bVal = _mm256_cvtepi32_ps(exp); + aVal = _mm256_loadu_ps(aPtr); + bias = _mm256_set1_epi32(127); + leadingOne = _mm256_set1_ps(1.0f); + exp = _mm256_sub_epi32( + _mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal), + _mm256_set1_epi32(0x7f800000)), + 23), + bias); + bVal = _mm256_cvtepi32_ps(exp); - // Now to extract mantissa - frac = _mm256_or_ps(leadingOne, _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff)))); + // Now to extract mantissa + frac = _mm256_or_ps( + leadingOne, + _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff)))); #if LOG_POLY_DEGREE == 6 - mantissa = POLY5_FMAAVX2( frac, 3.1157899f, -3.3241990f, 2.5988452f, -1.2315303f, 3.1821337e-1f, -3.4436006e-2f); + mantissa = POLY5_FMAAVX2(frac, + 3.1157899f, + -3.3241990f, + 2.5988452f, + -1.2315303f, + 
3.1821337e-1f, + -3.4436006e-2f); #elif LOG_POLY_DEGREE == 5 - mantissa = POLY4_FMAAVX2( frac, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f); + mantissa = POLY4_FMAAVX2(frac, + 2.8882704548164776201f, + -2.52074962577807006663f, + 1.48116647521213171641f, + -0.465725644288844778798f, + 0.0596515482674574969533f); #elif LOG_POLY_DEGREE == 4 - mantissa = POLY3_FMAAVX2( frac, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f); + mantissa = POLY3_FMAAVX2(frac, + 2.61761038894603480148f, + -1.75647175389045657003f, + 0.688243882994381274313f, + -0.107254423828329604454f); #elif LOG_POLY_DEGREE == 3 - mantissa = POLY2_FMAAVX2( frac, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f); + mantissa = POLY2_FMAAVX2(frac, + 2.28330284476918490682f, + -1.04913055217340124191f, + 0.204446009836232697516f); #else #error #endif - bVal = _mm256_fmadd_ps(mantissa, _mm256_sub_ps(frac, leadingOne), bVal); - _mm256_storeu_ps(bPtr, bVal); + bVal = _mm256_fmadd_ps(mantissa, _mm256_sub_ps(frac, leadingOne), bVal); + _mm256_storeu_ps(bPtr, bVal); - aPtr += 8; - bPtr += 8; - } + aPtr += 8; + bPtr += 8; + } - number = eighthPoints * 8; - volk_32f_log2_32f_u_generic(bPtr, aPtr, num_points-number); + number = eighthPoints * 8; + volk_32f_log2_32f_u_generic(bPtr, aPtr, num_points - number); } #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for unaligned */ @@ -531,56 +672,86 @@ volk_32f_log2_32f_u_avx2_fma(float* bVector, const float* aVector, unsigned int #include #define POLY0_AVX2(x, c0) _mm256_set1_ps(c0) -#define POLY1_AVX2(x, c0, c1) _mm256_add_ps(_mm256_mul_ps(POLY0_AVX2(x, c1), x), _mm256_set1_ps(c0)) -#define POLY2_AVX2(x, c0, c1, c2) _mm256_add_ps(_mm256_mul_ps(POLY1_AVX2(x, c1, c2), x), _mm256_set1_ps(c0)) -#define POLY3_AVX2(x, c0, c1, c2, c3) _mm256_add_ps(_mm256_mul_ps(POLY2_AVX2(x, c1, c2, c3), x), _mm256_set1_ps(c0)) -#define POLY4_AVX2(x, c0, c1, c2, c3, c4) _mm256_add_ps(_mm256_mul_ps(POLY3_AVX2(x, c1, c2, c3, c4), x), _mm256_set1_ps(c0)) -#define POLY5_AVX2(x, c0, c1, c2, c3, c4, c5) _mm256_add_ps(_mm256_mul_ps(POLY4_AVX2(x, c1, c2, c3, c4, c5), x), _mm256_set1_ps(c0)) +#define POLY1_AVX2(x, c0, c1) \ + _mm256_add_ps(_mm256_mul_ps(POLY0_AVX2(x, c1), x), _mm256_set1_ps(c0)) +#define POLY2_AVX2(x, c0, c1, c2) \ + _mm256_add_ps(_mm256_mul_ps(POLY1_AVX2(x, c1, c2), x), _mm256_set1_ps(c0)) +#define POLY3_AVX2(x, c0, c1, c2, c3) \ + _mm256_add_ps(_mm256_mul_ps(POLY2_AVX2(x, c1, c2, c3), x), _mm256_set1_ps(c0)) +#define POLY4_AVX2(x, c0, c1, c2, c3, c4) \ + _mm256_add_ps(_mm256_mul_ps(POLY3_AVX2(x, c1, c2, c3, c4), x), _mm256_set1_ps(c0)) +#define POLY5_AVX2(x, c0, c1, c2, c3, c4, c5) \ + _mm256_add_ps(_mm256_mul_ps(POLY4_AVX2(x, c1, c2, c3, c4, c5), x), _mm256_set1_ps(c0)) static inline void volk_32f_log2_32f_u_avx2(float* bVector, const float* aVector, unsigned int num_points) { - float* bPtr = bVector; - const float* aPtr = aVector; + float* bPtr = bVector; + const float* aPtr = aVector; - unsigned int number = 0; - const unsigned int eighthPoints = num_points / 8; + unsigned int number = 0; + const unsigned int eighthPoints = num_points / 8; - __m256 aVal, bVal, mantissa, frac, leadingOne; - __m256i bias, exp; + __m256 aVal, bVal, mantissa, frac, leadingOne; + __m256i bias, exp; - for(;number < eighthPoints; number++){ + for (; number < eighthPoints; number++) { - aVal = _mm256_loadu_ps(aPtr); - bias = _mm256_set1_epi32(127); - leadingOne = 
_mm256_set1_ps(1.0f); - exp = _mm256_sub_epi32(_mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal), _mm256_set1_epi32(0x7f800000)), 23), bias); - bVal = _mm256_cvtepi32_ps(exp); + aVal = _mm256_loadu_ps(aPtr); + bias = _mm256_set1_epi32(127); + leadingOne = _mm256_set1_ps(1.0f); + exp = _mm256_sub_epi32( + _mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal), + _mm256_set1_epi32(0x7f800000)), + 23), + bias); + bVal = _mm256_cvtepi32_ps(exp); - // Now to extract mantissa - frac = _mm256_or_ps(leadingOne, _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff)))); + // Now to extract mantissa + frac = _mm256_or_ps( + leadingOne, + _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff)))); #if LOG_POLY_DEGREE == 6 - mantissa = POLY5_AVX2( frac, 3.1157899f, -3.3241990f, 2.5988452f, -1.2315303f, 3.1821337e-1f, -3.4436006e-2f); + mantissa = POLY5_AVX2(frac, + 3.1157899f, + -3.3241990f, + 2.5988452f, + -1.2315303f, + 3.1821337e-1f, + -3.4436006e-2f); #elif LOG_POLY_DEGREE == 5 - mantissa = POLY4_AVX2( frac, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f); + mantissa = POLY4_AVX2(frac, + 2.8882704548164776201f, + -2.52074962577807006663f, + 1.48116647521213171641f, + -0.465725644288844778798f, + 0.0596515482674574969533f); #elif LOG_POLY_DEGREE == 4 - mantissa = POLY3_AVX2( frac, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f); + mantissa = POLY3_AVX2(frac, + 2.61761038894603480148f, + -1.75647175389045657003f, + 0.688243882994381274313f, + -0.107254423828329604454f); #elif LOG_POLY_DEGREE == 3 - mantissa = POLY2_AVX2( frac, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f); + mantissa = POLY2_AVX2(frac, + 2.28330284476918490682f, + -1.04913055217340124191f, + 0.204446009836232697516f); #else #error #endif - bVal = _mm256_add_ps(_mm256_mul_ps(mantissa, _mm256_sub_ps(frac, leadingOne)), bVal); - _mm256_storeu_ps(bPtr, bVal); + bVal = + _mm256_add_ps(_mm256_mul_ps(mantissa, _mm256_sub_ps(frac, leadingOne)), bVal); + _mm256_storeu_ps(bPtr, bVal); - aPtr += 8; - bPtr += 8; - } + aPtr += 8; + bPtr += 8; + } - number = eighthPoints * 8; - volk_32f_log2_32f_u_generic(bPtr, aPtr, num_points-number); + number = eighthPoints * 8; + volk_32f_log2_32f_u_generic(bPtr, aPtr, num_points - number); } #endif /* LV_HAVE_AVX2 for unaligned */ diff --git a/kernels/volk/volk_32f_null_32f.h b/kernels/volk/volk_32f_null_32f.h index 95e8d1a..cbed229 100644 --- a/kernels/volk/volk_32f_null_32f.h +++ b/kernels/volk/volk_32f_null_32f.h @@ -20,9 +20,9 @@ * Boston, MA 02110-1301, USA. 
*/ -#include -#include #include +#include +#include #ifndef INCLUDED_volk_32f_null_32f_a_H #define INCLUDED_volk_32f_null_32f_a_H @@ -32,13 +32,13 @@ static inline void volk_32f_null_32f_generic(float* bVector, const float* aVector, unsigned int num_points) { - float* bPtr = bVector; - const float* aPtr = aVector; - unsigned int number; + float* bPtr = bVector; + const float* aPtr = aVector; + unsigned int number; - for(number = 0; number < num_points; number++){ - *bPtr++ = *aPtr++; - } + for (number = 0; number < num_points; number++) { + *bPtr++ = *aPtr++; + } } #endif /* LV_HAVE_GENERIC */ diff --git a/kernels/volk/volk_32f_s32f_32f_fm_detect_32f.h b/kernels/volk/volk_32f_s32f_32f_fm_detect_32f.h index 9879959..3bf7aea 100644 --- a/kernels/volk/volk_32f_s32f_32f_fm_detect_32f.h +++ b/kernels/volk/volk_32f_s32f_32f_fm_detect_32f.h @@ -30,14 +30,15 @@ * * Dispatcher Prototype * \code - * void volk_32f_s32f_32f_fm_detect_32f(float* outputVector, const float* inputVector, const float bound, float* saveValue, unsigned int num_points) - * \endcode + * void volk_32f_s32f_32f_fm_detect_32f(float* outputVector, const float* inputVector, + * const float bound, float* saveValue, unsigned int num_points) \endcode * * \b Inputs - * \li inputVector: The input vector containing phase data (must be on the interval (-bound, bound]). - * \li bound: The interval that the input phase data is in, which is used to modulo the differentiation. - * \li saveValue: A pointer to a float which contains the phase value of the sample before the first input sample. - * \li num_points The number of data points. + * \li inputVector: The input vector containing phase data (must be on the interval + * (-bound, bound]). \li bound: The interval that the input phase data is in, which is + * used to modulo the differentiation. \li saveValue: A pointer to a float which contains + * the phase value of the sample before the first input sample. \li num_points The number + * of data points. * * \b Outputs * \li outputVector: The vector where the results will be stored. @@ -62,67 +63,79 @@ #ifdef LV_HAVE_AVX #include -static inline void volk_32f_s32f_32f_fm_detect_32f_a_avx(float* outputVector, const float* inputVector, const float bound, float* saveValue, unsigned int num_points){ - if (num_points < 1) { - return; - } - unsigned int number = 1; - unsigned int j = 0; - // num_points-1 keeps Fedora 7's gcc from crashing... - // num_points won't work. :( - const unsigned int eighthPoints = (num_points-1) / 8; - - float* outPtr = outputVector; - const float* inPtr = inputVector; - __m256 upperBound = _mm256_set1_ps(bound); - __m256 lowerBound = _mm256_set1_ps(-bound); - __m256 next3old1; - __m256 next4; - __m256 boundAdjust; - __m256 posBoundAdjust = _mm256_set1_ps(-2*bound); // Subtract when we're above. - __m256 negBoundAdjust = _mm256_set1_ps(2*bound); // Add when we're below. - // Do the first 8 by hand since we're going in from the saveValue: - *outPtr = *inPtr - *saveValue; - if (*outPtr > bound) *outPtr -= 2*bound; - if (*outPtr < -bound) *outPtr += 2*bound; - inPtr++; - outPtr++; - for (j = 1; j < ( (8 < num_points) ? 
8 : num_points); j++) { - *outPtr = *(inPtr) - *(inPtr-1); - if (*outPtr > bound) *outPtr -= 2*bound; - if (*outPtr < -bound) *outPtr += 2*bound; - inPtr++; - outPtr++; - } - - for (; number < eighthPoints; number++) { - // Load data - next3old1 = _mm256_loadu_ps((float*) (inPtr-1)); - next4 = _mm256_load_ps(inPtr); - inPtr += 8; - // Subtract and store: - next3old1 = _mm256_sub_ps(next4, next3old1); - // Bound: - boundAdjust = _mm256_cmp_ps(next3old1, upperBound, _CMP_GT_OS); - boundAdjust = _mm256_and_ps(boundAdjust, posBoundAdjust); - next4 = _mm256_cmp_ps(next3old1, lowerBound, _CMP_LT_OS); - next4 = _mm256_and_ps(next4, negBoundAdjust); - boundAdjust = _mm256_or_ps(next4, boundAdjust); - // Make sure we're in the bounding interval: - next3old1 = _mm256_add_ps(next3old1, boundAdjust); - _mm256_store_ps(outPtr,next3old1); // Store the results back into the output - outPtr += 8; - } - - for (number = (8 > (eighthPoints*8) ? 8 : (8 * eighthPoints)); number < num_points; number++) { - *outPtr = *(inPtr) - *(inPtr-1); - if (*outPtr > bound) *outPtr -= 2*bound; - if (*outPtr < -bound) *outPtr += 2*bound; +static inline void volk_32f_s32f_32f_fm_detect_32f_a_avx(float* outputVector, + const float* inputVector, + const float bound, + float* saveValue, + unsigned int num_points) +{ + if (num_points < 1) { + return; + } + unsigned int number = 1; + unsigned int j = 0; + // num_points-1 keeps Fedora 7's gcc from crashing... + // num_points won't work. :( + const unsigned int eighthPoints = (num_points - 1) / 8; + + float* outPtr = outputVector; + const float* inPtr = inputVector; + __m256 upperBound = _mm256_set1_ps(bound); + __m256 lowerBound = _mm256_set1_ps(-bound); + __m256 next3old1; + __m256 next4; + __m256 boundAdjust; + __m256 posBoundAdjust = _mm256_set1_ps(-2 * bound); // Subtract when we're above. + __m256 negBoundAdjust = _mm256_set1_ps(2 * bound); // Add when we're below. + // Do the first 8 by hand since we're going in from the saveValue: + *outPtr = *inPtr - *saveValue; + if (*outPtr > bound) + *outPtr -= 2 * bound; + if (*outPtr < -bound) + *outPtr += 2 * bound; inPtr++; outPtr++; - } - - *saveValue = inputVector[num_points-1]; + for (j = 1; j < ((8 < num_points) ? 8 : num_points); j++) { + *outPtr = *(inPtr) - *(inPtr - 1); + if (*outPtr > bound) + *outPtr -= 2 * bound; + if (*outPtr < -bound) + *outPtr += 2 * bound; + inPtr++; + outPtr++; + } + + for (; number < eighthPoints; number++) { + // Load data + next3old1 = _mm256_loadu_ps((float*)(inPtr - 1)); + next4 = _mm256_load_ps(inPtr); + inPtr += 8; + // Subtract and store: + next3old1 = _mm256_sub_ps(next4, next3old1); + // Bound: + boundAdjust = _mm256_cmp_ps(next3old1, upperBound, _CMP_GT_OS); + boundAdjust = _mm256_and_ps(boundAdjust, posBoundAdjust); + next4 = _mm256_cmp_ps(next3old1, lowerBound, _CMP_LT_OS); + next4 = _mm256_and_ps(next4, negBoundAdjust); + boundAdjust = _mm256_or_ps(next4, boundAdjust); + // Make sure we're in the bounding interval: + next3old1 = _mm256_add_ps(next3old1, boundAdjust); + _mm256_store_ps(outPtr, next3old1); // Store the results back into the output + outPtr += 8; + } + + for (number = (8 > (eighthPoints * 8) ? 
8 : (8 * eighthPoints)); number < num_points; + number++) { + *outPtr = *(inPtr) - *(inPtr - 1); + if (*outPtr > bound) + *outPtr -= 2 * bound; + if (*outPtr < -bound) + *outPtr += 2 * bound; + inPtr++; + outPtr++; + } + + *saveValue = inputVector[num_points - 1]; } #endif /* LV_HAVE_AVX */ @@ -130,102 +143,122 @@ static inline void volk_32f_s32f_32f_fm_detect_32f_a_avx(float* outputVector, co #ifdef LV_HAVE_SSE #include -static inline void volk_32f_s32f_32f_fm_detect_32f_a_sse(float* outputVector, const float* inputVector, const float bound, float* saveValue, unsigned int num_points){ - if (num_points < 1) { - return; - } - unsigned int number = 1; - unsigned int j = 0; - // num_points-1 keeps Fedora 7's gcc from crashing... - // num_points won't work. :( - const unsigned int quarterPoints = (num_points-1) / 4; - - float* outPtr = outputVector; - const float* inPtr = inputVector; - __m128 upperBound = _mm_set_ps1(bound); - __m128 lowerBound = _mm_set_ps1(-bound); - __m128 next3old1; - __m128 next4; - __m128 boundAdjust; - __m128 posBoundAdjust = _mm_set_ps1(-2*bound); // Subtract when we're above. - __m128 negBoundAdjust = _mm_set_ps1(2*bound); // Add when we're below. - // Do the first 4 by hand since we're going in from the saveValue: - *outPtr = *inPtr - *saveValue; - if (*outPtr > bound) *outPtr -= 2*bound; - if (*outPtr < -bound) *outPtr += 2*bound; - inPtr++; - outPtr++; - for (j = 1; j < ( (4 < num_points) ? 4 : num_points); j++) { - *outPtr = *(inPtr) - *(inPtr-1); - if (*outPtr > bound) *outPtr -= 2*bound; - if (*outPtr < -bound) *outPtr += 2*bound; - inPtr++; - outPtr++; - } - - for (; number < quarterPoints; number++) { - // Load data - next3old1 = _mm_loadu_ps((float*) (inPtr-1)); - next4 = _mm_load_ps(inPtr); - inPtr += 4; - // Subtract and store: - next3old1 = _mm_sub_ps(next4, next3old1); - // Bound: - boundAdjust = _mm_cmpgt_ps(next3old1, upperBound); - boundAdjust = _mm_and_ps(boundAdjust, posBoundAdjust); - next4 = _mm_cmplt_ps(next3old1, lowerBound); - next4 = _mm_and_ps(next4, negBoundAdjust); - boundAdjust = _mm_or_ps(next4, boundAdjust); - // Make sure we're in the bounding interval: - next3old1 = _mm_add_ps(next3old1, boundAdjust); - _mm_store_ps(outPtr,next3old1); // Store the results back into the output - outPtr += 4; - } - - for (number = (4 > (quarterPoints*4) ? 4 : (4 * quarterPoints)); number < num_points; number++) { - *outPtr = *(inPtr) - *(inPtr-1); - if (*outPtr > bound) *outPtr -= 2*bound; - if (*outPtr < -bound) *outPtr += 2*bound; +static inline void volk_32f_s32f_32f_fm_detect_32f_a_sse(float* outputVector, + const float* inputVector, + const float bound, + float* saveValue, + unsigned int num_points) +{ + if (num_points < 1) { + return; + } + unsigned int number = 1; + unsigned int j = 0; + // num_points-1 keeps Fedora 7's gcc from crashing... + // num_points won't work. :( + const unsigned int quarterPoints = (num_points - 1) / 4; + + float* outPtr = outputVector; + const float* inPtr = inputVector; + __m128 upperBound = _mm_set_ps1(bound); + __m128 lowerBound = _mm_set_ps1(-bound); + __m128 next3old1; + __m128 next4; + __m128 boundAdjust; + __m128 posBoundAdjust = _mm_set_ps1(-2 * bound); // Subtract when we're above. + __m128 negBoundAdjust = _mm_set_ps1(2 * bound); // Add when we're below. 
+ // Do the first 4 by hand since we're going in from the saveValue: + *outPtr = *inPtr - *saveValue; + if (*outPtr > bound) + *outPtr -= 2 * bound; + if (*outPtr < -bound) + *outPtr += 2 * bound; inPtr++; outPtr++; - } - - *saveValue = inputVector[num_points-1]; + for (j = 1; j < ((4 < num_points) ? 4 : num_points); j++) { + *outPtr = *(inPtr) - *(inPtr - 1); + if (*outPtr > bound) + *outPtr -= 2 * bound; + if (*outPtr < -bound) + *outPtr += 2 * bound; + inPtr++; + outPtr++; + } + + for (; number < quarterPoints; number++) { + // Load data + next3old1 = _mm_loadu_ps((float*)(inPtr - 1)); + next4 = _mm_load_ps(inPtr); + inPtr += 4; + // Subtract and store: + next3old1 = _mm_sub_ps(next4, next3old1); + // Bound: + boundAdjust = _mm_cmpgt_ps(next3old1, upperBound); + boundAdjust = _mm_and_ps(boundAdjust, posBoundAdjust); + next4 = _mm_cmplt_ps(next3old1, lowerBound); + next4 = _mm_and_ps(next4, negBoundAdjust); + boundAdjust = _mm_or_ps(next4, boundAdjust); + // Make sure we're in the bounding interval: + next3old1 = _mm_add_ps(next3old1, boundAdjust); + _mm_store_ps(outPtr, next3old1); // Store the results back into the output + outPtr += 4; + } + + for (number = (4 > (quarterPoints * 4) ? 4 : (4 * quarterPoints)); + number < num_points; + number++) { + *outPtr = *(inPtr) - *(inPtr - 1); + if (*outPtr > bound) + *outPtr -= 2 * bound; + if (*outPtr < -bound) + *outPtr += 2 * bound; + inPtr++; + outPtr++; + } + + *saveValue = inputVector[num_points - 1]; } #endif /* LV_HAVE_SSE */ #ifdef LV_HAVE_GENERIC -static inline void volk_32f_s32f_32f_fm_detect_32f_generic(float* outputVector, const float* inputVector, const float bound, float* saveValue, unsigned int num_points){ - if (num_points < 1) { - return; - } - unsigned int number = 0; - float* outPtr = outputVector; - const float* inPtr = inputVector; - - // Do the first 1 by hand since we're going in from the saveValue: - *outPtr = *inPtr - *saveValue; - if (*outPtr > bound) *outPtr -= 2*bound; - if (*outPtr < -bound) *outPtr += 2*bound; - inPtr++; - outPtr++; - - for (number = 1; number < num_points; number++) { - *outPtr = *(inPtr) - *(inPtr-1); - if (*outPtr > bound) *outPtr -= 2*bound; - if (*outPtr < -bound) *outPtr += 2*bound; +static inline void volk_32f_s32f_32f_fm_detect_32f_generic(float* outputVector, + const float* inputVector, + const float bound, + float* saveValue, + unsigned int num_points) +{ + if (num_points < 1) { + return; + } + unsigned int number = 0; + float* outPtr = outputVector; + const float* inPtr = inputVector; + + // Do the first 1 by hand since we're going in from the saveValue: + *outPtr = *inPtr - *saveValue; + if (*outPtr > bound) + *outPtr -= 2 * bound; + if (*outPtr < -bound) + *outPtr += 2 * bound; inPtr++; outPtr++; - } - *saveValue = inputVector[num_points-1]; + for (number = 1; number < num_points; number++) { + *outPtr = *(inPtr) - *(inPtr - 1); + if (*outPtr > bound) + *outPtr -= 2 * bound; + if (*outPtr < -bound) + *outPtr += 2 * bound; + inPtr++; + outPtr++; + } + + *saveValue = inputVector[num_points - 1]; } #endif /* LV_HAVE_GENERIC */ - - #endif /* INCLUDED_volk_32f_s32f_32f_fm_detect_32f_a_H */ @@ -238,67 +271,79 @@ static inline void volk_32f_s32f_32f_fm_detect_32f_generic(float* outputVector, #ifdef LV_HAVE_AVX #include -static inline void volk_32f_s32f_32f_fm_detect_32f_u_avx(float* outputVector, const float* inputVector, const float bound, float* saveValue, unsigned int num_points){ - if (num_points < 1) { - return; - } - unsigned int number = 1; - unsigned int j = 0; - // num_points-1 keeps 
Fedora 7's gcc from crashing... - // num_points won't work. :( - const unsigned int eighthPoints = (num_points-1) / 8; - - float* outPtr = outputVector; - const float* inPtr = inputVector; - __m256 upperBound = _mm256_set1_ps(bound); - __m256 lowerBound = _mm256_set1_ps(-bound); - __m256 next3old1; - __m256 next4; - __m256 boundAdjust; - __m256 posBoundAdjust = _mm256_set1_ps(-2*bound); // Subtract when we're above. - __m256 negBoundAdjust = _mm256_set1_ps(2*bound); // Add when we're below. - // Do the first 8 by hand since we're going in from the saveValue: - *outPtr = *inPtr - *saveValue; - if (*outPtr > bound) *outPtr -= 2*bound; - if (*outPtr < -bound) *outPtr += 2*bound; - inPtr++; - outPtr++; - for (j = 1; j < ( (8 < num_points) ? 8 : num_points); j++) { - *outPtr = *(inPtr) - *(inPtr-1); - if (*outPtr > bound) *outPtr -= 2*bound; - if (*outPtr < -bound) *outPtr += 2*bound; +static inline void volk_32f_s32f_32f_fm_detect_32f_u_avx(float* outputVector, + const float* inputVector, + const float bound, + float* saveValue, + unsigned int num_points) +{ + if (num_points < 1) { + return; + } + unsigned int number = 1; + unsigned int j = 0; + // num_points-1 keeps Fedora 7's gcc from crashing... + // num_points won't work. :( + const unsigned int eighthPoints = (num_points - 1) / 8; + + float* outPtr = outputVector; + const float* inPtr = inputVector; + __m256 upperBound = _mm256_set1_ps(bound); + __m256 lowerBound = _mm256_set1_ps(-bound); + __m256 next3old1; + __m256 next4; + __m256 boundAdjust; + __m256 posBoundAdjust = _mm256_set1_ps(-2 * bound); // Subtract when we're above. + __m256 negBoundAdjust = _mm256_set1_ps(2 * bound); // Add when we're below. + // Do the first 8 by hand since we're going in from the saveValue: + *outPtr = *inPtr - *saveValue; + if (*outPtr > bound) + *outPtr -= 2 * bound; + if (*outPtr < -bound) + *outPtr += 2 * bound; inPtr++; outPtr++; - } - - for (; number < eighthPoints; number++) { - // Load data - next3old1 = _mm256_loadu_ps((float*) (inPtr-1)); - next4 = _mm256_loadu_ps(inPtr); - inPtr += 8; - // Subtract and store: - next3old1 = _mm256_sub_ps(next4, next3old1); - // Bound: - boundAdjust = _mm256_cmp_ps(next3old1, upperBound, _CMP_GT_OS); - boundAdjust = _mm256_and_ps(boundAdjust, posBoundAdjust); - next4 = _mm256_cmp_ps(next3old1, lowerBound, _CMP_LT_OS); - next4 = _mm256_and_ps(next4, negBoundAdjust); - boundAdjust = _mm256_or_ps(next4, boundAdjust); - // Make sure we're in the bounding interval: - next3old1 = _mm256_add_ps(next3old1, boundAdjust); - _mm256_storeu_ps(outPtr,next3old1); // Store the results back into the output - outPtr += 8; - } - - for (number = (8 > (eighthPoints*8) ? 8 : (8 * eighthPoints)); number < num_points; number++) { - *outPtr = *(inPtr) - *(inPtr-1); - if (*outPtr > bound) *outPtr -= 2*bound; - if (*outPtr < -bound) *outPtr += 2*bound; - inPtr++; - outPtr++; - } - - *saveValue = inputVector[num_points-1]; + for (j = 1; j < ((8 < num_points) ? 
8 : num_points); j++) { + *outPtr = *(inPtr) - *(inPtr - 1); + if (*outPtr > bound) + *outPtr -= 2 * bound; + if (*outPtr < -bound) + *outPtr += 2 * bound; + inPtr++; + outPtr++; + } + + for (; number < eighthPoints; number++) { + // Load data + next3old1 = _mm256_loadu_ps((float*)(inPtr - 1)); + next4 = _mm256_loadu_ps(inPtr); + inPtr += 8; + // Subtract and store: + next3old1 = _mm256_sub_ps(next4, next3old1); + // Bound: + boundAdjust = _mm256_cmp_ps(next3old1, upperBound, _CMP_GT_OS); + boundAdjust = _mm256_and_ps(boundAdjust, posBoundAdjust); + next4 = _mm256_cmp_ps(next3old1, lowerBound, _CMP_LT_OS); + next4 = _mm256_and_ps(next4, negBoundAdjust); + boundAdjust = _mm256_or_ps(next4, boundAdjust); + // Make sure we're in the bounding interval: + next3old1 = _mm256_add_ps(next3old1, boundAdjust); + _mm256_storeu_ps(outPtr, next3old1); // Store the results back into the output + outPtr += 8; + } + + for (number = (8 > (eighthPoints * 8) ? 8 : (8 * eighthPoints)); number < num_points; + number++) { + *outPtr = *(inPtr) - *(inPtr - 1); + if (*outPtr > bound) + *outPtr -= 2 * bound; + if (*outPtr < -bound) + *outPtr += 2 * bound; + inPtr++; + outPtr++; + } + + *saveValue = inputVector[num_points - 1]; } #endif /* LV_HAVE_AVX */ diff --git a/kernels/volk/volk_32f_s32f_calc_spectral_noise_floor_32f.h b/kernels/volk/volk_32f_s32f_calc_spectral_noise_floor_32f.h index ae371a2..e7e581f 100644 --- a/kernels/volk/volk_32f_s32f_calc_spectral_noise_floor_32f.h +++ b/kernels/volk/volk_32f_s32f_calc_spectral_noise_floor_32f.h @@ -35,13 +35,15 @@ * * Dispatcher Prototype * \code - * void volk_32f_s32f_calc_spectral_noise_floor_32f(float* noiseFloorAmplitude, const float* realDataPoints, const float spectralExclusionValue, const unsigned int num_points) - * \endcode + * void volk_32f_s32f_calc_spectral_noise_floor_32f(float* noiseFloorAmplitude, const + * float* realDataPoints, const float spectralExclusionValue, const unsigned int + * num_points) \endcode * * \b Inputs * \li realDataPoints: The input power spectrum. - * \li spectralExclusionValue: The number of dB above the noise floor that a data point must be to be excluded from the noise floor calculation - default value is 20. - * \li num_points: The number of data points. + * \li spectralExclusionValue: The number of dB above the noise floor that a data point + * must be to be excluded from the noise floor calculation - default value is 20. \li + * num_points: The number of data points. * * \b Outputs * \li noiseFloorAmplitude: The noise floor of the input spectrum, in dB. 
@@ -59,9 +61,9 @@ #ifndef INCLUDED_volk_32f_s32f_calc_spectral_noise_floor_32f_a_H #define INCLUDED_volk_32f_s32f_calc_spectral_noise_floor_32f_a_H -#include #include #include +#include #ifdef LV_HAVE_AVX #include @@ -72,114 +74,117 @@ volk_32f_s32f_calc_spectral_noise_floor_32f_a_avx(float* noiseFloorAmplitude, const float spectralExclusionValue, const unsigned int num_points) { - unsigned int number = 0; - const unsigned int eighthPoints = num_points / 8; - - const float* dataPointsPtr = realDataPoints; - __VOLK_ATTR_ALIGNED(32) float avgPointsVector[8]; - - __m256 dataPointsVal; - __m256 avgPointsVal = _mm256_setzero_ps(); - // Calculate the sum (for mean) for all points - for(; number < eighthPoints; number++){ - - dataPointsVal = _mm256_load_ps(dataPointsPtr); - - dataPointsPtr += 8; - - avgPointsVal = _mm256_add_ps(avgPointsVal, dataPointsVal); - } - - _mm256_store_ps(avgPointsVector, avgPointsVal); - - float sumMean = 0.0; - sumMean += avgPointsVector[0]; - sumMean += avgPointsVector[1]; - sumMean += avgPointsVector[2]; - sumMean += avgPointsVector[3]; - sumMean += avgPointsVector[4]; - sumMean += avgPointsVector[5]; - sumMean += avgPointsVector[6]; - sumMean += avgPointsVector[7]; - - number = eighthPoints * 8; - for(;number < num_points; number++){ - sumMean += realDataPoints[number]; - } - - // calculate the spectral mean - // +20 because for the comparison below we only want to throw out bins - // that are significantly higher (and would, thus, affect the mean more - const float meanAmplitude = (sumMean / ((float)num_points)) + spectralExclusionValue; - - dataPointsPtr = realDataPoints; // Reset the dataPointsPtr - __m256 vMeanAmplitudeVector = _mm256_set1_ps(meanAmplitude); - __m256 vOnesVector = _mm256_set1_ps(1.0); - __m256 vValidBinCount = _mm256_setzero_ps(); - avgPointsVal = _mm256_setzero_ps(); - __m256 compareMask; - number = 0; - // Calculate the sum (for mean) for any points which do NOT exceed the mean amplitude - for(; number < eighthPoints; number++){ - - dataPointsVal = _mm256_load_ps(dataPointsPtr); - - dataPointsPtr += 8; - - // Identify which items do not exceed the mean amplitude - compareMask = _mm256_cmp_ps(dataPointsVal, vMeanAmplitudeVector, _CMP_LE_OQ); - - // Mask off the items that exceed the mean amplitude and add the avg Points that do not exceed the mean amplitude - avgPointsVal = _mm256_add_ps(avgPointsVal, _mm256_and_ps(compareMask, dataPointsVal)); - - // Count the number of bins which do not exceed the mean amplitude - vValidBinCount = _mm256_add_ps(vValidBinCount, _mm256_and_ps(compareMask, vOnesVector)); - } - - // Calculate the mean from the remaining data points - _mm256_store_ps(avgPointsVector, avgPointsVal); - - sumMean = 0.0; - sumMean += avgPointsVector[0]; - sumMean += avgPointsVector[1]; - sumMean += avgPointsVector[2]; - sumMean += avgPointsVector[3]; - sumMean += avgPointsVector[4]; - sumMean += avgPointsVector[5]; - sumMean += avgPointsVector[6]; - sumMean += avgPointsVector[7]; - - // Calculate the number of valid bins from the remaining count - __VOLK_ATTR_ALIGNED(32) float validBinCountVector[8]; - _mm256_store_ps(validBinCountVector, vValidBinCount); - - float validBinCount = 0; - validBinCount += validBinCountVector[0]; - validBinCount += validBinCountVector[1]; - validBinCount += validBinCountVector[2]; - validBinCount += validBinCountVector[3]; - validBinCount += validBinCountVector[4]; - validBinCount += validBinCountVector[5]; - validBinCount += validBinCountVector[6]; - validBinCount += validBinCountVector[7]; - - number = 
eighthPoints * 8; - for(;number < num_points; number++){ - if(realDataPoints[number] <= meanAmplitude){ - sumMean += realDataPoints[number]; - validBinCount += 1.0; + unsigned int number = 0; + const unsigned int eighthPoints = num_points / 8; + + const float* dataPointsPtr = realDataPoints; + __VOLK_ATTR_ALIGNED(32) float avgPointsVector[8]; + + __m256 dataPointsVal; + __m256 avgPointsVal = _mm256_setzero_ps(); + // Calculate the sum (for mean) for all points + for (; number < eighthPoints; number++) { + + dataPointsVal = _mm256_load_ps(dataPointsPtr); + + dataPointsPtr += 8; + + avgPointsVal = _mm256_add_ps(avgPointsVal, dataPointsVal); } - } - float localNoiseFloorAmplitude = 0; - if(validBinCount > 0.0){ - localNoiseFloorAmplitude = sumMean / validBinCount; - } - else{ - localNoiseFloorAmplitude = meanAmplitude; // For the odd case that all the amplitudes are equal... - } + _mm256_store_ps(avgPointsVector, avgPointsVal); + + float sumMean = 0.0; + sumMean += avgPointsVector[0]; + sumMean += avgPointsVector[1]; + sumMean += avgPointsVector[2]; + sumMean += avgPointsVector[3]; + sumMean += avgPointsVector[4]; + sumMean += avgPointsVector[5]; + sumMean += avgPointsVector[6]; + sumMean += avgPointsVector[7]; + + number = eighthPoints * 8; + for (; number < num_points; number++) { + sumMean += realDataPoints[number]; + } + + // calculate the spectral mean + // +20 because for the comparison below we only want to throw out bins + // that are significantly higher (and would, thus, affect the mean more + const float meanAmplitude = (sumMean / ((float)num_points)) + spectralExclusionValue; + + dataPointsPtr = realDataPoints; // Reset the dataPointsPtr + __m256 vMeanAmplitudeVector = _mm256_set1_ps(meanAmplitude); + __m256 vOnesVector = _mm256_set1_ps(1.0); + __m256 vValidBinCount = _mm256_setzero_ps(); + avgPointsVal = _mm256_setzero_ps(); + __m256 compareMask; + number = 0; + // Calculate the sum (for mean) for any points which do NOT exceed the mean amplitude + for (; number < eighthPoints; number++) { + + dataPointsVal = _mm256_load_ps(dataPointsPtr); + + dataPointsPtr += 8; + + // Identify which items do not exceed the mean amplitude + compareMask = _mm256_cmp_ps(dataPointsVal, vMeanAmplitudeVector, _CMP_LE_OQ); + + // Mask off the items that exceed the mean amplitude and add the avg Points that + // do not exceed the mean amplitude + avgPointsVal = + _mm256_add_ps(avgPointsVal, _mm256_and_ps(compareMask, dataPointsVal)); + + // Count the number of bins which do not exceed the mean amplitude + vValidBinCount = + _mm256_add_ps(vValidBinCount, _mm256_and_ps(compareMask, vOnesVector)); + } - *noiseFloorAmplitude = localNoiseFloorAmplitude; + // Calculate the mean from the remaining data points + _mm256_store_ps(avgPointsVector, avgPointsVal); + + sumMean = 0.0; + sumMean += avgPointsVector[0]; + sumMean += avgPointsVector[1]; + sumMean += avgPointsVector[2]; + sumMean += avgPointsVector[3]; + sumMean += avgPointsVector[4]; + sumMean += avgPointsVector[5]; + sumMean += avgPointsVector[6]; + sumMean += avgPointsVector[7]; + + // Calculate the number of valid bins from the remaining count + __VOLK_ATTR_ALIGNED(32) float validBinCountVector[8]; + _mm256_store_ps(validBinCountVector, vValidBinCount); + + float validBinCount = 0; + validBinCount += validBinCountVector[0]; + validBinCount += validBinCountVector[1]; + validBinCount += validBinCountVector[2]; + validBinCount += validBinCountVector[3]; + validBinCount += validBinCountVector[4]; + validBinCount += validBinCountVector[5]; + validBinCount 
+= validBinCountVector[6]; + validBinCount += validBinCountVector[7]; + + number = eighthPoints * 8; + for (; number < num_points; number++) { + if (realDataPoints[number] <= meanAmplitude) { + sumMean += realDataPoints[number]; + validBinCount += 1.0; + } + } + + float localNoiseFloorAmplitude = 0; + if (validBinCount > 0.0) { + localNoiseFloorAmplitude = sumMean / validBinCount; + } else { + localNoiseFloorAmplitude = + meanAmplitude; // For the odd case that all the amplitudes are equal... + } + + *noiseFloorAmplitude = localNoiseFloorAmplitude; } #endif /* LV_HAVE_AVX */ @@ -192,102 +197,103 @@ volk_32f_s32f_calc_spectral_noise_floor_32f_a_sse(float* noiseFloorAmplitude, const float spectralExclusionValue, const unsigned int num_points) { - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - const float* dataPointsPtr = realDataPoints; - __VOLK_ATTR_ALIGNED(16) float avgPointsVector[4]; - - __m128 dataPointsVal; - __m128 avgPointsVal = _mm_setzero_ps(); - // Calculate the sum (for mean) for all points - for(; number < quarterPoints; number++){ - - dataPointsVal = _mm_load_ps(dataPointsPtr); - - dataPointsPtr += 4; - - avgPointsVal = _mm_add_ps(avgPointsVal, dataPointsVal); - } - - _mm_store_ps(avgPointsVector, avgPointsVal); - - float sumMean = 0.0; - sumMean += avgPointsVector[0]; - sumMean += avgPointsVector[1]; - sumMean += avgPointsVector[2]; - sumMean += avgPointsVector[3]; - - number = quarterPoints * 4; - for(;number < num_points; number++){ - sumMean += realDataPoints[number]; - } - - // calculate the spectral mean - // +20 because for the comparison below we only want to throw out bins - // that are significantly higher (and would, thus, affect the mean more - const float meanAmplitude = (sumMean / ((float)num_points)) + spectralExclusionValue; - - dataPointsPtr = realDataPoints; // Reset the dataPointsPtr - __m128 vMeanAmplitudeVector = _mm_set_ps1(meanAmplitude); - __m128 vOnesVector = _mm_set_ps1(1.0); - __m128 vValidBinCount = _mm_setzero_ps(); - avgPointsVal = _mm_setzero_ps(); - __m128 compareMask; - number = 0; - // Calculate the sum (for mean) for any points which do NOT exceed the mean amplitude - for(; number < quarterPoints; number++){ - - dataPointsVal = _mm_load_ps(dataPointsPtr); - - dataPointsPtr += 4; - - // Identify which items do not exceed the mean amplitude - compareMask = _mm_cmple_ps(dataPointsVal, vMeanAmplitudeVector); - - // Mask off the items that exceed the mean amplitude and add the avg Points that do not exceed the mean amplitude - avgPointsVal = _mm_add_ps(avgPointsVal, _mm_and_ps(compareMask, dataPointsVal)); - - // Count the number of bins which do not exceed the mean amplitude - vValidBinCount = _mm_add_ps(vValidBinCount, _mm_and_ps(compareMask, vOnesVector)); - } - - // Calculate the mean from the remaining data points - _mm_store_ps(avgPointsVector, avgPointsVal); - - sumMean = 0.0; - sumMean += avgPointsVector[0]; - sumMean += avgPointsVector[1]; - sumMean += avgPointsVector[2]; - sumMean += avgPointsVector[3]; - - // Calculate the number of valid bins from the remaining count - __VOLK_ATTR_ALIGNED(16) float validBinCountVector[4]; - _mm_store_ps(validBinCountVector, vValidBinCount); - - float validBinCount = 0; - validBinCount += validBinCountVector[0]; - validBinCount += validBinCountVector[1]; - validBinCount += validBinCountVector[2]; - validBinCount += validBinCountVector[3]; - - number = quarterPoints * 4; - for(;number < num_points; number++){ - if(realDataPoints[number] <= meanAmplitude){ - sumMean += 
realDataPoints[number]; - validBinCount += 1.0; + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + const float* dataPointsPtr = realDataPoints; + __VOLK_ATTR_ALIGNED(16) float avgPointsVector[4]; + + __m128 dataPointsVal; + __m128 avgPointsVal = _mm_setzero_ps(); + // Calculate the sum (for mean) for all points + for (; number < quarterPoints; number++) { + + dataPointsVal = _mm_load_ps(dataPointsPtr); + + dataPointsPtr += 4; + + avgPointsVal = _mm_add_ps(avgPointsVal, dataPointsVal); + } + + _mm_store_ps(avgPointsVector, avgPointsVal); + + float sumMean = 0.0; + sumMean += avgPointsVector[0]; + sumMean += avgPointsVector[1]; + sumMean += avgPointsVector[2]; + sumMean += avgPointsVector[3]; + + number = quarterPoints * 4; + for (; number < num_points; number++) { + sumMean += realDataPoints[number]; + } + + // calculate the spectral mean + // +20 because for the comparison below we only want to throw out bins + // that are significantly higher (and would, thus, affect the mean more + const float meanAmplitude = (sumMean / ((float)num_points)) + spectralExclusionValue; + + dataPointsPtr = realDataPoints; // Reset the dataPointsPtr + __m128 vMeanAmplitudeVector = _mm_set_ps1(meanAmplitude); + __m128 vOnesVector = _mm_set_ps1(1.0); + __m128 vValidBinCount = _mm_setzero_ps(); + avgPointsVal = _mm_setzero_ps(); + __m128 compareMask; + number = 0; + // Calculate the sum (for mean) for any points which do NOT exceed the mean amplitude + for (; number < quarterPoints; number++) { + + dataPointsVal = _mm_load_ps(dataPointsPtr); + + dataPointsPtr += 4; + + // Identify which items do not exceed the mean amplitude + compareMask = _mm_cmple_ps(dataPointsVal, vMeanAmplitudeVector); + + // Mask off the items that exceed the mean amplitude and add the avg Points that + // do not exceed the mean amplitude + avgPointsVal = _mm_add_ps(avgPointsVal, _mm_and_ps(compareMask, dataPointsVal)); + + // Count the number of bins which do not exceed the mean amplitude + vValidBinCount = _mm_add_ps(vValidBinCount, _mm_and_ps(compareMask, vOnesVector)); } - } - float localNoiseFloorAmplitude = 0; - if(validBinCount > 0.0){ - localNoiseFloorAmplitude = sumMean / validBinCount; - } - else{ - localNoiseFloorAmplitude = meanAmplitude; // For the odd case that all the amplitudes are equal... - } + // Calculate the mean from the remaining data points + _mm_store_ps(avgPointsVector, avgPointsVal); + + sumMean = 0.0; + sumMean += avgPointsVector[0]; + sumMean += avgPointsVector[1]; + sumMean += avgPointsVector[2]; + sumMean += avgPointsVector[3]; + + // Calculate the number of valid bins from the remaining count + __VOLK_ATTR_ALIGNED(16) float validBinCountVector[4]; + _mm_store_ps(validBinCountVector, vValidBinCount); + + float validBinCount = 0; + validBinCount += validBinCountVector[0]; + validBinCount += validBinCountVector[1]; + validBinCount += validBinCountVector[2]; + validBinCount += validBinCountVector[3]; + + number = quarterPoints * 4; + for (; number < num_points; number++) { + if (realDataPoints[number] <= meanAmplitude) { + sumMean += realDataPoints[number]; + validBinCount += 1.0; + } + } + + float localNoiseFloorAmplitude = 0; + if (validBinCount > 0.0) { + localNoiseFloorAmplitude = sumMean / validBinCount; + } else { + localNoiseFloorAmplitude = + meanAmplitude; // For the odd case that all the amplitudes are equal... 
+ } - *noiseFloorAmplitude = localNoiseFloorAmplitude; + *noiseFloorAmplitude = localNoiseFloorAmplitude; } #endif /* LV_HAVE_SSE */ @@ -300,36 +306,36 @@ volk_32f_s32f_calc_spectral_noise_floor_32f_generic(float* noiseFloorAmplitude, const float spectralExclusionValue, const unsigned int num_points) { - float sumMean = 0.0; - unsigned int number; - // find the sum (for mean), etc - for(number = 0; number < num_points; number++){ - // sum (for mean) - sumMean += realDataPoints[number]; - } - - // calculate the spectral mean - // +20 because for the comparison below we only want to throw out bins - // that are significantly higher (and would, thus, affect the mean more) - const float meanAmplitude = (sumMean / num_points) + spectralExclusionValue; - - // now throw out any bins higher than the mean - sumMean = 0.0; - unsigned int newNumDataPoints = num_points; - for(number = 0; number < num_points; number++){ - if (realDataPoints[number] <= meanAmplitude) - sumMean += realDataPoints[number]; - else - newNumDataPoints--; - } + float sumMean = 0.0; + unsigned int number; + // find the sum (for mean), etc + for (number = 0; number < num_points; number++) { + // sum (for mean) + sumMean += realDataPoints[number]; + } + + // calculate the spectral mean + // +20 because for the comparison below we only want to throw out bins + // that are significantly higher (and would, thus, affect the mean more) + const float meanAmplitude = (sumMean / num_points) + spectralExclusionValue; + + // now throw out any bins higher than the mean + sumMean = 0.0; + unsigned int newNumDataPoints = num_points; + for (number = 0; number < num_points; number++) { + if (realDataPoints[number] <= meanAmplitude) + sumMean += realDataPoints[number]; + else + newNumDataPoints--; + } - float localNoiseFloorAmplitude = 0.0; - if (newNumDataPoints == 0) // in the odd case that all - localNoiseFloorAmplitude = meanAmplitude; // amplitudes are equal! - else - localNoiseFloorAmplitude = sumMean / ((float)newNumDataPoints); + float localNoiseFloorAmplitude = 0.0; + if (newNumDataPoints == 0) // in the odd case that all + localNoiseFloorAmplitude = meanAmplitude; // amplitudes are equal! 
+ else + localNoiseFloorAmplitude = sumMean / ((float)newNumDataPoints); - *noiseFloorAmplitude = localNoiseFloorAmplitude; + *noiseFloorAmplitude = localNoiseFloorAmplitude; } #endif /* LV_HAVE_GENERIC */ @@ -339,9 +345,9 @@ volk_32f_s32f_calc_spectral_noise_floor_32f_generic(float* noiseFloorAmplitude, #ifndef INCLUDED_volk_32f_s32f_calc_spectral_noise_floor_32f_u_H #define INCLUDED_volk_32f_s32f_calc_spectral_noise_floor_32f_u_H -#include #include #include +#include #ifdef LV_HAVE_AVX #include @@ -352,114 +358,117 @@ volk_32f_s32f_calc_spectral_noise_floor_32f_u_avx(float* noiseFloorAmplitude, const float spectralExclusionValue, const unsigned int num_points) { - unsigned int number = 0; - const unsigned int eighthPoints = num_points / 8; - - const float* dataPointsPtr = realDataPoints; - __VOLK_ATTR_ALIGNED(16) float avgPointsVector[8]; - - __m256 dataPointsVal; - __m256 avgPointsVal = _mm256_setzero_ps(); - // Calculate the sum (for mean) for all points - for(; number < eighthPoints; number++){ - - dataPointsVal = _mm256_loadu_ps(dataPointsPtr); - - dataPointsPtr += 8; - - avgPointsVal = _mm256_add_ps(avgPointsVal, dataPointsVal); - } - - _mm256_storeu_ps(avgPointsVector, avgPointsVal); - - float sumMean = 0.0; - sumMean += avgPointsVector[0]; - sumMean += avgPointsVector[1]; - sumMean += avgPointsVector[2]; - sumMean += avgPointsVector[3]; - sumMean += avgPointsVector[4]; - sumMean += avgPointsVector[5]; - sumMean += avgPointsVector[6]; - sumMean += avgPointsVector[7]; - - number = eighthPoints * 8; - for(;number < num_points; number++){ - sumMean += realDataPoints[number]; - } - - // calculate the spectral mean - // +20 because for the comparison below we only want to throw out bins - // that are significantly higher (and would, thus, affect the mean more - const float meanAmplitude = (sumMean / ((float)num_points)) + spectralExclusionValue; - - dataPointsPtr = realDataPoints; // Reset the dataPointsPtr - __m256 vMeanAmplitudeVector = _mm256_set1_ps(meanAmplitude); - __m256 vOnesVector = _mm256_set1_ps(1.0); - __m256 vValidBinCount = _mm256_setzero_ps(); - avgPointsVal = _mm256_setzero_ps(); - __m256 compareMask; - number = 0; - // Calculate the sum (for mean) for any points which do NOT exceed the mean amplitude - for(; number < eighthPoints; number++){ - - dataPointsVal = _mm256_loadu_ps(dataPointsPtr); - - dataPointsPtr += 8; - - // Identify which items do not exceed the mean amplitude - compareMask = _mm256_cmp_ps(dataPointsVal, vMeanAmplitudeVector, _CMP_LE_OQ); - - // Mask off the items that exceed the mean amplitude and add the avg Points that do not exceed the mean amplitude - avgPointsVal = _mm256_add_ps(avgPointsVal, _mm256_and_ps(compareMask, dataPointsVal)); - - // Count the number of bins which do not exceed the mean amplitude - vValidBinCount = _mm256_add_ps(vValidBinCount, _mm256_and_ps(compareMask, vOnesVector)); - } - - // Calculate the mean from the remaining data points - _mm256_storeu_ps(avgPointsVector, avgPointsVal); - - sumMean = 0.0; - sumMean += avgPointsVector[0]; - sumMean += avgPointsVector[1]; - sumMean += avgPointsVector[2]; - sumMean += avgPointsVector[3]; - sumMean += avgPointsVector[4]; - sumMean += avgPointsVector[5]; - sumMean += avgPointsVector[6]; - sumMean += avgPointsVector[7]; - - // Calculate the number of valid bins from the remaining count - __VOLK_ATTR_ALIGNED(16) float validBinCountVector[8]; - _mm256_storeu_ps(validBinCountVector, vValidBinCount); - - float validBinCount = 0; - validBinCount += validBinCountVector[0]; - validBinCount += 
validBinCountVector[1]; - validBinCount += validBinCountVector[2]; - validBinCount += validBinCountVector[3]; - validBinCount += validBinCountVector[4]; - validBinCount += validBinCountVector[5]; - validBinCount += validBinCountVector[6]; - validBinCount += validBinCountVector[7]; - - number = eighthPoints * 8; - for(;number < num_points; number++){ - if(realDataPoints[number] <= meanAmplitude){ - sumMean += realDataPoints[number]; - validBinCount += 1.0; + unsigned int number = 0; + const unsigned int eighthPoints = num_points / 8; + + const float* dataPointsPtr = realDataPoints; + __VOLK_ATTR_ALIGNED(16) float avgPointsVector[8]; + + __m256 dataPointsVal; + __m256 avgPointsVal = _mm256_setzero_ps(); + // Calculate the sum (for mean) for all points + for (; number < eighthPoints; number++) { + + dataPointsVal = _mm256_loadu_ps(dataPointsPtr); + + dataPointsPtr += 8; + + avgPointsVal = _mm256_add_ps(avgPointsVal, dataPointsVal); + } + + _mm256_storeu_ps(avgPointsVector, avgPointsVal); + + float sumMean = 0.0; + sumMean += avgPointsVector[0]; + sumMean += avgPointsVector[1]; + sumMean += avgPointsVector[2]; + sumMean += avgPointsVector[3]; + sumMean += avgPointsVector[4]; + sumMean += avgPointsVector[5]; + sumMean += avgPointsVector[6]; + sumMean += avgPointsVector[7]; + + number = eighthPoints * 8; + for (; number < num_points; number++) { + sumMean += realDataPoints[number]; + } + + // calculate the spectral mean + // +20 because for the comparison below we only want to throw out bins + // that are significantly higher (and would, thus, affect the mean more + const float meanAmplitude = (sumMean / ((float)num_points)) + spectralExclusionValue; + + dataPointsPtr = realDataPoints; // Reset the dataPointsPtr + __m256 vMeanAmplitudeVector = _mm256_set1_ps(meanAmplitude); + __m256 vOnesVector = _mm256_set1_ps(1.0); + __m256 vValidBinCount = _mm256_setzero_ps(); + avgPointsVal = _mm256_setzero_ps(); + __m256 compareMask; + number = 0; + // Calculate the sum (for mean) for any points which do NOT exceed the mean amplitude + for (; number < eighthPoints; number++) { + + dataPointsVal = _mm256_loadu_ps(dataPointsPtr); + + dataPointsPtr += 8; + + // Identify which items do not exceed the mean amplitude + compareMask = _mm256_cmp_ps(dataPointsVal, vMeanAmplitudeVector, _CMP_LE_OQ); + + // Mask off the items that exceed the mean amplitude and add the avg Points that + // do not exceed the mean amplitude + avgPointsVal = + _mm256_add_ps(avgPointsVal, _mm256_and_ps(compareMask, dataPointsVal)); + + // Count the number of bins which do not exceed the mean amplitude + vValidBinCount = + _mm256_add_ps(vValidBinCount, _mm256_and_ps(compareMask, vOnesVector)); + } + + // Calculate the mean from the remaining data points + _mm256_storeu_ps(avgPointsVector, avgPointsVal); + + sumMean = 0.0; + sumMean += avgPointsVector[0]; + sumMean += avgPointsVector[1]; + sumMean += avgPointsVector[2]; + sumMean += avgPointsVector[3]; + sumMean += avgPointsVector[4]; + sumMean += avgPointsVector[5]; + sumMean += avgPointsVector[6]; + sumMean += avgPointsVector[7]; + + // Calculate the number of valid bins from the remaining count + __VOLK_ATTR_ALIGNED(16) float validBinCountVector[8]; + _mm256_storeu_ps(validBinCountVector, vValidBinCount); + + float validBinCount = 0; + validBinCount += validBinCountVector[0]; + validBinCount += validBinCountVector[1]; + validBinCount += validBinCountVector[2]; + validBinCount += validBinCountVector[3]; + validBinCount += validBinCountVector[4]; + validBinCount += validBinCountVector[5]; + 
validBinCount += validBinCountVector[6]; + validBinCount += validBinCountVector[7]; + + number = eighthPoints * 8; + for (; number < num_points; number++) { + if (realDataPoints[number] <= meanAmplitude) { + sumMean += realDataPoints[number]; + validBinCount += 1.0; + } } - } - float localNoiseFloorAmplitude = 0; - if(validBinCount > 0.0){ - localNoiseFloorAmplitude = sumMean / validBinCount; - } - else{ - localNoiseFloorAmplitude = meanAmplitude; // For the odd case that all the amplitudes are equal... - } + float localNoiseFloorAmplitude = 0; + if (validBinCount > 0.0) { + localNoiseFloorAmplitude = sumMean / validBinCount; + } else { + localNoiseFloorAmplitude = + meanAmplitude; // For the odd case that all the amplitudes are equal... + } - *noiseFloorAmplitude = localNoiseFloorAmplitude; + *noiseFloorAmplitude = localNoiseFloorAmplitude; } #endif /* LV_HAVE_AVX */ #endif /* INCLUDED_volk_32f_s32f_calc_spectral_noise_floor_32f_u_H */ diff --git a/kernels/volk/volk_32f_s32f_convert_16i.h b/kernels/volk/volk_32f_s32f_convert_16i.h index 27ef4d9..c9469b7 100644 --- a/kernels/volk/volk_32f_s32f_convert_16i.h +++ b/kernels/volk/volk_32f_s32f_convert_16i.h @@ -30,8 +30,8 @@ * * Dispatcher Prototype * \code - * void volk_32f_s32f_convert_16i(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points) - * \endcode + * void volk_32f_s32f_convert_16i(int16_t* outputVector, const float* inputVector, const + * float scalar, unsigned int num_points) \endcode * * \b Inputs * \li inputVector: the input vector of floats. @@ -42,11 +42,10 @@ * \li outputVector: The output vector. * * \b Example - * Convert floats from [-1,1] to 16-bit integers with a scale of 5 to maintain smallest delta - * int N = 10; - * unsigned int alignment = volk_get_alignment(); - * float* increasing = (float*)volk_malloc(sizeof(float)*N, alignment); - * int16_t* out = (int16_t*)volk_malloc(sizeof(int16_t)*N, alignment); + * Convert floats from [-1,1] to 16-bit integers with a scale of 5 to maintain smallest + * delta int N = 10; unsigned int alignment = volk_get_alignment(); float* increasing = + * (float*)volk_malloc(sizeof(float)*N, alignment); int16_t* out = + * (int16_t*)volk_malloc(sizeof(int16_t)*N, alignment); * * for(unsigned int ii = 0; ii < N; ++ii){ * increasing[ii] = 2.f * ((float)ii / (float)N) - 1.f; @@ -76,55 +75,60 @@ #ifdef LV_HAVE_AVX2 #include -static inline void -volk_32f_s32f_convert_16i_u_avx2(int16_t* outputVector, const float* inputVector, - const float scalar, unsigned int num_points) +static inline void volk_32f_s32f_convert_16i_u_avx2(int16_t* outputVector, + const float* inputVector, + const float scalar, + unsigned int num_points) { - unsigned int number = 0; - - const unsigned int sixteenthPoints = num_points / 16; - - const float* inputVectorPtr = (const float*)inputVector; - int16_t* outputVectorPtr = outputVector; - - float min_val = SHRT_MIN; - float max_val = SHRT_MAX; - float r; - - __m256 vScalar = _mm256_set1_ps(scalar); - __m256 inputVal1, inputVal2; - __m256i intInputVal1, intInputVal2; - __m256 ret1, ret2; - __m256 vmin_val = _mm256_set1_ps(min_val); - __m256 vmax_val = _mm256_set1_ps(max_val); - - for(;number < sixteenthPoints; number++){ - inputVal1 = _mm256_loadu_ps(inputVectorPtr); inputVectorPtr += 8; - inputVal2 = _mm256_loadu_ps(inputVectorPtr); inputVectorPtr += 8; - - // Scale and clip - ret1 = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal1, vScalar), vmax_val), vmin_val); - ret2 = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal2, 
vScalar), vmax_val), vmin_val); - - intInputVal1 = _mm256_cvtps_epi32(ret1); - intInputVal2 = _mm256_cvtps_epi32(ret2); - - intInputVal1 = _mm256_packs_epi32(intInputVal1, intInputVal2); - intInputVal1 = _mm256_permute4x64_epi64(intInputVal1, 0b11011000); - - _mm256_storeu_si256((__m256i*)outputVectorPtr, intInputVal1); - outputVectorPtr += 16; - } - - number = sixteenthPoints * 16; - for(; number < num_points; number++){ - r = inputVector[number] * scalar; - if(r > max_val) - r = max_val; - else if(r < min_val) - r = min_val; - outputVector[number] = (int16_t)rintf(r); - } + unsigned int number = 0; + + const unsigned int sixteenthPoints = num_points / 16; + + const float* inputVectorPtr = (const float*)inputVector; + int16_t* outputVectorPtr = outputVector; + + float min_val = SHRT_MIN; + float max_val = SHRT_MAX; + float r; + + __m256 vScalar = _mm256_set1_ps(scalar); + __m256 inputVal1, inputVal2; + __m256i intInputVal1, intInputVal2; + __m256 ret1, ret2; + __m256 vmin_val = _mm256_set1_ps(min_val); + __m256 vmax_val = _mm256_set1_ps(max_val); + + for (; number < sixteenthPoints; number++) { + inputVal1 = _mm256_loadu_ps(inputVectorPtr); + inputVectorPtr += 8; + inputVal2 = _mm256_loadu_ps(inputVectorPtr); + inputVectorPtr += 8; + + // Scale and clip + ret1 = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal1, vScalar), vmax_val), + vmin_val); + ret2 = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal2, vScalar), vmax_val), + vmin_val); + + intInputVal1 = _mm256_cvtps_epi32(ret1); + intInputVal2 = _mm256_cvtps_epi32(ret2); + + intInputVal1 = _mm256_packs_epi32(intInputVal1, intInputVal2); + intInputVal1 = _mm256_permute4x64_epi64(intInputVal1, 0b11011000); + + _mm256_storeu_si256((__m256i*)outputVectorPtr, intInputVal1); + outputVectorPtr += 16; + } + + number = sixteenthPoints * 16; + for (; number < num_points; number++) { + r = inputVector[number] * scalar; + if (r > max_val) + r = max_val; + else if (r < min_val) + r = min_val; + outputVector[number] = (int16_t)rintf(r); + } } #endif /* LV_HAVE_AVX2 */ @@ -132,54 +136,57 @@ volk_32f_s32f_convert_16i_u_avx2(int16_t* outputVector, const float* inputVector #ifdef LV_HAVE_AVX #include -static inline void -volk_32f_s32f_convert_16i_u_avx(int16_t* outputVector, const float* inputVector, - const float scalar, unsigned int num_points) +static inline void volk_32f_s32f_convert_16i_u_avx(int16_t* outputVector, + const float* inputVector, + const float scalar, + unsigned int num_points) { - unsigned int number = 0; + unsigned int number = 0; - const unsigned int eighthPoints = num_points / 8; + const unsigned int eighthPoints = num_points / 8; - const float* inputVectorPtr = (const float*)inputVector; - int16_t* outputVectorPtr = outputVector; + const float* inputVectorPtr = (const float*)inputVector; + int16_t* outputVectorPtr = outputVector; - float min_val = SHRT_MIN; - float max_val = SHRT_MAX; - float r; + float min_val = SHRT_MIN; + float max_val = SHRT_MAX; + float r; - __m256 vScalar = _mm256_set1_ps(scalar); - __m256 inputVal, ret; - __m256i intInputVal; - __m128i intInputVal1, intInputVal2; - __m256 vmin_val = _mm256_set1_ps(min_val); - __m256 vmax_val = _mm256_set1_ps(max_val); + __m256 vScalar = _mm256_set1_ps(scalar); + __m256 inputVal, ret; + __m256i intInputVal; + __m128i intInputVal1, intInputVal2; + __m256 vmin_val = _mm256_set1_ps(min_val); + __m256 vmax_val = _mm256_set1_ps(max_val); - for(;number < eighthPoints; number++){ - inputVal = _mm256_loadu_ps(inputVectorPtr); inputVectorPtr += 8; + for (; number < eighthPoints; 
number++) { + inputVal = _mm256_loadu_ps(inputVectorPtr); + inputVectorPtr += 8; - // Scale and clip - ret = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal, vScalar), vmax_val), vmin_val); + // Scale and clip + ret = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal, vScalar), vmax_val), + vmin_val); - intInputVal = _mm256_cvtps_epi32(ret); + intInputVal = _mm256_cvtps_epi32(ret); - intInputVal1 = _mm256_extractf128_si256(intInputVal, 0); - intInputVal2 = _mm256_extractf128_si256(intInputVal, 1); + intInputVal1 = _mm256_extractf128_si256(intInputVal, 0); + intInputVal2 = _mm256_extractf128_si256(intInputVal, 1); - intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); + intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); - _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1); - outputVectorPtr += 8; - } + _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1); + outputVectorPtr += 8; + } - number = eighthPoints * 8; - for(; number < num_points; number++){ - r = inputVector[number] * scalar; - if(r > max_val) - r = max_val; - else if(r < min_val) - r = min_val; - outputVector[number] = (int16_t)rintf(r); - } + number = eighthPoints * 8; + for (; number < num_points; number++) { + r = inputVector[number] * scalar; + if (r > max_val) + r = max_val; + else if (r < min_val) + r = min_val; + outputVector[number] = (int16_t)rintf(r); + } } #endif /* LV_HAVE_AVX */ @@ -187,54 +194,57 @@ volk_32f_s32f_convert_16i_u_avx(int16_t* outputVector, const float* inputVector, #ifdef LV_HAVE_SSE2 #include -static inline void -volk_32f_s32f_convert_16i_u_sse2(int16_t* outputVector, const float* inputVector, - const float scalar, unsigned int num_points) +static inline void volk_32f_s32f_convert_16i_u_sse2(int16_t* outputVector, + const float* inputVector, + const float scalar, + unsigned int num_points) { - unsigned int number = 0; - - const unsigned int eighthPoints = num_points / 8; - - const float* inputVectorPtr = (const float*)inputVector; - int16_t* outputVectorPtr = outputVector; - - float min_val = SHRT_MIN; - float max_val = SHRT_MAX; - float r; - - __m128 vScalar = _mm_set_ps1(scalar); - __m128 inputVal1, inputVal2; - __m128i intInputVal1, intInputVal2; - __m128 ret1, ret2; - __m128 vmin_val = _mm_set_ps1(min_val); - __m128 vmax_val = _mm_set_ps1(max_val); - - for(;number < eighthPoints; number++){ - inputVal1 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4; - inputVal2 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4; - - // Scale and clip - ret1 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val); - ret2 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal2, vScalar), vmax_val), vmin_val); - - intInputVal1 = _mm_cvtps_epi32(ret1); - intInputVal2 = _mm_cvtps_epi32(ret2); - - intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); - - _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1); - outputVectorPtr += 8; - } - - number = eighthPoints * 8; - for(; number < num_points; number++){ - r = inputVector[number] * scalar; - if(r > max_val) - r = max_val; - else if(r < min_val) - r = min_val; - outputVector[number] = (int16_t)rintf(r); - } + unsigned int number = 0; + + const unsigned int eighthPoints = num_points / 8; + + const float* inputVectorPtr = (const float*)inputVector; + int16_t* outputVectorPtr = outputVector; + + float min_val = SHRT_MIN; + float max_val = SHRT_MAX; + float r; + + __m128 vScalar = _mm_set_ps1(scalar); + __m128 inputVal1, inputVal2; + __m128i intInputVal1, intInputVal2; + __m128 ret1, ret2; + __m128 vmin_val = 
_mm_set_ps1(min_val); + __m128 vmax_val = _mm_set_ps1(max_val); + + for (; number < eighthPoints; number++) { + inputVal1 = _mm_loadu_ps(inputVectorPtr); + inputVectorPtr += 4; + inputVal2 = _mm_loadu_ps(inputVectorPtr); + inputVectorPtr += 4; + + // Scale and clip + ret1 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val); + ret2 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal2, vScalar), vmax_val), vmin_val); + + intInputVal1 = _mm_cvtps_epi32(ret1); + intInputVal2 = _mm_cvtps_epi32(ret2); + + intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); + + _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1); + outputVectorPtr += 8; + } + + number = eighthPoints * 8; + for (; number < num_points; number++) { + r = inputVector[number] * scalar; + if (r > max_val) + r = max_val; + else if (r < min_val) + r = min_val; + outputVector[number] = (int16_t)rintf(r); + } } #endif /* LV_HAVE_SSE2 */ @@ -242,76 +252,78 @@ volk_32f_s32f_convert_16i_u_sse2(int16_t* outputVector, const float* inputVector #ifdef LV_HAVE_SSE #include -static inline void -volk_32f_s32f_convert_16i_u_sse(int16_t* outputVector, const float* inputVector, - const float scalar, unsigned int num_points) +static inline void volk_32f_s32f_convert_16i_u_sse(int16_t* outputVector, + const float* inputVector, + const float scalar, + unsigned int num_points) { - unsigned int number = 0; - - const unsigned int quarterPoints = num_points / 4; - - const float* inputVectorPtr = (const float*)inputVector; - int16_t* outputVectorPtr = outputVector; - - float min_val = SHRT_MIN; - float max_val = SHRT_MAX; - float r; - - __m128 vScalar = _mm_set_ps1(scalar); - __m128 ret; - __m128 vmin_val = _mm_set_ps1(min_val); - __m128 vmax_val = _mm_set_ps1(max_val); - - __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4]; - - for(;number < quarterPoints; number++){ - ret = _mm_loadu_ps(inputVectorPtr); - inputVectorPtr += 4; - - // Scale and clip - ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val); - - _mm_store_ps(outputFloatBuffer, ret); - *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[0]); - *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[1]); - *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[2]); - *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[3]); - } - - number = quarterPoints * 4; - for(; number < num_points; number++){ - r = inputVector[number] * scalar; - if(r > max_val) - r = max_val; - else if(r < min_val) - r = min_val; - outputVector[number] = (int16_t)rintf(r); - } + unsigned int number = 0; + + const unsigned int quarterPoints = num_points / 4; + + const float* inputVectorPtr = (const float*)inputVector; + int16_t* outputVectorPtr = outputVector; + + float min_val = SHRT_MIN; + float max_val = SHRT_MAX; + float r; + + __m128 vScalar = _mm_set_ps1(scalar); + __m128 ret; + __m128 vmin_val = _mm_set_ps1(min_val); + __m128 vmax_val = _mm_set_ps1(max_val); + + __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4]; + + for (; number < quarterPoints; number++) { + ret = _mm_loadu_ps(inputVectorPtr); + inputVectorPtr += 4; + + // Scale and clip + ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val); + + _mm_store_ps(outputFloatBuffer, ret); + *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[0]); + *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[1]); + *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[2]); + *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[3]); + } + + number = quarterPoints * 4; + for (; number < num_points; 
number++) { + r = inputVector[number] * scalar; + if (r > max_val) + r = max_val; + else if (r < min_val) + r = min_val; + outputVector[number] = (int16_t)rintf(r); + } } #endif /* LV_HAVE_SSE */ #ifdef LV_HAVE_GENERIC -static inline void -volk_32f_s32f_convert_16i_generic(int16_t* outputVector, const float* inputVector, - const float scalar, unsigned int num_points) +static inline void volk_32f_s32f_convert_16i_generic(int16_t* outputVector, + const float* inputVector, + const float scalar, + unsigned int num_points) { - int16_t* outputVectorPtr = outputVector; - const float* inputVectorPtr = inputVector; - unsigned int number = 0; - float min_val = SHRT_MIN; - float max_val = SHRT_MAX; - float r; - - for(number = 0; number < num_points; number++){ - r = *inputVectorPtr++ * scalar; - if(r > max_val) - r = max_val; - else if(r < min_val) - r = min_val; - *outputVectorPtr++ = (int16_t)rintf(r); - } + int16_t* outputVectorPtr = outputVector; + const float* inputVectorPtr = inputVector; + unsigned int number = 0; + float min_val = SHRT_MIN; + float max_val = SHRT_MAX; + float r; + + for (number = 0; number < num_points; number++) { + r = *inputVectorPtr++ * scalar; + if (r > max_val) + r = max_val; + else if (r < min_val) + r = min_val; + *outputVectorPtr++ = (int16_t)rintf(r); + } } #endif /* LV_HAVE_GENERIC */ @@ -320,63 +332,68 @@ volk_32f_s32f_convert_16i_generic(int16_t* outputVector, const float* inputVecto #ifndef INCLUDED_volk_32f_s32f_convert_16i_a_H #define INCLUDED_volk_32f_s32f_convert_16i_a_H -#include #include -#include #include +#include +#include #ifdef LV_HAVE_AVX2 #include -static inline void -volk_32f_s32f_convert_16i_a_avx2(int16_t* outputVector, const float* inputVector, - const float scalar, unsigned int num_points) +static inline void volk_32f_s32f_convert_16i_a_avx2(int16_t* outputVector, + const float* inputVector, + const float scalar, + unsigned int num_points) { - unsigned int number = 0; - - const unsigned int sixteenthPoints = num_points / 16; - - const float* inputVectorPtr = (const float*)inputVector; - int16_t* outputVectorPtr = outputVector; - - float min_val = SHRT_MIN; - float max_val = SHRT_MAX; - float r; - - __m256 vScalar = _mm256_set1_ps(scalar); - __m256 inputVal1, inputVal2; - __m256i intInputVal1, intInputVal2; - __m256 ret1, ret2; - __m256 vmin_val = _mm256_set1_ps(min_val); - __m256 vmax_val = _mm256_set1_ps(max_val); - - for(;number < sixteenthPoints; number++){ - inputVal1 = _mm256_load_ps(inputVectorPtr); inputVectorPtr += 8; - inputVal2 = _mm256_load_ps(inputVectorPtr); inputVectorPtr += 8; - - // Scale and clip - ret1 = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal1, vScalar), vmax_val), vmin_val); - ret2 = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal2, vScalar), vmax_val), vmin_val); - - intInputVal1 = _mm256_cvtps_epi32(ret1); - intInputVal2 = _mm256_cvtps_epi32(ret2); - - intInputVal1 = _mm256_packs_epi32(intInputVal1, intInputVal2); - intInputVal1 = _mm256_permute4x64_epi64(intInputVal1, 0b11011000); - - _mm256_store_si256((__m256i*)outputVectorPtr, intInputVal1); - outputVectorPtr += 16; - } - - number = sixteenthPoints * 16; - for(; number < num_points; number++){ - r = inputVector[number] * scalar; - if(r > max_val) - r = max_val; - else if(r < min_val) - r = min_val; - outputVector[number] = (int16_t)rintf(r); - } + unsigned int number = 0; + + const unsigned int sixteenthPoints = num_points / 16; + + const float* inputVectorPtr = (const float*)inputVector; + int16_t* outputVectorPtr = outputVector; + + float min_val = 
SHRT_MIN; + float max_val = SHRT_MAX; + float r; + + __m256 vScalar = _mm256_set1_ps(scalar); + __m256 inputVal1, inputVal2; + __m256i intInputVal1, intInputVal2; + __m256 ret1, ret2; + __m256 vmin_val = _mm256_set1_ps(min_val); + __m256 vmax_val = _mm256_set1_ps(max_val); + + for (; number < sixteenthPoints; number++) { + inputVal1 = _mm256_load_ps(inputVectorPtr); + inputVectorPtr += 8; + inputVal2 = _mm256_load_ps(inputVectorPtr); + inputVectorPtr += 8; + + // Scale and clip + ret1 = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal1, vScalar), vmax_val), + vmin_val); + ret2 = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal2, vScalar), vmax_val), + vmin_val); + + intInputVal1 = _mm256_cvtps_epi32(ret1); + intInputVal2 = _mm256_cvtps_epi32(ret2); + + intInputVal1 = _mm256_packs_epi32(intInputVal1, intInputVal2); + intInputVal1 = _mm256_permute4x64_epi64(intInputVal1, 0b11011000); + + _mm256_store_si256((__m256i*)outputVectorPtr, intInputVal1); + outputVectorPtr += 16; + } + + number = sixteenthPoints * 16; + for (; number < num_points; number++) { + r = inputVector[number] * scalar; + if (r > max_val) + r = max_val; + else if (r < min_val) + r = min_val; + outputVector[number] = (int16_t)rintf(r); + } } #endif /* LV_HAVE_AVX2 */ @@ -384,108 +401,114 @@ volk_32f_s32f_convert_16i_a_avx2(int16_t* outputVector, const float* inputVector #ifdef LV_HAVE_AVX #include -static inline void -volk_32f_s32f_convert_16i_a_avx(int16_t* outputVector, const float* inputVector, - const float scalar, unsigned int num_points) +static inline void volk_32f_s32f_convert_16i_a_avx(int16_t* outputVector, + const float* inputVector, + const float scalar, + unsigned int num_points) { - unsigned int number = 0; + unsigned int number = 0; - const unsigned int eighthPoints = num_points / 8; + const unsigned int eighthPoints = num_points / 8; - const float* inputVectorPtr = (const float*)inputVector; - int16_t* outputVectorPtr = outputVector; + const float* inputVectorPtr = (const float*)inputVector; + int16_t* outputVectorPtr = outputVector; - float min_val = SHRT_MIN; - float max_val = SHRT_MAX; - float r; + float min_val = SHRT_MIN; + float max_val = SHRT_MAX; + float r; - __m256 vScalar = _mm256_set1_ps(scalar); - __m256 inputVal, ret; - __m256i intInputVal; - __m128i intInputVal1, intInputVal2; - __m256 vmin_val = _mm256_set1_ps(min_val); - __m256 vmax_val = _mm256_set1_ps(max_val); + __m256 vScalar = _mm256_set1_ps(scalar); + __m256 inputVal, ret; + __m256i intInputVal; + __m128i intInputVal1, intInputVal2; + __m256 vmin_val = _mm256_set1_ps(min_val); + __m256 vmax_val = _mm256_set1_ps(max_val); - for(;number < eighthPoints; number++){ - inputVal = _mm256_load_ps(inputVectorPtr); inputVectorPtr += 8; + for (; number < eighthPoints; number++) { + inputVal = _mm256_load_ps(inputVectorPtr); + inputVectorPtr += 8; - // Scale and clip - ret = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal, vScalar), vmax_val), vmin_val); + // Scale and clip + ret = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal, vScalar), vmax_val), + vmin_val); - intInputVal = _mm256_cvtps_epi32(ret); + intInputVal = _mm256_cvtps_epi32(ret); - intInputVal1 = _mm256_extractf128_si256(intInputVal, 0); - intInputVal2 = _mm256_extractf128_si256(intInputVal, 1); + intInputVal1 = _mm256_extractf128_si256(intInputVal, 0); + intInputVal2 = _mm256_extractf128_si256(intInputVal, 1); - intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); + intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); - _mm_store_si128((__m128i*)outputVectorPtr, 
intInputVal1); - outputVectorPtr += 8; - } + _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1); + outputVectorPtr += 8; + } - number = eighthPoints * 8; - for(; number < num_points; number++){ - r = inputVector[number] * scalar; - if(r > max_val) - r = max_val; - else if(r < min_val) - r = min_val; - outputVector[number] = (int16_t)rintf(r); - } + number = eighthPoints * 8; + for (; number < num_points; number++) { + r = inputVector[number] * scalar; + if (r > max_val) + r = max_val; + else if (r < min_val) + r = min_val; + outputVector[number] = (int16_t)rintf(r); + } } #endif /* LV_HAVE_AVX */ #ifdef LV_HAVE_SSE2 #include -static inline void -volk_32f_s32f_convert_16i_a_sse2(int16_t* outputVector, const float* inputVector, - const float scalar, unsigned int num_points) +static inline void volk_32f_s32f_convert_16i_a_sse2(int16_t* outputVector, + const float* inputVector, + const float scalar, + unsigned int num_points) { - unsigned int number = 0; - - const unsigned int eighthPoints = num_points / 8; - - const float* inputVectorPtr = (const float*)inputVector; - int16_t* outputVectorPtr = outputVector; - - float min_val = SHRT_MIN; - float max_val = SHRT_MAX; - float r; - - __m128 vScalar = _mm_set_ps1(scalar); - __m128 inputVal1, inputVal2; - __m128i intInputVal1, intInputVal2; - __m128 ret1, ret2; - __m128 vmin_val = _mm_set_ps1(min_val); - __m128 vmax_val = _mm_set_ps1(max_val); - - for(;number < eighthPoints; number++){ - inputVal1 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; - inputVal2 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; - - // Scale and clip - ret1 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val); - ret2 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal2, vScalar), vmax_val), vmin_val); - - intInputVal1 = _mm_cvtps_epi32(ret1); - intInputVal2 = _mm_cvtps_epi32(ret2); - - intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); - - _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1); - outputVectorPtr += 8; - } - - number = eighthPoints * 8; - for(; number < num_points; number++){ - r = inputVector[number] * scalar; - if(r > max_val) - r = max_val; - else if(r < min_val) - r = min_val; - outputVector[number] = (int16_t)rintf(r); - } + unsigned int number = 0; + + const unsigned int eighthPoints = num_points / 8; + + const float* inputVectorPtr = (const float*)inputVector; + int16_t* outputVectorPtr = outputVector; + + float min_val = SHRT_MIN; + float max_val = SHRT_MAX; + float r; + + __m128 vScalar = _mm_set_ps1(scalar); + __m128 inputVal1, inputVal2; + __m128i intInputVal1, intInputVal2; + __m128 ret1, ret2; + __m128 vmin_val = _mm_set_ps1(min_val); + __m128 vmax_val = _mm_set_ps1(max_val); + + for (; number < eighthPoints; number++) { + inputVal1 = _mm_load_ps(inputVectorPtr); + inputVectorPtr += 4; + inputVal2 = _mm_load_ps(inputVectorPtr); + inputVectorPtr += 4; + + // Scale and clip + ret1 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val); + ret2 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal2, vScalar), vmax_val), vmin_val); + + intInputVal1 = _mm_cvtps_epi32(ret1); + intInputVal2 = _mm_cvtps_epi32(ret2); + + intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); + + _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1); + outputVectorPtr += 8; + } + + number = eighthPoints * 8; + for (; number < num_points; number++) { + r = inputVector[number] * scalar; + if (r > max_val) + r = max_val; + else if (r < min_val) + r = min_val; + outputVector[number] = (int16_t)rintf(r); + } } #endif 
/* LV_HAVE_SSE2 */ @@ -493,76 +516,78 @@ volk_32f_s32f_convert_16i_a_sse2(int16_t* outputVector, const float* inputVector #ifdef LV_HAVE_SSE #include -static inline void -volk_32f_s32f_convert_16i_a_sse(int16_t* outputVector, const float* inputVector, - const float scalar, unsigned int num_points) +static inline void volk_32f_s32f_convert_16i_a_sse(int16_t* outputVector, + const float* inputVector, + const float scalar, + unsigned int num_points) { - unsigned int number = 0; - - const unsigned int quarterPoints = num_points / 4; - - const float* inputVectorPtr = (const float*)inputVector; - int16_t* outputVectorPtr = outputVector; - - float min_val = SHRT_MIN; - float max_val = SHRT_MAX; - float r; - - __m128 vScalar = _mm_set_ps1(scalar); - __m128 ret; - __m128 vmin_val = _mm_set_ps1(min_val); - __m128 vmax_val = _mm_set_ps1(max_val); - - __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4]; - - for(;number < quarterPoints; number++){ - ret = _mm_load_ps(inputVectorPtr); - inputVectorPtr += 4; - - // Scale and clip - ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val); - - _mm_store_ps(outputFloatBuffer, ret); - *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[0]); - *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[1]); - *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[2]); - *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[3]); - } - - number = quarterPoints * 4; - for(; number < num_points; number++){ - r = inputVector[number] * scalar; - if(r > max_val) - r = max_val; - else if(r < min_val) - r = min_val; - outputVector[number] = (int16_t)rintf(r); - } + unsigned int number = 0; + + const unsigned int quarterPoints = num_points / 4; + + const float* inputVectorPtr = (const float*)inputVector; + int16_t* outputVectorPtr = outputVector; + + float min_val = SHRT_MIN; + float max_val = SHRT_MAX; + float r; + + __m128 vScalar = _mm_set_ps1(scalar); + __m128 ret; + __m128 vmin_val = _mm_set_ps1(min_val); + __m128 vmax_val = _mm_set_ps1(max_val); + + __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4]; + + for (; number < quarterPoints; number++) { + ret = _mm_load_ps(inputVectorPtr); + inputVectorPtr += 4; + + // Scale and clip + ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val); + + _mm_store_ps(outputFloatBuffer, ret); + *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[0]); + *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[1]); + *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[2]); + *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[3]); + } + + number = quarterPoints * 4; + for (; number < num_points; number++) { + r = inputVector[number] * scalar; + if (r > max_val) + r = max_val; + else if (r < min_val) + r = min_val; + outputVector[number] = (int16_t)rintf(r); + } } #endif /* LV_HAVE_SSE */ #ifdef LV_HAVE_GENERIC -static inline void -volk_32f_s32f_convert_16i_a_generic(int16_t* outputVector, const float* inputVector, - const float scalar, unsigned int num_points) +static inline void volk_32f_s32f_convert_16i_a_generic(int16_t* outputVector, + const float* inputVector, + const float scalar, + unsigned int num_points) { - int16_t* outputVectorPtr = outputVector; - const float* inputVectorPtr = inputVector; - unsigned int number = 0; - float min_val = SHRT_MIN; - float max_val = SHRT_MAX; - float r; - - for(number = 0; number < num_points; number++){ - r = *inputVectorPtr++ * scalar; - if(r < min_val) - r = min_val; - else if(r > max_val) - r = max_val; - *outputVectorPtr++ = (int16_t)rintf(r); - } + 
int16_t* outputVectorPtr = outputVector; + const float* inputVectorPtr = inputVector; + unsigned int number = 0; + float min_val = SHRT_MIN; + float max_val = SHRT_MAX; + float r; + + for (number = 0; number < num_points; number++) { + r = *inputVectorPtr++ * scalar; + if (r < min_val) + r = min_val; + else if (r > max_val) + r = max_val; + *outputVectorPtr++ = (int16_t)rintf(r); + } } #endif /* LV_HAVE_GENERIC */ diff --git a/kernels/volk/volk_32f_s32f_convert_32i.h b/kernels/volk/volk_32f_s32f_convert_32i.h index d2a65a0..d5f7cd4 100644 --- a/kernels/volk/volk_32f_s32f_convert_32i.h +++ b/kernels/volk/volk_32f_s32f_convert_32i.h @@ -30,8 +30,8 @@ * * Dispatcher Prototype * \code - * void volk_32f_s32f_convert_32i(int32_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points) - * \endcode + * void volk_32f_s32f_convert_32i(int32_t* outputVector, const float* inputVector, const + * float scalar, unsigned int num_points) \endcode * * \b Inputs * \li inputVector: the input vector of floats. @@ -77,46 +77,49 @@ #ifdef LV_HAVE_AVX #include -static inline void -volk_32f_s32f_convert_32i_u_avx(int32_t* outputVector, const float* inputVector, - const float scalar, unsigned int num_points) +static inline void volk_32f_s32f_convert_32i_u_avx(int32_t* outputVector, + const float* inputVector, + const float scalar, + unsigned int num_points) { - unsigned int number = 0; - - const unsigned int eighthPoints = num_points / 8; - - const float* inputVectorPtr = (const float*)inputVector; - int32_t* outputVectorPtr = outputVector; - - float min_val = INT_MIN; - float max_val = INT_MAX; - float r; - - __m256 vScalar = _mm256_set1_ps(scalar); - __m256 inputVal1; - __m256i intInputVal1; - __m256 vmin_val = _mm256_set1_ps(min_val); - __m256 vmax_val = _mm256_set1_ps(max_val); - - for(;number < eighthPoints; number++){ - inputVal1 = _mm256_loadu_ps(inputVectorPtr); inputVectorPtr += 8; - - inputVal1 = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal1, vScalar), vmax_val), vmin_val); - intInputVal1 = _mm256_cvtps_epi32(inputVal1); - - _mm256_storeu_si256((__m256i*)outputVectorPtr, intInputVal1); - outputVectorPtr += 8; - } - - number = eighthPoints * 8; - for(; number < num_points; number++){ - r = inputVector[number] * scalar; - if(r > max_val) - r = max_val; - else if(r < min_val) - r = min_val; - outputVector[number] = (int32_t)rintf(r); - } + unsigned int number = 0; + + const unsigned int eighthPoints = num_points / 8; + + const float* inputVectorPtr = (const float*)inputVector; + int32_t* outputVectorPtr = outputVector; + + float min_val = INT_MIN; + float max_val = INT_MAX; + float r; + + __m256 vScalar = _mm256_set1_ps(scalar); + __m256 inputVal1; + __m256i intInputVal1; + __m256 vmin_val = _mm256_set1_ps(min_val); + __m256 vmax_val = _mm256_set1_ps(max_val); + + for (; number < eighthPoints; number++) { + inputVal1 = _mm256_loadu_ps(inputVectorPtr); + inputVectorPtr += 8; + + inputVal1 = _mm256_max_ps( + _mm256_min_ps(_mm256_mul_ps(inputVal1, vScalar), vmax_val), vmin_val); + intInputVal1 = _mm256_cvtps_epi32(inputVal1); + + _mm256_storeu_si256((__m256i*)outputVectorPtr, intInputVal1); + outputVectorPtr += 8; + } + + number = eighthPoints * 8; + for (; number < num_points; number++) { + r = inputVector[number] * scalar; + if (r > max_val) + r = max_val; + else if (r < min_val) + r = min_val; + outputVector[number] = (int32_t)rintf(r); + } } #endif /* LV_HAVE_AVX */ @@ -124,46 +127,49 @@ volk_32f_s32f_convert_32i_u_avx(int32_t* outputVector, const float* inputVector, 
#ifdef LV_HAVE_SSE2 #include -static inline void -volk_32f_s32f_convert_32i_u_sse2(int32_t* outputVector, const float* inputVector, - const float scalar, unsigned int num_points) +static inline void volk_32f_s32f_convert_32i_u_sse2(int32_t* outputVector, + const float* inputVector, + const float scalar, + unsigned int num_points) { - unsigned int number = 0; - - const unsigned int quarterPoints = num_points / 4; - - const float* inputVectorPtr = (const float*)inputVector; - int32_t* outputVectorPtr = outputVector; - - float min_val = INT_MIN; - float max_val = INT_MAX; - float r; - - __m128 vScalar = _mm_set_ps1(scalar); - __m128 inputVal1; - __m128i intInputVal1; - __m128 vmin_val = _mm_set_ps1(min_val); - __m128 vmax_val = _mm_set_ps1(max_val); - - for(;number < quarterPoints; number++){ - inputVal1 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4; - - inputVal1 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val); - intInputVal1 = _mm_cvtps_epi32(inputVal1); - - _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1); - outputVectorPtr += 4; - } - - number = quarterPoints * 4; - for(; number < num_points; number++){ - r = inputVector[number] * scalar; - if(r > max_val) - r = max_val; - else if(r < min_val) - r = min_val; - outputVector[number] = (int32_t)rintf(r); - } + unsigned int number = 0; + + const unsigned int quarterPoints = num_points / 4; + + const float* inputVectorPtr = (const float*)inputVector; + int32_t* outputVectorPtr = outputVector; + + float min_val = INT_MIN; + float max_val = INT_MAX; + float r; + + __m128 vScalar = _mm_set_ps1(scalar); + __m128 inputVal1; + __m128i intInputVal1; + __m128 vmin_val = _mm_set_ps1(min_val); + __m128 vmax_val = _mm_set_ps1(max_val); + + for (; number < quarterPoints; number++) { + inputVal1 = _mm_loadu_ps(inputVectorPtr); + inputVectorPtr += 4; + + inputVal1 = + _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val); + intInputVal1 = _mm_cvtps_epi32(inputVal1); + + _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1); + outputVectorPtr += 4; + } + + number = quarterPoints * 4; + for (; number < num_points; number++) { + r = inputVector[number] * scalar; + if (r > max_val) + r = max_val; + else if (r < min_val) + r = min_val; + outputVector[number] = (int32_t)rintf(r); + } } #endif /* LV_HAVE_SSE2 */ @@ -172,50 +178,51 @@ volk_32f_s32f_convert_32i_u_sse2(int32_t* outputVector, const float* inputVector #ifdef LV_HAVE_SSE #include -static inline void -volk_32f_s32f_convert_32i_u_sse(int32_t* outputVector, const float* inputVector, - const float scalar, unsigned int num_points) +static inline void volk_32f_s32f_convert_32i_u_sse(int32_t* outputVector, + const float* inputVector, + const float scalar, + unsigned int num_points) { - unsigned int number = 0; - - const unsigned int quarterPoints = num_points / 4; - - const float* inputVectorPtr = (const float*)inputVector; - int32_t* outputVectorPtr = outputVector; - - float min_val = INT_MIN; - float max_val = INT_MAX; - float r; - - __m128 vScalar = _mm_set_ps1(scalar); - __m128 ret; - __m128 vmin_val = _mm_set_ps1(min_val); - __m128 vmax_val = _mm_set_ps1(max_val); - - __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4]; - - for(;number < quarterPoints; number++){ - ret = _mm_loadu_ps(inputVectorPtr); - inputVectorPtr += 4; - - ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val); - - _mm_store_ps(outputFloatBuffer, ret); - *outputVectorPtr++ = (int32_t)rintf(outputFloatBuffer[0]); - *outputVectorPtr++ = 
(int32_t)rintf(outputFloatBuffer[1]); - *outputVectorPtr++ = (int32_t)rintf(outputFloatBuffer[2]); - *outputVectorPtr++ = (int32_t)rintf(outputFloatBuffer[3]); - } - - number = quarterPoints * 4; - for(; number < num_points; number++){ - r = inputVector[number] * scalar; - if(r > max_val) - r = max_val; - else if(r < min_val) - r = min_val; - outputVector[number] = (int32_t)rintf(r); - } + unsigned int number = 0; + + const unsigned int quarterPoints = num_points / 4; + + const float* inputVectorPtr = (const float*)inputVector; + int32_t* outputVectorPtr = outputVector; + + float min_val = INT_MIN; + float max_val = INT_MAX; + float r; + + __m128 vScalar = _mm_set_ps1(scalar); + __m128 ret; + __m128 vmin_val = _mm_set_ps1(min_val); + __m128 vmax_val = _mm_set_ps1(max_val); + + __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4]; + + for (; number < quarterPoints; number++) { + ret = _mm_loadu_ps(inputVectorPtr); + inputVectorPtr += 4; + + ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val); + + _mm_store_ps(outputFloatBuffer, ret); + *outputVectorPtr++ = (int32_t)rintf(outputFloatBuffer[0]); + *outputVectorPtr++ = (int32_t)rintf(outputFloatBuffer[1]); + *outputVectorPtr++ = (int32_t)rintf(outputFloatBuffer[2]); + *outputVectorPtr++ = (int32_t)rintf(outputFloatBuffer[3]); + } + + number = quarterPoints * 4; + for (; number < num_points; number++) { + r = inputVector[number] * scalar; + if (r > max_val) + r = max_val; + else if (r < min_val) + r = min_val; + outputVector[number] = (int32_t)rintf(r); + } } #endif /* LV_HAVE_SSE */ @@ -223,82 +230,85 @@ volk_32f_s32f_convert_32i_u_sse(int32_t* outputVector, const float* inputVector, #ifdef LV_HAVE_GENERIC -static inline void -volk_32f_s32f_convert_32i_generic(int32_t* outputVector, const float* inputVector, - const float scalar, unsigned int num_points) +static inline void volk_32f_s32f_convert_32i_generic(int32_t* outputVector, + const float* inputVector, + const float scalar, + unsigned int num_points) { - int32_t* outputVectorPtr = outputVector; - const float* inputVectorPtr = inputVector; - unsigned int number = 0; - float min_val = INT_MIN; - float max_val = INT_MAX; - float r; - - for(number = 0; number < num_points; number++){ - r = *inputVectorPtr++ * scalar; - if(r > max_val) - r = max_val; - else if(r < min_val) - r = min_val; - *outputVectorPtr++ = (int32_t)rintf(r); - } + int32_t* outputVectorPtr = outputVector; + const float* inputVectorPtr = inputVector; + unsigned int number = 0; + float min_val = INT_MIN; + float max_val = INT_MAX; + float r; + + for (number = 0; number < num_points; number++) { + r = *inputVectorPtr++ * scalar; + if (r > max_val) + r = max_val; + else if (r < min_val) + r = min_val; + *outputVectorPtr++ = (int32_t)rintf(r); + } } #endif /* LV_HAVE_GENERIC */ - #endif /* INCLUDED_volk_32f_s32f_convert_32i_u_H */ #ifndef INCLUDED_volk_32f_s32f_convert_32i_a_H #define INCLUDED_volk_32f_s32f_convert_32i_a_H -#include #include #include +#include #ifdef LV_HAVE_AVX #include -static inline void -volk_32f_s32f_convert_32i_a_avx(int32_t* outputVector, const float* inputVector, - const float scalar, unsigned int num_points) +static inline void volk_32f_s32f_convert_32i_a_avx(int32_t* outputVector, + const float* inputVector, + const float scalar, + unsigned int num_points) { - unsigned int number = 0; - - const unsigned int eighthPoints = num_points / 8; - - const float* inputVectorPtr = (const float*)inputVector; - int32_t* outputVectorPtr = outputVector; - - float min_val = INT_MIN; - float 
max_val = INT_MAX; - float r; - - __m256 vScalar = _mm256_set1_ps(scalar); - __m256 inputVal1; - __m256i intInputVal1; - __m256 vmin_val = _mm256_set1_ps(min_val); - __m256 vmax_val = _mm256_set1_ps(max_val); - - for(;number < eighthPoints; number++){ - inputVal1 = _mm256_load_ps(inputVectorPtr); inputVectorPtr += 8; - - inputVal1 = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal1, vScalar), vmax_val), vmin_val); - intInputVal1 = _mm256_cvtps_epi32(inputVal1); - - _mm256_store_si256((__m256i*)outputVectorPtr, intInputVal1); - outputVectorPtr += 8; - } - - number = eighthPoints * 8; - for(; number < num_points; number++){ - r = inputVector[number] * scalar; - if(r > max_val) - r = max_val; - else if(r < min_val) - r = min_val; - outputVector[number] = (int32_t)rintf(r); - } + unsigned int number = 0; + + const unsigned int eighthPoints = num_points / 8; + + const float* inputVectorPtr = (const float*)inputVector; + int32_t* outputVectorPtr = outputVector; + + float min_val = INT_MIN; + float max_val = INT_MAX; + float r; + + __m256 vScalar = _mm256_set1_ps(scalar); + __m256 inputVal1; + __m256i intInputVal1; + __m256 vmin_val = _mm256_set1_ps(min_val); + __m256 vmax_val = _mm256_set1_ps(max_val); + + for (; number < eighthPoints; number++) { + inputVal1 = _mm256_load_ps(inputVectorPtr); + inputVectorPtr += 8; + + inputVal1 = _mm256_max_ps( + _mm256_min_ps(_mm256_mul_ps(inputVal1, vScalar), vmax_val), vmin_val); + intInputVal1 = _mm256_cvtps_epi32(inputVal1); + + _mm256_store_si256((__m256i*)outputVectorPtr, intInputVal1); + outputVectorPtr += 8; + } + + number = eighthPoints * 8; + for (; number < num_points; number++) { + r = inputVector[number] * scalar; + if (r > max_val) + r = max_val; + else if (r < min_val) + r = min_val; + outputVector[number] = (int32_t)rintf(r); + } } #endif /* LV_HAVE_AVX */ @@ -307,46 +317,49 @@ volk_32f_s32f_convert_32i_a_avx(int32_t* outputVector, const float* inputVector, #ifdef LV_HAVE_SSE2 #include -static inline void -volk_32f_s32f_convert_32i_a_sse2(int32_t* outputVector, const float* inputVector, - const float scalar, unsigned int num_points) +static inline void volk_32f_s32f_convert_32i_a_sse2(int32_t* outputVector, + const float* inputVector, + const float scalar, + unsigned int num_points) { - unsigned int number = 0; - - const unsigned int quarterPoints = num_points / 4; - - const float* inputVectorPtr = (const float*)inputVector; - int32_t* outputVectorPtr = outputVector; - - float min_val = INT_MIN; - float max_val = INT_MAX; - float r; - - __m128 vScalar = _mm_set_ps1(scalar); - __m128 inputVal1; - __m128i intInputVal1; - __m128 vmin_val = _mm_set_ps1(min_val); - __m128 vmax_val = _mm_set_ps1(max_val); - - for(;number < quarterPoints; number++){ - inputVal1 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; - - inputVal1 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val); - intInputVal1 = _mm_cvtps_epi32(inputVal1); - - _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1); - outputVectorPtr += 4; - } - - number = quarterPoints * 4; - for(; number < num_points; number++){ - r = inputVector[number] * scalar; - if(r > max_val) - r = max_val; - else if(r < min_val) - r = min_val; - outputVector[number] = (int32_t)rintf(r); - } + unsigned int number = 0; + + const unsigned int quarterPoints = num_points / 4; + + const float* inputVectorPtr = (const float*)inputVector; + int32_t* outputVectorPtr = outputVector; + + float min_val = INT_MIN; + float max_val = INT_MAX; + float r; + + __m128 vScalar = _mm_set_ps1(scalar); + 
__m128 inputVal1; + __m128i intInputVal1; + __m128 vmin_val = _mm_set_ps1(min_val); + __m128 vmax_val = _mm_set_ps1(max_val); + + for (; number < quarterPoints; number++) { + inputVal1 = _mm_load_ps(inputVectorPtr); + inputVectorPtr += 4; + + inputVal1 = + _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val); + intInputVal1 = _mm_cvtps_epi32(inputVal1); + + _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1); + outputVectorPtr += 4; + } + + number = quarterPoints * 4; + for (; number < num_points; number++) { + r = inputVector[number] * scalar; + if (r > max_val) + r = max_val; + else if (r < min_val) + r = min_val; + outputVector[number] = (int32_t)rintf(r); + } } #endif /* LV_HAVE_SSE2 */ @@ -355,50 +368,51 @@ volk_32f_s32f_convert_32i_a_sse2(int32_t* outputVector, const float* inputVector #ifdef LV_HAVE_SSE #include -static inline void -volk_32f_s32f_convert_32i_a_sse(int32_t* outputVector, const float* inputVector, - const float scalar, unsigned int num_points) +static inline void volk_32f_s32f_convert_32i_a_sse(int32_t* outputVector, + const float* inputVector, + const float scalar, + unsigned int num_points) { - unsigned int number = 0; - - const unsigned int quarterPoints = num_points / 4; - - const float* inputVectorPtr = (const float*)inputVector; - int32_t* outputVectorPtr = outputVector; - - float min_val = INT_MIN; - float max_val = INT_MAX; - float r; - - __m128 vScalar = _mm_set_ps1(scalar); - __m128 ret; - __m128 vmin_val = _mm_set_ps1(min_val); - __m128 vmax_val = _mm_set_ps1(max_val); - - __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4]; - - for(;number < quarterPoints; number++){ - ret = _mm_load_ps(inputVectorPtr); - inputVectorPtr += 4; - - ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val); - - _mm_store_ps(outputFloatBuffer, ret); - *outputVectorPtr++ = (int32_t)rintf(outputFloatBuffer[0]); - *outputVectorPtr++ = (int32_t)rintf(outputFloatBuffer[1]); - *outputVectorPtr++ = (int32_t)rintf(outputFloatBuffer[2]); - *outputVectorPtr++ = (int32_t)rintf(outputFloatBuffer[3]); - } - - number = quarterPoints * 4; - for(; number < num_points; number++){ - r = inputVector[number] * scalar; - if(r > max_val) - r = max_val; - else if(r < min_val) - r = min_val; - outputVector[number] = (int32_t)rintf(r); - } + unsigned int number = 0; + + const unsigned int quarterPoints = num_points / 4; + + const float* inputVectorPtr = (const float*)inputVector; + int32_t* outputVectorPtr = outputVector; + + float min_val = INT_MIN; + float max_val = INT_MAX; + float r; + + __m128 vScalar = _mm_set_ps1(scalar); + __m128 ret; + __m128 vmin_val = _mm_set_ps1(min_val); + __m128 vmax_val = _mm_set_ps1(max_val); + + __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4]; + + for (; number < quarterPoints; number++) { + ret = _mm_load_ps(inputVectorPtr); + inputVectorPtr += 4; + + ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val); + + _mm_store_ps(outputFloatBuffer, ret); + *outputVectorPtr++ = (int32_t)rintf(outputFloatBuffer[0]); + *outputVectorPtr++ = (int32_t)rintf(outputFloatBuffer[1]); + *outputVectorPtr++ = (int32_t)rintf(outputFloatBuffer[2]); + *outputVectorPtr++ = (int32_t)rintf(outputFloatBuffer[3]); + } + + number = quarterPoints * 4; + for (; number < num_points; number++) { + r = inputVector[number] * scalar; + if (r > max_val) + r = max_val; + else if (r < min_val) + r = min_val; + outputVector[number] = (int32_t)rintf(r); + } } #endif /* LV_HAVE_SSE */ @@ -406,25 +420,26 @@ 
volk_32f_s32f_convert_32i_a_sse(int32_t* outputVector, const float* inputVector, #ifdef LV_HAVE_GENERIC -static inline void -volk_32f_s32f_convert_32i_a_generic(int32_t* outputVector, const float* inputVector, - const float scalar, unsigned int num_points) +static inline void volk_32f_s32f_convert_32i_a_generic(int32_t* outputVector, + const float* inputVector, + const float scalar, + unsigned int num_points) { - int32_t* outputVectorPtr = outputVector; - const float* inputVectorPtr = inputVector; - unsigned int number = 0; - float min_val = INT_MIN; - float max_val = INT_MAX; - float r; - - for(number = 0; number < num_points; number++){ - r = *inputVectorPtr++ * scalar; - if(r > max_val) - r = max_val; - else if(r < min_val) - r = min_val; - *outputVectorPtr++ = (int32_t)rintf(r); - } + int32_t* outputVectorPtr = outputVector; + const float* inputVectorPtr = inputVector; + unsigned int number = 0; + float min_val = INT_MIN; + float max_val = INT_MAX; + float r; + + for (number = 0; number < num_points; number++) { + r = *inputVectorPtr++ * scalar; + if (r > max_val) + r = max_val; + else if (r < min_val) + r = min_val; + *outputVectorPtr++ = (int32_t)rintf(r); + } } #endif /* LV_HAVE_GENERIC */ diff --git a/kernels/volk/volk_32f_s32f_convert_8i.h b/kernels/volk/volk_32f_s32f_convert_8i.h index 2a1669c..242c3bd 100644 --- a/kernels/volk/volk_32f_s32f_convert_8i.h +++ b/kernels/volk/volk_32f_s32f_convert_8i.h @@ -30,7 +30,8 @@ * * Dispatcher Prototype * \code - * void volk_32f_s32f_convert_8i(int8_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points) + * void volk_32f_s32f_convert_8i(int8_t* outputVector, const float* inputVector, const + float scalar, unsigned int num_points) * \endcode * * \b Inputs @@ -42,7 +43,8 @@ * \li outputVector: The output vector. 
* * \b Example - * Convert floats from [-1,1] to 16-bit integers with a scale of 5 to maintain smallest delta + * Convert floats from [-1,1] to 16-bit integers with a scale of 5 to maintain smallest + delta * int N = 10; * unsigned int alignment = volk_get_alignment(); * float* increasing = (float*)volk_malloc(sizeof(float)*N, alignment); @@ -74,77 +76,86 @@ #include #include -static inline void -volk_32f_s32f_convert_8i_single(int8_t* out, const float in){ - float min_val = CHAR_MIN; - float max_val = CHAR_MAX; - if(in > max_val){ - *out = (int8_t)(max_val); - }else if(in < min_val){ - *out = (int8_t)(min_val); - }else{ - *out = (int8_t)(rintf(in)); - } +static inline void volk_32f_s32f_convert_8i_single(int8_t* out, const float in) +{ + float min_val = CHAR_MIN; + float max_val = CHAR_MAX; + if (in > max_val) { + *out = (int8_t)(max_val); + } else if (in < min_val) { + *out = (int8_t)(min_val); + } else { + *out = (int8_t)(rintf(in)); + } } #ifdef LV_HAVE_AVX2 #include -static inline void -volk_32f_s32f_convert_8i_u_avx2(int8_t* outputVector, const float* inputVector, - const float scalar, unsigned int num_points) +static inline void volk_32f_s32f_convert_8i_u_avx2(int8_t* outputVector, + const float* inputVector, + const float scalar, + unsigned int num_points) { - unsigned int number = 0; - - const unsigned int thirtysecondPoints = num_points / 32; - - const float* inputVectorPtr = (const float*)inputVector; - int8_t* outputVectorPtr = outputVector; - - float min_val = CHAR_MIN; - float max_val = CHAR_MAX; - float r; - - __m256 vScalar = _mm256_set1_ps(scalar); - __m256 inputVal1, inputVal2, inputVal3, inputVal4; - __m256i intInputVal1, intInputVal2, intInputVal3, intInputVal4; - __m256 vmin_val = _mm256_set1_ps(min_val); - __m256 vmax_val = _mm256_set1_ps(max_val); - __m256i intInputVal; - - for(;number < thirtysecondPoints; number++){ - inputVal1 = _mm256_loadu_ps(inputVectorPtr); inputVectorPtr += 8; - inputVal2 = _mm256_loadu_ps(inputVectorPtr); inputVectorPtr += 8; - inputVal3 = _mm256_loadu_ps(inputVectorPtr); inputVectorPtr += 8; - inputVal4 = _mm256_loadu_ps(inputVectorPtr); inputVectorPtr += 8; - - inputVal1 = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal1, vScalar), vmax_val), vmin_val); - inputVal2 = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal2, vScalar), vmax_val), vmin_val); - inputVal3 = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal3, vScalar), vmax_val), vmin_val); - inputVal4 = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal4, vScalar), vmax_val), vmin_val); - - intInputVal1 = _mm256_cvtps_epi32(inputVal1); - intInputVal2 = _mm256_cvtps_epi32(inputVal2); - intInputVal3 = _mm256_cvtps_epi32(inputVal3); - intInputVal4 = _mm256_cvtps_epi32(inputVal4); - - intInputVal1 = _mm256_packs_epi32(intInputVal1, intInputVal2); - intInputVal1 = _mm256_permute4x64_epi64(intInputVal1, 0b11011000); - intInputVal3 = _mm256_packs_epi32(intInputVal3, intInputVal4); - intInputVal3 = _mm256_permute4x64_epi64(intInputVal3, 0b11011000); - - intInputVal1 = _mm256_packs_epi16(intInputVal1, intInputVal3); - intInputVal = _mm256_permute4x64_epi64(intInputVal1, 0b11011000); - - _mm256_storeu_si256((__m256i*)outputVectorPtr, intInputVal); - outputVectorPtr += 32; - } - - number = thirtysecondPoints * 32; - for(; number < num_points; number++){ - r = inputVector[number] * scalar; - volk_32f_s32f_convert_8i_single(&outputVector[number], r); - } + unsigned int number = 0; + + const unsigned int thirtysecondPoints = num_points / 32; + + const float* inputVectorPtr = (const 
float*)inputVector; + int8_t* outputVectorPtr = outputVector; + + float min_val = CHAR_MIN; + float max_val = CHAR_MAX; + float r; + + __m256 vScalar = _mm256_set1_ps(scalar); + __m256 inputVal1, inputVal2, inputVal3, inputVal4; + __m256i intInputVal1, intInputVal2, intInputVal3, intInputVal4; + __m256 vmin_val = _mm256_set1_ps(min_val); + __m256 vmax_val = _mm256_set1_ps(max_val); + __m256i intInputVal; + + for (; number < thirtysecondPoints; number++) { + inputVal1 = _mm256_loadu_ps(inputVectorPtr); + inputVectorPtr += 8; + inputVal2 = _mm256_loadu_ps(inputVectorPtr); + inputVectorPtr += 8; + inputVal3 = _mm256_loadu_ps(inputVectorPtr); + inputVectorPtr += 8; + inputVal4 = _mm256_loadu_ps(inputVectorPtr); + inputVectorPtr += 8; + + inputVal1 = _mm256_max_ps( + _mm256_min_ps(_mm256_mul_ps(inputVal1, vScalar), vmax_val), vmin_val); + inputVal2 = _mm256_max_ps( + _mm256_min_ps(_mm256_mul_ps(inputVal2, vScalar), vmax_val), vmin_val); + inputVal3 = _mm256_max_ps( + _mm256_min_ps(_mm256_mul_ps(inputVal3, vScalar), vmax_val), vmin_val); + inputVal4 = _mm256_max_ps( + _mm256_min_ps(_mm256_mul_ps(inputVal4, vScalar), vmax_val), vmin_val); + + intInputVal1 = _mm256_cvtps_epi32(inputVal1); + intInputVal2 = _mm256_cvtps_epi32(inputVal2); + intInputVal3 = _mm256_cvtps_epi32(inputVal3); + intInputVal4 = _mm256_cvtps_epi32(inputVal4); + + intInputVal1 = _mm256_packs_epi32(intInputVal1, intInputVal2); + intInputVal1 = _mm256_permute4x64_epi64(intInputVal1, 0b11011000); + intInputVal3 = _mm256_packs_epi32(intInputVal3, intInputVal4); + intInputVal3 = _mm256_permute4x64_epi64(intInputVal3, 0b11011000); + + intInputVal1 = _mm256_packs_epi16(intInputVal1, intInputVal3); + intInputVal = _mm256_permute4x64_epi64(intInputVal1, 0b11011000); + + _mm256_storeu_si256((__m256i*)outputVectorPtr, intInputVal); + outputVectorPtr += 32; + } + + number = thirtysecondPoints * 32; + for (; number < num_points; number++) { + r = inputVector[number] * scalar; + volk_32f_s32f_convert_8i_single(&outputVector[number], r); + } } #endif /* LV_HAVE_AVX2 */ @@ -153,57 +164,66 @@ volk_32f_s32f_convert_8i_u_avx2(int8_t* outputVector, const float* inputVector, #ifdef LV_HAVE_SSE2 #include -static inline void -volk_32f_s32f_convert_8i_u_sse2(int8_t* outputVector, const float* inputVector, - const float scalar, unsigned int num_points) +static inline void volk_32f_s32f_convert_8i_u_sse2(int8_t* outputVector, + const float* inputVector, + const float scalar, + unsigned int num_points) { - unsigned int number = 0; - - const unsigned int sixteenthPoints = num_points / 16; - - const float* inputVectorPtr = (const float*)inputVector; - int8_t* outputVectorPtr = outputVector; - - float min_val = CHAR_MIN; - float max_val = CHAR_MAX; - float r; - - __m128 vScalar = _mm_set_ps1(scalar); - __m128 inputVal1, inputVal2, inputVal3, inputVal4; - __m128i intInputVal1, intInputVal2, intInputVal3, intInputVal4; - __m128 vmin_val = _mm_set_ps1(min_val); - __m128 vmax_val = _mm_set_ps1(max_val); - - for(;number < sixteenthPoints; number++){ - inputVal1 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4; - inputVal2 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4; - inputVal3 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4; - inputVal4 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4; - - inputVal1 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val); - inputVal2 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal2, vScalar), vmax_val), vmin_val); - inputVal3 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal3, vScalar), vmax_val), 
vmin_val); - inputVal4 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal4, vScalar), vmax_val), vmin_val); - - intInputVal1 = _mm_cvtps_epi32(inputVal1); - intInputVal2 = _mm_cvtps_epi32(inputVal2); - intInputVal3 = _mm_cvtps_epi32(inputVal3); - intInputVal4 = _mm_cvtps_epi32(inputVal4); - - intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); - intInputVal3 = _mm_packs_epi32(intInputVal3, intInputVal4); - - intInputVal1 = _mm_packs_epi16(intInputVal1, intInputVal3); - - _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1); - outputVectorPtr += 16; - } + unsigned int number = 0; + + const unsigned int sixteenthPoints = num_points / 16; + + const float* inputVectorPtr = (const float*)inputVector; + int8_t* outputVectorPtr = outputVector; + + float min_val = CHAR_MIN; + float max_val = CHAR_MAX; + float r; + + __m128 vScalar = _mm_set_ps1(scalar); + __m128 inputVal1, inputVal2, inputVal3, inputVal4; + __m128i intInputVal1, intInputVal2, intInputVal3, intInputVal4; + __m128 vmin_val = _mm_set_ps1(min_val); + __m128 vmax_val = _mm_set_ps1(max_val); + + for (; number < sixteenthPoints; number++) { + inputVal1 = _mm_loadu_ps(inputVectorPtr); + inputVectorPtr += 4; + inputVal2 = _mm_loadu_ps(inputVectorPtr); + inputVectorPtr += 4; + inputVal3 = _mm_loadu_ps(inputVectorPtr); + inputVectorPtr += 4; + inputVal4 = _mm_loadu_ps(inputVectorPtr); + inputVectorPtr += 4; + + inputVal1 = + _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val); + inputVal2 = + _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal2, vScalar), vmax_val), vmin_val); + inputVal3 = + _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal3, vScalar), vmax_val), vmin_val); + inputVal4 = + _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal4, vScalar), vmax_val), vmin_val); + + intInputVal1 = _mm_cvtps_epi32(inputVal1); + intInputVal2 = _mm_cvtps_epi32(inputVal2); + intInputVal3 = _mm_cvtps_epi32(inputVal3); + intInputVal4 = _mm_cvtps_epi32(inputVal4); + + intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); + intInputVal3 = _mm_packs_epi32(intInputVal3, intInputVal4); + + intInputVal1 = _mm_packs_epi16(intInputVal1, intInputVal3); + + _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1); + outputVectorPtr += 16; + } - number = sixteenthPoints * 16; - for(; number < num_points; number++){ - r = inputVector[number] * scalar; - volk_32f_s32f_convert_8i_single(&outputVector[number], r); - } + number = sixteenthPoints * 16; + for (; number < num_points; number++) { + r = inputVector[number] * scalar; + volk_32f_s32f_convert_8i_single(&outputVector[number], r); + } } #endif /* LV_HAVE_SSE2 */ @@ -212,46 +232,47 @@ volk_32f_s32f_convert_8i_u_sse2(int8_t* outputVector, const float* inputVector, #ifdef LV_HAVE_SSE #include -static inline void -volk_32f_s32f_convert_8i_u_sse(int8_t* outputVector, const float* inputVector, - const float scalar, unsigned int num_points) +static inline void volk_32f_s32f_convert_8i_u_sse(int8_t* outputVector, + const float* inputVector, + const float scalar, + unsigned int num_points) { - unsigned int number = 0; - size_t inner_loop; + unsigned int number = 0; + size_t inner_loop; - const unsigned int quarterPoints = num_points / 4; + const unsigned int quarterPoints = num_points / 4; - const float* inputVectorPtr = (const float*)inputVector; - int8_t* outputVectorPtr = outputVector; + const float* inputVectorPtr = (const float*)inputVector; + int8_t* outputVectorPtr = outputVector; - float min_val = CHAR_MIN; - float max_val = CHAR_MAX; - float r; + float min_val = CHAR_MIN; + float max_val = CHAR_MAX; + 
float r; - __m128 vScalar = _mm_set_ps1(scalar); - __m128 ret; - __m128 vmin_val = _mm_set_ps1(min_val); - __m128 vmax_val = _mm_set_ps1(max_val); + __m128 vScalar = _mm_set_ps1(scalar); + __m128 ret; + __m128 vmin_val = _mm_set_ps1(min_val); + __m128 vmax_val = _mm_set_ps1(max_val); - __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4]; + __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4]; - for(;number < quarterPoints; number++){ - ret = _mm_loadu_ps(inputVectorPtr); - inputVectorPtr += 4; + for (; number < quarterPoints; number++) { + ret = _mm_loadu_ps(inputVectorPtr); + inputVectorPtr += 4; - ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val); + ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val); - _mm_store_ps(outputFloatBuffer, ret); - for (inner_loop = 0; inner_loop < 4; inner_loop++){ - *outputVectorPtr++ = (int8_t)(rintf(outputFloatBuffer[inner_loop])); + _mm_store_ps(outputFloatBuffer, ret); + for (inner_loop = 0; inner_loop < 4; inner_loop++) { + *outputVectorPtr++ = (int8_t)(rintf(outputFloatBuffer[inner_loop])); + } } - } - number = quarterPoints * 4; - for(; number < num_points; number++){ - r = inputVector[number] * scalar; - volk_32f_s32f_convert_8i_single(&outputVector[number], r); - } + number = quarterPoints * 4; + for (; number < num_points; number++) { + r = inputVector[number] * scalar; + volk_32f_s32f_convert_8i_single(&outputVector[number], r); + } } #endif /* LV_HAVE_SSE */ @@ -259,18 +280,19 @@ volk_32f_s32f_convert_8i_u_sse(int8_t* outputVector, const float* inputVector, #ifdef LV_HAVE_GENERIC -static inline void -volk_32f_s32f_convert_8i_generic(int8_t* outputVector, const float* inputVector, - const float scalar, unsigned int num_points) +static inline void volk_32f_s32f_convert_8i_generic(int8_t* outputVector, + const float* inputVector, + const float scalar, + unsigned int num_points) { - const float* inputVectorPtr = inputVector; - unsigned int number = 0; - float r; - - for(number = 0; number < num_points; number++){ - r = *inputVectorPtr++ * scalar; - volk_32f_s32f_convert_8i_single(&outputVector[number], r); - } + const float* inputVectorPtr = inputVector; + unsigned int number = 0; + float r; + + for (number = 0; number < num_points; number++) { + r = *inputVectorPtr++ * scalar; + volk_32f_s32f_convert_8i_single(&outputVector[number], r); + } } #endif /* LV_HAVE_GENERIC */ @@ -280,68 +302,77 @@ volk_32f_s32f_convert_8i_generic(int8_t* outputVector, const float* inputVector, #ifndef INCLUDED_volk_32f_s32f_convert_8i_a_H #define INCLUDED_volk_32f_s32f_convert_8i_a_H -#include #include #include +#include #ifdef LV_HAVE_AVX2 #include -static inline void -volk_32f_s32f_convert_8i_a_avx2(int8_t* outputVector, const float* inputVector, - const float scalar, unsigned int num_points) +static inline void volk_32f_s32f_convert_8i_a_avx2(int8_t* outputVector, + const float* inputVector, + const float scalar, + unsigned int num_points) { - unsigned int number = 0; - - const unsigned int thirtysecondPoints = num_points / 32; - - const float* inputVectorPtr = (const float*)inputVector; - int8_t* outputVectorPtr = outputVector; - - float min_val = CHAR_MIN; - float max_val = CHAR_MAX; - float r; - - __m256 vScalar = _mm256_set1_ps(scalar); - __m256 inputVal1, inputVal2, inputVal3, inputVal4; - __m256i intInputVal1, intInputVal2, intInputVal3, intInputVal4; - __m256 vmin_val = _mm256_set1_ps(min_val); - __m256 vmax_val = _mm256_set1_ps(max_val); - __m256i intInputVal; - - for(;number < thirtysecondPoints; number++){ - 
inputVal1 = _mm256_load_ps(inputVectorPtr); inputVectorPtr += 8; - inputVal2 = _mm256_load_ps(inputVectorPtr); inputVectorPtr += 8; - inputVal3 = _mm256_load_ps(inputVectorPtr); inputVectorPtr += 8; - inputVal4 = _mm256_load_ps(inputVectorPtr); inputVectorPtr += 8; - - inputVal1 = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal1, vScalar), vmax_val), vmin_val); - inputVal2 = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal2, vScalar), vmax_val), vmin_val); - inputVal3 = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal3, vScalar), vmax_val), vmin_val); - inputVal4 = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal4, vScalar), vmax_val), vmin_val); - - intInputVal1 = _mm256_cvtps_epi32(inputVal1); - intInputVal2 = _mm256_cvtps_epi32(inputVal2); - intInputVal3 = _mm256_cvtps_epi32(inputVal3); - intInputVal4 = _mm256_cvtps_epi32(inputVal4); - - intInputVal1 = _mm256_packs_epi32(intInputVal1, intInputVal2); - intInputVal1 = _mm256_permute4x64_epi64(intInputVal1, 0b11011000); - intInputVal3 = _mm256_packs_epi32(intInputVal3, intInputVal4); - intInputVal3 = _mm256_permute4x64_epi64(intInputVal3, 0b11011000); - - intInputVal1 = _mm256_packs_epi16(intInputVal1, intInputVal3); - intInputVal = _mm256_permute4x64_epi64(intInputVal1, 0b11011000); - - _mm256_store_si256((__m256i*)outputVectorPtr, intInputVal); - outputVectorPtr += 32; - } - - number = thirtysecondPoints * 32; - for(; number < num_points; number++){ - r = inputVector[number] * scalar; - volk_32f_s32f_convert_8i_single(&outputVector[number], r); - } + unsigned int number = 0; + + const unsigned int thirtysecondPoints = num_points / 32; + + const float* inputVectorPtr = (const float*)inputVector; + int8_t* outputVectorPtr = outputVector; + + float min_val = CHAR_MIN; + float max_val = CHAR_MAX; + float r; + + __m256 vScalar = _mm256_set1_ps(scalar); + __m256 inputVal1, inputVal2, inputVal3, inputVal4; + __m256i intInputVal1, intInputVal2, intInputVal3, intInputVal4; + __m256 vmin_val = _mm256_set1_ps(min_val); + __m256 vmax_val = _mm256_set1_ps(max_val); + __m256i intInputVal; + + for (; number < thirtysecondPoints; number++) { + inputVal1 = _mm256_load_ps(inputVectorPtr); + inputVectorPtr += 8; + inputVal2 = _mm256_load_ps(inputVectorPtr); + inputVectorPtr += 8; + inputVal3 = _mm256_load_ps(inputVectorPtr); + inputVectorPtr += 8; + inputVal4 = _mm256_load_ps(inputVectorPtr); + inputVectorPtr += 8; + + inputVal1 = _mm256_max_ps( + _mm256_min_ps(_mm256_mul_ps(inputVal1, vScalar), vmax_val), vmin_val); + inputVal2 = _mm256_max_ps( + _mm256_min_ps(_mm256_mul_ps(inputVal2, vScalar), vmax_val), vmin_val); + inputVal3 = _mm256_max_ps( + _mm256_min_ps(_mm256_mul_ps(inputVal3, vScalar), vmax_val), vmin_val); + inputVal4 = _mm256_max_ps( + _mm256_min_ps(_mm256_mul_ps(inputVal4, vScalar), vmax_val), vmin_val); + + intInputVal1 = _mm256_cvtps_epi32(inputVal1); + intInputVal2 = _mm256_cvtps_epi32(inputVal2); + intInputVal3 = _mm256_cvtps_epi32(inputVal3); + intInputVal4 = _mm256_cvtps_epi32(inputVal4); + + intInputVal1 = _mm256_packs_epi32(intInputVal1, intInputVal2); + intInputVal1 = _mm256_permute4x64_epi64(intInputVal1, 0b11011000); + intInputVal3 = _mm256_packs_epi32(intInputVal3, intInputVal4); + intInputVal3 = _mm256_permute4x64_epi64(intInputVal3, 0b11011000); + + intInputVal1 = _mm256_packs_epi16(intInputVal1, intInputVal3); + intInputVal = _mm256_permute4x64_epi64(intInputVal1, 0b11011000); + + _mm256_store_si256((__m256i*)outputVectorPtr, intInputVal); + outputVectorPtr += 32; + } + + number = thirtysecondPoints * 32; + for (; 
number < num_points; number++) { + r = inputVector[number] * scalar; + volk_32f_s32f_convert_8i_single(&outputVector[number], r); + } } #endif /* LV_HAVE_AVX2 */ @@ -350,57 +381,66 @@ volk_32f_s32f_convert_8i_a_avx2(int8_t* outputVector, const float* inputVector, #ifdef LV_HAVE_SSE2 #include -static inline void -volk_32f_s32f_convert_8i_a_sse2(int8_t* outputVector, const float* inputVector, - const float scalar, unsigned int num_points) +static inline void volk_32f_s32f_convert_8i_a_sse2(int8_t* outputVector, + const float* inputVector, + const float scalar, + unsigned int num_points) { - unsigned int number = 0; - - const unsigned int sixteenthPoints = num_points / 16; - - const float* inputVectorPtr = (const float*)inputVector; - int8_t* outputVectorPtr = outputVector; - - float min_val = CHAR_MIN; - float max_val = CHAR_MAX; - float r; - - __m128 vScalar = _mm_set_ps1(scalar); - __m128 inputVal1, inputVal2, inputVal3, inputVal4; - __m128i intInputVal1, intInputVal2, intInputVal3, intInputVal4; - __m128 vmin_val = _mm_set_ps1(min_val); - __m128 vmax_val = _mm_set_ps1(max_val); - - for(;number < sixteenthPoints; number++){ - inputVal1 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; - inputVal2 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; - inputVal3 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; - inputVal4 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; - - inputVal1 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val); - inputVal2 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal2, vScalar), vmax_val), vmin_val); - inputVal3 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal3, vScalar), vmax_val), vmin_val); - inputVal4 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal4, vScalar), vmax_val), vmin_val); - - intInputVal1 = _mm_cvtps_epi32(inputVal1); - intInputVal2 = _mm_cvtps_epi32(inputVal2); - intInputVal3 = _mm_cvtps_epi32(inputVal3); - intInputVal4 = _mm_cvtps_epi32(inputVal4); - - intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); - intInputVal3 = _mm_packs_epi32(intInputVal3, intInputVal4); - - intInputVal1 = _mm_packs_epi16(intInputVal1, intInputVal3); - - _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1); - outputVectorPtr += 16; - } + unsigned int number = 0; + + const unsigned int sixteenthPoints = num_points / 16; + + const float* inputVectorPtr = (const float*)inputVector; + int8_t* outputVectorPtr = outputVector; + + float min_val = CHAR_MIN; + float max_val = CHAR_MAX; + float r; + + __m128 vScalar = _mm_set_ps1(scalar); + __m128 inputVal1, inputVal2, inputVal3, inputVal4; + __m128i intInputVal1, intInputVal2, intInputVal3, intInputVal4; + __m128 vmin_val = _mm_set_ps1(min_val); + __m128 vmax_val = _mm_set_ps1(max_val); + + for (; number < sixteenthPoints; number++) { + inputVal1 = _mm_load_ps(inputVectorPtr); + inputVectorPtr += 4; + inputVal2 = _mm_load_ps(inputVectorPtr); + inputVectorPtr += 4; + inputVal3 = _mm_load_ps(inputVectorPtr); + inputVectorPtr += 4; + inputVal4 = _mm_load_ps(inputVectorPtr); + inputVectorPtr += 4; + + inputVal1 = + _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val); + inputVal2 = + _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal2, vScalar), vmax_val), vmin_val); + inputVal3 = + _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal3, vScalar), vmax_val), vmin_val); + inputVal4 = + _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal4, vScalar), vmax_val), vmin_val); + + intInputVal1 = _mm_cvtps_epi32(inputVal1); + intInputVal2 = _mm_cvtps_epi32(inputVal2); + intInputVal3 = _mm_cvtps_epi32(inputVal3); + 
intInputVal4 = _mm_cvtps_epi32(inputVal4); + + intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); + intInputVal3 = _mm_packs_epi32(intInputVal3, intInputVal4); + + intInputVal1 = _mm_packs_epi16(intInputVal1, intInputVal3); + + _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1); + outputVectorPtr += 16; + } - number = sixteenthPoints * 16; - for(; number < num_points; number++){ - r = inputVector[number] * scalar; - volk_32f_s32f_convert_8i_single(&outputVector[number], r); - } + number = sixteenthPoints * 16; + for (; number < num_points; number++) { + r = inputVector[number] * scalar; + volk_32f_s32f_convert_8i_single(&outputVector[number], r); + } } #endif /* LV_HAVE_SSE2 */ @@ -408,46 +448,47 @@ volk_32f_s32f_convert_8i_a_sse2(int8_t* outputVector, const float* inputVector, #ifdef LV_HAVE_SSE #include -static inline void -volk_32f_s32f_convert_8i_a_sse(int8_t* outputVector, const float* inputVector, - const float scalar, unsigned int num_points) +static inline void volk_32f_s32f_convert_8i_a_sse(int8_t* outputVector, + const float* inputVector, + const float scalar, + unsigned int num_points) { - unsigned int number = 0; - size_t inner_loop; + unsigned int number = 0; + size_t inner_loop; - const unsigned int quarterPoints = num_points / 4; + const unsigned int quarterPoints = num_points / 4; - const float* inputVectorPtr = (const float*)inputVector; + const float* inputVectorPtr = (const float*)inputVector; - float min_val = CHAR_MIN; - float max_val = CHAR_MAX; - float r; + float min_val = CHAR_MIN; + float max_val = CHAR_MAX; + float r; - int8_t* outputVectorPtr = outputVector; - __m128 vScalar = _mm_set_ps1(scalar); - __m128 ret; - __m128 vmin_val = _mm_set_ps1(min_val); - __m128 vmax_val = _mm_set_ps1(max_val); + int8_t* outputVectorPtr = outputVector; + __m128 vScalar = _mm_set_ps1(scalar); + __m128 ret; + __m128 vmin_val = _mm_set_ps1(min_val); + __m128 vmax_val = _mm_set_ps1(max_val); - __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4]; + __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4]; - for(;number < quarterPoints; number++){ - ret = _mm_load_ps(inputVectorPtr); - inputVectorPtr += 4; + for (; number < quarterPoints; number++) { + ret = _mm_load_ps(inputVectorPtr); + inputVectorPtr += 4; - ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val); + ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val); - _mm_store_ps(outputFloatBuffer, ret); - for (inner_loop = 0; inner_loop < 4; inner_loop++){ - *outputVectorPtr++ = (int8_t)(rintf(outputFloatBuffer[inner_loop])); + _mm_store_ps(outputFloatBuffer, ret); + for (inner_loop = 0; inner_loop < 4; inner_loop++) { + *outputVectorPtr++ = (int8_t)(rintf(outputFloatBuffer[inner_loop])); + } } - } - number = quarterPoints * 4; - for(; number < num_points; number++){ - r = inputVector[number] * scalar; - volk_32f_s32f_convert_8i_single(&outputVector[number], r); - } + number = quarterPoints * 4; + for (; number < num_points; number++) { + r = inputVector[number] * scalar; + volk_32f_s32f_convert_8i_single(&outputVector[number], r); + } } #endif /* LV_HAVE_SSE */ @@ -455,18 +496,19 @@ volk_32f_s32f_convert_8i_a_sse(int8_t* outputVector, const float* inputVector, #ifdef LV_HAVE_GENERIC -static inline void -volk_32f_s32f_convert_8i_a_generic(int8_t* outputVector, const float* inputVector, - const float scalar, unsigned int num_points) +static inline void volk_32f_s32f_convert_8i_a_generic(int8_t* outputVector, + const float* inputVector, + const float scalar, + unsigned int 
num_points) { - const float* inputVectorPtr = inputVector; - unsigned int number = 0; - float r; - - for(number = 0; number < num_points; number++){ - r = *inputVectorPtr++ * scalar; - volk_32f_s32f_convert_8i_single(&outputVector[number], r); - } + const float* inputVectorPtr = inputVector; + unsigned int number = 0; + float r; + + for (number = 0; number < num_points; number++) { + r = *inputVectorPtr++ * scalar; + volk_32f_s32f_convert_8i_single(&outputVector[number], r); + } } #endif /* LV_HAVE_GENERIC */ diff --git a/kernels/volk/volk_32f_s32f_mod_rangepuppet_32f.h b/kernels/volk/volk_32f_s32f_mod_rangepuppet_32f.h index 6ace77b..28d7ab5 100644 --- a/kernels/volk/volk_32f_s32f_mod_rangepuppet_32f.h +++ b/kernels/volk/volk_32f_s32f_mod_rangepuppet_32f.h @@ -4,42 +4,77 @@ #include #ifdef LV_HAVE_GENERIC -static inline void volk_32f_s32f_mod_rangepuppet_32f_generic(float *output, const float *input, float bound, unsigned int num_points){ - volk_32f_s32f_s32f_mod_range_32f_generic(output, input, bound-3.141f, bound, num_points); +static inline void volk_32f_s32f_mod_rangepuppet_32f_generic(float* output, + const float* input, + float bound, + unsigned int num_points) +{ + volk_32f_s32f_s32f_mod_range_32f_generic( + output, input, bound - 3.141f, bound, num_points); } #endif #ifdef LV_HAVE_SSE -static inline void volk_32f_s32f_mod_rangepuppet_32f_u_sse(float *output, const float *input, float bound, unsigned int num_points){ - volk_32f_s32f_s32f_mod_range_32f_u_sse(output, input, bound-3.141f, bound, num_points); +static inline void volk_32f_s32f_mod_rangepuppet_32f_u_sse(float* output, + const float* input, + float bound, + unsigned int num_points) +{ + volk_32f_s32f_s32f_mod_range_32f_u_sse( + output, input, bound - 3.141f, bound, num_points); } #endif #ifdef LV_HAVE_SSE -static inline void volk_32f_s32f_mod_rangepuppet_32f_a_sse(float *output, const float *input, float bound, unsigned int num_points){ - volk_32f_s32f_s32f_mod_range_32f_a_sse(output, input, bound-3.141f, bound, num_points); +static inline void volk_32f_s32f_mod_rangepuppet_32f_a_sse(float* output, + const float* input, + float bound, + unsigned int num_points) +{ + volk_32f_s32f_s32f_mod_range_32f_a_sse( + output, input, bound - 3.141f, bound, num_points); } #endif #ifdef LV_HAVE_SSE2 -static inline void volk_32f_s32f_mod_rangepuppet_32f_u_sse2(float *output, const float *input, float bound, unsigned int num_points){ - volk_32f_s32f_s32f_mod_range_32f_u_sse2(output, input, bound-3.141f, bound, num_points); +static inline void volk_32f_s32f_mod_rangepuppet_32f_u_sse2(float* output, + const float* input, + float bound, + unsigned int num_points) +{ + volk_32f_s32f_s32f_mod_range_32f_u_sse2( + output, input, bound - 3.141f, bound, num_points); } #endif #ifdef LV_HAVE_SSE2 -static inline void volk_32f_s32f_mod_rangepuppet_32f_a_sse2(float *output, const float *input, float bound, unsigned int num_points){ - volk_32f_s32f_s32f_mod_range_32f_a_sse2(output, input, bound-3.141f, bound, num_points); +static inline void volk_32f_s32f_mod_rangepuppet_32f_a_sse2(float* output, + const float* input, + float bound, + unsigned int num_points) +{ + volk_32f_s32f_s32f_mod_range_32f_a_sse2( + output, input, bound - 3.141f, bound, num_points); } #endif #ifdef LV_HAVE_AVX -static inline void volk_32f_s32f_mod_rangepuppet_32f_u_avx(float *output, const float *input, float bound, unsigned int num_points){ - volk_32f_s32f_s32f_mod_range_32f_u_avx(output, input, bound-3.141f, bound, num_points); +static inline void 
volk_32f_s32f_mod_rangepuppet_32f_u_avx(float* output, + const float* input, + float bound, + unsigned int num_points) +{ + volk_32f_s32f_s32f_mod_range_32f_u_avx( + output, input, bound - 3.141f, bound, num_points); } #endif #ifdef LV_HAVE_AVX -static inline void volk_32f_s32f_mod_rangepuppet_32f_a_avx(float *output, const float *input, float bound, unsigned int num_points){ - volk_32f_s32f_s32f_mod_range_32f_a_avx(output, input, bound-3.141f, bound, num_points); +static inline void volk_32f_s32f_mod_rangepuppet_32f_a_avx(float* output, + const float* input, + float bound, + unsigned int num_points) +{ + volk_32f_s32f_s32f_mod_range_32f_a_avx( + output, input, bound - 3.141f, bound, num_points); } #endif #endif diff --git a/kernels/volk/volk_32f_s32f_multiply_32f.h b/kernels/volk/volk_32f_s32f_multiply_32f.h index 97c7f69..dcc9c6b 100644 --- a/kernels/volk/volk_32f_s32f_multiply_32f.h +++ b/kernels/volk/volk_32f_s32f_multiply_32f.h @@ -29,8 +29,8 @@ * * Dispatcher Prototype * \code - * void volk_32f_s32f_multiply_32f(float* cVector, const float* aVector, const float scalar, unsigned int num_points) - * \endcode + * void volk_32f_s32f_multiply_32f(float* cVector, const float* aVector, const float + * scalar, unsigned int num_points) \endcode * * \b Inputs * \li aVector: The input vector of floats. @@ -75,84 +75,87 @@ #ifdef LV_HAVE_SSE #include -static inline void -volk_32f_s32f_multiply_32f_u_sse(float* cVector, const float* aVector, - const float scalar, unsigned int num_points) +static inline void volk_32f_s32f_multiply_32f_u_sse(float* cVector, + const float* aVector, + const float scalar, + unsigned int num_points) { - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; - float* cPtr = cVector; - const float* aPtr = aVector; + float* cPtr = cVector; + const float* aPtr = aVector; - __m128 aVal, bVal, cVal; - bVal = _mm_set_ps1(scalar); - for(;number < quarterPoints; number++){ - aVal = _mm_loadu_ps(aPtr); + __m128 aVal, bVal, cVal; + bVal = _mm_set_ps1(scalar); + for (; number < quarterPoints; number++) { + aVal = _mm_loadu_ps(aPtr); - cVal = _mm_mul_ps(aVal, bVal); + cVal = _mm_mul_ps(aVal, bVal); - _mm_storeu_ps(cPtr,cVal); // Store the results back into the C container + _mm_storeu_ps(cPtr, cVal); // Store the results back into the C container - aPtr += 4; - cPtr += 4; - } + aPtr += 4; + cPtr += 4; + } - number = quarterPoints * 4; - for(;number < num_points; number++){ - *cPtr++ = (*aPtr++) * scalar; - } + number = quarterPoints * 4; + for (; number < num_points; number++) { + *cPtr++ = (*aPtr++) * scalar; + } } #endif /* LV_HAVE_SSE */ #ifdef LV_HAVE_AVX #include -static inline void -volk_32f_s32f_multiply_32f_u_avx(float* cVector, const float* aVector, - const float scalar, unsigned int num_points) +static inline void volk_32f_s32f_multiply_32f_u_avx(float* cVector, + const float* aVector, + const float scalar, + unsigned int num_points) { - unsigned int number = 0; - const unsigned int eighthPoints = num_points / 8; + unsigned int number = 0; + const unsigned int eighthPoints = num_points / 8; - float* cPtr = cVector; - const float* aPtr = aVector; + float* cPtr = cVector; + const float* aPtr = aVector; - __m256 aVal, bVal, cVal; - bVal = _mm256_set1_ps(scalar); - for(;number < eighthPoints; number++){ + __m256 aVal, bVal, cVal; + bVal = _mm256_set1_ps(scalar); + for (; number < eighthPoints; number++) { - aVal = _mm256_loadu_ps(aPtr); + aVal = _mm256_loadu_ps(aPtr); - cVal 
= _mm256_mul_ps(aVal, bVal); + cVal = _mm256_mul_ps(aVal, bVal); - _mm256_storeu_ps(cPtr,cVal); // Store the results back into the C container + _mm256_storeu_ps(cPtr, cVal); // Store the results back into the C container - aPtr += 8; - cPtr += 8; - } + aPtr += 8; + cPtr += 8; + } - number = eighthPoints * 8; - for(;number < num_points; number++){ - *cPtr++ = (*aPtr++) * scalar; - } + number = eighthPoints * 8; + for (; number < num_points; number++) { + *cPtr++ = (*aPtr++) * scalar; + } } #endif /* LV_HAVE_AVX */ #ifdef LV_HAVE_GENERIC -static inline void -volk_32f_s32f_multiply_32f_generic(float* cVector, const float* aVector, - const float scalar, unsigned int num_points) +static inline void volk_32f_s32f_multiply_32f_generic(float* cVector, + const float* aVector, + const float scalar, + unsigned int num_points) { - unsigned int number = 0; - const float* inputPtr = aVector; - float* outputPtr = cVector; - for(number = 0; number < num_points; number++){ - *outputPtr = (*inputPtr) * scalar; - inputPtr++; - outputPtr++; - } + unsigned int number = 0; + const float* inputPtr = aVector; + float* outputPtr = cVector; + for (number = 0; number < num_points; number++) { + *outputPtr = (*inputPtr) * scalar; + inputPtr++; + outputPtr++; + } } #endif /* LV_HAVE_GENERIC */ @@ -168,126 +171,132 @@ volk_32f_s32f_multiply_32f_generic(float* cVector, const float* aVector, #ifdef LV_HAVE_SSE #include -static inline void -volk_32f_s32f_multiply_32f_a_sse(float* cVector, const float* aVector, - const float scalar, unsigned int num_points) +static inline void volk_32f_s32f_multiply_32f_a_sse(float* cVector, + const float* aVector, + const float scalar, + unsigned int num_points) { - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; - float* cPtr = cVector; - const float* aPtr = aVector; + float* cPtr = cVector; + const float* aPtr = aVector; - __m128 aVal, bVal, cVal; - bVal = _mm_set_ps1(scalar); - for(;number < quarterPoints; number++){ - aVal = _mm_load_ps(aPtr); + __m128 aVal, bVal, cVal; + bVal = _mm_set_ps1(scalar); + for (; number < quarterPoints; number++) { + aVal = _mm_load_ps(aPtr); - cVal = _mm_mul_ps(aVal, bVal); + cVal = _mm_mul_ps(aVal, bVal); - _mm_store_ps(cPtr,cVal); // Store the results back into the C container + _mm_store_ps(cPtr, cVal); // Store the results back into the C container - aPtr += 4; - cPtr += 4; - } + aPtr += 4; + cPtr += 4; + } - number = quarterPoints * 4; - for(;number < num_points; number++){ - *cPtr++ = (*aPtr++) * scalar; - } + number = quarterPoints * 4; + for (; number < num_points; number++) { + *cPtr++ = (*aPtr++) * scalar; + } } #endif /* LV_HAVE_SSE */ #ifdef LV_HAVE_AVX #include -static inline void -volk_32f_s32f_multiply_32f_a_avx(float* cVector, const float* aVector, - const float scalar, unsigned int num_points) +static inline void volk_32f_s32f_multiply_32f_a_avx(float* cVector, + const float* aVector, + const float scalar, + unsigned int num_points) { - unsigned int number = 0; - const unsigned int eighthPoints = num_points / 8; + unsigned int number = 0; + const unsigned int eighthPoints = num_points / 8; - float* cPtr = cVector; - const float* aPtr = aVector; + float* cPtr = cVector; + const float* aPtr = aVector; - __m256 aVal, bVal, cVal; - bVal = _mm256_set1_ps(scalar); - for(;number < eighthPoints; number++){ - aVal = _mm256_load_ps(aPtr); + __m256 aVal, bVal, cVal; + bVal = _mm256_set1_ps(scalar); + for (; number < eighthPoints; number++) 
{ + aVal = _mm256_load_ps(aPtr); - cVal = _mm256_mul_ps(aVal, bVal); + cVal = _mm256_mul_ps(aVal, bVal); - _mm256_store_ps(cPtr,cVal); // Store the results back into the C container + _mm256_store_ps(cPtr, cVal); // Store the results back into the C container - aPtr += 8; - cPtr += 8; - } + aPtr += 8; + cPtr += 8; + } - number = eighthPoints * 8; - for(;number < num_points; number++){ - *cPtr++ = (*aPtr++) * scalar; - } + number = eighthPoints * 8; + for (; number < num_points; number++) { + *cPtr++ = (*aPtr++) * scalar; + } } #endif /* LV_HAVE_AVX */ #ifdef LV_HAVE_NEON #include -static inline void -volk_32f_s32f_multiply_32f_u_neon(float* cVector, const float* aVector, - const float scalar, unsigned int num_points) +static inline void volk_32f_s32f_multiply_32f_u_neon(float* cVector, + const float* aVector, + const float scalar, + unsigned int num_points) { - unsigned int number = 0; - const float* inputPtr = aVector; - float* outputPtr = cVector; - const unsigned int quarterPoints = num_points / 4; - - float32x4_t aVal, cVal; - - for(number = 0; number < quarterPoints; number++){ - aVal = vld1q_f32(inputPtr); // Load into NEON regs - cVal = vmulq_n_f32 (aVal, scalar); // Do the multiply - vst1q_f32(outputPtr, cVal); // Store results back to output - inputPtr += 4; - outputPtr += 4; - } - for(number = quarterPoints * 4; number < num_points; number++){ - *outputPtr++ = (*inputPtr++) * scalar; - } + unsigned int number = 0; + const float* inputPtr = aVector; + float* outputPtr = cVector; + const unsigned int quarterPoints = num_points / 4; + + float32x4_t aVal, cVal; + + for (number = 0; number < quarterPoints; number++) { + aVal = vld1q_f32(inputPtr); // Load into NEON regs + cVal = vmulq_n_f32(aVal, scalar); // Do the multiply + vst1q_f32(outputPtr, cVal); // Store results back to output + inputPtr += 4; + outputPtr += 4; + } + for (number = quarterPoints * 4; number < num_points; number++) { + *outputPtr++ = (*inputPtr++) * scalar; + } } #endif /* LV_HAVE_NEON */ #ifdef LV_HAVE_GENERIC -static inline void -volk_32f_s32f_multiply_32f_a_generic(float* cVector, const float* aVector, - const float scalar, unsigned int num_points) +static inline void volk_32f_s32f_multiply_32f_a_generic(float* cVector, + const float* aVector, + const float scalar, + unsigned int num_points) { - unsigned int number = 0; - const float* inputPtr = aVector; - float* outputPtr = cVector; - for(number = 0; number < num_points; number++){ - *outputPtr = (*inputPtr) * scalar; - inputPtr++; - outputPtr++; - } + unsigned int number = 0; + const float* inputPtr = aVector; + float* outputPtr = cVector; + for (number = 0; number < num_points; number++) { + *outputPtr = (*inputPtr) * scalar; + inputPtr++; + outputPtr++; + } } #endif /* LV_HAVE_GENERIC */ #ifdef LV_HAVE_ORC -extern void -volk_32f_s32f_multiply_32f_a_orc_impl(float* dst, const float* src, - const float scalar, unsigned int num_points); +extern void volk_32f_s32f_multiply_32f_a_orc_impl(float* dst, + const float* src, + const float scalar, + unsigned int num_points); -static inline void -volk_32f_s32f_multiply_32f_u_orc(float* cVector, const float* aVector, - const float scalar, unsigned int num_points) +static inline void volk_32f_s32f_multiply_32f_u_orc(float* cVector, + const float* aVector, + const float scalar, + unsigned int num_points) { - volk_32f_s32f_multiply_32f_a_orc_impl(cVector, aVector, scalar, num_points); + volk_32f_s32f_multiply_32f_a_orc_impl(cVector, aVector, scalar, num_points); } #endif /* LV_HAVE_GENERIC */ diff --git 
a/kernels/volk/volk_32f_s32f_normalize.h b/kernels/volk/volk_32f_s32f_normalize.h index 404d534..0a05492 100644 --- a/kernels/volk/volk_32f_s32f_normalize.h +++ b/kernels/volk/volk_32f_s32f_normalize.h @@ -30,8 +30,8 @@ * * Dispatcher Prototype * \code - * void volk_32f_s32f_normalize(float* vecBuffer, const float scalar, unsigned int num_points) - * \endcode + * void volk_32f_s32f_normalize(float* vecBuffer, const float scalar, unsigned int + * num_points) \endcode * * \b Inputs * \li vecBuffer: The buffer of values to be vectorized. @@ -76,84 +76,99 @@ #ifdef LV_HAVE_AVX #include -static inline void volk_32f_s32f_normalize_a_avx(float* vecBuffer, const float scalar, unsigned int num_points){ - unsigned int number = 0; - float* inputPtr = vecBuffer; +static inline void volk_32f_s32f_normalize_a_avx(float* vecBuffer, + const float scalar, + unsigned int num_points) +{ + unsigned int number = 0; + float* inputPtr = vecBuffer; - const float invScalar = 1.0 / scalar; - __m256 vecScalar = _mm256_set1_ps(invScalar); + const float invScalar = 1.0 / scalar; + __m256 vecScalar = _mm256_set1_ps(invScalar); - __m256 input1; + __m256 input1; - const uint64_t eighthPoints = num_points / 8; - for(;number < eighthPoints; number++){ + const uint64_t eighthPoints = num_points / 8; + for (; number < eighthPoints; number++) { - input1 = _mm256_load_ps(inputPtr); + input1 = _mm256_load_ps(inputPtr); - input1 = _mm256_mul_ps(input1, vecScalar); + input1 = _mm256_mul_ps(input1, vecScalar); - _mm256_store_ps(inputPtr, input1); + _mm256_store_ps(inputPtr, input1); - inputPtr += 8; - } + inputPtr += 8; + } - number = eighthPoints*8; - for(; number < num_points; number++){ - *inputPtr *= invScalar; - inputPtr++; - } + number = eighthPoints * 8; + for (; number < num_points; number++) { + *inputPtr *= invScalar; + inputPtr++; + } } #endif /* LV_HAVE_AVX */ #ifdef LV_HAVE_SSE #include -static inline void volk_32f_s32f_normalize_a_sse(float* vecBuffer, const float scalar, unsigned int num_points){ - unsigned int number = 0; - float* inputPtr = vecBuffer; +static inline void volk_32f_s32f_normalize_a_sse(float* vecBuffer, + const float scalar, + unsigned int num_points) +{ + unsigned int number = 0; + float* inputPtr = vecBuffer; - const float invScalar = 1.0 / scalar; - __m128 vecScalar = _mm_set_ps1(invScalar); + const float invScalar = 1.0 / scalar; + __m128 vecScalar = _mm_set_ps1(invScalar); - __m128 input1; + __m128 input1; - const uint64_t quarterPoints = num_points / 4; - for(;number < quarterPoints; number++){ + const uint64_t quarterPoints = num_points / 4; + for (; number < quarterPoints; number++) { - input1 = _mm_load_ps(inputPtr); + input1 = _mm_load_ps(inputPtr); - input1 = _mm_mul_ps(input1, vecScalar); + input1 = _mm_mul_ps(input1, vecScalar); - _mm_store_ps(inputPtr, input1); + _mm_store_ps(inputPtr, input1); - inputPtr += 4; - } + inputPtr += 4; + } - number = quarterPoints*4; - for(; number < num_points; number++){ - *inputPtr *= invScalar; - inputPtr++; - } + number = quarterPoints * 4; + for (; number < num_points; number++) { + *inputPtr *= invScalar; + inputPtr++; + } } #endif /* LV_HAVE_SSE */ #ifdef LV_HAVE_GENERIC -static inline void volk_32f_s32f_normalize_generic(float* vecBuffer, const float scalar, unsigned int num_points){ - unsigned int number = 0; - float* inputPtr = vecBuffer; - const float invScalar = 1.0 / scalar; - for(number = 0; number < num_points; number++){ - *inputPtr *= invScalar; - inputPtr++; - } +static inline void volk_32f_s32f_normalize_generic(float* vecBuffer, + 
const float scalar, + unsigned int num_points) +{ + unsigned int number = 0; + float* inputPtr = vecBuffer; + const float invScalar = 1.0 / scalar; + for (number = 0; number < num_points; number++) { + *inputPtr *= invScalar; + inputPtr++; + } } #endif /* LV_HAVE_GENERIC */ #ifdef LV_HAVE_ORC -extern void volk_32f_s32f_normalize_a_orc_impl(float* dst, float* src, const float scalar, unsigned int num_points); -static inline void volk_32f_s32f_normalize_u_orc(float* vecBuffer, const float scalar, unsigned int num_points){ +extern void volk_32f_s32f_normalize_a_orc_impl(float* dst, + float* src, + const float scalar, + unsigned int num_points); +static inline void volk_32f_s32f_normalize_u_orc(float* vecBuffer, + const float scalar, + unsigned int num_points) +{ float invscalar = 1.0 / scalar; volk_32f_s32f_normalize_a_orc_impl(vecBuffer, vecBuffer, invscalar, num_points); } @@ -169,32 +184,35 @@ static inline void volk_32f_s32f_normalize_u_orc(float* vecBuffer, const float s #ifdef LV_HAVE_AVX #include -static inline void volk_32f_s32f_normalize_u_avx(float* vecBuffer, const float scalar, unsigned int num_points){ - unsigned int number = 0; - float* inputPtr = vecBuffer; +static inline void volk_32f_s32f_normalize_u_avx(float* vecBuffer, + const float scalar, + unsigned int num_points) +{ + unsigned int number = 0; + float* inputPtr = vecBuffer; - const float invScalar = 1.0 / scalar; - __m256 vecScalar = _mm256_set1_ps(invScalar); + const float invScalar = 1.0 / scalar; + __m256 vecScalar = _mm256_set1_ps(invScalar); - __m256 input1; + __m256 input1; - const uint64_t eighthPoints = num_points / 8; - for(;number < eighthPoints; number++){ + const uint64_t eighthPoints = num_points / 8; + for (; number < eighthPoints; number++) { - input1 = _mm256_loadu_ps(inputPtr); + input1 = _mm256_loadu_ps(inputPtr); - input1 = _mm256_mul_ps(input1, vecScalar); + input1 = _mm256_mul_ps(input1, vecScalar); - _mm256_storeu_ps(inputPtr, input1); + _mm256_storeu_ps(inputPtr, input1); - inputPtr += 8; - } + inputPtr += 8; + } - number = eighthPoints*8; - for(; number < num_points; number++){ - *inputPtr *= invScalar; - inputPtr++; - } + number = eighthPoints * 8; + for (; number < num_points; number++) { + *inputPtr *= invScalar; + inputPtr++; + } } #endif /* LV_HAVE_AVX */ diff --git a/kernels/volk/volk_32f_s32f_power_32f.h b/kernels/volk/volk_32f_s32f_power_32f.h index 070efdc..9b6fdf4 100644 --- a/kernels/volk/volk_32f_s32f_power_32f.h +++ b/kernels/volk/volk_32f_s32f_power_32f.h @@ -30,8 +30,8 @@ * * Dispatcher Prototype * \code - * void volk_32f_s32f_power_32f(float* cVector, const float* aVector, const float power, unsigned int num_points) - * \endcode + * void volk_32f_s32f_power_32f(float* cVector, const float* aVector, const float power, + * unsigned int num_points) \endcode * * \b Inputs * \li aVector: The input vector of floats. 
@@ -72,8 +72,8 @@ #define INCLUDED_volk_32f_s32f_power_32f_a_H #include -#include #include +#include #ifdef LV_HAVE_SSE4_1 #include @@ -82,49 +82,51 @@ #include #endif /* LV_HAVE_LIB_SIMDMATH */ -static inline void -volk_32f_s32f_power_32f_a_sse4_1(float* cVector, const float* aVector, - const float power, unsigned int num_points) +static inline void volk_32f_s32f_power_32f_a_sse4_1(float* cVector, + const float* aVector, + const float power, + unsigned int num_points) { - unsigned int number = 0; + unsigned int number = 0; - float* cPtr = cVector; - const float* aPtr = aVector; + float* cPtr = cVector; + const float* aPtr = aVector; #ifdef LV_HAVE_LIB_SIMDMATH - const unsigned int quarterPoints = num_points / 4; - __m128 vPower = _mm_set_ps1(power); - __m128 zeroValue = _mm_setzero_ps(); - __m128 signMask; - __m128 negatedValues; - __m128 negativeOneToPower = _mm_set_ps1(powf(-1, power)); - __m128 onesMask = _mm_set_ps1(1); + const unsigned int quarterPoints = num_points / 4; + __m128 vPower = _mm_set_ps1(power); + __m128 zeroValue = _mm_setzero_ps(); + __m128 signMask; + __m128 negatedValues; + __m128 negativeOneToPower = _mm_set_ps1(powf(-1, power)); + __m128 onesMask = _mm_set_ps1(1); - __m128 aVal, cVal; - for(;number < quarterPoints; number++){ + __m128 aVal, cVal; + for (; number < quarterPoints; number++) { - aVal = _mm_load_ps(aPtr); - signMask = _mm_cmplt_ps(aVal, zeroValue); - negatedValues = _mm_sub_ps(zeroValue, aVal); - aVal = _mm_blendv_ps(aVal, negatedValues, signMask); + aVal = _mm_load_ps(aPtr); + signMask = _mm_cmplt_ps(aVal, zeroValue); + negatedValues = _mm_sub_ps(zeroValue, aVal); + aVal = _mm_blendv_ps(aVal, negatedValues, signMask); - // powf4 doesn't support negative values in the base, so we mask them off and then apply the negative after - cVal = powf4(aVal, vPower); // Takes each input value to the specified power + // powf4 doesn't support negative values in the base, so we mask them off and then + // apply the negative after + cVal = powf4(aVal, vPower); // Takes each input value to the specified power - cVal = _mm_mul_ps( _mm_blendv_ps(onesMask, negativeOneToPower, signMask), cVal); + cVal = _mm_mul_ps(_mm_blendv_ps(onesMask, negativeOneToPower, signMask), cVal); - _mm_store_ps(cPtr,cVal); // Store the results back into the C container + _mm_store_ps(cPtr, cVal); // Store the results back into the C container - aPtr += 4; - cPtr += 4; - } + aPtr += 4; + cPtr += 4; + } - number = quarterPoints * 4; + number = quarterPoints * 4; #endif /* LV_HAVE_LIB_SIMDMATH */ - for(;number < num_points; number++){ - *cPtr++ = powf((*aPtr++), power); - } + for (; number < num_points; number++) { + *cPtr++ = powf((*aPtr++), power); + } } #endif /* LV_HAVE_SSE4_1 */ @@ -137,49 +139,54 @@ volk_32f_s32f_power_32f_a_sse4_1(float* cVector, const float* aVector, #include #endif /* LV_HAVE_LIB_SIMDMATH */ -static inline void -volk_32f_s32f_power_32f_a_sse(float* cVector, const float* aVector, - const float power, unsigned int num_points) +static inline void volk_32f_s32f_power_32f_a_sse(float* cVector, + const float* aVector, + const float power, + unsigned int num_points) { - unsigned int number = 0; + unsigned int number = 0; - float* cPtr = cVector; - const float* aPtr = aVector; + float* cPtr = cVector; + const float* aPtr = aVector; #ifdef LV_HAVE_LIB_SIMDMATH - const unsigned int quarterPoints = num_points / 4; - __m128 vPower = _mm_set_ps1(power); - __m128 zeroValue = _mm_setzero_ps(); - __m128 signMask; - __m128 negatedValues; - __m128 negativeOneToPower = 
_mm_set_ps1(powf(-1, power)); - __m128 onesMask = _mm_set_ps1(1); - - __m128 aVal, cVal; - for(;number < quarterPoints; number++){ - - aVal = _mm_load_ps(aPtr); - signMask = _mm_cmplt_ps(aVal, zeroValue); - negatedValues = _mm_sub_ps(zeroValue, aVal); - aVal = _mm_or_ps(_mm_andnot_ps(signMask, aVal), _mm_and_ps(signMask, negatedValues) ); - - // powf4 doesn't support negative values in the base, so we mask them off and then apply the negative after - cVal = powf4(aVal, vPower); // Takes each input value to the specified power - - cVal = _mm_mul_ps( _mm_or_ps( _mm_andnot_ps(signMask, onesMask), _mm_and_ps(signMask, negativeOneToPower) ), cVal); - - _mm_store_ps(cPtr,cVal); // Store the results back into the C container - - aPtr += 4; - cPtr += 4; - } - - number = quarterPoints * 4; + const unsigned int quarterPoints = num_points / 4; + __m128 vPower = _mm_set_ps1(power); + __m128 zeroValue = _mm_setzero_ps(); + __m128 signMask; + __m128 negatedValues; + __m128 negativeOneToPower = _mm_set_ps1(powf(-1, power)); + __m128 onesMask = _mm_set_ps1(1); + + __m128 aVal, cVal; + for (; number < quarterPoints; number++) { + + aVal = _mm_load_ps(aPtr); + signMask = _mm_cmplt_ps(aVal, zeroValue); + negatedValues = _mm_sub_ps(zeroValue, aVal); + aVal = + _mm_or_ps(_mm_andnot_ps(signMask, aVal), _mm_and_ps(signMask, negatedValues)); + + // powf4 doesn't support negative values in the base, so we mask them off and then + // apply the negative after + cVal = powf4(aVal, vPower); // Takes each input value to the specified power + + cVal = _mm_mul_ps(_mm_or_ps(_mm_andnot_ps(signMask, onesMask), + _mm_and_ps(signMask, negativeOneToPower)), + cVal); + + _mm_store_ps(cPtr, cVal); // Store the results back into the C container + + aPtr += 4; + cPtr += 4; + } + + number = quarterPoints * 4; #endif /* LV_HAVE_LIB_SIMDMATH */ - for(;number < num_points; number++){ - *cPtr++ = powf((*aPtr++), power); - } + for (; number < num_points; number++) { + *cPtr++ = powf((*aPtr++), power); + } } #endif /* LV_HAVE_SSE */ @@ -187,17 +194,18 @@ volk_32f_s32f_power_32f_a_sse(float* cVector, const float* aVector, #ifdef LV_HAVE_GENERIC -static inline void -volk_32f_s32f_power_32f_generic(float* cVector, const float* aVector, - const float power, unsigned int num_points) +static inline void volk_32f_s32f_power_32f_generic(float* cVector, + const float* aVector, + const float power, + unsigned int num_points) { - float* cPtr = cVector; - const float* aPtr = aVector; - unsigned int number = 0; + float* cPtr = cVector; + const float* aPtr = aVector; + unsigned int number = 0; - for(number = 0; number < num_points; number++){ - *cPtr++ = powf((*aPtr++), power); - } + for (number = 0; number < num_points; number++) { + *cPtr++ = powf((*aPtr++), power); + } } #endif /* LV_HAVE_GENERIC */ diff --git a/kernels/volk/volk_32f_s32f_s32f_mod_range_32f.h b/kernels/volk/volk_32f_s32f_s32f_mod_range_32f.h index 53b4937..d7f23fe 100644 --- a/kernels/volk/volk_32f_s32f_s32f_mod_range_32f.h +++ b/kernels/volk/volk_32f_s32f_s32f_mod_range_32f.h @@ -25,8 +25,8 @@ * * Dispatcher Prototype * \code - * void volk_32f_s32f_s32f_mod_range_32f(float* outputVector, const float* inputVector, const float lower_bound, const float upper_bound, unsigned int num_points) - * \endcode + * void volk_32f_s32f_s32f_mod_range_32f(float* outputVector, const float* inputVector, + * const float lower_bound, const float upper_bound, unsigned int num_points) \endcode * * \b Inputs * \li inputVector: The input vector @@ -46,117 +46,129 @@ #ifdef LV_HAVE_AVX #include -static 
inline void volk_32f_s32f_s32f_mod_range_32f_u_avx(float* outputVector, const float* inputVector, const float lower_bound, const float upper_bound, unsigned int num_points){ - __m256 lower = _mm256_set1_ps(lower_bound); - __m256 upper = _mm256_set1_ps(upper_bound); - __m256 distance = _mm256_sub_ps(upper,lower); - float dist = upper_bound - lower_bound; - __m256 input, output; - __m256 is_smaller, is_bigger; - __m256 excess, adj; - - const float *inPtr = inputVector; - float *outPtr = outputVector; - size_t eight_points = num_points / 8; - size_t counter; - for(counter = 0; counter < eight_points; counter++) { - input = _mm256_loadu_ps(inPtr); - // calculate mask: input < lower, input > upper - is_smaller = _mm256_cmp_ps(input, lower, _CMP_LT_OQ); //0x11: Less than, ordered, non-signalling - is_bigger = _mm256_cmp_ps(input, upper, _CMP_GT_OQ); //0x1e: greater than, ordered, non-signalling - // find out how far we are out-of-bound – positive values! - excess = _mm256_and_ps(_mm256_sub_ps(lower, input), is_smaller); - excess = _mm256_or_ps(_mm256_and_ps(_mm256_sub_ps(input, upper), is_bigger), excess); - // how many do we have to add? (int(excess/distance+1)*distance) - excess = _mm256_div_ps(excess, distance); - // round down - excess = _mm256_cvtepi32_ps(_mm256_cvttps_epi32(excess)); - // plus 1 - adj = _mm256_set1_ps(1.0f); - excess = _mm256_add_ps(excess, adj); - // get the sign right, adj is still {1.0f,1.0f,1.0f,1.0f} - adj = _mm256_and_ps(adj, is_smaller); - adj = _mm256_or_ps(_mm256_and_ps(_mm256_set1_ps(-1.0f), is_bigger), adj); - // scale by distance, sign - excess = _mm256_mul_ps(_mm256_mul_ps(excess, adj), distance); - output = _mm256_add_ps(input, excess); - _mm256_storeu_ps(outPtr, output); - inPtr += 8; - outPtr += 8; - } - - size_t cnt; - for(cnt = eight_points * 8; cnt < num_points; cnt++){ - float val = inputVector[cnt]; - if(val < lower_bound){ - float excess = lower_bound - val; - signed int count = (int)(excess/dist); - outputVector[cnt] = val + (count+1)*dist; +static inline void volk_32f_s32f_s32f_mod_range_32f_u_avx(float* outputVector, + const float* inputVector, + const float lower_bound, + const float upper_bound, + unsigned int num_points) +{ + __m256 lower = _mm256_set1_ps(lower_bound); + __m256 upper = _mm256_set1_ps(upper_bound); + __m256 distance = _mm256_sub_ps(upper, lower); + float dist = upper_bound - lower_bound; + __m256 input, output; + __m256 is_smaller, is_bigger; + __m256 excess, adj; + + const float* inPtr = inputVector; + float* outPtr = outputVector; + size_t eight_points = num_points / 8; + size_t counter; + for (counter = 0; counter < eight_points; counter++) { + input = _mm256_loadu_ps(inPtr); + // calculate mask: input < lower, input > upper + is_smaller = _mm256_cmp_ps( + input, lower, _CMP_LT_OQ); // 0x11: Less than, ordered, non-signalling + is_bigger = _mm256_cmp_ps( + input, upper, _CMP_GT_OQ); // 0x1e: greater than, ordered, non-signalling + // find out how far we are out-of-bound – positive values! + excess = _mm256_and_ps(_mm256_sub_ps(lower, input), is_smaller); + excess = + _mm256_or_ps(_mm256_and_ps(_mm256_sub_ps(input, upper), is_bigger), excess); + // how many do we have to add? 
(int(excess/distance+1)*distance) + excess = _mm256_div_ps(excess, distance); + // round down + excess = _mm256_cvtepi32_ps(_mm256_cvttps_epi32(excess)); + // plus 1 + adj = _mm256_set1_ps(1.0f); + excess = _mm256_add_ps(excess, adj); + // get the sign right, adj is still {1.0f,1.0f,1.0f,1.0f} + adj = _mm256_and_ps(adj, is_smaller); + adj = _mm256_or_ps(_mm256_and_ps(_mm256_set1_ps(-1.0f), is_bigger), adj); + // scale by distance, sign + excess = _mm256_mul_ps(_mm256_mul_ps(excess, adj), distance); + output = _mm256_add_ps(input, excess); + _mm256_storeu_ps(outPtr, output); + inPtr += 8; + outPtr += 8; } - else if(val > upper_bound){ - float excess = val - upper_bound; - signed int count = (int)(excess/dist); - outputVector[cnt] = val - (count+1)*dist; + + size_t cnt; + for (cnt = eight_points * 8; cnt < num_points; cnt++) { + float val = inputVector[cnt]; + if (val < lower_bound) { + float excess = lower_bound - val; + signed int count = (int)(excess / dist); + outputVector[cnt] = val + (count + 1) * dist; + } else if (val > upper_bound) { + float excess = val - upper_bound; + signed int count = (int)(excess / dist); + outputVector[cnt] = val - (count + 1) * dist; + } else + outputVector[cnt] = val; } - else - outputVector[cnt] = val; - } } -static inline void volk_32f_s32f_s32f_mod_range_32f_a_avx(float* outputVector, const float* inputVector, const float lower_bound, const float upper_bound, unsigned int num_points){ - __m256 lower = _mm256_set1_ps(lower_bound); - __m256 upper = _mm256_set1_ps(upper_bound); - __m256 distance = _mm256_sub_ps(upper,lower); - float dist = upper_bound - lower_bound; - __m256 input, output; - __m256 is_smaller, is_bigger; - __m256 excess, adj; - - const float *inPtr = inputVector; - float *outPtr = outputVector; - size_t eight_points = num_points / 8; - size_t counter; - for(counter = 0; counter < eight_points; counter++) { - input = _mm256_load_ps(inPtr); - // calculate mask: input < lower, input > upper - is_smaller = _mm256_cmp_ps(input, lower, _CMP_LT_OQ); //0x11: Less than, ordered, non-signalling - is_bigger = _mm256_cmp_ps(input, upper, _CMP_GT_OQ); //0x1e: greater than, ordered, non-signalling - // find out how far we are out-of-bound – positive values! - excess = _mm256_and_ps(_mm256_sub_ps(lower, input), is_smaller); - excess = _mm256_or_ps(_mm256_and_ps(_mm256_sub_ps(input, upper), is_bigger), excess); - // how many do we have to add? 
(int(excess/distance+1)*distance) - excess = _mm256_div_ps(excess, distance); - // round down - excess = _mm256_cvtepi32_ps(_mm256_cvttps_epi32(excess)); - // plus 1 - adj = _mm256_set1_ps(1.0f); - excess = _mm256_add_ps(excess, adj); - // get the sign right, adj is still {1.0f,1.0f,1.0f,1.0f} - adj = _mm256_and_ps(adj, is_smaller); - adj = _mm256_or_ps(_mm256_and_ps(_mm256_set1_ps(-1.0f), is_bigger), adj); - // scale by distance, sign - excess = _mm256_mul_ps(_mm256_mul_ps(excess, adj), distance); - output = _mm256_add_ps(input, excess); - _mm256_store_ps(outPtr, output); - inPtr += 8; - outPtr += 8; - } - - size_t cnt; - for(cnt = eight_points * 8; cnt < num_points; cnt++){ - float val = inputVector[cnt]; - if(val < lower_bound){ - float excess = lower_bound - val; - signed int count = (int)(excess/dist); - outputVector[cnt] = val + (count+1)*dist; +static inline void volk_32f_s32f_s32f_mod_range_32f_a_avx(float* outputVector, + const float* inputVector, + const float lower_bound, + const float upper_bound, + unsigned int num_points) +{ + __m256 lower = _mm256_set1_ps(lower_bound); + __m256 upper = _mm256_set1_ps(upper_bound); + __m256 distance = _mm256_sub_ps(upper, lower); + float dist = upper_bound - lower_bound; + __m256 input, output; + __m256 is_smaller, is_bigger; + __m256 excess, adj; + + const float* inPtr = inputVector; + float* outPtr = outputVector; + size_t eight_points = num_points / 8; + size_t counter; + for (counter = 0; counter < eight_points; counter++) { + input = _mm256_load_ps(inPtr); + // calculate mask: input < lower, input > upper + is_smaller = _mm256_cmp_ps( + input, lower, _CMP_LT_OQ); // 0x11: Less than, ordered, non-signalling + is_bigger = _mm256_cmp_ps( + input, upper, _CMP_GT_OQ); // 0x1e: greater than, ordered, non-signalling + // find out how far we are out-of-bound – positive values! + excess = _mm256_and_ps(_mm256_sub_ps(lower, input), is_smaller); + excess = + _mm256_or_ps(_mm256_and_ps(_mm256_sub_ps(input, upper), is_bigger), excess); + // how many do we have to add? 
(int(excess/distance+1)*distance) + excess = _mm256_div_ps(excess, distance); + // round down + excess = _mm256_cvtepi32_ps(_mm256_cvttps_epi32(excess)); + // plus 1 + adj = _mm256_set1_ps(1.0f); + excess = _mm256_add_ps(excess, adj); + // get the sign right, adj is still {1.0f,1.0f,1.0f,1.0f} + adj = _mm256_and_ps(adj, is_smaller); + adj = _mm256_or_ps(_mm256_and_ps(_mm256_set1_ps(-1.0f), is_bigger), adj); + // scale by distance, sign + excess = _mm256_mul_ps(_mm256_mul_ps(excess, adj), distance); + output = _mm256_add_ps(input, excess); + _mm256_store_ps(outPtr, output); + inPtr += 8; + outPtr += 8; } - else if(val > upper_bound){ - float excess = val - upper_bound; - signed int count = (int)(excess/dist); - outputVector[cnt] = val - (count+1)*dist; + + size_t cnt; + for (cnt = eight_points * 8; cnt < num_points; cnt++) { + float val = inputVector[cnt]; + if (val < lower_bound) { + float excess = lower_bound - val; + signed int count = (int)(excess / dist); + outputVector[cnt] = val + (count + 1) * dist; + } else if (val > upper_bound) { + float excess = val - upper_bound; + signed int count = (int)(excess / dist); + outputVector[cnt] = val - (count + 1) * dist; + } else + outputVector[cnt] = val; } - else - outputVector[cnt] = val; - } } #endif /* LV_HAVE_AVX */ @@ -164,268 +176,282 @@ static inline void volk_32f_s32f_s32f_mod_range_32f_a_avx(float* outputVector, c #ifdef LV_HAVE_SSE2 #include -static inline void volk_32f_s32f_s32f_mod_range_32f_u_sse2(float* outputVector, const float* inputVector, const float lower_bound, const float upper_bound, unsigned int num_points){ - __m128 lower = _mm_set_ps1(lower_bound); - __m128 upper = _mm_set_ps1(upper_bound); - __m128 distance = _mm_sub_ps(upper,lower); - float dist = upper_bound - lower_bound; - __m128 input, output; - __m128 is_smaller, is_bigger; - __m128 excess, adj; - - const float *inPtr = inputVector; - float *outPtr = outputVector; - size_t quarter_points = num_points / 4; - size_t counter; - for(counter = 0; counter < quarter_points; counter++) { - input = _mm_load_ps(inPtr); - // calculate mask: input < lower, input > upper - is_smaller = _mm_cmplt_ps(input, lower); - is_bigger = _mm_cmpgt_ps(input, upper); - // find out how far we are out-of-bound – positive values! - excess = _mm_and_ps(_mm_sub_ps(lower, input), is_smaller); - excess = _mm_or_ps(_mm_and_ps(_mm_sub_ps(input, upper), is_bigger), excess); - // how many do we have to add? 
(int(excess/distance+1)*distance) - excess = _mm_div_ps(excess, distance); - // round down - excess = _mm_cvtepi32_ps(_mm_cvttps_epi32(excess)); - // plus 1 - adj = _mm_set_ps1(1.0f); - excess = _mm_add_ps(excess, adj); - // get the sign right, adj is still {1.0f,1.0f,1.0f,1.0f} - adj = _mm_and_ps(adj, is_smaller); - adj = _mm_or_ps(_mm_and_ps(_mm_set_ps1(-1.0f), is_bigger), adj); - // scale by distance, sign - excess = _mm_mul_ps(_mm_mul_ps(excess, adj), distance); - output = _mm_add_ps(input, excess); - _mm_store_ps(outPtr, output); - inPtr += 4; - outPtr += 4; - } - - size_t cnt; - for(cnt = quarter_points * 4; cnt < num_points; cnt++){ - float val = inputVector[cnt]; - if(val < lower_bound){ - float excess = lower_bound - val; - signed int count = (int)(excess/dist); - outputVector[cnt] = val + (count+1)*dist; +static inline void volk_32f_s32f_s32f_mod_range_32f_u_sse2(float* outputVector, + const float* inputVector, + const float lower_bound, + const float upper_bound, + unsigned int num_points) +{ + __m128 lower = _mm_set_ps1(lower_bound); + __m128 upper = _mm_set_ps1(upper_bound); + __m128 distance = _mm_sub_ps(upper, lower); + float dist = upper_bound - lower_bound; + __m128 input, output; + __m128 is_smaller, is_bigger; + __m128 excess, adj; + + const float* inPtr = inputVector; + float* outPtr = outputVector; + size_t quarter_points = num_points / 4; + size_t counter; + for (counter = 0; counter < quarter_points; counter++) { + input = _mm_load_ps(inPtr); + // calculate mask: input < lower, input > upper + is_smaller = _mm_cmplt_ps(input, lower); + is_bigger = _mm_cmpgt_ps(input, upper); + // find out how far we are out-of-bound – positive values! + excess = _mm_and_ps(_mm_sub_ps(lower, input), is_smaller); + excess = _mm_or_ps(_mm_and_ps(_mm_sub_ps(input, upper), is_bigger), excess); + // how many do we have to add? 
(int(excess/distance+1)*distance) + excess = _mm_div_ps(excess, distance); + // round down + excess = _mm_cvtepi32_ps(_mm_cvttps_epi32(excess)); + // plus 1 + adj = _mm_set_ps1(1.0f); + excess = _mm_add_ps(excess, adj); + // get the sign right, adj is still {1.0f,1.0f,1.0f,1.0f} + adj = _mm_and_ps(adj, is_smaller); + adj = _mm_or_ps(_mm_and_ps(_mm_set_ps1(-1.0f), is_bigger), adj); + // scale by distance, sign + excess = _mm_mul_ps(_mm_mul_ps(excess, adj), distance); + output = _mm_add_ps(input, excess); + _mm_store_ps(outPtr, output); + inPtr += 4; + outPtr += 4; } - else if(val > upper_bound){ - float excess = val - upper_bound; - signed int count = (int)(excess/dist); - outputVector[cnt] = val - (count+1)*dist; + + size_t cnt; + for (cnt = quarter_points * 4; cnt < num_points; cnt++) { + float val = inputVector[cnt]; + if (val < lower_bound) { + float excess = lower_bound - val; + signed int count = (int)(excess / dist); + outputVector[cnt] = val + (count + 1) * dist; + } else if (val > upper_bound) { + float excess = val - upper_bound; + signed int count = (int)(excess / dist); + outputVector[cnt] = val - (count + 1) * dist; + } else + outputVector[cnt] = val; } - else - outputVector[cnt] = val; - } } -static inline void volk_32f_s32f_s32f_mod_range_32f_a_sse2(float* outputVector, const float* inputVector, const float lower_bound, const float upper_bound, unsigned int num_points){ - __m128 lower = _mm_set_ps1(lower_bound); - __m128 upper = _mm_set_ps1(upper_bound); - __m128 distance = _mm_sub_ps(upper,lower); - __m128 input, output; - __m128 is_smaller, is_bigger; - __m128 excess, adj; - - const float *inPtr = inputVector; - float *outPtr = outputVector; - size_t quarter_points = num_points / 4; - size_t counter; - for(counter = 0; counter < quarter_points; counter++) { - input = _mm_load_ps(inPtr); - // calculate mask: input < lower, input > upper - is_smaller = _mm_cmplt_ps(input, lower); - is_bigger = _mm_cmpgt_ps(input, upper); - // find out how far we are out-of-bound – positive values! - excess = _mm_and_ps(_mm_sub_ps(lower, input), is_smaller); - excess = _mm_or_ps(_mm_and_ps(_mm_sub_ps(input, upper), is_bigger), excess); - // how many do we have to add? (int(excess/distance+1)*distance) - excess = _mm_div_ps(excess, distance); - // round down – for some reason, SSE doesn't come with a 4x float -> 4x int32 conversion. 
- excess = _mm_cvtepi32_ps(_mm_cvttps_epi32(excess)); - // plus 1 - adj = _mm_set_ps1(1.0f); - excess = _mm_add_ps(excess, adj); - // get the sign right, adj is still {1.0f,1.0f,1.0f,1.0f} - adj = _mm_and_ps(adj, is_smaller); - adj = _mm_or_ps(_mm_and_ps(_mm_set_ps1(-1.0f), is_bigger), adj); - // scale by distance, sign - excess = _mm_mul_ps(_mm_mul_ps(excess, adj), distance); - output = _mm_add_ps(input, excess); - _mm_store_ps(outPtr, output); - inPtr += 4; - outPtr += 4; - } - - float dist = upper_bound - lower_bound; - size_t cnt; - for(cnt = quarter_points * 4; cnt < num_points; cnt++){ - float val = inputVector[cnt]; - if(val < lower_bound){ - float excess = lower_bound - val; - signed int count = (int)(excess/dist); - outputVector[cnt] = val + (count+1)*dist; +static inline void volk_32f_s32f_s32f_mod_range_32f_a_sse2(float* outputVector, + const float* inputVector, + const float lower_bound, + const float upper_bound, + unsigned int num_points) +{ + __m128 lower = _mm_set_ps1(lower_bound); + __m128 upper = _mm_set_ps1(upper_bound); + __m128 distance = _mm_sub_ps(upper, lower); + __m128 input, output; + __m128 is_smaller, is_bigger; + __m128 excess, adj; + + const float* inPtr = inputVector; + float* outPtr = outputVector; + size_t quarter_points = num_points / 4; + size_t counter; + for (counter = 0; counter < quarter_points; counter++) { + input = _mm_load_ps(inPtr); + // calculate mask: input < lower, input > upper + is_smaller = _mm_cmplt_ps(input, lower); + is_bigger = _mm_cmpgt_ps(input, upper); + // find out how far we are out-of-bound – positive values! + excess = _mm_and_ps(_mm_sub_ps(lower, input), is_smaller); + excess = _mm_or_ps(_mm_and_ps(_mm_sub_ps(input, upper), is_bigger), excess); + // how many do we have to add? (int(excess/distance+1)*distance) + excess = _mm_div_ps(excess, distance); + // round down – for some reason, SSE doesn't come with a 4x float -> 4x int32 + // conversion. 
+ excess = _mm_cvtepi32_ps(_mm_cvttps_epi32(excess)); + // plus 1 + adj = _mm_set_ps1(1.0f); + excess = _mm_add_ps(excess, adj); + // get the sign right, adj is still {1.0f,1.0f,1.0f,1.0f} + adj = _mm_and_ps(adj, is_smaller); + adj = _mm_or_ps(_mm_and_ps(_mm_set_ps1(-1.0f), is_bigger), adj); + // scale by distance, sign + excess = _mm_mul_ps(_mm_mul_ps(excess, adj), distance); + output = _mm_add_ps(input, excess); + _mm_store_ps(outPtr, output); + inPtr += 4; + outPtr += 4; } - else if(val > upper_bound){ - float excess = val - upper_bound; - signed int count = (int)(excess/dist); - outputVector[cnt] = val - (count+1)*dist; + + float dist = upper_bound - lower_bound; + size_t cnt; + for (cnt = quarter_points * 4; cnt < num_points; cnt++) { + float val = inputVector[cnt]; + if (val < lower_bound) { + float excess = lower_bound - val; + signed int count = (int)(excess / dist); + outputVector[cnt] = val + (count + 1) * dist; + } else if (val > upper_bound) { + float excess = val - upper_bound; + signed int count = (int)(excess / dist); + outputVector[cnt] = val - (count + 1) * dist; + } else + outputVector[cnt] = val; } - else - outputVector[cnt] = val; - } } #endif /* LV_HAVE_SSE2 */ #ifdef LV_HAVE_SSE #include -static inline void volk_32f_s32f_s32f_mod_range_32f_u_sse(float* outputVector, const float* inputVector, const float lower_bound, const float upper_bound, unsigned int num_points){ - __m128 lower = _mm_set_ps1(lower_bound); - __m128 upper = _mm_set_ps1(upper_bound); - __m128 distance = _mm_sub_ps(upper,lower); - float dist = upper_bound - lower_bound; - __m128 input, output; - __m128 is_smaller, is_bigger; - __m128 excess, adj; - __m128i rounddown; - - const float *inPtr = inputVector; - float *outPtr = outputVector; - size_t quarter_points = num_points / 4; - size_t counter; - for(counter = 0; counter < quarter_points; counter++) { - input = _mm_load_ps(inPtr); - // calculate mask: input < lower, input > upper - is_smaller = _mm_cmplt_ps(input, lower); - is_bigger = _mm_cmpgt_ps(input, upper); - // find out how far we are out-of-bound – positive values! - excess = _mm_and_ps(_mm_sub_ps(lower, input), is_smaller); - excess = _mm_or_ps(_mm_and_ps(_mm_sub_ps(input, upper), is_bigger), excess); - // how many do we have to add? 
(int(excess/distance+1)*distance) - excess = _mm_div_ps(excess, distance); - // round down – for some reason - rounddown = _mm_cvttps_epi32(excess); - excess = _mm_cvtepi32_ps(rounddown); - // plus 1 - adj = _mm_set_ps1(1.0f); - excess = _mm_add_ps(excess, adj); - // get the sign right, adj is still {1.0f,1.0f,1.0f,1.0f} - adj = _mm_and_ps(adj, is_smaller); - adj = _mm_or_ps(_mm_and_ps(_mm_set_ps1(-1.0f), is_bigger), adj); - // scale by distance, sign - excess = _mm_mul_ps(_mm_mul_ps(excess, adj), distance); - output = _mm_add_ps(input, excess); - _mm_store_ps(outPtr, output); - inPtr += 4; - outPtr += 4; - } - - size_t cnt; - for(cnt = quarter_points * 4; cnt < num_points; cnt++){ - float val = inputVector[cnt]; - if(val < lower_bound){ - float excess = lower_bound - val; - signed int count = (int)(excess/dist); - outputVector[cnt] = val + (count+1)*dist; +static inline void volk_32f_s32f_s32f_mod_range_32f_u_sse(float* outputVector, + const float* inputVector, + const float lower_bound, + const float upper_bound, + unsigned int num_points) +{ + __m128 lower = _mm_set_ps1(lower_bound); + __m128 upper = _mm_set_ps1(upper_bound); + __m128 distance = _mm_sub_ps(upper, lower); + float dist = upper_bound - lower_bound; + __m128 input, output; + __m128 is_smaller, is_bigger; + __m128 excess, adj; + __m128i rounddown; + + const float* inPtr = inputVector; + float* outPtr = outputVector; + size_t quarter_points = num_points / 4; + size_t counter; + for (counter = 0; counter < quarter_points; counter++) { + input = _mm_load_ps(inPtr); + // calculate mask: input < lower, input > upper + is_smaller = _mm_cmplt_ps(input, lower); + is_bigger = _mm_cmpgt_ps(input, upper); + // find out how far we are out-of-bound – positive values! + excess = _mm_and_ps(_mm_sub_ps(lower, input), is_smaller); + excess = _mm_or_ps(_mm_and_ps(_mm_sub_ps(input, upper), is_bigger), excess); + // how many do we have to add? 
(int(excess/distance+1)*distance) + excess = _mm_div_ps(excess, distance); + // round down – for some reason + rounddown = _mm_cvttps_epi32(excess); + excess = _mm_cvtepi32_ps(rounddown); + // plus 1 + adj = _mm_set_ps1(1.0f); + excess = _mm_add_ps(excess, adj); + // get the sign right, adj is still {1.0f,1.0f,1.0f,1.0f} + adj = _mm_and_ps(adj, is_smaller); + adj = _mm_or_ps(_mm_and_ps(_mm_set_ps1(-1.0f), is_bigger), adj); + // scale by distance, sign + excess = _mm_mul_ps(_mm_mul_ps(excess, adj), distance); + output = _mm_add_ps(input, excess); + _mm_store_ps(outPtr, output); + inPtr += 4; + outPtr += 4; } - else if(val > upper_bound){ - float excess = val - upper_bound; - signed int count = (int)(excess/dist); - outputVector[cnt] = val - (count+1)*dist; + + size_t cnt; + for (cnt = quarter_points * 4; cnt < num_points; cnt++) { + float val = inputVector[cnt]; + if (val < lower_bound) { + float excess = lower_bound - val; + signed int count = (int)(excess / dist); + outputVector[cnt] = val + (count + 1) * dist; + } else if (val > upper_bound) { + float excess = val - upper_bound; + signed int count = (int)(excess / dist); + outputVector[cnt] = val - (count + 1) * dist; + } else + outputVector[cnt] = val; } - else - outputVector[cnt] = val; - } } -static inline void volk_32f_s32f_s32f_mod_range_32f_a_sse(float* outputVector, const float* inputVector, const float lower_bound, const float upper_bound, unsigned int num_points){ - __m128 lower = _mm_set_ps1(lower_bound); - __m128 upper = _mm_set_ps1(upper_bound); - __m128 distance = _mm_sub_ps(upper,lower); - __m128 input, output; - __m128 is_smaller, is_bigger; - __m128 excess, adj; - __m128i rounddown; - - const float *inPtr = inputVector; - float *outPtr = outputVector; - size_t quarter_points = num_points / 4; - size_t counter; - for(counter = 0; counter < quarter_points; counter++) { - input = _mm_load_ps(inPtr); - // calculate mask: input < lower, input > upper - is_smaller = _mm_cmplt_ps(input, lower); - is_bigger = _mm_cmpgt_ps(input, upper); - // find out how far we are out-of-bound – positive values! - excess = _mm_and_ps(_mm_sub_ps(lower, input), is_smaller); - excess = _mm_or_ps(_mm_and_ps(_mm_sub_ps(input, upper), is_bigger), excess); - // how many do we have to add? 
(int(excess/distance+1)*distance) - excess = _mm_div_ps(excess, distance); - // round down - rounddown = _mm_cvttps_epi32(excess); - excess = _mm_cvtepi32_ps(rounddown); - // plus 1 - adj = _mm_set_ps1(1.0f); - excess = _mm_add_ps(excess, adj); - // get the sign right, adj is still {1.0f,1.0f,1.0f,1.0f} - adj = _mm_and_ps(adj, is_smaller); - adj = _mm_or_ps(_mm_and_ps(_mm_set_ps1(-1.0f), is_bigger), adj); - // scale by distance, sign - excess = _mm_mul_ps(_mm_mul_ps(excess, adj), distance); - output = _mm_add_ps(input, excess); - _mm_store_ps(outPtr, output); - inPtr += 4; - outPtr += 4; - } - - float dist = upper_bound - lower_bound; - size_t cnt; - for(cnt = quarter_points * 4; cnt < num_points; cnt++){ - float val = inputVector[cnt]; - if(val < lower_bound){ - float excess = lower_bound - val; - signed int count = (int)(excess/dist); - outputVector[cnt] = val + (count+1)*dist; +static inline void volk_32f_s32f_s32f_mod_range_32f_a_sse(float* outputVector, + const float* inputVector, + const float lower_bound, + const float upper_bound, + unsigned int num_points) +{ + __m128 lower = _mm_set_ps1(lower_bound); + __m128 upper = _mm_set_ps1(upper_bound); + __m128 distance = _mm_sub_ps(upper, lower); + __m128 input, output; + __m128 is_smaller, is_bigger; + __m128 excess, adj; + __m128i rounddown; + + const float* inPtr = inputVector; + float* outPtr = outputVector; + size_t quarter_points = num_points / 4; + size_t counter; + for (counter = 0; counter < quarter_points; counter++) { + input = _mm_load_ps(inPtr); + // calculate mask: input < lower, input > upper + is_smaller = _mm_cmplt_ps(input, lower); + is_bigger = _mm_cmpgt_ps(input, upper); + // find out how far we are out-of-bound – positive values! + excess = _mm_and_ps(_mm_sub_ps(lower, input), is_smaller); + excess = _mm_or_ps(_mm_and_ps(_mm_sub_ps(input, upper), is_bigger), excess); + // how many do we have to add? 
(int(excess/distance+1)*distance) + excess = _mm_div_ps(excess, distance); + // round down + rounddown = _mm_cvttps_epi32(excess); + excess = _mm_cvtepi32_ps(rounddown); + // plus 1 + adj = _mm_set_ps1(1.0f); + excess = _mm_add_ps(excess, adj); + // get the sign right, adj is still {1.0f,1.0f,1.0f,1.0f} + adj = _mm_and_ps(adj, is_smaller); + adj = _mm_or_ps(_mm_and_ps(_mm_set_ps1(-1.0f), is_bigger), adj); + // scale by distance, sign + excess = _mm_mul_ps(_mm_mul_ps(excess, adj), distance); + output = _mm_add_ps(input, excess); + _mm_store_ps(outPtr, output); + inPtr += 4; + outPtr += 4; } - else if(val > upper_bound){ - float excess = val - upper_bound; - signed int count = (int)(excess/dist); - outputVector[cnt] = val - (count+1)*dist; + + float dist = upper_bound - lower_bound; + size_t cnt; + for (cnt = quarter_points * 4; cnt < num_points; cnt++) { + float val = inputVector[cnt]; + if (val < lower_bound) { + float excess = lower_bound - val; + signed int count = (int)(excess / dist); + outputVector[cnt] = val + (count + 1) * dist; + } else if (val > upper_bound) { + float excess = val - upper_bound; + signed int count = (int)(excess / dist); + outputVector[cnt] = val - (count + 1) * dist; + } else + outputVector[cnt] = val; } - else - outputVector[cnt] = val; - } } #endif /* LV_HAVE_SSE */ #ifdef LV_HAVE_GENERIC -static inline void volk_32f_s32f_s32f_mod_range_32f_generic(float* outputVector, const float* inputVector, const float lower_bound, const float upper_bound, unsigned int num_points){ - float* outPtr = outputVector; - const float *inPtr; - float distance = upper_bound - lower_bound; - - for(inPtr = inputVector; inPtr < inputVector + num_points; inPtr++){ - float val = *inPtr; - if(val < lower_bound){ - float excess = lower_bound - val; - signed int count = (int)(excess/distance); - *outPtr = val + (count+1)*distance; - } - else if(val > upper_bound){ - float excess = val - upper_bound; - signed int count = (int)(excess/distance); - *outPtr = val - (count+1)*distance; +static inline void volk_32f_s32f_s32f_mod_range_32f_generic(float* outputVector, + const float* inputVector, + const float lower_bound, + const float upper_bound, + unsigned int num_points) +{ + float* outPtr = outputVector; + const float* inPtr; + float distance = upper_bound - lower_bound; + + for (inPtr = inputVector; inPtr < inputVector + num_points; inPtr++) { + float val = *inPtr; + if (val < lower_bound) { + float excess = lower_bound - val; + signed int count = (int)(excess / distance); + *outPtr = val + (count + 1) * distance; + } else if (val > upper_bound) { + float excess = val - upper_bound; + signed int count = (int)(excess / distance); + *outPtr = val - (count + 1) * distance; + } else + *outPtr = val; + outPtr++; } - else - *outPtr = val; - outPtr++; - } } #endif /* LV_HAVE_GENERIC */ - - #endif /* INCLUDED_VOLK_32F_S32F_S32F_MOD_RANGE_32F_A_H */ diff --git a/kernels/volk/volk_32f_s32f_stddev_32f.h b/kernels/volk/volk_32f_s32f_stddev_32f.h index 4f3dc1c..0a1c32b 100644 --- a/kernels/volk/volk_32f_s32f_stddev_32f.h +++ b/kernels/volk/volk_32f_s32f_stddev_32f.h @@ -29,8 +29,8 @@ * * Dispatcher Prototype * \code - * void volk_32f_s32f_stddev_32f(float* stddev, const float* inputBuffer, const float mean, unsigned int num_points) - * \endcode + * void volk_32f_s32f_stddev_32f(float* stddev, const float* inputBuffer, const float + * mean, unsigned int num_points) \endcode * * \b Inputs * \li inputBuffer: The input vector of floats. 
@@ -68,65 +68,72 @@ #ifndef INCLUDED_volk_32f_s32f_stddev_32f_a_H #define INCLUDED_volk_32f_s32f_stddev_32f_a_H -#include #include -#include #include +#include +#include #ifdef LV_HAVE_SSE4_1 #include -static inline void -volk_32f_s32f_stddev_32f_a_sse4_1(float* stddev, const float* inputBuffer, - const float mean, unsigned int num_points) +static inline void volk_32f_s32f_stddev_32f_a_sse4_1(float* stddev, + const float* inputBuffer, + const float mean, + unsigned int num_points) { - float returnValue = 0; - if(num_points > 0){ - unsigned int number = 0; - const unsigned int sixteenthPoints = num_points / 16; - - const float* aPtr = inputBuffer; - - __VOLK_ATTR_ALIGNED(16) float squareBuffer[4]; - - __m128 squareAccumulator = _mm_setzero_ps(); - __m128 aVal1, aVal2, aVal3, aVal4; - __m128 cVal1, cVal2, cVal3, cVal4; - for(;number < sixteenthPoints; number++) { - aVal1 = _mm_load_ps(aPtr); aPtr += 4; - cVal1 = _mm_dp_ps(aVal1, aVal1, 0xF1); - - aVal2 = _mm_load_ps(aPtr); aPtr += 4; - cVal2 = _mm_dp_ps(aVal2, aVal2, 0xF2); - - aVal3 = _mm_load_ps(aPtr); aPtr += 4; - cVal3 = _mm_dp_ps(aVal3, aVal3, 0xF4); - - aVal4 = _mm_load_ps(aPtr); aPtr += 4; - cVal4 = _mm_dp_ps(aVal4, aVal4, 0xF8); - - cVal1 = _mm_or_ps(cVal1, cVal2); - cVal3 = _mm_or_ps(cVal3, cVal4); - cVal1 = _mm_or_ps(cVal1, cVal3); - - squareAccumulator = _mm_add_ps(squareAccumulator, cVal1); // squareAccumulator += x^2 + float returnValue = 0; + if (num_points > 0) { + unsigned int number = 0; + const unsigned int sixteenthPoints = num_points / 16; + + const float* aPtr = inputBuffer; + + __VOLK_ATTR_ALIGNED(16) float squareBuffer[4]; + + __m128 squareAccumulator = _mm_setzero_ps(); + __m128 aVal1, aVal2, aVal3, aVal4; + __m128 cVal1, cVal2, cVal3, cVal4; + for (; number < sixteenthPoints; number++) { + aVal1 = _mm_load_ps(aPtr); + aPtr += 4; + cVal1 = _mm_dp_ps(aVal1, aVal1, 0xF1); + + aVal2 = _mm_load_ps(aPtr); + aPtr += 4; + cVal2 = _mm_dp_ps(aVal2, aVal2, 0xF2); + + aVal3 = _mm_load_ps(aPtr); + aPtr += 4; + cVal3 = _mm_dp_ps(aVal3, aVal3, 0xF4); + + aVal4 = _mm_load_ps(aPtr); + aPtr += 4; + cVal4 = _mm_dp_ps(aVal4, aVal4, 0xF8); + + cVal1 = _mm_or_ps(cVal1, cVal2); + cVal3 = _mm_or_ps(cVal3, cVal4); + cVal1 = _mm_or_ps(cVal1, cVal3); + + squareAccumulator = + _mm_add_ps(squareAccumulator, cVal1); // squareAccumulator += x^2 + } + _mm_store_ps(squareBuffer, + squareAccumulator); // Store the results back into the C container + returnValue = squareBuffer[0]; + returnValue += squareBuffer[1]; + returnValue += squareBuffer[2]; + returnValue += squareBuffer[3]; + + number = sixteenthPoints * 16; + for (; number < num_points; number++) { + returnValue += (*aPtr) * (*aPtr); + aPtr++; + } + returnValue /= num_points; + returnValue -= (mean * mean); + returnValue = sqrtf(returnValue); } - _mm_store_ps(squareBuffer,squareAccumulator); // Store the results back into the C container - returnValue = squareBuffer[0]; - returnValue += squareBuffer[1]; - returnValue += squareBuffer[2]; - returnValue += squareBuffer[3]; - - number = sixteenthPoints * 16; - for(;number < num_points; number++){ - returnValue += (*aPtr) * (*aPtr); - aPtr++; - } - returnValue /= num_points; - returnValue -= (mean * mean); - returnValue = sqrtf(returnValue); - } - *stddev = returnValue; + *stddev = returnValue; } #endif /* LV_HAVE_SSE4_1 */ @@ -134,43 +141,45 @@ volk_32f_s32f_stddev_32f_a_sse4_1(float* stddev, const float* inputBuffer, #ifdef LV_HAVE_SSE #include -static inline void -volk_32f_s32f_stddev_32f_a_sse(float* stddev, const float* inputBuffer, - const 
float mean, unsigned int num_points) +static inline void volk_32f_s32f_stddev_32f_a_sse(float* stddev, + const float* inputBuffer, + const float mean, + unsigned int num_points) { - float returnValue = 0; - if(num_points > 0){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - const float* aPtr = inputBuffer; - - __VOLK_ATTR_ALIGNED(16) float squareBuffer[4]; - - __m128 squareAccumulator = _mm_setzero_ps(); - __m128 aVal = _mm_setzero_ps(); - for(;number < quarterPoints; number++) { - aVal = _mm_load_ps(aPtr); // aVal = x - aVal = _mm_mul_ps(aVal, aVal); // squareAccumulator += x^2 - squareAccumulator = _mm_add_ps(squareAccumulator, aVal); - aPtr += 4; - } - _mm_store_ps(squareBuffer,squareAccumulator); // Store the results back into the C container - returnValue = squareBuffer[0]; - returnValue += squareBuffer[1]; - returnValue += squareBuffer[2]; - returnValue += squareBuffer[3]; - - number = quarterPoints * 4; - for(;number < num_points; number++){ - returnValue += (*aPtr) * (*aPtr); - aPtr++; + float returnValue = 0; + if (num_points > 0) { + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + const float* aPtr = inputBuffer; + + __VOLK_ATTR_ALIGNED(16) float squareBuffer[4]; + + __m128 squareAccumulator = _mm_setzero_ps(); + __m128 aVal = _mm_setzero_ps(); + for (; number < quarterPoints; number++) { + aVal = _mm_load_ps(aPtr); // aVal = x + aVal = _mm_mul_ps(aVal, aVal); // squareAccumulator += x^2 + squareAccumulator = _mm_add_ps(squareAccumulator, aVal); + aPtr += 4; + } + _mm_store_ps(squareBuffer, + squareAccumulator); // Store the results back into the C container + returnValue = squareBuffer[0]; + returnValue += squareBuffer[1]; + returnValue += squareBuffer[2]; + returnValue += squareBuffer[3]; + + number = quarterPoints * 4; + for (; number < num_points; number++) { + returnValue += (*aPtr) * (*aPtr); + aPtr++; + } + returnValue /= num_points; + returnValue -= (mean * mean); + returnValue = sqrtf(returnValue); } - returnValue /= num_points; - returnValue -= (mean * mean); - returnValue = sqrtf(returnValue); - } - *stddev = returnValue; + *stddev = returnValue; } #endif /* LV_HAVE_SSE */ @@ -178,86 +187,93 @@ volk_32f_s32f_stddev_32f_a_sse(float* stddev, const float* inputBuffer, #ifdef LV_HAVE_AVX #include -static inline void -volk_32f_s32f_stddev_32f_a_avx(float* stddev, const float* inputBuffer, - const float mean, unsigned int num_points) +static inline void volk_32f_s32f_stddev_32f_a_avx(float* stddev, + const float* inputBuffer, + const float mean, + unsigned int num_points) { - float stdDev = 0; - if(num_points > 0){ - unsigned int number = 0; - const unsigned int thirtySecondthPoints = num_points / 32; - - const float* aPtr = inputBuffer; - __VOLK_ATTR_ALIGNED(32) float squareBuffer[8]; - - __m256 squareAccumulator = _mm256_setzero_ps(); - __m256 aVal1, aVal2, aVal3, aVal4; - __m256 cVal1, cVal2, cVal3, cVal4; - for(;number < thirtySecondthPoints; number++) { - aVal1 = _mm256_load_ps(aPtr); aPtr += 8; - cVal1 = _mm256_dp_ps(aVal1, aVal1, 0xF1); - - aVal2 = _mm256_load_ps(aPtr); aPtr += 8; - cVal2 = _mm256_dp_ps(aVal2, aVal2, 0xF2); - - aVal3 = _mm256_load_ps(aPtr); aPtr += 8; - cVal3 = _mm256_dp_ps(aVal3, aVal3, 0xF4); - - aVal4 = _mm256_load_ps(aPtr); aPtr += 8; - cVal4 = _mm256_dp_ps(aVal4, aVal4, 0xF8); - - cVal1 = _mm256_or_ps(cVal1, cVal2); - cVal3 = _mm256_or_ps(cVal3, cVal4); - cVal1 = _mm256_or_ps(cVal1, cVal3); - - squareAccumulator = _mm256_add_ps(squareAccumulator, cVal1); // squareAccumulator += 
x^2 + float stdDev = 0; + if (num_points > 0) { + unsigned int number = 0; + const unsigned int thirtySecondthPoints = num_points / 32; + + const float* aPtr = inputBuffer; + __VOLK_ATTR_ALIGNED(32) float squareBuffer[8]; + + __m256 squareAccumulator = _mm256_setzero_ps(); + __m256 aVal1, aVal2, aVal3, aVal4; + __m256 cVal1, cVal2, cVal3, cVal4; + for (; number < thirtySecondthPoints; number++) { + aVal1 = _mm256_load_ps(aPtr); + aPtr += 8; + cVal1 = _mm256_dp_ps(aVal1, aVal1, 0xF1); + + aVal2 = _mm256_load_ps(aPtr); + aPtr += 8; + cVal2 = _mm256_dp_ps(aVal2, aVal2, 0xF2); + + aVal3 = _mm256_load_ps(aPtr); + aPtr += 8; + cVal3 = _mm256_dp_ps(aVal3, aVal3, 0xF4); + + aVal4 = _mm256_load_ps(aPtr); + aPtr += 8; + cVal4 = _mm256_dp_ps(aVal4, aVal4, 0xF8); + + cVal1 = _mm256_or_ps(cVal1, cVal2); + cVal3 = _mm256_or_ps(cVal3, cVal4); + cVal1 = _mm256_or_ps(cVal1, cVal3); + + squareAccumulator = + _mm256_add_ps(squareAccumulator, cVal1); // squareAccumulator += x^2 + } + _mm256_store_ps(squareBuffer, + squareAccumulator); // Store the results back into the C container + stdDev = squareBuffer[0]; + stdDev += squareBuffer[1]; + stdDev += squareBuffer[2]; + stdDev += squareBuffer[3]; + stdDev += squareBuffer[4]; + stdDev += squareBuffer[5]; + stdDev += squareBuffer[6]; + stdDev += squareBuffer[7]; + + number = thirtySecondthPoints * 32; + for (; number < num_points; number++) { + stdDev += (*aPtr) * (*aPtr); + aPtr++; + } + stdDev /= num_points; + stdDev -= (mean * mean); + stdDev = sqrtf(stdDev); } - _mm256_store_ps(squareBuffer,squareAccumulator); // Store the results back into the C container - stdDev = squareBuffer[0]; - stdDev += squareBuffer[1]; - stdDev += squareBuffer[2]; - stdDev += squareBuffer[3]; - stdDev += squareBuffer[4]; - stdDev += squareBuffer[5]; - stdDev += squareBuffer[6]; - stdDev += squareBuffer[7]; - - number = thirtySecondthPoints * 32; - for(;number < num_points; number++){ - stdDev += (*aPtr) * (*aPtr); - aPtr++; - } - stdDev /= num_points; - stdDev -= (mean * mean); - stdDev = sqrtf(stdDev); - } - *stddev = stdDev; - + *stddev = stdDev; } #endif /* LV_HAVE_AVX */ #ifdef LV_HAVE_GENERIC -static inline void -volk_32f_s32f_stddev_32f_generic(float* stddev, const float* inputBuffer, - const float mean, unsigned int num_points) +static inline void volk_32f_s32f_stddev_32f_generic(float* stddev, + const float* inputBuffer, + const float mean, + unsigned int num_points) { - float returnValue = 0; - if(num_points > 0){ - const float* aPtr = inputBuffer; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - returnValue += (*aPtr) * (*aPtr); - aPtr++; + float returnValue = 0; + if (num_points > 0) { + const float* aPtr = inputBuffer; + unsigned int number = 0; + + for (number = 0; number < num_points; number++) { + returnValue += (*aPtr) * (*aPtr); + aPtr++; + } + + returnValue /= num_points; + returnValue -= (mean * mean); + returnValue = sqrtf(returnValue); } - - returnValue /= num_points; - returnValue -= (mean * mean); - returnValue = sqrtf(returnValue); - } - *stddev = returnValue; + *stddev = returnValue; } #endif /* LV_HAVE_GENERIC */ @@ -268,69 +284,76 @@ volk_32f_s32f_stddev_32f_generic(float* stddev, const float* inputBuffer, #ifndef INCLUDED_volk_32f_s32f_stddev_32f_u_H #define INCLUDED_volk_32f_s32f_stddev_32f_u_H -#include #include -#include #include +#include +#include #ifdef LV_HAVE_AVX #include -static inline void -volk_32f_s32f_stddev_32f_u_avx(float* stddev, const float* inputBuffer, - const float mean, unsigned int num_points) +static 
inline void volk_32f_s32f_stddev_32f_u_avx(float* stddev, + const float* inputBuffer, + const float mean, + unsigned int num_points) { - float stdDev = 0; - if(num_points > 0){ - unsigned int number = 0; - const unsigned int thirtySecondthPoints = num_points / 32; - - const float* aPtr = inputBuffer; - __VOLK_ATTR_ALIGNED(32) float squareBuffer[8]; - - __m256 squareAccumulator = _mm256_setzero_ps(); - __m256 aVal1, aVal2, aVal3, aVal4; - __m256 cVal1, cVal2, cVal3, cVal4; - for(;number < thirtySecondthPoints; number++) { - aVal1 = _mm256_loadu_ps(aPtr); aPtr += 8; - cVal1 = _mm256_dp_ps(aVal1, aVal1, 0xF1); - - aVal2 = _mm256_loadu_ps(aPtr); aPtr += 8; - cVal2 = _mm256_dp_ps(aVal2, aVal2, 0xF2); - - aVal3 = _mm256_loadu_ps(aPtr); aPtr += 8; - cVal3 = _mm256_dp_ps(aVal3, aVal3, 0xF4); - - aVal4 = _mm256_loadu_ps(aPtr); aPtr += 8; - cVal4 = _mm256_dp_ps(aVal4, aVal4, 0xF8); - - cVal1 = _mm256_or_ps(cVal1, cVal2); - cVal3 = _mm256_or_ps(cVal3, cVal4); - cVal1 = _mm256_or_ps(cVal1, cVal3); - - squareAccumulator = _mm256_add_ps(squareAccumulator, cVal1); // squareAccumulator += x^2 + float stdDev = 0; + if (num_points > 0) { + unsigned int number = 0; + const unsigned int thirtySecondthPoints = num_points / 32; + + const float* aPtr = inputBuffer; + __VOLK_ATTR_ALIGNED(32) float squareBuffer[8]; + + __m256 squareAccumulator = _mm256_setzero_ps(); + __m256 aVal1, aVal2, aVal3, aVal4; + __m256 cVal1, cVal2, cVal3, cVal4; + for (; number < thirtySecondthPoints; number++) { + aVal1 = _mm256_loadu_ps(aPtr); + aPtr += 8; + cVal1 = _mm256_dp_ps(aVal1, aVal1, 0xF1); + + aVal2 = _mm256_loadu_ps(aPtr); + aPtr += 8; + cVal2 = _mm256_dp_ps(aVal2, aVal2, 0xF2); + + aVal3 = _mm256_loadu_ps(aPtr); + aPtr += 8; + cVal3 = _mm256_dp_ps(aVal3, aVal3, 0xF4); + + aVal4 = _mm256_loadu_ps(aPtr); + aPtr += 8; + cVal4 = _mm256_dp_ps(aVal4, aVal4, 0xF8); + + cVal1 = _mm256_or_ps(cVal1, cVal2); + cVal3 = _mm256_or_ps(cVal3, cVal4); + cVal1 = _mm256_or_ps(cVal1, cVal3); + + squareAccumulator = + _mm256_add_ps(squareAccumulator, cVal1); // squareAccumulator += x^2 + } + _mm256_storeu_ps( + squareBuffer, + squareAccumulator); // Store the results back into the C container + stdDev = squareBuffer[0]; + stdDev += squareBuffer[1]; + stdDev += squareBuffer[2]; + stdDev += squareBuffer[3]; + stdDev += squareBuffer[4]; + stdDev += squareBuffer[5]; + stdDev += squareBuffer[6]; + stdDev += squareBuffer[7]; + + number = thirtySecondthPoints * 32; + for (; number < num_points; number++) { + stdDev += (*aPtr) * (*aPtr); + aPtr++; + } + stdDev /= num_points; + stdDev -= (mean * mean); + stdDev = sqrtf(stdDev); } - _mm256_storeu_ps(squareBuffer,squareAccumulator); // Store the results back into the C container - stdDev = squareBuffer[0]; - stdDev += squareBuffer[1]; - stdDev += squareBuffer[2]; - stdDev += squareBuffer[3]; - stdDev += squareBuffer[4]; - stdDev += squareBuffer[5]; - stdDev += squareBuffer[6]; - stdDev += squareBuffer[7]; - - number = thirtySecondthPoints * 32; - for(;number < num_points; number++){ - stdDev += (*aPtr) * (*aPtr); - aPtr++; - } - stdDev /= num_points; - stdDev -= (mean * mean); - stdDev = sqrtf(stdDev); - } - *stddev = stdDev; - + *stddev = stdDev; } #endif /* LV_HAVE_AVX */ diff --git a/kernels/volk/volk_32f_sin_32f.h b/kernels/volk/volk_32f_sin_32f.h index 3780086..e65f25a 100644 --- a/kernels/volk/volk_32f_sin_32f.h +++ b/kernels/volk/volk_32f_sin_32f.h @@ -69,9 +69,9 @@ * \endcode */ -#include -#include #include +#include +#include #ifndef INCLUDED_volk_32f_sin_32f_a_H #define 
INCLUDED_volk_32f_sin_32f_a_H @@ -83,72 +83,93 @@ static inline void volk_32f_sin_32f_a_avx2_fma(float* bVector, const float* aVector, unsigned int num_points) { - float* bPtr = bVector; - const float* aPtr = aVector; - - unsigned int number = 0; - unsigned int eighthPoints = num_points / 8; - unsigned int i = 0; - - __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, fzeroes; - __m256 sine, cosine, condition1, condition2; - __m256i q, r, ones, twos, fours; - - m4pi = _mm256_set1_ps(1.273239545); - pio4A = _mm256_set1_ps(0.78515625); - pio4B = _mm256_set1_ps(0.241876e-3); - ffours = _mm256_set1_ps(4.0); - ftwos = _mm256_set1_ps(2.0); - fones = _mm256_set1_ps(1.0); - fzeroes = _mm256_setzero_ps(); - ones = _mm256_set1_epi32(1); - twos = _mm256_set1_epi32(2); - fours = _mm256_set1_epi32(4); - - cp1 = _mm256_set1_ps(1.0); - cp2 = _mm256_set1_ps(0.83333333e-1); - cp3 = _mm256_set1_ps(0.2777778e-2); - cp4 = _mm256_set1_ps(0.49603e-4); - cp5 = _mm256_set1_ps(0.551e-6); - - for(;number < eighthPoints; number++) { - aVal = _mm256_load_ps(aPtr); - s = _mm256_sub_ps(aVal, _mm256_and_ps(_mm256_mul_ps(aVal, ftwos), _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS))); - q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi))); - r = _mm256_add_epi32(q, _mm256_and_si256(q, ones)); - - s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4A, s); - s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4B, s); - - s = _mm256_div_ps(s, _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction - s = _mm256_mul_ps(s, s); - // Evaluate Taylor series - s = _mm256_mul_ps(_mm256_fmadd_ps(_mm256_fmsub_ps(_mm256_fmadd_ps(_mm256_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2), s, cp1), s); - - for(i = 0; i < 3; i++) { - s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s)); + float* bPtr = bVector; + const float* aPtr = aVector; + + unsigned int number = 0; + unsigned int eighthPoints = num_points / 8; + unsigned int i = 0; + + __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, + fzeroes; + __m256 sine, cosine, condition1, condition2; + __m256i q, r, ones, twos, fours; + + m4pi = _mm256_set1_ps(1.273239545); + pio4A = _mm256_set1_ps(0.78515625); + pio4B = _mm256_set1_ps(0.241876e-3); + ffours = _mm256_set1_ps(4.0); + ftwos = _mm256_set1_ps(2.0); + fones = _mm256_set1_ps(1.0); + fzeroes = _mm256_setzero_ps(); + ones = _mm256_set1_epi32(1); + twos = _mm256_set1_epi32(2); + fours = _mm256_set1_epi32(4); + + cp1 = _mm256_set1_ps(1.0); + cp2 = _mm256_set1_ps(0.83333333e-1); + cp3 = _mm256_set1_ps(0.2777778e-2); + cp4 = _mm256_set1_ps(0.49603e-4); + cp5 = _mm256_set1_ps(0.551e-6); + + for (; number < eighthPoints; number++) { + aVal = _mm256_load_ps(aPtr); + s = _mm256_sub_ps(aVal, + _mm256_and_ps(_mm256_mul_ps(aVal, ftwos), + _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS))); + q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi))); + r = _mm256_add_epi32(q, _mm256_and_si256(q, ones)); + + s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4A, s); + s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4B, s); + + s = _mm256_div_ps( + s, + _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction + s = _mm256_mul_ps(s, s); + // Evaluate Taylor series + s = _mm256_mul_ps( + _mm256_fmadd_ps( + _mm256_fmsub_ps( + _mm256_fmadd_ps(_mm256_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2), + s, + cp1), + s); + + for (i = 0; i < 3; i++) { + s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s)); + } + s = _mm256_div_ps(s, ftwos); + + sine = 
_mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s)); + cosine = _mm256_sub_ps(fones, s); + + condition1 = _mm256_cmp_ps( + _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)), + fzeroes, + _CMP_NEQ_UQ); + condition2 = _mm256_cmp_ps( + _mm256_cmp_ps( + _mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes, _CMP_NEQ_UQ), + _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS), + _CMP_NEQ_UQ); + // Need this condition only for cos + // condition3 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, + // twos), fours)), fzeroes); + + sine = + _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(cosine, sine), condition1)); + sine = _mm256_sub_ps( + sine, _mm256_and_ps(_mm256_mul_ps(sine, _mm256_set1_ps(2.0f)), condition2)); + _mm256_store_ps(bPtr, sine); + aPtr += 8; + bPtr += 8; + } + + number = eighthPoints * 8; + for (; number < num_points; number++) { + *bPtr++ = sin(*aPtr++); } - s = _mm256_div_ps(s, ftwos); - - sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s)); - cosine = _mm256_sub_ps(fones, s); - - condition1 = _mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)), fzeroes, _CMP_NEQ_UQ); - condition2 = _mm256_cmp_ps(_mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes, _CMP_NEQ_UQ), _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS), _CMP_NEQ_UQ); - // Need this condition only for cos - //condition3 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, twos), fours)), fzeroes); - - sine = _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(cosine, sine), condition1)); - sine = _mm256_sub_ps(sine, _mm256_and_ps(_mm256_mul_ps(sine, _mm256_set1_ps(2.0f)), condition2)); - _mm256_store_ps(bPtr, sine); - aPtr += 8; - bPtr += 8; - } - - number = eighthPoints * 8; - for(;number < num_points; number++) { - *bPtr++ = sin(*aPtr++); - } } #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for aligned */ @@ -159,72 +180,100 @@ volk_32f_sin_32f_a_avx2_fma(float* bVector, const float* aVector, unsigned int n static inline void volk_32f_sin_32f_a_avx2(float* bVector, const float* aVector, unsigned int num_points) { - float* bPtr = bVector; - const float* aPtr = aVector; - - unsigned int number = 0; - unsigned int eighthPoints = num_points / 8; - unsigned int i = 0; - - __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, fzeroes; - __m256 sine, cosine, condition1, condition2; - __m256i q, r, ones, twos, fours; - - m4pi = _mm256_set1_ps(1.273239545); - pio4A = _mm256_set1_ps(0.78515625); - pio4B = _mm256_set1_ps(0.241876e-3); - ffours = _mm256_set1_ps(4.0); - ftwos = _mm256_set1_ps(2.0); - fones = _mm256_set1_ps(1.0); - fzeroes = _mm256_setzero_ps(); - ones = _mm256_set1_epi32(1); - twos = _mm256_set1_epi32(2); - fours = _mm256_set1_epi32(4); - - cp1 = _mm256_set1_ps(1.0); - cp2 = _mm256_set1_ps(0.83333333e-1); - cp3 = _mm256_set1_ps(0.2777778e-2); - cp4 = _mm256_set1_ps(0.49603e-4); - cp5 = _mm256_set1_ps(0.551e-6); - - for(;number < eighthPoints; number++) { - aVal = _mm256_load_ps(aPtr); - s = _mm256_sub_ps(aVal, _mm256_and_ps(_mm256_mul_ps(aVal, ftwos), _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS))); - q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi))); - r = _mm256_add_epi32(q, _mm256_and_si256(q, ones)); - - s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4A)); - s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4B)); - - s = _mm256_div_ps(s, _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction - s = _mm256_mul_ps(s, s); - // Evaluate Taylor series - 
s = _mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(s, cp5), cp4), s), cp3), s), cp2), s), cp1), s); - - for(i = 0; i < 3; i++) { - s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s)); + float* bPtr = bVector; + const float* aPtr = aVector; + + unsigned int number = 0; + unsigned int eighthPoints = num_points / 8; + unsigned int i = 0; + + __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, + fzeroes; + __m256 sine, cosine, condition1, condition2; + __m256i q, r, ones, twos, fours; + + m4pi = _mm256_set1_ps(1.273239545); + pio4A = _mm256_set1_ps(0.78515625); + pio4B = _mm256_set1_ps(0.241876e-3); + ffours = _mm256_set1_ps(4.0); + ftwos = _mm256_set1_ps(2.0); + fones = _mm256_set1_ps(1.0); + fzeroes = _mm256_setzero_ps(); + ones = _mm256_set1_epi32(1); + twos = _mm256_set1_epi32(2); + fours = _mm256_set1_epi32(4); + + cp1 = _mm256_set1_ps(1.0); + cp2 = _mm256_set1_ps(0.83333333e-1); + cp3 = _mm256_set1_ps(0.2777778e-2); + cp4 = _mm256_set1_ps(0.49603e-4); + cp5 = _mm256_set1_ps(0.551e-6); + + for (; number < eighthPoints; number++) { + aVal = _mm256_load_ps(aPtr); + s = _mm256_sub_ps(aVal, + _mm256_and_ps(_mm256_mul_ps(aVal, ftwos), + _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS))); + q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi))); + r = _mm256_add_epi32(q, _mm256_and_si256(q, ones)); + + s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4A)); + s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4B)); + + s = _mm256_div_ps( + s, + _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction + s = _mm256_mul_ps(s, s); + // Evaluate Taylor series + s = _mm256_mul_ps( + _mm256_add_ps( + _mm256_mul_ps( + _mm256_sub_ps( + _mm256_mul_ps( + _mm256_add_ps( + _mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(s, cp5), cp4), + s), + cp3), + s), + cp2), + s), + cp1), + s); + + for (i = 0; i < 3; i++) { + s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s)); + } + s = _mm256_div_ps(s, ftwos); + + sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s)); + cosine = _mm256_sub_ps(fones, s); + + condition1 = _mm256_cmp_ps( + _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)), + fzeroes, + _CMP_NEQ_UQ); + condition2 = _mm256_cmp_ps( + _mm256_cmp_ps( + _mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes, _CMP_NEQ_UQ), + _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS), + _CMP_NEQ_UQ); + // Need this condition only for cos + // condition3 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, + // twos), fours)), fzeroes); + + sine = + _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(cosine, sine), condition1)); + sine = _mm256_sub_ps( + sine, _mm256_and_ps(_mm256_mul_ps(sine, _mm256_set1_ps(2.0f)), condition2)); + _mm256_store_ps(bPtr, sine); + aPtr += 8; + bPtr += 8; + } + + number = eighthPoints * 8; + for (; number < num_points; number++) { + *bPtr++ = sin(*aPtr++); } - s = _mm256_div_ps(s, ftwos); - - sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s)); - cosine = _mm256_sub_ps(fones, s); - - condition1 = _mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)), fzeroes, _CMP_NEQ_UQ); - condition2 = _mm256_cmp_ps(_mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes, _CMP_NEQ_UQ), _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS), _CMP_NEQ_UQ); - // Need this condition only for cos - //condition3 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, twos), fours)), fzeroes); - - sine 
= _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(cosine, sine), condition1)); - sine = _mm256_sub_ps(sine, _mm256_and_ps(_mm256_mul_ps(sine, _mm256_set1_ps(2.0f)), condition2)); - _mm256_store_ps(bPtr, sine); - aPtr += 8; - bPtr += 8; - } - - number = eighthPoints * 8; - for(;number < num_points; number++) { - *bPtr++ = sin(*aPtr++); - } } #endif /* LV_HAVE_AVX2 for aligned */ @@ -235,72 +284,91 @@ volk_32f_sin_32f_a_avx2(float* bVector, const float* aVector, unsigned int num_p static inline void volk_32f_sin_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int num_points) { - float* bPtr = bVector; - const float* aPtr = aVector; - - unsigned int number = 0; - unsigned int quarterPoints = num_points / 4; - unsigned int i = 0; - - __m128 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, fzeroes; - __m128 sine, cosine, condition1, condition2; - __m128i q, r, ones, twos, fours; - - m4pi = _mm_set1_ps(1.273239545); - pio4A = _mm_set1_ps(0.78515625); - pio4B = _mm_set1_ps(0.241876e-3); - ffours = _mm_set1_ps(4.0); - ftwos = _mm_set1_ps(2.0); - fones = _mm_set1_ps(1.0); - fzeroes = _mm_setzero_ps(); - ones = _mm_set1_epi32(1); - twos = _mm_set1_epi32(2); - fours = _mm_set1_epi32(4); - - cp1 = _mm_set1_ps(1.0); - cp2 = _mm_set1_ps(0.83333333e-1); - cp3 = _mm_set1_ps(0.2777778e-2); - cp4 = _mm_set1_ps(0.49603e-4); - cp5 = _mm_set1_ps(0.551e-6); - - for(;number < quarterPoints; number++) { - aVal = _mm_load_ps(aPtr); - s = _mm_sub_ps(aVal, _mm_and_ps(_mm_mul_ps(aVal, ftwos), _mm_cmplt_ps(aVal, fzeroes))); - q = _mm_cvtps_epi32(_mm_floor_ps(_mm_mul_ps(s, m4pi))); - r = _mm_add_epi32(q, _mm_and_si128(q, ones)); - - s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4A)); - s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4B)); - - s = _mm_div_ps(s, _mm_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction - s = _mm_mul_ps(s, s); - // Evaluate Taylor series - s = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s, cp5), cp4), s), cp3), s), cp2), s), cp1), s); - - for(i = 0; i < 3; i++) { - s = _mm_mul_ps(s, _mm_sub_ps(ffours, s)); + float* bPtr = bVector; + const float* aPtr = aVector; + + unsigned int number = 0; + unsigned int quarterPoints = num_points / 4; + unsigned int i = 0; + + __m128 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, + fzeroes; + __m128 sine, cosine, condition1, condition2; + __m128i q, r, ones, twos, fours; + + m4pi = _mm_set1_ps(1.273239545); + pio4A = _mm_set1_ps(0.78515625); + pio4B = _mm_set1_ps(0.241876e-3); + ffours = _mm_set1_ps(4.0); + ftwos = _mm_set1_ps(2.0); + fones = _mm_set1_ps(1.0); + fzeroes = _mm_setzero_ps(); + ones = _mm_set1_epi32(1); + twos = _mm_set1_epi32(2); + fours = _mm_set1_epi32(4); + + cp1 = _mm_set1_ps(1.0); + cp2 = _mm_set1_ps(0.83333333e-1); + cp3 = _mm_set1_ps(0.2777778e-2); + cp4 = _mm_set1_ps(0.49603e-4); + cp5 = _mm_set1_ps(0.551e-6); + + for (; number < quarterPoints; number++) { + aVal = _mm_load_ps(aPtr); + s = _mm_sub_ps(aVal, + _mm_and_ps(_mm_mul_ps(aVal, ftwos), _mm_cmplt_ps(aVal, fzeroes))); + q = _mm_cvtps_epi32(_mm_floor_ps(_mm_mul_ps(s, m4pi))); + r = _mm_add_epi32(q, _mm_and_si128(q, ones)); + + s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4A)); + s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4B)); + + s = _mm_div_ps( + s, _mm_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction + s = _mm_mul_ps(s, s); + // Evaluate Taylor series + s = _mm_mul_ps( + _mm_add_ps( + 
_mm_mul_ps( + _mm_sub_ps( + _mm_mul_ps( + _mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s, cp5), cp4), s), + cp3), + s), + cp2), + s), + cp1), + s); + + for (i = 0; i < 3; i++) { + s = _mm_mul_ps(s, _mm_sub_ps(ffours, s)); + } + s = _mm_div_ps(s, ftwos); + + sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s)); + cosine = _mm_sub_ps(fones, s); + + condition1 = _mm_cmpneq_ps( + _mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, ones), twos)), fzeroes); + condition2 = _mm_cmpneq_ps( + _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(q, fours)), fzeroes), + _mm_cmplt_ps(aVal, fzeroes)); + // Need this condition only for cos + // condition3 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, + // twos), fours)), fzeroes); + + sine = _mm_add_ps(sine, _mm_and_ps(_mm_sub_ps(cosine, sine), condition1)); + sine = + _mm_sub_ps(sine, _mm_and_ps(_mm_mul_ps(sine, _mm_set1_ps(2.0f)), condition2)); + _mm_store_ps(bPtr, sine); + aPtr += 4; + bPtr += 4; + } + + number = quarterPoints * 4; + for (; number < num_points; number++) { + *bPtr++ = sinf(*aPtr++); } - s = _mm_div_ps(s, ftwos); - - sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s)); - cosine = _mm_sub_ps(fones, s); - - condition1 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, ones), twos)), fzeroes); - condition2 = _mm_cmpneq_ps(_mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(q, fours)), fzeroes), _mm_cmplt_ps(aVal, fzeroes)); - // Need this condition only for cos - //condition3 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, twos), fours)), fzeroes); - - sine = _mm_add_ps(sine, _mm_and_ps(_mm_sub_ps(cosine, sine), condition1)); - sine = _mm_sub_ps(sine, _mm_and_ps(_mm_mul_ps(sine, _mm_set1_ps(2.0f)), condition2)); - _mm_store_ps(bPtr, sine); - aPtr += 4; - bPtr += 4; - } - - number = quarterPoints * 4; - for(;number < num_points; number++) { - *bPtr++ = sinf(*aPtr++); - } } #endif /* LV_HAVE_SSE4_1 for aligned */ @@ -317,72 +385,93 @@ volk_32f_sin_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int num static inline void volk_32f_sin_32f_u_avx2_fma(float* bVector, const float* aVector, unsigned int num_points) { - float* bPtr = bVector; - const float* aPtr = aVector; - - unsigned int number = 0; - unsigned int eighthPoints = num_points / 8; - unsigned int i = 0; - - __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, fzeroes; - __m256 sine, cosine, condition1, condition2; - __m256i q, r, ones, twos, fours; - - m4pi = _mm256_set1_ps(1.273239545); - pio4A = _mm256_set1_ps(0.78515625); - pio4B = _mm256_set1_ps(0.241876e-3); - ffours = _mm256_set1_ps(4.0); - ftwos = _mm256_set1_ps(2.0); - fones = _mm256_set1_ps(1.0); - fzeroes = _mm256_setzero_ps(); - ones = _mm256_set1_epi32(1); - twos = _mm256_set1_epi32(2); - fours = _mm256_set1_epi32(4); - - cp1 = _mm256_set1_ps(1.0); - cp2 = _mm256_set1_ps(0.83333333e-1); - cp3 = _mm256_set1_ps(0.2777778e-2); - cp4 = _mm256_set1_ps(0.49603e-4); - cp5 = _mm256_set1_ps(0.551e-6); - - for(;number < eighthPoints; number++) { - aVal = _mm256_loadu_ps(aPtr); - s = _mm256_sub_ps(aVal, _mm256_and_ps(_mm256_mul_ps(aVal, ftwos), _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS))); - q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi))); - r = _mm256_add_epi32(q, _mm256_and_si256(q, ones)); - - s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4A, s); - s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4B, s); - - s = _mm256_div_ps(s, _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction - s = _mm256_mul_ps(s, s); - // Evaluate 
Taylor series - s = _mm256_mul_ps(_mm256_fmadd_ps(_mm256_fmsub_ps(_mm256_fmadd_ps(_mm256_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2), s, cp1), s); - - for(i = 0; i < 3; i++) { - s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s)); + float* bPtr = bVector; + const float* aPtr = aVector; + + unsigned int number = 0; + unsigned int eighthPoints = num_points / 8; + unsigned int i = 0; + + __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, + fzeroes; + __m256 sine, cosine, condition1, condition2; + __m256i q, r, ones, twos, fours; + + m4pi = _mm256_set1_ps(1.273239545); + pio4A = _mm256_set1_ps(0.78515625); + pio4B = _mm256_set1_ps(0.241876e-3); + ffours = _mm256_set1_ps(4.0); + ftwos = _mm256_set1_ps(2.0); + fones = _mm256_set1_ps(1.0); + fzeroes = _mm256_setzero_ps(); + ones = _mm256_set1_epi32(1); + twos = _mm256_set1_epi32(2); + fours = _mm256_set1_epi32(4); + + cp1 = _mm256_set1_ps(1.0); + cp2 = _mm256_set1_ps(0.83333333e-1); + cp3 = _mm256_set1_ps(0.2777778e-2); + cp4 = _mm256_set1_ps(0.49603e-4); + cp5 = _mm256_set1_ps(0.551e-6); + + for (; number < eighthPoints; number++) { + aVal = _mm256_loadu_ps(aPtr); + s = _mm256_sub_ps(aVal, + _mm256_and_ps(_mm256_mul_ps(aVal, ftwos), + _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS))); + q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi))); + r = _mm256_add_epi32(q, _mm256_and_si256(q, ones)); + + s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4A, s); + s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4B, s); + + s = _mm256_div_ps( + s, + _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction + s = _mm256_mul_ps(s, s); + // Evaluate Taylor series + s = _mm256_mul_ps( + _mm256_fmadd_ps( + _mm256_fmsub_ps( + _mm256_fmadd_ps(_mm256_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2), + s, + cp1), + s); + + for (i = 0; i < 3; i++) { + s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s)); + } + s = _mm256_div_ps(s, ftwos); + + sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s)); + cosine = _mm256_sub_ps(fones, s); + + condition1 = _mm256_cmp_ps( + _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)), + fzeroes, + _CMP_NEQ_UQ); + condition2 = _mm256_cmp_ps( + _mm256_cmp_ps( + _mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes, _CMP_NEQ_UQ), + _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS), + _CMP_NEQ_UQ); + // Need this condition only for cos + // condition3 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, + // twos), fours)), fzeroes); + + sine = + _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(cosine, sine), condition1)); + sine = _mm256_sub_ps( + sine, _mm256_and_ps(_mm256_mul_ps(sine, _mm256_set1_ps(2.0f)), condition2)); + _mm256_storeu_ps(bPtr, sine); + aPtr += 8; + bPtr += 8; + } + + number = eighthPoints * 8; + for (; number < num_points; number++) { + *bPtr++ = sin(*aPtr++); } - s = _mm256_div_ps(s, ftwos); - - sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s)); - cosine = _mm256_sub_ps(fones, s); - - condition1 = _mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)), fzeroes, _CMP_NEQ_UQ); - condition2 = _mm256_cmp_ps(_mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes, _CMP_NEQ_UQ), _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS), _CMP_NEQ_UQ); - // Need this condition only for cos - //condition3 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, twos), fours)), fzeroes); - - sine = _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(cosine, sine), condition1)); - sine = _mm256_sub_ps(sine, 
_mm256_and_ps(_mm256_mul_ps(sine, _mm256_set1_ps(2.0f)), condition2)); - _mm256_storeu_ps(bPtr, sine); - aPtr += 8; - bPtr += 8; - } - - number = eighthPoints * 8; - for(;number < num_points; number++) { - *bPtr++ = sin(*aPtr++); - } } #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for unaligned */ @@ -393,72 +482,100 @@ volk_32f_sin_32f_u_avx2_fma(float* bVector, const float* aVector, unsigned int n static inline void volk_32f_sin_32f_u_avx2(float* bVector, const float* aVector, unsigned int num_points) { - float* bPtr = bVector; - const float* aPtr = aVector; - - unsigned int number = 0; - unsigned int eighthPoints = num_points / 8; - unsigned int i = 0; - - __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, fzeroes; - __m256 sine, cosine, condition1, condition2; - __m256i q, r, ones, twos, fours; - - m4pi = _mm256_set1_ps(1.273239545); - pio4A = _mm256_set1_ps(0.78515625); - pio4B = _mm256_set1_ps(0.241876e-3); - ffours = _mm256_set1_ps(4.0); - ftwos = _mm256_set1_ps(2.0); - fones = _mm256_set1_ps(1.0); - fzeroes = _mm256_setzero_ps(); - ones = _mm256_set1_epi32(1); - twos = _mm256_set1_epi32(2); - fours = _mm256_set1_epi32(4); - - cp1 = _mm256_set1_ps(1.0); - cp2 = _mm256_set1_ps(0.83333333e-1); - cp3 = _mm256_set1_ps(0.2777778e-2); - cp4 = _mm256_set1_ps(0.49603e-4); - cp5 = _mm256_set1_ps(0.551e-6); - - for(;number < eighthPoints; number++) { - aVal = _mm256_loadu_ps(aPtr); - s = _mm256_sub_ps(aVal, _mm256_and_ps(_mm256_mul_ps(aVal, ftwos), _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS))); - q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi))); - r = _mm256_add_epi32(q, _mm256_and_si256(q, ones)); - - s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4A)); - s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4B)); - - s = _mm256_div_ps(s, _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction - s = _mm256_mul_ps(s, s); - // Evaluate Taylor series - s = _mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(s, cp5), cp4), s), cp3), s), cp2), s), cp1), s); - - for(i = 0; i < 3; i++) { - s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s)); + float* bPtr = bVector; + const float* aPtr = aVector; + + unsigned int number = 0; + unsigned int eighthPoints = num_points / 8; + unsigned int i = 0; + + __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, + fzeroes; + __m256 sine, cosine, condition1, condition2; + __m256i q, r, ones, twos, fours; + + m4pi = _mm256_set1_ps(1.273239545); + pio4A = _mm256_set1_ps(0.78515625); + pio4B = _mm256_set1_ps(0.241876e-3); + ffours = _mm256_set1_ps(4.0); + ftwos = _mm256_set1_ps(2.0); + fones = _mm256_set1_ps(1.0); + fzeroes = _mm256_setzero_ps(); + ones = _mm256_set1_epi32(1); + twos = _mm256_set1_epi32(2); + fours = _mm256_set1_epi32(4); + + cp1 = _mm256_set1_ps(1.0); + cp2 = _mm256_set1_ps(0.83333333e-1); + cp3 = _mm256_set1_ps(0.2777778e-2); + cp4 = _mm256_set1_ps(0.49603e-4); + cp5 = _mm256_set1_ps(0.551e-6); + + for (; number < eighthPoints; number++) { + aVal = _mm256_loadu_ps(aPtr); + s = _mm256_sub_ps(aVal, + _mm256_and_ps(_mm256_mul_ps(aVal, ftwos), + _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS))); + q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi))); + r = _mm256_add_epi32(q, _mm256_and_si256(q, ones)); + + s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4A)); + s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4B)); + + s = _mm256_div_ps( + s, + 
_mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction + s = _mm256_mul_ps(s, s); + // Evaluate Taylor series + s = _mm256_mul_ps( + _mm256_add_ps( + _mm256_mul_ps( + _mm256_sub_ps( + _mm256_mul_ps( + _mm256_add_ps( + _mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(s, cp5), cp4), + s), + cp3), + s), + cp2), + s), + cp1), + s); + + for (i = 0; i < 3; i++) { + s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s)); + } + s = _mm256_div_ps(s, ftwos); + + sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s)); + cosine = _mm256_sub_ps(fones, s); + + condition1 = _mm256_cmp_ps( + _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)), + fzeroes, + _CMP_NEQ_UQ); + condition2 = _mm256_cmp_ps( + _mm256_cmp_ps( + _mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes, _CMP_NEQ_UQ), + _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS), + _CMP_NEQ_UQ); + // Need this condition only for cos + // condition3 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, + // twos), fours)), fzeroes); + + sine = + _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(cosine, sine), condition1)); + sine = _mm256_sub_ps( + sine, _mm256_and_ps(_mm256_mul_ps(sine, _mm256_set1_ps(2.0f)), condition2)); + _mm256_storeu_ps(bPtr, sine); + aPtr += 8; + bPtr += 8; + } + + number = eighthPoints * 8; + for (; number < num_points; number++) { + *bPtr++ = sin(*aPtr++); } - s = _mm256_div_ps(s, ftwos); - - sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s)); - cosine = _mm256_sub_ps(fones, s); - - condition1 = _mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)), fzeroes, _CMP_NEQ_UQ); - condition2 = _mm256_cmp_ps(_mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes, _CMP_NEQ_UQ), _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS), _CMP_NEQ_UQ); - // Need this condition only for cos - //condition3 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, twos), fours)), fzeroes); - - sine = _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(cosine, sine), condition1)); - sine = _mm256_sub_ps(sine, _mm256_and_ps(_mm256_mul_ps(sine, _mm256_set1_ps(2.0f)), condition2)); - _mm256_storeu_ps(bPtr, sine); - aPtr += 8; - bPtr += 8; - } - - number = eighthPoints * 8; - for(;number < num_points; number++) { - *bPtr++ = sin(*aPtr++); - } } #endif /* LV_HAVE_AVX2 for unaligned */ @@ -470,70 +587,88 @@ volk_32f_sin_32f_u_avx2(float* bVector, const float* aVector, unsigned int num_p static inline void volk_32f_sin_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int num_points) { - float* bPtr = bVector; - const float* aPtr = aVector; - - unsigned int number = 0; - unsigned int quarterPoints = num_points / 4; - unsigned int i = 0; - - __m128 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, fzeroes; - __m128 sine, cosine, condition1, condition2; - __m128i q, r, ones, twos, fours; - - m4pi = _mm_set1_ps(1.273239545); - pio4A = _mm_set1_ps(0.78515625); - pio4B = _mm_set1_ps(0.241876e-3); - ffours = _mm_set1_ps(4.0); - ftwos = _mm_set1_ps(2.0); - fones = _mm_set1_ps(1.0); - fzeroes = _mm_setzero_ps(); - ones = _mm_set1_epi32(1); - twos = _mm_set1_epi32(2); - fours = _mm_set1_epi32(4); - - cp1 = _mm_set1_ps(1.0); - cp2 = _mm_set1_ps(0.83333333e-1); - cp3 = _mm_set1_ps(0.2777778e-2); - cp4 = _mm_set1_ps(0.49603e-4); - cp5 = _mm_set1_ps(0.551e-6); - - for(;number < quarterPoints; number++) { - aVal = _mm_loadu_ps(aPtr); - s = _mm_sub_ps(aVal, _mm_and_ps(_mm_mul_ps(aVal, ftwos), _mm_cmplt_ps(aVal, fzeroes))); - q = 
_mm_cvtps_epi32(_mm_floor_ps(_mm_mul_ps(s, m4pi))); - r = _mm_add_epi32(q, _mm_and_si128(q, ones)); - - s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4A)); - s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4B)); - - s = _mm_div_ps(s, _mm_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction - s = _mm_mul_ps(s, s); - // Evaluate Taylor series - s = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s, cp5), cp4), s), cp3), s), cp2), s), cp1), s); - - for(i = 0; i < 3; i++) { - s = _mm_mul_ps(s, _mm_sub_ps(ffours, s)); - } - s = _mm_div_ps(s, ftwos); - - sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s)); - cosine = _mm_sub_ps(fones, s); - - condition1 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, ones), twos)), fzeroes); - condition2 = _mm_cmpneq_ps(_mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(q, fours)), fzeroes), _mm_cmplt_ps(aVal, fzeroes)); + float* bPtr = bVector; + const float* aPtr = aVector; - sine = _mm_add_ps(sine, _mm_and_ps(_mm_sub_ps(cosine, sine), condition1)); - sine = _mm_sub_ps(sine, _mm_and_ps(_mm_mul_ps(sine, _mm_set1_ps(2.0f)), condition2)); - _mm_storeu_ps(bPtr, sine); - aPtr += 4; - bPtr += 4; - } + unsigned int number = 0; + unsigned int quarterPoints = num_points / 4; + unsigned int i = 0; + + __m128 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, + fzeroes; + __m128 sine, cosine, condition1, condition2; + __m128i q, r, ones, twos, fours; + + m4pi = _mm_set1_ps(1.273239545); + pio4A = _mm_set1_ps(0.78515625); + pio4B = _mm_set1_ps(0.241876e-3); + ffours = _mm_set1_ps(4.0); + ftwos = _mm_set1_ps(2.0); + fones = _mm_set1_ps(1.0); + fzeroes = _mm_setzero_ps(); + ones = _mm_set1_epi32(1); + twos = _mm_set1_epi32(2); + fours = _mm_set1_epi32(4); + + cp1 = _mm_set1_ps(1.0); + cp2 = _mm_set1_ps(0.83333333e-1); + cp3 = _mm_set1_ps(0.2777778e-2); + cp4 = _mm_set1_ps(0.49603e-4); + cp5 = _mm_set1_ps(0.551e-6); + + for (; number < quarterPoints; number++) { + aVal = _mm_loadu_ps(aPtr); + s = _mm_sub_ps(aVal, + _mm_and_ps(_mm_mul_ps(aVal, ftwos), _mm_cmplt_ps(aVal, fzeroes))); + q = _mm_cvtps_epi32(_mm_floor_ps(_mm_mul_ps(s, m4pi))); + r = _mm_add_epi32(q, _mm_and_si128(q, ones)); + + s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4A)); + s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4B)); + + s = _mm_div_ps( + s, _mm_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction + s = _mm_mul_ps(s, s); + // Evaluate Taylor series + s = _mm_mul_ps( + _mm_add_ps( + _mm_mul_ps( + _mm_sub_ps( + _mm_mul_ps( + _mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s, cp5), cp4), s), + cp3), + s), + cp2), + s), + cp1), + s); + + for (i = 0; i < 3; i++) { + s = _mm_mul_ps(s, _mm_sub_ps(ffours, s)); + } + s = _mm_div_ps(s, ftwos); + + sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s)); + cosine = _mm_sub_ps(fones, s); + + condition1 = _mm_cmpneq_ps( + _mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, ones), twos)), fzeroes); + condition2 = _mm_cmpneq_ps( + _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(q, fours)), fzeroes), + _mm_cmplt_ps(aVal, fzeroes)); + + sine = _mm_add_ps(sine, _mm_and_ps(_mm_sub_ps(cosine, sine), condition1)); + sine = + _mm_sub_ps(sine, _mm_and_ps(_mm_mul_ps(sine, _mm_set1_ps(2.0f)), condition2)); + _mm_storeu_ps(bPtr, sine); + aPtr += 4; + bPtr += 4; + } - number = quarterPoints * 4; - for(;number < num_points; number++){ - *bPtr++ = sinf(*aPtr++); - } + number = quarterPoints * 4; + for (; number < num_points; number++) { + *bPtr++ = 
sinf(*aPtr++); + } } #endif /* LV_HAVE_SSE4_1 for unaligned */ @@ -544,14 +679,13 @@ volk_32f_sin_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int num static inline void volk_32f_sin_32f_generic(float* bVector, const float* aVector, unsigned int num_points) { - float* bPtr = bVector; - const float* aPtr = aVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++) { - *bPtr++ = sinf(*aPtr++); - } + float* bPtr = bVector; + const float* aPtr = aVector; + unsigned int number = 0; + for (number = 0; number < num_points; number++) { + *bPtr++ = sinf(*aPtr++); + } } #endif /* LV_HAVE_GENERIC */ @@ -562,30 +696,29 @@ volk_32f_sin_32f_generic(float* bVector, const float* aVector, unsigned int num_ #include static inline void -volk_32f_sin_32f_neon(float* bVector, const float* aVector, - unsigned int num_points) +volk_32f_sin_32f_neon(float* bVector, const float* aVector, unsigned int num_points) { unsigned int number = 0; unsigned int quarter_points = num_points / 4; float* bVectorPtr = bVector; const float* aVectorPtr = aVector; - + float32x4_t b_vec; float32x4_t a_vec; - - for(number = 0; number < quarter_points; number++) { + + for (number = 0; number < quarter_points; number++) { a_vec = vld1q_f32(aVectorPtr); // Prefetch next one, speeds things up - __VOLK_PREFETCH(aVectorPtr+4); + __VOLK_PREFETCH(aVectorPtr + 4); b_vec = _vsinq_f32(a_vec); vst1q_f32(bVectorPtr, b_vec); // move pointers ahead - bVectorPtr+=4; - aVectorPtr+=4; + bVectorPtr += 4; + aVectorPtr += 4; } - + // Deal with the rest - for(number = quarter_points * 4; number < num_points; number++) { + for (number = quarter_points * 4; number < num_points; number++) { *bVectorPtr++ = sinf(*aVectorPtr++); } } diff --git a/kernels/volk/volk_32f_sqrt_32f.h b/kernels/volk/volk_32f_sqrt_32f.h index 84160af..667d356 100644 --- a/kernels/volk/volk_32f_sqrt_32f.h +++ b/kernels/volk/volk_32f_sqrt_32f.h @@ -66,8 +66,8 @@ #define INCLUDED_volk_32f_sqrt_32f_a_H #include -#include #include +#include #ifdef LV_HAVE_SSE #include @@ -75,28 +75,28 @@ static inline void volk_32f_sqrt_32f_a_sse(float* cVector, const float* aVector, unsigned int num_points) { - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; - float* cPtr = cVector; - const float* aPtr = aVector; + float* cPtr = cVector; + const float* aPtr = aVector; - __m128 aVal, cVal; - for(;number < quarterPoints; number++) { - aVal = _mm_load_ps(aPtr); + __m128 aVal, cVal; + for (; number < quarterPoints; number++) { + aVal = _mm_load_ps(aPtr); - cVal = _mm_sqrt_ps(aVal); + cVal = _mm_sqrt_ps(aVal); - _mm_store_ps(cPtr,cVal); // Store the results back into the C container + _mm_store_ps(cPtr, cVal); // Store the results back into the C container - aPtr += 4; - cPtr += 4; - } + aPtr += 4; + cPtr += 4; + } - number = quarterPoints * 4; - for(;number < num_points; number++) { - *cPtr++ = sqrtf(*aPtr++); - } + number = quarterPoints * 4; + for (; number < num_points; number++) { + *cPtr++ = sqrtf(*aPtr++); + } } #endif /* LV_HAVE_SSE */ @@ -107,28 +107,28 @@ volk_32f_sqrt_32f_a_sse(float* cVector, const float* aVector, unsigned int num_p static inline void volk_32f_sqrt_32f_a_avx(float* cVector, const float* aVector, unsigned int num_points) { - unsigned int number = 0; - const unsigned int eighthPoints = num_points / 8; + unsigned int number = 0; + const unsigned int eighthPoints = num_points / 8; - float* cPtr = cVector; - const float* aPtr = aVector; 
+ float* cPtr = cVector; + const float* aPtr = aVector; - __m256 aVal, cVal; - for(;number < eighthPoints; number++) { - aVal = _mm256_load_ps(aPtr); + __m256 aVal, cVal; + for (; number < eighthPoints; number++) { + aVal = _mm256_load_ps(aPtr); - cVal = _mm256_sqrt_ps(aVal); + cVal = _mm256_sqrt_ps(aVal); - _mm256_store_ps(cPtr,cVal); // Store the results back into the C container + _mm256_store_ps(cPtr, cVal); // Store the results back into the C container - aPtr += 8; - cPtr += 8; - } + aPtr += 8; + cPtr += 8; + } - number = eighthPoints * 8; - for(;number < num_points; number++) { - *cPtr++ = sqrtf(*aPtr++); - } + number = eighthPoints * 8; + for (; number < num_points; number++) { + *cPtr++ = sqrtf(*aPtr++); + } } #endif /* LV_HAVE_AVX */ @@ -140,24 +140,24 @@ volk_32f_sqrt_32f_a_avx(float* cVector, const float* aVector, unsigned int num_p static inline void volk_32f_sqrt_32f_neon(float* cVector, const float* aVector, unsigned int num_points) { - float* cPtr = cVector; - const float* aPtr = aVector; - unsigned int number = 0; - unsigned int quarter_points = num_points / 4; - float32x4_t in_vec, out_vec; - - for(number = 0; number < quarter_points; number++) { - in_vec = vld1q_f32(aPtr); - // note that armv8 has vsqrt_f32 which will be much better - out_vec = vrecpeq_f32(vrsqrteq_f32(in_vec) ); - vst1q_f32(cPtr, out_vec); - aPtr += 4; - cPtr += 4; - } - - for(number = quarter_points * 4; number < num_points; number++) { - *cPtr++ = sqrtf(*aPtr++); - } + float* cPtr = cVector; + const float* aPtr = aVector; + unsigned int number = 0; + unsigned int quarter_points = num_points / 4; + float32x4_t in_vec, out_vec; + + for (number = 0; number < quarter_points; number++) { + in_vec = vld1q_f32(aPtr); + // note that armv8 has vsqrt_f32 which will be much better + out_vec = vrecpeq_f32(vrsqrteq_f32(in_vec)); + vst1q_f32(cPtr, out_vec); + aPtr += 4; + cPtr += 4; + } + + for (number = quarter_points * 4; number < num_points; number++) { + *cPtr++ = sqrtf(*aPtr++); + } } #endif /* LV_HAVE_NEON */ @@ -168,13 +168,13 @@ volk_32f_sqrt_32f_neon(float* cVector, const float* aVector, unsigned int num_po static inline void volk_32f_sqrt_32f_generic(float* cVector, const float* aVector, unsigned int num_points) { - float* cPtr = cVector; - const float* aPtr = aVector; - unsigned int number = 0; + float* cPtr = cVector; + const float* aPtr = aVector; + unsigned int number = 0; - for(number = 0; number < num_points; number++) { - *cPtr++ = sqrtf(*aPtr++); - } + for (number = 0; number < num_points; number++) { + *cPtr++ = sqrtf(*aPtr++); + } } #endif /* LV_HAVE_GENERIC */ @@ -182,13 +182,12 @@ volk_32f_sqrt_32f_generic(float* cVector, const float* aVector, unsigned int num #ifdef LV_HAVE_ORC -extern void -volk_32f_sqrt_32f_a_orc_impl(float *, const float*, unsigned int); +extern void volk_32f_sqrt_32f_a_orc_impl(float*, const float*, unsigned int); static inline void volk_32f_sqrt_32f_u_orc(float* cVector, const float* aVector, unsigned int num_points) { - volk_32f_sqrt_32f_a_orc_impl(cVector, aVector, num_points); + volk_32f_sqrt_32f_a_orc_impl(cVector, aVector, num_points); } #endif /* LV_HAVE_ORC */ @@ -199,36 +198,36 @@ volk_32f_sqrt_32f_u_orc(float* cVector, const float* aVector, unsigned int num_p #define INCLUDED_volk_32f_sqrt_32f_u_H #include -#include #include +#include #ifdef LV_HAVE_AVX #include static inline void volk_32f_sqrt_32f_u_avx(float* cVector, const float* aVector, unsigned int num_points) { - unsigned int number = 0; - const unsigned int eighthPoints = num_points / 8; + unsigned int 
number = 0; + const unsigned int eighthPoints = num_points / 8; - float* cPtr = cVector; - const float* aPtr = aVector; + float* cPtr = cVector; + const float* aPtr = aVector; - __m256 aVal, cVal; - for(;number < eighthPoints; number++) { - aVal = _mm256_loadu_ps(aPtr); + __m256 aVal, cVal; + for (; number < eighthPoints; number++) { + aVal = _mm256_loadu_ps(aPtr); - cVal = _mm256_sqrt_ps(aVal); + cVal = _mm256_sqrt_ps(aVal); - _mm256_storeu_ps(cPtr,cVal); // Store the results back into the C container + _mm256_storeu_ps(cPtr, cVal); // Store the results back into the C container - aPtr += 8; - cPtr += 8; - } + aPtr += 8; + cPtr += 8; + } - number = eighthPoints * 8; - for(;number < num_points; number++) { - *cPtr++ = sqrtf(*aPtr++); - } + number = eighthPoints * 8; + for (; number < num_points; number++) { + *cPtr++ = sqrtf(*aPtr++); + } } #endif /* LV_HAVE_AVX */ diff --git a/kernels/volk/volk_32f_stddev_and_mean_32f_x2.h b/kernels/volk/volk_32f_stddev_and_mean_32f_x2.h index 8e996e2..6ad0f17 100644 --- a/kernels/volk/volk_32f_stddev_and_mean_32f_x2.h +++ b/kernels/volk/volk_32f_stddev_and_mean_32f_x2.h @@ -29,8 +29,8 @@ * * Dispatcher Prototype * \code - * void volk_32f_stddev_and_mean_32f_x2(float* stddev, float* mean, const float* inputBuffer, unsigned int num_points) - * \endcode + * void volk_32f_stddev_and_mean_32f_x2(float* stddev, float* mean, const float* + * inputBuffer, unsigned int num_points) \endcode * * \b Inputs * \li inputBuffer: The buffer of points. @@ -41,10 +41,8 @@ * \li mean: The mean of the input buffer. * * \b Example - * Generate random numbers with c++11's normal distribution and estimate the mean and standard deviation - * \code - * int N = 1000; - * unsigned int alignment = volk_get_alignment(); + * Generate random numbers with c++11's normal distribution and estimate the mean and + * standard deviation \code int N = 1000; unsigned int alignment = volk_get_alignment(); * float* rand_numbers = (float*)volk_malloc(sizeof(float)*N, alignment); * float* mean = (float*)volk_malloc(sizeof(float), alignment); * float* stddev = (float*)volk_malloc(sizeof(float), alignment); @@ -71,88 +69,94 @@ #ifndef INCLUDED_volk_32f_stddev_and_mean_32f_x2_a_H #define INCLUDED_volk_32f_stddev_and_mean_32f_x2_a_H -#include #include -#include #include +#include +#include #ifdef LV_HAVE_AVX #include -static inline void -volk_32f_stddev_and_mean_32f_x2_a_avx(float* stddev, float* mean, - const float* inputBuffer, - unsigned int num_points) +static inline void volk_32f_stddev_and_mean_32f_x2_a_avx(float* stddev, + float* mean, + const float* inputBuffer, + unsigned int num_points) { - float stdDev = 0; - float newMean = 0; - if(num_points > 0){ - unsigned int number = 0; - const unsigned int thirtySecondthPoints = num_points / 32; - - const float* aPtr = inputBuffer; - __VOLK_ATTR_ALIGNED(32) float meanBuffer[8]; - __VOLK_ATTR_ALIGNED(32) float squareBuffer[8]; - - __m256 accumulator = _mm256_setzero_ps(); - __m256 squareAccumulator = _mm256_setzero_ps(); - __m256 aVal1, aVal2, aVal3, aVal4; - __m256 cVal1, cVal2, cVal3, cVal4; - for(;number < thirtySecondthPoints; number++) { - aVal1 = _mm256_load_ps(aPtr); aPtr += 8; - cVal1 = _mm256_dp_ps(aVal1, aVal1, 0xF1); - accumulator = _mm256_add_ps(accumulator, aVal1); // accumulator += x - - aVal2 = _mm256_load_ps(aPtr); aPtr += 8; - cVal2 = _mm256_dp_ps(aVal2, aVal2, 0xF2); - accumulator = _mm256_add_ps(accumulator, aVal2); // accumulator += x - - aVal3 = _mm256_load_ps(aPtr); aPtr += 8; - cVal3 = _mm256_dp_ps(aVal3, aVal3, 0xF4); - 
accumulator = _mm256_add_ps(accumulator, aVal3); // accumulator += x - - aVal4 = _mm256_load_ps(aPtr); aPtr += 8; - cVal4 = _mm256_dp_ps(aVal4, aVal4, 0xF8); - accumulator = _mm256_add_ps(accumulator, aVal4); // accumulator += x - - cVal1 = _mm256_or_ps(cVal1, cVal2); - cVal3 = _mm256_or_ps(cVal3, cVal4); - cVal1 = _mm256_or_ps(cVal1, cVal3); - - squareAccumulator = _mm256_add_ps(squareAccumulator, cVal1); // squareAccumulator += x^2 - } - _mm256_store_ps(meanBuffer,accumulator); // Store the results back into the C container - _mm256_store_ps(squareBuffer,squareAccumulator); // Store the results back into the C container - newMean = meanBuffer[0]; - newMean += meanBuffer[1]; - newMean += meanBuffer[2]; - newMean += meanBuffer[3]; - newMean += meanBuffer[4]; - newMean += meanBuffer[5]; - newMean += meanBuffer[6]; - newMean += meanBuffer[7]; - stdDev = squareBuffer[0]; - stdDev += squareBuffer[1]; - stdDev += squareBuffer[2]; - stdDev += squareBuffer[3]; - stdDev += squareBuffer[4]; - stdDev += squareBuffer[5]; - stdDev += squareBuffer[6]; - stdDev += squareBuffer[7]; - - number = thirtySecondthPoints * 32; - for(;number < num_points; number++){ - stdDev += (*aPtr) * (*aPtr); - newMean += *aPtr++; + float stdDev = 0; + float newMean = 0; + if (num_points > 0) { + unsigned int number = 0; + const unsigned int thirtySecondthPoints = num_points / 32; + + const float* aPtr = inputBuffer; + __VOLK_ATTR_ALIGNED(32) float meanBuffer[8]; + __VOLK_ATTR_ALIGNED(32) float squareBuffer[8]; + + __m256 accumulator = _mm256_setzero_ps(); + __m256 squareAccumulator = _mm256_setzero_ps(); + __m256 aVal1, aVal2, aVal3, aVal4; + __m256 cVal1, cVal2, cVal3, cVal4; + for (; number < thirtySecondthPoints; number++) { + aVal1 = _mm256_load_ps(aPtr); + aPtr += 8; + cVal1 = _mm256_dp_ps(aVal1, aVal1, 0xF1); + accumulator = _mm256_add_ps(accumulator, aVal1); // accumulator += x + + aVal2 = _mm256_load_ps(aPtr); + aPtr += 8; + cVal2 = _mm256_dp_ps(aVal2, aVal2, 0xF2); + accumulator = _mm256_add_ps(accumulator, aVal2); // accumulator += x + + aVal3 = _mm256_load_ps(aPtr); + aPtr += 8; + cVal3 = _mm256_dp_ps(aVal3, aVal3, 0xF4); + accumulator = _mm256_add_ps(accumulator, aVal3); // accumulator += x + + aVal4 = _mm256_load_ps(aPtr); + aPtr += 8; + cVal4 = _mm256_dp_ps(aVal4, aVal4, 0xF8); + accumulator = _mm256_add_ps(accumulator, aVal4); // accumulator += x + + cVal1 = _mm256_or_ps(cVal1, cVal2); + cVal3 = _mm256_or_ps(cVal3, cVal4); + cVal1 = _mm256_or_ps(cVal1, cVal3); + + squareAccumulator = + _mm256_add_ps(squareAccumulator, cVal1); // squareAccumulator += x^2 + } + _mm256_store_ps(meanBuffer, + accumulator); // Store the results back into the C container + _mm256_store_ps(squareBuffer, + squareAccumulator); // Store the results back into the C container + newMean = meanBuffer[0]; + newMean += meanBuffer[1]; + newMean += meanBuffer[2]; + newMean += meanBuffer[3]; + newMean += meanBuffer[4]; + newMean += meanBuffer[5]; + newMean += meanBuffer[6]; + newMean += meanBuffer[7]; + stdDev = squareBuffer[0]; + stdDev += squareBuffer[1]; + stdDev += squareBuffer[2]; + stdDev += squareBuffer[3]; + stdDev += squareBuffer[4]; + stdDev += squareBuffer[5]; + stdDev += squareBuffer[6]; + stdDev += squareBuffer[7]; + + number = thirtySecondthPoints * 32; + for (; number < num_points; number++) { + stdDev += (*aPtr) * (*aPtr); + newMean += *aPtr++; + } + newMean /= num_points; + stdDev /= num_points; + stdDev -= (newMean * newMean); + stdDev = sqrtf(stdDev); } - newMean /= num_points; - stdDev /= num_points; - stdDev -= (newMean * 
newMean); - stdDev = sqrtf(stdDev); - } - *stddev = stdDev; - *mean = newMean; - + *stddev = stdDev; + *mean = newMean; } #endif /* LV_HAVE_AVX */ @@ -160,151 +164,164 @@ volk_32f_stddev_and_mean_32f_x2_a_avx(float* stddev, float* mean, #ifdef LV_HAVE_AVX #include -static inline void -volk_32f_stddev_and_mean_32f_x2_u_avx(float* stddev, float* mean, - const float* inputBuffer, - unsigned int num_points) +static inline void volk_32f_stddev_and_mean_32f_x2_u_avx(float* stddev, + float* mean, + const float* inputBuffer, + unsigned int num_points) { - float stdDev = 0; - float newMean = 0; - if(num_points > 0){ - unsigned int number = 0; - const unsigned int thirtySecondthPoints = num_points / 32; - - const float* aPtr = inputBuffer; - __VOLK_ATTR_ALIGNED(32) float meanBuffer[8]; - __VOLK_ATTR_ALIGNED(32) float squareBuffer[8]; - - __m256 accumulator = _mm256_setzero_ps(); - __m256 squareAccumulator = _mm256_setzero_ps(); - __m256 aVal1, aVal2, aVal3, aVal4; - __m256 cVal1, cVal2, cVal3, cVal4; - for(;number < thirtySecondthPoints; number++) { - aVal1 = _mm256_loadu_ps(aPtr); aPtr += 8; - cVal1 = _mm256_dp_ps(aVal1, aVal1, 0xF1); - accumulator = _mm256_add_ps(accumulator, aVal1); // accumulator += x - - aVal2 = _mm256_loadu_ps(aPtr); aPtr += 8; - cVal2 = _mm256_dp_ps(aVal2, aVal2, 0xF2); - accumulator = _mm256_add_ps(accumulator, aVal2); // accumulator += x - - aVal3 = _mm256_loadu_ps(aPtr); aPtr += 8; - cVal3 = _mm256_dp_ps(aVal3, aVal3, 0xF4); - accumulator = _mm256_add_ps(accumulator, aVal3); // accumulator += x - - aVal4 = _mm256_loadu_ps(aPtr); aPtr += 8; - cVal4 = _mm256_dp_ps(aVal4, aVal4, 0xF8); - accumulator = _mm256_add_ps(accumulator, aVal4); // accumulator += x - - cVal1 = _mm256_or_ps(cVal1, cVal2); - cVal3 = _mm256_or_ps(cVal3, cVal4); - cVal1 = _mm256_or_ps(cVal1, cVal3); - - squareAccumulator = _mm256_add_ps(squareAccumulator, cVal1); // squareAccumulator += x^2 - } - _mm256_store_ps(meanBuffer,accumulator); // Store the results back into the C container - _mm256_store_ps(squareBuffer,squareAccumulator); // Store the results back into the C container - newMean = meanBuffer[0]; - newMean += meanBuffer[1]; - newMean += meanBuffer[2]; - newMean += meanBuffer[3]; - newMean += meanBuffer[4]; - newMean += meanBuffer[5]; - newMean += meanBuffer[6]; - newMean += meanBuffer[7]; - stdDev = squareBuffer[0]; - stdDev += squareBuffer[1]; - stdDev += squareBuffer[2]; - stdDev += squareBuffer[3]; - stdDev += squareBuffer[4]; - stdDev += squareBuffer[5]; - stdDev += squareBuffer[6]; - stdDev += squareBuffer[7]; - - number = thirtySecondthPoints * 32; - for(;number < num_points; number++){ - stdDev += (*aPtr) * (*aPtr); - newMean += *aPtr++; + float stdDev = 0; + float newMean = 0; + if (num_points > 0) { + unsigned int number = 0; + const unsigned int thirtySecondthPoints = num_points / 32; + + const float* aPtr = inputBuffer; + __VOLK_ATTR_ALIGNED(32) float meanBuffer[8]; + __VOLK_ATTR_ALIGNED(32) float squareBuffer[8]; + + __m256 accumulator = _mm256_setzero_ps(); + __m256 squareAccumulator = _mm256_setzero_ps(); + __m256 aVal1, aVal2, aVal3, aVal4; + __m256 cVal1, cVal2, cVal3, cVal4; + for (; number < thirtySecondthPoints; number++) { + aVal1 = _mm256_loadu_ps(aPtr); + aPtr += 8; + cVal1 = _mm256_dp_ps(aVal1, aVal1, 0xF1); + accumulator = _mm256_add_ps(accumulator, aVal1); // accumulator += x + + aVal2 = _mm256_loadu_ps(aPtr); + aPtr += 8; + cVal2 = _mm256_dp_ps(aVal2, aVal2, 0xF2); + accumulator = _mm256_add_ps(accumulator, aVal2); // accumulator += x + + aVal3 = _mm256_loadu_ps(aPtr); + 
aPtr += 8; + cVal3 = _mm256_dp_ps(aVal3, aVal3, 0xF4); + accumulator = _mm256_add_ps(accumulator, aVal3); // accumulator += x + + aVal4 = _mm256_loadu_ps(aPtr); + aPtr += 8; + cVal4 = _mm256_dp_ps(aVal4, aVal4, 0xF8); + accumulator = _mm256_add_ps(accumulator, aVal4); // accumulator += x + + cVal1 = _mm256_or_ps(cVal1, cVal2); + cVal3 = _mm256_or_ps(cVal3, cVal4); + cVal1 = _mm256_or_ps(cVal1, cVal3); + + squareAccumulator = + _mm256_add_ps(squareAccumulator, cVal1); // squareAccumulator += x^2 + } + _mm256_store_ps(meanBuffer, + accumulator); // Store the results back into the C container + _mm256_store_ps(squareBuffer, + squareAccumulator); // Store the results back into the C container + newMean = meanBuffer[0]; + newMean += meanBuffer[1]; + newMean += meanBuffer[2]; + newMean += meanBuffer[3]; + newMean += meanBuffer[4]; + newMean += meanBuffer[5]; + newMean += meanBuffer[6]; + newMean += meanBuffer[7]; + stdDev = squareBuffer[0]; + stdDev += squareBuffer[1]; + stdDev += squareBuffer[2]; + stdDev += squareBuffer[3]; + stdDev += squareBuffer[4]; + stdDev += squareBuffer[5]; + stdDev += squareBuffer[6]; + stdDev += squareBuffer[7]; + + number = thirtySecondthPoints * 32; + for (; number < num_points; number++) { + stdDev += (*aPtr) * (*aPtr); + newMean += *aPtr++; + } + newMean /= num_points; + stdDev /= num_points; + stdDev -= (newMean * newMean); + stdDev = sqrtf(stdDev); } - newMean /= num_points; - stdDev /= num_points; - stdDev -= (newMean * newMean); - stdDev = sqrtf(stdDev); - } - *stddev = stdDev; - *mean = newMean; - + *stddev = stdDev; + *mean = newMean; } #endif /* LV_HAVE_AVX */ #ifdef LV_HAVE_SSE4_1 #include -static inline void -volk_32f_stddev_and_mean_32f_x2_a_sse4_1(float* stddev, float* mean, - const float* inputBuffer, - unsigned int num_points) +static inline void volk_32f_stddev_and_mean_32f_x2_a_sse4_1(float* stddev, + float* mean, + const float* inputBuffer, + unsigned int num_points) { - float returnValue = 0; - float newMean = 0; - if(num_points > 0){ - unsigned int number = 0; - const unsigned int sixteenthPoints = num_points / 16; - - const float* aPtr = inputBuffer; - __VOLK_ATTR_ALIGNED(16) float meanBuffer[4]; - __VOLK_ATTR_ALIGNED(16) float squareBuffer[4]; - - __m128 accumulator = _mm_setzero_ps(); - __m128 squareAccumulator = _mm_setzero_ps(); - __m128 aVal1, aVal2, aVal3, aVal4; - __m128 cVal1, cVal2, cVal3, cVal4; - for(;number < sixteenthPoints; number++) { - aVal1 = _mm_load_ps(aPtr); aPtr += 4; - cVal1 = _mm_dp_ps(aVal1, aVal1, 0xF1); - accumulator = _mm_add_ps(accumulator, aVal1); // accumulator += x - - aVal2 = _mm_load_ps(aPtr); aPtr += 4; - cVal2 = _mm_dp_ps(aVal2, aVal2, 0xF2); - accumulator = _mm_add_ps(accumulator, aVal2); // accumulator += x - - aVal3 = _mm_load_ps(aPtr); aPtr += 4; - cVal3 = _mm_dp_ps(aVal3, aVal3, 0xF4); - accumulator = _mm_add_ps(accumulator, aVal3); // accumulator += x - - aVal4 = _mm_load_ps(aPtr); aPtr += 4; - cVal4 = _mm_dp_ps(aVal4, aVal4, 0xF8); - accumulator = _mm_add_ps(accumulator, aVal4); // accumulator += x - - cVal1 = _mm_or_ps(cVal1, cVal2); - cVal3 = _mm_or_ps(cVal3, cVal4); - cVal1 = _mm_or_ps(cVal1, cVal3); - - squareAccumulator = _mm_add_ps(squareAccumulator, cVal1); // squareAccumulator += x^2 - } - _mm_store_ps(meanBuffer,accumulator); // Store the results back into the C container - _mm_store_ps(squareBuffer,squareAccumulator); // Store the results back into the C container - newMean = meanBuffer[0]; - newMean += meanBuffer[1]; - newMean += meanBuffer[2]; - newMean += meanBuffer[3]; - returnValue = 
squareBuffer[0]; - returnValue += squareBuffer[1]; - returnValue += squareBuffer[2]; - returnValue += squareBuffer[3]; - - number = sixteenthPoints * 16; - for(;number < num_points; number++){ - returnValue += (*aPtr) * (*aPtr); - newMean += *aPtr++; + float returnValue = 0; + float newMean = 0; + if (num_points > 0) { + unsigned int number = 0; + const unsigned int sixteenthPoints = num_points / 16; + + const float* aPtr = inputBuffer; + __VOLK_ATTR_ALIGNED(16) float meanBuffer[4]; + __VOLK_ATTR_ALIGNED(16) float squareBuffer[4]; + + __m128 accumulator = _mm_setzero_ps(); + __m128 squareAccumulator = _mm_setzero_ps(); + __m128 aVal1, aVal2, aVal3, aVal4; + __m128 cVal1, cVal2, cVal3, cVal4; + for (; number < sixteenthPoints; number++) { + aVal1 = _mm_load_ps(aPtr); + aPtr += 4; + cVal1 = _mm_dp_ps(aVal1, aVal1, 0xF1); + accumulator = _mm_add_ps(accumulator, aVal1); // accumulator += x + + aVal2 = _mm_load_ps(aPtr); + aPtr += 4; + cVal2 = _mm_dp_ps(aVal2, aVal2, 0xF2); + accumulator = _mm_add_ps(accumulator, aVal2); // accumulator += x + + aVal3 = _mm_load_ps(aPtr); + aPtr += 4; + cVal3 = _mm_dp_ps(aVal3, aVal3, 0xF4); + accumulator = _mm_add_ps(accumulator, aVal3); // accumulator += x + + aVal4 = _mm_load_ps(aPtr); + aPtr += 4; + cVal4 = _mm_dp_ps(aVal4, aVal4, 0xF8); + accumulator = _mm_add_ps(accumulator, aVal4); // accumulator += x + + cVal1 = _mm_or_ps(cVal1, cVal2); + cVal3 = _mm_or_ps(cVal3, cVal4); + cVal1 = _mm_or_ps(cVal1, cVal3); + + squareAccumulator = + _mm_add_ps(squareAccumulator, cVal1); // squareAccumulator += x^2 + } + _mm_store_ps(meanBuffer, + accumulator); // Store the results back into the C container + _mm_store_ps(squareBuffer, + squareAccumulator); // Store the results back into the C container + newMean = meanBuffer[0]; + newMean += meanBuffer[1]; + newMean += meanBuffer[2]; + newMean += meanBuffer[3]; + returnValue = squareBuffer[0]; + returnValue += squareBuffer[1]; + returnValue += squareBuffer[2]; + returnValue += squareBuffer[3]; + + number = sixteenthPoints * 16; + for (; number < num_points; number++) { + returnValue += (*aPtr) * (*aPtr); + newMean += *aPtr++; + } + newMean /= num_points; + returnValue /= num_points; + returnValue -= (newMean * newMean); + returnValue = sqrtf(returnValue); } - newMean /= num_points; - returnValue /= num_points; - returnValue -= (newMean * newMean); - returnValue = sqrtf(returnValue); - } - *stddev = returnValue; - *mean = newMean; + *stddev = returnValue; + *mean = newMean; } #endif /* LV_HAVE_SSE4_1 */ @@ -312,86 +329,86 @@ volk_32f_stddev_and_mean_32f_x2_a_sse4_1(float* stddev, float* mean, #ifdef LV_HAVE_SSE #include -static inline void -volk_32f_stddev_and_mean_32f_x2_a_sse(float* stddev, float* mean, - const float* inputBuffer, - unsigned int num_points) +static inline void volk_32f_stddev_and_mean_32f_x2_a_sse(float* stddev, + float* mean, + const float* inputBuffer, + unsigned int num_points) { - float returnValue = 0; - float newMean = 0; - if(num_points > 0){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - const float* aPtr = inputBuffer; - __VOLK_ATTR_ALIGNED(16) float meanBuffer[4]; - __VOLK_ATTR_ALIGNED(16) float squareBuffer[4]; - - __m128 accumulator = _mm_setzero_ps(); - __m128 squareAccumulator = _mm_setzero_ps(); - __m128 aVal = _mm_setzero_ps(); - for(;number < quarterPoints; number++) { - aVal = _mm_load_ps(aPtr); // aVal = x - accumulator = _mm_add_ps(accumulator, aVal); // accumulator += x - aVal = _mm_mul_ps(aVal, aVal); // squareAccumulator += x^2 - 
squareAccumulator = _mm_add_ps(squareAccumulator, aVal); - aPtr += 4; + float returnValue = 0; + float newMean = 0; + if (num_points > 0) { + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + const float* aPtr = inputBuffer; + __VOLK_ATTR_ALIGNED(16) float meanBuffer[4]; + __VOLK_ATTR_ALIGNED(16) float squareBuffer[4]; + + __m128 accumulator = _mm_setzero_ps(); + __m128 squareAccumulator = _mm_setzero_ps(); + __m128 aVal = _mm_setzero_ps(); + for (; number < quarterPoints; number++) { + aVal = _mm_load_ps(aPtr); // aVal = x + accumulator = _mm_add_ps(accumulator, aVal); // accumulator += x + aVal = _mm_mul_ps(aVal, aVal); // squareAccumulator += x^2 + squareAccumulator = _mm_add_ps(squareAccumulator, aVal); + aPtr += 4; + } + _mm_store_ps(meanBuffer, + accumulator); // Store the results back into the C container + _mm_store_ps(squareBuffer, + squareAccumulator); // Store the results back into the C container + newMean = meanBuffer[0]; + newMean += meanBuffer[1]; + newMean += meanBuffer[2]; + newMean += meanBuffer[3]; + returnValue = squareBuffer[0]; + returnValue += squareBuffer[1]; + returnValue += squareBuffer[2]; + returnValue += squareBuffer[3]; + + number = quarterPoints * 4; + for (; number < num_points; number++) { + returnValue += (*aPtr) * (*aPtr); + newMean += *aPtr++; + } + newMean /= num_points; + returnValue /= num_points; + returnValue -= (newMean * newMean); + returnValue = sqrtf(returnValue); } - _mm_store_ps(meanBuffer,accumulator); // Store the results back into the C container - _mm_store_ps(squareBuffer,squareAccumulator); // Store the results back into the C container - newMean = meanBuffer[0]; - newMean += meanBuffer[1]; - newMean += meanBuffer[2]; - newMean += meanBuffer[3]; - returnValue = squareBuffer[0]; - returnValue += squareBuffer[1]; - returnValue += squareBuffer[2]; - returnValue += squareBuffer[3]; - - number = quarterPoints * 4; - for(;number < num_points; number++){ - returnValue += (*aPtr) * (*aPtr); - newMean += *aPtr++; - } - newMean /= num_points; - returnValue /= num_points; - returnValue -= (newMean * newMean); - returnValue = sqrtf(returnValue); - } - *stddev = returnValue; - *mean = newMean; + *stddev = returnValue; + *mean = newMean; } #endif /* LV_HAVE_SSE */ #ifdef LV_HAVE_GENERIC -static inline void -volk_32f_stddev_and_mean_32f_x2_generic(float* stddev, float* mean, - const float* inputBuffer, - unsigned int num_points) +static inline void volk_32f_stddev_and_mean_32f_x2_generic(float* stddev, + float* mean, + const float* inputBuffer, + unsigned int num_points) { - float returnValue = 0; - float newMean = 0; - if(num_points > 0){ - const float* aPtr = inputBuffer; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - returnValue += (*aPtr) * (*aPtr); - newMean += *aPtr++; + float returnValue = 0; + float newMean = 0; + if (num_points > 0) { + const float* aPtr = inputBuffer; + unsigned int number = 0; + + for (number = 0; number < num_points; number++) { + returnValue += (*aPtr) * (*aPtr); + newMean += *aPtr++; + } + newMean /= num_points; + returnValue /= num_points; + returnValue -= (newMean * newMean); + returnValue = sqrtf(returnValue); } - newMean /= num_points; - returnValue /= num_points; - returnValue -= (newMean * newMean); - returnValue = sqrtf(returnValue); - } - *stddev = returnValue; - *mean = newMean; + *stddev = returnValue; + *mean = newMean; } #endif /* LV_HAVE_GENERIC */ - - #endif /* INCLUDED_volk_32f_stddev_and_mean_32f_x2_a_H */ diff --git 
a/kernels/volk/volk_32f_tan_32f.h b/kernels/volk/volk_32f_tan_32f.h index 239b745..a623a66 100644 --- a/kernels/volk/volk_32f_tan_32f.h +++ b/kernels/volk/volk_32f_tan_32f.h @@ -71,9 +71,9 @@ * \endcode */ -#include -#include #include +#include +#include #ifndef INCLUDED_volk_32f_tan_32f_a_H #define INCLUDED_volk_32f_tan_32f_a_H @@ -82,78 +82,102 @@ #include static inline void -volk_32f_tan_32f_a_avx2_fma(float* bVector, const float* aVector, - unsigned int num_points) +volk_32f_tan_32f_a_avx2_fma(float* bVector, const float* aVector, unsigned int num_points) { - float* bPtr = bVector; - const float* aPtr = aVector; - - unsigned int number = 0; - unsigned int eighthPoints = num_points / 8; - unsigned int i = 0; - - __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, fzeroes; - __m256 sine, cosine, tangent, condition1, condition2, condition3; - __m256i q, r, ones, twos, fours; - - m4pi = _mm256_set1_ps(1.273239545); - pio4A = _mm256_set1_ps(0.78515625); - pio4B = _mm256_set1_ps(0.241876e-3); - ffours = _mm256_set1_ps(4.0); - ftwos = _mm256_set1_ps(2.0); - fones = _mm256_set1_ps(1.0); - fzeroes = _mm256_setzero_ps(); - ones = _mm256_set1_epi32(1); - twos = _mm256_set1_epi32(2); - fours = _mm256_set1_epi32(4); - - cp1 = _mm256_set1_ps(1.0); - cp2 = _mm256_set1_ps(0.83333333e-1); - cp3 = _mm256_set1_ps(0.2777778e-2); - cp4 = _mm256_set1_ps(0.49603e-4); - cp5 = _mm256_set1_ps(0.551e-6); - - for(;number < eighthPoints; number++){ - aVal = _mm256_load_ps(aPtr); - s = _mm256_sub_ps(aVal, _mm256_and_ps(_mm256_mul_ps(aVal, ftwos), _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS))); - q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi))); - r = _mm256_add_epi32(q, _mm256_and_si256(q, ones)); - - s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4A, s); - s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4B, s); - - s = _mm256_div_ps(s, _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction - s = _mm256_mul_ps(s, s); - // Evaluate Taylor series - s = _mm256_mul_ps(_mm256_fmadd_ps(_mm256_fmsub_ps(_mm256_fmadd_ps(_mm256_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2), s, cp1), s); - - for(i = 0; i < 3; i++){ - s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s)); + float* bPtr = bVector; + const float* aPtr = aVector; + + unsigned int number = 0; + unsigned int eighthPoints = num_points / 8; + unsigned int i = 0; + + __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, + fzeroes; + __m256 sine, cosine, tangent, condition1, condition2, condition3; + __m256i q, r, ones, twos, fours; + + m4pi = _mm256_set1_ps(1.273239545); + pio4A = _mm256_set1_ps(0.78515625); + pio4B = _mm256_set1_ps(0.241876e-3); + ffours = _mm256_set1_ps(4.0); + ftwos = _mm256_set1_ps(2.0); + fones = _mm256_set1_ps(1.0); + fzeroes = _mm256_setzero_ps(); + ones = _mm256_set1_epi32(1); + twos = _mm256_set1_epi32(2); + fours = _mm256_set1_epi32(4); + + cp1 = _mm256_set1_ps(1.0); + cp2 = _mm256_set1_ps(0.83333333e-1); + cp3 = _mm256_set1_ps(0.2777778e-2); + cp4 = _mm256_set1_ps(0.49603e-4); + cp5 = _mm256_set1_ps(0.551e-6); + + for (; number < eighthPoints; number++) { + aVal = _mm256_load_ps(aPtr); + s = _mm256_sub_ps(aVal, + _mm256_and_ps(_mm256_mul_ps(aVal, ftwos), + _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS))); + q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi))); + r = _mm256_add_epi32(q, _mm256_and_si256(q, ones)); + + s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4A, s); + s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4B, s); + + s = _mm256_div_ps( + s, + 
_mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction + s = _mm256_mul_ps(s, s); + // Evaluate Taylor series + s = _mm256_mul_ps( + _mm256_fmadd_ps( + _mm256_fmsub_ps( + _mm256_fmadd_ps(_mm256_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2), + s, + cp1), + s); + + for (i = 0; i < 3; i++) { + s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s)); + } + s = _mm256_div_ps(s, ftwos); + + sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s)); + cosine = _mm256_sub_ps(fones, s); + + condition1 = _mm256_cmp_ps( + _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)), + fzeroes, + _CMP_NEQ_UQ); + condition2 = _mm256_cmp_ps( + _mm256_cmp_ps( + _mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes, _CMP_NEQ_UQ), + _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS), + _CMP_NEQ_UQ); + condition3 = _mm256_cmp_ps( + _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, twos), fours)), + fzeroes, + _CMP_NEQ_UQ); + + __m256 temp = cosine; + cosine = + _mm256_add_ps(cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1)); + sine = _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(temp, sine), condition1)); + sine = _mm256_sub_ps( + sine, _mm256_and_ps(_mm256_mul_ps(sine, _mm256_set1_ps(2.0f)), condition2)); + cosine = _mm256_sub_ps( + cosine, + _mm256_and_ps(_mm256_mul_ps(cosine, _mm256_set1_ps(2.0f)), condition3)); + tangent = _mm256_div_ps(sine, cosine); + _mm256_store_ps(bPtr, tangent); + aPtr += 8; + bPtr += 8; + } + + number = eighthPoints * 8; + for (; number < num_points; number++) { + *bPtr++ = tan(*aPtr++); } - s = _mm256_div_ps(s, ftwos); - - sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s)); - cosine = _mm256_sub_ps(fones, s); - - condition1 = _mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)), fzeroes, _CMP_NEQ_UQ); - condition2 = _mm256_cmp_ps(_mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes, _CMP_NEQ_UQ), _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS), _CMP_NEQ_UQ); - condition3 = _mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, twos), fours)), fzeroes, _CMP_NEQ_UQ); - - __m256 temp = cosine; - cosine = _mm256_add_ps(cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1)); - sine = _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(temp, sine), condition1)); - sine = _mm256_sub_ps(sine, _mm256_and_ps(_mm256_mul_ps(sine, _mm256_set1_ps(2.0f)), condition2)); - cosine = _mm256_sub_ps(cosine, _mm256_and_ps(_mm256_mul_ps(cosine, _mm256_set1_ps(2.0f)), condition3)); - tangent = _mm256_div_ps(sine, cosine); - _mm256_store_ps(bPtr, tangent); - aPtr += 8; - bPtr += 8; - } - - number = eighthPoints * 8; - for(;number < num_points; number++){ - *bPtr++ = tan(*aPtr++); - } } #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for aligned */ @@ -162,78 +186,109 @@ volk_32f_tan_32f_a_avx2_fma(float* bVector, const float* aVector, #include static inline void -volk_32f_tan_32f_a_avx2(float* bVector, const float* aVector, - unsigned int num_points) +volk_32f_tan_32f_a_avx2(float* bVector, const float* aVector, unsigned int num_points) { - float* bPtr = bVector; - const float* aPtr = aVector; - - unsigned int number = 0; - unsigned int eighthPoints = num_points / 8; - unsigned int i = 0; - - __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, fzeroes; - __m256 sine, cosine, tangent, condition1, condition2, condition3; - __m256i q, r, ones, twos, fours; - - m4pi = _mm256_set1_ps(1.273239545); - pio4A = _mm256_set1_ps(0.78515625); - pio4B = _mm256_set1_ps(0.241876e-3); - 
ffours = _mm256_set1_ps(4.0); - ftwos = _mm256_set1_ps(2.0); - fones = _mm256_set1_ps(1.0); - fzeroes = _mm256_setzero_ps(); - ones = _mm256_set1_epi32(1); - twos = _mm256_set1_epi32(2); - fours = _mm256_set1_epi32(4); - - cp1 = _mm256_set1_ps(1.0); - cp2 = _mm256_set1_ps(0.83333333e-1); - cp3 = _mm256_set1_ps(0.2777778e-2); - cp4 = _mm256_set1_ps(0.49603e-4); - cp5 = _mm256_set1_ps(0.551e-6); - - for(;number < eighthPoints; number++){ - aVal = _mm256_load_ps(aPtr); - s = _mm256_sub_ps(aVal, _mm256_and_ps(_mm256_mul_ps(aVal, ftwos), _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS))); - q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi))); - r = _mm256_add_epi32(q, _mm256_and_si256(q, ones)); - - s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4A)); - s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4B)); - - s = _mm256_div_ps(s, _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction - s = _mm256_mul_ps(s, s); - // Evaluate Taylor series - s = _mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(s, cp5), cp4), s), cp3), s), cp2), s), cp1), s); - - for(i = 0; i < 3; i++){ - s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s)); + float* bPtr = bVector; + const float* aPtr = aVector; + + unsigned int number = 0; + unsigned int eighthPoints = num_points / 8; + unsigned int i = 0; + + __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, + fzeroes; + __m256 sine, cosine, tangent, condition1, condition2, condition3; + __m256i q, r, ones, twos, fours; + + m4pi = _mm256_set1_ps(1.273239545); + pio4A = _mm256_set1_ps(0.78515625); + pio4B = _mm256_set1_ps(0.241876e-3); + ffours = _mm256_set1_ps(4.0); + ftwos = _mm256_set1_ps(2.0); + fones = _mm256_set1_ps(1.0); + fzeroes = _mm256_setzero_ps(); + ones = _mm256_set1_epi32(1); + twos = _mm256_set1_epi32(2); + fours = _mm256_set1_epi32(4); + + cp1 = _mm256_set1_ps(1.0); + cp2 = _mm256_set1_ps(0.83333333e-1); + cp3 = _mm256_set1_ps(0.2777778e-2); + cp4 = _mm256_set1_ps(0.49603e-4); + cp5 = _mm256_set1_ps(0.551e-6); + + for (; number < eighthPoints; number++) { + aVal = _mm256_load_ps(aPtr); + s = _mm256_sub_ps(aVal, + _mm256_and_ps(_mm256_mul_ps(aVal, ftwos), + _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS))); + q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi))); + r = _mm256_add_epi32(q, _mm256_and_si256(q, ones)); + + s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4A)); + s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4B)); + + s = _mm256_div_ps( + s, + _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction + s = _mm256_mul_ps(s, s); + // Evaluate Taylor series + s = _mm256_mul_ps( + _mm256_add_ps( + _mm256_mul_ps( + _mm256_sub_ps( + _mm256_mul_ps( + _mm256_add_ps( + _mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(s, cp5), cp4), + s), + cp3), + s), + cp2), + s), + cp1), + s); + + for (i = 0; i < 3; i++) { + s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s)); + } + s = _mm256_div_ps(s, ftwos); + + sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s)); + cosine = _mm256_sub_ps(fones, s); + + condition1 = _mm256_cmp_ps( + _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)), + fzeroes, + _CMP_NEQ_UQ); + condition2 = _mm256_cmp_ps( + _mm256_cmp_ps( + _mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes, _CMP_NEQ_UQ), + _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS), + _CMP_NEQ_UQ); + condition3 = _mm256_cmp_ps( + 
_mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, twos), fours)), + fzeroes, + _CMP_NEQ_UQ); + + __m256 temp = cosine; + cosine = + _mm256_add_ps(cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1)); + sine = _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(temp, sine), condition1)); + sine = _mm256_sub_ps( + sine, _mm256_and_ps(_mm256_mul_ps(sine, _mm256_set1_ps(2.0f)), condition2)); + cosine = _mm256_sub_ps( + cosine, + _mm256_and_ps(_mm256_mul_ps(cosine, _mm256_set1_ps(2.0f)), condition3)); + tangent = _mm256_div_ps(sine, cosine); + _mm256_store_ps(bPtr, tangent); + aPtr += 8; + bPtr += 8; + } + + number = eighthPoints * 8; + for (; number < num_points; number++) { + *bPtr++ = tan(*aPtr++); } - s = _mm256_div_ps(s, ftwos); - - sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s)); - cosine = _mm256_sub_ps(fones, s); - - condition1 = _mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)), fzeroes, _CMP_NEQ_UQ); - condition2 = _mm256_cmp_ps(_mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes, _CMP_NEQ_UQ), _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS), _CMP_NEQ_UQ); - condition3 = _mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, twos), fours)), fzeroes, _CMP_NEQ_UQ); - - __m256 temp = cosine; - cosine = _mm256_add_ps(cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1)); - sine = _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(temp, sine), condition1)); - sine = _mm256_sub_ps(sine, _mm256_and_ps(_mm256_mul_ps(sine, _mm256_set1_ps(2.0f)), condition2)); - cosine = _mm256_sub_ps(cosine, _mm256_and_ps(_mm256_mul_ps(cosine, _mm256_set1_ps(2.0f)), condition3)); - tangent = _mm256_div_ps(sine, cosine); - _mm256_store_ps(bPtr, tangent); - aPtr += 8; - bPtr += 8; - } - - number = eighthPoints * 8; - for(;number < num_points; number++){ - *bPtr++ = tan(*aPtr++); - } } #endif /* LV_HAVE_AVX2 for aligned */ @@ -242,78 +297,97 @@ volk_32f_tan_32f_a_avx2(float* bVector, const float* aVector, #include static inline void -volk_32f_tan_32f_a_sse4_1(float* bVector, const float* aVector, - unsigned int num_points) +volk_32f_tan_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int num_points) { - float* bPtr = bVector; - const float* aPtr = aVector; - - unsigned int number = 0; - unsigned int quarterPoints = num_points / 4; - unsigned int i = 0; - - __m128 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, fzeroes; - __m128 sine, cosine, tangent, condition1, condition2, condition3; - __m128i q, r, ones, twos, fours; - - m4pi = _mm_set1_ps(1.273239545); - pio4A = _mm_set1_ps(0.78515625); - pio4B = _mm_set1_ps(0.241876e-3); - ffours = _mm_set1_ps(4.0); - ftwos = _mm_set1_ps(2.0); - fones = _mm_set1_ps(1.0); - fzeroes = _mm_setzero_ps(); - ones = _mm_set1_epi32(1); - twos = _mm_set1_epi32(2); - fours = _mm_set1_epi32(4); - - cp1 = _mm_set1_ps(1.0); - cp2 = _mm_set1_ps(0.83333333e-1); - cp3 = _mm_set1_ps(0.2777778e-2); - cp4 = _mm_set1_ps(0.49603e-4); - cp5 = _mm_set1_ps(0.551e-6); - - for(;number < quarterPoints; number++){ - aVal = _mm_load_ps(aPtr); - s = _mm_sub_ps(aVal, _mm_and_ps(_mm_mul_ps(aVal, ftwos), _mm_cmplt_ps(aVal, fzeroes))); - q = _mm_cvtps_epi32(_mm_floor_ps(_mm_mul_ps(s, m4pi))); - r = _mm_add_epi32(q, _mm_and_si128(q, ones)); - - s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4A)); - s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4B)); - - s = _mm_div_ps(s, _mm_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction - s = _mm_mul_ps(s, 
s); - // Evaluate Taylor series - s = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s, cp5), cp4), s), cp3), s), cp2), s), cp1), s); - - for(i = 0; i < 3; i++){ - s = _mm_mul_ps(s, _mm_sub_ps(ffours, s)); + float* bPtr = bVector; + const float* aPtr = aVector; + + unsigned int number = 0; + unsigned int quarterPoints = num_points / 4; + unsigned int i = 0; + + __m128 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, + fzeroes; + __m128 sine, cosine, tangent, condition1, condition2, condition3; + __m128i q, r, ones, twos, fours; + + m4pi = _mm_set1_ps(1.273239545); + pio4A = _mm_set1_ps(0.78515625); + pio4B = _mm_set1_ps(0.241876e-3); + ffours = _mm_set1_ps(4.0); + ftwos = _mm_set1_ps(2.0); + fones = _mm_set1_ps(1.0); + fzeroes = _mm_setzero_ps(); + ones = _mm_set1_epi32(1); + twos = _mm_set1_epi32(2); + fours = _mm_set1_epi32(4); + + cp1 = _mm_set1_ps(1.0); + cp2 = _mm_set1_ps(0.83333333e-1); + cp3 = _mm_set1_ps(0.2777778e-2); + cp4 = _mm_set1_ps(0.49603e-4); + cp5 = _mm_set1_ps(0.551e-6); + + for (; number < quarterPoints; number++) { + aVal = _mm_load_ps(aPtr); + s = _mm_sub_ps(aVal, + _mm_and_ps(_mm_mul_ps(aVal, ftwos), _mm_cmplt_ps(aVal, fzeroes))); + q = _mm_cvtps_epi32(_mm_floor_ps(_mm_mul_ps(s, m4pi))); + r = _mm_add_epi32(q, _mm_and_si128(q, ones)); + + s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4A)); + s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4B)); + + s = _mm_div_ps( + s, _mm_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction + s = _mm_mul_ps(s, s); + // Evaluate Taylor series + s = _mm_mul_ps( + _mm_add_ps( + _mm_mul_ps( + _mm_sub_ps( + _mm_mul_ps( + _mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s, cp5), cp4), s), + cp3), + s), + cp2), + s), + cp1), + s); + + for (i = 0; i < 3; i++) { + s = _mm_mul_ps(s, _mm_sub_ps(ffours, s)); + } + s = _mm_div_ps(s, ftwos); + + sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s)); + cosine = _mm_sub_ps(fones, s); + + condition1 = _mm_cmpneq_ps( + _mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, ones), twos)), fzeroes); + condition2 = _mm_cmpneq_ps( + _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(q, fours)), fzeroes), + _mm_cmplt_ps(aVal, fzeroes)); + condition3 = _mm_cmpneq_ps( + _mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, twos), fours)), fzeroes); + + __m128 temp = cosine; + cosine = _mm_add_ps(cosine, _mm_and_ps(_mm_sub_ps(sine, cosine), condition1)); + sine = _mm_add_ps(sine, _mm_and_ps(_mm_sub_ps(temp, sine), condition1)); + sine = + _mm_sub_ps(sine, _mm_and_ps(_mm_mul_ps(sine, _mm_set1_ps(2.0f)), condition2)); + cosine = _mm_sub_ps( + cosine, _mm_and_ps(_mm_mul_ps(cosine, _mm_set1_ps(2.0f)), condition3)); + tangent = _mm_div_ps(sine, cosine); + _mm_store_ps(bPtr, tangent); + aPtr += 4; + bPtr += 4; + } + + number = quarterPoints * 4; + for (; number < num_points; number++) { + *bPtr++ = tanf(*aPtr++); } - s = _mm_div_ps(s, ftwos); - - sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s)); - cosine = _mm_sub_ps(fones, s); - - condition1 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, ones), twos)), fzeroes); - condition2 = _mm_cmpneq_ps(_mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(q, fours)), fzeroes), _mm_cmplt_ps(aVal, fzeroes)); - condition3 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, twos), fours)), fzeroes); - - __m128 temp = cosine; - cosine = _mm_add_ps(cosine, _mm_and_ps(_mm_sub_ps(sine, cosine), condition1)); - sine = _mm_add_ps(sine, _mm_and_ps(_mm_sub_ps(temp, sine), 
condition1)); - sine = _mm_sub_ps(sine, _mm_and_ps(_mm_mul_ps(sine, _mm_set1_ps(2.0f)), condition2)); - cosine = _mm_sub_ps(cosine, _mm_and_ps(_mm_mul_ps(cosine, _mm_set1_ps(2.0f)), condition3)); - tangent = _mm_div_ps(sine, cosine); - _mm_store_ps(bPtr, tangent); - aPtr += 4; - bPtr += 4; - } - - number = quarterPoints * 4; - for(;number < num_points; number++){ - *bPtr++ = tanf(*aPtr++); - } } #endif /* LV_HAVE_SSE4_1 for aligned */ @@ -328,78 +402,102 @@ volk_32f_tan_32f_a_sse4_1(float* bVector, const float* aVector, #include static inline void -volk_32f_tan_32f_u_avx2_fma(float* bVector, const float* aVector, - unsigned int num_points) +volk_32f_tan_32f_u_avx2_fma(float* bVector, const float* aVector, unsigned int num_points) { - float* bPtr = bVector; - const float* aPtr = aVector; - - unsigned int number = 0; - unsigned int eighthPoints = num_points / 8; - unsigned int i = 0; - - __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, fzeroes; - __m256 sine, cosine, tangent, condition1, condition2, condition3; - __m256i q, r, ones, twos, fours; - - m4pi = _mm256_set1_ps(1.273239545); - pio4A = _mm256_set1_ps(0.78515625); - pio4B = _mm256_set1_ps(0.241876e-3); - ffours = _mm256_set1_ps(4.0); - ftwos = _mm256_set1_ps(2.0); - fones = _mm256_set1_ps(1.0); - fzeroes = _mm256_setzero_ps(); - ones = _mm256_set1_epi32(1); - twos = _mm256_set1_epi32(2); - fours = _mm256_set1_epi32(4); - - cp1 = _mm256_set1_ps(1.0); - cp2 = _mm256_set1_ps(0.83333333e-1); - cp3 = _mm256_set1_ps(0.2777778e-2); - cp4 = _mm256_set1_ps(0.49603e-4); - cp5 = _mm256_set1_ps(0.551e-6); - - for(;number < eighthPoints; number++){ - aVal = _mm256_loadu_ps(aPtr); - s = _mm256_sub_ps(aVal, _mm256_and_ps(_mm256_mul_ps(aVal, ftwos), _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS))); - q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi))); - r = _mm256_add_epi32(q, _mm256_and_si256(q, ones)); - - s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4A, s); - s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4B, s); - - s = _mm256_div_ps(s, _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction - s = _mm256_mul_ps(s, s); - // Evaluate Taylor series - s = _mm256_mul_ps(_mm256_fmadd_ps(_mm256_fmsub_ps(_mm256_fmadd_ps(_mm256_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2), s, cp1), s); - - for(i = 0; i < 3; i++){ - s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s)); + float* bPtr = bVector; + const float* aPtr = aVector; + + unsigned int number = 0; + unsigned int eighthPoints = num_points / 8; + unsigned int i = 0; + + __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, + fzeroes; + __m256 sine, cosine, tangent, condition1, condition2, condition3; + __m256i q, r, ones, twos, fours; + + m4pi = _mm256_set1_ps(1.273239545); + pio4A = _mm256_set1_ps(0.78515625); + pio4B = _mm256_set1_ps(0.241876e-3); + ffours = _mm256_set1_ps(4.0); + ftwos = _mm256_set1_ps(2.0); + fones = _mm256_set1_ps(1.0); + fzeroes = _mm256_setzero_ps(); + ones = _mm256_set1_epi32(1); + twos = _mm256_set1_epi32(2); + fours = _mm256_set1_epi32(4); + + cp1 = _mm256_set1_ps(1.0); + cp2 = _mm256_set1_ps(0.83333333e-1); + cp3 = _mm256_set1_ps(0.2777778e-2); + cp4 = _mm256_set1_ps(0.49603e-4); + cp5 = _mm256_set1_ps(0.551e-6); + + for (; number < eighthPoints; number++) { + aVal = _mm256_loadu_ps(aPtr); + s = _mm256_sub_ps(aVal, + _mm256_and_ps(_mm256_mul_ps(aVal, ftwos), + _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS))); + q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi))); + r = _mm256_add_epi32(q, 
_mm256_and_si256(q, ones)); + + s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4A, s); + s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4B, s); + + s = _mm256_div_ps( + s, + _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction + s = _mm256_mul_ps(s, s); + // Evaluate Taylor series + s = _mm256_mul_ps( + _mm256_fmadd_ps( + _mm256_fmsub_ps( + _mm256_fmadd_ps(_mm256_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2), + s, + cp1), + s); + + for (i = 0; i < 3; i++) { + s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s)); + } + s = _mm256_div_ps(s, ftwos); + + sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s)); + cosine = _mm256_sub_ps(fones, s); + + condition1 = _mm256_cmp_ps( + _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)), + fzeroes, + _CMP_NEQ_UQ); + condition2 = _mm256_cmp_ps( + _mm256_cmp_ps( + _mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes, _CMP_NEQ_UQ), + _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS), + _CMP_NEQ_UQ); + condition3 = _mm256_cmp_ps( + _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, twos), fours)), + fzeroes, + _CMP_NEQ_UQ); + + __m256 temp = cosine; + cosine = + _mm256_add_ps(cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1)); + sine = _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(temp, sine), condition1)); + sine = _mm256_sub_ps( + sine, _mm256_and_ps(_mm256_mul_ps(sine, _mm256_set1_ps(2.0f)), condition2)); + cosine = _mm256_sub_ps( + cosine, + _mm256_and_ps(_mm256_mul_ps(cosine, _mm256_set1_ps(2.0f)), condition3)); + tangent = _mm256_div_ps(sine, cosine); + _mm256_storeu_ps(bPtr, tangent); + aPtr += 8; + bPtr += 8; + } + + number = eighthPoints * 8; + for (; number < num_points; number++) { + *bPtr++ = tan(*aPtr++); } - s = _mm256_div_ps(s, ftwos); - - sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s)); - cosine = _mm256_sub_ps(fones, s); - - condition1 = _mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)), fzeroes, _CMP_NEQ_UQ); - condition2 = _mm256_cmp_ps(_mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes, _CMP_NEQ_UQ), _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS), _CMP_NEQ_UQ); - condition3 = _mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, twos), fours)), fzeroes, _CMP_NEQ_UQ); - - __m256 temp = cosine; - cosine = _mm256_add_ps(cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1)); - sine = _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(temp, sine), condition1)); - sine = _mm256_sub_ps(sine, _mm256_and_ps(_mm256_mul_ps(sine, _mm256_set1_ps(2.0f)), condition2)); - cosine = _mm256_sub_ps(cosine, _mm256_and_ps(_mm256_mul_ps(cosine, _mm256_set1_ps(2.0f)), condition3)); - tangent = _mm256_div_ps(sine, cosine); - _mm256_storeu_ps(bPtr, tangent); - aPtr += 8; - bPtr += 8; - } - - number = eighthPoints * 8; - for(;number < num_points; number++){ - *bPtr++ = tan(*aPtr++); - } } #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for unaligned */ @@ -408,78 +506,109 @@ volk_32f_tan_32f_u_avx2_fma(float* bVector, const float* aVector, #include static inline void -volk_32f_tan_32f_u_avx2(float* bVector, const float* aVector, - unsigned int num_points) +volk_32f_tan_32f_u_avx2(float* bVector, const float* aVector, unsigned int num_points) { - float* bPtr = bVector; - const float* aPtr = aVector; - - unsigned int number = 0; - unsigned int eighthPoints = num_points / 8; - unsigned int i = 0; - - __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, fzeroes; - __m256 sine, cosine, tangent, condition1, 
condition2, condition3; - __m256i q, r, ones, twos, fours; - - m4pi = _mm256_set1_ps(1.273239545); - pio4A = _mm256_set1_ps(0.78515625); - pio4B = _mm256_set1_ps(0.241876e-3); - ffours = _mm256_set1_ps(4.0); - ftwos = _mm256_set1_ps(2.0); - fones = _mm256_set1_ps(1.0); - fzeroes = _mm256_setzero_ps(); - ones = _mm256_set1_epi32(1); - twos = _mm256_set1_epi32(2); - fours = _mm256_set1_epi32(4); - - cp1 = _mm256_set1_ps(1.0); - cp2 = _mm256_set1_ps(0.83333333e-1); - cp3 = _mm256_set1_ps(0.2777778e-2); - cp4 = _mm256_set1_ps(0.49603e-4); - cp5 = _mm256_set1_ps(0.551e-6); - - for(;number < eighthPoints; number++){ - aVal = _mm256_loadu_ps(aPtr); - s = _mm256_sub_ps(aVal, _mm256_and_ps(_mm256_mul_ps(aVal, ftwos), _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS))); - q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi))); - r = _mm256_add_epi32(q, _mm256_and_si256(q, ones)); - - s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4A)); - s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4B)); - - s = _mm256_div_ps(s, _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction - s = _mm256_mul_ps(s, s); - // Evaluate Taylor series - s = _mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(s, cp5), cp4), s), cp3), s), cp2), s), cp1), s); - - for(i = 0; i < 3; i++){ - s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s)); + float* bPtr = bVector; + const float* aPtr = aVector; + + unsigned int number = 0; + unsigned int eighthPoints = num_points / 8; + unsigned int i = 0; + + __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, + fzeroes; + __m256 sine, cosine, tangent, condition1, condition2, condition3; + __m256i q, r, ones, twos, fours; + + m4pi = _mm256_set1_ps(1.273239545); + pio4A = _mm256_set1_ps(0.78515625); + pio4B = _mm256_set1_ps(0.241876e-3); + ffours = _mm256_set1_ps(4.0); + ftwos = _mm256_set1_ps(2.0); + fones = _mm256_set1_ps(1.0); + fzeroes = _mm256_setzero_ps(); + ones = _mm256_set1_epi32(1); + twos = _mm256_set1_epi32(2); + fours = _mm256_set1_epi32(4); + + cp1 = _mm256_set1_ps(1.0); + cp2 = _mm256_set1_ps(0.83333333e-1); + cp3 = _mm256_set1_ps(0.2777778e-2); + cp4 = _mm256_set1_ps(0.49603e-4); + cp5 = _mm256_set1_ps(0.551e-6); + + for (; number < eighthPoints; number++) { + aVal = _mm256_loadu_ps(aPtr); + s = _mm256_sub_ps(aVal, + _mm256_and_ps(_mm256_mul_ps(aVal, ftwos), + _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS))); + q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi))); + r = _mm256_add_epi32(q, _mm256_and_si256(q, ones)); + + s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4A)); + s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4B)); + + s = _mm256_div_ps( + s, + _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction + s = _mm256_mul_ps(s, s); + // Evaluate Taylor series + s = _mm256_mul_ps( + _mm256_add_ps( + _mm256_mul_ps( + _mm256_sub_ps( + _mm256_mul_ps( + _mm256_add_ps( + _mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(s, cp5), cp4), + s), + cp3), + s), + cp2), + s), + cp1), + s); + + for (i = 0; i < 3; i++) { + s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s)); + } + s = _mm256_div_ps(s, ftwos); + + sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s)); + cosine = _mm256_sub_ps(fones, s); + + condition1 = _mm256_cmp_ps( + _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)), + fzeroes, + _CMP_NEQ_UQ); + condition2 = _mm256_cmp_ps( + _mm256_cmp_ps( + 
_mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes, _CMP_NEQ_UQ), + _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS), + _CMP_NEQ_UQ); + condition3 = _mm256_cmp_ps( + _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, twos), fours)), + fzeroes, + _CMP_NEQ_UQ); + + __m256 temp = cosine; + cosine = + _mm256_add_ps(cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1)); + sine = _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(temp, sine), condition1)); + sine = _mm256_sub_ps( + sine, _mm256_and_ps(_mm256_mul_ps(sine, _mm256_set1_ps(2.0f)), condition2)); + cosine = _mm256_sub_ps( + cosine, + _mm256_and_ps(_mm256_mul_ps(cosine, _mm256_set1_ps(2.0f)), condition3)); + tangent = _mm256_div_ps(sine, cosine); + _mm256_storeu_ps(bPtr, tangent); + aPtr += 8; + bPtr += 8; + } + + number = eighthPoints * 8; + for (; number < num_points; number++) { + *bPtr++ = tan(*aPtr++); } - s = _mm256_div_ps(s, ftwos); - - sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s)); - cosine = _mm256_sub_ps(fones, s); - - condition1 = _mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)), fzeroes, _CMP_NEQ_UQ); - condition2 = _mm256_cmp_ps(_mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes, _CMP_NEQ_UQ), _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS), _CMP_NEQ_UQ); - condition3 = _mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, twos), fours)), fzeroes, _CMP_NEQ_UQ); - - __m256 temp = cosine; - cosine = _mm256_add_ps(cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1)); - sine = _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(temp, sine), condition1)); - sine = _mm256_sub_ps(sine, _mm256_and_ps(_mm256_mul_ps(sine, _mm256_set1_ps(2.0f)), condition2)); - cosine = _mm256_sub_ps(cosine, _mm256_and_ps(_mm256_mul_ps(cosine, _mm256_set1_ps(2.0f)), condition3)); - tangent = _mm256_div_ps(sine, cosine); - _mm256_storeu_ps(bPtr, tangent); - aPtr += 8; - bPtr += 8; - } - - number = eighthPoints * 8; - for(;number < num_points; number++){ - *bPtr++ = tan(*aPtr++); - } } #endif /* LV_HAVE_AVX2 for unaligned */ @@ -491,75 +620,95 @@ volk_32f_tan_32f_u_avx2(float* bVector, const float* aVector, static inline void volk_32f_tan_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int num_points) { - float* bPtr = bVector; - const float* aPtr = aVector; - - unsigned int number = 0; - unsigned int quarterPoints = num_points / 4; - unsigned int i = 0; - - __m128 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, fzeroes; - __m128 sine, cosine, tangent, condition1, condition2, condition3; - __m128i q, r, ones, twos, fours; - - m4pi = _mm_set1_ps(1.273239545); - pio4A = _mm_set1_ps(0.78515625); - pio4B = _mm_set1_ps(0.241876e-3); - ffours = _mm_set1_ps(4.0); - ftwos = _mm_set1_ps(2.0); - fones = _mm_set1_ps(1.0); - fzeroes = _mm_setzero_ps(); - ones = _mm_set1_epi32(1); - twos = _mm_set1_epi32(2); - fours = _mm_set1_epi32(4); - - cp1 = _mm_set1_ps(1.0); - cp2 = _mm_set1_ps(0.83333333e-1); - cp3 = _mm_set1_ps(0.2777778e-2); - cp4 = _mm_set1_ps(0.49603e-4); - cp5 = _mm_set1_ps(0.551e-6); - - for(;number < quarterPoints; number++){ - aVal = _mm_loadu_ps(aPtr); - s = _mm_sub_ps(aVal, _mm_and_ps(_mm_mul_ps(aVal, ftwos), _mm_cmplt_ps(aVal, fzeroes))); - q = _mm_cvtps_epi32(_mm_floor_ps(_mm_mul_ps(s, m4pi))); - r = _mm_add_epi32(q, _mm_and_si128(q, ones)); - - s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4A)); - s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4B)); - - s = _mm_div_ps(s, _mm_set1_ps(8.0)); // The 
constant is 2^N, for 3 times argument reduction - s = _mm_mul_ps(s, s); - // Evaluate Taylor series - s = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s, cp5), cp4), s), cp3), s), cp2), s), cp1), s); - - for(i = 0; i < 3; i++){ - s = _mm_mul_ps(s, _mm_sub_ps(ffours, s)); + float* bPtr = bVector; + const float* aPtr = aVector; + + unsigned int number = 0; + unsigned int quarterPoints = num_points / 4; + unsigned int i = 0; + + __m128 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, + fzeroes; + __m128 sine, cosine, tangent, condition1, condition2, condition3; + __m128i q, r, ones, twos, fours; + + m4pi = _mm_set1_ps(1.273239545); + pio4A = _mm_set1_ps(0.78515625); + pio4B = _mm_set1_ps(0.241876e-3); + ffours = _mm_set1_ps(4.0); + ftwos = _mm_set1_ps(2.0); + fones = _mm_set1_ps(1.0); + fzeroes = _mm_setzero_ps(); + ones = _mm_set1_epi32(1); + twos = _mm_set1_epi32(2); + fours = _mm_set1_epi32(4); + + cp1 = _mm_set1_ps(1.0); + cp2 = _mm_set1_ps(0.83333333e-1); + cp3 = _mm_set1_ps(0.2777778e-2); + cp4 = _mm_set1_ps(0.49603e-4); + cp5 = _mm_set1_ps(0.551e-6); + + for (; number < quarterPoints; number++) { + aVal = _mm_loadu_ps(aPtr); + s = _mm_sub_ps(aVal, + _mm_and_ps(_mm_mul_ps(aVal, ftwos), _mm_cmplt_ps(aVal, fzeroes))); + q = _mm_cvtps_epi32(_mm_floor_ps(_mm_mul_ps(s, m4pi))); + r = _mm_add_epi32(q, _mm_and_si128(q, ones)); + + s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4A)); + s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4B)); + + s = _mm_div_ps( + s, _mm_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction + s = _mm_mul_ps(s, s); + // Evaluate Taylor series + s = _mm_mul_ps( + _mm_add_ps( + _mm_mul_ps( + _mm_sub_ps( + _mm_mul_ps( + _mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s, cp5), cp4), s), + cp3), + s), + cp2), + s), + cp1), + s); + + for (i = 0; i < 3; i++) { + s = _mm_mul_ps(s, _mm_sub_ps(ffours, s)); + } + s = _mm_div_ps(s, ftwos); + + sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s)); + cosine = _mm_sub_ps(fones, s); + + condition1 = _mm_cmpneq_ps( + _mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, ones), twos)), fzeroes); + condition2 = _mm_cmpneq_ps( + _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(q, fours)), fzeroes), + _mm_cmplt_ps(aVal, fzeroes)); + condition3 = _mm_cmpneq_ps( + _mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, twos), fours)), fzeroes); + + __m128 temp = cosine; + cosine = _mm_add_ps(cosine, _mm_and_ps(_mm_sub_ps(sine, cosine), condition1)); + sine = _mm_add_ps(sine, _mm_and_ps(_mm_sub_ps(temp, sine), condition1)); + sine = + _mm_sub_ps(sine, _mm_and_ps(_mm_mul_ps(sine, _mm_set1_ps(2.0f)), condition2)); + cosine = _mm_sub_ps( + cosine, _mm_and_ps(_mm_mul_ps(cosine, _mm_set1_ps(2.0f)), condition3)); + tangent = _mm_div_ps(sine, cosine); + _mm_storeu_ps(bPtr, tangent); + aPtr += 4; + bPtr += 4; + } + + number = quarterPoints * 4; + for (; number < num_points; number++) { + *bPtr++ = tanf(*aPtr++); } - s = _mm_div_ps(s, ftwos); - - sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s)); - cosine = _mm_sub_ps(fones, s); - - condition1 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, ones), twos)), fzeroes); - condition2 = _mm_cmpneq_ps(_mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(q, fours)), fzeroes), _mm_cmplt_ps(aVal, fzeroes)); - condition3 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, twos), fours)), fzeroes); - - __m128 temp = cosine; - cosine = _mm_add_ps(cosine, _mm_and_ps(_mm_sub_ps(sine, cosine), condition1)); - 
sine = _mm_add_ps(sine, _mm_and_ps(_mm_sub_ps(temp, sine), condition1)); - sine = _mm_sub_ps(sine, _mm_and_ps(_mm_mul_ps(sine, _mm_set1_ps(2.0f)), condition2)); - cosine = _mm_sub_ps(cosine, _mm_and_ps(_mm_mul_ps(cosine, _mm_set1_ps(2.0f)), condition3)); - tangent = _mm_div_ps(sine, cosine); - _mm_storeu_ps(bPtr, tangent); - aPtr += 4; - bPtr += 4; - } - - number = quarterPoints * 4; - for(;number < num_points; number++){ - *bPtr++ = tanf(*aPtr++); - } } #endif /* LV_HAVE_SSE4_1 for unaligned */ @@ -568,16 +717,15 @@ volk_32f_tan_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int num #ifdef LV_HAVE_GENERIC static inline void -volk_32f_tan_32f_generic(float* bVector, const float* aVector, - unsigned int num_points) +volk_32f_tan_32f_generic(float* bVector, const float* aVector, unsigned int num_points) { - float* bPtr = bVector; - const float* aPtr = aVector; - unsigned int number = 0; + float* bPtr = bVector; + const float* aPtr = aVector; + unsigned int number = 0; - for(; number < num_points; number++){ - *bPtr++ = tanf(*aPtr++); - } + for (; number < num_points; number++) { + *bPtr++ = tanf(*aPtr++); + } } #endif /* LV_HAVE_GENERIC */ @@ -587,30 +735,29 @@ volk_32f_tan_32f_generic(float* bVector, const float* aVector, #include static inline void -volk_32f_tan_32f_neon(float* bVector, const float* aVector, - unsigned int num_points) +volk_32f_tan_32f_neon(float* bVector, const float* aVector, unsigned int num_points) { unsigned int number = 0; unsigned int quarter_points = num_points / 4; float* bVectorPtr = bVector; const float* aVectorPtr = aVector; - + float32x4_t b_vec; float32x4_t a_vec; - - for(number = 0; number < quarter_points; number++) { + + for (number = 0; number < quarter_points; number++) { a_vec = vld1q_f32(aVectorPtr); // Prefetch next one, speeds things up - __VOLK_PREFETCH(aVectorPtr+4); + __VOLK_PREFETCH(aVectorPtr + 4); b_vec = _vtanq_f32(a_vec); vst1q_f32(bVectorPtr, b_vec); // move pointers ahead - bVectorPtr+=4; - aVectorPtr+=4; + bVectorPtr += 4; + aVectorPtr += 4; } - + // Deal with the rest - for(number = quarter_points * 4; number < num_points; number++) { + for (number = quarter_points * 4; number < num_points; number++) { *bVectorPtr++ = tanf(*aVectorPtr++); } } diff --git a/kernels/volk/volk_32f_tanh_32f.h b/kernels/volk/volk_32f_tanh_32f.h index d49432d..f157d39 100644 --- a/kernels/volk/volk_32f_tanh_32f.h +++ b/kernels/volk/volk_32f_tanh_32f.h @@ -69,22 +69,21 @@ #define INCLUDED_volk_32f_tanh_32f_a_H #include -#include #include +#include #include #ifdef LV_HAVE_GENERIC static inline void -volk_32f_tanh_32f_generic(float* cVector, const float* aVector, - unsigned int num_points) +volk_32f_tanh_32f_generic(float* cVector, const float* aVector, unsigned int num_points) { - unsigned int number = 0; - float* cPtr = cVector; - const float* aPtr = aVector; - for(; number < num_points; number++) { - *cPtr++ = tanhf(*aPtr++); - } + unsigned int number = 0; + float* cPtr = cVector; + const float* aPtr = aVector; + for (; number < num_points; number++) { + *cPtr++ = tanhf(*aPtr++); + } } #endif /* LV_HAVE_GENERIC */ @@ -93,81 +92,88 @@ volk_32f_tanh_32f_generic(float* cVector, const float* aVector, #ifdef LV_HAVE_GENERIC static inline void -volk_32f_tanh_32f_series(float* cVector, const float* aVector, - unsigned int num_points) +volk_32f_tanh_32f_series(float* cVector, const float* aVector, unsigned int num_points) { - unsigned int number = 0; - float* cPtr = cVector; - const float* aPtr = aVector; - for(; number < num_points; number++) { - if(*aPtr 
> 4.97) - *cPtr++ = 1; - else if(*aPtr <= -4.97) - *cPtr++ = -1; - else { - float x2 = (*aPtr) * (*aPtr); - float a = (*aPtr) * (135135.0f + x2 * (17325.0f + x2 * (378.0f + x2))); - float b = 135135.0f + x2 * (62370.0f + x2 * (3150.0f + x2 * 28.0f)); - *cPtr++ = a / b; - aPtr++; + unsigned int number = 0; + float* cPtr = cVector; + const float* aPtr = aVector; + for (; number < num_points; number++) { + if (*aPtr > 4.97) + *cPtr++ = 1; + else if (*aPtr <= -4.97) + *cPtr++ = -1; + else { + float x2 = (*aPtr) * (*aPtr); + float a = (*aPtr) * (135135.0f + x2 * (17325.0f + x2 * (378.0f + x2))); + float b = 135135.0f + x2 * (62370.0f + x2 * (3150.0f + x2 * 28.0f)); + *cPtr++ = a / b; + aPtr++; + } } - } } #endif /* LV_HAVE_GENERIC */ - #ifdef LV_HAVE_SSE #include static inline void -volk_32f_tanh_32f_a_sse(float* cVector, const float* aVector, - unsigned int num_points) +volk_32f_tanh_32f_a_sse(float* cVector, const float* aVector, unsigned int num_points) { - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - float* cPtr = cVector; - const float* aPtr = aVector; - - __m128 aVal, cVal, x2, a, b; - __m128 const1, const2, const3, const4, const5, const6; - const1 = _mm_set_ps1(135135.0f); - const2 = _mm_set_ps1(17325.0f); - const3 = _mm_set_ps1(378.0f); - const4 = _mm_set_ps1(62370.0f); - const5 = _mm_set_ps1(3150.0f); - const6 = _mm_set_ps1(28.0f); - for(;number < quarterPoints; number++){ - - aVal = _mm_load_ps(aPtr); - x2 = _mm_mul_ps(aVal, aVal); - a = _mm_mul_ps(aVal, _mm_add_ps(const1, _mm_mul_ps(x2, _mm_add_ps(const2, _mm_mul_ps(x2, _mm_add_ps(const3, x2)))))); - b = _mm_add_ps(const1, _mm_mul_ps(x2, _mm_add_ps(const4, _mm_mul_ps(x2, _mm_add_ps(const5, _mm_mul_ps(x2, const6)))))); - - cVal = _mm_div_ps(a, b); - - _mm_store_ps(cPtr, cVal); // Store the results back into the C container - - aPtr += 4; - cPtr += 4; - } - - number = quarterPoints * 4; - for(;number < num_points; number++) { - if(*aPtr > 4.97) - *cPtr++ = 1; - else if(*aPtr <= -4.97) - *cPtr++ = -1; - else { - float x2 = (*aPtr) * (*aPtr); - float a = (*aPtr) * (135135.0f + x2 * (17325.0f + x2 * (378.0f + x2))); - float b = 135135.0f + x2 * (62370.0f + x2 * (3150.0f + x2 * 28.0f)); - *cPtr++ = a / b; - aPtr++; + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + float* cPtr = cVector; + const float* aPtr = aVector; + + __m128 aVal, cVal, x2, a, b; + __m128 const1, const2, const3, const4, const5, const6; + const1 = _mm_set_ps1(135135.0f); + const2 = _mm_set_ps1(17325.0f); + const3 = _mm_set_ps1(378.0f); + const4 = _mm_set_ps1(62370.0f); + const5 = _mm_set_ps1(3150.0f); + const6 = _mm_set_ps1(28.0f); + for (; number < quarterPoints; number++) { + + aVal = _mm_load_ps(aPtr); + x2 = _mm_mul_ps(aVal, aVal); + a = _mm_mul_ps( + aVal, + _mm_add_ps( + const1, + _mm_mul_ps(x2, + _mm_add_ps(const2, _mm_mul_ps(x2, _mm_add_ps(const3, x2)))))); + b = _mm_add_ps( + const1, + _mm_mul_ps( + x2, + _mm_add_ps(const4, + _mm_mul_ps(x2, _mm_add_ps(const5, _mm_mul_ps(x2, const6)))))); + + cVal = _mm_div_ps(a, b); + + _mm_store_ps(cPtr, cVal); // Store the results back into the C container + + aPtr += 4; + cPtr += 4; + } + + number = quarterPoints * 4; + for (; number < num_points; number++) { + if (*aPtr > 4.97) + *cPtr++ = 1; + else if (*aPtr <= -4.97) + *cPtr++ = -1; + else { + float x2 = (*aPtr) * (*aPtr); + float a = (*aPtr) * (135135.0f + x2 * (17325.0f + x2 * (378.0f + x2))); + float b = 135135.0f + x2 * (62370.0f + x2 * (3150.0f + x2 * 28.0f)); + *cPtr++ = a / b; + aPtr++; + } } - } 
} #endif /* LV_HAVE_SSE */ @@ -176,52 +182,65 @@ volk_32f_tanh_32f_a_sse(float* cVector, const float* aVector, #include static inline void -volk_32f_tanh_32f_a_avx(float* cVector, const float* aVector, - unsigned int num_points) +volk_32f_tanh_32f_a_avx(float* cVector, const float* aVector, unsigned int num_points) { - unsigned int number = 0; - const unsigned int eighthPoints = num_points / 8; - - float* cPtr = cVector; - const float* aPtr = aVector; - - __m256 aVal, cVal, x2, a, b; - __m256 const1, const2, const3, const4, const5, const6; - const1 = _mm256_set1_ps(135135.0f); - const2 = _mm256_set1_ps(17325.0f); - const3 = _mm256_set1_ps(378.0f); - const4 = _mm256_set1_ps(62370.0f); - const5 = _mm256_set1_ps(3150.0f); - const6 = _mm256_set1_ps(28.0f); - for(;number < eighthPoints; number++){ - - aVal = _mm256_load_ps(aPtr); - x2 = _mm256_mul_ps(aVal, aVal); - a = _mm256_mul_ps(aVal, _mm256_add_ps(const1, _mm256_mul_ps(x2, _mm256_add_ps(const2, _mm256_mul_ps(x2, _mm256_add_ps(const3, x2)))))); - b = _mm256_add_ps(const1, _mm256_mul_ps(x2, _mm256_add_ps(const4, _mm256_mul_ps(x2, _mm256_add_ps(const5, _mm256_mul_ps(x2, const6)))))); - - cVal = _mm256_div_ps(a, b); - - _mm256_store_ps(cPtr, cVal); // Store the results back into the C container - - aPtr += 8; - cPtr += 8; - } - - number = eighthPoints * 8; - for(;number < num_points; number++) { - if(*aPtr > 4.97) - *cPtr++ = 1; - else if(*aPtr <= -4.97) - *cPtr++ = -1; - else { - float x2 = (*aPtr) * (*aPtr); - float a = (*aPtr) * (135135.0f + x2 * (17325.0f + x2 * (378.0f + x2))); - float b = 135135.0f + x2 * (62370.0f + x2 * (3150.0f + x2 * 28.0f)); - *cPtr++ = a / b; - aPtr++; + unsigned int number = 0; + const unsigned int eighthPoints = num_points / 8; + + float* cPtr = cVector; + const float* aPtr = aVector; + + __m256 aVal, cVal, x2, a, b; + __m256 const1, const2, const3, const4, const5, const6; + const1 = _mm256_set1_ps(135135.0f); + const2 = _mm256_set1_ps(17325.0f); + const3 = _mm256_set1_ps(378.0f); + const4 = _mm256_set1_ps(62370.0f); + const5 = _mm256_set1_ps(3150.0f); + const6 = _mm256_set1_ps(28.0f); + for (; number < eighthPoints; number++) { + + aVal = _mm256_load_ps(aPtr); + x2 = _mm256_mul_ps(aVal, aVal); + a = _mm256_mul_ps( + aVal, + _mm256_add_ps( + const1, + _mm256_mul_ps( + x2, + _mm256_add_ps(const2, + _mm256_mul_ps(x2, _mm256_add_ps(const3, x2)))))); + b = _mm256_add_ps( + const1, + _mm256_mul_ps( + x2, + _mm256_add_ps( + const4, + _mm256_mul_ps(x2, + _mm256_add_ps(const5, _mm256_mul_ps(x2, const6)))))); + + cVal = _mm256_div_ps(a, b); + + _mm256_store_ps(cPtr, cVal); // Store the results back into the C container + + aPtr += 8; + cPtr += 8; + } + + number = eighthPoints * 8; + for (; number < num_points; number++) { + if (*aPtr > 4.97) + *cPtr++ = 1; + else if (*aPtr <= -4.97) + *cPtr++ = -1; + else { + float x2 = (*aPtr) * (*aPtr); + float a = (*aPtr) * (135135.0f + x2 * (17325.0f + x2 * (378.0f + x2))); + float b = 135135.0f + x2 * (62370.0f + x2 * (3150.0f + x2 * 28.0f)); + *cPtr++ = a / b; + aPtr++; + } } - } } #endif /* LV_HAVE_AVX */ @@ -229,52 +248,55 @@ volk_32f_tanh_32f_a_avx(float* cVector, const float* aVector, #include static inline void -volk_32f_tanh_32f_a_avx_fma(float* cVector, const float* aVector, - unsigned int num_points) +volk_32f_tanh_32f_a_avx_fma(float* cVector, const float* aVector, unsigned int num_points) { - unsigned int number = 0; - const unsigned int eighthPoints = num_points / 8; - - float* cPtr = cVector; - const float* aPtr = aVector; - - __m256 aVal, cVal, x2, a, b; - __m256 
const1, const2, const3, const4, const5, const6; - const1 = _mm256_set1_ps(135135.0f); - const2 = _mm256_set1_ps(17325.0f); - const3 = _mm256_set1_ps(378.0f); - const4 = _mm256_set1_ps(62370.0f); - const5 = _mm256_set1_ps(3150.0f); - const6 = _mm256_set1_ps(28.0f); - for(;number < eighthPoints; number++){ - - aVal = _mm256_load_ps(aPtr); - x2 = _mm256_mul_ps(aVal, aVal); - a = _mm256_mul_ps(aVal, _mm256_fmadd_ps(x2, _mm256_fmadd_ps(x2, _mm256_add_ps(const3, x2), const2),const1)); - b = _mm256_fmadd_ps(x2, _mm256_fmadd_ps(x2, _mm256_fmadd_ps(x2, const6, const5), const4), const1); - - cVal = _mm256_div_ps(a, b); - - _mm256_store_ps(cPtr, cVal); // Store the results back into the C container - - aPtr += 8; - cPtr += 8; - } - - number = eighthPoints * 8; - for(;number < num_points; number++) { - if(*aPtr > 4.97) - *cPtr++ = 1; - else if(*aPtr <= -4.97) - *cPtr++ = -1; - else { - float x2 = (*aPtr) * (*aPtr); - float a = (*aPtr) * (135135.0f + x2 * (17325.0f + x2 * (378.0f + x2))); - float b = 135135.0f + x2 * (62370.0f + x2 * (3150.0f + x2 * 28.0f)); - *cPtr++ = a / b; - aPtr++; + unsigned int number = 0; + const unsigned int eighthPoints = num_points / 8; + + float* cPtr = cVector; + const float* aPtr = aVector; + + __m256 aVal, cVal, x2, a, b; + __m256 const1, const2, const3, const4, const5, const6; + const1 = _mm256_set1_ps(135135.0f); + const2 = _mm256_set1_ps(17325.0f); + const3 = _mm256_set1_ps(378.0f); + const4 = _mm256_set1_ps(62370.0f); + const5 = _mm256_set1_ps(3150.0f); + const6 = _mm256_set1_ps(28.0f); + for (; number < eighthPoints; number++) { + + aVal = _mm256_load_ps(aPtr); + x2 = _mm256_mul_ps(aVal, aVal); + a = _mm256_mul_ps( + aVal, + _mm256_fmadd_ps( + x2, _mm256_fmadd_ps(x2, _mm256_add_ps(const3, x2), const2), const1)); + b = _mm256_fmadd_ps( + x2, _mm256_fmadd_ps(x2, _mm256_fmadd_ps(x2, const6, const5), const4), const1); + + cVal = _mm256_div_ps(a, b); + + _mm256_store_ps(cPtr, cVal); // Store the results back into the C container + + aPtr += 8; + cPtr += 8; + } + + number = eighthPoints * 8; + for (; number < num_points; number++) { + if (*aPtr > 4.97) + *cPtr++ = 1; + else if (*aPtr <= -4.97) + *cPtr++ = -1; + else { + float x2 = (*aPtr) * (*aPtr); + float a = (*aPtr) * (135135.0f + x2 * (17325.0f + x2 * (378.0f + x2))); + float b = 135135.0f + x2 * (62370.0f + x2 * (3150.0f + x2 * 28.0f)); + *cPtr++ = a / b; + aPtr++; + } } - } } #endif /* LV_HAVE_AVX && LV_HAVE_FMA */ @@ -285,8 +307,8 @@ volk_32f_tanh_32f_a_avx_fma(float* cVector, const float* aVector, #define INCLUDED_volk_32f_tanh_32f_u_H #include -#include #include +#include #include @@ -294,52 +316,61 @@ volk_32f_tanh_32f_a_avx_fma(float* cVector, const float* aVector, #include static inline void -volk_32f_tanh_32f_u_sse(float* cVector, const float* aVector, - unsigned int num_points) +volk_32f_tanh_32f_u_sse(float* cVector, const float* aVector, unsigned int num_points) { - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - float* cPtr = cVector; - const float* aPtr = aVector; - - __m128 aVal, cVal, x2, a, b; - __m128 const1, const2, const3, const4, const5, const6; - const1 = _mm_set_ps1(135135.0f); - const2 = _mm_set_ps1(17325.0f); - const3 = _mm_set_ps1(378.0f); - const4 = _mm_set_ps1(62370.0f); - const5 = _mm_set_ps1(3150.0f); - const6 = _mm_set_ps1(28.0f); - for(;number < quarterPoints; number++){ - - aVal = _mm_loadu_ps(aPtr); - x2 = _mm_mul_ps(aVal, aVal); - a = _mm_mul_ps(aVal, _mm_add_ps(const1, _mm_mul_ps(x2, _mm_add_ps(const2, _mm_mul_ps(x2, _mm_add_ps(const3, x2)))))); - b 
= _mm_add_ps(const1, _mm_mul_ps(x2, _mm_add_ps(const4, _mm_mul_ps(x2, _mm_add_ps(const5, _mm_mul_ps(x2, const6)))))); - - cVal = _mm_div_ps(a, b); - - _mm_storeu_ps(cPtr, cVal); // Store the results back into the C container - - aPtr += 4; - cPtr += 4; - } - - number = quarterPoints * 4; - for(;number < num_points; number++) { - if(*aPtr > 4.97) - *cPtr++ = 1; - else if(*aPtr <= -4.97) - *cPtr++ = -1; - else { - float x2 = (*aPtr) * (*aPtr); - float a = (*aPtr) * (135135.0f + x2 * (17325.0f + x2 * (378.0f + x2))); - float b = 135135.0f + x2 * (62370.0f + x2 * (3150.0f + x2 * 28.0f)); - *cPtr++ = a / b; - aPtr++; + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + float* cPtr = cVector; + const float* aPtr = aVector; + + __m128 aVal, cVal, x2, a, b; + __m128 const1, const2, const3, const4, const5, const6; + const1 = _mm_set_ps1(135135.0f); + const2 = _mm_set_ps1(17325.0f); + const3 = _mm_set_ps1(378.0f); + const4 = _mm_set_ps1(62370.0f); + const5 = _mm_set_ps1(3150.0f); + const6 = _mm_set_ps1(28.0f); + for (; number < quarterPoints; number++) { + + aVal = _mm_loadu_ps(aPtr); + x2 = _mm_mul_ps(aVal, aVal); + a = _mm_mul_ps( + aVal, + _mm_add_ps( + const1, + _mm_mul_ps(x2, + _mm_add_ps(const2, _mm_mul_ps(x2, _mm_add_ps(const3, x2)))))); + b = _mm_add_ps( + const1, + _mm_mul_ps( + x2, + _mm_add_ps(const4, + _mm_mul_ps(x2, _mm_add_ps(const5, _mm_mul_ps(x2, const6)))))); + + cVal = _mm_div_ps(a, b); + + _mm_storeu_ps(cPtr, cVal); // Store the results back into the C container + + aPtr += 4; + cPtr += 4; + } + + number = quarterPoints * 4; + for (; number < num_points; number++) { + if (*aPtr > 4.97) + *cPtr++ = 1; + else if (*aPtr <= -4.97) + *cPtr++ = -1; + else { + float x2 = (*aPtr) * (*aPtr); + float a = (*aPtr) * (135135.0f + x2 * (17325.0f + x2 * (378.0f + x2))); + float b = 135135.0f + x2 * (62370.0f + x2 * (3150.0f + x2 * 28.0f)); + *cPtr++ = a / b; + aPtr++; + } } - } } #endif /* LV_HAVE_SSE */ @@ -348,52 +379,65 @@ volk_32f_tanh_32f_u_sse(float* cVector, const float* aVector, #include static inline void -volk_32f_tanh_32f_u_avx(float* cVector, const float* aVector, - unsigned int num_points) +volk_32f_tanh_32f_u_avx(float* cVector, const float* aVector, unsigned int num_points) { - unsigned int number = 0; - const unsigned int eighthPoints = num_points / 8; - - float* cPtr = cVector; - const float* aPtr = aVector; - - __m256 aVal, cVal, x2, a, b; - __m256 const1, const2, const3, const4, const5, const6; - const1 = _mm256_set1_ps(135135.0f); - const2 = _mm256_set1_ps(17325.0f); - const3 = _mm256_set1_ps(378.0f); - const4 = _mm256_set1_ps(62370.0f); - const5 = _mm256_set1_ps(3150.0f); - const6 = _mm256_set1_ps(28.0f); - for(;number < eighthPoints; number++){ - - aVal = _mm256_loadu_ps(aPtr); - x2 = _mm256_mul_ps(aVal, aVal); - a = _mm256_mul_ps(aVal, _mm256_add_ps(const1, _mm256_mul_ps(x2, _mm256_add_ps(const2, _mm256_mul_ps(x2, _mm256_add_ps(const3, x2)))))); - b = _mm256_add_ps(const1, _mm256_mul_ps(x2, _mm256_add_ps(const4, _mm256_mul_ps(x2, _mm256_add_ps(const5, _mm256_mul_ps(x2, const6)))))); - - cVal = _mm256_div_ps(a, b); - - _mm256_storeu_ps(cPtr, cVal); // Store the results back into the C container - - aPtr += 8; - cPtr += 8; - } - - number = eighthPoints * 8; - for(;number < num_points; number++) { - if(*aPtr > 4.97) - *cPtr++ = 1; - else if(*aPtr <= -4.97) - *cPtr++ = -1; - else { - float x2 = (*aPtr) * (*aPtr); - float a = (*aPtr) * (135135.0f + x2 * (17325.0f + x2 * (378.0f + x2))); - float b = 135135.0f + x2 * (62370.0f + x2 * (3150.0f + x2 * 
28.0f)); - *cPtr++ = a / b; - aPtr++; + unsigned int number = 0; + const unsigned int eighthPoints = num_points / 8; + + float* cPtr = cVector; + const float* aPtr = aVector; + + __m256 aVal, cVal, x2, a, b; + __m256 const1, const2, const3, const4, const5, const6; + const1 = _mm256_set1_ps(135135.0f); + const2 = _mm256_set1_ps(17325.0f); + const3 = _mm256_set1_ps(378.0f); + const4 = _mm256_set1_ps(62370.0f); + const5 = _mm256_set1_ps(3150.0f); + const6 = _mm256_set1_ps(28.0f); + for (; number < eighthPoints; number++) { + + aVal = _mm256_loadu_ps(aPtr); + x2 = _mm256_mul_ps(aVal, aVal); + a = _mm256_mul_ps( + aVal, + _mm256_add_ps( + const1, + _mm256_mul_ps( + x2, + _mm256_add_ps(const2, + _mm256_mul_ps(x2, _mm256_add_ps(const3, x2)))))); + b = _mm256_add_ps( + const1, + _mm256_mul_ps( + x2, + _mm256_add_ps( + const4, + _mm256_mul_ps(x2, + _mm256_add_ps(const5, _mm256_mul_ps(x2, const6)))))); + + cVal = _mm256_div_ps(a, b); + + _mm256_storeu_ps(cPtr, cVal); // Store the results back into the C container + + aPtr += 8; + cPtr += 8; + } + + number = eighthPoints * 8; + for (; number < num_points; number++) { + if (*aPtr > 4.97) + *cPtr++ = 1; + else if (*aPtr <= -4.97) + *cPtr++ = -1; + else { + float x2 = (*aPtr) * (*aPtr); + float a = (*aPtr) * (135135.0f + x2 * (17325.0f + x2 * (378.0f + x2))); + float b = 135135.0f + x2 * (62370.0f + x2 * (3150.0f + x2 * 28.0f)); + *cPtr++ = a / b; + aPtr++; + } } - } } #endif /* LV_HAVE_AVX */ @@ -401,52 +445,55 @@ volk_32f_tanh_32f_u_avx(float* cVector, const float* aVector, #include static inline void -volk_32f_tanh_32f_u_avx_fma(float* cVector, const float* aVector, - unsigned int num_points) +volk_32f_tanh_32f_u_avx_fma(float* cVector, const float* aVector, unsigned int num_points) { - unsigned int number = 0; - const unsigned int eighthPoints = num_points / 8; - - float* cPtr = cVector; - const float* aPtr = aVector; - - __m256 aVal, cVal, x2, a, b; - __m256 const1, const2, const3, const4, const5, const6; - const1 = _mm256_set1_ps(135135.0f); - const2 = _mm256_set1_ps(17325.0f); - const3 = _mm256_set1_ps(378.0f); - const4 = _mm256_set1_ps(62370.0f); - const5 = _mm256_set1_ps(3150.0f); - const6 = _mm256_set1_ps(28.0f); - for(;number < eighthPoints; number++){ - - aVal = _mm256_loadu_ps(aPtr); - x2 = _mm256_mul_ps(aVal, aVal); - a = _mm256_mul_ps(aVal, _mm256_fmadd_ps(x2, _mm256_fmadd_ps(x2, _mm256_add_ps(const3, x2), const2),const1)); - b = _mm256_fmadd_ps(x2, _mm256_fmadd_ps(x2, _mm256_fmadd_ps(x2, const6, const5), const4), const1); - - cVal = _mm256_div_ps(a, b); - - _mm256_storeu_ps(cPtr, cVal); // Store the results back into the C container - - aPtr += 8; - cPtr += 8; - } - - number = eighthPoints * 8; - for(;number < num_points; number++) { - if(*aPtr > 4.97) - *cPtr++ = 1; - else if(*aPtr <= -4.97) - *cPtr++ = -1; - else { - float x2 = (*aPtr) * (*aPtr); - float a = (*aPtr) * (135135.0f + x2 * (17325.0f + x2 * (378.0f + x2))); - float b = 135135.0f + x2 * (62370.0f + x2 * (3150.0f + x2 * 28.0f)); - *cPtr++ = a / b; - aPtr++; + unsigned int number = 0; + const unsigned int eighthPoints = num_points / 8; + + float* cPtr = cVector; + const float* aPtr = aVector; + + __m256 aVal, cVal, x2, a, b; + __m256 const1, const2, const3, const4, const5, const6; + const1 = _mm256_set1_ps(135135.0f); + const2 = _mm256_set1_ps(17325.0f); + const3 = _mm256_set1_ps(378.0f); + const4 = _mm256_set1_ps(62370.0f); + const5 = _mm256_set1_ps(3150.0f); + const6 = _mm256_set1_ps(28.0f); + for (; number < eighthPoints; number++) { + + aVal = _mm256_loadu_ps(aPtr); + x2 = 
_mm256_mul_ps(aVal, aVal); + a = _mm256_mul_ps( + aVal, + _mm256_fmadd_ps( + x2, _mm256_fmadd_ps(x2, _mm256_add_ps(const3, x2), const2), const1)); + b = _mm256_fmadd_ps( + x2, _mm256_fmadd_ps(x2, _mm256_fmadd_ps(x2, const6, const5), const4), const1); + + cVal = _mm256_div_ps(a, b); + + _mm256_storeu_ps(cPtr, cVal); // Store the results back into the C container + + aPtr += 8; + cPtr += 8; + } + + number = eighthPoints * 8; + for (; number < num_points; number++) { + if (*aPtr > 4.97) + *cPtr++ = 1; + else if (*aPtr <= -4.97) + *cPtr++ = -1; + else { + float x2 = (*aPtr) * (*aPtr); + float a = (*aPtr) * (135135.0f + x2 * (17325.0f + x2 * (378.0f + x2))); + float b = 135135.0f + x2 * (62370.0f + x2 * (3150.0f + x2 * 28.0f)); + *cPtr++ = a / b; + aPtr++; + } } - } } #endif /* LV_HAVE_AVX && LV_HAVE_FMA */ diff --git a/kernels/volk/volk_32f_x2_add_32f.h b/kernels/volk/volk_32f_x2_add_32f.h index ce18092..e4b7e93 100644 --- a/kernels/volk/volk_32f_x2_add_32f.h +++ b/kernels/volk/volk_32f_x2_add_32f.h @@ -31,8 +31,8 @@ * * Dispatcher Prototype * \code - * void volk_32f_x2_add_32f(float* cVector, const float* aVector, const float* bVector, unsigned int num_points) - * \endcode + * void volk_32f_x2_add_32f(float* cVector, const float* aVector, const float* bVector, + * unsigned int num_points) \endcode * * \b Inputs * \li aVector: First vector of input points. @@ -44,7 +44,8 @@ * * \b Example * - * The follow example adds the increasing and decreasing vectors such that the result of every summation pair is 10 + * The follow example adds the increasing and decreasing vectors such that the result of + * every summation pair is 10 * * \code * int N = 10; @@ -79,37 +80,38 @@ #ifdef LV_HAVE_AVX512F #include -static inline void -volk_32f_x2_add_32f_u_avx512f(float* cVector, const float* aVector, - const float* bVector, unsigned int num_points) +static inline void volk_32f_x2_add_32f_u_avx512f(float* cVector, + const float* aVector, + const float* bVector, + unsigned int num_points) { - unsigned int number = 0; - const unsigned int sixteenthPoints = num_points / 16; + unsigned int number = 0; + const unsigned int sixteenthPoints = num_points / 16; - float* cPtr = cVector; - const float* aPtr = aVector; - const float* bPtr= bVector; + float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr = bVector; - __m512 aVal, bVal, cVal; - for(;number < sixteenthPoints; number++){ + __m512 aVal, bVal, cVal; + for (; number < sixteenthPoints; number++) { - aVal = _mm512_loadu_ps(aPtr); - bVal = _mm512_loadu_ps(bPtr); + aVal = _mm512_loadu_ps(aPtr); + bVal = _mm512_loadu_ps(bPtr); - cVal = _mm512_add_ps(aVal, bVal); + cVal = _mm512_add_ps(aVal, bVal); - _mm512_storeu_ps(cPtr,cVal); // Store the results back into the C container + _mm512_storeu_ps(cPtr, cVal); // Store the results back into the C container - aPtr += 16; - bPtr += 16; - cPtr += 16; - } + aPtr += 16; + bPtr += 16; + cPtr += 16; + } - number = sixteenthPoints * 16; + number = sixteenthPoints * 16; - for(;number < num_points; number++){ - *cPtr++ = (*aPtr++) + (*bPtr++); - } + for (; number < num_points; number++) { + *cPtr++ = (*aPtr++) + (*bPtr++); + } } #endif /* LV_HAVE_AVX512F */ @@ -118,35 +120,36 @@ volk_32f_x2_add_32f_u_avx512f(float* cVector, const float* aVector, #ifdef LV_HAVE_AVX #include -static inline void -volk_32f_x2_add_32f_u_avx(float* cVector, const float* aVector, - const float* bVector, unsigned int num_points) +static inline void volk_32f_x2_add_32f_u_avx(float* cVector, + const float* aVector, + const float* 
bVector, + unsigned int num_points) { - unsigned int number = 0; - const unsigned int eighthPoints = num_points / 8; - float* cPtr = cVector; - const float* aPtr = aVector; - const float* bPtr= bVector; - __m256 aVal, bVal, cVal; - for(;number < eighthPoints; number++){ + unsigned int number = 0; + const unsigned int eighthPoints = num_points / 8; + float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr = bVector; + __m256 aVal, bVal, cVal; + for (; number < eighthPoints; number++) { - aVal = _mm256_loadu_ps(aPtr); - bVal = _mm256_loadu_ps(bPtr); + aVal = _mm256_loadu_ps(aPtr); + bVal = _mm256_loadu_ps(bPtr); - cVal = _mm256_add_ps(aVal, bVal); + cVal = _mm256_add_ps(aVal, bVal); - _mm256_storeu_ps(cPtr,cVal); // Store the results back into the C container + _mm256_storeu_ps(cPtr, cVal); // Store the results back into the C container - aPtr += 8; - bPtr += 8; - cPtr += 8; - } + aPtr += 8; + bPtr += 8; + cPtr += 8; + } - number = eighthPoints * 8; + number = eighthPoints * 8; - for(;number < num_points; number++){ - *cPtr++ = (*aPtr++) + (*bPtr++); - } + for (; number < num_points; number++) { + *cPtr++ = (*aPtr++) + (*bPtr++); + } } #endif /* LV_HAVE_AVX */ @@ -154,54 +157,56 @@ volk_32f_x2_add_32f_u_avx(float* cVector, const float* aVector, #ifdef LV_HAVE_SSE #include -static inline void -volk_32f_x2_add_32f_u_sse(float* cVector, const float* aVector, - const float* bVector, unsigned int num_points) +static inline void volk_32f_x2_add_32f_u_sse(float* cVector, + const float* aVector, + const float* bVector, + unsigned int num_points) { - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; - float* cPtr = cVector; - const float* aPtr = aVector; - const float* bPtr= bVector; + float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr = bVector; - __m128 aVal, bVal, cVal; - for(;number < quarterPoints; number++){ + __m128 aVal, bVal, cVal; + for (; number < quarterPoints; number++) { - aVal = _mm_loadu_ps(aPtr); - bVal = _mm_loadu_ps(bPtr); + aVal = _mm_loadu_ps(aPtr); + bVal = _mm_loadu_ps(bPtr); - cVal = _mm_add_ps(aVal, bVal); + cVal = _mm_add_ps(aVal, bVal); - _mm_storeu_ps(cPtr,cVal); // Store the results back into the C container + _mm_storeu_ps(cPtr, cVal); // Store the results back into the C container - aPtr += 4; - bPtr += 4; - cPtr += 4; - } + aPtr += 4; + bPtr += 4; + cPtr += 4; + } - number = quarterPoints * 4; - for(;number < num_points; number++){ - *cPtr++ = (*aPtr++) + (*bPtr++); - } + number = quarterPoints * 4; + for (; number < num_points; number++) { + *cPtr++ = (*aPtr++) + (*bPtr++); + } } #endif /* LV_HAVE_SSE */ #ifdef LV_HAVE_GENERIC -static inline void -volk_32f_x2_add_32f_generic(float* cVector, const float* aVector, - const float* bVector, unsigned int num_points) +static inline void volk_32f_x2_add_32f_generic(float* cVector, + const float* aVector, + const float* bVector, + unsigned int num_points) { - float* cPtr = cVector; - const float* aPtr = aVector; - const float* bPtr= bVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - *cPtr++ = (*aPtr++) + (*bPtr++); - } + float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr = bVector; + unsigned int number = 0; + + for (number = 0; number < num_points; number++) { + *cPtr++ = (*aPtr++) + (*bPtr++); + } } #endif /* LV_HAVE_GENERIC */ @@ -216,37 +221,38 @@ volk_32f_x2_add_32f_generic(float* cVector, const float* aVector, 
#ifdef LV_HAVE_AVX512F #include -static inline void -volk_32f_x2_add_32f_a_avx512f(float* cVector, const float* aVector, - const float* bVector, unsigned int num_points) +static inline void volk_32f_x2_add_32f_a_avx512f(float* cVector, + const float* aVector, + const float* bVector, + unsigned int num_points) { - unsigned int number = 0; - const unsigned int sixteenthPoints = num_points / 16; + unsigned int number = 0; + const unsigned int sixteenthPoints = num_points / 16; - float* cPtr = cVector; - const float* aPtr = aVector; - const float* bPtr= bVector; + float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr = bVector; - __m512 aVal, bVal, cVal; - for(;number < sixteenthPoints; number++){ + __m512 aVal, bVal, cVal; + for (; number < sixteenthPoints; number++) { - aVal = _mm512_load_ps(aPtr); - bVal = _mm512_load_ps(bPtr); + aVal = _mm512_load_ps(aPtr); + bVal = _mm512_load_ps(bPtr); - cVal = _mm512_add_ps(aVal, bVal); + cVal = _mm512_add_ps(aVal, bVal); - _mm512_store_ps(cPtr,cVal); // Store the results back into the C container + _mm512_store_ps(cPtr, cVal); // Store the results back into the C container - aPtr += 16; - bPtr += 16; - cPtr += 16; - } + aPtr += 16; + bPtr += 16; + cPtr += 16; + } - number = sixteenthPoints * 16; + number = sixteenthPoints * 16; - for(;number < num_points; number++){ - *cPtr++ = (*aPtr++) + (*bPtr++); - } + for (; number < num_points; number++) { + *cPtr++ = (*aPtr++) + (*bPtr++); + } } #endif /* LV_HAVE_AVX512F */ @@ -255,70 +261,73 @@ volk_32f_x2_add_32f_a_avx512f(float* cVector, const float* aVector, #ifdef LV_HAVE_AVX #include -static inline void -volk_32f_x2_add_32f_a_avx(float* cVector, const float* aVector, - const float* bVector, unsigned int num_points) +static inline void volk_32f_x2_add_32f_a_avx(float* cVector, + const float* aVector, + const float* bVector, + unsigned int num_points) { - unsigned int number = 0; - const unsigned int eighthPoints = num_points / 8; + unsigned int number = 0; + const unsigned int eighthPoints = num_points / 8; - float* cPtr = cVector; - const float* aPtr = aVector; - const float* bPtr= bVector; + float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr = bVector; - __m256 aVal, bVal, cVal; - for(;number < eighthPoints; number++){ + __m256 aVal, bVal, cVal; + for (; number < eighthPoints; number++) { - aVal = _mm256_load_ps(aPtr); - bVal = _mm256_load_ps(bPtr); + aVal = _mm256_load_ps(aPtr); + bVal = _mm256_load_ps(bPtr); - cVal = _mm256_add_ps(aVal, bVal); + cVal = _mm256_add_ps(aVal, bVal); - _mm256_store_ps(cPtr,cVal); // Store the results back into the C container + _mm256_store_ps(cPtr, cVal); // Store the results back into the C container - aPtr += 8; - bPtr += 8; - cPtr += 8; - } + aPtr += 8; + bPtr += 8; + cPtr += 8; + } - number = eighthPoints * 8; - for(;number < num_points; number++){ - *cPtr++ = (*aPtr++) + (*bPtr++); - } + number = eighthPoints * 8; + for (; number < num_points; number++) { + *cPtr++ = (*aPtr++) + (*bPtr++); + } } #endif /* LV_HAVE_AVX */ #ifdef LV_HAVE_SSE #include -static inline void -volk_32f_x2_add_32f_a_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points) +static inline void volk_32f_x2_add_32f_a_sse(float* cVector, + const float* aVector, + const float* bVector, + unsigned int num_points) { - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; - float* cPtr = cVector; - const float* aPtr = aVector; - 
const float* bPtr= bVector; + float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr = bVector; - __m128 aVal, bVal, cVal; - for(;number < quarterPoints; number++){ - aVal = _mm_load_ps(aPtr); - bVal = _mm_load_ps(bPtr); + __m128 aVal, bVal, cVal; + for (; number < quarterPoints; number++) { + aVal = _mm_load_ps(aPtr); + bVal = _mm_load_ps(bPtr); - cVal = _mm_add_ps(aVal, bVal); + cVal = _mm_add_ps(aVal, bVal); - _mm_store_ps(cPtr,cVal); // Store the results back into the C container + _mm_store_ps(cPtr, cVal); // Store the results back into the C container - aPtr += 4; - bPtr += 4; - cPtr += 4; - } + aPtr += 4; + bPtr += 4; + cPtr += 4; + } - number = quarterPoints * 4; - for(;number < num_points; number++){ - *cPtr++ = (*aPtr++) + (*bPtr++); - } + number = quarterPoints * 4; + for (; number < num_points; number++) { + *cPtr++ = (*aPtr++) + (*bPtr++); + } } #endif /* LV_HAVE_SSE */ @@ -326,78 +335,89 @@ volk_32f_x2_add_32f_a_sse(float* cVector, const float* aVector, const float* bVe #ifdef LV_HAVE_NEON #include -static inline void -volk_32f_x2_add_32f_u_neon(float* cVector, const float* aVector, - const float* bVector, unsigned int num_points) +static inline void volk_32f_x2_add_32f_u_neon(float* cVector, + const float* aVector, + const float* bVector, + unsigned int num_points) { - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - float* cPtr = cVector; - const float* aPtr = aVector; - const float* bPtr= bVector; - float32x4_t aVal, bVal, cVal; - for(number=0; number < quarterPoints; number++){ - // Load in to NEON registers - aVal = vld1q_f32(aPtr); - bVal = vld1q_f32(bPtr); - __VOLK_PREFETCH(aPtr+4); - __VOLK_PREFETCH(bPtr+4); - - // vector add - cVal = vaddq_f32(aVal, bVal); - // Store the results back into the C container - vst1q_f32(cPtr,cVal); - - aPtr += 4; // q uses quadwords, 4 floats per vadd - bPtr += 4; - cPtr += 4; - } - - number = quarterPoints * 4; // should be = num_points - for(;number < num_points; number++){ - *cPtr++ = (*aPtr++) + (*bPtr++); - } + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr = bVector; + float32x4_t aVal, bVal, cVal; + for (number = 0; number < quarterPoints; number++) { + // Load in to NEON registers + aVal = vld1q_f32(aPtr); + bVal = vld1q_f32(bPtr); + __VOLK_PREFETCH(aPtr + 4); + __VOLK_PREFETCH(bPtr + 4); + + // vector add + cVal = vaddq_f32(aVal, bVal); + // Store the results back into the C container + vst1q_f32(cPtr, cVal); + + aPtr += 4; // q uses quadwords, 4 floats per vadd + bPtr += 4; + cPtr += 4; + } + + number = quarterPoints * 4; // should be = num_points + for (; number < num_points; number++) { + *cPtr++ = (*aPtr++) + (*bPtr++); + } } #endif /* LV_HAVE_NEON */ #ifdef LV_HAVE_NEONV7 -extern void volk_32f_x2_add_32f_a_neonasm(float* cVector, const float* aVector, const float* bVector, unsigned int num_points); +extern void volk_32f_x2_add_32f_a_neonasm(float* cVector, + const float* aVector, + const float* bVector, + unsigned int num_points); #endif /* LV_HAVE_NEONV7 */ #ifdef LV_HAVE_NEONV7 -extern void volk_32f_x2_add_32f_a_neonpipeline(float* cVector, const float* aVector, const float* bVector, unsigned int num_points); +extern void volk_32f_x2_add_32f_a_neonpipeline(float* cVector, + const float* aVector, + const float* bVector, + unsigned int num_points); #endif /* LV_HAVE_NEONV7 */ #ifdef LV_HAVE_GENERIC -static inline void -volk_32f_x2_add_32f_a_generic(float* cVector, 
const float* aVector, - const float* bVector, unsigned int num_points) +static inline void volk_32f_x2_add_32f_a_generic(float* cVector, + const float* aVector, + const float* bVector, + unsigned int num_points) { - float* cPtr = cVector; - const float* aPtr = aVector; - const float* bPtr= bVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - *cPtr++ = (*aPtr++) + (*bPtr++); - } + float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr = bVector; + unsigned int number = 0; + + for (number = 0; number < num_points; number++) { + *cPtr++ = (*aPtr++) + (*bPtr++); + } } #endif /* LV_HAVE_GENERIC */ #ifdef LV_HAVE_ORC -extern void -volk_32f_x2_add_32f_a_orc_impl(float* cVector, const float* aVector, - const float* bVector, unsigned int num_points); +extern void volk_32f_x2_add_32f_a_orc_impl(float* cVector, + const float* aVector, + const float* bVector, + unsigned int num_points); -static inline void -volk_32f_x2_add_32f_u_orc(float* cVector, const float* aVector, - const float* bVector, unsigned int num_points){ - volk_32f_x2_add_32f_a_orc_impl(cVector, aVector, bVector, num_points); +static inline void volk_32f_x2_add_32f_u_orc(float* cVector, + const float* aVector, + const float* bVector, + unsigned int num_points) +{ + volk_32f_x2_add_32f_a_orc_impl(cVector, aVector, bVector, num_points); } #endif /* LV_HAVE_ORC */ diff --git a/kernels/volk/volk_32f_x2_divide_32f.h b/kernels/volk/volk_32f_x2_divide_32f.h index 130767f..8b80365 100644 --- a/kernels/volk/volk_32f_x2_divide_32f.h +++ b/kernels/volk/volk_32f_x2_divide_32f.h @@ -31,8 +31,8 @@ * * Dispatcher Prototype * \code - * void volk_32f_x2_divide_32f(float* cVector, const float* aVector, const float* bVector, unsigned int num_points) - * \endcode + * void volk_32f_x2_divide_32f(float* cVector, const float* aVector, const float* bVector, + * unsigned int num_points) \endcode * * \b Inputs * \li aVector: First vector of input points. 
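For context, an illustrative (not authoritative) usage sketch of the volk_32f_x2_divide_32f dispatcher whose prototype is documented in the hunk above; it assumes only VOLK's public volk_get_alignment(), volk_malloc() and volk_free() helpers:

    #include <volk/volk.h>
    #include <stdio.h>

    int main(void)
    {
        unsigned int N = 8;
        size_t alignment = volk_get_alignment();
        float* a = (float*)volk_malloc(N * sizeof(float), alignment);
        float* b = (float*)volk_malloc(N * sizeof(float), alignment);
        float* c = (float*)volk_malloc(N * sizeof(float), alignment);
        for (unsigned int i = 0; i < N; ++i) {
            a[i] = (float)(i + 1); /* numerator values 1..N */
            b[i] = 2.0f;           /* constant denominator */
        }
        volk_32f_x2_divide_32f(c, a, b, N); /* c[i] = a[i] / b[i] */
        for (unsigned int i = 0; i < N; ++i)
            printf("%f / %f = %f\n", a[i], b[i], c[i]);
        volk_free(a);
        volk_free(b);
        volk_free(c);
        return 0;
    }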
@@ -77,35 +77,36 @@ #ifdef LV_HAVE_AVX512F #include -static inline void -volk_32f_x2_divide_32f_a_avx512f(float* cVector, const float* aVector, - const float* bVector, unsigned int num_points) +static inline void volk_32f_x2_divide_32f_a_avx512f(float* cVector, + const float* aVector, + const float* bVector, + unsigned int num_points) { - unsigned int number = 0; - const unsigned int sixteenthPoints = num_points / 16; + unsigned int number = 0; + const unsigned int sixteenthPoints = num_points / 16; - float* cPtr = cVector; - const float* aPtr = aVector; - const float* bPtr= bVector; + float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr = bVector; - __m512 aVal, bVal, cVal; - for(;number < sixteenthPoints; number++){ - aVal = _mm512_load_ps(aPtr); - bVal = _mm512_load_ps(bPtr); + __m512 aVal, bVal, cVal; + for (; number < sixteenthPoints; number++) { + aVal = _mm512_load_ps(aPtr); + bVal = _mm512_load_ps(bPtr); - cVal = _mm512_div_ps(aVal, bVal); + cVal = _mm512_div_ps(aVal, bVal); - _mm512_store_ps(cPtr,cVal); // Store the results back into the C container + _mm512_store_ps(cPtr, cVal); // Store the results back into the C container - aPtr += 16; - bPtr += 16; - cPtr += 16; - } + aPtr += 16; + bPtr += 16; + cPtr += 16; + } - number = sixteenthPoints * 16; - for(;number < num_points; number++){ - *cPtr++ = (*aPtr++) / (*bPtr++); - } + number = sixteenthPoints * 16; + for (; number < num_points; number++) { + *cPtr++ = (*aPtr++) / (*bPtr++); + } } #endif /* LV_HAVE_AVX512F */ @@ -113,35 +114,36 @@ volk_32f_x2_divide_32f_a_avx512f(float* cVector, const float* aVector, #ifdef LV_HAVE_AVX #include -static inline void -volk_32f_x2_divide_32f_a_avx(float* cVector, const float* aVector, - const float* bVector, unsigned int num_points) +static inline void volk_32f_x2_divide_32f_a_avx(float* cVector, + const float* aVector, + const float* bVector, + unsigned int num_points) { - unsigned int number = 0; - const unsigned int eighthPoints = num_points / 8; + unsigned int number = 0; + const unsigned int eighthPoints = num_points / 8; - float* cPtr = cVector; - const float* aPtr = aVector; - const float* bPtr= bVector; + float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr = bVector; - __m256 aVal, bVal, cVal; - for(;number < eighthPoints; number++){ - aVal = _mm256_load_ps(aPtr); - bVal = _mm256_load_ps(bPtr); + __m256 aVal, bVal, cVal; + for (; number < eighthPoints; number++) { + aVal = _mm256_load_ps(aPtr); + bVal = _mm256_load_ps(bPtr); - cVal = _mm256_div_ps(aVal, bVal); + cVal = _mm256_div_ps(aVal, bVal); - _mm256_store_ps(cPtr,cVal); // Store the results back into the C container + _mm256_store_ps(cPtr, cVal); // Store the results back into the C container - aPtr += 8; - bPtr += 8; - cPtr += 8; - } + aPtr += 8; + bPtr += 8; + cPtr += 8; + } - number = eighthPoints * 8; - for(;number < num_points; number++){ - *cPtr++ = (*aPtr++) / (*bPtr++); - } + number = eighthPoints * 8; + for (; number < num_points; number++) { + *cPtr++ = (*aPtr++) / (*bPtr++); + } } #endif /* LV_HAVE_AVX */ @@ -149,35 +151,36 @@ volk_32f_x2_divide_32f_a_avx(float* cVector, const float* aVector, #ifdef LV_HAVE_SSE #include -static inline void -volk_32f_x2_divide_32f_a_sse(float* cVector, const float* aVector, - const float* bVector, unsigned int num_points) +static inline void volk_32f_x2_divide_32f_a_sse(float* cVector, + const float* aVector, + const float* bVector, + unsigned int num_points) { - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; + 
unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; - float* cPtr = cVector; - const float* aPtr = aVector; - const float* bPtr= bVector; + float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr = bVector; - __m128 aVal, bVal, cVal; - for(;number < quarterPoints; number++){ - aVal = _mm_load_ps(aPtr); - bVal = _mm_load_ps(bPtr); + __m128 aVal, bVal, cVal; + for (; number < quarterPoints; number++) { + aVal = _mm_load_ps(aPtr); + bVal = _mm_load_ps(bPtr); - cVal = _mm_div_ps(aVal, bVal); + cVal = _mm_div_ps(aVal, bVal); - _mm_store_ps(cPtr,cVal); // Store the results back into the C container + _mm_store_ps(cPtr, cVal); // Store the results back into the C container - aPtr += 4; - bPtr += 4; - cPtr += 4; - } + aPtr += 4; + bPtr += 4; + cPtr += 4; + } - number = quarterPoints * 4; - for(;number < num_points; number++){ - *cPtr++ = (*aPtr++) / (*bPtr++); - } + number = quarterPoints * 4; + for (; number < num_points; number++) { + *cPtr++ = (*aPtr++) / (*bPtr++); + } } #endif /* LV_HAVE_SSE */ @@ -185,54 +188,55 @@ volk_32f_x2_divide_32f_a_sse(float* cVector, const float* aVector, #ifdef LV_HAVE_NEON #include -static inline void -volk_32f_x2_divide_32f_neon(float* cVector, const float* aVector, - const float* bVector, unsigned int num_points) +static inline void volk_32f_x2_divide_32f_neon(float* cVector, + const float* aVector, + const float* bVector, + unsigned int num_points) { - float* cPtr = cVector; - const float* aPtr = aVector; - const float* bPtr = bVector; - - float32x4x4_t aVal, bVal, bInv, cVal; - - const unsigned int eighthPoints = num_points / 16; - unsigned int number = 0; - for(; number < eighthPoints; number++){ - aVal = vld4q_f32(aPtr); - aPtr += 16; - bVal = vld4q_f32(bPtr); - bPtr += 16; - - __VOLK_PREFETCH(aPtr+16); - __VOLK_PREFETCH(bPtr+16); - - bInv.val[0] = vrecpeq_f32(bVal.val[0]); - bInv.val[0] = vmulq_f32(bInv.val[0], vrecpsq_f32(bInv.val[0], bVal.val[0])); - bInv.val[0] = vmulq_f32(bInv.val[0], vrecpsq_f32(bInv.val[0], bVal.val[0])); - cVal.val[0] = vmulq_f32(aVal.val[0], bInv.val[0]); - - bInv.val[1] = vrecpeq_f32(bVal.val[1]); - bInv.val[1] = vmulq_f32(bInv.val[1], vrecpsq_f32(bInv.val[1], bVal.val[1])); - bInv.val[1] = vmulq_f32(bInv.val[1], vrecpsq_f32(bInv.val[1], bVal.val[1])); - cVal.val[1] = vmulq_f32(aVal.val[1], bInv.val[1]); - - bInv.val[2] = vrecpeq_f32(bVal.val[2]); - bInv.val[2] = vmulq_f32(bInv.val[2], vrecpsq_f32(bInv.val[2], bVal.val[2])); - bInv.val[2] = vmulq_f32(bInv.val[2], vrecpsq_f32(bInv.val[2], bVal.val[2])); - cVal.val[2] = vmulq_f32(aVal.val[2], bInv.val[2]); - - bInv.val[3] = vrecpeq_f32(bVal.val[3]); - bInv.val[3] = vmulq_f32(bInv.val[3], vrecpsq_f32(bInv.val[3], bVal.val[3])); - bInv.val[3] = vmulq_f32(bInv.val[3], vrecpsq_f32(bInv.val[3], bVal.val[3])); - cVal.val[3] = vmulq_f32(aVal.val[3], bInv.val[3]); - - vst4q_f32(cPtr, cVal); - cPtr += 16; - } - - for(number = eighthPoints * 16; number < num_points; number++){ - *cPtr++ = (*aPtr++) / (*bPtr++); - } + float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr = bVector; + + float32x4x4_t aVal, bVal, bInv, cVal; + + const unsigned int eighthPoints = num_points / 16; + unsigned int number = 0; + for (; number < eighthPoints; number++) { + aVal = vld4q_f32(aPtr); + aPtr += 16; + bVal = vld4q_f32(bPtr); + bPtr += 16; + + __VOLK_PREFETCH(aPtr + 16); + __VOLK_PREFETCH(bPtr + 16); + + bInv.val[0] = vrecpeq_f32(bVal.val[0]); + bInv.val[0] = vmulq_f32(bInv.val[0], vrecpsq_f32(bInv.val[0], bVal.val[0])); + bInv.val[0] = 
vmulq_f32(bInv.val[0], vrecpsq_f32(bInv.val[0], bVal.val[0])); + cVal.val[0] = vmulq_f32(aVal.val[0], bInv.val[0]); + + bInv.val[1] = vrecpeq_f32(bVal.val[1]); + bInv.val[1] = vmulq_f32(bInv.val[1], vrecpsq_f32(bInv.val[1], bVal.val[1])); + bInv.val[1] = vmulq_f32(bInv.val[1], vrecpsq_f32(bInv.val[1], bVal.val[1])); + cVal.val[1] = vmulq_f32(aVal.val[1], bInv.val[1]); + + bInv.val[2] = vrecpeq_f32(bVal.val[2]); + bInv.val[2] = vmulq_f32(bInv.val[2], vrecpsq_f32(bInv.val[2], bVal.val[2])); + bInv.val[2] = vmulq_f32(bInv.val[2], vrecpsq_f32(bInv.val[2], bVal.val[2])); + cVal.val[2] = vmulq_f32(aVal.val[2], bInv.val[2]); + + bInv.val[3] = vrecpeq_f32(bVal.val[3]); + bInv.val[3] = vmulq_f32(bInv.val[3], vrecpsq_f32(bInv.val[3], bVal.val[3])); + bInv.val[3] = vmulq_f32(bInv.val[3], vrecpsq_f32(bInv.val[3], bVal.val[3])); + cVal.val[3] = vmulq_f32(aVal.val[3], bInv.val[3]); + + vst4q_f32(cPtr, cVal); + cPtr += 16; + } + + for (number = eighthPoints * 16; number < num_points; number++) { + *cPtr++ = (*aPtr++) / (*bPtr++); + } } #endif /* LV_HAVE_NEON */ @@ -240,38 +244,40 @@ volk_32f_x2_divide_32f_neon(float* cVector, const float* aVector, #ifdef LV_HAVE_GENERIC -static inline void -volk_32f_x2_divide_32f_generic(float* cVector, const float* aVector, - const float* bVector, unsigned int num_points) +static inline void volk_32f_x2_divide_32f_generic(float* cVector, + const float* aVector, + const float* bVector, + unsigned int num_points) { - float* cPtr = cVector; - const float* aPtr = aVector; - const float* bPtr= bVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - *cPtr++ = (*aPtr++) / (*bPtr++); - } + float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr = bVector; + unsigned int number = 0; + + for (number = 0; number < num_points; number++) { + *cPtr++ = (*aPtr++) / (*bPtr++); + } } #endif /* LV_HAVE_GENERIC */ #ifdef LV_HAVE_ORC -extern void -volk_32f_x2_divide_32f_a_orc_impl(float* cVector, const float* aVector, - const float* bVector, unsigned int num_points); +extern void volk_32f_x2_divide_32f_a_orc_impl(float* cVector, + const float* aVector, + const float* bVector, + unsigned int num_points); -static inline void -volk_32f_x2_divide_32f_u_orc(float* cVector, const float* aVector, - const float* bVector, unsigned int num_points) +static inline void volk_32f_x2_divide_32f_u_orc(float* cVector, + const float* aVector, + const float* bVector, + unsigned int num_points) { - volk_32f_x2_divide_32f_a_orc_impl(cVector, aVector, bVector, num_points); + volk_32f_x2_divide_32f_a_orc_impl(cVector, aVector, bVector, num_points); } #endif /* LV_HAVE_ORC */ - #endif /* INCLUDED_volk_32f_x2_divide_32f_a_H */ @@ -284,35 +290,36 @@ volk_32f_x2_divide_32f_u_orc(float* cVector, const float* aVector, #ifdef LV_HAVE_AVX512F #include -static inline void -volk_32f_x2_divide_32f_u_avx512f(float* cVector, const float* aVector, - const float* bVector, unsigned int num_points) +static inline void volk_32f_x2_divide_32f_u_avx512f(float* cVector, + const float* aVector, + const float* bVector, + unsigned int num_points) { - unsigned int number = 0; - const unsigned int sixteenthPoints = num_points / 16; + unsigned int number = 0; + const unsigned int sixteenthPoints = num_points / 16; - float* cPtr = cVector; - const float* aPtr = aVector; - const float* bPtr= bVector; + float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr = bVector; - __m512 aVal, bVal, cVal; - for(;number < sixteenthPoints; number++){ - aVal = _mm512_loadu_ps(aPtr); - 
bVal = _mm512_loadu_ps(bPtr); + __m512 aVal, bVal, cVal; + for (; number < sixteenthPoints; number++) { + aVal = _mm512_loadu_ps(aPtr); + bVal = _mm512_loadu_ps(bPtr); - cVal = _mm512_div_ps(aVal, bVal); + cVal = _mm512_div_ps(aVal, bVal); - _mm512_storeu_ps(cPtr,cVal); // Store the results back into the C container + _mm512_storeu_ps(cPtr, cVal); // Store the results back into the C container - aPtr += 16; - bPtr += 16; - cPtr += 16; - } + aPtr += 16; + bPtr += 16; + cPtr += 16; + } - number = sixteenthPoints * 16; - for(;number < num_points; number++){ - *cPtr++ = (*aPtr++) / (*bPtr++); - } + number = sixteenthPoints * 16; + for (; number < num_points; number++) { + *cPtr++ = (*aPtr++) / (*bPtr++); + } } #endif /* LV_HAVE_AVX512F */ @@ -320,35 +327,36 @@ volk_32f_x2_divide_32f_u_avx512f(float* cVector, const float* aVector, #ifdef LV_HAVE_AVX #include -static inline void -volk_32f_x2_divide_32f_u_avx(float* cVector, const float* aVector, - const float* bVector, unsigned int num_points) +static inline void volk_32f_x2_divide_32f_u_avx(float* cVector, + const float* aVector, + const float* bVector, + unsigned int num_points) { - unsigned int number = 0; - const unsigned int eighthPoints = num_points / 8; + unsigned int number = 0; + const unsigned int eighthPoints = num_points / 8; - float* cPtr = cVector; - const float* aPtr = aVector; - const float* bPtr= bVector; + float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr = bVector; - __m256 aVal, bVal, cVal; - for(;number < eighthPoints; number++){ - aVal = _mm256_loadu_ps(aPtr); - bVal = _mm256_loadu_ps(bPtr); + __m256 aVal, bVal, cVal; + for (; number < eighthPoints; number++) { + aVal = _mm256_loadu_ps(aPtr); + bVal = _mm256_loadu_ps(bPtr); - cVal = _mm256_div_ps(aVal, bVal); + cVal = _mm256_div_ps(aVal, bVal); - _mm256_storeu_ps(cPtr,cVal); // Store the results back into the C container + _mm256_storeu_ps(cPtr, cVal); // Store the results back into the C container - aPtr += 8; - bPtr += 8; - cPtr += 8; - } + aPtr += 8; + bPtr += 8; + cPtr += 8; + } - number = eighthPoints * 8; - for(;number < num_points; number++){ - *cPtr++ = (*aPtr++) / (*bPtr++); - } + number = eighthPoints * 8; + for (; number < num_points; number++) { + *cPtr++ = (*aPtr++) / (*bPtr++); + } } #endif /* LV_HAVE_AVX */ diff --git a/kernels/volk/volk_32f_x2_dot_prod_16i.h b/kernels/volk/volk_32f_x2_dot_prod_16i.h index c1b5a82..4da7db6 100644 --- a/kernels/volk/volk_32f_x2_dot_prod_16i.h +++ b/kernels/volk/volk_32f_x2_dot_prod_16i.h @@ -33,8 +33,8 @@ * * Dispatcher Prototype * \code - * void volk_32f_x2_dot_prod_16i(int16_t* result, const float* input, const float* taps, unsigned int num_points) - * \endcode + * void volk_32f_x2_dot_prod_16i(int16_t* result, const float* input, const float* taps, + * unsigned int num_points) \endcode * * \b Inputs * \li input: vector of floats. 
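For context, a hedged usage sketch of the volk_32f_x2_dot_prod_16i dispatcher whose prototype is documented in the hunk above; volk_get_alignment(), volk_malloc() and volk_free() are assumed from VOLK's public API, and the all-ones taps are an arbitrary example (so the result reduces to the truncated sum of the input):

    #include <volk/volk.h>
    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        unsigned int N = 16;
        size_t alignment = volk_get_alignment();
        float* input = (float*)volk_malloc(N * sizeof(float), alignment);
        float* taps = (float*)volk_malloc(N * sizeof(float), alignment);
        int16_t result = 0;
        for (unsigned int i = 0; i < N; ++i) {
            input[i] = 0.5f * (float)i; /* sample values */
            taps[i] = 1.0f;             /* all-ones taps */
        }
        volk_32f_x2_dot_prod_16i(&result, input, taps, N); /* float dot product truncated to int16_t */
        printf("dot product = %d\n", (int)result);
        volk_free(input);
        volk_free(taps);
        return 0;
    }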
@@ -58,25 +58,29 @@ #ifndef INCLUDED_volk_32f_x2_dot_prod_16i_H #define INCLUDED_volk_32f_x2_dot_prod_16i_H -#include #include +#include #ifdef LV_HAVE_GENERIC -static inline void volk_32f_x2_dot_prod_16i_generic(int16_t* result, const float* input, const float* taps, unsigned int num_points) { +static inline void volk_32f_x2_dot_prod_16i_generic(int16_t* result, + const float* input, + const float* taps, + unsigned int num_points) +{ - float dotProduct = 0; - const float* aPtr = input; - const float* bPtr= taps; - unsigned int number = 0; + float dotProduct = 0; + const float* aPtr = input; + const float* bPtr = taps; + unsigned int number = 0; - for(number = 0; number < num_points; number++){ - dotProduct += ((*aPtr++) * (*bPtr++)); - } + for (number = 0; number < num_points; number++) { + dotProduct += ((*aPtr++) * (*bPtr++)); + } - *result = (int16_t)dotProduct; + *result = (int16_t)dotProduct; } #endif /*LV_HAVE_GENERIC*/ @@ -84,68 +88,73 @@ static inline void volk_32f_x2_dot_prod_16i_generic(int16_t* result, const float #ifdef LV_HAVE_SSE -static inline void volk_32f_x2_dot_prod_16i_a_sse(int16_t* result, const float* input, const float* taps, unsigned int num_points) { - - unsigned int number = 0; - const unsigned int sixteenthPoints = num_points / 16; - - float dotProduct = 0; - const float* aPtr = input; - const float* bPtr = taps; - - __m128 a0Val, a1Val, a2Val, a3Val; - __m128 b0Val, b1Val, b2Val, b3Val; - __m128 c0Val, c1Val, c2Val, c3Val; - - __m128 dotProdVal0 = _mm_setzero_ps(); - __m128 dotProdVal1 = _mm_setzero_ps(); - __m128 dotProdVal2 = _mm_setzero_ps(); - __m128 dotProdVal3 = _mm_setzero_ps(); - - for(;number < sixteenthPoints; number++){ - - a0Val = _mm_load_ps(aPtr); - a1Val = _mm_load_ps(aPtr+4); - a2Val = _mm_load_ps(aPtr+8); - a3Val = _mm_load_ps(aPtr+12); - b0Val = _mm_load_ps(bPtr); - b1Val = _mm_load_ps(bPtr+4); - b2Val = _mm_load_ps(bPtr+8); - b3Val = _mm_load_ps(bPtr+12); - - c0Val = _mm_mul_ps(a0Val, b0Val); - c1Val = _mm_mul_ps(a1Val, b1Val); - c2Val = _mm_mul_ps(a2Val, b2Val); - c3Val = _mm_mul_ps(a3Val, b3Val); - - dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0); - dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1); - dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2); - dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3); - - aPtr += 16; - bPtr += 16; - } - - dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1); - dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2); - dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3); - - __VOLK_ATTR_ALIGNED(16) float dotProductVector[4]; - - _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector - - dotProduct = dotProductVector[0]; - dotProduct += dotProductVector[1]; - dotProduct += dotProductVector[2]; - dotProduct += dotProductVector[3]; - - number = sixteenthPoints*16; - for(;number < num_points; number++){ - dotProduct += ((*aPtr++) * (*bPtr++)); - } - - *result = (short)dotProduct; +static inline void volk_32f_x2_dot_prod_16i_a_sse(int16_t* result, + const float* input, + const float* taps, + unsigned int num_points) +{ + + unsigned int number = 0; + const unsigned int sixteenthPoints = num_points / 16; + + float dotProduct = 0; + const float* aPtr = input; + const float* bPtr = taps; + + __m128 a0Val, a1Val, a2Val, a3Val; + __m128 b0Val, b1Val, b2Val, b3Val; + __m128 c0Val, c1Val, c2Val, c3Val; + + __m128 dotProdVal0 = _mm_setzero_ps(); + __m128 dotProdVal1 = _mm_setzero_ps(); + __m128 dotProdVal2 = _mm_setzero_ps(); + __m128 dotProdVal3 = _mm_setzero_ps(); + + for (; number < sixteenthPoints; 
number++) { + + a0Val = _mm_load_ps(aPtr); + a1Val = _mm_load_ps(aPtr + 4); + a2Val = _mm_load_ps(aPtr + 8); + a3Val = _mm_load_ps(aPtr + 12); + b0Val = _mm_load_ps(bPtr); + b1Val = _mm_load_ps(bPtr + 4); + b2Val = _mm_load_ps(bPtr + 8); + b3Val = _mm_load_ps(bPtr + 12); + + c0Val = _mm_mul_ps(a0Val, b0Val); + c1Val = _mm_mul_ps(a1Val, b1Val); + c2Val = _mm_mul_ps(a2Val, b2Val); + c3Val = _mm_mul_ps(a3Val, b3Val); + + dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0); + dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1); + dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2); + dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3); + + aPtr += 16; + bPtr += 16; + } + + dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1); + dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2); + dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3); + + __VOLK_ATTR_ALIGNED(16) float dotProductVector[4]; + + _mm_store_ps(dotProductVector, + dotProdVal0); // Store the results back into the dot product vector + + dotProduct = dotProductVector[0]; + dotProduct += dotProductVector[1]; + dotProduct += dotProductVector[2]; + dotProduct += dotProductVector[3]; + + number = sixteenthPoints * 16; + for (; number < num_points; number++) { + dotProduct += ((*aPtr++) * (*bPtr++)); + } + + *result = (short)dotProduct; } #endif /*LV_HAVE_SSE*/ @@ -153,66 +162,71 @@ static inline void volk_32f_x2_dot_prod_16i_a_sse(int16_t* result, const float* #if LV_HAVE_AVX2 && LV_HAVE_FMA -static inline void volk_32f_x2_dot_prod_16i_a_avx2_fma(int16_t* result, const float* input, const float* taps, unsigned int num_points) { - - unsigned int number = 0; - const unsigned int thirtysecondPoints = num_points / 32; - - float dotProduct = 0; - const float* aPtr = input; - const float* bPtr = taps; - - __m256 a0Val, a1Val, a2Val, a3Val; - __m256 b0Val, b1Val, b2Val, b3Val; - - __m256 dotProdVal0 = _mm256_setzero_ps(); - __m256 dotProdVal1 = _mm256_setzero_ps(); - __m256 dotProdVal2 = _mm256_setzero_ps(); - __m256 dotProdVal3 = _mm256_setzero_ps(); - - for(;number < thirtysecondPoints; number++){ - - a0Val = _mm256_load_ps(aPtr); - a1Val = _mm256_load_ps(aPtr+8); - a2Val = _mm256_load_ps(aPtr+16); - a3Val = _mm256_load_ps(aPtr+24); - b0Val = _mm256_load_ps(bPtr); - b1Val = _mm256_load_ps(bPtr+8); - b2Val = _mm256_load_ps(bPtr+16); - b3Val = _mm256_load_ps(bPtr+24); - - dotProdVal0 = _mm256_fmadd_ps(a0Val, b0Val, dotProdVal0); - dotProdVal1 = _mm256_fmadd_ps(a1Val, b1Val, dotProdVal1); - dotProdVal2 = _mm256_fmadd_ps(a2Val, b2Val, dotProdVal2); - dotProdVal3 = _mm256_fmadd_ps(a3Val, b3Val, dotProdVal3); - - aPtr += 32; - bPtr += 32; - } - - dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1); - dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2); - dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3); - - __VOLK_ATTR_ALIGNED(32) float dotProductVector[8]; - - _mm256_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector - - dotProduct = dotProductVector[0]; - dotProduct += dotProductVector[1]; - dotProduct += dotProductVector[2]; - dotProduct += dotProductVector[3]; - dotProduct += dotProductVector[4]; - dotProduct += dotProductVector[5]; - dotProduct += dotProductVector[6]; - dotProduct += dotProductVector[7]; - - number = thirtysecondPoints*32; - for(;number < num_points; number++){ - dotProduct += ((*aPtr++) * (*bPtr++)); - } - - *result = (short)dotProduct; +static inline void volk_32f_x2_dot_prod_16i_a_avx2_fma(int16_t* result, + const float* input, + const float* taps, + unsigned int num_points) +{ + + unsigned int number = 0; 
+ const unsigned int thirtysecondPoints = num_points / 32; + + float dotProduct = 0; + const float* aPtr = input; + const float* bPtr = taps; + + __m256 a0Val, a1Val, a2Val, a3Val; + __m256 b0Val, b1Val, b2Val, b3Val; + + __m256 dotProdVal0 = _mm256_setzero_ps(); + __m256 dotProdVal1 = _mm256_setzero_ps(); + __m256 dotProdVal2 = _mm256_setzero_ps(); + __m256 dotProdVal3 = _mm256_setzero_ps(); + + for (; number < thirtysecondPoints; number++) { + + a0Val = _mm256_load_ps(aPtr); + a1Val = _mm256_load_ps(aPtr + 8); + a2Val = _mm256_load_ps(aPtr + 16); + a3Val = _mm256_load_ps(aPtr + 24); + b0Val = _mm256_load_ps(bPtr); + b1Val = _mm256_load_ps(bPtr + 8); + b2Val = _mm256_load_ps(bPtr + 16); + b3Val = _mm256_load_ps(bPtr + 24); + + dotProdVal0 = _mm256_fmadd_ps(a0Val, b0Val, dotProdVal0); + dotProdVal1 = _mm256_fmadd_ps(a1Val, b1Val, dotProdVal1); + dotProdVal2 = _mm256_fmadd_ps(a2Val, b2Val, dotProdVal2); + dotProdVal3 = _mm256_fmadd_ps(a3Val, b3Val, dotProdVal3); + + aPtr += 32; + bPtr += 32; + } + + dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1); + dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2); + dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3); + + __VOLK_ATTR_ALIGNED(32) float dotProductVector[8]; + + _mm256_store_ps(dotProductVector, + dotProdVal0); // Store the results back into the dot product vector + + dotProduct = dotProductVector[0]; + dotProduct += dotProductVector[1]; + dotProduct += dotProductVector[2]; + dotProduct += dotProductVector[3]; + dotProduct += dotProductVector[4]; + dotProduct += dotProductVector[5]; + dotProduct += dotProductVector[6]; + dotProduct += dotProductVector[7]; + + number = thirtysecondPoints * 32; + for (; number < num_points; number++) { + dotProduct += ((*aPtr++) * (*bPtr++)); + } + + *result = (short)dotProduct; } #endif /*LV_HAVE_AVX2 && LV_HAVE_FMA*/ @@ -220,146 +234,156 @@ static inline void volk_32f_x2_dot_prod_16i_a_avx2_fma(int16_t* result, const f #ifdef LV_HAVE_AVX -static inline void volk_32f_x2_dot_prod_16i_a_avx(int16_t* result, const float* input, const float* taps, unsigned int num_points) { - - unsigned int number = 0; - const unsigned int thirtysecondPoints = num_points / 32; - - float dotProduct = 0; - const float* aPtr = input; - const float* bPtr = taps; - - __m256 a0Val, a1Val, a2Val, a3Val; - __m256 b0Val, b1Val, b2Val, b3Val; - __m256 c0Val, c1Val, c2Val, c3Val; - - __m256 dotProdVal0 = _mm256_setzero_ps(); - __m256 dotProdVal1 = _mm256_setzero_ps(); - __m256 dotProdVal2 = _mm256_setzero_ps(); - __m256 dotProdVal3 = _mm256_setzero_ps(); - - for(;number < thirtysecondPoints; number++){ - - a0Val = _mm256_load_ps(aPtr); - a1Val = _mm256_load_ps(aPtr+8); - a2Val = _mm256_load_ps(aPtr+16); - a3Val = _mm256_load_ps(aPtr+24); - b0Val = _mm256_load_ps(bPtr); - b1Val = _mm256_load_ps(bPtr+8); - b2Val = _mm256_load_ps(bPtr+16); - b3Val = _mm256_load_ps(bPtr+24); - - c0Val = _mm256_mul_ps(a0Val, b0Val); - c1Val = _mm256_mul_ps(a1Val, b1Val); - c2Val = _mm256_mul_ps(a2Val, b2Val); - c3Val = _mm256_mul_ps(a3Val, b3Val); - - dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0); - dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1); - dotProdVal2 = _mm256_add_ps(c2Val, dotProdVal2); - dotProdVal3 = _mm256_add_ps(c3Val, dotProdVal3); - - aPtr += 32; - bPtr += 32; - } - - dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1); - dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2); - dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3); - - __VOLK_ATTR_ALIGNED(32) float dotProductVector[8]; - - 
_mm256_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector - - dotProduct = dotProductVector[0]; - dotProduct += dotProductVector[1]; - dotProduct += dotProductVector[2]; - dotProduct += dotProductVector[3]; - dotProduct += dotProductVector[4]; - dotProduct += dotProductVector[5]; - dotProduct += dotProductVector[6]; - dotProduct += dotProductVector[7]; - - number = thirtysecondPoints*32; - for(;number < num_points; number++){ - dotProduct += ((*aPtr++) * (*bPtr++)); - } - - *result = (short)dotProduct; +static inline void volk_32f_x2_dot_prod_16i_a_avx(int16_t* result, + const float* input, + const float* taps, + unsigned int num_points) +{ + + unsigned int number = 0; + const unsigned int thirtysecondPoints = num_points / 32; + + float dotProduct = 0; + const float* aPtr = input; + const float* bPtr = taps; + + __m256 a0Val, a1Val, a2Val, a3Val; + __m256 b0Val, b1Val, b2Val, b3Val; + __m256 c0Val, c1Val, c2Val, c3Val; + + __m256 dotProdVal0 = _mm256_setzero_ps(); + __m256 dotProdVal1 = _mm256_setzero_ps(); + __m256 dotProdVal2 = _mm256_setzero_ps(); + __m256 dotProdVal3 = _mm256_setzero_ps(); + + for (; number < thirtysecondPoints; number++) { + + a0Val = _mm256_load_ps(aPtr); + a1Val = _mm256_load_ps(aPtr + 8); + a2Val = _mm256_load_ps(aPtr + 16); + a3Val = _mm256_load_ps(aPtr + 24); + b0Val = _mm256_load_ps(bPtr); + b1Val = _mm256_load_ps(bPtr + 8); + b2Val = _mm256_load_ps(bPtr + 16); + b3Val = _mm256_load_ps(bPtr + 24); + + c0Val = _mm256_mul_ps(a0Val, b0Val); + c1Val = _mm256_mul_ps(a1Val, b1Val); + c2Val = _mm256_mul_ps(a2Val, b2Val); + c3Val = _mm256_mul_ps(a3Val, b3Val); + + dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0); + dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1); + dotProdVal2 = _mm256_add_ps(c2Val, dotProdVal2); + dotProdVal3 = _mm256_add_ps(c3Val, dotProdVal3); + + aPtr += 32; + bPtr += 32; + } + + dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1); + dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2); + dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3); + + __VOLK_ATTR_ALIGNED(32) float dotProductVector[8]; + + _mm256_store_ps(dotProductVector, + dotProdVal0); // Store the results back into the dot product vector + + dotProduct = dotProductVector[0]; + dotProduct += dotProductVector[1]; + dotProduct += dotProductVector[2]; + dotProduct += dotProductVector[3]; + dotProduct += dotProductVector[4]; + dotProduct += dotProductVector[5]; + dotProduct += dotProductVector[6]; + dotProduct += dotProductVector[7]; + + number = thirtysecondPoints * 32; + for (; number < num_points; number++) { + dotProduct += ((*aPtr++) * (*bPtr++)); + } + + *result = (short)dotProduct; } #endif /*LV_HAVE_AVX*/ #ifdef LV_HAVE_AVX512F -static inline void volk_32f_x2_dot_prod_16i_a_avx512f(int16_t* result, const float* input, const float* taps, unsigned int num_points) { - - unsigned int number = 0; - const unsigned int sixtyfourthPoints = num_points / 64; - - float dotProduct = 0; - const float* aPtr = input; - const float* bPtr = taps; - - __m512 a0Val, a1Val, a2Val, a3Val; - __m512 b0Val, b1Val, b2Val, b3Val; - - __m512 dotProdVal0 = _mm512_setzero_ps(); - __m512 dotProdVal1 = _mm512_setzero_ps(); - __m512 dotProdVal2 = _mm512_setzero_ps(); - __m512 dotProdVal3 = _mm512_setzero_ps(); - - for(;number < sixtyfourthPoints; number++){ - - a0Val = _mm512_load_ps(aPtr); - a1Val = _mm512_load_ps(aPtr+16); - a2Val = _mm512_load_ps(aPtr+32); - a3Val = _mm512_load_ps(aPtr+48); - b0Val = _mm512_load_ps(bPtr); - b1Val = _mm512_load_ps(bPtr+16); - b2Val = 
_mm512_load_ps(bPtr+32); - b3Val = _mm512_load_ps(bPtr+48); - - dotProdVal0 = _mm512_fmadd_ps(a0Val, b0Val, dotProdVal0); - dotProdVal1 = _mm512_fmadd_ps(a1Val, b1Val, dotProdVal1); - dotProdVal2 = _mm512_fmadd_ps(a2Val, b2Val, dotProdVal2); - dotProdVal3 = _mm512_fmadd_ps(a3Val, b3Val, dotProdVal3); - - aPtr += 64; - bPtr += 64; - } - - dotProdVal0 = _mm512_add_ps(dotProdVal0, dotProdVal1); - dotProdVal0 = _mm512_add_ps(dotProdVal0, dotProdVal2); - dotProdVal0 = _mm512_add_ps(dotProdVal0, dotProdVal3); - - __VOLK_ATTR_ALIGNED(64) float dotProductVector[16]; - - _mm512_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector - - dotProduct = dotProductVector[0]; - dotProduct += dotProductVector[1]; - dotProduct += dotProductVector[2]; - dotProduct += dotProductVector[3]; - dotProduct += dotProductVector[4]; - dotProduct += dotProductVector[5]; - dotProduct += dotProductVector[6]; - dotProduct += dotProductVector[7]; - dotProduct += dotProductVector[8]; - dotProduct += dotProductVector[9]; - dotProduct += dotProductVector[10]; - dotProduct += dotProductVector[11]; - dotProduct += dotProductVector[12]; - dotProduct += dotProductVector[13]; - dotProduct += dotProductVector[14]; - dotProduct += dotProductVector[15]; - - number = sixtyfourthPoints*64; - for(;number < num_points; number++){ - dotProduct += ((*aPtr++) * (*bPtr++)); - } - - *result = (short)dotProduct; +static inline void volk_32f_x2_dot_prod_16i_a_avx512f(int16_t* result, + const float* input, + const float* taps, + unsigned int num_points) +{ + + unsigned int number = 0; + const unsigned int sixtyfourthPoints = num_points / 64; + + float dotProduct = 0; + const float* aPtr = input; + const float* bPtr = taps; + + __m512 a0Val, a1Val, a2Val, a3Val; + __m512 b0Val, b1Val, b2Val, b3Val; + + __m512 dotProdVal0 = _mm512_setzero_ps(); + __m512 dotProdVal1 = _mm512_setzero_ps(); + __m512 dotProdVal2 = _mm512_setzero_ps(); + __m512 dotProdVal3 = _mm512_setzero_ps(); + + for (; number < sixtyfourthPoints; number++) { + + a0Val = _mm512_load_ps(aPtr); + a1Val = _mm512_load_ps(aPtr + 16); + a2Val = _mm512_load_ps(aPtr + 32); + a3Val = _mm512_load_ps(aPtr + 48); + b0Val = _mm512_load_ps(bPtr); + b1Val = _mm512_load_ps(bPtr + 16); + b2Val = _mm512_load_ps(bPtr + 32); + b3Val = _mm512_load_ps(bPtr + 48); + + dotProdVal0 = _mm512_fmadd_ps(a0Val, b0Val, dotProdVal0); + dotProdVal1 = _mm512_fmadd_ps(a1Val, b1Val, dotProdVal1); + dotProdVal2 = _mm512_fmadd_ps(a2Val, b2Val, dotProdVal2); + dotProdVal3 = _mm512_fmadd_ps(a3Val, b3Val, dotProdVal3); + + aPtr += 64; + bPtr += 64; + } + + dotProdVal0 = _mm512_add_ps(dotProdVal0, dotProdVal1); + dotProdVal0 = _mm512_add_ps(dotProdVal0, dotProdVal2); + dotProdVal0 = _mm512_add_ps(dotProdVal0, dotProdVal3); + + __VOLK_ATTR_ALIGNED(64) float dotProductVector[16]; + + _mm512_store_ps(dotProductVector, + dotProdVal0); // Store the results back into the dot product vector + + dotProduct = dotProductVector[0]; + dotProduct += dotProductVector[1]; + dotProduct += dotProductVector[2]; + dotProduct += dotProductVector[3]; + dotProduct += dotProductVector[4]; + dotProduct += dotProductVector[5]; + dotProduct += dotProductVector[6]; + dotProduct += dotProductVector[7]; + dotProduct += dotProductVector[8]; + dotProduct += dotProductVector[9]; + dotProduct += dotProductVector[10]; + dotProduct += dotProductVector[11]; + dotProduct += dotProductVector[12]; + dotProduct += dotProductVector[13]; + dotProduct += dotProductVector[14]; + dotProduct += dotProductVector[15]; + + number = 
sixtyfourthPoints * 64; + for (; number < num_points; number++) { + dotProduct += ((*aPtr++) * (*bPtr++)); + } + + *result = (short)dotProduct; } #endif /*LV_HAVE_AVX512F*/ @@ -367,68 +391,73 @@ static inline void volk_32f_x2_dot_prod_16i_a_avx512f(int16_t* result, const fl #ifdef LV_HAVE_SSE -static inline void volk_32f_x2_dot_prod_16i_u_sse(int16_t* result, const float* input, const float* taps, unsigned int num_points) { - - unsigned int number = 0; - const unsigned int sixteenthPoints = num_points / 16; - - float dotProduct = 0; - const float* aPtr = input; - const float* bPtr = taps; - - __m128 a0Val, a1Val, a2Val, a3Val; - __m128 b0Val, b1Val, b2Val, b3Val; - __m128 c0Val, c1Val, c2Val, c3Val; - - __m128 dotProdVal0 = _mm_setzero_ps(); - __m128 dotProdVal1 = _mm_setzero_ps(); - __m128 dotProdVal2 = _mm_setzero_ps(); - __m128 dotProdVal3 = _mm_setzero_ps(); - - for(;number < sixteenthPoints; number++){ - - a0Val = _mm_loadu_ps(aPtr); - a1Val = _mm_loadu_ps(aPtr+4); - a2Val = _mm_loadu_ps(aPtr+8); - a3Val = _mm_loadu_ps(aPtr+12); - b0Val = _mm_loadu_ps(bPtr); - b1Val = _mm_loadu_ps(bPtr+4); - b2Val = _mm_loadu_ps(bPtr+8); - b3Val = _mm_loadu_ps(bPtr+12); - - c0Val = _mm_mul_ps(a0Val, b0Val); - c1Val = _mm_mul_ps(a1Val, b1Val); - c2Val = _mm_mul_ps(a2Val, b2Val); - c3Val = _mm_mul_ps(a3Val, b3Val); - - dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0); - dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1); - dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2); - dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3); - - aPtr += 16; - bPtr += 16; - } - - dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1); - dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2); - dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3); - - __VOLK_ATTR_ALIGNED(16) float dotProductVector[4]; - - _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector - - dotProduct = dotProductVector[0]; - dotProduct += dotProductVector[1]; - dotProduct += dotProductVector[2]; - dotProduct += dotProductVector[3]; - - number = sixteenthPoints*16; - for(;number < num_points; number++){ - dotProduct += ((*aPtr++) * (*bPtr++)); - } - - *result = (short)dotProduct; +static inline void volk_32f_x2_dot_prod_16i_u_sse(int16_t* result, + const float* input, + const float* taps, + unsigned int num_points) +{ + + unsigned int number = 0; + const unsigned int sixteenthPoints = num_points / 16; + + float dotProduct = 0; + const float* aPtr = input; + const float* bPtr = taps; + + __m128 a0Val, a1Val, a2Val, a3Val; + __m128 b0Val, b1Val, b2Val, b3Val; + __m128 c0Val, c1Val, c2Val, c3Val; + + __m128 dotProdVal0 = _mm_setzero_ps(); + __m128 dotProdVal1 = _mm_setzero_ps(); + __m128 dotProdVal2 = _mm_setzero_ps(); + __m128 dotProdVal3 = _mm_setzero_ps(); + + for (; number < sixteenthPoints; number++) { + + a0Val = _mm_loadu_ps(aPtr); + a1Val = _mm_loadu_ps(aPtr + 4); + a2Val = _mm_loadu_ps(aPtr + 8); + a3Val = _mm_loadu_ps(aPtr + 12); + b0Val = _mm_loadu_ps(bPtr); + b1Val = _mm_loadu_ps(bPtr + 4); + b2Val = _mm_loadu_ps(bPtr + 8); + b3Val = _mm_loadu_ps(bPtr + 12); + + c0Val = _mm_mul_ps(a0Val, b0Val); + c1Val = _mm_mul_ps(a1Val, b1Val); + c2Val = _mm_mul_ps(a2Val, b2Val); + c3Val = _mm_mul_ps(a3Val, b3Val); + + dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0); + dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1); + dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2); + dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3); + + aPtr += 16; + bPtr += 16; + } + + dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1); + dotProdVal0 = _mm_add_ps(dotProdVal0, 
dotProdVal2); + dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3); + + __VOLK_ATTR_ALIGNED(16) float dotProductVector[4]; + + _mm_store_ps(dotProductVector, + dotProdVal0); // Store the results back into the dot product vector + + dotProduct = dotProductVector[0]; + dotProduct += dotProductVector[1]; + dotProduct += dotProductVector[2]; + dotProduct += dotProductVector[3]; + + number = sixteenthPoints * 16; + for (; number < num_points; number++) { + dotProduct += ((*aPtr++) * (*bPtr++)); + } + + *result = (short)dotProduct; } #endif /*LV_HAVE_SSE*/ @@ -436,66 +465,71 @@ static inline void volk_32f_x2_dot_prod_16i_u_sse(int16_t* result, const float* #if LV_HAVE_AVX2 && LV_HAVE_FMA -static inline void volk_32f_x2_dot_prod_16i_u_avx2_fma(int16_t* result, const float* input, const float* taps, unsigned int num_points) { - - unsigned int number = 0; - const unsigned int thirtysecondPoints = num_points / 32; - - float dotProduct = 0; - const float* aPtr = input; - const float* bPtr = taps; - - __m256 a0Val, a1Val, a2Val, a3Val; - __m256 b0Val, b1Val, b2Val, b3Val; - - __m256 dotProdVal0 = _mm256_setzero_ps(); - __m256 dotProdVal1 = _mm256_setzero_ps(); - __m256 dotProdVal2 = _mm256_setzero_ps(); - __m256 dotProdVal3 = _mm256_setzero_ps(); - - for(;number < thirtysecondPoints; number++){ - - a0Val = _mm256_loadu_ps(aPtr); - a1Val = _mm256_loadu_ps(aPtr+8); - a2Val = _mm256_loadu_ps(aPtr+16); - a3Val = _mm256_loadu_ps(aPtr+24); - b0Val = _mm256_loadu_ps(bPtr); - b1Val = _mm256_loadu_ps(bPtr+8); - b2Val = _mm256_loadu_ps(bPtr+16); - b3Val = _mm256_loadu_ps(bPtr+24); - - dotProdVal0 = _mm256_fmadd_ps(a0Val, b0Val, dotProdVal0); - dotProdVal1 = _mm256_fmadd_ps(a1Val, b1Val, dotProdVal1); - dotProdVal2 = _mm256_fmadd_ps(a2Val, b2Val, dotProdVal2); - dotProdVal3 = _mm256_fmadd_ps(a3Val, b3Val, dotProdVal3); - - aPtr += 32; - bPtr += 32; - } - - dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1); - dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2); - dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3); - - __VOLK_ATTR_ALIGNED(32) float dotProductVector[8]; - - _mm256_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector - - dotProduct = dotProductVector[0]; - dotProduct += dotProductVector[1]; - dotProduct += dotProductVector[2]; - dotProduct += dotProductVector[3]; - dotProduct += dotProductVector[4]; - dotProduct += dotProductVector[5]; - dotProduct += dotProductVector[6]; - dotProduct += dotProductVector[7]; - - number = thirtysecondPoints*32; - for(;number < num_points; number++){ - dotProduct += ((*aPtr++) * (*bPtr++)); - } - - *result = (short)dotProduct; +static inline void volk_32f_x2_dot_prod_16i_u_avx2_fma(int16_t* result, + const float* input, + const float* taps, + unsigned int num_points) +{ + + unsigned int number = 0; + const unsigned int thirtysecondPoints = num_points / 32; + + float dotProduct = 0; + const float* aPtr = input; + const float* bPtr = taps; + + __m256 a0Val, a1Val, a2Val, a3Val; + __m256 b0Val, b1Val, b2Val, b3Val; + + __m256 dotProdVal0 = _mm256_setzero_ps(); + __m256 dotProdVal1 = _mm256_setzero_ps(); + __m256 dotProdVal2 = _mm256_setzero_ps(); + __m256 dotProdVal3 = _mm256_setzero_ps(); + + for (; number < thirtysecondPoints; number++) { + + a0Val = _mm256_loadu_ps(aPtr); + a1Val = _mm256_loadu_ps(aPtr + 8); + a2Val = _mm256_loadu_ps(aPtr + 16); + a3Val = _mm256_loadu_ps(aPtr + 24); + b0Val = _mm256_loadu_ps(bPtr); + b1Val = _mm256_loadu_ps(bPtr + 8); + b2Val = _mm256_loadu_ps(bPtr + 16); + b3Val = _mm256_loadu_ps(bPtr 
+ 24); + + dotProdVal0 = _mm256_fmadd_ps(a0Val, b0Val, dotProdVal0); + dotProdVal1 = _mm256_fmadd_ps(a1Val, b1Val, dotProdVal1); + dotProdVal2 = _mm256_fmadd_ps(a2Val, b2Val, dotProdVal2); + dotProdVal3 = _mm256_fmadd_ps(a3Val, b3Val, dotProdVal3); + + aPtr += 32; + bPtr += 32; + } + + dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1); + dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2); + dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3); + + __VOLK_ATTR_ALIGNED(32) float dotProductVector[8]; + + _mm256_store_ps(dotProductVector, + dotProdVal0); // Store the results back into the dot product vector + + dotProduct = dotProductVector[0]; + dotProduct += dotProductVector[1]; + dotProduct += dotProductVector[2]; + dotProduct += dotProductVector[3]; + dotProduct += dotProductVector[4]; + dotProduct += dotProductVector[5]; + dotProduct += dotProductVector[6]; + dotProduct += dotProductVector[7]; + + number = thirtysecondPoints * 32; + for (; number < num_points; number++) { + dotProduct += ((*aPtr++) * (*bPtr++)); + } + + *result = (short)dotProduct; } #endif /*LV_HAVE_AVX2 && lV_HAVE_FMA*/ @@ -503,146 +537,156 @@ static inline void volk_32f_x2_dot_prod_16i_u_avx2_fma(int16_t* result, const f #ifdef LV_HAVE_AVX -static inline void volk_32f_x2_dot_prod_16i_u_avx(int16_t* result, const float* input, const float* taps, unsigned int num_points) { - - unsigned int number = 0; - const unsigned int thirtysecondPoints = num_points / 32; - - float dotProduct = 0; - const float* aPtr = input; - const float* bPtr = taps; - - __m256 a0Val, a1Val, a2Val, a3Val; - __m256 b0Val, b1Val, b2Val, b3Val; - __m256 c0Val, c1Val, c2Val, c3Val; - - __m256 dotProdVal0 = _mm256_setzero_ps(); - __m256 dotProdVal1 = _mm256_setzero_ps(); - __m256 dotProdVal2 = _mm256_setzero_ps(); - __m256 dotProdVal3 = _mm256_setzero_ps(); - - for(;number < thirtysecondPoints; number++){ - - a0Val = _mm256_loadu_ps(aPtr); - a1Val = _mm256_loadu_ps(aPtr+8); - a2Val = _mm256_loadu_ps(aPtr+16); - a3Val = _mm256_loadu_ps(aPtr+24); - b0Val = _mm256_loadu_ps(bPtr); - b1Val = _mm256_loadu_ps(bPtr+8); - b2Val = _mm256_loadu_ps(bPtr+16); - b3Val = _mm256_loadu_ps(bPtr+24); - - c0Val = _mm256_mul_ps(a0Val, b0Val); - c1Val = _mm256_mul_ps(a1Val, b1Val); - c2Val = _mm256_mul_ps(a2Val, b2Val); - c3Val = _mm256_mul_ps(a3Val, b3Val); - - dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0); - dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1); - dotProdVal2 = _mm256_add_ps(c2Val, dotProdVal2); - dotProdVal3 = _mm256_add_ps(c3Val, dotProdVal3); - - aPtr += 32; - bPtr += 32; - } - - dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1); - dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2); - dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3); - - __VOLK_ATTR_ALIGNED(32) float dotProductVector[8]; - - _mm256_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector - - dotProduct = dotProductVector[0]; - dotProduct += dotProductVector[1]; - dotProduct += dotProductVector[2]; - dotProduct += dotProductVector[3]; - dotProduct += dotProductVector[4]; - dotProduct += dotProductVector[5]; - dotProduct += dotProductVector[6]; - dotProduct += dotProductVector[7]; - - number = thirtysecondPoints*32; - for(;number < num_points; number++){ - dotProduct += ((*aPtr++) * (*bPtr++)); - } - - *result = (short)dotProduct; +static inline void volk_32f_x2_dot_prod_16i_u_avx(int16_t* result, + const float* input, + const float* taps, + unsigned int num_points) +{ + + unsigned int number = 0; + const unsigned int 
thirtysecondPoints = num_points / 32; + + float dotProduct = 0; + const float* aPtr = input; + const float* bPtr = taps; + + __m256 a0Val, a1Val, a2Val, a3Val; + __m256 b0Val, b1Val, b2Val, b3Val; + __m256 c0Val, c1Val, c2Val, c3Val; + + __m256 dotProdVal0 = _mm256_setzero_ps(); + __m256 dotProdVal1 = _mm256_setzero_ps(); + __m256 dotProdVal2 = _mm256_setzero_ps(); + __m256 dotProdVal3 = _mm256_setzero_ps(); + + for (; number < thirtysecondPoints; number++) { + + a0Val = _mm256_loadu_ps(aPtr); + a1Val = _mm256_loadu_ps(aPtr + 8); + a2Val = _mm256_loadu_ps(aPtr + 16); + a3Val = _mm256_loadu_ps(aPtr + 24); + b0Val = _mm256_loadu_ps(bPtr); + b1Val = _mm256_loadu_ps(bPtr + 8); + b2Val = _mm256_loadu_ps(bPtr + 16); + b3Val = _mm256_loadu_ps(bPtr + 24); + + c0Val = _mm256_mul_ps(a0Val, b0Val); + c1Val = _mm256_mul_ps(a1Val, b1Val); + c2Val = _mm256_mul_ps(a2Val, b2Val); + c3Val = _mm256_mul_ps(a3Val, b3Val); + + dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0); + dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1); + dotProdVal2 = _mm256_add_ps(c2Val, dotProdVal2); + dotProdVal3 = _mm256_add_ps(c3Val, dotProdVal3); + + aPtr += 32; + bPtr += 32; + } + + dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1); + dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2); + dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3); + + __VOLK_ATTR_ALIGNED(32) float dotProductVector[8]; + + _mm256_store_ps(dotProductVector, + dotProdVal0); // Store the results back into the dot product vector + + dotProduct = dotProductVector[0]; + dotProduct += dotProductVector[1]; + dotProduct += dotProductVector[2]; + dotProduct += dotProductVector[3]; + dotProduct += dotProductVector[4]; + dotProduct += dotProductVector[5]; + dotProduct += dotProductVector[6]; + dotProduct += dotProductVector[7]; + + number = thirtysecondPoints * 32; + for (; number < num_points; number++) { + dotProduct += ((*aPtr++) * (*bPtr++)); + } + + *result = (short)dotProduct; } #endif /*LV_HAVE_AVX*/ #ifdef LV_HAVE_AVX512F -static inline void volk_32f_x2_dot_prod_16i_u_avx512f(int16_t* result, const float* input, const float* taps, unsigned int num_points) { - - unsigned int number = 0; - const unsigned int sixtyfourthPoints = num_points / 64; - - float dotProduct = 0; - const float* aPtr = input; - const float* bPtr = taps; - - __m512 a0Val, a1Val, a2Val, a3Val; - __m512 b0Val, b1Val, b2Val, b3Val; - - __m512 dotProdVal0 = _mm512_setzero_ps(); - __m512 dotProdVal1 = _mm512_setzero_ps(); - __m512 dotProdVal2 = _mm512_setzero_ps(); - __m512 dotProdVal3 = _mm512_setzero_ps(); - - for(;number < sixtyfourthPoints; number++){ - - a0Val = _mm512_loadu_ps(aPtr); - a1Val = _mm512_loadu_ps(aPtr+16); - a2Val = _mm512_loadu_ps(aPtr+32); - a3Val = _mm512_loadu_ps(aPtr+48); - b0Val = _mm512_loadu_ps(bPtr); - b1Val = _mm512_loadu_ps(bPtr+16); - b2Val = _mm512_loadu_ps(bPtr+32); - b3Val = _mm512_loadu_ps(bPtr+48); - - dotProdVal0 = _mm512_fmadd_ps(a0Val, b0Val, dotProdVal0); - dotProdVal1 = _mm512_fmadd_ps(a1Val, b1Val, dotProdVal1); - dotProdVal2 = _mm512_fmadd_ps(a2Val, b2Val, dotProdVal2); - dotProdVal3 = _mm512_fmadd_ps(a3Val, b3Val, dotProdVal3); - - aPtr += 64; - bPtr += 64; - } - - dotProdVal0 = _mm512_add_ps(dotProdVal0, dotProdVal1); - dotProdVal0 = _mm512_add_ps(dotProdVal0, dotProdVal2); - dotProdVal0 = _mm512_add_ps(dotProdVal0, dotProdVal3); - - __VOLK_ATTR_ALIGNED(64) float dotProductVector[16]; - - _mm512_storeu_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector - - dotProduct = dotProductVector[0]; - dotProduct += 
dotProductVector[1]; - dotProduct += dotProductVector[2]; - dotProduct += dotProductVector[3]; - dotProduct += dotProductVector[4]; - dotProduct += dotProductVector[5]; - dotProduct += dotProductVector[6]; - dotProduct += dotProductVector[7]; - dotProduct += dotProductVector[8]; - dotProduct += dotProductVector[9]; - dotProduct += dotProductVector[10]; - dotProduct += dotProductVector[11]; - dotProduct += dotProductVector[12]; - dotProduct += dotProductVector[13]; - dotProduct += dotProductVector[14]; - dotProduct += dotProductVector[15]; - - number = sixtyfourthPoints*64; - for(;number < num_points; number++){ - dotProduct += ((*aPtr++) * (*bPtr++)); - } - - *result = (short)dotProduct; +static inline void volk_32f_x2_dot_prod_16i_u_avx512f(int16_t* result, + const float* input, + const float* taps, + unsigned int num_points) +{ + + unsigned int number = 0; + const unsigned int sixtyfourthPoints = num_points / 64; + + float dotProduct = 0; + const float* aPtr = input; + const float* bPtr = taps; + + __m512 a0Val, a1Val, a2Val, a3Val; + __m512 b0Val, b1Val, b2Val, b3Val; + + __m512 dotProdVal0 = _mm512_setzero_ps(); + __m512 dotProdVal1 = _mm512_setzero_ps(); + __m512 dotProdVal2 = _mm512_setzero_ps(); + __m512 dotProdVal3 = _mm512_setzero_ps(); + + for (; number < sixtyfourthPoints; number++) { + + a0Val = _mm512_loadu_ps(aPtr); + a1Val = _mm512_loadu_ps(aPtr + 16); + a2Val = _mm512_loadu_ps(aPtr + 32); + a3Val = _mm512_loadu_ps(aPtr + 48); + b0Val = _mm512_loadu_ps(bPtr); + b1Val = _mm512_loadu_ps(bPtr + 16); + b2Val = _mm512_loadu_ps(bPtr + 32); + b3Val = _mm512_loadu_ps(bPtr + 48); + + dotProdVal0 = _mm512_fmadd_ps(a0Val, b0Val, dotProdVal0); + dotProdVal1 = _mm512_fmadd_ps(a1Val, b1Val, dotProdVal1); + dotProdVal2 = _mm512_fmadd_ps(a2Val, b2Val, dotProdVal2); + dotProdVal3 = _mm512_fmadd_ps(a3Val, b3Val, dotProdVal3); + + aPtr += 64; + bPtr += 64; + } + + dotProdVal0 = _mm512_add_ps(dotProdVal0, dotProdVal1); + dotProdVal0 = _mm512_add_ps(dotProdVal0, dotProdVal2); + dotProdVal0 = _mm512_add_ps(dotProdVal0, dotProdVal3); + + __VOLK_ATTR_ALIGNED(64) float dotProductVector[16]; + + _mm512_storeu_ps(dotProductVector, + dotProdVal0); // Store the results back into the dot product vector + + dotProduct = dotProductVector[0]; + dotProduct += dotProductVector[1]; + dotProduct += dotProductVector[2]; + dotProduct += dotProductVector[3]; + dotProduct += dotProductVector[4]; + dotProduct += dotProductVector[5]; + dotProduct += dotProductVector[6]; + dotProduct += dotProductVector[7]; + dotProduct += dotProductVector[8]; + dotProduct += dotProductVector[9]; + dotProduct += dotProductVector[10]; + dotProduct += dotProductVector[11]; + dotProduct += dotProductVector[12]; + dotProduct += dotProductVector[13]; + dotProduct += dotProductVector[14]; + dotProduct += dotProductVector[15]; + + number = sixtyfourthPoints * 64; + for (; number < num_points; number++) { + dotProduct += ((*aPtr++) * (*bPtr++)); + } + + *result = (short)dotProduct; } #endif /*LV_HAVE_AVX512F*/ diff --git a/kernels/volk/volk_32f_x2_dot_prod_32f.h b/kernels/volk/volk_32f_x2_dot_prod_32f.h index ea0f7ba..7854031 100644 --- a/kernels/volk/volk_32f_x2_dot_prod_32f.h +++ b/kernels/volk/volk_32f_x2_dot_prod_32f.h @@ -33,8 +33,8 @@ * * Dispatcher Prototype * \code - * void volk_32f_x2_dot_prod_32f(float* result, const float* input, const float* taps, unsigned int num_points) - * \endcode + * void volk_32f_x2_dot_prod_32f(float* result, const float* input, const float* taps, + * unsigned int num_points) \endcode * * \b Inputs * \li 
input: vector of floats. @@ -45,10 +45,8 @@ * \li result: pointer to a float value to hold the dot product result. * * \b Example - * Take the dot product of an increasing vector and a vector of ones. The result is the sum of integers (0,9). - * \code - * int N = 10; - * unsigned int alignment = volk_get_alignment(); + * Take the dot product of an increasing vector and a vector of ones. The result is the + * sum of integers (0,9). \code int N = 10; unsigned int alignment = volk_get_alignment(); * float* increasing = (float*)volk_malloc(sizeof(float)*N, alignment); * float* ones = (float*)volk_malloc(sizeof(float)*N, alignment); * float* out = (float*)volk_malloc(sizeof(float)*1, alignment); @@ -73,25 +71,29 @@ #ifndef INCLUDED_volk_32f_x2_dot_prod_32f_u_H #define INCLUDED_volk_32f_x2_dot_prod_32f_u_H +#include #include -#include #ifdef LV_HAVE_GENERIC -static inline void volk_32f_x2_dot_prod_32f_generic(float * result, const float * input, const float * taps, unsigned int num_points) { +static inline void volk_32f_x2_dot_prod_32f_generic(float* result, + const float* input, + const float* taps, + unsigned int num_points) +{ - float dotProduct = 0; - const float* aPtr = input; - const float* bPtr= taps; - unsigned int number = 0; + float dotProduct = 0; + const float* aPtr = input; + const float* bPtr = taps; + unsigned int number = 0; - for(number = 0; number < num_points; number++){ - dotProduct += ((*aPtr++) * (*bPtr++)); - } + for (number = 0; number < num_points; number++) { + dotProduct += ((*aPtr++) * (*bPtr++)); + } - *result = dotProduct; + *result = dotProduct; } #endif /*LV_HAVE_GENERIC*/ @@ -100,69 +102,73 @@ static inline void volk_32f_x2_dot_prod_32f_generic(float * result, const float #ifdef LV_HAVE_SSE -static inline void volk_32f_x2_dot_prod_32f_u_sse( float* result, const float* input, const float* taps, unsigned int num_points) { - - unsigned int number = 0; - const unsigned int sixteenthPoints = num_points / 16; - - float dotProduct = 0; - const float* aPtr = input; - const float* bPtr = taps; +static inline void volk_32f_x2_dot_prod_32f_u_sse(float* result, + const float* input, + const float* taps, + unsigned int num_points) +{ - __m128 a0Val, a1Val, a2Val, a3Val; - __m128 b0Val, b1Val, b2Val, b3Val; - __m128 c0Val, c1Val, c2Val, c3Val; - - __m128 dotProdVal0 = _mm_setzero_ps(); - __m128 dotProdVal1 = _mm_setzero_ps(); - __m128 dotProdVal2 = _mm_setzero_ps(); - __m128 dotProdVal3 = _mm_setzero_ps(); - - for(;number < sixteenthPoints; number++){ - - a0Val = _mm_loadu_ps(aPtr); - a1Val = _mm_loadu_ps(aPtr+4); - a2Val = _mm_loadu_ps(aPtr+8); - a3Val = _mm_loadu_ps(aPtr+12); - b0Val = _mm_loadu_ps(bPtr); - b1Val = _mm_loadu_ps(bPtr+4); - b2Val = _mm_loadu_ps(bPtr+8); - b3Val = _mm_loadu_ps(bPtr+12); - - c0Val = _mm_mul_ps(a0Val, b0Val); - c1Val = _mm_mul_ps(a1Val, b1Val); - c2Val = _mm_mul_ps(a2Val, b2Val); - c3Val = _mm_mul_ps(a3Val, b3Val); - - dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0); - dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1); - dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2); - dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3); + unsigned int number = 0; + const unsigned int sixteenthPoints = num_points / 16; - aPtr += 16; - bPtr += 16; - } + float dotProduct = 0; + const float* aPtr = input; + const float* bPtr = taps; + + __m128 a0Val, a1Val, a2Val, a3Val; + __m128 b0Val, b1Val, b2Val, b3Val; + __m128 c0Val, c1Val, c2Val, c3Val; + + __m128 dotProdVal0 = _mm_setzero_ps(); + __m128 dotProdVal1 = _mm_setzero_ps(); + __m128 dotProdVal2 = _mm_setzero_ps(); + __m128 
dotProdVal3 = _mm_setzero_ps(); + + for (; number < sixteenthPoints; number++) { + + a0Val = _mm_loadu_ps(aPtr); + a1Val = _mm_loadu_ps(aPtr + 4); + a2Val = _mm_loadu_ps(aPtr + 8); + a3Val = _mm_loadu_ps(aPtr + 12); + b0Val = _mm_loadu_ps(bPtr); + b1Val = _mm_loadu_ps(bPtr + 4); + b2Val = _mm_loadu_ps(bPtr + 8); + b3Val = _mm_loadu_ps(bPtr + 12); + + c0Val = _mm_mul_ps(a0Val, b0Val); + c1Val = _mm_mul_ps(a1Val, b1Val); + c2Val = _mm_mul_ps(a2Val, b2Val); + c3Val = _mm_mul_ps(a3Val, b3Val); + + dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0); + dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1); + dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2); + dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3); - dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1); - dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2); - dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3); + aPtr += 16; + bPtr += 16; + } - __VOLK_ATTR_ALIGNED(16) float dotProductVector[4]; + dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1); + dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2); + dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3); - _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector + __VOLK_ATTR_ALIGNED(16) float dotProductVector[4]; - dotProduct = dotProductVector[0]; - dotProduct += dotProductVector[1]; - dotProduct += dotProductVector[2]; - dotProduct += dotProductVector[3]; + _mm_store_ps(dotProductVector, + dotProdVal0); // Store the results back into the dot product vector - number = sixteenthPoints*16; - for(;number < num_points; number++){ - dotProduct += ((*aPtr++) * (*bPtr++)); - } + dotProduct = dotProductVector[0]; + dotProduct += dotProductVector[1]; + dotProduct += dotProductVector[2]; + dotProduct += dotProductVector[3]; - *result = dotProduct; + number = sixteenthPoints * 16; + for (; number < num_points; number++) { + dotProduct += ((*aPtr++) * (*bPtr++)); + } + *result = dotProduct; } #endif /*LV_HAVE_SSE*/ @@ -171,127 +177,145 @@ static inline void volk_32f_x2_dot_prod_32f_u_sse( float* result, const float* #include -static inline void volk_32f_x2_dot_prod_32f_u_sse3(float * result, const float * input, const float * taps, unsigned int num_points) { - unsigned int number = 0; - const unsigned int sixteenthPoints = num_points / 16; - - float dotProduct = 0; - const float* aPtr = input; - const float* bPtr = taps; - - __m128 a0Val, a1Val, a2Val, a3Val; - __m128 b0Val, b1Val, b2Val, b3Val; - __m128 c0Val, c1Val, c2Val, c3Val; - - __m128 dotProdVal0 = _mm_setzero_ps(); - __m128 dotProdVal1 = _mm_setzero_ps(); - __m128 dotProdVal2 = _mm_setzero_ps(); - __m128 dotProdVal3 = _mm_setzero_ps(); - - for(;number < sixteenthPoints; number++){ - - a0Val = _mm_loadu_ps(aPtr); - a1Val = _mm_loadu_ps(aPtr+4); - a2Val = _mm_loadu_ps(aPtr+8); - a3Val = _mm_loadu_ps(aPtr+12); - b0Val = _mm_loadu_ps(bPtr); - b1Val = _mm_loadu_ps(bPtr+4); - b2Val = _mm_loadu_ps(bPtr+8); - b3Val = _mm_loadu_ps(bPtr+12); - - c0Val = _mm_mul_ps(a0Val, b0Val); - c1Val = _mm_mul_ps(a1Val, b1Val); - c2Val = _mm_mul_ps(a2Val, b2Val); - c3Val = _mm_mul_ps(a3Val, b3Val); - - dotProdVal0 = _mm_add_ps(dotProdVal0, c0Val); - dotProdVal1 = _mm_add_ps(dotProdVal1, c1Val); - dotProdVal2 = _mm_add_ps(dotProdVal2, c2Val); - dotProdVal3 = _mm_add_ps(dotProdVal3, c3Val); - - aPtr += 16; - bPtr += 16; - } - - dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1); - dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2); - dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3); - - __VOLK_ATTR_ALIGNED(16) float dotProductVector[4]; - 
_mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector - - dotProduct = dotProductVector[0]; - dotProduct += dotProductVector[1]; - dotProduct += dotProductVector[2]; - dotProduct += dotProductVector[3]; - - number = sixteenthPoints*16; - for(;number < num_points; number++){ - dotProduct += ((*aPtr++) * (*bPtr++)); - } - - *result = dotProduct; -} - -#endif /*LV_HAVE_SSE3*/ +static inline void volk_32f_x2_dot_prod_32f_u_sse3(float* result, + const float* input, + const float* taps, + unsigned int num_points) +{ + unsigned int number = 0; + const unsigned int sixteenthPoints = num_points / 16; -#ifdef LV_HAVE_SSE4_1 + float dotProduct = 0; + const float* aPtr = input; + const float* bPtr = taps; + + __m128 a0Val, a1Val, a2Val, a3Val; + __m128 b0Val, b1Val, b2Val, b3Val; + __m128 c0Val, c1Val, c2Val, c3Val; + + __m128 dotProdVal0 = _mm_setzero_ps(); + __m128 dotProdVal1 = _mm_setzero_ps(); + __m128 dotProdVal2 = _mm_setzero_ps(); + __m128 dotProdVal3 = _mm_setzero_ps(); + + for (; number < sixteenthPoints; number++) { + + a0Val = _mm_loadu_ps(aPtr); + a1Val = _mm_loadu_ps(aPtr + 4); + a2Val = _mm_loadu_ps(aPtr + 8); + a3Val = _mm_loadu_ps(aPtr + 12); + b0Val = _mm_loadu_ps(bPtr); + b1Val = _mm_loadu_ps(bPtr + 4); + b2Val = _mm_loadu_ps(bPtr + 8); + b3Val = _mm_loadu_ps(bPtr + 12); + + c0Val = _mm_mul_ps(a0Val, b0Val); + c1Val = _mm_mul_ps(a1Val, b1Val); + c2Val = _mm_mul_ps(a2Val, b2Val); + c3Val = _mm_mul_ps(a3Val, b3Val); + + dotProdVal0 = _mm_add_ps(dotProdVal0, c0Val); + dotProdVal1 = _mm_add_ps(dotProdVal1, c1Val); + dotProdVal2 = _mm_add_ps(dotProdVal2, c2Val); + dotProdVal3 = _mm_add_ps(dotProdVal3, c3Val); -#include + aPtr += 16; + bPtr += 16; + } -static inline void volk_32f_x2_dot_prod_32f_u_sse4_1(float * result, const float * input, const float* taps, unsigned int num_points) { - unsigned int number = 0; - const unsigned int sixteenthPoints = num_points / 16; + dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1); + dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2); + dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3); - float dotProduct = 0; - const float* aPtr = input; - const float* bPtr = taps; + __VOLK_ATTR_ALIGNED(16) float dotProductVector[4]; + _mm_store_ps(dotProductVector, + dotProdVal0); // Store the results back into the dot product vector - __m128 aVal1, bVal1, cVal1; - __m128 aVal2, bVal2, cVal2; - __m128 aVal3, bVal3, cVal3; - __m128 aVal4, bVal4, cVal4; + dotProduct = dotProductVector[0]; + dotProduct += dotProductVector[1]; + dotProduct += dotProductVector[2]; + dotProduct += dotProductVector[3]; - __m128 dotProdVal = _mm_setzero_ps(); + number = sixteenthPoints * 16; + for (; number < num_points; number++) { + dotProduct += ((*aPtr++) * (*bPtr++)); + } - for(;number < sixteenthPoints; number++){ + *result = dotProduct; +} - aVal1 = _mm_loadu_ps(aPtr); aPtr += 4; - aVal2 = _mm_loadu_ps(aPtr); aPtr += 4; - aVal3 = _mm_loadu_ps(aPtr); aPtr += 4; - aVal4 = _mm_loadu_ps(aPtr); aPtr += 4; +#endif /*LV_HAVE_SSE3*/ - bVal1 = _mm_loadu_ps(bPtr); bPtr += 4; - bVal2 = _mm_loadu_ps(bPtr); bPtr += 4; - bVal3 = _mm_loadu_ps(bPtr); bPtr += 4; - bVal4 = _mm_loadu_ps(bPtr); bPtr += 4; +#ifdef LV_HAVE_SSE4_1 - cVal1 = _mm_dp_ps(aVal1, bVal1, 0xF1); - cVal2 = _mm_dp_ps(aVal2, bVal2, 0xF2); - cVal3 = _mm_dp_ps(aVal3, bVal3, 0xF4); - cVal4 = _mm_dp_ps(aVal4, bVal4, 0xF8); +#include - cVal1 = _mm_or_ps(cVal1, cVal2); - cVal3 = _mm_or_ps(cVal3, cVal4); - cVal1 = _mm_or_ps(cVal1, cVal3); +static inline void 
volk_32f_x2_dot_prod_32f_u_sse4_1(float* result, + const float* input, + const float* taps, + unsigned int num_points) +{ + unsigned int number = 0; + const unsigned int sixteenthPoints = num_points / 16; - dotProdVal = _mm_add_ps(dotProdVal, cVal1); - } + float dotProduct = 0; + const float* aPtr = input; + const float* bPtr = taps; + + __m128 aVal1, bVal1, cVal1; + __m128 aVal2, bVal2, cVal2; + __m128 aVal3, bVal3, cVal3; + __m128 aVal4, bVal4, cVal4; + + __m128 dotProdVal = _mm_setzero_ps(); + + for (; number < sixteenthPoints; number++) { + + aVal1 = _mm_loadu_ps(aPtr); + aPtr += 4; + aVal2 = _mm_loadu_ps(aPtr); + aPtr += 4; + aVal3 = _mm_loadu_ps(aPtr); + aPtr += 4; + aVal4 = _mm_loadu_ps(aPtr); + aPtr += 4; + + bVal1 = _mm_loadu_ps(bPtr); + bPtr += 4; + bVal2 = _mm_loadu_ps(bPtr); + bPtr += 4; + bVal3 = _mm_loadu_ps(bPtr); + bPtr += 4; + bVal4 = _mm_loadu_ps(bPtr); + bPtr += 4; + + cVal1 = _mm_dp_ps(aVal1, bVal1, 0xF1); + cVal2 = _mm_dp_ps(aVal2, bVal2, 0xF2); + cVal3 = _mm_dp_ps(aVal3, bVal3, 0xF4); + cVal4 = _mm_dp_ps(aVal4, bVal4, 0xF8); + + cVal1 = _mm_or_ps(cVal1, cVal2); + cVal3 = _mm_or_ps(cVal3, cVal4); + cVal1 = _mm_or_ps(cVal1, cVal3); + + dotProdVal = _mm_add_ps(dotProdVal, cVal1); + } - __VOLK_ATTR_ALIGNED(16) float dotProductVector[4]; - _mm_store_ps(dotProductVector, dotProdVal); // Store the results back into the dot product vector + __VOLK_ATTR_ALIGNED(16) float dotProductVector[4]; + _mm_store_ps(dotProductVector, + dotProdVal); // Store the results back into the dot product vector - dotProduct = dotProductVector[0]; - dotProduct += dotProductVector[1]; - dotProduct += dotProductVector[2]; - dotProduct += dotProductVector[3]; + dotProduct = dotProductVector[0]; + dotProduct += dotProductVector[1]; + dotProduct += dotProductVector[2]; + dotProduct += dotProductVector[3]; - number = sixteenthPoints * 16; - for(;number < num_points; number++){ - dotProduct += ((*aPtr++) * (*bPtr++)); - } + number = sixteenthPoints * 16; + for (; number < num_points; number++) { + dotProduct += ((*aPtr++) * (*bPtr++)); + } - *result = dotProduct; + *result = dotProduct; } #endif /*LV_HAVE_SSE4_1*/ @@ -300,147 +324,154 @@ static inline void volk_32f_x2_dot_prod_32f_u_sse4_1(float * result, const float #include -static inline void volk_32f_x2_dot_prod_32f_u_avx( float* result, const float* input, const float* taps, unsigned int num_points) { +static inline void volk_32f_x2_dot_prod_32f_u_avx(float* result, + const float* input, + const float* taps, + unsigned int num_points) +{ - unsigned int number = 0; - const unsigned int sixteenthPoints = num_points / 16; - - float dotProduct = 0; - const float* aPtr = input; - const float* bPtr = taps; + unsigned int number = 0; + const unsigned int sixteenthPoints = num_points / 16; - __m256 a0Val, a1Val; - __m256 b0Val, b1Val; - __m256 c0Val, c1Val; + float dotProduct = 0; + const float* aPtr = input; + const float* bPtr = taps; - __m256 dotProdVal0 = _mm256_setzero_ps(); - __m256 dotProdVal1 = _mm256_setzero_ps(); + __m256 a0Val, a1Val; + __m256 b0Val, b1Val; + __m256 c0Val, c1Val; - for(;number < sixteenthPoints; number++){ + __m256 dotProdVal0 = _mm256_setzero_ps(); + __m256 dotProdVal1 = _mm256_setzero_ps(); - a0Val = _mm256_loadu_ps(aPtr); - a1Val = _mm256_loadu_ps(aPtr+8); - b0Val = _mm256_loadu_ps(bPtr); - b1Val = _mm256_loadu_ps(bPtr+8); + for (; number < sixteenthPoints; number++) { - c0Val = _mm256_mul_ps(a0Val, b0Val); - c1Val = _mm256_mul_ps(a1Val, b1Val); + a0Val = _mm256_loadu_ps(aPtr); + a1Val = _mm256_loadu_ps(aPtr + 8); + b0Val = 
_mm256_loadu_ps(bPtr); + b1Val = _mm256_loadu_ps(bPtr + 8); - dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0); - dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1); + c0Val = _mm256_mul_ps(a0Val, b0Val); + c1Val = _mm256_mul_ps(a1Val, b1Val); - aPtr += 16; - bPtr += 16; - } + dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0); + dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1); - dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1); + aPtr += 16; + bPtr += 16; + } - __VOLK_ATTR_ALIGNED(32) float dotProductVector[8]; + dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1); - _mm256_storeu_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector + __VOLK_ATTR_ALIGNED(32) float dotProductVector[8]; - dotProduct = dotProductVector[0]; - dotProduct += dotProductVector[1]; - dotProduct += dotProductVector[2]; - dotProduct += dotProductVector[3]; - dotProduct += dotProductVector[4]; - dotProduct += dotProductVector[5]; - dotProduct += dotProductVector[6]; - dotProduct += dotProductVector[7]; + _mm256_storeu_ps(dotProductVector, + dotProdVal0); // Store the results back into the dot product vector - number = sixteenthPoints*16; - for(;number < num_points; number++){ - dotProduct += ((*aPtr++) * (*bPtr++)); - } + dotProduct = dotProductVector[0]; + dotProduct += dotProductVector[1]; + dotProduct += dotProductVector[2]; + dotProduct += dotProductVector[3]; + dotProduct += dotProductVector[4]; + dotProduct += dotProductVector[5]; + dotProduct += dotProductVector[6]; + dotProduct += dotProductVector[7]; - *result = dotProduct; + number = sixteenthPoints * 16; + for (; number < num_points; number++) { + dotProduct += ((*aPtr++) * (*bPtr++)); + } + *result = dotProduct; } #endif /*LV_HAVE_AVX*/ #if LV_HAVE_AVX2 && LV_HAVE_FMA #include -static inline void volk_32f_x2_dot_prod_32f_u_avx2_fma(float * result, const float * input, const float* taps, unsigned int num_points){ - unsigned int number; - const unsigned int eighthPoints = num_points / 8; +static inline void volk_32f_x2_dot_prod_32f_u_avx2_fma(float* result, + const float* input, + const float* taps, + unsigned int num_points) +{ + unsigned int number; + const unsigned int eighthPoints = num_points / 8; - const float* aPtr = input; - const float* bPtr = taps; - - __m256 dotProdVal = _mm256_setzero_ps(); - __m256 aVal1, bVal1; + const float* aPtr = input; + const float* bPtr = taps; - for (number = 0; number < eighthPoints; number++ ) { + __m256 dotProdVal = _mm256_setzero_ps(); + __m256 aVal1, bVal1; - aVal1 = _mm256_loadu_ps(aPtr); - bVal1 = _mm256_loadu_ps(bPtr); - aPtr += 8; - bPtr += 8; + for (number = 0; number < eighthPoints; number++) { - dotProdVal = _mm256_fmadd_ps(aVal1, bVal1, dotProdVal); - } + aVal1 = _mm256_loadu_ps(aPtr); + bVal1 = _mm256_loadu_ps(bPtr); + aPtr += 8; + bPtr += 8; - __VOLK_ATTR_ALIGNED(32) float dotProductVector[8]; - _mm256_storeu_ps(dotProductVector, dotProdVal); // Store the results back into the dot product vector - _mm256_zeroupper(); + dotProdVal = _mm256_fmadd_ps(aVal1, bVal1, dotProdVal); + } - float dotProduct = - dotProductVector[0] + dotProductVector[1] + - dotProductVector[2] + dotProductVector[3] + - dotProductVector[4] + dotProductVector[5] + - dotProductVector[6] + dotProductVector[7]; + __VOLK_ATTR_ALIGNED(32) float dotProductVector[8]; + _mm256_storeu_ps(dotProductVector, + dotProdVal); // Store the results back into the dot product vector + _mm256_zeroupper(); - for(number = eighthPoints * 8; number < num_points; number++){ - dotProduct += ((*aPtr++) * (*bPtr++)); - } + float 
dotProduct = dotProductVector[0] + dotProductVector[1] + dotProductVector[2] + + dotProductVector[3] + dotProductVector[4] + dotProductVector[5] + + dotProductVector[6] + dotProductVector[7]; - *result = dotProduct; + for (number = eighthPoints * 8; number < num_points; number++) { + dotProduct += ((*aPtr++) * (*bPtr++)); + } + *result = dotProduct; } #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA */ #if LV_HAVE_AVX512F #include -static inline void volk_32f_x2_dot_prod_32f_u_avx512f(float * result, const float * input, const float* taps, unsigned int num_points){ - unsigned int number; - const unsigned int sixteenthPoints = num_points / 16; - - const float* aPtr = input; - const float* bPtr = taps; +static inline void volk_32f_x2_dot_prod_32f_u_avx512f(float* result, + const float* input, + const float* taps, + unsigned int num_points) +{ + unsigned int number; + const unsigned int sixteenthPoints = num_points / 16; - __m512 dotProdVal = _mm512_setzero_ps(); - __m512 aVal1, bVal1; + const float* aPtr = input; + const float* bPtr = taps; - for (number = 0; number < sixteenthPoints; number++ ) { + __m512 dotProdVal = _mm512_setzero_ps(); + __m512 aVal1, bVal1; - aVal1 = _mm512_loadu_ps(aPtr); - bVal1 = _mm512_loadu_ps(bPtr); - aPtr += 16; - bPtr += 16; + for (number = 0; number < sixteenthPoints; number++) { - dotProdVal = _mm512_fmadd_ps(aVal1, bVal1, dotProdVal); - } + aVal1 = _mm512_loadu_ps(aPtr); + bVal1 = _mm512_loadu_ps(bPtr); + aPtr += 16; + bPtr += 16; - __VOLK_ATTR_ALIGNED(64) float dotProductVector[16]; - _mm512_storeu_ps(dotProductVector, dotProdVal); // Store the results back into the dot product vector + dotProdVal = _mm512_fmadd_ps(aVal1, bVal1, dotProdVal); + } - float dotProduct = - dotProductVector[0] + dotProductVector[1] + - dotProductVector[2] + dotProductVector[3] + - dotProductVector[4] + dotProductVector[5] + - dotProductVector[6] + dotProductVector[7] + - dotProductVector[8] + dotProductVector[9] + - dotProductVector[10] + dotProductVector[11] + - dotProductVector[12] + dotProductVector[13] + - dotProductVector[14] + dotProductVector[15]; + __VOLK_ATTR_ALIGNED(64) float dotProductVector[16]; + _mm512_storeu_ps(dotProductVector, + dotProdVal); // Store the results back into the dot product vector - for(number = sixteenthPoints * 16; number < num_points; number++){ - dotProduct += ((*aPtr++) * (*bPtr++)); - } + float dotProduct = dotProductVector[0] + dotProductVector[1] + dotProductVector[2] + + dotProductVector[3] + dotProductVector[4] + dotProductVector[5] + + dotProductVector[6] + dotProductVector[7] + dotProductVector[8] + + dotProductVector[9] + dotProductVector[10] + dotProductVector[11] + + dotProductVector[12] + dotProductVector[13] + + dotProductVector[14] + dotProductVector[15]; - *result = dotProduct; + for (number = sixteenthPoints * 16; number < num_points; number++) { + dotProduct += ((*aPtr++) * (*bPtr++)); + } + *result = dotProduct; } #endif /* LV_HAVE_AVX512F */ @@ -449,25 +480,29 @@ static inline void volk_32f_x2_dot_prod_32f_u_avx512f(float * result, const floa #ifndef INCLUDED_volk_32f_x2_dot_prod_32f_a_H #define INCLUDED_volk_32f_x2_dot_prod_32f_a_H +#include #include -#include #ifdef LV_HAVE_GENERIC -static inline void volk_32f_x2_dot_prod_32f_a_generic(float * result, const float * input, const float * taps, unsigned int num_points) { +static inline void volk_32f_x2_dot_prod_32f_a_generic(float* result, + const float* input, + const float* taps, + unsigned int num_points) +{ - float dotProduct = 0; - const float* aPtr = input; - const float* bPtr= taps; 
- unsigned int number = 0; + float dotProduct = 0; + const float* aPtr = input; + const float* bPtr = taps; + unsigned int number = 0; - for(number = 0; number < num_points; number++){ - dotProduct += ((*aPtr++) * (*bPtr++)); - } + for (number = 0; number < num_points; number++) { + dotProduct += ((*aPtr++) * (*bPtr++)); + } - *result = dotProduct; + *result = dotProduct; } #endif /*LV_HAVE_GENERIC*/ @@ -476,69 +511,73 @@ static inline void volk_32f_x2_dot_prod_32f_a_generic(float * result, const floa #ifdef LV_HAVE_SSE -static inline void volk_32f_x2_dot_prod_32f_a_sse( float* result, const float* input, const float* taps, unsigned int num_points) { - - unsigned int number = 0; - const unsigned int sixteenthPoints = num_points / 16; - - float dotProduct = 0; - const float* aPtr = input; - const float* bPtr = taps; - - __m128 a0Val, a1Val, a2Val, a3Val; - __m128 b0Val, b1Val, b2Val, b3Val; - __m128 c0Val, c1Val, c2Val, c3Val; - - __m128 dotProdVal0 = _mm_setzero_ps(); - __m128 dotProdVal1 = _mm_setzero_ps(); - __m128 dotProdVal2 = _mm_setzero_ps(); - __m128 dotProdVal3 = _mm_setzero_ps(); +static inline void volk_32f_x2_dot_prod_32f_a_sse(float* result, + const float* input, + const float* taps, + unsigned int num_points) +{ - for(;number < sixteenthPoints; number++){ - - a0Val = _mm_load_ps(aPtr); - a1Val = _mm_load_ps(aPtr+4); - a2Val = _mm_load_ps(aPtr+8); - a3Val = _mm_load_ps(aPtr+12); - b0Val = _mm_load_ps(bPtr); - b1Val = _mm_load_ps(bPtr+4); - b2Val = _mm_load_ps(bPtr+8); - b3Val = _mm_load_ps(bPtr+12); - - c0Val = _mm_mul_ps(a0Val, b0Val); - c1Val = _mm_mul_ps(a1Val, b1Val); - c2Val = _mm_mul_ps(a2Val, b2Val); - c3Val = _mm_mul_ps(a3Val, b3Val); - - dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0); - dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1); - dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2); - dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3); + unsigned int number = 0; + const unsigned int sixteenthPoints = num_points / 16; - aPtr += 16; - bPtr += 16; - } + float dotProduct = 0; + const float* aPtr = input; + const float* bPtr = taps; + + __m128 a0Val, a1Val, a2Val, a3Val; + __m128 b0Val, b1Val, b2Val, b3Val; + __m128 c0Val, c1Val, c2Val, c3Val; + + __m128 dotProdVal0 = _mm_setzero_ps(); + __m128 dotProdVal1 = _mm_setzero_ps(); + __m128 dotProdVal2 = _mm_setzero_ps(); + __m128 dotProdVal3 = _mm_setzero_ps(); + + for (; number < sixteenthPoints; number++) { + + a0Val = _mm_load_ps(aPtr); + a1Val = _mm_load_ps(aPtr + 4); + a2Val = _mm_load_ps(aPtr + 8); + a3Val = _mm_load_ps(aPtr + 12); + b0Val = _mm_load_ps(bPtr); + b1Val = _mm_load_ps(bPtr + 4); + b2Val = _mm_load_ps(bPtr + 8); + b3Val = _mm_load_ps(bPtr + 12); + + c0Val = _mm_mul_ps(a0Val, b0Val); + c1Val = _mm_mul_ps(a1Val, b1Val); + c2Val = _mm_mul_ps(a2Val, b2Val); + c3Val = _mm_mul_ps(a3Val, b3Val); + + dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0); + dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1); + dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2); + dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3); - dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1); - dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2); - dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3); + aPtr += 16; + bPtr += 16; + } - __VOLK_ATTR_ALIGNED(16) float dotProductVector[4]; + dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1); + dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2); + dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3); - _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector + __VOLK_ATTR_ALIGNED(16) float 
dotProductVector[4]; - dotProduct = dotProductVector[0]; - dotProduct += dotProductVector[1]; - dotProduct += dotProductVector[2]; - dotProduct += dotProductVector[3]; + _mm_store_ps(dotProductVector, + dotProdVal0); // Store the results back into the dot product vector - number = sixteenthPoints*16; - for(;number < num_points; number++){ - dotProduct += ((*aPtr++) * (*bPtr++)); - } + dotProduct = dotProductVector[0]; + dotProduct += dotProductVector[1]; + dotProduct += dotProductVector[2]; + dotProduct += dotProductVector[3]; - *result = dotProduct; + number = sixteenthPoints * 16; + for (; number < num_points; number++) { + dotProduct += ((*aPtr++) * (*bPtr++)); + } + *result = dotProduct; } #endif /*LV_HAVE_SSE*/ @@ -547,127 +586,145 @@ static inline void volk_32f_x2_dot_prod_32f_a_sse( float* result, const float* #include -static inline void volk_32f_x2_dot_prod_32f_a_sse3(float * result, const float * input, const float * taps, unsigned int num_points) { - unsigned int number = 0; - const unsigned int sixteenthPoints = num_points / 16; - - float dotProduct = 0; - const float* aPtr = input; - const float* bPtr = taps; - - __m128 a0Val, a1Val, a2Val, a3Val; - __m128 b0Val, b1Val, b2Val, b3Val; - __m128 c0Val, c1Val, c2Val, c3Val; - - __m128 dotProdVal0 = _mm_setzero_ps(); - __m128 dotProdVal1 = _mm_setzero_ps(); - __m128 dotProdVal2 = _mm_setzero_ps(); - __m128 dotProdVal3 = _mm_setzero_ps(); - - for(;number < sixteenthPoints; number++){ - - a0Val = _mm_load_ps(aPtr); - a1Val = _mm_load_ps(aPtr+4); - a2Val = _mm_load_ps(aPtr+8); - a3Val = _mm_load_ps(aPtr+12); - b0Val = _mm_load_ps(bPtr); - b1Val = _mm_load_ps(bPtr+4); - b2Val = _mm_load_ps(bPtr+8); - b3Val = _mm_load_ps(bPtr+12); - - c0Val = _mm_mul_ps(a0Val, b0Val); - c1Val = _mm_mul_ps(a1Val, b1Val); - c2Val = _mm_mul_ps(a2Val, b2Val); - c3Val = _mm_mul_ps(a3Val, b3Val); - - dotProdVal0 = _mm_add_ps(dotProdVal0, c0Val); - dotProdVal1 = _mm_add_ps(dotProdVal1, c1Val); - dotProdVal2 = _mm_add_ps(dotProdVal2, c2Val); - dotProdVal3 = _mm_add_ps(dotProdVal3, c3Val); - - aPtr += 16; - bPtr += 16; - } - - dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1); - dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2); - dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3); - - __VOLK_ATTR_ALIGNED(16) float dotProductVector[4]; - _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector - - dotProduct = dotProductVector[0]; - dotProduct += dotProductVector[1]; - dotProduct += dotProductVector[2]; - dotProduct += dotProductVector[3]; - - number = sixteenthPoints*16; - for(;number < num_points; number++){ - dotProduct += ((*aPtr++) * (*bPtr++)); - } - - *result = dotProduct; -} - -#endif /*LV_HAVE_SSE3*/ +static inline void volk_32f_x2_dot_prod_32f_a_sse3(float* result, + const float* input, + const float* taps, + unsigned int num_points) +{ + unsigned int number = 0; + const unsigned int sixteenthPoints = num_points / 16; -#ifdef LV_HAVE_SSE4_1 + float dotProduct = 0; + const float* aPtr = input; + const float* bPtr = taps; + + __m128 a0Val, a1Val, a2Val, a3Val; + __m128 b0Val, b1Val, b2Val, b3Val; + __m128 c0Val, c1Val, c2Val, c3Val; + + __m128 dotProdVal0 = _mm_setzero_ps(); + __m128 dotProdVal1 = _mm_setzero_ps(); + __m128 dotProdVal2 = _mm_setzero_ps(); + __m128 dotProdVal3 = _mm_setzero_ps(); + + for (; number < sixteenthPoints; number++) { + + a0Val = _mm_load_ps(aPtr); + a1Val = _mm_load_ps(aPtr + 4); + a2Val = _mm_load_ps(aPtr + 8); + a3Val = _mm_load_ps(aPtr + 12); + b0Val = _mm_load_ps(bPtr); + b1Val = 
_mm_load_ps(bPtr + 4); + b2Val = _mm_load_ps(bPtr + 8); + b3Val = _mm_load_ps(bPtr + 12); + + c0Val = _mm_mul_ps(a0Val, b0Val); + c1Val = _mm_mul_ps(a1Val, b1Val); + c2Val = _mm_mul_ps(a2Val, b2Val); + c3Val = _mm_mul_ps(a3Val, b3Val); + + dotProdVal0 = _mm_add_ps(dotProdVal0, c0Val); + dotProdVal1 = _mm_add_ps(dotProdVal1, c1Val); + dotProdVal2 = _mm_add_ps(dotProdVal2, c2Val); + dotProdVal3 = _mm_add_ps(dotProdVal3, c3Val); -#include + aPtr += 16; + bPtr += 16; + } -static inline void volk_32f_x2_dot_prod_32f_a_sse4_1(float * result, const float * input, const float* taps, unsigned int num_points) { - unsigned int number = 0; - const unsigned int sixteenthPoints = num_points / 16; + dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1); + dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2); + dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3); - float dotProduct = 0; - const float* aPtr = input; - const float* bPtr = taps; + __VOLK_ATTR_ALIGNED(16) float dotProductVector[4]; + _mm_store_ps(dotProductVector, + dotProdVal0); // Store the results back into the dot product vector - __m128 aVal1, bVal1, cVal1; - __m128 aVal2, bVal2, cVal2; - __m128 aVal3, bVal3, cVal3; - __m128 aVal4, bVal4, cVal4; + dotProduct = dotProductVector[0]; + dotProduct += dotProductVector[1]; + dotProduct += dotProductVector[2]; + dotProduct += dotProductVector[3]; - __m128 dotProdVal = _mm_setzero_ps(); + number = sixteenthPoints * 16; + for (; number < num_points; number++) { + dotProduct += ((*aPtr++) * (*bPtr++)); + } - for(;number < sixteenthPoints; number++){ + *result = dotProduct; +} - aVal1 = _mm_load_ps(aPtr); aPtr += 4; - aVal2 = _mm_load_ps(aPtr); aPtr += 4; - aVal3 = _mm_load_ps(aPtr); aPtr += 4; - aVal4 = _mm_load_ps(aPtr); aPtr += 4; +#endif /*LV_HAVE_SSE3*/ - bVal1 = _mm_load_ps(bPtr); bPtr += 4; - bVal2 = _mm_load_ps(bPtr); bPtr += 4; - bVal3 = _mm_load_ps(bPtr); bPtr += 4; - bVal4 = _mm_load_ps(bPtr); bPtr += 4; +#ifdef LV_HAVE_SSE4_1 - cVal1 = _mm_dp_ps(aVal1, bVal1, 0xF1); - cVal2 = _mm_dp_ps(aVal2, bVal2, 0xF2); - cVal3 = _mm_dp_ps(aVal3, bVal3, 0xF4); - cVal4 = _mm_dp_ps(aVal4, bVal4, 0xF8); +#include - cVal1 = _mm_or_ps(cVal1, cVal2); - cVal3 = _mm_or_ps(cVal3, cVal4); - cVal1 = _mm_or_ps(cVal1, cVal3); +static inline void volk_32f_x2_dot_prod_32f_a_sse4_1(float* result, + const float* input, + const float* taps, + unsigned int num_points) +{ + unsigned int number = 0; + const unsigned int sixteenthPoints = num_points / 16; - dotProdVal = _mm_add_ps(dotProdVal, cVal1); - } + float dotProduct = 0; + const float* aPtr = input; + const float* bPtr = taps; + + __m128 aVal1, bVal1, cVal1; + __m128 aVal2, bVal2, cVal2; + __m128 aVal3, bVal3, cVal3; + __m128 aVal4, bVal4, cVal4; + + __m128 dotProdVal = _mm_setzero_ps(); + + for (; number < sixteenthPoints; number++) { + + aVal1 = _mm_load_ps(aPtr); + aPtr += 4; + aVal2 = _mm_load_ps(aPtr); + aPtr += 4; + aVal3 = _mm_load_ps(aPtr); + aPtr += 4; + aVal4 = _mm_load_ps(aPtr); + aPtr += 4; + + bVal1 = _mm_load_ps(bPtr); + bPtr += 4; + bVal2 = _mm_load_ps(bPtr); + bPtr += 4; + bVal3 = _mm_load_ps(bPtr); + bPtr += 4; + bVal4 = _mm_load_ps(bPtr); + bPtr += 4; + + cVal1 = _mm_dp_ps(aVal1, bVal1, 0xF1); + cVal2 = _mm_dp_ps(aVal2, bVal2, 0xF2); + cVal3 = _mm_dp_ps(aVal3, bVal3, 0xF4); + cVal4 = _mm_dp_ps(aVal4, bVal4, 0xF8); + + cVal1 = _mm_or_ps(cVal1, cVal2); + cVal3 = _mm_or_ps(cVal3, cVal4); + cVal1 = _mm_or_ps(cVal1, cVal3); + + dotProdVal = _mm_add_ps(dotProdVal, cVal1); + } - __VOLK_ATTR_ALIGNED(16) float dotProductVector[4]; - 
_mm_store_ps(dotProductVector, dotProdVal); // Store the results back into the dot product vector + __VOLK_ATTR_ALIGNED(16) float dotProductVector[4]; + _mm_store_ps(dotProductVector, + dotProdVal); // Store the results back into the dot product vector - dotProduct = dotProductVector[0]; - dotProduct += dotProductVector[1]; - dotProduct += dotProductVector[2]; - dotProduct += dotProductVector[3]; + dotProduct = dotProductVector[0]; + dotProduct += dotProductVector[1]; + dotProduct += dotProductVector[2]; + dotProduct += dotProductVector[3]; - number = sixteenthPoints * 16; - for(;number < num_points; number++){ - dotProduct += ((*aPtr++) * (*bPtr++)); - } + number = sixteenthPoints * 16; + for (; number < num_points; number++) { + dotProduct += ((*aPtr++) * (*bPtr++)); + } - *result = dotProduct; + *result = dotProduct; } #endif /*LV_HAVE_SSE4_1*/ @@ -676,159 +733,170 @@ static inline void volk_32f_x2_dot_prod_32f_a_sse4_1(float * result, const float #include -static inline void volk_32f_x2_dot_prod_32f_a_avx( float* result, const float* input, const float* taps, unsigned int num_points) { +static inline void volk_32f_x2_dot_prod_32f_a_avx(float* result, + const float* input, + const float* taps, + unsigned int num_points) +{ - unsigned int number = 0; - const unsigned int sixteenthPoints = num_points / 16; - - float dotProduct = 0; - const float* aPtr = input; - const float* bPtr = taps; + unsigned int number = 0; + const unsigned int sixteenthPoints = num_points / 16; - __m256 a0Val, a1Val; - __m256 b0Val, b1Val; - __m256 c0Val, c1Val; + float dotProduct = 0; + const float* aPtr = input; + const float* bPtr = taps; - __m256 dotProdVal0 = _mm256_setzero_ps(); - __m256 dotProdVal1 = _mm256_setzero_ps(); + __m256 a0Val, a1Val; + __m256 b0Val, b1Val; + __m256 c0Val, c1Val; - for(;number < sixteenthPoints; number++){ + __m256 dotProdVal0 = _mm256_setzero_ps(); + __m256 dotProdVal1 = _mm256_setzero_ps(); - a0Val = _mm256_load_ps(aPtr); - a1Val = _mm256_load_ps(aPtr+8); - b0Val = _mm256_load_ps(bPtr); - b1Val = _mm256_load_ps(bPtr+8); + for (; number < sixteenthPoints; number++) { - c0Val = _mm256_mul_ps(a0Val, b0Val); - c1Val = _mm256_mul_ps(a1Val, b1Val); + a0Val = _mm256_load_ps(aPtr); + a1Val = _mm256_load_ps(aPtr + 8); + b0Val = _mm256_load_ps(bPtr); + b1Val = _mm256_load_ps(bPtr + 8); - dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0); - dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1); + c0Val = _mm256_mul_ps(a0Val, b0Val); + c1Val = _mm256_mul_ps(a1Val, b1Val); - aPtr += 16; - bPtr += 16; - } + dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0); + dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1); - dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1); + aPtr += 16; + bPtr += 16; + } - __VOLK_ATTR_ALIGNED(32) float dotProductVector[8]; + dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1); - _mm256_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector + __VOLK_ATTR_ALIGNED(32) float dotProductVector[8]; - dotProduct = dotProductVector[0]; - dotProduct += dotProductVector[1]; - dotProduct += dotProductVector[2]; - dotProduct += dotProductVector[3]; - dotProduct += dotProductVector[4]; - dotProduct += dotProductVector[5]; - dotProduct += dotProductVector[6]; - dotProduct += dotProductVector[7]; + _mm256_store_ps(dotProductVector, + dotProdVal0); // Store the results back into the dot product vector - number = sixteenthPoints*16; - for(;number < num_points; number++){ - dotProduct += ((*aPtr++) * (*bPtr++)); - } + dotProduct = dotProductVector[0]; + 
dotProduct += dotProductVector[1]; + dotProduct += dotProductVector[2]; + dotProduct += dotProductVector[3]; + dotProduct += dotProductVector[4]; + dotProduct += dotProductVector[5]; + dotProduct += dotProductVector[6]; + dotProduct += dotProductVector[7]; - *result = dotProduct; + number = sixteenthPoints * 16; + for (; number < num_points; number++) { + dotProduct += ((*aPtr++) * (*bPtr++)); + } + *result = dotProduct; } #endif /*LV_HAVE_AVX*/ #if LV_HAVE_AVX2 && LV_HAVE_FMA #include -static inline void volk_32f_x2_dot_prod_32f_a_avx2_fma(float * result, const float * input, const float* taps, unsigned int num_points){ - unsigned int number; - const unsigned int eighthPoints = num_points / 8; - - const float* aPtr = input; - const float* bPtr = taps; +static inline void volk_32f_x2_dot_prod_32f_a_avx2_fma(float* result, + const float* input, + const float* taps, + unsigned int num_points) +{ + unsigned int number; + const unsigned int eighthPoints = num_points / 8; - __m256 dotProdVal = _mm256_setzero_ps(); - __m256 aVal1, bVal1; + const float* aPtr = input; + const float* bPtr = taps; - for (number = 0; number < eighthPoints; number++ ) { + __m256 dotProdVal = _mm256_setzero_ps(); + __m256 aVal1, bVal1; - aVal1 = _mm256_load_ps(aPtr); - bVal1 = _mm256_load_ps(bPtr); - aPtr += 8; - bPtr += 8; + for (number = 0; number < eighthPoints; number++) { - dotProdVal = _mm256_fmadd_ps(aVal1, bVal1, dotProdVal); - } + aVal1 = _mm256_load_ps(aPtr); + bVal1 = _mm256_load_ps(bPtr); + aPtr += 8; + bPtr += 8; - __VOLK_ATTR_ALIGNED(32) float dotProductVector[8]; - _mm256_store_ps(dotProductVector, dotProdVal); // Store the results back into the dot product vector - _mm256_zeroupper(); + dotProdVal = _mm256_fmadd_ps(aVal1, bVal1, dotProdVal); + } - float dotProduct = - dotProductVector[0] + dotProductVector[1] + - dotProductVector[2] + dotProductVector[3] + - dotProductVector[4] + dotProductVector[5] + - dotProductVector[6] + dotProductVector[7]; + __VOLK_ATTR_ALIGNED(32) float dotProductVector[8]; + _mm256_store_ps(dotProductVector, + dotProdVal); // Store the results back into the dot product vector + _mm256_zeroupper(); - for(number = eighthPoints * 8; number < num_points; number++){ - dotProduct += ((*aPtr++) * (*bPtr++)); - } + float dotProduct = dotProductVector[0] + dotProductVector[1] + dotProductVector[2] + + dotProductVector[3] + dotProductVector[4] + dotProductVector[5] + + dotProductVector[6] + dotProductVector[7]; - *result = dotProduct; + for (number = eighthPoints * 8; number < num_points; number++) { + dotProduct += ((*aPtr++) * (*bPtr++)); + } + *result = dotProduct; } #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA */ #if LV_HAVE_AVX512F #include -static inline void volk_32f_x2_dot_prod_32f_a_avx512f(float * result, const float * input, const float* taps, unsigned int num_points){ - unsigned int number; - const unsigned int sixteenthPoints = num_points / 16; - - const float* aPtr = input; - const float* bPtr = taps; +static inline void volk_32f_x2_dot_prod_32f_a_avx512f(float* result, + const float* input, + const float* taps, + unsigned int num_points) +{ + unsigned int number; + const unsigned int sixteenthPoints = num_points / 16; - __m512 dotProdVal = _mm512_setzero_ps(); - __m512 aVal1, bVal1; + const float* aPtr = input; + const float* bPtr = taps; - for (number = 0; number < sixteenthPoints; number++ ) { + __m512 dotProdVal = _mm512_setzero_ps(); + __m512 aVal1, bVal1; - aVal1 = _mm512_load_ps(aPtr); - bVal1 = _mm512_load_ps(bPtr); - aPtr += 16; - bPtr += 16; + for (number = 0; number < 
sixteenthPoints; number++) { - dotProdVal = _mm512_fmadd_ps(aVal1, bVal1, dotProdVal); - } + aVal1 = _mm512_load_ps(aPtr); + bVal1 = _mm512_load_ps(bPtr); + aPtr += 16; + bPtr += 16; - __VOLK_ATTR_ALIGNED(64) float dotProductVector[16]; - _mm512_store_ps(dotProductVector, dotProdVal); // Store the results back into the dot product vector + dotProdVal = _mm512_fmadd_ps(aVal1, bVal1, dotProdVal); + } - float dotProduct = - dotProductVector[0] + dotProductVector[1] + - dotProductVector[2] + dotProductVector[3] + - dotProductVector[4] + dotProductVector[5] + - dotProductVector[6] + dotProductVector[7] + - dotProductVector[8] + dotProductVector[9] + - dotProductVector[10] + dotProductVector[11] + - dotProductVector[12] + dotProductVector[13] + - dotProductVector[14] + dotProductVector[15]; + __VOLK_ATTR_ALIGNED(64) float dotProductVector[16]; + _mm512_store_ps(dotProductVector, + dotProdVal); // Store the results back into the dot product vector - for(number = sixteenthPoints * 16; number < num_points; number++){ - dotProduct += ((*aPtr++) * (*bPtr++)); - } + float dotProduct = dotProductVector[0] + dotProductVector[1] + dotProductVector[2] + + dotProductVector[3] + dotProductVector[4] + dotProductVector[5] + + dotProductVector[6] + dotProductVector[7] + dotProductVector[8] + + dotProductVector[9] + dotProductVector[10] + dotProductVector[11] + + dotProductVector[12] + dotProductVector[13] + + dotProductVector[14] + dotProductVector[15]; - *result = dotProduct; + for (number = sixteenthPoints * 16; number < num_points; number++) { + dotProduct += ((*aPtr++) * (*bPtr++)); + } + *result = dotProduct; } #endif /* LV_HAVE_AVX512F */ #ifdef LV_HAVE_NEON #include -static inline void volk_32f_x2_dot_prod_32f_neonopts(float * result, const float * input, const float * taps, unsigned int num_points) { +static inline void volk_32f_x2_dot_prod_32f_neonopts(float* result, + const float* input, + const float* taps, + unsigned int num_points) +{ unsigned int quarter_points = num_points / 16; float dotProduct = 0; const float* aPtr = input; - const float* bPtr= taps; + const float* bPtr = taps; unsigned int number = 0; float32x4x4_t a_val, b_val, accumulator0; @@ -838,7 +906,7 @@ static inline void volk_32f_x2_dot_prod_32f_neonopts(float * result, const float accumulator0.val[3] = vdupq_n_f32(0); // factor of 4 loop unroll with independent accumulators // uses 12 out of 16 neon q registers - for( number = 0; number < quarter_points; ++number) { + for (number = 0; number < quarter_points; ++number) { a_val = vld4q_f32(aPtr); b_val = vld4q_f32(bPtr); accumulator0.val[0] = vmlaq_f32(accumulator0.val[0], a_val.val[0], b_val.val[0]); @@ -855,8 +923,8 @@ static inline void volk_32f_x2_dot_prod_32f_neonopts(float * result, const float vst1q_f32(accumulator, accumulator0.val[0]); dotProduct = accumulator[0] + accumulator[1] + accumulator[2] + accumulator[3]; - for(number = quarter_points*16; number < num_points; number++){ - dotProduct += ((*aPtr++) * (*bPtr++)); + for (number = quarter_points * 16; number < num_points; number++) { + dotProduct += ((*aPtr++) * (*bPtr++)); } *result = dotProduct; @@ -865,26 +933,30 @@ static inline void volk_32f_x2_dot_prod_32f_neonopts(float * result, const float #endif - - #ifdef LV_HAVE_NEON -static inline void volk_32f_x2_dot_prod_32f_neon(float * result, const float * input, const float * taps, unsigned int num_points) { +static inline void volk_32f_x2_dot_prod_32f_neon(float* result, + const float* input, + const float* taps, + unsigned int num_points) +{ unsigned int 
quarter_points = num_points / 8; float dotProduct = 0; const float* aPtr = input; - const float* bPtr= taps; + const float* bPtr = taps; unsigned int number = 0; float32x4x2_t a_val, b_val, accumulator_val; accumulator_val.val[0] = vdupq_n_f32(0); accumulator_val.val[1] = vdupq_n_f32(0); // factor of 2 loop unroll with independent accumulators - for( number = 0; number < quarter_points; ++number) { + for (number = 0; number < quarter_points; ++number) { a_val = vld2q_f32(aPtr); b_val = vld2q_f32(bPtr); - accumulator_val.val[0] = vmlaq_f32(accumulator_val.val[0], a_val.val[0], b_val.val[0]); - accumulator_val.val[1] = vmlaq_f32(accumulator_val.val[1], a_val.val[1], b_val.val[1]); + accumulator_val.val[0] = + vmlaq_f32(accumulator_val.val[0], a_val.val[0], b_val.val[0]); + accumulator_val.val[1] = + vmlaq_f32(accumulator_val.val[1], a_val.val[1], b_val.val[1]); aPtr += 8; bPtr += 8; } @@ -893,8 +965,8 @@ static inline void volk_32f_x2_dot_prod_32f_neon(float * result, const float * i vst1q_f32(accumulator, accumulator_val.val[0]); dotProduct = accumulator[0] + accumulator[1] + accumulator[2] + accumulator[3]; - for(number = quarter_points*8; number < num_points; number++){ - dotProduct += ((*aPtr++) * (*bPtr++)); + for (number = quarter_points * 8; number < num_points; number++) { + dotProduct += ((*aPtr++) * (*bPtr++)); } *result = dotProduct; @@ -903,11 +975,17 @@ static inline void volk_32f_x2_dot_prod_32f_neon(float * result, const float * i #endif /* LV_HAVE_NEON */ #ifdef LV_HAVE_NEONV7 -extern void volk_32f_x2_dot_prod_32f_a_neonasm(float* cVector, const float* aVector, const float* bVector, unsigned int num_points); +extern void volk_32f_x2_dot_prod_32f_a_neonasm(float* cVector, + const float* aVector, + const float* bVector, + unsigned int num_points); #endif /* LV_HAVE_NEONV7 */ #ifdef LV_HAVE_NEONV7 -extern void volk_32f_x2_dot_prod_32f_a_neonasm_opts(float* cVector, const float* aVector, const float* bVector, unsigned int num_points); +extern void volk_32f_x2_dot_prod_32f_a_neonasm_opts(float* cVector, + const float* aVector, + const float* bVector, + unsigned int num_points); #endif /* LV_HAVE_NEONV7 */ #endif /*INCLUDED_volk_32f_x2_dot_prod_32f_a_H*/ diff --git a/kernels/volk/volk_32f_x2_fm_detectpuppet_32f.h b/kernels/volk/volk_32f_x2_fm_detectpuppet_32f.h index e1da185..3a3caca 100644 --- a/kernels/volk/volk_32f_x2_fm_detectpuppet_32f.h +++ b/kernels/volk/volk_32f_x2_fm_detectpuppet_32f.h @@ -28,32 +28,44 @@ #ifdef LV_HAVE_AVX #include -static inline void volk_32f_x2_fm_detectpuppet_32f_a_avx(float* outputVector, const float* inputVector, float* saveValue, unsigned int num_points) +static inline void volk_32f_x2_fm_detectpuppet_32f_a_avx(float* outputVector, + const float* inputVector, + float* saveValue, + unsigned int num_points) { - const float bound = 1.0f; + const float bound = 1.0f; - volk_32f_s32f_32f_fm_detect_32f_a_avx(outputVector, inputVector, bound, saveValue, num_points); + volk_32f_s32f_32f_fm_detect_32f_a_avx( + outputVector, inputVector, bound, saveValue, num_points); } #endif /* LV_HAVE_AVX */ #ifdef LV_HAVE_SSE #include -static inline void volk_32f_x2_fm_detectpuppet_32f_a_sse(float* outputVector, const float* inputVector, float* saveValue, unsigned int num_points) +static inline void volk_32f_x2_fm_detectpuppet_32f_a_sse(float* outputVector, + const float* inputVector, + float* saveValue, + unsigned int num_points) { - const float bound = 1.0f; + const float bound = 1.0f; - volk_32f_s32f_32f_fm_detect_32f_a_sse(outputVector, inputVector, bound, saveValue, 
num_points); + volk_32f_s32f_32f_fm_detect_32f_a_sse( + outputVector, inputVector, bound, saveValue, num_points); } #endif /* LV_HAVE_SSE */ #ifdef LV_HAVE_GENERIC -static inline void volk_32f_x2_fm_detectpuppet_32f_generic(float* outputVector, const float* inputVector, float* saveValue, unsigned int num_points) +static inline void volk_32f_x2_fm_detectpuppet_32f_generic(float* outputVector, + const float* inputVector, + float* saveValue, + unsigned int num_points) { - const float bound = 1.0f; + const float bound = 1.0f; - volk_32f_s32f_32f_fm_detect_32f_generic(outputVector, inputVector, bound, saveValue, num_points); + volk_32f_s32f_32f_fm_detect_32f_generic( + outputVector, inputVector, bound, saveValue, num_points); } #endif /* LV_HAVE_GENERIC */ @@ -69,11 +81,15 @@ static inline void volk_32f_x2_fm_detectpuppet_32f_generic(float* outputVector, #ifdef LV_HAVE_AVX #include -static inline void volk_32f_x2_fm_detectpuppet_32f_u_avx(float* outputVector, const float* inputVector, float* saveValue, unsigned int num_points) +static inline void volk_32f_x2_fm_detectpuppet_32f_u_avx(float* outputVector, + const float* inputVector, + float* saveValue, + unsigned int num_points) { - const float bound = 1.0f; + const float bound = 1.0f; - volk_32f_s32f_32f_fm_detect_32f_u_avx(outputVector, inputVector, bound, saveValue, num_points); + volk_32f_s32f_32f_fm_detect_32f_u_avx( + outputVector, inputVector, bound, saveValue, num_points); } #endif /* LV_HAVE_AVX */ #endif /* INCLUDED_volk_32f_x2_fm_detectpuppet_32f_u_H */ diff --git a/kernels/volk/volk_32f_x2_interleave_32fc.h b/kernels/volk/volk_32f_x2_interleave_32fc.h index ef8ada2..d0cc6dd 100644 --- a/kernels/volk/volk_32f_x2_interleave_32fc.h +++ b/kernels/volk/volk_32f_x2_interleave_32fc.h @@ -33,8 +33,8 @@ * * Dispatcher Prototype * \code - * void volk_32f_x2_interleave_32fc(lv_32fc_t* complexVector, const float* iBuffer, const float* qBuffer, unsigned int num_points) - * \endcode + * void volk_32f_x2_interleave_32fc(lv_32fc_t* complexVector, const float* iBuffer, const + * float* qBuffer, unsigned int num_points) \endcode * * \b Inputs * \li iBuffer: Input vector of samples for the real part. 
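For reference, the hunks that follow reformat volk_32f_x2_interleave_32fc, which packs separate I and Q float buffers into one interleaved complex vector. A minimal caller for this dispatcher might look like the sketch below; it assumes only the public <volk/volk.h> entry point and the volk_malloc()/volk_get_alignment()/volk_free() helpers (the example itself is not part of the patch):

#include <volk/volk.h>

/* Sketch: interleave separate I and Q buffers into a complex vector. */
static void interleave_example(const float* i_buf, const float* q_buf, unsigned int n)
{
    /* volk_malloc returns memory aligned for the widest enabled SIMD ISA,
     * so the aligned (_a_) kernel implementations may be selected. */
    lv_32fc_t* out =
        (lv_32fc_t*)volk_malloc(n * sizeof(lv_32fc_t), volk_get_alignment());

    /* The dispatcher picks the best available implementation at run time. */
    volk_32f_x2_interleave_32fc(out, i_buf, q_buf, n);

    volk_free(out);
}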
@@ -79,44 +79,45 @@ #ifdef LV_HAVE_AVX #include -static inline void -volk_32f_x2_interleave_32fc_a_avx(lv_32fc_t* complexVector, const float* iBuffer, - const float* qBuffer, unsigned int num_points) +static inline void volk_32f_x2_interleave_32fc_a_avx(lv_32fc_t* complexVector, + const float* iBuffer, + const float* qBuffer, + unsigned int num_points) { - unsigned int number = 0; - float* complexVectorPtr = (float*)complexVector; - const float* iBufferPtr = iBuffer; - const float* qBufferPtr = qBuffer; - - const uint64_t eighthPoints = num_points / 8; - - __m256 iValue, qValue, cplxValue1, cplxValue2, cplxValue; - for(;number < eighthPoints; number++){ - iValue = _mm256_load_ps(iBufferPtr); - qValue = _mm256_load_ps(qBufferPtr); - - // Interleaves the lower two values in the i and q variables into one buffer - cplxValue1 = _mm256_unpacklo_ps(iValue, qValue); - // Interleaves the upper two values in the i and q variables into one buffer - cplxValue2 = _mm256_unpackhi_ps(iValue, qValue); - - cplxValue = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x20); - _mm256_store_ps(complexVectorPtr, cplxValue); - complexVectorPtr += 8; - - cplxValue = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x31); - _mm256_store_ps(complexVectorPtr, cplxValue); - complexVectorPtr += 8; - - iBufferPtr += 8; - qBufferPtr += 8; - } - - number = eighthPoints * 8; - for(; number < num_points; number++){ - *complexVectorPtr++ = *iBufferPtr++; - *complexVectorPtr++ = *qBufferPtr++; - } + unsigned int number = 0; + float* complexVectorPtr = (float*)complexVector; + const float* iBufferPtr = iBuffer; + const float* qBufferPtr = qBuffer; + + const uint64_t eighthPoints = num_points / 8; + + __m256 iValue, qValue, cplxValue1, cplxValue2, cplxValue; + for (; number < eighthPoints; number++) { + iValue = _mm256_load_ps(iBufferPtr); + qValue = _mm256_load_ps(qBufferPtr); + + // Interleaves the lower two values in the i and q variables into one buffer + cplxValue1 = _mm256_unpacklo_ps(iValue, qValue); + // Interleaves the upper two values in the i and q variables into one buffer + cplxValue2 = _mm256_unpackhi_ps(iValue, qValue); + + cplxValue = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x20); + _mm256_store_ps(complexVectorPtr, cplxValue); + complexVectorPtr += 8; + + cplxValue = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x31); + _mm256_store_ps(complexVectorPtr, cplxValue); + complexVectorPtr += 8; + + iBufferPtr += 8; + qBufferPtr += 8; + } + + number = eighthPoints * 8; + for (; number < num_points; number++) { + *complexVectorPtr++ = *iBufferPtr++; + *complexVectorPtr++ = *qBufferPtr++; + } } #endif /* LV_HAV_AVX */ @@ -124,41 +125,42 @@ volk_32f_x2_interleave_32fc_a_avx(lv_32fc_t* complexVector, const float* iBuffer #ifdef LV_HAVE_SSE #include -static inline void -volk_32f_x2_interleave_32fc_a_sse(lv_32fc_t* complexVector, const float* iBuffer, - const float* qBuffer, unsigned int num_points) +static inline void volk_32f_x2_interleave_32fc_a_sse(lv_32fc_t* complexVector, + const float* iBuffer, + const float* qBuffer, + unsigned int num_points) { - unsigned int number = 0; - float* complexVectorPtr = (float*)complexVector; - const float* iBufferPtr = iBuffer; - const float* qBufferPtr = qBuffer; - - const uint64_t quarterPoints = num_points / 4; - - __m128 iValue, qValue, cplxValue; - for(;number < quarterPoints; number++){ - iValue = _mm_load_ps(iBufferPtr); - qValue = _mm_load_ps(qBufferPtr); - - // Interleaves the lower two values in the i and q variables into one buffer - cplxValue = 
_mm_unpacklo_ps(iValue, qValue); - _mm_store_ps(complexVectorPtr, cplxValue); - complexVectorPtr += 4; - - // Interleaves the upper two values in the i and q variables into one buffer - cplxValue = _mm_unpackhi_ps(iValue, qValue); - _mm_store_ps(complexVectorPtr, cplxValue); - complexVectorPtr += 4; - - iBufferPtr += 4; - qBufferPtr += 4; - } - - number = quarterPoints * 4; - for(; number < num_points; number++){ - *complexVectorPtr++ = *iBufferPtr++; - *complexVectorPtr++ = *qBufferPtr++; - } + unsigned int number = 0; + float* complexVectorPtr = (float*)complexVector; + const float* iBufferPtr = iBuffer; + const float* qBufferPtr = qBuffer; + + const uint64_t quarterPoints = num_points / 4; + + __m128 iValue, qValue, cplxValue; + for (; number < quarterPoints; number++) { + iValue = _mm_load_ps(iBufferPtr); + qValue = _mm_load_ps(qBufferPtr); + + // Interleaves the lower two values in the i and q variables into one buffer + cplxValue = _mm_unpacklo_ps(iValue, qValue); + _mm_store_ps(complexVectorPtr, cplxValue); + complexVectorPtr += 4; + + // Interleaves the upper two values in the i and q variables into one buffer + cplxValue = _mm_unpackhi_ps(iValue, qValue); + _mm_store_ps(complexVectorPtr, cplxValue); + complexVectorPtr += 4; + + iBufferPtr += 4; + qBufferPtr += 4; + } + + number = quarterPoints * 4; + for (; number < num_points; number++) { + *complexVectorPtr++ = *iBufferPtr++; + *complexVectorPtr++ = *qBufferPtr++; + } } #endif /* LV_HAVE_SSE */ @@ -166,52 +168,53 @@ volk_32f_x2_interleave_32fc_a_sse(lv_32fc_t* complexVector, const float* iBuffer #ifdef LV_HAVE_NEON #include -static inline void -volk_32f_x2_interleave_32fc_neon(lv_32fc_t* complexVector, const float* iBuffer, - const float* qBuffer, unsigned int num_points) +static inline void volk_32f_x2_interleave_32fc_neon(lv_32fc_t* complexVector, + const float* iBuffer, + const float* qBuffer, + unsigned int num_points) { - unsigned int quarter_points = num_points / 4; - unsigned int number; - float* complexVectorPtr = (float*) complexVector; - - float32x4x2_t complex_vec; - for(number=0; number < quarter_points; ++number) { - complex_vec.val[0] = vld1q_f32(iBuffer); - complex_vec.val[1] = vld1q_f32(qBuffer); - vst2q_f32(complexVectorPtr, complex_vec); - iBuffer += 4; - qBuffer += 4; - complexVectorPtr += 8; - } - - for(number=quarter_points * 4; number < num_points; ++number) { - *complexVectorPtr++ = *iBuffer++; - *complexVectorPtr++ = *qBuffer++; - } + unsigned int quarter_points = num_points / 4; + unsigned int number; + float* complexVectorPtr = (float*)complexVector; + + float32x4x2_t complex_vec; + for (number = 0; number < quarter_points; ++number) { + complex_vec.val[0] = vld1q_f32(iBuffer); + complex_vec.val[1] = vld1q_f32(qBuffer); + vst2q_f32(complexVectorPtr, complex_vec); + iBuffer += 4; + qBuffer += 4; + complexVectorPtr += 8; + } + + for (number = quarter_points * 4; number < num_points; ++number) { + *complexVectorPtr++ = *iBuffer++; + *complexVectorPtr++ = *qBuffer++; + } } #endif /* LV_HAVE_NEON */ #ifdef LV_HAVE_GENERIC -static inline void -volk_32f_x2_interleave_32fc_generic(lv_32fc_t* complexVector, const float* iBuffer, - const float* qBuffer, unsigned int num_points) +static inline void volk_32f_x2_interleave_32fc_generic(lv_32fc_t* complexVector, + const float* iBuffer, + const float* qBuffer, + unsigned int num_points) { - float* complexVectorPtr = (float*)complexVector; - const float* iBufferPtr = iBuffer; - const float* qBufferPtr = qBuffer; - unsigned int number; - - for(number = 0; number < 
num_points; number++){ - *complexVectorPtr++ = *iBufferPtr++; - *complexVectorPtr++ = *qBufferPtr++; - } + float* complexVectorPtr = (float*)complexVector; + const float* iBufferPtr = iBuffer; + const float* qBufferPtr = qBuffer; + unsigned int number; + + for (number = 0; number < num_points; number++) { + *complexVectorPtr++ = *iBufferPtr++; + *complexVectorPtr++ = *qBufferPtr++; + } } #endif /* LV_HAVE_GENERIC */ - #endif /* INCLUDED_volk_32f_x2_interleave_32fc_a_H */ #ifndef INCLUDED_volk_32f_x2_interleave_32fc_u_H @@ -223,44 +226,45 @@ volk_32f_x2_interleave_32fc_generic(lv_32fc_t* complexVector, const float* iBuff #ifdef LV_HAVE_AVX #include -static inline void -volk_32f_x2_interleave_32fc_u_avx(lv_32fc_t* complexVector, const float* iBuffer, - const float* qBuffer, unsigned int num_points) +static inline void volk_32f_x2_interleave_32fc_u_avx(lv_32fc_t* complexVector, + const float* iBuffer, + const float* qBuffer, + unsigned int num_points) { - unsigned int number = 0; - float* complexVectorPtr = (float*)complexVector; - const float* iBufferPtr = iBuffer; - const float* qBufferPtr = qBuffer; - - const uint64_t eighthPoints = num_points / 8; - - __m256 iValue, qValue, cplxValue1, cplxValue2, cplxValue; - for(;number < eighthPoints; number++){ - iValue = _mm256_loadu_ps(iBufferPtr); - qValue = _mm256_loadu_ps(qBufferPtr); - - // Interleaves the lower two values in the i and q variables into one buffer - cplxValue1 = _mm256_unpacklo_ps(iValue, qValue); - // Interleaves the upper two values in the i and q variables into one buffer - cplxValue2 = _mm256_unpackhi_ps(iValue, qValue); - - cplxValue = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x20); - _mm256_storeu_ps(complexVectorPtr, cplxValue); - complexVectorPtr += 8; - - cplxValue = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x31); - _mm256_storeu_ps(complexVectorPtr, cplxValue); - complexVectorPtr += 8; - - iBufferPtr += 8; - qBufferPtr += 8; - } - - number = eighthPoints * 8; - for(; number < num_points; number++){ - *complexVectorPtr++ = *iBufferPtr++; - *complexVectorPtr++ = *qBufferPtr++; - } + unsigned int number = 0; + float* complexVectorPtr = (float*)complexVector; + const float* iBufferPtr = iBuffer; + const float* qBufferPtr = qBuffer; + + const uint64_t eighthPoints = num_points / 8; + + __m256 iValue, qValue, cplxValue1, cplxValue2, cplxValue; + for (; number < eighthPoints; number++) { + iValue = _mm256_loadu_ps(iBufferPtr); + qValue = _mm256_loadu_ps(qBufferPtr); + + // Interleaves the lower two values in the i and q variables into one buffer + cplxValue1 = _mm256_unpacklo_ps(iValue, qValue); + // Interleaves the upper two values in the i and q variables into one buffer + cplxValue2 = _mm256_unpackhi_ps(iValue, qValue); + + cplxValue = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x20); + _mm256_storeu_ps(complexVectorPtr, cplxValue); + complexVectorPtr += 8; + + cplxValue = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x31); + _mm256_storeu_ps(complexVectorPtr, cplxValue); + complexVectorPtr += 8; + + iBufferPtr += 8; + qBufferPtr += 8; + } + + number = eighthPoints * 8; + for (; number < num_points; number++) { + *complexVectorPtr++ = *iBufferPtr++; + *complexVectorPtr++ = *qBufferPtr++; + } } #endif /* LV_HAVE_AVX */ diff --git a/kernels/volk/volk_32f_x2_max_32f.h b/kernels/volk/volk_32f_x2_max_32f.h index 82086a6..c7eb67f 100644 --- a/kernels/volk/volk_32f_x2_max_32f.h +++ b/kernels/volk/volk_32f_x2_max_32f.h @@ -32,8 +32,8 @@ * * Dispatcher Prototype * \code - * void volk_32f_x2_max_32f(float* cVector, 
const float* aVector, const float* bVector, unsigned int num_points) - * \endcode + * void volk_32f_x2_max_32f(float* cVector, const float* aVector, const float* bVector, + * unsigned int num_points) \endcode * * \b Inputs * \li aVector: First input vector. @@ -77,176 +77,183 @@ #ifdef LV_HAVE_AVX512F #include -static inline void -volk_32f_x2_max_32f_a_avx512f(float* cVector, const float* aVector, - const float* bVector, unsigned int num_points) +static inline void volk_32f_x2_max_32f_a_avx512f(float* cVector, + const float* aVector, + const float* bVector, + unsigned int num_points) { - unsigned int number = 0; - const unsigned int sixteenthPoints = num_points / 16; + unsigned int number = 0; + const unsigned int sixteenthPoints = num_points / 16; - float* cPtr = cVector; - const float* aPtr = aVector; - const float* bPtr= bVector; + float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr = bVector; - __m512 aVal, bVal, cVal; - for(;number < sixteenthPoints; number++){ - aVal = _mm512_load_ps(aPtr); - bVal = _mm512_load_ps(bPtr); + __m512 aVal, bVal, cVal; + for (; number < sixteenthPoints; number++) { + aVal = _mm512_load_ps(aPtr); + bVal = _mm512_load_ps(bPtr); - cVal = _mm512_max_ps(aVal, bVal); + cVal = _mm512_max_ps(aVal, bVal); - _mm512_store_ps(cPtr,cVal); // Store the results back into the C container + _mm512_store_ps(cPtr, cVal); // Store the results back into the C container - aPtr += 16; - bPtr += 16; - cPtr += 16; - } + aPtr += 16; + bPtr += 16; + cPtr += 16; + } - number = sixteenthPoints * 16; - for(;number < num_points; number++){ - const float a = *aPtr++; - const float b = *bPtr++; - *cPtr++ = ( a > b ? a : b); - } + number = sixteenthPoints * 16; + for (; number < num_points; number++) { + const float a = *aPtr++; + const float b = *bPtr++; + *cPtr++ = (a > b ? a : b); + } } #endif /* LV_HAVE_AVX512F */ #ifdef LV_HAVE_SSE #include -static inline void -volk_32f_x2_max_32f_a_sse(float* cVector, const float* aVector, - const float* bVector, unsigned int num_points) +static inline void volk_32f_x2_max_32f_a_sse(float* cVector, + const float* aVector, + const float* bVector, + unsigned int num_points) { - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; - float* cPtr = cVector; - const float* aPtr = aVector; - const float* bPtr= bVector; + float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr = bVector; - __m128 aVal, bVal, cVal; - for(;number < quarterPoints; number++){ - aVal = _mm_load_ps(aPtr); - bVal = _mm_load_ps(bPtr); + __m128 aVal, bVal, cVal; + for (; number < quarterPoints; number++) { + aVal = _mm_load_ps(aPtr); + bVal = _mm_load_ps(bPtr); - cVal = _mm_max_ps(aVal, bVal); + cVal = _mm_max_ps(aVal, bVal); - _mm_store_ps(cPtr,cVal); // Store the results back into the C container + _mm_store_ps(cPtr, cVal); // Store the results back into the C container - aPtr += 4; - bPtr += 4; - cPtr += 4; - } + aPtr += 4; + bPtr += 4; + cPtr += 4; + } - number = quarterPoints * 4; - for(;number < num_points; number++){ - const float a = *aPtr++; - const float b = *bPtr++; - *cPtr++ = ( a > b ? a : b); - } + number = quarterPoints * 4; + for (; number < num_points; number++) { + const float a = *aPtr++; + const float b = *bPtr++; + *cPtr++ = (a > b ? 
a : b); + } } #endif /* LV_HAVE_SSE */ #ifdef LV_HAVE_AVX #include -static inline void -volk_32f_x2_max_32f_a_avx(float* cVector, const float* aVector, - const float* bVector, unsigned int num_points) +static inline void volk_32f_x2_max_32f_a_avx(float* cVector, + const float* aVector, + const float* bVector, + unsigned int num_points) { - unsigned int number = 0; - const unsigned int eighthPoints = num_points / 8; + unsigned int number = 0; + const unsigned int eighthPoints = num_points / 8; - float* cPtr = cVector; - const float* aPtr = aVector; - const float* bPtr= bVector; + float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr = bVector; - __m256 aVal, bVal, cVal; - for(;number < eighthPoints; number++){ - aVal = _mm256_load_ps(aPtr); - bVal = _mm256_load_ps(bPtr); + __m256 aVal, bVal, cVal; + for (; number < eighthPoints; number++) { + aVal = _mm256_load_ps(aPtr); + bVal = _mm256_load_ps(bPtr); - cVal = _mm256_max_ps(aVal, bVal); + cVal = _mm256_max_ps(aVal, bVal); - _mm256_store_ps(cPtr,cVal); // Store the results back into the C container + _mm256_store_ps(cPtr, cVal); // Store the results back into the C container - aPtr += 8; - bPtr += 8; - cPtr += 8; - } + aPtr += 8; + bPtr += 8; + cPtr += 8; + } - number = eighthPoints * 8; - for(;number < num_points; number++){ - const float a = *aPtr++; - const float b = *bPtr++; - *cPtr++ = ( a > b ? a : b); - } + number = eighthPoints * 8; + for (; number < num_points; number++) { + const float a = *aPtr++; + const float b = *bPtr++; + *cPtr++ = (a > b ? a : b); + } } #endif /* LV_HAVE_AVX */ #ifdef LV_HAVE_NEON #include -static inline void -volk_32f_x2_max_32f_neon(float* cVector, const float* aVector, - const float* bVector, unsigned int num_points) +static inline void volk_32f_x2_max_32f_neon(float* cVector, + const float* aVector, + const float* bVector, + unsigned int num_points) { - unsigned int quarter_points = num_points / 4; - float* cPtr = cVector; - const float* aPtr = aVector; - const float* bPtr= bVector; - unsigned int number = 0; - - float32x4_t a_vec, b_vec, c_vec; - for(number = 0; number < quarter_points; number++){ - a_vec = vld1q_f32(aPtr); - b_vec = vld1q_f32(bPtr); - c_vec = vmaxq_f32(a_vec, b_vec); - vst1q_f32(cPtr, c_vec); - aPtr += 4; - bPtr += 4; - cPtr += 4; - } - - for(number = quarter_points*4; number < num_points; number++){ - const float a = *aPtr++; - const float b = *bPtr++; - *cPtr++ = ( a > b ? a : b); - } + unsigned int quarter_points = num_points / 4; + float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr = bVector; + unsigned int number = 0; + + float32x4_t a_vec, b_vec, c_vec; + for (number = 0; number < quarter_points; number++) { + a_vec = vld1q_f32(aPtr); + b_vec = vld1q_f32(bPtr); + c_vec = vmaxq_f32(a_vec, b_vec); + vst1q_f32(cPtr, c_vec); + aPtr += 4; + bPtr += 4; + cPtr += 4; + } + + for (number = quarter_points * 4; number < num_points; number++) { + const float a = *aPtr++; + const float b = *bPtr++; + *cPtr++ = (a > b ? 
a : b); + } } #endif /* LV_HAVE_NEON */ #ifdef LV_HAVE_GENERIC -static inline void -volk_32f_x2_max_32f_generic(float* cVector, const float* aVector, - const float* bVector, unsigned int num_points) +static inline void volk_32f_x2_max_32f_generic(float* cVector, + const float* aVector, + const float* bVector, + unsigned int num_points) { - float* cPtr = cVector; - const float* aPtr = aVector; - const float* bPtr= bVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - const float a = *aPtr++; - const float b = *bPtr++; - *cPtr++ = ( a > b ? a : b); - } + float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr = bVector; + unsigned int number = 0; + + for (number = 0; number < num_points; number++) { + const float a = *aPtr++; + const float b = *bPtr++; + *cPtr++ = (a > b ? a : b); + } } #endif /* LV_HAVE_GENERIC */ #ifdef LV_HAVE_ORC -extern void -volk_32f_x2_max_32f_a_orc_impl(float* cVector, const float* aVector, - const float* bVector, unsigned int num_points); - -static inline void -volk_32f_x2_max_32f_u_orc(float* cVector, const float* aVector, - const float* bVector, unsigned int num_points) +extern void volk_32f_x2_max_32f_a_orc_impl(float* cVector, + const float* aVector, + const float* bVector, + unsigned int num_points); + +static inline void volk_32f_x2_max_32f_u_orc(float* cVector, + const float* aVector, + const float* bVector, + unsigned int num_points) { - volk_32f_x2_max_32f_a_orc_impl(cVector, aVector, bVector, num_points); + volk_32f_x2_max_32f_a_orc_impl(cVector, aVector, bVector, num_points); } #endif /* LV_HAVE_ORC */ @@ -263,74 +270,76 @@ volk_32f_x2_max_32f_u_orc(float* cVector, const float* aVector, #ifdef LV_HAVE_AVX512F #include -static inline void -volk_32f_x2_max_32f_u_avx512f(float* cVector, const float* aVector, - const float* bVector, unsigned int num_points) +static inline void volk_32f_x2_max_32f_u_avx512f(float* cVector, + const float* aVector, + const float* bVector, + unsigned int num_points) { - unsigned int number = 0; - const unsigned int sixteenthPoints = num_points / 16; + unsigned int number = 0; + const unsigned int sixteenthPoints = num_points / 16; - float* cPtr = cVector; - const float* aPtr = aVector; - const float* bPtr= bVector; + float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr = bVector; - __m512 aVal, bVal, cVal; - for(;number < sixteenthPoints; number++){ - aVal = _mm512_loadu_ps(aPtr); - bVal = _mm512_loadu_ps(bPtr); + __m512 aVal, bVal, cVal; + for (; number < sixteenthPoints; number++) { + aVal = _mm512_loadu_ps(aPtr); + bVal = _mm512_loadu_ps(bPtr); - cVal = _mm512_max_ps(aVal, bVal); + cVal = _mm512_max_ps(aVal, bVal); - _mm512_storeu_ps(cPtr,cVal); // Store the results back into the C container + _mm512_storeu_ps(cPtr, cVal); // Store the results back into the C container - aPtr += 16; - bPtr += 16; - cPtr += 16; - } + aPtr += 16; + bPtr += 16; + cPtr += 16; + } - number = sixteenthPoints * 16; - for(;number < num_points; number++){ - const float a = *aPtr++; - const float b = *bPtr++; - *cPtr++ = ( a > b ? a : b); - } + number = sixteenthPoints * 16; + for (; number < num_points; number++) { + const float a = *aPtr++; + const float b = *bPtr++; + *cPtr++ = (a > b ? 
a : b); + } } #endif /* LV_HAVE_AVX512F */ #ifdef LV_HAVE_AVX #include -static inline void -volk_32f_x2_max_32f_u_avx(float* cVector, const float* aVector, - const float* bVector, unsigned int num_points) +static inline void volk_32f_x2_max_32f_u_avx(float* cVector, + const float* aVector, + const float* bVector, + unsigned int num_points) { - unsigned int number = 0; - const unsigned int eighthPoints = num_points / 8; + unsigned int number = 0; + const unsigned int eighthPoints = num_points / 8; - float* cPtr = cVector; - const float* aPtr = aVector; - const float* bPtr= bVector; + float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr = bVector; - __m256 aVal, bVal, cVal; - for(;number < eighthPoints; number++){ - aVal = _mm256_loadu_ps(aPtr); - bVal = _mm256_loadu_ps(bPtr); + __m256 aVal, bVal, cVal; + for (; number < eighthPoints; number++) { + aVal = _mm256_loadu_ps(aPtr); + bVal = _mm256_loadu_ps(bPtr); - cVal = _mm256_max_ps(aVal, bVal); + cVal = _mm256_max_ps(aVal, bVal); - _mm256_storeu_ps(cPtr,cVal); // Store the results back into the C container + _mm256_storeu_ps(cPtr, cVal); // Store the results back into the C container - aPtr += 8; - bPtr += 8; - cPtr += 8; - } + aPtr += 8; + bPtr += 8; + cPtr += 8; + } - number = eighthPoints * 8; - for(;number < num_points; number++){ - const float a = *aPtr++; - const float b = *bPtr++; - *cPtr++ = ( a > b ? a : b); - } + number = eighthPoints * 8; + for (; number < num_points; number++) { + const float a = *aPtr++; + const float b = *bPtr++; + *cPtr++ = (a > b ? a : b); + } } #endif /* LV_HAVE_AVX */ diff --git a/kernels/volk/volk_32f_x2_min_32f.h b/kernels/volk/volk_32f_x2_min_32f.h index 454eb76..aecd11a 100644 --- a/kernels/volk/volk_32f_x2_min_32f.h +++ b/kernels/volk/volk_32f_x2_min_32f.h @@ -32,8 +32,8 @@ * * Dispatcher Prototype * \code - * void volk_32f_x2_min_32f(float* cVector, const float* aVector, const float* bVector, unsigned int num_points) - * \endcode + * void volk_32f_x2_min_32f(float* cVector, const float* aVector, const float* bVector, + * unsigned int num_points) \endcode * * \b Inputs * \li aVector: First input vector. @@ -77,37 +77,38 @@ #ifdef LV_HAVE_SSE #include -static inline void -volk_32f_x2_min_32f_a_sse(float* cVector, const float* aVector, - const float* bVector, unsigned int num_points) +static inline void volk_32f_x2_min_32f_a_sse(float* cVector, + const float* aVector, + const float* bVector, + unsigned int num_points) { - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; - float* cPtr = cVector; - const float* aPtr = aVector; - const float* bPtr= bVector; + float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr = bVector; - __m128 aVal, bVal, cVal; - for(;number < quarterPoints; number++){ - aVal = _mm_load_ps(aPtr); - bVal = _mm_load_ps(bPtr); + __m128 aVal, bVal, cVal; + for (; number < quarterPoints; number++) { + aVal = _mm_load_ps(aPtr); + bVal = _mm_load_ps(bPtr); - cVal = _mm_min_ps(aVal, bVal); + cVal = _mm_min_ps(aVal, bVal); - _mm_store_ps(cPtr,cVal); // Store the results back into the C container + _mm_store_ps(cPtr, cVal); // Store the results back into the C container - aPtr += 4; - bPtr += 4; - cPtr += 4; - } + aPtr += 4; + bPtr += 4; + cPtr += 4; + } - number = quarterPoints * 4; - for(;number < num_points; number++){ - const float a = *aPtr++; - const float b = *bPtr++; - *cPtr++ = ( a < b ? 
a : b); - } + number = quarterPoints * 4; + for (; number < num_points; number++) { + const float a = *aPtr++; + const float b = *bPtr++; + *cPtr++ = (a < b ? a : b); + } } #endif /* LV_HAVE_SSE */ @@ -115,143 +116,149 @@ volk_32f_x2_min_32f_a_sse(float* cVector, const float* aVector, #ifdef LV_HAVE_NEON #include -static inline void -volk_32f_x2_min_32f_neon(float* cVector, const float* aVector, - const float* bVector, unsigned int num_points) +static inline void volk_32f_x2_min_32f_neon(float* cVector, + const float* aVector, + const float* bVector, + unsigned int num_points) { - float* cPtr = cVector; - const float* aPtr = aVector; - const float* bPtr= bVector; - unsigned int number = 0; - unsigned int quarter_points = num_points / 4; - - float32x4_t a_vec, b_vec, c_vec; - for(number = 0; number < quarter_points; number++){ - a_vec = vld1q_f32(aPtr); - b_vec = vld1q_f32(bPtr); - - c_vec = vminq_f32(a_vec, b_vec); - - vst1q_f32(cPtr, c_vec); - aPtr += 4; - bPtr += 4; - cPtr += 4; - } - - for(number = quarter_points*4; number < num_points; number++){ - const float a = *aPtr++; - const float b = *bPtr++; - *cPtr++ = ( a < b ? a : b); - } + float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr = bVector; + unsigned int number = 0; + unsigned int quarter_points = num_points / 4; + + float32x4_t a_vec, b_vec, c_vec; + for (number = 0; number < quarter_points; number++) { + a_vec = vld1q_f32(aPtr); + b_vec = vld1q_f32(bPtr); + + c_vec = vminq_f32(a_vec, b_vec); + + vst1q_f32(cPtr, c_vec); + aPtr += 4; + bPtr += 4; + cPtr += 4; + } + + for (number = quarter_points * 4; number < num_points; number++) { + const float a = *aPtr++; + const float b = *bPtr++; + *cPtr++ = (a < b ? a : b); + } } #endif /* LV_HAVE_NEON */ #ifdef LV_HAVE_GENERIC -static inline void -volk_32f_x2_min_32f_generic(float* cVector, const float* aVector, - const float* bVector, unsigned int num_points) +static inline void volk_32f_x2_min_32f_generic(float* cVector, + const float* aVector, + const float* bVector, + unsigned int num_points) { - float* cPtr = cVector; - const float* aPtr = aVector; - const float* bPtr= bVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - const float a = *aPtr++; - const float b = *bPtr++; - *cPtr++ = ( a < b ? a : b); - } + float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr = bVector; + unsigned int number = 0; + + for (number = 0; number < num_points; number++) { + const float a = *aPtr++; + const float b = *bPtr++; + *cPtr++ = (a < b ? 
a : b); + } } #endif /* LV_HAVE_GENERIC */ #ifdef LV_HAVE_ORC -extern void -volk_32f_x2_min_32f_a_orc_impl(float* cVector, const float* aVector, - const float* bVector, unsigned int num_points); +extern void volk_32f_x2_min_32f_a_orc_impl(float* cVector, + const float* aVector, + const float* bVector, + unsigned int num_points); -static inline void -volk_32f_x2_min_32f_u_orc(float* cVector, const float* aVector, - const float* bVector, unsigned int num_points) +static inline void volk_32f_x2_min_32f_u_orc(float* cVector, + const float* aVector, + const float* bVector, + unsigned int num_points) { - volk_32f_x2_min_32f_a_orc_impl(cVector, aVector, bVector, num_points); + volk_32f_x2_min_32f_a_orc_impl(cVector, aVector, bVector, num_points); } #endif /* LV_HAVE_ORC */ #ifdef LV_HAVE_AVX #include -static inline void -volk_32f_x2_min_32f_a_avx(float* cVector, const float* aVector, - const float* bVector, unsigned int num_points) +static inline void volk_32f_x2_min_32f_a_avx(float* cVector, + const float* aVector, + const float* bVector, + unsigned int num_points) { - unsigned int number = 0; - const unsigned int eighthPoints = num_points / 8; + unsigned int number = 0; + const unsigned int eighthPoints = num_points / 8; - float* cPtr = cVector; - const float* aPtr = aVector; - const float* bPtr= bVector; + float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr = bVector; - __m256 aVal, bVal, cVal; - for(;number < eighthPoints; number++){ - aVal = _mm256_load_ps(aPtr); - bVal = _mm256_load_ps(bPtr); + __m256 aVal, bVal, cVal; + for (; number < eighthPoints; number++) { + aVal = _mm256_load_ps(aPtr); + bVal = _mm256_load_ps(bPtr); - cVal = _mm256_min_ps(aVal, bVal); + cVal = _mm256_min_ps(aVal, bVal); - _mm256_store_ps(cPtr,cVal); // Store the results back into the C container + _mm256_store_ps(cPtr, cVal); // Store the results back into the C container - aPtr += 8; - bPtr += 8; - cPtr += 8; - } + aPtr += 8; + bPtr += 8; + cPtr += 8; + } - number = eighthPoints * 8; - for(;number < num_points; number++){ - const float a = *aPtr++; - const float b = *bPtr++; - *cPtr++ = ( a < b ? a : b); - } + number = eighthPoints * 8; + for (; number < num_points; number++) { + const float a = *aPtr++; + const float b = *bPtr++; + *cPtr++ = (a < b ? 
a : b); + } } #endif /* LV_HAVE_AVX */ #ifdef LV_HAVE_AVX512F #include -static inline void -volk_32f_x2_min_32f_a_avx512f(float* cVector, const float* aVector, - const float* bVector, unsigned int num_points) +static inline void volk_32f_x2_min_32f_a_avx512f(float* cVector, + const float* aVector, + const float* bVector, + unsigned int num_points) { - unsigned int number = 0; - const unsigned int sixteenthPoints = num_points / 16; + unsigned int number = 0; + const unsigned int sixteenthPoints = num_points / 16; - float* cPtr = cVector; - const float* aPtr = aVector; - const float* bPtr= bVector; + float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr = bVector; - __m512 aVal, bVal, cVal; - for(;number < sixteenthPoints; number++){ - aVal = _mm512_load_ps(aPtr); - bVal = _mm512_load_ps(bPtr); + __m512 aVal, bVal, cVal; + for (; number < sixteenthPoints; number++) { + aVal = _mm512_load_ps(aPtr); + bVal = _mm512_load_ps(bPtr); - cVal = _mm512_min_ps(aVal, bVal); + cVal = _mm512_min_ps(aVal, bVal); - _mm512_store_ps(cPtr,cVal); // Store the results back into the C container + _mm512_store_ps(cPtr, cVal); // Store the results back into the C container - aPtr += 16; - bPtr += 16; - cPtr += 16; - } + aPtr += 16; + bPtr += 16; + cPtr += 16; + } - number = sixteenthPoints * 16; - for(;number < num_points; number++){ - const float a = *aPtr++; - const float b = *bPtr++; - *cPtr++ = ( a < b ? a : b); - } + number = sixteenthPoints * 16; + for (; number < num_points; number++) { + const float a = *aPtr++; + const float b = *bPtr++; + *cPtr++ = (a < b ? a : b); + } } #endif /* LV_HAVE_AVX512F */ @@ -267,74 +274,76 @@ volk_32f_x2_min_32f_a_avx512f(float* cVector, const float* aVector, #ifdef LV_HAVE_AVX512F #include -static inline void -volk_32f_x2_min_32f_u_avx512f(float* cVector, const float* aVector, - const float* bVector, unsigned int num_points) +static inline void volk_32f_x2_min_32f_u_avx512f(float* cVector, + const float* aVector, + const float* bVector, + unsigned int num_points) { - unsigned int number = 0; - const unsigned int sixteenthPoints = num_points / 16; + unsigned int number = 0; + const unsigned int sixteenthPoints = num_points / 16; - float* cPtr = cVector; - const float* aPtr = aVector; - const float* bPtr= bVector; + float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr = bVector; - __m512 aVal, bVal, cVal; - for(;number < sixteenthPoints; number++){ - aVal = _mm512_loadu_ps(aPtr); - bVal = _mm512_loadu_ps(bPtr); + __m512 aVal, bVal, cVal; + for (; number < sixteenthPoints; number++) { + aVal = _mm512_loadu_ps(aPtr); + bVal = _mm512_loadu_ps(bPtr); - cVal = _mm512_min_ps(aVal, bVal); + cVal = _mm512_min_ps(aVal, bVal); - _mm512_storeu_ps(cPtr,cVal); // Store the results back into the C container + _mm512_storeu_ps(cPtr, cVal); // Store the results back into the C container - aPtr += 16; - bPtr += 16; - cPtr += 16; - } + aPtr += 16; + bPtr += 16; + cPtr += 16; + } - number = sixteenthPoints * 16; - for(;number < num_points; number++){ - const float a = *aPtr++; - const float b = *bPtr++; - *cPtr++ = ( a < b ? a : b); - } + number = sixteenthPoints * 16; + for (; number < num_points; number++) { + const float a = *aPtr++; + const float b = *bPtr++; + *cPtr++ = (a < b ? 
a : b); + } } #endif /* LV_HAVE_AVX512F */ #ifdef LV_HAVE_AVX #include -static inline void -volk_32f_x2_min_32f_u_avx(float* cVector, const float* aVector, - const float* bVector, unsigned int num_points) +static inline void volk_32f_x2_min_32f_u_avx(float* cVector, + const float* aVector, + const float* bVector, + unsigned int num_points) { - unsigned int number = 0; - const unsigned int eighthPoints = num_points / 8; + unsigned int number = 0; + const unsigned int eighthPoints = num_points / 8; - float* cPtr = cVector; - const float* aPtr = aVector; - const float* bPtr= bVector; + float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr = bVector; - __m256 aVal, bVal, cVal; - for(;number < eighthPoints; number++){ - aVal = _mm256_loadu_ps(aPtr); - bVal = _mm256_loadu_ps(bPtr); + __m256 aVal, bVal, cVal; + for (; number < eighthPoints; number++) { + aVal = _mm256_loadu_ps(aPtr); + bVal = _mm256_loadu_ps(bPtr); - cVal = _mm256_min_ps(aVal, bVal); + cVal = _mm256_min_ps(aVal, bVal); - _mm256_storeu_ps(cPtr,cVal); // Store the results back into the C container + _mm256_storeu_ps(cPtr, cVal); // Store the results back into the C container - aPtr += 8; - bPtr += 8; - cPtr += 8; - } + aPtr += 8; + bPtr += 8; + cPtr += 8; + } - number = eighthPoints * 8; - for(;number < num_points; number++){ - const float a = *aPtr++; - const float b = *bPtr++; - *cPtr++ = ( a < b ? a : b); - } + number = eighthPoints * 8; + for (; number < num_points; number++) { + const float a = *aPtr++; + const float b = *bPtr++; + *cPtr++ = (a < b ? a : b); + } } #endif /* LV_HAVE_AVX */ diff --git a/kernels/volk/volk_32f_x2_multiply_32f.h b/kernels/volk/volk_32f_x2_multiply_32f.h index deb9ae3..eebba18 100644 --- a/kernels/volk/volk_32f_x2_multiply_32f.h +++ b/kernels/volk/volk_32f_x2_multiply_32f.h @@ -31,8 +31,8 @@ * * Dispatcher Prototype * \code - * void volk_32f_x2_multiply_32f(float* cVector, const float* aVector, const float* bVector, unsigned int num_points) - * \endcode + * void volk_32f_x2_multiply_32f(float* cVector, const float* aVector, const float* + * bVector, unsigned int num_points) \endcode * * \b Inputs * \li aVector: First input vector. 
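The min, max, and multiply kernels touched above all share one structure: a SIMD loop over floor(num_points / W) vectors of width W, followed by a scalar tail for the remaining num_points % W elements. A stripped-down sketch of that pattern (plain C with a hypothetical name, shown for orientation rather than taken from the patch):

/* Generic shape of the kernels in this patch: wide loop + scalar tail.
 * W stands for the SIMD width: 4 for SSE/NEON, 8 for AVX, 16 for AVX-512. */
static void multiply_pattern_sketch(float* c,
                                    const float* a,
                                    const float* b,
                                    unsigned int num_points)
{
    const unsigned int W = 8; /* e.g. AVX: 8 floats per vector */
    const unsigned int wide_points = num_points / W;
    unsigned int number;

    for (number = 0; number < wide_points; number++) {
        unsigned int k;
        /* stands in for a vector load, _mm256_mul_ps, and a vector store */
        for (k = 0; k < W; k++)
            c[number * W + k] = a[number * W + k] * b[number * W + k];
    }

    /* Scalar tail: num_points need not be a multiple of W. */
    for (number = wide_points * W; number < num_points; number++) {
        c[number] = a[number] * b[number];
    }
}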
@@ -77,126 +77,130 @@ #ifdef LV_HAVE_SSE #include -static inline void -volk_32f_x2_multiply_32f_u_sse(float* cVector, const float* aVector, - const float* bVector, unsigned int num_points) +static inline void volk_32f_x2_multiply_32f_u_sse(float* cVector, + const float* aVector, + const float* bVector, + unsigned int num_points) { - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; - float* cPtr = cVector; - const float* aPtr = aVector; - const float* bPtr= bVector; + float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr = bVector; - __m128 aVal, bVal, cVal; - for(;number < quarterPoints; number++){ + __m128 aVal, bVal, cVal; + for (; number < quarterPoints; number++) { - aVal = _mm_loadu_ps(aPtr); - bVal = _mm_loadu_ps(bPtr); + aVal = _mm_loadu_ps(aPtr); + bVal = _mm_loadu_ps(bPtr); - cVal = _mm_mul_ps(aVal, bVal); + cVal = _mm_mul_ps(aVal, bVal); - _mm_storeu_ps(cPtr,cVal); // Store the results back into the C container + _mm_storeu_ps(cPtr, cVal); // Store the results back into the C container - aPtr += 4; - bPtr += 4; - cPtr += 4; - } + aPtr += 4; + bPtr += 4; + cPtr += 4; + } - number = quarterPoints * 4; - for(;number < num_points; number++){ - *cPtr++ = (*aPtr++) * (*bPtr++); - } + number = quarterPoints * 4; + for (; number < num_points; number++) { + *cPtr++ = (*aPtr++) * (*bPtr++); + } } #endif /* LV_HAVE_SSE */ #ifdef LV_HAVE_AVX512F #include -static inline void -volk_32f_x2_multiply_32f_u_avx512f(float* cVector, const float* aVector, - const float* bVector, unsigned int num_points) +static inline void volk_32f_x2_multiply_32f_u_avx512f(float* cVector, + const float* aVector, + const float* bVector, + unsigned int num_points) { - unsigned int number = 0; - const unsigned int sixteenthPoints = num_points / 16; + unsigned int number = 0; + const unsigned int sixteenthPoints = num_points / 16; - float* cPtr = cVector; - const float* aPtr = aVector; - const float* bPtr= bVector; + float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr = bVector; - __m512 aVal, bVal, cVal; - for(;number < sixteenthPoints; number++){ + __m512 aVal, bVal, cVal; + for (; number < sixteenthPoints; number++) { - aVal = _mm512_loadu_ps(aPtr); - bVal = _mm512_loadu_ps(bPtr); + aVal = _mm512_loadu_ps(aPtr); + bVal = _mm512_loadu_ps(bPtr); - cVal = _mm512_mul_ps(aVal, bVal); + cVal = _mm512_mul_ps(aVal, bVal); - _mm512_storeu_ps(cPtr,cVal); // Store the results back into the C container + _mm512_storeu_ps(cPtr, cVal); // Store the results back into the C container - aPtr += 16; - bPtr += 16; - cPtr += 16; - } + aPtr += 16; + bPtr += 16; + cPtr += 16; + } - number = sixteenthPoints * 16; - for(;number < num_points; number++){ - *cPtr++ = (*aPtr++) * (*bPtr++); - } + number = sixteenthPoints * 16; + for (; number < num_points; number++) { + *cPtr++ = (*aPtr++) * (*bPtr++); + } } #endif /* LV_HAVE_AVX512F */ #ifdef LV_HAVE_AVX #include -static inline void -volk_32f_x2_multiply_32f_u_avx(float* cVector, const float* aVector, - const float* bVector, unsigned int num_points) +static inline void volk_32f_x2_multiply_32f_u_avx(float* cVector, + const float* aVector, + const float* bVector, + unsigned int num_points) { - unsigned int number = 0; - const unsigned int eighthPoints = num_points / 8; + unsigned int number = 0; + const unsigned int eighthPoints = num_points / 8; - float* cPtr = cVector; - const float* aPtr = aVector; - const float* bPtr= bVector; + float* cPtr = 
cVector; + const float* aPtr = aVector; + const float* bPtr = bVector; - __m256 aVal, bVal, cVal; - for(;number < eighthPoints; number++){ + __m256 aVal, bVal, cVal; + for (; number < eighthPoints; number++) { - aVal = _mm256_loadu_ps(aPtr); - bVal = _mm256_loadu_ps(bPtr); + aVal = _mm256_loadu_ps(aPtr); + bVal = _mm256_loadu_ps(bPtr); - cVal = _mm256_mul_ps(aVal, bVal); + cVal = _mm256_mul_ps(aVal, bVal); - _mm256_storeu_ps(cPtr,cVal); // Store the results back into the C container + _mm256_storeu_ps(cPtr, cVal); // Store the results back into the C container - aPtr += 8; - bPtr += 8; - cPtr += 8; - } + aPtr += 8; + bPtr += 8; + cPtr += 8; + } - number = eighthPoints * 8; - for(;number < num_points; number++){ - *cPtr++ = (*aPtr++) * (*bPtr++); - } + number = eighthPoints * 8; + for (; number < num_points; number++) { + *cPtr++ = (*aPtr++) * (*bPtr++); + } } #endif /* LV_HAVE_AVX */ #ifdef LV_HAVE_GENERIC -static inline void -volk_32f_x2_multiply_32f_generic(float* cVector, const float* aVector, - const float* bVector, unsigned int num_points) +static inline void volk_32f_x2_multiply_32f_generic(float* cVector, + const float* aVector, + const float* bVector, + unsigned int num_points) { - float* cPtr = cVector; - const float* aPtr = aVector; - const float* bPtr= bVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - *cPtr++ = (*aPtr++) * (*bPtr++); - } + float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr = bVector; + unsigned int number = 0; + + for (number = 0; number < num_points; number++) { + *cPtr++ = (*aPtr++) * (*bPtr++); + } } #endif /* LV_HAVE_GENERIC */ @@ -213,72 +217,74 @@ volk_32f_x2_multiply_32f_generic(float* cVector, const float* aVector, #ifdef LV_HAVE_SSE #include -static inline void -volk_32f_x2_multiply_32f_a_sse(float* cVector, const float* aVector, - const float* bVector, unsigned int num_points) +static inline void volk_32f_x2_multiply_32f_a_sse(float* cVector, + const float* aVector, + const float* bVector, + unsigned int num_points) { - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; - float* cPtr = cVector; - const float* aPtr = aVector; - const float* bPtr= bVector; + float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr = bVector; - __m128 aVal, bVal, cVal; - for(;number < quarterPoints; number++){ + __m128 aVal, bVal, cVal; + for (; number < quarterPoints; number++) { - aVal = _mm_load_ps(aPtr); - bVal = _mm_load_ps(bPtr); + aVal = _mm_load_ps(aPtr); + bVal = _mm_load_ps(bPtr); - cVal = _mm_mul_ps(aVal, bVal); + cVal = _mm_mul_ps(aVal, bVal); - _mm_store_ps(cPtr,cVal); // Store the results back into the C container + _mm_store_ps(cPtr, cVal); // Store the results back into the C container - aPtr += 4; - bPtr += 4; - cPtr += 4; - } + aPtr += 4; + bPtr += 4; + cPtr += 4; + } - number = quarterPoints * 4; - for(;number < num_points; number++){ - *cPtr++ = (*aPtr++) * (*bPtr++); - } + number = quarterPoints * 4; + for (; number < num_points; number++) { + *cPtr++ = (*aPtr++) * (*bPtr++); + } } #endif /* LV_HAVE_SSE */ #ifdef LV_HAVE_AVX512F #include -static inline void -volk_32f_x2_multiply_32f_a_avx512f(float* cVector, const float* aVector, - const float* bVector, unsigned int num_points) +static inline void volk_32f_x2_multiply_32f_a_avx512f(float* cVector, + const float* aVector, + const float* bVector, + unsigned int num_points) { - unsigned int number = 0; - const unsigned 
int sixteenthPoints = num_points / 16; + unsigned int number = 0; + const unsigned int sixteenthPoints = num_points / 16; - float* cPtr = cVector; - const float* aPtr = aVector; - const float* bPtr= bVector; + float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr = bVector; - __m512 aVal, bVal, cVal; - for(;number < sixteenthPoints; number++){ + __m512 aVal, bVal, cVal; + for (; number < sixteenthPoints; number++) { - aVal = _mm512_load_ps(aPtr); - bVal = _mm512_load_ps(bPtr); + aVal = _mm512_load_ps(aPtr); + bVal = _mm512_load_ps(bPtr); - cVal = _mm512_mul_ps(aVal, bVal); + cVal = _mm512_mul_ps(aVal, bVal); - _mm512_store_ps(cPtr,cVal); // Store the results back into the C container + _mm512_store_ps(cPtr, cVal); // Store the results back into the C container - aPtr += 16; - bPtr += 16; - cPtr += 16; - } + aPtr += 16; + bPtr += 16; + cPtr += 16; + } - number = sixteenthPoints * 16; - for(;number < num_points; number++){ - *cPtr++ = (*aPtr++) * (*bPtr++); - } + number = sixteenthPoints * 16; + for (; number < num_points; number++) { + *cPtr++ = (*aPtr++) * (*bPtr++); + } } #endif /* LV_HAVE_AVX512F */ @@ -286,36 +292,37 @@ volk_32f_x2_multiply_32f_a_avx512f(float* cVector, const float* aVector, #ifdef LV_HAVE_AVX #include -static inline void -volk_32f_x2_multiply_32f_a_avx(float* cVector, const float* aVector, - const float* bVector, unsigned int num_points) +static inline void volk_32f_x2_multiply_32f_a_avx(float* cVector, + const float* aVector, + const float* bVector, + unsigned int num_points) { - unsigned int number = 0; - const unsigned int eighthPoints = num_points / 8; + unsigned int number = 0; + const unsigned int eighthPoints = num_points / 8; - float* cPtr = cVector; - const float* aPtr = aVector; - const float* bPtr= bVector; + float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr = bVector; - __m256 aVal, bVal, cVal; - for(;number < eighthPoints; number++){ + __m256 aVal, bVal, cVal; + for (; number < eighthPoints; number++) { - aVal = _mm256_load_ps(aPtr); - bVal = _mm256_load_ps(bPtr); + aVal = _mm256_load_ps(aPtr); + bVal = _mm256_load_ps(bPtr); - cVal = _mm256_mul_ps(aVal, bVal); + cVal = _mm256_mul_ps(aVal, bVal); - _mm256_store_ps(cPtr,cVal); // Store the results back into the C container + _mm256_store_ps(cPtr, cVal); // Store the results back into the C container - aPtr += 8; - bPtr += 8; - cPtr += 8; - } + aPtr += 8; + bPtr += 8; + cPtr += 8; + } - number = eighthPoints * 8; - for(;number < num_points; number++){ - *cPtr++ = (*aPtr++) * (*bPtr++); - } + number = eighthPoints * 8; + for (; number < num_points; number++) { + *cPtr++ = (*aPtr++) * (*bPtr++); + } } #endif /* LV_HAVE_AVX */ @@ -323,57 +330,61 @@ volk_32f_x2_multiply_32f_a_avx(float* cVector, const float* aVector, #ifdef LV_HAVE_NEON #include -static inline void -volk_32f_x2_multiply_32f_neon(float* cVector, const float* aVector, - const float* bVector, unsigned int num_points) +static inline void volk_32f_x2_multiply_32f_neon(float* cVector, + const float* aVector, + const float* bVector, + unsigned int num_points) { - const unsigned int quarter_points = num_points / 4; - unsigned int number; - float32x4_t avec, bvec, cvec; - for(number=0; number < quarter_points; ++number) { - avec = vld1q_f32(aVector); - bvec = vld1q_f32(bVector); - cvec = vmulq_f32(avec, bvec); - vst1q_f32(cVector, cvec); - aVector += 4; - bVector += 4; - cVector += 4; - } - for(number=quarter_points*4; number < num_points; ++number) { - *cVector++ = *aVector++ * *bVector++; - } + const unsigned 
int quarter_points = num_points / 4; + unsigned int number; + float32x4_t avec, bvec, cvec; + for (number = 0; number < quarter_points; ++number) { + avec = vld1q_f32(aVector); + bvec = vld1q_f32(bVector); + cvec = vmulq_f32(avec, bvec); + vst1q_f32(cVector, cvec); + aVector += 4; + bVector += 4; + cVector += 4; + } + for (number = quarter_points * 4; number < num_points; ++number) { + *cVector++ = *aVector++ * *bVector++; + } } #endif /* LV_HAVE_NEON */ #ifdef LV_HAVE_GENERIC -static inline void -volk_32f_x2_multiply_32f_a_generic(float* cVector, const float* aVector, - const float* bVector, unsigned int num_points) +static inline void volk_32f_x2_multiply_32f_a_generic(float* cVector, + const float* aVector, + const float* bVector, + unsigned int num_points) { - float* cPtr = cVector; - const float* aPtr = aVector; - const float* bPtr= bVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - *cPtr++ = (*aPtr++) * (*bPtr++); - } + float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr = bVector; + unsigned int number = 0; + + for (number = 0; number < num_points; number++) { + *cPtr++ = (*aPtr++) * (*bPtr++); + } } #endif /* LV_HAVE_GENERIC */ #ifdef LV_HAVE_ORC -extern void -volk_32f_x2_multiply_32f_a_orc_impl(float* cVector, const float* aVector, - const float* bVector, unsigned int num_points); - -static inline void -volk_32f_x2_multiply_32f_u_orc(float* cVector, const float* aVector, - const float* bVector, unsigned int num_points) +extern void volk_32f_x2_multiply_32f_a_orc_impl(float* cVector, + const float* aVector, + const float* bVector, + unsigned int num_points); + +static inline void volk_32f_x2_multiply_32f_u_orc(float* cVector, + const float* aVector, + const float* bVector, + unsigned int num_points) { - volk_32f_x2_multiply_32f_a_orc_impl(cVector, aVector, bVector, num_points); + volk_32f_x2_multiply_32f_a_orc_impl(cVector, aVector, bVector, num_points); } #endif /* LV_HAVE_ORC */ diff --git a/kernels/volk/volk_32f_x2_pow_32f.h b/kernels/volk/volk_32f_x2_pow_32f.h index daa7f4e..106c57b 100644 --- a/kernels/volk/volk_32f_x2_pow_32f.h +++ b/kernels/volk/volk_32f_x2_pow_32f.h @@ -31,8 +31,8 @@ * * Dispatcher Prototype * \code - * void volk_32f_x2_pow_32f(float* cVector, const float* bVector, const float* aVector, unsigned int num_points) - * \endcode + * void volk_32f_x2_pow_32f(float* cVector, const float* bVector, const float* aVector, + * unsigned int num_points) \endcode * * \b Inputs * \li bVector: The input vector of indices (power values). 
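The AVX2 implementations reformatted below compute c[i] = a[i]^b[i] as exp(b·ln a), using polynomial approximations for the logarithm and the exponential; their scalar tails simply call pow(). A scalar reference built on the same identity can be useful when checking the vector paths. It is a sketch (not code from the patch), follows the dispatcher's argument order (cVector, bVector, aVector, num_points), and is meaningful only for a > 0, which is also what the vector log approximation assumes:

#include <math.h>

/* Scalar reference for volk_32f_x2_pow_32f: c[i] = a[i] ^ b[i],
 * written as exp(b * ln(a)) to mirror the structure of the SIMD code. */
static void pow_reference_sketch(float* c,
                                 const float* b,
                                 const float* a,
                                 unsigned int num_points)
{
    unsigned int number;
    for (number = 0; number < num_points; number++) {
        c[number] = expf(b[number] * logf(a[number]));
    }
}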
@@ -71,10 +71,10 @@ #ifndef INCLUDED_volk_32f_x2_pow_32f_a_H #define INCLUDED_volk_32f_x2_pow_32f_a_H -#include -#include #include #include +#include +#include #define POW_POLY_DEGREE 3 @@ -82,99 +82,130 @@ #include #define POLY0_AVX2_FMA(x, c0) _mm256_set1_ps(c0) -#define POLY1_AVX2_FMA(x, c0, c1) _mm256_fmadd_ps(POLY0_AVX2_FMA(x, c1), x, _mm256_set1_ps(c0)) -#define POLY2_AVX2_FMA(x, c0, c1, c2) _mm256_fmadd_ps(POLY1_AVX2_FMA(x, c1, c2), x, _mm256_set1_ps(c0)) -#define POLY3_AVX2_FMA(x, c0, c1, c2, c3) _mm256_fmadd_ps(POLY2_AVX2_FMA(x, c1, c2, c3), x, _mm256_set1_ps(c0)) -#define POLY4_AVX2_FMA(x, c0, c1, c2, c3, c4) _mm256_fmadd_ps(POLY3_AVX2_FMA(x, c1, c2, c3, c4), x, _mm256_set1_ps(c0)) -#define POLY5_AVX2_FMA(x, c0, c1, c2, c3, c4, c5) _mm256_fmadd_ps(POLY4_AVX2_FMA(x, c1, c2, c3, c4, c5), x, _mm256_set1_ps(c0)) - -static inline void -volk_32f_x2_pow_32f_a_avx2_fma(float* cVector, const float* bVector, - const float* aVector, unsigned int num_points) +#define POLY1_AVX2_FMA(x, c0, c1) \ + _mm256_fmadd_ps(POLY0_AVX2_FMA(x, c1), x, _mm256_set1_ps(c0)) +#define POLY2_AVX2_FMA(x, c0, c1, c2) \ + _mm256_fmadd_ps(POLY1_AVX2_FMA(x, c1, c2), x, _mm256_set1_ps(c0)) +#define POLY3_AVX2_FMA(x, c0, c1, c2, c3) \ + _mm256_fmadd_ps(POLY2_AVX2_FMA(x, c1, c2, c3), x, _mm256_set1_ps(c0)) +#define POLY4_AVX2_FMA(x, c0, c1, c2, c3, c4) \ + _mm256_fmadd_ps(POLY3_AVX2_FMA(x, c1, c2, c3, c4), x, _mm256_set1_ps(c0)) +#define POLY5_AVX2_FMA(x, c0, c1, c2, c3, c4, c5) \ + _mm256_fmadd_ps(POLY4_AVX2_FMA(x, c1, c2, c3, c4, c5), x, _mm256_set1_ps(c0)) + +static inline void volk_32f_x2_pow_32f_a_avx2_fma(float* cVector, + const float* bVector, + const float* aVector, + unsigned int num_points) { - float* cPtr = cVector; - const float* bPtr = bVector; - const float* aPtr = aVector; - - unsigned int number = 0; - const unsigned int eighthPoints = num_points / 8; - - __m256 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne; - __m256 tmp, fx, mask, pow2n, z, y; - __m256 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2; - __m256 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5; - __m256i bias, exp, emm0, pi32_0x7f; - - one = _mm256_set1_ps(1.0); - exp_hi = _mm256_set1_ps(88.3762626647949); - exp_lo = _mm256_set1_ps(-88.3762626647949); - ln2 = _mm256_set1_ps(0.6931471805); - log2EF = _mm256_set1_ps(1.44269504088896341); - half = _mm256_set1_ps(0.5); - exp_C1 = _mm256_set1_ps(0.693359375); - exp_C2 = _mm256_set1_ps(-2.12194440e-4); - pi32_0x7f = _mm256_set1_epi32(0x7f); - - exp_p0 = _mm256_set1_ps(1.9875691500e-4); - exp_p1 = _mm256_set1_ps(1.3981999507e-3); - exp_p2 = _mm256_set1_ps(8.3334519073e-3); - exp_p3 = _mm256_set1_ps(4.1665795894e-2); - exp_p4 = _mm256_set1_ps(1.6666665459e-1); - exp_p5 = _mm256_set1_ps(5.0000001201e-1); - - for(;number < eighthPoints; number++){ - // First compute the logarithm - aVal = _mm256_load_ps(aPtr); - bias = _mm256_set1_epi32(127); - leadingOne = _mm256_set1_ps(1.0f); - exp = _mm256_sub_epi32(_mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal), _mm256_set1_epi32(0x7f800000)), 23), bias); - logarithm = _mm256_cvtepi32_ps(exp); - - frac = _mm256_or_ps(leadingOne, _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff)))); + float* cPtr = cVector; + const float* bPtr = bVector; + const float* aPtr = aVector; + + unsigned int number = 0; + const unsigned int eighthPoints = num_points / 8; + + __m256 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne; + __m256 tmp, fx, mask, pow2n, z, y; + __m256 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2; 
+ __m256 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5; + __m256i bias, exp, emm0, pi32_0x7f; + + one = _mm256_set1_ps(1.0); + exp_hi = _mm256_set1_ps(88.3762626647949); + exp_lo = _mm256_set1_ps(-88.3762626647949); + ln2 = _mm256_set1_ps(0.6931471805); + log2EF = _mm256_set1_ps(1.44269504088896341); + half = _mm256_set1_ps(0.5); + exp_C1 = _mm256_set1_ps(0.693359375); + exp_C2 = _mm256_set1_ps(-2.12194440e-4); + pi32_0x7f = _mm256_set1_epi32(0x7f); + + exp_p0 = _mm256_set1_ps(1.9875691500e-4); + exp_p1 = _mm256_set1_ps(1.3981999507e-3); + exp_p2 = _mm256_set1_ps(8.3334519073e-3); + exp_p3 = _mm256_set1_ps(4.1665795894e-2); + exp_p4 = _mm256_set1_ps(1.6666665459e-1); + exp_p5 = _mm256_set1_ps(5.0000001201e-1); + + for (; number < eighthPoints; number++) { + // First compute the logarithm + aVal = _mm256_load_ps(aPtr); + bias = _mm256_set1_epi32(127); + leadingOne = _mm256_set1_ps(1.0f); + exp = _mm256_sub_epi32( + _mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal), + _mm256_set1_epi32(0x7f800000)), + 23), + bias); + logarithm = _mm256_cvtepi32_ps(exp); + + frac = _mm256_or_ps( + leadingOne, + _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff)))); #if POW_POLY_DEGREE == 6 - mantissa = POLY5_AVX2_FMA( frac, 3.1157899f, -3.3241990f, 2.5988452f, -1.2315303f, 3.1821337e-1f, -3.4436006e-2f); + mantissa = POLY5_AVX2_FMA(frac, + 3.1157899f, + -3.3241990f, + 2.5988452f, + -1.2315303f, + 3.1821337e-1f, + -3.4436006e-2f); #elif POW_POLY_DEGREE == 5 - mantissa = POLY4_AVX2_FMA( frac, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f); + mantissa = POLY4_AVX2_FMA(frac, + 2.8882704548164776201f, + -2.52074962577807006663f, + 1.48116647521213171641f, + -0.465725644288844778798f, + 0.0596515482674574969533f); #elif POW_POLY_DEGREE == 4 - mantissa = POLY3_AVX2_FMA( frac, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f); + mantissa = POLY3_AVX2_FMA(frac, + 2.61761038894603480148f, + -1.75647175389045657003f, + 0.688243882994381274313f, + -0.107254423828329604454f); #elif POW_POLY_DEGREE == 3 - mantissa = POLY2_AVX2_FMA( frac, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f); + mantissa = POLY2_AVX2_FMA(frac, + 2.28330284476918490682f, + -1.04913055217340124191f, + 0.204446009836232697516f); #else #error #endif - logarithm = _mm256_fmadd_ps(mantissa, _mm256_sub_ps(frac, leadingOne), logarithm); - logarithm = _mm256_mul_ps(logarithm, ln2); + logarithm = _mm256_fmadd_ps(mantissa, _mm256_sub_ps(frac, leadingOne), logarithm); + logarithm = _mm256_mul_ps(logarithm, ln2); - // Now calculate b*lna - bVal = _mm256_load_ps(bPtr); - bVal = _mm256_mul_ps(bVal, logarithm); + // Now calculate b*lna + bVal = _mm256_load_ps(bPtr); + bVal = _mm256_mul_ps(bVal, logarithm); - // Now compute exp(b*lna) - bVal = _mm256_max_ps(_mm256_min_ps(bVal, exp_hi), exp_lo); + // Now compute exp(b*lna) + bVal = _mm256_max_ps(_mm256_min_ps(bVal, exp_hi), exp_lo); - fx = _mm256_fmadd_ps(bVal, log2EF, half); + fx = _mm256_fmadd_ps(bVal, log2EF, half); - emm0 = _mm256_cvttps_epi32(fx); - tmp = _mm256_cvtepi32_ps(emm0); + emm0 = _mm256_cvttps_epi32(fx); + tmp = _mm256_cvtepi32_ps(emm0); - mask = _mm256_and_ps(_mm256_cmp_ps(tmp, fx, _CMP_GT_OS), one); - fx = _mm256_sub_ps(tmp, mask); + mask = _mm256_and_ps(_mm256_cmp_ps(tmp, fx, _CMP_GT_OS), one); + fx = _mm256_sub_ps(tmp, mask); - tmp = _mm256_fnmadd_ps(fx, exp_C1, bVal); - bVal = _mm256_fnmadd_ps(fx, exp_C2, 
tmp); - z = _mm256_mul_ps(bVal, bVal); + tmp = _mm256_fnmadd_ps(fx, exp_C1, bVal); + bVal = _mm256_fnmadd_ps(fx, exp_C2, tmp); + z = _mm256_mul_ps(bVal, bVal); - y = _mm256_fmadd_ps(exp_p0, bVal, exp_p1); - y = _mm256_fmadd_ps(y, bVal, exp_p2); - y = _mm256_fmadd_ps(y, bVal, exp_p3); - y = _mm256_fmadd_ps(y, bVal, exp_p4); - y = _mm256_fmadd_ps(y, bVal, exp_p5); - y = _mm256_fmadd_ps(y, z, bVal); - y = _mm256_add_ps(y, one); + y = _mm256_fmadd_ps(exp_p0, bVal, exp_p1); + y = _mm256_fmadd_ps(y, bVal, exp_p2); + y = _mm256_fmadd_ps(y, bVal, exp_p3); + y = _mm256_fmadd_ps(y, bVal, exp_p4); + y = _mm256_fmadd_ps(y, bVal, exp_p5); + y = _mm256_fmadd_ps(y, z, bVal); + y = _mm256_add_ps(y, one); - emm0 = _mm256_slli_epi32(_mm256_add_epi32(_mm256_cvttps_epi32(fx), pi32_0x7f), 23); + emm0 = + _mm256_slli_epi32(_mm256_add_epi32(_mm256_cvttps_epi32(fx), pi32_0x7f), 23); pow2n = _mm256_castsi256_ps(emm0); cVal = _mm256_mul_ps(y, pow2n); @@ -184,12 +215,12 @@ volk_32f_x2_pow_32f_a_avx2_fma(float* cVector, const float* bVector, aPtr += 8; bPtr += 8; cPtr += 8; - } + } - number = eighthPoints * 8; - for(;number < num_points; number++){ - *cPtr++ = pow(*aPtr++, *bPtr++); - } + number = eighthPoints * 8; + for (; number < num_points; number++) { + *cPtr++ = pow(*aPtr++, *bPtr++); + } } #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for aligned */ @@ -198,99 +229,131 @@ volk_32f_x2_pow_32f_a_avx2_fma(float* cVector, const float* bVector, #include #define POLY0_AVX2(x, c0) _mm256_set1_ps(c0) -#define POLY1_AVX2(x, c0, c1) _mm256_add_ps(_mm256_mul_ps(POLY0_AVX2(x, c1), x), _mm256_set1_ps(c0)) -#define POLY2_AVX2(x, c0, c1, c2) _mm256_add_ps(_mm256_mul_ps(POLY1_AVX2(x, c1, c2), x), _mm256_set1_ps(c0)) -#define POLY3_AVX2(x, c0, c1, c2, c3) _mm256_add_ps(_mm256_mul_ps(POLY2_AVX2(x, c1, c2, c3), x), _mm256_set1_ps(c0)) -#define POLY4_AVX2(x, c0, c1, c2, c3, c4) _mm256_add_ps(_mm256_mul_ps(POLY3_AVX2(x, c1, c2, c3, c4), x), _mm256_set1_ps(c0)) -#define POLY5_AVX2(x, c0, c1, c2, c3, c4, c5) _mm256_add_ps(_mm256_mul_ps(POLY4_AVX2(x, c1, c2, c3, c4, c5), x), _mm256_set1_ps(c0)) - -static inline void -volk_32f_x2_pow_32f_a_avx2(float* cVector, const float* bVector, - const float* aVector, unsigned int num_points) +#define POLY1_AVX2(x, c0, c1) \ + _mm256_add_ps(_mm256_mul_ps(POLY0_AVX2(x, c1), x), _mm256_set1_ps(c0)) +#define POLY2_AVX2(x, c0, c1, c2) \ + _mm256_add_ps(_mm256_mul_ps(POLY1_AVX2(x, c1, c2), x), _mm256_set1_ps(c0)) +#define POLY3_AVX2(x, c0, c1, c2, c3) \ + _mm256_add_ps(_mm256_mul_ps(POLY2_AVX2(x, c1, c2, c3), x), _mm256_set1_ps(c0)) +#define POLY4_AVX2(x, c0, c1, c2, c3, c4) \ + _mm256_add_ps(_mm256_mul_ps(POLY3_AVX2(x, c1, c2, c3, c4), x), _mm256_set1_ps(c0)) +#define POLY5_AVX2(x, c0, c1, c2, c3, c4, c5) \ + _mm256_add_ps(_mm256_mul_ps(POLY4_AVX2(x, c1, c2, c3, c4, c5), x), _mm256_set1_ps(c0)) + +static inline void volk_32f_x2_pow_32f_a_avx2(float* cVector, + const float* bVector, + const float* aVector, + unsigned int num_points) { - float* cPtr = cVector; - const float* bPtr = bVector; - const float* aPtr = aVector; - - unsigned int number = 0; - const unsigned int eighthPoints = num_points / 8; - - __m256 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne; - __m256 tmp, fx, mask, pow2n, z, y; - __m256 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2; - __m256 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5; - __m256i bias, exp, emm0, pi32_0x7f; - - one = _mm256_set1_ps(1.0); - exp_hi = _mm256_set1_ps(88.3762626647949); - exp_lo = _mm256_set1_ps(-88.3762626647949); - ln2 = 
_mm256_set1_ps(0.6931471805); - log2EF = _mm256_set1_ps(1.44269504088896341); - half = _mm256_set1_ps(0.5); - exp_C1 = _mm256_set1_ps(0.693359375); - exp_C2 = _mm256_set1_ps(-2.12194440e-4); - pi32_0x7f = _mm256_set1_epi32(0x7f); - - exp_p0 = _mm256_set1_ps(1.9875691500e-4); - exp_p1 = _mm256_set1_ps(1.3981999507e-3); - exp_p2 = _mm256_set1_ps(8.3334519073e-3); - exp_p3 = _mm256_set1_ps(4.1665795894e-2); - exp_p4 = _mm256_set1_ps(1.6666665459e-1); - exp_p5 = _mm256_set1_ps(5.0000001201e-1); - - for(;number < eighthPoints; number++){ - // First compute the logarithm - aVal = _mm256_load_ps(aPtr); - bias = _mm256_set1_epi32(127); - leadingOne = _mm256_set1_ps(1.0f); - exp = _mm256_sub_epi32(_mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal), _mm256_set1_epi32(0x7f800000)), 23), bias); - logarithm = _mm256_cvtepi32_ps(exp); - - frac = _mm256_or_ps(leadingOne, _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff)))); + float* cPtr = cVector; + const float* bPtr = bVector; + const float* aPtr = aVector; + + unsigned int number = 0; + const unsigned int eighthPoints = num_points / 8; + + __m256 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne; + __m256 tmp, fx, mask, pow2n, z, y; + __m256 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2; + __m256 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5; + __m256i bias, exp, emm0, pi32_0x7f; + + one = _mm256_set1_ps(1.0); + exp_hi = _mm256_set1_ps(88.3762626647949); + exp_lo = _mm256_set1_ps(-88.3762626647949); + ln2 = _mm256_set1_ps(0.6931471805); + log2EF = _mm256_set1_ps(1.44269504088896341); + half = _mm256_set1_ps(0.5); + exp_C1 = _mm256_set1_ps(0.693359375); + exp_C2 = _mm256_set1_ps(-2.12194440e-4); + pi32_0x7f = _mm256_set1_epi32(0x7f); + + exp_p0 = _mm256_set1_ps(1.9875691500e-4); + exp_p1 = _mm256_set1_ps(1.3981999507e-3); + exp_p2 = _mm256_set1_ps(8.3334519073e-3); + exp_p3 = _mm256_set1_ps(4.1665795894e-2); + exp_p4 = _mm256_set1_ps(1.6666665459e-1); + exp_p5 = _mm256_set1_ps(5.0000001201e-1); + + for (; number < eighthPoints; number++) { + // First compute the logarithm + aVal = _mm256_load_ps(aPtr); + bias = _mm256_set1_epi32(127); + leadingOne = _mm256_set1_ps(1.0f); + exp = _mm256_sub_epi32( + _mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal), + _mm256_set1_epi32(0x7f800000)), + 23), + bias); + logarithm = _mm256_cvtepi32_ps(exp); + + frac = _mm256_or_ps( + leadingOne, + _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff)))); #if POW_POLY_DEGREE == 6 - mantissa = POLY5_AVX2( frac, 3.1157899f, -3.3241990f, 2.5988452f, -1.2315303f, 3.1821337e-1f, -3.4436006e-2f); + mantissa = POLY5_AVX2(frac, + 3.1157899f, + -3.3241990f, + 2.5988452f, + -1.2315303f, + 3.1821337e-1f, + -3.4436006e-2f); #elif POW_POLY_DEGREE == 5 - mantissa = POLY4_AVX2( frac, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f); + mantissa = POLY4_AVX2(frac, + 2.8882704548164776201f, + -2.52074962577807006663f, + 1.48116647521213171641f, + -0.465725644288844778798f, + 0.0596515482674574969533f); #elif POW_POLY_DEGREE == 4 - mantissa = POLY3_AVX2( frac, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f); + mantissa = POLY3_AVX2(frac, + 2.61761038894603480148f, + -1.75647175389045657003f, + 0.688243882994381274313f, + -0.107254423828329604454f); #elif POW_POLY_DEGREE == 3 - mantissa = POLY2_AVX2( frac, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f); + mantissa 
= POLY2_AVX2(frac, + 2.28330284476918490682f, + -1.04913055217340124191f, + 0.204446009836232697516f); #else #error #endif - logarithm = _mm256_add_ps(_mm256_mul_ps(mantissa, _mm256_sub_ps(frac, leadingOne)), logarithm); - logarithm = _mm256_mul_ps(logarithm, ln2); + logarithm = _mm256_add_ps( + _mm256_mul_ps(mantissa, _mm256_sub_ps(frac, leadingOne)), logarithm); + logarithm = _mm256_mul_ps(logarithm, ln2); - // Now calculate b*lna - bVal = _mm256_load_ps(bPtr); - bVal = _mm256_mul_ps(bVal, logarithm); + // Now calculate b*lna + bVal = _mm256_load_ps(bPtr); + bVal = _mm256_mul_ps(bVal, logarithm); - // Now compute exp(b*lna) - bVal = _mm256_max_ps(_mm256_min_ps(bVal, exp_hi), exp_lo); + // Now compute exp(b*lna) + bVal = _mm256_max_ps(_mm256_min_ps(bVal, exp_hi), exp_lo); - fx = _mm256_add_ps(_mm256_mul_ps(bVal, log2EF), half); + fx = _mm256_add_ps(_mm256_mul_ps(bVal, log2EF), half); - emm0 = _mm256_cvttps_epi32(fx); - tmp = _mm256_cvtepi32_ps(emm0); + emm0 = _mm256_cvttps_epi32(fx); + tmp = _mm256_cvtepi32_ps(emm0); - mask = _mm256_and_ps(_mm256_cmp_ps(tmp, fx, _CMP_GT_OS), one); - fx = _mm256_sub_ps(tmp, mask); + mask = _mm256_and_ps(_mm256_cmp_ps(tmp, fx, _CMP_GT_OS), one); + fx = _mm256_sub_ps(tmp, mask); - tmp = _mm256_sub_ps(bVal, _mm256_mul_ps(fx, exp_C1)); - bVal = _mm256_sub_ps(tmp, _mm256_mul_ps(fx, exp_C2)); - z = _mm256_mul_ps(bVal, bVal); + tmp = _mm256_sub_ps(bVal, _mm256_mul_ps(fx, exp_C1)); + bVal = _mm256_sub_ps(tmp, _mm256_mul_ps(fx, exp_C2)); + z = _mm256_mul_ps(bVal, bVal); - y = _mm256_add_ps(_mm256_mul_ps(exp_p0, bVal), exp_p1); - y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p2); - y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p3); - y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p4); - y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p5); - y = _mm256_add_ps(_mm256_mul_ps(y, z), bVal); - y = _mm256_add_ps(y, one); + y = _mm256_add_ps(_mm256_mul_ps(exp_p0, bVal), exp_p1); + y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p2); + y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p3); + y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p4); + y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p5); + y = _mm256_add_ps(_mm256_mul_ps(y, z), bVal); + y = _mm256_add_ps(y, one); - emm0 = _mm256_slli_epi32(_mm256_add_epi32(_mm256_cvttps_epi32(fx), pi32_0x7f), 23); + emm0 = + _mm256_slli_epi32(_mm256_add_epi32(_mm256_cvttps_epi32(fx), pi32_0x7f), 23); pow2n = _mm256_castsi256_ps(emm0); cVal = _mm256_mul_ps(y, pow2n); @@ -300,12 +363,12 @@ volk_32f_x2_pow_32f_a_avx2(float* cVector, const float* bVector, aPtr += 8; bPtr += 8; cPtr += 8; - } + } - number = eighthPoints * 8; - for(;number < num_points; number++){ - *cPtr++ = pow(*aPtr++, *bPtr++); - } + number = eighthPoints * 8; + for (; number < num_points; number++) { + *cPtr++ = pow(*aPtr++, *bPtr++); + } } #endif /* LV_HAVE_AVX2 for aligned */ @@ -317,97 +380,124 @@ volk_32f_x2_pow_32f_a_avx2(float* cVector, const float* bVector, #define POLY0(x, c0) _mm_set1_ps(c0) #define POLY1(x, c0, c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), _mm_set1_ps(c0)) #define POLY2(x, c0, c1, c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), _mm_set1_ps(c0)) -#define POLY3(x, c0, c1, c2, c3) _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0)) -#define POLY4(x, c0, c1, c2, c3, c4) _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0)) -#define POLY5(x, c0, c1, c2, c3, c4, c5) _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0)) - -static inline void -volk_32f_x2_pow_32f_a_sse4_1(float* cVector, const float* bVector, - 
const float* aVector, unsigned int num_points) +#define POLY3(x, c0, c1, c2, c3) \ + _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0)) +#define POLY4(x, c0, c1, c2, c3, c4) \ + _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0)) +#define POLY5(x, c0, c1, c2, c3, c4, c5) \ + _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0)) + +static inline void volk_32f_x2_pow_32f_a_sse4_1(float* cVector, + const float* bVector, + const float* aVector, + unsigned int num_points) { - float* cPtr = cVector; - const float* bPtr = bVector; - const float* aPtr = aVector; - - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - __m128 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne; - __m128 tmp, fx, mask, pow2n, z, y; - __m128 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2; - __m128 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5; - __m128i bias, exp, emm0, pi32_0x7f; - - one = _mm_set1_ps(1.0); - exp_hi = _mm_set1_ps(88.3762626647949); - exp_lo = _mm_set1_ps(-88.3762626647949); - ln2 = _mm_set1_ps(0.6931471805); - log2EF = _mm_set1_ps(1.44269504088896341); - half = _mm_set1_ps(0.5); - exp_C1 = _mm_set1_ps(0.693359375); - exp_C2 = _mm_set1_ps(-2.12194440e-4); - pi32_0x7f = _mm_set1_epi32(0x7f); - - exp_p0 = _mm_set1_ps(1.9875691500e-4); - exp_p1 = _mm_set1_ps(1.3981999507e-3); - exp_p2 = _mm_set1_ps(8.3334519073e-3); - exp_p3 = _mm_set1_ps(4.1665795894e-2); - exp_p4 = _mm_set1_ps(1.6666665459e-1); - exp_p5 = _mm_set1_ps(5.0000001201e-1); - - for(;number < quarterPoints; number++){ - // First compute the logarithm - aVal = _mm_load_ps(aPtr); - bias = _mm_set1_epi32(127); - leadingOne = _mm_set1_ps(1.0f); - exp = _mm_sub_epi32(_mm_srli_epi32(_mm_and_si128(_mm_castps_si128(aVal), _mm_set1_epi32(0x7f800000)), 23), bias); - logarithm = _mm_cvtepi32_ps(exp); - - frac = _mm_or_ps(leadingOne, _mm_and_ps(aVal, _mm_castsi128_ps(_mm_set1_epi32(0x7fffff)))); + float* cPtr = cVector; + const float* bPtr = bVector; + const float* aPtr = aVector; + + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + __m128 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne; + __m128 tmp, fx, mask, pow2n, z, y; + __m128 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2; + __m128 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5; + __m128i bias, exp, emm0, pi32_0x7f; + + one = _mm_set1_ps(1.0); + exp_hi = _mm_set1_ps(88.3762626647949); + exp_lo = _mm_set1_ps(-88.3762626647949); + ln2 = _mm_set1_ps(0.6931471805); + log2EF = _mm_set1_ps(1.44269504088896341); + half = _mm_set1_ps(0.5); + exp_C1 = _mm_set1_ps(0.693359375); + exp_C2 = _mm_set1_ps(-2.12194440e-4); + pi32_0x7f = _mm_set1_epi32(0x7f); + + exp_p0 = _mm_set1_ps(1.9875691500e-4); + exp_p1 = _mm_set1_ps(1.3981999507e-3); + exp_p2 = _mm_set1_ps(8.3334519073e-3); + exp_p3 = _mm_set1_ps(4.1665795894e-2); + exp_p4 = _mm_set1_ps(1.6666665459e-1); + exp_p5 = _mm_set1_ps(5.0000001201e-1); + + for (; number < quarterPoints; number++) { + // First compute the logarithm + aVal = _mm_load_ps(aPtr); + bias = _mm_set1_epi32(127); + leadingOne = _mm_set1_ps(1.0f); + exp = _mm_sub_epi32( + _mm_srli_epi32( + _mm_and_si128(_mm_castps_si128(aVal), _mm_set1_epi32(0x7f800000)), 23), + bias); + logarithm = _mm_cvtepi32_ps(exp); + + frac = _mm_or_ps(leadingOne, + _mm_and_ps(aVal, _mm_castsi128_ps(_mm_set1_epi32(0x7fffff)))); #if POW_POLY_DEGREE == 6 - mantissa = POLY5( frac, 3.1157899f, -3.3241990f, 2.5988452f, -1.2315303f, 3.1821337e-1f, -3.4436006e-2f); + mantissa = 
POLY5(frac, + 3.1157899f, + -3.3241990f, + 2.5988452f, + -1.2315303f, + 3.1821337e-1f, + -3.4436006e-2f); #elif POW_POLY_DEGREE == 5 - mantissa = POLY4( frac, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f); + mantissa = POLY4(frac, + 2.8882704548164776201f, + -2.52074962577807006663f, + 1.48116647521213171641f, + -0.465725644288844778798f, + 0.0596515482674574969533f); #elif POW_POLY_DEGREE == 4 - mantissa = POLY3( frac, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f); + mantissa = POLY3(frac, + 2.61761038894603480148f, + -1.75647175389045657003f, + 0.688243882994381274313f, + -0.107254423828329604454f); #elif POW_POLY_DEGREE == 3 - mantissa = POLY2( frac, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f); + mantissa = POLY2(frac, + 2.28330284476918490682f, + -1.04913055217340124191f, + 0.204446009836232697516f); #else #error #endif - logarithm = _mm_add_ps(logarithm, _mm_mul_ps(mantissa, _mm_sub_ps(frac, leadingOne))); - logarithm = _mm_mul_ps(logarithm, ln2); + logarithm = + _mm_add_ps(logarithm, _mm_mul_ps(mantissa, _mm_sub_ps(frac, leadingOne))); + logarithm = _mm_mul_ps(logarithm, ln2); - // Now calculate b*lna - bVal = _mm_load_ps(bPtr); - bVal = _mm_mul_ps(bVal, logarithm); + // Now calculate b*lna + bVal = _mm_load_ps(bPtr); + bVal = _mm_mul_ps(bVal, logarithm); - // Now compute exp(b*lna) - bVal = _mm_max_ps(_mm_min_ps(bVal, exp_hi), exp_lo); + // Now compute exp(b*lna) + bVal = _mm_max_ps(_mm_min_ps(bVal, exp_hi), exp_lo); - fx = _mm_add_ps(_mm_mul_ps(bVal, log2EF), half); + fx = _mm_add_ps(_mm_mul_ps(bVal, log2EF), half); - emm0 = _mm_cvttps_epi32(fx); - tmp = _mm_cvtepi32_ps(emm0); + emm0 = _mm_cvttps_epi32(fx); + tmp = _mm_cvtepi32_ps(emm0); - mask = _mm_and_ps(_mm_cmpgt_ps(tmp, fx), one); - fx = _mm_sub_ps(tmp, mask); + mask = _mm_and_ps(_mm_cmpgt_ps(tmp, fx), one); + fx = _mm_sub_ps(tmp, mask); - tmp = _mm_mul_ps(fx, exp_C1); - z = _mm_mul_ps(fx, exp_C2); - bVal = _mm_sub_ps(_mm_sub_ps(bVal, tmp), z); - z = _mm_mul_ps(bVal, bVal); + tmp = _mm_mul_ps(fx, exp_C1); + z = _mm_mul_ps(fx, exp_C2); + bVal = _mm_sub_ps(_mm_sub_ps(bVal, tmp), z); + z = _mm_mul_ps(bVal, bVal); - y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(exp_p0, bVal), exp_p1), bVal); - y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p2), bVal), exp_p3); - y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(y, bVal), exp_p4), bVal); - y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p5), z), bVal); - y = _mm_add_ps(y, one); + y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(exp_p0, bVal), exp_p1), bVal); + y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p2), bVal), exp_p3); + y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(y, bVal), exp_p4), bVal); + y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p5), z), bVal); + y = _mm_add_ps(y, one); - emm0 = _mm_slli_epi32(_mm_add_epi32(_mm_cvttps_epi32(fx), pi32_0x7f), 23); + emm0 = _mm_slli_epi32(_mm_add_epi32(_mm_cvttps_epi32(fx), pi32_0x7f), 23); pow2n = _mm_castsi128_ps(emm0); cVal = _mm_mul_ps(y, pow2n); @@ -417,12 +507,12 @@ volk_32f_x2_pow_32f_a_sse4_1(float* cVector, const float* bVector, aPtr += 4; bPtr += 4; cPtr += 4; - } + } - number = quarterPoints * 4; - for(;number < num_points; number++){ - *cPtr++ = powf(*aPtr++, *bPtr++); - } + number = quarterPoints * 4; + for (; number < num_points; number++) { + *cPtr++ = powf(*aPtr++, *bPtr++); + } } #endif /* LV_HAVE_SSE4_1 for aligned */ @@ -432,27 +522,28 @@ volk_32f_x2_pow_32f_a_sse4_1(float* cVector, const float* 
bVector, #ifndef INCLUDED_volk_32f_x2_pow_32f_u_H #define INCLUDED_volk_32f_x2_pow_32f_u_H -#include -#include #include #include +#include +#include #define POW_POLY_DEGREE 3 #ifdef LV_HAVE_GENERIC -static inline void -volk_32f_x2_pow_32f_generic(float* cVector, const float* bVector, - const float* aVector, unsigned int num_points) +static inline void volk_32f_x2_pow_32f_generic(float* cVector, + const float* bVector, + const float* aVector, + unsigned int num_points) { - float* cPtr = cVector; - const float* bPtr = bVector; - const float* aPtr = aVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - *cPtr++ = powf(*aPtr++, *bPtr++); - } + float* cPtr = cVector; + const float* bPtr = bVector; + const float* aPtr = aVector; + unsigned int number = 0; + + for (number = 0; number < num_points; number++) { + *cPtr++ = powf(*aPtr++, *bPtr++); + } } #endif /* LV_HAVE_GENERIC */ @@ -463,112 +554,139 @@ volk_32f_x2_pow_32f_generic(float* cVector, const float* bVector, #define POLY0(x, c0) _mm_set1_ps(c0) #define POLY1(x, c0, c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), _mm_set1_ps(c0)) #define POLY2(x, c0, c1, c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), _mm_set1_ps(c0)) -#define POLY3(x, c0, c1, c2, c3) _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0)) -#define POLY4(x, c0, c1, c2, c3, c4) _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0)) -#define POLY5(x, c0, c1, c2, c3, c4, c5) _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0)) - -static inline void -volk_32f_x2_pow_32f_u_sse4_1(float* cVector, const float* bVector, - const float* aVector, unsigned int num_points) +#define POLY3(x, c0, c1, c2, c3) \ + _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0)) +#define POLY4(x, c0, c1, c2, c3, c4) \ + _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0)) +#define POLY5(x, c0, c1, c2, c3, c4, c5) \ + _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0)) + +static inline void volk_32f_x2_pow_32f_u_sse4_1(float* cVector, + const float* bVector, + const float* aVector, + unsigned int num_points) { - float* cPtr = cVector; - const float* bPtr = bVector; - const float* aPtr = aVector; - - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - __m128 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne; - __m128 tmp, fx, mask, pow2n, z, y; - __m128 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2; - __m128 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5; - __m128i bias, exp, emm0, pi32_0x7f; - - one = _mm_set1_ps(1.0); - exp_hi = _mm_set1_ps(88.3762626647949); - exp_lo = _mm_set1_ps(-88.3762626647949); - ln2 = _mm_set1_ps(0.6931471805); - log2EF = _mm_set1_ps(1.44269504088896341); - half = _mm_set1_ps(0.5); - exp_C1 = _mm_set1_ps(0.693359375); - exp_C2 = _mm_set1_ps(-2.12194440e-4); - pi32_0x7f = _mm_set1_epi32(0x7f); - - exp_p0 = _mm_set1_ps(1.9875691500e-4); - exp_p1 = _mm_set1_ps(1.3981999507e-3); - exp_p2 = _mm_set1_ps(8.3334519073e-3); - exp_p3 = _mm_set1_ps(4.1665795894e-2); - exp_p4 = _mm_set1_ps(1.6666665459e-1); - exp_p5 = _mm_set1_ps(5.0000001201e-1); - - for(;number < quarterPoints; number++){ - // First compute the logarithm - aVal = _mm_loadu_ps(aPtr); - bias = _mm_set1_epi32(127); - leadingOne = _mm_set1_ps(1.0f); - exp = _mm_sub_epi32(_mm_srli_epi32(_mm_and_si128(_mm_castps_si128(aVal), _mm_set1_epi32(0x7f800000)), 23), bias); - logarithm = _mm_cvtepi32_ps(exp); - - frac = _mm_or_ps(leadingOne, _mm_and_ps(aVal, 
_mm_castsi128_ps(_mm_set1_epi32(0x7fffff)))); + float* cPtr = cVector; + const float* bPtr = bVector; + const float* aPtr = aVector; + + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + __m128 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne; + __m128 tmp, fx, mask, pow2n, z, y; + __m128 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2; + __m128 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5; + __m128i bias, exp, emm0, pi32_0x7f; + + one = _mm_set1_ps(1.0); + exp_hi = _mm_set1_ps(88.3762626647949); + exp_lo = _mm_set1_ps(-88.3762626647949); + ln2 = _mm_set1_ps(0.6931471805); + log2EF = _mm_set1_ps(1.44269504088896341); + half = _mm_set1_ps(0.5); + exp_C1 = _mm_set1_ps(0.693359375); + exp_C2 = _mm_set1_ps(-2.12194440e-4); + pi32_0x7f = _mm_set1_epi32(0x7f); + + exp_p0 = _mm_set1_ps(1.9875691500e-4); + exp_p1 = _mm_set1_ps(1.3981999507e-3); + exp_p2 = _mm_set1_ps(8.3334519073e-3); + exp_p3 = _mm_set1_ps(4.1665795894e-2); + exp_p4 = _mm_set1_ps(1.6666665459e-1); + exp_p5 = _mm_set1_ps(5.0000001201e-1); + + for (; number < quarterPoints; number++) { + // First compute the logarithm + aVal = _mm_loadu_ps(aPtr); + bias = _mm_set1_epi32(127); + leadingOne = _mm_set1_ps(1.0f); + exp = _mm_sub_epi32( + _mm_srli_epi32( + _mm_and_si128(_mm_castps_si128(aVal), _mm_set1_epi32(0x7f800000)), 23), + bias); + logarithm = _mm_cvtepi32_ps(exp); + + frac = _mm_or_ps(leadingOne, + _mm_and_ps(aVal, _mm_castsi128_ps(_mm_set1_epi32(0x7fffff)))); #if POW_POLY_DEGREE == 6 - mantissa = POLY5( frac, 3.1157899f, -3.3241990f, 2.5988452f, -1.2315303f, 3.1821337e-1f, -3.4436006e-2f); + mantissa = POLY5(frac, + 3.1157899f, + -3.3241990f, + 2.5988452f, + -1.2315303f, + 3.1821337e-1f, + -3.4436006e-2f); #elif POW_POLY_DEGREE == 5 - mantissa = POLY4( frac, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f); + mantissa = POLY4(frac, + 2.8882704548164776201f, + -2.52074962577807006663f, + 1.48116647521213171641f, + -0.465725644288844778798f, + 0.0596515482674574969533f); #elif POW_POLY_DEGREE == 4 - mantissa = POLY3( frac, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f); + mantissa = POLY3(frac, + 2.61761038894603480148f, + -1.75647175389045657003f, + 0.688243882994381274313f, + -0.107254423828329604454f); #elif POW_POLY_DEGREE == 3 - mantissa = POLY2( frac, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f); + mantissa = POLY2(frac, + 2.28330284476918490682f, + -1.04913055217340124191f, + 0.204446009836232697516f); #else #error #endif - logarithm = _mm_add_ps(logarithm, _mm_mul_ps(mantissa, _mm_sub_ps(frac, leadingOne))); - logarithm = _mm_mul_ps(logarithm, ln2); + logarithm = + _mm_add_ps(logarithm, _mm_mul_ps(mantissa, _mm_sub_ps(frac, leadingOne))); + logarithm = _mm_mul_ps(logarithm, ln2); - // Now calculate b*lna - bVal = _mm_loadu_ps(bPtr); - bVal = _mm_mul_ps(bVal, logarithm); + // Now calculate b*lna + bVal = _mm_loadu_ps(bPtr); + bVal = _mm_mul_ps(bVal, logarithm); - // Now compute exp(b*lna) - bVal = _mm_max_ps(_mm_min_ps(bVal, exp_hi), exp_lo); + // Now compute exp(b*lna) + bVal = _mm_max_ps(_mm_min_ps(bVal, exp_hi), exp_lo); - fx = _mm_add_ps(_mm_mul_ps(bVal, log2EF), half); + fx = _mm_add_ps(_mm_mul_ps(bVal, log2EF), half); - emm0 = _mm_cvttps_epi32(fx); - tmp = _mm_cvtepi32_ps(emm0); + emm0 = _mm_cvttps_epi32(fx); + tmp = _mm_cvtepi32_ps(emm0); - mask = _mm_and_ps(_mm_cmpgt_ps(tmp, fx), one); - fx = 
_mm_sub_ps(tmp, mask); + mask = _mm_and_ps(_mm_cmpgt_ps(tmp, fx), one); + fx = _mm_sub_ps(tmp, mask); - tmp = _mm_mul_ps(fx, exp_C1); - z = _mm_mul_ps(fx, exp_C2); - bVal = _mm_sub_ps(_mm_sub_ps(bVal, tmp), z); - z = _mm_mul_ps(bVal, bVal); + tmp = _mm_mul_ps(fx, exp_C1); + z = _mm_mul_ps(fx, exp_C2); + bVal = _mm_sub_ps(_mm_sub_ps(bVal, tmp), z); + z = _mm_mul_ps(bVal, bVal); - y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(exp_p0, bVal), exp_p1), bVal); - y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p2), bVal), exp_p3); - y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(y, bVal), exp_p4), bVal); - y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p5), z), bVal); - y = _mm_add_ps(y, one); + y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(exp_p0, bVal), exp_p1), bVal); + y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p2), bVal), exp_p3); + y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(y, bVal), exp_p4), bVal); + y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p5), z), bVal); + y = _mm_add_ps(y, one); - emm0 = _mm_slli_epi32(_mm_add_epi32(_mm_cvttps_epi32(fx), pi32_0x7f), 23); + emm0 = _mm_slli_epi32(_mm_add_epi32(_mm_cvttps_epi32(fx), pi32_0x7f), 23); - pow2n = _mm_castsi128_ps(emm0); - cVal = _mm_mul_ps(y, pow2n); + pow2n = _mm_castsi128_ps(emm0); + cVal = _mm_mul_ps(y, pow2n); - _mm_storeu_ps(cPtr, cVal); + _mm_storeu_ps(cPtr, cVal); - aPtr += 4; - bPtr += 4; - cPtr += 4; - } + aPtr += 4; + bPtr += 4; + cPtr += 4; + } - number = quarterPoints * 4; - for(;number < num_points; number++){ - *cPtr++ = powf(*aPtr++, *bPtr++); - } + number = quarterPoints * 4; + for (; number < num_points; number++) { + *cPtr++ = powf(*aPtr++, *bPtr++); + } } #endif /* LV_HAVE_SSE4_1 for unaligned */ @@ -577,100 +695,131 @@ volk_32f_x2_pow_32f_u_sse4_1(float* cVector, const float* bVector, #include #define POLY0_AVX2_FMA(x, c0) _mm256_set1_ps(c0) -#define POLY1_AVX2_FMA(x, c0, c1) _mm256_fmadd_ps(POLY0_AVX2_FMA(x, c1), x, _mm256_set1_ps(c0)) -#define POLY2_AVX2_FMA(x, c0, c1, c2) _mm256_fmadd_ps(POLY1_AVX2_FMA(x, c1, c2), x, _mm256_set1_ps(c0)) -#define POLY3_AVX2_FMA(x, c0, c1, c2, c3) _mm256_fmadd_ps(POLY2_AVX2_FMA(x, c1, c2, c3), x, _mm256_set1_ps(c0)) -#define POLY4_AVX2_FMA(x, c0, c1, c2, c3, c4) _mm256_fmadd_ps(POLY3_AVX2_FMA(x, c1, c2, c3, c4), x, _mm256_set1_ps(c0)) -#define POLY5_AVX2_FMA(x, c0, c1, c2, c3, c4, c5) _mm256_fmadd_ps(POLY4_AVX2_FMA(x, c1, c2, c3, c4, c5), x, _mm256_set1_ps(c0)) - -static inline void -volk_32f_x2_pow_32f_u_avx2_fma(float* cVector, const float* bVector, - const float* aVector, unsigned int num_points) +#define POLY1_AVX2_FMA(x, c0, c1) \ + _mm256_fmadd_ps(POLY0_AVX2_FMA(x, c1), x, _mm256_set1_ps(c0)) +#define POLY2_AVX2_FMA(x, c0, c1, c2) \ + _mm256_fmadd_ps(POLY1_AVX2_FMA(x, c1, c2), x, _mm256_set1_ps(c0)) +#define POLY3_AVX2_FMA(x, c0, c1, c2, c3) \ + _mm256_fmadd_ps(POLY2_AVX2_FMA(x, c1, c2, c3), x, _mm256_set1_ps(c0)) +#define POLY4_AVX2_FMA(x, c0, c1, c2, c3, c4) \ + _mm256_fmadd_ps(POLY3_AVX2_FMA(x, c1, c2, c3, c4), x, _mm256_set1_ps(c0)) +#define POLY5_AVX2_FMA(x, c0, c1, c2, c3, c4, c5) \ + _mm256_fmadd_ps(POLY4_AVX2_FMA(x, c1, c2, c3, c4, c5), x, _mm256_set1_ps(c0)) + +static inline void volk_32f_x2_pow_32f_u_avx2_fma(float* cVector, + const float* bVector, + const float* aVector, + unsigned int num_points) { - float* cPtr = cVector; - const float* bPtr = bVector; - const float* aPtr = aVector; - - unsigned int number = 0; - const unsigned int eighthPoints = num_points / 8; - - __m256 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne; - __m256 tmp, fx, mask, pow2n, z, y; - __m256 one, exp_hi, exp_lo, ln2, log2EF, 
half, exp_C1, exp_C2; - __m256 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5; - __m256i bias, exp, emm0, pi32_0x7f; - - one = _mm256_set1_ps(1.0); - exp_hi = _mm256_set1_ps(88.3762626647949); - exp_lo = _mm256_set1_ps(-88.3762626647949); - ln2 = _mm256_set1_ps(0.6931471805); - log2EF = _mm256_set1_ps(1.44269504088896341); - half = _mm256_set1_ps(0.5); - exp_C1 = _mm256_set1_ps(0.693359375); - exp_C2 = _mm256_set1_ps(-2.12194440e-4); - pi32_0x7f = _mm256_set1_epi32(0x7f); - - exp_p0 = _mm256_set1_ps(1.9875691500e-4); - exp_p1 = _mm256_set1_ps(1.3981999507e-3); - exp_p2 = _mm256_set1_ps(8.3334519073e-3); - exp_p3 = _mm256_set1_ps(4.1665795894e-2); - exp_p4 = _mm256_set1_ps(1.6666665459e-1); - exp_p5 = _mm256_set1_ps(5.0000001201e-1); - - for(;number < eighthPoints; number++){ - // First compute the logarithm - aVal = _mm256_loadu_ps(aPtr); - bias = _mm256_set1_epi32(127); - leadingOne = _mm256_set1_ps(1.0f); - exp = _mm256_sub_epi32(_mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal), _mm256_set1_epi32(0x7f800000)), 23), bias); - logarithm = _mm256_cvtepi32_ps(exp); - - frac = _mm256_or_ps(leadingOne, _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff)))); + float* cPtr = cVector; + const float* bPtr = bVector; + const float* aPtr = aVector; + + unsigned int number = 0; + const unsigned int eighthPoints = num_points / 8; + + __m256 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne; + __m256 tmp, fx, mask, pow2n, z, y; + __m256 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2; + __m256 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5; + __m256i bias, exp, emm0, pi32_0x7f; + + one = _mm256_set1_ps(1.0); + exp_hi = _mm256_set1_ps(88.3762626647949); + exp_lo = _mm256_set1_ps(-88.3762626647949); + ln2 = _mm256_set1_ps(0.6931471805); + log2EF = _mm256_set1_ps(1.44269504088896341); + half = _mm256_set1_ps(0.5); + exp_C1 = _mm256_set1_ps(0.693359375); + exp_C2 = _mm256_set1_ps(-2.12194440e-4); + pi32_0x7f = _mm256_set1_epi32(0x7f); + + exp_p0 = _mm256_set1_ps(1.9875691500e-4); + exp_p1 = _mm256_set1_ps(1.3981999507e-3); + exp_p2 = _mm256_set1_ps(8.3334519073e-3); + exp_p3 = _mm256_set1_ps(4.1665795894e-2); + exp_p4 = _mm256_set1_ps(1.6666665459e-1); + exp_p5 = _mm256_set1_ps(5.0000001201e-1); + + for (; number < eighthPoints; number++) { + // First compute the logarithm + aVal = _mm256_loadu_ps(aPtr); + bias = _mm256_set1_epi32(127); + leadingOne = _mm256_set1_ps(1.0f); + exp = _mm256_sub_epi32( + _mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal), + _mm256_set1_epi32(0x7f800000)), + 23), + bias); + logarithm = _mm256_cvtepi32_ps(exp); + + frac = _mm256_or_ps( + leadingOne, + _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff)))); #if POW_POLY_DEGREE == 6 - mantissa = POLY5_AVX2_FMA( frac, 3.1157899f, -3.3241990f, 2.5988452f, -1.2315303f, 3.1821337e-1f, -3.4436006e-2f); + mantissa = POLY5_AVX2_FMA(frac, + 3.1157899f, + -3.3241990f, + 2.5988452f, + -1.2315303f, + 3.1821337e-1f, + -3.4436006e-2f); #elif POW_POLY_DEGREE == 5 - mantissa = POLY4_AVX2_FMA( frac, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f); + mantissa = POLY4_AVX2_FMA(frac, + 2.8882704548164776201f, + -2.52074962577807006663f, + 1.48116647521213171641f, + -0.465725644288844778798f, + 0.0596515482674574969533f); #elif POW_POLY_DEGREE == 4 - mantissa = POLY3_AVX2_FMA( frac, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f); + mantissa = 
POLY3_AVX2_FMA(frac, + 2.61761038894603480148f, + -1.75647175389045657003f, + 0.688243882994381274313f, + -0.107254423828329604454f); #elif POW_POLY_DEGREE == 3 - mantissa = POLY2_AVX2_FMA( frac, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f); + mantissa = POLY2_AVX2_FMA(frac, + 2.28330284476918490682f, + -1.04913055217340124191f, + 0.204446009836232697516f); #else #error #endif - logarithm = _mm256_fmadd_ps(mantissa, _mm256_sub_ps(frac, leadingOne), logarithm); - logarithm = _mm256_mul_ps(logarithm, ln2); + logarithm = _mm256_fmadd_ps(mantissa, _mm256_sub_ps(frac, leadingOne), logarithm); + logarithm = _mm256_mul_ps(logarithm, ln2); - // Now calculate b*lna - bVal = _mm256_loadu_ps(bPtr); - bVal = _mm256_mul_ps(bVal, logarithm); + // Now calculate b*lna + bVal = _mm256_loadu_ps(bPtr); + bVal = _mm256_mul_ps(bVal, logarithm); - // Now compute exp(b*lna) - bVal = _mm256_max_ps(_mm256_min_ps(bVal, exp_hi), exp_lo); + // Now compute exp(b*lna) + bVal = _mm256_max_ps(_mm256_min_ps(bVal, exp_hi), exp_lo); - fx = _mm256_fmadd_ps(bVal, log2EF, half); + fx = _mm256_fmadd_ps(bVal, log2EF, half); - emm0 = _mm256_cvttps_epi32(fx); - tmp = _mm256_cvtepi32_ps(emm0); + emm0 = _mm256_cvttps_epi32(fx); + tmp = _mm256_cvtepi32_ps(emm0); - mask = _mm256_and_ps(_mm256_cmp_ps(tmp, fx, _CMP_GT_OS), one); - fx = _mm256_sub_ps(tmp, mask); + mask = _mm256_and_ps(_mm256_cmp_ps(tmp, fx, _CMP_GT_OS), one); + fx = _mm256_sub_ps(tmp, mask); - tmp = _mm256_fnmadd_ps(fx, exp_C1, bVal); - bVal = _mm256_fnmadd_ps(fx, exp_C2, tmp); - z = _mm256_mul_ps(bVal, bVal); + tmp = _mm256_fnmadd_ps(fx, exp_C1, bVal); + bVal = _mm256_fnmadd_ps(fx, exp_C2, tmp); + z = _mm256_mul_ps(bVal, bVal); - y = _mm256_fmadd_ps(exp_p0, bVal, exp_p1); - y = _mm256_fmadd_ps(y, bVal, exp_p2); - y = _mm256_fmadd_ps(y, bVal, exp_p3); - y = _mm256_fmadd_ps(y, bVal, exp_p4); - y = _mm256_fmadd_ps(y, bVal, exp_p5); - y = _mm256_fmadd_ps(y, z, bVal); - y = _mm256_add_ps(y, one); + y = _mm256_fmadd_ps(exp_p0, bVal, exp_p1); + y = _mm256_fmadd_ps(y, bVal, exp_p2); + y = _mm256_fmadd_ps(y, bVal, exp_p3); + y = _mm256_fmadd_ps(y, bVal, exp_p4); + y = _mm256_fmadd_ps(y, bVal, exp_p5); + y = _mm256_fmadd_ps(y, z, bVal); + y = _mm256_add_ps(y, one); - emm0 = _mm256_slli_epi32(_mm256_add_epi32(_mm256_cvttps_epi32(fx), pi32_0x7f), 23); + emm0 = + _mm256_slli_epi32(_mm256_add_epi32(_mm256_cvttps_epi32(fx), pi32_0x7f), 23); pow2n = _mm256_castsi256_ps(emm0); cVal = _mm256_mul_ps(y, pow2n); @@ -680,12 +829,12 @@ volk_32f_x2_pow_32f_u_avx2_fma(float* cVector, const float* bVector, aPtr += 8; bPtr += 8; cPtr += 8; - } + } - number = eighthPoints * 8; - for(;number < num_points; number++){ - *cPtr++ = pow(*aPtr++, *bPtr++); - } + number = eighthPoints * 8; + for (; number < num_points; number++) { + *cPtr++ = pow(*aPtr++, *bPtr++); + } } #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for unaligned */ @@ -694,99 +843,131 @@ volk_32f_x2_pow_32f_u_avx2_fma(float* cVector, const float* bVector, #include #define POLY0_AVX2(x, c0) _mm256_set1_ps(c0) -#define POLY1_AVX2(x, c0, c1) _mm256_add_ps(_mm256_mul_ps(POLY0_AVX2(x, c1), x), _mm256_set1_ps(c0)) -#define POLY2_AVX2(x, c0, c1, c2) _mm256_add_ps(_mm256_mul_ps(POLY1_AVX2(x, c1, c2), x), _mm256_set1_ps(c0)) -#define POLY3_AVX2(x, c0, c1, c2, c3) _mm256_add_ps(_mm256_mul_ps(POLY2_AVX2(x, c1, c2, c3), x), _mm256_set1_ps(c0)) -#define POLY4_AVX2(x, c0, c1, c2, c3, c4) _mm256_add_ps(_mm256_mul_ps(POLY3_AVX2(x, c1, c2, c3, c4), x), _mm256_set1_ps(c0)) -#define POLY5_AVX2(x, c0, c1, c2, c3, c4, c5) 
_mm256_add_ps(_mm256_mul_ps(POLY4_AVX2(x, c1, c2, c3, c4, c5), x), _mm256_set1_ps(c0)) - -static inline void -volk_32f_x2_pow_32f_u_avx2(float* cVector, const float* bVector, - const float* aVector, unsigned int num_points) +#define POLY1_AVX2(x, c0, c1) \ + _mm256_add_ps(_mm256_mul_ps(POLY0_AVX2(x, c1), x), _mm256_set1_ps(c0)) +#define POLY2_AVX2(x, c0, c1, c2) \ + _mm256_add_ps(_mm256_mul_ps(POLY1_AVX2(x, c1, c2), x), _mm256_set1_ps(c0)) +#define POLY3_AVX2(x, c0, c1, c2, c3) \ + _mm256_add_ps(_mm256_mul_ps(POLY2_AVX2(x, c1, c2, c3), x), _mm256_set1_ps(c0)) +#define POLY4_AVX2(x, c0, c1, c2, c3, c4) \ + _mm256_add_ps(_mm256_mul_ps(POLY3_AVX2(x, c1, c2, c3, c4), x), _mm256_set1_ps(c0)) +#define POLY5_AVX2(x, c0, c1, c2, c3, c4, c5) \ + _mm256_add_ps(_mm256_mul_ps(POLY4_AVX2(x, c1, c2, c3, c4, c5), x), _mm256_set1_ps(c0)) + +static inline void volk_32f_x2_pow_32f_u_avx2(float* cVector, + const float* bVector, + const float* aVector, + unsigned int num_points) { - float* cPtr = cVector; - const float* bPtr = bVector; - const float* aPtr = aVector; - - unsigned int number = 0; - const unsigned int eighthPoints = num_points / 8; - - __m256 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne; - __m256 tmp, fx, mask, pow2n, z, y; - __m256 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2; - __m256 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5; - __m256i bias, exp, emm0, pi32_0x7f; - - one = _mm256_set1_ps(1.0); - exp_hi = _mm256_set1_ps(88.3762626647949); - exp_lo = _mm256_set1_ps(-88.3762626647949); - ln2 = _mm256_set1_ps(0.6931471805); - log2EF = _mm256_set1_ps(1.44269504088896341); - half = _mm256_set1_ps(0.5); - exp_C1 = _mm256_set1_ps(0.693359375); - exp_C2 = _mm256_set1_ps(-2.12194440e-4); - pi32_0x7f = _mm256_set1_epi32(0x7f); - - exp_p0 = _mm256_set1_ps(1.9875691500e-4); - exp_p1 = _mm256_set1_ps(1.3981999507e-3); - exp_p2 = _mm256_set1_ps(8.3334519073e-3); - exp_p3 = _mm256_set1_ps(4.1665795894e-2); - exp_p4 = _mm256_set1_ps(1.6666665459e-1); - exp_p5 = _mm256_set1_ps(5.0000001201e-1); - - for(;number < eighthPoints; number++){ - // First compute the logarithm - aVal = _mm256_loadu_ps(aPtr); - bias = _mm256_set1_epi32(127); - leadingOne = _mm256_set1_ps(1.0f); - exp = _mm256_sub_epi32(_mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal), _mm256_set1_epi32(0x7f800000)), 23), bias); - logarithm = _mm256_cvtepi32_ps(exp); - - frac = _mm256_or_ps(leadingOne, _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff)))); + float* cPtr = cVector; + const float* bPtr = bVector; + const float* aPtr = aVector; + + unsigned int number = 0; + const unsigned int eighthPoints = num_points / 8; + + __m256 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne; + __m256 tmp, fx, mask, pow2n, z, y; + __m256 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2; + __m256 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5; + __m256i bias, exp, emm0, pi32_0x7f; + + one = _mm256_set1_ps(1.0); + exp_hi = _mm256_set1_ps(88.3762626647949); + exp_lo = _mm256_set1_ps(-88.3762626647949); + ln2 = _mm256_set1_ps(0.6931471805); + log2EF = _mm256_set1_ps(1.44269504088896341); + half = _mm256_set1_ps(0.5); + exp_C1 = _mm256_set1_ps(0.693359375); + exp_C2 = _mm256_set1_ps(-2.12194440e-4); + pi32_0x7f = _mm256_set1_epi32(0x7f); + + exp_p0 = _mm256_set1_ps(1.9875691500e-4); + exp_p1 = _mm256_set1_ps(1.3981999507e-3); + exp_p2 = _mm256_set1_ps(8.3334519073e-3); + exp_p3 = _mm256_set1_ps(4.1665795894e-2); + exp_p4 = _mm256_set1_ps(1.6666665459e-1); + exp_p5 = _mm256_set1_ps(5.0000001201e-1); + + 
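    /*
     * Reference sketch (illustrative only; not one of the formatted kernels):
     * the loop below computes pow(a, b) as exp(b * ln(a)).  ln(a) is recovered
     * from the IEEE-754 bit pattern (the unbiased exponent plus a low-degree
     * polynomial in the mantissa), and exp() uses the usual n * ln(2) + r
     * range reduction with a short polynomial in r.  A scalar equivalent,
     * assuming <math.h> and <stdint.h> and a normal, positive input a (the
     * same restriction the vector kernels have), could look like:
     *
     *   static inline float pow_scalar_sketch(float a, float b)
     *   {
     *       union { float f; uint32_t i; } u = { a };
     *       float n = (float)(int)((u.i >> 23) & 0xFF) - 127.0f;  // unbiased exponent
     *       u.i = (u.i & 0x7FFFFF) | 0x3F800000;                  // mantissa in [1, 2)
     *       float m = u.f - 1.0f;
     *       float log2_a = n + m * (1.4426950f - 0.7213475f * m); // crude log2(1+m) fit
     *       return expf(b * log2_a * 0.6931471805f);              // exp(b * ln(a))
     *   }
     */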
for (; number < eighthPoints; number++) { + // First compute the logarithm + aVal = _mm256_loadu_ps(aPtr); + bias = _mm256_set1_epi32(127); + leadingOne = _mm256_set1_ps(1.0f); + exp = _mm256_sub_epi32( + _mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal), + _mm256_set1_epi32(0x7f800000)), + 23), + bias); + logarithm = _mm256_cvtepi32_ps(exp); + + frac = _mm256_or_ps( + leadingOne, + _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff)))); #if POW_POLY_DEGREE == 6 - mantissa = POLY5_AVX2( frac, 3.1157899f, -3.3241990f, 2.5988452f, -1.2315303f, 3.1821337e-1f, -3.4436006e-2f); + mantissa = POLY5_AVX2(frac, + 3.1157899f, + -3.3241990f, + 2.5988452f, + -1.2315303f, + 3.1821337e-1f, + -3.4436006e-2f); #elif POW_POLY_DEGREE == 5 - mantissa = POLY4_AVX2( frac, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f); + mantissa = POLY4_AVX2(frac, + 2.8882704548164776201f, + -2.52074962577807006663f, + 1.48116647521213171641f, + -0.465725644288844778798f, + 0.0596515482674574969533f); #elif POW_POLY_DEGREE == 4 - mantissa = POLY3_AVX2( frac, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f); + mantissa = POLY3_AVX2(frac, + 2.61761038894603480148f, + -1.75647175389045657003f, + 0.688243882994381274313f, + -0.107254423828329604454f); #elif POW_POLY_DEGREE == 3 - mantissa = POLY2_AVX2( frac, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f); + mantissa = POLY2_AVX2(frac, + 2.28330284476918490682f, + -1.04913055217340124191f, + 0.204446009836232697516f); #else #error #endif - logarithm = _mm256_add_ps(_mm256_mul_ps(mantissa, _mm256_sub_ps(frac, leadingOne)), logarithm); - logarithm = _mm256_mul_ps(logarithm, ln2); + logarithm = _mm256_add_ps( + _mm256_mul_ps(mantissa, _mm256_sub_ps(frac, leadingOne)), logarithm); + logarithm = _mm256_mul_ps(logarithm, ln2); - // Now calculate b*lna - bVal = _mm256_loadu_ps(bPtr); - bVal = _mm256_mul_ps(bVal, logarithm); + // Now calculate b*lna + bVal = _mm256_loadu_ps(bPtr); + bVal = _mm256_mul_ps(bVal, logarithm); - // Now compute exp(b*lna) - bVal = _mm256_max_ps(_mm256_min_ps(bVal, exp_hi), exp_lo); + // Now compute exp(b*lna) + bVal = _mm256_max_ps(_mm256_min_ps(bVal, exp_hi), exp_lo); - fx = _mm256_add_ps(_mm256_mul_ps(bVal, log2EF), half); + fx = _mm256_add_ps(_mm256_mul_ps(bVal, log2EF), half); - emm0 = _mm256_cvttps_epi32(fx); - tmp = _mm256_cvtepi32_ps(emm0); + emm0 = _mm256_cvttps_epi32(fx); + tmp = _mm256_cvtepi32_ps(emm0); - mask = _mm256_and_ps(_mm256_cmp_ps(tmp, fx, _CMP_GT_OS), one); - fx = _mm256_sub_ps(tmp, mask); + mask = _mm256_and_ps(_mm256_cmp_ps(tmp, fx, _CMP_GT_OS), one); + fx = _mm256_sub_ps(tmp, mask); - tmp = _mm256_sub_ps(bVal, _mm256_mul_ps(fx, exp_C1)); - bVal = _mm256_sub_ps(tmp, _mm256_mul_ps(fx, exp_C2)); - z = _mm256_mul_ps(bVal, bVal); + tmp = _mm256_sub_ps(bVal, _mm256_mul_ps(fx, exp_C1)); + bVal = _mm256_sub_ps(tmp, _mm256_mul_ps(fx, exp_C2)); + z = _mm256_mul_ps(bVal, bVal); - y = _mm256_add_ps(_mm256_mul_ps(exp_p0, bVal), exp_p1); - y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p2); - y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p3); - y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p4); - y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p5); - y = _mm256_add_ps(_mm256_mul_ps(y, z), bVal); - y = _mm256_add_ps(y, one); + y = _mm256_add_ps(_mm256_mul_ps(exp_p0, bVal), exp_p1); + y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p2); + y = 
_mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p3); + y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p4); + y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p5); + y = _mm256_add_ps(_mm256_mul_ps(y, z), bVal); + y = _mm256_add_ps(y, one); - emm0 = _mm256_slli_epi32(_mm256_add_epi32(_mm256_cvttps_epi32(fx), pi32_0x7f), 23); + emm0 = + _mm256_slli_epi32(_mm256_add_epi32(_mm256_cvttps_epi32(fx), pi32_0x7f), 23); pow2n = _mm256_castsi256_ps(emm0); cVal = _mm256_mul_ps(y, pow2n); @@ -796,12 +977,12 @@ volk_32f_x2_pow_32f_u_avx2(float* cVector, const float* bVector, aPtr += 8; bPtr += 8; cPtr += 8; - } + } - number = eighthPoints * 8; - for(;number < num_points; number++){ - *cPtr++ = pow(*aPtr++, *bPtr++); - } + number = eighthPoints * 8; + for (; number < num_points; number++) { + *cPtr++ = pow(*aPtr++, *bPtr++); + } } #endif /* LV_HAVE_AVX2 for unaligned */ diff --git a/kernels/volk/volk_32f_x2_s32f_interleave_16ic.h b/kernels/volk/volk_32f_x2_s32f_interleave_16ic.h index 8021faf..04e5892 100644 --- a/kernels/volk/volk_32f_x2_s32f_interleave_16ic.h +++ b/kernels/volk/volk_32f_x2_s32f_interleave_16ic.h @@ -32,8 +32,8 @@ * * Dispatcher Prototype * \code - * void volk_32f_x2_s32f_interleave_16ic(lv_16sc_t* complexVector, const float* iBuffer, const float* qBuffer, const float scalar, unsigned int num_points) - * \endcode + * void volk_32f_x2_s32f_interleave_16ic(lv_16sc_t* complexVector, const float* iBuffer, + * const float* qBuffer, const float scalar, unsigned int num_points) \endcode * * \b Inputs * \li iBuffer: Input vector of samples for the real part. @@ -75,60 +75,62 @@ #ifndef INCLUDED_volk_32f_x2_s32f_interleave_16ic_a_H #define INCLUDED_volk_32f_x2_s32f_interleave_16ic_a_H -#include #include #include +#include #ifdef LV_HAVE_AVX2 #include -static inline void -volk_32f_x2_s32f_interleave_16ic_a_avx2(lv_16sc_t* complexVector, const float* iBuffer, - const float* qBuffer, const float scalar, unsigned int num_points) +static inline void volk_32f_x2_s32f_interleave_16ic_a_avx2(lv_16sc_t* complexVector, + const float* iBuffer, + const float* qBuffer, + const float scalar, + unsigned int num_points) { - unsigned int number = 0; - const float* iBufferPtr = iBuffer; - const float* qBufferPtr = qBuffer; + unsigned int number = 0; + const float* iBufferPtr = iBuffer; + const float* qBufferPtr = qBuffer; - __m256 vScalar = _mm256_set1_ps(scalar); + __m256 vScalar = _mm256_set1_ps(scalar); - const unsigned int eighthPoints = num_points / 8; + const unsigned int eighthPoints = num_points / 8; - __m256 iValue, qValue, cplxValue1, cplxValue2; - __m256i intValue1, intValue2; + __m256 iValue, qValue, cplxValue1, cplxValue2; + __m256i intValue1, intValue2; - int16_t* complexVectorPtr = (int16_t*)complexVector; + int16_t* complexVectorPtr = (int16_t*)complexVector; - for(;number < eighthPoints; number++){ - iValue = _mm256_load_ps(iBufferPtr); - qValue = _mm256_load_ps(qBufferPtr); + for (; number < eighthPoints; number++) { + iValue = _mm256_load_ps(iBufferPtr); + qValue = _mm256_load_ps(qBufferPtr); - // Interleaves the lower two values in the i and q variables into one buffer - cplxValue1 = _mm256_unpacklo_ps(iValue, qValue); - cplxValue1 = _mm256_mul_ps(cplxValue1, vScalar); + // Interleaves the lower two values in the i and q variables into one buffer + cplxValue1 = _mm256_unpacklo_ps(iValue, qValue); + cplxValue1 = _mm256_mul_ps(cplxValue1, vScalar); - // Interleaves the upper two values in the i and q variables into one buffer - cplxValue2 = _mm256_unpackhi_ps(iValue, qValue); - cplxValue2 = 
_mm256_mul_ps(cplxValue2, vScalar); + // Interleaves the upper two values in the i and q variables into one buffer + cplxValue2 = _mm256_unpackhi_ps(iValue, qValue); + cplxValue2 = _mm256_mul_ps(cplxValue2, vScalar); - intValue1 = _mm256_cvtps_epi32(cplxValue1); - intValue2 = _mm256_cvtps_epi32(cplxValue2); + intValue1 = _mm256_cvtps_epi32(cplxValue1); + intValue2 = _mm256_cvtps_epi32(cplxValue2); - intValue1 = _mm256_packs_epi32(intValue1, intValue2); + intValue1 = _mm256_packs_epi32(intValue1, intValue2); - _mm256_store_si256((__m256i*)complexVectorPtr, intValue1); - complexVectorPtr += 16; + _mm256_store_si256((__m256i*)complexVectorPtr, intValue1); + complexVectorPtr += 16; - iBufferPtr += 8; - qBufferPtr += 8; - } + iBufferPtr += 8; + qBufferPtr += 8; + } - number = eighthPoints * 8; - complexVectorPtr = (int16_t*)(&complexVector[number]); - for(; number < num_points; number++){ - *complexVectorPtr++ = (int16_t)rintf(*iBufferPtr++ * scalar); - *complexVectorPtr++ = (int16_t)rintf(*qBufferPtr++ * scalar); - } + number = eighthPoints * 8; + complexVectorPtr = (int16_t*)(&complexVector[number]); + for (; number < num_points; number++) { + *complexVectorPtr++ = (int16_t)rintf(*iBufferPtr++ * scalar); + *complexVectorPtr++ = (int16_t)rintf(*qBufferPtr++ * scalar); + } } #endif /* LV_HAVE_AVX2 */ @@ -136,53 +138,55 @@ volk_32f_x2_s32f_interleave_16ic_a_avx2(lv_16sc_t* complexVector, const float* i #ifdef LV_HAVE_SSE2 #include -static inline void -volk_32f_x2_s32f_interleave_16ic_a_sse2(lv_16sc_t* complexVector, const float* iBuffer, - const float* qBuffer, const float scalar, unsigned int num_points) +static inline void volk_32f_x2_s32f_interleave_16ic_a_sse2(lv_16sc_t* complexVector, + const float* iBuffer, + const float* qBuffer, + const float scalar, + unsigned int num_points) { - unsigned int number = 0; - const float* iBufferPtr = iBuffer; - const float* qBufferPtr = qBuffer; + unsigned int number = 0; + const float* iBufferPtr = iBuffer; + const float* qBufferPtr = qBuffer; - __m128 vScalar = _mm_set_ps1(scalar); + __m128 vScalar = _mm_set_ps1(scalar); - const unsigned int quarterPoints = num_points / 4; + const unsigned int quarterPoints = num_points / 4; - __m128 iValue, qValue, cplxValue1, cplxValue2; - __m128i intValue1, intValue2; + __m128 iValue, qValue, cplxValue1, cplxValue2; + __m128i intValue1, intValue2; - int16_t* complexVectorPtr = (int16_t*)complexVector; + int16_t* complexVectorPtr = (int16_t*)complexVector; - for(;number < quarterPoints; number++){ - iValue = _mm_load_ps(iBufferPtr); - qValue = _mm_load_ps(qBufferPtr); + for (; number < quarterPoints; number++) { + iValue = _mm_load_ps(iBufferPtr); + qValue = _mm_load_ps(qBufferPtr); - // Interleaves the lower two values in the i and q variables into one buffer - cplxValue1 = _mm_unpacklo_ps(iValue, qValue); - cplxValue1 = _mm_mul_ps(cplxValue1, vScalar); + // Interleaves the lower two values in the i and q variables into one buffer + cplxValue1 = _mm_unpacklo_ps(iValue, qValue); + cplxValue1 = _mm_mul_ps(cplxValue1, vScalar); - // Interleaves the upper two values in the i and q variables into one buffer - cplxValue2 = _mm_unpackhi_ps(iValue, qValue); - cplxValue2 = _mm_mul_ps(cplxValue2, vScalar); + // Interleaves the upper two values in the i and q variables into one buffer + cplxValue2 = _mm_unpackhi_ps(iValue, qValue); + cplxValue2 = _mm_mul_ps(cplxValue2, vScalar); - intValue1 = _mm_cvtps_epi32(cplxValue1); - intValue2 = _mm_cvtps_epi32(cplxValue2); + intValue1 = _mm_cvtps_epi32(cplxValue1); + intValue2 = 
_mm_cvtps_epi32(cplxValue2); - intValue1 = _mm_packs_epi32(intValue1, intValue2); + intValue1 = _mm_packs_epi32(intValue1, intValue2); - _mm_store_si128((__m128i*)complexVectorPtr, intValue1); - complexVectorPtr += 8; + _mm_store_si128((__m128i*)complexVectorPtr, intValue1); + complexVectorPtr += 8; - iBufferPtr += 4; - qBufferPtr += 4; - } + iBufferPtr += 4; + qBufferPtr += 4; + } - number = quarterPoints * 4; - complexVectorPtr = (int16_t*)(&complexVector[number]); - for(; number < num_points; number++){ - *complexVectorPtr++ = (int16_t)rintf(*iBufferPtr++ * scalar); - *complexVectorPtr++ = (int16_t)rintf(*qBufferPtr++ * scalar); - } + number = quarterPoints * 4; + complexVectorPtr = (int16_t*)(&complexVector[number]); + for (; number < num_points; number++) { + *complexVectorPtr++ = (int16_t)rintf(*iBufferPtr++ * scalar); + *complexVectorPtr++ = (int16_t)rintf(*qBufferPtr++ * scalar); + } } #endif /* LV_HAVE_SSE2 */ @@ -190,79 +194,83 @@ volk_32f_x2_s32f_interleave_16ic_a_sse2(lv_16sc_t* complexVector, const float* i #ifdef LV_HAVE_SSE #include -static inline void -volk_32f_x2_s32f_interleave_16ic_a_sse(lv_16sc_t* complexVector, const float* iBuffer, - const float* qBuffer, const float scalar, unsigned int num_points) +static inline void volk_32f_x2_s32f_interleave_16ic_a_sse(lv_16sc_t* complexVector, + const float* iBuffer, + const float* qBuffer, + const float scalar, + unsigned int num_points) { - unsigned int number = 0; - const float* iBufferPtr = iBuffer; - const float* qBufferPtr = qBuffer; + unsigned int number = 0; + const float* iBufferPtr = iBuffer; + const float* qBufferPtr = qBuffer; - __m128 vScalar = _mm_set_ps1(scalar); + __m128 vScalar = _mm_set_ps1(scalar); - const unsigned int quarterPoints = num_points / 4; + const unsigned int quarterPoints = num_points / 4; - __m128 iValue, qValue, cplxValue; + __m128 iValue, qValue, cplxValue; - int16_t* complexVectorPtr = (int16_t*)complexVector; + int16_t* complexVectorPtr = (int16_t*)complexVector; - __VOLK_ATTR_ALIGNED(16) float floatBuffer[4]; + __VOLK_ATTR_ALIGNED(16) float floatBuffer[4]; - for(;number < quarterPoints; number++){ - iValue = _mm_load_ps(iBufferPtr); - qValue = _mm_load_ps(qBufferPtr); + for (; number < quarterPoints; number++) { + iValue = _mm_load_ps(iBufferPtr); + qValue = _mm_load_ps(qBufferPtr); - // Interleaves the lower two values in the i and q variables into one buffer - cplxValue = _mm_unpacklo_ps(iValue, qValue); - cplxValue = _mm_mul_ps(cplxValue, vScalar); + // Interleaves the lower two values in the i and q variables into one buffer + cplxValue = _mm_unpacklo_ps(iValue, qValue); + cplxValue = _mm_mul_ps(cplxValue, vScalar); - _mm_store_ps(floatBuffer, cplxValue); + _mm_store_ps(floatBuffer, cplxValue); - *complexVectorPtr++ = (int16_t)rintf(floatBuffer[0]); - *complexVectorPtr++ = (int16_t)rintf(floatBuffer[1]); - *complexVectorPtr++ = (int16_t)rintf(floatBuffer[2]); - *complexVectorPtr++ = (int16_t)rintf(floatBuffer[3]); + *complexVectorPtr++ = (int16_t)rintf(floatBuffer[0]); + *complexVectorPtr++ = (int16_t)rintf(floatBuffer[1]); + *complexVectorPtr++ = (int16_t)rintf(floatBuffer[2]); + *complexVectorPtr++ = (int16_t)rintf(floatBuffer[3]); - // Interleaves the upper two values in the i and q variables into one buffer - cplxValue = _mm_unpackhi_ps(iValue, qValue); - cplxValue = _mm_mul_ps(cplxValue, vScalar); + // Interleaves the upper two values in the i and q variables into one buffer + cplxValue = _mm_unpackhi_ps(iValue, qValue); + cplxValue = _mm_mul_ps(cplxValue, vScalar); - 
_mm_store_ps(floatBuffer, cplxValue); + _mm_store_ps(floatBuffer, cplxValue); - *complexVectorPtr++ = (int16_t)rintf(floatBuffer[0]); - *complexVectorPtr++ = (int16_t)rintf(floatBuffer[1]); - *complexVectorPtr++ = (int16_t)rintf(floatBuffer[2]); - *complexVectorPtr++ = (int16_t)rintf(floatBuffer[3]); + *complexVectorPtr++ = (int16_t)rintf(floatBuffer[0]); + *complexVectorPtr++ = (int16_t)rintf(floatBuffer[1]); + *complexVectorPtr++ = (int16_t)rintf(floatBuffer[2]); + *complexVectorPtr++ = (int16_t)rintf(floatBuffer[3]); - iBufferPtr += 4; - qBufferPtr += 4; - } + iBufferPtr += 4; + qBufferPtr += 4; + } - number = quarterPoints * 4; - complexVectorPtr = (int16_t*)(&complexVector[number]); - for(; number < num_points; number++){ - *complexVectorPtr++ = (int16_t)rintf(*iBufferPtr++ * scalar); - *complexVectorPtr++ = (int16_t)rintf(*qBufferPtr++ * scalar); - } + number = quarterPoints * 4; + complexVectorPtr = (int16_t*)(&complexVector[number]); + for (; number < num_points; number++) { + *complexVectorPtr++ = (int16_t)rintf(*iBufferPtr++ * scalar); + *complexVectorPtr++ = (int16_t)rintf(*qBufferPtr++ * scalar); + } } #endif /* LV_HAVE_SSE */ #ifdef LV_HAVE_GENERIC -static inline void -volk_32f_x2_s32f_interleave_16ic_generic(lv_16sc_t* complexVector, const float* iBuffer, - const float* qBuffer, const float scalar, unsigned int num_points) +static inline void volk_32f_x2_s32f_interleave_16ic_generic(lv_16sc_t* complexVector, + const float* iBuffer, + const float* qBuffer, + const float scalar, + unsigned int num_points) { - int16_t* complexVectorPtr = (int16_t*)complexVector; - const float* iBufferPtr = iBuffer; - const float* qBufferPtr = qBuffer; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - *complexVectorPtr++ = (int16_t)rintf(*iBufferPtr++ * scalar); - *complexVectorPtr++ = (int16_t)rintf(*qBufferPtr++ * scalar); - } + int16_t* complexVectorPtr = (int16_t*)complexVector; + const float* iBufferPtr = iBuffer; + const float* qBufferPtr = qBuffer; + unsigned int number = 0; + + for (number = 0; number < num_points; number++) { + *complexVectorPtr++ = (int16_t)rintf(*iBufferPtr++ * scalar); + *complexVectorPtr++ = (int16_t)rintf(*qBufferPtr++ * scalar); + } } #endif /* LV_HAVE_GENERIC */ @@ -272,60 +280,62 @@ volk_32f_x2_s32f_interleave_16ic_generic(lv_16sc_t* complexVector, const float* #ifndef INCLUDED_volk_32f_x2_s32f_interleave_16ic_u_H #define INCLUDED_volk_32f_x2_s32f_interleave_16ic_u_H -#include #include #include +#include #ifdef LV_HAVE_AVX2 #include -static inline void -volk_32f_x2_s32f_interleave_16ic_u_avx2(lv_16sc_t* complexVector, const float* iBuffer, - const float* qBuffer, const float scalar, unsigned int num_points) +static inline void volk_32f_x2_s32f_interleave_16ic_u_avx2(lv_16sc_t* complexVector, + const float* iBuffer, + const float* qBuffer, + const float scalar, + unsigned int num_points) { - unsigned int number = 0; - const float* iBufferPtr = iBuffer; - const float* qBufferPtr = qBuffer; + unsigned int number = 0; + const float* iBufferPtr = iBuffer; + const float* qBufferPtr = qBuffer; - __m256 vScalar = _mm256_set1_ps(scalar); + __m256 vScalar = _mm256_set1_ps(scalar); - const unsigned int eighthPoints = num_points / 8; + const unsigned int eighthPoints = num_points / 8; - __m256 iValue, qValue, cplxValue1, cplxValue2; - __m256i intValue1, intValue2; + __m256 iValue, qValue, cplxValue1, cplxValue2; + __m256i intValue1, intValue2; - int16_t* complexVectorPtr = (int16_t*)complexVector; + int16_t* complexVectorPtr = 
(int16_t*)complexVector; - for(;number < eighthPoints; number++){ - iValue = _mm256_loadu_ps(iBufferPtr); - qValue = _mm256_loadu_ps(qBufferPtr); + for (; number < eighthPoints; number++) { + iValue = _mm256_loadu_ps(iBufferPtr); + qValue = _mm256_loadu_ps(qBufferPtr); - // Interleaves the lower two values in the i and q variables into one buffer - cplxValue1 = _mm256_unpacklo_ps(iValue, qValue); - cplxValue1 = _mm256_mul_ps(cplxValue1, vScalar); + // Interleaves the lower two values in the i and q variables into one buffer + cplxValue1 = _mm256_unpacklo_ps(iValue, qValue); + cplxValue1 = _mm256_mul_ps(cplxValue1, vScalar); - // Interleaves the upper two values in the i and q variables into one buffer - cplxValue2 = _mm256_unpackhi_ps(iValue, qValue); - cplxValue2 = _mm256_mul_ps(cplxValue2, vScalar); + // Interleaves the upper two values in the i and q variables into one buffer + cplxValue2 = _mm256_unpackhi_ps(iValue, qValue); + cplxValue2 = _mm256_mul_ps(cplxValue2, vScalar); - intValue1 = _mm256_cvtps_epi32(cplxValue1); - intValue2 = _mm256_cvtps_epi32(cplxValue2); + intValue1 = _mm256_cvtps_epi32(cplxValue1); + intValue2 = _mm256_cvtps_epi32(cplxValue2); - intValue1 = _mm256_packs_epi32(intValue1, intValue2); + intValue1 = _mm256_packs_epi32(intValue1, intValue2); - _mm256_storeu_si256((__m256i*)complexVectorPtr, intValue1); - complexVectorPtr += 16; + _mm256_storeu_si256((__m256i*)complexVectorPtr, intValue1); + complexVectorPtr += 16; - iBufferPtr += 8; - qBufferPtr += 8; - } + iBufferPtr += 8; + qBufferPtr += 8; + } - number = eighthPoints * 8; - complexVectorPtr = (int16_t*)(&complexVector[number]); - for(; number < num_points; number++){ - *complexVectorPtr++ = (int16_t)rintf(*iBufferPtr++ * scalar); - *complexVectorPtr++ = (int16_t)rintf(*qBufferPtr++ * scalar); - } + number = eighthPoints * 8; + complexVectorPtr = (int16_t*)(&complexVector[number]); + for (; number < num_points; number++) { + *complexVectorPtr++ = (int16_t)rintf(*iBufferPtr++ * scalar); + *complexVectorPtr++ = (int16_t)rintf(*qBufferPtr++ * scalar); + } } #endif /* LV_HAVE_AVX2 */ diff --git a/kernels/volk/volk_32f_x2_subtract_32f.h b/kernels/volk/volk_32f_x2_subtract_32f.h index bdfa0a1..359974c 100644 --- a/kernels/volk/volk_32f_x2_subtract_32f.h +++ b/kernels/volk/volk_32f_x2_subtract_32f.h @@ -31,8 +31,8 @@ * * Dispatcher Prototype * \code - * void volk_32f_x2_subtract_32f(float* cVector, const float* aVector, const float* bVector, unsigned int num_points) - * \endcode + * void volk_32f_x2_subtract_32f(float* cVector, const float* aVector, const float* + * bVector, unsigned int num_points) \endcode * * \b Inputs * \li aVector: The initial vector. 
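A minimal usage sketch for this dispatcher (assuming the volk.h entry point and the volk_malloc alignment helpers; the input values are placeholders):

    #include <volk/volk.h>

    static void subtract_usage_sketch(void)
    {
        unsigned int num_points = 1024;
        size_t alignment = volk_get_alignment();
        float* a = (float*)volk_malloc(num_points * sizeof(float), alignment);
        float* b = (float*)volk_malloc(num_points * sizeof(float), alignment);
        float* c = (float*)volk_malloc(num_points * sizeof(float), alignment);
        for (unsigned int i = 0; i < num_points; i++) {
            a[i] = (float)i;         // placeholder inputs
            b[i] = 0.5f * (float)i;
        }
        volk_32f_x2_subtract_32f(c, a, b, num_points); // c[i] = a[i] - b[i]
        volk_free(a);
        volk_free(b);
        volk_free(c);
    }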
@@ -77,126 +77,130 @@ #ifdef LV_HAVE_AVX512F #include -static inline void -volk_32f_x2_subtract_32f_a_avx512f(float* cVector, const float* aVector, - const float* bVector, unsigned int num_points) +static inline void volk_32f_x2_subtract_32f_a_avx512f(float* cVector, + const float* aVector, + const float* bVector, + unsigned int num_points) { - unsigned int number = 0; - const unsigned int sixteenthPoints = num_points / 16; + unsigned int number = 0; + const unsigned int sixteenthPoints = num_points / 16; - float* cPtr = cVector; - const float* aPtr = aVector; - const float* bPtr = bVector; + float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr = bVector; - __m512 aVal, bVal, cVal; - for(;number < sixteenthPoints; number++){ + __m512 aVal, bVal, cVal; + for (; number < sixteenthPoints; number++) { - aVal = _mm512_load_ps(aPtr); - bVal = _mm512_load_ps(bPtr); + aVal = _mm512_load_ps(aPtr); + bVal = _mm512_load_ps(bPtr); - cVal = _mm512_sub_ps(aVal, bVal); + cVal = _mm512_sub_ps(aVal, bVal); - _mm512_store_ps(cPtr,cVal); // Store the results back into the C container + _mm512_store_ps(cPtr, cVal); // Store the results back into the C container - aPtr += 16; - bPtr += 16; - cPtr += 16; - } + aPtr += 16; + bPtr += 16; + cPtr += 16; + } - number = sixteenthPoints *16; - for(;number < num_points; number++){ - *cPtr++ = (*aPtr++) - (*bPtr++); - } + number = sixteenthPoints * 16; + for (; number < num_points; number++) { + *cPtr++ = (*aPtr++) - (*bPtr++); + } } #endif /* LV_HAVE_AVX512F */ #ifdef LV_HAVE_AVX #include -static inline void -volk_32f_x2_subtract_32f_a_avx(float* cVector, const float* aVector, - const float* bVector, unsigned int num_points) +static inline void volk_32f_x2_subtract_32f_a_avx(float* cVector, + const float* aVector, + const float* bVector, + unsigned int num_points) { - unsigned int number = 0; - const unsigned int eighthPoints = num_points / 8; + unsigned int number = 0; + const unsigned int eighthPoints = num_points / 8; - float* cPtr = cVector; - const float* aPtr = aVector; - const float* bPtr = bVector; + float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr = bVector; - __m256 aVal, bVal, cVal; - for(;number < eighthPoints; number++){ + __m256 aVal, bVal, cVal; + for (; number < eighthPoints; number++) { - aVal = _mm256_load_ps(aPtr); - bVal = _mm256_load_ps(bPtr); + aVal = _mm256_load_ps(aPtr); + bVal = _mm256_load_ps(bPtr); - cVal = _mm256_sub_ps(aVal, bVal); + cVal = _mm256_sub_ps(aVal, bVal); - _mm256_store_ps(cPtr,cVal); // Store the results back into the C container + _mm256_store_ps(cPtr, cVal); // Store the results back into the C container - aPtr += 8; - bPtr += 8; - cPtr += 8; - } + aPtr += 8; + bPtr += 8; + cPtr += 8; + } - number = eighthPoints * 8; - for(;number < num_points; number++){ - *cPtr++ = (*aPtr++) - (*bPtr++); - } + number = eighthPoints * 8; + for (; number < num_points; number++) { + *cPtr++ = (*aPtr++) - (*bPtr++); + } } #endif /* LV_HAVE_AVX */ #ifdef LV_HAVE_SSE #include -static inline void -volk_32f_x2_subtract_32f_a_sse(float* cVector, const float* aVector, - const float* bVector, unsigned int num_points) +static inline void volk_32f_x2_subtract_32f_a_sse(float* cVector, + const float* aVector, + const float* bVector, + unsigned int num_points) { - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; - float* cPtr = cVector; - const float* aPtr = aVector; - const float* bPtr = bVector; + 
float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr = bVector; - __m128 aVal, bVal, cVal; - for(;number < quarterPoints; number++){ + __m128 aVal, bVal, cVal; + for (; number < quarterPoints; number++) { - aVal = _mm_load_ps(aPtr); - bVal = _mm_load_ps(bPtr); + aVal = _mm_load_ps(aPtr); + bVal = _mm_load_ps(bPtr); - cVal = _mm_sub_ps(aVal, bVal); + cVal = _mm_sub_ps(aVal, bVal); - _mm_store_ps(cPtr,cVal); // Store the results back into the C container + _mm_store_ps(cPtr, cVal); // Store the results back into the C container - aPtr += 4; - bPtr += 4; - cPtr += 4; - } + aPtr += 4; + bPtr += 4; + cPtr += 4; + } - number = quarterPoints * 4; - for(;number < num_points; number++){ - *cPtr++ = (*aPtr++) - (*bPtr++); - } + number = quarterPoints * 4; + for (; number < num_points; number++) { + *cPtr++ = (*aPtr++) - (*bPtr++); + } } #endif /* LV_HAVE_SSE */ #ifdef LV_HAVE_GENERIC -static inline void -volk_32f_x2_subtract_32f_generic(float* cVector, const float* aVector, - const float* bVector, unsigned int num_points) +static inline void volk_32f_x2_subtract_32f_generic(float* cVector, + const float* aVector, + const float* bVector, + unsigned int num_points) { - float* cPtr = cVector; - const float* aPtr = aVector; - const float* bPtr = bVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - *cPtr++ = (*aPtr++) - (*bPtr++); - } + float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr = bVector; + unsigned int number = 0; + + for (number = 0; number < num_points; number++) { + *cPtr++ = (*aPtr++) - (*bPtr++); + } } #endif /* LV_HAVE_GENERIC */ @@ -204,45 +208,48 @@ volk_32f_x2_subtract_32f_generic(float* cVector, const float* aVector, #ifdef LV_HAVE_NEON #include -static inline void -volk_32f_x2_subtract_32f_neon(float* cVector, const float* aVector, - const float* bVector, unsigned int num_points) +static inline void volk_32f_x2_subtract_32f_neon(float* cVector, + const float* aVector, + const float* bVector, + unsigned int num_points) { - float* cPtr = cVector; - const float* aPtr = aVector; - const float* bPtr = bVector; - unsigned int number = 0; - unsigned int quarter_points = num_points / 4; - - float32x4_t a_vec, b_vec, c_vec; - - for(number = 0; number < quarter_points; number++){ - a_vec = vld1q_f32(aPtr); - b_vec = vld1q_f32(bPtr); - c_vec = vsubq_f32(a_vec, b_vec); - vst1q_f32(cPtr, c_vec); - aPtr += 4; - bPtr += 4; - cPtr += 4; - } - - for(number = quarter_points * 4; number < num_points; number++){ - *cPtr++ = (*aPtr++) - (*bPtr++); - } + float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr = bVector; + unsigned int number = 0; + unsigned int quarter_points = num_points / 4; + + float32x4_t a_vec, b_vec, c_vec; + + for (number = 0; number < quarter_points; number++) { + a_vec = vld1q_f32(aPtr); + b_vec = vld1q_f32(bPtr); + c_vec = vsubq_f32(a_vec, b_vec); + vst1q_f32(cPtr, c_vec); + aPtr += 4; + bPtr += 4; + cPtr += 4; + } + + for (number = quarter_points * 4; number < num_points; number++) { + *cPtr++ = (*aPtr++) - (*bPtr++); + } } #endif /* LV_HAVE_NEON */ #ifdef LV_HAVE_ORC -extern void -volk_32f_x2_subtract_32f_a_orc_impl(float* cVector, const float* aVector, - const float* bVector, unsigned int num_points); - -static inline void -volk_32f_x2_subtract_32f_u_orc(float* cVector, const float* aVector, - const float* bVector, unsigned int num_points) +extern void volk_32f_x2_subtract_32f_a_orc_impl(float* cVector, + const float* aVector, + const float* bVector, + unsigned int num_points); 
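Every vectorized variant in this file shares the same structure: process as many full vectors as fit, then let a scalar loop handle whatever remains, exactly as the generic kernel does. A schematic sketch of that split (illustration only; W is a placeholder for the vector width such as 4, 8 or 16, not a VOLK symbol):

/* The inner k-loop stands in for one W-wide SIMD load/subtract/store;
 * the second loop is the scalar tail over the leftover elements. */
void subtract_split_sketch(
    float* c, const float* a, const float* b, unsigned int num_points, unsigned int W)
{
    unsigned int vector_points = num_points / W;
    unsigned int number;
    for (number = 0; number < vector_points; number++) {
        for (unsigned int k = 0; k < W; k++) {
            c[number * W + k] = a[number * W + k] - b[number * W + k];
        }
    }
    for (number = vector_points * W; number < num_points; number++) {
        c[number] = a[number] - b[number];
    }
}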
+ +static inline void volk_32f_x2_subtract_32f_u_orc(float* cVector, + const float* aVector, + const float* bVector, + unsigned int num_points) { - volk_32f_x2_subtract_32f_a_orc_impl(cVector, aVector, bVector, num_points); + volk_32f_x2_subtract_32f_a_orc_impl(cVector, aVector, bVector, num_points); } #endif /* LV_HAVE_ORC */ @@ -259,36 +266,37 @@ volk_32f_x2_subtract_32f_u_orc(float* cVector, const float* aVector, #ifdef LV_HAVE_AVX512F #include -static inline void -volk_32f_x2_subtract_32f_u_avx512f(float* cVector, const float* aVector, - const float* bVector, unsigned int num_points) +static inline void volk_32f_x2_subtract_32f_u_avx512f(float* cVector, + const float* aVector, + const float* bVector, + unsigned int num_points) { - unsigned int number = 0; - const unsigned int sixteenthPoints = num_points / 16; + unsigned int number = 0; + const unsigned int sixteenthPoints = num_points / 16; - float* cPtr = cVector; - const float* aPtr = aVector; - const float* bPtr = bVector; + float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr = bVector; - __m512 aVal, bVal, cVal; - for(;number < sixteenthPoints; number++){ + __m512 aVal, bVal, cVal; + for (; number < sixteenthPoints; number++) { - aVal = _mm512_loadu_ps(aPtr); - bVal = _mm512_loadu_ps(bPtr); + aVal = _mm512_loadu_ps(aPtr); + bVal = _mm512_loadu_ps(bPtr); - cVal = _mm512_sub_ps(aVal, bVal); + cVal = _mm512_sub_ps(aVal, bVal); - _mm512_storeu_ps(cPtr,cVal); // Store the results back into the C container + _mm512_storeu_ps(cPtr, cVal); // Store the results back into the C container - aPtr += 16; - bPtr += 16; - cPtr += 16; - } + aPtr += 16; + bPtr += 16; + cPtr += 16; + } - number = sixteenthPoints *16; - for(;number < num_points; number++){ - *cPtr++ = (*aPtr++) - (*bPtr++); - } + number = sixteenthPoints * 16; + for (; number < num_points; number++) { + *cPtr++ = (*aPtr++) - (*bPtr++); + } } #endif /* LV_HAVE_AVX512F */ @@ -296,36 +304,37 @@ volk_32f_x2_subtract_32f_u_avx512f(float* cVector, const float* aVector, #ifdef LV_HAVE_AVX #include -static inline void -volk_32f_x2_subtract_32f_u_avx(float* cVector, const float* aVector, - const float* bVector, unsigned int num_points) +static inline void volk_32f_x2_subtract_32f_u_avx(float* cVector, + const float* aVector, + const float* bVector, + unsigned int num_points) { - unsigned int number = 0; - const unsigned int eighthPoints = num_points / 8; + unsigned int number = 0; + const unsigned int eighthPoints = num_points / 8; - float* cPtr = cVector; - const float* aPtr = aVector; - const float* bPtr = bVector; + float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr = bVector; - __m256 aVal, bVal, cVal; - for(;number < eighthPoints; number++){ + __m256 aVal, bVal, cVal; + for (; number < eighthPoints; number++) { - aVal = _mm256_loadu_ps(aPtr); - bVal = _mm256_loadu_ps(bPtr); + aVal = _mm256_loadu_ps(aPtr); + bVal = _mm256_loadu_ps(bPtr); - cVal = _mm256_sub_ps(aVal, bVal); + cVal = _mm256_sub_ps(aVal, bVal); - _mm256_storeu_ps(cPtr,cVal); // Store the results back into the C container + _mm256_storeu_ps(cPtr, cVal); // Store the results back into the C container - aPtr += 8; - bPtr += 8; - cPtr += 8; - } + aPtr += 8; + bPtr += 8; + cPtr += 8; + } - number = eighthPoints * 8; - for(;number < num_points; number++){ - *cPtr++ = (*aPtr++) - (*bPtr++); - } + number = eighthPoints * 8; + for (; number < num_points; number++) { + *cPtr++ = (*aPtr++) - (*bPtr++); + } } #endif /* LV_HAVE_AVX */ diff --git a/kernels/volk/volk_32f_x3_sum_of_poly_32f.h 
b/kernels/volk/volk_32f_x3_sum_of_poly_32f.h index e74a385..b0b1466 100644 --- a/kernels/volk/volk_32f_x3_sum_of_poly_32f.h +++ b/kernels/volk/volk_32f_x3_sum_of_poly_32f.h @@ -30,12 +30,13 @@ * multiply by the rectangle/bin width. * * Expressed as a formula, this function calculates - * \f$ \sum f(x) = \sum (c_0 + c_1 \cdot x + c_2 \cdot x^2 + c_3 \cdot x^3 + c_4 \cdot x^4)\f$ + * \f$ \sum f(x) = \sum (c_0 + c_1 \cdot x + c_2 \cdot x^2 + c_3 \cdot x^3 + c_4 \cdot + * x^4)\f$ * * Dispatcher Prototype * \code - * void volk_32f_x3_sum_of_poly_32f(float* target, float* src0, float* center_point_array, float* cutoff, unsigned int num_points) - * \endcode + * void volk_32f_x3_sum_of_poly_32f(float* target, float* src0, float* center_point_array, + * float* cutoff, unsigned int num_points) \endcode * * \b Inputs * \li src0: x values @@ -53,9 +54,10 @@ * \code * int npoints = 4096; * float* coefficients = (float*)volk_malloc(sizeof(float) * 5, volk_get_alignment()); - * float* input = (float*)volk_malloc(sizeof(float) * npoints, volk_get_alignment()); - * float* result = (float*)volk_malloc(sizeof(float), volk_get_alignment()); - * float* cutoff = (float*)volk_malloc(sizeof(float), volk_get_alignment()); + * float* input = (float*)volk_malloc(sizeof(float) * npoints, + * volk_get_alignment()); float* result = (float*)volk_malloc(sizeof(float), + * volk_get_alignment()); float* cutoff = (float*)volk_malloc(sizeof(float), + * volk_get_alignment()); * // load precomputed Taylor series coefficients * coefficients[0] = 4.48168907033806f; // c1 * coefficients[1] = coefficients[0] * 0.5f; // c2 @@ -82,288 +84,291 @@ #ifndef INCLUDED_volk_32f_x3_sum_of_poly_32f_a_H #define INCLUDED_volk_32f_x3_sum_of_poly_32f_a_H -#include -#include -#include +#include +#include +#include #ifndef MAX -#define MAX(X,Y) ((X) > (Y)?(X):(Y)) +#define MAX(X, Y) ((X) > (Y) ? 
(X) : (Y)) #endif #ifdef LV_HAVE_SSE3 -#include -#include - -static inline void -volk_32f_x3_sum_of_poly_32f_a_sse3(float* target, float* src0, float* center_point_array, - float* cutoff, unsigned int num_points) +#include +#include + +static inline void volk_32f_x3_sum_of_poly_32f_a_sse3(float* target, + float* src0, + float* center_point_array, + float* cutoff, + unsigned int num_points) { - float result = 0.0f; - float fst = 0.0f; - float sq = 0.0f; - float thrd = 0.0f; - float frth = 0.0f; - - __m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10; - - xmm9 = _mm_setzero_ps(); - xmm1 = _mm_setzero_ps(); - xmm0 = _mm_load1_ps(¢er_point_array[0]); - xmm6 = _mm_load1_ps(¢er_point_array[1]); - xmm7 = _mm_load1_ps(¢er_point_array[2]); - xmm8 = _mm_load1_ps(¢er_point_array[3]); - xmm10 = _mm_load1_ps(cutoff); - - int bound = num_points/8; - int leftovers = num_points - 8*bound; - int i = 0; - for(; i < bound; ++i) { - // 1st - xmm2 = _mm_load_ps(src0); - xmm2 = _mm_max_ps(xmm10, xmm2); - xmm3 = _mm_mul_ps(xmm2, xmm2); - xmm4 = _mm_mul_ps(xmm2, xmm3); - xmm5 = _mm_mul_ps(xmm3, xmm3); - - xmm2 = _mm_mul_ps(xmm2, xmm0); - xmm3 = _mm_mul_ps(xmm3, xmm6); - xmm4 = _mm_mul_ps(xmm4, xmm7); - xmm5 = _mm_mul_ps(xmm5, xmm8); - - xmm2 = _mm_add_ps(xmm2, xmm3); - xmm3 = _mm_add_ps(xmm4, xmm5); - - src0 += 4; - - xmm9 = _mm_add_ps(xmm2, xmm9); - xmm9 = _mm_add_ps(xmm3, xmm9); - - // 2nd - xmm2 = _mm_load_ps(src0); - xmm2 = _mm_max_ps(xmm10, xmm2); - xmm3 = _mm_mul_ps(xmm2, xmm2); - xmm4 = _mm_mul_ps(xmm2, xmm3); - xmm5 = _mm_mul_ps(xmm3, xmm3); - - xmm2 = _mm_mul_ps(xmm2, xmm0); - xmm3 = _mm_mul_ps(xmm3, xmm6); - xmm4 = _mm_mul_ps(xmm4, xmm7); - xmm5 = _mm_mul_ps(xmm5, xmm8); - - xmm2 = _mm_add_ps(xmm2, xmm3); - xmm3 = _mm_add_ps(xmm4, xmm5); - - src0 += 4; - - xmm1 = _mm_add_ps(xmm2, xmm1); - xmm1 = _mm_add_ps(xmm3, xmm1); - } - xmm2 = _mm_hadd_ps(xmm9, xmm1); - xmm3 = _mm_hadd_ps(xmm2, xmm2); - xmm4 = _mm_hadd_ps(xmm3, xmm3); - _mm_store_ss(&result, xmm4); - - for(i = 0; i < leftovers; ++i) { - fst = *src0++; - fst = MAX(fst, *cutoff); - sq = fst * fst; - thrd = fst * sq; - frth = sq * sq; - result += (center_point_array[0] * fst + - center_point_array[1] * sq + - center_point_array[2] * thrd + - center_point_array[3] * frth); - } - - result += (float)(num_points) * center_point_array[4]; - *target = result; + float result = 0.0f; + float fst = 0.0f; + float sq = 0.0f; + float thrd = 0.0f; + float frth = 0.0f; + + __m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10; + + xmm9 = _mm_setzero_ps(); + xmm1 = _mm_setzero_ps(); + xmm0 = _mm_load1_ps(¢er_point_array[0]); + xmm6 = _mm_load1_ps(¢er_point_array[1]); + xmm7 = _mm_load1_ps(¢er_point_array[2]); + xmm8 = _mm_load1_ps(¢er_point_array[3]); + xmm10 = _mm_load1_ps(cutoff); + + int bound = num_points / 8; + int leftovers = num_points - 8 * bound; + int i = 0; + for (; i < bound; ++i) { + // 1st + xmm2 = _mm_load_ps(src0); + xmm2 = _mm_max_ps(xmm10, xmm2); + xmm3 = _mm_mul_ps(xmm2, xmm2); + xmm4 = _mm_mul_ps(xmm2, xmm3); + xmm5 = _mm_mul_ps(xmm3, xmm3); + + xmm2 = _mm_mul_ps(xmm2, xmm0); + xmm3 = _mm_mul_ps(xmm3, xmm6); + xmm4 = _mm_mul_ps(xmm4, xmm7); + xmm5 = _mm_mul_ps(xmm5, xmm8); + + xmm2 = _mm_add_ps(xmm2, xmm3); + xmm3 = _mm_add_ps(xmm4, xmm5); + + src0 += 4; + + xmm9 = _mm_add_ps(xmm2, xmm9); + xmm9 = _mm_add_ps(xmm3, xmm9); + + // 2nd + xmm2 = _mm_load_ps(src0); + xmm2 = _mm_max_ps(xmm10, xmm2); + xmm3 = _mm_mul_ps(xmm2, xmm2); + xmm4 = _mm_mul_ps(xmm2, xmm3); + xmm5 = _mm_mul_ps(xmm3, xmm3); + + xmm2 = 
_mm_mul_ps(xmm2, xmm0); + xmm3 = _mm_mul_ps(xmm3, xmm6); + xmm4 = _mm_mul_ps(xmm4, xmm7); + xmm5 = _mm_mul_ps(xmm5, xmm8); + + xmm2 = _mm_add_ps(xmm2, xmm3); + xmm3 = _mm_add_ps(xmm4, xmm5); + + src0 += 4; + + xmm1 = _mm_add_ps(xmm2, xmm1); + xmm1 = _mm_add_ps(xmm3, xmm1); + } + xmm2 = _mm_hadd_ps(xmm9, xmm1); + xmm3 = _mm_hadd_ps(xmm2, xmm2); + xmm4 = _mm_hadd_ps(xmm3, xmm3); + _mm_store_ss(&result, xmm4); + + for (i = 0; i < leftovers; ++i) { + fst = *src0++; + fst = MAX(fst, *cutoff); + sq = fst * fst; + thrd = fst * sq; + frth = sq * sq; + result += (center_point_array[0] * fst + center_point_array[1] * sq + + center_point_array[2] * thrd + center_point_array[3] * frth); + } + + result += (float)(num_points)*center_point_array[4]; + *target = result; } #endif /*LV_HAVE_SSE3*/ #if LV_HAVE_AVX && LV_HAVE_FMA -#include +#include -static inline void -volk_32f_x3_sum_of_poly_32f_a_avx2_fma(float* target, float* src0, float* center_point_array, - float* cutoff, unsigned int num_points) +static inline void volk_32f_x3_sum_of_poly_32f_a_avx2_fma(float* target, + float* src0, + float* center_point_array, + float* cutoff, + unsigned int num_points) { - const unsigned int eighth_points = num_points / 8; - float fst = 0.0; - float sq = 0.0; - float thrd = 0.0; - float frth = 0.0; - - __m256 cpa0, cpa1, cpa2, cpa3, cutoff_vec; - __m256 target_vec; - __m256 x_to_1, x_to_2, x_to_3, x_to_4; - - cpa0 = _mm256_set1_ps(center_point_array[0]); - cpa1 = _mm256_set1_ps(center_point_array[1]); - cpa2 = _mm256_set1_ps(center_point_array[2]); - cpa3 = _mm256_set1_ps(center_point_array[3]); - cutoff_vec = _mm256_set1_ps(*cutoff); - target_vec = _mm256_setzero_ps(); - - unsigned int i; - - for(i = 0; i < eighth_points; ++i) { - x_to_1 = _mm256_load_ps(src0); - x_to_1 = _mm256_max_ps(x_to_1, cutoff_vec); - x_to_2 = _mm256_mul_ps(x_to_1, x_to_1); // x^2 - x_to_3 = _mm256_mul_ps(x_to_1, x_to_2); // x^3 - // x^1 * x^3 is slightly faster than x^2 * x^2 - x_to_4 = _mm256_mul_ps(x_to_1, x_to_3); // x^4 - - x_to_2 = _mm256_mul_ps(x_to_2, cpa1); // cpa[1] * x^2 - x_to_4 = _mm256_mul_ps(x_to_4, cpa3); // cpa[3] * x^4 - - x_to_1 = _mm256_fmadd_ps(x_to_1, cpa0, x_to_2); - x_to_3 = _mm256_fmadd_ps(x_to_3, cpa2, x_to_4); - // this is slightly faster than result += (x_to_1 + x_to_3) - target_vec = _mm256_add_ps(x_to_1, target_vec); - target_vec = _mm256_add_ps(x_to_3, target_vec); - - src0 += 8; - } - - // the hadd for vector reduction has very very slight impact @ 50k iters - __VOLK_ATTR_ALIGNED(32) float temp_results[8]; - target_vec = _mm256_hadd_ps(target_vec, target_vec); // x0+x1 | x2+x3 | x0+x1 | x2+x3 || x4+x5 | x6+x7 | x4+x5 | x6+x7 - _mm256_store_ps(temp_results, target_vec); - *target = temp_results[0] + temp_results[1] + temp_results[4] + temp_results[5]; - - for(i = eighth_points*8; i < num_points; ++i) { - fst = *src0++; - fst = MAX(fst, *cutoff); - sq = fst * fst; - thrd = fst * sq; - frth = sq * sq; - *target += (center_point_array[0] * fst + - center_point_array[1] * sq + - center_point_array[2] * thrd + - center_point_array[3] * frth); - } - *target += (float)(num_points) * center_point_array[4]; + const unsigned int eighth_points = num_points / 8; + float fst = 0.0; + float sq = 0.0; + float thrd = 0.0; + float frth = 0.0; + + __m256 cpa0, cpa1, cpa2, cpa3, cutoff_vec; + __m256 target_vec; + __m256 x_to_1, x_to_2, x_to_3, x_to_4; + + cpa0 = _mm256_set1_ps(center_point_array[0]); + cpa1 = _mm256_set1_ps(center_point_array[1]); + cpa2 = _mm256_set1_ps(center_point_array[2]); + cpa3 = 
_mm256_set1_ps(center_point_array[3]); + cutoff_vec = _mm256_set1_ps(*cutoff); + target_vec = _mm256_setzero_ps(); + + unsigned int i; + + for (i = 0; i < eighth_points; ++i) { + x_to_1 = _mm256_load_ps(src0); + x_to_1 = _mm256_max_ps(x_to_1, cutoff_vec); + x_to_2 = _mm256_mul_ps(x_to_1, x_to_1); // x^2 + x_to_3 = _mm256_mul_ps(x_to_1, x_to_2); // x^3 + // x^1 * x^3 is slightly faster than x^2 * x^2 + x_to_4 = _mm256_mul_ps(x_to_1, x_to_3); // x^4 + + x_to_2 = _mm256_mul_ps(x_to_2, cpa1); // cpa[1] * x^2 + x_to_4 = _mm256_mul_ps(x_to_4, cpa3); // cpa[3] * x^4 + + x_to_1 = _mm256_fmadd_ps(x_to_1, cpa0, x_to_2); + x_to_3 = _mm256_fmadd_ps(x_to_3, cpa2, x_to_4); + // this is slightly faster than result += (x_to_1 + x_to_3) + target_vec = _mm256_add_ps(x_to_1, target_vec); + target_vec = _mm256_add_ps(x_to_3, target_vec); + + src0 += 8; + } + + // the hadd for vector reduction has very very slight impact @ 50k iters + __VOLK_ATTR_ALIGNED(32) float temp_results[8]; + target_vec = _mm256_hadd_ps( + target_vec, + target_vec); // x0+x1 | x2+x3 | x0+x1 | x2+x3 || x4+x5 | x6+x7 | x4+x5 | x6+x7 + _mm256_store_ps(temp_results, target_vec); + *target = temp_results[0] + temp_results[1] + temp_results[4] + temp_results[5]; + + for (i = eighth_points * 8; i < num_points; ++i) { + fst = *src0++; + fst = MAX(fst, *cutoff); + sq = fst * fst; + thrd = fst * sq; + frth = sq * sq; + *target += (center_point_array[0] * fst + center_point_array[1] * sq + + center_point_array[2] * thrd + center_point_array[3] * frth); + } + *target += (float)(num_points)*center_point_array[4]; } #endif // LV_HAVE_AVX && LV_HAVE_FMA #ifdef LV_HAVE_AVX -#include +#include -static inline void -volk_32f_x3_sum_of_poly_32f_a_avx(float* target, float* src0, float* center_point_array, - float* cutoff, unsigned int num_points) +static inline void volk_32f_x3_sum_of_poly_32f_a_avx(float* target, + float* src0, + float* center_point_array, + float* cutoff, + unsigned int num_points) { - const unsigned int eighth_points = num_points / 8; - float fst = 0.0; - float sq = 0.0; - float thrd = 0.0; - float frth = 0.0; - - __m256 cpa0, cpa1, cpa2, cpa3, cutoff_vec; - __m256 target_vec; - __m256 x_to_1, x_to_2, x_to_3, x_to_4; - - cpa0 = _mm256_set1_ps(center_point_array[0]); - cpa1 = _mm256_set1_ps(center_point_array[1]); - cpa2 = _mm256_set1_ps(center_point_array[2]); - cpa3 = _mm256_set1_ps(center_point_array[3]); - cutoff_vec = _mm256_set1_ps(*cutoff); - target_vec = _mm256_setzero_ps(); - - unsigned int i; - - for(i = 0; i < eighth_points; ++i) { - x_to_1 = _mm256_load_ps(src0); - x_to_1 = _mm256_max_ps(x_to_1, cutoff_vec); - x_to_2 = _mm256_mul_ps(x_to_1, x_to_1); // x^2 - x_to_3 = _mm256_mul_ps(x_to_1, x_to_2); // x^3 - // x^1 * x^3 is slightly faster than x^2 * x^2 - x_to_4 = _mm256_mul_ps(x_to_1, x_to_3); // x^4 - - x_to_1 = _mm256_mul_ps(x_to_1, cpa0); // cpa[0] * x^1 - x_to_2 = _mm256_mul_ps(x_to_2, cpa1); // cpa[1] * x^2 - x_to_3 = _mm256_mul_ps(x_to_3, cpa2); // cpa[2] * x^3 - x_to_4 = _mm256_mul_ps(x_to_4, cpa3); // cpa[3] * x^4 - - x_to_1 = _mm256_add_ps(x_to_1, x_to_2); - x_to_3 = _mm256_add_ps(x_to_3, x_to_4); - // this is slightly faster than result += (x_to_1 + x_to_3) - target_vec = _mm256_add_ps(x_to_1, target_vec); - target_vec = _mm256_add_ps(x_to_3, target_vec); - - src0 += 8; - } - - // the hadd for vector reduction has very very slight impact @ 50k iters - __VOLK_ATTR_ALIGNED(32) float temp_results[8]; - target_vec = _mm256_hadd_ps(target_vec, target_vec); // x0+x1 | x2+x3 | x0+x1 | x2+x3 || x4+x5 | x6+x7 | x4+x5 | x6+x7 
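The lane layout in the comment above is what makes the scalar gather that follows work: _mm256_hadd_ps adds adjacent pairs within each 128-bit half, so after hadd(v, v) only elements 0, 1, 4 and 5 of the stored vector are distinct partial sums. A scalar model of that step (illustration only; v holds the eight accumulator lanes):

/* Mirrors _mm256_hadd_ps(v, v) followed by the temp_results[0]+[1]+[4]+[5]
 * gather: the return value equals the full sum v[0] + ... + v[7]. */
static float hadd_self_reduce(const float v[8])
{
    float t[8] = { v[0] + v[1], v[2] + v[3], v[0] + v[1], v[2] + v[3],
                   v[4] + v[5], v[6] + v[7], v[4] + v[5], v[6] + v[7] };
    return t[0] + t[1] + t[4] + t[5];
}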
- _mm256_store_ps(temp_results, target_vec); - *target = temp_results[0] + temp_results[1] + temp_results[4] + temp_results[5]; - - for(i = eighth_points*8; i < num_points; ++i) { - fst = *src0++; - fst = MAX(fst, *cutoff); - sq = fst * fst; - thrd = fst * sq; - frth = sq * sq; - *target += (center_point_array[0] * fst + - center_point_array[1] * sq + - center_point_array[2] * thrd + - center_point_array[3] * frth); - } - *target += (float)(num_points) * center_point_array[4]; + const unsigned int eighth_points = num_points / 8; + float fst = 0.0; + float sq = 0.0; + float thrd = 0.0; + float frth = 0.0; + + __m256 cpa0, cpa1, cpa2, cpa3, cutoff_vec; + __m256 target_vec; + __m256 x_to_1, x_to_2, x_to_3, x_to_4; + + cpa0 = _mm256_set1_ps(center_point_array[0]); + cpa1 = _mm256_set1_ps(center_point_array[1]); + cpa2 = _mm256_set1_ps(center_point_array[2]); + cpa3 = _mm256_set1_ps(center_point_array[3]); + cutoff_vec = _mm256_set1_ps(*cutoff); + target_vec = _mm256_setzero_ps(); + + unsigned int i; + + for (i = 0; i < eighth_points; ++i) { + x_to_1 = _mm256_load_ps(src0); + x_to_1 = _mm256_max_ps(x_to_1, cutoff_vec); + x_to_2 = _mm256_mul_ps(x_to_1, x_to_1); // x^2 + x_to_3 = _mm256_mul_ps(x_to_1, x_to_2); // x^3 + // x^1 * x^3 is slightly faster than x^2 * x^2 + x_to_4 = _mm256_mul_ps(x_to_1, x_to_3); // x^4 + + x_to_1 = _mm256_mul_ps(x_to_1, cpa0); // cpa[0] * x^1 + x_to_2 = _mm256_mul_ps(x_to_2, cpa1); // cpa[1] * x^2 + x_to_3 = _mm256_mul_ps(x_to_3, cpa2); // cpa[2] * x^3 + x_to_4 = _mm256_mul_ps(x_to_4, cpa3); // cpa[3] * x^4 + + x_to_1 = _mm256_add_ps(x_to_1, x_to_2); + x_to_3 = _mm256_add_ps(x_to_3, x_to_4); + // this is slightly faster than result += (x_to_1 + x_to_3) + target_vec = _mm256_add_ps(x_to_1, target_vec); + target_vec = _mm256_add_ps(x_to_3, target_vec); + + src0 += 8; + } + + // the hadd for vector reduction has very very slight impact @ 50k iters + __VOLK_ATTR_ALIGNED(32) float temp_results[8]; + target_vec = _mm256_hadd_ps( + target_vec, + target_vec); // x0+x1 | x2+x3 | x0+x1 | x2+x3 || x4+x5 | x6+x7 | x4+x5 | x6+x7 + _mm256_store_ps(temp_results, target_vec); + *target = temp_results[0] + temp_results[1] + temp_results[4] + temp_results[5]; + + for (i = eighth_points * 8; i < num_points; ++i) { + fst = *src0++; + fst = MAX(fst, *cutoff); + sq = fst * fst; + thrd = fst * sq; + frth = sq * sq; + *target += (center_point_array[0] * fst + center_point_array[1] * sq + + center_point_array[2] * thrd + center_point_array[3] * frth); + } + *target += (float)(num_points)*center_point_array[4]; } #endif // LV_HAVE_AVX - #ifdef LV_HAVE_GENERIC -static inline void -volk_32f_x3_sum_of_poly_32f_generic(float* target, float* src0, float* center_point_array, - float* cutoff, unsigned int num_points) +static inline void volk_32f_x3_sum_of_poly_32f_generic(float* target, + float* src0, + float* center_point_array, + float* cutoff, + unsigned int num_points) { - const unsigned int eighth_points = num_points / 8; - - float result[8] = {0.0f,0.0f,0.0f,0.0f, 0.0f,0.0f,0.0f,0.0f}; - float fst = 0.0f; - float sq = 0.0f; - float thrd = 0.0f; - float frth = 0.0f; - - unsigned int i = 0; - unsigned int k = 0; - for(i = 0; i < eighth_points; ++i) { - for(k = 0; k < 8; ++k) { - fst = *src0++; - fst = MAX(fst, *cutoff); - sq = fst * fst; - thrd = fst * sq; - frth = fst * thrd; - result[k] += center_point_array[0] * fst + center_point_array[1] * sq; - result[k] += center_point_array[2] * thrd + center_point_array[3] * frth; + const unsigned int eighth_points = num_points / 8; + + float result[8] = 
{ 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f }; + float fst = 0.0f; + float sq = 0.0f; + float thrd = 0.0f; + float frth = 0.0f; + + unsigned int i = 0; + unsigned int k = 0; + for (i = 0; i < eighth_points; ++i) { + for (k = 0; k < 8; ++k) { + fst = *src0++; + fst = MAX(fst, *cutoff); + sq = fst * fst; + thrd = fst * sq; + frth = fst * thrd; + result[k] += center_point_array[0] * fst + center_point_array[1] * sq; + result[k] += center_point_array[2] * thrd + center_point_array[3] * frth; + } } - } - for(k = 0; k < 8; k+=2) - result[k] = result[k]+result[k+1]; - - *target = result[0] + result[2] + result[4] + result[6]; - - for(i = eighth_points*8; i < num_points; ++i) { - fst = *src0++; - fst = MAX(fst, *cutoff); - sq = fst * fst; - thrd = fst * sq; - frth = fst * thrd; - *target += (center_point_array[0] * fst + - center_point_array[1] * sq + - center_point_array[2] * thrd + - center_point_array[3] * frth); - } - *target += (float)(num_points) * center_point_array[4]; + for (k = 0; k < 8; k += 2) + result[k] = result[k] + result[k + 1]; + + *target = result[0] + result[2] + result[4] + result[6]; + + for (i = eighth_points * 8; i < num_points; ++i) { + fst = *src0++; + fst = MAX(fst, *cutoff); + sq = fst * fst; + thrd = fst * sq; + frth = fst * thrd; + *target += (center_point_array[0] * fst + center_point_array[1] * sq + + center_point_array[2] * thrd + center_point_array[3] * frth); + } + *target += (float)(num_points)*center_point_array[4]; } #endif /*LV_HAVE_GENERIC*/ @@ -372,51 +377,52 @@ volk_32f_x3_sum_of_poly_32f_generic(float* target, float* src0, float* center_po #include static inline void -volk_32f_x3_sum_of_poly_32f_a_neon(float* __restrict target, float* __restrict src0, +volk_32f_x3_sum_of_poly_32f_a_neon(float* __restrict target, + float* __restrict src0, float* __restrict center_point_array, - float* __restrict cutoff, unsigned int num_points) + float* __restrict cutoff, + unsigned int num_points) { - unsigned int i; - float zero[4] = {0.0f, 0.0f, 0.0f, 0.0f }; - - float32x2_t x_to_1, x_to_2, x_to_3, x_to_4; - float32x2_t cutoff_vector; - float32x2x2_t x_low, x_high; - float32x4_t x_qvector, c_qvector, cpa_qvector; - float accumulator; - float res_accumulators[4]; - - c_qvector = vld1q_f32( zero ); - // load the cutoff in to a vector - cutoff_vector = vdup_n_f32( *cutoff ); - // ... center point array - cpa_qvector = vld1q_f32( center_point_array ); - - for(i=0; i < num_points; ++i) { - // load x (src0) - x_to_1 = vdup_n_f32( *src0++ ); - - // Get a vector of max(src0, cutoff) - x_to_1 = vmax_f32(x_to_1, cutoff_vector ); // x^1 - x_to_2 = vmul_f32(x_to_1, x_to_1); // x^2 - x_to_3 = vmul_f32(x_to_2, x_to_1); // x^3 - x_to_4 = vmul_f32(x_to_3, x_to_1); // x^4 - // zip up doubles to interleave - x_low = vzip_f32(x_to_1, x_to_2); // [x^2 | x^1 || x^2 | x^1] - x_high = vzip_f32(x_to_3, x_to_4); // [x^4 | x^3 || x^4 | x^3] - // float32x4_t vcombine_f32(float32x2_t low, float32x2_t high); // VMOV d0,d0 - x_qvector = vcombine_f32(x_low.val[0], x_high.val[0]); - // now we finally have [x^4 | x^3 | x^2 | x] ! 
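The generic loop above and the NEON kernel here evaluate the same clamped quartic, just with different data layouts. A condensed scalar restatement (a sketch assuming the usual five-entry center_point_array; not part of the patch):

/* Clamp each sample to the cutoff, accumulate c0*x + c1*x^2 + c2*x^3 + c3*x^4,
 * then add the constant term c4 once per point, as the kernels above do. */
static float sum_of_poly_ref(
    const float* x, const float* c, float cutoff, unsigned int num_points)
{
    float acc = 0.0f;
    for (unsigned int i = 0; i < num_points; i++) {
        float v = (x[i] > cutoff) ? x[i] : cutoff;
        float v2 = v * v;
        acc += c[0] * v + c[1] * v2 + c[2] * v2 * v + c[3] * v2 * v2;
    }
    return acc + (float)num_points * c[4];
}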
- - c_qvector = vmlaq_f32(c_qvector, x_qvector, cpa_qvector); - - } - // there should be better vector reduction techniques - vst1q_f32(res_accumulators, c_qvector ); - accumulator = res_accumulators[0] + res_accumulators[1] + - res_accumulators[2] + res_accumulators[3]; - - *target = accumulator + (float)num_points * center_point_array[4]; + unsigned int i; + float zero[4] = { 0.0f, 0.0f, 0.0f, 0.0f }; + + float32x2_t x_to_1, x_to_2, x_to_3, x_to_4; + float32x2_t cutoff_vector; + float32x2x2_t x_low, x_high; + float32x4_t x_qvector, c_qvector, cpa_qvector; + float accumulator; + float res_accumulators[4]; + + c_qvector = vld1q_f32(zero); + // load the cutoff in to a vector + cutoff_vector = vdup_n_f32(*cutoff); + // ... center point array + cpa_qvector = vld1q_f32(center_point_array); + + for (i = 0; i < num_points; ++i) { + // load x (src0) + x_to_1 = vdup_n_f32(*src0++); + + // Get a vector of max(src0, cutoff) + x_to_1 = vmax_f32(x_to_1, cutoff_vector); // x^1 + x_to_2 = vmul_f32(x_to_1, x_to_1); // x^2 + x_to_3 = vmul_f32(x_to_2, x_to_1); // x^3 + x_to_4 = vmul_f32(x_to_3, x_to_1); // x^4 + // zip up doubles to interleave + x_low = vzip_f32(x_to_1, x_to_2); // [x^2 | x^1 || x^2 | x^1] + x_high = vzip_f32(x_to_3, x_to_4); // [x^4 | x^3 || x^4 | x^3] + // float32x4_t vcombine_f32(float32x2_t low, float32x2_t high); // VMOV d0,d0 + x_qvector = vcombine_f32(x_low.val[0], x_high.val[0]); + // now we finally have [x^4 | x^3 | x^2 | x] ! + + c_qvector = vmlaq_f32(c_qvector, x_qvector, cpa_qvector); + } + // there should be better vector reduction techniques + vst1q_f32(res_accumulators, c_qvector); + accumulator = res_accumulators[0] + res_accumulators[1] + res_accumulators[2] + + res_accumulators[3]; + + *target = accumulator + (float)num_points * center_point_array[4]; } #endif /* LV_HAVE_NEON */ @@ -425,82 +431,82 @@ volk_32f_x3_sum_of_poly_32f_a_neon(float* __restrict target, float* __restrict s #ifdef LV_HAVE_NEON static inline void -volk_32f_x3_sum_of_poly_32f_neonvert(float* __restrict target, float* __restrict src0, +volk_32f_x3_sum_of_poly_32f_neonvert(float* __restrict target, + float* __restrict src0, float* __restrict center_point_array, - float* __restrict cutoff, unsigned int num_points) + float* __restrict cutoff, + unsigned int num_points) { - unsigned int i; - float zero[4] = {0.0f, 0.0f, 0.0f, 0.0f }; - - float accumulator; - - float32x4_t accumulator1_vec, accumulator2_vec, accumulator3_vec, accumulator4_vec; - accumulator1_vec = vld1q_f32(zero); - accumulator2_vec = vld1q_f32(zero); - accumulator3_vec = vld1q_f32(zero); - accumulator4_vec = vld1q_f32(zero); - float32x4_t x_to_1, x_to_2, x_to_3, x_to_4; - float32x4_t cutoff_vector, cpa_0, cpa_1, cpa_2, cpa_3; - - // load the cutoff in to a vector - cutoff_vector = vdupq_n_f32( *cutoff ); - // ... 
center point array - cpa_0 = vdupq_n_f32(center_point_array[0]); - cpa_1 = vdupq_n_f32(center_point_array[1]); - cpa_2 = vdupq_n_f32(center_point_array[2]); - cpa_3 = vdupq_n_f32(center_point_array[3]); - - // nathan is not sure why this is slower *and* wrong compared to neonvertfma - for(i=0; i < num_points/4; ++i) { - // load x - x_to_1 = vld1q_f32( src0 ); - - // Get a vector of max(src0, cutoff) - x_to_1 = vmaxq_f32(x_to_1, cutoff_vector ); // x^1 - x_to_2 = vmulq_f32(x_to_1, x_to_1); // x^2 - x_to_3 = vmulq_f32(x_to_2, x_to_1); // x^3 - x_to_4 = vmulq_f32(x_to_3, x_to_1); // x^4 - x_to_1 = vmulq_f32(x_to_1, cpa_0); - x_to_2 = vmulq_f32(x_to_2, cpa_1); - x_to_3 = vmulq_f32(x_to_3, cpa_2); - x_to_4 = vmulq_f32(x_to_4, cpa_3); - accumulator1_vec = vaddq_f32(accumulator1_vec, x_to_1); - accumulator2_vec = vaddq_f32(accumulator2_vec, x_to_2); - accumulator3_vec = vaddq_f32(accumulator3_vec, x_to_3); - accumulator4_vec = vaddq_f32(accumulator4_vec, x_to_4); - - src0 += 4; - } - accumulator1_vec = vaddq_f32(accumulator1_vec, accumulator2_vec); - accumulator3_vec = vaddq_f32(accumulator3_vec, accumulator4_vec); - accumulator1_vec = vaddq_f32(accumulator1_vec, accumulator3_vec); - - __VOLK_ATTR_ALIGNED(32) float res_accumulators[4]; - vst1q_f32(res_accumulators, accumulator1_vec ); - accumulator = res_accumulators[0] + res_accumulators[1] + - res_accumulators[2] + res_accumulators[3]; - - float fst = 0.0; - float sq = 0.0; - float thrd = 0.0; - float frth = 0.0; - - for(i = 4*num_points/4; i < num_points; ++i) { - fst = src0[i]; - fst = MAX(fst, *cutoff); - - sq = fst * fst; - thrd = fst * sq; - frth = sq * sq; - //fith = sq * thrd; - - accumulator += (center_point_array[0] * fst + - center_point_array[1] * sq + - center_point_array[2] * thrd + - center_point_array[3] * frth); //+ - } - - *target = accumulator + (float)num_points * center_point_array[4]; + unsigned int i; + float zero[4] = { 0.0f, 0.0f, 0.0f, 0.0f }; + + float accumulator; + + float32x4_t accumulator1_vec, accumulator2_vec, accumulator3_vec, accumulator4_vec; + accumulator1_vec = vld1q_f32(zero); + accumulator2_vec = vld1q_f32(zero); + accumulator3_vec = vld1q_f32(zero); + accumulator4_vec = vld1q_f32(zero); + float32x4_t x_to_1, x_to_2, x_to_3, x_to_4; + float32x4_t cutoff_vector, cpa_0, cpa_1, cpa_2, cpa_3; + + // load the cutoff in to a vector + cutoff_vector = vdupq_n_f32(*cutoff); + // ... 
center point array + cpa_0 = vdupq_n_f32(center_point_array[0]); + cpa_1 = vdupq_n_f32(center_point_array[1]); + cpa_2 = vdupq_n_f32(center_point_array[2]); + cpa_3 = vdupq_n_f32(center_point_array[3]); + + // nathan is not sure why this is slower *and* wrong compared to neonvertfma + for (i = 0; i < num_points / 4; ++i) { + // load x + x_to_1 = vld1q_f32(src0); + + // Get a vector of max(src0, cutoff) + x_to_1 = vmaxq_f32(x_to_1, cutoff_vector); // x^1 + x_to_2 = vmulq_f32(x_to_1, x_to_1); // x^2 + x_to_3 = vmulq_f32(x_to_2, x_to_1); // x^3 + x_to_4 = vmulq_f32(x_to_3, x_to_1); // x^4 + x_to_1 = vmulq_f32(x_to_1, cpa_0); + x_to_2 = vmulq_f32(x_to_2, cpa_1); + x_to_3 = vmulq_f32(x_to_3, cpa_2); + x_to_4 = vmulq_f32(x_to_4, cpa_3); + accumulator1_vec = vaddq_f32(accumulator1_vec, x_to_1); + accumulator2_vec = vaddq_f32(accumulator2_vec, x_to_2); + accumulator3_vec = vaddq_f32(accumulator3_vec, x_to_3); + accumulator4_vec = vaddq_f32(accumulator4_vec, x_to_4); + + src0 += 4; + } + accumulator1_vec = vaddq_f32(accumulator1_vec, accumulator2_vec); + accumulator3_vec = vaddq_f32(accumulator3_vec, accumulator4_vec); + accumulator1_vec = vaddq_f32(accumulator1_vec, accumulator3_vec); + + __VOLK_ATTR_ALIGNED(32) float res_accumulators[4]; + vst1q_f32(res_accumulators, accumulator1_vec); + accumulator = res_accumulators[0] + res_accumulators[1] + res_accumulators[2] + + res_accumulators[3]; + + float fst = 0.0; + float sq = 0.0; + float thrd = 0.0; + float frth = 0.0; + + for (i = 4 * num_points / 4; i < num_points; ++i) { + fst = src0[i]; + fst = MAX(fst, *cutoff); + + sq = fst * fst; + thrd = fst * sq; + frth = sq * sq; + // fith = sq * thrd; + + accumulator += (center_point_array[0] * fst + center_point_array[1] * sq + + center_point_array[2] * thrd + center_point_array[3] * frth); //+ + } + + *target = accumulator + (float)num_points * center_point_array[4]; } #endif /* LV_HAVE_NEON */ @@ -510,150 +516,154 @@ volk_32f_x3_sum_of_poly_32f_neonvert(float* __restrict target, float* __restrict #ifndef INCLUDED_volk_32f_x3_sum_of_poly_32f_u_H #define INCLUDED_volk_32f_x3_sum_of_poly_32f_u_H -#include -#include -#include +#include +#include +#include #ifndef MAX -#define MAX(X,Y) ((X) > (Y)?(X):(Y)) +#define MAX(X, Y) ((X) > (Y) ? 
(X) : (Y)) #endif #if LV_HAVE_AVX && LV_HAVE_FMA -#include +#include -static inline void -volk_32f_x3_sum_of_poly_32f_u_avx_fma(float* target, float* src0, float* center_point_array, - float* cutoff, unsigned int num_points) +static inline void volk_32f_x3_sum_of_poly_32f_u_avx_fma(float* target, + float* src0, + float* center_point_array, + float* cutoff, + unsigned int num_points) { - const unsigned int eighth_points = num_points / 8; - float fst = 0.0; - float sq = 0.0; - float thrd = 0.0; - float frth = 0.0; - - __m256 cpa0, cpa1, cpa2, cpa3, cutoff_vec; - __m256 target_vec; - __m256 x_to_1, x_to_2, x_to_3, x_to_4; - - cpa0 = _mm256_set1_ps(center_point_array[0]); - cpa1 = _mm256_set1_ps(center_point_array[1]); - cpa2 = _mm256_set1_ps(center_point_array[2]); - cpa3 = _mm256_set1_ps(center_point_array[3]); - cutoff_vec = _mm256_set1_ps(*cutoff); - target_vec = _mm256_setzero_ps(); - - unsigned int i; - - for(i = 0; i < eighth_points; ++i) { - x_to_1 = _mm256_loadu_ps(src0); - x_to_1 = _mm256_max_ps(x_to_1, cutoff_vec); - x_to_2 = _mm256_mul_ps(x_to_1, x_to_1); // x^2 - x_to_3 = _mm256_mul_ps(x_to_1, x_to_2); // x^3 - // x^1 * x^3 is slightly faster than x^2 * x^2 - x_to_4 = _mm256_mul_ps(x_to_1, x_to_3); // x^4 - - x_to_2 = _mm256_mul_ps(x_to_2, cpa1); // cpa[1] * x^2 - x_to_4 = _mm256_mul_ps(x_to_4, cpa3); // cpa[3] * x^4 - - x_to_1 = _mm256_fmadd_ps(x_to_1, cpa0, x_to_2); - x_to_3 = _mm256_fmadd_ps(x_to_3, cpa2, x_to_4); - // this is slightly faster than result += (x_to_1 + x_to_3) - target_vec = _mm256_add_ps(x_to_1, target_vec); - target_vec = _mm256_add_ps(x_to_3, target_vec); - - src0 += 8; - } - - // the hadd for vector reduction has very very slight impact @ 50k iters - __VOLK_ATTR_ALIGNED(32) float temp_results[8]; - target_vec = _mm256_hadd_ps(target_vec, target_vec); // x0+x1 | x2+x3 | x0+x1 | x2+x3 || x4+x5 | x6+x7 | x4+x5 | x6+x7 - _mm256_storeu_ps(temp_results, target_vec); - *target = temp_results[0] + temp_results[1] + temp_results[4] + temp_results[5]; - - for(i = eighth_points*8; i < num_points; ++i) { - fst = *src0++; - fst = MAX(fst, *cutoff); - sq = fst * fst; - thrd = fst * sq; - frth = sq * sq; - *target += (center_point_array[0] * fst + - center_point_array[1] * sq + - center_point_array[2] * thrd + - center_point_array[3] * frth); - } - - *target += (float)(num_points) * center_point_array[4]; + const unsigned int eighth_points = num_points / 8; + float fst = 0.0; + float sq = 0.0; + float thrd = 0.0; + float frth = 0.0; + + __m256 cpa0, cpa1, cpa2, cpa3, cutoff_vec; + __m256 target_vec; + __m256 x_to_1, x_to_2, x_to_3, x_to_4; + + cpa0 = _mm256_set1_ps(center_point_array[0]); + cpa1 = _mm256_set1_ps(center_point_array[1]); + cpa2 = _mm256_set1_ps(center_point_array[2]); + cpa3 = _mm256_set1_ps(center_point_array[3]); + cutoff_vec = _mm256_set1_ps(*cutoff); + target_vec = _mm256_setzero_ps(); + + unsigned int i; + + for (i = 0; i < eighth_points; ++i) { + x_to_1 = _mm256_loadu_ps(src0); + x_to_1 = _mm256_max_ps(x_to_1, cutoff_vec); + x_to_2 = _mm256_mul_ps(x_to_1, x_to_1); // x^2 + x_to_3 = _mm256_mul_ps(x_to_1, x_to_2); // x^3 + // x^1 * x^3 is slightly faster than x^2 * x^2 + x_to_4 = _mm256_mul_ps(x_to_1, x_to_3); // x^4 + + x_to_2 = _mm256_mul_ps(x_to_2, cpa1); // cpa[1] * x^2 + x_to_4 = _mm256_mul_ps(x_to_4, cpa3); // cpa[3] * x^4 + + x_to_1 = _mm256_fmadd_ps(x_to_1, cpa0, x_to_2); + x_to_3 = _mm256_fmadd_ps(x_to_3, cpa2, x_to_4); + // this is slightly faster than result += (x_to_1 + x_to_3) + target_vec = _mm256_add_ps(x_to_1, target_vec); + target_vec = 
_mm256_add_ps(x_to_3, target_vec); + + src0 += 8; + } + + // the hadd for vector reduction has very very slight impact @ 50k iters + __VOLK_ATTR_ALIGNED(32) float temp_results[8]; + target_vec = _mm256_hadd_ps( + target_vec, + target_vec); // x0+x1 | x2+x3 | x0+x1 | x2+x3 || x4+x5 | x6+x7 | x4+x5 | x6+x7 + _mm256_storeu_ps(temp_results, target_vec); + *target = temp_results[0] + temp_results[1] + temp_results[4] + temp_results[5]; + + for (i = eighth_points * 8; i < num_points; ++i) { + fst = *src0++; + fst = MAX(fst, *cutoff); + sq = fst * fst; + thrd = fst * sq; + frth = sq * sq; + *target += (center_point_array[0] * fst + center_point_array[1] * sq + + center_point_array[2] * thrd + center_point_array[3] * frth); + } + + *target += (float)(num_points)*center_point_array[4]; } #endif // LV_HAVE_AVX && LV_HAVE_FMA #ifdef LV_HAVE_AVX -#include +#include -static inline void -volk_32f_x3_sum_of_poly_32f_u_avx(float* target, float* src0, float* center_point_array, - float* cutoff, unsigned int num_points) +static inline void volk_32f_x3_sum_of_poly_32f_u_avx(float* target, + float* src0, + float* center_point_array, + float* cutoff, + unsigned int num_points) { - const unsigned int eighth_points = num_points / 8; - float fst = 0.0; - float sq = 0.0; - float thrd = 0.0; - float frth = 0.0; - - __m256 cpa0, cpa1, cpa2, cpa3, cutoff_vec; - __m256 target_vec; - __m256 x_to_1, x_to_2, x_to_3, x_to_4; - - cpa0 = _mm256_set1_ps(center_point_array[0]); - cpa1 = _mm256_set1_ps(center_point_array[1]); - cpa2 = _mm256_set1_ps(center_point_array[2]); - cpa3 = _mm256_set1_ps(center_point_array[3]); - cutoff_vec = _mm256_set1_ps(*cutoff); - target_vec = _mm256_setzero_ps(); - - unsigned int i; - - for(i = 0; i < eighth_points; ++i) { - x_to_1 = _mm256_loadu_ps(src0); - x_to_1 = _mm256_max_ps(x_to_1, cutoff_vec); - x_to_2 = _mm256_mul_ps(x_to_1, x_to_1); // x^2 - x_to_3 = _mm256_mul_ps(x_to_1, x_to_2); // x^3 - // x^1 * x^3 is slightly faster than x^2 * x^2 - x_to_4 = _mm256_mul_ps(x_to_1, x_to_3); // x^4 - - x_to_1 = _mm256_mul_ps(x_to_1, cpa0); // cpa[0] * x^1 - x_to_2 = _mm256_mul_ps(x_to_2, cpa1); // cpa[1] * x^2 - x_to_3 = _mm256_mul_ps(x_to_3, cpa2); // cpa[2] * x^3 - x_to_4 = _mm256_mul_ps(x_to_4, cpa3); // cpa[3] * x^4 - - x_to_1 = _mm256_add_ps(x_to_1, x_to_2); - x_to_3 = _mm256_add_ps(x_to_3, x_to_4); - // this is slightly faster than result += (x_to_1 + x_to_3) - target_vec = _mm256_add_ps(x_to_1, target_vec); - target_vec = _mm256_add_ps(x_to_3, target_vec); - - src0 += 8; - } - - // the hadd for vector reduction has very very slight impact @ 50k iters - __VOLK_ATTR_ALIGNED(32) float temp_results[8]; - target_vec = _mm256_hadd_ps(target_vec, target_vec); // x0+x1 | x2+x3 | x0+x1 | x2+x3 || x4+x5 | x6+x7 | x4+x5 | x6+x7 - _mm256_storeu_ps(temp_results, target_vec); - *target = temp_results[0] + temp_results[1] + temp_results[4] + temp_results[5]; - - for(i = eighth_points*8; i < num_points; ++i) { - fst = *src0++; - fst = MAX(fst, *cutoff); - sq = fst * fst; - thrd = fst * sq; - frth = sq * sq; - - *target += (center_point_array[0] * fst + - center_point_array[1] * sq + - center_point_array[2] * thrd + - center_point_array[3] * frth); - } - - *target += (float)(num_points) * center_point_array[4]; + const unsigned int eighth_points = num_points / 8; + float fst = 0.0; + float sq = 0.0; + float thrd = 0.0; + float frth = 0.0; + + __m256 cpa0, cpa1, cpa2, cpa3, cutoff_vec; + __m256 target_vec; + __m256 x_to_1, x_to_2, x_to_3, x_to_4; + + cpa0 = _mm256_set1_ps(center_point_array[0]); + cpa1 = 
_mm256_set1_ps(center_point_array[1]); + cpa2 = _mm256_set1_ps(center_point_array[2]); + cpa3 = _mm256_set1_ps(center_point_array[3]); + cutoff_vec = _mm256_set1_ps(*cutoff); + target_vec = _mm256_setzero_ps(); + + unsigned int i; + + for (i = 0; i < eighth_points; ++i) { + x_to_1 = _mm256_loadu_ps(src0); + x_to_1 = _mm256_max_ps(x_to_1, cutoff_vec); + x_to_2 = _mm256_mul_ps(x_to_1, x_to_1); // x^2 + x_to_3 = _mm256_mul_ps(x_to_1, x_to_2); // x^3 + // x^1 * x^3 is slightly faster than x^2 * x^2 + x_to_4 = _mm256_mul_ps(x_to_1, x_to_3); // x^4 + + x_to_1 = _mm256_mul_ps(x_to_1, cpa0); // cpa[0] * x^1 + x_to_2 = _mm256_mul_ps(x_to_2, cpa1); // cpa[1] * x^2 + x_to_3 = _mm256_mul_ps(x_to_3, cpa2); // cpa[2] * x^3 + x_to_4 = _mm256_mul_ps(x_to_4, cpa3); // cpa[3] * x^4 + + x_to_1 = _mm256_add_ps(x_to_1, x_to_2); + x_to_3 = _mm256_add_ps(x_to_3, x_to_4); + // this is slightly faster than result += (x_to_1 + x_to_3) + target_vec = _mm256_add_ps(x_to_1, target_vec); + target_vec = _mm256_add_ps(x_to_3, target_vec); + + src0 += 8; + } + + // the hadd for vector reduction has very very slight impact @ 50k iters + __VOLK_ATTR_ALIGNED(32) float temp_results[8]; + target_vec = _mm256_hadd_ps( + target_vec, + target_vec); // x0+x1 | x2+x3 | x0+x1 | x2+x3 || x4+x5 | x6+x7 | x4+x5 | x6+x7 + _mm256_storeu_ps(temp_results, target_vec); + *target = temp_results[0] + temp_results[1] + temp_results[4] + temp_results[5]; + + for (i = eighth_points * 8; i < num_points; ++i) { + fst = *src0++; + fst = MAX(fst, *cutoff); + sq = fst * fst; + thrd = fst * sq; + frth = sq * sq; + + *target += (center_point_array[0] * fst + center_point_array[1] * sq + + center_point_array[2] * thrd + center_point_array[3] * frth); + } + + *target += (float)(num_points)*center_point_array[4]; } #endif // LV_HAVE_AVX diff --git a/kernels/volk/volk_32fc_32f_add_32fc.h b/kernels/volk/volk_32fc_32f_add_32fc.h index 86a3818..b25ca6a 100644 --- a/kernels/volk/volk_32fc_32f_add_32fc.h +++ b/kernels/volk/volk_32fc_32f_add_32fc.h @@ -31,8 +31,8 @@ * * Dispatcher Prototype * \code - * void volk_32fc_32f_add_32fc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const float* bVector, unsigned int num_points) - * \endcode + * void volk_32fc_32f_add_32fc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const float* + * bVector, unsigned int num_points) \endcode * * \b Inputs * \li aVector: First vector of input points. 
@@ -44,7 +44,8 @@ * * \b Example * - * The follow example adds the increasing and decreasing vectors such that the result of every summation pair is 10 + * The follow example adds the increasing and decreasing vectors such that the result of + * every summation pair is 10 * * \code * int N = 10; @@ -75,18 +76,19 @@ #ifdef LV_HAVE_GENERIC -static inline void -volk_32fc_32f_add_32fc_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, - const float* bVector, unsigned int num_points) +static inline void volk_32fc_32f_add_32fc_generic(lv_32fc_t* cVector, + const lv_32fc_t* aVector, + const float* bVector, + unsigned int num_points) { - lv_32fc_t* cPtr = cVector; - const lv_32fc_t* aPtr = aVector; - const float* bPtr= bVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - *cPtr++ = (*aPtr++) + (*bPtr++); - } + lv_32fc_t* cPtr = cVector; + const lv_32fc_t* aPtr = aVector; + const float* bPtr = bVector; + unsigned int number = 0; + + for (number = 0; number < num_points; number++) { + *cPtr++ = (*aPtr++) + (*bPtr++); + } } #endif /* LV_HAVE_GENERIC */ @@ -94,143 +96,150 @@ volk_32fc_32f_add_32fc_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, #ifdef LV_HAVE_AVX #include -static inline void -volk_32fc_32f_add_32fc_u_avx(lv_32fc_t* cVector, const lv_32fc_t* aVector, - const float* bVector, unsigned int num_points) +static inline void volk_32fc_32f_add_32fc_u_avx(lv_32fc_t* cVector, + const lv_32fc_t* aVector, + const float* bVector, + unsigned int num_points) { - unsigned int number = 0; - const unsigned int eighthPoints = num_points / 8; - - lv_32fc_t* cPtr = cVector; - const lv_32fc_t* aPtr = aVector; - const float* bPtr= bVector; - - __m256 aVal1, aVal2, bVal, cVal1, cVal2; - __m256 cpx_b1, cpx_b2; - __m256 zero; - zero = _mm256_setzero_ps(); - __m256 tmp1, tmp2; - for(;number < eighthPoints; number++){ - - aVal1 = _mm256_loadu_ps((float *) aPtr); - aVal2 = _mm256_loadu_ps((float *) (aPtr+4)); - bVal = _mm256_loadu_ps(bPtr); - cpx_b1 = _mm256_unpacklo_ps(bVal, zero); // b0, 0, b1, 0, b4, 0, b5, 0 - cpx_b2 = _mm256_unpackhi_ps(bVal, zero); // b2, 0, b3, 0, b6, 0, b7, 0 - - tmp1 = _mm256_permute2f128_ps(cpx_b1, cpx_b2, 0x0+(0x2<<4)); - tmp2 = _mm256_permute2f128_ps(cpx_b1, cpx_b2, 0x1+(0x3<<4)); - - cVal1 = _mm256_add_ps(aVal1, tmp1); - cVal2 = _mm256_add_ps(aVal2, tmp2); - - _mm256_storeu_ps((float *) cPtr, cVal1); // Store the results back into the C container - _mm256_storeu_ps((float *) (cPtr+4), cVal2); // Store the results back into the C container - - aPtr += 8; - bPtr += 8; - cPtr += 8; - } - - number = eighthPoints * 8; - for(;number < num_points; number++){ - *cPtr++ = (*aPtr++) + (*bPtr++); - } + unsigned int number = 0; + const unsigned int eighthPoints = num_points / 8; + + lv_32fc_t* cPtr = cVector; + const lv_32fc_t* aPtr = aVector; + const float* bPtr = bVector; + + __m256 aVal1, aVal2, bVal, cVal1, cVal2; + __m256 cpx_b1, cpx_b2; + __m256 zero; + zero = _mm256_setzero_ps(); + __m256 tmp1, tmp2; + for (; number < eighthPoints; number++) { + + aVal1 = _mm256_loadu_ps((float*)aPtr); + aVal2 = _mm256_loadu_ps((float*)(aPtr + 4)); + bVal = _mm256_loadu_ps(bPtr); + cpx_b1 = _mm256_unpacklo_ps(bVal, zero); // b0, 0, b1, 0, b4, 0, b5, 0 + cpx_b2 = _mm256_unpackhi_ps(bVal, zero); // b2, 0, b3, 0, b6, 0, b7, 0 + + tmp1 = _mm256_permute2f128_ps(cpx_b1, cpx_b2, 0x0 + (0x2 << 4)); + tmp2 = _mm256_permute2f128_ps(cpx_b1, cpx_b2, 0x1 + (0x3 << 4)); + + cVal1 = _mm256_add_ps(aVal1, tmp1); + cVal2 = _mm256_add_ps(aVal2, tmp2); + + 
_mm256_storeu_ps((float*)cPtr, + cVal1); // Store the results back into the C container + _mm256_storeu_ps((float*)(cPtr + 4), + cVal2); // Store the results back into the C container + + aPtr += 8; + bPtr += 8; + cPtr += 8; + } + + number = eighthPoints * 8; + for (; number < num_points; number++) { + *cPtr++ = (*aPtr++) + (*bPtr++); + } } #endif /* LV_HAVE_AVX */ #ifdef LV_HAVE_AVX #include -static inline void -volk_32fc_32f_add_32fc_a_avx(lv_32fc_t* cVector, const lv_32fc_t* aVector, - const float* bVector, unsigned int num_points) +static inline void volk_32fc_32f_add_32fc_a_avx(lv_32fc_t* cVector, + const lv_32fc_t* aVector, + const float* bVector, + unsigned int num_points) { - unsigned int number = 0; - const unsigned int eighthPoints = num_points / 8; - - lv_32fc_t* cPtr = cVector; - const lv_32fc_t* aPtr = aVector; - const float* bPtr= bVector; - - __m256 aVal1, aVal2, bVal, cVal1, cVal2; - __m256 cpx_b1, cpx_b2; - __m256 zero; - zero = _mm256_setzero_ps(); - __m256 tmp1, tmp2; - for(;number < eighthPoints; number++){ - - aVal1 = _mm256_load_ps((float *) aPtr); - aVal2 = _mm256_load_ps((float *) (aPtr+4)); - bVal = _mm256_load_ps(bPtr); - cpx_b1 = _mm256_unpacklo_ps(bVal, zero); // b0, 0, b1, 0, b4, 0, b5, 0 - cpx_b2 = _mm256_unpackhi_ps(bVal, zero); // b2, 0, b3, 0, b6, 0, b7, 0 - - tmp1 = _mm256_permute2f128_ps(cpx_b1, cpx_b2, 0x0+(0x2<<4)); - tmp2 = _mm256_permute2f128_ps(cpx_b1, cpx_b2, 0x1+(0x3<<4)); - - cVal1 = _mm256_add_ps(aVal1, tmp1); - cVal2 = _mm256_add_ps(aVal2, tmp2); - - _mm256_store_ps((float *) cPtr, cVal1); // Store the results back into the C container - _mm256_store_ps((float *) (cPtr+4), cVal2); // Store the results back into the C container - - aPtr += 8; - bPtr += 8; - cPtr += 8; - } - - number = eighthPoints * 8; - for(;number < num_points; number++){ - *cPtr++ = (*aPtr++) + (*bPtr++); - } + unsigned int number = 0; + const unsigned int eighthPoints = num_points / 8; + + lv_32fc_t* cPtr = cVector; + const lv_32fc_t* aPtr = aVector; + const float* bPtr = bVector; + + __m256 aVal1, aVal2, bVal, cVal1, cVal2; + __m256 cpx_b1, cpx_b2; + __m256 zero; + zero = _mm256_setzero_ps(); + __m256 tmp1, tmp2; + for (; number < eighthPoints; number++) { + + aVal1 = _mm256_load_ps((float*)aPtr); + aVal2 = _mm256_load_ps((float*)(aPtr + 4)); + bVal = _mm256_load_ps(bPtr); + cpx_b1 = _mm256_unpacklo_ps(bVal, zero); // b0, 0, b1, 0, b4, 0, b5, 0 + cpx_b2 = _mm256_unpackhi_ps(bVal, zero); // b2, 0, b3, 0, b6, 0, b7, 0 + + tmp1 = _mm256_permute2f128_ps(cpx_b1, cpx_b2, 0x0 + (0x2 << 4)); + tmp2 = _mm256_permute2f128_ps(cpx_b1, cpx_b2, 0x1 + (0x3 << 4)); + + cVal1 = _mm256_add_ps(aVal1, tmp1); + cVal2 = _mm256_add_ps(aVal2, tmp2); + + _mm256_store_ps((float*)cPtr, + cVal1); // Store the results back into the C container + _mm256_store_ps((float*)(cPtr + 4), + cVal2); // Store the results back into the C container + + aPtr += 8; + bPtr += 8; + cPtr += 8; + } + + number = eighthPoints * 8; + for (; number < num_points; number++) { + *cPtr++ = (*aPtr++) + (*bPtr++); + } } #endif /* LV_HAVE_AVX */ #ifdef LV_HAVE_NEON #include -static inline void -volk_32fc_32f_add_32fc_neon(lv_32fc_t* cVector, const lv_32fc_t* aVector, - const float* bVector, unsigned int num_points) +static inline void volk_32fc_32f_add_32fc_neon(lv_32fc_t* cVector, + const lv_32fc_t* aVector, + const float* bVector, + unsigned int num_points) { - lv_32fc_t* cPtr = cVector; - const lv_32fc_t* aPtr = aVector; - const float* bPtr = bVector; - - float32x4x4_t aVal0, aVal1; - float32x4x2_t bVal0, bVal1; - - const 
unsigned int sixteenthPoints = num_points / 16; - unsigned int number = 0; - for(; number < sixteenthPoints; number++){ - aVal0 = vld4q_f32((const float*)aPtr); - aPtr += 8; - aVal1 = vld4q_f32((const float*)aPtr); - aPtr += 8; - __VOLK_PREFETCH(aPtr+16); - - bVal0 = vld2q_f32((const float*)bPtr); - bPtr += 8; - bVal1 = vld2q_f32((const float*)bPtr); - bPtr += 8; - __VOLK_PREFETCH(bPtr+16); - - aVal0.val[0] = vaddq_f32(aVal0.val[0], bVal0.val[0]); - aVal0.val[2] = vaddq_f32(aVal0.val[2], bVal0.val[1]); - - aVal1.val[2] = vaddq_f32(aVal1.val[2], bVal1.val[1]); - aVal1.val[0] = vaddq_f32(aVal1.val[0], bVal1.val[0]); - - vst4q_f32((float*)(cPtr), aVal0); - cPtr += 8; - vst4q_f32((float*)(cPtr), aVal1); - cPtr += 8; - } - - for(number = sixteenthPoints * 16; number < num_points; number++){ - *cPtr++ = (*aPtr++) + (*bPtr++); - } + lv_32fc_t* cPtr = cVector; + const lv_32fc_t* aPtr = aVector; + const float* bPtr = bVector; + + float32x4x4_t aVal0, aVal1; + float32x4x2_t bVal0, bVal1; + + const unsigned int sixteenthPoints = num_points / 16; + unsigned int number = 0; + for (; number < sixteenthPoints; number++) { + aVal0 = vld4q_f32((const float*)aPtr); + aPtr += 8; + aVal1 = vld4q_f32((const float*)aPtr); + aPtr += 8; + __VOLK_PREFETCH(aPtr + 16); + + bVal0 = vld2q_f32((const float*)bPtr); + bPtr += 8; + bVal1 = vld2q_f32((const float*)bPtr); + bPtr += 8; + __VOLK_PREFETCH(bPtr + 16); + + aVal0.val[0] = vaddq_f32(aVal0.val[0], bVal0.val[0]); + aVal0.val[2] = vaddq_f32(aVal0.val[2], bVal0.val[1]); + + aVal1.val[2] = vaddq_f32(aVal1.val[2], bVal1.val[1]); + aVal1.val[0] = vaddq_f32(aVal1.val[0], bVal1.val[0]); + + vst4q_f32((float*)(cPtr), aVal0); + cPtr += 8; + vst4q_f32((float*)(cPtr), aVal1); + cPtr += 8; + } + + for (number = sixteenthPoints * 16; number < num_points; number++) { + *cPtr++ = (*aPtr++) + (*bPtr++); + } } #endif /* LV_HAVE_NEON */ diff --git a/kernels/volk/volk_32fc_32f_dot_prod_32fc.h b/kernels/volk/volk_32fc_32f_dot_prod_32fc.h index 35f7077..d905870 100644 --- a/kernels/volk/volk_32fc_32f_dot_prod_32fc.h +++ b/kernels/volk/volk_32fc_32f_dot_prod_32fc.h @@ -33,8 +33,8 @@ * * Dispatcher Prototype * \code - * void volk_32fc_32f_dot_prod_32fc(lv_32fc_t* result, const lv_32fc_t* input, const float * taps, unsigned int num_points) - * \endcode + * void volk_32fc_32f_dot_prod_32fc(lv_32fc_t* result, const lv_32fc_t* input, const float + * * taps, unsigned int num_points) \endcode * * \b Inputs * \li input: vector of complex samples @@ -63,28 +63,32 @@ #ifndef INCLUDED_volk_32fc_32f_dot_prod_32fc_a_H #define INCLUDED_volk_32fc_32f_dot_prod_32fc_a_H -#include #include +#include #ifdef LV_HAVE_GENERIC -static inline void volk_32fc_32f_dot_prod_32fc_generic(lv_32fc_t* result, const lv_32fc_t* input, const float * taps, unsigned int num_points) { +static inline void volk_32fc_32f_dot_prod_32fc_generic(lv_32fc_t* result, + const lv_32fc_t* input, + const float* taps, + unsigned int num_points) +{ - float res[2]; - float *realpt = &res[0], *imagpt = &res[1]; - const float* aPtr = (float*)input; - const float* bPtr= taps; - unsigned int number = 0; + float res[2]; + float *realpt = &res[0], *imagpt = &res[1]; + const float* aPtr = (float*)input; + const float* bPtr = taps; + unsigned int number = 0; - *realpt = 0; - *imagpt = 0; + *realpt = 0; + *imagpt = 0; - for(number = 0; number < num_points; number++){ - *realpt += ((*aPtr++) * (*bPtr)); - *imagpt += ((*aPtr++) * (*bPtr++)); - } + for (number = 0; number < num_points; number++) { + *realpt += ((*aPtr++) * (*bPtr)); + *imagpt += 
((*aPtr++) * (*bPtr++)); + } - *result = *(lv_32fc_t*)(&res[0]); + *result = *(lv_32fc_t*)(&res[0]); } #endif /*LV_HAVE_GENERIC*/ @@ -93,78 +97,83 @@ static inline void volk_32fc_32f_dot_prod_32fc_generic(lv_32fc_t* result, const #include -static inline void volk_32fc_32f_dot_prod_32fc_a_avx2_fma( lv_32fc_t* result, const lv_32fc_t* input, const float* taps, unsigned int num_points) { - - unsigned int number = 0; - const unsigned int sixteenthPoints = num_points / 16; - - float res[2]; - float *realpt = &res[0], *imagpt = &res[1]; - const float* aPtr = (float*)input; - const float* bPtr = taps; - - __m256 a0Val, a1Val, a2Val, a3Val; - __m256 b0Val, b1Val, b2Val, b3Val; - __m256 x0Val, x1Val, x0loVal, x0hiVal, x1loVal, x1hiVal; - - __m256 dotProdVal0 = _mm256_setzero_ps(); - __m256 dotProdVal1 = _mm256_setzero_ps(); - __m256 dotProdVal2 = _mm256_setzero_ps(); - __m256 dotProdVal3 = _mm256_setzero_ps(); - - for(;number < sixteenthPoints; number++){ - - a0Val = _mm256_load_ps(aPtr); - a1Val = _mm256_load_ps(aPtr+8); - a2Val = _mm256_load_ps(aPtr+16); - a3Val = _mm256_load_ps(aPtr+24); - - x0Val = _mm256_load_ps(bPtr); // t0|t1|t2|t3|t4|t5|t6|t7 - x1Val = _mm256_load_ps(bPtr+8); - x0loVal = _mm256_unpacklo_ps(x0Val, x0Val); // t0|t0|t1|t1|t4|t4|t5|t5 - x0hiVal = _mm256_unpackhi_ps(x0Val, x0Val); // t2|t2|t3|t3|t6|t6|t7|t7 - x1loVal = _mm256_unpacklo_ps(x1Val, x1Val); - x1hiVal = _mm256_unpackhi_ps(x1Val, x1Val); - - // TODO: it may be possible to rearrange swizzling to better pipeline data - b0Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x20); // t0|t0|t1|t1|t2|t2|t3|t3 - b1Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x31); // t4|t4|t5|t5|t6|t6|t7|t7 - b2Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x20); - b3Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x31); - - dotProdVal0 = _mm256_fmadd_ps(a0Val, b0Val, dotProdVal0); - dotProdVal1 = _mm256_fmadd_ps(a1Val, b1Val, dotProdVal1); - dotProdVal2 = _mm256_fmadd_ps(a2Val, b2Val, dotProdVal2); - dotProdVal3 = _mm256_fmadd_ps(a3Val, b3Val, dotProdVal3); - - aPtr += 32; - bPtr += 16; - } - - dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1); - dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2); - dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3); - - __VOLK_ATTR_ALIGNED(32) float dotProductVector[8]; - - _mm256_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector - - *realpt = dotProductVector[0]; - *imagpt = dotProductVector[1]; - *realpt += dotProductVector[2]; - *imagpt += dotProductVector[3]; - *realpt += dotProductVector[4]; - *imagpt += dotProductVector[5]; - *realpt += dotProductVector[6]; - *imagpt += dotProductVector[7]; - - number = sixteenthPoints*16; - for(;number < num_points; number++){ - *realpt += ((*aPtr++) * (*bPtr)); - *imagpt += ((*aPtr++) * (*bPtr++)); - } - - *result = *(lv_32fc_t*)(&res[0]); +static inline void volk_32fc_32f_dot_prod_32fc_a_avx2_fma(lv_32fc_t* result, + const lv_32fc_t* input, + const float* taps, + unsigned int num_points) +{ + + unsigned int number = 0; + const unsigned int sixteenthPoints = num_points / 16; + + float res[2]; + float *realpt = &res[0], *imagpt = &res[1]; + const float* aPtr = (float*)input; + const float* bPtr = taps; + + __m256 a0Val, a1Val, a2Val, a3Val; + __m256 b0Val, b1Val, b2Val, b3Val; + __m256 x0Val, x1Val, x0loVal, x0hiVal, x1loVal, x1hiVal; + + __m256 dotProdVal0 = _mm256_setzero_ps(); + __m256 dotProdVal1 = _mm256_setzero_ps(); + __m256 dotProdVal2 = _mm256_setzero_ps(); + __m256 dotProdVal3 = _mm256_setzero_ps(); + + 
for (; number < sixteenthPoints; number++) { + + a0Val = _mm256_load_ps(aPtr); + a1Val = _mm256_load_ps(aPtr + 8); + a2Val = _mm256_load_ps(aPtr + 16); + a3Val = _mm256_load_ps(aPtr + 24); + + x0Val = _mm256_load_ps(bPtr); // t0|t1|t2|t3|t4|t5|t6|t7 + x1Val = _mm256_load_ps(bPtr + 8); + x0loVal = _mm256_unpacklo_ps(x0Val, x0Val); // t0|t0|t1|t1|t4|t4|t5|t5 + x0hiVal = _mm256_unpackhi_ps(x0Val, x0Val); // t2|t2|t3|t3|t6|t6|t7|t7 + x1loVal = _mm256_unpacklo_ps(x1Val, x1Val); + x1hiVal = _mm256_unpackhi_ps(x1Val, x1Val); + + // TODO: it may be possible to rearrange swizzling to better pipeline data + b0Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x20); // t0|t0|t1|t1|t2|t2|t3|t3 + b1Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x31); // t4|t4|t5|t5|t6|t6|t7|t7 + b2Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x20); + b3Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x31); + + dotProdVal0 = _mm256_fmadd_ps(a0Val, b0Val, dotProdVal0); + dotProdVal1 = _mm256_fmadd_ps(a1Val, b1Val, dotProdVal1); + dotProdVal2 = _mm256_fmadd_ps(a2Val, b2Val, dotProdVal2); + dotProdVal3 = _mm256_fmadd_ps(a3Val, b3Val, dotProdVal3); + + aPtr += 32; + bPtr += 16; + } + + dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1); + dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2); + dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3); + + __VOLK_ATTR_ALIGNED(32) float dotProductVector[8]; + + _mm256_store_ps(dotProductVector, + dotProdVal0); // Store the results back into the dot product vector + + *realpt = dotProductVector[0]; + *imagpt = dotProductVector[1]; + *realpt += dotProductVector[2]; + *imagpt += dotProductVector[3]; + *realpt += dotProductVector[4]; + *imagpt += dotProductVector[5]; + *realpt += dotProductVector[6]; + *imagpt += dotProductVector[7]; + + number = sixteenthPoints * 16; + for (; number < num_points; number++) { + *realpt += ((*aPtr++) * (*bPtr)); + *imagpt += ((*aPtr++) * (*bPtr++)); + } + + *result = *(lv_32fc_t*)(&res[0]); } #endif /*LV_HAVE_AVX2 && LV_HAVE_FMA*/ @@ -173,164 +182,172 @@ static inline void volk_32fc_32f_dot_prod_32fc_a_avx2_fma( lv_32fc_t* result, co #include -static inline void volk_32fc_32f_dot_prod_32fc_a_avx( lv_32fc_t* result, const lv_32fc_t* input, const float* taps, unsigned int num_points) { - - unsigned int number = 0; - const unsigned int sixteenthPoints = num_points / 16; - - float res[2]; - float *realpt = &res[0], *imagpt = &res[1]; - const float* aPtr = (float*)input; - const float* bPtr = taps; - - __m256 a0Val, a1Val, a2Val, a3Val; - __m256 b0Val, b1Val, b2Val, b3Val; - __m256 x0Val, x1Val, x0loVal, x0hiVal, x1loVal, x1hiVal; - __m256 c0Val, c1Val, c2Val, c3Val; - - __m256 dotProdVal0 = _mm256_setzero_ps(); - __m256 dotProdVal1 = _mm256_setzero_ps(); - __m256 dotProdVal2 = _mm256_setzero_ps(); - __m256 dotProdVal3 = _mm256_setzero_ps(); - - for(;number < sixteenthPoints; number++){ - - a0Val = _mm256_load_ps(aPtr); - a1Val = _mm256_load_ps(aPtr+8); - a2Val = _mm256_load_ps(aPtr+16); - a3Val = _mm256_load_ps(aPtr+24); - - x0Val = _mm256_load_ps(bPtr); // t0|t1|t2|t3|t4|t5|t6|t7 - x1Val = _mm256_load_ps(bPtr+8); - x0loVal = _mm256_unpacklo_ps(x0Val, x0Val); // t0|t0|t1|t1|t4|t4|t5|t5 - x0hiVal = _mm256_unpackhi_ps(x0Val, x0Val); // t2|t2|t3|t3|t6|t6|t7|t7 - x1loVal = _mm256_unpacklo_ps(x1Val, x1Val); - x1hiVal = _mm256_unpackhi_ps(x1Val, x1Val); - - // TODO: it may be possible to rearrange swizzling to better pipeline data - b0Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x20); // t0|t0|t1|t1|t2|t2|t3|t3 - b1Val = _mm256_permute2f128_ps(x0loVal, 
x0hiVal, 0x31); // t4|t4|t5|t5|t6|t6|t7|t7 - b2Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x20); - b3Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x31); - - c0Val = _mm256_mul_ps(a0Val, b0Val); - c1Val = _mm256_mul_ps(a1Val, b1Val); - c2Val = _mm256_mul_ps(a2Val, b2Val); - c3Val = _mm256_mul_ps(a3Val, b3Val); - - dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0); - dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1); - dotProdVal2 = _mm256_add_ps(c2Val, dotProdVal2); - dotProdVal3 = _mm256_add_ps(c3Val, dotProdVal3); - - aPtr += 32; - bPtr += 16; - } - - dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1); - dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2); - dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3); - - __VOLK_ATTR_ALIGNED(32) float dotProductVector[8]; - - _mm256_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector - - *realpt = dotProductVector[0]; - *imagpt = dotProductVector[1]; - *realpt += dotProductVector[2]; - *imagpt += dotProductVector[3]; - *realpt += dotProductVector[4]; - *imagpt += dotProductVector[5]; - *realpt += dotProductVector[6]; - *imagpt += dotProductVector[7]; - - number = sixteenthPoints*16; - for(;number < num_points; number++){ - *realpt += ((*aPtr++) * (*bPtr)); - *imagpt += ((*aPtr++) * (*bPtr++)); - } - - *result = *(lv_32fc_t*)(&res[0]); +static inline void volk_32fc_32f_dot_prod_32fc_a_avx(lv_32fc_t* result, + const lv_32fc_t* input, + const float* taps, + unsigned int num_points) +{ + + unsigned int number = 0; + const unsigned int sixteenthPoints = num_points / 16; + + float res[2]; + float *realpt = &res[0], *imagpt = &res[1]; + const float* aPtr = (float*)input; + const float* bPtr = taps; + + __m256 a0Val, a1Val, a2Val, a3Val; + __m256 b0Val, b1Val, b2Val, b3Val; + __m256 x0Val, x1Val, x0loVal, x0hiVal, x1loVal, x1hiVal; + __m256 c0Val, c1Val, c2Val, c3Val; + + __m256 dotProdVal0 = _mm256_setzero_ps(); + __m256 dotProdVal1 = _mm256_setzero_ps(); + __m256 dotProdVal2 = _mm256_setzero_ps(); + __m256 dotProdVal3 = _mm256_setzero_ps(); + + for (; number < sixteenthPoints; number++) { + + a0Val = _mm256_load_ps(aPtr); + a1Val = _mm256_load_ps(aPtr + 8); + a2Val = _mm256_load_ps(aPtr + 16); + a3Val = _mm256_load_ps(aPtr + 24); + + x0Val = _mm256_load_ps(bPtr); // t0|t1|t2|t3|t4|t5|t6|t7 + x1Val = _mm256_load_ps(bPtr + 8); + x0loVal = _mm256_unpacklo_ps(x0Val, x0Val); // t0|t0|t1|t1|t4|t4|t5|t5 + x0hiVal = _mm256_unpackhi_ps(x0Val, x0Val); // t2|t2|t3|t3|t6|t6|t7|t7 + x1loVal = _mm256_unpacklo_ps(x1Val, x1Val); + x1hiVal = _mm256_unpackhi_ps(x1Val, x1Val); + + // TODO: it may be possible to rearrange swizzling to better pipeline data + b0Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x20); // t0|t0|t1|t1|t2|t2|t3|t3 + b1Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x31); // t4|t4|t5|t5|t6|t6|t7|t7 + b2Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x20); + b3Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x31); + + c0Val = _mm256_mul_ps(a0Val, b0Val); + c1Val = _mm256_mul_ps(a1Val, b1Val); + c2Val = _mm256_mul_ps(a2Val, b2Val); + c3Val = _mm256_mul_ps(a3Val, b3Val); + + dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0); + dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1); + dotProdVal2 = _mm256_add_ps(c2Val, dotProdVal2); + dotProdVal3 = _mm256_add_ps(c3Val, dotProdVal3); + + aPtr += 32; + bPtr += 16; + } + + dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1); + dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2); + dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3); + + 
__VOLK_ATTR_ALIGNED(32) float dotProductVector[8]; + + _mm256_store_ps(dotProductVector, + dotProdVal0); // Store the results back into the dot product vector + + *realpt = dotProductVector[0]; + *imagpt = dotProductVector[1]; + *realpt += dotProductVector[2]; + *imagpt += dotProductVector[3]; + *realpt += dotProductVector[4]; + *imagpt += dotProductVector[5]; + *realpt += dotProductVector[6]; + *imagpt += dotProductVector[7]; + + number = sixteenthPoints * 16; + for (; number < num_points; number++) { + *realpt += ((*aPtr++) * (*bPtr)); + *imagpt += ((*aPtr++) * (*bPtr++)); + } + + *result = *(lv_32fc_t*)(&res[0]); } #endif /*LV_HAVE_AVX*/ - - #ifdef LV_HAVE_SSE -static inline void volk_32fc_32f_dot_prod_32fc_a_sse( lv_32fc_t* result, const lv_32fc_t* input, const float* taps, unsigned int num_points) { - - unsigned int number = 0; - const unsigned int sixteenthPoints = num_points / 8; - - float res[2]; - float *realpt = &res[0], *imagpt = &res[1]; - const float* aPtr = (float*)input; - const float* bPtr = taps; - - __m128 a0Val, a1Val, a2Val, a3Val; - __m128 b0Val, b1Val, b2Val, b3Val; - __m128 x0Val, x1Val, x2Val, x3Val; - __m128 c0Val, c1Val, c2Val, c3Val; - - __m128 dotProdVal0 = _mm_setzero_ps(); - __m128 dotProdVal1 = _mm_setzero_ps(); - __m128 dotProdVal2 = _mm_setzero_ps(); - __m128 dotProdVal3 = _mm_setzero_ps(); - - for(;number < sixteenthPoints; number++){ - - a0Val = _mm_load_ps(aPtr); - a1Val = _mm_load_ps(aPtr+4); - a2Val = _mm_load_ps(aPtr+8); - a3Val = _mm_load_ps(aPtr+12); - - x0Val = _mm_load_ps(bPtr); - x1Val = _mm_load_ps(bPtr); - x2Val = _mm_load_ps(bPtr+4); - x3Val = _mm_load_ps(bPtr+4); - b0Val = _mm_unpacklo_ps(x0Val, x1Val); - b1Val = _mm_unpackhi_ps(x0Val, x1Val); - b2Val = _mm_unpacklo_ps(x2Val, x3Val); - b3Val = _mm_unpackhi_ps(x2Val, x3Val); - - c0Val = _mm_mul_ps(a0Val, b0Val); - c1Val = _mm_mul_ps(a1Val, b1Val); - c2Val = _mm_mul_ps(a2Val, b2Val); - c3Val = _mm_mul_ps(a3Val, b3Val); - - dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0); - dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1); - dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2); - dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3); - - aPtr += 16; - bPtr += 8; - } - - dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1); - dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2); - dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3); - - __VOLK_ATTR_ALIGNED(16) float dotProductVector[4]; - - _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector - - *realpt = dotProductVector[0]; - *imagpt = dotProductVector[1]; - *realpt += dotProductVector[2]; - *imagpt += dotProductVector[3]; - - number = sixteenthPoints*8; - for(;number < num_points; number++){ - *realpt += ((*aPtr++) * (*bPtr)); - *imagpt += ((*aPtr++) * (*bPtr++)); - } - - *result = *(lv_32fc_t*)(&res[0]); +static inline void volk_32fc_32f_dot_prod_32fc_a_sse(lv_32fc_t* result, + const lv_32fc_t* input, + const float* taps, + unsigned int num_points) +{ + + unsigned int number = 0; + const unsigned int sixteenthPoints = num_points / 8; + + float res[2]; + float *realpt = &res[0], *imagpt = &res[1]; + const float* aPtr = (float*)input; + const float* bPtr = taps; + + __m128 a0Val, a1Val, a2Val, a3Val; + __m128 b0Val, b1Val, b2Val, b3Val; + __m128 x0Val, x1Val, x2Val, x3Val; + __m128 c0Val, c1Val, c2Val, c3Val; + + __m128 dotProdVal0 = _mm_setzero_ps(); + __m128 dotProdVal1 = _mm_setzero_ps(); + __m128 dotProdVal2 = _mm_setzero_ps(); + __m128 dotProdVal3 = _mm_setzero_ps(); + + for (; number < sixteenthPoints; number++) { + + 
a0Val = _mm_load_ps(aPtr); + a1Val = _mm_load_ps(aPtr + 4); + a2Val = _mm_load_ps(aPtr + 8); + a3Val = _mm_load_ps(aPtr + 12); + + x0Val = _mm_load_ps(bPtr); + x1Val = _mm_load_ps(bPtr); + x2Val = _mm_load_ps(bPtr + 4); + x3Val = _mm_load_ps(bPtr + 4); + b0Val = _mm_unpacklo_ps(x0Val, x1Val); + b1Val = _mm_unpackhi_ps(x0Val, x1Val); + b2Val = _mm_unpacklo_ps(x2Val, x3Val); + b3Val = _mm_unpackhi_ps(x2Val, x3Val); + + c0Val = _mm_mul_ps(a0Val, b0Val); + c1Val = _mm_mul_ps(a1Val, b1Val); + c2Val = _mm_mul_ps(a2Val, b2Val); + c3Val = _mm_mul_ps(a3Val, b3Val); + + dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0); + dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1); + dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2); + dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3); + + aPtr += 16; + bPtr += 8; + } + + dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1); + dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2); + dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3); + + __VOLK_ATTR_ALIGNED(16) float dotProductVector[4]; + + _mm_store_ps(dotProductVector, + dotProdVal0); // Store the results back into the dot product vector + + *realpt = dotProductVector[0]; + *imagpt = dotProductVector[1]; + *realpt += dotProductVector[2]; + *imagpt += dotProductVector[3]; + + number = sixteenthPoints * 8; + for (; number < num_points; number++) { + *realpt += ((*aPtr++) * (*bPtr)); + *imagpt += ((*aPtr++) * (*bPtr++)); + } + + *result = *(lv_32fc_t*)(&res[0]); } #endif /*LV_HAVE_SSE*/ @@ -339,78 +356,83 @@ static inline void volk_32fc_32f_dot_prod_32fc_a_sse( lv_32fc_t* result, const #include -static inline void volk_32fc_32f_dot_prod_32fc_u_avx2_fma( lv_32fc_t* result, const lv_32fc_t* input, const float* taps, unsigned int num_points) { - - unsigned int number = 0; - const unsigned int sixteenthPoints = num_points / 16; - - float res[2]; - float *realpt = &res[0], *imagpt = &res[1]; - const float* aPtr = (float*)input; - const float* bPtr = taps; - - __m256 a0Val, a1Val, a2Val, a3Val; - __m256 b0Val, b1Val, b2Val, b3Val; - __m256 x0Val, x1Val, x0loVal, x0hiVal, x1loVal, x1hiVal; - - __m256 dotProdVal0 = _mm256_setzero_ps(); - __m256 dotProdVal1 = _mm256_setzero_ps(); - __m256 dotProdVal2 = _mm256_setzero_ps(); - __m256 dotProdVal3 = _mm256_setzero_ps(); - - for(;number < sixteenthPoints; number++){ - - a0Val = _mm256_loadu_ps(aPtr); - a1Val = _mm256_loadu_ps(aPtr+8); - a2Val = _mm256_loadu_ps(aPtr+16); - a3Val = _mm256_loadu_ps(aPtr+24); - - x0Val = _mm256_load_ps(bPtr); // t0|t1|t2|t3|t4|t5|t6|t7 - x1Val = _mm256_load_ps(bPtr+8); - x0loVal = _mm256_unpacklo_ps(x0Val, x0Val); // t0|t0|t1|t1|t4|t4|t5|t5 - x0hiVal = _mm256_unpackhi_ps(x0Val, x0Val); // t2|t2|t3|t3|t6|t6|t7|t7 - x1loVal = _mm256_unpacklo_ps(x1Val, x1Val); - x1hiVal = _mm256_unpackhi_ps(x1Val, x1Val); - - // TODO: it may be possible to rearrange swizzling to better pipeline data - b0Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x20); // t0|t0|t1|t1|t2|t2|t3|t3 - b1Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x31); // t4|t4|t5|t5|t6|t6|t7|t7 - b2Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x20); - b3Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x31); - - dotProdVal0 = _mm256_fmadd_ps(a0Val, b0Val, dotProdVal0); - dotProdVal1 = _mm256_fmadd_ps(a1Val, b1Val, dotProdVal1); - dotProdVal2 = _mm256_fmadd_ps(a2Val, b2Val, dotProdVal2); - dotProdVal3 = _mm256_fmadd_ps(a3Val, b3Val, dotProdVal3); - - aPtr += 32; - bPtr += 16; - } - - dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1); - dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2); - dotProdVal0 = 
_mm256_add_ps(dotProdVal0, dotProdVal3); - - __VOLK_ATTR_ALIGNED(32) float dotProductVector[8]; - - _mm256_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector - - *realpt = dotProductVector[0]; - *imagpt = dotProductVector[1]; - *realpt += dotProductVector[2]; - *imagpt += dotProductVector[3]; - *realpt += dotProductVector[4]; - *imagpt += dotProductVector[5]; - *realpt += dotProductVector[6]; - *imagpt += dotProductVector[7]; - - number = sixteenthPoints*16; - for(;number < num_points; number++){ - *realpt += ((*aPtr++) * (*bPtr)); - *imagpt += ((*aPtr++) * (*bPtr++)); - } - - *result = *(lv_32fc_t*)(&res[0]); +static inline void volk_32fc_32f_dot_prod_32fc_u_avx2_fma(lv_32fc_t* result, + const lv_32fc_t* input, + const float* taps, + unsigned int num_points) +{ + + unsigned int number = 0; + const unsigned int sixteenthPoints = num_points / 16; + + float res[2]; + float *realpt = &res[0], *imagpt = &res[1]; + const float* aPtr = (float*)input; + const float* bPtr = taps; + + __m256 a0Val, a1Val, a2Val, a3Val; + __m256 b0Val, b1Val, b2Val, b3Val; + __m256 x0Val, x1Val, x0loVal, x0hiVal, x1loVal, x1hiVal; + + __m256 dotProdVal0 = _mm256_setzero_ps(); + __m256 dotProdVal1 = _mm256_setzero_ps(); + __m256 dotProdVal2 = _mm256_setzero_ps(); + __m256 dotProdVal3 = _mm256_setzero_ps(); + + for (; number < sixteenthPoints; number++) { + + a0Val = _mm256_loadu_ps(aPtr); + a1Val = _mm256_loadu_ps(aPtr + 8); + a2Val = _mm256_loadu_ps(aPtr + 16); + a3Val = _mm256_loadu_ps(aPtr + 24); + + x0Val = _mm256_load_ps(bPtr); // t0|t1|t2|t3|t4|t5|t6|t7 + x1Val = _mm256_load_ps(bPtr + 8); + x0loVal = _mm256_unpacklo_ps(x0Val, x0Val); // t0|t0|t1|t1|t4|t4|t5|t5 + x0hiVal = _mm256_unpackhi_ps(x0Val, x0Val); // t2|t2|t3|t3|t6|t6|t7|t7 + x1loVal = _mm256_unpacklo_ps(x1Val, x1Val); + x1hiVal = _mm256_unpackhi_ps(x1Val, x1Val); + + // TODO: it may be possible to rearrange swizzling to better pipeline data + b0Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x20); // t0|t0|t1|t1|t2|t2|t3|t3 + b1Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x31); // t4|t4|t5|t5|t6|t6|t7|t7 + b2Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x20); + b3Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x31); + + dotProdVal0 = _mm256_fmadd_ps(a0Val, b0Val, dotProdVal0); + dotProdVal1 = _mm256_fmadd_ps(a1Val, b1Val, dotProdVal1); + dotProdVal2 = _mm256_fmadd_ps(a2Val, b2Val, dotProdVal2); + dotProdVal3 = _mm256_fmadd_ps(a3Val, b3Val, dotProdVal3); + + aPtr += 32; + bPtr += 16; + } + + dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1); + dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2); + dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3); + + __VOLK_ATTR_ALIGNED(32) float dotProductVector[8]; + + _mm256_store_ps(dotProductVector, + dotProdVal0); // Store the results back into the dot product vector + + *realpt = dotProductVector[0]; + *imagpt = dotProductVector[1]; + *realpt += dotProductVector[2]; + *imagpt += dotProductVector[3]; + *realpt += dotProductVector[4]; + *imagpt += dotProductVector[5]; + *realpt += dotProductVector[6]; + *imagpt += dotProductVector[7]; + + number = sixteenthPoints * 16; + for (; number < num_points; number++) { + *realpt += ((*aPtr++) * (*bPtr)); + *imagpt += ((*aPtr++) * (*bPtr++)); + } + + *result = *(lv_32fc_t*)(&res[0]); } #endif /*LV_HAVE_AVX2 && LV_HAVE_FMA*/ @@ -419,162 +441,172 @@ static inline void volk_32fc_32f_dot_prod_32fc_u_avx2_fma( lv_32fc_t* result, co #include -static inline void volk_32fc_32f_dot_prod_32fc_u_avx( lv_32fc_t* result, const 
lv_32fc_t* input, const float* taps, unsigned int num_points) { - - unsigned int number = 0; - const unsigned int sixteenthPoints = num_points / 16; - - float res[2]; - float *realpt = &res[0], *imagpt = &res[1]; - const float* aPtr = (float*)input; - const float* bPtr = taps; - - __m256 a0Val, a1Val, a2Val, a3Val; - __m256 b0Val, b1Val, b2Val, b3Val; - __m256 x0Val, x1Val, x0loVal, x0hiVal, x1loVal, x1hiVal; - __m256 c0Val, c1Val, c2Val, c3Val; - - __m256 dotProdVal0 = _mm256_setzero_ps(); - __m256 dotProdVal1 = _mm256_setzero_ps(); - __m256 dotProdVal2 = _mm256_setzero_ps(); - __m256 dotProdVal3 = _mm256_setzero_ps(); - - for(;number < sixteenthPoints; number++){ - - a0Val = _mm256_loadu_ps(aPtr); - a1Val = _mm256_loadu_ps(aPtr+8); - a2Val = _mm256_loadu_ps(aPtr+16); - a3Val = _mm256_loadu_ps(aPtr+24); - - x0Val = _mm256_loadu_ps(bPtr); // t0|t1|t2|t3|t4|t5|t6|t7 - x1Val = _mm256_loadu_ps(bPtr+8); - x0loVal = _mm256_unpacklo_ps(x0Val, x0Val); // t0|t0|t1|t1|t4|t4|t5|t5 - x0hiVal = _mm256_unpackhi_ps(x0Val, x0Val); // t2|t2|t3|t3|t6|t6|t7|t7 - x1loVal = _mm256_unpacklo_ps(x1Val, x1Val); - x1hiVal = _mm256_unpackhi_ps(x1Val, x1Val); - - // TODO: it may be possible to rearrange swizzling to better pipeline data - b0Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x20); // t0|t0|t1|t1|t2|t2|t3|t3 - b1Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x31); // t4|t4|t5|t5|t6|t6|t7|t7 - b2Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x20); - b3Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x31); - - c0Val = _mm256_mul_ps(a0Val, b0Val); - c1Val = _mm256_mul_ps(a1Val, b1Val); - c2Val = _mm256_mul_ps(a2Val, b2Val); - c3Val = _mm256_mul_ps(a3Val, b3Val); - - dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0); - dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1); - dotProdVal2 = _mm256_add_ps(c2Val, dotProdVal2); - dotProdVal3 = _mm256_add_ps(c3Val, dotProdVal3); - - aPtr += 32; - bPtr += 16; - } - - dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1); - dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2); - dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3); - - __VOLK_ATTR_ALIGNED(32) float dotProductVector[8]; - - _mm256_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector - - *realpt = dotProductVector[0]; - *imagpt = dotProductVector[1]; - *realpt += dotProductVector[2]; - *imagpt += dotProductVector[3]; - *realpt += dotProductVector[4]; - *imagpt += dotProductVector[5]; - *realpt += dotProductVector[6]; - *imagpt += dotProductVector[7]; - - number = sixteenthPoints*16; - for(;number < num_points; number++){ - *realpt += ((*aPtr++) * (*bPtr)); - *imagpt += ((*aPtr++) * (*bPtr++)); - } - - *result = *(lv_32fc_t*)(&res[0]); +static inline void volk_32fc_32f_dot_prod_32fc_u_avx(lv_32fc_t* result, + const lv_32fc_t* input, + const float* taps, + unsigned int num_points) +{ + + unsigned int number = 0; + const unsigned int sixteenthPoints = num_points / 16; + + float res[2]; + float *realpt = &res[0], *imagpt = &res[1]; + const float* aPtr = (float*)input; + const float* bPtr = taps; + + __m256 a0Val, a1Val, a2Val, a3Val; + __m256 b0Val, b1Val, b2Val, b3Val; + __m256 x0Val, x1Val, x0loVal, x0hiVal, x1loVal, x1hiVal; + __m256 c0Val, c1Val, c2Val, c3Val; + + __m256 dotProdVal0 = _mm256_setzero_ps(); + __m256 dotProdVal1 = _mm256_setzero_ps(); + __m256 dotProdVal2 = _mm256_setzero_ps(); + __m256 dotProdVal3 = _mm256_setzero_ps(); + + for (; number < sixteenthPoints; number++) { + + a0Val = _mm256_loadu_ps(aPtr); + a1Val = _mm256_loadu_ps(aPtr + 8); + a2Val = 
_mm256_loadu_ps(aPtr + 16); + a3Val = _mm256_loadu_ps(aPtr + 24); + + x0Val = _mm256_loadu_ps(bPtr); // t0|t1|t2|t3|t4|t5|t6|t7 + x1Val = _mm256_loadu_ps(bPtr + 8); + x0loVal = _mm256_unpacklo_ps(x0Val, x0Val); // t0|t0|t1|t1|t4|t4|t5|t5 + x0hiVal = _mm256_unpackhi_ps(x0Val, x0Val); // t2|t2|t3|t3|t6|t6|t7|t7 + x1loVal = _mm256_unpacklo_ps(x1Val, x1Val); + x1hiVal = _mm256_unpackhi_ps(x1Val, x1Val); + + // TODO: it may be possible to rearrange swizzling to better pipeline data + b0Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x20); // t0|t0|t1|t1|t2|t2|t3|t3 + b1Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x31); // t4|t4|t5|t5|t6|t6|t7|t7 + b2Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x20); + b3Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x31); + + c0Val = _mm256_mul_ps(a0Val, b0Val); + c1Val = _mm256_mul_ps(a1Val, b1Val); + c2Val = _mm256_mul_ps(a2Val, b2Val); + c3Val = _mm256_mul_ps(a3Val, b3Val); + + dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0); + dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1); + dotProdVal2 = _mm256_add_ps(c2Val, dotProdVal2); + dotProdVal3 = _mm256_add_ps(c3Val, dotProdVal3); + + aPtr += 32; + bPtr += 16; + } + + dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1); + dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2); + dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3); + + __VOLK_ATTR_ALIGNED(32) float dotProductVector[8]; + + _mm256_store_ps(dotProductVector, + dotProdVal0); // Store the results back into the dot product vector + + *realpt = dotProductVector[0]; + *imagpt = dotProductVector[1]; + *realpt += dotProductVector[2]; + *imagpt += dotProductVector[3]; + *realpt += dotProductVector[4]; + *imagpt += dotProductVector[5]; + *realpt += dotProductVector[6]; + *imagpt += dotProductVector[7]; + + number = sixteenthPoints * 16; + for (; number < num_points; number++) { + *realpt += ((*aPtr++) * (*bPtr)); + *imagpt += ((*aPtr++) * (*bPtr++)); + } + + *result = *(lv_32fc_t*)(&res[0]); } #endif /*LV_HAVE_AVX*/ #ifdef LV_HAVE_NEON #include -static inline void volk_32fc_32f_dot_prod_32fc_neon_unroll ( lv_32fc_t* __restrict result, const lv_32fc_t* __restrict input, const float* __restrict taps, unsigned int num_points) { - - unsigned int number; - const unsigned int quarterPoints = num_points / 8; - - float res[2]; - float *realpt = &res[0], *imagpt = &res[1]; - const float* inputPtr = (float*)input; - const float* tapsPtr = taps; - float zero[4] = {0.0f, 0.0f, 0.0f, 0.0f }; - float accVector_real[4]; - float accVector_imag[4]; - - float32x4x2_t inputVector0, inputVector1; - float32x4_t tapsVector0, tapsVector1; - float32x4_t tmp_real0, tmp_imag0; - float32x4_t tmp_real1, tmp_imag1; - float32x4_t real_accumulator0, imag_accumulator0; - float32x4_t real_accumulator1, imag_accumulator1; - - // zero out accumulators - // take a *float, return float32x4_t - real_accumulator0 = vld1q_f32( zero ); - imag_accumulator0 = vld1q_f32( zero ); - real_accumulator1 = vld1q_f32( zero ); - imag_accumulator1 = vld1q_f32( zero ); - - for(number=0 ;number < quarterPoints; number++){ - // load doublewords and duplicate in to second lane - tapsVector0 = vld1q_f32(tapsPtr ); - tapsVector1 = vld1q_f32(tapsPtr+4 ); - - // load quadword of complex numbers in to 2 lanes. 
1st lane is real, 2dn imag - inputVector0 = vld2q_f32(inputPtr ); - inputVector1 = vld2q_f32(inputPtr+8 ); - // inputVector is now a struct of two vectors, 0th is real, 1st is imag - - tmp_real0 = vmulq_f32(tapsVector0, inputVector0.val[0]); - tmp_imag0 = vmulq_f32(tapsVector0, inputVector0.val[1]); - - tmp_real1 = vmulq_f32(tapsVector1, inputVector1.val[0]); - tmp_imag1 = vmulq_f32(tapsVector1, inputVector1.val[1]); - - real_accumulator0 = vaddq_f32(real_accumulator0, tmp_real0); - imag_accumulator0 = vaddq_f32(imag_accumulator0, tmp_imag0); - - real_accumulator1 = vaddq_f32(real_accumulator1, tmp_real1); - imag_accumulator1 = vaddq_f32(imag_accumulator1, tmp_imag1); - - tapsPtr += 8; - inputPtr += 16; - } - - real_accumulator0 = vaddq_f32( real_accumulator0, real_accumulator1); - imag_accumulator0 = vaddq_f32( imag_accumulator0, imag_accumulator1); - // void vst1q_f32( float32_t * ptr, float32x4_t val); - // store results back to a complex (array of 2 floats) - vst1q_f32(accVector_real, real_accumulator0); - vst1q_f32(accVector_imag, imag_accumulator0); - *realpt = accVector_real[0] + accVector_real[1] + - accVector_real[2] + accVector_real[3] ; - - *imagpt = accVector_imag[0] + accVector_imag[1] + - accVector_imag[2] + accVector_imag[3] ; - - // clean up the remainder - for(number=quarterPoints*8; number < num_points; number++){ - *realpt += ((*inputPtr++) * (*tapsPtr)); - *imagpt += ((*inputPtr++) * (*tapsPtr++)); - } - - *result = *(lv_32fc_t*)(&res[0]); +static inline void +volk_32fc_32f_dot_prod_32fc_neon_unroll(lv_32fc_t* __restrict result, + const lv_32fc_t* __restrict input, + const float* __restrict taps, + unsigned int num_points) +{ + + unsigned int number; + const unsigned int quarterPoints = num_points / 8; + + float res[2]; + float *realpt = &res[0], *imagpt = &res[1]; + const float* inputPtr = (float*)input; + const float* tapsPtr = taps; + float zero[4] = { 0.0f, 0.0f, 0.0f, 0.0f }; + float accVector_real[4]; + float accVector_imag[4]; + + float32x4x2_t inputVector0, inputVector1; + float32x4_t tapsVector0, tapsVector1; + float32x4_t tmp_real0, tmp_imag0; + float32x4_t tmp_real1, tmp_imag1; + float32x4_t real_accumulator0, imag_accumulator0; + float32x4_t real_accumulator1, imag_accumulator1; + + // zero out accumulators + // take a *float, return float32x4_t + real_accumulator0 = vld1q_f32(zero); + imag_accumulator0 = vld1q_f32(zero); + real_accumulator1 = vld1q_f32(zero); + imag_accumulator1 = vld1q_f32(zero); + + for (number = 0; number < quarterPoints; number++) { + // load doublewords and duplicate in to second lane + tapsVector0 = vld1q_f32(tapsPtr); + tapsVector1 = vld1q_f32(tapsPtr + 4); + + // load quadword of complex numbers in to 2 lanes. 
1st lane is real, 2dn imag + inputVector0 = vld2q_f32(inputPtr); + inputVector1 = vld2q_f32(inputPtr + 8); + // inputVector is now a struct of two vectors, 0th is real, 1st is imag + + tmp_real0 = vmulq_f32(tapsVector0, inputVector0.val[0]); + tmp_imag0 = vmulq_f32(tapsVector0, inputVector0.val[1]); + + tmp_real1 = vmulq_f32(tapsVector1, inputVector1.val[0]); + tmp_imag1 = vmulq_f32(tapsVector1, inputVector1.val[1]); + + real_accumulator0 = vaddq_f32(real_accumulator0, tmp_real0); + imag_accumulator0 = vaddq_f32(imag_accumulator0, tmp_imag0); + + real_accumulator1 = vaddq_f32(real_accumulator1, tmp_real1); + imag_accumulator1 = vaddq_f32(imag_accumulator1, tmp_imag1); + + tapsPtr += 8; + inputPtr += 16; + } + + real_accumulator0 = vaddq_f32(real_accumulator0, real_accumulator1); + imag_accumulator0 = vaddq_f32(imag_accumulator0, imag_accumulator1); + // void vst1q_f32( float32_t * ptr, float32x4_t val); + // store results back to a complex (array of 2 floats) + vst1q_f32(accVector_real, real_accumulator0); + vst1q_f32(accVector_imag, imag_accumulator0); + *realpt = + accVector_real[0] + accVector_real[1] + accVector_real[2] + accVector_real[3]; + + *imagpt = + accVector_imag[0] + accVector_imag[1] + accVector_imag[2] + accVector_imag[3]; + + // clean up the remainder + for (number = quarterPoints * 8; number < num_points; number++) { + *realpt += ((*inputPtr++) * (*tapsPtr)); + *imagpt += ((*inputPtr++) * (*tapsPtr++)); + } + + *result = *(lv_32fc_t*)(&res[0]); } #endif /*LV_HAVE_NEON*/ @@ -582,154 +614,171 @@ static inline void volk_32fc_32f_dot_prod_32fc_neon_unroll ( lv_32fc_t* __restri #ifdef LV_HAVE_NEON #include -static inline void volk_32fc_32f_dot_prod_32fc_a_neon ( lv_32fc_t* __restrict result, const lv_32fc_t* __restrict input, const float* __restrict taps, unsigned int num_points) { - - unsigned int number; - const unsigned int quarterPoints = num_points / 4; +static inline void volk_32fc_32f_dot_prod_32fc_a_neon(lv_32fc_t* __restrict result, + const lv_32fc_t* __restrict input, + const float* __restrict taps, + unsigned int num_points) +{ - float res[2]; - float *realpt = &res[0], *imagpt = &res[1]; - const float* inputPtr = (float*)input; - const float* tapsPtr = taps; - float zero[4] = {0.0f, 0.0f, 0.0f, 0.0f }; - float accVector_real[4]; - float accVector_imag[4]; + unsigned int number; + const unsigned int quarterPoints = num_points / 4; - float32x4x2_t inputVector; - float32x4_t tapsVector; - float32x4_t tmp_real, tmp_imag; - float32x4_t real_accumulator, imag_accumulator; + float res[2]; + float *realpt = &res[0], *imagpt = &res[1]; + const float* inputPtr = (float*)input; + const float* tapsPtr = taps; + float zero[4] = { 0.0f, 0.0f, 0.0f, 0.0f }; + float accVector_real[4]; + float accVector_imag[4]; + float32x4x2_t inputVector; + float32x4_t tapsVector; + float32x4_t tmp_real, tmp_imag; + float32x4_t real_accumulator, imag_accumulator; - // zero out accumulators - // take a *float, return float32x4_t - real_accumulator = vld1q_f32( zero ); - imag_accumulator = vld1q_f32( zero ); - for(number=0 ;number < quarterPoints; number++){ - // load taps ( float32x2x2_t = vld1q_f32( float32_t const * ptr) ) - // load doublewords and duplicate in to second lane - tapsVector = vld1q_f32(tapsPtr ); + // zero out accumulators + // take a *float, return float32x4_t + real_accumulator = vld1q_f32(zero); + imag_accumulator = vld1q_f32(zero); - // load quadword of complex numbers in to 2 lanes. 
1st lane is real, 2dn imag - inputVector = vld2q_f32(inputPtr ); + for (number = 0; number < quarterPoints; number++) { + // load taps ( float32x2x2_t = vld1q_f32( float32_t const * ptr) ) + // load doublewords and duplicate in to second lane + tapsVector = vld1q_f32(tapsPtr); - tmp_real = vmulq_f32(tapsVector, inputVector.val[0]); - tmp_imag = vmulq_f32(tapsVector, inputVector.val[1]); + // load quadword of complex numbers in to 2 lanes. 1st lane is real, 2dn imag + inputVector = vld2q_f32(inputPtr); - real_accumulator = vaddq_f32(real_accumulator, tmp_real); - imag_accumulator = vaddq_f32(imag_accumulator, tmp_imag); + tmp_real = vmulq_f32(tapsVector, inputVector.val[0]); + tmp_imag = vmulq_f32(tapsVector, inputVector.val[1]); + real_accumulator = vaddq_f32(real_accumulator, tmp_real); + imag_accumulator = vaddq_f32(imag_accumulator, tmp_imag); - tapsPtr += 4; - inputPtr += 8; - } + tapsPtr += 4; + inputPtr += 8; + } - // store results back to a complex (array of 2 floats) - vst1q_f32(accVector_real, real_accumulator); - vst1q_f32(accVector_imag, imag_accumulator); - *realpt = accVector_real[0] + accVector_real[1] + - accVector_real[2] + accVector_real[3] ; + // store results back to a complex (array of 2 floats) + vst1q_f32(accVector_real, real_accumulator); + vst1q_f32(accVector_imag, imag_accumulator); + *realpt = + accVector_real[0] + accVector_real[1] + accVector_real[2] + accVector_real[3]; - *imagpt = accVector_imag[0] + accVector_imag[1] + - accVector_imag[2] + accVector_imag[3] ; + *imagpt = + accVector_imag[0] + accVector_imag[1] + accVector_imag[2] + accVector_imag[3]; - // clean up the remainder - for(number=quarterPoints*4; number < num_points; number++){ - *realpt += ((*inputPtr++) * (*tapsPtr)); - *imagpt += ((*inputPtr++) * (*tapsPtr++)); - } + // clean up the remainder + for (number = quarterPoints * 4; number < num_points; number++) { + *realpt += ((*inputPtr++) * (*tapsPtr)); + *imagpt += ((*inputPtr++) * (*tapsPtr++)); + } - *result = *(lv_32fc_t*)(&res[0]); + *result = *(lv_32fc_t*)(&res[0]); } #endif /*LV_HAVE_NEON*/ #ifdef LV_HAVE_NEONV7 -extern void volk_32fc_32f_dot_prod_32fc_a_neonasm ( lv_32fc_t* result, const lv_32fc_t* input, const float* taps, unsigned int num_points); +extern void volk_32fc_32f_dot_prod_32fc_a_neonasm(lv_32fc_t* result, + const lv_32fc_t* input, + const float* taps, + unsigned int num_points); #endif /*LV_HAVE_NEONV7*/ #ifdef LV_HAVE_NEONV7 -extern void volk_32fc_32f_dot_prod_32fc_a_neonasmvmla ( lv_32fc_t* result, const lv_32fc_t* input, const float* taps, unsigned int num_points); +extern void volk_32fc_32f_dot_prod_32fc_a_neonasmvmla(lv_32fc_t* result, + const lv_32fc_t* input, + const float* taps, + unsigned int num_points); #endif /*LV_HAVE_NEONV7*/ #ifdef LV_HAVE_NEONV7 -extern void volk_32fc_32f_dot_prod_32fc_a_neonpipeline ( lv_32fc_t* result, const lv_32fc_t* input, const float* taps, unsigned int num_points); +extern void volk_32fc_32f_dot_prod_32fc_a_neonpipeline(lv_32fc_t* result, + const lv_32fc_t* input, + const float* taps, + unsigned int num_points); #endif /*LV_HAVE_NEONV7*/ #ifdef LV_HAVE_SSE -static inline void volk_32fc_32f_dot_prod_32fc_u_sse( lv_32fc_t* result, const lv_32fc_t* input, const float* taps, unsigned int num_points) { - - unsigned int number = 0; - const unsigned int sixteenthPoints = num_points / 8; - - float res[2]; - float *realpt = &res[0], *imagpt = &res[1]; - const float* aPtr = (float*)input; - const float* bPtr = taps; - - __m128 a0Val, a1Val, a2Val, a3Val; - __m128 b0Val, b1Val, b2Val, b3Val; - 
__m128 x0Val, x1Val, x2Val, x3Val; - __m128 c0Val, c1Val, c2Val, c3Val; - - __m128 dotProdVal0 = _mm_setzero_ps(); - __m128 dotProdVal1 = _mm_setzero_ps(); - __m128 dotProdVal2 = _mm_setzero_ps(); - __m128 dotProdVal3 = _mm_setzero_ps(); - - for(;number < sixteenthPoints; number++){ - - a0Val = _mm_loadu_ps(aPtr); - a1Val = _mm_loadu_ps(aPtr+4); - a2Val = _mm_loadu_ps(aPtr+8); - a3Val = _mm_loadu_ps(aPtr+12); - - x0Val = _mm_loadu_ps(bPtr); - x1Val = _mm_loadu_ps(bPtr); - x2Val = _mm_loadu_ps(bPtr+4); - x3Val = _mm_loadu_ps(bPtr+4); - b0Val = _mm_unpacklo_ps(x0Val, x1Val); - b1Val = _mm_unpackhi_ps(x0Val, x1Val); - b2Val = _mm_unpacklo_ps(x2Val, x3Val); - b3Val = _mm_unpackhi_ps(x2Val, x3Val); - - c0Val = _mm_mul_ps(a0Val, b0Val); - c1Val = _mm_mul_ps(a1Val, b1Val); - c2Val = _mm_mul_ps(a2Val, b2Val); - c3Val = _mm_mul_ps(a3Val, b3Val); - - dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0); - dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1); - dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2); - dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3); - - aPtr += 16; - bPtr += 8; - } - - dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1); - dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2); - dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3); - - __VOLK_ATTR_ALIGNED(16) float dotProductVector[4]; - - _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector - - *realpt = dotProductVector[0]; - *imagpt = dotProductVector[1]; - *realpt += dotProductVector[2]; - *imagpt += dotProductVector[3]; - - number = sixteenthPoints*8; - for(;number < num_points; number++){ - *realpt += ((*aPtr++) * (*bPtr)); - *imagpt += ((*aPtr++) * (*bPtr++)); - } - - *result = *(lv_32fc_t*)(&res[0]); +static inline void volk_32fc_32f_dot_prod_32fc_u_sse(lv_32fc_t* result, + const lv_32fc_t* input, + const float* taps, + unsigned int num_points) +{ + + unsigned int number = 0; + const unsigned int sixteenthPoints = num_points / 8; + + float res[2]; + float *realpt = &res[0], *imagpt = &res[1]; + const float* aPtr = (float*)input; + const float* bPtr = taps; + + __m128 a0Val, a1Val, a2Val, a3Val; + __m128 b0Val, b1Val, b2Val, b3Val; + __m128 x0Val, x1Val, x2Val, x3Val; + __m128 c0Val, c1Val, c2Val, c3Val; + + __m128 dotProdVal0 = _mm_setzero_ps(); + __m128 dotProdVal1 = _mm_setzero_ps(); + __m128 dotProdVal2 = _mm_setzero_ps(); + __m128 dotProdVal3 = _mm_setzero_ps(); + + for (; number < sixteenthPoints; number++) { + + a0Val = _mm_loadu_ps(aPtr); + a1Val = _mm_loadu_ps(aPtr + 4); + a2Val = _mm_loadu_ps(aPtr + 8); + a3Val = _mm_loadu_ps(aPtr + 12); + + x0Val = _mm_loadu_ps(bPtr); + x1Val = _mm_loadu_ps(bPtr); + x2Val = _mm_loadu_ps(bPtr + 4); + x3Val = _mm_loadu_ps(bPtr + 4); + b0Val = _mm_unpacklo_ps(x0Val, x1Val); + b1Val = _mm_unpackhi_ps(x0Val, x1Val); + b2Val = _mm_unpacklo_ps(x2Val, x3Val); + b3Val = _mm_unpackhi_ps(x2Val, x3Val); + + c0Val = _mm_mul_ps(a0Val, b0Val); + c1Val = _mm_mul_ps(a1Val, b1Val); + c2Val = _mm_mul_ps(a2Val, b2Val); + c3Val = _mm_mul_ps(a3Val, b3Val); + + dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0); + dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1); + dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2); + dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3); + + aPtr += 16; + bPtr += 8; + } + + dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1); + dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2); + dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3); + + __VOLK_ATTR_ALIGNED(16) float dotProductVector[4]; + + _mm_store_ps(dotProductVector, + dotProdVal0); // Store the results back into the 
dot product vector + + *realpt = dotProductVector[0]; + *imagpt = dotProductVector[1]; + *realpt += dotProductVector[2]; + *imagpt += dotProductVector[3]; + + number = sixteenthPoints * 8; + for (; number < num_points; number++) { + *realpt += ((*aPtr++) * (*bPtr)); + *imagpt += ((*aPtr++) * (*bPtr++)); + } + + *result = *(lv_32fc_t*)(&res[0]); } #endif /*LV_HAVE_SSE*/ diff --git a/kernels/volk/volk_32fc_32f_multiply_32fc.h b/kernels/volk/volk_32fc_32f_multiply_32fc.h index b47883f..196ba9a 100644 --- a/kernels/volk/volk_32fc_32f_multiply_32fc.h +++ b/kernels/volk/volk_32fc_32f_multiply_32fc.h @@ -30,8 +30,8 @@ * * Dispatcher Prototype * \code - * void volk_32fc_32f_multiply_32fc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const float* bVector, unsigned int num_points); - * \endcode + * void volk_32fc_32f_multiply_32fc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const + * float* bVector, unsigned int num_points); \endcode * * \b Inputs * \li aVector: The input vector of complex floats. @@ -61,52 +61,55 @@ #ifdef LV_HAVE_AVX #include -static inline void -volk_32fc_32f_multiply_32fc_a_avx(lv_32fc_t* cVector, const lv_32fc_t* aVector, - const float* bVector, unsigned int num_points) +static inline void volk_32fc_32f_multiply_32fc_a_avx(lv_32fc_t* cVector, + const lv_32fc_t* aVector, + const float* bVector, + unsigned int num_points) { - unsigned int number = 0; - const unsigned int eighthPoints = num_points / 8; + unsigned int number = 0; + const unsigned int eighthPoints = num_points / 8; - lv_32fc_t* cPtr = cVector; - const lv_32fc_t* aPtr = aVector; - const float* bPtr= bVector; + lv_32fc_t* cPtr = cVector; + const lv_32fc_t* aPtr = aVector; + const float* bPtr = bVector; - __m256 aVal1, aVal2, bVal, bVal1, bVal2, cVal1, cVal2; + __m256 aVal1, aVal2, bVal, bVal1, bVal2, cVal1, cVal2; - __m256i permute_mask = _mm256_set_epi32(3, 3, 2, 2, 1, 1, 0, 0); + __m256i permute_mask = _mm256_set_epi32(3, 3, 2, 2, 1, 1, 0, 0); - for(;number < eighthPoints; number++){ + for (; number < eighthPoints; number++) { - aVal1 = _mm256_load_ps((float *)aPtr); - aPtr += 4; + aVal1 = _mm256_load_ps((float*)aPtr); + aPtr += 4; - aVal2 = _mm256_load_ps((float *)aPtr); - aPtr += 4; + aVal2 = _mm256_load_ps((float*)aPtr); + aPtr += 4; - bVal = _mm256_load_ps(bPtr); // b0|b1|b2|b3|b4|b5|b6|b7 - bPtr += 8; + bVal = _mm256_load_ps(bPtr); // b0|b1|b2|b3|b4|b5|b6|b7 + bPtr += 8; - bVal1 = _mm256_permute2f128_ps(bVal, bVal, 0x00); // b0|b1|b2|b3|b0|b1|b2|b3 - bVal2 = _mm256_permute2f128_ps(bVal, bVal, 0x11); // b4|b5|b6|b7|b4|b5|b6|b7 + bVal1 = _mm256_permute2f128_ps(bVal, bVal, 0x00); // b0|b1|b2|b3|b0|b1|b2|b3 + bVal2 = _mm256_permute2f128_ps(bVal, bVal, 0x11); // b4|b5|b6|b7|b4|b5|b6|b7 - bVal1 = _mm256_permutevar_ps(bVal1, permute_mask); // b0|b0|b1|b1|b2|b2|b3|b3 - bVal2 = _mm256_permutevar_ps(bVal2, permute_mask); // b4|b4|b5|b5|b6|b6|b7|b7 + bVal1 = _mm256_permutevar_ps(bVal1, permute_mask); // b0|b0|b1|b1|b2|b2|b3|b3 + bVal2 = _mm256_permutevar_ps(bVal2, permute_mask); // b4|b4|b5|b5|b6|b6|b7|b7 - cVal1 = _mm256_mul_ps(aVal1, bVal1); - cVal2 = _mm256_mul_ps(aVal2, bVal2); + cVal1 = _mm256_mul_ps(aVal1, bVal1); + cVal2 = _mm256_mul_ps(aVal2, bVal2); - _mm256_store_ps((float*)cPtr,cVal1); // Store the results back into the C container - cPtr += 4; + _mm256_store_ps((float*)cPtr, + cVal1); // Store the results back into the C container + cPtr += 4; - _mm256_store_ps((float*)cPtr,cVal2); // Store the results back into the C container - cPtr += 4; - } + _mm256_store_ps((float*)cPtr, + cVal2); // Store the results 
back into the C container + cPtr += 4; + } - number = eighthPoints * 8; - for(;number < num_points; ++number){ - *cPtr++ = (*aPtr++) * (*bPtr++); - } + number = eighthPoints * 8; + for (; number < num_points; ++number) { + *cPtr++ = (*aPtr++) * (*bPtr++); + } } #endif /* LV_HAVE_AVX */ @@ -114,67 +117,69 @@ volk_32fc_32f_multiply_32fc_a_avx(lv_32fc_t* cVector, const lv_32fc_t* aVector, #ifdef LV_HAVE_SSE #include -static inline void -volk_32fc_32f_multiply_32fc_a_sse(lv_32fc_t* cVector, const lv_32fc_t* aVector, - const float* bVector, unsigned int num_points) +static inline void volk_32fc_32f_multiply_32fc_a_sse(lv_32fc_t* cVector, + const lv_32fc_t* aVector, + const float* bVector, + unsigned int num_points) { - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; - lv_32fc_t* cPtr = cVector; - const lv_32fc_t* aPtr = aVector; - const float* bPtr= bVector; + lv_32fc_t* cPtr = cVector; + const lv_32fc_t* aPtr = aVector; + const float* bPtr = bVector; - __m128 aVal1, aVal2, bVal, bVal1, bVal2, cVal; - for(;number < quarterPoints; number++){ + __m128 aVal1, aVal2, bVal, bVal1, bVal2, cVal; + for (; number < quarterPoints; number++) { - aVal1 = _mm_load_ps((const float*)aPtr); - aPtr += 2; + aVal1 = _mm_load_ps((const float*)aPtr); + aPtr += 2; - aVal2 = _mm_load_ps((const float*)aPtr); - aPtr += 2; + aVal2 = _mm_load_ps((const float*)aPtr); + aPtr += 2; - bVal = _mm_load_ps(bPtr); - bPtr += 4; + bVal = _mm_load_ps(bPtr); + bPtr += 4; - bVal1 = _mm_shuffle_ps(bVal, bVal, _MM_SHUFFLE(1,1,0,0)); - bVal2 = _mm_shuffle_ps(bVal, bVal, _MM_SHUFFLE(3,3,2,2)); + bVal1 = _mm_shuffle_ps(bVal, bVal, _MM_SHUFFLE(1, 1, 0, 0)); + bVal2 = _mm_shuffle_ps(bVal, bVal, _MM_SHUFFLE(3, 3, 2, 2)); - cVal = _mm_mul_ps(aVal1, bVal1); + cVal = _mm_mul_ps(aVal1, bVal1); - _mm_store_ps((float*)cPtr,cVal); // Store the results back into the C container - cPtr += 2; + _mm_store_ps((float*)cPtr, cVal); // Store the results back into the C container + cPtr += 2; - cVal = _mm_mul_ps(aVal2, bVal2); + cVal = _mm_mul_ps(aVal2, bVal2); - _mm_store_ps((float*)cPtr,cVal); // Store the results back into the C container + _mm_store_ps((float*)cPtr, cVal); // Store the results back into the C container - cPtr += 2; - } + cPtr += 2; + } - number = quarterPoints * 4; - for(;number < num_points; number++){ - *cPtr++ = (*aPtr++) * (*bPtr); - bPtr++; - } + number = quarterPoints * 4; + for (; number < num_points; number++) { + *cPtr++ = (*aPtr++) * (*bPtr); + bPtr++; + } } #endif /* LV_HAVE_SSE */ #ifdef LV_HAVE_GENERIC -static inline void -volk_32fc_32f_multiply_32fc_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, - const float* bVector, unsigned int num_points) +static inline void volk_32fc_32f_multiply_32fc_generic(lv_32fc_t* cVector, + const lv_32fc_t* aVector, + const float* bVector, + unsigned int num_points) { - lv_32fc_t* cPtr = cVector; - const lv_32fc_t* aPtr = aVector; - const float* bPtr= bVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - *cPtr++ = (*aPtr++) * (*bPtr++); - } + lv_32fc_t* cPtr = cVector; + const lv_32fc_t* aPtr = aVector; + const float* bPtr = bVector; + unsigned int number = 0; + + for (number = 0; number < num_points; number++) { + *cPtr++ = (*aPtr++) * (*bPtr++); + } } #endif /* LV_HAVE_GENERIC */ @@ -182,49 +187,52 @@ volk_32fc_32f_multiply_32fc_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector #ifdef LV_HAVE_NEON #include -static inline void 
-volk_32fc_32f_multiply_32fc_neon(lv_32fc_t* cVector, const lv_32fc_t* aVector, - const float* bVector, unsigned int num_points) +static inline void volk_32fc_32f_multiply_32fc_neon(lv_32fc_t* cVector, + const lv_32fc_t* aVector, + const float* bVector, + unsigned int num_points) { - lv_32fc_t* cPtr = cVector; - const lv_32fc_t* aPtr = aVector; - const float* bPtr= bVector; - unsigned int number = 0; - unsigned int quarter_points = num_points / 4; - - float32x4x2_t inputVector, outputVector; - float32x4_t tapsVector; - for(number = 0; number < quarter_points; number++){ - inputVector = vld2q_f32((float*)aPtr); - tapsVector = vld1q_f32(bPtr); - - outputVector.val[0] = vmulq_f32(inputVector.val[0], tapsVector); - outputVector.val[1] = vmulq_f32(inputVector.val[1], tapsVector); - - vst2q_f32((float*)cPtr, outputVector); - aPtr += 4; - bPtr += 4; - cPtr += 4; - } - - for(number = quarter_points * 4; number < num_points; number++){ - *cPtr++ = (*aPtr++) * (*bPtr++); - } + lv_32fc_t* cPtr = cVector; + const lv_32fc_t* aPtr = aVector; + const float* bPtr = bVector; + unsigned int number = 0; + unsigned int quarter_points = num_points / 4; + + float32x4x2_t inputVector, outputVector; + float32x4_t tapsVector; + for (number = 0; number < quarter_points; number++) { + inputVector = vld2q_f32((float*)aPtr); + tapsVector = vld1q_f32(bPtr); + + outputVector.val[0] = vmulq_f32(inputVector.val[0], tapsVector); + outputVector.val[1] = vmulq_f32(inputVector.val[1], tapsVector); + + vst2q_f32((float*)cPtr, outputVector); + aPtr += 4; + bPtr += 4; + cPtr += 4; + } + + for (number = quarter_points * 4; number < num_points; number++) { + *cPtr++ = (*aPtr++) * (*bPtr++); + } } #endif /* LV_HAVE_NEON */ #ifdef LV_HAVE_ORC -extern void -volk_32fc_32f_multiply_32fc_a_orc_impl(lv_32fc_t* cVector, const lv_32fc_t* aVector, - const float* bVector, unsigned int num_points); +extern void volk_32fc_32f_multiply_32fc_a_orc_impl(lv_32fc_t* cVector, + const lv_32fc_t* aVector, + const float* bVector, + unsigned int num_points); -static inline void -volk_32fc_32f_multiply_32fc_u_orc(lv_32fc_t* cVector, const lv_32fc_t* aVector, - const float* bVector, unsigned int num_points) +static inline void volk_32fc_32f_multiply_32fc_u_orc(lv_32fc_t* cVector, + const lv_32fc_t* aVector, + const float* bVector, + unsigned int num_points) { - volk_32fc_32f_multiply_32fc_a_orc_impl(cVector, aVector, bVector, num_points); + volk_32fc_32f_multiply_32fc_a_orc_impl(cVector, aVector, bVector, num_points); } #endif /* LV_HAVE_GENERIC */ diff --git a/kernels/volk/volk_32fc_conjugate_32fc.h b/kernels/volk/volk_32fc_conjugate_32fc.h index 6994d0e..9195e3a 100644 --- a/kernels/volk/volk_32fc_conjugate_32fc.h +++ b/kernels/volk/volk_32fc_conjugate_32fc.h @@ -29,8 +29,8 @@ * * Dispatcher Prototype * \code - * void volk_32fc_conjugate_32fc(lv_32fc_t* cVector, const lv_32fc_t* aVector, unsigned int num_points) - * \endcode + * void volk_32fc_conjugate_32fc(lv_32fc_t* cVector, const lv_32fc_t* aVector, unsigned + * int num_points) \endcode * * \b Inputs * \li aVector: The input vector of complex floats. 
@@ -68,91 +68,94 @@ #ifndef INCLUDED_volk_32fc_conjugate_32fc_u_H #define INCLUDED_volk_32fc_conjugate_32fc_u_H +#include #include #include #include -#include #ifdef LV_HAVE_AVX #include -static inline void -volk_32fc_conjugate_32fc_u_avx(lv_32fc_t* cVector, const lv_32fc_t* aVector, unsigned int num_points) +static inline void volk_32fc_conjugate_32fc_u_avx(lv_32fc_t* cVector, + const lv_32fc_t* aVector, + unsigned int num_points) { - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; - __m256 x; - lv_32fc_t* c = cVector; - const lv_32fc_t* a = aVector; + __m256 x; + lv_32fc_t* c = cVector; + const lv_32fc_t* a = aVector; - __m256 conjugator = _mm256_setr_ps(0, -0.f, 0, -0.f, 0, -0.f, 0, -0.f); + __m256 conjugator = _mm256_setr_ps(0, -0.f, 0, -0.f, 0, -0.f, 0, -0.f); - for(;number < quarterPoints; number++){ + for (; number < quarterPoints; number++) { - x = _mm256_loadu_ps((float*)a); // Load the complex data as ar,ai,br,bi + x = _mm256_loadu_ps((float*)a); // Load the complex data as ar,ai,br,bi - x = _mm256_xor_ps(x, conjugator); // conjugate register + x = _mm256_xor_ps(x, conjugator); // conjugate register - _mm256_storeu_ps((float*)c,x); // Store the results back into the C container + _mm256_storeu_ps((float*)c, x); // Store the results back into the C container - a += 4; - c += 4; - } + a += 4; + c += 4; + } - number = quarterPoints * 4; + number = quarterPoints * 4; - for(;number < num_points; number++) { - *c++ = lv_conj(*a++); - } + for (; number < num_points; number++) { + *c++ = lv_conj(*a++); + } } #endif /* LV_HAVE_AVX */ #ifdef LV_HAVE_SSE3 #include -static inline void -volk_32fc_conjugate_32fc_u_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, unsigned int num_points) +static inline void volk_32fc_conjugate_32fc_u_sse3(lv_32fc_t* cVector, + const lv_32fc_t* aVector, + unsigned int num_points) { - unsigned int number = 0; - const unsigned int halfPoints = num_points / 2; + unsigned int number = 0; + const unsigned int halfPoints = num_points / 2; - __m128 x; - lv_32fc_t* c = cVector; - const lv_32fc_t* a = aVector; + __m128 x; + lv_32fc_t* c = cVector; + const lv_32fc_t* a = aVector; - __m128 conjugator = _mm_setr_ps(0, -0.f, 0, -0.f); + __m128 conjugator = _mm_setr_ps(0, -0.f, 0, -0.f); - for(;number < halfPoints; number++){ + for (; number < halfPoints; number++) { - x = _mm_loadu_ps((float*)a); // Load the complex data as ar,ai,br,bi + x = _mm_loadu_ps((float*)a); // Load the complex data as ar,ai,br,bi - x = _mm_xor_ps(x, conjugator); // conjugate register + x = _mm_xor_ps(x, conjugator); // conjugate register - _mm_storeu_ps((float*)c,x); // Store the results back into the C container + _mm_storeu_ps((float*)c, x); // Store the results back into the C container - a += 2; - c += 2; - } + a += 2; + c += 2; + } - if((num_points % 2) != 0) { - *c = lv_conj(*a); - } + if ((num_points % 2) != 0) { + *c = lv_conj(*a); + } } #endif /* LV_HAVE_SSE3 */ #ifdef LV_HAVE_GENERIC -static inline void -volk_32fc_conjugate_32fc_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, unsigned int num_points) +static inline void volk_32fc_conjugate_32fc_generic(lv_32fc_t* cVector, + const lv_32fc_t* aVector, + unsigned int num_points) { - lv_32fc_t* cPtr = cVector; - const lv_32fc_t* aPtr = aVector; - unsigned int number = 0; + lv_32fc_t* cPtr = cVector; + const lv_32fc_t* aPtr = aVector; + unsigned int number = 0; - for(number = 0; number < num_points; number++){ - *cPtr++ = 
lv_conj(*aPtr++); - } + for (number = 0; number < num_points; number++) { + *cPtr++ = lv_conj(*aPtr++); + } } #endif /* LV_HAVE_GENERIC */ @@ -161,124 +164,128 @@ volk_32fc_conjugate_32fc_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, u #ifndef INCLUDED_volk_32fc_conjugate_32fc_a_H #define INCLUDED_volk_32fc_conjugate_32fc_a_H +#include #include #include #include -#include #ifdef LV_HAVE_AVX #include -static inline void -volk_32fc_conjugate_32fc_a_avx(lv_32fc_t* cVector, const lv_32fc_t* aVector, unsigned int num_points) +static inline void volk_32fc_conjugate_32fc_a_avx(lv_32fc_t* cVector, + const lv_32fc_t* aVector, + unsigned int num_points) { - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; - __m256 x; - lv_32fc_t* c = cVector; - const lv_32fc_t* a = aVector; + __m256 x; + lv_32fc_t* c = cVector; + const lv_32fc_t* a = aVector; - __m256 conjugator = _mm256_setr_ps(0, -0.f, 0, -0.f, 0, -0.f, 0, -0.f); + __m256 conjugator = _mm256_setr_ps(0, -0.f, 0, -0.f, 0, -0.f, 0, -0.f); - for(;number < quarterPoints; number++){ + for (; number < quarterPoints; number++) { - x = _mm256_load_ps((float*)a); // Load the complex data as ar,ai,br,bi + x = _mm256_load_ps((float*)a); // Load the complex data as ar,ai,br,bi - x = _mm256_xor_ps(x, conjugator); // conjugate register + x = _mm256_xor_ps(x, conjugator); // conjugate register - _mm256_store_ps((float*)c,x); // Store the results back into the C container + _mm256_store_ps((float*)c, x); // Store the results back into the C container - a += 4; - c += 4; - } + a += 4; + c += 4; + } - number = quarterPoints * 4; + number = quarterPoints * 4; - for(;number < num_points; number++) { - *c++ = lv_conj(*a++); - } + for (; number < num_points; number++) { + *c++ = lv_conj(*a++); + } } #endif /* LV_HAVE_AVX */ #ifdef LV_HAVE_SSE3 #include -static inline void -volk_32fc_conjugate_32fc_a_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, unsigned int num_points) +static inline void volk_32fc_conjugate_32fc_a_sse3(lv_32fc_t* cVector, + const lv_32fc_t* aVector, + unsigned int num_points) { - unsigned int number = 0; - const unsigned int halfPoints = num_points / 2; + unsigned int number = 0; + const unsigned int halfPoints = num_points / 2; - __m128 x; - lv_32fc_t* c = cVector; - const lv_32fc_t* a = aVector; + __m128 x; + lv_32fc_t* c = cVector; + const lv_32fc_t* a = aVector; - __m128 conjugator = _mm_setr_ps(0, -0.f, 0, -0.f); + __m128 conjugator = _mm_setr_ps(0, -0.f, 0, -0.f); - for(;number < halfPoints; number++){ + for (; number < halfPoints; number++) { - x = _mm_load_ps((float*)a); // Load the complex data as ar,ai,br,bi + x = _mm_load_ps((float*)a); // Load the complex data as ar,ai,br,bi - x = _mm_xor_ps(x, conjugator); // conjugate register + x = _mm_xor_ps(x, conjugator); // conjugate register - _mm_store_ps((float*)c,x); // Store the results back into the C container + _mm_store_ps((float*)c, x); // Store the results back into the C container - a += 2; - c += 2; - } + a += 2; + c += 2; + } - if((num_points % 2) != 0) { - *c = lv_conj(*a); - } + if ((num_points % 2) != 0) { + *c = lv_conj(*a); + } } #endif /* LV_HAVE_SSE3 */ #ifdef LV_HAVE_NEON #include -static inline void -volk_32fc_conjugate_32fc_a_neon(lv_32fc_t* cVector, const lv_32fc_t* aVector, unsigned int num_points) +static inline void volk_32fc_conjugate_32fc_a_neon(lv_32fc_t* cVector, + const lv_32fc_t* aVector, + unsigned int num_points) { - unsigned int number; - const 
unsigned int quarterPoints = num_points / 4; + unsigned int number; + const unsigned int quarterPoints = num_points / 4; - float32x4x2_t x; - lv_32fc_t* c = cVector; - const lv_32fc_t* a = aVector; + float32x4x2_t x; + lv_32fc_t* c = cVector; + const lv_32fc_t* a = aVector; - for(number=0; number < quarterPoints; number++){ - __VOLK_PREFETCH(a+4); - x = vld2q_f32((float*)a); // Load the complex data as ar,br,cr,dr; ai,bi,ci,di + for (number = 0; number < quarterPoints; number++) { + __VOLK_PREFETCH(a + 4); + x = vld2q_f32((float*)a); // Load the complex data as ar,br,cr,dr; ai,bi,ci,di - // xor the imaginary lane - x.val[1] = vnegq_f32( x.val[1]); + // xor the imaginary lane + x.val[1] = vnegq_f32(x.val[1]); - vst2q_f32((float*)c,x); // Store the results back into the C container + vst2q_f32((float*)c, x); // Store the results back into the C container - a += 4; - c += 4; - } + a += 4; + c += 4; + } - for(number=quarterPoints*4; number < num_points; number++){ - *c++ = lv_conj(*a++); - } + for (number = quarterPoints * 4; number < num_points; number++) { + *c++ = lv_conj(*a++); + } } #endif /* LV_HAVE_NEON */ #ifdef LV_HAVE_GENERIC -static inline void -volk_32fc_conjugate_32fc_a_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, unsigned int num_points) +static inline void volk_32fc_conjugate_32fc_a_generic(lv_32fc_t* cVector, + const lv_32fc_t* aVector, + unsigned int num_points) { - lv_32fc_t* cPtr = cVector; - const lv_32fc_t* aPtr = aVector; - unsigned int number = 0; + lv_32fc_t* cPtr = cVector; + const lv_32fc_t* aPtr = aVector; + unsigned int number = 0; - for(number = 0; number < num_points; number++){ - *cPtr++ = lv_conj(*aPtr++); - } + for (number = 0; number < num_points; number++) { + *cPtr++ = lv_conj(*aPtr++); + } } #endif /* LV_HAVE_GENERIC */ diff --git a/kernels/volk/volk_32fc_convert_16ic.h b/kernels/volk/volk_32fc_convert_16ic.h index 0ba2383..5788158 100644 --- a/kernels/volk/volk_32fc_convert_16ic.h +++ b/kernels/volk/volk_32fc_convert_16ic.h @@ -31,8 +31,8 @@ * * Dispatcher Prototype * \code - * void volk_32fc_convert_16ic(lv_16sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points); - * \endcode + * void volk_32fc_convert_16ic(lv_16sc_t* outputVector, const lv_32fc_t* inputVector, + * unsigned int num_points); \endcode * * \b Inputs * \li inputVector: The complex 32-bit float input data buffer. 
@@ -46,14 +46,16 @@ #ifndef INCLUDED_volk_32fc_convert_16ic_a_H #define INCLUDED_volk_32fc_convert_16ic_a_H +#include "volk/volk_complex.h" #include #include -#include "volk/volk_complex.h" #ifdef LV_HAVE_AVX2 #include -static inline void volk_32fc_convert_16ic_a_avx2(lv_16sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points) +static inline void volk_32fc_convert_16ic_a_avx2(lv_16sc_t* outputVector, + const lv_32fc_t* inputVector, + unsigned int num_points) { const unsigned int avx_iters = num_points / 8; @@ -71,44 +73,44 @@ static inline void volk_32fc_convert_16ic_a_avx2(lv_16sc_t* outputVector, const const __m256 vmax_val = _mm256_set1_ps(max_val); unsigned int i; - for(i = 0; i < avx_iters; i++) - { - inputVal1 = _mm256_load_ps((float*)inputVectorPtr); - inputVectorPtr += 8; - inputVal2 = _mm256_load_ps((float*)inputVectorPtr); - inputVectorPtr += 8; - __VOLK_PREFETCH(inputVectorPtr + 16); - - // Clip - ret1 = _mm256_max_ps(_mm256_min_ps(inputVal1, vmax_val), vmin_val); - ret2 = _mm256_max_ps(_mm256_min_ps(inputVal2, vmax_val), vmin_val); - - intInputVal1 = _mm256_cvtps_epi32(ret1); - intInputVal2 = _mm256_cvtps_epi32(ret2); - - intInputVal1 = _mm256_packs_epi32(intInputVal1, intInputVal2); - intInputVal1 = _mm256_permute4x64_epi64(intInputVal1, 0xd8); - - _mm256_store_si256((__m256i*)outputVectorPtr, intInputVal1); - outputVectorPtr += 16; - } - - for(i = avx_iters * 16; i < num_points * 2; i++) - { - aux = *inputVectorPtr++; - if(aux > max_val) - aux = max_val; - else if(aux < min_val) - aux = min_val; - *outputVectorPtr++ = (int16_t)rintf(aux); - } + for (i = 0; i < avx_iters; i++) { + inputVal1 = _mm256_load_ps((float*)inputVectorPtr); + inputVectorPtr += 8; + inputVal2 = _mm256_load_ps((float*)inputVectorPtr); + inputVectorPtr += 8; + __VOLK_PREFETCH(inputVectorPtr + 16); + + // Clip + ret1 = _mm256_max_ps(_mm256_min_ps(inputVal1, vmax_val), vmin_val); + ret2 = _mm256_max_ps(_mm256_min_ps(inputVal2, vmax_val), vmin_val); + + intInputVal1 = _mm256_cvtps_epi32(ret1); + intInputVal2 = _mm256_cvtps_epi32(ret2); + + intInputVal1 = _mm256_packs_epi32(intInputVal1, intInputVal2); + intInputVal1 = _mm256_permute4x64_epi64(intInputVal1, 0xd8); + + _mm256_store_si256((__m256i*)outputVectorPtr, intInputVal1); + outputVectorPtr += 16; + } + + for (i = avx_iters * 16; i < num_points * 2; i++) { + aux = *inputVectorPtr++; + if (aux > max_val) + aux = max_val; + else if (aux < min_val) + aux = min_val; + *outputVectorPtr++ = (int16_t)rintf(aux); + } } #endif /* LV_HAVE_AVX2 */ #ifdef LV_HAVE_SSE2 #include -static inline void volk_32fc_convert_16ic_a_sse2(lv_16sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points) +static inline void volk_32fc_convert_16ic_a_sse2(lv_16sc_t* outputVector, + const lv_32fc_t* inputVector, + unsigned int num_points) { const unsigned int sse_iters = num_points / 4; @@ -126,34 +128,34 @@ static inline void volk_32fc_convert_16ic_a_sse2(lv_16sc_t* outputVector, const const __m128 vmax_val = _mm_set_ps1(max_val); unsigned int i; - for(i = 0; i < sse_iters; i++) - { - inputVal1 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4; - inputVal2 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4; - __VOLK_PREFETCH(inputVectorPtr + 8); - - // Clip - ret1 = _mm_max_ps(_mm_min_ps(inputVal1, vmax_val), vmin_val); - ret2 = _mm_max_ps(_mm_min_ps(inputVal2, vmax_val), vmin_val); - - intInputVal1 = _mm_cvtps_epi32(ret1); - intInputVal2 = _mm_cvtps_epi32(ret2); - - intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); - - 
_mm_store_si128((__m128i*)outputVectorPtr, intInputVal1); - outputVectorPtr += 8; - } - - for(i = sse_iters * 8; i < num_points * 2; i++) - { - aux = *inputVectorPtr++; - if(aux > max_val) - aux = max_val; - else if(aux < min_val) - aux = min_val; - *outputVectorPtr++ = (int16_t)rintf(aux); - } + for (i = 0; i < sse_iters; i++) { + inputVal1 = _mm_load_ps((float*)inputVectorPtr); + inputVectorPtr += 4; + inputVal2 = _mm_load_ps((float*)inputVectorPtr); + inputVectorPtr += 4; + __VOLK_PREFETCH(inputVectorPtr + 8); + + // Clip + ret1 = _mm_max_ps(_mm_min_ps(inputVal1, vmax_val), vmin_val); + ret2 = _mm_max_ps(_mm_min_ps(inputVal2, vmax_val), vmin_val); + + intInputVal1 = _mm_cvtps_epi32(ret1); + intInputVal2 = _mm_cvtps_epi32(ret2); + + intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); + + _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1); + outputVectorPtr += 8; + } + + for (i = sse_iters * 8; i < num_points * 2; i++) { + aux = *inputVectorPtr++; + if (aux > max_val) + aux = max_val; + else if (aux < min_val) + aux = min_val; + *outputVectorPtr++ = (int16_t)rintf(aux); + } } #endif /* LV_HAVE_SSE2 */ @@ -161,13 +163,24 @@ static inline void volk_32fc_convert_16ic_a_sse2(lv_16sc_t* outputVector, const #if LV_HAVE_NEONV7 #include -#define VCVTRQ_S32_F32(res,val) \ - __VOLK_ASM ("VCVTR.S32.F32 %[r0], %[v0]\n\t" : [r0]"=w"(res[0]) : [v0]"w"(val[0]) : ); \ - __VOLK_ASM ("VCVTR.S32.F32 %[r1], %[v1]\n\t" : [r1]"=w"(res[1]) : [v1]"w"(val[1]) : ); \ - __VOLK_ASM ("VCVTR.S32.F32 %[r2], %[v2]\n\t" : [r2]"=w"(res[2]) : [v2]"w"(val[2]) : ); \ - __VOLK_ASM ("VCVTR.S32.F32 %[r3], %[v3]\n\t" : [r3]"=w"(res[3]) : [v3]"w"(val[3]) : ); - -static inline void volk_32fc_convert_16ic_neon(lv_16sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points) +#define VCVTRQ_S32_F32(res, val) \ + __VOLK_ASM("VCVTR.S32.F32 %[r0], %[v0]\n\t" \ + : [r0] "=w"(res[0]) \ + : [v0] "w"(val[0]) \ + :); \ + __VOLK_ASM("VCVTR.S32.F32 %[r1], %[v1]\n\t" \ + : [r1] "=w"(res[1]) \ + : [v1] "w"(val[1]) \ + :); \ + __VOLK_ASM("VCVTR.S32.F32 %[r2], %[v2]\n\t" \ + : [r2] "=w"(res[2]) \ + : [v2] "w"(val[2]) \ + :); \ + __VOLK_ASM("VCVTR.S32.F32 %[r3], %[v3]\n\t" : [r3] "=w"(res[3]) : [v3] "w"(val[3]) :); + +static inline void volk_32fc_convert_16ic_neon(lv_16sc_t* outputVector, + const lv_32fc_t* inputVector, + unsigned int num_points) { const unsigned int neon_iters = num_points / 4; @@ -184,43 +197,41 @@ static inline void volk_32fc_convert_16ic_neon(lv_16sc_t* outputVector, const lv const float32x4_t max_val = vmovq_n_f32(max_val_f); float32x4_t ret1, ret2, a, b; - int32x4_t toint_a={0,0,0,0}; - int32x4_t toint_b={0,0,0,0}; + int32x4_t toint_a = { 0, 0, 0, 0 }; + int32x4_t toint_b = { 0, 0, 0, 0 }; int16x4_t intInputVal1, intInputVal2; int16x8_t res; - for(i = 0; i < neon_iters; i++) - { - a = vld1q_f32((const float32_t*)(inputVectorPtr)); - inputVectorPtr += 4; - b = vld1q_f32((const float32_t*)(inputVectorPtr)); - inputVectorPtr += 4; - __VOLK_PREFETCH(inputVectorPtr + 8); - - ret1 = vmaxq_f32(vminq_f32(a, max_val), min_val); - ret2 = vmaxq_f32(vminq_f32(b, max_val), min_val); - - // vcvtr takes into account the current rounding mode (as does rintf) - VCVTRQ_S32_F32(toint_a, ret1); - VCVTRQ_S32_F32(toint_b, ret2); - - intInputVal1 = vqmovn_s32(toint_a); - intInputVal2 = vqmovn_s32(toint_b); - - res = vcombine_s16(intInputVal1, intInputVal2); - vst1q_s16((int16_t*)outputVectorPtr, res); - outputVectorPtr += 8; - } - - for(i = neon_iters * 8; i < num_points * 2; i++) - { - aux = *inputVectorPtr++; - 
if(aux > max_val_f) - aux = max_val_f; - else if(aux < min_val_f) - aux = min_val_f; - *outputVectorPtr++ = (int16_t)rintf(aux); - } + for (i = 0; i < neon_iters; i++) { + a = vld1q_f32((const float32_t*)(inputVectorPtr)); + inputVectorPtr += 4; + b = vld1q_f32((const float32_t*)(inputVectorPtr)); + inputVectorPtr += 4; + __VOLK_PREFETCH(inputVectorPtr + 8); + + ret1 = vmaxq_f32(vminq_f32(a, max_val), min_val); + ret2 = vmaxq_f32(vminq_f32(b, max_val), min_val); + + // vcvtr takes into account the current rounding mode (as does rintf) + VCVTRQ_S32_F32(toint_a, ret1); + VCVTRQ_S32_F32(toint_b, ret2); + + intInputVal1 = vqmovn_s32(toint_a); + intInputVal2 = vqmovn_s32(toint_b); + + res = vcombine_s16(intInputVal1, intInputVal2); + vst1q_s16((int16_t*)outputVectorPtr, res); + outputVectorPtr += 8; + } + + for (i = neon_iters * 8; i < num_points * 2; i++) { + aux = *inputVectorPtr++; + if (aux > max_val_f) + aux = max_val_f; + else if (aux < min_val_f) + aux = min_val_f; + *outputVectorPtr++ = (int16_t)rintf(aux); + } } #undef VCVTRQ_S32_F32 @@ -229,7 +240,9 @@ static inline void volk_32fc_convert_16ic_neon(lv_16sc_t* outputVector, const lv #if LV_HAVE_NEONV8 #include -static inline void volk_32fc_convert_16ic_neonv8(lv_16sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points) +static inline void volk_32fc_convert_16ic_neonv8(lv_16sc_t* outputVector, + const lv_32fc_t* inputVector, + unsigned int num_points) { const unsigned int neon_iters = num_points / 4; @@ -245,50 +258,49 @@ static inline void volk_32fc_convert_16ic_neonv8(lv_16sc_t* outputVector, const const float32x4_t max_val = vmovq_n_f32(max_val_f); float32x4_t ret1, ret2, a, b; - int32x4_t toint_a={0,0,0,0}, toint_b={0,0,0,0}; + int32x4_t toint_a = { 0, 0, 0, 0 }, toint_b = { 0, 0, 0, 0 }; int16x4_t intInputVal1, intInputVal2; int16x8_t res; - for(i = 0; i < neon_iters; i++) - { - a = vld1q_f32((const float32_t*)(inputVectorPtr)); - inputVectorPtr += 4; - b = vld1q_f32((const float32_t*)(inputVectorPtr)); - inputVectorPtr += 4; - __VOLK_PREFETCH(inputVectorPtr + 8); - - ret1 = vmaxq_f32(vminq_f32(a, max_val), min_val); - ret2 = vmaxq_f32(vminq_f32(b, max_val), min_val); - - // vrndiq takes into account the current rounding mode (as does rintf) - toint_a = vcvtq_s32_f32(vrndiq_f32(ret1)); - toint_b = vcvtq_s32_f32(vrndiq_f32(ret2)); - - intInputVal1 = vqmovn_s32(toint_a); - intInputVal2 = vqmovn_s32(toint_b); - - res = vcombine_s16(intInputVal1, intInputVal2); - vst1q_s16((int16_t*)outputVectorPtr, res); - outputVectorPtr += 8; - } - - for(i = neon_iters * 8; i < num_points * 2; i++) - { - aux = *inputVectorPtr++; - if(aux > max_val_f) - aux = max_val_f; - else if(aux < min_val_f) - aux = min_val_f; - *outputVectorPtr++ = (int16_t)rintf(aux); - } + for (i = 0; i < neon_iters; i++) { + a = vld1q_f32((const float32_t*)(inputVectorPtr)); + inputVectorPtr += 4; + b = vld1q_f32((const float32_t*)(inputVectorPtr)); + inputVectorPtr += 4; + __VOLK_PREFETCH(inputVectorPtr + 8); + + ret1 = vmaxq_f32(vminq_f32(a, max_val), min_val); + ret2 = vmaxq_f32(vminq_f32(b, max_val), min_val); + + // vrndiq takes into account the current rounding mode (as does rintf) + toint_a = vcvtq_s32_f32(vrndiq_f32(ret1)); + toint_b = vcvtq_s32_f32(vrndiq_f32(ret2)); + + intInputVal1 = vqmovn_s32(toint_a); + intInputVal2 = vqmovn_s32(toint_b); + + res = vcombine_s16(intInputVal1, intInputVal2); + vst1q_s16((int16_t*)outputVectorPtr, res); + outputVectorPtr += 8; + } + + for (i = neon_iters * 8; i < num_points * 2; i++) { + aux = *inputVectorPtr++; 
+ if (aux > max_val_f) + aux = max_val_f; + else if (aux < min_val_f) + aux = min_val_f; + *outputVectorPtr++ = (int16_t)rintf(aux); + } } #endif /* LV_HAVE_NEONV8 */ - #ifdef LV_HAVE_GENERIC -static inline void volk_32fc_convert_16ic_generic(lv_16sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points) +static inline void volk_32fc_convert_16ic_generic(lv_16sc_t* outputVector, + const lv_32fc_t* inputVector, + unsigned int num_points) { float* inputVectorPtr = (float*)inputVector; int16_t* outputVectorPtr = (int16_t*)outputVector; @@ -296,15 +308,14 @@ static inline void volk_32fc_convert_16ic_generic(lv_16sc_t* outputVector, const const float max_val = (float)SHRT_MAX; float aux; unsigned int i; - for(i = 0; i < num_points * 2; i++) - { - aux = *inputVectorPtr++; - if(aux > max_val) - aux = max_val; - else if(aux < min_val) - aux = min_val; - *outputVectorPtr++ = (int16_t)rintf(aux); - } + for (i = 0; i < num_points * 2; i++) { + aux = *inputVectorPtr++; + if (aux > max_val) + aux = max_val; + else if (aux < min_val) + aux = min_val; + *outputVectorPtr++ = (int16_t)rintf(aux); + } } #endif /* LV_HAVE_GENERIC */ @@ -313,15 +324,17 @@ static inline void volk_32fc_convert_16ic_generic(lv_16sc_t* outputVector, const #ifndef INCLUDED_volk_32fc_convert_16ic_u_H #define INCLUDED_volk_32fc_convert_16ic_u_H +#include "volk/volk_complex.h" #include #include -#include "volk/volk_complex.h" #ifdef LV_HAVE_AVX2 #include -static inline void volk_32fc_convert_16ic_u_avx2(lv_16sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points) +static inline void volk_32fc_convert_16ic_u_avx2(lv_16sc_t* outputVector, + const lv_32fc_t* inputVector, + unsigned int num_points) { const unsigned int avx_iters = num_points / 8; @@ -339,37 +352,35 @@ static inline void volk_32fc_convert_16ic_u_avx2(lv_16sc_t* outputVector, const const __m256 vmax_val = _mm256_set1_ps(max_val); unsigned int i; - for(i = 0; i < avx_iters; i++) - { - inputVal1 = _mm256_loadu_ps((float*)inputVectorPtr); - inputVectorPtr += 8; - inputVal2 = _mm256_loadu_ps((float*)inputVectorPtr); - inputVectorPtr += 8; - __VOLK_PREFETCH(inputVectorPtr + 16); - - // Clip - ret1 = _mm256_max_ps(_mm256_min_ps(inputVal1, vmax_val), vmin_val); - ret2 = _mm256_max_ps(_mm256_min_ps(inputVal2, vmax_val), vmin_val); - - intInputVal1 = _mm256_cvtps_epi32(ret1); - intInputVal2 = _mm256_cvtps_epi32(ret2); - - intInputVal1 = _mm256_packs_epi32(intInputVal1, intInputVal2); - intInputVal1 = _mm256_permute4x64_epi64(intInputVal1, 0xd8); - - _mm256_storeu_si256((__m256i*)outputVectorPtr, intInputVal1); - outputVectorPtr += 16; - } - - for(i = avx_iters * 16; i < num_points * 2; i++) - { - aux = *inputVectorPtr++; - if(aux > max_val) - aux = max_val; - else if(aux < min_val) - aux = min_val; - *outputVectorPtr++ = (int16_t)rintf(aux); - } + for (i = 0; i < avx_iters; i++) { + inputVal1 = _mm256_loadu_ps((float*)inputVectorPtr); + inputVectorPtr += 8; + inputVal2 = _mm256_loadu_ps((float*)inputVectorPtr); + inputVectorPtr += 8; + __VOLK_PREFETCH(inputVectorPtr + 16); + + // Clip + ret1 = _mm256_max_ps(_mm256_min_ps(inputVal1, vmax_val), vmin_val); + ret2 = _mm256_max_ps(_mm256_min_ps(inputVal2, vmax_val), vmin_val); + + intInputVal1 = _mm256_cvtps_epi32(ret1); + intInputVal2 = _mm256_cvtps_epi32(ret2); + + intInputVal1 = _mm256_packs_epi32(intInputVal1, intInputVal2); + intInputVal1 = _mm256_permute4x64_epi64(intInputVal1, 0xd8); + + _mm256_storeu_si256((__m256i*)outputVectorPtr, intInputVal1); + outputVectorPtr += 16; + } + + for (i = 
avx_iters * 16; i < num_points * 2; i++) { + aux = *inputVectorPtr++; + if (aux > max_val) + aux = max_val; + else if (aux < min_val) + aux = min_val; + *outputVectorPtr++ = (int16_t)rintf(aux); + } } #endif /* LV_HAVE_AVX2 */ @@ -377,7 +388,9 @@ static inline void volk_32fc_convert_16ic_u_avx2(lv_16sc_t* outputVector, const #ifdef LV_HAVE_SSE2 #include -static inline void volk_32fc_convert_16ic_u_sse2(lv_16sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points) +static inline void volk_32fc_convert_16ic_u_sse2(lv_16sc_t* outputVector, + const lv_32fc_t* inputVector, + unsigned int num_points) { const unsigned int sse_iters = num_points / 4; @@ -395,36 +408,34 @@ static inline void volk_32fc_convert_16ic_u_sse2(lv_16sc_t* outputVector, const const __m128 vmax_val = _mm_set_ps1(max_val); unsigned int i; - for(i = 0; i < sse_iters; i++) - { - inputVal1 = _mm_loadu_ps((float*)inputVectorPtr); - inputVectorPtr += 4; - inputVal2 = _mm_loadu_ps((float*)inputVectorPtr); - inputVectorPtr += 4; - __VOLK_PREFETCH(inputVectorPtr + 8); - - // Clip - ret1 = _mm_max_ps(_mm_min_ps(inputVal1, vmax_val), vmin_val); - ret2 = _mm_max_ps(_mm_min_ps(inputVal2, vmax_val), vmin_val); - - intInputVal1 = _mm_cvtps_epi32(ret1); - intInputVal2 = _mm_cvtps_epi32(ret2); - - intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); - - _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1); - outputVectorPtr += 8; - } - - for(i = sse_iters * 8; i < num_points * 2; i++) - { - aux = *inputVectorPtr++; - if(aux > max_val) - aux = max_val; - else if(aux < min_val) - aux = min_val; - *outputVectorPtr++ = (int16_t)rintf(aux); - } + for (i = 0; i < sse_iters; i++) { + inputVal1 = _mm_loadu_ps((float*)inputVectorPtr); + inputVectorPtr += 4; + inputVal2 = _mm_loadu_ps((float*)inputVectorPtr); + inputVectorPtr += 4; + __VOLK_PREFETCH(inputVectorPtr + 8); + + // Clip + ret1 = _mm_max_ps(_mm_min_ps(inputVal1, vmax_val), vmin_val); + ret2 = _mm_max_ps(_mm_min_ps(inputVal2, vmax_val), vmin_val); + + intInputVal1 = _mm_cvtps_epi32(ret1); + intInputVal2 = _mm_cvtps_epi32(ret2); + + intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); + + _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1); + outputVectorPtr += 8; + } + + for (i = sse_iters * 8; i < num_points * 2; i++) { + aux = *inputVectorPtr++; + if (aux > max_val) + aux = max_val; + else if (aux < min_val) + aux = min_val; + *outputVectorPtr++ = (int16_t)rintf(aux); + } } #endif /* LV_HAVE_SSE2 */ #endif /* INCLUDED_volk_32fc_convert_16ic_u_H */ diff --git a/kernels/volk/volk_32fc_deinterleave_32f_x2.h b/kernels/volk/volk_32fc_deinterleave_32f_x2.h index 40cd664..1a06c48 100644 --- a/kernels/volk/volk_32fc_deinterleave_32f_x2.h +++ b/kernels/volk/volk_32fc_deinterleave_32f_x2.h @@ -30,8 +30,8 @@ * * Dispatcher Prototype * \code - * void volk_32fc_deinterleave_32f_x2(float* iBuffer, float* qBuffer, const lv_32fc_t* complexVector, unsigned int num_points) - * \endcode + * void volk_32fc_deinterleave_32f_x2(float* iBuffer, float* qBuffer, const lv_32fc_t* + * complexVector, unsigned int num_points) \endcode * * \b Inputs * \li complexVector: The complex input vector. 
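A short calling sketch for the volk_32fc_deinterleave_32f_x2 dispatcher described above (illustrative only; num_points and the test data are assumptions, the VOLK allocation calls are existing API):

#include <volk/volk.h>

void split_iq_example(void)
{
    const unsigned int num_points = 256;
    lv_32fc_t* cplx =
        (lv_32fc_t*)volk_malloc(num_points * sizeof(lv_32fc_t), volk_get_alignment());
    float* i_part = (float*)volk_malloc(num_points * sizeof(float), volk_get_alignment());
    float* q_part = (float*)volk_malloc(num_points * sizeof(float), volk_get_alignment());

    for (unsigned int n = 0; n < num_points; n++) {
        cplx[n] = lv_cmake((float)n, -(float)n); // arbitrary test data
    }

    // Real parts go to i_part, imaginary parts to q_part.
    volk_32fc_deinterleave_32f_x2(i_part, q_part, cplx, num_points);

    volk_free(cplx);
    volk_free(i_part);
    volk_free(q_part);
}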
@@ -78,86 +78,88 @@ #ifdef LV_HAVE_AVX #include -static inline void -volk_32fc_deinterleave_32f_x2_a_avx(float* iBuffer, float* qBuffer, const lv_32fc_t* complexVector, - unsigned int num_points) +static inline void volk_32fc_deinterleave_32f_x2_a_avx(float* iBuffer, + float* qBuffer, + const lv_32fc_t* complexVector, + unsigned int num_points) { - const float* complexVectorPtr = (float*)complexVector; - float* iBufferPtr = iBuffer; - float* qBufferPtr = qBuffer; - - unsigned int number = 0; - // Mask for real and imaginary parts - const unsigned int eighthPoints = num_points / 8; - __m256 cplxValue1, cplxValue2, complex1, complex2, iValue, qValue; - for(;number < eighthPoints; number++){ - cplxValue1 = _mm256_load_ps(complexVectorPtr); - complexVectorPtr += 8; - - cplxValue2 = _mm256_load_ps(complexVectorPtr); - complexVectorPtr += 8; - - complex1 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x20); - complex2 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x31); - - // Arrange in i1i2i3i4 format - iValue = _mm256_shuffle_ps(complex1, complex2, 0x88); - // Arrange in q1q2q3q4 format - qValue = _mm256_shuffle_ps(complex1, complex2, 0xdd); - - _mm256_store_ps(iBufferPtr, iValue); - _mm256_store_ps(qBufferPtr, qValue); - - iBufferPtr += 8; - qBufferPtr += 8; - } - - number = eighthPoints * 8; - for(; number < num_points; number++){ - *iBufferPtr++ = *complexVectorPtr++; - *qBufferPtr++ = *complexVectorPtr++; - } + const float* complexVectorPtr = (float*)complexVector; + float* iBufferPtr = iBuffer; + float* qBufferPtr = qBuffer; + + unsigned int number = 0; + // Mask for real and imaginary parts + const unsigned int eighthPoints = num_points / 8; + __m256 cplxValue1, cplxValue2, complex1, complex2, iValue, qValue; + for (; number < eighthPoints; number++) { + cplxValue1 = _mm256_load_ps(complexVectorPtr); + complexVectorPtr += 8; + + cplxValue2 = _mm256_load_ps(complexVectorPtr); + complexVectorPtr += 8; + + complex1 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x20); + complex2 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x31); + + // Arrange in i1i2i3i4 format + iValue = _mm256_shuffle_ps(complex1, complex2, 0x88); + // Arrange in q1q2q3q4 format + qValue = _mm256_shuffle_ps(complex1, complex2, 0xdd); + + _mm256_store_ps(iBufferPtr, iValue); + _mm256_store_ps(qBufferPtr, qValue); + + iBufferPtr += 8; + qBufferPtr += 8; + } + + number = eighthPoints * 8; + for (; number < num_points; number++) { + *iBufferPtr++ = *complexVectorPtr++; + *qBufferPtr++ = *complexVectorPtr++; + } } #endif /* LV_HAVE_AVX */ #ifdef LV_HAVE_SSE #include -static inline void -volk_32fc_deinterleave_32f_x2_a_sse(float* iBuffer, float* qBuffer, const lv_32fc_t* complexVector, - unsigned int num_points) +static inline void volk_32fc_deinterleave_32f_x2_a_sse(float* iBuffer, + float* qBuffer, + const lv_32fc_t* complexVector, + unsigned int num_points) { - const float* complexVectorPtr = (float*)complexVector; - float* iBufferPtr = iBuffer; - float* qBufferPtr = qBuffer; - - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - __m128 cplxValue1, cplxValue2, iValue, qValue; - for(;number < quarterPoints; number++){ - cplxValue1 = _mm_load_ps(complexVectorPtr); - complexVectorPtr += 4; - - cplxValue2 = _mm_load_ps(complexVectorPtr); - complexVectorPtr += 4; - - // Arrange in i1i2i3i4 format - iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); - // Arrange in q1q2q3q4 format - qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1)); - - 
_mm_store_ps(iBufferPtr, iValue); - _mm_store_ps(qBufferPtr, qValue); - - iBufferPtr += 4; - qBufferPtr += 4; - } - - number = quarterPoints * 4; - for(; number < num_points; number++){ - *iBufferPtr++ = *complexVectorPtr++; - *qBufferPtr++ = *complexVectorPtr++; - } + const float* complexVectorPtr = (float*)complexVector; + float* iBufferPtr = iBuffer; + float* qBufferPtr = qBuffer; + + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + __m128 cplxValue1, cplxValue2, iValue, qValue; + for (; number < quarterPoints; number++) { + cplxValue1 = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; + + cplxValue2 = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; + + // Arrange in i1i2i3i4 format + iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2, 0, 2, 0)); + // Arrange in q1q2q3q4 format + qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3, 1, 3, 1)); + + _mm_store_ps(iBufferPtr, iValue); + _mm_store_ps(qBufferPtr, qValue); + + iBufferPtr += 4; + qBufferPtr += 4; + } + + number = quarterPoints * 4; + for (; number < num_points; number++) { + *iBufferPtr++ = *complexVectorPtr++; + *qBufferPtr++ = *complexVectorPtr++; + } } #endif /* LV_HAVE_SSE */ @@ -165,48 +167,50 @@ volk_32fc_deinterleave_32f_x2_a_sse(float* iBuffer, float* qBuffer, const lv_32f #ifdef LV_HAVE_NEON #include -static inline void -volk_32fc_deinterleave_32f_x2_neon(float* iBuffer, float* qBuffer, const lv_32fc_t* complexVector, - unsigned int num_points) +static inline void volk_32fc_deinterleave_32f_x2_neon(float* iBuffer, + float* qBuffer, + const lv_32fc_t* complexVector, + unsigned int num_points) { - unsigned int number = 0; - unsigned int quarter_points = num_points / 4; - const float* complexVectorPtr = (float*)complexVector; - float* iBufferPtr = iBuffer; - float* qBufferPtr = qBuffer; - float32x4x2_t complexInput; - - for(number = 0; number < quarter_points; number++){ - complexInput = vld2q_f32(complexVectorPtr); - vst1q_f32( iBufferPtr, complexInput.val[0] ); - vst1q_f32( qBufferPtr, complexInput.val[1] ); - complexVectorPtr += 8; - iBufferPtr += 4; - qBufferPtr += 4; - } - - for(number = quarter_points*4; number < num_points; number++){ - *iBufferPtr++ = *complexVectorPtr++; - *qBufferPtr++ = *complexVectorPtr++; - } + unsigned int number = 0; + unsigned int quarter_points = num_points / 4; + const float* complexVectorPtr = (float*)complexVector; + float* iBufferPtr = iBuffer; + float* qBufferPtr = qBuffer; + float32x4x2_t complexInput; + + for (number = 0; number < quarter_points; number++) { + complexInput = vld2q_f32(complexVectorPtr); + vst1q_f32(iBufferPtr, complexInput.val[0]); + vst1q_f32(qBufferPtr, complexInput.val[1]); + complexVectorPtr += 8; + iBufferPtr += 4; + qBufferPtr += 4; + } + + for (number = quarter_points * 4; number < num_points; number++) { + *iBufferPtr++ = *complexVectorPtr++; + *qBufferPtr++ = *complexVectorPtr++; + } } #endif /* LV_HAVE_NEON */ #ifdef LV_HAVE_GENERIC -static inline void -volk_32fc_deinterleave_32f_x2_generic(float* iBuffer, float* qBuffer, const lv_32fc_t* complexVector, - unsigned int num_points) +static inline void volk_32fc_deinterleave_32f_x2_generic(float* iBuffer, + float* qBuffer, + const lv_32fc_t* complexVector, + unsigned int num_points) { - const float* complexVectorPtr = (float*)complexVector; - float* iBufferPtr = iBuffer; - float* qBufferPtr = qBuffer; - unsigned int number; - for(number = 0; number < num_points; number++){ - *iBufferPtr++ = *complexVectorPtr++; - *qBufferPtr++ = 
*complexVectorPtr++; - } + const float* complexVectorPtr = (float*)complexVector; + float* iBufferPtr = iBuffer; + float* qBufferPtr = qBuffer; + unsigned int number; + for (number = 0; number < num_points; number++) { + *iBufferPtr++ = *complexVectorPtr++; + *qBufferPtr++ = *complexVectorPtr++; + } } #endif /* LV_HAVE_GENERIC */ @@ -221,45 +225,46 @@ volk_32fc_deinterleave_32f_x2_generic(float* iBuffer, float* qBuffer, const lv_3 #ifdef LV_HAVE_AVX #include -static inline void -volk_32fc_deinterleave_32f_x2_u_avx(float* iBuffer, float* qBuffer, const lv_32fc_t* complexVector, - unsigned int num_points) +static inline void volk_32fc_deinterleave_32f_x2_u_avx(float* iBuffer, + float* qBuffer, + const lv_32fc_t* complexVector, + unsigned int num_points) { - const float* complexVectorPtr = (float*)complexVector; - float* iBufferPtr = iBuffer; - float* qBufferPtr = qBuffer; - - unsigned int number = 0; - // Mask for real and imaginary parts - const unsigned int eighthPoints = num_points / 8; - __m256 cplxValue1, cplxValue2, complex1, complex2, iValue, qValue; - for(;number < eighthPoints; number++){ - cplxValue1 = _mm256_loadu_ps(complexVectorPtr); - complexVectorPtr += 8; - - cplxValue2 = _mm256_loadu_ps(complexVectorPtr); - complexVectorPtr += 8; - - complex1 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x20); - complex2 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x31); - - // Arrange in i1i2i3i4 format - iValue = _mm256_shuffle_ps(complex1, complex2, 0x88); - // Arrange in q1q2q3q4 format - qValue = _mm256_shuffle_ps(complex1, complex2, 0xdd); - - _mm256_storeu_ps(iBufferPtr, iValue); - _mm256_storeu_ps(qBufferPtr, qValue); - - iBufferPtr += 8; - qBufferPtr += 8; - } - - number = eighthPoints * 8; - for(; number < num_points; number++){ - *iBufferPtr++ = *complexVectorPtr++; - *qBufferPtr++ = *complexVectorPtr++; - } + const float* complexVectorPtr = (float*)complexVector; + float* iBufferPtr = iBuffer; + float* qBufferPtr = qBuffer; + + unsigned int number = 0; + // Mask for real and imaginary parts + const unsigned int eighthPoints = num_points / 8; + __m256 cplxValue1, cplxValue2, complex1, complex2, iValue, qValue; + for (; number < eighthPoints; number++) { + cplxValue1 = _mm256_loadu_ps(complexVectorPtr); + complexVectorPtr += 8; + + cplxValue2 = _mm256_loadu_ps(complexVectorPtr); + complexVectorPtr += 8; + + complex1 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x20); + complex2 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x31); + + // Arrange in i1i2i3i4 format + iValue = _mm256_shuffle_ps(complex1, complex2, 0x88); + // Arrange in q1q2q3q4 format + qValue = _mm256_shuffle_ps(complex1, complex2, 0xdd); + + _mm256_storeu_ps(iBufferPtr, iValue); + _mm256_storeu_ps(qBufferPtr, qValue); + + iBufferPtr += 8; + qBufferPtr += 8; + } + + number = eighthPoints * 8; + for (; number < num_points; number++) { + *iBufferPtr++ = *complexVectorPtr++; + *qBufferPtr++ = *complexVectorPtr++; + } } #endif /* LV_HAVE_AVX */ #endif /* INCLUDED_volk_32fc_deinterleave_32f_x2_u_H */ diff --git a/kernels/volk/volk_32fc_deinterleave_64f_x2.h b/kernels/volk/volk_32fc_deinterleave_64f_x2.h index 3e799cb..3b69c3c 100644 --- a/kernels/volk/volk_32fc_deinterleave_64f_x2.h +++ b/kernels/volk/volk_32fc_deinterleave_64f_x2.h @@ -79,110 +79,113 @@ #ifdef LV_HAVE_AVX #include -static inline void -volk_32fc_deinterleave_64f_x2_u_avx(double *iBuffer, double *qBuffer, - const lv_32fc_t *complexVector, - unsigned int num_points) { - unsigned int number = 0; - - const float *complexVectorPtr = (float 
*)complexVector; - double *iBufferPtr = iBuffer; - double *qBufferPtr = qBuffer; - - const unsigned int quarterPoints = num_points / 4; - __m256 cplxValue; - __m128 complexH, complexL, fVal; - __m256d dVal; - - for (; number < quarterPoints; number++) { - - cplxValue = _mm256_loadu_ps(complexVectorPtr); - complexVectorPtr += 8; - - complexH = _mm256_extractf128_ps(cplxValue, 1); - complexL = _mm256_extractf128_ps(cplxValue, 0); - - // Arrange in i1i2i1i2 format - fVal = _mm_shuffle_ps(complexL, complexH, _MM_SHUFFLE(2, 0, 2, 0)); - dVal = _mm256_cvtps_pd(fVal); - _mm256_storeu_pd(iBufferPtr, dVal); - - // Arrange in q1q2q1q2 format - fVal = _mm_shuffle_ps(complexL, complexH, _MM_SHUFFLE(3, 1, 3, 1)); - dVal = _mm256_cvtps_pd(fVal); - _mm256_storeu_pd(qBufferPtr, dVal); - - iBufferPtr += 4; - qBufferPtr += 4; - } - - number = quarterPoints * 4; - for (; number < num_points; number++) { - *iBufferPtr++ = *complexVectorPtr++; - *qBufferPtr++ = *complexVectorPtr++; - } +static inline void volk_32fc_deinterleave_64f_x2_u_avx(double* iBuffer, + double* qBuffer, + const lv_32fc_t* complexVector, + unsigned int num_points) +{ + unsigned int number = 0; + + const float* complexVectorPtr = (float*)complexVector; + double* iBufferPtr = iBuffer; + double* qBufferPtr = qBuffer; + + const unsigned int quarterPoints = num_points / 4; + __m256 cplxValue; + __m128 complexH, complexL, fVal; + __m256d dVal; + + for (; number < quarterPoints; number++) { + + cplxValue = _mm256_loadu_ps(complexVectorPtr); + complexVectorPtr += 8; + + complexH = _mm256_extractf128_ps(cplxValue, 1); + complexL = _mm256_extractf128_ps(cplxValue, 0); + + // Arrange in i1i2i1i2 format + fVal = _mm_shuffle_ps(complexL, complexH, _MM_SHUFFLE(2, 0, 2, 0)); + dVal = _mm256_cvtps_pd(fVal); + _mm256_storeu_pd(iBufferPtr, dVal); + + // Arrange in q1q2q1q2 format + fVal = _mm_shuffle_ps(complexL, complexH, _MM_SHUFFLE(3, 1, 3, 1)); + dVal = _mm256_cvtps_pd(fVal); + _mm256_storeu_pd(qBufferPtr, dVal); + + iBufferPtr += 4; + qBufferPtr += 4; + } + + number = quarterPoints * 4; + for (; number < num_points; number++) { + *iBufferPtr++ = *complexVectorPtr++; + *qBufferPtr++ = *complexVectorPtr++; + } } #endif /* LV_HAVE_AVX */ #ifdef LV_HAVE_SSE2 #include -static inline void -volk_32fc_deinterleave_64f_x2_u_sse2(double *iBuffer, double *qBuffer, - const lv_32fc_t *complexVector, - unsigned int num_points) { - unsigned int number = 0; - - const float *complexVectorPtr = (float *)complexVector; - double *iBufferPtr = iBuffer; - double *qBufferPtr = qBuffer; - - const unsigned int halfPoints = num_points / 2; - __m128 cplxValue, fVal; - __m128d dVal; - - for (; number < halfPoints; number++) { - - cplxValue = _mm_loadu_ps(complexVectorPtr); - complexVectorPtr += 4; - - // Arrange in i1i2i1i2 format - fVal = _mm_shuffle_ps(cplxValue, cplxValue, _MM_SHUFFLE(2, 0, 2, 0)); - dVal = _mm_cvtps_pd(fVal); - _mm_storeu_pd(iBufferPtr, dVal); - - // Arrange in q1q2q1q2 format - fVal = _mm_shuffle_ps(cplxValue, cplxValue, _MM_SHUFFLE(3, 1, 3, 1)); - dVal = _mm_cvtps_pd(fVal); - _mm_storeu_pd(qBufferPtr, dVal); - - iBufferPtr += 2; - qBufferPtr += 2; - } - - number = halfPoints * 2; - for (; number < num_points; number++) { - *iBufferPtr++ = *complexVectorPtr++; - *qBufferPtr++ = *complexVectorPtr++; - } +static inline void volk_32fc_deinterleave_64f_x2_u_sse2(double* iBuffer, + double* qBuffer, + const lv_32fc_t* complexVector, + unsigned int num_points) +{ + unsigned int number = 0; + + const float* complexVectorPtr = (float*)complexVector; + double* 
iBufferPtr = iBuffer; + double* qBufferPtr = qBuffer; + + const unsigned int halfPoints = num_points / 2; + __m128 cplxValue, fVal; + __m128d dVal; + + for (; number < halfPoints; number++) { + + cplxValue = _mm_loadu_ps(complexVectorPtr); + complexVectorPtr += 4; + + // Arrange in i1i2i1i2 format + fVal = _mm_shuffle_ps(cplxValue, cplxValue, _MM_SHUFFLE(2, 0, 2, 0)); + dVal = _mm_cvtps_pd(fVal); + _mm_storeu_pd(iBufferPtr, dVal); + + // Arrange in q1q2q1q2 format + fVal = _mm_shuffle_ps(cplxValue, cplxValue, _MM_SHUFFLE(3, 1, 3, 1)); + dVal = _mm_cvtps_pd(fVal); + _mm_storeu_pd(qBufferPtr, dVal); + + iBufferPtr += 2; + qBufferPtr += 2; + } + + number = halfPoints * 2; + for (; number < num_points; number++) { + *iBufferPtr++ = *complexVectorPtr++; + *qBufferPtr++ = *complexVectorPtr++; + } } #endif /* LV_HAVE_SSE */ #ifdef LV_HAVE_GENERIC -static inline void -volk_32fc_deinterleave_64f_x2_generic(double *iBuffer, double *qBuffer, - const lv_32fc_t *complexVector, - unsigned int num_points) { - unsigned int number = 0; - const float *complexVectorPtr = (float *)complexVector; - double *iBufferPtr = iBuffer; - double *qBufferPtr = qBuffer; - - for (number = 0; number < num_points; number++) { - *iBufferPtr++ = (double)*complexVectorPtr++; - *qBufferPtr++ = (double)*complexVectorPtr++; - } +static inline void volk_32fc_deinterleave_64f_x2_generic(double* iBuffer, + double* qBuffer, + const lv_32fc_t* complexVector, + unsigned int num_points) +{ + unsigned int number = 0; + const float* complexVectorPtr = (float*)complexVector; + double* iBufferPtr = iBuffer; + double* qBufferPtr = qBuffer; + + for (number = 0; number < num_points; number++) { + *iBufferPtr++ = (double)*complexVectorPtr++; + *qBufferPtr++ = (double)*complexVectorPtr++; + } } #endif /* LV_HAVE_GENERIC */ @@ -196,146 +199,150 @@ volk_32fc_deinterleave_64f_x2_generic(double *iBuffer, double *qBuffer, #ifdef LV_HAVE_AVX #include -static inline void -volk_32fc_deinterleave_64f_x2_a_avx(double *iBuffer, double *qBuffer, - const lv_32fc_t *complexVector, - unsigned int num_points) { - unsigned int number = 0; - - const float *complexVectorPtr = (float *)complexVector; - double *iBufferPtr = iBuffer; - double *qBufferPtr = qBuffer; - - const unsigned int quarterPoints = num_points / 4; - __m256 cplxValue; - __m128 complexH, complexL, fVal; - __m256d dVal; - - for (; number < quarterPoints; number++) { - - cplxValue = _mm256_load_ps(complexVectorPtr); - complexVectorPtr += 8; - - complexH = _mm256_extractf128_ps(cplxValue, 1); - complexL = _mm256_extractf128_ps(cplxValue, 0); - - // Arrange in i1i2i1i2 format - fVal = _mm_shuffle_ps(complexL, complexH, _MM_SHUFFLE(2, 0, 2, 0)); - dVal = _mm256_cvtps_pd(fVal); - _mm256_store_pd(iBufferPtr, dVal); - - // Arrange in q1q2q1q2 format - fVal = _mm_shuffle_ps(complexL, complexH, _MM_SHUFFLE(3, 1, 3, 1)); - dVal = _mm256_cvtps_pd(fVal); - _mm256_store_pd(qBufferPtr, dVal); - - iBufferPtr += 4; - qBufferPtr += 4; - } - - number = quarterPoints * 4; - for (; number < num_points; number++) { - *iBufferPtr++ = *complexVectorPtr++; - *qBufferPtr++ = *complexVectorPtr++; - } +static inline void volk_32fc_deinterleave_64f_x2_a_avx(double* iBuffer, + double* qBuffer, + const lv_32fc_t* complexVector, + unsigned int num_points) +{ + unsigned int number = 0; + + const float* complexVectorPtr = (float*)complexVector; + double* iBufferPtr = iBuffer; + double* qBufferPtr = qBuffer; + + const unsigned int quarterPoints = num_points / 4; + __m256 cplxValue; + __m128 complexH, complexL, fVal; + __m256d dVal; 
+ + for (; number < quarterPoints; number++) { + + cplxValue = _mm256_load_ps(complexVectorPtr); + complexVectorPtr += 8; + + complexH = _mm256_extractf128_ps(cplxValue, 1); + complexL = _mm256_extractf128_ps(cplxValue, 0); + + // Arrange in i1i2i1i2 format + fVal = _mm_shuffle_ps(complexL, complexH, _MM_SHUFFLE(2, 0, 2, 0)); + dVal = _mm256_cvtps_pd(fVal); + _mm256_store_pd(iBufferPtr, dVal); + + // Arrange in q1q2q1q2 format + fVal = _mm_shuffle_ps(complexL, complexH, _MM_SHUFFLE(3, 1, 3, 1)); + dVal = _mm256_cvtps_pd(fVal); + _mm256_store_pd(qBufferPtr, dVal); + + iBufferPtr += 4; + qBufferPtr += 4; + } + + number = quarterPoints * 4; + for (; number < num_points; number++) { + *iBufferPtr++ = *complexVectorPtr++; + *qBufferPtr++ = *complexVectorPtr++; + } } #endif /* LV_HAVE_AVX */ #ifdef LV_HAVE_SSE2 #include -static inline void -volk_32fc_deinterleave_64f_x2_a_sse2(double *iBuffer, double *qBuffer, - const lv_32fc_t *complexVector, - unsigned int num_points) { - unsigned int number = 0; - - const float *complexVectorPtr = (float *)complexVector; - double *iBufferPtr = iBuffer; - double *qBufferPtr = qBuffer; - - const unsigned int halfPoints = num_points / 2; - __m128 cplxValue, fVal; - __m128d dVal; - - for (; number < halfPoints; number++) { - - cplxValue = _mm_load_ps(complexVectorPtr); - complexVectorPtr += 4; - - // Arrange in i1i2i1i2 format - fVal = _mm_shuffle_ps(cplxValue, cplxValue, _MM_SHUFFLE(2, 0, 2, 0)); - dVal = _mm_cvtps_pd(fVal); - _mm_store_pd(iBufferPtr, dVal); - - // Arrange in q1q2q1q2 format - fVal = _mm_shuffle_ps(cplxValue, cplxValue, _MM_SHUFFLE(3, 1, 3, 1)); - dVal = _mm_cvtps_pd(fVal); - _mm_store_pd(qBufferPtr, dVal); - - iBufferPtr += 2; - qBufferPtr += 2; - } - - number = halfPoints * 2; - for (; number < num_points; number++) { - *iBufferPtr++ = *complexVectorPtr++; - *qBufferPtr++ = *complexVectorPtr++; - } +static inline void volk_32fc_deinterleave_64f_x2_a_sse2(double* iBuffer, + double* qBuffer, + const lv_32fc_t* complexVector, + unsigned int num_points) +{ + unsigned int number = 0; + + const float* complexVectorPtr = (float*)complexVector; + double* iBufferPtr = iBuffer; + double* qBufferPtr = qBuffer; + + const unsigned int halfPoints = num_points / 2; + __m128 cplxValue, fVal; + __m128d dVal; + + for (; number < halfPoints; number++) { + + cplxValue = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; + + // Arrange in i1i2i1i2 format + fVal = _mm_shuffle_ps(cplxValue, cplxValue, _MM_SHUFFLE(2, 0, 2, 0)); + dVal = _mm_cvtps_pd(fVal); + _mm_store_pd(iBufferPtr, dVal); + + // Arrange in q1q2q1q2 format + fVal = _mm_shuffle_ps(cplxValue, cplxValue, _MM_SHUFFLE(3, 1, 3, 1)); + dVal = _mm_cvtps_pd(fVal); + _mm_store_pd(qBufferPtr, dVal); + + iBufferPtr += 2; + qBufferPtr += 2; + } + + number = halfPoints * 2; + for (; number < num_points; number++) { + *iBufferPtr++ = *complexVectorPtr++; + *qBufferPtr++ = *complexVectorPtr++; + } } #endif /* LV_HAVE_SSE */ #ifdef LV_HAVE_GENERIC -static inline void -volk_32fc_deinterleave_64f_x2_a_generic(double *iBuffer, double *qBuffer, - const lv_32fc_t *complexVector, - unsigned int num_points) { - unsigned int number = 0; - const float *complexVectorPtr = (float *)complexVector; - double *iBufferPtr = iBuffer; - double *qBufferPtr = qBuffer; - - for (number = 0; number < num_points; number++) { - *iBufferPtr++ = (double)*complexVectorPtr++; - *qBufferPtr++ = (double)*complexVectorPtr++; - } +static inline void volk_32fc_deinterleave_64f_x2_a_generic(double* iBuffer, + double* qBuffer, + const lv_32fc_t* 
complexVector, + unsigned int num_points) +{ + unsigned int number = 0; + const float* complexVectorPtr = (float*)complexVector; + double* iBufferPtr = iBuffer; + double* qBufferPtr = qBuffer; + + for (number = 0; number < num_points; number++) { + *iBufferPtr++ = (double)*complexVectorPtr++; + *qBufferPtr++ = (double)*complexVectorPtr++; + } } #endif /* LV_HAVE_GENERIC */ #ifdef LV_HAVE_NEONV8 #include -static inline void -volk_32fc_deinterleave_64f_x2_neon(double *iBuffer, double *qBuffer, - const lv_32fc_t *complexVector, - unsigned int num_points) { - unsigned int number = 0; - unsigned int half_points = num_points / 2; - const float *complexVectorPtr = (float *)complexVector; - double *iBufferPtr = iBuffer; - double *qBufferPtr = qBuffer; - float32x2x2_t complexInput; - float64x2_t iVal, qVal; - - for (number = 0; number < half_points; number++) { - complexInput = vld2_f32(complexVectorPtr); - - iVal = vcvt_f64_f32(complexInput.val[0]); - qVal = vcvt_f64_f32(complexInput.val[1]); - - vst1q_f64(iBufferPtr, iVal); - vst1q_f64(qBufferPtr, qVal); - - complexVectorPtr += 4; - iBufferPtr += 2; - qBufferPtr += 2; - } - - for (number = half_points * 2; number < num_points; number++) { - *iBufferPtr++ = (double)*complexVectorPtr++; - *qBufferPtr++ = (double)*complexVectorPtr++; - } +static inline void volk_32fc_deinterleave_64f_x2_neon(double* iBuffer, + double* qBuffer, + const lv_32fc_t* complexVector, + unsigned int num_points) +{ + unsigned int number = 0; + unsigned int half_points = num_points / 2; + const float* complexVectorPtr = (float*)complexVector; + double* iBufferPtr = iBuffer; + double* qBufferPtr = qBuffer; + float32x2x2_t complexInput; + float64x2_t iVal, qVal; + + for (number = 0; number < half_points; number++) { + complexInput = vld2_f32(complexVectorPtr); + + iVal = vcvt_f64_f32(complexInput.val[0]); + qVal = vcvt_f64_f32(complexInput.val[1]); + + vst1q_f64(iBufferPtr, iVal); + vst1q_f64(qBufferPtr, qVal); + + complexVectorPtr += 4; + iBufferPtr += 2; + qBufferPtr += 2; + } + + for (number = half_points * 2; number < num_points; number++) { + *iBufferPtr++ = (double)*complexVectorPtr++; + *qBufferPtr++ = (double)*complexVectorPtr++; + } } #endif /* LV_HAVE_NEONV8 */ diff --git a/kernels/volk/volk_32fc_deinterleave_imag_32f.h b/kernels/volk/volk_32fc_deinterleave_imag_32f.h index 13f9764..e3dfa12 100644 --- a/kernels/volk/volk_32fc_deinterleave_imag_32f.h +++ b/kernels/volk/volk_32fc_deinterleave_imag_32f.h @@ -30,8 +30,8 @@ * * Dispatcher Prototype * \code - * void volk_32fc_deinterleave_image_32f(float* qBuffer, const lv_32fc_t* complexVector, unsigned int num_points) - * \endcode + * void volk_32fc_deinterleave_image_32f(float* qBuffer, const lv_32fc_t* complexVector, + * unsigned int num_points) \endcode * * \b Inputs * \li complexVector: The complex input vector. 
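A minimal calling sketch, assuming the dispatcher name volk_32fc_deinterleave_imag_32f that matches the kernels changed below (num_points and the input data are arbitrary assumptions):

#include <volk/volk.h>

void extract_q_example(void)
{
    const unsigned int num_points = 128;
    lv_32fc_t* cplx =
        (lv_32fc_t*)volk_malloc(num_points * sizeof(lv_32fc_t), volk_get_alignment());
    float* q_part = (float*)volk_malloc(num_points * sizeof(float), volk_get_alignment());

    for (unsigned int n = 0; n < num_points; n++) {
        cplx[n] = lv_cmake(1.0f, (float)n); // arbitrary test data
    }

    // Keeps only the imaginary (Q) component of each complex sample.
    volk_32fc_deinterleave_imag_32f(q_part, cplx, num_points);

    volk_free(cplx);
    volk_free(q_part);
}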
@@ -76,121 +76,121 @@ #ifdef LV_HAVE_AVX #include -static inline void -volk_32fc_deinterleave_imag_32f_a_avx(float* qBuffer, const lv_32fc_t* complexVector, - unsigned int num_points) +static inline void volk_32fc_deinterleave_imag_32f_a_avx(float* qBuffer, + const lv_32fc_t* complexVector, + unsigned int num_points) { - unsigned int number = 0; - const unsigned int eighthPoints = num_points / 8; - const float* complexVectorPtr = (const float*)complexVector; - float* qBufferPtr = qBuffer; + unsigned int number = 0; + const unsigned int eighthPoints = num_points / 8; + const float* complexVectorPtr = (const float*)complexVector; + float* qBufferPtr = qBuffer; - __m256 cplxValue1, cplxValue2, complex1, complex2, qValue; - for(;number < eighthPoints; number++){ + __m256 cplxValue1, cplxValue2, complex1, complex2, qValue; + for (; number < eighthPoints; number++) { - cplxValue1 = _mm256_load_ps(complexVectorPtr); - complexVectorPtr += 8; + cplxValue1 = _mm256_load_ps(complexVectorPtr); + complexVectorPtr += 8; - cplxValue2 = _mm256_load_ps(complexVectorPtr); - complexVectorPtr += 8; + cplxValue2 = _mm256_load_ps(complexVectorPtr); + complexVectorPtr += 8; - complex1 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x20); - complex2 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x31); + complex1 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x20); + complex2 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x31); - // Arrange in q1q2q3q4 format - qValue = _mm256_shuffle_ps(complex1, complex2, 0xdd); + // Arrange in q1q2q3q4 format + qValue = _mm256_shuffle_ps(complex1, complex2, 0xdd); - _mm256_store_ps(qBufferPtr, qValue); + _mm256_store_ps(qBufferPtr, qValue); - qBufferPtr += 8; - } + qBufferPtr += 8; + } - number = eighthPoints * 8; - for(; number < num_points; number++){ - complexVectorPtr++; - *qBufferPtr++ = *complexVectorPtr++; - } + number = eighthPoints * 8; + for (; number < num_points; number++) { + complexVectorPtr++; + *qBufferPtr++ = *complexVectorPtr++; + } } #endif /* LV_HAVE_AVX */ #ifdef LV_HAVE_SSE #include -static inline void -volk_32fc_deinterleave_imag_32f_a_sse(float* qBuffer, const lv_32fc_t* complexVector, - unsigned int num_points) +static inline void volk_32fc_deinterleave_imag_32f_a_sse(float* qBuffer, + const lv_32fc_t* complexVector, + unsigned int num_points) { - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; - const float* complexVectorPtr = (const float*)complexVector; - float* qBufferPtr = qBuffer; + const float* complexVectorPtr = (const float*)complexVector; + float* qBufferPtr = qBuffer; - __m128 cplxValue1, cplxValue2, iValue; - for(;number < quarterPoints; number++){ + __m128 cplxValue1, cplxValue2, iValue; + for (; number < quarterPoints; number++) { - cplxValue1 = _mm_load_ps(complexVectorPtr); - complexVectorPtr += 4; + cplxValue1 = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; - cplxValue2 = _mm_load_ps(complexVectorPtr); - complexVectorPtr += 4; + cplxValue2 = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; - // Arrange in q1q2q3q4 format - iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1)); + // Arrange in q1q2q3q4 format + iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3, 1, 3, 1)); - _mm_store_ps(qBufferPtr, iValue); + _mm_store_ps(qBufferPtr, iValue); - qBufferPtr += 4; - } + qBufferPtr += 4; + } - number = quarterPoints * 4; - for(; number < num_points; number++){ - 
complexVectorPtr++; - *qBufferPtr++ = *complexVectorPtr++; - } + number = quarterPoints * 4; + for (; number < num_points; number++) { + complexVectorPtr++; + *qBufferPtr++ = *complexVectorPtr++; + } } #endif /* LV_HAVE_SSE */ #ifdef LV_HAVE_NEON #include -static inline void -volk_32fc_deinterleave_imag_32f_neon(float* qBuffer, const lv_32fc_t* complexVector, - unsigned int num_points) +static inline void volk_32fc_deinterleave_imag_32f_neon(float* qBuffer, + const lv_32fc_t* complexVector, + unsigned int num_points) { - unsigned int number = 0; - unsigned int quarter_points = num_points / 4; - const float* complexVectorPtr = (float*)complexVector; - float* qBufferPtr = qBuffer; - float32x4x2_t complexInput; - - for(number = 0; number < quarter_points; number++){ - complexInput = vld2q_f32(complexVectorPtr); - vst1q_f32( qBufferPtr, complexInput.val[1] ); - complexVectorPtr += 8; - qBufferPtr += 4; - } - - for(number = quarter_points*4; number < num_points; number++){ - complexVectorPtr++; - *qBufferPtr++ = *complexVectorPtr++; - } + unsigned int number = 0; + unsigned int quarter_points = num_points / 4; + const float* complexVectorPtr = (float*)complexVector; + float* qBufferPtr = qBuffer; + float32x4x2_t complexInput; + + for (number = 0; number < quarter_points; number++) { + complexInput = vld2q_f32(complexVectorPtr); + vst1q_f32(qBufferPtr, complexInput.val[1]); + complexVectorPtr += 8; + qBufferPtr += 4; + } + + for (number = quarter_points * 4; number < num_points; number++) { + complexVectorPtr++; + *qBufferPtr++ = *complexVectorPtr++; + } } #endif /* LV_HAVE_NEON */ #ifdef LV_HAVE_GENERIC -static inline void -volk_32fc_deinterleave_imag_32f_generic(float* qBuffer, const lv_32fc_t* complexVector, - unsigned int num_points) +static inline void volk_32fc_deinterleave_imag_32f_generic(float* qBuffer, + const lv_32fc_t* complexVector, + unsigned int num_points) { - unsigned int number = 0; - const float* complexVectorPtr = (float*)complexVector; - float* qBufferPtr = qBuffer; - for(number = 0; number < num_points; number++){ - complexVectorPtr++; - *qBufferPtr++ = *complexVectorPtr++; - } + unsigned int number = 0; + const float* complexVectorPtr = (float*)complexVector; + float* qBufferPtr = qBuffer; + for (number = 0; number < num_points; number++) { + complexVectorPtr++; + *qBufferPtr++ = *complexVectorPtr++; + } } #endif /* LV_HAVE_GENERIC */ @@ -206,40 +206,40 @@ volk_32fc_deinterleave_imag_32f_generic(float* qBuffer, const lv_32fc_t* complex #ifdef LV_HAVE_AVX #include -static inline void -volk_32fc_deinterleave_imag_32f_u_avx(float* qBuffer, const lv_32fc_t* complexVector, - unsigned int num_points) +static inline void volk_32fc_deinterleave_imag_32f_u_avx(float* qBuffer, + const lv_32fc_t* complexVector, + unsigned int num_points) { - unsigned int number = 0; - const unsigned int eighthPoints = num_points / 8; - const float* complexVectorPtr = (const float*)complexVector; - float* qBufferPtr = qBuffer; + unsigned int number = 0; + const unsigned int eighthPoints = num_points / 8; + const float* complexVectorPtr = (const float*)complexVector; + float* qBufferPtr = qBuffer; - __m256 cplxValue1, cplxValue2, complex1, complex2, qValue; - for(;number < eighthPoints; number++){ + __m256 cplxValue1, cplxValue2, complex1, complex2, qValue; + for (; number < eighthPoints; number++) { - cplxValue1 = _mm256_loadu_ps(complexVectorPtr); - complexVectorPtr += 8; + cplxValue1 = _mm256_loadu_ps(complexVectorPtr); + complexVectorPtr += 8; - cplxValue2 = _mm256_loadu_ps(complexVectorPtr); - 
complexVectorPtr += 8; + cplxValue2 = _mm256_loadu_ps(complexVectorPtr); + complexVectorPtr += 8; - complex1 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x20); - complex2 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x31); + complex1 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x20); + complex2 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x31); - // Arrange in q1q2q3q4 format - qValue = _mm256_shuffle_ps(complex1, complex2, 0xdd); + // Arrange in q1q2q3q4 format + qValue = _mm256_shuffle_ps(complex1, complex2, 0xdd); - _mm256_storeu_ps(qBufferPtr, qValue); + _mm256_storeu_ps(qBufferPtr, qValue); - qBufferPtr += 8; - } + qBufferPtr += 8; + } - number = eighthPoints * 8; - for(; number < num_points; number++){ - complexVectorPtr++; - *qBufferPtr++ = *complexVectorPtr++; - } + number = eighthPoints * 8; + for (; number < num_points; number++) { + complexVectorPtr++; + *qBufferPtr++ = *complexVectorPtr++; + } } #endif /* LV_HAVE_AVX */ #endif /* INCLUDED_volk_32fc_deinterleave_imag_32f_u_H */ diff --git a/kernels/volk/volk_32fc_deinterleave_real_32f.h b/kernels/volk/volk_32fc_deinterleave_real_32f.h index 92a94d3..2526a16 100644 --- a/kernels/volk/volk_32fc_deinterleave_real_32f.h +++ b/kernels/volk/volk_32fc_deinterleave_real_32f.h @@ -30,8 +30,8 @@ * * Dispatcher Prototype * \code - * void volk_32fc_deinterleave_real_32f(float* iBuffer, const lv_32fc_t* complexVector, unsigned int num_points) - * \endcode + * void volk_32fc_deinterleave_real_32f(float* iBuffer, const lv_32fc_t* complexVector, + * unsigned int num_points) \endcode * * \b Inputs * \li complexVector: The complex input vector. @@ -76,96 +76,96 @@ #ifdef LV_HAVE_AVX2 #include -static inline void -volk_32fc_deinterleave_real_32f_a_avx2(float* iBuffer, const lv_32fc_t* complexVector, - unsigned int num_points) +static inline void volk_32fc_deinterleave_real_32f_a_avx2(float* iBuffer, + const lv_32fc_t* complexVector, + unsigned int num_points) { - unsigned int number = 0; - const unsigned int eighthPoints = num_points / 8; + unsigned int number = 0; + const unsigned int eighthPoints = num_points / 8; - const float* complexVectorPtr = (const float*)complexVector; - float* iBufferPtr = iBuffer; + const float* complexVectorPtr = (const float*)complexVector; + float* iBufferPtr = iBuffer; - __m256 cplxValue1, cplxValue2; - __m256 iValue; - __m256i idx = _mm256_set_epi32(7,6,3,2,5,4,1,0); - for(;number < eighthPoints; number++){ + __m256 cplxValue1, cplxValue2; + __m256 iValue; + __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0); + for (; number < eighthPoints; number++) { - cplxValue1 = _mm256_load_ps(complexVectorPtr); - complexVectorPtr += 8; + cplxValue1 = _mm256_load_ps(complexVectorPtr); + complexVectorPtr += 8; - cplxValue2 = _mm256_load_ps(complexVectorPtr); - complexVectorPtr += 8; + cplxValue2 = _mm256_load_ps(complexVectorPtr); + complexVectorPtr += 8; - // Arrange in i1i2i3i4 format - iValue = _mm256_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); - iValue = _mm256_permutevar8x32_ps(iValue,idx); + // Arrange in i1i2i3i4 format + iValue = _mm256_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2, 0, 2, 0)); + iValue = _mm256_permutevar8x32_ps(iValue, idx); - _mm256_store_ps(iBufferPtr, iValue); + _mm256_store_ps(iBufferPtr, iValue); - iBufferPtr += 8; - } + iBufferPtr += 8; + } - number = eighthPoints * 8; - for(; number < num_points; number++){ - *iBufferPtr++ = *complexVectorPtr++; - complexVectorPtr++; - } + number = eighthPoints * 8; + for (; number < num_points; number++) { + *iBufferPtr++ 
= *complexVectorPtr++; + complexVectorPtr++; + } } #endif /* LV_HAVE_AVX2 */ #ifdef LV_HAVE_SSE #include -static inline void -volk_32fc_deinterleave_real_32f_a_sse(float* iBuffer, const lv_32fc_t* complexVector, - unsigned int num_points) +static inline void volk_32fc_deinterleave_real_32f_a_sse(float* iBuffer, + const lv_32fc_t* complexVector, + unsigned int num_points) { - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; - const float* complexVectorPtr = (const float*)complexVector; - float* iBufferPtr = iBuffer; + const float* complexVectorPtr = (const float*)complexVector; + float* iBufferPtr = iBuffer; - __m128 cplxValue1, cplxValue2, iValue; - for(;number < quarterPoints; number++){ + __m128 cplxValue1, cplxValue2, iValue; + for (; number < quarterPoints; number++) { - cplxValue1 = _mm_load_ps(complexVectorPtr); - complexVectorPtr += 4; + cplxValue1 = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; - cplxValue2 = _mm_load_ps(complexVectorPtr); - complexVectorPtr += 4; + cplxValue2 = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; - // Arrange in i1i2i3i4 format - iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); + // Arrange in i1i2i3i4 format + iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2, 0, 2, 0)); - _mm_store_ps(iBufferPtr, iValue); + _mm_store_ps(iBufferPtr, iValue); - iBufferPtr += 4; - } + iBufferPtr += 4; + } - number = quarterPoints * 4; - for(; number < num_points; number++){ - *iBufferPtr++ = *complexVectorPtr++; - complexVectorPtr++; - } + number = quarterPoints * 4; + for (; number < num_points; number++) { + *iBufferPtr++ = *complexVectorPtr++; + complexVectorPtr++; + } } #endif /* LV_HAVE_SSE */ #ifdef LV_HAVE_GENERIC -static inline void -volk_32fc_deinterleave_real_32f_generic(float* iBuffer, const lv_32fc_t* complexVector, - unsigned int num_points) +static inline void volk_32fc_deinterleave_real_32f_generic(float* iBuffer, + const lv_32fc_t* complexVector, + unsigned int num_points) { - unsigned int number = 0; - const float* complexVectorPtr = (float*)complexVector; - float* iBufferPtr = iBuffer; - for(number = 0; number < num_points; number++){ - *iBufferPtr++ = *complexVectorPtr++; - complexVectorPtr++; - } + unsigned int number = 0; + const float* complexVectorPtr = (float*)complexVector; + float* iBufferPtr = iBuffer; + for (number = 0; number < num_points; number++) { + *iBufferPtr++ = *complexVectorPtr++; + complexVectorPtr++; + } } #endif /* LV_HAVE_GENERIC */ @@ -173,27 +173,27 @@ volk_32fc_deinterleave_real_32f_generic(float* iBuffer, const lv_32fc_t* complex #ifdef LV_HAVE_NEON #include -static inline void -volk_32fc_deinterleave_real_32f_neon(float* iBuffer, const lv_32fc_t* complexVector, - unsigned int num_points) +static inline void volk_32fc_deinterleave_real_32f_neon(float* iBuffer, + const lv_32fc_t* complexVector, + unsigned int num_points) { - unsigned int number = 0; - unsigned int quarter_points = num_points / 4; - const float* complexVectorPtr = (float*)complexVector; - float* iBufferPtr = iBuffer; - float32x4x2_t complexInput; - - for(number = 0; number < quarter_points; number++){ - complexInput = vld2q_f32(complexVectorPtr); - vst1q_f32( iBufferPtr, complexInput.val[0] ); - complexVectorPtr += 8; - iBufferPtr += 4; - } - - for(number = quarter_points*4; number < num_points; number++){ - *iBufferPtr++ = *complexVectorPtr++; - complexVectorPtr++; - } + unsigned int number = 0; + 
unsigned int quarter_points = num_points / 4; + const float* complexVectorPtr = (float*)complexVector; + float* iBufferPtr = iBuffer; + float32x4x2_t complexInput; + + for (number = 0; number < quarter_points; number++) { + complexInput = vld2q_f32(complexVectorPtr); + vst1q_f32(iBufferPtr, complexInput.val[0]); + complexVectorPtr += 8; + iBufferPtr += 4; + } + + for (number = quarter_points * 4; number < num_points; number++) { + *iBufferPtr++ = *complexVectorPtr++; + complexVectorPtr++; + } } #endif /* LV_HAVE_NEON */ @@ -209,41 +209,41 @@ volk_32fc_deinterleave_real_32f_neon(float* iBuffer, const lv_32fc_t* complexVec #ifdef LV_HAVE_AVX2 #include -static inline void -volk_32fc_deinterleave_real_32f_u_avx2(float* iBuffer, const lv_32fc_t* complexVector, - unsigned int num_points) +static inline void volk_32fc_deinterleave_real_32f_u_avx2(float* iBuffer, + const lv_32fc_t* complexVector, + unsigned int num_points) { - unsigned int number = 0; - const unsigned int eighthPoints = num_points / 8; + unsigned int number = 0; + const unsigned int eighthPoints = num_points / 8; - const float* complexVectorPtr = (const float*)complexVector; - float* iBufferPtr = iBuffer; + const float* complexVectorPtr = (const float*)complexVector; + float* iBufferPtr = iBuffer; - __m256 cplxValue1, cplxValue2; - __m256 iValue; - __m256i idx = _mm256_set_epi32(7,6,3,2,5,4,1,0); - for(;number < eighthPoints; number++){ + __m256 cplxValue1, cplxValue2; + __m256 iValue; + __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0); + for (; number < eighthPoints; number++) { - cplxValue1 = _mm256_loadu_ps(complexVectorPtr); - complexVectorPtr += 8; + cplxValue1 = _mm256_loadu_ps(complexVectorPtr); + complexVectorPtr += 8; - cplxValue2 = _mm256_loadu_ps(complexVectorPtr); - complexVectorPtr += 8; + cplxValue2 = _mm256_loadu_ps(complexVectorPtr); + complexVectorPtr += 8; - // Arrange in i1i2i3i4 format - iValue = _mm256_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); - iValue = _mm256_permutevar8x32_ps(iValue,idx); + // Arrange in i1i2i3i4 format + iValue = _mm256_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2, 0, 2, 0)); + iValue = _mm256_permutevar8x32_ps(iValue, idx); - _mm256_storeu_ps(iBufferPtr, iValue); + _mm256_storeu_ps(iBufferPtr, iValue); - iBufferPtr += 8; - } + iBufferPtr += 8; + } - number = eighthPoints * 8; - for(; number < num_points; number++){ - *iBufferPtr++ = *complexVectorPtr++; - complexVectorPtr++; - } + number = eighthPoints * 8; + for (; number < num_points; number++) { + *iBufferPtr++ = *complexVectorPtr++; + complexVectorPtr++; + } } #endif /* LV_HAVE_AVX2 */ diff --git a/kernels/volk/volk_32fc_deinterleave_real_64f.h b/kernels/volk/volk_32fc_deinterleave_real_64f.h index 3d6e901..9ec7769 100644 --- a/kernels/volk/volk_32fc_deinterleave_real_64f.h +++ b/kernels/volk/volk_32fc_deinterleave_real_64f.h @@ -77,124 +77,132 @@ #ifdef LV_HAVE_AVX2 #include -static inline void volk_32fc_deinterleave_real_64f_a_avx2( - double *iBuffer, const lv_32fc_t *complexVector, unsigned int num_points) { - unsigned int number = 0; - - const float *complexVectorPtr = (float *)complexVector; - double *iBufferPtr = iBuffer; - - const unsigned int quarterPoints = num_points / 4; - __m256 cplxValue; - __m128 fVal; - __m256d dVal; - __m256i idx = _mm256_set_epi32(0, 0, 0, 0, 6, 4, 2, 0); - for (; number < quarterPoints; number++) { - - cplxValue = _mm256_load_ps(complexVectorPtr); - complexVectorPtr += 8; - - // Arrange in i1i2i1i2 format - cplxValue = _mm256_permutevar8x32_ps(cplxValue, idx); - fVal = 
_mm256_extractf128_ps(cplxValue, 0); - dVal = _mm256_cvtps_pd(fVal); - _mm256_store_pd(iBufferPtr, dVal); - - iBufferPtr += 4; - } - - number = quarterPoints * 4; - for (; number < num_points; number++) { - *iBufferPtr++ = (double)*complexVectorPtr++; - complexVectorPtr++; - } +static inline void volk_32fc_deinterleave_real_64f_a_avx2(double* iBuffer, + const lv_32fc_t* complexVector, + unsigned int num_points) +{ + unsigned int number = 0; + + const float* complexVectorPtr = (float*)complexVector; + double* iBufferPtr = iBuffer; + + const unsigned int quarterPoints = num_points / 4; + __m256 cplxValue; + __m128 fVal; + __m256d dVal; + __m256i idx = _mm256_set_epi32(0, 0, 0, 0, 6, 4, 2, 0); + for (; number < quarterPoints; number++) { + + cplxValue = _mm256_load_ps(complexVectorPtr); + complexVectorPtr += 8; + + // Arrange in i1i2i1i2 format + cplxValue = _mm256_permutevar8x32_ps(cplxValue, idx); + fVal = _mm256_extractf128_ps(cplxValue, 0); + dVal = _mm256_cvtps_pd(fVal); + _mm256_store_pd(iBufferPtr, dVal); + + iBufferPtr += 4; + } + + number = quarterPoints * 4; + for (; number < num_points; number++) { + *iBufferPtr++ = (double)*complexVectorPtr++; + complexVectorPtr++; + } } #endif /* LV_HAVE_AVX2 */ #ifdef LV_HAVE_SSE2 #include -static inline void volk_32fc_deinterleave_real_64f_a_sse2( - double *iBuffer, const lv_32fc_t *complexVector, unsigned int num_points) { - unsigned int number = 0; +static inline void volk_32fc_deinterleave_real_64f_a_sse2(double* iBuffer, + const lv_32fc_t* complexVector, + unsigned int num_points) +{ + unsigned int number = 0; - const float *complexVectorPtr = (float *)complexVector; - double *iBufferPtr = iBuffer; + const float* complexVectorPtr = (float*)complexVector; + double* iBufferPtr = iBuffer; - const unsigned int halfPoints = num_points / 2; - __m128 cplxValue, fVal; - __m128d dVal; - for (; number < halfPoints; number++) { + const unsigned int halfPoints = num_points / 2; + __m128 cplxValue, fVal; + __m128d dVal; + for (; number < halfPoints; number++) { - cplxValue = _mm_load_ps(complexVectorPtr); - complexVectorPtr += 4; + cplxValue = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; - // Arrange in i1i2i1i2 format - fVal = _mm_shuffle_ps(cplxValue, cplxValue, _MM_SHUFFLE(2, 0, 2, 0)); - dVal = _mm_cvtps_pd(fVal); - _mm_store_pd(iBufferPtr, dVal); + // Arrange in i1i2i1i2 format + fVal = _mm_shuffle_ps(cplxValue, cplxValue, _MM_SHUFFLE(2, 0, 2, 0)); + dVal = _mm_cvtps_pd(fVal); + _mm_store_pd(iBufferPtr, dVal); - iBufferPtr += 2; - } + iBufferPtr += 2; + } - number = halfPoints * 2; - for (; number < num_points; number++) { - *iBufferPtr++ = (double)*complexVectorPtr++; - complexVectorPtr++; - } + number = halfPoints * 2; + for (; number < num_points; number++) { + *iBufferPtr++ = (double)*complexVectorPtr++; + complexVectorPtr++; + } } #endif /* LV_HAVE_SSE */ #ifdef LV_HAVE_GENERIC -static inline void volk_32fc_deinterleave_real_64f_generic( - double *iBuffer, const lv_32fc_t *complexVector, unsigned int num_points) { - unsigned int number = 0; - const float *complexVectorPtr = (float *)complexVector; - double *iBufferPtr = iBuffer; - for (number = 0; number < num_points; number++) { - *iBufferPtr++ = (double)*complexVectorPtr++; - complexVectorPtr++; - } +static inline void volk_32fc_deinterleave_real_64f_generic(double* iBuffer, + const lv_32fc_t* complexVector, + unsigned int num_points) +{ + unsigned int number = 0; + const float* complexVectorPtr = (float*)complexVector; + double* iBufferPtr = iBuffer; + for (number = 0; number < 
num_points; number++) { + *iBufferPtr++ = (double)*complexVectorPtr++; + complexVectorPtr++; + } } #endif /* LV_HAVE_GENERIC */ #ifdef LV_HAVE_NEONV8 #include -static inline void volk_32fc_deinterleave_real_64f_neon( - double *iBuffer, const lv_32fc_t *complexVector, unsigned int num_points) { - unsigned int number = 0; - unsigned int quarter_points = num_points / 4; - const float *complexVectorPtr = (float *)complexVector; - double *iBufferPtr = iBuffer; - float32x2x4_t complexInput; - float64x2_t iVal1; - float64x2_t iVal2; - float64x2x2_t iVal; - - for (number = 0; number < quarter_points; number++) { - // Load data into register - complexInput = vld4_f32(complexVectorPtr); - - // Perform single to double precision conversion - iVal1 = vcvt_f64_f32(complexInput.val[0]); - iVal2 = vcvt_f64_f32(complexInput.val[2]); - iVal.val[0] = iVal1; - iVal.val[1] = iVal2; - - // Store results into memory buffer - vst2q_f64(iBufferPtr, iVal); - - // Update pointers - iBufferPtr += 4; - complexVectorPtr += 8; - } - - for (number = quarter_points * 4; number < num_points; number++) { - *iBufferPtr++ = (double)*complexVectorPtr++; - complexVectorPtr++; - } +static inline void volk_32fc_deinterleave_real_64f_neon(double* iBuffer, + const lv_32fc_t* complexVector, + unsigned int num_points) +{ + unsigned int number = 0; + unsigned int quarter_points = num_points / 4; + const float* complexVectorPtr = (float*)complexVector; + double* iBufferPtr = iBuffer; + float32x2x4_t complexInput; + float64x2_t iVal1; + float64x2_t iVal2; + float64x2x2_t iVal; + + for (number = 0; number < quarter_points; number++) { + // Load data into register + complexInput = vld4_f32(complexVectorPtr); + + // Perform single to double precision conversion + iVal1 = vcvt_f64_f32(complexInput.val[0]); + iVal2 = vcvt_f64_f32(complexInput.val[2]); + iVal.val[0] = iVal1; + iVal.val[1] = iVal2; + + // Store results into memory buffer + vst2q_f64(iBufferPtr, iVal); + + // Update pointers + iBufferPtr += 4; + complexVectorPtr += 8; + } + + for (number = quarter_points * 4; number < num_points; number++) { + *iBufferPtr++ = (double)*complexVectorPtr++; + complexVectorPtr++; + } } #endif /* LV_HAVE_NEON */ @@ -209,37 +217,39 @@ static inline void volk_32fc_deinterleave_real_64f_neon( #ifdef LV_HAVE_AVX2 #include -static inline void volk_32fc_deinterleave_real_64f_u_avx2( - double *iBuffer, const lv_32fc_t *complexVector, unsigned int num_points) { - unsigned int number = 0; - - const float *complexVectorPtr = (float *)complexVector; - double *iBufferPtr = iBuffer; - - const unsigned int quarterPoints = num_points / 4; - __m256 cplxValue; - __m128 fVal; - __m256d dVal; - __m256i idx = _mm256_set_epi32(0, 0, 0, 0, 6, 4, 2, 0); - for (; number < quarterPoints; number++) { - - cplxValue = _mm256_loadu_ps(complexVectorPtr); - complexVectorPtr += 8; - - // Arrange in i1i2i1i2 format - cplxValue = _mm256_permutevar8x32_ps(cplxValue, idx); - fVal = _mm256_extractf128_ps(cplxValue, 0); - dVal = _mm256_cvtps_pd(fVal); - _mm256_storeu_pd(iBufferPtr, dVal); - - iBufferPtr += 4; - } - - number = quarterPoints * 4; - for (; number < num_points; number++) { - *iBufferPtr++ = (double)*complexVectorPtr++; - complexVectorPtr++; - } +static inline void volk_32fc_deinterleave_real_64f_u_avx2(double* iBuffer, + const lv_32fc_t* complexVector, + unsigned int num_points) +{ + unsigned int number = 0; + + const float* complexVectorPtr = (float*)complexVector; + double* iBufferPtr = iBuffer; + + const unsigned int quarterPoints = num_points / 4; + __m256 cplxValue; + 
__m128 fVal; + __m256d dVal; + __m256i idx = _mm256_set_epi32(0, 0, 0, 0, 6, 4, 2, 0); + for (; number < quarterPoints; number++) { + + cplxValue = _mm256_loadu_ps(complexVectorPtr); + complexVectorPtr += 8; + + // Arrange in i1i2i1i2 format + cplxValue = _mm256_permutevar8x32_ps(cplxValue, idx); + fVal = _mm256_extractf128_ps(cplxValue, 0); + dVal = _mm256_cvtps_pd(fVal); + _mm256_storeu_pd(iBufferPtr, dVal); + + iBufferPtr += 4; + } + + number = quarterPoints * 4; + for (; number < num_points; number++) { + *iBufferPtr++ = (double)*complexVectorPtr++; + complexVectorPtr++; + } } #endif /* LV_HAVE_AVX2 */ diff --git a/kernels/volk/volk_32fc_index_max_16u.h b/kernels/volk/volk_32fc_index_max_16u.h index a9f9508..b9f9cfd 100644 --- a/kernels/volk/volk_32fc_index_max_16u.h +++ b/kernels/volk/volk_32fc_index_max_16u.h @@ -76,346 +76,353 @@ #ifndef INCLUDED_volk_32fc_index_max_16u_a_H #define INCLUDED_volk_32fc_index_max_16u_a_H -#include #include -#include #include +#include +#include #include #ifdef LV_HAVE_AVX2 #include static inline void -volk_32fc_index_max_16u_a_avx2(uint16_t* target, lv_32fc_t* src0, - uint32_t num_points) +volk_32fc_index_max_16u_a_avx2(uint16_t* target, lv_32fc_t* src0, uint32_t num_points) { - num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points; - // Branchless version, if we think it'll make a difference - //num_points = USHRT_MAX ^ ((num_points ^ USHRT_MAX) & -(num_points < USHRT_MAX)); - - const uint32_t num_bytes = num_points*8; - - union bit256 holderf; - union bit256 holderi; - float sq_dist = 0.0; + num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points; + // Branchless version, if we think it'll make a difference + // num_points = USHRT_MAX ^ ((num_points ^ USHRT_MAX) & -(num_points < USHRT_MAX)); - union bit256 xmm5, xmm4; - __m256 xmm1, xmm2, xmm3; - __m256i xmm8, xmm11, xmm12, xmmfive, xmmfour, xmm9, holder0, holder1, xmm10; + const uint32_t num_bytes = num_points * 8; - xmm5.int_vec = xmmfive = _mm256_setzero_si256(); - xmm4.int_vec = xmmfour = _mm256_setzero_si256(); - holderf.int_vec = holder0 = _mm256_setzero_si256(); - holderi.int_vec = holder1 = _mm256_setzero_si256(); + union bit256 holderf; + union bit256 holderi; + float sq_dist = 0.0; - int bound = num_bytes >> 6; - int i = 0; + union bit256 xmm5, xmm4; + __m256 xmm1, xmm2, xmm3; + __m256i xmm8, xmm11, xmm12, xmmfive, xmmfour, xmm9, holder0, holder1, xmm10; - xmm8 = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0); - xmm9 = _mm256_setzero_si256(); //=xmm8 - xmm10 = _mm256_set1_epi32(8); - xmm3 = _mm256_setzero_ps(); + xmm5.int_vec = xmmfive = _mm256_setzero_si256(); + xmm4.int_vec = xmmfour = _mm256_setzero_si256(); + holderf.int_vec = holder0 = _mm256_setzero_si256(); + holderi.int_vec = holder1 = _mm256_setzero_si256(); - __m256i idx = _mm256_set_epi32(7,6,3,2,5,4,1,0); - for(; i < bound; ++i) { - xmm1 = _mm256_load_ps((float*)src0); - xmm2 = _mm256_load_ps((float*)&src0[4]); + int bound = num_bytes >> 6; + int i = 0; - src0 += 8; + xmm8 = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0); + xmm9 = _mm256_setzero_si256(); //=xmm8 + xmm10 = _mm256_set1_epi32(8); + xmm3 = _mm256_setzero_ps(); - xmm1 = _mm256_mul_ps(xmm1, xmm1); - xmm2 = _mm256_mul_ps(xmm2, xmm2); + __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0); + for (; i < bound; ++i) { + xmm1 = _mm256_load_ps((float*)src0); + xmm2 = _mm256_load_ps((float*)&src0[4]); - xmm1 = _mm256_hadd_ps(xmm1, xmm2); - xmm1 = _mm256_permutevar8x32_ps(xmm1, idx); + src0 += 8; - xmm3 = _mm256_max_ps(xmm1, xmm3); + xmm1 = _mm256_mul_ps(xmm1, xmm1); + 
xmm2 = _mm256_mul_ps(xmm2, xmm2); - xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS); - xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ); + xmm1 = _mm256_hadd_ps(xmm1, xmm2); + xmm1 = _mm256_permutevar8x32_ps(xmm1, idx); - xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec); - xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec); + xmm3 = _mm256_max_ps(xmm1, xmm3); - xmm9 = _mm256_add_epi32(xmm11, xmm12); + xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS); + xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ); - xmm8 = _mm256_add_epi32(xmm8, xmm10); - } - xmm10 = _mm256_set1_epi32(4); - if (num_bytes >> 5 & 1) { - xmm1 = _mm256_load_ps((float*)src0); + xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec); + xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec); - src0 += 4; + xmm9 = _mm256_add_epi32(xmm11, xmm12); - xmm1 = _mm256_mul_ps(xmm1, xmm1); + xmm8 = _mm256_add_epi32(xmm8, xmm10); + } + xmm10 = _mm256_set1_epi32(4); + if (num_bytes >> 5 & 1) { + xmm1 = _mm256_load_ps((float*)src0); - xmm1 = _mm256_hadd_ps(xmm1, xmm1); - xmm1 = _mm256_permutevar8x32_ps(xmm1, idx); + src0 += 4; - xmm3 = _mm256_max_ps(xmm1, xmm3); + xmm1 = _mm256_mul_ps(xmm1, xmm1); - xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS); - xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ); + xmm1 = _mm256_hadd_ps(xmm1, xmm1); + xmm1 = _mm256_permutevar8x32_ps(xmm1, idx); - xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec); - xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec); + xmm3 = _mm256_max_ps(xmm1, xmm3); - xmm9 = _mm256_add_epi32(xmm11, xmm12); + xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS); + xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ); - xmm8 = _mm256_add_epi32(xmm8, xmm10); - } + xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec); + xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec); - idx = _mm256_set_epi32(1,0,1,0,1,0,1,0); - xmm10 = _mm256_set1_epi32(2); - if (num_bytes >> 4 & 1) { - xmm2 = _mm256_load_ps((float*)src0); + xmm9 = _mm256_add_epi32(xmm11, xmm12); - xmm1 = _mm256_permutevar8x32_ps(bit256_p(&xmm8)->float_vec, idx); - xmm8 = bit256_p(&xmm1)->int_vec; + xmm8 = _mm256_add_epi32(xmm8, xmm10); + } - xmm2 = _mm256_mul_ps(xmm2, xmm2); + idx = _mm256_set_epi32(1, 0, 1, 0, 1, 0, 1, 0); + xmm10 = _mm256_set1_epi32(2); + if (num_bytes >> 4 & 1) { + xmm2 = _mm256_load_ps((float*)src0); - src0 += 2; + xmm1 = _mm256_permutevar8x32_ps(bit256_p(&xmm8)->float_vec, idx); + xmm8 = bit256_p(&xmm1)->int_vec; - xmm1 = _mm256_hadd_ps(xmm2, xmm2); + xmm2 = _mm256_mul_ps(xmm2, xmm2); - xmm3 = _mm256_max_ps(xmm1, xmm3); + src0 += 2; - xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS); - xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ); - - xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec); - xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec); + xmm1 = _mm256_hadd_ps(xmm2, xmm2); - xmm9 = _mm256_add_epi32(xmm11, xmm12); + xmm3 = _mm256_max_ps(xmm1, xmm3); - xmm8 = _mm256_add_epi32(xmm8, xmm10); - } + xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS); + xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ); - /* - idx = _mm256_setzero_si256(); - for(i = 0; i < leftovers2; ++i) { - //printf("%u, %u, %u, %u\n", ((uint32_t*)&xmm9)[0], ((uint32_t*)&xmm9)[1], ((uint32_t*)&xmm9)[2], ((uint32_t*)&xmm9)[3]); + xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec); + xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec); - sq_dist = lv_creal(src0[0]) * lv_creal(src0[0]) + lv_cimag(src0[0]) * lv_cimag(src0[0]); + xmm9 = _mm256_add_epi32(xmm11, xmm12); - //xmm = _mm_load1_ps(&sq_dist);//insert? 
- xmm2 = _mm256_set1_ps(sq_dist); - //xmm2 = _mm256_insertf128_ps(xmm2, xmm, 0); + xmm8 = _mm256_add_epi32(xmm8, xmm10); + } - xmm1 = xmm3; + /* + idx = _mm256_setzero_si256(); + for(i = 0; i < leftovers2; ++i) { + //printf("%u, %u, %u, %u\n", ((uint32_t*)&xmm9)[0], ((uint32_t*)&xmm9)[1], + ((uint32_t*)&xmm9)[2], ((uint32_t*)&xmm9)[3]); - xmm3 = _mm256_max_ps(xmm3, xmm2);//only lowest 32bit value - xmm3 = _mm256_permutevar8x32_ps(xmm3, idx); + sq_dist = lv_creal(src0[0]) * lv_creal(src0[0]) + lv_cimag(src0[0]) * + lv_cimag(src0[0]); - xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS); - xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ); + //xmm = _mm_load1_ps(&sq_dist);//insert? + xmm2 = _mm256_set1_ps(sq_dist); + //xmm2 = _mm256_insertf128_ps(xmm2, xmm, 0); - xmm8 = _mm256_permutevar8x32_epi32(xmm8, idx); + xmm1 = xmm3; - xmm11 = _mm256_and_si256(xmm8, xmm4.int_vec); - xmm12 = _mm256_and_si256(xmm9, xmm5.int_vec); + xmm3 = _mm256_max_ps(xmm3, xmm2);//only lowest 32bit value + xmm3 = _mm256_permutevar8x32_ps(xmm3, idx); - xmm9 = _mm256_add_epi32(xmm11, xmm12); -}*/ + xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS); + xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ); - _mm256_store_ps((float*)&(holderf.f), xmm3); - _mm256_store_si256(&(holderi.int_vec), xmm9); + xmm8 = _mm256_permutevar8x32_epi32(xmm8, idx); - target[0] = holderi.i[0]; - sq_dist = holderf.f[0]; - target[0] = (holderf.f[1] > sq_dist) ? holderi.i[1] : target[0]; - sq_dist = (holderf.f[1] > sq_dist) ? holderf.f[1] : sq_dist; - target[0] = (holderf.f[2] > sq_dist) ? holderi.i[2] : target[0]; - sq_dist = (holderf.f[2] > sq_dist) ? holderf.f[2] : sq_dist; - target[0] = (holderf.f[3] > sq_dist) ? holderi.i[3] : target[0]; - sq_dist = (holderf.f[3] > sq_dist) ? holderf.f[3] : sq_dist; - target[0] = (holderf.f[4] > sq_dist) ? holderi.i[4] : target[0]; - sq_dist = (holderf.f[4] > sq_dist) ? holderf.f[4] : sq_dist; - target[0] = (holderf.f[5] > sq_dist) ? holderi.i[5] : target[0]; - sq_dist = (holderf.f[5] > sq_dist) ? holderf.f[5] : sq_dist; - target[0] = (holderf.f[6] > sq_dist) ? holderi.i[6] : target[0]; - sq_dist = (holderf.f[6] > sq_dist) ? holderf.f[6] : sq_dist; - target[0] = (holderf.f[7] > sq_dist) ? holderi.i[7] : target[0]; - sq_dist = (holderf.f[7] > sq_dist) ? holderf.f[7] : sq_dist; + xmm11 = _mm256_and_si256(xmm8, xmm4.int_vec); + xmm12 = _mm256_and_si256(xmm9, xmm5.int_vec); + xmm9 = _mm256_add_epi32(xmm11, xmm12); + }*/ + + _mm256_store_ps((float*)&(holderf.f), xmm3); + _mm256_store_si256(&(holderi.int_vec), xmm9); + + target[0] = holderi.i[0]; + sq_dist = holderf.f[0]; + target[0] = (holderf.f[1] > sq_dist) ? holderi.i[1] : target[0]; + sq_dist = (holderf.f[1] > sq_dist) ? holderf.f[1] : sq_dist; + target[0] = (holderf.f[2] > sq_dist) ? holderi.i[2] : target[0]; + sq_dist = (holderf.f[2] > sq_dist) ? holderf.f[2] : sq_dist; + target[0] = (holderf.f[3] > sq_dist) ? holderi.i[3] : target[0]; + sq_dist = (holderf.f[3] > sq_dist) ? holderf.f[3] : sq_dist; + target[0] = (holderf.f[4] > sq_dist) ? holderi.i[4] : target[0]; + sq_dist = (holderf.f[4] > sq_dist) ? holderf.f[4] : sq_dist; + target[0] = (holderf.f[5] > sq_dist) ? holderi.i[5] : target[0]; + sq_dist = (holderf.f[5] > sq_dist) ? holderf.f[5] : sq_dist; + target[0] = (holderf.f[6] > sq_dist) ? holderi.i[6] : target[0]; + sq_dist = (holderf.f[6] > sq_dist) ? holderf.f[6] : sq_dist; + target[0] = (holderf.f[7] > sq_dist) ? holderi.i[7] : target[0]; + sq_dist = (holderf.f[7] > sq_dist) ? 
holderf.f[7] : sq_dist; } #endif /*LV_HAVE_AVX2*/ #ifdef LV_HAVE_SSE3 -#include #include +#include static inline void -volk_32fc_index_max_16u_a_sse3(uint16_t* target, lv_32fc_t* src0, - uint32_t num_points) +volk_32fc_index_max_16u_a_sse3(uint16_t* target, lv_32fc_t* src0, uint32_t num_points) { - num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points; - // Branchless version, if we think it'll make a difference - //num_points = USHRT_MAX ^ ((num_points ^ USHRT_MAX) & -(num_points < USHRT_MAX)); + num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points; + // Branchless version, if we think it'll make a difference + // num_points = USHRT_MAX ^ ((num_points ^ USHRT_MAX) & -(num_points < USHRT_MAX)); - const uint32_t num_bytes = num_points*8; + const uint32_t num_bytes = num_points * 8; - union bit128 holderf; - union bit128 holderi; - float sq_dist = 0.0; + union bit128 holderf; + union bit128 holderi; + float sq_dist = 0.0; - union bit128 xmm5, xmm4; - __m128 xmm1, xmm2, xmm3; - __m128i xmm8, xmm11, xmm12, xmmfive, xmmfour, xmm9, holder0, holder1, xmm10; + union bit128 xmm5, xmm4; + __m128 xmm1, xmm2, xmm3; + __m128i xmm8, xmm11, xmm12, xmmfive, xmmfour, xmm9, holder0, holder1, xmm10; - xmm5.int_vec = xmmfive = _mm_setzero_si128(); - xmm4.int_vec = xmmfour = _mm_setzero_si128(); - holderf.int_vec = holder0 = _mm_setzero_si128(); - holderi.int_vec = holder1 = _mm_setzero_si128(); + xmm5.int_vec = xmmfive = _mm_setzero_si128(); + xmm4.int_vec = xmmfour = _mm_setzero_si128(); + holderf.int_vec = holder0 = _mm_setzero_si128(); + holderi.int_vec = holder1 = _mm_setzero_si128(); - int bound = num_bytes >> 5; - int i = 0; + int bound = num_bytes >> 5; + int i = 0; - xmm8 = _mm_set_epi32(3, 2, 1, 0);//remember the crazy reverse order! - xmm9 = _mm_setzero_si128(); - xmm10 = _mm_set_epi32(4, 4, 4, 4); - xmm3 = _mm_setzero_ps(); - //printf("%f, %f, %f, %f\n", ((float*)&xmm10)[0], ((float*)&xmm10)[1], ((float*)&xmm10)[2], ((float*)&xmm10)[3]); + xmm8 = _mm_set_epi32(3, 2, 1, 0); // remember the crazy reverse order! 
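    // Index bookkeeping: _mm_set_epi32() takes its arguments from the highest lane
    // down, so xmm8 starts out as the sample indices {0, 1, 2, 3} with index 0 in
    // lane 0 -- hence the "crazy reverse order".  Every iteration adds xmm10 (all
    // 4s), so xmm8 always holds the indices of the four magnitudes currently in
    // xmm1, while xmm9 holds, per lane, the index of the largest magnitude seen so
    // far.  The cmplt/cmpeq masks computed below are disjoint and blend the two:
    // lanes where the new value equals the freshly updated maximum take their index
    // from xmm8, all other lanes keep the previous index from xmm9.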
+ xmm9 = _mm_setzero_si128(); + xmm10 = _mm_set_epi32(4, 4, 4, 4); + xmm3 = _mm_setzero_ps(); + // printf("%f, %f, %f, %f\n", ((float*)&xmm10)[0], ((float*)&xmm10)[1], + // ((float*)&xmm10)[2], ((float*)&xmm10)[3]); - for(; i < bound; ++i) { - xmm1 = _mm_load_ps((float*)src0); - xmm2 = _mm_load_ps((float*)&src0[2]); + for (; i < bound; ++i) { + xmm1 = _mm_load_ps((float*)src0); + xmm2 = _mm_load_ps((float*)&src0[2]); - src0 += 4; + src0 += 4; - xmm1 = _mm_mul_ps(xmm1, xmm1); - xmm2 = _mm_mul_ps(xmm2, xmm2); + xmm1 = _mm_mul_ps(xmm1, xmm1); + xmm2 = _mm_mul_ps(xmm2, xmm2); - xmm1 = _mm_hadd_ps(xmm1, xmm2); + xmm1 = _mm_hadd_ps(xmm1, xmm2); - xmm3 = _mm_max_ps(xmm1, xmm3); + xmm3 = _mm_max_ps(xmm1, xmm3); - xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3); - xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3); + xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3); + xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3); - xmm11 = _mm_and_si128(xmm8, xmm5.int_vec); - xmm12 = _mm_and_si128(xmm9, xmm4.int_vec); + xmm11 = _mm_and_si128(xmm8, xmm5.int_vec); + xmm12 = _mm_and_si128(xmm9, xmm4.int_vec); - xmm9 = _mm_add_epi32(xmm11, xmm12); + xmm9 = _mm_add_epi32(xmm11, xmm12); - xmm8 = _mm_add_epi32(xmm8, xmm10); + xmm8 = _mm_add_epi32(xmm8, xmm10); - //printf("%f, %f, %f, %f\n", ((float*)&xmm3)[0], ((float*)&xmm3)[1], ((float*)&xmm3)[2], ((float*)&xmm3)[3]); - //printf("%u, %u, %u, %u\n", ((uint32_t*)&xmm10)[0], ((uint32_t*)&xmm10)[1], ((uint32_t*)&xmm10)[2], ((uint32_t*)&xmm10)[3]); - } + // printf("%f, %f, %f, %f\n", ((float*)&xmm3)[0], ((float*)&xmm3)[1], + // ((float*)&xmm3)[2], ((float*)&xmm3)[3]); printf("%u, %u, %u, %u\n", + // ((uint32_t*)&xmm10)[0], ((uint32_t*)&xmm10)[1], ((uint32_t*)&xmm10)[2], + // ((uint32_t*)&xmm10)[3]); + } - if (num_bytes >> 4 & 1) { - xmm2 = _mm_load_ps((float*)src0); + if (num_bytes >> 4 & 1) { + xmm2 = _mm_load_ps((float*)src0); - xmm1 = _mm_movelh_ps(bit128_p(&xmm8)->float_vec, bit128_p(&xmm8)->float_vec); - xmm8 = bit128_p(&xmm1)->int_vec; + xmm1 = _mm_movelh_ps(bit128_p(&xmm8)->float_vec, bit128_p(&xmm8)->float_vec); + xmm8 = bit128_p(&xmm1)->int_vec; - xmm2 = _mm_mul_ps(xmm2, xmm2); + xmm2 = _mm_mul_ps(xmm2, xmm2); - src0 += 2; + src0 += 2; - xmm1 = _mm_hadd_ps(xmm2, xmm2); + xmm1 = _mm_hadd_ps(xmm2, xmm2); - xmm3 = _mm_max_ps(xmm1, xmm3); + xmm3 = _mm_max_ps(xmm1, xmm3); - xmm10 = _mm_set_epi32(2, 2, 2, 2);//load1_ps((float*)&init[2]); + xmm10 = _mm_set_epi32(2, 2, 2, 2); // load1_ps((float*)&init[2]); - xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3); - xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3); + xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3); + xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3); - xmm11 = _mm_and_si128(xmm8, xmm5.int_vec); - xmm12 = _mm_and_si128(xmm9, xmm4.int_vec); + xmm11 = _mm_and_si128(xmm8, xmm5.int_vec); + xmm12 = _mm_and_si128(xmm9, xmm4.int_vec); - xmm9 = _mm_add_epi32(xmm11, xmm12); + xmm9 = _mm_add_epi32(xmm11, xmm12); - xmm8 = _mm_add_epi32(xmm8, xmm10); - //printf("egads%u, %u, %u, %u\n", ((uint32_t*)&xmm9)[0], ((uint32_t*)&xmm9)[1], ((uint32_t*)&xmm9)[2], ((uint32_t*)&xmm9)[3]); - } + xmm8 = _mm_add_epi32(xmm8, xmm10); + // printf("egads%u, %u, %u, %u\n", ((uint32_t*)&xmm9)[0], ((uint32_t*)&xmm9)[1], + // ((uint32_t*)&xmm9)[2], ((uint32_t*)&xmm9)[3]); + } - if (num_bytes >> 3 & 1) { - //printf("%u, %u, %u, %u\n", ((uint32_t*)&xmm9)[0], ((uint32_t*)&xmm9)[1], ((uint32_t*)&xmm9)[2], ((uint32_t*)&xmm9)[3]); + if (num_bytes >> 3 & 1) { + // printf("%u, %u, %u, %u\n", ((uint32_t*)&xmm9)[0], ((uint32_t*)&xmm9)[1], + // ((uint32_t*)&xmm9)[2], ((uint32_t*)&xmm9)[3]); - sq_dist = 
lv_creal(src0[0]) * lv_creal(src0[0]) + lv_cimag(src0[0]) * lv_cimag(src0[0]); + sq_dist = + lv_creal(src0[0]) * lv_creal(src0[0]) + lv_cimag(src0[0]) * lv_cimag(src0[0]); - xmm2 = _mm_load1_ps(&sq_dist); + xmm2 = _mm_load1_ps(&sq_dist); - xmm1 = xmm3; + xmm1 = xmm3; - xmm3 = _mm_max_ss(xmm3, xmm2); + xmm3 = _mm_max_ss(xmm3, xmm2); - xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3); - xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3); + xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3); + xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3); - xmm8 = _mm_shuffle_epi32(xmm8, 0x00); + xmm8 = _mm_shuffle_epi32(xmm8, 0x00); - xmm11 = _mm_and_si128(xmm8, xmm4.int_vec); - xmm12 = _mm_and_si128(xmm9, xmm5.int_vec); + xmm11 = _mm_and_si128(xmm8, xmm4.int_vec); + xmm12 = _mm_and_si128(xmm9, xmm5.int_vec); - xmm9 = _mm_add_epi32(xmm11, xmm12); - } + xmm9 = _mm_add_epi32(xmm11, xmm12); + } - //printf("%f, %f, %f, %f\n", ((float*)&xmm3)[0], ((float*)&xmm3)[1], ((float*)&xmm3)[2], ((float*)&xmm3)[3]); - //printf("%u, %u, %u, %u\n", ((uint32_t*)&xmm9)[0], ((uint32_t*)&xmm9)[1], ((uint32_t*)&xmm9)[2], ((uint32_t*)&xmm9)[3]); + // printf("%f, %f, %f, %f\n", ((float*)&xmm3)[0], ((float*)&xmm3)[1], + // ((float*)&xmm3)[2], ((float*)&xmm3)[3]); printf("%u, %u, %u, %u\n", + // ((uint32_t*)&xmm9)[0], ((uint32_t*)&xmm9)[1], ((uint32_t*)&xmm9)[2], + // ((uint32_t*)&xmm9)[3]); - _mm_store_ps((float*)&(holderf.f), xmm3); - _mm_store_si128(&(holderi.int_vec), xmm9); + _mm_store_ps((float*)&(holderf.f), xmm3); + _mm_store_si128(&(holderi.int_vec), xmm9); - target[0] = holderi.i[0]; - sq_dist = holderf.f[0]; - target[0] = (holderf.f[1] > sq_dist) ? holderi.i[1] : target[0]; - sq_dist = (holderf.f[1] > sq_dist) ? holderf.f[1] : sq_dist; - target[0] = (holderf.f[2] > sq_dist) ? holderi.i[2] : target[0]; - sq_dist = (holderf.f[2] > sq_dist) ? holderf.f[2] : sq_dist; - target[0] = (holderf.f[3] > sq_dist) ? holderi.i[3] : target[0]; - sq_dist = (holderf.f[3] > sq_dist) ? holderf.f[3] : sq_dist; + target[0] = holderi.i[0]; + sq_dist = holderf.f[0]; + target[0] = (holderf.f[1] > sq_dist) ? holderi.i[1] : target[0]; + sq_dist = (holderf.f[1] > sq_dist) ? holderf.f[1] : sq_dist; + target[0] = (holderf.f[2] > sq_dist) ? holderi.i[2] : target[0]; + sq_dist = (holderf.f[2] > sq_dist) ? holderf.f[2] : sq_dist; + target[0] = (holderf.f[3] > sq_dist) ? holderi.i[3] : target[0]; + sq_dist = (holderf.f[3] > sq_dist) ? 
holderf.f[3] : sq_dist; - /* - float placeholder = 0.0; - uint32_t temp0, temp1; - uint32_t g0 = (((float*)&xmm3)[0] > ((float*)&xmm3)[1]); - uint32_t l0 = g0 ^ 1; + /* + float placeholder = 0.0; + uint32_t temp0, temp1; + uint32_t g0 = (((float*)&xmm3)[0] > ((float*)&xmm3)[1]); + uint32_t l0 = g0 ^ 1; - uint32_t g1 = (((float*)&xmm3)[1] > ((float*)&xmm3)[2]); - uint32_t l1 = g1 ^ 1; + uint32_t g1 = (((float*)&xmm3)[1] > ((float*)&xmm3)[2]); + uint32_t l1 = g1 ^ 1; - temp0 = g0 * ((uint32_t*)&xmm9)[0] + l0 * ((uint32_t*)&xmm9)[1]; - temp1 = g0 * ((uint32_t*)&xmm9)[2] + l0 * ((uint32_t*)&xmm9)[3]; - sq_dist = g0 * ((float*)&xmm3)[0] + l0 * ((float*)&xmm3)[1]; - placeholder = g0 * ((float*)&xmm3)[2] + l0 * ((float*)&xmm3)[3]; + temp0 = g0 * ((uint32_t*)&xmm9)[0] + l0 * ((uint32_t*)&xmm9)[1]; + temp1 = g0 * ((uint32_t*)&xmm9)[2] + l0 * ((uint32_t*)&xmm9)[3]; + sq_dist = g0 * ((float*)&xmm3)[0] + l0 * ((float*)&xmm3)[1]; + placeholder = g0 * ((float*)&xmm3)[2] + l0 * ((float*)&xmm3)[3]; - g0 = (sq_dist > placeholder); - l0 = g0 ^ 1; - target[0] = g0 * temp0 + l0 * temp1; - */ + g0 = (sq_dist > placeholder); + l0 = g0 ^ 1; + target[0] = g0 * temp0 + l0 * temp1; + */ } #endif /*LV_HAVE_SSE3*/ #ifdef LV_HAVE_GENERIC static inline void - volk_32fc_index_max_16u_generic(uint16_t* target, lv_32fc_t* src0, - uint32_t num_points) +volk_32fc_index_max_16u_generic(uint16_t* target, lv_32fc_t* src0, uint32_t num_points) { - num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points; + num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points; - const uint32_t num_bytes = num_points*8; + const uint32_t num_bytes = num_points * 8; - float sq_dist = 0.0; - float max = 0.0; - uint16_t index = 0; + float sq_dist = 0.0; + float max = 0.0; + uint16_t index = 0; - uint32_t i = 0; + uint32_t i = 0; - for(; i < num_bytes >> 3; ++i) { - sq_dist = lv_creal(src0[i]) * lv_creal(src0[i]) + lv_cimag(src0[i]) * lv_cimag(src0[i]); + for (; i> 3; ++i) { + sq_dist = + lv_creal(src0[i]) * lv_creal(src0[i]) + lv_cimag(src0[i]) * lv_cimag(src0[i]); - index = sq_dist > max ? i : index; - max = sq_dist > max ? sq_dist : max; - } - target[0] = index; + index = sq_dist > max ? i : index; + max = sq_dist > max ? sq_dist : max; + } + target[0] = index; } #endif /*LV_HAVE_GENERIC*/ @@ -427,142 +434,140 @@ static inline void #ifndef INCLUDED_volk_32fc_index_max_16u_u_H #define INCLUDED_volk_32fc_index_max_16u_u_H -#include #include -#include #include +#include +#include #include #ifdef LV_HAVE_AVX2 #include static inline void -volk_32fc_index_max_16u_u_avx2(uint16_t* target, lv_32fc_t* src0, - uint32_t num_points) +volk_32fc_index_max_16u_u_avx2(uint16_t* target, lv_32fc_t* src0, uint32_t num_points) { - num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points; - // Branchless version, if we think it'll make a difference - //num_points = USHRT_MAX ^ ((num_points ^ USHRT_MAX) & -(num_points < USHRT_MAX)); + num_points = (num_points > USHRT_MAX) ? 
USHRT_MAX : num_points; + // Branchless version, if we think it'll make a difference + // num_points = USHRT_MAX ^ ((num_points ^ USHRT_MAX) & -(num_points < USHRT_MAX)); - const uint32_t num_bytes = num_points*8; + const uint32_t num_bytes = num_points * 8; - union bit256 holderf; - union bit256 holderi; - float sq_dist = 0.0; + union bit256 holderf; + union bit256 holderi; + float sq_dist = 0.0; - union bit256 xmm5, xmm4; - __m256 xmm1, xmm2, xmm3; - __m256i xmm8, xmm11, xmm12, xmmfive, xmmfour, xmm9, holder0, holder1, xmm10; + union bit256 xmm5, xmm4; + __m256 xmm1, xmm2, xmm3; + __m256i xmm8, xmm11, xmm12, xmmfive, xmmfour, xmm9, holder0, holder1, xmm10; - xmm5.int_vec = xmmfive = _mm256_setzero_si256(); - xmm4.int_vec = xmmfour = _mm256_setzero_si256(); - holderf.int_vec = holder0 = _mm256_setzero_si256(); - holderi.int_vec = holder1 = _mm256_setzero_si256(); + xmm5.int_vec = xmmfive = _mm256_setzero_si256(); + xmm4.int_vec = xmmfour = _mm256_setzero_si256(); + holderf.int_vec = holder0 = _mm256_setzero_si256(); + holderi.int_vec = holder1 = _mm256_setzero_si256(); - int bound = num_bytes >> 6; - int i = 0; + int bound = num_bytes >> 6; + int i = 0; - xmm8 = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0); - xmm9 = _mm256_setzero_si256(); //=xmm8 - xmm10 = _mm256_set1_epi32(8); - xmm3 = _mm256_setzero_ps(); + xmm8 = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0); + xmm9 = _mm256_setzero_si256(); //=xmm8 + xmm10 = _mm256_set1_epi32(8); + xmm3 = _mm256_setzero_ps(); - __m256i idx = _mm256_set_epi32(7,6,3,2,5,4,1,0); - for(; i < bound; ++i) { - xmm1 = _mm256_loadu_ps((float*)src0); - xmm2 = _mm256_loadu_ps((float*)&src0[4]); + __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0); + for (; i < bound; ++i) { + xmm1 = _mm256_loadu_ps((float*)src0); + xmm2 = _mm256_loadu_ps((float*)&src0[4]); - src0 += 8; + src0 += 8; - xmm1 = _mm256_mul_ps(xmm1, xmm1); - xmm2 = _mm256_mul_ps(xmm2, xmm2); + xmm1 = _mm256_mul_ps(xmm1, xmm1); + xmm2 = _mm256_mul_ps(xmm2, xmm2); - xmm1 = _mm256_hadd_ps(xmm1, xmm2); - xmm1 = _mm256_permutevar8x32_ps(xmm1, idx); + xmm1 = _mm256_hadd_ps(xmm1, xmm2); + xmm1 = _mm256_permutevar8x32_ps(xmm1, idx); - xmm3 = _mm256_max_ps(xmm1, xmm3); + xmm3 = _mm256_max_ps(xmm1, xmm3); - xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS); - xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ); + xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS); + xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ); - xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec); - xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec); + xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec); + xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec); - xmm9 = _mm256_add_epi32(xmm11, xmm12); + xmm9 = _mm256_add_epi32(xmm11, xmm12); - xmm8 = _mm256_add_epi32(xmm8, xmm10); - } - xmm10 = _mm256_set1_epi32(4); - if (num_bytes >> 5 & 1) { - xmm1 = _mm256_loadu_ps((float*)src0); + xmm8 = _mm256_add_epi32(xmm8, xmm10); + } + xmm10 = _mm256_set1_epi32(4); + if (num_bytes >> 5 & 1) { + xmm1 = _mm256_loadu_ps((float*)src0); - src0 += 4; + src0 += 4; - xmm1 = _mm256_mul_ps(xmm1, xmm1); + xmm1 = _mm256_mul_ps(xmm1, xmm1); - xmm1 = _mm256_hadd_ps(xmm1, xmm1); - xmm1 = _mm256_permutevar8x32_ps(xmm1, idx); + xmm1 = _mm256_hadd_ps(xmm1, xmm1); + xmm1 = _mm256_permutevar8x32_ps(xmm1, idx); - xmm3 = _mm256_max_ps(xmm1, xmm3); + xmm3 = _mm256_max_ps(xmm1, xmm3); - xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS); - xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ); + xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS); + xmm5.float_vec = 
_mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ); - xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec); - xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec); + xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec); + xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec); - xmm9 = _mm256_add_epi32(xmm11, xmm12); + xmm9 = _mm256_add_epi32(xmm11, xmm12); - xmm8 = _mm256_add_epi32(xmm8, xmm10); - } + xmm8 = _mm256_add_epi32(xmm8, xmm10); + } - idx = _mm256_set_epi32(1,0,1,0,1,0,1,0); - xmm10 = _mm256_set1_epi32(2); - if (num_bytes >> 4 & 1) { - xmm2 = _mm256_loadu_ps((float*)src0); + idx = _mm256_set_epi32(1, 0, 1, 0, 1, 0, 1, 0); + xmm10 = _mm256_set1_epi32(2); + if (num_bytes >> 4 & 1) { + xmm2 = _mm256_loadu_ps((float*)src0); - xmm1 = _mm256_permutevar8x32_ps(bit256_p(&xmm8)->float_vec, idx); - xmm8 = bit256_p(&xmm1)->int_vec; + xmm1 = _mm256_permutevar8x32_ps(bit256_p(&xmm8)->float_vec, idx); + xmm8 = bit256_p(&xmm1)->int_vec; - xmm2 = _mm256_mul_ps(xmm2, xmm2); + xmm2 = _mm256_mul_ps(xmm2, xmm2); - src0 += 2; + src0 += 2; - xmm1 = _mm256_hadd_ps(xmm2, xmm2); + xmm1 = _mm256_hadd_ps(xmm2, xmm2); - xmm3 = _mm256_max_ps(xmm1, xmm3); + xmm3 = _mm256_max_ps(xmm1, xmm3); - xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS); - xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ); + xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS); + xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ); - xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec); - xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec); + xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec); + xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec); - xmm9 = _mm256_add_epi32(xmm11, xmm12); + xmm9 = _mm256_add_epi32(xmm11, xmm12); + + xmm8 = _mm256_add_epi32(xmm8, xmm10); + } - xmm8 = _mm256_add_epi32(xmm8, xmm10); - } - - _mm256_storeu_ps((float*)&(holderf.f), xmm3); - _mm256_storeu_si256(&(holderi.int_vec), xmm9); - - target[0] = holderi.i[0]; - sq_dist = holderf.f[0]; - target[0] = (holderf.f[1] > sq_dist) ? holderi.i[1] : target[0]; - sq_dist = (holderf.f[1] > sq_dist) ? holderf.f[1] : sq_dist; - target[0] = (holderf.f[2] > sq_dist) ? holderi.i[2] : target[0]; - sq_dist = (holderf.f[2] > sq_dist) ? holderf.f[2] : sq_dist; - target[0] = (holderf.f[3] > sq_dist) ? holderi.i[3] : target[0]; - sq_dist = (holderf.f[3] > sq_dist) ? holderf.f[3] : sq_dist; - target[0] = (holderf.f[4] > sq_dist) ? holderi.i[4] : target[0]; - sq_dist = (holderf.f[4] > sq_dist) ? holderf.f[4] : sq_dist; - target[0] = (holderf.f[5] > sq_dist) ? holderi.i[5] : target[0]; - sq_dist = (holderf.f[5] > sq_dist) ? holderf.f[5] : sq_dist; - target[0] = (holderf.f[6] > sq_dist) ? holderi.i[6] : target[0]; - sq_dist = (holderf.f[6] > sq_dist) ? holderf.f[6] : sq_dist; - target[0] = (holderf.f[7] > sq_dist) ? holderi.i[7] : target[0]; - sq_dist = (holderf.f[7] > sq_dist) ? holderf.f[7] : sq_dist; + _mm256_storeu_ps((float*)&(holderf.f), xmm3); + _mm256_storeu_si256(&(holderi.int_vec), xmm9); + target[0] = holderi.i[0]; + sq_dist = holderf.f[0]; + target[0] = (holderf.f[1] > sq_dist) ? holderi.i[1] : target[0]; + sq_dist = (holderf.f[1] > sq_dist) ? holderf.f[1] : sq_dist; + target[0] = (holderf.f[2] > sq_dist) ? holderi.i[2] : target[0]; + sq_dist = (holderf.f[2] > sq_dist) ? holderf.f[2] : sq_dist; + target[0] = (holderf.f[3] > sq_dist) ? holderi.i[3] : target[0]; + sq_dist = (holderf.f[3] > sq_dist) ? holderf.f[3] : sq_dist; + target[0] = (holderf.f[4] > sq_dist) ? holderi.i[4] : target[0]; + sq_dist = (holderf.f[4] > sq_dist) ? holderf.f[4] : sq_dist; + target[0] = (holderf.f[5] > sq_dist) ? 
holderi.i[5] : target[0]; + sq_dist = (holderf.f[5] > sq_dist) ? holderf.f[5] : sq_dist; + target[0] = (holderf.f[6] > sq_dist) ? holderi.i[6] : target[0]; + sq_dist = (holderf.f[6] > sq_dist) ? holderf.f[6] : sq_dist; + target[0] = (holderf.f[7] > sq_dist) ? holderi.i[7] : target[0]; + sq_dist = (holderf.f[7] > sq_dist) ? holderf.f[7] : sq_dist; } #endif /*LV_HAVE_AVX2*/ diff --git a/kernels/volk/volk_32fc_index_max_32u.h b/kernels/volk/volk_32fc_index_max_32u.h index 67a3faa..7756fc6 100644 --- a/kernels/volk/volk_32fc_index_max_32u.h +++ b/kernels/volk/volk_32fc_index_max_32u.h @@ -70,309 +70,314 @@ #ifndef INCLUDED_volk_32fc_index_max_32u_a_H #define INCLUDED_volk_32fc_index_max_32u_a_H +#include +#include #include -#include -#include -#include +#include #ifdef LV_HAVE_AVX2 -#include +#include static inline void -volk_32fc_index_max_32u_a_avx2(uint32_t* target, lv_32fc_t* src0, - uint32_t num_points) +volk_32fc_index_max_32u_a_avx2(uint32_t* target, lv_32fc_t* src0, uint32_t num_points) { - const uint32_t num_bytes = num_points*8; + const uint32_t num_bytes = num_points * 8; - union bit256 holderf; - union bit256 holderi; - float sq_dist = 0.0; + union bit256 holderf; + union bit256 holderi; + float sq_dist = 0.0; - union bit256 xmm5, xmm4; - __m256 xmm1, xmm2, xmm3; - __m256i xmm8, xmm11, xmm12, xmmfive, xmmfour, xmm9, holder0, holder1, xmm10; + union bit256 xmm5, xmm4; + __m256 xmm1, xmm2, xmm3; + __m256i xmm8, xmm11, xmm12, xmmfive, xmmfour, xmm9, holder0, holder1, xmm10; - xmm5.int_vec = xmmfive = _mm256_setzero_si256(); - xmm4.int_vec = xmmfour = _mm256_setzero_si256(); - holderf.int_vec = holder0 = _mm256_setzero_si256(); - holderi.int_vec = holder1 = _mm256_setzero_si256(); + xmm5.int_vec = xmmfive = _mm256_setzero_si256(); + xmm4.int_vec = xmmfour = _mm256_setzero_si256(); + holderf.int_vec = holder0 = _mm256_setzero_si256(); + holderi.int_vec = holder1 = _mm256_setzero_si256(); - int bound = num_bytes >> 6; - int i = 0; + int bound = num_bytes >> 6; + int i = 0; - xmm8 = _mm256_set_epi32(7,6,5,4,3, 2, 1, 0); - xmm9 = _mm256_setzero_si256(); - xmm10 = _mm256_set1_epi32(8); - xmm3 = _mm256_setzero_ps(); - __m256i idx = _mm256_set_epi32(7,6,3,2,5,4,1,0); + xmm8 = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0); + xmm9 = _mm256_setzero_si256(); + xmm10 = _mm256_set1_epi32(8); + xmm3 = _mm256_setzero_ps(); + __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0); - for(; i < bound; ++i) { - xmm1 = _mm256_load_ps((float*)src0); - xmm2 = _mm256_load_ps((float*)&src0[4]); + for (; i < bound; ++i) { + xmm1 = _mm256_load_ps((float*)src0); + xmm2 = _mm256_load_ps((float*)&src0[4]); - src0 += 8; + src0 += 8; - xmm1 = _mm256_mul_ps(xmm1, xmm1); - xmm2 = _mm256_mul_ps(xmm2, xmm2); + xmm1 = _mm256_mul_ps(xmm1, xmm1); + xmm2 = _mm256_mul_ps(xmm2, xmm2); - xmm1 = _mm256_hadd_ps(xmm1, xmm2); - xmm1 = _mm256_permutevar8x32_ps(xmm1, idx); + xmm1 = _mm256_hadd_ps(xmm1, xmm2); + xmm1 = _mm256_permutevar8x32_ps(xmm1, idx); - xmm3 = _mm256_max_ps(xmm1, xmm3); + xmm3 = _mm256_max_ps(xmm1, xmm3); - xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS); - xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ); + xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS); + xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ); - xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec); - xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec); + xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec); + xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec); - xmm9 = _mm256_add_epi32(xmm11, xmm12); + xmm9 = _mm256_add_epi32(xmm11, xmm12); - xmm8 = 
_mm256_add_epi32(xmm8, xmm10); - } - - xmm10 = _mm256_set1_epi32(4); - if (num_bytes >> 5 & 1) { - xmm1 = _mm256_load_ps((float*)src0); - - xmm1 = _mm256_mul_ps(xmm1, xmm1); + xmm8 = _mm256_add_epi32(xmm8, xmm10); + } - src0 += 4; + xmm10 = _mm256_set1_epi32(4); + if (num_bytes >> 4 & 1) { + xmm1 = _mm256_load_ps((float*)src0); - xmm1 = _mm256_hadd_ps(xmm1, xmm1); + xmm1 = _mm256_mul_ps(xmm1, xmm1); - xmm3 = _mm256_max_ps(xmm1, xmm3); + src0 += 4; - xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS); - xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ); + xmm1 = _mm256_hadd_ps(xmm1, xmm1); - xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec); - xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec); + xmm3 = _mm256_max_ps(xmm1, xmm3); - xmm9 = _mm256_add_epi32(xmm11, xmm12); + xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS); + xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ); - xmm8 = _mm256_add_epi32(xmm8, xmm10); - } + xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec); + xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec); - idx = _mm256_set_epi32(1,0,1,0,1,0,1,0); - xmm10 = _mm256_set1_epi32(2); - if (num_bytes >> 4 & 1) { - xmm2 = _mm256_load_ps((float*)src0); + xmm9 = _mm256_add_epi32(xmm11, xmm12); - xmm1 = _mm256_permutevar8x32_ps(bit256_p(&xmm8)->float_vec, idx); - xmm8 = bit256_p(&xmm1)->int_vec; + xmm8 = _mm256_add_epi32(xmm8, xmm10); + } - xmm2 = _mm256_mul_ps(xmm2, xmm2); + idx = _mm256_set_epi32(1, 0, 1, 0, 1, 0, 1, 0); + xmm10 = _mm256_set1_epi32(2); + if (num_bytes >> 4 & 1) { + xmm2 = _mm256_load_ps((float*)src0); - src0 += 2; + xmm1 = _mm256_permutevar8x32_ps(bit256_p(&xmm8)->float_vec, idx); + xmm8 = bit256_p(&xmm1)->int_vec; - xmm1 = _mm256_hadd_ps(xmm2, xmm2); + xmm2 = _mm256_mul_ps(xmm2, xmm2); - xmm3 = _mm256_max_ps(xmm1, xmm3); + src0 += 2; - xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS); - xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ); + xmm1 = _mm256_hadd_ps(xmm2, xmm2); - xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec); - xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec); + xmm3 = _mm256_max_ps(xmm1, xmm3); - xmm9 = _mm256_add_epi32(xmm11, xmm12); + xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS); + xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ); - xmm8 = _mm256_add_epi32(xmm8, xmm10); - } + xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec); + xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec); - _mm256_store_ps((float*)&(holderf.f), xmm3); - _mm256_store_si256(&(holderi.int_vec), xmm9); + xmm9 = _mm256_add_epi32(xmm11, xmm12); - target[0] = holderi.i[0]; - sq_dist = holderf.f[0]; - target[0] = (holderf.f[1] > sq_dist) ? holderi.i[1] : target[0]; - sq_dist = (holderf.f[1] > sq_dist) ? holderf.f[1] : sq_dist; - target[0] = (holderf.f[2] > sq_dist) ? holderi.i[2] : target[0]; - sq_dist = (holderf.f[2] > sq_dist) ? holderf.f[2] : sq_dist; - target[0] = (holderf.f[3] > sq_dist) ? holderi.i[3] : target[0]; - sq_dist = (holderf.f[3] > sq_dist) ? holderf.f[3] : sq_dist; - target[0] = (holderf.f[4] > sq_dist) ? holderi.i[4] : target[0]; - sq_dist = (holderf.f[4] > sq_dist) ? holderf.f[4] : sq_dist; - target[0] = (holderf.f[5] > sq_dist) ? holderi.i[5] : target[0]; - sq_dist = (holderf.f[5] > sq_dist) ? holderf.f[5] : sq_dist; - target[0] = (holderf.f[6] > sq_dist) ? holderi.i[6] : target[0]; - sq_dist = (holderf.f[6] > sq_dist) ? holderf.f[6] : sq_dist; - target[0] = (holderf.f[7] > sq_dist) ? holderi.i[7] : target[0]; - sq_dist = (holderf.f[7] > sq_dist) ? 
holderf.f[7] : sq_dist; + xmm8 = _mm256_add_epi32(xmm8, xmm10); + } + _mm256_store_ps((float*)&(holderf.f), xmm3); + _mm256_store_si256(&(holderi.int_vec), xmm9); + + target[0] = holderi.i[0]; + sq_dist = holderf.f[0]; + target[0] = (holderf.f[1] > sq_dist) ? holderi.i[1] : target[0]; + sq_dist = (holderf.f[1] > sq_dist) ? holderf.f[1] : sq_dist; + target[0] = (holderf.f[2] > sq_dist) ? holderi.i[2] : target[0]; + sq_dist = (holderf.f[2] > sq_dist) ? holderf.f[2] : sq_dist; + target[0] = (holderf.f[3] > sq_dist) ? holderi.i[3] : target[0]; + sq_dist = (holderf.f[3] > sq_dist) ? holderf.f[3] : sq_dist; + target[0] = (holderf.f[4] > sq_dist) ? holderi.i[4] : target[0]; + sq_dist = (holderf.f[4] > sq_dist) ? holderf.f[4] : sq_dist; + target[0] = (holderf.f[5] > sq_dist) ? holderi.i[5] : target[0]; + sq_dist = (holderf.f[5] > sq_dist) ? holderf.f[5] : sq_dist; + target[0] = (holderf.f[6] > sq_dist) ? holderi.i[6] : target[0]; + sq_dist = (holderf.f[6] > sq_dist) ? holderf.f[6] : sq_dist; + target[0] = (holderf.f[7] > sq_dist) ? holderi.i[7] : target[0]; + sq_dist = (holderf.f[7] > sq_dist) ? holderf.f[7] : sq_dist; } #endif /*LV_HAVE_AVX2*/ #ifdef LV_HAVE_SSE3 -#include -#include +#include +#include static inline void -volk_32fc_index_max_32u_a_sse3(uint32_t* target, lv_32fc_t* src0, - uint32_t num_points) +volk_32fc_index_max_32u_a_sse3(uint32_t* target, lv_32fc_t* src0, uint32_t num_points) { - const uint32_t num_bytes = num_points*8; - - union bit128 holderf; - union bit128 holderi; - float sq_dist = 0.0; - - union bit128 xmm5, xmm4; - __m128 xmm1, xmm2, xmm3; - __m128i xmm8, xmm11, xmm12, xmmfive, xmmfour, xmm9, holder0, holder1, xmm10; + const uint32_t num_bytes = num_points * 8; - xmm5.int_vec = xmmfive = _mm_setzero_si128(); - xmm4.int_vec = xmmfour = _mm_setzero_si128(); - holderf.int_vec = holder0 = _mm_setzero_si128(); - holderi.int_vec = holder1 = _mm_setzero_si128(); + union bit128 holderf; + union bit128 holderi; + float sq_dist = 0.0; - int bound = num_bytes >> 5; - int i = 0; + union bit128 xmm5, xmm4; + __m128 xmm1, xmm2, xmm3; + __m128i xmm8, xmm11, xmm12, xmmfive, xmmfour, xmm9, holder0, holder1, xmm10; - xmm8 = _mm_set_epi32(3, 2, 1, 0);//remember the crazy reverse order! - xmm9 = _mm_setzero_si128(); - xmm10 = _mm_set_epi32(4, 4, 4, 4); - xmm3 = _mm_setzero_ps(); + xmm5.int_vec = xmmfive = _mm_setzero_si128(); + xmm4.int_vec = xmmfour = _mm_setzero_si128(); + holderf.int_vec = holder0 = _mm_setzero_si128(); + holderi.int_vec = holder1 = _mm_setzero_si128(); - //printf("%f, %f, %f, %f\n", ((float*)&xmm10)[0], ((float*)&xmm10)[1], ((float*)&xmm10)[2], ((float*)&xmm10)[3]); + int bound = num_bytes >> 5; + int i = 0; - for(; i < bound; ++i) { - xmm1 = _mm_load_ps((float*)src0); - xmm2 = _mm_load_ps((float*)&src0[2]); + xmm8 = _mm_set_epi32(3, 2, 1, 0); // remember the crazy reverse order! 
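    // The vectorized loop below is the SSE3 counterpart of the scalar recurrence
    //     mag2  = re * re + im * im;           // squared magnitude, mul + hadd
    //     best  = (mag2 > best) ? mag2 : best; // per-lane maximum in xmm3
    //     index = (mag2 == best) ? k : index;  // per-lane winner index in xmm9
    // applied to four complex samples at a time; the four per-lane winners are
    // reduced to the single result index after the loop via holderf/holderi.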
+ xmm9 = _mm_setzero_si128(); + xmm10 = _mm_set_epi32(4, 4, 4, 4); + xmm3 = _mm_setzero_ps(); - src0 += 4; + // printf("%f, %f, %f, %f\n", ((float*)&xmm10)[0], ((float*)&xmm10)[1], + // ((float*)&xmm10)[2], ((float*)&xmm10)[3]); - xmm1 = _mm_mul_ps(xmm1, xmm1); - xmm2 = _mm_mul_ps(xmm2, xmm2); + for (; i < bound; ++i) { + xmm1 = _mm_load_ps((float*)src0); + xmm2 = _mm_load_ps((float*)&src0[2]); - xmm1 = _mm_hadd_ps(xmm1, xmm2); + src0 += 4; - xmm3 = _mm_max_ps(xmm1, xmm3); + xmm1 = _mm_mul_ps(xmm1, xmm1); + xmm2 = _mm_mul_ps(xmm2, xmm2); - xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3); - xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3); + xmm1 = _mm_hadd_ps(xmm1, xmm2); - xmm11 = _mm_and_si128(xmm8, xmm5.int_vec); - xmm12 = _mm_and_si128(xmm9, xmm4.int_vec); + xmm3 = _mm_max_ps(xmm1, xmm3); - xmm9 = _mm_add_epi32(xmm11, xmm12); + xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3); + xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3); - xmm8 = _mm_add_epi32(xmm8, xmm10); + xmm11 = _mm_and_si128(xmm8, xmm5.int_vec); + xmm12 = _mm_and_si128(xmm9, xmm4.int_vec); - //printf("%f, %f, %f, %f\n", ((float*)&xmm3)[0], ((float*)&xmm3)[1], ((float*)&xmm3)[2], ((float*)&xmm3)[3]); - //printf("%u, %u, %u, %u\n", ((uint32_t*)&xmm10)[0], ((uint32_t*)&xmm10)[1], ((uint32_t*)&xmm10)[2], ((uint32_t*)&xmm10)[3]); - } + xmm9 = _mm_add_epi32(xmm11, xmm12); + xmm8 = _mm_add_epi32(xmm8, xmm10); - if (num_bytes >> 4 & 1) { - xmm2 = _mm_load_ps((float*)src0); - - xmm1 = _mm_movelh_ps(bit128_p(&xmm8)->float_vec, bit128_p(&xmm8)->float_vec); - xmm8 = bit128_p(&xmm1)->int_vec; - - xmm2 = _mm_mul_ps(xmm2, xmm2); - - src0 += 2; - - xmm1 = _mm_hadd_ps(xmm2, xmm2); + // printf("%f, %f, %f, %f\n", ((float*)&xmm3)[0], ((float*)&xmm3)[1], + // ((float*)&xmm3)[2], ((float*)&xmm3)[3]); printf("%u, %u, %u, %u\n", + // ((uint32_t*)&xmm10)[0], ((uint32_t*)&xmm10)[1], ((uint32_t*)&xmm10)[2], + // ((uint32_t*)&xmm10)[3]); + } - xmm3 = _mm_max_ps(xmm1, xmm3); - xmm10 = _mm_set_epi32(2, 2, 2, 2);//load1_ps((float*)&init[2]); + if (num_bytes >> 4 & 1) { + xmm2 = _mm_load_ps((float*)src0); - xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3); - xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3); + xmm1 = _mm_movelh_ps(bit128_p(&xmm8)->float_vec, bit128_p(&xmm8)->float_vec); + xmm8 = bit128_p(&xmm1)->int_vec; - xmm11 = _mm_and_si128(xmm8, xmm5.int_vec); - xmm12 = _mm_and_si128(xmm9, xmm4.int_vec); + xmm2 = _mm_mul_ps(xmm2, xmm2); - xmm9 = _mm_add_epi32(xmm11, xmm12); + src0 += 2; - xmm8 = _mm_add_epi32(xmm8, xmm10); - //printf("egads%u, %u, %u, %u\n", ((uint32_t*)&xmm9)[0], ((uint32_t*)&xmm9)[1], ((uint32_t*)&xmm9)[2], ((uint32_t*)&xmm9)[3]); - } + xmm1 = _mm_hadd_ps(xmm2, xmm2); - if (num_bytes >> 3 & 1) { - //printf("%u, %u, %u, %u\n", ((uint32_t*)&xmm9)[0], ((uint32_t*)&xmm9)[1], ((uint32_t*)&xmm9)[2], ((uint32_t*)&xmm9)[3]); + xmm3 = _mm_max_ps(xmm1, xmm3); - sq_dist = lv_creal(src0[0]) * lv_creal(src0[0]) + lv_cimag(src0[0]) * lv_cimag(src0[0]); + xmm10 = _mm_set_epi32(2, 2, 2, 2); // load1_ps((float*)&init[2]); - xmm2 = _mm_load1_ps(&sq_dist); + xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3); + xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3); - xmm1 = xmm3; + xmm11 = _mm_and_si128(xmm8, xmm5.int_vec); + xmm12 = _mm_and_si128(xmm9, xmm4.int_vec); - xmm3 = _mm_max_ss(xmm3, xmm2); + xmm9 = _mm_add_epi32(xmm11, xmm12); - xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3); - xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3); + xmm8 = _mm_add_epi32(xmm8, xmm10); + // printf("egads%u, %u, %u, %u\n", ((uint32_t*)&xmm9)[0], ((uint32_t*)&xmm9)[1], + // ((uint32_t*)&xmm9)[2], ((uint32_t*)&xmm9)[3]); + } - xmm8 = 
_mm_shuffle_epi32(xmm8, 0x00); + if (num_bytes >> 3 & 1) { + // printf("%u, %u, %u, %u\n", ((uint32_t*)&xmm9)[0], ((uint32_t*)&xmm9)[1], + // ((uint32_t*)&xmm9)[2], ((uint32_t*)&xmm9)[3]); - xmm11 = _mm_and_si128(xmm8, xmm4.int_vec); - xmm12 = _mm_and_si128(xmm9, xmm5.int_vec); + sq_dist = + lv_creal(src0[0]) * lv_creal(src0[0]) + lv_cimag(src0[0]) * lv_cimag(src0[0]); - xmm9 = _mm_add_epi32(xmm11, xmm12); - } + xmm2 = _mm_load1_ps(&sq_dist); - //printf("%f, %f, %f, %f\n", ((float*)&xmm3)[0], ((float*)&xmm3)[1], ((float*)&xmm3)[2], ((float*)&xmm3)[3]); - //printf("%u, %u, %u, %u\n", ((uint32_t*)&xmm9)[0], ((uint32_t*)&xmm9)[1], ((uint32_t*)&xmm9)[2], ((uint32_t*)&xmm9)[3]); + xmm1 = xmm3; - _mm_store_ps((float*)&(holderf.f), xmm3); - _mm_store_si128(&(holderi.int_vec), xmm9); + xmm3 = _mm_max_ss(xmm3, xmm2); - target[0] = holderi.i[0]; - sq_dist = holderf.f[0]; - target[0] = (holderf.f[1] > sq_dist) ? holderi.i[1] : target[0]; - sq_dist = (holderf.f[1] > sq_dist) ? holderf.f[1] : sq_dist; - target[0] = (holderf.f[2] > sq_dist) ? holderi.i[2] : target[0]; - sq_dist = (holderf.f[2] > sq_dist) ? holderf.f[2] : sq_dist; - target[0] = (holderf.f[3] > sq_dist) ? holderi.i[3] : target[0]; - sq_dist = (holderf.f[3] > sq_dist) ? holderf.f[3] : sq_dist; + xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3); + xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3); - /* - float placeholder = 0.0; - uint32_t temp0, temp1; - uint32_t g0 = (((float*)&xmm3)[0] > ((float*)&xmm3)[1]); - uint32_t l0 = g0 ^ 1; + xmm8 = _mm_shuffle_epi32(xmm8, 0x00); - uint32_t g1 = (((float*)&xmm3)[1] > ((float*)&xmm3)[2]); - uint32_t l1 = g1 ^ 1; + xmm11 = _mm_and_si128(xmm8, xmm4.int_vec); + xmm12 = _mm_and_si128(xmm9, xmm5.int_vec); - temp0 = g0 * ((uint32_t*)&xmm9)[0] + l0 * ((uint32_t*)&xmm9)[1]; - temp1 = g0 * ((uint32_t*)&xmm9)[2] + l0 * ((uint32_t*)&xmm9)[3]; - sq_dist = g0 * ((float*)&xmm3)[0] + l0 * ((float*)&xmm3)[1]; - placeholder = g0 * ((float*)&xmm3)[2] + l0 * ((float*)&xmm3)[3]; + xmm9 = _mm_add_epi32(xmm11, xmm12); + } - g0 = (sq_dist > placeholder); - l0 = g0 ^ 1; - target[0] = g0 * temp0 + l0 * temp1; - */ + // printf("%f, %f, %f, %f\n", ((float*)&xmm3)[0], ((float*)&xmm3)[1], + // ((float*)&xmm3)[2], ((float*)&xmm3)[3]); printf("%u, %u, %u, %u\n", + // ((uint32_t*)&xmm9)[0], ((uint32_t*)&xmm9)[1], ((uint32_t*)&xmm9)[2], + // ((uint32_t*)&xmm9)[3]); + + _mm_store_ps((float*)&(holderf.f), xmm3); + _mm_store_si128(&(holderi.int_vec), xmm9); + + target[0] = holderi.i[0]; + sq_dist = holderf.f[0]; + target[0] = (holderf.f[1] > sq_dist) ? holderi.i[1] : target[0]; + sq_dist = (holderf.f[1] > sq_dist) ? holderf.f[1] : sq_dist; + target[0] = (holderf.f[2] > sq_dist) ? holderi.i[2] : target[0]; + sq_dist = (holderf.f[2] > sq_dist) ? holderf.f[2] : sq_dist; + target[0] = (holderf.f[3] > sq_dist) ? holderi.i[3] : target[0]; + sq_dist = (holderf.f[3] > sq_dist) ? 
holderf.f[3] : sq_dist; + + /* + float placeholder = 0.0; + uint32_t temp0, temp1; + uint32_t g0 = (((float*)&xmm3)[0] > ((float*)&xmm3)[1]); + uint32_t l0 = g0 ^ 1; + + uint32_t g1 = (((float*)&xmm3)[1] > ((float*)&xmm3)[2]); + uint32_t l1 = g1 ^ 1; + + temp0 = g0 * ((uint32_t*)&xmm9)[0] + l0 * ((uint32_t*)&xmm9)[1]; + temp1 = g0 * ((uint32_t*)&xmm9)[2] + l0 * ((uint32_t*)&xmm9)[3]; + sq_dist = g0 * ((float*)&xmm3)[0] + l0 * ((float*)&xmm3)[1]; + placeholder = g0 * ((float*)&xmm3)[2] + l0 * ((float*)&xmm3)[3]; + + g0 = (sq_dist > placeholder); + l0 = g0 ^ 1; + target[0] = g0 * temp0 + l0 * temp1; + */ } #endif /*LV_HAVE_SSE3*/ #ifdef LV_HAVE_GENERIC static inline void - volk_32fc_index_max_32u_generic(uint32_t* target, lv_32fc_t* src0, - uint32_t num_points) +volk_32fc_index_max_32u_generic(uint32_t* target, lv_32fc_t* src0, uint32_t num_points) { - const uint32_t num_bytes = num_points*8; + const uint32_t num_bytes = num_points * 8; - float sq_dist = 0.0; - float max = 0.0; - uint32_t index = 0; + float sq_dist = 0.0; + float max = 0.0; + uint32_t index = 0; - uint32_t i = 0; + uint32_t i = 0; - for(; i < num_bytes >> 3; ++i) { - sq_dist = lv_creal(src0[i]) * lv_creal(src0[i]) + lv_cimag(src0[i]) * lv_cimag(src0[i]); + for (; i> 3; ++i) { + sq_dist = + lv_creal(src0[i]) * lv_creal(src0[i]) + lv_cimag(src0[i]) * lv_cimag(src0[i]); - index = sq_dist > max ? i : index; - max = sq_dist > max ? sq_dist : max; - } - target[0] = index; + index = sq_dist > max ? i : index; + max = sq_dist > max ? sq_dist : max; + } + target[0] = index; } #endif /*LV_HAVE_GENERIC*/ @@ -384,137 +389,135 @@ static inline void #ifndef INCLUDED_volk_32fc_index_max_32u_u_H #define INCLUDED_volk_32fc_index_max_32u_u_H +#include +#include #include -#include -#include -#include +#include #ifdef LV_HAVE_AVX2 -#include +#include static inline void -volk_32fc_index_max_32u_u_avx2(uint32_t* target, lv_32fc_t* src0, - uint32_t num_points) +volk_32fc_index_max_32u_u_avx2(uint32_t* target, lv_32fc_t* src0, uint32_t num_points) { - const uint32_t num_bytes = num_points*8; - - union bit256 holderf; - union bit256 holderi; - float sq_dist = 0.0; + const uint32_t num_bytes = num_points * 8; - union bit256 xmm5, xmm4; - __m256 xmm1, xmm2, xmm3; - __m256i xmm8, xmm11, xmm12, xmmfive, xmmfour, xmm9, holder0, holder1, xmm10; + union bit256 holderf; + union bit256 holderi; + float sq_dist = 0.0; - xmm5.int_vec = xmmfive = _mm256_setzero_si256(); - xmm4.int_vec = xmmfour = _mm256_setzero_si256(); - holderf.int_vec = holder0 = _mm256_setzero_si256(); - holderi.int_vec = holder1 = _mm256_setzero_si256(); + union bit256 xmm5, xmm4; + __m256 xmm1, xmm2, xmm3; + __m256i xmm8, xmm11, xmm12, xmmfive, xmmfour, xmm9, holder0, holder1, xmm10; - int bound = num_bytes >> 6; - int i = 0; + xmm5.int_vec = xmmfive = _mm256_setzero_si256(); + xmm4.int_vec = xmmfour = _mm256_setzero_si256(); + holderf.int_vec = holder0 = _mm256_setzero_si256(); + holderi.int_vec = holder1 = _mm256_setzero_si256(); - xmm8 = _mm256_set_epi32(7,6,5,4,3, 2, 1, 0); - xmm9 = _mm256_setzero_si256(); - xmm10 = _mm256_set1_epi32(8); - xmm3 = _mm256_setzero_ps(); - __m256i idx = _mm256_set_epi32(7,6,3,2,5,4,1,0); + int bound = num_bytes >> 6; + int i = 0; - for(; i < bound; ++i) { - xmm1 = _mm256_loadu_ps((float*)src0); - xmm2 = _mm256_loadu_ps((float*)&src0[4]); + xmm8 = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0); + xmm9 = _mm256_setzero_si256(); + xmm10 = _mm256_set1_epi32(8); + xmm3 = _mm256_setzero_ps(); + __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0); - src0 += 8; + 
for (; i < bound; ++i) { + xmm1 = _mm256_loadu_ps((float*)src0); + xmm2 = _mm256_loadu_ps((float*)&src0[4]); - xmm1 = _mm256_mul_ps(xmm1, xmm1); - xmm2 = _mm256_mul_ps(xmm2, xmm2); + src0 += 8; - xmm1 = _mm256_hadd_ps(xmm1, xmm2); - xmm1 = _mm256_permutevar8x32_ps(xmm1, idx); + xmm1 = _mm256_mul_ps(xmm1, xmm1); + xmm2 = _mm256_mul_ps(xmm2, xmm2); - xmm3 = _mm256_max_ps(xmm1, xmm3); + xmm1 = _mm256_hadd_ps(xmm1, xmm2); + xmm1 = _mm256_permutevar8x32_ps(xmm1, idx); - xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS); - xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ); + xmm3 = _mm256_max_ps(xmm1, xmm3); - xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec); - xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec); + xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS); + xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ); - xmm9 = _mm256_add_epi32(xmm11, xmm12); + xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec); + xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec); - xmm8 = _mm256_add_epi32(xmm8, xmm10); - } + xmm9 = _mm256_add_epi32(xmm11, xmm12); - xmm10 = _mm256_set1_epi32(4); - if (num_bytes >> 5 & 1) { - xmm1 = _mm256_loadu_ps((float*)src0); - - xmm1 = _mm256_mul_ps(xmm1, xmm1); + xmm8 = _mm256_add_epi32(xmm8, xmm10); + } - src0 += 4; + xmm10 = _mm256_set1_epi32(4); + if (num_bytes >> 4 & 1) { + xmm1 = _mm256_loadu_ps((float*)src0); - xmm1 = _mm256_hadd_ps(xmm1, xmm1); + xmm1 = _mm256_mul_ps(xmm1, xmm1); - xmm3 = _mm256_max_ps(xmm1, xmm3); + src0 += 4; - xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS); - xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ); + xmm1 = _mm256_hadd_ps(xmm1, xmm1); - xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec); - xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec); + xmm3 = _mm256_max_ps(xmm1, xmm3); - xmm9 = _mm256_add_epi32(xmm11, xmm12); + xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS); + xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ); - xmm8 = _mm256_add_epi32(xmm8, xmm10); - } + xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec); + xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec); - idx = _mm256_set_epi32(1,0,1,0,1,0,1,0); - xmm10 = _mm256_set1_epi32(2); - if (num_bytes >> 4 & 1) { - xmm2 = _mm256_loadu_ps((float*)src0); + xmm9 = _mm256_add_epi32(xmm11, xmm12); - xmm1 = _mm256_permutevar8x32_ps(bit256_p(&xmm8)->float_vec, idx); - xmm8 = bit256_p(&xmm1)->int_vec; + xmm8 = _mm256_add_epi32(xmm8, xmm10); + } - xmm2 = _mm256_mul_ps(xmm2, xmm2); + idx = _mm256_set_epi32(1, 0, 1, 0, 1, 0, 1, 0); + xmm10 = _mm256_set1_epi32(2); + if (num_bytes >> 4 & 1) { + xmm2 = _mm256_loadu_ps((float*)src0); - src0 += 2; + xmm1 = _mm256_permutevar8x32_ps(bit256_p(&xmm8)->float_vec, idx); + xmm8 = bit256_p(&xmm1)->int_vec; - xmm1 = _mm256_hadd_ps(xmm2, xmm2); + xmm2 = _mm256_mul_ps(xmm2, xmm2); - xmm3 = _mm256_max_ps(xmm1, xmm3); + src0 += 2; - xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS); - xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ); + xmm1 = _mm256_hadd_ps(xmm2, xmm2); - xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec); - xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec); + xmm3 = _mm256_max_ps(xmm1, xmm3); - xmm9 = _mm256_add_epi32(xmm11, xmm12); + xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS); + xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ); - xmm8 = _mm256_add_epi32(xmm8, xmm10); - } + xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec); + xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec); - _mm256_storeu_ps((float*)&(holderf.f), xmm3); - _mm256_storeu_si256(&(holderi.int_vec), xmm9); + xmm9 = _mm256_add_epi32(xmm11, xmm12); - target[0] = 
holderi.i[0]; - sq_dist = holderf.f[0]; - target[0] = (holderf.f[1] > sq_dist) ? holderi.i[1] : target[0]; - sq_dist = (holderf.f[1] > sq_dist) ? holderf.f[1] : sq_dist; - target[0] = (holderf.f[2] > sq_dist) ? holderi.i[2] : target[0]; - sq_dist = (holderf.f[2] > sq_dist) ? holderf.f[2] : sq_dist; - target[0] = (holderf.f[3] > sq_dist) ? holderi.i[3] : target[0]; - sq_dist = (holderf.f[3] > sq_dist) ? holderf.f[3] : sq_dist; - target[0] = (holderf.f[4] > sq_dist) ? holderi.i[4] : target[0]; - sq_dist = (holderf.f[4] > sq_dist) ? holderf.f[4] : sq_dist; - target[0] = (holderf.f[5] > sq_dist) ? holderi.i[5] : target[0]; - sq_dist = (holderf.f[5] > sq_dist) ? holderf.f[5] : sq_dist; - target[0] = (holderf.f[6] > sq_dist) ? holderi.i[6] : target[0]; - sq_dist = (holderf.f[6] > sq_dist) ? holderf.f[6] : sq_dist; - target[0] = (holderf.f[7] > sq_dist) ? holderi.i[7] : target[0]; - sq_dist = (holderf.f[7] > sq_dist) ? holderf.f[7] : sq_dist; + xmm8 = _mm256_add_epi32(xmm8, xmm10); + } + _mm256_storeu_ps((float*)&(holderf.f), xmm3); + _mm256_storeu_si256(&(holderi.int_vec), xmm9); + + target[0] = holderi.i[0]; + sq_dist = holderf.f[0]; + target[0] = (holderf.f[1] > sq_dist) ? holderi.i[1] : target[0]; + sq_dist = (holderf.f[1] > sq_dist) ? holderf.f[1] : sq_dist; + target[0] = (holderf.f[2] > sq_dist) ? holderi.i[2] : target[0]; + sq_dist = (holderf.f[2] > sq_dist) ? holderf.f[2] : sq_dist; + target[0] = (holderf.f[3] > sq_dist) ? holderi.i[3] : target[0]; + sq_dist = (holderf.f[3] > sq_dist) ? holderf.f[3] : sq_dist; + target[0] = (holderf.f[4] > sq_dist) ? holderi.i[4] : target[0]; + sq_dist = (holderf.f[4] > sq_dist) ? holderf.f[4] : sq_dist; + target[0] = (holderf.f[5] > sq_dist) ? holderi.i[5] : target[0]; + sq_dist = (holderf.f[5] > sq_dist) ? holderf.f[5] : sq_dist; + target[0] = (holderf.f[6] > sq_dist) ? holderi.i[6] : target[0]; + sq_dist = (holderf.f[6] > sq_dist) ? holderf.f[6] : sq_dist; + target[0] = (holderf.f[7] > sq_dist) ? holderi.i[7] : target[0]; + sq_dist = (holderf.f[7] > sq_dist) ? holderf.f[7] : sq_dist; } #endif /*LV_HAVE_AVX2*/ @@ -523,29 +526,29 @@ volk_32fc_index_max_32u_u_avx2(uint32_t* target, lv_32fc_t* src0, #include #include -static inline void volk_32fc_index_max_32u_neon(uint32_t* target, lv_32fc_t* src0, uint32_t num_points) +static inline void +volk_32fc_index_max_32u_neon(uint32_t* target, lv_32fc_t* src0, uint32_t num_points) { unsigned int number = 0; const uint32_t quarter_points = num_points / 4; const lv_32fc_t* src0Ptr = src0; - - uint32_t indices[4] = {0, 1, 2, 3}; + + uint32_t indices[4] = { 0, 1, 2, 3 }; const uint32x4_t vec_indices_incr = vdupq_n_u32(4); uint32x4_t vec_indices = vld1q_u32(indices); uint32x4_t vec_max_indices = vec_indices; - - if(num_points) - { + + if (num_points) { float max = *src0Ptr; uint32_t index = 0; - + float32x4_t vec_max = vdupq_n_f32(*src0Ptr); - - for(;number < quarter_points; number++) - { + + for (; number < quarter_points; number++) { // Load complex and compute magnitude squared - const float32x4_t vec_mag2 = _vmagnitudesquaredq_f32(vld2q_f32((float*)src0Ptr)); - __VOLK_PREFETCH(src0Ptr+=4); + const float32x4_t vec_mag2 = + _vmagnitudesquaredq_f32(vld2q_f32((float*)src0Ptr)); + __VOLK_PREFETCH(src0Ptr += 4); // a > b? 
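            // Note on the per-lane tracking used here: gt_mask is all-ones in each
            // lane where the new magnitude-squared beats that lane's running
            // maximum; the selects below then keep either the new value and its
            // index or the previous ones, lane by lane, while vec_indices is
            // advanced by 4 each iteration so every lane tracks its own argmax.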
const uint32x4_t gt_mask = vcgtq_f32(vec_mag2, vec_max); vec_max = vbslq_f32(gt_mask, vec_mag2, vec_max); @@ -556,20 +559,19 @@ static inline void volk_32fc_index_max_32u_neon(uint32_t* target, lv_32fc_t* src float tmp_max[4]; vst1q_u32(tmp_max_indices, vec_max_indices); vst1q_f32(tmp_max, vec_max); - + for (int i = 0; i < 4; i++) { if (tmp_max[i] > max) { max = tmp_max[i]; index = tmp_max_indices[i]; } } - + // Deal with the rest - for(number = quarter_points * 4;number < num_points; number++) - { + for (number = quarter_points * 4; number < num_points; number++) { const float re = lv_creal(*src0Ptr); const float im = lv_cimag(*src0Ptr); - if ((re*re+im*im) > max) { + if ((re * re + im * im) > max) { max = *src0Ptr; index = number; } diff --git a/kernels/volk/volk_32fc_magnitude_32f.h b/kernels/volk/volk_32fc_magnitude_32f.h index 1ba6871..6a0a7d8 100644 --- a/kernels/volk/volk_32fc_magnitude_32f.h +++ b/kernels/volk/volk_32fc_magnitude_32f.h @@ -30,8 +30,8 @@ * * Dispatcher Prototype * \code - * void volk_32fc_magnitude_32f(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points) - * \endcode + * void volk_32fc_magnitude_32f(float* magnitudeVector, const lv_32fc_t* complexVector, + * unsigned int num_points) \endcode * * \b Inputs * \li complexVector: The complex input vector. @@ -72,41 +72,41 @@ #define INCLUDED_volk_32fc_magnitude_32f_u_H #include -#include #include +#include #ifdef LV_HAVE_AVX #include #include -static inline void -volk_32fc_magnitude_32f_u_avx(float* magnitudeVector, const lv_32fc_t* complexVector, - unsigned int num_points) +static inline void volk_32fc_magnitude_32f_u_avx(float* magnitudeVector, + const lv_32fc_t* complexVector, + unsigned int num_points) { - unsigned int number = 0; - const unsigned int eighthPoints = num_points / 8; - - const float* complexVectorPtr = (float*) complexVector; - float* magnitudeVectorPtr = magnitudeVector; - - __m256 cplxValue1, cplxValue2, result; - - for(; number < eighthPoints; number++){ - cplxValue1 = _mm256_loadu_ps(complexVectorPtr); - cplxValue2 = _mm256_loadu_ps(complexVectorPtr + 8); - result = _mm256_magnitude_ps(cplxValue1, cplxValue2); - _mm256_storeu_ps(magnitudeVectorPtr, result); - - complexVectorPtr += 16; - magnitudeVectorPtr += 8; - } - - number = eighthPoints * 8; - for(; number < num_points; number++){ - float val1Real = *complexVectorPtr++; - float val1Imag = *complexVectorPtr++; - *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)); - } + unsigned int number = 0; + const unsigned int eighthPoints = num_points / 8; + + const float* complexVectorPtr = (float*)complexVector; + float* magnitudeVectorPtr = magnitudeVector; + + __m256 cplxValue1, cplxValue2, result; + + for (; number < eighthPoints; number++) { + cplxValue1 = _mm256_loadu_ps(complexVectorPtr); + cplxValue2 = _mm256_loadu_ps(complexVectorPtr + 8); + result = _mm256_magnitude_ps(cplxValue1, cplxValue2); + _mm256_storeu_ps(magnitudeVectorPtr, result); + + complexVectorPtr += 16; + magnitudeVectorPtr += 8; + } + + number = eighthPoints * 8; + for (; number < num_points; number++) { + float val1Real = *complexVectorPtr++; + float val1Imag = *complexVectorPtr++; + *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)); + } } #endif /* LV_HAVE_AVX */ @@ -114,137 +114,137 @@ volk_32fc_magnitude_32f_u_avx(float* magnitudeVector, const lv_32fc_t* complexVe #include #include -static inline void -volk_32fc_magnitude_32f_u_sse3(float* magnitudeVector, const lv_32fc_t* complexVector, - 
unsigned int num_points) +static inline void volk_32fc_magnitude_32f_u_sse3(float* magnitudeVector, + const lv_32fc_t* complexVector, + unsigned int num_points) { - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; - const float* complexVectorPtr = (float*) complexVector; - float* magnitudeVectorPtr = magnitudeVector; + const float* complexVectorPtr = (float*)complexVector; + float* magnitudeVectorPtr = magnitudeVector; - __m128 cplxValue1, cplxValue2, result; - for(; number < quarterPoints; number++){ - cplxValue1 = _mm_loadu_ps(complexVectorPtr); - complexVectorPtr += 4; + __m128 cplxValue1, cplxValue2, result; + for (; number < quarterPoints; number++) { + cplxValue1 = _mm_loadu_ps(complexVectorPtr); + complexVectorPtr += 4; - cplxValue2 = _mm_loadu_ps(complexVectorPtr); - complexVectorPtr += 4; + cplxValue2 = _mm_loadu_ps(complexVectorPtr); + complexVectorPtr += 4; - result = _mm_magnitude_ps_sse3(cplxValue1, cplxValue2); + result = _mm_magnitude_ps_sse3(cplxValue1, cplxValue2); - _mm_storeu_ps(magnitudeVectorPtr, result); - magnitudeVectorPtr += 4; - } + _mm_storeu_ps(magnitudeVectorPtr, result); + magnitudeVectorPtr += 4; + } - number = quarterPoints * 4; - for(; number < num_points; number++){ - float val1Real = *complexVectorPtr++; - float val1Imag = *complexVectorPtr++; - *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)); - } + number = quarterPoints * 4; + for (; number < num_points; number++) { + float val1Real = *complexVectorPtr++; + float val1Imag = *complexVectorPtr++; + *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)); + } } #endif /* LV_HAVE_SSE3 */ #ifdef LV_HAVE_SSE -#include #include +#include -static inline void -volk_32fc_magnitude_32f_u_sse(float* magnitudeVector, const lv_32fc_t* complexVector, - unsigned int num_points) +static inline void volk_32fc_magnitude_32f_u_sse(float* magnitudeVector, + const lv_32fc_t* complexVector, + unsigned int num_points) { - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; - const float* complexVectorPtr = (float*) complexVector; - float* magnitudeVectorPtr = magnitudeVector; + const float* complexVectorPtr = (float*)complexVector; + float* magnitudeVectorPtr = magnitudeVector; - __m128 cplxValue1, cplxValue2, result; + __m128 cplxValue1, cplxValue2, result; - for(; number < quarterPoints; number++){ - cplxValue1 = _mm_loadu_ps(complexVectorPtr); - complexVectorPtr += 4; + for (; number < quarterPoints; number++) { + cplxValue1 = _mm_loadu_ps(complexVectorPtr); + complexVectorPtr += 4; - cplxValue2 = _mm_loadu_ps(complexVectorPtr); - complexVectorPtr += 4; + cplxValue2 = _mm_loadu_ps(complexVectorPtr); + complexVectorPtr += 4; - result = _mm_magnitude_ps(cplxValue1, cplxValue2); - _mm_storeu_ps(magnitudeVectorPtr, result); - magnitudeVectorPtr += 4; - } + result = _mm_magnitude_ps(cplxValue1, cplxValue2); + _mm_storeu_ps(magnitudeVectorPtr, result); + magnitudeVectorPtr += 4; + } - number = quarterPoints * 4; - for(; number < num_points; number++){ - float val1Real = *complexVectorPtr++; - float val1Imag = *complexVectorPtr++; - *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)); - } + number = quarterPoints * 4; + for (; number < num_points; number++) { + float val1Real = *complexVectorPtr++; + float val1Imag = *complexVectorPtr++; + 
*magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)); + } } #endif /* LV_HAVE_SSE */ #ifdef LV_HAVE_GENERIC -static inline void -volk_32fc_magnitude_32f_generic(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points) +static inline void volk_32fc_magnitude_32f_generic(float* magnitudeVector, + const lv_32fc_t* complexVector, + unsigned int num_points) { - const float* complexVectorPtr = (float*)complexVector; - float* magnitudeVectorPtr = magnitudeVector; - unsigned int number = 0; - for(number = 0; number < num_points; number++){ - const float real = *complexVectorPtr++; - const float imag = *complexVectorPtr++; - *magnitudeVectorPtr++ = sqrtf((real*real) + (imag*imag)); - } + const float* complexVectorPtr = (float*)complexVector; + float* magnitudeVectorPtr = magnitudeVector; + unsigned int number = 0; + for (number = 0; number < num_points; number++) { + const float real = *complexVectorPtr++; + const float imag = *complexVectorPtr++; + *magnitudeVectorPtr++ = sqrtf((real * real) + (imag * imag)); + } } #endif /* LV_HAVE_GENERIC */ - #endif /* INCLUDED_volk_32fc_magnitude_32f_u_H */ #ifndef INCLUDED_volk_32fc_magnitude_32f_a_H #define INCLUDED_volk_32fc_magnitude_32f_a_H #include -#include #include +#include #ifdef LV_HAVE_AVX #include #include -static inline void -volk_32fc_magnitude_32f_a_avx(float* magnitudeVector, const lv_32fc_t* complexVector, - unsigned int num_points) +static inline void volk_32fc_magnitude_32f_a_avx(float* magnitudeVector, + const lv_32fc_t* complexVector, + unsigned int num_points) { - unsigned int number = 0; - const unsigned int eighthPoints = num_points / 8; - - const float* complexVectorPtr = (float*) complexVector; - float* magnitudeVectorPtr = magnitudeVector; - - __m256 cplxValue1, cplxValue2, result; - for(; number < eighthPoints; number++){ - cplxValue1 = _mm256_load_ps(complexVectorPtr); - complexVectorPtr += 8; - - cplxValue2 = _mm256_load_ps(complexVectorPtr); - complexVectorPtr += 8; - - result = _mm256_magnitude_ps(cplxValue1, cplxValue2); - _mm256_store_ps(magnitudeVectorPtr, result); - magnitudeVectorPtr += 8; - } - - number = eighthPoints * 8; - for(; number < num_points; number++){ - float val1Real = *complexVectorPtr++; - float val1Imag = *complexVectorPtr++; - *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)); - } + unsigned int number = 0; + const unsigned int eighthPoints = num_points / 8; + + const float* complexVectorPtr = (float*)complexVector; + float* magnitudeVectorPtr = magnitudeVector; + + __m256 cplxValue1, cplxValue2, result; + for (; number < eighthPoints; number++) { + cplxValue1 = _mm256_load_ps(complexVectorPtr); + complexVectorPtr += 8; + + cplxValue2 = _mm256_load_ps(complexVectorPtr); + complexVectorPtr += 8; + + result = _mm256_magnitude_ps(cplxValue1, cplxValue2); + _mm256_store_ps(magnitudeVectorPtr, result); + magnitudeVectorPtr += 8; + } + + number = eighthPoints * 8; + for (; number < num_points; number++) { + float val1Real = *complexVectorPtr++; + float val1Imag = *complexVectorPtr++; + *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)); + } } #endif /* LV_HAVE_AVX */ @@ -252,89 +252,89 @@ volk_32fc_magnitude_32f_a_avx(float* magnitudeVector, const lv_32fc_t* complexVe #include #include -static inline void -volk_32fc_magnitude_32f_a_sse3(float* magnitudeVector, const lv_32fc_t* complexVector, - unsigned int num_points) +static inline void volk_32fc_magnitude_32f_a_sse3(float* magnitudeVector, + const lv_32fc_t* 
complexVector, + unsigned int num_points) { - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - const float* complexVectorPtr = (float*) complexVector; - float* magnitudeVectorPtr = magnitudeVector; - - __m128 cplxValue1, cplxValue2, result; - for(; number < quarterPoints; number++){ - cplxValue1 = _mm_load_ps(complexVectorPtr); - complexVectorPtr += 4; - - cplxValue2 = _mm_load_ps(complexVectorPtr); - complexVectorPtr += 4; - - result = _mm_magnitude_ps_sse3(cplxValue1, cplxValue2); - _mm_store_ps(magnitudeVectorPtr, result); - magnitudeVectorPtr += 4; - } - - number = quarterPoints * 4; - for(; number < num_points; number++){ - float val1Real = *complexVectorPtr++; - float val1Imag = *complexVectorPtr++; - *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)); - } + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + const float* complexVectorPtr = (float*)complexVector; + float* magnitudeVectorPtr = magnitudeVector; + + __m128 cplxValue1, cplxValue2, result; + for (; number < quarterPoints; number++) { + cplxValue1 = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; + + cplxValue2 = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; + + result = _mm_magnitude_ps_sse3(cplxValue1, cplxValue2); + _mm_store_ps(magnitudeVectorPtr, result); + magnitudeVectorPtr += 4; + } + + number = quarterPoints * 4; + for (; number < num_points; number++) { + float val1Real = *complexVectorPtr++; + float val1Imag = *complexVectorPtr++; + *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)); + } } #endif /* LV_HAVE_SSE3 */ #ifdef LV_HAVE_SSE -#include #include +#include -static inline void -volk_32fc_magnitude_32f_a_sse(float* magnitudeVector, const lv_32fc_t* complexVector, - unsigned int num_points) +static inline void volk_32fc_magnitude_32f_a_sse(float* magnitudeVector, + const lv_32fc_t* complexVector, + unsigned int num_points) { - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - const float* complexVectorPtr = (float*) complexVector; - float* magnitudeVectorPtr = magnitudeVector; - - __m128 cplxValue1, cplxValue2, result; - for(; number < quarterPoints; number++){ - cplxValue1 = _mm_load_ps(complexVectorPtr); - complexVectorPtr += 4; - - cplxValue2 = _mm_load_ps(complexVectorPtr); - complexVectorPtr += 4; - - result = _mm_magnitude_ps(cplxValue1, cplxValue2); - _mm_store_ps(magnitudeVectorPtr, result); - magnitudeVectorPtr += 4; - } - - number = quarterPoints * 4; - for(; number < num_points; number++){ - float val1Real = *complexVectorPtr++; - float val1Imag = *complexVectorPtr++; - *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)); - } + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + const float* complexVectorPtr = (float*)complexVector; + float* magnitudeVectorPtr = magnitudeVector; + + __m128 cplxValue1, cplxValue2, result; + for (; number < quarterPoints; number++) { + cplxValue1 = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; + + cplxValue2 = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; + + result = _mm_magnitude_ps(cplxValue1, cplxValue2); + _mm_store_ps(magnitudeVectorPtr, result); + magnitudeVectorPtr += 4; + } + + number = quarterPoints * 4; + for (; number < num_points; number++) { + float val1Real = *complexVectorPtr++; + float val1Imag = *complexVectorPtr++; + *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)); + } } #endif /* 
LV_HAVE_SSE */ #ifdef LV_HAVE_GENERIC -static inline void -volk_32fc_magnitude_32f_a_generic(float* magnitudeVector, const lv_32fc_t* complexVector, - unsigned int num_points) +static inline void volk_32fc_magnitude_32f_a_generic(float* magnitudeVector, + const lv_32fc_t* complexVector, + unsigned int num_points) { - const float* complexVectorPtr = (float*)complexVector; - float* magnitudeVectorPtr = magnitudeVector; - unsigned int number = 0; - for(number = 0; number < num_points; number++){ - const float real = *complexVectorPtr++; - const float imag = *complexVectorPtr++; - *magnitudeVectorPtr++ = sqrtf((real*real) + (imag*imag)); - } + const float* complexVectorPtr = (float*)complexVector; + float* magnitudeVectorPtr = magnitudeVector; + unsigned int number = 0; + for (number = 0; number < num_points; number++) { + const float real = *complexVectorPtr++; + const float imag = *complexVectorPtr++; + *magnitudeVectorPtr++ = sqrtf((real * real) + (imag * imag)); + } } #endif /* LV_HAVE_GENERIC */ @@ -342,41 +342,43 @@ volk_32fc_magnitude_32f_a_generic(float* magnitudeVector, const lv_32fc_t* compl #ifdef LV_HAVE_NEON #include -static inline void -volk_32fc_magnitude_32f_neon(float* magnitudeVector, const lv_32fc_t* complexVector, - unsigned int num_points) +static inline void volk_32fc_magnitude_32f_neon(float* magnitudeVector, + const lv_32fc_t* complexVector, + unsigned int num_points) { - unsigned int number; - unsigned int quarter_points = num_points / 4; - const float* complexVectorPtr = (float*)complexVector; - float* magnitudeVectorPtr = magnitudeVector; - - float32x4x2_t complex_vec; - float32x4_t magnitude_vec; - for(number = 0; number < quarter_points; number++){ - complex_vec = vld2q_f32(complexVectorPtr); - complex_vec.val[0] = vmulq_f32(complex_vec.val[0], complex_vec.val[0]); - magnitude_vec = vmlaq_f32(complex_vec.val[0], complex_vec.val[1], complex_vec.val[1]); - magnitude_vec = vrsqrteq_f32(magnitude_vec); - magnitude_vec = vrecpeq_f32( magnitude_vec ); // no plain ol' sqrt - vst1q_f32(magnitudeVectorPtr, magnitude_vec); - - complexVectorPtr += 8; - magnitudeVectorPtr += 4; - } - - for(number = quarter_points*4; number < num_points; number++){ - const float real = *complexVectorPtr++; - const float imag = *complexVectorPtr++; - *magnitudeVectorPtr++ = sqrtf((real*real) + (imag*imag)); - } + unsigned int number; + unsigned int quarter_points = num_points / 4; + const float* complexVectorPtr = (float*)complexVector; + float* magnitudeVectorPtr = magnitudeVector; + + float32x4x2_t complex_vec; + float32x4_t magnitude_vec; + for (number = 0; number < quarter_points; number++) { + complex_vec = vld2q_f32(complexVectorPtr); + complex_vec.val[0] = vmulq_f32(complex_vec.val[0], complex_vec.val[0]); + magnitude_vec = + vmlaq_f32(complex_vec.val[0], complex_vec.val[1], complex_vec.val[1]); + magnitude_vec = vrsqrteq_f32(magnitude_vec); + magnitude_vec = vrecpeq_f32(magnitude_vec); // no plain ol' sqrt + vst1q_f32(magnitudeVectorPtr, magnitude_vec); + + complexVectorPtr += 8; + magnitudeVectorPtr += 4; + } + + for (number = quarter_points * 4; number < num_points; number++) { + const float real = *complexVectorPtr++; + const float imag = *complexVectorPtr++; + *magnitudeVectorPtr++ = sqrtf((real * real) + (imag * imag)); + } } #endif /* LV_HAVE_NEON */ #ifdef LV_HAVE_NEON /*! 
- \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector + \brief Calculates the magnitude of the complexVector and stores the results in the + magnitudeVector This is an approximation from "Streamlining Digital Signal Processing" by Richard Lyons. Apparently max error is about 1% and mean error is about 0.6%. @@ -387,80 +389,80 @@ volk_32fc_magnitude_32f_neon(float* magnitudeVector, const lv_32fc_t* complexVec \param complexVector The vector containing the complex input values \param magnitudeVector The vector containing the real output values - \param num_points The number of complex values in complexVector to be calculated and stored into cVector + \param num_points The number of complex values in complexVector to be calculated and + stored into cVector */ -static inline void -volk_32fc_magnitude_32f_neon_fancy_sweet(float* magnitudeVector, const lv_32fc_t* complexVector, - unsigned int num_points) +static inline void volk_32fc_magnitude_32f_neon_fancy_sweet( + float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points) { - unsigned int number; - unsigned int quarter_points = num_points / 4; - const float* complexVectorPtr = (float*)complexVector; - float* magnitudeVectorPtr = magnitudeVector; - - const float threshold = 0.4142135; - - float32x4_t a_vec, b_vec, a_high, a_low, b_high, b_low; - a_high = vdupq_n_f32( 0.84 ); - b_high = vdupq_n_f32( 0.561); - a_low = vdupq_n_f32( 0.99 ); - b_low = vdupq_n_f32( 0.197); - - uint32x4_t comp0, comp1; - - float32x4x2_t complex_vec; - float32x4_t min_vec, max_vec, magnitude_vec; - float32x4_t real_abs, imag_abs; - for(number = 0; number < quarter_points; number++){ - complex_vec = vld2q_f32(complexVectorPtr); - - real_abs = vabsq_f32(complex_vec.val[0]); - imag_abs = vabsq_f32(complex_vec.val[1]); - - min_vec = vminq_f32(real_abs, imag_abs); - max_vec = vmaxq_f32(real_abs, imag_abs); - - // effective branch to choose coefficient pair. 
- comp0 = vcgtq_f32(min_vec, vmulq_n_f32(max_vec, threshold)); - comp1 = vcleq_f32(min_vec, vmulq_n_f32(max_vec, threshold)); - - // and 0s or 1s with coefficients from previous effective branch - a_vec = (float32x4_t)vaddq_s32(vandq_s32((int32x4_t)comp0, (int32x4_t)a_high), - vandq_s32((int32x4_t)comp1, (int32x4_t)a_low)); - b_vec = (float32x4_t)vaddq_s32(vandq_s32((int32x4_t)comp0, (int32x4_t)b_high), - vandq_s32((int32x4_t)comp1, (int32x4_t)b_low)); - - // coefficients chosen, do the weighted sum - min_vec = vmulq_f32(min_vec, b_vec); - max_vec = vmulq_f32(max_vec, a_vec); - - magnitude_vec = vaddq_f32(min_vec, max_vec); - vst1q_f32(magnitudeVectorPtr, magnitude_vec); - - complexVectorPtr += 8; - magnitudeVectorPtr += 4; - } - - for(number = quarter_points*4; number < num_points; number++){ - const float real = *complexVectorPtr++; - const float imag = *complexVectorPtr++; - *magnitudeVectorPtr++ = sqrtf((real*real) + (imag*imag)); - } + unsigned int number; + unsigned int quarter_points = num_points / 4; + const float* complexVectorPtr = (float*)complexVector; + float* magnitudeVectorPtr = magnitudeVector; + + const float threshold = 0.4142135; + + float32x4_t a_vec, b_vec, a_high, a_low, b_high, b_low; + a_high = vdupq_n_f32(0.84); + b_high = vdupq_n_f32(0.561); + a_low = vdupq_n_f32(0.99); + b_low = vdupq_n_f32(0.197); + + uint32x4_t comp0, comp1; + + float32x4x2_t complex_vec; + float32x4_t min_vec, max_vec, magnitude_vec; + float32x4_t real_abs, imag_abs; + for (number = 0; number < quarter_points; number++) { + complex_vec = vld2q_f32(complexVectorPtr); + + real_abs = vabsq_f32(complex_vec.val[0]); + imag_abs = vabsq_f32(complex_vec.val[1]); + + min_vec = vminq_f32(real_abs, imag_abs); + max_vec = vmaxq_f32(real_abs, imag_abs); + + // effective branch to choose coefficient pair. 
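        // Scalar reference for what the vector code below computes (comment only):
        // with a = max(|re|, |im|) and b = min(|re|, |im|),
        //   |z| ~= 0.84 * a + 0.561 * b   if b > 0.4142135 * a
        //   |z| ~= 0.99 * a + 0.197 * b   otherwise,
        // where the threshold 0.4142135 ~= tan(pi/8) = sqrt(2) - 1 selects the
        // coefficient pair, matching a_high/b_high and a_low/b_low above.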
+ comp0 = vcgtq_f32(min_vec, vmulq_n_f32(max_vec, threshold)); + comp1 = vcleq_f32(min_vec, vmulq_n_f32(max_vec, threshold)); + + // and 0s or 1s with coefficients from previous effective branch + a_vec = (float32x4_t)vaddq_s32(vandq_s32((int32x4_t)comp0, (int32x4_t)a_high), + vandq_s32((int32x4_t)comp1, (int32x4_t)a_low)); + b_vec = (float32x4_t)vaddq_s32(vandq_s32((int32x4_t)comp0, (int32x4_t)b_high), + vandq_s32((int32x4_t)comp1, (int32x4_t)b_low)); + + // coefficients chosen, do the weighted sum + min_vec = vmulq_f32(min_vec, b_vec); + max_vec = vmulq_f32(max_vec, a_vec); + + magnitude_vec = vaddq_f32(min_vec, max_vec); + vst1q_f32(magnitudeVectorPtr, magnitude_vec); + + complexVectorPtr += 8; + magnitudeVectorPtr += 4; + } + + for (number = quarter_points * 4; number < num_points; number++) { + const float real = *complexVectorPtr++; + const float imag = *complexVectorPtr++; + *magnitudeVectorPtr++ = sqrtf((real * real) + (imag * imag)); + } } #endif /* LV_HAVE_NEON */ #ifdef LV_HAVE_ORC -extern void -volk_32fc_magnitude_32f_a_orc_impl(float* magnitudeVector, const lv_32fc_t* complexVector, - unsigned int num_points); +extern void volk_32fc_magnitude_32f_a_orc_impl(float* magnitudeVector, + const lv_32fc_t* complexVector, + unsigned int num_points); -static inline void -volk_32fc_magnitude_32f_u_orc(float* magnitudeVector, const lv_32fc_t* complexVector, - unsigned int num_points) +static inline void volk_32fc_magnitude_32f_u_orc(float* magnitudeVector, + const lv_32fc_t* complexVector, + unsigned int num_points) { - volk_32fc_magnitude_32f_a_orc_impl(magnitudeVector, complexVector, num_points); + volk_32fc_magnitude_32f_a_orc_impl(magnitudeVector, complexVector, num_points); } #endif /* LV_HAVE_ORC */ diff --git a/kernels/volk/volk_32fc_magnitude_squared_32f.h b/kernels/volk/volk_32fc_magnitude_squared_32f.h index 51bb4df..cb093ca 100644 --- a/kernels/volk/volk_32fc_magnitude_squared_32f.h +++ b/kernels/volk/volk_32fc_magnitude_squared_32f.h @@ -30,8 +30,8 @@ * * Dispatcher Prototype * \code - * void volk_32fc_magnitude_squared_32f(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points) - * \endcode + * void volk_32fc_magnitude_squared_32f(float* magnitudeVector, const lv_32fc_t* + * complexVector, unsigned int num_points) \endcode * * \b Inputs * \li complexVector: The complex input vector. 
@@ -72,41 +72,41 @@ #define INCLUDED_volk_32fc_magnitude_squared_32f_u_H #include -#include #include +#include #ifdef LV_HAVE_AVX #include #include -static inline void -volk_32fc_magnitude_squared_32f_u_avx(float* magnitudeVector, const lv_32fc_t* complexVector, - unsigned int num_points) +static inline void volk_32fc_magnitude_squared_32f_u_avx(float* magnitudeVector, + const lv_32fc_t* complexVector, + unsigned int num_points) { - unsigned int number = 0; - const unsigned int eighthPoints = num_points / 8; - - const float* complexVectorPtr = (float*) complexVector; - float* magnitudeVectorPtr = magnitudeVector; - - __m256 cplxValue1, cplxValue2, result; - - for(; number < eighthPoints; number++){ - cplxValue1 = _mm256_loadu_ps(complexVectorPtr); - cplxValue2 = _mm256_loadu_ps(complexVectorPtr + 8); - result = _mm256_magnitudesquared_ps(cplxValue1, cplxValue2); - _mm256_storeu_ps(magnitudeVectorPtr, result); - - complexVectorPtr += 16; - magnitudeVectorPtr += 8; - } - - number = eighthPoints * 8; - for(; number < num_points; number++){ - float val1Real = *complexVectorPtr++; - float val1Imag = *complexVectorPtr++; - *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag); - } + unsigned int number = 0; + const unsigned int eighthPoints = num_points / 8; + + const float* complexVectorPtr = (float*)complexVector; + float* magnitudeVectorPtr = magnitudeVector; + + __m256 cplxValue1, cplxValue2, result; + + for (; number < eighthPoints; number++) { + cplxValue1 = _mm256_loadu_ps(complexVectorPtr); + cplxValue2 = _mm256_loadu_ps(complexVectorPtr + 8); + result = _mm256_magnitudesquared_ps(cplxValue1, cplxValue2); + _mm256_storeu_ps(magnitudeVectorPtr, result); + + complexVectorPtr += 16; + magnitudeVectorPtr += 8; + } + + number = eighthPoints * 8; + for (; number < num_points; number++) { + float val1Real = *complexVectorPtr++; + float val1Imag = *complexVectorPtr++; + *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag); + } } #endif /* LV_HAVE_AVX */ @@ -115,137 +115,136 @@ volk_32fc_magnitude_squared_32f_u_avx(float* magnitudeVector, const lv_32fc_t* c #include #include -static inline void -volk_32fc_magnitude_squared_32f_u_sse3(float* magnitudeVector, const lv_32fc_t* complexVector, - unsigned int num_points) +static inline void volk_32fc_magnitude_squared_32f_u_sse3(float* magnitudeVector, + const lv_32fc_t* complexVector, + unsigned int num_points) { - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - const float* complexVectorPtr = (float*) complexVector; - float* magnitudeVectorPtr = magnitudeVector; - - __m128 cplxValue1, cplxValue2, result; - for(; number < quarterPoints; number++){ - cplxValue1 = _mm_loadu_ps(complexVectorPtr); - complexVectorPtr += 4; - - cplxValue2 = _mm_loadu_ps(complexVectorPtr); - complexVectorPtr += 4; - - result = _mm_magnitudesquared_ps_sse3(cplxValue1, cplxValue2); - _mm_storeu_ps(magnitudeVectorPtr, result); - magnitudeVectorPtr += 4; - } - - number = quarterPoints * 4; - for(; number < num_points; number++){ - float val1Real = *complexVectorPtr++; - float val1Imag = *complexVectorPtr++; - *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag); - } + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + const float* complexVectorPtr = (float*)complexVector; + float* magnitudeVectorPtr = magnitudeVector; + + __m128 cplxValue1, cplxValue2, result; + for (; number < quarterPoints; number++) { + cplxValue1 = _mm_loadu_ps(complexVectorPtr); + 
complexVectorPtr += 4; + + cplxValue2 = _mm_loadu_ps(complexVectorPtr); + complexVectorPtr += 4; + + result = _mm_magnitudesquared_ps_sse3(cplxValue1, cplxValue2); + _mm_storeu_ps(magnitudeVectorPtr, result); + magnitudeVectorPtr += 4; + } + + number = quarterPoints * 4; + for (; number < num_points; number++) { + float val1Real = *complexVectorPtr++; + float val1Imag = *complexVectorPtr++; + *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag); + } } #endif /* LV_HAVE_SSE3 */ #ifdef LV_HAVE_SSE -#include #include +#include -static inline void -volk_32fc_magnitude_squared_32f_u_sse(float* magnitudeVector, const lv_32fc_t* complexVector, - unsigned int num_points) +static inline void volk_32fc_magnitude_squared_32f_u_sse(float* magnitudeVector, + const lv_32fc_t* complexVector, + unsigned int num_points) { - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; - const float* complexVectorPtr = (float*) complexVector; - float* magnitudeVectorPtr = magnitudeVector; + const float* complexVectorPtr = (float*)complexVector; + float* magnitudeVectorPtr = magnitudeVector; - __m128 cplxValue1, cplxValue2, result; + __m128 cplxValue1, cplxValue2, result; - for(; number < quarterPoints; number++){ - cplxValue1 = _mm_loadu_ps(complexVectorPtr); - complexVectorPtr += 4; + for (; number < quarterPoints; number++) { + cplxValue1 = _mm_loadu_ps(complexVectorPtr); + complexVectorPtr += 4; - cplxValue2 = _mm_loadu_ps(complexVectorPtr); - complexVectorPtr += 4; + cplxValue2 = _mm_loadu_ps(complexVectorPtr); + complexVectorPtr += 4; - result = _mm_magnitudesquared_ps(cplxValue1, cplxValue2); - _mm_storeu_ps(magnitudeVectorPtr, result); - magnitudeVectorPtr += 4; - } + result = _mm_magnitudesquared_ps(cplxValue1, cplxValue2); + _mm_storeu_ps(magnitudeVectorPtr, result); + magnitudeVectorPtr += 4; + } - number = quarterPoints * 4; - for(; number < num_points; number++){ - float val1Real = *complexVectorPtr++; - float val1Imag = *complexVectorPtr++; - *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag); - } + number = quarterPoints * 4; + for (; number < num_points; number++) { + float val1Real = *complexVectorPtr++; + float val1Imag = *complexVectorPtr++; + *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag); + } } #endif /* LV_HAVE_SSE */ #ifdef LV_HAVE_GENERIC -static inline void -volk_32fc_magnitude_squared_32f_generic(float* magnitudeVector, const lv_32fc_t* complexVector, - unsigned int num_points) +static inline void volk_32fc_magnitude_squared_32f_generic(float* magnitudeVector, + const lv_32fc_t* complexVector, + unsigned int num_points) { - const float* complexVectorPtr = (float*)complexVector; - float* magnitudeVectorPtr = magnitudeVector; - unsigned int number = 0; - for(number = 0; number < num_points; number++){ - const float real = *complexVectorPtr++; - const float imag = *complexVectorPtr++; - *magnitudeVectorPtr++ = (real*real) + (imag*imag); - } + const float* complexVectorPtr = (float*)complexVector; + float* magnitudeVectorPtr = magnitudeVector; + unsigned int number = 0; + for (number = 0; number < num_points; number++) { + const float real = *complexVectorPtr++; + const float imag = *complexVectorPtr++; + *magnitudeVectorPtr++ = (real * real) + (imag * imag); + } } #endif /* LV_HAVE_GENERIC */ - #endif /* INCLUDED_volk_32fc_magnitude_32f_u_H */ #ifndef INCLUDED_volk_32fc_magnitude_squared_32f_a_H #define 
INCLUDED_volk_32fc_magnitude_squared_32f_a_H #include -#include #include +#include #ifdef LV_HAVE_AVX #include #include -static inline void -volk_32fc_magnitude_squared_32f_a_avx(float* magnitudeVector, const lv_32fc_t* complexVector, - unsigned int num_points) +static inline void volk_32fc_magnitude_squared_32f_a_avx(float* magnitudeVector, + const lv_32fc_t* complexVector, + unsigned int num_points) { - unsigned int number = 0; - const unsigned int eighthPoints = num_points / 8; - - const float* complexVectorPtr = (float*) complexVector; - float* magnitudeVectorPtr = magnitudeVector; - - __m256 cplxValue1, cplxValue2, result; - for(; number < eighthPoints; number++){ - cplxValue1 = _mm256_load_ps(complexVectorPtr); - complexVectorPtr += 8; - - cplxValue2 = _mm256_load_ps(complexVectorPtr); - complexVectorPtr += 8; - - result = _mm256_magnitudesquared_ps(cplxValue1, cplxValue2); - _mm256_store_ps(magnitudeVectorPtr, result); - magnitudeVectorPtr += 8; - } - - number = eighthPoints * 8; - for(; number < num_points; number++){ - float val1Real = *complexVectorPtr++; - float val1Imag = *complexVectorPtr++; - *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag); - } + unsigned int number = 0; + const unsigned int eighthPoints = num_points / 8; + + const float* complexVectorPtr = (float*)complexVector; + float* magnitudeVectorPtr = magnitudeVector; + + __m256 cplxValue1, cplxValue2, result; + for (; number < eighthPoints; number++) { + cplxValue1 = _mm256_load_ps(complexVectorPtr); + complexVectorPtr += 8; + + cplxValue2 = _mm256_load_ps(complexVectorPtr); + complexVectorPtr += 8; + + result = _mm256_magnitudesquared_ps(cplxValue1, cplxValue2); + _mm256_store_ps(magnitudeVectorPtr, result); + magnitudeVectorPtr += 8; + } + + number = eighthPoints * 8; + for (; number < num_points; number++) { + float val1Real = *complexVectorPtr++; + float val1Imag = *complexVectorPtr++; + *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag); + } } #endif /* LV_HAVE_AVX */ @@ -254,72 +253,72 @@ volk_32fc_magnitude_squared_32f_a_avx(float* magnitudeVector, const lv_32fc_t* c #include #include -static inline void -volk_32fc_magnitude_squared_32f_a_sse3(float* magnitudeVector, const lv_32fc_t* complexVector, - unsigned int num_points) +static inline void volk_32fc_magnitude_squared_32f_a_sse3(float* magnitudeVector, + const lv_32fc_t* complexVector, + unsigned int num_points) { - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - const float* complexVectorPtr = (float*) complexVector; - float* magnitudeVectorPtr = magnitudeVector; - - __m128 cplxValue1, cplxValue2, result; - for(; number < quarterPoints; number++){ - cplxValue1 = _mm_load_ps(complexVectorPtr); - complexVectorPtr += 4; - - cplxValue2 = _mm_load_ps(complexVectorPtr); - complexVectorPtr += 4; - - result = _mm_magnitudesquared_ps_sse3(cplxValue1, cplxValue2); - _mm_store_ps(magnitudeVectorPtr, result); - magnitudeVectorPtr += 4; - } - - number = quarterPoints * 4; - for(; number < num_points; number++){ - float val1Real = *complexVectorPtr++; - float val1Imag = *complexVectorPtr++; - *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag); - } + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + const float* complexVectorPtr = (float*)complexVector; + float* magnitudeVectorPtr = magnitudeVector; + + __m128 cplxValue1, cplxValue2, result; + for (; number < quarterPoints; number++) { + cplxValue1 = _mm_load_ps(complexVectorPtr); + 
complexVectorPtr += 4; + + cplxValue2 = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; + + result = _mm_magnitudesquared_ps_sse3(cplxValue1, cplxValue2); + _mm_store_ps(magnitudeVectorPtr, result); + magnitudeVectorPtr += 4; + } + + number = quarterPoints * 4; + for (; number < num_points; number++) { + float val1Real = *complexVectorPtr++; + float val1Imag = *complexVectorPtr++; + *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag); + } } #endif /* LV_HAVE_SSE3 */ #ifdef LV_HAVE_SSE -#include #include +#include -static inline void -volk_32fc_magnitude_squared_32f_a_sse(float* magnitudeVector, const lv_32fc_t* complexVector, - unsigned int num_points) +static inline void volk_32fc_magnitude_squared_32f_a_sse(float* magnitudeVector, + const lv_32fc_t* complexVector, + unsigned int num_points) { - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - const float* complexVectorPtr = (float*)complexVector; - float* magnitudeVectorPtr = magnitudeVector; - - __m128 cplxValue1, cplxValue2, result; - for(;number < quarterPoints; number++){ - cplxValue1 = _mm_load_ps(complexVectorPtr); - complexVectorPtr += 4; - - cplxValue2 = _mm_load_ps(complexVectorPtr); - complexVectorPtr += 4; - - result = _mm_magnitudesquared_ps(cplxValue1, cplxValue2); - _mm_store_ps(magnitudeVectorPtr, result); - magnitudeVectorPtr += 4; - } - - number = quarterPoints * 4; - for(; number < num_points; number++){ - float val1Real = *complexVectorPtr++; - float val1Imag = *complexVectorPtr++; - *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag); - } + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + const float* complexVectorPtr = (float*)complexVector; + float* magnitudeVectorPtr = magnitudeVector; + + __m128 cplxValue1, cplxValue2, result; + for (; number < quarterPoints; number++) { + cplxValue1 = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; + + cplxValue2 = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; + + result = _mm_magnitudesquared_ps(cplxValue1, cplxValue2); + _mm_store_ps(magnitudeVectorPtr, result); + magnitudeVectorPtr += 4; + } + + number = quarterPoints * 4; + for (; number < num_points; number++) { + float val1Real = *complexVectorPtr++; + float val1Imag = *complexVectorPtr++; + *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag); + } } #endif /* LV_HAVE_SSE */ @@ -327,55 +326,57 @@ volk_32fc_magnitude_squared_32f_a_sse(float* magnitudeVector, const lv_32fc_t* c #ifdef LV_HAVE_NEON #include -static inline void -volk_32fc_magnitude_squared_32f_neon(float* magnitudeVector, const lv_32fc_t* complexVector, - unsigned int num_points) +static inline void volk_32fc_magnitude_squared_32f_neon(float* magnitudeVector, + const lv_32fc_t* complexVector, + unsigned int num_points) { - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - const float* complexVectorPtr = (float*)complexVector; - float* magnitudeVectorPtr = magnitudeVector; - - float32x4x2_t cmplx_val; - float32x4_t result; - for(;number < quarterPoints; number++){ - cmplx_val = vld2q_f32(complexVectorPtr); - complexVectorPtr += 8; - - cmplx_val.val[0] = vmulq_f32(cmplx_val.val[0], cmplx_val.val[0]); // Square the values - cmplx_val.val[1] = vmulq_f32(cmplx_val.val[1], cmplx_val.val[1]); // Square the values - - result = vaddq_f32(cmplx_val.val[0], cmplx_val.val[1]); // Add the I2 and Q2 values - - vst1q_f32(magnitudeVectorPtr, result); - magnitudeVectorPtr += 4; - } - - number = 
quarterPoints * 4; - for(; number < num_points; number++){ - float val1Real = *complexVectorPtr++; - float val1Imag = *complexVectorPtr++; - *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag); - } + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + const float* complexVectorPtr = (float*)complexVector; + float* magnitudeVectorPtr = magnitudeVector; + + float32x4x2_t cmplx_val; + float32x4_t result; + for (; number < quarterPoints; number++) { + cmplx_val = vld2q_f32(complexVectorPtr); + complexVectorPtr += 8; + + cmplx_val.val[0] = + vmulq_f32(cmplx_val.val[0], cmplx_val.val[0]); // Square the values + cmplx_val.val[1] = + vmulq_f32(cmplx_val.val[1], cmplx_val.val[1]); // Square the values + + result = + vaddq_f32(cmplx_val.val[0], cmplx_val.val[1]); // Add the I2 and Q2 values + + vst1q_f32(magnitudeVectorPtr, result); + magnitudeVectorPtr += 4; + } + + number = quarterPoints * 4; + for (; number < num_points; number++) { + float val1Real = *complexVectorPtr++; + float val1Imag = *complexVectorPtr++; + *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag); + } } #endif /* LV_HAVE_NEON */ #ifdef LV_HAVE_GENERIC -static inline void -volk_32fc_magnitude_squared_32f_a_generic(float* magnitudeVector, const lv_32fc_t* complexVector, - unsigned int num_points) +static inline void volk_32fc_magnitude_squared_32f_a_generic( + float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points) { - const float* complexVectorPtr = (float*)complexVector; - float* magnitudeVectorPtr = magnitudeVector; - unsigned int number = 0; - for(number = 0; number < num_points; number++){ - const float real = *complexVectorPtr++; - const float imag = *complexVectorPtr++; - *magnitudeVectorPtr++ = (real*real) + (imag*imag); - } + const float* complexVectorPtr = (float*)complexVector; + float* magnitudeVectorPtr = magnitudeVector; + unsigned int number = 0; + for (number = 0; number < num_points; number++) { + const float real = *complexVectorPtr++; + const float imag = *complexVectorPtr++; + *magnitudeVectorPtr++ = (real * real) + (imag * imag); + } } #endif /* LV_HAVE_GENERIC */ diff --git a/kernels/volk/volk_32fc_s32f_atan2_32f.h b/kernels/volk/volk_32fc_s32f_atan2_32f.h index c169336..f08f793 100644 --- a/kernels/volk/volk_32fc_s32f_atan2_32f.h +++ b/kernels/volk/volk_32fc_s32f_atan2_32f.h @@ -30,13 +30,13 @@ * * Dispatcher Prototype * \code - * void volk_32fc_s32f_atan2_32f(float* outputVector, const lv_32fc_t* complexVector, const float normalizeFactor, unsigned int num_points) - * \endcode + * void volk_32fc_s32f_atan2_32f(float* outputVector, const lv_32fc_t* complexVector, + * const float normalizeFactor, unsigned int num_points) \endcode * * \b Inputs - * \li inputVector: The byte-aligned input vector containing interleaved IQ data (I = cos, Q = sin). - * \li normalizeFactor: The atan results are divided by this normalization factor. - * \li num_points: The number of complex values in \p inputVector. + * \li inputVector: The byte-aligned input vector containing interleaved IQ data (I = cos, + * Q = sin). \li normalizeFactor: The atan results are divided by this normalization + * factor. \li num_points: The number of complex values in \p inputVector. * * \b Outputs * \li outputVector: The vector where the results will be stored. 
@@ -75,8 +75,8 @@ #define INCLUDED_volk_32fc_s32f_atan2_32f_a_H #include -#include #include +#include #ifdef LV_HAVE_SSE4_1 #include @@ -85,50 +85,54 @@ #include #endif /* LV_HAVE_LIB_SIMDMATH */ -static inline void volk_32fc_s32f_atan2_32f_a_sse4_1(float* outputVector, const lv_32fc_t* complexVector, const float normalizeFactor, unsigned int num_points){ - const float* complexVectorPtr = (float*)complexVector; - float* outPtr = outputVector; +static inline void volk_32fc_s32f_atan2_32f_a_sse4_1(float* outputVector, + const lv_32fc_t* complexVector, + const float normalizeFactor, + unsigned int num_points) +{ + const float* complexVectorPtr = (float*)complexVector; + float* outPtr = outputVector; - unsigned int number = 0; - const float invNormalizeFactor = 1.0 / normalizeFactor; + unsigned int number = 0; + const float invNormalizeFactor = 1.0 / normalizeFactor; #ifdef LV_HAVE_LIB_SIMDMATH - const unsigned int quarterPoints = num_points / 4; - __m128 testVector = _mm_set_ps1(2*M_PI); - __m128 correctVector = _mm_set_ps1(M_PI); - __m128 vNormalizeFactor = _mm_set_ps1(invNormalizeFactor); - __m128 phase; - __m128 complex1, complex2, iValue, qValue; - __m128 keepMask; - - for (; number < quarterPoints; number++) { - // Load IQ data: - complex1 = _mm_load_ps(complexVectorPtr); - complexVectorPtr += 4; - complex2 = _mm_load_ps(complexVectorPtr); - complexVectorPtr += 4; - // Deinterleave IQ data: - iValue = _mm_shuffle_ps(complex1, complex2, _MM_SHUFFLE(2,0,2,0)); - qValue = _mm_shuffle_ps(complex1, complex2, _MM_SHUFFLE(3,1,3,1)); - // Arctan to get phase: - phase = atan2f4(qValue, iValue); - // When Q = 0 and I < 0, atan2f4 sucks and returns 2pi vice pi. - // Compare to 2pi: - keepMask = _mm_cmpneq_ps(phase,testVector); - phase = _mm_blendv_ps(correctVector, phase, keepMask); - // done with above correction. - phase = _mm_mul_ps(phase, vNormalizeFactor); - _mm_store_ps((float*)outPtr, phase); - outPtr += 4; - } - number = quarterPoints * 4; + const unsigned int quarterPoints = num_points / 4; + __m128 testVector = _mm_set_ps1(2 * M_PI); + __m128 correctVector = _mm_set_ps1(M_PI); + __m128 vNormalizeFactor = _mm_set_ps1(invNormalizeFactor); + __m128 phase; + __m128 complex1, complex2, iValue, qValue; + __m128 keepMask; + + for (; number < quarterPoints; number++) { + // Load IQ data: + complex1 = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; + complex2 = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; + // Deinterleave IQ data: + iValue = _mm_shuffle_ps(complex1, complex2, _MM_SHUFFLE(2, 0, 2, 0)); + qValue = _mm_shuffle_ps(complex1, complex2, _MM_SHUFFLE(3, 1, 3, 1)); + // Arctan to get phase: + phase = atan2f4(qValue, iValue); + // When Q = 0 and I < 0, atan2f4 sucks and returns 2pi vice pi. + // Compare to 2pi: + keepMask = _mm_cmpneq_ps(phase, testVector); + phase = _mm_blendv_ps(correctVector, phase, keepMask); + // done with above correction. 
+ phase = _mm_mul_ps(phase, vNormalizeFactor); + _mm_store_ps((float*)outPtr, phase); + outPtr += 4; + } + number = quarterPoints * 4; #endif /* LV_HAVE_SIMDMATH_H */ - for (; number < num_points; number++) { - const float real = *complexVectorPtr++; - const float imag = *complexVectorPtr++; - *outPtr++ = atan2f(imag, real) * invNormalizeFactor; - } + for (; number < num_points; number++) { + const float real = *complexVectorPtr++; + const float imag = *complexVectorPtr++; + *outPtr++ = atan2f(imag, real) * invNormalizeFactor; + } } #endif /* LV_HAVE_SSE4_1 */ @@ -140,72 +144,78 @@ static inline void volk_32fc_s32f_atan2_32f_a_sse4_1(float* outputVector, const #include #endif /* LV_HAVE_LIB_SIMDMATH */ -static inline void volk_32fc_s32f_atan2_32f_a_sse(float* outputVector, const lv_32fc_t* complexVector, const float normalizeFactor, unsigned int num_points){ - const float* complexVectorPtr = (float*)complexVector; - float* outPtr = outputVector; +static inline void volk_32fc_s32f_atan2_32f_a_sse(float* outputVector, + const lv_32fc_t* complexVector, + const float normalizeFactor, + unsigned int num_points) +{ + const float* complexVectorPtr = (float*)complexVector; + float* outPtr = outputVector; - unsigned int number = 0; - const float invNormalizeFactor = 1.0 / normalizeFactor; + unsigned int number = 0; + const float invNormalizeFactor = 1.0 / normalizeFactor; #ifdef LV_HAVE_LIB_SIMDMATH - const unsigned int quarterPoints = num_points / 4; - __m128 testVector = _mm_set_ps1(2*M_PI); - __m128 correctVector = _mm_set_ps1(M_PI); - __m128 vNormalizeFactor = _mm_set_ps1(invNormalizeFactor); - __m128 phase; - __m128 complex1, complex2, iValue, qValue; - __m128 mask; - __m128 keepMask; - - for (; number < quarterPoints; number++) { - // Load IQ data: - complex1 = _mm_load_ps(complexVectorPtr); - complexVectorPtr += 4; - complex2 = _mm_load_ps(complexVectorPtr); - complexVectorPtr += 4; - // Deinterleave IQ data: - iValue = _mm_shuffle_ps(complex1, complex2, _MM_SHUFFLE(2,0,2,0)); - qValue = _mm_shuffle_ps(complex1, complex2, _MM_SHUFFLE(3,1,3,1)); - // Arctan to get phase: - phase = atan2f4(qValue, iValue); - // When Q = 0 and I < 0, atan2f4 sucks and returns 2pi vice pi. - // Compare to 2pi: - keepMask = _mm_cmpneq_ps(phase,testVector); - phase = _mm_and_ps(phase, keepMask); - mask = _mm_andnot_ps(keepMask, correctVector); - phase = _mm_or_ps(phase, mask); - // done with above correction. - phase = _mm_mul_ps(phase, vNormalizeFactor); - _mm_store_ps((float*)outPtr, phase); - outPtr += 4; - } - number = quarterPoints * 4; + const unsigned int quarterPoints = num_points / 4; + __m128 testVector = _mm_set_ps1(2 * M_PI); + __m128 correctVector = _mm_set_ps1(M_PI); + __m128 vNormalizeFactor = _mm_set_ps1(invNormalizeFactor); + __m128 phase; + __m128 complex1, complex2, iValue, qValue; + __m128 mask; + __m128 keepMask; + + for (; number < quarterPoints; number++) { + // Load IQ data: + complex1 = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; + complex2 = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; + // Deinterleave IQ data: + iValue = _mm_shuffle_ps(complex1, complex2, _MM_SHUFFLE(2, 0, 2, 0)); + qValue = _mm_shuffle_ps(complex1, complex2, _MM_SHUFFLE(3, 1, 3, 1)); + // Arctan to get phase: + phase = atan2f4(qValue, iValue); + // When Q = 0 and I < 0, atan2f4 sucks and returns 2pi vice pi. 
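        // Worked example of this fix-up (comment only): for an input of -1 + 0j
        // (I < 0, Q = 0) atan2f4 can yield 2*pi, while the expected phase is pi;
        // lanes equal to 2*pi are therefore replaced by pi before the
        // normalization multiply below.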
+ // Compare to 2pi: + keepMask = _mm_cmpneq_ps(phase, testVector); + phase = _mm_and_ps(phase, keepMask); + mask = _mm_andnot_ps(keepMask, correctVector); + phase = _mm_or_ps(phase, mask); + // done with above correction. + phase = _mm_mul_ps(phase, vNormalizeFactor); + _mm_store_ps((float*)outPtr, phase); + outPtr += 4; + } + number = quarterPoints * 4; #endif /* LV_HAVE_SIMDMATH_H */ - for (; number < num_points; number++) { - const float real = *complexVectorPtr++; - const float imag = *complexVectorPtr++; - *outPtr++ = atan2f(imag, real) * invNormalizeFactor; - } + for (; number < num_points; number++) { + const float real = *complexVectorPtr++; + const float imag = *complexVectorPtr++; + *outPtr++ = atan2f(imag, real) * invNormalizeFactor; + } } #endif /* LV_HAVE_SSE */ #ifdef LV_HAVE_GENERIC -static inline void volk_32fc_s32f_atan2_32f_generic(float* outputVector, const lv_32fc_t* inputVector, const float normalizeFactor, unsigned int num_points){ - float* outPtr = outputVector; - const float* inPtr = (float*)inputVector; - const float invNormalizeFactor = 1.0 / normalizeFactor; - unsigned int number; - for ( number = 0; number < num_points; number++) { - const float real = *inPtr++; - const float imag = *inPtr++; - *outPtr++ = atan2f(imag, real) * invNormalizeFactor; - } +static inline void volk_32fc_s32f_atan2_32f_generic(float* outputVector, + const lv_32fc_t* inputVector, + const float normalizeFactor, + unsigned int num_points) +{ + float* outPtr = outputVector; + const float* inPtr = (float*)inputVector; + const float invNormalizeFactor = 1.0 / normalizeFactor; + unsigned int number; + for (number = 0; number < num_points; number++) { + const float real = *inPtr++; + const float imag = *inPtr++; + *outPtr++ = atan2f(imag, real) * invNormalizeFactor; + } } #endif /* LV_HAVE_GENERIC */ - - #endif /* INCLUDED_volk_32fc_s32f_atan2_32f_a_H */ diff --git a/kernels/volk/volk_32fc_s32f_deinterleave_real_16i.h b/kernels/volk/volk_32fc_s32f_deinterleave_real_16i.h index 64c6a8b..f70f494 100644 --- a/kernels/volk/volk_32fc_s32f_deinterleave_real_16i.h +++ b/kernels/volk/volk_32fc_s32f_deinterleave_real_16i.h @@ -30,8 +30,8 @@ * * Dispatcher Prototype * \code - * void volk_32fc_s32f_deinterleave_real_16i(int16_t* iBuffer, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points) - * \endcode + * void volk_32fc_s32f_deinterleave_real_16i(int16_t* iBuffer, const lv_32fc_t* + * complexVector, const float scalar, unsigned int num_points) \endcode * * \b Inputs * \li complexVector: The complex input vector. 
@@ -73,61 +73,62 @@ #ifndef INCLUDED_volk_32fc_s32f_deinterleave_real_16i_a_H #define INCLUDED_volk_32fc_s32f_deinterleave_real_16i_a_H -#include #include #include +#include #ifdef LV_HAVE_AVX2 #include static inline void -volk_32fc_s32f_deinterleave_real_16i_a_avx2(int16_t* iBuffer, const lv_32fc_t* complexVector, - const float scalar, unsigned int num_points) +volk_32fc_s32f_deinterleave_real_16i_a_avx2(int16_t* iBuffer, + const lv_32fc_t* complexVector, + const float scalar, + unsigned int num_points) { - unsigned int number = 0; - const unsigned int eighthPoints = num_points / 8; - - const float* complexVectorPtr = (float*)complexVector; - int16_t* iBufferPtr = iBuffer; + unsigned int number = 0; + const unsigned int eighthPoints = num_points / 8; - __m256 vScalar = _mm256_set1_ps(scalar); + const float* complexVectorPtr = (float*)complexVector; + int16_t* iBufferPtr = iBuffer; - __m256 cplxValue1, cplxValue2, iValue; - __m256i a; - __m128i b; + __m256 vScalar = _mm256_set1_ps(scalar); - __m256i idx = _mm256_set_epi32(3,3,3,3,5,1,4,0); + __m256 cplxValue1, cplxValue2, iValue; + __m256i a; + __m128i b; - for(;number < eighthPoints; number++){ - cplxValue1 = _mm256_load_ps(complexVectorPtr); - complexVectorPtr += 8; + __m256i idx = _mm256_set_epi32(3, 3, 3, 3, 5, 1, 4, 0); - cplxValue2 = _mm256_load_ps(complexVectorPtr); - complexVectorPtr += 8; + for (; number < eighthPoints; number++) { + cplxValue1 = _mm256_load_ps(complexVectorPtr); + complexVectorPtr += 8; - // Arrange in i1i2i3i4 format - iValue = _mm256_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); + cplxValue2 = _mm256_load_ps(complexVectorPtr); + complexVectorPtr += 8; - iValue = _mm256_mul_ps(iValue, vScalar); + // Arrange in i1i2i3i4 format + iValue = _mm256_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2, 0, 2, 0)); - iValue = _mm256_round_ps(iValue, _MM_FROUND_TO_ZERO); - a = _mm256_cvtps_epi32(iValue); - a = _mm256_packs_epi32(a,a); - a = _mm256_permutevar8x32_epi32(a,idx); - b = _mm256_extracti128_si256(a,0); + iValue = _mm256_mul_ps(iValue, vScalar); - _mm_store_si128((__m128i*)iBufferPtr,b); - iBufferPtr += 8; + iValue = _mm256_round_ps(iValue, _MM_FROUND_TO_ZERO); + a = _mm256_cvtps_epi32(iValue); + a = _mm256_packs_epi32(a, a); + a = _mm256_permutevar8x32_epi32(a, idx); + b = _mm256_extracti128_si256(a, 0); - } + _mm_store_si128((__m128i*)iBufferPtr, b); + iBufferPtr += 8; + } - number = eighthPoints * 8; - iBufferPtr = &iBuffer[number]; - for(; number < num_points; number++){ - *iBufferPtr++ = (int16_t)(*complexVectorPtr++ * scalar); - complexVectorPtr++; - } + number = eighthPoints * 8; + iBufferPtr = &iBuffer[number]; + for (; number < num_points; number++) { + *iBufferPtr++ = (int16_t)(*complexVectorPtr++ * scalar); + complexVectorPtr++; + } } @@ -137,46 +138,48 @@ volk_32fc_s32f_deinterleave_real_16i_a_avx2(int16_t* iBuffer, const lv_32fc_t* c #include static inline void -volk_32fc_s32f_deinterleave_real_16i_a_sse(int16_t* iBuffer, const lv_32fc_t* complexVector, - const float scalar, unsigned int num_points) +volk_32fc_s32f_deinterleave_real_16i_a_sse(int16_t* iBuffer, + const lv_32fc_t* complexVector, + const float scalar, + unsigned int num_points) { - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; - const float* complexVectorPtr = (float*)complexVector; - int16_t* iBufferPtr = iBuffer; + const float* complexVectorPtr = (float*)complexVector; + int16_t* iBufferPtr = iBuffer; - __m128 
vScalar = _mm_set_ps1(scalar); + __m128 vScalar = _mm_set_ps1(scalar); - __m128 cplxValue1, cplxValue2, iValue; + __m128 cplxValue1, cplxValue2, iValue; - __VOLK_ATTR_ALIGNED(16) float floatBuffer[4]; + __VOLK_ATTR_ALIGNED(16) float floatBuffer[4]; - for(;number < quarterPoints; number++){ - cplxValue1 = _mm_load_ps(complexVectorPtr); - complexVectorPtr += 4; + for (; number < quarterPoints; number++) { + cplxValue1 = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; - cplxValue2 = _mm_load_ps(complexVectorPtr); - complexVectorPtr += 4; + cplxValue2 = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; - // Arrange in i1i2i3i4 format - iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); + // Arrange in i1i2i3i4 format + iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2, 0, 2, 0)); - iValue = _mm_mul_ps(iValue, vScalar); + iValue = _mm_mul_ps(iValue, vScalar); - _mm_store_ps(floatBuffer, iValue); - *iBufferPtr++ = (int16_t)(floatBuffer[0]); - *iBufferPtr++ = (int16_t)(floatBuffer[1]); - *iBufferPtr++ = (int16_t)(floatBuffer[2]); - *iBufferPtr++ = (int16_t)(floatBuffer[3]); - } + _mm_store_ps(floatBuffer, iValue); + *iBufferPtr++ = (int16_t)(floatBuffer[0]); + *iBufferPtr++ = (int16_t)(floatBuffer[1]); + *iBufferPtr++ = (int16_t)(floatBuffer[2]); + *iBufferPtr++ = (int16_t)(floatBuffer[3]); + } - number = quarterPoints * 4; - iBufferPtr = &iBuffer[number]; - for(; number < num_points; number++){ - *iBufferPtr++ = (int16_t)(*complexVectorPtr++ * scalar); - complexVectorPtr++; - } + number = quarterPoints * 4; + iBufferPtr = &iBuffer[number]; + for (; number < num_points; number++) { + *iBufferPtr++ = (int16_t)(*complexVectorPtr++ * scalar); + complexVectorPtr++; + } } #endif /* LV_HAVE_SSE */ @@ -185,16 +188,18 @@ volk_32fc_s32f_deinterleave_real_16i_a_sse(int16_t* iBuffer, const lv_32fc_t* co #ifdef LV_HAVE_GENERIC static inline void -volk_32fc_s32f_deinterleave_real_16i_generic(int16_t* iBuffer, const lv_32fc_t* complexVector, - const float scalar, unsigned int num_points) +volk_32fc_s32f_deinterleave_real_16i_generic(int16_t* iBuffer, + const lv_32fc_t* complexVector, + const float scalar, + unsigned int num_points) { - const float* complexVectorPtr = (float*)complexVector; - int16_t* iBufferPtr = iBuffer; - unsigned int number = 0; - for(number = 0; number < num_points; number++){ - *iBufferPtr++ = (int16_t)(*complexVectorPtr++ * scalar); - complexVectorPtr++; - } + const float* complexVectorPtr = (float*)complexVector; + int16_t* iBufferPtr = iBuffer; + unsigned int number = 0; + for (number = 0; number < num_points; number++) { + *iBufferPtr++ = (int16_t)(*complexVectorPtr++ * scalar); + complexVectorPtr++; + } } #endif /* LV_HAVE_GENERIC */ @@ -204,60 +209,61 @@ volk_32fc_s32f_deinterleave_real_16i_generic(int16_t* iBuffer, const lv_32fc_t* #ifndef INCLUDED_volk_32fc_s32f_deinterleave_real_16i_u_H #define INCLUDED_volk_32fc_s32f_deinterleave_real_16i_u_H -#include #include #include +#include #ifdef LV_HAVE_AVX2 #include static inline void -volk_32fc_s32f_deinterleave_real_16i_u_avx2(int16_t* iBuffer, const lv_32fc_t* complexVector, - const float scalar, unsigned int num_points) +volk_32fc_s32f_deinterleave_real_16i_u_avx2(int16_t* iBuffer, + const lv_32fc_t* complexVector, + const float scalar, + unsigned int num_points) { - unsigned int number = 0; - const unsigned int eighthPoints = num_points / 8; - - const float* complexVectorPtr = (float*)complexVector; - int16_t* iBufferPtr = iBuffer; + unsigned int number = 0; + const unsigned int 
eighthPoints = num_points / 8; - __m256 vScalar = _mm256_set1_ps(scalar); + const float* complexVectorPtr = (float*)complexVector; + int16_t* iBufferPtr = iBuffer; - __m256 cplxValue1, cplxValue2, iValue; - __m256i a; - __m128i b; + __m256 vScalar = _mm256_set1_ps(scalar); - __m256i idx = _mm256_set_epi32(3,3,3,3,5,1,4,0); + __m256 cplxValue1, cplxValue2, iValue; + __m256i a; + __m128i b; - for(;number < eighthPoints; number++){ - cplxValue1 = _mm256_loadu_ps(complexVectorPtr); - complexVectorPtr += 8; + __m256i idx = _mm256_set_epi32(3, 3, 3, 3, 5, 1, 4, 0); - cplxValue2 = _mm256_loadu_ps(complexVectorPtr); - complexVectorPtr += 8; + for (; number < eighthPoints; number++) { + cplxValue1 = _mm256_loadu_ps(complexVectorPtr); + complexVectorPtr += 8; - // Arrange in i1i2i3i4 format - iValue = _mm256_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); + cplxValue2 = _mm256_loadu_ps(complexVectorPtr); + complexVectorPtr += 8; - iValue = _mm256_mul_ps(iValue, vScalar); + // Arrange in i1i2i3i4 format + iValue = _mm256_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2, 0, 2, 0)); - iValue = _mm256_round_ps(iValue, _MM_FROUND_TO_ZERO); - a = _mm256_cvtps_epi32(iValue); - a = _mm256_packs_epi32(a,a); - a = _mm256_permutevar8x32_epi32(a,idx); - b = _mm256_extracti128_si256(a,0); + iValue = _mm256_mul_ps(iValue, vScalar); - _mm_storeu_si128((__m128i*)iBufferPtr,b); - iBufferPtr += 8; + iValue = _mm256_round_ps(iValue, _MM_FROUND_TO_ZERO); + a = _mm256_cvtps_epi32(iValue); + a = _mm256_packs_epi32(a, a); + a = _mm256_permutevar8x32_epi32(a, idx); + b = _mm256_extracti128_si256(a, 0); - } + _mm_storeu_si128((__m128i*)iBufferPtr, b); + iBufferPtr += 8; + } - number = eighthPoints * 8; - iBufferPtr = &iBuffer[number]; - for(; number < num_points; number++){ - *iBufferPtr++ = (int16_t)(*complexVectorPtr++ * scalar); - complexVectorPtr++; - } + number = eighthPoints * 8; + iBufferPtr = &iBuffer[number]; + for (; number < num_points; number++) { + *iBufferPtr++ = (int16_t)(*complexVectorPtr++ * scalar); + complexVectorPtr++; + } } #endif /* LV_HAVE_AVX2 */ diff --git a/kernels/volk/volk_32fc_s32f_magnitude_16i.h b/kernels/volk/volk_32fc_s32f_magnitude_16i.h index 6e7e7cb..91a5b8e 100644 --- a/kernels/volk/volk_32fc_s32f_magnitude_16i.h +++ b/kernels/volk/volk_32fc_s32f_magnitude_16i.h @@ -31,8 +31,8 @@ * * Dispatcher Prototype * \code - * void volk_32fc_s32f_magnitude_16i(int16_t* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points) - * \endcode + * void volk_32fc_s32f_magnitude_16i(int16_t* magnitudeVector, const lv_32fc_t* + * complexVector, unsigned int num_points) \endcode * * \b Inputs * \li complexVector: The complex input vector. 
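For orientation, the kernel touched in the following hunks computes the scaled magnitude of each complex sample, rounded to the nearest 16-bit integer. Below is a minimal, self-contained sketch of that reference arithmetic, mirroring the generic branch shown in the next hunk; the function and buffer names are illustrative, and the input is assumed to be interleaved (re, im) float pairs.

    #include <math.h>
    #include <stdint.h>

    /* Reference sketch: out[n] = round_to_int16(scalar * |in[n]|). */
    static void magnitude_16i_ref(int16_t* out,
                                  const float* in, /* interleaved re,im pairs */
                                  float scalar,
                                  unsigned int num_points)
    {
        for (unsigned int n = 0; n < num_points; n++) {
            const float re = in[2 * n];
            const float im = in[2 * n + 1];
            out[n] = (int16_t)rintf(scalar * sqrtf(re * re + im * im));
        }
    }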
@@ -73,123 +73,129 @@ #ifdef LV_HAVE_GENERIC #include -static inline void -volk_32fc_s32f_magnitude_16i_generic(int16_t* magnitudeVector, const lv_32fc_t* complexVector, - const float scalar, unsigned int num_points) +static inline void volk_32fc_s32f_magnitude_16i_generic(int16_t* magnitudeVector, + const lv_32fc_t* complexVector, + const float scalar, + unsigned int num_points) { - const float* complexVectorPtr = (float*)complexVector; - int16_t* magnitudeVectorPtr = magnitudeVector; - unsigned int number = 0; - for(number = 0; number < num_points; number++){ - __VOLK_VOLATILE float real = *complexVectorPtr++; - __VOLK_VOLATILE float imag = *complexVectorPtr++; - real *= real; - imag *= imag; - *magnitudeVectorPtr++ = (int16_t)rintf(scalar*sqrtf(real + imag)); - } + const float* complexVectorPtr = (float*)complexVector; + int16_t* magnitudeVectorPtr = magnitudeVector; + unsigned int number = 0; + for (number = 0; number < num_points; number++) { + __VOLK_VOLATILE float real = *complexVectorPtr++; + __VOLK_VOLATILE float imag = *complexVectorPtr++; + real *= real; + imag *= imag; + *magnitudeVectorPtr++ = (int16_t)rintf(scalar * sqrtf(real + imag)); + } } #endif /* LV_HAVE_GENERIC */ #ifndef INCLUDED_volk_32fc_s32f_magnitude_16i_a_H #define INCLUDED_volk_32fc_s32f_magnitude_16i_a_H -#include #include -#include #include +#include +#include #ifdef LV_HAVE_AVX2 #include -static inline void -volk_32fc_s32f_magnitude_16i_a_avx2(int16_t* magnitudeVector, const lv_32fc_t* complexVector, - const float scalar, unsigned int num_points) +static inline void volk_32fc_s32f_magnitude_16i_a_avx2(int16_t* magnitudeVector, + const lv_32fc_t* complexVector, + const float scalar, + unsigned int num_points) { - unsigned int number = 0; - const unsigned int eighthPoints = num_points / 8; + unsigned int number = 0; + const unsigned int eighthPoints = num_points / 8; - const float* complexVectorPtr = (const float*)complexVector; - int16_t* magnitudeVectorPtr = magnitudeVector; + const float* complexVectorPtr = (const float*)complexVector; + int16_t* magnitudeVectorPtr = magnitudeVector; - __m256 vScalar = _mm256_set1_ps(scalar); - __m256i idx = _mm256_set_epi32(0,0,0,0,5,1,4,0); - __m256 cplxValue1, cplxValue2, result; - __m256i resultInt; - __m128i resultShort; + __m256 vScalar = _mm256_set1_ps(scalar); + __m256i idx = _mm256_set_epi32(0, 0, 0, 0, 5, 1, 4, 0); + __m256 cplxValue1, cplxValue2, result; + __m256i resultInt; + __m128i resultShort; - for(;number < eighthPoints; number++){ - cplxValue1 = _mm256_load_ps(complexVectorPtr); - complexVectorPtr += 8; + for (; number < eighthPoints; number++) { + cplxValue1 = _mm256_load_ps(complexVectorPtr); + complexVectorPtr += 8; - cplxValue2 = _mm256_load_ps(complexVectorPtr); - complexVectorPtr += 8; + cplxValue2 = _mm256_load_ps(complexVectorPtr); + complexVectorPtr += 8; - cplxValue1 = _mm256_mul_ps(cplxValue1, cplxValue1); // Square the values - cplxValue2 = _mm256_mul_ps(cplxValue2, cplxValue2); // Square the Values + cplxValue1 = _mm256_mul_ps(cplxValue1, cplxValue1); // Square the values + cplxValue2 = _mm256_mul_ps(cplxValue2, cplxValue2); // Square the Values - result = _mm256_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values + result = _mm256_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values - result = _mm256_sqrt_ps(result); + result = _mm256_sqrt_ps(result); - result = _mm256_mul_ps(result, vScalar); + result = _mm256_mul_ps(result, vScalar); - resultInt = _mm256_cvtps_epi32(result); - resultInt = _mm256_packs_epi32(resultInt, 
resultInt); - resultInt = _mm256_permutevar8x32_epi32(resultInt, idx); //permute to compensate for shuffling in hadd and packs - resultShort = _mm256_extracti128_si256(resultInt,0); - _mm_store_si128((__m128i*)magnitudeVectorPtr,resultShort); - magnitudeVectorPtr += 8; - } + resultInt = _mm256_cvtps_epi32(result); + resultInt = _mm256_packs_epi32(resultInt, resultInt); + resultInt = _mm256_permutevar8x32_epi32( + resultInt, idx); // permute to compensate for shuffling in hadd and packs + resultShort = _mm256_extracti128_si256(resultInt, 0); + _mm_store_si128((__m128i*)magnitudeVectorPtr, resultShort); + magnitudeVectorPtr += 8; + } - number = eighthPoints * 8; - volk_32fc_s32f_magnitude_16i_generic(magnitudeVector+number, complexVector+number, scalar, num_points-number); + number = eighthPoints * 8; + volk_32fc_s32f_magnitude_16i_generic( + magnitudeVector + number, complexVector + number, scalar, num_points - number); } #endif /* LV_HAVE_AVX2 */ #ifdef LV_HAVE_SSE3 #include -static inline void -volk_32fc_s32f_magnitude_16i_a_sse3(int16_t* magnitudeVector, const lv_32fc_t* complexVector, - const float scalar, unsigned int num_points) +static inline void volk_32fc_s32f_magnitude_16i_a_sse3(int16_t* magnitudeVector, + const lv_32fc_t* complexVector, + const float scalar, + unsigned int num_points) { - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; - const float* complexVectorPtr = (const float*)complexVector; - int16_t* magnitudeVectorPtr = magnitudeVector; + const float* complexVectorPtr = (const float*)complexVector; + int16_t* magnitudeVectorPtr = magnitudeVector; - __m128 vScalar = _mm_set_ps1(scalar); + __m128 vScalar = _mm_set_ps1(scalar); - __m128 cplxValue1, cplxValue2, result; + __m128 cplxValue1, cplxValue2, result; - __VOLK_ATTR_ALIGNED(16) float floatBuffer[4]; + __VOLK_ATTR_ALIGNED(16) float floatBuffer[4]; - for(;number < quarterPoints; number++){ - cplxValue1 = _mm_load_ps(complexVectorPtr); - complexVectorPtr += 4; + for (; number < quarterPoints; number++) { + cplxValue1 = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; - cplxValue2 = _mm_load_ps(complexVectorPtr); - complexVectorPtr += 4; + cplxValue2 = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; - cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values - cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values + cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values + cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values - result = _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values + result = _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values - result = _mm_sqrt_ps(result); + result = _mm_sqrt_ps(result); - result = _mm_mul_ps(result, vScalar); + result = _mm_mul_ps(result, vScalar); - _mm_store_ps(floatBuffer, result); - *magnitudeVectorPtr++ = (int16_t)rintf(floatBuffer[0]); - *magnitudeVectorPtr++ = (int16_t)rintf(floatBuffer[1]); - *magnitudeVectorPtr++ = (int16_t)rintf(floatBuffer[2]); - *magnitudeVectorPtr++ = (int16_t)rintf(floatBuffer[3]); - } + _mm_store_ps(floatBuffer, result); + *magnitudeVectorPtr++ = (int16_t)rintf(floatBuffer[0]); + *magnitudeVectorPtr++ = (int16_t)rintf(floatBuffer[1]); + *magnitudeVectorPtr++ = (int16_t)rintf(floatBuffer[2]); + *magnitudeVectorPtr++ = (int16_t)rintf(floatBuffer[3]); + } - number = quarterPoints * 4; - volk_32fc_s32f_magnitude_16i_generic(magnitudeVector+number, 
complexVector+number, scalar, num_points-number); + number = quarterPoints * 4; + volk_32fc_s32f_magnitude_16i_generic( + magnitudeVector + number, complexVector + number, scalar, num_points - number); } #endif /* LV_HAVE_SSE3 */ @@ -197,53 +203,57 @@ volk_32fc_s32f_magnitude_16i_a_sse3(int16_t* magnitudeVector, const lv_32fc_t* c #ifdef LV_HAVE_SSE #include -static inline void -volk_32fc_s32f_magnitude_16i_a_sse(int16_t* magnitudeVector, const lv_32fc_t* complexVector, - const float scalar, unsigned int num_points) +static inline void volk_32fc_s32f_magnitude_16i_a_sse(int16_t* magnitudeVector, + const lv_32fc_t* complexVector, + const float scalar, + unsigned int num_points) { - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; - const float* complexVectorPtr = (const float*)complexVector; - int16_t* magnitudeVectorPtr = magnitudeVector; + const float* complexVectorPtr = (const float*)complexVector; + int16_t* magnitudeVectorPtr = magnitudeVector; - __m128 vScalar = _mm_set_ps1(scalar); + __m128 vScalar = _mm_set_ps1(scalar); - __m128 cplxValue1, cplxValue2, result; - __m128 iValue, qValue; + __m128 cplxValue1, cplxValue2, result; + __m128 iValue, qValue; - __VOLK_ATTR_ALIGNED(16) float floatBuffer[4]; + __VOLK_ATTR_ALIGNED(16) float floatBuffer[4]; - for(;number < quarterPoints; number++){ - cplxValue1 = _mm_load_ps(complexVectorPtr); - complexVectorPtr += 4; + for (; number < quarterPoints; number++) { + cplxValue1 = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; - cplxValue2 = _mm_load_ps(complexVectorPtr); - complexVectorPtr += 4; + cplxValue2 = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; - // Arrange in i1i2i3i4 format - iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); - // Arrange in q1q2q3q4 format - qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1)); + // Arrange in i1i2i3i4 format + iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2, 0, 2, 0)); + // Arrange in q1q2q3q4 format + qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3, 1, 3, 1)); - __VOLK_VOLATILE __m128 iValue2 = _mm_mul_ps(iValue, iValue); // Square the I values - __VOLK_VOLATILE __m128 qValue2 = _mm_mul_ps(qValue, qValue); // Square the Q Values + __VOLK_VOLATILE __m128 iValue2 = + _mm_mul_ps(iValue, iValue); // Square the I values + __VOLK_VOLATILE __m128 qValue2 = + _mm_mul_ps(qValue, qValue); // Square the Q Values - result = _mm_add_ps(iValue2, qValue2); // Add the I2 and Q2 values + result = _mm_add_ps(iValue2, qValue2); // Add the I2 and Q2 values - result = _mm_sqrt_ps(result); + result = _mm_sqrt_ps(result); - result = _mm_mul_ps(result, vScalar); + result = _mm_mul_ps(result, vScalar); - _mm_store_ps(floatBuffer, result); - *magnitudeVectorPtr++ = (int16_t)rintf(floatBuffer[0]); - *magnitudeVectorPtr++ = (int16_t)rintf(floatBuffer[1]); - *magnitudeVectorPtr++ = (int16_t)rintf(floatBuffer[2]); - *magnitudeVectorPtr++ = (int16_t)rintf(floatBuffer[3]); - } + _mm_store_ps(floatBuffer, result); + *magnitudeVectorPtr++ = (int16_t)rintf(floatBuffer[0]); + *magnitudeVectorPtr++ = (int16_t)rintf(floatBuffer[1]); + *magnitudeVectorPtr++ = (int16_t)rintf(floatBuffer[2]); + *magnitudeVectorPtr++ = (int16_t)rintf(floatBuffer[3]); + } - number = quarterPoints * 4; - volk_32fc_s32f_magnitude_16i_generic(magnitudeVector+number, complexVector+number, scalar, num_points-number); + number = quarterPoints * 4; + 
volk_32fc_s32f_magnitude_16i_generic( + magnitudeVector + number, complexVector + number, scalar, num_points - number); } #endif /* LV_HAVE_SSE */ @@ -253,56 +263,59 @@ volk_32fc_s32f_magnitude_16i_a_sse(int16_t* magnitudeVector, const lv_32fc_t* co #ifndef INCLUDED_volk_32fc_s32f_magnitude_16i_u_H #define INCLUDED_volk_32fc_s32f_magnitude_16i_u_H -#include #include -#include #include +#include +#include #ifdef LV_HAVE_AVX2 #include -static inline void -volk_32fc_s32f_magnitude_16i_u_avx2(int16_t* magnitudeVector, const lv_32fc_t* complexVector, - const float scalar, unsigned int num_points) +static inline void volk_32fc_s32f_magnitude_16i_u_avx2(int16_t* magnitudeVector, + const lv_32fc_t* complexVector, + const float scalar, + unsigned int num_points) { - unsigned int number = 0; - const unsigned int eighthPoints = num_points / 8; + unsigned int number = 0; + const unsigned int eighthPoints = num_points / 8; - const float* complexVectorPtr = (const float*)complexVector; - int16_t* magnitudeVectorPtr = magnitudeVector; + const float* complexVectorPtr = (const float*)complexVector; + int16_t* magnitudeVectorPtr = magnitudeVector; - __m256 vScalar = _mm256_set1_ps(scalar); - __m256i idx = _mm256_set_epi32(0,0,0,0,5,1,4,0); - __m256 cplxValue1, cplxValue2, result; - __m256i resultInt; - __m128i resultShort; + __m256 vScalar = _mm256_set1_ps(scalar); + __m256i idx = _mm256_set_epi32(0, 0, 0, 0, 5, 1, 4, 0); + __m256 cplxValue1, cplxValue2, result; + __m256i resultInt; + __m128i resultShort; - for(;number < eighthPoints; number++){ - cplxValue1 = _mm256_loadu_ps(complexVectorPtr); - complexVectorPtr += 8; + for (; number < eighthPoints; number++) { + cplxValue1 = _mm256_loadu_ps(complexVectorPtr); + complexVectorPtr += 8; - cplxValue2 = _mm256_loadu_ps(complexVectorPtr); - complexVectorPtr += 8; + cplxValue2 = _mm256_loadu_ps(complexVectorPtr); + complexVectorPtr += 8; - cplxValue1 = _mm256_mul_ps(cplxValue1, cplxValue1); // Square the values - cplxValue2 = _mm256_mul_ps(cplxValue2, cplxValue2); // Square the Values + cplxValue1 = _mm256_mul_ps(cplxValue1, cplxValue1); // Square the values + cplxValue2 = _mm256_mul_ps(cplxValue2, cplxValue2); // Square the Values - result = _mm256_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values + result = _mm256_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values - result = _mm256_sqrt_ps(result); + result = _mm256_sqrt_ps(result); - result = _mm256_mul_ps(result, vScalar); + result = _mm256_mul_ps(result, vScalar); - resultInt = _mm256_cvtps_epi32(result); - resultInt = _mm256_packs_epi32(resultInt, resultInt); - resultInt = _mm256_permutevar8x32_epi32(resultInt, idx); //permute to compensate for shuffling in hadd and packs - resultShort = _mm256_extracti128_si256(resultInt,0); - _mm_storeu_si128((__m128i*)magnitudeVectorPtr,resultShort); - magnitudeVectorPtr += 8; - } + resultInt = _mm256_cvtps_epi32(result); + resultInt = _mm256_packs_epi32(resultInt, resultInt); + resultInt = _mm256_permutevar8x32_epi32( + resultInt, idx); // permute to compensate for shuffling in hadd and packs + resultShort = _mm256_extracti128_si256(resultInt, 0); + _mm_storeu_si128((__m128i*)magnitudeVectorPtr, resultShort); + magnitudeVectorPtr += 8; + } - number = eighthPoints * 8; - volk_32fc_s32f_magnitude_16i_generic(magnitudeVector+number, complexVector+number, scalar, num_points-number); + number = eighthPoints * 8; + volk_32fc_s32f_magnitude_16i_generic( + magnitudeVector + number, complexVector + number, scalar, num_points - number); } #endif /* 
LV_HAVE_AVX2 */ diff --git a/kernels/volk/volk_32fc_s32f_power_32fc.h b/kernels/volk/volk_32fc_s32f_power_32fc.h index d2803f2..b31179c 100644 --- a/kernels/volk/volk_32fc_s32f_power_32fc.h +++ b/kernels/volk/volk_32fc_s32f_power_32fc.h @@ -31,8 +31,8 @@ * * Dispatcher Prototype * \code - * void volk_32fc_s32f_power_32fc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const float power, unsigned int num_points) - * \endcode + * void volk_32fc_s32f_power_32fc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const + * float power, unsigned int num_points) \endcode * * \b Inputs * \li aVector: The complex input vector. @@ -56,15 +56,17 @@ #define INCLUDED_volk_32fc_s32f_power_32fc_a_H #include -#include #include +#include //! raise a complex float to a real float power -static inline lv_32fc_t __volk_s32fc_s32f_power_s32fc_a(const lv_32fc_t exp, const float power) +static inline lv_32fc_t __volk_s32fc_s32f_power_s32fc_a(const lv_32fc_t exp, + const float power) { - const float arg = power*atan2f(lv_creal(exp), lv_cimag(exp)); - const float mag = powf(lv_creal(exp)*lv_creal(exp) + lv_cimag(exp)*lv_cimag(exp), power/2); - return mag*lv_cmake(-cosf(arg), sinf(arg)); + const float arg = power * atan2f(lv_creal(exp), lv_cimag(exp)); + const float mag = + powf(lv_creal(exp) * lv_creal(exp) + lv_cimag(exp) * lv_cimag(exp), power / 2); + return mag * lv_cmake(-cosf(arg), sinf(arg)); } #ifdef LV_HAVE_SSE @@ -74,83 +76,94 @@ static inline lv_32fc_t __volk_s32fc_s32f_power_s32fc_a(const lv_32fc_t exp, con #include #endif /* LV_HAVE_LIB_SIMDMATH */ -static inline void -volk_32fc_s32f_power_32fc_a_sse(lv_32fc_t* cVector, const lv_32fc_t* aVector, - const float power, unsigned int num_points) +static inline void volk_32fc_s32f_power_32fc_a_sse(lv_32fc_t* cVector, + const lv_32fc_t* aVector, + const float power, + unsigned int num_points) { - unsigned int number = 0; + unsigned int number = 0; - lv_32fc_t* cPtr = cVector; - const lv_32fc_t* aPtr = aVector; + lv_32fc_t* cPtr = cVector; + const lv_32fc_t* aPtr = aVector; #ifdef LV_HAVE_LIB_SIMDMATH - const unsigned int quarterPoints = num_points / 4; - __m128 vPower = _mm_set_ps1(power); + const unsigned int quarterPoints = num_points / 4; + __m128 vPower = _mm_set_ps1(power); - __m128 cplxValue1, cplxValue2, magnitude, phase, iValue, qValue; - for(;number < quarterPoints; number++){ + __m128 cplxValue1, cplxValue2, magnitude, phase, iValue, qValue; + for (; number < quarterPoints; number++) { - cplxValue1 = _mm_load_ps((float*)aPtr); - aPtr += 2; + cplxValue1 = _mm_load_ps((float*)aPtr); + aPtr += 2; - cplxValue2 = _mm_load_ps((float*)aPtr); - aPtr += 2; + cplxValue2 = _mm_load_ps((float*)aPtr); + aPtr += 2; - // Convert to polar coordinates + // Convert to polar coordinates - // Arrange in i1i2i3i4 format - iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); - // Arrange in q1q2q3q4 format - qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1)); + // Arrange in i1i2i3i4 format + iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2, 0, 2, 0)); + // Arrange in q1q2q3q4 format + qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3, 1, 3, 1)); - phase = atan2f4(qValue, iValue); // Calculate the Phase + phase = atan2f4(qValue, iValue); // Calculate the Phase - magnitude = _mm_sqrt_ps(_mm_add_ps(_mm_mul_ps(iValue, iValue), _mm_mul_ps(qValue, qValue))); // Calculate the magnitude by square rooting the added I2 and Q2 values + magnitude = _mm_sqrt_ps( + _mm_add_ps(_mm_mul_ps(iValue, iValue), + _mm_mul_ps(qValue, 
qValue))); // Calculate the magnitude by square + // rooting the added I2 and Q2 values - // Now calculate the power of the polar coordinate data - magnitude = powf4(magnitude, vPower); // Take the magnitude to the specified power + // Now calculate the power of the polar coordinate data + magnitude = powf4(magnitude, vPower); // Take the magnitude to the specified power - phase = _mm_mul_ps(phase, vPower); // Multiply the phase by the specified power + phase = _mm_mul_ps(phase, vPower); // Multiply the phase by the specified power - // Convert back to cartesian coordinates - iValue = _mm_mul_ps( cosf4(phase), magnitude); // Multiply the cos of the phase by the magnitude - qValue = _mm_mul_ps( sinf4(phase), magnitude); // Multiply the sin of the phase by the magnitude + // Convert back to cartesian coordinates + iValue = _mm_mul_ps(cosf4(phase), + magnitude); // Multiply the cos of the phase by the magnitude + qValue = _mm_mul_ps(sinf4(phase), + magnitude); // Multiply the sin of the phase by the magnitude - cplxValue1 = _mm_unpacklo_ps(iValue, qValue); // Interleave the lower two i & q values - cplxValue2 = _mm_unpackhi_ps(iValue, qValue); // Interleave the upper two i & q values + cplxValue1 = + _mm_unpacklo_ps(iValue, qValue); // Interleave the lower two i & q values + cplxValue2 = + _mm_unpackhi_ps(iValue, qValue); // Interleave the upper two i & q values - _mm_store_ps((float*)cPtr,cplxValue1); // Store the results back into the C container + _mm_store_ps((float*)cPtr, + cplxValue1); // Store the results back into the C container - cPtr += 2; + cPtr += 2; - _mm_store_ps((float*)cPtr,cplxValue2); // Store the results back into the C container + _mm_store_ps((float*)cPtr, + cplxValue2); // Store the results back into the C container - cPtr += 2; - } + cPtr += 2; + } - number = quarterPoints * 4; + number = quarterPoints * 4; #endif /* LV_HAVE_LIB_SIMDMATH */ - for(;number < num_points; number++){ - *cPtr++ = __volk_s32fc_s32f_power_s32fc_a((*aPtr++), power); - } + for (; number < num_points; number++) { + *cPtr++ = __volk_s32fc_s32f_power_s32fc_a((*aPtr++), power); + } } #endif /* LV_HAVE_SSE */ #ifdef LV_HAVE_GENERIC -static inline void -volk_32fc_s32f_power_32fc_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, - const float power, unsigned int num_points) +static inline void volk_32fc_s32f_power_32fc_generic(lv_32fc_t* cVector, + const lv_32fc_t* aVector, + const float power, + unsigned int num_points) { - lv_32fc_t* cPtr = cVector; - const lv_32fc_t* aPtr = aVector; - unsigned int number = 0; + lv_32fc_t* cPtr = cVector; + const lv_32fc_t* aPtr = aVector; + unsigned int number = 0; - for(number = 0; number < num_points; number++){ - *cPtr++ = __volk_s32fc_s32f_power_s32fc_a((*aPtr++), power); - } + for (number = 0; number < num_points; number++) { + *cPtr++ = __volk_s32fc_s32f_power_s32fc_a((*aPtr++), power); + } } #endif /* LV_HAVE_GENERIC */ diff --git a/kernels/volk/volk_32fc_s32f_power_spectrum_32f.h b/kernels/volk/volk_32fc_s32f_power_spectrum_32f.h index abe4662..a1a036d 100644 --- a/kernels/volk/volk_32fc_s32f_power_spectrum_32f.h +++ b/kernels/volk/volk_32fc_s32f_power_spectrum_32f.h @@ -29,13 +29,13 @@ * * Dispatcher Prototype * \code - * void volk_32fc_s32f_power_spectrum_32f(float* logPowerOutput, const lv_32fc_t* complexFFTInput, const float normalizationFactor, unsigned int num_points) - * \endcode + * void volk_32fc_s32f_power_spectrum_32f(float* logPowerOutput, const lv_32fc_t* + * complexFFTInput, const float normalizationFactor, unsigned int num_points) \endcode 
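The __volk_s32fc_s32f_power_s32fc_a helper above works in polar form: the magnitude is raised to the requested power and the phase is multiplied by it. For comparison, here is a sketch of the textbook identity z^p = |z|^p * (cos(p*arg(z)) + j*sin(p*arg(z))), written with C99 complex purely for illustration; the helper itself keeps VOLK's own argument ordering and sign conventions.

    #include <complex.h>
    #include <math.h>

    /* Textbook polar-form sketch; illustrative only. */
    static float complex cpow_real_ref(float complex z, float p)
    {
        const float mag = powf(crealf(z) * crealf(z) + cimagf(z) * cimagf(z), p / 2.0f);
        const float arg = p * atan2f(cimagf(z), crealf(z));
        return mag * (cosf(arg) + sinf(arg) * I);
    }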
* * \b Inputs * \li complexFFTInput The complex data output from the FFT point. - * \li normalizationFactor: This value is divided against all the input values before the power is calculated. - * \li num_points: The number of fft data points. + * \li normalizationFactor: This value is divided against all the input values before the + * power is calculated. \li num_points: The number of fft data points. * * \b Outputs * \li logPowerOutput: The 10.0 * log10(r*r + i*i) for each data point. @@ -54,8 +54,8 @@ #define INCLUDED_volk_32fc_s32f_power_spectrum_32f_a_H #include -#include #include +#include #ifdef LV_HAVE_SSE3 #include @@ -65,74 +65,75 @@ #endif /* LV_HAVE_LIB_SIMDMATH */ static inline void -volk_32fc_s32f_power_spectrum_32f_a_sse3(float* logPowerOutput, const lv_32fc_t* complexFFTInput, - const float normalizationFactor, unsigned int num_points) +volk_32fc_s32f_power_spectrum_32f_a_sse3(float* logPowerOutput, + const lv_32fc_t* complexFFTInput, + const float normalizationFactor, + unsigned int num_points) { - const float* inputPtr = (const float*)complexFFTInput; - float* destPtr = logPowerOutput; - uint64_t number = 0; - const float iNormalizationFactor = 1.0 / normalizationFactor; + const float* inputPtr = (const float*)complexFFTInput; + float* destPtr = logPowerOutput; + uint64_t number = 0; + const float iNormalizationFactor = 1.0 / normalizationFactor; #ifdef LV_HAVE_LIB_SIMDMATH - __m128 magScalar = _mm_set_ps1(10.0); - magScalar = _mm_div_ps(magScalar, logf4(magScalar)); + __m128 magScalar = _mm_set_ps1(10.0); + magScalar = _mm_div_ps(magScalar, logf4(magScalar)); - __m128 invNormalizationFactor = _mm_set_ps1(iNormalizationFactor); + __m128 invNormalizationFactor = _mm_set_ps1(iNormalizationFactor); - __m128 power; - __m128 input1, input2; - const uint64_t quarterPoints = num_points / 4; - for(;number < quarterPoints; number++){ - // Load the complex values - input1 =_mm_load_ps(inputPtr); - inputPtr += 4; - input2 =_mm_load_ps(inputPtr); - inputPtr += 4; + __m128 power; + __m128 input1, input2; + const uint64_t quarterPoints = num_points / 4; + for (; number < quarterPoints; number++) { + // Load the complex values + input1 = _mm_load_ps(inputPtr); + inputPtr += 4; + input2 = _mm_load_ps(inputPtr); + inputPtr += 4; - // Apply the normalization factor - input1 = _mm_mul_ps(input1, invNormalizationFactor); - input2 = _mm_mul_ps(input2, invNormalizationFactor); + // Apply the normalization factor + input1 = _mm_mul_ps(input1, invNormalizationFactor); + input2 = _mm_mul_ps(input2, invNormalizationFactor); - // Multiply each value by itself - // (r1*r1), (i1*i1), (r2*r2), (i2*i2) - input1 = _mm_mul_ps(input1, input1); - // (r3*r3), (i3*i3), (r4*r4), (i4*i4) - input2 = _mm_mul_ps(input2, input2); + // Multiply each value by itself + // (r1*r1), (i1*i1), (r2*r2), (i2*i2) + input1 = _mm_mul_ps(input1, input1); + // (r3*r3), (i3*i3), (r4*r4), (i4*i4) + input2 = _mm_mul_ps(input2, input2); - // Horizontal add, to add (r*r) + (i*i) for each complex value - // (r1*r1)+(i1*i1), (r2*r2) + (i2*i2), (r3*r3)+(i3*i3), (r4*r4)+(i4*i4) - power = _mm_hadd_ps(input1, input2); + // Horizontal add, to add (r*r) + (i*i) for each complex value + // (r1*r1)+(i1*i1), (r2*r2) + (i2*i2), (r3*r3)+(i3*i3), (r4*r4)+(i4*i4) + power = _mm_hadd_ps(input1, input2); - // Calculate the natural log power - power = logf4(power); + // Calculate the natural log power + power = logf4(power); - // Convert to log10 and multiply by 10.0 - power = _mm_mul_ps(power, magScalar); + // Convert to log10 and multiply by 10.0 + 
power = _mm_mul_ps(power, magScalar); - // Store the floating point results - _mm_store_ps(destPtr, power); + // Store the floating point results + _mm_store_ps(destPtr, power); - destPtr += 4; - } + destPtr += 4; + } - number = quarterPoints*4; + number = quarterPoints * 4; #endif /* LV_HAVE_LIB_SIMDMATH */ - // Calculate the FFT for any remaining points - - for(; number < num_points; number++){ - // Calculate dBm - // 50 ohm load assumption - // 10 * log10 (v^2 / (2 * 50.0 * .001)) = 10 * log10( v^2 * 10) - // 75 ohm load assumption - // 10 * log10 (v^2 / (2 * 75.0 * .001)) = 10 * log10( v^2 * 15) + // Calculate the FFT for any remaining points - const float real = *inputPtr++ * iNormalizationFactor; - const float imag = *inputPtr++ * iNormalizationFactor; + for (; number < num_points; number++) { + // Calculate dBm + // 50 ohm load assumption + // 10 * log10 (v^2 / (2 * 50.0 * .001)) = 10 * log10( v^2 * 10) + // 75 ohm load assumption + // 10 * log10 (v^2 / (2 * 75.0 * .001)) = 10 * log10( v^2 * 15) - *destPtr = 10.0*log10f(((real * real) + (imag * imag)) + 1e-20); + const float real = *inputPtr++ * iNormalizationFactor; + const float imag = *inputPtr++ * iNormalizationFactor; - destPtr++; - } + *destPtr = 10.0 * log10f(((real * real) + (imag * imag)) + 1e-20); + destPtr++; + } } #endif /* LV_HAVE_SSE3 */ @@ -141,7 +142,10 @@ volk_32fc_s32f_power_spectrum_32f_a_sse3(float* logPowerOutput, const lv_32fc_t* #include static inline void -volk_32fc_s32f_power_spectrum_32f_neon(float* logPowerOutput, const lv_32fc_t* complexFFTInput, const float normalizationFactor, unsigned int num_points) +volk_32fc_s32f_power_spectrum_32f_neon(float* logPowerOutput, + const lv_32fc_t* complexFFTInput, + const float normalizationFactor, + unsigned int num_points) { float* logPowerOutputPtr = logPowerOutput; const lv_32fc_t* complexFFTInputPtr = complexFFTInput; @@ -151,14 +155,14 @@ volk_32fc_s32f_power_spectrum_32f_neon(float* logPowerOutput, const lv_32fc_t* c float32x4x2_t fft_vec; float32x4_t log_pwr_vec; float32x4_t mag_squared_vec; - + const float inv_ln10_10 = 4.34294481903f; // 10.0/ln(10.) 
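    #include <math.h>

    /* For reference, the per-sample arithmetic that both the SIMD branches and
     * the tail loops in this file reduce to (illustrative helper, one sample,
     * inv_norm = 1/normalizationFactor; the 1e-20 term keeps log10f() away
     * from -inf on zero input):
     *
     *     dB = 10 * log10(re*re + im*im + 1e-20)
     *
     * The SIMD paths compute the same value via a natural log scaled by the
     * 10/ln(10) constant noted above. */
    static inline float log_power_point_ref(float re, float im, float inv_norm)
    {
        const float r = re * inv_norm;
        const float i = im * inv_norm;
        return 10.0f * log10f(r * r + i * i + 1e-20f);
    }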
- - for(number = 0; number < quarter_points; number++) { + + for (number = 0; number < quarter_points; number++) { // Load fft_vec = vld2q_f32((float*)complexFFTInputPtr); // Prefetch next 4 - __VOLK_PREFETCH(complexFFTInputPtr+4); + __VOLK_PREFETCH(complexFFTInputPtr + 4); // Normalize fft_vec.val[0] = vmulq_n_f32(fft_vec.val[0], iNormalizationFactor); fft_vec.val[1] = vmulq_n_f32(fft_vec.val[1], iNormalizationFactor); @@ -167,12 +171,12 @@ volk_32fc_s32f_power_spectrum_32f_neon(float* logPowerOutput, const lv_32fc_t* c // Store vst1q_f32(logPowerOutputPtr, log_pwr_vec); // Move pointers ahead - complexFFTInputPtr+=4; - logPowerOutputPtr+=4; + complexFFTInputPtr += 4; + logPowerOutputPtr += 4; } - + // deal with the rest - for(number = quarter_points * 4; number < num_points; number++) { + for (number = quarter_points * 4; number < num_points; number++) { const float real = lv_creal(*complexFFTInputPtr) * iNormalizationFactor; const float imag = lv_cimag(*complexFFTInputPtr) * iNormalizationFactor; *logPowerOutputPtr = 10.0 * log10f(((real * real) + (imag * imag)) + 1e-20); @@ -186,27 +190,29 @@ volk_32fc_s32f_power_spectrum_32f_neon(float* logPowerOutput, const lv_32fc_t* c #ifdef LV_HAVE_GENERIC static inline void -volk_32fc_s32f_power_spectrum_32f_generic(float* logPowerOutput, const lv_32fc_t* complexFFTInput, - const float normalizationFactor, unsigned int num_points) +volk_32fc_s32f_power_spectrum_32f_generic(float* logPowerOutput, + const lv_32fc_t* complexFFTInput, + const float normalizationFactor, + unsigned int num_points) { - // Calculate the Power of the complex point - const float* inputPtr = (float*)complexFFTInput; - float* realFFTDataPointsPtr = logPowerOutput; - const float iNormalizationFactor = 1.0 / normalizationFactor; - unsigned int point; - for(point = 0; point < num_points; point++){ - // Calculate dBm - // 50 ohm load assumption - // 10 * log10 (v^2 / (2 * 50.0 * .001)) = 10 * log10( v^2 * 10) - // 75 ohm load assumption - // 10 * log10 (v^2 / (2 * 75.0 * .001)) = 10 * log10( v^2 * 15) - - const float real = *inputPtr++ * iNormalizationFactor; - const float imag = *inputPtr++ * iNormalizationFactor; - - *realFFTDataPointsPtr = 10.0*log10f(((real * real) + (imag * imag)) + 1e-20); - realFFTDataPointsPtr++; - } + // Calculate the Power of the complex point + const float* inputPtr = (float*)complexFFTInput; + float* realFFTDataPointsPtr = logPowerOutput; + const float iNormalizationFactor = 1.0 / normalizationFactor; + unsigned int point; + for (point = 0; point < num_points; point++) { + // Calculate dBm + // 50 ohm load assumption + // 10 * log10 (v^2 / (2 * 50.0 * .001)) = 10 * log10( v^2 * 10) + // 75 ohm load assumption + // 10 * log10 (v^2 / (2 * 75.0 * .001)) = 10 * log10( v^2 * 15) + + const float real = *inputPtr++ * iNormalizationFactor; + const float imag = *inputPtr++ * iNormalizationFactor; + + *realFFTDataPointsPtr = 10.0 * log10f(((real * real) + (imag * imag)) + 1e-20); + realFFTDataPointsPtr++; + } } #endif /* LV_HAVE_GENERIC */ diff --git a/kernels/volk/volk_32fc_s32f_x2_power_spectral_density_32f.h b/kernels/volk/volk_32fc_s32f_x2_power_spectral_density_32f.h index 3260b08..37ca43c 100644 --- a/kernels/volk/volk_32fc_s32f_x2_power_spectral_density_32f.h +++ b/kernels/volk/volk_32fc_s32f_x2_power_spectral_density_32f.h @@ -29,14 +29,15 @@ * * Dispatcher Prototype * \code - * void volk_32fc_s32f_x2_power_spectral_density_32f(float* logPowerOutput, const lv_32fc_t* complexFFTInput, const float normalizationFactor, const float rbw, unsigned int 
num_points) - * \endcode + * void volk_32fc_s32f_x2_power_spectral_density_32f(float* logPowerOutput, const + * lv_32fc_t* complexFFTInput, const float normalizationFactor, const float rbw, unsigned + * int num_points) \endcode * * \b Inputs * \li complexFFTInput The complex data output from the FFT point. - * \li normalizationFactor: This value is divided against all the input values before the power is calculated. - * \li rbw: The resolution bandwidth of the fft spectrum - * \li num_points: The number of fft data points. + * \li normalizationFactor: This value is divided against all the input values before the + * power is calculated. \li rbw: The resolution bandwidth of the fft spectrum \li + * num_points: The number of fft data points. * * \b Outputs * \li logPowerOutput: The 10.0 * log10((r*r + i*i)/RBW) for each data point. @@ -55,8 +56,8 @@ #define INCLUDED_volk_32fc_s32f_x2_power_spectral_density_32f_a_H #include -#include #include +#include #ifdef LV_HAVE_AVX #include @@ -66,83 +67,84 @@ #endif /* LV_HAVE_LIB_SIMDMATH */ static inline void -volk_32fc_s32f_x2_power_spectral_density_32f_a_avx(float* logPowerOutput, const lv_32fc_t* complexFFTInput, - const float normalizationFactor, const float rbw, +volk_32fc_s32f_x2_power_spectral_density_32f_a_avx(float* logPowerOutput, + const lv_32fc_t* complexFFTInput, + const float normalizationFactor, + const float rbw, unsigned int num_points) { - const float* inputPtr = (const float*)complexFFTInput; - float* destPtr = logPowerOutput; - uint64_t number = 0; - const float iRBW = 1.0 / rbw; - const float iNormalizationFactor = 1.0 / normalizationFactor; + const float* inputPtr = (const float*)complexFFTInput; + float* destPtr = logPowerOutput; + uint64_t number = 0; + const float iRBW = 1.0 / rbw; + const float iNormalizationFactor = 1.0 / normalizationFactor; #ifdef LV_HAVE_LIB_SIMDMATH - __m256 magScalar = _mm256_set1_ps(10.0); - magScalar = _mm256_div_ps(magScalar, logf4(magScalar)); + __m256 magScalar = _mm256_set1_ps(10.0); + magScalar = _mm256_div_ps(magScalar, logf4(magScalar)); - __m256 invRBW = _mm256_set1_ps(iRBW); + __m256 invRBW = _mm256_set1_ps(iRBW); - __m256 invNormalizationFactor = _mm256_set1_ps(iNormalizationFactor); + __m256 invNormalizationFactor = _mm256_set1_ps(iNormalizationFactor); - __m256 power; - __m256 input1, input2; - const uint64_t eighthPoints = num_points / 8; - for(;number < eighthPoints; number++){ - // Load the complex values - input1 =_mm256_load_ps(inputPtr); - inputPtr += 8; - input2 =_mm256_load_ps(inputPtr); - inputPtr += 8; + __m256 power; + __m256 input1, input2; + const uint64_t eighthPoints = num_points / 8; + for (; number < eighthPoints; number++) { + // Load the complex values + input1 = _mm256_load_ps(inputPtr); + inputPtr += 8; + input2 = _mm256_load_ps(inputPtr); + inputPtr += 8; - // Apply the normalization factor - input1 = _mm256_mul_ps(input1, invNormalizationFactor); - input2 = _mm256_mul_ps(input2, invNormalizationFactor); + // Apply the normalization factor + input1 = _mm256_mul_ps(input1, invNormalizationFactor); + input2 = _mm256_mul_ps(input2, invNormalizationFactor); - // Multiply each value by itself - // (r1*r1), (i1*i1), (r2*r2), (i2*i2) - input1 = _mm256_mul_ps(input1, input1); - // (r3*r3), (i3*i3), (r4*r4), (i4*i4) - input2 = _mm256_mul_ps(input2, input2); + // Multiply each value by itself + // (r1*r1), (i1*i1), (r2*r2), (i2*i2) + input1 = _mm256_mul_ps(input1, input1); + // (r3*r3), (i3*i3), (r4*r4), (i4*i4) + input2 = _mm256_mul_ps(input2, input2); - // Horizontal add, to 
add (r*r) + (i*i) for each complex value - // (r1*r1)+(i1*i1), (r2*r2) + (i2*i2), (r3*r3)+(i3*i3), (r4*r4)+(i4*i4) - inputVal1 = _mm256_permute2f128_ps(input1, input2, 0x20); - inputVal2 = _mm256_permute2f128_ps(input1, input2, 0x31); + // Horizontal add, to add (r*r) + (i*i) for each complex value + // (r1*r1)+(i1*i1), (r2*r2) + (i2*i2), (r3*r3)+(i3*i3), (r4*r4)+(i4*i4) + inputVal1 = _mm256_permute2f128_ps(input1, input2, 0x20); + inputVal2 = _mm256_permute2f128_ps(input1, input2, 0x31); - power = _mm256_hadd_ps(inputVal1, inputVal2); + power = _mm256_hadd_ps(inputVal1, inputVal2); - // Divide by the rbw - power = _mm256_mul_ps(power, invRBW); + // Divide by the rbw + power = _mm256_mul_ps(power, invRBW); - // Calculate the natural log power - power = logf4(power); + // Calculate the natural log power + power = logf4(power); - // Convert to log10 and multiply by 10.0 - power = _mm256_mul_ps(power, magScalar); + // Convert to log10 and multiply by 10.0 + power = _mm256_mul_ps(power, magScalar); - // Store the floating point results - _mm256_store_ps(destPtr, power); + // Store the floating point results + _mm256_store_ps(destPtr, power); - destPtr += 8; - } + destPtr += 8; + } - number = eighthPoints*8; + number = eighthPoints * 8; #endif /* LV_HAVE_LIB_SIMDMATH */ - // Calculate the FFT for any remaining points - for(; number < num_points; number++){ - // Calculate dBm - // 50 ohm load assumption - // 10 * log10 (v^2 / (2 * 50.0 * .001)) = 10 * log10( v^2 * 10) - // 75 ohm load assumption - // 10 * log10 (v^2 / (2 * 75.0 * .001)) = 10 * log10( v^2 * 15) - - const float real = *inputPtr++ * iNormalizationFactor; - const float imag = *inputPtr++ * iNormalizationFactor; - - *destPtr = 10.0*log10f((((real * real) + (imag * imag)) + 1e-20) * iRBW); - destPtr++; - } - + // Calculate the FFT for any remaining points + for (; number < num_points; number++) { + // Calculate dBm + // 50 ohm load assumption + // 10 * log10 (v^2 / (2 * 50.0 * .001)) = 10 * log10( v^2 * 10) + // 75 ohm load assumption + // 10 * log10 (v^2 / (2 * 75.0 * .001)) = 10 * log10( v^2 * 15) + + const float real = *inputPtr++ * iNormalizationFactor; + const float imag = *inputPtr++ * iNormalizationFactor; + + *destPtr = 10.0 * log10f((((real * real) + (imag * imag)) + 1e-20) * iRBW); + destPtr++; + } } #endif /* LV_HAVE_AVX */ @@ -150,86 +152,86 @@ volk_32fc_s32f_x2_power_spectral_density_32f_a_avx(float* logPowerOutput, const #include - #ifdef LV_HAVE_LIB_SIMDMATH #include #endif /* LV_HAVE_LIB_SIMDMATH */ static inline void -volk_32fc_s32f_x2_power_spectral_density_32f_a_sse3(float* logPowerOutput, const lv_32fc_t* complexFFTInput, - const float normalizationFactor, const float rbw, +volk_32fc_s32f_x2_power_spectral_density_32f_a_sse3(float* logPowerOutput, + const lv_32fc_t* complexFFTInput, + const float normalizationFactor, + const float rbw, unsigned int num_points) { - const float* inputPtr = (const float*)complexFFTInput; - float* destPtr = logPowerOutput; - uint64_t number = 0; - const float iRBW = 1.0 / rbw; - const float iNormalizationFactor = 1.0 / normalizationFactor; + const float* inputPtr = (const float*)complexFFTInput; + float* destPtr = logPowerOutput; + uint64_t number = 0; + const float iRBW = 1.0 / rbw; + const float iNormalizationFactor = 1.0 / normalizationFactor; #ifdef LV_HAVE_LIB_SIMDMATH - __m128 magScalar = _mm_set_ps1(10.0); - magScalar = _mm_div_ps(magScalar, logf4(magScalar)); + __m128 magScalar = _mm_set_ps1(10.0); + magScalar = _mm_div_ps(magScalar, logf4(magScalar)); - __m128 invRBW = 
_mm_set_ps1(iRBW); + __m128 invRBW = _mm_set_ps1(iRBW); - __m128 invNormalizationFactor = _mm_set_ps1(iNormalizationFactor); + __m128 invNormalizationFactor = _mm_set_ps1(iNormalizationFactor); - __m128 power; - __m128 input1, input2; - const uint64_t quarterPoints = num_points / 4; - for(;number < quarterPoints; number++){ - // Load the complex values - input1 =_mm_load_ps(inputPtr); - inputPtr += 4; - input2 =_mm_load_ps(inputPtr); - inputPtr += 4; + __m128 power; + __m128 input1, input2; + const uint64_t quarterPoints = num_points / 4; + for (; number < quarterPoints; number++) { + // Load the complex values + input1 = _mm_load_ps(inputPtr); + inputPtr += 4; + input2 = _mm_load_ps(inputPtr); + inputPtr += 4; - // Apply the normalization factor - input1 = _mm_mul_ps(input1, invNormalizationFactor); - input2 = _mm_mul_ps(input2, invNormalizationFactor); + // Apply the normalization factor + input1 = _mm_mul_ps(input1, invNormalizationFactor); + input2 = _mm_mul_ps(input2, invNormalizationFactor); - // Multiply each value by itself - // (r1*r1), (i1*i1), (r2*r2), (i2*i2) - input1 = _mm_mul_ps(input1, input1); - // (r3*r3), (i3*i3), (r4*r4), (i4*i4) - input2 = _mm_mul_ps(input2, input2); + // Multiply each value by itself + // (r1*r1), (i1*i1), (r2*r2), (i2*i2) + input1 = _mm_mul_ps(input1, input1); + // (r3*r3), (i3*i3), (r4*r4), (i4*i4) + input2 = _mm_mul_ps(input2, input2); - // Horizontal add, to add (r*r) + (i*i) for each complex value - // (r1*r1)+(i1*i1), (r2*r2) + (i2*i2), (r3*r3)+(i3*i3), (r4*r4)+(i4*i4) - power = _mm_hadd_ps(input1, input2); + // Horizontal add, to add (r*r) + (i*i) for each complex value + // (r1*r1)+(i1*i1), (r2*r2) + (i2*i2), (r3*r3)+(i3*i3), (r4*r4)+(i4*i4) + power = _mm_hadd_ps(input1, input2); - // Divide by the rbw - power = _mm_mul_ps(power, invRBW); + // Divide by the rbw + power = _mm_mul_ps(power, invRBW); - // Calculate the natural log power - power = logf4(power); + // Calculate the natural log power + power = logf4(power); - // Convert to log10 and multiply by 10.0 - power = _mm_mul_ps(power, magScalar); + // Convert to log10 and multiply by 10.0 + power = _mm_mul_ps(power, magScalar); - // Store the floating point results - _mm_store_ps(destPtr, power); + // Store the floating point results + _mm_store_ps(destPtr, power); - destPtr += 4; - } + destPtr += 4; + } - number = quarterPoints*4; + number = quarterPoints * 4; #endif /* LV_HAVE_LIB_SIMDMATH */ - // Calculate the FFT for any remaining points - for(; number < num_points; number++){ - // Calculate dBm - // 50 ohm load assumption - // 10 * log10 (v^2 / (2 * 50.0 * .001)) = 10 * log10( v^2 * 10) - // 75 ohm load assumption - // 10 * log10 (v^2 / (2 * 75.0 * .001)) = 10 * log10( v^2 * 15) - - const float real = *inputPtr++ * iNormalizationFactor; - const float imag = *inputPtr++ * iNormalizationFactor; - - *destPtr = 10.0*log10f((((real * real) + (imag * imag)) + 1e-20) * iRBW); - destPtr++; - } - + // Calculate the FFT for any remaining points + for (; number < num_points; number++) { + // Calculate dBm + // 50 ohm load assumption + // 10 * log10 (v^2 / (2 * 50.0 * .001)) = 10 * log10( v^2 * 10) + // 75 ohm load assumption + // 10 * log10 (v^2 / (2 * 75.0 * .001)) = 10 * log10( v^2 * 15) + + const float real = *inputPtr++ * iNormalizationFactor; + const float imag = *inputPtr++ * iNormalizationFactor; + + *destPtr = 10.0 * log10f((((real * real) + (imag * imag)) + 1e-20) * iRBW); + destPtr++; + } } #endif /* LV_HAVE_SSE3 */ @@ -237,31 +239,34 @@ 
volk_32fc_s32f_x2_power_spectral_density_32f_a_sse3(float* logPowerOutput, const #ifdef LV_HAVE_GENERIC static inline void -volk_32fc_s32f_x2_power_spectral_density_32f_generic(float* logPowerOutput, const lv_32fc_t* complexFFTInput, - const float normalizationFactor, const float rbw, +volk_32fc_s32f_x2_power_spectral_density_32f_generic(float* logPowerOutput, + const lv_32fc_t* complexFFTInput, + const float normalizationFactor, + const float rbw, unsigned int num_points) { - // Calculate the Power of the complex point - const float* inputPtr = (float*)complexFFTInput; - float* realFFTDataPointsPtr = logPowerOutput; - unsigned int point; - const float invRBW = 1.0 / rbw; - const float iNormalizationFactor = 1.0 / normalizationFactor; - - for(point = 0; point < num_points; point++){ - // Calculate dBm - // 50 ohm load assumption - // 10 * log10 (v^2 / (2 * 50.0 * .001)) = 10 * log10( v^2 * 10) - // 75 ohm load assumption - // 10 * log10 (v^2 / (2 * 75.0 * .001)) = 10 * log10( v^2 * 15) - - const float real = *inputPtr++ * iNormalizationFactor; - const float imag = *inputPtr++ * iNormalizationFactor; - - *realFFTDataPointsPtr = 10.0*log10f((((real * real) + (imag * imag)) + 1e-20) * invRBW); - - realFFTDataPointsPtr++; - } + // Calculate the Power of the complex point + const float* inputPtr = (float*)complexFFTInput; + float* realFFTDataPointsPtr = logPowerOutput; + unsigned int point; + const float invRBW = 1.0 / rbw; + const float iNormalizationFactor = 1.0 / normalizationFactor; + + for (point = 0; point < num_points; point++) { + // Calculate dBm + // 50 ohm load assumption + // 10 * log10 (v^2 / (2 * 50.0 * .001)) = 10 * log10( v^2 * 10) + // 75 ohm load assumption + // 10 * log10 (v^2 / (2 * 75.0 * .001)) = 10 * log10( v^2 * 15) + + const float real = *inputPtr++ * iNormalizationFactor; + const float imag = *inputPtr++ * iNormalizationFactor; + + *realFFTDataPointsPtr = + 10.0 * log10f((((real * real) + (imag * imag)) + 1e-20) * invRBW); + + realFFTDataPointsPtr++; + } } #endif /* LV_HAVE_GENERIC */ diff --git a/kernels/volk/volk_32fc_s32fc_multiply_32fc.h b/kernels/volk/volk_32fc_s32fc_multiply_32fc.h index fe416b4..840008a 100644 --- a/kernels/volk/volk_32fc_s32fc_multiply_32fc.h +++ b/kernels/volk/volk_32fc_s32fc_multiply_32fc.h @@ -30,8 +30,8 @@ * * Dispatcher Prototype * \code - * void volk_32fc_s32fc_multiply_32fc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points); - * \endcode + * void volk_32fc_s32fc_multiply_32fc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const + * lv_32fc_t scalar, unsigned int num_points); \endcode * * \b Inputs * \li aVector: The input vector to be multiplied. 
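In other words, each output element is the corresponding input element times one fixed complex scalar. A minimal sketch of that reference operation, using C99 complex and illustrative names (the kernels below operate on lv_32fc_t and pick their SIMD width per architecture):

    #include <complex.h>

    /* Reference: out[n] = in[n] * scalar for every element. */
    static void multiply_by_scalar_ref(float complex* out,
                                       const float complex* in,
                                       float complex scalar,
                                       unsigned int num_points)
    {
        for (unsigned int n = 0; n < num_points; n++) {
            out[n] = in[n] * scalar;
        }
    }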
@@ -76,15 +76,19 @@ #ifndef INCLUDED_volk_32fc_s32fc_multiply_32fc_u_H #define INCLUDED_volk_32fc_s32fc_multiply_32fc_u_H +#include #include #include #include -#include #if LV_HAVE_AVX && LV_HAVE_FMA #include -static inline void volk_32fc_s32fc_multiply_32fc_u_avx_fma(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){ +static inline void volk_32fc_s32fc_multiply_32fc_u_avx_fma(lv_32fc_t* cVector, + const lv_32fc_t* aVector, + const lv_32fc_t scalar, + unsigned int num_points) +{ unsigned int number = 0; unsigned int i = 0; const unsigned int quarterPoints = num_points / 4; @@ -97,34 +101,38 @@ static inline void volk_32fc_s32fc_multiply_32fc_u_avx_fma(lv_32fc_t* cVector, c yl = _mm256_set1_ps(lv_creal(scalar)); yh = _mm256_set1_ps(lv_cimag(scalar)); - for(;number < quarterPoints; number++){ - x = _mm256_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi + for (; number < quarterPoints; number++) { + x = _mm256_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi - tmp1 = x; + tmp1 = x; - x = _mm256_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br + x = _mm256_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br - tmp2 = _mm256_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + tmp2 = _mm256_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - z = _mm256_fmaddsub_ps(tmp1, yl,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + z = _mm256_fmaddsub_ps( + tmp1, yl, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - _mm256_storeu_ps((float*)c,z); // Store the results back into the C container + _mm256_storeu_ps((float*)c, z); // Store the results back into the C container - a += 4; - c += 4; + a += 4; + c += 4; } - for(i = num_points-isodd; i < num_points; i++) { + for (i = num_points - isodd; i < num_points; i++) { *c++ = (*a++) * scalar; } - } #endif /* LV_HAVE_AVX && LV_HAVE_FMA */ #ifdef LV_HAVE_AVX #include -static inline void volk_32fc_s32fc_multiply_32fc_u_avx(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){ +static inline void volk_32fc_s32fc_multiply_32fc_u_avx(lv_32fc_t* cVector, + const lv_32fc_t* aVector, + const lv_32fc_t scalar, + unsigned int num_points) +{ unsigned int number = 0; unsigned int i = 0; const unsigned int quarterPoints = num_points / 4; @@ -137,35 +145,39 @@ static inline void volk_32fc_s32fc_multiply_32fc_u_avx(lv_32fc_t* cVector, const yl = _mm256_set1_ps(lv_creal(scalar)); yh = _mm256_set1_ps(lv_cimag(scalar)); - for(;number < quarterPoints; number++){ - x = _mm256_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi + for (; number < quarterPoints; number++) { + x = _mm256_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi - tmp1 = _mm256_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + tmp1 = _mm256_mul_ps(x, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - x = _mm256_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br + x = _mm256_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br - tmp2 = _mm256_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + tmp2 = _mm256_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - z = _mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + z = _mm256_addsub_ps(tmp1, + tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - _mm256_storeu_ps((float*)c,z); // Store the results back into the C container + _mm256_storeu_ps((float*)c, z); // Store the results back into the C container - a += 4; - c += 4; + a += 4; + c += 4; } 
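    /* Scalar view of the shuffle/addsub decomposition used by the AVX and SSE3
     * loops in this file (illustrative names): tmp1 holds a*Re(s), tmp2 holds
     * swap(a)*Im(s), and the addsub step subtracts in the real lanes and adds
     * in the imaginary lanes, which yields the usual complex product. */
    static inline void cmul_decomposed_ref(
        float ar, float ai, float sr, float si, float* out_re, float* out_im)
    {
        const float tmp1_re = ar * sr, tmp1_im = ai * sr; /* x  * yl              */
        const float tmp2_re = ai * si, tmp2_im = ar * si; /* x' * yh, x' = swapped */
        *out_re = tmp1_re - tmp2_re;                      /* addsub: subtract lane */
        *out_im = tmp1_im + tmp2_im;                      /* addsub: add lane      */
    }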
- for(i = num_points-isodd; i < num_points; i++) { + for (i = num_points - isodd; i < num_points; i++) { *c++ = (*a++) * scalar; } - } #endif /* LV_HAVE_AVX */ #ifdef LV_HAVE_SSE3 #include -static inline void volk_32fc_s32fc_multiply_32fc_u_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){ - unsigned int number = 0; +static inline void volk_32fc_s32fc_multiply_32fc_u_sse3(lv_32fc_t* cVector, + const lv_32fc_t* aVector, + const lv_32fc_t scalar, + unsigned int num_points) +{ + unsigned int number = 0; const unsigned int halfPoints = num_points / 2; __m128 x, yl, yh, z, tmp1, tmp2; @@ -176,53 +188,58 @@ static inline void volk_32fc_s32fc_multiply_32fc_u_sse3(lv_32fc_t* cVector, cons yl = _mm_set_ps1(lv_creal(scalar)); yh = _mm_set_ps1(lv_cimag(scalar)); - for(;number < halfPoints; number++){ + for (; number < halfPoints; number++) { - x = _mm_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi + x = _mm_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi - tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + tmp1 = _mm_mul_ps(x, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br + x = _mm_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br - tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + tmp2 = _mm_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + z = _mm_addsub_ps(tmp1, + tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - _mm_storeu_ps((float*)c,z); // Store the results back into the C container + _mm_storeu_ps((float*)c, z); // Store the results back into the C container - a += 2; - c += 2; + a += 2; + c += 2; } - if((num_points % 2) != 0) { - *c = (*a) * scalar; + if ((num_points % 2) != 0) { + *c = (*a) * scalar; } } #endif /* LV_HAVE_SSE */ #ifdef LV_HAVE_GENERIC -static inline void volk_32fc_s32fc_multiply_32fc_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){ +static inline void volk_32fc_s32fc_multiply_32fc_generic(lv_32fc_t* cVector, + const lv_32fc_t* aVector, + const lv_32fc_t scalar, + unsigned int num_points) +{ lv_32fc_t* cPtr = cVector; const lv_32fc_t* aPtr = aVector; unsigned int number = num_points; // unwrap loop - while (number >= 8){ - *cPtr++ = (*aPtr++) * scalar; - *cPtr++ = (*aPtr++) * scalar; - *cPtr++ = (*aPtr++) * scalar; - *cPtr++ = (*aPtr++) * scalar; - *cPtr++ = (*aPtr++) * scalar; - *cPtr++ = (*aPtr++) * scalar; - *cPtr++ = (*aPtr++) * scalar; - *cPtr++ = (*aPtr++) * scalar; - number -= 8; + while (number >= 8) { + *cPtr++ = (*aPtr++) * scalar; + *cPtr++ = (*aPtr++) * scalar; + *cPtr++ = (*aPtr++) * scalar; + *cPtr++ = (*aPtr++) * scalar; + *cPtr++ = (*aPtr++) * scalar; + *cPtr++ = (*aPtr++) * scalar; + *cPtr++ = (*aPtr++) * scalar; + *cPtr++ = (*aPtr++) * scalar; + number -= 8; } // clean up any remaining while (number-- > 0) - *cPtr++ = *aPtr++ * scalar; + *cPtr++ = *aPtr++ * scalar; } #endif /* LV_HAVE_GENERIC */ @@ -231,15 +248,19 @@ static inline void volk_32fc_s32fc_multiply_32fc_generic(lv_32fc_t* cVector, con #ifndef INCLUDED_volk_32fc_s32fc_multiply_32fc_a_H #define INCLUDED_volk_32fc_s32fc_multiply_32fc_a_H +#include #include #include #include -#include #if LV_HAVE_AVX && LV_HAVE_FMA #include -static inline void volk_32fc_s32fc_multiply_32fc_a_avx_fma(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){ 
+static inline void volk_32fc_s32fc_multiply_32fc_a_avx_fma(lv_32fc_t* cVector, + const lv_32fc_t* aVector, + const lv_32fc_t scalar, + unsigned int num_points) +{ unsigned int number = 0; unsigned int i = 0; const unsigned int quarterPoints = num_points / 4; @@ -252,27 +273,27 @@ static inline void volk_32fc_s32fc_multiply_32fc_a_avx_fma(lv_32fc_t* cVector, c yl = _mm256_set1_ps(lv_creal(scalar)); yh = _mm256_set1_ps(lv_cimag(scalar)); - for(;number < quarterPoints; number++){ - x = _mm256_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi + for (; number < quarterPoints; number++) { + x = _mm256_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi - tmp1 = x; + tmp1 = x; - x = _mm256_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br + x = _mm256_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br - tmp2 = _mm256_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + tmp2 = _mm256_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - z = _mm256_fmaddsub_ps(tmp1, yl,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + z = _mm256_fmaddsub_ps( + tmp1, yl, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - _mm256_store_ps((float*)c,z); // Store the results back into the C container + _mm256_store_ps((float*)c, z); // Store the results back into the C container - a += 4; - c += 4; + a += 4; + c += 4; } - for(i = num_points-isodd; i < num_points; i++) { + for (i = num_points - isodd; i < num_points; i++) { *c++ = (*a++) * scalar; } - } #endif /* LV_HAVE_AVX && LV_HAVE_FMA */ @@ -280,7 +301,11 @@ static inline void volk_32fc_s32fc_multiply_32fc_a_avx_fma(lv_32fc_t* cVector, c #ifdef LV_HAVE_AVX #include -static inline void volk_32fc_s32fc_multiply_32fc_a_avx(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){ +static inline void volk_32fc_s32fc_multiply_32fc_a_avx(lv_32fc_t* cVector, + const lv_32fc_t* aVector, + const lv_32fc_t scalar, + unsigned int num_points) +{ unsigned int number = 0; unsigned int i = 0; const unsigned int quarterPoints = num_points / 4; @@ -293,35 +318,39 @@ static inline void volk_32fc_s32fc_multiply_32fc_a_avx(lv_32fc_t* cVector, const yl = _mm256_set1_ps(lv_creal(scalar)); yh = _mm256_set1_ps(lv_cimag(scalar)); - for(;number < quarterPoints; number++){ - x = _mm256_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi + for (; number < quarterPoints; number++) { + x = _mm256_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi - tmp1 = _mm256_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + tmp1 = _mm256_mul_ps(x, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - x = _mm256_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br + x = _mm256_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br - tmp2 = _mm256_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + tmp2 = _mm256_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - z = _mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + z = _mm256_addsub_ps(tmp1, + tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - _mm256_store_ps((float*)c,z); // Store the results back into the C container + _mm256_store_ps((float*)c, z); // Store the results back into the C container - a += 4; - c += 4; + a += 4; + c += 4; } - for(i = num_points-isodd; i < num_points; i++) { + for (i = num_points - isodd; i < num_points; i++) { *c++ = (*a++) * scalar; } - } #endif /* LV_HAVE_AVX */ #ifdef LV_HAVE_SSE3 #include -static inline void volk_32fc_s32fc_multiply_32fc_a_sse3(lv_32fc_t* cVector, 
const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){ - unsigned int number = 0; +static inline void volk_32fc_s32fc_multiply_32fc_a_sse3(lv_32fc_t* cVector, + const lv_32fc_t* aVector, + const lv_32fc_t scalar, + unsigned int num_points) +{ + unsigned int number = 0; const unsigned int halfPoints = num_points / 2; __m128 x, yl, yh, z, tmp1, tmp2; @@ -332,26 +361,27 @@ static inline void volk_32fc_s32fc_multiply_32fc_a_sse3(lv_32fc_t* cVector, cons yl = _mm_set_ps1(lv_creal(scalar)); yh = _mm_set_ps1(lv_cimag(scalar)); - for(;number < halfPoints; number++){ + for (; number < halfPoints; number++) { - x = _mm_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi + x = _mm_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi - tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + tmp1 = _mm_mul_ps(x, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br + x = _mm_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br - tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + tmp2 = _mm_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + z = _mm_addsub_ps(tmp1, + tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - _mm_store_ps((float*)c,z); // Store the results back into the C container + _mm_store_ps((float*)c, z); // Store the results back into the C container - a += 2; - c += 2; + a += 2; + c += 2; } - if((num_points % 2) != 0) { - *c = (*a) * scalar; + if ((num_points % 2) != 0) { + *c = (*a) * scalar; } } #endif /* LV_HAVE_SSE */ @@ -359,7 +389,11 @@ static inline void volk_32fc_s32fc_multiply_32fc_a_sse3(lv_32fc_t* cVector, cons #ifdef LV_HAVE_NEON #include -static inline void volk_32fc_s32fc_multiply_32fc_neon(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){ +static inline void volk_32fc_s32fc_multiply_32fc_neon(lv_32fc_t* cVector, + const lv_32fc_t* aVector, + const lv_32fc_t scalar, + unsigned int num_points) +{ lv_32fc_t* cPtr = cVector; const lv_32fc_t* aPtr = aVector; unsigned int number = num_points; @@ -370,7 +404,7 @@ static inline void volk_32fc_s32fc_multiply_32fc_neon(lv_32fc_t* cVector, const scalar_val.val[0] = vld1q_dup_f32((const float*)&scalar); scalar_val.val[1] = vld1q_dup_f32(((const float*)&scalar) + 1); - for(number = 0; number < quarter_points; ++number) { + for (number = 0; number < quarter_points; ++number) { a_val = vld2q_f32((float*)aPtr); tmp_imag.val[1] = vmulq_f32(a_val.val[1], scalar_val.val[0]); tmp_imag.val[0] = vmulq_f32(a_val.val[0], scalar_val.val[0]); @@ -383,35 +417,39 @@ static inline void volk_32fc_s32fc_multiply_32fc_neon(lv_32fc_t* cVector, const cPtr += 4; } - for(number = quarter_points*4; number < num_points; number++){ - *cPtr++ = *aPtr++ * scalar; + for (number = quarter_points * 4; number < num_points; number++) { + *cPtr++ = *aPtr++ * scalar; } } #endif /* LV_HAVE_NEON */ #ifdef LV_HAVE_GENERIC -static inline void volk_32fc_s32fc_multiply_32fc_a_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){ +static inline void volk_32fc_s32fc_multiply_32fc_a_generic(lv_32fc_t* cVector, + const lv_32fc_t* aVector, + const lv_32fc_t scalar, + unsigned int num_points) +{ lv_32fc_t* cPtr = cVector; const lv_32fc_t* aPtr = aVector; unsigned int number = num_points; // unwrap loop - while (number >= 8){ - *cPtr++ = (*aPtr++) * scalar; - *cPtr++ = (*aPtr++) * 
scalar; - *cPtr++ = (*aPtr++) * scalar; - *cPtr++ = (*aPtr++) * scalar; - *cPtr++ = (*aPtr++) * scalar; - *cPtr++ = (*aPtr++) * scalar; - *cPtr++ = (*aPtr++) * scalar; - *cPtr++ = (*aPtr++) * scalar; - number -= 8; + while (number >= 8) { + *cPtr++ = (*aPtr++) * scalar; + *cPtr++ = (*aPtr++) * scalar; + *cPtr++ = (*aPtr++) * scalar; + *cPtr++ = (*aPtr++) * scalar; + *cPtr++ = (*aPtr++) * scalar; + *cPtr++ = (*aPtr++) * scalar; + *cPtr++ = (*aPtr++) * scalar; + *cPtr++ = (*aPtr++) * scalar; + number -= 8; } // clean up any remaining while (number-- > 0) - *cPtr++ = *aPtr++ * scalar; + *cPtr++ = *aPtr++ * scalar; } #endif /* LV_HAVE_GENERIC */ diff --git a/kernels/volk/volk_32fc_s32fc_rotatorpuppet_32fc.h b/kernels/volk/volk_32fc_s32fc_rotatorpuppet_32fc.h index 181abc5..eba98fe 100644 --- a/kernels/volk/volk_32fc_s32fc_rotatorpuppet_32fc.h +++ b/kernels/volk/volk_32fc_s32fc_rotatorpuppet_32fc.h @@ -25,19 +25,24 @@ #define INCLUDED_volk_32fc_s32fc_rotatorpuppet_32fc_a_H -#include #include #include +#include #ifdef LV_HAVE_GENERIC -static inline void volk_32fc_s32fc_rotatorpuppet_32fc_generic(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, unsigned int num_points){ - lv_32fc_t phase[1] = {lv_cmake(.3, 0.95393)}; +static inline void volk_32fc_s32fc_rotatorpuppet_32fc_generic(lv_32fc_t* outVector, + const lv_32fc_t* inVector, + const lv_32fc_t phase_inc, + unsigned int num_points) +{ + lv_32fc_t phase[1] = { lv_cmake(.3, 0.95393) }; (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase)); - const lv_32fc_t phase_inc_n = phase_inc / hypotf(lv_creal(phase_inc), lv_cimag(phase_inc)); - volk_32fc_s32fc_x2_rotator_32fc_generic(outVector, inVector, phase_inc_n, phase, num_points); - + const lv_32fc_t phase_inc_n = + phase_inc / hypotf(lv_creal(phase_inc), lv_cimag(phase_inc)); + volk_32fc_s32fc_x2_rotator_32fc_generic( + outVector, inVector, phase_inc_n, phase, num_points); } #endif /* LV_HAVE_GENERIC */ @@ -47,12 +52,17 @@ static inline void volk_32fc_s32fc_rotatorpuppet_32fc_generic(lv_32fc_t* outVect #include #include -static inline void volk_32fc_s32fc_rotatorpuppet_32fc_neon(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, unsigned int num_points){ - lv_32fc_t phase[1] = {lv_cmake(.3, 0.95393)}; +static inline void volk_32fc_s32fc_rotatorpuppet_32fc_neon(lv_32fc_t* outVector, + const lv_32fc_t* inVector, + const lv_32fc_t phase_inc, + unsigned int num_points) +{ + lv_32fc_t phase[1] = { lv_cmake(.3, 0.95393) }; (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase)); - const lv_32fc_t phase_inc_n = phase_inc / hypotf(lv_creal(phase_inc), lv_cimag(phase_inc)); - volk_32fc_s32fc_x2_rotator_32fc_neon(outVector, inVector, phase_inc_n, phase, num_points); - + const lv_32fc_t phase_inc_n = + phase_inc / hypotf(lv_creal(phase_inc), lv_cimag(phase_inc)); + volk_32fc_s32fc_x2_rotator_32fc_neon( + outVector, inVector, phase_inc_n, phase, num_points); } #endif /* LV_HAVE_NEON */ @@ -61,12 +71,17 @@ static inline void volk_32fc_s32fc_rotatorpuppet_32fc_neon(lv_32fc_t* outVector, #ifdef LV_HAVE_SSE4_1 #include -static inline void volk_32fc_s32fc_rotatorpuppet_32fc_a_sse4_1(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, unsigned int num_points){ - lv_32fc_t phase[1] = {lv_cmake(.3, .95393)}; +static inline void volk_32fc_s32fc_rotatorpuppet_32fc_a_sse4_1(lv_32fc_t* outVector, + const lv_32fc_t* inVector, + const lv_32fc_t phase_inc, + unsigned int num_points) +{ + lv_32fc_t phase[1] = { lv_cmake(.3, .95393) }; (*phase) /= 
hypotf(lv_creal(*phase), lv_cimag(*phase)); - const lv_32fc_t phase_inc_n = phase_inc / hypotf(lv_creal(phase_inc), lv_cimag(phase_inc)); - volk_32fc_s32fc_x2_rotator_32fc_a_sse4_1(outVector, inVector, phase_inc_n, phase, num_points); - + const lv_32fc_t phase_inc_n = + phase_inc / hypotf(lv_creal(phase_inc), lv_cimag(phase_inc)); + volk_32fc_s32fc_x2_rotator_32fc_a_sse4_1( + outVector, inVector, phase_inc_n, phase, num_points); } #endif /* LV_HAVE_SSE4_1 */ @@ -74,12 +89,17 @@ static inline void volk_32fc_s32fc_rotatorpuppet_32fc_a_sse4_1(lv_32fc_t* outVec #ifdef LV_HAVE_SSE4_1 #include -static inline void volk_32fc_s32fc_rotatorpuppet_32fc_u_sse4_1(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, unsigned int num_points){ - lv_32fc_t phase[1] = {lv_cmake(.3, .95393)}; +static inline void volk_32fc_s32fc_rotatorpuppet_32fc_u_sse4_1(lv_32fc_t* outVector, + const lv_32fc_t* inVector, + const lv_32fc_t phase_inc, + unsigned int num_points) +{ + lv_32fc_t phase[1] = { lv_cmake(.3, .95393) }; (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase)); - const lv_32fc_t phase_inc_n = phase_inc / hypotf(lv_creal(phase_inc), lv_cimag(phase_inc)); - volk_32fc_s32fc_x2_rotator_32fc_u_sse4_1(outVector, inVector, phase_inc_n, phase, num_points); - + const lv_32fc_t phase_inc_n = + phase_inc / hypotf(lv_creal(phase_inc), lv_cimag(phase_inc)); + volk_32fc_s32fc_x2_rotator_32fc_u_sse4_1( + outVector, inVector, phase_inc_n, phase, num_points); } #endif /* LV_HAVE_SSE4_1 */ @@ -88,11 +108,17 @@ static inline void volk_32fc_s32fc_rotatorpuppet_32fc_u_sse4_1(lv_32fc_t* outVec #ifdef LV_HAVE_AVX #include -static inline void volk_32fc_s32fc_rotatorpuppet_32fc_a_avx(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, unsigned int num_points){ - lv_32fc_t phase[1] = {lv_cmake(.3, .95393)}; +static inline void volk_32fc_s32fc_rotatorpuppet_32fc_a_avx(lv_32fc_t* outVector, + const lv_32fc_t* inVector, + const lv_32fc_t phase_inc, + unsigned int num_points) +{ + lv_32fc_t phase[1] = { lv_cmake(.3, .95393) }; (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase)); - const lv_32fc_t phase_inc_n = phase_inc / hypotf(lv_creal(phase_inc), lv_cimag(phase_inc)); - volk_32fc_s32fc_x2_rotator_32fc_a_avx(outVector, inVector, phase_inc_n, phase, num_points); + const lv_32fc_t phase_inc_n = + phase_inc / hypotf(lv_creal(phase_inc), lv_cimag(phase_inc)); + volk_32fc_s32fc_x2_rotator_32fc_a_avx( + outVector, inVector, phase_inc_n, phase, num_points); } #endif /* LV_HAVE_AVX */ @@ -101,11 +127,17 @@ static inline void volk_32fc_s32fc_rotatorpuppet_32fc_a_avx(lv_32fc_t* outVector #ifdef LV_HAVE_AVX #include -static inline void volk_32fc_s32fc_rotatorpuppet_32fc_u_avx(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, unsigned int num_points){ - lv_32fc_t phase[1] = {lv_cmake(.3, .95393)}; +static inline void volk_32fc_s32fc_rotatorpuppet_32fc_u_avx(lv_32fc_t* outVector, + const lv_32fc_t* inVector, + const lv_32fc_t phase_inc, + unsigned int num_points) +{ + lv_32fc_t phase[1] = { lv_cmake(.3, .95393) }; (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase)); - const lv_32fc_t phase_inc_n = phase_inc / hypotf(lv_creal(phase_inc), lv_cimag(phase_inc)); - volk_32fc_s32fc_x2_rotator_32fc_u_avx(outVector, inVector, phase_inc_n, phase, num_points); + const lv_32fc_t phase_inc_n = + phase_inc / hypotf(lv_creal(phase_inc), lv_cimag(phase_inc)); + volk_32fc_s32fc_x2_rotator_32fc_u_avx( + outVector, inVector, phase_inc_n, phase, num_points); } #endif /* LV_HAVE_AVX */ 
@@ -113,11 +145,17 @@ static inline void volk_32fc_s32fc_rotatorpuppet_32fc_u_avx(lv_32fc_t* outVector #if LV_HAVE_AVX && LV_HAVE_FMA #include -static inline void volk_32fc_s32fc_rotatorpuppet_32fc_a_avx_fma(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, unsigned int num_points){ - lv_32fc_t phase[1] = {lv_cmake(.3, .95393)}; +static inline void volk_32fc_s32fc_rotatorpuppet_32fc_a_avx_fma(lv_32fc_t* outVector, + const lv_32fc_t* inVector, + const lv_32fc_t phase_inc, + unsigned int num_points) +{ + lv_32fc_t phase[1] = { lv_cmake(.3, .95393) }; (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase)); - const lv_32fc_t phase_inc_n = phase_inc / hypotf(lv_creal(phase_inc), lv_cimag(phase_inc)); - volk_32fc_s32fc_x2_rotator_32fc_a_avx_fma(outVector, inVector, phase_inc_n, phase, num_points); + const lv_32fc_t phase_inc_n = + phase_inc / hypotf(lv_creal(phase_inc), lv_cimag(phase_inc)); + volk_32fc_s32fc_x2_rotator_32fc_a_avx_fma( + outVector, inVector, phase_inc_n, phase, num_points); } #endif /* LV_HAVE_AVX && LV_HAVE_FMA*/ @@ -126,11 +164,17 @@ static inline void volk_32fc_s32fc_rotatorpuppet_32fc_a_avx_fma(lv_32fc_t* outVe #if LV_HAVE_AVX && LV_HAVE_FMA #include -static inline void volk_32fc_s32fc_rotatorpuppet_32fc_u_avx_fma(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, unsigned int num_points){ - lv_32fc_t phase[1] = {lv_cmake(.3, .95393)}; +static inline void volk_32fc_s32fc_rotatorpuppet_32fc_u_avx_fma(lv_32fc_t* outVector, + const lv_32fc_t* inVector, + const lv_32fc_t phase_inc, + unsigned int num_points) +{ + lv_32fc_t phase[1] = { lv_cmake(.3, .95393) }; (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase)); - const lv_32fc_t phase_inc_n = phase_inc / hypotf(lv_creal(phase_inc), lv_cimag(phase_inc)); - volk_32fc_s32fc_x2_rotator_32fc_u_avx_fma(outVector, inVector, phase_inc_n, phase, num_points); + const lv_32fc_t phase_inc_n = + phase_inc / hypotf(lv_creal(phase_inc), lv_cimag(phase_inc)); + volk_32fc_s32fc_x2_rotator_32fc_u_avx_fma( + outVector, inVector, phase_inc_n, phase, num_points); } #endif /* LV_HAVE_AVX && LV_HAVE_FMA*/ diff --git a/kernels/volk/volk_32fc_s32fc_x2_rotator_32fc.h b/kernels/volk/volk_32fc_s32fc_x2_rotator_32fc.h index a886458..c97b8cb 100644 --- a/kernels/volk/volk_32fc_s32fc_x2_rotator_32fc.h +++ b/kernels/volk/volk_32fc_s32fc_x2_rotator_32fc.h @@ -30,14 +30,15 @@ * * Dispatcher Prototype * \code - * void volk_32fc_s32fc_x2_rotator_32fc(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, lv_32fc_t* phase, unsigned int num_points) - * \endcode + * void volk_32fc_s32fc_x2_rotator_32fc(lv_32fc_t* outVector, const lv_32fc_t* inVector, + * const lv_32fc_t phase_inc, lv_32fc_t* phase, unsigned int num_points) \endcode * * \b Inputs * \li inVector: Vector to be rotated. * \li phase_inc: rotational velocity. * \li phase: initial phase offset. - * \li num_points: The number of values in inVector to be rotated and stored into outVector. + * \li num_points: The number of values in inVector to be rotated and stored into + * outVector. * * \b Outputs * \li outVector: The vector where the results will be stored. 
@@ -81,31 +82,36 @@ #define INCLUDED_volk_32fc_s32fc_rotator_32fc_a_H -#include +#include #include #include -#include +#include #define ROTATOR_RELOAD 512 #ifdef LV_HAVE_GENERIC -static inline void volk_32fc_s32fc_x2_rotator_32fc_generic(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, lv_32fc_t* phase, unsigned int num_points){ +static inline void volk_32fc_s32fc_x2_rotator_32fc_generic(lv_32fc_t* outVector, + const lv_32fc_t* inVector, + const lv_32fc_t phase_inc, + lv_32fc_t* phase, + unsigned int num_points) +{ unsigned int i = 0; int j = 0; - for(i = 0; i < (unsigned int)(num_points/ROTATOR_RELOAD); ++i) { - for(j = 0; j < ROTATOR_RELOAD; ++j) { + for (i = 0; i < (unsigned int)(num_points / ROTATOR_RELOAD); ++i) { + for (j = 0; j < ROTATOR_RELOAD; ++j) { *outVector++ = *inVector++ * (*phase); (*phase) *= phase_inc; } (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase)); } - for(i = 0; i < num_points%ROTATOR_RELOAD; ++i) { + for (i = 0; i < num_points % ROTATOR_RELOAD; ++i) { *outVector++ = *inVector++ * (*phase); (*phase) *= phase_inc; } - if(i){ + if (i) { // Make sure, we normalize phase on every call! (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase)); } @@ -118,43 +124,47 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_generic(lv_32fc_t* outVector, #include #include -static inline void volk_32fc_s32fc_x2_rotator_32fc_neon(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, lv_32fc_t* phase, unsigned int num_points) +static inline void volk_32fc_s32fc_x2_rotator_32fc_neon(lv_32fc_t* outVector, + const lv_32fc_t* inVector, + const lv_32fc_t phase_inc, + lv_32fc_t* phase, + unsigned int num_points) { lv_32fc_t* outputVectorPtr = outVector; const lv_32fc_t* inputVectorPtr = inVector; lv_32fc_t incr = 1; - lv_32fc_t phasePtr[4] = {(*phase), (*phase), (*phase), (*phase)}; + lv_32fc_t phasePtr[4] = { (*phase), (*phase), (*phase), (*phase) }; float32x4x2_t input_vec; float32x4x2_t output_vec; - + unsigned int i = 0, j = 0; const unsigned int quarter_points = num_points / 4; - - for(i = 0; i < 4; ++i) { + + for (i = 0; i < 4; ++i) { phasePtr[i] *= incr; incr *= (phase_inc); } - + // Notice that incr has be incremented in the previous loop - const lv_32fc_t incrPtr[4] = {incr, incr, incr, incr}; - const float32x4x2_t incr_vec = vld2q_f32((float*) incrPtr); - float32x4x2_t phase_vec = vld2q_f32((float*) phasePtr); - - for(i = 0; i < (unsigned int)(quarter_points/ROTATOR_RELOAD); i++) { - for(j = 0; j < ROTATOR_RELOAD; j++) { - input_vec = vld2q_f32((float*) inputVectorPtr); + const lv_32fc_t incrPtr[4] = { incr, incr, incr, incr }; + const float32x4x2_t incr_vec = vld2q_f32((float*)incrPtr); + float32x4x2_t phase_vec = vld2q_f32((float*)phasePtr); + + for (i = 0; i < (unsigned int)(quarter_points / ROTATOR_RELOAD); i++) { + for (j = 0; j < ROTATOR_RELOAD; j++) { + input_vec = vld2q_f32((float*)inputVectorPtr); // Prefetch next one, speeds things up - __VOLK_PREFETCH(inputVectorPtr+4); + __VOLK_PREFETCH(inputVectorPtr + 4); // Rotate output_vec = _vmultiply_complexq_f32(input_vec, phase_vec); // Increase phase phase_vec = _vmultiply_complexq_f32(phase_vec, incr_vec); // Store output vst2q_f32((float*)outputVectorPtr, output_vec); - - outputVectorPtr+=4; - inputVectorPtr+=4; + + outputVectorPtr += 4; + inputVectorPtr += 4; } // normalize phase so magnitude doesn't grow because of // floating point rounding error @@ -164,20 +174,20 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_neon(lv_32fc_t* outVector, co phase_vec.val[0] = 
vmulq_f32(phase_vec.val[0], inv_mag); phase_vec.val[1] = vmulq_f32(phase_vec.val[1], inv_mag); } - - for(i = 0; i < quarter_points % ROTATOR_RELOAD; i++) { - input_vec = vld2q_f32((float*) inputVectorPtr); + + for (i = 0; i < quarter_points % ROTATOR_RELOAD; i++) { + input_vec = vld2q_f32((float*)inputVectorPtr); // Prefetch next one, speeds things up - __VOLK_PREFETCH(inputVectorPtr+4); + __VOLK_PREFETCH(inputVectorPtr + 4); // Rotate output_vec = _vmultiply_complexq_f32(input_vec, phase_vec); // Increase phase phase_vec = _vmultiply_complexq_f32(phase_vec, incr_vec); // Store output vst2q_f32((float*)outputVectorPtr, output_vec); - - outputVectorPtr+=4; - inputVectorPtr+=4; + + outputVectorPtr += 4; + inputVectorPtr += 4; } // if(i) == true means we looped above if (i) { @@ -191,13 +201,13 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_neon(lv_32fc_t* outVector, co } // Store current phase vst2q_f32((float*)phasePtr, phase_vec); - + // Deal with the rest - for(i = 0; i < num_points % 4; i++) { + for (i = 0; i < num_points % 4; i++) { *outputVectorPtr++ = *inputVectorPtr++ * phasePtr[0]; phasePtr[0] *= (phase_inc); } - + // For continious phase next time we need to call this function (*phase) = phasePtr[0]; } @@ -208,15 +218,20 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_neon(lv_32fc_t* outVector, co #ifdef LV_HAVE_SSE4_1 #include -static inline void volk_32fc_s32fc_x2_rotator_32fc_a_sse4_1(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, lv_32fc_t* phase, unsigned int num_points){ +static inline void volk_32fc_s32fc_x2_rotator_32fc_a_sse4_1(lv_32fc_t* outVector, + const lv_32fc_t* inVector, + const lv_32fc_t phase_inc, + lv_32fc_t* phase, + unsigned int num_points) +{ lv_32fc_t* cPtr = outVector; const lv_32fc_t* aPtr = inVector; lv_32fc_t incr = 1; - lv_32fc_t phase_Ptr[2] = {(*phase), (*phase)}; + lv_32fc_t phase_Ptr[2] = { (*phase), (*phase) }; unsigned int i, j = 0; - for(i = 0; i < 2; ++i) { + for (i = 0; i < 2; ++i) { phase_Ptr[i] *= incr; incr *= (phase_inc); } @@ -227,13 +242,13 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_a_sse4_1(lv_32fc_t* outVector __m128 aVal, phase_Val, inc_Val, yl, yh, tmp1, tmp2, z, ylp, yhp, tmp1p, tmp2p; phase_Val = _mm_loadu_ps((float*)phase_Ptr); - inc_Val = _mm_set_ps(lv_cimag(incr), lv_creal(incr),lv_cimag(incr), lv_creal(incr)); + inc_Val = _mm_set_ps(lv_cimag(incr), lv_creal(incr), lv_cimag(incr), lv_creal(incr)); const unsigned int halfPoints = num_points / 2; - for(i = 0; i < (unsigned int)(halfPoints/ROTATOR_RELOAD); i++) { - for(j = 0; j < ROTATOR_RELOAD; ++j) { + for (i = 0; i < (unsigned int)(halfPoints / ROTATOR_RELOAD); i++) { + for (j = 0; j < ROTATOR_RELOAD; ++j) { aVal = _mm_load_ps((float*)aPtr); @@ -264,7 +279,7 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_a_sse4_1(lv_32fc_t* outVector tmp2 = _mm_sqrt_ps(tmp1); phase_Val = _mm_div_ps(phase_Val, tmp2); } - for(i = 0; i < halfPoints%ROTATOR_RELOAD; ++i) { + for (i = 0; i < halfPoints % ROTATOR_RELOAD; ++i) { aVal = _mm_load_ps((float*)aPtr); yl = _mm_moveldup_ps(phase_Val); @@ -304,7 +319,6 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_a_sse4_1(lv_32fc_t* outVector } (*phase) = phase_Ptr[0]; - } #endif /* LV_HAVE_SSE4_1 for aligned */ @@ -313,15 +327,20 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_a_sse4_1(lv_32fc_t* outVector #ifdef LV_HAVE_SSE4_1 #include -static inline void volk_32fc_s32fc_x2_rotator_32fc_u_sse4_1(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, lv_32fc_t* phase, 
unsigned int num_points){ +static inline void volk_32fc_s32fc_x2_rotator_32fc_u_sse4_1(lv_32fc_t* outVector, + const lv_32fc_t* inVector, + const lv_32fc_t phase_inc, + lv_32fc_t* phase, + unsigned int num_points) +{ lv_32fc_t* cPtr = outVector; const lv_32fc_t* aPtr = inVector; lv_32fc_t incr = 1; - lv_32fc_t phase_Ptr[2] = {(*phase), (*phase)}; + lv_32fc_t phase_Ptr[2] = { (*phase), (*phase) }; unsigned int i, j = 0; - for(i = 0; i < 2; ++i) { + for (i = 0; i < 2; ++i) { phase_Ptr[i] *= incr; incr *= (phase_inc); } @@ -332,13 +351,13 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_u_sse4_1(lv_32fc_t* outVector __m128 aVal, phase_Val, inc_Val, yl, yh, tmp1, tmp2, z, ylp, yhp, tmp1p, tmp2p; phase_Val = _mm_loadu_ps((float*)phase_Ptr); - inc_Val = _mm_set_ps(lv_cimag(incr), lv_creal(incr),lv_cimag(incr), lv_creal(incr)); + inc_Val = _mm_set_ps(lv_cimag(incr), lv_creal(incr), lv_cimag(incr), lv_creal(incr)); const unsigned int halfPoints = num_points / 2; - for(i = 0; i < (unsigned int)(halfPoints/ROTATOR_RELOAD); i++) { - for(j = 0; j < ROTATOR_RELOAD; ++j) { + for (i = 0; i < (unsigned int)(halfPoints / ROTATOR_RELOAD); i++) { + for (j = 0; j < ROTATOR_RELOAD; ++j) { aVal = _mm_loadu_ps((float*)aPtr); @@ -369,7 +388,7 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_u_sse4_1(lv_32fc_t* outVector tmp2 = _mm_sqrt_ps(tmp1); phase_Val = _mm_div_ps(phase_Val, tmp2); } - for(i = 0; i < halfPoints%ROTATOR_RELOAD; ++i) { + for (i = 0; i < halfPoints % ROTATOR_RELOAD; ++i) { aVal = _mm_loadu_ps((float*)aPtr); yl = _mm_moveldup_ps(phase_Val); @@ -409,7 +428,6 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_u_sse4_1(lv_32fc_t* outVector } (*phase) = phase_Ptr[0]; - } #endif /* LV_HAVE_SSE4_1 */ @@ -419,15 +437,20 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_u_sse4_1(lv_32fc_t* outVector #include #include -static inline void volk_32fc_s32fc_x2_rotator_32fc_a_avx(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, lv_32fc_t* phase, unsigned int num_points){ +static inline void volk_32fc_s32fc_x2_rotator_32fc_a_avx(lv_32fc_t* outVector, + const lv_32fc_t* inVector, + const lv_32fc_t phase_inc, + lv_32fc_t* phase, + unsigned int num_points) +{ lv_32fc_t* cPtr = outVector; const lv_32fc_t* aPtr = inVector; lv_32fc_t incr = lv_cmake(1.0, 0.0); - lv_32fc_t phase_Ptr[4] = {(*phase), (*phase), (*phase), (*phase)}; + lv_32fc_t phase_Ptr[4] = { (*phase), (*phase), (*phase), (*phase) }; unsigned int i, j = 0; - for(i = 0; i < 4; ++i) { + for (i = 0; i < 4; ++i) { phase_Ptr[i] *= incr; incr *= (phase_inc); } @@ -435,16 +458,20 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_a_avx(lv_32fc_t* outVector, c __m256 aVal, phase_Val, z; phase_Val = _mm256_loadu_ps((float*)phase_Ptr); - - const __m256 inc_Val = _mm256_set_ps(lv_cimag(incr), lv_creal(incr), - lv_cimag(incr), lv_creal(incr), - lv_cimag(incr), lv_creal(incr), - lv_cimag(incr), lv_creal(incr)); + + const __m256 inc_Val = _mm256_set_ps(lv_cimag(incr), + lv_creal(incr), + lv_cimag(incr), + lv_creal(incr), + lv_cimag(incr), + lv_creal(incr), + lv_cimag(incr), + lv_creal(incr)); const unsigned int fourthPoints = num_points / 4; - for(i = 0; i < (unsigned int)(fourthPoints/ROTATOR_RELOAD); i++) { - for(j = 0; j < ROTATOR_RELOAD; ++j) { + for (i = 0; i < (unsigned int)(fourthPoints / ROTATOR_RELOAD); i++) { + for (j = 0; j < ROTATOR_RELOAD; ++j) { aVal = _mm256_load_ps((float*)aPtr); @@ -458,8 +485,8 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_a_avx(lv_32fc_t* outVector, c } phase_Val = 
_mm256_normalize_ps(phase_Val); } - - for(i = 0; i < fourthPoints%ROTATOR_RELOAD; ++i) { + + for (i = 0; i < fourthPoints % ROTATOR_RELOAD; ++i) { aVal = _mm256_load_ps((float*)aPtr); z = _mm256_complexmul_ps(aVal, phase_Val); @@ -473,10 +500,10 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_a_avx(lv_32fc_t* outVector, c if (i) { phase_Val = _mm256_normalize_ps(phase_Val); } - + _mm256_storeu_ps((float*)phase_Ptr, phase_Val); (*phase) = phase_Ptr[0]; - volk_32fc_s32fc_x2_rotator_32fc_generic(cPtr, aPtr, phase_inc, phase, num_points%4); + volk_32fc_s32fc_x2_rotator_32fc_generic(cPtr, aPtr, phase_inc, phase, num_points % 4); } #endif /* LV_HAVE_AVX for aligned */ @@ -486,15 +513,20 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_a_avx(lv_32fc_t* outVector, c #include #include -static inline void volk_32fc_s32fc_x2_rotator_32fc_u_avx(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, lv_32fc_t* phase, unsigned int num_points){ +static inline void volk_32fc_s32fc_x2_rotator_32fc_u_avx(lv_32fc_t* outVector, + const lv_32fc_t* inVector, + const lv_32fc_t phase_inc, + lv_32fc_t* phase, + unsigned int num_points) +{ lv_32fc_t* cPtr = outVector; const lv_32fc_t* aPtr = inVector; lv_32fc_t incr = lv_cmake(1.0, 0.0); - lv_32fc_t phase_Ptr[4] = {(*phase), (*phase), (*phase), (*phase)}; + lv_32fc_t phase_Ptr[4] = { (*phase), (*phase), (*phase), (*phase) }; unsigned int i, j = 0; - for(i = 0; i < 4; ++i) { + for (i = 0; i < 4; ++i) { phase_Ptr[i] *= incr; incr *= (phase_inc); } @@ -502,19 +534,23 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_u_avx(lv_32fc_t* outVector, c __m256 aVal, phase_Val, z; phase_Val = _mm256_loadu_ps((float*)phase_Ptr); - - const __m256 inc_Val = _mm256_set_ps(lv_cimag(incr), lv_creal(incr), - lv_cimag(incr), lv_creal(incr), - lv_cimag(incr), lv_creal(incr), - lv_cimag(incr), lv_creal(incr)); - + + const __m256 inc_Val = _mm256_set_ps(lv_cimag(incr), + lv_creal(incr), + lv_cimag(incr), + lv_creal(incr), + lv_cimag(incr), + lv_creal(incr), + lv_cimag(incr), + lv_creal(incr)); + const unsigned int fourthPoints = num_points / 4; - for(i = 0; i < (unsigned int)(fourthPoints/ROTATOR_RELOAD); ++i) { - for(j = 0; j < ROTATOR_RELOAD; ++j) { + for (i = 0; i < (unsigned int)(fourthPoints / ROTATOR_RELOAD); ++i) { + for (j = 0; j < ROTATOR_RELOAD; ++j) { aVal = _mm256_loadu_ps((float*)aPtr); - + z = _mm256_complexmul_ps(aVal, phase_Val); phase_Val = _mm256_complexmul_ps(phase_Val, inc_Val); @@ -524,10 +560,9 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_u_avx(lv_32fc_t* outVector, c cPtr += 4; } phase_Val = _mm256_normalize_ps(phase_Val); - } - - for(i = 0; i < fourthPoints%ROTATOR_RELOAD; ++i) { + + for (i = 0; i < num_points % ROTATOR_RELOAD; ++i) { aVal = _mm256_loadu_ps((float*)aPtr); z = _mm256_complexmul_ps(aVal, phase_Val); @@ -544,7 +579,7 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_u_avx(lv_32fc_t* outVector, c _mm256_storeu_ps((float*)phase_Ptr, phase_Val); (*phase) = phase_Ptr[0]; - volk_32fc_s32fc_x2_rotator_32fc_generic(cPtr, aPtr, phase_inc, phase, num_points%4); + volk_32fc_s32fc_x2_rotator_32fc_generic(cPtr, aPtr, phase_inc, phase, num_points % 4); } #endif /* LV_HAVE_AVX */ @@ -552,15 +587,21 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_u_avx(lv_32fc_t* outVector, c #if LV_HAVE_AVX && LV_HAVE_FMA #include -static inline void volk_32fc_s32fc_x2_rotator_32fc_a_avx_fma(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, lv_32fc_t* phase, unsigned int num_points){ +static inline 
void volk_32fc_s32fc_x2_rotator_32fc_a_avx_fma(lv_32fc_t* outVector, + const lv_32fc_t* inVector, + const lv_32fc_t phase_inc, + lv_32fc_t* phase, + unsigned int num_points) +{ lv_32fc_t* cPtr = outVector; const lv_32fc_t* aPtr = inVector; lv_32fc_t incr = 1; - __VOLK_ATTR_ALIGNED(32) lv_32fc_t phase_Ptr[4] = {(*phase), (*phase), (*phase), (*phase)}; + __VOLK_ATTR_ALIGNED(32) + lv_32fc_t phase_Ptr[4] = { (*phase), (*phase), (*phase), (*phase) }; unsigned int i, j = 0; - for(i = 0; i < 4; ++i) { + for (i = 0; i < 4; ++i) { phase_Ptr[i] *= incr; incr *= (phase_inc); } @@ -568,11 +609,18 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_a_avx_fma(lv_32fc_t* outVecto __m256 aVal, phase_Val, inc_Val, yl, yh, tmp1, tmp2, z, ylp, yhp, tmp1p, tmp2p; phase_Val = _mm256_load_ps((float*)phase_Ptr); - inc_Val = _mm256_set_ps(lv_cimag(incr), lv_creal(incr),lv_cimag(incr), lv_creal(incr),lv_cimag(incr), lv_creal(incr),lv_cimag(incr), lv_creal(incr)); + inc_Val = _mm256_set_ps(lv_cimag(incr), + lv_creal(incr), + lv_cimag(incr), + lv_creal(incr), + lv_cimag(incr), + lv_creal(incr), + lv_cimag(incr), + lv_creal(incr)); const unsigned int fourthPoints = num_points / 4; - for(i = 0; i < (unsigned int)(fourthPoints/ROTATOR_RELOAD); i++) { - for(j = 0; j < ROTATOR_RELOAD; ++j) { + for (i = 0; i < (unsigned int)(fourthPoints / ROTATOR_RELOAD); i++) { + for (j = 0; j < ROTATOR_RELOAD; ++j) { aVal = _mm256_load_ps((float*)aPtr); @@ -603,7 +651,7 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_a_avx_fma(lv_32fc_t* outVecto tmp2 = _mm256_sqrt_ps(tmp1); phase_Val = _mm256_div_ps(phase_Val, tmp2); } - for(i = 0; i < fourthPoints%ROTATOR_RELOAD; ++i) { + for (i = 0; i < fourthPoints % ROTATOR_RELOAD; ++i) { aVal = _mm256_load_ps((float*)aPtr); yl = _mm256_moveldup_ps(phase_Val); @@ -636,13 +684,12 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_a_avx_fma(lv_32fc_t* outVecto } _mm256_store_ps((float*)phase_Ptr, phase_Val); - for(i = 0; i < num_points%4; ++i) { + for (i = 0; i < num_points % 4; ++i) { *cPtr++ = *aPtr++ * phase_Ptr[0]; phase_Ptr[0] *= (phase_inc); } (*phase) = phase_Ptr[0]; - } #endif /* LV_HAVE_AVX && LV_HAVE_FMA for aligned*/ @@ -650,15 +697,20 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_a_avx_fma(lv_32fc_t* outVecto #if LV_HAVE_AVX && LV_HAVE_FMA #include -static inline void volk_32fc_s32fc_x2_rotator_32fc_u_avx_fma(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, lv_32fc_t* phase, unsigned int num_points){ +static inline void volk_32fc_s32fc_x2_rotator_32fc_u_avx_fma(lv_32fc_t* outVector, + const lv_32fc_t* inVector, + const lv_32fc_t phase_inc, + lv_32fc_t* phase, + unsigned int num_points) +{ lv_32fc_t* cPtr = outVector; const lv_32fc_t* aPtr = inVector; lv_32fc_t incr = 1; - lv_32fc_t phase_Ptr[4] = {(*phase), (*phase), (*phase), (*phase)}; + lv_32fc_t phase_Ptr[4] = { (*phase), (*phase), (*phase), (*phase) }; unsigned int i, j = 0; - for(i = 0; i < 4; ++i) { + for (i = 0; i < 4; ++i) { phase_Ptr[i] *= incr; incr *= (phase_inc); } @@ -666,11 +718,18 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_u_avx_fma(lv_32fc_t* outVecto __m256 aVal, phase_Val, inc_Val, yl, yh, tmp1, tmp2, z, ylp, yhp, tmp1p, tmp2p; phase_Val = _mm256_loadu_ps((float*)phase_Ptr); - inc_Val = _mm256_set_ps(lv_cimag(incr), lv_creal(incr),lv_cimag(incr), lv_creal(incr),lv_cimag(incr), lv_creal(incr),lv_cimag(incr), lv_creal(incr)); + inc_Val = _mm256_set_ps(lv_cimag(incr), + lv_creal(incr), + lv_cimag(incr), + lv_creal(incr), + lv_cimag(incr), + lv_creal(incr), + 
lv_cimag(incr), + lv_creal(incr)); const unsigned int fourthPoints = num_points / 4; - for(i = 0; i < (unsigned int)(fourthPoints/ROTATOR_RELOAD); i++) { - for(j = 0; j < ROTATOR_RELOAD; ++j) { + for (i = 0; i < (unsigned int)(fourthPoints / ROTATOR_RELOAD); i++) { + for (j = 0; j < ROTATOR_RELOAD; ++j) { aVal = _mm256_loadu_ps((float*)aPtr); @@ -701,7 +760,7 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_u_avx_fma(lv_32fc_t* outVecto tmp2 = _mm256_sqrt_ps(tmp1); phase_Val = _mm256_div_ps(phase_Val, tmp2); } - for(i = 0; i < fourthPoints%ROTATOR_RELOAD; ++i) { + for (i = 0; i < fourthPoints % ROTATOR_RELOAD; ++i) { aVal = _mm256_loadu_ps((float*)aPtr); yl = _mm256_moveldup_ps(phase_Val); @@ -734,13 +793,12 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_u_avx_fma(lv_32fc_t* outVecto } _mm256_storeu_ps((float*)phase_Ptr, phase_Val); - for(i = 0; i < num_points%4; ++i) { + for (i = 0; i < num_points % 4; ++i) { *cPtr++ = *aPtr++ * phase_Ptr[0]; phase_Ptr[0] *= (phase_inc); } (*phase) = phase_Ptr[0]; - } #endif /* LV_HAVE_AVX && LV_HAVE_FMA*/ diff --git a/kernels/volk/volk_32fc_x2_add_32fc.h b/kernels/volk/volk_32fc_x2_add_32fc.h index 90ff787..e7356c3 100644 --- a/kernels/volk/volk_32fc_x2_add_32fc.h +++ b/kernels/volk/volk_32fc_x2_add_32fc.h @@ -31,8 +31,8 @@ * * Dispatcher Prototype * \code - * void volk_32fc_x2_add_32fc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points) - * \endcode + * void volk_32fc_x2_add_32fc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const + * lv_32fc_t* bVector, unsigned int num_points) \endcode * * \b Inputs * \li aVector: First vector of input points. @@ -44,7 +44,8 @@ * * \b Example * - * The follow example adds the increasing and decreasing vectors such that the result of every summation pair is 10 + * The follow example adds the increasing and decreasing vectors such that the result of + * every summation pair is 10 * * \code * int N = 10; @@ -76,36 +77,38 @@ #ifdef LV_HAVE_AVX #include -static inline void -volk_32fc_x2_add_32fc_u_avx(lv_32fc_t* cVector, const lv_32fc_t* aVector, - const lv_32fc_t* bVector, unsigned int num_points) +static inline void volk_32fc_x2_add_32fc_u_avx(lv_32fc_t* cVector, + const lv_32fc_t* aVector, + const lv_32fc_t* bVector, + unsigned int num_points) { - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; - lv_32fc_t* cPtr = cVector; - const lv_32fc_t* aPtr = aVector; - const lv_32fc_t* bPtr= bVector; + lv_32fc_t* cPtr = cVector; + const lv_32fc_t* aPtr = aVector; + const lv_32fc_t* bPtr = bVector; - __m256 aVal, bVal, cVal; - for(;number < quarterPoints; number++){ + __m256 aVal, bVal, cVal; + for (; number < quarterPoints; number++) { - aVal = _mm256_loadu_ps((float *) aPtr); - bVal = _mm256_loadu_ps((float *) bPtr); + aVal = _mm256_loadu_ps((float*)aPtr); + bVal = _mm256_loadu_ps((float*)bPtr); - cVal = _mm256_add_ps(aVal, bVal); + cVal = _mm256_add_ps(aVal, bVal); - _mm256_storeu_ps((float *) cPtr,cVal); // Store the results back into the C container + _mm256_storeu_ps((float*)cPtr, + cVal); // Store the results back into the C container - aPtr += 4; - bPtr += 4; - cPtr += 4; - } + aPtr += 4; + bPtr += 4; + cPtr += 4; + } - number = quarterPoints * 4; - for(;number < num_points; number++){ - *cPtr++ = (*aPtr++) + (*bPtr++); - } + number = quarterPoints * 4; + for (; number < num_points; number++) { + *cPtr++ = (*aPtr++) + (*bPtr++); + } } #endif /* LV_HAVE_AVX */ 
@@ -113,36 +116,38 @@ volk_32fc_x2_add_32fc_u_avx(lv_32fc_t* cVector, const lv_32fc_t* aVector, #ifdef LV_HAVE_AVX #include -static inline void -volk_32fc_x2_add_32fc_a_avx(lv_32fc_t* cVector, const lv_32fc_t* aVector, - const lv_32fc_t* bVector, unsigned int num_points) +static inline void volk_32fc_x2_add_32fc_a_avx(lv_32fc_t* cVector, + const lv_32fc_t* aVector, + const lv_32fc_t* bVector, + unsigned int num_points) { - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; - lv_32fc_t* cPtr = cVector; - const lv_32fc_t* aPtr = aVector; - const lv_32fc_t* bPtr= bVector; + lv_32fc_t* cPtr = cVector; + const lv_32fc_t* aPtr = aVector; + const lv_32fc_t* bPtr = bVector; - __m256 aVal, bVal, cVal; - for(;number < quarterPoints; number++){ + __m256 aVal, bVal, cVal; + for (; number < quarterPoints; number++) { - aVal = _mm256_load_ps((float*) aPtr); - bVal = _mm256_load_ps((float*) bPtr); + aVal = _mm256_load_ps((float*)aPtr); + bVal = _mm256_load_ps((float*)bPtr); - cVal = _mm256_add_ps(aVal, bVal); + cVal = _mm256_add_ps(aVal, bVal); - _mm256_store_ps((float*) cPtr,cVal); // Store the results back into the C container + _mm256_store_ps((float*)cPtr, + cVal); // Store the results back into the C container - aPtr += 4; - bPtr += 4; - cPtr += 4; - } + aPtr += 4; + bPtr += 4; + cPtr += 4; + } - number = quarterPoints * 4; - for(;number < num_points; number++){ - *cPtr++ = (*aPtr++) + (*bPtr++); - } + number = quarterPoints * 4; + for (; number < num_points; number++) { + *cPtr++ = (*aPtr++) + (*bPtr++); + } } #endif /* LV_HAVE_AVX */ @@ -150,54 +155,56 @@ volk_32fc_x2_add_32fc_a_avx(lv_32fc_t* cVector, const lv_32fc_t* aVector, #ifdef LV_HAVE_SSE #include -static inline void -volk_32fc_x2_add_32fc_u_sse(lv_32fc_t* cVector, const lv_32fc_t* aVector, - const lv_32fc_t* bVector, unsigned int num_points) +static inline void volk_32fc_x2_add_32fc_u_sse(lv_32fc_t* cVector, + const lv_32fc_t* aVector, + const lv_32fc_t* bVector, + unsigned int num_points) { - unsigned int number = 0; - const unsigned int halfPoints = num_points / 2; + unsigned int number = 0; + const unsigned int halfPoints = num_points / 2; - lv_32fc_t* cPtr = cVector; - const lv_32fc_t* aPtr = aVector; - const lv_32fc_t* bPtr= bVector; + lv_32fc_t* cPtr = cVector; + const lv_32fc_t* aPtr = aVector; + const lv_32fc_t* bPtr = bVector; - __m128 aVal, bVal, cVal; - for(;number < halfPoints; number++){ + __m128 aVal, bVal, cVal; + for (; number < halfPoints; number++) { - aVal = _mm_loadu_ps((float *) aPtr); - bVal = _mm_loadu_ps((float *) bPtr); + aVal = _mm_loadu_ps((float*)aPtr); + bVal = _mm_loadu_ps((float*)bPtr); - cVal = _mm_add_ps(aVal, bVal); + cVal = _mm_add_ps(aVal, bVal); - _mm_storeu_ps((float*) cPtr, cVal); // Store the results back into the C container + _mm_storeu_ps((float*)cPtr, cVal); // Store the results back into the C container - aPtr += 2; - bPtr += 2; - cPtr += 2; - } + aPtr += 2; + bPtr += 2; + cPtr += 2; + } - number = halfPoints * 2; - for(;number < num_points; number++){ - *cPtr++ = (*aPtr++) + (*bPtr++); - } + number = halfPoints * 2; + for (; number < num_points; number++) { + *cPtr++ = (*aPtr++) + (*bPtr++); + } } #endif /* LV_HAVE_SSE */ #ifdef LV_HAVE_GENERIC -static inline void -volk_32fc_x2_add_32fc_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, - const lv_32fc_t* bVector, unsigned int num_points) +static inline void volk_32fc_x2_add_32fc_generic(lv_32fc_t* cVector, + const lv_32fc_t* aVector, 
+ const lv_32fc_t* bVector, + unsigned int num_points) { - lv_32fc_t* cPtr = cVector; - const lv_32fc_t* aPtr = aVector; - const lv_32fc_t* bPtr= bVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - *cPtr++ = (*aPtr++) + (*bPtr++); - } + lv_32fc_t* cPtr = cVector; + const lv_32fc_t* aPtr = aVector; + const lv_32fc_t* bPtr = bVector; + unsigned int number = 0; + + for (number = 0; number < num_points; number++) { + *cPtr++ = (*aPtr++) + (*bPtr++); + } } #endif /* LV_HAVE_GENERIC */ @@ -205,34 +212,36 @@ volk_32fc_x2_add_32fc_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, #ifdef LV_HAVE_SSE #include -static inline void -volk_32fc_x2_add_32fc_a_sse(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points) +static inline void volk_32fc_x2_add_32fc_a_sse(lv_32fc_t* cVector, + const lv_32fc_t* aVector, + const lv_32fc_t* bVector, + unsigned int num_points) { - unsigned int number = 0; - const unsigned int halfPoints = num_points / 2; + unsigned int number = 0; + const unsigned int halfPoints = num_points / 2; - lv_32fc_t* cPtr = cVector; - const lv_32fc_t* aPtr = aVector; - const lv_32fc_t* bPtr= bVector; + lv_32fc_t* cPtr = cVector; + const lv_32fc_t* aPtr = aVector; + const lv_32fc_t* bPtr = bVector; - __m128 aVal, bVal, cVal; - for(;number < halfPoints; number++){ - aVal = _mm_load_ps((float *) aPtr); - bVal = _mm_load_ps((float *) bPtr); + __m128 aVal, bVal, cVal; + for (; number < halfPoints; number++) { + aVal = _mm_load_ps((float*)aPtr); + bVal = _mm_load_ps((float*)bPtr); - cVal = _mm_add_ps(aVal, bVal); + cVal = _mm_add_ps(aVal, bVal); - _mm_store_ps((float *) cPtr,cVal); // Store the results back into the C container + _mm_store_ps((float*)cPtr, cVal); // Store the results back into the C container - aPtr += 2; - bPtr += 2; - cPtr += 2; - } + aPtr += 2; + bPtr += 2; + cPtr += 2; + } - number = halfPoints * 2; - for(;number < num_points; number++){ - *cPtr++ = (*aPtr++) + (*bPtr++); - } + number = halfPoints * 2; + for (; number < num_points; number++) { + *cPtr++ = (*aPtr++) + (*bPtr++); + } } #endif /* LV_HAVE_SSE */ @@ -240,38 +249,39 @@ volk_32fc_x2_add_32fc_a_sse(lv_32fc_t* cVector, const lv_32fc_t* aVector, const #ifdef LV_HAVE_NEON #include -static inline void -volk_32fc_x2_add_32fc_u_neon(lv_32fc_t* cVector, const lv_32fc_t* aVector, - const lv_32fc_t* bVector, unsigned int num_points) +static inline void volk_32fc_x2_add_32fc_u_neon(lv_32fc_t* cVector, + const lv_32fc_t* aVector, + const lv_32fc_t* bVector, + unsigned int num_points) { - unsigned int number = 0; - const unsigned int halfPoints = num_points / 2; - - lv_32fc_t* cPtr = cVector; - const lv_32fc_t* aPtr = aVector; - const lv_32fc_t* bPtr= bVector; - float32x4_t aVal, bVal, cVal; - for(number=0; number < halfPoints; number++){ - // Load in to NEON registers - aVal = vld1q_f32((const float32_t*)(aPtr)); - bVal = vld1q_f32((const float32_t*)(bPtr)); - __VOLK_PREFETCH(aPtr+2); - __VOLK_PREFETCH(bPtr+2); - - // vector add - cVal = vaddq_f32(aVal, bVal); - // Store the results back into the C container - vst1q_f32((float*)(cPtr),cVal); - - aPtr += 2; // q uses quadwords, 4 lv_32fc_ts per vadd - bPtr += 2; - cPtr += 2; - } - - number = halfPoints * 2; // should be = num_points - for(;number < num_points; number++){ - *cPtr++ = (*aPtr++) + (*bPtr++); - } + unsigned int number = 0; + const unsigned int halfPoints = num_points / 2; + + lv_32fc_t* cPtr = cVector; + const lv_32fc_t* aPtr = aVector; + const lv_32fc_t* bPtr = bVector; + float32x4_t 
aVal, bVal, cVal; + for (number = 0; number < halfPoints; number++) { + // Load in to NEON registers + aVal = vld1q_f32((const float32_t*)(aPtr)); + bVal = vld1q_f32((const float32_t*)(bPtr)); + __VOLK_PREFETCH(aPtr + 2); + __VOLK_PREFETCH(bPtr + 2); + + // vector add + cVal = vaddq_f32(aVal, bVal); + // Store the results back into the C container + vst1q_f32((float*)(cPtr), cVal); + + aPtr += 2; // q uses quadwords, 4 lv_32fc_ts per vadd + bPtr += 2; + cPtr += 2; + } + + number = halfPoints * 2; // should be = num_points + for (; number < num_points; number++) { + *cPtr++ = (*aPtr++) + (*bPtr++); + } } #endif /* LV_HAVE_NEON */ diff --git a/kernels/volk/volk_32fc_x2_conjugate_dot_prod_32fc.h b/kernels/volk/volk_32fc_x2_conjugate_dot_prod_32fc.h index 77432ec..0f69499 100644 --- a/kernels/volk/volk_32fc_x2_conjugate_dot_prod_32fc.h +++ b/kernels/volk/volk_32fc_x2_conjugate_dot_prod_32fc.h @@ -34,8 +34,8 @@ * * Dispatcher Prototype * \code - * void volk_32fc_x2_conjugate_dot_prod_32fc(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) - * \endcode + * void volk_32fc_x2_conjugate_dot_prod_32fc(lv_32fc_t* result, const lv_32fc_t* input, + * const lv_32fc_t* taps, unsigned int num_points) \endcode * * \b Inputs * \li input: vector of complex floats. @@ -60,40 +60,44 @@ #define INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_u_H -#include +#include #ifdef LV_HAVE_GENERIC -static inline void volk_32fc_x2_conjugate_dot_prod_32fc_generic(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) { +static inline void volk_32fc_x2_conjugate_dot_prod_32fc_generic(lv_32fc_t* result, + const lv_32fc_t* input, + const lv_32fc_t* taps, + unsigned int num_points) +{ - const unsigned int num_bytes = num_points*8; + const unsigned int num_bytes = num_points * 8; - float * res = (float*) result; - float * in = (float*) input; - float * tp = (float*) taps; - unsigned int n_2_ccomplex_blocks = num_bytes >> 4; + float* res = (float*)result; + float* in = (float*)input; + float* tp = (float*)taps; + unsigned int n_2_ccomplex_blocks = num_bytes >> 4; - float sum0[2] = {0,0}; - float sum1[2] = {0,0}; - unsigned int i = 0; + float sum0[2] = { 0, 0 }; + float sum1[2] = { 0, 0 }; + unsigned int i = 0; - for(i = 0; i < n_2_ccomplex_blocks; ++i) { - sum0[0] += in[0] * tp[0] + in[1] * tp[1]; - sum0[1] += (-in[0] * tp[1]) + in[1] * tp[0]; - sum1[0] += in[2] * tp[2] + in[3] * tp[3]; - sum1[1] += (-in[2] * tp[3]) + in[3] * tp[2]; + for (i = 0; i < n_2_ccomplex_blocks; ++i) { + sum0[0] += in[0] * tp[0] + in[1] * tp[1]; + sum0[1] += (-in[0] * tp[1]) + in[1] * tp[0]; + sum1[0] += in[2] * tp[2] + in[3] * tp[3]; + sum1[1] += (-in[2] * tp[3]) + in[3] * tp[2]; - in += 4; - tp += 4; - } + in += 4; + tp += 4; + } - res[0] = sum0[0] + sum1[0]; - res[1] = sum0[1] + sum1[1]; + res[0] = sum0[0] + sum1[0]; + res[1] = sum0[1] + sum1[1]; - if (num_bytes >> 3 & 1) { - *result += input[(num_bytes >> 3) - 1] * lv_conj(taps[(num_bytes >> 3) - 1]); - } + if (num_bytes >> 3 & 1) { + *result += input[(num_bytes >> 3) - 1] * lv_conj(taps[(num_bytes >> 3) - 1]); + } } #endif /*LV_HAVE_GENERIC*/ @@ -103,125 +107,134 @@ static inline void volk_32fc_x2_conjugate_dot_prod_32fc_generic(lv_32fc_t* resul #include static inline void volk_32fc_x2_conjugate_dot_prod_32fc_u_avx(lv_32fc_t* result, - const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) + const lv_32fc_t* input, + const lv_32fc_t* taps, + unsigned int num_points) { - // Partial sums for indices i, i+1, i+2 
and i+3. - __m256 sum_a_mult_b_real = _mm256_setzero_ps(); - __m256 sum_a_mult_b_imag = _mm256_setzero_ps(); - - for (long unsigned i = 0; i < (num_points & ~3u); i += 4) { - /* Four complex elements a time are processed. - * (ar + j⋅ai)*conj(br + j⋅bi) = - * ar⋅br + ai⋅bi + j⋅(ai⋅br − ar⋅bi) - */ + // Partial sums for indices i, i+1, i+2 and i+3. + __m256 sum_a_mult_b_real = _mm256_setzero_ps(); + __m256 sum_a_mult_b_imag = _mm256_setzero_ps(); + + for (long unsigned i = 0; i < (num_points & ~3u); i += 4) { + /* Four complex elements a time are processed. + * (ar + j⋅ai)*conj(br + j⋅bi) = + * ar⋅br + ai⋅bi + j⋅(ai⋅br − ar⋅bi) + */ + + /* Load input and taps, split and duplicate real und imaginary parts of taps. + * a: | ai,i+3 | ar,i+3 | … | ai,i+1 | ar,i+1 | ai,i+0 | ar,i+0 | + * b: | bi,i+3 | br,i+3 | … | bi,i+1 | br,i+1 | bi,i+0 | br,i+0 | + * b_real: | br,i+3 | br,i+3 | … | br,i+1 | br,i+1 | br,i+0 | br,i+0 | + * b_imag: | bi,i+3 | bi,i+3 | … | bi,i+1 | bi,i+1 | bi,i+0 | bi,i+0 | + */ + __m256 a = _mm256_loadu_ps((const float*)&input[i]); + __m256 b = _mm256_loadu_ps((const float*)&taps[i]); + __m256 b_real = _mm256_moveldup_ps(b); + __m256 b_imag = _mm256_movehdup_ps(b); + + // Add | ai⋅br,i+3 | ar⋅br,i+3 | … | ai⋅br,i+0 | ar⋅br,i+0 | to partial sum. + sum_a_mult_b_real = _mm256_add_ps(sum_a_mult_b_real, _mm256_mul_ps(a, b_real)); + // Add | ai⋅bi,i+3 | −ar⋅bi,i+3 | … | ai⋅bi,i+0 | −ar⋅bi,i+0 | to partial sum. + sum_a_mult_b_imag = _mm256_addsub_ps(sum_a_mult_b_imag, _mm256_mul_ps(a, b_imag)); + } - /* Load input and taps, split and duplicate real und imaginary parts of taps. - * a: | ai,i+3 | ar,i+3 | … | ai,i+1 | ar,i+1 | ai,i+0 | ar,i+0 | - * b: | bi,i+3 | br,i+3 | … | bi,i+1 | br,i+1 | bi,i+0 | br,i+0 | - * b_real: | br,i+3 | br,i+3 | … | br,i+1 | br,i+1 | br,i+0 | br,i+0 | - * b_imag: | bi,i+3 | bi,i+3 | … | bi,i+1 | bi,i+1 | bi,i+0 | bi,i+0 | + // Swap position of −ar⋅bi and ai⋅bi. + sum_a_mult_b_imag = _mm256_permute_ps(sum_a_mult_b_imag, _MM_SHUFFLE(2, 3, 0, 1)); + // | ai⋅br + ai⋅bi | ai⋅br − ar⋅bi |, sum contains four such partial sums. + __m256 sum = _mm256_add_ps(sum_a_mult_b_real, sum_a_mult_b_imag); + /* Sum the four partial sums: Add high half of vector sum to the low one, i.e. + * s1 + s3 and s0 + s2 … */ - __m256 a = _mm256_loadu_ps((const float *) &input[i]); - __m256 b = _mm256_loadu_ps((const float *) &taps[i]); - __m256 b_real = _mm256_moveldup_ps(b); - __m256 b_imag = _mm256_movehdup_ps(b); - - // Add | ai⋅br,i+3 | ar⋅br,i+3 | … | ai⋅br,i+0 | ar⋅br,i+0 | to partial sum. - sum_a_mult_b_real = _mm256_add_ps(sum_a_mult_b_real, _mm256_mul_ps(a, b_real)); - // Add | ai⋅bi,i+3 | −ar⋅bi,i+3 | … | ai⋅bi,i+0 | −ar⋅bi,i+0 | to partial sum. - sum_a_mult_b_imag = _mm256_addsub_ps(sum_a_mult_b_imag, _mm256_mul_ps(a, b_imag)); - } - - // Swap position of −ar⋅bi and ai⋅bi. - sum_a_mult_b_imag = _mm256_permute_ps(sum_a_mult_b_imag, _MM_SHUFFLE(2, 3, 0, 1)); - // | ai⋅br + ai⋅bi | ai⋅br − ar⋅bi |, sum contains four such partial sums. - __m256 sum = _mm256_add_ps(sum_a_mult_b_real, sum_a_mult_b_imag); - /* Sum the four partial sums: Add high half of vector sum to the low one, i.e. - * s1 + s3 and s0 + s2 … - */ - sum = _mm256_add_ps(sum, _mm256_permute2f128_ps(sum, sum, 0x01)); - // … and now (s0 + s2) + (s1 + s3) - sum = _mm256_add_ps(sum, _mm256_permute_ps(sum, _MM_SHUFFLE(1, 0, 3, 2))); - // Store result. - __m128 lower = _mm256_extractf128_ps(sum, 0); - _mm_storel_pi((__m64 *) result, lower); - - // Handle the last elements if num_points mod 4 is bigger than 0. 
- for (long unsigned i = num_points & ~3u; i < num_points; ++i) { - *result += lv_cmake( - lv_creal(input[i]) * lv_creal(taps[i]) + lv_cimag(input[i]) * lv_cimag(taps[i]), - lv_cimag(input[i]) * lv_creal(taps[i]) - lv_creal(input[i]) * lv_cimag(taps[i])); - } + sum = _mm256_add_ps(sum, _mm256_permute2f128_ps(sum, sum, 0x01)); + // … and now (s0 + s2) + (s1 + s3) + sum = _mm256_add_ps(sum, _mm256_permute_ps(sum, _MM_SHUFFLE(1, 0, 3, 2))); + // Store result. + __m128 lower = _mm256_extractf128_ps(sum, 0); + _mm_storel_pi((__m64*)result, lower); + + // Handle the last elements if num_points mod 4 is bigger than 0. + for (long unsigned i = num_points & ~3u; i < num_points; ++i) { + *result += lv_cmake(lv_creal(input[i]) * lv_creal(taps[i]) + + lv_cimag(input[i]) * lv_cimag(taps[i]), + lv_cimag(input[i]) * lv_creal(taps[i]) - + lv_creal(input[i]) * lv_cimag(taps[i])); + } } #endif /* LV_HAVE_AVX */ #ifdef LV_HAVE_SSE3 -#include #include +#include static inline void volk_32fc_x2_conjugate_dot_prod_32fc_u_sse3(lv_32fc_t* result, - const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) + const lv_32fc_t* input, + const lv_32fc_t* taps, + unsigned int num_points) { - // Partial sums for indices i and i+1. - __m128 sum_a_mult_b_real = _mm_setzero_ps(); - __m128 sum_a_mult_b_imag = _mm_setzero_ps(); - - for (long unsigned i = 0; i < (num_points & ~1u); i += 2) { - /* Two complex elements a time are processed. - * (ar + j⋅ai)*conj(br + j⋅bi) = - * ar⋅br + ai⋅bi + j⋅(ai⋅br − ar⋅bi) - */ + // Partial sums for indices i and i+1. + __m128 sum_a_mult_b_real = _mm_setzero_ps(); + __m128 sum_a_mult_b_imag = _mm_setzero_ps(); + + for (long unsigned i = 0; i < (num_points & ~1u); i += 2) { + /* Two complex elements a time are processed. + * (ar + j⋅ai)*conj(br + j⋅bi) = + * ar⋅br + ai⋅bi + j⋅(ai⋅br − ar⋅bi) + */ + + /* Load input and taps, split and duplicate real und imaginary parts of taps. + * a: | ai,i+1 | ar,i+1 | ai,i+0 | ar,i+0 | + * b: | bi,i+1 | br,i+1 | bi,i+0 | br,i+0 | + * b_real: | br,i+1 | br,i+1 | br,i+0 | br,i+0 | + * b_imag: | bi,i+1 | bi,i+1 | bi,i+0 | bi,i+0 | + */ + __m128 a = _mm_loadu_ps((const float*)&input[i]); + __m128 b = _mm_loadu_ps((const float*)&taps[i]); + __m128 b_real = _mm_moveldup_ps(b); + __m128 b_imag = _mm_movehdup_ps(b); + + // Add | ai⋅br,i+1 | ar⋅br,i+1 | ai⋅br,i+0 | ar⋅br,i+0 | to partial sum. + sum_a_mult_b_real = _mm_add_ps(sum_a_mult_b_real, _mm_mul_ps(a, b_real)); + // Add | ai⋅bi,i+1 | −ar⋅bi,i+1 | ai⋅bi,i+0 | −ar⋅bi,i+0 | to partial sum. + sum_a_mult_b_imag = _mm_addsub_ps(sum_a_mult_b_imag, _mm_mul_ps(a, b_imag)); + } - /* Load input and taps, split and duplicate real und imaginary parts of taps. - * a: | ai,i+1 | ar,i+1 | ai,i+0 | ar,i+0 | - * b: | bi,i+1 | br,i+1 | bi,i+0 | br,i+0 | - * b_real: | br,i+1 | br,i+1 | br,i+0 | br,i+0 | - * b_imag: | bi,i+1 | bi,i+1 | bi,i+0 | bi,i+0 | - */ - __m128 a = _mm_loadu_ps((const float *) &input[i]); - __m128 b = _mm_loadu_ps((const float *) &taps[i]); - __m128 b_real = _mm_moveldup_ps(b); - __m128 b_imag = _mm_movehdup_ps(b); - - // Add | ai⋅br,i+1 | ar⋅br,i+1 | ai⋅br,i+0 | ar⋅br,i+0 | to partial sum. - sum_a_mult_b_real = _mm_add_ps(sum_a_mult_b_real, _mm_mul_ps(a, b_real)); - // Add | ai⋅bi,i+1 | −ar⋅bi,i+1 | ai⋅bi,i+0 | −ar⋅bi,i+0 | to partial sum. - sum_a_mult_b_imag = _mm_addsub_ps(sum_a_mult_b_imag, _mm_mul_ps(a, b_imag)); - } - - // Swap position of −ar⋅bi and ai⋅bi. 
- sum_a_mult_b_imag = _mm_shuffle_ps(sum_a_mult_b_imag, sum_a_mult_b_imag, - _MM_SHUFFLE(2, 3, 0, 1)); - // | ai⋅br + ai⋅bi | ai⋅br − ar⋅bi |, sum contains two such partial sums. - __m128 sum = _mm_add_ps(sum_a_mult_b_real, sum_a_mult_b_imag); - // Sum the two partial sums. - sum = _mm_add_ps(sum, _mm_shuffle_ps(sum, sum, _MM_SHUFFLE(1, 0, 3, 2))); - // Store result. - _mm_storel_pi((__m64 *) result, sum); - - // Handle the last element if num_points mod 2 is 1. - if (num_points & 1u) { - *result += lv_cmake( - lv_creal(input[num_points - 1]) * lv_creal(taps[num_points - 1]) + - lv_cimag(input[num_points - 1]) * lv_cimag(taps[num_points - 1]), - lv_cimag(input[num_points - 1]) * lv_creal(taps[num_points - 1]) - - lv_creal(input[num_points - 1]) * lv_cimag(taps[num_points - 1])); - } + // Swap position of −ar⋅bi and ai⋅bi. + sum_a_mult_b_imag = + _mm_shuffle_ps(sum_a_mult_b_imag, sum_a_mult_b_imag, _MM_SHUFFLE(2, 3, 0, 1)); + // | ai⋅br + ai⋅bi | ai⋅br − ar⋅bi |, sum contains two such partial sums. + __m128 sum = _mm_add_ps(sum_a_mult_b_real, sum_a_mult_b_imag); + // Sum the two partial sums. + sum = _mm_add_ps(sum, _mm_shuffle_ps(sum, sum, _MM_SHUFFLE(1, 0, 3, 2))); + // Store result. + _mm_storel_pi((__m64*)result, sum); + + // Handle the last element if num_points mod 2 is 1. + if (num_points & 1u) { + *result += lv_cmake( + lv_creal(input[num_points - 1]) * lv_creal(taps[num_points - 1]) + + lv_cimag(input[num_points - 1]) * lv_cimag(taps[num_points - 1]), + lv_cimag(input[num_points - 1]) * lv_creal(taps[num_points - 1]) - + lv_creal(input[num_points - 1]) * lv_cimag(taps[num_points - 1])); + } } #endif /*LV_HAVE_SSE3*/ #ifdef LV_HAVE_NEON #include -static inline void volk_32fc_x2_conjugate_dot_prod_32fc_neon(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) { +static inline void volk_32fc_x2_conjugate_dot_prod_32fc_neon(lv_32fc_t* result, + const lv_32fc_t* input, + const lv_32fc_t* taps, + unsigned int num_points) +{ unsigned int quarter_points = num_points / 4; unsigned int number; - lv_32fc_t* a_ptr = (lv_32fc_t*) taps; - lv_32fc_t* b_ptr = (lv_32fc_t*) input; + lv_32fc_t* a_ptr = (lv_32fc_t*)taps; + lv_32fc_t* b_ptr = (lv_32fc_t*)input; // for 2-lane vectors, 1st lane holds the real part, // 2nd lane holds the imaginary part float32x4x2_t a_val, b_val, accumulator; @@ -229,11 +242,11 @@ static inline void volk_32fc_x2_conjugate_dot_prod_32fc_neon(lv_32fc_t* result, accumulator.val[0] = vdupq_n_f32(0); accumulator.val[1] = vdupq_n_f32(0); - for(number = 0; number < quarter_points; ++number) { + for (number = 0; number < quarter_points; ++number) { a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i - __VOLK_PREFETCH(a_ptr+8); - __VOLK_PREFETCH(b_ptr+8); + __VOLK_PREFETCH(a_ptr + 8); + __VOLK_PREFETCH(b_ptr + 8); // do the first multiply tmp_imag.val[1] = vmulq_f32(a_val.val[1], b_val.val[0]); @@ -255,11 +268,10 @@ static inline void volk_32fc_x2_conjugate_dot_prod_32fc_neon(lv_32fc_t* result, *result = accum_result[0] + accum_result[1] + accum_result[2] + accum_result[3]; // tail case - for(number = quarter_points*4; number < num_points; ++number) { - *result += (*a_ptr++) * lv_conj(*b_ptr++); + for (number = quarter_points * 4; number < num_points; ++number) { + *result += (*a_ptr++) * lv_conj(*b_ptr++); } *result = lv_conj(*result); - } #endif /*LV_HAVE_NEON*/ @@ -268,120 +280,125 @@ static inline void 
volk_32fc_x2_conjugate_dot_prod_32fc_neon(lv_32fc_t* result, #ifndef INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_a_H #define INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_a_H +#include #include -#include -#include +#include #ifdef LV_HAVE_AVX #include static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a_avx(lv_32fc_t* result, - const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) + const lv_32fc_t* input, + const lv_32fc_t* taps, + unsigned int num_points) { - // Partial sums for indices i, i+1, i+2 and i+3. - __m256 sum_a_mult_b_real = _mm256_setzero_ps(); - __m256 sum_a_mult_b_imag = _mm256_setzero_ps(); - - for (long unsigned i = 0; i < (num_points & ~3u); i += 4) { - /* Four complex elements a time are processed. - * (ar + j⋅ai)*conj(br + j⋅bi) = - * ar⋅br + ai⋅bi + j⋅(ai⋅br − ar⋅bi) - */ + // Partial sums for indices i, i+1, i+2 and i+3. + __m256 sum_a_mult_b_real = _mm256_setzero_ps(); + __m256 sum_a_mult_b_imag = _mm256_setzero_ps(); + + for (long unsigned i = 0; i < (num_points & ~3u); i += 4) { + /* Four complex elements a time are processed. + * (ar + j⋅ai)*conj(br + j⋅bi) = + * ar⋅br + ai⋅bi + j⋅(ai⋅br − ar⋅bi) + */ + + /* Load input and taps, split and duplicate real und imaginary parts of taps. + * a: | ai,i+3 | ar,i+3 | … | ai,i+1 | ar,i+1 | ai,i+0 | ar,i+0 | + * b: | bi,i+3 | br,i+3 | … | bi,i+1 | br,i+1 | bi,i+0 | br,i+0 | + * b_real: | br,i+3 | br,i+3 | … | br,i+1 | br,i+1 | br,i+0 | br,i+0 | + * b_imag: | bi,i+3 | bi,i+3 | … | bi,i+1 | bi,i+1 | bi,i+0 | bi,i+0 | + */ + __m256 a = _mm256_load_ps((const float*)&input[i]); + __m256 b = _mm256_load_ps((const float*)&taps[i]); + __m256 b_real = _mm256_moveldup_ps(b); + __m256 b_imag = _mm256_movehdup_ps(b); + + // Add | ai⋅br,i+3 | ar⋅br,i+3 | … | ai⋅br,i+0 | ar⋅br,i+0 | to partial sum. + sum_a_mult_b_real = _mm256_add_ps(sum_a_mult_b_real, _mm256_mul_ps(a, b_real)); + // Add | ai⋅bi,i+3 | −ar⋅bi,i+3 | … | ai⋅bi,i+0 | −ar⋅bi,i+0 | to partial sum. + sum_a_mult_b_imag = _mm256_addsub_ps(sum_a_mult_b_imag, _mm256_mul_ps(a, b_imag)); + } - /* Load input and taps, split and duplicate real und imaginary parts of taps. - * a: | ai,i+3 | ar,i+3 | … | ai,i+1 | ar,i+1 | ai,i+0 | ar,i+0 | - * b: | bi,i+3 | br,i+3 | … | bi,i+1 | br,i+1 | bi,i+0 | br,i+0 | - * b_real: | br,i+3 | br,i+3 | … | br,i+1 | br,i+1 | br,i+0 | br,i+0 | - * b_imag: | bi,i+3 | bi,i+3 | … | bi,i+1 | bi,i+1 | bi,i+0 | bi,i+0 | + // Swap position of −ar⋅bi and ai⋅bi. + sum_a_mult_b_imag = _mm256_permute_ps(sum_a_mult_b_imag, _MM_SHUFFLE(2, 3, 0, 1)); + // | ai⋅br + ai⋅bi | ai⋅br − ar⋅bi |, sum contains four such partial sums. + __m256 sum = _mm256_add_ps(sum_a_mult_b_real, sum_a_mult_b_imag); + /* Sum the four partial sums: Add high half of vector sum to the low one, i.e. + * s1 + s3 and s0 + s2 … */ - __m256 a = _mm256_load_ps((const float *) &input[i]); - __m256 b = _mm256_load_ps((const float *) &taps[i]); - __m256 b_real = _mm256_moveldup_ps(b); - __m256 b_imag = _mm256_movehdup_ps(b); - - // Add | ai⋅br,i+3 | ar⋅br,i+3 | … | ai⋅br,i+0 | ar⋅br,i+0 | to partial sum. - sum_a_mult_b_real = _mm256_add_ps(sum_a_mult_b_real, _mm256_mul_ps(a, b_real)); - // Add | ai⋅bi,i+3 | −ar⋅bi,i+3 | … | ai⋅bi,i+0 | −ar⋅bi,i+0 | to partial sum. - sum_a_mult_b_imag = _mm256_addsub_ps(sum_a_mult_b_imag, _mm256_mul_ps(a, b_imag)); - } - - // Swap position of −ar⋅bi and ai⋅bi. - sum_a_mult_b_imag = _mm256_permute_ps(sum_a_mult_b_imag, _MM_SHUFFLE(2, 3, 0, 1)); - // | ai⋅br + ai⋅bi | ai⋅br − ar⋅bi |, sum contains four such partial sums. 
- __m256 sum = _mm256_add_ps(sum_a_mult_b_real, sum_a_mult_b_imag); - /* Sum the four partial sums: Add high half of vector sum to the low one, i.e. - * s1 + s3 and s0 + s2 … - */ - sum = _mm256_add_ps(sum, _mm256_permute2f128_ps(sum, sum, 0x01)); - // … and now (s0 + s2) + (s1 + s3) - sum = _mm256_add_ps(sum, _mm256_permute_ps(sum, _MM_SHUFFLE(1, 0, 3, 2))); - // Store result. - __m128 lower = _mm256_extractf128_ps(sum, 0); - _mm_storel_pi((__m64 *) result, lower); - - // Handle the last elements if num_points mod 4 is bigger than 0. - for (long unsigned i = num_points & ~3u; i < num_points; ++i) { - *result += lv_cmake( - lv_creal(input[i]) * lv_creal(taps[i]) + lv_cimag(input[i]) * lv_cimag(taps[i]), - lv_cimag(input[i]) * lv_creal(taps[i]) - lv_creal(input[i]) * lv_cimag(taps[i])); - } + sum = _mm256_add_ps(sum, _mm256_permute2f128_ps(sum, sum, 0x01)); + // … and now (s0 + s2) + (s1 + s3) + sum = _mm256_add_ps(sum, _mm256_permute_ps(sum, _MM_SHUFFLE(1, 0, 3, 2))); + // Store result. + __m128 lower = _mm256_extractf128_ps(sum, 0); + _mm_storel_pi((__m64*)result, lower); + + // Handle the last elements if num_points mod 4 is bigger than 0. + for (long unsigned i = num_points & ~3u; i < num_points; ++i) { + *result += lv_cmake(lv_creal(input[i]) * lv_creal(taps[i]) + + lv_cimag(input[i]) * lv_cimag(taps[i]), + lv_cimag(input[i]) * lv_creal(taps[i]) - + lv_creal(input[i]) * lv_cimag(taps[i])); + } } #endif /* LV_HAVE_AVX */ #ifdef LV_HAVE_SSE3 -#include #include +#include static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a_sse3(lv_32fc_t* result, - const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) + const lv_32fc_t* input, + const lv_32fc_t* taps, + unsigned int num_points) { - // Partial sums for indices i and i+1. - __m128 sum_a_mult_b_real = _mm_setzero_ps(); - __m128 sum_a_mult_b_imag = _mm_setzero_ps(); - - for (long unsigned i = 0; i < (num_points & ~1u); i += 2) { - /* Two complex elements a time are processed. - * (ar + j⋅ai)*conj(br + j⋅bi) = - * ar⋅br + ai⋅bi + j⋅(ai⋅br − ar⋅bi) - */ + // Partial sums for indices i and i+1. + __m128 sum_a_mult_b_real = _mm_setzero_ps(); + __m128 sum_a_mult_b_imag = _mm_setzero_ps(); + + for (long unsigned i = 0; i < (num_points & ~1u); i += 2) { + /* Two complex elements a time are processed. + * (ar + j⋅ai)*conj(br + j⋅bi) = + * ar⋅br + ai⋅bi + j⋅(ai⋅br − ar⋅bi) + */ + + /* Load input and taps, split and duplicate real und imaginary parts of taps. + * a: | ai,i+1 | ar,i+1 | ai,i+0 | ar,i+0 | + * b: | bi,i+1 | br,i+1 | bi,i+0 | br,i+0 | + * b_real: | br,i+1 | br,i+1 | br,i+0 | br,i+0 | + * b_imag: | bi,i+1 | bi,i+1 | bi,i+0 | bi,i+0 | + */ + __m128 a = _mm_load_ps((const float*)&input[i]); + __m128 b = _mm_load_ps((const float*)&taps[i]); + __m128 b_real = _mm_moveldup_ps(b); + __m128 b_imag = _mm_movehdup_ps(b); + + // Add | ai⋅br,i+1 | ar⋅br,i+1 | ai⋅br,i+0 | ar⋅br,i+0 | to partial sum. + sum_a_mult_b_real = _mm_add_ps(sum_a_mult_b_real, _mm_mul_ps(a, b_real)); + // Add | ai⋅bi,i+1 | −ar⋅bi,i+1 | ai⋅bi,i+0 | −ar⋅bi,i+0 | to partial sum. + sum_a_mult_b_imag = _mm_addsub_ps(sum_a_mult_b_imag, _mm_mul_ps(a, b_imag)); + } - /* Load input and taps, split and duplicate real und imaginary parts of taps. 
- * a: | ai,i+1 | ar,i+1 | ai,i+0 | ar,i+0 | - * b: | bi,i+1 | br,i+1 | bi,i+0 | br,i+0 | - * b_real: | br,i+1 | br,i+1 | br,i+0 | br,i+0 | - * b_imag: | bi,i+1 | bi,i+1 | bi,i+0 | bi,i+0 | - */ - __m128 a = _mm_load_ps((const float *) &input[i]); - __m128 b = _mm_load_ps((const float *) &taps[i]); - __m128 b_real = _mm_moveldup_ps(b); - __m128 b_imag = _mm_movehdup_ps(b); - - // Add | ai⋅br,i+1 | ar⋅br,i+1 | ai⋅br,i+0 | ar⋅br,i+0 | to partial sum. - sum_a_mult_b_real = _mm_add_ps(sum_a_mult_b_real, _mm_mul_ps(a, b_real)); - // Add | ai⋅bi,i+1 | −ar⋅bi,i+1 | ai⋅bi,i+0 | −ar⋅bi,i+0 | to partial sum. - sum_a_mult_b_imag = _mm_addsub_ps(sum_a_mult_b_imag, _mm_mul_ps(a, b_imag)); - } - - // Swap position of −ar⋅bi and ai⋅bi. - sum_a_mult_b_imag = _mm_shuffle_ps(sum_a_mult_b_imag, sum_a_mult_b_imag, - _MM_SHUFFLE(2, 3, 0, 1)); - // | ai⋅br + ai⋅bi | ai⋅br − ar⋅bi |, sum contains two such partial sums. - __m128 sum = _mm_add_ps(sum_a_mult_b_real, sum_a_mult_b_imag); - // Sum the two partial sums. - sum = _mm_add_ps(sum, _mm_shuffle_ps(sum, sum, _MM_SHUFFLE(1, 0, 3, 2))); - // Store result. - _mm_storel_pi((__m64 *) result, sum); - - // Handle the last element if num_points mod 2 is 1. - if (num_points & 1u) { - *result += lv_cmake( - lv_creal(input[num_points - 1]) * lv_creal(taps[num_points - 1]) + - lv_cimag(input[num_points - 1]) * lv_cimag(taps[num_points - 1]), - lv_cimag(input[num_points - 1]) * lv_creal(taps[num_points - 1]) - - lv_creal(input[num_points - 1]) * lv_cimag(taps[num_points - 1])); - } + // Swap position of −ar⋅bi and ai⋅bi. + sum_a_mult_b_imag = + _mm_shuffle_ps(sum_a_mult_b_imag, sum_a_mult_b_imag, _MM_SHUFFLE(2, 3, 0, 1)); + // | ai⋅br + ai⋅bi | ai⋅br − ar⋅bi |, sum contains two such partial sums. + __m128 sum = _mm_add_ps(sum_a_mult_b_real, sum_a_mult_b_imag); + // Sum the two partial sums. + sum = _mm_add_ps(sum, _mm_shuffle_ps(sum, sum, _MM_SHUFFLE(1, 0, 3, 2))); + // Store result. + _mm_storel_pi((__m64*)result, sum); + + // Handle the last element if num_points mod 2 is 1. 
+ if (num_points & 1u) { + *result += lv_cmake( + lv_creal(input[num_points - 1]) * lv_creal(taps[num_points - 1]) + + lv_cimag(input[num_points - 1]) * lv_cimag(taps[num_points - 1]), + lv_cimag(input[num_points - 1]) * lv_creal(taps[num_points - 1]) - + lv_creal(input[num_points - 1]) * lv_cimag(taps[num_points - 1])); + } } #endif /*LV_HAVE_SSE3*/ @@ -390,35 +407,39 @@ static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a_sse3(lv_32fc_t* result #ifdef LV_HAVE_GENERIC -static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a_generic(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) { +static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a_generic(lv_32fc_t* result, + const lv_32fc_t* input, + const lv_32fc_t* taps, + unsigned int num_points) +{ - const unsigned int num_bytes = num_points*8; + const unsigned int num_bytes = num_points * 8; - float * res = (float*) result; - float * in = (float*) input; - float * tp = (float*) taps; - unsigned int n_2_ccomplex_blocks = num_bytes >> 4; + float* res = (float*)result; + float* in = (float*)input; + float* tp = (float*)taps; + unsigned int n_2_ccomplex_blocks = num_bytes >> 4; - float sum0[2] = {0,0}; - float sum1[2] = {0,0}; - unsigned int i = 0; + float sum0[2] = { 0, 0 }; + float sum1[2] = { 0, 0 }; + unsigned int i = 0; - for(i = 0; i < n_2_ccomplex_blocks; ++i) { - sum0[0] += in[0] * tp[0] + in[1] * tp[1]; - sum0[1] += (-in[0] * tp[1]) + in[1] * tp[0]; - sum1[0] += in[2] * tp[2] + in[3] * tp[3]; - sum1[1] += (-in[2] * tp[3]) + in[3] * tp[2]; + for (i = 0; i < n_2_ccomplex_blocks; ++i) { + sum0[0] += in[0] * tp[0] + in[1] * tp[1]; + sum0[1] += (-in[0] * tp[1]) + in[1] * tp[0]; + sum1[0] += in[2] * tp[2] + in[3] * tp[3]; + sum1[1] += (-in[2] * tp[3]) + in[3] * tp[2]; - in += 4; - tp += 4; - } + in += 4; + tp += 4; + } - res[0] = sum0[0] + sum1[0]; - res[1] = sum0[1] + sum1[1]; + res[0] = sum0[0] + sum1[0]; + res[1] = sum0[1] + sum1[1]; - if (num_bytes >> 3 & 1) { - *result += input[(num_bytes >> 3) - 1] * lv_conj(taps[(num_bytes >> 3) - 1]); - } + if (num_bytes >> 3 & 1) { + *result += input[(num_bytes >> 3) - 1] * lv_conj(taps[(num_bytes >> 3) - 1]); + } } #endif /*LV_HAVE_GENERIC*/ @@ -426,256 +447,276 @@ static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a_generic(lv_32fc_t* res #if LV_HAVE_SSE && LV_HAVE_64 -static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a_sse(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) { - - const unsigned int num_bytes = num_points*8; - - __VOLK_ATTR_ALIGNED(16) static const uint32_t conjugator[4]= {0x00000000, 0x80000000, 0x00000000, 0x80000000}; - - __VOLK_ASM __VOLK_VOLATILE - ( - "# ccomplex_conjugate_dotprod_generic (float* result, const float *input,\n\t" - "# const float *taps, unsigned num_bytes)\n\t" - "# float sum0 = 0;\n\t" - "# float sum1 = 0;\n\t" - "# float sum2 = 0;\n\t" - "# float sum3 = 0;\n\t" - "# do {\n\t" - "# sum0 += input[0] * taps[0] - input[1] * taps[1];\n\t" - "# sum1 += input[0] * taps[1] + input[1] * taps[0];\n\t" - "# sum2 += input[2] * taps[2] - input[3] * taps[3];\n\t" - "# sum3 += input[2] * taps[3] + input[3] * taps[2];\n\t" - "# input += 4;\n\t" - "# taps += 4; \n\t" - "# } while (--n_2_ccomplex_blocks != 0);\n\t" - "# result[0] = sum0 + sum2;\n\t" - "# result[1] = sum1 + sum3;\n\t" - "# TODO: prefetch and better scheduling\n\t" - " xor %%r9, %%r9\n\t" - " xor %%r10, %%r10\n\t" - " movq %[conjugator], %%r9\n\t" - " movq %%rcx, %%rax\n\t" - " movaps 0(%%r9), %%xmm8\n\t" - " 
movq %%rcx, %%r8\n\t" - " movq %[rsi], %%r9\n\t" - " movq %[rdx], %%r10\n\t" - " xorps %%xmm6, %%xmm6 # zero accumulators\n\t" - " movaps 0(%%r9), %%xmm0\n\t" - " xorps %%xmm7, %%xmm7 # zero accumulators\n\t" - " movups 0(%%r10), %%xmm2\n\t" - " shr $5, %%rax # rax = n_2_ccomplex_blocks / 2\n\t" - " shr $4, %%r8\n\t" - " xorps %%xmm8, %%xmm2\n\t" - " jmp .%=L1_test\n\t" - " # 4 taps / loop\n\t" - " # something like ?? cycles / loop\n\t" - ".%=Loop1: \n\t" - "# complex prod: C += A * B, w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t" - "# movaps (%%r9), %%xmmA\n\t" - "# movaps (%%r10), %%xmmB\n\t" - "# movaps %%xmmA, %%xmmZ\n\t" - "# shufps $0xb1, %%xmmZ, %%xmmZ # swap internals\n\t" - "# mulps %%xmmB, %%xmmA\n\t" - "# mulps %%xmmZ, %%xmmB\n\t" - "# # SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t" - "# xorps %%xmmPN, %%xmmA\n\t" - "# movaps %%xmmA, %%xmmZ\n\t" - "# unpcklps %%xmmB, %%xmmA\n\t" - "# unpckhps %%xmmB, %%xmmZ\n\t" - "# movaps %%xmmZ, %%xmmY\n\t" - "# shufps $0x44, %%xmmA, %%xmmZ # b01000100\n\t" - "# shufps $0xee, %%xmmY, %%xmmA # b11101110\n\t" - "# addps %%xmmZ, %%xmmA\n\t" - "# addps %%xmmA, %%xmmC\n\t" - "# A=xmm0, B=xmm2, Z=xmm4\n\t" - "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t" - " movaps 16(%%r9), %%xmm1\n\t" - " movaps %%xmm0, %%xmm4\n\t" - " mulps %%xmm2, %%xmm0\n\t" - " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" - " movaps 16(%%r10), %%xmm3\n\t" - " movaps %%xmm1, %%xmm5\n\t" - " xorps %%xmm8, %%xmm3\n\t" - " addps %%xmm0, %%xmm6\n\t" - " mulps %%xmm3, %%xmm1\n\t" - " shufps $0xb1, %%xmm5, %%xmm5 # swap internals\n\t" - " addps %%xmm1, %%xmm6\n\t" - " mulps %%xmm4, %%xmm2\n\t" - " movaps 32(%%r9), %%xmm0\n\t" - " addps %%xmm2, %%xmm7\n\t" - " mulps %%xmm5, %%xmm3\n\t" - " add $32, %%r9\n\t" - " movaps 32(%%r10), %%xmm2\n\t" - " addps %%xmm3, %%xmm7\n\t" - " add $32, %%r10\n\t" - " xorps %%xmm8, %%xmm2\n\t" - ".%=L1_test:\n\t" - " dec %%rax\n\t" - " jge .%=Loop1\n\t" - " # We've handled the bulk of multiplies up to here.\n\t" - " # Let's sse if original n_2_ccomplex_blocks was odd.\n\t" - " # If so, we've got 2 more taps to do.\n\t" - " and $1, %%r8\n\t" - " je .%=Leven\n\t" - " # The count was odd, do 2 more taps.\n\t" - " # Note that we've already got mm0/mm2 preloaded\n\t" - " # from the main loop.\n\t" - " movaps %%xmm0, %%xmm4\n\t" - " mulps %%xmm2, %%xmm0\n\t" - " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" - " addps %%xmm0, %%xmm6\n\t" - " mulps %%xmm4, %%xmm2\n\t" - " addps %%xmm2, %%xmm7\n\t" - ".%=Leven:\n\t" - " # neg inversor\n\t" - " xorps %%xmm1, %%xmm1\n\t" - " mov $0x80000000, %%r9\n\t" - " movd %%r9, %%xmm1\n\t" - " shufps $0x11, %%xmm1, %%xmm1 # b00010001 # 0 -0 0 -0\n\t" - " # pfpnacc\n\t" - " xorps %%xmm1, %%xmm6\n\t" - " movaps %%xmm6, %%xmm2\n\t" - " unpcklps %%xmm7, %%xmm6\n\t" - " unpckhps %%xmm7, %%xmm2\n\t" - " movaps %%xmm2, %%xmm3\n\t" - " shufps $0x44, %%xmm6, %%xmm2 # b01000100\n\t" - " shufps $0xee, %%xmm3, %%xmm6 # b11101110\n\t" - " addps %%xmm2, %%xmm6\n\t" - " # xmm6 = r1 i2 r3 i4\n\t" - " movhlps %%xmm6, %%xmm4 # xmm4 = r3 i4 ?? ??\n\t" - " addps %%xmm4, %%xmm6 # xmm6 = r1+r3 i2+i4 ?? 
??\n\t" - " movlps %%xmm6, (%[rdi]) # store low 2x32 bits (complex) to memory\n\t" - : - :[rsi] "r" (input), [rdx] "r" (taps), "c" (num_bytes), [rdi] "r" (result), [conjugator] "r" (conjugator) - :"rax", "r8", "r9", "r10" - ); - - int getem = num_bytes % 16; - - for(; getem > 0; getem -= 8) { - *result += (input[(num_bytes >> 3) - 1] * lv_conj(taps[(num_bytes >> 3) - 1])); - } +static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a_sse(lv_32fc_t* result, + const lv_32fc_t* input, + const lv_32fc_t* taps, + unsigned int num_points) +{ + + const unsigned int num_bytes = num_points * 8; + + __VOLK_ATTR_ALIGNED(16) + static const uint32_t conjugator[4] = { + 0x00000000, 0x80000000, 0x00000000, 0x80000000 + }; + + __VOLK_ASM __VOLK_VOLATILE( + "# ccomplex_conjugate_dotprod_generic (float* result, const float *input,\n\t" + "# const float *taps, unsigned num_bytes)\n\t" + "# float sum0 = 0;\n\t" + "# float sum1 = 0;\n\t" + "# float sum2 = 0;\n\t" + "# float sum3 = 0;\n\t" + "# do {\n\t" + "# sum0 += input[0] * taps[0] - input[1] * taps[1];\n\t" + "# sum1 += input[0] * taps[1] + input[1] * taps[0];\n\t" + "# sum2 += input[2] * taps[2] - input[3] * taps[3];\n\t" + "# sum3 += input[2] * taps[3] + input[3] * taps[2];\n\t" + "# input += 4;\n\t" + "# taps += 4; \n\t" + "# } while (--n_2_ccomplex_blocks != 0);\n\t" + "# result[0] = sum0 + sum2;\n\t" + "# result[1] = sum1 + sum3;\n\t" + "# TODO: prefetch and better scheduling\n\t" + " xor %%r9, %%r9\n\t" + " xor %%r10, %%r10\n\t" + " movq %[conjugator], %%r9\n\t" + " movq %%rcx, %%rax\n\t" + " movaps 0(%%r9), %%xmm8\n\t" + " movq %%rcx, %%r8\n\t" + " movq %[rsi], %%r9\n\t" + " movq %[rdx], %%r10\n\t" + " xorps %%xmm6, %%xmm6 # zero accumulators\n\t" + " movaps 0(%%r9), %%xmm0\n\t" + " xorps %%xmm7, %%xmm7 # zero accumulators\n\t" + " movups 0(%%r10), %%xmm2\n\t" + " shr $5, %%rax # rax = n_2_ccomplex_blocks / 2\n\t" + " shr $4, %%r8\n\t" + " xorps %%xmm8, %%xmm2\n\t" + " jmp .%=L1_test\n\t" + " # 4 taps / loop\n\t" + " # something like ?? 
cycles / loop\n\t" + ".%=Loop1: \n\t" + "# complex prod: C += A * B, w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t" + "# movaps (%%r9), %%xmmA\n\t" + "# movaps (%%r10), %%xmmB\n\t" + "# movaps %%xmmA, %%xmmZ\n\t" + "# shufps $0xb1, %%xmmZ, %%xmmZ # swap internals\n\t" + "# mulps %%xmmB, %%xmmA\n\t" + "# mulps %%xmmZ, %%xmmB\n\t" + "# # SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t" + "# xorps %%xmmPN, %%xmmA\n\t" + "# movaps %%xmmA, %%xmmZ\n\t" + "# unpcklps %%xmmB, %%xmmA\n\t" + "# unpckhps %%xmmB, %%xmmZ\n\t" + "# movaps %%xmmZ, %%xmmY\n\t" + "# shufps $0x44, %%xmmA, %%xmmZ # b01000100\n\t" + "# shufps $0xee, %%xmmY, %%xmmA # b11101110\n\t" + "# addps %%xmmZ, %%xmmA\n\t" + "# addps %%xmmA, %%xmmC\n\t" + "# A=xmm0, B=xmm2, Z=xmm4\n\t" + "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t" + " movaps 16(%%r9), %%xmm1\n\t" + " movaps %%xmm0, %%xmm4\n\t" + " mulps %%xmm2, %%xmm0\n\t" + " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" + " movaps 16(%%r10), %%xmm3\n\t" + " movaps %%xmm1, %%xmm5\n\t" + " xorps %%xmm8, %%xmm3\n\t" + " addps %%xmm0, %%xmm6\n\t" + " mulps %%xmm3, %%xmm1\n\t" + " shufps $0xb1, %%xmm5, %%xmm5 # swap internals\n\t" + " addps %%xmm1, %%xmm6\n\t" + " mulps %%xmm4, %%xmm2\n\t" + " movaps 32(%%r9), %%xmm0\n\t" + " addps %%xmm2, %%xmm7\n\t" + " mulps %%xmm5, %%xmm3\n\t" + " add $32, %%r9\n\t" + " movaps 32(%%r10), %%xmm2\n\t" + " addps %%xmm3, %%xmm7\n\t" + " add $32, %%r10\n\t" + " xorps %%xmm8, %%xmm2\n\t" + ".%=L1_test:\n\t" + " dec %%rax\n\t" + " jge .%=Loop1\n\t" + " # We've handled the bulk of multiplies up to here.\n\t" + " # Let's sse if original n_2_ccomplex_blocks was odd.\n\t" + " # If so, we've got 2 more taps to do.\n\t" + " and $1, %%r8\n\t" + " je .%=Leven\n\t" + " # The count was odd, do 2 more taps.\n\t" + " # Note that we've already got mm0/mm2 preloaded\n\t" + " # from the main loop.\n\t" + " movaps %%xmm0, %%xmm4\n\t" + " mulps %%xmm2, %%xmm0\n\t" + " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" + " addps %%xmm0, %%xmm6\n\t" + " mulps %%xmm4, %%xmm2\n\t" + " addps %%xmm2, %%xmm7\n\t" + ".%=Leven:\n\t" + " # neg inversor\n\t" + " xorps %%xmm1, %%xmm1\n\t" + " mov $0x80000000, %%r9\n\t" + " movd %%r9, %%xmm1\n\t" + " shufps $0x11, %%xmm1, %%xmm1 # b00010001 # 0 -0 0 -0\n\t" + " # pfpnacc\n\t" + " xorps %%xmm1, %%xmm6\n\t" + " movaps %%xmm6, %%xmm2\n\t" + " unpcklps %%xmm7, %%xmm6\n\t" + " unpckhps %%xmm7, %%xmm2\n\t" + " movaps %%xmm2, %%xmm3\n\t" + " shufps $0x44, %%xmm6, %%xmm2 # b01000100\n\t" + " shufps $0xee, %%xmm3, %%xmm6 # b11101110\n\t" + " addps %%xmm2, %%xmm6\n\t" + " # xmm6 = r1 i2 r3 i4\n\t" + " movhlps %%xmm6, %%xmm4 # xmm4 = r3 i4 ?? ??\n\t" + " addps %%xmm4, %%xmm6 # xmm6 = r1+r3 i2+i4 ?? 
??\n\t" + " movlps %%xmm6, (%[rdi]) # store low 2x32 bits (complex) " + "to memory\n\t" + : + : [rsi] "r"(input), + [rdx] "r"(taps), + "c"(num_bytes), + [rdi] "r"(result), + [conjugator] "r"(conjugator) + : "rax", "r8", "r9", "r10"); + + int getem = num_bytes % 16; + + for (; getem > 0; getem -= 8) { + *result += (input[(num_bytes >> 3) - 1] * lv_conj(taps[(num_bytes >> 3) - 1])); + } } #endif #if LV_HAVE_SSE && LV_HAVE_32 -static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a_sse_32(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) { - - const unsigned int num_bytes = num_points*8; - - __VOLK_ATTR_ALIGNED(16) static const uint32_t conjugator[4]= {0x00000000, 0x80000000, 0x00000000, 0x80000000}; - - int bound = num_bytes >> 4; - int leftovers = num_bytes % 16; - - __VOLK_ASM __VOLK_VOLATILE - ( - " #pushl %%ebp\n\t" - " #movl %%esp, %%ebp\n\t" - " #movl 12(%%ebp), %%eax # input\n\t" - " #movl 16(%%ebp), %%edx # taps\n\t" - " #movl 20(%%ebp), %%ecx # n_bytes\n\t" - " movaps 0(%[conjugator]), %%xmm1\n\t" - " xorps %%xmm6, %%xmm6 # zero accumulators\n\t" - " movaps 0(%[eax]), %%xmm0\n\t" - " xorps %%xmm7, %%xmm7 # zero accumulators\n\t" - " movaps 0(%[edx]), %%xmm2\n\t" - " movl %[ecx], (%[out])\n\t" - " shrl $5, %[ecx] # ecx = n_2_ccomplex_blocks / 2\n\t" - - " xorps %%xmm1, %%xmm2\n\t" - " jmp .%=L1_test\n\t" - " # 4 taps / loop\n\t" - " # something like ?? cycles / loop\n\t" - ".%=Loop1: \n\t" - "# complex prod: C += A * B, w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t" - "# movaps (%[eax]), %%xmmA\n\t" - "# movaps (%[edx]), %%xmmB\n\t" - "# movaps %%xmmA, %%xmmZ\n\t" - "# shufps $0xb1, %%xmmZ, %%xmmZ # swap internals\n\t" - "# mulps %%xmmB, %%xmmA\n\t" - "# mulps %%xmmZ, %%xmmB\n\t" - "# # SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t" - "# xorps %%xmmPN, %%xmmA\n\t" - "# movaps %%xmmA, %%xmmZ\n\t" - "# unpcklps %%xmmB, %%xmmA\n\t" - "# unpckhps %%xmmB, %%xmmZ\n\t" - "# movaps %%xmmZ, %%xmmY\n\t" - "# shufps $0x44, %%xmmA, %%xmmZ # b01000100\n\t" - "# shufps $0xee, %%xmmY, %%xmmA # b11101110\n\t" - "# addps %%xmmZ, %%xmmA\n\t" - "# addps %%xmmA, %%xmmC\n\t" - "# A=xmm0, B=xmm2, Z=xmm4\n\t" - "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t" - " movaps 16(%[edx]), %%xmm3\n\t" - " movaps %%xmm0, %%xmm4\n\t" - " xorps %%xmm1, %%xmm3\n\t" - " mulps %%xmm2, %%xmm0\n\t" - " movaps 16(%[eax]), %%xmm1\n\t" - " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" - " movaps %%xmm1, %%xmm5\n\t" - " addps %%xmm0, %%xmm6\n\t" - " mulps %%xmm3, %%xmm1\n\t" - " shufps $0xb1, %%xmm5, %%xmm5 # swap internals\n\t" - " addps %%xmm1, %%xmm6\n\t" - " movaps 0(%[conjugator]), %%xmm1\n\t" - " mulps %%xmm4, %%xmm2\n\t" - " movaps 32(%[eax]), %%xmm0\n\t" - " addps %%xmm2, %%xmm7\n\t" - " mulps %%xmm5, %%xmm3\n\t" - " addl $32, %[eax]\n\t" - " movaps 32(%[edx]), %%xmm2\n\t" - " addps %%xmm3, %%xmm7\n\t" - " xorps %%xmm1, %%xmm2\n\t" - " addl $32, %[edx]\n\t" - ".%=L1_test:\n\t" - " decl %[ecx]\n\t" - " jge .%=Loop1\n\t" - " # We've handled the bulk of multiplies up to here.\n\t" - " # Let's sse if original n_2_ccomplex_blocks was odd.\n\t" - " # If so, we've got 2 more taps to do.\n\t" - " movl 0(%[out]), %[ecx] # n_2_ccomplex_blocks\n\t" - " shrl $4, %[ecx]\n\t" - " andl $1, %[ecx]\n\t" - " je .%=Leven\n\t" - " # The count was odd, do 2 more taps.\n\t" - " # Note that we've already got mm0/mm2 preloaded\n\t" - " # from the main loop.\n\t" - " movaps %%xmm0, %%xmm4\n\t" - " mulps %%xmm2, %%xmm0\n\t" - " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" - " addps %%xmm0, 
%%xmm6\n\t" - " mulps %%xmm4, %%xmm2\n\t" - " addps %%xmm2, %%xmm7\n\t" - ".%=Leven:\n\t" - " # neg inversor\n\t" - " #movl 8(%%ebp), %[eax] \n\t" - " xorps %%xmm1, %%xmm1\n\t" - " movl $0x80000000, (%[out])\n\t" - " movss (%[out]), %%xmm1\n\t" - " shufps $0x11, %%xmm1, %%xmm1 # b00010001 # 0 -0 0 -0\n\t" - " # pfpnacc\n\t" - " xorps %%xmm1, %%xmm6\n\t" - " movaps %%xmm6, %%xmm2\n\t" - " unpcklps %%xmm7, %%xmm6\n\t" - " unpckhps %%xmm7, %%xmm2\n\t" - " movaps %%xmm2, %%xmm3\n\t" - " shufps $0x44, %%xmm6, %%xmm2 # b01000100\n\t" - " shufps $0xee, %%xmm3, %%xmm6 # b11101110\n\t" - " addps %%xmm2, %%xmm6\n\t" - " # xmm6 = r1 i2 r3 i4\n\t" - " #movl 8(%%ebp), %[eax] # @result\n\t" - " movhlps %%xmm6, %%xmm4 # xmm4 = r3 i4 ?? ??\n\t" - " addps %%xmm4, %%xmm6 # xmm6 = r1+r3 i2+i4 ?? ??\n\t" - " movlps %%xmm6, (%[out]) # store low 2x32 bits (complex) to memory\n\t" - " #popl %%ebp\n\t" - : - : [eax] "r" (input), [edx] "r" (taps), [ecx] "r" (num_bytes), [out] "r" (result), [conjugator] "r" (conjugator) - ); - - for(; leftovers > 0; leftovers -= 8) { - *result += (input[(bound << 1)] * lv_conj(taps[(bound << 1)])); - } +static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a_sse_32(lv_32fc_t* result, + const lv_32fc_t* input, + const lv_32fc_t* taps, + unsigned int num_points) +{ + + const unsigned int num_bytes = num_points * 8; + + __VOLK_ATTR_ALIGNED(16) + static const uint32_t conjugator[4] = { + 0x00000000, 0x80000000, 0x00000000, 0x80000000 + }; + + int bound = num_bytes >> 4; + int leftovers = num_bytes % 16; + + __VOLK_ASM __VOLK_VOLATILE( + " #pushl %%ebp\n\t" + " #movl %%esp, %%ebp\n\t" + " #movl 12(%%ebp), %%eax # input\n\t" + " #movl 16(%%ebp), %%edx # taps\n\t" + " #movl 20(%%ebp), %%ecx # n_bytes\n\t" + " movaps 0(%[conjugator]), %%xmm1\n\t" + " xorps %%xmm6, %%xmm6 # zero accumulators\n\t" + " movaps 0(%[eax]), %%xmm0\n\t" + " xorps %%xmm7, %%xmm7 # zero accumulators\n\t" + " movaps 0(%[edx]), %%xmm2\n\t" + " movl %[ecx], (%[out])\n\t" + " shrl $5, %[ecx] # ecx = n_2_ccomplex_blocks / 2\n\t" + + " xorps %%xmm1, %%xmm2\n\t" + " jmp .%=L1_test\n\t" + " # 4 taps / loop\n\t" + " # something like ?? 
cycles / loop\n\t" + ".%=Loop1: \n\t" + "# complex prod: C += A * B, w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t" + "# movaps (%[eax]), %%xmmA\n\t" + "# movaps (%[edx]), %%xmmB\n\t" + "# movaps %%xmmA, %%xmmZ\n\t" + "# shufps $0xb1, %%xmmZ, %%xmmZ # swap internals\n\t" + "# mulps %%xmmB, %%xmmA\n\t" + "# mulps %%xmmZ, %%xmmB\n\t" + "# # SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t" + "# xorps %%xmmPN, %%xmmA\n\t" + "# movaps %%xmmA, %%xmmZ\n\t" + "# unpcklps %%xmmB, %%xmmA\n\t" + "# unpckhps %%xmmB, %%xmmZ\n\t" + "# movaps %%xmmZ, %%xmmY\n\t" + "# shufps $0x44, %%xmmA, %%xmmZ # b01000100\n\t" + "# shufps $0xee, %%xmmY, %%xmmA # b11101110\n\t" + "# addps %%xmmZ, %%xmmA\n\t" + "# addps %%xmmA, %%xmmC\n\t" + "# A=xmm0, B=xmm2, Z=xmm4\n\t" + "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t" + " movaps 16(%[edx]), %%xmm3\n\t" + " movaps %%xmm0, %%xmm4\n\t" + " xorps %%xmm1, %%xmm3\n\t" + " mulps %%xmm2, %%xmm0\n\t" + " movaps 16(%[eax]), %%xmm1\n\t" + " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" + " movaps %%xmm1, %%xmm5\n\t" + " addps %%xmm0, %%xmm6\n\t" + " mulps %%xmm3, %%xmm1\n\t" + " shufps $0xb1, %%xmm5, %%xmm5 # swap internals\n\t" + " addps %%xmm1, %%xmm6\n\t" + " movaps 0(%[conjugator]), %%xmm1\n\t" + " mulps %%xmm4, %%xmm2\n\t" + " movaps 32(%[eax]), %%xmm0\n\t" + " addps %%xmm2, %%xmm7\n\t" + " mulps %%xmm5, %%xmm3\n\t" + " addl $32, %[eax]\n\t" + " movaps 32(%[edx]), %%xmm2\n\t" + " addps %%xmm3, %%xmm7\n\t" + " xorps %%xmm1, %%xmm2\n\t" + " addl $32, %[edx]\n\t" + ".%=L1_test:\n\t" + " decl %[ecx]\n\t" + " jge .%=Loop1\n\t" + " # We've handled the bulk of multiplies up to here.\n\t" + " # Let's sse if original n_2_ccomplex_blocks was odd.\n\t" + " # If so, we've got 2 more taps to do.\n\t" + " movl 0(%[out]), %[ecx] # n_2_ccomplex_blocks\n\t" + " shrl $4, %[ecx]\n\t" + " andl $1, %[ecx]\n\t" + " je .%=Leven\n\t" + " # The count was odd, do 2 more taps.\n\t" + " # Note that we've already got mm0/mm2 preloaded\n\t" + " # from the main loop.\n\t" + " movaps %%xmm0, %%xmm4\n\t" + " mulps %%xmm2, %%xmm0\n\t" + " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" + " addps %%xmm0, %%xmm6\n\t" + " mulps %%xmm4, %%xmm2\n\t" + " addps %%xmm2, %%xmm7\n\t" + ".%=Leven:\n\t" + " # neg inversor\n\t" + " #movl 8(%%ebp), %[eax] \n\t" + " xorps %%xmm1, %%xmm1\n\t" + " movl $0x80000000, (%[out])\n\t" + " movss (%[out]), %%xmm1\n\t" + " shufps $0x11, %%xmm1, %%xmm1 # b00010001 # 0 -0 0 -0\n\t" + " # pfpnacc\n\t" + " xorps %%xmm1, %%xmm6\n\t" + " movaps %%xmm6, %%xmm2\n\t" + " unpcklps %%xmm7, %%xmm6\n\t" + " unpckhps %%xmm7, %%xmm2\n\t" + " movaps %%xmm2, %%xmm3\n\t" + " shufps $0x44, %%xmm6, %%xmm2 # b01000100\n\t" + " shufps $0xee, %%xmm3, %%xmm6 # b11101110\n\t" + " addps %%xmm2, %%xmm6\n\t" + " # xmm6 = r1 i2 r3 i4\n\t" + " #movl 8(%%ebp), %[eax] # @result\n\t" + " movhlps %%xmm6, %%xmm4 # xmm4 = r3 i4 ?? ??\n\t" + " addps %%xmm4, %%xmm6 # xmm6 = r1+r3 i2+i4 ?? 
??\n\t" + " movlps %%xmm6, (%[out]) # store low 2x32 bits (complex) " + "to memory\n\t" + " #popl %%ebp\n\t" + : + : [eax] "r"(input), + [edx] "r"(taps), + [ecx] "r"(num_bytes), + [out] "r"(result), + [conjugator] "r"(conjugator)); + + for (; leftovers > 0; leftovers -= 8) { + *result += (input[(bound << 1)] * lv_conj(taps[(bound << 1)])); + } } #endif /*LV_HAVE_SSE*/ diff --git a/kernels/volk/volk_32fc_x2_divide_32fc.h b/kernels/volk/volk_32fc_x2_divide_32fc.h index 3ce6ede..78c245a 100644 --- a/kernels/volk/volk_32fc_x2_divide_32fc.h +++ b/kernels/volk/volk_32fc_x2_divide_32fc.h @@ -29,8 +29,8 @@ * * Dispatcher Prototype * \code - * void volk_32fc_x2_divide_32fc(lv_32fc_t* cVector, const lv_32fc_t* numeratorVector, const lv_32fc_t* denumeratorVector, unsigned int num_points); - * \endcode + * void volk_32fc_x2_divide_32fc(lv_32fc_t* cVector, const lv_32fc_t* numeratorVector, + * const lv_32fc_t* denumeratorVector, unsigned int num_points); \endcode * * \b Inputs * \li numeratorVector: The numerator complex values. @@ -41,7 +41,8 @@ * \li outputVector: The output vector complex floats. * * \b Example - * divide a complex vector by itself, demonstrating the result should be pretty close to 1+0j. + * divide a complex vector by itself, demonstrating the result should be pretty close to + * 1+0j. * * \code * int N = 10; @@ -71,17 +72,18 @@ #ifndef INCLUDED_volk_32fc_x2_divide_32fc_u_H #define INCLUDED_volk_32fc_x2_divide_32fc_u_H +#include #include #include -#include #ifdef LV_HAVE_SSE3 #include #include -static inline void -volk_32fc_x2_divide_32fc_u_sse3(lv_32fc_t* cVector, const lv_32fc_t* numeratorVector, - const lv_32fc_t* denumeratorVector, unsigned int num_points) +static inline void volk_32fc_x2_divide_32fc_u_sse3(lv_32fc_t* cVector, + const lv_32fc_t* numeratorVector, + const lv_32fc_t* denumeratorVector, + unsigned int num_points) { /* * we'll do the "classical" @@ -89,44 +91,46 @@ volk_32fc_x2_divide_32fc_u_sse3(lv_32fc_t* cVector, const lv_32fc_t* numeratorVe * --- = ------- * b |b|^2 * */ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - __m128 num01, num23, den01, den23, norm, result; - lv_32fc_t* c = cVector; - const lv_32fc_t* a = numeratorVector; - const lv_32fc_t* b = denumeratorVector; - - for(; number < quarterPoints; number++){ - num01 = _mm_loadu_ps((float*) a); // first pair - den01 = _mm_loadu_ps((float*) b); // first pair - num01 = _mm_complexconjugatemul_ps(num01, den01); // a conj(b) - a += 2; - b += 2; - - num23 = _mm_loadu_ps((float*) a); // second pair - den23 = _mm_loadu_ps((float*) b); // second pair - num23 = _mm_complexconjugatemul_ps(num23, den23); // a conj(b) - a += 2; - b += 2; - - norm = _mm_magnitudesquared_ps_sse3(den01, den23); - den01 = _mm_unpacklo_ps(norm,norm); - den23 = _mm_unpackhi_ps(norm,norm); - - result = _mm_div_ps(num01, den01); - _mm_storeu_ps((float*) c, result); // Store the results back into the C container - c += 2; - result = _mm_div_ps(num23, den23); - _mm_storeu_ps((float*) c, result); // Store the results back into the C container - c += 2; - } - - number *= 4; - for(;number < num_points; number++){ - *c = (*a) / (*b); - a++; b++; c++; - } + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + __m128 num01, num23, den01, den23, norm, result; + lv_32fc_t* c = cVector; + const lv_32fc_t* a = numeratorVector; + const lv_32fc_t* b = denumeratorVector; + + for (; number < quarterPoints; number++) { + num01 = _mm_loadu_ps((float*)a); // first pair + den01 = 
_mm_loadu_ps((float*)b); // first pair + num01 = _mm_complexconjugatemul_ps(num01, den01); // a conj(b) + a += 2; + b += 2; + + num23 = _mm_loadu_ps((float*)a); // second pair + den23 = _mm_loadu_ps((float*)b); // second pair + num23 = _mm_complexconjugatemul_ps(num23, den23); // a conj(b) + a += 2; + b += 2; + + norm = _mm_magnitudesquared_ps_sse3(den01, den23); + den01 = _mm_unpacklo_ps(norm, norm); + den23 = _mm_unpackhi_ps(norm, norm); + + result = _mm_div_ps(num01, den01); + _mm_storeu_ps((float*)c, result); // Store the results back into the C container + c += 2; + result = _mm_div_ps(num23, den23); + _mm_storeu_ps((float*)c, result); // Store the results back into the C container + c += 2; + } + + number *= 4; + for (; number < num_points; number++) { + *c = (*a) / (*b); + a++; + b++; + c++; + } } #endif /* LV_HAVE_SSE3 */ @@ -135,9 +139,10 @@ volk_32fc_x2_divide_32fc_u_sse3(lv_32fc_t* cVector, const lv_32fc_t* numeratorVe #include #include -static inline void -volk_32fc_x2_divide_32fc_u_avx(lv_32fc_t* cVector, const lv_32fc_t* numeratorVector, - const lv_32fc_t* denumeratorVector, unsigned int num_points) +static inline void volk_32fc_x2_divide_32fc_u_avx(lv_32fc_t* cVector, + const lv_32fc_t* numeratorVector, + const lv_32fc_t* denumeratorVector, + unsigned int num_points) { /* * we'll do the "classical" @@ -153,17 +158,21 @@ volk_32fc_x2_divide_32fc_u_avx(lv_32fc_t* cVector, const lv_32fc_t* numeratorVec const lv_32fc_t* a = numeratorVector; const lv_32fc_t* b = denumeratorVector; - for(; number < quarterPoints; number++){ - num = _mm256_loadu_ps((float*) a); // Load the ar + ai, br + bi ... as ar,ai,br,bi ... - denum = _mm256_loadu_ps((float*) b); // Load the cr + ci, dr + di ... as cr,ci,dr,di ... + for (; number < quarterPoints; number++) { + num = _mm256_loadu_ps( + (float*)a); // Load the ar + ai, br + bi ... as ar,ai,br,bi ... + denum = _mm256_loadu_ps( + (float*)b); // Load the cr + ci, dr + di ... as cr,ci,dr,di ... 
mul_conj = _mm256_complexconjugatemul_ps(num, denum); sq = _mm256_mul_ps(denum, denum); // Square the values - mag_sq_un = _mm256_hadd_ps(sq,sq); // obtain the actual squared magnitude, although out of order + mag_sq_un = _mm256_hadd_ps( + sq, sq); // obtain the actual squared magnitude, although out of order mag_sq = _mm256_permute_ps(mag_sq_un, 0xd8); // I order them - // best guide I found on using these functions: https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=2738,2059,2738,2738,3875,3874,3875,2738,3870 - div = _mm256_div_ps(mul_conj,mag_sq); + // best guide I found on using these functions: + // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=2738,2059,2738,2738,3875,3874,3875,2738,3870 + div = _mm256_div_ps(mul_conj, mag_sq); - _mm256_storeu_ps((float*) c, div); // Store the results back into the C container + _mm256_storeu_ps((float*)c, div); // Store the results back into the C container a += 4; b += 4; @@ -172,51 +181,51 @@ volk_32fc_x2_divide_32fc_u_avx(lv_32fc_t* cVector, const lv_32fc_t* numeratorVec number = quarterPoints * 4; - for(; number < num_points; number++){ + for (; number < num_points; number++) { *c++ = (*a++) / (*b++); } - } #endif /* LV_HAVE_AVX */ #ifdef LV_HAVE_GENERIC -static inline void -volk_32fc_x2_divide_32fc_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, - const lv_32fc_t* bVector, unsigned int num_points) +static inline void volk_32fc_x2_divide_32fc_generic(lv_32fc_t* cVector, + const lv_32fc_t* aVector, + const lv_32fc_t* bVector, + unsigned int num_points) { - lv_32fc_t* cPtr = cVector; - const lv_32fc_t* aPtr = aVector; - const lv_32fc_t* bPtr= bVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - *cPtr++ = (*aPtr++) / (*bPtr++); - } + lv_32fc_t* cPtr = cVector; + const lv_32fc_t* aPtr = aVector; + const lv_32fc_t* bPtr = bVector; + unsigned int number = 0; + + for (number = 0; number < num_points; number++) { + *cPtr++ = (*aPtr++) / (*bPtr++); + } } #endif /* LV_HAVE_GENERIC */ - #endif /* INCLUDED_volk_32fc_x2_divide_32fc_u_H */ #ifndef INCLUDED_volk_32fc_x2_divide_32fc_a_H #define INCLUDED_volk_32fc_x2_divide_32fc_a_H +#include #include #include #include -#include #ifdef LV_HAVE_SSE3 #include #include -static inline void -volk_32fc_x2_divide_32fc_a_sse3(lv_32fc_t* cVector, const lv_32fc_t* numeratorVector, - const lv_32fc_t* denumeratorVector, unsigned int num_points) +static inline void volk_32fc_x2_divide_32fc_a_sse3(lv_32fc_t* cVector, + const lv_32fc_t* numeratorVector, + const lv_32fc_t* denumeratorVector, + unsigned int num_points) { /* * we'll do the "classical" @@ -224,45 +233,47 @@ volk_32fc_x2_divide_32fc_a_sse3(lv_32fc_t* cVector, const lv_32fc_t* numeratorVe * --- = ------- * b |b|^2 * */ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - __m128 num01, num23, den01, den23, norm, result; - lv_32fc_t* c = cVector; - const lv_32fc_t* a = numeratorVector; - const lv_32fc_t* b = denumeratorVector; - - for(; number < quarterPoints; number++){ - num01 = _mm_load_ps((float*) a); // first pair - den01 = _mm_load_ps((float*) b); // first pair - num01 = _mm_complexconjugatemul_ps(num01, den01); // a conj(b) - a += 2; - b += 2; - - num23 = _mm_load_ps((float*) a); // second pair - den23 = _mm_load_ps((float*) b); // second pair - num23 = _mm_complexconjugatemul_ps(num23, den23); // a conj(b) - a += 2; - b += 2; - - norm = _mm_magnitudesquared_ps_sse3(den01, den23); - - den01 = _mm_unpacklo_ps(norm,norm); // select the lower floats 
twice - den23 = _mm_unpackhi_ps(norm,norm); // select the upper floats twice - - result = _mm_div_ps(num01, den01); - _mm_store_ps((float*) c, result); // Store the results back into the C container - c += 2; - result = _mm_div_ps(num23, den23); - _mm_store_ps((float*) c, result); // Store the results back into the C container - c += 2; - } - - number *= 4; - for(;number < num_points; number++){ - *c = (*a) / (*b); - a++; b++; c++; - } + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + __m128 num01, num23, den01, den23, norm, result; + lv_32fc_t* c = cVector; + const lv_32fc_t* a = numeratorVector; + const lv_32fc_t* b = denumeratorVector; + + for (; number < quarterPoints; number++) { + num01 = _mm_load_ps((float*)a); // first pair + den01 = _mm_load_ps((float*)b); // first pair + num01 = _mm_complexconjugatemul_ps(num01, den01); // a conj(b) + a += 2; + b += 2; + + num23 = _mm_load_ps((float*)a); // second pair + den23 = _mm_load_ps((float*)b); // second pair + num23 = _mm_complexconjugatemul_ps(num23, den23); // a conj(b) + a += 2; + b += 2; + + norm = _mm_magnitudesquared_ps_sse3(den01, den23); + + den01 = _mm_unpacklo_ps(norm, norm); // select the lower floats twice + den23 = _mm_unpackhi_ps(norm, norm); // select the upper floats twice + + result = _mm_div_ps(num01, den01); + _mm_store_ps((float*)c, result); // Store the results back into the C container + c += 2; + result = _mm_div_ps(num23, den23); + _mm_store_ps((float*)c, result); // Store the results back into the C container + c += 2; + } + + number *= 4; + for (; number < num_points; number++) { + *c = (*a) / (*b); + a++; + b++; + c++; + } } #endif /* LV_HAVE_SSE */ @@ -270,9 +281,10 @@ volk_32fc_x2_divide_32fc_a_sse3(lv_32fc_t* cVector, const lv_32fc_t* numeratorVe #include #include -static inline void -volk_32fc_x2_divide_32fc_a_avx(lv_32fc_t* cVector, const lv_32fc_t* numeratorVector, - const lv_32fc_t* denumeratorVector, unsigned int num_points) +static inline void volk_32fc_x2_divide_32fc_a_avx(lv_32fc_t* cVector, + const lv_32fc_t* numeratorVector, + const lv_32fc_t* denumeratorVector, + unsigned int num_points) { /* * we'll do the "classical" @@ -288,17 +300,21 @@ volk_32fc_x2_divide_32fc_a_avx(lv_32fc_t* cVector, const lv_32fc_t* numeratorVec const lv_32fc_t* a = numeratorVector; const lv_32fc_t* b = denumeratorVector; - for(; number < quarterPoints; number++){ - num = _mm256_load_ps((float*) a); // Load the ar + ai, br + bi ... as ar,ai,br,bi ... - denum = _mm256_load_ps((float*) b); // Load the cr + ci, dr + di ... as cr,ci,dr,di ... + for (; number < quarterPoints; number++) { + num = + _mm256_load_ps((float*)a); // Load the ar + ai, br + bi ... as ar,ai,br,bi ... + denum = + _mm256_load_ps((float*)b); // Load the cr + ci, dr + di ... as cr,ci,dr,di ... 
mul_conj = _mm256_complexconjugatemul_ps(num, denum); sq = _mm256_mul_ps(denum, denum); // Square the values - mag_sq_un = _mm256_hadd_ps(sq,sq); // obtain the actual squared magnitude, although out of order + mag_sq_un = _mm256_hadd_ps( + sq, sq); // obtain the actual squared magnitude, although out of order mag_sq = _mm256_permute_ps(mag_sq_un, 0xd8); // I order them - // best guide I found on using these functions: https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=2738,2059,2738,2738,3875,3874,3875,2738,3870 - div = _mm256_div_ps(mul_conj,mag_sq); + // best guide I found on using these functions: + // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=2738,2059,2738,2738,3875,3874,3875,2738,3870 + div = _mm256_div_ps(mul_conj, mag_sq); - _mm256_store_ps((float*) c, div); // Store the results back into the C container + _mm256_store_ps((float*)c, div); // Store the results back into the C container a += 4; b += 4; @@ -307,78 +323,78 @@ volk_32fc_x2_divide_32fc_a_avx(lv_32fc_t* cVector, const lv_32fc_t* numeratorVec number = quarterPoints * 4; - for(; number < num_points; number++){ + for (; number < num_points; number++) { *c++ = (*a++) / (*b++); } - - } #endif /* LV_HAVE_AVX */ #ifdef LV_HAVE_NEON #include -static inline void -volk_32fc_x2_divide_32fc_neon(lv_32fc_t* cVector, const lv_32fc_t* aVector, - const lv_32fc_t* bVector, unsigned int num_points) +static inline void volk_32fc_x2_divide_32fc_neon(lv_32fc_t* cVector, + const lv_32fc_t* aVector, + const lv_32fc_t* bVector, + unsigned int num_points) { - lv_32fc_t* cPtr = cVector; - const lv_32fc_t* aPtr = aVector; - const lv_32fc_t* bPtr = bVector; - - float32x4x2_t aVal, bVal, cVal; - float32x4_t bAbs, bAbsInv; - - const unsigned int quarterPoints = num_points / 4; - unsigned int number = 0; - for(; number < quarterPoints; number++){ - aVal = vld2q_f32((const float*)(aPtr)); - bVal = vld2q_f32((const float*)(bPtr)); - aPtr += 4; - bPtr += 4; - __VOLK_PREFETCH(aPtr+4); - __VOLK_PREFETCH(bPtr+4); - - bAbs = vmulq_f32( bVal.val[0], bVal.val[0]); - bAbs = vmlaq_f32(bAbs, bVal.val[1], bVal.val[1]); - - bAbsInv = vrecpeq_f32(bAbs); - bAbsInv = vmulq_f32(bAbsInv, vrecpsq_f32(bAbsInv, bAbs)); - bAbsInv = vmulq_f32(bAbsInv, vrecpsq_f32(bAbsInv, bAbs)); - - cVal.val[0] = vmulq_f32( aVal.val[0], bVal.val[0]); - cVal.val[0] = vmlaq_f32(cVal.val[0], aVal.val[1], bVal.val[1]); - cVal.val[0] = vmulq_f32(cVal.val[0], bAbsInv); - - cVal.val[1] = vmulq_f32( aVal.val[1], bVal.val[0]); - cVal.val[1] = vmlsq_f32(cVal.val[1], aVal.val[0], bVal.val[1]); - cVal.val[1] = vmulq_f32(cVal.val[1], bAbsInv); - - vst2q_f32((float*)(cPtr), cVal); - cPtr += 4; - } - - for(number = quarterPoints * 4; number < num_points; number++){ - *cPtr++ = (*aPtr++) / (*bPtr++); - } + lv_32fc_t* cPtr = cVector; + const lv_32fc_t* aPtr = aVector; + const lv_32fc_t* bPtr = bVector; + + float32x4x2_t aVal, bVal, cVal; + float32x4_t bAbs, bAbsInv; + + const unsigned int quarterPoints = num_points / 4; + unsigned int number = 0; + for (; number < quarterPoints; number++) { + aVal = vld2q_f32((const float*)(aPtr)); + bVal = vld2q_f32((const float*)(bPtr)); + aPtr += 4; + bPtr += 4; + __VOLK_PREFETCH(aPtr + 4); + __VOLK_PREFETCH(bPtr + 4); + + bAbs = vmulq_f32(bVal.val[0], bVal.val[0]); + bAbs = vmlaq_f32(bAbs, bVal.val[1], bVal.val[1]); + + bAbsInv = vrecpeq_f32(bAbs); + bAbsInv = vmulq_f32(bAbsInv, vrecpsq_f32(bAbsInv, bAbs)); + bAbsInv = vmulq_f32(bAbsInv, vrecpsq_f32(bAbsInv, bAbs)); + + cVal.val[0] = vmulq_f32(aVal.val[0], bVal.val[0]); + 
cVal.val[0] = vmlaq_f32(cVal.val[0], aVal.val[1], bVal.val[1]); + cVal.val[0] = vmulq_f32(cVal.val[0], bAbsInv); + + cVal.val[1] = vmulq_f32(aVal.val[1], bVal.val[0]); + cVal.val[1] = vmlsq_f32(cVal.val[1], aVal.val[0], bVal.val[1]); + cVal.val[1] = vmulq_f32(cVal.val[1], bAbsInv); + + vst2q_f32((float*)(cPtr), cVal); + cPtr += 4; + } + + for (number = quarterPoints * 4; number < num_points; number++) { + *cPtr++ = (*aPtr++) / (*bPtr++); + } } #endif /* LV_HAVE_NEON */ #ifdef LV_HAVE_GENERIC -static inline void -volk_32fc_x2_divide_32fc_a_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, - const lv_32fc_t* bVector, unsigned int num_points) +static inline void volk_32fc_x2_divide_32fc_a_generic(lv_32fc_t* cVector, + const lv_32fc_t* aVector, + const lv_32fc_t* bVector, + unsigned int num_points) { - lv_32fc_t* cPtr = cVector; - const lv_32fc_t* aPtr = aVector; - const lv_32fc_t* bPtr= bVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - *cPtr++ = (*aPtr++) / (*bPtr++); - } + lv_32fc_t* cPtr = cVector; + const lv_32fc_t* aPtr = aVector; + const lv_32fc_t* bPtr = bVector; + unsigned int number = 0; + + for (number = 0; number < num_points; number++) { + *cPtr++ = (*aPtr++) / (*bPtr++); + } } #endif /* LV_HAVE_GENERIC */ diff --git a/kernels/volk/volk_32fc_x2_dot_prod_32fc.h b/kernels/volk/volk_32fc_x2_dot_prod_32fc.h index f4a4469..b0b7fee 100644 --- a/kernels/volk/volk_32fc_x2_dot_prod_32fc.h +++ b/kernels/volk/volk_32fc_x2_dot_prod_32fc.h @@ -33,8 +33,8 @@ * * Dispatcher Prototype * \code - * void volk_32fc_x2_dot_prod_32fc(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) - * \endcode + * void volk_32fc_x2_dot_prod_32fc(lv_32fc_t* result, const lv_32fc_t* input, const + * lv_32fc_t* taps, unsigned int num_points) \endcode * * \b Inputs * \li input: vector of complex floats. 
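[Editorial note, not part of the patch: the doc-comment above gives the dispatcher prototype for volk_32fc_x2_dot_prod_32fc. As a minimal usage sketch, assuming the usual <volk/volk.h> entry point together with the volk_malloc/volk_free and volk_get_alignment helpers declared elsewhere in this library, a caller might exercise the dispatcher like this:

  #include <stdio.h>
  #include <volk/volk.h>

  int main(void)
  {
      unsigned int num_points = 8;
      size_t alignment = volk_get_alignment();

      /* Aligned buffers so the dispatcher may pick an aligned proto-kernel. */
      lv_32fc_t* input = (lv_32fc_t*)volk_malloc(num_points * sizeof(lv_32fc_t), alignment);
      lv_32fc_t* taps = (lv_32fc_t*)volk_malloc(num_points * sizeof(lv_32fc_t), alignment);
      lv_32fc_t result = lv_cmake(0.f, 0.f);

      for (unsigned int i = 0; i < num_points; ++i) {
          input[i] = lv_cmake((float)i, (float)i); /* arbitrary test data */
          taps[i] = lv_cmake(1.f, 0.f);            /* unit taps: result is the sum of input */
      }

      /* Dispatcher selects the best available implementation (generic, SSE3, AVX, ...). */
      volk_32fc_x2_dot_prod_32fc(&result, input, taps, num_points);

      printf("dot product = %f + %fi\n", lv_creal(result), lv_cimag(result));

      volk_free(input);
      volk_free(taps);
      return 0;
  }

The clang-format changes in the hunks below only re-wrap and re-indent the proto-kernels; the dispatcher call above is unaffected by them.]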
@@ -58,236 +58,246 @@ #ifndef INCLUDED_volk_32fc_x2_dot_prod_32fc_u_H #define INCLUDED_volk_32fc_x2_dot_prod_32fc_u_H -#include -#include #include #include +#include +#include #ifdef LV_HAVE_GENERIC -static inline void volk_32fc_x2_dot_prod_32fc_generic(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) { +static inline void volk_32fc_x2_dot_prod_32fc_generic(lv_32fc_t* result, + const lv_32fc_t* input, + const lv_32fc_t* taps, + unsigned int num_points) +{ - float * res = (float*) result; - float * in = (float*) input; - float * tp = (float*) taps; - unsigned int n_2_ccomplex_blocks = num_points/2; + float* res = (float*)result; + float* in = (float*)input; + float* tp = (float*)taps; + unsigned int n_2_ccomplex_blocks = num_points / 2; - float sum0[2] = {0,0}; - float sum1[2] = {0,0}; - unsigned int i = 0; + float sum0[2] = { 0, 0 }; + float sum1[2] = { 0, 0 }; + unsigned int i = 0; - for(i = 0; i < n_2_ccomplex_blocks; ++i) { - sum0[0] += in[0] * tp[0] - in[1] * tp[1]; - sum0[1] += in[0] * tp[1] + in[1] * tp[0]; - sum1[0] += in[2] * tp[2] - in[3] * tp[3]; - sum1[1] += in[2] * tp[3] + in[3] * tp[2]; + for (i = 0; i < n_2_ccomplex_blocks; ++i) { + sum0[0] += in[0] * tp[0] - in[1] * tp[1]; + sum0[1] += in[0] * tp[1] + in[1] * tp[0]; + sum1[0] += in[2] * tp[2] - in[3] * tp[3]; + sum1[1] += in[2] * tp[3] + in[3] * tp[2]; - in += 4; - tp += 4; - } + in += 4; + tp += 4; + } - res[0] = sum0[0] + sum1[0]; - res[1] = sum0[1] + sum1[1]; + res[0] = sum0[0] + sum1[0]; + res[1] = sum0[1] + sum1[1]; - // Cleanup if we had an odd number of points - if (num_points & 1) { - *result += input[num_points - 1] * taps[num_points - 1]; - } + // Cleanup if we had an odd number of points + if (num_points & 1) { + *result += input[num_points - 1] * taps[num_points - 1]; + } } #endif /*LV_HAVE_GENERIC*/ - #if LV_HAVE_SSE && LV_HAVE_64 -static inline void volk_32fc_x2_dot_prod_32fc_u_sse_64(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) { - - const unsigned int num_bytes = num_points*8; - unsigned int isodd = num_points & 1; - - __VOLK_ASM - ( - "# ccomplex_dotprod_generic (float* result, const float *input,\n\t" - "# const float *taps, unsigned num_bytes)\n\t" - "# float sum0 = 0;\n\t" - "# float sum1 = 0;\n\t" - "# float sum2 = 0;\n\t" - "# float sum3 = 0;\n\t" - "# do {\n\t" - "# sum0 += input[0] * taps[0] - input[1] * taps[1];\n\t" - "# sum1 += input[0] * taps[1] + input[1] * taps[0];\n\t" - "# sum2 += input[2] * taps[2] - input[3] * taps[3];\n\t" - "# sum3 += input[2] * taps[3] + input[3] * taps[2];\n\t" - "# input += 4;\n\t" - "# taps += 4; \n\t" - "# } while (--n_2_ccomplex_blocks != 0);\n\t" - "# result[0] = sum0 + sum2;\n\t" - "# result[1] = sum1 + sum3;\n\t" - "# TODO: prefetch and better scheduling\n\t" - " xor %%r9, %%r9\n\t" - " xor %%r10, %%r10\n\t" - " movq %%rcx, %%rax\n\t" - " movq %%rcx, %%r8\n\t" - " movq %[rsi], %%r9\n\t" - " movq %[rdx], %%r10\n\t" - " xorps %%xmm6, %%xmm6 # zero accumulators\n\t" - " movups 0(%%r9), %%xmm0\n\t" - " xorps %%xmm7, %%xmm7 # zero accumulators\n\t" - " movups 0(%%r10), %%xmm2\n\t" - " shr $5, %%rax # rax = n_2_ccomplex_blocks / 2\n\t" - " shr $4, %%r8\n\t" - " jmp .%=L1_test\n\t" - " # 4 taps / loop\n\t" - " # something like ?? 
cycles / loop\n\t" - ".%=Loop1: \n\t" - "# complex prod: C += A * B, w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t" - "# movups (%%r9), %%xmmA\n\t" - "# movups (%%r10), %%xmmB\n\t" - "# movups %%xmmA, %%xmmZ\n\t" - "# shufps $0xb1, %%xmmZ, %%xmmZ # swap internals\n\t" - "# mulps %%xmmB, %%xmmA\n\t" - "# mulps %%xmmZ, %%xmmB\n\t" - "# # SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t" - "# xorps %%xmmPN, %%xmmA\n\t" - "# movups %%xmmA, %%xmmZ\n\t" - "# unpcklps %%xmmB, %%xmmA\n\t" - "# unpckhps %%xmmB, %%xmmZ\n\t" - "# movups %%xmmZ, %%xmmY\n\t" - "# shufps $0x44, %%xmmA, %%xmmZ # b01000100\n\t" - "# shufps $0xee, %%xmmY, %%xmmA # b11101110\n\t" - "# addps %%xmmZ, %%xmmA\n\t" - "# addps %%xmmA, %%xmmC\n\t" - "# A=xmm0, B=xmm2, Z=xmm4\n\t" - "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t" - " movups 16(%%r9), %%xmm1\n\t" - " movups %%xmm0, %%xmm4\n\t" - " mulps %%xmm2, %%xmm0\n\t" - " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" - " movups 16(%%r10), %%xmm3\n\t" - " movups %%xmm1, %%xmm5\n\t" - " addps %%xmm0, %%xmm6\n\t" - " mulps %%xmm3, %%xmm1\n\t" - " shufps $0xb1, %%xmm5, %%xmm5 # swap internals\n\t" - " addps %%xmm1, %%xmm6\n\t" - " mulps %%xmm4, %%xmm2\n\t" - " movups 32(%%r9), %%xmm0\n\t" - " addps %%xmm2, %%xmm7\n\t" - " mulps %%xmm5, %%xmm3\n\t" - " add $32, %%r9\n\t" - " movups 32(%%r10), %%xmm2\n\t" - " addps %%xmm3, %%xmm7\n\t" - " add $32, %%r10\n\t" - ".%=L1_test:\n\t" - " dec %%rax\n\t" - " jge .%=Loop1\n\t" - " # We've handled the bulk of multiplies up to here.\n\t" - " # Let's sse if original n_2_ccomplex_blocks was odd.\n\t" - " # If so, we've got 2 more taps to do.\n\t" - " and $1, %%r8\n\t" - " je .%=Leven\n\t" - " # The count was odd, do 2 more taps.\n\t" - " # Note that we've already got mm0/mm2 preloaded\n\t" - " # from the main loop.\n\t" - " movups %%xmm0, %%xmm4\n\t" - " mulps %%xmm2, %%xmm0\n\t" - " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" - " addps %%xmm0, %%xmm6\n\t" - " mulps %%xmm4, %%xmm2\n\t" - " addps %%xmm2, %%xmm7\n\t" - ".%=Leven:\n\t" - " # neg inversor\n\t" - " xorps %%xmm1, %%xmm1\n\t" - " mov $0x80000000, %%r9\n\t" - " movd %%r9, %%xmm1\n\t" - " shufps $0x11, %%xmm1, %%xmm1 # b00010001 # 0 -0 0 -0\n\t" - " # pfpnacc\n\t" - " xorps %%xmm1, %%xmm6\n\t" - " movups %%xmm6, %%xmm2\n\t" - " unpcklps %%xmm7, %%xmm6\n\t" - " unpckhps %%xmm7, %%xmm2\n\t" - " movups %%xmm2, %%xmm3\n\t" - " shufps $0x44, %%xmm6, %%xmm2 # b01000100\n\t" - " shufps $0xee, %%xmm3, %%xmm6 # b11101110\n\t" - " addps %%xmm2, %%xmm6\n\t" - " # xmm6 = r1 i2 r3 i4\n\t" - " movhlps %%xmm6, %%xmm4 # xmm4 = r3 i4 ?? ??\n\t" - " addps %%xmm4, %%xmm6 # xmm6 = r1+r3 i2+i4 ?? 
??\n\t" - " movlps %%xmm6, (%[rdi]) # store low 2x32 bits (complex) to memory\n\t" - : - :[rsi] "r" (input), [rdx] "r" (taps), "c" (num_bytes), [rdi] "r" (result) - :"rax", "r8", "r9", "r10" - ); - - - if(isodd) { - *result += input[num_points - 1] * taps[num_points - 1]; - } - - return; +static inline void volk_32fc_x2_dot_prod_32fc_u_sse_64(lv_32fc_t* result, + const lv_32fc_t* input, + const lv_32fc_t* taps, + unsigned int num_points) +{ + + const unsigned int num_bytes = num_points * 8; + unsigned int isodd = num_points & 1; + + __VOLK_ASM( + "# ccomplex_dotprod_generic (float* result, const float *input,\n\t" + "# const float *taps, unsigned num_bytes)\n\t" + "# float sum0 = 0;\n\t" + "# float sum1 = 0;\n\t" + "# float sum2 = 0;\n\t" + "# float sum3 = 0;\n\t" + "# do {\n\t" + "# sum0 += input[0] * taps[0] - input[1] * taps[1];\n\t" + "# sum1 += input[0] * taps[1] + input[1] * taps[0];\n\t" + "# sum2 += input[2] * taps[2] - input[3] * taps[3];\n\t" + "# sum3 += input[2] * taps[3] + input[3] * taps[2];\n\t" + "# input += 4;\n\t" + "# taps += 4; \n\t" + "# } while (--n_2_ccomplex_blocks != 0);\n\t" + "# result[0] = sum0 + sum2;\n\t" + "# result[1] = sum1 + sum3;\n\t" + "# TODO: prefetch and better scheduling\n\t" + " xor %%r9, %%r9\n\t" + " xor %%r10, %%r10\n\t" + " movq %%rcx, %%rax\n\t" + " movq %%rcx, %%r8\n\t" + " movq %[rsi], %%r9\n\t" + " movq %[rdx], %%r10\n\t" + " xorps %%xmm6, %%xmm6 # zero accumulators\n\t" + " movups 0(%%r9), %%xmm0\n\t" + " xorps %%xmm7, %%xmm7 # zero accumulators\n\t" + " movups 0(%%r10), %%xmm2\n\t" + " shr $5, %%rax # rax = n_2_ccomplex_blocks / 2\n\t" + " shr $4, %%r8\n\t" + " jmp .%=L1_test\n\t" + " # 4 taps / loop\n\t" + " # something like ?? cycles / loop\n\t" + ".%=Loop1: \n\t" + "# complex prod: C += A * B, w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t" + "# movups (%%r9), %%xmmA\n\t" + "# movups (%%r10), %%xmmB\n\t" + "# movups %%xmmA, %%xmmZ\n\t" + "# shufps $0xb1, %%xmmZ, %%xmmZ # swap internals\n\t" + "# mulps %%xmmB, %%xmmA\n\t" + "# mulps %%xmmZ, %%xmmB\n\t" + "# # SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t" + "# xorps %%xmmPN, %%xmmA\n\t" + "# movups %%xmmA, %%xmmZ\n\t" + "# unpcklps %%xmmB, %%xmmA\n\t" + "# unpckhps %%xmmB, %%xmmZ\n\t" + "# movups %%xmmZ, %%xmmY\n\t" + "# shufps $0x44, %%xmmA, %%xmmZ # b01000100\n\t" + "# shufps $0xee, %%xmmY, %%xmmA # b11101110\n\t" + "# addps %%xmmZ, %%xmmA\n\t" + "# addps %%xmmA, %%xmmC\n\t" + "# A=xmm0, B=xmm2, Z=xmm4\n\t" + "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t" + " movups 16(%%r9), %%xmm1\n\t" + " movups %%xmm0, %%xmm4\n\t" + " mulps %%xmm2, %%xmm0\n\t" + " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" + " movups 16(%%r10), %%xmm3\n\t" + " movups %%xmm1, %%xmm5\n\t" + " addps %%xmm0, %%xmm6\n\t" + " mulps %%xmm3, %%xmm1\n\t" + " shufps $0xb1, %%xmm5, %%xmm5 # swap internals\n\t" + " addps %%xmm1, %%xmm6\n\t" + " mulps %%xmm4, %%xmm2\n\t" + " movups 32(%%r9), %%xmm0\n\t" + " addps %%xmm2, %%xmm7\n\t" + " mulps %%xmm5, %%xmm3\n\t" + " add $32, %%r9\n\t" + " movups 32(%%r10), %%xmm2\n\t" + " addps %%xmm3, %%xmm7\n\t" + " add $32, %%r10\n\t" + ".%=L1_test:\n\t" + " dec %%rax\n\t" + " jge .%=Loop1\n\t" + " # We've handled the bulk of multiplies up to here.\n\t" + " # Let's sse if original n_2_ccomplex_blocks was odd.\n\t" + " # If so, we've got 2 more taps to do.\n\t" + " and $1, %%r8\n\t" + " je .%=Leven\n\t" + " # The count was odd, do 2 more taps.\n\t" + " # Note that we've already got mm0/mm2 preloaded\n\t" + " # from the main loop.\n\t" + " movups %%xmm0, %%xmm4\n\t" + " mulps %%xmm2, 
%%xmm0\n\t" + " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" + " addps %%xmm0, %%xmm6\n\t" + " mulps %%xmm4, %%xmm2\n\t" + " addps %%xmm2, %%xmm7\n\t" + ".%=Leven:\n\t" + " # neg inversor\n\t" + " xorps %%xmm1, %%xmm1\n\t" + " mov $0x80000000, %%r9\n\t" + " movd %%r9, %%xmm1\n\t" + " shufps $0x11, %%xmm1, %%xmm1 # b00010001 # 0 -0 0 -0\n\t" + " # pfpnacc\n\t" + " xorps %%xmm1, %%xmm6\n\t" + " movups %%xmm6, %%xmm2\n\t" + " unpcklps %%xmm7, %%xmm6\n\t" + " unpckhps %%xmm7, %%xmm2\n\t" + " movups %%xmm2, %%xmm3\n\t" + " shufps $0x44, %%xmm6, %%xmm2 # b01000100\n\t" + " shufps $0xee, %%xmm3, %%xmm6 # b11101110\n\t" + " addps %%xmm2, %%xmm6\n\t" + " # xmm6 = r1 i2 r3 i4\n\t" + " movhlps %%xmm6, %%xmm4 # xmm4 = r3 i4 ?? ??\n\t" + " addps %%xmm4, %%xmm6 # xmm6 = r1+r3 i2+i4 ?? ??\n\t" + " movlps %%xmm6, (%[rdi]) # store low 2x32 bits (complex) " + "to memory\n\t" + : + : [rsi] "r"(input), [rdx] "r"(taps), "c"(num_bytes), [rdi] "r"(result) + : "rax", "r8", "r9", "r10"); + + + if (isodd) { + *result += input[num_points - 1] * taps[num_points - 1]; + } + return; } #endif /* LV_HAVE_SSE && LV_HAVE_64 */ - - #ifdef LV_HAVE_SSE3 #include -static inline void volk_32fc_x2_dot_prod_32fc_u_sse3(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) { +static inline void volk_32fc_x2_dot_prod_32fc_u_sse3(lv_32fc_t* result, + const lv_32fc_t* input, + const lv_32fc_t* taps, + unsigned int num_points) +{ - lv_32fc_t dotProduct; - memset(&dotProduct, 0x0, 2*sizeof(float)); + lv_32fc_t dotProduct; + memset(&dotProduct, 0x0, 2 * sizeof(float)); - unsigned int number = 0; - const unsigned int halfPoints = num_points/2; - unsigned int isodd = num_points & 1; + unsigned int number = 0; + const unsigned int halfPoints = num_points / 2; + unsigned int isodd = num_points & 1; - __m128 x, y, yl, yh, z, tmp1, tmp2, dotProdVal; + __m128 x, y, yl, yh, z, tmp1, tmp2, dotProdVal; - const lv_32fc_t* a = input; - const lv_32fc_t* b = taps; + const lv_32fc_t* a = input; + const lv_32fc_t* b = taps; - dotProdVal = _mm_setzero_ps(); + dotProdVal = _mm_setzero_ps(); - for(;number < halfPoints; number++){ + for (; number < halfPoints; number++) { - x = _mm_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi - y = _mm_loadu_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di + x = _mm_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi + y = _mm_loadu_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di - yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr - yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di + yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di - tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + tmp1 = _mm_mul_ps(x, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br + x = _mm_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br - tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + tmp2 = _mm_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + z = _mm_addsub_ps(tmp1, + tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - dotProdVal = _mm_add_ps(dotProdVal, z); // Add the complex multiplication results together + dotProdVal = + _mm_add_ps(dotProdVal, z); // Add the complex multiplication results together - a += 2; - b += 2; - } + a += 2; + b += 2; + } - __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector[2]; 
+ __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector[2]; - _mm_storeu_ps((float*)dotProductVector,dotProdVal); // Store the results back into the dot product vector + _mm_storeu_ps((float*)dotProductVector, + dotProdVal); // Store the results back into the dot product vector - dotProduct += ( dotProductVector[0] + dotProductVector[1] ); + dotProduct += (dotProductVector[0] + dotProductVector[1]); - if(isodd) { - dotProduct += input[num_points - 1] * taps[num_points - 1]; - } + if (isodd) { + dotProduct += input[num_points - 1] * taps[num_points - 1]; + } - *result = dotProduct; + *result = dotProduct; } #endif /*LV_HAVE_SSE3*/ @@ -296,78 +306,82 @@ static inline void volk_32fc_x2_dot_prod_32fc_u_sse3(lv_32fc_t* result, const lv #include -static inline void volk_32fc_x2_dot_prod_32fc_u_sse4_1(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) { +static inline void volk_32fc_x2_dot_prod_32fc_u_sse4_1(lv_32fc_t* result, + const lv_32fc_t* input, + const lv_32fc_t* taps, + unsigned int num_points) +{ - unsigned int i = 0; - const unsigned int qtr_points = num_points/4; - const unsigned int isodd = num_points & 3; + unsigned int i = 0; + const unsigned int qtr_points = num_points / 4; + const unsigned int isodd = num_points & 3; - __m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, real0, real1, im0, im1; - float *p_input, *p_taps; - __m64 *p_result; + __m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, real0, real1, im0, im1; + float *p_input, *p_taps; + __m64* p_result; - p_result = (__m64*)result; - p_input = (float*)input; - p_taps = (float*)taps; + p_result = (__m64*)result; + p_input = (float*)input; + p_taps = (float*)taps; - static const __m128i neg = {0x000000000000000080000000}; + static const __m128i neg = { 0x000000000000000080000000 }; - real0 = _mm_setzero_ps(); - real1 = _mm_setzero_ps(); - im0 = _mm_setzero_ps(); - im1 = _mm_setzero_ps(); + real0 = _mm_setzero_ps(); + real1 = _mm_setzero_ps(); + im0 = _mm_setzero_ps(); + im1 = _mm_setzero_ps(); - for(; i < qtr_points; ++i) { - xmm0 = _mm_loadu_ps(p_input); - xmm1 = _mm_loadu_ps(p_taps); + for (; i < qtr_points; ++i) { + xmm0 = _mm_loadu_ps(p_input); + xmm1 = _mm_loadu_ps(p_taps); - p_input += 4; - p_taps += 4; + p_input += 4; + p_taps += 4; - xmm2 = _mm_loadu_ps(p_input); - xmm3 = _mm_loadu_ps(p_taps); + xmm2 = _mm_loadu_ps(p_input); + xmm3 = _mm_loadu_ps(p_taps); - p_input += 4; - p_taps += 4; + p_input += 4; + p_taps += 4; - xmm4 = _mm_unpackhi_ps(xmm0, xmm2); - xmm5 = _mm_unpackhi_ps(xmm1, xmm3); - xmm0 = _mm_unpacklo_ps(xmm0, xmm2); - xmm2 = _mm_unpacklo_ps(xmm1, xmm3); + xmm4 = _mm_unpackhi_ps(xmm0, xmm2); + xmm5 = _mm_unpackhi_ps(xmm1, xmm3); + xmm0 = _mm_unpacklo_ps(xmm0, xmm2); + xmm2 = _mm_unpacklo_ps(xmm1, xmm3); - //imaginary vector from input - xmm1 = _mm_unpackhi_ps(xmm0, xmm4); - //real vector from input - xmm3 = _mm_unpacklo_ps(xmm0, xmm4); - //imaginary vector from taps - xmm0 = _mm_unpackhi_ps(xmm2, xmm5); - //real vector from taps - xmm2 = _mm_unpacklo_ps(xmm2, xmm5); + // imaginary vector from input + xmm1 = _mm_unpackhi_ps(xmm0, xmm4); + // real vector from input + xmm3 = _mm_unpacklo_ps(xmm0, xmm4); + // imaginary vector from taps + xmm0 = _mm_unpackhi_ps(xmm2, xmm5); + // real vector from taps + xmm2 = _mm_unpacklo_ps(xmm2, xmm5); - xmm4 = _mm_dp_ps(xmm3, xmm2, 0xf1); - xmm5 = _mm_dp_ps(xmm1, xmm0, 0xf1); + xmm4 = _mm_dp_ps(xmm3, xmm2, 0xf1); + xmm5 = _mm_dp_ps(xmm1, xmm0, 0xf1); - xmm6 = _mm_dp_ps(xmm3, xmm0, 0xf2); - xmm7 = _mm_dp_ps(xmm1, xmm2, 0xf2); + xmm6 
= _mm_dp_ps(xmm3, xmm0, 0xf2); + xmm7 = _mm_dp_ps(xmm1, xmm2, 0xf2); - real0 = _mm_add_ps(xmm4, real0); - real1 = _mm_add_ps(xmm5, real1); - im0 = _mm_add_ps(xmm6, im0); - im1 = _mm_add_ps(xmm7, im1); - } + real0 = _mm_add_ps(xmm4, real0); + real1 = _mm_add_ps(xmm5, real1); + im0 = _mm_add_ps(xmm6, im0); + im1 = _mm_add_ps(xmm7, im1); + } - real1 = _mm_xor_ps(real1, bit128_p(&neg)->float_vec); + real1 = _mm_xor_ps(real1, bit128_p(&neg)->float_vec); - im0 = _mm_add_ps(im0, im1); - real0 = _mm_add_ps(real0, real1); + im0 = _mm_add_ps(im0, im1); + real0 = _mm_add_ps(real0, real1); - im0 = _mm_add_ps(im0, real0); + im0 = _mm_add_ps(im0, real0); - _mm_storel_pi(p_result, im0); + _mm_storel_pi(p_result, im0); - for(i = num_points-isodd; i < num_points; i++) { - *result += input[i] * taps[i]; - } + for (i = num_points - isodd; i < num_points; i++) { + *result += input[i] * taps[i]; + } } #endif /*LV_HAVE_SSE4_1*/ @@ -376,55 +390,63 @@ static inline void volk_32fc_x2_dot_prod_32fc_u_sse4_1(lv_32fc_t* result, const #include -static inline void volk_32fc_x2_dot_prod_32fc_u_avx(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) { +static inline void volk_32fc_x2_dot_prod_32fc_u_avx(lv_32fc_t* result, + const lv_32fc_t* input, + const lv_32fc_t* taps, + unsigned int num_points) +{ - unsigned int isodd = num_points & 3; - unsigned int i = 0; - lv_32fc_t dotProduct; - memset(&dotProduct, 0x0, 2*sizeof(float)); + unsigned int isodd = num_points & 3; + unsigned int i = 0; + lv_32fc_t dotProduct; + memset(&dotProduct, 0x0, 2 * sizeof(float)); - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; - __m256 x, y, yl, yh, z, tmp1, tmp2, dotProdVal; + __m256 x, y, yl, yh, z, tmp1, tmp2, dotProdVal; - const lv_32fc_t* a = input; - const lv_32fc_t* b = taps; + const lv_32fc_t* a = input; + const lv_32fc_t* b = taps; - dotProdVal = _mm256_setzero_ps(); + dotProdVal = _mm256_setzero_ps(); - for(;number < quarterPoints; number++){ - x = _mm256_loadu_ps((float*)a); // Load a,b,e,f as ar,ai,br,bi,er,ei,fr,fi - y = _mm256_loadu_ps((float*)b); // Load c,d,g,h as cr,ci,dr,di,gr,gi,hr,hi + for (; number < quarterPoints; number++) { + x = _mm256_loadu_ps((float*)a); // Load a,b,e,f as ar,ai,br,bi,er,ei,fr,fi + y = _mm256_loadu_ps((float*)b); // Load c,d,g,h as cr,ci,dr,di,gr,gi,hr,hi - yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr,gr,gr,hr,hr - yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di,gi,gi,hi,hi + yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr,gr,gr,hr,hr + yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di,gi,gi,hi,hi - tmp1 = _mm256_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr ... + tmp1 = _mm256_mul_ps(x, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr ... - x = _mm256_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br,ei,er,fi,fr + x = _mm256_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br,ei,er,fi,fr - tmp2 = _mm256_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di ... + tmp2 = _mm256_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di ... 
- z = _mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
+ z = _mm256_addsub_ps(tmp1,
+ tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
- dotProdVal = _mm256_add_ps(dotProdVal, z); // Add the complex multiplication results together
+ dotProdVal = _mm256_add_ps(dotProdVal,
+ z); // Add the complex multiplication results together
- a += 4;
- b += 4;
- }
+ a += 4;
+ b += 4;
+ }
- __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector[4];
+ __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector[4];
- _mm256_storeu_ps((float*)dotProductVector,dotProdVal); // Store the results back into the dot product vector
+ _mm256_storeu_ps((float*)dotProductVector,
+ dotProdVal); // Store the results back into the dot product vector
- dotProduct += ( dotProductVector[0] + dotProductVector[1] + dotProductVector[2] + dotProductVector[3]);
+ dotProduct += (dotProductVector[0] + dotProductVector[1] + dotProductVector[2] +
+ dotProductVector[3]);
- for(i = num_points-isodd; i < num_points; i++) {
- dotProduct += input[i] * taps[i];
- }
+ for (i = num_points - isodd; i < num_points; i++) {
+ dotProduct += input[i] * taps[i];
+ }
- *result = dotProduct;
+ *result = dotProduct;
 }
 #endif /*LV_HAVE_AVX*/
@@ -432,56 +454,64 @@ static inline void volk_32fc_x2_dot_prod_32fc_u_avx(lv_32fc_t* result, const lv_
 #if LV_HAVE_AVX && LV_HAVE_FMA
 #include
-static inline void volk_32fc_x2_dot_prod_32fc_u_avx_fma(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
+static inline void volk_32fc_x2_dot_prod_32fc_u_avx_fma(lv_32fc_t* result,
+ const lv_32fc_t* input,
+ const lv_32fc_t* taps,
+ unsigned int num_points)
+{
- unsigned int isodd = num_points & 3;
- unsigned int i = 0;
- lv_32fc_t dotProduct;
- memset(&dotProduct, 0x0, 2*sizeof(float));
+ unsigned int isodd = num_points & 3;
+ unsigned int i = 0;
+ lv_32fc_t dotProduct;
+ memset(&dotProduct, 0x0, 2 * sizeof(float));
- unsigned int number = 0;
- const unsigned int quarterPoints = num_points / 4;
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
- __m256 x, y, yl, yh, z, tmp1, tmp2, dotProdVal;
+ __m256 x, y, yl, yh, z, tmp1, tmp2, dotProdVal;
- const lv_32fc_t* a = input;
- const lv_32fc_t* b = taps;
+ const lv_32fc_t* a = input;
+ const lv_32fc_t* b = taps;
- dotProdVal = _mm256_setzero_ps();
+ dotProdVal = _mm256_setzero_ps();
- for(;number < quarterPoints; number++){
+ for (; number < quarterPoints; number++) {
- x = _mm256_loadu_ps((float*)a); // Load a,b,e,f as ar,ai,br,bi,er,ei,fr,fi
- y = _mm256_loadu_ps((float*)b); // Load c,d,g,h as cr,ci,dr,di,gr,gi,hr,hi
+ x = _mm256_loadu_ps((float*)a); // Load a,b,e,f as ar,ai,br,bi,er,ei,fr,fi
+ y = _mm256_loadu_ps((float*)b); // Load c,d,g,h as cr,ci,dr,di,gr,gi,hr,hi
- yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr,gr,gr,hr,hr
- yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di,gi,gi,hi,hi
+ yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr,gr,gr,hr,hr
+ yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di,gi,gi,hi,hi
- tmp1 = x;
+ tmp1 = x;
- x = _mm256_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br,ei,er,fi,fr
+ x = _mm256_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br,ei,er,fi,fr
- tmp2 = _mm256_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di ...
+ tmp2 = _mm256_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di ...
- z = _mm256_fmaddsub_ps(tmp1, yl,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + z = _mm256_fmaddsub_ps( + tmp1, yl, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - dotProdVal = _mm256_add_ps(dotProdVal, z); // Add the complex multiplication results together + dotProdVal = _mm256_add_ps(dotProdVal, + z); // Add the complex multiplication results together - a += 4; - b += 4; - } + a += 4; + b += 4; + } - __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector[4]; + __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector[4]; - _mm256_storeu_ps((float*)dotProductVector,dotProdVal); // Store the results back into the dot product vector + _mm256_storeu_ps((float*)dotProductVector, + dotProdVal); // Store the results back into the dot product vector - dotProduct += ( dotProductVector[0] + dotProductVector[1] + dotProductVector[2] + dotProductVector[3]); + dotProduct += (dotProductVector[0] + dotProductVector[1] + dotProductVector[2] + + dotProductVector[3]); - for(i = num_points-isodd; i < num_points; i++) { - dotProduct += input[i] * taps[i]; - } + for (i = num_points - isodd; i < num_points; i++) { + dotProduct += input[i] * taps[i]; + } - *result = dotProduct; + *result = dotProduct; } #endif /*LV_HAVE_AVX && LV_HAVE_FMA*/ @@ -491,44 +521,48 @@ static inline void volk_32fc_x2_dot_prod_32fc_u_avx_fma(lv_32fc_t* result, const #ifndef INCLUDED_volk_32fc_x2_dot_prod_32fc_a_H #define INCLUDED_volk_32fc_x2_dot_prod_32fc_a_H -#include -#include #include #include +#include +#include #ifdef LV_HAVE_GENERIC -static inline void volk_32fc_x2_dot_prod_32fc_a_generic(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) { +static inline void volk_32fc_x2_dot_prod_32fc_a_generic(lv_32fc_t* result, + const lv_32fc_t* input, + const lv_32fc_t* taps, + unsigned int num_points) +{ - const unsigned int num_bytes = num_points*8; + const unsigned int num_bytes = num_points * 8; - float * res = (float*) result; - float * in = (float*) input; - float * tp = (float*) taps; - unsigned int n_2_ccomplex_blocks = num_bytes >> 4; + float* res = (float*)result; + float* in = (float*)input; + float* tp = (float*)taps; + unsigned int n_2_ccomplex_blocks = num_bytes >> 4; - float sum0[2] = {0,0}; - float sum1[2] = {0,0}; - unsigned int i = 0; + float sum0[2] = { 0, 0 }; + float sum1[2] = { 0, 0 }; + unsigned int i = 0; - for(i = 0; i < n_2_ccomplex_blocks; ++i) { - sum0[0] += in[0] * tp[0] - in[1] * tp[1]; - sum0[1] += in[0] * tp[1] + in[1] * tp[0]; - sum1[0] += in[2] * tp[2] - in[3] * tp[3]; - sum1[1] += in[2] * tp[3] + in[3] * tp[2]; + for (i = 0; i < n_2_ccomplex_blocks; ++i) { + sum0[0] += in[0] * tp[0] - in[1] * tp[1]; + sum0[1] += in[0] * tp[1] + in[1] * tp[0]; + sum1[0] += in[2] * tp[2] - in[3] * tp[3]; + sum1[1] += in[2] * tp[3] + in[3] * tp[2]; - in += 4; - tp += 4; - } + in += 4; + tp += 4; + } - res[0] = sum0[0] + sum1[0]; - res[1] = sum0[1] + sum1[1]; + res[0] = sum0[0] + sum1[0]; + res[1] = sum0[1] + sum1[1]; - if (num_points & 1) { - *result += input[num_points - 1] * taps[num_points - 1]; - } + if (num_points & 1) { + *result += input[num_points - 1] * taps[num_points - 1]; + } } #endif /*LV_HAVE_GENERIC*/ @@ -537,140 +571,146 @@ static inline void volk_32fc_x2_dot_prod_32fc_a_generic(lv_32fc_t* result, const #if LV_HAVE_SSE && LV_HAVE_64 -static inline void volk_32fc_x2_dot_prod_32fc_a_sse_64(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) { - - const unsigned int num_bytes = num_points*8; - unsigned int isodd 
= num_points & 1; - - __VOLK_ASM - ( - "# ccomplex_dotprod_generic (float* result, const float *input,\n\t" - "# const float *taps, unsigned num_bytes)\n\t" - "# float sum0 = 0;\n\t" - "# float sum1 = 0;\n\t" - "# float sum2 = 0;\n\t" - "# float sum3 = 0;\n\t" - "# do {\n\t" - "# sum0 += input[0] * taps[0] - input[1] * taps[1];\n\t" - "# sum1 += input[0] * taps[1] + input[1] * taps[0];\n\t" - "# sum2 += input[2] * taps[2] - input[3] * taps[3];\n\t" - "# sum3 += input[2] * taps[3] + input[3] * taps[2];\n\t" - "# input += 4;\n\t" - "# taps += 4; \n\t" - "# } while (--n_2_ccomplex_blocks != 0);\n\t" - "# result[0] = sum0 + sum2;\n\t" - "# result[1] = sum1 + sum3;\n\t" - "# TODO: prefetch and better scheduling\n\t" - " xor %%r9, %%r9\n\t" - " xor %%r10, %%r10\n\t" - " movq %%rcx, %%rax\n\t" - " movq %%rcx, %%r8\n\t" - " movq %[rsi], %%r9\n\t" - " movq %[rdx], %%r10\n\t" - " xorps %%xmm6, %%xmm6 # zero accumulators\n\t" - " movaps 0(%%r9), %%xmm0\n\t" - " xorps %%xmm7, %%xmm7 # zero accumulators\n\t" - " movaps 0(%%r10), %%xmm2\n\t" - " shr $5, %%rax # rax = n_2_ccomplex_blocks / 2\n\t" - " shr $4, %%r8\n\t" - " jmp .%=L1_test\n\t" - " # 4 taps / loop\n\t" - " # something like ?? cycles / loop\n\t" - ".%=Loop1: \n\t" - "# complex prod: C += A * B, w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t" - "# movaps (%%r9), %%xmmA\n\t" - "# movaps (%%r10), %%xmmB\n\t" - "# movaps %%xmmA, %%xmmZ\n\t" - "# shufps $0xb1, %%xmmZ, %%xmmZ # swap internals\n\t" - "# mulps %%xmmB, %%xmmA\n\t" - "# mulps %%xmmZ, %%xmmB\n\t" - "# # SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t" - "# xorps %%xmmPN, %%xmmA\n\t" - "# movaps %%xmmA, %%xmmZ\n\t" - "# unpcklps %%xmmB, %%xmmA\n\t" - "# unpckhps %%xmmB, %%xmmZ\n\t" - "# movaps %%xmmZ, %%xmmY\n\t" - "# shufps $0x44, %%xmmA, %%xmmZ # b01000100\n\t" - "# shufps $0xee, %%xmmY, %%xmmA # b11101110\n\t" - "# addps %%xmmZ, %%xmmA\n\t" - "# addps %%xmmA, %%xmmC\n\t" - "# A=xmm0, B=xmm2, Z=xmm4\n\t" - "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t" - " movaps 16(%%r9), %%xmm1\n\t" - " movaps %%xmm0, %%xmm4\n\t" - " mulps %%xmm2, %%xmm0\n\t" - " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" - " movaps 16(%%r10), %%xmm3\n\t" - " movaps %%xmm1, %%xmm5\n\t" - " addps %%xmm0, %%xmm6\n\t" - " mulps %%xmm3, %%xmm1\n\t" - " shufps $0xb1, %%xmm5, %%xmm5 # swap internals\n\t" - " addps %%xmm1, %%xmm6\n\t" - " mulps %%xmm4, %%xmm2\n\t" - " movaps 32(%%r9), %%xmm0\n\t" - " addps %%xmm2, %%xmm7\n\t" - " mulps %%xmm5, %%xmm3\n\t" - " add $32, %%r9\n\t" - " movaps 32(%%r10), %%xmm2\n\t" - " addps %%xmm3, %%xmm7\n\t" - " add $32, %%r10\n\t" - ".%=L1_test:\n\t" - " dec %%rax\n\t" - " jge .%=Loop1\n\t" - " # We've handled the bulk of multiplies up to here.\n\t" - " # Let's sse if original n_2_ccomplex_blocks was odd.\n\t" - " # If so, we've got 2 more taps to do.\n\t" - " and $1, %%r8\n\t" - " je .%=Leven\n\t" - " # The count was odd, do 2 more taps.\n\t" - " # Note that we've already got mm0/mm2 preloaded\n\t" - " # from the main loop.\n\t" - " movaps %%xmm0, %%xmm4\n\t" - " mulps %%xmm2, %%xmm0\n\t" - " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" - " addps %%xmm0, %%xmm6\n\t" - " mulps %%xmm4, %%xmm2\n\t" - " addps %%xmm2, %%xmm7\n\t" - ".%=Leven:\n\t" - " # neg inversor\n\t" - " xorps %%xmm1, %%xmm1\n\t" - " mov $0x80000000, %%r9\n\t" - " movd %%r9, %%xmm1\n\t" - " shufps $0x11, %%xmm1, %%xmm1 # b00010001 # 0 -0 0 -0\n\t" - " # pfpnacc\n\t" - " xorps %%xmm1, %%xmm6\n\t" - " movaps %%xmm6, %%xmm2\n\t" - " unpcklps %%xmm7, %%xmm6\n\t" - " unpckhps %%xmm7, %%xmm2\n\t" - " movaps %%xmm2, 
%%xmm3\n\t" - " shufps $0x44, %%xmm6, %%xmm2 # b01000100\n\t" - " shufps $0xee, %%xmm3, %%xmm6 # b11101110\n\t" - " addps %%xmm2, %%xmm6\n\t" - " # xmm6 = r1 i2 r3 i4\n\t" - " movhlps %%xmm6, %%xmm4 # xmm4 = r3 i4 ?? ??\n\t" - " addps %%xmm4, %%xmm6 # xmm6 = r1+r3 i2+i4 ?? ??\n\t" - " movlps %%xmm6, (%[rdi]) # store low 2x32 bits (complex) to memory\n\t" - : - :[rsi] "r" (input), [rdx] "r" (taps), "c" (num_bytes), [rdi] "r" (result) - :"rax", "r8", "r9", "r10" - ); - - - if(isodd) { - *result += input[num_points - 1] * taps[num_points - 1]; - } - - return; +static inline void volk_32fc_x2_dot_prod_32fc_a_sse_64(lv_32fc_t* result, + const lv_32fc_t* input, + const lv_32fc_t* taps, + unsigned int num_points) +{ + + const unsigned int num_bytes = num_points * 8; + unsigned int isodd = num_points & 1; + + __VOLK_ASM( + "# ccomplex_dotprod_generic (float* result, const float *input,\n\t" + "# const float *taps, unsigned num_bytes)\n\t" + "# float sum0 = 0;\n\t" + "# float sum1 = 0;\n\t" + "# float sum2 = 0;\n\t" + "# float sum3 = 0;\n\t" + "# do {\n\t" + "# sum0 += input[0] * taps[0] - input[1] * taps[1];\n\t" + "# sum1 += input[0] * taps[1] + input[1] * taps[0];\n\t" + "# sum2 += input[2] * taps[2] - input[3] * taps[3];\n\t" + "# sum3 += input[2] * taps[3] + input[3] * taps[2];\n\t" + "# input += 4;\n\t" + "# taps += 4; \n\t" + "# } while (--n_2_ccomplex_blocks != 0);\n\t" + "# result[0] = sum0 + sum2;\n\t" + "# result[1] = sum1 + sum3;\n\t" + "# TODO: prefetch and better scheduling\n\t" + " xor %%r9, %%r9\n\t" + " xor %%r10, %%r10\n\t" + " movq %%rcx, %%rax\n\t" + " movq %%rcx, %%r8\n\t" + " movq %[rsi], %%r9\n\t" + " movq %[rdx], %%r10\n\t" + " xorps %%xmm6, %%xmm6 # zero accumulators\n\t" + " movaps 0(%%r9), %%xmm0\n\t" + " xorps %%xmm7, %%xmm7 # zero accumulators\n\t" + " movaps 0(%%r10), %%xmm2\n\t" + " shr $5, %%rax # rax = n_2_ccomplex_blocks / 2\n\t" + " shr $4, %%r8\n\t" + " jmp .%=L1_test\n\t" + " # 4 taps / loop\n\t" + " # something like ?? 
cycles / loop\n\t" + ".%=Loop1: \n\t" + "# complex prod: C += A * B, w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t" + "# movaps (%%r9), %%xmmA\n\t" + "# movaps (%%r10), %%xmmB\n\t" + "# movaps %%xmmA, %%xmmZ\n\t" + "# shufps $0xb1, %%xmmZ, %%xmmZ # swap internals\n\t" + "# mulps %%xmmB, %%xmmA\n\t" + "# mulps %%xmmZ, %%xmmB\n\t" + "# # SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t" + "# xorps %%xmmPN, %%xmmA\n\t" + "# movaps %%xmmA, %%xmmZ\n\t" + "# unpcklps %%xmmB, %%xmmA\n\t" + "# unpckhps %%xmmB, %%xmmZ\n\t" + "# movaps %%xmmZ, %%xmmY\n\t" + "# shufps $0x44, %%xmmA, %%xmmZ # b01000100\n\t" + "# shufps $0xee, %%xmmY, %%xmmA # b11101110\n\t" + "# addps %%xmmZ, %%xmmA\n\t" + "# addps %%xmmA, %%xmmC\n\t" + "# A=xmm0, B=xmm2, Z=xmm4\n\t" + "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t" + " movaps 16(%%r9), %%xmm1\n\t" + " movaps %%xmm0, %%xmm4\n\t" + " mulps %%xmm2, %%xmm0\n\t" + " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" + " movaps 16(%%r10), %%xmm3\n\t" + " movaps %%xmm1, %%xmm5\n\t" + " addps %%xmm0, %%xmm6\n\t" + " mulps %%xmm3, %%xmm1\n\t" + " shufps $0xb1, %%xmm5, %%xmm5 # swap internals\n\t" + " addps %%xmm1, %%xmm6\n\t" + " mulps %%xmm4, %%xmm2\n\t" + " movaps 32(%%r9), %%xmm0\n\t" + " addps %%xmm2, %%xmm7\n\t" + " mulps %%xmm5, %%xmm3\n\t" + " add $32, %%r9\n\t" + " movaps 32(%%r10), %%xmm2\n\t" + " addps %%xmm3, %%xmm7\n\t" + " add $32, %%r10\n\t" + ".%=L1_test:\n\t" + " dec %%rax\n\t" + " jge .%=Loop1\n\t" + " # We've handled the bulk of multiplies up to here.\n\t" + " # Let's sse if original n_2_ccomplex_blocks was odd.\n\t" + " # If so, we've got 2 more taps to do.\n\t" + " and $1, %%r8\n\t" + " je .%=Leven\n\t" + " # The count was odd, do 2 more taps.\n\t" + " # Note that we've already got mm0/mm2 preloaded\n\t" + " # from the main loop.\n\t" + " movaps %%xmm0, %%xmm4\n\t" + " mulps %%xmm2, %%xmm0\n\t" + " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" + " addps %%xmm0, %%xmm6\n\t" + " mulps %%xmm4, %%xmm2\n\t" + " addps %%xmm2, %%xmm7\n\t" + ".%=Leven:\n\t" + " # neg inversor\n\t" + " xorps %%xmm1, %%xmm1\n\t" + " mov $0x80000000, %%r9\n\t" + " movd %%r9, %%xmm1\n\t" + " shufps $0x11, %%xmm1, %%xmm1 # b00010001 # 0 -0 0 -0\n\t" + " # pfpnacc\n\t" + " xorps %%xmm1, %%xmm6\n\t" + " movaps %%xmm6, %%xmm2\n\t" + " unpcklps %%xmm7, %%xmm6\n\t" + " unpckhps %%xmm7, %%xmm2\n\t" + " movaps %%xmm2, %%xmm3\n\t" + " shufps $0x44, %%xmm6, %%xmm2 # b01000100\n\t" + " shufps $0xee, %%xmm3, %%xmm6 # b11101110\n\t" + " addps %%xmm2, %%xmm6\n\t" + " # xmm6 = r1 i2 r3 i4\n\t" + " movhlps %%xmm6, %%xmm4 # xmm4 = r3 i4 ?? ??\n\t" + " addps %%xmm4, %%xmm6 # xmm6 = r1+r3 i2+i4 ?? 
??\n\t" + " movlps %%xmm6, (%[rdi]) # store low 2x32 bits (complex) " + "to memory\n\t" + : + : [rsi] "r"(input), [rdx] "r"(taps), "c"(num_bytes), [rdi] "r"(result) + : "rax", "r8", "r9", "r10"); + + + if (isodd) { + *result += input[num_points - 1] * taps[num_points - 1]; + } + return; } #endif #if LV_HAVE_SSE && LV_HAVE_32 -static inline void volk_32fc_x2_dot_prod_32fc_a_sse_32(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) { +static inline void volk_32fc_x2_dot_prod_32fc_a_sse_32(lv_32fc_t* result, + const lv_32fc_t* input, + const lv_32fc_t* taps, + unsigned int num_points) +{ - volk_32fc_x2_dot_prod_32fc_a_generic(result, input, taps, num_points); + volk_32fc_x2_dot_prod_32fc_a_generic(result, input, taps, num_points); #if 0 const unsigned int num_bytes = num_points*8; @@ -792,57 +832,64 @@ static inline void volk_32fc_x2_dot_prod_32fc_a_sse_32(lv_32fc_t* result, const #include -static inline void volk_32fc_x2_dot_prod_32fc_a_sse3(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) { +static inline void volk_32fc_x2_dot_prod_32fc_a_sse3(lv_32fc_t* result, + const lv_32fc_t* input, + const lv_32fc_t* taps, + unsigned int num_points) +{ - const unsigned int num_bytes = num_points*8; - unsigned int isodd = num_points & 1; + const unsigned int num_bytes = num_points * 8; + unsigned int isodd = num_points & 1; - lv_32fc_t dotProduct; - memset(&dotProduct, 0x0, 2*sizeof(float)); + lv_32fc_t dotProduct; + memset(&dotProduct, 0x0, 2 * sizeof(float)); - unsigned int number = 0; - const unsigned int halfPoints = num_bytes >> 4; + unsigned int number = 0; + const unsigned int halfPoints = num_bytes >> 4; - __m128 x, y, yl, yh, z, tmp1, tmp2, dotProdVal; + __m128 x, y, yl, yh, z, tmp1, tmp2, dotProdVal; - const lv_32fc_t* a = input; - const lv_32fc_t* b = taps; + const lv_32fc_t* a = input; + const lv_32fc_t* b = taps; - dotProdVal = _mm_setzero_ps(); + dotProdVal = _mm_setzero_ps(); - for(;number < halfPoints; number++){ + for (; number < halfPoints; number++) { - x = _mm_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi - y = _mm_load_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di + x = _mm_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi + y = _mm_load_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di - yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr - yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di + yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di - tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + tmp1 = _mm_mul_ps(x, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br + x = _mm_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br - tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + tmp2 = _mm_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + z = _mm_addsub_ps(tmp1, + tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - dotProdVal = _mm_add_ps(dotProdVal, z); // Add the complex multiplication results together + dotProdVal = + _mm_add_ps(dotProdVal, z); // Add the complex multiplication results together - a += 2; - b += 2; - } + a += 2; + b += 2; + } - __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector[2]; + __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector[2]; - _mm_store_ps((float*)dotProductVector,dotProdVal); // 
Store the results back into the dot product vector + _mm_store_ps((float*)dotProductVector, + dotProdVal); // Store the results back into the dot product vector - dotProduct += ( dotProductVector[0] + dotProductVector[1] ); + dotProduct += (dotProductVector[0] + dotProductVector[1]); - if(isodd) { - dotProduct += input[num_points - 1] * taps[num_points - 1]; - } + if (isodd) { + dotProduct += input[num_points - 1] * taps[num_points - 1]; + } - *result = dotProduct; + *result = dotProduct; } #endif /*LV_HAVE_SSE3*/ @@ -852,78 +899,82 @@ static inline void volk_32fc_x2_dot_prod_32fc_a_sse3(lv_32fc_t* result, const lv #include -static inline void volk_32fc_x2_dot_prod_32fc_a_sse4_1(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) { +static inline void volk_32fc_x2_dot_prod_32fc_a_sse4_1(lv_32fc_t* result, + const lv_32fc_t* input, + const lv_32fc_t* taps, + unsigned int num_points) +{ - unsigned int i = 0; - const unsigned int qtr_points = num_points/4; - const unsigned int isodd = num_points & 3; + unsigned int i = 0; + const unsigned int qtr_points = num_points / 4; + const unsigned int isodd = num_points & 3; - __m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, real0, real1, im0, im1; - float *p_input, *p_taps; - __m64 *p_result; + __m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, real0, real1, im0, im1; + float *p_input, *p_taps; + __m64* p_result; - static const __m128i neg = {0x000000000000000080000000}; + static const __m128i neg = { 0x000000000000000080000000 }; - p_result = (__m64*)result; - p_input = (float*)input; - p_taps = (float*)taps; + p_result = (__m64*)result; + p_input = (float*)input; + p_taps = (float*)taps; - real0 = _mm_setzero_ps(); - real1 = _mm_setzero_ps(); - im0 = _mm_setzero_ps(); - im1 = _mm_setzero_ps(); + real0 = _mm_setzero_ps(); + real1 = _mm_setzero_ps(); + im0 = _mm_setzero_ps(); + im1 = _mm_setzero_ps(); - for(; i < qtr_points; ++i) { - xmm0 = _mm_load_ps(p_input); - xmm1 = _mm_load_ps(p_taps); + for (; i < qtr_points; ++i) { + xmm0 = _mm_load_ps(p_input); + xmm1 = _mm_load_ps(p_taps); - p_input += 4; - p_taps += 4; + p_input += 4; + p_taps += 4; - xmm2 = _mm_load_ps(p_input); - xmm3 = _mm_load_ps(p_taps); + xmm2 = _mm_load_ps(p_input); + xmm3 = _mm_load_ps(p_taps); - p_input += 4; - p_taps += 4; + p_input += 4; + p_taps += 4; - xmm4 = _mm_unpackhi_ps(xmm0, xmm2); - xmm5 = _mm_unpackhi_ps(xmm1, xmm3); - xmm0 = _mm_unpacklo_ps(xmm0, xmm2); - xmm2 = _mm_unpacklo_ps(xmm1, xmm3); + xmm4 = _mm_unpackhi_ps(xmm0, xmm2); + xmm5 = _mm_unpackhi_ps(xmm1, xmm3); + xmm0 = _mm_unpacklo_ps(xmm0, xmm2); + xmm2 = _mm_unpacklo_ps(xmm1, xmm3); - //imaginary vector from input - xmm1 = _mm_unpackhi_ps(xmm0, xmm4); - //real vector from input - xmm3 = _mm_unpacklo_ps(xmm0, xmm4); - //imaginary vector from taps - xmm0 = _mm_unpackhi_ps(xmm2, xmm5); - //real vector from taps - xmm2 = _mm_unpacklo_ps(xmm2, xmm5); + // imaginary vector from input + xmm1 = _mm_unpackhi_ps(xmm0, xmm4); + // real vector from input + xmm3 = _mm_unpacklo_ps(xmm0, xmm4); + // imaginary vector from taps + xmm0 = _mm_unpackhi_ps(xmm2, xmm5); + // real vector from taps + xmm2 = _mm_unpacklo_ps(xmm2, xmm5); - xmm4 = _mm_dp_ps(xmm3, xmm2, 0xf1); - xmm5 = _mm_dp_ps(xmm1, xmm0, 0xf1); + xmm4 = _mm_dp_ps(xmm3, xmm2, 0xf1); + xmm5 = _mm_dp_ps(xmm1, xmm0, 0xf1); - xmm6 = _mm_dp_ps(xmm3, xmm0, 0xf2); - xmm7 = _mm_dp_ps(xmm1, xmm2, 0xf2); + xmm6 = _mm_dp_ps(xmm3, xmm0, 0xf2); + xmm7 = _mm_dp_ps(xmm1, xmm2, 0xf2); - real0 = _mm_add_ps(xmm4, real0); - real1 = 
_mm_add_ps(xmm5, real1); - im0 = _mm_add_ps(xmm6, im0); - im1 = _mm_add_ps(xmm7, im1); - } + real0 = _mm_add_ps(xmm4, real0); + real1 = _mm_add_ps(xmm5, real1); + im0 = _mm_add_ps(xmm6, im0); + im1 = _mm_add_ps(xmm7, im1); + } - real1 = _mm_xor_ps(real1, bit128_p(&neg)->float_vec); + real1 = _mm_xor_ps(real1, bit128_p(&neg)->float_vec); - im0 = _mm_add_ps(im0, im1); - real0 = _mm_add_ps(real0, real1); + im0 = _mm_add_ps(im0, im1); + real0 = _mm_add_ps(real0, real1); - im0 = _mm_add_ps(im0, real0); + im0 = _mm_add_ps(im0, real0); - _mm_storel_pi(p_result, im0); + _mm_storel_pi(p_result, im0); - for(i = num_points-isodd; i < num_points; i++) { - *result += input[i] * taps[i]; - } + for (i = num_points - isodd; i < num_points; i++) { + *result += input[i] * taps[i]; + } } #endif /*LV_HAVE_SSE4_1*/ @@ -931,13 +982,17 @@ static inline void volk_32fc_x2_dot_prod_32fc_a_sse4_1(lv_32fc_t* result, const #ifdef LV_HAVE_NEON #include -static inline void volk_32fc_x2_dot_prod_32fc_neon(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) { +static inline void volk_32fc_x2_dot_prod_32fc_neon(lv_32fc_t* result, + const lv_32fc_t* input, + const lv_32fc_t* taps, + unsigned int num_points) +{ unsigned int quarter_points = num_points / 4; unsigned int number; - lv_32fc_t* a_ptr = (lv_32fc_t*) taps; - lv_32fc_t* b_ptr = (lv_32fc_t*) input; + lv_32fc_t* a_ptr = (lv_32fc_t*)taps; + lv_32fc_t* b_ptr = (lv_32fc_t*)input; // for 2-lane vectors, 1st lane holds the real part, // 2nd lane holds the imaginary part float32x4x2_t a_val, b_val, c_val, accumulator; @@ -945,11 +1000,11 @@ static inline void volk_32fc_x2_dot_prod_32fc_neon(lv_32fc_t* result, const lv_3 accumulator.val[0] = vdupq_n_f32(0); accumulator.val[1] = vdupq_n_f32(0); - for(number = 0; number < quarter_points; ++number) { + for (number = 0; number < quarter_points; ++number) { a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i - __VOLK_PREFETCH(a_ptr+8); - __VOLK_PREFETCH(b_ptr+8); + __VOLK_PREFETCH(a_ptr + 8); + __VOLK_PREFETCH(b_ptr + 8); // multiply the real*real and imag*imag to get real result // a0r*b0r|a1r*b1r|a2r*b2r|a3r*b3r @@ -977,22 +1032,25 @@ static inline void volk_32fc_x2_dot_prod_32fc_neon(lv_32fc_t* result, const lv_3 *result = accum_result[0] + accum_result[1] + accum_result[2] + accum_result[3]; // tail case - for(number = quarter_points*4; number < num_points; ++number) { + for (number = quarter_points * 4; number < num_points; ++number) { *result += (*a_ptr++) * (*b_ptr++); } - } #endif /*LV_HAVE_NEON*/ #ifdef LV_HAVE_NEON #include -static inline void volk_32fc_x2_dot_prod_32fc_neon_opttests(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) { +static inline void volk_32fc_x2_dot_prod_32fc_neon_opttests(lv_32fc_t* result, + const lv_32fc_t* input, + const lv_32fc_t* taps, + unsigned int num_points) +{ unsigned int quarter_points = num_points / 4; unsigned int number; - lv_32fc_t* a_ptr = (lv_32fc_t*) taps; - lv_32fc_t* b_ptr = (lv_32fc_t*) input; + lv_32fc_t* a_ptr = (lv_32fc_t*)taps; + lv_32fc_t* b_ptr = (lv_32fc_t*)input; // for 2-lane vectors, 1st lane holds the real part, // 2nd lane holds the imaginary part float32x4x2_t a_val, b_val, accumulator; @@ -1000,11 +1058,11 @@ static inline void volk_32fc_x2_dot_prod_32fc_neon_opttests(lv_32fc_t* result, c accumulator.val[0] = vdupq_n_f32(0); accumulator.val[1] = vdupq_n_f32(0); - for(number = 0; number < 
quarter_points; ++number) { + for (number = 0; number < quarter_points; ++number) { a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i - __VOLK_PREFETCH(a_ptr+8); - __VOLK_PREFETCH(b_ptr+8); + __VOLK_PREFETCH(a_ptr + 8); + __VOLK_PREFETCH(b_ptr + 8); // do the first multiply tmp_imag.val[1] = vmulq_f32(a_val.val[1], b_val.val[0]); @@ -1026,21 +1084,24 @@ static inline void volk_32fc_x2_dot_prod_32fc_neon_opttests(lv_32fc_t* result, c *result = accum_result[0] + accum_result[1] + accum_result[2] + accum_result[3]; // tail case - for(number = quarter_points*4; number < num_points; ++number) { + for (number = quarter_points * 4; number < num_points; ++number) { *result += (*a_ptr++) * (*b_ptr++); } - } #endif /*LV_HAVE_NEON*/ #ifdef LV_HAVE_NEON -static inline void volk_32fc_x2_dot_prod_32fc_neon_optfma(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) { +static inline void volk_32fc_x2_dot_prod_32fc_neon_optfma(lv_32fc_t* result, + const lv_32fc_t* input, + const lv_32fc_t* taps, + unsigned int num_points) +{ unsigned int quarter_points = num_points / 4; unsigned int number; - lv_32fc_t* a_ptr = (lv_32fc_t*) taps; - lv_32fc_t* b_ptr = (lv_32fc_t*) input; + lv_32fc_t* a_ptr = (lv_32fc_t*)taps; + lv_32fc_t* b_ptr = (lv_32fc_t*)input; // for 2-lane vectors, 1st lane holds the real part, // 2nd lane holds the imaginary part float32x4x2_t a_val, b_val, accumulator1, accumulator2; @@ -1049,11 +1110,11 @@ static inline void volk_32fc_x2_dot_prod_32fc_neon_optfma(lv_32fc_t* result, con accumulator2.val[0] = vdupq_n_f32(0); accumulator2.val[1] = vdupq_n_f32(0); - for(number = 0; number < quarter_points; ++number) { + for (number = 0; number < quarter_points; ++number) { a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i - __VOLK_PREFETCH(a_ptr+8); - __VOLK_PREFETCH(b_ptr+8); + __VOLK_PREFETCH(a_ptr + 8); + __VOLK_PREFETCH(b_ptr + 8); // use 2 accumulators to remove inter-instruction data dependencies accumulator1.val[0] = vmlaq_f32(accumulator1.val[0], a_val.val[0], b_val.val[0]); @@ -1071,22 +1132,26 @@ static inline void volk_32fc_x2_dot_prod_32fc_neon_optfma(lv_32fc_t* result, con *result = accum_result[0] + accum_result[1] + accum_result[2] + accum_result[3]; // tail case - for(number = quarter_points*4; number < num_points; ++number) { + for (number = quarter_points * 4; number < num_points; ++number) { *result += (*a_ptr++) * (*b_ptr++); } - } #endif /*LV_HAVE_NEON*/ #ifdef LV_HAVE_NEON -static inline void volk_32fc_x2_dot_prod_32fc_neon_optfmaunroll(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) { -// NOTE: GCC does a poor job with this kernel, but the equivalent ASM code is very fast +static inline void volk_32fc_x2_dot_prod_32fc_neon_optfmaunroll(lv_32fc_t* result, + const lv_32fc_t* input, + const lv_32fc_t* taps, + unsigned int num_points) +{ + // NOTE: GCC does a poor job with this kernel, but the equivalent ASM code is very + // fast unsigned int quarter_points = num_points / 8; unsigned int number; - lv_32fc_t* a_ptr = (lv_32fc_t*) taps; - lv_32fc_t* b_ptr = (lv_32fc_t*) input; + lv_32fc_t* a_ptr = (lv_32fc_t*)taps; + lv_32fc_t* b_ptr = (lv_32fc_t*)input; // for 2-lane vectors, 1st lane holds the real part, // 2nd lane holds the imaginary part float32x4x4_t a_val, b_val, accumulator1, accumulator2; @@ -1101,11 +1166,11 
@@ static inline void volk_32fc_x2_dot_prod_32fc_neon_optfmaunroll(lv_32fc_t* resul accumulator2.val[3] = vdupq_n_f32(0); // 8 input regs, 8 accumulators -> 16/16 neon regs are used - for(number = 0; number < quarter_points; ++number) { + for (number = 0; number < quarter_points; ++number) { a_val = vld4q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i b_val = vld4q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i - __VOLK_PREFETCH(a_ptr+8); - __VOLK_PREFETCH(b_ptr+8); + __VOLK_PREFETCH(a_ptr + 8); + __VOLK_PREFETCH(b_ptr + 8); // use 2 accumulators to remove inter-instruction data dependencies accumulator1.val[0] = vmlaq_f32(accumulator1.val[0], a_val.val[0], b_val.val[0]); @@ -1136,10 +1201,9 @@ static inline void volk_32fc_x2_dot_prod_32fc_neon_optfmaunroll(lv_32fc_t* resul *result = accum_result[0] + accum_result[1] + accum_result[2] + accum_result[3]; // tail case - for(number = quarter_points*8; number < num_points; ++number) { + for (number = quarter_points * 8; number < num_points; ++number) { *result += (*a_ptr++) * (*b_ptr++); } - } #endif /*LV_HAVE_NEON*/ @@ -1148,56 +1212,64 @@ static inline void volk_32fc_x2_dot_prod_32fc_neon_optfmaunroll(lv_32fc_t* resul #include -static inline void volk_32fc_x2_dot_prod_32fc_a_avx(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) { +static inline void volk_32fc_x2_dot_prod_32fc_a_avx(lv_32fc_t* result, + const lv_32fc_t* input, + const lv_32fc_t* taps, + unsigned int num_points) +{ - unsigned int isodd = num_points & 3; - unsigned int i = 0; - lv_32fc_t dotProduct; - memset(&dotProduct, 0x0, 2*sizeof(float)); + unsigned int isodd = num_points & 3; + unsigned int i = 0; + lv_32fc_t dotProduct; + memset(&dotProduct, 0x0, 2 * sizeof(float)); - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; - __m256 x, y, yl, yh, z, tmp1, tmp2, dotProdVal; + __m256 x, y, yl, yh, z, tmp1, tmp2, dotProdVal; - const lv_32fc_t* a = input; - const lv_32fc_t* b = taps; + const lv_32fc_t* a = input; + const lv_32fc_t* b = taps; - dotProdVal = _mm256_setzero_ps(); + dotProdVal = _mm256_setzero_ps(); - for(;number < quarterPoints; number++){ + for (; number < quarterPoints; number++) { - x = _mm256_load_ps((float*)a); // Load a,b,e,f as ar,ai,br,bi,er,ei,fr,fi - y = _mm256_load_ps((float*)b); // Load c,d,g,h as cr,ci,dr,di,gr,gi,hr,hi + x = _mm256_load_ps((float*)a); // Load a,b,e,f as ar,ai,br,bi,er,ei,fr,fi + y = _mm256_load_ps((float*)b); // Load c,d,g,h as cr,ci,dr,di,gr,gi,hr,hi - yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr,gr,gr,hr,hr - yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di,gi,gi,hi,hi + yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr,gr,gr,hr,hr + yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di,gi,gi,hi,hi - tmp1 = _mm256_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr ... + tmp1 = _mm256_mul_ps(x, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr ... - x = _mm256_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br,ei,er,fi,fr + x = _mm256_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br,ei,er,fi,fr - tmp2 = _mm256_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di ... + tmp2 = _mm256_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di ... 
- z = _mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + z = _mm256_addsub_ps(tmp1, + tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - dotProdVal = _mm256_add_ps(dotProdVal, z); // Add the complex multiplication results together + dotProdVal = _mm256_add_ps(dotProdVal, + z); // Add the complex multiplication results together - a += 4; - b += 4; - } + a += 4; + b += 4; + } - __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector[4]; + __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector[4]; - _mm256_store_ps((float*)dotProductVector,dotProdVal); // Store the results back into the dot product vector + _mm256_store_ps((float*)dotProductVector, + dotProdVal); // Store the results back into the dot product vector - dotProduct += ( dotProductVector[0] + dotProductVector[1] + dotProductVector[2] + dotProductVector[3]); + dotProduct += (dotProductVector[0] + dotProductVector[1] + dotProductVector[2] + + dotProductVector[3]); - for(i = num_points-isodd; i < num_points; i++) { - dotProduct += input[i] * taps[i]; - } + for (i = num_points - isodd; i < num_points; i++) { + dotProduct += input[i] * taps[i]; + } - *result = dotProduct; + *result = dotProduct; } #endif /*LV_HAVE_AVX*/ @@ -1205,56 +1277,64 @@ static inline void volk_32fc_x2_dot_prod_32fc_a_avx(lv_32fc_t* result, const lv_ #if LV_HAVE_AVX && LV_HAVE_FMA #include -static inline void volk_32fc_x2_dot_prod_32fc_a_avx_fma(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) { +static inline void volk_32fc_x2_dot_prod_32fc_a_avx_fma(lv_32fc_t* result, + const lv_32fc_t* input, + const lv_32fc_t* taps, + unsigned int num_points) +{ - unsigned int isodd = num_points & 3; - unsigned int i = 0; - lv_32fc_t dotProduct; - memset(&dotProduct, 0x0, 2*sizeof(float)); + unsigned int isodd = num_points & 3; + unsigned int i = 0; + lv_32fc_t dotProduct; + memset(&dotProduct, 0x0, 2 * sizeof(float)); - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; - __m256 x, y, yl, yh, z, tmp1, tmp2, dotProdVal; + __m256 x, y, yl, yh, z, tmp1, tmp2, dotProdVal; - const lv_32fc_t* a = input; - const lv_32fc_t* b = taps; + const lv_32fc_t* a = input; + const lv_32fc_t* b = taps; - dotProdVal = _mm256_setzero_ps(); + dotProdVal = _mm256_setzero_ps(); - for(;number < quarterPoints; number++){ + for (; number < quarterPoints; number++) { - x = _mm256_load_ps((float*)a); // Load a,b,e,f as ar,ai,br,bi,er,ei,fr,fi - y = _mm256_load_ps((float*)b); // Load c,d,g,h as cr,ci,dr,di,gr,gi,hr,hi + x = _mm256_load_ps((float*)a); // Load a,b,e,f as ar,ai,br,bi,er,ei,fr,fi + y = _mm256_load_ps((float*)b); // Load c,d,g,h as cr,ci,dr,di,gr,gi,hr,hi - yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr,gr,gr,hr,hr - yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di,gi,gi,hi,hi + yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr,gr,gr,hr,hr + yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di,gi,gi,hi,hi - tmp1 = x; + tmp1 = x; - x = _mm256_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br,ei,er,fi,fr + x = _mm256_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br,ei,er,fi,fr - tmp2 = _mm256_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di ... + tmp2 = _mm256_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di ... 
- z = _mm256_fmaddsub_ps(tmp1, yl,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
+ z = _mm256_fmaddsub_ps(
+ tmp1, yl, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
- dotProdVal = _mm256_add_ps(dotProdVal, z); // Add the complex multiplication results together
+ dotProdVal = _mm256_add_ps(dotProdVal,
+ z); // Add the complex multiplication results together
- a += 4;
- b += 4;
- }
+ a += 4;
+ b += 4;
+ }
- __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector[4];
+ __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector[4];
- _mm256_store_ps((float*)dotProductVector,dotProdVal); // Store the results back into the dot product vector
+ _mm256_store_ps((float*)dotProductVector,
+ dotProdVal); // Store the results back into the dot product vector
- dotProduct += ( dotProductVector[0] + dotProductVector[1] + dotProductVector[2] + dotProductVector[3]);
+ dotProduct += (dotProductVector[0] + dotProductVector[1] + dotProductVector[2] +
+ dotProductVector[3]);
- for(i = num_points-isodd; i < num_points; i++) {
- dotProduct += input[i] * taps[i];
- }
+ for (i = num_points - isodd; i < num_points; i++) {
+ dotProduct += input[i] * taps[i];
+ }
- *result = dotProduct;
+ *result = dotProduct;
 }
 #endif /*LV_HAVE_AVX && LV_HAVE_FMA*/
diff --git a/kernels/volk/volk_32fc_x2_multiply_32fc.h b/kernels/volk/volk_32fc_x2_multiply_32fc.h
index 6bf428b..6cb6907 100644
--- a/kernels/volk/volk_32fc_x2_multiply_32fc.h
+++ b/kernels/volk/volk_32fc_x2_multiply_32fc.h
@@ -29,8 +29,8 @@
 *
 * Dispatcher Prototype
 * \code
- * void volk_32fc_x2_multiply_32fc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points);
- * \endcode
+ * void volk_32fc_x2_multiply_32fc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const
+ * lv_32fc_t* bVector, unsigned int num_points); \endcode
 *
 * \b Inputs
 * \li aVector: The first input vector of complex floats.
@@ -70,55 +70,62 @@
 #ifndef INCLUDED_volk_32fc_x2_multiply_32fc_u_H
 #define INCLUDED_volk_32fc_x2_multiply_32fc_u_H
+#include
 #include
 #include
 #include
-#include
 #if LV_HAVE_AVX2 && LV_HAVE_FMA
 #include
 /*!
- \brief Multiplies the two input complex vectors and stores their results in the third vector - \param cVector The vector where the results will be stored - \param aVector One of the vectors to be multiplied - \param bVector One of the vectors to be multiplied - \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector + \brief Multiplies the two input complex vectors and stores their results in the third + vector \param cVector The vector where the results will be stored \param aVector One of + the vectors to be multiplied \param bVector One of the vectors to be multiplied \param + num_points The number of complex values in aVector and bVector to be multiplied together + and stored into cVector */ -static inline void volk_32fc_x2_multiply_32fc_u_avx2_fma(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; +static inline void volk_32fc_x2_multiply_32fc_u_avx2_fma(lv_32fc_t* cVector, + const lv_32fc_t* aVector, + const lv_32fc_t* bVector, + unsigned int num_points) +{ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; - lv_32fc_t* c = cVector; - const lv_32fc_t* a = aVector; - const lv_32fc_t* b = bVector; + lv_32fc_t* c = cVector; + const lv_32fc_t* a = aVector; + const lv_32fc_t* b = bVector; - for(;number < quarterPoints; number++){ + for (; number < quarterPoints; number++) { - const __m256 x = _mm256_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi - const __m256 y = _mm256_loadu_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di + const __m256 x = + _mm256_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi + const __m256 y = + _mm256_loadu_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di - const __m256 yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr - const __m256 yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di + const __m256 yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr + const __m256 yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di - const __m256 tmp2x = _mm256_permute_ps(x,0xB1); // Re-arrange x to be ai,ar,bi,br + const __m256 tmp2x = _mm256_permute_ps(x, 0xB1); // Re-arrange x to be ai,ar,bi,br - const __m256 tmp2 = _mm256_mul_ps(tmp2x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + const __m256 tmp2 = _mm256_mul_ps(tmp2x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - const __m256 z = _mm256_fmaddsub_ps(x, yl, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + const __m256 z = _mm256_fmaddsub_ps( + x, yl, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - _mm256_storeu_ps((float*)c,z); // Store the results back into the C container + _mm256_storeu_ps((float*)c, z); // Store the results back into the C container - a += 4; - b += 4; - c += 4; - } + a += 4; + b += 4; + c += 4; + } - _mm256_zeroupper(); + _mm256_zeroupper(); - number = quarterPoints * 4; - for(;number < num_points; number++){ - *c++ = (*a++) * (*b++); - } + number = quarterPoints * 4; + for (; number < num_points; number++) { + *c++ = (*a++) * (*b++); + } } #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA */ @@ -127,34 +134,37 @@ static inline void volk_32fc_x2_multiply_32fc_u_avx2_fma(lv_32fc_t* cVector, con #include #include -static inline void -volk_32fc_x2_multiply_32fc_u_avx(lv_32fc_t* cVector, const lv_32fc_t* aVector, - const lv_32fc_t* bVector, unsigned int num_points) +static inline void 
volk_32fc_x2_multiply_32fc_u_avx(lv_32fc_t* cVector, + const lv_32fc_t* aVector, + const lv_32fc_t* bVector, + unsigned int num_points) { - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - __m256 x, y, z; - lv_32fc_t* c = cVector; - const lv_32fc_t* a = aVector; - const lv_32fc_t* b = bVector; - - for(; number < quarterPoints; number++){ - x = _mm256_loadu_ps((float*) a); // Load the ar + ai, br + bi ... as ar,ai,br,bi ... - y = _mm256_loadu_ps((float*) b); // Load the cr + ci, dr + di ... as cr,ci,dr,di ... - z = _mm256_complexmul_ps(x, y); - _mm256_storeu_ps((float*) c, z); // Store the results back into the C container - - a += 4; - b += 4; - c += 4; - } - - number = quarterPoints * 4; - - for(; number < num_points; number++){ - *c++ = (*a++) * (*b++); - } + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + __m256 x, y, z; + lv_32fc_t* c = cVector; + const lv_32fc_t* a = aVector; + const lv_32fc_t* b = bVector; + + for (; number < quarterPoints; number++) { + x = _mm256_loadu_ps( + (float*)a); // Load the ar + ai, br + bi ... as ar,ai,br,bi ... + y = _mm256_loadu_ps( + (float*)b); // Load the cr + ci, dr + di ... as cr,ci,dr,di ... + z = _mm256_complexmul_ps(x, y); + _mm256_storeu_ps((float*)c, z); // Store the results back into the C container + + a += 4; + b += 4; + c += 4; + } + + number = quarterPoints * 4; + + for (; number < num_points; number++) { + *c++ = (*a++) * (*b++); + } } #endif /* LV_HAVE_AVX */ @@ -163,50 +173,52 @@ volk_32fc_x2_multiply_32fc_u_avx(lv_32fc_t* cVector, const lv_32fc_t* aVector, #include #include -static inline void -volk_32fc_x2_multiply_32fc_u_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, - const lv_32fc_t* bVector, unsigned int num_points) +static inline void volk_32fc_x2_multiply_32fc_u_sse3(lv_32fc_t* cVector, + const lv_32fc_t* aVector, + const lv_32fc_t* bVector, + unsigned int num_points) { - unsigned int number = 0; - const unsigned int halfPoints = num_points / 2; - - __m128 x, y, z; - lv_32fc_t* c = cVector; - const lv_32fc_t* a = aVector; - const lv_32fc_t* b = bVector; - - for(; number < halfPoints; number++){ - x = _mm_loadu_ps((float*) a); // Load the ar + ai, br + bi as ar,ai,br,bi - y = _mm_loadu_ps((float*) b); // Load the cr + ci, dr + di as cr,ci,dr,di - z = _mm_complexmul_ps(x, y); - _mm_storeu_ps((float*) c, z); // Store the results back into the C container - - a += 2; - b += 2; - c += 2; - } - - if((num_points % 2) != 0){ - *c = (*a) * (*b); - } + unsigned int number = 0; + const unsigned int halfPoints = num_points / 2; + + __m128 x, y, z; + lv_32fc_t* c = cVector; + const lv_32fc_t* a = aVector; + const lv_32fc_t* b = bVector; + + for (; number < halfPoints; number++) { + x = _mm_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi + y = _mm_loadu_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di + z = _mm_complexmul_ps(x, y); + _mm_storeu_ps((float*)c, z); // Store the results back into the C container + + a += 2; + b += 2; + c += 2; + } + + if ((num_points % 2) != 0) { + *c = (*a) * (*b); + } } #endif /* LV_HAVE_SSE */ #ifdef LV_HAVE_GENERIC -static inline void -volk_32fc_x2_multiply_32fc_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, - const lv_32fc_t* bVector, unsigned int num_points) +static inline void volk_32fc_x2_multiply_32fc_generic(lv_32fc_t* cVector, + const lv_32fc_t* aVector, + const lv_32fc_t* bVector, + unsigned int num_points) { - lv_32fc_t* cPtr = cVector; - const lv_32fc_t* aPtr = aVector; - const lv_32fc_t* bPtr= 
bVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - *cPtr++ = (*aPtr++) * (*bPtr++); - } + lv_32fc_t* cPtr = cVector; + const lv_32fc_t* aPtr = aVector; + const lv_32fc_t* bPtr = bVector; + unsigned int number = 0; + + for (number = 0; number < num_points; number++) { + *cPtr++ = (*aPtr++) * (*bPtr++); + } } #endif /* LV_HAVE_GENERIC */ @@ -215,55 +227,62 @@ volk_32fc_x2_multiply_32fc_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, #ifndef INCLUDED_volk_32fc_x2_multiply_32fc_a_H #define INCLUDED_volk_32fc_x2_multiply_32fc_a_H +#include #include #include #include -#include #if LV_HAVE_AVX2 && LV_HAVE_FMA #include /*! - \brief Multiplies the two input complex vectors and stores their results in the third vector - \param cVector The vector where the results will be stored - \param aVector One of the vectors to be multiplied - \param bVector One of the vectors to be multiplied - \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector + \brief Multiplies the two input complex vectors and stores their results in the third + vector \param cVector The vector where the results will be stored \param aVector One of + the vectors to be multiplied \param bVector One of the vectors to be multiplied \param + num_points The number of complex values in aVector and bVector to be multiplied together + and stored into cVector */ -static inline void volk_32fc_x2_multiply_32fc_a_avx2_fma(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; +static inline void volk_32fc_x2_multiply_32fc_a_avx2_fma(lv_32fc_t* cVector, + const lv_32fc_t* aVector, + const lv_32fc_t* bVector, + unsigned int num_points) +{ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; - lv_32fc_t* c = cVector; - const lv_32fc_t* a = aVector; - const lv_32fc_t* b = bVector; + lv_32fc_t* c = cVector; + const lv_32fc_t* a = aVector; + const lv_32fc_t* b = bVector; - for(;number < quarterPoints; number++){ + for (; number < quarterPoints; number++) { - const __m256 x = _mm256_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi - const __m256 y = _mm256_load_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di + const __m256 x = + _mm256_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi + const __m256 y = + _mm256_load_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di - const __m256 yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr - const __m256 yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di + const __m256 yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr + const __m256 yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di - const __m256 tmp2x = _mm256_permute_ps(x,0xB1); // Re-arrange x to be ai,ar,bi,br + const __m256 tmp2x = _mm256_permute_ps(x, 0xB1); // Re-arrange x to be ai,ar,bi,br - const __m256 tmp2 = _mm256_mul_ps(tmp2x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + const __m256 tmp2 = _mm256_mul_ps(tmp2x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - const __m256 z = _mm256_fmaddsub_ps(x, yl, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + const __m256 z = _mm256_fmaddsub_ps( + x, yl, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - _mm256_store_ps((float*)c,z); // Store the results back into the C container + _mm256_store_ps((float*)c, z); // Store the results back into the C container - a += 4; - b 
+= 4; - c += 4; - } + a += 4; + b += 4; + c += 4; + } - _mm256_zeroupper(); + _mm256_zeroupper(); - number = quarterPoints * 4; - for(;number < num_points; number++){ - *c++ = (*a++) * (*b++); - } + number = quarterPoints * 4; + for (; number < num_points; number++) { + *c++ = (*a++) * (*b++); + } } #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA */ @@ -272,34 +291,35 @@ static inline void volk_32fc_x2_multiply_32fc_a_avx2_fma(lv_32fc_t* cVector, con #include #include -static inline void -volk_32fc_x2_multiply_32fc_a_avx(lv_32fc_t* cVector, const lv_32fc_t* aVector, - const lv_32fc_t* bVector, unsigned int num_points) +static inline void volk_32fc_x2_multiply_32fc_a_avx(lv_32fc_t* cVector, + const lv_32fc_t* aVector, + const lv_32fc_t* bVector, + unsigned int num_points) { - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - __m256 x, y, z; - lv_32fc_t* c = cVector; - const lv_32fc_t* a = aVector; - const lv_32fc_t* b = bVector; - - for(; number < quarterPoints; number++){ - x = _mm256_load_ps((float*) a); // Load the ar + ai, br + bi ... as ar,ai,br,bi ... - y = _mm256_load_ps((float*) b); // Load the cr + ci, dr + di ... as cr,ci,dr,di ... - z = _mm256_complexmul_ps(x, y); - _mm256_store_ps((float*) c, z); // Store the results back into the C container - - a += 4; - b += 4; - c += 4; - } - - number = quarterPoints * 4; - - for(; number < num_points; number++){ - *c++ = (*a++) * (*b++); - } + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + __m256 x, y, z; + lv_32fc_t* c = cVector; + const lv_32fc_t* a = aVector; + const lv_32fc_t* b = bVector; + + for (; number < quarterPoints; number++) { + x = _mm256_load_ps((float*)a); // Load the ar + ai, br + bi ... as ar,ai,br,bi ... + y = _mm256_load_ps((float*)b); // Load the cr + ci, dr + di ... as cr,ci,dr,di ... 
+ z = _mm256_complexmul_ps(x, y); + _mm256_store_ps((float*)c, z); // Store the results back into the C container + + a += 4; + b += 4; + c += 4; + } + + number = quarterPoints * 4; + + for (; number < num_points; number++) { + *c++ = (*a++) * (*b++); + } } #endif /* LV_HAVE_AVX */ @@ -307,50 +327,52 @@ volk_32fc_x2_multiply_32fc_a_avx(lv_32fc_t* cVector, const lv_32fc_t* aVector, #include #include -static inline void -volk_32fc_x2_multiply_32fc_a_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, - const lv_32fc_t* bVector, unsigned int num_points) +static inline void volk_32fc_x2_multiply_32fc_a_sse3(lv_32fc_t* cVector, + const lv_32fc_t* aVector, + const lv_32fc_t* bVector, + unsigned int num_points) { - unsigned int number = 0; - const unsigned int halfPoints = num_points / 2; - - __m128 x, y, z; - lv_32fc_t* c = cVector; - const lv_32fc_t* a = aVector; - const lv_32fc_t* b = bVector; - - for(; number < halfPoints; number++){ - x = _mm_load_ps((float*) a); // Load the ar + ai, br + bi as ar,ai,br,bi - y = _mm_load_ps((float*) b); // Load the cr + ci, dr + di as cr,ci,dr,di - z = _mm_complexmul_ps(x, y); - _mm_store_ps((float*) c, z); // Store the results back into the C container - - a += 2; - b += 2; - c += 2; - } - - if((num_points % 2) != 0){ - *c = (*a) * (*b); - } + unsigned int number = 0; + const unsigned int halfPoints = num_points / 2; + + __m128 x, y, z; + lv_32fc_t* c = cVector; + const lv_32fc_t* a = aVector; + const lv_32fc_t* b = bVector; + + for (; number < halfPoints; number++) { + x = _mm_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi + y = _mm_load_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di + z = _mm_complexmul_ps(x, y); + _mm_store_ps((float*)c, z); // Store the results back into the C container + + a += 2; + b += 2; + c += 2; + } + + if ((num_points % 2) != 0) { + *c = (*a) * (*b); + } } #endif /* LV_HAVE_SSE */ #ifdef LV_HAVE_GENERIC -static inline void -volk_32fc_x2_multiply_32fc_a_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, - const lv_32fc_t* bVector, unsigned int num_points) +static inline void volk_32fc_x2_multiply_32fc_a_generic(lv_32fc_t* cVector, + const lv_32fc_t* aVector, + const lv_32fc_t* bVector, + unsigned int num_points) { - lv_32fc_t* cPtr = cVector; - const lv_32fc_t* aPtr = aVector; - const lv_32fc_t* bPtr= bVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - *cPtr++ = (*aPtr++) * (*bPtr++); - } + lv_32fc_t* cPtr = cVector; + const lv_32fc_t* aPtr = aVector; + const lv_32fc_t* bPtr = bVector; + unsigned int number = 0; + + for (number = 0; number < num_points; number++) { + *cPtr++ = (*aPtr++) * (*bPtr++); + } } #endif /* LV_HAVE_GENERIC */ @@ -358,113 +380,118 @@ volk_32fc_x2_multiply_32fc_a_generic(lv_32fc_t* cVector, const lv_32fc_t* aVecto #ifdef LV_HAVE_NEON #include -static inline void -volk_32fc_x2_multiply_32fc_neon(lv_32fc_t* cVector, const lv_32fc_t* aVector, - const lv_32fc_t* bVector, unsigned int num_points) +static inline void volk_32fc_x2_multiply_32fc_neon(lv_32fc_t* cVector, + const lv_32fc_t* aVector, + const lv_32fc_t* bVector, + unsigned int num_points) { - lv_32fc_t *a_ptr = (lv_32fc_t*) aVector; - lv_32fc_t *b_ptr = (lv_32fc_t*) bVector; - unsigned int quarter_points = num_points / 4; - float32x4x2_t a_val, b_val, c_val; - float32x4x2_t tmp_real, tmp_imag; - unsigned int number = 0; - - for(number = 0; number < quarter_points; ++number) { - a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i - b_val = vld2q_f32((float*)b_ptr); 
// b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i - __VOLK_PREFETCH(a_ptr+4); - __VOLK_PREFETCH(b_ptr+4); - - // multiply the real*real and imag*imag to get real result - // a0r*b0r|a1r*b1r|a2r*b2r|a3r*b3r - tmp_real.val[0] = vmulq_f32(a_val.val[0], b_val.val[0]); - // a0i*b0i|a1i*b1i|a2i*b2i|a3i*b3i - tmp_real.val[1] = vmulq_f32(a_val.val[1], b_val.val[1]); - - // Multiply cross terms to get the imaginary result - // a0r*b0i|a1r*b1i|a2r*b2i|a3r*b3i - tmp_imag.val[0] = vmulq_f32(a_val.val[0], b_val.val[1]); - // a0i*b0r|a1i*b1r|a2i*b2r|a3i*b3r - tmp_imag.val[1] = vmulq_f32(a_val.val[1], b_val.val[0]); - - // store the results - c_val.val[0] = vsubq_f32(tmp_real.val[0], tmp_real.val[1]); - c_val.val[1] = vaddq_f32(tmp_imag.val[0], tmp_imag.val[1]); - vst2q_f32((float*)cVector, c_val); - - a_ptr += 4; - b_ptr += 4; - cVector += 4; - } - - for(number = quarter_points*4; number < num_points; number++){ - *cVector++ = (*a_ptr++) * (*b_ptr++); - } + lv_32fc_t* a_ptr = (lv_32fc_t*)aVector; + lv_32fc_t* b_ptr = (lv_32fc_t*)bVector; + unsigned int quarter_points = num_points / 4; + float32x4x2_t a_val, b_val, c_val; + float32x4x2_t tmp_real, tmp_imag; + unsigned int number = 0; + + for (number = 0; number < quarter_points; ++number) { + a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i + b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i + __VOLK_PREFETCH(a_ptr + 4); + __VOLK_PREFETCH(b_ptr + 4); + + // multiply the real*real and imag*imag to get real result + // a0r*b0r|a1r*b1r|a2r*b2r|a3r*b3r + tmp_real.val[0] = vmulq_f32(a_val.val[0], b_val.val[0]); + // a0i*b0i|a1i*b1i|a2i*b2i|a3i*b3i + tmp_real.val[1] = vmulq_f32(a_val.val[1], b_val.val[1]); + + // Multiply cross terms to get the imaginary result + // a0r*b0i|a1r*b1i|a2r*b2i|a3r*b3i + tmp_imag.val[0] = vmulq_f32(a_val.val[0], b_val.val[1]); + // a0i*b0r|a1i*b1r|a2i*b2r|a3i*b3r + tmp_imag.val[1] = vmulq_f32(a_val.val[1], b_val.val[0]); + + // store the results + c_val.val[0] = vsubq_f32(tmp_real.val[0], tmp_real.val[1]); + c_val.val[1] = vaddq_f32(tmp_imag.val[0], tmp_imag.val[1]); + vst2q_f32((float*)cVector, c_val); + + a_ptr += 4; + b_ptr += 4; + cVector += 4; + } + + for (number = quarter_points * 4; number < num_points; number++) { + *cVector++ = (*a_ptr++) * (*b_ptr++); + } } #endif /* LV_HAVE_NEON */ #ifdef LV_HAVE_NEON -static inline void -volk_32fc_x2_multiply_32fc_neon_opttests(lv_32fc_t* cVector, const lv_32fc_t* aVector, - const lv_32fc_t* bVector, unsigned int num_points) +static inline void volk_32fc_x2_multiply_32fc_neon_opttests(lv_32fc_t* cVector, + const lv_32fc_t* aVector, + const lv_32fc_t* bVector, + unsigned int num_points) { - lv_32fc_t *a_ptr = (lv_32fc_t*) aVector; - lv_32fc_t *b_ptr = (lv_32fc_t*) bVector; - unsigned int quarter_points = num_points / 4; - float32x4x2_t a_val, b_val; - float32x4x2_t tmp_imag; - unsigned int number = 0; - - for(number = 0; number < quarter_points; ++number) { - a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i - b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i - __VOLK_PREFETCH(a_ptr+4); - __VOLK_PREFETCH(b_ptr+4); - - // do the first multiply - tmp_imag.val[1] = vmulq_f32(a_val.val[1], b_val.val[0]); - tmp_imag.val[0] = vmulq_f32(a_val.val[0], b_val.val[0]); - - // use multiply accumulate/subtract to get result - tmp_imag.val[1] = vmlaq_f32(tmp_imag.val[1], a_val.val[0], b_val.val[1]); - tmp_imag.val[0] = vmlsq_f32(tmp_imag.val[0], a_val.val[1], b_val.val[1]); - - // store - vst2q_f32((float*)cVector, tmp_imag); 
- // increment pointers - a_ptr += 4; - b_ptr += 4; - cVector += 4; - } - - for(number = quarter_points*4; number < num_points; number++){ - *cVector++ = (*a_ptr++) * (*b_ptr++); - } + lv_32fc_t* a_ptr = (lv_32fc_t*)aVector; + lv_32fc_t* b_ptr = (lv_32fc_t*)bVector; + unsigned int quarter_points = num_points / 4; + float32x4x2_t a_val, b_val; + float32x4x2_t tmp_imag; + unsigned int number = 0; + + for (number = 0; number < quarter_points; ++number) { + a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i + b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i + __VOLK_PREFETCH(a_ptr + 4); + __VOLK_PREFETCH(b_ptr + 4); + + // do the first multiply + tmp_imag.val[1] = vmulq_f32(a_val.val[1], b_val.val[0]); + tmp_imag.val[0] = vmulq_f32(a_val.val[0], b_val.val[0]); + + // use multiply accumulate/subtract to get result + tmp_imag.val[1] = vmlaq_f32(tmp_imag.val[1], a_val.val[0], b_val.val[1]); + tmp_imag.val[0] = vmlsq_f32(tmp_imag.val[0], a_val.val[1], b_val.val[1]); + + // store + vst2q_f32((float*)cVector, tmp_imag); + // increment pointers + a_ptr += 4; + b_ptr += 4; + cVector += 4; + } + + for (number = quarter_points * 4; number < num_points; number++) { + *cVector++ = (*a_ptr++) * (*b_ptr++); + } } #endif /* LV_HAVE_NEON */ #ifdef LV_HAVE_NEONV7 -extern void -volk_32fc_x2_multiply_32fc_a_neonasm(lv_32fc_t* cVector, const lv_32fc_t* aVector, - const lv_32fc_t* bVector, unsigned int num_points); +extern void volk_32fc_x2_multiply_32fc_a_neonasm(lv_32fc_t* cVector, + const lv_32fc_t* aVector, + const lv_32fc_t* bVector, + unsigned int num_points); #endif /* LV_HAVE_NEONV7 */ #ifdef LV_HAVE_ORC -extern void -volk_32fc_x2_multiply_32fc_a_orc_impl(lv_32fc_t* cVector, const lv_32fc_t* aVector, - const lv_32fc_t* bVector, unsigned int num_points); +extern void volk_32fc_x2_multiply_32fc_a_orc_impl(lv_32fc_t* cVector, + const lv_32fc_t* aVector, + const lv_32fc_t* bVector, + unsigned int num_points); -static inline void -volk_32fc_x2_multiply_32fc_u_orc(lv_32fc_t* cVector, const lv_32fc_t* aVector, - const lv_32fc_t* bVector, unsigned int num_points) +static inline void volk_32fc_x2_multiply_32fc_u_orc(lv_32fc_t* cVector, + const lv_32fc_t* aVector, + const lv_32fc_t* bVector, + unsigned int num_points) { - volk_32fc_x2_multiply_32fc_a_orc_impl(cVector, aVector, bVector, num_points); + volk_32fc_x2_multiply_32fc_a_orc_impl(cVector, aVector, bVector, num_points); } #endif /* LV_HAVE_ORC */ diff --git a/kernels/volk/volk_32fc_x2_multiply_conjugate_32fc.h b/kernels/volk/volk_32fc_x2_multiply_conjugate_32fc.h index 1b1a8b3..4f834c2 100644 --- a/kernels/volk/volk_32fc_x2_multiply_conjugate_32fc.h +++ b/kernels/volk/volk_32fc_x2_multiply_conjugate_32fc.h @@ -30,8 +30,8 @@ * * Dispatcher Prototype * \code - * void volk_32fc_x2_multiply_conjugate_32fc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points); - * \endcode + * void volk_32fc_x2_multiply_conjugate_32fc(lv_32fc_t* cVector, const lv_32fc_t* aVector, + * const lv_32fc_t* bVector, unsigned int num_points); \endcode * * \b Inputs * \li aVector: The first input vector of complex floats. 
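For orientation while reading the reformatted hunks that follow: this kernel multiplies every element of aVector by the complex conjugate of the matching element of bVector. A minimal standalone sketch of that arithmetic, using C99 float complex as a stand-in for lv_32fc_t (the stand-in type, function name, and loop shape are illustrative assumptions, not part of the patch):

#include <complex.h>

/* Same element-wise operation as the generic kernel: c[i] = a[i] * conj(b[i]). */
static void multiply_conjugate_ref(float complex* c,
                                   const float complex* a,
                                   const float complex* b,
                                   unsigned int num_points)
{
    for (unsigned int i = 0; i < num_points; i++) {
        c[i] = a[i] * conjf(b[i]);
    }
}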
@@ -71,43 +71,46 @@ #ifndef INCLUDED_volk_32fc_x2_multiply_conjugate_32fc_u_H #define INCLUDED_volk_32fc_x2_multiply_conjugate_32fc_u_H +#include #include #include #include -#include #ifdef LV_HAVE_AVX #include #include -static inline void -volk_32fc_x2_multiply_conjugate_32fc_u_avx(lv_32fc_t* cVector, const lv_32fc_t* aVector, - const lv_32fc_t* bVector, unsigned int num_points) +static inline void volk_32fc_x2_multiply_conjugate_32fc_u_avx(lv_32fc_t* cVector, + const lv_32fc_t* aVector, + const lv_32fc_t* bVector, + unsigned int num_points) { - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - __m256 x, y, z; - lv_32fc_t* c = cVector; - const lv_32fc_t* a = aVector; - const lv_32fc_t* b = bVector; - - for(; number < quarterPoints; number++){ - x = _mm256_loadu_ps((float*) a); // Load the ar + ai, br + bi ... as ar,ai,br,bi ... - y = _mm256_loadu_ps((float*) b); // Load the cr + ci, dr + di ... as cr,ci,dr,di ... - z = _mm256_complexconjugatemul_ps(x, y); - _mm256_storeu_ps((float*) c, z); // Store the results back into the C container - - a += 4; - b += 4; - c += 4; - } - - number = quarterPoints * 4; - - for(; number < num_points; number++){ - *c++ = (*a++) * lv_conj(*b++); - } + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + __m256 x, y, z; + lv_32fc_t* c = cVector; + const lv_32fc_t* a = aVector; + const lv_32fc_t* b = bVector; + + for (; number < quarterPoints; number++) { + x = _mm256_loadu_ps( + (float*)a); // Load the ar + ai, br + bi ... as ar,ai,br,bi ... + y = _mm256_loadu_ps( + (float*)b); // Load the cr + ci, dr + di ... as cr,ci,dr,di ... + z = _mm256_complexconjugatemul_ps(x, y); + _mm256_storeu_ps((float*)c, z); // Store the results back into the C container + + a += 4; + b += 4; + c += 4; + } + + number = quarterPoints * 4; + + for (; number < num_points; number++) { + *c++ = (*a++) * lv_conj(*b++); + } } #endif /* LV_HAVE_AVX */ @@ -116,96 +119,98 @@ volk_32fc_x2_multiply_conjugate_32fc_u_avx(lv_32fc_t* cVector, const lv_32fc_t* #include #include -static inline void -volk_32fc_x2_multiply_conjugate_32fc_u_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, - const lv_32fc_t* bVector, unsigned int num_points) +static inline void volk_32fc_x2_multiply_conjugate_32fc_u_sse3(lv_32fc_t* cVector, + const lv_32fc_t* aVector, + const lv_32fc_t* bVector, + unsigned int num_points) { - unsigned int number = 0; - const unsigned int halfPoints = num_points / 2; - - __m128 x, y, z; - lv_32fc_t* c = cVector; - const lv_32fc_t* a = aVector; - const lv_32fc_t* b = bVector; - - for(; number < halfPoints; number++){ - x = _mm_loadu_ps((float*) a); // Load the ar + ai, br + bi as ar,ai,br,bi - y = _mm_loadu_ps((float*) b); // Load the cr + ci, dr + di as cr,ci,dr,di - z = _mm_complexconjugatemul_ps(x, y); - _mm_storeu_ps((float*) c, z); // Store the results back into the C container - - a += 2; - b += 2; - c += 2; - } - - if((num_points % 2) != 0){ - *c = (*a) * lv_conj(*b); - } + unsigned int number = 0; + const unsigned int halfPoints = num_points / 2; + + __m128 x, y, z; + lv_32fc_t* c = cVector; + const lv_32fc_t* a = aVector; + const lv_32fc_t* b = bVector; + + for (; number < halfPoints; number++) { + x = _mm_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi + y = _mm_loadu_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di + z = _mm_complexconjugatemul_ps(x, y); + _mm_storeu_ps((float*)c, z); // Store the results back into the C container + + a += 2; + b += 2; + c += 2; + } + + if ((num_points % 
2) != 0) { + *c = (*a) * lv_conj(*b); + } } #endif /* LV_HAVE_SSE */ #ifdef LV_HAVE_GENERIC -static inline void -volk_32fc_x2_multiply_conjugate_32fc_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, - const lv_32fc_t* bVector, unsigned int num_points) +static inline void volk_32fc_x2_multiply_conjugate_32fc_generic(lv_32fc_t* cVector, + const lv_32fc_t* aVector, + const lv_32fc_t* bVector, + unsigned int num_points) { - lv_32fc_t* cPtr = cVector; - const lv_32fc_t* aPtr = aVector; - const lv_32fc_t* bPtr= bVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - *cPtr++ = (*aPtr++) * lv_conj(*bPtr++); - } + lv_32fc_t* cPtr = cVector; + const lv_32fc_t* aPtr = aVector; + const lv_32fc_t* bPtr = bVector; + unsigned int number = 0; + + for (number = 0; number < num_points; number++) { + *cPtr++ = (*aPtr++) * lv_conj(*bPtr++); + } } #endif /* LV_HAVE_GENERIC */ - #endif /* INCLUDED_volk_32fc_x2_multiply_conjugate_32fc_u_H */ #ifndef INCLUDED_volk_32fc_x2_multiply_conjugate_32fc_a_H #define INCLUDED_volk_32fc_x2_multiply_conjugate_32fc_a_H +#include #include #include #include -#include #ifdef LV_HAVE_AVX #include #include -static inline void -volk_32fc_x2_multiply_conjugate_32fc_a_avx(lv_32fc_t* cVector, const lv_32fc_t* aVector, - const lv_32fc_t* bVector, unsigned int num_points) +static inline void volk_32fc_x2_multiply_conjugate_32fc_a_avx(lv_32fc_t* cVector, + const lv_32fc_t* aVector, + const lv_32fc_t* bVector, + unsigned int num_points) { - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - __m256 x, y, z; - lv_32fc_t* c = cVector; - const lv_32fc_t* a = aVector; - const lv_32fc_t* b = bVector; - - for(; number < quarterPoints; number++){ - x = _mm256_load_ps((float*) a); // Load the ar + ai, br + bi ... as ar,ai,br,bi ... - y = _mm256_load_ps((float*) b); // Load the cr + ci, dr + di ... as cr,ci,dr,di ... - z = _mm256_complexconjugatemul_ps(x, y); - _mm256_store_ps((float*) c, z); // Store the results back into the C container - - a += 4; - b += 4; - c += 4; - } - - number = quarterPoints * 4; - - for(; number < num_points; number++){ - *c++ = (*a++) * lv_conj(*b++); - } + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + __m256 x, y, z; + lv_32fc_t* c = cVector; + const lv_32fc_t* a = aVector; + const lv_32fc_t* b = bVector; + + for (; number < quarterPoints; number++) { + x = _mm256_load_ps((float*)a); // Load the ar + ai, br + bi ... as ar,ai,br,bi ... + y = _mm256_load_ps((float*)b); // Load the cr + ci, dr + di ... as cr,ci,dr,di ... 
+ z = _mm256_complexconjugatemul_ps(x, y); + _mm256_store_ps((float*)c, z); // Store the results back into the C container + + a += 4; + b += 4; + c += 4; + } + + number = quarterPoints * 4; + + for (; number < num_points; number++) { + *c++ = (*a++) * lv_conj(*b++); + } } #endif /* LV_HAVE_AVX */ @@ -214,32 +219,33 @@ volk_32fc_x2_multiply_conjugate_32fc_a_avx(lv_32fc_t* cVector, const lv_32fc_t* #include #include -static inline void -volk_32fc_x2_multiply_conjugate_32fc_a_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, - const lv_32fc_t* bVector, unsigned int num_points) +static inline void volk_32fc_x2_multiply_conjugate_32fc_a_sse3(lv_32fc_t* cVector, + const lv_32fc_t* aVector, + const lv_32fc_t* bVector, + unsigned int num_points) { - unsigned int number = 0; - const unsigned int halfPoints = num_points / 2; - - __m128 x, y, z; - lv_32fc_t* c = cVector; - const lv_32fc_t* a = aVector; - const lv_32fc_t* b = bVector; - - for(; number < halfPoints; number++){ - x = _mm_load_ps((float*) a); // Load the ar + ai, br + bi as ar,ai,br,bi - y = _mm_load_ps((float*) b); // Load the cr + ci, dr + di as cr,ci,dr,di - z = _mm_complexconjugatemul_ps(x, y); - _mm_store_ps((float*) c, z); // Store the results back into the C container - - a += 2; - b += 2; - c += 2; - } - - if((num_points % 2) != 0){ - *c = (*a) * lv_conj(*b); - } + unsigned int number = 0; + const unsigned int halfPoints = num_points / 2; + + __m128 x, y, z; + lv_32fc_t* c = cVector; + const lv_32fc_t* a = aVector; + const lv_32fc_t* b = bVector; + + for (; number < halfPoints; number++) { + x = _mm_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi + y = _mm_load_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di + z = _mm_complexconjugatemul_ps(x, y); + _mm_store_ps((float*)c, z); // Store the results back into the C container + + a += 2; + b += 2; + c += 2; + } + + if ((num_points % 2) != 0) { + *c = (*a) * lv_conj(*b); + } } #endif /* LV_HAVE_SSE */ @@ -247,49 +253,50 @@ volk_32fc_x2_multiply_conjugate_32fc_a_sse3(lv_32fc_t* cVector, const lv_32fc_t* #ifdef LV_HAVE_NEON #include -static inline void -volk_32fc_x2_multiply_conjugate_32fc_neon(lv_32fc_t* cVector, const lv_32fc_t* aVector, - const lv_32fc_t* bVector, unsigned int num_points) +static inline void volk_32fc_x2_multiply_conjugate_32fc_neon(lv_32fc_t* cVector, + const lv_32fc_t* aVector, + const lv_32fc_t* bVector, + unsigned int num_points) { - lv_32fc_t *a_ptr = (lv_32fc_t*) aVector; - lv_32fc_t *b_ptr = (lv_32fc_t*) bVector; - unsigned int quarter_points = num_points / 4; - float32x4x2_t a_val, b_val, c_val; - float32x4x2_t tmp_real, tmp_imag; - unsigned int number = 0; - - for(number = 0; number < quarter_points; ++number) { - a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i - b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i - b_val.val[1] = vnegq_f32(b_val.val[1]); - __VOLK_PREFETCH(a_ptr+4); - __VOLK_PREFETCH(b_ptr+4); - - // multiply the real*real and imag*imag to get real result - // a0r*b0r|a1r*b1r|a2r*b2r|a3r*b3r - tmp_real.val[0] = vmulq_f32(a_val.val[0], b_val.val[0]); - // a0i*b0i|a1i*b1i|a2i*b2i|a3i*b3i - tmp_real.val[1] = vmulq_f32(a_val.val[1], b_val.val[1]); - - // Multiply cross terms to get the imaginary result + lv_32fc_t* a_ptr = (lv_32fc_t*)aVector; + lv_32fc_t* b_ptr = (lv_32fc_t*)bVector; + unsigned int quarter_points = num_points / 4; + float32x4x2_t a_val, b_val, c_val; + float32x4x2_t tmp_real, tmp_imag; + unsigned int number = 0; + + for (number = 0; number < 
quarter_points; ++number) { + a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i + b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i + b_val.val[1] = vnegq_f32(b_val.val[1]); + __VOLK_PREFETCH(a_ptr + 4); + __VOLK_PREFETCH(b_ptr + 4); + + // multiply the real*real and imag*imag to get real result + // a0r*b0r|a1r*b1r|a2r*b2r|a3r*b3r + tmp_real.val[0] = vmulq_f32(a_val.val[0], b_val.val[0]); + // a0i*b0i|a1i*b1i|a2i*b2i|a3i*b3i + tmp_real.val[1] = vmulq_f32(a_val.val[1], b_val.val[1]); + + // Multiply cross terms to get the imaginary result // a0r*b0i|a1r*b1i|a2r*b2i|a3r*b3i - tmp_imag.val[0] = vmulq_f32(a_val.val[0], b_val.val[1]); - // a0i*b0r|a1i*b1r|a2i*b2r|a3i*b3r - tmp_imag.val[1] = vmulq_f32(a_val.val[1], b_val.val[0]); - - // store the results - c_val.val[0] = vsubq_f32(tmp_real.val[0], tmp_real.val[1]); - c_val.val[1] = vaddq_f32(tmp_imag.val[0], tmp_imag.val[1]); - vst2q_f32((float*)cVector, c_val); - - a_ptr += 4; - b_ptr += 4; - cVector += 4; + tmp_imag.val[0] = vmulq_f32(a_val.val[0], b_val.val[1]); + // a0i*b0r|a1i*b1r|a2i*b2r|a3i*b3r + tmp_imag.val[1] = vmulq_f32(a_val.val[1], b_val.val[0]); + + // store the results + c_val.val[0] = vsubq_f32(tmp_real.val[0], tmp_real.val[1]); + c_val.val[1] = vaddq_f32(tmp_imag.val[0], tmp_imag.val[1]); + vst2q_f32((float*)cVector, c_val); + + a_ptr += 4; + b_ptr += 4; + cVector += 4; } - for(number = quarter_points*4; number < num_points; number++){ - *cVector++ = (*a_ptr++) * conj(*b_ptr++); - } + for (number = quarter_points * 4; number < num_points; number++) { + *cVector++ = (*a_ptr++) * conj(*b_ptr++); + } } #endif /* LV_HAVE_NEON */ @@ -297,17 +304,19 @@ volk_32fc_x2_multiply_conjugate_32fc_neon(lv_32fc_t* cVector, const lv_32fc_t* a #ifdef LV_HAVE_GENERIC static inline void -volk_32fc_x2_multiply_conjugate_32fc_a_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, - const lv_32fc_t* bVector, unsigned int num_points) +volk_32fc_x2_multiply_conjugate_32fc_a_generic(lv_32fc_t* cVector, + const lv_32fc_t* aVector, + const lv_32fc_t* bVector, + unsigned int num_points) { - lv_32fc_t* cPtr = cVector; - const lv_32fc_t* aPtr = aVector; - const lv_32fc_t* bPtr= bVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - *cPtr++ = (*aPtr++) * lv_conj(*bPtr++); - } + lv_32fc_t* cPtr = cVector; + const lv_32fc_t* aPtr = aVector; + const lv_32fc_t* bPtr = bVector; + unsigned int number = 0; + + for (number = 0; number < num_points; number++) { + *cPtr++ = (*aPtr++) * lv_conj(*bPtr++); + } } #endif /* LV_HAVE_GENERIC */ diff --git a/kernels/volk/volk_32fc_x2_s32f_square_dist_scalar_mult_32f.h b/kernels/volk/volk_32fc_x2_s32f_square_dist_scalar_mult_32f.h index 1c65f23..1d10561 100644 --- a/kernels/volk/volk_32fc_x2_s32f_square_dist_scalar_mult_32f.h +++ b/kernels/volk/volk_32fc_x2_s32f_square_dist_scalar_mult_32f.h @@ -30,8 +30,8 @@ * * Dispatcher Prototype * \code - * void volk_32fc_x2_s32f_square_dist_scalar_mult_32f(float* target, lv_32fc_t* src0, lv_32fc_t* points, float scalar, unsigned int num_points) - * \endcode + * void volk_32fc_x2_s32f_square_dist_scalar_mult_32f(float* target, lv_32fc_t* src0, + * lv_32fc_t* points, float scalar, unsigned int num_points) \endcode * * \b Inputs * \li src0: The complex input. Only the first point is used. 
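The helper calculate_scaled_distances in the hunk below evaluates scalar * |symbol - points[i]|^2 for each constellation point, which is the computation every SIMD branch of this kernel vectorizes. A small reference version of that arithmetic, again with C99 float complex standing in for lv_32fc_t (an illustrative assumption, not part of the formatted sources):

#include <complex.h>

/* Reference for the scaled squared-distance helper:
 * target[i] = scalar * |symbol - points[i]|^2 */
static void scaled_square_dist_ref(float* target,
                                   float complex symbol,
                                   const float complex* points,
                                   float scalar,
                                   unsigned int num_points)
{
    for (unsigned int i = 0; i < num_points; i++) {
        float complex d = symbol - points[i];
        target[i] = scalar * (crealf(d) * crealf(d) + cimagf(d) * cimagf(d));
    }
}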
@@ -79,103 +79,107 @@ #ifndef INCLUDED_volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_H #define INCLUDED_volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_H -#include +#include -static inline void -calculate_scaled_distances(float* target, const lv_32fc_t symbol, const lv_32fc_t* points, - const float scalar, const unsigned int num_points) +static inline void calculate_scaled_distances(float* target, + const lv_32fc_t symbol, + const lv_32fc_t* points, + const float scalar, + const unsigned int num_points) { - lv_32fc_t diff; - for(unsigned int i = 0; i < num_points; ++i) { - /* - * Calculate: |y - x|^2 * SNR_lin - * Compare C++: *target++ = scalar * std::norm(symbol - *constellation++); - */ - diff = symbol - *points++; - *target++ = scalar * (lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff)); - } + lv_32fc_t diff; + for (unsigned int i = 0; i < num_points; ++i) { + /* + * Calculate: |y - x|^2 * SNR_lin + * Compare C++: *target++ = scalar * std::norm(symbol - *constellation++); + */ + diff = symbol - *points++; + *target++ = + scalar * (lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff)); + } } #ifdef LV_HAVE_AVX2 -#include -#include +#include +#include static inline void -volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_avx2(float* target, lv_32fc_t* src0, - lv_32fc_t* points, float scalar, +volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_avx2(float* target, + lv_32fc_t* src0, + lv_32fc_t* points, + float scalar, unsigned int num_points) { - const unsigned int num_bytes = num_points*8; - __m128 xmm9, xmm10; - __m256 xmm4, xmm6; - __m256 xmm_points0, xmm_points1, xmm_result; + const unsigned int num_bytes = num_points * 8; + __m128 xmm9, xmm10; + __m256 xmm4, xmm6; + __m256 xmm_points0, xmm_points1, xmm_result; - const unsigned int bound = num_bytes >> 6; - - // load complex value into all parts of the register. - const __m256 xmm_symbol = _mm256_castpd_ps(_mm256_broadcast_sd((const double*)src0)); - const __m128 xmm128_symbol = _mm256_extractf128_ps(xmm_symbol, 1); - - // Load scalar into all 8 parts of the register - const __m256 xmm_scalar = _mm256_broadcast_ss(&scalar); - const __m128 xmm128_scalar = _mm256_extractf128_ps(xmm_scalar, 1); + const unsigned int bound = num_bytes >> 6; - // Set permutation constant - const __m256i idx = _mm256_set_epi32(7,6,3,2,5,4,1,0); - - for(unsigned int i = 0; i < bound; ++i) { - xmm_points0 = _mm256_load_ps((float*)points); - xmm_points1 = _mm256_load_ps((float*)(points + 4)); - points += 8; - __VOLK_PREFETCH(points); + // load complex value into all parts of the register. 
+ const __m256 xmm_symbol = _mm256_castpd_ps(_mm256_broadcast_sd((const double*)src0)); + const __m128 xmm128_symbol = _mm256_extractf128_ps(xmm_symbol, 1); - xmm_result = _mm256_scaled_norm_dist_ps_avx2(xmm_symbol, xmm_symbol, - xmm_points0, xmm_points1, - xmm_scalar); - - _mm256_store_ps(target, xmm_result); - target += 8; - } + // Load scalar into all 8 parts of the register + const __m256 xmm_scalar = _mm256_broadcast_ss(&scalar); + const __m128 xmm128_scalar = _mm256_extractf128_ps(xmm_scalar, 1); - if (num_bytes >> 5 & 1) { - xmm_points0 = _mm256_load_ps((float*)points); + // Set permutation constant + const __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0); - xmm4 = _mm256_sub_ps(xmm_symbol, xmm_points0); + for (unsigned int i = 0; i < bound; ++i) { + xmm_points0 = _mm256_load_ps((float*)points); + xmm_points1 = _mm256_load_ps((float*)(points + 4)); + points += 8; + __VOLK_PREFETCH(points); - points += 4; + xmm_result = _mm256_scaled_norm_dist_ps_avx2( + xmm_symbol, xmm_symbol, xmm_points0, xmm_points1, xmm_scalar); - xmm6 = _mm256_mul_ps(xmm4, xmm4); + _mm256_store_ps(target, xmm_result); + target += 8; + } - xmm4 = _mm256_hadd_ps(xmm6, xmm6); - xmm4 = _mm256_permutevar8x32_ps(xmm4, idx); + if (num_bytes >> 5 & 1) { + xmm_points0 = _mm256_load_ps((float*)points); - xmm_result = _mm256_mul_ps(xmm4, xmm_scalar); + xmm4 = _mm256_sub_ps(xmm_symbol, xmm_points0); - xmm9 = _mm256_extractf128_ps(xmm_result, 1); - _mm_store_ps(target,xmm9); - target += 4; - } + points += 4; - if (num_bytes >> 4 & 1) { - xmm9 = _mm_load_ps((float*)points); + xmm6 = _mm256_mul_ps(xmm4, xmm4); - xmm10 = _mm_sub_ps(xmm128_symbol, xmm9); + xmm4 = _mm256_hadd_ps(xmm6, xmm6); + xmm4 = _mm256_permutevar8x32_ps(xmm4, idx); - points += 2; + xmm_result = _mm256_mul_ps(xmm4, xmm_scalar); - xmm9 = _mm_mul_ps(xmm10, xmm10); + xmm9 = _mm256_extractf128_ps(xmm_result, 1); + _mm_store_ps(target, xmm9); + target += 4; + } - xmm10 = _mm_hadd_ps(xmm9, xmm9); + if (num_bytes >> 4 & 1) { + xmm9 = _mm_load_ps((float*)points); - xmm10 = _mm_mul_ps(xmm10, xmm128_scalar); + xmm10 = _mm_sub_ps(xmm128_symbol, xmm9); - _mm_storeh_pi((__m64*)target, xmm10); - target += 2; - } + points += 2; - calculate_scaled_distances(target, src0[0], points, scalar, (num_bytes >> 3) & 1); + xmm9 = _mm_mul_ps(xmm10, xmm10); + + xmm10 = _mm_hadd_ps(xmm9, xmm9); + + xmm10 = _mm_mul_ps(xmm10, xmm128_scalar); + + _mm_storeh_pi((__m64*)target, xmm10); + target += 2; + } + + calculate_scaled_distances(target, src0[0], points, scalar, (num_bytes >> 3) & 1); } #endif /*LV_HAVE_AVX2*/ @@ -186,131 +190,139 @@ volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_avx2(float* target, lv_32fc_t* s #include static inline void -volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_avx(float *target, lv_32fc_t *src0, - lv_32fc_t *points, float scalar, - unsigned int num_points) { - const int eightsPoints = num_points / 8; - const int remainder = num_points - 8 * eightsPoints; - - __m256 xmm_points0, xmm_points1, xmm_result; - - // load complex value into all parts of the register. 
- const __m256 xmm_symbol = _mm256_castpd_ps(_mm256_broadcast_sd((const double*)src0)); - - // Load scalar into all 8 parts of the register - const __m256 xmm_scalar = _mm256_broadcast_ss(&scalar); - - for(int i = 0; i < eightsPoints; ++i){ - xmm_points0 = _mm256_load_ps((float*)points); - xmm_points1 = _mm256_load_ps((float*)(points + 4)); - points += 8; - - xmm_result = _mm256_scaled_norm_dist_ps(xmm_symbol, xmm_symbol, xmm_points0, - xmm_points1, xmm_scalar); - - _mm256_store_ps(target, xmm_result); - target += 8; - } - - const lv_32fc_t symbol = *src0; - calculate_scaled_distances(target, symbol, points, scalar, remainder); +volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_avx(float* target, + lv_32fc_t* src0, + lv_32fc_t* points, + float scalar, + unsigned int num_points) +{ + const int eightsPoints = num_points / 8; + const int remainder = num_points - 8 * eightsPoints; + + __m256 xmm_points0, xmm_points1, xmm_result; + + // load complex value into all parts of the register. + const __m256 xmm_symbol = _mm256_castpd_ps(_mm256_broadcast_sd((const double*)src0)); + + // Load scalar into all 8 parts of the register + const __m256 xmm_scalar = _mm256_broadcast_ss(&scalar); + + for (int i = 0; i < eightsPoints; ++i) { + xmm_points0 = _mm256_load_ps((float*)points); + xmm_points1 = _mm256_load_ps((float*)(points + 4)); + points += 8; + + xmm_result = _mm256_scaled_norm_dist_ps( + xmm_symbol, xmm_symbol, xmm_points0, xmm_points1, xmm_scalar); + + _mm256_store_ps(target, xmm_result); + target += 8; + } + + const lv_32fc_t symbol = *src0; + calculate_scaled_distances(target, symbol, points, scalar, remainder); } #endif /* LV_HAVE_AVX */ #ifdef LV_HAVE_SSE3 -#include -#include +#include +#include static inline void -volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_sse3(float* target, lv_32fc_t* src0, - lv_32fc_t* points, float scalar, +volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_sse3(float* target, + lv_32fc_t* src0, + lv_32fc_t* points, + float scalar, unsigned int num_points) { - __m128 xmm_points0, xmm_points1, xmm_result; - - /* - * First do 4 values in every loop iteration. - * There may be up to 3 values left. - * leftovers0 indicates if at least 2 more are available for SSE execution. - * leftovers1 indicates if there is a single element left. - */ - const int quarterPoints = num_points / 4; - const int leftovers0 = (num_points / 2) - 2 * quarterPoints; - const int leftovers1 = num_points % 2; - - // load complex value into both parts of the register. 
- const __m128 xmm_symbol = _mm_castpd_ps(_mm_load1_pd((const double*)src0)); - - // Load scalar into all 4 parts of the register - const __m128 xmm_scalar = _mm_load1_ps(&scalar); - - for(int i = 0; i < quarterPoints; ++i) { - xmm_points0 = _mm_load_ps((float*)points); - xmm_points1 = _mm_load_ps((float*)(points + 2)); - points += 4; - __VOLK_PREFETCH(points); - // calculate distances - xmm_result = _mm_scaled_norm_dist_ps_sse3(xmm_symbol, xmm_symbol, xmm_points0, - xmm_points1, xmm_scalar); - - _mm_store_ps(target, xmm_result); - target += 4; - } - - for(int i = 0; i < leftovers0; ++i) { - xmm_points0 = _mm_load_ps((float*)points); - points += 2; - - xmm_points0 = _mm_sub_ps(xmm_symbol, xmm_points0); - xmm_points0 = _mm_mul_ps(xmm_points0, xmm_points0); - xmm_points0 = _mm_hadd_ps(xmm_points0, xmm_points0); - xmm_result = _mm_mul_ps(xmm_points0, xmm_scalar); - - _mm_storeh_pi((__m64*)target, xmm_result); - target += 2; - } - - calculate_scaled_distances(target, src0[0], points, scalar, leftovers1); + __m128 xmm_points0, xmm_points1, xmm_result; + + /* + * First do 4 values in every loop iteration. + * There may be up to 3 values left. + * leftovers0 indicates if at least 2 more are available for SSE execution. + * leftovers1 indicates if there is a single element left. + */ + const int quarterPoints = num_points / 4; + const int leftovers0 = (num_points / 2) - 2 * quarterPoints; + const int leftovers1 = num_points % 2; + + // load complex value into both parts of the register. + const __m128 xmm_symbol = _mm_castpd_ps(_mm_load1_pd((const double*)src0)); + + // Load scalar into all 4 parts of the register + const __m128 xmm_scalar = _mm_load1_ps(&scalar); + + for (int i = 0; i < quarterPoints; ++i) { + xmm_points0 = _mm_load_ps((float*)points); + xmm_points1 = _mm_load_ps((float*)(points + 2)); + points += 4; + __VOLK_PREFETCH(points); + // calculate distances + xmm_result = _mm_scaled_norm_dist_ps_sse3( + xmm_symbol, xmm_symbol, xmm_points0, xmm_points1, xmm_scalar); + + _mm_store_ps(target, xmm_result); + target += 4; + } + + for (int i = 0; i < leftovers0; ++i) { + xmm_points0 = _mm_load_ps((float*)points); + points += 2; + + xmm_points0 = _mm_sub_ps(xmm_symbol, xmm_points0); + xmm_points0 = _mm_mul_ps(xmm_points0, xmm_points0); + xmm_points0 = _mm_hadd_ps(xmm_points0, xmm_points0); + xmm_result = _mm_mul_ps(xmm_points0, xmm_scalar); + + _mm_storeh_pi((__m64*)target, xmm_result); + target += 2; + } + + calculate_scaled_distances(target, src0[0], points, scalar, leftovers1); } #endif /*LV_HAVE_SSE3*/ #ifdef LV_HAVE_SSE -#include #include +#include static inline void -volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_sse(float* target, lv_32fc_t* src0, - lv_32fc_t* points, float scalar, +volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_sse(float* target, + lv_32fc_t* src0, + lv_32fc_t* points, + float scalar, unsigned int num_points) { - const __m128 xmm_scalar = _mm_set1_ps(scalar); - const __m128 xmm_symbol = _mm_castpd_ps(_mm_load1_pd((const double*)src0)); - - for (unsigned i = 0; i < num_points / 4; ++i) { - __m128 xmm_points0 = _mm_load_ps((float *) points); - __m128 xmm_points1 = _mm_load_ps((float *) (points + 2)); - points += 4; - __m128 xmm_result = _mm_scaled_norm_dist_ps_sse(xmm_symbol, xmm_symbol, - xmm_points0, xmm_points1, - xmm_scalar); - _mm_store_ps((float *) target, xmm_result); - target += 4; - } - - calculate_scaled_distances(target, src0[0], points, scalar, num_points % 4); + const __m128 xmm_scalar = _mm_set1_ps(scalar); + const __m128 xmm_symbol = 
_mm_castpd_ps(_mm_load1_pd((const double*)src0)); + + for (unsigned i = 0; i < num_points / 4; ++i) { + __m128 xmm_points0 = _mm_load_ps((float*)points); + __m128 xmm_points1 = _mm_load_ps((float*)(points + 2)); + points += 4; + __m128 xmm_result = _mm_scaled_norm_dist_ps_sse( + xmm_symbol, xmm_symbol, xmm_points0, xmm_points1, xmm_scalar); + _mm_store_ps((float*)target, xmm_result); + target += 4; + } + + calculate_scaled_distances(target, src0[0], points, scalar, num_points % 4); } #endif // LV_HAVE_SSE #ifdef LV_HAVE_GENERIC static inline void -volk_32fc_x2_s32f_square_dist_scalar_mult_32f_generic(float* target, lv_32fc_t* src0, - lv_32fc_t* points, float scalar, +volk_32fc_x2_s32f_square_dist_scalar_mult_32f_generic(float* target, + lv_32fc_t* src0, + lv_32fc_t* points, + float scalar, unsigned int num_points) { - const lv_32fc_t symbol = *src0; - calculate_scaled_distances(target, symbol, points, scalar, num_points); + const lv_32fc_t symbol = *src0; + calculate_scaled_distances(target, symbol, points, scalar, num_points); } #endif /*LV_HAVE_GENERIC*/ @@ -321,87 +333,88 @@ volk_32fc_x2_s32f_square_dist_scalar_mult_32f_generic(float* target, lv_32fc_t* #ifndef INCLUDED_volk_32fc_x2_s32f_square_dist_scalar_mult_32f_u_H #define INCLUDED_volk_32fc_x2_s32f_square_dist_scalar_mult_32f_u_H -#include +#include #ifdef LV_HAVE_AVX2 -#include +#include #include static inline void -volk_32fc_x2_s32f_square_dist_scalar_mult_32f_u_avx2(float* target, lv_32fc_t* src0, - lv_32fc_t* points, float scalar, +volk_32fc_x2_s32f_square_dist_scalar_mult_32f_u_avx2(float* target, + lv_32fc_t* src0, + lv_32fc_t* points, + float scalar, unsigned int num_points) { - const unsigned int num_bytes = num_points*8; - __m128 xmm9, xmm10; - __m256 xmm4, xmm6; - __m256 xmm_points0, xmm_points1, xmm_result; + const unsigned int num_bytes = num_points * 8; + __m128 xmm9, xmm10; + __m256 xmm4, xmm6; + __m256 xmm_points0, xmm_points1, xmm_result; + + const unsigned int bound = num_bytes >> 6; + + // load complex value into all parts of the register. + const __m256 xmm_symbol = _mm256_castpd_ps(_mm256_broadcast_sd((const double*)src0)); + const __m128 xmm128_symbol = _mm256_extractf128_ps(xmm_symbol, 1); + + // Load scalar into all 8 parts of the register + const __m256 xmm_scalar = _mm256_broadcast_ss(&scalar); + const __m128 xmm128_scalar = _mm256_extractf128_ps(xmm_scalar, 1); - const unsigned int bound = num_bytes >> 6; - - // load complex value into all parts of the register. 
- const __m256 xmm_symbol = _mm256_castpd_ps(_mm256_broadcast_sd((const double*)src0)); - const __m128 xmm128_symbol = _mm256_extractf128_ps(xmm_symbol, 1); - - // Load scalar into all 8 parts of the register - const __m256 xmm_scalar = _mm256_broadcast_ss(&scalar); - const __m128 xmm128_scalar = _mm256_extractf128_ps(xmm_scalar, 1); + // Set permutation constant + const __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0); - // Set permutation constant - const __m256i idx = _mm256_set_epi32(7,6,3,2,5,4,1,0); - - for(unsigned int i = 0; i < bound; ++i) { - xmm_points0 = _mm256_loadu_ps((float*)points); - xmm_points1 = _mm256_loadu_ps((float*)(points + 4)); - points += 8; - __VOLK_PREFETCH(points); + for (unsigned int i = 0; i < bound; ++i) { + xmm_points0 = _mm256_loadu_ps((float*)points); + xmm_points1 = _mm256_loadu_ps((float*)(points + 4)); + points += 8; + __VOLK_PREFETCH(points); - xmm_result = _mm256_scaled_norm_dist_ps_avx2(xmm_symbol, xmm_symbol, - xmm_points0, xmm_points1, - xmm_scalar); - - _mm256_storeu_ps(target, xmm_result); - target += 8; - } + xmm_result = _mm256_scaled_norm_dist_ps_avx2( + xmm_symbol, xmm_symbol, xmm_points0, xmm_points1, xmm_scalar); - if (num_bytes >> 5 & 1) { - xmm_points0 = _mm256_loadu_ps((float*)points); + _mm256_storeu_ps(target, xmm_result); + target += 8; + } - xmm4 = _mm256_sub_ps(xmm_symbol, xmm_points0); + if (num_bytes >> 5 & 1) { + xmm_points0 = _mm256_loadu_ps((float*)points); - points += 4; + xmm4 = _mm256_sub_ps(xmm_symbol, xmm_points0); - xmm6 = _mm256_mul_ps(xmm4, xmm4); + points += 4; - xmm4 = _mm256_hadd_ps(xmm6, xmm6); - xmm4 = _mm256_permutevar8x32_ps(xmm4, idx); + xmm6 = _mm256_mul_ps(xmm4, xmm4); - xmm_result = _mm256_mul_ps(xmm4, xmm_scalar); + xmm4 = _mm256_hadd_ps(xmm6, xmm6); + xmm4 = _mm256_permutevar8x32_ps(xmm4, idx); - xmm9 = _mm256_extractf128_ps(xmm_result, 1); - _mm_storeu_ps(target,xmm9); - target += 4; - } + xmm_result = _mm256_mul_ps(xmm4, xmm_scalar); - if (num_bytes >> 4 & 1) { - xmm9 = _mm_loadu_ps((float*)points); + xmm9 = _mm256_extractf128_ps(xmm_result, 1); + _mm_storeu_ps(target, xmm9); + target += 4; + } - xmm10 = _mm_sub_ps(xmm128_symbol, xmm9); + if (num_bytes >> 4 & 1) { + xmm9 = _mm_loadu_ps((float*)points); - points += 2; + xmm10 = _mm_sub_ps(xmm128_symbol, xmm9); - xmm9 = _mm_mul_ps(xmm10, xmm10); + points += 2; - xmm10 = _mm_hadd_ps(xmm9, xmm9); + xmm9 = _mm_mul_ps(xmm10, xmm10); - xmm10 = _mm_mul_ps(xmm10, xmm128_scalar); + xmm10 = _mm_hadd_ps(xmm9, xmm9); - _mm_storeh_pi((__m64*)target, xmm10); - target += 2; - } + xmm10 = _mm_mul_ps(xmm10, xmm128_scalar); - calculate_scaled_distances(target, src0[0], points, scalar, (num_bytes >> 3) & 1); + _mm_storeh_pi((__m64*)target, xmm10); + target += 2; + } + + calculate_scaled_distances(target, src0[0], points, scalar, (num_bytes >> 3) & 1); } #endif /*LV_HAVE_AVX2*/ @@ -412,120 +425,126 @@ volk_32fc_x2_s32f_square_dist_scalar_mult_32f_u_avx2(float* target, lv_32fc_t* s #include static inline void -volk_32fc_x2_s32f_square_dist_scalar_mult_32f_u_avx(float *target, lv_32fc_t *src0, - lv_32fc_t *points, float scalar, - unsigned int num_points) { - const int eightsPoints = num_points / 8; - const int remainder = num_points - 8 * eightsPoints; - - __m256 xmm_points0, xmm_points1, xmm_result; - - // load complex value into all parts of the register. 
- const __m256 xmm_symbol = _mm256_castpd_ps(_mm256_broadcast_sd((const double*)src0)); - - // Load scalar into all 8 parts of the register - const __m256 xmm_scalar = _mm256_broadcast_ss(&scalar); - - for(int i = 0; i < eightsPoints; ++i){ - xmm_points0 = _mm256_loadu_ps((float*)points); - xmm_points1 = _mm256_loadu_ps((float*)(points + 4)); - points += 8; - - xmm_result = _mm256_scaled_norm_dist_ps(xmm_symbol, xmm_symbol, xmm_points0, - xmm_points1, xmm_scalar); - - _mm256_storeu_ps(target, xmm_result); - target += 8; - } - - const lv_32fc_t symbol = *src0; - calculate_scaled_distances(target, symbol, points, scalar, remainder); +volk_32fc_x2_s32f_square_dist_scalar_mult_32f_u_avx(float* target, + lv_32fc_t* src0, + lv_32fc_t* points, + float scalar, + unsigned int num_points) +{ + const int eightsPoints = num_points / 8; + const int remainder = num_points - 8 * eightsPoints; + + __m256 xmm_points0, xmm_points1, xmm_result; + + // load complex value into all parts of the register. + const __m256 xmm_symbol = _mm256_castpd_ps(_mm256_broadcast_sd((const double*)src0)); + + // Load scalar into all 8 parts of the register + const __m256 xmm_scalar = _mm256_broadcast_ss(&scalar); + + for (int i = 0; i < eightsPoints; ++i) { + xmm_points0 = _mm256_loadu_ps((float*)points); + xmm_points1 = _mm256_loadu_ps((float*)(points + 4)); + points += 8; + + xmm_result = _mm256_scaled_norm_dist_ps( + xmm_symbol, xmm_symbol, xmm_points0, xmm_points1, xmm_scalar); + + _mm256_storeu_ps(target, xmm_result); + target += 8; + } + + const lv_32fc_t symbol = *src0; + calculate_scaled_distances(target, symbol, points, scalar, remainder); } #endif /* LV_HAVE_AVX */ #ifdef LV_HAVE_SSE3 -#include -#include +#include +#include static inline void -volk_32fc_x2_s32f_square_dist_scalar_mult_32f_u_sse3(float* target, lv_32fc_t* src0, - lv_32fc_t* points, float scalar, +volk_32fc_x2_s32f_square_dist_scalar_mult_32f_u_sse3(float* target, + lv_32fc_t* src0, + lv_32fc_t* points, + float scalar, unsigned int num_points) { - __m128 xmm_points0, xmm_points1, xmm_result; - - /* - * First do 4 values in every loop iteration. - * There may be up to 3 values left. - * leftovers0 indicates if at least 2 more are available for SSE execution. - * leftovers1 indicates if there is a single element left. - */ - const int quarterPoints = num_points / 4; - const int leftovers0 = (num_points / 2) - 2 * quarterPoints; - const int leftovers1 = num_points % 2; - - // load complex value into both parts of the register. 
- const __m128 xmm_symbol = _mm_castpd_ps(_mm_load1_pd((const double*)src0)); - - // Load scalar into all 4 parts of the register - const __m128 xmm_scalar = _mm_load1_ps(&scalar); - - for(int i = 0; i < quarterPoints; ++i) { - xmm_points0 = _mm_loadu_ps((float*)points); - xmm_points1 = _mm_loadu_ps((float*)(points + 2)); - points += 4; - __VOLK_PREFETCH(points); - // calculate distances - xmm_result = _mm_scaled_norm_dist_ps_sse3(xmm_symbol, xmm_symbol, xmm_points0, - xmm_points1, xmm_scalar); - - _mm_storeu_ps(target, xmm_result); - target += 4; - } - - for(int i = 0; i < leftovers0; ++i) { - xmm_points0 = _mm_loadu_ps((float*)points); - points += 2; - - xmm_points0 = _mm_sub_ps(xmm_symbol, xmm_points0); - xmm_points0 = _mm_mul_ps(xmm_points0, xmm_points0); - xmm_points0 = _mm_hadd_ps(xmm_points0, xmm_points0); - xmm_result = _mm_mul_ps(xmm_points0, xmm_scalar); - - _mm_storeh_pi((__m64*)target, xmm_result); - target += 2; - } - - calculate_scaled_distances(target, src0[0], points, scalar, leftovers1); + __m128 xmm_points0, xmm_points1, xmm_result; + + /* + * First do 4 values in every loop iteration. + * There may be up to 3 values left. + * leftovers0 indicates if at least 2 more are available for SSE execution. + * leftovers1 indicates if there is a single element left. + */ + const int quarterPoints = num_points / 4; + const int leftovers0 = (num_points / 2) - 2 * quarterPoints; + const int leftovers1 = num_points % 2; + + // load complex value into both parts of the register. + const __m128 xmm_symbol = _mm_castpd_ps(_mm_load1_pd((const double*)src0)); + + // Load scalar into all 4 parts of the register + const __m128 xmm_scalar = _mm_load1_ps(&scalar); + + for (int i = 0; i < quarterPoints; ++i) { + xmm_points0 = _mm_loadu_ps((float*)points); + xmm_points1 = _mm_loadu_ps((float*)(points + 2)); + points += 4; + __VOLK_PREFETCH(points); + // calculate distances + xmm_result = _mm_scaled_norm_dist_ps_sse3( + xmm_symbol, xmm_symbol, xmm_points0, xmm_points1, xmm_scalar); + + _mm_storeu_ps(target, xmm_result); + target += 4; + } + + for (int i = 0; i < leftovers0; ++i) { + xmm_points0 = _mm_loadu_ps((float*)points); + points += 2; + + xmm_points0 = _mm_sub_ps(xmm_symbol, xmm_points0); + xmm_points0 = _mm_mul_ps(xmm_points0, xmm_points0); + xmm_points0 = _mm_hadd_ps(xmm_points0, xmm_points0); + xmm_result = _mm_mul_ps(xmm_points0, xmm_scalar); + + _mm_storeh_pi((__m64*)target, xmm_result); + target += 2; + } + + calculate_scaled_distances(target, src0[0], points, scalar, leftovers1); } #endif /*LV_HAVE_SSE3*/ #ifdef LV_HAVE_SSE -#include #include +#include static inline void -volk_32fc_x2_s32f_square_dist_scalar_mult_32f_u_sse(float* target, lv_32fc_t* src0, - lv_32fc_t* points, float scalar, +volk_32fc_x2_s32f_square_dist_scalar_mult_32f_u_sse(float* target, + lv_32fc_t* src0, + lv_32fc_t* points, + float scalar, unsigned int num_points) { - const __m128 xmm_scalar = _mm_set1_ps(scalar); - const __m128 xmm_symbol = _mm_castpd_ps(_mm_load1_pd((const double*)src0)); - - for (unsigned i = 0; i < num_points / 4; ++i) { - __m128 xmm_points0 = _mm_loadu_ps((float *) points); - __m128 xmm_points1 = _mm_loadu_ps((float *) (points + 2)); - points += 4; - __m128 xmm_result = _mm_scaled_norm_dist_ps_sse(xmm_symbol, xmm_symbol, - xmm_points0, xmm_points1, - xmm_scalar); - _mm_storeu_ps((float *) target, xmm_result); - target += 4; - } - - calculate_scaled_distances(target, src0[0], points, scalar, num_points % 4); + const __m128 xmm_scalar = _mm_set1_ps(scalar); + const __m128 xmm_symbol = 
_mm_castpd_ps(_mm_load1_pd((const double*)src0)); + + for (unsigned i = 0; i < num_points / 4; ++i) { + __m128 xmm_points0 = _mm_loadu_ps((float*)points); + __m128 xmm_points1 = _mm_loadu_ps((float*)(points + 2)); + points += 4; + __m128 xmm_result = _mm_scaled_norm_dist_ps_sse( + xmm_symbol, xmm_symbol, xmm_points0, xmm_points1, xmm_scalar); + _mm_storeu_ps((float*)target, xmm_result); + target += 4; + } + + calculate_scaled_distances(target, src0[0], points, scalar, num_points % 4); } #endif // LV_HAVE_SSE diff --git a/kernels/volk/volk_32fc_x2_s32fc_multiply_conjugate_add_32fc.h b/kernels/volk/volk_32fc_x2_s32fc_multiply_conjugate_add_32fc.h index 6c7f4d3..1fb9b68 100644 --- a/kernels/volk/volk_32fc_x2_s32fc_multiply_conjugate_add_32fc.h +++ b/kernels/volk/volk_32fc_x2_s32fc_multiply_conjugate_add_32fc.h @@ -32,14 +32,16 @@ * * Dispatcher Prototype * \code - * void volk_32fc_x2_s32fc_multiply_conjugate_add_32fc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, const lv_32fc_t scalar, unsigned int num_points); - * \endcode + * void volk_32fc_x2_s32fc_multiply_conjugate_add_32fc(lv_32fc_t* cVector, const + * lv_32fc_t* aVector, const lv_32fc_t* bVector, const lv_32fc_t scalar, unsigned int + * num_points); \endcode * * \b Inputs * \li aVector: The input vector to be added. * \li bVector: The input vector to be conjugate and multiplied. * \li scalar: The complex scalar to multiply against conjugated bVector. - * \li num_points: The number of complex values in aVector and bVector to be conjugate, multiplied and stored into cVector. + * \li num_points: The number of complex values in aVector and bVector to be conjugate, + * multiplied and stored into cVector. * * \b Outputs * \li cVector: The vector where the results will be stored. 
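The generic path of this kernel, visible further down in the diff, computes c[i] = a[i] + conj(b[i]) * scalar. A minimal scalar reference of the same expression (C99 float complex used in place of lv_32fc_t; the names and types here are illustrative assumptions only):

#include <complex.h>

/* Reference loop: c[i] = a[i] + conj(b[i]) * scalar. */
static void multiply_conjugate_add_ref(float complex* c,
                                       const float complex* a,
                                       const float complex* b,
                                       float complex scalar,
                                       unsigned int num_points)
{
    for (unsigned int i = 0; i < num_points; i++) {
        c[i] = a[i] + conjf(b[i]) * scalar;
    }
}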
@@ -84,15 +86,21 @@ #ifndef INCLUDED_volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_H #define INCLUDED_volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_H +#include #include #include #include -#include #ifdef LV_HAVE_GENERIC -static inline void volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, const lv_32fc_t scalar, unsigned int num_points){ +static inline void +volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_generic(lv_32fc_t* cVector, + const lv_32fc_t* aVector, + const lv_32fc_t* bVector, + const lv_32fc_t scalar, + unsigned int num_points) +{ const lv_32fc_t* aPtr = aVector; const lv_32fc_t* bPtr = bVector; lv_32fc_t* cPtr = cVector; @@ -123,14 +131,20 @@ static inline void volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_generic(lv_32f #include #include -static inline void volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_u_avx(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, const lv_32fc_t scalar, unsigned int num_points) { +static inline void +volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_u_avx(lv_32fc_t* cVector, + const lv_32fc_t* aVector, + const lv_32fc_t* bVector, + const lv_32fc_t scalar, + unsigned int num_points) +{ unsigned int number = 0; unsigned int i = 0; const unsigned int quarterPoints = num_points / 4; unsigned int isodd = num_points & 3; __m256 x, y, s, z; - lv_32fc_t v_scalar[4] = {scalar, scalar, scalar, scalar}; + lv_32fc_t v_scalar[4] = { scalar, scalar, scalar, scalar }; const lv_32fc_t* a = aVector; const lv_32fc_t* b = bVector; @@ -139,19 +153,19 @@ static inline void volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_u_avx(lv_32fc_ // Set up constant scalar vector s = _mm256_loadu_ps((float*)v_scalar); - for(;number < quarterPoints; number++) { + for (; number < quarterPoints; number++) { x = _mm256_loadu_ps((float*)b); y = _mm256_loadu_ps((float*)a); z = _mm256_complexconjugatemul_ps(s, x); z = _mm256_add_ps(y, z); - _mm256_storeu_ps((float*)c,z); + _mm256_storeu_ps((float*)c, z); a += 4; b += 4; c += 4; } - for(i = num_points-isodd; i < num_points; i++) { + for (i = num_points - isodd; i < num_points; i++) { *c++ = (*a++) + lv_conj(*b++) * scalar; } } @@ -162,12 +176,18 @@ static inline void volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_u_avx(lv_32fc_ #include #include -static inline void volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_u_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, const lv_32fc_t scalar, unsigned int num_points) { +static inline void +volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_u_sse3(lv_32fc_t* cVector, + const lv_32fc_t* aVector, + const lv_32fc_t* bVector, + const lv_32fc_t scalar, + unsigned int num_points) +{ unsigned int number = 0; const unsigned int halfPoints = num_points / 2; __m128 x, y, s, z; - lv_32fc_t v_scalar[2] = {scalar, scalar}; + lv_32fc_t v_scalar[2] = { scalar, scalar }; const lv_32fc_t* a = aVector; const lv_32fc_t* b = bVector; @@ -176,19 +196,19 @@ static inline void volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_u_sse3(lv_32fc // Set up constant scalar vector s = _mm_loadu_ps((float*)v_scalar); - for(;number < halfPoints; number++){ + for (; number < halfPoints; number++) { x = _mm_loadu_ps((float*)b); y = _mm_loadu_ps((float*)a); z = _mm_complexconjugatemul_ps(s, x); z = _mm_add_ps(y, z); - _mm_storeu_ps((float*)c,z); + _mm_storeu_ps((float*)c, z); a += 2; b += 2; c += 2; } - if((num_points % 2) != 0) { + if ((num_points % 2) != 0) { *c = *a + lv_conj(*b) * scalar; } } @@ -199,14 +219,20 
@@ static inline void volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_u_sse3(lv_32fc #include #include -static inline void volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_a_avx(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, const lv_32fc_t scalar, unsigned int num_points) { +static inline void +volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_a_avx(lv_32fc_t* cVector, + const lv_32fc_t* aVector, + const lv_32fc_t* bVector, + const lv_32fc_t scalar, + unsigned int num_points) +{ unsigned int number = 0; unsigned int i = 0; const unsigned int quarterPoints = num_points / 4; unsigned int isodd = num_points & 3; __m256 x, y, s, z; - lv_32fc_t v_scalar[4] = {scalar, scalar, scalar, scalar}; + lv_32fc_t v_scalar[4] = { scalar, scalar, scalar, scalar }; const lv_32fc_t* a = aVector; const lv_32fc_t* b = bVector; @@ -215,19 +241,19 @@ static inline void volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_a_avx(lv_32fc_ // Set up constant scalar vector s = _mm256_load_ps((float*)v_scalar); - for(;number < quarterPoints; number++) { + for (; number < quarterPoints; number++) { x = _mm256_load_ps((float*)b); y = _mm256_load_ps((float*)a); z = _mm256_complexconjugatemul_ps(s, x); z = _mm256_add_ps(y, z); - _mm256_store_ps((float*)c,z); + _mm256_store_ps((float*)c, z); a += 4; b += 4; c += 4; } - for(i = num_points-isodd; i < num_points; i++) { + for (i = num_points - isodd; i < num_points; i++) { *c++ = (*a++) + lv_conj(*b++) * scalar; } } @@ -238,12 +264,18 @@ static inline void volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_a_avx(lv_32fc_ #include #include -static inline void volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_a_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, const lv_32fc_t scalar, unsigned int num_points) { +static inline void +volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_a_sse3(lv_32fc_t* cVector, + const lv_32fc_t* aVector, + const lv_32fc_t* bVector, + const lv_32fc_t scalar, + unsigned int num_points) +{ unsigned int number = 0; const unsigned int halfPoints = num_points / 2; __m128 x, y, s, z; - lv_32fc_t v_scalar[2] = {scalar, scalar}; + lv_32fc_t v_scalar[2] = { scalar, scalar }; const lv_32fc_t* a = aVector; const lv_32fc_t* b = bVector; @@ -252,19 +284,19 @@ static inline void volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_a_sse3(lv_32fc // Set up constant scalar vector s = _mm_load_ps((float*)v_scalar); - for(;number < halfPoints; number++){ + for (; number < halfPoints; number++) { x = _mm_load_ps((float*)b); y = _mm_load_ps((float*)a); z = _mm_complexconjugatemul_ps(s, x); z = _mm_add_ps(y, z); - _mm_store_ps((float*)c,z); + _mm_store_ps((float*)c, z); a += 2; b += 2; c += 2; } - if((num_points % 2) != 0) { + if ((num_points % 2) != 0) { *c = *a + lv_conj(*b) * scalar; } } @@ -272,9 +304,15 @@ static inline void volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_a_sse3(lv_32fc #ifdef LV_HAVE_NEON -#include - -static inline void volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_neon(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, const lv_32fc_t scalar, unsigned int num_points){ +#include + +static inline void +volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_neon(lv_32fc_t* cVector, + const lv_32fc_t* aVector, + const lv_32fc_t* bVector, + const lv_32fc_t scalar, + unsigned int num_points) +{ const lv_32fc_t* bPtr = bVector; const lv_32fc_t* aPtr = aVector; lv_32fc_t* cPtr = cVector; @@ -287,7 +325,7 @@ static inline void volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_neon(lv_32fc_t scalar_val.val[0] = 
vld1q_dup_f32((const float*)&scalar); scalar_val.val[1] = vld1q_dup_f32(((const float*)&scalar) + 1); - for(number = 0; number < quarter_points; ++number) { + for (number = 0; number < quarter_points; ++number) { a_val = vld2q_f32((float*)aPtr); b_val = vld2q_f32((float*)bPtr); b_val.val[1] = vnegq_f32(b_val.val[1]); @@ -310,7 +348,7 @@ static inline void volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_neon(lv_32fc_t cPtr += 4; } - for(number = quarter_points*4; number < num_points; number++){ + for (number = quarter_points * 4; number < num_points; number++) { *cPtr++ = (*aPtr++) + lv_conj(*bPtr++) * scalar; } } diff --git a/kernels/volk/volk_32fc_x2_square_dist_32f.h b/kernels/volk/volk_32fc_x2_square_dist_32f.h index d6c6dff..75f4072 100644 --- a/kernels/volk/volk_32fc_x2_square_dist_32f.h +++ b/kernels/volk/volk_32fc_x2_square_dist_32f.h @@ -30,8 +30,8 @@ * * Dispatcher Prototype * \code - * void volk_32fc_x2_square_dist_32f(float* target, lv_32fc_t* src0, lv_32fc_t* points, unsigned int num_points) { - * \endcode + * void volk_32fc_x2_square_dist_32f(float* target, lv_32fc_t* src0, lv_32fc_t* points, + * unsigned int num_points) { \endcode * * \b Inputs * \li src0: The complex input. Only the first point is used. @@ -78,183 +78,185 @@ #ifndef INCLUDED_volk_32fc_x2_square_dist_32f_a_H #define INCLUDED_volk_32fc_x2_square_dist_32f_a_H -#include -#include -#include +#include +#include +#include #ifdef LV_HAVE_AVX2 -#include +#include -static inline void -volk_32fc_x2_square_dist_32f_a_avx2(float* target, lv_32fc_t* src0, lv_32fc_t* points, - unsigned int num_points) +static inline void volk_32fc_x2_square_dist_32f_a_avx2(float* target, + lv_32fc_t* src0, + lv_32fc_t* points, + unsigned int num_points) { - const unsigned int num_bytes = num_points*8; - __m128 xmm0, xmm9, xmm10; - __m256 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; - - lv_32fc_t diff; - float sq_dist; - int bound = num_bytes >> 6; - int leftovers0 = (num_bytes >> 5) & 1; - int leftovers1 = (num_bytes >> 4) & 1; - int leftovers2 = (num_bytes >> 3) & 1; - int i = 0; - - __m256i idx = _mm256_set_epi32(7,6,3,2,5,4,1,0); - xmm1 = _mm256_setzero_ps(); - xmm2 = _mm256_load_ps((float*)&points[0]); - xmm0 = _mm_load_ps((float*)src0); - xmm0 = _mm_permute_ps(xmm0, 0b01000100); - xmm1 = _mm256_insertf128_ps(xmm1, xmm0, 0); - xmm1 = _mm256_insertf128_ps(xmm1, xmm0, 1); - xmm3 = _mm256_load_ps((float*)&points[4]); - - for(; i < bound; ++i) { - xmm4 = _mm256_sub_ps(xmm1, xmm2); - xmm5 = _mm256_sub_ps(xmm1, xmm3); - points += 8; - xmm6 = _mm256_mul_ps(xmm4, xmm4); - xmm7 = _mm256_mul_ps(xmm5, xmm5); - + const unsigned int num_bytes = num_points * 8; + __m128 xmm0, xmm9, xmm10; + __m256 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; + + lv_32fc_t diff; + float sq_dist; + int bound = num_bytes >> 6; + int leftovers0 = (num_bytes >> 5) & 1; + int leftovers1 = (num_bytes >> 4) & 1; + int leftovers2 = (num_bytes >> 3) & 1; + int i = 0; + + __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0); + xmm1 = _mm256_setzero_ps(); xmm2 = _mm256_load_ps((float*)&points[0]); + xmm0 = _mm_load_ps((float*)src0); + xmm0 = _mm_permute_ps(xmm0, 0b01000100); + xmm1 = _mm256_insertf128_ps(xmm1, xmm0, 0); + xmm1 = _mm256_insertf128_ps(xmm1, xmm0, 1); + xmm3 = _mm256_load_ps((float*)&points[4]); - xmm4 = _mm256_hadd_ps(xmm6, xmm7); - xmm4 = _mm256_permutevar8x32_ps(xmm4, idx); + for (; i < bound; ++i) { + xmm4 = _mm256_sub_ps(xmm1, xmm2); + xmm5 = _mm256_sub_ps(xmm1, xmm3); + points += 8; + xmm6 = _mm256_mul_ps(xmm4, xmm4); + xmm7 = _mm256_mul_ps(xmm5, xmm5); - xmm3 = 
_mm256_load_ps((float*)&points[4]); + xmm2 = _mm256_load_ps((float*)&points[0]); - _mm256_store_ps(target, xmm4); + xmm4 = _mm256_hadd_ps(xmm6, xmm7); + xmm4 = _mm256_permutevar8x32_ps(xmm4, idx); - target += 8; - } + xmm3 = _mm256_load_ps((float*)&points[4]); - for(i = 0; i < leftovers0; ++i) { + _mm256_store_ps(target, xmm4); - xmm2 = _mm256_load_ps((float*)&points[0]); + target += 8; + } - xmm4 = _mm256_sub_ps(xmm1, xmm2); + for (i = 0; i < leftovers0; ++i) { - points += 4; + xmm2 = _mm256_load_ps((float*)&points[0]); - xmm6 = _mm256_mul_ps(xmm4, xmm4); + xmm4 = _mm256_sub_ps(xmm1, xmm2); - xmm4 = _mm256_hadd_ps(xmm6, xmm6); - xmm4 = _mm256_permutevar8x32_ps(xmm4, idx); + points += 4; - xmm9 = _mm256_extractf128_ps(xmm4, 1); - _mm_store_ps(target,xmm9); + xmm6 = _mm256_mul_ps(xmm4, xmm4); - target += 4; - } + xmm4 = _mm256_hadd_ps(xmm6, xmm6); + xmm4 = _mm256_permutevar8x32_ps(xmm4, idx); + + xmm9 = _mm256_extractf128_ps(xmm4, 1); + _mm_store_ps(target, xmm9); - for(i = 0; i < leftovers1; ++i) { - xmm9 = _mm_load_ps((float*)&points[0]); + target += 4; + } - xmm10 = _mm_sub_ps(xmm0, xmm9); + for (i = 0; i < leftovers1; ++i) { + xmm9 = _mm_load_ps((float*)&points[0]); - points += 2; + xmm10 = _mm_sub_ps(xmm0, xmm9); - xmm9 = _mm_mul_ps(xmm10, xmm10); + points += 2; - xmm10 = _mm_hadd_ps(xmm9, xmm9); + xmm9 = _mm_mul_ps(xmm10, xmm10); - _mm_storeh_pi((__m64*)target, xmm10); + xmm10 = _mm_hadd_ps(xmm9, xmm9); - target += 2; - } + _mm_storeh_pi((__m64*)target, xmm10); - for(i = 0; i < leftovers2; ++i) { + target += 2; + } - diff = src0[0] - points[0]; + for (i = 0; i < leftovers2; ++i) { - sq_dist = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff); + diff = src0[0] - points[0]; - target[0] = sq_dist; - } + sq_dist = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff); + + target[0] = sq_dist; + } } #endif /*LV_HAVE_AVX2*/ #ifdef LV_HAVE_SSE3 -#include -#include +#include +#include -static inline void -volk_32fc_x2_square_dist_32f_a_sse3(float* target, lv_32fc_t* src0, lv_32fc_t* points, - unsigned int num_points) +static inline void volk_32fc_x2_square_dist_32f_a_sse3(float* target, + lv_32fc_t* src0, + lv_32fc_t* points, + unsigned int num_points) { - const unsigned int num_bytes = num_points*8; + const unsigned int num_bytes = num_points * 8; - __m128 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; + __m128 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; - lv_32fc_t diff; - float sq_dist; - int bound = num_bytes >> 5; - int i = 0; + lv_32fc_t diff; + float sq_dist; + int bound = num_bytes >> 5; + int i = 0; - xmm1 = _mm_setzero_ps(); - xmm1 = _mm_loadl_pi(xmm1, (__m64*)src0); - xmm2 = _mm_load_ps((float*)&points[0]); - xmm1 = _mm_movelh_ps(xmm1, xmm1); - xmm3 = _mm_load_ps((float*)&points[2]); + xmm1 = _mm_setzero_ps(); + xmm1 = _mm_loadl_pi(xmm1, (__m64*)src0); + xmm2 = _mm_load_ps((float*)&points[0]); + xmm1 = _mm_movelh_ps(xmm1, xmm1); + xmm3 = _mm_load_ps((float*)&points[2]); + + for (; i < bound - 1; ++i) { + xmm4 = _mm_sub_ps(xmm1, xmm2); + xmm5 = _mm_sub_ps(xmm1, xmm3); + points += 4; + xmm6 = _mm_mul_ps(xmm4, xmm4); + xmm7 = _mm_mul_ps(xmm5, xmm5); + + xmm2 = _mm_load_ps((float*)&points[0]); + + xmm4 = _mm_hadd_ps(xmm6, xmm7); + + xmm3 = _mm_load_ps((float*)&points[2]); + + _mm_store_ps(target, xmm4); + + target += 4; + } - for(; i < bound - 1; ++i) { xmm4 = _mm_sub_ps(xmm1, xmm2); xmm5 = _mm_sub_ps(xmm1, xmm3); + points += 4; xmm6 = _mm_mul_ps(xmm4, xmm4); xmm7 = _mm_mul_ps(xmm5, xmm5); - xmm2 = _mm_load_ps((float*)&points[0]); - xmm4 = _mm_hadd_ps(xmm6, xmm7); 
- xmm3 = _mm_load_ps((float*)&points[2]); - _mm_store_ps(target, xmm4); target += 4; - } - - xmm4 = _mm_sub_ps(xmm1, xmm2); - xmm5 = _mm_sub_ps(xmm1, xmm3); - - points += 4; - xmm6 = _mm_mul_ps(xmm4, xmm4); - xmm7 = _mm_mul_ps(xmm5, xmm5); - xmm4 = _mm_hadd_ps(xmm6, xmm7); + if (num_bytes >> 4 & 1) { - _mm_store_ps(target, xmm4); + xmm2 = _mm_load_ps((float*)&points[0]); - target += 4; + xmm4 = _mm_sub_ps(xmm1, xmm2); - if (num_bytes >> 4 & 1) { + points += 2; - xmm2 = _mm_load_ps((float*)&points[0]); - - xmm4 = _mm_sub_ps(xmm1, xmm2); + xmm6 = _mm_mul_ps(xmm4, xmm4); - points += 2; - - xmm6 = _mm_mul_ps(xmm4, xmm4); + xmm4 = _mm_hadd_ps(xmm6, xmm6); - xmm4 = _mm_hadd_ps(xmm6, xmm6); + _mm_storeh_pi((__m64*)target, xmm4); - _mm_storeh_pi((__m64*)target, xmm4); + target += 2; + } - target += 2; - } + if (num_bytes >> 3 & 1) { - if (num_bytes >> 3 & 1) { + diff = src0[0] - points[0]; - diff = src0[0] - points[0]; + sq_dist = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff); - sq_dist = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff); - - target[0] = sq_dist; - } + target[0] = sq_dist; + } } #endif /*LV_HAVE_SSE3*/ @@ -262,55 +264,58 @@ volk_32fc_x2_square_dist_32f_a_sse3(float* target, lv_32fc_t* src0, lv_32fc_t* p #ifdef LV_HAVE_NEON #include -static inline void -volk_32fc_x2_square_dist_32f_neon(float* target, lv_32fc_t* src0, lv_32fc_t* points, unsigned int num_points) +static inline void volk_32fc_x2_square_dist_32f_neon(float* target, + lv_32fc_t* src0, + lv_32fc_t* points, + unsigned int num_points) { - const unsigned int quarter_points = num_points / 4; - unsigned int number; - - float32x4x2_t a_vec, b_vec; - float32x4x2_t diff_vec; - float32x4_t tmp, tmp1, dist_sq; - a_vec.val[0] = vdupq_n_f32( lv_creal(src0[0]) ); - a_vec.val[1] = vdupq_n_f32( lv_cimag(src0[0]) ); - for(number=0; number < quarter_points; ++number) { - b_vec = vld2q_f32((float*)points); - diff_vec.val[0] = vsubq_f32(a_vec.val[0], b_vec.val[0]); - diff_vec.val[1] = vsubq_f32(a_vec.val[1], b_vec.val[1]); - tmp = vmulq_f32(diff_vec.val[0], diff_vec.val[0]); - tmp1 = vmulq_f32(diff_vec.val[1], diff_vec.val[1]); - - dist_sq = vaddq_f32(tmp, tmp1); - vst1q_f32(target, dist_sq); - points += 4; - target += 4; - } - for(number=quarter_points*4; number < num_points; ++number) { - lv_32fc_t diff = src0[0] - *points++; - *target++ = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff); - } + const unsigned int quarter_points = num_points / 4; + unsigned int number; + + float32x4x2_t a_vec, b_vec; + float32x4x2_t diff_vec; + float32x4_t tmp, tmp1, dist_sq; + a_vec.val[0] = vdupq_n_f32(lv_creal(src0[0])); + a_vec.val[1] = vdupq_n_f32(lv_cimag(src0[0])); + for (number = 0; number < quarter_points; ++number) { + b_vec = vld2q_f32((float*)points); + diff_vec.val[0] = vsubq_f32(a_vec.val[0], b_vec.val[0]); + diff_vec.val[1] = vsubq_f32(a_vec.val[1], b_vec.val[1]); + tmp = vmulq_f32(diff_vec.val[0], diff_vec.val[0]); + tmp1 = vmulq_f32(diff_vec.val[1], diff_vec.val[1]); + + dist_sq = vaddq_f32(tmp, tmp1); + vst1q_f32(target, dist_sq); + points += 4; + target += 4; + } + for (number = quarter_points * 4; number < num_points; ++number) { + lv_32fc_t diff = src0[0] - *points++; + *target++ = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff); + } } #endif /* LV_HAVE_NEON */ #ifdef LV_HAVE_GENERIC -static inline void -volk_32fc_x2_square_dist_32f_generic(float* target, lv_32fc_t* src0, lv_32fc_t* points, - unsigned int num_points) +static inline void 
volk_32fc_x2_square_dist_32f_generic(float* target, + lv_32fc_t* src0, + lv_32fc_t* points, + unsigned int num_points) { - const unsigned int num_bytes = num_points*8; + const unsigned int num_bytes = num_points * 8; - lv_32fc_t diff; - float sq_dist; - unsigned int i = 0; + lv_32fc_t diff; + float sq_dist; + unsigned int i = 0; - for(; i < num_bytes >> 3; ++i) { - diff = src0[0] - points[i]; + for (; i < num_bytes >> 3; ++i) { + diff = src0[0] - points[i]; - sq_dist = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff); + sq_dist = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff); - target[i] = sq_dist; - } + target[i] = sq_dist; + } } #endif /*LV_HAVE_GENERIC*/ @@ -321,80 +326,85 @@ volk_32fc_x2_square_dist_32f_generic(float* target, lv_32fc_t* src0, lv_32fc_t* #ifndef INCLUDED_volk_32fc_x2_square_dist_32f_u_H #define INCLUDED_volk_32fc_x2_square_dist_32f_u_H -#include -#include -#include +#include +#include +#include #ifdef LV_HAVE_AVX2 -#include +#include -static inline void -volk_32fc_x2_square_dist_32f_u_avx2(float* target, lv_32fc_t* src0, lv_32fc_t* points, - unsigned int num_points) +static inline void volk_32fc_x2_square_dist_32f_u_avx2(float* target, + lv_32fc_t* src0, + lv_32fc_t* points, + unsigned int num_points) { - const unsigned int num_bytes = num_points*8; - __m128 xmm0, xmm9; - __m256 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; - - lv_32fc_t diff; - float sq_dist; - int bound = num_bytes >> 6; - int leftovers1 = (num_bytes >> 3) & 0b11; - int i = 0; - - __m256i idx = _mm256_set_epi32(7,6,3,2,5,4,1,0); - xmm1 = _mm256_setzero_ps(); - xmm0 = _mm_loadu_ps((float*)src0); - xmm0 = _mm_permute_ps(xmm0, 0b01000100); - xmm1 = _mm256_insertf128_ps(xmm1, xmm0, 0); - xmm1 = _mm256_insertf128_ps(xmm1, xmm0, 1); - - for(; i < bound; ++i) { + const unsigned int num_bytes = num_points * 8; + __m128 xmm0, xmm9; + __m256 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; + + lv_32fc_t diff; + float sq_dist; + int bound = num_bytes >> 6; + int leftovers1 = (num_bytes >> 3) & 0b11; + int i = 0; + + __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0); + xmm1 = _mm256_setzero_ps(); xmm2 = _mm256_loadu_ps((float*)&points[0]); + xmm0 = _mm_loadu_ps((float*)src0); + xmm0 = _mm_permute_ps(xmm0, 0b01000100); + xmm1 = _mm256_insertf128_ps(xmm1, xmm0, 0); + xmm1 = _mm256_insertf128_ps(xmm1, xmm0, 1); xmm3 = _mm256_loadu_ps((float*)&points[4]); - xmm4 = _mm256_sub_ps(xmm1, xmm2); - xmm5 = _mm256_sub_ps(xmm1, xmm3); - points += 8; - xmm6 = _mm256_mul_ps(xmm4, xmm4); - xmm7 = _mm256_mul_ps(xmm5, xmm5); - xmm4 = _mm256_hadd_ps(xmm6, xmm7); - xmm4 = _mm256_permutevar8x32_ps(xmm4, idx); + for (; i < bound; ++i) { + xmm4 = _mm256_sub_ps(xmm1, xmm2); + xmm5 = _mm256_sub_ps(xmm1, xmm3); + points += 8; + xmm6 = _mm256_mul_ps(xmm4, xmm4); + xmm7 = _mm256_mul_ps(xmm5, xmm5); - _mm256_storeu_ps(target, xmm4); + xmm2 = _mm256_loadu_ps((float*)&points[0]); - target += 8; - } + xmm4 = _mm256_hadd_ps(xmm6, xmm7); + xmm4 = _mm256_permutevar8x32_ps(xmm4, idx); - if (num_bytes >> 5 & 1) { + xmm3 = _mm256_loadu_ps((float*)&points[4]); - xmm2 = _mm256_loadu_ps((float*)&points[0]); + _mm256_storeu_ps(target, xmm4); - xmm4 = _mm256_sub_ps(xmm1, xmm2); + target += 8; + } - points += 4; + if (num_bytes >> 5 & 1) { - xmm6 = _mm256_mul_ps(xmm4, xmm4); + xmm2 = _mm256_loadu_ps((float*)&points[0]); - xmm4 = _mm256_hadd_ps(xmm6, xmm6); - xmm4 = _mm256_permutevar8x32_ps(xmm4, idx); + xmm4 = _mm256_sub_ps(xmm1, xmm2); - xmm9 = _mm256_extractf128_ps(xmm4, 1); - _mm_storeu_ps(target,xmm9); + points += 4; - target += 4; - }
+ xmm6 = _mm256_mul_ps(xmm4, xmm4); + + xmm4 = _mm256_hadd_ps(xmm6, xmm6); + xmm4 = _mm256_permutevar8x32_ps(xmm4, idx); + + xmm9 = _mm256_extractf128_ps(xmm4, 1); + _mm_storeu_ps(target, xmm9); + + target += 4; + } - for(i = 0; i < leftovers1; ++i) { + for (i = 0; i < leftovers1; ++i) { - diff = src0[0] - points[0]; - points += 1; + diff = src0[0] - points[0]; + points += 1; - sq_dist = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff); + sq_dist = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff); - target[0] = sq_dist; - target += 1; - } + target[0] = sq_dist; + target += 1; + } } #endif /*LV_HAVE_AVX2*/ diff --git a/kernels/volk/volk_32i_s32f_convert_32f.h b/kernels/volk/volk_32i_s32f_convert_32f.h index 87d94f9..6b67cdb 100644 --- a/kernels/volk/volk_32i_s32f_convert_32f.h +++ b/kernels/volk/volk_32i_s32f_convert_32f.h @@ -30,8 +30,8 @@ * * Dispatcher Prototype * \code - * void volk_32i_s32f_convert_32f(float* outputVector, const int32_t* inputVector, const float scalar, unsigned int num_points) - * \endcode + * void volk_32i_s32f_convert_32f(float* outputVector, const int32_t* inputVector, const + * float scalar, unsigned int num_points) \endcode * * \b Inputs * \li inputVector: The vector of 32-bit integers. @@ -70,37 +70,38 @@ #ifdef LV_HAVE_AVX512F #include -static inline void -volk_32i_s32f_convert_32f_u_avx512f(float* outputVector, const int32_t* inputVector, - const float scalar, unsigned int num_points) +static inline void volk_32i_s32f_convert_32f_u_avx512f(float* outputVector, + const int32_t* inputVector, + const float scalar, + unsigned int num_points) { - unsigned int number = 0; - const unsigned int onesixteenthPoints = num_points / 16; + unsigned int number = 0; + const unsigned int onesixteenthPoints = num_points / 16; - float* outputVectorPtr = outputVector; - const float iScalar = 1.0 / scalar; - __m512 invScalar = _mm512_set1_ps(iScalar); - int32_t* inputPtr = (int32_t*)inputVector; - __m512i inputVal; - __m512 ret; + float* outputVectorPtr = outputVector; + const float iScalar = 1.0 / scalar; + __m512 invScalar = _mm512_set1_ps(iScalar); + int32_t* inputPtr = (int32_t*)inputVector; + __m512i inputVal; + __m512 ret; - for(;number < onesixteenthPoints; number++){ - // Load the values - inputVal = _mm512_loadu_si512((__m512i*)inputPtr); + for (; number < onesixteenthPoints; number++) { + // Load the values + inputVal = _mm512_loadu_si512((__m512i*)inputPtr); - ret = _mm512_cvtepi32_ps(inputVal); - ret = _mm512_mul_ps(ret, invScalar); + ret = _mm512_cvtepi32_ps(inputVal); + ret = _mm512_mul_ps(ret, invScalar); - _mm512_storeu_ps(outputVectorPtr, ret); + _mm512_storeu_ps(outputVectorPtr, ret); - outputVectorPtr += 16; - inputPtr += 16; - } + outputVectorPtr += 16; + inputPtr += 16; + } - number = onesixteenthPoints * 16; - for(; number < num_points; number++){ - outputVector[number] =((float)(inputVector[number])) * iScalar; - } + number = onesixteenthPoints * 16; + for (; number < num_points; number++) { + outputVector[number] = ((float)(inputVector[number])) * iScalar; + } } #endif /* LV_HAVE_AVX512F */ @@ -108,37 +109,38 @@ volk_32i_s32f_convert_32f_u_avx512f(float* outputVector, const int32_t* inputVec #ifdef LV_HAVE_AVX2 #include -static inline void -volk_32i_s32f_convert_32f_u_avx2(float* outputVector, const int32_t* inputVector, - const float scalar, unsigned int num_points) +static inline void volk_32i_s32f_convert_32f_u_avx2(float* outputVector, + const int32_t* inputVector, + const float scalar, + unsigned int num_points) { - 
unsigned int number = 0; - const unsigned int oneEightPoints = num_points / 8; + unsigned int number = 0; + const unsigned int oneEightPoints = num_points / 8; - float* outputVectorPtr = outputVector; - const float iScalar = 1.0 / scalar; - __m256 invScalar = _mm256_set1_ps(iScalar); - int32_t* inputPtr = (int32_t*)inputVector; - __m256i inputVal; - __m256 ret; + float* outputVectorPtr = outputVector; + const float iScalar = 1.0 / scalar; + __m256 invScalar = _mm256_set1_ps(iScalar); + int32_t* inputPtr = (int32_t*)inputVector; + __m256i inputVal; + __m256 ret; - for(;number < oneEightPoints; number++){ - // Load the 4 values - inputVal = _mm256_loadu_si256((__m256i*)inputPtr); + for (; number < oneEightPoints; number++) { + // Load the 4 values + inputVal = _mm256_loadu_si256((__m256i*)inputPtr); - ret = _mm256_cvtepi32_ps(inputVal); - ret = _mm256_mul_ps(ret, invScalar); + ret = _mm256_cvtepi32_ps(inputVal); + ret = _mm256_mul_ps(ret, invScalar); - _mm256_storeu_ps(outputVectorPtr, ret); + _mm256_storeu_ps(outputVectorPtr, ret); - outputVectorPtr += 8; - inputPtr += 8; - } + outputVectorPtr += 8; + inputPtr += 8; + } - number = oneEightPoints * 8; - for(; number < num_points; number++){ - outputVector[number] =((float)(inputVector[number])) * iScalar; - } + number = oneEightPoints * 8; + for (; number < num_points; number++) { + outputVector[number] = ((float)(inputVector[number])) * iScalar; + } } #endif /* LV_HAVE_AVX2 */ @@ -146,62 +148,63 @@ volk_32i_s32f_convert_32f_u_avx2(float* outputVector, const int32_t* inputVector #ifdef LV_HAVE_SSE2 #include -static inline void -volk_32i_s32f_convert_32f_u_sse2(float* outputVector, const int32_t* inputVector, - const float scalar, unsigned int num_points) +static inline void volk_32i_s32f_convert_32f_u_sse2(float* outputVector, + const int32_t* inputVector, + const float scalar, + unsigned int num_points) { - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; - float* outputVectorPtr = outputVector; - const float iScalar = 1.0 / scalar; - __m128 invScalar = _mm_set_ps1(iScalar); - int32_t* inputPtr = (int32_t*)inputVector; - __m128i inputVal; - __m128 ret; + float* outputVectorPtr = outputVector; + const float iScalar = 1.0 / scalar; + __m128 invScalar = _mm_set_ps1(iScalar); + int32_t* inputPtr = (int32_t*)inputVector; + __m128i inputVal; + __m128 ret; - for(;number < quarterPoints; number++){ - // Load the 4 values - inputVal = _mm_loadu_si128((__m128i*)inputPtr); + for (; number < quarterPoints; number++) { + // Load the 4 values + inputVal = _mm_loadu_si128((__m128i*)inputPtr); - ret = _mm_cvtepi32_ps(inputVal); - ret = _mm_mul_ps(ret, invScalar); + ret = _mm_cvtepi32_ps(inputVal); + ret = _mm_mul_ps(ret, invScalar); - _mm_storeu_ps(outputVectorPtr, ret); + _mm_storeu_ps(outputVectorPtr, ret); - outputVectorPtr += 4; - inputPtr += 4; - } + outputVectorPtr += 4; + inputPtr += 4; + } - number = quarterPoints * 4; - for(; number < num_points; number++){ - outputVector[number] =((float)(inputVector[number])) * iScalar; - } + number = quarterPoints * 4; + for (; number < num_points; number++) { + outputVector[number] = ((float)(inputVector[number])) * iScalar; + } } #endif /* LV_HAVE_SSE2 */ #ifdef LV_HAVE_GENERIC -static inline void -volk_32i_s32f_convert_32f_generic(float* outputVector, const int32_t* inputVector, - const float scalar, unsigned int num_points) +static inline void volk_32i_s32f_convert_32f_generic(float* outputVector, + 
const int32_t* inputVector, + const float scalar, + unsigned int num_points) { - float* outputVectorPtr = outputVector; - const int32_t* inputVectorPtr = inputVector; - unsigned int number = 0; - const float iScalar = 1.0 / scalar; - - for(number = 0; number < num_points; number++){ - *outputVectorPtr++ = ((float)(*inputVectorPtr++)) * iScalar; - } + float* outputVectorPtr = outputVector; + const int32_t* inputVectorPtr = inputVector; + unsigned int number = 0; + const float iScalar = 1.0 / scalar; + + for (number = 0; number < num_points; number++) { + *outputVectorPtr++ = ((float)(*inputVectorPtr++)) * iScalar; + } } #endif /* LV_HAVE_GENERIC */ #endif /* INCLUDED_volk_32i_s32f_convert_32f_u_H */ - #ifndef INCLUDED_volk_32i_s32f_convert_32f_a_H #define INCLUDED_volk_32i_s32f_convert_32f_a_H @@ -211,74 +214,76 @@ volk_32i_s32f_convert_32f_generic(float* outputVector, const int32_t* inputVecto #ifdef LV_HAVE_AVX512F #include -static inline void -volk_32i_s32f_convert_32f_a_avx512f(float* outputVector, const int32_t* inputVector, - const float scalar, unsigned int num_points) +static inline void volk_32i_s32f_convert_32f_a_avx512f(float* outputVector, + const int32_t* inputVector, + const float scalar, + unsigned int num_points) { - unsigned int number = 0; - const unsigned int onesixteenthPoints = num_points / 16; + unsigned int number = 0; + const unsigned int onesixteenthPoints = num_points / 16; - float* outputVectorPtr = outputVector; - const float iScalar = 1.0 / scalar; - __m512 invScalar = _mm512_set1_ps(iScalar); - int32_t* inputPtr = (int32_t*)inputVector; - __m512i inputVal; - __m512 ret; + float* outputVectorPtr = outputVector; + const float iScalar = 1.0 / scalar; + __m512 invScalar = _mm512_set1_ps(iScalar); + int32_t* inputPtr = (int32_t*)inputVector; + __m512i inputVal; + __m512 ret; - for(;number < onesixteenthPoints; number++){ - // Load the values - inputVal = _mm512_load_si512((__m512i*)inputPtr); + for (; number < onesixteenthPoints; number++) { + // Load the values + inputVal = _mm512_load_si512((__m512i*)inputPtr); - ret = _mm512_cvtepi32_ps(inputVal); - ret = _mm512_mul_ps(ret, invScalar); + ret = _mm512_cvtepi32_ps(inputVal); + ret = _mm512_mul_ps(ret, invScalar); - _mm512_store_ps(outputVectorPtr, ret); + _mm512_store_ps(outputVectorPtr, ret); - outputVectorPtr += 16; - inputPtr += 16; - } + outputVectorPtr += 16; + inputPtr += 16; + } - number = onesixteenthPoints * 16; - for(; number < num_points; number++){ - outputVector[number] =((float)(inputVector[number])) * iScalar; - } + number = onesixteenthPoints * 16; + for (; number < num_points; number++) { + outputVector[number] = ((float)(inputVector[number])) * iScalar; + } } #endif /* LV_HAVE_AVX512F */ #ifdef LV_HAVE_AVX2 #include -static inline void -volk_32i_s32f_convert_32f_a_avx2(float* outputVector, const int32_t* inputVector, - const float scalar, unsigned int num_points) +static inline void volk_32i_s32f_convert_32f_a_avx2(float* outputVector, + const int32_t* inputVector, + const float scalar, + unsigned int num_points) { - unsigned int number = 0; - const unsigned int oneEightPoints = num_points / 8; + unsigned int number = 0; + const unsigned int oneEightPoints = num_points / 8; - float* outputVectorPtr = outputVector; - const float iScalar = 1.0 / scalar; - __m256 invScalar = _mm256_set1_ps(iScalar); - int32_t* inputPtr = (int32_t*)inputVector; - __m256i inputVal; - __m256 ret; + float* outputVectorPtr = outputVector; + const float iScalar = 1.0 / scalar; + __m256 invScalar = _mm256_set1_ps(iScalar); 
+ int32_t* inputPtr = (int32_t*)inputVector; + __m256i inputVal; + __m256 ret; - for(;number < oneEightPoints; number++){ - // Load the 4 values - inputVal = _mm256_load_si256((__m256i*)inputPtr); + for (; number < oneEightPoints; number++) { + // Load the 4 values + inputVal = _mm256_load_si256((__m256i*)inputPtr); - ret = _mm256_cvtepi32_ps(inputVal); - ret = _mm256_mul_ps(ret, invScalar); + ret = _mm256_cvtepi32_ps(inputVal); + ret = _mm256_mul_ps(ret, invScalar); - _mm256_store_ps(outputVectorPtr, ret); + _mm256_store_ps(outputVectorPtr, ret); - outputVectorPtr += 8; - inputPtr += 8; - } + outputVectorPtr += 8; + inputPtr += 8; + } - number = oneEightPoints * 8; - for(; number < num_points; number++){ - outputVector[number] =((float)(inputVector[number])) * iScalar; - } + number = oneEightPoints * 8; + for (; number < num_points; number++) { + outputVector[number] = ((float)(inputVector[number])) * iScalar; + } } #endif /* LV_HAVE_AVX2 */ @@ -286,59 +291,59 @@ volk_32i_s32f_convert_32f_a_avx2(float* outputVector, const int32_t* inputVector #ifdef LV_HAVE_SSE2 #include -static inline void -volk_32i_s32f_convert_32f_a_sse2(float* outputVector, const int32_t* inputVector, - const float scalar, unsigned int num_points) +static inline void volk_32i_s32f_convert_32f_a_sse2(float* outputVector, + const int32_t* inputVector, + const float scalar, + unsigned int num_points) { - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; - float* outputVectorPtr = outputVector; - const float iScalar = 1.0 / scalar; - __m128 invScalar = _mm_set_ps1(iScalar); - int32_t* inputPtr = (int32_t*)inputVector; - __m128i inputVal; - __m128 ret; + float* outputVectorPtr = outputVector; + const float iScalar = 1.0 / scalar; + __m128 invScalar = _mm_set_ps1(iScalar); + int32_t* inputPtr = (int32_t*)inputVector; + __m128i inputVal; + __m128 ret; - for(;number < quarterPoints; number++){ - // Load the 4 values - inputVal = _mm_load_si128((__m128i*)inputPtr); + for (; number < quarterPoints; number++) { + // Load the 4 values + inputVal = _mm_load_si128((__m128i*)inputPtr); - ret = _mm_cvtepi32_ps(inputVal); - ret = _mm_mul_ps(ret, invScalar); + ret = _mm_cvtepi32_ps(inputVal); + ret = _mm_mul_ps(ret, invScalar); - _mm_store_ps(outputVectorPtr, ret); + _mm_store_ps(outputVectorPtr, ret); - outputVectorPtr += 4; - inputPtr += 4; - } + outputVectorPtr += 4; + inputPtr += 4; + } - number = quarterPoints * 4; - for(; number < num_points; number++){ - outputVector[number] =((float)(inputVector[number])) * iScalar; - } + number = quarterPoints * 4; + for (; number < num_points; number++) { + outputVector[number] = ((float)(inputVector[number])) * iScalar; + } } #endif /* LV_HAVE_SSE2 */ #ifdef LV_HAVE_GENERIC -static inline void -volk_32i_s32f_convert_32f_a_generic(float* outputVector, const int32_t* inputVector, - const float scalar, unsigned int num_points) +static inline void volk_32i_s32f_convert_32f_a_generic(float* outputVector, + const int32_t* inputVector, + const float scalar, + unsigned int num_points) { - float* outputVectorPtr = outputVector; - const int32_t* inputVectorPtr = inputVector; - unsigned int number = 0; - const float iScalar = 1.0 / scalar; - - for(number = 0; number < num_points; number++){ - *outputVectorPtr++ = ((float)(*inputVectorPtr++)) * iScalar; - } + float* outputVectorPtr = outputVector; + const int32_t* inputVectorPtr = inputVector; + unsigned int number = 0; + const float iScalar = 1.0 
/ scalar; + + for (number = 0; number < num_points; number++) { + *outputVectorPtr++ = ((float)(*inputVectorPtr++)) * iScalar; + } } #endif /* LV_HAVE_GENERIC */ - - #endif /* INCLUDED_volk_32i_s32f_convert_32f_a_H */ diff --git a/kernels/volk/volk_32i_x2_and_32i.h b/kernels/volk/volk_32i_x2_and_32i.h index 76f0175..755cfdc 100644 --- a/kernels/volk/volk_32i_x2_and_32i.h +++ b/kernels/volk/volk_32i_x2_and_32i.h @@ -29,8 +29,8 @@ * * Dispatcher Prototype * \code - * void volk_32i_x2_and_32i(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points) - * \endcode + * void volk_32i_x2_and_32i(int32_t* cVector, const int32_t* aVector, const int32_t* + * bVector, unsigned int num_points) \endcode * * \b Inputs * \li aVector: Input vector of samples. @@ -87,72 +87,75 @@ #ifdef LV_HAVE_AVX512F #include -static inline void -volk_32i_x2_and_32i_a_avx512f(int32_t* cVector, const int32_t* aVector, - const int32_t* bVector, unsigned int num_points) +static inline void volk_32i_x2_and_32i_a_avx512f(int32_t* cVector, + const int32_t* aVector, + const int32_t* bVector, + unsigned int num_points) { - unsigned int number = 0; - const unsigned int sixteenthPoints = num_points / 16; + unsigned int number = 0; + const unsigned int sixteenthPoints = num_points / 16; - int32_t* cPtr = (int32_t*)cVector; - const int32_t* aPtr = (int32_t*)aVector; - const int32_t* bPtr = (int32_t*)bVector; + int32_t* cPtr = (int32_t*)cVector; + const int32_t* aPtr = (int32_t*)aVector; + const int32_t* bPtr = (int32_t*)bVector; - __m512i aVal, bVal, cVal; - for(;number < sixteenthPoints; number++){ + __m512i aVal, bVal, cVal; + for (; number < sixteenthPoints; number++) { - aVal = _mm512_load_si512(aPtr); - bVal = _mm512_load_si512(bPtr); + aVal = _mm512_load_si512(aPtr); + bVal = _mm512_load_si512(bPtr); - cVal = _mm512_and_si512(aVal, bVal); + cVal = _mm512_and_si512(aVal, bVal); - _mm512_store_si512(cPtr,cVal); // Store the results back into the C container + _mm512_store_si512(cPtr, cVal); // Store the results back into the C container - aPtr += 16; - bPtr += 16; - cPtr += 16; - } + aPtr += 16; + bPtr += 16; + cPtr += 16; + } - number = sixteenthPoints * 16; - for(;number < num_points; number++){ - cVector[number] = aVector[number] & bVector[number]; - } + number = sixteenthPoints * 16; + for (; number < num_points; number++) { + cVector[number] = aVector[number] & bVector[number]; + } } #endif /* LV_HAVE_AVX512F */ #ifdef LV_HAVE_AVX2 #include -static inline void -volk_32i_x2_and_32i_a_avx2(int32_t* cVector, const int32_t* aVector, - const int32_t* bVector, unsigned int num_points) +static inline void volk_32i_x2_and_32i_a_avx2(int32_t* cVector, + const int32_t* aVector, + const int32_t* bVector, + unsigned int num_points) { - unsigned int number = 0; - const unsigned int oneEightPoints = num_points / 8; + unsigned int number = 0; + const unsigned int oneEightPoints = num_points / 8; - int32_t* cPtr = cVector; - const int32_t* aPtr = aVector; - const int32_t* bPtr = bVector; + int32_t* cPtr = cVector; + const int32_t* aPtr = aVector; + const int32_t* bPtr = bVector; - __m256i aVal, bVal, cVal; - for(;number < oneEightPoints; number++){ + __m256i aVal, bVal, cVal; + for (; number < oneEightPoints; number++) { - aVal = _mm256_load_si256((__m256i*)aPtr); - bVal = _mm256_load_si256((__m256i*)bPtr); + aVal = _mm256_load_si256((__m256i*)aPtr); + bVal = _mm256_load_si256((__m256i*)bPtr); - cVal = _mm256_and_si256(aVal, bVal); + cVal = _mm256_and_si256(aVal, bVal); - 
_mm256_store_si256((__m256i*)cPtr,cVal); // Store the results back into the C container + _mm256_store_si256((__m256i*)cPtr, + cVal); // Store the results back into the C container - aPtr += 8; - bPtr += 8; - cPtr += 8; - } + aPtr += 8; + bPtr += 8; + cPtr += 8; + } - number = oneEightPoints * 8; - for(;number < num_points; number++){ - cVector[number] = aVector[number] & bVector[number]; - } + number = oneEightPoints * 8; + for (; number < num_points; number++) { + cVector[number] = aVector[number] & bVector[number]; + } } #endif /* LV_HAVE_AVX2 */ @@ -160,36 +163,37 @@ volk_32i_x2_and_32i_a_avx2(int32_t* cVector, const int32_t* aVector, #ifdef LV_HAVE_SSE #include -static inline void -volk_32i_x2_and_32i_a_sse(int32_t* cVector, const int32_t* aVector, - const int32_t* bVector, unsigned int num_points) +static inline void volk_32i_x2_and_32i_a_sse(int32_t* cVector, + const int32_t* aVector, + const int32_t* bVector, + unsigned int num_points) { - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; - float* cPtr = (float*)cVector; - const float* aPtr = (float*)aVector; - const float* bPtr = (float*)bVector; + float* cPtr = (float*)cVector; + const float* aPtr = (float*)aVector; + const float* bPtr = (float*)bVector; - __m128 aVal, bVal, cVal; - for(;number < quarterPoints; number++){ + __m128 aVal, bVal, cVal; + for (; number < quarterPoints; number++) { - aVal = _mm_load_ps(aPtr); - bVal = _mm_load_ps(bPtr); + aVal = _mm_load_ps(aPtr); + bVal = _mm_load_ps(bPtr); - cVal = _mm_and_ps(aVal, bVal); + cVal = _mm_and_ps(aVal, bVal); - _mm_store_ps(cPtr,cVal); // Store the results back into the C container + _mm_store_ps(cPtr, cVal); // Store the results back into the C container - aPtr += 4; - bPtr += 4; - cPtr += 4; - } + aPtr += 4; + bPtr += 4; + cPtr += 4; + } - number = quarterPoints * 4; - for(;number < num_points; number++){ - cVector[number] = aVector[number] & bVector[number]; - } + number = quarterPoints * 4; + for (; number < num_points; number++) { + cVector[number] = aVector[number] & bVector[number]; + } } #endif /* LV_HAVE_SSE */ @@ -197,62 +201,67 @@ volk_32i_x2_and_32i_a_sse(int32_t* cVector, const int32_t* aVector, #ifdef LV_HAVE_NEON #include -static inline void -volk_32i_x2_and_32i_neon(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points) +static inline void volk_32i_x2_and_32i_neon(int32_t* cVector, + const int32_t* aVector, + const int32_t* bVector, + unsigned int num_points) { - int32_t* cPtr = cVector; - const int32_t* aPtr = aVector; - const int32_t* bPtr= bVector; - unsigned int number = 0; - unsigned int quarter_points = num_points / 4; - - int32x4_t a_val, b_val, c_val; - - for(number = 0; number < quarter_points; number++){ - a_val = vld1q_s32(aPtr); - b_val = vld1q_s32(bPtr); - c_val = vandq_s32(a_val, b_val); - vst1q_s32(cPtr, c_val); - aPtr += 4; - bPtr += 4; - cPtr += 4; - } - - for(number = quarter_points * 4; number < num_points; number++){ - *cPtr++ = (*aPtr++) & (*bPtr++); - } + int32_t* cPtr = cVector; + const int32_t* aPtr = aVector; + const int32_t* bPtr = bVector; + unsigned int number = 0; + unsigned int quarter_points = num_points / 4; + + int32x4_t a_val, b_val, c_val; + + for (number = 0; number < quarter_points; number++) { + a_val = vld1q_s32(aPtr); + b_val = vld1q_s32(bPtr); + c_val = vandq_s32(a_val, b_val); + vst1q_s32(cPtr, c_val); + aPtr += 4; + bPtr += 4; + cPtr += 4; + } + + for (number = 
quarter_points * 4; number < num_points; number++) { + *cPtr++ = (*aPtr++) & (*bPtr++); + } } #endif /* LV_HAVE_NEON */ #ifdef LV_HAVE_GENERIC -static inline void -volk_32i_x2_and_32i_generic(int32_t* cVector, const int32_t* aVector, - const int32_t* bVector, unsigned int num_points) +static inline void volk_32i_x2_and_32i_generic(int32_t* cVector, + const int32_t* aVector, + const int32_t* bVector, + unsigned int num_points) { - int32_t* cPtr = cVector; - const int32_t* aPtr = aVector; - const int32_t* bPtr= bVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - *cPtr++ = (*aPtr++) & (*bPtr++); - } + int32_t* cPtr = cVector; + const int32_t* aPtr = aVector; + const int32_t* bPtr = bVector; + unsigned int number = 0; + + for (number = 0; number < num_points; number++) { + *cPtr++ = (*aPtr++) & (*bPtr++); + } } #endif /* LV_HAVE_GENERIC */ #ifdef LV_HAVE_ORC -extern void -volk_32i_x2_and_32i_a_orc_impl(int32_t* cVector, const int32_t* aVector, - const int32_t* bVector, unsigned int num_points); - -static inline void -volk_32i_x2_and_32i_u_orc(int32_t* cVector, const int32_t* aVector, - const int32_t* bVector, unsigned int num_points) +extern void volk_32i_x2_and_32i_a_orc_impl(int32_t* cVector, + const int32_t* aVector, + const int32_t* bVector, + unsigned int num_points); + +static inline void volk_32i_x2_and_32i_u_orc(int32_t* cVector, + const int32_t* aVector, + const int32_t* bVector, + unsigned int num_points) { - volk_32i_x2_and_32i_a_orc_impl(cVector, aVector, bVector, num_points); + volk_32i_x2_and_32i_a_orc_impl(cVector, aVector, bVector, num_points); } #endif /* LV_HAVE_ORC */ @@ -269,72 +278,75 @@ volk_32i_x2_and_32i_u_orc(int32_t* cVector, const int32_t* aVector, #ifdef LV_HAVE_AVX512F #include -static inline void -volk_32i_x2_and_32i_u_avx512f(int32_t* cVector, const int32_t* aVector, - const int32_t* bVector, unsigned int num_points) +static inline void volk_32i_x2_and_32i_u_avx512f(int32_t* cVector, + const int32_t* aVector, + const int32_t* bVector, + unsigned int num_points) { - unsigned int number = 0; - const unsigned int sixteenthPoints = num_points / 16; + unsigned int number = 0; + const unsigned int sixteenthPoints = num_points / 16; - int32_t* cPtr = (int32_t*)cVector; - const int32_t* aPtr = (int32_t*)aVector; - const int32_t* bPtr = (int32_t*)bVector; + int32_t* cPtr = (int32_t*)cVector; + const int32_t* aPtr = (int32_t*)aVector; + const int32_t* bPtr = (int32_t*)bVector; - __m512i aVal, bVal, cVal; - for(;number < sixteenthPoints; number++){ + __m512i aVal, bVal, cVal; + for (; number < sixteenthPoints; number++) { - aVal = _mm512_loadu_si512(aPtr); - bVal = _mm512_loadu_si512(bPtr); + aVal = _mm512_loadu_si512(aPtr); + bVal = _mm512_loadu_si512(bPtr); - cVal = _mm512_and_si512(aVal, bVal); + cVal = _mm512_and_si512(aVal, bVal); - _mm512_storeu_si512(cPtr,cVal); // Store the results back into the C container + _mm512_storeu_si512(cPtr, cVal); // Store the results back into the C container - aPtr += 16; - bPtr += 16; - cPtr += 16; - } + aPtr += 16; + bPtr += 16; + cPtr += 16; + } - number = sixteenthPoints * 16; - for(;number < num_points; number++){ - cVector[number] = aVector[number] & bVector[number]; - } + number = sixteenthPoints * 16; + for (; number < num_points; number++) { + cVector[number] = aVector[number] & bVector[number]; + } } #endif /* LV_HAVE_AVX512F */ #ifdef LV_HAVE_AVX2 #include -static inline void -volk_32i_x2_and_32i_u_avx2(int32_t* cVector, const int32_t* aVector, - const int32_t* bVector, unsigned int 
num_points) +static inline void volk_32i_x2_and_32i_u_avx2(int32_t* cVector, + const int32_t* aVector, + const int32_t* bVector, + unsigned int num_points) { - unsigned int number = 0; - const unsigned int oneEightPoints = num_points / 8; + unsigned int number = 0; + const unsigned int oneEightPoints = num_points / 8; - int32_t* cPtr = cVector; - const int32_t* aPtr = aVector; - const int32_t* bPtr = bVector; + int32_t* cPtr = cVector; + const int32_t* aPtr = aVector; + const int32_t* bPtr = bVector; - __m256i aVal, bVal, cVal; - for(;number < oneEightPoints; number++){ + __m256i aVal, bVal, cVal; + for (; number < oneEightPoints; number++) { - aVal = _mm256_loadu_si256((__m256i*)aPtr); - bVal = _mm256_loadu_si256((__m256i*)bPtr); + aVal = _mm256_loadu_si256((__m256i*)aPtr); + bVal = _mm256_loadu_si256((__m256i*)bPtr); - cVal = _mm256_and_si256(aVal, bVal); + cVal = _mm256_and_si256(aVal, bVal); - _mm256_storeu_si256((__m256i*)cPtr,cVal); // Store the results back into the C container + _mm256_storeu_si256((__m256i*)cPtr, + cVal); // Store the results back into the C container - aPtr += 8; - bPtr += 8; - cPtr += 8; - } + aPtr += 8; + bPtr += 8; + cPtr += 8; + } - number = oneEightPoints * 8; - for(;number < num_points; number++){ - cVector[number] = aVector[number] & bVector[number]; - } + number = oneEightPoints * 8; + for (; number < num_points; number++) { + cVector[number] = aVector[number] & bVector[number]; + } } #endif /* LV_HAVE_AVX2 */ diff --git a/kernels/volk/volk_32i_x2_or_32i.h b/kernels/volk/volk_32i_x2_or_32i.h index be4c086..b03db89 100644 --- a/kernels/volk/volk_32i_x2_or_32i.h +++ b/kernels/volk/volk_32i_x2_or_32i.h @@ -29,8 +29,8 @@ * * Dispatcher Prototype * \code - * void volk_32i_x2_or_32i(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points) - * \endcode + * void volk_32i_x2_or_32i(int32_t* cVector, const int32_t* aVector, const int32_t* + * bVector, unsigned int num_points) \endcode * * \b Inputs * \li aVector: Input vector of samples. 
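/* A minimal usage sketch for the volk_32i_x2_or_32i dispatcher documented above,
 * assuming the VOLK aligned-allocation helpers (volk_get_alignment, volk_malloc,
 * volk_free); the buffer length N and the fill values are arbitrary choices for
 * illustration, not part of this patch.
 * \code
 *   unsigned int N = 64;
 *   size_t alignment = volk_get_alignment();
 *   int32_t* a = (int32_t*)volk_malloc(sizeof(int32_t) * N, alignment);
 *   int32_t* b = (int32_t*)volk_malloc(sizeof(int32_t) * N, alignment);
 *   int32_t* c = (int32_t*)volk_malloc(sizeof(int32_t) * N, alignment);
 *   for (unsigned int i = 0; i < N; ++i) {
 *       a[i] = (int32_t)i; // 0, 1, 2, ...
 *       b[i] = 0x10;       // set bit 4 in every element
 *   }
 *   volk_32i_x2_or_32i(c, a, b, N); // c[i] = a[i] | b[i]
 *   volk_free(a);
 *   volk_free(b);
 *   volk_free(c);
 * \endcode
 */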
@@ -87,72 +87,75 @@ #ifdef LV_HAVE_AVX512F #include -static inline void -volk_32i_x2_or_32i_a_avx512f(int32_t* cVector, const int32_t* aVector, - const int32_t* bVector, unsigned int num_points) +static inline void volk_32i_x2_or_32i_a_avx512f(int32_t* cVector, + const int32_t* aVector, + const int32_t* bVector, + unsigned int num_points) { - unsigned int number = 0; - const unsigned int sixteenthPoints = num_points / 16; + unsigned int number = 0; + const unsigned int sixteenthPoints = num_points / 16; - int32_t* cPtr = (int32_t*)cVector; - const int32_t* aPtr = (int32_t*)aVector; - const int32_t* bPtr = (int32_t*)bVector; + int32_t* cPtr = (int32_t*)cVector; + const int32_t* aPtr = (int32_t*)aVector; + const int32_t* bPtr = (int32_t*)bVector; - __m512i aVal, bVal, cVal; - for(;number < sixteenthPoints; number++){ + __m512i aVal, bVal, cVal; + for (; number < sixteenthPoints; number++) { - aVal = _mm512_load_si512(aPtr); - bVal = _mm512_load_si512(bPtr); + aVal = _mm512_load_si512(aPtr); + bVal = _mm512_load_si512(bPtr); - cVal = _mm512_or_si512(aVal, bVal); + cVal = _mm512_or_si512(aVal, bVal); - _mm512_store_si512(cPtr,cVal); // Store the results back into the C container + _mm512_store_si512(cPtr, cVal); // Store the results back into the C container - aPtr += 16; - bPtr += 16; - cPtr += 16; - } + aPtr += 16; + bPtr += 16; + cPtr += 16; + } - number = sixteenthPoints * 16; - for(;number < num_points; number++){ - cVector[number] = aVector[number] | bVector[number]; - } + number = sixteenthPoints * 16; + for (; number < num_points; number++) { + cVector[number] = aVector[number] | bVector[number]; + } } #endif /* LV_HAVE_AVX512F */ #ifdef LV_HAVE_AVX2 #include -static inline void -volk_32i_x2_or_32i_a_avx2(int32_t* cVector, const int32_t* aVector, - const int32_t* bVector, unsigned int num_points) +static inline void volk_32i_x2_or_32i_a_avx2(int32_t* cVector, + const int32_t* aVector, + const int32_t* bVector, + unsigned int num_points) { - unsigned int number = 0; - const unsigned int oneEightPoints = num_points / 8; + unsigned int number = 0; + const unsigned int oneEightPoints = num_points / 8; - int32_t* cPtr = cVector; - const int32_t* aPtr = aVector; - const int32_t* bPtr = bVector; + int32_t* cPtr = cVector; + const int32_t* aPtr = aVector; + const int32_t* bPtr = bVector; - __m256i aVal, bVal, cVal; - for(;number < oneEightPoints; number++){ + __m256i aVal, bVal, cVal; + for (; number < oneEightPoints; number++) { - aVal = _mm256_load_si256((__m256i*)aPtr); - bVal = _mm256_load_si256((__m256i*)bPtr); + aVal = _mm256_load_si256((__m256i*)aPtr); + bVal = _mm256_load_si256((__m256i*)bPtr); - cVal = _mm256_or_si256(aVal, bVal); + cVal = _mm256_or_si256(aVal, bVal); - _mm256_store_si256((__m256i*)cPtr,cVal); // Store the results back into the C container + _mm256_store_si256((__m256i*)cPtr, + cVal); // Store the results back into the C container - aPtr += 8; - bPtr += 8; - cPtr += 8; - } + aPtr += 8; + bPtr += 8; + cPtr += 8; + } - number = oneEightPoints * 8; - for(;number < num_points; number++){ - cVector[number] = aVector[number] | bVector[number]; - } + number = oneEightPoints * 8; + for (; number < num_points; number++) { + cVector[number] = aVector[number] | bVector[number]; + } } #endif /* LV_HAVE_AVX2 */ @@ -160,35 +163,36 @@ volk_32i_x2_or_32i_a_avx2(int32_t* cVector, const int32_t* aVector, #ifdef LV_HAVE_SSE #include -static inline void -volk_32i_x2_or_32i_a_sse(int32_t* cVector, const int32_t* aVector, - const int32_t* bVector, unsigned int num_points) +static inline 
void volk_32i_x2_or_32i_a_sse(int32_t* cVector, + const int32_t* aVector, + const int32_t* bVector, + unsigned int num_points) { - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; - float* cPtr = (float*)cVector; - const float* aPtr = (float*)aVector; - const float* bPtr = (float*)bVector; + float* cPtr = (float*)cVector; + const float* aPtr = (float*)aVector; + const float* bPtr = (float*)bVector; - __m128 aVal, bVal, cVal; - for(;number < quarterPoints; number++){ - aVal = _mm_load_ps(aPtr); - bVal = _mm_load_ps(bPtr); + __m128 aVal, bVal, cVal; + for (; number < quarterPoints; number++) { + aVal = _mm_load_ps(aPtr); + bVal = _mm_load_ps(bPtr); - cVal = _mm_or_ps(aVal, bVal); + cVal = _mm_or_ps(aVal, bVal); - _mm_store_ps(cPtr,cVal); // Store the results back into the C container + _mm_store_ps(cPtr, cVal); // Store the results back into the C container - aPtr += 4; - bPtr += 4; - cPtr += 4; - } + aPtr += 4; + bPtr += 4; + cPtr += 4; + } - number = quarterPoints * 4; - for(;number < num_points; number++){ - cVector[number] = aVector[number] | bVector[number]; - } + number = quarterPoints * 4; + for (; number < num_points; number++) { + cVector[number] = aVector[number] | bVector[number]; + } } #endif /* LV_HAVE_SSE */ @@ -196,63 +200,67 @@ volk_32i_x2_or_32i_a_sse(int32_t* cVector, const int32_t* aVector, #ifdef LV_HAVE_NEON #include -static inline void -volk_32i_x2_or_32i_neon(int32_t* cVector, const int32_t* aVector, - const int32_t* bVector, unsigned int num_points) +static inline void volk_32i_x2_or_32i_neon(int32_t* cVector, + const int32_t* aVector, + const int32_t* bVector, + unsigned int num_points) { - int32_t* cPtr = cVector; - const int32_t* aPtr = aVector; - const int32_t* bPtr= bVector; - unsigned int number = 0; - unsigned int quarter_points = num_points / 4; - - int32x4_t a_val, b_val, c_val; - - for(number = 0; number < quarter_points; number++){ - a_val = vld1q_s32(aPtr); - b_val = vld1q_s32(bPtr); - c_val = vorrq_s32(a_val, b_val); - vst1q_s32(cPtr, c_val); - aPtr += 4; - bPtr += 4; - cPtr += 4; - } - - for(number = quarter_points * 4; number < num_points; number++){ - *cPtr++ = (*aPtr++) | (*bPtr++); - } + int32_t* cPtr = cVector; + const int32_t* aPtr = aVector; + const int32_t* bPtr = bVector; + unsigned int number = 0; + unsigned int quarter_points = num_points / 4; + + int32x4_t a_val, b_val, c_val; + + for (number = 0; number < quarter_points; number++) { + a_val = vld1q_s32(aPtr); + b_val = vld1q_s32(bPtr); + c_val = vorrq_s32(a_val, b_val); + vst1q_s32(cPtr, c_val); + aPtr += 4; + bPtr += 4; + cPtr += 4; + } + + for (number = quarter_points * 4; number < num_points; number++) { + *cPtr++ = (*aPtr++) | (*bPtr++); + } } #endif /* LV_HAVE_NEON */ #ifdef LV_HAVE_GENERIC -static inline void -volk_32i_x2_or_32i_generic(int32_t* cVector, const int32_t* aVector, - const int32_t* bVector, unsigned int num_points) +static inline void volk_32i_x2_or_32i_generic(int32_t* cVector, + const int32_t* aVector, + const int32_t* bVector, + unsigned int num_points) { - int32_t* cPtr = cVector; - const int32_t* aPtr = aVector; - const int32_t* bPtr= bVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - *cPtr++ = (*aPtr++) | (*bPtr++); - } + int32_t* cPtr = cVector; + const int32_t* aPtr = aVector; + const int32_t* bPtr = bVector; + unsigned int number = 0; + + for (number = 0; number < num_points; number++) { + *cPtr++ = (*aPtr++) | 
(*bPtr++); + } } #endif /* LV_HAVE_GENERIC */ #ifdef LV_HAVE_ORC -extern void -volk_32i_x2_or_32i_a_orc_impl(int32_t* cVector, const int32_t* aVector, - const int32_t* bVector, unsigned int num_points); - -static inline void -volk_32i_x2_or_32i_u_orc(int32_t* cVector, const int32_t* aVector, - const int32_t* bVector, unsigned int num_points) +extern void volk_32i_x2_or_32i_a_orc_impl(int32_t* cVector, + const int32_t* aVector, + const int32_t* bVector, + unsigned int num_points); + +static inline void volk_32i_x2_or_32i_u_orc(int32_t* cVector, + const int32_t* aVector, + const int32_t* bVector, + unsigned int num_points) { - volk_32i_x2_or_32i_a_orc_impl(cVector, aVector, bVector, num_points); + volk_32i_x2_or_32i_a_orc_impl(cVector, aVector, bVector, num_points); } #endif /* LV_HAVE_ORC */ @@ -269,72 +277,75 @@ volk_32i_x2_or_32i_u_orc(int32_t* cVector, const int32_t* aVector, #ifdef LV_HAVE_AVX512F #include -static inline void -volk_32i_x2_or_32i_u_avx512f(int32_t* cVector, const int32_t* aVector, - const int32_t* bVector, unsigned int num_points) +static inline void volk_32i_x2_or_32i_u_avx512f(int32_t* cVector, + const int32_t* aVector, + const int32_t* bVector, + unsigned int num_points) { - unsigned int number = 0; - const unsigned int sixteenthPoints = num_points / 16; + unsigned int number = 0; + const unsigned int sixteenthPoints = num_points / 16; - int32_t* cPtr = (int32_t*)cVector; - const int32_t* aPtr = (int32_t*)aVector; - const int32_t* bPtr = (int32_t*)bVector; + int32_t* cPtr = (int32_t*)cVector; + const int32_t* aPtr = (int32_t*)aVector; + const int32_t* bPtr = (int32_t*)bVector; - __m512i aVal, bVal, cVal; - for(;number < sixteenthPoints; number++){ + __m512i aVal, bVal, cVal; + for (; number < sixteenthPoints; number++) { - aVal = _mm512_loadu_si512(aPtr); - bVal = _mm512_loadu_si512(bPtr); + aVal = _mm512_loadu_si512(aPtr); + bVal = _mm512_loadu_si512(bPtr); - cVal = _mm512_or_si512(aVal, bVal); + cVal = _mm512_or_si512(aVal, bVal); - _mm512_storeu_si512(cPtr,cVal); // Store the results back into the C container + _mm512_storeu_si512(cPtr, cVal); // Store the results back into the C container - aPtr += 16; - bPtr += 16; - cPtr += 16; - } + aPtr += 16; + bPtr += 16; + cPtr += 16; + } - number = sixteenthPoints * 16; - for(;number < num_points; number++){ - cVector[number] = aVector[number] | bVector[number]; - } + number = sixteenthPoints * 16; + for (; number < num_points; number++) { + cVector[number] = aVector[number] | bVector[number]; + } } #endif /* LV_HAVE_AVX512F */ #ifdef LV_HAVE_AVX2 #include -static inline void -volk_32i_x2_or_32i_u_avx2(int32_t* cVector, const int32_t* aVector, - const int32_t* bVector, unsigned int num_points) +static inline void volk_32i_x2_or_32i_u_avx2(int32_t* cVector, + const int32_t* aVector, + const int32_t* bVector, + unsigned int num_points) { - unsigned int number = 0; - const unsigned int oneEightPoints = num_points / 8; + unsigned int number = 0; + const unsigned int oneEightPoints = num_points / 8; - int32_t* cPtr = cVector; - const int32_t* aPtr = aVector; - const int32_t* bPtr = bVector; + int32_t* cPtr = cVector; + const int32_t* aPtr = aVector; + const int32_t* bPtr = bVector; - __m256i aVal, bVal, cVal; - for(;number < oneEightPoints; number++){ + __m256i aVal, bVal, cVal; + for (; number < oneEightPoints; number++) { - aVal = _mm256_loadu_si256((__m256i*)aPtr); - bVal = _mm256_loadu_si256((__m256i*)bPtr); + aVal = _mm256_loadu_si256((__m256i*)aPtr); + bVal = _mm256_loadu_si256((__m256i*)bPtr); - cVal = 
_mm256_or_si256(aVal, bVal); + cVal = _mm256_or_si256(aVal, bVal); - _mm256_storeu_si256((__m256i*)cPtr,cVal); // Store the results back into the C container + _mm256_storeu_si256((__m256i*)cPtr, + cVal); // Store the results back into the C container - aPtr += 8; - bPtr += 8; - cPtr += 8; - } + aPtr += 8; + bPtr += 8; + cPtr += 8; + } - number = oneEightPoints * 8; - for(;number < num_points; number++){ - cVector[number] = aVector[number] | bVector[number]; - } + number = oneEightPoints * 8; + for (; number < num_points; number++) { + cVector[number] = aVector[number] | bVector[number]; + } } #endif /* LV_HAVE_AVX2 */ diff --git a/kernels/volk/volk_32u_byteswap.h b/kernels/volk/volk_32u_byteswap.h index f5e6f11..185047c 100644 --- a/kernels/volk/volk_32u_byteswap.h +++ b/kernels/volk/volk_32u_byteswap.h @@ -71,38 +71,42 @@ #if LV_HAVE_AVX2 #include -static inline void volk_32u_byteswap_u_avx2(uint32_t* intsToSwap, unsigned int num_points){ +static inline void volk_32u_byteswap_u_avx2(uint32_t* intsToSwap, unsigned int num_points) +{ - unsigned int number; + unsigned int number; - const unsigned int nPerSet = 8; - const uint64_t nSets = num_points / nPerSet; + const unsigned int nPerSet = 8; + const uint64_t nSets = num_points / nPerSet; - uint32_t* inputPtr = intsToSwap; + uint32_t* inputPtr = intsToSwap; - const uint8_t shuffleVector[32] = { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12, 19, 18, 17, 16, 23, 22, 21, 20, 27, 26, 25, 24, 31, 30, 29, 28 }; + const uint8_t shuffleVector[32] = { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, + 8, 15, 14, 13, 12, 19, 18, 17, 16, 23, 22, + 21, 20, 27, 26, 25, 24, 31, 30, 29, 28 }; - const __m256i myShuffle = _mm256_loadu_si256((__m256i*) &shuffleVector); + const __m256i myShuffle = _mm256_loadu_si256((__m256i*)&shuffleVector); - for (number = 0 ;number < nSets; number++) { + for (number = 0; number < nSets; number++) { - // Load the 32t values, increment inputPtr later since we're doing it in-place. - const __m256i input = _mm256_loadu_si256((__m256i*)inputPtr); - const __m256i output = _mm256_shuffle_epi8(input,myShuffle); + // Load the 32t values, increment inputPtr later since we're doing it in-place. 
+ const __m256i input = _mm256_loadu_si256((__m256i*)inputPtr); + const __m256i output = _mm256_shuffle_epi8(input, myShuffle); - // Store the results - _mm256_storeu_si256((__m256i*)inputPtr, output); - inputPtr += nPerSet; - } - _mm256_zeroupper(); - - // Byteswap any remaining points: - for(number = nSets * nPerSet; number < num_points; number++){ - uint32_t outputVal = *inputPtr; - outputVal = (((outputVal >> 24) & 0xff) | ((outputVal >> 8) & 0x0000ff00) | ((outputVal << 8) & 0x00ff0000) | ((outputVal << 24) & 0xff000000)); - *inputPtr = outputVal; - inputPtr++; - } + // Store the results + _mm256_storeu_si256((__m256i*)inputPtr, output); + inputPtr += nPerSet; + } + _mm256_zeroupper(); + + // Byteswap any remaining points: + for (number = nSets * nPerSet; number < num_points; number++) { + uint32_t outputVal = *inputPtr; + outputVal = (((outputVal >> 24) & 0xff) | ((outputVal >> 8) & 0x0000ff00) | + ((outputVal << 8) & 0x00ff0000) | ((outputVal << 24) & 0xff000000)); + *inputPtr = outputVal; + inputPtr++; + } } #endif /* LV_HAVE_AVX2 */ @@ -110,42 +114,44 @@ static inline void volk_32u_byteswap_u_avx2(uint32_t* intsToSwap, unsigned int n #ifdef LV_HAVE_SSE2 #include -static inline void volk_32u_byteswap_u_sse2(uint32_t* intsToSwap, unsigned int num_points){ - unsigned int number = 0; - - uint32_t* inputPtr = intsToSwap; - __m128i input, byte1, byte2, byte3, byte4, output; - __m128i byte2mask = _mm_set1_epi32(0x00FF0000); - __m128i byte3mask = _mm_set1_epi32(0x0000FF00); - - const uint64_t quarterPoints = num_points / 4; - for(;number < quarterPoints; number++){ - // Load the 32t values, increment inputPtr later since we're doing it in-place. - input = _mm_loadu_si128((__m128i*)inputPtr); - // Do the four shifts - byte1 = _mm_slli_epi32(input, 24); - byte2 = _mm_slli_epi32(input, 8); - byte3 = _mm_srli_epi32(input, 8); - byte4 = _mm_srli_epi32(input, 24); - // Or bytes together - output = _mm_or_si128(byte1, byte4); - byte2 = _mm_and_si128(byte2, byte2mask); - output = _mm_or_si128(output, byte2); - byte3 = _mm_and_si128(byte3, byte3mask); - output = _mm_or_si128(output, byte3); - // Store the results - _mm_storeu_si128((__m128i*)inputPtr, output); - inputPtr += 4; - } - - // Byteswap any remaining points: - number = quarterPoints*4; - for(; number < num_points; number++){ - uint32_t outputVal = *inputPtr; - outputVal = (((outputVal >> 24) & 0xff) | ((outputVal >> 8) & 0x0000ff00) | ((outputVal << 8) & 0x00ff0000) | ((outputVal << 24) & 0xff000000)); - *inputPtr = outputVal; - inputPtr++; - } +static inline void volk_32u_byteswap_u_sse2(uint32_t* intsToSwap, unsigned int num_points) +{ + unsigned int number = 0; + + uint32_t* inputPtr = intsToSwap; + __m128i input, byte1, byte2, byte3, byte4, output; + __m128i byte2mask = _mm_set1_epi32(0x00FF0000); + __m128i byte3mask = _mm_set1_epi32(0x0000FF00); + + const uint64_t quarterPoints = num_points / 4; + for (; number < quarterPoints; number++) { + // Load the 32t values, increment inputPtr later since we're doing it in-place. 
+ input = _mm_loadu_si128((__m128i*)inputPtr); + // Do the four shifts + byte1 = _mm_slli_epi32(input, 24); + byte2 = _mm_slli_epi32(input, 8); + byte3 = _mm_srli_epi32(input, 8); + byte4 = _mm_srli_epi32(input, 24); + // Or bytes together + output = _mm_or_si128(byte1, byte4); + byte2 = _mm_and_si128(byte2, byte2mask); + output = _mm_or_si128(output, byte2); + byte3 = _mm_and_si128(byte3, byte3mask); + output = _mm_or_si128(output, byte3); + // Store the results + _mm_storeu_si128((__m128i*)inputPtr, output); + inputPtr += 4; + } + + // Byteswap any remaining points: + number = quarterPoints * 4; + for (; number < num_points; number++) { + uint32_t outputVal = *inputPtr; + outputVal = (((outputVal >> 24) & 0xff) | ((outputVal >> 8) & 0x0000ff00) | + ((outputVal << 8) & 0x00ff0000) | ((outputVal << 24) & 0xff000000)); + *inputPtr = outputVal; + inputPtr++; + } } #endif /* LV_HAVE_SSE2 */ @@ -153,100 +159,106 @@ static inline void volk_32u_byteswap_u_sse2(uint32_t* intsToSwap, unsigned int n #ifdef LV_HAVE_NEON #include -static inline void volk_32u_byteswap_neon(uint32_t* intsToSwap, unsigned int num_points){ - uint32_t* inputPtr = intsToSwap; - unsigned int number = 0; - unsigned int n8points = num_points / 8; - - uint8x8x4_t input_table; - uint8x8_t int_lookup01, int_lookup23, int_lookup45, int_lookup67; - uint8x8_t swapped_int01, swapped_int23, swapped_int45, swapped_int67; - - /* these magic numbers are used as byte-indices in the LUT. - they are pre-computed to save time. A simple C program - can calculate them; for example for lookup01: - uint8_t chars[8] = {24, 16, 8, 0, 25, 17, 9, 1}; - for(ii=0; ii < 8; ++ii) { - index += ((uint64_t)(*(chars+ii))) << (ii*8); +static inline void volk_32u_byteswap_neon(uint32_t* intsToSwap, unsigned int num_points) +{ + uint32_t* inputPtr = intsToSwap; + unsigned int number = 0; + unsigned int n8points = num_points / 8; + + uint8x8x4_t input_table; + uint8x8_t int_lookup01, int_lookup23, int_lookup45, int_lookup67; + uint8x8_t swapped_int01, swapped_int23, swapped_int45, swapped_int67; + + /* these magic numbers are used as byte-indices in the LUT. + they are pre-computed to save time. 
A simple C program + can calculate them; for example for lookup01: + uint8_t chars[8] = {24, 16, 8, 0, 25, 17, 9, 1}; + for(ii=0; ii < 8; ++ii) { + index += ((uint64_t)(*(chars+ii))) << (ii*8); + } + */ + int_lookup01 = vcreate_u8(74609667900706840); + int_lookup23 = vcreate_u8(219290013576860186); + int_lookup45 = vcreate_u8(363970359253013532); + int_lookup67 = vcreate_u8(508650704929166878); + + for (number = 0; number < n8points; ++number) { + input_table = vld4_u8((uint8_t*)inputPtr); + swapped_int01 = vtbl4_u8(input_table, int_lookup01); + swapped_int23 = vtbl4_u8(input_table, int_lookup23); + swapped_int45 = vtbl4_u8(input_table, int_lookup45); + swapped_int67 = vtbl4_u8(input_table, int_lookup67); + vst1_u8((uint8_t*)inputPtr, swapped_int01); + vst1_u8((uint8_t*)(inputPtr + 2), swapped_int23); + vst1_u8((uint8_t*)(inputPtr + 4), swapped_int45); + vst1_u8((uint8_t*)(inputPtr + 6), swapped_int67); + + inputPtr += 8; + } + + for (number = n8points * 8; number < num_points; ++number) { + uint32_t output = *inputPtr; + output = (((output >> 24) & 0xff) | ((output >> 8) & 0x0000ff00) | + ((output << 8) & 0x00ff0000) | ((output << 24) & 0xff000000)); + + *inputPtr = output; + inputPtr++; } - */ - int_lookup01 = vcreate_u8(74609667900706840); - int_lookup23 = vcreate_u8(219290013576860186); - int_lookup45 = vcreate_u8(363970359253013532); - int_lookup67 = vcreate_u8(508650704929166878); - - for(number = 0; number < n8points; ++number){ - input_table = vld4_u8((uint8_t*) inputPtr); - swapped_int01 = vtbl4_u8(input_table, int_lookup01); - swapped_int23 = vtbl4_u8(input_table, int_lookup23); - swapped_int45 = vtbl4_u8(input_table, int_lookup45); - swapped_int67 = vtbl4_u8(input_table, int_lookup67); - vst1_u8((uint8_t*) inputPtr, swapped_int01); - vst1_u8((uint8_t*) (inputPtr+2), swapped_int23); - vst1_u8((uint8_t*) (inputPtr+4), swapped_int45); - vst1_u8((uint8_t*) (inputPtr+6), swapped_int67); - - inputPtr += 8; - } - - for(number = n8points * 8; number < num_points; ++number){ - uint32_t output = *inputPtr; - output = (((output >> 24) & 0xff) | ((output >> 8) & 0x0000ff00) | ((output << 8) & 0x00ff0000) | ((output << 24) & 0xff000000)); - - *inputPtr = output; - inputPtr++; - } } #endif /* LV_HAVE_NEON */ #ifdef LV_HAVE_NEONV8 #include -static inline void volk_32u_byteswap_neonv8(uint32_t* intsToSwap, unsigned int num_points){ - uint32_t* inputPtr = (uint32_t*)intsToSwap; - const unsigned int n8points = num_points / 8; - uint8x16_t input; - uint8x16_t idx = { 3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12 }; - - unsigned int number = 0; - for(number = 0; number < n8points; ++number){ - __VOLK_PREFETCH(inputPtr+8); - input = vld1q_u8((uint8_t*) inputPtr); - input = vqtbl1q_u8(input, idx); - vst1q_u8((uint8_t*) inputPtr, input); - inputPtr += 4; - - input = vld1q_u8((uint8_t*) inputPtr); - input = vqtbl1q_u8(input, idx); - vst1q_u8((uint8_t*) inputPtr, input); - inputPtr += 4; - } - - for(number = n8points * 8; number < num_points; ++number){ - uint32_t output = *inputPtr; +static inline void volk_32u_byteswap_neonv8(uint32_t* intsToSwap, unsigned int num_points) +{ + uint32_t* inputPtr = (uint32_t*)intsToSwap; + const unsigned int n8points = num_points / 8; + uint8x16_t input; + uint8x16_t idx = { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 }; + + unsigned int number = 0; + for (number = 0; number < n8points; ++number) { + __VOLK_PREFETCH(inputPtr + 8); + input = vld1q_u8((uint8_t*)inputPtr); + input = vqtbl1q_u8(input, idx); + vst1q_u8((uint8_t*)inputPtr, input); + inputPtr += 4; + + 
input = vld1q_u8((uint8_t*)inputPtr); + input = vqtbl1q_u8(input, idx); + vst1q_u8((uint8_t*)inputPtr, input); + inputPtr += 4; + } - output = (((output >> 24) & 0xff) | ((output >> 8) & 0x0000ff00) | ((output << 8) & 0x00ff0000) | ((output << 24) & 0xff000000)); + for (number = n8points * 8; number < num_points; ++number) { + uint32_t output = *inputPtr; - *inputPtr++ = output; - } + output = (((output >> 24) & 0xff) | ((output >> 8) & 0x0000ff00) | + ((output << 8) & 0x00ff0000) | ((output << 24) & 0xff000000)); + *inputPtr++ = output; + } } #endif /* LV_HAVE_NEONV8 */ #ifdef LV_HAVE_GENERIC -static inline void volk_32u_byteswap_generic(uint32_t* intsToSwap, unsigned int num_points){ - uint32_t* inputPtr = intsToSwap; +static inline void volk_32u_byteswap_generic(uint32_t* intsToSwap, + unsigned int num_points) +{ + uint32_t* inputPtr = intsToSwap; - unsigned int point; - for(point = 0; point < num_points; point++){ - uint32_t output = *inputPtr; - output = (((output >> 24) & 0xff) | ((output >> 8) & 0x0000ff00) | ((output << 8) & 0x00ff0000) | ((output << 24) & 0xff000000)); + unsigned int point; + for (point = 0; point < num_points; point++) { + uint32_t output = *inputPtr; + output = (((output >> 24) & 0xff) | ((output >> 8) & 0x0000ff00) | + ((output << 8) & 0x00ff0000) | ((output << 24) & 0xff000000)); - *inputPtr = output; - inputPtr++; - } + *inputPtr = output; + inputPtr++; + } } #endif /* LV_HAVE_GENERIC */ @@ -261,38 +273,42 @@ static inline void volk_32u_byteswap_generic(uint32_t* intsToSwap, unsigned int #if LV_HAVE_AVX2 #include -static inline void volk_32u_byteswap_a_avx2(uint32_t* intsToSwap, unsigned int num_points){ +static inline void volk_32u_byteswap_a_avx2(uint32_t* intsToSwap, unsigned int num_points) +{ - unsigned int number; + unsigned int number; - const unsigned int nPerSet = 8; - const uint64_t nSets = num_points / nPerSet; + const unsigned int nPerSet = 8; + const uint64_t nSets = num_points / nPerSet; - uint32_t* inputPtr = intsToSwap; + uint32_t* inputPtr = intsToSwap; - const uint8_t shuffleVector[32] = { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12, 19, 18, 17, 16, 23, 22, 21, 20, 27, 26, 25, 24, 31, 30, 29, 28 }; + const uint8_t shuffleVector[32] = { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, + 8, 15, 14, 13, 12, 19, 18, 17, 16, 23, 22, + 21, 20, 27, 26, 25, 24, 31, 30, 29, 28 }; - const __m256i myShuffle = _mm256_loadu_si256((__m256i*) &shuffleVector); + const __m256i myShuffle = _mm256_loadu_si256((__m256i*)&shuffleVector); - for (number = 0 ;number < nSets; number++) { + for (number = 0; number < nSets; number++) { - // Load the 32t values, increment inputPtr later since we're doing it in-place. - const __m256i input = _mm256_load_si256((__m256i*)inputPtr); - const __m256i output = _mm256_shuffle_epi8(input,myShuffle); + // Load the 32t values, increment inputPtr later since we're doing it in-place. 
+ const __m256i input = _mm256_load_si256((__m256i*)inputPtr); + const __m256i output = _mm256_shuffle_epi8(input, myShuffle); - // Store the results - _mm256_store_si256((__m256i*)inputPtr, output); - inputPtr += nPerSet; - } - _mm256_zeroupper(); - - // Byteswap any remaining points: - for(number = nSets * nPerSet; number < num_points; number++){ - uint32_t outputVal = *inputPtr; - outputVal = (((outputVal >> 24) & 0xff) | ((outputVal >> 8) & 0x0000ff00) | ((outputVal << 8) & 0x00ff0000) | ((outputVal << 24) & 0xff000000)); - *inputPtr = outputVal; - inputPtr++; - } + // Store the results + _mm256_store_si256((__m256i*)inputPtr, output); + inputPtr += nPerSet; + } + _mm256_zeroupper(); + + // Byteswap any remaining points: + for (number = nSets * nPerSet; number < num_points; number++) { + uint32_t outputVal = *inputPtr; + outputVal = (((outputVal >> 24) & 0xff) | ((outputVal >> 8) & 0x0000ff00) | + ((outputVal << 8) & 0x00ff0000) | ((outputVal << 24) & 0xff000000)); + *inputPtr = outputVal; + inputPtr++; + } } #endif /* LV_HAVE_AVX2 */ @@ -301,63 +317,66 @@ static inline void volk_32u_byteswap_a_avx2(uint32_t* intsToSwap, unsigned int n #include -static inline void volk_32u_byteswap_a_sse2(uint32_t* intsToSwap, unsigned int num_points){ - unsigned int number = 0; - - uint32_t* inputPtr = intsToSwap; - __m128i input, byte1, byte2, byte3, byte4, output; - __m128i byte2mask = _mm_set1_epi32(0x00FF0000); - __m128i byte3mask = _mm_set1_epi32(0x0000FF00); - - const uint64_t quarterPoints = num_points / 4; - for(;number < quarterPoints; number++){ - // Load the 32t values, increment inputPtr later since we're doing it in-place. - input = _mm_load_si128((__m128i*)inputPtr); - // Do the four shifts - byte1 = _mm_slli_epi32(input, 24); - byte2 = _mm_slli_epi32(input, 8); - byte3 = _mm_srli_epi32(input, 8); - byte4 = _mm_srli_epi32(input, 24); - // Or bytes together - output = _mm_or_si128(byte1, byte4); - byte2 = _mm_and_si128(byte2, byte2mask); - output = _mm_or_si128(output, byte2); - byte3 = _mm_and_si128(byte3, byte3mask); - output = _mm_or_si128(output, byte3); - // Store the results - _mm_store_si128((__m128i*)inputPtr, output); - inputPtr += 4; - } - - // Byteswap any remaining points: - number = quarterPoints*4; - for(; number < num_points; number++){ - uint32_t outputVal = *inputPtr; - outputVal = (((outputVal >> 24) & 0xff) | ((outputVal >> 8) & 0x0000ff00) | ((outputVal << 8) & 0x00ff0000) | ((outputVal << 24) & 0xff000000)); - *inputPtr = outputVal; - inputPtr++; - } +static inline void volk_32u_byteswap_a_sse2(uint32_t* intsToSwap, unsigned int num_points) +{ + unsigned int number = 0; + + uint32_t* inputPtr = intsToSwap; + __m128i input, byte1, byte2, byte3, byte4, output; + __m128i byte2mask = _mm_set1_epi32(0x00FF0000); + __m128i byte3mask = _mm_set1_epi32(0x0000FF00); + + const uint64_t quarterPoints = num_points / 4; + for (; number < quarterPoints; number++) { + // Load the 32t values, increment inputPtr later since we're doing it in-place. 
+ input = _mm_load_si128((__m128i*)inputPtr); + // Do the four shifts + byte1 = _mm_slli_epi32(input, 24); + byte2 = _mm_slli_epi32(input, 8); + byte3 = _mm_srli_epi32(input, 8); + byte4 = _mm_srli_epi32(input, 24); + // Or bytes together + output = _mm_or_si128(byte1, byte4); + byte2 = _mm_and_si128(byte2, byte2mask); + output = _mm_or_si128(output, byte2); + byte3 = _mm_and_si128(byte3, byte3mask); + output = _mm_or_si128(output, byte3); + // Store the results + _mm_store_si128((__m128i*)inputPtr, output); + inputPtr += 4; + } + + // Byteswap any remaining points: + number = quarterPoints * 4; + for (; number < num_points; number++) { + uint32_t outputVal = *inputPtr; + outputVal = (((outputVal >> 24) & 0xff) | ((outputVal >> 8) & 0x0000ff00) | + ((outputVal << 8) & 0x00ff0000) | ((outputVal << 24) & 0xff000000)); + *inputPtr = outputVal; + inputPtr++; + } } #endif /* LV_HAVE_SSE2 */ #ifdef LV_HAVE_GENERIC -static inline void volk_32u_byteswap_a_generic(uint32_t* intsToSwap, unsigned int num_points){ - uint32_t* inputPtr = intsToSwap; +static inline void volk_32u_byteswap_a_generic(uint32_t* intsToSwap, + unsigned int num_points) +{ + uint32_t* inputPtr = intsToSwap; - unsigned int point; - for(point = 0; point < num_points; point++){ - uint32_t output = *inputPtr; - output = (((output >> 24) & 0xff) | ((output >> 8) & 0x0000ff00) | ((output << 8) & 0x00ff0000) | ((output << 24) & 0xff000000)); + unsigned int point; + for (point = 0; point < num_points; point++) { + uint32_t output = *inputPtr; + output = (((output >> 24) & 0xff) | ((output >> 8) & 0x0000ff00) | + ((output << 8) & 0x00ff0000) | ((output << 24) & 0xff000000)); - *inputPtr = output; - inputPtr++; - } + *inputPtr = output; + inputPtr++; + } } #endif /* LV_HAVE_GENERIC */ - - #endif /* INCLUDED_volk_32u_byteswap_a_H */ diff --git a/kernels/volk/volk_32u_byteswappuppet_32u.h b/kernels/volk/volk_32u_byteswappuppet_32u.h index c33a5fc..ca5ca17 100644 --- a/kernels/volk/volk_32u_byteswappuppet_32u.h +++ b/kernels/volk/volk_32u_byteswappuppet_32u.h @@ -1,70 +1,84 @@ #ifndef INCLUDED_volk_32u_byteswappuppet_32u_H #define INCLUDED_volk_32u_byteswappuppet_32u_H -#include #include #include +#include #ifdef LV_HAVE_GENERIC -static inline void volk_32u_byteswappuppet_32u_generic(uint32_t*output, uint32_t* intsToSwap, unsigned int num_points){ +static inline void volk_32u_byteswappuppet_32u_generic(uint32_t* output, + uint32_t* intsToSwap, + unsigned int num_points) +{ volk_32u_byteswap_generic((uint32_t*)intsToSwap, num_points); memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint32_t)); - } #endif #ifdef LV_HAVE_NEON -static inline void volk_32u_byteswappuppet_32u_neon(uint32_t*output, uint32_t* intsToSwap, unsigned int num_points){ +static inline void volk_32u_byteswappuppet_32u_neon(uint32_t* output, + uint32_t* intsToSwap, + unsigned int num_points) +{ volk_32u_byteswap_neon((uint32_t*)intsToSwap, num_points); memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint32_t)); - } #endif #ifdef LV_HAVE_NEONV8 -static inline void volk_32u_byteswappuppet_32u_neonv8(uint32_t*output, uint32_t* intsToSwap, unsigned int num_points){ +static inline void volk_32u_byteswappuppet_32u_neonv8(uint32_t* output, + uint32_t* intsToSwap, + unsigned int num_points) +{ volk_32u_byteswap_neonv8((uint32_t*)intsToSwap, num_points); memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint32_t)); - } #endif #ifdef LV_HAVE_SSE2 -static inline void volk_32u_byteswappuppet_32u_u_sse2(uint32_t *output, uint32_t* intsToSwap, 
unsigned int num_points){ +static inline void volk_32u_byteswappuppet_32u_u_sse2(uint32_t* output, + uint32_t* intsToSwap, + unsigned int num_points) +{ volk_32u_byteswap_u_sse2((uint32_t*)intsToSwap, num_points); memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint32_t)); - } #endif #ifdef LV_HAVE_SSE2 -static inline void volk_32u_byteswappuppet_32u_a_sse2(uint32_t* output, uint32_t* intsToSwap, unsigned int num_points){ +static inline void volk_32u_byteswappuppet_32u_a_sse2(uint32_t* output, + uint32_t* intsToSwap, + unsigned int num_points) +{ volk_32u_byteswap_a_sse2((uint32_t*)intsToSwap, num_points); memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint32_t)); - } #endif #ifdef LV_HAVE_AVX2 -static inline void volk_32u_byteswappuppet_32u_u_avx2(uint32_t* output, uint32_t* intsToSwap, unsigned int num_points){ +static inline void volk_32u_byteswappuppet_32u_u_avx2(uint32_t* output, + uint32_t* intsToSwap, + unsigned int num_points) +{ volk_32u_byteswap_u_avx2((uint32_t*)intsToSwap, num_points); memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint32_t)); - } #endif #ifdef LV_HAVE_AVX2 -static inline void volk_32u_byteswappuppet_32u_a_avx2(uint32_t* output, uint32_t* intsToSwap, unsigned int num_points){ +static inline void volk_32u_byteswappuppet_32u_a_avx2(uint32_t* output, + uint32_t* intsToSwap, + unsigned int num_points) +{ volk_32u_byteswap_a_avx2((uint32_t*)intsToSwap, num_points); memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint32_t)); - } #endif diff --git a/kernels/volk/volk_32u_popcnt.h b/kernels/volk/volk_32u_popcnt.h index 7aa4d43..f6f0c10 100644 --- a/kernels/volk/volk_32u_popcnt.h +++ b/kernels/volk/volk_32u_popcnt.h @@ -56,24 +56,23 @@ #ifndef INCLUDED_VOLK_32u_POPCNT_A16_H #define INCLUDED_VOLK_32u_POPCNT_A16_H -#include #include +#include #ifdef LV_HAVE_GENERIC -static inline void -volk_32u_popcnt_generic(uint32_t* ret, const uint32_t value) +static inline void volk_32u_popcnt_generic(uint32_t* ret, const uint32_t value) { - // This is faster than a lookup table - uint32_t retVal = value; + // This is faster than a lookup table + uint32_t retVal = value; - retVal = (retVal & 0x55555555) + (retVal >> 1 & 0x55555555); - retVal = (retVal & 0x33333333) + (retVal >> 2 & 0x33333333); - retVal = (retVal + (retVal >> 4)) & 0x0F0F0F0F; - retVal = (retVal + (retVal >> 8)); - retVal = (retVal + (retVal >> 16)) & 0x0000003F; + retVal = (retVal & 0x55555555) + (retVal >> 1 & 0x55555555); + retVal = (retVal & 0x33333333) + (retVal >> 2 & 0x33333333); + retVal = (retVal + (retVal >> 4)) & 0x0F0F0F0F; + retVal = (retVal + (retVal >> 8)); + retVal = (retVal + (retVal >> 16)) & 0x0000003F; - *ret = retVal; + *ret = retVal; } #endif /*LV_HAVE_GENERIC*/ @@ -83,10 +82,9 @@ volk_32u_popcnt_generic(uint32_t* ret, const uint32_t value) #include -static inline void -volk_32u_popcnt_a_sse4_2(uint32_t* ret, const uint32_t value) +static inline void volk_32u_popcnt_a_sse4_2(uint32_t* ret, const uint32_t value) { - *ret = _mm_popcnt_u32(value); + *ret = _mm_popcnt_u32(value); } #endif /*LV_HAVE_SSE4_2*/ diff --git a/kernels/volk/volk_32u_popcntpuppet_32u.h b/kernels/volk/volk_32u_popcntpuppet_32u.h index d5edd35..c0389cc 100644 --- a/kernels/volk/volk_32u_popcntpuppet_32u.h +++ b/kernels/volk/volk_32u_popcntpuppet_32u.h @@ -27,19 +27,25 @@ #include #ifdef LV_HAVE_GENERIC -static inline void volk_32u_popcntpuppet_32u_generic(uint32_t* outVector, const uint32_t* inVector, unsigned int num_points){ +static inline void 
volk_32u_popcntpuppet_32u_generic(uint32_t* outVector, + const uint32_t* inVector, + unsigned int num_points) +{ unsigned int ii; - for(ii=0; ii < num_points; ++ii) { - volk_32u_popcnt_generic(outVector+ii, *(inVector+ii) ); + for (ii = 0; ii < num_points; ++ii) { + volk_32u_popcnt_generic(outVector + ii, *(inVector + ii)); } } #endif /* LV_HAVE_GENERIC */ #ifdef LV_HAVE_SSE4_2 -static inline void volk_32u_popcntpuppet_32u_a_sse4_2(uint32_t* outVector, const uint32_t* inVector, unsigned int num_points){ +static inline void volk_32u_popcntpuppet_32u_a_sse4_2(uint32_t* outVector, + const uint32_t* inVector, + unsigned int num_points) +{ unsigned int ii; - for(ii=0; ii < num_points; ++ii) { - volk_32u_popcnt_a_sse4_2(outVector+ii, *(inVector+ii) ); + for (ii = 0; ii < num_points; ++ii) { + volk_32u_popcnt_a_sse4_2(outVector + ii, *(inVector + ii)); } } #endif /* LV_HAVE_SSE4_2 */ diff --git a/kernels/volk/volk_32u_reverse_32u.h b/kernels/volk/volk_32u_reverse_32u.h index b670b13..aff0a9e 100644 --- a/kernels/volk/volk_32u_reverse_32u.h +++ b/kernels/volk/volk_32u_reverse_32u.h @@ -24,7 +24,8 @@ * \b bit reversal of the input 32 bit word * Dispatcher Prototype - * \code volk_32u_reverse_32u(uint32_t *outputVector, uint32_t *inputVector; unsigned int num_points); + * \code volk_32u_reverse_32u(uint32_t *outputVector, uint32_t *inputVector; unsigned int + num_points); * \endcode * * \b Inputs @@ -32,338 +33,344 @@ * \li num_points The number of data points. * * \b Outputs - * \li outputVector: The vector where the results will be stored, which is the bit-reversed input + * \li outputVector: The vector where the results will be stored, which is the + bit-reversed input * * \endcode */ #ifndef INCLUDED_VOLK_32u_REVERSE_32u_U_H struct dword_split { - int b00: 1; - int b01: 1; - int b02: 1; - int b03: 1; - int b04: 1; - int b05: 1; - int b06: 1; - int b07: 1; - int b08: 1; - int b09: 1; - int b10: 1; - int b11: 1; - int b12: 1; - int b13: 1; - int b14: 1; - int b15: 1; - int b16: 1; - int b17: 1; - int b18: 1; - int b19: 1; - int b20: 1; - int b21: 1; - int b22: 1; - int b23: 1; - int b24: 1; - int b25: 1; - int b26: 1; - int b27: 1; - int b28: 1; - int b29: 1; - int b30: 1; - int b31: 1; + int b00 : 1; + int b01 : 1; + int b02 : 1; + int b03 : 1; + int b04 : 1; + int b05 : 1; + int b06 : 1; + int b07 : 1; + int b08 : 1; + int b09 : 1; + int b10 : 1; + int b11 : 1; + int b12 : 1; + int b13 : 1; + int b14 : 1; + int b15 : 1; + int b16 : 1; + int b17 : 1; + int b18 : 1; + int b19 : 1; + int b20 : 1; + int b21 : 1; + int b22 : 1; + int b23 : 1; + int b24 : 1; + int b25 : 1; + int b26 : 1; + int b27 : 1; + int b28 : 1; + int b29 : 1; + int b30 : 1; + int b31 : 1; }; struct char_split { - uint8_t b00: 1; - uint8_t b01: 1; - uint8_t b02: 1; - uint8_t b03: 1; - uint8_t b04: 1; - uint8_t b05: 1; - uint8_t b06: 1; - uint8_t b07: 1; + uint8_t b00 : 1; + uint8_t b01 : 1; + uint8_t b02 : 1; + uint8_t b03 : 1; + uint8_t b04 : 1; + uint8_t b05 : 1; + uint8_t b06 : 1; + uint8_t b07 : 1; }; -//Idea from "Bit Twiddling Hacks", which dedicates this method to public domain -//http://graphics.stanford.edu/~seander/bithacks.html#BitReverseTable +// Idea from "Bit Twiddling Hacks", which dedicates this method to public domain +// http://graphics.stanford.edu/~seander/bithacks.html#BitReverseTable static const unsigned char BitReverseTable256[] = { - 0x00, 0x80, 0x40, 0xC0, 0x20, 0xA0, 0x60, 0xE0, 0x10, 0x90, 0x50, 0xD0, 0x30, - 0xB0, 0x70, 0xF0, 0x08, 0x88, 0x48, 0xC8, 0x28, 0xA8, 0x68, 0xE8, 0x18, 0x98, - 0x58, 0xD8, 
0x38, 0xB8, 0x78, 0xF8, 0x04, 0x84, 0x44, 0xC4, 0x24, 0xA4, 0x64, - 0xE4, 0x14, 0x94, 0x54, 0xD4, 0x34, 0xB4, 0x74, 0xF4, 0x0C, 0x8C, 0x4C, 0xCC, - 0x2C, 0xAC, 0x6C, 0xEC, 0x1C, 0x9C, 0x5C, 0xDC, 0x3C, 0xBC, 0x7C, 0xFC, 0x02, - 0x82, 0x42, 0xC2, 0x22, 0xA2, 0x62, 0xE2, 0x12, 0x92, 0x52, 0xD2, 0x32, 0xB2, - 0x72, 0xF2, 0x0A, 0x8A, 0x4A, 0xCA, 0x2A, 0xAA, 0x6A, 0xEA, 0x1A, 0x9A, 0x5A, - 0xDA, 0x3A, 0xBA, 0x7A, 0xFA, 0x06, 0x86, 0x46, 0xC6, 0x26, 0xA6, 0x66, 0xE6, - 0x16, 0x96, 0x56, 0xD6, 0x36, 0xB6, 0x76, 0xF6, 0x0E, 0x8E, 0x4E, 0xCE, 0x2E, - 0xAE, 0x6E, 0xEE, 0x1E, 0x9E, 0x5E, 0xDE, 0x3E, 0xBE, 0x7E, 0xFE, 0x01, 0x81, - 0x41, 0xC1, 0x21, 0xA1, 0x61, 0xE1, 0x11, 0x91, 0x51, 0xD1, 0x31, 0xB1, 0x71, - 0xF1, 0x09, 0x89, 0x49, 0xC9, 0x29, 0xA9, 0x69, 0xE9, 0x19, 0x99, 0x59, 0xD9, - 0x39, 0xB9, 0x79, 0xF9, 0x05, 0x85, 0x45, 0xC5, 0x25, 0xA5, 0x65, 0xE5, 0x15, - 0x95, 0x55, 0xD5, 0x35, 0xB5, 0x75, 0xF5, 0x0D, 0x8D, 0x4D, 0xCD, 0x2D, 0xAD, - 0x6D, 0xED, 0x1D, 0x9D, 0x5D, 0xDD, 0x3D, 0xBD, 0x7D, 0xFD, 0x03, 0x83, 0x43, - 0xC3, 0x23, 0xA3, 0x63, 0xE3, 0x13, 0x93, 0x53, 0xD3, 0x33, 0xB3, 0x73, 0xF3, - 0x0B, 0x8B, 0x4B, 0xCB, 0x2B, 0xAB, 0x6B, 0xEB, 0x1B, 0x9B, 0x5B, 0xDB, 0x3B, - 0xBB, 0x7B, 0xFB, 0x07, 0x87, 0x47, 0xC7, 0x27, 0xA7, 0x67, 0xE7, 0x17, 0x97, - 0x57, 0xD7, 0x37, 0xB7, 0x77, 0xF7, 0x0F, 0x8F, 0x4F, 0xCF, 0x2F, 0xAF, 0x6F, - 0xEF, 0x1F, 0x9F, 0x5F, 0xDF, 0x3F, 0xBF, 0x7F, 0xFF + 0x00, 0x80, 0x40, 0xC0, 0x20, 0xA0, 0x60, 0xE0, 0x10, 0x90, 0x50, 0xD0, 0x30, 0xB0, + 0x70, 0xF0, 0x08, 0x88, 0x48, 0xC8, 0x28, 0xA8, 0x68, 0xE8, 0x18, 0x98, 0x58, 0xD8, + 0x38, 0xB8, 0x78, 0xF8, 0x04, 0x84, 0x44, 0xC4, 0x24, 0xA4, 0x64, 0xE4, 0x14, 0x94, + 0x54, 0xD4, 0x34, 0xB4, 0x74, 0xF4, 0x0C, 0x8C, 0x4C, 0xCC, 0x2C, 0xAC, 0x6C, 0xEC, + 0x1C, 0x9C, 0x5C, 0xDC, 0x3C, 0xBC, 0x7C, 0xFC, 0x02, 0x82, 0x42, 0xC2, 0x22, 0xA2, + 0x62, 0xE2, 0x12, 0x92, 0x52, 0xD2, 0x32, 0xB2, 0x72, 0xF2, 0x0A, 0x8A, 0x4A, 0xCA, + 0x2A, 0xAA, 0x6A, 0xEA, 0x1A, 0x9A, 0x5A, 0xDA, 0x3A, 0xBA, 0x7A, 0xFA, 0x06, 0x86, + 0x46, 0xC6, 0x26, 0xA6, 0x66, 0xE6, 0x16, 0x96, 0x56, 0xD6, 0x36, 0xB6, 0x76, 0xF6, + 0x0E, 0x8E, 0x4E, 0xCE, 0x2E, 0xAE, 0x6E, 0xEE, 0x1E, 0x9E, 0x5E, 0xDE, 0x3E, 0xBE, + 0x7E, 0xFE, 0x01, 0x81, 0x41, 0xC1, 0x21, 0xA1, 0x61, 0xE1, 0x11, 0x91, 0x51, 0xD1, + 0x31, 0xB1, 0x71, 0xF1, 0x09, 0x89, 0x49, 0xC9, 0x29, 0xA9, 0x69, 0xE9, 0x19, 0x99, + 0x59, 0xD9, 0x39, 0xB9, 0x79, 0xF9, 0x05, 0x85, 0x45, 0xC5, 0x25, 0xA5, 0x65, 0xE5, + 0x15, 0x95, 0x55, 0xD5, 0x35, 0xB5, 0x75, 0xF5, 0x0D, 0x8D, 0x4D, 0xCD, 0x2D, 0xAD, + 0x6D, 0xED, 0x1D, 0x9D, 0x5D, 0xDD, 0x3D, 0xBD, 0x7D, 0xFD, 0x03, 0x83, 0x43, 0xC3, + 0x23, 0xA3, 0x63, 0xE3, 0x13, 0x93, 0x53, 0xD3, 0x33, 0xB3, 0x73, 0xF3, 0x0B, 0x8B, + 0x4B, 0xCB, 0x2B, 0xAB, 0x6B, 0xEB, 0x1B, 0x9B, 0x5B, 0xDB, 0x3B, 0xBB, 0x7B, 0xFB, + 0x07, 0x87, 0x47, 0xC7, 0x27, 0xA7, 0x67, 0xE7, 0x17, 0x97, 0x57, 0xD7, 0x37, 0xB7, + 0x77, 0xF7, 0x0F, 0x8F, 0x4F, 0xCF, 0x2F, 0xAF, 0x6F, 0xEF, 0x1F, 0x9F, 0x5F, 0xDF, + 0x3F, 0xBF, 0x7F, 0xFF }; #ifdef LV_HAVE_GENERIC -static inline void volk_32u_reverse_32u_dword_shuffle(uint32_t* out, const uint32_t* in, - unsigned int num_points) +static inline void volk_32u_reverse_32u_dword_shuffle(uint32_t* out, + const uint32_t* in, + unsigned int num_points) { - const struct dword_split *in_ptr = (const struct dword_split*)in; - struct dword_split * out_ptr = (struct dword_split*)out; - unsigned int number = 0; - for(; number < num_points; ++number){ - out_ptr->b00 = in_ptr->b31; - out_ptr->b01 = in_ptr->b30; - out_ptr->b02 = in_ptr->b29; - out_ptr->b03 = in_ptr->b28; - 
out_ptr->b04 = in_ptr->b27; - out_ptr->b05 = in_ptr->b26; - out_ptr->b06 = in_ptr->b25; - out_ptr->b07 = in_ptr->b24; - out_ptr->b08 = in_ptr->b23; - out_ptr->b09 = in_ptr->b22; - out_ptr->b10 = in_ptr->b21; - out_ptr->b11 = in_ptr->b20; - out_ptr->b12 = in_ptr->b19; - out_ptr->b13 = in_ptr->b18; - out_ptr->b14 = in_ptr->b17; - out_ptr->b15 = in_ptr->b16; - out_ptr->b16 = in_ptr->b15; - out_ptr->b17 = in_ptr->b14; - out_ptr->b18 = in_ptr->b13; - out_ptr->b19 = in_ptr->b12; - out_ptr->b20 = in_ptr->b11; - out_ptr->b21 = in_ptr->b10; - out_ptr->b22 = in_ptr->b09; - out_ptr->b23 = in_ptr->b08; - out_ptr->b24 = in_ptr->b07; - out_ptr->b25 = in_ptr->b06; - out_ptr->b26 = in_ptr->b05; - out_ptr->b27 = in_ptr->b04; - out_ptr->b28 = in_ptr->b03; - out_ptr->b29 = in_ptr->b02; - out_ptr->b30 = in_ptr->b01; - out_ptr->b31 = in_ptr->b00; - ++in_ptr; - ++out_ptr; - } + const struct dword_split* in_ptr = (const struct dword_split*)in; + struct dword_split* out_ptr = (struct dword_split*)out; + unsigned int number = 0; + for (; number < num_points; ++number) { + out_ptr->b00 = in_ptr->b31; + out_ptr->b01 = in_ptr->b30; + out_ptr->b02 = in_ptr->b29; + out_ptr->b03 = in_ptr->b28; + out_ptr->b04 = in_ptr->b27; + out_ptr->b05 = in_ptr->b26; + out_ptr->b06 = in_ptr->b25; + out_ptr->b07 = in_ptr->b24; + out_ptr->b08 = in_ptr->b23; + out_ptr->b09 = in_ptr->b22; + out_ptr->b10 = in_ptr->b21; + out_ptr->b11 = in_ptr->b20; + out_ptr->b12 = in_ptr->b19; + out_ptr->b13 = in_ptr->b18; + out_ptr->b14 = in_ptr->b17; + out_ptr->b15 = in_ptr->b16; + out_ptr->b16 = in_ptr->b15; + out_ptr->b17 = in_ptr->b14; + out_ptr->b18 = in_ptr->b13; + out_ptr->b19 = in_ptr->b12; + out_ptr->b20 = in_ptr->b11; + out_ptr->b21 = in_ptr->b10; + out_ptr->b22 = in_ptr->b09; + out_ptr->b23 = in_ptr->b08; + out_ptr->b24 = in_ptr->b07; + out_ptr->b25 = in_ptr->b06; + out_ptr->b26 = in_ptr->b05; + out_ptr->b27 = in_ptr->b04; + out_ptr->b28 = in_ptr->b03; + out_ptr->b29 = in_ptr->b02; + out_ptr->b30 = in_ptr->b01; + out_ptr->b31 = in_ptr->b00; + ++in_ptr; + ++out_ptr; + } } #endif /* LV_HAVE_GENERIC */ #ifdef LV_HAVE_GENERIC -static inline void volk_32u_reverse_32u_byte_shuffle(uint32_t* out, const uint32_t* in, - unsigned int num_points) +static inline void volk_32u_reverse_32u_byte_shuffle(uint32_t* out, + const uint32_t* in, + unsigned int num_points) { - const uint32_t *in_ptr = in; - uint32_t *out_ptr = out; - unsigned int number = 0; - for(; number < num_points; ++number){ - const struct char_split *in8 = (const struct char_split*)in_ptr; - struct char_split *out8 = (struct char_split*)out_ptr; + const uint32_t* in_ptr = in; + uint32_t* out_ptr = out; + unsigned int number = 0; + for (; number < num_points; ++number) { + const struct char_split* in8 = (const struct char_split*)in_ptr; + struct char_split* out8 = (struct char_split*)out_ptr; - out8[3].b00 = in8[0].b07; - out8[3].b01 = in8[0].b06; - out8[3].b02 = in8[0].b05; - out8[3].b03 = in8[0].b04; - out8[3].b04 = in8[0].b03; - out8[3].b05 = in8[0].b02; - out8[3].b06 = in8[0].b01; - out8[3].b07 = in8[0].b00; + out8[3].b00 = in8[0].b07; + out8[3].b01 = in8[0].b06; + out8[3].b02 = in8[0].b05; + out8[3].b03 = in8[0].b04; + out8[3].b04 = in8[0].b03; + out8[3].b05 = in8[0].b02; + out8[3].b06 = in8[0].b01; + out8[3].b07 = in8[0].b00; - out8[2].b00 = in8[1].b07; - out8[2].b01 = in8[1].b06; - out8[2].b02 = in8[1].b05; - out8[2].b03 = in8[1].b04; - out8[2].b04 = in8[1].b03; - out8[2].b05 = in8[1].b02; - out8[2].b06 = in8[1].b01; - out8[2].b07 = in8[1].b00; + out8[2].b00 = in8[1].b07; + out8[2].b01 
= in8[1].b06; + out8[2].b02 = in8[1].b05; + out8[2].b03 = in8[1].b04; + out8[2].b04 = in8[1].b03; + out8[2].b05 = in8[1].b02; + out8[2].b06 = in8[1].b01; + out8[2].b07 = in8[1].b00; - out8[1].b00 = in8[2].b07; - out8[1].b01 = in8[2].b06; - out8[1].b02 = in8[2].b05; - out8[1].b03 = in8[2].b04; - out8[1].b04 = in8[2].b03; - out8[1].b05 = in8[2].b02; - out8[1].b06 = in8[2].b01; - out8[1].b07 = in8[2].b00; + out8[1].b00 = in8[2].b07; + out8[1].b01 = in8[2].b06; + out8[1].b02 = in8[2].b05; + out8[1].b03 = in8[2].b04; + out8[1].b04 = in8[2].b03; + out8[1].b05 = in8[2].b02; + out8[1].b06 = in8[2].b01; + out8[1].b07 = in8[2].b00; - out8[0].b00 = in8[3].b07; - out8[0].b01 = in8[3].b06; - out8[0].b02 = in8[3].b05; - out8[0].b03 = in8[3].b04; - out8[0].b04 = in8[3].b03; - out8[0].b05 = in8[3].b02; - out8[0].b06 = in8[3].b01; - out8[0].b07 = in8[3].b00; - ++in_ptr; - ++out_ptr; - } + out8[0].b00 = in8[3].b07; + out8[0].b01 = in8[3].b06; + out8[0].b02 = in8[3].b05; + out8[0].b03 = in8[3].b04; + out8[0].b04 = in8[3].b03; + out8[0].b05 = in8[3].b02; + out8[0].b06 = in8[3].b01; + out8[0].b07 = in8[3].b00; + ++in_ptr; + ++out_ptr; + } } #endif /* LV_HAVE_GENERIC */ -//Idea from "Bit Twiddling Hacks", which dedicates this method to public domain -//http://graphics.stanford.edu/~seander/bithacks.html#BitReverseTable +// Idea from "Bit Twiddling Hacks", which dedicates this method to public domain +// http://graphics.stanford.edu/~seander/bithacks.html#BitReverseTable #ifdef LV_HAVE_GENERIC -static inline void volk_32u_reverse_32u_lut(uint32_t* out, const uint32_t* in, - unsigned int num_points) +static inline void +volk_32u_reverse_32u_lut(uint32_t* out, const uint32_t* in, unsigned int num_points) { - const uint32_t *in_ptr = in; - uint32_t *out_ptr = out; - unsigned int number = 0; - for(; number < num_points; ++number){ - *out_ptr = - (BitReverseTable256[*in_ptr & 0xff] << 24) | - (BitReverseTable256[(*in_ptr >> 8) & 0xff] << 16) | - (BitReverseTable256[(*in_ptr >> 16) & 0xff] << 8) | - (BitReverseTable256[(*in_ptr >> 24) & 0xff]); - ++in_ptr; - ++out_ptr; - } + const uint32_t* in_ptr = in; + uint32_t* out_ptr = out; + unsigned int number = 0; + for (; number < num_points; ++number) { + *out_ptr = (BitReverseTable256[*in_ptr & 0xff] << 24) | + (BitReverseTable256[(*in_ptr >> 8) & 0xff] << 16) | + (BitReverseTable256[(*in_ptr >> 16) & 0xff] << 8) | + (BitReverseTable256[(*in_ptr >> 24) & 0xff]); + ++in_ptr; + ++out_ptr; + } } #endif /* LV_HAVE_GENERIC */ -//Single-Byte code from "Bit Twiddling Hacks", which dedicates this method to public domain -//http://graphics.stanford.edu/~seander/bithacks.html#ReverseByteWith64Bits +// Single-Byte code from "Bit Twiddling Hacks", which dedicates this method to public +// domain http://graphics.stanford.edu/~seander/bithacks.html#ReverseByteWith64Bits #ifdef LV_HAVE_GENERIC -static inline void volk_32u_reverse_32u_2001magic(uint32_t* out, const uint32_t* in, - unsigned int num_points) +static inline void +volk_32u_reverse_32u_2001magic(uint32_t* out, const uint32_t* in, unsigned int num_points) { - const uint32_t *in_ptr = in; - uint32_t *out_ptr = out; - const uint8_t *in8; - uint8_t *out8; - unsigned int number = 0; - for(; number < num_points; ++number){ - in8 = (const uint8_t*)in_ptr; - out8 = (uint8_t*)out_ptr; - out8[3] = ((in8[0] * 0x80200802ULL) & 0x0884422110ULL) * 0x0101010101ULL >> 32; - out8[2] = ((in8[1] * 0x80200802ULL) & 0x0884422110ULL) * 0x0101010101ULL >> 32; - out8[1] = ((in8[2] * 0x80200802ULL) & 0x0884422110ULL) * 0x0101010101ULL >> 32; - out8[0] 
= ((in8[3] * 0x80200802ULL) & 0x0884422110ULL) * 0x0101010101ULL >> 32; - ++in_ptr; - ++out_ptr; - } + const uint32_t* in_ptr = in; + uint32_t* out_ptr = out; + const uint8_t* in8; + uint8_t* out8; + unsigned int number = 0; + for (; number < num_points; ++number) { + in8 = (const uint8_t*)in_ptr; + out8 = (uint8_t*)out_ptr; + out8[3] = ((in8[0] * 0x80200802ULL) & 0x0884422110ULL) * 0x0101010101ULL >> 32; + out8[2] = ((in8[1] * 0x80200802ULL) & 0x0884422110ULL) * 0x0101010101ULL >> 32; + out8[1] = ((in8[2] * 0x80200802ULL) & 0x0884422110ULL) * 0x0101010101ULL >> 32; + out8[0] = ((in8[3] * 0x80200802ULL) & 0x0884422110ULL) * 0x0101010101ULL >> 32; + ++in_ptr; + ++out_ptr; + } } #endif /* LV_HAVE_GENERIC */ #ifdef LV_HAVE_GENERIC // Current gr-pager implementation -static inline void volk_32u_reverse_32u_1972magic(uint32_t* out, const uint32_t* in, - unsigned int num_points) +static inline void +volk_32u_reverse_32u_1972magic(uint32_t* out, const uint32_t* in, unsigned int num_points) { - const uint32_t *in_ptr = in; - uint32_t *out_ptr = out; - const uint8_t *in8; - uint8_t *out8; - unsigned int number = 0; - for(; number < num_points; ++number){ - in8 = (const uint8_t*)in_ptr; - out8 = (uint8_t*)out_ptr; - out8[3] = (in8[0] * 0x0202020202ULL & 0x010884422010ULL) % 1023; - out8[2] = (in8[1] * 0x0202020202ULL & 0x010884422010ULL) % 1023; - out8[1] = (in8[2] * 0x0202020202ULL & 0x010884422010ULL) % 1023; - out8[0] = (in8[3] * 0x0202020202ULL & 0x010884422010ULL) % 1023; - ++in_ptr; - ++out_ptr; - } + const uint32_t* in_ptr = in; + uint32_t* out_ptr = out; + const uint8_t* in8; + uint8_t* out8; + unsigned int number = 0; + for (; number < num_points; ++number) { + in8 = (const uint8_t*)in_ptr; + out8 = (uint8_t*)out_ptr; + out8[3] = (in8[0] * 0x0202020202ULL & 0x010884422010ULL) % 1023; + out8[2] = (in8[1] * 0x0202020202ULL & 0x010884422010ULL) % 1023; + out8[1] = (in8[2] * 0x0202020202ULL & 0x010884422010ULL) % 1023; + out8[0] = (in8[3] * 0x0202020202ULL & 0x010884422010ULL) % 1023; + ++in_ptr; + ++out_ptr; + } } #endif /* LV_HAVE_GENERIC */ -//After lengthy thought and quite a bit of whiteboarding: +// After lengthy thought and quite a bit of whiteboarding: #ifdef LV_HAVE_GENERIC -static inline void volk_32u_reverse_32u_bintree_permute_top_down(uint32_t* out, const uint32_t* in, - unsigned int num_points) +static inline void volk_32u_reverse_32u_bintree_permute_top_down(uint32_t* out, + const uint32_t* in, + unsigned int num_points) { - const uint32_t *in_ptr = in; - uint32_t *out_ptr = out; - unsigned int number = 0; - for(; number < num_points; ++number){ - uint32_t tmp = *in_ptr; - /* permute uint16: - The idea is to simply shift the lower 16 bit up, and the upper 16 bit down. - */ - tmp = ( tmp << 16 ) | ( tmp >> 16 ); - /* permute bytes: - shift up by 1 B first, then only consider even bytes, and OR with the unshifted even bytes - */ - tmp = ((tmp & (0xFF | 0xFF << 16)) << 8) | ((tmp >> 8) & (0xFF | 0xFF << 16)); - /* permute 4bit tuples: - Same idea, but the "consideration" mask expression becomes unwieldy - */ - tmp = ((tmp & (0xF | 0xF << 8 | 0xF << 16 | 0xF << 24)) << 4) | ((tmp >> 4) & (0xF | 0xF << 8 | 0xF << 16 | 0xF << 24)); - /* permute 2bit tuples: - Here, we collapsed the "consideration" mask to a simple hexmask: 0b0011 = - 3; we need those every 4b, which coincides with a hex digit! - */ - tmp = ((tmp & (0x33333333)) << 2) | ((tmp >> 2) & (0x33333333)); - /* permute odd/even: - 0x01 = 0x1; we need these every 2b, which works out: 0x01 | (0x01 << 2) = 0x05! 
- */ - tmp = ((tmp & (0x55555555)) << 1) | ((tmp >> 1) & (0x55555555)); + const uint32_t* in_ptr = in; + uint32_t* out_ptr = out; + unsigned int number = 0; + for (; number < num_points; ++number) { + uint32_t tmp = *in_ptr; + /* permute uint16: + The idea is to simply shift the lower 16 bit up, and the upper 16 bit down. + */ + tmp = (tmp << 16) | (tmp >> 16); + /* permute bytes: + shift up by 1 B first, then only consider even bytes, and OR with the unshifted + even bytes + */ + tmp = ((tmp & (0xFF | 0xFF << 16)) << 8) | ((tmp >> 8) & (0xFF | 0xFF << 16)); + /* permute 4bit tuples: + Same idea, but the "consideration" mask expression becomes unwieldy + */ + tmp = ((tmp & (0xF | 0xF << 8 | 0xF << 16 | 0xF << 24)) << 4) | + ((tmp >> 4) & (0xF | 0xF << 8 | 0xF << 16 | 0xF << 24)); + /* permute 2bit tuples: + Here, we collapsed the "consideration" mask to a simple hexmask: 0b0011 = + 3; we need those every 4b, which coincides with a hex digit! + */ + tmp = ((tmp & (0x33333333)) << 2) | ((tmp >> 2) & (0x33333333)); + /* permute odd/even: + 0x01 = 0x1; we need these every 2b, which works out: 0x01 | (0x01 << 2) = + 0x05! + */ + tmp = ((tmp & (0x55555555)) << 1) | ((tmp >> 1) & (0x55555555)); - *out_ptr = tmp; - ++in_ptr; - ++out_ptr; - } + *out_ptr = tmp; + ++in_ptr; + ++out_ptr; + } } #endif /* LV_HAVE_GENERIC */ #ifdef LV_HAVE_GENERIC -static inline void volk_32u_reverse_32u_bintree_permute_bottom_up(uint32_t* out, const uint32_t* in, - unsigned int num_points) +static inline void volk_32u_reverse_32u_bintree_permute_bottom_up(uint32_t* out, + const uint32_t* in, + unsigned int num_points) { - //same stuff as top_down, inverted order (permutation matrices don't care, you know!) - const uint32_t *in_ptr = in; - uint32_t *out_ptr = out; - unsigned int number = 0; - for(; number < num_points; ++number){ - uint32_t tmp = *in_ptr; - tmp = ((tmp & (0x55555555)) << 1) | ((tmp >> 1) & (0x55555555)); - tmp = ((tmp & (0x33333333)) << 2) | ((tmp >> 2) & (0x33333333)); - tmp = ((tmp & (0xF | 0xF << 8 | 0xF << 16 | 0xF << 24)) << 4) | ((tmp >> 4) & (0xF | 0xF << 8 | 0xF << 16 | 0xF << 24)); - tmp = ((tmp & (0xFF | 0xFF << 16)) << 8) | ((tmp >> 8) & (0xFF | 0xFF << 16)); - tmp = ( tmp << 16 ) | ( tmp >> 16 ); + // same stuff as top_down, inverted order (permutation matrices don't care, you know!) 
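(Editor's aside, not part of the patch: a scalar sketch of the five swap stages used by the two bintree_permute kernels in this hunk, with the masks written in their collapsed hex form, e.g. (0xFF | 0xFF << 16) == 0x00FF00FF and (0xF | 0xF << 8 | 0xF << 16 | 0xF << 24) == 0x0F0F0F0F. The helper name and the asserts are illustrative only; the checked values have reversals that are obvious by inspection.)

    #include <assert.h>
    #include <stdint.h>

    static uint32_t reverse_bits32(uint32_t tmp)
    {
        tmp = (tmp << 16) | (tmp >> 16);                             /* swap 16-bit halves */
        tmp = ((tmp & 0x00FF00FF) << 8) | ((tmp >> 8) & 0x00FF00FF); /* swap bytes         */
        tmp = ((tmp & 0x0F0F0F0F) << 4) | ((tmp >> 4) & 0x0F0F0F0F); /* swap nibbles       */
        tmp = ((tmp & 0x33333333) << 2) | ((tmp >> 2) & 0x33333333); /* swap bit pairs     */
        tmp = ((tmp & 0x55555555) << 1) | ((tmp >> 1) & 0x55555555); /* swap odd/even bits */
        return tmp;
    }

    int main(void)
    {
        assert(reverse_bits32(0x00000001u) == 0x80000000u);
        assert(reverse_bits32(0x80000000u) == 0x00000001u);
        assert(reverse_bits32(0x0000FFFFu) == 0xFFFF0000u);
        return 0;
    }

The same harness can be pointed at any of the other reversal variants in this file (LUT, byte shuffle, the multiply tricks) since they all compute the identical bit reversal.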
+ const uint32_t* in_ptr = in; + uint32_t* out_ptr = out; + unsigned int number = 0; + for (; number < num_points; ++number) { + uint32_t tmp = *in_ptr; + tmp = ((tmp & (0x55555555)) << 1) | ((tmp >> 1) & (0x55555555)); + tmp = ((tmp & (0x33333333)) << 2) | ((tmp >> 2) & (0x33333333)); + tmp = ((tmp & (0xF | 0xF << 8 | 0xF << 16 | 0xF << 24)) << 4) | + ((tmp >> 4) & (0xF | 0xF << 8 | 0xF << 16 | 0xF << 24)); + tmp = ((tmp & (0xFF | 0xFF << 16)) << 8) | ((tmp >> 8) & (0xFF | 0xFF << 16)); + tmp = (tmp << 16) | (tmp >> 16); - *out_ptr = tmp; - ++in_ptr; - ++out_ptr; - } + *out_ptr = tmp; + ++in_ptr; + ++out_ptr; + } } #endif /* LV_HAVE_GENERIC */ #ifdef LV_HAVE_NEONV8 #include -static inline void volk_32u_reverse_32u_neonv8(uint32_t* out, const uint32_t* in, - unsigned int num_points) -{ - const uint32_t *in_ptr = in; - uint32_t *out_ptr = out; +static inline void +volk_32u_reverse_32u_neonv8(uint32_t* out, const uint32_t* in, unsigned int num_points) +{ + const uint32_t* in_ptr = in; + uint32_t* out_ptr = out; - const uint8x16_t idx = { 3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12 }; + const uint8x16_t idx = { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 }; - const unsigned int quarterPoints = num_points/4; + const unsigned int quarterPoints = num_points / 4; unsigned int number = 0; - for(; number < quarterPoints; ++number){ - __VOLK_PREFETCH(in_ptr+4); - uint32x4_t x = vld1q_u32(in_ptr); - uint32x4_t z = vreinterpretq_u32_u8(vqtbl1q_u8(vrbitq_u8(vreinterpretq_u8_u32 (x)), - idx)); - vst1q_u32 (out_ptr, z); - in_ptr += 4; - out_ptr += 4; + for (; number < quarterPoints; ++number) { + __VOLK_PREFETCH(in_ptr + 4); + uint32x4_t x = vld1q_u32(in_ptr); + uint32x4_t z = + vreinterpretq_u32_u8(vqtbl1q_u8(vrbitq_u8(vreinterpretq_u8_u32(x)), idx)); + vst1q_u32(out_ptr, z); + in_ptr += 4; + out_ptr += 4; } - number = quarterPoints*4; - for(; number < num_points; ++number){ - *out_ptr = - (BitReverseTable256[*in_ptr & 0xff] << 24) | - (BitReverseTable256[(*in_ptr >> 8) & 0xff] << 16) | - (BitReverseTable256[(*in_ptr >> 16) & 0xff] << 8) | - (BitReverseTable256[(*in_ptr >> 24) & 0xff]); - ++in_ptr; - ++out_ptr; + number = quarterPoints * 4; + for (; number < num_points; ++number) { + *out_ptr = (BitReverseTable256[*in_ptr & 0xff] << 24) | + (BitReverseTable256[(*in_ptr >> 8) & 0xff] << 16) | + (BitReverseTable256[(*in_ptr >> 16) & 0xff] << 8) | + (BitReverseTable256[(*in_ptr >> 24) & 0xff]); + ++in_ptr; + ++out_ptr; } } @@ -371,29 +378,35 @@ static inline void volk_32u_reverse_32u_neonv8(uint32_t* out, const uint32_t* in #ifdef LV_HAVE_NEON #include -#define DO_RBIT \ - __VOLK_ASM("rbit %[result], %[value]" \ - : [result]"=r" (*out_ptr) \ - : [value] "r" (*in_ptr) \ - : ); \ - in_ptr++; \ - out_ptr++; +#define DO_RBIT \ + __VOLK_ASM("rbit %[result], %[value]" \ + : [result] "=r"(*out_ptr) \ + : [value] "r"(*in_ptr) \ + :); \ + in_ptr++; \ + out_ptr++; -static inline void volk_32u_reverse_32u_arm(uint32_t* out, const uint32_t* in, - unsigned int num_points) +static inline void +volk_32u_reverse_32u_arm(uint32_t* out, const uint32_t* in, unsigned int num_points) { - const uint32_t *in_ptr = in; - uint32_t *out_ptr = out; - const unsigned int eighthPoints = num_points/8; + const uint32_t* in_ptr = in; + uint32_t* out_ptr = out; + const unsigned int eighthPoints = num_points / 8; unsigned int number = 0; - for(; number < eighthPoints; ++number){ - __VOLK_PREFETCH(in_ptr+8); - DO_RBIT; DO_RBIT; DO_RBIT; DO_RBIT; - DO_RBIT; DO_RBIT; DO_RBIT; DO_RBIT; + for (; number < eighthPoints; ++number) { + 
__VOLK_PREFETCH(in_ptr + 8); + DO_RBIT; + DO_RBIT; + DO_RBIT; + DO_RBIT; + DO_RBIT; + DO_RBIT; + DO_RBIT; + DO_RBIT; } - number = eighthPoints*8; - for(; number < num_points; ++number){ + number = eighthPoints * 8; + for (; number < num_points; ++number) { DO_RBIT; } } @@ -403,4 +416,3 @@ static inline void volk_32u_reverse_32u_arm(uint32_t* out, const uint32_t* in, #endif /* INCLUDED_volk_32u_reverse_32u_u_H */ - diff --git a/kernels/volk/volk_64f_convert_32f.h b/kernels/volk/volk_64f_convert_32f.h index 20422cf..4ebccc0 100644 --- a/kernels/volk/volk_64f_convert_32f.h +++ b/kernels/volk/volk_64f_convert_32f.h @@ -29,8 +29,8 @@ * * Dispatcher Prototype * \code - * void volk_64f_convert_32f(float* outputVector, const double* inputVector, unsigned int num_points) - * \endcode + * void volk_64f_convert_32f(float* outputVector, const double* inputVector, unsigned int + * num_points) \endcode * * \b Inputs * \li inputVector: The vector of doubles to convert to floats. @@ -70,34 +70,39 @@ #ifdef LV_HAVE_AVX512F #include -static inline void volk_64f_convert_32f_u_avx512f(float* outputVector, const double* inputVector, unsigned int num_points){ - unsigned int number = 0; +static inline void volk_64f_convert_32f_u_avx512f(float* outputVector, + const double* inputVector, + unsigned int num_points) +{ + unsigned int number = 0; - const unsigned int oneSixteenthPoints = num_points / 16; + const unsigned int oneSixteenthPoints = num_points / 16; - const double* inputVectorPtr = (const double*)inputVector; - float* outputVectorPtr = outputVector; - __m256 ret1, ret2; - __m512d inputVal1, inputVal2; + const double* inputVectorPtr = (const double*)inputVector; + float* outputVectorPtr = outputVector; + __m256 ret1, ret2; + __m512d inputVal1, inputVal2; - for(;number < oneSixteenthPoints; number++){ - inputVal1 = _mm512_loadu_pd(inputVectorPtr); inputVectorPtr += 8; - inputVal2 = _mm512_loadu_pd(inputVectorPtr); inputVectorPtr += 8; + for (; number < oneSixteenthPoints; number++) { + inputVal1 = _mm512_loadu_pd(inputVectorPtr); + inputVectorPtr += 8; + inputVal2 = _mm512_loadu_pd(inputVectorPtr); + inputVectorPtr += 8; - ret1 = _mm512_cvtpd_ps(inputVal1); - ret2 = _mm512_cvtpd_ps(inputVal2); + ret1 = _mm512_cvtpd_ps(inputVal1); + ret2 = _mm512_cvtpd_ps(inputVal2); - _mm256_storeu_ps(outputVectorPtr, ret1); - outputVectorPtr += 8; + _mm256_storeu_ps(outputVectorPtr, ret1); + outputVectorPtr += 8; - _mm256_storeu_ps(outputVectorPtr, ret2); - outputVectorPtr += 8; - } + _mm256_storeu_ps(outputVectorPtr, ret2); + outputVectorPtr += 8; + } - number = oneSixteenthPoints * 16; - for(; number < num_points; number++){ - outputVector[number] = (float)(inputVector[number]); - } + number = oneSixteenthPoints * 16; + for (; number < num_points; number++) { + outputVector[number] = (float)(inputVector[number]); + } } #endif /* LV_HAVE_AVX512F */ @@ -105,34 +110,39 @@ static inline void volk_64f_convert_32f_u_avx512f(float* outputVector, const dou #ifdef LV_HAVE_AVX #include -static inline void volk_64f_convert_32f_u_avx(float* outputVector, const double* inputVector, unsigned int num_points){ - unsigned int number = 0; +static inline void volk_64f_convert_32f_u_avx(float* outputVector, + const double* inputVector, + unsigned int num_points) +{ + unsigned int number = 0; - const unsigned int oneEightPoints = num_points / 8; + const unsigned int oneEightPoints = num_points / 8; - const double* inputVectorPtr = (const double*)inputVector; - float* outputVectorPtr = outputVector; - __m128 ret1, ret2; - __m256d inputVal1, 
inputVal2; + const double* inputVectorPtr = (const double*)inputVector; + float* outputVectorPtr = outputVector; + __m128 ret1, ret2; + __m256d inputVal1, inputVal2; - for(;number < oneEightPoints; number++){ - inputVal1 = _mm256_loadu_pd(inputVectorPtr); inputVectorPtr += 4; - inputVal2 = _mm256_loadu_pd(inputVectorPtr); inputVectorPtr += 4; + for (; number < oneEightPoints; number++) { + inputVal1 = _mm256_loadu_pd(inputVectorPtr); + inputVectorPtr += 4; + inputVal2 = _mm256_loadu_pd(inputVectorPtr); + inputVectorPtr += 4; - ret1 = _mm256_cvtpd_ps(inputVal1); - ret2 = _mm256_cvtpd_ps(inputVal2); + ret1 = _mm256_cvtpd_ps(inputVal1); + ret2 = _mm256_cvtpd_ps(inputVal2); - _mm_storeu_ps(outputVectorPtr, ret1); - outputVectorPtr += 4; + _mm_storeu_ps(outputVectorPtr, ret1); + outputVectorPtr += 4; - _mm_storeu_ps(outputVectorPtr, ret2); - outputVectorPtr += 4; - } + _mm_storeu_ps(outputVectorPtr, ret2); + outputVectorPtr += 4; + } - number = oneEightPoints * 8; - for(; number < num_points; number++){ - outputVector[number] = (float)(inputVector[number]); - } + number = oneEightPoints * 8; + for (; number < num_points; number++) { + outputVector[number] = (float)(inputVector[number]); + } } #endif /* LV_HAVE_AVX */ @@ -140,53 +150,59 @@ static inline void volk_64f_convert_32f_u_avx(float* outputVector, const double* #ifdef LV_HAVE_SSE2 #include -static inline void volk_64f_convert_32f_u_sse2(float* outputVector, const double* inputVector, unsigned int num_points){ - unsigned int number = 0; +static inline void volk_64f_convert_32f_u_sse2(float* outputVector, + const double* inputVector, + unsigned int num_points) +{ + unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; + const unsigned int quarterPoints = num_points / 4; - const double* inputVectorPtr = (const double*)inputVector; - float* outputVectorPtr = outputVector; - __m128 ret, ret2; - __m128d inputVal1, inputVal2; + const double* inputVectorPtr = (const double*)inputVector; + float* outputVectorPtr = outputVector; + __m128 ret, ret2; + __m128d inputVal1, inputVal2; - for(;number < quarterPoints; number++){ - inputVal1 = _mm_loadu_pd(inputVectorPtr); inputVectorPtr += 2; - inputVal2 = _mm_loadu_pd(inputVectorPtr); inputVectorPtr += 2; + for (; number < quarterPoints; number++) { + inputVal1 = _mm_loadu_pd(inputVectorPtr); + inputVectorPtr += 2; + inputVal2 = _mm_loadu_pd(inputVectorPtr); + inputVectorPtr += 2; - ret = _mm_cvtpd_ps(inputVal1); - ret2 = _mm_cvtpd_ps(inputVal2); + ret = _mm_cvtpd_ps(inputVal1); + ret2 = _mm_cvtpd_ps(inputVal2); - ret = _mm_movelh_ps(ret, ret2); + ret = _mm_movelh_ps(ret, ret2); - _mm_storeu_ps(outputVectorPtr, ret); - outputVectorPtr += 4; - } + _mm_storeu_ps(outputVectorPtr, ret); + outputVectorPtr += 4; + } - number = quarterPoints * 4; - for(; number < num_points; number++){ - outputVector[number] = (float)(inputVector[number]); - } + number = quarterPoints * 4; + for (; number < num_points; number++) { + outputVector[number] = (float)(inputVector[number]); + } } #endif /* LV_HAVE_SSE2 */ #ifdef LV_HAVE_GENERIC -static inline void volk_64f_convert_32f_generic(float* outputVector, const double* inputVector, unsigned int num_points){ - float* outputVectorPtr = outputVector; - const double* inputVectorPtr = inputVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - *outputVectorPtr++ = ((float)(*inputVectorPtr++)); - } +static inline void volk_64f_convert_32f_generic(float* outputVector, + const double* inputVector, + unsigned int num_points) +{ 
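(Editor's aside, not part of the patch: a minimal usage sketch for the conversion kernel documented in the doxygen block of this hunk. It assumes an installed VOLK with <volk/volk.h> on the include path and libvolk linked in; volk_malloc/volk_get_alignment give aligned buffers so the dispatcher can select an aligned implementation.)

    #include <stdio.h>
    #include <volk/volk.h>

    int main(void)
    {
        const unsigned int N = 5;
        double* in = (double*)volk_malloc(N * sizeof(double), volk_get_alignment());
        float* out = (float*)volk_malloc(N * sizeof(float), volk_get_alignment());

        for (unsigned int i = 0; i < N; i++)
            in[i] = i + 0.5;

        /* Dispatcher from the doxygen prototype: converts doubles to floats. */
        volk_64f_convert_32f(out, in, N);

        for (unsigned int i = 0; i < N; i++)
            printf("%f\n", out[i]);

        volk_free(in);
        volk_free(out);
        return 0;
    }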
+ float* outputVectorPtr = outputVector; + const double* inputVectorPtr = inputVector; + unsigned int number = 0; + + for (number = 0; number < num_points; number++) { + *outputVectorPtr++ = ((float)(*inputVectorPtr++)); + } } #endif /* LV_HAVE_GENERIC */ - - #endif /* INCLUDED_volk_64f_convert_32f_u_H */ #ifndef INCLUDED_volk_64f_convert_32f_a_H #define INCLUDED_volk_64f_convert_32f_a_H @@ -197,34 +213,39 @@ static inline void volk_64f_convert_32f_generic(float* outputVector, const doubl #ifdef LV_HAVE_AVX512F #include -static inline void volk_64f_convert_32f_a_avx512f(float* outputVector, const double* inputVector, unsigned int num_points){ - unsigned int number = 0; +static inline void volk_64f_convert_32f_a_avx512f(float* outputVector, + const double* inputVector, + unsigned int num_points) +{ + unsigned int number = 0; - const unsigned int oneSixteenthPoints = num_points / 16; + const unsigned int oneSixteenthPoints = num_points / 16; - const double* inputVectorPtr = (const double*)inputVector; - float* outputVectorPtr = outputVector; - __m256 ret1, ret2; - __m512d inputVal1, inputVal2; + const double* inputVectorPtr = (const double*)inputVector; + float* outputVectorPtr = outputVector; + __m256 ret1, ret2; + __m512d inputVal1, inputVal2; - for(;number < oneSixteenthPoints; number++){ - inputVal1 = _mm512_load_pd(inputVectorPtr); inputVectorPtr += 8; - inputVal2 = _mm512_load_pd(inputVectorPtr); inputVectorPtr += 8; + for (; number < oneSixteenthPoints; number++) { + inputVal1 = _mm512_load_pd(inputVectorPtr); + inputVectorPtr += 8; + inputVal2 = _mm512_load_pd(inputVectorPtr); + inputVectorPtr += 8; - ret1 = _mm512_cvtpd_ps(inputVal1); - ret2 = _mm512_cvtpd_ps(inputVal2); + ret1 = _mm512_cvtpd_ps(inputVal1); + ret2 = _mm512_cvtpd_ps(inputVal2); - _mm256_store_ps(outputVectorPtr, ret1); - outputVectorPtr += 8; + _mm256_store_ps(outputVectorPtr, ret1); + outputVectorPtr += 8; - _mm256_store_ps(outputVectorPtr, ret2); - outputVectorPtr += 8; - } + _mm256_store_ps(outputVectorPtr, ret2); + outputVectorPtr += 8; + } - number = oneSixteenthPoints * 16; - for(; number < num_points; number++){ - outputVector[number] = (float)(inputVector[number]); - } + number = oneSixteenthPoints * 16; + for (; number < num_points; number++) { + outputVector[number] = (float)(inputVector[number]); + } } #endif /* LV_HAVE_AVX512F */ @@ -232,34 +253,39 @@ static inline void volk_64f_convert_32f_a_avx512f(float* outputVector, const dou #ifdef LV_HAVE_AVX #include -static inline void volk_64f_convert_32f_a_avx(float* outputVector, const double* inputVector, unsigned int num_points){ - unsigned int number = 0; +static inline void volk_64f_convert_32f_a_avx(float* outputVector, + const double* inputVector, + unsigned int num_points) +{ + unsigned int number = 0; - const unsigned int oneEightPoints = num_points / 8; + const unsigned int oneEightPoints = num_points / 8; - const double* inputVectorPtr = (const double*)inputVector; - float* outputVectorPtr = outputVector; - __m128 ret1, ret2; - __m256d inputVal1, inputVal2; + const double* inputVectorPtr = (const double*)inputVector; + float* outputVectorPtr = outputVector; + __m128 ret1, ret2; + __m256d inputVal1, inputVal2; - for(;number < oneEightPoints; number++){ - inputVal1 = _mm256_load_pd(inputVectorPtr); inputVectorPtr += 4; - inputVal2 = _mm256_load_pd(inputVectorPtr); inputVectorPtr += 4; + for (; number < oneEightPoints; number++) { + inputVal1 = _mm256_load_pd(inputVectorPtr); + inputVectorPtr += 4; + inputVal2 = _mm256_load_pd(inputVectorPtr); + 
inputVectorPtr += 4; - ret1 = _mm256_cvtpd_ps(inputVal1); - ret2 = _mm256_cvtpd_ps(inputVal2); + ret1 = _mm256_cvtpd_ps(inputVal1); + ret2 = _mm256_cvtpd_ps(inputVal2); - _mm_store_ps(outputVectorPtr, ret1); - outputVectorPtr += 4; + _mm_store_ps(outputVectorPtr, ret1); + outputVectorPtr += 4; - _mm_store_ps(outputVectorPtr, ret2); - outputVectorPtr += 4; - } + _mm_store_ps(outputVectorPtr, ret2); + outputVectorPtr += 4; + } - number = oneEightPoints * 8; - for(; number < num_points; number++){ - outputVector[number] = (float)(inputVector[number]); - } + number = oneEightPoints * 8; + for (; number < num_points; number++) { + outputVector[number] = (float)(inputVector[number]); + } } #endif /* LV_HAVE_AVX */ @@ -267,51 +293,57 @@ static inline void volk_64f_convert_32f_a_avx(float* outputVector, const double* #ifdef LV_HAVE_SSE2 #include -static inline void volk_64f_convert_32f_a_sse2(float* outputVector, const double* inputVector, unsigned int num_points){ - unsigned int number = 0; +static inline void volk_64f_convert_32f_a_sse2(float* outputVector, + const double* inputVector, + unsigned int num_points) +{ + unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; + const unsigned int quarterPoints = num_points / 4; - const double* inputVectorPtr = (const double*)inputVector; - float* outputVectorPtr = outputVector; - __m128 ret, ret2; - __m128d inputVal1, inputVal2; + const double* inputVectorPtr = (const double*)inputVector; + float* outputVectorPtr = outputVector; + __m128 ret, ret2; + __m128d inputVal1, inputVal2; - for(;number < quarterPoints; number++){ - inputVal1 = _mm_load_pd(inputVectorPtr); inputVectorPtr += 2; - inputVal2 = _mm_load_pd(inputVectorPtr); inputVectorPtr += 2; + for (; number < quarterPoints; number++) { + inputVal1 = _mm_load_pd(inputVectorPtr); + inputVectorPtr += 2; + inputVal2 = _mm_load_pd(inputVectorPtr); + inputVectorPtr += 2; - ret = _mm_cvtpd_ps(inputVal1); - ret2 = _mm_cvtpd_ps(inputVal2); + ret = _mm_cvtpd_ps(inputVal1); + ret2 = _mm_cvtpd_ps(inputVal2); - ret = _mm_movelh_ps(ret, ret2); + ret = _mm_movelh_ps(ret, ret2); - _mm_store_ps(outputVectorPtr, ret); - outputVectorPtr += 4; - } + _mm_store_ps(outputVectorPtr, ret); + outputVectorPtr += 4; + } - number = quarterPoints * 4; - for(; number < num_points; number++){ - outputVector[number] = (float)(inputVector[number]); - } + number = quarterPoints * 4; + for (; number < num_points; number++) { + outputVector[number] = (float)(inputVector[number]); + } } #endif /* LV_HAVE_SSE2 */ #ifdef LV_HAVE_GENERIC -static inline void volk_64f_convert_32f_a_generic(float* outputVector, const double* inputVector, unsigned int num_points){ - float* outputVectorPtr = outputVector; - const double* inputVectorPtr = inputVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - *outputVectorPtr++ = ((float)(*inputVectorPtr++)); - } +static inline void volk_64f_convert_32f_a_generic(float* outputVector, + const double* inputVector, + unsigned int num_points) +{ + float* outputVectorPtr = outputVector; + const double* inputVectorPtr = inputVector; + unsigned int number = 0; + + for (number = 0; number < num_points; number++) { + *outputVectorPtr++ = ((float)(*inputVectorPtr++)); + } } #endif /* LV_HAVE_GENERIC */ - - #endif /* INCLUDED_volk_64f_convert_32f_a_H */ diff --git a/kernels/volk/volk_64f_x2_add_64f.h b/kernels/volk/volk_64f_x2_add_64f.h index 03b8e4c..5c512cc 100644 --- a/kernels/volk/volk_64f_x2_add_64f.h +++ b/kernels/volk/volk_64f_x2_add_64f.h @@ -31,8 
+31,8 @@ * * Dispatcher Prototype * \code - * void volk_64f_x2_add_64f(float* cVector, const float* aVector, const float* bVector, unsigned int num_points) - * \endcode + * void volk_64f_x2_add_64f(float* cVector, const float* aVector, const float* bVector, + * unsigned int num_points) \endcode * * \b Inputs * \li aVector: First input vector. @@ -76,18 +76,19 @@ #ifdef LV_HAVE_GENERIC -static inline void -volk_64f_x2_add_64f_generic(double *cVector, const double *aVector, - const double *bVector, unsigned int num_points) +static inline void volk_64f_x2_add_64f_generic(double* cVector, + const double* aVector, + const double* bVector, + unsigned int num_points) { - double *cPtr = cVector; - const double *aPtr = aVector; - const double *bPtr = bVector; - unsigned int number = 0; - - for (number = 0; number < num_points; number++) { - *cPtr++ = (*aPtr++) + (*bPtr++); - } + double* cPtr = cVector; + const double* aPtr = aVector; + const double* bPtr = bVector; + unsigned int number = 0; + + for (number = 0; number < num_points; number++) { + *cPtr++ = (*aPtr++) + (*bPtr++); + } } #endif /* LV_HAVE_GENERIC */ @@ -100,35 +101,36 @@ volk_64f_x2_add_64f_generic(double *cVector, const double *aVector, #include -static inline void -volk_64f_x2_add_64f_u_sse2(double *cVector, const double *aVector, - const double *bVector, unsigned int num_points) +static inline void volk_64f_x2_add_64f_u_sse2(double* cVector, + const double* aVector, + const double* bVector, + unsigned int num_points) { - unsigned int number = 0; - const unsigned int half_points = num_points / 2; + unsigned int number = 0; + const unsigned int half_points = num_points / 2; - double *cPtr = cVector; - const double *aPtr = aVector; - const double *bPtr = bVector; + double* cPtr = cVector; + const double* aPtr = aVector; + const double* bPtr = bVector; - __m128d aVal, bVal, cVal; - for (; number < half_points; number++) { - aVal = _mm_loadu_pd(aPtr); - bVal = _mm_loadu_pd(bPtr); + __m128d aVal, bVal, cVal; + for (; number < half_points; number++) { + aVal = _mm_loadu_pd(aPtr); + bVal = _mm_loadu_pd(bPtr); - cVal = _mm_add_pd(aVal, bVal); + cVal = _mm_add_pd(aVal, bVal); - _mm_storeu_pd(cPtr, cVal); // Store the results back into the C container + _mm_storeu_pd(cPtr, cVal); // Store the results back into the C container - aPtr += 2; - bPtr += 2; - cPtr += 2; - } + aPtr += 2; + bPtr += 2; + cPtr += 2; + } - number = half_points * 2; - for (; number < num_points; number++) { - *cPtr++ = (*aPtr++) + (*bPtr++); - } + number = half_points * 2; + for (; number < num_points; number++) { + *cPtr++ = (*aPtr++) + (*bPtr++); + } } #endif /* LV_HAVE_SSE2 */ @@ -138,36 +140,37 @@ volk_64f_x2_add_64f_u_sse2(double *cVector, const double *aVector, #include -static inline void -volk_64f_x2_add_64f_u_avx(double *cVector, const double *aVector, - const double *bVector, unsigned int num_points) +static inline void volk_64f_x2_add_64f_u_avx(double* cVector, + const double* aVector, + const double* bVector, + unsigned int num_points) { - unsigned int number = 0; - const unsigned int quarter_points = num_points / 4; + unsigned int number = 0; + const unsigned int quarter_points = num_points / 4; - double *cPtr = cVector; - const double *aPtr = aVector; - const double *bPtr = bVector; + double* cPtr = cVector; + const double* aPtr = aVector; + const double* bPtr = bVector; - __m256d aVal, bVal, cVal; - for (; number < quarter_points; number++) { + __m256d aVal, bVal, cVal; + for (; number < quarter_points; number++) { - aVal = _mm256_loadu_pd(aPtr); - bVal = 
_mm256_loadu_pd(bPtr); + aVal = _mm256_loadu_pd(aPtr); + bVal = _mm256_loadu_pd(bPtr); - cVal = _mm256_add_pd(aVal, bVal); + cVal = _mm256_add_pd(aVal, bVal); - _mm256_storeu_pd(cPtr, cVal); // Store the results back into the C container + _mm256_storeu_pd(cPtr, cVal); // Store the results back into the C container - aPtr += 4; - bPtr += 4; - cPtr += 4; - } + aPtr += 4; + bPtr += 4; + cPtr += 4; + } - number = quarter_points * 4; - for (; number < num_points; number++) { - *cPtr++ = (*aPtr++) + (*bPtr++); - } + number = quarter_points * 4; + for (; number < num_points; number++) { + *cPtr++ = (*aPtr++) + (*bPtr++); + } } #endif /* LV_HAVE_AVX */ @@ -180,35 +183,36 @@ volk_64f_x2_add_64f_u_avx(double *cVector, const double *aVector, #include -static inline void -volk_64f_x2_add_64f_a_sse2(double *cVector, const double *aVector, - const double *bVector, unsigned int num_points) +static inline void volk_64f_x2_add_64f_a_sse2(double* cVector, + const double* aVector, + const double* bVector, + unsigned int num_points) { - unsigned int number = 0; - const unsigned int half_points = num_points / 2; + unsigned int number = 0; + const unsigned int half_points = num_points / 2; - double *cPtr = cVector; - const double *aPtr = aVector; - const double *bPtr = bVector; + double* cPtr = cVector; + const double* aPtr = aVector; + const double* bPtr = bVector; - __m128d aVal, bVal, cVal; - for (; number < half_points; number++) { - aVal = _mm_load_pd(aPtr); - bVal = _mm_load_pd(bPtr); + __m128d aVal, bVal, cVal; + for (; number < half_points; number++) { + aVal = _mm_load_pd(aPtr); + bVal = _mm_load_pd(bPtr); - cVal = _mm_add_pd(aVal, bVal); + cVal = _mm_add_pd(aVal, bVal); - _mm_store_pd(cPtr, cVal); // Store the results back into the C container + _mm_store_pd(cPtr, cVal); // Store the results back into the C container - aPtr += 2; - bPtr += 2; - cPtr += 2; - } + aPtr += 2; + bPtr += 2; + cPtr += 2; + } - number = half_points * 2; - for (; number < num_points; number++) { - *cPtr++ = (*aPtr++) + (*bPtr++); - } + number = half_points * 2; + for (; number < num_points; number++) { + *cPtr++ = (*aPtr++) + (*bPtr++); + } } #endif /* LV_HAVE_SSE2 */ @@ -218,36 +222,37 @@ volk_64f_x2_add_64f_a_sse2(double *cVector, const double *aVector, #include -static inline void -volk_64f_x2_add_64f_a_avx(double *cVector, const double *aVector, - const double *bVector, unsigned int num_points) +static inline void volk_64f_x2_add_64f_a_avx(double* cVector, + const double* aVector, + const double* bVector, + unsigned int num_points) { - unsigned int number = 0; - const unsigned int quarter_points = num_points / 4; + unsigned int number = 0; + const unsigned int quarter_points = num_points / 4; - double *cPtr = cVector; - const double *aPtr = aVector; - const double *bPtr = bVector; + double* cPtr = cVector; + const double* aPtr = aVector; + const double* bPtr = bVector; - __m256d aVal, bVal, cVal; - for (; number < quarter_points; number++) { + __m256d aVal, bVal, cVal; + for (; number < quarter_points; number++) { - aVal = _mm256_load_pd(aPtr); - bVal = _mm256_load_pd(bPtr); + aVal = _mm256_load_pd(aPtr); + bVal = _mm256_load_pd(bPtr); - cVal = _mm256_add_pd(aVal, bVal); + cVal = _mm256_add_pd(aVal, bVal); - _mm256_store_pd(cPtr, cVal); // Store the results back into the C container + _mm256_store_pd(cPtr, cVal); // Store the results back into the C container - aPtr += 4; - bPtr += 4; - cPtr += 4; - } + aPtr += 4; + bPtr += 4; + cPtr += 4; + } - number = quarter_points * 4; - for (; number < num_points; number++) { - 
*cPtr++ = (*aPtr++) + (*bPtr++); - } + number = quarter_points * 4; + for (; number < num_points; number++) { + *cPtr++ = (*aPtr++) + (*bPtr++); + } } #endif /* LV_HAVE_AVX */ diff --git a/kernels/volk/volk_64f_x2_max_64f.h b/kernels/volk/volk_64f_x2_max_64f.h index d4464b7..8f7f743 100644 --- a/kernels/volk/volk_64f_x2_max_64f.h +++ b/kernels/volk/volk_64f_x2_max_64f.h @@ -32,8 +32,8 @@ * * Dispatcher Prototype * \code - * void volk_64f_x2_max_64f(double* cVector, const double* aVector, const double* bVector, unsigned int num_points) - * \endcode + * void volk_64f_x2_max_64f(double* cVector, const double* aVector, const double* bVector, + * unsigned int num_points) \endcode * * \b Inputs * \li aVector: First input vector. @@ -77,38 +77,39 @@ #ifdef LV_HAVE_AVX512F #include -static inline void -volk_64f_x2_max_64f_a_avx512f(double* cVector, const double* aVector, - const double* bVector, unsigned int num_points) +static inline void volk_64f_x2_max_64f_a_avx512f(double* cVector, + const double* aVector, + const double* bVector, + unsigned int num_points) { - unsigned int number = 0; - const unsigned int eigthPoints = num_points / 8; + unsigned int number = 0; + const unsigned int eigthPoints = num_points / 8; - double* cPtr = cVector; - const double* aPtr = aVector; - const double* bPtr= bVector; + double* cPtr = cVector; + const double* aPtr = aVector; + const double* bPtr = bVector; - __m512d aVal, bVal, cVal; - for(;number < eigthPoints; number++){ + __m512d aVal, bVal, cVal; + for (; number < eigthPoints; number++) { - aVal = _mm512_load_pd(aPtr); - bVal = _mm512_load_pd(bPtr); + aVal = _mm512_load_pd(aPtr); + bVal = _mm512_load_pd(bPtr); - cVal = _mm512_max_pd(aVal, bVal); + cVal = _mm512_max_pd(aVal, bVal); - _mm512_store_pd(cPtr,cVal); // Store the results back into the C container + _mm512_store_pd(cPtr, cVal); // Store the results back into the C container - aPtr += 8; - bPtr += 8; - cPtr += 8; - } + aPtr += 8; + bPtr += 8; + cPtr += 8; + } - number = eigthPoints * 8; - for(;number < num_points; number++){ - const double a = *aPtr++; - const double b = *bPtr++; - *cPtr++ = ( a > b ? a : b); - } + number = eigthPoints * 8; + for (; number < num_points; number++) { + const double a = *aPtr++; + const double b = *bPtr++; + *cPtr++ = (a > b ? 
a : b); + } } #endif /* LV_HAVE_AVX512F */ @@ -116,38 +117,39 @@ volk_64f_x2_max_64f_a_avx512f(double* cVector, const double* aVector, #ifdef LV_HAVE_AVX #include -static inline void -volk_64f_x2_max_64f_a_avx(double* cVector, const double* aVector, - const double* bVector, unsigned int num_points) +static inline void volk_64f_x2_max_64f_a_avx(double* cVector, + const double* aVector, + const double* bVector, + unsigned int num_points) { - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; - double* cPtr = cVector; - const double* aPtr = aVector; - const double* bPtr= bVector; + double* cPtr = cVector; + const double* aPtr = aVector; + const double* bPtr = bVector; - __m256d aVal, bVal, cVal; - for(;number < quarterPoints; number++){ + __m256d aVal, bVal, cVal; + for (; number < quarterPoints; number++) { - aVal = _mm256_load_pd(aPtr); - bVal = _mm256_load_pd(bPtr); + aVal = _mm256_load_pd(aPtr); + bVal = _mm256_load_pd(bPtr); - cVal = _mm256_max_pd(aVal, bVal); + cVal = _mm256_max_pd(aVal, bVal); - _mm256_store_pd(cPtr,cVal); // Store the results back into the C container + _mm256_store_pd(cPtr, cVal); // Store the results back into the C container - aPtr += 4; - bPtr += 4; - cPtr += 4; - } + aPtr += 4; + bPtr += 4; + cPtr += 4; + } - number = quarterPoints * 4; - for(;number < num_points; number++){ - const double a = *aPtr++; - const double b = *bPtr++; - *cPtr++ = ( a > b ? a : b); - } + number = quarterPoints * 4; + for (; number < num_points; number++) { + const double a = *aPtr++; + const double b = *bPtr++; + *cPtr++ = (a > b ? a : b); + } } #endif /* LV_HAVE_AVX */ @@ -155,58 +157,60 @@ volk_64f_x2_max_64f_a_avx(double* cVector, const double* aVector, #ifdef LV_HAVE_SSE2 #include -static inline void -volk_64f_x2_max_64f_a_sse2(double* cVector, const double* aVector, - const double* bVector, unsigned int num_points) +static inline void volk_64f_x2_max_64f_a_sse2(double* cVector, + const double* aVector, + const double* bVector, + unsigned int num_points) { - unsigned int number = 0; - const unsigned int halfPoints = num_points / 2; + unsigned int number = 0; + const unsigned int halfPoints = num_points / 2; - double* cPtr = cVector; - const double* aPtr = aVector; - const double* bPtr= bVector; + double* cPtr = cVector; + const double* aPtr = aVector; + const double* bPtr = bVector; - __m128d aVal, bVal, cVal; - for(;number < halfPoints; number++){ + __m128d aVal, bVal, cVal; + for (; number < halfPoints; number++) { - aVal = _mm_load_pd(aPtr); - bVal = _mm_load_pd(bPtr); + aVal = _mm_load_pd(aPtr); + bVal = _mm_load_pd(bPtr); - cVal = _mm_max_pd(aVal, bVal); + cVal = _mm_max_pd(aVal, bVal); - _mm_store_pd(cPtr,cVal); // Store the results back into the C container + _mm_store_pd(cPtr, cVal); // Store the results back into the C container - aPtr += 2; - bPtr += 2; - cPtr += 2; - } + aPtr += 2; + bPtr += 2; + cPtr += 2; + } - number = halfPoints * 2; - for(;number < num_points; number++){ - const double a = *aPtr++; - const double b = *bPtr++; - *cPtr++ = ( a > b ? a : b); - } + number = halfPoints * 2; + for (; number < num_points; number++) { + const double a = *aPtr++; + const double b = *bPtr++; + *cPtr++ = (a > b ? 
a : b); + } } #endif /* LV_HAVE_SSE2 */ #ifdef LV_HAVE_GENERIC -static inline void -volk_64f_x2_max_64f_generic(double* cVector, const double* aVector, - const double* bVector, unsigned int num_points) +static inline void volk_64f_x2_max_64f_generic(double* cVector, + const double* aVector, + const double* bVector, + unsigned int num_points) { - double* cPtr = cVector; - const double* aPtr = aVector; - const double* bPtr= bVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - const double a = *aPtr++; - const double b = *bPtr++; - *cPtr++ = ( a > b ? a : b); - } + double* cPtr = cVector; + const double* aPtr = aVector; + const double* bPtr = bVector; + unsigned int number = 0; + + for (number = 0; number < num_points; number++) { + const double a = *aPtr++; + const double b = *bPtr++; + *cPtr++ = (a > b ? a : b); + } } #endif /* LV_HAVE_GENERIC */ @@ -223,38 +227,39 @@ volk_64f_x2_max_64f_generic(double* cVector, const double* aVector, #ifdef LV_HAVE_AVX512F #include -static inline void -volk_64f_x2_max_64f_u_avx512f(double* cVector, const double* aVector, - const double* bVector, unsigned int num_points) +static inline void volk_64f_x2_max_64f_u_avx512f(double* cVector, + const double* aVector, + const double* bVector, + unsigned int num_points) { - unsigned int number = 0; - const unsigned int eigthPoints = num_points / 8; + unsigned int number = 0; + const unsigned int eigthPoints = num_points / 8; - double* cPtr = cVector; - const double* aPtr = aVector; - const double* bPtr= bVector; + double* cPtr = cVector; + const double* aPtr = aVector; + const double* bPtr = bVector; - __m512d aVal, bVal, cVal; - for(;number < eigthPoints; number++){ + __m512d aVal, bVal, cVal; + for (; number < eigthPoints; number++) { - aVal = _mm512_loadu_pd(aPtr); - bVal = _mm512_loadu_pd(bPtr); + aVal = _mm512_loadu_pd(aPtr); + bVal = _mm512_loadu_pd(bPtr); - cVal = _mm512_max_pd(aVal, bVal); + cVal = _mm512_max_pd(aVal, bVal); - _mm512_storeu_pd(cPtr,cVal); // Store the results back into the C container + _mm512_storeu_pd(cPtr, cVal); // Store the results back into the C container - aPtr += 8; - bPtr += 8; - cPtr += 8; - } + aPtr += 8; + bPtr += 8; + cPtr += 8; + } - number = eigthPoints * 8; - for(;number < num_points; number++){ - const double a = *aPtr++; - const double b = *bPtr++; - *cPtr++ = ( a > b ? a : b); - } + number = eigthPoints * 8; + for (; number < num_points; number++) { + const double a = *aPtr++; + const double b = *bPtr++; + *cPtr++ = (a > b ? 
a : b); + } } #endif /* LV_HAVE_AVX512F */ @@ -262,38 +267,39 @@ volk_64f_x2_max_64f_u_avx512f(double* cVector, const double* aVector, #ifdef LV_HAVE_AVX #include -static inline void -volk_64f_x2_max_64f_u_avx(double* cVector, const double* aVector, - const double* bVector, unsigned int num_points) +static inline void volk_64f_x2_max_64f_u_avx(double* cVector, + const double* aVector, + const double* bVector, + unsigned int num_points) { - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; - double* cPtr = cVector; - const double* aPtr = aVector; - const double* bPtr= bVector; + double* cPtr = cVector; + const double* aPtr = aVector; + const double* bPtr = bVector; - __m256d aVal, bVal, cVal; - for(;number < quarterPoints; number++){ + __m256d aVal, bVal, cVal; + for (; number < quarterPoints; number++) { - aVal = _mm256_loadu_pd(aPtr); - bVal = _mm256_loadu_pd(bPtr); + aVal = _mm256_loadu_pd(aPtr); + bVal = _mm256_loadu_pd(bPtr); - cVal = _mm256_max_pd(aVal, bVal); + cVal = _mm256_max_pd(aVal, bVal); - _mm256_storeu_pd(cPtr,cVal); // Store the results back into the C container + _mm256_storeu_pd(cPtr, cVal); // Store the results back into the C container - aPtr += 4; - bPtr += 4; - cPtr += 4; - } + aPtr += 4; + bPtr += 4; + cPtr += 4; + } - number = quarterPoints * 4; - for(;number < num_points; number++){ - const double a = *aPtr++; - const double b = *bPtr++; - *cPtr++ = ( a > b ? a : b); - } + number = quarterPoints * 4; + for (; number < num_points; number++) { + const double a = *aPtr++; + const double b = *bPtr++; + *cPtr++ = (a > b ? a : b); + } } #endif /* LV_HAVE_AVX */ diff --git a/kernels/volk/volk_64f_x2_min_64f.h b/kernels/volk/volk_64f_x2_min_64f.h index 0ffa305..7dc4d59 100644 --- a/kernels/volk/volk_64f_x2_min_64f.h +++ b/kernels/volk/volk_64f_x2_min_64f.h @@ -32,7 +32,8 @@ * * Dispatcher Prototype * \code - * void volk_64f_x2_min_64f(double* cVector, const double* aVector, const double* bVector, unsigned int num_points) + * void volk_64f_x2_min_64f(double* cVector, const double* aVector, const double* bVector, + unsigned int num_points) * \endcode * * \b Inputs @@ -77,38 +78,39 @@ #ifdef LV_HAVE_AVX512F #include -static inline void -volk_64f_x2_min_64f_a_avx512f(double* cVector, const double* aVector, - const double* bVector, unsigned int num_points) +static inline void volk_64f_x2_min_64f_a_avx512f(double* cVector, + const double* aVector, + const double* bVector, + unsigned int num_points) { - unsigned int number = 0; - const unsigned int eigthPoints = num_points / 8; + unsigned int number = 0; + const unsigned int eigthPoints = num_points / 8; - double* cPtr = cVector; - const double* aPtr = aVector; - const double* bPtr= bVector; + double* cPtr = cVector; + const double* aPtr = aVector; + const double* bPtr = bVector; - __m512d aVal, bVal, cVal; - for(;number < eigthPoints; number++){ + __m512d aVal, bVal, cVal; + for (; number < eigthPoints; number++) { - aVal = _mm512_load_pd(aPtr); - bVal = _mm512_load_pd(bPtr); + aVal = _mm512_load_pd(aPtr); + bVal = _mm512_load_pd(bPtr); - cVal = _mm512_min_pd(aVal, bVal); + cVal = _mm512_min_pd(aVal, bVal); - _mm512_store_pd(cPtr,cVal); // Store the results back into the C container + _mm512_store_pd(cPtr, cVal); // Store the results back into the C container - aPtr += 8; - bPtr += 8; - cPtr += 8; - } + aPtr += 8; + bPtr += 8; + cPtr += 8; + } - number = eigthPoints * 8; - for(;number < num_points; number++){ - const double 
a = *aPtr++; - const double b = *bPtr++; - *cPtr++ = ( a < b ? a : b); - } + number = eigthPoints * 8; + for (; number < num_points; number++) { + const double a = *aPtr++; + const double b = *bPtr++; + *cPtr++ = (a < b ? a : b); + } } #endif /* LV_HAVE_AVX512F */ @@ -116,38 +118,39 @@ volk_64f_x2_min_64f_a_avx512f(double* cVector, const double* aVector, #ifdef LV_HAVE_AVX #include -static inline void -volk_64f_x2_min_64f_a_avx(double* cVector, const double* aVector, - const double* bVector, unsigned int num_points) +static inline void volk_64f_x2_min_64f_a_avx(double* cVector, + const double* aVector, + const double* bVector, + unsigned int num_points) { - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; - double* cPtr = cVector; - const double* aPtr = aVector; - const double* bPtr= bVector; + double* cPtr = cVector; + const double* aPtr = aVector; + const double* bPtr = bVector; - __m256d aVal, bVal, cVal; - for(;number < quarterPoints; number++){ + __m256d aVal, bVal, cVal; + for (; number < quarterPoints; number++) { - aVal = _mm256_load_pd(aPtr); - bVal = _mm256_load_pd(bPtr); + aVal = _mm256_load_pd(aPtr); + bVal = _mm256_load_pd(bPtr); - cVal = _mm256_min_pd(aVal, bVal); + cVal = _mm256_min_pd(aVal, bVal); - _mm256_store_pd(cPtr,cVal); // Store the results back into the C container + _mm256_store_pd(cPtr, cVal); // Store the results back into the C container - aPtr += 4; - bPtr += 4; - cPtr += 4; - } + aPtr += 4; + bPtr += 4; + cPtr += 4; + } - number = quarterPoints * 4; - for(;number < num_points; number++){ - const double a = *aPtr++; - const double b = *bPtr++; - *cPtr++ = ( a < b ? a : b); - } + number = quarterPoints * 4; + for (; number < num_points; number++) { + const double a = *aPtr++; + const double b = *bPtr++; + *cPtr++ = (a < b ? a : b); + } } #endif /* LV_HAVE_AVX */ @@ -155,58 +158,60 @@ volk_64f_x2_min_64f_a_avx(double* cVector, const double* aVector, #ifdef LV_HAVE_SSE2 #include -static inline void -volk_64f_x2_min_64f_a_sse2(double* cVector, const double* aVector, - const double* bVector, unsigned int num_points) +static inline void volk_64f_x2_min_64f_a_sse2(double* cVector, + const double* aVector, + const double* bVector, + unsigned int num_points) { - unsigned int number = 0; - const unsigned int halfPoints = num_points / 2; + unsigned int number = 0; + const unsigned int halfPoints = num_points / 2; - double* cPtr = cVector; - const double* aPtr = aVector; - const double* bPtr= bVector; + double* cPtr = cVector; + const double* aPtr = aVector; + const double* bPtr = bVector; - __m128d aVal, bVal, cVal; - for(;number < halfPoints; number++){ + __m128d aVal, bVal, cVal; + for (; number < halfPoints; number++) { - aVal = _mm_load_pd(aPtr); - bVal = _mm_load_pd(bPtr); + aVal = _mm_load_pd(aPtr); + bVal = _mm_load_pd(bPtr); - cVal = _mm_min_pd(aVal, bVal); + cVal = _mm_min_pd(aVal, bVal); - _mm_store_pd(cPtr,cVal); // Store the results back into the C container + _mm_store_pd(cPtr, cVal); // Store the results back into the C container - aPtr += 2; - bPtr += 2; - cPtr += 2; - } + aPtr += 2; + bPtr += 2; + cPtr += 2; + } - number = halfPoints * 2; - for(;number < num_points; number++){ - const double a = *aPtr++; - const double b = *bPtr++; - *cPtr++ = ( a < b ? a : b); - } + number = halfPoints * 2; + for (; number < num_points; number++) { + const double a = *aPtr++; + const double b = *bPtr++; + *cPtr++ = (a < b ? 
a : b); + } } #endif /* LV_HAVE_SSE2 */ #ifdef LV_HAVE_GENERIC -static inline void -volk_64f_x2_min_64f_generic(double* cVector, const double* aVector, - const double* bVector, unsigned int num_points) +static inline void volk_64f_x2_min_64f_generic(double* cVector, + const double* aVector, + const double* bVector, + unsigned int num_points) { - double* cPtr = cVector; - const double* aPtr = aVector; - const double* bPtr= bVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - const double a = *aPtr++; - const double b = *bPtr++; - *cPtr++ = ( a < b ? a : b); - } + double* cPtr = cVector; + const double* aPtr = aVector; + const double* bPtr = bVector; + unsigned int number = 0; + + for (number = 0; number < num_points; number++) { + const double a = *aPtr++; + const double b = *bPtr++; + *cPtr++ = (a < b ? a : b); + } } #endif /* LV_HAVE_GENERIC */ @@ -222,38 +227,39 @@ volk_64f_x2_min_64f_generic(double* cVector, const double* aVector, #ifdef LV_HAVE_AVX512F #include -static inline void -volk_64f_x2_min_64f_u_avx512f(double* cVector, const double* aVector, - const double* bVector, unsigned int num_points) +static inline void volk_64f_x2_min_64f_u_avx512f(double* cVector, + const double* aVector, + const double* bVector, + unsigned int num_points) { - unsigned int number = 0; - const unsigned int eigthPoints = num_points / 8; + unsigned int number = 0; + const unsigned int eigthPoints = num_points / 8; - double* cPtr = cVector; - const double* aPtr = aVector; - const double* bPtr= bVector; + double* cPtr = cVector; + const double* aPtr = aVector; + const double* bPtr = bVector; - __m512d aVal, bVal, cVal; - for(;number < eigthPoints; number++){ + __m512d aVal, bVal, cVal; + for (; number < eigthPoints; number++) { - aVal = _mm512_loadu_pd(aPtr); - bVal = _mm512_loadu_pd(bPtr); + aVal = _mm512_loadu_pd(aPtr); + bVal = _mm512_loadu_pd(bPtr); - cVal = _mm512_min_pd(aVal, bVal); + cVal = _mm512_min_pd(aVal, bVal); - _mm512_storeu_pd(cPtr,cVal); // Store the results back into the C container + _mm512_storeu_pd(cPtr, cVal); // Store the results back into the C container - aPtr += 8; - bPtr += 8; - cPtr += 8; - } + aPtr += 8; + bPtr += 8; + cPtr += 8; + } - number = eigthPoints * 8; - for(;number < num_points; number++){ - const double a = *aPtr++; - const double b = *bPtr++; - *cPtr++ = ( a < b ? a : b); - } + number = eigthPoints * 8; + for (; number < num_points; number++) { + const double a = *aPtr++; + const double b = *bPtr++; + *cPtr++ = (a < b ? 
a : b); + } } #endif /* LV_HAVE_AVX512F */ @@ -261,38 +267,39 @@ volk_64f_x2_min_64f_u_avx512f(double* cVector, const double* aVector, #ifdef LV_HAVE_AVX #include -static inline void -volk_64f_x2_min_64f_u_avx(double* cVector, const double* aVector, - const double* bVector, unsigned int num_points) +static inline void volk_64f_x2_min_64f_u_avx(double* cVector, + const double* aVector, + const double* bVector, + unsigned int num_points) { - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; - double* cPtr = cVector; - const double* aPtr = aVector; - const double* bPtr= bVector; + double* cPtr = cVector; + const double* aPtr = aVector; + const double* bPtr = bVector; - __m256d aVal, bVal, cVal; - for(;number < quarterPoints; number++){ + __m256d aVal, bVal, cVal; + for (; number < quarterPoints; number++) { - aVal = _mm256_loadu_pd(aPtr); - bVal = _mm256_loadu_pd(bPtr); + aVal = _mm256_loadu_pd(aPtr); + bVal = _mm256_loadu_pd(bPtr); - cVal = _mm256_min_pd(aVal, bVal); + cVal = _mm256_min_pd(aVal, bVal); - _mm256_storeu_pd(cPtr,cVal); // Store the results back into the C container + _mm256_storeu_pd(cPtr, cVal); // Store the results back into the C container - aPtr += 4; - bPtr += 4; - cPtr += 4; - } + aPtr += 4; + bPtr += 4; + cPtr += 4; + } - number = quarterPoints * 4; - for(;number < num_points; number++){ - const double a = *aPtr++; - const double b = *bPtr++; - *cPtr++ = ( a < b ? a : b); - } + number = quarterPoints * 4; + for (; number < num_points; number++) { + const double a = *aPtr++; + const double b = *bPtr++; + *cPtr++ = (a < b ? a : b); + } } #endif /* LV_HAVE_AVX */ diff --git a/kernels/volk/volk_64f_x2_multiply_64f.h b/kernels/volk/volk_64f_x2_multiply_64f.h index 6fa9e8e..39a155d 100644 --- a/kernels/volk/volk_64f_x2_multiply_64f.h +++ b/kernels/volk/volk_64f_x2_multiply_64f.h @@ -31,8 +31,8 @@ * * Dispatcher Prototype * \code - * void volk_64f_x2_multiply_64f(float* cVector, const float* aVector, const float* bVector, unsigned int num_points) - * \endcode + * void volk_64f_x2_multiply_64f(float* cVector, const float* aVector, const float* + * bVector, unsigned int num_points) \endcode * * \b Inputs * \li aVector: First input vector. 
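All of the volk_64f_x2_* kernels reformatted in these hunks share one shape: a vectorized main loop over full SIMD widths (2 doubles per SSE2 iteration, 4 per AVX, 8 per AVX-512F) followed by a scalar tail that finishes whatever num_points leaves over; the _a_ and _u_ variants differ only in aligned versus unaligned loads and stores. A minimal sketch of that shape in plain C, using a hypothetical helper rather than code from this patch:

    #include <stddef.h>

    /* Illustration only: the real AVX kernels replace the inner block with
     * one _mm256_load(u)_pd / _mm256_add_pd / _mm256_store(u)_pd sequence. */
    static void x2_add_64f_pattern(double* c, const double* a, const double* b, size_t num_points)
    {
        const size_t quarter_points = num_points / 4; /* full 4-wide iterations */
        size_t i = 0;

        /* Vector body: consumes 4 doubles per pass. */
        for (; i < quarter_points * 4; i += 4) {
            c[i + 0] = a[i + 0] + b[i + 0];
            c[i + 1] = a[i + 1] + b[i + 1];
            c[i + 2] = a[i + 2] + b[i + 2];
            c[i + 3] = a[i + 3] + b[i + 3];
        }

        /* Scalar tail: same loop as the generic kernel, covers num_points % 4. */
        for (; i < num_points; i++) {
            c[i] = a[i] + b[i];
        }
    }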
@@ -76,18 +76,19 @@ #ifdef LV_HAVE_GENERIC -static inline void -volk_64f_x2_multiply_64f_generic(double *cVector, const double *aVector, - const double *bVector, unsigned int num_points) +static inline void volk_64f_x2_multiply_64f_generic(double* cVector, + const double* aVector, + const double* bVector, + unsigned int num_points) { - double *cPtr = cVector; - const double *aPtr = aVector; - const double *bPtr = bVector; - unsigned int number = 0; - - for (number = 0; number < num_points; number++) { - *cPtr++ = (*aPtr++) * (*bPtr++); - } + double* cPtr = cVector; + const double* aPtr = aVector; + const double* bPtr = bVector; + unsigned int number = 0; + + for (number = 0; number < num_points; number++) { + *cPtr++ = (*aPtr++) * (*bPtr++); + } } #endif /* LV_HAVE_GENERIC */ @@ -100,35 +101,36 @@ volk_64f_x2_multiply_64f_generic(double *cVector, const double *aVector, #include -static inline void -volk_64f_x2_multiply_64f_u_sse2(double *cVector, const double *aVector, - const double *bVector, unsigned int num_points) +static inline void volk_64f_x2_multiply_64f_u_sse2(double* cVector, + const double* aVector, + const double* bVector, + unsigned int num_points) { - unsigned int number = 0; - const unsigned int half_points = num_points / 2; + unsigned int number = 0; + const unsigned int half_points = num_points / 2; - double *cPtr = cVector; - const double *aPtr = aVector; - const double *bPtr = bVector; + double* cPtr = cVector; + const double* aPtr = aVector; + const double* bPtr = bVector; - __m128d aVal, bVal, cVal; - for (; number < half_points; number++) { - aVal = _mm_loadu_pd(aPtr); - bVal = _mm_loadu_pd(bPtr); + __m128d aVal, bVal, cVal; + for (; number < half_points; number++) { + aVal = _mm_loadu_pd(aPtr); + bVal = _mm_loadu_pd(bPtr); - cVal = _mm_mul_pd(aVal, bVal); + cVal = _mm_mul_pd(aVal, bVal); - _mm_storeu_pd(cPtr, cVal); // Store the results back into the C container + _mm_storeu_pd(cPtr, cVal); // Store the results back into the C container - aPtr += 2; - bPtr += 2; - cPtr += 2; - } + aPtr += 2; + bPtr += 2; + cPtr += 2; + } - number = half_points * 2; - for (; number < num_points; number++) { - *cPtr++ = (*aPtr++) * (*bPtr++); - } + number = half_points * 2; + for (; number < num_points; number++) { + *cPtr++ = (*aPtr++) * (*bPtr++); + } } #endif /* LV_HAVE_SSE2 */ @@ -138,36 +140,37 @@ volk_64f_x2_multiply_64f_u_sse2(double *cVector, const double *aVector, #include -static inline void -volk_64f_x2_multiply_64f_u_avx(double *cVector, const double *aVector, - const double *bVector, unsigned int num_points) +static inline void volk_64f_x2_multiply_64f_u_avx(double* cVector, + const double* aVector, + const double* bVector, + unsigned int num_points) { - unsigned int number = 0; - const unsigned int quarter_points = num_points / 4; + unsigned int number = 0; + const unsigned int quarter_points = num_points / 4; - double *cPtr = cVector; - const double *aPtr = aVector; - const double *bPtr = bVector; + double* cPtr = cVector; + const double* aPtr = aVector; + const double* bPtr = bVector; - __m256d aVal, bVal, cVal; - for (; number < quarter_points; number++) { + __m256d aVal, bVal, cVal; + for (; number < quarter_points; number++) { - aVal = _mm256_loadu_pd(aPtr); - bVal = _mm256_loadu_pd(bPtr); + aVal = _mm256_loadu_pd(aPtr); + bVal = _mm256_loadu_pd(bPtr); - cVal = _mm256_mul_pd(aVal, bVal); + cVal = _mm256_mul_pd(aVal, bVal); - _mm256_storeu_pd(cPtr, cVal); // Store the results back into the C container + _mm256_storeu_pd(cPtr, cVal); // Store the results back into the 
C container - aPtr += 4; - bPtr += 4; - cPtr += 4; - } + aPtr += 4; + bPtr += 4; + cPtr += 4; + } - number = quarter_points * 4; - for (; number < num_points; number++) { - *cPtr++ = (*aPtr++) * (*bPtr++); - } + number = quarter_points * 4; + for (; number < num_points; number++) { + *cPtr++ = (*aPtr++) * (*bPtr++); + } } #endif /* LV_HAVE_AVX */ @@ -180,35 +183,36 @@ volk_64f_x2_multiply_64f_u_avx(double *cVector, const double *aVector, #include -static inline void -volk_64f_x2_multiply_64f_a_sse2(double *cVector, const double *aVector, - const double *bVector, unsigned int num_points) +static inline void volk_64f_x2_multiply_64f_a_sse2(double* cVector, + const double* aVector, + const double* bVector, + unsigned int num_points) { - unsigned int number = 0; - const unsigned int half_points = num_points / 2; + unsigned int number = 0; + const unsigned int half_points = num_points / 2; - double *cPtr = cVector; - const double *aPtr = aVector; - const double *bPtr = bVector; + double* cPtr = cVector; + const double* aPtr = aVector; + const double* bPtr = bVector; - __m128d aVal, bVal, cVal; - for (; number < half_points; number++) { - aVal = _mm_load_pd(aPtr); - bVal = _mm_load_pd(bPtr); + __m128d aVal, bVal, cVal; + for (; number < half_points; number++) { + aVal = _mm_load_pd(aPtr); + bVal = _mm_load_pd(bPtr); - cVal = _mm_mul_pd(aVal, bVal); + cVal = _mm_mul_pd(aVal, bVal); - _mm_store_pd(cPtr, cVal); // Store the results back into the C container + _mm_store_pd(cPtr, cVal); // Store the results back into the C container - aPtr += 2; - bPtr += 2; - cPtr += 2; - } + aPtr += 2; + bPtr += 2; + cPtr += 2; + } - number = half_points * 2; - for (; number < num_points; number++) { - *cPtr++ = (*aPtr++) * (*bPtr++); - } + number = half_points * 2; + for (; number < num_points; number++) { + *cPtr++ = (*aPtr++) * (*bPtr++); + } } #endif /* LV_HAVE_SSE2 */ @@ -218,36 +222,37 @@ volk_64f_x2_multiply_64f_a_sse2(double *cVector, const double *aVector, #include -static inline void -volk_64f_x2_multiply_64f_a_avx(double *cVector, const double *aVector, - const double *bVector, unsigned int num_points) +static inline void volk_64f_x2_multiply_64f_a_avx(double* cVector, + const double* aVector, + const double* bVector, + unsigned int num_points) { - unsigned int number = 0; - const unsigned int quarter_points = num_points / 4; + unsigned int number = 0; + const unsigned int quarter_points = num_points / 4; - double *cPtr = cVector; - const double *aPtr = aVector; - const double *bPtr = bVector; + double* cPtr = cVector; + const double* aPtr = aVector; + const double* bPtr = bVector; - __m256d aVal, bVal, cVal; - for (; number < quarter_points; number++) { + __m256d aVal, bVal, cVal; + for (; number < quarter_points; number++) { - aVal = _mm256_load_pd(aPtr); - bVal = _mm256_load_pd(bPtr); + aVal = _mm256_load_pd(aPtr); + bVal = _mm256_load_pd(bPtr); - cVal = _mm256_mul_pd(aVal, bVal); + cVal = _mm256_mul_pd(aVal, bVal); - _mm256_store_pd(cPtr, cVal); // Store the results back into the C container + _mm256_store_pd(cPtr, cVal); // Store the results back into the C container - aPtr += 4; - bPtr += 4; - cPtr += 4; - } + aPtr += 4; + bPtr += 4; + cPtr += 4; + } - number = quarter_points * 4; - for (; number < num_points; number++) { - *cPtr++ = (*aPtr++) * (*bPtr++); - } + number = quarter_points * 4; + for (; number < num_points; number++) { + *cPtr++ = (*aPtr++) * (*bPtr++); + } } #endif /* LV_HAVE_AVX */ diff --git a/kernels/volk/volk_64u_byteswap.h b/kernels/volk/volk_64u_byteswap.h index 96e0661..38621a4 
100644 --- a/kernels/volk/volk_64u_byteswap.h +++ b/kernels/volk/volk_64u_byteswap.h @@ -72,71 +72,77 @@ #ifdef LV_HAVE_SSE2 #include -static inline void volk_64u_byteswap_u_sse2(uint64_t* intsToSwap, unsigned int num_points){ +static inline void volk_64u_byteswap_u_sse2(uint64_t* intsToSwap, unsigned int num_points) +{ uint32_t* inputPtr = (uint32_t*)intsToSwap; __m128i input, byte1, byte2, byte3, byte4, output; __m128i byte2mask = _mm_set1_epi32(0x00FF0000); __m128i byte3mask = _mm_set1_epi32(0x0000FF00); uint64_t number = 0; const unsigned int halfPoints = num_points / 2; - for(;number < halfPoints; number++){ - // Load the 32t values, increment inputPtr later since we're doing it in-place. - input = _mm_loadu_si128((__m128i*)inputPtr); - - // Do the four shifts - byte1 = _mm_slli_epi32(input, 24); - byte2 = _mm_slli_epi32(input, 8); - byte3 = _mm_srli_epi32(input, 8); - byte4 = _mm_srli_epi32(input, 24); - // Or bytes together - output = _mm_or_si128(byte1, byte4); - byte2 = _mm_and_si128(byte2, byte2mask); - output = _mm_or_si128(output, byte2); - byte3 = _mm_and_si128(byte3, byte3mask); - output = _mm_or_si128(output, byte3); - - // Reorder the two words - output = _mm_shuffle_epi32(output, _MM_SHUFFLE(2, 3, 0, 1)); - - // Store the results - _mm_storeu_si128((__m128i*)inputPtr, output); - inputPtr += 4; + for (; number < halfPoints; number++) { + // Load the 32t values, increment inputPtr later since we're doing it in-place. + input = _mm_loadu_si128((__m128i*)inputPtr); + + // Do the four shifts + byte1 = _mm_slli_epi32(input, 24); + byte2 = _mm_slli_epi32(input, 8); + byte3 = _mm_srli_epi32(input, 8); + byte4 = _mm_srli_epi32(input, 24); + // Or bytes together + output = _mm_or_si128(byte1, byte4); + byte2 = _mm_and_si128(byte2, byte2mask); + output = _mm_or_si128(output, byte2); + byte3 = _mm_and_si128(byte3, byte3mask); + output = _mm_or_si128(output, byte3); + + // Reorder the two words + output = _mm_shuffle_epi32(output, _MM_SHUFFLE(2, 3, 0, 1)); + + // Store the results + _mm_storeu_si128((__m128i*)inputPtr, output); + inputPtr += 4; } // Byteswap any remaining points: - number = halfPoints*2; - for(; number < num_points; number++){ - uint32_t output1 = *inputPtr; - uint32_t output2 = inputPtr[1]; + number = halfPoints * 2; + for (; number < num_points; number++) { + uint32_t output1 = *inputPtr; + uint32_t output2 = inputPtr[1]; - output1 = (((output1 >> 24) & 0xff) | ((output1 >> 8) & 0x0000ff00) | ((output1 << 8) & 0x00ff0000) | ((output1 << 24) & 0xff000000)); + output1 = (((output1 >> 24) & 0xff) | ((output1 >> 8) & 0x0000ff00) | + ((output1 << 8) & 0x00ff0000) | ((output1 << 24) & 0xff000000)); - output2 = (((output2 >> 24) & 0xff) | ((output2 >> 8) & 0x0000ff00) | ((output2 << 8) & 0x00ff0000) | ((output2 << 24) & 0xff000000)); + output2 = (((output2 >> 24) & 0xff) | ((output2 >> 8) & 0x0000ff00) | + ((output2 << 8) & 0x00ff0000) | ((output2 << 24) & 0xff000000)); - *inputPtr++ = output2; - *inputPtr++ = output1; + *inputPtr++ = output2; + *inputPtr++ = output1; } } #endif /* LV_HAVE_SSE2 */ - #ifdef LV_HAVE_GENERIC -static inline void volk_64u_byteswap_generic(uint64_t* intsToSwap, unsigned int num_points){ - uint32_t* inputPtr = (uint32_t*)intsToSwap; - unsigned int point; - for(point = 0; point < num_points; point++){ - uint32_t output1 = *inputPtr; - uint32_t output2 = inputPtr[1]; +static inline void volk_64u_byteswap_generic(uint64_t* intsToSwap, + unsigned int num_points) +{ + uint32_t* inputPtr = (uint32_t*)intsToSwap; + unsigned int point; + for (point = 0; 
point < num_points; point++) { + uint32_t output1 = *inputPtr; + uint32_t output2 = inputPtr[1]; - output1 = (((output1 >> 24) & 0xff) | ((output1 >> 8) & 0x0000ff00) | ((output1 << 8) & 0x00ff0000) | ((output1 << 24) & 0xff000000)); + output1 = (((output1 >> 24) & 0xff) | ((output1 >> 8) & 0x0000ff00) | + ((output1 << 8) & 0x00ff0000) | ((output1 << 24) & 0xff000000)); - output2 = (((output2 >> 24) & 0xff) | ((output2 >> 8) & 0x0000ff00) | ((output2 << 8) & 0x00ff0000) | ((output2 << 24) & 0xff000000)); + output2 = (((output2 >> 24) & 0xff) | ((output2 >> 8) & 0x0000ff00) | + ((output2 << 8) & 0x00ff0000) | ((output2 << 24) & 0xff000000)); - *inputPtr++ = output2; - *inputPtr++ = output1; - } + *inputPtr++ = output2; + *inputPtr++ = output1; + } } #endif /* LV_HAVE_GENERIC */ @@ -144,47 +150,47 @@ static inline void volk_64u_byteswap_generic(uint64_t* intsToSwap, unsigned int #include static inline void volk_64u_byteswap_a_avx2(uint64_t* intsToSwap, unsigned int num_points) { - unsigned int number = 0; - - const unsigned int nPerSet = 4; - const uint64_t nSets = num_points / nPerSet; + unsigned int number = 0; - uint32_t* inputPtr = (uint32_t*)intsToSwap; + const unsigned int nPerSet = 4; + const uint64_t nSets = num_points / nPerSet; - const uint8_t shuffleVector[32] = { 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8, 23, 22, 21, 20, 19, 18, 17, 16, 31, 30, 29, 28, 27, 26, 25, 24 }; + uint32_t* inputPtr = (uint32_t*)intsToSwap; - const __m256i myShuffle = _mm256_loadu_si256((__m256i*) &shuffleVector[0]); + const uint8_t shuffleVector[32] = { 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, + 12, 11, 10, 9, 8, 23, 22, 21, 20, 19, 18, + 17, 16, 31, 30, 29, 28, 27, 26, 25, 24 }; - for ( ;number < nSets; number++ ) { + const __m256i myShuffle = _mm256_loadu_si256((__m256i*)&shuffleVector[0]); - // Load the 32t values, increment inputPtr later since we're doing it in-place. - const __m256i input = _mm256_load_si256((__m256i*)inputPtr); - const __m256i output = _mm256_shuffle_epi8(input, myShuffle); + for (; number < nSets; number++) { - // Store the results - _mm256_store_si256((__m256i*)inputPtr, output); + // Load the 32t values, increment inputPtr later since we're doing it in-place. 
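The byteswap hunks in this file are easier to read with the scalar reduction in mind: every variant, including the SIMD remainder loops, reverses a 64-bit word by byte-reversing each 32-bit half and then exchanging the halves, which is why the tail loops write output2 before output1. A minimal scalar sketch of that reduction, with hypothetical helper names that are not part of VOLK:

    #include <stdint.h>

    /* Byte-reverse one 32-bit word, written the same way as the kernels above. */
    static inline uint32_t bswap32_ref(uint32_t x)
    {
        return ((x >> 24) & 0x000000ff) | ((x >> 8) & 0x0000ff00) |
               ((x << 8) & 0x00ff0000) | ((x << 24) & 0xff000000);
    }

    /* Byte-reverse a 64-bit word: swap each half, then swap the halves. */
    static inline uint64_t bswap64_ref(uint64_t x)
    {
        const uint32_t lo = (uint32_t)(x & 0xffffffffu);
        const uint32_t hi = (uint32_t)(x >> 32);
        return ((uint64_t)bswap32_ref(lo) << 32) | (uint64_t)bswap32_ref(hi);
    }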
+ const __m256i input = _mm256_load_si256((__m256i*)inputPtr); + const __m256i output = _mm256_shuffle_epi8(input, myShuffle); - /* inputPtr is 32bit so increment twice */ - inputPtr += 2 * nPerSet; - } - _mm256_zeroupper(); + // Store the results + _mm256_store_si256((__m256i*)inputPtr, output); - // Byteswap any remaining points: - for(number = nSets * nPerSet; number < num_points; ++number ) { - uint32_t output1 = *inputPtr; - uint32_t output2 = inputPtr[1]; - uint32_t out1 = ((((output1) >> 24) & 0x000000ff) | - (((output1) >> 8) & 0x0000ff00) | - (((output1) << 8) & 0x00ff0000) | - (((output1) << 24) & 0xff000000) ); + /* inputPtr is 32bit so increment twice */ + inputPtr += 2 * nPerSet; + } + _mm256_zeroupper(); - uint32_t out2 = ((((output2) >> 24) & 0x000000ff) | - (((output2) >> 8) & 0x0000ff00) | - (((output2) << 8) & 0x00ff0000) | - (((output2) << 24) & 0xff000000) ); - *inputPtr++ = out2; - *inputPtr++ = out1; - } + // Byteswap any remaining points: + for (number = nSets * nPerSet; number < num_points; ++number) { + uint32_t output1 = *inputPtr; + uint32_t output2 = inputPtr[1]; + uint32_t out1 = + ((((output1) >> 24) & 0x000000ff) | (((output1) >> 8) & 0x0000ff00) | + (((output1) << 8) & 0x00ff0000) | (((output1) << 24) & 0xff000000)); + + uint32_t out2 = + ((((output2) >> 24) & 0x000000ff) | (((output2) >> 8) & 0x0000ff00) | + (((output2) << 8) & 0x00ff0000) | (((output2) << 24) & 0xff000000)); + *inputPtr++ = out2; + *inputPtr++ = out1; + } } #endif /* LV_HAVE_AVX2 */ @@ -192,48 +198,47 @@ static inline void volk_64u_byteswap_a_avx2(uint64_t* intsToSwap, unsigned int n #if LV_HAVE_SSSE3 #include -static inline void volk_64u_byteswap_a_ssse3(uint64_t* intsToSwap, unsigned int num_points) +static inline void volk_64u_byteswap_a_ssse3(uint64_t* intsToSwap, + unsigned int num_points) { - unsigned int number = 0; + unsigned int number = 0; - const unsigned int nPerSet = 2; - const uint64_t nSets = num_points / nPerSet; + const unsigned int nPerSet = 2; + const uint64_t nSets = num_points / nPerSet; - uint32_t* inputPtr = (uint32_t*)intsToSwap; - - uint8_t shuffleVector[16] = { 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 }; + uint32_t* inputPtr = (uint32_t*)intsToSwap; - const __m128i myShuffle = _mm_loadu_si128((__m128i*) &shuffleVector); + uint8_t shuffleVector[16] = { 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 }; - for ( ;number < nSets; number++ ) { + const __m128i myShuffle = _mm_loadu_si128((__m128i*)&shuffleVector); - // Load the 32t values, increment inputPtr later since we're doing it in-place. - const __m128i input = _mm_load_si128((__m128i*)inputPtr); - const __m128i output = _mm_shuffle_epi8(input,myShuffle); + for (; number < nSets; number++) { - // Store the results - _mm_store_si128((__m128i*)inputPtr, output); + // Load the 32t values, increment inputPtr later since we're doing it in-place. 
+ const __m128i input = _mm_load_si128((__m128i*)inputPtr); + const __m128i output = _mm_shuffle_epi8(input, myShuffle); - /* inputPtr is 32bit so increment twice */ - inputPtr += 2 * nPerSet; - } + // Store the results + _mm_store_si128((__m128i*)inputPtr, output); - // Byteswap any remaining points: - for(number = nSets * nPerSet; number < num_points; ++number ) { - uint32_t output1 = *inputPtr; - uint32_t output2 = inputPtr[1]; - uint32_t out1 = ((((output1) >> 24) & 0x000000ff) | - (((output1) >> 8) & 0x0000ff00) | - (((output1) << 8) & 0x00ff0000) | - (((output1) << 24) & 0xff000000) ); + /* inputPtr is 32bit so increment twice */ + inputPtr += 2 * nPerSet; + } - uint32_t out2 = ((((output2) >> 24) & 0x000000ff) | - (((output2) >> 8) & 0x0000ff00) | - (((output2) << 8) & 0x00ff0000) | - (((output2) << 24) & 0xff000000) ); - *inputPtr++ = out2; - *inputPtr++ = out1; - } + // Byteswap any remaining points: + for (number = nSets * nPerSet; number < num_points; ++number) { + uint32_t output1 = *inputPtr; + uint32_t output2 = inputPtr[1]; + uint32_t out1 = + ((((output1) >> 24) & 0x000000ff) | (((output1) >> 8) & 0x0000ff00) | + (((output1) << 8) & 0x00ff0000) | (((output1) << 24) & 0xff000000)); + + uint32_t out2 = + ((((output2) >> 24) & 0x000000ff) | (((output2) >> 8) & 0x0000ff00) | + (((output2) << 8) & 0x00ff0000) | (((output2) << 24) & 0xff000000)); + *inputPtr++ = out2; + *inputPtr++ = out1; + } } #endif /* LV_HAVE_SSSE3 */ @@ -241,86 +246,90 @@ static inline void volk_64u_byteswap_a_ssse3(uint64_t* intsToSwap, unsigned int #ifdef LV_HAVE_NEONV8 #include -static inline void volk_64u_byteswap_neonv8(uint64_t* intsToSwap, unsigned int num_points){ - uint32_t* inputPtr = (uint32_t*)intsToSwap; - const unsigned int n4points = num_points / 4; - uint8x16x2_t input; - uint8x16_t idx = { 7,6,5,4, 3,2,1,0, 15,14,13,12, 11,10,9,8 }; - - unsigned int number = 0; - for(number = 0; number < n4points; ++number){ - __VOLK_PREFETCH(inputPtr+8); - input = vld2q_u8((uint8_t*) inputPtr); - input.val[0] = vqtbl1q_u8(input.val[0], idx); - input.val[1] = vqtbl1q_u8(input.val[1], idx); - vst2q_u8((uint8_t*) inputPtr, input); - - inputPtr += 8; - } - - for(number = n4points * 4; number < num_points; ++number){ - uint32_t output1 = *inputPtr; - uint32_t output2 = inputPtr[1]; +static inline void volk_64u_byteswap_neonv8(uint64_t* intsToSwap, unsigned int num_points) +{ + uint32_t* inputPtr = (uint32_t*)intsToSwap; + const unsigned int n4points = num_points / 4; + uint8x16x2_t input; + uint8x16_t idx = { 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 }; + + unsigned int number = 0; + for (number = 0; number < n4points; ++number) { + __VOLK_PREFETCH(inputPtr + 8); + input = vld2q_u8((uint8_t*)inputPtr); + input.val[0] = vqtbl1q_u8(input.val[0], idx); + input.val[1] = vqtbl1q_u8(input.val[1], idx); + vst2q_u8((uint8_t*)inputPtr, input); + + inputPtr += 8; + } - output1 = (((output1 >> 24) & 0xff) | ((output1 >> 8) & 0x0000ff00) | ((output1 << 8) & 0x00ff0000) | ((output1 << 24) & 0xff000000)); - output2 = (((output2 >> 24) & 0xff) | ((output2 >> 8) & 0x0000ff00) | ((output2 << 8) & 0x00ff0000) | ((output2 << 24) & 0xff000000)); + for (number = n4points * 4; number < num_points; ++number) { + uint32_t output1 = *inputPtr; + uint32_t output2 = inputPtr[1]; - *inputPtr++ = output2; - *inputPtr++ = output1; - } + output1 = (((output1 >> 24) & 0xff) | ((output1 >> 8) & 0x0000ff00) | + ((output1 << 8) & 0x00ff0000) | ((output1 << 24) & 0xff000000)); + output2 = (((output2 >> 24) & 0xff) | ((output2 >> 8) & 
0x0000ff00) | + ((output2 << 8) & 0x00ff0000) | ((output2 << 24) & 0xff000000)); + *inputPtr++ = output2; + *inputPtr++ = output1; + } } #else #ifdef LV_HAVE_NEON #include -static inline void volk_64u_byteswap_neon(uint64_t* intsToSwap, unsigned int num_points){ - uint32_t* inputPtr = (uint32_t*)intsToSwap; - unsigned int number = 0; - unsigned int n8points = num_points / 4; - - uint8x8x4_t input_table; - uint8x8_t int_lookup01, int_lookup23, int_lookup45, int_lookup67; - uint8x8_t swapped_int01, swapped_int23, swapped_int45, swapped_int67; - - /* these magic numbers are used as byte-indices in the LUT. - they are pre-computed to save time. A simple C program - can calculate them; for example for lookup01: - uint8_t chars[8] = {24, 16, 8, 0, 25, 17, 9, 1}; - for(ii=0; ii < 8; ++ii) { - index += ((uint64_t)(*(chars+ii))) << (ii*8); +static inline void volk_64u_byteswap_neon(uint64_t* intsToSwap, unsigned int num_points) +{ + uint32_t* inputPtr = (uint32_t*)intsToSwap; + unsigned int number = 0; + unsigned int n8points = num_points / 4; + + uint8x8x4_t input_table; + uint8x8_t int_lookup01, int_lookup23, int_lookup45, int_lookup67; + uint8x8_t swapped_int01, swapped_int23, swapped_int45, swapped_int67; + + /* these magic numbers are used as byte-indices in the LUT. + they are pre-computed to save time. A simple C program + can calculate them; for example for lookup01: + uint8_t chars[8] = {24, 16, 8, 0, 25, 17, 9, 1}; + for(ii=0; ii < 8; ++ii) { + index += ((uint64_t)(*(chars+ii))) << (ii*8); + } + */ + int_lookup01 = vcreate_u8(2269495096316185); + int_lookup23 = vcreate_u8(146949840772469531); + int_lookup45 = vcreate_u8(291630186448622877); + int_lookup67 = vcreate_u8(436310532124776223); + + for (number = 0; number < n8points; ++number) { + input_table = vld4_u8((uint8_t*)inputPtr); + swapped_int01 = vtbl4_u8(input_table, int_lookup01); + swapped_int23 = vtbl4_u8(input_table, int_lookup23); + swapped_int45 = vtbl4_u8(input_table, int_lookup45); + swapped_int67 = vtbl4_u8(input_table, int_lookup67); + vst1_u8((uint8_t*)inputPtr, swapped_int01); + vst1_u8((uint8_t*)(inputPtr + 2), swapped_int23); + vst1_u8((uint8_t*)(inputPtr + 4), swapped_int45); + vst1_u8((uint8_t*)(inputPtr + 6), swapped_int67); + + inputPtr += 4; } - */ - int_lookup01 = vcreate_u8(2269495096316185); - int_lookup23 = vcreate_u8(146949840772469531); - int_lookup45 = vcreate_u8(291630186448622877); - int_lookup67 = vcreate_u8(436310532124776223); - - for(number = 0; number < n8points; ++number){ - input_table = vld4_u8((uint8_t*) inputPtr); - swapped_int01 = vtbl4_u8(input_table, int_lookup01); - swapped_int23 = vtbl4_u8(input_table, int_lookup23); - swapped_int45 = vtbl4_u8(input_table, int_lookup45); - swapped_int67 = vtbl4_u8(input_table, int_lookup67); - vst1_u8((uint8_t*) inputPtr, swapped_int01); - vst1_u8((uint8_t*) (inputPtr+2), swapped_int23); - vst1_u8((uint8_t*) (inputPtr+4), swapped_int45); - vst1_u8((uint8_t*) (inputPtr+6), swapped_int67); - - inputPtr += 4; - } - - for(number = n8points * 4; number < num_points; ++number){ - uint32_t output1 = *inputPtr; - uint32_t output2 = inputPtr[1]; - - output1 = (((output1 >> 24) & 0xff) | ((output1 >> 8) & 0x0000ff00) | ((output1 << 8) & 0x00ff0000) | ((output1 << 24) & 0xff000000)); - output2 = (((output2 >> 24) & 0xff) | ((output2 >> 8) & 0x0000ff00) | ((output2 << 8) & 0x00ff0000) | ((output2 << 24) & 0xff000000)); - - *inputPtr++ = output2; - *inputPtr++ = output1; - } + for (number = n8points * 4; number < num_points; ++number) { + uint32_t output1 = *inputPtr; + 
uint32_t output2 = inputPtr[1]; + + output1 = (((output1 >> 24) & 0xff) | ((output1 >> 8) & 0x0000ff00) | + ((output1 << 8) & 0x00ff0000) | ((output1 << 24) & 0xff000000)); + output2 = (((output2 >> 24) & 0xff) | ((output2 >> 8) & 0x0000ff00) | + ((output2 << 8) & 0x00ff0000) | ((output2 << 24) & 0xff000000)); + + *inputPtr++ = output2; + *inputPtr++ = output1; + } } #endif /* LV_HAVE_NEON */ #endif @@ -336,49 +345,52 @@ static inline void volk_64u_byteswap_neon(uint64_t* intsToSwap, unsigned int num #ifdef LV_HAVE_SSE2 #include -static inline void volk_64u_byteswap_a_sse2(uint64_t* intsToSwap, unsigned int num_points){ +static inline void volk_64u_byteswap_a_sse2(uint64_t* intsToSwap, unsigned int num_points) +{ uint32_t* inputPtr = (uint32_t*)intsToSwap; __m128i input, byte1, byte2, byte3, byte4, output; __m128i byte2mask = _mm_set1_epi32(0x00FF0000); __m128i byte3mask = _mm_set1_epi32(0x0000FF00); uint64_t number = 0; const unsigned int halfPoints = num_points / 2; - for(;number < halfPoints; number++){ - // Load the 32t values, increment inputPtr later since we're doing it in-place. - input = _mm_load_si128((__m128i*)inputPtr); - - // Do the four shifts - byte1 = _mm_slli_epi32(input, 24); - byte2 = _mm_slli_epi32(input, 8); - byte3 = _mm_srli_epi32(input, 8); - byte4 = _mm_srli_epi32(input, 24); - // Or bytes together - output = _mm_or_si128(byte1, byte4); - byte2 = _mm_and_si128(byte2, byte2mask); - output = _mm_or_si128(output, byte2); - byte3 = _mm_and_si128(byte3, byte3mask); - output = _mm_or_si128(output, byte3); - - // Reorder the two words - output = _mm_shuffle_epi32(output, _MM_SHUFFLE(2, 3, 0, 1)); - - // Store the results - _mm_store_si128((__m128i*)inputPtr, output); - inputPtr += 4; + for (; number < halfPoints; number++) { + // Load the 32t values, increment inputPtr later since we're doing it in-place. 
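The four vcreate_u8 constants in the NEON kernel above encode vtbl4_u8 byte-index tables, one index per byte with the least significant byte first, as the in-code comment outlines. A small standalone C program along these lines (hypothetical, not shipped with VOLK) can decode such a constant and re-derive it from its index table:

    #include <stdint.h>
    #include <stdio.h>

    /* Forward direction from the kernel's comment: pack eight byte indices,
     * least-significant byte first, into the uint64_t handed to vcreate_u8(). */
    static uint64_t pack_indices(const uint8_t chars[8])
    {
        uint64_t index = 0;
        for (int ii = 0; ii < 8; ++ii)
            index += ((uint64_t)chars[ii]) << (ii * 8);
        return index;
    }

    int main(void)
    {
        /* Recover the index table baked into int_lookup01, then check that
         * repacking it reproduces the original constant. */
        const uint64_t lookup01 = 2269495096316185ull;
        uint8_t table[8];
        for (int ii = 0; ii < 8; ++ii) {
            table[ii] = (uint8_t)((lookup01 >> (ii * 8)) & 0xff);
            printf("%u ", table[ii]);
        }
        printf("\nround trip ok: %d\n", pack_indices(table) == lookup01);
        return 0;
    }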
+ input = _mm_load_si128((__m128i*)inputPtr); + + // Do the four shifts + byte1 = _mm_slli_epi32(input, 24); + byte2 = _mm_slli_epi32(input, 8); + byte3 = _mm_srli_epi32(input, 8); + byte4 = _mm_srli_epi32(input, 24); + // Or bytes together + output = _mm_or_si128(byte1, byte4); + byte2 = _mm_and_si128(byte2, byte2mask); + output = _mm_or_si128(output, byte2); + byte3 = _mm_and_si128(byte3, byte3mask); + output = _mm_or_si128(output, byte3); + + // Reorder the two words + output = _mm_shuffle_epi32(output, _MM_SHUFFLE(2, 3, 0, 1)); + + // Store the results + _mm_store_si128((__m128i*)inputPtr, output); + inputPtr += 4; } // Byteswap any remaining points: - number = halfPoints*2; - for(; number < num_points; number++){ - uint32_t output1 = *inputPtr; - uint32_t output2 = inputPtr[1]; + number = halfPoints * 2; + for (; number < num_points; number++) { + uint32_t output1 = *inputPtr; + uint32_t output2 = inputPtr[1]; - output1 = (((output1 >> 24) & 0xff) | ((output1 >> 8) & 0x0000ff00) | ((output1 << 8) & 0x00ff0000) | ((output1 << 24) & 0xff000000)); + output1 = (((output1 >> 24) & 0xff) | ((output1 >> 8) & 0x0000ff00) | + ((output1 << 8) & 0x00ff0000) | ((output1 << 24) & 0xff000000)); - output2 = (((output2 >> 24) & 0xff) | ((output2 >> 8) & 0x0000ff00) | ((output2 << 8) & 0x00ff0000) | ((output2 << 24) & 0xff000000)); + output2 = (((output2 >> 24) & 0xff) | ((output2 >> 8) & 0x0000ff00) | + ((output2 << 8) & 0x00ff0000) | ((output2 << 24) & 0xff000000)); - *inputPtr++ = output2; - *inputPtr++ = output1; + *inputPtr++ = output2; + *inputPtr++ = output1; } } #endif /* LV_HAVE_SSE2 */ @@ -387,46 +399,46 @@ static inline void volk_64u_byteswap_a_sse2(uint64_t* intsToSwap, unsigned int n #include static inline void volk_64u_byteswap_u_avx2(uint64_t* intsToSwap, unsigned int num_points) { - unsigned int number = 0; - - const unsigned int nPerSet = 4; - const uint64_t nSets = num_points / nPerSet; - - uint32_t* inputPtr = (uint32_t*)intsToSwap; - - const uint8_t shuffleVector[32] = { 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8, 23, 22, 21, 20, 19, 18, 17, 16, 31, 30, 29, 28, 27, 26, 25, 24 }; - - const __m256i myShuffle = _mm256_loadu_si256((__m256i*) &shuffleVector[0]); - - for ( ;number < nSets; number++ ) { - // Load the 32t values, increment inputPtr later since we're doing it in-place. 
- const __m256i input = _mm256_loadu_si256((__m256i*)inputPtr); - const __m256i output = _mm256_shuffle_epi8(input,myShuffle); - - // Store the results - _mm256_storeu_si256((__m256i*)inputPtr, output); - - /* inputPtr is 32bit so increment twice */ - inputPtr += 2 * nPerSet; - } - _mm256_zeroupper(); - - // Byteswap any remaining points: - for(number = nSets * nPerSet; number < num_points; ++number ) { - uint32_t output1 = *inputPtr; - uint32_t output2 = inputPtr[1]; - uint32_t out1 = ((((output1) >> 24) & 0x000000ff) | - (((output1) >> 8) & 0x0000ff00) | - (((output1) << 8) & 0x00ff0000) | - (((output1) << 24) & 0xff000000) ); - - uint32_t out2 = ((((output2) >> 24) & 0x000000ff) | - (((output2) >> 8) & 0x0000ff00) | - (((output2) << 8) & 0x00ff0000) | - (((output2) << 24) & 0xff000000) ); - *inputPtr++ = out2; - *inputPtr++ = out1; - } + unsigned int number = 0; + + const unsigned int nPerSet = 4; + const uint64_t nSets = num_points / nPerSet; + + uint32_t* inputPtr = (uint32_t*)intsToSwap; + + const uint8_t shuffleVector[32] = { 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, + 12, 11, 10, 9, 8, 23, 22, 21, 20, 19, 18, + 17, 16, 31, 30, 29, 28, 27, 26, 25, 24 }; + + const __m256i myShuffle = _mm256_loadu_si256((__m256i*)&shuffleVector[0]); + + for (; number < nSets; number++) { + // Load the 32t values, increment inputPtr later since we're doing it in-place. + const __m256i input = _mm256_loadu_si256((__m256i*)inputPtr); + const __m256i output = _mm256_shuffle_epi8(input, myShuffle); + + // Store the results + _mm256_storeu_si256((__m256i*)inputPtr, output); + + /* inputPtr is 32bit so increment twice */ + inputPtr += 2 * nPerSet; + } + _mm256_zeroupper(); + + // Byteswap any remaining points: + for (number = nSets * nPerSet; number < num_points; ++number) { + uint32_t output1 = *inputPtr; + uint32_t output2 = inputPtr[1]; + uint32_t out1 = + ((((output1) >> 24) & 0x000000ff) | (((output1) >> 8) & 0x0000ff00) | + (((output1) << 8) & 0x00ff0000) | (((output1) << 24) & 0xff000000)); + + uint32_t out2 = + ((((output2) >> 24) & 0x000000ff) | (((output2) >> 8) & 0x0000ff00) | + (((output2) << 8) & 0x00ff0000) | (((output2) << 24) & 0xff000000)); + *inputPtr++ = out2; + *inputPtr++ = out1; + } } #endif /* LV_HAVE_AVX2 */ @@ -434,70 +446,71 @@ static inline void volk_64u_byteswap_u_avx2(uint64_t* intsToSwap, unsigned int n #if LV_HAVE_SSSE3 #include -static inline void volk_64u_byteswap_u_ssse3(uint64_t* intsToSwap, unsigned int num_points) +static inline void volk_64u_byteswap_u_ssse3(uint64_t* intsToSwap, + unsigned int num_points) { - unsigned int number = 0; - - const unsigned int nPerSet = 2; - const uint64_t nSets = num_points / nPerSet; + unsigned int number = 0; - uint32_t* inputPtr = (uint32_t*)intsToSwap; + const unsigned int nPerSet = 2; + const uint64_t nSets = num_points / nPerSet; - uint8_t shuffleVector[16] = { 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 }; + uint32_t* inputPtr = (uint32_t*)intsToSwap; - const __m128i myShuffle = _mm_loadu_si128((__m128i*) &shuffleVector); + uint8_t shuffleVector[16] = { 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 }; - for ( ;number < nSets; number++ ) { - // Load the 32t values, increment inputPtr later since we're doing it in-place. 
- const __m128i input = _mm_loadu_si128((__m128i*)inputPtr); - const __m128i output = _mm_shuffle_epi8(input,myShuffle); + const __m128i myShuffle = _mm_loadu_si128((__m128i*)&shuffleVector); - // Store the results - _mm_storeu_si128((__m128i*)inputPtr, output); + for (; number < nSets; number++) { + // Load the 32t values, increment inputPtr later since we're doing it in-place. + const __m128i input = _mm_loadu_si128((__m128i*)inputPtr); + const __m128i output = _mm_shuffle_epi8(input, myShuffle); - /* inputPtr is 32bit so increment twice */ - inputPtr += 2 * nPerSet; - } + // Store the results + _mm_storeu_si128((__m128i*)inputPtr, output); - // Byteswap any remaining points: - for(number = nSets * nPerSet; number < num_points; ++number ) { - uint32_t output1 = *inputPtr; - uint32_t output2 = inputPtr[1]; - uint32_t out1 = ((((output1) >> 24) & 0x000000ff) | - (((output1) >> 8) & 0x0000ff00) | - (((output1) << 8) & 0x00ff0000) | - (((output1) << 24) & 0xff000000) ); + /* inputPtr is 32bit so increment twice */ + inputPtr += 2 * nPerSet; + } - uint32_t out2 = ((((output2) >> 24) & 0x000000ff) | - (((output2) >> 8) & 0x0000ff00) | - (((output2) << 8) & 0x00ff0000) | - (((output2) << 24) & 0xff000000) ); - *inputPtr++ = out2; - *inputPtr++ = out1; - } + // Byteswap any remaining points: + for (number = nSets * nPerSet; number < num_points; ++number) { + uint32_t output1 = *inputPtr; + uint32_t output2 = inputPtr[1]; + uint32_t out1 = + ((((output1) >> 24) & 0x000000ff) | (((output1) >> 8) & 0x0000ff00) | + (((output1) << 8) & 0x00ff0000) | (((output1) << 24) & 0xff000000)); + + uint32_t out2 = + ((((output2) >> 24) & 0x000000ff) | (((output2) >> 8) & 0x0000ff00) | + (((output2) << 8) & 0x00ff0000) | (((output2) << 24) & 0xff000000)); + *inputPtr++ = out2; + *inputPtr++ = out1; + } } #endif /* LV_HAVE_SSSE3 */ #ifdef LV_HAVE_GENERIC -static inline void volk_64u_byteswap_a_generic(uint64_t* intsToSwap, unsigned int num_points){ - uint32_t* inputPtr = (uint32_t*)intsToSwap; - unsigned int point; - for(point = 0; point < num_points; point++){ - uint32_t output1 = *inputPtr; - uint32_t output2 = inputPtr[1]; +static inline void volk_64u_byteswap_a_generic(uint64_t* intsToSwap, + unsigned int num_points) +{ + uint32_t* inputPtr = (uint32_t*)intsToSwap; + unsigned int point; + for (point = 0; point < num_points; point++) { + uint32_t output1 = *inputPtr; + uint32_t output2 = inputPtr[1]; - output1 = (((output1 >> 24) & 0xff) | ((output1 >> 8) & 0x0000ff00) | ((output1 << 8) & 0x00ff0000) | ((output1 << 24) & 0xff000000)); + output1 = (((output1 >> 24) & 0xff) | ((output1 >> 8) & 0x0000ff00) | + ((output1 << 8) & 0x00ff0000) | ((output1 << 24) & 0xff000000)); - output2 = (((output2 >> 24) & 0xff) | ((output2 >> 8) & 0x0000ff00) | ((output2 << 8) & 0x00ff0000) | ((output2 << 24) & 0xff000000)); + output2 = (((output2 >> 24) & 0xff) | ((output2 >> 8) & 0x0000ff00) | + ((output2 << 8) & 0x00ff0000) | ((output2 << 24) & 0xff000000)); - *inputPtr++ = output2; - *inputPtr++ = output1; - } + *inputPtr++ = output2; + *inputPtr++ = output1; + } } #endif /* LV_HAVE_GENERIC */ - - #endif /* INCLUDED_volk_64u_byteswap_a_H */ diff --git a/kernels/volk/volk_64u_byteswappuppet_64u.h b/kernels/volk/volk_64u_byteswappuppet_64u.h index 2db0171..ded54ee 100644 --- a/kernels/volk/volk_64u_byteswappuppet_64u.h +++ b/kernels/volk/volk_64u_byteswappuppet_64u.h @@ -3,87 +3,105 @@ #include -#include #include +#include #ifdef LV_HAVE_GENERIC -static inline void volk_64u_byteswappuppet_64u_generic(uint64_t*output, uint64_t* 
intsToSwap, unsigned int num_points){ +static inline void volk_64u_byteswappuppet_64u_generic(uint64_t* output, + uint64_t* intsToSwap, + unsigned int num_points) +{ volk_64u_byteswap_generic((uint64_t*)intsToSwap, num_points); memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint64_t)); - } #endif #ifdef LV_HAVE_NEONV8 -static inline void volk_64u_byteswappuppet_64u_neonv8(uint64_t*output, uint64_t* intsToSwap, unsigned int num_points){ +static inline void volk_64u_byteswappuppet_64u_neonv8(uint64_t* output, + uint64_t* intsToSwap, + unsigned int num_points) +{ volk_64u_byteswap_neonv8((uint64_t*)intsToSwap, num_points); memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint64_t)); - } #else #ifdef LV_HAVE_NEON -static inline void volk_64u_byteswappuppet_64u_neon(uint64_t*output, uint64_t* intsToSwap, unsigned int num_points){ +static inline void volk_64u_byteswappuppet_64u_neon(uint64_t* output, + uint64_t* intsToSwap, + unsigned int num_points) +{ volk_64u_byteswap_neon((uint64_t*)intsToSwap, num_points); memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint64_t)); - } #endif #endif #ifdef LV_HAVE_SSE2 -static inline void volk_64u_byteswappuppet_64u_u_sse2(uint64_t* output, uint64_t* intsToSwap, unsigned int num_points){ +static inline void volk_64u_byteswappuppet_64u_u_sse2(uint64_t* output, + uint64_t* intsToSwap, + unsigned int num_points) +{ volk_64u_byteswap_u_sse2((uint64_t*)intsToSwap, num_points); memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint64_t)); - } #endif #ifdef LV_HAVE_SSE2 -static inline void volk_64u_byteswappuppet_64u_a_sse2(uint64_t* output, uint64_t* intsToSwap, unsigned int num_points){ +static inline void volk_64u_byteswappuppet_64u_a_sse2(uint64_t* output, + uint64_t* intsToSwap, + unsigned int num_points) +{ volk_64u_byteswap_a_sse2((uint64_t*)intsToSwap, num_points); memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint64_t)); - } #endif #ifdef LV_HAVE_SSSE3 -static inline void volk_64u_byteswappuppet_64u_u_ssse3(uint64_t* output, uint64_t* intsToSwap, unsigned int num_points){ +static inline void volk_64u_byteswappuppet_64u_u_ssse3(uint64_t* output, + uint64_t* intsToSwap, + unsigned int num_points) +{ volk_64u_byteswap_u_ssse3((uint64_t*)intsToSwap, num_points); memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint64_t)); - } #endif #ifdef LV_HAVE_SSSE3 -static inline void volk_64u_byteswappuppet_64u_a_ssse3(uint64_t* output, uint64_t* intsToSwap, unsigned int num_points){ +static inline void volk_64u_byteswappuppet_64u_a_ssse3(uint64_t* output, + uint64_t* intsToSwap, + unsigned int num_points) +{ volk_64u_byteswap_a_ssse3((uint64_t*)intsToSwap, num_points); memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint64_t)); - } #endif #ifdef LV_HAVE_AVX2 -static inline void volk_64u_byteswappuppet_64u_u_avx2(uint64_t* output, uint64_t* intsToSwap, unsigned int num_points){ +static inline void volk_64u_byteswappuppet_64u_u_avx2(uint64_t* output, + uint64_t* intsToSwap, + unsigned int num_points) +{ volk_64u_byteswap_u_avx2((uint64_t*)intsToSwap, num_points); memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint64_t)); - } #endif #ifdef LV_HAVE_AVX2 -static inline void volk_64u_byteswappuppet_64u_a_avx2(uint64_t* output, uint64_t* intsToSwap, unsigned int num_points){ +static inline void volk_64u_byteswappuppet_64u_a_avx2(uint64_t* output, + uint64_t* intsToSwap, + unsigned int num_points) +{ volk_64u_byteswap_a_avx2((uint64_t*)intsToSwap, num_points); 
memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint64_t)); - } #endif diff --git a/kernels/volk/volk_64u_popcnt.h b/kernels/volk/volk_64u_popcnt.h index cbce2ec..43c2ae0 100644 --- a/kernels/volk/volk_64u_popcnt.h +++ b/kernels/volk/volk_64u_popcnt.h @@ -60,39 +60,38 @@ #ifndef INCLUDED_volk_64u_popcnt_a_H #define INCLUDED_volk_64u_popcnt_a_H -#include #include +#include #ifdef LV_HAVE_GENERIC -static inline void -volk_64u_popcnt_generic(uint64_t* ret, const uint64_t value) +static inline void volk_64u_popcnt_generic(uint64_t* ret, const uint64_t value) { - //const uint32_t* valueVector = (const uint32_t*)&value; - - // This is faster than a lookup table - //uint32_t retVal = valueVector[0]; - uint32_t retVal = (uint32_t)(value & 0x00000000FFFFFFFFull); - - retVal = (retVal & 0x55555555) + (retVal >> 1 & 0x55555555); - retVal = (retVal & 0x33333333) + (retVal >> 2 & 0x33333333); - retVal = (retVal + (retVal >> 4)) & 0x0F0F0F0F; - retVal = (retVal + (retVal >> 8)); - retVal = (retVal + (retVal >> 16)) & 0x0000003F; - uint64_t retVal64 = retVal; - - //retVal = valueVector[1]; - retVal = (uint32_t)((value & 0xFFFFFFFF00000000ull) >> 32); - retVal = (retVal & 0x55555555) + (retVal >> 1 & 0x55555555); - retVal = (retVal & 0x33333333) + (retVal >> 2 & 0x33333333); - retVal = (retVal + (retVal >> 4)) & 0x0F0F0F0F; - retVal = (retVal + (retVal >> 8)); - retVal = (retVal + (retVal >> 16)) & 0x0000003F; - retVal64 += retVal; - - *ret = retVal64; + // const uint32_t* valueVector = (const uint32_t*)&value; + + // This is faster than a lookup table + // uint32_t retVal = valueVector[0]; + uint32_t retVal = (uint32_t)(value & 0x00000000FFFFFFFFull); + + retVal = (retVal & 0x55555555) + (retVal >> 1 & 0x55555555); + retVal = (retVal & 0x33333333) + (retVal >> 2 & 0x33333333); + retVal = (retVal + (retVal >> 4)) & 0x0F0F0F0F; + retVal = (retVal + (retVal >> 8)); + retVal = (retVal + (retVal >> 16)) & 0x0000003F; + uint64_t retVal64 = retVal; + + // retVal = valueVector[1]; + retVal = (uint32_t)((value & 0xFFFFFFFF00000000ull) >> 32); + retVal = (retVal & 0x55555555) + (retVal >> 1 & 0x55555555); + retVal = (retVal & 0x33333333) + (retVal >> 2 & 0x33333333); + retVal = (retVal + (retVal >> 4)) & 0x0F0F0F0F; + retVal = (retVal + (retVal >> 8)); + retVal = (retVal + (retVal >> 16)) & 0x0000003F; + retVal64 += retVal; + + *ret = retVal64; } #endif /*LV_HAVE_GENERIC*/ @@ -104,7 +103,7 @@ volk_64u_popcnt_generic(uint64_t* ret, const uint64_t value) static inline void volk_64u_popcnt_a_sse4_2(uint64_t* ret, const uint64_t value) { - *ret = _mm_popcnt_u64(value); + *ret = _mm_popcnt_u64(value); } #endif /*LV_HAVE_SSE4_2*/ @@ -114,19 +113,19 @@ static inline void volk_64u_popcnt_a_sse4_2(uint64_t* ret, const uint64_t value) #include static inline void volk_64u_popcnt_neon(uint64_t* ret, const uint64_t value) { - uint8x8_t input_val, count8x8_val; - uint16x4_t count16x4_val; - uint32x2_t count32x2_val; - uint64x1_t count64x1_val; - - input_val = vld1_u8((unsigned char *) &value); - count8x8_val = vcnt_u8(input_val); - count16x4_val = vpaddl_u8(count8x8_val); - count32x2_val = vpaddl_u16(count16x4_val); - count64x1_val = vpaddl_u32(count32x2_val); - vst1_u64(ret, count64x1_val); - - //*ret = _mm_popcnt_u64(value); + uint8x8_t input_val, count8x8_val; + uint16x4_t count16x4_val; + uint32x2_t count32x2_val; + uint64x1_t count64x1_val; + + input_val = vld1_u8((unsigned char*)&value); + count8x8_val = vcnt_u8(input_val); + count16x4_val = vpaddl_u8(count8x8_val); + count32x2_val = 
vpaddl_u16(count16x4_val); + count64x1_val = vpaddl_u32(count32x2_val); + vst1_u64(ret, count64x1_val); + + //*ret = _mm_popcnt_u64(value); } #endif /*LV_HAVE_NEON*/ diff --git a/kernels/volk/volk_64u_popcntpuppet_64u.h b/kernels/volk/volk_64u_popcntpuppet_64u.h index e38ebb3..688281a 100644 --- a/kernels/volk/volk_64u_popcntpuppet_64u.h +++ b/kernels/volk/volk_64u_popcntpuppet_64u.h @@ -23,35 +23,44 @@ #ifndef INCLUDED_volk_64u_popcntpuppet_64u_H #define INCLUDED_volk_64u_popcntpuppet_64u_H -#include #include #include +#include #ifdef LV_HAVE_GENERIC -static inline void volk_64u_popcntpuppet_64u_generic(uint64_t* outVector, const uint64_t* inVector, unsigned int num_points){ +static inline void volk_64u_popcntpuppet_64u_generic(uint64_t* outVector, + const uint64_t* inVector, + unsigned int num_points) +{ unsigned int ii; - for(ii=0; ii < num_points; ++ii) { - volk_64u_popcnt_generic(outVector+ii, num_points ); + for (ii = 0; ii < num_points; ++ii) { + volk_64u_popcnt_generic(outVector + ii, num_points); } memcpy((void*)outVector, (void*)inVector, num_points * sizeof(uint64_t)); } #endif /* LV_HAVE_GENERIC */ #if LV_HAVE_SSE4_2 && LV_HAVE_64 -static inline void volk_64u_popcntpuppet_64u_a_sse4_2(uint64_t* outVector, const uint64_t* inVector, unsigned int num_points){ +static inline void volk_64u_popcntpuppet_64u_a_sse4_2(uint64_t* outVector, + const uint64_t* inVector, + unsigned int num_points) +{ unsigned int ii; - for(ii=0; ii < num_points; ++ii) { - volk_64u_popcnt_a_sse4_2(outVector+ii, num_points ); + for (ii = 0; ii < num_points; ++ii) { + volk_64u_popcnt_a_sse4_2(outVector + ii, num_points); } memcpy((void*)outVector, (void*)inVector, num_points * sizeof(uint64_t)); } #endif /* LV_HAVE_SSE4_2 */ #ifdef LV_HAVE_NEON -static inline void volk_64u_popcntpuppet_64u_neon(uint64_t* outVector, const uint64_t* inVector, unsigned int num_points){ +static inline void volk_64u_popcntpuppet_64u_neon(uint64_t* outVector, + const uint64_t* inVector, + unsigned int num_points) +{ unsigned int ii; - for(ii=0; ii < num_points; ++ii) { - volk_64u_popcnt_neon(outVector+ii, num_points ); + for (ii = 0; ii < num_points; ++ii) { + volk_64u_popcnt_neon(outVector + ii, num_points); } memcpy((void*)outVector, (void*)inVector, num_points * sizeof(uint64_t)); } diff --git a/kernels/volk/volk_8i_convert_16i.h b/kernels/volk/volk_8i_convert_16i.h index 40400c3..69d8f6a 100644 --- a/kernels/volk/volk_8i_convert_16i.h +++ b/kernels/volk/volk_8i_convert_16i.h @@ -30,8 +30,8 @@ * * Dispatcher Prototype * \code - * void volk_8i_convert_16i(int16_t* outputVector, const int8_t* inputVector, unsigned int num_points) - * \endcode + * void volk_8i_convert_16i(int16_t* outputVector, const int8_t* inputVector, unsigned int + * num_points) \endcode * * \b Inputs * \li inputVector: The input vector of 8-bit chars. 
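For reference, a minimal caller sketch for the volk_8i_convert_16i dispatcher documented above. It is not part of this patch; it assumes the usual VOLK allocation helpers (volk_get_alignment, volk_malloc, volk_free) so that the aligned kernels changed below can be dispatched, and it only illustrates the documented prototype.

#include <volk/volk.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
    const unsigned int num_points = 32;
    const size_t alignment = volk_get_alignment();

    /* volk_malloc-aligned buffers allow the a_ (aligned) kernels to be chosen. */
    int8_t* in = (int8_t*)volk_malloc(num_points * sizeof(int8_t), alignment);
    int16_t* out = (int16_t*)volk_malloc(num_points * sizeof(int16_t), alignment);

    for (unsigned int i = 0; i < num_points; i++) {
        in[i] = (int8_t)((int)i - 16);
    }

    /* Each 8-bit sample is scaled by 256 into the upper byte of the
     * 16-bit result, matching the generic kernel in this file. */
    volk_8i_convert_16i(out, in, num_points);

    printf("in[0]=%d -> out[0]=%d\n", in[0], out[0]);

    volk_free(in);
    volk_free(out);
    return 0;
}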
@@ -59,32 +59,32 @@ #ifdef LV_HAVE_AVX2 #include -static inline void -volk_8i_convert_16i_u_avx2(int16_t* outputVector, const int8_t* inputVector, - unsigned int num_points) +static inline void volk_8i_convert_16i_u_avx2(int16_t* outputVector, + const int8_t* inputVector, + unsigned int num_points) { - unsigned int number = 0; - const unsigned int sixteenthPoints = num_points / 16; - - const __m128i* inputVectorPtr = (const __m128i*)inputVector; - __m256i* outputVectorPtr = (__m256i*)outputVector; - __m128i inputVal; - __m256i ret; - - for(;number < sixteenthPoints; number++){ - inputVal = _mm_loadu_si128(inputVectorPtr); - ret = _mm256_cvtepi8_epi16(inputVal); - ret = _mm256_slli_epi16(ret, 8); // Multiply by 256 - _mm256_storeu_si256(outputVectorPtr, ret); - - outputVectorPtr++; - inputVectorPtr++; - } - - number = sixteenthPoints * 16; - for(; number < num_points; number++){ - outputVector[number] = (int16_t)(inputVector[number])*256; - } + unsigned int number = 0; + const unsigned int sixteenthPoints = num_points / 16; + + const __m128i* inputVectorPtr = (const __m128i*)inputVector; + __m256i* outputVectorPtr = (__m256i*)outputVector; + __m128i inputVal; + __m256i ret; + + for (; number < sixteenthPoints; number++) { + inputVal = _mm_loadu_si128(inputVectorPtr); + ret = _mm256_cvtepi8_epi16(inputVal); + ret = _mm256_slli_epi16(ret, 8); // Multiply by 256 + _mm256_storeu_si256(outputVectorPtr, ret); + + outputVectorPtr++; + inputVectorPtr++; + } + + number = sixteenthPoints * 16; + for (; number < num_points; number++) { + outputVector[number] = (int16_t)(inputVector[number]) * 256; + } } #endif /* LV_HAVE_AVX2 */ @@ -92,57 +92,57 @@ volk_8i_convert_16i_u_avx2(int16_t* outputVector, const int8_t* inputVector, #ifdef LV_HAVE_SSE4_1 #include -static inline void -volk_8i_convert_16i_u_sse4_1(int16_t* outputVector, const int8_t* inputVector, - unsigned int num_points) +static inline void volk_8i_convert_16i_u_sse4_1(int16_t* outputVector, + const int8_t* inputVector, + unsigned int num_points) { - unsigned int number = 0; - const unsigned int sixteenthPoints = num_points / 16; + unsigned int number = 0; + const unsigned int sixteenthPoints = num_points / 16; - const __m128i* inputVectorPtr = (const __m128i*)inputVector; - __m128i* outputVectorPtr = (__m128i*)outputVector; - __m128i inputVal; - __m128i ret; + const __m128i* inputVectorPtr = (const __m128i*)inputVector; + __m128i* outputVectorPtr = (__m128i*)outputVector; + __m128i inputVal; + __m128i ret; - for(;number < sixteenthPoints; number++){ - inputVal = _mm_loadu_si128(inputVectorPtr); - ret = _mm_cvtepi8_epi16(inputVal); - ret = _mm_slli_epi16(ret, 8); // Multiply by 256 - _mm_storeu_si128(outputVectorPtr, ret); + for (; number < sixteenthPoints; number++) { + inputVal = _mm_loadu_si128(inputVectorPtr); + ret = _mm_cvtepi8_epi16(inputVal); + ret = _mm_slli_epi16(ret, 8); // Multiply by 256 + _mm_storeu_si128(outputVectorPtr, ret); - outputVectorPtr++; + outputVectorPtr++; - inputVal = _mm_srli_si128(inputVal, 8); - ret = _mm_cvtepi8_epi16(inputVal); - ret = _mm_slli_epi16(ret, 8); // Multiply by 256 - _mm_storeu_si128(outputVectorPtr, ret); + inputVal = _mm_srli_si128(inputVal, 8); + ret = _mm_cvtepi8_epi16(inputVal); + ret = _mm_slli_epi16(ret, 8); // Multiply by 256 + _mm_storeu_si128(outputVectorPtr, ret); - outputVectorPtr++; + outputVectorPtr++; - inputVectorPtr++; - } + inputVectorPtr++; + } - number = sixteenthPoints * 16; - for(; number < num_points; number++){ - outputVector[number] = (int16_t)(inputVector[number])*256; - } 
+ number = sixteenthPoints * 16; + for (; number < num_points; number++) { + outputVector[number] = (int16_t)(inputVector[number]) * 256; + } } #endif /* LV_HAVE_SSE4_1 */ #ifdef LV_HAVE_GENERIC -static inline void -volk_8i_convert_16i_generic(int16_t* outputVector, const int8_t* inputVector, - unsigned int num_points) +static inline void volk_8i_convert_16i_generic(int16_t* outputVector, + const int8_t* inputVector, + unsigned int num_points) { - int16_t* outputVectorPtr = outputVector; - const int8_t* inputVectorPtr = inputVector; - unsigned int number = 0; + int16_t* outputVectorPtr = outputVector; + const int8_t* inputVectorPtr = inputVector; + unsigned int number = 0; - for(number = 0; number < num_points; number++){ - *outputVectorPtr++ = ((int16_t)(*inputVectorPtr++)) * 256; - } + for (number = 0; number < num_points; number++) { + *outputVectorPtr++ = ((int16_t)(*inputVectorPtr++)) * 256; + } } #endif /* LV_HAVE_GENERIC */ @@ -150,7 +150,6 @@ volk_8i_convert_16i_generic(int16_t* outputVector, const int8_t* inputVector, #endif /* INCLUDED_VOLK_8s_CONVERT_16s_UNALIGNED8_H */ - #ifndef INCLUDED_volk_8i_convert_16i_a_H #define INCLUDED_volk_8i_convert_16i_a_H @@ -160,32 +159,32 @@ volk_8i_convert_16i_generic(int16_t* outputVector, const int8_t* inputVector, #ifdef LV_HAVE_AVX2 #include -static inline void -volk_8i_convert_16i_a_avx2(int16_t* outputVector, const int8_t* inputVector, - unsigned int num_points) +static inline void volk_8i_convert_16i_a_avx2(int16_t* outputVector, + const int8_t* inputVector, + unsigned int num_points) { - unsigned int number = 0; - const unsigned int sixteenthPoints = num_points / 16; - - const __m128i* inputVectorPtr = (const __m128i*)inputVector; - __m256i* outputVectorPtr = (__m256i*)outputVector; - __m128i inputVal; - __m256i ret; - - for(;number < sixteenthPoints; number++){ - inputVal = _mm_load_si128(inputVectorPtr); - ret = _mm256_cvtepi8_epi16(inputVal); - ret = _mm256_slli_epi16(ret, 8); // Multiply by 256 - _mm256_store_si256(outputVectorPtr, ret); - - outputVectorPtr++; - inputVectorPtr++; - } - - number = sixteenthPoints * 16; - for(; number < num_points; number++){ - outputVector[number] = (int16_t)(inputVector[number])*256; - } + unsigned int number = 0; + const unsigned int sixteenthPoints = num_points / 16; + + const __m128i* inputVectorPtr = (const __m128i*)inputVector; + __m256i* outputVectorPtr = (__m256i*)outputVector; + __m128i inputVal; + __m256i ret; + + for (; number < sixteenthPoints; number++) { + inputVal = _mm_load_si128(inputVectorPtr); + ret = _mm256_cvtepi8_epi16(inputVal); + ret = _mm256_slli_epi16(ret, 8); // Multiply by 256 + _mm256_store_si256(outputVectorPtr, ret); + + outputVectorPtr++; + inputVectorPtr++; + } + + number = sixteenthPoints * 16; + for (; number < num_points; number++) { + outputVector[number] = (int16_t)(inputVector[number]) * 256; + } } #endif /* LV_HAVE_AVX2 */ @@ -193,57 +192,57 @@ volk_8i_convert_16i_a_avx2(int16_t* outputVector, const int8_t* inputVector, #ifdef LV_HAVE_SSE4_1 #include -static inline void -volk_8i_convert_16i_a_sse4_1(int16_t* outputVector, const int8_t* inputVector, - unsigned int num_points) +static inline void volk_8i_convert_16i_a_sse4_1(int16_t* outputVector, + const int8_t* inputVector, + unsigned int num_points) { - unsigned int number = 0; - const unsigned int sixteenthPoints = num_points / 16; + unsigned int number = 0; + const unsigned int sixteenthPoints = num_points / 16; - const __m128i* inputVectorPtr = (const __m128i*)inputVector; - __m128i* outputVectorPtr = 
(__m128i*)outputVector; - __m128i inputVal; - __m128i ret; + const __m128i* inputVectorPtr = (const __m128i*)inputVector; + __m128i* outputVectorPtr = (__m128i*)outputVector; + __m128i inputVal; + __m128i ret; - for(;number < sixteenthPoints; number++){ - inputVal = _mm_load_si128(inputVectorPtr); - ret = _mm_cvtepi8_epi16(inputVal); - ret = _mm_slli_epi16(ret, 8); // Multiply by 256 - _mm_store_si128(outputVectorPtr, ret); + for (; number < sixteenthPoints; number++) { + inputVal = _mm_load_si128(inputVectorPtr); + ret = _mm_cvtepi8_epi16(inputVal); + ret = _mm_slli_epi16(ret, 8); // Multiply by 256 + _mm_store_si128(outputVectorPtr, ret); - outputVectorPtr++; + outputVectorPtr++; - inputVal = _mm_srli_si128(inputVal, 8); - ret = _mm_cvtepi8_epi16(inputVal); - ret = _mm_slli_epi16(ret, 8); // Multiply by 256 - _mm_store_si128(outputVectorPtr, ret); + inputVal = _mm_srli_si128(inputVal, 8); + ret = _mm_cvtepi8_epi16(inputVal); + ret = _mm_slli_epi16(ret, 8); // Multiply by 256 + _mm_store_si128(outputVectorPtr, ret); - outputVectorPtr++; + outputVectorPtr++; - inputVectorPtr++; - } + inputVectorPtr++; + } - number = sixteenthPoints * 16; - for(; number < num_points; number++){ - outputVector[number] = (int16_t)(inputVector[number])*256; - } + number = sixteenthPoints * 16; + for (; number < num_points; number++) { + outputVector[number] = (int16_t)(inputVector[number]) * 256; + } } #endif /* LV_HAVE_SSE4_1 */ #ifdef LV_HAVE_GENERIC -static inline void -volk_8i_convert_16i_a_generic(int16_t* outputVector, const int8_t* inputVector, - unsigned int num_points) +static inline void volk_8i_convert_16i_a_generic(int16_t* outputVector, + const int8_t* inputVector, + unsigned int num_points) { - int16_t* outputVectorPtr = outputVector; - const int8_t* inputVectorPtr = inputVector; - unsigned int number = 0; + int16_t* outputVectorPtr = outputVector; + const int8_t* inputVectorPtr = inputVector; + unsigned int number = 0; - for(number = 0; number < num_points; number++){ - *outputVectorPtr++ = ((int16_t)(*inputVectorPtr++)) * 256; - } + for (number = 0; number < num_points; number++) { + *outputVectorPtr++ = ((int16_t)(*inputVectorPtr++)) * 256; + } } #endif /* LV_HAVE_GENERIC */ @@ -251,51 +250,51 @@ volk_8i_convert_16i_a_generic(int16_t* outputVector, const int8_t* inputVector, #ifdef LV_HAVE_NEON #include -static inline void -volk_8i_convert_16i_neon(int16_t* outputVector, const int8_t* inputVector, unsigned int num_points) +static inline void volk_8i_convert_16i_neon(int16_t* outputVector, + const int8_t* inputVector, + unsigned int num_points) { - int16_t* outputVectorPtr = outputVector; - const int8_t* inputVectorPtr = inputVector; - unsigned int number; - const unsigned int eighth_points = num_points / 8; - - int8x8_t input_vec ; - int16x8_t converted_vec; - - // NEON doesn't have a concept of 8 bit registers, so we are really - // dealing with the low half of 16-bit registers. Since this requires - // a move instruction we likely do better with ASM here. 
- for(number = 0; number < eighth_points; ++number) { - input_vec = vld1_s8(inputVectorPtr); - converted_vec = vmovl_s8(input_vec); - //converted_vec = vmulq_s16(converted_vec, scale_factor); - converted_vec = vshlq_n_s16(converted_vec, 8); - vst1q_s16( outputVectorPtr, converted_vec); - - inputVectorPtr += 8; - outputVectorPtr += 8; - } - - for(number = eighth_points * 8; number < num_points; number++){ - *outputVectorPtr++ = ((int16_t)(*inputVectorPtr++)) * 256; - } + int16_t* outputVectorPtr = outputVector; + const int8_t* inputVectorPtr = inputVector; + unsigned int number; + const unsigned int eighth_points = num_points / 8; + + int8x8_t input_vec; + int16x8_t converted_vec; + + // NEON doesn't have a concept of 8 bit registers, so we are really + // dealing with the low half of 16-bit registers. Since this requires + // a move instruction we likely do better with ASM here. + for (number = 0; number < eighth_points; ++number) { + input_vec = vld1_s8(inputVectorPtr); + converted_vec = vmovl_s8(input_vec); + // converted_vec = vmulq_s16(converted_vec, scale_factor); + converted_vec = vshlq_n_s16(converted_vec, 8); + vst1q_s16(outputVectorPtr, converted_vec); + + inputVectorPtr += 8; + outputVectorPtr += 8; + } + + for (number = eighth_points * 8; number < num_points; number++) { + *outputVectorPtr++ = ((int16_t)(*inputVectorPtr++)) * 256; + } } #endif /* LV_HAVE_NEON */ #ifdef LV_HAVE_ORC -extern void -volk_8i_convert_16i_a_orc_impl(int16_t* outputVector, const int8_t* inputVector, - unsigned int num_points); +extern void volk_8i_convert_16i_a_orc_impl(int16_t* outputVector, + const int8_t* inputVector, + unsigned int num_points); -static inline void -volk_8i_convert_16i_u_orc(int16_t* outputVector, const int8_t* inputVector, - unsigned int num_points) +static inline void volk_8i_convert_16i_u_orc(int16_t* outputVector, + const int8_t* inputVector, + unsigned int num_points) { - volk_8i_convert_16i_a_orc_impl(outputVector, inputVector, num_points); + volk_8i_convert_16i_a_orc_impl(outputVector, inputVector, num_points); } #endif /* LV_HAVE_ORC */ - #endif /* INCLUDED_VOLK_8s_CONVERT_16s_ALIGNED8_H */ diff --git a/kernels/volk/volk_8i_s32f_convert_32f.h b/kernels/volk/volk_8i_s32f_convert_32f.h index 97d160b..c3d5666 100644 --- a/kernels/volk/volk_8i_s32f_convert_32f.h +++ b/kernels/volk/volk_8i_s32f_convert_32f.h @@ -30,8 +30,8 @@ * * Dispatcher Prototype * \code - * void volk_8i_s32f_convert_32f(float* outputVector, const int8_t* inputVector, const float scalar, unsigned int num_points) - * \endcode + * void volk_8i_s32f_convert_32f(float* outputVector, const int8_t* inputVector, const + * float scalar, unsigned int num_points) \endcode * * \b Inputs * \li inputVector: The input vector of 8-bit chars. 
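As a plain-C reference for the volk_8i_s32f_convert_32f dispatcher documented above (a sketch, not part of this patch): each int8_t sample is multiplied by 1/scalar, exactly as the generic kernel below does, so passing scalar = 128.0f maps the 8-bit range roughly onto [-1.0, 1.0). The helper name is illustrative only.

#include <stdint.h>

/* Illustrative reference only; mirrors the arithmetic of the generic kernel. */
static void convert_8i_to_32f_reference(float* out,
                                        const int8_t* in,
                                        const float scalar,
                                        unsigned int num_points)
{
    const float inv_scalar = 1.0f / scalar; /* the kernels call this iScalar */
    for (unsigned int n = 0; n < num_points; n++) {
        out[n] = (float)in[n] * inv_scalar;
    }
}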
@@ -60,44 +60,45 @@ #ifdef LV_HAVE_AVX2 #include -static inline void -volk_8i_s32f_convert_32f_u_avx2(float* outputVector, const int8_t* inputVector, - const float scalar, unsigned int num_points) +static inline void volk_8i_s32f_convert_32f_u_avx2(float* outputVector, + const int8_t* inputVector, + const float scalar, + unsigned int num_points) { - unsigned int number = 0; - const unsigned int sixteenthPoints = num_points / 16; - - float* outputVectorPtr = outputVector; - const float iScalar = 1.0 / scalar; - __m256 invScalar = _mm256_set1_ps( iScalar ); - const int8_t* inputVectorPtr = inputVector; - __m256 ret; - __m128i inputVal128; - __m256i interimVal; - - for(;number < sixteenthPoints; number++){ - inputVal128 = _mm_loadu_si128((__m128i*)inputVectorPtr); - - interimVal = _mm256_cvtepi8_epi32(inputVal128); - ret = _mm256_cvtepi32_ps(interimVal); - ret = _mm256_mul_ps(ret, invScalar); - _mm256_storeu_ps(outputVectorPtr, ret); - outputVectorPtr += 8; - - inputVal128 = _mm_srli_si128(inputVal128, 8); - interimVal = _mm256_cvtepi8_epi32(inputVal128); - ret = _mm256_cvtepi32_ps(interimVal); - ret = _mm256_mul_ps(ret, invScalar); - _mm256_storeu_ps(outputVectorPtr, ret); - outputVectorPtr += 8; - - inputVectorPtr += 16; - } - - number = sixteenthPoints * 16; - for(; number < num_points; number++){ - outputVector[number] = (float)(inputVector[number]) * iScalar; - } + unsigned int number = 0; + const unsigned int sixteenthPoints = num_points / 16; + + float* outputVectorPtr = outputVector; + const float iScalar = 1.0 / scalar; + __m256 invScalar = _mm256_set1_ps(iScalar); + const int8_t* inputVectorPtr = inputVector; + __m256 ret; + __m128i inputVal128; + __m256i interimVal; + + for (; number < sixteenthPoints; number++) { + inputVal128 = _mm_loadu_si128((__m128i*)inputVectorPtr); + + interimVal = _mm256_cvtepi8_epi32(inputVal128); + ret = _mm256_cvtepi32_ps(interimVal); + ret = _mm256_mul_ps(ret, invScalar); + _mm256_storeu_ps(outputVectorPtr, ret); + outputVectorPtr += 8; + + inputVal128 = _mm_srli_si128(inputVal128, 8); + interimVal = _mm256_cvtepi8_epi32(inputVal128); + ret = _mm256_cvtepi32_ps(interimVal); + ret = _mm256_mul_ps(ret, invScalar); + _mm256_storeu_ps(outputVectorPtr, ret); + outputVectorPtr += 8; + + inputVectorPtr += 16; + } + + number = sixteenthPoints * 16; + for (; number < num_points; number++) { + outputVector[number] = (float)(inputVector[number]) * iScalar; + } } #endif /* LV_HAVE_AVX2 */ @@ -105,80 +106,81 @@ volk_8i_s32f_convert_32f_u_avx2(float* outputVector, const int8_t* inputVector, #ifdef LV_HAVE_SSE4_1 #include -static inline void -volk_8i_s32f_convert_32f_u_sse4_1(float* outputVector, const int8_t* inputVector, - const float scalar, unsigned int num_points) +static inline void volk_8i_s32f_convert_32f_u_sse4_1(float* outputVector, + const int8_t* inputVector, + const float scalar, + unsigned int num_points) { - unsigned int number = 0; - const unsigned int sixteenthPoints = num_points / 16; - - float* outputVectorPtr = outputVector; - const float iScalar = 1.0 / scalar; - __m128 invScalar = _mm_set_ps1( iScalar ); - const int8_t* inputVectorPtr = inputVector; - __m128 ret; - __m128i inputVal; - __m128i interimVal; - - for(;number < sixteenthPoints; number++){ - inputVal = _mm_loadu_si128((__m128i*)inputVectorPtr); - - interimVal = _mm_cvtepi8_epi32(inputVal); - ret = _mm_cvtepi32_ps(interimVal); - ret = _mm_mul_ps(ret, invScalar); - _mm_storeu_ps(outputVectorPtr, ret); - outputVectorPtr += 4; - - inputVal = _mm_srli_si128(inputVal, 4); - interimVal = 
_mm_cvtepi8_epi32(inputVal); - ret = _mm_cvtepi32_ps(interimVal); - ret = _mm_mul_ps(ret, invScalar); - _mm_storeu_ps(outputVectorPtr, ret); - outputVectorPtr += 4; - - inputVal = _mm_srli_si128(inputVal, 4); - interimVal = _mm_cvtepi8_epi32(inputVal); - ret = _mm_cvtepi32_ps(interimVal); - ret = _mm_mul_ps(ret, invScalar); - _mm_storeu_ps(outputVectorPtr, ret); - outputVectorPtr += 4; - - inputVal = _mm_srli_si128(inputVal, 4); - interimVal = _mm_cvtepi8_epi32(inputVal); - ret = _mm_cvtepi32_ps(interimVal); - ret = _mm_mul_ps(ret, invScalar); - _mm_storeu_ps(outputVectorPtr, ret); - outputVectorPtr += 4; - - inputVectorPtr += 16; - } - - number = sixteenthPoints * 16; - for(; number < num_points; number++){ - outputVector[number] = (float)(inputVector[number]) * iScalar; - } + unsigned int number = 0; + const unsigned int sixteenthPoints = num_points / 16; + + float* outputVectorPtr = outputVector; + const float iScalar = 1.0 / scalar; + __m128 invScalar = _mm_set_ps1(iScalar); + const int8_t* inputVectorPtr = inputVector; + __m128 ret; + __m128i inputVal; + __m128i interimVal; + + for (; number < sixteenthPoints; number++) { + inputVal = _mm_loadu_si128((__m128i*)inputVectorPtr); + + interimVal = _mm_cvtepi8_epi32(inputVal); + ret = _mm_cvtepi32_ps(interimVal); + ret = _mm_mul_ps(ret, invScalar); + _mm_storeu_ps(outputVectorPtr, ret); + outputVectorPtr += 4; + + inputVal = _mm_srli_si128(inputVal, 4); + interimVal = _mm_cvtepi8_epi32(inputVal); + ret = _mm_cvtepi32_ps(interimVal); + ret = _mm_mul_ps(ret, invScalar); + _mm_storeu_ps(outputVectorPtr, ret); + outputVectorPtr += 4; + + inputVal = _mm_srli_si128(inputVal, 4); + interimVal = _mm_cvtepi8_epi32(inputVal); + ret = _mm_cvtepi32_ps(interimVal); + ret = _mm_mul_ps(ret, invScalar); + _mm_storeu_ps(outputVectorPtr, ret); + outputVectorPtr += 4; + + inputVal = _mm_srli_si128(inputVal, 4); + interimVal = _mm_cvtepi8_epi32(inputVal); + ret = _mm_cvtepi32_ps(interimVal); + ret = _mm_mul_ps(ret, invScalar); + _mm_storeu_ps(outputVectorPtr, ret); + outputVectorPtr += 4; + + inputVectorPtr += 16; + } + + number = sixteenthPoints * 16; + for (; number < num_points; number++) { + outputVector[number] = (float)(inputVector[number]) * iScalar; + } } #endif /* LV_HAVE_SSE4_1 */ #ifdef LV_HAVE_GENERIC -static inline void -volk_8i_s32f_convert_32f_generic(float* outputVector, const int8_t* inputVector, - const float scalar, unsigned int num_points) +static inline void volk_8i_s32f_convert_32f_generic(float* outputVector, + const int8_t* inputVector, + const float scalar, + unsigned int num_points) { - float* outputVectorPtr = outputVector; - const int8_t* inputVectorPtr = inputVector; - unsigned int number = 0; - const float iScalar = 1.0 / scalar; - - for(number = 0; number < num_points; number++){ - *outputVectorPtr++ = ((float)(*inputVectorPtr++)) * iScalar; - } + float* outputVectorPtr = outputVector; + const int8_t* inputVectorPtr = inputVector; + unsigned int number = 0; + const float iScalar = 1.0 / scalar; + + for (number = 0; number < num_points; number++) { + *outputVectorPtr++ = ((float)(*inputVectorPtr++)) * iScalar; + } } #endif /* LV_HAVE_GENERIC */ - #endif /* INCLUDED_VOLK_8s_CONVERT_32f_UNALIGNED8_H */ #ifndef INCLUDED_volk_8i_s32f_convert_32f_a_H @@ -190,195 +192,199 @@ volk_8i_s32f_convert_32f_generic(float* outputVector, const int8_t* inputVector, #ifdef LV_HAVE_AVX2 #include -static inline void -volk_8i_s32f_convert_32f_a_avx2(float* outputVector, const int8_t* inputVector, - const float scalar, unsigned int num_points) +static 
inline void volk_8i_s32f_convert_32f_a_avx2(float* outputVector, + const int8_t* inputVector, + const float scalar, + unsigned int num_points) { - unsigned int number = 0; - const unsigned int sixteenthPoints = num_points / 16; - - float* outputVectorPtr = outputVector; - const float iScalar = 1.0 / scalar; - __m256 invScalar = _mm256_set1_ps( iScalar ); - const int8_t* inputVectorPtr = inputVector; - __m256 ret; - __m128i inputVal128; - __m256i interimVal; - - for(;number < sixteenthPoints; number++){ - inputVal128 = _mm_load_si128((__m128i*)inputVectorPtr); - - interimVal = _mm256_cvtepi8_epi32(inputVal128); - ret = _mm256_cvtepi32_ps(interimVal); - ret = _mm256_mul_ps(ret, invScalar); - _mm256_store_ps(outputVectorPtr, ret); - outputVectorPtr += 8; - - inputVal128 = _mm_srli_si128(inputVal128, 8); - interimVal = _mm256_cvtepi8_epi32(inputVal128); - ret = _mm256_cvtepi32_ps(interimVal); - ret = _mm256_mul_ps(ret, invScalar); - _mm256_store_ps(outputVectorPtr, ret); - outputVectorPtr += 8; - - inputVectorPtr += 16; - } - - number = sixteenthPoints * 16; - for(; number < num_points; number++){ - outputVector[number] = (float)(inputVector[number]) * iScalar; - } + unsigned int number = 0; + const unsigned int sixteenthPoints = num_points / 16; + + float* outputVectorPtr = outputVector; + const float iScalar = 1.0 / scalar; + __m256 invScalar = _mm256_set1_ps(iScalar); + const int8_t* inputVectorPtr = inputVector; + __m256 ret; + __m128i inputVal128; + __m256i interimVal; + + for (; number < sixteenthPoints; number++) { + inputVal128 = _mm_load_si128((__m128i*)inputVectorPtr); + + interimVal = _mm256_cvtepi8_epi32(inputVal128); + ret = _mm256_cvtepi32_ps(interimVal); + ret = _mm256_mul_ps(ret, invScalar); + _mm256_store_ps(outputVectorPtr, ret); + outputVectorPtr += 8; + + inputVal128 = _mm_srli_si128(inputVal128, 8); + interimVal = _mm256_cvtepi8_epi32(inputVal128); + ret = _mm256_cvtepi32_ps(interimVal); + ret = _mm256_mul_ps(ret, invScalar); + _mm256_store_ps(outputVectorPtr, ret); + outputVectorPtr += 8; + + inputVectorPtr += 16; + } + + number = sixteenthPoints * 16; + for (; number < num_points; number++) { + outputVector[number] = (float)(inputVector[number]) * iScalar; + } } #endif /* LV_HAVE_AVX2 */ #ifdef LV_HAVE_SSE4_1 #include -static inline void -volk_8i_s32f_convert_32f_a_sse4_1(float* outputVector, const int8_t* inputVector, - const float scalar, unsigned int num_points) +static inline void volk_8i_s32f_convert_32f_a_sse4_1(float* outputVector, + const int8_t* inputVector, + const float scalar, + unsigned int num_points) { - unsigned int number = 0; - const unsigned int sixteenthPoints = num_points / 16; - - float* outputVectorPtr = outputVector; - const float iScalar = 1.0 / scalar; - __m128 invScalar = _mm_set_ps1(iScalar); - const int8_t* inputVectorPtr = inputVector; - __m128 ret; - __m128i inputVal; - __m128i interimVal; - - for(;number < sixteenthPoints; number++){ - inputVal = _mm_load_si128((__m128i*)inputVectorPtr); - - interimVal = _mm_cvtepi8_epi32(inputVal); - ret = _mm_cvtepi32_ps(interimVal); - ret = _mm_mul_ps(ret, invScalar); - _mm_store_ps(outputVectorPtr, ret); - outputVectorPtr += 4; - - inputVal = _mm_srli_si128(inputVal, 4); - interimVal = _mm_cvtepi8_epi32(inputVal); - ret = _mm_cvtepi32_ps(interimVal); - ret = _mm_mul_ps(ret, invScalar); - _mm_store_ps(outputVectorPtr, ret); - outputVectorPtr += 4; - - inputVal = _mm_srli_si128(inputVal, 4); - interimVal = _mm_cvtepi8_epi32(inputVal); - ret = _mm_cvtepi32_ps(interimVal); - ret = _mm_mul_ps(ret, 
invScalar); - _mm_store_ps(outputVectorPtr, ret); - outputVectorPtr += 4; - - inputVal = _mm_srli_si128(inputVal, 4); - interimVal = _mm_cvtepi8_epi32(inputVal); - ret = _mm_cvtepi32_ps(interimVal); - ret = _mm_mul_ps(ret, invScalar); - _mm_store_ps(outputVectorPtr, ret); - outputVectorPtr += 4; - - inputVectorPtr += 16; - } - - number = sixteenthPoints * 16; - for(; number < num_points; number++){ - outputVector[number] = (float)(inputVector[number]) * iScalar; - } + unsigned int number = 0; + const unsigned int sixteenthPoints = num_points / 16; + + float* outputVectorPtr = outputVector; + const float iScalar = 1.0 / scalar; + __m128 invScalar = _mm_set_ps1(iScalar); + const int8_t* inputVectorPtr = inputVector; + __m128 ret; + __m128i inputVal; + __m128i interimVal; + + for (; number < sixteenthPoints; number++) { + inputVal = _mm_load_si128((__m128i*)inputVectorPtr); + + interimVal = _mm_cvtepi8_epi32(inputVal); + ret = _mm_cvtepi32_ps(interimVal); + ret = _mm_mul_ps(ret, invScalar); + _mm_store_ps(outputVectorPtr, ret); + outputVectorPtr += 4; + + inputVal = _mm_srli_si128(inputVal, 4); + interimVal = _mm_cvtepi8_epi32(inputVal); + ret = _mm_cvtepi32_ps(interimVal); + ret = _mm_mul_ps(ret, invScalar); + _mm_store_ps(outputVectorPtr, ret); + outputVectorPtr += 4; + + inputVal = _mm_srli_si128(inputVal, 4); + interimVal = _mm_cvtepi8_epi32(inputVal); + ret = _mm_cvtepi32_ps(interimVal); + ret = _mm_mul_ps(ret, invScalar); + _mm_store_ps(outputVectorPtr, ret); + outputVectorPtr += 4; + + inputVal = _mm_srli_si128(inputVal, 4); + interimVal = _mm_cvtepi8_epi32(inputVal); + ret = _mm_cvtepi32_ps(interimVal); + ret = _mm_mul_ps(ret, invScalar); + _mm_store_ps(outputVectorPtr, ret); + outputVectorPtr += 4; + + inputVectorPtr += 16; + } + + number = sixteenthPoints * 16; + for (; number < num_points; number++) { + outputVector[number] = (float)(inputVector[number]) * iScalar; + } } #endif /* LV_HAVE_SSE4_1 */ #ifdef LV_HAVE_NEON #include -static inline void -volk_8i_s32f_convert_32f_neon(float* outputVector, const int8_t* inputVector, - const float scalar, unsigned int num_points) +static inline void volk_8i_s32f_convert_32f_neon(float* outputVector, + const int8_t* inputVector, + const float scalar, + unsigned int num_points) { - float* outputVectorPtr = outputVector; - const int8_t* inputVectorPtr = inputVector; - - const float iScalar = 1.0 / scalar; - const float32x4_t qiScalar = vdupq_n_f32(iScalar); - - int8x8x2_t inputVal; - float32x4x2_t outputFloat; - int16x8_t tmp; - - unsigned int number = 0; - const unsigned int sixteenthPoints = num_points / 16; - for(;number < sixteenthPoints; number++){ - __VOLK_PREFETCH(inputVectorPtr+16); - - inputVal = vld2_s8(inputVectorPtr); - inputVal = vzip_s8(inputVal.val[0], inputVal.val[1]); - inputVectorPtr += 16; - - tmp = vmovl_s8(inputVal.val[0]); - - outputFloat.val[0] = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp))); - outputFloat.val[0] = vmulq_f32(outputFloat.val[0], qiScalar); - vst1q_f32(outputVectorPtr, outputFloat.val[0]); - outputVectorPtr += 4; - - outputFloat.val[1] = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp))); - outputFloat.val[1] = vmulq_f32(outputFloat.val[1], qiScalar); - vst1q_f32(outputVectorPtr, outputFloat.val[1]); - outputVectorPtr += 4; - - tmp = vmovl_s8(inputVal.val[1]); - - outputFloat.val[0] = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp))); - outputFloat.val[0] = vmulq_f32(outputFloat.val[0], qiScalar); - vst1q_f32(outputVectorPtr, outputFloat.val[0]); - outputVectorPtr += 4; - - outputFloat.val[1] = 
vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp))); - outputFloat.val[1] = vmulq_f32(outputFloat.val[1], qiScalar); - vst1q_f32(outputVectorPtr, outputFloat.val[1]); - outputVectorPtr += 4; - } - for(number = sixteenthPoints * 16; number < num_points; number++){ - *outputVectorPtr++ = ((float)(*inputVectorPtr++)) * iScalar; - } + float* outputVectorPtr = outputVector; + const int8_t* inputVectorPtr = inputVector; + + const float iScalar = 1.0 / scalar; + const float32x4_t qiScalar = vdupq_n_f32(iScalar); + + int8x8x2_t inputVal; + float32x4x2_t outputFloat; + int16x8_t tmp; + + unsigned int number = 0; + const unsigned int sixteenthPoints = num_points / 16; + for (; number < sixteenthPoints; number++) { + __VOLK_PREFETCH(inputVectorPtr + 16); + + inputVal = vld2_s8(inputVectorPtr); + inputVal = vzip_s8(inputVal.val[0], inputVal.val[1]); + inputVectorPtr += 16; + + tmp = vmovl_s8(inputVal.val[0]); + + outputFloat.val[0] = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp))); + outputFloat.val[0] = vmulq_f32(outputFloat.val[0], qiScalar); + vst1q_f32(outputVectorPtr, outputFloat.val[0]); + outputVectorPtr += 4; + + outputFloat.val[1] = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp))); + outputFloat.val[1] = vmulq_f32(outputFloat.val[1], qiScalar); + vst1q_f32(outputVectorPtr, outputFloat.val[1]); + outputVectorPtr += 4; + + tmp = vmovl_s8(inputVal.val[1]); + + outputFloat.val[0] = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp))); + outputFloat.val[0] = vmulq_f32(outputFloat.val[0], qiScalar); + vst1q_f32(outputVectorPtr, outputFloat.val[0]); + outputVectorPtr += 4; + + outputFloat.val[1] = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp))); + outputFloat.val[1] = vmulq_f32(outputFloat.val[1], qiScalar); + vst1q_f32(outputVectorPtr, outputFloat.val[1]); + outputVectorPtr += 4; + } + for (number = sixteenthPoints * 16; number < num_points; number++) { + *outputVectorPtr++ = ((float)(*inputVectorPtr++)) * iScalar; + } } #endif /* LV_HAVE_NEON */ #ifdef LV_HAVE_GENERIC -static inline void -volk_8i_s32f_convert_32f_a_generic(float* outputVector, const int8_t* inputVector, - const float scalar, unsigned int num_points) +static inline void volk_8i_s32f_convert_32f_a_generic(float* outputVector, + const int8_t* inputVector, + const float scalar, + unsigned int num_points) { - float* outputVectorPtr = outputVector; - const int8_t* inputVectorPtr = inputVector; - unsigned int number = 0; - const float iScalar = 1.0 / scalar; - - for(number = 0; number < num_points; number++){ - *outputVectorPtr++ = ((float)(*inputVectorPtr++)) * iScalar; - } + float* outputVectorPtr = outputVector; + const int8_t* inputVectorPtr = inputVector; + unsigned int number = 0; + const float iScalar = 1.0 / scalar; + + for (number = 0; number < num_points; number++) { + *outputVectorPtr++ = ((float)(*inputVectorPtr++)) * iScalar; + } } #endif /* LV_HAVE_GENERIC */ #ifdef LV_HAVE_ORC -extern void -volk_8i_s32f_convert_32f_a_orc_impl(float* outputVector, const int8_t* inputVector, - const float scalar, unsigned int num_points); - -static inline void -volk_8i_s32f_convert_32f_u_orc(float* outputVector, const int8_t* inputVector, - const float scalar, unsigned int num_points) +extern void volk_8i_s32f_convert_32f_a_orc_impl(float* outputVector, + const int8_t* inputVector, + const float scalar, + unsigned int num_points); + +static inline void volk_8i_s32f_convert_32f_u_orc(float* outputVector, + const int8_t* inputVector, + const float scalar, + unsigned int num_points) { - float invscalar = 1.0 / scalar; - volk_8i_s32f_convert_32f_a_orc_impl(outputVector, 
inputVector, invscalar, num_points); + float invscalar = 1.0 / scalar; + volk_8i_s32f_convert_32f_a_orc_impl(outputVector, inputVector, invscalar, num_points); } #endif /* LV_HAVE_ORC */ - #endif /* INCLUDED_VOLK_8s_CONVERT_32f_ALIGNED8_H */ - diff --git a/kernels/volk/volk_8ic_deinterleave_16i_x2.h b/kernels/volk/volk_8ic_deinterleave_16i_x2.h index b4cf251..fa998a0 100644 --- a/kernels/volk/volk_8ic_deinterleave_16i_x2.h +++ b/kernels/volk/volk_8ic_deinterleave_16i_x2.h @@ -30,8 +30,8 @@ * * Dispatcher Prototype * \code - * void volk_8ic_deinterleave_16i_x2(int16_t* iBuffer, int16_t* qBuffer, const lv_8sc_t* complexVector, unsigned int num_points) - * \endcode + * void volk_8ic_deinterleave_16i_x2(int16_t* iBuffer, int16_t* qBuffer, const lv_8sc_t* + * complexVector, unsigned int num_points) \endcode * * \b Inputs * \li complexVector: The complex input vector. @@ -60,91 +60,150 @@ #ifdef LV_HAVE_AVX2 #include -static inline void -volk_8ic_deinterleave_16i_x2_a_avx2(int16_t* iBuffer, int16_t* qBuffer, - const lv_8sc_t* complexVector, unsigned int num_points) +static inline void volk_8ic_deinterleave_16i_x2_a_avx2(int16_t* iBuffer, + int16_t* qBuffer, + const lv_8sc_t* complexVector, + unsigned int num_points) { - unsigned int number = 0; - const int8_t* complexVectorPtr = (int8_t*)complexVector; - int16_t* iBufferPtr = iBuffer; - int16_t* qBufferPtr = qBuffer; - __m256i MoveMask = _mm256_set_epi8(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0, 15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0); - __m256i complexVal, iOutputVal, qOutputVal; - __m128i iOutputVal0, qOutputVal0; - - unsigned int sixteenthPoints = num_points / 16; - - for(number = 0; number < sixteenthPoints; number++){ - complexVal = _mm256_load_si256((__m256i*)complexVectorPtr); complexVectorPtr += 32; - - complexVal = _mm256_shuffle_epi8(complexVal, MoveMask); - complexVal = _mm256_permute4x64_epi64(complexVal, 0xd8); - - iOutputVal0 = _mm256_extracti128_si256(complexVal, 0); - qOutputVal0 = _mm256_extracti128_si256(complexVal, 1); - - iOutputVal = _mm256_cvtepi8_epi16(iOutputVal0); - iOutputVal = _mm256_slli_epi16(iOutputVal, 8); - - qOutputVal = _mm256_cvtepi8_epi16(qOutputVal0); - qOutputVal = _mm256_slli_epi16(qOutputVal, 8); - - _mm256_store_si256((__m256i*)iBufferPtr, iOutputVal); - _mm256_store_si256((__m256i*)qBufferPtr, qOutputVal); - - iBufferPtr += 16; - qBufferPtr += 16; - } - - number = sixteenthPoints * 16; - for(; number < num_points; number++){ - *iBufferPtr++ = ((int16_t)*complexVectorPtr++) * 256; // load 8 bit Complexvector into 16 bit, shift left by 8 bits and store - *qBufferPtr++ = ((int16_t)*complexVectorPtr++) * 256; - } + unsigned int number = 0; + const int8_t* complexVectorPtr = (int8_t*)complexVector; + int16_t* iBufferPtr = iBuffer; + int16_t* qBufferPtr = qBuffer; + __m256i MoveMask = _mm256_set_epi8(15, + 13, + 11, + 9, + 7, + 5, + 3, + 1, + 14, + 12, + 10, + 8, + 6, + 4, + 2, + 0, + 15, + 13, + 11, + 9, + 7, + 5, + 3, + 1, + 14, + 12, + 10, + 8, + 6, + 4, + 2, + 0); + __m256i complexVal, iOutputVal, qOutputVal; + __m128i iOutputVal0, qOutputVal0; + + unsigned int sixteenthPoints = num_points / 16; + + for (number = 0; number < sixteenthPoints; number++) { + complexVal = _mm256_load_si256((__m256i*)complexVectorPtr); + complexVectorPtr += 32; + + complexVal = _mm256_shuffle_epi8(complexVal, MoveMask); + complexVal = _mm256_permute4x64_epi64(complexVal, 0xd8); + + iOutputVal0 = _mm256_extracti128_si256(complexVal, 0); + qOutputVal0 = _mm256_extracti128_si256(complexVal, 1); + + 
iOutputVal = _mm256_cvtepi8_epi16(iOutputVal0); + iOutputVal = _mm256_slli_epi16(iOutputVal, 8); + + qOutputVal = _mm256_cvtepi8_epi16(qOutputVal0); + qOutputVal = _mm256_slli_epi16(qOutputVal, 8); + + _mm256_store_si256((__m256i*)iBufferPtr, iOutputVal); + _mm256_store_si256((__m256i*)qBufferPtr, qOutputVal); + + iBufferPtr += 16; + qBufferPtr += 16; + } + + number = sixteenthPoints * 16; + for (; number < num_points; number++) { + *iBufferPtr++ = + ((int16_t)*complexVectorPtr++) * + 256; // load 8 bit Complexvector into 16 bit, shift left by 8 bits and store + *qBufferPtr++ = ((int16_t)*complexVectorPtr++) * 256; + } } #endif /* LV_HAVE_AVX2 */ #ifdef LV_HAVE_SSE4_1 #include -static inline void -volk_8ic_deinterleave_16i_x2_a_sse4_1(int16_t* iBuffer, int16_t* qBuffer, - const lv_8sc_t* complexVector, unsigned int num_points) +static inline void volk_8ic_deinterleave_16i_x2_a_sse4_1(int16_t* iBuffer, + int16_t* qBuffer, + const lv_8sc_t* complexVector, + unsigned int num_points) { - unsigned int number = 0; - const int8_t* complexVectorPtr = (int8_t*)complexVector; - int16_t* iBufferPtr = iBuffer; - int16_t* qBufferPtr = qBuffer; - __m128i iMoveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0); // set 16 byte values - __m128i qMoveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 15, 13, 11, 9, 7, 5, 3, 1); - __m128i complexVal, iOutputVal, qOutputVal; - - unsigned int eighthPoints = num_points / 8; - - for(number = 0; number < eighthPoints; number++){ - complexVal = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; // aligned load - - iOutputVal = _mm_shuffle_epi8(complexVal, iMoveMask); // shuffle 16 bytes of 128bit complexVal - qOutputVal = _mm_shuffle_epi8(complexVal, qMoveMask); - - iOutputVal = _mm_cvtepi8_epi16(iOutputVal); // fills 2-byte sign extended versions of lower 8 bytes of input to output - iOutputVal = _mm_slli_epi16(iOutputVal, 8); // shift in left by 8 bits, each of the 8 16-bit integers, shift in with zeros - - qOutputVal = _mm_cvtepi8_epi16(qOutputVal); - qOutputVal = _mm_slli_epi16(qOutputVal, 8); - - _mm_store_si128((__m128i*)iBufferPtr, iOutputVal); // aligned store - _mm_store_si128((__m128i*)qBufferPtr, qOutputVal); - - iBufferPtr += 8; - qBufferPtr += 8; - } - - number = eighthPoints * 8; - for(; number < num_points; number++){ - *iBufferPtr++ = ((int16_t)*complexVectorPtr++) * 256; // load 8 bit Complexvector into 16 bit, shift left by 8 bits and store - *qBufferPtr++ = ((int16_t)*complexVectorPtr++) * 256; - } + unsigned int number = 0; + const int8_t* complexVectorPtr = (int8_t*)complexVector; + int16_t* iBufferPtr = iBuffer; + int16_t* qBufferPtr = qBuffer; + __m128i iMoveMask = _mm_set_epi8(0x80, + 0x80, + 0x80, + 0x80, + 0x80, + 0x80, + 0x80, + 0x80, + 14, + 12, + 10, + 8, + 6, + 4, + 2, + 0); // set 16 byte values + __m128i qMoveMask = _mm_set_epi8( + 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 15, 13, 11, 9, 7, 5, 3, 1); + __m128i complexVal, iOutputVal, qOutputVal; + + unsigned int eighthPoints = num_points / 8; + + for (number = 0; number < eighthPoints; number++) { + complexVal = _mm_load_si128((__m128i*)complexVectorPtr); + complexVectorPtr += 16; // aligned load + + iOutputVal = _mm_shuffle_epi8(complexVal, + iMoveMask); // shuffle 16 bytes of 128bit complexVal + qOutputVal = _mm_shuffle_epi8(complexVal, qMoveMask); + + iOutputVal = _mm_cvtepi8_epi16(iOutputVal); // fills 2-byte sign extended versions + // of lower 8 bytes of input to output + iOutputVal = + 
_mm_slli_epi16(iOutputVal, 8); // shift in left by 8 bits, each of the 8 + // 16-bit integers, shift in with zeros + + qOutputVal = _mm_cvtepi8_epi16(qOutputVal); + qOutputVal = _mm_slli_epi16(qOutputVal, 8); + + _mm_store_si128((__m128i*)iBufferPtr, iOutputVal); // aligned store + _mm_store_si128((__m128i*)qBufferPtr, qOutputVal); + + iBufferPtr += 8; + qBufferPtr += 8; + } + + number = eighthPoints * 8; + for (; number < num_points; number++) { + *iBufferPtr++ = + ((int16_t)*complexVectorPtr++) * + 256; // load 8 bit Complexvector into 16 bit, shift left by 8 bits and store + *qBufferPtr++ = ((int16_t)*complexVectorPtr++) * 256; + } } #endif /* LV_HAVE_SSE4_1 */ @@ -152,86 +211,111 @@ volk_8ic_deinterleave_16i_x2_a_sse4_1(int16_t* iBuffer, int16_t* qBuffer, #ifdef LV_HAVE_AVX #include -static inline void -volk_8ic_deinterleave_16i_x2_a_avx(int16_t* iBuffer, int16_t* qBuffer, - const lv_8sc_t* complexVector, unsigned int num_points) +static inline void volk_8ic_deinterleave_16i_x2_a_avx(int16_t* iBuffer, + int16_t* qBuffer, + const lv_8sc_t* complexVector, + unsigned int num_points) { - unsigned int number = 0; - const int8_t* complexVectorPtr = (int8_t*)complexVector; - int16_t* iBufferPtr = iBuffer; - int16_t* qBufferPtr = qBuffer; - __m128i iMoveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0); // set 16 byte values - __m128i qMoveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 15, 13, 11, 9, 7, 5, 3, 1); - __m256i complexVal, iOutputVal, qOutputVal; - __m128i complexVal1, complexVal0; - __m128i iOutputVal1, iOutputVal0, qOutputVal1, qOutputVal0; - - unsigned int sixteenthPoints = num_points / 16; - - for(number = 0; number < sixteenthPoints; number++){ - complexVal = _mm256_load_si256((__m256i*)complexVectorPtr); complexVectorPtr += 32; // aligned load - - // Extract from complexVal to iOutputVal and qOutputVal - complexVal1 = _mm256_extractf128_si256(complexVal, 1); - complexVal0 = _mm256_extractf128_si256(complexVal, 0); - - iOutputVal1 = _mm_shuffle_epi8(complexVal1, iMoveMask); // shuffle 16 bytes of 128bit complexVal - iOutputVal0 = _mm_shuffle_epi8(complexVal0, iMoveMask); - qOutputVal1 = _mm_shuffle_epi8(complexVal1, qMoveMask); - qOutputVal0 = _mm_shuffle_epi8(complexVal0, qMoveMask); - - iOutputVal1 = _mm_cvtepi8_epi16(iOutputVal1); // fills 2-byte sign extended versions of lower 8 bytes of input to output - iOutputVal1 = _mm_slli_epi16(iOutputVal1, 8); // shift in left by 8 bits, each of the 8 16-bit integers, shift in with zeros - iOutputVal0 = _mm_cvtepi8_epi16(iOutputVal0); - iOutputVal0 = _mm_slli_epi16(iOutputVal0, 8); - - qOutputVal1 = _mm_cvtepi8_epi16(qOutputVal1); - qOutputVal1 = _mm_slli_epi16(qOutputVal1, 8); - qOutputVal0 = _mm_cvtepi8_epi16(qOutputVal0); - qOutputVal0 = _mm_slli_epi16(qOutputVal0, 8); - - // Pack iOutputVal0,1 to iOutputVal - __m256i dummy = _mm256_setzero_si256(); - iOutputVal = _mm256_insertf128_si256(dummy, iOutputVal0, 0); - iOutputVal = _mm256_insertf128_si256(iOutputVal, iOutputVal1, 1); - qOutputVal = _mm256_insertf128_si256(dummy, qOutputVal0, 0); - qOutputVal = _mm256_insertf128_si256(qOutputVal, qOutputVal1, 1); - - _mm256_store_si256((__m256i*)iBufferPtr, iOutputVal); // aligned store - _mm256_store_si256((__m256i*)qBufferPtr, qOutputVal); - - iBufferPtr += 16; - qBufferPtr += 16; - } - - number = sixteenthPoints * 16; - for(; number < num_points; number++){ - *iBufferPtr++ = ((int16_t)*complexVectorPtr++) * 256; // load 8 bit Complexvector into 16 bit, shift left by 8 
bits and store - *qBufferPtr++ = ((int16_t)*complexVectorPtr++) * 256; - } + unsigned int number = 0; + const int8_t* complexVectorPtr = (int8_t*)complexVector; + int16_t* iBufferPtr = iBuffer; + int16_t* qBufferPtr = qBuffer; + __m128i iMoveMask = _mm_set_epi8(0x80, + 0x80, + 0x80, + 0x80, + 0x80, + 0x80, + 0x80, + 0x80, + 14, + 12, + 10, + 8, + 6, + 4, + 2, + 0); // set 16 byte values + __m128i qMoveMask = _mm_set_epi8( + 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 15, 13, 11, 9, 7, 5, 3, 1); + __m256i complexVal, iOutputVal, qOutputVal; + __m128i complexVal1, complexVal0; + __m128i iOutputVal1, iOutputVal0, qOutputVal1, qOutputVal0; + + unsigned int sixteenthPoints = num_points / 16; + + for (number = 0; number < sixteenthPoints; number++) { + complexVal = _mm256_load_si256((__m256i*)complexVectorPtr); + complexVectorPtr += 32; // aligned load + + // Extract from complexVal to iOutputVal and qOutputVal + complexVal1 = _mm256_extractf128_si256(complexVal, 1); + complexVal0 = _mm256_extractf128_si256(complexVal, 0); + + iOutputVal1 = _mm_shuffle_epi8( + complexVal1, iMoveMask); // shuffle 16 bytes of 128bit complexVal + iOutputVal0 = _mm_shuffle_epi8(complexVal0, iMoveMask); + qOutputVal1 = _mm_shuffle_epi8(complexVal1, qMoveMask); + qOutputVal0 = _mm_shuffle_epi8(complexVal0, qMoveMask); + + iOutputVal1 = + _mm_cvtepi8_epi16(iOutputVal1); // fills 2-byte sign extended versions of + // lower 8 bytes of input to output + iOutputVal1 = + _mm_slli_epi16(iOutputVal1, 8); // shift in left by 8 bits, each of the 8 + // 16-bit integers, shift in with zeros + iOutputVal0 = _mm_cvtepi8_epi16(iOutputVal0); + iOutputVal0 = _mm_slli_epi16(iOutputVal0, 8); + + qOutputVal1 = _mm_cvtepi8_epi16(qOutputVal1); + qOutputVal1 = _mm_slli_epi16(qOutputVal1, 8); + qOutputVal0 = _mm_cvtepi8_epi16(qOutputVal0); + qOutputVal0 = _mm_slli_epi16(qOutputVal0, 8); + + // Pack iOutputVal0,1 to iOutputVal + __m256i dummy = _mm256_setzero_si256(); + iOutputVal = _mm256_insertf128_si256(dummy, iOutputVal0, 0); + iOutputVal = _mm256_insertf128_si256(iOutputVal, iOutputVal1, 1); + qOutputVal = _mm256_insertf128_si256(dummy, qOutputVal0, 0); + qOutputVal = _mm256_insertf128_si256(qOutputVal, qOutputVal1, 1); + + _mm256_store_si256((__m256i*)iBufferPtr, iOutputVal); // aligned store + _mm256_store_si256((__m256i*)qBufferPtr, qOutputVal); + + iBufferPtr += 16; + qBufferPtr += 16; + } + + number = sixteenthPoints * 16; + for (; number < num_points; number++) { + *iBufferPtr++ = + ((int16_t)*complexVectorPtr++) * + 256; // load 8 bit Complexvector into 16 bit, shift left by 8 bits and store + *qBufferPtr++ = ((int16_t)*complexVectorPtr++) * 256; + } } #endif /* LV_HAVE_AVX */ #ifdef LV_HAVE_GENERIC -static inline void -volk_8ic_deinterleave_16i_x2_generic(int16_t* iBuffer, int16_t* qBuffer, - const lv_8sc_t* complexVector, unsigned int num_points) +static inline void volk_8ic_deinterleave_16i_x2_generic(int16_t* iBuffer, + int16_t* qBuffer, + const lv_8sc_t* complexVector, + unsigned int num_points) { - const int8_t* complexVectorPtr = (const int8_t*)complexVector; - int16_t* iBufferPtr = iBuffer; - int16_t* qBufferPtr = qBuffer; - unsigned int number; - for(number = 0; number < num_points; number++){ - *iBufferPtr++ = (int16_t)(*complexVectorPtr++)*256; - *qBufferPtr++ = (int16_t)(*complexVectorPtr++)*256; - } + const int8_t* complexVectorPtr = (const int8_t*)complexVector; + int16_t* iBufferPtr = iBuffer; + int16_t* qBufferPtr = qBuffer; + unsigned int number; + for (number = 0; number < num_points; number++) { + 
*iBufferPtr++ = (int16_t)(*complexVectorPtr++) * 256; + *qBufferPtr++ = (int16_t)(*complexVectorPtr++) * 256; + } } #endif /* LV_HAVE_GENERIC */ - #endif /* INCLUDED_volk_8ic_deinterleave_16i_x2_a_H */ #ifndef INCLUDED_volk_8ic_deinterleave_16i_x2_u_H @@ -243,47 +327,82 @@ volk_8ic_deinterleave_16i_x2_generic(int16_t* iBuffer, int16_t* qBuffer, #ifdef LV_HAVE_AVX2 #include -static inline void -volk_8ic_deinterleave_16i_x2_u_avx2(int16_t* iBuffer, int16_t* qBuffer, - const lv_8sc_t* complexVector, unsigned int num_points) +static inline void volk_8ic_deinterleave_16i_x2_u_avx2(int16_t* iBuffer, + int16_t* qBuffer, + const lv_8sc_t* complexVector, + unsigned int num_points) { - unsigned int number = 0; - const int8_t* complexVectorPtr = (int8_t*)complexVector; - int16_t* iBufferPtr = iBuffer; - int16_t* qBufferPtr = qBuffer; - __m256i MoveMask = _mm256_set_epi8(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0, 15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0); - __m256i complexVal, iOutputVal, qOutputVal; - __m128i iOutputVal0, qOutputVal0; - - unsigned int sixteenthPoints = num_points / 16; - - for(number = 0; number < sixteenthPoints; number++){ - complexVal = _mm256_loadu_si256((__m256i*)complexVectorPtr); complexVectorPtr += 32; - - complexVal = _mm256_shuffle_epi8(complexVal, MoveMask); - complexVal = _mm256_permute4x64_epi64(complexVal, 0xd8); - - iOutputVal0 = _mm256_extracti128_si256(complexVal, 0); - qOutputVal0 = _mm256_extracti128_si256(complexVal, 1); - - iOutputVal = _mm256_cvtepi8_epi16(iOutputVal0); - iOutputVal = _mm256_slli_epi16(iOutputVal, 8); - - qOutputVal = _mm256_cvtepi8_epi16(qOutputVal0); - qOutputVal = _mm256_slli_epi16(qOutputVal, 8); - - _mm256_storeu_si256((__m256i*)iBufferPtr, iOutputVal); - _mm256_storeu_si256((__m256i*)qBufferPtr, qOutputVal); - - iBufferPtr += 16; - qBufferPtr += 16; - } - - number = sixteenthPoints * 16; - for(; number < num_points; number++){ - *iBufferPtr++ = ((int16_t)*complexVectorPtr++) * 256; // load 8 bit Complexvector into 16 bit, shift left by 8 bits and store - *qBufferPtr++ = ((int16_t)*complexVectorPtr++) * 256; - } + unsigned int number = 0; + const int8_t* complexVectorPtr = (int8_t*)complexVector; + int16_t* iBufferPtr = iBuffer; + int16_t* qBufferPtr = qBuffer; + __m256i MoveMask = _mm256_set_epi8(15, + 13, + 11, + 9, + 7, + 5, + 3, + 1, + 14, + 12, + 10, + 8, + 6, + 4, + 2, + 0, + 15, + 13, + 11, + 9, + 7, + 5, + 3, + 1, + 14, + 12, + 10, + 8, + 6, + 4, + 2, + 0); + __m256i complexVal, iOutputVal, qOutputVal; + __m128i iOutputVal0, qOutputVal0; + + unsigned int sixteenthPoints = num_points / 16; + + for (number = 0; number < sixteenthPoints; number++) { + complexVal = _mm256_loadu_si256((__m256i*)complexVectorPtr); + complexVectorPtr += 32; + + complexVal = _mm256_shuffle_epi8(complexVal, MoveMask); + complexVal = _mm256_permute4x64_epi64(complexVal, 0xd8); + + iOutputVal0 = _mm256_extracti128_si256(complexVal, 0); + qOutputVal0 = _mm256_extracti128_si256(complexVal, 1); + + iOutputVal = _mm256_cvtepi8_epi16(iOutputVal0); + iOutputVal = _mm256_slli_epi16(iOutputVal, 8); + + qOutputVal = _mm256_cvtepi8_epi16(qOutputVal0); + qOutputVal = _mm256_slli_epi16(qOutputVal, 8); + + _mm256_storeu_si256((__m256i*)iBufferPtr, iOutputVal); + _mm256_storeu_si256((__m256i*)qBufferPtr, qOutputVal); + + iBufferPtr += 16; + qBufferPtr += 16; + } + + number = sixteenthPoints * 16; + for (; number < num_points; number++) { + *iBufferPtr++ = + ((int16_t)*complexVectorPtr++) * + 256; // load 8 bit Complexvector into 16 bit, shift left by 
8 bits and store + *qBufferPtr++ = ((int16_t)*complexVectorPtr++) * 256; + } } #endif /* LV_HAVE_AVX2 */ #endif /* INCLUDED_volk_8ic_deinterleave_16i_x2_u_H */ diff --git a/kernels/volk/volk_8ic_deinterleave_real_16i.h b/kernels/volk/volk_8ic_deinterleave_real_16i.h index f15879a..aaebb47 100644 --- a/kernels/volk/volk_8ic_deinterleave_real_16i.h +++ b/kernels/volk/volk_8ic_deinterleave_real_16i.h @@ -30,8 +30,8 @@ * * Dispatcher Prototype * \code - * void volk_8ic_deinterleave_real_16i(int16_t* iBuffer, const lv_8sc_t* complexVector, unsigned int num_points) - * \endcode + * void volk_8ic_deinterleave_real_16i(int16_t* iBuffer, const lv_8sc_t* complexVector, + * unsigned int num_points) \endcode * * \b Inputs * \li complexVector: The complex input vector. @@ -60,75 +60,109 @@ #ifdef LV_HAVE_AVX2 #include -static inline void -volk_8ic_deinterleave_real_16i_a_avx2(int16_t* iBuffer, const lv_8sc_t* complexVector, - unsigned int num_points) +static inline void volk_8ic_deinterleave_real_16i_a_avx2(int16_t* iBuffer, + const lv_8sc_t* complexVector, + unsigned int num_points) { - unsigned int number = 0; - const int8_t* complexVectorPtr = (int8_t*)complexVector; - int16_t* iBufferPtr = iBuffer; - __m256i moveMask = _mm256_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0); - __m256i complexVal, outputVal; - __m128i outputVal0; - - unsigned int sixteenthPoints = num_points / 16; - - for(number = 0; number < sixteenthPoints; number++){ - complexVal = _mm256_load_si256((__m256i*)complexVectorPtr); complexVectorPtr += 32; - - complexVal = _mm256_shuffle_epi8(complexVal, moveMask); - complexVal = _mm256_permute4x64_epi64(complexVal, 0xd8); - - outputVal0 = _mm256_extractf128_si256(complexVal, 0); - - outputVal = _mm256_cvtepi8_epi16(outputVal0); - outputVal = _mm256_slli_epi16(outputVal, 7); - - _mm256_store_si256((__m256i*)iBufferPtr, outputVal); - - iBufferPtr += 16; - } - - number = sixteenthPoints * 16; - for(; number < num_points; number++){ - *iBufferPtr++ = ((int16_t)*complexVectorPtr++) * 128; - complexVectorPtr++; - } + unsigned int number = 0; + const int8_t* complexVectorPtr = (int8_t*)complexVector; + int16_t* iBufferPtr = iBuffer; + __m256i moveMask = _mm256_set_epi8(0x80, + 0x80, + 0x80, + 0x80, + 0x80, + 0x80, + 0x80, + 0x80, + 14, + 12, + 10, + 8, + 6, + 4, + 2, + 0, + 0x80, + 0x80, + 0x80, + 0x80, + 0x80, + 0x80, + 0x80, + 0x80, + 14, + 12, + 10, + 8, + 6, + 4, + 2, + 0); + __m256i complexVal, outputVal; + __m128i outputVal0; + + unsigned int sixteenthPoints = num_points / 16; + + for (number = 0; number < sixteenthPoints; number++) { + complexVal = _mm256_load_si256((__m256i*)complexVectorPtr); + complexVectorPtr += 32; + + complexVal = _mm256_shuffle_epi8(complexVal, moveMask); + complexVal = _mm256_permute4x64_epi64(complexVal, 0xd8); + + outputVal0 = _mm256_extractf128_si256(complexVal, 0); + + outputVal = _mm256_cvtepi8_epi16(outputVal0); + outputVal = _mm256_slli_epi16(outputVal, 7); + + _mm256_store_si256((__m256i*)iBufferPtr, outputVal); + + iBufferPtr += 16; + } + + number = sixteenthPoints * 16; + for (; number < num_points; number++) { + *iBufferPtr++ = ((int16_t)*complexVectorPtr++) * 128; + complexVectorPtr++; + } } #endif /* LV_HAVE_AVX2 */ #ifdef LV_HAVE_SSE4_1 #include -static inline void -volk_8ic_deinterleave_real_16i_a_sse4_1(int16_t* iBuffer, const lv_8sc_t* complexVector, - unsigned int num_points) +static inline void 
volk_8ic_deinterleave_real_16i_a_sse4_1(int16_t* iBuffer, + const lv_8sc_t* complexVector, + unsigned int num_points) { - unsigned int number = 0; - const int8_t* complexVectorPtr = (int8_t*)complexVector; - int16_t* iBufferPtr = iBuffer; - __m128i moveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0); - __m128i complexVal, outputVal; + unsigned int number = 0; + const int8_t* complexVectorPtr = (int8_t*)complexVector; + int16_t* iBufferPtr = iBuffer; + __m128i moveMask = _mm_set_epi8( + 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0); + __m128i complexVal, outputVal; - unsigned int eighthPoints = num_points / 8; + unsigned int eighthPoints = num_points / 8; - for(number = 0; number < eighthPoints; number++){ - complexVal = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; + for (number = 0; number < eighthPoints; number++) { + complexVal = _mm_load_si128((__m128i*)complexVectorPtr); + complexVectorPtr += 16; - complexVal = _mm_shuffle_epi8(complexVal, moveMask); + complexVal = _mm_shuffle_epi8(complexVal, moveMask); - outputVal = _mm_cvtepi8_epi16(complexVal); - outputVal = _mm_slli_epi16(outputVal, 7); + outputVal = _mm_cvtepi8_epi16(complexVal); + outputVal = _mm_slli_epi16(outputVal, 7); - _mm_store_si128((__m128i*)iBufferPtr, outputVal); - iBufferPtr += 8; - } + _mm_store_si128((__m128i*)iBufferPtr, outputVal); + iBufferPtr += 8; + } - number = eighthPoints * 8; - for(; number < num_points; number++){ - *iBufferPtr++ = ((int16_t)*complexVectorPtr++) * 128; - complexVectorPtr++; - } + number = eighthPoints * 8; + for (; number < num_points; number++) { + *iBufferPtr++ = ((int16_t)*complexVectorPtr++) * 128; + complexVectorPtr++; + } } #endif /* LV_HAVE_SSE4_1 */ @@ -136,63 +170,65 @@ volk_8ic_deinterleave_real_16i_a_sse4_1(int16_t* iBuffer, const lv_8sc_t* comple #ifdef LV_HAVE_AVX #include -static inline void -volk_8ic_deinterleave_real_16i_a_avx(int16_t* iBuffer, const lv_8sc_t* complexVector, - unsigned int num_points) +static inline void volk_8ic_deinterleave_real_16i_a_avx(int16_t* iBuffer, + const lv_8sc_t* complexVector, + unsigned int num_points) { - unsigned int number = 0; - const int8_t* complexVectorPtr = (int8_t*)complexVector; - int16_t* iBufferPtr = iBuffer; - __m128i moveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0); - __m256i complexVal, outputVal; - __m128i complexVal1, complexVal0, outputVal1, outputVal0; - - unsigned int sixteenthPoints = num_points / 16; - - for(number = 0; number < sixteenthPoints; number++){ - complexVal = _mm256_load_si256((__m256i*)complexVectorPtr); complexVectorPtr += 32; - - complexVal1 = _mm256_extractf128_si256(complexVal, 1); - complexVal0 = _mm256_extractf128_si256(complexVal, 0); - - outputVal1 = _mm_shuffle_epi8(complexVal1, moveMask); - outputVal0 = _mm_shuffle_epi8(complexVal0, moveMask); - - outputVal1 = _mm_cvtepi8_epi16(outputVal1); - outputVal1 = _mm_slli_epi16(outputVal1, 7); - outputVal0 = _mm_cvtepi8_epi16(outputVal0); - outputVal0 = _mm_slli_epi16(outputVal0, 7); - - __m256i dummy = _mm256_setzero_si256(); - outputVal = _mm256_insertf128_si256(dummy, outputVal0, 0); - outputVal = _mm256_insertf128_si256(outputVal, outputVal1, 1); - _mm256_store_si256((__m256i*)iBufferPtr, outputVal); - - iBufferPtr += 16; - } - - number = sixteenthPoints * 16; - for(; number < num_points; number++){ - *iBufferPtr++ = ((int16_t)*complexVectorPtr++) * 128; - complexVectorPtr++; - } + unsigned int number = 0; 
+ const int8_t* complexVectorPtr = (int8_t*)complexVector; + int16_t* iBufferPtr = iBuffer; + __m128i moveMask = _mm_set_epi8( + 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0); + __m256i complexVal, outputVal; + __m128i complexVal1, complexVal0, outputVal1, outputVal0; + + unsigned int sixteenthPoints = num_points / 16; + + for (number = 0; number < sixteenthPoints; number++) { + complexVal = _mm256_load_si256((__m256i*)complexVectorPtr); + complexVectorPtr += 32; + + complexVal1 = _mm256_extractf128_si256(complexVal, 1); + complexVal0 = _mm256_extractf128_si256(complexVal, 0); + + outputVal1 = _mm_shuffle_epi8(complexVal1, moveMask); + outputVal0 = _mm_shuffle_epi8(complexVal0, moveMask); + + outputVal1 = _mm_cvtepi8_epi16(outputVal1); + outputVal1 = _mm_slli_epi16(outputVal1, 7); + outputVal0 = _mm_cvtepi8_epi16(outputVal0); + outputVal0 = _mm_slli_epi16(outputVal0, 7); + + __m256i dummy = _mm256_setzero_si256(); + outputVal = _mm256_insertf128_si256(dummy, outputVal0, 0); + outputVal = _mm256_insertf128_si256(outputVal, outputVal1, 1); + _mm256_store_si256((__m256i*)iBufferPtr, outputVal); + + iBufferPtr += 16; + } + + number = sixteenthPoints * 16; + for (; number < num_points; number++) { + *iBufferPtr++ = ((int16_t)*complexVectorPtr++) * 128; + complexVectorPtr++; + } } #endif /* LV_HAVE_AVX */ #ifdef LV_HAVE_GENERIC -static inline void -volk_8ic_deinterleave_real_16i_generic(int16_t* iBuffer, const lv_8sc_t* complexVector, - unsigned int num_points) +static inline void volk_8ic_deinterleave_real_16i_generic(int16_t* iBuffer, + const lv_8sc_t* complexVector, + unsigned int num_points) { - unsigned int number = 0; - const int8_t* complexVectorPtr = (const int8_t*)complexVector; - int16_t* iBufferPtr = iBuffer; - for(number = 0; number < num_points; number++){ - *iBufferPtr++ = ((int16_t)(*complexVectorPtr++)) * 128; - complexVectorPtr++; - } + unsigned int number = 0; + const int8_t* complexVectorPtr = (const int8_t*)complexVector; + int16_t* iBufferPtr = iBuffer; + for (number = 0; number < num_points; number++) { + *iBufferPtr++ = ((int16_t)(*complexVectorPtr++)) * 128; + complexVectorPtr++; + } } #endif /* LV_HAVE_GENERIC */ @@ -209,40 +245,72 @@ volk_8ic_deinterleave_real_16i_generic(int16_t* iBuffer, const lv_8sc_t* complex #ifdef LV_HAVE_AVX2 #include -static inline void -volk_8ic_deinterleave_real_16i_u_avx2(int16_t* iBuffer, const lv_8sc_t* complexVector, - unsigned int num_points) +static inline void volk_8ic_deinterleave_real_16i_u_avx2(int16_t* iBuffer, + const lv_8sc_t* complexVector, + unsigned int num_points) { - unsigned int number = 0; - const int8_t* complexVectorPtr = (int8_t*)complexVector; - int16_t* iBufferPtr = iBuffer; - __m256i moveMask = _mm256_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0); - __m256i complexVal, outputVal; - __m128i outputVal0; - - unsigned int sixteenthPoints = num_points / 16; - - for(number = 0; number < sixteenthPoints; number++){ - complexVal = _mm256_loadu_si256((__m256i*)complexVectorPtr); complexVectorPtr += 32; - - complexVal = _mm256_shuffle_epi8(complexVal, moveMask); - complexVal = _mm256_permute4x64_epi64(complexVal, 0xd8); - - outputVal0 = _mm256_extractf128_si256(complexVal, 0); - - outputVal = _mm256_cvtepi8_epi16(outputVal0); - outputVal = _mm256_slli_epi16(outputVal, 7); - - _mm256_storeu_si256((__m256i*)iBufferPtr, outputVal); - - iBufferPtr += 16; - } - - number = sixteenthPoints * 16; 
- for(; number < num_points; number++){ - *iBufferPtr++ = ((int16_t)*complexVectorPtr++) * 128; - complexVectorPtr++; - } + unsigned int number = 0; + const int8_t* complexVectorPtr = (int8_t*)complexVector; + int16_t* iBufferPtr = iBuffer; + __m256i moveMask = _mm256_set_epi8(0x80, + 0x80, + 0x80, + 0x80, + 0x80, + 0x80, + 0x80, + 0x80, + 14, + 12, + 10, + 8, + 6, + 4, + 2, + 0, + 0x80, + 0x80, + 0x80, + 0x80, + 0x80, + 0x80, + 0x80, + 0x80, + 14, + 12, + 10, + 8, + 6, + 4, + 2, + 0); + __m256i complexVal, outputVal; + __m128i outputVal0; + + unsigned int sixteenthPoints = num_points / 16; + + for (number = 0; number < sixteenthPoints; number++) { + complexVal = _mm256_loadu_si256((__m256i*)complexVectorPtr); + complexVectorPtr += 32; + + complexVal = _mm256_shuffle_epi8(complexVal, moveMask); + complexVal = _mm256_permute4x64_epi64(complexVal, 0xd8); + + outputVal0 = _mm256_extractf128_si256(complexVal, 0); + + outputVal = _mm256_cvtepi8_epi16(outputVal0); + outputVal = _mm256_slli_epi16(outputVal, 7); + + _mm256_storeu_si256((__m256i*)iBufferPtr, outputVal); + + iBufferPtr += 16; + } + + number = sixteenthPoints * 16; + for (; number < num_points; number++) { + *iBufferPtr++ = ((int16_t)*complexVectorPtr++) * 128; + complexVectorPtr++; + } } #endif /* LV_HAVE_AVX2 */ #endif /* INCLUDED_volk_8ic_deinterleave_real_16i_u_H */ diff --git a/kernels/volk/volk_8ic_deinterleave_real_8i.h b/kernels/volk/volk_8ic_deinterleave_real_8i.h index 6cc3f15..a1a835d 100644 --- a/kernels/volk/volk_8ic_deinterleave_real_8i.h +++ b/kernels/volk/volk_8ic_deinterleave_real_8i.h @@ -30,8 +30,8 @@ * * Dispatcher Prototype * \code - * void volk_8ic_deinterleave_real_8i(int8_t* iBuffer, const lv_8sc_t* complexVector, unsigned int num_points) - * \endcode + * void volk_8ic_deinterleave_real_8i(int8_t* iBuffer, const lv_8sc_t* complexVector, + * unsigned int num_points) \endcode * * \b Inputs * \li complexVector: The complex input vector. 
@@ -59,40 +59,102 @@ #ifdef LV_HAVE_AVX2 #include -static inline void -volk_8ic_deinterleave_real_8i_a_avx2(int8_t* iBuffer, const lv_8sc_t* complexVector, - unsigned int num_points) +static inline void volk_8ic_deinterleave_real_8i_a_avx2(int8_t* iBuffer, + const lv_8sc_t* complexVector, + unsigned int num_points) { - unsigned int number = 0; - const int8_t* complexVectorPtr = (int8_t*)complexVector; - int8_t* iBufferPtr = iBuffer; - __m256i moveMask1 = _mm256_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0); - __m256i moveMask2 = _mm256_set_epi8(14, 12, 10, 8, 6, 4, 2, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80); - __m256i complexVal1, complexVal2, outputVal; - - unsigned int thirtysecondPoints = num_points / 32; - - for(number = 0; number < thirtysecondPoints; number++){ - - complexVal1 = _mm256_load_si256((__m256i*)complexVectorPtr); - complexVectorPtr += 32; - complexVal2 = _mm256_load_si256((__m256i*)complexVectorPtr); - complexVectorPtr += 32; - - complexVal1 = _mm256_shuffle_epi8(complexVal1, moveMask1); - complexVal2 = _mm256_shuffle_epi8(complexVal2, moveMask2); - outputVal = _mm256_or_si256(complexVal1, complexVal2); - outputVal = _mm256_permute4x64_epi64(outputVal, 0xd8); - - _mm256_store_si256((__m256i*)iBufferPtr, outputVal); - iBufferPtr += 32; - } - - number = thirtysecondPoints * 32; - for(; number < num_points; number++){ - *iBufferPtr++ = *complexVectorPtr++; - complexVectorPtr++; - } + unsigned int number = 0; + const int8_t* complexVectorPtr = (int8_t*)complexVector; + int8_t* iBufferPtr = iBuffer; + __m256i moveMask1 = _mm256_set_epi8(0x80, + 0x80, + 0x80, + 0x80, + 0x80, + 0x80, + 0x80, + 0x80, + 14, + 12, + 10, + 8, + 6, + 4, + 2, + 0, + 0x80, + 0x80, + 0x80, + 0x80, + 0x80, + 0x80, + 0x80, + 0x80, + 14, + 12, + 10, + 8, + 6, + 4, + 2, + 0); + __m256i moveMask2 = _mm256_set_epi8(14, + 12, + 10, + 8, + 6, + 4, + 2, + 0, + 0x80, + 0x80, + 0x80, + 0x80, + 0x80, + 0x80, + 0x80, + 0x80, + 14, + 12, + 10, + 8, + 6, + 4, + 2, + 0, + 0x80, + 0x80, + 0x80, + 0x80, + 0x80, + 0x80, + 0x80, + 0x80); + __m256i complexVal1, complexVal2, outputVal; + + unsigned int thirtysecondPoints = num_points / 32; + + for (number = 0; number < thirtysecondPoints; number++) { + + complexVal1 = _mm256_load_si256((__m256i*)complexVectorPtr); + complexVectorPtr += 32; + complexVal2 = _mm256_load_si256((__m256i*)complexVectorPtr); + complexVectorPtr += 32; + + complexVal1 = _mm256_shuffle_epi8(complexVal1, moveMask1); + complexVal2 = _mm256_shuffle_epi8(complexVal2, moveMask2); + outputVal = _mm256_or_si256(complexVal1, complexVal2); + outputVal = _mm256_permute4x64_epi64(outputVal, 0xd8); + + _mm256_store_si256((__m256i*)iBufferPtr, outputVal); + iBufferPtr += 32; + } + + number = thirtysecondPoints * 32; + for (; number < num_points; number++) { + *iBufferPtr++ = *complexVectorPtr++; + complexVectorPtr++; + } } #endif /* LV_HAVE_AVX2 */ @@ -100,37 +162,41 @@ volk_8ic_deinterleave_real_8i_a_avx2(int8_t* iBuffer, const lv_8sc_t* complexVec #ifdef LV_HAVE_SSSE3 #include -static inline void -volk_8ic_deinterleave_real_8i_a_ssse3(int8_t* iBuffer, const lv_8sc_t* complexVector, - unsigned int num_points) +static inline void volk_8ic_deinterleave_real_8i_a_ssse3(int8_t* iBuffer, + const lv_8sc_t* complexVector, + unsigned int num_points) { - unsigned int number = 0; - const int8_t* complexVectorPtr = 
(int8_t*)complexVector; - int8_t* iBufferPtr = iBuffer; - __m128i moveMask1 = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0); - __m128i moveMask2 = _mm_set_epi8(14, 12, 10, 8, 6, 4, 2, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80); - __m128i complexVal1, complexVal2, outputVal; - - unsigned int sixteenthPoints = num_points / 16; - - for(number = 0; number < sixteenthPoints; number++){ - complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; - complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; - - complexVal1 = _mm_shuffle_epi8(complexVal1, moveMask1); - complexVal2 = _mm_shuffle_epi8(complexVal2, moveMask2); - - outputVal = _mm_or_si128(complexVal1, complexVal2); - - _mm_store_si128((__m128i*)iBufferPtr, outputVal); - iBufferPtr += 16; - } - - number = sixteenthPoints * 16; - for(; number < num_points; number++){ - *iBufferPtr++ = *complexVectorPtr++; - complexVectorPtr++; - } + unsigned int number = 0; + const int8_t* complexVectorPtr = (int8_t*)complexVector; + int8_t* iBufferPtr = iBuffer; + __m128i moveMask1 = _mm_set_epi8( + 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0); + __m128i moveMask2 = _mm_set_epi8( + 14, 12, 10, 8, 6, 4, 2, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80); + __m128i complexVal1, complexVal2, outputVal; + + unsigned int sixteenthPoints = num_points / 16; + + for (number = 0; number < sixteenthPoints; number++) { + complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr); + complexVectorPtr += 16; + complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr); + complexVectorPtr += 16; + + complexVal1 = _mm_shuffle_epi8(complexVal1, moveMask1); + complexVal2 = _mm_shuffle_epi8(complexVal2, moveMask2); + + outputVal = _mm_or_si128(complexVal1, complexVal2); + + _mm_store_si128((__m128i*)iBufferPtr, outputVal); + iBufferPtr += 16; + } + + number = sixteenthPoints * 16; + for (; number < num_points; number++) { + *iBufferPtr++ = *complexVectorPtr++; + complexVectorPtr++; + } } #endif /* LV_HAVE_SSSE3 */ @@ -138,72 +204,75 @@ volk_8ic_deinterleave_real_8i_a_ssse3(int8_t* iBuffer, const lv_8sc_t* complexVe #ifdef LV_HAVE_AVX #include -static inline void -volk_8ic_deinterleave_real_8i_a_avx(int8_t* iBuffer, const lv_8sc_t* complexVector, - unsigned int num_points) +static inline void volk_8ic_deinterleave_real_8i_a_avx(int8_t* iBuffer, + const lv_8sc_t* complexVector, + unsigned int num_points) { - unsigned int number = 0; - const int8_t* complexVectorPtr = (int8_t*)complexVector; - int8_t* iBufferPtr = iBuffer; - __m128i moveMaskL = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0); - __m128i moveMaskH = _mm_set_epi8(14, 12, 10, 8, 6, 4, 2, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80); - __m256i complexVal1, complexVal2, outputVal; - __m128i complexVal1H, complexVal1L, complexVal2H, complexVal2L, outputVal1, outputVal2; - - unsigned int thirtysecondPoints = num_points / 32; - - for(number = 0; number < thirtysecondPoints; number++){ - - complexVal1 = _mm256_load_si256((__m256i*)complexVectorPtr); - complexVectorPtr += 32; - complexVal2 = _mm256_load_si256((__m256i*)complexVectorPtr); - complexVectorPtr += 32; - - complexVal1H = _mm256_extractf128_si256(complexVal1, 1); - complexVal1L = _mm256_extractf128_si256(complexVal1, 0); - complexVal2H = _mm256_extractf128_si256(complexVal2, 1); - complexVal2L = _mm256_extractf128_si256(complexVal2, 0); - - complexVal1H = _mm_shuffle_epi8(complexVal1H, 
moveMaskH); - complexVal1L = _mm_shuffle_epi8(complexVal1L, moveMaskL); - outputVal1 = _mm_or_si128(complexVal1H, complexVal1L); - - - complexVal2H = _mm_shuffle_epi8(complexVal2H, moveMaskH); - complexVal2L = _mm_shuffle_epi8(complexVal2L, moveMaskL); - outputVal2 = _mm_or_si128(complexVal2H, complexVal2L); - - __m256i dummy = _mm256_setzero_si256(); - outputVal = _mm256_insertf128_si256(dummy, outputVal1, 0); - outputVal = _mm256_insertf128_si256(outputVal, outputVal2, 1); - - - _mm256_store_si256((__m256i*)iBufferPtr, outputVal); - iBufferPtr += 32; - } - - number = thirtysecondPoints * 32; - for(; number < num_points; number++){ - *iBufferPtr++ = *complexVectorPtr++; - complexVectorPtr++; - } + unsigned int number = 0; + const int8_t* complexVectorPtr = (int8_t*)complexVector; + int8_t* iBufferPtr = iBuffer; + __m128i moveMaskL = _mm_set_epi8( + 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0); + __m128i moveMaskH = _mm_set_epi8( + 14, 12, 10, 8, 6, 4, 2, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80); + __m256i complexVal1, complexVal2, outputVal; + __m128i complexVal1H, complexVal1L, complexVal2H, complexVal2L, outputVal1, + outputVal2; + + unsigned int thirtysecondPoints = num_points / 32; + + for (number = 0; number < thirtysecondPoints; number++) { + + complexVal1 = _mm256_load_si256((__m256i*)complexVectorPtr); + complexVectorPtr += 32; + complexVal2 = _mm256_load_si256((__m256i*)complexVectorPtr); + complexVectorPtr += 32; + + complexVal1H = _mm256_extractf128_si256(complexVal1, 1); + complexVal1L = _mm256_extractf128_si256(complexVal1, 0); + complexVal2H = _mm256_extractf128_si256(complexVal2, 1); + complexVal2L = _mm256_extractf128_si256(complexVal2, 0); + + complexVal1H = _mm_shuffle_epi8(complexVal1H, moveMaskH); + complexVal1L = _mm_shuffle_epi8(complexVal1L, moveMaskL); + outputVal1 = _mm_or_si128(complexVal1H, complexVal1L); + + + complexVal2H = _mm_shuffle_epi8(complexVal2H, moveMaskH); + complexVal2L = _mm_shuffle_epi8(complexVal2L, moveMaskL); + outputVal2 = _mm_or_si128(complexVal2H, complexVal2L); + + __m256i dummy = _mm256_setzero_si256(); + outputVal = _mm256_insertf128_si256(dummy, outputVal1, 0); + outputVal = _mm256_insertf128_si256(outputVal, outputVal2, 1); + + + _mm256_store_si256((__m256i*)iBufferPtr, outputVal); + iBufferPtr += 32; + } + + number = thirtysecondPoints * 32; + for (; number < num_points; number++) { + *iBufferPtr++ = *complexVectorPtr++; + complexVectorPtr++; + } } #endif /* LV_HAVE_AVX */ #ifdef LV_HAVE_GENERIC -static inline void -volk_8ic_deinterleave_real_8i_generic(int8_t* iBuffer, const lv_8sc_t* complexVector, - unsigned int num_points) +static inline void volk_8ic_deinterleave_real_8i_generic(int8_t* iBuffer, + const lv_8sc_t* complexVector, + unsigned int num_points) { - unsigned int number = 0; - const int8_t* complexVectorPtr = (int8_t*)complexVector; - int8_t* iBufferPtr = iBuffer; - for(number = 0; number < num_points; number++){ - *iBufferPtr++ = *complexVectorPtr++; - complexVectorPtr++; - } + unsigned int number = 0; + const int8_t* complexVectorPtr = (int8_t*)complexVector; + int8_t* iBufferPtr = iBuffer; + for (number = 0; number < num_points; number++) { + *iBufferPtr++ = *complexVectorPtr++; + complexVectorPtr++; + } } #endif /* LV_HAVE_GENERIC */ @@ -211,26 +280,27 @@ volk_8ic_deinterleave_real_8i_generic(int8_t* iBuffer, const lv_8sc_t* complexVe #ifdef LV_HAVE_NEON #include -static inline void -volk_8ic_deinterleave_real_8i_neon(int8_t* iBuffer, const lv_8sc_t* complexVector, unsigned int 
num_points) +static inline void volk_8ic_deinterleave_real_8i_neon(int8_t* iBuffer, + const lv_8sc_t* complexVector, + unsigned int num_points) { - unsigned int number; - unsigned int sixteenth_points = num_points / 16; - - int8x16x2_t input_vector; - for(number=0; number < sixteenth_points; ++number) { - input_vector = vld2q_s8((int8_t*) complexVector ); - vst1q_s8(iBuffer, input_vector.val[0]); - iBuffer += 16; - complexVector += 16; - } - - const int8_t* complexVectorPtr = (int8_t*)complexVector; - int8_t* iBufferPtr = iBuffer; - for(number = sixteenth_points*16; number < num_points; number++){ - *iBufferPtr++ = *complexVectorPtr++; - complexVectorPtr++; - } + unsigned int number; + unsigned int sixteenth_points = num_points / 16; + + int8x16x2_t input_vector; + for (number = 0; number < sixteenth_points; ++number) { + input_vector = vld2q_s8((int8_t*)complexVector); + vst1q_s8(iBuffer, input_vector.val[0]); + iBuffer += 16; + complexVector += 16; + } + + const int8_t* complexVectorPtr = (int8_t*)complexVector; + int8_t* iBufferPtr = iBuffer; + for (number = sixteenth_points * 16; number < num_points; number++) { + *iBufferPtr++ = *complexVectorPtr++; + complexVectorPtr++; + } } #endif /* LV_HAVE_NEON */ @@ -246,40 +316,102 @@ volk_8ic_deinterleave_real_8i_neon(int8_t* iBuffer, const lv_8sc_t* complexVecto #ifdef LV_HAVE_AVX2 #include -static inline void -volk_8ic_deinterleave_real_8i_u_avx2(int8_t* iBuffer, const lv_8sc_t* complexVector, - unsigned int num_points) +static inline void volk_8ic_deinterleave_real_8i_u_avx2(int8_t* iBuffer, + const lv_8sc_t* complexVector, + unsigned int num_points) { - unsigned int number = 0; - const int8_t* complexVectorPtr = (int8_t*)complexVector; - int8_t* iBufferPtr = iBuffer; - __m256i moveMask1 = _mm256_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0); - __m256i moveMask2 = _mm256_set_epi8(14, 12, 10, 8, 6, 4, 2, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80); - __m256i complexVal1, complexVal2, outputVal; - - unsigned int thirtysecondPoints = num_points / 32; - - for(number = 0; number < thirtysecondPoints; number++){ - - complexVal1 = _mm256_loadu_si256((__m256i*)complexVectorPtr); - complexVectorPtr += 32; - complexVal2 = _mm256_loadu_si256((__m256i*)complexVectorPtr); - complexVectorPtr += 32; - - complexVal1 = _mm256_shuffle_epi8(complexVal1, moveMask1); - complexVal2 = _mm256_shuffle_epi8(complexVal2, moveMask2); - outputVal = _mm256_or_si256(complexVal1, complexVal2); - outputVal = _mm256_permute4x64_epi64(outputVal, 0xd8); - - _mm256_storeu_si256((__m256i*)iBufferPtr, outputVal); - iBufferPtr += 32; - } - - number = thirtysecondPoints * 32; - for(; number < num_points; number++){ - *iBufferPtr++ = *complexVectorPtr++; - complexVectorPtr++; - } + unsigned int number = 0; + const int8_t* complexVectorPtr = (int8_t*)complexVector; + int8_t* iBufferPtr = iBuffer; + __m256i moveMask1 = _mm256_set_epi8(0x80, + 0x80, + 0x80, + 0x80, + 0x80, + 0x80, + 0x80, + 0x80, + 14, + 12, + 10, + 8, + 6, + 4, + 2, + 0, + 0x80, + 0x80, + 0x80, + 0x80, + 0x80, + 0x80, + 0x80, + 0x80, + 14, + 12, + 10, + 8, + 6, + 4, + 2, + 0); + __m256i moveMask2 = _mm256_set_epi8(14, + 12, + 10, + 8, + 6, + 4, + 2, + 0, + 0x80, + 0x80, + 0x80, + 0x80, + 0x80, + 0x80, + 0x80, + 0x80, + 14, + 12, + 10, + 8, + 6, + 4, + 2, + 0, + 0x80, + 0x80, + 0x80, + 0x80, + 0x80, + 0x80, + 0x80, + 0x80); + 
__m256i complexVal1, complexVal2, outputVal; + + unsigned int thirtysecondPoints = num_points / 32; + + for (number = 0; number < thirtysecondPoints; number++) { + + complexVal1 = _mm256_loadu_si256((__m256i*)complexVectorPtr); + complexVectorPtr += 32; + complexVal2 = _mm256_loadu_si256((__m256i*)complexVectorPtr); + complexVectorPtr += 32; + + complexVal1 = _mm256_shuffle_epi8(complexVal1, moveMask1); + complexVal2 = _mm256_shuffle_epi8(complexVal2, moveMask2); + outputVal = _mm256_or_si256(complexVal1, complexVal2); + outputVal = _mm256_permute4x64_epi64(outputVal, 0xd8); + + _mm256_storeu_si256((__m256i*)iBufferPtr, outputVal); + iBufferPtr += 32; + } + + number = thirtysecondPoints * 32; + for (; number < num_points; number++) { + *iBufferPtr++ = *complexVectorPtr++; + complexVectorPtr++; + } } #endif /* LV_HAVE_AVX2 */ diff --git a/kernels/volk/volk_8ic_s32f_deinterleave_32f_x2.h b/kernels/volk/volk_8ic_s32f_deinterleave_32f_x2.h index 736f7c0..f622752 100644 --- a/kernels/volk/volk_8ic_s32f_deinterleave_32f_x2.h +++ b/kernels/volk/volk_8ic_s32f_deinterleave_32f_x2.h @@ -31,8 +31,8 @@ * * Dispatcher Prototype * \code - * void volk_8ic_s32f_deinterleave_32f_x2(float* iBuffer, float* qBuffer, const lv_8sc_t* complexVector, const float scalar, unsigned int num_points) - * \endcode + * void volk_8ic_s32f_deinterleave_32f_x2(float* iBuffer, float* qBuffer, const lv_8sc_t* + * complexVector, const float scalar, unsigned int num_points) \endcode * * \b Inputs * \li complexVector: The complex input vector. @@ -56,74 +56,79 @@ #ifndef INCLUDED_volk_8ic_s32f_deinterleave_32f_x2_a_H #define INCLUDED_volk_8ic_s32f_deinterleave_32f_x2_a_H -#include #include #include +#include #ifdef LV_HAVE_SSE4_1 #include static inline void -volk_8ic_s32f_deinterleave_32f_x2_a_sse4_1(float* iBuffer, float* qBuffer, const lv_8sc_t* complexVector, - const float scalar, unsigned int num_points) +volk_8ic_s32f_deinterleave_32f_x2_a_sse4_1(float* iBuffer, + float* qBuffer, + const lv_8sc_t* complexVector, + const float scalar, + unsigned int num_points) { - float* iBufferPtr = iBuffer; - float* qBufferPtr = qBuffer; - - unsigned int number = 0; - const unsigned int eighthPoints = num_points / 8; - __m128 iFloatValue, qFloatValue; - - const float iScalar= 1.0 / scalar; - __m128 invScalar = _mm_set_ps1(iScalar); - __m128i complexVal, iIntVal, qIntVal, iComplexVal, qComplexVal; - int8_t* complexVectorPtr = (int8_t*)complexVector; - - __m128i iMoveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0); - __m128i qMoveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 15, 13, 11, 9, 7, 5, 3, 1); - - for(;number < eighthPoints; number++){ - complexVal = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; - iComplexVal = _mm_shuffle_epi8(complexVal, iMoveMask); - qComplexVal = _mm_shuffle_epi8(complexVal, qMoveMask); - - iIntVal = _mm_cvtepi8_epi32(iComplexVal); - iFloatValue = _mm_cvtepi32_ps(iIntVal); - iFloatValue = _mm_mul_ps(iFloatValue, invScalar); - _mm_store_ps(iBufferPtr, iFloatValue); - iBufferPtr += 4; - - iComplexVal = _mm_srli_si128(iComplexVal, 4); - - iIntVal = _mm_cvtepi8_epi32(iComplexVal); - iFloatValue = _mm_cvtepi32_ps(iIntVal); - iFloatValue = _mm_mul_ps(iFloatValue, invScalar); - _mm_store_ps(iBufferPtr, iFloatValue); - iBufferPtr += 4; - - qIntVal = _mm_cvtepi8_epi32(qComplexVal); - qFloatValue = _mm_cvtepi32_ps(qIntVal); - qFloatValue = _mm_mul_ps(qFloatValue, invScalar); - _mm_store_ps(qBufferPtr, qFloatValue); - qBufferPtr += 
4; - - qComplexVal = _mm_srli_si128(qComplexVal, 4); - - qIntVal = _mm_cvtepi8_epi32(qComplexVal); - qFloatValue = _mm_cvtepi32_ps(qIntVal); - qFloatValue = _mm_mul_ps(qFloatValue, invScalar); - _mm_store_ps(qBufferPtr, qFloatValue); - - qBufferPtr += 4; - } - - number = eighthPoints * 8; - for(; number < num_points; number++){ - *iBufferPtr++ = (float)(*complexVectorPtr++) * iScalar; - *qBufferPtr++ = (float)(*complexVectorPtr++) * iScalar; - } - + float* iBufferPtr = iBuffer; + float* qBufferPtr = qBuffer; + + unsigned int number = 0; + const unsigned int eighthPoints = num_points / 8; + __m128 iFloatValue, qFloatValue; + + const float iScalar = 1.0 / scalar; + __m128 invScalar = _mm_set_ps1(iScalar); + __m128i complexVal, iIntVal, qIntVal, iComplexVal, qComplexVal; + int8_t* complexVectorPtr = (int8_t*)complexVector; + + __m128i iMoveMask = _mm_set_epi8( + 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0); + __m128i qMoveMask = _mm_set_epi8( + 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 15, 13, 11, 9, 7, 5, 3, 1); + + for (; number < eighthPoints; number++) { + complexVal = _mm_load_si128((__m128i*)complexVectorPtr); + complexVectorPtr += 16; + iComplexVal = _mm_shuffle_epi8(complexVal, iMoveMask); + qComplexVal = _mm_shuffle_epi8(complexVal, qMoveMask); + + iIntVal = _mm_cvtepi8_epi32(iComplexVal); + iFloatValue = _mm_cvtepi32_ps(iIntVal); + iFloatValue = _mm_mul_ps(iFloatValue, invScalar); + _mm_store_ps(iBufferPtr, iFloatValue); + iBufferPtr += 4; + + iComplexVal = _mm_srli_si128(iComplexVal, 4); + + iIntVal = _mm_cvtepi8_epi32(iComplexVal); + iFloatValue = _mm_cvtepi32_ps(iIntVal); + iFloatValue = _mm_mul_ps(iFloatValue, invScalar); + _mm_store_ps(iBufferPtr, iFloatValue); + iBufferPtr += 4; + + qIntVal = _mm_cvtepi8_epi32(qComplexVal); + qFloatValue = _mm_cvtepi32_ps(qIntVal); + qFloatValue = _mm_mul_ps(qFloatValue, invScalar); + _mm_store_ps(qBufferPtr, qFloatValue); + qBufferPtr += 4; + + qComplexVal = _mm_srli_si128(qComplexVal, 4); + + qIntVal = _mm_cvtepi8_epi32(qComplexVal); + qFloatValue = _mm_cvtepi32_ps(qIntVal); + qFloatValue = _mm_mul_ps(qFloatValue, invScalar); + _mm_store_ps(qBufferPtr, qFloatValue); + + qBufferPtr += 4; + } + + number = eighthPoints * 8; + for (; number < num_points; number++) { + *iBufferPtr++ = (float)(*complexVectorPtr++) * iScalar; + *qBufferPtr++ = (float)(*complexVectorPtr++) * iScalar; + } } #endif /* LV_HAVE_SSE4_1 */ @@ -131,59 +136,60 @@ volk_8ic_s32f_deinterleave_32f_x2_a_sse4_1(float* iBuffer, float* qBuffer, const #ifdef LV_HAVE_SSE #include -static inline void -volk_8ic_s32f_deinterleave_32f_x2_a_sse(float* iBuffer, float* qBuffer, - const lv_8sc_t* complexVector, - const float scalar, unsigned int num_points) +static inline void volk_8ic_s32f_deinterleave_32f_x2_a_sse(float* iBuffer, + float* qBuffer, + const lv_8sc_t* complexVector, + const float scalar, + unsigned int num_points) { - float* iBufferPtr = iBuffer; - float* qBufferPtr = qBuffer; + float* iBufferPtr = iBuffer; + float* qBufferPtr = qBuffer; - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - __m128 cplxValue1, cplxValue2, iValue, qValue; + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + __m128 cplxValue1, cplxValue2, iValue, qValue; - __m128 invScalar = _mm_set_ps1(1.0/scalar); - int8_t* complexVectorPtr = (int8_t*)complexVector; + __m128 invScalar = _mm_set_ps1(1.0 / scalar); + int8_t* complexVectorPtr = (int8_t*)complexVector; - __VOLK_ATTR_ALIGNED(16) float floatBuffer[8]; + 
__VOLK_ATTR_ALIGNED(16) float floatBuffer[8]; - for(;number < quarterPoints; number++){ - floatBuffer[0] = (float)(complexVectorPtr[0]); - floatBuffer[1] = (float)(complexVectorPtr[1]); - floatBuffer[2] = (float)(complexVectorPtr[2]); - floatBuffer[3] = (float)(complexVectorPtr[3]); + for (; number < quarterPoints; number++) { + floatBuffer[0] = (float)(complexVectorPtr[0]); + floatBuffer[1] = (float)(complexVectorPtr[1]); + floatBuffer[2] = (float)(complexVectorPtr[2]); + floatBuffer[3] = (float)(complexVectorPtr[3]); - floatBuffer[4] = (float)(complexVectorPtr[4]); - floatBuffer[5] = (float)(complexVectorPtr[5]); - floatBuffer[6] = (float)(complexVectorPtr[6]); - floatBuffer[7] = (float)(complexVectorPtr[7]); + floatBuffer[4] = (float)(complexVectorPtr[4]); + floatBuffer[5] = (float)(complexVectorPtr[5]); + floatBuffer[6] = (float)(complexVectorPtr[6]); + floatBuffer[7] = (float)(complexVectorPtr[7]); - cplxValue1 = _mm_load_ps(&floatBuffer[0]); - cplxValue2 = _mm_load_ps(&floatBuffer[4]); + cplxValue1 = _mm_load_ps(&floatBuffer[0]); + cplxValue2 = _mm_load_ps(&floatBuffer[4]); - complexVectorPtr += 8; + complexVectorPtr += 8; - cplxValue1 = _mm_mul_ps(cplxValue1, invScalar); - cplxValue2 = _mm_mul_ps(cplxValue2, invScalar); + cplxValue1 = _mm_mul_ps(cplxValue1, invScalar); + cplxValue2 = _mm_mul_ps(cplxValue2, invScalar); - // Arrange in i1i2i3i4 format - iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); - qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1)); + // Arrange in i1i2i3i4 format + iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2, 0, 2, 0)); + qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3, 1, 3, 1)); - _mm_store_ps(iBufferPtr, iValue); - _mm_store_ps(qBufferPtr, qValue); + _mm_store_ps(iBufferPtr, iValue); + _mm_store_ps(qBufferPtr, qValue); - iBufferPtr += 4; - qBufferPtr += 4; - } + iBufferPtr += 4; + qBufferPtr += 4; + } - number = quarterPoints * 4; - complexVectorPtr = (int8_t*)&complexVector[number]; - for(; number < num_points; number++){ - *iBufferPtr++ = (float)(*complexVectorPtr++) / scalar; - *qBufferPtr++ = (float)(*complexVectorPtr++) / scalar; - } + number = quarterPoints * 4; + complexVectorPtr = (int8_t*)&complexVector[number]; + for (; number < num_points; number++) { + *iBufferPtr++ = (float)(*complexVectorPtr++) / scalar; + *qBufferPtr++ = (float)(*complexVectorPtr++) / scalar; + } } #endif /* LV_HAVE_SSE */ @@ -191,70 +197,127 @@ volk_8ic_s32f_deinterleave_32f_x2_a_sse(float* iBuffer, float* qBuffer, #ifdef LV_HAVE_AVX2 #include -static inline void -volk_8ic_s32f_deinterleave_32f_x2_a_avx2(float* iBuffer, float* qBuffer, const lv_8sc_t* complexVector, - const float scalar, unsigned int num_points) +static inline void volk_8ic_s32f_deinterleave_32f_x2_a_avx2(float* iBuffer, + float* qBuffer, + const lv_8sc_t* complexVector, + const float scalar, + unsigned int num_points) { - float* iBufferPtr = iBuffer; - float* qBufferPtr = qBuffer; - - unsigned int number = 0; - const unsigned int sixteenthPoints = num_points / 16; - __m256 iFloatValue, qFloatValue; - - const float iScalar= 1.0 / scalar; - __m256 invScalar = _mm256_set1_ps(iScalar); - __m256i complexVal, iIntVal, qIntVal, iComplexVal, qComplexVal; - int8_t* complexVectorPtr = (int8_t*)complexVector; - - __m256i iMoveMask = _mm256_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 14, 12, 10, 8, 6, 4, 2, 0, - 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 14, 12, 10, 8, 6, 4, 2, 0); - __m256i qMoveMask = _mm256_set_epi8(0x80, 
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 15, 13, 11, 9, 7, 5, 3, 1, - 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 15, 13, 11, 9, 7, 5, 3, 1); - - for(;number < sixteenthPoints; number++){ - complexVal = _mm256_load_si256((__m256i*)complexVectorPtr); - complexVectorPtr += 32; - iComplexVal = _mm256_shuffle_epi8(complexVal, iMoveMask); - qComplexVal = _mm256_shuffle_epi8(complexVal, qMoveMask); - - iIntVal = _mm256_cvtepi8_epi32(_mm256_castsi256_si128(iComplexVal)); - iFloatValue = _mm256_cvtepi32_ps(iIntVal); - iFloatValue = _mm256_mul_ps(iFloatValue, invScalar); - _mm256_store_ps(iBufferPtr, iFloatValue); - iBufferPtr += 8; - - iComplexVal = _mm256_permute4x64_epi64(iComplexVal, 0b11000110); - iIntVal = _mm256_cvtepi8_epi32(_mm256_castsi256_si128(iComplexVal)); - iFloatValue = _mm256_cvtepi32_ps(iIntVal); - iFloatValue = _mm256_mul_ps(iFloatValue, invScalar); - _mm256_store_ps(iBufferPtr, iFloatValue); - iBufferPtr += 8; - - qIntVal = _mm256_cvtepi8_epi32(_mm256_castsi256_si128(qComplexVal)); - qFloatValue = _mm256_cvtepi32_ps(qIntVal); - qFloatValue = _mm256_mul_ps(qFloatValue, invScalar); - _mm256_store_ps(qBufferPtr, qFloatValue); - qBufferPtr += 8; - - qComplexVal = _mm256_permute4x64_epi64(qComplexVal, 0b11000110); - qIntVal = _mm256_cvtepi8_epi32(_mm256_castsi256_si128(qComplexVal)); - qFloatValue = _mm256_cvtepi32_ps(qIntVal); - qFloatValue = _mm256_mul_ps(qFloatValue, invScalar); - _mm256_store_ps(qBufferPtr, qFloatValue); - qBufferPtr += 8; - } - - number = sixteenthPoints * 16; - for(; number < num_points; number++){ - *iBufferPtr++ = (float)(*complexVectorPtr++) * iScalar; - *qBufferPtr++ = (float)(*complexVectorPtr++) * iScalar; - } - + float* iBufferPtr = iBuffer; + float* qBufferPtr = qBuffer; + + unsigned int number = 0; + const unsigned int sixteenthPoints = num_points / 16; + __m256 iFloatValue, qFloatValue; + + const float iScalar = 1.0 / scalar; + __m256 invScalar = _mm256_set1_ps(iScalar); + __m256i complexVal, iIntVal, qIntVal, iComplexVal, qComplexVal; + int8_t* complexVectorPtr = (int8_t*)complexVector; + + __m256i iMoveMask = _mm256_set_epi8(0x80, + 0x80, + 0x80, + 0x80, + 0x80, + 0x80, + 0x80, + 0x80, + 14, + 12, + 10, + 8, + 6, + 4, + 2, + 0, + 0x80, + 0x80, + 0x80, + 0x80, + 0x80, + 0x80, + 0x80, + 0x80, + 14, + 12, + 10, + 8, + 6, + 4, + 2, + 0); + __m256i qMoveMask = _mm256_set_epi8(0x80, + 0x80, + 0x80, + 0x80, + 0x80, + 0x80, + 0x80, + 0x80, + 15, + 13, + 11, + 9, + 7, + 5, + 3, + 1, + 0x80, + 0x80, + 0x80, + 0x80, + 0x80, + 0x80, + 0x80, + 0x80, + 15, + 13, + 11, + 9, + 7, + 5, + 3, + 1); + + for (; number < sixteenthPoints; number++) { + complexVal = _mm256_load_si256((__m256i*)complexVectorPtr); + complexVectorPtr += 32; + iComplexVal = _mm256_shuffle_epi8(complexVal, iMoveMask); + qComplexVal = _mm256_shuffle_epi8(complexVal, qMoveMask); + + iIntVal = _mm256_cvtepi8_epi32(_mm256_castsi256_si128(iComplexVal)); + iFloatValue = _mm256_cvtepi32_ps(iIntVal); + iFloatValue = _mm256_mul_ps(iFloatValue, invScalar); + _mm256_store_ps(iBufferPtr, iFloatValue); + iBufferPtr += 8; + + iComplexVal = _mm256_permute4x64_epi64(iComplexVal, 0b11000110); + iIntVal = _mm256_cvtepi8_epi32(_mm256_castsi256_si128(iComplexVal)); + iFloatValue = _mm256_cvtepi32_ps(iIntVal); + iFloatValue = _mm256_mul_ps(iFloatValue, invScalar); + _mm256_store_ps(iBufferPtr, iFloatValue); + iBufferPtr += 8; + + qIntVal = _mm256_cvtepi8_epi32(_mm256_castsi256_si128(qComplexVal)); + qFloatValue = _mm256_cvtepi32_ps(qIntVal); + qFloatValue = _mm256_mul_ps(qFloatValue, invScalar); + 
_mm256_store_ps(qBufferPtr, qFloatValue); + qBufferPtr += 8; + + qComplexVal = _mm256_permute4x64_epi64(qComplexVal, 0b11000110); + qIntVal = _mm256_cvtepi8_epi32(_mm256_castsi256_si128(qComplexVal)); + qFloatValue = _mm256_cvtepi32_ps(qIntVal); + qFloatValue = _mm256_mul_ps(qFloatValue, invScalar); + _mm256_store_ps(qBufferPtr, qFloatValue); + qBufferPtr += 8; + } + + number = sixteenthPoints * 16; + for (; number < num_points; number++) { + *iBufferPtr++ = (float)(*complexVectorPtr++) * iScalar; + *qBufferPtr++ = (float)(*complexVectorPtr++) * iScalar; + } } #endif /* LV_HAVE_AVX2 */ @@ -262,19 +325,21 @@ volk_8ic_s32f_deinterleave_32f_x2_a_avx2(float* iBuffer, float* qBuffer, const l #ifdef LV_HAVE_GENERIC static inline void -volk_8ic_s32f_deinterleave_32f_x2_generic(float* iBuffer, float* qBuffer, +volk_8ic_s32f_deinterleave_32f_x2_generic(float* iBuffer, + float* qBuffer, const lv_8sc_t* complexVector, - const float scalar, unsigned int num_points) + const float scalar, + unsigned int num_points) { - const int8_t* complexVectorPtr = (const int8_t*)complexVector; - float* iBufferPtr = iBuffer; - float* qBufferPtr = qBuffer; - unsigned int number; - const float invScalar = 1.0 / scalar; - for(number = 0; number < num_points; number++){ - *iBufferPtr++ = (float)(*complexVectorPtr++)*invScalar; - *qBufferPtr++ = (float)(*complexVectorPtr++)*invScalar; - } + const int8_t* complexVectorPtr = (const int8_t*)complexVector; + float* iBufferPtr = iBuffer; + float* qBufferPtr = qBuffer; + unsigned int number; + const float invScalar = 1.0 / scalar; + for (number = 0; number < num_points; number++) { + *iBufferPtr++ = (float)(*complexVectorPtr++) * invScalar; + *qBufferPtr++ = (float)(*complexVectorPtr++) * invScalar; + } } #endif /* LV_HAVE_GENERIC */ @@ -285,75 +350,107 @@ volk_8ic_s32f_deinterleave_32f_x2_generic(float* iBuffer, float* qBuffer, #ifndef INCLUDED_volk_8ic_s32f_deinterleave_32f_x2_u_H #define INCLUDED_volk_8ic_s32f_deinterleave_32f_x2_u_H -#include #include #include +#include #ifdef LV_HAVE_AVX2 #include -static inline void -volk_8ic_s32f_deinterleave_32f_x2_u_avx2(float* iBuffer, float* qBuffer, const lv_8sc_t* complexVector, - const float scalar, unsigned int num_points) +static inline void volk_8ic_s32f_deinterleave_32f_x2_u_avx2(float* iBuffer, + float* qBuffer, + const lv_8sc_t* complexVector, + const float scalar, + unsigned int num_points) { - float* iBufferPtr = iBuffer; - float* qBufferPtr = qBuffer; - - unsigned int number = 0; - const unsigned int sixteenthPoints = num_points / 16; - __m256 iFloatValue, qFloatValue; - - const float iScalar= 1.0 / scalar; - __m256 invScalar = _mm256_set1_ps(iScalar); - __m256i complexVal, iIntVal, qIntVal; - __m128i iComplexVal, qComplexVal; - int8_t* complexVectorPtr = (int8_t*)complexVector; - - __m256i MoveMask = _mm256_set_epi8(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, - 6, 4, 2, 0,15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0); - - for(;number < sixteenthPoints; number++){ - complexVal = _mm256_loadu_si256((__m256i*)complexVectorPtr); complexVectorPtr += 32; - complexVal = _mm256_shuffle_epi8(complexVal, MoveMask); - complexVal = _mm256_permute4x64_epi64(complexVal,0xd8); - iComplexVal = _mm256_extractf128_si256(complexVal,0); - qComplexVal = _mm256_extractf128_si256(complexVal,1); - - iIntVal = _mm256_cvtepi8_epi32(iComplexVal); - iFloatValue = _mm256_cvtepi32_ps(iIntVal); - iFloatValue = _mm256_mul_ps(iFloatValue, invScalar); - _mm256_storeu_ps(iBufferPtr, iFloatValue); - iBufferPtr += 8; - - qIntVal = 
_mm256_cvtepi8_epi32(qComplexVal); - qFloatValue = _mm256_cvtepi32_ps(qIntVal); - qFloatValue = _mm256_mul_ps(qFloatValue, invScalar); - _mm256_storeu_ps(qBufferPtr, qFloatValue); - qBufferPtr += 8; - - complexVal = _mm256_srli_si256(complexVal, 8); - iComplexVal = _mm256_extractf128_si256(complexVal,0); - qComplexVal = _mm256_extractf128_si256(complexVal,1); - - iIntVal = _mm256_cvtepi8_epi32(iComplexVal); - iFloatValue = _mm256_cvtepi32_ps(iIntVal); - iFloatValue = _mm256_mul_ps(iFloatValue, invScalar); - _mm256_storeu_ps(iBufferPtr, iFloatValue); - iBufferPtr += 8; - - qIntVal = _mm256_cvtepi8_epi32(qComplexVal); - qFloatValue = _mm256_cvtepi32_ps(qIntVal); - qFloatValue = _mm256_mul_ps(qFloatValue, invScalar); - _mm256_storeu_ps(qBufferPtr, qFloatValue); - qBufferPtr += 8; - } - - number = sixteenthPoints * 16; - for(; number < num_points; number++){ - *iBufferPtr++ = (float)(*complexVectorPtr++) * iScalar; - *qBufferPtr++ = (float)(*complexVectorPtr++) * iScalar; - } - + float* iBufferPtr = iBuffer; + float* qBufferPtr = qBuffer; + + unsigned int number = 0; + const unsigned int sixteenthPoints = num_points / 16; + __m256 iFloatValue, qFloatValue; + + const float iScalar = 1.0 / scalar; + __m256 invScalar = _mm256_set1_ps(iScalar); + __m256i complexVal, iIntVal, qIntVal; + __m128i iComplexVal, qComplexVal; + int8_t* complexVectorPtr = (int8_t*)complexVector; + + __m256i MoveMask = _mm256_set_epi8(15, + 13, + 11, + 9, + 7, + 5, + 3, + 1, + 14, + 12, + 10, + 8, + 6, + 4, + 2, + 0, + 15, + 13, + 11, + 9, + 7, + 5, + 3, + 1, + 14, + 12, + 10, + 8, + 6, + 4, + 2, + 0); + + for (; number < sixteenthPoints; number++) { + complexVal = _mm256_loadu_si256((__m256i*)complexVectorPtr); + complexVectorPtr += 32; + complexVal = _mm256_shuffle_epi8(complexVal, MoveMask); + complexVal = _mm256_permute4x64_epi64(complexVal, 0xd8); + iComplexVal = _mm256_extractf128_si256(complexVal, 0); + qComplexVal = _mm256_extractf128_si256(complexVal, 1); + + iIntVal = _mm256_cvtepi8_epi32(iComplexVal); + iFloatValue = _mm256_cvtepi32_ps(iIntVal); + iFloatValue = _mm256_mul_ps(iFloatValue, invScalar); + _mm256_storeu_ps(iBufferPtr, iFloatValue); + iBufferPtr += 8; + + qIntVal = _mm256_cvtepi8_epi32(qComplexVal); + qFloatValue = _mm256_cvtepi32_ps(qIntVal); + qFloatValue = _mm256_mul_ps(qFloatValue, invScalar); + _mm256_storeu_ps(qBufferPtr, qFloatValue); + qBufferPtr += 8; + + complexVal = _mm256_srli_si256(complexVal, 8); + iComplexVal = _mm256_extractf128_si256(complexVal, 0); + qComplexVal = _mm256_extractf128_si256(complexVal, 1); + + iIntVal = _mm256_cvtepi8_epi32(iComplexVal); + iFloatValue = _mm256_cvtepi32_ps(iIntVal); + iFloatValue = _mm256_mul_ps(iFloatValue, invScalar); + _mm256_storeu_ps(iBufferPtr, iFloatValue); + iBufferPtr += 8; + + qIntVal = _mm256_cvtepi8_epi32(qComplexVal); + qFloatValue = _mm256_cvtepi32_ps(qIntVal); + qFloatValue = _mm256_mul_ps(qFloatValue, invScalar); + _mm256_storeu_ps(qBufferPtr, qFloatValue); + qBufferPtr += 8; + } + + number = sixteenthPoints * 16; + for (; number < num_points; number++) { + *iBufferPtr++ = (float)(*complexVectorPtr++) * iScalar; + *qBufferPtr++ = (float)(*complexVectorPtr++) * iScalar; + } } #endif /* LV_HAVE_AVX2 */ diff --git a/kernels/volk/volk_8ic_s32f_deinterleave_real_32f.h b/kernels/volk/volk_8ic_s32f_deinterleave_real_32f.h index 0c85ee9..4c1afe7 100644 --- a/kernels/volk/volk_8ic_s32f_deinterleave_real_32f.h +++ b/kernels/volk/volk_8ic_s32f_deinterleave_real_32f.h @@ -31,8 +31,8 @@ * * Dispatcher Prototype * \code - * void 
volk_8ic_s32f_deinterleave_real_32f(float* iBuffer, const lv_8sc_t* complexVector, const float scalar, unsigned int num_points) - * \endcode + * void volk_8ic_s32f_deinterleave_real_32f(float* iBuffer, const lv_8sc_t* complexVector, + * const float scalar, unsigned int num_points) \endcode * * \b Inputs * \li complexVector: The complex input vector. @@ -55,57 +55,86 @@ #ifndef INCLUDED_volk_8ic_s32f_deinterleave_real_32f_a_H #define INCLUDED_volk_8ic_s32f_deinterleave_real_32f_a_H -#include #include #include +#include #ifdef LV_HAVE_AVX2 #include static inline void -volk_8ic_s32f_deinterleave_real_32f_a_avx2(float* iBuffer, const lv_8sc_t* complexVector, - const float scalar, unsigned int num_points) +volk_8ic_s32f_deinterleave_real_32f_a_avx2(float* iBuffer, + const lv_8sc_t* complexVector, + const float scalar, + unsigned int num_points) { - float* iBufferPtr = iBuffer; - - unsigned int number = 0; - const unsigned int sixteenthPoints = num_points / 16; - __m256 iFloatValue; - - const float iScalar= 1.0 / scalar; - __m256 invScalar = _mm256_set1_ps(iScalar); - __m256i complexVal, iIntVal; - int8_t* complexVectorPtr = (int8_t*)complexVector; - - __m256i moveMask = _mm256_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 14, 12, 10, 8, 6, 4, 2, 0, - 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 14, 12, 10, 8, 6, 4, 2, 0); - for(;number < sixteenthPoints; number++){ - complexVal = _mm256_load_si256((__m256i*)complexVectorPtr); - complexVectorPtr += 32; - complexVal = _mm256_shuffle_epi8(complexVal, moveMask); - - iIntVal = _mm256_cvtepi8_epi32(_mm256_castsi256_si128(complexVal)); - iFloatValue = _mm256_cvtepi32_ps(iIntVal); - iFloatValue = _mm256_mul_ps(iFloatValue, invScalar); - _mm256_store_ps(iBufferPtr, iFloatValue); - iBufferPtr += 8; - - complexVal = _mm256_permute4x64_epi64(complexVal, 0b11000110); - iIntVal = _mm256_cvtepi8_epi32(_mm256_castsi256_si128(complexVal)); - iFloatValue = _mm256_cvtepi32_ps(iIntVal); - iFloatValue = _mm256_mul_ps(iFloatValue, invScalar); - _mm256_store_ps(iBufferPtr, iFloatValue); - iBufferPtr += 8; - } - - number = sixteenthPoints * 16; - for(; number < num_points; number++){ - *iBufferPtr++ = (float)(*complexVectorPtr++) * iScalar; - complexVectorPtr++; - } - + float* iBufferPtr = iBuffer; + + unsigned int number = 0; + const unsigned int sixteenthPoints = num_points / 16; + __m256 iFloatValue; + + const float iScalar = 1.0 / scalar; + __m256 invScalar = _mm256_set1_ps(iScalar); + __m256i complexVal, iIntVal; + int8_t* complexVectorPtr = (int8_t*)complexVector; + + __m256i moveMask = _mm256_set_epi8(0x80, + 0x80, + 0x80, + 0x80, + 0x80, + 0x80, + 0x80, + 0x80, + 14, + 12, + 10, + 8, + 6, + 4, + 2, + 0, + 0x80, + 0x80, + 0x80, + 0x80, + 0x80, + 0x80, + 0x80, + 0x80, + 14, + 12, + 10, + 8, + 6, + 4, + 2, + 0); + for (; number < sixteenthPoints; number++) { + complexVal = _mm256_load_si256((__m256i*)complexVectorPtr); + complexVectorPtr += 32; + complexVal = _mm256_shuffle_epi8(complexVal, moveMask); + + iIntVal = _mm256_cvtepi8_epi32(_mm256_castsi256_si128(complexVal)); + iFloatValue = _mm256_cvtepi32_ps(iIntVal); + iFloatValue = _mm256_mul_ps(iFloatValue, invScalar); + _mm256_store_ps(iBufferPtr, iFloatValue); + iBufferPtr += 8; + + complexVal = _mm256_permute4x64_epi64(complexVal, 0b11000110); + iIntVal = _mm256_cvtepi8_epi32(_mm256_castsi256_si128(complexVal)); + iFloatValue = _mm256_cvtepi32_ps(iIntVal); + iFloatValue = _mm256_mul_ps(iFloatValue, invScalar); + _mm256_store_ps(iBufferPtr, iFloatValue); + iBufferPtr += 8; + } + + number = 
sixteenthPoints * 16; + for (; number < num_points; number++) { + *iBufferPtr++ = (float)(*complexVectorPtr++) * iScalar; + complexVectorPtr++; + } } #endif /* LV_HAVE_AVX2 */ @@ -114,52 +143,55 @@ volk_8ic_s32f_deinterleave_real_32f_a_avx2(float* iBuffer, const lv_8sc_t* compl #include static inline void -volk_8ic_s32f_deinterleave_real_32f_a_sse4_1(float* iBuffer, const lv_8sc_t* complexVector, - const float scalar, unsigned int num_points) +volk_8ic_s32f_deinterleave_real_32f_a_sse4_1(float* iBuffer, + const lv_8sc_t* complexVector, + const float scalar, + unsigned int num_points) { - float* iBufferPtr = iBuffer; - - unsigned int number = 0; - const unsigned int eighthPoints = num_points / 8; - __m128 iFloatValue; + float* iBufferPtr = iBuffer; - const float iScalar= 1.0 / scalar; - __m128 invScalar = _mm_set_ps1(iScalar); - __m128i complexVal, iIntVal; - int8_t* complexVectorPtr = (int8_t*)complexVector; + unsigned int number = 0; + const unsigned int eighthPoints = num_points / 8; + __m128 iFloatValue; - __m128i moveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0); + const float iScalar = 1.0 / scalar; + __m128 invScalar = _mm_set_ps1(iScalar); + __m128i complexVal, iIntVal; + int8_t* complexVectorPtr = (int8_t*)complexVector; - for(;number < eighthPoints; number++){ - complexVal = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; - complexVal = _mm_shuffle_epi8(complexVal, moveMask); + __m128i moveMask = _mm_set_epi8( + 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0); - iIntVal = _mm_cvtepi8_epi32(complexVal); - iFloatValue = _mm_cvtepi32_ps(iIntVal); + for (; number < eighthPoints; number++) { + complexVal = _mm_load_si128((__m128i*)complexVectorPtr); + complexVectorPtr += 16; + complexVal = _mm_shuffle_epi8(complexVal, moveMask); - iFloatValue = _mm_mul_ps(iFloatValue, invScalar); + iIntVal = _mm_cvtepi8_epi32(complexVal); + iFloatValue = _mm_cvtepi32_ps(iIntVal); - _mm_store_ps(iBufferPtr, iFloatValue); + iFloatValue = _mm_mul_ps(iFloatValue, invScalar); - iBufferPtr += 4; + _mm_store_ps(iBufferPtr, iFloatValue); - complexVal = _mm_srli_si128(complexVal, 4); - iIntVal = _mm_cvtepi8_epi32(complexVal); - iFloatValue = _mm_cvtepi32_ps(iIntVal); + iBufferPtr += 4; - iFloatValue = _mm_mul_ps(iFloatValue, invScalar); + complexVal = _mm_srli_si128(complexVal, 4); + iIntVal = _mm_cvtepi8_epi32(complexVal); + iFloatValue = _mm_cvtepi32_ps(iIntVal); - _mm_store_ps(iBufferPtr, iFloatValue); + iFloatValue = _mm_mul_ps(iFloatValue, invScalar); - iBufferPtr += 4; - } + _mm_store_ps(iBufferPtr, iFloatValue); - number = eighthPoints * 8; - for(; number < num_points; number++){ - *iBufferPtr++ = (float)(*complexVectorPtr++) * iScalar; - complexVectorPtr++; - } + iBufferPtr += 4; + } + number = eighthPoints * 8; + for (; number < num_points; number++) { + *iBufferPtr++ = (float)(*complexVectorPtr++) * iScalar; + complexVectorPtr++; + } } #endif /* LV_HAVE_SSE4_1 */ @@ -168,42 +200,47 @@ volk_8ic_s32f_deinterleave_real_32f_a_sse4_1(float* iBuffer, const lv_8sc_t* com #include static inline void -volk_8ic_s32f_deinterleave_real_32f_a_sse(float* iBuffer, const lv_8sc_t* complexVector, - const float scalar, unsigned int num_points) +volk_8ic_s32f_deinterleave_real_32f_a_sse(float* iBuffer, + const lv_8sc_t* complexVector, + const float scalar, + unsigned int num_points) { - float* iBufferPtr = iBuffer; - - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - __m128 iValue; + float* 
iBufferPtr = iBuffer; - const float iScalar= 1.0 / scalar; - __m128 invScalar = _mm_set_ps1(iScalar); - int8_t* complexVectorPtr = (int8_t*)complexVector; + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + __m128 iValue; - __VOLK_ATTR_ALIGNED(16) float floatBuffer[4]; + const float iScalar = 1.0 / scalar; + __m128 invScalar = _mm_set_ps1(iScalar); + int8_t* complexVectorPtr = (int8_t*)complexVector; - for(;number < quarterPoints; number++){ - floatBuffer[0] = (float)(*complexVectorPtr); complexVectorPtr += 2; - floatBuffer[1] = (float)(*complexVectorPtr); complexVectorPtr += 2; - floatBuffer[2] = (float)(*complexVectorPtr); complexVectorPtr += 2; - floatBuffer[3] = (float)(*complexVectorPtr); complexVectorPtr += 2; + __VOLK_ATTR_ALIGNED(16) float floatBuffer[4]; - iValue = _mm_load_ps(floatBuffer); + for (; number < quarterPoints; number++) { + floatBuffer[0] = (float)(*complexVectorPtr); + complexVectorPtr += 2; + floatBuffer[1] = (float)(*complexVectorPtr); + complexVectorPtr += 2; + floatBuffer[2] = (float)(*complexVectorPtr); + complexVectorPtr += 2; + floatBuffer[3] = (float)(*complexVectorPtr); + complexVectorPtr += 2; - iValue = _mm_mul_ps(iValue, invScalar); + iValue = _mm_load_ps(floatBuffer); - _mm_store_ps(iBufferPtr, iValue); + iValue = _mm_mul_ps(iValue, invScalar); - iBufferPtr += 4; - } + _mm_store_ps(iBufferPtr, iValue); - number = quarterPoints * 4; - for(; number < num_points; number++){ - *iBufferPtr++ = (float)(*complexVectorPtr++) * iScalar; - complexVectorPtr++; - } + iBufferPtr += 4; + } + number = quarterPoints * 4; + for (; number < num_points; number++) { + *iBufferPtr++ = (float)(*complexVectorPtr++) * iScalar; + complexVectorPtr++; + } } #endif /* LV_HAVE_SSE */ @@ -211,83 +248,117 @@ volk_8ic_s32f_deinterleave_real_32f_a_sse(float* iBuffer, const lv_8sc_t* comple #ifdef LV_HAVE_GENERIC static inline void -volk_8ic_s32f_deinterleave_real_32f_generic(float* iBuffer, const lv_8sc_t* complexVector, - const float scalar, unsigned int num_points) +volk_8ic_s32f_deinterleave_real_32f_generic(float* iBuffer, + const lv_8sc_t* complexVector, + const float scalar, + unsigned int num_points) { - unsigned int number = 0; - const int8_t* complexVectorPtr = (const int8_t*)complexVector; - float* iBufferPtr = iBuffer; - const float invScalar = 1.0 / scalar; - for(number = 0; number < num_points; number++){ - *iBufferPtr++ = ((float)(*complexVectorPtr++)) * invScalar; - complexVectorPtr++; - } + unsigned int number = 0; + const int8_t* complexVectorPtr = (const int8_t*)complexVector; + float* iBufferPtr = iBuffer; + const float invScalar = 1.0 / scalar; + for (number = 0; number < num_points; number++) { + *iBufferPtr++ = ((float)(*complexVectorPtr++)) * invScalar; + complexVectorPtr++; + } } #endif /* LV_HAVE_GENERIC */ - #endif /* INCLUDED_volk_8ic_s32f_deinterleave_real_32f_a_H */ #ifndef INCLUDED_volk_8ic_s32f_deinterleave_real_32f_u_H #define INCLUDED_volk_8ic_s32f_deinterleave_real_32f_u_H -#include #include #include +#include #ifdef LV_HAVE_AVX2 #include static inline void -volk_8ic_s32f_deinterleave_real_32f_u_avx2(float* iBuffer, const lv_8sc_t* complexVector, - const float scalar, unsigned int num_points) +volk_8ic_s32f_deinterleave_real_32f_u_avx2(float* iBuffer, + const lv_8sc_t* complexVector, + const float scalar, + unsigned int num_points) { - float* iBufferPtr = iBuffer; - - unsigned int number = 0; - const unsigned int sixteenthPoints = num_points / 16; - __m256 iFloatValue; - - const float iScalar= 1.0 / scalar; - __m256 
invScalar = _mm256_set1_ps(iScalar); - __m256i complexVal, iIntVal; - __m128i hcomplexVal; - int8_t* complexVectorPtr = (int8_t*)complexVector; - - __m256i moveMask = _mm256_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0); - - for(;number < sixteenthPoints; number++){ - complexVal = _mm256_loadu_si256((__m256i*)complexVectorPtr); complexVectorPtr += 32; - complexVal = _mm256_shuffle_epi8(complexVal, moveMask); - - hcomplexVal = _mm256_extracti128_si256(complexVal,0); - iIntVal = _mm256_cvtepi8_epi32(hcomplexVal); - iFloatValue = _mm256_cvtepi32_ps(iIntVal); - - iFloatValue = _mm256_mul_ps(iFloatValue, invScalar); - - _mm256_storeu_ps(iBufferPtr, iFloatValue); - - iBufferPtr += 8; - - hcomplexVal = _mm256_extracti128_si256(complexVal,1); - iIntVal = _mm256_cvtepi8_epi32(hcomplexVal); - iFloatValue = _mm256_cvtepi32_ps(iIntVal); - - iFloatValue = _mm256_mul_ps(iFloatValue, invScalar); - - _mm256_storeu_ps(iBufferPtr, iFloatValue); - - iBufferPtr += 8; - } - - number = sixteenthPoints * 16; - for(; number < num_points; number++){ - *iBufferPtr++ = (float)(*complexVectorPtr++) * iScalar; - complexVectorPtr++; - } - + float* iBufferPtr = iBuffer; + + unsigned int number = 0; + const unsigned int sixteenthPoints = num_points / 16; + __m256 iFloatValue; + + const float iScalar = 1.0 / scalar; + __m256 invScalar = _mm256_set1_ps(iScalar); + __m256i complexVal, iIntVal; + __m128i hcomplexVal; + int8_t* complexVectorPtr = (int8_t*)complexVector; + + __m256i moveMask = _mm256_set_epi8(0x80, + 0x80, + 0x80, + 0x80, + 0x80, + 0x80, + 0x80, + 0x80, + 14, + 12, + 10, + 8, + 6, + 4, + 2, + 0, + 0x80, + 0x80, + 0x80, + 0x80, + 0x80, + 0x80, + 0x80, + 0x80, + 14, + 12, + 10, + 8, + 6, + 4, + 2, + 0); + + for (; number < sixteenthPoints; number++) { + complexVal = _mm256_loadu_si256((__m256i*)complexVectorPtr); + complexVectorPtr += 32; + complexVal = _mm256_shuffle_epi8(complexVal, moveMask); + + hcomplexVal = _mm256_extracti128_si256(complexVal, 0); + iIntVal = _mm256_cvtepi8_epi32(hcomplexVal); + iFloatValue = _mm256_cvtepi32_ps(iIntVal); + + iFloatValue = _mm256_mul_ps(iFloatValue, invScalar); + + _mm256_storeu_ps(iBufferPtr, iFloatValue); + + iBufferPtr += 8; + + hcomplexVal = _mm256_extracti128_si256(complexVal, 1); + iIntVal = _mm256_cvtepi8_epi32(hcomplexVal); + iFloatValue = _mm256_cvtepi32_ps(iIntVal); + + iFloatValue = _mm256_mul_ps(iFloatValue, invScalar); + + _mm256_storeu_ps(iBufferPtr, iFloatValue); + + iBufferPtr += 8; + } + + number = sixteenthPoints * 16; + for (; number < num_points; number++) { + *iBufferPtr++ = (float)(*complexVectorPtr++) * iScalar; + complexVectorPtr++; + } } #endif /* LV_HAVE_AVX2 */ diff --git a/kernels/volk/volk_8ic_x2_multiply_conjugate_16ic.h b/kernels/volk/volk_8ic_x2_multiply_conjugate_16ic.h index 6762658..7f9fd96 100644 --- a/kernels/volk/volk_8ic_x2_multiply_conjugate_16ic.h +++ b/kernels/volk/volk_8ic_x2_multiply_conjugate_16ic.h @@ -30,64 +30,73 @@ #ifdef LV_HAVE_AVX2 #include /*! 
- \brief Multiplys the one complex vector with the complex conjugate of the second complex vector and stores their results in the third vector - \param cVector The complex vector where the results will be stored - \param aVector One of the complex vectors to be multiplied - \param bVector The complex vector which will be converted to complex conjugate and multiplied - \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector + \brief Multiplys the one complex vector with the complex conjugate of the second complex + vector and stores their results in the third vector \param cVector The complex vector + where the results will be stored \param aVector One of the complex vectors to be + multiplied \param bVector The complex vector which will be converted to complex + conjugate and multiplied \param num_points The number of complex values in aVector and + bVector to be multiplied together and stored into cVector */ -static inline void volk_8ic_x2_multiply_conjugate_16ic_a_avx2(lv_16sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, unsigned int num_points){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 8; - - __m256i x, y, realz, imagz; - lv_16sc_t* c = cVector; - const lv_8sc_t* a = aVector; - const lv_8sc_t* b = bVector; - __m256i conjugateSign = _mm256_set_epi16(-1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1); - - for(;number < quarterPoints; number++){ - // Convert 8 bit values into 16 bit values - x = _mm256_cvtepi8_epi16(_mm_load_si128((__m128i*)a)); - y = _mm256_cvtepi8_epi16(_mm_load_si128((__m128i*)b)); - - // Calculate the ar*cr - ai*(-ci) portions - realz = _mm256_madd_epi16(x,y); - - // Calculate the complex conjugate of the cr + ci j values - y = _mm256_sign_epi16(y, conjugateSign); - - // Shift the order of the cr and ci values - y = _mm256_shufflehi_epi16(_mm256_shufflelo_epi16(y, _MM_SHUFFLE(2,3,0,1) ), _MM_SHUFFLE(2,3,0,1)); - - // Calculate the ar*(-ci) + cr*(ai) - imagz = _mm256_madd_epi16(x,y); - - // Perform the addition of products - - _mm256_store_si256((__m256i*)c, _mm256_packs_epi32(_mm256_unpacklo_epi32(realz, imagz), _mm256_unpackhi_epi32(realz, imagz))); - - a += 8; - b += 8; - c += 8; - } - - number = quarterPoints * 8; - int16_t* c16Ptr = (int16_t*)&cVector[number]; - int8_t* a8Ptr = (int8_t*)&aVector[number]; - int8_t* b8Ptr = (int8_t*)&bVector[number]; - for(; number < num_points; number++){ - float aReal = (float)*a8Ptr++; - float aImag = (float)*a8Ptr++; - lv_32fc_t aVal = lv_cmake(aReal, aImag ); - float bReal = (float)*b8Ptr++; - float bImag = (float)*b8Ptr++; - lv_32fc_t bVal = lv_cmake( bReal, -bImag ); - lv_32fc_t temp = aVal * bVal; - - *c16Ptr++ = (int16_t)lv_creal(temp); - *c16Ptr++ = (int16_t)lv_cimag(temp); - } +static inline void volk_8ic_x2_multiply_conjugate_16ic_a_avx2(lv_16sc_t* cVector, + const lv_8sc_t* aVector, + const lv_8sc_t* bVector, + unsigned int num_points) +{ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 8; + + __m256i x, y, realz, imagz; + lv_16sc_t* c = cVector; + const lv_8sc_t* a = aVector; + const lv_8sc_t* b = bVector; + __m256i conjugateSign = + _mm256_set_epi16(-1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1); + + for (; number < quarterPoints; number++) { + // Convert 8 bit values into 16 bit values + x = _mm256_cvtepi8_epi16(_mm_load_si128((__m128i*)a)); + y = _mm256_cvtepi8_epi16(_mm_load_si128((__m128i*)b)); + + // Calculate the ar*cr - ai*(-ci) portions + realz = 
_mm256_madd_epi16(x, y); + + // Calculate the complex conjugate of the cr + ci j values + y = _mm256_sign_epi16(y, conjugateSign); + + // Shift the order of the cr and ci values + y = _mm256_shufflehi_epi16(_mm256_shufflelo_epi16(y, _MM_SHUFFLE(2, 3, 0, 1)), + _MM_SHUFFLE(2, 3, 0, 1)); + + // Calculate the ar*(-ci) + cr*(ai) + imagz = _mm256_madd_epi16(x, y); + + // Perform the addition of products + + _mm256_store_si256((__m256i*)c, + _mm256_packs_epi32(_mm256_unpacklo_epi32(realz, imagz), + _mm256_unpackhi_epi32(realz, imagz))); + + a += 8; + b += 8; + c += 8; + } + + number = quarterPoints * 8; + int16_t* c16Ptr = (int16_t*)&cVector[number]; + int8_t* a8Ptr = (int8_t*)&aVector[number]; + int8_t* b8Ptr = (int8_t*)&bVector[number]; + for (; number < num_points; number++) { + float aReal = (float)*a8Ptr++; + float aImag = (float)*a8Ptr++; + lv_32fc_t aVal = lv_cmake(aReal, aImag); + float bReal = (float)*b8Ptr++; + float bImag = (float)*b8Ptr++; + lv_32fc_t bVal = lv_cmake(bReal, -bImag); + lv_32fc_t temp = aVal * bVal; + + *c16Ptr++ = (int16_t)lv_creal(temp); + *c16Ptr++ = (int16_t)lv_cimag(temp); + } } #endif /* LV_HAVE_AVX2 */ @@ -95,90 +104,103 @@ static inline void volk_8ic_x2_multiply_conjugate_16ic_a_avx2(lv_16sc_t* cVector #ifdef LV_HAVE_SSE4_1 #include /*! - \brief Multiplys the one complex vector with the complex conjugate of the second complex vector and stores their results in the third vector - \param cVector The complex vector where the results will be stored - \param aVector One of the complex vectors to be multiplied - \param bVector The complex vector which will be converted to complex conjugate and multiplied - \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector + \brief Multiplys the one complex vector with the complex conjugate of the second complex + vector and stores their results in the third vector \param cVector The complex vector + where the results will be stored \param aVector One of the complex vectors to be + multiplied \param bVector The complex vector which will be converted to complex + conjugate and multiplied \param num_points The number of complex values in aVector and + bVector to be multiplied together and stored into cVector */ -static inline void volk_8ic_x2_multiply_conjugate_16ic_a_sse4_1(lv_16sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, unsigned int num_points){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - __m128i x, y, realz, imagz; - lv_16sc_t* c = cVector; - const lv_8sc_t* a = aVector; - const lv_8sc_t* b = bVector; - __m128i conjugateSign = _mm_set_epi16(-1, 1, -1, 1, -1, 1, -1, 1); - - for(;number < quarterPoints; number++){ - // Convert into 8 bit values into 16 bit values - x = _mm_cvtepi8_epi16(_mm_loadl_epi64((__m128i*)a)); - y = _mm_cvtepi8_epi16(_mm_loadl_epi64((__m128i*)b)); - - // Calculate the ar*cr - ai*(-ci) portions - realz = _mm_madd_epi16(x,y); - - // Calculate the complex conjugate of the cr + ci j values - y = _mm_sign_epi16(y, conjugateSign); - - // Shift the order of the cr and ci values - y = _mm_shufflehi_epi16(_mm_shufflelo_epi16(y, _MM_SHUFFLE(2,3,0,1) ), _MM_SHUFFLE(2,3,0,1)); - - // Calculate the ar*(-ci) + cr*(ai) - imagz = _mm_madd_epi16(x,y); - - _mm_store_si128((__m128i*)c, _mm_packs_epi32(_mm_unpacklo_epi32(realz, imagz), _mm_unpackhi_epi32(realz, imagz))); - - a += 4; - b += 4; - c += 4; - } - - number = quarterPoints * 4; - int16_t* c16Ptr = (int16_t*)&cVector[number]; - int8_t* a8Ptr = 
(int8_t*)&aVector[number]; - int8_t* b8Ptr = (int8_t*)&bVector[number]; - for(; number < num_points; number++){ - float aReal = (float)*a8Ptr++; - float aImag = (float)*a8Ptr++; - lv_32fc_t aVal = lv_cmake(aReal, aImag ); - float bReal = (float)*b8Ptr++; - float bImag = (float)*b8Ptr++; - lv_32fc_t bVal = lv_cmake( bReal, -bImag ); - lv_32fc_t temp = aVal * bVal; - - *c16Ptr++ = (int16_t)lv_creal(temp); - *c16Ptr++ = (int16_t)lv_cimag(temp); - } +static inline void volk_8ic_x2_multiply_conjugate_16ic_a_sse4_1(lv_16sc_t* cVector, + const lv_8sc_t* aVector, + const lv_8sc_t* bVector, + unsigned int num_points) +{ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + __m128i x, y, realz, imagz; + lv_16sc_t* c = cVector; + const lv_8sc_t* a = aVector; + const lv_8sc_t* b = bVector; + __m128i conjugateSign = _mm_set_epi16(-1, 1, -1, 1, -1, 1, -1, 1); + + for (; number < quarterPoints; number++) { + // Convert into 8 bit values into 16 bit values + x = _mm_cvtepi8_epi16(_mm_loadl_epi64((__m128i*)a)); + y = _mm_cvtepi8_epi16(_mm_loadl_epi64((__m128i*)b)); + + // Calculate the ar*cr - ai*(-ci) portions + realz = _mm_madd_epi16(x, y); + + // Calculate the complex conjugate of the cr + ci j values + y = _mm_sign_epi16(y, conjugateSign); + + // Shift the order of the cr and ci values + y = _mm_shufflehi_epi16(_mm_shufflelo_epi16(y, _MM_SHUFFLE(2, 3, 0, 1)), + _MM_SHUFFLE(2, 3, 0, 1)); + + // Calculate the ar*(-ci) + cr*(ai) + imagz = _mm_madd_epi16(x, y); + + _mm_store_si128((__m128i*)c, + _mm_packs_epi32(_mm_unpacklo_epi32(realz, imagz), + _mm_unpackhi_epi32(realz, imagz))); + + a += 4; + b += 4; + c += 4; + } + + number = quarterPoints * 4; + int16_t* c16Ptr = (int16_t*)&cVector[number]; + int8_t* a8Ptr = (int8_t*)&aVector[number]; + int8_t* b8Ptr = (int8_t*)&bVector[number]; + for (; number < num_points; number++) { + float aReal = (float)*a8Ptr++; + float aImag = (float)*a8Ptr++; + lv_32fc_t aVal = lv_cmake(aReal, aImag); + float bReal = (float)*b8Ptr++; + float bImag = (float)*b8Ptr++; + lv_32fc_t bVal = lv_cmake(bReal, -bImag); + lv_32fc_t temp = aVal * bVal; + + *c16Ptr++ = (int16_t)lv_creal(temp); + *c16Ptr++ = (int16_t)lv_cimag(temp); + } } #endif /* LV_HAVE_SSE4_1 */ #ifdef LV_HAVE_GENERIC /*! 
- \brief Multiplys the one complex vector with the complex conjugate of the second complex vector and stores their results in the third vector - \param cVector The complex vector where the results will be stored - \param aVector One of the complex vectors to be multiplied - \param bVector The complex vector which will be converted to complex conjugate and multiplied - \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector + \brief Multiplys the one complex vector with the complex conjugate of the second complex + vector and stores their results in the third vector \param cVector The complex vector + where the results will be stored \param aVector One of the complex vectors to be + multiplied \param bVector The complex vector which will be converted to complex + conjugate and multiplied \param num_points The number of complex values in aVector and + bVector to be multiplied together and stored into cVector */ -static inline void volk_8ic_x2_multiply_conjugate_16ic_generic(lv_16sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, unsigned int num_points){ - unsigned int number = 0; - int16_t* c16Ptr = (int16_t*)cVector; - int8_t* a8Ptr = (int8_t*)aVector; - int8_t* b8Ptr = (int8_t*)bVector; - for(number =0; number < num_points; number++){ - float aReal = (float)*a8Ptr++; - float aImag = (float)*a8Ptr++; - lv_32fc_t aVal = lv_cmake(aReal, aImag ); - float bReal = (float)*b8Ptr++; - float bImag = (float)*b8Ptr++; - lv_32fc_t bVal = lv_cmake( bReal, -bImag ); - lv_32fc_t temp = aVal * bVal; - - *c16Ptr++ = (int16_t)lv_creal(temp); - *c16Ptr++ = (int16_t)lv_cimag(temp); - } +static inline void volk_8ic_x2_multiply_conjugate_16ic_generic(lv_16sc_t* cVector, + const lv_8sc_t* aVector, + const lv_8sc_t* bVector, + unsigned int num_points) +{ + unsigned int number = 0; + int16_t* c16Ptr = (int16_t*)cVector; + int8_t* a8Ptr = (int8_t*)aVector; + int8_t* b8Ptr = (int8_t*)bVector; + for (number = 0; number < num_points; number++) { + float aReal = (float)*a8Ptr++; + float aImag = (float)*a8Ptr++; + lv_32fc_t aVal = lv_cmake(aReal, aImag); + float bReal = (float)*b8Ptr++; + float bImag = (float)*b8Ptr++; + lv_32fc_t bVal = lv_cmake(bReal, -bImag); + lv_32fc_t temp = aVal * bVal; + + *c16Ptr++ = (int16_t)lv_creal(temp); + *c16Ptr++ = (int16_t)lv_cimag(temp); + } } #endif /* LV_HAVE_GENERIC */ @@ -194,64 +216,73 @@ static inline void volk_8ic_x2_multiply_conjugate_16ic_generic(lv_16sc_t* cVecto #ifdef LV_HAVE_AVX2 #include /*! 
- \brief Multiplys the one complex vector with the complex conjugate of the second complex vector and stores their results in the third vector - \param cVector The complex vector where the results will be stored - \param aVector One of the complex vectors to be multiplied - \param bVector The complex vector which will be converted to complex conjugate and multiplied - \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector + \brief Multiplys the one complex vector with the complex conjugate of the second complex + vector and stores their results in the third vector \param cVector The complex vector + where the results will be stored \param aVector One of the complex vectors to be + multiplied \param bVector The complex vector which will be converted to complex + conjugate and multiplied \param num_points The number of complex values in aVector and + bVector to be multiplied together and stored into cVector */ -static inline void volk_8ic_x2_multiply_conjugate_16ic_u_avx2(lv_16sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, unsigned int num_points){ - unsigned int number = 0; - const unsigned int oneEigthPoints = num_points / 8; - - __m256i x, y, realz, imagz; - lv_16sc_t* c = cVector; - const lv_8sc_t* a = aVector; - const lv_8sc_t* b = bVector; - __m256i conjugateSign = _mm256_set_epi16(-1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1); - - for(;number < oneEigthPoints; number++){ - // Convert 8 bit values into 16 bit values - x = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)a)); - y = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)b)); - - // Calculate the ar*cr - ai*(-ci) portions - realz = _mm256_madd_epi16(x,y); - - // Calculate the complex conjugate of the cr + ci j values - y = _mm256_sign_epi16(y, conjugateSign); - - // Shift the order of the cr and ci values - y = _mm256_shufflehi_epi16(_mm256_shufflelo_epi16(y, _MM_SHUFFLE(2,3,0,1) ), _MM_SHUFFLE(2,3,0,1)); - - // Calculate the ar*(-ci) + cr*(ai) - imagz = _mm256_madd_epi16(x,y); - - // Perform the addition of products - - _mm256_storeu_si256((__m256i*)c, _mm256_packs_epi32(_mm256_unpacklo_epi32(realz, imagz), _mm256_unpackhi_epi32(realz, imagz))); - - a += 8; - b += 8; - c += 8; - } - - number = oneEigthPoints * 8; - int16_t* c16Ptr = (int16_t*)&cVector[number]; - int8_t* a8Ptr = (int8_t*)&aVector[number]; - int8_t* b8Ptr = (int8_t*)&bVector[number]; - for(; number < num_points; number++){ - float aReal = (float)*a8Ptr++; - float aImag = (float)*a8Ptr++; - lv_32fc_t aVal = lv_cmake(aReal, aImag ); - float bReal = (float)*b8Ptr++; - float bImag = (float)*b8Ptr++; - lv_32fc_t bVal = lv_cmake( bReal, -bImag ); - lv_32fc_t temp = aVal * bVal; - - *c16Ptr++ = (int16_t)lv_creal(temp); - *c16Ptr++ = (int16_t)lv_cimag(temp); - } +static inline void volk_8ic_x2_multiply_conjugate_16ic_u_avx2(lv_16sc_t* cVector, + const lv_8sc_t* aVector, + const lv_8sc_t* bVector, + unsigned int num_points) +{ + unsigned int number = 0; + const unsigned int oneEigthPoints = num_points / 8; + + __m256i x, y, realz, imagz; + lv_16sc_t* c = cVector; + const lv_8sc_t* a = aVector; + const lv_8sc_t* b = bVector; + __m256i conjugateSign = + _mm256_set_epi16(-1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1); + + for (; number < oneEigthPoints; number++) { + // Convert 8 bit values into 16 bit values + x = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)a)); + y = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)b)); + + // Calculate the ar*cr - ai*(-ci) portions + realz 
= _mm256_madd_epi16(x, y); + + // Calculate the complex conjugate of the cr + ci j values + y = _mm256_sign_epi16(y, conjugateSign); + + // Shift the order of the cr and ci values + y = _mm256_shufflehi_epi16(_mm256_shufflelo_epi16(y, _MM_SHUFFLE(2, 3, 0, 1)), + _MM_SHUFFLE(2, 3, 0, 1)); + + // Calculate the ar*(-ci) + cr*(ai) + imagz = _mm256_madd_epi16(x, y); + + // Perform the addition of products + + _mm256_storeu_si256((__m256i*)c, + _mm256_packs_epi32(_mm256_unpacklo_epi32(realz, imagz), + _mm256_unpackhi_epi32(realz, imagz))); + + a += 8; + b += 8; + c += 8; + } + + number = oneEigthPoints * 8; + int16_t* c16Ptr = (int16_t*)&cVector[number]; + int8_t* a8Ptr = (int8_t*)&aVector[number]; + int8_t* b8Ptr = (int8_t*)&bVector[number]; + for (; number < num_points; number++) { + float aReal = (float)*a8Ptr++; + float aImag = (float)*a8Ptr++; + lv_32fc_t aVal = lv_cmake(aReal, aImag); + float bReal = (float)*b8Ptr++; + float bImag = (float)*b8Ptr++; + lv_32fc_t bVal = lv_cmake(bReal, -bImag); + lv_32fc_t temp = aVal * bVal; + + *c16Ptr++ = (int16_t)lv_creal(temp); + *c16Ptr++ = (int16_t)lv_cimag(temp); + } } #endif /* LV_HAVE_AVX2 */ diff --git a/kernels/volk/volk_8ic_x2_s32f_multiply_conjugate_32fc.h b/kernels/volk/volk_8ic_x2_s32f_multiply_conjugate_32fc.h index 82e40c8..db6bd7a 100644 --- a/kernels/volk/volk_8ic_x2_s32f_multiply_conjugate_32fc.h +++ b/kernels/volk/volk_8ic_x2_s32f_multiply_conjugate_32fc.h @@ -30,14 +30,15 @@ * * Dispatcher Prototype * \code - * void volk_8ic_x2_s32f_multiply_conjugate_32fc(lv_32fc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, const float scalar, unsigned int num_points) - * \endcode + * void volk_8ic_x2_s32f_multiply_conjugate_32fc(lv_32fc_t* cVector, const lv_8sc_t* + * aVector, const lv_8sc_t* bVector, const float scalar, unsigned int num_points) \endcode * * \b Inputs * \li aVector: One of the complex vectors to be multiplied. - * \li bVector: The complex vector which will be converted to complex conjugate and multiplied. - * \li scalar: each output value is scaled by 1/scalar. - * \li num_points: The number of complex values in aVector and bVector to be multiplied together and stored into cVector. + * \li bVector: The complex vector which will be converted to complex conjugate and + * multiplied. \li scalar: each output value is scaled by 1/scalar. \li num_points: The + * number of complex values in aVector and bVector to be multiplied together and stored + * into cVector. * * \b Outputs * \li cVector: The complex vector where the results will be stored. 
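For orientation, a minimal usage sketch of the dispatcher documented above: each output element is a[i] * conj(b[i]) scaled by 1/scalar, exactly the arithmetic the generic fallback in the following hunk performs element by element (the dispatcher selects a SIMD or generic implementation at run time). The buffer length, test values, and the <volk/volk.h> umbrella include are illustrative assumptions, not part of the reformatted sources.

#include <stdio.h>
#include <volk/volk.h>

int main(void)
{
    const unsigned int num_points = 8;
    const float scalar = 64.f; /* outputs are scaled by 1/scalar */

    lv_8sc_t* a =
        (lv_8sc_t*)volk_malloc(sizeof(lv_8sc_t) * num_points, volk_get_alignment());
    lv_8sc_t* b =
        (lv_8sc_t*)volk_malloc(sizeof(lv_8sc_t) * num_points, volk_get_alignment());
    lv_32fc_t* c =
        (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t) * num_points, volk_get_alignment());

    /* Fill the 8-bit complex inputs through int8_t pointers, the same
     * interleaved real/imag access pattern the scalar tail loops in this
     * file use. */
    int8_t* a8 = (int8_t*)a;
    int8_t* b8 = (int8_t*)b;
    unsigned int i;
    for (i = 0; i < num_points; i++) {
        a8[2 * i] = (int8_t)(i + 1); /* real part */
        a8[2 * i + 1] = -2;          /* imaginary part */
        b8[2 * i] = 3;
        b8[2 * i + 1] = 4;
    }

    /* c[i] = a[i] * conj(b[i]) / scalar */
    volk_8ic_x2_s32f_multiply_conjugate_32fc(c, a, b, scalar, num_points);

    for (i = 0; i < num_points; i++) {
        printf("c[%u] = %f + %fj\n", i, lv_creal(c[i]), lv_cimag(c[i]));
    }

    volk_free(a);
    volk_free(b);
    volk_free(c);
    return 0;
}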
@@ -64,160 +65,167 @@ #include static inline void -volk_8ic_x2_s32f_multiply_conjugate_32fc_a_avx2(lv_32fc_t* cVector, const lv_8sc_t* aVector, - const lv_8sc_t* bVector, const float scalar, - unsigned int num_points) +volk_8ic_x2_s32f_multiply_conjugate_32fc_a_avx2(lv_32fc_t* cVector, + const lv_8sc_t* aVector, + const lv_8sc_t* bVector, + const float scalar, + unsigned int num_points) { - unsigned int number = 0; - const unsigned int oneEigthPoints = num_points / 8; - - __m256i x, y, realz, imagz; - __m256 ret, retlo, rethi; - lv_32fc_t* c = cVector; - const lv_8sc_t* a = aVector; - const lv_8sc_t* b = bVector; - __m256i conjugateSign = _mm256_set_epi16(-1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1); - - __m256 invScalar = _mm256_set1_ps(1.0/scalar); - - for(;number < oneEigthPoints; number++){ - // Convert 8 bit values into 16 bit values - x = _mm256_cvtepi8_epi16(_mm_load_si128((__m128i*)a)); - y = _mm256_cvtepi8_epi16(_mm_load_si128((__m128i*)b)); - - // Calculate the ar*cr - ai*(-ci) portions - realz = _mm256_madd_epi16(x,y); - - // Calculate the complex conjugate of the cr + ci j values - y = _mm256_sign_epi16(y, conjugateSign); - - // Shift the order of the cr and ci values - y = _mm256_shufflehi_epi16(_mm256_shufflelo_epi16(y, _MM_SHUFFLE(2,3,0,1) ), _MM_SHUFFLE(2,3,0,1)); - - // Calculate the ar*(-ci) + cr*(ai) - imagz = _mm256_madd_epi16(x,y); - - // Interleave real and imaginary and then convert to float values - retlo = _mm256_cvtepi32_ps(_mm256_unpacklo_epi32(realz, imagz)); - - // Normalize the floating point values - retlo = _mm256_mul_ps(retlo, invScalar); - - // Interleave real and imaginary and then convert to float values - rethi = _mm256_cvtepi32_ps(_mm256_unpackhi_epi32(realz, imagz)); - - // Normalize the floating point values - rethi = _mm256_mul_ps(rethi, invScalar); - - ret = _mm256_permute2f128_ps(retlo, rethi, 0b00100000); - _mm256_store_ps((float*)c, ret); - c += 4; - - ret = _mm256_permute2f128_ps(retlo, rethi, 0b00110001); - _mm256_store_ps((float*)c, ret); - c += 4; - - a += 8; - b += 8; - } - - number = oneEigthPoints * 8; - float* cFloatPtr = (float*)&cVector[number]; - int8_t* a8Ptr = (int8_t*)&aVector[number]; - int8_t* b8Ptr = (int8_t*)&bVector[number]; - for(; number < num_points; number++){ - float aReal = (float)*a8Ptr++; - float aImag = (float)*a8Ptr++; - lv_32fc_t aVal = lv_cmake(aReal, aImag ); - float bReal = (float)*b8Ptr++; - float bImag = (float)*b8Ptr++; - lv_32fc_t bVal = lv_cmake( bReal, -bImag ); - lv_32fc_t temp = aVal * bVal; - - *cFloatPtr++ = lv_creal(temp) / scalar; - *cFloatPtr++ = lv_cimag(temp) / scalar; - } + unsigned int number = 0; + const unsigned int oneEigthPoints = num_points / 8; + + __m256i x, y, realz, imagz; + __m256 ret, retlo, rethi; + lv_32fc_t* c = cVector; + const lv_8sc_t* a = aVector; + const lv_8sc_t* b = bVector; + __m256i conjugateSign = + _mm256_set_epi16(-1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1); + + __m256 invScalar = _mm256_set1_ps(1.0 / scalar); + + for (; number < oneEigthPoints; number++) { + // Convert 8 bit values into 16 bit values + x = _mm256_cvtepi8_epi16(_mm_load_si128((__m128i*)a)); + y = _mm256_cvtepi8_epi16(_mm_load_si128((__m128i*)b)); + + // Calculate the ar*cr - ai*(-ci) portions + realz = _mm256_madd_epi16(x, y); + + // Calculate the complex conjugate of the cr + ci j values + y = _mm256_sign_epi16(y, conjugateSign); + + // Shift the order of the cr and ci values + y = _mm256_shufflehi_epi16(_mm256_shufflelo_epi16(y, _MM_SHUFFLE(2, 3, 0, 1)), + _MM_SHUFFLE(2, 3, 0, 
1)); + + // Calculate the ar*(-ci) + cr*(ai) + imagz = _mm256_madd_epi16(x, y); + + // Interleave real and imaginary and then convert to float values + retlo = _mm256_cvtepi32_ps(_mm256_unpacklo_epi32(realz, imagz)); + + // Normalize the floating point values + retlo = _mm256_mul_ps(retlo, invScalar); + + // Interleave real and imaginary and then convert to float values + rethi = _mm256_cvtepi32_ps(_mm256_unpackhi_epi32(realz, imagz)); + + // Normalize the floating point values + rethi = _mm256_mul_ps(rethi, invScalar); + + ret = _mm256_permute2f128_ps(retlo, rethi, 0b00100000); + _mm256_store_ps((float*)c, ret); + c += 4; + + ret = _mm256_permute2f128_ps(retlo, rethi, 0b00110001); + _mm256_store_ps((float*)c, ret); + c += 4; + + a += 8; + b += 8; + } + + number = oneEigthPoints * 8; + float* cFloatPtr = (float*)&cVector[number]; + int8_t* a8Ptr = (int8_t*)&aVector[number]; + int8_t* b8Ptr = (int8_t*)&bVector[number]; + for (; number < num_points; number++) { + float aReal = (float)*a8Ptr++; + float aImag = (float)*a8Ptr++; + lv_32fc_t aVal = lv_cmake(aReal, aImag); + float bReal = (float)*b8Ptr++; + float bImag = (float)*b8Ptr++; + lv_32fc_t bVal = lv_cmake(bReal, -bImag); + lv_32fc_t temp = aVal * bVal; + + *cFloatPtr++ = lv_creal(temp) / scalar; + *cFloatPtr++ = lv_cimag(temp) / scalar; + } } -#endif /* LV_HAVE_AVX2*/ +#endif /* LV_HAVE_AVX2*/ #ifdef LV_HAVE_SSE4_1 #include static inline void -volk_8ic_x2_s32f_multiply_conjugate_32fc_a_sse4_1(lv_32fc_t* cVector, const lv_8sc_t* aVector, - const lv_8sc_t* bVector, const float scalar, +volk_8ic_x2_s32f_multiply_conjugate_32fc_a_sse4_1(lv_32fc_t* cVector, + const lv_8sc_t* aVector, + const lv_8sc_t* bVector, + const float scalar, unsigned int num_points) { - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - __m128i x, y, realz, imagz; - __m128 ret; - lv_32fc_t* c = cVector; - const lv_8sc_t* a = aVector; - const lv_8sc_t* b = bVector; - __m128i conjugateSign = _mm_set_epi16(-1, 1, -1, 1, -1, 1, -1, 1); - - __m128 invScalar = _mm_set_ps1(1.0/scalar); - - for(;number < quarterPoints; number++){ - // Convert into 8 bit values into 16 bit values - x = _mm_cvtepi8_epi16(_mm_loadl_epi64((__m128i*)a)); - y = _mm_cvtepi8_epi16(_mm_loadl_epi64((__m128i*)b)); - - // Calculate the ar*cr - ai*(-ci) portions - realz = _mm_madd_epi16(x,y); - - // Calculate the complex conjugate of the cr + ci j values - y = _mm_sign_epi16(y, conjugateSign); - - // Shift the order of the cr and ci values - y = _mm_shufflehi_epi16(_mm_shufflelo_epi16(y, _MM_SHUFFLE(2,3,0,1) ), _MM_SHUFFLE(2,3,0,1)); - - // Calculate the ar*(-ci) + cr*(ai) - imagz = _mm_madd_epi16(x,y); - - // Interleave real and imaginary and then convert to float values - ret = _mm_cvtepi32_ps(_mm_unpacklo_epi32(realz, imagz)); - - // Normalize the floating point values - ret = _mm_mul_ps(ret, invScalar); - - // Store the floating point values - _mm_store_ps((float*)c, ret); - c += 2; - - // Interleave real and imaginary and then convert to float values - ret = _mm_cvtepi32_ps(_mm_unpackhi_epi32(realz, imagz)); - - // Normalize the floating point values - ret = _mm_mul_ps(ret, invScalar); - - // Store the floating point values - _mm_store_ps((float*)c, ret); - c += 2; - - a += 4; - b += 4; - } - - number = quarterPoints * 4; - float* cFloatPtr = (float*)&cVector[number]; - int8_t* a8Ptr = (int8_t*)&aVector[number]; - int8_t* b8Ptr = (int8_t*)&bVector[number]; - for(; number < num_points; number++){ - float aReal = (float)*a8Ptr++; - float aImag = (float)*a8Ptr++; - 
lv_32fc_t aVal = lv_cmake(aReal, aImag ); - float bReal = (float)*b8Ptr++; - float bImag = (float)*b8Ptr++; - lv_32fc_t bVal = lv_cmake( bReal, -bImag ); - lv_32fc_t temp = aVal * bVal; - - *cFloatPtr++ = lv_creal(temp) / scalar; - *cFloatPtr++ = lv_cimag(temp) / scalar; - } + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + __m128i x, y, realz, imagz; + __m128 ret; + lv_32fc_t* c = cVector; + const lv_8sc_t* a = aVector; + const lv_8sc_t* b = bVector; + __m128i conjugateSign = _mm_set_epi16(-1, 1, -1, 1, -1, 1, -1, 1); + + __m128 invScalar = _mm_set_ps1(1.0 / scalar); + + for (; number < quarterPoints; number++) { + // Convert into 8 bit values into 16 bit values + x = _mm_cvtepi8_epi16(_mm_loadl_epi64((__m128i*)a)); + y = _mm_cvtepi8_epi16(_mm_loadl_epi64((__m128i*)b)); + + // Calculate the ar*cr - ai*(-ci) portions + realz = _mm_madd_epi16(x, y); + + // Calculate the complex conjugate of the cr + ci j values + y = _mm_sign_epi16(y, conjugateSign); + + // Shift the order of the cr and ci values + y = _mm_shufflehi_epi16(_mm_shufflelo_epi16(y, _MM_SHUFFLE(2, 3, 0, 1)), + _MM_SHUFFLE(2, 3, 0, 1)); + + // Calculate the ar*(-ci) + cr*(ai) + imagz = _mm_madd_epi16(x, y); + + // Interleave real and imaginary and then convert to float values + ret = _mm_cvtepi32_ps(_mm_unpacklo_epi32(realz, imagz)); + + // Normalize the floating point values + ret = _mm_mul_ps(ret, invScalar); + + // Store the floating point values + _mm_store_ps((float*)c, ret); + c += 2; + + // Interleave real and imaginary and then convert to float values + ret = _mm_cvtepi32_ps(_mm_unpackhi_epi32(realz, imagz)); + + // Normalize the floating point values + ret = _mm_mul_ps(ret, invScalar); + + // Store the floating point values + _mm_store_ps((float*)c, ret); + c += 2; + + a += 4; + b += 4; + } + + number = quarterPoints * 4; + float* cFloatPtr = (float*)&cVector[number]; + int8_t* a8Ptr = (int8_t*)&aVector[number]; + int8_t* b8Ptr = (int8_t*)&bVector[number]; + for (; number < num_points; number++) { + float aReal = (float)*a8Ptr++; + float aImag = (float)*a8Ptr++; + lv_32fc_t aVal = lv_cmake(aReal, aImag); + float bReal = (float)*b8Ptr++; + float bImag = (float)*b8Ptr++; + lv_32fc_t bVal = lv_cmake(bReal, -bImag); + lv_32fc_t temp = aVal * bVal; + + *cFloatPtr++ = lv_creal(temp) / scalar; + *cFloatPtr++ = lv_cimag(temp) / scalar; + } } #endif /* LV_HAVE_SSE4_1 */ @@ -225,27 +233,29 @@ volk_8ic_x2_s32f_multiply_conjugate_32fc_a_sse4_1(lv_32fc_t* cVector, const lv_8 #ifdef LV_HAVE_GENERIC static inline void -volk_8ic_x2_s32f_multiply_conjugate_32fc_generic(lv_32fc_t* cVector, const lv_8sc_t* aVector, - const lv_8sc_t* bVector, const float scalar, +volk_8ic_x2_s32f_multiply_conjugate_32fc_generic(lv_32fc_t* cVector, + const lv_8sc_t* aVector, + const lv_8sc_t* bVector, + const float scalar, unsigned int num_points) { - unsigned int number = 0; - float* cPtr = (float*)cVector; - const float invScalar = 1.0 / scalar; - int8_t* a8Ptr = (int8_t*)aVector; - int8_t* b8Ptr = (int8_t*)bVector; - for(number = 0; number < num_points; number++){ - float aReal = (float)*a8Ptr++; - float aImag = (float)*a8Ptr++; - lv_32fc_t aVal = lv_cmake(aReal, aImag ); - float bReal = (float)*b8Ptr++; - float bImag = (float)*b8Ptr++; - lv_32fc_t bVal = lv_cmake( bReal, -bImag ); - lv_32fc_t temp = aVal * bVal; - - *cPtr++ = (lv_creal(temp) * invScalar); - *cPtr++ = (lv_cimag(temp) * invScalar); - } + unsigned int number = 0; + float* cPtr = (float*)cVector; + const float invScalar = 1.0 / scalar; + int8_t* a8Ptr = 
(int8_t*)aVector; + int8_t* b8Ptr = (int8_t*)bVector; + for (number = 0; number < num_points; number++) { + float aReal = (float)*a8Ptr++; + float aImag = (float)*a8Ptr++; + lv_32fc_t aVal = lv_cmake(aReal, aImag); + float bReal = (float)*b8Ptr++; + float bImag = (float)*b8Ptr++; + lv_32fc_t bVal = lv_cmake(bReal, -bImag); + lv_32fc_t temp = aVal * bVal; + + *cPtr++ = (lv_creal(temp) * invScalar); + *cPtr++ = (lv_cimag(temp) * invScalar); + } } #endif /* LV_HAVE_GENERIC */ @@ -263,81 +273,85 @@ volk_8ic_x2_s32f_multiply_conjugate_32fc_generic(lv_32fc_t* cVector, const lv_8s #include static inline void -volk_8ic_x2_s32f_multiply_conjugate_32fc_u_avx2(lv_32fc_t* cVector, const lv_8sc_t* aVector, - const lv_8sc_t* bVector, const float scalar, - unsigned int num_points) +volk_8ic_x2_s32f_multiply_conjugate_32fc_u_avx2(lv_32fc_t* cVector, + const lv_8sc_t* aVector, + const lv_8sc_t* bVector, + const float scalar, + unsigned int num_points) { - unsigned int number = 0; - const unsigned int oneEigthPoints = num_points / 8; - - __m256i x, y, realz, imagz; - __m256 ret, retlo, rethi; - lv_32fc_t* c = cVector; - const lv_8sc_t* a = aVector; - const lv_8sc_t* b = bVector; - __m256i conjugateSign = _mm256_set_epi16(-1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1); - - __m256 invScalar = _mm256_set1_ps(1.0/scalar); - - for(;number < oneEigthPoints; number++){ - // Convert 8 bit values into 16 bit values - x = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)a)); - y = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)b)); - - // Calculate the ar*cr - ai*(-ci) portions - realz = _mm256_madd_epi16(x,y); - - // Calculate the complex conjugate of the cr + ci j values - y = _mm256_sign_epi16(y, conjugateSign); - - // Shift the order of the cr and ci values - y = _mm256_shufflehi_epi16(_mm256_shufflelo_epi16(y, _MM_SHUFFLE(2,3,0,1) ), _MM_SHUFFLE(2,3,0,1)); - - // Calculate the ar*(-ci) + cr*(ai) - imagz = _mm256_madd_epi16(x,y); - - // Interleave real and imaginary and then convert to float values - retlo = _mm256_cvtepi32_ps(_mm256_unpacklo_epi32(realz, imagz)); - - // Normalize the floating point values - retlo = _mm256_mul_ps(retlo, invScalar); - - // Interleave real and imaginary and then convert to float values - rethi = _mm256_cvtepi32_ps(_mm256_unpackhi_epi32(realz, imagz)); - - // Normalize the floating point values - rethi = _mm256_mul_ps(rethi, invScalar); - - ret = _mm256_permute2f128_ps(retlo, rethi, 0b00100000); - _mm256_storeu_ps((float*)c, ret); - c += 4; - - ret = _mm256_permute2f128_ps(retlo, rethi, 0b00110001); - _mm256_storeu_ps((float*)c, ret); - c += 4; - - a += 8; - b += 8; - } - - number = oneEigthPoints * 8; - float* cFloatPtr = (float*)&cVector[number]; - int8_t* a8Ptr = (int8_t*)&aVector[number]; - int8_t* b8Ptr = (int8_t*)&bVector[number]; - for(; number < num_points; number++){ - float aReal = (float)*a8Ptr++; - float aImag = (float)*a8Ptr++; - lv_32fc_t aVal = lv_cmake(aReal, aImag ); - float bReal = (float)*b8Ptr++; - float bImag = (float)*b8Ptr++; - lv_32fc_t bVal = lv_cmake( bReal, -bImag ); - lv_32fc_t temp = aVal * bVal; - - *cFloatPtr++ = lv_creal(temp) / scalar; - *cFloatPtr++ = lv_cimag(temp) / scalar; - } + unsigned int number = 0; + const unsigned int oneEigthPoints = num_points / 8; + + __m256i x, y, realz, imagz; + __m256 ret, retlo, rethi; + lv_32fc_t* c = cVector; + const lv_8sc_t* a = aVector; + const lv_8sc_t* b = bVector; + __m256i conjugateSign = + _mm256_set_epi16(-1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1); + + __m256 invScalar = 
_mm256_set1_ps(1.0 / scalar); + + for (; number < oneEigthPoints; number++) { + // Convert 8 bit values into 16 bit values + x = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)a)); + y = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)b)); + + // Calculate the ar*cr - ai*(-ci) portions + realz = _mm256_madd_epi16(x, y); + + // Calculate the complex conjugate of the cr + ci j values + y = _mm256_sign_epi16(y, conjugateSign); + + // Shift the order of the cr and ci values + y = _mm256_shufflehi_epi16(_mm256_shufflelo_epi16(y, _MM_SHUFFLE(2, 3, 0, 1)), + _MM_SHUFFLE(2, 3, 0, 1)); + + // Calculate the ar*(-ci) + cr*(ai) + imagz = _mm256_madd_epi16(x, y); + + // Interleave real and imaginary and then convert to float values + retlo = _mm256_cvtepi32_ps(_mm256_unpacklo_epi32(realz, imagz)); + + // Normalize the floating point values + retlo = _mm256_mul_ps(retlo, invScalar); + + // Interleave real and imaginary and then convert to float values + rethi = _mm256_cvtepi32_ps(_mm256_unpackhi_epi32(realz, imagz)); + + // Normalize the floating point values + rethi = _mm256_mul_ps(rethi, invScalar); + + ret = _mm256_permute2f128_ps(retlo, rethi, 0b00100000); + _mm256_storeu_ps((float*)c, ret); + c += 4; + + ret = _mm256_permute2f128_ps(retlo, rethi, 0b00110001); + _mm256_storeu_ps((float*)c, ret); + c += 4; + + a += 8; + b += 8; + } + + number = oneEigthPoints * 8; + float* cFloatPtr = (float*)&cVector[number]; + int8_t* a8Ptr = (int8_t*)&aVector[number]; + int8_t* b8Ptr = (int8_t*)&bVector[number]; + for (; number < num_points; number++) { + float aReal = (float)*a8Ptr++; + float aImag = (float)*a8Ptr++; + lv_32fc_t aVal = lv_cmake(aReal, aImag); + float bReal = (float)*b8Ptr++; + float bImag = (float)*b8Ptr++; + lv_32fc_t bVal = lv_cmake(bReal, -bImag); + lv_32fc_t temp = aVal * bVal; + + *cFloatPtr++ = lv_creal(temp) / scalar; + *cFloatPtr++ = lv_cimag(temp) / scalar; + } } -#endif /* LV_HAVE_AVX2*/ +#endif /* LV_HAVE_AVX2*/ #endif /* INCLUDED_volk_8ic_x2_s32f_multiply_conjugate_32fc_u_H */ diff --git a/kernels/volk/volk_8u_conv_k7_r2puppet_8u.h b/kernels/volk/volk_8u_conv_k7_r2puppet_8u.h index 00f83de..69287cd 100644 --- a/kernels/volk/volk_8u_conv_k7_r2puppet_8u.h +++ b/kernels/volk/volk_8u_conv_k7_r2puppet_8u.h @@ -23,21 +23,21 @@ #ifndef INCLUDED_volk_8u_conv_k7_r2puppet_8u_H #define INCLUDED_volk_8u_conv_k7_r2puppet_8u_H +#include #include #include -#include typedef union { - //decision_t is a BIT vector - unsigned char* t; - unsigned int* w; + // decision_t is a BIT vector + unsigned char* t; + unsigned int* w; } p_decision_t; static inline int parity(int x, unsigned char* Partab) { - x ^= (x >> 16); - x ^= (x >> 8); - return Partab[x]; + x ^= (x >> 16); + x ^= (x >> 8); + return Partab[x]; } static inline int chainback_viterbi(unsigned char* data, @@ -46,135 +46,143 @@ static inline int chainback_viterbi(unsigned char* data, unsigned int tailsize, unsigned char* decisions) { - unsigned char* d; - int d_ADDSHIFT = 0; - int d_numstates = (1 << 6); - int d_decision_t_size = d_numstates/8; - unsigned int d_k = 7; - int d_framebits = nbits; - /* ADDSHIFT and SUBSHIFT make sure that the thing returned is a byte. */ - d = decisions; - /* Make room beyond the end of the encoder register so we can - * accumulate a full byte of decoded data - */ - - endstate = (endstate%d_numstates) << d_ADDSHIFT; - - /* The store into data[] only needs to be done every 8 bits. 
- * But this avoids a conditional branch, and the writes will - * combine in the cache anyway - */ - - d += tailsize * d_decision_t_size ; /* Look past tail */ - int retval; - int dif = tailsize - (d_k - 1); - //printf("break, %d, %d\n", dif, (nbits+dif)%d_framebits); - p_decision_t dec; - while(nbits-- > d_framebits - (d_k - 1)) { - int k; - dec.t = &d[nbits * d_decision_t_size]; - k = (dec.w[(endstate>>d_ADDSHIFT)/32] >> ((endstate>>d_ADDSHIFT)%32)) & 1; - - endstate = (endstate >> 1) | (k << (d_k-2+d_ADDSHIFT)); - //data[((nbits+dif)%nbits)>>3] = endstate>>d_SUBSHIFT; - //printf("%d, %d\n", k, (nbits+dif)%d_framebits); - data[((nbits+dif)%d_framebits)] = k; - - retval = endstate; - } - nbits += 1; - - while(nbits-- != 0) { - int k; - - dec.t = &d[nbits * d_decision_t_size]; - - k = (dec.w[(endstate>>d_ADDSHIFT)/32] >> ((endstate>>d_ADDSHIFT)%32)) & 1; - - endstate = (endstate >> 1) | (k << (d_k-2+d_ADDSHIFT)); - data[((nbits+dif)%d_framebits)] = k; - } - //printf("%d, %d, %d, %d, %d, %d, %d, %d\n", data[4095],data[4094],data[4093],data[4092],data[4091],data[4090],data[4089],data[4088]); - - - return retval >> d_ADDSHIFT; + unsigned char* d; + int d_ADDSHIFT = 0; + int d_numstates = (1 << 6); + int d_decision_t_size = d_numstates / 8; + unsigned int d_k = 7; + int d_framebits = nbits; + /* ADDSHIFT and SUBSHIFT make sure that the thing returned is a byte. */ + d = decisions; + /* Make room beyond the end of the encoder register so we can + * accumulate a full byte of decoded data + */ + + endstate = (endstate % d_numstates) << d_ADDSHIFT; + + /* The store into data[] only needs to be done every 8 bits. + * But this avoids a conditional branch, and the writes will + * combine in the cache anyway + */ + + d += tailsize * d_decision_t_size; /* Look past tail */ + int retval; + int dif = tailsize - (d_k - 1); + // printf("break, %d, %d\n", dif, (nbits+dif)%d_framebits); + p_decision_t dec; + while (nbits-- > d_framebits - (d_k - 1)) { + int k; + dec.t = &d[nbits * d_decision_t_size]; + k = (dec.w[(endstate >> d_ADDSHIFT) / 32] >> ((endstate >> d_ADDSHIFT) % 32)) & 1; + + endstate = (endstate >> 1) | (k << (d_k - 2 + d_ADDSHIFT)); + // data[((nbits+dif)%nbits)>>3] = endstate>>d_SUBSHIFT; + // printf("%d, %d\n", k, (nbits+dif)%d_framebits); + data[((nbits + dif) % d_framebits)] = k; + + retval = endstate; + } + nbits += 1; + + while (nbits-- != 0) { + int k; + + dec.t = &d[nbits * d_decision_t_size]; + + k = (dec.w[(endstate >> d_ADDSHIFT) / 32] >> ((endstate >> d_ADDSHIFT) % 32)) & 1; + + endstate = (endstate >> 1) | (k << (d_k - 2 + d_ADDSHIFT)); + data[((nbits + dif) % d_framebits)] = k; + } + // printf("%d, %d, %d, %d, %d, %d, %d, %d\n", + // data[4095],data[4094],data[4093],data[4092],data[4091],data[4090],data[4089],data[4088]); + + + return retval >> d_ADDSHIFT; } #if LV_HAVE_SSE3 -#include #include -#include #include +#include #include +#include -static inline void volk_8u_conv_k7_r2puppet_8u_spiral(unsigned char* syms, unsigned char* dec, unsigned int framebits) { - - - static int once = 1; - int d_numstates = (1 << 6); - int rate = 2; - static unsigned char* D; - static unsigned char* Y; - static unsigned char* X; - static unsigned int excess = 6; - static unsigned char* Branchtab; - static unsigned char Partab[256]; - - int d_polys[2] = {79, 109}; - - - if(once) { - - X = (unsigned char*)volk_malloc(2*d_numstates, volk_get_alignment()); - Y = X + d_numstates; - Branchtab = (unsigned char*)volk_malloc(d_numstates/2*rate, volk_get_alignment()); - D = (unsigned 
char*)volk_malloc((d_numstates/8) * (framebits + 6), volk_get_alignment()); - int state, i; - int cnt,ti; - - /* Initialize parity lookup table */ - for(i=0;i<256;i++){ - cnt = 0; - ti = i; - while(ti){ - if(ti & 1) - cnt++; - ti >>= 1; - } - Partab[i] = cnt & 1; - } - /* Initialize the branch table */ - for(state=0;state < d_numstates/2;state++){ - for(i=0; i>= 1; + } + Partab[i] = cnt & 1; + } + /* Initialize the branch table */ + for (state = 0; state < d_numstates / 2; state++) { + for (i = 0; i < rate; i++) { + Branchtab[i * d_numstates / 2 + state] = + parity((2 * state) & d_polys[i], Partab) ? 255 : 0; + } + } + + once = 0; + } + + // unbias the old_metrics + memset(X, 31, d_numstates); - // initialize decisions - memset(D, 0, (d_numstates/8) * (framebits + 6)); + // initialize decisions + memset(D, 0, (d_numstates / 8) * (framebits + 6)); - volk_8u_x4_conv_k7_r2_8u_spiral(Y, X, syms, D, framebits/2 - excess, excess, Branchtab); + volk_8u_x4_conv_k7_r2_8u_spiral( + Y, X, syms, D, framebits / 2 - excess, excess, Branchtab); - unsigned int min = X[0]; - int i = 0, state = 0; - for(i = 0; i < (d_numstates); ++i) { - if(X[i] < min) { - min = X[i]; - state = i; + unsigned int min = X[0]; + int i = 0, state = 0; + for (i = 0; i < (d_numstates); ++i) { + if (X[i] < min) { + min = X[i]; + state = i; + } } - } - chainback_viterbi(dec, framebits/2 -excess, state, excess, D); + chainback_viterbi(dec, framebits / 2 - excess, state, excess, D); - return; + return; } #endif /*LV_HAVE_SSE3*/ @@ -185,151 +193,161 @@ static inline void volk_8u_conv_k7_r2puppet_8u_spiral(unsigned char* syms, unsig #include #include -static inline void volk_8u_conv_k7_r2puppet_8u_avx2(unsigned char* syms, unsigned char* dec, unsigned int framebits) { - - - static int once = 1; - int d_numstates = (1 << 6); - int rate = 2; - static unsigned char* D; - static unsigned char* Y; - static unsigned char* X; - static unsigned int excess = 6; - static unsigned char* Branchtab; - static unsigned char Partab[256]; - - int d_polys[2] = {79, 109}; - - - if(once) { - - X = (unsigned char*)volk_malloc(2*d_numstates, volk_get_alignment()); - Y = X + d_numstates; - Branchtab = (unsigned char*)volk_malloc(d_numstates/2*rate, volk_get_alignment()); - D = (unsigned char*)volk_malloc((d_numstates/8) * (framebits + 6), volk_get_alignment()); - int state, i; - int cnt,ti; - - /* Initialize parity lookup table */ - for(i=0;i<256;i++){ - cnt = 0; - ti = i; - while(ti){ - if(ti & 1) - cnt++; - ti >>= 1; - } - Partab[i] = cnt & 1; - } - /* Initialize the branch table */ - for(state=0;state < d_numstates/2;state++){ - for(i=0; i>= 1; + } + Partab[i] = cnt & 1; + } + /* Initialize the branch table */ + for (state = 0; state < d_numstates / 2; state++) { + for (i = 0; i < rate; i++) { + Branchtab[i * d_numstates / 2 + state] = + parity((2 * state) & d_polys[i], Partab) ? 
255 : 0; + } + } + + once = 0; + } + + // unbias the old_metrics + memset(X, 31, d_numstates); - // initialize decisions - memset(D, 0, (d_numstates/8) * (framebits + 6)); + // initialize decisions + memset(D, 0, (d_numstates / 8) * (framebits + 6)); - volk_8u_x4_conv_k7_r2_8u_avx2(Y, X, syms, D, framebits/2 - excess, excess, Branchtab); + volk_8u_x4_conv_k7_r2_8u_avx2( + Y, X, syms, D, framebits / 2 - excess, excess, Branchtab); - unsigned int min = X[0]; - int i = 0, state = 0; - for(i = 0; i < (d_numstates); ++i) { - if(X[i] < min) { - min = X[i]; - state = i; + unsigned int min = X[0]; + int i = 0, state = 0; + for (i = 0; i < (d_numstates); ++i) { + if (X[i] < min) { + min = X[i]; + state = i; + } } - } - chainback_viterbi(dec, framebits/2 -excess, state, excess, D); + chainback_viterbi(dec, framebits / 2 - excess, state, excess, D); - return; + return; } #endif /*LV_HAVE_AVX2*/ - #if LV_HAVE_GENERIC -static inline void volk_8u_conv_k7_r2puppet_8u_generic(unsigned char* syms, unsigned char* dec, unsigned int framebits) { - - - - static int once = 1; - int d_numstates = (1 << 6); - int rate = 2; - static unsigned char* Y; - static unsigned char* X; - static unsigned char* D; - static unsigned int excess = 6; - static unsigned char* Branchtab; - static unsigned char Partab[256]; - - int d_polys[2] = {79, 109}; - - - if(once) { - - X = (unsigned char*)volk_malloc(2*d_numstates, volk_get_alignment()); - Y = X + d_numstates; - Branchtab = (unsigned char*)volk_malloc(d_numstates/2*rate, volk_get_alignment()); - D = (unsigned char*)volk_malloc((d_numstates/8) * (framebits + 6), volk_get_alignment()); +static inline void volk_8u_conv_k7_r2puppet_8u_generic(unsigned char* syms, + unsigned char* dec, + unsigned int framebits) +{ - int state, i; - int cnt,ti; - /* Initialize parity lookup table */ - for(i=0;i<256;i++){ - cnt = 0; - ti = i; - while(ti){ - if(ti & 1) - cnt++; - ti >>= 1; - } - Partab[i] = cnt & 1; + static int once = 1; + int d_numstates = (1 << 6); + int rate = 2; + static unsigned char* Y; + static unsigned char* X; + static unsigned char* D; + static unsigned int excess = 6; + static unsigned char* Branchtab; + static unsigned char Partab[256]; + + int d_polys[2] = { 79, 109 }; + + + if (once) { + + X = (unsigned char*)volk_malloc(2 * d_numstates, volk_get_alignment()); + Y = X + d_numstates; + Branchtab = + (unsigned char*)volk_malloc(d_numstates / 2 * rate, volk_get_alignment()); + D = (unsigned char*)volk_malloc((d_numstates / 8) * (framebits + 6), + volk_get_alignment()); + + int state, i; + int cnt, ti; + + /* Initialize parity lookup table */ + for (i = 0; i < 256; i++) { + cnt = 0; + ti = i; + while (ti) { + if (ti & 1) + cnt++; + ti >>= 1; + } + Partab[i] = cnt & 1; + } + /* Initialize the branch table */ + for (state = 0; state < d_numstates / 2; state++) { + for (i = 0; i < rate; i++) { + Branchtab[i * d_numstates / 2 + state] = + parity((2 * state) & d_polys[i], Partab) ? 
255 : 0; + } + } + + once = 0; } - /* Initialize the branch table */ - for(state=0;state < d_numstates/2;state++){ - for(i=0; i -static inline unsigned int -log2_of_power_of_2(unsigned int val){ - // algorithm from: http://graphics.stanford.edu/~seander/bithacks.html#IntegerLog - static const unsigned int b[] = {0xAAAAAAAA, 0xCCCCCCCC, 0xF0F0F0F0, - 0xFF00FF00, 0xFFFF0000}; - - unsigned int res = (val & b[0]) != 0; - res |= ((val & b[4]) != 0) << 4; - res |= ((val & b[3]) != 0) << 3; - res |= ((val & b[2]) != 0) << 2; - res |= ((val & b[1]) != 0) << 1; - return res; +static inline unsigned int log2_of_power_of_2(unsigned int val) +{ + // algorithm from: http://graphics.stanford.edu/~seander/bithacks.html#IntegerLog + static const unsigned int b[] = { + 0xAAAAAAAA, 0xCCCCCCCC, 0xF0F0F0F0, 0xFF00FF00, 0xFFFF0000 + }; + + unsigned int res = (val & b[0]) != 0; + res |= ((val & b[4]) != 0) << 4; + res |= ((val & b[3]) != 0) << 3; + res |= ((val & b[2]) != 0) << 2; + res |= ((val & b[1]) != 0) << 1; + return res; } -static inline void -encodepolar_single_stage(unsigned char* frame_ptr, const unsigned char* temp_ptr, - const unsigned int num_branches, const unsigned int frame_half) +static inline void encodepolar_single_stage(unsigned char* frame_ptr, + const unsigned char* temp_ptr, + const unsigned int num_branches, + const unsigned int frame_half) { - unsigned int branch, bit; - for(branch = 0; branch < num_branches; ++branch){ - for(bit = 0; bit < frame_half; ++bit){ - *frame_ptr = *temp_ptr ^ *(temp_ptr + 1); - *(frame_ptr + frame_half) = *(temp_ptr + 1); - ++frame_ptr; - temp_ptr += 2; + unsigned int branch, bit; + for (branch = 0; branch < num_branches; ++branch) { + for (bit = 0; bit < frame_half; ++bit) { + *frame_ptr = *temp_ptr ^ *(temp_ptr + 1); + *(frame_ptr + frame_half) = *(temp_ptr + 1); + ++frame_ptr; + temp_ptr += 2; + } + frame_ptr += frame_half; } - frame_ptr += frame_half; - } } #ifdef LV_HAVE_GENERIC -static inline void -volk_8u_x2_encodeframepolar_8u_generic(unsigned char* frame, unsigned char* temp, - unsigned int frame_size) +static inline void volk_8u_x2_encodeframepolar_8u_generic(unsigned char* frame, + unsigned char* temp, + unsigned int frame_size) { - unsigned int stage = log2_of_power_of_2(frame_size); - unsigned int frame_half = frame_size >> 1; - unsigned int num_branches = 1; - - while(stage){ - // encode stage - encodepolar_single_stage(frame, temp, num_branches, frame_half); - memcpy(temp, frame, sizeof(unsigned char) * frame_size); - - // update all the parameters. - num_branches = num_branches << 1; - frame_half = frame_half >> 1; - --stage; - } + unsigned int stage = log2_of_power_of_2(frame_size); + unsigned int frame_half = frame_size >> 1; + unsigned int num_branches = 1; + + while (stage) { + // encode stage + encodepolar_single_stage(frame, temp, num_branches, frame_half); + memcpy(temp, frame, sizeof(unsigned char) * frame_size); + + // update all the parameters. 
+ num_branches = num_branches << 1; + frame_half = frame_half >> 1; + --stage; + } } #endif /* LV_HAVE_GENERIC */ #ifdef LV_HAVE_SSSE3 #include -static inline void -volk_8u_x2_encodeframepolar_8u_u_ssse3(unsigned char* frame, unsigned char* temp, - unsigned int frame_size) +static inline void volk_8u_x2_encodeframepolar_8u_u_ssse3(unsigned char* frame, + unsigned char* temp, + unsigned int frame_size) { - const unsigned int po2 = log2_of_power_of_2(frame_size); - - unsigned int stage = po2; - unsigned char* frame_ptr = frame; - unsigned char* temp_ptr = temp; - - unsigned int frame_half = frame_size >> 1; - unsigned int num_branches = 1; - unsigned int branch; - unsigned int bit; - - // prepare constants - const __m128i mask_stage1 = _mm_set_epi8(0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF); - - // get some SIMD registers to play with. - __m128i r_frame0, r_temp0, shifted; - - { - __m128i r_frame1, r_temp1; - const __m128i shuffle_separate = _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15); - - while(stage > 4){ - frame_ptr = frame; - temp_ptr = temp; - - // for stage = 5 a branch has 32 elements. So upper stages are even bigger. - for(branch = 0; branch < num_branches; ++branch){ - for(bit = 0; bit < frame_half; bit += 16){ - r_temp0 = _mm_loadu_si128((__m128i *) temp_ptr); - temp_ptr += 16; - r_temp1 = _mm_loadu_si128((__m128i *) temp_ptr); - temp_ptr += 16; - - shifted = _mm_srli_si128(r_temp0, 1); - shifted = _mm_and_si128(shifted, mask_stage1); - r_temp0 = _mm_xor_si128(shifted, r_temp0); - r_temp0 = _mm_shuffle_epi8(r_temp0, shuffle_separate); - - shifted = _mm_srli_si128(r_temp1, 1); - shifted = _mm_and_si128(shifted, mask_stage1); - r_temp1 = _mm_xor_si128(shifted, r_temp1); - r_temp1 = _mm_shuffle_epi8(r_temp1, shuffle_separate); - - r_frame0 = _mm_unpacklo_epi64(r_temp0, r_temp1); - _mm_storeu_si128((__m128i*) frame_ptr, r_frame0); - - r_frame1 = _mm_unpackhi_epi64(r_temp0, r_temp1); - _mm_storeu_si128((__m128i*) (frame_ptr + frame_half), r_frame1); - frame_ptr += 16; + const unsigned int po2 = log2_of_power_of_2(frame_size); + + unsigned int stage = po2; + unsigned char* frame_ptr = frame; + unsigned char* temp_ptr = temp; + + unsigned int frame_half = frame_size >> 1; + unsigned int num_branches = 1; + unsigned int branch; + unsigned int bit; + + // prepare constants + const __m128i mask_stage1 = _mm_set_epi8(0x0, + 0xFF, + 0x0, + 0xFF, + 0x0, + 0xFF, + 0x0, + 0xFF, + 0x0, + 0xFF, + 0x0, + 0xFF, + 0x0, + 0xFF, + 0x0, + 0xFF); + + // get some SIMD registers to play with. + __m128i r_frame0, r_temp0, shifted; + + { + __m128i r_frame1, r_temp1; + const __m128i shuffle_separate = + _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15); + + while (stage > 4) { + frame_ptr = frame; + temp_ptr = temp; + + // for stage = 5 a branch has 32 elements. So upper stages are even bigger. 
+ for (branch = 0; branch < num_branches; ++branch) { + for (bit = 0; bit < frame_half; bit += 16) { + r_temp0 = _mm_loadu_si128((__m128i*)temp_ptr); + temp_ptr += 16; + r_temp1 = _mm_loadu_si128((__m128i*)temp_ptr); + temp_ptr += 16; + + shifted = _mm_srli_si128(r_temp0, 1); + shifted = _mm_and_si128(shifted, mask_stage1); + r_temp0 = _mm_xor_si128(shifted, r_temp0); + r_temp0 = _mm_shuffle_epi8(r_temp0, shuffle_separate); + + shifted = _mm_srli_si128(r_temp1, 1); + shifted = _mm_and_si128(shifted, mask_stage1); + r_temp1 = _mm_xor_si128(shifted, r_temp1); + r_temp1 = _mm_shuffle_epi8(r_temp1, shuffle_separate); + + r_frame0 = _mm_unpacklo_epi64(r_temp0, r_temp1); + _mm_storeu_si128((__m128i*)frame_ptr, r_frame0); + + r_frame1 = _mm_unpackhi_epi64(r_temp0, r_temp1); + _mm_storeu_si128((__m128i*)(frame_ptr + frame_half), r_frame1); + frame_ptr += 16; + } + + frame_ptr += frame_half; + } + memcpy(temp, frame, sizeof(unsigned char) * frame_size); + + num_branches = num_branches << 1; + frame_half = frame_half >> 1; + stage--; } - - frame_ptr += frame_half; - } - memcpy(temp, frame, sizeof(unsigned char) * frame_size); - - num_branches = num_branches << 1; - frame_half = frame_half >> 1; - stage--; } - } - // This last part requires at least 16-bit frames. - // Smaller frames are useless for SIMD optimization anyways. Just choose GENERIC! + // This last part requires at least 16-bit frames. + // Smaller frames are useless for SIMD optimization anyways. Just choose GENERIC! - // reset pointers to correct positions. - frame_ptr = frame; - temp_ptr = temp; + // reset pointers to correct positions. + frame_ptr = frame; + temp_ptr = temp; - // prefetch first chunk - __VOLK_PREFETCH(temp_ptr); - - const __m128i shuffle_stage4 = _mm_setr_epi8(0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15); - const __m128i mask_stage4 = _mm_set_epi8(0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF); - const __m128i mask_stage3 = _mm_set_epi8(0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF); - const __m128i mask_stage2 = _mm_set_epi8(0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF); - - for(branch = 0; branch < num_branches; ++branch){ - r_temp0 = _mm_loadu_si128((__m128i*) temp_ptr); - - // prefetch next chunk - temp_ptr += 16; + // prefetch first chunk __VOLK_PREFETCH(temp_ptr); - // shuffle once for bit-reversal. - r_temp0 = _mm_shuffle_epi8(r_temp0, shuffle_stage4); - - shifted = _mm_srli_si128(r_temp0, 8); - shifted = _mm_and_si128(shifted, mask_stage4); - r_frame0 = _mm_xor_si128(shifted, r_temp0); - - shifted = _mm_srli_si128(r_frame0, 4); - shifted = _mm_and_si128(shifted, mask_stage3); - r_frame0 = _mm_xor_si128(shifted, r_frame0); - - shifted = _mm_srli_si128(r_frame0, 2); - shifted = _mm_and_si128(shifted, mask_stage2); - r_frame0 = _mm_xor_si128(shifted, r_frame0); - - shifted = _mm_srli_si128(r_frame0, 1); - shifted = _mm_and_si128(shifted, mask_stage1); - r_frame0 = _mm_xor_si128(shifted, r_frame0); - - // store result of chunk. 
- _mm_storeu_si128((__m128i*)frame_ptr, r_frame0); - frame_ptr += 16; - } + const __m128i shuffle_stage4 = + _mm_setr_epi8(0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15); + const __m128i mask_stage4 = _mm_set_epi8(0x0, + 0x0, + 0x0, + 0x0, + 0x0, + 0x0, + 0x0, + 0x0, + 0xFF, + 0xFF, + 0xFF, + 0xFF, + 0xFF, + 0xFF, + 0xFF, + 0xFF); + const __m128i mask_stage3 = _mm_set_epi8(0x0, + 0x0, + 0x0, + 0x0, + 0xFF, + 0xFF, + 0xFF, + 0xFF, + 0x0, + 0x0, + 0x0, + 0x0, + 0xFF, + 0xFF, + 0xFF, + 0xFF); + const __m128i mask_stage2 = _mm_set_epi8(0x0, + 0x0, + 0xFF, + 0xFF, + 0x0, + 0x0, + 0xFF, + 0xFF, + 0x0, + 0x0, + 0xFF, + 0xFF, + 0x0, + 0x0, + 0xFF, + 0xFF); + + for (branch = 0; branch < num_branches; ++branch) { + r_temp0 = _mm_loadu_si128((__m128i*)temp_ptr); + + // prefetch next chunk + temp_ptr += 16; + __VOLK_PREFETCH(temp_ptr); + + // shuffle once for bit-reversal. + r_temp0 = _mm_shuffle_epi8(r_temp0, shuffle_stage4); + + shifted = _mm_srli_si128(r_temp0, 8); + shifted = _mm_and_si128(shifted, mask_stage4); + r_frame0 = _mm_xor_si128(shifted, r_temp0); + + shifted = _mm_srli_si128(r_frame0, 4); + shifted = _mm_and_si128(shifted, mask_stage3); + r_frame0 = _mm_xor_si128(shifted, r_frame0); + + shifted = _mm_srli_si128(r_frame0, 2); + shifted = _mm_and_si128(shifted, mask_stage2); + r_frame0 = _mm_xor_si128(shifted, r_frame0); + + shifted = _mm_srli_si128(r_frame0, 1); + shifted = _mm_and_si128(shifted, mask_stage1); + r_frame0 = _mm_xor_si128(shifted, r_frame0); + + // store result of chunk. + _mm_storeu_si128((__m128i*)frame_ptr, r_frame0); + frame_ptr += 16; + } } #endif /* LV_HAVE_SSSE3 */ @@ -201,154 +265,351 @@ volk_8u_x2_encodeframepolar_8u_u_ssse3(unsigned char* frame, unsigned char* temp #ifdef LV_HAVE_AVX2 #include -static inline void -volk_8u_x2_encodeframepolar_8u_u_avx2(unsigned char* frame, unsigned char* temp, - unsigned int frame_size) +static inline void volk_8u_x2_encodeframepolar_8u_u_avx2(unsigned char* frame, + unsigned char* temp, + unsigned int frame_size) { - const unsigned int po2 = log2_of_power_of_2(frame_size); - - unsigned int stage = po2; - unsigned char* frame_ptr = frame; - unsigned char* temp_ptr = temp; - - unsigned int frame_half = frame_size >> 1; - unsigned int num_branches = 1; - unsigned int branch; - unsigned int bit; - - // prepare constants - const __m256i mask_stage1 = _mm256_set_epi8(0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, - 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF); - - const __m128i mask_stage0 = _mm_set_epi8(0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF); - // get some SIMD registers to play with. - __m256i r_frame0, r_temp0, shifted; - __m128i r_temp2, r_frame2, shifted2; - { - __m256i r_frame1, r_temp1; - __m128i r_frame3, r_temp3; - const __m256i shuffle_separate = _mm256_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15, - 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15); - const __m128i shuffle_separate128 = _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15); - - while(stage > 4){ - frame_ptr = frame; - temp_ptr = temp; - - // for stage = 5 a branch has 32 elements. So upper stages are even bigger. 
- for(branch = 0; branch < num_branches; ++branch){ - for(bit = 0; bit < frame_half; bit += 32){ - if ((frame_half-bit)<32) //if only 16 bits remaining in frame, not 32 - { - r_temp2 = _mm_loadu_si128((__m128i *) temp_ptr); - temp_ptr += 16; - r_temp3 = _mm_loadu_si128((__m128i *) temp_ptr); - temp_ptr += 16; - - shifted2 = _mm_srli_si128(r_temp2, 1); - shifted2 = _mm_and_si128(shifted2, mask_stage0); - r_temp2 = _mm_xor_si128(shifted2, r_temp2); - r_temp2 = _mm_shuffle_epi8(r_temp2, shuffle_separate128); - - shifted2 = _mm_srli_si128(r_temp3, 1); - shifted2 = _mm_and_si128(shifted2, mask_stage0); - r_temp3 = _mm_xor_si128(shifted2, r_temp3); - r_temp3 = _mm_shuffle_epi8(r_temp3, shuffle_separate128); - - r_frame2 = _mm_unpacklo_epi64(r_temp2, r_temp3); - _mm_storeu_si128((__m128i*) frame_ptr, r_frame2); - - r_frame3 = _mm_unpackhi_epi64(r_temp2, r_temp3); - _mm_storeu_si128((__m128i*) (frame_ptr + frame_half), r_frame3); - frame_ptr += 16; - break; - } - r_temp0 = _mm256_loadu_si256((__m256i *) temp_ptr); - temp_ptr += 32; - r_temp1 = _mm256_loadu_si256((__m256i *) temp_ptr); - temp_ptr += 32; - - shifted = _mm256_srli_si256(r_temp0, 1);//operate on 128 bit lanes - shifted = _mm256_and_si256(shifted, mask_stage1); - r_temp0 = _mm256_xor_si256(shifted, r_temp0); - r_temp0 = _mm256_shuffle_epi8(r_temp0, shuffle_separate); - - shifted = _mm256_srli_si256(r_temp1, 1); - shifted = _mm256_and_si256(shifted, mask_stage1); - r_temp1 = _mm256_xor_si256(shifted, r_temp1); - r_temp1 = _mm256_shuffle_epi8(r_temp1, shuffle_separate); - - r_frame0 = _mm256_unpacklo_epi64(r_temp0, r_temp1); - r_temp1 = _mm256_unpackhi_epi64(r_temp0, r_temp1); - r_frame0 = _mm256_permute4x64_epi64(r_frame0, 0xd8); - r_frame1 = _mm256_permute4x64_epi64(r_temp1, 0xd8); - - _mm256_storeu_si256((__m256i*) frame_ptr, r_frame0); - - _mm256_storeu_si256((__m256i*) (frame_ptr + frame_half), r_frame1); - frame_ptr += 32; + const unsigned int po2 = log2_of_power_of_2(frame_size); + + unsigned int stage = po2; + unsigned char* frame_ptr = frame; + unsigned char* temp_ptr = temp; + + unsigned int frame_half = frame_size >> 1; + unsigned int num_branches = 1; + unsigned int branch; + unsigned int bit; + + // prepare constants + const __m256i mask_stage1 = _mm256_set_epi8(0x0, + 0xFF, + 0x0, + 0xFF, + 0x0, + 0xFF, + 0x0, + 0xFF, + 0x0, + 0xFF, + 0x0, + 0xFF, + 0x0, + 0xFF, + 0x0, + 0xFF, + 0x0, + 0xFF, + 0x0, + 0xFF, + 0x0, + 0xFF, + 0x0, + 0xFF, + 0x0, + 0xFF, + 0x0, + 0xFF, + 0x0, + 0xFF, + 0x0, + 0xFF); + + const __m128i mask_stage0 = _mm_set_epi8(0x0, + 0xFF, + 0x0, + 0xFF, + 0x0, + 0xFF, + 0x0, + 0xFF, + 0x0, + 0xFF, + 0x0, + 0xFF, + 0x0, + 0xFF, + 0x0, + 0xFF); + // get some SIMD registers to play with. + __m256i r_frame0, r_temp0, shifted; + __m128i r_temp2, r_frame2, shifted2; + { + __m256i r_frame1, r_temp1; + __m128i r_frame3, r_temp3; + const __m256i shuffle_separate = _mm256_setr_epi8(0, + 2, + 4, + 6, + 8, + 10, + 12, + 14, + 1, + 3, + 5, + 7, + 9, + 11, + 13, + 15, + 0, + 2, + 4, + 6, + 8, + 10, + 12, + 14, + 1, + 3, + 5, + 7, + 9, + 11, + 13, + 15); + const __m128i shuffle_separate128 = + _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15); + + while (stage > 4) { + frame_ptr = frame; + temp_ptr = temp; + + // for stage = 5 a branch has 32 elements. So upper stages are even bigger. 
+ for (branch = 0; branch < num_branches; ++branch) { + for (bit = 0; bit < frame_half; bit += 32) { + if ((frame_half - bit) < + 32) // if only 16 bits remaining in frame, not 32 + { + r_temp2 = _mm_loadu_si128((__m128i*)temp_ptr); + temp_ptr += 16; + r_temp3 = _mm_loadu_si128((__m128i*)temp_ptr); + temp_ptr += 16; + + shifted2 = _mm_srli_si128(r_temp2, 1); + shifted2 = _mm_and_si128(shifted2, mask_stage0); + r_temp2 = _mm_xor_si128(shifted2, r_temp2); + r_temp2 = _mm_shuffle_epi8(r_temp2, shuffle_separate128); + + shifted2 = _mm_srli_si128(r_temp3, 1); + shifted2 = _mm_and_si128(shifted2, mask_stage0); + r_temp3 = _mm_xor_si128(shifted2, r_temp3); + r_temp3 = _mm_shuffle_epi8(r_temp3, shuffle_separate128); + + r_frame2 = _mm_unpacklo_epi64(r_temp2, r_temp3); + _mm_storeu_si128((__m128i*)frame_ptr, r_frame2); + + r_frame3 = _mm_unpackhi_epi64(r_temp2, r_temp3); + _mm_storeu_si128((__m128i*)(frame_ptr + frame_half), r_frame3); + frame_ptr += 16; + break; + } + r_temp0 = _mm256_loadu_si256((__m256i*)temp_ptr); + temp_ptr += 32; + r_temp1 = _mm256_loadu_si256((__m256i*)temp_ptr); + temp_ptr += 32; + + shifted = _mm256_srli_si256(r_temp0, 1); // operate on 128 bit lanes + shifted = _mm256_and_si256(shifted, mask_stage1); + r_temp0 = _mm256_xor_si256(shifted, r_temp0); + r_temp0 = _mm256_shuffle_epi8(r_temp0, shuffle_separate); + + shifted = _mm256_srli_si256(r_temp1, 1); + shifted = _mm256_and_si256(shifted, mask_stage1); + r_temp1 = _mm256_xor_si256(shifted, r_temp1); + r_temp1 = _mm256_shuffle_epi8(r_temp1, shuffle_separate); + + r_frame0 = _mm256_unpacklo_epi64(r_temp0, r_temp1); + r_temp1 = _mm256_unpackhi_epi64(r_temp0, r_temp1); + r_frame0 = _mm256_permute4x64_epi64(r_frame0, 0xd8); + r_frame1 = _mm256_permute4x64_epi64(r_temp1, 0xd8); + + _mm256_storeu_si256((__m256i*)frame_ptr, r_frame0); + + _mm256_storeu_si256((__m256i*)(frame_ptr + frame_half), r_frame1); + frame_ptr += 32; + } + + frame_ptr += frame_half; + } + memcpy(temp, frame, sizeof(unsigned char) * frame_size); + + num_branches = num_branches << 1; + frame_half = frame_half >> 1; + stage--; } - - frame_ptr += frame_half; - } - memcpy(temp, frame, sizeof(unsigned char) * frame_size); - - num_branches = num_branches << 1; - frame_half = frame_half >> 1; - stage--; } - } - - // This last part requires at least 32-bit frames. - // Smaller frames are useless for SIMD optimization anyways. Just choose GENERIC! - - // reset pointers to correct positions. - frame_ptr = frame; - temp_ptr = temp; - // prefetch first chunk - __VOLK_PREFETCH(temp_ptr); + // This last part requires at least 32-bit frames. + // Smaller frames are useless for SIMD optimization anyways. Just choose GENERIC! 
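/* A note on the final loop below (illustrative sketch, not part of the patched
 * kernel): the remaining four stages are the plain polar butterflies
 * frame[i] ^= frame[i + d] for distances d = 8, 4, 2, 1, carried out entirely
 * in registers after the bit-reversal shuffle; the mask_stage* constants select
 * which bytes receive the XOR. A minimal scalar sketch of one such stage, with
 * a hypothetical helper name and the same in-place frame layout, would be:
 */
static inline void encodepolar_butterfly_stage_sketch(unsigned char* frame,
                                                      unsigned int frame_size,
                                                      unsigned int dist)
{
    /* dist is the butterfly distance: 8, 4, 2 and 1 for the last four stages */
    unsigned int base, i;
    for (base = 0; base < frame_size; base += 2 * dist) {
        for (i = 0; i < dist; ++i) {
            frame[base + i] ^= frame[base + i + dist];
        }
    }
}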
- const __m256i shuffle_stage4 = _mm256_setr_epi8(0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15, - 0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15); - const __m256i mask_stage4 = _mm256_set_epi8(0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, - 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF); - const __m256i mask_stage3 = _mm256_set_epi8(0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF, - 0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF); - const __m256i mask_stage2 = _mm256_set_epi8(0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF, - 0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF); + // reset pointers to correct positions. + frame_ptr = frame; + temp_ptr = temp; - for(branch = 0; branch < num_branches/2; ++branch){ - r_temp0 = _mm256_loadu_si256((__m256i*) temp_ptr); - - // prefetch next chunk - temp_ptr += 32; + // prefetch first chunk __VOLK_PREFETCH(temp_ptr); - // shuffle once for bit-reversal. - r_temp0 = _mm256_shuffle_epi8(r_temp0, shuffle_stage4); - - shifted = _mm256_srli_si256(r_temp0, 8); //128 bit lanes - shifted = _mm256_and_si256(shifted, mask_stage4); - r_frame0 = _mm256_xor_si256(shifted, r_temp0); - - - shifted = _mm256_srli_si256(r_frame0, 4); - shifted = _mm256_and_si256(shifted, mask_stage3); - r_frame0 = _mm256_xor_si256(shifted, r_frame0); - - shifted = _mm256_srli_si256(r_frame0, 2); - shifted = _mm256_and_si256(shifted, mask_stage2); - r_frame0 = _mm256_xor_si256(shifted, r_frame0); - - shifted = _mm256_srli_si256(r_frame0, 1); - shifted = _mm256_and_si256(shifted, mask_stage1); - r_frame0 = _mm256_xor_si256(shifted, r_frame0); - - // store result of chunk. - _mm256_storeu_si256((__m256i*)frame_ptr, r_frame0); - frame_ptr += 32; - } + const __m256i shuffle_stage4 = _mm256_setr_epi8(0, + 8, + 4, + 12, + 2, + 10, + 6, + 14, + 1, + 9, + 5, + 13, + 3, + 11, + 7, + 15, + 0, + 8, + 4, + 12, + 2, + 10, + 6, + 14, + 1, + 9, + 5, + 13, + 3, + 11, + 7, + 15); + const __m256i mask_stage4 = _mm256_set_epi8(0x0, + 0x0, + 0x0, + 0x0, + 0x0, + 0x0, + 0x0, + 0x0, + 0xFF, + 0xFF, + 0xFF, + 0xFF, + 0xFF, + 0xFF, + 0xFF, + 0xFF, + 0x0, + 0x0, + 0x0, + 0x0, + 0x0, + 0x0, + 0x0, + 0x0, + 0xFF, + 0xFF, + 0xFF, + 0xFF, + 0xFF, + 0xFF, + 0xFF, + 0xFF); + const __m256i mask_stage3 = _mm256_set_epi8(0x0, + 0x0, + 0x0, + 0x0, + 0xFF, + 0xFF, + 0xFF, + 0xFF, + 0x0, + 0x0, + 0x0, + 0x0, + 0xFF, + 0xFF, + 0xFF, + 0xFF, + 0x0, + 0x0, + 0x0, + 0x0, + 0xFF, + 0xFF, + 0xFF, + 0xFF, + 0x0, + 0x0, + 0x0, + 0x0, + 0xFF, + 0xFF, + 0xFF, + 0xFF); + const __m256i mask_stage2 = _mm256_set_epi8(0x0, + 0x0, + 0xFF, + 0xFF, + 0x0, + 0x0, + 0xFF, + 0xFF, + 0x0, + 0x0, + 0xFF, + 0xFF, + 0x0, + 0x0, + 0xFF, + 0xFF, + 0x0, + 0x0, + 0xFF, + 0xFF, + 0x0, + 0x0, + 0xFF, + 0xFF, + 0x0, + 0x0, + 0xFF, + 0xFF, + 0x0, + 0x0, + 0xFF, + 0xFF); + + for (branch = 0; branch < num_branches / 2; ++branch) { + r_temp0 = _mm256_loadu_si256((__m256i*)temp_ptr); + + // prefetch next chunk + temp_ptr += 32; + __VOLK_PREFETCH(temp_ptr); + + // shuffle once for bit-reversal. 
+ r_temp0 = _mm256_shuffle_epi8(r_temp0, shuffle_stage4); + + shifted = _mm256_srli_si256(r_temp0, 8); // 128 bit lanes + shifted = _mm256_and_si256(shifted, mask_stage4); + r_frame0 = _mm256_xor_si256(shifted, r_temp0); + + + shifted = _mm256_srli_si256(r_frame0, 4); + shifted = _mm256_and_si256(shifted, mask_stage3); + r_frame0 = _mm256_xor_si256(shifted, r_frame0); + + shifted = _mm256_srli_si256(r_frame0, 2); + shifted = _mm256_and_si256(shifted, mask_stage2); + r_frame0 = _mm256_xor_si256(shifted, r_frame0); + + shifted = _mm256_srli_si256(r_frame0, 1); + shifted = _mm256_and_si256(shifted, mask_stage1); + r_frame0 = _mm256_xor_si256(shifted, r_frame0); + + // store result of chunk. + _mm256_storeu_si256((__m256i*)frame_ptr, r_frame0); + frame_ptr += 32; + } } #endif /* LV_HAVE_AVX2 */ @@ -360,272 +621,530 @@ volk_8u_x2_encodeframepolar_8u_u_avx2(unsigned char* frame, unsigned char* temp, #ifdef LV_HAVE_SSSE3 #include -static inline void -volk_8u_x2_encodeframepolar_8u_a_ssse3(unsigned char* frame, unsigned char* temp, - unsigned int frame_size) +static inline void volk_8u_x2_encodeframepolar_8u_a_ssse3(unsigned char* frame, + unsigned char* temp, + unsigned int frame_size) { - const unsigned int po2 = log2_of_power_of_2(frame_size); - - unsigned int stage = po2; - unsigned char* frame_ptr = frame; - unsigned char* temp_ptr = temp; - - unsigned int frame_half = frame_size >> 1; - unsigned int num_branches = 1; - unsigned int branch; - unsigned int bit; - - // prepare constants - const __m128i mask_stage1 = _mm_set_epi8(0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF); - - // get some SIMD registers to play with. - __m128i r_frame0, r_temp0, shifted; - - { - __m128i r_frame1, r_temp1; - const __m128i shuffle_separate = _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15); - - while(stage > 4){ - frame_ptr = frame; - temp_ptr = temp; - - // for stage = 5 a branch has 32 elements. So upper stages are even bigger. - for(branch = 0; branch < num_branches; ++branch){ - for(bit = 0; bit < frame_half; bit += 16){ - r_temp0 = _mm_load_si128((__m128i *) temp_ptr); - temp_ptr += 16; - r_temp1 = _mm_load_si128((__m128i *) temp_ptr); - temp_ptr += 16; - - shifted = _mm_srli_si128(r_temp0, 1); - shifted = _mm_and_si128(shifted, mask_stage1); - r_temp0 = _mm_xor_si128(shifted, r_temp0); - r_temp0 = _mm_shuffle_epi8(r_temp0, shuffle_separate); - - shifted = _mm_srli_si128(r_temp1, 1); - shifted = _mm_and_si128(shifted, mask_stage1); - r_temp1 = _mm_xor_si128(shifted, r_temp1); - r_temp1 = _mm_shuffle_epi8(r_temp1, shuffle_separate); - - r_frame0 = _mm_unpacklo_epi64(r_temp0, r_temp1); - _mm_store_si128((__m128i*) frame_ptr, r_frame0); - - r_frame1 = _mm_unpackhi_epi64(r_temp0, r_temp1); - _mm_store_si128((__m128i*) (frame_ptr + frame_half), r_frame1); - frame_ptr += 16; + const unsigned int po2 = log2_of_power_of_2(frame_size); + + unsigned int stage = po2; + unsigned char* frame_ptr = frame; + unsigned char* temp_ptr = temp; + + unsigned int frame_half = frame_size >> 1; + unsigned int num_branches = 1; + unsigned int branch; + unsigned int bit; + + // prepare constants + const __m128i mask_stage1 = _mm_set_epi8(0x0, + 0xFF, + 0x0, + 0xFF, + 0x0, + 0xFF, + 0x0, + 0xFF, + 0x0, + 0xFF, + 0x0, + 0xFF, + 0x0, + 0xFF, + 0x0, + 0xFF); + + // get some SIMD registers to play with. 
+ __m128i r_frame0, r_temp0, shifted; + + { + __m128i r_frame1, r_temp1; + const __m128i shuffle_separate = + _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15); + + while (stage > 4) { + frame_ptr = frame; + temp_ptr = temp; + + // for stage = 5 a branch has 32 elements. So upper stages are even bigger. + for (branch = 0; branch < num_branches; ++branch) { + for (bit = 0; bit < frame_half; bit += 16) { + r_temp0 = _mm_load_si128((__m128i*)temp_ptr); + temp_ptr += 16; + r_temp1 = _mm_load_si128((__m128i*)temp_ptr); + temp_ptr += 16; + + shifted = _mm_srli_si128(r_temp0, 1); + shifted = _mm_and_si128(shifted, mask_stage1); + r_temp0 = _mm_xor_si128(shifted, r_temp0); + r_temp0 = _mm_shuffle_epi8(r_temp0, shuffle_separate); + + shifted = _mm_srli_si128(r_temp1, 1); + shifted = _mm_and_si128(shifted, mask_stage1); + r_temp1 = _mm_xor_si128(shifted, r_temp1); + r_temp1 = _mm_shuffle_epi8(r_temp1, shuffle_separate); + + r_frame0 = _mm_unpacklo_epi64(r_temp0, r_temp1); + _mm_store_si128((__m128i*)frame_ptr, r_frame0); + + r_frame1 = _mm_unpackhi_epi64(r_temp0, r_temp1); + _mm_store_si128((__m128i*)(frame_ptr + frame_half), r_frame1); + frame_ptr += 16; + } + + frame_ptr += frame_half; + } + memcpy(temp, frame, sizeof(unsigned char) * frame_size); + + num_branches = num_branches << 1; + frame_half = frame_half >> 1; + stage--; } - - frame_ptr += frame_half; - } - memcpy(temp, frame, sizeof(unsigned char) * frame_size); - - num_branches = num_branches << 1; - frame_half = frame_half >> 1; - stage--; } - } - - // This last part requires at least 16-bit frames. - // Smaller frames are useless for SIMD optimization anyways. Just choose GENERIC! - - // reset pointers to correct positions. - frame_ptr = frame; - temp_ptr = temp; - // prefetch first chunk - __VOLK_PREFETCH(temp_ptr); + // This last part requires at least 16-bit frames. + // Smaller frames are useless for SIMD optimization anyways. Just choose GENERIC! - const __m128i shuffle_stage4 = _mm_setr_epi8(0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15); - const __m128i mask_stage4 = _mm_set_epi8(0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF); - const __m128i mask_stage3 = _mm_set_epi8(0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF); - const __m128i mask_stage2 = _mm_set_epi8(0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF); + // reset pointers to correct positions. + frame_ptr = frame; + temp_ptr = temp; - for(branch = 0; branch < num_branches; ++branch){ - r_temp0 = _mm_load_si128((__m128i*) temp_ptr); - - // prefetch next chunk - temp_ptr += 16; + // prefetch first chunk __VOLK_PREFETCH(temp_ptr); - // shuffle once for bit-reversal. - r_temp0 = _mm_shuffle_epi8(r_temp0, shuffle_stage4); - - shifted = _mm_srli_si128(r_temp0, 8); - shifted = _mm_and_si128(shifted, mask_stage4); - r_frame0 = _mm_xor_si128(shifted, r_temp0); - - shifted = _mm_srli_si128(r_frame0, 4); - shifted = _mm_and_si128(shifted, mask_stage3); - r_frame0 = _mm_xor_si128(shifted, r_frame0); - - shifted = _mm_srli_si128(r_frame0, 2); - shifted = _mm_and_si128(shifted, mask_stage2); - r_frame0 = _mm_xor_si128(shifted, r_frame0); - - shifted = _mm_srli_si128(r_frame0, 1); - shifted = _mm_and_si128(shifted, mask_stage1); - r_frame0 = _mm_xor_si128(shifted, r_frame0); - - // store result of chunk. 
- _mm_store_si128((__m128i*)frame_ptr, r_frame0); - frame_ptr += 16; - } + const __m128i shuffle_stage4 = + _mm_setr_epi8(0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15); + const __m128i mask_stage4 = _mm_set_epi8(0x0, + 0x0, + 0x0, + 0x0, + 0x0, + 0x0, + 0x0, + 0x0, + 0xFF, + 0xFF, + 0xFF, + 0xFF, + 0xFF, + 0xFF, + 0xFF, + 0xFF); + const __m128i mask_stage3 = _mm_set_epi8(0x0, + 0x0, + 0x0, + 0x0, + 0xFF, + 0xFF, + 0xFF, + 0xFF, + 0x0, + 0x0, + 0x0, + 0x0, + 0xFF, + 0xFF, + 0xFF, + 0xFF); + const __m128i mask_stage2 = _mm_set_epi8(0x0, + 0x0, + 0xFF, + 0xFF, + 0x0, + 0x0, + 0xFF, + 0xFF, + 0x0, + 0x0, + 0xFF, + 0xFF, + 0x0, + 0x0, + 0xFF, + 0xFF); + + for (branch = 0; branch < num_branches; ++branch) { + r_temp0 = _mm_load_si128((__m128i*)temp_ptr); + + // prefetch next chunk + temp_ptr += 16; + __VOLK_PREFETCH(temp_ptr); + + // shuffle once for bit-reversal. + r_temp0 = _mm_shuffle_epi8(r_temp0, shuffle_stage4); + + shifted = _mm_srli_si128(r_temp0, 8); + shifted = _mm_and_si128(shifted, mask_stage4); + r_frame0 = _mm_xor_si128(shifted, r_temp0); + + shifted = _mm_srli_si128(r_frame0, 4); + shifted = _mm_and_si128(shifted, mask_stage3); + r_frame0 = _mm_xor_si128(shifted, r_frame0); + + shifted = _mm_srli_si128(r_frame0, 2); + shifted = _mm_and_si128(shifted, mask_stage2); + r_frame0 = _mm_xor_si128(shifted, r_frame0); + + shifted = _mm_srli_si128(r_frame0, 1); + shifted = _mm_and_si128(shifted, mask_stage1); + r_frame0 = _mm_xor_si128(shifted, r_frame0); + + // store result of chunk. + _mm_store_si128((__m128i*)frame_ptr, r_frame0); + frame_ptr += 16; + } } #endif /* LV_HAVE_SSSE3 */ #ifdef LV_HAVE_AVX2 #include -static inline void -volk_8u_x2_encodeframepolar_8u_a_avx2(unsigned char* frame, unsigned char* temp, - unsigned int frame_size) +static inline void volk_8u_x2_encodeframepolar_8u_a_avx2(unsigned char* frame, + unsigned char* temp, + unsigned int frame_size) { - const unsigned int po2 = log2_of_power_of_2(frame_size); - - unsigned int stage = po2; - unsigned char* frame_ptr = frame; - unsigned char* temp_ptr = temp; - - unsigned int frame_half = frame_size >> 1; - unsigned int num_branches = 1; - unsigned int branch; - unsigned int bit; - - // prepare constants - const __m256i mask_stage1 = _mm256_set_epi8(0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, - 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF); - - const __m128i mask_stage0 = _mm_set_epi8(0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF); - // get some SIMD registers to play with. - __m256i r_frame0, r_temp0, shifted; - __m128i r_temp2, r_frame2, shifted2; - { - __m256i r_frame1, r_temp1; - __m128i r_frame3, r_temp3; - const __m256i shuffle_separate = _mm256_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15, - 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15); - const __m128i shuffle_separate128 = _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15); - - while(stage > 4){ - frame_ptr = frame; - temp_ptr = temp; - - // for stage = 5 a branch has 32 elements. So upper stages are even bigger. 
- for(branch = 0; branch < num_branches; ++branch){ - for(bit = 0; bit < frame_half; bit += 32){ - if ((frame_half-bit)<32) //if only 16 bits remaining in frame, not 32 - { - r_temp2 = _mm_load_si128((__m128i *) temp_ptr); - temp_ptr += 16; - r_temp3 = _mm_load_si128((__m128i *) temp_ptr); - temp_ptr += 16; - - shifted2 = _mm_srli_si128(r_temp2, 1); - shifted2 = _mm_and_si128(shifted2, mask_stage0); - r_temp2 = _mm_xor_si128(shifted2, r_temp2); - r_temp2 = _mm_shuffle_epi8(r_temp2, shuffle_separate128); - - shifted2 = _mm_srli_si128(r_temp3, 1); - shifted2 = _mm_and_si128(shifted2, mask_stage0); - r_temp3 = _mm_xor_si128(shifted2, r_temp3); - r_temp3 = _mm_shuffle_epi8(r_temp3, shuffle_separate128); - - r_frame2 = _mm_unpacklo_epi64(r_temp2, r_temp3); - _mm_store_si128((__m128i*) frame_ptr, r_frame2); - - r_frame3 = _mm_unpackhi_epi64(r_temp2, r_temp3); - _mm_store_si128((__m128i*) (frame_ptr + frame_half), r_frame3); - frame_ptr += 16; - break; - } - r_temp0 = _mm256_load_si256((__m256i *) temp_ptr); - temp_ptr += 32; - r_temp1 = _mm256_load_si256((__m256i *) temp_ptr); - temp_ptr += 32; - - shifted = _mm256_srli_si256(r_temp0, 1);//operate on 128 bit lanes - shifted = _mm256_and_si256(shifted, mask_stage1); - r_temp0 = _mm256_xor_si256(shifted, r_temp0); - r_temp0 = _mm256_shuffle_epi8(r_temp0, shuffle_separate); - - shifted = _mm256_srli_si256(r_temp1, 1); - shifted = _mm256_and_si256(shifted, mask_stage1); - r_temp1 = _mm256_xor_si256(shifted, r_temp1); - r_temp1 = _mm256_shuffle_epi8(r_temp1, shuffle_separate); - - r_frame0 = _mm256_unpacklo_epi64(r_temp0, r_temp1); - r_temp1 = _mm256_unpackhi_epi64(r_temp0, r_temp1); - r_frame0 = _mm256_permute4x64_epi64(r_frame0, 0xd8); - r_frame1 = _mm256_permute4x64_epi64(r_temp1, 0xd8); - - _mm256_store_si256((__m256i*) frame_ptr, r_frame0); - - _mm256_store_si256((__m256i*) (frame_ptr + frame_half), r_frame1); - frame_ptr += 32; + const unsigned int po2 = log2_of_power_of_2(frame_size); + + unsigned int stage = po2; + unsigned char* frame_ptr = frame; + unsigned char* temp_ptr = temp; + + unsigned int frame_half = frame_size >> 1; + unsigned int num_branches = 1; + unsigned int branch; + unsigned int bit; + + // prepare constants + const __m256i mask_stage1 = _mm256_set_epi8(0x0, + 0xFF, + 0x0, + 0xFF, + 0x0, + 0xFF, + 0x0, + 0xFF, + 0x0, + 0xFF, + 0x0, + 0xFF, + 0x0, + 0xFF, + 0x0, + 0xFF, + 0x0, + 0xFF, + 0x0, + 0xFF, + 0x0, + 0xFF, + 0x0, + 0xFF, + 0x0, + 0xFF, + 0x0, + 0xFF, + 0x0, + 0xFF, + 0x0, + 0xFF); + + const __m128i mask_stage0 = _mm_set_epi8(0x0, + 0xFF, + 0x0, + 0xFF, + 0x0, + 0xFF, + 0x0, + 0xFF, + 0x0, + 0xFF, + 0x0, + 0xFF, + 0x0, + 0xFF, + 0x0, + 0xFF); + // get some SIMD registers to play with. + __m256i r_frame0, r_temp0, shifted; + __m128i r_temp2, r_frame2, shifted2; + { + __m256i r_frame1, r_temp1; + __m128i r_frame3, r_temp3; + const __m256i shuffle_separate = _mm256_setr_epi8(0, + 2, + 4, + 6, + 8, + 10, + 12, + 14, + 1, + 3, + 5, + 7, + 9, + 11, + 13, + 15, + 0, + 2, + 4, + 6, + 8, + 10, + 12, + 14, + 1, + 3, + 5, + 7, + 9, + 11, + 13, + 15); + const __m128i shuffle_separate128 = + _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15); + + while (stage > 4) { + frame_ptr = frame; + temp_ptr = temp; + + // for stage = 5 a branch has 32 elements. So upper stages are even bigger. 
+ for (branch = 0; branch < num_branches; ++branch) { + for (bit = 0; bit < frame_half; bit += 32) { + if ((frame_half - bit) < + 32) // if only 16 bits remaining in frame, not 32 + { + r_temp2 = _mm_load_si128((__m128i*)temp_ptr); + temp_ptr += 16; + r_temp3 = _mm_load_si128((__m128i*)temp_ptr); + temp_ptr += 16; + + shifted2 = _mm_srli_si128(r_temp2, 1); + shifted2 = _mm_and_si128(shifted2, mask_stage0); + r_temp2 = _mm_xor_si128(shifted2, r_temp2); + r_temp2 = _mm_shuffle_epi8(r_temp2, shuffle_separate128); + + shifted2 = _mm_srli_si128(r_temp3, 1); + shifted2 = _mm_and_si128(shifted2, mask_stage0); + r_temp3 = _mm_xor_si128(shifted2, r_temp3); + r_temp3 = _mm_shuffle_epi8(r_temp3, shuffle_separate128); + + r_frame2 = _mm_unpacklo_epi64(r_temp2, r_temp3); + _mm_store_si128((__m128i*)frame_ptr, r_frame2); + + r_frame3 = _mm_unpackhi_epi64(r_temp2, r_temp3); + _mm_store_si128((__m128i*)(frame_ptr + frame_half), r_frame3); + frame_ptr += 16; + break; + } + r_temp0 = _mm256_load_si256((__m256i*)temp_ptr); + temp_ptr += 32; + r_temp1 = _mm256_load_si256((__m256i*)temp_ptr); + temp_ptr += 32; + + shifted = _mm256_srli_si256(r_temp0, 1); // operate on 128 bit lanes + shifted = _mm256_and_si256(shifted, mask_stage1); + r_temp0 = _mm256_xor_si256(shifted, r_temp0); + r_temp0 = _mm256_shuffle_epi8(r_temp0, shuffle_separate); + + shifted = _mm256_srli_si256(r_temp1, 1); + shifted = _mm256_and_si256(shifted, mask_stage1); + r_temp1 = _mm256_xor_si256(shifted, r_temp1); + r_temp1 = _mm256_shuffle_epi8(r_temp1, shuffle_separate); + + r_frame0 = _mm256_unpacklo_epi64(r_temp0, r_temp1); + r_temp1 = _mm256_unpackhi_epi64(r_temp0, r_temp1); + r_frame0 = _mm256_permute4x64_epi64(r_frame0, 0xd8); + r_frame1 = _mm256_permute4x64_epi64(r_temp1, 0xd8); + + _mm256_store_si256((__m256i*)frame_ptr, r_frame0); + + _mm256_store_si256((__m256i*)(frame_ptr + frame_half), r_frame1); + frame_ptr += 32; + } + + frame_ptr += frame_half; + } + memcpy(temp, frame, sizeof(unsigned char) * frame_size); + + num_branches = num_branches << 1; + frame_half = frame_half >> 1; + stage--; } - - frame_ptr += frame_half; - } - memcpy(temp, frame, sizeof(unsigned char) * frame_size); - - num_branches = num_branches << 1; - frame_half = frame_half >> 1; - stage--; } - } - - // This last part requires at least 32-bit frames. - // Smaller frames are useless for SIMD optimization anyways. Just choose GENERIC! - // reset pointers to correct positions. - frame_ptr = frame; - temp_ptr = temp; + // This last part requires at least 32-bit frames. + // Smaller frames are useless for SIMD optimization anyways. Just choose GENERIC! - // prefetch first chunk. - __VOLK_PREFETCH(temp_ptr); + // reset pointers to correct positions. 
+ frame_ptr = frame; + temp_ptr = temp; - const __m256i shuffle_stage4 = _mm256_setr_epi8(0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15, - 0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15); - const __m256i mask_stage4 = _mm256_set_epi8(0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, - 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF); - const __m256i mask_stage3 = _mm256_set_epi8(0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF, - 0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF); - const __m256i mask_stage2 = _mm256_set_epi8(0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF, - 0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF); - - for(branch = 0; branch < num_branches/2; ++branch){ - r_temp0 = _mm256_load_si256((__m256i*) temp_ptr); - - // prefetch next chunk - temp_ptr += 32; + // prefetch first chunk. __VOLK_PREFETCH(temp_ptr); - // shuffle once for bit-reversal. - r_temp0 = _mm256_shuffle_epi8(r_temp0, shuffle_stage4); - - shifted = _mm256_srli_si256(r_temp0, 8); //128 bit lanes - shifted = _mm256_and_si256(shifted, mask_stage4); - r_frame0 = _mm256_xor_si256(shifted, r_temp0); - - shifted = _mm256_srli_si256(r_frame0, 4); - shifted = _mm256_and_si256(shifted, mask_stage3); - r_frame0 = _mm256_xor_si256(shifted, r_frame0); - - shifted = _mm256_srli_si256(r_frame0, 2); - shifted = _mm256_and_si256(shifted, mask_stage2); - r_frame0 = _mm256_xor_si256(shifted, r_frame0); - - shifted = _mm256_srli_si256(r_frame0, 1); - shifted = _mm256_and_si256(shifted, mask_stage1); - r_frame0 = _mm256_xor_si256(shifted, r_frame0); - - // store result of chunk. - _mm256_store_si256((__m256i*)frame_ptr, r_frame0); - frame_ptr += 32; - } + const __m256i shuffle_stage4 = _mm256_setr_epi8(0, + 8, + 4, + 12, + 2, + 10, + 6, + 14, + 1, + 9, + 5, + 13, + 3, + 11, + 7, + 15, + 0, + 8, + 4, + 12, + 2, + 10, + 6, + 14, + 1, + 9, + 5, + 13, + 3, + 11, + 7, + 15); + const __m256i mask_stage4 = _mm256_set_epi8(0x0, + 0x0, + 0x0, + 0x0, + 0x0, + 0x0, + 0x0, + 0x0, + 0xFF, + 0xFF, + 0xFF, + 0xFF, + 0xFF, + 0xFF, + 0xFF, + 0xFF, + 0x0, + 0x0, + 0x0, + 0x0, + 0x0, + 0x0, + 0x0, + 0x0, + 0xFF, + 0xFF, + 0xFF, + 0xFF, + 0xFF, + 0xFF, + 0xFF, + 0xFF); + const __m256i mask_stage3 = _mm256_set_epi8(0x0, + 0x0, + 0x0, + 0x0, + 0xFF, + 0xFF, + 0xFF, + 0xFF, + 0x0, + 0x0, + 0x0, + 0x0, + 0xFF, + 0xFF, + 0xFF, + 0xFF, + 0x0, + 0x0, + 0x0, + 0x0, + 0xFF, + 0xFF, + 0xFF, + 0xFF, + 0x0, + 0x0, + 0x0, + 0x0, + 0xFF, + 0xFF, + 0xFF, + 0xFF); + const __m256i mask_stage2 = _mm256_set_epi8(0x0, + 0x0, + 0xFF, + 0xFF, + 0x0, + 0x0, + 0xFF, + 0xFF, + 0x0, + 0x0, + 0xFF, + 0xFF, + 0x0, + 0x0, + 0xFF, + 0xFF, + 0x0, + 0x0, + 0xFF, + 0xFF, + 0x0, + 0x0, + 0xFF, + 0xFF, + 0x0, + 0x0, + 0xFF, + 0xFF, + 0x0, + 0x0, + 0xFF, + 0xFF); + + for (branch = 0; branch < num_branches / 2; ++branch) { + r_temp0 = _mm256_load_si256((__m256i*)temp_ptr); + + // prefetch next chunk + temp_ptr += 32; + __VOLK_PREFETCH(temp_ptr); + + // shuffle once for bit-reversal. 
+ r_temp0 = _mm256_shuffle_epi8(r_temp0, shuffle_stage4); + + shifted = _mm256_srli_si256(r_temp0, 8); // 128 bit lanes + shifted = _mm256_and_si256(shifted, mask_stage4); + r_frame0 = _mm256_xor_si256(shifted, r_temp0); + + shifted = _mm256_srli_si256(r_frame0, 4); + shifted = _mm256_and_si256(shifted, mask_stage3); + r_frame0 = _mm256_xor_si256(shifted, r_frame0); + + shifted = _mm256_srli_si256(r_frame0, 2); + shifted = _mm256_and_si256(shifted, mask_stage2); + r_frame0 = _mm256_xor_si256(shifted, r_frame0); + + shifted = _mm256_srli_si256(r_frame0, 1); + shifted = _mm256_and_si256(shifted, mask_stage1); + r_frame0 = _mm256_xor_si256(shifted, r_frame0); + + // store result of chunk. + _mm256_store_si256((__m256i*)frame_ptr, r_frame0); + frame_ptr += 32; + } } #endif /* LV_HAVE_AVX2 */ - #endif /* VOLK_KERNELS_VOLK_VOLK_8U_X2_ENCODEFRAMEPOLAR_8U_A_H_ */ diff --git a/kernels/volk/volk_8u_x3_encodepolar_8u_x2.h b/kernels/volk/volk_8u_x3_encodepolar_8u_x2.h index 5bccd95..413836e 100644 --- a/kernels/volk/volk_8u_x3_encodepolar_8u_x2.h +++ b/kernels/volk/volk_8u_x3_encodepolar_8u_x2.h @@ -29,9 +29,9 @@ * * Dispatcher Prototype * \code - * void volk_8u_x3_encodepolar_8u(unsigned char* frame, const unsigned char* frozen_bit_mask, const unsigned char* frozen_bits, - * const unsigned char* info_bits, unsigned int frame_size, unsigned int info_bit_size) - * \endcode + * void volk_8u_x3_encodepolar_8u(unsigned char* frame, const unsigned char* + * frozen_bit_mask, const unsigned char* frozen_bits, const unsigned char* info_bits, + * unsigned int frame_size, unsigned int info_bit_size) \endcode * * \b Inputs * \li frame: buffer for encoded frame @@ -55,14 +55,17 @@ * unsigned char* frozen_bit_mask = get_frozen_bit_mask(frame_size, num_frozen_bits); * * // set elements to desired values. Typically all zero. - * unsigned char* frozen_bits = (unsigned char) volk_malloc(sizeof(unsigned char) * num_frozen_bits, volk_get_alignment()); + * unsigned char* frozen_bits = (unsigned char) volk_malloc(sizeof(unsigned char) * + * num_frozen_bits, volk_get_alignment()); * - * unsigned char* frame = (unsigned char) volk_malloc(sizeof(unsigned char) * frame_size, volk_get_alignment()); - * unsigned char* temp = (unsigned char) volk_malloc(sizeof(unsigned char) * frame_size, volk_get_alignment()); + * unsigned char* frame = (unsigned char) volk_malloc(sizeof(unsigned char) * frame_size, + * volk_get_alignment()); unsigned char* temp = (unsigned char) + * volk_malloc(sizeof(unsigned char) * frame_size, volk_get_alignment()); * * unsigned char* info_bits = get_info_bits_to_encode(num_info_bits); * - * volk_8u_x3_encodepolar_8u_x2_generic(frame, temp, frozen_bit_mask, frozen_bits, info_bits, frame_size); + * volk_8u_x3_encodepolar_8u_x2_generic(frame, temp, frozen_bit_mask, frozen_bits, + * info_bits, frame_size); * * volk_free(frozen_bit_mask); * volk_free(frozen_bits); @@ -77,27 +80,32 @@ #include #include -static inline void -interleave_frozen_and_info_bits(unsigned char* target, const unsigned char* frozen_bit_mask, - const unsigned char* frozen_bits, const unsigned char* info_bits, - const unsigned int frame_size) +static inline void interleave_frozen_and_info_bits(unsigned char* target, + const unsigned char* frozen_bit_mask, + const unsigned char* frozen_bits, + const unsigned char* info_bits, + const unsigned int frame_size) { - unsigned int bit; - for(bit = 0; bit < frame_size; ++bit){ - *target++ = *frozen_bit_mask++ ? 
*frozen_bits++ : *info_bits++; - } + unsigned int bit; + for (bit = 0; bit < frame_size; ++bit) { + *target++ = *frozen_bit_mask++ ? *frozen_bits++ : *info_bits++; + } } #ifdef LV_HAVE_GENERIC static inline void -volk_8u_x3_encodepolar_8u_x2_generic(unsigned char* frame, unsigned char* temp, const unsigned char* frozen_bit_mask, - const unsigned char* frozen_bits, const unsigned char* info_bits, +volk_8u_x3_encodepolar_8u_x2_generic(unsigned char* frame, + unsigned char* temp, + const unsigned char* frozen_bit_mask, + const unsigned char* frozen_bits, + const unsigned char* info_bits, unsigned int frame_size) { - // interleave - interleave_frozen_and_info_bits(temp, frozen_bit_mask, frozen_bits, info_bits, frame_size); - volk_8u_x2_encodeframepolar_8u_generic(frame, temp, frame_size); + // interleave + interleave_frozen_and_info_bits( + temp, frozen_bit_mask, frozen_bits, info_bits, frame_size); + volk_8u_x2_encodeframepolar_8u_generic(frame, temp, frame_size); } #endif /* LV_HAVE_GENERIC */ @@ -106,14 +114,17 @@ volk_8u_x3_encodepolar_8u_x2_generic(unsigned char* frame, unsigned char* temp, #include static inline void -volk_8u_x3_encodepolar_8u_x2_u_ssse3(unsigned char* frame, unsigned char* temp, - const unsigned char* frozen_bit_mask, - const unsigned char* frozen_bits, const unsigned char* info_bits, - unsigned int frame_size) +volk_8u_x3_encodepolar_8u_x2_u_ssse3(unsigned char* frame, + unsigned char* temp, + const unsigned char* frozen_bit_mask, + const unsigned char* frozen_bits, + const unsigned char* info_bits, + unsigned int frame_size) { - // interleave - interleave_frozen_and_info_bits(temp, frozen_bit_mask, frozen_bits, info_bits, frame_size); - volk_8u_x2_encodeframepolar_8u_u_ssse3(frame, temp, frame_size); + // interleave + interleave_frozen_and_info_bits( + temp, frozen_bit_mask, frozen_bits, info_bits, frame_size); + volk_8u_x2_encodeframepolar_8u_u_ssse3(frame, temp, frame_size); } #endif /* LV_HAVE_SSSE3 */ @@ -121,13 +132,16 @@ volk_8u_x3_encodepolar_8u_x2_u_ssse3(unsigned char* frame, unsigned char* temp, #ifdef LV_HAVE_AVX2 #include static inline void -volk_8u_x3_encodepolar_8u_x2_u_avx2(unsigned char* frame, unsigned char* temp, - const unsigned char* frozen_bit_mask, - const unsigned char* frozen_bits, const unsigned char* info_bits, - unsigned int frame_size) +volk_8u_x3_encodepolar_8u_x2_u_avx2(unsigned char* frame, + unsigned char* temp, + const unsigned char* frozen_bit_mask, + const unsigned char* frozen_bits, + const unsigned char* info_bits, + unsigned int frame_size) { - interleave_frozen_and_info_bits(temp, frozen_bit_mask, frozen_bits, info_bits, frame_size); - volk_8u_x2_encodeframepolar_8u_u_avx2(frame, temp, frame_size); + interleave_frozen_and_info_bits( + temp, frozen_bit_mask, frozen_bits, info_bits, frame_size); + volk_8u_x2_encodeframepolar_8u_u_avx2(frame, temp, frame_size); } #endif /* LV_HAVE_AVX2 */ @@ -139,26 +153,32 @@ volk_8u_x3_encodepolar_8u_x2_u_avx2(unsigned char* frame, unsigned char* temp, #ifdef LV_HAVE_SSSE3 #include static inline void -volk_8u_x3_encodepolar_8u_x2_a_ssse3(unsigned char* frame, unsigned char* temp, - const unsigned char* frozen_bit_mask, - const unsigned char* frozen_bits, const unsigned char* info_bits, - unsigned int frame_size) +volk_8u_x3_encodepolar_8u_x2_a_ssse3(unsigned char* frame, + unsigned char* temp, + const unsigned char* frozen_bit_mask, + const unsigned char* frozen_bits, + const unsigned char* info_bits, + unsigned int frame_size) { - interleave_frozen_and_info_bits(temp, frozen_bit_mask, 
frozen_bits, info_bits, frame_size); - volk_8u_x2_encodeframepolar_8u_a_ssse3(frame, temp, frame_size); + interleave_frozen_and_info_bits( + temp, frozen_bit_mask, frozen_bits, info_bits, frame_size); + volk_8u_x2_encodeframepolar_8u_a_ssse3(frame, temp, frame_size); } #endif /* LV_HAVE_SSSE3 */ #ifdef LV_HAVE_AVX2 #include static inline void -volk_8u_x3_encodepolar_8u_x2_a_avx2(unsigned char* frame, unsigned char* temp, - const unsigned char* frozen_bit_mask, - const unsigned char* frozen_bits, const unsigned char* info_bits, - unsigned int frame_size) +volk_8u_x3_encodepolar_8u_x2_a_avx2(unsigned char* frame, + unsigned char* temp, + const unsigned char* frozen_bit_mask, + const unsigned char* frozen_bits, + const unsigned char* info_bits, + unsigned int frame_size) { - interleave_frozen_and_info_bits(temp, frozen_bit_mask, frozen_bits, info_bits, frame_size); - volk_8u_x2_encodeframepolar_8u_a_avx2(frame, temp, frame_size); + interleave_frozen_and_info_bits( + temp, frozen_bit_mask, frozen_bits, info_bits, frame_size); + volk_8u_x2_encodeframepolar_8u_a_avx2(frame, temp, frame_size); } #endif /* LV_HAVE_AVX2 */ diff --git a/kernels/volk/volk_8u_x3_encodepolarpuppet_8u.h b/kernels/volk/volk_8u_x3_encodepolarpuppet_8u.h index 1f6be2c..1badbf1 100644 --- a/kernels/volk/volk_8u_x3_encodepolarpuppet_8u.h +++ b/kernels/volk/volk_8u_x3_encodepolarpuppet_8u.h @@ -29,71 +29,82 @@ #include #include -static inline unsigned int -next_lower_power_of_two(const unsigned int val) +static inline unsigned int next_lower_power_of_two(const unsigned int val) { - // algorithm found and adopted from: http://acius2.blogspot.de/2007/11/calculating-next-power-of-2.html - unsigned int res = val; - res = (res >> 1) | res; - res = (res >> 2) | res; - res = (res >> 4) | res; - res = (res >> 8) | res; - res = (res >> 16) | res; - res += 1; - return res >> 1; + // algorithm found and adopted from: + // http://acius2.blogspot.de/2007/11/calculating-next-power-of-2.html + unsigned int res = val; + res = (res >> 1) | res; + res = (res >> 2) | res; + res = (res >> 4) | res; + res = (res >> 8) | res; + res = (res >> 16) | res; + res += 1; + return res >> 1; } -static inline void -adjust_frozen_mask(unsigned char* mask, const unsigned int frame_size) +static inline void adjust_frozen_mask(unsigned char* mask, const unsigned int frame_size) { - // just like the rest of the puppet this function exists for test purposes only. - unsigned int i; - for(i = 0; i < frame_size; ++i){ - *mask = (*mask & 0x80) ? 0xFF : 0x00; - mask++; - } + // just like the rest of the puppet this function exists for test purposes only. + unsigned int i; + for (i = 0; i < frame_size; ++i) { + *mask = (*mask & 0x80) ? 
0xFF : 0x00; + mask++; + } } #ifdef LV_HAVE_GENERIC static inline void -volk_8u_x3_encodepolarpuppet_8u_generic(unsigned char* frame, unsigned char* frozen_bit_mask, - const unsigned char* frozen_bits, const unsigned char* info_bits, - unsigned int frame_size) +volk_8u_x3_encodepolarpuppet_8u_generic(unsigned char* frame, + unsigned char* frozen_bit_mask, + const unsigned char* frozen_bits, + const unsigned char* info_bits, + unsigned int frame_size) { - frame_size = next_lower_power_of_two(frame_size); - unsigned char* temp = (unsigned char*) volk_malloc(sizeof(unsigned char) * frame_size, volk_get_alignment()); - adjust_frozen_mask(frozen_bit_mask, frame_size); - volk_8u_x3_encodepolar_8u_x2_generic(frame, temp, frozen_bit_mask, frozen_bits, info_bits, frame_size); - volk_free(temp); + frame_size = next_lower_power_of_two(frame_size); + unsigned char* temp = (unsigned char*)volk_malloc(sizeof(unsigned char) * frame_size, + volk_get_alignment()); + adjust_frozen_mask(frozen_bit_mask, frame_size); + volk_8u_x3_encodepolar_8u_x2_generic( + frame, temp, frozen_bit_mask, frozen_bits, info_bits, frame_size); + volk_free(temp); } #endif /* LV_HAVE_GENERIC */ #ifdef LV_HAVE_SSSE3 static inline void -volk_8u_x3_encodepolarpuppet_8u_u_ssse3(unsigned char* frame, unsigned char* frozen_bit_mask, - const unsigned char* frozen_bits, const unsigned char* info_bits, - unsigned int frame_size) +volk_8u_x3_encodepolarpuppet_8u_u_ssse3(unsigned char* frame, + unsigned char* frozen_bit_mask, + const unsigned char* frozen_bits, + const unsigned char* info_bits, + unsigned int frame_size) { - frame_size = next_lower_power_of_two(frame_size); - unsigned char* temp = (unsigned char*) volk_malloc(sizeof(unsigned char) * frame_size, volk_get_alignment()); - adjust_frozen_mask(frozen_bit_mask, frame_size); - volk_8u_x3_encodepolar_8u_x2_u_ssse3(frame, temp, frozen_bit_mask, frozen_bits, info_bits, frame_size); - volk_free(temp); + frame_size = next_lower_power_of_two(frame_size); + unsigned char* temp = (unsigned char*)volk_malloc(sizeof(unsigned char) * frame_size, + volk_get_alignment()); + adjust_frozen_mask(frozen_bit_mask, frame_size); + volk_8u_x3_encodepolar_8u_x2_u_ssse3( + frame, temp, frozen_bit_mask, frozen_bits, info_bits, frame_size); + volk_free(temp); } #endif /* LV_HAVE_SSSE3 */ #ifdef LV_HAVE_AVX2 static inline void -volk_8u_x3_encodepolarpuppet_8u_u_avx2(unsigned char* frame, unsigned char* frozen_bit_mask, - const unsigned char* frozen_bits, const unsigned char* info_bits, - unsigned int frame_size) +volk_8u_x3_encodepolarpuppet_8u_u_avx2(unsigned char* frame, + unsigned char* frozen_bit_mask, + const unsigned char* frozen_bits, + const unsigned char* info_bits, + unsigned int frame_size) { - frame_size = next_lower_power_of_two(frame_size); - unsigned char* temp = (unsigned char*) volk_malloc(sizeof(unsigned char) * frame_size, volk_get_alignment()); - adjust_frozen_mask(frozen_bit_mask, frame_size); - volk_8u_x3_encodepolar_8u_x2_u_avx2(frame, temp, frozen_bit_mask, frozen_bits, info_bits, frame_size); - volk_free(temp); + frame_size = next_lower_power_of_two(frame_size); + unsigned char* temp = (unsigned char*)volk_malloc(sizeof(unsigned char) * frame_size, + volk_get_alignment()); + adjust_frozen_mask(frozen_bit_mask, frame_size); + volk_8u_x3_encodepolar_8u_x2_u_avx2( + frame, temp, frozen_bit_mask, frozen_bits, info_bits, frame_size); + volk_free(temp); } #endif /* LV_HAVE_AVX2 */ @@ -104,29 +115,37 @@ volk_8u_x3_encodepolarpuppet_8u_u_avx2(unsigned char* frame, unsigned char* froz #ifdef 
LV_HAVE_SSSE3 static inline void -volk_8u_x3_encodepolarpuppet_8u_a_ssse3(unsigned char* frame, unsigned char* frozen_bit_mask, - const unsigned char* frozen_bits, const unsigned char* info_bits, - unsigned int frame_size) +volk_8u_x3_encodepolarpuppet_8u_a_ssse3(unsigned char* frame, + unsigned char* frozen_bit_mask, + const unsigned char* frozen_bits, + const unsigned char* info_bits, + unsigned int frame_size) { - frame_size = next_lower_power_of_two(frame_size); - unsigned char* temp = (unsigned char*) volk_malloc(sizeof(unsigned char) * frame_size, volk_get_alignment()); - adjust_frozen_mask(frozen_bit_mask, frame_size); - volk_8u_x3_encodepolar_8u_x2_a_ssse3(frame, temp, frozen_bit_mask, frozen_bits, info_bits, frame_size); - volk_free(temp); + frame_size = next_lower_power_of_two(frame_size); + unsigned char* temp = (unsigned char*)volk_malloc(sizeof(unsigned char) * frame_size, + volk_get_alignment()); + adjust_frozen_mask(frozen_bit_mask, frame_size); + volk_8u_x3_encodepolar_8u_x2_a_ssse3( + frame, temp, frozen_bit_mask, frozen_bits, info_bits, frame_size); + volk_free(temp); } #endif /* LV_HAVE_SSSE3 */ #ifdef LV_HAVE_AVX2 static inline void -volk_8u_x3_encodepolarpuppet_8u_a_avx2(unsigned char* frame, unsigned char* frozen_bit_mask, - const unsigned char* frozen_bits, const unsigned char* info_bits, - unsigned int frame_size) +volk_8u_x3_encodepolarpuppet_8u_a_avx2(unsigned char* frame, + unsigned char* frozen_bit_mask, + const unsigned char* frozen_bits, + const unsigned char* info_bits, + unsigned int frame_size) { - frame_size = next_lower_power_of_two(frame_size); - unsigned char* temp = (unsigned char*) volk_malloc(sizeof(unsigned char) * frame_size, volk_get_alignment()); - adjust_frozen_mask(frozen_bit_mask, frame_size); - volk_8u_x3_encodepolar_8u_x2_a_avx2(frame, temp, frozen_bit_mask, frozen_bits, info_bits, frame_size); - volk_free(temp); + frame_size = next_lower_power_of_two(frame_size); + unsigned char* temp = (unsigned char*)volk_malloc(sizeof(unsigned char) * frame_size, + volk_get_alignment()); + adjust_frozen_mask(frozen_bit_mask, frame_size); + volk_8u_x3_encodepolar_8u_x2_a_avx2( + frame, temp, frozen_bit_mask, frozen_bits, info_bits, frame_size); + volk_free(temp); } #endif /* LV_HAVE_AVX2 */ diff --git a/kernels/volk/volk_8u_x4_conv_k7_r2_8u.h b/kernels/volk/volk_8u_x4_conv_k7_r2_8u.h index 029ba75..89460a6 100644 --- a/kernels/volk/volk_8u_x4_conv_k7_r2_8u.h +++ b/kernels/volk/volk_8u_x4_conv_k7_r2_8u.h @@ -30,8 +30,9 @@ * * Dispatcher Prototype * \code - * void volk_8u_x4_conv_k7_r2_8u(unsigned char* Y, unsigned char* X, unsigned char* syms, unsigned char* dec, unsigned int framebits, unsigned int excess, unsigned char* Branchtab) - * \endcode + * void volk_8u_x4_conv_k7_r2_8u(unsigned char* Y, unsigned char* X, unsigned char* syms, + * unsigned char* dec, unsigned int framebits, unsigned int excess, unsigned char* + * Branchtab) \endcode * * \b Inputs * \li X: @@ -58,67 +59,71 @@ #define INCLUDED_volk_8u_x4_conv_k7_r2_8u_H typedef union { - unsigned char/*DECISIONTYPE*/ t[64/*NUMSTATES*//8/*DECISIONTYPE_BITSIZE*/]; - unsigned int w[64/*NUMSTATES*//32]; - unsigned short s[64/*NUMSTATES*//16]; - unsigned char c[64/*NUMSTATES*//8]; + unsigned char /*DECISIONTYPE*/ t[64 /*NUMSTATES*/ / 8 /*DECISIONTYPE_BITSIZE*/]; + unsigned int w[64 /*NUMSTATES*/ / 32]; + unsigned short s[64 /*NUMSTATES*/ / 16]; + unsigned char c[64 /*NUMSTATES*/ / 8]; #ifdef _MSC_VER } decision_t; #else -} decision_t __attribute__ ((aligned (16))); +} decision_t 
__attribute__((aligned(16))); #endif -static inline void -renormalize(unsigned char* X, unsigned char threshold) +static inline void renormalize(unsigned char* X, unsigned char threshold) { - int NUMSTATES = 64; - int i; - - unsigned char min=X[0]; - //if(min > threshold) { - for(i=0;iX[i]) - min=X[i]; - for(i=0;i threshold) { + for (i = 0; i < NUMSTATES; i++) + if (min > X[i]) + min = X[i]; + for (i = 0; i < NUMSTATES; i++) + X[i] -= min; + //} } -//helper BFLY for GENERIC version -static inline void -BFLY(int i, int s, unsigned char * syms, unsigned char *Y, - unsigned char *X, decision_t * d, unsigned char* Branchtab) +// helper BFLY for GENERIC version +static inline void BFLY(int i, + int s, + unsigned char* syms, + unsigned char* Y, + unsigned char* X, + decision_t* d, + unsigned char* Branchtab) { - int j, decision0, decision1; - unsigned char metric,m0,m1,m2,m3; + int j, decision0, decision1; + unsigned char metric, m0, m1, m2, m3; - int NUMSTATES = 64; - int RATE = 2; - int METRICSHIFT = 1; - int PRECISIONSHIFT = 2; + int NUMSTATES = 64; + int RATE = 2; + int METRICSHIFT = 1; + int PRECISIONSHIFT = 2; - metric =0; - for(j=0;j>METRICSHIFT; - metric=metric>>PRECISIONSHIFT; + metric = 0; + for (j = 0; j < RATE; j++) + metric += (Branchtab[i + j * NUMSTATES / 2] ^ syms[s * RATE + j]) >> METRICSHIFT; + metric = metric >> PRECISIONSHIFT; - unsigned char max = ((RATE*((256 -1)>>METRICSHIFT))>>PRECISIONSHIFT); + unsigned char max = ((RATE * ((256 - 1) >> METRICSHIFT)) >> PRECISIONSHIFT); - m0 = X[i] + metric; - m1 = X[i+NUMSTATES/2] + (max - metric); - m2 = X[i] + (max - metric); - m3 = X[i+NUMSTATES/2] + metric; + m0 = X[i] + metric; + m1 = X[i + NUMSTATES / 2] + (max - metric); + m2 = X[i] + (max - metric); + m3 = X[i + NUMSTATES / 2] + metric; - decision0 = (signed int)(m0-m1) > 0; - decision1 = (signed int)(m2-m3) > 0; + decision0 = (signed int)(m0 - m1) > 0; + decision1 = (signed int)(m2 - m3) > 0; - Y[2*i] = decision0 ? m1 : m0; - Y[2*i+1] = decision1 ? m3 : m2; + Y[2 * i] = decision0 ? m1 : m0; + Y[2 * i + 1] = decision1 ? 
m3 : m2; - d->w[i/(sizeof(unsigned int)*8/2)+s*(sizeof(decision_t)/sizeof(unsigned int))] |= - (decision0|decision1<<1) << ((2*i)&(sizeof(unsigned int)*8-1)); + d->w[i / (sizeof(unsigned int) * 8 / 2) + + s * (sizeof(decision_t) / sizeof(unsigned int))] |= + (decision0 | decision1 << 1) << ((2 * i) & (sizeof(unsigned int) * 8 - 1)); } @@ -127,188 +132,199 @@ BFLY(int i, int s, unsigned char * syms, unsigned char *Y, #include #include -static inline void -volk_8u_x4_conv_k7_r2_8u_avx2(unsigned char* Y, unsigned char* X, - unsigned char* syms, unsigned char* dec, - unsigned int framebits, unsigned int excess, - unsigned char* Branchtab) +static inline void volk_8u_x4_conv_k7_r2_8u_avx2(unsigned char* Y, + unsigned char* X, + unsigned char* syms, + unsigned char* dec, + unsigned int framebits, + unsigned int excess, + unsigned char* Branchtab) { - unsigned int i9; - for(i9 = 0; i9 < ((framebits + excess)>>1); i9++) { - unsigned char a75, a81; - int a73, a92; - int s20, s21; - unsigned char *a80, *b6; - int *a110, *a91, *a93; - __m256i *a112, *a71, *a72, *a77, *a83, *a95; - __m256i a86, a87; - __m256i a76, a78, a79, a82, a84, a85, a88, a89 - , a90, d10, d9, m23, m24, m25 - , m26, s18, s19, s22 - , s23, s24, s25, t13, t14, t15; - a71 = ((__m256i *) X); - s18 = *(a71); - a72 = (a71 + 1); - s19 = *(a72); - s22 = _mm256_permute2x128_si256(s18,s19,0x20); - s19 = _mm256_permute2x128_si256(s18,s19,0x31); - s18 = s22; - a73 = (4 * i9); - b6 = (syms + a73); - a75 = *(b6); - a76 = _mm256_set1_epi8(a75); - a77 = ((__m256i *) Branchtab); - a78 = *(a77); - a79 = _mm256_xor_si256(a76, a78); - a80 = (b6 + 1); - a81 = *(a80); - a82 = _mm256_set1_epi8(a81); - a83 = (a77 + 1); - a84 = *(a83); - a85 = _mm256_xor_si256(a82, a84); - t13 = _mm256_avg_epu8(a79,a85); - a86 = ((__m256i ) t13); - a87 = _mm256_srli_epi16(a86, 2); - a88 = ((__m256i ) a87); - t14 = _mm256_and_si256(a88, _mm256_set1_epi8(63)); - t15 = _mm256_subs_epu8(_mm256_set1_epi8(63), t14); - m23 = _mm256_adds_epu8(s18, t14); - m24 = _mm256_adds_epu8(s19, t15); - m25 = _mm256_adds_epu8(s18, t15); - m26 = _mm256_adds_epu8(s19, t14); - a89 = _mm256_min_epu8(m24, m23); - d9 = _mm256_cmpeq_epi8(a89, m24); - a90 = _mm256_min_epu8(m26, m25); - d10 = _mm256_cmpeq_epi8(a90, m26); - s22 = _mm256_unpacklo_epi8(d9,d10); - s23 = _mm256_unpackhi_epi8(d9,d10); - s20 = _mm256_movemask_epi8(_mm256_permute2x128_si256(s22, s23, 0x20)); - a91 = ((int *) dec); - a92 = (4 * i9); - a93 = (a91 + a92); - *(a93) = s20; - s21 = _mm256_movemask_epi8(_mm256_permute2x128_si256(s22, s23, 0x31)); - a110 = (a93 + 1); - *(a110) = s21; - s22 = _mm256_unpacklo_epi8(a89, a90); - s23 = _mm256_unpackhi_epi8(a89, a90); - a95 = ((__m256i *) Y); - s24 = _mm256_permute2x128_si256(s22, s23, 0x20); - *(a95) = s24; - s23 = _mm256_permute2x128_si256(s22, s23, 0x31); - a112 = (a95 + 1); - *(a112) = s23; - if ((((unsigned char *) Y)[0]>210)) { - __m256i m5, m6; - m5 = ((__m256i *) Y)[0]; - m5 = _mm256_min_epu8(m5, ((__m256i *) Y)[1]); - __m256i m7; - m7 = _mm256_min_epu8(_mm256_srli_si256(m5, 8), m5); - m7 = ((__m256i ) _mm256_min_epu8(((__m256i ) _mm256_srli_epi64(m7, 32)), ((__m256i ) m7))); - m7 = ((__m256i ) _mm256_min_epu8(((__m256i ) _mm256_srli_epi64(m7, 16)), ((__m256i ) m7))); - m7 = ((__m256i ) _mm256_min_epu8(((__m256i ) _mm256_srli_epi64(m7, 8)), ((__m256i ) m7))); - m7 = _mm256_unpacklo_epi8(m7, m7); - m7 = _mm256_shufflelo_epi16(m7, 0); - m6 = _mm256_unpacklo_epi64(m7, m7); - m6 = _mm256_permute2x128_si256(m6, m6, 0); //copy lower half of m6 to upper half, since above ops operate on 
128 bit lanes - ((__m256i *) Y)[0] = _mm256_subs_epu8(((__m256i *) Y)[0], m6); - ((__m256i *) Y)[1] = _mm256_subs_epu8(((__m256i *) Y)[1], m6); + unsigned int i9; + for (i9 = 0; i9 < ((framebits + excess) >> 1); i9++) { + unsigned char a75, a81; + int a73, a92; + int s20, s21; + unsigned char *a80, *b6; + int *a110, *a91, *a93; + __m256i *a112, *a71, *a72, *a77, *a83, *a95; + __m256i a86, a87; + __m256i a76, a78, a79, a82, a84, a85, a88, a89, a90, d10, d9, m23, m24, m25, m26, + s18, s19, s22, s23, s24, s25, t13, t14, t15; + a71 = ((__m256i*)X); + s18 = *(a71); + a72 = (a71 + 1); + s19 = *(a72); + s22 = _mm256_permute2x128_si256(s18, s19, 0x20); + s19 = _mm256_permute2x128_si256(s18, s19, 0x31); + s18 = s22; + a73 = (4 * i9); + b6 = (syms + a73); + a75 = *(b6); + a76 = _mm256_set1_epi8(a75); + a77 = ((__m256i*)Branchtab); + a78 = *(a77); + a79 = _mm256_xor_si256(a76, a78); + a80 = (b6 + 1); + a81 = *(a80); + a82 = _mm256_set1_epi8(a81); + a83 = (a77 + 1); + a84 = *(a83); + a85 = _mm256_xor_si256(a82, a84); + t13 = _mm256_avg_epu8(a79, a85); + a86 = ((__m256i)t13); + a87 = _mm256_srli_epi16(a86, 2); + a88 = ((__m256i)a87); + t14 = _mm256_and_si256(a88, _mm256_set1_epi8(63)); + t15 = _mm256_subs_epu8(_mm256_set1_epi8(63), t14); + m23 = _mm256_adds_epu8(s18, t14); + m24 = _mm256_adds_epu8(s19, t15); + m25 = _mm256_adds_epu8(s18, t15); + m26 = _mm256_adds_epu8(s19, t14); + a89 = _mm256_min_epu8(m24, m23); + d9 = _mm256_cmpeq_epi8(a89, m24); + a90 = _mm256_min_epu8(m26, m25); + d10 = _mm256_cmpeq_epi8(a90, m26); + s22 = _mm256_unpacklo_epi8(d9, d10); + s23 = _mm256_unpackhi_epi8(d9, d10); + s20 = _mm256_movemask_epi8(_mm256_permute2x128_si256(s22, s23, 0x20)); + a91 = ((int*)dec); + a92 = (4 * i9); + a93 = (a91 + a92); + *(a93) = s20; + s21 = _mm256_movemask_epi8(_mm256_permute2x128_si256(s22, s23, 0x31)); + a110 = (a93 + 1); + *(a110) = s21; + s22 = _mm256_unpacklo_epi8(a89, a90); + s23 = _mm256_unpackhi_epi8(a89, a90); + a95 = ((__m256i*)Y); + s24 = _mm256_permute2x128_si256(s22, s23, 0x20); + *(a95) = s24; + s23 = _mm256_permute2x128_si256(s22, s23, 0x31); + a112 = (a95 + 1); + *(a112) = s23; + if ((((unsigned char*)Y)[0] > 210)) { + __m256i m5, m6; + m5 = ((__m256i*)Y)[0]; + m5 = _mm256_min_epu8(m5, ((__m256i*)Y)[1]); + __m256i m7; + m7 = _mm256_min_epu8(_mm256_srli_si256(m5, 8), m5); + m7 = ((__m256i)_mm256_min_epu8(((__m256i)_mm256_srli_epi64(m7, 32)), + ((__m256i)m7))); + m7 = ((__m256i)_mm256_min_epu8(((__m256i)_mm256_srli_epi64(m7, 16)), + ((__m256i)m7))); + m7 = ((__m256i)_mm256_min_epu8(((__m256i)_mm256_srli_epi64(m7, 8)), + ((__m256i)m7))); + m7 = _mm256_unpacklo_epi8(m7, m7); + m7 = _mm256_shufflelo_epi16(m7, 0); + m6 = _mm256_unpacklo_epi64(m7, m7); + m6 = _mm256_permute2x128_si256( + m6, m6, 0); // copy lower half of m6 to upper half, since above ops + // operate on 128 bit lanes + ((__m256i*)Y)[0] = _mm256_subs_epu8(((__m256i*)Y)[0], m6); + ((__m256i*)Y)[1] = _mm256_subs_epu8(((__m256i*)Y)[1], m6); + } + unsigned char a188, a194; + int a205; + int s48, s54; + unsigned char *a187, *a193; + int *a204, *a206, *a223, *b16; + __m256i *a184, *a185, *a190, *a196, *a208, *a225; + __m256i a199, a200; + __m256i a189, a191, a192, a195, a197, a198, a201, a202, a203, d17, d18, m39, m40, + m41, m42, s46, s47, s50, s51, t25, t26, t27; + a184 = ((__m256i*)Y); + s46 = *(a184); + a185 = (a184 + 1); + s47 = *(a185); + s50 = _mm256_permute2x128_si256(s46, s47, 0x20); + s47 = _mm256_permute2x128_si256(s46, s47, 0x31); + s46 = s50; + a187 = (b6 + 2); + a188 = *(a187); + a189 = _mm256_set1_epi8(a188); 
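        /* Descriptive note, not part of the patched kernel: this second
         * half-iteration mirrors the first one above. Each half broadcasts one
         * received symbol pair, XORs it with the two Branchtab rows to get
         * per-state distances, then averages and right-shifts them into a
         * 6-bit branch metric (t26, with t27 = 63 - t26 for the complementary
         * branch). The saturating adds onto the old path metrics followed by
         * _mm256_min_epu8 / _mm256_cmpeq_epi8 / _mm256_movemask_epi8 are the
         * add-compare-select step of the generic BFLY helper earlier in this
         * file, with the packed decision bits written into the decision_t
         * words for traceback. The trailing "> 210" test renormalizes by
         * subtracting the horizontal minimum so the 8-bit metrics do not
         * saturate. */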
+ a190 = ((__m256i*)Branchtab); + a191 = *(a190); + a192 = _mm256_xor_si256(a189, a191); + a193 = (b6 + 3); + a194 = *(a193); + a195 = _mm256_set1_epi8(a194); + a196 = (a190 + 1); + a197 = *(a196); + a198 = _mm256_xor_si256(a195, a197); + t25 = _mm256_avg_epu8(a192, a198); + a199 = ((__m256i)t25); + a200 = _mm256_srli_epi16(a199, 2); + a201 = ((__m256i)a200); + t26 = _mm256_and_si256(a201, _mm256_set1_epi8(63)); + t27 = _mm256_subs_epu8(_mm256_set1_epi8(63), t26); + m39 = _mm256_adds_epu8(s46, t26); + m40 = _mm256_adds_epu8(s47, t27); + m41 = _mm256_adds_epu8(s46, t27); + m42 = _mm256_adds_epu8(s47, t26); + a202 = _mm256_min_epu8(m40, m39); + d17 = _mm256_cmpeq_epi8(a202, m40); + a203 = _mm256_min_epu8(m42, m41); + d18 = _mm256_cmpeq_epi8(a203, m42); + s24 = _mm256_unpacklo_epi8(d17, d18); + s25 = _mm256_unpackhi_epi8(d17, d18); + s48 = _mm256_movemask_epi8(_mm256_permute2x128_si256(s24, s25, 0x20)); + a204 = ((int*)dec); + a205 = (4 * i9); + b16 = (a204 + a205); + a206 = (b16 + 2); + *(a206) = s48; + s54 = _mm256_movemask_epi8(_mm256_permute2x128_si256(s24, s25, 0x31)); + a223 = (b16 + 3); + *(a223) = s54; + s50 = _mm256_unpacklo_epi8(a202, a203); + s51 = _mm256_unpackhi_epi8(a202, a203); + s25 = _mm256_permute2x128_si256(s50, s51, 0x20); + s51 = _mm256_permute2x128_si256(s50, s51, 0x31); + a208 = ((__m256i*)X); + *(a208) = s25; + a225 = (a208 + 1); + *(a225) = s51; + + if ((((unsigned char*)X)[0] > 210)) { + __m256i m12, m13; + m12 = ((__m256i*)X)[0]; + m12 = _mm256_min_epu8(m12, ((__m256i*)X)[1]); + __m256i m14; + m14 = _mm256_min_epu8(_mm256_srli_si256(m12, 8), m12); + m14 = ((__m256i)_mm256_min_epu8(((__m256i)_mm256_srli_epi64(m14, 32)), + ((__m256i)m14))); + m14 = ((__m256i)_mm256_min_epu8(((__m256i)_mm256_srli_epi64(m14, 16)), + ((__m256i)m14))); + m14 = ((__m256i)_mm256_min_epu8(((__m256i)_mm256_srli_epi64(m14, 8)), + ((__m256i)m14))); + m14 = _mm256_unpacklo_epi8(m14, m14); + m14 = _mm256_shufflelo_epi16(m14, 0); + m13 = _mm256_unpacklo_epi64(m14, m14); + m13 = _mm256_permute2x128_si256(m13, m13, 0); + ((__m256i*)X)[0] = _mm256_subs_epu8(((__m256i*)X)[0], m13); + ((__m256i*)X)[1] = _mm256_subs_epu8(((__m256i*)X)[1], m13); + } } - unsigned char a188, a194; - int a205; - int s48, s54; - unsigned char *a187, *a193; - int *a204, *a206, *a223, *b16; - __m256i *a184, *a185, *a190, *a196, *a208, *a225; - __m256i a199, a200; - __m256i a189, a191, a192, a195, a197, a198, a201 - , a202, a203, d17, d18, m39, m40, m41 - , m42, s46, s47, s50 - , s51, t25, t26, t27; - a184 = ((__m256i *) Y); - s46 = *(a184); - a185 = (a184 + 1); - s47 = *(a185); - s50 = _mm256_permute2x128_si256(s46,s47,0x20); - s47 = _mm256_permute2x128_si256(s46,s47,0x31); - s46 = s50; - a187 = (b6 + 2); - a188 = *(a187); - a189 = _mm256_set1_epi8(a188); - a190 = ((__m256i *) Branchtab); - a191 = *(a190); - a192 = _mm256_xor_si256(a189, a191); - a193 = (b6 + 3); - a194 = *(a193); - a195 = _mm256_set1_epi8(a194); - a196 = (a190 + 1); - a197 = *(a196); - a198 = _mm256_xor_si256(a195, a197); - t25 = _mm256_avg_epu8(a192,a198); - a199 = ((__m256i ) t25); - a200 = _mm256_srli_epi16(a199, 2); - a201 = ((__m256i ) a200); - t26 = _mm256_and_si256(a201, _mm256_set1_epi8(63)); - t27 = _mm256_subs_epu8(_mm256_set1_epi8(63), t26); - m39 = _mm256_adds_epu8(s46, t26); - m40 = _mm256_adds_epu8(s47, t27); - m41 = _mm256_adds_epu8(s46, t27); - m42 = _mm256_adds_epu8(s47, t26); - a202 = _mm256_min_epu8(m40, m39); - d17 = _mm256_cmpeq_epi8(a202, m40); - a203 = _mm256_min_epu8(m42, m41); - d18 = _mm256_cmpeq_epi8(a203, m42); - s24 = 
_mm256_unpacklo_epi8(d17,d18); - s25 = _mm256_unpackhi_epi8(d17,d18); - s48 = _mm256_movemask_epi8(_mm256_permute2x128_si256(s24, s25, 0x20)); - a204 = ((int *) dec); - a205 = (4 * i9); - b16 = (a204 + a205); - a206 = (b16 + 2); - *(a206) = s48; - s54 = _mm256_movemask_epi8(_mm256_permute2x128_si256(s24, s25, 0x31)); - a223 = (b16 + 3); - *(a223) = s54; - s50 = _mm256_unpacklo_epi8(a202, a203); - s51 = _mm256_unpackhi_epi8(a202, a203); - s25 = _mm256_permute2x128_si256(s50, s51, 0x20); - s51 = _mm256_permute2x128_si256(s50, s51, 0x31); - a208 = ((__m256i *) X); - *(a208) = s25; - a225 = (a208 + 1); - *(a225) = s51; - - if ((((unsigned char *) X)[0]>210)) { - __m256i m12, m13; - m12 = ((__m256i *) X)[0]; - m12 = _mm256_min_epu8(m12, ((__m256i *) X)[1]); - __m256i m14; - m14 = _mm256_min_epu8(_mm256_srli_si256(m12, 8), m12); - m14 = ((__m256i ) _mm256_min_epu8(((__m256i ) _mm256_srli_epi64(m14, 32)), ((__m256i ) m14))); - m14 = ((__m256i ) _mm256_min_epu8(((__m256i ) _mm256_srli_epi64(m14, 16)), ((__m256i ) m14))); - m14 = ((__m256i ) _mm256_min_epu8(((__m256i ) _mm256_srli_epi64(m14, 8)), ((__m256i ) m14))); - m14 = _mm256_unpacklo_epi8(m14, m14); - m14 = _mm256_shufflelo_epi16(m14, 0); - m13 = _mm256_unpacklo_epi64(m14, m14); - m13 = _mm256_permute2x128_si256(m13, m13, 0); - ((__m256i *) X)[0] = _mm256_subs_epu8(((__m256i *) X)[0], m13); - ((__m256i *) X)[1] = _mm256_subs_epu8(((__m256i *) X)[1], m13); - } - } - - renormalize(X, 210); - unsigned int j; - for(j=0; j < (framebits + excess) % 2; ++j) { - int i; - for(i=0;i<64/2;i++){ - BFLY(i, (((framebits+excess) >> 1) << 1) + j , syms, Y, X, (decision_t *)dec, Branchtab); + renormalize(X, 210); + + unsigned int j; + for (j = 0; j < (framebits + excess) % 2; ++j) { + int i; + for (i = 0; i < 64 / 2; i++) { + BFLY(i, + (((framebits + excess) >> 1) << 1) + j, + syms, + Y, + X, + (decision_t*)dec, + Branchtab); + } + + renormalize(Y, 210); } - - renormalize(Y, 210); - - } - /*skip*/ + /*skip*/ } #endif /*LV_HAVE_AVX2*/ @@ -316,295 +332,300 @@ volk_8u_x4_conv_k7_r2_8u_avx2(unsigned char* Y, unsigned char* X, #if LV_HAVE_SSE3 -#include #include -#include #include +#include #include +#include -static inline void -volk_8u_x4_conv_k7_r2_8u_spiral(unsigned char* Y, unsigned char* X, - unsigned char* syms, unsigned char* dec, - unsigned int framebits, unsigned int excess, - unsigned char* Branchtab) +static inline void volk_8u_x4_conv_k7_r2_8u_spiral(unsigned char* Y, + unsigned char* X, + unsigned char* syms, + unsigned char* dec, + unsigned int framebits, + unsigned int excess, + unsigned char* Branchtab) { - unsigned int i9; - for(i9 = 0; i9 < ((framebits + excess) >> 1); i9++) { - unsigned char a75, a81; - int a73, a92; - short int s20, s21, s26, s27; - unsigned char *a74, *a80, *b6; - short int *a110, *a111, *a91, *a93, *a94; - __m128i *a102, *a112, *a113, *a71, *a72, *a77, *a83 - , *a95, *a96, *a97, *a98, *a99; - __m128i a105, a106, a86, a87; - __m128i a100, a101, a103, a104, a107, a108, a109 - , a76, a78, a79, a82, a84, a85, a88, a89 - , a90, d10, d11, d12, d9, m23, m24, m25 - , m26, m27, m28, m29, m30, s18, s19, s22 - , s23, s24, s25, s28, s29, t13, t14, t15 - , t16, t17, t18; - a71 = ((__m128i *) X); - s18 = *(a71); - a72 = (a71 + 2); - s19 = *(a72); - a73 = (4 * i9); - a74 = (syms + a73); - a75 = *(a74); - a76 = _mm_set1_epi8(a75); - a77 = ((__m128i *) Branchtab); - a78 = *(a77); - a79 = _mm_xor_si128(a76, a78); - b6 = (a73 + syms); - a80 = (b6 + 1); - a81 = *(a80); - a82 = _mm_set1_epi8(a81); - a83 = (a77 + 2); - a84 = *(a83); - a85 = 
_mm_xor_si128(a82, a84); - t13 = _mm_avg_epu8(a79,a85); - a86 = ((__m128i ) t13); - a87 = _mm_srli_epi16(a86, 2); - a88 = ((__m128i ) a87); - t14 = _mm_and_si128(a88, _mm_set_epi8(63, 63, 63, 63, 63, 63, 63 - , 63, 63, 63, 63, 63, 63, 63, 63 - , 63)); - t15 = _mm_subs_epu8(_mm_set_epi8(63, 63, 63, 63, 63, 63, 63 - , 63, 63, 63, 63, 63, 63, 63, 63 - , 63), t14); - m23 = _mm_adds_epu8(s18, t14); - m24 = _mm_adds_epu8(s19, t15); - m25 = _mm_adds_epu8(s18, t15); - m26 = _mm_adds_epu8(s19, t14); - a89 = _mm_min_epu8(m24, m23); - d9 = _mm_cmpeq_epi8(a89, m24); - a90 = _mm_min_epu8(m26, m25); - d10 = _mm_cmpeq_epi8(a90, m26); - s20 = _mm_movemask_epi8(_mm_unpacklo_epi8(d9,d10)); - a91 = ((short int *) dec); - a92 = (8 * i9); - a93 = (a91 + a92); - *(a93) = s20; - s21 = _mm_movemask_epi8(_mm_unpackhi_epi8(d9,d10)); - a94 = (a93 + 1); - *(a94) = s21; - s22 = _mm_unpacklo_epi8(a89, a90); - s23 = _mm_unpackhi_epi8(a89, a90); - a95 = ((__m128i *) Y); - *(a95) = s22; - a96 = (a95 + 1); - *(a96) = s23; - a97 = (a71 + 1); - s24 = *(a97); - a98 = (a71 + 3); - s25 = *(a98); - a99 = (a77 + 1); - a100 = *(a99); - a101 = _mm_xor_si128(a76, a100); - a102 = (a77 + 3); - a103 = *(a102); - a104 = _mm_xor_si128(a82, a103); - t16 = _mm_avg_epu8(a101,a104); - a105 = ((__m128i ) t16); - a106 = _mm_srli_epi16(a105, 2); - a107 = ((__m128i ) a106); - t17 = _mm_and_si128(a107, _mm_set_epi8(63, 63, 63, 63, 63, 63, 63 - , 63, 63, 63, 63, 63, 63, 63, 63 - , 63)); - t18 = _mm_subs_epu8(_mm_set_epi8(63, 63, 63, 63, 63, 63, 63 - , 63, 63, 63, 63, 63, 63, 63, 63 - , 63), t17); - m27 = _mm_adds_epu8(s24, t17); - m28 = _mm_adds_epu8(s25, t18); - m29 = _mm_adds_epu8(s24, t18); - m30 = _mm_adds_epu8(s25, t17); - a108 = _mm_min_epu8(m28, m27); - d11 = _mm_cmpeq_epi8(a108, m28); - a109 = _mm_min_epu8(m30, m29); - d12 = _mm_cmpeq_epi8(a109, m30); - s26 = _mm_movemask_epi8(_mm_unpacklo_epi8(d11,d12)); - a110 = (a93 + 2); - *(a110) = s26; - s27 = _mm_movemask_epi8(_mm_unpackhi_epi8(d11,d12)); - a111 = (a93 + 3); - *(a111) = s27; - s28 = _mm_unpacklo_epi8(a108, a109); - s29 = _mm_unpackhi_epi8(a108, a109); - a112 = (a95 + 2); - *(a112) = s28; - a113 = (a95 + 3); - *(a113) = s29; - if ((((unsigned char *) Y)[0]>210)) { - __m128i m5, m6; - m5 = ((__m128i *) Y)[0]; - m5 = _mm_min_epu8(m5, ((__m128i *) Y)[1]); - m5 = _mm_min_epu8(m5, ((__m128i *) Y)[2]); - m5 = _mm_min_epu8(m5, ((__m128i *) Y)[3]); - __m128i m7; - m7 = _mm_min_epu8(_mm_srli_si128(m5, 8), m5); - m7 = ((__m128i ) _mm_min_epu8(((__m128i ) _mm_srli_epi64(m7, 32)), ((__m128i ) m7))); - m7 = ((__m128i ) _mm_min_epu8(((__m128i ) _mm_srli_epi64(m7, 16)), ((__m128i ) m7))); - m7 = ((__m128i ) _mm_min_epu8(((__m128i ) _mm_srli_epi64(m7, 8)), ((__m128i ) m7))); - m7 = _mm_unpacklo_epi8(m7, m7); - m7 = _mm_shufflelo_epi16(m7, _MM_SHUFFLE(0, 0, 0, 0)); - m6 = _mm_unpacklo_epi64(m7, m7); - ((__m128i *) Y)[0] = _mm_subs_epu8(((__m128i *) Y)[0], m6); - ((__m128i *) Y)[1] = _mm_subs_epu8(((__m128i *) Y)[1], m6); - ((__m128i *) Y)[2] = _mm_subs_epu8(((__m128i *) Y)[2], m6); - ((__m128i *) Y)[3] = _mm_subs_epu8(((__m128i *) Y)[3], m6); - } - unsigned char a188, a194; - int a186, a205; - short int s48, s49, s54, s55; - unsigned char *a187, *a193, *b15; - short int *a204, *a206, *a207, *a223, *a224, *b16; - __m128i *a184, *a185, *a190, *a196, *a208, *a209, *a210 - , *a211, *a212, *a215, *a225, *a226; - __m128i a199, a200, a218, a219; - __m128i a189, a191, a192, a195, a197, a198, a201 - , a202, a203, a213, a214, a216, a217, a220, a221 - , a222, d17, d18, d19, d20, m39, m40, m41 - , m42, m43, m44, 
m45, m46, s46, s47, s50 - , s51, s52, s53, s56, s57, t25, t26, t27 - , t28, t29, t30; - a184 = ((__m128i *) Y); - s46 = *(a184); - a185 = (a184 + 2); - s47 = *(a185); - a186 = (4 * i9); - b15 = (a186 + syms); - a187 = (b15 + 2); - a188 = *(a187); - a189 = _mm_set1_epi8(a188); - a190 = ((__m128i *) Branchtab); - a191 = *(a190); - a192 = _mm_xor_si128(a189, a191); - a193 = (b15 + 3); - a194 = *(a193); - a195 = _mm_set1_epi8(a194); - a196 = (a190 + 2); - a197 = *(a196); - a198 = _mm_xor_si128(a195, a197); - t25 = _mm_avg_epu8(a192,a198); - a199 = ((__m128i ) t25); - a200 = _mm_srli_epi16(a199, 2); - a201 = ((__m128i ) a200); - t26 = _mm_and_si128(a201, _mm_set_epi8(63, 63, 63, 63, 63, 63, 63 - , 63, 63, 63, 63, 63, 63, 63, 63 - , 63)); - t27 = _mm_subs_epu8(_mm_set_epi8(63, 63, 63, 63, 63, 63, 63 - , 63, 63, 63, 63, 63, 63, 63, 63 - , 63), t26); - m39 = _mm_adds_epu8(s46, t26); - m40 = _mm_adds_epu8(s47, t27); - m41 = _mm_adds_epu8(s46, t27); - m42 = _mm_adds_epu8(s47, t26); - a202 = _mm_min_epu8(m40, m39); - d17 = _mm_cmpeq_epi8(a202, m40); - a203 = _mm_min_epu8(m42, m41); - d18 = _mm_cmpeq_epi8(a203, m42); - s48 = _mm_movemask_epi8(_mm_unpacklo_epi8(d17,d18)); - a204 = ((short int *) dec); - a205 = (8 * i9); - b16 = (a204 + a205); - a206 = (b16 + 4); - *(a206) = s48; - s49 = _mm_movemask_epi8(_mm_unpackhi_epi8(d17,d18)); - a207 = (b16 + 5); - *(a207) = s49; - s50 = _mm_unpacklo_epi8(a202, a203); - s51 = _mm_unpackhi_epi8(a202, a203); - a208 = ((__m128i *) X); - *(a208) = s50; - a209 = (a208 + 1); - *(a209) = s51; - a210 = (a184 + 1); - s52 = *(a210); - a211 = (a184 + 3); - s53 = *(a211); - a212 = (a190 + 1); - a213 = *(a212); - a214 = _mm_xor_si128(a189, a213); - a215 = (a190 + 3); - a216 = *(a215); - a217 = _mm_xor_si128(a195, a216); - t28 = _mm_avg_epu8(a214,a217); - a218 = ((__m128i ) t28); - a219 = _mm_srli_epi16(a218, 2); - a220 = ((__m128i ) a219); - t29 = _mm_and_si128(a220, _mm_set_epi8(63, 63, 63, 63, 63, 63, 63 - , 63, 63, 63, 63, 63, 63, 63, 63 - , 63)); - t30 = _mm_subs_epu8(_mm_set_epi8(63, 63, 63, 63, 63, 63, 63 - , 63, 63, 63, 63, 63, 63, 63, 63 - , 63), t29); - m43 = _mm_adds_epu8(s52, t29); - m44 = _mm_adds_epu8(s53, t30); - m45 = _mm_adds_epu8(s52, t30); - m46 = _mm_adds_epu8(s53, t29); - a221 = _mm_min_epu8(m44, m43); - d19 = _mm_cmpeq_epi8(a221, m44); - a222 = _mm_min_epu8(m46, m45); - d20 = _mm_cmpeq_epi8(a222, m46); - s54 = _mm_movemask_epi8(_mm_unpacklo_epi8(d19,d20)); - a223 = (b16 + 6); - *(a223) = s54; - s55 = _mm_movemask_epi8(_mm_unpackhi_epi8(d19,d20)); - a224 = (b16 + 7); - *(a224) = s55; - s56 = _mm_unpacklo_epi8(a221, a222); - s57 = _mm_unpackhi_epi8(a221, a222); - a225 = (a208 + 2); - *(a225) = s56; - a226 = (a208 + 3); - *(a226) = s57; - if ((((unsigned char *) X)[0]>210)) { - __m128i m12, m13; - m12 = ((__m128i *) X)[0]; - m12 = _mm_min_epu8(m12, ((__m128i *) X)[1]); - m12 = _mm_min_epu8(m12, ((__m128i *) X)[2]); - m12 = _mm_min_epu8(m12, ((__m128i *) X)[3]); - __m128i m14; - m14 = _mm_min_epu8(_mm_srli_si128(m12, 8), m12); - m14 = ((__m128i ) _mm_min_epu8(((__m128i ) _mm_srli_epi64(m14, 32)), ((__m128i ) m14))); - m14 = ((__m128i ) _mm_min_epu8(((__m128i ) _mm_srli_epi64(m14, 16)), ((__m128i ) m14))); - m14 = ((__m128i ) _mm_min_epu8(((__m128i ) _mm_srli_epi64(m14, 8)), ((__m128i ) m14))); - m14 = _mm_unpacklo_epi8(m14, m14); - m14 = _mm_shufflelo_epi16(m14, _MM_SHUFFLE(0, 0, 0, 0)); - m13 = _mm_unpacklo_epi64(m14, m14); - ((__m128i *) X)[0] = _mm_subs_epu8(((__m128i *) X)[0], m13); - ((__m128i *) X)[1] = _mm_subs_epu8(((__m128i *) X)[1], m13); - 
((__m128i *) X)[2] = _mm_subs_epu8(((__m128i *) X)[2], m13); - ((__m128i *) X)[3] = _mm_subs_epu8(((__m128i *) X)[3], m13); + unsigned int i9; + for (i9 = 0; i9 < ((framebits + excess) >> 1); i9++) { + unsigned char a75, a81; + int a73, a92; + short int s20, s21, s26, s27; + unsigned char *a74, *a80, *b6; + short int *a110, *a111, *a91, *a93, *a94; + __m128i *a102, *a112, *a113, *a71, *a72, *a77, *a83, *a95, *a96, *a97, *a98, *a99; + __m128i a105, a106, a86, a87; + __m128i a100, a101, a103, a104, a107, a108, a109, a76, a78, a79, a82, a84, a85, + a88, a89, a90, d10, d11, d12, d9, m23, m24, m25, m26, m27, m28, m29, m30, s18, + s19, s22, s23, s24, s25, s28, s29, t13, t14, t15, t16, t17, t18; + a71 = ((__m128i*)X); + s18 = *(a71); + a72 = (a71 + 2); + s19 = *(a72); + a73 = (4 * i9); + a74 = (syms + a73); + a75 = *(a74); + a76 = _mm_set1_epi8(a75); + a77 = ((__m128i*)Branchtab); + a78 = *(a77); + a79 = _mm_xor_si128(a76, a78); + b6 = (a73 + syms); + a80 = (b6 + 1); + a81 = *(a80); + a82 = _mm_set1_epi8(a81); + a83 = (a77 + 2); + a84 = *(a83); + a85 = _mm_xor_si128(a82, a84); + t13 = _mm_avg_epu8(a79, a85); + a86 = ((__m128i)t13); + a87 = _mm_srli_epi16(a86, 2); + a88 = ((__m128i)a87); + t14 = _mm_and_si128( + a88, + _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63)); + t15 = _mm_subs_epu8( + _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63), + t14); + m23 = _mm_adds_epu8(s18, t14); + m24 = _mm_adds_epu8(s19, t15); + m25 = _mm_adds_epu8(s18, t15); + m26 = _mm_adds_epu8(s19, t14); + a89 = _mm_min_epu8(m24, m23); + d9 = _mm_cmpeq_epi8(a89, m24); + a90 = _mm_min_epu8(m26, m25); + d10 = _mm_cmpeq_epi8(a90, m26); + s20 = _mm_movemask_epi8(_mm_unpacklo_epi8(d9, d10)); + a91 = ((short int*)dec); + a92 = (8 * i9); + a93 = (a91 + a92); + *(a93) = s20; + s21 = _mm_movemask_epi8(_mm_unpackhi_epi8(d9, d10)); + a94 = (a93 + 1); + *(a94) = s21; + s22 = _mm_unpacklo_epi8(a89, a90); + s23 = _mm_unpackhi_epi8(a89, a90); + a95 = ((__m128i*)Y); + *(a95) = s22; + a96 = (a95 + 1); + *(a96) = s23; + a97 = (a71 + 1); + s24 = *(a97); + a98 = (a71 + 3); + s25 = *(a98); + a99 = (a77 + 1); + a100 = *(a99); + a101 = _mm_xor_si128(a76, a100); + a102 = (a77 + 3); + a103 = *(a102); + a104 = _mm_xor_si128(a82, a103); + t16 = _mm_avg_epu8(a101, a104); + a105 = ((__m128i)t16); + a106 = _mm_srli_epi16(a105, 2); + a107 = ((__m128i)a106); + t17 = _mm_and_si128( + a107, + _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63)); + t18 = _mm_subs_epu8( + _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63), + t17); + m27 = _mm_adds_epu8(s24, t17); + m28 = _mm_adds_epu8(s25, t18); + m29 = _mm_adds_epu8(s24, t18); + m30 = _mm_adds_epu8(s25, t17); + a108 = _mm_min_epu8(m28, m27); + d11 = _mm_cmpeq_epi8(a108, m28); + a109 = _mm_min_epu8(m30, m29); + d12 = _mm_cmpeq_epi8(a109, m30); + s26 = _mm_movemask_epi8(_mm_unpacklo_epi8(d11, d12)); + a110 = (a93 + 2); + *(a110) = s26; + s27 = _mm_movemask_epi8(_mm_unpackhi_epi8(d11, d12)); + a111 = (a93 + 3); + *(a111) = s27; + s28 = _mm_unpacklo_epi8(a108, a109); + s29 = _mm_unpackhi_epi8(a108, a109); + a112 = (a95 + 2); + *(a112) = s28; + a113 = (a95 + 3); + *(a113) = s29; + if ((((unsigned char*)Y)[0] > 210)) { + __m128i m5, m6; + m5 = ((__m128i*)Y)[0]; + m5 = _mm_min_epu8(m5, ((__m128i*)Y)[1]); + m5 = _mm_min_epu8(m5, ((__m128i*)Y)[2]); + m5 = _mm_min_epu8(m5, ((__m128i*)Y)[3]); + __m128i m7; + m7 = _mm_min_epu8(_mm_srli_si128(m5, 8), m5); + m7 = + 
((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m7, 32)), ((__m128i)m7))); + m7 = + ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m7, 16)), ((__m128i)m7))); + m7 = ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m7, 8)), ((__m128i)m7))); + m7 = _mm_unpacklo_epi8(m7, m7); + m7 = _mm_shufflelo_epi16(m7, _MM_SHUFFLE(0, 0, 0, 0)); + m6 = _mm_unpacklo_epi64(m7, m7); + ((__m128i*)Y)[0] = _mm_subs_epu8(((__m128i*)Y)[0], m6); + ((__m128i*)Y)[1] = _mm_subs_epu8(((__m128i*)Y)[1], m6); + ((__m128i*)Y)[2] = _mm_subs_epu8(((__m128i*)Y)[2], m6); + ((__m128i*)Y)[3] = _mm_subs_epu8(((__m128i*)Y)[3], m6); + } + unsigned char a188, a194; + int a186, a205; + short int s48, s49, s54, s55; + unsigned char *a187, *a193, *b15; + short int *a204, *a206, *a207, *a223, *a224, *b16; + __m128i *a184, *a185, *a190, *a196, *a208, *a209, *a210, *a211, *a212, *a215, + *a225, *a226; + __m128i a199, a200, a218, a219; + __m128i a189, a191, a192, a195, a197, a198, a201, a202, a203, a213, a214, a216, + a217, a220, a221, a222, d17, d18, d19, d20, m39, m40, m41, m42, m43, m44, m45, + m46, s46, s47, s50, s51, s52, s53, s56, s57, t25, t26, t27, t28, t29, t30; + a184 = ((__m128i*)Y); + s46 = *(a184); + a185 = (a184 + 2); + s47 = *(a185); + a186 = (4 * i9); + b15 = (a186 + syms); + a187 = (b15 + 2); + a188 = *(a187); + a189 = _mm_set1_epi8(a188); + a190 = ((__m128i*)Branchtab); + a191 = *(a190); + a192 = _mm_xor_si128(a189, a191); + a193 = (b15 + 3); + a194 = *(a193); + a195 = _mm_set1_epi8(a194); + a196 = (a190 + 2); + a197 = *(a196); + a198 = _mm_xor_si128(a195, a197); + t25 = _mm_avg_epu8(a192, a198); + a199 = ((__m128i)t25); + a200 = _mm_srli_epi16(a199, 2); + a201 = ((__m128i)a200); + t26 = _mm_and_si128( + a201, + _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63)); + t27 = _mm_subs_epu8( + _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63), + t26); + m39 = _mm_adds_epu8(s46, t26); + m40 = _mm_adds_epu8(s47, t27); + m41 = _mm_adds_epu8(s46, t27); + m42 = _mm_adds_epu8(s47, t26); + a202 = _mm_min_epu8(m40, m39); + d17 = _mm_cmpeq_epi8(a202, m40); + a203 = _mm_min_epu8(m42, m41); + d18 = _mm_cmpeq_epi8(a203, m42); + s48 = _mm_movemask_epi8(_mm_unpacklo_epi8(d17, d18)); + a204 = ((short int*)dec); + a205 = (8 * i9); + b16 = (a204 + a205); + a206 = (b16 + 4); + *(a206) = s48; + s49 = _mm_movemask_epi8(_mm_unpackhi_epi8(d17, d18)); + a207 = (b16 + 5); + *(a207) = s49; + s50 = _mm_unpacklo_epi8(a202, a203); + s51 = _mm_unpackhi_epi8(a202, a203); + a208 = ((__m128i*)X); + *(a208) = s50; + a209 = (a208 + 1); + *(a209) = s51; + a210 = (a184 + 1); + s52 = *(a210); + a211 = (a184 + 3); + s53 = *(a211); + a212 = (a190 + 1); + a213 = *(a212); + a214 = _mm_xor_si128(a189, a213); + a215 = (a190 + 3); + a216 = *(a215); + a217 = _mm_xor_si128(a195, a216); + t28 = _mm_avg_epu8(a214, a217); + a218 = ((__m128i)t28); + a219 = _mm_srli_epi16(a218, 2); + a220 = ((__m128i)a219); + t29 = _mm_and_si128( + a220, + _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63)); + t30 = _mm_subs_epu8( + _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63), + t29); + m43 = _mm_adds_epu8(s52, t29); + m44 = _mm_adds_epu8(s53, t30); + m45 = _mm_adds_epu8(s52, t30); + m46 = _mm_adds_epu8(s53, t29); + a221 = _mm_min_epu8(m44, m43); + d19 = _mm_cmpeq_epi8(a221, m44); + a222 = _mm_min_epu8(m46, m45); + d20 = _mm_cmpeq_epi8(a222, m46); + s54 = _mm_movemask_epi8(_mm_unpacklo_epi8(d19, d20)); + a223 = (b16 + 6); + *(a223) = s54; + s55 = 
_mm_movemask_epi8(_mm_unpackhi_epi8(d19, d20)); + a224 = (b16 + 7); + *(a224) = s55; + s56 = _mm_unpacklo_epi8(a221, a222); + s57 = _mm_unpackhi_epi8(a221, a222); + a225 = (a208 + 2); + *(a225) = s56; + a226 = (a208 + 3); + *(a226) = s57; + if ((((unsigned char*)X)[0] > 210)) { + __m128i m12, m13; + m12 = ((__m128i*)X)[0]; + m12 = _mm_min_epu8(m12, ((__m128i*)X)[1]); + m12 = _mm_min_epu8(m12, ((__m128i*)X)[2]); + m12 = _mm_min_epu8(m12, ((__m128i*)X)[3]); + __m128i m14; + m14 = _mm_min_epu8(_mm_srli_si128(m12, 8), m12); + m14 = ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m14, 32)), + ((__m128i)m14))); + m14 = ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m14, 16)), + ((__m128i)m14))); + m14 = ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m14, 8)), + ((__m128i)m14))); + m14 = _mm_unpacklo_epi8(m14, m14); + m14 = _mm_shufflelo_epi16(m14, _MM_SHUFFLE(0, 0, 0, 0)); + m13 = _mm_unpacklo_epi64(m14, m14); + ((__m128i*)X)[0] = _mm_subs_epu8(((__m128i*)X)[0], m13); + ((__m128i*)X)[1] = _mm_subs_epu8(((__m128i*)X)[1], m13); + ((__m128i*)X)[2] = _mm_subs_epu8(((__m128i*)X)[2], m13); + ((__m128i*)X)[3] = _mm_subs_epu8(((__m128i*)X)[3], m13); + } } - } - - renormalize(X, 210); - /*int ch; - for(ch = 0; ch < 64; ch++) { - printf("%d,", X[ch]); - } - printf("\n");*/ - - unsigned int j; - for(j=0; j < (framebits + excess) % 2; ++j) { - int i; - for(i=0;i<64/2;i++){ - BFLY(i, (((framebits+excess) >> 1) << 1) + j , syms, Y, X, (decision_t *)dec, Branchtab); - } + renormalize(X, 210); - - renormalize(Y, 210); - - /*printf("\n"); + /*int ch; for(ch = 0; ch < 64; ch++) { - printf("%d,", Y[ch]); + printf("%d,", X[ch]); } printf("\n");*/ - } - /*skip*/ + unsigned int j; + for (j = 0; j < (framebits + excess) % 2; ++j) { + int i; + for (i = 0; i < 64 / 2; i++) { + BFLY(i, + (((framebits + excess) >> 1) << 1) + j, + syms, + Y, + X, + (decision_t*)dec, + Branchtab); + } + + + renormalize(Y, 210); + + /*printf("\n"); + for(ch = 0; ch < 64; ch++) { + printf("%d,", Y[ch]); + } + printf("\n");*/ + } + /*skip*/ } #endif /*LV_HAVE_SSE3*/ @@ -612,30 +633,32 @@ volk_8u_x4_conv_k7_r2_8u_spiral(unsigned char* Y, unsigned char* X, #if LV_HAVE_GENERIC -static inline void -volk_8u_x4_conv_k7_r2_8u_generic(unsigned char* Y, unsigned char* X, - unsigned char* syms, unsigned char* dec, - unsigned int framebits, unsigned int excess, - unsigned char* Branchtab) +static inline void volk_8u_x4_conv_k7_r2_8u_generic(unsigned char* Y, + unsigned char* X, + unsigned char* syms, + unsigned char* dec, + unsigned int framebits, + unsigned int excess, + unsigned char* Branchtab) { - int nbits = framebits + excess; - int NUMSTATES = 64; - int RENORMALIZE_THRESHOLD = 210; - - int s,i; - for (s=0;s init_test_list(volk_test_params_t test_params) @@ -32,127 +37,135 @@ std::vector init_test_list(volk_test_params_t test_params) test_params_rotator.set_tol(1e-3); std::vector test_cases; - QA(VOLK_INIT_PUPP(volk_64u_popcntpuppet_64u, volk_64u_popcnt, test_params)) - QA(VOLK_INIT_PUPP(volk_64u_popcntpuppet_64u, volk_64u_popcnt, test_params)) - QA(VOLK_INIT_PUPP(volk_64u_popcntpuppet_64u, volk_64u_popcnt, test_params)) + QA(VOLK_INIT_PUPP(volk_64u_popcntpuppet_64u, volk_64u_popcnt, test_params)) + QA(VOLK_INIT_PUPP(volk_64u_popcntpuppet_64u, volk_64u_popcnt, test_params)) + QA(VOLK_INIT_PUPP(volk_64u_popcntpuppet_64u, volk_64u_popcnt, test_params)) QA(VOLK_INIT_PUPP(volk_16u_byteswappuppet_16u, volk_16u_byteswap, test_params)) QA(VOLK_INIT_PUPP(volk_32u_byteswappuppet_32u, volk_32u_byteswap, test_params)) - 
QA(VOLK_INIT_PUPP(volk_32u_popcntpuppet_32u, volk_32u_popcnt_32u, test_params)) + QA(VOLK_INIT_PUPP(volk_32u_popcntpuppet_32u, volk_32u_popcnt_32u, test_params)) QA(VOLK_INIT_PUPP(volk_64u_byteswappuppet_64u, volk_64u_byteswap, test_params)) - QA(VOLK_INIT_PUPP(volk_32fc_s32fc_rotatorpuppet_32fc, volk_32fc_s32fc_x2_rotator_32fc, test_params_rotator)) - QA(VOLK_INIT_PUPP(volk_8u_conv_k7_r2puppet_8u, volk_8u_x4_conv_k7_r2_8u, test_params.make_tol(0))) - QA(VOLK_INIT_PUPP(volk_32f_x2_fm_detectpuppet_32f, volk_32f_s32f_32f_fm_detect_32f, test_params)) - QA(VOLK_INIT_TEST(volk_16ic_s32f_deinterleave_real_32f, test_params)) - QA(VOLK_INIT_TEST(volk_16ic_deinterleave_real_8i, test_params)) - QA(VOLK_INIT_TEST(volk_16ic_deinterleave_16i_x2, test_params)) - QA(VOLK_INIT_TEST(volk_16ic_s32f_deinterleave_32f_x2, test_params)) - QA(VOLK_INIT_TEST(volk_16ic_deinterleave_real_16i, test_params)) - QA(VOLK_INIT_TEST(volk_16ic_magnitude_16i, test_params)) - QA(VOLK_INIT_TEST(volk_16ic_s32f_magnitude_32f, test_params)) - QA(VOLK_INIT_TEST(volk_16ic_convert_32fc, test_params)) - QA(VOLK_INIT_TEST(volk_16ic_x2_multiply_16ic, test_params)) - QA(VOLK_INIT_TEST(volk_16ic_x2_dot_prod_16ic, test_params)) - QA(VOLK_INIT_TEST(volk_16i_s32f_convert_32f, test_params)) - QA(VOLK_INIT_TEST(volk_16i_convert_8i, test_params)) - QA(VOLK_INIT_TEST(volk_16i_32fc_dot_prod_32fc, test_params_inacc)) - QA(VOLK_INIT_TEST(volk_32f_accumulator_s32f, test_params_inacc)) - QA(VOLK_INIT_TEST(volk_32f_x2_add_32f, test_params)) - QA(VOLK_INIT_TEST(volk_32f_index_max_16u, test_params)) - QA(VOLK_INIT_TEST(volk_32f_index_max_32u, test_params)) - QA(VOLK_INIT_TEST(volk_32fc_32f_multiply_32fc, test_params)) - QA(VOLK_INIT_TEST(volk_32fc_32f_add_32fc, test_params)) - QA(VOLK_INIT_TEST(volk_32f_log2_32f, test_params.make_absolute(1e-5))) - QA(VOLK_INIT_TEST(volk_32f_expfast_32f, test_params_inacc_tenth)) - QA(VOLK_INIT_TEST(volk_32f_x2_pow_32f, test_params_inacc)) - QA(VOLK_INIT_TEST(volk_32f_sin_32f, test_params_inacc)) - QA(VOLK_INIT_TEST(volk_32f_cos_32f, test_params_inacc)) - QA(VOLK_INIT_TEST(volk_32f_tan_32f, test_params_inacc)) - QA(VOLK_INIT_TEST(volk_32f_atan_32f, test_params_inacc)) - QA(VOLK_INIT_TEST(volk_32f_asin_32f, test_params_inacc)) - QA(VOLK_INIT_TEST(volk_32f_acos_32f, test_params_inacc)) - QA(VOLK_INIT_TEST(volk_32fc_s32f_power_32fc, test_params_power)) - QA(VOLK_INIT_TEST(volk_32f_s32f_calc_spectral_noise_floor_32f, test_params_inacc)) - QA(VOLK_INIT_TEST(volk_32fc_s32f_atan2_32f, test_params)) - QA(VOLK_INIT_TEST(volk_32fc_x2_conjugate_dot_prod_32fc, test_params_inacc_tenth)) - QA(VOLK_INIT_TEST(volk_32fc_deinterleave_32f_x2, test_params)) - QA(VOLK_INIT_TEST(volk_32fc_deinterleave_64f_x2, test_params)) - QA(VOLK_INIT_TEST(volk_32fc_s32f_deinterleave_real_16i, test_params)) - QA(VOLK_INIT_TEST(volk_32fc_deinterleave_imag_32f, test_params)) - QA(VOLK_INIT_TEST(volk_32fc_deinterleave_real_32f, test_params)) - QA(VOLK_INIT_TEST(volk_32fc_deinterleave_real_64f, test_params)) - QA(VOLK_INIT_TEST(volk_32fc_x2_dot_prod_32fc, test_params_inacc)) - QA(VOLK_INIT_TEST(volk_32fc_32f_dot_prod_32fc, test_params_inacc)) - QA(VOLK_INIT_TEST(volk_32fc_index_max_16u, test_params)) - QA(VOLK_INIT_TEST(volk_32fc_index_max_32u, test_params)) - QA(VOLK_INIT_TEST(volk_32fc_s32f_magnitude_16i, test_params)) - QA(VOLK_INIT_TEST(volk_32fc_magnitude_32f, test_params_inacc_tenth)) - QA(VOLK_INIT_TEST(volk_32fc_magnitude_squared_32f, test_params)) - QA(VOLK_INIT_TEST(volk_32fc_x2_add_32fc, test_params)) - 
QA(VOLK_INIT_TEST(volk_32fc_x2_multiply_32fc, test_params)) - QA(VOLK_INIT_TEST(volk_32fc_x2_multiply_conjugate_32fc, test_params)) - QA(VOLK_INIT_TEST(volk_32fc_x2_divide_32fc, test_params)) - QA(VOLK_INIT_TEST(volk_32fc_conjugate_32fc, test_params)) - QA(VOLK_INIT_TEST(volk_32f_s32f_convert_16i, test_params)) - QA(VOLK_INIT_TEST(volk_32f_s32f_convert_32i, test_params)) - QA(VOLK_INIT_TEST(volk_32f_convert_64f, test_params)) - QA(VOLK_INIT_TEST(volk_32f_s32f_convert_8i, test_params)) - QA(VOLK_INIT_TEST(volk_32fc_convert_16ic, test_params)) - QA(VOLK_INIT_TEST(volk_32fc_s32f_power_spectrum_32f, test_params)) - QA(VOLK_INIT_TEST(volk_32fc_x2_square_dist_32f, test_params)) - QA(VOLK_INIT_TEST(volk_32fc_x2_s32f_square_dist_scalar_mult_32f, test_params)) - QA(VOLK_INIT_TEST(volk_32f_x2_divide_32f, test_params)) - QA(VOLK_INIT_TEST(volk_32f_x2_dot_prod_32f, test_params_inacc)) - QA(VOLK_INIT_TEST(volk_32f_x2_s32f_interleave_16ic, test_params)) - QA(VOLK_INIT_TEST(volk_32f_x2_interleave_32fc, test_params)) - QA(VOLK_INIT_TEST(volk_32f_x2_max_32f, test_params)) - QA(VOLK_INIT_TEST(volk_32f_x2_min_32f, test_params)) - QA(VOLK_INIT_TEST(volk_32f_x2_multiply_32f, test_params)) - QA(VOLK_INIT_TEST(volk_32f_64f_multiply_64f, test_params)) - QA(VOLK_INIT_TEST(volk_32f_64f_add_64f, test_params)) - QA(VOLK_INIT_TEST(volk_32f_s32f_normalize, test_params)) - QA(VOLK_INIT_TEST(volk_32f_s32f_power_32f, test_params)) - QA(VOLK_INIT_TEST(volk_32f_sqrt_32f, test_params_inacc)) - QA(VOLK_INIT_TEST(volk_32f_s32f_stddev_32f, test_params_inacc)) - QA(VOLK_INIT_TEST(volk_32f_stddev_and_mean_32f_x2, test_params_inacc)) - QA(VOLK_INIT_TEST(volk_32f_x2_subtract_32f, test_params)) - QA(VOLK_INIT_TEST(volk_32f_x3_sum_of_poly_32f, test_params_inacc)) - QA(VOLK_INIT_TEST(volk_32i_x2_and_32i, test_params)) - QA(VOLK_INIT_TEST(volk_32i_s32f_convert_32f, test_params)) - QA(VOLK_INIT_TEST(volk_32i_x2_or_32i, test_params)) - QA(VOLK_INIT_TEST(volk_32f_x2_dot_prod_16i, test_params)) - QA(VOLK_INIT_TEST(volk_64f_convert_32f, test_params)) - QA(VOLK_INIT_TEST(volk_64f_x2_max_64f, test_params)) - QA(VOLK_INIT_TEST(volk_64f_x2_min_64f, test_params)) - QA(VOLK_INIT_TEST(volk_64f_x2_multiply_64f, test_params)) - QA(VOLK_INIT_TEST(volk_64f_x2_add_64f, test_params)) - QA(VOLK_INIT_TEST(volk_8ic_deinterleave_16i_x2, test_params)) - QA(VOLK_INIT_TEST(volk_8ic_s32f_deinterleave_32f_x2, test_params)) - QA(VOLK_INIT_TEST(volk_8ic_deinterleave_real_16i, test_params)) - QA(VOLK_INIT_TEST(volk_8ic_s32f_deinterleave_real_32f, test_params)) - QA(VOLK_INIT_TEST(volk_8ic_deinterleave_real_8i, test_params)) - QA(VOLK_INIT_TEST(volk_8ic_x2_multiply_conjugate_16ic, test_params)) - QA(VOLK_INIT_TEST(volk_8ic_x2_s32f_multiply_conjugate_32fc, test_params)) - QA(VOLK_INIT_TEST(volk_8i_convert_16i, test_params)) - QA(VOLK_INIT_TEST(volk_8i_s32f_convert_32f, test_params)) - QA(VOLK_INIT_TEST(volk_32fc_s32fc_multiply_32fc, test_params)) - QA(VOLK_INIT_TEST(volk_32f_s32f_multiply_32f, test_params)) - QA(VOLK_INIT_TEST(volk_32f_binary_slicer_32i, test_params)) - QA(VOLK_INIT_TEST(volk_32f_binary_slicer_8i, test_params)) - QA(VOLK_INIT_TEST(volk_32u_reverse_32u, test_params)) - QA(VOLK_INIT_TEST(volk_32f_tanh_32f, test_params_inacc)) - QA(VOLK_INIT_TEST(volk_32f_s32f_mod_rangepuppet_32f, test_params)) + QA(VOLK_INIT_PUPP(volk_32fc_s32fc_rotatorpuppet_32fc, + volk_32fc_s32fc_x2_rotator_32fc, + test_params_rotator)) + QA(VOLK_INIT_PUPP( + volk_8u_conv_k7_r2puppet_8u, volk_8u_x4_conv_k7_r2_8u, test_params.make_tol(0))) + QA(VOLK_INIT_PUPP( + 
volk_32f_x2_fm_detectpuppet_32f, volk_32f_s32f_32f_fm_detect_32f, test_params)) + QA(VOLK_INIT_TEST(volk_16ic_s32f_deinterleave_real_32f, test_params)) + QA(VOLK_INIT_TEST(volk_16ic_deinterleave_real_8i, test_params)) + QA(VOLK_INIT_TEST(volk_16ic_deinterleave_16i_x2, test_params)) + QA(VOLK_INIT_TEST(volk_16ic_s32f_deinterleave_32f_x2, test_params)) + QA(VOLK_INIT_TEST(volk_16ic_deinterleave_real_16i, test_params)) + QA(VOLK_INIT_TEST(volk_16ic_magnitude_16i, test_params)) + QA(VOLK_INIT_TEST(volk_16ic_s32f_magnitude_32f, test_params)) + QA(VOLK_INIT_TEST(volk_16ic_convert_32fc, test_params)) + QA(VOLK_INIT_TEST(volk_16ic_x2_multiply_16ic, test_params)) + QA(VOLK_INIT_TEST(volk_16ic_x2_dot_prod_16ic, test_params)) + QA(VOLK_INIT_TEST(volk_16i_s32f_convert_32f, test_params)) + QA(VOLK_INIT_TEST(volk_16i_convert_8i, test_params)) + QA(VOLK_INIT_TEST(volk_16i_32fc_dot_prod_32fc, test_params_inacc)) + QA(VOLK_INIT_TEST(volk_32f_accumulator_s32f, test_params_inacc)) + QA(VOLK_INIT_TEST(volk_32f_x2_add_32f, test_params)) + QA(VOLK_INIT_TEST(volk_32f_index_max_16u, test_params)) + QA(VOLK_INIT_TEST(volk_32f_index_max_32u, test_params)) + QA(VOLK_INIT_TEST(volk_32fc_32f_multiply_32fc, test_params)) + QA(VOLK_INIT_TEST(volk_32fc_32f_add_32fc, test_params)) + QA(VOLK_INIT_TEST(volk_32f_log2_32f, test_params.make_absolute(1e-5))) + QA(VOLK_INIT_TEST(volk_32f_expfast_32f, test_params_inacc_tenth)) + QA(VOLK_INIT_TEST(volk_32f_x2_pow_32f, test_params_inacc)) + QA(VOLK_INIT_TEST(volk_32f_sin_32f, test_params_inacc)) + QA(VOLK_INIT_TEST(volk_32f_cos_32f, test_params_inacc)) + QA(VOLK_INIT_TEST(volk_32f_tan_32f, test_params_inacc)) + QA(VOLK_INIT_TEST(volk_32f_atan_32f, test_params_inacc)) + QA(VOLK_INIT_TEST(volk_32f_asin_32f, test_params_inacc)) + QA(VOLK_INIT_TEST(volk_32f_acos_32f, test_params_inacc)) + QA(VOLK_INIT_TEST(volk_32fc_s32f_power_32fc, test_params_power)) + QA(VOLK_INIT_TEST(volk_32f_s32f_calc_spectral_noise_floor_32f, test_params_inacc)) + QA(VOLK_INIT_TEST(volk_32fc_s32f_atan2_32f, test_params)) + QA(VOLK_INIT_TEST(volk_32fc_x2_conjugate_dot_prod_32fc, test_params_inacc_tenth)) + QA(VOLK_INIT_TEST(volk_32fc_deinterleave_32f_x2, test_params)) + QA(VOLK_INIT_TEST(volk_32fc_deinterleave_64f_x2, test_params)) + QA(VOLK_INIT_TEST(volk_32fc_s32f_deinterleave_real_16i, test_params)) + QA(VOLK_INIT_TEST(volk_32fc_deinterleave_imag_32f, test_params)) + QA(VOLK_INIT_TEST(volk_32fc_deinterleave_real_32f, test_params)) + QA(VOLK_INIT_TEST(volk_32fc_deinterleave_real_64f, test_params)) + QA(VOLK_INIT_TEST(volk_32fc_x2_dot_prod_32fc, test_params_inacc)) + QA(VOLK_INIT_TEST(volk_32fc_32f_dot_prod_32fc, test_params_inacc)) + QA(VOLK_INIT_TEST(volk_32fc_index_max_16u, test_params)) + QA(VOLK_INIT_TEST(volk_32fc_index_max_32u, test_params)) + QA(VOLK_INIT_TEST(volk_32fc_s32f_magnitude_16i, test_params)) + QA(VOLK_INIT_TEST(volk_32fc_magnitude_32f, test_params_inacc_tenth)) + QA(VOLK_INIT_TEST(volk_32fc_magnitude_squared_32f, test_params)) + QA(VOLK_INIT_TEST(volk_32fc_x2_add_32fc, test_params)) + QA(VOLK_INIT_TEST(volk_32fc_x2_multiply_32fc, test_params)) + QA(VOLK_INIT_TEST(volk_32fc_x2_multiply_conjugate_32fc, test_params)) + QA(VOLK_INIT_TEST(volk_32fc_x2_divide_32fc, test_params)) + QA(VOLK_INIT_TEST(volk_32fc_conjugate_32fc, test_params)) + QA(VOLK_INIT_TEST(volk_32f_s32f_convert_16i, test_params)) + QA(VOLK_INIT_TEST(volk_32f_s32f_convert_32i, test_params)) + QA(VOLK_INIT_TEST(volk_32f_convert_64f, test_params)) + QA(VOLK_INIT_TEST(volk_32f_s32f_convert_8i, test_params)) + 
QA(VOLK_INIT_TEST(volk_32fc_convert_16ic, test_params)) + QA(VOLK_INIT_TEST(volk_32fc_s32f_power_spectrum_32f, test_params)) + QA(VOLK_INIT_TEST(volk_32fc_x2_square_dist_32f, test_params)) + QA(VOLK_INIT_TEST(volk_32fc_x2_s32f_square_dist_scalar_mult_32f, test_params)) + QA(VOLK_INIT_TEST(volk_32f_x2_divide_32f, test_params)) + QA(VOLK_INIT_TEST(volk_32f_x2_dot_prod_32f, test_params_inacc)) + QA(VOLK_INIT_TEST(volk_32f_x2_s32f_interleave_16ic, test_params)) + QA(VOLK_INIT_TEST(volk_32f_x2_interleave_32fc, test_params)) + QA(VOLK_INIT_TEST(volk_32f_x2_max_32f, test_params)) + QA(VOLK_INIT_TEST(volk_32f_x2_min_32f, test_params)) + QA(VOLK_INIT_TEST(volk_32f_x2_multiply_32f, test_params)) + QA(VOLK_INIT_TEST(volk_32f_64f_multiply_64f, test_params)) + QA(VOLK_INIT_TEST(volk_32f_64f_add_64f, test_params)) + QA(VOLK_INIT_TEST(volk_32f_s32f_normalize, test_params)) + QA(VOLK_INIT_TEST(volk_32f_s32f_power_32f, test_params)) + QA(VOLK_INIT_TEST(volk_32f_sqrt_32f, test_params_inacc)) + QA(VOLK_INIT_TEST(volk_32f_s32f_stddev_32f, test_params_inacc)) + QA(VOLK_INIT_TEST(volk_32f_stddev_and_mean_32f_x2, test_params_inacc)) + QA(VOLK_INIT_TEST(volk_32f_x2_subtract_32f, test_params)) + QA(VOLK_INIT_TEST(volk_32f_x3_sum_of_poly_32f, test_params_inacc)) + QA(VOLK_INIT_TEST(volk_32i_x2_and_32i, test_params)) + QA(VOLK_INIT_TEST(volk_32i_s32f_convert_32f, test_params)) + QA(VOLK_INIT_TEST(volk_32i_x2_or_32i, test_params)) + QA(VOLK_INIT_TEST(volk_32f_x2_dot_prod_16i, test_params)) + QA(VOLK_INIT_TEST(volk_64f_convert_32f, test_params)) + QA(VOLK_INIT_TEST(volk_64f_x2_max_64f, test_params)) + QA(VOLK_INIT_TEST(volk_64f_x2_min_64f, test_params)) + QA(VOLK_INIT_TEST(volk_64f_x2_multiply_64f, test_params)) + QA(VOLK_INIT_TEST(volk_64f_x2_add_64f, test_params)) + QA(VOLK_INIT_TEST(volk_8ic_deinterleave_16i_x2, test_params)) + QA(VOLK_INIT_TEST(volk_8ic_s32f_deinterleave_32f_x2, test_params)) + QA(VOLK_INIT_TEST(volk_8ic_deinterleave_real_16i, test_params)) + QA(VOLK_INIT_TEST(volk_8ic_s32f_deinterleave_real_32f, test_params)) + QA(VOLK_INIT_TEST(volk_8ic_deinterleave_real_8i, test_params)) + QA(VOLK_INIT_TEST(volk_8ic_x2_multiply_conjugate_16ic, test_params)) + QA(VOLK_INIT_TEST(volk_8ic_x2_s32f_multiply_conjugate_32fc, test_params)) + QA(VOLK_INIT_TEST(volk_8i_convert_16i, test_params)) + QA(VOLK_INIT_TEST(volk_8i_s32f_convert_32f, test_params)) + QA(VOLK_INIT_TEST(volk_32fc_s32fc_multiply_32fc, test_params)) + QA(VOLK_INIT_TEST(volk_32f_s32f_multiply_32f, test_params)) + QA(VOLK_INIT_TEST(volk_32f_binary_slicer_32i, test_params)) + QA(VOLK_INIT_TEST(volk_32f_binary_slicer_8i, test_params)) + QA(VOLK_INIT_TEST(volk_32u_reverse_32u, test_params)) + QA(VOLK_INIT_TEST(volk_32f_tanh_32f, test_params_inacc)) + QA(VOLK_INIT_TEST(volk_32f_s32f_mod_rangepuppet_32f, test_params)) QA(VOLK_INIT_TEST(volk_32fc_x2_s32fc_multiply_conjugate_add_32fc, test_params)) - QA(VOLK_INIT_PUPP(volk_8u_x3_encodepolarpuppet_8u, volk_8u_x3_encodepolar_8u_x2, test_params)) - QA(VOLK_INIT_PUPP(volk_32f_8u_polarbutterflypuppet_32f, volk_32f_8u_polarbutterfly_32f, test_params)) - QA(VOLK_INIT_TEST(volk_32f_exp_32f, test_params)) - + QA(VOLK_INIT_PUPP( + volk_8u_x3_encodepolarpuppet_8u, volk_8u_x3_encodepolar_8u_x2, test_params)) + QA(VOLK_INIT_PUPP(volk_32f_8u_polarbutterflypuppet_32f, + volk_32f_8u_polarbutterfly_32f, + test_params)) // no one uses these, so don't test them - //VOLK_PROFILE(volk_16i_x5_add_quad_16i_x4, 1e-4, 2046, 10000, &results, benchmark_mode, kernel_regex); - //VOLK_PROFILE(volk_16i_branch_4_state_8, 1e-4, 2046, 10000, 
&results, benchmark_mode, kernel_regex); - //VOLK_PROFILE(volk_16i_max_star_16i, 0, 0, 204602, 10000, &results, benchmark_mode, kernel_regex); - //VOLK_PROFILE(volk_16i_max_star_horizontal_16i, 0, 0, 204602, 10000, &results, benchmark_mode, kernel_regex); - //VOLK_PROFILE(volk_16i_permute_and_scalar_add, 1e-4, 0, 2046, 10000, &results, benchmark_mode, kernel_regex); - //VOLK_PROFILE(volk_16i_x4_quad_max_star_16i, 1e-4, 0, 2046, 10000, &results, benchmark_mode, kernel_regex); + // VOLK_PROFILE(volk_16i_x5_add_quad_16i_x4, 1e-4, 2046, 10000, &results, + // benchmark_mode, kernel_regex); VOLK_PROFILE(volk_16i_branch_4_state_8, 1e-4, 2046, + // 10000, &results, benchmark_mode, kernel_regex); VOLK_PROFILE(volk_16i_max_star_16i, + // 0, 0, 204602, 10000, &results, benchmark_mode, kernel_regex); + // VOLK_PROFILE(volk_16i_max_star_horizontal_16i, 0, 0, 204602, 10000, &results, + // benchmark_mode, kernel_regex); VOLK_PROFILE(volk_16i_permute_and_scalar_add, 1e-4, + // 0, 2046, 10000, &results, benchmark_mode, kernel_regex); + // VOLK_PROFILE(volk_16i_x4_quad_max_star_16i, 1e-4, 0, 2046, 10000, &results, + // benchmark_mode, kernel_regex); // we need a puppet for this one //(VOLK_INIT_TEST(volk_32fc_s32f_x2_power_spectral_density_32f, test_params)) diff --git a/lib/qa_utils.cc b/lib/qa_utils.cc index 76df069..1dcee6e 100644 --- a/lib/qa_utils.cc +++ b/lib/qa_utils.cc @@ -1,79 +1,94 @@ -#include #include "qa_utils.h" +#include -#include // for volk_func_desc_t -#include // for volk_free, volk_m... +#include // for volk_func_desc_t +#include // for volk_free, volk_m... -#include // for assert -#include // for uint16_t, uint64_t -#include // for CLOCKS_PER_SEC -#include // for int16_t, int32_t +#include // for assert +#include // for uint16_t, uint64_t +#include // for CLOCKS_PER_SEC +#include // for int16_t, int32_t #include -#include // for sqrt, fabs, abs -#include // for memcpy, memset -#include // for clock -#include // for operator<<, basic... -#include // for cout, cerr -#include // for numeric_limits -#include // for map, map<>::mappe... +#include // for sqrt, fabs, abs +#include // for memcpy, memset +#include // for clock +#include // for operator<<, basic... +#include // for cout, cerr +#include // for numeric_limits +#include // for map, map<>::mappe... #include -#include // for vector, _Bit_refe... +#include // for vector, _Bit_refe... 
template -void random_floats(void *buf, unsigned int n, std::default_random_engine& rnd_engine) +void random_floats(void* buf, unsigned int n, std::default_random_engine& rnd_engine) { - T *array = static_cast(buf); + T* array = static_cast(buf); std::uniform_real_distribution uniform_dist(T(-1), T(1)); - for(unsigned int i = 0; i < n; i++) { + for (unsigned int i = 0; i < n; i++) { array[i] = uniform_dist(rnd_engine); } } -void load_random_data(void *data, volk_type_t type, unsigned int n) { +void load_random_data(void* data, volk_type_t type, unsigned int n) +{ std::random_device rnd_device; std::default_random_engine rnd_engine(rnd_device()); - if(type.is_complex) n *= 2; - if(type.is_float) { - if(type.size == 8) { + if (type.is_complex) + n *= 2; + if (type.is_float) { + if (type.size == 8) { random_floats(data, n, rnd_engine); } else { - random_floats (data, n, rnd_engine); + random_floats(data, n, rnd_engine); } } else { - float int_max = float(uint64_t(2) << (type.size*8)); - if(type.is_signed) int_max /= 2.0; + float int_max = float(uint64_t(2) << (type.size * 8)); + if (type.is_signed) + int_max /= 2.0; std::uniform_real_distribution uniform_dist(-int_max, int_max); - for(unsigned int i=0; i 8 or < 1"; //no shenanigans here + throw "load_random_data: no support for data size > 8 or < 1"; // no + // shenanigans + // here } } } } -static std::vector get_arch_list(volk_func_desc_t desc) { +static std::vector get_arch_list(volk_func_desc_t desc) +{ std::vector archlist; - for(size_t i = 0; i < desc.n_impls; i++) { + for (size_t i = 0; i < desc.n_impls; i++) { archlist.push_back(std::string(desc.impl_names[i])); } @@ -96,7 +111,8 @@ T volk_lexical_cast(const std::string& str) return var; } -volk_type_t volk_type_from_string(std::string name) { +volk_type_t volk_type_from_string(std::string name) +{ volk_type_t type; type.is_float = false; type.is_scalar = false; @@ -105,28 +121,28 @@ volk_type_t volk_type_from_string(std::string name) { type.size = 0; type.str = name; - if(name.size() < 2) { + if (name.size() < 2) { throw std::string("name too short to be a datatype"); } - //is it a scalar? - if(name[0] == 's') { + // is it a scalar? 
+ if (name[0] == 's') { type.is_scalar = true; - name = name.substr(1, name.size()-1); + name = name.substr(1, name.size() - 1); } - //get the data size + // get the data size size_t last_size_pos = name.find_last_of("0123456789"); - if(last_size_pos == std::string::npos) { + if (last_size_pos == std::string::npos) { throw std::string("no size spec in type ").append(name); } - //will throw if malformed - int size = volk_lexical_cast(name.substr(0, last_size_pos+1)); + // will throw if malformed + int size = volk_lexical_cast(name.substr(0, last_size_pos + 1)); assert(((size % 8) == 0) && (size <= 64) && (size != 0)); - type.size = size/8; //in bytes + type.size = size / 8; // in bytes - for(size_t i=last_size_pos+1; i < name.size(); i++) { + for (size_t i = last_size_pos + 1; i < name.size(); i++) { switch (name[i]) { case 'f': type.is_float = true; @@ -148,7 +164,8 @@ volk_type_t volk_type_from_string(std::string name) { return type; } -std::vector split_signature(const std::string &protokernel_signature) { +std::vector split_signature(const std::string& protokernel_signature) +{ std::vector signature_tokens; std::string token; for (unsigned int loc = 0; loc < protokernel_signature.size(); ++loc) { @@ -165,16 +182,17 @@ std::vector split_signature(const std::string &protokernel_signatur return signature_tokens; } -static void get_signatures_from_name(std::vector &inputsig, - std::vector &outputsig, - std::string name) { +static void get_signatures_from_name(std::vector& inputsig, + std::vector& outputsig, + std::string name) +{ std::vector toked = split_signature(name); assert(toked[0] == "volk"); toked.erase(toked.begin()); - //ok. we're assuming a string in the form + // ok. we're assuming a string in the form //(sig)_(multiplier-opt)_..._(name)_(sig)_(multiplier-opt)_..._(alignment) enum { SIDE_INPUT, SIDE_NAME, SIDE_OUTPUT } side = SIDE_INPUT; @@ -184,106 +202,184 @@ static void get_signatures_from_name(std::vector &inputsig, std::string token = toked[token_index]; try { type = volk_type_from_string(token); - if(side == SIDE_NAME) side = SIDE_OUTPUT; //if this is the first one after the name... 
- - if(side == SIDE_INPUT) inputsig.push_back(type); - else outputsig.push_back(type); - } catch (...){ - if(token[0] == 'x' && (token.size() > 1) && (token[1] > '0' && token[1] < '9')) { //it's a multiplier - if(side == SIDE_INPUT) assert(inputsig.size() > 0); - else assert(outputsig.size() > 0); - int multiplier = volk_lexical_cast(token.substr(1, token.size()-1)); //will throw if invalid - for(int i=1; i 1) && + (token[1] > '0' && token[1] < '9')) { // it's a multiplier + if (side == SIDE_INPUT) + assert(inputsig.size() > 0); + else + assert(outputsig.size() > 0); + int multiplier = volk_lexical_cast( + token.substr(1, token.size() - 1)); // will throw if invalid + for (int i = 1; i < multiplier; i++) { + if (side == SIDE_INPUT) + inputsig.push_back(inputsig.back()); + else + outputsig.push_back(outputsig.back()); } - } - else if(side == SIDE_INPUT) { //it's the function name, at least it better be + } else if (side == + SIDE_INPUT) { // it's the function name, at least it better be side = SIDE_NAME; fn_name.append("_"); fn_name.append(token); - } - else if(side == SIDE_OUTPUT) { - if(token != toked.back()) throw; //the last token in the name is the alignment + } else if (side == SIDE_OUTPUT) { + if (token != toked.back()) + throw; // the last token in the name is the alignment } } } - //we don't need an output signature (some fn's operate on the input data, "in place"), but we do need at least one input! + // we don't need an output signature (some fn's operate on the input data, "in + // place"), but we do need at least one input! assert(inputsig.size() != 0); - } -inline void run_cast_test1(volk_fn_1arg func, std::vector &buffs, unsigned int vlen, unsigned int iter, std::string arch) { - while(iter--) func(buffs[0], vlen, arch.c_str()); +inline void run_cast_test1(volk_fn_1arg func, + std::vector& buffs, + unsigned int vlen, + unsigned int iter, + std::string arch) +{ + while (iter--) + func(buffs[0], vlen, arch.c_str()); } -inline void run_cast_test2(volk_fn_2arg func, std::vector &buffs, unsigned int vlen, unsigned int iter, std::string arch) { - while(iter--) func(buffs[0], buffs[1], vlen, arch.c_str()); +inline void run_cast_test2(volk_fn_2arg func, + std::vector& buffs, + unsigned int vlen, + unsigned int iter, + std::string arch) +{ + while (iter--) + func(buffs[0], buffs[1], vlen, arch.c_str()); } -inline void run_cast_test3(volk_fn_3arg func, std::vector &buffs, unsigned int vlen, unsigned int iter, std::string arch) { - while(iter--) func(buffs[0], buffs[1], buffs[2], vlen, arch.c_str()); +inline void run_cast_test3(volk_fn_3arg func, + std::vector& buffs, + unsigned int vlen, + unsigned int iter, + std::string arch) +{ + while (iter--) + func(buffs[0], buffs[1], buffs[2], vlen, arch.c_str()); } -inline void run_cast_test4(volk_fn_4arg func, std::vector &buffs, unsigned int vlen, unsigned int iter, std::string arch) { - while(iter--) func(buffs[0], buffs[1], buffs[2], buffs[3], vlen, arch.c_str()); +inline void run_cast_test4(volk_fn_4arg func, + std::vector& buffs, + unsigned int vlen, + unsigned int iter, + std::string arch) +{ + while (iter--) + func(buffs[0], buffs[1], buffs[2], buffs[3], vlen, arch.c_str()); } -inline void run_cast_test1_s32f(volk_fn_1arg_s32f func, std::vector &buffs, float scalar, unsigned int vlen, unsigned int iter, std::string arch) { - while(iter--) func(buffs[0], scalar, vlen, arch.c_str()); +inline void run_cast_test1_s32f(volk_fn_1arg_s32f func, + std::vector& buffs, + float scalar, + unsigned int vlen, + unsigned int iter, + std::string arch) 
+{ + while (iter--) + func(buffs[0], scalar, vlen, arch.c_str()); } -inline void run_cast_test2_s32f(volk_fn_2arg_s32f func, std::vector &buffs, float scalar, unsigned int vlen, unsigned int iter, std::string arch) { - while(iter--) func(buffs[0], buffs[1], scalar, vlen, arch.c_str()); +inline void run_cast_test2_s32f(volk_fn_2arg_s32f func, + std::vector& buffs, + float scalar, + unsigned int vlen, + unsigned int iter, + std::string arch) +{ + while (iter--) + func(buffs[0], buffs[1], scalar, vlen, arch.c_str()); } -inline void run_cast_test3_s32f(volk_fn_3arg_s32f func, std::vector &buffs, float scalar, unsigned int vlen, unsigned int iter, std::string arch) { - while(iter--) func(buffs[0], buffs[1], buffs[2], scalar, vlen, arch.c_str()); +inline void run_cast_test3_s32f(volk_fn_3arg_s32f func, + std::vector& buffs, + float scalar, + unsigned int vlen, + unsigned int iter, + std::string arch) +{ + while (iter--) + func(buffs[0], buffs[1], buffs[2], scalar, vlen, arch.c_str()); } -inline void run_cast_test1_s32fc(volk_fn_1arg_s32fc func, std::vector &buffs, lv_32fc_t scalar, unsigned int vlen, unsigned int iter, std::string arch) { - while(iter--) func(buffs[0], scalar, vlen, arch.c_str()); +inline void run_cast_test1_s32fc(volk_fn_1arg_s32fc func, + std::vector& buffs, + lv_32fc_t scalar, + unsigned int vlen, + unsigned int iter, + std::string arch) +{ + while (iter--) + func(buffs[0], scalar, vlen, arch.c_str()); } -inline void run_cast_test2_s32fc(volk_fn_2arg_s32fc func, std::vector &buffs, lv_32fc_t scalar, unsigned int vlen, unsigned int iter, std::string arch) { - while(iter--) func(buffs[0], buffs[1], scalar, vlen, arch.c_str()); +inline void run_cast_test2_s32fc(volk_fn_2arg_s32fc func, + std::vector& buffs, + lv_32fc_t scalar, + unsigned int vlen, + unsigned int iter, + std::string arch) +{ + while (iter--) + func(buffs[0], buffs[1], scalar, vlen, arch.c_str()); } -inline void run_cast_test3_s32fc(volk_fn_3arg_s32fc func, std::vector &buffs, lv_32fc_t scalar, unsigned int vlen, unsigned int iter, std::string arch) { - while(iter--) func(buffs[0], buffs[1], buffs[2], scalar, vlen, arch.c_str()); +inline void run_cast_test3_s32fc(volk_fn_3arg_s32fc func, + std::vector& buffs, + lv_32fc_t scalar, + unsigned int vlen, + unsigned int iter, + std::string arch) +{ + while (iter--) + func(buffs[0], buffs[1], buffs[2], scalar, vlen, arch.c_str()); } template -bool fcompare(t *in1, t *in2, unsigned int vlen, float tol, bool absolute_mode) { +bool fcompare(t* in1, t* in2, unsigned int vlen, float tol, bool absolute_mode) +{ bool fail = false; int print_max_errs = 10; - for(unsigned int i=0; i tol) { - fail=true; - if(print_max_errs-- > 0) { - std::cout << "offset " << i << " in1: " << t(((t *)(in1))[i]) << " in2: " << t(((t *)(in2))[i]); + if (fabs(((t*)(in1))[i] - ((t*)(in2))[i]) > tol) { + fail = true; + if (print_max_errs-- > 0) { + std::cout << "offset " << i << " in1: " << t(((t*)(in1))[i]) + << " in2: " << t(((t*)(in2))[i]); std::cout << " tolerance was: " << tol << std::endl; } } } else { // for very small numbers we'll see round off errors due to limited // precision. So a special test case... 
- if(fabs(((t *)(in1))[i]) < 1e-30) { - if( fabs( ((t *)(in2))[i] ) > tol ) - { - fail=true; - if(print_max_errs-- > 0) { - std::cout << "offset " << i << " in1: " << t(((t *)(in1))[i]) << " in2: " << t(((t *)(in2))[i]); + if (fabs(((t*)(in1))[i]) < 1e-30) { + if (fabs(((t*)(in2))[i]) > tol) { + fail = true; + if (print_max_errs-- > 0) { + std::cout << "offset " << i << " in1: " << t(((t*)(in1))[i]) + << " in2: " << t(((t*)(in2))[i]); std::cout << " tolerance was: " << tol << std::endl; } } } // the primary test is the percent different greater than given tol - else if(fabs(((t *)(in1))[i] - ((t *)(in2))[i])/fabs(((t *)in1)[i]) > tol) { - fail=true; - if(print_max_errs-- > 0) { - std::cout << "offset " << i << " in1: " << t(((t *)(in1))[i]) << " in2: " << t(((t *)(in2))[i]); + else if (fabs(((t*)(in1))[i] - ((t*)(in2))[i]) / fabs(((t*)in1)[i]) > tol) { + fail = true; + if (print_max_errs-- > 0) { + std::cout << "offset " << i << " in1: " << t(((t*)(in1))[i]) + << " in2: " << t(((t*)(in2))[i]); std::cout << " tolerance was: " << tol << std::endl; } } @@ -294,43 +390,50 @@ bool fcompare(t *in1, t *in2, unsigned int vlen, float tol, bool absolute_mode) } template -bool ccompare(t *in1, t *in2, unsigned int vlen, float tol, bool absolute_mode) { +bool ccompare(t* in1, t* in2, unsigned int vlen, float tol, bool absolute_mode) +{ if (absolute_mode) { - std::cout << "ccompare does not support absolute mode" << std::endl; - return true; + std::cout << "ccompare does not support absolute mode" << std::endl; + return true; } bool fail = false; int print_max_errs = 10; - for(unsigned int i=0; i<2*vlen; i+=2) { - if (std::isnan(in1[i]) || std::isnan(in1[i+1]) || std::isnan(in2[i]) || std::isnan(in2[i+1]) - || std::isinf(in1[i]) || std::isinf(in1[i+1]) || std::isinf(in2[i]) || std::isinf(in2[i+1])) { - fail=true; - if(print_max_errs-- > 0) { - std::cout << "offset " << i/2 << " in1: " << in1[i] << " + " << in1[i+1] << "j in2: " << in2[i] << " + " << in2[i+1] << "j"; + for (unsigned int i = 0; i < 2 * vlen; i += 2) { + if (std::isnan(in1[i]) || std::isnan(in1[i + 1]) || std::isnan(in2[i]) || + std::isnan(in2[i + 1]) || std::isinf(in1[i]) || std::isinf(in1[i + 1]) || + std::isinf(in2[i]) || std::isinf(in2[i + 1])) { + fail = true; + if (print_max_errs-- > 0) { + std::cout << "offset " << i / 2 << " in1: " << in1[i] << " + " + << in1[i + 1] << "j in2: " << in2[i] << " + " << in2[i + 1] + << "j"; std::cout << " tolerance was: " << tol << std::endl; } } - t diff[2] = { in1[i] - in2[i], in1[i+1] - in2[i+1] }; - t err = std::sqrt(diff[0] * diff[0] + diff[1] * diff[1]); - t norm = std::sqrt(in1[i] * in1[i] + in1[i+1] * in1[i+1]); + t diff[2] = { in1[i] - in2[i], in1[i + 1] - in2[i + 1] }; + t err = std::sqrt(diff[0] * diff[0] + diff[1] * diff[1]); + t norm = std::sqrt(in1[i] * in1[i] + in1[i + 1] * in1[i + 1]); // for very small numbers we'll see round off errors due to limited // precision. So a special test case... 
if (norm < 1e-30) { - if (err > tol) - { - fail=true; - if(print_max_errs-- > 0) { - std::cout << "offset " << i/2 << " in1: " << in1[i] << " + " << in1[i+1] << "j in2: " << in2[i] << " + " << in2[i+1] << "j"; + if (err > tol) { + fail = true; + if (print_max_errs-- > 0) { + std::cout << "offset " << i / 2 << " in1: " << in1[i] << " + " + << in1[i + 1] << "j in2: " << in2[i] << " + " << in2[i + 1] + << "j"; std::cout << " tolerance was: " << tol << std::endl; } } } // the primary test is the percent different greater than given tol - else if((err / norm) > tol) { - fail=true; - if(print_max_errs-- > 0) { - std::cout << "offset " << i/2 << " in1: " << in1[i] << " + " << in1[i+1] << "j in2: " << in2[i] << " + " << in2[i+1] << "j"; + else if ((err / norm) > tol) { + fail = true; + if (print_max_errs-- > 0) { + std::cout << "offset " << i / 2 << " in1: " << in1[i] << " + " + << in1[i + 1] << "j in2: " << in2[i] << " + " << in2[i + 1] + << "j"; std::cout << " tolerance was: " << tol << std::endl; } } @@ -340,18 +443,21 @@ bool ccompare(t *in1, t *in2, unsigned int vlen, float tol, bool absolute_mode) } template -bool icompare(t *in1, t *in2, unsigned int vlen, unsigned int tol, bool absolute_mode) { +bool icompare(t* in1, t* in2, unsigned int vlen, unsigned int tol, bool absolute_mode) +{ if (absolute_mode) { - std::cout << "icompare does not support absolute mode" << std::endl; - return true; + std::cout << "icompare does not support absolute mode" << std::endl; + return true; } bool fail = false; int print_max_errs = 10; - for(unsigned int i=0; i tol) { - fail=true; - if(print_max_errs-- > 0) { - std::cout << "offset " << i << " in1: " << static_cast(t(((t *)(in1))[i])) << " in2: " << static_cast(t(((t *)(in2))[i])); + for (unsigned int i = 0; i < vlen; i++) { + if (((unsigned int)abs(int(((t*)(in1))[i]) - int(((t*)(in2))[i]))) > tol) { + fail = true; + if (print_max_errs-- > 0) { + std::cout << "offset " << i + << " in1: " << static_cast(t(((t*)(in1))[i])) + << " in2: " << static_cast(t(((t*)(in2))[i])); std::cout << " tolerance was: " << tol << std::endl; } } @@ -360,34 +466,46 @@ bool icompare(t *in1, t *in2, unsigned int vlen, unsigned int tol, bool absolute return fail; } -class volk_qa_aligned_mem_pool{ +class volk_qa_aligned_mem_pool +{ public: - void *get_new(size_t size){ + void* get_new(size_t size) + { size_t alignment = volk_get_alignment(); void* ptr = volk_malloc(size, alignment); memset(ptr, 0x00, size); _mems.push_back(ptr); return ptr; } - ~volk_qa_aligned_mem_pool() { - for(unsigned int ii = 0; ii < _mems.size(); ++ii) { + ~volk_qa_aligned_mem_pool() + { + for (unsigned int ii = 0; ii < _mems.size(); ++ii) { volk_free(_mems[ii]); } } -private: std::vector _mems; + +private: + std::vector _mems; }; bool run_volk_tests(volk_func_desc_t desc, void (*manual_func)(), std::string name, volk_test_params_t test_params, - std::vector *results, - std::string puppet_master_name -) + std::vector* results, + std::string puppet_master_name) { - return run_volk_tests(desc, manual_func, name, test_params.tol(), test_params.scalar(), - test_params.vlen(), test_params.iter(), results, puppet_master_name, - test_params.absolute_mode(), test_params.benchmark_mode()); + return run_volk_tests(desc, + manual_func, + name, + test_params.tol(), + test_params.scalar(), + test_params.vlen(), + test_params.iter(), + results, + puppet_master_name, + test_params.absolute_mode(), + test_params.benchmark_mode()); } bool run_volk_tests(volk_func_desc_t desc, @@ -397,17 +515,18 @@ bool 
run_volk_tests(volk_func_desc_t desc, lv_32fc_t scalar, unsigned int vlen, unsigned int iter, - std::vector *results, + std::vector* results, std::string puppet_master_name, bool absolute_mode, - bool benchmark_mode -) { + bool benchmark_mode) +{ // Initialize this entry in results vector results->push_back(volk_test_results_t()); results->back().name = name; results->back().vlen = vlen; results->back().iter = iter; - std::cout << "RUN_VOLK_TESTS: " << name << "(" << vlen << "," << iter << ")" << std::endl; + std::cout << "RUN_VOLK_TESTS: " << name << "(" << vlen << "," << iter << ")" + << std::endl; // vlen_twiddle will increase vlen for malloc and data generation // but kernels will still be called with the user provided vlen. @@ -418,57 +537,64 @@ bool run_volk_tests(volk_func_desc_t desc, const float tol_f = tol; const unsigned int tol_i = static_cast(tol); - //first let's get a list of available architectures for the test + // first let's get a list of available architectures for the test std::vector arch_list = get_arch_list(desc); - if((!benchmark_mode) && (arch_list.size() < 2)) { + if ((!benchmark_mode) && (arch_list.size() < 2)) { std::cout << "no architectures to test" << std::endl; return false; } - //something that can hang onto memory and cleanup when this function exits + // something that can hang onto memory and cleanup when this function exits volk_qa_aligned_mem_pool mem_pool; - //now we have to get a function signature by parsing the name + // now we have to get a function signature by parsing the name std::vector inputsig, outputsig; try { get_signatures_from_name(inputsig, outputsig, name); - } - catch (std::exception &error) { - std::cerr << "Error: unable to get function signature from kernel name" << std::endl; + } catch (std::exception& error) { + std::cerr << "Error: unable to get function signature from kernel name" + << std::endl; std::cerr << " - " << name << std::endl; return false; } - //pull the input scalars into their own vector + // pull the input scalars into their own vector std::vector inputsc; - for(size_t i=0; i inbuffs; - for (unsigned int inputsig_index = 0; inputsig_index < inputsig.size(); ++ inputsig_index) { + std::vector inbuffs; + for (unsigned int inputsig_index = 0; inputsig_index < inputsig.size(); + ++inputsig_index) { volk_type_t sig = inputsig[inputsig_index]; - if(!sig.is_scalar) //we don't make buffers for scalars - inbuffs.push_back(mem_pool.get_new(vlen*sig.size*(sig.is_complex ? 2 : 1))); + if (!sig.is_scalar) // we don't make buffers for scalars + inbuffs.push_back( + mem_pool.get_new(vlen * sig.size * (sig.is_complex ? 2 : 1))); } - for(size_t i=0; i > test_data; - for(size_t i=0; i arch_buffs; - for(size_t j=0; j> test_data; + for (size_t i = 0; i < arch_list.size(); i++) { + std::vector arch_buffs; + for (size_t j = 0; j < outputsig.size(); j++) { + arch_buffs.push_back(mem_pool.get_new(vlen * outputsig[j].size * + (outputsig[j].is_complex ? 
2 : 1))); } - for(size_t j=0; j start, end; std::vector profile_times; - for(size_t i = 0; i < arch_list.size(); i++) { + for (size_t i = 0; i < arch_list.size(); i++) { start = std::chrono::system_clock::now(); - switch(both_sigs.size()) { - case 1: - if(inputsc.size() == 0) { - run_cast_test1((volk_fn_1arg)(manual_func), test_data[i], vlen, iter, arch_list[i]); - } else if(inputsc.size() == 1 && inputsc[0].is_float) { - if(inputsc[0].is_complex) { - run_cast_test1_s32fc((volk_fn_1arg_s32fc)(manual_func), test_data[i], scalar, vlen, iter, arch_list[i]); - } else { - run_cast_test1_s32f((volk_fn_1arg_s32f)(manual_func), test_data[i], scalar.real(), vlen, iter, arch_list[i]); - } - } else throw "unsupported 1 arg function >1 scalars"; - break; - case 2: - if(inputsc.size() == 0) { - run_cast_test2((volk_fn_2arg)(manual_func), test_data[i], vlen, iter, arch_list[i]); - } else if(inputsc.size() == 1 && inputsc[0].is_float) { - if(inputsc[0].is_complex) { - run_cast_test2_s32fc((volk_fn_2arg_s32fc)(manual_func), test_data[i], scalar, vlen, iter, arch_list[i]); - } else { - run_cast_test2_s32f((volk_fn_2arg_s32f)(manual_func), test_data[i], scalar.real(), vlen, iter, arch_list[i]); - } - } else throw "unsupported 2 arg function >1 scalars"; - break; - case 3: - if(inputsc.size() == 0) { - run_cast_test3((volk_fn_3arg)(manual_func), test_data[i], vlen, iter, arch_list[i]); - } else if(inputsc.size() == 1 && inputsc[0].is_float) { - if(inputsc[0].is_complex) { - run_cast_test3_s32fc((volk_fn_3arg_s32fc)(manual_func), test_data[i], scalar, vlen, iter, arch_list[i]); - } else { - run_cast_test3_s32f((volk_fn_3arg_s32f)(manual_func), test_data[i], scalar.real(), vlen, iter, arch_list[i]); - } - } else throw "unsupported 3 arg function >1 scalars"; - break; - case 4: - run_cast_test4((volk_fn_4arg)(manual_func), test_data[i], vlen, iter, arch_list[i]); - break; - default: - throw "no function handler for this signature"; - break; + switch (both_sigs.size()) { + case 1: + if (inputsc.size() == 0) { + run_cast_test1( + (volk_fn_1arg)(manual_func), test_data[i], vlen, iter, arch_list[i]); + } else if (inputsc.size() == 1 && inputsc[0].is_float) { + if (inputsc[0].is_complex) { + run_cast_test1_s32fc((volk_fn_1arg_s32fc)(manual_func), + test_data[i], + scalar, + vlen, + iter, + arch_list[i]); + } else { + run_cast_test1_s32f((volk_fn_1arg_s32f)(manual_func), + test_data[i], + scalar.real(), + vlen, + iter, + arch_list[i]); + } + } else + throw "unsupported 1 arg function >1 scalars"; + break; + case 2: + if (inputsc.size() == 0) { + run_cast_test2( + (volk_fn_2arg)(manual_func), test_data[i], vlen, iter, arch_list[i]); + } else if (inputsc.size() == 1 && inputsc[0].is_float) { + if (inputsc[0].is_complex) { + run_cast_test2_s32fc((volk_fn_2arg_s32fc)(manual_func), + test_data[i], + scalar, + vlen, + iter, + arch_list[i]); + } else { + run_cast_test2_s32f((volk_fn_2arg_s32f)(manual_func), + test_data[i], + scalar.real(), + vlen, + iter, + arch_list[i]); + } + } else + throw "unsupported 2 arg function >1 scalars"; + break; + case 3: + if (inputsc.size() == 0) { + run_cast_test3( + (volk_fn_3arg)(manual_func), test_data[i], vlen, iter, arch_list[i]); + } else if (inputsc.size() == 1 && inputsc[0].is_float) { + if (inputsc[0].is_complex) { + run_cast_test3_s32fc((volk_fn_3arg_s32fc)(manual_func), + test_data[i], + scalar, + vlen, + iter, + arch_list[i]); + } else { + run_cast_test3_s32f((volk_fn_3arg_s32f)(manual_func), + test_data[i], + scalar.real(), + vlen, + iter, + arch_list[i]); + } + } else + throw 
"unsupported 3 arg function >1 scalars"; + break; + case 4: + run_cast_test4( + (volk_fn_4arg)(manual_func), test_data[i], vlen, iter, arch_list[i]); + break; + default: + throw "no function handler for this signature"; + break; } end = std::chrono::system_clock::now(); @@ -541,10 +704,10 @@ bool run_volk_tests(volk_func_desc_t desc, profile_times.push_back(arch_time); } - //and now compare each output to the generic output - //first we have to know which output is the generic one, they aren't in order... - size_t generic_offset=0; - for(size_t i=0; i arch_results; - for(size_t i=0; iback().results[arch_list[i]]; + if (fail) { + volk_test_time_t* result = &results->back().results[arch_list[i]]; result->pass = false; fail_global = true; std::cout << name << ": fail on arch " << arch_list[i] << std::endl; @@ -634,15 +851,13 @@ bool run_volk_tests(volk_func_desc_t desc, double best_time_u = std::numeric_limits::max(); std::string best_arch_a = "generic"; std::string best_arch_u = "generic"; - for(size_t i=0; i < arch_list.size(); i++) - { - if((profile_times[i] < best_time_u) && arch_results[i] && desc.impl_alignment[i] == 0) - { + for (size_t i = 0; i < arch_list.size(); i++) { + if ((profile_times[i] < best_time_u) && arch_results[i] && + desc.impl_alignment[i] == 0) { best_time_u = profile_times[i]; best_arch_u = arch_list[i]; } - if((profile_times[i] < best_time_a) && arch_results[i]) - { + if ((profile_times[i] < best_time_a) && arch_results[i]) { best_time_a = profile_times[i]; best_arch_a = arch_list[i]; } @@ -651,7 +866,7 @@ bool run_volk_tests(volk_func_desc_t desc, std::cout << "Best aligned arch: " << best_arch_a << std::endl; std::cout << "Best unaligned arch: " << best_arch_u << std::endl; - if(puppet_master_name == "NULL") { + if (puppet_master_name == "NULL") { results->back().config_name = name; } else { results->back().config_name = puppet_master_name; diff --git a/lib/qa_utils.h b/lib/qa_utils.h index 2d8458b..74c3db4 100644 --- a/lib/qa_utils.h +++ b/lib/qa_utils.h @@ -1,14 +1,14 @@ #ifndef VOLK_QA_UTILS_H #define VOLK_QA_UTILS_H -#include // for bool, false -#include // for volk_func_desc_t -#include // for NULL -#include // for map -#include // for string, basic_string -#include // for vector +#include // for bool, false +#include // for volk_func_desc_t +#include // for NULL +#include // for map +#include // for string, basic_string +#include // for vector -#include "volk/volk_complex.h" // for lv_32fc_t +#include "volk/volk_complex.h" // for lv_32fc_t /************************************************ * VOLK QA type definitions * @@ -22,93 +22,119 @@ struct volk_type_t { std::string str; }; -class volk_test_time_t { - public: - std::string name; - double time; - std::string units; - bool pass; +class volk_test_time_t +{ +public: + std::string name; + double time; + std::string units; + bool pass; }; -class volk_test_results_t { - public: - std::string name; - std::string config_name; - unsigned int vlen; - unsigned int iter; - std::map results; - std::string best_arch_a; - std::string best_arch_u; +class volk_test_results_t +{ +public: + std::string name; + std::string config_name; + unsigned int vlen; + unsigned int iter; + std::map results; + std::string best_arch_a; + std::string best_arch_u; }; -class volk_test_params_t { - private: - float _tol; - lv_32fc_t _scalar; - unsigned int _vlen; - unsigned int _iter; - bool _benchmark_mode; - bool _absolute_mode; - std::string _kernel_regex; - public: - // ctor - volk_test_params_t(float tol, lv_32fc_t scalar, unsigned int 
vlen, unsigned int iter, - bool benchmark_mode, std::string kernel_regex) : - _tol(tol), _scalar(scalar), _vlen(vlen), _iter(iter), - _benchmark_mode(benchmark_mode), _absolute_mode(false), _kernel_regex(kernel_regex) {}; - // setters - void set_tol(float tol) {_tol=tol;}; - void set_scalar(lv_32fc_t scalar) {_scalar=scalar;}; - void set_vlen(unsigned int vlen) {_vlen=vlen;}; - void set_iter(unsigned int iter) {_iter=iter;}; - void set_benchmark(bool benchmark) {_benchmark_mode=benchmark;}; - void set_regex(std::string regex) {_kernel_regex=regex;}; - // getters - float tol() {return _tol;}; - lv_32fc_t scalar() {return _scalar;}; - unsigned int vlen() {return _vlen;}; - unsigned int iter() {return _iter;}; - bool benchmark_mode() {return _benchmark_mode;}; - bool absolute_mode() {return _absolute_mode;}; - std::string kernel_regex() {return _kernel_regex;}; - volk_test_params_t make_absolute(float tol) { - volk_test_params_t t(*this); - t._tol = tol; - t._absolute_mode = true; - return t; - } - volk_test_params_t make_tol(float tol) { - volk_test_params_t t(*this); - t._tol = tol; - return t; - } +class volk_test_params_t +{ +private: + float _tol; + lv_32fc_t _scalar; + unsigned int _vlen; + unsigned int _iter; + bool _benchmark_mode; + bool _absolute_mode; + std::string _kernel_regex; + +public: + // ctor + volk_test_params_t(float tol, + lv_32fc_t scalar, + unsigned int vlen, + unsigned int iter, + bool benchmark_mode, + std::string kernel_regex) + : _tol(tol), + _scalar(scalar), + _vlen(vlen), + _iter(iter), + _benchmark_mode(benchmark_mode), + _absolute_mode(false), + _kernel_regex(kernel_regex){}; + // setters + void set_tol(float tol) { _tol = tol; }; + void set_scalar(lv_32fc_t scalar) { _scalar = scalar; }; + void set_vlen(unsigned int vlen) { _vlen = vlen; }; + void set_iter(unsigned int iter) { _iter = iter; }; + void set_benchmark(bool benchmark) { _benchmark_mode = benchmark; }; + void set_regex(std::string regex) { _kernel_regex = regex; }; + // getters + float tol() { return _tol; }; + lv_32fc_t scalar() { return _scalar; }; + unsigned int vlen() { return _vlen; }; + unsigned int iter() { return _iter; }; + bool benchmark_mode() { return _benchmark_mode; }; + bool absolute_mode() { return _absolute_mode; }; + std::string kernel_regex() { return _kernel_regex; }; + volk_test_params_t make_absolute(float tol) + { + volk_test_params_t t(*this); + t._tol = tol; + t._absolute_mode = true; + return t; + } + volk_test_params_t make_tol(float tol) + { + volk_test_params_t t(*this); + t._tol = tol; + return t; + } }; -class volk_test_case_t { - private: - volk_func_desc_t _desc; - void(*_kernel_ptr)(); - std::string _name; - volk_test_params_t _test_parameters; - std::string _puppet_master_name; - public: - volk_func_desc_t desc() {return _desc;}; - void (*kernel_ptr()) () {return _kernel_ptr;}; - std::string name() {return _name;}; - std::string puppet_master_name() {return _puppet_master_name;}; - volk_test_params_t test_parameters() {return _test_parameters;}; - // normal ctor - volk_test_case_t(volk_func_desc_t desc, void(*kernel_ptr)(), std::string name, - volk_test_params_t test_parameters) : - _desc(desc), _kernel_ptr(kernel_ptr), _name(name), _test_parameters(test_parameters), - _puppet_master_name("NULL") - {}; - // ctor for puppets - volk_test_case_t(volk_func_desc_t desc, void(*kernel_ptr)(), std::string name, - std::string puppet_master_name, volk_test_params_t test_parameters) : - _desc(desc), _kernel_ptr(kernel_ptr), _name(name), _test_parameters(test_parameters), - 
_puppet_master_name(puppet_master_name) - {}; +class volk_test_case_t +{ +private: + volk_func_desc_t _desc; + void (*_kernel_ptr)(); + std::string _name; + volk_test_params_t _test_parameters; + std::string _puppet_master_name; + +public: + volk_func_desc_t desc() { return _desc; }; + void (*kernel_ptr())() { return _kernel_ptr; }; + std::string name() { return _name; }; + std::string puppet_master_name() { return _puppet_master_name; }; + volk_test_params_t test_parameters() { return _test_parameters; }; + // normal ctor + volk_test_case_t(volk_func_desc_t desc, + void (*kernel_ptr)(), + std::string name, + volk_test_params_t test_parameters) + : _desc(desc), + _kernel_ptr(kernel_ptr), + _name(name), + _test_parameters(test_parameters), + _puppet_master_name("NULL"){}; + // ctor for puppets + volk_test_case_t(volk_func_desc_t desc, + void (*kernel_ptr)(), + std::string name, + std::string puppet_master_name, + volk_test_params_t test_parameters) + : _desc(desc), + _kernel_ptr(kernel_ptr), + _name(name), + _test_parameters(test_parameters), + _puppet_master_name(puppet_master_name){}; }; /************************************************ @@ -117,42 +143,58 @@ class volk_test_case_t { volk_type_t volk_type_from_string(std::string); float uniform(void); -void random_floats(float *buf, unsigned n); +void random_floats(float* buf, unsigned n); -bool run_volk_tests( - volk_func_desc_t, - void(*)(), - std::string, - volk_test_params_t, - std::vector *results = NULL, - std::string puppet_master_name = "NULL" - ); +bool run_volk_tests(volk_func_desc_t, + void (*)(), + std::string, + volk_test_params_t, + std::vector* results = NULL, + std::string puppet_master_name = "NULL"); -bool run_volk_tests( - volk_func_desc_t, - void(*)(), - std::string, - float, - lv_32fc_t, - unsigned int, - unsigned int, - std::vector *results = NULL, - std::string puppet_master_name = "NULL", - bool absolute_mode = false, - bool benchmark_mode = false -); +bool run_volk_tests(volk_func_desc_t, + void (*)(), + std::string, + float, + lv_32fc_t, + unsigned int, + unsigned int, + std::vector* results = NULL, + std::string puppet_master_name = "NULL", + bool absolute_mode = false, + bool benchmark_mode = false); -#define VOLK_PROFILE(func, test_params, results) run_volk_tests(func##_get_func_desc(), (void (*)())func##_manual, std::string(#func), test_params, results, "NULL") -#define VOLK_PUPPET_PROFILE(func, puppet_master_func, test_params, results) run_volk_tests(func##_get_func_desc(), (void (*)())func##_manual, std::string(#func), test_params, results, std::string(#puppet_master_func)) -typedef void (*volk_fn_1arg)(void *, unsigned int, const char*); //one input, operate in place -typedef void (*volk_fn_2arg)(void *, void *, unsigned int, const char*); -typedef void (*volk_fn_3arg)(void *, void *, void *, unsigned int, const char*); -typedef void (*volk_fn_4arg)(void *, void *, void *, void *, unsigned int, const char*); -typedef void (*volk_fn_1arg_s32f)(void *, float, unsigned int, const char*); //one input vector, one scalar float input -typedef void (*volk_fn_2arg_s32f)(void *, void *, float, unsigned int, const char*); -typedef void (*volk_fn_3arg_s32f)(void *, void *, void *, float, unsigned int, const char*); -typedef void (*volk_fn_1arg_s32fc)(void *, lv_32fc_t, unsigned int, const char*); //one input vector, one scalar float input -typedef void (*volk_fn_2arg_s32fc)(void *, void *, lv_32fc_t, unsigned int, const char*); -typedef void (*volk_fn_3arg_s32fc)(void *, void *, void *, lv_32fc_t, unsigned int, const 
char*); +#define VOLK_PROFILE(func, test_params, results) \ + run_volk_tests(func##_get_func_desc(), \ + (void (*)())func##_manual, \ + std::string(#func), \ + test_params, \ + results, \ + "NULL") +#define VOLK_PUPPET_PROFILE(func, puppet_master_func, test_params, results) \ + run_volk_tests(func##_get_func_desc(), \ + (void (*)())func##_manual, \ + std::string(#func), \ + test_params, \ + results, \ + std::string(#puppet_master_func)) +typedef void (*volk_fn_1arg)(void*, + unsigned int, + const char*); // one input, operate in place +typedef void (*volk_fn_2arg)(void*, void*, unsigned int, const char*); +typedef void (*volk_fn_3arg)(void*, void*, void*, unsigned int, const char*); +typedef void (*volk_fn_4arg)(void*, void*, void*, void*, unsigned int, const char*); +typedef void (*volk_fn_1arg_s32f)( + void*, float, unsigned int, const char*); // one input vector, one scalar float input +typedef void (*volk_fn_2arg_s32f)(void*, void*, float, unsigned int, const char*); +typedef void (*volk_fn_3arg_s32f)(void*, void*, void*, float, unsigned int, const char*); +typedef void (*volk_fn_1arg_s32fc)( + void*, + lv_32fc_t, + unsigned int, + const char*); // one input vector, one scalar float input +typedef void (*volk_fn_2arg_s32fc)(void*, void*, lv_32fc_t, unsigned int, const char*); +typedef void (*volk_fn_3arg_s32fc)( + void*, void*, void*, lv_32fc_t, unsigned int, const char*); -#endif //VOLK_QA_UTILS_H +#endif // VOLK_QA_UTILS_H diff --git a/lib/testqa.cc b/lib/testqa.cc index 8b0f4d6..c885383 100644 --- a/lib/testqa.cc +++ b/lib/testqa.cc @@ -20,18 +20,18 @@ * Boston, MA 02110-1301, USA. */ -#include // for bool, false, true -#include // for operator<<, basic_ostream, endl, char... -#include // IWYU pragma: keep -#include // for map, map<>::iterator, _Rb_tree_iterator -#include // for string, operator<< -#include // for pair -#include // for vector - +#include // for bool, false, true +#include // IWYU pragma: keep +#include // for operator<<, basic_ostream, endl, char... 
+#include // for map, map<>::iterator, _Rb_tree_iterator +#include // for string, operator<< +#include // for pair +#include // for vector + +#include "kernel_tests.h" // for init_test_list +#include "qa_utils.h" // for volk_test_case_t, volk_test_results_t +#include "volk/volk_complex.h" // for lv_32fc_t #include -#include "kernel_tests.h" // for init_test_list -#include "qa_utils.h" // for volk_test_case_t, volk_test_results_t -#include "volk/volk_complex.h" // for lv_32fc_t void print_qa_xml(std::vector results, unsigned int nfails); @@ -46,45 +46,52 @@ int main(int argc, char* argv[]) bool def_benchmark_mode = true; std::string def_kernel_regex = ""; - volk_test_params_t test_params(def_tol, def_scalar, def_vlen, def_iter, - def_benchmark_mode, def_kernel_regex); + volk_test_params_t test_params( + def_tol, def_scalar, def_vlen, def_iter, def_benchmark_mode, def_kernel_regex); std::vector test_cases = init_test_list(test_params); std::vector results; - if (argc > 1){ - for(unsigned int ii = 0; ii < test_cases.size(); ++ii){ - if (std::string(argv[1]) == test_cases[ii].name()){ + if (argc > 1) { + for (unsigned int ii = 0; ii < test_cases.size(); ++ii) { + if (std::string(argv[1]) == test_cases[ii].name()) { volk_test_case_t test_case = test_cases[ii]; - if (run_volk_tests(test_case.desc(), test_case.kernel_ptr(), + if (run_volk_tests(test_case.desc(), + test_case.kernel_ptr(), test_case.name(), - test_case.test_parameters(), &results, + test_case.test_parameters(), + &results, test_case.puppet_master_name())) { - return 1; + return 1; } else { - return 0; + return 0; } } } - std::cerr << "Did not run a test for kernel: " << std::string(argv[1]) << " !" << std::endl; + std::cerr << "Did not run a test for kernel: " << std::string(argv[1]) << " !" + << std::endl; return 0; - }else{ + } else { std::vector qa_failures; // Test every kernel reporting failures when they occur - for(unsigned int ii = 0; ii < test_cases.size(); ++ii) { + for (unsigned int ii = 0; ii < test_cases.size(); ++ii) { bool qa_result = false; volk_test_case_t test_case = test_cases[ii]; try { - qa_result = run_volk_tests(test_case.desc(), test_case.kernel_ptr(), test_case.name(), - test_case.test_parameters(), &results, test_case.puppet_master_name()); - } - catch(...) { + qa_result = run_volk_tests(test_case.desc(), + test_case.kernel_ptr(), + test_case.name(), + test_case.test_parameters(), + &results, + test_case.puppet_master_name()); + } catch (...) { // TODO: what exceptions might we need to catch and how do we handle them? - std::cerr << "Exception found on kernel: " << test_case.name() << std::endl; + std::cerr << "Exception found on kernel: " << test_case.name() + << std::endl; qa_result = false; } - if(qa_result) { + if (qa_result) { std::cerr << "Failure on " << test_case.name() << std::endl; qa_failures.push_back(test_case.name()); } @@ -96,9 +103,9 @@ int main(int argc, char* argv[]) // Summarize QA results std::cerr << "Kernel QA finished: " << qa_failures.size() << " failures out of " << test_cases.size() << " tests." 
<< std::endl; - if(qa_failures.size() > 0) { + if (qa_failures.size() > 0) { std::cerr << "The following kernels failed QA:" << std::endl; - for(unsigned int ii = 0; ii < qa_failures.size(); ++ii) { + for (unsigned int ii = 0; ii < qa_failures.size(); ++ii) { std::cerr << " " << qa_failures[ii] << std::endl; } qa_ret_val = 1; @@ -118,26 +125,28 @@ void print_qa_xml(std::vector results, unsigned int nfails) qa_file.open(".unittest/kernels.xml"); qa_file << "" << std::endl; - qa_file << "" << std::endl; + qa_file << "" << std::endl; // Results are in a vector by kernel. Each element has a result // map containing time and arch name with test result - for(unsigned int ii=0; ii < results.size(); ++ii) { + for (unsigned int ii = 0; ii < results.size(); ++ii) { volk_test_results_t result = results[ii]; qa_file << " " << std::endl; std::map::iterator kernel_time_pair; - for(kernel_time_pair = result.results.begin(); kernel_time_pair != result.results.end(); ++kernel_time_pair) { + for (kernel_time_pair = result.results.begin(); + kernel_time_pair != result.results.end(); + ++kernel_time_pair) { volk_test_time_t test_time = kernel_time_pair->second; - qa_file << " " << std::endl; - if(!test_time.pass) - qa_file << " " << - "" << std::endl; + qa_file << " " << std::endl; + if (!test_time.pass) + qa_file << " " + << "" << std::endl; qa_file << " " << std::endl; } qa_file << " " << std::endl; @@ -146,5 +155,4 @@ void print_qa_xml(std::vector results, unsigned int nfails) qa_file << "" << std::endl; qa_file.close(); - } diff --git a/lib/volk_malloc.c b/lib/volk_malloc.c index df36240..b3779e1 100644 --- a/lib/volk_malloc.c +++ b/lib/volk_malloc.c @@ -31,7 +31,8 @@ * see: https://en.cppreference.com/w/c/memory/aligned_alloc * * MSVC is broken - * see: https://docs.microsoft.com/en-us/cpp/overview/visual-cpp-language-conformance?view=vs-2019 + * see: + * https://docs.microsoft.com/en-us/cpp/overview/visual-cpp-language-conformance?view=vs-2019 * This section: * C11 The Universal CRT implemented the parts of the * C11 Standard Library that are required by C++17, @@ -46,39 +47,43 @@ * We must work around this problem because MSVC is non-compliant! */ -void *volk_malloc(size_t size, size_t alignment) + +void* volk_malloc(size_t size, size_t alignment) { #if HAVE_POSIX_MEMALIGN - // quoting posix_memalign() man page: - // "alignment must be a power of two and a multiple of sizeof(void *)" - // volk_get_alignment() could return 1 for some machines (e.g. generic_orc) - if (alignment == 1){ - return malloc(size); - } - void *ptr; - int err = posix_memalign(&ptr, alignment, size); - if(err != 0) { - ptr = NULL; - fprintf(stderr, - "VOLK: Error allocating memory " - "(posix_memalign: error %d: %s)\n", err, strerror(err)); - } + // quoting posix_memalign() man page: + // "alignment must be a power of two and a multiple of sizeof(void *)" + // volk_get_alignment() could return 1 for some machines (e.g. 
generic_orc) + if (alignment == 1) { + return malloc(size); + } + void* ptr; + int err = posix_memalign(&ptr, alignment, size); + if (err != 0) { + ptr = NULL; + fprintf(stderr, + "VOLK: Error allocating memory " + "(posix_memalign: error %d: %s)\n", + err, + strerror(err)); + } #elif defined(_MSC_VER) - void *ptr = _aligned_malloc(size, alignment); + void* ptr = _aligned_malloc(size, alignment); #else - void *ptr = aligned_alloc(alignment, size); + void* ptr = aligned_alloc(alignment, size); #endif - if(ptr == NULL) { - fprintf(stderr, "VOLK: Error allocating memory (aligned_alloc/_aligned_malloc)\n"); - } - return ptr; + if (ptr == NULL) { + fprintf(stderr, + "VOLK: Error allocating memory (aligned_alloc/_aligned_malloc)\n"); + } + return ptr; } -void volk_free(void *ptr) +void volk_free(void* ptr) { #if defined(_MSC_VER) - _aligned_free(ptr); + _aligned_free(ptr); #else - free(ptr); + free(ptr); #endif } diff --git a/lib/volk_prefs.c b/lib/volk_prefs.c index 0b5fe8e..8934bf7 100644 --- a/lib/volk_prefs.c +++ b/lib/volk_prefs.c @@ -1,6 +1,6 @@ +#include #include #include -#include #include #if defined(_MSC_VER) #include @@ -11,82 +11,84 @@ #endif #include -void volk_get_config_path(char *path, bool read) +void volk_get_config_path(char* path, bool read) { - if (!path) return; - const char *suffix = "/.volk/volk_config"; - const char *suffix2 = "/volk/volk_config"; //non-hidden - char *home = NULL; + if (!path) + return; + const char* suffix = "/.volk/volk_config"; + const char* suffix2 = "/volk/volk_config"; // non-hidden + char* home = NULL; - //allows config redirection via env variable + // allows config redirection via env variable home = getenv("VOLK_CONFIGPATH"); - if(home!=NULL){ - strncpy(path,home,512); - strcat(path,suffix2); - if (!read || access(path, F_OK) != -1){ + if (home != NULL) { + strncpy(path, home, 512); + strcat(path, suffix2); + if (!read || access(path, F_OK) != -1) { return; } } - //check for user-local config file + // check for user-local config file home = getenv("HOME"); - if (home != NULL){ + if (home != NULL) { strncpy(path, home, 512); strcat(path, suffix); - if (!read || (access(path, F_OK) != -1)){ + if (!read || (access(path, F_OK) != -1)) { return; } } - //check for config file in APPDATA (Windows) + // check for config file in APPDATA (Windows) home = getenv("APPDATA"); - if (home != NULL){ + if (home != NULL) { strncpy(path, home, 512); strcat(path, suffix); - if (!read || (access(path, F_OK) != -1)){ + if (!read || (access(path, F_OK) != -1)) { return; } } - //check for system-wide config file - if (access("/etc/volk/volk_config", F_OK) != -1){ + // check for system-wide config file + if (access("/etc/volk/volk_config", F_OK) != -1) { strncpy(path, "/etc", 512); strcat(path, suffix2); - if (!read || (access(path, F_OK) != -1)){ + if (!read || (access(path, F_OK) != -1)) { return; } } - //If still no path was found set path[0] to '0' and fall through + // If still no path was found set path[0] to '0' and fall through path[0] = 0; return; } -size_t volk_load_preferences(volk_arch_pref_t **prefs_res) +size_t volk_load_preferences(volk_arch_pref_t** prefs_res) { - FILE *config_file; + FILE* config_file; char path[512], line[512]; size_t n_arch_prefs = 0; - volk_arch_pref_t *prefs = NULL; + volk_arch_pref_t* prefs = NULL; - //get the config path + // get the config path volk_get_config_path(path, true); - if (!path[0]) return n_arch_prefs; //no prefs found + if (!path[0]) + return n_arch_prefs; // no prefs found config_file = fopen(path, "r"); - 
if(!config_file) return n_arch_prefs; //no prefs found + if (!config_file) + return n_arch_prefs; // no prefs found - //reset the file pointer and write the prefs into volk_arch_prefs - while(fgets(line, sizeof(line), config_file) != NULL) - { - void *new_prefs = realloc(prefs, (n_arch_prefs + 1) * sizeof(*prefs)); + // reset the file pointer and write the prefs into volk_arch_prefs + while (fgets(line, sizeof(line), config_file) != NULL) { + void* new_prefs = realloc(prefs, (n_arch_prefs + 1) * sizeof(*prefs)); if (!new_prefs) { - printf ("volk_load_preferences: bad malloc\n"); + printf("volk_load_preferences: bad malloc\n"); break; } - prefs = (volk_arch_pref_t *) new_prefs; - volk_arch_pref_t *p = prefs + n_arch_prefs; - if(sscanf(line, "%s %s %s", p->name, p->impl_a, p->impl_u) == 3 && !strncmp(p->name, "volk_", 5)) - { + prefs = (volk_arch_pref_t*)new_prefs; + volk_arch_pref_t* p = prefs + n_arch_prefs; + if (sscanf(line, "%s %s %s", p->name, p->impl_a, p->impl_u) == 3 && + !strncmp(p->name, "volk_", 5)) { n_arch_prefs++; } } diff --git a/lib/volk_rank_archs.c b/lib/volk_rank_archs.c index 346619e..7cf3fd7 100644 --- a/lib/volk_rank_archs.c +++ b/lib/volk_rank_archs.c @@ -24,84 +24,83 @@ #include #include -#include #include +#include -int volk_get_index( - const char *impl_names[], //list of implementations by name - const size_t n_impls, //number of implementations available - const char *impl_name //the implementation name to find -){ +int volk_get_index(const char* impl_names[], // list of implementations by name + const size_t n_impls, // number of implementations available + const char* impl_name // the implementation name to find +) +{ unsigned int i; for (i = 0; i < n_impls; i++) { - if(!strncmp(impl_names[i], impl_name, 20)) { + if (!strncmp(impl_names[i], impl_name, 20)) { return i; } } - //TODO return -1; - //something terrible should happen here + // TODO return -1; + // something terrible should happen here fprintf(stderr, "Volk warning: no arch found, returning generic impl\n"); - return volk_get_index(impl_names, n_impls, "generic"); //but we'll fake it for now + return volk_get_index(impl_names, n_impls, "generic"); // but we'll fake it for now } -int volk_rank_archs( - const char *kern_name, //name of the kernel to rank - const char *impl_names[], //list of implementations by name - const int* impl_deps, //requirement mask per implementation - const bool* alignment, //alignment status of each implementation - size_t n_impls, //number of implementations available - const bool align //if false, filter aligned implementations +int volk_rank_archs(const char* kern_name, // name of the kernel to rank + const char* impl_names[], // list of implementations by name + const int* impl_deps, // requirement mask per implementation + const bool* alignment, // alignment status of each implementation + size_t n_impls, // number of implementations available + const bool align // if false, filter aligned implementations ) { size_t i; - static volk_arch_pref_t *volk_arch_prefs; + static volk_arch_pref_t* volk_arch_prefs; static size_t n_arch_prefs = 0; static int prefs_loaded = 0; - if(!prefs_loaded) { + if (!prefs_loaded) { n_arch_prefs = volk_load_preferences(&volk_arch_prefs); prefs_loaded = 1; } // If we've defined VOLK_GENERIC to be anything, always return the // 'generic' kernel. Used in GR's QA code. 
- char *gen_env = getenv("VOLK_GENERIC"); - if(gen_env) { - return volk_get_index(impl_names, n_impls, "generic"); + char* gen_env = getenv("VOLK_GENERIC"); + if (gen_env) { + return volk_get_index(impl_names, n_impls, "generic"); } - //now look for the function name in the prefs list - for(i = 0; i < n_arch_prefs; i++) - { - if(!strncmp(kern_name, volk_arch_prefs[i].name, sizeof(volk_arch_prefs[i].name))) //found it + // now look for the function name in the prefs list + for (i = 0; i < n_arch_prefs; i++) { + if (!strncmp(kern_name, + volk_arch_prefs[i].name, + sizeof(volk_arch_prefs[i].name))) // found it { - const char *impl_name = align? volk_arch_prefs[i].impl_a : volk_arch_prefs[i].impl_u; + const char* impl_name = + align ? volk_arch_prefs[i].impl_a : volk_arch_prefs[i].impl_u; return volk_get_index(impl_names, n_impls, impl_name); } } - //return the best index with the largest deps + // return the best index with the largest deps size_t best_index_a = 0; size_t best_index_u = 0; int best_value_a = -1; int best_value_u = -1; - for(i = 0; i < n_impls; i++) - { + for (i = 0; i < n_impls; i++) { const signed val = impl_deps[i]; - if (alignment[i] && val > best_value_a) - { + if (alignment[i] && val > best_value_a) { best_index_a = i; best_value_a = val; } - if (!alignment[i] && val > best_value_u) - { + if (!alignment[i] && val > best_value_u) { best_index_u = i; best_value_u = val; } } - //when align and we found a best aligned, use it - if (align && best_value_a != -1) return best_index_a; + // when align and we found a best aligned, use it + if (align && best_value_a != -1) + return best_index_a; - //otherwise return the best unaligned + // otherwise return the best unaligned return best_index_u; } diff --git a/lib/volk_rank_archs.h b/lib/volk_rank_archs.h index b3bf8ff..9434778 100644 --- a/lib/volk_rank_archs.h +++ b/lib/volk_rank_archs.h @@ -22,26 +22,24 @@ #ifndef INCLUDED_VOLK_RANK_ARCHS_H #define INCLUDED_VOLK_RANK_ARCHS_H -#include #include +#include #ifdef __cplusplus extern "C" { #endif -int volk_get_index( - const char *impl_names[], //list of implementations by name - const size_t n_impls, //number of implementations available - const char *impl_name //the implementation name to find +int volk_get_index(const char* impl_names[], // list of implementations by name + const size_t n_impls, // number of implementations available + const char* impl_name // the implementation name to find ); -int volk_rank_archs( - const char *kern_name, //name of the kernel to rank - const char *impl_names[], //list of implementations by name - const int* impl_deps, //requirement mask per implementation - const bool* alignment, //alignment status of each implementation - size_t n_impls, //number of implementations available - const bool align //if false, filter aligned implementations +int volk_rank_archs(const char* kern_name, // name of the kernel to rank + const char* impl_names[], // list of implementations by name + const int* impl_deps, // requirement mask per implementation + const bool* alignment, // alignment status of each implementation + size_t n_impls, // number of implementations available + const bool align // if false, filter aligned implementations ); #ifdef __cplusplus -- 2.30.2
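A minimal usage sketch of the aligned-allocation API whose reformatting appears in the lib/volk_malloc.c hunk above. This is illustrative only: it assumes the functions are reachable through the public <volk/volk.h> header, that volk_get_alignment() (referenced in the comments above) returns the platform alignment as a size_t, and an arbitrary buffer length of 1024; error handling beyond the NULL check is omitted.

    #include <stdio.h>
    #include <stdlib.h>
    #include <volk/volk.h> /* assumed public header exposing volk_malloc, volk_free,
                              volk_get_alignment */

    int main(void)
    {
        const size_t n = 1024; /* arbitrary example length */
        const size_t alignment = volk_get_alignment();

        /* exercises the posix_memalign / _aligned_malloc / aligned_alloc paths
           reformatted above */
        float* buf = (float*)volk_malloc(n * sizeof(float), alignment);
        if (buf == NULL) {
            return 1; /* volk_malloc already prints an error message */
        }

        for (size_t i = 0; i < n; i++) {
            buf[i] = (float)i;
        }

        volk_free(buf); /* matches the platform-specific free path */
        return 0;
    }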