--- /dev/null
+From 092a59997a1e1d5f421a0a5f87ee655ad173b93f Mon Sep 17 00:00:00 2001
+From: Johannes Demel <demel@uni-bremen.de>
+Date: Sun, 23 Feb 2020 15:03:47 +0100
+Subject: [PATCH 3/7] clang-format: Apply clang-format
+
+This commit adds `.clang-format` from GNU Radio and applies clang-format.
+
+Run:
+`find . -regex '.*\.\(c\|cc\|cpp\|cxx\|h\|hh\)' -exec clang-format \
+-style=file -i {} \;`
+in `.`.
+---
+ .clang-format | 106 ++
+ apps/volk-config-info.cc | 77 +-
+ apps/volk_option_helpers.cc | 268 +--
+ apps/volk_option_helpers.h | 84 +-
+ apps/volk_profile.cc | 205 ++-
+ apps/volk_profile.h | 20 +-
+ cmake/msvc/config.h | 27 +-
+ cmake/msvc/sys/time.h | 77 +-
+ include/volk/saturation_arithmetic.h | 16 +-
+ include/volk/volk_alloc.hh | 42 +-
+ include/volk/volk_avx2_intrinsics.h | 114 +-
+ include/volk/volk_avx_intrinsics.h | 193 +-
+ include/volk/volk_common.h | 148 +-
+ include/volk/volk_complex.h | 41 +-
+ include/volk/volk_malloc.h | 12 +-
+ include/volk/volk_neon_intrinsics.h | 115 +-
+ include/volk/volk_prefs.h | 17 +-
+ include/volk/volk_sse3_intrinsics.h | 79 +-
+ include/volk/volk_sse_intrinsics.h | 53 +-
+ kernels/volk/volk_16i_32fc_dot_prod_32fc.h | 1118 ++++++------
+ kernels/volk/volk_16i_branch_4_state_8.h | 219 ++-
+ kernels/volk/volk_16i_convert_8i.h | 301 ++--
+ kernels/volk/volk_16i_max_star_16i.h | 158 +-
+ .../volk/volk_16i_max_star_horizontal_16i.h | 214 +--
+ .../volk/volk_16i_permute_and_scalar_add.h | 187 +-
+ kernels/volk/volk_16i_s32f_convert_32f.h | 609 +++----
+ kernels/volk/volk_16i_x4_quad_max_star_16i.h | 357 ++--
+ kernels/volk/volk_16i_x5_add_quad_16i_x4.h | 336 ++--
+ kernels/volk/volk_16ic_convert_32fc.h | 241 +--
+ kernels/volk/volk_16ic_deinterleave_16i_x2.h | 431 +++--
+ .../volk/volk_16ic_deinterleave_real_16i.h | 397 +++--
+ kernels/volk/volk_16ic_deinterleave_real_8i.h | 469 +++--
+ kernels/volk/volk_16ic_magnitude_16i.h | 506 +++---
+ .../volk/volk_16ic_s32f_deinterleave_32f_x2.h | 418 ++---
+ .../volk_16ic_s32f_deinterleave_real_32f.h | 372 ++--
+ kernels/volk/volk_16ic_s32f_magnitude_32f.h | 381 ++--
+ kernels/volk/volk_16ic_x2_dot_prod_16ic.h | 750 ++++----
+ kernels/volk/volk_16ic_x2_multiply_16ic.h | 504 ++++--
+ kernels/volk/volk_16u_byteswap.h | 378 ++--
+ kernels/volk/volk_16u_byteswappuppet_16u.h | 44 +-
+ kernels/volk/volk_32f_64f_add_64f.h | 270 +--
+ kernels/volk/volk_32f_64f_multiply_64f.h | 154 +-
+ kernels/volk/volk_32f_8u_polarbutterfly_32f.h | 478 ++---
+ .../volk_32f_8u_polarbutterflypuppet_32f.h | 155 +-
+ kernels/volk/volk_32f_accumulator_s32f.h | 287 +--
+ kernels/volk/volk_32f_acos_32f.h | 700 ++++----
+ kernels/volk/volk_32f_asin_32f.h | 647 +++----
+ kernels/volk/volk_32f_atan_32f.h | 625 +++----
+ kernels/volk/volk_32f_binary_slicer_32i.h | 259 +--
+ kernels/volk/volk_32f_binary_slicer_8i.h | 706 ++++----
+ kernels/volk/volk_32f_convert_64f.h | 214 ++-
+ kernels/volk/volk_32f_cos_32f.h | 1159 ++++++------
+ kernels/volk/volk_32f_expfast_32f.h | 347 ++--
+ kernels/volk/volk_32f_index_max_16u.h | 370 ++--
+ kernels/volk/volk_32f_index_max_32u.h | 770 ++++----
+ kernels/volk/volk_32f_invsqrt_32f.h | 189 +-
+ kernels/volk/volk_32f_log2_32f.h | 719 +++++---
+ kernels/volk/volk_32f_null_32f.h | 16 +-
+ .../volk/volk_32f_s32f_32f_fm_detect_32f.h | 457 ++---
+ ...k_32f_s32f_calc_spectral_noise_floor_32f.h | 683 +++----
+ kernels/volk/volk_32f_s32f_convert_16i.h | 815 ++++-----
+ kernels/volk/volk_32f_s32f_convert_32i.h | 579 +++---
+ kernels/volk/volk_32f_s32f_convert_8i.h | 642 +++----
+ .../volk/volk_32f_s32f_mod_rangepuppet_32f.h | 63 +-
+ kernels/volk/volk_32f_s32f_multiply_32f.h | 271 +--
+ kernels/volk/volk_32f_s32f_normalize.h | 150 +-
+ kernels/volk/volk_32f_s32f_power_32f.h | 166 +-
+ .../volk/volk_32f_s32f_s32f_mod_range_32f.h | 718 ++++----
+ kernels/volk/volk_32f_s32f_stddev_32f.h | 449 ++---
+ kernels/volk/volk_32f_sin_32f.h | 945 +++++-----
+ kernels/volk/volk_32f_sqrt_32f.h | 153 +-
+ .../volk/volk_32f_stddev_and_mean_32f_x2.h | 583 +++---
+ kernels/volk/volk_32f_tan_32f.h | 1023 ++++++-----
+ kernels/volk/volk_32f_tanh_32f.h | 631 ++++---
+ kernels/volk/volk_32f_x2_add_32f.h | 412 +++--
+ kernels/volk/volk_32f_x2_divide_32f.h | 364 ++--
+ kernels/volk/volk_32f_x2_dot_prod_16i.h | 1092 ++++++------
+ kernels/volk/volk_32f_x2_dot_prod_32f.h | 1186 +++++++------
+ .../volk/volk_32f_x2_fm_detectpuppet_32f.h | 40 +-
+ kernels/volk/volk_32f_x2_interleave_32fc.h | 292 +--
+ kernels/volk/volk_32f_x2_max_32f.h | 345 ++--
+ kernels/volk/volk_32f_x2_min_32f.h | 347 ++--
+ kernels/volk/volk_32f_x2_multiply_32f.h | 375 ++--
+ kernels/volk/volk_32f_x2_pow_32f.h | 1175 ++++++------
+ .../volk/volk_32f_x2_s32f_interleave_16ic.h | 324 ++--
+ kernels/volk/volk_32f_x2_subtract_32f.h | 319 ++--
+ kernels/volk/volk_32f_x3_sum_of_poly_32f.h | 1026 +++++------
+ kernels/volk/volk_32fc_32f_add_32fc.h | 281 +--
+ kernels/volk/volk_32fc_32f_dot_prod_32fc.h | 1205 +++++++------
+ kernels/volk/volk_32fc_32f_multiply_32fc.h | 226 +--
+ kernels/volk/volk_32fc_conjugate_32fc.h | 233 +--
+ kernels/volk/volk_32fc_convert_16ic.h | 439 ++---
+ kernels/volk/volk_32fc_deinterleave_32f_x2.h | 297 ++--
+ kernels/volk/volk_32fc_deinterleave_64f_x2.h | 439 ++---
+ .../volk/volk_32fc_deinterleave_imag_32f.h | 210 +--
+ .../volk/volk_32fc_deinterleave_real_32f.h | 214 +--
+ .../volk/volk_32fc_deinterleave_real_64f.h | 262 +--
+ kernels/volk/volk_32fc_index_max_16u.h | 639 +++----
+ kernels/volk/volk_32fc_index_max_32u.h | 630 +++----
+ kernels/volk/volk_32fc_magnitude_32f.h | 556 +++---
+ .../volk/volk_32fc_magnitude_squared_32f.h | 443 ++---
+ kernels/volk/volk_32fc_s32f_atan2_32f.h | 208 +--
+ .../volk_32fc_s32f_deinterleave_real_16i.h | 226 +--
+ kernels/volk/volk_32fc_s32f_magnitude_16i.h | 297 ++--
+ kernels/volk/volk_32fc_s32f_power_32fc.h | 121 +-
+ .../volk/volk_32fc_s32f_power_spectrum_32f.h | 176 +-
+ ..._32fc_s32f_x2_power_spectral_density_32f.h | 297 ++--
+ kernels/volk/volk_32fc_s32fc_multiply_32fc.h | 250 +--
+ .../volk/volk_32fc_s32fc_rotatorpuppet_32fc.h | 118 +-
+ .../volk/volk_32fc_s32fc_x2_rotator_32fc.h | 260 +--
+ kernels/volk/volk_32fc_x2_add_32fc.h | 274 +--
+ .../volk_32fc_x2_conjugate_dot_prod_32fc.h | 1017 ++++++-----
+ kernels/volk/volk_32fc_x2_divide_32fc.h | 372 ++--
+ kernels/volk/volk_32fc_x2_dot_prod_32fc.h | 1334 +++++++-------
+ kernels/volk/volk_32fc_x2_multiply_32fc.h | 575 +++---
+ .../volk_32fc_x2_multiply_conjugate_32fc.h | 347 ++--
+ ...32fc_x2_s32f_square_dist_scalar_mult_32f.h | 657 +++----
+ ...2fc_x2_s32fc_multiply_conjugate_add_32fc.h | 98 +-
+ kernels/volk/volk_32fc_x2_square_dist_32f.h | 426 ++---
+ kernels/volk/volk_32i_s32f_convert_32f.h | 347 ++--
+ kernels/volk/volk_32i_x2_and_32i.h | 320 ++--
+ kernels/volk/volk_32i_x2_or_32i.h | 321 ++--
+ kernels/volk/volk_32u_byteswap.h | 433 ++---
+ kernels/volk/volk_32u_byteswappuppet_32u.h | 44 +-
+ kernels/volk/volk_32u_popcnt.h | 26 +-
+ kernels/volk/volk_32u_popcntpuppet_32u.h | 18 +-
+ kernels/volk/volk_32u_reverse_32u.h | 598 ++++---
+ kernels/volk/volk_64f_convert_32f.h | 324 ++--
+ kernels/volk/volk_64f_x2_add_64f.h | 207 +--
+ kernels/volk/volk_64f_x2_max_64f.h | 276 +--
+ kernels/volk/volk_64f_x2_min_64f.h | 275 +--
+ kernels/volk/volk_64f_x2_multiply_64f.h | 207 +--
+ kernels/volk/volk_64u_byteswap.h | 599 ++++---
+ kernels/volk/volk_64u_byteswappuppet_64u.h | 56 +-
+ kernels/volk/volk_64u_popcnt.h | 79 +-
+ kernels/volk/volk_64u_popcntpuppet_64u.h | 29 +-
+ kernels/volk/volk_8i_convert_16i.h | 315 ++--
+ kernels/volk/volk_8i_s32f_convert_32f.h | 528 +++---
+ kernels/volk/volk_8ic_deinterleave_16i_x2.h | 493 ++++--
+ kernels/volk/volk_8ic_deinterleave_real_16i.h | 346 ++--
+ kernels/volk/volk_8ic_deinterleave_real_8i.h | 482 +++--
+ .../volk/volk_8ic_s32f_deinterleave_32f_x2.h | 571 +++---
+ .../volk_8ic_s32f_deinterleave_real_32f.h | 395 +++--
+ .../volk_8ic_x2_multiply_conjugate_16ic.h | 413 +++--
+ ...volk_8ic_x2_s32f_multiply_conjugate_32fc.h | 496 +++---
+ kernels/volk/volk_8u_conv_k7_r2puppet_8u.h | 494 +++---
+ kernels/volk/volk_8u_x2_encodeframepolar_8u.h | 1569 +++++++++++------
+ kernels/volk/volk_8u_x3_encodepolar_8u_x2.h | 110 +-
+ .../volk/volk_8u_x3_encodepolarpuppet_8u.h | 137 +-
+ kernels/volk/volk_8u_x4_conv_k7_r2_8u.h | 1067 +++++------
+ lib/kernel_tests.h | 257 +--
+ lib/qa_utils.cc | 751 +++++---
+ lib/qa_utils.h | 288 +--
+ lib/testqa.cc | 96 +-
+ lib/volk_malloc.c | 55 +-
+ lib/volk_prefs.c | 74 +-
+ lib/volk_rank_archs.c | 73 +-
+ lib/volk_rank_archs.h | 22 +-
+ 158 files changed, 32509 insertions(+), 27583 deletions(-)
+ create mode 100644 .clang-format
+
+diff --git a/.clang-format b/.clang-format
+new file mode 100644
+index 0000000..285b68d
+--- /dev/null
++++ b/.clang-format
+@@ -0,0 +1,106 @@
++---
++Language: Cpp
++# BasedOnStyle: LLVM
++AccessModifierOffset: -4
++AlignAfterOpenBracket: Align
++AlignConsecutiveAssignments: false
++AlignConsecutiveDeclarations: false
++AlignEscapedNewlinesLeft: true
++AlignOperands: true
++AlignTrailingComments: true
++AllowAllParametersOfDeclarationOnNextLine: true
++AllowShortBlocksOnASingleLine: false
++AllowShortCaseLabelsOnASingleLine: false
++AllowShortFunctionsOnASingleLine: All
++AllowShortIfStatementsOnASingleLine: false
++AllowShortLoopsOnASingleLine: false
++AlwaysBreakAfterDefinitionReturnType: None
++AlwaysBreakAfterReturnType: None
++AlwaysBreakBeforeMultilineStrings: false
++AlwaysBreakTemplateDeclarations: true
++BinPackArguments: false
++BinPackParameters: false
++BreakBeforeBraces: Custom
++BraceWrapping:
++ AfterClass: true
++ AfterControlStatement: false
++ AfterEnum: false
++ AfterFunction: true
++ AfterNamespace: false
++ AfterObjCDeclaration: false
++ AfterStruct: false
++ AfterUnion: false
++ BeforeCatch: false
++ BeforeElse: false
++ IndentBraces: false
++BreakBeforeBinaryOperators: None
++BreakBeforeTernaryOperators: true
++BreakConstructorInitializersBeforeComma: false
++BreakAfterJavaFieldAnnotations: false
++BreakStringLiterals: true
++ColumnLimit: 90
++CommentPragmas: '^ IWYU pragma:'
++ConstructorInitializerAllOnOneLineOrOnePerLine: true
++ConstructorInitializerIndentWidth: 4
++ContinuationIndentWidth: 4
++Cpp11BracedListStyle: false
++DerivePointerAlignment: false
++DisableFormat: false
++ExperimentalAutoDetectBinPacking: false
++ForEachMacros:
++ - foreach
++ - Q_FOREACH
++ - BOOST_FOREACH
++IncludeCategories:
++ - Regex: '^"(gnuradio)/'
++ Priority: 1
++ - Regex: '^<(gnuradio)/'
++ Priority: 2
++ - Regex: '^<(boost)/'
++ Priority: 98
++ - Regex: '^<[a-z]*>$'
++ Priority: 99
++ - Regex: '^".*"$'
++ Priority: 0
++ - Regex: '.*'
++ Priority: 10
++
++IncludeIsMainRegex: '(Test)?$'
++IndentCaseLabels: false
++IndentWidth: 4
++IndentWrappedFunctionNames: false
++JavaScriptQuotes: Leave
++JavaScriptWrapImports: true
++KeepEmptyLinesAtTheStartOfBlocks: true
++MacroBlockBegin: ''
++MacroBlockEnd: ''
++MaxEmptyLinesToKeep: 2
++NamespaceIndentation: None
++ObjCBlockIndentWidth: 2
++ObjCSpaceAfterProperty: false
++ObjCSpaceBeforeProtocolList: true
++PenaltyBreakBeforeFirstCallParameter: 19
++PenaltyBreakComment: 300
++PenaltyBreakFirstLessLess: 120
++PenaltyBreakString: 1000
++PenaltyExcessCharacter: 1000000
++PenaltyReturnTypeOnItsOwnLine: 60
++PointerAlignment: Left
++ReflowComments: true
++SortIncludes: true
++SpaceAfterCStyleCast: false
++SpaceAfterTemplateKeyword: true
++SpaceBeforeAssignmentOperators: true
++SpaceBeforeParens: ControlStatements
++SpaceInEmptyParentheses: false
++SpacesBeforeTrailingComments: 1
++SpacesInAngles: false
++SpacesInContainerLiterals: true
++SpacesInCStyleCastParentheses: false
++SpacesInParentheses: false
++SpacesInSquareBrackets: false
++Standard: Cpp11
++TabWidth: 8
++UseTab: Never
++
++
+diff --git a/apps/volk-config-info.cc b/apps/volk-config-info.cc
+index 4eedcb7..2521993 100644
+--- a/apps/volk-config-info.cc
++++ b/apps/volk-config-info.cc
+@@ -24,52 +24,63 @@
+ #include <config.h>
+ #endif
+
+-#include <volk/constants.h> // for volk_available_machines, volk_c_com...
+-#include <iostream> // for operator<<, endl, cout, ostream
+-#include <string> // for string
++#include <volk/constants.h> // for volk_available_machines, volk_c_com...
++#include <iostream> // for operator<<, endl, cout, ostream
++#include <string> // for string
+
+-#include "volk/volk.h" // for volk_get_alignment, volk_get_machine
+-#include "volk_option_helpers.h" // for option_list, option_t
++#include "volk/volk.h" // for volk_get_alignment, volk_get_machine
++#include "volk_option_helpers.h" // for option_list, option_t
+
+ void print_alignment()
+ {
+- std::cout << "Alignment in bytes: " << volk_get_alignment() << std::endl;
++ std::cout << "Alignment in bytes: " << volk_get_alignment() << std::endl;
+ }
+
+ void print_malloc()
+ {
+- // You don't want to change the volk_malloc code, so just copy the if/else
+- // structure from there and give an explanation for the implementations
+- std::cout << "Used malloc implementation: ";
+- #if HAVE_POSIX_MEMALIGN
+- std::cout << "posix_memalign" << std::endl;
+- #elif defined(_MSC_VER)
+- std::cout << "_aligned_malloc" << std::endl;
+- #else
+- std::cout << "C11 aligned_alloc" << std::endl;
+- #endif
++ // You don't want to change the volk_malloc code, so just copy the if/else
++ // structure from there and give an explanation for the implementations
++ std::cout << "Used malloc implementation: ";
++#if HAVE_POSIX_MEMALIGN
++ std::cout << "posix_memalign" << std::endl;
++#elif defined(_MSC_VER)
++ std::cout << "_aligned_malloc" << std::endl;
++#else
++ std::cout << "C11 aligned_alloc" << std::endl;
++#endif
+ }
+
+
+-int
+-main(int argc, char **argv)
++int main(int argc, char** argv)
+ {
+
+- option_list our_options("volk-config-info");
+- our_options.add(option_t("prefix", "", "print the VOLK installation prefix", volk_prefix()));
+- our_options.add(option_t("cc", "", "print the VOLK C compiler version", volk_c_compiler()));
+- our_options.add(option_t("cflags", "", "print the VOLK CFLAGS", volk_compiler_flags()));
+- our_options.add(option_t("all-machines", "", "print VOLK machines built", volk_available_machines()));
+- our_options.add(option_t("avail-machines", "", "print VOLK machines on the current "
+- "platform", volk_list_machines));
+- our_options.add(option_t("machine", "", "print the current VOLK machine that will be used",
+- volk_get_machine()));
+- our_options.add(option_t("alignment", "", "print the memory alignment", print_alignment));
+- our_options.add(option_t("malloc", "", "print the malloc implementation used in volk_malloc",
+- print_malloc));
+- our_options.add(option_t("version", "v", "print the VOLK version", volk_version()));
++ option_list our_options("volk-config-info");
++ our_options.add(
++ option_t("prefix", "", "print the VOLK installation prefix", volk_prefix()));
++ our_options.add(
++ option_t("cc", "", "print the VOLK C compiler version", volk_c_compiler()));
++ our_options.add(
++ option_t("cflags", "", "print the VOLK CFLAGS", volk_compiler_flags()));
++ our_options.add(option_t(
++ "all-machines", "", "print VOLK machines built", volk_available_machines()));
++ our_options.add(option_t("avail-machines",
++ "",
++ "print VOLK machines on the current "
++ "platform",
++ volk_list_machines));
++ our_options.add(option_t("machine",
++ "",
++ "print the current VOLK machine that will be used",
++ volk_get_machine()));
++ our_options.add(
++ option_t("alignment", "", "print the memory alignment", print_alignment));
++ our_options.add(option_t("malloc",
++ "",
++ "print the malloc implementation used in volk_malloc",
++ print_malloc));
++ our_options.add(option_t("version", "v", "print the VOLK version", volk_version()));
+
+- our_options.parse(argc, argv);
++ our_options.parse(argc, argv);
+
+- return 0;
++ return 0;
+ }
+diff --git a/apps/volk_option_helpers.cc b/apps/volk_option_helpers.cc
+index 4299709..73d51da 100644
+--- a/apps/volk_option_helpers.cc
++++ b/apps/volk_option_helpers.cc
+@@ -4,66 +4,97 @@
+
+ #include "volk_option_helpers.h"
+
+-#include <exception> // for exception
+-#include <iostream> // for operator<<, endl, basic_ostream, cout, ostream
+-#include <utility> // for pair
+-#include <limits.h> // IWYU pragma: keep
+-#include <cstring> // IWYU pragma: keep
+-#include <cstdlib> // IWYU pragma: keep
++#include <limits.h> // IWYU pragma: keep
++#include <cstdlib> // IWYU pragma: keep
++#include <cstring> // IWYU pragma: keep
++#include <exception> // for exception
++#include <iostream> // for operator<<, endl, basic_ostream, cout, ostream
++#include <utility> // for pair
+
+ /*
+ * Option type
+ */
+-option_t::option_t(std::string longform, std::string shortform, std::string msg, void (*callback)())
+- : longform("--" + longform),
+- shortform("-" + shortform),
+- msg(msg),
+- callback(callback) { option_type = VOID_CALLBACK; }
+-
+-option_t::option_t(std::string longform, std::string shortform, std::string msg, void (*callback)(int))
+- : longform("--" + longform),
+- shortform("-" + shortform),
+- msg(msg),
+- callback((void (*)()) callback) { option_type = INT_CALLBACK; }
+-
+-option_t::option_t(std::string longform, std::string shortform, std::string msg, void (*callback)(float))
+- : longform("--" + longform),
+- shortform("-" + shortform),
+- msg(msg),
+- callback((void (*)()) callback) { option_type = FLOAT_CALLBACK; }
+-
+-option_t::option_t(std::string longform, std::string shortform, std::string msg, void (*callback)(bool))
+- : longform("--" + longform),
+- shortform("-" + shortform),
+- msg(msg),
+- callback((void (*)()) callback) { option_type = BOOL_CALLBACK; }
+-
+-option_t::option_t(std::string longform, std::string shortform, std::string msg, void (*callback)(std::string))
+- : longform("--" + longform),
+- shortform("-" + shortform),
+- msg(msg),
+- callback((void (*)()) callback) { option_type = STRING_CALLBACK; }
+-
+-option_t::option_t(std::string longform, std::string shortform, std::string msg, std::string printval)
+- : longform("--" + longform),
+- shortform("-" + shortform),
+- msg(msg),
+- printval(printval) { option_type = STRING; }
++option_t::option_t(std::string longform,
++ std::string shortform,
++ std::string msg,
++ void (*callback)())
++ : longform("--" + longform), shortform("-" + shortform), msg(msg), callback(callback)
++{
++ option_type = VOID_CALLBACK;
++}
++
++option_t::option_t(std::string longform,
++ std::string shortform,
++ std::string msg,
++ void (*callback)(int))
++ : longform("--" + longform),
++ shortform("-" + shortform),
++ msg(msg),
++ callback((void (*)())callback)
++{
++ option_type = INT_CALLBACK;
++}
++
++option_t::option_t(std::string longform,
++ std::string shortform,
++ std::string msg,
++ void (*callback)(float))
++ : longform("--" + longform),
++ shortform("-" + shortform),
++ msg(msg),
++ callback((void (*)())callback)
++{
++ option_type = FLOAT_CALLBACK;
++}
++
++option_t::option_t(std::string longform,
++ std::string shortform,
++ std::string msg,
++ void (*callback)(bool))
++ : longform("--" + longform),
++ shortform("-" + shortform),
++ msg(msg),
++ callback((void (*)())callback)
++{
++ option_type = BOOL_CALLBACK;
++}
++
++option_t::option_t(std::string longform,
++ std::string shortform,
++ std::string msg,
++ void (*callback)(std::string))
++ : longform("--" + longform),
++ shortform("-" + shortform),
++ msg(msg),
++ callback((void (*)())callback)
++{
++ option_type = STRING_CALLBACK;
++}
++
++option_t::option_t(std::string longform,
++ std::string shortform,
++ std::string msg,
++ std::string printval)
++ : longform("--" + longform), shortform("-" + shortform), msg(msg), printval(printval)
++{
++ option_type = STRING;
++}
+
+
+ /*
+ * Option List
+ */
+
+-option_list::option_list(std::string program_name) :
+- program_name(program_name) {
++option_list::option_list(std::string program_name) : program_name(program_name)
++{
+ internal_list = std::vector<option_t>();
+ }
+
+
+ void option_list::add(option_t opt) { internal_list.push_back(opt); }
+
+-void option_list::parse(int argc, char **argv) {
++void option_list::parse(int argc, char** argv)
++{
+ for (int arg_number = 0; arg_number < argc; ++arg_number) {
+ for (std::vector<option_t>::iterator this_option = internal_list.begin();
+ this_option != internal_list.end();
+@@ -73,74 +104,83 @@ void option_list::parse(int argc, char **argv) {
+ this_option->shortform == std::string(argv[arg_number])) {
+
+ if (present_options.count(this_option->longform) == 0) {
+- present_options.insert(std::pair<std::string, int>(this_option->longform, 1));
++ present_options.insert(
++ std::pair<std::string, int>(this_option->longform, 1));
+ } else {
+ present_options[this_option->longform] += 1;
+ }
+ switch (this_option->option_type) {
+- case VOID_CALLBACK:
+- this_option->callback();
+- break;
+- case INT_CALLBACK:
+- try {
+- int_val = atoi(argv[++arg_number]);
+- ((void (*)(int)) this_option->callback)(int_val);
+- } catch (std::exception &exc) {
+- std::cout << "An int option can only receive a number" << std::endl;
+- throw std::exception();
+- };
+- break;
+- case FLOAT_CALLBACK:
+- try {
+- double double_val = atof(argv[++arg_number]);
+- ((void (*)(float)) this_option->callback)(double_val);
+- } catch (std::exception &exc) {
+- std::cout << "A float option can only receive a number" << std::endl;
+- throw std::exception();
+- };
+- break;
+- case BOOL_CALLBACK:
+- try {
+- if (arg_number == (argc - 1)) { // this is the last arg
++ case VOID_CALLBACK:
++ this_option->callback();
++ break;
++ case INT_CALLBACK:
++ try {
++ int_val = atoi(argv[++arg_number]);
++ ((void (*)(int))this_option->callback)(int_val);
++ } catch (std::exception& exc) {
++ std::cout << "An int option can only receive a number"
++ << std::endl;
++ throw std::exception();
++ };
++ break;
++ case FLOAT_CALLBACK:
++ try {
++ double double_val = atof(argv[++arg_number]);
++ ((void (*)(float))this_option->callback)(double_val);
++ } catch (std::exception& exc) {
++ std::cout << "A float option can only receive a number"
++ << std::endl;
++ throw std::exception();
++ };
++ break;
++ case BOOL_CALLBACK:
++ try {
++ if (arg_number == (argc - 1)) { // this is the last arg
++ int_val = 1;
++ } else { // sneak a look at the next arg since it's present
++ char* next_arg = argv[arg_number + 1];
++ if ((strncmp(next_arg, "-", 1) == 0) ||
++ (strncmp(next_arg, "--", 2) == 0)) {
++ // the next arg is actually an arg, the bool is just
++ // present, set to true
++ int_val = 1;
++ } else if (strncmp(next_arg, "true", 4) == 0) {
+ int_val = 1;
+- } else { // sneak a look at the next arg since it's present
+- char *next_arg = argv[arg_number + 1];
+- if ((strncmp(next_arg, "-", 1) == 0) || (strncmp(next_arg, "--", 2) == 0)) {
+- // the next arg is actually an arg, the bool is just present, set to true
+- int_val = 1;
+- } else if (strncmp(next_arg, "true", 4) == 0) {
+- int_val = 1;
+- } else if (strncmp(next_arg, "false", 5) == 0) {
+- int_val = 0;
+- } else {
+- // we got a number or a string.
+- // convert it to a number and depend on the catch to report an error condition
+- int_val = (bool) atoi(argv[++arg_number]);
+- }
++ } else if (strncmp(next_arg, "false", 5) == 0) {
++ int_val = 0;
++ } else {
++ // we got a number or a string.
++ // convert it to a number and depend on the catch to
++ // report an error condition
++ int_val = (bool)atoi(argv[++arg_number]);
+ }
+- } catch (std::exception &e) {
+- int_val = INT_MIN;
+- };
+- if (int_val == INT_MIN) {
+- std::cout << "option: '" << argv[arg_number - 1] << "' -> received an unknown value. Boolean "
+- "options should receive one of '0', '1', 'true', 'false'." << std::endl;
+- throw std::exception();
+- } else if (int_val) {
+- ((void (*)(bool)) this_option->callback)(int_val);
+ }
+- break;
+- case STRING_CALLBACK:
+- try {
+- ((void (*)(std::string)) this_option->callback)(argv[++arg_number]);
+- } catch (std::exception &exc) {
+- throw std::exception();
+- };
+- case STRING:
+- std::cout << this_option->printval << std::endl;
+- break;
++ } catch (std::exception& e) {
++ int_val = INT_MIN;
++ };
++ if (int_val == INT_MIN) {
++ std::cout
++ << "option: '" << argv[arg_number - 1]
++ << "' -> received an unknown value. Boolean "
++ "options should receive one of '0', '1', 'true', 'false'."
++ << std::endl;
++ throw std::exception();
++ } else if (int_val) {
++ ((void (*)(bool))this_option->callback)(int_val);
++ }
++ break;
++ case STRING_CALLBACK:
++ try {
++ ((void (*)(std::string))this_option->callback)(
++ argv[++arg_number]);
++ } catch (std::exception& exc) {
++ throw std::exception();
++ };
++ case STRING:
++ std::cout << this_option->printval << std::endl;
++ break;
+ }
+ }
+-
+ }
+ if (std::string("--help") == std::string(argv[arg_number]) ||
+ std::string("-h") == std::string(argv[arg_number])) {
+@@ -150,7 +190,8 @@ void option_list::parse(int argc, char **argv) {
+ }
+ }
+
+-bool option_list::present(std::string option_name) {
++bool option_list::present(std::string option_name)
++{
+ if (present_options.count("--" + option_name)) {
+ return true;
+ } else {
+@@ -158,7 +199,8 @@ bool option_list::present(std::string option_name) {
+ }
+ }
+
+-void option_list::help() {
++void option_list::help()
++{
+ std::cout << program_name << std::endl;
+ std::cout << " -h [ --help ] \t\tdisplay this help message" << std::endl;
+ for (std::vector<option_t>::iterator this_option = internal_list.begin();
+@@ -172,14 +214,14 @@ void option_list::help() {
+ }
+
+ switch (help_line.size() / 8) {
+- case 0:
+- help_line += "\t";
+- case 1:
+- help_line += "\t";
+- case 2:
+- help_line += "\t";
+- case 3:
+- help_line += "\t";
++ case 0:
++ help_line += "\t";
++ case 1:
++ help_line += "\t";
++ case 2:
++ help_line += "\t";
++ case 3:
++ help_line += "\t";
+ }
+ help_line += this_option->msg;
+ std::cout << help_line << std::endl;
+diff --git a/apps/volk_option_helpers.h b/apps/volk_option_helpers.h
+index 8a71547..0756caf 100644
+--- a/apps/volk_option_helpers.h
++++ b/apps/volk_option_helpers.h
+@@ -5,56 +5,74 @@
+ #ifndef VOLK_VOLK_OPTION_HELPERS_H
+ #define VOLK_VOLK_OPTION_HELPERS_H
+
+-#include <string>
+-#include <cstring>
+ #include <limits.h>
+-#include <vector>
++#include <cstring>
+ #include <map>
++#include <string>
++#include <vector>
+
+-typedef enum
+-{
+- VOID_CALLBACK,
++typedef enum {
++ VOID_CALLBACK,
+ INT_CALLBACK,
+ BOOL_CALLBACK,
+ STRING_CALLBACK,
+ FLOAT_CALLBACK,
+- STRING,
++ STRING,
+ } VOLK_OPTYPE;
+
+-class option_t {
+- public:
+- option_t(std::string longform, std::string shortform, std::string msg, void (*callback)());
+- option_t(std::string longform, std::string shortform, std::string msg, void (*callback)(int));
+- option_t(std::string longform, std::string shortform, std::string msg, void (*callback)(float));
+- option_t(std::string longform, std::string shortform, std::string msg, void (*callback)(bool));
+- option_t(std::string longform, std::string shortform, std::string msg, void (*callback)(std::string));
+- option_t(std::string longform, std::string shortform, std::string msg, std::string printval);
+-
+- std::string longform;
+- std::string shortform;
+- std::string msg;
+- VOLK_OPTYPE option_type;
+- std::string printval;
+- void (*callback)();
++class option_t
++{
++public:
++ option_t(std::string longform,
++ std::string shortform,
++ std::string msg,
++ void (*callback)());
++ option_t(std::string longform,
++ std::string shortform,
++ std::string msg,
++ void (*callback)(int));
++ option_t(std::string longform,
++ std::string shortform,
++ std::string msg,
++ void (*callback)(float));
++ option_t(std::string longform,
++ std::string shortform,
++ std::string msg,
++ void (*callback)(bool));
++ option_t(std::string longform,
++ std::string shortform,
++ std::string msg,
++ void (*callback)(std::string));
++ option_t(std::string longform,
++ std::string shortform,
++ std::string msg,
++ std::string printval);
+
++ std::string longform;
++ std::string shortform;
++ std::string msg;
++ VOLK_OPTYPE option_type;
++ std::string printval;
++ void (*callback)();
+ };
+
+ class option_list
+ {
+- public:
+- option_list(std::string program_name);
+- bool present(std::string option_name);
++public:
++ option_list(std::string program_name);
++ bool present(std::string option_name);
++
++ void add(option_t opt);
+
+- void add(option_t opt);
++ void parse(int argc, char** argv);
+
+- void parse(int argc, char **argv);
++ void help();
+
+- void help();
+- private:
+- std::string program_name;
+- std::vector<option_t> internal_list;
+- std::map<std::string, int> present_options;
++private:
++ std::string program_name;
++ std::vector<option_t> internal_list;
++ std::map<std::string, int> present_options;
+ };
+
+
+-#endif //VOLK_VOLK_OPTION_HELPERS_H
++#endif // VOLK_VOLK_OPTION_HELPERS_H
+diff --git a/apps/volk_profile.cc b/apps/volk_profile.cc
+index 4ef5aeb..3c2e324 100644
+--- a/apps/volk_profile.cc
++++ b/apps/volk_profile.cc
+@@ -27,23 +27,23 @@
+ #include <filesystem>
+ #endif
+ #else
+-#include <boost/filesystem/operations.hpp> // for create_directories, exists
+-#include <boost/filesystem/path.hpp> // for path, operator<<
+-#include <boost/filesystem/path_traits.hpp> // for filesystem
++#include <boost/filesystem/operations.hpp> // for create_directories, exists
++#include <boost/filesystem/path.hpp> // for path, operator<<
++#include <boost/filesystem/path_traits.hpp> // for filesystem
+ #endif
+-#include <stddef.h> // for size_t
+-#include <sys/stat.h> // for stat
+-#include <volk/volk_prefs.h> // for volk_get_config_path
+-#include <iostream> // for operator<<, basic_ostream
+-#include <fstream> // IWYU pragma: keep
+-#include <map> // for map, map<>::iterator
+-#include <utility> // for pair
+-#include <vector> // for vector, vector<>::const_...
+-
+-#include "kernel_tests.h" // for init_test_list
+-#include "qa_utils.h" // for volk_test_results_t, vol...
+-#include "volk/volk_complex.h" // for lv_32fc_t
+-#include "volk_option_helpers.h" // for option_list, option_t
++#include <stddef.h> // for size_t
++#include <sys/stat.h> // for stat
++#include <volk/volk_prefs.h> // for volk_get_config_path
++#include <fstream> // IWYU pragma: keep
++#include <iostream> // for operator<<, basic_ostream
++#include <map> // for map, map<>::iterator
++#include <utility> // for pair
++#include <vector> // for vector, vector<>::const_...
++
++#include "kernel_tests.h" // for init_test_list
++#include "qa_utils.h" // for volk_test_results_t, vol...
++#include "volk/volk_complex.h" // for lv_32fc_t
++#include "volk_option_helpers.h" // for option_list, option_t
+ #include "volk_profile.h"
+
+ #if HAS_STD_FILESYSTEM
+@@ -72,45 +72,61 @@ void set_json(std::string val) { json_filename = val; }
+ std::string volk_config_path("");
+ void set_volk_config(std::string val) { volk_config_path = val; }
+
+-int main(int argc, char *argv[]) {
++int main(int argc, char* argv[])
++{
+
+ option_list profile_options("volk_profile");
+- profile_options.add(option_t("benchmark", "b", "Run all kernels (benchmark mode)", set_benchmark));
+- profile_options.add(option_t("tol", "t", "Set the default tolerance for all tests", set_tolerance));
+- profile_options.add(option_t("vlen", "v", "Set the default vector length for tests", set_vlen));
+- profile_options.add((option_t("iter", "i", "Set the default number of test iterations per kernel", set_iter)));
+- profile_options.add((option_t("tests-substr", "R", "Run tests matching substring", set_substr)));
+- profile_options.add((option_t("update", "u", "Run only kernels missing from config", set_update)));
+- profile_options.add((option_t("dry-run", "n", "Dry run. Respect other options, but don't write to file", set_dryrun)));
+- profile_options.add((option_t("json", "j", "Write results to JSON file named as argument value", set_json)));
+- profile_options.add((option_t("path", "p", "Specify the volk_config path", set_volk_config)));
++ profile_options.add(
++ option_t("benchmark", "b", "Run all kernels (benchmark mode)", set_benchmark));
++ profile_options.add(
++ option_t("tol", "t", "Set the default tolerance for all tests", set_tolerance));
++ profile_options.add(
++ option_t("vlen", "v", "Set the default vector length for tests", set_vlen));
++ profile_options.add((option_t(
++ "iter", "i", "Set the default number of test iterations per kernel", set_iter)));
++ profile_options.add(
++ (option_t("tests-substr", "R", "Run tests matching substring", set_substr)));
++ profile_options.add(
++ (option_t("update", "u", "Run only kernels missing from config", set_update)));
++ profile_options.add(
++ (option_t("dry-run",
++ "n",
++ "Dry run. Respect other options, but don't write to file",
++ set_dryrun)));
++ profile_options.add((option_t(
++ "json", "j", "Write results to JSON file named as argument value", set_json)));
++ profile_options.add(
++ (option_t("path", "p", "Specify the volk_config path", set_volk_config)));
+ profile_options.parse(argc, argv);
+
+ if (profile_options.present("help")) {
+ return 0;
+ }
+
+- if(dry_run) {
+- std::cout << "Warning: this IS a dry-run. Config will not be written!" << std::endl;
++ if (dry_run) {
++ std::cout << "Warning: this IS a dry-run. Config will not be written!"
++ << std::endl;
+ }
+
+ // Adding program options
+ std::ofstream json_file;
+ std::string config_file;
+
+- if ( json_filename != "" ) {
+- json_file.open( json_filename.c_str() );
++ if (json_filename != "") {
++ json_file.open(json_filename.c_str());
+ }
+
+- if ( volk_config_path != "" ) {
++ if (volk_config_path != "") {
+ config_file = volk_config_path + "/volk_config";
+ }
+
+ // Run tests
+ std::vector<volk_test_results_t> results;
+- if(update_mode) {
+- if( config_file != "" ) read_results(&results, config_file);
+- else read_results(&results);
++ if (update_mode) {
++ if (config_file != "")
++ read_results(&results, config_file);
++ else
++ read_results(&results);
+ }
+
+ // Initialize the list of tests
+@@ -118,22 +134,22 @@ int main(int argc, char *argv[]) {
+
+ // Iterate through list of tests running each one
+ std::string substr_to_match(test_params.kernel_regex());
+- for(unsigned int ii = 0; ii < test_cases.size(); ++ii) {
++ for (unsigned int ii = 0; ii < test_cases.size(); ++ii) {
+ bool regex_match = true;
+
+ volk_test_case_t test_case = test_cases[ii];
+ // if the kernel name matches regex then do the test
+ std::string test_case_name = test_case.name();
+- if(test_case_name.find(substr_to_match) == std::string::npos) {
++ if (test_case_name.find(substr_to_match) == std::string::npos) {
+ regex_match = false;
+ }
+
+ // if we are in update mode check if we've already got results
+ // if we have any, then no need to test that kernel
+ bool update = true;
+- if(update_mode) {
+- for(unsigned int jj=0; jj < results.size(); ++jj) {
+- if(results[jj].name == test_case.name() ||
++ if (update_mode) {
++ for (unsigned int jj = 0; jj < results.size(); ++jj) {
++ if (results[jj].name == test_case.name() ||
+ results[jj].name == test_case.puppet_master_name()) {
+ update = false;
+ break;
+@@ -141,39 +157,44 @@ int main(int argc, char *argv[]) {
+ }
+ }
+
+- if( regex_match && update ) {
++ if (regex_match && update) {
+ try {
+- run_volk_tests(test_case.desc(), test_case.kernel_ptr(), test_case.name(),
+- test_case.test_parameters(), &results, test_case.puppet_master_name());
+- }
+- catch (std::string &error) {
+- std::cerr << "Caught Exception in 'run_volk_tests': " << error << std::endl;
++ run_volk_tests(test_case.desc(),
++ test_case.kernel_ptr(),
++ test_case.name(),
++ test_case.test_parameters(),
++ &results,
++ test_case.puppet_master_name());
++ } catch (std::string& error) {
++ std::cerr << "Caught Exception in 'run_volk_tests': " << error
++ << std::endl;
+ }
+ }
+ }
+
+
+ // Output results according to provided options
+- if(json_filename != "") {
++ if (json_filename != "") {
+ write_json(json_file, results);
+ json_file.close();
+ }
+
+- if(!dry_run) {
+- if(config_file != "") write_results(&results, false, config_file);
+- else write_results(&results, false);
+- }
+- else {
++ if (!dry_run) {
++ if (config_file != "")
++ write_results(&results, false, config_file);
++ else
++ write_results(&results, false);
++ } else {
+ std::cout << "Warning: this was a dry-run. Config not generated" << std::endl;
+ }
+ return 0;
+ }
+
+-void read_results(std::vector<volk_test_results_t> *results)
++void read_results(std::vector<volk_test_results_t>* results)
+ {
+ char path[1024];
+ volk_get_config_path(path, true);
+- if(path[0] == 0){
++ if (path[0] == 0) {
+ std::cout << "No prior test results found ..." << std::endl;
+ return;
+ }
+@@ -181,16 +202,16 @@ void read_results(std::vector<volk_test_results_t> *results)
+ read_results(results, std::string(path));
+ }
+
+-void read_results(std::vector<volk_test_results_t> *results, std::string path)
++void read_results(std::vector<volk_test_results_t>* results, std::string path)
+ {
+ struct stat buffer;
+- bool config_status = (stat (path.c_str(), &buffer) == 0);
++ bool config_status = (stat(path.c_str(), &buffer) == 0);
+
+- if( config_status ) {
++ if (config_status) {
+ // a config exists and we are reading results from it
+ std::ifstream config(path.c_str());
+ char config_line[256];
+- while(config.getline(config_line, 255)) {
++ while (config.getline(config_line, 255)) {
+ // tokenize the input line by kernel_name unaligned aligned
+ // then push back in the results vector with fields filled in
+
+@@ -198,26 +219,26 @@ void read_results(std::vector<volk_test_results_t> *results, std::string path)
+ std::string config_str(config_line);
+ std::size_t str_size = config_str.size();
+ std::size_t found = config_str.find(' ');
+-
++
+ // Split line by spaces
+- while(found && found < str_size) {
++ while (found && found < str_size) {
+ found = config_str.find(' ');
+ // kernel names MUST be less than 128 chars, which is
+ // a length restricted by volk/volk_prefs.c
+ // on the last token in the parsed string we won't find a space
+ // so make sure we copy at most 128 chars.
+- if(found > 127) {
++ if (found > 127) {
+ found = 127;
+ }
+ str_size = config_str.size();
+- char buffer[128] = {'\0'};
++ char buffer[128] = { '\0' };
+ config_str.copy(buffer, found + 1, 0);
+ buffer[found] = '\0';
+ single_kernel_result.push_back(std::string(buffer));
+- config_str.erase(0, found+1);
++ config_str.erase(0, found + 1);
+ }
+
+- if(single_kernel_result.size() == 3) {
++ if (single_kernel_result.size() == 3) {
+ volk_test_results_t kernel_result;
+ kernel_result.name = std::string(single_kernel_result[0]);
+ kernel_result.config_name = std::string(single_kernel_result[0]);
+@@ -229,45 +250,47 @@ void read_results(std::vector<volk_test_results_t> *results, std::string path)
+ }
+ }
+
+-void write_results(const std::vector<volk_test_results_t> *results, bool update_result)
++void write_results(const std::vector<volk_test_results_t>* results, bool update_result)
+ {
+ char path[1024];
+ volk_get_config_path(path, false);
+- if(path[0] == 0){
++ if (path[0] == 0) {
+ std::cout << "Aborting 'No config save path found' ..." << std::endl;
+ return;
+ }
+
+- write_results( results, update_result, std::string(path));
++ write_results(results, update_result, std::string(path));
+ }
+
+-void write_results(const std::vector<volk_test_results_t> *results, bool update_result, const std::string path)
++void write_results(const std::vector<volk_test_results_t>* results,
++ bool update_result,
++ const std::string path)
+ {
+-// struct stat buffer;
+-// bool config_status = (stat (path.c_str(), &buffer) == 0);
++ // struct stat buffer;
++ // bool config_status = (stat (path.c_str(), &buffer) == 0);
+
+ /*
+ * These
+ */
+ const fs::path config_path(path);
+- if (! fs::exists(config_path.parent_path()))
+- {
++ if (!fs::exists(config_path.parent_path())) {
+ std::cout << "Creating " << config_path.parent_path() << "..." << std::endl;
+ fs::create_directories(config_path.parent_path());
+ }
+
+ std::ofstream config;
+- if(update_result) {
++ if (update_result) {
+ std::cout << "Updating " << path << "..." << std::endl;
+ config.open(path.c_str(), std::ofstream::app);
+- if (!config.is_open()) { //either we don't have write access or we don't have the dir yet
++ if (!config.is_open()) { // either we don't have write access or we don't have the
++ // dir yet
+ std::cout << "Error opening file " << path << std::endl;
+ }
+- }
+- else {
++ } else {
+ std::cout << "Writing " << path << "..." << std::endl;
+ config.open(path.c_str());
+- if (!config.is_open()) { //either we don't have write access or we don't have the dir yet
++ if (!config.is_open()) { // either we don't have write access or we don't have the
++ // dir yet
+ std::cout << "Error opening file " << path << std::endl;
+ }
+
+@@ -278,43 +301,45 @@ void write_results(const std::vector<volk_test_results_t> *results, bool update_
+ }
+
+ std::vector<volk_test_results_t>::const_iterator profile_results;
+- for(profile_results = results->begin(); profile_results != results->end(); ++profile_results) {
+- config << profile_results->config_name << " "
+- << profile_results->best_arch_a << " "
+- << profile_results->best_arch_u << std::endl;
++ for (profile_results = results->begin(); profile_results != results->end();
++ ++profile_results) {
++ config << profile_results->config_name << " " << profile_results->best_arch_a
++ << " " << profile_results->best_arch_u << std::endl;
+ }
+ config.close();
+ }
+
+-void write_json(std::ofstream &json_file, std::vector<volk_test_results_t> results)
++void write_json(std::ofstream& json_file, std::vector<volk_test_results_t> results)
+ {
+ json_file << "{" << std::endl;
+ json_file << " \"volk_tests\": [" << std::endl;
+ size_t len = results.size();
+ size_t i = 0;
+ std::vector<volk_test_results_t>::iterator result;
+- for(result = results.begin(); result != results.end(); ++result) {
++ for (result = results.begin(); result != results.end(); ++result) {
+ json_file << " {" << std::endl;
+ json_file << " \"name\": \"" << result->name << "\"," << std::endl;
+ json_file << " \"vlen\": " << (int)(result->vlen) << "," << std::endl;
+ json_file << " \"iter\": " << result->iter << "," << std::endl;
+- json_file << " \"best_arch_a\": \"" << result->best_arch_a
+- << "\"," << std::endl;
+- json_file << " \"best_arch_u\": \"" << result->best_arch_u
+- << "\"," << std::endl;
++ json_file << " \"best_arch_a\": \"" << result->best_arch_a << "\","
++ << std::endl;
++ json_file << " \"best_arch_u\": \"" << result->best_arch_u << "\","
++ << std::endl;
+ json_file << " \"results\": {" << std::endl;
+ size_t results_len = result->results.size();
+ size_t ri = 0;
+
+ std::map<std::string, volk_test_time_t>::iterator kernel_time_pair;
+- for(kernel_time_pair = result->results.begin(); kernel_time_pair != result->results.end(); ++kernel_time_pair) {
++ for (kernel_time_pair = result->results.begin();
++ kernel_time_pair != result->results.end();
++ ++kernel_time_pair) {
+ volk_test_time_t time = kernel_time_pair->second;
+ json_file << " \"" << time.name << "\": {" << std::endl;
+ json_file << " \"name\": \"" << time.name << "\"," << std::endl;
+ json_file << " \"time\": " << time.time << "," << std::endl;
+ json_file << " \"units\": \"" << time.units << "\"" << std::endl;
+- json_file << " }" ;
+- if(ri+1 != results_len) {
++ json_file << " }";
++ if (ri + 1 != results_len) {
+ json_file << ",";
+ }
+ json_file << std::endl;
+@@ -322,7 +347,7 @@ void write_json(std::ofstream &json_file, std::vector<volk_test_results_t> resul
+ }
+ json_file << " }" << std::endl;
+ json_file << " }";
+- if(i+1 != len) {
++ if (i + 1 != len) {
+ json_file << ",";
+ }
+ json_file << std::endl;
+diff --git a/apps/volk_profile.h b/apps/volk_profile.h
+index 51629ab..ae3b474 100644
+--- a/apps/volk_profile.h
++++ b/apps/volk_profile.h
+@@ -1,14 +1,16 @@
+
+
+-#include <stdbool.h> // for bool
+-#include <iosfwd> // for ofstream
+-#include <string> // for string
+-#include <vector> // for vector
++#include <stdbool.h> // for bool
++#include <iosfwd> // for ofstream
++#include <string> // for string
++#include <vector> // for vector
+
+ class volk_test_results_t;
+
+-void read_results(std::vector<volk_test_results_t> *results);
+-void read_results(std::vector<volk_test_results_t> *results, std::string path);
+-void write_results(const std::vector<volk_test_results_t> *results, bool update_result);
+-void write_results(const std::vector<volk_test_results_t> *results, bool update_result, const std::string path);
+-void write_json(std::ofstream &json_file, std::vector<volk_test_results_t> results);
++void read_results(std::vector<volk_test_results_t>* results);
++void read_results(std::vector<volk_test_results_t>* results, std::string path);
++void write_results(const std::vector<volk_test_results_t>* results, bool update_result);
++void write_results(const std::vector<volk_test_results_t>* results,
++ bool update_result,
++ const std::string path);
++void write_json(std::ofstream& json_file, std::vector<volk_test_results_t> results);
+diff --git a/cmake/msvc/config.h b/cmake/msvc/config.h
+index 8b12c2a..68f716e 100644
+--- a/cmake/msvc/config.h
++++ b/cmake/msvc/config.h
+@@ -9,7 +9,7 @@
+ // enable inline functions for C code
+ ////////////////////////////////////////////////////////////////////////
+ #ifndef __cplusplus
+-# define inline __inline
++#define inline __inline
+ #endif
+
+ ////////////////////////////////////////////////////////////////////////
+@@ -23,12 +23,21 @@ typedef ptrdiff_t ssize_t;
+ ////////////////////////////////////////////////////////////////////////
+ #if _MSC_VER < 1800
+ #include <math.h>
+-static inline long lrint(double x){return (long)(x > 0.0 ? x + 0.5 : x - 0.5);}
+-static inline long lrintf(float x){return (long)(x > 0.0f ? x + 0.5f : x - 0.5f);}
+-static inline long long llrint(double x){return (long long)(x > 0.0 ? x + 0.5 : x - 0.5);}
+-static inline long long llrintf(float x){return (long long)(x > 0.0f ? x + 0.5f : x - 0.5f);}
+-static inline double rint(double x){return (x > 0.0)? floor(x + 0.5) : ceil(x - 0.5);}
+-static inline float rintf(float x){return (x > 0.0f)? floorf(x + 0.5f) : ceilf(x - 0.5f);}
++static inline long lrint(double x) { return (long)(x > 0.0 ? x + 0.5 : x - 0.5); }
++static inline long lrintf(float x) { return (long)(x > 0.0f ? x + 0.5f : x - 0.5f); }
++static inline long long llrint(double x)
++{
++ return (long long)(x > 0.0 ? x + 0.5 : x - 0.5);
++}
++static inline long long llrintf(float x)
++{
++ return (long long)(x > 0.0f ? x + 0.5f : x - 0.5f);
++}
++static inline double rint(double x) { return (x > 0.0) ? floor(x + 0.5) : ceil(x - 0.5); }
++static inline float rintf(float x)
++{
++ return (x > 0.0f) ? floorf(x + 0.5f) : ceilf(x - 0.5f);
++}
+ #endif
+
+ ////////////////////////////////////////////////////////////////////////
+@@ -43,7 +52,7 @@ static inline float rintf(float x){return (x > 0.0f)? floorf(x + 0.5f) : ceilf(x
+ // random and srandom
+ ////////////////////////////////////////////////////////////////////////
+ #include <stdlib.h>
+-static inline long int random (void) { return rand(); }
+-static inline void srandom (unsigned int seed) { srand(seed); }
++static inline long int random(void) { return rand(); }
++static inline void srandom(unsigned int seed) { srand(seed); }
+
+ #endif // _MSC_CONFIG_H_ ]
+diff --git a/cmake/msvc/sys/time.h b/cmake/msvc/sys/time.h
+index aa0f5dc..4bda1ba 100644
+--- a/cmake/msvc/sys/time.h
++++ b/cmake/msvc/sys/time.h
+@@ -10,67 +10,62 @@
+ #define NOMINMAX
+ #endif
+
+-//http://social.msdn.microsoft.com/Forums/en/vcgeneral/thread/430449b3-f6dd-4e18-84de-eebd26a8d668
++// http://social.msdn.microsoft.com/Forums/en/vcgeneral/thread/430449b3-f6dd-4e18-84de-eebd26a8d668
+ #include < time.h >
+ #include <windows.h> //I've omitted this line.
+ #if defined(_MSC_VER) || defined(_MSC_EXTENSIONS)
+- #define DELTA_EPOCH_IN_MICROSECS 11644473600000000Ui64
++#define DELTA_EPOCH_IN_MICROSECS 11644473600000000Ui64
+ #else
+- #define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
++#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
+ #endif
+
+ #if _MSC_VER < 1900
+ struct timespec {
+
+-time_t tv_sec; /* Seconds since 00:00:00 GMT, */
++ time_t tv_sec; /* Seconds since 00:00:00 GMT, */
+
+-/* 1 January 1970 */
++ /* 1 January 1970 */
+
+-long tv_nsec; /* Additional nanoseconds since */
+-
+-/* tv_sec */
++ long tv_nsec; /* Additional nanoseconds since */
+
++ /* tv_sec */
+ };
+ #endif
+
+-struct timezone
+-{
+- int tz_minuteswest; /* minutes W of Greenwich */
+- int tz_dsttime; /* type of dst correction */
++struct timezone {
++ int tz_minuteswest; /* minutes W of Greenwich */
++ int tz_dsttime; /* type of dst correction */
+ };
+
+-static inline int gettimeofday(struct timeval *tv, struct timezone *tz)
++static inline int gettimeofday(struct timeval* tv, struct timezone* tz)
+ {
+- FILETIME ft;
+- unsigned __int64 tmpres = 0;
+- static int tzflag;
+-
+- if (NULL != tv)
+- {
+- GetSystemTimeAsFileTime(&ft);
+-
+- tmpres |= ft.dwHighDateTime;
+- tmpres <<= 32;
+- tmpres |= ft.dwLowDateTime;
+-
+- /*converting file time to unix epoch*/
+- tmpres -= DELTA_EPOCH_IN_MICROSECS;
+- tv->tv_sec = (long)(tmpres / 1000000UL);
+- tv->tv_usec = (long)(tmpres % 1000000UL);
+- }
+-
+- if (NULL != tz)
+- {
+- if (!tzflag)
+- {
+- _tzset();
+- tzflag++;
++ FILETIME ft;
++ unsigned __int64 tmpres = 0;
++ static int tzflag;
++
++ if (NULL != tv) {
++ GetSystemTimeAsFileTime(&ft);
++
++ tmpres |= ft.dwHighDateTime;
++ tmpres <<= 32;
++ tmpres |= ft.dwLowDateTime;
++
++ /*converting file time to unix epoch*/
++ tmpres -= DELTA_EPOCH_IN_MICROSECS;
++ tv->tv_sec = (long)(tmpres / 1000000UL);
++ tv->tv_usec = (long)(tmpres % 1000000UL);
++ }
++
++ if (NULL != tz) {
++ if (!tzflag) {
++ _tzset();
++ tzflag++;
++ }
++ tz->tz_minuteswest = _timezone / 60;
++ tz->tz_dsttime = _daylight;
+ }
+- tz->tz_minuteswest = _timezone / 60;
+- tz->tz_dsttime = _daylight;
+- }
+
+- return 0;
++ return 0;
+ }
+
+ #endif //_MSC_SYS_TIME_H_
+diff --git a/include/volk/saturation_arithmetic.h b/include/volk/saturation_arithmetic.h
+index 0886844..7b95ba2 100644
+--- a/include/volk/saturation_arithmetic.h
++++ b/include/volk/saturation_arithmetic.h
+@@ -28,20 +28,24 @@
+
+ static inline int16_t sat_adds16i(int16_t x, int16_t y)
+ {
+- int32_t res = (int32_t) x + (int32_t) y;
++ int32_t res = (int32_t)x + (int32_t)y;
+
+- if (res < SHRT_MIN) res = SHRT_MIN;
+- if (res > SHRT_MAX) res = SHRT_MAX;
++ if (res < SHRT_MIN)
++ res = SHRT_MIN;
++ if (res > SHRT_MAX)
++ res = SHRT_MAX;
+
+ return res;
+ }
+
+ static inline int16_t sat_muls16i(int16_t x, int16_t y)
+ {
+- int32_t res = (int32_t) x * (int32_t) y;
++ int32_t res = (int32_t)x * (int32_t)y;
+
+- if (res < SHRT_MIN) res = SHRT_MIN;
+- if (res > SHRT_MAX) res = SHRT_MAX;
++ if (res < SHRT_MIN)
++ res = SHRT_MIN;
++ if (res > SHRT_MAX)
++ res = SHRT_MAX;
+
+ return res;
+ }
+diff --git a/include/volk/volk_alloc.hh b/include/volk/volk_alloc.hh
+index a2975da..44bcfaf 100644
+--- a/include/volk/volk_alloc.hh
++++ b/include/volk/volk_alloc.hh
+@@ -40,30 +40,40 @@ namespace volk {
+ */
+ template <class T>
+ struct alloc {
+- typedef T value_type;
++ typedef T value_type;
+
+- alloc() = default;
++ alloc() = default;
+
+- template <class U> constexpr alloc(alloc<U> const&) noexcept {}
++ template <class U>
++ constexpr alloc(alloc<U> const&) noexcept
++ {
++ }
+
+- T* allocate(std::size_t n) {
+- if (n > std::numeric_limits<std::size_t>::max() / sizeof(T)) throw std::bad_alloc();
++ T* allocate(std::size_t n)
++ {
++ if (n > std::numeric_limits<std::size_t>::max() / sizeof(T))
++ throw std::bad_alloc();
+
+- if (auto p = static_cast<T*>(volk_malloc(n*sizeof(T), volk_get_alignment())))
+- return p;
++ if (auto p = static_cast<T*>(volk_malloc(n * sizeof(T), volk_get_alignment())))
++ return p;
+
+- throw std::bad_alloc();
+- }
++ throw std::bad_alloc();
++ }
+
+- void deallocate(T* p, std::size_t) noexcept { volk_free(p); }
+-
+-} ;
++ void deallocate(T* p, std::size_t) noexcept { volk_free(p); }
++};
+
+ template <class T, class U>
+-bool operator==(alloc<T> const&, alloc<U> const&) { return true; }
++bool operator==(alloc<T> const&, alloc<U> const&)
++{
++ return true;
++}
+
+ template <class T, class U>
+-bool operator!=(alloc<T> const&, alloc<U> const&) { return false; }
++bool operator!=(alloc<T> const&, alloc<U> const&)
++{
++ return false;
++}
+
+
+ /*!
+@@ -73,8 +83,8 @@ bool operator!=(alloc<T> const&, alloc<U> const&) { return false; }
+ * example code:
+ * volk::vector<float> v(100); // vector using volk_malloc, volk_free
+ */
+-template<class T>
+-using vector = std::vector<T, alloc<T> >;
++template <class T>
++using vector = std::vector<T, alloc<T>>;
+
+ } // namespace volk
+ #endif // INCLUDED_VOLK_ALLOC_H
+diff --git a/include/volk/volk_avx2_intrinsics.h b/include/volk/volk_avx2_intrinsics.h
+index 17badc4..00f3b52 100644
+--- a/include/volk/volk_avx2_intrinsics.h
++++ b/include/volk/volk_avx2_intrinsics.h
+@@ -1,19 +1,19 @@
+ /* -*- c++ -*- */
+-/*
++/*
+ * Copyright 2015 Free Software Foundation, Inc.
+- *
++ *
+ * This file is part of GNU Radio
+- *
++ *
+ * GNU Radio is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+- *
++ *
+ * GNU Radio is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+- *
++ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU Radio; see the file COPYING. If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street,
+@@ -27,28 +27,59 @@
+
+ #ifndef INCLUDE_VOLK_VOLK_AVX2_INTRINSICS_H_
+ #define INCLUDE_VOLK_VOLK_AVX2_INTRINSICS_H_
+-#include <immintrin.h>
+ #include "volk/volk_avx_intrinsics.h"
++#include <immintrin.h>
+
+-static inline __m256
+-_mm256_polar_sign_mask_avx2(__m128i fbits){
+- const __m128i zeros = _mm_set1_epi8(0x00);
+- const __m128i sign_extract = _mm_set1_epi8(0x80);
+- const __m256i shuffle_mask = _mm256_setr_epi8(0xff, 0xff, 0xff, 0x00, 0xff, 0xff, 0xff, 0x01, 0xff, 0xff, 0xff, 0x02, 0xff, 0xff, 0xff, 0x03,
+- 0xff, 0xff, 0xff, 0x04, 0xff, 0xff, 0xff, 0x05, 0xff, 0xff, 0xff, 0x06, 0xff, 0xff, 0xff, 0x07);
+- __m256i sign_bits = _mm256_setzero_si256();
+-
+- fbits = _mm_cmpgt_epi8(fbits, zeros);
+- fbits = _mm_and_si128(fbits, sign_extract);
+- sign_bits = _mm256_insertf128_si256(sign_bits,fbits,0);
+- sign_bits = _mm256_insertf128_si256(sign_bits,fbits,1);
+- sign_bits = _mm256_shuffle_epi8(sign_bits, shuffle_mask);
++static inline __m256 _mm256_polar_sign_mask_avx2(__m128i fbits)
++{
++ const __m128i zeros = _mm_set1_epi8(0x00);
++ const __m128i sign_extract = _mm_set1_epi8(0x80);
++ const __m256i shuffle_mask = _mm256_setr_epi8(0xff,
++ 0xff,
++ 0xff,
++ 0x00,
++ 0xff,
++ 0xff,
++ 0xff,
++ 0x01,
++ 0xff,
++ 0xff,
++ 0xff,
++ 0x02,
++ 0xff,
++ 0xff,
++ 0xff,
++ 0x03,
++ 0xff,
++ 0xff,
++ 0xff,
++ 0x04,
++ 0xff,
++ 0xff,
++ 0xff,
++ 0x05,
++ 0xff,
++ 0xff,
++ 0xff,
++ 0x06,
++ 0xff,
++ 0xff,
++ 0xff,
++ 0x07);
++ __m256i sign_bits = _mm256_setzero_si256();
+
+- return _mm256_castsi256_ps(sign_bits);
++ fbits = _mm_cmpgt_epi8(fbits, zeros);
++ fbits = _mm_and_si128(fbits, sign_extract);
++ sign_bits = _mm256_insertf128_si256(sign_bits, fbits, 0);
++ sign_bits = _mm256_insertf128_si256(sign_bits, fbits, 1);
++ sign_bits = _mm256_shuffle_epi8(sign_bits, shuffle_mask);
++
++ return _mm256_castsi256_ps(sign_bits);
+ }
+
+ static inline __m256
+-_mm256_polar_fsign_add_llrs_avx2(__m256 src0, __m256 src1, __m128i fbits){
++_mm256_polar_fsign_add_llrs_avx2(__m256 src0, __m256 src1, __m128i fbits)
++{
+ // prepare sign mask for correct +-
+ __m256 sign_mask = _mm256_polar_sign_mask_avx2(fbits);
+
+@@ -61,26 +92,31 @@ _mm256_polar_fsign_add_llrs_avx2(__m256 src0, __m256 src1, __m128i fbits){
+ return dst;
+ }
+
+-static inline __m256
+-_mm256_magnitudesquared_ps_avx2(const __m256 cplxValue0, const __m256 cplxValue1){
+- const __m256i idx = _mm256_set_epi32(7,6,3,2,5,4,1,0);
+- const __m256 squared0 = _mm256_mul_ps(cplxValue0, cplxValue0); // Square the values
+- const __m256 squared1 = _mm256_mul_ps(cplxValue1, cplxValue1); // Square the Values
+- const __m256 complex_result = _mm256_hadd_ps(squared0, squared1);
+- return _mm256_permutevar8x32_ps(complex_result, idx);
++static inline __m256 _mm256_magnitudesquared_ps_avx2(const __m256 cplxValue0,
++ const __m256 cplxValue1)
++{
++ const __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
++ const __m256 squared0 = _mm256_mul_ps(cplxValue0, cplxValue0); // Square the values
++ const __m256 squared1 = _mm256_mul_ps(cplxValue1, cplxValue1); // Square the Values
++ const __m256 complex_result = _mm256_hadd_ps(squared0, squared1);
++ return _mm256_permutevar8x32_ps(complex_result, idx);
+ }
+
+-static inline __m256
+-_mm256_scaled_norm_dist_ps_avx2(const __m256 symbols0, const __m256 symbols1, const __m256 points0, const __m256 points1, const __m256 scalar){
+- /*
+- * Calculate: |y - x|^2 * SNR_lin
+- * Consider 'symbolsX' and 'pointsX' to be complex float
+- * 'symbolsX' are 'y' and 'pointsX' are 'x'
+- */
+- const __m256 diff0 = _mm256_sub_ps(symbols0, points0);
+- const __m256 diff1 = _mm256_sub_ps(symbols1, points1);
+- const __m256 norms = _mm256_magnitudesquared_ps_avx2(diff0, diff1);
+- return _mm256_mul_ps(norms, scalar);
++static inline __m256 _mm256_scaled_norm_dist_ps_avx2(const __m256 symbols0,
++ const __m256 symbols1,
++ const __m256 points0,
++ const __m256 points1,
++ const __m256 scalar)
++{
++ /*
++ * Calculate: |y - x|^2 * SNR_lin
++ * Consider 'symbolsX' and 'pointsX' to be complex float
++ * 'symbolsX' are 'y' and 'pointsX' are 'x'
++ */
++ const __m256 diff0 = _mm256_sub_ps(symbols0, points0);
++ const __m256 diff1 = _mm256_sub_ps(symbols1, points1);
++ const __m256 norms = _mm256_magnitudesquared_ps_avx2(diff0, diff1);
++ return _mm256_mul_ps(norms, scalar);
+ }
+
+ #endif /* INCLUDE_VOLK_VOLK_AVX2_INTRINSICS_H_ */
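/* Editor's note: a minimal scalar sketch of what _mm256_scaled_norm_dist_ps_avx2
 * above computes for each complex sample: |y - x|^2 * scalar, with the 'symbols'
 * registers holding y and the 'points' registers holding x. The helper name and
 * the plain-float interface are illustrative only. */
static inline float scaled_norm_dist_scalar(float sym_re, float sym_im,
                                            float point_re, float point_im,
                                            float scalar)
{
    const float diff_re = sym_re - point_re; /* y - x, real part */
    const float diff_im = sym_im - point_im; /* y - x, imaginary part */
    return (diff_re * diff_re + diff_im * diff_im) * scalar;
}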
+diff --git a/include/volk/volk_avx_intrinsics.h b/include/volk/volk_avx_intrinsics.h
+index 808799f..bec846d 100644
+--- a/include/volk/volk_avx_intrinsics.h
++++ b/include/volk/volk_avx_intrinsics.h
+@@ -1,19 +1,19 @@
+ /* -*- c++ -*- */
+-/*
++/*
+ * Copyright 2015 Free Software Foundation, Inc.
+- *
++ *
+ * This file is part of GNU Radio
+- *
++ *
+ * GNU Radio is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+- *
++ *
+ * GNU Radio is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+- *
++ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU Radio; see the file COPYING. If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street,
+@@ -29,90 +29,126 @@
+ #define INCLUDE_VOLK_VOLK_AVX_INTRINSICS_H_
+ #include <immintrin.h>
+
+-static inline __m256
+-_mm256_complexmul_ps(__m256 x, __m256 y)
++static inline __m256 _mm256_complexmul_ps(__m256 x, __m256 y)
+ {
+- __m256 yl, yh, tmp1, tmp2;
+- yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr ...
+- yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di ...
+- tmp1 = _mm256_mul_ps(x, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr ...
+- x = _mm256_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br ...
+- tmp2 = _mm256_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
+- return _mm256_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
++ __m256 yl, yh, tmp1, tmp2;
++ yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr ...
++ yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di ...
++ tmp1 = _mm256_mul_ps(x, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr ...
++ x = _mm256_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br ...
++ tmp2 = _mm256_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
++ return _mm256_addsub_ps(tmp1,
++ tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
+ }
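/* Editor's note: scalar counterpart of the lane-wise product assembled above, shown
 * only to make the moveldup/movehdup/addsub comments concrete; the helper name and
 * float-pair interface are hypothetical. */
static inline void complexmul_scalar(float ar, float ai, float cr, float ci,
                                     float* out_re, float* out_im)
{
    *out_re = ar * cr - ai * ci; /* matches the ar*cr-ai*ci lanes */
    *out_im = ai * cr + ar * ci; /* matches the ai*cr+ar*ci lanes */
}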
+
+-static inline __m256
+-_mm256_conjugate_ps(__m256 x){
+- const __m256 conjugator = _mm256_setr_ps(0, -0.f, 0, -0.f, 0, -0.f, 0, -0.f);
+- return _mm256_xor_ps(x, conjugator); // conjugate y
++static inline __m256 _mm256_conjugate_ps(__m256 x)
++{
++ const __m256 conjugator = _mm256_setr_ps(0, -0.f, 0, -0.f, 0, -0.f, 0, -0.f);
++ return _mm256_xor_ps(x, conjugator); // conjugate y
+ }
+
+-static inline __m256
+-_mm256_complexconjugatemul_ps(__m256 x, __m256 y){
+- y = _mm256_conjugate_ps(y);
+- return _mm256_complexmul_ps(x, y);
++static inline __m256 _mm256_complexconjugatemul_ps(__m256 x, __m256 y)
++{
++ y = _mm256_conjugate_ps(y);
++ return _mm256_complexmul_ps(x, y);
+ }
+
+-static inline __m256
+-_mm256_normalize_ps(__m256 val)
++static inline __m256 _mm256_normalize_ps(__m256 val)
+ {
+- __m256 tmp1 = _mm256_mul_ps(val, val);
+- tmp1 = _mm256_hadd_ps(tmp1, tmp1);
+- tmp1 = _mm256_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(3, 1, 2, 0)); // equals 0xD8
+- tmp1 = _mm256_sqrt_ps(tmp1);
+- return _mm256_div_ps(val, tmp1);
++ __m256 tmp1 = _mm256_mul_ps(val, val);
++ tmp1 = _mm256_hadd_ps(tmp1, tmp1);
++ tmp1 = _mm256_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(3, 1, 2, 0)); // equals 0xD8
++ tmp1 = _mm256_sqrt_ps(tmp1);
++ return _mm256_div_ps(val, tmp1);
+ }
+
+-static inline __m256
+-_mm256_magnitudesquared_ps(__m256 cplxValue1, __m256 cplxValue2){
+- __m256 complex1, complex2;
+- cplxValue1 = _mm256_mul_ps(cplxValue1, cplxValue1); // Square the values
+- cplxValue2 = _mm256_mul_ps(cplxValue2, cplxValue2); // Square the Values
+- complex1 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x20);
+- complex2 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x31);
+- return _mm256_hadd_ps(complex1, complex2); // Add the I2 and Q2 values
++static inline __m256 _mm256_magnitudesquared_ps(__m256 cplxValue1, __m256 cplxValue2)
++{
++ __m256 complex1, complex2;
++ cplxValue1 = _mm256_mul_ps(cplxValue1, cplxValue1); // Square the values
++ cplxValue2 = _mm256_mul_ps(cplxValue2, cplxValue2); // Square the Values
++ complex1 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x20);
++ complex2 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x31);
++ return _mm256_hadd_ps(complex1, complex2); // Add the I2 and Q2 values
+ }
+
+-static inline __m256
+-_mm256_magnitude_ps(__m256 cplxValue1, __m256 cplxValue2){
+- return _mm256_sqrt_ps(_mm256_magnitudesquared_ps(cplxValue1, cplxValue2));
++static inline __m256 _mm256_magnitude_ps(__m256 cplxValue1, __m256 cplxValue2)
++{
++ return _mm256_sqrt_ps(_mm256_magnitudesquared_ps(cplxValue1, cplxValue2));
+ }
+
+-static inline __m256
+-_mm256_scaled_norm_dist_ps(const __m256 symbols0, const __m256 symbols1, const __m256 points0, const __m256 points1, const __m256 scalar){
+- /*
+- * Calculate: |y - x|^2 * SNR_lin
+- * Consider 'symbolsX' and 'pointsX' to be complex float
+- * 'symbolsX' are 'y' and 'pointsX' are 'x'
+- */
+- const __m256 diff0 = _mm256_sub_ps(symbols0, points0);
+- const __m256 diff1 = _mm256_sub_ps(symbols1, points1);
+- const __m256 norms = _mm256_magnitudesquared_ps(diff0, diff1);
+- return _mm256_mul_ps(norms, scalar);
++static inline __m256 _mm256_scaled_norm_dist_ps(const __m256 symbols0,
++ const __m256 symbols1,
++ const __m256 points0,
++ const __m256 points1,
++ const __m256 scalar)
++{
++ /*
++ * Calculate: |y - x|^2 * SNR_lin
++ * Consider 'symbolsX' and 'pointsX' to be complex float
++ * 'symbolsX' are 'y' and 'pointsX' are 'x'
++ */
++ const __m256 diff0 = _mm256_sub_ps(symbols0, points0);
++ const __m256 diff1 = _mm256_sub_ps(symbols1, points1);
++ const __m256 norms = _mm256_magnitudesquared_ps(diff0, diff1);
++ return _mm256_mul_ps(norms, scalar);
+ }
+
+-static inline __m256
+-_mm256_polar_sign_mask(__m128i fbits){
+- __m256 sign_mask_dummy = _mm256_setzero_ps();
+- const __m128i zeros = _mm_set1_epi8(0x00);
+- const __m128i sign_extract = _mm_set1_epi8(0x80);
+- const __m128i shuffle_mask0 = _mm_setr_epi8(0xff, 0xff, 0xff, 0x00, 0xff, 0xff, 0xff, 0x01, 0xff, 0xff, 0xff, 0x02, 0xff, 0xff, 0xff, 0x03);
+- const __m128i shuffle_mask1 = _mm_setr_epi8(0xff, 0xff, 0xff, 0x04, 0xff, 0xff, 0xff, 0x05, 0xff, 0xff, 0xff, 0x06, 0xff, 0xff, 0xff, 0x07);
+-
+- fbits = _mm_cmpgt_epi8(fbits, zeros);
+- fbits = _mm_and_si128(fbits, sign_extract);
+- __m128i sign_bits0 = _mm_shuffle_epi8(fbits, shuffle_mask0);
+- __m128i sign_bits1 = _mm_shuffle_epi8(fbits, shuffle_mask1);
+-
+- __m256 sign_mask = _mm256_insertf128_ps(sign_mask_dummy, _mm_castsi128_ps(sign_bits0), 0x0);
+- return _mm256_insertf128_ps(sign_mask, _mm_castsi128_ps(sign_bits1), 0x1);
+-// // This is the desired function call. Though it seems to be missing in GCC.
+-// // Compare: https://software.intel.com/sites/landingpage/IntrinsicsGuide/#
+-// return _mm256_set_m128(_mm_castsi128_ps(sign_bits1), _mm_castsi128_ps(sign_bits0));
++static inline __m256 _mm256_polar_sign_mask(__m128i fbits)
++{
++ __m256 sign_mask_dummy = _mm256_setzero_ps();
++ const __m128i zeros = _mm_set1_epi8(0x00);
++ const __m128i sign_extract = _mm_set1_epi8(0x80);
++ const __m128i shuffle_mask0 = _mm_setr_epi8(0xff,
++ 0xff,
++ 0xff,
++ 0x00,
++ 0xff,
++ 0xff,
++ 0xff,
++ 0x01,
++ 0xff,
++ 0xff,
++ 0xff,
++ 0x02,
++ 0xff,
++ 0xff,
++ 0xff,
++ 0x03);
++ const __m128i shuffle_mask1 = _mm_setr_epi8(0xff,
++ 0xff,
++ 0xff,
++ 0x04,
++ 0xff,
++ 0xff,
++ 0xff,
++ 0x05,
++ 0xff,
++ 0xff,
++ 0xff,
++ 0x06,
++ 0xff,
++ 0xff,
++ 0xff,
++ 0x07);
++
++ fbits = _mm_cmpgt_epi8(fbits, zeros);
++ fbits = _mm_and_si128(fbits, sign_extract);
++ __m128i sign_bits0 = _mm_shuffle_epi8(fbits, shuffle_mask0);
++ __m128i sign_bits1 = _mm_shuffle_epi8(fbits, shuffle_mask1);
++
++ __m256 sign_mask =
++ _mm256_insertf128_ps(sign_mask_dummy, _mm_castsi128_ps(sign_bits0), 0x0);
++ return _mm256_insertf128_ps(sign_mask, _mm_castsi128_ps(sign_bits1), 0x1);
++ // // This is the desired function call. Though it seems to be missing in GCC.
++ // // Compare: https://software.intel.com/sites/landingpage/IntrinsicsGuide/#
++ // return _mm256_set_m128(_mm_castsi128_ps(sign_bits1),
++ // _mm_castsi128_ps(sign_bits0));
+ }
+
+ static inline void
+-_mm256_polar_deinterleave(__m256 *llr0, __m256 *llr1, __m256 src0, __m256 src1){
++_mm256_polar_deinterleave(__m256* llr0, __m256* llr1, __m256 src0, __m256 src1)
++{
+ // deinterleave values
+ __m256 part0 = _mm256_permute2f128_ps(src0, src1, 0x20);
+ __m256 part1 = _mm256_permute2f128_ps(src0, src1, 0x31);
+@@ -120,22 +156,25 @@ _mm256_polar_deinterleave(__m256 *llr0, __m256 *llr1, __m256 src0, __m256 src1){
+ *llr1 = _mm256_shuffle_ps(part0, part1, 0xdd);
+ }
+
+-static inline __m256
+-_mm256_polar_minsum_llrs(__m256 src0, __m256 src1){
++static inline __m256 _mm256_polar_minsum_llrs(__m256 src0, __m256 src1)
++{
+ const __m256 sign_mask = _mm256_set1_ps(-0.0f);
+- const __m256 abs_mask = _mm256_andnot_ps(sign_mask, _mm256_castsi256_ps(_mm256_set1_epi8(0xff)));
++ const __m256 abs_mask =
++ _mm256_andnot_ps(sign_mask, _mm256_castsi256_ps(_mm256_set1_epi8(0xff)));
+
+ __m256 llr0, llr1;
+ _mm256_polar_deinterleave(&llr0, &llr1, src0, src1);
+
+ // calculate result
+- __m256 sign = _mm256_xor_ps(_mm256_and_ps(llr0, sign_mask), _mm256_and_ps(llr1, sign_mask));
+- __m256 dst = _mm256_min_ps(_mm256_and_ps(llr0, abs_mask), _mm256_and_ps(llr1, abs_mask));
++ __m256 sign =
++ _mm256_xor_ps(_mm256_and_ps(llr0, sign_mask), _mm256_and_ps(llr1, sign_mask));
++ __m256 dst =
++ _mm256_min_ps(_mm256_and_ps(llr0, abs_mask), _mm256_and_ps(llr1, abs_mask));
+ return _mm256_or_ps(dst, sign);
+ }
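/* Editor's note: scalar form of the min-sum approximation computed above -- the
 * result takes the XOR of the two sign bits and the smaller of the two magnitudes.
 * A sketch for illustration; the helper name is hypothetical. */
#include <math.h>
static inline float polar_minsum_llr_scalar(float llr0, float llr1)
{
    const int flip = (!!signbit(llr0)) ^ (!!signbit(llr1)); /* combined sign */
    const float mag = fminf(fabsf(llr0), fabsf(llr1));      /* smaller magnitude */
    return flip ? -mag : mag;
}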
+
+-static inline __m256
+-_mm256_polar_fsign_add_llrs(__m256 src0, __m256 src1, __m128i fbits){
++static inline __m256 _mm256_polar_fsign_add_llrs(__m256 src0, __m256 src1, __m128i fbits)
++{
+ // prepare sign mask for correct +-
+ __m256 sign_mask = _mm256_polar_sign_mask(fbits);
+
+diff --git a/include/volk/volk_common.h b/include/volk/volk_common.h
+index 50ea07b..8167d23 100644
+--- a/include/volk/volk_common.h
++++ b/include/volk/volk_common.h
+@@ -18,61 +18,71 @@
+ // AppleClang also defines __GNUC__, so do this check first. These
+ // will probably be the same as for __GNUC__, but let's keep them
+ // separate just to be safe.
+-# define __VOLK_ATTR_ALIGNED(x) __attribute__((aligned(x)))
+-# define __VOLK_ATTR_UNUSED __attribute__((unused))
+-# define __VOLK_ATTR_INLINE __attribute__((always_inline))
+-# define __VOLK_ATTR_DEPRECATED __attribute__((deprecated))
+-# define __VOLK_ASM __asm__
+-# define __VOLK_VOLATILE __volatile__
+-# define __VOLK_ATTR_EXPORT __attribute__((visibility("default")))
+-# define __VOLK_ATTR_IMPORT __attribute__((visibility("default")))
+-# define __VOLK_PREFETCH(addr) __builtin_prefetch(addr)
+-#elif defined(__GNUC__)
+-# define __VOLK_ATTR_ALIGNED(x) __attribute__((aligned(x)))
+-# define __VOLK_ATTR_UNUSED __attribute__((unused))
+-# define __VOLK_ATTR_INLINE __attribute__((always_inline))
+-# define __VOLK_ATTR_DEPRECATED __attribute__((deprecated))
+-# define __VOLK_ASM __asm__
+-# define __VOLK_VOLATILE __volatile__
+-# if __GNUC__ >= 4
+-# define __VOLK_ATTR_EXPORT __attribute__((visibility("default")))
+-# define __VOLK_ATTR_IMPORT __attribute__((visibility("default")))
+-# else
+-# define __VOLK_ATTR_EXPORT
+-# define __VOLK_ATTR_IMPORT
+-# endif
+-# define __VOLK_PREFETCH(addr) __builtin_prefetch(addr)
++#define __VOLK_ATTR_ALIGNED(x) __attribute__((aligned(x)))
++#define __VOLK_ATTR_UNUSED __attribute__((unused))
++#define __VOLK_ATTR_INLINE __attribute__((always_inline))
++#define __VOLK_ATTR_DEPRECATED __attribute__((deprecated))
++#define __VOLK_ASM __asm__
++#define __VOLK_VOLATILE __volatile__
++#define __VOLK_ATTR_EXPORT __attribute__((visibility("default")))
++#define __VOLK_ATTR_IMPORT __attribute__((visibility("default")))
++#define __VOLK_PREFETCH(addr) __builtin_prefetch(addr)
++#elif defined __GNUC__
++#define __VOLK_ATTR_ALIGNED(x) __attribute__((aligned(x)))
++#define __VOLK_ATTR_UNUSED __attribute__((unused))
++#define __VOLK_ATTR_INLINE __attribute__((always_inline))
++#define __VOLK_ATTR_DEPRECATED __attribute__((deprecated))
++#define __VOLK_ASM __asm__
++#define __VOLK_VOLATILE __volatile__
++#if __GNUC__ >= 4
++#define __VOLK_ATTR_EXPORT __attribute__((visibility("default")))
++#define __VOLK_ATTR_IMPORT __attribute__((visibility("default")))
+ #else
+-# warning "Unknown compiler. Using default VOLK macros, which may or not work."
+-# define __VOLK_ATTR_ALIGNED(x)
+-# define __VOLK_ATTR_UNUSED
+-# define __VOLK_ATTR_INLINE
+-# define __VOLK_ATTR_DEPRECATED
+-# define __VOLK_ATTR_EXPORT
+-# define __VOLK_ATTR_IMPORT
+-# define __VOLK_PREFETCH(addr)
+-# define __VOLK_ASM __asm__
+-# define __VOLK_VOLATILE __volatile__
++#define __VOLK_ATTR_EXPORT
++#define __VOLK_ATTR_IMPORT
++#endif
++#define __VOLK_PREFETCH(addr) __builtin_prefetch(addr)
++#elif _MSC_VER
++#define __VOLK_ATTR_ALIGNED(x) __declspec(align(x))
++#define __VOLK_ATTR_UNUSED
++#define __VOLK_ATTR_INLINE __forceinline
++#define __VOLK_ATTR_DEPRECATED __declspec(deprecated)
++#define __VOLK_ATTR_EXPORT __declspec(dllexport)
++#define __VOLK_ATTR_IMPORT __declspec(dllimport)
++#define __VOLK_PREFETCH(addr)
++#define __VOLK_ASM __asm
++#define __VOLK_VOLATILE
++#else
++#define __VOLK_ATTR_ALIGNED(x)
++#define __VOLK_ATTR_UNUSED
++#define __VOLK_ATTR_INLINE
++#define __VOLK_ATTR_DEPRECATED
++#define __VOLK_ATTR_EXPORT
++#define __VOLK_ATTR_IMPORT
++#define __VOLK_PREFETCH(addr)
++#define __VOLK_ASM __asm__
++#define __VOLK_VOLATILE __volatile__
+ #endif
+
+ ////////////////////////////////////////////////////////////////////////
+ // Ignore annoying warnings in MSVC
+ ////////////////////////////////////////////////////////////////////////
+ #if defined(_MSC_VER)
+-# pragma warning(disable: 4244) //'conversion' conversion from 'type1' to 'type2', possible loss of data
+-# pragma warning(disable: 4305) //'identifier' : truncation from 'type1' to 'type2'
++#pragma warning(disable : 4244) //'conversion' conversion from 'type1' to 'type2',
++ //possible loss of data
++#pragma warning(disable : 4305) //'identifier' : truncation from 'type1' to 'type2'
+ #endif
+
+ ////////////////////////////////////////////////////////////////////////
+ // C-linkage declaration macros
+ // FIXME: due to the usage of complex.h, require gcc for c-linkage
+ ////////////////////////////////////////////////////////////////////////
+-#if defined(__cplusplus) && (defined(__GNUC__) || defined(__clang__))
+-# define __VOLK_DECL_BEGIN extern "C" {
+-# define __VOLK_DECL_END }
++#if defined(__cplusplus) && (__GNUC__)
++#define __VOLK_DECL_BEGIN extern "C" {
++#define __VOLK_DECL_END }
+ #else
+-# define __VOLK_DECL_BEGIN
+-# define __VOLK_DECL_END
++#define __VOLK_DECL_BEGIN
++#define __VOLK_DECL_END
+ #endif
+
+ ////////////////////////////////////////////////////////////////////////
+@@ -80,9 +90,9 @@
+ // http://gcc.gnu.org/wiki/Visibility
+ ////////////////////////////////////////////////////////////////////////
+ #ifdef volk_EXPORTS
+-# define VOLK_API __VOLK_ATTR_EXPORT
++#define VOLK_API __VOLK_ATTR_EXPORT
+ #else
+-# define VOLK_API __VOLK_ATTR_IMPORT
++#define VOLK_API __VOLK_ATTR_IMPORT
+ #endif
+
+ ////////////////////////////////////////////////////////////////////////
+@@ -98,38 +108,38 @@
+ #endif
+ #endif
+
+-union bit128{
+- uint8_t i8[16];
+- uint16_t i16[8];
+- uint32_t i[4];
+- float f[4];
+- double d[2];
++union bit128 {
++ uint8_t i8[16];
++ uint16_t i16[8];
++ uint32_t i[4];
++ float f[4];
++ double d[2];
+
+- #ifdef LV_HAVE_SSE
+- __m128 float_vec;
+- #endif
++#ifdef LV_HAVE_SSE
++ __m128 float_vec;
++#endif
+
+- #ifdef LV_HAVE_SSE2
+- __m128i int_vec;
+- __m128d double_vec;
+- #endif
++#ifdef LV_HAVE_SSE2
++ __m128i int_vec;
++ __m128d double_vec;
++#endif
+ };
+
+-union bit256{
+- uint8_t i8[32];
+- uint16_t i16[16];
+- uint32_t i[8];
+- float f[8];
+- double d[4];
++union bit256 {
++ uint8_t i8[32];
++ uint16_t i16[16];
++ uint32_t i[8];
++ float f[8];
++ double d[4];
+
+- #ifdef LV_HAVE_AVX
+- __m256 float_vec;
+- __m256i int_vec;
+- __m256d double_vec;
+- #endif
++#ifdef LV_HAVE_AVX
++ __m256 float_vec;
++ __m256i int_vec;
++ __m256d double_vec;
++#endif
+ };
+
+-#define bit128_p(x) ((union bit128 *)(x))
+-#define bit256_p(x) ((union bit256 *)(x))
++#define bit128_p(x) ((union bit128*)(x))
++#define bit256_p(x) ((union bit256*)(x))
+
+ #endif /*INCLUDED_LIBVOLK_COMMON_H*/
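/* Editor's note: a small usage sketch for the bit128 union and bit128_p() macro
 * defined above -- reading individual float lanes of an SSE register through the
 * union instead of an explicit store. Guarded like the union member; the helper
 * name is illustrative only. */
#ifdef LV_HAVE_SSE
static inline float bit128_first_lane(__m128 v)
{
    union bit128 u;
    u.float_vec = v; /* reinterpret the vector register */
    return u.f[0];   /* lane 0 as a plain float */
}
#endif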
+diff --git a/include/volk/volk_complex.h b/include/volk/volk_complex.h
+index 1d61d78..ae78873 100644
+--- a/include/volk/volk_complex.h
++++ b/include/volk/volk_complex.h
+@@ -19,49 +19,58 @@
+
+ #ifdef __cplusplus
+
+-#include <complex>
+ #include <stdint.h>
++#include <complex>
+
+-typedef std::complex<int8_t> lv_8sc_t;
++typedef std::complex<int8_t> lv_8sc_t;
+ typedef std::complex<int16_t> lv_16sc_t;
+ typedef std::complex<int32_t> lv_32sc_t;
+ typedef std::complex<int64_t> lv_64sc_t;
+-typedef std::complex<float> lv_32fc_t;
+-typedef std::complex<double> lv_64fc_t;
++typedef std::complex<float> lv_32fc_t;
++typedef std::complex<double> lv_64fc_t;
+
+-template <typename T> inline std::complex<T> lv_cmake(const T &r, const T &i){
++template <typename T>
++inline std::complex<T> lv_cmake(const T& r, const T& i)
++{
+ return std::complex<T>(r, i);
+ }
+
+-template <typename T> inline typename T::value_type lv_creal(const T &x){
++template <typename T>
++inline typename T::value_type lv_creal(const T& x)
++{
+ return x.real();
+ }
+
+-template <typename T> inline typename T::value_type lv_cimag(const T &x){
++template <typename T>
++inline typename T::value_type lv_cimag(const T& x)
++{
+ return x.imag();
+ }
+
+-template <typename T> inline T lv_conj(const T &x){
++template <typename T>
++inline T lv_conj(const T& x)
++{
+ return std::conj(x);
+ }
+
+ #else /* __cplusplus */
+
+ #if __STDC_VERSION__ >= 199901L /* C99 check */
+-/* this allows us to conj in lv_conj without the double detour for single-precision floats */
++/* this allows us to conj in lv_conj without the double detour for single-precision floats
++ */
+ #include <tgmath.h>
+ #endif /* C99 check */
+
+ #include <complex.h>
+
+-typedef char complex lv_8sc_t;
+-typedef short complex lv_16sc_t;
+-typedef long complex lv_32sc_t;
+-typedef long long complex lv_64sc_t;
+-typedef float complex lv_32fc_t;
+-typedef double complex lv_64fc_t;
++typedef char complex lv_8sc_t;
++typedef short complex lv_16sc_t;
++typedef long complex lv_32sc_t;
++typedef long long complex lv_64sc_t;
++typedef float complex lv_32fc_t;
++typedef double complex lv_64fc_t;
+
+-#define lv_cmake(r, i) ((r) + _Complex_I*(i))
++#define lv_cmake(r, i) ((r) + _Complex_I * (i))
+
+ // When GNUC is available, use the complex extensions.
+ // The extensions always return the correct value type.
+diff --git a/include/volk/volk_malloc.h b/include/volk/volk_malloc.h
+index 3477b27..42ca2b0 100644
+--- a/include/volk/volk_malloc.h
++++ b/include/volk/volk_malloc.h
+@@ -23,8 +23,8 @@
+ #ifndef INCLUDED_VOLK_MALLOC_H
+ #define INCLUDED_VOLK_MALLOC_H
+
+-#include <volk/volk_common.h>
+ #include <stdlib.h>
++#include <volk/volk_common.h>
+
+ __VOLK_DECL_BEGIN
+
+@@ -40,7 +40,8 @@ __VOLK_DECL_BEGIN
+ * For Apple Clang, we fall back to `posix_memalign`.
+ * see: https://linux.die.net/man/3/aligned_alloc
+ * For MSVC, we fall back to `_aligned_malloc`.
+- * see: https://docs.microsoft.com/en-us/cpp/c-runtime-library/reference/aligned-malloc?view=vs-2019
++ * see:
++ * https://docs.microsoft.com/en-us/cpp/c-runtime-library/reference/aligned-malloc?view=vs-2019
+ *
+ * Because of the ways in which volk_malloc may allocate memory, it is
+ * important to always free volk_malloc pointers using volk_free.
+@@ -51,7 +52,7 @@ __VOLK_DECL_BEGIN
+ * \param alignment The byte alignment of the allocated memory.
+ * \return pointer to aligned memory.
+ */
+-VOLK_API void *volk_malloc(size_t size, size_t alignment);
++VOLK_API void* volk_malloc(size_t size, size_t alignment);
+
+ /*!
+ * \brief Free's memory allocated by volk_malloc.
+@@ -62,11 +63,12 @@ VOLK_API void *volk_malloc(size_t size, size_t alignment);
+ * Thus, in this case `volk_free` inherits the same behavior `free` exhibits.
+ * see: https://en.cppreference.com/w/c/memory/free
+ * In case `_aligned_malloc` was used, we call `_aligned_free`.
+- * see: https://docs.microsoft.com/en-us/cpp/c-runtime-library/reference/aligned-free?view=vs-2019
++ * see:
++ * https://docs.microsoft.com/en-us/cpp/c-runtime-library/reference/aligned-free?view=vs-2019
+ *
+ * \param aptr The aligned pointer allocated by volk_malloc.
+ */
+-VOLK_API void volk_free(void *aptr);
++VOLK_API void volk_free(void* aptr);
+
+ __VOLK_DECL_END
+
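/* Editor's note: a short usage sketch for the allocator documented above. The
 * 32-byte alignment is an arbitrary example value, not a recommendation; real code
 * would pick the alignment its kernels require. */
#include <volk/volk_malloc.h>

void example_aligned_buffer(void)
{
    float* buf = (float*)volk_malloc(1024 * sizeof(float), 32);
    if (buf) {
        /* ... use buf as an aligned working buffer ... */
        volk_free(buf); /* always pair volk_malloc with volk_free */
    }
}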
+diff --git a/include/volk/volk_neon_intrinsics.h b/include/volk/volk_neon_intrinsics.h
+index 90e7b54..302bd30 100644
+--- a/include/volk/volk_neon_intrinsics.h
++++ b/include/volk/volk_neon_intrinsics.h
+@@ -67,9 +67,9 @@
+ 3. This notice may not be removed or altered from any source distribution.
+
+ (this is the zlib license)
+-
++
+ _vsincosq_f32
+-
++
+ */
+
+ /*
+@@ -83,13 +83,12 @@
+
+
+ /* Magnitude squared for float32x4x2_t */
+-static inline float32x4_t
+-_vmagnitudesquaredq_f32(float32x4x2_t cmplxValue)
++static inline float32x4_t _vmagnitudesquaredq_f32(float32x4x2_t cmplxValue)
+ {
+ float32x4_t iValue, qValue, result;
+ iValue = vmulq_f32(cmplxValue.val[0], cmplxValue.val[0]); // Square the values
+ qValue = vmulq_f32(cmplxValue.val[1], cmplxValue.val[1]); // Square the values
+- result = vaddq_f32(iValue, qValue); // Add the I2 and Q2 values
++ result = vaddq_f32(iValue, qValue); // Add the I2 and Q2 values
+ return result;
+ }
+
+@@ -97,9 +96,11 @@ _vmagnitudesquaredq_f32(float32x4x2_t cmplxValue)
+ static inline float32x4_t _vinvsqrtq_f32(float32x4_t x)
+ {
+ float32x4_t sqrt_reciprocal = vrsqrteq_f32(x);
+- sqrt_reciprocal = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal);
+- sqrt_reciprocal = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal);
+-
++ sqrt_reciprocal = vmulq_f32(
++ vrsqrtsq_f32(vmulq_f32(x, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal);
++ sqrt_reciprocal = vmulq_f32(
++ vrsqrtsq_f32(vmulq_f32(x, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal);
++
+ return sqrt_reciprocal;
+ }
+
+@@ -108,19 +109,19 @@ static inline float32x4_t _vinvq_f32(float32x4_t x)
+ {
+ // Newton's method
+ float32x4_t recip = vrecpeq_f32(x);
+- recip = vmulq_f32(vrecpsq_f32(x, recip), recip);
+- recip = vmulq_f32(vrecpsq_f32(x, recip), recip);
++ recip = vmulq_f32(vrecpsq_f32(x, recip), recip);
++ recip = vmulq_f32(vrecpsq_f32(x, recip), recip);
+ return recip;
+ }
+
+ /* Complex multiplication for float32x4x2_t */
+-static inline float32x4x2_t
+-_vmultiply_complexq_f32(float32x4x2_t a_val, float32x4x2_t b_val)
++static inline float32x4x2_t _vmultiply_complexq_f32(float32x4x2_t a_val,
++ float32x4x2_t b_val)
+ {
+ float32x4x2_t tmp_real;
+ float32x4x2_t tmp_imag;
+ float32x4x2_t c_val;
+-
++
+ // multiply the real*real and imag*imag to get real result
+ // a0r*b0r|a1r*b1r|a2r*b2r|a3r*b3r
+ tmp_real.val[0] = vmulq_f32(a_val.val[0], b_val.val[0]);
+@@ -140,12 +141,12 @@ _vmultiply_complexq_f32(float32x4x2_t a_val, float32x4x2_t b_val)
+ /* From ARM Compute Library, MIT license */
+ static inline float32x4_t _vtaylor_polyq_f32(float32x4_t x, const float32x4_t coeffs[8])
+ {
+- float32x4_t cA = vmlaq_f32(coeffs[0], coeffs[4], x);
+- float32x4_t cB = vmlaq_f32(coeffs[2], coeffs[6], x);
+- float32x4_t cC = vmlaq_f32(coeffs[1], coeffs[5], x);
+- float32x4_t cD = vmlaq_f32(coeffs[3], coeffs[7], x);
+- float32x4_t x2 = vmulq_f32(x, x);
+- float32x4_t x4 = vmulq_f32(x2, x2);
++ float32x4_t cA = vmlaq_f32(coeffs[0], coeffs[4], x);
++ float32x4_t cB = vmlaq_f32(coeffs[2], coeffs[6], x);
++ float32x4_t cC = vmlaq_f32(coeffs[1], coeffs[5], x);
++ float32x4_t cD = vmlaq_f32(coeffs[3], coeffs[7], x);
++ float32x4_t x2 = vmulq_f32(x, x);
++ float32x4_t x4 = vmulq_f32(x2, x2);
+ float32x4_t res = vmlaq_f32(vmlaq_f32(cA, cB, x2), vmlaq_f32(cC, cD, x2), x4);
+ return res;
+ }
+@@ -155,121 +156,123 @@ static inline float32x4_t _vtaylor_polyq_f32(float32x4_t x, const float32x4_t co
+ static inline float32x4_t _vlogq_f32(float32x4_t x)
+ {
+ const float32x4_t log_tab[8] = {
+- vdupq_n_f32(-2.29561495781f),
+- vdupq_n_f32(-2.47071170807f),
+- vdupq_n_f32(-5.68692588806f),
+- vdupq_n_f32(-0.165253549814f),
+- vdupq_n_f32(5.17591238022f),
+- vdupq_n_f32(0.844007015228f),
+- vdupq_n_f32(4.58445882797f),
+- vdupq_n_f32(0.0141278216615f),
++ vdupq_n_f32(-2.29561495781f), vdupq_n_f32(-2.47071170807f),
++ vdupq_n_f32(-5.68692588806f), vdupq_n_f32(-0.165253549814f),
++ vdupq_n_f32(5.17591238022f), vdupq_n_f32(0.844007015228f),
++ vdupq_n_f32(4.58445882797f), vdupq_n_f32(0.0141278216615f),
+ };
+-
+- const int32x4_t CONST_127 = vdupq_n_s32(127); // 127
++
++ const int32x4_t CONST_127 = vdupq_n_s32(127); // 127
+ const float32x4_t CONST_LN2 = vdupq_n_f32(0.6931471805f); // ln(2)
+-
++
+ // Extract exponent
+- int32x4_t m = vsubq_s32(vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_f32(x), 23)), CONST_127);
+- float32x4_t val = vreinterpretq_f32_s32(vsubq_s32(vreinterpretq_s32_f32(x), vshlq_n_s32(m, 23)));
+-
++ int32x4_t m = vsubq_s32(
++ vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_f32(x), 23)), CONST_127);
++ float32x4_t val =
++ vreinterpretq_f32_s32(vsubq_s32(vreinterpretq_s32_f32(x), vshlq_n_s32(m, 23)));
++
+ // Polynomial Approximation
+ float32x4_t poly = _vtaylor_polyq_f32(val, log_tab);
+-
++
+ // Reconstruct
+ poly = vmlaq_f32(poly, vcvtq_f32_s32(m), CONST_LN2);
+-
++
+ return poly;
+ }
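/* Editor's note: scalar outline of the steps in _vlogq_f32 above -- split x into an
 * exponent m and a reduced mantissa, approximate log of the mantissa, then add back
 * m * ln(2). frexpf() and logf() stand in for the bit manipulation and the Taylor
 * polynomial, so this is only an illustration of the decomposition. */
#include <math.h>
static inline float log_decomposition_sketch(float x)
{
    int m;
    const float mant = frexpf(x, &m);       /* x = mant * 2^m, mant in [0.5, 1) */
    const float poly = logf(mant);          /* stand-in for the polynomial step */
    return poly + (float)m * 0.6931471805f; /* reconstruct with ln(2) */
}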
+
+ /* Evaluation of 4 sines & cosines at once.
+ * Optimized from here (zlib license)
+ * http://gruntthepeon.free.fr/ssemath/ */
+-static inline float32x4x2_t _vsincosq_f32(float32x4_t x) {
++static inline float32x4x2_t _vsincosq_f32(float32x4_t x)
++{
+ const float32x4_t c_minus_cephes_DP1 = vdupq_n_f32(-0.78515625);
+ const float32x4_t c_minus_cephes_DP2 = vdupq_n_f32(-2.4187564849853515625e-4);
+ const float32x4_t c_minus_cephes_DP3 = vdupq_n_f32(-3.77489497744594108e-8);
+ const float32x4_t c_sincof_p0 = vdupq_n_f32(-1.9515295891e-4);
+- const float32x4_t c_sincof_p1 = vdupq_n_f32(8.3321608736e-3);
++ const float32x4_t c_sincof_p1 = vdupq_n_f32(8.3321608736e-3);
+ const float32x4_t c_sincof_p2 = vdupq_n_f32(-1.6666654611e-1);
+ const float32x4_t c_coscof_p0 = vdupq_n_f32(2.443315711809948e-005);
+ const float32x4_t c_coscof_p1 = vdupq_n_f32(-1.388731625493765e-003);
+ const float32x4_t c_coscof_p2 = vdupq_n_f32(4.166664568298827e-002);
+ const float32x4_t c_cephes_FOPI = vdupq_n_f32(1.27323954473516); // 4 / M_PI
+-
++
+ const float32x4_t CONST_1 = vdupq_n_f32(1.f);
+ const float32x4_t CONST_1_2 = vdupq_n_f32(0.5f);
+ const float32x4_t CONST_0 = vdupq_n_f32(0.f);
+- const uint32x4_t CONST_2 = vdupq_n_u32(2);
+- const uint32x4_t CONST_4 = vdupq_n_u32(4);
+-
++ const uint32x4_t CONST_2 = vdupq_n_u32(2);
++ const uint32x4_t CONST_4 = vdupq_n_u32(4);
++
+ uint32x4_t emm2;
+-
++
+ uint32x4_t sign_mask_sin, sign_mask_cos;
+ sign_mask_sin = vcltq_f32(x, CONST_0);
+ x = vabsq_f32(x);
+ // scale by 4/pi
+ float32x4_t y = vmulq_f32(x, c_cephes_FOPI);
+-
++
+ // store the integer part of y in mm0
+ emm2 = vcvtq_u32_f32(y);
+ /* j=(j+1) & (~1) (see the cephes sources) */
+ emm2 = vaddq_u32(emm2, vdupq_n_u32(1));
+ emm2 = vandq_u32(emm2, vdupq_n_u32(~1));
+ y = vcvtq_f32_u32(emm2);
+-
++
+ /* get the polynomial selection mask
+ there is one polynomial for 0 <= x <= Pi/4
+ and another one for Pi/4<x<=Pi/2
+ Both branches will be computed. */
+ const uint32x4_t poly_mask = vtstq_u32(emm2, CONST_2);
+-
++
+ // The magic pass: "Extended precision modular arithmetic"
+ x = vmlaq_f32(x, y, c_minus_cephes_DP1);
+ x = vmlaq_f32(x, y, c_minus_cephes_DP2);
+ x = vmlaq_f32(x, y, c_minus_cephes_DP3);
+-
++
+ sign_mask_sin = veorq_u32(sign_mask_sin, vtstq_u32(emm2, CONST_4));
+ sign_mask_cos = vtstq_u32(vsubq_u32(emm2, CONST_2), CONST_4);
+-
++
+ /* Evaluate the first polynomial (0 <= x <= Pi/4) in y1,
+ and the second polynomial (Pi/4 <= x <= Pi/2) in y2 */
+ float32x4_t y1, y2;
+- float32x4_t z = vmulq_f32(x,x);
+-
++ float32x4_t z = vmulq_f32(x, x);
++
+ y1 = vmlaq_f32(c_coscof_p1, z, c_coscof_p0);
+ y1 = vmlaq_f32(c_coscof_p2, z, y1);
+ y1 = vmulq_f32(y1, z);
+ y1 = vmulq_f32(y1, z);
+ y1 = vmlsq_f32(y1, z, CONST_1_2);
+ y1 = vaddq_f32(y1, CONST_1);
+-
++
+ y2 = vmlaq_f32(c_sincof_p1, z, c_sincof_p0);
+ y2 = vmlaq_f32(c_sincof_p2, z, y2);
+ y2 = vmulq_f32(y2, z);
+ y2 = vmlaq_f32(x, x, y2);
+-
++
+ /* select the correct result from the two polynomials */
+ const float32x4_t ys = vbslq_f32(poly_mask, y1, y2);
+ const float32x4_t yc = vbslq_f32(poly_mask, y2, y1);
+-
++
+ float32x4x2_t sincos;
+ sincos.val[0] = vbslq_f32(sign_mask_sin, vnegq_f32(ys), ys);
+ sincos.val[1] = vbslq_f32(sign_mask_cos, yc, vnegq_f32(yc));
+-
++
+ return sincos;
+ }
+
+-static inline float32x4_t _vsinq_f32(float32x4_t x) {
++static inline float32x4_t _vsinq_f32(float32x4_t x)
++{
+ const float32x4x2_t sincos = _vsincosq_f32(x);
+ return sincos.val[0];
+ }
+
+-static inline float32x4_t _vcosq_f32(float32x4_t x) {
++static inline float32x4_t _vcosq_f32(float32x4_t x)
++{
+ const float32x4x2_t sincos = _vsincosq_f32(x);
+ return sincos.val[1];
+ }
+
+-static inline float32x4_t _vtanq_f32(float32x4_t x) {
++static inline float32x4_t _vtanq_f32(float32x4_t x)
++{
+ const float32x4x2_t sincos = _vsincosq_f32(x);
+ return vmulq_f32(sincos.val[0], _vinvq_f32(sincos.val[1]));
+ }
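/* Editor's note: scalar form of the reciprocal-square-root refinement used by
 * _vinvsqrtq_f32 above. Each vrsqrtsq/vmulq pair performs one Newton-Raphson step,
 * y * (3 - x*y*y) / 2, and two steps are applied to the coarse vrsqrteq estimate.
 * The helper below is illustrative only. */
static inline float invsqrt_refine_sketch(float x, float y_estimate)
{
    float y = y_estimate;              /* coarse estimate of 1/sqrt(x) */
    y = y * (3.0f - x * y * y) * 0.5f; /* first refinement step */
    y = y * (3.0f - x * y * y) * 0.5f; /* second refinement step */
    return y;
}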
+diff --git a/include/volk/volk_prefs.h b/include/volk/volk_prefs.h
+index cfa3806..96b7f1c 100644
+--- a/include/volk/volk_prefs.h
++++ b/include/volk/volk_prefs.h
+@@ -1,17 +1,16 @@
+ #ifndef INCLUDED_VOLK_PREFS_H
+ #define INCLUDED_VOLK_PREFS_H
+
+-#include <volk/volk_common.h>
+ #include <stdbool.h>
+ #include <stdlib.h>
++#include <volk/volk_common.h>
+
+ __VOLK_DECL_BEGIN
+
+-typedef struct volk_arch_pref
+-{
+- char name[128]; //name of the kernel
+- char impl_a[128]; //best aligned impl
+- char impl_u[128]; //best unaligned impl
++typedef struct volk_arch_pref {
++ char name[128]; // name of the kernel
++ char impl_a[128]; // best aligned impl
++ char impl_u[128]; // best unaligned impl
+ } volk_arch_pref_t;
+
+ ////////////////////////////////////////////////////////////////////////
+@@ -19,13 +18,13 @@ typedef struct volk_arch_pref
+ // if config file should be tested on existence for reading.
+ // returns \0 in the argument on failure.
+ ////////////////////////////////////////////////////////////////////////
+-VOLK_API void volk_get_config_path(char *, bool);
++VOLK_API void volk_get_config_path(char*, bool);
+
+ ////////////////////////////////////////////////////////////////////////
+ // load prefs into global prefs struct
+ ////////////////////////////////////////////////////////////////////////
+-VOLK_API size_t volk_load_preferences(volk_arch_pref_t **);
++VOLK_API size_t volk_load_preferences(volk_arch_pref_t**);
+
+ __VOLK_DECL_END
+
+-#endif //INCLUDED_VOLK_PREFS_H
++#endif // INCLUDED_VOLK_PREFS_H
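/* Editor's note: a usage sketch for the preferences API declared above. That the
 * returned size_t is the number of loaded entries is an assumption drawn from the
 * signature, and ownership of the returned array is not specified here -- treat
 * both as hypothetical. */
#include <stdio.h>
#include <volk/volk_prefs.h>

void example_print_prefs(void)
{
    volk_arch_pref_t* prefs = NULL;
    size_t n = volk_load_preferences(&prefs);
    for (size_t i = 0; i < n; ++i) {
        printf("%s: aligned=%s unaligned=%s\n",
               prefs[i].name, prefs[i].impl_a, prefs[i].impl_u);
    }
}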
+diff --git a/include/volk/volk_sse3_intrinsics.h b/include/volk/volk_sse3_intrinsics.h
+index 6b53a2a..6bdc8d8 100644
+--- a/include/volk/volk_sse3_intrinsics.h
++++ b/include/volk/volk_sse3_intrinsics.h
+@@ -1,19 +1,19 @@
+ /* -*- c++ -*- */
+-/*
++/*
+ * Copyright 2015 Free Software Foundation, Inc.
+- *
++ *
+ * This file is part of GNU Radio
+- *
++ *
+ * GNU Radio is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+- *
++ *
+ * GNU Radio is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+- *
++ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU Radio; see the file COPYING. If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street,
+@@ -29,49 +29,52 @@
+ #define INCLUDE_VOLK_VOLK_SSE3_INTRINSICS_H_
+ #include <pmmintrin.h>
+
+-static inline __m128
+-_mm_complexmul_ps(__m128 x, __m128 y)
++static inline __m128 _mm_complexmul_ps(__m128 x, __m128 y)
+ {
+- __m128 yl, yh, tmp1, tmp2;
+- yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr
+- yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di
+- tmp1 = _mm_mul_ps(x, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
+- x = _mm_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br
+- tmp2 = _mm_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
+- return _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
++ __m128 yl, yh, tmp1, tmp2;
++ yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr
++ yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di
++ tmp1 = _mm_mul_ps(x, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
++ x = _mm_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br
++ tmp2 = _mm_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
++ return _mm_addsub_ps(tmp1,
++ tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
+ }
+
+-static inline __m128
+-_mm_complexconjugatemul_ps(__m128 x, __m128 y)
++static inline __m128 _mm_complexconjugatemul_ps(__m128 x, __m128 y)
+ {
+- const __m128 conjugator = _mm_setr_ps(0, -0.f, 0, -0.f);
+- y = _mm_xor_ps(y, conjugator); // conjugate y
+- return _mm_complexmul_ps(x, y);
++ const __m128 conjugator = _mm_setr_ps(0, -0.f, 0, -0.f);
++ y = _mm_xor_ps(y, conjugator); // conjugate y
++ return _mm_complexmul_ps(x, y);
+ }
+
+-static inline __m128
+-_mm_magnitudesquared_ps_sse3(__m128 cplxValue1, __m128 cplxValue2){
+- cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values
+- cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values
+- return _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values
++static inline __m128 _mm_magnitudesquared_ps_sse3(__m128 cplxValue1, __m128 cplxValue2)
++{
++ cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values
++ cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values
++ return _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values
+ }
+
+-static inline __m128
+-_mm_magnitude_ps_sse3(__m128 cplxValue1, __m128 cplxValue2){
+- return _mm_sqrt_ps(_mm_magnitudesquared_ps_sse3(cplxValue1, cplxValue2));
++static inline __m128 _mm_magnitude_ps_sse3(__m128 cplxValue1, __m128 cplxValue2)
++{
++ return _mm_sqrt_ps(_mm_magnitudesquared_ps_sse3(cplxValue1, cplxValue2));
+ }
+
+-static inline __m128
+-_mm_scaled_norm_dist_ps_sse3(const __m128 symbols0, const __m128 symbols1, const __m128 points0, const __m128 points1, const __m128 scalar){
+- /*
+- * Calculate: |y - x|^2 * SNR_lin
+- * Consider 'symbolsX' and 'pointsX' to be complex float
+- * 'symbolsX' are 'y' and 'pointsX' are 'x'
+- */
+- const __m128 diff0 = _mm_sub_ps(symbols0, points0);
+- const __m128 diff1 = _mm_sub_ps(symbols1, points1);
+- const __m128 norms = _mm_magnitudesquared_ps_sse3(diff0, diff1);
+- return _mm_mul_ps(norms, scalar);
++static inline __m128 _mm_scaled_norm_dist_ps_sse3(const __m128 symbols0,
++ const __m128 symbols1,
++ const __m128 points0,
++ const __m128 points1,
++ const __m128 scalar)
++{
++ /*
++ * Calculate: |y - x|^2 * SNR_lin
++ * Consider 'symbolsX' and 'pointsX' to be complex float
++ * 'symbolsX' are 'y' and 'pointsX' are 'x'
++ */
++ const __m128 diff0 = _mm_sub_ps(symbols0, points0);
++ const __m128 diff1 = _mm_sub_ps(symbols1, points1);
++ const __m128 norms = _mm_magnitudesquared_ps_sse3(diff0, diff1);
++ return _mm_mul_ps(norms, scalar);
+ }
+
+ #endif /* INCLUDE_VOLK_VOLK_SSE3_INTRINSICS_H_ */
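/* Editor's note: scalar counterpart of _mm_complexconjugatemul_ps above --
 * multiplying x by the conjugate of y flips the sign of y's imaginary part before
 * the usual complex product. Hypothetical helper, for illustration only. */
static inline void complexconjugatemul_scalar(float xr, float xi, float yr, float yi,
                                              float* out_re, float* out_im)
{
    *out_re = xr * yr + xi * yi; /* Re(x * conj(y)) */
    *out_im = xi * yr - xr * yi; /* Im(x * conj(y)) */
}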
+diff --git a/include/volk/volk_sse_intrinsics.h b/include/volk/volk_sse_intrinsics.h
+index 57318e2..24fe7c1 100644
+--- a/include/volk/volk_sse_intrinsics.h
++++ b/include/volk/volk_sse_intrinsics.h
+@@ -1,19 +1,19 @@
+ /* -*- c++ -*- */
+-/*
++/*
+ * Copyright 2015 Free Software Foundation, Inc.
+- *
++ *
+ * This file is part of GNU Radio
+- *
++ *
+ * GNU Radio is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+- *
++ *
+ * GNU Radio is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+- *
++ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU Radio; see the file COPYING. If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street,
+@@ -29,31 +29,34 @@
+ #define INCLUDE_VOLK_VOLK_SSE_INTRINSICS_H_
+ #include <xmmintrin.h>
+
+-static inline __m128
+-_mm_magnitudesquared_ps(__m128 cplxValue1, __m128 cplxValue2){
+- __m128 iValue, qValue;
+- // Arrange in i1i2i3i4 format
+- iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0));
+- // Arrange in q1q2q3q4 format
+- qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1));
+- iValue = _mm_mul_ps(iValue, iValue); // Square the I values
+- qValue = _mm_mul_ps(qValue, qValue); // Square the Q Values
+- return _mm_add_ps(iValue, qValue); // Add the I2 and Q2 values
++static inline __m128 _mm_magnitudesquared_ps(__m128 cplxValue1, __m128 cplxValue2)
++{
++ __m128 iValue, qValue;
++ // Arrange in i1i2i3i4 format
++ iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2, 0, 2, 0));
++ // Arrange in q1q2q3q4 format
++ qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3, 1, 3, 1));
++ iValue = _mm_mul_ps(iValue, iValue); // Square the I values
++ qValue = _mm_mul_ps(qValue, qValue); // Square the Q Values
++ return _mm_add_ps(iValue, qValue); // Add the I2 and Q2 values
+ }
+
+-static inline __m128
+-_mm_magnitude_ps(__m128 cplxValue1, __m128 cplxValue2){
+- return _mm_sqrt_ps(_mm_magnitudesquared_ps(cplxValue1, cplxValue2));
++static inline __m128 _mm_magnitude_ps(__m128 cplxValue1, __m128 cplxValue2)
++{
++ return _mm_sqrt_ps(_mm_magnitudesquared_ps(cplxValue1, cplxValue2));
+ }
+
+-static inline __m128
+-_mm_scaled_norm_dist_ps_sse(const __m128 symbols0, const __m128 symbols1, const __m128 points0, const __m128 points1, const __m128 scalar)
++static inline __m128 _mm_scaled_norm_dist_ps_sse(const __m128 symbols0,
++ const __m128 symbols1,
++ const __m128 points0,
++ const __m128 points1,
++ const __m128 scalar)
+ {
+- // calculate scalar * |x - y|^2
+- const __m128 diff0 = _mm_sub_ps(symbols0, points0);
+- const __m128 diff1 = _mm_sub_ps(symbols1, points1);
+- const __m128 norms = _mm_magnitudesquared_ps(diff0, diff1);
+- return _mm_mul_ps(norms, scalar);
++ // calculate scalar * |x - y|^2
++ const __m128 diff0 = _mm_sub_ps(symbols0, points0);
++ const __m128 diff1 = _mm_sub_ps(symbols1, points1);
++ const __m128 norms = _mm_magnitudesquared_ps(diff0, diff1);
++ return _mm_mul_ps(norms, scalar);
+ }
+
+ #endif /* INCLUDE_VOLK_VOLK_SSE_INTRINSICS_H_ */
+diff --git a/kernels/volk/volk_16i_32fc_dot_prod_32fc.h b/kernels/volk/volk_16i_32fc_dot_prod_32fc.h
+index f250340..2635649 100644
+--- a/kernels/volk/volk_16i_32fc_dot_prod_32fc.h
++++ b/kernels/volk/volk_16i_32fc_dot_prod_32fc.h
+@@ -33,8 +33,8 @@
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+- * void volk_16i_32fc_dot_prod_32fc(lv_32fc_t* result, const short* input, const lv_32fc_t * taps, unsigned int num_points)
+- * \endcode
++ * void volk_16i_32fc_dot_prod_32fc(lv_32fc_t* result, const short* input, const lv_32fc_t
++ * * taps, unsigned int num_points) \endcode
+ *
+ * \b Inputs
+ * \li input: vector of shorts.
+@@ -58,165 +58,178 @@
+ #ifndef INCLUDED_volk_16i_32fc_dot_prod_32fc_H
+ #define INCLUDED_volk_16i_32fc_dot_prod_32fc_H
+
+-#include <volk/volk_common.h>
+ #include <stdio.h>
++#include <volk/volk_common.h>
+
+
+ #ifdef LV_HAVE_GENERIC
+
+-static inline void volk_16i_32fc_dot_prod_32fc_generic(lv_32fc_t* result, const short* input, const lv_32fc_t * taps, unsigned int num_points) {
++static inline void volk_16i_32fc_dot_prod_32fc_generic(lv_32fc_t* result,
++ const short* input,
++ const lv_32fc_t* taps,
++ unsigned int num_points)
++{
+
+- static const int N_UNROLL = 4;
++ static const int N_UNROLL = 4;
+
+- lv_32fc_t acc0 = 0;
+- lv_32fc_t acc1 = 0;
+- lv_32fc_t acc2 = 0;
+- lv_32fc_t acc3 = 0;
++ lv_32fc_t acc0 = 0;
++ lv_32fc_t acc1 = 0;
++ lv_32fc_t acc2 = 0;
++ lv_32fc_t acc3 = 0;
+
+- unsigned i = 0;
+- unsigned n = (num_points / N_UNROLL) * N_UNROLL;
++ unsigned i = 0;
++ unsigned n = (num_points / N_UNROLL) * N_UNROLL;
+
+- for(i = 0; i < n; i += N_UNROLL) {
+- acc0 += taps[i + 0] * (float)input[i + 0];
+- acc1 += taps[i + 1] * (float)input[i + 1];
+- acc2 += taps[i + 2] * (float)input[i + 2];
+- acc3 += taps[i + 3] * (float)input[i + 3];
+- }
++ for (i = 0; i < n; i += N_UNROLL) {
++ acc0 += taps[i + 0] * (float)input[i + 0];
++ acc1 += taps[i + 1] * (float)input[i + 1];
++ acc2 += taps[i + 2] * (float)input[i + 2];
++ acc3 += taps[i + 3] * (float)input[i + 3];
++ }
+
+- for(; i < num_points; i++) {
+- acc0 += taps[i] * (float)input[i];
+- }
++ for (; i < num_points; i++) {
++ acc0 += taps[i] * (float)input[i];
++ }
+
+- *result = acc0 + acc1 + acc2 + acc3;
++ *result = acc0 + acc1 + acc2 + acc3;
+ }
+
+ #endif /*LV_HAVE_GENERIC*/
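/* Editor's note: a calling sketch for the dispatcher documented at the top of this
 * file; it accumulates taps[i] * (float)input[i] over num_points samples. Including
 * <volk/volk.h> for the dispatcher prototype is assumed, and the buffers are
 * placeholders supplied by the caller. */
#include <volk/volk.h>

void example_dot_prod(const short* input, const lv_32fc_t* taps, unsigned int num_points)
{
    lv_32fc_t result;
    volk_16i_32fc_dot_prod_32fc(&result, input, taps, num_points);
    /* result now holds the complex dot product of taps with the short-int input */
}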
+
+ #ifdef LV_HAVE_NEON
+ #include <arm_neon.h>
+-static inline void volk_16i_32fc_dot_prod_32fc_neon(lv_32fc_t* result, const short* input, const lv_32fc_t * taps, unsigned int num_points) {
+-
+- unsigned ii;
+- unsigned quarter_points = num_points / 4;
+- lv_32fc_t* tapsPtr = (lv_32fc_t*) taps;
+- short* inputPtr = (short*) input;
+- lv_32fc_t accumulator_vec[4];
+-
+- float32x4x2_t tapsVal, accumulator_val;
+- int16x4_t input16;
+- int32x4_t input32;
+- float32x4_t input_float, prod_re, prod_im;
+-
+- accumulator_val.val[0] = vdupq_n_f32(0.0);
+- accumulator_val.val[1] = vdupq_n_f32(0.0);
+-
+- for(ii = 0; ii < quarter_points; ++ii) {
+- tapsVal = vld2q_f32((float*)tapsPtr);
+- input16 = vld1_s16(inputPtr);
+- // widen 16-bit int to 32-bit int
+- input32 = vmovl_s16(input16);
+- // convert 32-bit int to float with scale
+- input_float = vcvtq_f32_s32(input32);
+-
+- prod_re = vmulq_f32(input_float, tapsVal.val[0]);
+- prod_im = vmulq_f32(input_float, tapsVal.val[1]);
+-
+- accumulator_val.val[0] = vaddq_f32(prod_re, accumulator_val.val[0]);
+- accumulator_val.val[1] = vaddq_f32(prod_im, accumulator_val.val[1]);
+-
+- tapsPtr += 4;
+- inputPtr += 4;
+- }
+- vst2q_f32((float*)accumulator_vec, accumulator_val);
+- accumulator_vec[0] += accumulator_vec[1];
+- accumulator_vec[2] += accumulator_vec[3];
+- accumulator_vec[0] += accumulator_vec[2];
+-
+- for(ii = quarter_points * 4; ii < num_points; ++ii) {
+- accumulator_vec[0] += *(tapsPtr++) * (float)(*(inputPtr++));
+- }
+-
+- *result = accumulator_vec[0];
++static inline void volk_16i_32fc_dot_prod_32fc_neon(lv_32fc_t* result,
++ const short* input,
++ const lv_32fc_t* taps,
++ unsigned int num_points)
++{
++
++ unsigned ii;
++ unsigned quarter_points = num_points / 4;
++ lv_32fc_t* tapsPtr = (lv_32fc_t*)taps;
++ short* inputPtr = (short*)input;
++ lv_32fc_t accumulator_vec[4];
++
++ float32x4x2_t tapsVal, accumulator_val;
++ int16x4_t input16;
++ int32x4_t input32;
++ float32x4_t input_float, prod_re, prod_im;
++
++ accumulator_val.val[0] = vdupq_n_f32(0.0);
++ accumulator_val.val[1] = vdupq_n_f32(0.0);
++
++ for (ii = 0; ii < quarter_points; ++ii) {
++ tapsVal = vld2q_f32((float*)tapsPtr);
++ input16 = vld1_s16(inputPtr);
++ // widen 16-bit int to 32-bit int
++ input32 = vmovl_s16(input16);
++ // convert 32-bit int to float with scale
++ input_float = vcvtq_f32_s32(input32);
++
++ prod_re = vmulq_f32(input_float, tapsVal.val[0]);
++ prod_im = vmulq_f32(input_float, tapsVal.val[1]);
++
++ accumulator_val.val[0] = vaddq_f32(prod_re, accumulator_val.val[0]);
++ accumulator_val.val[1] = vaddq_f32(prod_im, accumulator_val.val[1]);
++
++ tapsPtr += 4;
++ inputPtr += 4;
++ }
++ vst2q_f32((float*)accumulator_vec, accumulator_val);
++ accumulator_vec[0] += accumulator_vec[1];
++ accumulator_vec[2] += accumulator_vec[3];
++ accumulator_vec[0] += accumulator_vec[2];
++
++ for (ii = quarter_points * 4; ii < num_points; ++ii) {
++ accumulator_vec[0] += *(tapsPtr++) * (float)(*(inputPtr++));
++ }
++
++ *result = accumulator_vec[0];
+ }
+
+ #endif /*LV_HAVE_NEON*/
+
+ #if LV_HAVE_SSE && LV_HAVE_MMX
+
+-static inline void volk_16i_32fc_dot_prod_32fc_u_sse( lv_32fc_t* result, const short* input, const lv_32fc_t* taps, unsigned int num_points) {
+-
+- unsigned int number = 0;
+- const unsigned int sixteenthPoints = num_points / 8;
+-
+- float res[2];
+- float *realpt = &res[0], *imagpt = &res[1];
+- const short* aPtr = input;
+- const float* bPtr = (float*)taps;
+-
+- __m64 m0, m1;
+- __m128 f0, f1, f2, f3;
+- __m128 a0Val, a1Val, a2Val, a3Val;
+- __m128 b0Val, b1Val, b2Val, b3Val;
+- __m128 c0Val, c1Val, c2Val, c3Val;
+-
+- __m128 dotProdVal0 = _mm_setzero_ps();
+- __m128 dotProdVal1 = _mm_setzero_ps();
+- __m128 dotProdVal2 = _mm_setzero_ps();
+- __m128 dotProdVal3 = _mm_setzero_ps();
+-
+- for(;number < sixteenthPoints; number++){
+-
+- m0 = _mm_set_pi16(*(aPtr+3), *(aPtr+2), *(aPtr+1), *(aPtr+0));
+- m1 = _mm_set_pi16(*(aPtr+7), *(aPtr+6), *(aPtr+5), *(aPtr+4));
+- f0 = _mm_cvtpi16_ps(m0);
+- f1 = _mm_cvtpi16_ps(m0);
+- f2 = _mm_cvtpi16_ps(m1);
+- f3 = _mm_cvtpi16_ps(m1);
+-
+- a0Val = _mm_unpacklo_ps(f0, f1);
+- a1Val = _mm_unpackhi_ps(f0, f1);
+- a2Val = _mm_unpacklo_ps(f2, f3);
+- a3Val = _mm_unpackhi_ps(f2, f3);
+-
+- b0Val = _mm_loadu_ps(bPtr);
+- b1Val = _mm_loadu_ps(bPtr+4);
+- b2Val = _mm_loadu_ps(bPtr+8);
+- b3Val = _mm_loadu_ps(bPtr+12);
+-
+- c0Val = _mm_mul_ps(a0Val, b0Val);
+- c1Val = _mm_mul_ps(a1Val, b1Val);
+- c2Val = _mm_mul_ps(a2Val, b2Val);
+- c3Val = _mm_mul_ps(a3Val, b3Val);
+-
+- dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
+- dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
+- dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
+- dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
+-
+- aPtr += 8;
+- bPtr += 16;
+- }
+-
+- dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
+- dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
+- dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
+-
+- __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
+-
+- _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
+-
+- *realpt = dotProductVector[0];
+- *imagpt = dotProductVector[1];
+- *realpt += dotProductVector[2];
+- *imagpt += dotProductVector[3];
+-
+- number = sixteenthPoints*8;
+- for(;number < num_points; number++){
+- *realpt += ((*aPtr) * (*bPtr++));
+- *imagpt += ((*aPtr++) * (*bPtr++));
+- }
+-
+- *result = *(lv_32fc_t*)(&res[0]);
++static inline void volk_16i_32fc_dot_prod_32fc_u_sse(lv_32fc_t* result,
++ const short* input,
++ const lv_32fc_t* taps,
++ unsigned int num_points)
++{
++
++ unsigned int number = 0;
++ const unsigned int sixteenthPoints = num_points / 8;
++
++ float res[2];
++ float *realpt = &res[0], *imagpt = &res[1];
++ const short* aPtr = input;
++ const float* bPtr = (float*)taps;
++
++ __m64 m0, m1;
++ __m128 f0, f1, f2, f3;
++ __m128 a0Val, a1Val, a2Val, a3Val;
++ __m128 b0Val, b1Val, b2Val, b3Val;
++ __m128 c0Val, c1Val, c2Val, c3Val;
++
++ __m128 dotProdVal0 = _mm_setzero_ps();
++ __m128 dotProdVal1 = _mm_setzero_ps();
++ __m128 dotProdVal2 = _mm_setzero_ps();
++ __m128 dotProdVal3 = _mm_setzero_ps();
++
++ for (; number < sixteenthPoints; number++) {
++
++ m0 = _mm_set_pi16(*(aPtr + 3), *(aPtr + 2), *(aPtr + 1), *(aPtr + 0));
++ m1 = _mm_set_pi16(*(aPtr + 7), *(aPtr + 6), *(aPtr + 5), *(aPtr + 4));
++ f0 = _mm_cvtpi16_ps(m0);
++ f1 = _mm_cvtpi16_ps(m0);
++ f2 = _mm_cvtpi16_ps(m1);
++ f3 = _mm_cvtpi16_ps(m1);
++
++ a0Val = _mm_unpacklo_ps(f0, f1);
++ a1Val = _mm_unpackhi_ps(f0, f1);
++ a2Val = _mm_unpacklo_ps(f2, f3);
++ a3Val = _mm_unpackhi_ps(f2, f3);
++
++ b0Val = _mm_loadu_ps(bPtr);
++ b1Val = _mm_loadu_ps(bPtr + 4);
++ b2Val = _mm_loadu_ps(bPtr + 8);
++ b3Val = _mm_loadu_ps(bPtr + 12);
++
++ c0Val = _mm_mul_ps(a0Val, b0Val);
++ c1Val = _mm_mul_ps(a1Val, b1Val);
++ c2Val = _mm_mul_ps(a2Val, b2Val);
++ c3Val = _mm_mul_ps(a3Val, b3Val);
++
++ dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
++ dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
++ dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
++ dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
++
++ aPtr += 8;
++ bPtr += 16;
++ }
++
++ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
++ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
++ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
++
++ __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
++
++ _mm_store_ps(dotProductVector,
++ dotProdVal0); // Store the results back into the dot product vector
++
++ *realpt = dotProductVector[0];
++ *imagpt = dotProductVector[1];
++ *realpt += dotProductVector[2];
++ *imagpt += dotProductVector[3];
++
++ number = sixteenthPoints * 8;
++ for (; number < num_points; number++) {
++ *realpt += ((*aPtr) * (*bPtr++));
++ *imagpt += ((*aPtr++) * (*bPtr++));
++ }
++
++ *result = *(lv_32fc_t*)(&res[0]);
+ }
+
+ #endif /*LV_HAVE_SSE && LV_HAVE_MMX*/
+@@ -224,85 +237,90 @@ static inline void volk_16i_32fc_dot_prod_32fc_u_sse( lv_32fc_t* result, const
+
+ #if LV_HAVE_AVX2 && LV_HAVE_FMA
+
+-static inline void volk_16i_32fc_dot_prod_32fc_u_avx2_fma( lv_32fc_t* result, const short* input, const lv_32fc_t* taps, unsigned int num_points) {
+-
+- unsigned int number = 0;
+- const unsigned int sixteenthPoints = num_points / 16;
+-
+- float res[2];
+- float *realpt = &res[0], *imagpt = &res[1];
+- const short* aPtr = input;
+- const float* bPtr = (float*)taps;
+-
+- __m128i m0, m1;
+- __m256i f0, f1;
+- __m256 g0, g1, h0, h1, h2, h3;
+- __m256 a0Val, a1Val, a2Val, a3Val;
+- __m256 b0Val, b1Val, b2Val, b3Val;
+-
+- __m256 dotProdVal0 = _mm256_setzero_ps();
+- __m256 dotProdVal1 = _mm256_setzero_ps();
+- __m256 dotProdVal2 = _mm256_setzero_ps();
+- __m256 dotProdVal3 = _mm256_setzero_ps();
+-
+- for(;number < sixteenthPoints; number++){
+-
+- m0 = _mm_loadu_si128((__m128i const*) aPtr);
+- m1 = _mm_loadu_si128((__m128i const*)(aPtr+8));
+-
+- f0 = _mm256_cvtepi16_epi32(m0);
+- g0 = _mm256_cvtepi32_ps(f0);
+- f1 = _mm256_cvtepi16_epi32(m1);
+- g1 = _mm256_cvtepi32_ps(f1);
+-
+- h0 = _mm256_unpacklo_ps(g0, g0);
+- h1 = _mm256_unpackhi_ps(g0, g0);
+- h2 = _mm256_unpacklo_ps(g1, g1);
+- h3 = _mm256_unpackhi_ps(g1, g1);
+-
+- a0Val = _mm256_permute2f128_ps(h0, h1, 0x20);
+- a1Val = _mm256_permute2f128_ps(h0, h1, 0x31);
+- a2Val = _mm256_permute2f128_ps(h2, h3, 0x20);
+- a3Val = _mm256_permute2f128_ps(h2, h3, 0x31);
+-
+- b0Val = _mm256_loadu_ps(bPtr);
+- b1Val = _mm256_loadu_ps(bPtr+8);
+- b2Val = _mm256_loadu_ps(bPtr+16);
+- b3Val = _mm256_loadu_ps(bPtr+24);
+-
+- dotProdVal0 = _mm256_fmadd_ps(a0Val,b0Val,dotProdVal0);
+- dotProdVal1 = _mm256_fmadd_ps(a1Val,b1Val,dotProdVal1);
+- dotProdVal2 = _mm256_fmadd_ps(a2Val,b2Val,dotProdVal2);
+- dotProdVal3 = _mm256_fmadd_ps(a3Val,b3Val,dotProdVal3);
+-
+- aPtr += 16;
+- bPtr += 32;
+- }
+-
+- dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
+- dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
+- dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
+-
+- __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
+-
+- _mm256_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
+-
+- *realpt = dotProductVector[0];
+- *imagpt = dotProductVector[1];
+- *realpt += dotProductVector[2];
+- *imagpt += dotProductVector[3];
+- *realpt += dotProductVector[4];
+- *imagpt += dotProductVector[5];
+- *realpt += dotProductVector[6];
+- *imagpt += dotProductVector[7];
+-
+- number = sixteenthPoints*16;
+- for(;number < num_points; number++){
+- *realpt += ((*aPtr) * (*bPtr++));
+- *imagpt += ((*aPtr++) * (*bPtr++));
+- }
+-
+- *result = *(lv_32fc_t*)(&res[0]);
++static inline void volk_16i_32fc_dot_prod_32fc_u_avx2_fma(lv_32fc_t* result,
++ const short* input,
++ const lv_32fc_t* taps,
++ unsigned int num_points)
++{
++
++ unsigned int number = 0;
++ const unsigned int sixteenthPoints = num_points / 16;
++
++ float res[2];
++ float *realpt = &res[0], *imagpt = &res[1];
++ const short* aPtr = input;
++ const float* bPtr = (float*)taps;
++
++ __m128i m0, m1;
++ __m256i f0, f1;
++ __m256 g0, g1, h0, h1, h2, h3;
++ __m256 a0Val, a1Val, a2Val, a3Val;
++ __m256 b0Val, b1Val, b2Val, b3Val;
++
++ __m256 dotProdVal0 = _mm256_setzero_ps();
++ __m256 dotProdVal1 = _mm256_setzero_ps();
++ __m256 dotProdVal2 = _mm256_setzero_ps();
++ __m256 dotProdVal3 = _mm256_setzero_ps();
++
++ for (; number < sixteenthPoints; number++) {
++
++ m0 = _mm_loadu_si128((__m128i const*)aPtr);
++ m1 = _mm_loadu_si128((__m128i const*)(aPtr + 8));
++
++ f0 = _mm256_cvtepi16_epi32(m0);
++ g0 = _mm256_cvtepi32_ps(f0);
++ f1 = _mm256_cvtepi16_epi32(m1);
++ g1 = _mm256_cvtepi32_ps(f1);
++
++ h0 = _mm256_unpacklo_ps(g0, g0);
++ h1 = _mm256_unpackhi_ps(g0, g0);
++ h2 = _mm256_unpacklo_ps(g1, g1);
++ h3 = _mm256_unpackhi_ps(g1, g1);
++
++ a0Val = _mm256_permute2f128_ps(h0, h1, 0x20);
++ a1Val = _mm256_permute2f128_ps(h0, h1, 0x31);
++ a2Val = _mm256_permute2f128_ps(h2, h3, 0x20);
++ a3Val = _mm256_permute2f128_ps(h2, h3, 0x31);
++
++ b0Val = _mm256_loadu_ps(bPtr);
++ b1Val = _mm256_loadu_ps(bPtr + 8);
++ b2Val = _mm256_loadu_ps(bPtr + 16);
++ b3Val = _mm256_loadu_ps(bPtr + 24);
++
++ dotProdVal0 = _mm256_fmadd_ps(a0Val, b0Val, dotProdVal0);
++ dotProdVal1 = _mm256_fmadd_ps(a1Val, b1Val, dotProdVal1);
++ dotProdVal2 = _mm256_fmadd_ps(a2Val, b2Val, dotProdVal2);
++ dotProdVal3 = _mm256_fmadd_ps(a3Val, b3Val, dotProdVal3);
++
++ aPtr += 16;
++ bPtr += 32;
++ }
++
++ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
++ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
++ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
++
++ __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
++
++ _mm256_store_ps(dotProductVector,
++ dotProdVal0); // Store the results back into the dot product vector
++
++ *realpt = dotProductVector[0];
++ *imagpt = dotProductVector[1];
++ *realpt += dotProductVector[2];
++ *imagpt += dotProductVector[3];
++ *realpt += dotProductVector[4];
++ *imagpt += dotProductVector[5];
++ *realpt += dotProductVector[6];
++ *imagpt += dotProductVector[7];
++
++ number = sixteenthPoints * 16;
++ for (; number < num_points; number++) {
++ *realpt += ((*aPtr) * (*bPtr++));
++ *imagpt += ((*aPtr++) * (*bPtr++));
++ }
++
++ *result = *(lv_32fc_t*)(&res[0]);
+ }
+
+ #endif /*LV_HAVE_AVX2 && LV_HAVE_FMA*/
+@@ -310,91 +328,96 @@ static inline void volk_16i_32fc_dot_prod_32fc_u_avx2_fma( lv_32fc_t* result, co
+
+ #ifdef LV_HAVE_AVX2
+
+-static inline void volk_16i_32fc_dot_prod_32fc_u_avx2( lv_32fc_t* result, const short* input, const lv_32fc_t* taps, unsigned int num_points) {
+-
+- unsigned int number = 0;
+- const unsigned int sixteenthPoints = num_points / 16;
+-
+- float res[2];
+- float *realpt = &res[0], *imagpt = &res[1];
+- const short* aPtr = input;
+- const float* bPtr = (float*)taps;
+-
+- __m128i m0, m1;
+- __m256i f0, f1;
+- __m256 g0, g1, h0, h1, h2, h3;
+- __m256 a0Val, a1Val, a2Val, a3Val;
+- __m256 b0Val, b1Val, b2Val, b3Val;
+- __m256 c0Val, c1Val, c2Val, c3Val;
+-
+- __m256 dotProdVal0 = _mm256_setzero_ps();
+- __m256 dotProdVal1 = _mm256_setzero_ps();
+- __m256 dotProdVal2 = _mm256_setzero_ps();
+- __m256 dotProdVal3 = _mm256_setzero_ps();
+-
+- for(;number < sixteenthPoints; number++){
+-
+- m0 = _mm_loadu_si128((__m128i const*) aPtr);
+- m1 = _mm_loadu_si128((__m128i const*)(aPtr+8));
+-
+- f0 = _mm256_cvtepi16_epi32(m0);
+- g0 = _mm256_cvtepi32_ps(f0);
+- f1 = _mm256_cvtepi16_epi32(m1);
+- g1 = _mm256_cvtepi32_ps(f1);
+-
+- h0 = _mm256_unpacklo_ps(g0, g0);
+- h1 = _mm256_unpackhi_ps(g0, g0);
+- h2 = _mm256_unpacklo_ps(g1, g1);
+- h3 = _mm256_unpackhi_ps(g1, g1);
+-
+- a0Val = _mm256_permute2f128_ps(h0, h1, 0x20);
+- a1Val = _mm256_permute2f128_ps(h0, h1, 0x31);
+- a2Val = _mm256_permute2f128_ps(h2, h3, 0x20);
+- a3Val = _mm256_permute2f128_ps(h2, h3, 0x31);
+-
+- b0Val = _mm256_loadu_ps(bPtr);
+- b1Val = _mm256_loadu_ps(bPtr+8);
+- b2Val = _mm256_loadu_ps(bPtr+16);
+- b3Val = _mm256_loadu_ps(bPtr+24);
+-
+- c0Val = _mm256_mul_ps(a0Val, b0Val);
+- c1Val = _mm256_mul_ps(a1Val, b1Val);
+- c2Val = _mm256_mul_ps(a2Val, b2Val);
+- c3Val = _mm256_mul_ps(a3Val, b3Val);
+-
+- dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
+- dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);
+- dotProdVal2 = _mm256_add_ps(c2Val, dotProdVal2);
+- dotProdVal3 = _mm256_add_ps(c3Val, dotProdVal3);
+-
+- aPtr += 16;
+- bPtr += 32;
+- }
+-
+- dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
+- dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
+- dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
+-
+- __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
+-
+- _mm256_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
+-
+- *realpt = dotProductVector[0];
+- *imagpt = dotProductVector[1];
+- *realpt += dotProductVector[2];
+- *imagpt += dotProductVector[3];
+- *realpt += dotProductVector[4];
+- *imagpt += dotProductVector[5];
+- *realpt += dotProductVector[6];
+- *imagpt += dotProductVector[7];
+-
+- number = sixteenthPoints*16;
+- for(;number < num_points; number++){
+- *realpt += ((*aPtr) * (*bPtr++));
+- *imagpt += ((*aPtr++) * (*bPtr++));
+- }
+-
+- *result = *(lv_32fc_t*)(&res[0]);
++static inline void volk_16i_32fc_dot_prod_32fc_u_avx2(lv_32fc_t* result,
++ const short* input,
++ const lv_32fc_t* taps,
++ unsigned int num_points)
++{
++
++ unsigned int number = 0;
++ const unsigned int sixteenthPoints = num_points / 16;
++
++ float res[2];
++ float *realpt = &res[0], *imagpt = &res[1];
++ const short* aPtr = input;
++ const float* bPtr = (float*)taps;
++
++ __m128i m0, m1;
++ __m256i f0, f1;
++ __m256 g0, g1, h0, h1, h2, h3;
++ __m256 a0Val, a1Val, a2Val, a3Val;
++ __m256 b0Val, b1Val, b2Val, b3Val;
++ __m256 c0Val, c1Val, c2Val, c3Val;
++
++ __m256 dotProdVal0 = _mm256_setzero_ps();
++ __m256 dotProdVal1 = _mm256_setzero_ps();
++ __m256 dotProdVal2 = _mm256_setzero_ps();
++ __m256 dotProdVal3 = _mm256_setzero_ps();
++
++ for (; number < sixteenthPoints; number++) {
++
++ m0 = _mm_loadu_si128((__m128i const*)aPtr);
++ m1 = _mm_loadu_si128((__m128i const*)(aPtr + 8));
++
++ f0 = _mm256_cvtepi16_epi32(m0);
++ g0 = _mm256_cvtepi32_ps(f0);
++ f1 = _mm256_cvtepi16_epi32(m1);
++ g1 = _mm256_cvtepi32_ps(f1);
++
++ h0 = _mm256_unpacklo_ps(g0, g0);
++ h1 = _mm256_unpackhi_ps(g0, g0);
++ h2 = _mm256_unpacklo_ps(g1, g1);
++ h3 = _mm256_unpackhi_ps(g1, g1);
++
++ a0Val = _mm256_permute2f128_ps(h0, h1, 0x20);
++ a1Val = _mm256_permute2f128_ps(h0, h1, 0x31);
++ a2Val = _mm256_permute2f128_ps(h2, h3, 0x20);
++ a3Val = _mm256_permute2f128_ps(h2, h3, 0x31);
++
++ b0Val = _mm256_loadu_ps(bPtr);
++ b1Val = _mm256_loadu_ps(bPtr + 8);
++ b2Val = _mm256_loadu_ps(bPtr + 16);
++ b3Val = _mm256_loadu_ps(bPtr + 24);
++
++ c0Val = _mm256_mul_ps(a0Val, b0Val);
++ c1Val = _mm256_mul_ps(a1Val, b1Val);
++ c2Val = _mm256_mul_ps(a2Val, b2Val);
++ c3Val = _mm256_mul_ps(a3Val, b3Val);
++
++ dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
++ dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);
++ dotProdVal2 = _mm256_add_ps(c2Val, dotProdVal2);
++ dotProdVal3 = _mm256_add_ps(c3Val, dotProdVal3);
++
++ aPtr += 16;
++ bPtr += 32;
++ }
++
++ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
++ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
++ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
++
++ __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
++
++ _mm256_store_ps(dotProductVector,
++ dotProdVal0); // Store the results back into the dot product vector
++
++ *realpt = dotProductVector[0];
++ *imagpt = dotProductVector[1];
++ *realpt += dotProductVector[2];
++ *imagpt += dotProductVector[3];
++ *realpt += dotProductVector[4];
++ *imagpt += dotProductVector[5];
++ *realpt += dotProductVector[6];
++ *imagpt += dotProductVector[7];
++
++ number = sixteenthPoints * 16;
++ for (; number < num_points; number++) {
++ *realpt += ((*aPtr) * (*bPtr++));
++ *imagpt += ((*aPtr++) * (*bPtr++));
++ }
++
++ *result = *(lv_32fc_t*)(&res[0]);
+ }
+
+ #endif /*LV_HAVE_AVX2*/
+@@ -403,171 +426,181 @@ static inline void volk_16i_32fc_dot_prod_32fc_u_avx2( lv_32fc_t* result, const
+ #if LV_HAVE_SSE && LV_HAVE_MMX
+
+
+-static inline void volk_16i_32fc_dot_prod_32fc_a_sse( lv_32fc_t* result, const short* input, const lv_32fc_t* taps, unsigned int num_points) {
+-
+- unsigned int number = 0;
+- const unsigned int sixteenthPoints = num_points / 8;
+-
+- float res[2];
+- float *realpt = &res[0], *imagpt = &res[1];
+- const short* aPtr = input;
+- const float* bPtr = (float*)taps;
+-
+- __m64 m0, m1;
+- __m128 f0, f1, f2, f3;
+- __m128 a0Val, a1Val, a2Val, a3Val;
+- __m128 b0Val, b1Val, b2Val, b3Val;
+- __m128 c0Val, c1Val, c2Val, c3Val;
+-
+- __m128 dotProdVal0 = _mm_setzero_ps();
+- __m128 dotProdVal1 = _mm_setzero_ps();
+- __m128 dotProdVal2 = _mm_setzero_ps();
+- __m128 dotProdVal3 = _mm_setzero_ps();
+-
+- for(;number < sixteenthPoints; number++){
+-
+- m0 = _mm_set_pi16(*(aPtr+3), *(aPtr+2), *(aPtr+1), *(aPtr+0));
+- m1 = _mm_set_pi16(*(aPtr+7), *(aPtr+6), *(aPtr+5), *(aPtr+4));
+- f0 = _mm_cvtpi16_ps(m0);
+- f1 = _mm_cvtpi16_ps(m0);
+- f2 = _mm_cvtpi16_ps(m1);
+- f3 = _mm_cvtpi16_ps(m1);
+-
+- a0Val = _mm_unpacklo_ps(f0, f1);
+- a1Val = _mm_unpackhi_ps(f0, f1);
+- a2Val = _mm_unpacklo_ps(f2, f3);
+- a3Val = _mm_unpackhi_ps(f2, f3);
+-
+- b0Val = _mm_load_ps(bPtr);
+- b1Val = _mm_load_ps(bPtr+4);
+- b2Val = _mm_load_ps(bPtr+8);
+- b3Val = _mm_load_ps(bPtr+12);
+-
+- c0Val = _mm_mul_ps(a0Val, b0Val);
+- c1Val = _mm_mul_ps(a1Val, b1Val);
+- c2Val = _mm_mul_ps(a2Val, b2Val);
+- c3Val = _mm_mul_ps(a3Val, b3Val);
+-
+- dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
+- dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
+- dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
+- dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
+-
+- aPtr += 8;
+- bPtr += 16;
+- }
+-
+- dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
+- dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
+- dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
+-
+- __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
+-
+- _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
+-
+- *realpt = dotProductVector[0];
+- *imagpt = dotProductVector[1];
+- *realpt += dotProductVector[2];
+- *imagpt += dotProductVector[3];
+-
+- number = sixteenthPoints*8;
+- for(;number < num_points; number++){
+- *realpt += ((*aPtr) * (*bPtr++));
+- *imagpt += ((*aPtr++) * (*bPtr++));
+- }
+-
+- *result = *(lv_32fc_t*)(&res[0]);
++static inline void volk_16i_32fc_dot_prod_32fc_a_sse(lv_32fc_t* result,
++ const short* input,
++ const lv_32fc_t* taps,
++ unsigned int num_points)
++{
++
++ unsigned int number = 0;
++ const unsigned int sixteenthPoints = num_points / 8;
++
++ float res[2];
++ float *realpt = &res[0], *imagpt = &res[1];
++ const short* aPtr = input;
++ const float* bPtr = (float*)taps;
++
++ __m64 m0, m1;
++ __m128 f0, f1, f2, f3;
++ __m128 a0Val, a1Val, a2Val, a3Val;
++ __m128 b0Val, b1Val, b2Val, b3Val;
++ __m128 c0Val, c1Val, c2Val, c3Val;
++
++ __m128 dotProdVal0 = _mm_setzero_ps();
++ __m128 dotProdVal1 = _mm_setzero_ps();
++ __m128 dotProdVal2 = _mm_setzero_ps();
++ __m128 dotProdVal3 = _mm_setzero_ps();
++
++ for (; number < sixteenthPoints; number++) {
++
++ m0 = _mm_set_pi16(*(aPtr + 3), *(aPtr + 2), *(aPtr + 1), *(aPtr + 0));
++ m1 = _mm_set_pi16(*(aPtr + 7), *(aPtr + 6), *(aPtr + 5), *(aPtr + 4));
++ f0 = _mm_cvtpi16_ps(m0);
++ f1 = _mm_cvtpi16_ps(m0);
++ f2 = _mm_cvtpi16_ps(m1);
++ f3 = _mm_cvtpi16_ps(m1);
++
++ a0Val = _mm_unpacklo_ps(f0, f1);
++ a1Val = _mm_unpackhi_ps(f0, f1);
++ a2Val = _mm_unpacklo_ps(f2, f3);
++ a3Val = _mm_unpackhi_ps(f2, f3);
++
++ b0Val = _mm_load_ps(bPtr);
++ b1Val = _mm_load_ps(bPtr + 4);
++ b2Val = _mm_load_ps(bPtr + 8);
++ b3Val = _mm_load_ps(bPtr + 12);
++
++ c0Val = _mm_mul_ps(a0Val, b0Val);
++ c1Val = _mm_mul_ps(a1Val, b1Val);
++ c2Val = _mm_mul_ps(a2Val, b2Val);
++ c3Val = _mm_mul_ps(a3Val, b3Val);
++
++ dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
++ dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
++ dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
++ dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
++
++ aPtr += 8;
++ bPtr += 16;
++ }
++
++ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
++ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
++ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
++
++ __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
++
++ _mm_store_ps(dotProductVector,
++ dotProdVal0); // Store the results back into the dot product vector
++
++ *realpt = dotProductVector[0];
++ *imagpt = dotProductVector[1];
++ *realpt += dotProductVector[2];
++ *imagpt += dotProductVector[3];
++
++ number = sixteenthPoints * 8;
++ for (; number < num_points; number++) {
++ *realpt += ((*aPtr) * (*bPtr++));
++ *imagpt += ((*aPtr++) * (*bPtr++));
++ }
++
++ *result = *(lv_32fc_t*)(&res[0]);
+ }
+
+ #endif /*LV_HAVE_SSE && LV_HAVE_MMX*/
+
+ #ifdef LV_HAVE_AVX2
+
+-static inline void volk_16i_32fc_dot_prod_32fc_a_avx2( lv_32fc_t* result, const short* input, const lv_32fc_t* taps, unsigned int num_points) {
+-
+- unsigned int number = 0;
+- const unsigned int sixteenthPoints = num_points / 16;
+-
+- float res[2];
+- float *realpt = &res[0], *imagpt = &res[1];
+- const short* aPtr = input;
+- const float* bPtr = (float*)taps;
+-
+- __m128i m0, m1;
+- __m256i f0, f1;
+- __m256 g0, g1, h0, h1, h2, h3;
+- __m256 a0Val, a1Val, a2Val, a3Val;
+- __m256 b0Val, b1Val, b2Val, b3Val;
+- __m256 c0Val, c1Val, c2Val, c3Val;
+-
+- __m256 dotProdVal0 = _mm256_setzero_ps();
+- __m256 dotProdVal1 = _mm256_setzero_ps();
+- __m256 dotProdVal2 = _mm256_setzero_ps();
+- __m256 dotProdVal3 = _mm256_setzero_ps();
+-
+- for(;number < sixteenthPoints; number++){
+-
+- m0 = _mm_load_si128((__m128i const*) aPtr);
+- m1 = _mm_load_si128((__m128i const*)(aPtr+8));
+-
+- f0 = _mm256_cvtepi16_epi32(m0);
+- g0 = _mm256_cvtepi32_ps(f0);
+- f1 = _mm256_cvtepi16_epi32(m1);
+- g1 = _mm256_cvtepi32_ps(f1);
+-
+- h0 = _mm256_unpacklo_ps(g0, g0);
+- h1 = _mm256_unpackhi_ps(g0, g0);
+- h2 = _mm256_unpacklo_ps(g1, g1);
+- h3 = _mm256_unpackhi_ps(g1, g1);
+-
+- a0Val = _mm256_permute2f128_ps(h0, h1, 0x20);
+- a1Val = _mm256_permute2f128_ps(h0, h1, 0x31);
+- a2Val = _mm256_permute2f128_ps(h2, h3, 0x20);
+- a3Val = _mm256_permute2f128_ps(h2, h3, 0x31);
+-
+- b0Val = _mm256_load_ps(bPtr);
+- b1Val = _mm256_load_ps(bPtr+8);
+- b2Val = _mm256_load_ps(bPtr+16);
+- b3Val = _mm256_load_ps(bPtr+24);
+-
+- c0Val = _mm256_mul_ps(a0Val, b0Val);
+- c1Val = _mm256_mul_ps(a1Val, b1Val);
+- c2Val = _mm256_mul_ps(a2Val, b2Val);
+- c3Val = _mm256_mul_ps(a3Val, b3Val);
+-
+- dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
+- dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);
+- dotProdVal2 = _mm256_add_ps(c2Val, dotProdVal2);
+- dotProdVal3 = _mm256_add_ps(c3Val, dotProdVal3);
+-
+- aPtr += 16;
+- bPtr += 32;
+- }
+-
+- dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
+- dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
+- dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
+-
+- __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
+-
+- _mm256_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
+-
+- *realpt = dotProductVector[0];
+- *imagpt = dotProductVector[1];
+- *realpt += dotProductVector[2];
+- *imagpt += dotProductVector[3];
+- *realpt += dotProductVector[4];
+- *imagpt += dotProductVector[5];
+- *realpt += dotProductVector[6];
+- *imagpt += dotProductVector[7];
+-
+- number = sixteenthPoints*16;
+- for(;number < num_points; number++){
+- *realpt += ((*aPtr) * (*bPtr++));
+- *imagpt += ((*aPtr++) * (*bPtr++));
+- }
+-
+- *result = *(lv_32fc_t*)(&res[0]);
++static inline void volk_16i_32fc_dot_prod_32fc_a_avx2(lv_32fc_t* result,
++ const short* input,
++ const lv_32fc_t* taps,
++ unsigned int num_points)
++{
++
++ unsigned int number = 0;
++ const unsigned int sixteenthPoints = num_points / 16;
++
++ float res[2];
++ float *realpt = &res[0], *imagpt = &res[1];
++ const short* aPtr = input;
++ const float* bPtr = (float*)taps;
++
++ __m128i m0, m1;
++ __m256i f0, f1;
++ __m256 g0, g1, h0, h1, h2, h3;
++ __m256 a0Val, a1Val, a2Val, a3Val;
++ __m256 b0Val, b1Val, b2Val, b3Val;
++ __m256 c0Val, c1Val, c2Val, c3Val;
++
++ __m256 dotProdVal0 = _mm256_setzero_ps();
++ __m256 dotProdVal1 = _mm256_setzero_ps();
++ __m256 dotProdVal2 = _mm256_setzero_ps();
++ __m256 dotProdVal3 = _mm256_setzero_ps();
++
++ for (; number < sixteenthPoints; number++) {
++
++ m0 = _mm_load_si128((__m128i const*)aPtr);
++ m1 = _mm_load_si128((__m128i const*)(aPtr + 8));
++
++ f0 = _mm256_cvtepi16_epi32(m0);
++ g0 = _mm256_cvtepi32_ps(f0);
++ f1 = _mm256_cvtepi16_epi32(m1);
++ g1 = _mm256_cvtepi32_ps(f1);
++
++ h0 = _mm256_unpacklo_ps(g0, g0);
++ h1 = _mm256_unpackhi_ps(g0, g0);
++ h2 = _mm256_unpacklo_ps(g1, g1);
++ h3 = _mm256_unpackhi_ps(g1, g1);
++
++ a0Val = _mm256_permute2f128_ps(h0, h1, 0x20);
++ a1Val = _mm256_permute2f128_ps(h0, h1, 0x31);
++ a2Val = _mm256_permute2f128_ps(h2, h3, 0x20);
++ a3Val = _mm256_permute2f128_ps(h2, h3, 0x31);
++
++ b0Val = _mm256_load_ps(bPtr);
++ b1Val = _mm256_load_ps(bPtr + 8);
++ b2Val = _mm256_load_ps(bPtr + 16);
++ b3Val = _mm256_load_ps(bPtr + 24);
++
++ c0Val = _mm256_mul_ps(a0Val, b0Val);
++ c1Val = _mm256_mul_ps(a1Val, b1Val);
++ c2Val = _mm256_mul_ps(a2Val, b2Val);
++ c3Val = _mm256_mul_ps(a3Val, b3Val);
++
++ dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
++ dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);
++ dotProdVal2 = _mm256_add_ps(c2Val, dotProdVal2);
++ dotProdVal3 = _mm256_add_ps(c3Val, dotProdVal3);
++
++ aPtr += 16;
++ bPtr += 32;
++ }
++
++ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
++ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
++ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
++
++ __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
++
++ _mm256_store_ps(dotProductVector,
++ dotProdVal0); // Store the results back into the dot product vector
++
++ *realpt = dotProductVector[0];
++ *imagpt = dotProductVector[1];
++ *realpt += dotProductVector[2];
++ *imagpt += dotProductVector[3];
++ *realpt += dotProductVector[4];
++ *imagpt += dotProductVector[5];
++ *realpt += dotProductVector[6];
++ *imagpt += dotProductVector[7];
++
++ number = sixteenthPoints * 16;
++ for (; number < num_points; number++) {
++ *realpt += ((*aPtr) * (*bPtr++));
++ *imagpt += ((*aPtr++) * (*bPtr++));
++ }
++
++ *result = *(lv_32fc_t*)(&res[0]);
+ }
+
+
+@@ -575,85 +608,90 @@ static inline void volk_16i_32fc_dot_prod_32fc_a_avx2( lv_32fc_t* result, const
+
+ #if LV_HAVE_AVX2 && LV_HAVE_FMA
+
+-static inline void volk_16i_32fc_dot_prod_32fc_a_avx2_fma( lv_32fc_t* result, const short* input, const lv_32fc_t* taps, unsigned int num_points) {
+-
+- unsigned int number = 0;
+- const unsigned int sixteenthPoints = num_points / 16;
+-
+- float res[2];
+- float *realpt = &res[0], *imagpt = &res[1];
+- const short* aPtr = input;
+- const float* bPtr = (float*)taps;
+-
+- __m128i m0, m1;
+- __m256i f0, f1;
+- __m256 g0, g1, h0, h1, h2, h3;
+- __m256 a0Val, a1Val, a2Val, a3Val;
+- __m256 b0Val, b1Val, b2Val, b3Val;
+-
+- __m256 dotProdVal0 = _mm256_setzero_ps();
+- __m256 dotProdVal1 = _mm256_setzero_ps();
+- __m256 dotProdVal2 = _mm256_setzero_ps();
+- __m256 dotProdVal3 = _mm256_setzero_ps();
+-
+- for(;number < sixteenthPoints; number++){
+-
+- m0 = _mm_load_si128((__m128i const*) aPtr);
+- m1 = _mm_load_si128((__m128i const*)(aPtr+8));
+-
+- f0 = _mm256_cvtepi16_epi32(m0);
+- g0 = _mm256_cvtepi32_ps(f0);
+- f1 = _mm256_cvtepi16_epi32(m1);
+- g1 = _mm256_cvtepi32_ps(f1);
+-
+- h0 = _mm256_unpacklo_ps(g0, g0);
+- h1 = _mm256_unpackhi_ps(g0, g0);
+- h2 = _mm256_unpacklo_ps(g1, g1);
+- h3 = _mm256_unpackhi_ps(g1, g1);
+-
+- a0Val = _mm256_permute2f128_ps(h0, h1, 0x20);
+- a1Val = _mm256_permute2f128_ps(h0, h1, 0x31);
+- a2Val = _mm256_permute2f128_ps(h2, h3, 0x20);
+- a3Val = _mm256_permute2f128_ps(h2, h3, 0x31);
+-
+- b0Val = _mm256_load_ps(bPtr);
+- b1Val = _mm256_load_ps(bPtr+8);
+- b2Val = _mm256_load_ps(bPtr+16);
+- b3Val = _mm256_load_ps(bPtr+24);
+-
+- dotProdVal0 = _mm256_fmadd_ps(a0Val,b0Val,dotProdVal0);
+- dotProdVal1 = _mm256_fmadd_ps(a1Val,b1Val,dotProdVal1);
+- dotProdVal2 = _mm256_fmadd_ps(a2Val,b2Val,dotProdVal2);
+- dotProdVal3 = _mm256_fmadd_ps(a3Val,b3Val,dotProdVal3);
+-
+- aPtr += 16;
+- bPtr += 32;
+- }
+-
+- dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
+- dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
+- dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
+-
+- __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
+-
+- _mm256_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
+-
+- *realpt = dotProductVector[0];
+- *imagpt = dotProductVector[1];
+- *realpt += dotProductVector[2];
+- *imagpt += dotProductVector[3];
+- *realpt += dotProductVector[4];
+- *imagpt += dotProductVector[5];
+- *realpt += dotProductVector[6];
+- *imagpt += dotProductVector[7];
+-
+- number = sixteenthPoints*16;
+- for(;number < num_points; number++){
+- *realpt += ((*aPtr) * (*bPtr++));
+- *imagpt += ((*aPtr++) * (*bPtr++));
+- }
+-
+- *result = *(lv_32fc_t*)(&res[0]);
++static inline void volk_16i_32fc_dot_prod_32fc_a_avx2_fma(lv_32fc_t* result,
++ const short* input,
++ const lv_32fc_t* taps,
++ unsigned int num_points)
++{
++
++ unsigned int number = 0;
++ const unsigned int sixteenthPoints = num_points / 16;
++
++ float res[2];
++ float *realpt = &res[0], *imagpt = &res[1];
++ const short* aPtr = input;
++ const float* bPtr = (float*)taps;
++
++ __m128i m0, m1;
++ __m256i f0, f1;
++ __m256 g0, g1, h0, h1, h2, h3;
++ __m256 a0Val, a1Val, a2Val, a3Val;
++ __m256 b0Val, b1Val, b2Val, b3Val;
++
++ __m256 dotProdVal0 = _mm256_setzero_ps();
++ __m256 dotProdVal1 = _mm256_setzero_ps();
++ __m256 dotProdVal2 = _mm256_setzero_ps();
++ __m256 dotProdVal3 = _mm256_setzero_ps();
++
++ for (; number < sixteenthPoints; number++) {
++
++ m0 = _mm_load_si128((__m128i const*)aPtr);
++ m1 = _mm_load_si128((__m128i const*)(aPtr + 8));
++
++ f0 = _mm256_cvtepi16_epi32(m0);
++ g0 = _mm256_cvtepi32_ps(f0);
++ f1 = _mm256_cvtepi16_epi32(m1);
++ g1 = _mm256_cvtepi32_ps(f1);
++
++ h0 = _mm256_unpacklo_ps(g0, g0);
++ h1 = _mm256_unpackhi_ps(g0, g0);
++ h2 = _mm256_unpacklo_ps(g1, g1);
++ h3 = _mm256_unpackhi_ps(g1, g1);
++
++ a0Val = _mm256_permute2f128_ps(h0, h1, 0x20);
++ a1Val = _mm256_permute2f128_ps(h0, h1, 0x31);
++ a2Val = _mm256_permute2f128_ps(h2, h3, 0x20);
++ a3Val = _mm256_permute2f128_ps(h2, h3, 0x31);
++
++ b0Val = _mm256_load_ps(bPtr);
++ b1Val = _mm256_load_ps(bPtr + 8);
++ b2Val = _mm256_load_ps(bPtr + 16);
++ b3Val = _mm256_load_ps(bPtr + 24);
++
++ dotProdVal0 = _mm256_fmadd_ps(a0Val, b0Val, dotProdVal0);
++ dotProdVal1 = _mm256_fmadd_ps(a1Val, b1Val, dotProdVal1);
++ dotProdVal2 = _mm256_fmadd_ps(a2Val, b2Val, dotProdVal2);
++ dotProdVal3 = _mm256_fmadd_ps(a3Val, b3Val, dotProdVal3);
++
++ aPtr += 16;
++ bPtr += 32;
++ }
++
++ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
++ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
++ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
++
++ __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
++
++ _mm256_store_ps(dotProductVector,
++ dotProdVal0); // Store the results back into the dot product vector
++
++ *realpt = dotProductVector[0];
++ *imagpt = dotProductVector[1];
++ *realpt += dotProductVector[2];
++ *imagpt += dotProductVector[3];
++ *realpt += dotProductVector[4];
++ *imagpt += dotProductVector[5];
++ *realpt += dotProductVector[6];
++ *imagpt += dotProductVector[7];
++
++ number = sixteenthPoints * 16;
++ for (; number < num_points; number++) {
++ *realpt += ((*aPtr) * (*bPtr++));
++ *imagpt += ((*aPtr++) * (*bPtr++));
++ }
++
++ *result = *(lv_32fc_t*)(&res[0]);
+ }
+
+
+diff --git a/kernels/volk/volk_16i_branch_4_state_8.h b/kernels/volk/volk_16i_branch_4_state_8.h
+index 31b66cc..4d00b6b 100644
+--- a/kernels/volk/volk_16i_branch_4_state_8.h
++++ b/kernels/volk/volk_16i_branch_4_state_8.h
+@@ -29,8 +29,8 @@
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+- * void volk_16i_branch_4_state_8(short* target, short* src0, char** permuters, short* cntl2, short* cntl3, short* scalars)
+- * \endcode
++ * void volk_16i_branch_4_state_8(short* target, short* src0, char** permuters, short*
++ * cntl2, short* cntl3, short* scalars) \endcode
+ *
+ * \b Inputs
+ * \li src0: <FIXME>
+@@ -61,155 +61,154 @@
+
+ #ifdef LV_HAVE_SSSE3
+
+-#include <xmmintrin.h>
+ #include <emmintrin.h>
+ #include <tmmintrin.h>
++#include <xmmintrin.h>
+
+-static inline void
+-volk_16i_branch_4_state_8_a_ssse3(short* target, short* src0, char** permuters, short* cntl2, short* cntl3, short* scalars)
++static inline void volk_16i_branch_4_state_8_a_ssse3(short* target,
++ short* src0,
++ char** permuters,
++ short* cntl2,
++ short* cntl3,
++ short* scalars)
+ {
+- __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11;
+- __m128i *p_target, *p_src0, *p_cntl2, *p_cntl3, *p_scalars;
++ __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11;
++ __m128i *p_target, *p_src0, *p_cntl2, *p_cntl3, *p_scalars;
+
+- p_target = (__m128i*)target;
+- p_src0 = (__m128i*)src0;
+- p_cntl2 = (__m128i*)cntl2;
+- p_cntl3 = (__m128i*)cntl3;
+- p_scalars = (__m128i*)scalars;
++ p_target = (__m128i*)target;
++ p_src0 = (__m128i*)src0;
++ p_cntl2 = (__m128i*)cntl2;
++ p_cntl3 = (__m128i*)cntl3;
++ p_scalars = (__m128i*)scalars;
+
+- xmm0 = _mm_load_si128(p_scalars);
++ xmm0 = _mm_load_si128(p_scalars);
+
+- xmm1 = _mm_shufflelo_epi16(xmm0, 0);
+- xmm2 = _mm_shufflelo_epi16(xmm0, 0x55);
+- xmm3 = _mm_shufflelo_epi16(xmm0, 0xaa);
+- xmm4 = _mm_shufflelo_epi16(xmm0, 0xff);
++ xmm1 = _mm_shufflelo_epi16(xmm0, 0);
++ xmm2 = _mm_shufflelo_epi16(xmm0, 0x55);
++ xmm3 = _mm_shufflelo_epi16(xmm0, 0xaa);
++ xmm4 = _mm_shufflelo_epi16(xmm0, 0xff);
+
+- xmm1 = _mm_shuffle_epi32(xmm1, 0x00);
+- xmm2 = _mm_shuffle_epi32(xmm2, 0x00);
+- xmm3 = _mm_shuffle_epi32(xmm3, 0x00);
+- xmm4 = _mm_shuffle_epi32(xmm4, 0x00);
++ xmm1 = _mm_shuffle_epi32(xmm1, 0x00);
++ xmm2 = _mm_shuffle_epi32(xmm2, 0x00);
++ xmm3 = _mm_shuffle_epi32(xmm3, 0x00);
++ xmm4 = _mm_shuffle_epi32(xmm4, 0x00);
+
+- xmm0 = _mm_load_si128((__m128i*)permuters[0]);
+- xmm6 = _mm_load_si128((__m128i*)permuters[1]);
+- xmm8 = _mm_load_si128((__m128i*)permuters[2]);
+- xmm10 = _mm_load_si128((__m128i*)permuters[3]);
++ xmm0 = _mm_load_si128((__m128i*)permuters[0]);
++ xmm6 = _mm_load_si128((__m128i*)permuters[1]);
++ xmm8 = _mm_load_si128((__m128i*)permuters[2]);
++ xmm10 = _mm_load_si128((__m128i*)permuters[3]);
+
+- xmm5 = _mm_load_si128(p_src0);
+- xmm0 = _mm_shuffle_epi8(xmm5, xmm0);
+- xmm6 = _mm_shuffle_epi8(xmm5, xmm6);
+- xmm8 = _mm_shuffle_epi8(xmm5, xmm8);
+- xmm10 = _mm_shuffle_epi8(xmm5, xmm10);
++ xmm5 = _mm_load_si128(p_src0);
++ xmm0 = _mm_shuffle_epi8(xmm5, xmm0);
++ xmm6 = _mm_shuffle_epi8(xmm5, xmm6);
++ xmm8 = _mm_shuffle_epi8(xmm5, xmm8);
++ xmm10 = _mm_shuffle_epi8(xmm5, xmm10);
+
+- xmm5 = _mm_add_epi16(xmm1, xmm2);
++ xmm5 = _mm_add_epi16(xmm1, xmm2);
+
+- xmm6 = _mm_add_epi16(xmm2, xmm6);
+- xmm8 = _mm_add_epi16(xmm1, xmm8);
++ xmm6 = _mm_add_epi16(xmm2, xmm6);
++ xmm8 = _mm_add_epi16(xmm1, xmm8);
+
+- xmm7 = _mm_load_si128(p_cntl2);
+- xmm9 = _mm_load_si128(p_cntl3);
++ xmm7 = _mm_load_si128(p_cntl2);
++ xmm9 = _mm_load_si128(p_cntl3);
+
+- xmm0 = _mm_add_epi16(xmm5, xmm0);
++ xmm0 = _mm_add_epi16(xmm5, xmm0);
+
+- xmm7 = _mm_and_si128(xmm7, xmm3);
+- xmm9 = _mm_and_si128(xmm9, xmm4);
++ xmm7 = _mm_and_si128(xmm7, xmm3);
++ xmm9 = _mm_and_si128(xmm9, xmm4);
+
+- xmm5 = _mm_load_si128(&p_cntl2[1]);
+- xmm11 = _mm_load_si128(&p_cntl3[1]);
++ xmm5 = _mm_load_si128(&p_cntl2[1]);
++ xmm11 = _mm_load_si128(&p_cntl3[1]);
+
+- xmm7 = _mm_add_epi16(xmm7, xmm9);
++ xmm7 = _mm_add_epi16(xmm7, xmm9);
+
+- xmm5 = _mm_and_si128(xmm5, xmm3);
+- xmm11 = _mm_and_si128(xmm11, xmm4);
++ xmm5 = _mm_and_si128(xmm5, xmm3);
++ xmm11 = _mm_and_si128(xmm11, xmm4);
+
+- xmm0 = _mm_add_epi16(xmm0, xmm7);
++ xmm0 = _mm_add_epi16(xmm0, xmm7);
+
+
+- xmm7 = _mm_load_si128(&p_cntl2[2]);
+- xmm9 = _mm_load_si128(&p_cntl3[2]);
++ xmm7 = _mm_load_si128(&p_cntl2[2]);
++ xmm9 = _mm_load_si128(&p_cntl3[2]);
+
+- xmm5 = _mm_add_epi16(xmm5, xmm11);
++ xmm5 = _mm_add_epi16(xmm5, xmm11);
+
+- xmm7 = _mm_and_si128(xmm7, xmm3);
+- xmm9 = _mm_and_si128(xmm9, xmm4);
++ xmm7 = _mm_and_si128(xmm7, xmm3);
++ xmm9 = _mm_and_si128(xmm9, xmm4);
+
+- xmm6 = _mm_add_epi16(xmm6, xmm5);
++ xmm6 = _mm_add_epi16(xmm6, xmm5);
+
+
+- xmm5 = _mm_load_si128(&p_cntl2[3]);
+- xmm11 = _mm_load_si128(&p_cntl3[3]);
++ xmm5 = _mm_load_si128(&p_cntl2[3]);
++ xmm11 = _mm_load_si128(&p_cntl3[3]);
+
+- xmm7 = _mm_add_epi16(xmm7, xmm9);
++ xmm7 = _mm_add_epi16(xmm7, xmm9);
+
+- xmm5 = _mm_and_si128(xmm5, xmm3);
+- xmm11 = _mm_and_si128(xmm11, xmm4);
++ xmm5 = _mm_and_si128(xmm5, xmm3);
++ xmm11 = _mm_and_si128(xmm11, xmm4);
+
+- xmm8 = _mm_add_epi16(xmm8, xmm7);
++ xmm8 = _mm_add_epi16(xmm8, xmm7);
+
+- xmm5 = _mm_add_epi16(xmm5, xmm11);
++ xmm5 = _mm_add_epi16(xmm5, xmm11);
+
+- _mm_store_si128(p_target, xmm0);
+- _mm_store_si128(&p_target[1], xmm6);
++ _mm_store_si128(p_target, xmm0);
++ _mm_store_si128(&p_target[1], xmm6);
+
+- xmm10 = _mm_add_epi16(xmm5, xmm10);
++ xmm10 = _mm_add_epi16(xmm5, xmm10);
+
+- _mm_store_si128(&p_target[2], xmm8);
++ _mm_store_si128(&p_target[2], xmm8);
+
+- _mm_store_si128(&p_target[3], xmm10);
++ _mm_store_si128(&p_target[3], xmm10);
+ }
+
+
+ #endif /*LV_HAVE_SSSE3*/
+
+ #ifdef LV_HAVE_GENERIC
+-static inline void
+-volk_16i_branch_4_state_8_generic(short* target, short* src0, char** permuters, short* cntl2, short* cntl3, short* scalars)
++static inline void volk_16i_branch_4_state_8_generic(short* target,
++ short* src0,
++ char** permuters,
++ short* cntl2,
++ short* cntl3,
++ short* scalars)
+ {
+- int i = 0;
+-
+- int bound = 4;
+-
+- for(; i < bound; ++i) {
+- target[i* 8] = src0[((char)permuters[i][0])/2]
+- + ((i + 1)%2 * scalars[0])
+- + (((i >> 1)^1) * scalars[1])
+- + (cntl2[i * 8] & scalars[2])
+- + (cntl3[i * 8] & scalars[3]);
+- target[i* 8 + 1] = src0[((char)permuters[i][1 * 2])/2]
+- + ((i + 1)%2 * scalars[0])
+- + (((i >> 1)^1) * scalars[1])
+- + (cntl2[i * 8 + 1] & scalars[2])
+- + (cntl3[i * 8 + 1] & scalars[3]);
+- target[i* 8 + 2] = src0[((char)permuters[i][2 * 2])/2]
+- + ((i + 1)%2 * scalars[0])
+- + (((i >> 1)^1) * scalars[1])
+- + (cntl2[i * 8 + 2] & scalars[2])
+- + (cntl3[i * 8 + 2] & scalars[3]);
+- target[i* 8 + 3] = src0[((char)permuters[i][3 * 2])/2]
+- + ((i + 1)%2 * scalars[0])
+- + (((i >> 1)^1) * scalars[1])
+- + (cntl2[i * 8 + 3] & scalars[2])
+- + (cntl3[i * 8 + 3] & scalars[3]);
+- target[i* 8 + 4] = src0[((char)permuters[i][4 * 2])/2]
+- + ((i + 1)%2 * scalars[0])
+- + (((i >> 1)^1) * scalars[1])
+- + (cntl2[i * 8 + 4] & scalars[2])
+- + (cntl3[i * 8 + 4] & scalars[3]);
+- target[i* 8 + 5] = src0[((char)permuters[i][5 * 2])/2]
+- + ((i + 1)%2 * scalars[0])
+- + (((i >> 1)^1) * scalars[1])
+- + (cntl2[i * 8 + 5] & scalars[2])
+- + (cntl3[i * 8 + 5] & scalars[3]);
+- target[i* 8 + 6] = src0[((char)permuters[i][6 * 2])/2]
+- + ((i + 1)%2 * scalars[0])
+- + (((i >> 1)^1) * scalars[1])
+- + (cntl2[i * 8 + 6] & scalars[2])
+- + (cntl3[i * 8 + 6] & scalars[3]);
+- target[i* 8 + 7] = src0[((char)permuters[i][7 * 2])/2]
+- + ((i + 1)%2 * scalars[0])
+- + (((i >> 1)^1) * scalars[1])
+- + (cntl2[i * 8 + 7] & scalars[2])
+- + (cntl3[i * 8 + 7] & scalars[3]);
+- }
++ int i = 0;
++
++ int bound = 4;
++
++ for (; i < bound; ++i) {
++ target[i * 8] = src0[((char)permuters[i][0]) / 2] + ((i + 1) % 2 * scalars[0]) +
++ (((i >> 1) ^ 1) * scalars[1]) + (cntl2[i * 8] & scalars[2]) +
++ (cntl3[i * 8] & scalars[3]);
++ target[i * 8 + 1] = src0[((char)permuters[i][1 * 2]) / 2] +
++ ((i + 1) % 2 * scalars[0]) + (((i >> 1) ^ 1) * scalars[1]) +
++ (cntl2[i * 8 + 1] & scalars[2]) +
++ (cntl3[i * 8 + 1] & scalars[3]);
++ target[i * 8 + 2] = src0[((char)permuters[i][2 * 2]) / 2] +
++ ((i + 1) % 2 * scalars[0]) + (((i >> 1) ^ 1) * scalars[1]) +
++ (cntl2[i * 8 + 2] & scalars[2]) +
++ (cntl3[i * 8 + 2] & scalars[3]);
++ target[i * 8 + 3] = src0[((char)permuters[i][3 * 2]) / 2] +
++ ((i + 1) % 2 * scalars[0]) + (((i >> 1) ^ 1) * scalars[1]) +
++ (cntl2[i * 8 + 3] & scalars[2]) +
++ (cntl3[i * 8 + 3] & scalars[3]);
++ target[i * 8 + 4] = src0[((char)permuters[i][4 * 2]) / 2] +
++ ((i + 1) % 2 * scalars[0]) + (((i >> 1) ^ 1) * scalars[1]) +
++ (cntl2[i * 8 + 4] & scalars[2]) +
++ (cntl3[i * 8 + 4] & scalars[3]);
++ target[i * 8 + 5] = src0[((char)permuters[i][5 * 2]) / 2] +
++ ((i + 1) % 2 * scalars[0]) + (((i >> 1) ^ 1) * scalars[1]) +
++ (cntl2[i * 8 + 5] & scalars[2]) +
++ (cntl3[i * 8 + 5] & scalars[3]);
++ target[i * 8 + 6] = src0[((char)permuters[i][6 * 2]) / 2] +
++ ((i + 1) % 2 * scalars[0]) + (((i >> 1) ^ 1) * scalars[1]) +
++ (cntl2[i * 8 + 6] & scalars[2]) +
++ (cntl3[i * 8 + 6] & scalars[3]);
++ target[i * 8 + 7] = src0[((char)permuters[i][7 * 2]) / 2] +
++ ((i + 1) % 2 * scalars[0]) + (((i >> 1) ^ 1) * scalars[1]) +
++ (cntl2[i * 8 + 7] & scalars[2]) +
++ (cntl3[i * 8 + 7] & scalars[3]);
++ }
+ }
+
+ #endif /*LV_HAVE_GENERIC*/
+diff --git a/kernels/volk/volk_16i_convert_8i.h b/kernels/volk/volk_16i_convert_8i.h
+index e2f953b..f09515d 100644
+--- a/kernels/volk/volk_16i_convert_8i.h
++++ b/kernels/volk/volk_16i_convert_8i.h
+@@ -29,8 +29,8 @@
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+- * void volk_16i_convert_8i(int8_t* outputVector, const int16_t* inputVector, unsigned int num_points)
+- * \endcode
++ * void volk_16i_convert_8i(int8_t* outputVector, const int16_t* inputVector, unsigned int
++ * num_points) \endcode
+ *
+ * \b Inputs
+ * \li inputVector: The input vector of 16-bit shorts.
+@@ -59,39 +59,42 @@
+ #ifdef LV_HAVE_AVX2
+ #include <immintrin.h>
+
+-static inline void
+-volk_16i_convert_8i_u_avx2(int8_t* outputVector, const int16_t* inputVector, unsigned int num_points)
++static inline void volk_16i_convert_8i_u_avx2(int8_t* outputVector,
++ const int16_t* inputVector,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+- const unsigned int thirtysecondPoints = num_points / 32;
++ unsigned int number = 0;
++ const unsigned int thirtysecondPoints = num_points / 32;
+
+- int8_t* outputVectorPtr = outputVector;
+- int16_t* inputPtr = (int16_t*)inputVector;
+- __m256i inputVal1;
+- __m256i inputVal2;
+- __m256i ret;
++ int8_t* outputVectorPtr = outputVector;
++ int16_t* inputPtr = (int16_t*)inputVector;
++ __m256i inputVal1;
++ __m256i inputVal2;
++ __m256i ret;
+
+- for(;number < thirtysecondPoints; number++){
++ for (; number < thirtysecondPoints; number++) {
+
+- // Load the 16 values
+- inputVal1 = _mm256_loadu_si256((__m256i*)inputPtr); inputPtr += 16;
+- inputVal2 = _mm256_loadu_si256((__m256i*)inputPtr); inputPtr += 16;
++ // Load the 16 values
++ inputVal1 = _mm256_loadu_si256((__m256i*)inputPtr);
++ inputPtr += 16;
++ inputVal2 = _mm256_loadu_si256((__m256i*)inputPtr);
++ inputPtr += 16;
+
+- inputVal1 = _mm256_srai_epi16(inputVal1, 8);
+- inputVal2 = _mm256_srai_epi16(inputVal2, 8);
++ inputVal1 = _mm256_srai_epi16(inputVal1, 8);
++ inputVal2 = _mm256_srai_epi16(inputVal2, 8);
+
+- ret = _mm256_packs_epi16(inputVal1, inputVal2);
+- ret = _mm256_permute4x64_epi64(ret, 0b11011000);
++ ret = _mm256_packs_epi16(inputVal1, inputVal2);
++ ret = _mm256_permute4x64_epi64(ret, 0b11011000);
+
+- _mm256_storeu_si256((__m256i*)outputVectorPtr, ret);
++ _mm256_storeu_si256((__m256i*)outputVectorPtr, ret);
+
+- outputVectorPtr += 32;
+- }
++ outputVectorPtr += 32;
++ }
+
+- number = thirtysecondPoints * 32;
+- for(; number < num_points; number++){
+- outputVector[number] =(int8_t)(inputVector[number] >> 8);
+- }
++ number = thirtysecondPoints * 32;
++ for (; number < num_points; number++) {
++ outputVector[number] = (int8_t)(inputVector[number] >> 8);
++ }
+ }
+ #endif /* LV_HAVE_AVX2 */
+
+@@ -99,60 +102,62 @@ volk_16i_convert_8i_u_avx2(int8_t* outputVector, const int16_t* inputVector, uns
+ #ifdef LV_HAVE_SSE2
+ #include <emmintrin.h>
+
+-static inline void
+-volk_16i_convert_8i_u_sse2(int8_t* outputVector, const int16_t* inputVector, unsigned int num_points)
++static inline void volk_16i_convert_8i_u_sse2(int8_t* outputVector,
++ const int16_t* inputVector,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+- const unsigned int sixteenthPoints = num_points / 16;
++ unsigned int number = 0;
++ const unsigned int sixteenthPoints = num_points / 16;
+
+- int8_t* outputVectorPtr = outputVector;
+- int16_t* inputPtr = (int16_t*)inputVector;
+- __m128i inputVal1;
+- __m128i inputVal2;
+- __m128i ret;
++ int8_t* outputVectorPtr = outputVector;
++ int16_t* inputPtr = (int16_t*)inputVector;
++ __m128i inputVal1;
++ __m128i inputVal2;
++ __m128i ret;
+
+- for(;number < sixteenthPoints; number++){
++ for (; number < sixteenthPoints; number++) {
+
+- // Load the 16 values
+- inputVal1 = _mm_loadu_si128((__m128i*)inputPtr); inputPtr += 8;
+- inputVal2 = _mm_loadu_si128((__m128i*)inputPtr); inputPtr += 8;
++ // Load the 16 values
++ inputVal1 = _mm_loadu_si128((__m128i*)inputPtr);
++ inputPtr += 8;
++ inputVal2 = _mm_loadu_si128((__m128i*)inputPtr);
++ inputPtr += 8;
+
+- inputVal1 = _mm_srai_epi16(inputVal1, 8);
+- inputVal2 = _mm_srai_epi16(inputVal2, 8);
++ inputVal1 = _mm_srai_epi16(inputVal1, 8);
++ inputVal2 = _mm_srai_epi16(inputVal2, 8);
+
+- ret = _mm_packs_epi16(inputVal1, inputVal2);
++ ret = _mm_packs_epi16(inputVal1, inputVal2);
+
+- _mm_storeu_si128((__m128i*)outputVectorPtr, ret);
++ _mm_storeu_si128((__m128i*)outputVectorPtr, ret);
+
+- outputVectorPtr += 16;
+- }
++ outputVectorPtr += 16;
++ }
+
+- number = sixteenthPoints * 16;
+- for(; number < num_points; number++){
+- outputVector[number] =(int8_t)(inputVector[number] >> 8);
+- }
++ number = sixteenthPoints * 16;
++ for (; number < num_points; number++) {
++ outputVector[number] = (int8_t)(inputVector[number] >> 8);
++ }
+ }
+ #endif /* LV_HAVE_SSE2 */
+
+
+ #ifdef LV_HAVE_GENERIC
+
+-static inline void
+-volk_16i_convert_8i_generic(int8_t* outputVector, const int16_t* inputVector, unsigned int num_points)
++static inline void volk_16i_convert_8i_generic(int8_t* outputVector,
++ const int16_t* inputVector,
++ unsigned int num_points)
+ {
+- int8_t* outputVectorPtr = outputVector;
+- const int16_t* inputVectorPtr = inputVector;
+- unsigned int number = 0;
++ int8_t* outputVectorPtr = outputVector;
++ const int16_t* inputVectorPtr = inputVector;
++ unsigned int number = 0;
+
+- for(number = 0; number < num_points; number++){
+- *outputVectorPtr++ = ((int8_t)(*inputVectorPtr++ >> 8));
+- }
++ for (number = 0; number < num_points; number++) {
++ *outputVectorPtr++ = ((int8_t)(*inputVectorPtr++ >> 8));
++ }
+ }
+ #endif /* LV_HAVE_GENERIC */
+
+
+-
+-
+ #endif /* INCLUDED_volk_16i_convert_8i_u_H */
+ #ifndef INCLUDED_volk_16i_convert_8i_a_H
+ #define INCLUDED_volk_16i_convert_8i_a_H
+@@ -163,39 +168,42 @@ volk_16i_convert_8i_generic(int8_t* outputVector, const int16_t* inputVector, un
+ #ifdef LV_HAVE_AVX2
+ #include <immintrin.h>
+
+-static inline void
+-volk_16i_convert_8i_a_avx2(int8_t* outputVector, const int16_t* inputVector, unsigned int num_points)
++static inline void volk_16i_convert_8i_a_avx2(int8_t* outputVector,
++ const int16_t* inputVector,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+- const unsigned int thirtysecondPoints = num_points / 32;
++ unsigned int number = 0;
++ const unsigned int thirtysecondPoints = num_points / 32;
+
+- int8_t* outputVectorPtr = outputVector;
+- int16_t* inputPtr = (int16_t*)inputVector;
+- __m256i inputVal1;
+- __m256i inputVal2;
+- __m256i ret;
++ int8_t* outputVectorPtr = outputVector;
++ int16_t* inputPtr = (int16_t*)inputVector;
++ __m256i inputVal1;
++ __m256i inputVal2;
++ __m256i ret;
+
+- for(;number < thirtysecondPoints; number++){
++ for (; number < thirtysecondPoints; number++) {
+
+- // Load the 16 values
+- inputVal1 = _mm256_load_si256((__m256i*)inputPtr); inputPtr += 16;
+- inputVal2 = _mm256_load_si256((__m256i*)inputPtr); inputPtr += 16;
++ // Load the 16 values
++ inputVal1 = _mm256_load_si256((__m256i*)inputPtr);
++ inputPtr += 16;
++ inputVal2 = _mm256_load_si256((__m256i*)inputPtr);
++ inputPtr += 16;
+
+- inputVal1 = _mm256_srai_epi16(inputVal1, 8);
+- inputVal2 = _mm256_srai_epi16(inputVal2, 8);
++ inputVal1 = _mm256_srai_epi16(inputVal1, 8);
++ inputVal2 = _mm256_srai_epi16(inputVal2, 8);
+
+- ret = _mm256_packs_epi16(inputVal1, inputVal2);
+- ret = _mm256_permute4x64_epi64(ret, 0b11011000);
++ ret = _mm256_packs_epi16(inputVal1, inputVal2);
++ ret = _mm256_permute4x64_epi64(ret, 0b11011000);
+
+- _mm256_store_si256((__m256i*)outputVectorPtr, ret);
++ _mm256_store_si256((__m256i*)outputVectorPtr, ret);
+
+- outputVectorPtr += 32;
+- }
++ outputVectorPtr += 32;
++ }
+
+- number = thirtysecondPoints * 32;
+- for(; number < num_points; number++){
+- outputVector[number] =(int8_t)(inputVector[number] >> 8);
+- }
++ number = thirtysecondPoints * 32;
++ for (; number < num_points; number++) {
++ outputVector[number] = (int8_t)(inputVector[number] >> 8);
++ }
+ }
+ #endif /* LV_HAVE_AVX2 */
+
+@@ -203,38 +211,41 @@ volk_16i_convert_8i_a_avx2(int8_t* outputVector, const int16_t* inputVector, uns
+ #ifdef LV_HAVE_SSE2
+ #include <emmintrin.h>
+
+-static inline void
+-volk_16i_convert_8i_a_sse2(int8_t* outputVector, const int16_t* inputVector, unsigned int num_points)
++static inline void volk_16i_convert_8i_a_sse2(int8_t* outputVector,
++ const int16_t* inputVector,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+- const unsigned int sixteenthPoints = num_points / 16;
++ unsigned int number = 0;
++ const unsigned int sixteenthPoints = num_points / 16;
+
+- int8_t* outputVectorPtr = outputVector;
+- int16_t* inputPtr = (int16_t*)inputVector;
+- __m128i inputVal1;
+- __m128i inputVal2;
+- __m128i ret;
++ int8_t* outputVectorPtr = outputVector;
++ int16_t* inputPtr = (int16_t*)inputVector;
++ __m128i inputVal1;
++ __m128i inputVal2;
++ __m128i ret;
+
+- for(;number < sixteenthPoints; number++){
++ for (; number < sixteenthPoints; number++) {
+
+- // Load the 16 values
+- inputVal1 = _mm_load_si128((__m128i*)inputPtr); inputPtr += 8;
+- inputVal2 = _mm_load_si128((__m128i*)inputPtr); inputPtr += 8;
++ // Load the 16 values
++ inputVal1 = _mm_load_si128((__m128i*)inputPtr);
++ inputPtr += 8;
++ inputVal2 = _mm_load_si128((__m128i*)inputPtr);
++ inputPtr += 8;
+
+- inputVal1 = _mm_srai_epi16(inputVal1, 8);
+- inputVal2 = _mm_srai_epi16(inputVal2, 8);
++ inputVal1 = _mm_srai_epi16(inputVal1, 8);
++ inputVal2 = _mm_srai_epi16(inputVal2, 8);
+
+- ret = _mm_packs_epi16(inputVal1, inputVal2);
++ ret = _mm_packs_epi16(inputVal1, inputVal2);
+
+- _mm_store_si128((__m128i*)outputVectorPtr, ret);
++ _mm_store_si128((__m128i*)outputVectorPtr, ret);
+
+- outputVectorPtr += 16;
+- }
++ outputVectorPtr += 16;
++ }
+
+- number = sixteenthPoints * 16;
+- for(; number < num_points; number++){
+- outputVector[number] =(int8_t)(inputVector[number] >> 8);
+- }
++ number = sixteenthPoints * 16;
++ for (; number < num_points; number++) {
++ outputVector[number] = (int8_t)(inputVector[number] >> 8);
++ }
+ }
+ #endif /* LV_HAVE_SSE2 */
+
+@@ -242,53 +253,55 @@ volk_16i_convert_8i_a_sse2(int8_t* outputVector, const int16_t* inputVector, uns
+ #ifdef LV_HAVE_NEON
+ #include <arm_neon.h>
+
+-static inline void
+-volk_16i_convert_8i_neon(int8_t* outputVector, const int16_t* inputVector, unsigned int num_points)
++static inline void volk_16i_convert_8i_neon(int8_t* outputVector,
++ const int16_t* inputVector,
++ unsigned int num_points)
+ {
+- int8_t* outputVectorPtr = outputVector;
+- const int16_t* inputVectorPtr = inputVector;
+- unsigned int number = 0;
+- unsigned int sixteenth_points = num_points / 16;
+-
+- int16x8_t inputVal0;
+- int16x8_t inputVal1;
+- int8x8_t outputVal0;
+- int8x8_t outputVal1;
+- int8x16_t outputVal;
+-
+- for(number = 0; number < sixteenth_points; number++){
+- // load two input vectors
+- inputVal0 = vld1q_s16(inputVectorPtr);
+- inputVal1 = vld1q_s16(inputVectorPtr+8);
+- // shift right
+- outputVal0 = vshrn_n_s16(inputVal0, 8);
+- outputVal1 = vshrn_n_s16(inputVal1, 8);
+- // squash two vectors and write output
+- outputVal = vcombine_s8(outputVal0, outputVal1);
+- vst1q_s8(outputVectorPtr, outputVal);
+- inputVectorPtr += 16;
+- outputVectorPtr += 16;
+- }
+-
+- for(number = sixteenth_points * 16; number < num_points; number++){
+- *outputVectorPtr++ = ((int8_t)(*inputVectorPtr++ >> 8));
+- }
++ int8_t* outputVectorPtr = outputVector;
++ const int16_t* inputVectorPtr = inputVector;
++ unsigned int number = 0;
++ unsigned int sixteenth_points = num_points / 16;
++
++ int16x8_t inputVal0;
++ int16x8_t inputVal1;
++ int8x8_t outputVal0;
++ int8x8_t outputVal1;
++ int8x16_t outputVal;
++
++ for (number = 0; number < sixteenth_points; number++) {
++ // load two input vectors
++ inputVal0 = vld1q_s16(inputVectorPtr);
++ inputVal1 = vld1q_s16(inputVectorPtr + 8);
++ // shift right
++ outputVal0 = vshrn_n_s16(inputVal0, 8);
++ outputVal1 = vshrn_n_s16(inputVal1, 8);
++ // squash two vectors and write output
++ outputVal = vcombine_s8(outputVal0, outputVal1);
++ vst1q_s8(outputVectorPtr, outputVal);
++ inputVectorPtr += 16;
++ outputVectorPtr += 16;
++ }
++
++ for (number = sixteenth_points * 16; number < num_points; number++) {
++ *outputVectorPtr++ = ((int8_t)(*inputVectorPtr++ >> 8));
++ }
+ }
+ #endif /* LV_HAVE_NEON */
+
+
+ #ifdef LV_HAVE_GENERIC
+
+-static inline void
+-volk_16i_convert_8i_a_generic(int8_t* outputVector, const int16_t* inputVector, unsigned int num_points)
++static inline void volk_16i_convert_8i_a_generic(int8_t* outputVector,
++ const int16_t* inputVector,
++ unsigned int num_points)
+ {
+- int8_t* outputVectorPtr = outputVector;
+- const int16_t* inputVectorPtr = inputVector;
+- unsigned int number = 0;
++ int8_t* outputVectorPtr = outputVector;
++ const int16_t* inputVectorPtr = inputVector;
++ unsigned int number = 0;
+
+- for(number = 0; number < num_points; number++){
+- *outputVectorPtr++ = ((int8_t)(*inputVectorPtr++ >> 8));
+- }
++ for (number = 0; number < num_points; number++) {
++ *outputVectorPtr++ = ((int8_t)(*inputVectorPtr++ >> 8));
++ }
+ }
+ #endif /* LV_HAVE_GENERIC */
+
+diff --git a/kernels/volk/volk_16i_max_star_16i.h b/kernels/volk/volk_16i_max_star_16i.h
+index 78fd911..d5dad18 100644
+--- a/kernels/volk/volk_16i_max_star_16i.h
++++ b/kernels/volk/volk_16i_max_star_16i.h
+@@ -53,67 +53,69 @@
+ #ifndef INCLUDED_volk_16i_max_star_16i_a_H
+ #define INCLUDED_volk_16i_max_star_16i_a_H
+
+-#include<inttypes.h>
+-#include<stdio.h>
++#include <inttypes.h>
++#include <stdio.h>
+
+ #ifdef LV_HAVE_SSSE3
+
+-#include<xmmintrin.h>
+-#include<emmintrin.h>
+-#include<tmmintrin.h>
++#include <emmintrin.h>
++#include <tmmintrin.h>
++#include <xmmintrin.h>
+
+ static inline void
+ volk_16i_max_star_16i_a_ssse3(short* target, short* src0, unsigned int num_points)
+ {
+- const unsigned int num_bytes = num_points*2;
++ const unsigned int num_bytes = num_points * 2;
+
+- short candidate = src0[0];
+- short cands[8];
+- __m128i xmm0, xmm1, xmm3, xmm4, xmm5, xmm6;
++ short candidate = src0[0];
++ short cands[8];
++ __m128i xmm0, xmm1, xmm3, xmm4, xmm5, xmm6;
+
+- __m128i *p_src0;
++ __m128i* p_src0;
+
+- p_src0 = (__m128i*)src0;
++ p_src0 = (__m128i*)src0;
+
+- int bound = num_bytes >> 4;
+- int leftovers = (num_bytes >> 1) & 7;
++ int bound = num_bytes >> 4;
++ int leftovers = (num_bytes >> 1) & 7;
+
+- int i = 0;
++ int i = 0;
+
+- xmm1 = _mm_setzero_si128();
+- xmm0 = _mm_setzero_si128();
+- //_mm_insert_epi16(xmm0, candidate, 0);
++ xmm1 = _mm_setzero_si128();
++ xmm0 = _mm_setzero_si128();
++ //_mm_insert_epi16(xmm0, candidate, 0);
+
+- xmm0 = _mm_shuffle_epi8(xmm0, xmm1);
++ xmm0 = _mm_shuffle_epi8(xmm0, xmm1);
+
+- for(i = 0; i < bound; ++i) {
+- xmm1 = _mm_load_si128(p_src0);
+- p_src0 += 1;
+- //xmm2 = _mm_sub_epi16(xmm1, xmm0);
++ for (i = 0; i < bound; ++i) {
++ xmm1 = _mm_load_si128(p_src0);
++ p_src0 += 1;
++ // xmm2 = _mm_sub_epi16(xmm1, xmm0);
+
+- xmm3 = _mm_cmpgt_epi16(xmm0, xmm1);
+- xmm4 = _mm_cmpeq_epi16(xmm0, xmm1);
+- xmm5 = _mm_cmpgt_epi16(xmm1, xmm0);
++ xmm3 = _mm_cmpgt_epi16(xmm0, xmm1);
++ xmm4 = _mm_cmpeq_epi16(xmm0, xmm1);
++ xmm5 = _mm_cmpgt_epi16(xmm1, xmm0);
+
+- xmm6 = _mm_xor_si128(xmm4, xmm5);
++ xmm6 = _mm_xor_si128(xmm4, xmm5);
+
+- xmm3 = _mm_and_si128(xmm3, xmm0);
+- xmm4 = _mm_and_si128(xmm6, xmm1);
++ xmm3 = _mm_and_si128(xmm3, xmm0);
++ xmm4 = _mm_and_si128(xmm6, xmm1);
+
+- xmm0 = _mm_add_epi16(xmm3, xmm4);
+- }
++ xmm0 = _mm_add_epi16(xmm3, xmm4);
++ }
+
+- _mm_store_si128((__m128i*)cands, xmm0);
++ _mm_store_si128((__m128i*)cands, xmm0);
+
+- for(i = 0; i < 8; ++i) {
+- candidate = ((short)(candidate - cands[i]) > 0) ? candidate : cands[i];
+- }
++ for (i = 0; i < 8; ++i) {
++ candidate = ((short)(candidate - cands[i]) > 0) ? candidate : cands[i];
++ }
+
+- for(i = 0; i < leftovers; ++i) {
+- candidate = ((short)(candidate - src0[(bound << 3) + i]) > 0) ? candidate : src0[(bound << 3) + i];
+- }
++ for (i = 0; i < leftovers; ++i) {
++ candidate = ((short)(candidate - src0[(bound << 3) + i]) > 0)
++ ? candidate
++ : src0[(bound << 3) + i];
++ }
+
+- target[0] = candidate;
++ target[0] = candidate;
+ }
+
+ #endif /*LV_HAVE_SSSE3*/
+@@ -124,38 +126,38 @@ volk_16i_max_star_16i_a_ssse3(short* target, short* src0, unsigned int num_point
+ static inline void
+ volk_16i_max_star_16i_neon(short* target, short* src0, unsigned int num_points)
+ {
+- const unsigned int eighth_points = num_points / 8;
+- unsigned number;
+- int16x8_t input_vec;
+- int16x8_t diff, zeros;
+- uint16x8_t comp1, comp2;
+- zeros = vdupq_n_s16(0);
+-
+- int16x8x2_t tmpvec;
+-
+- int16x8_t candidate_vec = vld1q_dup_s16(src0 );
+- short candidate;
+- ++src0;
+-
+- for(number=0; number < eighth_points; ++number) {
+- input_vec = vld1q_s16(src0);
+- __VOLK_PREFETCH(src0+16);
+- diff = vsubq_s16(candidate_vec, input_vec);
+- comp1 = vcgeq_s16(diff, zeros);
+- comp2 = vcltq_s16(diff, zeros);
+-
+- tmpvec.val[0] = vandq_s16(candidate_vec, (int16x8_t)comp1);
+- tmpvec.val[1] = vandq_s16(input_vec, (int16x8_t)comp2);
+-
+- candidate_vec = vaddq_s16(tmpvec.val[0], tmpvec.val[1]);
+- src0 += 8;
+- }
+- vst1q_s16(&candidate, candidate_vec);
+-
+- for(number=0; number < num_points%8; number++) {
+- candidate = ((int16_t)(candidate - src0[number]) > 0) ? candidate : src0[number];
+- }
+- target[0] = candidate;
++ const unsigned int eighth_points = num_points / 8;
++ unsigned number;
++ int16x8_t input_vec;
++ int16x8_t diff, zeros;
++ uint16x8_t comp1, comp2;
++ zeros = vdupq_n_s16(0);
++
++ int16x8x2_t tmpvec;
++
++ int16x8_t candidate_vec = vld1q_dup_s16(src0);
++ short candidate;
++ ++src0;
++
++ for (number = 0; number < eighth_points; ++number) {
++ input_vec = vld1q_s16(src0);
++ __VOLK_PREFETCH(src0 + 16);
++ diff = vsubq_s16(candidate_vec, input_vec);
++ comp1 = vcgeq_s16(diff, zeros);
++ comp2 = vcltq_s16(diff, zeros);
++
++ tmpvec.val[0] = vandq_s16(candidate_vec, (int16x8_t)comp1);
++ tmpvec.val[1] = vandq_s16(input_vec, (int16x8_t)comp2);
++
++ candidate_vec = vaddq_s16(tmpvec.val[0], tmpvec.val[1]);
++ src0 += 8;
++ }
++ vst1q_s16(&candidate, candidate_vec);
++
++ for (number = 0; number < num_points % 8; number++) {
++ candidate = ((int16_t)(candidate - src0[number]) > 0) ? candidate : src0[number];
++ }
++ target[0] = candidate;
+ }
+ #endif /*LV_HAVE_NEON*/
+
+@@ -164,17 +166,17 @@ volk_16i_max_star_16i_neon(short* target, short* src0, unsigned int num_points)
+ static inline void
+ volk_16i_max_star_16i_generic(short* target, short* src0, unsigned int num_points)
+ {
+- const unsigned int num_bytes = num_points*2;
++ const unsigned int num_bytes = num_points * 2;
+
+- int i = 0;
++ int i = 0;
+
+- int bound = num_bytes >> 1;
++ int bound = num_bytes >> 1;
+
+- short candidate = src0[0];
+- for(i = 1; i < bound; ++i) {
+- candidate = ((short)(candidate - src0[i]) > 0) ? candidate : src0[i];
+- }
+- target[0] = candidate;
++ short candidate = src0[0];
++ for (i = 1; i < bound; ++i) {
++ candidate = ((short)(candidate - src0[i]) > 0) ? candidate : src0[i];
++ }
++ target[0] = candidate;
+ }
+
+ #endif /*LV_HAVE_GENERIC*/
+diff --git a/kernels/volk/volk_16i_max_star_horizontal_16i.h b/kernels/volk/volk_16i_max_star_horizontal_16i.h
+index 4ffe264..2e1f52b 100644
+--- a/kernels/volk/volk_16i_max_star_horizontal_16i.h
++++ b/kernels/volk/volk_16i_max_star_horizontal_16i.h
+@@ -29,8 +29,8 @@
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+- * void volk_16i_max_star_horizontal_16i(short* target, short* src0, unsigned int num_points);
+- * \endcode
++ * void volk_16i_max_star_horizontal_16i(short* target, short* src0, unsigned int
++ * num_points); \endcode
+ *
+ * \b Inputs
+ * \li src0: The input vector.
+@@ -55,102 +55,113 @@
+
+ #include <volk/volk_common.h>
+
+-#include<inttypes.h>
+-#include<stdio.h>
++#include <inttypes.h>
++#include <stdio.h>
+
+
+ #ifdef LV_HAVE_SSSE3
+
+-#include<xmmintrin.h>
+-#include<emmintrin.h>
+-#include<tmmintrin.h>
++#include <emmintrin.h>
++#include <tmmintrin.h>
++#include <xmmintrin.h>
+
+-static inline void
+-volk_16i_max_star_horizontal_16i_a_ssse3(int16_t* target, int16_t* src0, unsigned int num_points)
++static inline void volk_16i_max_star_horizontal_16i_a_ssse3(int16_t* target,
++ int16_t* src0,
++ unsigned int num_points)
+ {
+- const unsigned int num_bytes = num_points*2;
++ const unsigned int num_bytes = num_points * 2;
+
+- static const uint8_t shufmask0[16] = {0x00, 0x01, 0x04, 0x05, 0x08, 0x09, 0x0c, 0x0d, 0xff,
+- 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
+- static const uint8_t shufmask1[16] = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00,
+- 0x01, 0x04, 0x05, 0x08, 0x09, 0x0c, 0x0d};
+- static const uint8_t andmask0[16] = {0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,0x02, 0x00,
+- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00};
+- static const uint8_t andmask1[16] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02,
+- 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02};
++ static const uint8_t shufmask0[16] = {
++ 0x00, 0x01, 0x04, 0x05, 0x08, 0x09, 0x0c, 0x0d,
++ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
++ };
++ static const uint8_t shufmask1[16] = {
++ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
++ 0x00, 0x01, 0x04, 0x05, 0x08, 0x09, 0x0c, 0x0d
++ };
++ static const uint8_t andmask0[16] = {
++ 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,
++ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
++ };
++ static const uint8_t andmask1[16] = {
++ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
++ 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02
++ };
+
+- __m128i xmm0, xmm1, xmm2, xmm3, xmm4;
+- __m128i xmm5, xmm6, xmm7, xmm8;
++ __m128i xmm0, xmm1, xmm2, xmm3, xmm4;
++ __m128i xmm5, xmm6, xmm7, xmm8;
+
+- xmm4 = _mm_load_si128((__m128i*)shufmask0);
+- xmm5 = _mm_load_si128((__m128i*)shufmask1);
+- xmm6 = _mm_load_si128((__m128i*)andmask0);
+- xmm7 = _mm_load_si128((__m128i*)andmask1);
++ xmm4 = _mm_load_si128((__m128i*)shufmask0);
++ xmm5 = _mm_load_si128((__m128i*)shufmask1);
++ xmm6 = _mm_load_si128((__m128i*)andmask0);
++ xmm7 = _mm_load_si128((__m128i*)andmask1);
+
+- __m128i *p_target, *p_src0;
++ __m128i *p_target, *p_src0;
+
+- p_target = (__m128i*)target;
+- p_src0 = (__m128i*)src0;
++ p_target = (__m128i*)target;
++ p_src0 = (__m128i*)src0;
+
+- int bound = num_bytes >> 5;
+- int intermediate = (num_bytes >> 4) & 1;
+- int leftovers = (num_bytes >> 1) & 7;
++ int bound = num_bytes >> 5;
++ int intermediate = (num_bytes >> 4) & 1;
++ int leftovers = (num_bytes >> 1) & 7;
+
+- int i = 0;
++ int i = 0;
+
+- for(i = 0; i < bound; ++i) {
+- xmm0 = _mm_load_si128(p_src0);
+- xmm1 = _mm_load_si128(&p_src0[1]);
++ for (i = 0; i < bound; ++i) {
++ xmm0 = _mm_load_si128(p_src0);
++ xmm1 = _mm_load_si128(&p_src0[1]);
+
+- xmm2 = _mm_xor_si128(xmm2, xmm2);
+- p_src0 += 2;
++ xmm2 = _mm_xor_si128(xmm2, xmm2);
++ p_src0 += 2;
+
+- xmm3 = _mm_hsub_epi16(xmm0, xmm1);
++ xmm3 = _mm_hsub_epi16(xmm0, xmm1);
+
+- xmm2 = _mm_cmpgt_epi16(xmm2, xmm3);
++ xmm2 = _mm_cmpgt_epi16(xmm2, xmm3);
+
+- xmm8 = _mm_and_si128(xmm2, xmm6);
+- xmm3 = _mm_and_si128(xmm2, xmm7);
++ xmm8 = _mm_and_si128(xmm2, xmm6);
++ xmm3 = _mm_and_si128(xmm2, xmm7);
+
+
+- xmm8 = _mm_add_epi8(xmm8, xmm4);
+- xmm3 = _mm_add_epi8(xmm3, xmm5);
++ xmm8 = _mm_add_epi8(xmm8, xmm4);
++ xmm3 = _mm_add_epi8(xmm3, xmm5);
+
+- xmm0 = _mm_shuffle_epi8(xmm0, xmm8);
+- xmm1 = _mm_shuffle_epi8(xmm1, xmm3);
++ xmm0 = _mm_shuffle_epi8(xmm0, xmm8);
++ xmm1 = _mm_shuffle_epi8(xmm1, xmm3);
+
+
+- xmm3 = _mm_add_epi16(xmm0, xmm1);
++ xmm3 = _mm_add_epi16(xmm0, xmm1);
+
+
+- _mm_store_si128(p_target, xmm3);
++ _mm_store_si128(p_target, xmm3);
+
+- p_target += 1;
+- }
++ p_target += 1;
++ }
+
+- if (intermediate) {
+- xmm0 = _mm_load_si128(p_src0);
++ if (intermediate) {
++ xmm0 = _mm_load_si128(p_src0);
+
+- xmm2 = _mm_xor_si128(xmm2, xmm2);
+- p_src0 += 1;
++ xmm2 = _mm_xor_si128(xmm2, xmm2);
++ p_src0 += 1;
+
+- xmm3 = _mm_hsub_epi16(xmm0, xmm1);
+- xmm2 = _mm_cmpgt_epi16(xmm2, xmm3);
++ xmm3 = _mm_hsub_epi16(xmm0, xmm1);
++ xmm2 = _mm_cmpgt_epi16(xmm2, xmm3);
+
+- xmm8 = _mm_and_si128(xmm2, xmm6);
++ xmm8 = _mm_and_si128(xmm2, xmm6);
+
+- xmm3 = _mm_add_epi8(xmm8, xmm4);
++ xmm3 = _mm_add_epi8(xmm8, xmm4);
+
+- xmm0 = _mm_shuffle_epi8(xmm0, xmm3);
++ xmm0 = _mm_shuffle_epi8(xmm0, xmm3);
+
+- _mm_storel_pd((double*)p_target, bit128_p(&xmm0)->double_vec);
++ _mm_storel_pd((double*)p_target, bit128_p(&xmm0)->double_vec);
+
+- p_target = (__m128i*)((int8_t*)p_target + 8);
+- }
++ p_target = (__m128i*)((int8_t*)p_target + 8);
++ }
+
+- for(i = (bound << 4) + (intermediate << 3); i < (bound << 4) + (intermediate << 3) + leftovers ; i += 2) {
+- target[i>>1] = ((int16_t)(src0[i] - src0[i + 1]) > 0) ? src0[i] : src0[i + 1];
+- }
++ for (i = (bound << 4) + (intermediate << 3);
++ i < (bound << 4) + (intermediate << 3) + leftovers;
++ i += 2) {
++ target[i >> 1] = ((int16_t)(src0[i] - src0[i + 1]) > 0) ? src0[i] : src0[i + 1];
++ }
+ }
+
+ #endif /*LV_HAVE_SSSE3*/
+@@ -158,54 +169,59 @@ volk_16i_max_star_horizontal_16i_a_ssse3(int16_t* target, int16_t* src0, unsigne
+ #ifdef LV_HAVE_NEON
+
+ #include <arm_neon.h>
+-static inline void
+-volk_16i_max_star_horizontal_16i_neon(int16_t* target, int16_t* src0, unsigned int num_points)
++static inline void volk_16i_max_star_horizontal_16i_neon(int16_t* target,
++ int16_t* src0,
++ unsigned int num_points)
+ {
+- const unsigned int eighth_points = num_points / 16;
+- unsigned number;
+- int16x8x2_t input_vec;
+- int16x8_t diff, max_vec, zeros;
+- uint16x8_t comp1, comp2;
+- zeros = vdupq_n_s16(0);
+- for(number=0; number < eighth_points; ++number) {
+- input_vec = vld2q_s16(src0);
+- //__VOLK_PREFETCH(src0+16);
+- diff = vsubq_s16(input_vec.val[0], input_vec.val[1]);
+- comp1 = vcgeq_s16(diff, zeros);
+- comp2 = vcltq_s16(diff, zeros);
+-
+- input_vec.val[0] = vandq_s16(input_vec.val[0], (int16x8_t)comp1);
+- input_vec.val[1] = vandq_s16(input_vec.val[1], (int16x8_t)comp2);
+-
+- max_vec = vaddq_s16(input_vec.val[0], input_vec.val[1]);
+- vst1q_s16(target, max_vec);
+- src0 += 16;
+- target += 8;
+- }
+- for(number=0; number < num_points%16; number+=2) {
+- target[number >> 1] = ((int16_t)(src0[number] - src0[number + 1]) > 0) ? src0[number] : src0[number+1];
+- }
+-
++ const unsigned int eighth_points = num_points / 16;
++ unsigned number;
++ int16x8x2_t input_vec;
++ int16x8_t diff, max_vec, zeros;
++ uint16x8_t comp1, comp2;
++ zeros = vdupq_n_s16(0);
++ for (number = 0; number < eighth_points; ++number) {
++ input_vec = vld2q_s16(src0);
++ //__VOLK_PREFETCH(src0+16);
++ diff = vsubq_s16(input_vec.val[0], input_vec.val[1]);
++ comp1 = vcgeq_s16(diff, zeros);
++ comp2 = vcltq_s16(diff, zeros);
++
++ input_vec.val[0] = vandq_s16(input_vec.val[0], (int16x8_t)comp1);
++ input_vec.val[1] = vandq_s16(input_vec.val[1], (int16x8_t)comp2);
++
++ max_vec = vaddq_s16(input_vec.val[0], input_vec.val[1]);
++ vst1q_s16(target, max_vec);
++ src0 += 16;
++ target += 8;
++ }
++ for (number = 0; number < num_points % 16; number += 2) {
++ target[number >> 1] = ((int16_t)(src0[number] - src0[number + 1]) > 0)
++ ? src0[number]
++ : src0[number + 1];
++ }
+ }
+ #endif /* LV_HAVE_NEON */
+
+ #ifdef LV_HAVE_NEONV7
+-extern void volk_16i_max_star_horizontal_16i_a_neonasm(int16_t* target, int16_t* src0, unsigned int num_points);
++extern void volk_16i_max_star_horizontal_16i_a_neonasm(int16_t* target,
++ int16_t* src0,
++ unsigned int num_points);
+ #endif /* LV_HAVE_NEONV7 */
+
+ #ifdef LV_HAVE_GENERIC
+-static inline void
+-volk_16i_max_star_horizontal_16i_generic(int16_t* target, int16_t* src0, unsigned int num_points)
++static inline void volk_16i_max_star_horizontal_16i_generic(int16_t* target,
++ int16_t* src0,
++ unsigned int num_points)
+ {
+- const unsigned int num_bytes = num_points*2;
++ const unsigned int num_bytes = num_points * 2;
+
+- int i = 0;
++ int i = 0;
+
+- int bound = num_bytes >> 1;
++ int bound = num_bytes >> 1;
+
+- for(i = 0; i < bound; i += 2) {
+- target[i >> 1] = ((int16_t) (src0[i] - src0[i + 1]) > 0) ? src0[i] : src0[i+1];
+- }
++ for (i = 0; i < bound; i += 2) {
++ target[i >> 1] = ((int16_t)(src0[i] - src0[i + 1]) > 0) ? src0[i] : src0[i + 1];
++ }
+ }
+
+ #endif /*LV_HAVE_GENERIC*/
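For context, a minimal usage sketch of the kernel reformatted above. It is illustrative only: the dispatcher name volk_16i_max_star_horizontal_16i follows the usual VOLK convention for the kernel variants shown, and the <volk/volk.h> header plus volk_malloc()/volk_free()/volk_get_alignment() helpers are assumed rather than taken from this hunk.

/* Illustrative sketch only: assumes <volk/volk.h> provides the
 * volk_16i_max_star_horizontal_16i dispatcher plus volk_malloc()/volk_free().
 * As in the generic kernel above, each output is the max* winner of one
 * adjacent input pair, so num_points int16 inputs yield num_points/2 outputs. */
#include <stdio.h>
#include <volk/volk.h>

int main(void)
{
    const unsigned int num_points = 32; /* kept even: inputs are consumed in pairs */
    const size_t alignment = volk_get_alignment();
    int16_t* in = (int16_t*)volk_malloc(num_points * sizeof(int16_t), alignment);
    int16_t* out = (int16_t*)volk_malloc((num_points / 2) * sizeof(int16_t), alignment);
    unsigned int i;

    for (i = 0; i < num_points; i++) {
        in[i] = (int16_t)((i & 1) ? -(int)i : (int)i); /* alternate signs */
    }

    volk_16i_max_star_horizontal_16i(out, in, num_points);

    for (i = 0; i < num_points / 2; i++) {
        printf("out[%u] = %d\n", i, out[i]);
    }

    volk_free(in);
    volk_free(out);
    return 0;
}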
+diff --git a/kernels/volk/volk_16i_permute_and_scalar_add.h b/kernels/volk/volk_16i_permute_and_scalar_add.h
+index 7fcdad3..0563f07 100644
+--- a/kernels/volk/volk_16i_permute_and_scalar_add.h
++++ b/kernels/volk/volk_16i_permute_and_scalar_add.h
+@@ -29,8 +29,9 @@
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+- * void volk_16i_permute_and_scalar_add(short* target, short* src0, short* permute_indexes, short* cntl0, short* cntl1, short* cntl2, short* cntl3, short* scalars, unsigned int num_points)
+- * \endcode
++ * void volk_16i_permute_and_scalar_add(short* target, short* src0, short*
++ * permute_indexes, short* cntl0, short* cntl1, short* cntl2, short* cntl3, short*
++ * scalars, unsigned int num_points) \endcode
+ *
+ * \b Inputs
+ * \li src0: The input vector.
+@@ -58,137 +59,143 @@
+ #ifndef INCLUDED_volk_16i_permute_and_scalar_add_a_H
+ #define INCLUDED_volk_16i_permute_and_scalar_add_a_H
+
+-#include<inttypes.h>
+-#include<stdio.h>
++#include <inttypes.h>
++#include <stdio.h>
+
+ #ifdef LV_HAVE_SSE2
+
+-#include<xmmintrin.h>
+-#include<emmintrin.h>
+-
+-static inline void
+-volk_16i_permute_and_scalar_add_a_sse2(short* target, short* src0, short* permute_indexes,
+- short* cntl0, short* cntl1, short* cntl2, short* cntl3,
+- short* scalars, unsigned int num_points)
++#include <emmintrin.h>
++#include <xmmintrin.h>
++
++static inline void volk_16i_permute_and_scalar_add_a_sse2(short* target,
++ short* src0,
++ short* permute_indexes,
++ short* cntl0,
++ short* cntl1,
++ short* cntl2,
++ short* cntl3,
++ short* scalars,
++ unsigned int num_points)
+ {
+
+- const unsigned int num_bytes = num_points*2;
++ const unsigned int num_bytes = num_points * 2;
+
+- __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
++ __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
+
+- __m128i *p_target, *p_cntl0, *p_cntl1, *p_cntl2, *p_cntl3, *p_scalars;
++ __m128i *p_target, *p_cntl0, *p_cntl1, *p_cntl2, *p_cntl3, *p_scalars;
+
+- short* p_permute_indexes = permute_indexes;
++ short* p_permute_indexes = permute_indexes;
+
+- p_target = (__m128i*)target;
+- p_cntl0 = (__m128i*)cntl0;
+- p_cntl1 = (__m128i*)cntl1;
+- p_cntl2 = (__m128i*)cntl2;
+- p_cntl3 = (__m128i*)cntl3;
+- p_scalars = (__m128i*)scalars;
++ p_target = (__m128i*)target;
++ p_cntl0 = (__m128i*)cntl0;
++ p_cntl1 = (__m128i*)cntl1;
++ p_cntl2 = (__m128i*)cntl2;
++ p_cntl3 = (__m128i*)cntl3;
++ p_scalars = (__m128i*)scalars;
+
+- int i = 0;
++ int i = 0;
+
+- int bound = (num_bytes >> 4);
+- int leftovers = (num_bytes >> 1) & 7;
++ int bound = (num_bytes >> 4);
++ int leftovers = (num_bytes >> 1) & 7;
+
+- xmm0 = _mm_load_si128(p_scalars);
++ xmm0 = _mm_load_si128(p_scalars);
+
+- xmm1 = _mm_shufflelo_epi16(xmm0, 0);
+- xmm2 = _mm_shufflelo_epi16(xmm0, 0x55);
+- xmm3 = _mm_shufflelo_epi16(xmm0, 0xaa);
+- xmm4 = _mm_shufflelo_epi16(xmm0, 0xff);
++ xmm1 = _mm_shufflelo_epi16(xmm0, 0);
++ xmm2 = _mm_shufflelo_epi16(xmm0, 0x55);
++ xmm3 = _mm_shufflelo_epi16(xmm0, 0xaa);
++ xmm4 = _mm_shufflelo_epi16(xmm0, 0xff);
+
+- xmm1 = _mm_shuffle_epi32(xmm1, 0x00);
+- xmm2 = _mm_shuffle_epi32(xmm2, 0x00);
+- xmm3 = _mm_shuffle_epi32(xmm3, 0x00);
+- xmm4 = _mm_shuffle_epi32(xmm4, 0x00);
++ xmm1 = _mm_shuffle_epi32(xmm1, 0x00);
++ xmm2 = _mm_shuffle_epi32(xmm2, 0x00);
++ xmm3 = _mm_shuffle_epi32(xmm3, 0x00);
++ xmm4 = _mm_shuffle_epi32(xmm4, 0x00);
+
+
+- for(; i < bound; ++i) {
+- xmm0 = _mm_setzero_si128();
+- xmm5 = _mm_setzero_si128();
+- xmm6 = _mm_setzero_si128();
+- xmm7 = _mm_setzero_si128();
++ for (; i < bound; ++i) {
++ xmm0 = _mm_setzero_si128();
++ xmm5 = _mm_setzero_si128();
++ xmm6 = _mm_setzero_si128();
++ xmm7 = _mm_setzero_si128();
+
+- xmm0 = _mm_insert_epi16(xmm0, src0[p_permute_indexes[0]], 0);
+- xmm5 = _mm_insert_epi16(xmm5, src0[p_permute_indexes[1]], 1);
+- xmm6 = _mm_insert_epi16(xmm6, src0[p_permute_indexes[2]], 2);
+- xmm7 = _mm_insert_epi16(xmm7, src0[p_permute_indexes[3]], 3);
+- xmm0 = _mm_insert_epi16(xmm0, src0[p_permute_indexes[4]], 4);
+- xmm5 = _mm_insert_epi16(xmm5, src0[p_permute_indexes[5]], 5);
+- xmm6 = _mm_insert_epi16(xmm6, src0[p_permute_indexes[6]], 6);
+- xmm7 = _mm_insert_epi16(xmm7, src0[p_permute_indexes[7]], 7);
++ xmm0 = _mm_insert_epi16(xmm0, src0[p_permute_indexes[0]], 0);
++ xmm5 = _mm_insert_epi16(xmm5, src0[p_permute_indexes[1]], 1);
++ xmm6 = _mm_insert_epi16(xmm6, src0[p_permute_indexes[2]], 2);
++ xmm7 = _mm_insert_epi16(xmm7, src0[p_permute_indexes[3]], 3);
++ xmm0 = _mm_insert_epi16(xmm0, src0[p_permute_indexes[4]], 4);
++ xmm5 = _mm_insert_epi16(xmm5, src0[p_permute_indexes[5]], 5);
++ xmm6 = _mm_insert_epi16(xmm6, src0[p_permute_indexes[6]], 6);
++ xmm7 = _mm_insert_epi16(xmm7, src0[p_permute_indexes[7]], 7);
+
+- xmm0 = _mm_add_epi16(xmm0, xmm5);
+- xmm6 = _mm_add_epi16(xmm6, xmm7);
++ xmm0 = _mm_add_epi16(xmm0, xmm5);
++ xmm6 = _mm_add_epi16(xmm6, xmm7);
+
+- p_permute_indexes += 8;
++ p_permute_indexes += 8;
+
+- xmm0 = _mm_add_epi16(xmm0, xmm6);
++ xmm0 = _mm_add_epi16(xmm0, xmm6);
+
+- xmm5 = _mm_load_si128(p_cntl0);
+- xmm6 = _mm_load_si128(p_cntl1);
+- xmm7 = _mm_load_si128(p_cntl2);
++ xmm5 = _mm_load_si128(p_cntl0);
++ xmm6 = _mm_load_si128(p_cntl1);
++ xmm7 = _mm_load_si128(p_cntl2);
+
+- xmm5 = _mm_and_si128(xmm5, xmm1);
+- xmm6 = _mm_and_si128(xmm6, xmm2);
+- xmm7 = _mm_and_si128(xmm7, xmm3);
++ xmm5 = _mm_and_si128(xmm5, xmm1);
++ xmm6 = _mm_and_si128(xmm6, xmm2);
++ xmm7 = _mm_and_si128(xmm7, xmm3);
+
+- xmm0 = _mm_add_epi16(xmm0, xmm5);
++ xmm0 = _mm_add_epi16(xmm0, xmm5);
+
+- xmm5 = _mm_load_si128(p_cntl3);
++ xmm5 = _mm_load_si128(p_cntl3);
+
+- xmm6 = _mm_add_epi16(xmm6, xmm7);
++ xmm6 = _mm_add_epi16(xmm6, xmm7);
+
+- p_cntl0 += 1;
++ p_cntl0 += 1;
+
+- xmm5 = _mm_and_si128(xmm5, xmm4);
++ xmm5 = _mm_and_si128(xmm5, xmm4);
+
+- xmm0 = _mm_add_epi16(xmm0, xmm6);
++ xmm0 = _mm_add_epi16(xmm0, xmm6);
+
+- p_cntl1 += 1;
+- p_cntl2 += 1;
++ p_cntl1 += 1;
++ p_cntl2 += 1;
+
+- xmm0 = _mm_add_epi16(xmm0, xmm5);
++ xmm0 = _mm_add_epi16(xmm0, xmm5);
+
+- p_cntl3 += 1;
++ p_cntl3 += 1;
+
+- _mm_store_si128(p_target, xmm0);
++ _mm_store_si128(p_target, xmm0);
+
+- p_target += 1;
+- }
++ p_target += 1;
++ }
+
+- for(i = bound * 8; i < (bound * 8) + leftovers; ++i) {
+- target[i] = src0[permute_indexes[i]]
+- + (cntl0[i] & scalars[0])
+- + (cntl1[i] & scalars[1])
+- + (cntl2[i] & scalars[2])
+- + (cntl3[i] & scalars[3]);
+- }
++ for (i = bound * 8; i < (bound * 8) + leftovers; ++i) {
++ target[i] = src0[permute_indexes[i]] + (cntl0[i] & scalars[0]) +
++ (cntl1[i] & scalars[1]) + (cntl2[i] & scalars[2]) +
++ (cntl3[i] & scalars[3]);
++ }
+ }
+ #endif /*LV_HAVE_SSE2*/
+
+
+ #ifdef LV_HAVE_GENERIC
+-static inline void
+-volk_16i_permute_and_scalar_add_generic(short* target, short* src0, short* permute_indexes,
+- short* cntl0, short* cntl1, short* cntl2, short* cntl3,
+- short* scalars, unsigned int num_points)
++static inline void volk_16i_permute_and_scalar_add_generic(short* target,
++ short* src0,
++ short* permute_indexes,
++ short* cntl0,
++ short* cntl1,
++ short* cntl2,
++ short* cntl3,
++ short* scalars,
++ unsigned int num_points)
+ {
+- const unsigned int num_bytes = num_points*2;
++ const unsigned int num_bytes = num_points * 2;
+
+- int i = 0;
++ int i = 0;
+
+- int bound = num_bytes >> 1;
++ int bound = num_bytes >> 1;
+
+- for(i = 0; i < bound; ++i) {
+- target[i] = src0[permute_indexes[i]]
+- + (cntl0[i] & scalars[0])
+- + (cntl1[i] & scalars[1])
+- + (cntl2[i] & scalars[2])
+- + (cntl3[i] & scalars[3]);
+- }
++ for (i = 0; i < bound; ++i) {
++ target[i] = src0[permute_indexes[i]] + (cntl0[i] & scalars[0]) +
++ (cntl1[i] & scalars[1]) + (cntl2[i] & scalars[2]) +
++ (cntl3[i] & scalars[3]);
++ }
+ }
+
+ #endif /*LV_HAVE_GENERIC*/
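The dispatcher prototype quoted in this header's documentation is easiest to read next to a small driver. The sketch below is hypothetical: it assumes <volk/volk.h> exposes volk_16i_permute_and_scalar_add together with volk_malloc()/volk_free(), and it allocates eight scalar slots because the SSE2 path above loads a full 128-bit vector from the scalars pointer.

/* Hypothetical driver (illustrative only): gather src0 through
 * permute_indexes, then add whichever scalars the all-ones control masks
 * enable, exactly as the generic kernel above computes. */
#include <stdio.h>
#include <volk/volk.h>

int main(void)
{
    const unsigned int num_points = 16;
    const size_t al = volk_get_alignment();
    short* target = (short*)volk_malloc(num_points * sizeof(short), al);
    short* src0 = (short*)volk_malloc(num_points * sizeof(short), al);
    short* idx = (short*)volk_malloc(num_points * sizeof(short), al);
    short* cntl0 = (short*)volk_malloc(num_points * sizeof(short), al);
    short* cntl1 = (short*)volk_malloc(num_points * sizeof(short), al);
    short* cntl2 = (short*)volk_malloc(num_points * sizeof(short), al);
    short* cntl3 = (short*)volk_malloc(num_points * sizeof(short), al);
    short* scalars = (short*)volk_malloc(8 * sizeof(short), al); /* SSE2 loads 8 lanes */
    unsigned int i;

    scalars[0] = 10; scalars[1] = 20; scalars[2] = 30; scalars[3] = 40;
    for (i = 4; i < 8; i++) scalars[i] = 0; /* unused lanes */

    for (i = 0; i < num_points; i++) {
        src0[i] = (short)i;
        idx[i] = (short)(num_points - 1 - i); /* reverse permutation */
        cntl0[i] = (short)-1;                 /* all bits set: add scalars[0] */
        cntl1[i] = (i & 1) ? (short)-1 : 0;   /* add scalars[1] on odd lanes  */
        cntl2[i] = 0;
        cntl3[i] = 0;
    }

    volk_16i_permute_and_scalar_add(
        target, src0, idx, cntl0, cntl1, cntl2, cntl3, scalars, num_points);

    for (i = 0; i < num_points; i++) {
        printf("target[%u] = %d\n", i, target[i]);
    }

    volk_free(target); volk_free(src0); volk_free(idx);
    volk_free(cntl0);  volk_free(cntl1); volk_free(cntl2);
    volk_free(cntl3);  volk_free(scalars);
    return 0;
}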
+diff --git a/kernels/volk/volk_16i_s32f_convert_32f.h b/kernels/volk/volk_16i_s32f_convert_32f.h
+index 38ea6f5..3fd3a77 100644
+--- a/kernels/volk/volk_16i_s32f_convert_32f.h
++++ b/kernels/volk/volk_16i_s32f_convert_32f.h
+@@ -29,8 +29,8 @@
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+- * void volk_16i_s32f_convert_32f(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points);
+- * \endcode
++ * void volk_16i_s32f_convert_32f(float* outputVector, const int16_t* inputVector, const
++ * float scalar, unsigned int num_points); \endcode
+ *
+ * \b Inputs
+ * \li inputVector: The input vector of 16-bit shorts.
+@@ -60,238 +60,247 @@
+ #ifdef LV_HAVE_AVX2
+ #include <immintrin.h>
+
+-static inline void
+-volk_16i_s32f_convert_32f_u_avx2(float* outputVector, const int16_t* inputVector,
+- const float scalar, unsigned int num_points)
++static inline void volk_16i_s32f_convert_32f_u_avx2(float* outputVector,
++ const int16_t* inputVector,
++ const float scalar,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+- const unsigned int eighthPoints = num_points / 8;
++ unsigned int number = 0;
++ const unsigned int eighthPoints = num_points / 8;
+
+- float* outputVectorPtr = outputVector;
+- __m256 invScalar = _mm256_set1_ps(1.0/scalar);
+- int16_t* inputPtr = (int16_t*)inputVector;
+- __m128i inputVal;
+- __m256i inputVal2;
+- __m256 ret;
++ float* outputVectorPtr = outputVector;
++ __m256 invScalar = _mm256_set1_ps(1.0 / scalar);
++ int16_t* inputPtr = (int16_t*)inputVector;
++ __m128i inputVal;
++ __m256i inputVal2;
++ __m256 ret;
+
+- for(;number < eighthPoints; number++){
++ for (; number < eighthPoints; number++) {
+
+- // Load the 8 values
+- inputVal = _mm_loadu_si128((__m128i*)inputPtr);
++ // Load the 8 values
++ inputVal = _mm_loadu_si128((__m128i*)inputPtr);
+
+- // Convert
+- inputVal2 = _mm256_cvtepi16_epi32(inputVal);
++ // Convert
++ inputVal2 = _mm256_cvtepi16_epi32(inputVal);
+
+- ret = _mm256_cvtepi32_ps(inputVal2);
+- ret = _mm256_mul_ps(ret, invScalar);
++ ret = _mm256_cvtepi32_ps(inputVal2);
++ ret = _mm256_mul_ps(ret, invScalar);
+
+- _mm256_storeu_ps(outputVectorPtr, ret);
++ _mm256_storeu_ps(outputVectorPtr, ret);
+
+- outputVectorPtr += 8;
++ outputVectorPtr += 8;
+
+- inputPtr += 8;
+- }
++ inputPtr += 8;
++ }
+
+- number = eighthPoints * 8;
+- for(; number < num_points; number++){
+- outputVector[number] =((float)(inputVector[number])) / scalar;
+- }
++ number = eighthPoints * 8;
++ for (; number < num_points; number++) {
++ outputVector[number] = ((float)(inputVector[number])) / scalar;
++ }
+ }
+ #endif /* LV_HAVE_AVX2 */
+
+ #ifdef LV_HAVE_AVX
+ #include <immintrin.h>
+
+-static inline void
+-volk_16i_s32f_convert_32f_u_avx(float* outputVector, const int16_t* inputVector,
+- const float scalar, unsigned int num_points)
++static inline void volk_16i_s32f_convert_32f_u_avx(float* outputVector,
++ const int16_t* inputVector,
++ const float scalar,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+- const unsigned int eighthPoints = num_points / 8;
++ unsigned int number = 0;
++ const unsigned int eighthPoints = num_points / 8;
+
+- float* outputVectorPtr = outputVector;
+- __m128 invScalar = _mm_set_ps1(1.0/scalar);
+- int16_t* inputPtr = (int16_t*)inputVector;
+- __m128i inputVal, inputVal2;
+- __m128 ret;
+- __m256 output;
+- __m256 dummy = _mm256_setzero_ps();
++ float* outputVectorPtr = outputVector;
++ __m128 invScalar = _mm_set_ps1(1.0 / scalar);
++ int16_t* inputPtr = (int16_t*)inputVector;
++ __m128i inputVal, inputVal2;
++ __m128 ret;
++ __m256 output;
++ __m256 dummy = _mm256_setzero_ps();
+
+- for(;number < eighthPoints; number++){
++ for (; number < eighthPoints; number++) {
+
+- // Load the 8 values
+- //inputVal = _mm_loadu_si128((__m128i*)inputPtr);
+- inputVal = _mm_loadu_si128((__m128i*)inputPtr);
++ // Load the 8 values
++ // inputVal = _mm_loadu_si128((__m128i*)inputPtr);
++ inputVal = _mm_loadu_si128((__m128i*)inputPtr);
+
+- // Shift the input data to the right by 64 bits ( 8 bytes )
+- inputVal2 = _mm_srli_si128(inputVal, 8);
++ // Shift the input data to the right by 64 bits ( 8 bytes )
++ inputVal2 = _mm_srli_si128(inputVal, 8);
+
+- // Convert the lower 4 values into 32 bit words
+- inputVal = _mm_cvtepi16_epi32(inputVal);
+- inputVal2 = _mm_cvtepi16_epi32(inputVal2);
++ // Convert the lower 4 values into 32 bit words
++ inputVal = _mm_cvtepi16_epi32(inputVal);
++ inputVal2 = _mm_cvtepi16_epi32(inputVal2);
+
+- ret = _mm_cvtepi32_ps(inputVal);
+- ret = _mm_mul_ps(ret, invScalar);
+- output = _mm256_insertf128_ps(dummy, ret, 0);
++ ret = _mm_cvtepi32_ps(inputVal);
++ ret = _mm_mul_ps(ret, invScalar);
++ output = _mm256_insertf128_ps(dummy, ret, 0);
+
+- ret = _mm_cvtepi32_ps(inputVal2);
+- ret = _mm_mul_ps(ret, invScalar);
+- output = _mm256_insertf128_ps(output, ret, 1);
++ ret = _mm_cvtepi32_ps(inputVal2);
++ ret = _mm_mul_ps(ret, invScalar);
++ output = _mm256_insertf128_ps(output, ret, 1);
+
+- _mm256_storeu_ps(outputVectorPtr, output);
++ _mm256_storeu_ps(outputVectorPtr, output);
+
+- outputVectorPtr += 8;
++ outputVectorPtr += 8;
+
+- inputPtr += 8;
+- }
++ inputPtr += 8;
++ }
+
+- number = eighthPoints * 8;
+- for(; number < num_points; number++){
+- outputVector[number] =((float)(inputVector[number])) / scalar;
+- }
++ number = eighthPoints * 8;
++ for (; number < num_points; number++) {
++ outputVector[number] = ((float)(inputVector[number])) / scalar;
++ }
+ }
+ #endif /* LV_HAVE_AVX */
+
+ #ifdef LV_HAVE_SSE4_1
+ #include <smmintrin.h>
+
+-static inline void
+-volk_16i_s32f_convert_32f_u_sse4_1(float* outputVector, const int16_t* inputVector,
+- const float scalar, unsigned int num_points)
++static inline void volk_16i_s32f_convert_32f_u_sse4_1(float* outputVector,
++ const int16_t* inputVector,
++ const float scalar,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+- const unsigned int eighthPoints = num_points / 8;
++ unsigned int number = 0;
++ const unsigned int eighthPoints = num_points / 8;
+
+- float* outputVectorPtr = outputVector;
+- __m128 invScalar = _mm_set_ps1(1.0/scalar);
+- int16_t* inputPtr = (int16_t*)inputVector;
+- __m128i inputVal;
+- __m128i inputVal2;
+- __m128 ret;
++ float* outputVectorPtr = outputVector;
++ __m128 invScalar = _mm_set_ps1(1.0 / scalar);
++ int16_t* inputPtr = (int16_t*)inputVector;
++ __m128i inputVal;
++ __m128i inputVal2;
++ __m128 ret;
+
+- for(;number < eighthPoints; number++){
++ for (; number < eighthPoints; number++) {
+
+- // Load the 8 values
+- inputVal = _mm_loadu_si128((__m128i*)inputPtr);
++ // Load the 8 values
++ inputVal = _mm_loadu_si128((__m128i*)inputPtr);
+
+- // Shift the input data to the right by 64 bits ( 8 bytes )
+- inputVal2 = _mm_srli_si128(inputVal, 8);
++ // Shift the input data to the right by 64 bits ( 8 bytes )
++ inputVal2 = _mm_srli_si128(inputVal, 8);
+
+- // Convert the lower 4 values into 32 bit words
+- inputVal = _mm_cvtepi16_epi32(inputVal);
+- inputVal2 = _mm_cvtepi16_epi32(inputVal2);
++ // Convert the lower 4 values into 32 bit words
++ inputVal = _mm_cvtepi16_epi32(inputVal);
++ inputVal2 = _mm_cvtepi16_epi32(inputVal2);
+
+- ret = _mm_cvtepi32_ps(inputVal);
+- ret = _mm_mul_ps(ret, invScalar);
+- _mm_storeu_ps(outputVectorPtr, ret);
+- outputVectorPtr += 4;
++ ret = _mm_cvtepi32_ps(inputVal);
++ ret = _mm_mul_ps(ret, invScalar);
++ _mm_storeu_ps(outputVectorPtr, ret);
++ outputVectorPtr += 4;
+
+- ret = _mm_cvtepi32_ps(inputVal2);
+- ret = _mm_mul_ps(ret, invScalar);
+- _mm_storeu_ps(outputVectorPtr, ret);
++ ret = _mm_cvtepi32_ps(inputVal2);
++ ret = _mm_mul_ps(ret, invScalar);
++ _mm_storeu_ps(outputVectorPtr, ret);
+
+- outputVectorPtr += 4;
++ outputVectorPtr += 4;
+
+- inputPtr += 8;
+- }
++ inputPtr += 8;
++ }
+
+- number = eighthPoints * 8;
+- for(; number < num_points; number++){
+- outputVector[number] =((float)(inputVector[number])) / scalar;
+- }
++ number = eighthPoints * 8;
++ for (; number < num_points; number++) {
++ outputVector[number] = ((float)(inputVector[number])) / scalar;
++ }
+ }
+ #endif /* LV_HAVE_SSE4_1 */
+
+ #ifdef LV_HAVE_SSE
+ #include <xmmintrin.h>
+
+-static inline void
+-volk_16i_s32f_convert_32f_u_sse(float* outputVector, const int16_t* inputVector,
+- const float scalar, unsigned int num_points)
++static inline void volk_16i_s32f_convert_32f_u_sse(float* outputVector,
++ const int16_t* inputVector,
++ const float scalar,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+- const unsigned int quarterPoints = num_points / 4;
+-
+- float* outputVectorPtr = outputVector;
+- __m128 invScalar = _mm_set_ps1(1.0/scalar);
+- int16_t* inputPtr = (int16_t*)inputVector;
+- __m128 ret;
+-
+- for(;number < quarterPoints; number++){
+- ret = _mm_set_ps((float)(inputPtr[3]), (float)(inputPtr[2]), (float)(inputPtr[1]), (float)(inputPtr[0]));
+-
+- ret = _mm_mul_ps(ret, invScalar);
+- _mm_storeu_ps(outputVectorPtr, ret);
+-
+- inputPtr += 4;
+- outputVectorPtr += 4;
+- }
+-
+- number = quarterPoints * 4;
+- for(; number < num_points; number++){
+- outputVector[number] = (float)(inputVector[number]) / scalar;
+- }
++ unsigned int number = 0;
++ const unsigned int quarterPoints = num_points / 4;
++
++ float* outputVectorPtr = outputVector;
++ __m128 invScalar = _mm_set_ps1(1.0 / scalar);
++ int16_t* inputPtr = (int16_t*)inputVector;
++ __m128 ret;
++
++ for (; number < quarterPoints; number++) {
++ ret = _mm_set_ps((float)(inputPtr[3]),
++ (float)(inputPtr[2]),
++ (float)(inputPtr[1]),
++ (float)(inputPtr[0]));
++
++ ret = _mm_mul_ps(ret, invScalar);
++ _mm_storeu_ps(outputVectorPtr, ret);
++
++ inputPtr += 4;
++ outputVectorPtr += 4;
++ }
++
++ number = quarterPoints * 4;
++ for (; number < num_points; number++) {
++ outputVector[number] = (float)(inputVector[number]) / scalar;
++ }
+ }
+ #endif /* LV_HAVE_SSE */
+
+ #ifdef LV_HAVE_GENERIC
+
+-static inline void
+-volk_16i_s32f_convert_32f_generic(float* outputVector, const int16_t* inputVector,
+- const float scalar, unsigned int num_points)
++static inline void volk_16i_s32f_convert_32f_generic(float* outputVector,
++ const int16_t* inputVector,
++ const float scalar,
++ unsigned int num_points)
+ {
+- float* outputVectorPtr = outputVector;
+- const int16_t* inputVectorPtr = inputVector;
+- unsigned int number = 0;
++ float* outputVectorPtr = outputVector;
++ const int16_t* inputVectorPtr = inputVector;
++ unsigned int number = 0;
+
+- for(number = 0; number < num_points; number++){
+- *outputVectorPtr++ = ((float)(*inputVectorPtr++)) / scalar;
+- }
++ for (number = 0; number < num_points; number++) {
++ *outputVectorPtr++ = ((float)(*inputVectorPtr++)) / scalar;
++ }
+ }
+ #endif /* LV_HAVE_GENERIC */
+
+ #ifdef LV_HAVE_NEON
+ #include <arm_neon.h>
+
+-static inline void
+-volk_16i_s32f_convert_32f_neon(float* outputVector, const int16_t* inputVector,
+- const float scalar, unsigned int num_points)
++static inline void volk_16i_s32f_convert_32f_neon(float* outputVector,
++ const int16_t* inputVector,
++ const float scalar,
++ unsigned int num_points)
+ {
+- float* outputPtr = outputVector;
+- const int16_t* inputPtr = inputVector;
+- unsigned int number = 0;
+- unsigned int eighth_points = num_points / 8;
+-
+- int16x4x2_t input16;
+- int32x4_t input32_0, input32_1;
+- float32x4_t input_float_0, input_float_1;
+- float32x4x2_t output_float;
+- float32x4_t inv_scale;
+-
+- inv_scale = vdupq_n_f32(1.0/scalar);
+-
+- // the generic disassembles to a 128-bit load
+- // and duplicates every instruction to operate on 64-bits
+- // at a time. This is only possible with lanes, which is faster
+- // than just doing a vld1_s16, but still slower.
+- for(number = 0; number < eighth_points; number++){
+- input16 = vld2_s16(inputPtr);
+- // widen 16-bit int to 32-bit int
+- input32_0 = vmovl_s16(input16.val[0]);
+- input32_1 = vmovl_s16(input16.val[1]);
+- // convert 32-bit int to float with scale
+- input_float_0 = vcvtq_f32_s32(input32_0);
+- input_float_1 = vcvtq_f32_s32(input32_1);
+- output_float.val[0] = vmulq_f32(input_float_0, inv_scale);
+- output_float.val[1] = vmulq_f32(input_float_1, inv_scale);
+- vst2q_f32(outputPtr, output_float);
+- inputPtr += 8;
+- outputPtr += 8;
+- }
+-
+- for(number = eighth_points*8; number < num_points; number++){
+- *outputPtr++ = ((float)(*inputPtr++)) / scalar;
+- }
++ float* outputPtr = outputVector;
++ const int16_t* inputPtr = inputVector;
++ unsigned int number = 0;
++ unsigned int eighth_points = num_points / 8;
++
++ int16x4x2_t input16;
++ int32x4_t input32_0, input32_1;
++ float32x4_t input_float_0, input_float_1;
++ float32x4x2_t output_float;
++ float32x4_t inv_scale;
++
++ inv_scale = vdupq_n_f32(1.0 / scalar);
++
++ // the generic disassembles to a 128-bit load
++ // and duplicates every instruction to operate on 64-bits
++ // at a time. This is only possible with lanes, which is faster
++ // than just doing a vld1_s16, but still slower.
++ for (number = 0; number < eighth_points; number++) {
++ input16 = vld2_s16(inputPtr);
++ // widen 16-bit int to 32-bit int
++ input32_0 = vmovl_s16(input16.val[0]);
++ input32_1 = vmovl_s16(input16.val[1]);
++ // convert 32-bit int to float with scale
++ input_float_0 = vcvtq_f32_s32(input32_0);
++ input_float_1 = vcvtq_f32_s32(input32_1);
++ output_float.val[0] = vmulq_f32(input_float_0, inv_scale);
++ output_float.val[1] = vmulq_f32(input_float_1, inv_scale);
++ vst2q_f32(outputPtr, output_float);
++ inputPtr += 8;
++ outputPtr += 8;
++ }
++
++ for (number = eighth_points * 8; number < num_points; number++) {
++ *outputPtr++ = ((float)(*inputPtr++)) / scalar;
++ }
+ }
+ #endif /* LV_HAVE_NEON */
+
+@@ -306,193 +315,201 @@ volk_16i_s32f_convert_32f_neon(float* outputVector, const int16_t* inputVector,
+ #ifdef LV_HAVE_AVX2
+ #include <immintrin.h>
+
+-static inline void
+-volk_16i_s32f_convert_32f_a_avx2(float* outputVector, const int16_t* inputVector,
+- const float scalar, unsigned int num_points)
++static inline void volk_16i_s32f_convert_32f_a_avx2(float* outputVector,
++ const int16_t* inputVector,
++ const float scalar,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+- const unsigned int eighthPoints = num_points / 8;
++ unsigned int number = 0;
++ const unsigned int eighthPoints = num_points / 8;
+
+- float* outputVectorPtr = outputVector;
+- __m256 invScalar = _mm256_set1_ps(1.0/scalar);
+- int16_t* inputPtr = (int16_t*)inputVector;
+- __m128i inputVal;
+- __m256i inputVal2;
+- __m256 ret;
++ float* outputVectorPtr = outputVector;
++ __m256 invScalar = _mm256_set1_ps(1.0 / scalar);
++ int16_t* inputPtr = (int16_t*)inputVector;
++ __m128i inputVal;
++ __m256i inputVal2;
++ __m256 ret;
+
+- for(;number < eighthPoints; number++){
++ for (; number < eighthPoints; number++) {
+
+- // Load the 8 values
+- inputVal = _mm_load_si128((__m128i*)inputPtr);
++ // Load the 8 values
++ inputVal = _mm_load_si128((__m128i*)inputPtr);
+
+- // Convert
+- inputVal2 = _mm256_cvtepi16_epi32(inputVal);
++ // Convert
++ inputVal2 = _mm256_cvtepi16_epi32(inputVal);
+
+- ret = _mm256_cvtepi32_ps(inputVal2);
+- ret = _mm256_mul_ps(ret, invScalar);
++ ret = _mm256_cvtepi32_ps(inputVal2);
++ ret = _mm256_mul_ps(ret, invScalar);
+
+- _mm256_store_ps(outputVectorPtr, ret);
++ _mm256_store_ps(outputVectorPtr, ret);
+
+- outputVectorPtr += 8;
++ outputVectorPtr += 8;
+
+- inputPtr += 8;
+- }
++ inputPtr += 8;
++ }
+
+- number = eighthPoints * 8;
+- for(; number < num_points; number++){
+- outputVector[number] =((float)(inputVector[number])) / scalar;
+- }
++ number = eighthPoints * 8;
++ for (; number < num_points; number++) {
++ outputVector[number] = ((float)(inputVector[number])) / scalar;
++ }
+ }
+ #endif /* LV_HAVE_AVX2 */
+
+ #ifdef LV_HAVE_AVX
+ #include <immintrin.h>
+
+-static inline void
+-volk_16i_s32f_convert_32f_a_avx(float* outputVector, const int16_t* inputVector,
+- const float scalar, unsigned int num_points)
++static inline void volk_16i_s32f_convert_32f_a_avx(float* outputVector,
++ const int16_t* inputVector,
++ const float scalar,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+- const unsigned int eighthPoints = num_points / 8;
++ unsigned int number = 0;
++ const unsigned int eighthPoints = num_points / 8;
+
+- float* outputVectorPtr = outputVector;
+- __m128 invScalar = _mm_set_ps1(1.0/scalar);
+- int16_t* inputPtr = (int16_t*)inputVector;
+- __m128i inputVal, inputVal2;
+- __m128 ret;
+- __m256 output;
+- __m256 dummy = _mm256_setzero_ps();
++ float* outputVectorPtr = outputVector;
++ __m128 invScalar = _mm_set_ps1(1.0 / scalar);
++ int16_t* inputPtr = (int16_t*)inputVector;
++ __m128i inputVal, inputVal2;
++ __m128 ret;
++ __m256 output;
++ __m256 dummy = _mm256_setzero_ps();
+
+- for(;number < eighthPoints; number++){
++ for (; number < eighthPoints; number++) {
+
+- // Load the 8 values
+- //inputVal = _mm_loadu_si128((__m128i*)inputPtr);
+- inputVal = _mm_load_si128((__m128i*)inputPtr);
++ // Load the 8 values
++ // inputVal = _mm_loadu_si128((__m128i*)inputPtr);
++ inputVal = _mm_load_si128((__m128i*)inputPtr);
+
+- // Shift the input data to the right by 64 bits ( 8 bytes )
+- inputVal2 = _mm_srli_si128(inputVal, 8);
++ // Shift the input data to the right by 64 bits ( 8 bytes )
++ inputVal2 = _mm_srli_si128(inputVal, 8);
+
+- // Convert the lower 4 values into 32 bit words
+- inputVal = _mm_cvtepi16_epi32(inputVal);
+- inputVal2 = _mm_cvtepi16_epi32(inputVal2);
++ // Convert the lower 4 values into 32 bit words
++ inputVal = _mm_cvtepi16_epi32(inputVal);
++ inputVal2 = _mm_cvtepi16_epi32(inputVal2);
+
+- ret = _mm_cvtepi32_ps(inputVal);
+- ret = _mm_mul_ps(ret, invScalar);
+- output = _mm256_insertf128_ps(dummy, ret, 0);
++ ret = _mm_cvtepi32_ps(inputVal);
++ ret = _mm_mul_ps(ret, invScalar);
++ output = _mm256_insertf128_ps(dummy, ret, 0);
+
+- ret = _mm_cvtepi32_ps(inputVal2);
+- ret = _mm_mul_ps(ret, invScalar);
+- output = _mm256_insertf128_ps(output, ret, 1);
++ ret = _mm_cvtepi32_ps(inputVal2);
++ ret = _mm_mul_ps(ret, invScalar);
++ output = _mm256_insertf128_ps(output, ret, 1);
+
+- _mm256_store_ps(outputVectorPtr, output);
++ _mm256_store_ps(outputVectorPtr, output);
+
+- outputVectorPtr += 8;
++ outputVectorPtr += 8;
+
+- inputPtr += 8;
+- }
++ inputPtr += 8;
++ }
+
+- number = eighthPoints * 8;
+- for(; number < num_points; number++){
+- outputVector[number] =((float)(inputVector[number])) / scalar;
+- }
++ number = eighthPoints * 8;
++ for (; number < num_points; number++) {
++ outputVector[number] = ((float)(inputVector[number])) / scalar;
++ }
+ }
+ #endif /* LV_HAVE_AVX */
+
+ #ifdef LV_HAVE_SSE4_1
+ #include <smmintrin.h>
+
+-static inline void
+-volk_16i_s32f_convert_32f_a_sse4_1(float* outputVector, const int16_t* inputVector,
+- const float scalar, unsigned int num_points)
++static inline void volk_16i_s32f_convert_32f_a_sse4_1(float* outputVector,
++ const int16_t* inputVector,
++ const float scalar,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+- const unsigned int eighthPoints = num_points / 8;
++ unsigned int number = 0;
++ const unsigned int eighthPoints = num_points / 8;
+
+- float* outputVectorPtr = outputVector;
+- __m128 invScalar = _mm_set_ps1(1.0/scalar);
+- int16_t* inputPtr = (int16_t*)inputVector;
+- __m128i inputVal;
+- __m128i inputVal2;
+- __m128 ret;
++ float* outputVectorPtr = outputVector;
++ __m128 invScalar = _mm_set_ps1(1.0 / scalar);
++ int16_t* inputPtr = (int16_t*)inputVector;
++ __m128i inputVal;
++ __m128i inputVal2;
++ __m128 ret;
+
+- for(;number < eighthPoints; number++){
++ for (; number < eighthPoints; number++) {
+
+- // Load the 8 values
+- inputVal = _mm_loadu_si128((__m128i*)inputPtr);
++ // Load the 8 values
++ inputVal = _mm_loadu_si128((__m128i*)inputPtr);
+
+- // Shift the input data to the right by 64 bits ( 8 bytes )
+- inputVal2 = _mm_srli_si128(inputVal, 8);
++ // Shift the input data to the right by 64 bits ( 8 bytes )
++ inputVal2 = _mm_srli_si128(inputVal, 8);
+
+- // Convert the lower 4 values into 32 bit words
+- inputVal = _mm_cvtepi16_epi32(inputVal);
+- inputVal2 = _mm_cvtepi16_epi32(inputVal2);
++ // Convert the lower 4 values into 32 bit words
++ inputVal = _mm_cvtepi16_epi32(inputVal);
++ inputVal2 = _mm_cvtepi16_epi32(inputVal2);
+
+- ret = _mm_cvtepi32_ps(inputVal);
+- ret = _mm_mul_ps(ret, invScalar);
+- _mm_storeu_ps(outputVectorPtr, ret);
+- outputVectorPtr += 4;
++ ret = _mm_cvtepi32_ps(inputVal);
++ ret = _mm_mul_ps(ret, invScalar);
++ _mm_storeu_ps(outputVectorPtr, ret);
++ outputVectorPtr += 4;
+
+- ret = _mm_cvtepi32_ps(inputVal2);
+- ret = _mm_mul_ps(ret, invScalar);
+- _mm_storeu_ps(outputVectorPtr, ret);
++ ret = _mm_cvtepi32_ps(inputVal2);
++ ret = _mm_mul_ps(ret, invScalar);
++ _mm_storeu_ps(outputVectorPtr, ret);
+
+- outputVectorPtr += 4;
++ outputVectorPtr += 4;
+
+- inputPtr += 8;
+- }
++ inputPtr += 8;
++ }
+
+- number = eighthPoints * 8;
+- for(; number < num_points; number++){
+- outputVector[number] =((float)(inputVector[number])) / scalar;
+- }
++ number = eighthPoints * 8;
++ for (; number < num_points; number++) {
++ outputVector[number] = ((float)(inputVector[number])) / scalar;
++ }
+ }
+ #endif /* LV_HAVE_SSE4_1 */
+
+ #ifdef LV_HAVE_SSE
+ #include <xmmintrin.h>
+
+-static inline void
+-volk_16i_s32f_convert_32f_a_sse(float* outputVector, const int16_t* inputVector,
+- const float scalar, unsigned int num_points)
++static inline void volk_16i_s32f_convert_32f_a_sse(float* outputVector,
++ const int16_t* inputVector,
++ const float scalar,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+- const unsigned int quarterPoints = num_points / 4;
+-
+- float* outputVectorPtr = outputVector;
+- __m128 invScalar = _mm_set_ps1(1.0/scalar);
+- int16_t* inputPtr = (int16_t*)inputVector;
+- __m128 ret;
+-
+- for(;number < quarterPoints; number++){
+- ret = _mm_set_ps((float)(inputPtr[3]), (float)(inputPtr[2]), (float)(inputPtr[1]), (float)(inputPtr[0]));
+-
+- ret = _mm_mul_ps(ret, invScalar);
+- _mm_storeu_ps(outputVectorPtr, ret);
+-
+- inputPtr += 4;
+- outputVectorPtr += 4;
+- }
+-
+- number = quarterPoints * 4;
+- for(; number < num_points; number++){
+- outputVector[number] = (float)(inputVector[number]) / scalar;
+- }
++ unsigned int number = 0;
++ const unsigned int quarterPoints = num_points / 4;
++
++ float* outputVectorPtr = outputVector;
++ __m128 invScalar = _mm_set_ps1(1.0 / scalar);
++ int16_t* inputPtr = (int16_t*)inputVector;
++ __m128 ret;
++
++ for (; number < quarterPoints; number++) {
++ ret = _mm_set_ps((float)(inputPtr[3]),
++ (float)(inputPtr[2]),
++ (float)(inputPtr[1]),
++ (float)(inputPtr[0]));
++
++ ret = _mm_mul_ps(ret, invScalar);
++ _mm_storeu_ps(outputVectorPtr, ret);
++
++ inputPtr += 4;
++ outputVectorPtr += 4;
++ }
++
++ number = quarterPoints * 4;
++ for (; number < num_points; number++) {
++ outputVector[number] = (float)(inputVector[number]) / scalar;
++ }
+ }
+ #endif /* LV_HAVE_SSE */
+
+ #ifdef LV_HAVE_GENERIC
+
+-static inline void
+-volk_16i_s32f_convert_32f_a_generic(float* outputVector, const int16_t* inputVector,
+- const float scalar, unsigned int num_points)
++static inline void volk_16i_s32f_convert_32f_a_generic(float* outputVector,
++ const int16_t* inputVector,
++ const float scalar,
++ unsigned int num_points)
+ {
+- float* outputVectorPtr = outputVector;
+- const int16_t* inputVectorPtr = inputVector;
+- unsigned int number = 0;
++ float* outputVectorPtr = outputVector;
++ const int16_t* inputVectorPtr = inputVector;
++ unsigned int number = 0;
+
+- for(number = 0; number < num_points; number++){
+- *outputVectorPtr++ = ((float)(*inputVectorPtr++)) / scalar;
+- }
++ for (number = 0; number < num_points; number++) {
++ *outputVectorPtr++ = ((float)(*inputVectorPtr++)) / scalar;
++ }
+ }
+ #endif /* LV_HAVE_GENERIC */
+
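A short example of the dispatcher whose prototype appears at the top of this header: scale signed 16-bit samples into floats by dividing by scalar. The <volk/volk.h> entry point and the volk_malloc()/volk_free() helpers are assumed here, not shown in this hunk.

/* Illustrative only: assumes volk_16i_s32f_convert_32f is reachable through
 * <volk/volk.h>; every output element is inputVector[i] / scalar. */
#include <stdio.h>
#include <volk/volk.h>

int main(void)
{
    const unsigned int num_points = 10;
    const float scalar = 32768.0f; /* map the int16 range roughly onto [-1, 1) */
    const size_t al = volk_get_alignment();
    int16_t* in = (int16_t*)volk_malloc(num_points * sizeof(int16_t), al);
    float* out = (float*)volk_malloc(num_points * sizeof(float), al);
    unsigned int i;

    for (i = 0; i < num_points; i++) {
        in[i] = (int16_t)(i * 1000);
    }

    volk_16i_s32f_convert_32f(out, in, scalar, num_points);

    for (i = 0; i < num_points; i++) {
        printf("%6d -> %f\n", in[i], out[i]);
    }

    volk_free(in);
    volk_free(out);
    return 0;
}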
+diff --git a/kernels/volk/volk_16i_x4_quad_max_star_16i.h b/kernels/volk/volk_16i_x4_quad_max_star_16i.h
+index 6aa74c7..619cc90 100644
+--- a/kernels/volk/volk_16i_x4_quad_max_star_16i.h
++++ b/kernels/volk/volk_16i_x4_quad_max_star_16i.h
+@@ -29,8 +29,8 @@
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+- * void volk_16i_x4_quad_max_star_16i(short* target, short* src0, short* src1, short* src2, short* src3, unsigned int num_points)
+- * \endcode
++ * void volk_16i_x4_quad_max_star_16i(short* target, short* src0, short* src1, short*
++ * src2, short* src3, unsigned int num_points) \endcode
+ *
+ * \b Inputs
+ * \li src0: The input vector 0.
+@@ -55,149 +55,152 @@
+ #ifndef INCLUDED_volk_16i_x4_quad_max_star_16i_a_H
+ #define INCLUDED_volk_16i_x4_quad_max_star_16i_a_H
+
+-#include<inttypes.h>
+-#include<stdio.h>
++#include <inttypes.h>
++#include <stdio.h>
+
+ #ifdef LV_HAVE_SSE2
+
+-#include<emmintrin.h>
++#include <emmintrin.h>
+
+-static inline void
+-volk_16i_x4_quad_max_star_16i_a_sse2(short* target, short* src0, short* src1,
+- short* src2, short* src3, unsigned int num_points)
++static inline void volk_16i_x4_quad_max_star_16i_a_sse2(short* target,
++ short* src0,
++ short* src1,
++ short* src2,
++ short* src3,
++ unsigned int num_points)
+ {
+- const unsigned int num_bytes = num_points*2;
+-
+- int i = 0;
++ const unsigned int num_bytes = num_points * 2;
+
+- int bound = (num_bytes >> 4);
+- int bound_copy = bound;
+- int leftovers = (num_bytes >> 1) & 7;
++ int i = 0;
+
+- __m128i *p_target, *p_src0, *p_src1, *p_src2, *p_src3;
+- p_target = (__m128i*) target;
+- p_src0 = (__m128i*)src0;
+- p_src1 = (__m128i*)src1;
+- p_src2 = (__m128i*)src2;
+- p_src3 = (__m128i*)src3;
++ int bound = (num_bytes >> 4);
++ int bound_copy = bound;
++ int leftovers = (num_bytes >> 1) & 7;
+
+- __m128i xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
++ __m128i *p_target, *p_src0, *p_src1, *p_src2, *p_src3;
++ p_target = (__m128i*)target;
++ p_src0 = (__m128i*)src0;
++ p_src1 = (__m128i*)src1;
++ p_src2 = (__m128i*)src2;
++ p_src3 = (__m128i*)src3;
+
+- while(bound_copy > 0) {
+- xmm1 = _mm_load_si128(p_src0);
+- xmm2 = _mm_load_si128(p_src1);
+- xmm3 = _mm_load_si128(p_src2);
+- xmm4 = _mm_load_si128(p_src3);
++ __m128i xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
+
+- xmm5 = _mm_setzero_si128();
+- xmm6 = _mm_setzero_si128();
+- xmm7 = xmm1;
+- xmm8 = xmm3;
++ while (bound_copy > 0) {
++ xmm1 = _mm_load_si128(p_src0);
++ xmm2 = _mm_load_si128(p_src1);
++ xmm3 = _mm_load_si128(p_src2);
++ xmm4 = _mm_load_si128(p_src3);
+
+- xmm1 = _mm_sub_epi16(xmm2, xmm1);
++ xmm5 = _mm_setzero_si128();
++ xmm6 = _mm_setzero_si128();
++ xmm7 = xmm1;
++ xmm8 = xmm3;
+
+- xmm3 = _mm_sub_epi16(xmm4, xmm3);
++ xmm1 = _mm_sub_epi16(xmm2, xmm1);
+
+- xmm5 = _mm_cmpgt_epi16(xmm1, xmm5);
+- xmm6 = _mm_cmpgt_epi16(xmm3, xmm6);
++ xmm3 = _mm_sub_epi16(xmm4, xmm3);
+
+- xmm2 = _mm_and_si128(xmm5, xmm2);
+- xmm4 = _mm_and_si128(xmm6, xmm4);
+- xmm5 = _mm_andnot_si128(xmm5, xmm7);
+- xmm6 = _mm_andnot_si128(xmm6, xmm8);
++ xmm5 = _mm_cmpgt_epi16(xmm1, xmm5);
++ xmm6 = _mm_cmpgt_epi16(xmm3, xmm6);
+
+- xmm5 = _mm_add_epi16(xmm2, xmm5);
+- xmm6 = _mm_add_epi16(xmm4, xmm6);
++ xmm2 = _mm_and_si128(xmm5, xmm2);
++ xmm4 = _mm_and_si128(xmm6, xmm4);
++ xmm5 = _mm_andnot_si128(xmm5, xmm7);
++ xmm6 = _mm_andnot_si128(xmm6, xmm8);
+
+- xmm1 = _mm_xor_si128(xmm1, xmm1);
+- xmm2 = xmm5;
+- xmm5 = _mm_sub_epi16(xmm6, xmm5);
+- p_src0 += 1;
+- bound_copy -= 1;
++ xmm5 = _mm_add_epi16(xmm2, xmm5);
++ xmm6 = _mm_add_epi16(xmm4, xmm6);
+
+- xmm1 = _mm_cmpgt_epi16(xmm5, xmm1);
+- p_src1 += 1;
++ xmm1 = _mm_xor_si128(xmm1, xmm1);
++ xmm2 = xmm5;
++ xmm5 = _mm_sub_epi16(xmm6, xmm5);
++ p_src0 += 1;
++ bound_copy -= 1;
+
+- xmm6 = _mm_and_si128(xmm1, xmm6);
++ xmm1 = _mm_cmpgt_epi16(xmm5, xmm1);
++ p_src1 += 1;
+
+- xmm1 = _mm_andnot_si128(xmm1, xmm2);
+- p_src2 += 1;
++ xmm6 = _mm_and_si128(xmm1, xmm6);
+
+- xmm1 = _mm_add_epi16(xmm6, xmm1);
+- p_src3 += 1;
++ xmm1 = _mm_andnot_si128(xmm1, xmm2);
++ p_src2 += 1;
+
+- _mm_store_si128(p_target, xmm1);
+- p_target += 1;
++ xmm1 = _mm_add_epi16(xmm6, xmm1);
++ p_src3 += 1;
+
+- }
++ _mm_store_si128(p_target, xmm1);
++ p_target += 1;
++ }
+
+
+- /*__VOLK_ASM __VOLK_VOLATILE
+- (
+- "volk_16i_x4_quad_max_star_16i_a_sse2_L1:\n\t"
+- "cmp $0, %[bound]\n\t"
+- "je volk_16i_x4_quad_max_star_16i_a_sse2_END\n\t"
++ /*__VOLK_ASM __VOLK_VOLATILE
++ (
++ "volk_16i_x4_quad_max_star_16i_a_sse2_L1:\n\t"
++ "cmp $0, %[bound]\n\t"
++ "je volk_16i_x4_quad_max_star_16i_a_sse2_END\n\t"
+
+- "movaps (%[src0]), %%xmm1\n\t"
+- "movaps (%[src1]), %%xmm2\n\t"
+- "movaps (%[src2]), %%xmm3\n\t"
+- "movaps (%[src3]), %%xmm4\n\t"
++ "movaps (%[src0]), %%xmm1\n\t"
++ "movaps (%[src1]), %%xmm2\n\t"
++ "movaps (%[src2]), %%xmm3\n\t"
++ "movaps (%[src3]), %%xmm4\n\t"
+
+- "pxor %%xmm5, %%xmm5\n\t"
+- "pxor %%xmm6, %%xmm6\n\t"
+- "movaps %%xmm1, %%xmm7\n\t"
+- "movaps %%xmm3, %%xmm8\n\t"
+- "psubw %%xmm2, %%xmm1\n\t"
+- "psubw %%xmm4, %%xmm3\n\t"
++ "pxor %%xmm5, %%xmm5\n\t"
++ "pxor %%xmm6, %%xmm6\n\t"
++ "movaps %%xmm1, %%xmm7\n\t"
++ "movaps %%xmm3, %%xmm8\n\t"
++ "psubw %%xmm2, %%xmm1\n\t"
++ "psubw %%xmm4, %%xmm3\n\t"
+
+- "pcmpgtw %%xmm1, %%xmm5\n\t"
+- "pcmpgtw %%xmm3, %%xmm6\n\t"
++ "pcmpgtw %%xmm1, %%xmm5\n\t"
++ "pcmpgtw %%xmm3, %%xmm6\n\t"
+
+- "pand %%xmm5, %%xmm2\n\t"
+- "pand %%xmm6, %%xmm4\n\t"
+- "pandn %%xmm7, %%xmm5\n\t"
+- "pandn %%xmm8, %%xmm6\n\t"
++ "pand %%xmm5, %%xmm2\n\t"
++ "pand %%xmm6, %%xmm4\n\t"
++ "pandn %%xmm7, %%xmm5\n\t"
++ "pandn %%xmm8, %%xmm6\n\t"
+
+- "paddw %%xmm2, %%xmm5\n\t"
+- "paddw %%xmm4, %%xmm6\n\t"
++ "paddw %%xmm2, %%xmm5\n\t"
++ "paddw %%xmm4, %%xmm6\n\t"
+
+- "pxor %%xmm1, %%xmm1\n\t"
+- "movaps %%xmm5, %%xmm2\n\t"
++ "pxor %%xmm1, %%xmm1\n\t"
++ "movaps %%xmm5, %%xmm2\n\t"
+
+- "psubw %%xmm6, %%xmm5\n\t"
+- "add $16, %[src0]\n\t"
+- "add $-1, %[bound]\n\t"
++ "psubw %%xmm6, %%xmm5\n\t"
++ "add $16, %[src0]\n\t"
++ "add $-1, %[bound]\n\t"
+
+- "pcmpgtw %%xmm5, %%xmm1\n\t"
+- "add $16, %[src1]\n\t"
++ "pcmpgtw %%xmm5, %%xmm1\n\t"
++ "add $16, %[src1]\n\t"
+
+- "pand %%xmm1, %%xmm6\n\t"
++ "pand %%xmm1, %%xmm6\n\t"
+
+- "pandn %%xmm2, %%xmm1\n\t"
+- "add $16, %[src2]\n\t"
++ "pandn %%xmm2, %%xmm1\n\t"
++ "add $16, %[src2]\n\t"
+
+- "paddw %%xmm6, %%xmm1\n\t"
+- "add $16, %[src3]\n\t"
++ "paddw %%xmm6, %%xmm1\n\t"
++ "add $16, %[src3]\n\t"
+
+- "movaps %%xmm1, (%[target])\n\t"
+- "addw $16, %[target]\n\t"
+- "jmp volk_16i_x4_quad_max_star_16i_a_sse2_L1\n\t"
++ "movaps %%xmm1, (%[target])\n\t"
++ "addw $16, %[target]\n\t"
++ "jmp volk_16i_x4_quad_max_star_16i_a_sse2_L1\n\t"
+
+- "volk_16i_x4_quad_max_star_16i_a_sse2_END:\n\t"
+- :
+- :[bound]"r"(bound), [src0]"r"(src0), [src1]"r"(src1), [src2]"r"(src2), [src3]"r"(src3), [target]"r"(target)
+- :
+- );
+- */
++ "volk_16i_x4_quad_max_star_16i_a_sse2_END:\n\t"
++ :
++ :[bound]"r"(bound), [src0]"r"(src0), [src1]"r"(src1), [src2]"r"(src2),
++ [src3]"r"(src3), [target]"r"(target)
++ :
++ );
++ */
+
+- short temp0 = 0;
+- short temp1 = 0;
+- for(i = bound * 8; i < (bound * 8) + leftovers; ++i) {
+- temp0 = ((short)(src0[i] - src1[i]) > 0) ? src0[i] : src1[i];
+- temp1 = ((short)(src2[i] - src3[i])>0) ? src2[i] : src3[i];
+- target[i] = ((short)(temp0 - temp1)>0) ? temp0 : temp1;
+- }
+- return;
++ short temp0 = 0;
++ short temp1 = 0;
++ for (i = bound * 8; i < (bound * 8) + leftovers; ++i) {
++ temp0 = ((short)(src0[i] - src1[i]) > 0) ? src0[i] : src1[i];
++ temp1 = ((short)(src2[i] - src3[i]) > 0) ? src2[i] : src3[i];
++ target[i] = ((short)(temp0 - temp1) > 0) ? temp0 : temp1;
++ }
++ return;
+ }
+
+ #endif /*LV_HAVE_SSE2*/
+@@ -206,85 +209,91 @@ volk_16i_x4_quad_max_star_16i_a_sse2(short* target, short* src0, short* src1,
+
+ #include <arm_neon.h>
+
+-static inline void
+-volk_16i_x4_quad_max_star_16i_neon(short* target, short* src0, short* src1,
+- short* src2, short* src3, unsigned int num_points)
++static inline void volk_16i_x4_quad_max_star_16i_neon(short* target,
++ short* src0,
++ short* src1,
++ short* src2,
++ short* src3,
++ unsigned int num_points)
+ {
+- const unsigned int eighth_points = num_points / 8;
+- unsigned i;
+-
+- int16x8_t src0_vec, src1_vec, src2_vec, src3_vec;
+- int16x8_t diff12, diff34;
+- int16x8_t comp0, comp1, comp2, comp3;
+- int16x8_t result1_vec, result2_vec;
+- int16x8_t zeros;
+- zeros = vdupq_n_s16(0);
+- for(i=0; i < eighth_points; ++i) {
+- src0_vec = vld1q_s16(src0);
+- src1_vec = vld1q_s16(src1);
+- src2_vec = vld1q_s16(src2);
+- src3_vec = vld1q_s16(src3);
+- diff12 = vsubq_s16(src0_vec, src1_vec);
+- diff34 = vsubq_s16(src2_vec, src3_vec);
+- comp0 = (int16x8_t)vcgeq_s16(diff12, zeros);
+- comp1 = (int16x8_t)vcltq_s16(diff12, zeros);
+- comp2 = (int16x8_t)vcgeq_s16(diff34, zeros);
+- comp3 = (int16x8_t)vcltq_s16(diff34, zeros);
+- comp0 = vandq_s16(src0_vec, comp0);
+- comp1 = vandq_s16(src1_vec, comp1);
+- comp2 = vandq_s16(src2_vec, comp2);
+- comp3 = vandq_s16(src3_vec, comp3);
+-
+- result1_vec = vaddq_s16(comp0, comp1);
+- result2_vec = vaddq_s16(comp2, comp3);
+-
+- diff12 = vsubq_s16(result1_vec, result2_vec);
+- comp0 = (int16x8_t)vcgeq_s16(diff12, zeros);
+- comp1 = (int16x8_t)vcltq_s16(diff12, zeros);
+- comp0 = vandq_s16(result1_vec, comp0);
+- comp1 = vandq_s16(result2_vec, comp1);
+- result1_vec = vaddq_s16(comp0, comp1);
+- vst1q_s16(target, result1_vec);
+- src0 += 8;
+- src1 += 8;
+- src2 += 8;
+- src3 += 8;
+- target += 8;
++ const unsigned int eighth_points = num_points / 8;
++ unsigned i;
++
++ int16x8_t src0_vec, src1_vec, src2_vec, src3_vec;
++ int16x8_t diff12, diff34;
++ int16x8_t comp0, comp1, comp2, comp3;
++ int16x8_t result1_vec, result2_vec;
++ int16x8_t zeros;
++ zeros = vdupq_n_s16(0);
++ for (i = 0; i < eighth_points; ++i) {
++ src0_vec = vld1q_s16(src0);
++ src1_vec = vld1q_s16(src1);
++ src2_vec = vld1q_s16(src2);
++ src3_vec = vld1q_s16(src3);
++ diff12 = vsubq_s16(src0_vec, src1_vec);
++ diff34 = vsubq_s16(src2_vec, src3_vec);
++ comp0 = (int16x8_t)vcgeq_s16(diff12, zeros);
++ comp1 = (int16x8_t)vcltq_s16(diff12, zeros);
++ comp2 = (int16x8_t)vcgeq_s16(diff34, zeros);
++ comp3 = (int16x8_t)vcltq_s16(diff34, zeros);
++ comp0 = vandq_s16(src0_vec, comp0);
++ comp1 = vandq_s16(src1_vec, comp1);
++ comp2 = vandq_s16(src2_vec, comp2);
++ comp3 = vandq_s16(src3_vec, comp3);
++
++ result1_vec = vaddq_s16(comp0, comp1);
++ result2_vec = vaddq_s16(comp2, comp3);
++
++ diff12 = vsubq_s16(result1_vec, result2_vec);
++ comp0 = (int16x8_t)vcgeq_s16(diff12, zeros);
++ comp1 = (int16x8_t)vcltq_s16(diff12, zeros);
++ comp0 = vandq_s16(result1_vec, comp0);
++ comp1 = vandq_s16(result2_vec, comp1);
++ result1_vec = vaddq_s16(comp0, comp1);
++ vst1q_s16(target, result1_vec);
++ src0 += 8;
++ src1 += 8;
++ src2 += 8;
++ src3 += 8;
++ target += 8;
+ }
+
+- short temp0 = 0;
+- short temp1 = 0;
+- for(i=eighth_points*8; i < num_points; ++i) {
+- temp0 = ((short)(*src0 - *src1) > 0) ? *src0 : *src1;
+- temp1 = ((short)(*src2 - *src3) > 0) ? *src2 : *src3;
+- *target++ = ((short)(temp0 - temp1)>0) ? temp0 : temp1;
+- src0++;
+- src1++;
+- src2++;
+- src3++;
+- }
++ short temp0 = 0;
++ short temp1 = 0;
++ for (i = eighth_points * 8; i < num_points; ++i) {
++ temp0 = ((short)(*src0 - *src1) > 0) ? *src0 : *src1;
++ temp1 = ((short)(*src2 - *src3) > 0) ? *src2 : *src3;
++ *target++ = ((short)(temp0 - temp1) > 0) ? temp0 : temp1;
++ src0++;
++ src1++;
++ src2++;
++ src3++;
++ }
+ }
+ #endif /* LV_HAVE_NEON */
+
+
+ #ifdef LV_HAVE_GENERIC
+-static inline void
+-volk_16i_x4_quad_max_star_16i_generic(short* target, short* src0, short* src1,
+- short* src2, short* src3, unsigned int num_points)
++static inline void volk_16i_x4_quad_max_star_16i_generic(short* target,
++ short* src0,
++ short* src1,
++ short* src2,
++ short* src3,
++ unsigned int num_points)
+ {
+- const unsigned int num_bytes = num_points*2;
++ const unsigned int num_bytes = num_points * 2;
+
+- int i = 0;
++ int i = 0;
+
+- int bound = num_bytes >> 1;
++ int bound = num_bytes >> 1;
+
+- short temp0 = 0;
+- short temp1 = 0;
+- for(i = 0; i < bound; ++i) {
+- temp0 = ((short)(src0[i] - src1[i]) > 0) ? src0[i] : src1[i];
+- temp1 = ((short)(src2[i] - src3[i])>0) ? src2[i] : src3[i];
+- target[i] = ((short)(temp0 - temp1)>0) ? temp0 : temp1;
+- }
++ short temp0 = 0;
++ short temp1 = 0;
++ for (i = 0; i < bound; ++i) {
++ temp0 = ((short)(src0[i] - src1[i]) > 0) ? src0[i] : src1[i];
++ temp1 = ((short)(src2[i] - src3[i]) > 0) ? src2[i] : src3[i];
++ target[i] = ((short)(temp0 - temp1) > 0) ? temp0 : temp1;
++ }
+ }
+
+ #endif /*LV_HAVE_GENERIC*/
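For reference, a hypothetical call of this kernel through its dispatcher. The name volk_16i_x4_quad_max_star_16i follows the prototype quoted in this header's documentation, while <volk/volk.h> and volk_malloc()/volk_free() are assumed. Each output element is the max*-selected survivor of the four inputs at that index.

/* Illustrative sketch: t[i] ends up as the pairwise max* of (a[i], b[i]) and
 * (c[i], d[i]), i.e. effectively the largest of the four for in-range values. */
#include <stdio.h>
#include <volk/volk.h>

int main(void)
{
    const unsigned int num_points = 24;
    const size_t al = volk_get_alignment();
    short* t = (short*)volk_malloc(num_points * sizeof(short), al);
    short* a = (short*)volk_malloc(num_points * sizeof(short), al);
    short* b = (short*)volk_malloc(num_points * sizeof(short), al);
    short* c = (short*)volk_malloc(num_points * sizeof(short), al);
    short* d = (short*)volk_malloc(num_points * sizeof(short), al);
    unsigned int i;

    for (i = 0; i < num_points; i++) {
        a[i] = (short)i;
        b[i] = (short)(num_points - i);
        c[i] = (short)-5;
        d[i] = (short)(2 * i);
    }

    volk_16i_x4_quad_max_star_16i(t, a, b, c, d, num_points);

    for (i = 0; i < num_points; i++) {
        printf("t[%u] = %d\n", i, t[i]);
    }

    volk_free(t); volk_free(a); volk_free(b); volk_free(c); volk_free(d);
    return 0;
}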
+diff --git a/kernels/volk/volk_16i_x5_add_quad_16i_x4.h b/kernels/volk/volk_16i_x5_add_quad_16i_x4.h
+index 30417de..f735f11 100644
+--- a/kernels/volk/volk_16i_x5_add_quad_16i_x4.h
++++ b/kernels/volk/volk_16i_x5_add_quad_16i_x4.h
+@@ -29,8 +29,9 @@
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+- * void volk_16i_x5_add_quad_16i_x4(short* target0, short* target1, short* target2, short* target3, short* src0, short* src1, short* src2, short* src3, short* src4, unsigned int num_points);
+- * \endcode
++ * void volk_16i_x5_add_quad_16i_x4(short* target0, short* target1, short* target2, short*
++ * target3, short* src0, short* src1, short* src2, short* src3, short* src4, unsigned int
++ * num_points); \endcode
+ *
+ * \b Inputs
+ * \li src0: The input vector 0.
+@@ -59,182 +60,203 @@
+ #ifndef INCLUDED_volk_16i_x5_add_quad_16i_x4_a_H
+ #define INCLUDED_volk_16i_x5_add_quad_16i_x4_a_H
+
+-#include<inttypes.h>
+-#include<stdio.h>
++#include <inttypes.h>
++#include <stdio.h>
+
+ #ifdef LV_HAVE_SSE2
+-#include<xmmintrin.h>
+-#include<emmintrin.h>
++#include <emmintrin.h>
++#include <xmmintrin.h>
+
+-static inline void
+-volk_16i_x5_add_quad_16i_x4_a_sse2(short* target0, short* target1, short* target2, short* target3,
+- short* src0, short* src1, short* src2, short* src3, short* src4,
+- unsigned int num_points)
++static inline void volk_16i_x5_add_quad_16i_x4_a_sse2(short* target0,
++ short* target1,
++ short* target2,
++ short* target3,
++ short* src0,
++ short* src1,
++ short* src2,
++ short* src3,
++ short* src4,
++ unsigned int num_points)
+ {
+- const unsigned int num_bytes = num_points*2;
+-
+- __m128i xmm0, xmm1, xmm2, xmm3, xmm4;
+- __m128i *p_target0, *p_target1, *p_target2, *p_target3, *p_src0, *p_src1, *p_src2, *p_src3, *p_src4;
+- p_target0 = (__m128i*)target0;
+- p_target1 = (__m128i*)target1;
+- p_target2 = (__m128i*)target2;
+- p_target3 = (__m128i*)target3;
+-
+- p_src0 = (__m128i*)src0;
+- p_src1 = (__m128i*)src1;
+- p_src2 = (__m128i*)src2;
+- p_src3 = (__m128i*)src3;
+- p_src4 = (__m128i*)src4;
+-
+- int i = 0;
+-
+- int bound = (num_bytes >> 4);
+- int leftovers = (num_bytes >> 1) & 7;
+-
+- for(; i < bound; ++i) {
+- xmm0 = _mm_load_si128(p_src0);
+- xmm1 = _mm_load_si128(p_src1);
+- xmm2 = _mm_load_si128(p_src2);
+- xmm3 = _mm_load_si128(p_src3);
+- xmm4 = _mm_load_si128(p_src4);
+-
+- p_src0 += 1;
+- p_src1 += 1;
+-
+- xmm1 = _mm_add_epi16(xmm0, xmm1);
+- xmm2 = _mm_add_epi16(xmm0, xmm2);
+- xmm3 = _mm_add_epi16(xmm0, xmm3);
+- xmm4 = _mm_add_epi16(xmm0, xmm4);
+-
+-
+- p_src2 += 1;
+- p_src3 += 1;
+- p_src4 += 1;
+-
+- _mm_store_si128(p_target0, xmm1);
+- _mm_store_si128(p_target1, xmm2);
+- _mm_store_si128(p_target2, xmm3);
+- _mm_store_si128(p_target3, xmm4);
+-
+- p_target0 += 1;
+- p_target1 += 1;
+- p_target2 += 1;
+- p_target3 += 1;
+- }
+- /*__VOLK_ASM __VOLK_VOLATILE
+- (
+- ".%=volk_16i_x5_add_quad_16i_x4_a_sse2_L1:\n\t"
+- "cmp $0, %[bound]\n\t"
+- "je .%=volk_16i_x5_add_quad_16i_x4_a_sse2_END\n\t"
+- "movaps (%[src0]), %%xmm1\n\t"
+- "movaps (%[src1]), %%xmm2\n\t"
+- "movaps (%[src2]), %%xmm3\n\t"
+- "movaps (%[src3]), %%xmm4\n\t"
+- "movaps (%[src4]), %%xmm5\n\t"
+- "add $16, %[src0]\n\t"
+- "add $16, %[src1]\n\t"
+- "add $16, %[src2]\n\t"
+- "add $16, %[src3]\n\t"
+- "add $16, %[src4]\n\t"
+- "paddw %%xmm1, %%xmm2\n\t"
+- "paddw %%xmm1, %%xmm3\n\t"
+- "paddw %%xmm1, %%xmm4\n\t"
+- "paddw %%xmm1, %%xmm5\n\t"
+- "add $-1, %[bound]\n\t"
+- "movaps %%xmm2, (%[target0])\n\t"
+- "movaps %%xmm3, (%[target1])\n\t"
+- "movaps %%xmm4, (%[target2])\n\t"
+- "movaps %%xmm5, (%[target3])\n\t"
+- "add $16, %[target0]\n\t"
+- "add $16, %[target1]\n\t"
+- "add $16, %[target2]\n\t"
+- "add $16, %[target3]\n\t"
+- "jmp .%=volk_16i_x5_add_quad_16i_x4_a_sse2_L1\n\t"
+- ".%=volk_16i_x5_add_quad_16i_x4_a_sse2_END:\n\t"
+- :
+- :[bound]"r"(bound), [src0]"r"(src0), [src1]"r"(src1), [src2]"r"(src2), [src3]"r"(src3), [src4]"r"(src4), [target0]"r"(target0), [target1]"r"(target1), [target2]"r"(target2), [target3]"r"(target3)
+- :"xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+- );
+- */
+-
+- for(i = bound * 8; i < (bound * 8) + leftovers; ++i) {
+- target0[i] = src0[i] + src1[i];
+- target1[i] = src0[i] + src2[i];
+- target2[i] = src0[i] + src3[i];
+- target3[i] = src0[i] + src4[i];
+- }
++ const unsigned int num_bytes = num_points * 2;
++
++ __m128i xmm0, xmm1, xmm2, xmm3, xmm4;
++ __m128i *p_target0, *p_target1, *p_target2, *p_target3, *p_src0, *p_src1, *p_src2,
++ *p_src3, *p_src4;
++ p_target0 = (__m128i*)target0;
++ p_target1 = (__m128i*)target1;
++ p_target2 = (__m128i*)target2;
++ p_target3 = (__m128i*)target3;
++
++ p_src0 = (__m128i*)src0;
++ p_src1 = (__m128i*)src1;
++ p_src2 = (__m128i*)src2;
++ p_src3 = (__m128i*)src3;
++ p_src4 = (__m128i*)src4;
++
++ int i = 0;
++
++ int bound = (num_bytes >> 4);
++ int leftovers = (num_bytes >> 1) & 7;
++
++ for (; i < bound; ++i) {
++ xmm0 = _mm_load_si128(p_src0);
++ xmm1 = _mm_load_si128(p_src1);
++ xmm2 = _mm_load_si128(p_src2);
++ xmm3 = _mm_load_si128(p_src3);
++ xmm4 = _mm_load_si128(p_src4);
++
++ p_src0 += 1;
++ p_src1 += 1;
++
++ xmm1 = _mm_add_epi16(xmm0, xmm1);
++ xmm2 = _mm_add_epi16(xmm0, xmm2);
++ xmm3 = _mm_add_epi16(xmm0, xmm3);
++ xmm4 = _mm_add_epi16(xmm0, xmm4);
++
++
++ p_src2 += 1;
++ p_src3 += 1;
++ p_src4 += 1;
++
++ _mm_store_si128(p_target0, xmm1);
++ _mm_store_si128(p_target1, xmm2);
++ _mm_store_si128(p_target2, xmm3);
++ _mm_store_si128(p_target3, xmm4);
++
++ p_target0 += 1;
++ p_target1 += 1;
++ p_target2 += 1;
++ p_target3 += 1;
++ }
++ /*__VOLK_ASM __VOLK_VOLATILE
++ (
++ ".%=volk_16i_x5_add_quad_16i_x4_a_sse2_L1:\n\t"
++ "cmp $0, %[bound]\n\t"
++ "je .%=volk_16i_x5_add_quad_16i_x4_a_sse2_END\n\t"
++ "movaps (%[src0]), %%xmm1\n\t"
++ "movaps (%[src1]), %%xmm2\n\t"
++ "movaps (%[src2]), %%xmm3\n\t"
++ "movaps (%[src3]), %%xmm4\n\t"
++ "movaps (%[src4]), %%xmm5\n\t"
++ "add $16, %[src0]\n\t"
++ "add $16, %[src1]\n\t"
++ "add $16, %[src2]\n\t"
++ "add $16, %[src3]\n\t"
++ "add $16, %[src4]\n\t"
++ "paddw %%xmm1, %%xmm2\n\t"
++ "paddw %%xmm1, %%xmm3\n\t"
++ "paddw %%xmm1, %%xmm4\n\t"
++ "paddw %%xmm1, %%xmm5\n\t"
++ "add $-1, %[bound]\n\t"
++ "movaps %%xmm2, (%[target0])\n\t"
++ "movaps %%xmm3, (%[target1])\n\t"
++ "movaps %%xmm4, (%[target2])\n\t"
++ "movaps %%xmm5, (%[target3])\n\t"
++ "add $16, %[target0]\n\t"
++ "add $16, %[target1]\n\t"
++ "add $16, %[target2]\n\t"
++ "add $16, %[target3]\n\t"
++ "jmp .%=volk_16i_x5_add_quad_16i_x4_a_sse2_L1\n\t"
++ ".%=volk_16i_x5_add_quad_16i_x4_a_sse2_END:\n\t"
++ :
++ :[bound]"r"(bound), [src0]"r"(src0), [src1]"r"(src1), [src2]"r"(src2),
++ [src3]"r"(src3), [src4]"r"(src4), [target0]"r"(target0), [target1]"r"(target1),
++ [target2]"r"(target2), [target3]"r"(target3)
++ :"xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
++ );
++ */
++
++ for (i = bound * 8; i < (bound * 8) + leftovers; ++i) {
++ target0[i] = src0[i] + src1[i];
++ target1[i] = src0[i] + src2[i];
++ target2[i] = src0[i] + src3[i];
++ target3[i] = src0[i] + src4[i];
++ }
+ }
+ #endif /*LV_HAVE_SSE2*/
+
+ #ifdef LV_HAVE_NEON
+ #include <arm_neon.h>
+
+-static inline void
+-volk_16i_x5_add_quad_16i_x4_neon(short* target0, short* target1, short* target2, short* target3,
+- short* src0, short* src1, short* src2, short* src3, short* src4,
+- unsigned int num_points)
++static inline void volk_16i_x5_add_quad_16i_x4_neon(short* target0,
++ short* target1,
++ short* target2,
++ short* target3,
++ short* src0,
++ short* src1,
++ short* src2,
++ short* src3,
++ short* src4,
++ unsigned int num_points)
+ {
+- const unsigned int eighth_points = num_points / 8;
+- unsigned int number = 0;
+-
+- int16x8_t src0_vec, src1_vec, src2_vec, src3_vec, src4_vec;
+- int16x8_t target0_vec, target1_vec, target2_vec, target3_vec;
+- for(number = 0; number < eighth_points; ++number) {
+- src0_vec = vld1q_s16(src0);
+- src1_vec = vld1q_s16(src1);
+- src2_vec = vld1q_s16(src2);
+- src3_vec = vld1q_s16(src3);
+- src4_vec = vld1q_s16(src4);
+-
+- target0_vec = vaddq_s16(src0_vec , src1_vec);
+- target1_vec = vaddq_s16(src0_vec , src2_vec);
+- target2_vec = vaddq_s16(src0_vec , src3_vec);
+- target3_vec = vaddq_s16(src0_vec , src4_vec);
+-
+- vst1q_s16(target0, target0_vec);
+- vst1q_s16(target1, target1_vec);
+- vst1q_s16(target2, target2_vec);
+- vst1q_s16(target3, target3_vec);
+- src0 += 8;
+- src1 += 8;
+- src2 += 8;
+- src3 += 8;
+- src4 += 8;
+- target0 += 8;
+- target1 += 8;
+- target2 += 8;
+- target3 += 8;
+- }
+-
+- for(number = eighth_points * 8; number < num_points; ++number) {
+- *target0++ = *src0 + *src1++;
+- *target1++ = *src0 + *src2++;
+- *target2++ = *src0 + *src3++;
+- *target3++ = *src0++ + *src4++;
+- }
++ const unsigned int eighth_points = num_points / 8;
++ unsigned int number = 0;
++
++ int16x8_t src0_vec, src1_vec, src2_vec, src3_vec, src4_vec;
++ int16x8_t target0_vec, target1_vec, target2_vec, target3_vec;
++ for (number = 0; number < eighth_points; ++number) {
++ src0_vec = vld1q_s16(src0);
++ src1_vec = vld1q_s16(src1);
++ src2_vec = vld1q_s16(src2);
++ src3_vec = vld1q_s16(src3);
++ src4_vec = vld1q_s16(src4);
++
++ target0_vec = vaddq_s16(src0_vec, src1_vec);
++ target1_vec = vaddq_s16(src0_vec, src2_vec);
++ target2_vec = vaddq_s16(src0_vec, src3_vec);
++ target3_vec = vaddq_s16(src0_vec, src4_vec);
++
++ vst1q_s16(target0, target0_vec);
++ vst1q_s16(target1, target1_vec);
++ vst1q_s16(target2, target2_vec);
++ vst1q_s16(target3, target3_vec);
++ src0 += 8;
++ src1 += 8;
++ src2 += 8;
++ src3 += 8;
++ src4 += 8;
++ target0 += 8;
++ target1 += 8;
++ target2 += 8;
++ target3 += 8;
++ }
++
++ for (number = eighth_points * 8; number < num_points; ++number) {
++ *target0++ = *src0 + *src1++;
++ *target1++ = *src0 + *src2++;
++ *target2++ = *src0 + *src3++;
++ *target3++ = *src0++ + *src4++;
++ }
+ }
+
+ #endif /* LV_HAVE_NEON */
+
+ #ifdef LV_HAVE_GENERIC
+
+-static inline void
+-volk_16i_x5_add_quad_16i_x4_generic(short* target0, short* target1, short* target2, short* target3,
+- short* src0, short* src1, short* src2, short* src3, short* src4,
+- unsigned int num_points)
++static inline void volk_16i_x5_add_quad_16i_x4_generic(short* target0,
++ short* target1,
++ short* target2,
++ short* target3,
++ short* src0,
++ short* src1,
++ short* src2,
++ short* src3,
++ short* src4,
++ unsigned int num_points)
+ {
+- const unsigned int num_bytes = num_points*2;
++ const unsigned int num_bytes = num_points * 2;
+
+- int i = 0;
++ int i = 0;
+
+- int bound = num_bytes >> 1;
++ int bound = num_bytes >> 1;
+
+- for(i = 0; i < bound; ++i) {
+- target0[i] = src0[i] + src1[i];
+- target1[i] = src0[i] + src2[i];
+- target2[i] = src0[i] + src3[i];
+- target3[i] = src0[i] + src4[i];
+- }
++ for (i = 0; i < bound; ++i) {
++ target0[i] = src0[i] + src1[i];
++ target1[i] = src0[i] + src2[i];
++ target2[i] = src0[i] + src3[i];
++ target3[i] = src0[i] + src4[i];
++ }
+ }
+
+ #endif /* LV_HAVE_GENERIC */
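Finally, a hedged sketch of the five-input/four-output form documented at the top of this header; it assumes the volk_16i_x5_add_quad_16i_x4 dispatcher and the volk_malloc()/volk_free() helpers from <volk/volk.h>. The common vector src0 is added to each of the other four inputs, producing one output vector per pairing.

/* Illustrative only: target0..target3 receive src0 + src1 .. src0 + src4. */
#include <stdio.h>
#include <volk/volk.h>

int main(void)
{
    const unsigned int num_points = 16;
    const size_t al = volk_get_alignment();
    short *t0, *t1, *t2, *t3, *s0, *s1, *s2, *s3, *s4;
    unsigned int i;

    t0 = (short*)volk_malloc(num_points * sizeof(short), al);
    t1 = (short*)volk_malloc(num_points * sizeof(short), al);
    t2 = (short*)volk_malloc(num_points * sizeof(short), al);
    t3 = (short*)volk_malloc(num_points * sizeof(short), al);
    s0 = (short*)volk_malloc(num_points * sizeof(short), al);
    s1 = (short*)volk_malloc(num_points * sizeof(short), al);
    s2 = (short*)volk_malloc(num_points * sizeof(short), al);
    s3 = (short*)volk_malloc(num_points * sizeof(short), al);
    s4 = (short*)volk_malloc(num_points * sizeof(short), al);

    for (i = 0; i < num_points; i++) {
        s0[i] = (short)i;
        s1[i] = 1;
        s2[i] = 2;
        s3[i] = 3;
        s4[i] = 4;
    }

    volk_16i_x5_add_quad_16i_x4(t0, t1, t2, t3, s0, s1, s2, s3, s4, num_points);

    printf("t0[5]=%d t1[5]=%d t2[5]=%d t3[5]=%d\n", t0[5], t1[5], t2[5], t3[5]);

    volk_free(t0); volk_free(t1); volk_free(t2); volk_free(t3);
    volk_free(s0); volk_free(s1); volk_free(s2); volk_free(s3); volk_free(s4);
    return 0;
}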
+diff --git a/kernels/volk/volk_16ic_convert_32fc.h b/kernels/volk/volk_16ic_convert_32fc.h
+index 84f067c..1453724 100644
+--- a/kernels/volk/volk_16ic_convert_32fc.h
++++ b/kernels/volk/volk_16ic_convert_32fc.h
+@@ -30,8 +30,8 @@
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+- * void volk_16ic_convert_32fc(lv_32fc_t* outputVector, const lv_16sc_t* inputVector, unsigned int num_points)
+- * \endcode
++ * void volk_16ic_convert_32fc(lv_32fc_t* outputVector, const lv_16sc_t* inputVector,
++ * unsigned int num_points) \endcode
+ *
+ * \b Inputs
+ * \li inputVector: The complex 16-bit integer input data buffer.
+@@ -51,7 +51,9 @@
+ #ifdef LV_HAVE_AVX2
+ #include <immintrin.h>
+
+-static inline void volk_16ic_convert_32fc_a_avx2(lv_32fc_t* outputVector, const lv_16sc_t* inputVector, unsigned int num_points)
++static inline void volk_16ic_convert_32fc_a_avx2(lv_32fc_t* outputVector,
++ const lv_16sc_t* inputVector,
++ unsigned int num_points)
+ {
+ const unsigned int avx_iters = num_points / 8;
+ unsigned int number = 0;
+@@ -61,36 +63,36 @@ static inline void volk_16ic_convert_32fc_a_avx2(lv_32fc_t* outputVector, const
+ __m256i outValInt;
+ __m128i cplxValue;
+
+- for(number = 0; number < avx_iters; number++)
+- {
+- cplxValue = _mm_load_si128((__m128i*)complexVectorPtr);
+- complexVectorPtr += 8;
+-
+- outValInt = _mm256_cvtepi16_epi32(cplxValue);
+- outVal = _mm256_cvtepi32_ps(outValInt);
+- _mm256_store_ps((float*)outputVectorPtr, outVal);
++ for (number = 0; number < avx_iters; number++) {
++ cplxValue = _mm_load_si128((__m128i*)complexVectorPtr);
++ complexVectorPtr += 8;
+
+- outputVectorPtr += 8;
+- }
++ outValInt = _mm256_cvtepi16_epi32(cplxValue);
++ outVal = _mm256_cvtepi32_ps(outValInt);
++ _mm256_store_ps((float*)outputVectorPtr, outVal);
++
++ outputVectorPtr += 8;
++ }
+
+ number = avx_iters * 8;
+- for(; number < num_points*2; number++)
+- {
+- *outputVectorPtr++ = (float)*complexVectorPtr++;
+- }
++ for (; number < num_points * 2; number++) {
++ *outputVectorPtr++ = (float)*complexVectorPtr++;
++ }
+ }
+
+ #endif /* LV_HAVE_AVX2 */
+
+ #ifdef LV_HAVE_GENERIC
+
+-static inline void volk_16ic_convert_32fc_generic(lv_32fc_t* outputVector, const lv_16sc_t* inputVector, unsigned int num_points)
++static inline void volk_16ic_convert_32fc_generic(lv_32fc_t* outputVector,
++ const lv_16sc_t* inputVector,
++ unsigned int num_points)
+ {
+ unsigned int i;
+- for(i = 0; i < num_points; i++)
+- {
+- outputVector[i] = lv_cmake((float)lv_creal(inputVector[i]), (float)lv_cimag(inputVector[i]));
+- }
++ for (i = 0; i < num_points; i++) {
++ outputVector[i] =
++ lv_cmake((float)lv_creal(inputVector[i]), (float)lv_cimag(inputVector[i]));
++ }
+ }
+
+ #endif /* LV_HAVE_GENERIC */
+@@ -99,7 +101,9 @@ static inline void volk_16ic_convert_32fc_generic(lv_32fc_t* outputVector, const
+ #ifdef LV_HAVE_SSE2
+ #include <emmintrin.h>
+
+-static inline void volk_16ic_convert_32fc_a_sse2(lv_32fc_t* outputVector, const lv_16sc_t* inputVector, unsigned int num_points)
++static inline void volk_16ic_convert_32fc_a_sse2(lv_32fc_t* outputVector,
++ const lv_16sc_t* inputVector,
++ unsigned int num_points)
+ {
+ const unsigned int sse_iters = num_points / 2;
+
+@@ -108,18 +112,21 @@ static inline void volk_16ic_convert_32fc_a_sse2(lv_32fc_t* outputVector, const
+ __m128 a;
+ unsigned int number;
+
+- for(number = 0; number < sse_iters; number++)
+- {
+- a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg
+- _mm_store_ps((float*)_out, a);
+- _in += 2;
+- _out += 2;
+- }
+- if (num_points & 1)
+- {
+- *_out++ = lv_cmake((float)lv_creal(*_in), (float)lv_cimag(*_in));
+- _in++;
+- }
++ for (number = 0; number < sse_iters; number++) {
++ a = _mm_set_ps(
++ (float)(lv_cimag(_in[1])),
++ (float)(lv_creal(_in[1])),
++ (float)(lv_cimag(_in[0])),
++ (float)(lv_creal(
++ _in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg
++ _mm_store_ps((float*)_out, a);
++ _in += 2;
++ _out += 2;
++ }
++ if (num_points & 1) {
++ *_out++ = lv_cmake((float)lv_creal(*_in), (float)lv_cimag(*_in));
++ _in++;
++ }
+ }
+
+ #endif /* LV_HAVE_SSE2 */
+@@ -127,7 +134,9 @@ static inline void volk_16ic_convert_32fc_a_sse2(lv_32fc_t* outputVector, const
+ #ifdef LV_HAVE_AVX
+ #include <immintrin.h>
+
+-static inline void volk_16ic_convert_32fc_a_avx(lv_32fc_t* outputVector, const lv_16sc_t* inputVector, unsigned int num_points)
++static inline void volk_16ic_convert_32fc_a_avx(lv_32fc_t* outputVector,
++ const lv_16sc_t* inputVector,
++ unsigned int num_points)
+ {
+ const unsigned int sse_iters = num_points / 4;
+
+@@ -136,19 +145,26 @@ static inline void volk_16ic_convert_32fc_a_avx(lv_32fc_t* outputVector, const l
+ __m256 a;
+ unsigned int i, number;
+
+- for(number = 0; number < sse_iters; number++)
+- {
+- a = _mm256_set_ps((float)(lv_cimag(_in[3])), (float)(lv_creal(_in[3])), (float)(lv_cimag(_in[2])), (float)(lv_creal(_in[2])), (float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg
+- _mm256_store_ps((float*)_out, a);
+- _in += 4;
+- _out += 4;
+- }
++ for (number = 0; number < sse_iters; number++) {
++ a = _mm256_set_ps(
++ (float)(lv_cimag(_in[3])),
++ (float)(lv_creal(_in[3])),
++ (float)(lv_cimag(_in[2])),
++ (float)(lv_creal(_in[2])),
++ (float)(lv_cimag(_in[1])),
++ (float)(lv_creal(_in[1])),
++ (float)(lv_cimag(_in[0])),
++ (float)(lv_creal(
++ _in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg
++ _mm256_store_ps((float*)_out, a);
++ _in += 4;
++ _out += 4;
++ }
+ _mm256_zeroupper();
+- for (i = 0; i < (num_points % 4); ++i)
+- {
+- *_out++ = lv_cmake((float)lv_creal(*_in), (float)lv_cimag(*_in));
+- _in++;
+- }
++ for (i = 0; i < (num_points % 4); ++i) {
++ *_out++ = lv_cmake((float)lv_creal(*_in), (float)lv_cimag(*_in));
++ _in++;
++ }
+ }
+
+ #endif /* LV_HAVE_AVX */
+@@ -157,7 +173,9 @@ static inline void volk_16ic_convert_32fc_a_avx(lv_32fc_t* outputVector, const l
+ #ifdef LV_HAVE_NEON
+ #include <arm_neon.h>
+
+-static inline void volk_16ic_convert_32fc_neon(lv_32fc_t* outputVector, const lv_16sc_t* inputVector, unsigned int num_points)
++static inline void volk_16ic_convert_32fc_neon(lv_32fc_t* outputVector,
++ const lv_16sc_t* inputVector,
++ unsigned int num_points)
+ {
+ const unsigned int sse_iters = num_points / 2;
+
+@@ -169,21 +187,19 @@ static inline void volk_16ic_convert_32fc_neon(lv_32fc_t* outputVector, const lv
+ float32x4_t f32x4;
+ unsigned int i, number;
+
+- for(number = 0; number < sse_iters; number++)
+- {
+- a16x4 = vld1_s16((const int16_t*)_in);
+- __VOLK_PREFETCH(_in + 4);
+- a32x4 = vmovl_s16(a16x4);
+- f32x4 = vcvtq_f32_s32(a32x4);
+- vst1q_f32((float32_t*)_out, f32x4);
+- _in += 2;
+- _out += 2;
+- }
+- for (i = 0; i < (num_points % 2); ++i)
+- {
+- *_out++ = lv_cmake((float)lv_creal(*_in), (float)lv_cimag(*_in));
+- _in++;
+- }
++ for (number = 0; number < sse_iters; number++) {
++ a16x4 = vld1_s16((const int16_t*)_in);
++ __VOLK_PREFETCH(_in + 4);
++ a32x4 = vmovl_s16(a16x4);
++ f32x4 = vcvtq_f32_s32(a32x4);
++ vst1q_f32((float32_t*)_out, f32x4);
++ _in += 2;
++ _out += 2;
++ }
++ for (i = 0; i < (num_points % 2); ++i) {
++ *_out++ = lv_cmake((float)lv_creal(*_in), (float)lv_cimag(*_in));
++ _in++;
++ }
+ }
+ #endif /* LV_HAVE_NEON */
+
+@@ -198,7 +214,9 @@ static inline void volk_16ic_convert_32fc_neon(lv_32fc_t* outputVector, const lv
+ #ifdef LV_HAVE_AVX2
+ #include <immintrin.h>
+
+-static inline void volk_16ic_convert_32fc_u_avx2(lv_32fc_t* outputVector, const lv_16sc_t* inputVector, unsigned int num_points)
++static inline void volk_16ic_convert_32fc_u_avx2(lv_32fc_t* outputVector,
++ const lv_16sc_t* inputVector,
++ unsigned int num_points)
+ {
+ const unsigned int avx_iters = num_points / 8;
+ unsigned int number = 0;
+@@ -208,23 +226,21 @@ static inline void volk_16ic_convert_32fc_u_avx2(lv_32fc_t* outputVector, const
+ __m256i outValInt;
+ __m128i cplxValue;
+
+- for(number = 0; number < avx_iters; number++)
+- {
+- cplxValue = _mm_loadu_si128((__m128i*)complexVectorPtr);
+- complexVectorPtr += 8;
+-
+- outValInt = _mm256_cvtepi16_epi32(cplxValue);
+- outVal = _mm256_cvtepi32_ps(outValInt);
+- _mm256_storeu_ps((float*)outputVectorPtr, outVal);
++ for (number = 0; number < avx_iters; number++) {
++ cplxValue = _mm_loadu_si128((__m128i*)complexVectorPtr);
++ complexVectorPtr += 8;
++
++ outValInt = _mm256_cvtepi16_epi32(cplxValue);
++ outVal = _mm256_cvtepi32_ps(outValInt);
++ _mm256_storeu_ps((float*)outputVectorPtr, outVal);
+
+- outputVectorPtr += 8;
+- }
++ outputVectorPtr += 8;
++ }
+
+ number = avx_iters * 8;
+- for(; number < num_points*2; number++)
+- {
+- *outputVectorPtr++ = (float)*complexVectorPtr++;
+- }
++ for (; number < num_points * 2; number++) {
++ *outputVectorPtr++ = (float)*complexVectorPtr++;
++ }
+ }
+
+ #endif /* LV_HAVE_AVX2 */
+@@ -232,7 +248,9 @@ static inline void volk_16ic_convert_32fc_u_avx2(lv_32fc_t* outputVector, const
+ #ifdef LV_HAVE_SSE2
+ #include <emmintrin.h>
+
+-static inline void volk_16ic_convert_32fc_u_sse2(lv_32fc_t* outputVector, const lv_16sc_t* inputVector, unsigned int num_points)
++static inline void volk_16ic_convert_32fc_u_sse2(lv_32fc_t* outputVector,
++ const lv_16sc_t* inputVector,
++ unsigned int num_points)
+ {
+ const unsigned int sse_iters = num_points / 2;
+
+@@ -241,18 +259,21 @@ static inline void volk_16ic_convert_32fc_u_sse2(lv_32fc_t* outputVector, const
+ __m128 a;
+ unsigned int number;
+
+- for(number = 0; number < sse_iters; number++)
+- {
+- a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg
+- _mm_storeu_ps((float*)_out, a);
+- _in += 2;
+- _out += 2;
+- }
+- if (num_points & 1)
+- {
+- *_out++ = lv_cmake((float)lv_creal(*_in), (float)lv_cimag(*_in));
+- _in++;
+- }
++ for (number = 0; number < sse_iters; number++) {
++ a = _mm_set_ps(
++ (float)(lv_cimag(_in[1])),
++ (float)(lv_creal(_in[1])),
++ (float)(lv_cimag(_in[0])),
++ (float)(lv_creal(
++ _in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg
++ _mm_storeu_ps((float*)_out, a);
++ _in += 2;
++ _out += 2;
++ }
++ if (num_points & 1) {
++ *_out++ = lv_cmake((float)lv_creal(*_in), (float)lv_cimag(*_in));
++ _in++;
++ }
+ }
+
+ #endif /* LV_HAVE_SSE2 */
+@@ -261,7 +282,9 @@ static inline void volk_16ic_convert_32fc_u_sse2(lv_32fc_t* outputVector, const
+ #ifdef LV_HAVE_AVX
+ #include <immintrin.h>
+
+-static inline void volk_16ic_convert_32fc_u_avx(lv_32fc_t* outputVector, const lv_16sc_t* inputVector, unsigned int num_points)
++static inline void volk_16ic_convert_32fc_u_avx(lv_32fc_t* outputVector,
++ const lv_16sc_t* inputVector,
++ unsigned int num_points)
+ {
+ const unsigned int sse_iters = num_points / 4;
+
+@@ -270,21 +293,27 @@ static inline void volk_16ic_convert_32fc_u_avx(lv_32fc_t* outputVector, const l
+ __m256 a;
+ unsigned int i, number;
+
+- for(number = 0; number < sse_iters; number++)
+- {
+- a = _mm256_set_ps((float)(lv_cimag(_in[3])), (float)(lv_creal(_in[3])), (float)(lv_cimag(_in[2])), (float)(lv_creal(_in[2])), (float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg
+- _mm256_storeu_ps((float*)_out, a);
+- _in += 4;
+- _out += 4;
+- }
++ for (number = 0; number < sse_iters; number++) {
++ a = _mm256_set_ps(
++ (float)(lv_cimag(_in[3])),
++ (float)(lv_creal(_in[3])),
++ (float)(lv_cimag(_in[2])),
++ (float)(lv_creal(_in[2])),
++ (float)(lv_cimag(_in[1])),
++ (float)(lv_creal(_in[1])),
++ (float)(lv_cimag(_in[0])),
++ (float)(lv_creal(
++ _in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg
++ _mm256_storeu_ps((float*)_out, a);
++ _in += 4;
++ _out += 4;
++ }
+ _mm256_zeroupper();
+- for (i = 0; i < (num_points % 4); ++i)
+- {
+- *_out++ = lv_cmake((float)lv_creal(*_in), (float)lv_cimag(*_in));
+- _in++;
+- }
++ for (i = 0; i < (num_points % 4); ++i) {
++ *_out++ = lv_cmake((float)lv_creal(*_in), (float)lv_cimag(*_in));
++ _in++;
++ }
+ }
+
+ #endif /* LV_HAVE_AVX */
+ #endif /* INCLUDED_volk_32fc_convert_16ic_u_H */
+-
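For reference, a minimal usage sketch of the volk_16ic_convert_32fc dispatcher whose
prototype is quoted in the documentation hunk above (not part of the patch; assumes
volk.h and the volk_malloc()/volk_free() helpers, with illustrative lengths and values).

    #include <stdint.h>
    #include <stdio.h>
    #include <volk/volk.h>

    int main(void)
    {
        /* odd length so the scalar tail after the SIMD loop runs as well */
        const unsigned int num_points = 7;
        const size_t alignment = volk_get_alignment();
        lv_16sc_t* in = (lv_16sc_t*)volk_malloc(num_points * sizeof(lv_16sc_t), alignment);
        lv_32fc_t* out = (lv_32fc_t*)volk_malloc(num_points * sizeof(lv_32fc_t), alignment);

        int16_t* raw = (int16_t*)in; /* interleaved real/imag 16-bit samples */
        for (unsigned int n = 0; n < num_points; n++) {
            raw[2 * n] = (int16_t)(n * 100);     /* real */
            raw[2 * n + 1] = (int16_t)(-(int)n); /* imag */
        }

        /* widen each 16-bit complex sample to a 32-bit float complex sample */
        volk_16ic_convert_32fc(out, in, num_points);

        printf("out[3] = %f%+fi\n", (double)lv_creal(out[3]), (double)lv_cimag(out[3]));

        volk_free(in);
        volk_free(out);
        return 0;
    }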
+diff --git a/kernels/volk/volk_16ic_deinterleave_16i_x2.h b/kernels/volk/volk_16ic_deinterleave_16i_x2.h
+index 40d10b4..9e784a6 100644
+--- a/kernels/volk/volk_16ic_deinterleave_16i_x2.h
++++ b/kernels/volk/volk_16ic_deinterleave_16i_x2.h
+@@ -29,8 +29,8 @@
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+- * void volk_16ic_deinterleave_16i_x2(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points)
+- * \endcode
++ * void volk_16ic_deinterleave_16i_x2(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t*
++ * complexVector, unsigned int num_points) \endcode
+ *
+ * \b Inputs
+ * \li complexVector: The complex input vector.
+@@ -59,179 +59,241 @@
+ #ifdef LV_HAVE_AVX2
+ #include <immintrin.h>
+
+-static inline void
+-volk_16ic_deinterleave_16i_x2_a_avx2(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points)
++static inline void volk_16ic_deinterleave_16i_x2_a_avx2(int16_t* iBuffer,
++ int16_t* qBuffer,
++ const lv_16sc_t* complexVector,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+- const int8_t* complexVectorPtr = (int8_t*)complexVector;
+- int16_t* iBufferPtr = iBuffer;
+- int16_t* qBufferPtr = qBuffer;
+-
+- __m256i MoveMask = _mm256_set_epi8(15,14,11,10,7,6,3,2,13,12,9,8,5,4,1,0, 15,14,11,10,7,6,3,2,13,12,9,8,5,4,1,0);
+-
+- __m256i iMove2, iMove1;
+- __m256i complexVal1, complexVal2, iOutputVal, qOutputVal;
+-
+- unsigned int sixteenthPoints = num_points / 16;
+-
+- for(number = 0; number < sixteenthPoints; number++){
+- complexVal1 = _mm256_load_si256((__m256i*)complexVectorPtr); complexVectorPtr += 32;
+- complexVal2 = _mm256_load_si256((__m256i*)complexVectorPtr); complexVectorPtr += 32;
+-
+- iMove2 = _mm256_shuffle_epi8(complexVal2, MoveMask);
+- iMove1 = _mm256_shuffle_epi8(complexVal1, MoveMask);
+-
+- iOutputVal = _mm256_permute2x128_si256(_mm256_permute4x64_epi64(iMove1,0x08),_mm256_permute4x64_epi64(iMove2,0x80),0x30);
+- qOutputVal = _mm256_permute2x128_si256(_mm256_permute4x64_epi64(iMove1,0x0d),_mm256_permute4x64_epi64(iMove2,0xd0),0x30);
+-
+- _mm256_store_si256((__m256i*)iBufferPtr, iOutputVal);
+- _mm256_store_si256((__m256i*)qBufferPtr, qOutputVal);
+-
+- iBufferPtr += 16;
+- qBufferPtr += 16;
+- }
+-
+- number = sixteenthPoints * 16;
+- int16_t* int16ComplexVectorPtr = (int16_t*)complexVectorPtr;
+- for(; number < num_points; number++){
+- *iBufferPtr++ = *int16ComplexVectorPtr++;
+- *qBufferPtr++ = *int16ComplexVectorPtr++;
+- }
++ unsigned int number = 0;
++ const int8_t* complexVectorPtr = (int8_t*)complexVector;
++ int16_t* iBufferPtr = iBuffer;
++ int16_t* qBufferPtr = qBuffer;
++
++ __m256i MoveMask = _mm256_set_epi8(15,
++ 14,
++ 11,
++ 10,
++ 7,
++ 6,
++ 3,
++ 2,
++ 13,
++ 12,
++ 9,
++ 8,
++ 5,
++ 4,
++ 1,
++ 0,
++ 15,
++ 14,
++ 11,
++ 10,
++ 7,
++ 6,
++ 3,
++ 2,
++ 13,
++ 12,
++ 9,
++ 8,
++ 5,
++ 4,
++ 1,
++ 0);
++
++ __m256i iMove2, iMove1;
++ __m256i complexVal1, complexVal2, iOutputVal, qOutputVal;
++
++ unsigned int sixteenthPoints = num_points / 16;
++
++ for (number = 0; number < sixteenthPoints; number++) {
++ complexVal1 = _mm256_load_si256((__m256i*)complexVectorPtr);
++ complexVectorPtr += 32;
++ complexVal2 = _mm256_load_si256((__m256i*)complexVectorPtr);
++ complexVectorPtr += 32;
++
++ iMove2 = _mm256_shuffle_epi8(complexVal2, MoveMask);
++ iMove1 = _mm256_shuffle_epi8(complexVal1, MoveMask);
++
++ iOutputVal = _mm256_permute2x128_si256(_mm256_permute4x64_epi64(iMove1, 0x08),
++ _mm256_permute4x64_epi64(iMove2, 0x80),
++ 0x30);
++ qOutputVal = _mm256_permute2x128_si256(_mm256_permute4x64_epi64(iMove1, 0x0d),
++ _mm256_permute4x64_epi64(iMove2, 0xd0),
++ 0x30);
++
++ _mm256_store_si256((__m256i*)iBufferPtr, iOutputVal);
++ _mm256_store_si256((__m256i*)qBufferPtr, qOutputVal);
++
++ iBufferPtr += 16;
++ qBufferPtr += 16;
++ }
++
++ number = sixteenthPoints * 16;
++ int16_t* int16ComplexVectorPtr = (int16_t*)complexVectorPtr;
++ for (; number < num_points; number++) {
++ *iBufferPtr++ = *int16ComplexVectorPtr++;
++ *qBufferPtr++ = *int16ComplexVectorPtr++;
++ }
+ }
+ #endif /* LV_HAVE_AVX2 */
+
+ #ifdef LV_HAVE_SSSE3
+ #include <tmmintrin.h>
+
+-static inline void
+-volk_16ic_deinterleave_16i_x2_a_ssse3(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points)
++static inline void volk_16ic_deinterleave_16i_x2_a_ssse3(int16_t* iBuffer,
++ int16_t* qBuffer,
++ const lv_16sc_t* complexVector,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+- const int8_t* complexVectorPtr = (int8_t*)complexVector;
+- int16_t* iBufferPtr = iBuffer;
+- int16_t* qBufferPtr = qBuffer;
+-
+- __m128i iMoveMask1 = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0);
+- __m128i iMoveMask2 = _mm_set_epi8(13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
+-
+- __m128i qMoveMask1 = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 15, 14, 11, 10, 7, 6, 3, 2);
+- __m128i qMoveMask2 = _mm_set_epi8(15, 14, 11, 10, 7, 6, 3, 2, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
+-
+- __m128i complexVal1, complexVal2, iOutputVal, qOutputVal;
+-
+- unsigned int eighthPoints = num_points / 8;
+-
+- for(number = 0; number < eighthPoints; number++){
+- complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16;
+- complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16;
+-
+- iOutputVal = _mm_or_si128( _mm_shuffle_epi8(complexVal1, iMoveMask1) , _mm_shuffle_epi8(complexVal2, iMoveMask2));
+- qOutputVal = _mm_or_si128( _mm_shuffle_epi8(complexVal1, qMoveMask1) , _mm_shuffle_epi8(complexVal2, qMoveMask2));
+-
+- _mm_store_si128((__m128i*)iBufferPtr, iOutputVal);
+- _mm_store_si128((__m128i*)qBufferPtr, qOutputVal);
+-
+- iBufferPtr += 8;
+- qBufferPtr += 8;
+- }
+-
+- number = eighthPoints * 8;
+- int16_t* int16ComplexVectorPtr = (int16_t*)complexVectorPtr;
+- for(; number < num_points; number++){
+- *iBufferPtr++ = *int16ComplexVectorPtr++;
+- *qBufferPtr++ = *int16ComplexVectorPtr++;
+- }
++ unsigned int number = 0;
++ const int8_t* complexVectorPtr = (int8_t*)complexVector;
++ int16_t* iBufferPtr = iBuffer;
++ int16_t* qBufferPtr = qBuffer;
++
++ __m128i iMoveMask1 = _mm_set_epi8(
++ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0);
++ __m128i iMoveMask2 = _mm_set_epi8(
++ 13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
++
++ __m128i qMoveMask1 = _mm_set_epi8(
++ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 15, 14, 11, 10, 7, 6, 3, 2);
++ __m128i qMoveMask2 = _mm_set_epi8(
++ 15, 14, 11, 10, 7, 6, 3, 2, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
++
++ __m128i complexVal1, complexVal2, iOutputVal, qOutputVal;
++
++ unsigned int eighthPoints = num_points / 8;
++
++ for (number = 0; number < eighthPoints; number++) {
++ complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr);
++ complexVectorPtr += 16;
++ complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr);
++ complexVectorPtr += 16;
++
++ iOutputVal = _mm_or_si128(_mm_shuffle_epi8(complexVal1, iMoveMask1),
++ _mm_shuffle_epi8(complexVal2, iMoveMask2));
++ qOutputVal = _mm_or_si128(_mm_shuffle_epi8(complexVal1, qMoveMask1),
++ _mm_shuffle_epi8(complexVal2, qMoveMask2));
++
++ _mm_store_si128((__m128i*)iBufferPtr, iOutputVal);
++ _mm_store_si128((__m128i*)qBufferPtr, qOutputVal);
++
++ iBufferPtr += 8;
++ qBufferPtr += 8;
++ }
++
++ number = eighthPoints * 8;
++ int16_t* int16ComplexVectorPtr = (int16_t*)complexVectorPtr;
++ for (; number < num_points; number++) {
++ *iBufferPtr++ = *int16ComplexVectorPtr++;
++ *qBufferPtr++ = *int16ComplexVectorPtr++;
++ }
+ }
+ #endif /* LV_HAVE_SSSE3 */
+
+ #ifdef LV_HAVE_SSE2
+ #include <emmintrin.h>
+
+-static inline void
+-volk_16ic_deinterleave_16i_x2_a_sse2(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points)
++static inline void volk_16ic_deinterleave_16i_x2_a_sse2(int16_t* iBuffer,
++ int16_t* qBuffer,
++ const lv_16sc_t* complexVector,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+- const int16_t* complexVectorPtr = (int16_t*)complexVector;
+- int16_t* iBufferPtr = iBuffer;
+- int16_t* qBufferPtr = qBuffer;
+- __m128i complexVal1, complexVal2, iComplexVal1, iComplexVal2, qComplexVal1, qComplexVal2, iOutputVal, qOutputVal;
+- __m128i lowMask = _mm_set_epi32(0x0, 0x0, 0xFFFFFFFF, 0xFFFFFFFF);
+- __m128i highMask = _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0x0, 0x0);
++ unsigned int number = 0;
++ const int16_t* complexVectorPtr = (int16_t*)complexVector;
++ int16_t* iBufferPtr = iBuffer;
++ int16_t* qBufferPtr = qBuffer;
++ __m128i complexVal1, complexVal2, iComplexVal1, iComplexVal2, qComplexVal1,
++ qComplexVal2, iOutputVal, qOutputVal;
++ __m128i lowMask = _mm_set_epi32(0x0, 0x0, 0xFFFFFFFF, 0xFFFFFFFF);
++ __m128i highMask = _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0x0, 0x0);
+
+- unsigned int eighthPoints = num_points / 8;
++ unsigned int eighthPoints = num_points / 8;
+
+- for(number = 0; number < eighthPoints; number++){
+- complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 8;
+- complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 8;
++ for (number = 0; number < eighthPoints; number++) {
++ complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr);
++ complexVectorPtr += 8;
++ complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr);
++ complexVectorPtr += 8;
+
+- iComplexVal1 = _mm_shufflelo_epi16(complexVal1, _MM_SHUFFLE(3,1,2,0));
++ iComplexVal1 = _mm_shufflelo_epi16(complexVal1, _MM_SHUFFLE(3, 1, 2, 0));
+
+- iComplexVal1 = _mm_shufflehi_epi16(iComplexVal1, _MM_SHUFFLE(3,1,2,0));
++ iComplexVal1 = _mm_shufflehi_epi16(iComplexVal1, _MM_SHUFFLE(3, 1, 2, 0));
+
+- iComplexVal1 = _mm_shuffle_epi32(iComplexVal1, _MM_SHUFFLE(3,1,2,0));
++ iComplexVal1 = _mm_shuffle_epi32(iComplexVal1, _MM_SHUFFLE(3, 1, 2, 0));
+
+- iComplexVal2 = _mm_shufflelo_epi16(complexVal2, _MM_SHUFFLE(3,1,2,0));
++ iComplexVal2 = _mm_shufflelo_epi16(complexVal2, _MM_SHUFFLE(3, 1, 2, 0));
+
+- iComplexVal2 = _mm_shufflehi_epi16(iComplexVal2, _MM_SHUFFLE(3,1,2,0));
++ iComplexVal2 = _mm_shufflehi_epi16(iComplexVal2, _MM_SHUFFLE(3, 1, 2, 0));
+
+- iComplexVal2 = _mm_shuffle_epi32(iComplexVal2, _MM_SHUFFLE(2,0,3,1));
++ iComplexVal2 = _mm_shuffle_epi32(iComplexVal2, _MM_SHUFFLE(2, 0, 3, 1));
+
+- iOutputVal = _mm_or_si128(_mm_and_si128(iComplexVal1, lowMask), _mm_and_si128(iComplexVal2, highMask));
++ iOutputVal = _mm_or_si128(_mm_and_si128(iComplexVal1, lowMask),
++ _mm_and_si128(iComplexVal2, highMask));
+
+- _mm_store_si128((__m128i*)iBufferPtr, iOutputVal);
++ _mm_store_si128((__m128i*)iBufferPtr, iOutputVal);
+
+- qComplexVal1 = _mm_shufflelo_epi16(complexVal1, _MM_SHUFFLE(2,0,3,1));
++ qComplexVal1 = _mm_shufflelo_epi16(complexVal1, _MM_SHUFFLE(2, 0, 3, 1));
+
+- qComplexVal1 = _mm_shufflehi_epi16(qComplexVal1, _MM_SHUFFLE(2,0,3,1));
++ qComplexVal1 = _mm_shufflehi_epi16(qComplexVal1, _MM_SHUFFLE(2, 0, 3, 1));
+
+- qComplexVal1 = _mm_shuffle_epi32(qComplexVal1, _MM_SHUFFLE(3,1,2,0));
++ qComplexVal1 = _mm_shuffle_epi32(qComplexVal1, _MM_SHUFFLE(3, 1, 2, 0));
+
+- qComplexVal2 = _mm_shufflelo_epi16(complexVal2, _MM_SHUFFLE(2,0,3,1));
++ qComplexVal2 = _mm_shufflelo_epi16(complexVal2, _MM_SHUFFLE(2, 0, 3, 1));
+
+- qComplexVal2 = _mm_shufflehi_epi16(qComplexVal2, _MM_SHUFFLE(2,0,3,1));
++ qComplexVal2 = _mm_shufflehi_epi16(qComplexVal2, _MM_SHUFFLE(2, 0, 3, 1));
+
+- qComplexVal2 = _mm_shuffle_epi32(qComplexVal2, _MM_SHUFFLE(2,0,3,1));
++ qComplexVal2 = _mm_shuffle_epi32(qComplexVal2, _MM_SHUFFLE(2, 0, 3, 1));
+
+- qOutputVal = _mm_or_si128(_mm_and_si128(qComplexVal1, lowMask), _mm_and_si128(qComplexVal2, highMask));
++ qOutputVal = _mm_or_si128(_mm_and_si128(qComplexVal1, lowMask),
++ _mm_and_si128(qComplexVal2, highMask));
+
+- _mm_store_si128((__m128i*)qBufferPtr, qOutputVal);
++ _mm_store_si128((__m128i*)qBufferPtr, qOutputVal);
+
+- iBufferPtr += 8;
+- qBufferPtr += 8;
+- }
++ iBufferPtr += 8;
++ qBufferPtr += 8;
++ }
+
+- number = eighthPoints * 8;
+- for(; number < num_points; number++){
+- *iBufferPtr++ = *complexVectorPtr++;
+- *qBufferPtr++ = *complexVectorPtr++;
+- }
++ number = eighthPoints * 8;
++ for (; number < num_points; number++) {
++ *iBufferPtr++ = *complexVectorPtr++;
++ *qBufferPtr++ = *complexVectorPtr++;
++ }
+ }
+ #endif /* LV_HAVE_SSE2 */
+
+ #ifdef LV_HAVE_GENERIC
+
+-static inline void
+-volk_16ic_deinterleave_16i_x2_generic(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points)
++static inline void volk_16ic_deinterleave_16i_x2_generic(int16_t* iBuffer,
++ int16_t* qBuffer,
++ const lv_16sc_t* complexVector,
++ unsigned int num_points)
+ {
+- const int16_t* complexVectorPtr = (const int16_t*)complexVector;
+- int16_t* iBufferPtr = iBuffer;
+- int16_t* qBufferPtr = qBuffer;
+- unsigned int number;
+- for(number = 0; number < num_points; number++){
+- *iBufferPtr++ = *complexVectorPtr++;
+- *qBufferPtr++ = *complexVectorPtr++;
+- }
++ const int16_t* complexVectorPtr = (const int16_t*)complexVector;
++ int16_t* iBufferPtr = iBuffer;
++ int16_t* qBufferPtr = qBuffer;
++ unsigned int number;
++ for (number = 0; number < num_points; number++) {
++ *iBufferPtr++ = *complexVectorPtr++;
++ *qBufferPtr++ = *complexVectorPtr++;
++ }
+ }
+ #endif /* LV_HAVE_GENERIC */
+
+ #ifdef LV_HAVE_ORC
+
+-extern void
+-volk_16ic_deinterleave_16i_x2_a_orc_impl(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points);
+-static inline void
+-volk_16ic_deinterleave_16i_x2_u_orc(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points)
++extern void volk_16ic_deinterleave_16i_x2_a_orc_impl(int16_t* iBuffer,
++ int16_t* qBuffer,
++ const lv_16sc_t* complexVector,
++ unsigned int num_points);
++static inline void volk_16ic_deinterleave_16i_x2_u_orc(int16_t* iBuffer,
++ int16_t* qBuffer,
++ const lv_16sc_t* complexVector,
++ unsigned int num_points)
+ {
+- volk_16ic_deinterleave_16i_x2_a_orc_impl(iBuffer, qBuffer, complexVector, num_points);
++ volk_16ic_deinterleave_16i_x2_a_orc_impl(iBuffer, qBuffer, complexVector, num_points);
+ }
+ #endif /* LV_HAVE_ORC */
+
+@@ -246,44 +308,83 @@ volk_16ic_deinterleave_16i_x2_u_orc(int16_t* iBuffer, int16_t* qBuffer, const lv
+ #ifdef LV_HAVE_AVX2
+ #include <immintrin.h>
+
+-static inline void
+-volk_16ic_deinterleave_16i_x2_u_avx2(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points)
++static inline void volk_16ic_deinterleave_16i_x2_u_avx2(int16_t* iBuffer,
++ int16_t* qBuffer,
++ const lv_16sc_t* complexVector,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+- const int8_t* complexVectorPtr = (int8_t*)complexVector;
+- int16_t* iBufferPtr = iBuffer;
+- int16_t* qBufferPtr = qBuffer;
+-
+- __m256i MoveMask = _mm256_set_epi8(15,14,11,10,7,6,3,2,13,12,9,8,5,4,1,0, 15,14,11,10,7,6,3,2,13,12,9,8,5,4,1,0);
+-
+- __m256i iMove2, iMove1;
+- __m256i complexVal1, complexVal2, iOutputVal, qOutputVal;
+-
+- unsigned int sixteenthPoints = num_points / 16;
+-
+- for(number = 0; number < sixteenthPoints; number++){
+- complexVal1 = _mm256_loadu_si256((__m256i*)complexVectorPtr); complexVectorPtr += 32;
+- complexVal2 = _mm256_loadu_si256((__m256i*)complexVectorPtr); complexVectorPtr += 32;
+-
+- iMove2 = _mm256_shuffle_epi8(complexVal2, MoveMask);
+- iMove1 = _mm256_shuffle_epi8(complexVal1, MoveMask);
+-
+- iOutputVal = _mm256_permute2x128_si256(_mm256_permute4x64_epi64(iMove1,0x08),_mm256_permute4x64_epi64(iMove2,0x80),0x30);
+- qOutputVal = _mm256_permute2x128_si256(_mm256_permute4x64_epi64(iMove1,0x0d),_mm256_permute4x64_epi64(iMove2,0xd0),0x30);
+-
+- _mm256_storeu_si256((__m256i*)iBufferPtr, iOutputVal);
+- _mm256_storeu_si256((__m256i*)qBufferPtr, qOutputVal);
+-
+- iBufferPtr += 16;
+- qBufferPtr += 16;
+- }
+-
+- number = sixteenthPoints * 16;
+- int16_t* int16ComplexVectorPtr = (int16_t*)complexVectorPtr;
+- for(; number < num_points; number++){
+- *iBufferPtr++ = *int16ComplexVectorPtr++;
+- *qBufferPtr++ = *int16ComplexVectorPtr++;
+- }
++ unsigned int number = 0;
++ const int8_t* complexVectorPtr = (int8_t*)complexVector;
++ int16_t* iBufferPtr = iBuffer;
++ int16_t* qBufferPtr = qBuffer;
++
++ __m256i MoveMask = _mm256_set_epi8(15,
++ 14,
++ 11,
++ 10,
++ 7,
++ 6,
++ 3,
++ 2,
++ 13,
++ 12,
++ 9,
++ 8,
++ 5,
++ 4,
++ 1,
++ 0,
++ 15,
++ 14,
++ 11,
++ 10,
++ 7,
++ 6,
++ 3,
++ 2,
++ 13,
++ 12,
++ 9,
++ 8,
++ 5,
++ 4,
++ 1,
++ 0);
++
++ __m256i iMove2, iMove1;
++ __m256i complexVal1, complexVal2, iOutputVal, qOutputVal;
++
++ unsigned int sixteenthPoints = num_points / 16;
++
++ for (number = 0; number < sixteenthPoints; number++) {
++ complexVal1 = _mm256_loadu_si256((__m256i*)complexVectorPtr);
++ complexVectorPtr += 32;
++ complexVal2 = _mm256_loadu_si256((__m256i*)complexVectorPtr);
++ complexVectorPtr += 32;
++
++ iMove2 = _mm256_shuffle_epi8(complexVal2, MoveMask);
++ iMove1 = _mm256_shuffle_epi8(complexVal1, MoveMask);
++
++ iOutputVal = _mm256_permute2x128_si256(_mm256_permute4x64_epi64(iMove1, 0x08),
++ _mm256_permute4x64_epi64(iMove2, 0x80),
++ 0x30);
++ qOutputVal = _mm256_permute2x128_si256(_mm256_permute4x64_epi64(iMove1, 0x0d),
++ _mm256_permute4x64_epi64(iMove2, 0xd0),
++ 0x30);
++
++ _mm256_storeu_si256((__m256i*)iBufferPtr, iOutputVal);
++ _mm256_storeu_si256((__m256i*)qBufferPtr, qOutputVal);
++
++ iBufferPtr += 16;
++ qBufferPtr += 16;
++ }
++
++ number = sixteenthPoints * 16;
++ int16_t* int16ComplexVectorPtr = (int16_t*)complexVectorPtr;
++ for (; number < num_points; number++) {
++ *iBufferPtr++ = *int16ComplexVectorPtr++;
++ *qBufferPtr++ = *int16ComplexVectorPtr++;
++ }
+ }
+ #endif /* LV_HAVE_AVX2 */
+
+diff --git a/kernels/volk/volk_16ic_deinterleave_real_16i.h b/kernels/volk/volk_16ic_deinterleave_real_16i.h
+index c1de553..45fcd99 100644
+--- a/kernels/volk/volk_16ic_deinterleave_real_16i.h
++++ b/kernels/volk/volk_16ic_deinterleave_real_16i.h
+@@ -25,12 +25,13 @@
+ *
+ * \b Overview
+ *
+- * Deinterleaves the complex 16 bit vector and returns the real (inphase) part of the signal.
++ * Deinterleaves the complex 16 bit vector and returns the real (inphase) part of the
++ * signal.
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+- * void volk_16ic_deinterleave_real_16i(int16_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points)
+- * \endcode
++ * void volk_16ic_deinterleave_real_16i(int16_t* iBuffer, const lv_16sc_t* complexVector,
++ * unsigned int num_points) \endcode
+ *
+ * \b Inputs
+ * \li complexVector: The complex input vector.
+@@ -60,79 +61,149 @@
+ #ifdef LV_HAVE_AVX2
+ #include <immintrin.h>
+
+-static inline void
+-volk_16ic_deinterleave_real_16i_a_avx2(int16_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points)
++static inline void volk_16ic_deinterleave_real_16i_a_avx2(int16_t* iBuffer,
++ const lv_16sc_t* complexVector,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+- const int16_t* complexVectorPtr = (int16_t*)complexVector;
+- int16_t* iBufferPtr = iBuffer;
+-
+- __m256i iMoveMask1 = _mm256_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0);
+- __m256i iMoveMask2 = _mm256_set_epi8(13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
+-
+- __m256i complexVal1, complexVal2, iOutputVal;
+-
+- unsigned int sixteenthPoints = num_points / 16;
+-
+- for(number = 0; number < sixteenthPoints; number++){
+- complexVal1 = _mm256_load_si256((__m256i*)complexVectorPtr); complexVectorPtr += 16;
+- complexVal2 = _mm256_load_si256((__m256i*)complexVectorPtr); complexVectorPtr += 16;
+-
+- complexVal1 = _mm256_shuffle_epi8(complexVal1, iMoveMask1);
+- complexVal2 = _mm256_shuffle_epi8(complexVal2, iMoveMask2);
+-
+- iOutputVal = _mm256_or_si256(complexVal1, complexVal2);
+- iOutputVal = _mm256_permute4x64_epi64(iOutputVal, 0xd8);
+-
+- _mm256_store_si256((__m256i*)iBufferPtr, iOutputVal);
+-
+- iBufferPtr += 16;
+- }
+-
+- number = sixteenthPoints * 16;
+- for(; number < num_points; number++){
+- *iBufferPtr++ = *complexVectorPtr++;
+- complexVectorPtr++;
+- }
++ unsigned int number = 0;
++ const int16_t* complexVectorPtr = (int16_t*)complexVector;
++ int16_t* iBufferPtr = iBuffer;
++
++ __m256i iMoveMask1 = _mm256_set_epi8(0x80,
++ 0x80,
++ 0x80,
++ 0x80,
++ 0x80,
++ 0x80,
++ 0x80,
++ 0x80,
++ 13,
++ 12,
++ 9,
++ 8,
++ 5,
++ 4,
++ 1,
++ 0,
++ 0x80,
++ 0x80,
++ 0x80,
++ 0x80,
++ 0x80,
++ 0x80,
++ 0x80,
++ 0x80,
++ 13,
++ 12,
++ 9,
++ 8,
++ 5,
++ 4,
++ 1,
++ 0);
++ __m256i iMoveMask2 = _mm256_set_epi8(13,
++ 12,
++ 9,
++ 8,
++ 5,
++ 4,
++ 1,
++ 0,
++ 0x80,
++ 0x80,
++ 0x80,
++ 0x80,
++ 0x80,
++ 0x80,
++ 0x80,
++ 0x80,
++ 13,
++ 12,
++ 9,
++ 8,
++ 5,
++ 4,
++ 1,
++ 0,
++ 0x80,
++ 0x80,
++ 0x80,
++ 0x80,
++ 0x80,
++ 0x80,
++ 0x80,
++ 0x80);
++
++ __m256i complexVal1, complexVal2, iOutputVal;
++
++ unsigned int sixteenthPoints = num_points / 16;
++
++ for (number = 0; number < sixteenthPoints; number++) {
++ complexVal1 = _mm256_load_si256((__m256i*)complexVectorPtr);
++ complexVectorPtr += 16;
++ complexVal2 = _mm256_load_si256((__m256i*)complexVectorPtr);
++ complexVectorPtr += 16;
++
++ complexVal1 = _mm256_shuffle_epi8(complexVal1, iMoveMask1);
++ complexVal2 = _mm256_shuffle_epi8(complexVal2, iMoveMask2);
++
++ iOutputVal = _mm256_or_si256(complexVal1, complexVal2);
++ iOutputVal = _mm256_permute4x64_epi64(iOutputVal, 0xd8);
++
++ _mm256_store_si256((__m256i*)iBufferPtr, iOutputVal);
++
++ iBufferPtr += 16;
++ }
++
++ number = sixteenthPoints * 16;
++ for (; number < num_points; number++) {
++ *iBufferPtr++ = *complexVectorPtr++;
++ complexVectorPtr++;
++ }
+ }
+ #endif /* LV_HAVE_AVX2 */
+
+ #ifdef LV_HAVE_SSSE3
+ #include <tmmintrin.h>
+
+-static inline void
+-volk_16ic_deinterleave_real_16i_a_ssse3(int16_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points)
++static inline void volk_16ic_deinterleave_real_16i_a_ssse3(int16_t* iBuffer,
++ const lv_16sc_t* complexVector,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+- const int16_t* complexVectorPtr = (int16_t*)complexVector;
+- int16_t* iBufferPtr = iBuffer;
++ unsigned int number = 0;
++ const int16_t* complexVectorPtr = (int16_t*)complexVector;
++ int16_t* iBufferPtr = iBuffer;
+
+- __m128i iMoveMask1 = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0);
+- __m128i iMoveMask2 = _mm_set_epi8(13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
++ __m128i iMoveMask1 = _mm_set_epi8(
++ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0);
++ __m128i iMoveMask2 = _mm_set_epi8(
++ 13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
+
+- __m128i complexVal1, complexVal2, iOutputVal;
++ __m128i complexVal1, complexVal2, iOutputVal;
+
+- unsigned int eighthPoints = num_points / 8;
++ unsigned int eighthPoints = num_points / 8;
+
+- for(number = 0; number < eighthPoints; number++){
+- complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 8;
+- complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 8;
++ for (number = 0; number < eighthPoints; number++) {
++ complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr);
++ complexVectorPtr += 8;
++ complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr);
++ complexVectorPtr += 8;
+
+- complexVal1 = _mm_shuffle_epi8(complexVal1, iMoveMask1);
+- complexVal2 = _mm_shuffle_epi8(complexVal2, iMoveMask2);
++ complexVal1 = _mm_shuffle_epi8(complexVal1, iMoveMask1);
++ complexVal2 = _mm_shuffle_epi8(complexVal2, iMoveMask2);
+
+- iOutputVal = _mm_or_si128(complexVal1, complexVal2);
++ iOutputVal = _mm_or_si128(complexVal1, complexVal2);
+
+- _mm_store_si128((__m128i*)iBufferPtr, iOutputVal);
++ _mm_store_si128((__m128i*)iBufferPtr, iOutputVal);
+
+- iBufferPtr += 8;
+- }
++ iBufferPtr += 8;
++ }
+
+- number = eighthPoints * 8;
+- for(; number < num_points; number++){
+- *iBufferPtr++ = *complexVectorPtr++;
+- complexVectorPtr++;
+- }
++ number = eighthPoints * 8;
++ for (; number < num_points; number++) {
++ *iBufferPtr++ = *complexVectorPtr++;
++ complexVectorPtr++;
++ }
+ }
+ #endif /* LV_HAVE_SSSE3 */
+
+@@ -140,61 +211,66 @@ volk_16ic_deinterleave_real_16i_a_ssse3(int16_t* iBuffer, const lv_16sc_t* compl
+ #ifdef LV_HAVE_SSE2
+ #include <emmintrin.h>
+
+-static inline void
+-volk_16ic_deinterleave_real_16i_a_sse2(int16_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points)
++static inline void volk_16ic_deinterleave_real_16i_a_sse2(int16_t* iBuffer,
++ const lv_16sc_t* complexVector,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+- const int16_t* complexVectorPtr = (int16_t*)complexVector;
+- int16_t* iBufferPtr = iBuffer;
+- __m128i complexVal1, complexVal2, iOutputVal;
+- __m128i lowMask = _mm_set_epi32(0x0, 0x0, 0xFFFFFFFF, 0xFFFFFFFF);
+- __m128i highMask = _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0x0, 0x0);
++ unsigned int number = 0;
++ const int16_t* complexVectorPtr = (int16_t*)complexVector;
++ int16_t* iBufferPtr = iBuffer;
++ __m128i complexVal1, complexVal2, iOutputVal;
++ __m128i lowMask = _mm_set_epi32(0x0, 0x0, 0xFFFFFFFF, 0xFFFFFFFF);
++ __m128i highMask = _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0x0, 0x0);
+
+- unsigned int eighthPoints = num_points / 8;
++ unsigned int eighthPoints = num_points / 8;
+
+- for(number = 0; number < eighthPoints; number++){
+- complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 8;
+- complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 8;
++ for (number = 0; number < eighthPoints; number++) {
++ complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr);
++ complexVectorPtr += 8;
++ complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr);
++ complexVectorPtr += 8;
+
+- complexVal1 = _mm_shufflelo_epi16(complexVal1, _MM_SHUFFLE(3,1,2,0));
++ complexVal1 = _mm_shufflelo_epi16(complexVal1, _MM_SHUFFLE(3, 1, 2, 0));
+
+- complexVal1 = _mm_shufflehi_epi16(complexVal1, _MM_SHUFFLE(3,1,2,0));
++ complexVal1 = _mm_shufflehi_epi16(complexVal1, _MM_SHUFFLE(3, 1, 2, 0));
+
+- complexVal1 = _mm_shuffle_epi32(complexVal1, _MM_SHUFFLE(3,1,2,0));
++ complexVal1 = _mm_shuffle_epi32(complexVal1, _MM_SHUFFLE(3, 1, 2, 0));
+
+- complexVal2 = _mm_shufflelo_epi16(complexVal2, _MM_SHUFFLE(3,1,2,0));
++ complexVal2 = _mm_shufflelo_epi16(complexVal2, _MM_SHUFFLE(3, 1, 2, 0));
+
+- complexVal2 = _mm_shufflehi_epi16(complexVal2, _MM_SHUFFLE(3,1,2,0));
++ complexVal2 = _mm_shufflehi_epi16(complexVal2, _MM_SHUFFLE(3, 1, 2, 0));
+
+- complexVal2 = _mm_shuffle_epi32(complexVal2, _MM_SHUFFLE(2,0,3,1));
++ complexVal2 = _mm_shuffle_epi32(complexVal2, _MM_SHUFFLE(2, 0, 3, 1));
+
+- iOutputVal = _mm_or_si128(_mm_and_si128(complexVal1, lowMask), _mm_and_si128(complexVal2, highMask));
++ iOutputVal = _mm_or_si128(_mm_and_si128(complexVal1, lowMask),
++ _mm_and_si128(complexVal2, highMask));
+
+- _mm_store_si128((__m128i*)iBufferPtr, iOutputVal);
++ _mm_store_si128((__m128i*)iBufferPtr, iOutputVal);
+
+- iBufferPtr += 8;
+- }
++ iBufferPtr += 8;
++ }
+
+- number = eighthPoints * 8;
+- for(; number < num_points; number++){
+- *iBufferPtr++ = *complexVectorPtr++;
+- complexVectorPtr++;
+- }
++ number = eighthPoints * 8;
++ for (; number < num_points; number++) {
++ *iBufferPtr++ = *complexVectorPtr++;
++ complexVectorPtr++;
++ }
+ }
+ #endif /* LV_HAVE_SSE2 */
+
+ #ifdef LV_HAVE_GENERIC
+
+-static inline void
+-volk_16ic_deinterleave_real_16i_generic(int16_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points)
++static inline void volk_16ic_deinterleave_real_16i_generic(int16_t* iBuffer,
++ const lv_16sc_t* complexVector,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+- const int16_t* complexVectorPtr = (int16_t*)complexVector;
+- int16_t* iBufferPtr = iBuffer;
+- for(number = 0; number < num_points; number++){
+- *iBufferPtr++ = *complexVectorPtr++;
+- complexVectorPtr++;
+- }
++ unsigned int number = 0;
++ const int16_t* complexVectorPtr = (int16_t*)complexVector;
++ int16_t* iBufferPtr = iBuffer;
++ for (number = 0; number < num_points; number++) {
++ *iBufferPtr++ = *complexVectorPtr++;
++ complexVectorPtr++;
++ }
+ }
+ #endif /* LV_HAVE_GENERIC */
+
+@@ -212,40 +288,105 @@ volk_16ic_deinterleave_real_16i_generic(int16_t* iBuffer, const lv_16sc_t* compl
+ #ifdef LV_HAVE_AVX2
+ #include <immintrin.h>
+
+-static inline void
+-volk_16ic_deinterleave_real_16i_u_avx2(int16_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points)
++static inline void volk_16ic_deinterleave_real_16i_u_avx2(int16_t* iBuffer,
++ const lv_16sc_t* complexVector,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+- const int16_t* complexVectorPtr = (int16_t*)complexVector;
+- int16_t* iBufferPtr = iBuffer;
+-
+- __m256i iMoveMask1 = _mm256_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0);
+- __m256i iMoveMask2 = _mm256_set_epi8(13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
+-
+- __m256i complexVal1, complexVal2, iOutputVal;
+-
+- unsigned int sixteenthPoints = num_points / 16;
+-
+- for(number = 0; number < sixteenthPoints; number++){
+- complexVal1 = _mm256_loadu_si256((__m256i*)complexVectorPtr); complexVectorPtr += 16;
+- complexVal2 = _mm256_loadu_si256((__m256i*)complexVectorPtr); complexVectorPtr += 16;
+-
+- complexVal1 = _mm256_shuffle_epi8(complexVal1, iMoveMask1);
+- complexVal2 = _mm256_shuffle_epi8(complexVal2, iMoveMask2);
+-
+- iOutputVal = _mm256_or_si256(complexVal1, complexVal2);
+- iOutputVal = _mm256_permute4x64_epi64(iOutputVal, 0xd8);
+-
+- _mm256_storeu_si256((__m256i*)iBufferPtr, iOutputVal);
+-
+- iBufferPtr += 16;
+- }
+-
+- number = sixteenthPoints * 16;
+- for(; number < num_points; number++){
+- *iBufferPtr++ = *complexVectorPtr++;
+- complexVectorPtr++;
+- }
++ unsigned int number = 0;
++ const int16_t* complexVectorPtr = (int16_t*)complexVector;
++ int16_t* iBufferPtr = iBuffer;
++
++ __m256i iMoveMask1 = _mm256_set_epi8(0x80,
++ 0x80,
++ 0x80,
++ 0x80,
++ 0x80,
++ 0x80,
++ 0x80,
++ 0x80,
++ 13,
++ 12,
++ 9,
++ 8,
++ 5,
++ 4,
++ 1,
++ 0,
++ 0x80,
++ 0x80,
++ 0x80,
++ 0x80,
++ 0x80,
++ 0x80,
++ 0x80,
++ 0x80,
++ 13,
++ 12,
++ 9,
++ 8,
++ 5,
++ 4,
++ 1,
++ 0);
++ __m256i iMoveMask2 = _mm256_set_epi8(13,
++ 12,
++ 9,
++ 8,
++ 5,
++ 4,
++ 1,
++ 0,
++ 0x80,
++ 0x80,
++ 0x80,
++ 0x80,
++ 0x80,
++ 0x80,
++ 0x80,
++ 0x80,
++ 13,
++ 12,
++ 9,
++ 8,
++ 5,
++ 4,
++ 1,
++ 0,
++ 0x80,
++ 0x80,
++ 0x80,
++ 0x80,
++ 0x80,
++ 0x80,
++ 0x80,
++ 0x80);
++
++ __m256i complexVal1, complexVal2, iOutputVal;
++
++ unsigned int sixteenthPoints = num_points / 16;
++
++ for (number = 0; number < sixteenthPoints; number++) {
++ complexVal1 = _mm256_loadu_si256((__m256i*)complexVectorPtr);
++ complexVectorPtr += 16;
++ complexVal2 = _mm256_loadu_si256((__m256i*)complexVectorPtr);
++ complexVectorPtr += 16;
++
++ complexVal1 = _mm256_shuffle_epi8(complexVal1, iMoveMask1);
++ complexVal2 = _mm256_shuffle_epi8(complexVal2, iMoveMask2);
++
++ iOutputVal = _mm256_or_si256(complexVal1, complexVal2);
++ iOutputVal = _mm256_permute4x64_epi64(iOutputVal, 0xd8);
++
++ _mm256_storeu_si256((__m256i*)iBufferPtr, iOutputVal);
++
++ iBufferPtr += 16;
++ }
++
++ number = sixteenthPoints * 16;
++ for (; number < num_points; number++) {
++ *iBufferPtr++ = *complexVectorPtr++;
++ complexVectorPtr++;
++ }
+ }
+ #endif /* LV_HAVE_AVX2 */
+
+diff --git a/kernels/volk/volk_16ic_deinterleave_real_8i.h b/kernels/volk/volk_16ic_deinterleave_real_8i.h
+index 1022688..3d8e4ea 100644
+--- a/kernels/volk/volk_16ic_deinterleave_real_8i.h
++++ b/kernels/volk/volk_16ic_deinterleave_real_8i.h
+@@ -30,8 +30,8 @@
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+- * void volk_16ic_deinterleave_real_8i(int8_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points)
+- * \endcode
++ * void volk_16ic_deinterleave_real_8i(int8_t* iBuffer, const lv_16sc_t* complexVector,
++ * unsigned int num_points) \endcode
+ *
+ * \b Inputs
+ * \li complexVector: The complex input vector.
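For reference, a minimal usage sketch of the volk_16ic_deinterleave_real_8i dispatcher
quoted in the hunk above (not part of the patch; assumes volk.h and the
volk_malloc()/volk_free() helpers, illustrative lengths and values). Note that, as the
generic loop below shows, each output sample is the high byte of the real part
(arithmetic shift right by 8), so the result is a scaled-down 8-bit copy.

    #include <stdint.h>
    #include <stdio.h>
    #include <volk/volk.h>

    int main(void)
    {
        const unsigned int num_points = 33; /* not a multiple of 32, exercises the tail loop */
        const size_t alignment = volk_get_alignment();
        lv_16sc_t* cplx = (lv_16sc_t*)volk_malloc(num_points * sizeof(lv_16sc_t), alignment);
        int8_t* i_out = (int8_t*)volk_malloc(num_points * sizeof(int8_t), alignment);

        int16_t* raw = (int16_t*)cplx;
        for (unsigned int n = 0; n < num_points; n++) {
            raw[2 * n] = (int16_t)0x1234; /* real: high byte 0x12 = 18 survives */
            raw[2 * n + 1] = (int16_t)-1; /* imaginary part is discarded */
        }

        volk_16ic_deinterleave_real_8i(i_out, cplx, num_points);

        printf("i_out[0]=%d i_out[32]=%d\n", i_out[0], i_out[32]);

        volk_free(cplx);
        volk_free(i_out);
        return 0;
    }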
+@@ -61,54 +61,121 @@
+ #ifdef LV_HAVE_AVX2
+ #include <immintrin.h>
+
+-static inline void
+-volk_16ic_deinterleave_real_8i_a_avx2(int8_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points)
++static inline void volk_16ic_deinterleave_real_8i_a_avx2(int8_t* iBuffer,
++ const lv_16sc_t* complexVector,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+- const int8_t* complexVectorPtr = (int8_t*)complexVector;
+- int8_t* iBufferPtr = iBuffer;
+- __m256i iMoveMask1 = _mm256_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0);
+- __m256i iMoveMask2 = _mm256_set_epi8(13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
+- __m256i complexVal1, complexVal2, complexVal3, complexVal4, iOutputVal;
+-
+- unsigned int thirtysecondPoints = num_points / 32;
+-
+- for(number = 0; number < thirtysecondPoints; number++){
+- complexVal1 = _mm256_load_si256((__m256i*)complexVectorPtr); complexVectorPtr += 32;
+- complexVal2 = _mm256_load_si256((__m256i*)complexVectorPtr); complexVectorPtr += 32;
+-
+- complexVal3 = _mm256_load_si256((__m256i*)complexVectorPtr); complexVectorPtr += 32;
+- complexVal4 = _mm256_load_si256((__m256i*)complexVectorPtr); complexVectorPtr += 32;
+-
+- complexVal1 = _mm256_shuffle_epi8(complexVal1, iMoveMask1);
+- complexVal2 = _mm256_shuffle_epi8(complexVal2, iMoveMask2);
+-
+- complexVal1 = _mm256_or_si256(complexVal1, complexVal2);
+- complexVal1 = _mm256_permute4x64_epi64(complexVal1, 0xd8);
+-
+- complexVal3 = _mm256_shuffle_epi8(complexVal3, iMoveMask1);
+- complexVal4 = _mm256_shuffle_epi8(complexVal4, iMoveMask2);
+-
+- complexVal3 = _mm256_or_si256(complexVal3, complexVal4);
+- complexVal3 = _mm256_permute4x64_epi64(complexVal3, 0xd8);
+-
+- complexVal1 = _mm256_srai_epi16(complexVal1, 8);
+- complexVal3 = _mm256_srai_epi16(complexVal3, 8);
+-
+- iOutputVal = _mm256_packs_epi16(complexVal1, complexVal3);
+- iOutputVal = _mm256_permute4x64_epi64(iOutputVal, 0xd8);
+-
+- _mm256_store_si256((__m256i*)iBufferPtr, iOutputVal);
+-
+- iBufferPtr += 32;
+- }
+-
+- number = thirtysecondPoints * 32;
+- int16_t* int16ComplexVectorPtr = (int16_t*)complexVectorPtr;
+- for(; number < num_points; number++){
+- *iBufferPtr++ = ((int8_t)(*int16ComplexVectorPtr++ >> 8));
+- int16ComplexVectorPtr++;
+- }
++ unsigned int number = 0;
++ const int8_t* complexVectorPtr = (int8_t*)complexVector;
++ int8_t* iBufferPtr = iBuffer;
++ __m256i iMoveMask1 = _mm256_set_epi8(0x80,
++ 0x80,
++ 0x80,
++ 0x80,
++ 0x80,
++ 0x80,
++ 0x80,
++ 0x80,
++ 13,
++ 12,
++ 9,
++ 8,
++ 5,
++ 4,
++ 1,
++ 0,
++ 0x80,
++ 0x80,
++ 0x80,
++ 0x80,
++ 0x80,
++ 0x80,
++ 0x80,
++ 0x80,
++ 13,
++ 12,
++ 9,
++ 8,
++ 5,
++ 4,
++ 1,
++ 0);
++ __m256i iMoveMask2 = _mm256_set_epi8(13,
++ 12,
++ 9,
++ 8,
++ 5,
++ 4,
++ 1,
++ 0,
++ 0x80,
++ 0x80,
++ 0x80,
++ 0x80,
++ 0x80,
++ 0x80,
++ 0x80,
++ 0x80,
++ 13,
++ 12,
++ 9,
++ 8,
++ 5,
++ 4,
++ 1,
++ 0,
++ 0x80,
++ 0x80,
++ 0x80,
++ 0x80,
++ 0x80,
++ 0x80,
++ 0x80,
++ 0x80);
++ __m256i complexVal1, complexVal2, complexVal3, complexVal4, iOutputVal;
++
++ unsigned int thirtysecondPoints = num_points / 32;
++
++ for (number = 0; number < thirtysecondPoints; number++) {
++ complexVal1 = _mm256_load_si256((__m256i*)complexVectorPtr);
++ complexVectorPtr += 32;
++ complexVal2 = _mm256_load_si256((__m256i*)complexVectorPtr);
++ complexVectorPtr += 32;
++
++ complexVal3 = _mm256_load_si256((__m256i*)complexVectorPtr);
++ complexVectorPtr += 32;
++ complexVal4 = _mm256_load_si256((__m256i*)complexVectorPtr);
++ complexVectorPtr += 32;
++
++ complexVal1 = _mm256_shuffle_epi8(complexVal1, iMoveMask1);
++ complexVal2 = _mm256_shuffle_epi8(complexVal2, iMoveMask2);
++
++ complexVal1 = _mm256_or_si256(complexVal1, complexVal2);
++ complexVal1 = _mm256_permute4x64_epi64(complexVal1, 0xd8);
++
++ complexVal3 = _mm256_shuffle_epi8(complexVal3, iMoveMask1);
++ complexVal4 = _mm256_shuffle_epi8(complexVal4, iMoveMask2);
++
++ complexVal3 = _mm256_or_si256(complexVal3, complexVal4);
++ complexVal3 = _mm256_permute4x64_epi64(complexVal3, 0xd8);
++
++ complexVal1 = _mm256_srai_epi16(complexVal1, 8);
++ complexVal3 = _mm256_srai_epi16(complexVal3, 8);
++
++ iOutputVal = _mm256_packs_epi16(complexVal1, complexVal3);
++ iOutputVal = _mm256_permute4x64_epi64(iOutputVal, 0xd8);
++
++ _mm256_store_si256((__m256i*)iBufferPtr, iOutputVal);
++
++ iBufferPtr += 32;
++ }
++
++ number = thirtysecondPoints * 32;
++ int16_t* int16ComplexVectorPtr = (int16_t*)complexVectorPtr;
++ for (; number < num_points; number++) {
++ *iBufferPtr++ = ((int8_t)(*int16ComplexVectorPtr++ >> 8));
++ int16ComplexVectorPtr++;
++ }
+ }
+ #endif /* LV_HAVE_AVX2 */
+
+@@ -116,105 +183,116 @@ volk_16ic_deinterleave_real_8i_a_avx2(int8_t* iBuffer, const lv_16sc_t* complexV
+ #ifdef LV_HAVE_SSSE3
+ #include <tmmintrin.h>
+
+-static inline void
+-volk_16ic_deinterleave_real_8i_a_ssse3(int8_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points)
++static inline void volk_16ic_deinterleave_real_8i_a_ssse3(int8_t* iBuffer,
++ const lv_16sc_t* complexVector,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+- const int8_t* complexVectorPtr = (int8_t*)complexVector;
+- int8_t* iBufferPtr = iBuffer;
+- __m128i iMoveMask1 = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0);
+- __m128i iMoveMask2 = _mm_set_epi8(13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
+- __m128i complexVal1, complexVal2, complexVal3, complexVal4, iOutputVal;
++ unsigned int number = 0;
++ const int8_t* complexVectorPtr = (int8_t*)complexVector;
++ int8_t* iBufferPtr = iBuffer;
++ __m128i iMoveMask1 = _mm_set_epi8(
++ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0);
++ __m128i iMoveMask2 = _mm_set_epi8(
++ 13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
++ __m128i complexVal1, complexVal2, complexVal3, complexVal4, iOutputVal;
+
+- unsigned int sixteenthPoints = num_points / 16;
++ unsigned int sixteenthPoints = num_points / 16;
+
+- for(number = 0; number < sixteenthPoints; number++){
+- complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16;
+- complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16;
++ for (number = 0; number < sixteenthPoints; number++) {
++ complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr);
++ complexVectorPtr += 16;
++ complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr);
++ complexVectorPtr += 16;
+
+- complexVal3 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16;
+- complexVal4 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16;
++ complexVal3 = _mm_load_si128((__m128i*)complexVectorPtr);
++ complexVectorPtr += 16;
++ complexVal4 = _mm_load_si128((__m128i*)complexVectorPtr);
++ complexVectorPtr += 16;
+
+- complexVal1 = _mm_shuffle_epi8(complexVal1, iMoveMask1);
+- complexVal2 = _mm_shuffle_epi8(complexVal2, iMoveMask2);
++ complexVal1 = _mm_shuffle_epi8(complexVal1, iMoveMask1);
++ complexVal2 = _mm_shuffle_epi8(complexVal2, iMoveMask2);
+
+- complexVal1 = _mm_or_si128(complexVal1, complexVal2);
++ complexVal1 = _mm_or_si128(complexVal1, complexVal2);
+
+- complexVal3 = _mm_shuffle_epi8(complexVal3, iMoveMask1);
+- complexVal4 = _mm_shuffle_epi8(complexVal4, iMoveMask2);
++ complexVal3 = _mm_shuffle_epi8(complexVal3, iMoveMask1);
++ complexVal4 = _mm_shuffle_epi8(complexVal4, iMoveMask2);
+
+- complexVal3 = _mm_or_si128(complexVal3, complexVal4);
++ complexVal3 = _mm_or_si128(complexVal3, complexVal4);
+
+
+- complexVal1 = _mm_srai_epi16(complexVal1, 8);
+- complexVal3 = _mm_srai_epi16(complexVal3, 8);
++ complexVal1 = _mm_srai_epi16(complexVal1, 8);
++ complexVal3 = _mm_srai_epi16(complexVal3, 8);
+
+- iOutputVal = _mm_packs_epi16(complexVal1, complexVal3);
++ iOutputVal = _mm_packs_epi16(complexVal1, complexVal3);
+
+- _mm_store_si128((__m128i*)iBufferPtr, iOutputVal);
++ _mm_store_si128((__m128i*)iBufferPtr, iOutputVal);
+
+- iBufferPtr += 16;
+- }
++ iBufferPtr += 16;
++ }
+
+- number = sixteenthPoints * 16;
+- int16_t* int16ComplexVectorPtr = (int16_t*)complexVectorPtr;
+- for(; number < num_points; number++){
+- *iBufferPtr++ = ((int8_t)(*int16ComplexVectorPtr++ >> 8));
+- int16ComplexVectorPtr++;
+- }
++ number = sixteenthPoints * 16;
++ int16_t* int16ComplexVectorPtr = (int16_t*)complexVectorPtr;
++ for (; number < num_points; number++) {
++ *iBufferPtr++ = ((int8_t)(*int16ComplexVectorPtr++ >> 8));
++ int16ComplexVectorPtr++;
++ }
+ }
+ #endif /* LV_HAVE_SSSE3 */
+
+ #ifdef LV_HAVE_GENERIC
+
+-static inline void
+-volk_16ic_deinterleave_real_8i_generic(int8_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points)
++static inline void volk_16ic_deinterleave_real_8i_generic(int8_t* iBuffer,
++ const lv_16sc_t* complexVector,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+- int16_t* complexVectorPtr = (int16_t*)complexVector;
+- int8_t* iBufferPtr = iBuffer;
+- for(number = 0; number < num_points; number++){
+- *iBufferPtr++ = ((int8_t)(*complexVectorPtr++ >> 8));
+- complexVectorPtr++;
+- }
++ unsigned int number = 0;
++ int16_t* complexVectorPtr = (int16_t*)complexVector;
++ int8_t* iBufferPtr = iBuffer;
++ for (number = 0; number < num_points; number++) {
++ *iBufferPtr++ = ((int8_t)(*complexVectorPtr++ >> 8));
++ complexVectorPtr++;
++ }
+ }
+ #endif /* LV_HAVE_GENERIC */
+
+ #ifdef LV_HAVE_NEON
+ #include <arm_neon.h>
+
+-static inline void
+-volk_16ic_deinterleave_real_8i_neon(int8_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points)
++static inline void volk_16ic_deinterleave_real_8i_neon(int8_t* iBuffer,
++ const lv_16sc_t* complexVector,
++ unsigned int num_points)
+ {
+- const int16_t* complexVectorPtr = (const int16_t*)complexVector;
+- int8_t* iBufferPtr = iBuffer;
+- unsigned int eighth_points = num_points / 8;
+- unsigned int number;
+-
+- int16x8x2_t complexInput;
+- int8x8_t realOutput;
+- for(number = 0; number < eighth_points; number++){
+- complexInput = vld2q_s16(complexVectorPtr);
+- realOutput = vshrn_n_s16(complexInput.val[0], 8);
+- vst1_s8(iBufferPtr, realOutput);
+- complexVectorPtr += 16;
+- iBufferPtr += 8;
+- }
+-
+- for(number = eighth_points*8; number < num_points; number++){
+- *iBufferPtr++ = ((int8_t)(*complexVectorPtr++ >> 8));
+- complexVectorPtr++;
+- }
++ const int16_t* complexVectorPtr = (const int16_t*)complexVector;
++ int8_t* iBufferPtr = iBuffer;
++ unsigned int eighth_points = num_points / 8;
++ unsigned int number;
++
++ int16x8x2_t complexInput;
++ int8x8_t realOutput;
++ for (number = 0; number < eighth_points; number++) {
++ complexInput = vld2q_s16(complexVectorPtr);
++ realOutput = vshrn_n_s16(complexInput.val[0], 8);
++ vst1_s8(iBufferPtr, realOutput);
++ complexVectorPtr += 16;
++ iBufferPtr += 8;
++ }
++
++ for (number = eighth_points * 8; number < num_points; number++) {
++ *iBufferPtr++ = ((int8_t)(*complexVectorPtr++ >> 8));
++ complexVectorPtr++;
++ }
+ }
+ #endif
+
+ #ifdef LV_HAVE_ORC
+
+-extern void
+-volk_16ic_deinterleave_real_8i_a_orc_impl(int8_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points);
++extern void volk_16ic_deinterleave_real_8i_a_orc_impl(int8_t* iBuffer,
++ const lv_16sc_t* complexVector,
++ unsigned int num_points);
+
+-static inline void
+-volk_16ic_deinterleave_real_8i_u_orc(int8_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points)
++static inline void volk_16ic_deinterleave_real_8i_u_orc(int8_t* iBuffer,
++ const lv_16sc_t* complexVector,
++ unsigned int num_points)
+ {
+ volk_16ic_deinterleave_real_8i_a_orc_impl(iBuffer, complexVector, num_points);
+ }
+@@ -233,54 +311,121 @@ volk_16ic_deinterleave_real_8i_u_orc(int8_t* iBuffer, const lv_16sc_t* complexVe
+ #ifdef LV_HAVE_AVX2
+ #include <immintrin.h>
+
+-static inline void
+-volk_16ic_deinterleave_real_8i_u_avx2(int8_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points)
++static inline void volk_16ic_deinterleave_real_8i_u_avx2(int8_t* iBuffer,
++ const lv_16sc_t* complexVector,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+- const int8_t* complexVectorPtr = (int8_t*)complexVector;
+- int8_t* iBufferPtr = iBuffer;
+- __m256i iMoveMask1 = _mm256_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0);
+- __m256i iMoveMask2 = _mm256_set_epi8(13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
+- __m256i complexVal1, complexVal2, complexVal3, complexVal4, iOutputVal;
+-
+- unsigned int thirtysecondPoints = num_points / 32;
+-
+- for(number = 0; number < thirtysecondPoints; number++){
+- complexVal1 = _mm256_loadu_si256((__m256i*)complexVectorPtr); complexVectorPtr += 32;
+- complexVal2 = _mm256_loadu_si256((__m256i*)complexVectorPtr); complexVectorPtr += 32;
+-
+- complexVal3 = _mm256_loadu_si256((__m256i*)complexVectorPtr); complexVectorPtr += 32;
+- complexVal4 = _mm256_loadu_si256((__m256i*)complexVectorPtr); complexVectorPtr += 32;
+-
+- complexVal1 = _mm256_shuffle_epi8(complexVal1, iMoveMask1);
+- complexVal2 = _mm256_shuffle_epi8(complexVal2, iMoveMask2);
+-
+- complexVal1 = _mm256_or_si256(complexVal1, complexVal2);
+- complexVal1 = _mm256_permute4x64_epi64(complexVal1, 0xd8);
+-
+- complexVal3 = _mm256_shuffle_epi8(complexVal3, iMoveMask1);
+- complexVal4 = _mm256_shuffle_epi8(complexVal4, iMoveMask2);
+-
+- complexVal3 = _mm256_or_si256(complexVal3, complexVal4);
+- complexVal3 = _mm256_permute4x64_epi64(complexVal3, 0xd8);
+-
+- complexVal1 = _mm256_srai_epi16(complexVal1, 8);
+- complexVal3 = _mm256_srai_epi16(complexVal3, 8);
+-
+- iOutputVal = _mm256_packs_epi16(complexVal1, complexVal3);
+- iOutputVal = _mm256_permute4x64_epi64(iOutputVal, 0xd8);
+-
+- _mm256_storeu_si256((__m256i*)iBufferPtr, iOutputVal);
+-
+- iBufferPtr += 32;
+- }
+-
+- number = thirtysecondPoints * 32;
+- int16_t* int16ComplexVectorPtr = (int16_t*)complexVectorPtr;
+- for(; number < num_points; number++){
+- *iBufferPtr++ = ((int8_t)(*int16ComplexVectorPtr++ >> 8));
+- int16ComplexVectorPtr++;
+- }
++ unsigned int number = 0;
++ const int8_t* complexVectorPtr = (int8_t*)complexVector;
++ int8_t* iBufferPtr = iBuffer;
++ __m256i iMoveMask1 = _mm256_set_epi8(0x80,
++ 0x80,
++ 0x80,
++ 0x80,
++ 0x80,
++ 0x80,
++ 0x80,
++ 0x80,
++ 13,
++ 12,
++ 9,
++ 8,
++ 5,
++ 4,
++ 1,
++ 0,
++ 0x80,
++ 0x80,
++ 0x80,
++ 0x80,
++ 0x80,
++ 0x80,
++ 0x80,
++ 0x80,
++ 13,
++ 12,
++ 9,
++ 8,
++ 5,
++ 4,
++ 1,
++ 0);
++ __m256i iMoveMask2 = _mm256_set_epi8(13,
++ 12,
++ 9,
++ 8,
++ 5,
++ 4,
++ 1,
++ 0,
++ 0x80,
++ 0x80,
++ 0x80,
++ 0x80,
++ 0x80,
++ 0x80,
++ 0x80,
++ 0x80,
++ 13,
++ 12,
++ 9,
++ 8,
++ 5,
++ 4,
++ 1,
++ 0,
++ 0x80,
++ 0x80,
++ 0x80,
++ 0x80,
++ 0x80,
++ 0x80,
++ 0x80,
++ 0x80);
++ __m256i complexVal1, complexVal2, complexVal3, complexVal4, iOutputVal;
++
++ unsigned int thirtysecondPoints = num_points / 32;
++
++ for (number = 0; number < thirtysecondPoints; number++) {
++ complexVal1 = _mm256_loadu_si256((__m256i*)complexVectorPtr);
++ complexVectorPtr += 32;
++ complexVal2 = _mm256_loadu_si256((__m256i*)complexVectorPtr);
++ complexVectorPtr += 32;
++
++ complexVal3 = _mm256_loadu_si256((__m256i*)complexVectorPtr);
++ complexVectorPtr += 32;
++ complexVal4 = _mm256_loadu_si256((__m256i*)complexVectorPtr);
++ complexVectorPtr += 32;
++
++ complexVal1 = _mm256_shuffle_epi8(complexVal1, iMoveMask1);
++ complexVal2 = _mm256_shuffle_epi8(complexVal2, iMoveMask2);
++
++ complexVal1 = _mm256_or_si256(complexVal1, complexVal2);
++ complexVal1 = _mm256_permute4x64_epi64(complexVal1, 0xd8);
++
++ complexVal3 = _mm256_shuffle_epi8(complexVal3, iMoveMask1);
++ complexVal4 = _mm256_shuffle_epi8(complexVal4, iMoveMask2);
++
++ complexVal3 = _mm256_or_si256(complexVal3, complexVal4);
++ complexVal3 = _mm256_permute4x64_epi64(complexVal3, 0xd8);
++
++ complexVal1 = _mm256_srai_epi16(complexVal1, 8);
++ complexVal3 = _mm256_srai_epi16(complexVal3, 8);
++
++ iOutputVal = _mm256_packs_epi16(complexVal1, complexVal3);
++ iOutputVal = _mm256_permute4x64_epi64(iOutputVal, 0xd8);
++
++ _mm256_storeu_si256((__m256i*)iBufferPtr, iOutputVal);
++
++ iBufferPtr += 32;
++ }
++
++ number = thirtysecondPoints * 32;
++ int16_t* int16ComplexVectorPtr = (int16_t*)complexVectorPtr;
++ for (; number < num_points; number++) {
++ *iBufferPtr++ = ((int8_t)(*int16ComplexVectorPtr++ >> 8));
++ int16ComplexVectorPtr++;
++ }
+ }
+ #endif /* LV_HAVE_AVX2 */
+ #endif /* INCLUDED_volk_16ic_deinterleave_real_8i_u_H */
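+
+All four implementations above share one contract: for every complex 16-bit sample, only the real part is kept, arithmetically shifted right by 8 so it fits an int8_t. The following usage sketch is not part of the patch; it assumes the generated volk.h dispatcher volk_16ic_deinterleave_real_8i and the volk_get_alignment()/volk_malloc()/volk_free() helpers, none of which appear in this hunk.
+
+#include <volk/volk.h> /* assumed: generated dispatcher header */
+#include <stdio.h>
+
+int main(void)
+{
+    const unsigned int num_points = 16;
+    const size_t alignment = volk_get_alignment(); /* assumed VOLK helper */
+
+    /* lv_16sc_t holds interleaved 16-bit I/Q samples, as the kernels above assume. */
+    lv_16sc_t* in = (lv_16sc_t*)volk_malloc(num_points * sizeof(lv_16sc_t), alignment);
+    int8_t* out = (int8_t*)volk_malloc(num_points * sizeof(int8_t), alignment);
+
+    int16_t* raw = (int16_t*)in; /* interleaved I,Q view, mirroring the kernel casts */
+    for (unsigned int n = 0; n < num_points; n++) {
+        raw[2 * n] = (int16_t)(n << 8); /* real part */
+        raw[2 * n + 1] = -1;            /* imaginary part, discarded by this kernel */
+    }
+
+    /* The dispatcher picks NEON/AVX2/... at run time; each output is real >> 8. */
+    volk_16ic_deinterleave_real_8i(out, in, num_points);
+
+    for (unsigned int n = 0; n < num_points; n++)
+        printf("%d ", out[n]); /* expected: 0 1 2 ... 15 */
+    printf("\n");
+
+    volk_free(in);
+    volk_free(out);
+    return 0;
+}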
+diff --git a/kernels/volk/volk_16ic_magnitude_16i.h b/kernels/volk/volk_16ic_magnitude_16i.h
+index bbe72a8..35b40cb 100644
+--- a/kernels/volk/volk_16ic_magnitude_16i.h
++++ b/kernels/volk/volk_16ic_magnitude_16i.h
+@@ -30,8 +30,8 @@
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+- * void volk_16ic_magnitude_16i(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points)
+- * \endcode
++ * void volk_16ic_magnitude_16i(int16_t* magnitudeVector, const lv_16sc_t* complexVector,
++ *                              unsigned int num_points)
++ * \endcode
+ *
+ * \b Inputs
+ * \li complexVector: The complex input vector.
+@@ -54,242 +54,255 @@
+ #ifndef INCLUDED_volk_16ic_magnitude_16i_a_H
+ #define INCLUDED_volk_16ic_magnitude_16i_a_H
+
+-#include <volk/volk_common.h>
+ #include <inttypes.h>
+-#include <stdio.h>
+-#include <math.h>
+ #include <limits.h>
++#include <math.h>
++#include <stdio.h>
++#include <volk/volk_common.h>
+
+ #ifdef LV_HAVE_AVX2
+ #include <immintrin.h>
+
+-static inline void
+-volk_16ic_magnitude_16i_a_avx2(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points)
++static inline void volk_16ic_magnitude_16i_a_avx2(int16_t* magnitudeVector,
++ const lv_16sc_t* complexVector,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+- const unsigned int eighthPoints = num_points / 8;
+-
+- const int16_t* complexVectorPtr = (const int16_t*)complexVector;
+- int16_t* magnitudeVectorPtr = magnitudeVector;
+-
+- __m256 vScalar = _mm256_set1_ps(SHRT_MAX);
+- __m256 invScalar = _mm256_set1_ps(1.0f/SHRT_MAX);
+- __m256i int1, int2;
+- __m128i short1, short2;
+- __m256 cplxValue1, cplxValue2, result;
+- __m256i idx = _mm256_set_epi32(0,0,0,0,5,1,4,0);
+-
+- for(;number < eighthPoints; number++){
+-
+- int1 = _mm256_load_si256((__m256i*)complexVectorPtr);
+- complexVectorPtr += 16;
+- short1 = _mm256_extracti128_si256(int1,0);
+- short2 = _mm256_extracti128_si256(int1,1);
+-
+- int1 = _mm256_cvtepi16_epi32(short1);
+- int2 = _mm256_cvtepi16_epi32(short2);
+- cplxValue1 = _mm256_cvtepi32_ps(int1);
+- cplxValue2 = _mm256_cvtepi32_ps(int2);
+-
+- cplxValue1 = _mm256_mul_ps(cplxValue1, invScalar);
+- cplxValue2 = _mm256_mul_ps(cplxValue2, invScalar);
+-
+- cplxValue1 = _mm256_mul_ps(cplxValue1, cplxValue1); // Square the values
+- cplxValue2 = _mm256_mul_ps(cplxValue2, cplxValue2); // Square the Values
+-
+- result = _mm256_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values
+-
+- result = _mm256_sqrt_ps(result); // Square root the values
+-
+- result = _mm256_mul_ps(result, vScalar); // Scale the results
+-
+- int1 = _mm256_cvtps_epi32(result);
+- int1 = _mm256_packs_epi32(int1, int1);
+- int1 = _mm256_permutevar8x32_epi32(int1, idx); //permute to compensate for shuffling in hadd and packs
+- short1 = _mm256_extracti128_si256(int1, 0);
+- _mm_store_si128((__m128i*)magnitudeVectorPtr,short1);
+- magnitudeVectorPtr += 8;
+- }
+-
+- number = eighthPoints * 8;
+- magnitudeVectorPtr = &magnitudeVector[number];
+- complexVectorPtr = (const int16_t*)&complexVector[number];
+- for(; number < num_points; number++){
+- const float val1Real = (float)(*complexVectorPtr++) / SHRT_MAX;
+- const float val1Imag = (float)(*complexVectorPtr++) / SHRT_MAX;
+- const float val1Result = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)) * SHRT_MAX;
+- *magnitudeVectorPtr++ = (int16_t)rintf(val1Result);
+- }
++ unsigned int number = 0;
++ const unsigned int eighthPoints = num_points / 8;
++
++ const int16_t* complexVectorPtr = (const int16_t*)complexVector;
++ int16_t* magnitudeVectorPtr = magnitudeVector;
++
++ __m256 vScalar = _mm256_set1_ps(SHRT_MAX);
++ __m256 invScalar = _mm256_set1_ps(1.0f / SHRT_MAX);
++ __m256i int1, int2;
++ __m128i short1, short2;
++ __m256 cplxValue1, cplxValue2, result;
++ __m256i idx = _mm256_set_epi32(0, 0, 0, 0, 5, 1, 4, 0);
++
++ for (; number < eighthPoints; number++) {
++
++ int1 = _mm256_load_si256((__m256i*)complexVectorPtr);
++ complexVectorPtr += 16;
++ short1 = _mm256_extracti128_si256(int1, 0);
++ short2 = _mm256_extracti128_si256(int1, 1);
++
++ int1 = _mm256_cvtepi16_epi32(short1);
++ int2 = _mm256_cvtepi16_epi32(short2);
++ cplxValue1 = _mm256_cvtepi32_ps(int1);
++ cplxValue2 = _mm256_cvtepi32_ps(int2);
++
++ cplxValue1 = _mm256_mul_ps(cplxValue1, invScalar);
++ cplxValue2 = _mm256_mul_ps(cplxValue2, invScalar);
++
++ cplxValue1 = _mm256_mul_ps(cplxValue1, cplxValue1); // Square the values
++ cplxValue2 = _mm256_mul_ps(cplxValue2, cplxValue2); // Square the Values
++
++ result = _mm256_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values
++
++ result = _mm256_sqrt_ps(result); // Square root the values
++
++ result = _mm256_mul_ps(result, vScalar); // Scale the results
++
++ int1 = _mm256_cvtps_epi32(result);
++ int1 = _mm256_packs_epi32(int1, int1);
++ int1 = _mm256_permutevar8x32_epi32(
++ int1, idx); // permute to compensate for shuffling in hadd and packs
++ short1 = _mm256_extracti128_si256(int1, 0);
++ _mm_store_si128((__m128i*)magnitudeVectorPtr, short1);
++ magnitudeVectorPtr += 8;
++ }
++
++ number = eighthPoints * 8;
++ magnitudeVectorPtr = &magnitudeVector[number];
++ complexVectorPtr = (const int16_t*)&complexVector[number];
++ for (; number < num_points; number++) {
++ const float val1Real = (float)(*complexVectorPtr++) / SHRT_MAX;
++ const float val1Imag = (float)(*complexVectorPtr++) / SHRT_MAX;
++ const float val1Result =
++ sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)) * SHRT_MAX;
++ *magnitudeVectorPtr++ = (int16_t)rintf(val1Result);
++ }
+ }
+ #endif /* LV_HAVE_AVX2 */
+
+ #ifdef LV_HAVE_SSE3
+ #include <pmmintrin.h>
+
+-static inline void
+-volk_16ic_magnitude_16i_a_sse3(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points)
++static inline void volk_16ic_magnitude_16i_a_sse3(int16_t* magnitudeVector,
++ const lv_16sc_t* complexVector,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+- const unsigned int quarterPoints = num_points / 4;
++ unsigned int number = 0;
++ const unsigned int quarterPoints = num_points / 4;
+
+- const int16_t* complexVectorPtr = (const int16_t*)complexVector;
+- int16_t* magnitudeVectorPtr = magnitudeVector;
++ const int16_t* complexVectorPtr = (const int16_t*)complexVector;
++ int16_t* magnitudeVectorPtr = magnitudeVector;
+
+- __m128 vScalar = _mm_set_ps1(SHRT_MAX);
+- __m128 invScalar = _mm_set_ps1(1.0f/SHRT_MAX);
++ __m128 vScalar = _mm_set_ps1(SHRT_MAX);
++ __m128 invScalar = _mm_set_ps1(1.0f / SHRT_MAX);
+
+- __m128 cplxValue1, cplxValue2, result;
++ __m128 cplxValue1, cplxValue2, result;
+
+- __VOLK_ATTR_ALIGNED(16) float inputFloatBuffer[8];
+- __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4];
++ __VOLK_ATTR_ALIGNED(16) float inputFloatBuffer[8];
++ __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4];
+
+- for(;number < quarterPoints; number++){
++ for (; number < quarterPoints; number++) {
+
+- inputFloatBuffer[0] = (float)(complexVectorPtr[0]);
+- inputFloatBuffer[1] = (float)(complexVectorPtr[1]);
+- inputFloatBuffer[2] = (float)(complexVectorPtr[2]);
+- inputFloatBuffer[3] = (float)(complexVectorPtr[3]);
++ inputFloatBuffer[0] = (float)(complexVectorPtr[0]);
++ inputFloatBuffer[1] = (float)(complexVectorPtr[1]);
++ inputFloatBuffer[2] = (float)(complexVectorPtr[2]);
++ inputFloatBuffer[3] = (float)(complexVectorPtr[3]);
+
+- inputFloatBuffer[4] = (float)(complexVectorPtr[4]);
+- inputFloatBuffer[5] = (float)(complexVectorPtr[5]);
+- inputFloatBuffer[6] = (float)(complexVectorPtr[6]);
+- inputFloatBuffer[7] = (float)(complexVectorPtr[7]);
++ inputFloatBuffer[4] = (float)(complexVectorPtr[4]);
++ inputFloatBuffer[5] = (float)(complexVectorPtr[5]);
++ inputFloatBuffer[6] = (float)(complexVectorPtr[6]);
++ inputFloatBuffer[7] = (float)(complexVectorPtr[7]);
+
+- cplxValue1 = _mm_load_ps(&inputFloatBuffer[0]);
+- cplxValue2 = _mm_load_ps(&inputFloatBuffer[4]);
++ cplxValue1 = _mm_load_ps(&inputFloatBuffer[0]);
++ cplxValue2 = _mm_load_ps(&inputFloatBuffer[4]);
+
+- complexVectorPtr += 8;
++ complexVectorPtr += 8;
+
+- cplxValue1 = _mm_mul_ps(cplxValue1, invScalar);
+- cplxValue2 = _mm_mul_ps(cplxValue2, invScalar);
++ cplxValue1 = _mm_mul_ps(cplxValue1, invScalar);
++ cplxValue2 = _mm_mul_ps(cplxValue2, invScalar);
+
+- cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values
+- cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values
++ cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values
++ cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values
+
+- result = _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values
++ result = _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values
+
+- result = _mm_sqrt_ps(result); // Square root the values
++ result = _mm_sqrt_ps(result); // Square root the values
+
+- result = _mm_mul_ps(result, vScalar); // Scale the results
++ result = _mm_mul_ps(result, vScalar); // Scale the results
+
+- _mm_store_ps(outputFloatBuffer, result);
+- *magnitudeVectorPtr++ = (int16_t)rintf(outputFloatBuffer[0]);
+- *magnitudeVectorPtr++ = (int16_t)rintf(outputFloatBuffer[1]);
+- *magnitudeVectorPtr++ = (int16_t)rintf(outputFloatBuffer[2]);
+- *magnitudeVectorPtr++ = (int16_t)rintf(outputFloatBuffer[3]);
+- }
++ _mm_store_ps(outputFloatBuffer, result);
++ *magnitudeVectorPtr++ = (int16_t)rintf(outputFloatBuffer[0]);
++ *magnitudeVectorPtr++ = (int16_t)rintf(outputFloatBuffer[1]);
++ *magnitudeVectorPtr++ = (int16_t)rintf(outputFloatBuffer[2]);
++ *magnitudeVectorPtr++ = (int16_t)rintf(outputFloatBuffer[3]);
++ }
+
+- number = quarterPoints * 4;
+- magnitudeVectorPtr = &magnitudeVector[number];
+- complexVectorPtr = (const int16_t*)&complexVector[number];
+- for(; number < num_points; number++){
+- const float val1Real = (float)(*complexVectorPtr++) / SHRT_MAX;
+- const float val1Imag = (float)(*complexVectorPtr++) / SHRT_MAX;
+- const float val1Result = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)) * SHRT_MAX;
+- *magnitudeVectorPtr++ = (int16_t)rintf(val1Result);
+- }
++ number = quarterPoints * 4;
++ magnitudeVectorPtr = &magnitudeVector[number];
++ complexVectorPtr = (const int16_t*)&complexVector[number];
++ for (; number < num_points; number++) {
++ const float val1Real = (float)(*complexVectorPtr++) / SHRT_MAX;
++ const float val1Imag = (float)(*complexVectorPtr++) / SHRT_MAX;
++ const float val1Result =
++ sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)) * SHRT_MAX;
++ *magnitudeVectorPtr++ = (int16_t)rintf(val1Result);
++ }
+ }
+ #endif /* LV_HAVE_SSE3 */
+
+ #ifdef LV_HAVE_SSE
+ #include <xmmintrin.h>
+
+-static inline void
+-volk_16ic_magnitude_16i_a_sse(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points)
++static inline void volk_16ic_magnitude_16i_a_sse(int16_t* magnitudeVector,
++ const lv_16sc_t* complexVector,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+- const unsigned int quarterPoints = num_points / 4;
++ unsigned int number = 0;
++ const unsigned int quarterPoints = num_points / 4;
+
+- const int16_t* complexVectorPtr = (const int16_t*)complexVector;
+- int16_t* magnitudeVectorPtr = magnitudeVector;
++ const int16_t* complexVectorPtr = (const int16_t*)complexVector;
++ int16_t* magnitudeVectorPtr = magnitudeVector;
+
+- __m128 vScalar = _mm_set_ps1(SHRT_MAX);
+- __m128 invScalar = _mm_set_ps1(1.0f/SHRT_MAX);
++ __m128 vScalar = _mm_set_ps1(SHRT_MAX);
++ __m128 invScalar = _mm_set_ps1(1.0f / SHRT_MAX);
+
+- __m128 cplxValue1, cplxValue2, iValue, qValue, result;
++ __m128 cplxValue1, cplxValue2, iValue, qValue, result;
+
+- __VOLK_ATTR_ALIGNED(16) float inputFloatBuffer[4];
+- __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4];
++ __VOLK_ATTR_ALIGNED(16) float inputFloatBuffer[4];
++ __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4];
+
+- for(;number < quarterPoints; number++){
++ for (; number < quarterPoints; number++) {
+
+- inputFloatBuffer[0] = (float)(complexVectorPtr[0]);
+- inputFloatBuffer[1] = (float)(complexVectorPtr[1]);
+- inputFloatBuffer[2] = (float)(complexVectorPtr[2]);
+- inputFloatBuffer[3] = (float)(complexVectorPtr[3]);
++ inputFloatBuffer[0] = (float)(complexVectorPtr[0]);
++ inputFloatBuffer[1] = (float)(complexVectorPtr[1]);
++ inputFloatBuffer[2] = (float)(complexVectorPtr[2]);
++ inputFloatBuffer[3] = (float)(complexVectorPtr[3]);
+
+- cplxValue1 = _mm_load_ps(inputFloatBuffer);
+- complexVectorPtr += 4;
++ cplxValue1 = _mm_load_ps(inputFloatBuffer);
++ complexVectorPtr += 4;
+
+- inputFloatBuffer[0] = (float)(complexVectorPtr[0]);
+- inputFloatBuffer[1] = (float)(complexVectorPtr[1]);
+- inputFloatBuffer[2] = (float)(complexVectorPtr[2]);
+- inputFloatBuffer[3] = (float)(complexVectorPtr[3]);
++ inputFloatBuffer[0] = (float)(complexVectorPtr[0]);
++ inputFloatBuffer[1] = (float)(complexVectorPtr[1]);
++ inputFloatBuffer[2] = (float)(complexVectorPtr[2]);
++ inputFloatBuffer[3] = (float)(complexVectorPtr[3]);
+
+- cplxValue2 = _mm_load_ps(inputFloatBuffer);
+- complexVectorPtr += 4;
++ cplxValue2 = _mm_load_ps(inputFloatBuffer);
++ complexVectorPtr += 4;
+
+- cplxValue1 = _mm_mul_ps(cplxValue1, invScalar);
+- cplxValue2 = _mm_mul_ps(cplxValue2, invScalar);
++ cplxValue1 = _mm_mul_ps(cplxValue1, invScalar);
++ cplxValue2 = _mm_mul_ps(cplxValue2, invScalar);
+
+- // Arrange in i1i2i3i4 format
+- iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0));
+- // Arrange in q1q2q3q4 format
+- qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1));
++ // Arrange in i1i2i3i4 format
++ iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2, 0, 2, 0));
++ // Arrange in q1q2q3q4 format
++ qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3, 1, 3, 1));
+
+- iValue = _mm_mul_ps(iValue, iValue); // Square the I values
+- qValue = _mm_mul_ps(qValue, qValue); // Square the Q Values
++ iValue = _mm_mul_ps(iValue, iValue); // Square the I values
++ qValue = _mm_mul_ps(qValue, qValue); // Square the Q Values
+
+- result = _mm_add_ps(iValue, qValue); // Add the I2 and Q2 values
++ result = _mm_add_ps(iValue, qValue); // Add the I2 and Q2 values
+
+- result = _mm_sqrt_ps(result); // Square root the values
++ result = _mm_sqrt_ps(result); // Square root the values
+
+- result = _mm_mul_ps(result, vScalar); // Scale the results
++ result = _mm_mul_ps(result, vScalar); // Scale the results
+
+- _mm_store_ps(outputFloatBuffer, result);
+- *magnitudeVectorPtr++ = (int16_t)rintf(outputFloatBuffer[0]);
+- *magnitudeVectorPtr++ = (int16_t)rintf(outputFloatBuffer[1]);
+- *magnitudeVectorPtr++ = (int16_t)rintf(outputFloatBuffer[2]);
+- *magnitudeVectorPtr++ = (int16_t)rintf(outputFloatBuffer[3]);
+- }
++ _mm_store_ps(outputFloatBuffer, result);
++ *magnitudeVectorPtr++ = (int16_t)rintf(outputFloatBuffer[0]);
++ *magnitudeVectorPtr++ = (int16_t)rintf(outputFloatBuffer[1]);
++ *magnitudeVectorPtr++ = (int16_t)rintf(outputFloatBuffer[2]);
++ *magnitudeVectorPtr++ = (int16_t)rintf(outputFloatBuffer[3]);
++ }
+
+- number = quarterPoints * 4;
+- magnitudeVectorPtr = &magnitudeVector[number];
+- complexVectorPtr = (const int16_t*)&complexVector[number];
+- for(; number < num_points; number++){
+- const float val1Real = (float)(*complexVectorPtr++) / SHRT_MAX;
+- const float val1Imag = (float)(*complexVectorPtr++) / SHRT_MAX;
+- const float val1Result = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)) * SHRT_MAX;
+- *magnitudeVectorPtr++ = (int16_t)rintf(val1Result);
+- }
++ number = quarterPoints * 4;
++ magnitudeVectorPtr = &magnitudeVector[number];
++ complexVectorPtr = (const int16_t*)&complexVector[number];
++ for (; number < num_points; number++) {
++ const float val1Real = (float)(*complexVectorPtr++) / SHRT_MAX;
++ const float val1Imag = (float)(*complexVectorPtr++) / SHRT_MAX;
++ const float val1Result =
++ sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)) * SHRT_MAX;
++ *magnitudeVectorPtr++ = (int16_t)rintf(val1Result);
++ }
+ }
+ #endif /* LV_HAVE_SSE */
+
+ #ifdef LV_HAVE_GENERIC
+
+-static inline void
+-volk_16ic_magnitude_16i_generic(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points)
++static inline void volk_16ic_magnitude_16i_generic(int16_t* magnitudeVector,
++ const lv_16sc_t* complexVector,
++ unsigned int num_points)
+ {
+- const int16_t* complexVectorPtr = (const int16_t*)complexVector;
+- int16_t* magnitudeVectorPtr = magnitudeVector;
+- unsigned int number = 0;
+- const float scalar = SHRT_MAX;
+- for(number = 0; number < num_points; number++){
+- float real = ((float)(*complexVectorPtr++)) / scalar;
+- float imag = ((float)(*complexVectorPtr++)) / scalar;
+- *magnitudeVectorPtr++ = (int16_t)rintf(sqrtf((real*real) + (imag*imag)) * scalar);
+- }
++ const int16_t* complexVectorPtr = (const int16_t*)complexVector;
++ int16_t* magnitudeVectorPtr = magnitudeVector;
++ unsigned int number = 0;
++ const float scalar = SHRT_MAX;
++ for (number = 0; number < num_points; number++) {
++ float real = ((float)(*complexVectorPtr++)) / scalar;
++ float imag = ((float)(*complexVectorPtr++)) / scalar;
++ *magnitudeVectorPtr++ =
++ (int16_t)rintf(sqrtf((real * real) + (imag * imag)) * scalar);
++ }
+ }
+ #endif /* LV_HAVE_GENERIC */
+
+ #ifdef LV_HAVE_ORC_DISABLED
+-extern void
+-volk_16ic_magnitude_16i_a_orc_impl(int16_t* magnitudeVector, const lv_16sc_t* complexVector, float scalar, unsigned int num_points);
+-
+-static inline void
+-volk_16ic_magnitude_16i_u_orc(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points)
++extern void volk_16ic_magnitude_16i_a_orc_impl(int16_t* magnitudeVector,
++ const lv_16sc_t* complexVector,
++ float scalar,
++ unsigned int num_points);
++
++static inline void volk_16ic_magnitude_16i_u_orc(int16_t* magnitudeVector,
++ const lv_16sc_t* complexVector,
++ unsigned int num_points)
+ {
+- volk_16ic_magnitude_16i_a_orc_impl(magnitudeVector, complexVector, SHRT_MAX, num_points);
++ volk_16ic_magnitude_16i_a_orc_impl(
++ magnitudeVector, complexVector, SHRT_MAX, num_points);
+ }
+ #endif /* LV_HAVE_ORC_DISABLED */
+
+@@ -300,71 +313,74 @@ volk_16ic_magnitude_16i_u_orc(int16_t* magnitudeVector, const lv_16sc_t* complex
+ #ifndef INCLUDED_volk_16ic_magnitude_16i_u_H
+ #define INCLUDED_volk_16ic_magnitude_16i_u_H
+
+-#include <volk/volk_common.h>
+ #include <inttypes.h>
+-#include <stdio.h>
+ #include <math.h>
++#include <stdio.h>
++#include <volk/volk_common.h>
+
+ #ifdef LV_HAVE_AVX2
+ #include <immintrin.h>
+
+-static inline void
+-volk_16ic_magnitude_16i_u_avx2(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points)
++static inline void volk_16ic_magnitude_16i_u_avx2(int16_t* magnitudeVector,
++ const lv_16sc_t* complexVector,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+- const unsigned int eighthPoints = num_points / 8;
+-
+- const int16_t* complexVectorPtr = (const int16_t*)complexVector;
+- int16_t* magnitudeVectorPtr = magnitudeVector;
+-
+- __m256 vScalar = _mm256_set1_ps(SHRT_MAX);
+- __m256 invScalar = _mm256_set1_ps(1.0f/SHRT_MAX);
+- __m256i int1, int2;
+- __m128i short1, short2;
+- __m256 cplxValue1, cplxValue2, result;
+- __m256i idx = _mm256_set_epi32(0,0,0,0,5,1,4,0);
+-
+- for(;number < eighthPoints; number++){
+-
+- int1 = _mm256_loadu_si256((__m256i*)complexVectorPtr);
+- complexVectorPtr += 16;
+- short1 = _mm256_extracti128_si256(int1,0);
+- short2 = _mm256_extracti128_si256(int1,1);
+-
+- int1 = _mm256_cvtepi16_epi32(short1);
+- int2 = _mm256_cvtepi16_epi32(short2);
+- cplxValue1 = _mm256_cvtepi32_ps(int1);
+- cplxValue2 = _mm256_cvtepi32_ps(int2);
+-
+- cplxValue1 = _mm256_mul_ps(cplxValue1, invScalar);
+- cplxValue2 = _mm256_mul_ps(cplxValue2, invScalar);
+-
+- cplxValue1 = _mm256_mul_ps(cplxValue1, cplxValue1); // Square the values
+- cplxValue2 = _mm256_mul_ps(cplxValue2, cplxValue2); // Square the Values
+-
+- result = _mm256_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values
+-
+- result = _mm256_sqrt_ps(result); // Square root the values
+-
+- result = _mm256_mul_ps(result, vScalar); // Scale the results
+-
+- int1 = _mm256_cvtps_epi32(result);
+- int1 = _mm256_packs_epi32(int1, int1);
+- int1 = _mm256_permutevar8x32_epi32(int1, idx); //permute to compensate for shuffling in hadd and packs
+- short1 = _mm256_extracti128_si256(int1, 0);
+- _mm_storeu_si128((__m128i*)magnitudeVectorPtr,short1);
+- magnitudeVectorPtr += 8;
+- }
+-
+- number = eighthPoints * 8;
+- magnitudeVectorPtr = &magnitudeVector[number];
+- complexVectorPtr = (const int16_t*)&complexVector[number];
+- for(; number < num_points; number++){
+- const float val1Real = (float)(*complexVectorPtr++) / SHRT_MAX;
+- const float val1Imag = (float)(*complexVectorPtr++) / SHRT_MAX;
+- const float val1Result = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)) * SHRT_MAX;
+- *magnitudeVectorPtr++ = (int16_t)rintf(val1Result);
+- }
++ unsigned int number = 0;
++ const unsigned int eighthPoints = num_points / 8;
++
++ const int16_t* complexVectorPtr = (const int16_t*)complexVector;
++ int16_t* magnitudeVectorPtr = magnitudeVector;
++
++ __m256 vScalar = _mm256_set1_ps(SHRT_MAX);
++ __m256 invScalar = _mm256_set1_ps(1.0f / SHRT_MAX);
++ __m256i int1, int2;
++ __m128i short1, short2;
++ __m256 cplxValue1, cplxValue2, result;
++ __m256i idx = _mm256_set_epi32(0, 0, 0, 0, 5, 1, 4, 0);
++
++ for (; number < eighthPoints; number++) {
++
++ int1 = _mm256_loadu_si256((__m256i*)complexVectorPtr);
++ complexVectorPtr += 16;
++ short1 = _mm256_extracti128_si256(int1, 0);
++ short2 = _mm256_extracti128_si256(int1, 1);
++
++ int1 = _mm256_cvtepi16_epi32(short1);
++ int2 = _mm256_cvtepi16_epi32(short2);
++ cplxValue1 = _mm256_cvtepi32_ps(int1);
++ cplxValue2 = _mm256_cvtepi32_ps(int2);
++
++ cplxValue1 = _mm256_mul_ps(cplxValue1, invScalar);
++ cplxValue2 = _mm256_mul_ps(cplxValue2, invScalar);
++
++ cplxValue1 = _mm256_mul_ps(cplxValue1, cplxValue1); // Square the values
++ cplxValue2 = _mm256_mul_ps(cplxValue2, cplxValue2); // Square the Values
++
++ result = _mm256_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values
++
++ result = _mm256_sqrt_ps(result); // Square root the values
++
++ result = _mm256_mul_ps(result, vScalar); // Scale the results
++
++ int1 = _mm256_cvtps_epi32(result);
++ int1 = _mm256_packs_epi32(int1, int1);
++ int1 = _mm256_permutevar8x32_epi32(
++ int1, idx); // permute to compensate for shuffling in hadd and packs
++ short1 = _mm256_extracti128_si256(int1, 0);
++ _mm_storeu_si128((__m128i*)magnitudeVectorPtr, short1);
++ magnitudeVectorPtr += 8;
++ }
++
++ number = eighthPoints * 8;
++ magnitudeVectorPtr = &magnitudeVector[number];
++ complexVectorPtr = (const int16_t*)&complexVector[number];
++ for (; number < num_points; number++) {
++ const float val1Real = (float)(*complexVectorPtr++) / SHRT_MAX;
++ const float val1Imag = (float)(*complexVectorPtr++) / SHRT_MAX;
++ const float val1Result =
++ sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)) * SHRT_MAX;
++ *magnitudeVectorPtr++ = (int16_t)rintf(val1Result);
++ }
+ }
+ #endif /* LV_HAVE_AVX2 */
+
+@@ -372,24 +388,25 @@ volk_16ic_magnitude_16i_u_avx2(int16_t* magnitudeVector, const lv_16sc_t* comple
+ #include <arm_neon.h>
+ #include <volk/volk_neon_intrinsics.h>
+
+-static inline void
+-volk_16ic_magnitude_16i_neonv7(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points)
++static inline void volk_16ic_magnitude_16i_neonv7(int16_t* magnitudeVector,
++ const lv_16sc_t* complexVector,
++ unsigned int num_points)
+ {
+ unsigned int number = 0;
+ unsigned int quarter_points = num_points / 4;
+-
++
+ const float scalar = SHRT_MAX;
+ const float inv_scalar = 1.0f / scalar;
+-
++
+ int16_t* magnitudeVectorPtr = magnitudeVector;
+ const lv_16sc_t* complexVectorPtr = complexVector;
+-
++
+ float32x4_t mag_vec;
+ float32x4x2_t c_vec;
+-
+- for(number = 0; number < quarter_points; number++) {
++
++ for (number = 0; number < quarter_points; number++) {
+ const int16x4x2_t c16_vec = vld2_s16((int16_t*)complexVectorPtr);
+- __VOLK_PREFETCH(complexVectorPtr+4);
++ __VOLK_PREFETCH(complexVectorPtr + 4);
+ c_vec.val[0] = vcvtq_f32_s32(vmovl_s16(c16_vec.val[0]));
+ c_vec.val[1] = vcvtq_f32_s32(vmovl_s16(c16_vec.val[1]));
+ // Scale to close to 0-1
+@@ -406,15 +423,16 @@ volk_16ic_magnitude_16i_neonv7(int16_t* magnitudeVector, const lv_16sc_t* comple
+ const int16x4_t mag16_vec = vmovn_s32(vcvtq_s32_f32(mag_vec));
+ vst1_s16(magnitudeVectorPtr, mag16_vec);
+ // Advance pointers
+- magnitudeVectorPtr+=4;
+- complexVectorPtr+=4;
++ magnitudeVectorPtr += 4;
++ complexVectorPtr += 4;
+ }
+-
++
+ // Deal with the rest
+- for(number = quarter_points * 4; number < num_points; number++) {
++ for (number = quarter_points * 4; number < num_points; number++) {
+ const float real = lv_creal(*complexVectorPtr) * inv_scalar;
+ const float imag = lv_cimag(*complexVectorPtr) * inv_scalar;
+- *magnitudeVectorPtr = (int16_t)rintf(sqrtf((real*real) + (imag*imag)) * scalar);
++ *magnitudeVectorPtr =
++ (int16_t)rintf(sqrtf((real * real) + (imag * imag)) * scalar);
+ complexVectorPtr++;
+ magnitudeVectorPtr++;
+ }
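+
+Every variant above implements the same fixed-point magnitude, matching the generic loop: out[n] = rintf(sqrtf((I/SHRT_MAX)^2 + (Q/SHRT_MAX)^2) * SHRT_MAX). A small sanity-check sketch, separate from the patch and assuming only the dispatcher prototype quoted in the doc comment at the top of this header:
+
+#include <volk/volk.h> /* assumed: generated dispatcher header */
+#include <stdio.h>
+
+int main(void)
+{
+    /* Four interleaved I,Q pairs: (30000,0), (0,-30000), (3,4), (0,0). */
+    int16_t iq[8] = { 30000, 0, 0, -30000, 3, 4, 0, 0 };
+    int16_t mag[4];
+
+    /* Dispatcher as declared in the doc comment above; a small num_points
+       exercises only the scalar tail loops shown in this diff. */
+    volk_16ic_magnitude_16i(mag, (const lv_16sc_t*)iq, 4);
+
+    for (int n = 0; n < 4; n++)
+        printf("%d\n", mag[n]); /* expected: 30000, 30000, 5, 0 */
+    return 0;
+}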
+diff --git a/kernels/volk/volk_16ic_s32f_deinterleave_32f_x2.h b/kernels/volk/volk_16ic_s32f_deinterleave_32f_x2.h
+index 50d9341..7425ec6 100644
+--- a/kernels/volk/volk_16ic_s32f_deinterleave_32f_x2.h
++++ b/kernels/volk/volk_16ic_s32f_deinterleave_32f_x2.h
+@@ -30,8 +30,8 @@
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+- * void volk_16ic_s32f_deinterleave_32f_x2(float* iBuffer, float* qBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){
+- * \endcode
++ * void volk_16ic_s32f_deinterleave_32f_x2(float* iBuffer, float* qBuffer, const
++ * lv_16sc_t* complexVector, const float scalar, unsigned int num_points)
++ * \endcode
+ *
+ * \b Inputs
+ * \li complexVector: The complex input vector of 16-bit shorts.
+@@ -56,197 +56,214 @@
+ #ifndef INCLUDED_volk_16ic_s32f_deinterleave_32f_x2_a_H
+ #define INCLUDED_volk_16ic_s32f_deinterleave_32f_x2_a_H
+
+-#include <volk/volk_common.h>
+ #include <inttypes.h>
+ #include <stdio.h>
++#include <volk/volk_common.h>
+
+ #ifdef LV_HAVE_AVX2
+ #include <immintrin.h>
+
+-static inline
+-void volk_16ic_s32f_deinterleave_32f_x2_a_avx2(float* iBuffer, float* qBuffer, const lv_16sc_t* complexVector,
+- const float scalar, unsigned int num_points)
++static inline void
++volk_16ic_s32f_deinterleave_32f_x2_a_avx2(float* iBuffer,
++ float* qBuffer,
++ const lv_16sc_t* complexVector,
++ const float scalar,
++ unsigned int num_points)
+ {
+- float* iBufferPtr = iBuffer;
+- float* qBufferPtr = qBuffer;
+-
+- uint64_t number = 0;
+- const uint64_t eighthPoints = num_points / 8;
+- __m256 cplxValue1, cplxValue2, iValue, qValue;
+- __m256i cplxValueA, cplxValueB;
+- __m128i cplxValue128;
+-
+- __m256 invScalar = _mm256_set1_ps(1.0/scalar);
+- int16_t* complexVectorPtr = (int16_t*)complexVector;
+- __m256i idx = _mm256_set_epi32(7,6,3,2,5,4,1,0);
+-
+- for(;number < eighthPoints; number++){
+-
+- cplxValueA = _mm256_load_si256((__m256i*) complexVectorPtr);
+- complexVectorPtr += 16;
+-
+- //cvt
+- cplxValue128 = _mm256_extracti128_si256(cplxValueA, 0);
+- cplxValueB = _mm256_cvtepi16_epi32(cplxValue128);
+- cplxValue1 = _mm256_cvtepi32_ps(cplxValueB);
+- cplxValue128 = _mm256_extracti128_si256(cplxValueA, 1);
+- cplxValueB = _mm256_cvtepi16_epi32(cplxValue128);
+- cplxValue2 = _mm256_cvtepi32_ps(cplxValueB);
+-
+- cplxValue1 = _mm256_mul_ps(cplxValue1, invScalar);
+- cplxValue2 = _mm256_mul_ps(cplxValue2, invScalar);
+-
+- // Arrange in i1i2i3i4 format
+- iValue = _mm256_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0));
+- iValue = _mm256_permutevar8x32_ps(iValue,idx);
+- // Arrange in q1q2q3q4 format
+- qValue = _mm256_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1));
+- qValue = _mm256_permutevar8x32_ps(qValue,idx);
+-
+- _mm256_store_ps(iBufferPtr, iValue);
+- _mm256_store_ps(qBufferPtr, qValue);
+-
+- iBufferPtr += 8;
+- qBufferPtr += 8;
+- }
+-
+- number = eighthPoints * 8;
+- complexVectorPtr = (int16_t*)&complexVector[number];
+- for(; number < num_points; number++){
+- *iBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
+- *qBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
+- }
++ float* iBufferPtr = iBuffer;
++ float* qBufferPtr = qBuffer;
++
++ uint64_t number = 0;
++ const uint64_t eighthPoints = num_points / 8;
++ __m256 cplxValue1, cplxValue2, iValue, qValue;
++ __m256i cplxValueA, cplxValueB;
++ __m128i cplxValue128;
++
++ __m256 invScalar = _mm256_set1_ps(1.0 / scalar);
++ int16_t* complexVectorPtr = (int16_t*)complexVector;
++ __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
++
++ for (; number < eighthPoints; number++) {
++
++ cplxValueA = _mm256_load_si256((__m256i*)complexVectorPtr);
++ complexVectorPtr += 16;
++
++ // cvt
++ cplxValue128 = _mm256_extracti128_si256(cplxValueA, 0);
++ cplxValueB = _mm256_cvtepi16_epi32(cplxValue128);
++ cplxValue1 = _mm256_cvtepi32_ps(cplxValueB);
++ cplxValue128 = _mm256_extracti128_si256(cplxValueA, 1);
++ cplxValueB = _mm256_cvtepi16_epi32(cplxValue128);
++ cplxValue2 = _mm256_cvtepi32_ps(cplxValueB);
++
++ cplxValue1 = _mm256_mul_ps(cplxValue1, invScalar);
++ cplxValue2 = _mm256_mul_ps(cplxValue2, invScalar);
++
++ // Arrange in i1i2i3i4 format
++ iValue = _mm256_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2, 0, 2, 0));
++ iValue = _mm256_permutevar8x32_ps(iValue, idx);
++ // Arrange in q1q2q3q4 format
++ qValue = _mm256_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3, 1, 3, 1));
++ qValue = _mm256_permutevar8x32_ps(qValue, idx);
++
++ _mm256_store_ps(iBufferPtr, iValue);
++ _mm256_store_ps(qBufferPtr, qValue);
++
++ iBufferPtr += 8;
++ qBufferPtr += 8;
++ }
++
++ number = eighthPoints * 8;
++ complexVectorPtr = (int16_t*)&complexVector[number];
++ for (; number < num_points; number++) {
++ *iBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
++ *qBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
++ }
+ }
+ #endif /* LV_HAVE_AVX2 */
+
+ #ifdef LV_HAVE_SSE
+ #include <xmmintrin.h>
+
+-static inline
+-void volk_16ic_s32f_deinterleave_32f_x2_a_sse(float* iBuffer, float* qBuffer, const lv_16sc_t* complexVector,
+- const float scalar, unsigned int num_points)
++static inline void
++volk_16ic_s32f_deinterleave_32f_x2_a_sse(float* iBuffer,
++ float* qBuffer,
++ const lv_16sc_t* complexVector,
++ const float scalar,
++ unsigned int num_points)
+ {
+- float* iBufferPtr = iBuffer;
+- float* qBufferPtr = qBuffer;
++ float* iBufferPtr = iBuffer;
++ float* qBufferPtr = qBuffer;
+
+- uint64_t number = 0;
+- const uint64_t quarterPoints = num_points / 4;
+- __m128 cplxValue1, cplxValue2, iValue, qValue;
++ uint64_t number = 0;
++ const uint64_t quarterPoints = num_points / 4;
++ __m128 cplxValue1, cplxValue2, iValue, qValue;
+
+- __m128 invScalar = _mm_set_ps1(1.0/scalar);
+- int16_t* complexVectorPtr = (int16_t*)complexVector;
++ __m128 invScalar = _mm_set_ps1(1.0 / scalar);
++ int16_t* complexVectorPtr = (int16_t*)complexVector;
+
+- __VOLK_ATTR_ALIGNED(16) float floatBuffer[8];
++ __VOLK_ATTR_ALIGNED(16) float floatBuffer[8];
+
+- for(;number < quarterPoints; number++){
++ for (; number < quarterPoints; number++) {
+
+- floatBuffer[0] = (float)(complexVectorPtr[0]);
+- floatBuffer[1] = (float)(complexVectorPtr[1]);
+- floatBuffer[2] = (float)(complexVectorPtr[2]);
+- floatBuffer[3] = (float)(complexVectorPtr[3]);
++ floatBuffer[0] = (float)(complexVectorPtr[0]);
++ floatBuffer[1] = (float)(complexVectorPtr[1]);
++ floatBuffer[2] = (float)(complexVectorPtr[2]);
++ floatBuffer[3] = (float)(complexVectorPtr[3]);
+
+- floatBuffer[4] = (float)(complexVectorPtr[4]);
+- floatBuffer[5] = (float)(complexVectorPtr[5]);
+- floatBuffer[6] = (float)(complexVectorPtr[6]);
+- floatBuffer[7] = (float)(complexVectorPtr[7]);
++ floatBuffer[4] = (float)(complexVectorPtr[4]);
++ floatBuffer[5] = (float)(complexVectorPtr[5]);
++ floatBuffer[6] = (float)(complexVectorPtr[6]);
++ floatBuffer[7] = (float)(complexVectorPtr[7]);
+
+- cplxValue1 = _mm_load_ps(&floatBuffer[0]);
+- cplxValue2 = _mm_load_ps(&floatBuffer[4]);
++ cplxValue1 = _mm_load_ps(&floatBuffer[0]);
++ cplxValue2 = _mm_load_ps(&floatBuffer[4]);
+
+- complexVectorPtr += 8;
++ complexVectorPtr += 8;
+
+- cplxValue1 = _mm_mul_ps(cplxValue1, invScalar);
+- cplxValue2 = _mm_mul_ps(cplxValue2, invScalar);
++ cplxValue1 = _mm_mul_ps(cplxValue1, invScalar);
++ cplxValue2 = _mm_mul_ps(cplxValue2, invScalar);
+
+- // Arrange in i1i2i3i4 format
+- iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0));
+- // Arrange in q1q2q3q4 format
+- qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1));
++ // Arrange in i1i2i3i4 format
++ iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2, 0, 2, 0));
++ // Arrange in q1q2q3q4 format
++ qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3, 1, 3, 1));
+
+- _mm_store_ps(iBufferPtr, iValue);
+- _mm_store_ps(qBufferPtr, qValue);
++ _mm_store_ps(iBufferPtr, iValue);
++ _mm_store_ps(qBufferPtr, qValue);
+
+- iBufferPtr += 4;
+- qBufferPtr += 4;
+- }
++ iBufferPtr += 4;
++ qBufferPtr += 4;
++ }
+
+- number = quarterPoints * 4;
+- complexVectorPtr = (int16_t*)&complexVector[number];
+- for(; number < num_points; number++){
+- *iBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
+- *qBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
+- }
++ number = quarterPoints * 4;
++ complexVectorPtr = (int16_t*)&complexVector[number];
++ for (; number < num_points; number++) {
++ *iBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
++ *qBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
++ }
+ }
+ #endif /* LV_HAVE_SSE */
+
+ #ifdef LV_HAVE_GENERIC
+
+ static inline void
+-volk_16ic_s32f_deinterleave_32f_x2_generic(float* iBuffer, float* qBuffer, const lv_16sc_t* complexVector,
+- const float scalar, unsigned int num_points)
++volk_16ic_s32f_deinterleave_32f_x2_generic(float* iBuffer,
++ float* qBuffer,
++ const lv_16sc_t* complexVector,
++ const float scalar,
++ unsigned int num_points)
+ {
+- const int16_t* complexVectorPtr = (const int16_t*)complexVector;
+- float* iBufferPtr = iBuffer;
+- float* qBufferPtr = qBuffer;
+- unsigned int number;
+- for(number = 0; number < num_points; number++){
+- *iBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
+- *qBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
+- }
++ const int16_t* complexVectorPtr = (const int16_t*)complexVector;
++ float* iBufferPtr = iBuffer;
++ float* qBufferPtr = qBuffer;
++ unsigned int number;
++ for (number = 0; number < num_points; number++) {
++ *iBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
++ *qBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
++ }
+ }
+ #endif /* LV_HAVE_GENERIC */
+
+ #ifdef LV_HAVE_NEON
+ #include <arm_neon.h>
+-static inline void
+-volk_16ic_s32f_deinterleave_32f_x2_neon(float* iBuffer, float* qBuffer, const lv_16sc_t* complexVector,
+- const float scalar, unsigned int num_points)
++static inline void volk_16ic_s32f_deinterleave_32f_x2_neon(float* iBuffer,
++ float* qBuffer,
++ const lv_16sc_t* complexVector,
++ const float scalar,
++ unsigned int num_points)
+ {
+- const int16_t* complexVectorPtr = (const int16_t*)complexVector;
+- float* iBufferPtr = iBuffer;
+- float* qBufferPtr = qBuffer;
+- unsigned int eighth_points = num_points / 4;
+- unsigned int number;
+- float iScalar = 1.f/scalar;
+- float32x4_t invScalar;
+- invScalar = vld1q_dup_f32(&iScalar);
+-
+- int16x4x2_t complexInput_s16;
+- int32x4x2_t complexInput_s32;
+- float32x4x2_t complexFloat;
+-
+- for(number = 0; number < eighth_points; number++){
+- complexInput_s16 = vld2_s16(complexVectorPtr);
+- complexInput_s32.val[0] = vmovl_s16(complexInput_s16.val[0]);
+- complexInput_s32.val[1] = vmovl_s16(complexInput_s16.val[1]);
+- complexFloat.val[0] = vcvtq_f32_s32(complexInput_s32.val[0]);
+- complexFloat.val[1] = vcvtq_f32_s32(complexInput_s32.val[1]);
+- complexFloat.val[0] = vmulq_f32(complexFloat.val[0], invScalar);
+- complexFloat.val[1] = vmulq_f32(complexFloat.val[1], invScalar);
+- vst1q_f32(iBufferPtr, complexFloat.val[0]);
+- vst1q_f32(qBufferPtr, complexFloat.val[1]);
+- complexVectorPtr += 8;
+- iBufferPtr += 4;
+- qBufferPtr += 4;
+- }
+-
+- for(number = eighth_points*4; number < num_points; number++){
+- *iBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
+- *qBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
+- }
++ const int16_t* complexVectorPtr = (const int16_t*)complexVector;
++ float* iBufferPtr = iBuffer;
++ float* qBufferPtr = qBuffer;
++ unsigned int eighth_points = num_points / 4;
++ unsigned int number;
++ float iScalar = 1.f / scalar;
++ float32x4_t invScalar;
++ invScalar = vld1q_dup_f32(&iScalar);
++
++ int16x4x2_t complexInput_s16;
++ int32x4x2_t complexInput_s32;
++ float32x4x2_t complexFloat;
++
++ for (number = 0; number < eighth_points; number++) {
++ complexInput_s16 = vld2_s16(complexVectorPtr);
++ complexInput_s32.val[0] = vmovl_s16(complexInput_s16.val[0]);
++ complexInput_s32.val[1] = vmovl_s16(complexInput_s16.val[1]);
++ complexFloat.val[0] = vcvtq_f32_s32(complexInput_s32.val[0]);
++ complexFloat.val[1] = vcvtq_f32_s32(complexInput_s32.val[1]);
++ complexFloat.val[0] = vmulq_f32(complexFloat.val[0], invScalar);
++ complexFloat.val[1] = vmulq_f32(complexFloat.val[1], invScalar);
++ vst1q_f32(iBufferPtr, complexFloat.val[0]);
++ vst1q_f32(qBufferPtr, complexFloat.val[1]);
++ complexVectorPtr += 8;
++ iBufferPtr += 4;
++ qBufferPtr += 4;
++ }
++
++ for (number = eighth_points * 4; number < num_points; number++) {
++ *iBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
++ *qBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
++ }
+ }
+ #endif /* LV_HAVE_NEON */
+
+ #ifdef LV_HAVE_ORC
+-extern void
+-volk_16ic_s32f_deinterleave_32f_x2_a_orc_impl(float* iBuffer, float* qBuffer, const lv_16sc_t* complexVector,
+- const float scalar, unsigned int num_points);
++extern void volk_16ic_s32f_deinterleave_32f_x2_a_orc_impl(float* iBuffer,
++ float* qBuffer,
++ const lv_16sc_t* complexVector,
++ const float scalar,
++ unsigned int num_points);
+
+ static inline void
+-volk_16ic_s32f_deinterleave_32f_x2_u_orc(float* iBuffer, float* qBuffer, const lv_16sc_t* complexVector,
+- const float scalar, unsigned int num_points)
++volk_16ic_s32f_deinterleave_32f_x2_u_orc(float* iBuffer,
++ float* qBuffer,
++ const lv_16sc_t* complexVector,
++ const float scalar,
++ unsigned int num_points)
+ {
+- volk_16ic_s32f_deinterleave_32f_x2_a_orc_impl(iBuffer, qBuffer, complexVector, scalar, num_points);
++ volk_16ic_s32f_deinterleave_32f_x2_a_orc_impl(
++ iBuffer, qBuffer, complexVector, scalar, num_points);
+ }
+ #endif /* LV_HAVE_ORC */
+
+@@ -257,66 +274,69 @@ volk_16ic_s32f_deinterleave_32f_x2_u_orc(float* iBuffer, float* qBuffer, const l
+ #ifndef INCLUDED_volk_16ic_s32f_deinterleave_32f_x2_u_H
+ #define INCLUDED_volk_16ic_s32f_deinterleave_32f_x2_u_H
+
+-#include <volk/volk_common.h>
+ #include <inttypes.h>
+ #include <stdio.h>
++#include <volk/volk_common.h>
+
+ #ifdef LV_HAVE_AVX2
+ #include <immintrin.h>
+
+-static inline
+-void volk_16ic_s32f_deinterleave_32f_x2_u_avx2(float* iBuffer, float* qBuffer, const lv_16sc_t* complexVector,
+- const float scalar, unsigned int num_points)
++static inline void
++volk_16ic_s32f_deinterleave_32f_x2_u_avx2(float* iBuffer,
++ float* qBuffer,
++ const lv_16sc_t* complexVector,
++ const float scalar,
++ unsigned int num_points)
+ {
+- float* iBufferPtr = iBuffer;
+- float* qBufferPtr = qBuffer;
+-
+- uint64_t number = 0;
+- const uint64_t eighthPoints = num_points / 8;
+- __m256 cplxValue1, cplxValue2, iValue, qValue;
+- __m256i cplxValueA, cplxValueB;
+- __m128i cplxValue128;
+-
+- __m256 invScalar = _mm256_set1_ps(1.0/scalar);
+- int16_t* complexVectorPtr = (int16_t*)complexVector;
+- __m256i idx = _mm256_set_epi32(7,6,3,2,5,4,1,0);
+-
+- for(;number < eighthPoints; number++){
+-
+- cplxValueA = _mm256_loadu_si256((__m256i*) complexVectorPtr);
+- complexVectorPtr += 16;
+-
+- //cvt
+- cplxValue128 = _mm256_extracti128_si256(cplxValueA, 0);
+- cplxValueB = _mm256_cvtepi16_epi32(cplxValue128);
+- cplxValue1 = _mm256_cvtepi32_ps(cplxValueB);
+- cplxValue128 = _mm256_extracti128_si256(cplxValueA, 1);
+- cplxValueB = _mm256_cvtepi16_epi32(cplxValue128);
+- cplxValue2 = _mm256_cvtepi32_ps(cplxValueB);
+-
+- cplxValue1 = _mm256_mul_ps(cplxValue1, invScalar);
+- cplxValue2 = _mm256_mul_ps(cplxValue2, invScalar);
+-
+- // Arrange in i1i2i3i4 format
+- iValue = _mm256_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0));
+- iValue = _mm256_permutevar8x32_ps(iValue,idx);
+- // Arrange in q1q2q3q4 format
+- qValue = _mm256_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1));
+- qValue = _mm256_permutevar8x32_ps(qValue,idx);
+-
+- _mm256_storeu_ps(iBufferPtr, iValue);
+- _mm256_storeu_ps(qBufferPtr, qValue);
+-
+- iBufferPtr += 8;
+- qBufferPtr += 8;
+- }
+-
+- number = eighthPoints * 8;
+- complexVectorPtr = (int16_t*)&complexVector[number];
+- for(; number < num_points; number++){
+- *iBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
+- *qBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
+- }
++ float* iBufferPtr = iBuffer;
++ float* qBufferPtr = qBuffer;
++
++ uint64_t number = 0;
++ const uint64_t eighthPoints = num_points / 8;
++ __m256 cplxValue1, cplxValue2, iValue, qValue;
++ __m256i cplxValueA, cplxValueB;
++ __m128i cplxValue128;
++
++ __m256 invScalar = _mm256_set1_ps(1.0 / scalar);
++ int16_t* complexVectorPtr = (int16_t*)complexVector;
++ __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
++
++ for (; number < eighthPoints; number++) {
++
++ cplxValueA = _mm256_loadu_si256((__m256i*)complexVectorPtr);
++ complexVectorPtr += 16;
++
++ // cvt
++ cplxValue128 = _mm256_extracti128_si256(cplxValueA, 0);
++ cplxValueB = _mm256_cvtepi16_epi32(cplxValue128);
++ cplxValue1 = _mm256_cvtepi32_ps(cplxValueB);
++ cplxValue128 = _mm256_extracti128_si256(cplxValueA, 1);
++ cplxValueB = _mm256_cvtepi16_epi32(cplxValue128);
++ cplxValue2 = _mm256_cvtepi32_ps(cplxValueB);
++
++ cplxValue1 = _mm256_mul_ps(cplxValue1, invScalar);
++ cplxValue2 = _mm256_mul_ps(cplxValue2, invScalar);
++
++ // Arrange in i1i2i3i4 format
++ iValue = _mm256_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2, 0, 2, 0));
++ iValue = _mm256_permutevar8x32_ps(iValue, idx);
++ // Arrange in q1q2q3q4 format
++ qValue = _mm256_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3, 1, 3, 1));
++ qValue = _mm256_permutevar8x32_ps(qValue, idx);
++
++ _mm256_storeu_ps(iBufferPtr, iValue);
++ _mm256_storeu_ps(qBufferPtr, qValue);
++
++ iBufferPtr += 8;
++ qBufferPtr += 8;
++ }
++
++ number = eighthPoints * 8;
++ complexVectorPtr = (int16_t*)&complexVector[number];
++ for (; number < num_points; number++) {
++ *iBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
++ *qBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
++ }
+ }
+ #endif /* LV_HAVE_AVX2 */
+
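+A corresponding sketch for the I/Q deinterleaver, also outside the patch and based only on the dispatcher prototype in this header's doc comment; with scalar set to 32768.0f the 16-bit samples map roughly onto [-1.0, 1.0):
+
+#include <volk/volk.h> /* assumed: generated dispatcher header */
+#include <stdio.h>
+
+int main(void)
+{
+    /* Four interleaved I,Q pairs. */
+    int16_t iq[8] = { 16384, -16384, 32767, -32768, 0, 8192, -8192, 0 };
+    float i_out[4];
+    float q_out[4];
+
+    /* Each output is the corresponding int16 divided by scalar,
+       exactly as in the generic loop above. */
+    volk_16ic_s32f_deinterleave_32f_x2(i_out, q_out, (const lv_16sc_t*)iq, 32768.0f, 4);
+
+    for (int n = 0; n < 4; n++)
+        printf("I=%f Q=%f\n", i_out[n], q_out[n]); /* first pair: I=0.5 Q=-0.5 */
+    return 0;
+}
+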
+diff --git a/kernels/volk/volk_16ic_s32f_deinterleave_real_32f.h b/kernels/volk/volk_16ic_s32f_deinterleave_real_32f.h
+index 713e6a1..8b72d1c 100644
+--- a/kernels/volk/volk_16ic_s32f_deinterleave_real_32f.h
++++ b/kernels/volk/volk_16ic_s32f_deinterleave_real_32f.h
+@@ -31,8 +31,8 @@
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+- * void volk_16ic_s32f_deinterleave_real_32f(float* iBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){
+- * \endcode
++ * void volk_16ic_s32f_deinterleave_real_32f(float* iBuffer, const lv_16sc_t*
++ * complexVector, const float scalar, unsigned int num_points)
++ * \endcode
+ *
+ * \b Inputs
+ * \li complexVector: The complex input vector of 16-bit shorts.
+@@ -56,55 +56,88 @@
+ #ifndef INCLUDED_volk_16ic_s32f_deinterleave_real_32f_a_H
+ #define INCLUDED_volk_16ic_s32f_deinterleave_real_32f_a_H
+
+-#include <volk/volk_common.h>
+ #include <inttypes.h>
+ #include <stdio.h>
++#include <volk/volk_common.h>
+
+ #ifdef LV_HAVE_AVX2
+ #include <immintrin.h>
+
+ static inline void
+-volk_16ic_s32f_deinterleave_real_32f_a_avx2(float* iBuffer, const lv_16sc_t* complexVector,
+- const float scalar, unsigned int num_points)
++volk_16ic_s32f_deinterleave_real_32f_a_avx2(float* iBuffer,
++ const lv_16sc_t* complexVector,
++ const float scalar,
++ unsigned int num_points)
+ {
+- float* iBufferPtr = iBuffer;
+-
+- unsigned int number = 0;
+- const unsigned int eighthPoints = num_points / 8;
+-
+- __m256 iFloatValue;
+-
+- const float iScalar= 1.0 / scalar;
+- __m256 invScalar = _mm256_set1_ps(iScalar);
+- __m256i complexVal, iIntVal;
+- __m128i complexVal128;
+- int8_t* complexVectorPtr = (int8_t*)complexVector;
+-
+- __m256i moveMask = _mm256_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0);
+-
+- for(;number < eighthPoints; number++){
+- complexVal = _mm256_load_si256((__m256i*)complexVectorPtr); complexVectorPtr += 32;
+- complexVal = _mm256_shuffle_epi8(complexVal, moveMask);
+- complexVal = _mm256_permute4x64_epi64(complexVal, 0xd8);
+- complexVal128 = _mm256_extracti128_si256(complexVal, 0);
+-
+- iIntVal = _mm256_cvtepi16_epi32(complexVal128);
+- iFloatValue = _mm256_cvtepi32_ps(iIntVal);
+-
+- iFloatValue = _mm256_mul_ps(iFloatValue, invScalar);
+-
+- _mm256_store_ps(iBufferPtr, iFloatValue);
+-
+- iBufferPtr += 8;
+- }
+-
+- number = eighthPoints * 8;
+- int16_t* sixteenTComplexVectorPtr = (int16_t*)&complexVector[number];
+- for(; number < num_points; number++){
+- *iBufferPtr++ = ((float)(*sixteenTComplexVectorPtr++)) * iScalar;
+- sixteenTComplexVectorPtr++;
+- }
+-
++ float* iBufferPtr = iBuffer;
++
++ unsigned int number = 0;
++ const unsigned int eighthPoints = num_points / 8;
++
++ __m256 iFloatValue;
++
++ const float iScalar = 1.0 / scalar;
++ __m256 invScalar = _mm256_set1_ps(iScalar);
++ __m256i complexVal, iIntVal;
++ __m128i complexVal128;
++ int8_t* complexVectorPtr = (int8_t*)complexVector;
++
++ __m256i moveMask = _mm256_set_epi8(0x80,
++ 0x80,
++ 0x80,
++ 0x80,
++ 0x80,
++ 0x80,
++ 0x80,
++ 0x80,
++ 13,
++ 12,
++ 9,
++ 8,
++ 5,
++ 4,
++ 1,
++ 0,
++ 0x80,
++ 0x80,
++ 0x80,
++ 0x80,
++ 0x80,
++ 0x80,
++ 0x80,
++ 0x80,
++ 13,
++ 12,
++ 9,
++ 8,
++ 5,
++ 4,
++ 1,
++ 0);
++
++ for (; number < eighthPoints; number++) {
++ complexVal = _mm256_load_si256((__m256i*)complexVectorPtr);
++ complexVectorPtr += 32;
++ complexVal = _mm256_shuffle_epi8(complexVal, moveMask);
++ complexVal = _mm256_permute4x64_epi64(complexVal, 0xd8);
++ complexVal128 = _mm256_extracti128_si256(complexVal, 0);
++
++ iIntVal = _mm256_cvtepi16_epi32(complexVal128);
++ iFloatValue = _mm256_cvtepi32_ps(iIntVal);
++
++ iFloatValue = _mm256_mul_ps(iFloatValue, invScalar);
++
++ _mm256_store_ps(iBufferPtr, iFloatValue);
++
++ iBufferPtr += 8;
++ }
++
++ number = eighthPoints * 8;
++ int16_t* sixteenTComplexVectorPtr = (int16_t*)&complexVector[number];
++ for (; number < num_points; number++) {
++ *iBufferPtr++ = ((float)(*sixteenTComplexVectorPtr++)) * iScalar;
++ sixteenTComplexVectorPtr++;
++ }
+ }
+ #endif /* LV_HAVE_AVX2 */
+
+@@ -112,44 +145,47 @@ volk_16ic_s32f_deinterleave_real_32f_a_avx2(float* iBuffer, const lv_16sc_t* com
+ #include <smmintrin.h>
+
+ static inline void
+-volk_16ic_s32f_deinterleave_real_32f_a_sse4_1(float* iBuffer, const lv_16sc_t* complexVector,
+- const float scalar, unsigned int num_points)
++volk_16ic_s32f_deinterleave_real_32f_a_sse4_1(float* iBuffer,
++ const lv_16sc_t* complexVector,
++ const float scalar,
++ unsigned int num_points)
+ {
+- float* iBufferPtr = iBuffer;
+-
+- unsigned int number = 0;
+- const unsigned int quarterPoints = num_points / 4;
++ float* iBufferPtr = iBuffer;
+
+- __m128 iFloatValue;
++ unsigned int number = 0;
++ const unsigned int quarterPoints = num_points / 4;
+
+- const float iScalar= 1.0 / scalar;
+- __m128 invScalar = _mm_set_ps1(iScalar);
+- __m128i complexVal, iIntVal;
+- int8_t* complexVectorPtr = (int8_t*)complexVector;
++ __m128 iFloatValue;
+
+- __m128i moveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0);
++ const float iScalar = 1.0 / scalar;
++ __m128 invScalar = _mm_set_ps1(iScalar);
++ __m128i complexVal, iIntVal;
++ int8_t* complexVectorPtr = (int8_t*)complexVector;
+
+- for(;number < quarterPoints; number++){
+- complexVal = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16;
+- complexVal = _mm_shuffle_epi8(complexVal, moveMask);
++ __m128i moveMask = _mm_set_epi8(
++ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0);
+
+- iIntVal = _mm_cvtepi16_epi32(complexVal);
+- iFloatValue = _mm_cvtepi32_ps(iIntVal);
++ for (; number < quarterPoints; number++) {
++ complexVal = _mm_load_si128((__m128i*)complexVectorPtr);
++ complexVectorPtr += 16;
++ complexVal = _mm_shuffle_epi8(complexVal, moveMask);
+
+- iFloatValue = _mm_mul_ps(iFloatValue, invScalar);
++ iIntVal = _mm_cvtepi16_epi32(complexVal);
++ iFloatValue = _mm_cvtepi32_ps(iIntVal);
+
+- _mm_store_ps(iBufferPtr, iFloatValue);
++ iFloatValue = _mm_mul_ps(iFloatValue, invScalar);
+
+- iBufferPtr += 4;
+- }
++ _mm_store_ps(iBufferPtr, iFloatValue);
+
+- number = quarterPoints * 4;
+- int16_t* sixteenTComplexVectorPtr = (int16_t*)&complexVector[number];
+- for(; number < num_points; number++){
+- *iBufferPtr++ = ((float)(*sixteenTComplexVectorPtr++)) * iScalar;
+- sixteenTComplexVectorPtr++;
+- }
++ iBufferPtr += 4;
++ }
+
++ number = quarterPoints * 4;
++ int16_t* sixteenTComplexVectorPtr = (int16_t*)&complexVector[number];
++ for (; number < num_points; number++) {
++ *iBufferPtr++ = ((float)(*sixteenTComplexVectorPtr++)) * iScalar;
++ sixteenTComplexVectorPtr++;
++ }
+ }
+ #endif /* LV_HAVE_SSE4_1 */
+
+@@ -157,59 +193,66 @@ volk_16ic_s32f_deinterleave_real_32f_a_sse4_1(float* iBuffer, const lv_16sc_t* c
+ #include <xmmintrin.h>
+
+ static inline void
+-volk_16ic_s32f_deinterleave_real_32f_a_sse(float* iBuffer, const lv_16sc_t* complexVector,
+- const float scalar, unsigned int num_points)
++volk_16ic_s32f_deinterleave_real_32f_a_sse(float* iBuffer,
++ const lv_16sc_t* complexVector,
++ const float scalar,
++ unsigned int num_points)
+ {
+- float* iBufferPtr = iBuffer;
++ float* iBufferPtr = iBuffer;
+
+- unsigned int number = 0;
+- const unsigned int quarterPoints = num_points / 4;
+- __m128 iValue;
++ unsigned int number = 0;
++ const unsigned int quarterPoints = num_points / 4;
++ __m128 iValue;
+
+- const float iScalar = 1.0/scalar;
+- __m128 invScalar = _mm_set_ps1(iScalar);
+- int16_t* complexVectorPtr = (int16_t*)complexVector;
++ const float iScalar = 1.0 / scalar;
++ __m128 invScalar = _mm_set_ps1(iScalar);
++ int16_t* complexVectorPtr = (int16_t*)complexVector;
+
+- __VOLK_ATTR_ALIGNED(16) float floatBuffer[4];
++ __VOLK_ATTR_ALIGNED(16) float floatBuffer[4];
+
+- for(;number < quarterPoints; number++){
+- floatBuffer[0] = (float)(*complexVectorPtr); complexVectorPtr += 2;
+- floatBuffer[1] = (float)(*complexVectorPtr); complexVectorPtr += 2;
+- floatBuffer[2] = (float)(*complexVectorPtr); complexVectorPtr += 2;
+- floatBuffer[3] = (float)(*complexVectorPtr); complexVectorPtr += 2;
++ for (; number < quarterPoints; number++) {
++ floatBuffer[0] = (float)(*complexVectorPtr);
++ complexVectorPtr += 2;
++ floatBuffer[1] = (float)(*complexVectorPtr);
++ complexVectorPtr += 2;
++ floatBuffer[2] = (float)(*complexVectorPtr);
++ complexVectorPtr += 2;
++ floatBuffer[3] = (float)(*complexVectorPtr);
++ complexVectorPtr += 2;
+
+- iValue = _mm_load_ps(floatBuffer);
++ iValue = _mm_load_ps(floatBuffer);
+
+- iValue = _mm_mul_ps(iValue, invScalar);
++ iValue = _mm_mul_ps(iValue, invScalar);
+
+- _mm_store_ps(iBufferPtr, iValue);
++ _mm_store_ps(iBufferPtr, iValue);
+
+- iBufferPtr += 4;
+- }
+-
+- number = quarterPoints * 4;
+- complexVectorPtr = (int16_t*)&complexVector[number];
+- for(; number < num_points; number++){
+- *iBufferPtr++ = ((float)(*complexVectorPtr++)) * iScalar;
+- complexVectorPtr++;
+- }
++ iBufferPtr += 4;
++ }
+
++ number = quarterPoints * 4;
++ complexVectorPtr = (int16_t*)&complexVector[number];
++ for (; number < num_points; number++) {
++ *iBufferPtr++ = ((float)(*complexVectorPtr++)) * iScalar;
++ complexVectorPtr++;
++ }
+ }
+ #endif /* LV_HAVE_SSE */
+
+ #ifdef LV_HAVE_GENERIC
+ static inline void
+-volk_16ic_s32f_deinterleave_real_32f_generic(float* iBuffer, const lv_16sc_t* complexVector,
+- const float scalar, unsigned int num_points)
++volk_16ic_s32f_deinterleave_real_32f_generic(float* iBuffer,
++ const lv_16sc_t* complexVector,
++ const float scalar,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+- const int16_t* complexVectorPtr = (const int16_t*)complexVector;
+- float* iBufferPtr = iBuffer;
+- const float invScalar = 1.0 / scalar;
+- for(number = 0; number < num_points; number++){
+- *iBufferPtr++ = ((float)(*complexVectorPtr++)) * invScalar;
+- complexVectorPtr++;
+- }
++ unsigned int number = 0;
++ const int16_t* complexVectorPtr = (const int16_t*)complexVector;
++ float* iBufferPtr = iBuffer;
++ const float invScalar = 1.0 / scalar;
++ for (number = 0; number < num_points; number++) {
++ *iBufferPtr++ = ((float)(*complexVectorPtr++)) * invScalar;
++ complexVectorPtr++;
++ }
+ }
+ #endif /* LV_HAVE_GENERIC */
+
+@@ -219,55 +262,88 @@ volk_16ic_s32f_deinterleave_real_32f_generic(float* iBuffer, const lv_16sc_t* co
+ #ifndef INCLUDED_volk_16ic_s32f_deinterleave_real_32f_u_H
+ #define INCLUDED_volk_16ic_s32f_deinterleave_real_32f_u_H
+
+-#include <volk/volk_common.h>
+ #include <inttypes.h>
+ #include <stdio.h>
++#include <volk/volk_common.h>
+
+ #ifdef LV_HAVE_AVX2
+ #include <immintrin.h>
+
+ static inline void
+-volk_16ic_s32f_deinterleave_real_32f_u_avx2(float* iBuffer, const lv_16sc_t* complexVector,
+- const float scalar, unsigned int num_points)
++volk_16ic_s32f_deinterleave_real_32f_u_avx2(float* iBuffer,
++ const lv_16sc_t* complexVector,
++ const float scalar,
++ unsigned int num_points)
+ {
+- float* iBufferPtr = iBuffer;
+-
+- unsigned int number = 0;
+- const unsigned int eighthPoints = num_points / 8;
+-
+- __m256 iFloatValue;
+-
+- const float iScalar= 1.0 / scalar;
+- __m256 invScalar = _mm256_set1_ps(iScalar);
+- __m256i complexVal, iIntVal;
+- __m128i complexVal128;
+- int8_t* complexVectorPtr = (int8_t*)complexVector;
+-
+- __m256i moveMask = _mm256_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0);
+-
+- for(;number < eighthPoints; number++){
+- complexVal = _mm256_loadu_si256((__m256i*)complexVectorPtr); complexVectorPtr += 32;
+- complexVal = _mm256_shuffle_epi8(complexVal, moveMask);
+- complexVal = _mm256_permute4x64_epi64(complexVal, 0xd8);
+- complexVal128 = _mm256_extracti128_si256(complexVal, 0);
+-
+- iIntVal = _mm256_cvtepi16_epi32(complexVal128);
+- iFloatValue = _mm256_cvtepi32_ps(iIntVal);
+-
+- iFloatValue = _mm256_mul_ps(iFloatValue, invScalar);
+-
+- _mm256_storeu_ps(iBufferPtr, iFloatValue);
+-
+- iBufferPtr += 8;
+- }
+-
+- number = eighthPoints * 8;
+- int16_t* sixteenTComplexVectorPtr = (int16_t*)&complexVector[number];
+- for(; number < num_points; number++){
+- *iBufferPtr++ = ((float)(*sixteenTComplexVectorPtr++)) * iScalar;
+- sixteenTComplexVectorPtr++;
+- }
+-
++ float* iBufferPtr = iBuffer;
++
++ unsigned int number = 0;
++ const unsigned int eighthPoints = num_points / 8;
++
++ __m256 iFloatValue;
++
++ const float iScalar = 1.0 / scalar;
++ __m256 invScalar = _mm256_set1_ps(iScalar);
++ __m256i complexVal, iIntVal;
++ __m128i complexVal128;
++ int8_t* complexVectorPtr = (int8_t*)complexVector;
++
++ __m256i moveMask = _mm256_set_epi8(0x80,
++ 0x80,
++ 0x80,
++ 0x80,
++ 0x80,
++ 0x80,
++ 0x80,
++ 0x80,
++ 13,
++ 12,
++ 9,
++ 8,
++ 5,
++ 4,
++ 1,
++ 0,
++ 0x80,
++ 0x80,
++ 0x80,
++ 0x80,
++ 0x80,
++ 0x80,
++ 0x80,
++ 0x80,
++ 13,
++ 12,
++ 9,
++ 8,
++ 5,
++ 4,
++ 1,
++ 0);
++
++ for (; number < eighthPoints; number++) {
++ complexVal = _mm256_loadu_si256((__m256i*)complexVectorPtr);
++ complexVectorPtr += 32;
++ complexVal = _mm256_shuffle_epi8(complexVal, moveMask);
++ complexVal = _mm256_permute4x64_epi64(complexVal, 0xd8);
++ complexVal128 = _mm256_extracti128_si256(complexVal, 0);
++
++ iIntVal = _mm256_cvtepi16_epi32(complexVal128);
++ iFloatValue = _mm256_cvtepi32_ps(iIntVal);
++
++ iFloatValue = _mm256_mul_ps(iFloatValue, invScalar);
++
++ _mm256_storeu_ps(iBufferPtr, iFloatValue);
++
++ iBufferPtr += 8;
++ }
++
++ number = eighthPoints * 8;
++ int16_t* sixteenTComplexVectorPtr = (int16_t*)&complexVector[number];
++ for (; number < num_points; number++) {
++ *iBufferPtr++ = ((float)(*sixteenTComplexVectorPtr++)) * iScalar;
++ sixteenTComplexVectorPtr++;
++ }
+ }
+ #endif /* LV_HAVE_AVX2 */
+
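+For orientation, a minimal usage sketch of the reformatted
+volk_16ic_s32f_deinterleave_real_32f kernel follows. It is not part of this
+patch: the buffer size, test values and the scalar of 32768.0f are illustrative
+assumptions, and the call goes through the usual VOLK-generated dispatcher;
+volk_malloc, volk_get_alignment, volk_free and lv_cmake are the standard VOLK
+helpers already used by these kernels.
+
+    /* sketch only -- not part of the clang-format patch */
+    #include <stdio.h>
+    #include <volk/volk.h>
+
+    int main(void)
+    {
+        const unsigned int num_points = 8;
+        const size_t alignment = volk_get_alignment();
+
+        /* interleaved 16-bit I/Q input, 32-bit float output for the real part */
+        lv_16sc_t* in = (lv_16sc_t*)volk_malloc(num_points * sizeof(lv_16sc_t), alignment);
+        float* re = (float*)volk_malloc(num_points * sizeof(float), alignment);
+
+        for (unsigned int i = 0; i < num_points; i++) {
+            in[i] = lv_cmake((int16_t)(i * 1000), (int16_t)(-(int16_t)(i * 1000)));
+        }
+
+        /* the kernel multiplies by 1/scalar, so 32768.0f maps full-scale int16 to ~1.0f */
+        volk_16ic_s32f_deinterleave_real_32f(re, in, 32768.0f, num_points);
+
+        for (unsigned int i = 0; i < num_points; i++) {
+            printf("re[%u] = %f\n", i, re[i]);
+        }
+
+        volk_free(in);
+        volk_free(re);
+        return 0;
+    }
+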
+diff --git a/kernels/volk/volk_16ic_s32f_magnitude_32f.h b/kernels/volk/volk_16ic_s32f_magnitude_32f.h
+index bb0459c..c3e3605 100644
+--- a/kernels/volk/volk_16ic_s32f_magnitude_32f.h
++++ b/kernels/volk/volk_16ic_s32f_magnitude_32f.h
+@@ -30,8 +30,8 @@
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+- * void volk_16ic_s32f_magnitude_32f(float* magnitudeVector, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points)
+- * \endcode
++ * void volk_16ic_s32f_magnitude_32f(float* magnitudeVector, const lv_16sc_t*
++ * complexVector, const float scalar, unsigned int num_points) \endcode
+ *
+ * \b Inputs
+ * \li complexVector: The complex input vector of complex 16-bit shorts.
+@@ -55,67 +55,68 @@
+ #ifndef INCLUDED_volk_16ic_s32f_magnitude_32f_a_H
+ #define INCLUDED_volk_16ic_s32f_magnitude_32f_a_H
+
+-#include <volk/volk_common.h>
+ #include <inttypes.h>
+-#include <stdio.h>
+ #include <math.h>
++#include <stdio.h>
++#include <volk/volk_common.h>
+
+ #ifdef LV_HAVE_AVX2
+ #include <immintrin.h>
+
+-static inline void
+-volk_16ic_s32f_magnitude_32f_a_avx2(float* magnitudeVector, const lv_16sc_t* complexVector,
+- const float scalar, unsigned int num_points)
++static inline void volk_16ic_s32f_magnitude_32f_a_avx2(float* magnitudeVector,
++ const lv_16sc_t* complexVector,
++ const float scalar,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+- const unsigned int eighthPoints = num_points / 8;
++ unsigned int number = 0;
++ const unsigned int eighthPoints = num_points / 8;
++
++ const int16_t* complexVectorPtr = (const int16_t*)complexVector;
++ float* magnitudeVectorPtr = magnitudeVector;
+
+- const int16_t* complexVectorPtr = (const int16_t*)complexVector;
+- float* magnitudeVectorPtr = magnitudeVector;
++ __m256 invScalar = _mm256_set1_ps(1.0 / scalar);
+
+- __m256 invScalar = _mm256_set1_ps(1.0/scalar);
++ __m256 cplxValue1, cplxValue2, result;
++ __m256i int1, int2;
++ __m128i short1, short2;
++ __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
+
+- __m256 cplxValue1, cplxValue2, result;
+- __m256i int1, int2;
+- __m128i short1, short2;
+- __m256i idx = _mm256_set_epi32(7,6,3,2,5,4,1,0);
++ for (; number < eighthPoints; number++) {
+
+- for(;number < eighthPoints; number++){
+-
+- int1 = _mm256_loadu_si256((__m256i*)complexVectorPtr);
+- complexVectorPtr += 16;
+- short1 = _mm256_extracti128_si256(int1,0);
+- short2 = _mm256_extracti128_si256(int1,1);
++ int1 = _mm256_loadu_si256((__m256i*)complexVectorPtr);
++ complexVectorPtr += 16;
++ short1 = _mm256_extracti128_si256(int1, 0);
++ short2 = _mm256_extracti128_si256(int1, 1);
+
+- int1 = _mm256_cvtepi16_epi32(short1);
+- int2 = _mm256_cvtepi16_epi32(short2);
+- cplxValue1 = _mm256_cvtepi32_ps(int1);
+- cplxValue2 = _mm256_cvtepi32_ps(int2);
++ int1 = _mm256_cvtepi16_epi32(short1);
++ int2 = _mm256_cvtepi16_epi32(short2);
++ cplxValue1 = _mm256_cvtepi32_ps(int1);
++ cplxValue2 = _mm256_cvtepi32_ps(int2);
+
+- cplxValue1 = _mm256_mul_ps(cplxValue1, invScalar);
+- cplxValue2 = _mm256_mul_ps(cplxValue2, invScalar);
++ cplxValue1 = _mm256_mul_ps(cplxValue1, invScalar);
++ cplxValue2 = _mm256_mul_ps(cplxValue2, invScalar);
+
+- cplxValue1 = _mm256_mul_ps(cplxValue1, cplxValue1); // Square the values
+- cplxValue2 = _mm256_mul_ps(cplxValue2, cplxValue2); // Square the Values
++ cplxValue1 = _mm256_mul_ps(cplxValue1, cplxValue1); // Square the values
++ cplxValue2 = _mm256_mul_ps(cplxValue2, cplxValue2); // Square the Values
+
+- result = _mm256_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values
+- result = _mm256_permutevar8x32_ps(result, idx);
++ result = _mm256_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values
++ result = _mm256_permutevar8x32_ps(result, idx);
+
+- result = _mm256_sqrt_ps(result); // Square root the values
++ result = _mm256_sqrt_ps(result); // Square root the values
+
+- _mm256_store_ps(magnitudeVectorPtr, result);
++ _mm256_store_ps(magnitudeVectorPtr, result);
+
+- magnitudeVectorPtr += 8;
+- }
++ magnitudeVectorPtr += 8;
++ }
+
+- number = eighthPoints * 8;
+- magnitudeVectorPtr = &magnitudeVector[number];
+- complexVectorPtr = (const int16_t*)&complexVector[number];
+- for(; number < num_points; number++){
+- float val1Real = (float)(*complexVectorPtr++) / scalar;
+- float val1Imag = (float)(*complexVectorPtr++) / scalar;
+- *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag));
+- }
++ number = eighthPoints * 8;
++ magnitudeVectorPtr = &magnitudeVector[number];
++ complexVectorPtr = (const int16_t*)&complexVector[number];
++ for (; number < num_points; number++) {
++ float val1Real = (float)(*complexVectorPtr++) / scalar;
++ float val1Imag = (float)(*complexVectorPtr++) / scalar;
++ *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag));
++ }
+ }
+ #endif /* LV_HAVE_AVX2 */
+
+@@ -123,127 +124,129 @@ volk_16ic_s32f_magnitude_32f_a_avx2(float* magnitudeVector, const lv_16sc_t* com
+ #ifdef LV_HAVE_SSE3
+ #include <pmmintrin.h>
+
+-static inline void
+-volk_16ic_s32f_magnitude_32f_a_sse3(float* magnitudeVector, const lv_16sc_t* complexVector,
+- const float scalar, unsigned int num_points)
++static inline void volk_16ic_s32f_magnitude_32f_a_sse3(float* magnitudeVector,
++ const lv_16sc_t* complexVector,
++ const float scalar,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+- const unsigned int quarterPoints = num_points / 4;
++ unsigned int number = 0;
++ const unsigned int quarterPoints = num_points / 4;
+
+- const int16_t* complexVectorPtr = (const int16_t*)complexVector;
+- float* magnitudeVectorPtr = magnitudeVector;
++ const int16_t* complexVectorPtr = (const int16_t*)complexVector;
++ float* magnitudeVectorPtr = magnitudeVector;
+
+- __m128 invScalar = _mm_set_ps1(1.0/scalar);
++ __m128 invScalar = _mm_set_ps1(1.0 / scalar);
+
+- __m128 cplxValue1, cplxValue2, result;
++ __m128 cplxValue1, cplxValue2, result;
+
+- __VOLK_ATTR_ALIGNED(16) float inputFloatBuffer[8];
++ __VOLK_ATTR_ALIGNED(16) float inputFloatBuffer[8];
+
+- for(;number < quarterPoints; number++){
++ for (; number < quarterPoints; number++) {
+
+- inputFloatBuffer[0] = (float)(complexVectorPtr[0]);
+- inputFloatBuffer[1] = (float)(complexVectorPtr[1]);
+- inputFloatBuffer[2] = (float)(complexVectorPtr[2]);
+- inputFloatBuffer[3] = (float)(complexVectorPtr[3]);
++ inputFloatBuffer[0] = (float)(complexVectorPtr[0]);
++ inputFloatBuffer[1] = (float)(complexVectorPtr[1]);
++ inputFloatBuffer[2] = (float)(complexVectorPtr[2]);
++ inputFloatBuffer[3] = (float)(complexVectorPtr[3]);
+
+- inputFloatBuffer[4] = (float)(complexVectorPtr[4]);
+- inputFloatBuffer[5] = (float)(complexVectorPtr[5]);
+- inputFloatBuffer[6] = (float)(complexVectorPtr[6]);
+- inputFloatBuffer[7] = (float)(complexVectorPtr[7]);
++ inputFloatBuffer[4] = (float)(complexVectorPtr[4]);
++ inputFloatBuffer[5] = (float)(complexVectorPtr[5]);
++ inputFloatBuffer[6] = (float)(complexVectorPtr[6]);
++ inputFloatBuffer[7] = (float)(complexVectorPtr[7]);
+
+- cplxValue1 = _mm_load_ps(&inputFloatBuffer[0]);
+- cplxValue2 = _mm_load_ps(&inputFloatBuffer[4]);
++ cplxValue1 = _mm_load_ps(&inputFloatBuffer[0]);
++ cplxValue2 = _mm_load_ps(&inputFloatBuffer[4]);
+
+- complexVectorPtr += 8;
++ complexVectorPtr += 8;
+
+- cplxValue1 = _mm_mul_ps(cplxValue1, invScalar);
+- cplxValue2 = _mm_mul_ps(cplxValue2, invScalar);
++ cplxValue1 = _mm_mul_ps(cplxValue1, invScalar);
++ cplxValue2 = _mm_mul_ps(cplxValue2, invScalar);
+
+- cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values
+- cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values
++ cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values
++ cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values
+
+- result = _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values
++ result = _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values
+
+- result = _mm_sqrt_ps(result); // Square root the values
++ result = _mm_sqrt_ps(result); // Square root the values
+
+- _mm_store_ps(magnitudeVectorPtr, result);
++ _mm_store_ps(magnitudeVectorPtr, result);
+
+- magnitudeVectorPtr += 4;
+- }
++ magnitudeVectorPtr += 4;
++ }
+
+- number = quarterPoints * 4;
+- magnitudeVectorPtr = &magnitudeVector[number];
+- complexVectorPtr = (const int16_t*)&complexVector[number];
+- for(; number < num_points; number++){
+- float val1Real = (float)(*complexVectorPtr++) / scalar;
+- float val1Imag = (float)(*complexVectorPtr++) / scalar;
+- *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag));
+- }
++ number = quarterPoints * 4;
++ magnitudeVectorPtr = &magnitudeVector[number];
++ complexVectorPtr = (const int16_t*)&complexVector[number];
++ for (; number < num_points; number++) {
++ float val1Real = (float)(*complexVectorPtr++) / scalar;
++ float val1Imag = (float)(*complexVectorPtr++) / scalar;
++ *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag));
++ }
+ }
+ #endif /* LV_HAVE_SSE3 */
+
+ #ifdef LV_HAVE_SSE
+ #include <xmmintrin.h>
+
+-static inline void
+-volk_16ic_s32f_magnitude_32f_a_sse(float* magnitudeVector, const lv_16sc_t* complexVector,
+- const float scalar, unsigned int num_points)
++static inline void volk_16ic_s32f_magnitude_32f_a_sse(float* magnitudeVector,
++ const lv_16sc_t* complexVector,
++ const float scalar,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+- const unsigned int quarterPoints = num_points / 4;
++ unsigned int number = 0;
++ const unsigned int quarterPoints = num_points / 4;
+
+- const int16_t* complexVectorPtr = (const int16_t*)complexVector;
+- float* magnitudeVectorPtr = magnitudeVector;
++ const int16_t* complexVectorPtr = (const int16_t*)complexVector;
++ float* magnitudeVectorPtr = magnitudeVector;
+
+- const float iScalar = 1.0 / scalar;
+- __m128 invScalar = _mm_set_ps1(iScalar);
++ const float iScalar = 1.0 / scalar;
++ __m128 invScalar = _mm_set_ps1(iScalar);
+
+- __m128 cplxValue1, cplxValue2, result, re, im;
++ __m128 cplxValue1, cplxValue2, result, re, im;
+
+- __VOLK_ATTR_ALIGNED(16) float inputFloatBuffer[8];
++ __VOLK_ATTR_ALIGNED(16) float inputFloatBuffer[8];
+
+- for(;number < quarterPoints; number++){
+- inputFloatBuffer[0] = (float)(complexVectorPtr[0]);
+- inputFloatBuffer[1] = (float)(complexVectorPtr[1]);
+- inputFloatBuffer[2] = (float)(complexVectorPtr[2]);
+- inputFloatBuffer[3] = (float)(complexVectorPtr[3]);
++ for (; number < quarterPoints; number++) {
++ inputFloatBuffer[0] = (float)(complexVectorPtr[0]);
++ inputFloatBuffer[1] = (float)(complexVectorPtr[1]);
++ inputFloatBuffer[2] = (float)(complexVectorPtr[2]);
++ inputFloatBuffer[3] = (float)(complexVectorPtr[3]);
+
+- inputFloatBuffer[4] = (float)(complexVectorPtr[4]);
+- inputFloatBuffer[5] = (float)(complexVectorPtr[5]);
+- inputFloatBuffer[6] = (float)(complexVectorPtr[6]);
+- inputFloatBuffer[7] = (float)(complexVectorPtr[7]);
++ inputFloatBuffer[4] = (float)(complexVectorPtr[4]);
++ inputFloatBuffer[5] = (float)(complexVectorPtr[5]);
++ inputFloatBuffer[6] = (float)(complexVectorPtr[6]);
++ inputFloatBuffer[7] = (float)(complexVectorPtr[7]);
+
+- cplxValue1 = _mm_load_ps(&inputFloatBuffer[0]);
+- cplxValue2 = _mm_load_ps(&inputFloatBuffer[4]);
++ cplxValue1 = _mm_load_ps(&inputFloatBuffer[0]);
++ cplxValue2 = _mm_load_ps(&inputFloatBuffer[4]);
+
+- re = _mm_shuffle_ps(cplxValue1, cplxValue2, 0x88);
+- im = _mm_shuffle_ps(cplxValue1, cplxValue2, 0xdd);
++ re = _mm_shuffle_ps(cplxValue1, cplxValue2, 0x88);
++ im = _mm_shuffle_ps(cplxValue1, cplxValue2, 0xdd);
+
+- complexVectorPtr += 8;
++ complexVectorPtr += 8;
+
+- cplxValue1 = _mm_mul_ps(re, invScalar);
+- cplxValue2 = _mm_mul_ps(im, invScalar);
++ cplxValue1 = _mm_mul_ps(re, invScalar);
++ cplxValue2 = _mm_mul_ps(im, invScalar);
+
+- cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values
+- cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values
++ cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values
++ cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values
+
+- result = _mm_add_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values
++ result = _mm_add_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values
+
+- result = _mm_sqrt_ps(result); // Square root the values
++ result = _mm_sqrt_ps(result); // Square root the values
+
+- _mm_store_ps(magnitudeVectorPtr, result);
++ _mm_store_ps(magnitudeVectorPtr, result);
+
+- magnitudeVectorPtr += 4;
+- }
++ magnitudeVectorPtr += 4;
++ }
+
+- number = quarterPoints * 4;
+- magnitudeVectorPtr = &magnitudeVector[number];
+- complexVectorPtr = (const int16_t*)&complexVector[number];
+- for(; number < num_points; number++){
+- float val1Real = (float)(*complexVectorPtr++) * iScalar;
+- float val1Imag = (float)(*complexVectorPtr++) * iScalar;
+- *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag));
+- }
++ number = quarterPoints * 4;
++ magnitudeVectorPtr = &magnitudeVector[number];
++ complexVectorPtr = (const int16_t*)&complexVector[number];
++ for (; number < num_points; number++) {
++ float val1Real = (float)(*complexVectorPtr++) * iScalar;
++ float val1Imag = (float)(*complexVectorPtr++) * iScalar;
++ *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag));
++ }
+ }
+
+
+@@ -251,33 +254,37 @@ volk_16ic_s32f_magnitude_32f_a_sse(float* magnitudeVector, const lv_16sc_t* comp
+
+ #ifdef LV_HAVE_GENERIC
+
+-static inline void
+-volk_16ic_s32f_magnitude_32f_generic(float* magnitudeVector, const lv_16sc_t* complexVector,
+- const float scalar, unsigned int num_points)
++static inline void volk_16ic_s32f_magnitude_32f_generic(float* magnitudeVector,
++ const lv_16sc_t* complexVector,
++ const float scalar,
++ unsigned int num_points)
+ {
+- const int16_t* complexVectorPtr = (const int16_t*)complexVector;
+- float* magnitudeVectorPtr = magnitudeVector;
+- unsigned int number = 0;
+- const float invScalar = 1.0 / scalar;
+- for(number = 0; number < num_points; number++){
+- float real = ( (float) (*complexVectorPtr++)) * invScalar;
+- float imag = ( (float) (*complexVectorPtr++)) * invScalar;
+- *magnitudeVectorPtr++ = sqrtf((real*real) + (imag*imag));
+- }
++ const int16_t* complexVectorPtr = (const int16_t*)complexVector;
++ float* magnitudeVectorPtr = magnitudeVector;
++ unsigned int number = 0;
++ const float invScalar = 1.0 / scalar;
++ for (number = 0; number < num_points; number++) {
++ float real = ((float)(*complexVectorPtr++)) * invScalar;
++ float imag = ((float)(*complexVectorPtr++)) * invScalar;
++ *magnitudeVectorPtr++ = sqrtf((real * real) + (imag * imag));
++ }
+ }
+ #endif /* LV_HAVE_GENERIC */
+
+ #ifdef LV_HAVE_ORC_DISABLED
+
+-extern void
+-volk_16ic_s32f_magnitude_32f_a_orc_impl(float* magnitudeVector, const lv_16sc_t* complexVector,
+- const float scalar, unsigned int num_points);
++extern void volk_16ic_s32f_magnitude_32f_a_orc_impl(float* magnitudeVector,
++ const lv_16sc_t* complexVector,
++ const float scalar,
++ unsigned int num_points);
+
+-static inline void
+-volk_16ic_s32f_magnitude_32f_u_orc(float* magnitudeVector, const lv_16sc_t* complexVector,
+- const float scalar, unsigned int num_points)
++static inline void volk_16ic_s32f_magnitude_32f_u_orc(float* magnitudeVector,
++ const lv_16sc_t* complexVector,
++ const float scalar,
++ unsigned int num_points)
+ {
+- volk_16ic_s32f_magnitude_32f_a_orc_impl(magnitudeVector, complexVector, scalar, num_points);
++ volk_16ic_s32f_magnitude_32f_a_orc_impl(
++ magnitudeVector, complexVector, scalar, num_points);
+ }
+ #endif /* LV_HAVE_ORC */
+
+@@ -287,69 +294,69 @@ volk_16ic_s32f_magnitude_32f_u_orc(float* magnitudeVector, const lv_16sc_t* comp
+ #ifndef INCLUDED_volk_16ic_s32f_magnitude_32f_u_H
+ #define INCLUDED_volk_16ic_s32f_magnitude_32f_u_H
+
+-#include <volk/volk_common.h>
+ #include <inttypes.h>
+-#include <stdio.h>
+ #include <math.h>
++#include <stdio.h>
++#include <volk/volk_common.h>
+
+ #ifdef LV_HAVE_AVX2
+ #include <immintrin.h>
+
+-static inline void
+-volk_16ic_s32f_magnitude_32f_u_avx2(float* magnitudeVector, const lv_16sc_t* complexVector,
+- const float scalar, unsigned int num_points)
++static inline void volk_16ic_s32f_magnitude_32f_u_avx2(float* magnitudeVector,
++ const lv_16sc_t* complexVector,
++ const float scalar,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+- const unsigned int eighthPoints = num_points / 8;
++ unsigned int number = 0;
++ const unsigned int eighthPoints = num_points / 8;
++
++ const int16_t* complexVectorPtr = (const int16_t*)complexVector;
++ float* magnitudeVectorPtr = magnitudeVector;
+
+- const int16_t* complexVectorPtr = (const int16_t*)complexVector;
+- float* magnitudeVectorPtr = magnitudeVector;
++ __m256 invScalar = _mm256_set1_ps(1.0 / scalar);
+
+- __m256 invScalar = _mm256_set1_ps(1.0/scalar);
++ __m256 cplxValue1, cplxValue2, result;
++ __m256i int1, int2;
++ __m128i short1, short2;
++ __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
+
+- __m256 cplxValue1, cplxValue2, result;
+- __m256i int1, int2;
+- __m128i short1, short2;
+- __m256i idx = _mm256_set_epi32(7,6,3,2,5,4,1,0);
++ for (; number < eighthPoints; number++) {
+
+- for(;number < eighthPoints; number++){
+-
+- int1 = _mm256_loadu_si256((__m256i*)complexVectorPtr);
+- complexVectorPtr += 16;
+- short1 = _mm256_extracti128_si256(int1,0);
+- short2 = _mm256_extracti128_si256(int1,1);
++ int1 = _mm256_loadu_si256((__m256i*)complexVectorPtr);
++ complexVectorPtr += 16;
++ short1 = _mm256_extracti128_si256(int1, 0);
++ short2 = _mm256_extracti128_si256(int1, 1);
+
+- int1 = _mm256_cvtepi16_epi32(short1);
+- int2 = _mm256_cvtepi16_epi32(short2);
+- cplxValue1 = _mm256_cvtepi32_ps(int1);
+- cplxValue2 = _mm256_cvtepi32_ps(int2);
++ int1 = _mm256_cvtepi16_epi32(short1);
++ int2 = _mm256_cvtepi16_epi32(short2);
++ cplxValue1 = _mm256_cvtepi32_ps(int1);
++ cplxValue2 = _mm256_cvtepi32_ps(int2);
+
+- cplxValue1 = _mm256_mul_ps(cplxValue1, invScalar);
+- cplxValue2 = _mm256_mul_ps(cplxValue2, invScalar);
++ cplxValue1 = _mm256_mul_ps(cplxValue1, invScalar);
++ cplxValue2 = _mm256_mul_ps(cplxValue2, invScalar);
+
+- cplxValue1 = _mm256_mul_ps(cplxValue1, cplxValue1); // Square the values
+- cplxValue2 = _mm256_mul_ps(cplxValue2, cplxValue2); // Square the Values
++ cplxValue1 = _mm256_mul_ps(cplxValue1, cplxValue1); // Square the values
++ cplxValue2 = _mm256_mul_ps(cplxValue2, cplxValue2); // Square the Values
+
+- result = _mm256_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values
+- result = _mm256_permutevar8x32_ps(result, idx);
++ result = _mm256_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values
++ result = _mm256_permutevar8x32_ps(result, idx);
+
+- result = _mm256_sqrt_ps(result); // Square root the values
++ result = _mm256_sqrt_ps(result); // Square root the values
+
+- _mm256_storeu_ps(magnitudeVectorPtr, result);
++ _mm256_storeu_ps(magnitudeVectorPtr, result);
+
+- magnitudeVectorPtr += 8;
+- }
++ magnitudeVectorPtr += 8;
++ }
+
+- number = eighthPoints * 8;
+- magnitudeVectorPtr = &magnitudeVector[number];
+- complexVectorPtr = (const int16_t*)&complexVector[number];
+- for(; number < num_points; number++){
+- float val1Real = (float)(*complexVectorPtr++) / scalar;
+- float val1Imag = (float)(*complexVectorPtr++) / scalar;
+- *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag));
+- }
++ number = eighthPoints * 8;
++ magnitudeVectorPtr = &magnitudeVector[number];
++ complexVectorPtr = (const int16_t*)&complexVector[number];
++ for (; number < num_points; number++) {
++ float val1Real = (float)(*complexVectorPtr++) / scalar;
++ float val1Imag = (float)(*complexVectorPtr++) / scalar;
++ *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag));
++ }
+ }
+ #endif /* LV_HAVE_AVX2 */
+
+ #endif /* INCLUDED_volk_16ic_s32f_magnitude_32f_u_H */
+-
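+A short sketch of how the magnitude kernel above is typically driven through
+its dispatcher, whose prototype appears in the docblock of this file. It
+assumes the aligned in buffer and num_points from the earlier deinterleave
+sketch; the scalar of 32768.0f is again an illustrative choice.
+
+    /* sketch only: mag[i] = sqrt((re/scalar)^2 + (im/scalar)^2) per the docblock above */
+    float* mag = (float*)volk_malloc(num_points * sizeof(float), volk_get_alignment());
+    volk_16ic_s32f_magnitude_32f(mag, in, 32768.0f, num_points);
+    volk_free(mag);
+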
+diff --git a/kernels/volk/volk_16ic_x2_dot_prod_16ic.h b/kernels/volk/volk_16ic_x2_dot_prod_16ic.h
+index ae10cff..a1a0e8c 100644
+--- a/kernels/volk/volk_16ic_x2_dot_prod_16ic.h
++++ b/kernels/volk/volk_16ic_x2_dot_prod_16ic.h
+@@ -25,18 +25,20 @@
+ *
+ * \b Overview
+ *
+- * Multiplies two input complex vectors (16-bit integer each component) and accumulates them,
+- * storing the result. Results are saturated so never go beyond the limits of the data type.
++ * Multiplies two input complex vectors (16-bit integer each component) and accumulates
++ * them, storing the result. Results are saturated so never go beyond the limits of the
++ * data type.
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+- * void volk_16ic_x2_dot_prod_16ic(lv_16sc_t* result, const lv_16sc_t* in_a, const lv_16sc_t* in_b, unsigned int num_points);
+- * \endcode
++ * void volk_16ic_x2_dot_prod_16ic(lv_16sc_t* result, const lv_16sc_t* in_a, const
++ * lv_16sc_t* in_b, unsigned int num_points); \endcode
+ *
+ * \b Inputs
+ * \li in_a: One of the vectors to be multiplied and accumulated.
+ * \li in_b: The other vector to be multiplied and accumulated.
+- * \li num_points: Number of complex values to be multiplied together, accumulated and stored into \p result
++ * \li num_points: Number of complex values to be multiplied together, accumulated and
++ * stored into \p result
+ *
+ * \b Outputs
+ * \li result: Value of the accumulated result.
+@@ -46,22 +48,25 @@
+ #ifndef INCLUDED_volk_16ic_x2_dot_prod_16ic_H
+ #define INCLUDED_volk_16ic_x2_dot_prod_16ic_H
+
++#include <volk/saturation_arithmetic.h>
+ #include <volk/volk_common.h>
+ #include <volk/volk_complex.h>
+-#include <volk/saturation_arithmetic.h>
+
+
+ #ifdef LV_HAVE_GENERIC
+
+-static inline void volk_16ic_x2_dot_prod_16ic_generic(lv_16sc_t* result, const lv_16sc_t* in_a, const lv_16sc_t* in_b, unsigned int num_points)
++static inline void volk_16ic_x2_dot_prod_16ic_generic(lv_16sc_t* result,
++ const lv_16sc_t* in_a,
++ const lv_16sc_t* in_b,
++ unsigned int num_points)
+ {
+ result[0] = lv_cmake((int16_t)0, (int16_t)0);
+ unsigned int n;
+- for (n = 0; n < num_points; n++)
+- {
+- lv_16sc_t tmp = in_a[n] * in_b[n];
+- result[0] = lv_cmake(sat_adds16i(lv_creal(result[0]), lv_creal(tmp)), sat_adds16i(lv_cimag(result[0]), lv_cimag(tmp) ));
+- }
++ for (n = 0; n < num_points; n++) {
++ lv_16sc_t tmp = in_a[n] * in_b[n];
++ result[0] = lv_cmake(sat_adds16i(lv_creal(result[0]), lv_creal(tmp)),
++ sat_adds16i(lv_cimag(result[0]), lv_cimag(tmp)));
++ }
+ }
+
+ #endif /*LV_HAVE_GENERIC*/
+@@ -70,7 +75,10 @@ static inline void volk_16ic_x2_dot_prod_16ic_generic(lv_16sc_t* result, const l
+ #ifdef LV_HAVE_SSE2
+ #include <emmintrin.h>
+
+-static inline void volk_16ic_x2_dot_prod_16ic_a_sse2(lv_16sc_t* out, const lv_16sc_t* in_a, const lv_16sc_t* in_b, unsigned int num_points)
++static inline void volk_16ic_x2_dot_prod_16ic_a_sse2(lv_16sc_t* out,
++ const lv_16sc_t* in_a,
++ const lv_16sc_t* in_b,
++ unsigned int num_points)
+ {
+ lv_16sc_t dotProduct = lv_cmake((int16_t)0, (int16_t)0);
+
+@@ -81,62 +89,67 @@ static inline void volk_16ic_x2_dot_prod_16ic_a_sse2(lv_16sc_t* out, const lv_16
+ const lv_16sc_t* _in_b = in_b;
+ lv_16sc_t* _out = out;
+
+- if (sse_iters > 0)
+- {
+- __m128i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl, a_sl, realcacc, imagcacc;
+- __VOLK_ATTR_ALIGNED(16) lv_16sc_t dotProductVector[4];
++ if (sse_iters > 0) {
++ __m128i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl, a_sl,
++ realcacc, imagcacc;
++ __VOLK_ATTR_ALIGNED(16) lv_16sc_t dotProductVector[4];
+
+- realcacc = _mm_setzero_si128();
+- imagcacc = _mm_setzero_si128();
++ realcacc = _mm_setzero_si128();
++ imagcacc = _mm_setzero_si128();
+
+- mask_imag = _mm_set_epi8(0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0);
+- mask_real = _mm_set_epi8(0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF);
++ mask_imag = _mm_set_epi8(
++ 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0);
++ mask_real = _mm_set_epi8(
++ 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF);
+
+- for(number = 0; number < sse_iters; number++)
+- {
+- // a[127:0]=[a3.i,a3.r,a2.i,a2.r,a1.i,a1.r,a0.i,a0.r]
+- a = _mm_load_si128((__m128i*)_in_a); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg
+- __VOLK_PREFETCH(_in_a + 8);
+- b = _mm_load_si128((__m128i*)_in_b);
+- __VOLK_PREFETCH(_in_b + 8);
+- c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, ....
++ for (number = 0; number < sse_iters; number++) {
++ // a[127:0]=[a3.i,a3.r,a2.i,a2.r,a1.i,a1.r,a0.i,a0.r]
++ a = _mm_load_si128(
++ (__m128i*)_in_a); // load (2 byte imag, 2 byte real) x 4 into 128 bits reg
++ __VOLK_PREFETCH(_in_a + 8);
++ b = _mm_load_si128((__m128i*)_in_b);
++ __VOLK_PREFETCH(_in_b + 8);
++ c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, ....
+
+- c_sr = _mm_srli_si128(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst.
+- real = _mm_subs_epi16(c, c_sr);
++ c_sr = _mm_srli_si128(c, 2); // Shift a right by imm8 bytes while shifting in
++ // zeros, and store the results in dst.
++ real = _mm_subs_epi16(c, c_sr);
+
+- b_sl = _mm_slli_si128(b, 2); // b3.r, b2.i ....
+- a_sl = _mm_slli_si128(a, 2); // a3.r, a2.i ....
++ b_sl = _mm_slli_si128(b, 2); // b3.r, b2.i ....
++ a_sl = _mm_slli_si128(a, 2); // a3.r, a2.i ....
+
+- imag1 = _mm_mullo_epi16(a, b_sl); // a3.i*b3.r, ....
+- imag2 = _mm_mullo_epi16(b, a_sl); // b3.i*a3.r, ....
++ imag1 = _mm_mullo_epi16(a, b_sl); // a3.i*b3.r, ....
++ imag2 = _mm_mullo_epi16(b, a_sl); // b3.i*a3.r, ....
+
+- imag = _mm_adds_epi16(imag1, imag2); //with saturation arithmetic!
++ imag = _mm_adds_epi16(imag1, imag2); // with saturation arithmetic!
+
+- realcacc = _mm_adds_epi16(realcacc, real);
+- imagcacc = _mm_adds_epi16(imagcacc, imag);
++ realcacc = _mm_adds_epi16(realcacc, real);
++ imagcacc = _mm_adds_epi16(imagcacc, imag);
+
+- _in_a += 4;
+- _in_b += 4;
+- }
++ _in_a += 4;
++ _in_b += 4;
++ }
+
+- realcacc = _mm_and_si128(realcacc, mask_real);
+- imagcacc = _mm_and_si128(imagcacc, mask_imag);
++ realcacc = _mm_and_si128(realcacc, mask_real);
++ imagcacc = _mm_and_si128(imagcacc, mask_imag);
+
+- a = _mm_or_si128(realcacc, imagcacc);
++ a = _mm_or_si128(realcacc, imagcacc);
+
+- _mm_store_si128((__m128i*)dotProductVector, a); // Store the results back into the dot product vector
++ _mm_store_si128((__m128i*)dotProductVector,
++ a); // Store the results back into the dot product vector
+
+- for (number = 0; number < 4; ++number)
+- {
+- dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[number])), sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[number])));
+- }
++ for (number = 0; number < 4; ++number) {
++ dotProduct = lv_cmake(
++ sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[number])),
++ sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[number])));
+ }
++ }
+
+- for (number = 0; number < (num_points % 4); ++number)
+- {
+- lv_16sc_t tmp = (*_in_a++) * (*_in_b++);
+- dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(tmp)), sat_adds16i(lv_cimag(dotProduct), lv_cimag(tmp)));
+- }
++ for (number = 0; number < (num_points % 4); ++number) {
++ lv_16sc_t tmp = (*_in_a++) * (*_in_b++);
++ dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(tmp)),
++ sat_adds16i(lv_cimag(dotProduct), lv_cimag(tmp)));
++ }
+
+ *_out = dotProduct;
+ }
+@@ -147,7 +160,10 @@ static inline void volk_16ic_x2_dot_prod_16ic_a_sse2(lv_16sc_t* out, const lv_16
+ #ifdef LV_HAVE_SSE2
+ #include <emmintrin.h>
+
+-static inline void volk_16ic_x2_dot_prod_16ic_u_sse2(lv_16sc_t* out, const lv_16sc_t* in_a, const lv_16sc_t* in_b, unsigned int num_points)
++static inline void volk_16ic_x2_dot_prod_16ic_u_sse2(lv_16sc_t* out,
++ const lv_16sc_t* in_a,
++ const lv_16sc_t* in_b,
++ unsigned int num_points)
+ {
+ lv_16sc_t dotProduct = lv_cmake((int16_t)0, (int16_t)0);
+
+@@ -158,62 +174,67 @@ static inline void volk_16ic_x2_dot_prod_16ic_u_sse2(lv_16sc_t* out, const lv_16
+ lv_16sc_t* _out = out;
+ unsigned int number;
+
+- if (sse_iters > 0)
+- {
+- __m128i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl, a_sl, realcacc, imagcacc, result;
+- __VOLK_ATTR_ALIGNED(16) lv_16sc_t dotProductVector[4];
++ if (sse_iters > 0) {
++ __m128i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl, a_sl,
++ realcacc, imagcacc, result;
++ __VOLK_ATTR_ALIGNED(16) lv_16sc_t dotProductVector[4];
+
+- realcacc = _mm_setzero_si128();
+- imagcacc = _mm_setzero_si128();
++ realcacc = _mm_setzero_si128();
++ imagcacc = _mm_setzero_si128();
+
+- mask_imag = _mm_set_epi8(0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0);
+- mask_real = _mm_set_epi8(0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF);
++ mask_imag = _mm_set_epi8(
++ 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0);
++ mask_real = _mm_set_epi8(
++ 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF);
+
+- for(number = 0; number < sse_iters; number++)
+- {
+- // a[127:0]=[a3.i,a3.r,a2.i,a2.r,a1.i,a1.r,a0.i,a0.r]
+- a = _mm_loadu_si128((__m128i*)_in_a); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg
+- __VOLK_PREFETCH(_in_a + 8);
+- b = _mm_loadu_si128((__m128i*)_in_b);
+- __VOLK_PREFETCH(_in_b + 8);
+- c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, ....
++ for (number = 0; number < sse_iters; number++) {
++ // a[127:0]=[a3.i,a3.r,a2.i,a2.r,a1.i,a1.r,a0.i,a0.r]
++ a = _mm_loadu_si128(
++ (__m128i*)_in_a); // load (2 byte imag, 2 byte real) x 4 into 128 bits reg
++ __VOLK_PREFETCH(_in_a + 8);
++ b = _mm_loadu_si128((__m128i*)_in_b);
++ __VOLK_PREFETCH(_in_b + 8);
++ c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, ....
+
+- c_sr = _mm_srli_si128(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst.
+- real = _mm_subs_epi16(c, c_sr);
++ c_sr = _mm_srli_si128(c, 2); // Shift a right by imm8 bytes while shifting in
++ // zeros, and store the results in dst.
++ real = _mm_subs_epi16(c, c_sr);
+
+- b_sl = _mm_slli_si128(b, 2); // b3.r, b2.i ....
+- a_sl = _mm_slli_si128(a, 2); // a3.r, a2.i ....
++ b_sl = _mm_slli_si128(b, 2); // b3.r, b2.i ....
++ a_sl = _mm_slli_si128(a, 2); // a3.r, a2.i ....
+
+- imag1 = _mm_mullo_epi16(a, b_sl); // a3.i*b3.r, ....
+- imag2 = _mm_mullo_epi16(b, a_sl); // b3.i*a3.r, ....
++ imag1 = _mm_mullo_epi16(a, b_sl); // a3.i*b3.r, ....
++ imag2 = _mm_mullo_epi16(b, a_sl); // b3.i*a3.r, ....
+
+- imag = _mm_adds_epi16(imag1, imag2); //with saturation arithmetic!
++ imag = _mm_adds_epi16(imag1, imag2); // with saturation arithmetic!
+
+- realcacc = _mm_adds_epi16(realcacc, real);
+- imagcacc = _mm_adds_epi16(imagcacc, imag);
++ realcacc = _mm_adds_epi16(realcacc, real);
++ imagcacc = _mm_adds_epi16(imagcacc, imag);
+
+- _in_a += 4;
+- _in_b += 4;
+- }
++ _in_a += 4;
++ _in_b += 4;
++ }
+
+- realcacc = _mm_and_si128(realcacc, mask_real);
+- imagcacc = _mm_and_si128(imagcacc, mask_imag);
++ realcacc = _mm_and_si128(realcacc, mask_real);
++ imagcacc = _mm_and_si128(imagcacc, mask_imag);
+
+- result = _mm_or_si128(realcacc, imagcacc);
++ result = _mm_or_si128(realcacc, imagcacc);
+
+- _mm_storeu_si128((__m128i*)dotProductVector, result); // Store the results back into the dot product vector
++ _mm_storeu_si128((__m128i*)dotProductVector,
++ result); // Store the results back into the dot product vector
+
+- for (number = 0; number < 4; ++number)
+- {
+- dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[number])), sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[number])));
+- }
++ for (number = 0; number < 4; ++number) {
++ dotProduct = lv_cmake(
++ sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[number])),
++ sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[number])));
+ }
++ }
+
+- for (number = 0; number < (num_points % 4); ++number)
+- {
+- lv_16sc_t tmp = (*_in_a++) * (*_in_b++);
+- dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(tmp)), sat_adds16i(lv_cimag(dotProduct), lv_cimag(tmp)));
+- }
++ for (number = 0; number < (num_points % 4); ++number) {
++ lv_16sc_t tmp = (*_in_a++) * (*_in_b++);
++ dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(tmp)),
++ sat_adds16i(lv_cimag(dotProduct), lv_cimag(tmp)));
++ }
+
+ *_out = dotProduct;
+ }
+@@ -223,7 +244,10 @@ static inline void volk_16ic_x2_dot_prod_16ic_u_sse2(lv_16sc_t* out, const lv_16
+ #ifdef LV_HAVE_AVX2
+ #include <immintrin.h>
+
+-static inline void volk_16ic_x2_dot_prod_16ic_u_axv2(lv_16sc_t* out, const lv_16sc_t* in_a, const lv_16sc_t* in_b, unsigned int num_points)
++static inline void volk_16ic_x2_dot_prod_16ic_u_axv2(lv_16sc_t* out,
++ const lv_16sc_t* in_a,
++ const lv_16sc_t* in_b,
++ unsigned int num_points)
+ {
+ lv_16sc_t dotProduct = lv_cmake((int16_t)0, (int16_t)0);
+
+@@ -234,62 +258,126 @@ static inline void volk_16ic_x2_dot_prod_16ic_u_axv2(lv_16sc_t* out, const lv_16
+ lv_16sc_t* _out = out;
+ unsigned int number;
+
+- if (avx_iters > 0)
+- {
+- __m256i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl, a_sl, realcacc, imagcacc, result;
+- __VOLK_ATTR_ALIGNED(32) lv_16sc_t dotProductVector[8];
+-
+- realcacc = _mm256_setzero_si256();
+- imagcacc = _mm256_setzero_si256();
+-
+- mask_imag = _mm256_set_epi8(0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0);
+- mask_real = _mm256_set_epi8(0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF);
+-
+- for(number = 0; number < avx_iters; number++)
+- {
+- a = _mm256_loadu_si256((__m256i*)_in_a);
+- __VOLK_PREFETCH(_in_a + 16);
+- b = _mm256_loadu_si256((__m256i*)_in_b);
+- __VOLK_PREFETCH(_in_b + 16);
+- c = _mm256_mullo_epi16(a, b);
+-
+- c_sr = _mm256_srli_si256(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst.
+- real = _mm256_subs_epi16(c, c_sr);
+-
+- b_sl = _mm256_slli_si256(b, 2);
+- a_sl = _mm256_slli_si256(a, 2);
+-
+- imag1 = _mm256_mullo_epi16(a, b_sl);
+- imag2 = _mm256_mullo_epi16(b, a_sl);
+-
+- imag = _mm256_adds_epi16(imag1, imag2); //with saturation arithmetic!
+-
+- realcacc = _mm256_adds_epi16(realcacc, real);
+- imagcacc = _mm256_adds_epi16(imagcacc, imag);
+-
+- _in_a += 8;
+- _in_b += 8;
+- }
++ if (avx_iters > 0) {
++ __m256i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl, a_sl,
++ realcacc, imagcacc, result;
++ __VOLK_ATTR_ALIGNED(32) lv_16sc_t dotProductVector[8];
++
++ realcacc = _mm256_setzero_si256();
++ imagcacc = _mm256_setzero_si256();
++
++ mask_imag = _mm256_set_epi8(0xFF,
++ 0xFF,
++ 0,
++ 0,
++ 0xFF,
++ 0xFF,
++ 0,
++ 0,
++ 0xFF,
++ 0xFF,
++ 0,
++ 0,
++ 0xFF,
++ 0xFF,
++ 0,
++ 0,
++ 0xFF,
++ 0xFF,
++ 0,
++ 0,
++ 0xFF,
++ 0xFF,
++ 0,
++ 0,
++ 0xFF,
++ 0xFF,
++ 0,
++ 0,
++ 0xFF,
++ 0xFF,
++ 0,
++ 0);
++ mask_real = _mm256_set_epi8(0,
++ 0,
++ 0xFF,
++ 0xFF,
++ 0,
++ 0,
++ 0xFF,
++ 0xFF,
++ 0,
++ 0,
++ 0xFF,
++ 0xFF,
++ 0,
++ 0,
++ 0xFF,
++ 0xFF,
++ 0,
++ 0,
++ 0xFF,
++ 0xFF,
++ 0,
++ 0,
++ 0xFF,
++ 0xFF,
++ 0,
++ 0,
++ 0xFF,
++ 0xFF,
++ 0,
++ 0,
++ 0xFF,
++ 0xFF);
++
++ for (number = 0; number < avx_iters; number++) {
++ a = _mm256_loadu_si256((__m256i*)_in_a);
++ __VOLK_PREFETCH(_in_a + 16);
++ b = _mm256_loadu_si256((__m256i*)_in_b);
++ __VOLK_PREFETCH(_in_b + 16);
++ c = _mm256_mullo_epi16(a, b);
++
++ c_sr = _mm256_srli_si256(c, 2); // Shift a right by imm8 bytes while shifting
++ // in zeros, and store the results in dst.
++ real = _mm256_subs_epi16(c, c_sr);
++
++ b_sl = _mm256_slli_si256(b, 2);
++ a_sl = _mm256_slli_si256(a, 2);
++
++ imag1 = _mm256_mullo_epi16(a, b_sl);
++ imag2 = _mm256_mullo_epi16(b, a_sl);
++
++ imag = _mm256_adds_epi16(imag1, imag2); // with saturation arithmetic!
++
++ realcacc = _mm256_adds_epi16(realcacc, real);
++ imagcacc = _mm256_adds_epi16(imagcacc, imag);
++
++ _in_a += 8;
++ _in_b += 8;
++ }
+
+- realcacc = _mm256_and_si256(realcacc, mask_real);
+- imagcacc = _mm256_and_si256(imagcacc, mask_imag);
++ realcacc = _mm256_and_si256(realcacc, mask_real);
++ imagcacc = _mm256_and_si256(imagcacc, mask_imag);
+
+- result = _mm256_or_si256(realcacc, imagcacc);
++ result = _mm256_or_si256(realcacc, imagcacc);
+
+- _mm256_storeu_si256((__m256i*)dotProductVector, result); // Store the results back into the dot product vector
+- _mm256_zeroupper();
++ _mm256_storeu_si256((__m256i*)dotProductVector,
++ result); // Store the results back into the dot product vector
++ _mm256_zeroupper();
+
+- for (number = 0; number < 8; ++number)
+- {
+- dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[number])), sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[number])));
+- }
++ for (number = 0; number < 8; ++number) {
++ dotProduct = lv_cmake(
++ sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[number])),
++ sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[number])));
+ }
++ }
+
+- for (number = 0; number < (num_points % 8); ++number)
+- {
+- lv_16sc_t tmp = (*_in_a++) * (*_in_b++);
+- dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(tmp)), sat_adds16i(lv_cimag(dotProduct), lv_cimag(tmp)));
+- }
++ for (number = 0; number < (num_points % 8); ++number) {
++ lv_16sc_t tmp = (*_in_a++) * (*_in_b++);
++ dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(tmp)),
++ sat_adds16i(lv_cimag(dotProduct), lv_cimag(tmp)));
++ }
+
+ *_out = dotProduct;
+ }
+@@ -299,7 +387,10 @@ static inline void volk_16ic_x2_dot_prod_16ic_u_axv2(lv_16sc_t* out, const lv_16
+ #ifdef LV_HAVE_AVX2
+ #include <immintrin.h>
+
+-static inline void volk_16ic_x2_dot_prod_16ic_a_axv2(lv_16sc_t* out, const lv_16sc_t* in_a, const lv_16sc_t* in_b, unsigned int num_points)
++static inline void volk_16ic_x2_dot_prod_16ic_a_axv2(lv_16sc_t* out,
++ const lv_16sc_t* in_a,
++ const lv_16sc_t* in_b,
++ unsigned int num_points)
+ {
+ lv_16sc_t dotProduct = lv_cmake((int16_t)0, (int16_t)0);
+
+@@ -310,62 +401,126 @@ static inline void volk_16ic_x2_dot_prod_16ic_a_axv2(lv_16sc_t* out, const lv_16
+ lv_16sc_t* _out = out;
+ unsigned int number;
+
+- if (avx_iters > 0)
+- {
+- __m256i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl, a_sl, realcacc, imagcacc, result;
+- __VOLK_ATTR_ALIGNED(32) lv_16sc_t dotProductVector[8];
+-
+- realcacc = _mm256_setzero_si256();
+- imagcacc = _mm256_setzero_si256();
+-
+- mask_imag = _mm256_set_epi8(0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0);
+- mask_real = _mm256_set_epi8(0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF);
+-
+- for(number = 0; number < avx_iters; number++)
+- {
+- a = _mm256_load_si256((__m256i*)_in_a);
+- __VOLK_PREFETCH(_in_a + 16);
+- b = _mm256_load_si256((__m256i*)_in_b);
+- __VOLK_PREFETCH(_in_b + 16);
+- c = _mm256_mullo_epi16(a, b);
+-
+- c_sr = _mm256_srli_si256(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst.
+- real = _mm256_subs_epi16(c, c_sr);
+-
+- b_sl = _mm256_slli_si256(b, 2);
+- a_sl = _mm256_slli_si256(a, 2);
+-
+- imag1 = _mm256_mullo_epi16(a, b_sl);
+- imag2 = _mm256_mullo_epi16(b, a_sl);
+-
+- imag = _mm256_adds_epi16(imag1, imag2); //with saturation arithmetic!
+-
+- realcacc = _mm256_adds_epi16(realcacc, real);
+- imagcacc = _mm256_adds_epi16(imagcacc, imag);
+-
+- _in_a += 8;
+- _in_b += 8;
+- }
++ if (avx_iters > 0) {
++ __m256i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl, a_sl,
++ realcacc, imagcacc, result;
++ __VOLK_ATTR_ALIGNED(32) lv_16sc_t dotProductVector[8];
++
++ realcacc = _mm256_setzero_si256();
++ imagcacc = _mm256_setzero_si256();
++
++ mask_imag = _mm256_set_epi8(0xFF,
++ 0xFF,
++ 0,
++ 0,
++ 0xFF,
++ 0xFF,
++ 0,
++ 0,
++ 0xFF,
++ 0xFF,
++ 0,
++ 0,
++ 0xFF,
++ 0xFF,
++ 0,
++ 0,
++ 0xFF,
++ 0xFF,
++ 0,
++ 0,
++ 0xFF,
++ 0xFF,
++ 0,
++ 0,
++ 0xFF,
++ 0xFF,
++ 0,
++ 0,
++ 0xFF,
++ 0xFF,
++ 0,
++ 0);
++ mask_real = _mm256_set_epi8(0,
++ 0,
++ 0xFF,
++ 0xFF,
++ 0,
++ 0,
++ 0xFF,
++ 0xFF,
++ 0,
++ 0,
++ 0xFF,
++ 0xFF,
++ 0,
++ 0,
++ 0xFF,
++ 0xFF,
++ 0,
++ 0,
++ 0xFF,
++ 0xFF,
++ 0,
++ 0,
++ 0xFF,
++ 0xFF,
++ 0,
++ 0,
++ 0xFF,
++ 0xFF,
++ 0,
++ 0,
++ 0xFF,
++ 0xFF);
++
++ for (number = 0; number < avx_iters; number++) {
++ a = _mm256_load_si256((__m256i*)_in_a);
++ __VOLK_PREFETCH(_in_a + 16);
++ b = _mm256_load_si256((__m256i*)_in_b);
++ __VOLK_PREFETCH(_in_b + 16);
++ c = _mm256_mullo_epi16(a, b);
++
++ c_sr = _mm256_srli_si256(c, 2); // Shift a right by imm8 bytes while shifting
++ // in zeros, and store the results in dst.
++ real = _mm256_subs_epi16(c, c_sr);
++
++ b_sl = _mm256_slli_si256(b, 2);
++ a_sl = _mm256_slli_si256(a, 2);
++
++ imag1 = _mm256_mullo_epi16(a, b_sl);
++ imag2 = _mm256_mullo_epi16(b, a_sl);
++
++ imag = _mm256_adds_epi16(imag1, imag2); // with saturation arithmetic!
++
++ realcacc = _mm256_adds_epi16(realcacc, real);
++ imagcacc = _mm256_adds_epi16(imagcacc, imag);
++
++ _in_a += 8;
++ _in_b += 8;
++ }
+
+- realcacc = _mm256_and_si256(realcacc, mask_real);
+- imagcacc = _mm256_and_si256(imagcacc, mask_imag);
++ realcacc = _mm256_and_si256(realcacc, mask_real);
++ imagcacc = _mm256_and_si256(imagcacc, mask_imag);
+
+- result = _mm256_or_si256(realcacc, imagcacc);
++ result = _mm256_or_si256(realcacc, imagcacc);
+
+- _mm256_store_si256((__m256i*)dotProductVector, result); // Store the results back into the dot product vector
+- _mm256_zeroupper();
++ _mm256_store_si256((__m256i*)dotProductVector,
++ result); // Store the results back into the dot product vector
++ _mm256_zeroupper();
+
+- for (number = 0; number < 8; ++number)
+- {
+- dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[number])), sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[number])));
+- }
++ for (number = 0; number < 8; ++number) {
++ dotProduct = lv_cmake(
++ sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[number])),
++ sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[number])));
+ }
++ }
+
+- for (number = 0; number < (num_points % 8); ++number)
+- {
+- lv_16sc_t tmp = (*_in_a++) * (*_in_b++);
+- dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(tmp)), sat_adds16i(lv_cimag(dotProduct), lv_cimag(tmp)));
+- }
++ for (number = 0; number < (num_points % 8); ++number) {
++ lv_16sc_t tmp = (*_in_a++) * (*_in_b++);
++ dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(tmp)),
++ sat_adds16i(lv_cimag(dotProduct), lv_cimag(tmp)));
++ }
+
+ *_out = dotProduct;
+ }
+@@ -375,69 +530,70 @@ static inline void volk_16ic_x2_dot_prod_16ic_a_axv2(lv_16sc_t* out, const lv_16
+ #ifdef LV_HAVE_NEON
+ #include <arm_neon.h>
+
+-static inline void volk_16ic_x2_dot_prod_16ic_neon(lv_16sc_t* out, const lv_16sc_t* in_a, const lv_16sc_t* in_b, unsigned int num_points)
++static inline void volk_16ic_x2_dot_prod_16ic_neon(lv_16sc_t* out,
++ const lv_16sc_t* in_a,
++ const lv_16sc_t* in_b,
++ unsigned int num_points)
+ {
+ unsigned int quarter_points = num_points / 4;
+ unsigned int number;
+
+- lv_16sc_t* a_ptr = (lv_16sc_t*) in_a;
+- lv_16sc_t* b_ptr = (lv_16sc_t*) in_b;
++ lv_16sc_t* a_ptr = (lv_16sc_t*)in_a;
++ lv_16sc_t* b_ptr = (lv_16sc_t*)in_b;
+ *out = lv_cmake((int16_t)0, (int16_t)0);
+
+- if (quarter_points > 0)
+- {
+- // for 2-lane vectors, 1st lane holds the real part,
+- // 2nd lane holds the imaginary part
+- int16x4x2_t a_val, b_val, c_val, accumulator;
+- int16x4x2_t tmp_real, tmp_imag;
+- __VOLK_ATTR_ALIGNED(16) lv_16sc_t accum_result[4];
+- accumulator.val[0] = vdup_n_s16(0);
+- accumulator.val[1] = vdup_n_s16(0);
+- lv_16sc_t dotProduct = lv_cmake((int16_t)0, (int16_t)0);
+-
+- for(number = 0; number < quarter_points; ++number)
+- {
+- a_val = vld2_s16((int16_t*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
+- b_val = vld2_s16((int16_t*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
+- __VOLK_PREFETCH(a_ptr + 8);
+- __VOLK_PREFETCH(b_ptr + 8);
+-
+- // multiply the real*real and imag*imag to get real result
+- // a0r*b0r|a1r*b1r|a2r*b2r|a3r*b3r
+- tmp_real.val[0] = vmul_s16(a_val.val[0], b_val.val[0]);
+- // a0i*b0i|a1i*b1i|a2i*b2i|a3i*b3i
+- tmp_real.val[1] = vmul_s16(a_val.val[1], b_val.val[1]);
+-
+- // Multiply cross terms to get the imaginary result
+- // a0r*b0i|a1r*b1i|a2r*b2i|a3r*b3i
+- tmp_imag.val[0] = vmul_s16(a_val.val[0], b_val.val[1]);
+- // a0i*b0r|a1i*b1r|a2i*b2r|a3i*b3r
+- tmp_imag.val[1] = vmul_s16(a_val.val[1], b_val.val[0]);
+-
+- c_val.val[0] = vqsub_s16(tmp_real.val[0], tmp_real.val[1]);
+- c_val.val[1] = vqadd_s16(tmp_imag.val[0], tmp_imag.val[1]);
+-
+- accumulator.val[0] = vqadd_s16(accumulator.val[0], c_val.val[0]);
+- accumulator.val[1] = vqadd_s16(accumulator.val[1], c_val.val[1]);
+-
+- a_ptr += 4;
+- b_ptr += 4;
+- }
+-
+- vst2_s16((int16_t*)accum_result, accumulator);
+- for (number = 0; number < 4; ++number)
+- {
+- dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(accum_result[number])), sat_adds16i(lv_cimag(dotProduct), lv_cimag(accum_result[number])));
+- }
+-
+- *out = dotProduct;
++ if (quarter_points > 0) {
++ // for 2-lane vectors, 1st lane holds the real part,
++ // 2nd lane holds the imaginary part
++ int16x4x2_t a_val, b_val, c_val, accumulator;
++ int16x4x2_t tmp_real, tmp_imag;
++ __VOLK_ATTR_ALIGNED(16) lv_16sc_t accum_result[4];
++ accumulator.val[0] = vdup_n_s16(0);
++ accumulator.val[1] = vdup_n_s16(0);
++ lv_16sc_t dotProduct = lv_cmake((int16_t)0, (int16_t)0);
++
++ for (number = 0; number < quarter_points; ++number) {
++ a_val = vld2_s16((int16_t*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
++ b_val = vld2_s16((int16_t*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
++ __VOLK_PREFETCH(a_ptr + 8);
++ __VOLK_PREFETCH(b_ptr + 8);
++
++ // multiply the real*real and imag*imag to get real result
++ // a0r*b0r|a1r*b1r|a2r*b2r|a3r*b3r
++ tmp_real.val[0] = vmul_s16(a_val.val[0], b_val.val[0]);
++ // a0i*b0i|a1i*b1i|a2i*b2i|a3i*b3i
++ tmp_real.val[1] = vmul_s16(a_val.val[1], b_val.val[1]);
++
++ // Multiply cross terms to get the imaginary result
++ // a0r*b0i|a1r*b1i|a2r*b2i|a3r*b3i
++ tmp_imag.val[0] = vmul_s16(a_val.val[0], b_val.val[1]);
++ // a0i*b0r|a1i*b1r|a2i*b2r|a3i*b3r
++ tmp_imag.val[1] = vmul_s16(a_val.val[1], b_val.val[0]);
++
++ c_val.val[0] = vqsub_s16(tmp_real.val[0], tmp_real.val[1]);
++ c_val.val[1] = vqadd_s16(tmp_imag.val[0], tmp_imag.val[1]);
++
++ accumulator.val[0] = vqadd_s16(accumulator.val[0], c_val.val[0]);
++ accumulator.val[1] = vqadd_s16(accumulator.val[1], c_val.val[1]);
++
++ a_ptr += 4;
++ b_ptr += 4;
+ }
+
+- // tail case
+- for(number = quarter_points * 4; number < num_points; ++number)
+- {
+- *out += (*a_ptr++) * (*b_ptr++);
++ vst2_s16((int16_t*)accum_result, accumulator);
++ for (number = 0; number < 4; ++number) {
++ dotProduct = lv_cmake(
++ sat_adds16i(lv_creal(dotProduct), lv_creal(accum_result[number])),
++ sat_adds16i(lv_cimag(dotProduct), lv_cimag(accum_result[number])));
+ }
++
++ *out = dotProduct;
++ }
++
++ // tail case
++ for (number = quarter_points * 4; number < num_points; ++number) {
++ *out += (*a_ptr++) * (*b_ptr++);
++ }
+ }
+
+ #endif /* LV_HAVE_NEON */
+@@ -446,13 +602,16 @@ static inline void volk_16ic_x2_dot_prod_16ic_neon(lv_16sc_t* out, const lv_16sc
+ #ifdef LV_HAVE_NEON
+ #include <arm_neon.h>
+
+-static inline void volk_16ic_x2_dot_prod_16ic_neon_vma(lv_16sc_t* out, const lv_16sc_t* in_a, const lv_16sc_t* in_b, unsigned int num_points)
++static inline void volk_16ic_x2_dot_prod_16ic_neon_vma(lv_16sc_t* out,
++ const lv_16sc_t* in_a,
++ const lv_16sc_t* in_b,
++ unsigned int num_points)
+ {
+ unsigned int quarter_points = num_points / 4;
+ unsigned int number;
+
+- lv_16sc_t* a_ptr = (lv_16sc_t*) in_a;
+- lv_16sc_t* b_ptr = (lv_16sc_t*) in_b;
++ lv_16sc_t* a_ptr = (lv_16sc_t*)in_a;
++ lv_16sc_t* b_ptr = (lv_16sc_t*)in_b;
+ // for 2-lane vectors, 1st lane holds the real part,
+ // 2nd lane holds the imaginary part
+ int16x4x2_t a_val, b_val, accumulator;
+@@ -461,35 +620,33 @@ static inline void volk_16ic_x2_dot_prod_16ic_neon_vma(lv_16sc_t* out, const lv_
+ accumulator.val[0] = vdup_n_s16(0);
+ accumulator.val[1] = vdup_n_s16(0);
+
+- for(number = 0; number < quarter_points; ++number)
+- {
+- a_val = vld2_s16((int16_t*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
+- b_val = vld2_s16((int16_t*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
+- __VOLK_PREFETCH(a_ptr + 8);
+- __VOLK_PREFETCH(b_ptr + 8);
++ for (number = 0; number < quarter_points; ++number) {
++ a_val = vld2_s16((int16_t*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
++ b_val = vld2_s16((int16_t*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
++ __VOLK_PREFETCH(a_ptr + 8);
++ __VOLK_PREFETCH(b_ptr + 8);
+
+- tmp.val[0] = vmul_s16(a_val.val[0], b_val.val[0]);
+- tmp.val[1] = vmul_s16(a_val.val[1], b_val.val[0]);
++ tmp.val[0] = vmul_s16(a_val.val[0], b_val.val[0]);
++ tmp.val[1] = vmul_s16(a_val.val[1], b_val.val[0]);
+
+- // use multiply accumulate/subtract to get result
+- tmp.val[0] = vmls_s16(tmp.val[0], a_val.val[1], b_val.val[1]);
+- tmp.val[1] = vmla_s16(tmp.val[1], a_val.val[0], b_val.val[1]);
++ // use multiply accumulate/subtract to get result
++ tmp.val[0] = vmls_s16(tmp.val[0], a_val.val[1], b_val.val[1]);
++ tmp.val[1] = vmla_s16(tmp.val[1], a_val.val[0], b_val.val[1]);
+
+- accumulator.val[0] = vqadd_s16(accumulator.val[0], tmp.val[0]);
+- accumulator.val[1] = vqadd_s16(accumulator.val[1], tmp.val[1]);
++ accumulator.val[0] = vqadd_s16(accumulator.val[0], tmp.val[0]);
++ accumulator.val[1] = vqadd_s16(accumulator.val[1], tmp.val[1]);
+
+- a_ptr += 4;
+- b_ptr += 4;
+- }
++ a_ptr += 4;
++ b_ptr += 4;
++ }
+
+ vst2_s16((int16_t*)accum_result, accumulator);
+ *out = accum_result[0] + accum_result[1] + accum_result[2] + accum_result[3];
+
+ // tail case
+- for(number = quarter_points * 4; number < num_points; ++number)
+- {
+- *out += (*a_ptr++) * (*b_ptr++);
+- }
++ for (number = quarter_points * 4; number < num_points; ++number) {
++ *out += (*a_ptr++) * (*b_ptr++);
++ }
+ }
+
+ #endif /* LV_HAVE_NEON */
+@@ -498,13 +655,16 @@ static inline void volk_16ic_x2_dot_prod_16ic_neon_vma(lv_16sc_t* out, const lv_
+ #ifdef LV_HAVE_NEON
+ #include <arm_neon.h>
+
+-static inline void volk_16ic_x2_dot_prod_16ic_neon_optvma(lv_16sc_t* out, const lv_16sc_t* in_a, const lv_16sc_t* in_b, unsigned int num_points)
++static inline void volk_16ic_x2_dot_prod_16ic_neon_optvma(lv_16sc_t* out,
++ const lv_16sc_t* in_a,
++ const lv_16sc_t* in_b,
++ unsigned int num_points)
+ {
+ unsigned int quarter_points = num_points / 4;
+ unsigned int number;
+
+- lv_16sc_t* a_ptr = (lv_16sc_t*) in_a;
+- lv_16sc_t* b_ptr = (lv_16sc_t*) in_b;
++ lv_16sc_t* a_ptr = (lv_16sc_t*)in_a;
++ lv_16sc_t* b_ptr = (lv_16sc_t*)in_b;
+ // for 2-lane vectors, 1st lane holds the real part,
+ // 2nd lane holds the imaginary part
+ int16x4x2_t a_val, b_val, accumulator1, accumulator2;
+@@ -515,22 +675,21 @@ static inline void volk_16ic_x2_dot_prod_16ic_neon_optvma(lv_16sc_t* out, const
+ accumulator2.val[0] = vdup_n_s16(0);
+ accumulator2.val[1] = vdup_n_s16(0);
+
+- for(number = 0; number < quarter_points; ++number)
+- {
+- a_val = vld2_s16((int16_t*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
+- b_val = vld2_s16((int16_t*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
+- __VOLK_PREFETCH(a_ptr + 8);
+- __VOLK_PREFETCH(b_ptr + 8);
++ for (number = 0; number < quarter_points; ++number) {
++ a_val = vld2_s16((int16_t*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
++ b_val = vld2_s16((int16_t*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
++ __VOLK_PREFETCH(a_ptr + 8);
++ __VOLK_PREFETCH(b_ptr + 8);
+
+- // use 2 accumulators to remove inter-instruction data dependencies
+- accumulator1.val[0] = vmla_s16(accumulator1.val[0], a_val.val[0], b_val.val[0]);
+- accumulator2.val[0] = vmls_s16(accumulator2.val[0], a_val.val[1], b_val.val[1]);
+- accumulator1.val[1] = vmla_s16(accumulator1.val[1], a_val.val[0], b_val.val[1]);
+- accumulator2.val[1] = vmla_s16(accumulator2.val[1], a_val.val[1], b_val.val[0]);
++ // use 2 accumulators to remove inter-instruction data dependencies
++ accumulator1.val[0] = vmla_s16(accumulator1.val[0], a_val.val[0], b_val.val[0]);
++ accumulator2.val[0] = vmls_s16(accumulator2.val[0], a_val.val[1], b_val.val[1]);
++ accumulator1.val[1] = vmla_s16(accumulator1.val[1], a_val.val[0], b_val.val[1]);
++ accumulator2.val[1] = vmla_s16(accumulator2.val[1], a_val.val[1], b_val.val[0]);
+
+- a_ptr += 4;
+- b_ptr += 4;
+- }
++ a_ptr += 4;
++ b_ptr += 4;
++ }
+
+ accumulator1.val[0] = vqadd_s16(accumulator1.val[0], accumulator2.val[0]);
+ accumulator1.val[1] = vqadd_s16(accumulator1.val[1], accumulator2.val[1]);
+@@ -539,10 +698,9 @@ static inline void volk_16ic_x2_dot_prod_16ic_neon_optvma(lv_16sc_t* out, const
+ *out = accum_result[0] + accum_result[1] + accum_result[2] + accum_result[3];
+
+ // tail case
+- for(number = quarter_points * 4; number < num_points; ++number)
+- {
+- *out += (*a_ptr++) * (*b_ptr++);
+- }
++ for (number = quarter_points * 4; number < num_points; ++number) {
++ *out += (*a_ptr++) * (*b_ptr++);
++ }
+ }
+
+ #endif /* LV_HAVE_NEON */
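+As with the other kernels, a minimal sketch of the dot-product dispatcher
+documented at the top of this file. It reuses the hypothetical in buffer and
+num_points from the sketches above as both operands, which keeps the example
+self-contained while exercising the saturating accumulation described in the
+overview.
+
+    /* sketch only: accumulates in_a[n] * in_b[n] with 16-bit saturation */
+    lv_16sc_t acc;
+    volk_16ic_x2_dot_prod_16ic(&acc, in, in, num_points);
+    printf("dot = %d + %dj\n", (int)lv_creal(acc), (int)lv_cimag(acc));
+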
+diff --git a/kernels/volk/volk_16ic_x2_multiply_16ic.h b/kernels/volk/volk_16ic_x2_multiply_16ic.h
+index 20d6a7f..2bf835d 100644
+--- a/kernels/volk/volk_16ic_x2_multiply_16ic.h
++++ b/kernels/volk/volk_16ic_x2_multiply_16ic.h
+@@ -25,18 +25,19 @@
+ *
+ * \b Overview
+ *
+- * Multiplies two input complex vectors, point-by-point, storing the result in the third vector.
+- * WARNING: Saturation is not checked.
++ * Multiplies two input complex vectors, point-by-point, storing the result in the third
++ * vector. WARNING: Saturation is not checked.
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+- * void volk_16ic_x2_multiply_16ic(lv_16sc_t* result, const lv_16sc_t* in_a, const lv_16sc_t* in_b, unsigned int num_points);
+- * \endcode
++ * void volk_16ic_x2_multiply_16ic(lv_16sc_t* result, const lv_16sc_t* in_a, const
++ * lv_16sc_t* in_b, unsigned int num_points); \endcode
+ *
+ * \b Inputs
+ * \li in_a: One of the vectors to be multiplied.
+ * \li in_b: The other vector to be multiplied.
+- * \li num_points: The number of complex data points to be multiplied from both input vectors.
++ * \li num_points: The number of complex data points to be multiplied from both input
++ * vectors.
+ *
+ * \b Outputs
+ * \li result: The vector where the results will be stored.
+@@ -51,13 +52,15 @@
+
+ #ifdef LV_HAVE_GENERIC
+
+-static inline void volk_16ic_x2_multiply_16ic_generic(lv_16sc_t* result, const lv_16sc_t* in_a, const lv_16sc_t* in_b, unsigned int num_points)
++static inline void volk_16ic_x2_multiply_16ic_generic(lv_16sc_t* result,
++ const lv_16sc_t* in_a,
++ const lv_16sc_t* in_b,
++ unsigned int num_points)
+ {
+ unsigned int n;
+- for (n = 0; n < num_points; n++)
+- {
+- result[n] = in_a[n] * in_b[n];
+- }
++ for (n = 0; n < num_points; n++) {
++ result[n] = in_a[n] * in_b[n];
++ }
+ }
+
+ #endif /*LV_HAVE_GENERIC*/
+@@ -66,51 +69,58 @@ static inline void volk_16ic_x2_multiply_16ic_generic(lv_16sc_t* result, const l
+ #ifdef LV_HAVE_SSE2
+ #include <emmintrin.h>
+
+-static inline void volk_16ic_x2_multiply_16ic_a_sse2(lv_16sc_t* out, const lv_16sc_t* in_a, const lv_16sc_t* in_b, unsigned int num_points)
++static inline void volk_16ic_x2_multiply_16ic_a_sse2(lv_16sc_t* out,
++ const lv_16sc_t* in_a,
++ const lv_16sc_t* in_b,
++ unsigned int num_points)
+ {
+ const unsigned int sse_iters = num_points / 4;
+- __m128i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl, a_sl, result;
++ __m128i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl, a_sl,
++ result;
+
+- mask_imag = _mm_set_epi8(0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0);
+- mask_real = _mm_set_epi8(0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF);
++ mask_imag = _mm_set_epi8(
++ 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0);
++ mask_real = _mm_set_epi8(
++ 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF);
+
+ const lv_16sc_t* _in_a = in_a;
+ const lv_16sc_t* _in_b = in_b;
+ lv_16sc_t* _out = out;
+ unsigned int number;
+
+- for(number = 0; number < sse_iters; number++)
+- {
+- a = _mm_load_si128((__m128i*)_in_a); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg
+- b = _mm_load_si128((__m128i*)_in_b);
+- c = _mm_mullo_epi16 (a, b); // a3.i*b3.i, a3.r*b3.r, ....
++ for (number = 0; number < sse_iters; number++) {
++ a = _mm_load_si128(
++ (__m128i*)_in_a); // load (2 byte imag, 2 byte real) x 4 into 128 bits reg
++ b = _mm_load_si128((__m128i*)_in_b);
++ c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, ....
+
+- c_sr = _mm_srli_si128 (c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst.
+- real = _mm_subs_epi16 (c, c_sr);
+- real = _mm_and_si128 (real, mask_real); // a3.r*b3.r-a3.i*b3.i , 0, a3.r*b3.r- a3.i*b3.i
++ c_sr = _mm_srli_si128(c, 2); // Shift a right by imm8 bytes while shifting in
++ // zeros, and store the results in dst.
++ real = _mm_subs_epi16(c, c_sr);
++ real = _mm_and_si128(real,
++ mask_real); // a3.r*b3.r-a3.i*b3.i , 0, a3.r*b3.r- a3.i*b3.i
+
+- b_sl = _mm_slli_si128(b, 2); // b3.r, b2.i ....
+- a_sl = _mm_slli_si128(a, 2); // a3.r, a2.i ....
++ b_sl = _mm_slli_si128(b, 2); // b3.r, b2.i ....
++ a_sl = _mm_slli_si128(a, 2); // a3.r, a2.i ....
+
+- imag1 = _mm_mullo_epi16(a, b_sl); // a3.i*b3.r, ....
+- imag2 = _mm_mullo_epi16(b, a_sl); // b3.i*a3.r, ....
++ imag1 = _mm_mullo_epi16(a, b_sl); // a3.i*b3.r, ....
++ imag2 = _mm_mullo_epi16(b, a_sl); // b3.i*a3.r, ....
+
+- imag = _mm_adds_epi16(imag1, imag2);
+- imag = _mm_and_si128 (imag, mask_imag); // a3.i*b3.r+b3.i*a3.r, 0, ...
++ imag = _mm_adds_epi16(imag1, imag2);
++ imag = _mm_and_si128(imag, mask_imag); // a3.i*b3.r+b3.i*a3.r, 0, ...
+
+- result = _mm_or_si128 (real, imag);
++ result = _mm_or_si128(real, imag);
+
+- _mm_store_si128((__m128i*)_out, result);
++ _mm_store_si128((__m128i*)_out, result);
+
+- _in_a += 4;
+- _in_b += 4;
+- _out += 4;
+- }
++ _in_a += 4;
++ _in_b += 4;
++ _out += 4;
++ }
+
+- for (number = sse_iters * 4; number < num_points; ++number)
+- {
+- *_out++ = (*_in_a++) * (*_in_b++);
+- }
++ for (number = sse_iters * 4; number < num_points; ++number) {
++ *_out++ = (*_in_a++) * (*_in_b++);
++ }
+ }
+ #endif /* LV_HAVE_SSE2 */
+
+@@ -118,51 +128,58 @@ static inline void volk_16ic_x2_multiply_16ic_a_sse2(lv_16sc_t* out, const lv_16
+ #ifdef LV_HAVE_SSE2
+ #include <emmintrin.h>
+
+-static inline void volk_16ic_x2_multiply_16ic_u_sse2(lv_16sc_t* out, const lv_16sc_t* in_a, const lv_16sc_t* in_b, unsigned int num_points)
++static inline void volk_16ic_x2_multiply_16ic_u_sse2(lv_16sc_t* out,
++ const lv_16sc_t* in_a,
++ const lv_16sc_t* in_b,
++ unsigned int num_points)
+ {
+ const unsigned int sse_iters = num_points / 4;
+- __m128i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1,imag2, b_sl, a_sl, result;
++ __m128i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl, a_sl,
++ result;
+
+- mask_imag = _mm_set_epi8(0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0);
+- mask_real = _mm_set_epi8(0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF);
++ mask_imag = _mm_set_epi8(
++ 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0);
++ mask_real = _mm_set_epi8(
++ 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF);
+
+ const lv_16sc_t* _in_a = in_a;
+ const lv_16sc_t* _in_b = in_b;
+ lv_16sc_t* _out = out;
+ unsigned int number;
+
+- for(number = 0; number < sse_iters; number++)
+- {
+- a = _mm_loadu_si128((__m128i*)_in_a); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg
+- b = _mm_loadu_si128((__m128i*)_in_b);
+- c = _mm_mullo_epi16 (a, b); // a3.i*b3.i, a3.r*b3.r, ....
++ for (number = 0; number < sse_iters; number++) {
++ a = _mm_loadu_si128(
++ (__m128i*)_in_a); // load (2 byte imag, 2 byte real) x 4 into 128 bits reg
++ b = _mm_loadu_si128((__m128i*)_in_b);
++ c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, ....
+
+- c_sr = _mm_srli_si128 (c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst.
+- real = _mm_subs_epi16 (c, c_sr);
+- real = _mm_and_si128 (real, mask_real); // a3.r*b3.r-a3.i*b3.i , 0, a3.r*b3.r- a3.i*b3.i
++ c_sr = _mm_srli_si128(c, 2); // Shift a right by imm8 bytes while shifting in
++ // zeros, and store the results in dst.
++ real = _mm_subs_epi16(c, c_sr);
++ real = _mm_and_si128(real,
++ mask_real); // a3.r*b3.r-a3.i*b3.i , 0, a3.r*b3.r- a3.i*b3.i
+
+- b_sl = _mm_slli_si128(b, 2); // b3.r, b2.i ....
+- a_sl = _mm_slli_si128(a, 2); // a3.r, a2.i ....
++ b_sl = _mm_slli_si128(b, 2); // b3.r, b2.i ....
++ a_sl = _mm_slli_si128(a, 2); // a3.r, a2.i ....
+
+- imag1 = _mm_mullo_epi16(a, b_sl); // a3.i*b3.r, ....
+- imag2 = _mm_mullo_epi16(b, a_sl); // b3.i*a3.r, ....
++ imag1 = _mm_mullo_epi16(a, b_sl); // a3.i*b3.r, ....
++ imag2 = _mm_mullo_epi16(b, a_sl); // b3.i*a3.r, ....
+
+- imag = _mm_adds_epi16(imag1, imag2);
+- imag = _mm_and_si128 (imag, mask_imag); // a3.i*b3.r+b3.i*a3.r, 0, ...
++ imag = _mm_adds_epi16(imag1, imag2);
++ imag = _mm_and_si128(imag, mask_imag); // a3.i*b3.r+b3.i*a3.r, 0, ...
+
+- result = _mm_or_si128 (real, imag);
++ result = _mm_or_si128(real, imag);
+
+- _mm_storeu_si128((__m128i*)_out, result);
++ _mm_storeu_si128((__m128i*)_out, result);
+
+- _in_a += 4;
+- _in_b += 4;
+- _out += 4;
+- }
++ _in_a += 4;
++ _in_b += 4;
++ _out += 4;
++ }
+
+- for (number = sse_iters * 4; number < num_points; ++number)
+- {
+- *_out++ = (*_in_a++) * (*_in_b++);
+- }
++ for (number = sse_iters * 4; number < num_points; ++number) {
++ *_out++ = (*_in_a++) * (*_in_b++);
++ }
+ }
+ #endif /* LV_HAVE_SSE2 */
+
+@@ -170,7 +187,10 @@ static inline void volk_16ic_x2_multiply_16ic_u_sse2(lv_16sc_t* out, const lv_16
+ #ifdef LV_HAVE_AVX2
+ #include <immintrin.h>
+
+-static inline void volk_16ic_x2_multiply_16ic_u_avx2(lv_16sc_t* out, const lv_16sc_t* in_a, const lv_16sc_t* in_b, unsigned int num_points)
++static inline void volk_16ic_x2_multiply_16ic_u_avx2(lv_16sc_t* out,
++ const lv_16sc_t* in_a,
++ const lv_16sc_t* in_b,
++ unsigned int num_points)
+ {
+ unsigned int number = 0;
+ const unsigned int avx2_points = num_points / 8;
+@@ -179,44 +199,108 @@ static inline void volk_16ic_x2_multiply_16ic_u_avx2(lv_16sc_t* out, const lv_16
+ const lv_16sc_t* _in_b = in_b;
+ lv_16sc_t* _out = out;
+
+- __m256i a, b, c, c_sr, real, imag, imag1, imag2, b_sl, a_sl, result;
+-
+- const __m256i mask_imag = _mm256_set_epi8(0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0);
+- const __m256i mask_real = _mm256_set_epi8(0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF);
+-
+- for(;number < avx2_points; number++)
+- {
+- a = _mm256_loadu_si256((__m256i*)_in_a); // Load the ar + ai, br + bi as ar,ai,br,bi
+- b = _mm256_loadu_si256((__m256i*)_in_b); // Load the cr + ci, dr + di as cr,ci,dr,di
+- c = _mm256_mullo_epi16(a, b);
+-
+- c_sr = _mm256_srli_si256(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst.
+- real = _mm256_subs_epi16(c, c_sr);
+- real = _mm256_and_si256(real, mask_real); // a3.r*b3.r-a3.i*b3.i , 0, a3.r*b3.r- a3.i*b3.i
+-
+- b_sl = _mm256_slli_si256(b, 2); // b3.r, b2.i ....
+- a_sl = _mm256_slli_si256(a, 2); // a3.r, a2.i ....
+-
+- imag1 = _mm256_mullo_epi16(a, b_sl); // a3.i*b3.r, ....
+- imag2 = _mm256_mullo_epi16(b, a_sl); // b3.i*a3.r, ....
+-
+- imag = _mm256_adds_epi16(imag1, imag2);
+- imag = _mm256_and_si256(imag, mask_imag); // a3.i*b3.r+b3.i*a3.r, 0, ...
+-
+- result = _mm256_or_si256(real, imag);
+-
+- _mm256_storeu_si256((__m256i*)_out, result);
+-
+- _in_a += 8;
+- _in_b += 8;
+- _out += 8;
+- }
++ __m256i a, b, c, c_sr, real, imag, imag1, imag2, b_sl, a_sl, result;
++
++ const __m256i mask_imag = _mm256_set_epi8(0xFF,
++ 0xFF,
++ 0,
++ 0,
++ 0xFF,
++ 0xFF,
++ 0,
++ 0,
++ 0xFF,
++ 0xFF,
++ 0,
++ 0,
++ 0xFF,
++ 0xFF,
++ 0,
++ 0,
++ 0xFF,
++ 0xFF,
++ 0,
++ 0,
++ 0xFF,
++ 0xFF,
++ 0,
++ 0,
++ 0xFF,
++ 0xFF,
++ 0,
++ 0,
++ 0xFF,
++ 0xFF,
++ 0,
++ 0);
++ const __m256i mask_real = _mm256_set_epi8(0,
++ 0,
++ 0xFF,
++ 0xFF,
++ 0,
++ 0,
++ 0xFF,
++ 0xFF,
++ 0,
++ 0,
++ 0xFF,
++ 0xFF,
++ 0,
++ 0,
++ 0xFF,
++ 0xFF,
++ 0,
++ 0,
++ 0xFF,
++ 0xFF,
++ 0,
++ 0,
++ 0xFF,
++ 0xFF,
++ 0,
++ 0,
++ 0xFF,
++ 0xFF,
++ 0,
++ 0,
++ 0xFF,
++ 0xFF);
++
++ for (; number < avx2_points; number++) {
++ a = _mm256_loadu_si256(
++ (__m256i*)_in_a); // Load the ar + ai, br + bi as ar,ai,br,bi
++ b = _mm256_loadu_si256(
++ (__m256i*)_in_b); // Load the cr + ci, dr + di as cr,ci,dr,di
++ c = _mm256_mullo_epi16(a, b);
++
++ c_sr = _mm256_srli_si256(c, 2); // Shift a right by imm8 bytes while shifting in
++ // zeros, and store the results in dst.
++ real = _mm256_subs_epi16(c, c_sr);
++ real = _mm256_and_si256(
++ real, mask_real); // a3.r*b3.r-a3.i*b3.i , 0, a3.r*b3.r- a3.i*b3.i
++
++ b_sl = _mm256_slli_si256(b, 2); // b3.r, b2.i ....
++ a_sl = _mm256_slli_si256(a, 2); // a3.r, a2.i ....
++
++ imag1 = _mm256_mullo_epi16(a, b_sl); // a3.i*b3.r, ....
++ imag2 = _mm256_mullo_epi16(b, a_sl); // b3.i*a3.r, ....
++
++ imag = _mm256_adds_epi16(imag1, imag2);
++ imag = _mm256_and_si256(imag, mask_imag); // a3.i*b3.r+b3.i*a3.r, 0, ...
++
++ result = _mm256_or_si256(real, imag);
++
++ _mm256_storeu_si256((__m256i*)_out, result);
++
++ _in_a += 8;
++ _in_b += 8;
++ _out += 8;
++ }
+ _mm256_zeroupper();
+ number = avx2_points * 8;
+- for(;number < num_points; number++)
+- {
+- *_out++ = (*_in_a++) * (*_in_b++);
+- }
++ for (; number < num_points; number++) {
++ *_out++ = (*_in_a++) * (*_in_b++);
++ }
+ }
+ #endif /* LV_HAVE_AVX2 */
+
+@@ -224,7 +308,10 @@ static inline void volk_16ic_x2_multiply_16ic_u_avx2(lv_16sc_t* out, const lv_16
+ #ifdef LV_HAVE_AVX2
+ #include <immintrin.h>
+
+-static inline void volk_16ic_x2_multiply_16ic_a_avx2(lv_16sc_t* out, const lv_16sc_t* in_a, const lv_16sc_t* in_b, unsigned int num_points)
++static inline void volk_16ic_x2_multiply_16ic_a_avx2(lv_16sc_t* out,
++ const lv_16sc_t* in_a,
++ const lv_16sc_t* in_b,
++ unsigned int num_points)
+ {
+ unsigned int number = 0;
+ const unsigned int avx2_points = num_points / 8;
+@@ -233,44 +320,108 @@ static inline void volk_16ic_x2_multiply_16ic_a_avx2(lv_16sc_t* out, const lv_16
+ const lv_16sc_t* _in_b = in_b;
+ lv_16sc_t* _out = out;
+
+- __m256i a, b, c, c_sr, real, imag, imag1, imag2, b_sl, a_sl, result;
+-
+- const __m256i mask_imag = _mm256_set_epi8(0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0);
+- const __m256i mask_real = _mm256_set_epi8(0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF);
+-
+- for(;number < avx2_points; number++)
+- {
+- a = _mm256_load_si256((__m256i*)_in_a); // Load the ar + ai, br + bi as ar,ai,br,bi
+- b = _mm256_load_si256((__m256i*)_in_b); // Load the cr + ci, dr + di as cr,ci,dr,di
+- c = _mm256_mullo_epi16(a, b);
+-
+- c_sr = _mm256_srli_si256(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst.
+- real = _mm256_subs_epi16(c, c_sr);
+- real = _mm256_and_si256(real, mask_real); // a3.r*b3.r-a3.i*b3.i , 0, a3.r*b3.r- a3.i*b3.i
+-
+- b_sl = _mm256_slli_si256(b, 2); // b3.r, b2.i ....
+- a_sl = _mm256_slli_si256(a, 2); // a3.r, a2.i ....
+-
+- imag1 = _mm256_mullo_epi16(a, b_sl); // a3.i*b3.r, ....
+- imag2 = _mm256_mullo_epi16(b, a_sl); // b3.i*a3.r, ....
+-
+- imag = _mm256_adds_epi16(imag1, imag2);
+- imag = _mm256_and_si256(imag, mask_imag); // a3.i*b3.r+b3.i*a3.r, 0, ...
+-
+- result = _mm256_or_si256(real, imag);
+-
+- _mm256_store_si256((__m256i*)_out, result);
+-
+- _in_a += 8;
+- _in_b += 8;
+- _out += 8;
+- }
++ __m256i a, b, c, c_sr, real, imag, imag1, imag2, b_sl, a_sl, result;
++
++ const __m256i mask_imag = _mm256_set_epi8(0xFF,
++ 0xFF,
++ 0,
++ 0,
++ 0xFF,
++ 0xFF,
++ 0,
++ 0,
++ 0xFF,
++ 0xFF,
++ 0,
++ 0,
++ 0xFF,
++ 0xFF,
++ 0,
++ 0,
++ 0xFF,
++ 0xFF,
++ 0,
++ 0,
++ 0xFF,
++ 0xFF,
++ 0,
++ 0,
++ 0xFF,
++ 0xFF,
++ 0,
++ 0,
++ 0xFF,
++ 0xFF,
++ 0,
++ 0);
++ const __m256i mask_real = _mm256_set_epi8(0,
++ 0,
++ 0xFF,
++ 0xFF,
++ 0,
++ 0,
++ 0xFF,
++ 0xFF,
++ 0,
++ 0,
++ 0xFF,
++ 0xFF,
++ 0,
++ 0,
++ 0xFF,
++ 0xFF,
++ 0,
++ 0,
++ 0xFF,
++ 0xFF,
++ 0,
++ 0,
++ 0xFF,
++ 0xFF,
++ 0,
++ 0,
++ 0xFF,
++ 0xFF,
++ 0,
++ 0,
++ 0xFF,
++ 0xFF);
++
++ for (; number < avx2_points; number++) {
++ a = _mm256_load_si256(
++ (__m256i*)_in_a); // Load the ar + ai, br + bi as ar,ai,br,bi
++ b = _mm256_load_si256(
++ (__m256i*)_in_b); // Load the cr + ci, dr + di as cr,ci,dr,di
++ c = _mm256_mullo_epi16(a, b);
++
++ c_sr = _mm256_srli_si256(c, 2); // Shift a right by imm8 bytes while shifting in
++ // zeros, and store the results in dst.
++ real = _mm256_subs_epi16(c, c_sr);
++ real = _mm256_and_si256(
++ real, mask_real); // a3.r*b3.r-a3.i*b3.i , 0, a3.r*b3.r- a3.i*b3.i
++
++ b_sl = _mm256_slli_si256(b, 2); // b3.r, b2.i ....
++ a_sl = _mm256_slli_si256(a, 2); // a3.r, a2.i ....
++
++ imag1 = _mm256_mullo_epi16(a, b_sl); // a3.i*b3.r, ....
++ imag2 = _mm256_mullo_epi16(b, a_sl); // b3.i*a3.r, ....
++
++ imag = _mm256_adds_epi16(imag1, imag2);
++ imag = _mm256_and_si256(imag, mask_imag); // a3.i*b3.r+b3.i*a3.r, 0, ...
++
++ result = _mm256_or_si256(real, imag);
++
++ _mm256_store_si256((__m256i*)_out, result);
++
++ _in_a += 8;
++ _in_b += 8;
++ _out += 8;
++ }
+ _mm256_zeroupper();
+ number = avx2_points * 8;
+- for(;number < num_points; number++)
+- {
+- *_out++ = (*_in_a++) * (*_in_b++);
+- }
++ for (; number < num_points; number++) {
++ *_out++ = (*_in_a++) * (*_in_b++);
++ }
+ }
+ #endif /* LV_HAVE_AVX2 */
+
+@@ -278,48 +429,49 @@ static inline void volk_16ic_x2_multiply_16ic_a_avx2(lv_16sc_t* out, const lv_16
+ #ifdef LV_HAVE_NEON
+ #include <arm_neon.h>
+
+-static inline void volk_16ic_x2_multiply_16ic_neon(lv_16sc_t* out, const lv_16sc_t* in_a, const lv_16sc_t* in_b, unsigned int num_points)
++static inline void volk_16ic_x2_multiply_16ic_neon(lv_16sc_t* out,
++ const lv_16sc_t* in_a,
++ const lv_16sc_t* in_b,
++ unsigned int num_points)
+ {
+- lv_16sc_t *a_ptr = (lv_16sc_t*) in_a;
+- lv_16sc_t *b_ptr = (lv_16sc_t*) in_b;
++ lv_16sc_t* a_ptr = (lv_16sc_t*)in_a;
++ lv_16sc_t* b_ptr = (lv_16sc_t*)in_b;
+ unsigned int quarter_points = num_points / 4;
+ int16x4x2_t a_val, b_val, c_val;
+ int16x4x2_t tmp_real, tmp_imag;
+ unsigned int number = 0;
+
+- for(number = 0; number < quarter_points; ++number)
+- {
+- a_val = vld2_s16((int16_t*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
+- b_val = vld2_s16((int16_t*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
+- __VOLK_PREFETCH(a_ptr + 4);
+- __VOLK_PREFETCH(b_ptr + 4);
+-
+- // multiply the real*real and imag*imag to get real result
+- // a0r*b0r|a1r*b1r|a2r*b2r|a3r*b3r
+- tmp_real.val[0] = vmul_s16(a_val.val[0], b_val.val[0]);
+- // a0i*b0i|a1i*b1i|a2i*b2i|a3i*b3i
+- tmp_real.val[1] = vmul_s16(a_val.val[1], b_val.val[1]);
+-
+- // Multiply cross terms to get the imaginary result
+- // a0r*b0i|a1r*b1i|a2r*b2i|a3r*b3i
+- tmp_imag.val[0] = vmul_s16(a_val.val[0], b_val.val[1]);
+- // a0i*b0r|a1i*b1r|a2i*b2r|a3i*b3r
+- tmp_imag.val[1] = vmul_s16(a_val.val[1], b_val.val[0]);
+-
+- // store the results
+- c_val.val[0] = vsub_s16(tmp_real.val[0], tmp_real.val[1]);
+- c_val.val[1] = vadd_s16(tmp_imag.val[0], tmp_imag.val[1]);
+- vst2_s16((int16_t*)out, c_val);
+-
+- a_ptr += 4;
+- b_ptr += 4;
+- out += 4;
+- }
+-
+- for(number = quarter_points * 4; number < num_points; number++)
+- {
+- *out++ = (*a_ptr++) * (*b_ptr++);
+- }
++ for (number = 0; number < quarter_points; ++number) {
++ a_val = vld2_s16((int16_t*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
++ b_val = vld2_s16((int16_t*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
++ __VOLK_PREFETCH(a_ptr + 4);
++ __VOLK_PREFETCH(b_ptr + 4);
++
++ // multiply the real*real and imag*imag to get real result
++ // a0r*b0r|a1r*b1r|a2r*b2r|a3r*b3r
++ tmp_real.val[0] = vmul_s16(a_val.val[0], b_val.val[0]);
++ // a0i*b0i|a1i*b1i|a2i*b2i|a3i*b3i
++ tmp_real.val[1] = vmul_s16(a_val.val[1], b_val.val[1]);
++
++ // Multiply cross terms to get the imaginary result
++ // a0r*b0i|a1r*b1i|a2r*b2i|a3r*b3i
++ tmp_imag.val[0] = vmul_s16(a_val.val[0], b_val.val[1]);
++ // a0i*b0r|a1i*b1r|a2i*b2r|a3i*b3r
++ tmp_imag.val[1] = vmul_s16(a_val.val[1], b_val.val[0]);
++
++ // store the results
++ c_val.val[0] = vsub_s16(tmp_real.val[0], tmp_real.val[1]);
++ c_val.val[1] = vadd_s16(tmp_imag.val[0], tmp_imag.val[1]);
++ vst2_s16((int16_t*)out, c_val);
++
++ a_ptr += 4;
++ b_ptr += 4;
++ out += 4;
++ }
++
++ for (number = quarter_points * 4; number < num_points; number++) {
++ *out++ = (*a_ptr++) * (*b_ptr++);
++ }
+ }
+ #endif /* LV_HAVE_NEON */
+
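The SSE2/AVX2 branches of volk_16ic_x2_multiply_16ic work on interleaved lanes [re0, im0, re1, im1, ...]: _mm_mullo_epi16 produces all elementwise products, shifting that register down one 16-bit lane and subtracting leaves re*re - im*im in the even lanes, while the two shifted-up multiplies added together leave re_a*im_b + im_a*re_b in the odd lanes; mask_real/mask_imag then keep only the valid lanes before the final OR. A scalar sketch of the value each point receives (hypothetical helper, not part of VOLK; as the header warns, the products are not saturated, although the SIMD combine steps use adds/subs):

#include <stdint.h>

/* hypothetical reference, not part of VOLK; data interleaved as
 * x[2n] = real, x[2n + 1] = imag */
static void multiply_16ic_ref(int16_t* out, const int16_t* a, const int16_t* b,
                              unsigned int num_points)
{
    unsigned int n;
    for (n = 0; n < num_points; n++) {
        out[2 * n] = (int16_t)(a[2 * n] * b[2 * n] - a[2 * n + 1] * b[2 * n + 1]);
        out[2 * n + 1] = (int16_t)(a[2 * n] * b[2 * n + 1] + a[2 * n + 1] * b[2 * n]);
    }
}
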
+diff --git a/kernels/volk/volk_16u_byteswap.h b/kernels/volk/volk_16u_byteswap.h
+index eaa972f..221dcdb 100644
+--- a/kernels/volk/volk_16u_byteswap.h
++++ b/kernels/volk/volk_16u_byteswap.h
+@@ -58,74 +58,80 @@
+
+ #if LV_HAVE_AVX2
+ #include <immintrin.h>
+-static inline void volk_16u_byteswap_a_avx2(uint16_t* intsToSwap, unsigned int num_points){
+- unsigned int number;
++static inline void volk_16u_byteswap_a_avx2(uint16_t* intsToSwap, unsigned int num_points)
++{
++ unsigned int number;
+
+- const unsigned int nPerSet = 16;
+- const uint64_t nSets = num_points / nPerSet;
++ const unsigned int nPerSet = 16;
++ const uint64_t nSets = num_points / nPerSet;
+
+- uint16_t* inputPtr = (uint16_t*) intsToSwap;
++ uint16_t* inputPtr = (uint16_t*)intsToSwap;
+
+- const uint8_t shuffleVector[32] = { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 17, 16, 19, 18, 21, 20, 23, 22, 25, 24, 27, 26, 29, 28, 31, 30};
++ const uint8_t shuffleVector[32] = { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11,
++ 10, 13, 12, 15, 14, 17, 16, 19, 18, 21, 20,
++ 23, 22, 25, 24, 27, 26, 29, 28, 31, 30 };
+
+- const __m256i myShuffle = _mm256_loadu_si256((__m256i*) &shuffleVector[0]);
++ const __m256i myShuffle = _mm256_loadu_si256((__m256i*)&shuffleVector[0]);
+
+- for(number = 0; number < nSets; number++) {
+- // Load the 32t values, increment inputPtr later since we're doing it in-place.
+- const __m256i input = _mm256_load_si256((__m256i*)inputPtr);
+- const __m256i output = _mm256_shuffle_epi8(input, myShuffle);
++ for (number = 0; number < nSets; number++) {
++ // Load the 32t values, increment inputPtr later since we're doing it in-place.
++ const __m256i input = _mm256_load_si256((__m256i*)inputPtr);
++ const __m256i output = _mm256_shuffle_epi8(input, myShuffle);
+
+- // Store the results
+- _mm256_store_si256((__m256i*)inputPtr, output);
+- inputPtr += nPerSet;
+- }
++ // Store the results
++ _mm256_store_si256((__m256i*)inputPtr, output);
++ inputPtr += nPerSet;
++ }
+
+- _mm256_zeroupper();
++ _mm256_zeroupper();
+
+- // Byteswap any remaining points:
+- for(number = nPerSet * nSets; number < num_points; number++) {
+- uint16_t outputVal = *inputPtr;
+- outputVal = (((outputVal >> 8) & 0xff) | ((outputVal << 8) & 0xff00));
+- *inputPtr = outputVal;
+- inputPtr++;
+- }
++ // Byteswap any remaining points:
++ for (number = nPerSet * nSets; number < num_points; number++) {
++ uint16_t outputVal = *inputPtr;
++ outputVal = (((outputVal >> 8) & 0xff) | ((outputVal << 8) & 0xff00));
++ *inputPtr = outputVal;
++ inputPtr++;
++ }
+ }
+ #endif /* LV_HAVE_AVX2 */
+
+
+ #if LV_HAVE_AVX2
+ #include <immintrin.h>
+-static inline void volk_16u_byteswap_u_avx2(uint16_t* intsToSwap, unsigned int num_points){
+- unsigned int number;
++static inline void volk_16u_byteswap_u_avx2(uint16_t* intsToSwap, unsigned int num_points)
++{
++ unsigned int number;
+
+- const unsigned int nPerSet = 16;
+- const uint64_t nSets = num_points / nPerSet;
++ const unsigned int nPerSet = 16;
++ const uint64_t nSets = num_points / nPerSet;
+
+- uint16_t* inputPtr = (uint16_t*) intsToSwap;
++ uint16_t* inputPtr = (uint16_t*)intsToSwap;
+
+- const uint8_t shuffleVector[32] = { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 17, 16, 19, 18, 21, 20, 23, 22, 25, 24, 27, 26, 29, 28, 31, 30};
++ const uint8_t shuffleVector[32] = { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11,
++ 10, 13, 12, 15, 14, 17, 16, 19, 18, 21, 20,
++ 23, 22, 25, 24, 27, 26, 29, 28, 31, 30 };
+
+- const __m256i myShuffle = _mm256_loadu_si256((__m256i*) &shuffleVector[0]);
++ const __m256i myShuffle = _mm256_loadu_si256((__m256i*)&shuffleVector[0]);
+
+- for (number = 0; number < nSets; number++) {
+- // Load the 32t values, increment inputPtr later since we're doing it in-place.
+- const __m256i input = _mm256_loadu_si256((__m256i*)inputPtr);
+- const __m256i output = _mm256_shuffle_epi8(input,myShuffle);
++ for (number = 0; number < nSets; number++) {
++ // Load the 32t values, increment inputPtr later since we're doing it in-place.
++ const __m256i input = _mm256_loadu_si256((__m256i*)inputPtr);
++ const __m256i output = _mm256_shuffle_epi8(input, myShuffle);
+
+- // Store the results
+- _mm256_storeu_si256((__m256i*)inputPtr, output);
+- inputPtr += nPerSet;
+- }
++ // Store the results
++ _mm256_storeu_si256((__m256i*)inputPtr, output);
++ inputPtr += nPerSet;
++ }
+
+- _mm256_zeroupper();
++ _mm256_zeroupper();
+
+- // Byteswap any remaining points:
+- for(number = nPerSet * nSets; number < num_points; number++) {
+- uint16_t outputVal = *inputPtr;
+- outputVal = (((outputVal >> 8) & 0xff) | ((outputVal << 8) & 0xff00));
+- *inputPtr = outputVal;
+- inputPtr++;
+- }
++ // Byteswap any remaining points:
++ for (number = nPerSet * nSets; number < num_points; number++) {
++ uint16_t outputVal = *inputPtr;
++ outputVal = (((outputVal >> 8) & 0xff) | ((outputVal << 8) & 0xff00));
++ *inputPtr = outputVal;
++ inputPtr++;
++ }
+ }
+ #endif /* LV_HAVE_AVX2 */
+
+@@ -133,47 +139,50 @@ static inline void volk_16u_byteswap_u_avx2(uint16_t* intsToSwap, unsigned int n
+ #ifdef LV_HAVE_SSE2
+ #include <emmintrin.h>
+
+-static inline void volk_16u_byteswap_u_sse2(uint16_t* intsToSwap, unsigned int num_points){
+- unsigned int number = 0;
+- uint16_t* inputPtr = intsToSwap;
+- __m128i input, left, right, output;
+-
+- const unsigned int eighthPoints = num_points / 8;
+- for(;number < eighthPoints; number++){
+- // Load the 16t values, increment inputPtr later since we're doing it in-place.
+- input = _mm_loadu_si128((__m128i*)inputPtr);
+- // Do the two shifts
+- left = _mm_slli_epi16(input, 8);
+- right = _mm_srli_epi16(input, 8);
+- // Or the left and right halves together
+- output = _mm_or_si128(left, right);
+- // Store the results
+- _mm_storeu_si128((__m128i*)inputPtr, output);
+- inputPtr += 8;
+- }
+-
+- // Byteswap any remaining points:
+- number = eighthPoints*8;
+- for(; number < num_points; number++){
+- uint16_t outputVal = *inputPtr;
+- outputVal = (((outputVal >> 8) & 0xff) | ((outputVal << 8) & 0xff00));
+- *inputPtr = outputVal;
+- inputPtr++;
+- }
++static inline void volk_16u_byteswap_u_sse2(uint16_t* intsToSwap, unsigned int num_points)
++{
++ unsigned int number = 0;
++ uint16_t* inputPtr = intsToSwap;
++ __m128i input, left, right, output;
++
++ const unsigned int eighthPoints = num_points / 8;
++ for (; number < eighthPoints; number++) {
++ // Load the 16t values, increment inputPtr later since we're doing it in-place.
++ input = _mm_loadu_si128((__m128i*)inputPtr);
++ // Do the two shifts
++ left = _mm_slli_epi16(input, 8);
++ right = _mm_srli_epi16(input, 8);
++ // Or the left and right halves together
++ output = _mm_or_si128(left, right);
++ // Store the results
++ _mm_storeu_si128((__m128i*)inputPtr, output);
++ inputPtr += 8;
++ }
++
++ // Byteswap any remaining points:
++ number = eighthPoints * 8;
++ for (; number < num_points; number++) {
++ uint16_t outputVal = *inputPtr;
++ outputVal = (((outputVal >> 8) & 0xff) | ((outputVal << 8) & 0xff00));
++ *inputPtr = outputVal;
++ inputPtr++;
++ }
+ }
+ #endif /* LV_HAVE_SSE2 */
+
+ #ifdef LV_HAVE_GENERIC
+
+-static inline void volk_16u_byteswap_generic(uint16_t* intsToSwap, unsigned int num_points){
+- unsigned int point;
+- uint16_t* inputPtr = intsToSwap;
+- for(point = 0; point < num_points; point++){
+- uint16_t output = *inputPtr;
+- output = (((output >> 8) & 0xff) | ((output << 8) & 0xff00));
+- *inputPtr = output;
+- inputPtr++;
+- }
++static inline void volk_16u_byteswap_generic(uint16_t* intsToSwap,
++ unsigned int num_points)
++{
++ unsigned int point;
++ uint16_t* inputPtr = intsToSwap;
++ for (point = 0; point < num_points; point++) {
++ uint16_t output = *inputPtr;
++ output = (((output >> 8) & 0xff) | ((output << 8) & 0xff00));
++ *inputPtr = output;
++ inputPtr++;
++ }
+ }
+ #endif /* LV_HAVE_GENERIC */
+
+@@ -187,129 +196,136 @@ static inline void volk_16u_byteswap_generic(uint16_t* intsToSwap, unsigned int
+ #ifdef LV_HAVE_SSE2
+ #include <emmintrin.h>
+
+-static inline void volk_16u_byteswap_a_sse2(uint16_t* intsToSwap, unsigned int num_points){
+- unsigned int number = 0;
+- uint16_t* inputPtr = intsToSwap;
+- __m128i input, left, right, output;
+-
+- const unsigned int eighthPoints = num_points / 8;
+- for(;number < eighthPoints; number++){
+- // Load the 16t values, increment inputPtr later since we're doing it in-place.
+- input = _mm_load_si128((__m128i*)inputPtr);
+- // Do the two shifts
+- left = _mm_slli_epi16(input, 8);
+- right = _mm_srli_epi16(input, 8);
+- // Or the left and right halves together
+- output = _mm_or_si128(left, right);
+- // Store the results
+- _mm_store_si128((__m128i*)inputPtr, output);
+- inputPtr += 8;
+- }
+-
+-
+- // Byteswap any remaining points:
+- number = eighthPoints*8;
+- for(; number < num_points; number++){
+- uint16_t outputVal = *inputPtr;
+- outputVal = (((outputVal >> 8) & 0xff) | ((outputVal << 8) & 0xff00));
+- *inputPtr = outputVal;
+- inputPtr++;
+- }
++static inline void volk_16u_byteswap_a_sse2(uint16_t* intsToSwap, unsigned int num_points)
++{
++ unsigned int number = 0;
++ uint16_t* inputPtr = intsToSwap;
++ __m128i input, left, right, output;
++
++ const unsigned int eighthPoints = num_points / 8;
++ for (; number < eighthPoints; number++) {
++ // Load the 16t values, increment inputPtr later since we're doing it in-place.
++ input = _mm_load_si128((__m128i*)inputPtr);
++ // Do the two shifts
++ left = _mm_slli_epi16(input, 8);
++ right = _mm_srli_epi16(input, 8);
++ // Or the left and right halves together
++ output = _mm_or_si128(left, right);
++ // Store the results
++ _mm_store_si128((__m128i*)inputPtr, output);
++ inputPtr += 8;
++ }
++
++
++ // Byteswap any remaining points:
++ number = eighthPoints * 8;
++ for (; number < num_points; number++) {
++ uint16_t outputVal = *inputPtr;
++ outputVal = (((outputVal >> 8) & 0xff) | ((outputVal << 8) & 0xff00));
++ *inputPtr = outputVal;
++ inputPtr++;
++ }
+ }
+ #endif /* LV_HAVE_SSE2 */
+
+ #ifdef LV_HAVE_NEON
+ #include <arm_neon.h>
+
+-static inline void volk_16u_byteswap_neon(uint16_t* intsToSwap, unsigned int num_points){
+- unsigned int number;
+- unsigned int eighth_points = num_points / 8;
+- uint16x8_t input, output;
+- uint16_t* inputPtr = intsToSwap;
+-
+- for(number = 0; number < eighth_points; number++) {
+- input = vld1q_u16(inputPtr);
+- output = vsriq_n_u16(output, input, 8);
+- output = vsliq_n_u16(output, input, 8);
+- vst1q_u16(inputPtr, output);
+- inputPtr += 8;
+- }
+-
+- for(number = eighth_points * 8; number < num_points; number++){
+- uint16_t output = *inputPtr;
+- output = (((output >> 8) & 0xff) | ((output << 8) & 0xff00));
+- *inputPtr = output;
+- inputPtr++;
+- }
++static inline void volk_16u_byteswap_neon(uint16_t* intsToSwap, unsigned int num_points)
++{
++ unsigned int number;
++ unsigned int eighth_points = num_points / 8;
++ uint16x8_t input, output;
++ uint16_t* inputPtr = intsToSwap;
++
++ for (number = 0; number < eighth_points; number++) {
++ input = vld1q_u16(inputPtr);
++ output = vsriq_n_u16(output, input, 8);
++ output = vsliq_n_u16(output, input, 8);
++ vst1q_u16(inputPtr, output);
++ inputPtr += 8;
++ }
++
++ for (number = eighth_points * 8; number < num_points; number++) {
++ uint16_t output = *inputPtr;
++ output = (((output >> 8) & 0xff) | ((output << 8) & 0xff00));
++ *inputPtr = output;
++ inputPtr++;
++ }
+ }
+ #endif /* LV_HAVE_NEON */
+
+ #ifdef LV_HAVE_NEON
+ #include <arm_neon.h>
+
+-static inline void volk_16u_byteswap_neon_table(uint16_t* intsToSwap, unsigned int num_points){
+- uint16_t* inputPtr = intsToSwap;
+- unsigned int number = 0;
+- unsigned int n16points = num_points / 16;
+-
+- uint8x8x4_t input_table;
+- uint8x8_t int_lookup01, int_lookup23, int_lookup45, int_lookup67;
+- uint8x8_t swapped_int01, swapped_int23, swapped_int45, swapped_int67;
+-
+- /* these magic numbers are used as byte-indices in the LUT.
+- they are pre-computed to save time. A simple C program
+- can calculate them; for example for lookup01:
+- uint8_t chars[8] = {24, 16, 8, 0, 25, 17, 9, 1};
+- for(ii=0; ii < 8; ++ii) {
+- index += ((uint64_t)(*(chars+ii))) << (ii*8);
++static inline void volk_16u_byteswap_neon_table(uint16_t* intsToSwap,
++ unsigned int num_points)
++{
++ uint16_t* inputPtr = intsToSwap;
++ unsigned int number = 0;
++ unsigned int n16points = num_points / 16;
++
++ uint8x8x4_t input_table;
++ uint8x8_t int_lookup01, int_lookup23, int_lookup45, int_lookup67;
++ uint8x8_t swapped_int01, swapped_int23, swapped_int45, swapped_int67;
++
++ /* these magic numbers are used as byte-indices in the LUT.
++ they are pre-computed to save time. A simple C program
++ can calculate them; for example for lookup01:
++ uint8_t chars[8] = {24, 16, 8, 0, 25, 17, 9, 1};
++ for(ii=0; ii < 8; ++ii) {
++ index += ((uint64_t)(*(chars+ii))) << (ii*8);
++ }
++ */
++ int_lookup01 = vcreate_u8(1232017111498883080);
++ int_lookup23 = vcreate_u8(1376697457175036426);
++ int_lookup45 = vcreate_u8(1521377802851189772);
++ int_lookup67 = vcreate_u8(1666058148527343118);
++
++ for (number = 0; number < n16points; ++number) {
++ input_table = vld4_u8((uint8_t*)inputPtr);
++ swapped_int01 = vtbl4_u8(input_table, int_lookup01);
++ swapped_int23 = vtbl4_u8(input_table, int_lookup23);
++ swapped_int45 = vtbl4_u8(input_table, int_lookup45);
++ swapped_int67 = vtbl4_u8(input_table, int_lookup67);
++ vst1_u8((uint8_t*)inputPtr, swapped_int01);
++ vst1_u8((uint8_t*)(inputPtr + 4), swapped_int23);
++ vst1_u8((uint8_t*)(inputPtr + 8), swapped_int45);
++ vst1_u8((uint8_t*)(inputPtr + 12), swapped_int67);
++
++ inputPtr += 16;
++ }
++
++ for (number = n16points * 16; number < num_points; ++number) {
++ uint16_t output = *inputPtr;
++ output = (((output >> 8) & 0xff) | ((output << 8) & 0xff00));
++ *inputPtr = output;
++ inputPtr++;
+ }
+- */
+- int_lookup01 = vcreate_u8(1232017111498883080);
+- int_lookup23 = vcreate_u8(1376697457175036426);
+- int_lookup45 = vcreate_u8(1521377802851189772);
+- int_lookup67 = vcreate_u8(1666058148527343118);
+-
+- for(number = 0; number < n16points; ++number){
+- input_table = vld4_u8((uint8_t*) inputPtr);
+- swapped_int01 = vtbl4_u8(input_table, int_lookup01);
+- swapped_int23 = vtbl4_u8(input_table, int_lookup23);
+- swapped_int45 = vtbl4_u8(input_table, int_lookup45);
+- swapped_int67 = vtbl4_u8(input_table, int_lookup67);
+- vst1_u8((uint8_t*)inputPtr, swapped_int01);
+- vst1_u8((uint8_t*)(inputPtr+4), swapped_int23);
+- vst1_u8((uint8_t*)(inputPtr+8), swapped_int45);
+- vst1_u8((uint8_t*)(inputPtr+12), swapped_int67);
+-
+- inputPtr += 16;
+- }
+-
+- for(number = n16points * 16; number < num_points; ++number){
+- uint16_t output = *inputPtr;
+- output = (((output >> 8) & 0xff) | ((output << 8) & 0xff00));
+- *inputPtr = output;
+- inputPtr++;
+- }
+ }
+ #endif /* LV_HAVE_NEON */
+
+ #ifdef LV_HAVE_GENERIC
+
+-static inline void volk_16u_byteswap_a_generic(uint16_t* intsToSwap, unsigned int num_points){
+- unsigned int point;
+- uint16_t* inputPtr = intsToSwap;
+- for(point = 0; point < num_points; point++){
+- uint16_t output = *inputPtr;
+- output = (((output >> 8) & 0xff) | ((output << 8) & 0xff00));
+- *inputPtr = output;
+- inputPtr++;
+- }
++static inline void volk_16u_byteswap_a_generic(uint16_t* intsToSwap,
++ unsigned int num_points)
++{
++ unsigned int point;
++ uint16_t* inputPtr = intsToSwap;
++ for (point = 0; point < num_points; point++) {
++ uint16_t output = *inputPtr;
++ output = (((output >> 8) & 0xff) | ((output << 8) & 0xff00));
++ *inputPtr = output;
++ inputPtr++;
++ }
+ }
+ #endif /* LV_HAVE_GENERIC */
+
+ #ifdef LV_HAVE_ORC
+
+ extern void volk_16u_byteswap_a_orc_impl(uint16_t* intsToSwap, unsigned int num_points);
+-static inline void volk_16u_byteswap_u_orc(uint16_t* intsToSwap, unsigned int num_points){
++static inline void volk_16u_byteswap_u_orc(uint16_t* intsToSwap, unsigned int num_points)
++{
+ volk_16u_byteswap_a_orc_impl(intsToSwap, num_points);
+ }
+ #endif /* LV_HAVE_ORC */
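All volk_16u_byteswap branches perform the same in-place swap of the two bytes in each uint16_t: SSE2 with a pair of 16-bit shifts and an OR, NEON with vsri/vsli shift-inserts or a table lookup, and AVX2 with a single byte shuffle. The 32-entry shuffleVector above simply exchanges neighbouring bytes; a small hypothetical helper that regenerates that constant (shown only to document the pattern, not code from VOLK):

#include <stdint.h>

/* hypothetical helper, not part of VOLK */
static void make_swap16_shuffle(uint8_t control[32])
{
    int k;
    for (k = 0; k < 16; k++) {
        control[2 * k] = (uint8_t)(2 * k + 1); /* output byte 2k   <- input byte 2k+1 */
        control[2 * k + 1] = (uint8_t)(2 * k); /* output byte 2k+1 <- input byte 2k   */
    }
}
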
+diff --git a/kernels/volk/volk_16u_byteswappuppet_16u.h b/kernels/volk/volk_16u_byteswappuppet_16u.h
+index d3c8c5d..8cb1318 100644
+--- a/kernels/volk/volk_16u_byteswappuppet_16u.h
++++ b/kernels/volk/volk_16u_byteswappuppet_16u.h
+@@ -3,69 +3,83 @@
+
+
+ #include <stdint.h>
+-#include <volk/volk_16u_byteswap.h>
+ #include <string.h>
++#include <volk/volk_16u_byteswap.h>
+
+ #ifdef LV_HAVE_GENERIC
+-static inline void volk_16u_byteswappuppet_16u_generic(uint16_t*output, uint16_t* intsToSwap, unsigned int num_points){
++static inline void volk_16u_byteswappuppet_16u_generic(uint16_t* output,
++ uint16_t* intsToSwap,
++ unsigned int num_points)
++{
+
+ volk_16u_byteswap_generic((uint16_t*)intsToSwap, num_points);
+ memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint16_t));
+-
+ }
+ #endif
+
+ #ifdef LV_HAVE_NEON
+-static inline void volk_16u_byteswappuppet_16u_neon(uint16_t*output, uint16_t* intsToSwap, unsigned int num_points){
++static inline void volk_16u_byteswappuppet_16u_neon(uint16_t* output,
++ uint16_t* intsToSwap,
++ unsigned int num_points)
++{
+
+ volk_16u_byteswap_neon((uint16_t*)intsToSwap, num_points);
+ memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint16_t));
+-
+ }
+ #endif
+
+ #ifdef LV_HAVE_NEON
+-static inline void volk_16u_byteswappuppet_16u_neon_table(uint16_t*output, uint16_t* intsToSwap, unsigned int num_points){
++static inline void volk_16u_byteswappuppet_16u_neon_table(uint16_t* output,
++ uint16_t* intsToSwap,
++ unsigned int num_points)
++{
+
+ volk_16u_byteswap_neon_table((uint16_t*)intsToSwap, num_points);
+ memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint16_t));
+-
+ }
+ #endif
+
+ #ifdef LV_HAVE_SSE2
+-static inline void volk_16u_byteswappuppet_16u_u_sse2(uint16_t *output, uint16_t* intsToSwap, unsigned int num_points){
++static inline void volk_16u_byteswappuppet_16u_u_sse2(uint16_t* output,
++ uint16_t* intsToSwap,
++ unsigned int num_points)
++{
+
+ volk_16u_byteswap_u_sse2((uint16_t*)intsToSwap, num_points);
+ memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint16_t));
+-
+ }
+ #endif
+
+ #ifdef LV_HAVE_SSE2
+-static inline void volk_16u_byteswappuppet_16u_a_sse2(uint16_t *output, uint16_t* intsToSwap, unsigned int num_points){
++static inline void volk_16u_byteswappuppet_16u_a_sse2(uint16_t* output,
++ uint16_t* intsToSwap,
++ unsigned int num_points)
++{
+
+ volk_16u_byteswap_a_sse2((uint16_t*)intsToSwap, num_points);
+ memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint16_t));
+-
+ }
+ #endif
+
+ #ifdef LV_HAVE_AVX2
+-static inline void volk_16u_byteswappuppet_16u_u_avx2(uint16_t *output, uint16_t* intsToSwap, unsigned int num_points){
++static inline void volk_16u_byteswappuppet_16u_u_avx2(uint16_t* output,
++ uint16_t* intsToSwap,
++ unsigned int num_points)
++{
+
+ volk_16u_byteswap_u_avx2((uint16_t*)intsToSwap, num_points);
+ memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint16_t));
+-
+ }
+ #endif
+
+ #ifdef LV_HAVE_AVX2
+-static inline void volk_16u_byteswappuppet_16u_a_avx2(uint16_t *output, uint16_t* intsToSwap, unsigned int num_points){
++static inline void volk_16u_byteswappuppet_16u_a_avx2(uint16_t* output,
++ uint16_t* intsToSwap,
++ unsigned int num_points)
++{
+
+ volk_16u_byteswap_a_avx2((uint16_t*)intsToSwap, num_points);
+ memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint16_t));
+-
+ }
+ #endif
+
+diff --git a/kernels/volk/volk_32f_64f_add_64f.h b/kernels/volk/volk_32f_64f_add_64f.h
+index 770c27e..d00ada5 100644
+--- a/kernels/volk/volk_32f_64f_add_64f.h
++++ b/kernels/volk/volk_32f_64f_add_64f.h
+@@ -77,18 +77,19 @@
+
+ #ifdef LV_HAVE_GENERIC
+
+-static inline void volk_32f_64f_add_64f_generic(double *cVector,
+- const float *aVector,
+- const double *bVector,
+- unsigned int num_points) {
+- double *cPtr = cVector;
+- const float *aPtr = aVector;
+- const double *bPtr = bVector;
+- unsigned int number = 0;
+-
+- for (number = 0; number < num_points; number++) {
+- *cPtr++ = ((double)(*aPtr++)) + (*bPtr++);
+- }
++static inline void volk_32f_64f_add_64f_generic(double* cVector,
++ const float* aVector,
++ const double* bVector,
++ unsigned int num_points)
++{
++ double* cPtr = cVector;
++ const float* aPtr = aVector;
++ const double* bPtr = bVector;
++ unsigned int number = 0;
++
++ for (number = 0; number < num_points; number++) {
++ *cPtr++ = ((double)(*aPtr++)) + (*bPtr++);
++ }
+ }
+
+ #endif /* LV_HAVE_GENERIC */
+@@ -96,42 +97,43 @@ static inline void volk_32f_64f_add_64f_generic(double *cVector,
+ #ifdef LV_HAVE_NEONV8
+ #include <arm_neon.h>
+
+-static inline void volk_32f_64f_add_64f_neon(double *cVector,
+- const float *aVector,
+- const double *bVector,
+- unsigned int num_points) {
+- unsigned int number = 0;
+- const unsigned int half_points = num_points / 2;
+-
+- double *cPtr = cVector;
+- const float *aPtr = aVector;
+- const double *bPtr = bVector;
+-
+- float64x2_t aVal, bVal, cVal;
+- float32x2_t aVal1;
+- for (number = 0; number < half_points; number++) {
+- // Load in to NEON registers
+- aVal1 = vld1_f32(aPtr);
+- bVal = vld1q_f64(bPtr);
+- __VOLK_PREFETCH(aPtr + 2);
+- __VOLK_PREFETCH(bPtr + 2);
+- aPtr += 2; // q uses quadwords, 4 floats per vadd
+- bPtr += 2;
+-
+- // Vector conversion
+- aVal = vcvt_f64_f32(aVal1);
+- // vector add
+- cVal = vaddq_f64(aVal, bVal);
+- // Store the results back into the C container
+- vst1q_f64(cPtr, cVal);
+-
+- cPtr += 2;
+- }
+-
+- number = half_points * 2; // should be = num_points
+- for (; number < num_points; number++) {
+- *cPtr++ = ((double)(*aPtr++)) + (*bPtr++);
+- }
++static inline void volk_32f_64f_add_64f_neon(double* cVector,
++ const float* aVector,
++ const double* bVector,
++ unsigned int num_points)
++{
++ unsigned int number = 0;
++ const unsigned int half_points = num_points / 2;
++
++ double* cPtr = cVector;
++ const float* aPtr = aVector;
++ const double* bPtr = bVector;
++
++ float64x2_t aVal, bVal, cVal;
++ float32x2_t aVal1;
++ for (number = 0; number < half_points; number++) {
++ // Load in to NEON registers
++ aVal1 = vld1_f32(aPtr);
++ bVal = vld1q_f64(bPtr);
++ __VOLK_PREFETCH(aPtr + 2);
++ __VOLK_PREFETCH(bPtr + 2);
++ aPtr += 2; // q uses quadwords, 4 floats per vadd
++ bPtr += 2;
++
++ // Vector conversion
++ aVal = vcvt_f64_f32(aVal1);
++ // vector add
++ cVal = vaddq_f64(aVal, bVal);
++ // Store the results back into the C container
++ vst1q_f64(cPtr, cVal);
++
++ cPtr += 2;
++ }
++
++ number = half_points * 2; // should be = num_points
++ for (; number < num_points; number++) {
++ *cPtr++ = ((double)(*aPtr++)) + (*bPtr++);
++ }
+ }
+
+ #endif /* LV_HAVE_NEONV8 */
+@@ -141,49 +143,50 @@ static inline void volk_32f_64f_add_64f_neon(double *cVector,
+ #include <immintrin.h>
+ #include <xmmintrin.h>
+
+-static inline void volk_32f_64f_add_64f_u_avx(double *cVector,
+- const float *aVector,
+- const double *bVector,
+- unsigned int num_points) {
+- unsigned int number = 0;
+- const unsigned int eighth_points = num_points / 8;
+-
+- double *cPtr = cVector;
+- const float *aPtr = aVector;
+- const double *bPtr = bVector;
+-
+- __m256 aVal;
+- __m128 aVal1, aVal2;
+- __m256d aDbl1, aDbl2, bVal1, bVal2, cVal1, cVal2;
+- for (; number < eighth_points; number++) {
+-
+- aVal = _mm256_loadu_ps(aPtr);
+- bVal1 = _mm256_loadu_pd(bPtr);
+- bVal2 = _mm256_loadu_pd(bPtr + 4);
+-
+- aVal1 = _mm256_extractf128_ps(aVal, 0);
+- aVal2 = _mm256_extractf128_ps(aVal, 1);
+-
+- aDbl1 = _mm256_cvtps_pd(aVal1);
+- aDbl2 = _mm256_cvtps_pd(aVal2);
+-
+- cVal1 = _mm256_add_pd(aDbl1, bVal1);
+- cVal2 = _mm256_add_pd(aDbl2, bVal2);
+-
+- _mm256_storeu_pd(cPtr,
+- cVal1); // Store the results back into the C container
+- _mm256_storeu_pd(cPtr + 4,
+- cVal2); // Store the results back into the C container
+-
+- aPtr += 8;
+- bPtr += 8;
+- cPtr += 8;
+- }
+-
+- number = eighth_points * 8;
+- for (; number < num_points; number++) {
+- *cPtr++ = ((double)(*aPtr++)) + (*bPtr++);
+- }
++static inline void volk_32f_64f_add_64f_u_avx(double* cVector,
++ const float* aVector,
++ const double* bVector,
++ unsigned int num_points)
++{
++ unsigned int number = 0;
++ const unsigned int eighth_points = num_points / 8;
++
++ double* cPtr = cVector;
++ const float* aPtr = aVector;
++ const double* bPtr = bVector;
++
++ __m256 aVal;
++ __m128 aVal1, aVal2;
++ __m256d aDbl1, aDbl2, bVal1, bVal2, cVal1, cVal2;
++ for (; number < eighth_points; number++) {
++
++ aVal = _mm256_loadu_ps(aPtr);
++ bVal1 = _mm256_loadu_pd(bPtr);
++ bVal2 = _mm256_loadu_pd(bPtr + 4);
++
++ aVal1 = _mm256_extractf128_ps(aVal, 0);
++ aVal2 = _mm256_extractf128_ps(aVal, 1);
++
++ aDbl1 = _mm256_cvtps_pd(aVal1);
++ aDbl2 = _mm256_cvtps_pd(aVal2);
++
++ cVal1 = _mm256_add_pd(aDbl1, bVal1);
++ cVal2 = _mm256_add_pd(aDbl2, bVal2);
++
++ _mm256_storeu_pd(cPtr,
++ cVal1); // Store the results back into the C container
++ _mm256_storeu_pd(cPtr + 4,
++ cVal2); // Store the results back into the C container
++
++ aPtr += 8;
++ bPtr += 8;
++ cPtr += 8;
++ }
++
++ number = eighth_points * 8;
++ for (; number < num_points; number++) {
++ *cPtr++ = ((double)(*aPtr++)) + (*bPtr++);
++ }
+ }
+
+ #endif /* LV_HAVE_AVX */
+@@ -193,48 +196,49 @@ static inline void volk_32f_64f_add_64f_u_avx(double *cVector,
+ #include <immintrin.h>
+ #include <xmmintrin.h>
+
+-static inline void volk_32f_64f_add_64f_a_avx(double *cVector,
+- const float *aVector,
+- const double *bVector,
+- unsigned int num_points) {
+- unsigned int number = 0;
+- const unsigned int eighth_points = num_points / 8;
+-
+- double *cPtr = cVector;
+- const float *aPtr = aVector;
+- const double *bPtr = bVector;
+-
+- __m256 aVal;
+- __m128 aVal1, aVal2;
+- __m256d aDbl1, aDbl2, bVal1, bVal2, cVal1, cVal2;
+- for (; number < eighth_points; number++) {
+-
+- aVal = _mm256_load_ps(aPtr);
+- bVal1 = _mm256_load_pd(bPtr);
+- bVal2 = _mm256_load_pd(bPtr + 4);
+-
+- aVal1 = _mm256_extractf128_ps(aVal, 0);
+- aVal2 = _mm256_extractf128_ps(aVal, 1);
+-
+- aDbl1 = _mm256_cvtps_pd(aVal1);
+- aDbl2 = _mm256_cvtps_pd(aVal2);
+-
+- cVal1 = _mm256_add_pd(aDbl1, bVal1);
+- cVal2 = _mm256_add_pd(aDbl2, bVal2);
+-
+- _mm256_store_pd(cPtr, cVal1); // Store the results back into the C container
+- _mm256_store_pd(cPtr + 4,
+- cVal2); // Store the results back into the C container
+-
+- aPtr += 8;
+- bPtr += 8;
+- cPtr += 8;
+- }
+-
+- number = eighth_points * 8;
+- for (; number < num_points; number++) {
+- *cPtr++ = ((double)(*aPtr++)) + (*bPtr++);
+- }
++static inline void volk_32f_64f_add_64f_a_avx(double* cVector,
++ const float* aVector,
++ const double* bVector,
++ unsigned int num_points)
++{
++ unsigned int number = 0;
++ const unsigned int eighth_points = num_points / 8;
++
++ double* cPtr = cVector;
++ const float* aPtr = aVector;
++ const double* bPtr = bVector;
++
++ __m256 aVal;
++ __m128 aVal1, aVal2;
++ __m256d aDbl1, aDbl2, bVal1, bVal2, cVal1, cVal2;
++ for (; number < eighth_points; number++) {
++
++ aVal = _mm256_load_ps(aPtr);
++ bVal1 = _mm256_load_pd(bPtr);
++ bVal2 = _mm256_load_pd(bPtr + 4);
++
++ aVal1 = _mm256_extractf128_ps(aVal, 0);
++ aVal2 = _mm256_extractf128_ps(aVal, 1);
++
++ aDbl1 = _mm256_cvtps_pd(aVal1);
++ aDbl2 = _mm256_cvtps_pd(aVal2);
++
++ cVal1 = _mm256_add_pd(aDbl1, bVal1);
++ cVal2 = _mm256_add_pd(aDbl2, bVal2);
++
++ _mm256_store_pd(cPtr, cVal1); // Store the results back into the C container
++ _mm256_store_pd(cPtr + 4,
++ cVal2); // Store the results back into the C container
++
++ aPtr += 8;
++ bPtr += 8;
++ cPtr += 8;
++ }
++
++ number = eighth_points * 8;
++ for (; number < num_points; number++) {
++ *cPtr++ = ((double)(*aPtr++)) + (*bPtr++);
++ }
+ }
+
+ #endif /* LV_HAVE_AVX */
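The AVX paths of volk_32f_64f_add_64f widen eight floats per iteration by extracting the two 128-bit halves with _mm256_extractf128_ps and converting each with _mm256_cvtps_pd before the double-precision add; the scalar loop then covers the remaining num_points % 8 elements. The same widen-then-add pattern at 128-bit width would look roughly like the sketch below (a hypothetical SSE2 variant for illustration only; it is not part of this patch or of VOLK):

#include <emmintrin.h> /* SSE2 */

/* hypothetical SSE2 sketch, not part of VOLK */
static void add_32f_64f_sse2_sketch(double* cVector, const float* aVector,
                                    const double* bVector, unsigned int num_points)
{
    const unsigned int quarter_points = num_points / 4;
    unsigned int number;
    double* cPtr = cVector;
    const float* aPtr = aVector;
    const double* bPtr = bVector;

    for (number = 0; number < quarter_points; number++) {
        const __m128 af = _mm_loadu_ps(aPtr);                    /* a0 a1 a2 a3      */
        const __m128d alo = _mm_cvtps_pd(af);                    /* widen a0, a1     */
        const __m128d ahi = _mm_cvtps_pd(_mm_movehl_ps(af, af)); /* widen a2, a3     */
        _mm_storeu_pd(cPtr, _mm_add_pd(alo, _mm_loadu_pd(bPtr)));
        _mm_storeu_pd(cPtr + 2, _mm_add_pd(ahi, _mm_loadu_pd(bPtr + 2)));
        aPtr += 4;
        bPtr += 4;
        cPtr += 4;
    }

    /* scalar tail, same as the generic kernel */
    for (number = quarter_points * 4; number < num_points; number++) {
        *cPtr++ = ((double)(*aPtr++)) + (*bPtr++);
    }
}
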
+diff --git a/kernels/volk/volk_32f_64f_multiply_64f.h b/kernels/volk/volk_32f_64f_multiply_64f.h
+index 50f08a1..1039850 100644
+--- a/kernels/volk/volk_32f_64f_multiply_64f.h
++++ b/kernels/volk/volk_32f_64f_multiply_64f.h
+@@ -31,8 +31,8 @@
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+- * void volk_32f_64f_multiply_64f(double* cVector, const double* aVector, const double* bVector, unsigned int num_points)
+- * \endcode
++ * void volk_32f_64f_multiply_64f(double* cVector, const double* aVector, const double*
++ * bVector, unsigned int num_points) \endcode
+ *
+ * \b Inputs
+ * \li aVector: First input vector.
+@@ -76,18 +76,19 @@
+
+ #ifdef LV_HAVE_GENERIC
+
+-static inline void
+-volk_32f_64f_multiply_64f_generic(double *cVector, const float *aVector,
+- const double *bVector, unsigned int num_points)
++static inline void volk_32f_64f_multiply_64f_generic(double* cVector,
++ const float* aVector,
++ const double* bVector,
++ unsigned int num_points)
+ {
+- double *cPtr = cVector;
+- const float *aPtr = aVector;
+- const double *bPtr = bVector;
+- unsigned int number = 0;
+-
+- for (number = 0; number < num_points; number++) {
+- *cPtr++ = ((double)(*aPtr++)) * (*bPtr++);
+- }
++ double* cPtr = cVector;
++ const float* aPtr = aVector;
++ const double* bPtr = bVector;
++ unsigned int number = 0;
++
++ for (number = 0; number < num_points; number++) {
++ *cPtr++ = ((double)(*aPtr++)) * (*bPtr++);
++ }
+ }
+
+ #endif /* LV_HAVE_GENERIC */
+@@ -102,47 +103,48 @@ volk_32f_64f_multiply_64f_generic(double *cVector, const float *aVector,
+ #include <immintrin.h>
+ #include <xmmintrin.h>
+
+-static inline void
+-volk_32f_64f_multiply_64f_u_avx(double *cVector, const float *aVector,
+- const double *bVector, unsigned int num_points)
++static inline void volk_32f_64f_multiply_64f_u_avx(double* cVector,
++ const float* aVector,
++ const double* bVector,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+- const unsigned int eighth_points = num_points / 8;
++ unsigned int number = 0;
++ const unsigned int eighth_points = num_points / 8;
+
+- double *cPtr = cVector;
+- const float *aPtr = aVector;
+- const double *bPtr = bVector;
++ double* cPtr = cVector;
++ const float* aPtr = aVector;
++ const double* bPtr = bVector;
+
+- __m256 aVal;
+- __m128 aVal1, aVal2;
+- __m256d aDbl1, aDbl2, bVal1, bVal2, cVal1, cVal2;
+- for (; number < eighth_points; number++) {
++ __m256 aVal;
++ __m128 aVal1, aVal2;
++ __m256d aDbl1, aDbl2, bVal1, bVal2, cVal1, cVal2;
++ for (; number < eighth_points; number++) {
+
+- aVal = _mm256_loadu_ps(aPtr);
+- bVal1 = _mm256_loadu_pd(bPtr);
+- bVal2 = _mm256_loadu_pd(bPtr+4);
++ aVal = _mm256_loadu_ps(aPtr);
++ bVal1 = _mm256_loadu_pd(bPtr);
++ bVal2 = _mm256_loadu_pd(bPtr + 4);
+
+- aVal1 = _mm256_extractf128_ps(aVal, 0);
+- aVal2 = _mm256_extractf128_ps(aVal, 1);
++ aVal1 = _mm256_extractf128_ps(aVal, 0);
++ aVal2 = _mm256_extractf128_ps(aVal, 1);
+
+- aDbl1 = _mm256_cvtps_pd(aVal1);
+- aDbl2 = _mm256_cvtps_pd(aVal2);
++ aDbl1 = _mm256_cvtps_pd(aVal1);
++ aDbl2 = _mm256_cvtps_pd(aVal2);
+
+- cVal1 = _mm256_mul_pd(aDbl1, bVal1);
+- cVal2 = _mm256_mul_pd(aDbl2, bVal2);
++ cVal1 = _mm256_mul_pd(aDbl1, bVal1);
++ cVal2 = _mm256_mul_pd(aDbl2, bVal2);
+
+- _mm256_storeu_pd(cPtr, cVal1); // Store the results back into the C container
+- _mm256_storeu_pd(cPtr+4, cVal2); // Store the results back into the C container
++ _mm256_storeu_pd(cPtr, cVal1); // Store the results back into the C container
++ _mm256_storeu_pd(cPtr + 4, cVal2); // Store the results back into the C container
+
+- aPtr += 8;
+- bPtr += 8;
+- cPtr += 8;
+- }
++ aPtr += 8;
++ bPtr += 8;
++ cPtr += 8;
++ }
+
+- number = eighth_points * 8;
+- for (; number < num_points; number++) {
+- *cPtr++ = ((double)(*aPtr++)) * (*bPtr++);
+- }
++ number = eighth_points * 8;
++ for (; number < num_points; number++) {
++ *cPtr++ = ((double)(*aPtr++)) * (*bPtr++);
++ }
+ }
+
+ #endif /* LV_HAVE_AVX */
+@@ -153,51 +155,51 @@ volk_32f_64f_multiply_64f_u_avx(double *cVector, const float *aVector,
+ #include <immintrin.h>
+ #include <xmmintrin.h>
+
+-static inline void
+-volk_32f_64f_multiply_64f_a_avx(double *cVector, const float *aVector,
+- const double *bVector, unsigned int num_points)
++static inline void volk_32f_64f_multiply_64f_a_avx(double* cVector,
++ const float* aVector,
++ const double* bVector,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+- const unsigned int eighth_points = num_points / 8;
++ unsigned int number = 0;
++ const unsigned int eighth_points = num_points / 8;
+
+- double *cPtr = cVector;
+- const float *aPtr = aVector;
+- const double *bPtr = bVector;
++ double* cPtr = cVector;
++ const float* aPtr = aVector;
++ const double* bPtr = bVector;
+
+- __m256 aVal;
+- __m128 aVal1, aVal2;
+- __m256d aDbl1, aDbl2, bVal1, bVal2, cVal1, cVal2;
+- for (; number < eighth_points; number++) {
++ __m256 aVal;
++ __m128 aVal1, aVal2;
++ __m256d aDbl1, aDbl2, bVal1, bVal2, cVal1, cVal2;
++ for (; number < eighth_points; number++) {
+
+- aVal = _mm256_load_ps(aPtr);
+- bVal1 = _mm256_load_pd(bPtr);
+- bVal2 = _mm256_load_pd(bPtr+4);
++ aVal = _mm256_load_ps(aPtr);
++ bVal1 = _mm256_load_pd(bPtr);
++ bVal2 = _mm256_load_pd(bPtr + 4);
+
+- aVal1 = _mm256_extractf128_ps(aVal, 0);
+- aVal2 = _mm256_extractf128_ps(aVal, 1);
++ aVal1 = _mm256_extractf128_ps(aVal, 0);
++ aVal2 = _mm256_extractf128_ps(aVal, 1);
+
+- aDbl1 = _mm256_cvtps_pd(aVal1);
+- aDbl2 = _mm256_cvtps_pd(aVal2);
++ aDbl1 = _mm256_cvtps_pd(aVal1);
++ aDbl2 = _mm256_cvtps_pd(aVal2);
+
+- cVal1 = _mm256_mul_pd(aDbl1, bVal1);
+- cVal2 = _mm256_mul_pd(aDbl2, bVal2);
++ cVal1 = _mm256_mul_pd(aDbl1, bVal1);
++ cVal2 = _mm256_mul_pd(aDbl2, bVal2);
+
+- _mm256_store_pd(cPtr, cVal1); // Store the results back into the C container
+- _mm256_store_pd(cPtr+4, cVal2); // Store the results back into the C container
++ _mm256_store_pd(cPtr, cVal1); // Store the results back into the C container
++ _mm256_store_pd(cPtr + 4, cVal2); // Store the results back into the C container
+
+- aPtr += 8;
+- bPtr += 8;
+- cPtr += 8;
+- }
++ aPtr += 8;
++ bPtr += 8;
++ cPtr += 8;
++ }
+
+- number = eighth_points * 8;
+- for (; number < num_points; number++) {
+- *cPtr++ = ((double)(*aPtr++)) * (*bPtr++);
+- }
++ number = eighth_points * 8;
++ for (; number < num_points; number++) {
++ *cPtr++ = ((double)(*aPtr++)) * (*bPtr++);
++ }
+ }
+
+ #endif /* LV_HAVE_AVX */
+
+
+-
+ #endif /* INCLUDED_volk_32f_64f_multiply_64f_u_H */
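volk_32f_64f_multiply_64f mirrors the add kernel above, with _mm256_mul_pd in place of _mm256_add_pd. In both kernels the float operand is widened before the operation, i.e. ((double)a) * b rather than (double)(a * (float)b), so the double-precision operand keeps its full accuracy. A small stand-alone demonstration of the difference (illustrative only, not part of VOLK):

#include <stdio.h>

int main(void)
{
    const float a = 3.0f;
    const double b = 1.0000000001;                  /* rounds to 1.0f as a float   */
    const double widened = ((double)a) * b;         /* what the kernels compute    */
    const double narrowed = (double)(a * (float)b); /* hypothetical narrowing path */
    printf("widened  = %.12f\nnarrowed = %.12f\n", widened, narrowed);
    return 0;
}
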
+diff --git a/kernels/volk/volk_32f_8u_polarbutterfly_32f.h b/kernels/volk/volk_32f_8u_polarbutterfly_32f.h
+index 4aba6c4..2198b33 100644
+--- a/kernels/volk/volk_32f_8u_polarbutterfly_32f.h
++++ b/kernels/volk/volk_32f_8u_polarbutterfly_32f.h
+@@ -51,14 +51,17 @@
+ * int frame_exp = 10;
+ * int frame_size = 0x01 << frame_exp;
+ *
+- * float* llrs = (float*) volk_malloc(sizeof(float) * frame_size * (frame_exp + 1), volk_get_alignment());
+- * unsigned char* u = (unsigned char) volk_malloc(sizeof(unsigned char) * frame_size * (frame_exp + 1), volk_get_alignment());
++ * float* llrs = (float*) volk_malloc(sizeof(float) * frame_size * (frame_exp + 1),
++ * volk_get_alignment()); unsigned char* u = (unsigned char) volk_malloc(sizeof(unsigned
++ * char) * frame_size * (frame_exp + 1), volk_get_alignment());
+ *
+- * {some_function_to_write_encoded_bits_to_float_llrs(llrs + frame_size * frame_exp, data)};
++ * {some_function_to_write_encoded_bits_to_float_llrs(llrs + frame_size * frame_exp,
++ * data)};
+ *
+ * unsigned int u_num;
+ * for(u_num = 0; u_num < frame_size; u_num++){
+- * volk_32f_8u_polarbutterfly_32f_u_avx(llrs, u, frame_size, frame_exp, 0, u_num, u_num);
++ * volk_32f_8u_polarbutterfly_32f_u_avx(llrs, u, frame_size, frame_exp, 0, u_num,
++ * u_num);
+ * // next line could first search for frozen bit value and then do bit decision.
+ * u[u_num] = llrs[u_num] > 0 ? 0 : 1;
+ * }
+@@ -73,130 +76,131 @@
+ #include <math.h>
+ #include <volk/volk_8u_x2_encodeframepolar_8u.h>
+
+-static inline float
+-llr_odd(const float la, const float lb)
++static inline float llr_odd(const float la, const float lb)
+ {
+- const float ala = fabsf(la);
+- const float alb = fabsf(lb);
+- return copysignf(1.0f, la) * copysignf(1.0f, lb) * (ala > alb ? alb : ala);
++ const float ala = fabsf(la);
++ const float alb = fabsf(lb);
++ return copysignf(1.0f, la) * copysignf(1.0f, lb) * (ala > alb ? alb : ala);
+ }
+
+-static inline void
+-llr_odd_stages(float* llrs, int min_stage, const int depth, const int frame_size, const int row)
++static inline void llr_odd_stages(
++ float* llrs, int min_stage, const int depth, const int frame_size, const int row)
+ {
+- int loop_stage = depth - 1;
+- float* dst_llr_ptr;
+- float* src_llr_ptr;
+- int stage_size = 0x01 << loop_stage;
+-
+- int el;
+- while(min_stage <= loop_stage){
+- dst_llr_ptr = llrs + loop_stage * frame_size + row;
+- src_llr_ptr = dst_llr_ptr + frame_size;
+- for(el = 0; el < stage_size; el++){
+- *dst_llr_ptr++ = llr_odd(*src_llr_ptr, *(src_llr_ptr + 1));
+- src_llr_ptr += 2;
++ int loop_stage = depth - 1;
++ float* dst_llr_ptr;
++ float* src_llr_ptr;
++ int stage_size = 0x01 << loop_stage;
++
++ int el;
++ while (min_stage <= loop_stage) {
++ dst_llr_ptr = llrs + loop_stage * frame_size + row;
++ src_llr_ptr = dst_llr_ptr + frame_size;
++ for (el = 0; el < stage_size; el++) {
++ *dst_llr_ptr++ = llr_odd(*src_llr_ptr, *(src_llr_ptr + 1));
++ src_llr_ptr += 2;
++ }
++
++ --loop_stage;
++ stage_size >>= 1;
+ }
+-
+- --loop_stage;
+- stage_size >>= 1;
+- }
+ }
+
+-static inline float
+-llr_even(const float la, const float lb, const unsigned char f)
++static inline float llr_even(const float la, const float lb, const unsigned char f)
+ {
+- switch(f){
++ switch (f) {
+ case 0:
+- return lb + la;
++ return lb + la;
+ default:
+- return lb - la;
+- }
++ return lb - la;
++ }
+ }
+
+ static inline void
+ even_u_values(unsigned char* u_even, const unsigned char* u, const int u_num)
+ {
+- u++;
+- int i;
+- for(i = 1; i < u_num; i += 2){
+- *u_even++ = *u;
+- u += 2;
+- }
++ u++;
++ int i;
++ for (i = 1; i < u_num; i += 2) {
++ *u_even++ = *u;
++ u += 2;
++ }
+ }
+
+ static inline void
+ odd_xor_even_values(unsigned char* u_xor, const unsigned char* u, const int u_num)
+ {
+- int i;
+- for(i = 1; i < u_num; i += 2){
+- *u_xor++ = *u ^ *(u + 1);
+- u += 2;
+- }
++ int i;
++ for (i = 1; i < u_num; i += 2) {
++ *u_xor++ = *u ^ *(u + 1);
++ u += 2;
++ }
+ }
+
+-static inline int
+-calculate_max_stage_depth_for_row(const int frame_exp, const int row)
++static inline int calculate_max_stage_depth_for_row(const int frame_exp, const int row)
+ {
+- int max_stage_depth = 0;
+- int half_stage_size = 0x01;
+- int stage_size = half_stage_size << 1;
+- while(max_stage_depth < (frame_exp - 1)){ // last stage holds received values.
+- if(!(row % stage_size < half_stage_size)){
+- break;
++ int max_stage_depth = 0;
++ int half_stage_size = 0x01;
++ int stage_size = half_stage_size << 1;
++ while (max_stage_depth < (frame_exp - 1)) { // last stage holds received values.
++ if (!(row % stage_size < half_stage_size)) {
++ break;
++ }
++ half_stage_size <<= 1;
++ stage_size <<= 1;
++ max_stage_depth++;
+ }
+- half_stage_size <<= 1;
+- stage_size <<= 1;
+- max_stage_depth++;
+- }
+- return max_stage_depth;
++ return max_stage_depth;
+ }
+
+ #ifdef LV_HAVE_GENERIC
+
+-static inline void
+-volk_32f_8u_polarbutterfly_32f_generic(float* llrs, unsigned char* u,
+- const int frame_exp,
+- const int stage, const int u_num, const int row)
++static inline void volk_32f_8u_polarbutterfly_32f_generic(float* llrs,
++ unsigned char* u,
++ const int frame_exp,
++ const int stage,
++ const int u_num,
++ const int row)
+ {
+- const int frame_size = 0x01 << frame_exp;
+- const int next_stage = stage + 1;
++ const int frame_size = 0x01 << frame_exp;
++ const int next_stage = stage + 1;
+
+- const int half_stage_size = 0x01 << stage;
+- const int stage_size = half_stage_size << 1;
++ const int half_stage_size = 0x01 << stage;
++ const int stage_size = half_stage_size << 1;
+
+- const bool is_upper_stage_half = row % stage_size < half_stage_size;
++ const bool is_upper_stage_half = row % stage_size < half_stage_size;
+
+-// // this is a natural bit order impl
+- float* next_llrs = llrs + frame_size;// LLRs are stored in a consecutive array.
+- float* call_row_llr = llrs + row;
++ // // this is a natural bit order impl
++ float* next_llrs = llrs + frame_size; // LLRs are stored in a consecutive array.
++ float* call_row_llr = llrs + row;
+
+- const int section = row - (row % stage_size);
+- const int jump_size = ((row % half_stage_size) << 1) % stage_size;
++ const int section = row - (row % stage_size);
++ const int jump_size = ((row % half_stage_size) << 1) % stage_size;
+
+- const int next_upper_row = section + jump_size;
+- const int next_lower_row = next_upper_row + 1;
++ const int next_upper_row = section + jump_size;
++ const int next_lower_row = next_upper_row + 1;
+
+- const float* upper_right_llr_ptr = next_llrs + next_upper_row;
+- const float* lower_right_llr_ptr = next_llrs + next_lower_row;
++ const float* upper_right_llr_ptr = next_llrs + next_upper_row;
++ const float* lower_right_llr_ptr = next_llrs + next_lower_row;
+
+- if(!is_upper_stage_half){
+- const int u_pos = u_num >> stage;
+- const unsigned char f = u[u_pos - 1];
+- *call_row_llr = llr_even(*upper_right_llr_ptr, *lower_right_llr_ptr, f);
+- return;
+- }
++ if (!is_upper_stage_half) {
++ const int u_pos = u_num >> stage;
++ const unsigned char f = u[u_pos - 1];
++ *call_row_llr = llr_even(*upper_right_llr_ptr, *lower_right_llr_ptr, f);
++ return;
++ }
+
+- if(frame_exp > next_stage){
+- unsigned char* u_half = u + frame_size;
+- odd_xor_even_values(u_half, u, u_num);
+- volk_32f_8u_polarbutterfly_32f_generic(next_llrs, u_half, frame_exp, next_stage, u_num, next_upper_row);
++ if (frame_exp > next_stage) {
++ unsigned char* u_half = u + frame_size;
++ odd_xor_even_values(u_half, u, u_num);
++ volk_32f_8u_polarbutterfly_32f_generic(
++ next_llrs, u_half, frame_exp, next_stage, u_num, next_upper_row);
+
+- even_u_values(u_half, u, u_num);
+- volk_32f_8u_polarbutterfly_32f_generic(next_llrs, u_half, frame_exp, next_stage, u_num, next_lower_row);
+- }
++ even_u_values(u_half, u, u_num);
++ volk_32f_8u_polarbutterfly_32f_generic(
++ next_llrs, u_half, frame_exp, next_stage, u_num, next_lower_row);
++ }
+
+- *call_row_llr = llr_odd(*upper_right_llr_ptr, *lower_right_llr_ptr);
++ *call_row_llr = llr_odd(*upper_right_llr_ptr, *lower_right_llr_ptr);
+ }
+
+ #endif /* LV_HAVE_GENERIC */
+@@ -206,99 +210,99 @@ volk_32f_8u_polarbutterfly_32f_generic(float* llrs, unsigned char* u,
+ #include <immintrin.h>
+ #include <volk/volk_avx_intrinsics.h>
+
+-static inline void
+-volk_32f_8u_polarbutterfly_32f_u_avx(float* llrs, unsigned char* u,
+- const int frame_exp,
+- const int stage, const int u_num, const int row)
++static inline void volk_32f_8u_polarbutterfly_32f_u_avx(float* llrs,
++ unsigned char* u,
++ const int frame_exp,
++ const int stage,
++ const int u_num,
++ const int row)
+ {
+- const int frame_size = 0x01 << frame_exp;
+- if(row % 2){ // for odd rows just do the only necessary calculation and return.
+- const float* next_llrs = llrs + frame_size + row;
+- *(llrs + row) = llr_even(*(next_llrs - 1), *next_llrs, u[u_num - 1]);
+- return;
+- }
+-
+- const int max_stage_depth = calculate_max_stage_depth_for_row(frame_exp, row);
+- if(max_stage_depth < 3){ // vectorized version needs larger vectors.
+- volk_32f_8u_polarbutterfly_32f_generic(llrs, u, frame_exp, stage, u_num, row);
+- return;
+- }
+-
+- int loop_stage = max_stage_depth;
+- int stage_size = 0x01 << loop_stage;
+-
+- float* src_llr_ptr;
+- float* dst_llr_ptr;
+-
+- __m256 src0, src1, dst;
+-
+- if(row){ // not necessary for ZERO row. == first bit to be decoded.
+- // first do bit combination for all stages
+- // effectively encode some decoded bits again.
+- unsigned char* u_target = u + frame_size;
+- unsigned char* u_temp = u + 2* frame_size;
+- memcpy(u_temp, u + u_num - stage_size, sizeof(unsigned char) * stage_size);
+-
+- if(stage_size > 15){
+- _mm256_zeroupper();
+- volk_8u_x2_encodeframepolar_8u_u_ssse3(u_target, u_temp, stage_size);
++ const int frame_size = 0x01 << frame_exp;
++ if (row % 2) { // for odd rows just do the only necessary calculation and return.
++ const float* next_llrs = llrs + frame_size + row;
++ *(llrs + row) = llr_even(*(next_llrs - 1), *next_llrs, u[u_num - 1]);
++ return;
+ }
+- else{
+- volk_8u_x2_encodeframepolar_8u_generic(u_target, u_temp, stage_size);
++
++ const int max_stage_depth = calculate_max_stage_depth_for_row(frame_exp, row);
++ if (max_stage_depth < 3) { // vectorized version needs larger vectors.
++ volk_32f_8u_polarbutterfly_32f_generic(llrs, u, frame_exp, stage, u_num, row);
++ return;
+ }
+
+- src_llr_ptr = llrs + (max_stage_depth + 1) * frame_size + row - stage_size;
+- dst_llr_ptr = llrs + max_stage_depth * frame_size + row;
++ int loop_stage = max_stage_depth;
++ int stage_size = 0x01 << loop_stage;
+
+- __m128i fbits;
++ float* src_llr_ptr;
++ float* dst_llr_ptr;
+
+- int p;
+- for(p = 0; p < stage_size; p += 8){
+- _mm256_zeroupper();
+- fbits = _mm_loadu_si128((__m128i*) u_target);
+- u_target += 8;
++ __m256 src0, src1, dst;
+
+- src0 = _mm256_loadu_ps(src_llr_ptr);
+- src1 = _mm256_loadu_ps(src_llr_ptr + 8);
+- src_llr_ptr += 16;
++ if (row) { // not necessary for ZERO row. == first bit to be decoded.
++ // first do bit combination for all stages
++ // effectively encode some decoded bits again.
++ unsigned char* u_target = u + frame_size;
++ unsigned char* u_temp = u + 2 * frame_size;
++ memcpy(u_temp, u + u_num - stage_size, sizeof(unsigned char) * stage_size);
+
+- dst = _mm256_polar_fsign_add_llrs(src0, src1, fbits);
++ if (stage_size > 15) {
++ _mm256_zeroupper();
++ volk_8u_x2_encodeframepolar_8u_u_ssse3(u_target, u_temp, stage_size);
++ } else {
++ volk_8u_x2_encodeframepolar_8u_generic(u_target, u_temp, stage_size);
++ }
+
+- _mm256_storeu_ps(dst_llr_ptr, dst);
+- dst_llr_ptr += 8;
+- }
++ src_llr_ptr = llrs + (max_stage_depth + 1) * frame_size + row - stage_size;
++ dst_llr_ptr = llrs + max_stage_depth * frame_size + row;
+
+- --loop_stage;
+- stage_size >>= 1;
+- }
++ __m128i fbits;
+
+- const int min_stage = stage > 2 ? stage : 2;
++ int p;
++ for (p = 0; p < stage_size; p += 8) {
++ _mm256_zeroupper();
++ fbits = _mm_loadu_si128((__m128i*)u_target);
++ u_target += 8;
+
+- _mm256_zeroall(); // Important to clear cache!
++ src0 = _mm256_loadu_ps(src_llr_ptr);
++ src1 = _mm256_loadu_ps(src_llr_ptr + 8);
++ src_llr_ptr += 16;
+
+- int el;
+- while(min_stage < loop_stage){
+- dst_llr_ptr = llrs + loop_stage * frame_size + row;
+- src_llr_ptr = dst_llr_ptr + frame_size;
+- for(el = 0; el < stage_size; el += 8){
+- src0 = _mm256_loadu_ps(src_llr_ptr);
+- src_llr_ptr += 8;
+- src1 = _mm256_loadu_ps(src_llr_ptr);
+- src_llr_ptr += 8;
++ dst = _mm256_polar_fsign_add_llrs(src0, src1, fbits);
+
+- dst = _mm256_polar_minsum_llrs(src0, src1);
++ _mm256_storeu_ps(dst_llr_ptr, dst);
++ dst_llr_ptr += 8;
++ }
+
+- _mm256_storeu_ps(dst_llr_ptr, dst);
+- dst_llr_ptr += 8;
++ --loop_stage;
++ stage_size >>= 1;
+ }
+
+- --loop_stage;
+- stage_size >>= 1;
++ const int min_stage = stage > 2 ? stage : 2;
++
++ _mm256_zeroall(); // Important to clear cache!
+
+- }
++ int el;
++ while (min_stage < loop_stage) {
++ dst_llr_ptr = llrs + loop_stage * frame_size + row;
++ src_llr_ptr = dst_llr_ptr + frame_size;
++ for (el = 0; el < stage_size; el += 8) {
++ src0 = _mm256_loadu_ps(src_llr_ptr);
++ src_llr_ptr += 8;
++ src1 = _mm256_loadu_ps(src_llr_ptr);
++ src_llr_ptr += 8;
+
+- // for stages < 3 vectors are too small!.
+- llr_odd_stages(llrs, stage, loop_stage + 1,frame_size, row);
++ dst = _mm256_polar_minsum_llrs(src0, src1);
++
++ _mm256_storeu_ps(dst_llr_ptr, dst);
++ dst_llr_ptr += 8;
++ }
++
++ --loop_stage;
++ stage_size >>= 1;
++ }
++
++ // for stages < 3 vectors are too small!.
++ llr_odd_stages(llrs, stage, loop_stage + 1, frame_size, row);
+ }
+
+ #endif /* LV_HAVE_AVX */
+@@ -307,99 +311,99 @@ volk_32f_8u_polarbutterfly_32f_u_avx(float* llrs, unsigned char* u,
+ #include <immintrin.h>
+ #include <volk/volk_avx2_intrinsics.h>
+
+-static inline void
+-volk_32f_8u_polarbutterfly_32f_u_avx2(float* llrs, unsigned char* u,
+- const int frame_exp,
+- const int stage, const int u_num, const int row)
++static inline void volk_32f_8u_polarbutterfly_32f_u_avx2(float* llrs,
++ unsigned char* u,
++ const int frame_exp,
++ const int stage,
++ const int u_num,
++ const int row)
+ {
+- const int frame_size = 0x01 << frame_exp;
+- if(row % 2){ // for odd rows just do the only necessary calculation and return.
+- const float* next_llrs = llrs + frame_size + row;
+- *(llrs + row) = llr_even(*(next_llrs - 1), *next_llrs, u[u_num - 1]);
+- return;
+- }
+-
+- const int max_stage_depth = calculate_max_stage_depth_for_row(frame_exp, row);
+- if(max_stage_depth < 3){ // vectorized version needs larger vectors.
+- volk_32f_8u_polarbutterfly_32f_generic(llrs, u, frame_exp, stage, u_num, row);
+- return;
+- }
+-
+- int loop_stage = max_stage_depth;
+- int stage_size = 0x01 << loop_stage;
+-
+- float* src_llr_ptr;
+- float* dst_llr_ptr;
+-
+- __m256 src0, src1, dst;
+-
+- if(row){ // not necessary for ZERO row. == first bit to be decoded.
+- // first do bit combination for all stages
+- // effectively encode some decoded bits again.
+- unsigned char* u_target = u + frame_size;
+- unsigned char* u_temp = u + 2* frame_size;
+- memcpy(u_temp, u + u_num - stage_size, sizeof(unsigned char) * stage_size);
+-
+- if(stage_size > 15){
+- _mm256_zeroupper();
+- volk_8u_x2_encodeframepolar_8u_u_ssse3(u_target, u_temp, stage_size);
++ const int frame_size = 0x01 << frame_exp;
++ if (row % 2) { // for odd rows just do the only necessary calculation and return.
++ const float* next_llrs = llrs + frame_size + row;
++ *(llrs + row) = llr_even(*(next_llrs - 1), *next_llrs, u[u_num - 1]);
++ return;
+ }
+- else{
+- volk_8u_x2_encodeframepolar_8u_generic(u_target, u_temp, stage_size);
++
++ const int max_stage_depth = calculate_max_stage_depth_for_row(frame_exp, row);
++ if (max_stage_depth < 3) { // vectorized version needs larger vectors.
++ volk_32f_8u_polarbutterfly_32f_generic(llrs, u, frame_exp, stage, u_num, row);
++ return;
+ }
+
+- src_llr_ptr = llrs + (max_stage_depth + 1) * frame_size + row - stage_size;
+- dst_llr_ptr = llrs + max_stage_depth * frame_size + row;
++ int loop_stage = max_stage_depth;
++ int stage_size = 0x01 << loop_stage;
+
+- __m128i fbits;
++ float* src_llr_ptr;
++ float* dst_llr_ptr;
+
+- int p;
+- for(p = 0; p < stage_size; p += 8){
+- _mm256_zeroupper();
+- fbits = _mm_loadu_si128((__m128i*) u_target);
+- u_target += 8;
++ __m256 src0, src1, dst;
+
+- src0 = _mm256_loadu_ps(src_llr_ptr);
+- src1 = _mm256_loadu_ps(src_llr_ptr + 8);
+- src_llr_ptr += 16;
++ if (row) { // not necessary for ZERO row. == first bit to be decoded.
++ // first do bit combination for all stages
++ // effectively encode some decoded bits again.
++ unsigned char* u_target = u + frame_size;
++ unsigned char* u_temp = u + 2 * frame_size;
++ memcpy(u_temp, u + u_num - stage_size, sizeof(unsigned char) * stage_size);
+
+- dst = _mm256_polar_fsign_add_llrs_avx2(src0, src1, fbits);
++ if (stage_size > 15) {
++ _mm256_zeroupper();
++ volk_8u_x2_encodeframepolar_8u_u_ssse3(u_target, u_temp, stage_size);
++ } else {
++ volk_8u_x2_encodeframepolar_8u_generic(u_target, u_temp, stage_size);
++ }
+
+- _mm256_storeu_ps(dst_llr_ptr, dst);
+- dst_llr_ptr += 8;
+- }
++ src_llr_ptr = llrs + (max_stage_depth + 1) * frame_size + row - stage_size;
++ dst_llr_ptr = llrs + max_stage_depth * frame_size + row;
+
+- --loop_stage;
+- stage_size >>= 1;
+- }
++ __m128i fbits;
+
+- const int min_stage = stage > 2 ? stage : 2;
++ int p;
++ for (p = 0; p < stage_size; p += 8) {
++ _mm256_zeroupper();
++ fbits = _mm_loadu_si128((__m128i*)u_target);
++ u_target += 8;
+
+- _mm256_zeroall(); // Important to clear cache!
++ src0 = _mm256_loadu_ps(src_llr_ptr);
++ src1 = _mm256_loadu_ps(src_llr_ptr + 8);
++ src_llr_ptr += 16;
+
+- int el;
+- while(min_stage < loop_stage){
+- dst_llr_ptr = llrs + loop_stage * frame_size + row;
+- src_llr_ptr = dst_llr_ptr + frame_size;
+- for(el = 0; el < stage_size; el += 8){
+- src0 = _mm256_loadu_ps(src_llr_ptr);
+- src_llr_ptr += 8;
+- src1 = _mm256_loadu_ps(src_llr_ptr);
+- src_llr_ptr += 8;
++ dst = _mm256_polar_fsign_add_llrs_avx2(src0, src1, fbits);
+
+- dst = _mm256_polar_minsum_llrs(src0, src1);
++ _mm256_storeu_ps(dst_llr_ptr, dst);
++ dst_llr_ptr += 8;
++ }
+
+- _mm256_storeu_ps(dst_llr_ptr, dst);
+- dst_llr_ptr += 8;
++ --loop_stage;
++ stage_size >>= 1;
+ }
+
+- --loop_stage;
+- stage_size >>= 1;
++ const int min_stage = stage > 2 ? stage : 2;
++
++ _mm256_zeroall(); // Important to clear cache!
++
++ int el;
++ while (min_stage < loop_stage) {
++ dst_llr_ptr = llrs + loop_stage * frame_size + row;
++ src_llr_ptr = dst_llr_ptr + frame_size;
++ for (el = 0; el < stage_size; el += 8) {
++ src0 = _mm256_loadu_ps(src_llr_ptr);
++ src_llr_ptr += 8;
++ src1 = _mm256_loadu_ps(src_llr_ptr);
++ src_llr_ptr += 8;
+
+- }
++ dst = _mm256_polar_minsum_llrs(src0, src1);
++
++ _mm256_storeu_ps(dst_llr_ptr, dst);
++ dst_llr_ptr += 8;
++ }
++
++ --loop_stage;
++ stage_size >>= 1;
++ }
+
+- // for stages < 3 vectors are too small!.
+- llr_odd_stages(llrs, stage, loop_stage + 1,frame_size, row);
++ // for stages < 3 vectors are too small!.
++ llr_odd_stages(llrs, stage, loop_stage + 1, frame_size, row);
+ }
+
+ #endif /* LV_HAVE_AVX2 */
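+
+The scalar helpers at the top of this file are the reference for what the AVX
+paths vectorize: llr_odd is the min-sum f-update and llr_even the g-update
+selected by the partial-sum bit. A brief stand-alone check with hypothetical
+values (the intrinsics _mm256_polar_minsum_llrs and _mm256_polar_fsign_add_llrs
+used above compute eight of these at once):
+
+#include <math.h>
+#include <stdio.h>
+
+/* Same arithmetic as llr_odd above: sign(la) * sign(lb) * min(|la|, |lb|). */
+static float llr_odd_ref(float la, float lb)
+{
+    const float ala = fabsf(la);
+    const float alb = fabsf(lb);
+    return copysignf(1.0f, la) * copysignf(1.0f, lb) * (ala > alb ? alb : ala);
+}
+
+/* Same arithmetic as llr_even above: lb + la if f == 0, lb - la otherwise. */
+static float llr_even_ref(float la, float lb, unsigned char f)
+{
+    return f ? lb - la : lb + la;
+}
+
+int main(void)
+{
+    printf("odd(-1.5, 0.5)     = %+.1f\n", llr_odd_ref(-1.5f, 0.5f));     /* -0.5 */
+    printf("even(-1.5, 0.5, 0) = %+.1f\n", llr_even_ref(-1.5f, 0.5f, 0)); /* -1.0 */
+    printf("even(-1.5, 0.5, 1) = %+.1f\n", llr_even_ref(-1.5f, 0.5f, 1)); /* +2.0 */
+    return 0;
+}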
+diff --git a/kernels/volk/volk_32f_8u_polarbutterflypuppet_32f.h b/kernels/volk/volk_32f_8u_polarbutterflypuppet_32f.h
+index fa40a86..6f97dd1 100644
+--- a/kernels/volk/volk_32f_8u_polarbutterflypuppet_32f.h
++++ b/kernels/volk/volk_32f_8u_polarbutterflypuppet_32f.h
+@@ -33,124 +33,129 @@
+ #include <volk/volk_8u_x3_encodepolarpuppet_8u.h>
+
+
+-static inline void
+-sanitize_bytes(unsigned char* u, const int elements)
++static inline void sanitize_bytes(unsigned char* u, const int elements)
+ {
+- int i;
+- unsigned char* u_ptr = u;
+- for(i = 0; i < elements; i++){
+- *u_ptr = (*u_ptr & 0x01);
+- u_ptr++;
+- }
++ int i;
++ unsigned char* u_ptr = u;
++ for (i = 0; i < elements; i++) {
++ *u_ptr = (*u_ptr & 0x01);
++ u_ptr++;
++ }
+ }
+
+-static inline void
+-clean_up_intermediate_values(float* llrs, unsigned char* u, const int frame_size, const int elements)
++static inline void clean_up_intermediate_values(float* llrs,
++ unsigned char* u,
++ const int frame_size,
++ const int elements)
+ {
+- memset(u + frame_size, 0, sizeof(unsigned char) * (elements - frame_size));
+- memset(llrs + frame_size, 0, sizeof(float) * (elements - frame_size));
++ memset(u + frame_size, 0, sizeof(unsigned char) * (elements - frame_size));
++ memset(llrs + frame_size, 0, sizeof(float) * (elements - frame_size));
+ }
+
+ static inline void
+ generate_error_free_input_vector(float* llrs, unsigned char* u, const int frame_size)
+ {
+- memset(u, 0, frame_size);
+- unsigned char* target = u + frame_size;
+- volk_8u_x2_encodeframepolar_8u_generic(target, u + 2 * frame_size, frame_size);
+- float* ft = llrs;
+- int i;
+- for(i = 0; i < frame_size; i++){
+- *ft = (-2 * ((float) *target++)) + 1.0f;
+- ft++;
+- }
++ memset(u, 0, frame_size);
++ unsigned char* target = u + frame_size;
++ volk_8u_x2_encodeframepolar_8u_generic(target, u + 2 * frame_size, frame_size);
++ float* ft = llrs;
++ int i;
++ for (i = 0; i < frame_size; i++) {
++ *ft = (-2 * ((float)*target++)) + 1.0f;
++ ft++;
++ }
+ }
+
+ static inline void
+ print_llr_tree(const float* llrs, const int frame_size, const int frame_exp)
+ {
+- int s, e;
+- for(s = 0; s < frame_size; s++){
+- for(e = 0; e < frame_exp + 1; e++){
+- printf("%+4.2f ", llrs[e * frame_size + s]);
+- }
+- printf("\n");
+- if((s + 1) % 8 == 0){
+- printf("\n");
++ int s, e;
++ for (s = 0; s < frame_size; s++) {
++ for (e = 0; e < frame_exp + 1; e++) {
++ printf("%+4.2f ", llrs[e * frame_size + s]);
++ }
++ printf("\n");
++ if ((s + 1) % 8 == 0) {
++ printf("\n");
++ }
+ }
+- }
+ }
+
+-static inline int
+-maximum_frame_size(const int elements)
++static inline int maximum_frame_size(const int elements)
+ {
+- unsigned int frame_size = next_lower_power_of_two(elements);
+- unsigned int frame_exp = log2_of_power_of_2(frame_size);
+- return next_lower_power_of_two(frame_size / frame_exp);
++ unsigned int frame_size = next_lower_power_of_two(elements);
++ unsigned int frame_exp = log2_of_power_of_2(frame_size);
++ return next_lower_power_of_two(frame_size / frame_exp);
+ }
+
+ #ifdef LV_HAVE_GENERIC
+-static inline void
+-volk_32f_8u_polarbutterflypuppet_32f_generic(float* llrs, const float* input, unsigned char* u, const int elements)
++static inline void volk_32f_8u_polarbutterflypuppet_32f_generic(float* llrs,
++ const float* input,
++ unsigned char* u,
++ const int elements)
+ {
+- unsigned int frame_size = maximum_frame_size(elements);
+- unsigned int frame_exp = log2_of_power_of_2(frame_size);
++ unsigned int frame_size = maximum_frame_size(elements);
++ unsigned int frame_exp = log2_of_power_of_2(frame_size);
+
+- sanitize_bytes(u, elements);
+- clean_up_intermediate_values(llrs, u, frame_size, elements);
+- generate_error_free_input_vector(llrs + frame_exp * frame_size, u, frame_size);
++ sanitize_bytes(u, elements);
++ clean_up_intermediate_values(llrs, u, frame_size, elements);
++ generate_error_free_input_vector(llrs + frame_exp * frame_size, u, frame_size);
+
+- unsigned int u_num = 0;
+- for(; u_num < frame_size; u_num++){
+- volk_32f_8u_polarbutterfly_32f_generic(llrs, u, frame_exp, 0, u_num, u_num);
+- u[u_num] = llrs[u_num] > 0 ? 0 : 1;
+- }
++ unsigned int u_num = 0;
++ for (; u_num < frame_size; u_num++) {
++ volk_32f_8u_polarbutterfly_32f_generic(llrs, u, frame_exp, 0, u_num, u_num);
++ u[u_num] = llrs[u_num] > 0 ? 0 : 1;
++ }
+
+- clean_up_intermediate_values(llrs, u, frame_size, elements);
++ clean_up_intermediate_values(llrs, u, frame_size, elements);
+ }
+ #endif /* LV_HAVE_GENERIC */
+
+ #ifdef LV_HAVE_AVX
+-static inline void
+-volk_32f_8u_polarbutterflypuppet_32f_u_avx(float* llrs, const float* input, unsigned char* u, const int elements)
++static inline void volk_32f_8u_polarbutterflypuppet_32f_u_avx(float* llrs,
++ const float* input,
++ unsigned char* u,
++ const int elements)
+ {
+- unsigned int frame_size = maximum_frame_size(elements);
+- unsigned int frame_exp = log2_of_power_of_2(frame_size);
++ unsigned int frame_size = maximum_frame_size(elements);
++ unsigned int frame_exp = log2_of_power_of_2(frame_size);
+
+- sanitize_bytes(u, elements);
+- clean_up_intermediate_values(llrs, u, frame_size, elements);
+- generate_error_free_input_vector(llrs + frame_exp * frame_size, u, frame_size);
++ sanitize_bytes(u, elements);
++ clean_up_intermediate_values(llrs, u, frame_size, elements);
++ generate_error_free_input_vector(llrs + frame_exp * frame_size, u, frame_size);
+
+- unsigned int u_num = 0;
+- for(; u_num < frame_size; u_num++){
+- volk_32f_8u_polarbutterfly_32f_u_avx(llrs, u, frame_exp, 0, u_num, u_num);
+- u[u_num] = llrs[u_num] > 0 ? 0 : 1;
+- }
++ unsigned int u_num = 0;
++ for (; u_num < frame_size; u_num++) {
++ volk_32f_8u_polarbutterfly_32f_u_avx(llrs, u, frame_exp, 0, u_num, u_num);
++ u[u_num] = llrs[u_num] > 0 ? 0 : 1;
++ }
+
+- clean_up_intermediate_values(llrs, u, frame_size, elements);
++ clean_up_intermediate_values(llrs, u, frame_size, elements);
+ }
+ #endif /* LV_HAVE_AVX */
+
+ #ifdef LV_HAVE_AVX2
+-static inline void
+-volk_32f_8u_polarbutterflypuppet_32f_u_avx2(float* llrs, const float* input, unsigned char* u, const int elements)
++static inline void volk_32f_8u_polarbutterflypuppet_32f_u_avx2(float* llrs,
++ const float* input,
++ unsigned char* u,
++ const int elements)
+ {
+- unsigned int frame_size = maximum_frame_size(elements);
+- unsigned int frame_exp = log2_of_power_of_2(frame_size);
++ unsigned int frame_size = maximum_frame_size(elements);
++ unsigned int frame_exp = log2_of_power_of_2(frame_size);
+
+- sanitize_bytes(u, elements);
+- clean_up_intermediate_values(llrs, u, frame_size, elements);
+- generate_error_free_input_vector(llrs + frame_exp * frame_size, u, frame_size);
++ sanitize_bytes(u, elements);
++ clean_up_intermediate_values(llrs, u, frame_size, elements);
++ generate_error_free_input_vector(llrs + frame_exp * frame_size, u, frame_size);
+
+- unsigned int u_num = 0;
+- for(; u_num < frame_size; u_num++){
+- volk_32f_8u_polarbutterfly_32f_u_avx2(llrs, u, frame_exp, 0, u_num, u_num);
+- u[u_num] = llrs[u_num] > 0 ? 0 : 1;
+- }
++ unsigned int u_num = 0;
++ for (; u_num < frame_size; u_num++) {
++ volk_32f_8u_polarbutterfly_32f_u_avx2(llrs, u, frame_exp, 0, u_num, u_num);
++ u[u_num] = llrs[u_num] > 0 ? 0 : 1;
++ }
+
+- clean_up_intermediate_values(llrs, u, frame_size, elements);
++ clean_up_intermediate_values(llrs, u, frame_size, elements);
+ }
+ #endif /* LV_HAVE_AVX2 */
+
+
+-
+ #endif /* VOLK_KERNELS_VOLK_VOLK_32F_8U_POLARBUTTERFLYPUPPET_32F_H_ */
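+
+The puppet wraps the butterfly kernel for the test harness; the two conventions
+it relies on are visible above: generate_error_free_input_vector maps an encoded
+bit b to the LLR -2*b + 1 (so 0 -> +1.0 and 1 -> -1.0), and decoding applies the
+hard decision llr > 0 ? 0 : 1. A tiny stand-alone illustration of that round
+trip (illustrative values only):
+
+#include <stdio.h>
+
+int main(void)
+{
+    const unsigned char bits[4] = { 0, 1, 1, 0 };
+    float llrs[4];
+    unsigned char decided[4];
+    int i;
+
+    for (i = 0; i < 4; i++) {
+        llrs[i] = -2.0f * (float)bits[i] + 1.0f; /* noiseless BPSK-style LLR */
+    }
+    for (i = 0; i < 4; i++) {
+        decided[i] = llrs[i] > 0 ? 0 : 1; /* same rule as u[u_num] above */
+    }
+    for (i = 0; i < 4; i++) {
+        printf("bit %u -> llr %+.1f -> decision %u\n", bits[i], llrs[i], decided[i]);
+    }
+    return 0;
+}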
+diff --git a/kernels/volk/volk_32f_accumulator_s32f.h b/kernels/volk/volk_32f_accumulator_s32f.h
+index f6219c8..9a78f58 100644
+--- a/kernels/volk/volk_32f_accumulator_s32f.h
++++ b/kernels/volk/volk_32f_accumulator_s32f.h
+@@ -29,8 +29,8 @@
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+- * void volk_32f_accumulator_s32f(float* result, const float* inputBuffer, unsigned int num_points)
+- * \endcode
++ * void volk_32f_accumulator_s32f(float* result, const float* inputBuffer, unsigned int
++ * num_points) \endcode
+ *
+ * \b Inputs
+ * \li inputBuffer The buffer of data to be accumulated
+@@ -63,47 +63,48 @@
+ #ifndef INCLUDED_volk_32f_accumulator_s32f_a_H
+ #define INCLUDED_volk_32f_accumulator_s32f_a_H
+
+-#include <volk/volk_common.h>
+ #include <inttypes.h>
++#include <volk/volk_common.h>
+
+ #ifdef LV_HAVE_AVX
+ #include <immintrin.h>
+
+-static inline void
+-volk_32f_accumulator_s32f_a_avx(float* result, const float* inputBuffer, unsigned int num_points)
++static inline void volk_32f_accumulator_s32f_a_avx(float* result,
++ const float* inputBuffer,
++ unsigned int num_points)
+ {
+- float returnValue = 0;
+- unsigned int number = 0;
+- const unsigned int eighthPoints = num_points / 8;
+-
+- const float* aPtr = inputBuffer;
+- __VOLK_ATTR_ALIGNED(32) float tempBuffer[8];
+-
+- __m256 accumulator = _mm256_setzero_ps();
+- __m256 aVal = _mm256_setzero_ps();
+-
+- for(;number < eighthPoints; number++){
+- aVal = _mm256_load_ps(aPtr);
+- accumulator = _mm256_add_ps(accumulator, aVal);
+- aPtr += 8;
+- }
+-
+- _mm256_store_ps(tempBuffer, accumulator);
+-
+- returnValue = tempBuffer[0];
+- returnValue += tempBuffer[1];
+- returnValue += tempBuffer[2];
+- returnValue += tempBuffer[3];
+- returnValue += tempBuffer[4];
+- returnValue += tempBuffer[5];
+- returnValue += tempBuffer[6];
+- returnValue += tempBuffer[7];
+-
+- number = eighthPoints * 8;
+- for(;number < num_points; number++){
+- returnValue += (*aPtr++);
+- }
+- *result = returnValue;
++ float returnValue = 0;
++ unsigned int number = 0;
++ const unsigned int eighthPoints = num_points / 8;
++
++ const float* aPtr = inputBuffer;
++ __VOLK_ATTR_ALIGNED(32) float tempBuffer[8];
++
++ __m256 accumulator = _mm256_setzero_ps();
++ __m256 aVal = _mm256_setzero_ps();
++
++ for (; number < eighthPoints; number++) {
++ aVal = _mm256_load_ps(aPtr);
++ accumulator = _mm256_add_ps(accumulator, aVal);
++ aPtr += 8;
++ }
++
++ _mm256_store_ps(tempBuffer, accumulator);
++
++ returnValue = tempBuffer[0];
++ returnValue += tempBuffer[1];
++ returnValue += tempBuffer[2];
++ returnValue += tempBuffer[3];
++ returnValue += tempBuffer[4];
++ returnValue += tempBuffer[5];
++ returnValue += tempBuffer[6];
++ returnValue += tempBuffer[7];
++
++ number = eighthPoints * 8;
++ for (; number < num_points; number++) {
++ returnValue += (*aPtr++);
++ }
++ *result = returnValue;
+ }
+ #endif /* LV_HAVE_AVX */
+
+@@ -111,41 +112,42 @@ volk_32f_accumulator_s32f_a_avx(float* result, const float* inputBuffer, unsigne
+ #ifdef LV_HAVE_AVX
+ #include <immintrin.h>
+
+-static inline void
+-volk_32f_accumulator_s32f_u_avx(float* result, const float* inputBuffer, unsigned int num_points)
++static inline void volk_32f_accumulator_s32f_u_avx(float* result,
++ const float* inputBuffer,
++ unsigned int num_points)
+ {
+- float returnValue = 0;
+- unsigned int number = 0;
+- const unsigned int eighthPoints = num_points / 8;
+-
+- const float* aPtr = inputBuffer;
+- __VOLK_ATTR_ALIGNED(32) float tempBuffer[8];
+-
+- __m256 accumulator = _mm256_setzero_ps();
+- __m256 aVal = _mm256_setzero_ps();
+-
+- for(;number < eighthPoints; number++){
+- aVal = _mm256_loadu_ps(aPtr);
+- accumulator = _mm256_add_ps(accumulator, aVal);
+- aPtr += 8;
+- }
+-
+- _mm256_store_ps(tempBuffer, accumulator);
+-
+- returnValue = tempBuffer[0];
+- returnValue += tempBuffer[1];
+- returnValue += tempBuffer[2];
+- returnValue += tempBuffer[3];
+- returnValue += tempBuffer[4];
+- returnValue += tempBuffer[5];
+- returnValue += tempBuffer[6];
+- returnValue += tempBuffer[7];
+-
+- number = eighthPoints * 8;
+- for(;number < num_points; number++){
+- returnValue += (*aPtr++);
+- }
+- *result = returnValue;
++ float returnValue = 0;
++ unsigned int number = 0;
++ const unsigned int eighthPoints = num_points / 8;
++
++ const float* aPtr = inputBuffer;
++ __VOLK_ATTR_ALIGNED(32) float tempBuffer[8];
++
++ __m256 accumulator = _mm256_setzero_ps();
++ __m256 aVal = _mm256_setzero_ps();
++
++ for (; number < eighthPoints; number++) {
++ aVal = _mm256_loadu_ps(aPtr);
++ accumulator = _mm256_add_ps(accumulator, aVal);
++ aPtr += 8;
++ }
++
++ _mm256_store_ps(tempBuffer, accumulator);
++
++ returnValue = tempBuffer[0];
++ returnValue += tempBuffer[1];
++ returnValue += tempBuffer[2];
++ returnValue += tempBuffer[3];
++ returnValue += tempBuffer[4];
++ returnValue += tempBuffer[5];
++ returnValue += tempBuffer[6];
++ returnValue += tempBuffer[7];
++
++ number = eighthPoints * 8;
++ for (; number < num_points; number++) {
++ returnValue += (*aPtr++);
++ }
++ *result = returnValue;
+ }
+ #endif /* LV_HAVE_AVX */
+
+@@ -153,37 +155,38 @@ volk_32f_accumulator_s32f_u_avx(float* result, const float* inputBuffer, unsigne
+ #ifdef LV_HAVE_SSE
+ #include <xmmintrin.h>
+
+-static inline void
+-volk_32f_accumulator_s32f_a_sse(float* result, const float* inputBuffer, unsigned int num_points)
++static inline void volk_32f_accumulator_s32f_a_sse(float* result,
++ const float* inputBuffer,
++ unsigned int num_points)
+ {
+- float returnValue = 0;
+- unsigned int number = 0;
+- const unsigned int quarterPoints = num_points / 4;
+-
+- const float* aPtr = inputBuffer;
+- __VOLK_ATTR_ALIGNED(16) float tempBuffer[4];
+-
+- __m128 accumulator = _mm_setzero_ps();
+- __m128 aVal = _mm_setzero_ps();
+-
+- for(;number < quarterPoints; number++){
+- aVal = _mm_load_ps(aPtr);
+- accumulator = _mm_add_ps(accumulator, aVal);
+- aPtr += 4;
+- }
+-
+- _mm_store_ps(tempBuffer,accumulator);
+-
+- returnValue = tempBuffer[0];
+- returnValue += tempBuffer[1];
+- returnValue += tempBuffer[2];
+- returnValue += tempBuffer[3];
+-
+- number = quarterPoints * 4;
+- for(;number < num_points; number++){
+- returnValue += (*aPtr++);
+- }
+- *result = returnValue;
++ float returnValue = 0;
++ unsigned int number = 0;
++ const unsigned int quarterPoints = num_points / 4;
++
++ const float* aPtr = inputBuffer;
++ __VOLK_ATTR_ALIGNED(16) float tempBuffer[4];
++
++ __m128 accumulator = _mm_setzero_ps();
++ __m128 aVal = _mm_setzero_ps();
++
++ for (; number < quarterPoints; number++) {
++ aVal = _mm_load_ps(aPtr);
++ accumulator = _mm_add_ps(accumulator, aVal);
++ aPtr += 4;
++ }
++
++ _mm_store_ps(tempBuffer, accumulator);
++
++ returnValue = tempBuffer[0];
++ returnValue += tempBuffer[1];
++ returnValue += tempBuffer[2];
++ returnValue += tempBuffer[3];
++
++ number = quarterPoints * 4;
++ for (; number < num_points; number++) {
++ returnValue += (*aPtr++);
++ }
++ *result = returnValue;
+ }
+ #endif /* LV_HAVE_SSE */
+
+@@ -191,52 +194,54 @@ volk_32f_accumulator_s32f_a_sse(float* result, const float* inputBuffer, unsigne
+ #ifdef LV_HAVE_SSE
+ #include <xmmintrin.h>
+
+-static inline void
+-volk_32f_accumulator_s32f_u_sse(float* result, const float* inputBuffer, unsigned int num_points)
++static inline void volk_32f_accumulator_s32f_u_sse(float* result,
++ const float* inputBuffer,
++ unsigned int num_points)
+ {
+- float returnValue = 0;
+- unsigned int number = 0;
+- const unsigned int quarterPoints = num_points / 4;
+-
+- const float* aPtr = inputBuffer;
+- __VOLK_ATTR_ALIGNED(16) float tempBuffer[4];
+-
+- __m128 accumulator = _mm_setzero_ps();
+- __m128 aVal = _mm_setzero_ps();
+-
+- for(;number < quarterPoints; number++){
+- aVal = _mm_load_ps(aPtr);
+- accumulator = _mm_add_ps(accumulator, aVal);
+- aPtr += 4;
+- }
+-
+- _mm_store_ps(tempBuffer,accumulator);
+-
+- returnValue = tempBuffer[0];
+- returnValue += tempBuffer[1];
+- returnValue += tempBuffer[2];
+- returnValue += tempBuffer[3];
+-
+- number = quarterPoints * 4;
+- for(;number < num_points; number++){
+- returnValue += (*aPtr++);
+- }
+- *result = returnValue;
++ float returnValue = 0;
++ unsigned int number = 0;
++ const unsigned int quarterPoints = num_points / 4;
++
++ const float* aPtr = inputBuffer;
++ __VOLK_ATTR_ALIGNED(16) float tempBuffer[4];
++
++ __m128 accumulator = _mm_setzero_ps();
++ __m128 aVal = _mm_setzero_ps();
++
++ for (; number < quarterPoints; number++) {
++ aVal = _mm_load_ps(aPtr);
++ accumulator = _mm_add_ps(accumulator, aVal);
++ aPtr += 4;
++ }
++
++ _mm_store_ps(tempBuffer, accumulator);
++
++ returnValue = tempBuffer[0];
++ returnValue += tempBuffer[1];
++ returnValue += tempBuffer[2];
++ returnValue += tempBuffer[3];
++
++ number = quarterPoints * 4;
++ for (; number < num_points; number++) {
++ returnValue += (*aPtr++);
++ }
++ *result = returnValue;
+ }
+ #endif /* LV_HAVE_SSE */
+
+ #ifdef LV_HAVE_GENERIC
+-static inline void
+-volk_32f_accumulator_s32f_generic(float* result, const float* inputBuffer, unsigned int num_points)
++static inline void volk_32f_accumulator_s32f_generic(float* result,
++ const float* inputBuffer,
++ unsigned int num_points)
+ {
+- const float* aPtr = inputBuffer;
+- unsigned int number = 0;
+- float returnValue = 0;
+-
+- for(;number < num_points; number++){
+- returnValue += (*aPtr++);
+- }
+- *result = returnValue;
++ const float* aPtr = inputBuffer;
++ unsigned int number = 0;
++ float returnValue = 0;
++
++ for (; number < num_points; number++) {
++ returnValue += (*aPtr++);
++ }
++ *result = returnValue;
+ }
+ #endif /* LV_HAVE_GENERIC */
+
+diff --git a/kernels/volk/volk_32f_acos_32f.h b/kernels/volk/volk_32f_acos_32f.h
+index 5c14c2f..92918ca 100644
+--- a/kernels/volk/volk_32f_acos_32f.h
++++ b/kernels/volk/volk_32f_acos_32f.h
+@@ -67,11 +67,12 @@
+ * \endcode
+ */
+
+-#include <stdio.h>
+-#include <math.h>
+ #include <inttypes.h>
++#include <math.h>
++#include <stdio.h>
+
+-/* This is the number of terms of Taylor series to evaluate, increase this for more accuracy*/
++/* This is the number of terms of Taylor series to evaluate, increase this for more
++ * accuracy*/
+ #define ACOS_TERMS 2
+
+ #ifndef INCLUDED_volk_32f_acos_32f_a_H
+@@ -80,62 +81,68 @@
+ #if LV_HAVE_AVX2 && LV_HAVE_FMA
+ #include <immintrin.h>
+
+-static inline void
+-volk_32f_acos_32f_a_avx2_fma(float* bVector, const float* aVector, unsigned int num_points)
++static inline void volk_32f_acos_32f_a_avx2_fma(float* bVector,
++ const float* aVector,
++ unsigned int num_points)
+ {
+- float* bPtr = bVector;
+- const float* aPtr = aVector;
+-
+- unsigned int number = 0;
+- unsigned int eighthPoints = num_points / 8;
+- int i, j;
+-
+- __m256 aVal, d, pi, pio2, x, y, z, arccosine;
+- __m256 fzeroes, fones, ftwos, ffours, condition;
+-
+- pi = _mm256_set1_ps(3.14159265358979323846);
+- pio2 = _mm256_set1_ps(3.14159265358979323846/2);
+- fzeroes = _mm256_setzero_ps();
+- fones = _mm256_set1_ps(1.0);
+- ftwos = _mm256_set1_ps(2.0);
+- ffours = _mm256_set1_ps(4.0);
+-
+- for(;number < eighthPoints; number++){
+- aVal = _mm256_load_ps(aPtr);
+- d = aVal;
+- aVal = _mm256_div_ps(_mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal), _mm256_sub_ps(fones, aVal))), aVal);
+- z = aVal;
+- condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
+- z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
+- condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
+- x = _mm256_add_ps(z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));
+-
+- for(i = 0; i < 2; i++)
+- x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_fmadd_ps(x, x,fones)));
+- x = _mm256_div_ps(fones, x);
+- y = fzeroes;
+- for(j = ACOS_TERMS - 1; j >=0 ; j--)
+- y = _mm256_fmadd_ps(y, _mm256_mul_ps(x, x), _mm256_set1_ps(pow(-1,j)/(2*j+1)));
+-
+- y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
+- condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);
+-
+- y = _mm256_add_ps(y, _mm256_and_ps(_mm256_fnmadd_ps(y, ftwos, pio2), condition));
+- arccosine = y;
+- condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
+- arccosine = _mm256_sub_ps(arccosine, _mm256_and_ps(_mm256_mul_ps(arccosine, ftwos), condition));
+- condition = _mm256_cmp_ps(d, fzeroes, _CMP_LT_OS);
+- arccosine = _mm256_add_ps(arccosine, _mm256_and_ps(pi, condition));
+-
+- _mm256_store_ps(bPtr, arccosine);
+- aPtr += 8;
+- bPtr += 8;
+- }
+-
+- number = eighthPoints * 8;
+- for(;number < num_points; number++){
+- *bPtr++ = acos(*aPtr++);
+- }
++ float* bPtr = bVector;
++ const float* aPtr = aVector;
++
++ unsigned int number = 0;
++ unsigned int eighthPoints = num_points / 8;
++ int i, j;
++
++ __m256 aVal, d, pi, pio2, x, y, z, arccosine;
++ __m256 fzeroes, fones, ftwos, ffours, condition;
++
++ pi = _mm256_set1_ps(3.14159265358979323846);
++ pio2 = _mm256_set1_ps(3.14159265358979323846 / 2);
++ fzeroes = _mm256_setzero_ps();
++ fones = _mm256_set1_ps(1.0);
++ ftwos = _mm256_set1_ps(2.0);
++ ffours = _mm256_set1_ps(4.0);
++
++ for (; number < eighthPoints; number++) {
++ aVal = _mm256_load_ps(aPtr);
++ d = aVal;
++ aVal = _mm256_div_ps(_mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal),
++ _mm256_sub_ps(fones, aVal))),
++ aVal);
++ z = aVal;
++ condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
++ z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
++ condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
++ x = _mm256_add_ps(
++ z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));
++
++ for (i = 0; i < 2; i++)
++ x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_fmadd_ps(x, x, fones)));
++ x = _mm256_div_ps(fones, x);
++ y = fzeroes;
++ for (j = ACOS_TERMS - 1; j >= 0; j--)
++ y = _mm256_fmadd_ps(
++ y, _mm256_mul_ps(x, x), _mm256_set1_ps(pow(-1, j) / (2 * j + 1)));
++
++ y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
++ condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);
++
++ y = _mm256_add_ps(y, _mm256_and_ps(_mm256_fnmadd_ps(y, ftwos, pio2), condition));
++ arccosine = y;
++ condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
++ arccosine = _mm256_sub_ps(
++ arccosine, _mm256_and_ps(_mm256_mul_ps(arccosine, ftwos), condition));
++ condition = _mm256_cmp_ps(d, fzeroes, _CMP_LT_OS);
++ arccosine = _mm256_add_ps(arccosine, _mm256_and_ps(pi, condition));
++
++ _mm256_store_ps(bPtr, arccosine);
++ aPtr += 8;
++ bPtr += 8;
++ }
++
++ number = eighthPoints * 8;
++ for (; number < num_points; number++) {
++ *bPtr++ = acos(*aPtr++);
++ }
+ }
+
+ #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for aligned */
+@@ -147,59 +154,66 @@ volk_32f_acos_32f_a_avx2_fma(float* bVector, const float* aVector, unsigned int
+ static inline void
+ volk_32f_acos_32f_a_avx(float* bVector, const float* aVector, unsigned int num_points)
+ {
+- float* bPtr = bVector;
+- const float* aPtr = aVector;
+-
+- unsigned int number = 0;
+- unsigned int eighthPoints = num_points / 8;
+- int i, j;
+-
+- __m256 aVal, d, pi, pio2, x, y, z, arccosine;
+- __m256 fzeroes, fones, ftwos, ffours, condition;
+-
+- pi = _mm256_set1_ps(3.14159265358979323846);
+- pio2 = _mm256_set1_ps(3.14159265358979323846/2);
+- fzeroes = _mm256_setzero_ps();
+- fones = _mm256_set1_ps(1.0);
+- ftwos = _mm256_set1_ps(2.0);
+- ffours = _mm256_set1_ps(4.0);
+-
+- for(;number < eighthPoints; number++){
+- aVal = _mm256_load_ps(aPtr);
+- d = aVal;
+- aVal = _mm256_div_ps(_mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal), _mm256_sub_ps(fones, aVal))), aVal);
+- z = aVal;
+- condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
+- z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
+- condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
+- x = _mm256_add_ps(z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));
+-
+- for(i = 0; i < 2; i++)
+- x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_add_ps(fones, _mm256_mul_ps(x, x))));
+- x = _mm256_div_ps(fones, x);
+- y = fzeroes;
+- for(j = ACOS_TERMS - 1; j >=0 ; j--)
+- y = _mm256_add_ps(_mm256_mul_ps(y, _mm256_mul_ps(x, x)), _mm256_set1_ps(pow(-1,j)/(2*j+1)));
+-
+- y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
+- condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);
+-
+- y = _mm256_add_ps(y, _mm256_and_ps(_mm256_sub_ps(pio2, _mm256_mul_ps(y, ftwos)), condition));
+- arccosine = y;
+- condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
+- arccosine = _mm256_sub_ps(arccosine, _mm256_and_ps(_mm256_mul_ps(arccosine, ftwos), condition));
+- condition = _mm256_cmp_ps(d, fzeroes, _CMP_LT_OS);
+- arccosine = _mm256_add_ps(arccosine, _mm256_and_ps(pi, condition));
+-
+- _mm256_store_ps(bPtr, arccosine);
+- aPtr += 8;
+- bPtr += 8;
+- }
+-
+- number = eighthPoints * 8;
+- for(;number < num_points; number++){
+- *bPtr++ = acos(*aPtr++);
+- }
++ float* bPtr = bVector;
++ const float* aPtr = aVector;
++
++ unsigned int number = 0;
++ unsigned int eighthPoints = num_points / 8;
++ int i, j;
++
++ __m256 aVal, d, pi, pio2, x, y, z, arccosine;
++ __m256 fzeroes, fones, ftwos, ffours, condition;
++
++ pi = _mm256_set1_ps(3.14159265358979323846);
++ pio2 = _mm256_set1_ps(3.14159265358979323846 / 2);
++ fzeroes = _mm256_setzero_ps();
++ fones = _mm256_set1_ps(1.0);
++ ftwos = _mm256_set1_ps(2.0);
++ ffours = _mm256_set1_ps(4.0);
++
++ for (; number < eighthPoints; number++) {
++ aVal = _mm256_load_ps(aPtr);
++ d = aVal;
++ aVal = _mm256_div_ps(_mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal),
++ _mm256_sub_ps(fones, aVal))),
++ aVal);
++ z = aVal;
++ condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
++ z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
++ condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
++ x = _mm256_add_ps(
++ z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));
++
++ for (i = 0; i < 2; i++)
++ x = _mm256_add_ps(x,
++ _mm256_sqrt_ps(_mm256_add_ps(fones, _mm256_mul_ps(x, x))));
++ x = _mm256_div_ps(fones, x);
++ y = fzeroes;
++ for (j = ACOS_TERMS - 1; j >= 0; j--)
++ y = _mm256_add_ps(_mm256_mul_ps(y, _mm256_mul_ps(x, x)),
++ _mm256_set1_ps(pow(-1, j) / (2 * j + 1)));
++
++ y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
++ condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);
++
++ y = _mm256_add_ps(
++ y, _mm256_and_ps(_mm256_sub_ps(pio2, _mm256_mul_ps(y, ftwos)), condition));
++ arccosine = y;
++ condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
++ arccosine = _mm256_sub_ps(
++ arccosine, _mm256_and_ps(_mm256_mul_ps(arccosine, ftwos), condition));
++ condition = _mm256_cmp_ps(d, fzeroes, _CMP_LT_OS);
++ arccosine = _mm256_add_ps(arccosine, _mm256_and_ps(pi, condition));
++
++ _mm256_store_ps(bPtr, arccosine);
++ aPtr += 8;
++ bPtr += 8;
++ }
++
++ number = eighthPoints * 8;
++ for (; number < num_points; number++) {
++ *bPtr++ = acos(*aPtr++);
++ }
+ }
+
+ #endif /* LV_HAVE_AVX2 for aligned */
+@@ -210,59 +224,63 @@ volk_32f_acos_32f_a_avx(float* bVector, const float* aVector, unsigned int num_p
+ static inline void
+ volk_32f_acos_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
+ {
+- float* bPtr = bVector;
+- const float* aPtr = aVector;
+-
+- unsigned int number = 0;
+- unsigned int quarterPoints = num_points / 4;
+- int i, j;
+-
+- __m128 aVal, d, pi, pio2, x, y, z, arccosine;
+- __m128 fzeroes, fones, ftwos, ffours, condition;
+-
+- pi = _mm_set1_ps(3.14159265358979323846);
+- pio2 = _mm_set1_ps(3.14159265358979323846/2);
+- fzeroes = _mm_setzero_ps();
+- fones = _mm_set1_ps(1.0);
+- ftwos = _mm_set1_ps(2.0);
+- ffours = _mm_set1_ps(4.0);
+-
+- for(;number < quarterPoints; number++){
+- aVal = _mm_load_ps(aPtr);
+- d = aVal;
+- aVal = _mm_div_ps(_mm_sqrt_ps(_mm_mul_ps(_mm_add_ps(fones, aVal), _mm_sub_ps(fones, aVal))), aVal);
+- z = aVal;
+- condition = _mm_cmplt_ps(z, fzeroes);
+- z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition));
+- condition = _mm_cmplt_ps(z, fones);
+- x = _mm_add_ps(z, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z), condition));
+-
+- for(i = 0; i < 2; i++)
+- x = _mm_add_ps(x, _mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x))));
+- x = _mm_div_ps(fones, x);
+- y = fzeroes;
+- for(j = ACOS_TERMS - 1; j >=0 ; j--)
+- y = _mm_add_ps(_mm_mul_ps(y, _mm_mul_ps(x, x)), _mm_set1_ps(pow(-1,j)/(2*j+1)));
+-
+- y = _mm_mul_ps(y, _mm_mul_ps(x, ffours));
+- condition = _mm_cmpgt_ps(z, fones);
+-
+- y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, ftwos)), condition));
+- arccosine = y;
+- condition = _mm_cmplt_ps(aVal, fzeroes);
+- arccosine = _mm_sub_ps(arccosine, _mm_and_ps(_mm_mul_ps(arccosine, ftwos), condition));
+- condition = _mm_cmplt_ps(d, fzeroes);
+- arccosine = _mm_add_ps(arccosine, _mm_and_ps(pi, condition));
+-
+- _mm_store_ps(bPtr, arccosine);
+- aPtr += 4;
+- bPtr += 4;
+- }
+-
+- number = quarterPoints * 4;
+- for(;number < num_points; number++){
+- *bPtr++ = acosf(*aPtr++);
+- }
++ float* bPtr = bVector;
++ const float* aPtr = aVector;
++
++ unsigned int number = 0;
++ unsigned int quarterPoints = num_points / 4;
++ int i, j;
++
++ __m128 aVal, d, pi, pio2, x, y, z, arccosine;
++ __m128 fzeroes, fones, ftwos, ffours, condition;
++
++ pi = _mm_set1_ps(3.14159265358979323846);
++ pio2 = _mm_set1_ps(3.14159265358979323846 / 2);
++ fzeroes = _mm_setzero_ps();
++ fones = _mm_set1_ps(1.0);
++ ftwos = _mm_set1_ps(2.0);
++ ffours = _mm_set1_ps(4.0);
++
++ for (; number < quarterPoints; number++) {
++ aVal = _mm_load_ps(aPtr);
++ d = aVal;
++ aVal = _mm_div_ps(
++ _mm_sqrt_ps(_mm_mul_ps(_mm_add_ps(fones, aVal), _mm_sub_ps(fones, aVal))),
++ aVal);
++ z = aVal;
++ condition = _mm_cmplt_ps(z, fzeroes);
++ z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition));
++ condition = _mm_cmplt_ps(z, fones);
++ x = _mm_add_ps(z, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z), condition));
++
++ for (i = 0; i < 2; i++)
++ x = _mm_add_ps(x, _mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x))));
++ x = _mm_div_ps(fones, x);
++ y = fzeroes;
++ for (j = ACOS_TERMS - 1; j >= 0; j--)
++ y = _mm_add_ps(_mm_mul_ps(y, _mm_mul_ps(x, x)),
++ _mm_set1_ps(pow(-1, j) / (2 * j + 1)));
++
++ y = _mm_mul_ps(y, _mm_mul_ps(x, ffours));
++ condition = _mm_cmpgt_ps(z, fones);
++
++ y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, ftwos)), condition));
++ arccosine = y;
++ condition = _mm_cmplt_ps(aVal, fzeroes);
++ arccosine =
++ _mm_sub_ps(arccosine, _mm_and_ps(_mm_mul_ps(arccosine, ftwos), condition));
++ condition = _mm_cmplt_ps(d, fzeroes);
++ arccosine = _mm_add_ps(arccosine, _mm_and_ps(pi, condition));
++
++ _mm_store_ps(bPtr, arccosine);
++ aPtr += 4;
++ bPtr += 4;
++ }
++
++ number = quarterPoints * 4;
++ for (; number < num_points; number++) {
++ *bPtr++ = acosf(*aPtr++);
++ }
+ }
+
+ #endif /* LV_HAVE_SSE4_1 for aligned */
+@@ -276,62 +294,68 @@ volk_32f_acos_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int nu
+ #if LV_HAVE_AVX2 && LV_HAVE_FMA
+ #include <immintrin.h>
+
+-static inline void
+-volk_32f_acos_32f_u_avx2_fma(float* bVector, const float* aVector, unsigned int num_points)
++static inline void volk_32f_acos_32f_u_avx2_fma(float* bVector,
++ const float* aVector,
++ unsigned int num_points)
+ {
+- float* bPtr = bVector;
+- const float* aPtr = aVector;
+-
+- unsigned int number = 0;
+- unsigned int eighthPoints = num_points / 8;
+- int i, j;
+-
+- __m256 aVal, d, pi, pio2, x, y, z, arccosine;
+- __m256 fzeroes, fones, ftwos, ffours, condition;
+-
+- pi = _mm256_set1_ps(3.14159265358979323846);
+- pio2 = _mm256_set1_ps(3.14159265358979323846/2);
+- fzeroes = _mm256_setzero_ps();
+- fones = _mm256_set1_ps(1.0);
+- ftwos = _mm256_set1_ps(2.0);
+- ffours = _mm256_set1_ps(4.0);
+-
+- for(;number < eighthPoints; number++){
+- aVal = _mm256_loadu_ps(aPtr);
+- d = aVal;
+- aVal = _mm256_div_ps(_mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal), _mm256_sub_ps(fones, aVal))), aVal);
+- z = aVal;
+- condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
+- z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
+- condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
+- x = _mm256_add_ps(z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));
+-
+- for(i = 0; i < 2; i++)
+- x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_fmadd_ps(x, x,fones)));
+- x = _mm256_div_ps(fones, x);
+- y = fzeroes;
+- for(j = ACOS_TERMS - 1; j >=0 ; j--)
+- y = _mm256_fmadd_ps(y, _mm256_mul_ps(x, x), _mm256_set1_ps(pow(-1,j)/(2*j+1)));
+-
+- y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
+- condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);
+-
+- y = _mm256_add_ps(y, _mm256_and_ps(_mm256_fnmadd_ps(y, ftwos, pio2), condition));
+- arccosine = y;
+- condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
+- arccosine = _mm256_sub_ps(arccosine, _mm256_and_ps(_mm256_mul_ps(arccosine, ftwos), condition));
+- condition = _mm256_cmp_ps(d, fzeroes, _CMP_LT_OS);
+- arccosine = _mm256_add_ps(arccosine, _mm256_and_ps(pi, condition));
+-
+- _mm256_storeu_ps(bPtr, arccosine);
+- aPtr += 8;
+- bPtr += 8;
+- }
+-
+- number = eighthPoints * 8;
+- for(;number < num_points; number++){
+- *bPtr++ = acos(*aPtr++);
+- }
++ float* bPtr = bVector;
++ const float* aPtr = aVector;
++
++ unsigned int number = 0;
++ unsigned int eighthPoints = num_points / 8;
++ int i, j;
++
++ __m256 aVal, d, pi, pio2, x, y, z, arccosine;
++ __m256 fzeroes, fones, ftwos, ffours, condition;
++
++ pi = _mm256_set1_ps(3.14159265358979323846);
++ pio2 = _mm256_set1_ps(3.14159265358979323846 / 2);
++ fzeroes = _mm256_setzero_ps();
++ fones = _mm256_set1_ps(1.0);
++ ftwos = _mm256_set1_ps(2.0);
++ ffours = _mm256_set1_ps(4.0);
++
++ for (; number < eighthPoints; number++) {
++ aVal = _mm256_loadu_ps(aPtr);
++ d = aVal;
++ aVal = _mm256_div_ps(_mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal),
++ _mm256_sub_ps(fones, aVal))),
++ aVal);
++ z = aVal;
++ condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
++ z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
++ condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
++ x = _mm256_add_ps(
++ z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));
++
++ for (i = 0; i < 2; i++)
++ x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_fmadd_ps(x, x, fones)));
++ x = _mm256_div_ps(fones, x);
++ y = fzeroes;
++ for (j = ACOS_TERMS - 1; j >= 0; j--)
++ y = _mm256_fmadd_ps(
++ y, _mm256_mul_ps(x, x), _mm256_set1_ps(pow(-1, j) / (2 * j + 1)));
++
++ y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
++ condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);
++
++ y = _mm256_add_ps(y, _mm256_and_ps(_mm256_fnmadd_ps(y, ftwos, pio2), condition));
++ arccosine = y;
++ condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
++ arccosine = _mm256_sub_ps(
++ arccosine, _mm256_and_ps(_mm256_mul_ps(arccosine, ftwos), condition));
++ condition = _mm256_cmp_ps(d, fzeroes, _CMP_LT_OS);
++ arccosine = _mm256_add_ps(arccosine, _mm256_and_ps(pi, condition));
++
++ _mm256_storeu_ps(bPtr, arccosine);
++ aPtr += 8;
++ bPtr += 8;
++ }
++
++ number = eighthPoints * 8;
++ for (; number < num_points; number++) {
++ *bPtr++ = acos(*aPtr++);
++ }
+ }
+
+ #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for unaligned */
+@@ -343,59 +367,66 @@ volk_32f_acos_32f_u_avx2_fma(float* bVector, const float* aVector, unsigned int
+ static inline void
+ volk_32f_acos_32f_u_avx(float* bVector, const float* aVector, unsigned int num_points)
+ {
+- float* bPtr = bVector;
+- const float* aPtr = aVector;
+-
+- unsigned int number = 0;
+- unsigned int eighthPoints = num_points / 8;
+- int i, j;
+-
+- __m256 aVal, d, pi, pio2, x, y, z, arccosine;
+- __m256 fzeroes, fones, ftwos, ffours, condition;
+-
+- pi = _mm256_set1_ps(3.14159265358979323846);
+- pio2 = _mm256_set1_ps(3.14159265358979323846/2);
+- fzeroes = _mm256_setzero_ps();
+- fones = _mm256_set1_ps(1.0);
+- ftwos = _mm256_set1_ps(2.0);
+- ffours = _mm256_set1_ps(4.0);
+-
+- for(;number < eighthPoints; number++){
+- aVal = _mm256_loadu_ps(aPtr);
+- d = aVal;
+- aVal = _mm256_div_ps(_mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal), _mm256_sub_ps(fones, aVal))), aVal);
+- z = aVal;
+- condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
+- z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
+- condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
+- x = _mm256_add_ps(z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));
+-
+- for(i = 0; i < 2; i++)
+- x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_add_ps(fones, _mm256_mul_ps(x, x))));
+- x = _mm256_div_ps(fones, x);
+- y = fzeroes;
+- for(j = ACOS_TERMS - 1; j >=0 ; j--)
+- y = _mm256_add_ps(_mm256_mul_ps(y, _mm256_mul_ps(x, x)), _mm256_set1_ps(pow(-1,j)/(2*j+1)));
+-
+- y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
+- condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);
+-
+- y = _mm256_add_ps(y, _mm256_and_ps(_mm256_sub_ps(pio2, _mm256_mul_ps(y, ftwos)), condition));
+- arccosine = y;
+- condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
+- arccosine = _mm256_sub_ps(arccosine, _mm256_and_ps(_mm256_mul_ps(arccosine, ftwos), condition));
+- condition = _mm256_cmp_ps(d, fzeroes, _CMP_LT_OS);
+- arccosine = _mm256_add_ps(arccosine, _mm256_and_ps(pi, condition));
+-
+- _mm256_storeu_ps(bPtr, arccosine);
+- aPtr += 8;
+- bPtr += 8;
+- }
+-
+- number = eighthPoints * 8;
+- for(;number < num_points; number++){
+- *bPtr++ = acos(*aPtr++);
+- }
++ float* bPtr = bVector;
++ const float* aPtr = aVector;
++
++ unsigned int number = 0;
++ unsigned int eighthPoints = num_points / 8;
++ int i, j;
++
++ __m256 aVal, d, pi, pio2, x, y, z, arccosine;
++ __m256 fzeroes, fones, ftwos, ffours, condition;
++
++ pi = _mm256_set1_ps(3.14159265358979323846);
++ pio2 = _mm256_set1_ps(3.14159265358979323846 / 2);
++ fzeroes = _mm256_setzero_ps();
++ fones = _mm256_set1_ps(1.0);
++ ftwos = _mm256_set1_ps(2.0);
++ ffours = _mm256_set1_ps(4.0);
++
++ for (; number < eighthPoints; number++) {
++ aVal = _mm256_loadu_ps(aPtr);
++ d = aVal;
++ aVal = _mm256_div_ps(_mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal),
++ _mm256_sub_ps(fones, aVal))),
++ aVal);
++ z = aVal;
++ condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
++ z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
++ condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
++ x = _mm256_add_ps(
++ z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));
++
++ for (i = 0; i < 2; i++)
++ x = _mm256_add_ps(x,
++ _mm256_sqrt_ps(_mm256_add_ps(fones, _mm256_mul_ps(x, x))));
++ x = _mm256_div_ps(fones, x);
++ y = fzeroes;
++ for (j = ACOS_TERMS - 1; j >= 0; j--)
++ y = _mm256_add_ps(_mm256_mul_ps(y, _mm256_mul_ps(x, x)),
++ _mm256_set1_ps(pow(-1, j) / (2 * j + 1)));
++
++ y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
++ condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);
++
++ y = _mm256_add_ps(
++ y, _mm256_and_ps(_mm256_sub_ps(pio2, _mm256_mul_ps(y, ftwos)), condition));
++ arccosine = y;
++ condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
++ arccosine = _mm256_sub_ps(
++ arccosine, _mm256_and_ps(_mm256_mul_ps(arccosine, ftwos), condition));
++ condition = _mm256_cmp_ps(d, fzeroes, _CMP_LT_OS);
++ arccosine = _mm256_add_ps(arccosine, _mm256_and_ps(pi, condition));
++
++ _mm256_storeu_ps(bPtr, arccosine);
++ aPtr += 8;
++ bPtr += 8;
++ }
++
++ number = eighthPoints * 8;
++ for (; number < num_points; number++) {
++ *bPtr++ = acos(*aPtr++);
++ }
+ }
+
+ #endif /* LV_HAVE_AVX2 for unaligned */
+@@ -406,60 +437,64 @@ volk_32f_acos_32f_u_avx(float* bVector, const float* aVector, unsigned int num_p
+ static inline void
+ volk_32f_acos_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
+ {
+- float* bPtr = bVector;
+- const float* aPtr = aVector;
+-
+- unsigned int number = 0;
+- unsigned int quarterPoints = num_points / 4;
+- int i, j;
+-
+- __m128 aVal, d, pi, pio2, x, y, z, arccosine;
+- __m128 fzeroes, fones, ftwos, ffours, condition;
+-
+- pi = _mm_set1_ps(3.14159265358979323846);
+- pio2 = _mm_set1_ps(3.14159265358979323846/2);
+- fzeroes = _mm_setzero_ps();
+- fones = _mm_set1_ps(1.0);
+- ftwos = _mm_set1_ps(2.0);
+- ffours = _mm_set1_ps(4.0);
+-
+- for(;number < quarterPoints; number++){
+- aVal = _mm_loadu_ps(aPtr);
+- d = aVal;
+- aVal = _mm_div_ps(_mm_sqrt_ps(_mm_mul_ps(_mm_add_ps(fones, aVal), _mm_sub_ps(fones, aVal))), aVal);
+- z = aVal;
+- condition = _mm_cmplt_ps(z, fzeroes);
+- z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition));
+- condition = _mm_cmplt_ps(z, fones);
+- x = _mm_add_ps(z, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z), condition));
+-
+- for(i = 0; i < 2; i++)
+- x = _mm_add_ps(x, _mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x))));
+- x = _mm_div_ps(fones, x);
+- y = fzeroes;
+-
+- for(j = ACOS_TERMS - 1; j >=0 ; j--)
+- y = _mm_add_ps(_mm_mul_ps(y, _mm_mul_ps(x, x)), _mm_set1_ps(pow(-1,j)/(2*j+1)));
+-
+- y = _mm_mul_ps(y, _mm_mul_ps(x, ffours));
+- condition = _mm_cmpgt_ps(z, fones);
+-
+- y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, ftwos)), condition));
+- arccosine = y;
+- condition = _mm_cmplt_ps(aVal, fzeroes);
+- arccosine = _mm_sub_ps(arccosine, _mm_and_ps(_mm_mul_ps(arccosine, ftwos), condition));
+- condition = _mm_cmplt_ps(d, fzeroes);
+- arccosine = _mm_add_ps(arccosine, _mm_and_ps(pi, condition));
+-
+- _mm_storeu_ps(bPtr, arccosine);
+- aPtr += 4;
+- bPtr += 4;
+- }
+-
+- number = quarterPoints * 4;
+- for(;number < num_points; number++){
+- *bPtr++ = acosf(*aPtr++);
+- }
++ float* bPtr = bVector;
++ const float* aPtr = aVector;
++
++ unsigned int number = 0;
++ unsigned int quarterPoints = num_points / 4;
++ int i, j;
++
++ __m128 aVal, d, pi, pio2, x, y, z, arccosine;
++ __m128 fzeroes, fones, ftwos, ffours, condition;
++
++ pi = _mm_set1_ps(3.14159265358979323846);
++ pio2 = _mm_set1_ps(3.14159265358979323846 / 2);
++ fzeroes = _mm_setzero_ps();
++ fones = _mm_set1_ps(1.0);
++ ftwos = _mm_set1_ps(2.0);
++ ffours = _mm_set1_ps(4.0);
++
++ for (; number < quarterPoints; number++) {
++ aVal = _mm_loadu_ps(aPtr);
++ d = aVal;
++ aVal = _mm_div_ps(
++ _mm_sqrt_ps(_mm_mul_ps(_mm_add_ps(fones, aVal), _mm_sub_ps(fones, aVal))),
++ aVal);
++ z = aVal;
++ condition = _mm_cmplt_ps(z, fzeroes);
++ z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition));
++ condition = _mm_cmplt_ps(z, fones);
++ x = _mm_add_ps(z, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z), condition));
++
++ for (i = 0; i < 2; i++)
++ x = _mm_add_ps(x, _mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x))));
++ x = _mm_div_ps(fones, x);
++ y = fzeroes;
++
++ for (j = ACOS_TERMS - 1; j >= 0; j--)
++ y = _mm_add_ps(_mm_mul_ps(y, _mm_mul_ps(x, x)),
++ _mm_set1_ps(pow(-1, j) / (2 * j + 1)));
++
++ y = _mm_mul_ps(y, _mm_mul_ps(x, ffours));
++ condition = _mm_cmpgt_ps(z, fones);
++
++ y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, ftwos)), condition));
++ arccosine = y;
++ condition = _mm_cmplt_ps(aVal, fzeroes);
++ arccosine =
++ _mm_sub_ps(arccosine, _mm_and_ps(_mm_mul_ps(arccosine, ftwos), condition));
++ condition = _mm_cmplt_ps(d, fzeroes);
++ arccosine = _mm_add_ps(arccosine, _mm_and_ps(pi, condition));
++
++ _mm_storeu_ps(bPtr, arccosine);
++ aPtr += 4;
++ bPtr += 4;
++ }
++
++ number = quarterPoints * 4;
++ for (; number < num_points; number++) {
++ *bPtr++ = acosf(*aPtr++);
++ }
+ }
+
+ #endif /* LV_HAVE_SSE4_1 for aligned */
+@@ -469,14 +504,13 @@ volk_32f_acos_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int nu
+ static inline void
+ volk_32f_acos_32f_generic(float* bVector, const float* aVector, unsigned int num_points)
+ {
+- float* bPtr = bVector;
+- const float* aPtr = aVector;
+- unsigned int number = 0;
+-
+- for(number = 0; number < num_points; number++){
+- *bPtr++ = acosf(*aPtr++);
+- }
++ float* bPtr = bVector;
++ const float* aPtr = aVector;
++ unsigned int number = 0;
+
++ for (number = 0; number < num_points; number++) {
++ *bPtr++ = acosf(*aPtr++);
++ }
+ }
+ #endif /* LV_HAVE_GENERIC */
+
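All of the acos variants reformatted above share one per-lane algorithm: turn the input into a tangent via sqrt((1+a)(1-a))/a, fold its magnitude into [1, inf), apply two half-angle-style reductions, evaluate a short Horner polynomial for the arctangent, scale by four, and fix up sign and quadrant. As a reading aid, here is a minimal scalar sketch of that sequence for a single element; acos_series_ref is an illustrative name only and is not part of VOLK or of this patch.

#include <math.h>

#define ACOS_TERMS 2

static float acos_series_ref(float a)
{
    const float pi = 3.14159265358979323846f;
    float t = sqrtf((1.0f + a) * (1.0f - a)) / a;      /* tan(acos(a))                  */
    float z = fabsf(t);
    float x = (z < 1.0f) ? 1.0f / z : z;               /* fold the argument into [1,inf) */
    float y = 0.0f;
    int i, j;

    for (i = 0; i < 2; i++)                            /* two half-angle reductions      */
        x += sqrtf(1.0f + x * x);
    x = 1.0f / x;

    for (j = ACOS_TERMS - 1; j >= 0; j--)              /* Horner form of the atan series */
        y = y * (x * x) + powf(-1.0f, (float)j) / (2 * j + 1);
    y *= x * 4.0f;                                     /* undo the two reductions        */

    if (z > 1.0f)
        y = pi / 2 - y;                                /* reciprocal-argument correction */
    if (t < 0.0f)
        y = -y;                                        /* sign of tan(acos(a))           */
    if (a < 0.0f)
        y += pi;                                       /* second-quadrant inputs         */
    return y;
}

The SIMD kernels express the same branches as compare masks ANDed into correction terms, which is why each `condition` line above pairs with one `if` here.
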
+diff --git a/kernels/volk/volk_32f_asin_32f.h b/kernels/volk/volk_32f_asin_32f.h
+index 864cfcf..946d382 100644
+--- a/kernels/volk/volk_32f_asin_32f.h
++++ b/kernels/volk/volk_32f_asin_32f.h
+@@ -67,11 +67,12 @@
+ * \endcode
+ */
+
+-#include <stdio.h>
+-#include <math.h>
+ #include <inttypes.h>
++#include <math.h>
++#include <stdio.h>
+
+-/* This is the number of terms of Taylor series to evaluate, increase this for more accuracy*/
++/* This is the number of terms of Taylor series to evaluate, increase this for more
++ * accuracy*/
+ #define ASIN_TERMS 2
+
+ #ifndef INCLUDED_volk_32f_asin_32f_a_H
+@@ -80,60 +81,66 @@
+ #if LV_HAVE_AVX2 && LV_HAVE_FMA
+ #include <immintrin.h>
+
+-static inline void
+-volk_32f_asin_32f_a_avx2_fma(float* bVector, const float* aVector, unsigned int num_points)
++static inline void volk_32f_asin_32f_a_avx2_fma(float* bVector,
++ const float* aVector,
++ unsigned int num_points)
+ {
+- float* bPtr = bVector;
+- const float* aPtr = aVector;
+-
+- unsigned int number = 0;
+- unsigned int eighthPoints = num_points / 8;
+- int i, j;
+-
+- __m256 aVal, pio2, x, y, z, arcsine;
+- __m256 fzeroes, fones, ftwos, ffours, condition;
+-
+- pio2 = _mm256_set1_ps(3.14159265358979323846/2);
+- fzeroes = _mm256_setzero_ps();
+- fones = _mm256_set1_ps(1.0);
+- ftwos = _mm256_set1_ps(2.0);
+- ffours = _mm256_set1_ps(4.0);
+-
+- for(;number < eighthPoints; number++){
+- aVal = _mm256_load_ps(aPtr);
+- aVal = _mm256_div_ps(aVal, _mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal), _mm256_sub_ps(fones, aVal))));
+- z = aVal;
+- condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
+- z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
+- condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
+- x = _mm256_add_ps(z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));
+-
+- for(i = 0; i < 2; i++){
+- x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_fmadd_ps(x,x,fones)));
++ float* bPtr = bVector;
++ const float* aPtr = aVector;
++
++ unsigned int number = 0;
++ unsigned int eighthPoints = num_points / 8;
++ int i, j;
++
++ __m256 aVal, pio2, x, y, z, arcsine;
++ __m256 fzeroes, fones, ftwos, ffours, condition;
++
++ pio2 = _mm256_set1_ps(3.14159265358979323846 / 2);
++ fzeroes = _mm256_setzero_ps();
++ fones = _mm256_set1_ps(1.0);
++ ftwos = _mm256_set1_ps(2.0);
++ ffours = _mm256_set1_ps(4.0);
++
++ for (; number < eighthPoints; number++) {
++ aVal = _mm256_load_ps(aPtr);
++ aVal = _mm256_div_ps(aVal,
++ _mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal),
++ _mm256_sub_ps(fones, aVal))));
++ z = aVal;
++ condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
++ z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
++ condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
++ x = _mm256_add_ps(
++ z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));
++
++ for (i = 0; i < 2; i++) {
++ x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_fmadd_ps(x, x, fones)));
++ }
++ x = _mm256_div_ps(fones, x);
++ y = fzeroes;
++ for (j = ASIN_TERMS - 1; j >= 0; j--) {
++ y = _mm256_fmadd_ps(
++ y, _mm256_mul_ps(x, x), _mm256_set1_ps(pow(-1, j) / (2 * j + 1)));
++ }
++
++ y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
++ condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);
++
++ y = _mm256_add_ps(y, _mm256_and_ps(_mm256_fnmadd_ps(y, ftwos, pio2), condition));
++ arcsine = y;
++ condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
++ arcsine = _mm256_sub_ps(arcsine,
++ _mm256_and_ps(_mm256_mul_ps(arcsine, ftwos), condition));
++
++ _mm256_store_ps(bPtr, arcsine);
++ aPtr += 8;
++ bPtr += 8;
+ }
+- x = _mm256_div_ps(fones, x);
+- y = fzeroes;
+- for(j = ASIN_TERMS - 1; j >=0 ; j--){
+- y = _mm256_fmadd_ps(y, _mm256_mul_ps(x, x), _mm256_set1_ps(pow(-1,j)/(2*j+1)));
+- }
+-
+- y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
+- condition = _mm256_cmp_ps(z, fones,_CMP_GT_OS);
+-
+- y = _mm256_add_ps(y, _mm256_and_ps(_mm256_fnmadd_ps(y,ftwos,pio2), condition));
+- arcsine = y;
+- condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
+- arcsine = _mm256_sub_ps(arcsine, _mm256_and_ps(_mm256_mul_ps(arcsine, ftwos), condition));
+
+- _mm256_store_ps(bPtr, arcsine);
+- aPtr += 8;
+- bPtr += 8;
+- }
+-
+- number = eighthPoints * 8;
+- for(;number < num_points; number++){
+- *bPtr++ = asin(*aPtr++);
+- }
++ number = eighthPoints * 8;
++ for (; number < num_points; number++) {
++ *bPtr++ = asin(*aPtr++);
++ }
+ }
+
+ #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for aligned */
+@@ -145,57 +152,64 @@ volk_32f_asin_32f_a_avx2_fma(float* bVector, const float* aVector, unsigned int
+ static inline void
+ volk_32f_asin_32f_a_avx(float* bVector, const float* aVector, unsigned int num_points)
+ {
+- float* bPtr = bVector;
+- const float* aPtr = aVector;
+-
+- unsigned int number = 0;
+- unsigned int eighthPoints = num_points / 8;
+- int i, j;
+-
+- __m256 aVal, pio2, x, y, z, arcsine;
+- __m256 fzeroes, fones, ftwos, ffours, condition;
+-
+- pio2 = _mm256_set1_ps(3.14159265358979323846/2);
+- fzeroes = _mm256_setzero_ps();
+- fones = _mm256_set1_ps(1.0);
+- ftwos = _mm256_set1_ps(2.0);
+- ffours = _mm256_set1_ps(4.0);
+-
+- for(;number < eighthPoints; number++){
+- aVal = _mm256_load_ps(aPtr);
+- aVal = _mm256_div_ps(aVal, _mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal), _mm256_sub_ps(fones, aVal))));
+- z = aVal;
+- condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
+- z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
+- condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
+- x = _mm256_add_ps(z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));
+-
+- for(i = 0; i < 2; i++){
+- x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_add_ps(fones, _mm256_mul_ps(x, x))));
++ float* bPtr = bVector;
++ const float* aPtr = aVector;
++
++ unsigned int number = 0;
++ unsigned int eighthPoints = num_points / 8;
++ int i, j;
++
++ __m256 aVal, pio2, x, y, z, arcsine;
++ __m256 fzeroes, fones, ftwos, ffours, condition;
++
++ pio2 = _mm256_set1_ps(3.14159265358979323846 / 2);
++ fzeroes = _mm256_setzero_ps();
++ fones = _mm256_set1_ps(1.0);
++ ftwos = _mm256_set1_ps(2.0);
++ ffours = _mm256_set1_ps(4.0);
++
++ for (; number < eighthPoints; number++) {
++ aVal = _mm256_load_ps(aPtr);
++ aVal = _mm256_div_ps(aVal,
++ _mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal),
++ _mm256_sub_ps(fones, aVal))));
++ z = aVal;
++ condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
++ z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
++ condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
++ x = _mm256_add_ps(
++ z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));
++
++ for (i = 0; i < 2; i++) {
++ x = _mm256_add_ps(x,
++ _mm256_sqrt_ps(_mm256_add_ps(fones, _mm256_mul_ps(x, x))));
++ }
++ x = _mm256_div_ps(fones, x);
++ y = fzeroes;
++ for (j = ASIN_TERMS - 1; j >= 0; j--) {
++ y = _mm256_add_ps(_mm256_mul_ps(y, _mm256_mul_ps(x, x)),
++ _mm256_set1_ps(pow(-1, j) / (2 * j + 1)));
++ }
++
++ y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
++ condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);
++
++ y = _mm256_add_ps(
++ y, _mm256_and_ps(_mm256_sub_ps(pio2, _mm256_mul_ps(y, ftwos)), condition));
++ arcsine = y;
++ condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
++ arcsine = _mm256_sub_ps(arcsine,
++ _mm256_and_ps(_mm256_mul_ps(arcsine, ftwos), condition));
++
++ _mm256_store_ps(bPtr, arcsine);
++ aPtr += 8;
++ bPtr += 8;
+ }
+- x = _mm256_div_ps(fones, x);
+- y = fzeroes;
+- for(j = ASIN_TERMS - 1; j >=0 ; j--){
+- y = _mm256_add_ps(_mm256_mul_ps(y, _mm256_mul_ps(x, x)), _mm256_set1_ps(pow(-1,j)/(2*j+1)));
+- }
+-
+- y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
+- condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);
+-
+- y = _mm256_add_ps(y, _mm256_and_ps(_mm256_sub_ps(pio2, _mm256_mul_ps(y, ftwos)), condition));
+- arcsine = y;
+- condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
+- arcsine = _mm256_sub_ps(arcsine, _mm256_and_ps(_mm256_mul_ps(arcsine, ftwos), condition));
+
+- _mm256_store_ps(bPtr, arcsine);
+- aPtr += 8;
+- bPtr += 8;
+- }
+-
+- number = eighthPoints * 8;
+- for(;number < num_points; number++){
+- *bPtr++ = asin(*aPtr++);
+- }
++ number = eighthPoints * 8;
++ for (; number < num_points; number++) {
++ *bPtr++ = asin(*aPtr++);
++ }
+ }
+
+ #endif /* LV_HAVE_AVX for aligned */
+@@ -206,57 +220,60 @@ volk_32f_asin_32f_a_avx(float* bVector, const float* aVector, unsigned int num_p
+ static inline void
+ volk_32f_asin_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
+ {
+- float* bPtr = bVector;
+- const float* aPtr = aVector;
+-
+- unsigned int number = 0;
+- unsigned int quarterPoints = num_points / 4;
+- int i, j;
+-
+- __m128 aVal, pio2, x, y, z, arcsine;
+- __m128 fzeroes, fones, ftwos, ffours, condition;
+-
+- pio2 = _mm_set1_ps(3.14159265358979323846/2);
+- fzeroes = _mm_setzero_ps();
+- fones = _mm_set1_ps(1.0);
+- ftwos = _mm_set1_ps(2.0);
+- ffours = _mm_set1_ps(4.0);
+-
+- for(;number < quarterPoints; number++){
+- aVal = _mm_load_ps(aPtr);
+- aVal = _mm_div_ps(aVal, _mm_sqrt_ps(_mm_mul_ps(_mm_add_ps(fones, aVal), _mm_sub_ps(fones, aVal))));
+- z = aVal;
+- condition = _mm_cmplt_ps(z, fzeroes);
+- z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition));
+- condition = _mm_cmplt_ps(z, fones);
+- x = _mm_add_ps(z, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z), condition));
+-
+- for(i = 0; i < 2; i++){
+- x = _mm_add_ps(x, _mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x))));
++ float* bPtr = bVector;
++ const float* aPtr = aVector;
++
++ unsigned int number = 0;
++ unsigned int quarterPoints = num_points / 4;
++ int i, j;
++
++ __m128 aVal, pio2, x, y, z, arcsine;
++ __m128 fzeroes, fones, ftwos, ffours, condition;
++
++ pio2 = _mm_set1_ps(3.14159265358979323846 / 2);
++ fzeroes = _mm_setzero_ps();
++ fones = _mm_set1_ps(1.0);
++ ftwos = _mm_set1_ps(2.0);
++ ffours = _mm_set1_ps(4.0);
++
++ for (; number < quarterPoints; number++) {
++ aVal = _mm_load_ps(aPtr);
++ aVal = _mm_div_ps(
++ aVal,
++ _mm_sqrt_ps(_mm_mul_ps(_mm_add_ps(fones, aVal), _mm_sub_ps(fones, aVal))));
++ z = aVal;
++ condition = _mm_cmplt_ps(z, fzeroes);
++ z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition));
++ condition = _mm_cmplt_ps(z, fones);
++ x = _mm_add_ps(z, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z), condition));
++
++ for (i = 0; i < 2; i++) {
++ x = _mm_add_ps(x, _mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x))));
++ }
++ x = _mm_div_ps(fones, x);
++ y = fzeroes;
++ for (j = ASIN_TERMS - 1; j >= 0; j--) {
++ y = _mm_add_ps(_mm_mul_ps(y, _mm_mul_ps(x, x)),
++ _mm_set1_ps(pow(-1, j) / (2 * j + 1)));
++ }
++
++ y = _mm_mul_ps(y, _mm_mul_ps(x, ffours));
++ condition = _mm_cmpgt_ps(z, fones);
++
++ y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, ftwos)), condition));
++ arcsine = y;
++ condition = _mm_cmplt_ps(aVal, fzeroes);
++ arcsine = _mm_sub_ps(arcsine, _mm_and_ps(_mm_mul_ps(arcsine, ftwos), condition));
++
++ _mm_store_ps(bPtr, arcsine);
++ aPtr += 4;
++ bPtr += 4;
+ }
+- x = _mm_div_ps(fones, x);
+- y = fzeroes;
+- for(j = ASIN_TERMS - 1; j >=0 ; j--){
+- y = _mm_add_ps(_mm_mul_ps(y, _mm_mul_ps(x, x)), _mm_set1_ps(pow(-1,j)/(2*j+1)));
+- }
+-
+- y = _mm_mul_ps(y, _mm_mul_ps(x, ffours));
+- condition = _mm_cmpgt_ps(z, fones);
+-
+- y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, ftwos)), condition));
+- arcsine = y;
+- condition = _mm_cmplt_ps(aVal, fzeroes);
+- arcsine = _mm_sub_ps(arcsine, _mm_and_ps(_mm_mul_ps(arcsine, ftwos), condition));
+-
+- _mm_store_ps(bPtr, arcsine);
+- aPtr += 4;
+- bPtr += 4;
+- }
+
+- number = quarterPoints * 4;
+- for(;number < num_points; number++){
+- *bPtr++ = asinf(*aPtr++);
+- }
++ number = quarterPoints * 4;
++ for (; number < num_points; number++) {
++ *bPtr++ = asinf(*aPtr++);
++ }
+ }
+
+ #endif /* LV_HAVE_SSE4_1 for aligned */
+@@ -269,60 +286,66 @@ volk_32f_asin_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int nu
+ #if LV_HAVE_AVX2 && LV_HAVE_FMA
+ #include <immintrin.h>
+
+-static inline void
+-volk_32f_asin_32f_u_avx2_fma(float* bVector, const float* aVector, unsigned int num_points)
++static inline void volk_32f_asin_32f_u_avx2_fma(float* bVector,
++ const float* aVector,
++ unsigned int num_points)
+ {
+- float* bPtr = bVector;
+- const float* aPtr = aVector;
+-
+- unsigned int number = 0;
+- unsigned int eighthPoints = num_points / 8;
+- int i, j;
+-
+- __m256 aVal, pio2, x, y, z, arcsine;
+- __m256 fzeroes, fones, ftwos, ffours, condition;
+-
+- pio2 = _mm256_set1_ps(3.14159265358979323846/2);
+- fzeroes = _mm256_setzero_ps();
+- fones = _mm256_set1_ps(1.0);
+- ftwos = _mm256_set1_ps(2.0);
+- ffours = _mm256_set1_ps(4.0);
+-
+- for(;number < eighthPoints; number++){
+- aVal = _mm256_loadu_ps(aPtr);
+- aVal = _mm256_div_ps(aVal, _mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal), _mm256_sub_ps(fones, aVal))));
+- z = aVal;
+- condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
+- z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
+- condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
+- x = _mm256_add_ps(z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));
+-
+- for(i = 0; i < 2; i++){
+- x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_fmadd_ps(x,x,fones)));
+- }
+- x = _mm256_div_ps(fones, x);
+- y = fzeroes;
+- for(j = ASIN_TERMS - 1; j >=0 ; j--){
+- y = _mm256_fmadd_ps(y, _mm256_mul_ps(x, x), _mm256_set1_ps(pow(-1,j)/(2*j+1)));
++ float* bPtr = bVector;
++ const float* aPtr = aVector;
++
++ unsigned int number = 0;
++ unsigned int eighthPoints = num_points / 8;
++ int i, j;
++
++ __m256 aVal, pio2, x, y, z, arcsine;
++ __m256 fzeroes, fones, ftwos, ffours, condition;
++
++ pio2 = _mm256_set1_ps(3.14159265358979323846 / 2);
++ fzeroes = _mm256_setzero_ps();
++ fones = _mm256_set1_ps(1.0);
++ ftwos = _mm256_set1_ps(2.0);
++ ffours = _mm256_set1_ps(4.0);
++
++ for (; number < eighthPoints; number++) {
++ aVal = _mm256_loadu_ps(aPtr);
++ aVal = _mm256_div_ps(aVal,
++ _mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal),
++ _mm256_sub_ps(fones, aVal))));
++ z = aVal;
++ condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
++ z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
++ condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
++ x = _mm256_add_ps(
++ z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));
++
++ for (i = 0; i < 2; i++) {
++ x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_fmadd_ps(x, x, fones)));
++ }
++ x = _mm256_div_ps(fones, x);
++ y = fzeroes;
++ for (j = ASIN_TERMS - 1; j >= 0; j--) {
++ y = _mm256_fmadd_ps(
++ y, _mm256_mul_ps(x, x), _mm256_set1_ps(pow(-1, j) / (2 * j + 1)));
++ }
++
++ y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
++ condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);
++
++ y = _mm256_add_ps(y, _mm256_and_ps(_mm256_fnmadd_ps(y, ftwos, pio2), condition));
++ arcsine = y;
++ condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
++ arcsine = _mm256_sub_ps(arcsine,
++ _mm256_and_ps(_mm256_mul_ps(arcsine, ftwos), condition));
++
++ _mm256_storeu_ps(bPtr, arcsine);
++ aPtr += 8;
++ bPtr += 8;
+ }
+
+- y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
+- condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);
+-
+- y = _mm256_add_ps(y, _mm256_and_ps(_mm256_fnmadd_ps(y,ftwos,pio2), condition));
+- arcsine = y;
+- condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
+- arcsine = _mm256_sub_ps(arcsine, _mm256_and_ps(_mm256_mul_ps(arcsine, ftwos), condition));
+-
+- _mm256_storeu_ps(bPtr, arcsine);
+- aPtr += 8;
+- bPtr += 8;
+- }
+-
+- number = eighthPoints * 8;
+- for(;number < num_points; number++){
+- *bPtr++ = asin(*aPtr++);
+- }
++ number = eighthPoints * 8;
++ for (; number < num_points; number++) {
++ *bPtr++ = asin(*aPtr++);
++ }
+ }
+
+ #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for unaligned */
+@@ -334,57 +357,64 @@ volk_32f_asin_32f_u_avx2_fma(float* bVector, const float* aVector, unsigned int
+ static inline void
+ volk_32f_asin_32f_u_avx(float* bVector, const float* aVector, unsigned int num_points)
+ {
+- float* bPtr = bVector;
+- const float* aPtr = aVector;
+-
+- unsigned int number = 0;
+- unsigned int eighthPoints = num_points / 8;
+- int i, j;
+-
+- __m256 aVal, pio2, x, y, z, arcsine;
+- __m256 fzeroes, fones, ftwos, ffours, condition;
+-
+- pio2 = _mm256_set1_ps(3.14159265358979323846/2);
+- fzeroes = _mm256_setzero_ps();
+- fones = _mm256_set1_ps(1.0);
+- ftwos = _mm256_set1_ps(2.0);
+- ffours = _mm256_set1_ps(4.0);
+-
+- for(;number < eighthPoints; number++){
+- aVal = _mm256_loadu_ps(aPtr);
+- aVal = _mm256_div_ps(aVal, _mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal), _mm256_sub_ps(fones, aVal))));
+- z = aVal;
+- condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
+- z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
+- condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
+- x = _mm256_add_ps(z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));
+-
+- for(i = 0; i < 2; i++){
+- x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_add_ps(fones, _mm256_mul_ps(x, x))));
++ float* bPtr = bVector;
++ const float* aPtr = aVector;
++
++ unsigned int number = 0;
++ unsigned int eighthPoints = num_points / 8;
++ int i, j;
++
++ __m256 aVal, pio2, x, y, z, arcsine;
++ __m256 fzeroes, fones, ftwos, ffours, condition;
++
++ pio2 = _mm256_set1_ps(3.14159265358979323846 / 2);
++ fzeroes = _mm256_setzero_ps();
++ fones = _mm256_set1_ps(1.0);
++ ftwos = _mm256_set1_ps(2.0);
++ ffours = _mm256_set1_ps(4.0);
++
++ for (; number < eighthPoints; number++) {
++ aVal = _mm256_loadu_ps(aPtr);
++ aVal = _mm256_div_ps(aVal,
++ _mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal),
++ _mm256_sub_ps(fones, aVal))));
++ z = aVal;
++ condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
++ z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
++ condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
++ x = _mm256_add_ps(
++ z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));
++
++ for (i = 0; i < 2; i++) {
++ x = _mm256_add_ps(x,
++ _mm256_sqrt_ps(_mm256_add_ps(fones, _mm256_mul_ps(x, x))));
++ }
++ x = _mm256_div_ps(fones, x);
++ y = fzeroes;
++ for (j = ASIN_TERMS - 1; j >= 0; j--) {
++ y = _mm256_add_ps(_mm256_mul_ps(y, _mm256_mul_ps(x, x)),
++ _mm256_set1_ps(pow(-1, j) / (2 * j + 1)));
++ }
++
++ y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
++ condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);
++
++ y = _mm256_add_ps(
++ y, _mm256_and_ps(_mm256_sub_ps(pio2, _mm256_mul_ps(y, ftwos)), condition));
++ arcsine = y;
++ condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
++ arcsine = _mm256_sub_ps(arcsine,
++ _mm256_and_ps(_mm256_mul_ps(arcsine, ftwos), condition));
++
++ _mm256_storeu_ps(bPtr, arcsine);
++ aPtr += 8;
++ bPtr += 8;
+ }
+- x = _mm256_div_ps(fones, x);
+- y = fzeroes;
+- for(j = ASIN_TERMS - 1; j >=0 ; j--){
+- y = _mm256_add_ps(_mm256_mul_ps(y, _mm256_mul_ps(x, x)), _mm256_set1_ps(pow(-1,j)/(2*j+1)));
+- }
+-
+- y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
+- condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);
+
+- y = _mm256_add_ps(y, _mm256_and_ps(_mm256_sub_ps(pio2, _mm256_mul_ps(y, ftwos)), condition));
+- arcsine = y;
+- condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
+- arcsine = _mm256_sub_ps(arcsine, _mm256_and_ps(_mm256_mul_ps(arcsine, ftwos), condition));
+-
+- _mm256_storeu_ps(bPtr, arcsine);
+- aPtr += 8;
+- bPtr += 8;
+- }
+-
+- number = eighthPoints * 8;
+- for(;number < num_points; number++){
+- *bPtr++ = asin(*aPtr++);
+- }
++ number = eighthPoints * 8;
++ for (; number < num_points; number++) {
++ *bPtr++ = asin(*aPtr++);
++ }
+ }
+
+ #endif /* LV_HAVE_AVX for unaligned */
+@@ -396,57 +426,60 @@ volk_32f_asin_32f_u_avx(float* bVector, const float* aVector, unsigned int num_p
+ static inline void
+ volk_32f_asin_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
+ {
+- float* bPtr = bVector;
+- const float* aPtr = aVector;
+-
+- unsigned int number = 0;
+- unsigned int quarterPoints = num_points / 4;
+- int i, j;
+-
+- __m128 aVal, pio2, x, y, z, arcsine;
+- __m128 fzeroes, fones, ftwos, ffours, condition;
+-
+- pio2 = _mm_set1_ps(3.14159265358979323846/2);
+- fzeroes = _mm_setzero_ps();
+- fones = _mm_set1_ps(1.0);
+- ftwos = _mm_set1_ps(2.0);
+- ffours = _mm_set1_ps(4.0);
+-
+- for(;number < quarterPoints; number++){
+- aVal = _mm_loadu_ps(aPtr);
+- aVal = _mm_div_ps(aVal, _mm_sqrt_ps(_mm_mul_ps(_mm_add_ps(fones, aVal), _mm_sub_ps(fones, aVal))));
+- z = aVal;
+- condition = _mm_cmplt_ps(z, fzeroes);
+- z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition));
+- condition = _mm_cmplt_ps(z, fones);
+- x = _mm_add_ps(z, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z), condition));
+-
+- for(i = 0; i < 2; i++){
+- x = _mm_add_ps(x, _mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x))));
++ float* bPtr = bVector;
++ const float* aPtr = aVector;
++
++ unsigned int number = 0;
++ unsigned int quarterPoints = num_points / 4;
++ int i, j;
++
++ __m128 aVal, pio2, x, y, z, arcsine;
++ __m128 fzeroes, fones, ftwos, ffours, condition;
++
++ pio2 = _mm_set1_ps(3.14159265358979323846 / 2);
++ fzeroes = _mm_setzero_ps();
++ fones = _mm_set1_ps(1.0);
++ ftwos = _mm_set1_ps(2.0);
++ ffours = _mm_set1_ps(4.0);
++
++ for (; number < quarterPoints; number++) {
++ aVal = _mm_loadu_ps(aPtr);
++ aVal = _mm_div_ps(
++ aVal,
++ _mm_sqrt_ps(_mm_mul_ps(_mm_add_ps(fones, aVal), _mm_sub_ps(fones, aVal))));
++ z = aVal;
++ condition = _mm_cmplt_ps(z, fzeroes);
++ z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition));
++ condition = _mm_cmplt_ps(z, fones);
++ x = _mm_add_ps(z, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z), condition));
++
++ for (i = 0; i < 2; i++) {
++ x = _mm_add_ps(x, _mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x))));
++ }
++ x = _mm_div_ps(fones, x);
++ y = fzeroes;
++ for (j = ASIN_TERMS - 1; j >= 0; j--) {
++ y = _mm_add_ps(_mm_mul_ps(y, _mm_mul_ps(x, x)),
++ _mm_set1_ps(pow(-1, j) / (2 * j + 1)));
++ }
++
++ y = _mm_mul_ps(y, _mm_mul_ps(x, ffours));
++ condition = _mm_cmpgt_ps(z, fones);
++
++ y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, ftwos)), condition));
++ arcsine = y;
++ condition = _mm_cmplt_ps(aVal, fzeroes);
++ arcsine = _mm_sub_ps(arcsine, _mm_and_ps(_mm_mul_ps(arcsine, ftwos), condition));
++
++ _mm_storeu_ps(bPtr, arcsine);
++ aPtr += 4;
++ bPtr += 4;
+ }
+- x = _mm_div_ps(fones, x);
+- y = fzeroes;
+- for(j = ASIN_TERMS - 1; j >=0 ; j--){
+- y = _mm_add_ps(_mm_mul_ps(y, _mm_mul_ps(x, x)), _mm_set1_ps(pow(-1,j)/(2*j+1)));
+- }
+-
+- y = _mm_mul_ps(y, _mm_mul_ps(x, ffours));
+- condition = _mm_cmpgt_ps(z, fones);
+
+- y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, ftwos)), condition));
+- arcsine = y;
+- condition = _mm_cmplt_ps(aVal, fzeroes);
+- arcsine = _mm_sub_ps(arcsine, _mm_and_ps(_mm_mul_ps(arcsine, ftwos), condition));
+-
+- _mm_storeu_ps(bPtr, arcsine);
+- aPtr += 4;
+- bPtr += 4;
+- }
+-
+- number = quarterPoints * 4;
+- for(;number < num_points; number++){
+- *bPtr++ = asinf(*aPtr++);
+- }
++ number = quarterPoints * 4;
++ for (; number < num_points; number++) {
++ *bPtr++ = asinf(*aPtr++);
++ }
+ }
+
+ #endif /* LV_HAVE_SSE4_1 for unaligned */
+@@ -456,13 +489,13 @@ volk_32f_asin_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int nu
+ static inline void
+ volk_32f_asin_32f_u_generic(float* bVector, const float* aVector, unsigned int num_points)
+ {
+- float* bPtr = bVector;
+- const float* aPtr = aVector;
+- unsigned int number = 0;
++ float* bPtr = bVector;
++ const float* aPtr = aVector;
++ unsigned int number = 0;
+
+- for(number = 0; number < num_points; number++){
+- *bPtr++ = asinf(*aPtr++);
+- }
++ for (number = 0; number < num_points; number++) {
++ *bPtr++ = asinf(*aPtr++);
++ }
+ }
+ #endif /* LV_HAVE_GENERIC */
+
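For orientation, a hedged usage sketch of the asin kernel whose variants were reformatted above, driven through the public dispatcher with VOLK-aligned buffers. It assumes the generated <volk/volk.h> dispatcher plus the volk_get_alignment()/volk_malloc()/volk_free() helpers, and is not part of this patch.

#include <volk/volk.h>
#include <stdio.h>

int main(void)
{
    unsigned int N = 16;
    size_t alignment = volk_get_alignment();
    float* in = (float*)volk_malloc(sizeof(float) * N, alignment);
    float* out = (float*)volk_malloc(sizeof(float) * N, alignment);
    unsigned int i;

    for (i = 0; i < N; i++)
        in[i] = -0.95f + 1.9f * i / (N - 1);  /* stay inside (-1, 1) */

    volk_32f_asin_32f(out, in, N);            /* dispatcher picks the fastest kernel */

    for (i = 0; i < N; i++)
        printf("asin(%+.3f) = %+.4f\n", in[i], out[i]);

    volk_free(in);
    volk_free(out);
    return 0;
}

Because the buffers come from volk_malloc, the aligned (_a_) variants above are eligible; unaligned (_u_) variants exist for data that cannot be guaranteed aligned.
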
+diff --git a/kernels/volk/volk_32f_atan_32f.h b/kernels/volk/volk_32f_atan_32f.h
+index 3496f0e..6652ee8 100644
+--- a/kernels/volk/volk_32f_atan_32f.h
++++ b/kernels/volk/volk_32f_atan_32f.h
+@@ -67,11 +67,12 @@
+ * \endcode
+ */
+
+-#include <stdio.h>
+-#include <math.h>
+ #include <inttypes.h>
++#include <math.h>
++#include <stdio.h>
+
+-/* This is the number of terms of Taylor series to evaluate, increase this for more accuracy*/
++/* This is the number of terms of Taylor series to evaluate, increase this for more
++ * accuracy*/
+ #define TERMS 2
+
+ #ifndef INCLUDED_volk_32f_atan_32f_a_H
+@@ -80,59 +81,63 @@
+ #if LV_HAVE_AVX2 && LV_HAVE_FMA
+ #include <immintrin.h>
+
+-static inline void
+-volk_32f_atan_32f_a_avx2_fma(float* bVector, const float* aVector, unsigned int num_points)
++static inline void volk_32f_atan_32f_a_avx2_fma(float* bVector,
++ const float* aVector,
++ unsigned int num_points)
+ {
+- float* bPtr = bVector;
+- const float* aPtr = aVector;
+-
+- unsigned int number = 0;
+- unsigned int eighthPoints = num_points / 8;
+- int i, j;
+-
+- __m256 aVal, pio2, x, y, z, arctangent;
+- __m256 fzeroes, fones, ftwos, ffours, condition;
+-
+- pio2 = _mm256_set1_ps(3.14159265358979323846/2);
+- fzeroes = _mm256_setzero_ps();
+- fones = _mm256_set1_ps(1.0);
+- ftwos = _mm256_set1_ps(2.0);
+- ffours = _mm256_set1_ps(4.0);
+-
+- for(;number < eighthPoints; number++){
+- aVal = _mm256_load_ps(aPtr);
+- z = aVal;
+- condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
+- z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
+- condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
+- x = _mm256_add_ps(z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));
+-
+- for(i = 0; i < 2; i++){
+- x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_fmadd_ps(x,x,fones)));
++ float* bPtr = bVector;
++ const float* aPtr = aVector;
++
++ unsigned int number = 0;
++ unsigned int eighthPoints = num_points / 8;
++ int i, j;
++
++ __m256 aVal, pio2, x, y, z, arctangent;
++ __m256 fzeroes, fones, ftwos, ffours, condition;
++
++ pio2 = _mm256_set1_ps(3.14159265358979323846 / 2);
++ fzeroes = _mm256_setzero_ps();
++ fones = _mm256_set1_ps(1.0);
++ ftwos = _mm256_set1_ps(2.0);
++ ffours = _mm256_set1_ps(4.0);
++
++ for (; number < eighthPoints; number++) {
++ aVal = _mm256_load_ps(aPtr);
++ z = aVal;
++ condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
++ z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
++ condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
++ x = _mm256_add_ps(
++ z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));
++
++ for (i = 0; i < 2; i++) {
++ x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_fmadd_ps(x, x, fones)));
++ }
++ x = _mm256_div_ps(fones, x);
++ y = fzeroes;
++ for (j = TERMS - 1; j >= 0; j--) {
++ y = _mm256_fmadd_ps(
++ y, _mm256_mul_ps(x, x), _mm256_set1_ps(pow(-1, j) / (2 * j + 1)));
++ }
++
++ y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
++ condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);
++
++ y = _mm256_add_ps(y, _mm256_and_ps(_mm256_fnmadd_ps(y, ftwos, pio2), condition));
++ arctangent = y;
++ condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
++ arctangent = _mm256_sub_ps(
++ arctangent, _mm256_and_ps(_mm256_mul_ps(arctangent, ftwos), condition));
++
++ _mm256_store_ps(bPtr, arctangent);
++ aPtr += 8;
++ bPtr += 8;
+ }
+- x = _mm256_div_ps(fones, x);
+- y = fzeroes;
+- for(j = TERMS - 1; j >=0 ; j--){
+- y = _mm256_fmadd_ps(y, _mm256_mul_ps(x, x), _mm256_set1_ps(pow(-1,j)/(2*j+1)));
+- }
+-
+- y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
+- condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);
+-
+- y = _mm256_add_ps(y, _mm256_and_ps(_mm256_fnmadd_ps(y,ftwos,pio2), condition));
+- arctangent = y;
+- condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
+- arctangent = _mm256_sub_ps(arctangent, _mm256_and_ps(_mm256_mul_ps(arctangent, ftwos), condition));
+-
+- _mm256_store_ps(bPtr, arctangent);
+- aPtr += 8;
+- bPtr += 8;
+- }
+
+- number = eighthPoints * 8;
+- for(;number < num_points; number++){
+- *bPtr++ = atan(*aPtr++);
+- }
++ number = eighthPoints * 8;
++ for (; number < num_points; number++) {
++ *bPtr++ = atan(*aPtr++);
++ }
+ }
+
+ #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for aligned */
+@@ -144,56 +149,61 @@ volk_32f_atan_32f_a_avx2_fma(float* bVector, const float* aVector, unsigned int
+ static inline void
+ volk_32f_atan_32f_a_avx(float* bVector, const float* aVector, unsigned int num_points)
+ {
+- float* bPtr = bVector;
+- const float* aPtr = aVector;
+-
+- unsigned int number = 0;
+- unsigned int eighthPoints = num_points / 8;
+- int i, j;
+-
+- __m256 aVal, pio2, x, y, z, arctangent;
+- __m256 fzeroes, fones, ftwos, ffours, condition;
+-
+- pio2 = _mm256_set1_ps(3.14159265358979323846/2);
+- fzeroes = _mm256_setzero_ps();
+- fones = _mm256_set1_ps(1.0);
+- ftwos = _mm256_set1_ps(2.0);
+- ffours = _mm256_set1_ps(4.0);
+-
+- for(;number < eighthPoints; number++){
+- aVal = _mm256_load_ps(aPtr);
+- z = aVal;
+- condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
+- z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
+- condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
+- x = _mm256_add_ps(z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));
+-
+- for(i = 0; i < 2; i++){
+- x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_add_ps(fones, _mm256_mul_ps(x, x))));
+- }
+- x = _mm256_div_ps(fones, x);
+- y = fzeroes;
+- for(j = TERMS - 1; j >=0 ; j--){
+- y = _mm256_add_ps(_mm256_mul_ps(y, _mm256_mul_ps(x, x)), _mm256_set1_ps(pow(-1,j)/(2*j+1)));
++ float* bPtr = bVector;
++ const float* aPtr = aVector;
++
++ unsigned int number = 0;
++ unsigned int eighthPoints = num_points / 8;
++ int i, j;
++
++ __m256 aVal, pio2, x, y, z, arctangent;
++ __m256 fzeroes, fones, ftwos, ffours, condition;
++
++ pio2 = _mm256_set1_ps(3.14159265358979323846 / 2);
++ fzeroes = _mm256_setzero_ps();
++ fones = _mm256_set1_ps(1.0);
++ ftwos = _mm256_set1_ps(2.0);
++ ffours = _mm256_set1_ps(4.0);
++
++ for (; number < eighthPoints; number++) {
++ aVal = _mm256_load_ps(aPtr);
++ z = aVal;
++ condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
++ z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
++ condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
++ x = _mm256_add_ps(
++ z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));
++
++ for (i = 0; i < 2; i++) {
++ x = _mm256_add_ps(x,
++ _mm256_sqrt_ps(_mm256_add_ps(fones, _mm256_mul_ps(x, x))));
++ }
++ x = _mm256_div_ps(fones, x);
++ y = fzeroes;
++ for (j = TERMS - 1; j >= 0; j--) {
++ y = _mm256_add_ps(_mm256_mul_ps(y, _mm256_mul_ps(x, x)),
++ _mm256_set1_ps(pow(-1, j) / (2 * j + 1)));
++ }
++
++ y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
++ condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);
++
++ y = _mm256_add_ps(
++ y, _mm256_and_ps(_mm256_sub_ps(pio2, _mm256_mul_ps(y, ftwos)), condition));
++ arctangent = y;
++ condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
++ arctangent = _mm256_sub_ps(
++ arctangent, _mm256_and_ps(_mm256_mul_ps(arctangent, ftwos), condition));
++
++ _mm256_store_ps(bPtr, arctangent);
++ aPtr += 8;
++ bPtr += 8;
+ }
+
+- y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
+- condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);
+-
+- y = _mm256_add_ps(y, _mm256_and_ps(_mm256_sub_ps(pio2, _mm256_mul_ps(y, ftwos)), condition));
+- arctangent = y;
+- condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
+- arctangent = _mm256_sub_ps(arctangent, _mm256_and_ps(_mm256_mul_ps(arctangent, ftwos), condition));
+-
+- _mm256_store_ps(bPtr, arctangent);
+- aPtr += 8;
+- bPtr += 8;
+- }
+-
+- number = eighthPoints * 8;
+- for(;number < num_points; number++){
+- *bPtr++ = atan(*aPtr++);
+- }
++ number = eighthPoints * 8;
++ for (; number < num_points; number++) {
++ *bPtr++ = atan(*aPtr++);
++ }
+ }
+
+ #endif /* LV_HAVE_AVX for aligned */
+@@ -204,56 +214,58 @@ volk_32f_atan_32f_a_avx(float* bVector, const float* aVector, unsigned int num_p
+ static inline void
+ volk_32f_atan_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
+ {
+- float* bPtr = bVector;
+- const float* aPtr = aVector;
+-
+- unsigned int number = 0;
+- unsigned int quarterPoints = num_points / 4;
+- int i, j;
+-
+- __m128 aVal, pio2, x, y, z, arctangent;
+- __m128 fzeroes, fones, ftwos, ffours, condition;
+-
+- pio2 = _mm_set1_ps(3.14159265358979323846/2);
+- fzeroes = _mm_setzero_ps();
+- fones = _mm_set1_ps(1.0);
+- ftwos = _mm_set1_ps(2.0);
+- ffours = _mm_set1_ps(4.0);
+-
+- for(;number < quarterPoints; number++){
+- aVal = _mm_load_ps(aPtr);
+- z = aVal;
+- condition = _mm_cmplt_ps(z, fzeroes);
+- z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition));
+- condition = _mm_cmplt_ps(z, fones);
+- x = _mm_add_ps(z, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z), condition));
+-
+- for(i = 0; i < 2; i++){
+- x = _mm_add_ps(x, _mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x))));
+- }
+- x = _mm_div_ps(fones, x);
+- y = fzeroes;
+- for(j = TERMS - 1; j >=0 ; j--){
+- y = _mm_add_ps(_mm_mul_ps(y, _mm_mul_ps(x, x)), _mm_set1_ps(pow(-1,j)/(2*j+1)));
++ float* bPtr = bVector;
++ const float* aPtr = aVector;
++
++ unsigned int number = 0;
++ unsigned int quarterPoints = num_points / 4;
++ int i, j;
++
++ __m128 aVal, pio2, x, y, z, arctangent;
++ __m128 fzeroes, fones, ftwos, ffours, condition;
++
++ pio2 = _mm_set1_ps(3.14159265358979323846 / 2);
++ fzeroes = _mm_setzero_ps();
++ fones = _mm_set1_ps(1.0);
++ ftwos = _mm_set1_ps(2.0);
++ ffours = _mm_set1_ps(4.0);
++
++ for (; number < quarterPoints; number++) {
++ aVal = _mm_load_ps(aPtr);
++ z = aVal;
++ condition = _mm_cmplt_ps(z, fzeroes);
++ z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition));
++ condition = _mm_cmplt_ps(z, fones);
++ x = _mm_add_ps(z, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z), condition));
++
++ for (i = 0; i < 2; i++) {
++ x = _mm_add_ps(x, _mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x))));
++ }
++ x = _mm_div_ps(fones, x);
++ y = fzeroes;
++ for (j = TERMS - 1; j >= 0; j--) {
++ y = _mm_add_ps(_mm_mul_ps(y, _mm_mul_ps(x, x)),
++ _mm_set1_ps(pow(-1, j) / (2 * j + 1)));
++ }
++
++ y = _mm_mul_ps(y, _mm_mul_ps(x, ffours));
++ condition = _mm_cmpgt_ps(z, fones);
++
++ y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, ftwos)), condition));
++ arctangent = y;
++ condition = _mm_cmplt_ps(aVal, fzeroes);
++ arctangent =
++ _mm_sub_ps(arctangent, _mm_and_ps(_mm_mul_ps(arctangent, ftwos), condition));
++
++ _mm_store_ps(bPtr, arctangent);
++ aPtr += 4;
++ bPtr += 4;
+ }
+
+- y = _mm_mul_ps(y, _mm_mul_ps(x, ffours));
+- condition = _mm_cmpgt_ps(z, fones);
+-
+- y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, ftwos)), condition));
+- arctangent = y;
+- condition = _mm_cmplt_ps(aVal, fzeroes);
+- arctangent = _mm_sub_ps(arctangent, _mm_and_ps(_mm_mul_ps(arctangent, ftwos), condition));
+-
+- _mm_store_ps(bPtr, arctangent);
+- aPtr += 4;
+- bPtr += 4;
+- }
+-
+- number = quarterPoints * 4;
+- for(;number < num_points; number++){
+- *bPtr++ = atanf(*aPtr++);
+- }
++ number = quarterPoints * 4;
++ for (; number < num_points; number++) {
++ *bPtr++ = atanf(*aPtr++);
++ }
+ }
+
+ #endif /* LV_HAVE_SSE4_1 for aligned */
+@@ -266,59 +278,63 @@ volk_32f_atan_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int nu
+ #if LV_HAVE_AVX2 && LV_HAVE_FMA
+ #include <immintrin.h>
+
+-static inline void
+-volk_32f_atan_32f_u_avx2_fma(float* bVector, const float* aVector, unsigned int num_points)
++static inline void volk_32f_atan_32f_u_avx2_fma(float* bVector,
++ const float* aVector,
++ unsigned int num_points)
+ {
+- float* bPtr = bVector;
+- const float* aPtr = aVector;
+-
+- unsigned int number = 0;
+- unsigned int eighthPoints = num_points / 8;
+- int i, j;
+-
+- __m256 aVal, pio2, x, y, z, arctangent;
+- __m256 fzeroes, fones, ftwos, ffours, condition;
+-
+- pio2 = _mm256_set1_ps(3.14159265358979323846/2);
+- fzeroes = _mm256_setzero_ps();
+- fones = _mm256_set1_ps(1.0);
+- ftwos = _mm256_set1_ps(2.0);
+- ffours = _mm256_set1_ps(4.0);
+-
+- for(;number < eighthPoints; number++){
+- aVal = _mm256_loadu_ps(aPtr);
+- z = aVal;
+- condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
+- z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
+- condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
+- x = _mm256_add_ps(z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));
+-
+- for(i = 0; i < 2; i++){
+- x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_fmadd_ps(x,x,fones)));
++ float* bPtr = bVector;
++ const float* aPtr = aVector;
++
++ unsigned int number = 0;
++ unsigned int eighthPoints = num_points / 8;
++ int i, j;
++
++ __m256 aVal, pio2, x, y, z, arctangent;
++ __m256 fzeroes, fones, ftwos, ffours, condition;
++
++ pio2 = _mm256_set1_ps(3.14159265358979323846 / 2);
++ fzeroes = _mm256_setzero_ps();
++ fones = _mm256_set1_ps(1.0);
++ ftwos = _mm256_set1_ps(2.0);
++ ffours = _mm256_set1_ps(4.0);
++
++ for (; number < eighthPoints; number++) {
++ aVal = _mm256_loadu_ps(aPtr);
++ z = aVal;
++ condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
++ z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
++ condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
++ x = _mm256_add_ps(
++ z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));
++
++ for (i = 0; i < 2; i++) {
++ x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_fmadd_ps(x, x, fones)));
++ }
++ x = _mm256_div_ps(fones, x);
++ y = fzeroes;
++ for (j = TERMS - 1; j >= 0; j--) {
++ y = _mm256_fmadd_ps(
++ y, _mm256_mul_ps(x, x), _mm256_set1_ps(pow(-1, j) / (2 * j + 1)));
++ }
++
++ y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
++ condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);
++
++ y = _mm256_add_ps(y, _mm256_and_ps(_mm256_fnmadd_ps(y, ftwos, pio2), condition));
++ arctangent = y;
++ condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
++ arctangent = _mm256_sub_ps(
++ arctangent, _mm256_and_ps(_mm256_mul_ps(arctangent, ftwos), condition));
++
++ _mm256_storeu_ps(bPtr, arctangent);
++ aPtr += 8;
++ bPtr += 8;
+ }
+- x = _mm256_div_ps(fones, x);
+- y = fzeroes;
+- for(j = TERMS - 1; j >=0 ; j--){
+- y = _mm256_fmadd_ps(y, _mm256_mul_ps(x, x), _mm256_set1_ps(pow(-1,j)/(2*j+1)));
+- }
+-
+- y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
+- condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);
+
+- y = _mm256_add_ps(y, _mm256_and_ps(_mm256_fnmadd_ps(y,ftwos,pio2), condition));
+- arctangent = y;
+- condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
+- arctangent = _mm256_sub_ps(arctangent, _mm256_and_ps(_mm256_mul_ps(arctangent, ftwos), condition));
+-
+- _mm256_storeu_ps(bPtr, arctangent);
+- aPtr += 8;
+- bPtr += 8;
+- }
+-
+- number = eighthPoints * 8;
+- for(;number < num_points; number++){
+- *bPtr++ = atan(*aPtr++);
+- }
++ number = eighthPoints * 8;
++ for (; number < num_points; number++) {
++ *bPtr++ = atan(*aPtr++);
++ }
+ }
+
+ #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for unaligned */
+@@ -330,56 +346,61 @@ volk_32f_atan_32f_u_avx2_fma(float* bVector, const float* aVector, unsigned int
+ static inline void
+ volk_32f_atan_32f_u_avx(float* bVector, const float* aVector, unsigned int num_points)
+ {
+- float* bPtr = bVector;
+- const float* aPtr = aVector;
+-
+- unsigned int number = 0;
+- unsigned int eighthPoints = num_points / 8;
+- int i, j;
+-
+- __m256 aVal, pio2, x, y, z, arctangent;
+- __m256 fzeroes, fones, ftwos, ffours, condition;
+-
+- pio2 = _mm256_set1_ps(3.14159265358979323846/2);
+- fzeroes = _mm256_setzero_ps();
+- fones = _mm256_set1_ps(1.0);
+- ftwos = _mm256_set1_ps(2.0);
+- ffours = _mm256_set1_ps(4.0);
+-
+- for(;number < eighthPoints; number++){
+- aVal = _mm256_loadu_ps(aPtr);
+- z = aVal;
+- condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
+- z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
+- condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
+- x = _mm256_add_ps(z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));
+-
+- for(i = 0; i < 2; i++){
+- x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_add_ps(fones, _mm256_mul_ps(x, x))));
+- }
+- x = _mm256_div_ps(fones, x);
+- y = fzeroes;
+- for(j = TERMS - 1; j >=0 ; j--){
+- y = _mm256_add_ps(_mm256_mul_ps(y, _mm256_mul_ps(x, x)), _mm256_set1_ps(pow(-1,j)/(2*j+1)));
++ float* bPtr = bVector;
++ const float* aPtr = aVector;
++
++ unsigned int number = 0;
++ unsigned int eighthPoints = num_points / 8;
++ int i, j;
++
++ __m256 aVal, pio2, x, y, z, arctangent;
++ __m256 fzeroes, fones, ftwos, ffours, condition;
++
++ pio2 = _mm256_set1_ps(3.14159265358979323846 / 2);
++ fzeroes = _mm256_setzero_ps();
++ fones = _mm256_set1_ps(1.0);
++ ftwos = _mm256_set1_ps(2.0);
++ ffours = _mm256_set1_ps(4.0);
++
++ for (; number < eighthPoints; number++) {
++ aVal = _mm256_loadu_ps(aPtr);
++ z = aVal;
++ condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
++ z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
++ condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
++ x = _mm256_add_ps(
++ z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));
++
++ for (i = 0; i < 2; i++) {
++ x = _mm256_add_ps(x,
++ _mm256_sqrt_ps(_mm256_add_ps(fones, _mm256_mul_ps(x, x))));
++ }
++ x = _mm256_div_ps(fones, x);
++ y = fzeroes;
++ for (j = TERMS - 1; j >= 0; j--) {
++ y = _mm256_add_ps(_mm256_mul_ps(y, _mm256_mul_ps(x, x)),
++ _mm256_set1_ps(pow(-1, j) / (2 * j + 1)));
++ }
++
++ y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
++ condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);
++
++ y = _mm256_add_ps(
++ y, _mm256_and_ps(_mm256_sub_ps(pio2, _mm256_mul_ps(y, ftwos)), condition));
++ arctangent = y;
++ condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
++ arctangent = _mm256_sub_ps(
++ arctangent, _mm256_and_ps(_mm256_mul_ps(arctangent, ftwos), condition));
++
++ _mm256_storeu_ps(bPtr, arctangent);
++ aPtr += 8;
++ bPtr += 8;
+ }
+
+- y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
+- condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);
+-
+- y = _mm256_add_ps(y, _mm256_and_ps(_mm256_sub_ps(pio2, _mm256_mul_ps(y, ftwos)), condition));
+- arctangent = y;
+- condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
+- arctangent = _mm256_sub_ps(arctangent, _mm256_and_ps(_mm256_mul_ps(arctangent, ftwos), condition));
+-
+- _mm256_storeu_ps(bPtr, arctangent);
+- aPtr += 8;
+- bPtr += 8;
+- }
+-
+- number = eighthPoints * 8;
+- for(;number < num_points; number++){
+- *bPtr++ = atan(*aPtr++);
+- }
++ number = eighthPoints * 8;
++ for (; number < num_points; number++) {
++ *bPtr++ = atan(*aPtr++);
++ }
+ }
+
+ #endif /* LV_HAVE_AVX for unaligned */
+@@ -390,54 +411,56 @@ volk_32f_atan_32f_u_avx(float* bVector, const float* aVector, unsigned int num_p
+ static inline void
+ volk_32f_atan_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
+ {
+- float* bPtr = bVector;
+- const float* aPtr = aVector;
+-
+- unsigned int number = 0;
+- unsigned int quarterPoints = num_points / 4;
+- int i, j;
+-
+- __m128 aVal, pio2, x, y, z, arctangent;
+- __m128 fzeroes, fones, ftwos, ffours, condition;
+-
+- pio2 = _mm_set1_ps(3.14159265358979323846/2);
+- fzeroes = _mm_setzero_ps();
+- fones = _mm_set1_ps(1.0);
+- ftwos = _mm_set1_ps(2.0);
+- ffours = _mm_set1_ps(4.0);
+-
+- for(;number < quarterPoints; number++){
+- aVal = _mm_loadu_ps(aPtr);
+- z = aVal;
+- condition = _mm_cmplt_ps(z, fzeroes);
+- z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition));
+- condition = _mm_cmplt_ps(z, fones);
+- x = _mm_add_ps(z, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z), condition));
+-
+- for(i = 0; i < 2; i++)
+- x = _mm_add_ps(x, _mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x))));
+- x = _mm_div_ps(fones, x);
+- y = fzeroes;
+- for(j = TERMS - 1; j >= 0; j--)
+- y = _mm_add_ps(_mm_mul_ps(y, _mm_mul_ps(x, x)), _mm_set1_ps(pow(-1,j)/(2*j+1)));
+-
+- y = _mm_mul_ps(y, _mm_mul_ps(x, ffours));
+- condition = _mm_cmpgt_ps(z, fones);
+-
+- y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, ftwos)), condition));
+- arctangent = y;
+- condition = _mm_cmplt_ps(aVal, fzeroes);
+- arctangent = _mm_sub_ps(arctangent, _mm_and_ps(_mm_mul_ps(arctangent, ftwos), condition));
+-
+- _mm_storeu_ps(bPtr, arctangent);
+- aPtr += 4;
+- bPtr += 4;
+- }
+-
+- number = quarterPoints * 4;
+- for(;number < num_points; number++){
+- *bPtr++ = atanf(*aPtr++);
+- }
++ float* bPtr = bVector;
++ const float* aPtr = aVector;
++
++ unsigned int number = 0;
++ unsigned int quarterPoints = num_points / 4;
++ int i, j;
++
++ __m128 aVal, pio2, x, y, z, arctangent;
++ __m128 fzeroes, fones, ftwos, ffours, condition;
++
++ pio2 = _mm_set1_ps(3.14159265358979323846 / 2);
++ fzeroes = _mm_setzero_ps();
++ fones = _mm_set1_ps(1.0);
++ ftwos = _mm_set1_ps(2.0);
++ ffours = _mm_set1_ps(4.0);
++
++ for (; number < quarterPoints; number++) {
++ aVal = _mm_loadu_ps(aPtr);
++ z = aVal;
++ condition = _mm_cmplt_ps(z, fzeroes);
++ z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition));
++ condition = _mm_cmplt_ps(z, fones);
++ x = _mm_add_ps(z, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z), condition));
++
++ for (i = 0; i < 2; i++)
++ x = _mm_add_ps(x, _mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x))));
++ x = _mm_div_ps(fones, x);
++ y = fzeroes;
++ for (j = TERMS - 1; j >= 0; j--)
++ y = _mm_add_ps(_mm_mul_ps(y, _mm_mul_ps(x, x)),
++ _mm_set1_ps(pow(-1, j) / (2 * j + 1)));
++
++ y = _mm_mul_ps(y, _mm_mul_ps(x, ffours));
++ condition = _mm_cmpgt_ps(z, fones);
++
++ y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, ftwos)), condition));
++ arctangent = y;
++ condition = _mm_cmplt_ps(aVal, fzeroes);
++ arctangent =
++ _mm_sub_ps(arctangent, _mm_and_ps(_mm_mul_ps(arctangent, ftwos), condition));
++
++ _mm_storeu_ps(bPtr, arctangent);
++ aPtr += 4;
++ bPtr += 4;
++ }
++
++ number = quarterPoints * 4;
++ for (; number < num_points; number++) {
++ *bPtr++ = atanf(*aPtr++);
++ }
+ }
+
+ #endif /* LV_HAVE_SSE4_1 for unaligned */
+@@ -447,13 +470,13 @@ volk_32f_atan_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int nu
+ static inline void
+ volk_32f_atan_32f_generic(float* bVector, const float* aVector, unsigned int num_points)
+ {
+- float* bPtr = bVector;
+- const float* aPtr = aVector;
+- unsigned int number = 0;
++ float* bPtr = bVector;
++ const float* aPtr = aVector;
++ unsigned int number = 0;
+
+- for(number = 0; number < num_points; number++){
+- *bPtr++ = atanf(*aPtr++);
+- }
++ for (number = 0; number < num_points; number++) {
++ *bPtr++ = atanf(*aPtr++);
++ }
+ }
+ #endif /* LV_HAVE_GENERIC */
+
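The `#define TERMS 2` above (and ACOS_TERMS/ASIN_TERMS in the sibling headers) controls the j-loop that evaluates the truncated arctangent series in Horner form. For TERMS == 2 that loop collapses to the snippet below; the factor of four applied afterwards in the kernels undoes the two argument reductions, which keep the polynomial argument small (roughly below tan(pi/16), about 0.2), so two terms already track atan closely. atan_poly2 is an illustrative name, not part of the library.

/* Horner unrolling of the j-loop for TERMS == 2 (illustrative only):
 *   j = 1:  y = -1/3
 *   j = 0:  y = y * x^2 + 1
 * so y * x == x - x^3/3, the two-term arctangent series.              */
static float atan_poly2(float x)
{
    float x2 = x * x;
    float y = 1.0f - x2 / 3.0f;
    return y * x;
}

Raising TERMS adds further (-1)^j / (2j+1) coefficients to the same loop, trading a multiply-add per term for accuracy, as the comment in each header notes.
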
+diff --git a/kernels/volk/volk_32f_binary_slicer_32i.h b/kernels/volk/volk_32f_binary_slicer_32i.h
+index c56ff8f..635d0c3 100644
+--- a/kernels/volk/volk_32f_binary_slicer_32i.h
++++ b/kernels/volk/volk_32f_binary_slicer_32i.h
+@@ -30,8 +30,8 @@
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+- * void volk_32f_binary_slicer_32i(int* cVector, const float* aVector, unsigned int num_points)
+- * \endcode
++ * void volk_32f_binary_slicer_32i(int* cVector, const float* aVector, unsigned int
++ * num_points) \endcode
+ *
+ * \b Inputs
+ * \li aVector: The input vector of floats.
+@@ -73,37 +73,38 @@
+
+ #ifdef LV_HAVE_GENERIC
+
+-static inline void
+-volk_32f_binary_slicer_32i_generic(int* cVector, const float* aVector, unsigned int num_points)
++static inline void volk_32f_binary_slicer_32i_generic(int* cVector,
++ const float* aVector,
++ unsigned int num_points)
+ {
+- int* cPtr = cVector;
+- const float* aPtr = aVector;
+- unsigned int number = 0;
+-
+- for(number = 0; number < num_points; number++){
+- if( *aPtr++ >= 0) {
+- *cPtr++ = 1;
+- }
+- else {
+- *cPtr++ = 0;
++ int* cPtr = cVector;
++ const float* aPtr = aVector;
++ unsigned int number = 0;
++
++ for (number = 0; number < num_points; number++) {
++ if (*aPtr++ >= 0) {
++ *cPtr++ = 1;
++ } else {
++ *cPtr++ = 0;
++ }
+ }
+- }
+ }
+ #endif /* LV_HAVE_GENERIC */
+
+
+ #ifdef LV_HAVE_GENERIC
+
+-static inline void
+-volk_32f_binary_slicer_32i_generic_branchless(int* cVector, const float* aVector, unsigned int num_points)
++static inline void volk_32f_binary_slicer_32i_generic_branchless(int* cVector,
++ const float* aVector,
++ unsigned int num_points)
+ {
+- int* cPtr = cVector;
+- const float* aPtr = aVector;
+- unsigned int number = 0;
++ int* cPtr = cVector;
++ const float* aPtr = aVector;
++ unsigned int number = 0;
+
+- for(number = 0; number < num_points; number++){
+- *cPtr++ = (*aPtr++ >= 0);
+- }
++ for (number = 0; number < num_points; number++) {
++ *cPtr++ = (*aPtr++ >= 0);
++ }
+ }
+ #endif /* LV_HAVE_GENERIC */
+
+@@ -111,40 +112,40 @@ volk_32f_binary_slicer_32i_generic_branchless(int* cVector, const float* aVector
+ #ifdef LV_HAVE_SSE2
+ #include <emmintrin.h>
+
+-static inline void
+-volk_32f_binary_slicer_32i_a_sse2(int* cVector, const float* aVector, unsigned int num_points)
++static inline void volk_32f_binary_slicer_32i_a_sse2(int* cVector,
++ const float* aVector,
++ unsigned int num_points)
+ {
+- int* cPtr = cVector;
+- const float* aPtr = aVector;
+- unsigned int number = 0;
+-
+- unsigned int quarter_points = num_points / 4;
+- __m128 a_val, res_f;
+- __m128i res_i, binary_i;
+- __m128 zero_val;
+- zero_val = _mm_set1_ps (0.0f);
++ int* cPtr = cVector;
++ const float* aPtr = aVector;
++ unsigned int number = 0;
+
+- for(number = 0; number < quarter_points; number++){
+- a_val = _mm_load_ps(aPtr);
++ unsigned int quarter_points = num_points / 4;
++ __m128 a_val, res_f;
++ __m128i res_i, binary_i;
++ __m128 zero_val;
++ zero_val = _mm_set1_ps(0.0f);
+
+- res_f = _mm_cmpge_ps (a_val, zero_val);
+- res_i = _mm_cvtps_epi32 (res_f);
+- binary_i = _mm_srli_epi32 (res_i, 31);
++ for (number = 0; number < quarter_points; number++) {
++ a_val = _mm_load_ps(aPtr);
+
+- _mm_store_si128((__m128i*)cPtr, binary_i);
++ res_f = _mm_cmpge_ps(a_val, zero_val);
++ res_i = _mm_cvtps_epi32(res_f);
++ binary_i = _mm_srli_epi32(res_i, 31);
+
+- cPtr += 4;
+- aPtr += 4;
+- }
++ _mm_store_si128((__m128i*)cPtr, binary_i);
+
+- for(number = quarter_points * 4; number < num_points; number++){
+- if( *aPtr++ >= 0) {
+- *cPtr++ = 1;
++ cPtr += 4;
++ aPtr += 4;
+ }
+- else {
+- *cPtr++ = 0;
++
++ for (number = quarter_points * 4; number < num_points; number++) {
++ if (*aPtr++ >= 0) {
++ *cPtr++ = 1;
++ } else {
++ *cPtr++ = 0;
++ }
+ }
+- }
+ }
+ #endif /* LV_HAVE_SSE2 */
+
+@@ -152,41 +153,41 @@ volk_32f_binary_slicer_32i_a_sse2(int* cVector, const float* aVector, unsigned i
+ #ifdef LV_HAVE_AVX
+ #include <immintrin.h>
+
+-static inline void
+-volk_32f_binary_slicer_32i_a_avx(int* cVector, const float* aVector, unsigned int num_points)
++static inline void volk_32f_binary_slicer_32i_a_avx(int* cVector,
++ const float* aVector,
++ unsigned int num_points)
+ {
+- int* cPtr = cVector;
+- const float* aPtr = aVector;
+- unsigned int number = 0;
++ int* cPtr = cVector;
++ const float* aPtr = aVector;
++ unsigned int number = 0;
+
+- unsigned int quarter_points = num_points / 8;
+- __m256 a_val, res_f, binary_f;
+- __m256i binary_i;
+- __m256 zero_val, one_val;
+- zero_val = _mm256_set1_ps (0.0f);
+- one_val = _mm256_set1_ps (1.0f);
++ unsigned int quarter_points = num_points / 8;
++ __m256 a_val, res_f, binary_f;
++ __m256i binary_i;
++ __m256 zero_val, one_val;
++ zero_val = _mm256_set1_ps(0.0f);
++ one_val = _mm256_set1_ps(1.0f);
+
+- for(number = 0; number < quarter_points; number++){
+- a_val = _mm256_load_ps(aPtr);
++ for (number = 0; number < quarter_points; number++) {
++ a_val = _mm256_load_ps(aPtr);
+
+- res_f = _mm256_cmp_ps (a_val, zero_val, _CMP_GE_OS);
+- binary_f = _mm256_and_ps (res_f, one_val);
+- binary_i = _mm256_cvtps_epi32(binary_f);
++ res_f = _mm256_cmp_ps(a_val, zero_val, _CMP_GE_OS);
++ binary_f = _mm256_and_ps(res_f, one_val);
++ binary_i = _mm256_cvtps_epi32(binary_f);
+
+- _mm256_store_si256((__m256i *)cPtr, binary_i);
++ _mm256_store_si256((__m256i*)cPtr, binary_i);
+
+- cPtr += 8;
+- aPtr += 8;
+- }
+-
+- for(number = quarter_points * 8; number < num_points; number++){
+- if( *aPtr++ >= 0) {
+- *cPtr++ = 1;
++ cPtr += 8;
++ aPtr += 8;
+ }
+- else {
+- *cPtr++ = 0;
++
++ for (number = quarter_points * 8; number < num_points; number++) {
++ if (*aPtr++ >= 0) {
++ *cPtr++ = 1;
++ } else {
++ *cPtr++ = 0;
++ }
+ }
+- }
+ }
+ #endif /* LV_HAVE_AVX */
+
+@@ -194,40 +195,40 @@ volk_32f_binary_slicer_32i_a_avx(int* cVector, const float* aVector, unsigned in
+ #ifdef LV_HAVE_SSE2
+ #include <emmintrin.h>
+
+-static inline void
+-volk_32f_binary_slicer_32i_u_sse2(int* cVector, const float* aVector, unsigned int num_points)
++static inline void volk_32f_binary_slicer_32i_u_sse2(int* cVector,
++ const float* aVector,
++ unsigned int num_points)
+ {
+- int* cPtr = cVector;
+- const float* aPtr = aVector;
+- unsigned int number = 0;
+-
+- unsigned int quarter_points = num_points / 4;
+- __m128 a_val, res_f;
+- __m128i res_i, binary_i;
+- __m128 zero_val;
+- zero_val = _mm_set1_ps (0.0f);
++ int* cPtr = cVector;
++ const float* aPtr = aVector;
++ unsigned int number = 0;
+
+- for(number = 0; number < quarter_points; number++){
+- a_val = _mm_loadu_ps(aPtr);
++ unsigned int quarter_points = num_points / 4;
++ __m128 a_val, res_f;
++ __m128i res_i, binary_i;
++ __m128 zero_val;
++ zero_val = _mm_set1_ps(0.0f);
+
+- res_f = _mm_cmpge_ps (a_val, zero_val);
+- res_i = _mm_cvtps_epi32 (res_f);
+- binary_i = _mm_srli_epi32 (res_i, 31);
++ for (number = 0; number < quarter_points; number++) {
++ a_val = _mm_loadu_ps(aPtr);
+
+- _mm_storeu_si128((__m128i*)cPtr, binary_i);
++ res_f = _mm_cmpge_ps(a_val, zero_val);
++ res_i = _mm_cvtps_epi32(res_f);
++ binary_i = _mm_srli_epi32(res_i, 31);
+
+- cPtr += 4;
+- aPtr += 4;
+- }
++ _mm_storeu_si128((__m128i*)cPtr, binary_i);
+
+- for(number = quarter_points * 4; number < num_points; number++){
+- if( *aPtr++ >= 0) {
+- *cPtr++ = 1;
++ cPtr += 4;
++ aPtr += 4;
+ }
+- else {
+- *cPtr++ = 0;
++
++ for (number = quarter_points * 4; number < num_points; number++) {
++ if (*aPtr++ >= 0) {
++ *cPtr++ = 1;
++ } else {
++ *cPtr++ = 0;
++ }
+ }
+- }
+ }
+ #endif /* LV_HAVE_SSE2 */
+
+@@ -235,41 +236,41 @@ volk_32f_binary_slicer_32i_u_sse2(int* cVector, const float* aVector, unsigned i
+ #ifdef LV_HAVE_AVX
+ #include <immintrin.h>
+
+-static inline void
+-volk_32f_binary_slicer_32i_u_avx(int* cVector, const float* aVector, unsigned int num_points)
++static inline void volk_32f_binary_slicer_32i_u_avx(int* cVector,
++ const float* aVector,
++ unsigned int num_points)
+ {
+- int* cPtr = cVector;
+- const float* aPtr = aVector;
+- unsigned int number = 0;
+-
+- unsigned int quarter_points = num_points / 8;
+- __m256 a_val, res_f, binary_f;
+- __m256i binary_i;
+- __m256 zero_val, one_val;
+- zero_val = _mm256_set1_ps (0.0f);
+- one_val = _mm256_set1_ps (1.0f);
++ int* cPtr = cVector;
++ const float* aPtr = aVector;
++ unsigned int number = 0;
+
+- for(number = 0; number < quarter_points; number++){
+- a_val = _mm256_loadu_ps(aPtr);
++ unsigned int quarter_points = num_points / 8;
++ __m256 a_val, res_f, binary_f;
++ __m256i binary_i;
++ __m256 zero_val, one_val;
++ zero_val = _mm256_set1_ps(0.0f);
++ one_val = _mm256_set1_ps(1.0f);
+
+- res_f = _mm256_cmp_ps (a_val, zero_val, _CMP_GE_OS);
+- binary_f = _mm256_and_ps (res_f, one_val);
+- binary_i = _mm256_cvtps_epi32(binary_f);
++ for (number = 0; number < quarter_points; number++) {
++ a_val = _mm256_loadu_ps(aPtr);
+
+- _mm256_storeu_si256((__m256i*)cPtr, binary_i);
++ res_f = _mm256_cmp_ps(a_val, zero_val, _CMP_GE_OS);
++ binary_f = _mm256_and_ps(res_f, one_val);
++ binary_i = _mm256_cvtps_epi32(binary_f);
+
+- cPtr += 8;
+- aPtr += 8;
+- }
++ _mm256_storeu_si256((__m256i*)cPtr, binary_i);
+
+- for(number = quarter_points * 8; number < num_points; number++){
+- if( *aPtr++ >= 0) {
+- *cPtr++ = 1;
++ cPtr += 8;
++ aPtr += 8;
+ }
+- else {
+- *cPtr++ = 0;
++
++ for (number = quarter_points * 8; number < num_points; number++) {
++ if (*aPtr++ >= 0) {
++ *cPtr++ = 1;
++ } else {
++ *cPtr++ = 0;
++ }
+ }
+- }
+ }
+ #endif /* LV_HAVE_AVX */
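
The AVX path uses a different idiom for the same predicate: it ANDs the compare mask with the bit pattern of 1.0f and then converts the result to int32. A scalar sketch of the equivalence (hypothetical helper, shown only for illustration):

    #include <stdint.h>
    #include <string.h>

    /* Hypothetical scalar mirror of one AVX lane: masking the bits of 1.0f
     * with the compare result yields the float 1.0f or 0.0f, which then
     * converts to the integer 1 or 0. */
    static int slicer_lane_avx_ref(float a)
    {
        uint32_t mask = (a >= 0.0f) ? 0xFFFFFFFFu : 0u; /* _mm256_cmp_ps(_CMP_GE_OS) */
        float one = 1.0f, f;
        uint32_t bits;
        memcpy(&bits, &one, sizeof(bits));
        bits &= mask;                                   /* _mm256_and_ps             */
        memcpy(&f, &bits, sizeof(f));
        return (int)f;                                  /* _mm256_cvtps_epi32        */
    }
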
+
+diff --git a/kernels/volk/volk_32f_binary_slicer_8i.h b/kernels/volk/volk_32f_binary_slicer_8i.h
+index 5920621..3eddb5c 100644
+--- a/kernels/volk/volk_32f_binary_slicer_8i.h
++++ b/kernels/volk/volk_32f_binary_slicer_8i.h
+@@ -30,7 +30,8 @@
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+- * void volk_32f_binary_slicer_8i(int8_t* cVector, const float* aVector, unsigned int num_points)
++ * void volk_32f_binary_slicer_8i(int8_t* cVector, const float* aVector, unsigned int
++ * num_points)
+ * \endcode
+ *
+ * \b Inputs
+@@ -74,39 +75,38 @@
+
+ #ifdef LV_HAVE_GENERIC
+
+-static inline void
+-volk_32f_binary_slicer_8i_generic(int8_t* cVector, const float* aVector,
+- unsigned int num_points)
++static inline void volk_32f_binary_slicer_8i_generic(int8_t* cVector,
++ const float* aVector,
++ unsigned int num_points)
+ {
+- int8_t* cPtr = cVector;
+- const float* aPtr = aVector;
+- unsigned int number = 0;
+-
+- for(number = 0; number < num_points; number++) {
+- if(*aPtr++ >= 0) {
+- *cPtr++ = 1;
+- }
+- else {
+- *cPtr++ = 0;
++ int8_t* cPtr = cVector;
++ const float* aPtr = aVector;
++ unsigned int number = 0;
++
++ for (number = 0; number < num_points; number++) {
++ if (*aPtr++ >= 0) {
++ *cPtr++ = 1;
++ } else {
++ *cPtr++ = 0;
++ }
+ }
+- }
+ }
+ #endif /* LV_HAVE_GENERIC */
+
+
+ #ifdef LV_HAVE_GENERIC
+
+-static inline void
+-volk_32f_binary_slicer_8i_generic_branchless(int8_t* cVector, const float* aVector,
+- unsigned int num_points)
++static inline void volk_32f_binary_slicer_8i_generic_branchless(int8_t* cVector,
++ const float* aVector,
++ unsigned int num_points)
+ {
+- int8_t* cPtr = cVector;
+- const float* aPtr = aVector;
+- unsigned int number = 0;
++ int8_t* cPtr = cVector;
++ const float* aPtr = aVector;
++ unsigned int number = 0;
+
+- for(number = 0; number < num_points; number++){
+- *cPtr++ = (*aPtr++ >= 0);
+- }
++ for (number = 0; number < num_points; number++) {
++ *cPtr++ = (*aPtr++ >= 0);
++ }
+ }
+ #endif /* LV_HAVE_GENERIC */
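
Since a C comparison evaluates to exactly 0 or 1, the branchless kernel above must agree bit-for-bit with the branching generic one. A small self-check sketch (not part of the patch; it calls the static inline kernels from this header directly and assumes LV_HAVE_GENERIC is defined, as it is in a normal VOLK build):

    #include <assert.h>
    #include <stdint.h>
    #include <volk/volk_32f_binary_slicer_8i.h>

    static void check_slicer_variants_agree(void)
    {
        const float in[8] = { -1.5f, -0.0f, 0.0f, 0.25f, -3.0f, 7.0f, -0.5f, 2.0f };
        int8_t a[8], b[8];
        volk_32f_binary_slicer_8i_generic(a, in, 8);
        volk_32f_binary_slicer_8i_generic_branchless(b, in, 8);
        for (unsigned int i = 0; i < 8; i++)
            assert(a[i] == b[i]); /* both map x >= 0 to 1, else 0 */
    }
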
+
+@@ -114,279 +114,329 @@ volk_32f_binary_slicer_8i_generic_branchless(int8_t* cVector, const float* aVect
+ #ifdef LV_HAVE_AVX2
+ #include <immintrin.h>
+
+-static inline void
+-volk_32f_binary_slicer_8i_a_avx2(int8_t* cVector, const float* aVector,
+- unsigned int num_points)
++static inline void volk_32f_binary_slicer_8i_a_avx2(int8_t* cVector,
++ const float* aVector,
++ unsigned int num_points)
+ {
+- int8_t* cPtr = cVector;
+- const float* aPtr = aVector;
+- unsigned int number = 0;
+- unsigned int n32points = num_points / 32;
+-
+- const __m256 zero_val = _mm256_set1_ps(0.0f);
+- __m256 a0_val, a1_val, a2_val, a3_val;
+- __m256 res0_f, res1_f, res2_f, res3_f;
+- __m256i res0_i, res1_i, res2_i, res3_i;
+- __m256i byte_shuffle = _mm256_set_epi8( 15, 14, 13, 12, 7, 6, 5, 4,
+- 11, 10, 9, 8, 3, 2, 1, 0,
+- 15, 14, 13, 12, 7, 6, 5, 4,
+- 11, 10, 9, 8, 3, 2, 1, 0);
+-
+- for(number = 0; number < n32points; number++) {
+- a0_val = _mm256_load_ps(aPtr);
+- a1_val = _mm256_load_ps(aPtr+8);
+- a2_val = _mm256_load_ps(aPtr+16);
+- a3_val = _mm256_load_ps(aPtr+24);
+-
+- // compare >= 0; return float
+- res0_f = _mm256_cmp_ps(a0_val, zero_val, _CMP_GE_OS);
+- res1_f = _mm256_cmp_ps(a1_val, zero_val, _CMP_GE_OS);
+- res2_f = _mm256_cmp_ps(a2_val, zero_val, _CMP_GE_OS);
+- res3_f = _mm256_cmp_ps(a3_val, zero_val, _CMP_GE_OS);
+-
+- // convert to 32i and >> 31
+- res0_i = _mm256_srli_epi32(_mm256_cvtps_epi32(res0_f), 31);
+- res1_i = _mm256_srli_epi32(_mm256_cvtps_epi32(res1_f), 31);
+- res2_i = _mm256_srli_epi32(_mm256_cvtps_epi32(res2_f), 31);
+- res3_i = _mm256_srli_epi32(_mm256_cvtps_epi32(res3_f), 31);
+-
+- // pack in to 16-bit results
+- res0_i = _mm256_packs_epi32(res0_i, res1_i);
+- res2_i = _mm256_packs_epi32(res2_i, res3_i);
+- // pack in to 8-bit results
+- // res0: (after packs_epi32)
+- // a0, a1, a2, a3, b0, b1, b2, b3, a4, a5, a6, a7, b4, b5, b6, b7
+- // res2:
+- // c0, c1, c2, c3, d0, d1, d2, d3, c4, c5, c6, c7, d4, d5, d6, d7
+- res0_i = _mm256_packs_epi16(res0_i, res2_i);
+- // shuffle the lanes
+- // res0: (after packs_epi16)
+- // a0, a1, a2, a3, b0, b1, b2, b3, c0, c1, c2, c3, d0, d1, d2, d3
+- // a4, a5, a6, a7, b4, b5, b6, b7, c4, c5, c6, c7, d4, d5, d6, d7
+- // 0, 2, 1, 3 -> 11 01 10 00 (0xd8)
+- res0_i = _mm256_permute4x64_epi64(res0_i, 0xd8);
+-
+- // shuffle bytes within lanes
+- // res0: (after shuffle_epi8)
+- // a0, a1, a2, a3, b0, b1, b2, b3, a4, a5, a6, a7, b4, b5, b6, b7
+- // c0, c1, c2, c3, d0, d1, d2, d3, c4, c5, c6, c7, d4, d5, d6, d7
+- res0_i = _mm256_shuffle_epi8(res0_i, byte_shuffle);
+-
+- _mm256_store_si256((__m256i*)cPtr, res0_i);
+- aPtr += 32;
+- cPtr += 32;
+- }
+-
+- for(number = n32points * 32; number < num_points; number++) {
+- if( *aPtr++ >= 0) {
+- *cPtr++ = 1;
++ int8_t* cPtr = cVector;
++ const float* aPtr = aVector;
++ unsigned int number = 0;
++ unsigned int n32points = num_points / 32;
++
++ const __m256 zero_val = _mm256_set1_ps(0.0f);
++ __m256 a0_val, a1_val, a2_val, a3_val;
++ __m256 res0_f, res1_f, res2_f, res3_f;
++ __m256i res0_i, res1_i, res2_i, res3_i;
++ __m256i byte_shuffle = _mm256_set_epi8(15,
++ 14,
++ 13,
++ 12,
++ 7,
++ 6,
++ 5,
++ 4,
++ 11,
++ 10,
++ 9,
++ 8,
++ 3,
++ 2,
++ 1,
++ 0,
++ 15,
++ 14,
++ 13,
++ 12,
++ 7,
++ 6,
++ 5,
++ 4,
++ 11,
++ 10,
++ 9,
++ 8,
++ 3,
++ 2,
++ 1,
++ 0);
++
++ for (number = 0; number < n32points; number++) {
++ a0_val = _mm256_load_ps(aPtr);
++ a1_val = _mm256_load_ps(aPtr + 8);
++ a2_val = _mm256_load_ps(aPtr + 16);
++ a3_val = _mm256_load_ps(aPtr + 24);
++
++ // compare >= 0; return float
++ res0_f = _mm256_cmp_ps(a0_val, zero_val, _CMP_GE_OS);
++ res1_f = _mm256_cmp_ps(a1_val, zero_val, _CMP_GE_OS);
++ res2_f = _mm256_cmp_ps(a2_val, zero_val, _CMP_GE_OS);
++ res3_f = _mm256_cmp_ps(a3_val, zero_val, _CMP_GE_OS);
++
++ // convert to 32i and >> 31
++ res0_i = _mm256_srli_epi32(_mm256_cvtps_epi32(res0_f), 31);
++ res1_i = _mm256_srli_epi32(_mm256_cvtps_epi32(res1_f), 31);
++ res2_i = _mm256_srli_epi32(_mm256_cvtps_epi32(res2_f), 31);
++ res3_i = _mm256_srli_epi32(_mm256_cvtps_epi32(res3_f), 31);
++
++ // pack into 16-bit results
++ res0_i = _mm256_packs_epi32(res0_i, res1_i);
++ res2_i = _mm256_packs_epi32(res2_i, res3_i);
++ // pack into 8-bit results
++ // res0: (after packs_epi32)
++ // a0, a1, a2, a3, b0, b1, b2, b3, a4, a5, a6, a7, b4, b5, b6, b7
++ // res2:
++ // c0, c1, c2, c3, d0, d1, d2, d3, c4, c5, c6, c7, d4, d5, d6, d7
++ res0_i = _mm256_packs_epi16(res0_i, res2_i);
++ // shuffle the lanes
++ // res0: (after packs_epi16)
++ // a0, a1, a2, a3, b0, b1, b2, b3, c0, c1, c2, c3, d0, d1, d2, d3
++ // a4, a5, a6, a7, b4, b5, b6, b7, c4, c5, c6, c7, d4, d5, d6, d7
++ // 0, 2, 1, 3 -> 11 01 10 00 (0xd8)
++ res0_i = _mm256_permute4x64_epi64(res0_i, 0xd8);
++
++ // shuffle bytes within lanes
++ // res0: (after shuffle_epi8)
++ // a0, a1, a2, a3, b0, b1, b2, b3, a4, a5, a6, a7, b4, b5, b6, b7
++ // c0, c1, c2, c3, d0, d1, d2, d3, c4, c5, c6, c7, d4, d5, d6, d7
++ res0_i = _mm256_shuffle_epi8(res0_i, byte_shuffle);
++
++ _mm256_store_si256((__m256i*)cPtr, res0_i);
++ aPtr += 32;
++ cPtr += 32;
+ }
+- else {
+- *cPtr++ = 0;
++
++ for (number = n32points * 32; number < num_points; number++) {
++ if (*aPtr++ >= 0) {
++ *cPtr++ = 1;
++ } else {
++ *cPtr++ = 0;
++ }
+ }
+- }
+ }
+ #endif
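
The only non-obvious part of the AVX2 kernel is the fix-up after the packs: _mm256_packs_epi32/_epi16 operate per 128-bit lane, so the permute4x64 with immediate 0xd8 plus the byte shuffle restore the natural output order. The immediate simply encodes the quadword order (0, 2, 1, 3); a hypothetical helper macro (not part of VOLK) makes that explicit:

    /* Hypothetical convenience macro: builds the _mm256_permute4x64_epi64
     * immediate that places source quadword a in destination slot 0,
     * b in slot 1, c in slot 2 and d in slot 3. */
    #define QUAD_ORDER(a, b, c, d) (((d) << 6) | ((c) << 4) | ((b) << 2) | (a))

    /* QUAD_ORDER(0, 2, 1, 3) == 0xd8, the constant used above. */
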
+
+ #ifdef LV_HAVE_AVX2
+ #include <immintrin.h>
+
+-static inline void
+-volk_32f_binary_slicer_8i_u_avx2(int8_t* cVector, const float* aVector,
+- unsigned int num_points)
++static inline void volk_32f_binary_slicer_8i_u_avx2(int8_t* cVector,
++ const float* aVector,
++ unsigned int num_points)
+ {
+- int8_t* cPtr = cVector;
+- const float* aPtr = aVector;
+- unsigned int number = 0;
+- unsigned int n32points = num_points / 32;
+-
+- const __m256 zero_val = _mm256_set1_ps(0.0f);
+- __m256 a0_val, a1_val, a2_val, a3_val;
+- __m256 res0_f, res1_f, res2_f, res3_f;
+- __m256i res0_i, res1_i, res2_i, res3_i;
+- __m256i byte_shuffle = _mm256_set_epi8( 15, 14, 13, 12, 7, 6, 5, 4,
+- 11, 10, 9, 8, 3, 2, 1, 0,
+- 15, 14, 13, 12, 7, 6, 5, 4,
+- 11, 10, 9, 8, 3, 2, 1, 0);
+-
+- for(number = 0; number < n32points; number++) {
+- a0_val = _mm256_loadu_ps(aPtr);
+- a1_val = _mm256_loadu_ps(aPtr+8);
+- a2_val = _mm256_loadu_ps(aPtr+16);
+- a3_val = _mm256_loadu_ps(aPtr+24);
+-
+- // compare >= 0; return float
+- res0_f = _mm256_cmp_ps(a0_val, zero_val, _CMP_GE_OS);
+- res1_f = _mm256_cmp_ps(a1_val, zero_val, _CMP_GE_OS);
+- res2_f = _mm256_cmp_ps(a2_val, zero_val, _CMP_GE_OS);
+- res3_f = _mm256_cmp_ps(a3_val, zero_val, _CMP_GE_OS);
+-
+- // convert to 32i and >> 31
+- res0_i = _mm256_srli_epi32(_mm256_cvtps_epi32(res0_f), 31);
+- res1_i = _mm256_srli_epi32(_mm256_cvtps_epi32(res1_f), 31);
+- res2_i = _mm256_srli_epi32(_mm256_cvtps_epi32(res2_f), 31);
+- res3_i = _mm256_srli_epi32(_mm256_cvtps_epi32(res3_f), 31);
+-
+- // pack in to 16-bit results
+- res0_i = _mm256_packs_epi32(res0_i, res1_i);
+- res2_i = _mm256_packs_epi32(res2_i, res3_i);
+- // pack in to 8-bit results
+- // res0: (after packs_epi32)
+- // a0, a1, a2, a3, b0, b1, b2, b3, a4, a5, a6, a7, b4, b5, b6, b7
+- // res2:
+- // c0, c1, c2, c3, d0, d1, d2, d3, c4, c5, c6, c7, d4, d5, d6, d7
+- res0_i = _mm256_packs_epi16(res0_i, res2_i);
+- // shuffle the lanes
+- // res0: (after packs_epi16)
+- // a0, a1, a2, a3, b0, b1, b2, b3, c0, c1, c2, c3, d0, d1, d2, d3
+- // a4, a5, a6, a7, b4, b5, b6, b7, c4, c5, c6, c7, d4, d5, d6, d7
+- // 0, 2, 1, 3 -> 11 01 10 00 (0xd8)
+- res0_i = _mm256_permute4x64_epi64(res0_i, 0xd8);
+-
+- // shuffle bytes within lanes
+- // res0: (after shuffle_epi8)
+- // a0, a1, a2, a3, b0, b1, b2, b3, a4, a5, a6, a7, b4, b5, b6, b7
+- // c0, c1, c2, c3, d0, d1, d2, d3, c4, c5, c6, c7, d4, d5, d6, d7
+- res0_i = _mm256_shuffle_epi8(res0_i, byte_shuffle);
+-
+- _mm256_storeu_si256((__m256i*)cPtr, res0_i);
+- aPtr += 32;
+- cPtr += 32;
+- }
+-
+- for(number = n32points * 32; number < num_points; number++) {
+- if( *aPtr++ >= 0) {
+- *cPtr++ = 1;
++ int8_t* cPtr = cVector;
++ const float* aPtr = aVector;
++ unsigned int number = 0;
++ unsigned int n32points = num_points / 32;
++
++ const __m256 zero_val = _mm256_set1_ps(0.0f);
++ __m256 a0_val, a1_val, a2_val, a3_val;
++ __m256 res0_f, res1_f, res2_f, res3_f;
++ __m256i res0_i, res1_i, res2_i, res3_i;
++ __m256i byte_shuffle = _mm256_set_epi8(15,
++ 14,
++ 13,
++ 12,
++ 7,
++ 6,
++ 5,
++ 4,
++ 11,
++ 10,
++ 9,
++ 8,
++ 3,
++ 2,
++ 1,
++ 0,
++ 15,
++ 14,
++ 13,
++ 12,
++ 7,
++ 6,
++ 5,
++ 4,
++ 11,
++ 10,
++ 9,
++ 8,
++ 3,
++ 2,
++ 1,
++ 0);
++
++ for (number = 0; number < n32points; number++) {
++ a0_val = _mm256_loadu_ps(aPtr);
++ a1_val = _mm256_loadu_ps(aPtr + 8);
++ a2_val = _mm256_loadu_ps(aPtr + 16);
++ a3_val = _mm256_loadu_ps(aPtr + 24);
++
++ // compare >= 0; return float
++ res0_f = _mm256_cmp_ps(a0_val, zero_val, _CMP_GE_OS);
++ res1_f = _mm256_cmp_ps(a1_val, zero_val, _CMP_GE_OS);
++ res2_f = _mm256_cmp_ps(a2_val, zero_val, _CMP_GE_OS);
++ res3_f = _mm256_cmp_ps(a3_val, zero_val, _CMP_GE_OS);
++
++ // convert to 32i and >> 31
++ res0_i = _mm256_srli_epi32(_mm256_cvtps_epi32(res0_f), 31);
++ res1_i = _mm256_srli_epi32(_mm256_cvtps_epi32(res1_f), 31);
++ res2_i = _mm256_srli_epi32(_mm256_cvtps_epi32(res2_f), 31);
++ res3_i = _mm256_srli_epi32(_mm256_cvtps_epi32(res3_f), 31);
++
++ // pack into 16-bit results
++ res0_i = _mm256_packs_epi32(res0_i, res1_i);
++ res2_i = _mm256_packs_epi32(res2_i, res3_i);
++ // pack into 8-bit results
++ // res0: (after packs_epi32)
++ // a0, a1, a2, a3, b0, b1, b2, b3, a4, a5, a6, a7, b4, b5, b6, b7
++ // res2:
++ // c0, c1, c2, c3, d0, d1, d2, d3, c4, c5, c6, c7, d4, d5, d6, d7
++ res0_i = _mm256_packs_epi16(res0_i, res2_i);
++ // shuffle the lanes
++ // res0: (after packs_epi16)
++ // a0, a1, a2, a3, b0, b1, b2, b3, c0, c1, c2, c3, d0, d1, d2, d3
++ // a4, a5, a6, a7, b4, b5, b6, b7, c4, c5, c6, c7, d4, d5, d6, d7
++ // 0, 2, 1, 3 -> 11 01 10 00 (0xd8)
++ res0_i = _mm256_permute4x64_epi64(res0_i, 0xd8);
++
++ // shuffle bytes within lanes
++ // res0: (after shuffle_epi8)
++ // a0, a1, a2, a3, b0, b1, b2, b3, a4, a5, a6, a7, b4, b5, b6, b7
++ // c0, c1, c2, c3, d0, d1, d2, d3, c4, c5, c6, c7, d4, d5, d6, d7
++ res0_i = _mm256_shuffle_epi8(res0_i, byte_shuffle);
++
++ _mm256_storeu_si256((__m256i*)cPtr, res0_i);
++ aPtr += 32;
++ cPtr += 32;
+ }
+- else {
+- *cPtr++ = 0;
++
++ for (number = n32points * 32; number < num_points; number++) {
++ if (*aPtr++ >= 0) {
++ *cPtr++ = 1;
++ } else {
++ *cPtr++ = 0;
++ }
+ }
+- }
+ }
+ #endif
+
+
+-
+ #ifdef LV_HAVE_SSE2
+
+ #include <emmintrin.h>
+
+-static inline void
+-volk_32f_binary_slicer_8i_a_sse2(int8_t* cVector, const float* aVector,
+- unsigned int num_points)
++static inline void volk_32f_binary_slicer_8i_a_sse2(int8_t* cVector,
++ const float* aVector,
++ unsigned int num_points)
+ {
+- int8_t* cPtr = cVector;
+- const float* aPtr = aVector;
+- unsigned int number = 0;
+-
+- unsigned int n16points = num_points / 16;
+- __m128 a0_val, a1_val, a2_val, a3_val;
+- __m128 res0_f, res1_f, res2_f, res3_f;
+- __m128i res0_i, res1_i, res2_i, res3_i;
+- __m128 zero_val;
+- zero_val = _mm_set1_ps(0.0f);
+-
+- for(number = 0; number < n16points; number++) {
+- a0_val = _mm_load_ps(aPtr);
+- a1_val = _mm_load_ps(aPtr+4);
+- a2_val = _mm_load_ps(aPtr+8);
+- a3_val = _mm_load_ps(aPtr+12);
+-
+- // compare >= 0; return float
+- res0_f = _mm_cmpge_ps(a0_val, zero_val);
+- res1_f = _mm_cmpge_ps(a1_val, zero_val);
+- res2_f = _mm_cmpge_ps(a2_val, zero_val);
+- res3_f = _mm_cmpge_ps(a3_val, zero_val);
+-
+- // convert to 32i and >> 31
+- res0_i = _mm_srli_epi32(_mm_cvtps_epi32(res0_f), 31);
+- res1_i = _mm_srli_epi32(_mm_cvtps_epi32(res1_f), 31);
+- res2_i = _mm_srli_epi32(_mm_cvtps_epi32(res2_f), 31);
+- res3_i = _mm_srli_epi32(_mm_cvtps_epi32(res3_f), 31);
+-
+- // pack into 16-bit results
+- res0_i = _mm_packs_epi32(res0_i, res1_i);
+- res2_i = _mm_packs_epi32(res2_i, res3_i);
+-
+- // pack into 8-bit results
+- res0_i = _mm_packs_epi16(res0_i, res2_i);
+-
+- _mm_store_si128((__m128i*)cPtr, res0_i);
+-
+- cPtr += 16;
+- aPtr += 16;
+- }
+-
+- for(number = n16points * 16; number < num_points; number++) {
+- if( *aPtr++ >= 0) {
+- *cPtr++ = 1;
++ int8_t* cPtr = cVector;
++ const float* aPtr = aVector;
++ unsigned int number = 0;
++
++ unsigned int n16points = num_points / 16;
++ __m128 a0_val, a1_val, a2_val, a3_val;
++ __m128 res0_f, res1_f, res2_f, res3_f;
++ __m128i res0_i, res1_i, res2_i, res3_i;
++ __m128 zero_val;
++ zero_val = _mm_set1_ps(0.0f);
++
++ for (number = 0; number < n16points; number++) {
++ a0_val = _mm_load_ps(aPtr);
++ a1_val = _mm_load_ps(aPtr + 4);
++ a2_val = _mm_load_ps(aPtr + 8);
++ a3_val = _mm_load_ps(aPtr + 12);
++
++ // compare >= 0; return float
++ res0_f = _mm_cmpge_ps(a0_val, zero_val);
++ res1_f = _mm_cmpge_ps(a1_val, zero_val);
++ res2_f = _mm_cmpge_ps(a2_val, zero_val);
++ res3_f = _mm_cmpge_ps(a3_val, zero_val);
++
++ // convert to 32i and >> 31
++ res0_i = _mm_srli_epi32(_mm_cvtps_epi32(res0_f), 31);
++ res1_i = _mm_srli_epi32(_mm_cvtps_epi32(res1_f), 31);
++ res2_i = _mm_srli_epi32(_mm_cvtps_epi32(res2_f), 31);
++ res3_i = _mm_srli_epi32(_mm_cvtps_epi32(res3_f), 31);
++
++ // pack into 16-bit results
++ res0_i = _mm_packs_epi32(res0_i, res1_i);
++ res2_i = _mm_packs_epi32(res2_i, res3_i);
++
++ // pack into 8-bit results
++ res0_i = _mm_packs_epi16(res0_i, res2_i);
++
++ _mm_store_si128((__m128i*)cPtr, res0_i);
++
++ cPtr += 16;
++ aPtr += 16;
+ }
+- else {
+- *cPtr++ = 0;
++
++ for (number = n16points * 16; number < num_points; number++) {
++ if (*aPtr++ >= 0) {
++ *cPtr++ = 1;
++ } else {
++ *cPtr++ = 0;
++ }
+ }
+- }
+ }
+ #endif /* LV_HAVE_SSE2 */
+
+
+-
+ #ifdef LV_HAVE_SSE2
+ #include <emmintrin.h>
+
+-static inline void
+-volk_32f_binary_slicer_8i_u_sse2(int8_t* cVector, const float* aVector,
+- unsigned int num_points)
++static inline void volk_32f_binary_slicer_8i_u_sse2(int8_t* cVector,
++ const float* aVector,
++ unsigned int num_points)
+ {
+- int8_t* cPtr = cVector;
+- const float* aPtr = aVector;
+- unsigned int number = 0;
+-
+- unsigned int n16points = num_points / 16;
+- __m128 a0_val, a1_val, a2_val, a3_val;
+- __m128 res0_f, res1_f, res2_f, res3_f;
+- __m128i res0_i, res1_i, res2_i, res3_i;
+- __m128 zero_val;
+- zero_val = _mm_set1_ps (0.0f);
+-
+- for(number = 0; number < n16points; number++) {
+- a0_val = _mm_loadu_ps(aPtr);
+- a1_val = _mm_loadu_ps(aPtr+4);
+- a2_val = _mm_loadu_ps(aPtr+8);
+- a3_val = _mm_loadu_ps(aPtr+12);
+-
+- // compare >= 0; return float
+- res0_f = _mm_cmpge_ps(a0_val, zero_val);
+- res1_f = _mm_cmpge_ps(a1_val, zero_val);
+- res2_f = _mm_cmpge_ps(a2_val, zero_val);
+- res3_f = _mm_cmpge_ps(a3_val, zero_val);
+-
+- // convert to 32i and >> 31
+- res0_i = _mm_srli_epi32(_mm_cvtps_epi32(res0_f), 31);
+- res1_i = _mm_srli_epi32(_mm_cvtps_epi32(res1_f), 31);
+- res2_i = _mm_srli_epi32(_mm_cvtps_epi32(res2_f), 31);
+- res3_i = _mm_srli_epi32(_mm_cvtps_epi32(res3_f), 31);
+-
+- // pack into 16-bit results
+- res0_i = _mm_packs_epi32(res0_i, res1_i);
+- res2_i = _mm_packs_epi32(res2_i, res3_i);
+-
+- // pack into 8-bit results
+- res0_i = _mm_packs_epi16(res0_i, res2_i);
+-
+- _mm_storeu_si128((__m128i*)cPtr, res0_i);
+-
+- cPtr += 16;
+- aPtr += 16;
+- }
+-
+- for(number = n16points * 16; number < num_points; number++) {
+- if( *aPtr++ >= 0) {
+- *cPtr++ = 1;
++ int8_t* cPtr = cVector;
++ const float* aPtr = aVector;
++ unsigned int number = 0;
++
++ unsigned int n16points = num_points / 16;
++ __m128 a0_val, a1_val, a2_val, a3_val;
++ __m128 res0_f, res1_f, res2_f, res3_f;
++ __m128i res0_i, res1_i, res2_i, res3_i;
++ __m128 zero_val;
++ zero_val = _mm_set1_ps(0.0f);
++
++ for (number = 0; number < n16points; number++) {
++ a0_val = _mm_loadu_ps(aPtr);
++ a1_val = _mm_loadu_ps(aPtr + 4);
++ a2_val = _mm_loadu_ps(aPtr + 8);
++ a3_val = _mm_loadu_ps(aPtr + 12);
++
++ // compare >= 0; return float
++ res0_f = _mm_cmpge_ps(a0_val, zero_val);
++ res1_f = _mm_cmpge_ps(a1_val, zero_val);
++ res2_f = _mm_cmpge_ps(a2_val, zero_val);
++ res3_f = _mm_cmpge_ps(a3_val, zero_val);
++
++ // convert to 32i and >> 31
++ res0_i = _mm_srli_epi32(_mm_cvtps_epi32(res0_f), 31);
++ res1_i = _mm_srli_epi32(_mm_cvtps_epi32(res1_f), 31);
++ res2_i = _mm_srli_epi32(_mm_cvtps_epi32(res2_f), 31);
++ res3_i = _mm_srli_epi32(_mm_cvtps_epi32(res3_f), 31);
++
++ // pack into 16-bit results
++ res0_i = _mm_packs_epi32(res0_i, res1_i);
++ res2_i = _mm_packs_epi32(res2_i, res3_i);
++
++ // pack into 8-bit results
++ res0_i = _mm_packs_epi16(res0_i, res2_i);
++
++ _mm_storeu_si128((__m128i*)cPtr, res0_i);
++
++ cPtr += 16;
++ aPtr += 16;
+ }
+- else {
+- *cPtr++ = 0;
++
++ for (number = n16points * 16; number < num_points; number++) {
++ if (*aPtr++ >= 0) {
++ *cPtr++ = 1;
++ } else {
++ *cPtr++ = 0;
++ }
+ }
+- }
+ }
+ #endif /* LV_HAVE_SSE2 */
+
+@@ -394,74 +444,72 @@ volk_32f_binary_slicer_8i_u_sse2(int8_t* cVector, const float* aVector,
+ #ifdef LV_HAVE_NEON
+ #include <arm_neon.h>
+
+-static inline void
+-volk_32f_binary_slicer_8i_neon(int8_t* cVector, const float* aVector,
+- unsigned int num_points)
++static inline void volk_32f_binary_slicer_8i_neon(int8_t* cVector,
++ const float* aVector,
++ unsigned int num_points)
+ {
+- int8_t* cPtr = cVector;
+- const float* aPtr = aVector;
+- unsigned int number = 0;
+- unsigned int n16points = num_points / 16;
+-
+- float32x4x2_t input_val0, input_val1;
+- float32x4_t zero_val;
+- uint32x4x2_t res0_u32, res1_u32;
+- uint16x4x2_t res0_u16x4, res1_u16x4;
+- uint16x8x2_t res_u16x8;
+- uint8x8x2_t res_u8;
+- uint8x8_t one;
+-
+- zero_val = vdupq_n_f32(0.0);
+- one = vdup_n_u8(0x01);
+-
+- // TODO: this is a good candidate for asm because the vcombines
+- // can be eliminated simply by picking dst registers that are
+- // adjacent.
+- for(number = 0; number < n16points; number++) {
+- input_val0 = vld2q_f32(aPtr);
+- input_val1 = vld2q_f32(aPtr+8);
+-
+- // test against 0; return uint32
+- res0_u32.val[0] = vcgeq_f32(input_val0.val[0], zero_val);
+- res0_u32.val[1] = vcgeq_f32(input_val0.val[1], zero_val);
+- res1_u32.val[0] = vcgeq_f32(input_val1.val[0], zero_val);
+- res1_u32.val[1] = vcgeq_f32(input_val1.val[1], zero_val);
+-
+- // narrow uint32 -> uint16 followed by combine to 8-element vectors
+- res0_u16x4.val[0] = vmovn_u32(res0_u32.val[0]);
+- res0_u16x4.val[1] = vmovn_u32(res0_u32.val[1]);
+- res1_u16x4.val[0] = vmovn_u32(res1_u32.val[0]);
+- res1_u16x4.val[1] = vmovn_u32(res1_u32.val[1]);
+-
+- res_u16x8.val[0] = vcombine_u16(res0_u16x4.val[0], res1_u16x4.val[0]);
+- res_u16x8.val[1] = vcombine_u16(res0_u16x4.val[1], res1_u16x4.val[1]);
+-
+- // narrow uint16x8 -> uint8x8
+- res_u8.val[0] = vmovn_u16(res_u16x8.val[0]);
+- res_u8.val[1] = vmovn_u16(res_u16x8.val[1]);
+- // we *could* load twice as much data and do another vcombine here
+- // to get a uint8x16x2 vector, still only do 2 vandqs and a single store
+- // but that turns out to be ~16% slower than this version on zc702
+- // it's possible register contention in GCC scheduler slows it down
+- // and a hand-written asm with quad-word u8 registers is much faster.
+-
+- res_u8.val[0] = vand_u8(one, res_u8.val[0]);
+- res_u8.val[1] = vand_u8(one, res_u8.val[1]);
+-
+- vst2_u8((unsigned char*)cPtr, res_u8);
+- cPtr += 16;
+- aPtr += 16;
+-
+- }
+-
+- for(number = n16points * 16; number < num_points; number++) {
+- if(*aPtr++ >= 0) {
+- *cPtr++ = 1;
++ int8_t* cPtr = cVector;
++ const float* aPtr = aVector;
++ unsigned int number = 0;
++ unsigned int n16points = num_points / 16;
++
++ float32x4x2_t input_val0, input_val1;
++ float32x4_t zero_val;
++ uint32x4x2_t res0_u32, res1_u32;
++ uint16x4x2_t res0_u16x4, res1_u16x4;
++ uint16x8x2_t res_u16x8;
++ uint8x8x2_t res_u8;
++ uint8x8_t one;
++
++ zero_val = vdupq_n_f32(0.0);
++ one = vdup_n_u8(0x01);
++
++ // TODO: this is a good candidate for asm because the vcombines
++ // can be eliminated simply by picking dst registers that are
++ // adjacent.
++ for (number = 0; number < n16points; number++) {
++ input_val0 = vld2q_f32(aPtr);
++ input_val1 = vld2q_f32(aPtr + 8);
++
++ // test against 0; return uint32
++ res0_u32.val[0] = vcgeq_f32(input_val0.val[0], zero_val);
++ res0_u32.val[1] = vcgeq_f32(input_val0.val[1], zero_val);
++ res1_u32.val[0] = vcgeq_f32(input_val1.val[0], zero_val);
++ res1_u32.val[1] = vcgeq_f32(input_val1.val[1], zero_val);
++
++ // narrow uint32 -> uint16 followed by combine to 8-element vectors
++ res0_u16x4.val[0] = vmovn_u32(res0_u32.val[0]);
++ res0_u16x4.val[1] = vmovn_u32(res0_u32.val[1]);
++ res1_u16x4.val[0] = vmovn_u32(res1_u32.val[0]);
++ res1_u16x4.val[1] = vmovn_u32(res1_u32.val[1]);
++
++ res_u16x8.val[0] = vcombine_u16(res0_u16x4.val[0], res1_u16x4.val[0]);
++ res_u16x8.val[1] = vcombine_u16(res0_u16x4.val[1], res1_u16x4.val[1]);
++
++ // narrow uint16x8 -> uint8x8
++ res_u8.val[0] = vmovn_u16(res_u16x8.val[0]);
++ res_u8.val[1] = vmovn_u16(res_u16x8.val[1]);
++ // we *could* load twice as much data and do another vcombine here
++ // to get a uint8x16x2 vector, still only do 2 vandqs and a single store
++ // but that turns out to be ~16% slower than this version on zc702
++ // it's possible register contention in GCC scheduler slows it down
++ // and a hand-written asm with quad-word u8 registers is much faster.
++
++ res_u8.val[0] = vand_u8(one, res_u8.val[0]);
++ res_u8.val[1] = vand_u8(one, res_u8.val[1]);
++
++ vst2_u8((unsigned char*)cPtr, res_u8);
++ cPtr += 16;
++ aPtr += 16;
+ }
+- else {
+- *cPtr++ = 0;
++
++ for (number = n16points * 16; number < num_points; number++) {
++ if (*aPtr++ >= 0) {
++ *cPtr++ = 1;
++ } else {
++ *cPtr++ = 0;
++ }
+ }
+- }
+ }
+ #endif /* LV_HAVE_NEON */
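
For callers, all of the slicer branches above are reached through the un-suffixed dispatcher, which selects an aligned or unaligned implementation based on the buffers it is given. A usage sketch (buffer size and test values are arbitrary):

    #include <stdint.h>
    #include <volk/volk.h>

    void slice_example(void)
    {
        const unsigned int N = 1024;
        size_t alignment = volk_get_alignment();
        float* in = (float*)volk_malloc(N * sizeof(float), alignment);
        int8_t* out = (int8_t*)volk_malloc(N * sizeof(int8_t), alignment);

        for (unsigned int i = 0; i < N; i++)
            in[i] = (float)i - 512.0f; /* arbitrary test data */

        /* aligned buffers from volk_malloc let the dispatcher pick the
         * _a_ kernels reformatted above */
        volk_32f_binary_slicer_8i(out, in, N);

        volk_free(in);
        volk_free(out);
    }
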
+
+diff --git a/kernels/volk/volk_32f_convert_64f.h b/kernels/volk/volk_32f_convert_64f.h
+index bf57e3a..d2e3f8a 100644
+--- a/kernels/volk/volk_32f_convert_64f.h
++++ b/kernels/volk/volk_32f_convert_64f.h
+@@ -29,8 +29,8 @@
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+- * void volk_32f_convert_64f(double* outputVector, const float* inputVector, unsigned int num_points)
+- * \endcode
++ * void volk_32f_convert_64f(double* outputVector, const float* inputVector, unsigned int
++ * num_points)
++ * \endcode
+ *
+ * \b Inputs
+ * \li inputVector: The vector of floats to convert to doubles.
+@@ -72,29 +72,33 @@
+ #ifdef LV_HAVE_AVX
+ #include <immintrin.h>
+
+-static inline void volk_32f_convert_64f_u_avx(double* outputVector, const float* inputVector, unsigned int num_points){
+- unsigned int number = 0;
++static inline void volk_32f_convert_64f_u_avx(double* outputVector,
++ const float* inputVector,
++ unsigned int num_points)
++{
++ unsigned int number = 0;
+
+- const unsigned int quarterPoints = num_points / 4;
++ const unsigned int quarterPoints = num_points / 4;
+
+- const float* inputVectorPtr = (const float*)inputVector;
+- double* outputVectorPtr = outputVector;
+- __m256d ret;
+- __m128 inputVal;
++ const float* inputVectorPtr = (const float*)inputVector;
++ double* outputVectorPtr = outputVector;
++ __m256d ret;
++ __m128 inputVal;
+
+- for(;number < quarterPoints; number++){
+- inputVal = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
++ for (; number < quarterPoints; number++) {
++ inputVal = _mm_loadu_ps(inputVectorPtr);
++ inputVectorPtr += 4;
+
+- ret = _mm256_cvtps_pd(inputVal);
+- _mm256_storeu_pd(outputVectorPtr, ret);
++ ret = _mm256_cvtps_pd(inputVal);
++ _mm256_storeu_pd(outputVectorPtr, ret);
+
+- outputVectorPtr += 4;
+- }
++ outputVectorPtr += 4;
++ }
+
+- number = quarterPoints * 4;
+- for(; number < num_points; number++){
+- outputVector[number] = (double)(inputVector[number]);
+- }
++ number = quarterPoints * 4;
++ for (; number < num_points; number++) {
++ outputVector[number] = (double)(inputVector[number]);
++ }
+ }
+
+ #endif /* LV_HAVE_AVX */
+@@ -102,56 +106,61 @@ static inline void volk_32f_convert_64f_u_avx(double* outputVector, const float*
+ #ifdef LV_HAVE_SSE2
+ #include <emmintrin.h>
+
+-static inline void volk_32f_convert_64f_u_sse2(double* outputVector, const float* inputVector, unsigned int num_points){
+- unsigned int number = 0;
++static inline void volk_32f_convert_64f_u_sse2(double* outputVector,
++ const float* inputVector,
++ unsigned int num_points)
++{
++ unsigned int number = 0;
+
+- const unsigned int quarterPoints = num_points / 4;
++ const unsigned int quarterPoints = num_points / 4;
+
+- const float* inputVectorPtr = (const float*)inputVector;
+- double* outputVectorPtr = outputVector;
+- __m128d ret;
+- __m128 inputVal;
++ const float* inputVectorPtr = (const float*)inputVector;
++ double* outputVectorPtr = outputVector;
++ __m128d ret;
++ __m128 inputVal;
+
+- for(;number < quarterPoints; number++){
+- inputVal = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
++ for (; number < quarterPoints; number++) {
++ inputVal = _mm_loadu_ps(inputVectorPtr);
++ inputVectorPtr += 4;
+
+- ret = _mm_cvtps_pd(inputVal);
++ ret = _mm_cvtps_pd(inputVal);
+
+- _mm_storeu_pd(outputVectorPtr, ret);
+- outputVectorPtr += 2;
++ _mm_storeu_pd(outputVectorPtr, ret);
++ outputVectorPtr += 2;
+
+- inputVal = _mm_movehl_ps(inputVal, inputVal);
++ inputVal = _mm_movehl_ps(inputVal, inputVal);
+
+- ret = _mm_cvtps_pd(inputVal);
++ ret = _mm_cvtps_pd(inputVal);
+
+- _mm_storeu_pd(outputVectorPtr, ret);
+- outputVectorPtr += 2;
+- }
++ _mm_storeu_pd(outputVectorPtr, ret);
++ outputVectorPtr += 2;
++ }
+
+- number = quarterPoints * 4;
+- for(; number < num_points; number++){
+- outputVector[number] = (double)(inputVector[number]);
+- }
++ number = quarterPoints * 4;
++ for (; number < num_points; number++) {
++ outputVector[number] = (double)(inputVector[number]);
++ }
+ }
+ #endif /* LV_HAVE_SSE2 */
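
The SSE2 converter widens four floats per iteration: _mm_cvtps_pd converts the two low lanes, and _mm_movehl_ps moves the two high lanes down so the same conversion can be reused. A scalar sketch of one iteration (hypothetical, for reference only):

    /* Hypothetical scalar equivalent of one SSE2 loop iteration above. */
    static void convert4_ref(double* out, const float* in)
    {
        out[0] = (double)in[0]; /* first _mm_cvtps_pd (low two lanes)     */
        out[1] = (double)in[1];
        out[2] = (double)in[2]; /* _mm_movehl_ps, then second _mm_cvtps_pd */
        out[3] = (double)in[3];
    }
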
+
+
+ #ifdef LV_HAVE_GENERIC
+
+-static inline void volk_32f_convert_64f_generic(double* outputVector, const float* inputVector, unsigned int num_points){
+- double* outputVectorPtr = outputVector;
+- const float* inputVectorPtr = inputVector;
+- unsigned int number = 0;
+-
+- for(number = 0; number < num_points; number++){
+- *outputVectorPtr++ = ((double)(*inputVectorPtr++));
+- }
++static inline void volk_32f_convert_64f_generic(double* outputVector,
++ const float* inputVector,
++ unsigned int num_points)
++{
++ double* outputVectorPtr = outputVector;
++ const float* inputVectorPtr = inputVector;
++ unsigned int number = 0;
++
++ for (number = 0; number < num_points; number++) {
++ *outputVectorPtr++ = ((double)(*inputVectorPtr++));
++ }
+ }
+ #endif /* LV_HAVE_GENERIC */
+
+
+-
+-
+ #endif /* INCLUDED_volk_32f_convert_64f_u_H */
+
+
+@@ -164,83 +173,92 @@ static inline void volk_32f_convert_64f_generic(double* outputVector, const floa
+ #ifdef LV_HAVE_AVX
+ #include <immintrin.h>
+
+-static inline void volk_32f_convert_64f_a_avx(double* outputVector, const float* inputVector, unsigned int num_points){
+- unsigned int number = 0;
++static inline void volk_32f_convert_64f_a_avx(double* outputVector,
++ const float* inputVector,
++ unsigned int num_points)
++{
++ unsigned int number = 0;
+
+- const unsigned int quarterPoints = num_points / 4;
++ const unsigned int quarterPoints = num_points / 4;
+
+- const float* inputVectorPtr = (const float*)inputVector;
+- double* outputVectorPtr = outputVector;
+- __m256d ret;
+- __m128 inputVal;
++ const float* inputVectorPtr = (const float*)inputVector;
++ double* outputVectorPtr = outputVector;
++ __m256d ret;
++ __m128 inputVal;
+
+- for(;number < quarterPoints; number++){
+- inputVal = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
++ for (; number < quarterPoints; number++) {
++ inputVal = _mm_load_ps(inputVectorPtr);
++ inputVectorPtr += 4;
+
+- ret = _mm256_cvtps_pd(inputVal);
+- _mm256_store_pd(outputVectorPtr, ret);
++ ret = _mm256_cvtps_pd(inputVal);
++ _mm256_store_pd(outputVectorPtr, ret);
+
+- outputVectorPtr += 4;
+- }
++ outputVectorPtr += 4;
++ }
+
+- number = quarterPoints * 4;
+- for(; number < num_points; number++){
+- outputVector[number] = (double)(inputVector[number]);
+- }
++ number = quarterPoints * 4;
++ for (; number < num_points; number++) {
++ outputVector[number] = (double)(inputVector[number]);
++ }
+ }
+ #endif /* LV_HAVE_AVX */
+
+ #ifdef LV_HAVE_SSE2
+ #include <emmintrin.h>
+
+-static inline void volk_32f_convert_64f_a_sse2(double* outputVector, const float* inputVector, unsigned int num_points){
+- unsigned int number = 0;
++static inline void volk_32f_convert_64f_a_sse2(double* outputVector,
++ const float* inputVector,
++ unsigned int num_points)
++{
++ unsigned int number = 0;
+
+- const unsigned int quarterPoints = num_points / 4;
++ const unsigned int quarterPoints = num_points / 4;
+
+- const float* inputVectorPtr = (const float*)inputVector;
+- double* outputVectorPtr = outputVector;
+- __m128d ret;
+- __m128 inputVal;
++ const float* inputVectorPtr = (const float*)inputVector;
++ double* outputVectorPtr = outputVector;
++ __m128d ret;
++ __m128 inputVal;
+
+- for(;number < quarterPoints; number++){
+- inputVal = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
++ for (; number < quarterPoints; number++) {
++ inputVal = _mm_load_ps(inputVectorPtr);
++ inputVectorPtr += 4;
+
+- ret = _mm_cvtps_pd(inputVal);
++ ret = _mm_cvtps_pd(inputVal);
+
+- _mm_store_pd(outputVectorPtr, ret);
+- outputVectorPtr += 2;
++ _mm_store_pd(outputVectorPtr, ret);
++ outputVectorPtr += 2;
+
+- inputVal = _mm_movehl_ps(inputVal, inputVal);
++ inputVal = _mm_movehl_ps(inputVal, inputVal);
+
+- ret = _mm_cvtps_pd(inputVal);
++ ret = _mm_cvtps_pd(inputVal);
+
+- _mm_store_pd(outputVectorPtr, ret);
+- outputVectorPtr += 2;
+- }
++ _mm_store_pd(outputVectorPtr, ret);
++ outputVectorPtr += 2;
++ }
+
+- number = quarterPoints * 4;
+- for(; number < num_points; number++){
+- outputVector[number] = (double)(inputVector[number]);
+- }
++ number = quarterPoints * 4;
++ for (; number < num_points; number++) {
++ outputVector[number] = (double)(inputVector[number]);
++ }
+ }
+ #endif /* LV_HAVE_SSE2 */
+
+
+ #ifdef LV_HAVE_GENERIC
+
+-static inline void volk_32f_convert_64f_a_generic(double* outputVector, const float* inputVector, unsigned int num_points){
+- double* outputVectorPtr = outputVector;
+- const float* inputVectorPtr = inputVector;
+- unsigned int number = 0;
+-
+- for(number = 0; number < num_points; number++){
+- *outputVectorPtr++ = ((double)(*inputVectorPtr++));
+- }
++static inline void volk_32f_convert_64f_a_generic(double* outputVector,
++ const float* inputVector,
++ unsigned int num_points)
++{
++ double* outputVectorPtr = outputVector;
++ const float* inputVectorPtr = inputVector;
++ unsigned int number = 0;
++
++ for (number = 0; number < num_points; number++) {
++ *outputVectorPtr++ = ((double)(*inputVectorPtr++));
++ }
+ }
+ #endif /* LV_HAVE_GENERIC */
+
+
+-
+-
+ #endif /* INCLUDED_volk_32f_convert_64f_a_H */
+diff --git a/kernels/volk/volk_32f_cos_32f.h b/kernels/volk/volk_32f_cos_32f.h
+index 39c2008..b493764 100644
+--- a/kernels/volk/volk_32f_cos_32f.h
++++ b/kernels/volk/volk_32f_cos_32f.h
+@@ -69,9 +69,9 @@
+ * \endcode
+ */
+
+-#include <stdio.h>
+-#include <math.h>
+ #include <inttypes.h>
++#include <math.h>
++#include <stdio.h>
+
+ #ifndef INCLUDED_volk_32f_cos_32f_a_H
+ #define INCLUDED_volk_32f_cos_32f_a_H
+@@ -80,86 +80,102 @@
+ #include <immintrin.h>
+
+ static inline void
+- volk_32f_cos_32f_a_avx2_fma(float* bVector, const float* aVector, unsigned int num_points)
++volk_32f_cos_32f_a_avx2_fma(float* bVector, const float* aVector, unsigned int num_points)
+ {
+- float* bPtr = bVector;
+- const float* aPtr = aVector;
+-
+- unsigned int number = 0;
+- unsigned int eighthPoints = num_points / 8;
+- unsigned int i = 0;
+-
+- __m256 aVal, s, r, m4pi, pio4A, pio4B, pio4C, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, fzeroes;
+- __m256 sine, cosine;
+- __m256i q, ones, twos, fours;
+-
+- m4pi = _mm256_set1_ps(1.273239544735162542821171882678754627704620361328125);
+- pio4A = _mm256_set1_ps(0.7853981554508209228515625);
+- pio4B = _mm256_set1_ps(0.794662735614792836713604629039764404296875e-8);
+- pio4C = _mm256_set1_ps(0.306161699786838294306516483068750264552437361480769e-16);
+- ffours = _mm256_set1_ps(4.0);
+- ftwos = _mm256_set1_ps(2.0);
+- fones = _mm256_set1_ps(1.0);
+- fzeroes = _mm256_setzero_ps();
+- __m256i zeroes = _mm256_set1_epi32(0);
+- ones = _mm256_set1_epi32(1);
+- __m256i allones = _mm256_set1_epi32(0xffffffff);
+- twos = _mm256_set1_epi32(2);
+- fours = _mm256_set1_epi32(4);
+-
+- cp1 = _mm256_set1_ps(1.0);
+- cp2 = _mm256_set1_ps(0.08333333333333333);
+- cp3 = _mm256_set1_ps(0.002777777777777778);
+- cp4 = _mm256_set1_ps(4.96031746031746e-05);
+- cp5 = _mm256_set1_ps(5.511463844797178e-07);
+- union bit256 condition1;
+- union bit256 condition3;
+-
+- for(;number < eighthPoints; number++){
+-
+- aVal = _mm256_load_ps(aPtr);
+- // s = fabs(aVal)
+- s = _mm256_sub_ps(aVal, _mm256_and_ps(_mm256_mul_ps(aVal, ftwos), _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
+- // q = (int) (s * (4/pi)), floor(aVal / (pi/4))
+- q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
+- // r = q + q&1, q indicates quadrant, r gives
+- r = _mm256_cvtepi32_ps(_mm256_add_epi32(q, _mm256_and_si256(q, ones)));
+-
+- s = _mm256_fnmadd_ps(r,pio4A,s);
+- s = _mm256_fnmadd_ps(r,pio4B,s);
+- s = _mm256_fnmadd_ps(r,pio4C,s);
+-
+- s = _mm256_div_ps(s, _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
+- s = _mm256_mul_ps(s, s);
+- // Evaluate Taylor series
+- s = _mm256_mul_ps(_mm256_fmadd_ps(_mm256_fmsub_ps(_mm256_fmadd_ps(_mm256_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2), s, cp1), s);
+-
+- for(i = 0; i < 3; i++)
+- s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
+- s = _mm256_div_ps(s, ftwos);
+-
+- sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
+- cosine = _mm256_sub_ps(fones, s);
+-
+- // if(((q+1)&2) != 0) { cosine=sine;}
+- condition1.int_vec = _mm256_cmpeq_epi32(_mm256_and_si256(_mm256_add_epi32(q, ones), twos), zeroes);
+- condition1.int_vec = _mm256_xor_si256(allones, condition1.int_vec);
+-
+- // if(((q+2)&4) != 0) { cosine = -cosine;}
+- condition3.int_vec = _mm256_cmpeq_epi32(_mm256_and_si256(_mm256_add_epi32(q, twos), fours), zeroes);
+- condition3.int_vec = _mm256_xor_si256(allones, condition3.int_vec);
+-
+- cosine = _mm256_add_ps(cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1.float_vec));
+- cosine = _mm256_sub_ps(cosine, _mm256_and_ps(_mm256_mul_ps(cosine, _mm256_set1_ps(2.0f)), condition3.float_vec));
+- _mm256_store_ps(bPtr, cosine);
+- aPtr += 8;
+- bPtr += 8;
+- }
+-
+- number = eighthPoints * 8;
+- for(;number < num_points; number++){
+- *bPtr++ = cos(*aPtr++);
+- }
++ float* bPtr = bVector;
++ const float* aPtr = aVector;
++
++ unsigned int number = 0;
++ unsigned int eighthPoints = num_points / 8;
++ unsigned int i = 0;
++
++ __m256 aVal, s, r, m4pi, pio4A, pio4B, pio4C, cp1, cp2, cp3, cp4, cp5, ffours, ftwos,
++ fones, fzeroes;
++ __m256 sine, cosine;
++ __m256i q, ones, twos, fours;
++
++ m4pi = _mm256_set1_ps(1.273239544735162542821171882678754627704620361328125);
++ pio4A = _mm256_set1_ps(0.7853981554508209228515625);
++ pio4B = _mm256_set1_ps(0.794662735614792836713604629039764404296875e-8);
++ pio4C = _mm256_set1_ps(0.306161699786838294306516483068750264552437361480769e-16);
++ ffours = _mm256_set1_ps(4.0);
++ ftwos = _mm256_set1_ps(2.0);
++ fones = _mm256_set1_ps(1.0);
++ fzeroes = _mm256_setzero_ps();
++ __m256i zeroes = _mm256_set1_epi32(0);
++ ones = _mm256_set1_epi32(1);
++ __m256i allones = _mm256_set1_epi32(0xffffffff);
++ twos = _mm256_set1_epi32(2);
++ fours = _mm256_set1_epi32(4);
++
++ cp1 = _mm256_set1_ps(1.0);
++ cp2 = _mm256_set1_ps(0.08333333333333333);
++ cp3 = _mm256_set1_ps(0.002777777777777778);
++ cp4 = _mm256_set1_ps(4.96031746031746e-05);
++ cp5 = _mm256_set1_ps(5.511463844797178e-07);
++ union bit256 condition1;
++ union bit256 condition3;
++
++ for (; number < eighthPoints; number++) {
++
++ aVal = _mm256_load_ps(aPtr);
++ // s = fabs(aVal)
++ s = _mm256_sub_ps(aVal,
++ _mm256_and_ps(_mm256_mul_ps(aVal, ftwos),
++ _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
++ // q = (int) (s * (4/pi)), floor(aVal / (pi/4))
++ q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
++ // r = q + q&1, q indicates quadrant, r gives the nearest even multiple of pi/4 to subtract
++ r = _mm256_cvtepi32_ps(_mm256_add_epi32(q, _mm256_and_si256(q, ones)));
++
++ s = _mm256_fnmadd_ps(r, pio4A, s);
++ s = _mm256_fnmadd_ps(r, pio4B, s);
++ s = _mm256_fnmadd_ps(r, pio4C, s);
++
++ s = _mm256_div_ps(
++ s,
++ _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
++ s = _mm256_mul_ps(s, s);
++ // Evaluate Taylor series
++ s = _mm256_mul_ps(
++ _mm256_fmadd_ps(
++ _mm256_fmsub_ps(
++ _mm256_fmadd_ps(_mm256_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2),
++ s,
++ cp1),
++ s);
++
++ for (i = 0; i < 3; i++)
++ s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
++ s = _mm256_div_ps(s, ftwos);
++
++ sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
++ cosine = _mm256_sub_ps(fones, s);
++
++ // if(((q+1)&2) != 0) { cosine=sine;}
++ condition1.int_vec =
++ _mm256_cmpeq_epi32(_mm256_and_si256(_mm256_add_epi32(q, ones), twos), zeroes);
++ condition1.int_vec = _mm256_xor_si256(allones, condition1.int_vec);
++
++ // if(((q+2)&4) != 0) { cosine = -cosine;}
++ condition3.int_vec = _mm256_cmpeq_epi32(
++ _mm256_and_si256(_mm256_add_epi32(q, twos), fours), zeroes);
++ condition3.int_vec = _mm256_xor_si256(allones, condition3.int_vec);
++
++ cosine = _mm256_add_ps(
++ cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1.float_vec));
++ cosine = _mm256_sub_ps(cosine,
++ _mm256_and_ps(_mm256_mul_ps(cosine, _mm256_set1_ps(2.0f)),
++ condition3.float_vec));
++ _mm256_store_ps(bPtr, cosine);
++ aPtr += 8;
++ bPtr += 8;
++ }
++
++ number = eighthPoints * 8;
++ for (; number < num_points; number++) {
++ *bPtr++ = cos(*aPtr++);
++ }
+ }
+
+ #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for aligned */
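
The kernel above implements cos() as: octant reduction of |x|, a short Taylor series for 2*(1 - cos) on a further 1/8-scaled argument, three double-angle steps to undo the scaling, and quadrant fix-ups. A scalar model of the same algorithm, as a sketch only (names are hypothetical, and it uses a single pi/4 constant instead of the split pio4A/B/C used above for extended-precision reduction):

    #include <math.h>

    /* Hypothetical scalar model of the vector algorithm above (not part of VOLK). */
    static float cos_model(float x)
    {
        const float m4pi = 1.2732395447351627f; /* 4/pi                      */
        const float pio4 = 0.7853981633974483f; /* pi/4, single constant here */

        float s = fabsf(x);                 /* cos is even                    */
        int q = (int)floorf(s * m4pi);      /* octant index                   */
        int r = q + (q & 1);                /* round up to even               */
        s -= (float)r * pio4;               /* residual in (-pi/4, pi/4)      */

        s /= 8.0f;                          /* 2^3: prepare 3 double-angle steps */
        s *= s;
        /* Taylor series for 2*(1 - cos(y)) with s = y^2:
         * s*(1 - s/12 + s^2/360 - s^3/20160 + s^4/1814400) */
        s = ((((s / 1814400.f - 1.f / 20160.f) * s + 1.f / 360.f) * s - 1.f / 12.f) * s +
             1.f) * s;

        for (int i = 0; i < 3; i++)         /* if s = 2*(1-cos t), then       */
            s = s * (4.0f - s);             /* s*(4-s) = 2*(1-cos 2t)         */
        s /= 2.0f;                          /* s = 1 - cos(residual)          */

        float sine = sqrtf((2.0f - s) * s); /* |sin(residual)|                */
        float cosine = 1.0f - s;

        if (((q + 1) & 2) != 0)             /* octants where sin(residual) is needed */
            cosine = sine;
        if (((q + 2) & 4) != 0)             /* octants where the sign flips   */
            cosine = -cosine;
        return cosine;
    }
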
+@@ -168,86 +184,109 @@ static inline void
+ #include <immintrin.h>
+
+ static inline void
+- volk_32f_cos_32f_a_avx2(float* bVector, const float* aVector, unsigned int num_points)
++volk_32f_cos_32f_a_avx2(float* bVector, const float* aVector, unsigned int num_points)
+ {
+- float* bPtr = bVector;
+- const float* aPtr = aVector;
+-
+- unsigned int number = 0;
+- unsigned int eighthPoints = num_points / 8;
+- unsigned int i = 0;
+-
+- __m256 aVal, s, r, m4pi, pio4A, pio4B, pio4C, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, fzeroes;
+- __m256 sine, cosine;
+- __m256i q, ones, twos, fours;
+-
+- m4pi = _mm256_set1_ps(1.273239544735162542821171882678754627704620361328125);
+- pio4A = _mm256_set1_ps(0.7853981554508209228515625);
+- pio4B = _mm256_set1_ps(0.794662735614792836713604629039764404296875e-8);
+- pio4C = _mm256_set1_ps(0.306161699786838294306516483068750264552437361480769e-16);
+- ffours = _mm256_set1_ps(4.0);
+- ftwos = _mm256_set1_ps(2.0);
+- fones = _mm256_set1_ps(1.0);
+- fzeroes = _mm256_setzero_ps();
+- __m256i zeroes = _mm256_set1_epi32(0);
+- ones = _mm256_set1_epi32(1);
+- __m256i allones = _mm256_set1_epi32(0xffffffff);
+- twos = _mm256_set1_epi32(2);
+- fours = _mm256_set1_epi32(4);
+-
+- cp1 = _mm256_set1_ps(1.0);
+- cp2 = _mm256_set1_ps(0.08333333333333333);
+- cp3 = _mm256_set1_ps(0.002777777777777778);
+- cp4 = _mm256_set1_ps(4.96031746031746e-05);
+- cp5 = _mm256_set1_ps(5.511463844797178e-07);
+- union bit256 condition1;
+- union bit256 condition3;
+-
+- for(;number < eighthPoints; number++){
+-
+- aVal = _mm256_load_ps(aPtr);
+- // s = fabs(aVal)
+- s = _mm256_sub_ps(aVal, _mm256_and_ps(_mm256_mul_ps(aVal, ftwos), _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
+- // q = (int) (s * (4/pi)), floor(aVal / (pi/4))
+- q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
+- // r = q + q&1, q indicates quadrant, r gives
+- r = _mm256_cvtepi32_ps(_mm256_add_epi32(q, _mm256_and_si256(q, ones)));
+-
+- s = _mm256_sub_ps(s, _mm256_mul_ps(r,pio4A));
+- s = _mm256_sub_ps(s, _mm256_mul_ps(r,pio4B));
+- s = _mm256_sub_ps(s, _mm256_mul_ps(r,pio4C));
+-
+- s = _mm256_div_ps(s, _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
+- s = _mm256_mul_ps(s, s);
+- // Evaluate Taylor series
+- s = _mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(s, cp5), cp4), s), cp3), s), cp2), s), cp1), s);
+-
+- for(i = 0; i < 3; i++)
+- s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
+- s = _mm256_div_ps(s, ftwos);
+-
+- sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
+- cosine = _mm256_sub_ps(fones, s);
+-
+- // if(((q+1)&2) != 0) { cosine=sine;}
+- condition1.int_vec = _mm256_cmpeq_epi32(_mm256_and_si256(_mm256_add_epi32(q, ones), twos), zeroes);
+- condition1.int_vec = _mm256_xor_si256(allones, condition1.int_vec);
+-
+- // if(((q+2)&4) != 0) { cosine = -cosine;}
+- condition3.int_vec = _mm256_cmpeq_epi32(_mm256_and_si256(_mm256_add_epi32(q, twos), fours), zeroes);
+- condition3.int_vec = _mm256_xor_si256(allones, condition3.int_vec);
+-
+- cosine = _mm256_add_ps(cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1.float_vec));
+- cosine = _mm256_sub_ps(cosine, _mm256_and_ps(_mm256_mul_ps(cosine, _mm256_set1_ps(2.0f)), condition3.float_vec));
+- _mm256_store_ps(bPtr, cosine);
+- aPtr += 8;
+- bPtr += 8;
+- }
+-
+- number = eighthPoints * 8;
+- for(;number < num_points; number++){
+- *bPtr++ = cos(*aPtr++);
+- }
++ float* bPtr = bVector;
++ const float* aPtr = aVector;
++
++ unsigned int number = 0;
++ unsigned int eighthPoints = num_points / 8;
++ unsigned int i = 0;
++
++ __m256 aVal, s, r, m4pi, pio4A, pio4B, pio4C, cp1, cp2, cp3, cp4, cp5, ffours, ftwos,
++ fones, fzeroes;
++ __m256 sine, cosine;
++ __m256i q, ones, twos, fours;
++
++ m4pi = _mm256_set1_ps(1.273239544735162542821171882678754627704620361328125);
++ pio4A = _mm256_set1_ps(0.7853981554508209228515625);
++ pio4B = _mm256_set1_ps(0.794662735614792836713604629039764404296875e-8);
++ pio4C = _mm256_set1_ps(0.306161699786838294306516483068750264552437361480769e-16);
++ ffours = _mm256_set1_ps(4.0);
++ ftwos = _mm256_set1_ps(2.0);
++ fones = _mm256_set1_ps(1.0);
++ fzeroes = _mm256_setzero_ps();
++ __m256i zeroes = _mm256_set1_epi32(0);
++ ones = _mm256_set1_epi32(1);
++ __m256i allones = _mm256_set1_epi32(0xffffffff);
++ twos = _mm256_set1_epi32(2);
++ fours = _mm256_set1_epi32(4);
++
++ cp1 = _mm256_set1_ps(1.0);
++ cp2 = _mm256_set1_ps(0.08333333333333333);
++ cp3 = _mm256_set1_ps(0.002777777777777778);
++ cp4 = _mm256_set1_ps(4.96031746031746e-05);
++ cp5 = _mm256_set1_ps(5.511463844797178e-07);
++ union bit256 condition1;
++ union bit256 condition3;
++
++ for (; number < eighthPoints; number++) {
++
++ aVal = _mm256_load_ps(aPtr);
++ // s = fabs(aVal)
++ s = _mm256_sub_ps(aVal,
++ _mm256_and_ps(_mm256_mul_ps(aVal, ftwos),
++ _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
++ // q = (int) (s * (4/pi)), floor(aVal / (pi/4))
++ q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
++ // r = q + q&1, q indicates quadrant, r gives the nearest even multiple of pi/4 to subtract
++ r = _mm256_cvtepi32_ps(_mm256_add_epi32(q, _mm256_and_si256(q, ones)));
++
++ s = _mm256_sub_ps(s, _mm256_mul_ps(r, pio4A));
++ s = _mm256_sub_ps(s, _mm256_mul_ps(r, pio4B));
++ s = _mm256_sub_ps(s, _mm256_mul_ps(r, pio4C));
++
++ s = _mm256_div_ps(
++ s,
++ _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
++ s = _mm256_mul_ps(s, s);
++ // Evaluate Taylor series
++ s = _mm256_mul_ps(
++ _mm256_add_ps(
++ _mm256_mul_ps(
++ _mm256_sub_ps(
++ _mm256_mul_ps(
++ _mm256_add_ps(
++ _mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(s, cp5), cp4),
++ s),
++ cp3),
++ s),
++ cp2),
++ s),
++ cp1),
++ s);
++
++ for (i = 0; i < 3; i++)
++ s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
++ s = _mm256_div_ps(s, ftwos);
++
++ sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
++ cosine = _mm256_sub_ps(fones, s);
++
++ // if(((q+1)&2) != 0) { cosine=sine;}
++ condition1.int_vec =
++ _mm256_cmpeq_epi32(_mm256_and_si256(_mm256_add_epi32(q, ones), twos), zeroes);
++ condition1.int_vec = _mm256_xor_si256(allones, condition1.int_vec);
++
++ // if(((q+2)&4) != 0) { cosine = -cosine;}
++ condition3.int_vec = _mm256_cmpeq_epi32(
++ _mm256_and_si256(_mm256_add_epi32(q, twos), fours), zeroes);
++ condition3.int_vec = _mm256_xor_si256(allones, condition3.int_vec);
++
++ cosine = _mm256_add_ps(
++ cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1.float_vec));
++ cosine = _mm256_sub_ps(cosine,
++ _mm256_and_ps(_mm256_mul_ps(cosine, _mm256_set1_ps(2.0f)),
++ condition3.float_vec));
++ _mm256_store_ps(bPtr, cosine);
++ aPtr += 8;
++ bPtr += 8;
++ }
++
++ number = eighthPoints * 8;
++ for (; number < num_points; number++) {
++ *bPtr++ = cos(*aPtr++);
++ }
+ }
+
+ #endif /* LV_HAVE_AVX2 for aligned */
+@@ -256,86 +295,105 @@ static inline void
+ #include <smmintrin.h>
+
+ static inline void
+- volk_32f_cos_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
++volk_32f_cos_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
+ {
+- float* bPtr = bVector;
+- const float* aPtr = aVector;
+-
+- unsigned int number = 0;
+- unsigned int quarterPoints = num_points / 4;
+- unsigned int i = 0;
+-
+- __m128 aVal, s, r, m4pi, pio4A, pio4B, pio4C, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, fzeroes;
+- __m128 sine, cosine;
+- __m128i q, ones, twos, fours;
+-
+- m4pi = _mm_set1_ps(1.273239544735162542821171882678754627704620361328125);
+- pio4A = _mm_set1_ps(0.7853981554508209228515625);
+- pio4B = _mm_set1_ps(0.794662735614792836713604629039764404296875e-8);
+- pio4C = _mm_set1_ps(0.306161699786838294306516483068750264552437361480769e-16);
+- ffours = _mm_set1_ps(4.0);
+- ftwos = _mm_set1_ps(2.0);
+- fones = _mm_set1_ps(1.0);
+- fzeroes = _mm_setzero_ps();
+- __m128i zeroes = _mm_set1_epi32(0);
+- ones = _mm_set1_epi32(1);
+- __m128i allones = _mm_set1_epi32(0xffffffff);
+- twos = _mm_set1_epi32(2);
+- fours = _mm_set1_epi32(4);
+-
+- cp1 = _mm_set1_ps(1.0);
+- cp2 = _mm_set1_ps(0.08333333333333333);
+- cp3 = _mm_set1_ps(0.002777777777777778);
+- cp4 = _mm_set1_ps(4.96031746031746e-05);
+- cp5 = _mm_set1_ps(5.511463844797178e-07);
+- union bit128 condition1;
+- union bit128 condition3;
+-
+- for(;number < quarterPoints; number++){
+-
+- aVal = _mm_load_ps(aPtr);
+- // s = fabs(aVal)
+- s = _mm_sub_ps(aVal, _mm_and_ps(_mm_mul_ps(aVal, ftwos), _mm_cmplt_ps(aVal, fzeroes)));
+- // q = (int) (s * (4/pi)), floor(aVal / (pi/4))
+- q = _mm_cvtps_epi32(_mm_floor_ps(_mm_mul_ps(s, m4pi)));
+- // r = q + q&1, q indicates quadrant, r gives
+- r = _mm_cvtepi32_ps(_mm_add_epi32(q, _mm_and_si128(q, ones)));
+-
+- s = _mm_sub_ps(s, _mm_mul_ps(r, pio4A));
+- s = _mm_sub_ps(s, _mm_mul_ps(r, pio4B));
+- s = _mm_sub_ps(s, _mm_mul_ps(r, pio4C));
+-
+- s = _mm_div_ps(s, _mm_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
+- s = _mm_mul_ps(s, s);
+- // Evaluate Taylor series
+- s = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s, cp5), cp4), s), cp3), s), cp2), s), cp1), s);
+-
+- for(i = 0; i < 3; i++)
+- s = _mm_mul_ps(s, _mm_sub_ps(ffours, s));
+- s = _mm_div_ps(s, ftwos);
+-
+- sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s));
+- cosine = _mm_sub_ps(fones, s);
+-
+- // if(((q+1)&2) != 0) { cosine=sine;}
+- condition1.int_vec = _mm_cmpeq_epi32(_mm_and_si128(_mm_add_epi32(q, ones), twos), zeroes);
+- condition1.int_vec = _mm_xor_si128(allones, condition1.int_vec);
+-
+- // if(((q+2)&4) != 0) { cosine = -cosine;}
+- condition3.int_vec = _mm_cmpeq_epi32(_mm_and_si128(_mm_add_epi32(q, twos), fours), zeroes);
+- condition3.int_vec = _mm_xor_si128(allones, condition3.int_vec);
+-
+- cosine = _mm_add_ps(cosine, _mm_and_ps(_mm_sub_ps(sine, cosine), condition1.float_vec));
+- cosine = _mm_sub_ps(cosine, _mm_and_ps(_mm_mul_ps(cosine, _mm_set1_ps(2.0f)), condition3.float_vec));
+- _mm_store_ps(bPtr, cosine);
+- aPtr += 4;
+- bPtr += 4;
+- }
+-
+- number = quarterPoints * 4;
+- for(;number < num_points; number++){
+- *bPtr++ = cosf(*aPtr++);
+- }
++ float* bPtr = bVector;
++ const float* aPtr = aVector;
++
++ unsigned int number = 0;
++ unsigned int quarterPoints = num_points / 4;
++ unsigned int i = 0;
++
++ __m128 aVal, s, r, m4pi, pio4A, pio4B, pio4C, cp1, cp2, cp3, cp4, cp5, ffours, ftwos,
++ fones, fzeroes;
++ __m128 sine, cosine;
++ __m128i q, ones, twos, fours;
++
++ m4pi = _mm_set1_ps(1.273239544735162542821171882678754627704620361328125);
++ pio4A = _mm_set1_ps(0.7853981554508209228515625);
++ pio4B = _mm_set1_ps(0.794662735614792836713604629039764404296875e-8);
++ pio4C = _mm_set1_ps(0.306161699786838294306516483068750264552437361480769e-16);
++ ffours = _mm_set1_ps(4.0);
++ ftwos = _mm_set1_ps(2.0);
++ fones = _mm_set1_ps(1.0);
++ fzeroes = _mm_setzero_ps();
++ __m128i zeroes = _mm_set1_epi32(0);
++ ones = _mm_set1_epi32(1);
++ __m128i allones = _mm_set1_epi32(0xffffffff);
++ twos = _mm_set1_epi32(2);
++ fours = _mm_set1_epi32(4);
++
++ cp1 = _mm_set1_ps(1.0);
++ cp2 = _mm_set1_ps(0.08333333333333333);
++ cp3 = _mm_set1_ps(0.002777777777777778);
++ cp4 = _mm_set1_ps(4.96031746031746e-05);
++ cp5 = _mm_set1_ps(5.511463844797178e-07);
++ union bit128 condition1;
++ union bit128 condition3;
++
++ for (; number < quarterPoints; number++) {
++
++ aVal = _mm_load_ps(aPtr);
++ // s = fabs(aVal)
++ s = _mm_sub_ps(aVal,
++ _mm_and_ps(_mm_mul_ps(aVal, ftwos), _mm_cmplt_ps(aVal, fzeroes)));
++ // q = (int) (s * (4/pi)), floor(aVal / (pi/4))
++ q = _mm_cvtps_epi32(_mm_floor_ps(_mm_mul_ps(s, m4pi)));
++ // r = q + q&1, q indicates quadrant, r gives the nearest even multiple of pi/4 to subtract
++ r = _mm_cvtepi32_ps(_mm_add_epi32(q, _mm_and_si128(q, ones)));
++
++ s = _mm_sub_ps(s, _mm_mul_ps(r, pio4A));
++ s = _mm_sub_ps(s, _mm_mul_ps(r, pio4B));
++ s = _mm_sub_ps(s, _mm_mul_ps(r, pio4C));
++
++ s = _mm_div_ps(
++ s, _mm_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
++ s = _mm_mul_ps(s, s);
++ // Evaluate Taylor series
++ s = _mm_mul_ps(
++ _mm_add_ps(
++ _mm_mul_ps(
++ _mm_sub_ps(
++ _mm_mul_ps(
++ _mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s, cp5), cp4), s),
++ cp3),
++ s),
++ cp2),
++ s),
++ cp1),
++ s);
++
++ for (i = 0; i < 3; i++)
++ s = _mm_mul_ps(s, _mm_sub_ps(ffours, s));
++ s = _mm_div_ps(s, ftwos);
++
++ sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s));
++ cosine = _mm_sub_ps(fones, s);
++
++ // if(((q+1)&2) != 0) { cosine=sine;}
++ condition1.int_vec =
++ _mm_cmpeq_epi32(_mm_and_si128(_mm_add_epi32(q, ones), twos), zeroes);
++ condition1.int_vec = _mm_xor_si128(allones, condition1.int_vec);
++
++ // if(((q+2)&4) != 0) { cosine = -cosine;}
++ condition3.int_vec =
++ _mm_cmpeq_epi32(_mm_and_si128(_mm_add_epi32(q, twos), fours), zeroes);
++ condition3.int_vec = _mm_xor_si128(allones, condition3.int_vec);
++
++ cosine = _mm_add_ps(cosine,
++ _mm_and_ps(_mm_sub_ps(sine, cosine), condition1.float_vec));
++ cosine = _mm_sub_ps(
++ cosine,
++ _mm_and_ps(_mm_mul_ps(cosine, _mm_set1_ps(2.0f)), condition3.float_vec));
++ _mm_store_ps(bPtr, cosine);
++ aPtr += 4;
++ bPtr += 4;
++ }
++
++ number = quarterPoints * 4;
++ for (; number < num_points; number++) {
++ *bPtr++ = cosf(*aPtr++);
++ }
+ }
+
+ #endif /* LV_HAVE_SSE4_1 for aligned */
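
All three aligned kernels above share the same approximation, so a quick spot check against libm is a reasonable sanity test after a sweeping reformat like this one. A sketch using the un-suffixed dispatcher (input range and tolerance are arbitrary choices, not documented guarantees):

    #include <assert.h>
    #include <math.h>
    #include <volk/volk.h>

    static void cos_spot_check(void)
    {
        enum { N = 256 };
        float in[N], out[N];
        for (int i = 0; i < N; i++)
            in[i] = -4.0f + 8.0f * (float)i / (float)N; /* arbitrary range */

        volk_32f_cos_32f(out, in, N); /* un-suffixed dispatcher */

        for (int i = 0; i < N; i++)
            assert(fabsf(out[i] - cosf(in[i])) < 1e-4f); /* loose tolerance */
    }
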
+@@ -343,7 +401,6 @@ static inline void
+ #endif /* INCLUDED_volk_32f_cos_32f_a_H */
+
+
+-
+ #ifndef INCLUDED_volk_32f_cos_32f_u_H
+ #define INCLUDED_volk_32f_cos_32f_u_H
+
+@@ -351,86 +408,102 @@ static inline void
+ #include <immintrin.h>
+
+ static inline void
+- volk_32f_cos_32f_u_avx2_fma(float* bVector, const float* aVector, unsigned int num_points)
++volk_32f_cos_32f_u_avx2_fma(float* bVector, const float* aVector, unsigned int num_points)
+ {
+- float* bPtr = bVector;
+- const float* aPtr = aVector;
+-
+- unsigned int number = 0;
+- unsigned int eighthPoints = num_points / 8;
+- unsigned int i = 0;
+-
+- __m256 aVal, s, r, m4pi, pio4A, pio4B, pio4C, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, fzeroes;
+- __m256 sine, cosine;
+- __m256i q, ones, twos, fours;
+-
+- m4pi = _mm256_set1_ps(1.273239544735162542821171882678754627704620361328125);
+- pio4A = _mm256_set1_ps(0.7853981554508209228515625);
+- pio4B = _mm256_set1_ps(0.794662735614792836713604629039764404296875e-8);
+- pio4C = _mm256_set1_ps(0.306161699786838294306516483068750264552437361480769e-16);
+- ffours = _mm256_set1_ps(4.0);
+- ftwos = _mm256_set1_ps(2.0);
+- fones = _mm256_set1_ps(1.0);
+- fzeroes = _mm256_setzero_ps();
+- __m256i zeroes = _mm256_set1_epi32(0);
+- ones = _mm256_set1_epi32(1);
+- __m256i allones = _mm256_set1_epi32(0xffffffff);
+- twos = _mm256_set1_epi32(2);
+- fours = _mm256_set1_epi32(4);
+-
+- cp1 = _mm256_set1_ps(1.0);
+- cp2 = _mm256_set1_ps(0.08333333333333333);
+- cp3 = _mm256_set1_ps(0.002777777777777778);
+- cp4 = _mm256_set1_ps(4.96031746031746e-05);
+- cp5 = _mm256_set1_ps(5.511463844797178e-07);
+- union bit256 condition1;
+- union bit256 condition3;
+-
+- for(;number < eighthPoints; number++){
+-
+- aVal = _mm256_loadu_ps(aPtr);
+- // s = fabs(aVal)
+- s = _mm256_sub_ps(aVal, _mm256_and_ps(_mm256_mul_ps(aVal, ftwos), _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
+- // q = (int) (s * (4/pi)), floor(aVal / (pi/4))
+- q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
+- // r = q + q&1, q indicates quadrant, r gives
+- r = _mm256_cvtepi32_ps(_mm256_add_epi32(q, _mm256_and_si256(q, ones)));
+-
+- s = _mm256_fnmadd_ps(r,pio4A,s);
+- s = _mm256_fnmadd_ps(r,pio4B,s);
+- s = _mm256_fnmadd_ps(r,pio4C,s);
+-
+- s = _mm256_div_ps(s, _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
+- s = _mm256_mul_ps(s, s);
+- // Evaluate Taylor series
+- s = _mm256_mul_ps(_mm256_fmadd_ps(_mm256_fmsub_ps(_mm256_fmadd_ps(_mm256_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2), s, cp1), s);
+-
+- for(i = 0; i < 3; i++)
+- s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
+- s = _mm256_div_ps(s, ftwos);
+-
+- sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
+- cosine = _mm256_sub_ps(fones, s);
+-
+- // if(((q+1)&2) != 0) { cosine=sine;}
+- condition1.int_vec = _mm256_cmpeq_epi32(_mm256_and_si256(_mm256_add_epi32(q, ones), twos), zeroes);
+- condition1.int_vec = _mm256_xor_si256(allones, condition1.int_vec);
+-
+- // if(((q+2)&4) != 0) { cosine = -cosine;}
+- condition3.int_vec = _mm256_cmpeq_epi32(_mm256_and_si256(_mm256_add_epi32(q, twos), fours), zeroes);
+- condition3.int_vec = _mm256_xor_si256(allones, condition3.int_vec);
+-
+- cosine = _mm256_add_ps(cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1.float_vec));
+- cosine = _mm256_sub_ps(cosine, _mm256_and_ps(_mm256_mul_ps(cosine, _mm256_set1_ps(2.0f)), condition3.float_vec));
+- _mm256_storeu_ps(bPtr, cosine);
+- aPtr += 8;
+- bPtr += 8;
+- }
+-
+- number = eighthPoints * 8;
+- for(;number < num_points; number++){
+- *bPtr++ = cos(*aPtr++);
+- }
++ float* bPtr = bVector;
++ const float* aPtr = aVector;
++
++ unsigned int number = 0;
++ unsigned int eighthPoints = num_points / 8;
++ unsigned int i = 0;
++
++ __m256 aVal, s, r, m4pi, pio4A, pio4B, pio4C, cp1, cp2, cp3, cp4, cp5, ffours, ftwos,
++ fones, fzeroes;
++ __m256 sine, cosine;
++ __m256i q, ones, twos, fours;
++
++ m4pi = _mm256_set1_ps(1.273239544735162542821171882678754627704620361328125);
++ pio4A = _mm256_set1_ps(0.7853981554508209228515625);
++ pio4B = _mm256_set1_ps(0.794662735614792836713604629039764404296875e-8);
++ pio4C = _mm256_set1_ps(0.306161699786838294306516483068750264552437361480769e-16);
++ ffours = _mm256_set1_ps(4.0);
++ ftwos = _mm256_set1_ps(2.0);
++ fones = _mm256_set1_ps(1.0);
++ fzeroes = _mm256_setzero_ps();
++ __m256i zeroes = _mm256_set1_epi32(0);
++ ones = _mm256_set1_epi32(1);
++ __m256i allones = _mm256_set1_epi32(0xffffffff);
++ twos = _mm256_set1_epi32(2);
++ fours = _mm256_set1_epi32(4);
++
++ cp1 = _mm256_set1_ps(1.0);
++ cp2 = _mm256_set1_ps(0.08333333333333333);
++ cp3 = _mm256_set1_ps(0.002777777777777778);
++ cp4 = _mm256_set1_ps(4.96031746031746e-05);
++ cp5 = _mm256_set1_ps(5.511463844797178e-07);
++ union bit256 condition1;
++ union bit256 condition3;
++
++ for (; number < eighthPoints; number++) {
++
++ aVal = _mm256_loadu_ps(aPtr);
++ // s = fabs(aVal)
++ s = _mm256_sub_ps(aVal,
++ _mm256_and_ps(_mm256_mul_ps(aVal, ftwos),
++ _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
++ // q = (int) (s * (4/pi)), floor(aVal / (pi/4))
++ q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
++        // r = q + q&1, q indicates quadrant, r gives the even multiple of pi/4
++ r = _mm256_cvtepi32_ps(_mm256_add_epi32(q, _mm256_and_si256(q, ones)));
++
++ s = _mm256_fnmadd_ps(r, pio4A, s);
++ s = _mm256_fnmadd_ps(r, pio4B, s);
++ s = _mm256_fnmadd_ps(r, pio4C, s);
++
++ s = _mm256_div_ps(
++ s,
++ _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
++ s = _mm256_mul_ps(s, s);
++ // Evaluate Taylor series
++ s = _mm256_mul_ps(
++ _mm256_fmadd_ps(
++ _mm256_fmsub_ps(
++ _mm256_fmadd_ps(_mm256_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2),
++ s,
++ cp1),
++ s);
++
++ for (i = 0; i < 3; i++)
++ s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
++ s = _mm256_div_ps(s, ftwos);
++
++ sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
++ cosine = _mm256_sub_ps(fones, s);
++
++ // if(((q+1)&2) != 0) { cosine=sine;}
++ condition1.int_vec =
++ _mm256_cmpeq_epi32(_mm256_and_si256(_mm256_add_epi32(q, ones), twos), zeroes);
++ condition1.int_vec = _mm256_xor_si256(allones, condition1.int_vec);
++
++ // if(((q+2)&4) != 0) { cosine = -cosine;}
++ condition3.int_vec = _mm256_cmpeq_epi32(
++ _mm256_and_si256(_mm256_add_epi32(q, twos), fours), zeroes);
++ condition3.int_vec = _mm256_xor_si256(allones, condition3.int_vec);
++
++ cosine = _mm256_add_ps(
++ cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1.float_vec));
++ cosine = _mm256_sub_ps(cosine,
++ _mm256_and_ps(_mm256_mul_ps(cosine, _mm256_set1_ps(2.0f)),
++ condition3.float_vec));
++ _mm256_storeu_ps(bPtr, cosine);
++ aPtr += 8;
++ bPtr += 8;
++ }
++
++ number = eighthPoints * 8;
++ for (; number < num_points; number++) {
++ *bPtr++ = cos(*aPtr++);
++ }
+ }
+
+ #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for unaligned */
+@@ -439,86 +512,109 @@ static inline void
+ #include <immintrin.h>
+
+ static inline void
+- volk_32f_cos_32f_u_avx2(float* bVector, const float* aVector, unsigned int num_points)
++volk_32f_cos_32f_u_avx2(float* bVector, const float* aVector, unsigned int num_points)
+ {
+- float* bPtr = bVector;
+- const float* aPtr = aVector;
+-
+- unsigned int number = 0;
+- unsigned int eighthPoints = num_points / 8;
+- unsigned int i = 0;
+-
+- __m256 aVal, s, r, m4pi, pio4A, pio4B, pio4C, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, fzeroes;
+- __m256 sine, cosine;
+- __m256i q, ones, twos, fours;
+-
+- m4pi = _mm256_set1_ps(1.273239544735162542821171882678754627704620361328125);
+- pio4A = _mm256_set1_ps(0.7853981554508209228515625);
+- pio4B = _mm256_set1_ps(0.794662735614792836713604629039764404296875e-8);
+- pio4C = _mm256_set1_ps(0.306161699786838294306516483068750264552437361480769e-16);
+- ffours = _mm256_set1_ps(4.0);
+- ftwos = _mm256_set1_ps(2.0);
+- fones = _mm256_set1_ps(1.0);
+- fzeroes = _mm256_setzero_ps();
+- __m256i zeroes = _mm256_set1_epi32(0);
+- ones = _mm256_set1_epi32(1);
+- __m256i allones = _mm256_set1_epi32(0xffffffff);
+- twos = _mm256_set1_epi32(2);
+- fours = _mm256_set1_epi32(4);
+-
+- cp1 = _mm256_set1_ps(1.0);
+- cp2 = _mm256_set1_ps(0.08333333333333333);
+- cp3 = _mm256_set1_ps(0.002777777777777778);
+- cp4 = _mm256_set1_ps(4.96031746031746e-05);
+- cp5 = _mm256_set1_ps(5.511463844797178e-07);
+- union bit256 condition1;
+- union bit256 condition3;
+-
+- for(;number < eighthPoints; number++){
+-
+- aVal = _mm256_loadu_ps(aPtr);
+- // s = fabs(aVal)
+- s = _mm256_sub_ps(aVal, _mm256_and_ps(_mm256_mul_ps(aVal, ftwos), _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
+- // q = (int) (s * (4/pi)), floor(aVal / (pi/4))
+- q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
+- // r = q + q&1, q indicates quadrant, r gives
+- r = _mm256_cvtepi32_ps(_mm256_add_epi32(q, _mm256_and_si256(q, ones)));
+-
+- s = _mm256_sub_ps(s, _mm256_mul_ps(r,pio4A));
+- s = _mm256_sub_ps(s, _mm256_mul_ps(r,pio4B));
+- s = _mm256_sub_ps(s, _mm256_mul_ps(r,pio4C));
+-
+- s = _mm256_div_ps(s, _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
+- s = _mm256_mul_ps(s, s);
+- // Evaluate Taylor series
+- s = _mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(s, cp5), cp4), s), cp3), s), cp2), s), cp1), s);
+-
+- for(i = 0; i < 3; i++)
+- s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
+- s = _mm256_div_ps(s, ftwos);
+-
+- sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
+- cosine = _mm256_sub_ps(fones, s);
+-
+- // if(((q+1)&2) != 0) { cosine=sine;}
+- condition1.int_vec = _mm256_cmpeq_epi32(_mm256_and_si256(_mm256_add_epi32(q, ones), twos), zeroes);
+- condition1.int_vec = _mm256_xor_si256(allones, condition1.int_vec);
+-
+- // if(((q+2)&4) != 0) { cosine = -cosine;}
+- condition3.int_vec = _mm256_cmpeq_epi32(_mm256_and_si256(_mm256_add_epi32(q, twos), fours), zeroes);
+- condition3.int_vec = _mm256_xor_si256(allones, condition3.int_vec);
+-
+- cosine = _mm256_add_ps(cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1.float_vec));
+- cosine = _mm256_sub_ps(cosine, _mm256_and_ps(_mm256_mul_ps(cosine, _mm256_set1_ps(2.0f)), condition3.float_vec));
+- _mm256_storeu_ps(bPtr, cosine);
+- aPtr += 8;
+- bPtr += 8;
+- }
+-
+- number = eighthPoints * 8;
+- for(;number < num_points; number++){
+- *bPtr++ = cos(*aPtr++);
+- }
++ float* bPtr = bVector;
++ const float* aPtr = aVector;
++
++ unsigned int number = 0;
++ unsigned int eighthPoints = num_points / 8;
++ unsigned int i = 0;
++
++ __m256 aVal, s, r, m4pi, pio4A, pio4B, pio4C, cp1, cp2, cp3, cp4, cp5, ffours, ftwos,
++ fones, fzeroes;
++ __m256 sine, cosine;
++ __m256i q, ones, twos, fours;
++
++ m4pi = _mm256_set1_ps(1.273239544735162542821171882678754627704620361328125);
++ pio4A = _mm256_set1_ps(0.7853981554508209228515625);
++ pio4B = _mm256_set1_ps(0.794662735614792836713604629039764404296875e-8);
++ pio4C = _mm256_set1_ps(0.306161699786838294306516483068750264552437361480769e-16);
++ ffours = _mm256_set1_ps(4.0);
++ ftwos = _mm256_set1_ps(2.0);
++ fones = _mm256_set1_ps(1.0);
++ fzeroes = _mm256_setzero_ps();
++ __m256i zeroes = _mm256_set1_epi32(0);
++ ones = _mm256_set1_epi32(1);
++ __m256i allones = _mm256_set1_epi32(0xffffffff);
++ twos = _mm256_set1_epi32(2);
++ fours = _mm256_set1_epi32(4);
++
++ cp1 = _mm256_set1_ps(1.0);
++ cp2 = _mm256_set1_ps(0.08333333333333333);
++ cp3 = _mm256_set1_ps(0.002777777777777778);
++ cp4 = _mm256_set1_ps(4.96031746031746e-05);
++ cp5 = _mm256_set1_ps(5.511463844797178e-07);
++ union bit256 condition1;
++ union bit256 condition3;
++
++ for (; number < eighthPoints; number++) {
++
++ aVal = _mm256_loadu_ps(aPtr);
++ // s = fabs(aVal)
++ s = _mm256_sub_ps(aVal,
++ _mm256_and_ps(_mm256_mul_ps(aVal, ftwos),
++ _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
++ // q = (int) (s * (4/pi)), floor(aVal / (pi/4))
++ q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
++        // r = q + q&1, q indicates quadrant, r gives the even multiple of pi/4
++ r = _mm256_cvtepi32_ps(_mm256_add_epi32(q, _mm256_and_si256(q, ones)));
++
++ s = _mm256_sub_ps(s, _mm256_mul_ps(r, pio4A));
++ s = _mm256_sub_ps(s, _mm256_mul_ps(r, pio4B));
++ s = _mm256_sub_ps(s, _mm256_mul_ps(r, pio4C));
++
++ s = _mm256_div_ps(
++ s,
++ _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
++ s = _mm256_mul_ps(s, s);
++ // Evaluate Taylor series
++ s = _mm256_mul_ps(
++ _mm256_add_ps(
++ _mm256_mul_ps(
++ _mm256_sub_ps(
++ _mm256_mul_ps(
++ _mm256_add_ps(
++ _mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(s, cp5), cp4),
++ s),
++ cp3),
++ s),
++ cp2),
++ s),
++ cp1),
++ s);
++
++ for (i = 0; i < 3; i++)
++ s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
++ s = _mm256_div_ps(s, ftwos);
++
++ sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
++ cosine = _mm256_sub_ps(fones, s);
++
++ // if(((q+1)&2) != 0) { cosine=sine;}
++ condition1.int_vec =
++ _mm256_cmpeq_epi32(_mm256_and_si256(_mm256_add_epi32(q, ones), twos), zeroes);
++ condition1.int_vec = _mm256_xor_si256(allones, condition1.int_vec);
++
++ // if(((q+2)&4) != 0) { cosine = -cosine;}
++ condition3.int_vec = _mm256_cmpeq_epi32(
++ _mm256_and_si256(_mm256_add_epi32(q, twos), fours), zeroes);
++ condition3.int_vec = _mm256_xor_si256(allones, condition3.int_vec);
++
++ cosine = _mm256_add_ps(
++ cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1.float_vec));
++ cosine = _mm256_sub_ps(cosine,
++ _mm256_and_ps(_mm256_mul_ps(cosine, _mm256_set1_ps(2.0f)),
++ condition3.float_vec));
++ _mm256_storeu_ps(bPtr, cosine);
++ aPtr += 8;
++ bPtr += 8;
++ }
++
++ number = eighthPoints * 8;
++ for (; number < num_points; number++) {
++ *bPtr++ = cos(*aPtr++);
++ }
+ }
+
+ #endif /* LV_HAVE_AVX2 for unaligned */
+@@ -529,71 +625,88 @@ static inline void
+ static inline void
+ volk_32f_cos_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
+ {
+- float* bPtr = bVector;
+- const float* aPtr = aVector;
+-
+- unsigned int number = 0;
+- unsigned int quarterPoints = num_points / 4;
+- unsigned int i = 0;
+-
+- __m128 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, fzeroes;
+- __m128 sine, cosine, condition1, condition3;
+- __m128i q, r, ones, twos, fours;
+-
+- m4pi = _mm_set1_ps(1.273239545);
+- pio4A = _mm_set1_ps(0.78515625);
+- pio4B = _mm_set1_ps(0.241876e-3);
+- ffours = _mm_set1_ps(4.0);
+- ftwos = _mm_set1_ps(2.0);
+- fones = _mm_set1_ps(1.0);
+- fzeroes = _mm_setzero_ps();
+- ones = _mm_set1_epi32(1);
+- twos = _mm_set1_epi32(2);
+- fours = _mm_set1_epi32(4);
+-
+- cp1 = _mm_set1_ps(1.0);
+- cp2 = _mm_set1_ps(0.83333333e-1);
+- cp3 = _mm_set1_ps(0.2777778e-2);
+- cp4 = _mm_set1_ps(0.49603e-4);
+- cp5 = _mm_set1_ps(0.551e-6);
+-
+- for(;number < quarterPoints; number++){
+- aVal = _mm_loadu_ps(aPtr);
+- s = _mm_sub_ps(aVal, _mm_and_ps(_mm_mul_ps(aVal, ftwos), _mm_cmplt_ps(aVal, fzeroes)));
+- q = _mm_cvtps_epi32(_mm_floor_ps(_mm_mul_ps(s, m4pi)));
+- r = _mm_add_epi32(q, _mm_and_si128(q, ones));
+-
+- s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4A));
+- s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4B));
+-
+- s = _mm_div_ps(s, _mm_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
+- s = _mm_mul_ps(s, s);
+- // Evaluate Taylor series
+- s = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s, cp5), cp4), s), cp3), s), cp2), s), cp1), s);
+-
+- for(i = 0; i < 3; i++){
+- s = _mm_mul_ps(s, _mm_sub_ps(ffours, s));
+- }
+- s = _mm_div_ps(s, ftwos);
+-
+- sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s));
+- cosine = _mm_sub_ps(fones, s);
+-
+- condition1 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, ones), twos)), fzeroes);
++ float* bPtr = bVector;
++ const float* aPtr = aVector;
+
+- condition3 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, twos), fours)), fzeroes);
+-
+- cosine = _mm_add_ps(cosine, _mm_and_ps(_mm_sub_ps(sine, cosine), condition1));
+- cosine = _mm_sub_ps(cosine, _mm_and_ps(_mm_mul_ps(cosine, _mm_set1_ps(2.0f)), condition3));
+- _mm_storeu_ps(bPtr, cosine);
+- aPtr += 4;
+- bPtr += 4;
+- }
++ unsigned int number = 0;
++ unsigned int quarterPoints = num_points / 4;
++ unsigned int i = 0;
++
++ __m128 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones,
++ fzeroes;
++ __m128 sine, cosine, condition1, condition3;
++ __m128i q, r, ones, twos, fours;
++
++ m4pi = _mm_set1_ps(1.273239545);
++ pio4A = _mm_set1_ps(0.78515625);
++ pio4B = _mm_set1_ps(0.241876e-3);
++ ffours = _mm_set1_ps(4.0);
++ ftwos = _mm_set1_ps(2.0);
++ fones = _mm_set1_ps(1.0);
++ fzeroes = _mm_setzero_ps();
++ ones = _mm_set1_epi32(1);
++ twos = _mm_set1_epi32(2);
++ fours = _mm_set1_epi32(4);
++
++ cp1 = _mm_set1_ps(1.0);
++ cp2 = _mm_set1_ps(0.83333333e-1);
++ cp3 = _mm_set1_ps(0.2777778e-2);
++ cp4 = _mm_set1_ps(0.49603e-4);
++ cp5 = _mm_set1_ps(0.551e-6);
++
++ for (; number < quarterPoints; number++) {
++ aVal = _mm_loadu_ps(aPtr);
++ s = _mm_sub_ps(aVal,
++ _mm_and_ps(_mm_mul_ps(aVal, ftwos), _mm_cmplt_ps(aVal, fzeroes)));
++ q = _mm_cvtps_epi32(_mm_floor_ps(_mm_mul_ps(s, m4pi)));
++ r = _mm_add_epi32(q, _mm_and_si128(q, ones));
++
++ s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4A));
++ s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4B));
++
++ s = _mm_div_ps(
++ s, _mm_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
++ s = _mm_mul_ps(s, s);
++ // Evaluate Taylor series
++ s = _mm_mul_ps(
++ _mm_add_ps(
++ _mm_mul_ps(
++ _mm_sub_ps(
++ _mm_mul_ps(
++ _mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s, cp5), cp4), s),
++ cp3),
++ s),
++ cp2),
++ s),
++ cp1),
++ s);
++
++ for (i = 0; i < 3; i++) {
++ s = _mm_mul_ps(s, _mm_sub_ps(ffours, s));
++ }
++ s = _mm_div_ps(s, ftwos);
++
++ sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s));
++ cosine = _mm_sub_ps(fones, s);
++
++ condition1 = _mm_cmpneq_ps(
++ _mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, ones), twos)), fzeroes);
++
++ condition3 = _mm_cmpneq_ps(
++ _mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, twos), fours)), fzeroes);
++
++ cosine = _mm_add_ps(cosine, _mm_and_ps(_mm_sub_ps(sine, cosine), condition1));
++ cosine = _mm_sub_ps(
++ cosine, _mm_and_ps(_mm_mul_ps(cosine, _mm_set1_ps(2.0f)), condition3));
++ _mm_storeu_ps(bPtr, cosine);
++ aPtr += 4;
++ bPtr += 4;
++ }
+
+- number = quarterPoints * 4;
+- for(;number < num_points; number++){
+- *bPtr++ = cosf(*aPtr++);
+- }
++ number = quarterPoints * 4;
++ for (; number < num_points; number++) {
++ *bPtr++ = cosf(*aPtr++);
++ }
+ }
+
+ #endif /* LV_HAVE_SSE4_1 for unaligned */
+@@ -606,52 +719,55 @@ volk_32f_cos_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int num
+ * Shibata, Naoki, "Efficient evaluation methods of elementary functions
+ * suitable for SIMD computation," in Springer-Verlag 2010
+ */
+-static inline void
+-volk_32f_cos_32f_generic_fast(float* bVector, const float* aVector, unsigned int num_points)
++static inline void volk_32f_cos_32f_generic_fast(float* bVector,
++ const float* aVector,
++ unsigned int num_points)
+ {
+- float* bPtr = bVector;
+- const float* aPtr = aVector;
+-
+- float m4pi = 1.273239544735162542821171882678754627704620361328125;
+- float pio4A = 0.7853981554508209228515625;
+- float pio4B = 0.794662735614792836713604629039764404296875e-8;
+- float pio4C = 0.306161699786838294306516483068750264552437361480769e-16;
+- int N = 3; // order of argument reduction
+-
+- unsigned int number;
+- for(number = 0; number < num_points; number++){
+- float s = fabs(*aPtr);
+- int q = (int)(s * m4pi);
+- int r = q + (q&1);
+- s -= r * pio4A;
+- s -= r * pio4B;
+- s -= r * pio4C;
+-
+- s = s * 0.125; // 2^-N (<--3)
+- s = s*s;
+- s = ((((s/1814400. - 1.0/20160.0)*s + 1.0/360.0)*s - 1.0/12.0)*s + 1.0)*s;
+-
+- int i;
+- for(i=0; i < N; ++i) {
+- s = (4.0-s)*s;
+- }
+- s = s/2.0;
+-
+- float sine = sqrt((2.0-s)*s);
+- float cosine = 1-s;
+-
+- if (((q+1) & 2) != 0) {
+- s = cosine;
+- cosine = sine;
+- sine = s;
+- }
+- if (((q+2) & 4) != 0) {
+- cosine = -cosine;
+- }
+- *bPtr = cosine;
+- bPtr++;
+- aPtr++;
+- }
++ float* bPtr = bVector;
++ const float* aPtr = aVector;
++
++ float m4pi = 1.273239544735162542821171882678754627704620361328125;
++ float pio4A = 0.7853981554508209228515625;
++ float pio4B = 0.794662735614792836713604629039764404296875e-8;
++ float pio4C = 0.306161699786838294306516483068750264552437361480769e-16;
++ int N = 3; // order of argument reduction
++
++ unsigned int number;
++ for (number = 0; number < num_points; number++) {
++ float s = fabs(*aPtr);
++ int q = (int)(s * m4pi);
++ int r = q + (q & 1);
++ s -= r * pio4A;
++ s -= r * pio4B;
++ s -= r * pio4C;
++
++        s = s * 0.125; // 2^-N, with N = 3
++ s = s * s;
++ s = ((((s / 1814400. - 1.0 / 20160.0) * s + 1.0 / 360.0) * s - 1.0 / 12.0) * s +
++ 1.0) *
++ s;
++
++ int i;
++ for (i = 0; i < N; ++i) {
++ s = (4.0 - s) * s;
++ }
++ s = s / 2.0;
++
++ float sine = sqrt((2.0 - s) * s);
++ float cosine = 1 - s;
++
++ if (((q + 1) & 2) != 0) {
++ s = cosine;
++ cosine = sine;
++ sine = s;
++ }
++ if (((q + 2) & 4) != 0) {
++ cosine = -cosine;
++ }
++ *bPtr = cosine;
++ bPtr++;
++ aPtr++;
++ }
+ }
+
+ #endif /* LV_HAVE_GENERIC */
+@@ -662,13 +778,13 @@ volk_32f_cos_32f_generic_fast(float* bVector, const float* aVector, unsigned int
+ static inline void
+ volk_32f_cos_32f_generic(float* bVector, const float* aVector, unsigned int num_points)
+ {
+- float* bPtr = bVector;
+- const float* aPtr = aVector;
+- unsigned int number = 0;
++ float* bPtr = bVector;
++ const float* aPtr = aVector;
++ unsigned int number = 0;
+
+- for(; number < num_points; number++){
+- *bPtr++ = cosf(*aPtr++);
+- }
++ for (; number < num_points; number++) {
++ *bPtr++ = cosf(*aPtr++);
++ }
+ }
+
+ #endif /* LV_HAVE_GENERIC */
+@@ -679,30 +795,29 @@ volk_32f_cos_32f_generic(float* bVector, const float* aVector, unsigned int num_
+ #include <volk/volk_neon_intrinsics.h>
+
+ static inline void
+-volk_32f_cos_32f_neon(float* bVector, const float* aVector,
+- unsigned int num_points)
++volk_32f_cos_32f_neon(float* bVector, const float* aVector, unsigned int num_points)
+ {
+ unsigned int number = 0;
+ unsigned int quarter_points = num_points / 4;
+ float* bVectorPtr = bVector;
+ const float* aVectorPtr = aVector;
+-
++
+ float32x4_t b_vec;
+ float32x4_t a_vec;
+-
+- for(number = 0; number < quarter_points; number++) {
++
++ for (number = 0; number < quarter_points; number++) {
+ a_vec = vld1q_f32(aVectorPtr);
+ // Prefetch next one, speeds things up
+- __VOLK_PREFETCH(aVectorPtr+4);
++ __VOLK_PREFETCH(aVectorPtr + 4);
+ b_vec = _vcosq_f32(a_vec);
+ vst1q_f32(bVectorPtr, b_vec);
+ // move pointers ahead
+- bVectorPtr+=4;
+- aVectorPtr+=4;
++ bVectorPtr += 4;
++ aVectorPtr += 4;
+ }
+-
++
+ // Deal with the rest
+- for(number = quarter_points * 4; number < num_points; number++) {
++ for (number = quarter_points * 4; number < num_points; number++) {
+ *bVectorPtr++ = cosf(*aVectorPtr++);
+ }
+ }
+diff --git a/kernels/volk/volk_32f_expfast_32f.h b/kernels/volk/volk_32f_expfast_32f.h
+index ecb4914..45de3f9 100644
+--- a/kernels/volk/volk_32f_expfast_32f.h
++++ b/kernels/volk/volk_32f_expfast_32f.h
+@@ -30,8 +30,8 @@
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+- * void volk_32f_expfast_32f(float* bVector, const float* aVector, unsigned int num_points)
+- * \endcode
++ * void volk_32f_expfast_32f(float* bVector, const float* aVector, unsigned int
++ * num_points) \endcode
+ *
+ * \b Inputs
+ * \li aVector: Input vector of floats.
+@@ -62,9 +62,9 @@
+ * \endcode
+ */
+
+-#include <stdio.h>
+-#include <math.h>
+ #include <inttypes.h>
++#include <math.h>
++#include <stdio.h>
+
+ #define Mln2 0.6931471805f
+ #define A 8388608.0f
+@@ -79,34 +79,35 @@
+
+ #include <immintrin.h>
+
+-static inline void
+- volk_32f_expfast_32f_a_avx_fma(float* bVector, const float* aVector, unsigned int num_points)
++static inline void volk_32f_expfast_32f_a_avx_fma(float* bVector,
++ const float* aVector,
++ unsigned int num_points)
+ {
+- float* bPtr = bVector;
+- const float* aPtr = aVector;
+-
+- unsigned int number = 0;
+- const unsigned int eighthPoints = num_points / 8;
+-
+- __m256 aVal, bVal, a, b;
+- __m256i exp;
+- a = _mm256_set1_ps(A/Mln2);
+- b = _mm256_set1_ps(B-C);
+-
+- for(;number < eighthPoints; number++){
+- aVal = _mm256_load_ps(aPtr);
+- exp = _mm256_cvtps_epi32(_mm256_fmadd_ps(a,aVal, b));
+- bVal = _mm256_castsi256_ps(exp);
+-
+- _mm256_store_ps(bPtr, bVal);
+- aPtr += 8;
+- bPtr += 8;
+- }
+-
+- number = eighthPoints * 8;
+- for(;number < num_points; number++){
+- *bPtr++ = expf(*aPtr++);
+- }
++ float* bPtr = bVector;
++ const float* aPtr = aVector;
++
++ unsigned int number = 0;
++ const unsigned int eighthPoints = num_points / 8;
++
++ __m256 aVal, bVal, a, b;
++ __m256i exp;
++ a = _mm256_set1_ps(A / Mln2);
++ b = _mm256_set1_ps(B - C);
++
++ for (; number < eighthPoints; number++) {
++ aVal = _mm256_load_ps(aPtr);
++ exp = _mm256_cvtps_epi32(_mm256_fmadd_ps(a, aVal, b));
++ bVal = _mm256_castsi256_ps(exp);
++
++ _mm256_store_ps(bPtr, bVal);
++ aPtr += 8;
++ bPtr += 8;
++ }
++
++ number = eighthPoints * 8;
++ for (; number < num_points; number++) {
++ *bPtr++ = expf(*aPtr++);
++ }
+ }
+
+ #endif /* LV_HAVE_AVX && LV_HAVE_FMA for aligned */
+@@ -116,33 +117,33 @@ static inline void
+ #include <immintrin.h>
+
+ static inline void
+- volk_32f_expfast_32f_a_avx(float* bVector, const float* aVector, unsigned int num_points)
++volk_32f_expfast_32f_a_avx(float* bVector, const float* aVector, unsigned int num_points)
+ {
+- float* bPtr = bVector;
+- const float* aPtr = aVector;
+-
+- unsigned int number = 0;
+- const unsigned int eighthPoints = num_points / 8;
+-
+- __m256 aVal, bVal, a, b;
+- __m256i exp;
+- a = _mm256_set1_ps(A/Mln2);
+- b = _mm256_set1_ps(B-C);
+-
+- for(;number < eighthPoints; number++){
+- aVal = _mm256_load_ps(aPtr);
+- exp = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_mul_ps(a,aVal), b));
+- bVal = _mm256_castsi256_ps(exp);
+-
+- _mm256_store_ps(bPtr, bVal);
+- aPtr += 8;
+- bPtr += 8;
+- }
+-
+- number = eighthPoints * 8;
+- for(;number < num_points; number++){
+- *bPtr++ = expf(*aPtr++);
+- }
++ float* bPtr = bVector;
++ const float* aPtr = aVector;
++
++ unsigned int number = 0;
++ const unsigned int eighthPoints = num_points / 8;
++
++ __m256 aVal, bVal, a, b;
++ __m256i exp;
++ a = _mm256_set1_ps(A / Mln2);
++ b = _mm256_set1_ps(B - C);
++
++ for (; number < eighthPoints; number++) {
++ aVal = _mm256_load_ps(aPtr);
++ exp = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_mul_ps(a, aVal), b));
++ bVal = _mm256_castsi256_ps(exp);
++
++ _mm256_store_ps(bPtr, bVal);
++ aPtr += 8;
++ bPtr += 8;
++ }
++
++ number = eighthPoints * 8;
++ for (; number < num_points; number++) {
++ *bPtr++ = expf(*aPtr++);
++ }
+ }
+
+ #endif /* LV_HAVE_AVX for aligned */
+@@ -150,34 +151,35 @@ static inline void
+ #ifdef LV_HAVE_SSE4_1
+ #include <smmintrin.h>
+
+-static inline void
+-volk_32f_expfast_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
++static inline void volk_32f_expfast_32f_a_sse4_1(float* bVector,
++ const float* aVector,
++ unsigned int num_points)
+ {
+- float* bPtr = bVector;
+- const float* aPtr = aVector;
+-
+- unsigned int number = 0;
+- const unsigned int quarterPoints = num_points / 4;
+-
+- __m128 aVal, bVal, a, b;
+- __m128i exp;
+- a = _mm_set1_ps(A/Mln2);
+- b = _mm_set1_ps(B-C);
+-
+- for(;number < quarterPoints; number++){
+- aVal = _mm_load_ps(aPtr);
+- exp = _mm_cvtps_epi32(_mm_add_ps(_mm_mul_ps(a,aVal), b));
+- bVal = _mm_castsi128_ps(exp);
+-
+- _mm_store_ps(bPtr, bVal);
+- aPtr += 4;
+- bPtr += 4;
+- }
+-
+- number = quarterPoints * 4;
+- for(;number < num_points; number++){
+- *bPtr++ = expf(*aPtr++);
+- }
++ float* bPtr = bVector;
++ const float* aPtr = aVector;
++
++ unsigned int number = 0;
++ const unsigned int quarterPoints = num_points / 4;
++
++ __m128 aVal, bVal, a, b;
++ __m128i exp;
++ a = _mm_set1_ps(A / Mln2);
++ b = _mm_set1_ps(B - C);
++
++ for (; number < quarterPoints; number++) {
++ aVal = _mm_load_ps(aPtr);
++ exp = _mm_cvtps_epi32(_mm_add_ps(_mm_mul_ps(a, aVal), b));
++ bVal = _mm_castsi128_ps(exp);
++
++ _mm_store_ps(bPtr, bVal);
++ aPtr += 4;
++ bPtr += 4;
++ }
++
++ number = quarterPoints * 4;
++ for (; number < num_points; number++) {
++ *bPtr++ = expf(*aPtr++);
++ }
+ }
+
+ #endif /* LV_HAVE_SSE4_1 for aligned */
+@@ -190,34 +192,35 @@ volk_32f_expfast_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int
+ #if LV_HAVE_AVX && LV_HAVE_FMA
+ #include <immintrin.h>
+
+-static inline void
+-volk_32f_expfast_32f_u_avx_fma(float* bVector, const float* aVector, unsigned int num_points)
++static inline void volk_32f_expfast_32f_u_avx_fma(float* bVector,
++ const float* aVector,
++ unsigned int num_points)
+ {
+- float* bPtr = bVector;
+- const float* aPtr = aVector;
+-
+- unsigned int number = 0;
+- const unsigned int eighthPoints = num_points / 8;
+-
+- __m256 aVal, bVal, a, b;
+- __m256i exp;
+- a = _mm256_set1_ps(A/Mln2);
+- b = _mm256_set1_ps(B-C);
+-
+- for(;number < eighthPoints; number++){
+- aVal = _mm256_loadu_ps(aPtr);
+- exp = _mm256_cvtps_epi32(_mm256_fmadd_ps(a,aVal, b));
+- bVal = _mm256_castsi256_ps(exp);
+-
+- _mm256_storeu_ps(bPtr, bVal);
+- aPtr += 8;
+- bPtr += 8;
+- }
+-
+- number = eighthPoints * 8;
+- for(;number < num_points; number++){
+- *bPtr++ = expf(*aPtr++);
+- }
++ float* bPtr = bVector;
++ const float* aPtr = aVector;
++
++ unsigned int number = 0;
++ const unsigned int eighthPoints = num_points / 8;
++
++ __m256 aVal, bVal, a, b;
++ __m256i exp;
++ a = _mm256_set1_ps(A / Mln2);
++ b = _mm256_set1_ps(B - C);
++
++ for (; number < eighthPoints; number++) {
++ aVal = _mm256_loadu_ps(aPtr);
++ exp = _mm256_cvtps_epi32(_mm256_fmadd_ps(a, aVal, b));
++ bVal = _mm256_castsi256_ps(exp);
++
++ _mm256_storeu_ps(bPtr, bVal);
++ aPtr += 8;
++ bPtr += 8;
++ }
++
++ number = eighthPoints * 8;
++ for (; number < num_points; number++) {
++ *bPtr++ = expf(*aPtr++);
++ }
+ }
+
+ #endif /* LV_HAVE_AVX && LV_HAVE_FMA for unaligned */
+@@ -228,31 +231,31 @@ volk_32f_expfast_32f_u_avx_fma(float* bVector, const float* aVector, unsigned in
+ static inline void
+ volk_32f_expfast_32f_u_avx(float* bVector, const float* aVector, unsigned int num_points)
+ {
+- float* bPtr = bVector;
+- const float* aPtr = aVector;
+-
+- unsigned int number = 0;
+- const unsigned int eighthPoints = num_points / 8;
+-
+- __m256 aVal, bVal, a, b;
+- __m256i exp;
+- a = _mm256_set1_ps(A/Mln2);
+- b = _mm256_set1_ps(B-C);
+-
+- for(;number < eighthPoints; number++){
+- aVal = _mm256_loadu_ps(aPtr);
+- exp = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_mul_ps(a,aVal), b));
+- bVal = _mm256_castsi256_ps(exp);
+-
+- _mm256_storeu_ps(bPtr, bVal);
+- aPtr += 8;
+- bPtr += 8;
+- }
+-
+- number = eighthPoints * 8;
+- for(;number < num_points; number++){
+- *bPtr++ = expf(*aPtr++);
+- }
++ float* bPtr = bVector;
++ const float* aPtr = aVector;
++
++ unsigned int number = 0;
++ const unsigned int eighthPoints = num_points / 8;
++
++ __m256 aVal, bVal, a, b;
++ __m256i exp;
++ a = _mm256_set1_ps(A / Mln2);
++ b = _mm256_set1_ps(B - C);
++
++ for (; number < eighthPoints; number++) {
++ aVal = _mm256_loadu_ps(aPtr);
++ exp = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_mul_ps(a, aVal), b));
++ bVal = _mm256_castsi256_ps(exp);
++
++ _mm256_storeu_ps(bPtr, bVal);
++ aPtr += 8;
++ bPtr += 8;
++ }
++
++ number = eighthPoints * 8;
++ for (; number < num_points; number++) {
++ *bPtr++ = expf(*aPtr++);
++ }
+ }
+
+ #endif /* LV_HAVE_AVX for unaligned */
+@@ -261,34 +264,35 @@ volk_32f_expfast_32f_u_avx(float* bVector, const float* aVector, unsigned int nu
+ #ifdef LV_HAVE_SSE4_1
+ #include <smmintrin.h>
+
+-static inline void
+-volk_32f_expfast_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
++static inline void volk_32f_expfast_32f_u_sse4_1(float* bVector,
++ const float* aVector,
++ unsigned int num_points)
+ {
+- float* bPtr = bVector;
+- const float* aPtr = aVector;
+-
+- unsigned int number = 0;
+- const unsigned int quarterPoints = num_points / 4;
+-
+- __m128 aVal, bVal, a, b;
+- __m128i exp;
+- a = _mm_set1_ps(A/Mln2);
+- b = _mm_set1_ps(B-C);
+-
+- for(;number < quarterPoints; number++){
+- aVal = _mm_loadu_ps(aPtr);
+- exp = _mm_cvtps_epi32(_mm_add_ps(_mm_mul_ps(a,aVal), b));
+- bVal = _mm_castsi128_ps(exp);
+-
+- _mm_storeu_ps(bPtr, bVal);
+- aPtr += 4;
+- bPtr += 4;
+- }
+-
+- number = quarterPoints * 4;
+- for(;number < num_points; number++){
+- *bPtr++ = expf(*aPtr++);
+- }
++ float* bPtr = bVector;
++ const float* aPtr = aVector;
++
++ unsigned int number = 0;
++ const unsigned int quarterPoints = num_points / 4;
++
++ __m128 aVal, bVal, a, b;
++ __m128i exp;
++ a = _mm_set1_ps(A / Mln2);
++ b = _mm_set1_ps(B - C);
++
++ for (; number < quarterPoints; number++) {
++ aVal = _mm_loadu_ps(aPtr);
++ exp = _mm_cvtps_epi32(_mm_add_ps(_mm_mul_ps(a, aVal), b));
++ bVal = _mm_castsi128_ps(exp);
++
++ _mm_storeu_ps(bPtr, bVal);
++ aPtr += 4;
++ bPtr += 4;
++ }
++
++ number = quarterPoints * 4;
++ for (; number < num_points; number++) {
++ *bPtr++ = expf(*aPtr++);
++ }
+ }
+
+ #endif /* LV_HAVE_SSE4_1 for unaligned */
+@@ -296,16 +300,17 @@ volk_32f_expfast_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int
+
+ #ifdef LV_HAVE_GENERIC
+
+-static inline void
+-volk_32f_expfast_32f_generic(float* bVector, const float* aVector, unsigned int num_points)
++static inline void volk_32f_expfast_32f_generic(float* bVector,
++ const float* aVector,
++ unsigned int num_points)
+ {
+- float* bPtr = bVector;
+- const float* aPtr = aVector;
+- unsigned int number = 0;
++ float* bPtr = bVector;
++ const float* aPtr = aVector;
++ unsigned int number = 0;
+
+- for(number = 0; number < num_points; number++){
+- *bPtr++ = expf(*aPtr++);
+- }
++ for (number = 0; number < num_points; number++) {
++ *bPtr++ = expf(*aPtr++);
++ }
+ }
+ #endif /* LV_HAVE_GENERIC */
+
+diff --git a/kernels/volk/volk_32f_index_max_16u.h b/kernels/volk/volk_32f_index_max_16u.h
+index 7ca6928..3ee10f4 100644
+--- a/kernels/volk/volk_32f_index_max_16u.h
++++ b/kernels/volk/volk_32f_index_max_16u.h
+@@ -71,72 +71,71 @@
+ #ifndef INCLUDED_volk_32f_index_max_16u_a_H
+ #define INCLUDED_volk_32f_index_max_16u_a_H
+
+-#include <volk/volk_common.h>
+-#include <volk/volk_common.h>
+ #include <inttypes.h>
+ #include <limits.h>
+ #include <stdio.h>
++#include <volk/volk_common.h>
+
+ #ifdef LV_HAVE_AVX
+ #include <immintrin.h>
+
+ static inline void
+-volk_32f_index_max_16u_a_avx(uint16_t* target, const float* src0,
+- uint32_t num_points)
++volk_32f_index_max_16u_a_avx(uint16_t* target, const float* src0, uint32_t num_points)
+ {
+- num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
+-
+- uint32_t number = 0;
+- const uint32_t eighthPoints = num_points / 8;
++ num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
+
+- float* inputPtr = (float*)src0;
++ uint32_t number = 0;
++ const uint32_t eighthPoints = num_points / 8;
+
+- __m256 indexIncrementValues = _mm256_set1_ps(8);
+- __m256 currentIndexes = _mm256_set_ps(-1,-2,-3,-4,-5,-6,-7,-8);
++ float* inputPtr = (float*)src0;
+
+- float max = src0[0];
+- float index = 0;
+- __m256 maxValues = _mm256_set1_ps(max);
+- __m256 maxValuesIndex = _mm256_setzero_ps();
+- __m256 compareResults;
+- __m256 currentValues;
++ __m256 indexIncrementValues = _mm256_set1_ps(8);
++ __m256 currentIndexes = _mm256_set_ps(-1, -2, -3, -4, -5, -6, -7, -8);
+
+- __VOLK_ATTR_ALIGNED(32) float maxValuesBuffer[8];
+- __VOLK_ATTR_ALIGNED(32) float maxIndexesBuffer[8];
++ float max = src0[0];
++ float index = 0;
++ __m256 maxValues = _mm256_set1_ps(max);
++ __m256 maxValuesIndex = _mm256_setzero_ps();
++ __m256 compareResults;
++ __m256 currentValues;
+
+- for(;number < eighthPoints; number++){
++ __VOLK_ATTR_ALIGNED(32) float maxValuesBuffer[8];
++ __VOLK_ATTR_ALIGNED(32) float maxIndexesBuffer[8];
+
+- currentValues = _mm256_load_ps(inputPtr); inputPtr += 8;
+- currentIndexes = _mm256_add_ps(currentIndexes, indexIncrementValues);
++ for (; number < eighthPoints; number++) {
+
+- compareResults = _mm256_cmp_ps(currentValues, maxValues, _CMP_GT_OS);
++ currentValues = _mm256_load_ps(inputPtr);
++ inputPtr += 8;
++ currentIndexes = _mm256_add_ps(currentIndexes, indexIncrementValues);
+
+- maxValuesIndex = _mm256_blendv_ps(maxValuesIndex, currentIndexes, compareResults);
+- maxValues = _mm256_blendv_ps(maxValues, currentValues, compareResults);
+- }
++ compareResults = _mm256_cmp_ps(currentValues, maxValues, _CMP_GT_OS);
+
+- // Calculate the largest value from the remaining 4 points
+- _mm256_store_ps(maxValuesBuffer, maxValues);
+- _mm256_store_ps(maxIndexesBuffer, maxValuesIndex);
++ maxValuesIndex = _mm256_blendv_ps(maxValuesIndex, currentIndexes, compareResults);
++ maxValues = _mm256_blendv_ps(maxValues, currentValues, compareResults);
++ }
+
+- for(number = 0; number < 8; number++){
+- if(maxValuesBuffer[number] > max){
+- index = maxIndexesBuffer[number];
+- max = maxValuesBuffer[number];
+- } else if(maxValuesBuffer[number] == max){
+- if (index > maxIndexesBuffer[number])
+- index = maxIndexesBuffer[number];
++    // Calculate the largest value from the remaining 8 points
++ _mm256_store_ps(maxValuesBuffer, maxValues);
++ _mm256_store_ps(maxIndexesBuffer, maxValuesIndex);
++
++ for (number = 0; number < 8; number++) {
++ if (maxValuesBuffer[number] > max) {
++ index = maxIndexesBuffer[number];
++ max = maxValuesBuffer[number];
++ } else if (maxValuesBuffer[number] == max) {
++ if (index > maxIndexesBuffer[number])
++ index = maxIndexesBuffer[number];
++ }
+ }
+- }
+
+- number = eighthPoints * 8;
+- for(;number < num_points; number++){
+- if(src0[number] > max){
+- index = number;
+- max = src0[number];
++ number = eighthPoints * 8;
++ for (; number < num_points; number++) {
++ if (src0[number] > max) {
++ index = number;
++ max = src0[number];
++ }
+ }
+- }
+- target[0] = (uint16_t)index;
++ target[0] = (uint16_t)index;
+ }
+
+ #endif /*LV_HAVE_AVX*/
+@@ -145,62 +144,62 @@ volk_32f_index_max_16u_a_avx(uint16_t* target, const float* src0,
+ #include <smmintrin.h>
+
+ static inline void
+-volk_32f_index_max_16u_a_sse4_1(uint16_t* target, const float* src0,
+- uint32_t num_points)
++volk_32f_index_max_16u_a_sse4_1(uint16_t* target, const float* src0, uint32_t num_points)
+ {
+- num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
+-
+- uint32_t number = 0;
+- const uint32_t quarterPoints = num_points / 4;
++ num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
+
+- float* inputPtr = (float*)src0;
++ uint32_t number = 0;
++ const uint32_t quarterPoints = num_points / 4;
+
+- __m128 indexIncrementValues = _mm_set1_ps(4);
+- __m128 currentIndexes = _mm_set_ps(-1,-2,-3,-4);
++ float* inputPtr = (float*)src0;
+
+- float max = src0[0];
+- float index = 0;
+- __m128 maxValues = _mm_set1_ps(max);
+- __m128 maxValuesIndex = _mm_setzero_ps();
+- __m128 compareResults;
+- __m128 currentValues;
++ __m128 indexIncrementValues = _mm_set1_ps(4);
++ __m128 currentIndexes = _mm_set_ps(-1, -2, -3, -4);
+
+- __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4];
+- __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4];
++ float max = src0[0];
++ float index = 0;
++ __m128 maxValues = _mm_set1_ps(max);
++ __m128 maxValuesIndex = _mm_setzero_ps();
++ __m128 compareResults;
++ __m128 currentValues;
+
+- for(;number < quarterPoints; number++){
++ __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4];
++ __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4];
+
+- currentValues = _mm_load_ps(inputPtr); inputPtr += 4;
+- currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
++ for (; number < quarterPoints; number++) {
+
+- compareResults = _mm_cmpgt_ps(currentValues, maxValues);
++ currentValues = _mm_load_ps(inputPtr);
++ inputPtr += 4;
++ currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
+
+- maxValuesIndex = _mm_blendv_ps(maxValuesIndex, currentIndexes, compareResults);
+- maxValues = _mm_blendv_ps(maxValues, currentValues, compareResults);
+- }
++ compareResults = _mm_cmpgt_ps(currentValues, maxValues);
+
+- // Calculate the largest value from the remaining 4 points
+- _mm_store_ps(maxValuesBuffer, maxValues);
+- _mm_store_ps(maxIndexesBuffer, maxValuesIndex);
++ maxValuesIndex = _mm_blendv_ps(maxValuesIndex, currentIndexes, compareResults);
++ maxValues = _mm_blendv_ps(maxValues, currentValues, compareResults);
++ }
+
+- for(number = 0; number < 4; number++){
+- if(maxValuesBuffer[number] > max){
+- index = maxIndexesBuffer[number];
+- max = maxValuesBuffer[number];
+- } else if(maxValuesBuffer[number] == max){
+- if (index > maxIndexesBuffer[number])
+- index = maxIndexesBuffer[number];
++ // Calculate the largest value from the remaining 4 points
++ _mm_store_ps(maxValuesBuffer, maxValues);
++ _mm_store_ps(maxIndexesBuffer, maxValuesIndex);
++
++ for (number = 0; number < 4; number++) {
++ if (maxValuesBuffer[number] > max) {
++ index = maxIndexesBuffer[number];
++ max = maxValuesBuffer[number];
++ } else if (maxValuesBuffer[number] == max) {
++ if (index > maxIndexesBuffer[number])
++ index = maxIndexesBuffer[number];
++ }
+ }
+- }
+
+- number = quarterPoints * 4;
+- for(;number < num_points; number++){
+- if(src0[number] > max){
+- index = number;
+- max = src0[number];
++ number = quarterPoints * 4;
++ for (; number < num_points; number++) {
++ if (src0[number] > max) {
++ index = number;
++ max = src0[number];
++ }
+ }
+- }
+- target[0] = (uint16_t)index;
++ target[0] = (uint16_t)index;
+ }
+
+ #endif /*LV_HAVE_SSE4_1*/
+@@ -211,64 +210,64 @@ volk_32f_index_max_16u_a_sse4_1(uint16_t* target, const float* src0,
+ #include <xmmintrin.h>
+
+ static inline void
+-volk_32f_index_max_16u_a_sse(uint16_t* target, const float* src0,
+- uint32_t num_points)
++volk_32f_index_max_16u_a_sse(uint16_t* target, const float* src0, uint32_t num_points)
+ {
+- num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
+-
+- uint32_t number = 0;
+- const uint32_t quarterPoints = num_points / 4;
++ num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
+
+- float* inputPtr = (float*)src0;
++ uint32_t number = 0;
++ const uint32_t quarterPoints = num_points / 4;
+
+- __m128 indexIncrementValues = _mm_set1_ps(4);
+- __m128 currentIndexes = _mm_set_ps(-1,-2,-3,-4);
++ float* inputPtr = (float*)src0;
+
+- float max = src0[0];
+- float index = 0;
+- __m128 maxValues = _mm_set1_ps(max);
+- __m128 maxValuesIndex = _mm_setzero_ps();
+- __m128 compareResults;
+- __m128 currentValues;
++ __m128 indexIncrementValues = _mm_set1_ps(4);
++ __m128 currentIndexes = _mm_set_ps(-1, -2, -3, -4);
+
+- __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4];
+- __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4];
++ float max = src0[0];
++ float index = 0;
++ __m128 maxValues = _mm_set1_ps(max);
++ __m128 maxValuesIndex = _mm_setzero_ps();
++ __m128 compareResults;
++ __m128 currentValues;
+
+- for(;number < quarterPoints; number++){
++ __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4];
++ __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4];
+
+- currentValues = _mm_load_ps(inputPtr); inputPtr += 4;
+- currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
++ for (; number < quarterPoints; number++) {
+
+- compareResults = _mm_cmpgt_ps(currentValues, maxValues);
++ currentValues = _mm_load_ps(inputPtr);
++ inputPtr += 4;
++ currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
+
+- maxValuesIndex = _mm_or_ps(_mm_and_ps(compareResults, currentIndexes),
+- _mm_andnot_ps(compareResults, maxValuesIndex));
+- maxValues = _mm_or_ps(_mm_and_ps(compareResults, currentValues),
+- _mm_andnot_ps(compareResults, maxValues));
+- }
++ compareResults = _mm_cmpgt_ps(currentValues, maxValues);
+
+- // Calculate the largest value from the remaining 4 points
+- _mm_store_ps(maxValuesBuffer, maxValues);
+- _mm_store_ps(maxIndexesBuffer, maxValuesIndex);
++ maxValuesIndex = _mm_or_ps(_mm_and_ps(compareResults, currentIndexes),
++ _mm_andnot_ps(compareResults, maxValuesIndex));
++ maxValues = _mm_or_ps(_mm_and_ps(compareResults, currentValues),
++ _mm_andnot_ps(compareResults, maxValues));
++ }
+
+- for(number = 0; number < 4; number++){
+- if(maxValuesBuffer[number] > max){
+- index = maxIndexesBuffer[number];
+- max = maxValuesBuffer[number];
+- } else if(maxValuesBuffer[number] == max){
+- if (index > maxIndexesBuffer[number])
+- index = maxIndexesBuffer[number];
++ // Calculate the largest value from the remaining 4 points
++ _mm_store_ps(maxValuesBuffer, maxValues);
++ _mm_store_ps(maxIndexesBuffer, maxValuesIndex);
++
++ for (number = 0; number < 4; number++) {
++ if (maxValuesBuffer[number] > max) {
++ index = maxIndexesBuffer[number];
++ max = maxValuesBuffer[number];
++ } else if (maxValuesBuffer[number] == max) {
++ if (index > maxIndexesBuffer[number])
++ index = maxIndexesBuffer[number];
++ }
+ }
+- }
+
+- number = quarterPoints * 4;
+- for(;number < num_points; number++){
+- if(src0[number] > max){
+- index = number;
+- max = src0[number];
++ number = quarterPoints * 4;
++ for (; number < num_points; number++) {
++ if (src0[number] > max) {
++ index = number;
++ max = src0[number];
++ }
+ }
+- }
+- target[0] = (uint16_t)index;
++ target[0] = (uint16_t)index;
+ }
+
+ #endif /*LV_HAVE_SSE*/
+@@ -277,23 +276,22 @@ volk_32f_index_max_16u_a_sse(uint16_t* target, const float* src0,
+ #ifdef LV_HAVE_GENERIC
+
+ static inline void
+-volk_32f_index_max_16u_generic(uint16_t* target, const float* src0,
+- uint32_t num_points)
++volk_32f_index_max_16u_generic(uint16_t* target, const float* src0, uint32_t num_points)
+ {
+- num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
++ num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
+
+- float max = src0[0];
+- uint16_t index = 0;
++ float max = src0[0];
++ uint16_t index = 0;
+
+- uint32_t i = 1;
++ uint32_t i = 1;
+
+- for(; i < num_points; ++i) {
+- if(src0[i] > max) {
+- index = i;
+- max = src0[i];
++ for (; i < num_points; ++i) {
++ if (src0[i] > max) {
++ index = i;
++ max = src0[i];
++ }
+ }
+- }
+- target[0] = index;
++ target[0] = index;
+ }
+
+ #endif /*LV_HAVE_GENERIC*/
+@@ -302,76 +300,74 @@ volk_32f_index_max_16u_generic(uint16_t* target, const float* src0,
+ #endif /*INCLUDED_volk_32f_index_max_16u_a_H*/
+
+
+-
+ #ifndef INCLUDED_volk_32f_index_max_16u_u_H
+ #define INCLUDED_volk_32f_index_max_16u_u_H
+
+-#include <volk/volk_common.h>
+-#include <volk/volk_common.h>
+ #include <inttypes.h>
+ #include <limits.h>
+ #include <stdio.h>
++#include <volk/volk_common.h>
+
+ #ifdef LV_HAVE_AVX
+ #include <immintrin.h>
+
+ static inline void
+-volk_32f_index_max_16u_u_avx(uint16_t* target, const float* src0,
+- uint32_t num_points)
++volk_32f_index_max_16u_u_avx(uint16_t* target, const float* src0, uint32_t num_points)
+ {
+- num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
+-
+- uint32_t number = 0;
+- const uint32_t eighthPoints = num_points / 8;
++ num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
+
+- float* inputPtr = (float*)src0;
++ uint32_t number = 0;
++ const uint32_t eighthPoints = num_points / 8;
+
+- __m256 indexIncrementValues = _mm256_set1_ps(8);
+- __m256 currentIndexes = _mm256_set_ps(-1,-2,-3,-4,-5,-6,-7,-8);
++ float* inputPtr = (float*)src0;
+
+- float max = src0[0];
+- float index = 0;
+- __m256 maxValues = _mm256_set1_ps(max);
+- __m256 maxValuesIndex = _mm256_setzero_ps();
+- __m256 compareResults;
+- __m256 currentValues;
++ __m256 indexIncrementValues = _mm256_set1_ps(8);
++ __m256 currentIndexes = _mm256_set_ps(-1, -2, -3, -4, -5, -6, -7, -8);
+
+- __VOLK_ATTR_ALIGNED(32) float maxValuesBuffer[8];
+- __VOLK_ATTR_ALIGNED(32) float maxIndexesBuffer[8];
++ float max = src0[0];
++ float index = 0;
++ __m256 maxValues = _mm256_set1_ps(max);
++ __m256 maxValuesIndex = _mm256_setzero_ps();
++ __m256 compareResults;
++ __m256 currentValues;
+
+- for(;number < eighthPoints; number++){
++ __VOLK_ATTR_ALIGNED(32) float maxValuesBuffer[8];
++ __VOLK_ATTR_ALIGNED(32) float maxIndexesBuffer[8];
+
+- currentValues = _mm256_loadu_ps(inputPtr); inputPtr += 8;
+- currentIndexes = _mm256_add_ps(currentIndexes, indexIncrementValues);
++ for (; number < eighthPoints; number++) {
+
+- compareResults = _mm256_cmp_ps(currentValues, maxValues, _CMP_GT_OS);
++ currentValues = _mm256_loadu_ps(inputPtr);
++ inputPtr += 8;
++ currentIndexes = _mm256_add_ps(currentIndexes, indexIncrementValues);
+
+- maxValuesIndex = _mm256_blendv_ps(maxValuesIndex, currentIndexes, compareResults);
+- maxValues = _mm256_blendv_ps(maxValues, currentValues, compareResults);
+- }
++ compareResults = _mm256_cmp_ps(currentValues, maxValues, _CMP_GT_OS);
+
+- // Calculate the largest value from the remaining 4 points
+- _mm256_storeu_ps(maxValuesBuffer, maxValues);
+- _mm256_storeu_ps(maxIndexesBuffer, maxValuesIndex);
++ maxValuesIndex = _mm256_blendv_ps(maxValuesIndex, currentIndexes, compareResults);
++ maxValues = _mm256_blendv_ps(maxValues, currentValues, compareResults);
++ }
+
+- for(number = 0; number < 8; number++){
+- if(maxValuesBuffer[number] > max){
+- index = maxIndexesBuffer[number];
+- max = maxValuesBuffer[number];
+- } else if(maxValuesBuffer[number] == max){
+- if (index > maxIndexesBuffer[number])
+- index = maxIndexesBuffer[number];
++    // Calculate the largest value from the remaining 8 points
++ _mm256_storeu_ps(maxValuesBuffer, maxValues);
++ _mm256_storeu_ps(maxIndexesBuffer, maxValuesIndex);
++
++ for (number = 0; number < 8; number++) {
++ if (maxValuesBuffer[number] > max) {
++ index = maxIndexesBuffer[number];
++ max = maxValuesBuffer[number];
++ } else if (maxValuesBuffer[number] == max) {
++ if (index > maxIndexesBuffer[number])
++ index = maxIndexesBuffer[number];
++ }
+ }
+- }
+
+- number = eighthPoints * 8;
+- for(;number < num_points; number++){
+- if(src0[number] > max){
+- index = number;
+- max = src0[number];
++ number = eighthPoints * 8;
++ for (; number < num_points; number++) {
++ if (src0[number] > max) {
++ index = number;
++ max = src0[number];
++ }
+ }
+- }
+- target[0] = (uint16_t)index;
++ target[0] = (uint16_t)index;
+ }
+
+ #endif /*LV_HAVE_AVX*/
+diff --git a/kernels/volk/volk_32f_index_max_32u.h b/kernels/volk/volk_32f_index_max_32u.h
+index 318c8e4..315531d 100644
+--- a/kernels/volk/volk_32f_index_max_32u.h
++++ b/kernels/volk/volk_32f_index_max_32u.h
+@@ -25,7 +25,8 @@
+ *
+ * \b Overview
+ *
+- * Returns Argmax_i x[i]. Finds and returns the index which contains the first maximum value in the given vector.
++ * Returns Argmax_i x[i]. Finds and returns the index which contains the first maximum
++ * value in the given vector.
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+@@ -64,70 +65,71 @@
+ #ifndef INCLUDED_volk_32f_index_max_32u_a_H
+ #define INCLUDED_volk_32f_index_max_32u_a_H
+
+-#include <volk/volk_common.h>
+-#include <volk/volk_common.h>
+ #include <inttypes.h>
+ #include <stdio.h>
++#include <volk/volk_common.h>
+
+ #ifdef LV_HAVE_SSE4_1
+-#include<smmintrin.h>
++#include <smmintrin.h>
+
+ static inline void
+ volk_32f_index_max_32u_a_sse4_1(uint32_t* target, const float* src0, uint32_t num_points)
+ {
+- if(num_points > 0){
+- uint32_t number = 0;
+- const uint32_t quarterPoints = num_points / 4;
++ if (num_points > 0) {
++ uint32_t number = 0;
++ const uint32_t quarterPoints = num_points / 4;
+
+- float* inputPtr = (float*)src0;
++ float* inputPtr = (float*)src0;
+
+- __m128 indexIncrementValues = _mm_set1_ps(4);
+- __m128 currentIndexes = _mm_set_ps(-1,-2,-3,-4);
++ __m128 indexIncrementValues = _mm_set1_ps(4);
++ __m128 currentIndexes = _mm_set_ps(-1, -2, -3, -4);
+
+- float max = src0[0];
+- float index = 0;
+- __m128 maxValues = _mm_set1_ps(max);
+- __m128 maxValuesIndex = _mm_setzero_ps();
+- __m128 compareResults;
+- __m128 currentValues;
++ float max = src0[0];
++ float index = 0;
++ __m128 maxValues = _mm_set1_ps(max);
++ __m128 maxValuesIndex = _mm_setzero_ps();
++ __m128 compareResults;
++ __m128 currentValues;
+
+- __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4];
+- __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4];
++ __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4];
++ __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4];
+
+- for(;number < quarterPoints; number++){
++ for (; number < quarterPoints; number++) {
+
+- currentValues = _mm_load_ps(inputPtr); inputPtr += 4;
+- currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
++ currentValues = _mm_load_ps(inputPtr);
++ inputPtr += 4;
++ currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
+
+- compareResults = _mm_cmpgt_ps(currentValues, maxValues);
++ compareResults = _mm_cmpgt_ps(currentValues, maxValues);
+
+- maxValuesIndex = _mm_blendv_ps(maxValuesIndex, currentIndexes, compareResults);
+- maxValues = _mm_blendv_ps(maxValues, currentValues, compareResults);
+- }
++ maxValuesIndex =
++ _mm_blendv_ps(maxValuesIndex, currentIndexes, compareResults);
++ maxValues = _mm_blendv_ps(maxValues, currentValues, compareResults);
++ }
+
+- // Calculate the largest value from the remaining 4 points
+- _mm_store_ps(maxValuesBuffer, maxValues);
+- _mm_store_ps(maxIndexesBuffer, maxValuesIndex);
+-
+- for(number = 0; number < 4; number++){
+- if(maxValuesBuffer[number] > max){
+- index = maxIndexesBuffer[number];
+- max = maxValuesBuffer[number];
+- } else if(maxValuesBuffer[number] == max){
+- if (index > maxIndexesBuffer[number])
+- index = maxIndexesBuffer[number];
+- }
+- }
++ // Calculate the largest value from the remaining 4 points
++ _mm_store_ps(maxValuesBuffer, maxValues);
++ _mm_store_ps(maxIndexesBuffer, maxValuesIndex);
++
++ for (number = 0; number < 4; number++) {
++ if (maxValuesBuffer[number] > max) {
++ index = maxIndexesBuffer[number];
++ max = maxValuesBuffer[number];
++ } else if (maxValuesBuffer[number] == max) {
++ if (index > maxIndexesBuffer[number])
++ index = maxIndexesBuffer[number];
++ }
++ }
+
+- number = quarterPoints * 4;
+- for(;number < num_points; number++){
+- if(src0[number] > max){
+- index = number;
+- max = src0[number];
+- }
++ number = quarterPoints * 4;
++ for (; number < num_points; number++) {
++ if (src0[number] > max) {
++ index = number;
++ max = src0[number];
++ }
++ }
++ target[0] = (uint32_t)index;
+ }
+- target[0] = (uint32_t)index;
+- }
+ }
+
+ #endif /*LV_HAVE_SSE4_1*/
+@@ -135,67 +137,68 @@ volk_32f_index_max_32u_a_sse4_1(uint32_t* target, const float* src0, uint32_t nu
+
+ #ifdef LV_HAVE_SSE
+
+-#include<xmmintrin.h>
++#include <xmmintrin.h>
+
+ static inline void
+ volk_32f_index_max_32u_a_sse(uint32_t* target, const float* src0, uint32_t num_points)
+ {
+- if(num_points > 0){
+- uint32_t number = 0;
+- const uint32_t quarterPoints = num_points / 4;
++ if (num_points > 0) {
++ uint32_t number = 0;
++ const uint32_t quarterPoints = num_points / 4;
+
+- float* inputPtr = (float*)src0;
++ float* inputPtr = (float*)src0;
+
+- __m128 indexIncrementValues = _mm_set1_ps(4);
+- __m128 currentIndexes = _mm_set_ps(-1,-2,-3,-4);
++ __m128 indexIncrementValues = _mm_set1_ps(4);
++ __m128 currentIndexes = _mm_set_ps(-1, -2, -3, -4);
+
+- float max = src0[0];
+- float index = 0;
+- __m128 maxValues = _mm_set1_ps(max);
+- __m128 maxValuesIndex = _mm_setzero_ps();
+- __m128 compareResults;
+- __m128 currentValues;
++ float max = src0[0];
++ float index = 0;
++ __m128 maxValues = _mm_set1_ps(max);
++ __m128 maxValuesIndex = _mm_setzero_ps();
++ __m128 compareResults;
++ __m128 currentValues;
+
+- __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4];
+- __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4];
++ __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4];
++ __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4];
+
+- for(;number < quarterPoints; number++){
++ for (; number < quarterPoints; number++) {
+
+- currentValues = _mm_load_ps(inputPtr); inputPtr += 4;
+- currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
++ currentValues = _mm_load_ps(inputPtr);
++ inputPtr += 4;
++ currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
+
+- compareResults = _mm_cmpgt_ps(currentValues, maxValues);
++ compareResults = _mm_cmpgt_ps(currentValues, maxValues);
+
+- maxValuesIndex = _mm_or_ps(_mm_and_ps(compareResults, currentIndexes),
+- _mm_andnot_ps(compareResults, maxValuesIndex));
++ maxValuesIndex = _mm_or_ps(_mm_and_ps(compareResults, currentIndexes),
++ _mm_andnot_ps(compareResults, maxValuesIndex));
+
+- maxValues = _mm_or_ps(_mm_and_ps(compareResults, currentValues),
+- _mm_andnot_ps(compareResults, maxValues));
+- }
++ maxValues = _mm_or_ps(_mm_and_ps(compareResults, currentValues),
++ _mm_andnot_ps(compareResults, maxValues));
++ }
+
+- // Calculate the largest value from the remaining 4 points
+- _mm_store_ps(maxValuesBuffer, maxValues);
+- _mm_store_ps(maxIndexesBuffer, maxValuesIndex);
+-
+- for(number = 0; number < 4; number++){
+- if(maxValuesBuffer[number] > max){
+- index = maxIndexesBuffer[number];
+- max = maxValuesBuffer[number];
+- } else if(maxValuesBuffer[number] == max){
+- if (index > maxIndexesBuffer[number])
+- index = maxIndexesBuffer[number];
+- }
+- }
++ // Calculate the largest value from the remaining 4 points
++ _mm_store_ps(maxValuesBuffer, maxValues);
++ _mm_store_ps(maxIndexesBuffer, maxValuesIndex);
++
++ for (number = 0; number < 4; number++) {
++ if (maxValuesBuffer[number] > max) {
++ index = maxIndexesBuffer[number];
++ max = maxValuesBuffer[number];
++ } else if (maxValuesBuffer[number] == max) {
++ if (index > maxIndexesBuffer[number])
++ index = maxIndexesBuffer[number];
++ }
++ }
+
+- number = quarterPoints * 4;
+- for(;number < num_points; number++){
+- if(src0[number] > max){
+- index = number;
+- max = src0[number];
+- }
++ number = quarterPoints * 4;
++ for (; number < num_points; number++) {
++ if (src0[number] > max) {
++ index = number;
++ max = src0[number];
++ }
++ }
++ target[0] = (uint32_t)index;
+ }
+- target[0] = (uint32_t)index;
+- }
+ }
+
+ #endif /*LV_HAVE_SSE*/
+@@ -204,65 +207,61 @@ volk_32f_index_max_32u_a_sse(uint32_t* target, const float* src0, uint32_t num_p
+ #ifdef LV_HAVE_AVX
+ #include <immintrin.h>
+
+-static inline void volk_32f_index_max_32u_a_avx(uint32_t* target, const float* src0, uint32_t num_points)
++static inline void
++volk_32f_index_max_32u_a_avx(uint32_t* target, const float* src0, uint32_t num_points)
+ {
+- if(num_points > 0)
+- {
+- uint32_t number = 0;
+- const uint32_t quarterPoints = num_points / 8;
+-
+- float* inputPtr = (float*)src0;
+-
+- __m256 indexIncrementValues = _mm256_set1_ps(8);
+- __m256 currentIndexes = _mm256_set_ps(-1,-2,-3,-4,-5,-6,-7,-8);
+-
+- float max = src0[0];
+- float index = 0;
+- __m256 maxValues = _mm256_set1_ps(max);
+- __m256 maxValuesIndex = _mm256_setzero_ps();
+- __m256 compareResults;
+- __m256 currentValues;
+-
+- __VOLK_ATTR_ALIGNED(32) float maxValuesBuffer[8];
+- __VOLK_ATTR_ALIGNED(32) float maxIndexesBuffer[8];
+-
+- for(;number < quarterPoints; number++)
+- {
+- currentValues = _mm256_load_ps(inputPtr); inputPtr += 8;
+- currentIndexes = _mm256_add_ps(currentIndexes, indexIncrementValues);
+- compareResults = _mm256_cmp_ps(currentValues, maxValues, _CMP_GT_OS);
+- maxValuesIndex = _mm256_blendv_ps(maxValuesIndex, currentIndexes, compareResults);
+- maxValues = _mm256_blendv_ps(maxValues, currentValues, compareResults);
+- }
+-
+- // Calculate the largest value from the remaining 8 points
+- _mm256_store_ps(maxValuesBuffer, maxValues);
+- _mm256_store_ps(maxIndexesBuffer, maxValuesIndex);
+-
+- for(number = 0; number < 8; number++)
+- {
+- if(maxValuesBuffer[number] > max)
+- {
+- index = maxIndexesBuffer[number];
+- max = maxValuesBuffer[number];
+- }
+- else if(maxValuesBuffer[number] == max){
+- if (index > maxIndexesBuffer[number])
+- index = maxIndexesBuffer[number];
+- }
+- }
+-
+- number = quarterPoints * 8;
+- for(;number < num_points; number++)
+- {
+- if(src0[number] > max)
+- {
+- index = number;
+- max = src0[number];
+- }
+- }
+- target[0] = (uint32_t)index;
++ if (num_points > 0) {
++ uint32_t number = 0;
++ const uint32_t quarterPoints = num_points / 8;
++
++ float* inputPtr = (float*)src0;
++
++ __m256 indexIncrementValues = _mm256_set1_ps(8);
++ __m256 currentIndexes = _mm256_set_ps(-1, -2, -3, -4, -5, -6, -7, -8);
++
++ float max = src0[0];
++ float index = 0;
++ __m256 maxValues = _mm256_set1_ps(max);
++ __m256 maxValuesIndex = _mm256_setzero_ps();
++ __m256 compareResults;
++ __m256 currentValues;
++
++ __VOLK_ATTR_ALIGNED(32) float maxValuesBuffer[8];
++ __VOLK_ATTR_ALIGNED(32) float maxIndexesBuffer[8];
++
++ for (; number < quarterPoints; number++) {
++ currentValues = _mm256_load_ps(inputPtr);
++ inputPtr += 8;
++ currentIndexes = _mm256_add_ps(currentIndexes, indexIncrementValues);
++ compareResults = _mm256_cmp_ps(currentValues, maxValues, _CMP_GT_OS);
++ maxValuesIndex =
++ _mm256_blendv_ps(maxValuesIndex, currentIndexes, compareResults);
++ maxValues = _mm256_blendv_ps(maxValues, currentValues, compareResults);
++ }
++
++ // Calculate the largest value from the remaining 8 points
++ _mm256_store_ps(maxValuesBuffer, maxValues);
++ _mm256_store_ps(maxIndexesBuffer, maxValuesIndex);
++
++ for (number = 0; number < 8; number++) {
++ if (maxValuesBuffer[number] > max) {
++ index = maxIndexesBuffer[number];
++ max = maxValuesBuffer[number];
++ } else if (maxValuesBuffer[number] == max) {
++ if (index > maxIndexesBuffer[number])
++ index = maxIndexesBuffer[number];
++ }
++ }
++
++ number = quarterPoints * 8;
++ for (; number < num_points; number++) {
++ if (src0[number] > max) {
++ index = number;
++ max = src0[number];
++ }
+ }
++ target[0] = (uint32_t)index;
++ }
+ }
+
+ #endif /*LV_HAVE_AVX*/
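
The index bookkeeping in these SSE/AVX kernels works the same way in every lane: currentIndexes starts at -width..-1, has width added before each compare so that every lane holds the element index it just loaded, and the winning per-lane values and indexes are spilled to aligned buffers for a final scalar reduction that prefers the lowest index on ties. A stand-alone scalar model of that scheme (hypothetical helper, not VOLK code):

#include <stdint.h>
#include <stdio.h>

#define LANES 4 /* 4 in the SSE kernels, 8 in the AVX kernels above */

static uint32_t argmax_lanes(const float* src, uint32_t num_points)
{
    if (num_points == 0) /* mirrors the num_points > 0 guard in the kernels */
        return 0;

    float max_val[LANES], max_idx[LANES], cur_idx[LANES];
    const uint32_t chunks = num_points / LANES;

    for (int l = 0; l < LANES; l++) {
        max_val[l] = src[0];
        max_idx[l] = 0.0f;
        cur_idx[l] = (float)(l - LANES); /* -4,-3,-2,-1: pre-decremented indexes */
    }

    for (uint32_t n = 0; n < chunks; n++) {
        for (int l = 0; l < LANES; l++) {
            cur_idx[l] += LANES; /* now the index of src[n * LANES + l] */
            const float v = src[n * LANES + l];
            if (v > max_val[l]) { /* the cmpgt + blend step */
                max_val[l] = v;
                max_idx[l] = cur_idx[l];
            }
        }
    }

    float max = src[0];
    float index = 0.0f;
    for (int l = 0; l < LANES; l++) { /* horizontal reduction over the lanes */
        if (max_val[l] > max) {
            max = max_val[l];
            index = max_idx[l];
        } else if (max_val[l] == max && max_idx[l] < index) {
            index = max_idx[l]; /* ties go to the lower index */
        }
    }
    for (uint32_t n = chunks * LANES; n < num_points; n++) { /* scalar tail */
        if (src[n] > max) {
            max = src[n];
            index = (float)n;
        }
    }
    return (uint32_t)index;
}

int main(void)
{
    const float v[] = { 1.0f, 5.0f, 3.0f, 5.0f, 2.0f, 0.0f };
    printf("%u\n", argmax_lanes(v, 6)); /* prints 1: first occurrence of 5.0 */
    return 0;
}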
+@@ -271,66 +270,63 @@ static inline void volk_32f_index_max_32u_a_avx(uint32_t* target, const float* s
+ #ifdef LV_HAVE_NEON
+ #include <arm_neon.h>
+
+-static inline void volk_32f_index_max_32u_neon(uint32_t* target, const float* src0, uint32_t num_points)
++static inline void
++volk_32f_index_max_32u_neon(uint32_t* target, const float* src0, uint32_t num_points)
+ {
+- if(num_points > 0)
+- {
+- uint32_t number = 0;
+- const uint32_t quarterPoints = num_points / 4;
+-
+- float* inputPtr = (float*)src0;
+- float32x4_t indexIncrementValues = vdupq_n_f32(4);
+- __VOLK_ATTR_ALIGNED(16) float currentIndexes_float[4] = { -4.0f, -3.0f, -2.0f, -1.0f };
+- float32x4_t currentIndexes = vld1q_f32(currentIndexes_float);
+-
+- float max = src0[0];
+- float index = 0;
+- float32x4_t maxValues = vdupq_n_f32(max);
+- uint32x4_t maxValuesIndex = vmovq_n_u32(0);
+- uint32x4_t compareResults;
+- uint32x4_t currentIndexes_u;
+- float32x4_t currentValues;
+-
+- __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4];
+- __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4];
+-
+- for(;number < quarterPoints; number++)
+- {
+- currentValues = vld1q_f32(inputPtr); inputPtr += 4;
+- currentIndexes = vaddq_f32(currentIndexes, indexIncrementValues);
+- currentIndexes_u = vcvtq_u32_f32(currentIndexes);
+- compareResults = vcleq_f32(currentValues, maxValues);
+- maxValuesIndex = vorrq_u32( vandq_u32( compareResults, maxValuesIndex ), vbicq_u32(currentIndexes_u, compareResults) );
+- maxValues = vmaxq_f32(currentValues, maxValues);
+- }
+-
+- // Calculate the largest value from the remaining 4 points
+- vst1q_f32(maxValuesBuffer, maxValues);
+- vst1q_f32(maxIndexesBuffer, vcvtq_f32_u32(maxValuesIndex));
+- for(number = 0; number < 4; number++)
+- {
+- if(maxValuesBuffer[number] > max)
+- {
+- index = maxIndexesBuffer[number];
+- max = maxValuesBuffer[number];
+- }
+- else if(maxValues[number] == max){
+- if (index > maxIndexesBuffer[number])
+- index = maxIndexesBuffer[number];
+- }
+- }
+-
+- number = quarterPoints * 4;
+- for(;number < num_points; number++)
+- {
+- if(src0[number] > max)
+- {
+- index = number;
+- max = src0[number];
+- }
+- }
+- target[0] = (uint32_t)index;
++ if (num_points > 0) {
++ uint32_t number = 0;
++ const uint32_t quarterPoints = num_points / 4;
++
++ float* inputPtr = (float*)src0;
++ float32x4_t indexIncrementValues = vdupq_n_f32(4);
++ __VOLK_ATTR_ALIGNED(16)
++ float currentIndexes_float[4] = { -4.0f, -3.0f, -2.0f, -1.0f };
++ float32x4_t currentIndexes = vld1q_f32(currentIndexes_float);
++
++ float max = src0[0];
++ float index = 0;
++ float32x4_t maxValues = vdupq_n_f32(max);
++ uint32x4_t maxValuesIndex = vmovq_n_u32(0);
++ uint32x4_t compareResults;
++ uint32x4_t currentIndexes_u;
++ float32x4_t currentValues;
++
++ __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4];
++ __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4];
++
++ for (; number < quarterPoints; number++) {
++ currentValues = vld1q_f32(inputPtr);
++ inputPtr += 4;
++ currentIndexes = vaddq_f32(currentIndexes, indexIncrementValues);
++ currentIndexes_u = vcvtq_u32_f32(currentIndexes);
++ compareResults = vcleq_f32(currentValues, maxValues);
++ maxValuesIndex = vorrq_u32(vandq_u32(compareResults, maxValuesIndex),
++ vbicq_u32(currentIndexes_u, compareResults));
++ maxValues = vmaxq_f32(currentValues, maxValues);
++ }
++
++ // Calculate the largest value from the remaining 4 points
++ vst1q_f32(maxValuesBuffer, maxValues);
++ vst1q_f32(maxIndexesBuffer, vcvtq_f32_u32(maxValuesIndex));
++ for (number = 0; number < 4; number++) {
++ if (maxValuesBuffer[number] > max) {
++ index = maxIndexesBuffer[number];
++ max = maxValuesBuffer[number];
++ } else if (maxValues[number] == max) {
++ if (index > maxIndexesBuffer[number])
++ index = maxIndexesBuffer[number];
++ }
++ }
++
++ number = quarterPoints * 4;
++ for (; number < num_points; number++) {
++ if (src0[number] > max) {
++ index = number;
++ max = src0[number];
++ }
+ }
++ target[0] = (uint32_t)index;
++ }
+ }
+
+ #endif /*LV_HAVE_NEON*/
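
All of these vector variants carry the element index through single-precision floats (currentIndexes, maxValuesIndex), so the index arithmetic is exact only while indexes stay below 2^24 = 16777216; past that point consecutive integers are no longer representable as floats. A quick stand-alone check of that limit:

#include <stdio.h>

int main(void)
{
    /* Every integer up to 2^24 has an exact float representation;
     * 2^24 + 1 is the first that does not and rounds back to 2^24. */
    float a = 16777216.0f;           /* 2^24     */
    float b = (float)(16777216 + 1); /* 2^24 + 1 */
    printf("a=%.1f b=%.1f equal=%d\n", a, b, a == b); /* equal=1 */
    return 0;
}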
+@@ -341,20 +337,20 @@ static inline void volk_32f_index_max_32u_neon(uint32_t* target, const float* sr
+ static inline void
+ volk_32f_index_max_32u_generic(uint32_t* target, const float* src0, uint32_t num_points)
+ {
+- if(num_points > 0){
+- float max = src0[0];
+- uint32_t index = 0;
++ if (num_points > 0) {
++ float max = src0[0];
++ uint32_t index = 0;
+
+- uint32_t i = 1;
++ uint32_t i = 1;
+
+- for(; i < num_points; ++i) {
+- if(src0[i] > max){
+- index = i;
+- max = src0[i];
+- }
++ for (; i < num_points; ++i) {
++ if (src0[i] > max) {
++ index = i;
++ max = src0[i];
++ }
++ }
++ target[0] = index;
+ }
+- target[0] = index;
+- }
+ }
+
+ #endif /*LV_HAVE_GENERIC*/
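
Because every variant only replaces the running maximum on a strictly-greater compare, and the SIMD reductions break ties toward the lower index, the kernel reports the first occurrence of the maximum. A hypothetical usage sketch through the public dispatcher, assuming VOLK is installed and the program is linked with -lvolk:

#include <stdint.h>
#include <stdio.h>
#include <volk/volk.h>

int main(void)
{
    const size_t alignment = volk_get_alignment();
    float* in = (float*)volk_malloc(8 * sizeof(float), alignment);
    uint32_t* out = (uint32_t*)volk_malloc(sizeof(uint32_t), alignment);

    const float data[8] = { 0.5f, 2.0f, -1.0f, 2.0f, 1.5f, 0.0f, 2.0f, 0.25f };
    for (int i = 0; i < 8; i++)
        in[i] = data[i];

    volk_32f_index_max_32u(out, in, 8);
    printf("max first seen at index %u\n", (unsigned)*out); /* index 1 */

    volk_free(in);
    volk_free(out);
    return 0;
}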
+@@ -366,209 +362,195 @@ volk_32f_index_max_32u_generic(uint32_t* target, const float* src0, uint32_t num
+ #ifndef INCLUDED_volk_32f_index_max_32u_u_H
+ #define INCLUDED_volk_32f_index_max_32u_u_H
+
+-#include <volk/volk_common.h>
+-#include <volk/volk_common.h>
+ #include <inttypes.h>
+ #include <stdio.h>
++#include <volk/volk_common.h>
+
+
+ #ifdef LV_HAVE_AVX
+ #include <immintrin.h>
+
+-static inline void volk_32f_index_max_32u_u_avx(uint32_t* target, const float* src0, uint32_t num_points)
++static inline void
++volk_32f_index_max_32u_u_avx(uint32_t* target, const float* src0, uint32_t num_points)
+ {
+- if(num_points > 0)
+- {
+- uint32_t number = 0;
+- const uint32_t quarterPoints = num_points / 8;
+-
+- float* inputPtr = (float*)src0;
+-
+- __m256 indexIncrementValues = _mm256_set1_ps(8);
+- __m256 currentIndexes = _mm256_set_ps(-1,-2,-3,-4,-5,-6,-7,-8);
+-
+- float max = src0[0];
+- float index = 0;
+- __m256 maxValues = _mm256_set1_ps(max);
+- __m256 maxValuesIndex = _mm256_setzero_ps();
+- __m256 compareResults;
+- __m256 currentValues;
+-
+- __VOLK_ATTR_ALIGNED(32) float maxValuesBuffer[8];
+- __VOLK_ATTR_ALIGNED(32) float maxIndexesBuffer[8];
+-
+- for(;number < quarterPoints; number++)
+- {
+- currentValues = _mm256_loadu_ps(inputPtr); inputPtr += 8;
+- currentIndexes = _mm256_add_ps(currentIndexes, indexIncrementValues);
+- compareResults = _mm256_cmp_ps(currentValues, maxValues, _CMP_GT_OS);
+- maxValuesIndex = _mm256_blendv_ps(maxValuesIndex, currentIndexes, compareResults);
+- maxValues = _mm256_blendv_ps(maxValues, currentValues, compareResults);
+- }
+-
+- // Calculate the largest value from the remaining 8 points
+- _mm256_store_ps(maxValuesBuffer, maxValues);
+- _mm256_store_ps(maxIndexesBuffer, maxValuesIndex);
+-
+- for(number = 0; number < 8; number++)
+- {
+- if(maxValuesBuffer[number] > max)
+- {
+- index = maxIndexesBuffer[number];
+- max = maxValuesBuffer[number];
+- }
+- else if(maxValuesBuffer[number] == max){
+- if (index > maxIndexesBuffer[number])
+- index = maxIndexesBuffer[number];
+- }
+- }
+-
+- number = quarterPoints * 8;
+- for(;number < num_points; number++)
+- {
+- if(src0[number] > max)
+- {
+- index = number;
+- max = src0[number];
+- }
+- }
+- target[0] = (uint32_t)index;
++ if (num_points > 0) {
++ uint32_t number = 0;
++ const uint32_t quarterPoints = num_points / 8;
++
++ float* inputPtr = (float*)src0;
++
++ __m256 indexIncrementValues = _mm256_set1_ps(8);
++ __m256 currentIndexes = _mm256_set_ps(-1, -2, -3, -4, -5, -6, -7, -8);
++
++ float max = src0[0];
++ float index = 0;
++ __m256 maxValues = _mm256_set1_ps(max);
++ __m256 maxValuesIndex = _mm256_setzero_ps();
++ __m256 compareResults;
++ __m256 currentValues;
++
++ __VOLK_ATTR_ALIGNED(32) float maxValuesBuffer[8];
++ __VOLK_ATTR_ALIGNED(32) float maxIndexesBuffer[8];
++
++ for (; number < quarterPoints; number++) {
++ currentValues = _mm256_loadu_ps(inputPtr);
++ inputPtr += 8;
++ currentIndexes = _mm256_add_ps(currentIndexes, indexIncrementValues);
++ compareResults = _mm256_cmp_ps(currentValues, maxValues, _CMP_GT_OS);
++ maxValuesIndex =
++ _mm256_blendv_ps(maxValuesIndex, currentIndexes, compareResults);
++ maxValues = _mm256_blendv_ps(maxValues, currentValues, compareResults);
+ }
++
++ // Calculate the largest value from the remaining 8 points
++ _mm256_store_ps(maxValuesBuffer, maxValues);
++ _mm256_store_ps(maxIndexesBuffer, maxValuesIndex);
++
++ for (number = 0; number < 8; number++) {
++ if (maxValuesBuffer[number] > max) {
++ index = maxIndexesBuffer[number];
++ max = maxValuesBuffer[number];
++ } else if (maxValuesBuffer[number] == max) {
++ if (index > maxIndexesBuffer[number])
++ index = maxIndexesBuffer[number];
++ }
++ }
++
++ number = quarterPoints * 8;
++ for (; number < num_points; number++) {
++ if (src0[number] > max) {
++ index = number;
++ max = src0[number];
++ }
++ }
++ target[0] = (uint32_t)index;
++ }
+ }
+
+ #endif /*LV_HAVE_AVX*/
+
+
+ #ifdef LV_HAVE_SSE4_1
+-#include<smmintrin.h>
++#include <smmintrin.h>
+
+-static inline void volk_32f_index_max_32u_u_sse4_1(uint32_t* target, const float* src0, uint32_t num_points)
++static inline void
++volk_32f_index_max_32u_u_sse4_1(uint32_t* target, const float* src0, uint32_t num_points)
+ {
+- if(num_points > 0)
+- {
+- uint32_t number = 0;
+- const uint32_t quarterPoints = num_points / 4;
+-
+- float* inputPtr = (float*)src0;
+-
+- __m128 indexIncrementValues = _mm_set1_ps(4);
+- __m128 currentIndexes = _mm_set_ps(-1,-2,-3,-4);
+-
+- float max = src0[0];
+- float index = 0;
+- __m128 maxValues = _mm_set1_ps(max);
+- __m128 maxValuesIndex = _mm_setzero_ps();
+- __m128 compareResults;
+- __m128 currentValues;
+-
+- __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4];
+- __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4];
+-
+- for(;number < quarterPoints; number++)
+- {
+- currentValues = _mm_loadu_ps(inputPtr); inputPtr += 4;
+- currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
+- compareResults = _mm_cmpgt_ps(currentValues, maxValues);
+- maxValuesIndex = _mm_blendv_ps(maxValuesIndex, currentIndexes, compareResults);
+- maxValues = _mm_blendv_ps(maxValues, currentValues, compareResults);
+- }
+-
+- // Calculate the largest value from the remaining 4 points
+- _mm_store_ps(maxValuesBuffer, maxValues);
+- _mm_store_ps(maxIndexesBuffer, maxValuesIndex);
+-
+- for(number = 0; number < 4; number++)
+- {
+- if(maxValuesBuffer[number] > max)
+- {
+- index = maxIndexesBuffer[number];
+- max = maxValuesBuffer[number];
+- }
+- else if(maxValuesBuffer[number] == max){
+- if (index > maxIndexesBuffer[number])
+- index = maxIndexesBuffer[number];
+- }
+- }
+-
+- number = quarterPoints * 4;
+- for(;number < num_points; number++)
+- {
+- if(src0[number] > max)
+- {
+- index = number;
+- max = src0[number];
+- }
+- }
+- target[0] = (uint32_t)index;
++ if (num_points > 0) {
++ uint32_t number = 0;
++ const uint32_t quarterPoints = num_points / 4;
++
++ float* inputPtr = (float*)src0;
++
++ __m128 indexIncrementValues = _mm_set1_ps(4);
++ __m128 currentIndexes = _mm_set_ps(-1, -2, -3, -4);
++
++ float max = src0[0];
++ float index = 0;
++ __m128 maxValues = _mm_set1_ps(max);
++ __m128 maxValuesIndex = _mm_setzero_ps();
++ __m128 compareResults;
++ __m128 currentValues;
++
++ __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4];
++ __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4];
++
++ for (; number < quarterPoints; number++) {
++ currentValues = _mm_loadu_ps(inputPtr);
++ inputPtr += 4;
++ currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
++ compareResults = _mm_cmpgt_ps(currentValues, maxValues);
++ maxValuesIndex =
++ _mm_blendv_ps(maxValuesIndex, currentIndexes, compareResults);
++ maxValues = _mm_blendv_ps(maxValues, currentValues, compareResults);
+ }
++
++ // Calculate the largest value from the remaining 4 points
++ _mm_store_ps(maxValuesBuffer, maxValues);
++ _mm_store_ps(maxIndexesBuffer, maxValuesIndex);
++
++ for (number = 0; number < 4; number++) {
++ if (maxValuesBuffer[number] > max) {
++ index = maxIndexesBuffer[number];
++ max = maxValuesBuffer[number];
++ } else if (maxValuesBuffer[number] == max) {
++ if (index > maxIndexesBuffer[number])
++ index = maxIndexesBuffer[number];
++ }
++ }
++
++ number = quarterPoints * 4;
++ for (; number < num_points; number++) {
++ if (src0[number] > max) {
++ index = number;
++ max = src0[number];
++ }
++ }
++ target[0] = (uint32_t)index;
++ }
+ }
+
+ #endif /*LV_HAVE_SSE4_1*/
+
+ #ifdef LV_HAVE_SSE
+-#include<xmmintrin.h>
++#include <xmmintrin.h>
+
+-static inline void volk_32f_index_max_32u_u_sse(uint32_t* target, const float* src0, uint32_t num_points)
++static inline void
++volk_32f_index_max_32u_u_sse(uint32_t* target, const float* src0, uint32_t num_points)
+ {
+- if(num_points > 0)
+- {
+- uint32_t number = 0;
+- const uint32_t quarterPoints = num_points / 4;
+-
+- float* inputPtr = (float*)src0;
+-
+- __m128 indexIncrementValues = _mm_set1_ps(4);
+- __m128 currentIndexes = _mm_set_ps(-1,-2,-3,-4);
+-
+- float max = src0[0];
+- float index = 0;
+- __m128 maxValues = _mm_set1_ps(max);
+- __m128 maxValuesIndex = _mm_setzero_ps();
+- __m128 compareResults;
+- __m128 currentValues;
+-
+- __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4];
+- __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4];
+-
+- for(;number < quarterPoints; number++)
+- {
+- currentValues = _mm_loadu_ps(inputPtr); inputPtr += 4;
+- currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
+- compareResults = _mm_cmpgt_ps(currentValues, maxValues);
+- maxValuesIndex = _mm_or_ps(_mm_and_ps(compareResults, currentIndexes),
+- _mm_andnot_ps(compareResults, maxValuesIndex));
+- maxValues = _mm_or_ps(_mm_and_ps(compareResults, currentValues),
+- _mm_andnot_ps(compareResults, maxValues));
+- }
+-
+- // Calculate the largest value from the remaining 4 points
+- _mm_store_ps(maxValuesBuffer, maxValues);
+- _mm_store_ps(maxIndexesBuffer, maxValuesIndex);
+-
+- for(number = 0; number < 4; number++)
+- {
+- if(maxValuesBuffer[number] > max)
+- {
+- index = maxIndexesBuffer[number];
+- max = maxValuesBuffer[number];
+- }
+- else if(maxValuesBuffer[number] == max){
+- if (index > maxIndexesBuffer[number])
+- index = maxIndexesBuffer[number];
+- }
+- }
+-
+- number = quarterPoints * 4;
+- for(;number < num_points; number++)
+- {
+- if(src0[number] > max)
+- {
+- index = number;
+- max = src0[number];
+- }
+- }
+- target[0] = (uint32_t)index;
++ if (num_points > 0) {
++ uint32_t number = 0;
++ const uint32_t quarterPoints = num_points / 4;
++
++ float* inputPtr = (float*)src0;
++
++ __m128 indexIncrementValues = _mm_set1_ps(4);
++ __m128 currentIndexes = _mm_set_ps(-1, -2, -3, -4);
++
++ float max = src0[0];
++ float index = 0;
++ __m128 maxValues = _mm_set1_ps(max);
++ __m128 maxValuesIndex = _mm_setzero_ps();
++ __m128 compareResults;
++ __m128 currentValues;
++
++ __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4];
++ __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4];
++
++ for (; number < quarterPoints; number++) {
++ currentValues = _mm_loadu_ps(inputPtr);
++ inputPtr += 4;
++ currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
++ compareResults = _mm_cmpgt_ps(currentValues, maxValues);
++ maxValuesIndex = _mm_or_ps(_mm_and_ps(compareResults, currentIndexes),
++ _mm_andnot_ps(compareResults, maxValuesIndex));
++ maxValues = _mm_or_ps(_mm_and_ps(compareResults, currentValues),
++ _mm_andnot_ps(compareResults, maxValues));
+ }
++
++ // Calculate the largest value from the remaining 4 points
++ _mm_store_ps(maxValuesBuffer, maxValues);
++ _mm_store_ps(maxIndexesBuffer, maxValuesIndex);
++
++ for (number = 0; number < 4; number++) {
++ if (maxValuesBuffer[number] > max) {
++ index = maxIndexesBuffer[number];
++ max = maxValuesBuffer[number];
++ } else if (maxValuesBuffer[number] == max) {
++ if (index > maxIndexesBuffer[number])
++ index = maxIndexesBuffer[number];
++ }
++ }
++
++ number = quarterPoints * 4;
++ for (; number < num_points; number++) {
++ if (src0[number] > max) {
++ index = number;
++ max = src0[number];
++ }
++ }
++ target[0] = (uint32_t)index;
++ }
+ }
+
+ #endif /*LV_HAVE_SSE*/
+diff --git a/kernels/volk/volk_32f_invsqrt_32f.h b/kernels/volk/volk_32f_invsqrt_32f.h
+index e416321..e545515 100644
+--- a/kernels/volk/volk_32f_invsqrt_32f.h
++++ b/kernels/volk/volk_32f_invsqrt_32f.h
+@@ -30,8 +30,8 @@
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+- * void volk_32f_invsqrt_32f(float* cVector, const float* aVector, unsigned int num_points)
+- * \endcode
++ * void volk_32f_invsqrt_32f(float* cVector, const float* aVector, unsigned int
++ * num_points) \endcode
+ *
+ * \b Inputs
+ * \li aVector: the input vector of floats.
+@@ -66,27 +66,27 @@
+ #define INCLUDED_volk_32f_invsqrt_32f_a_H
+
+ #include <inttypes.h>
+-#include <stdio.h>
+ #include <math.h>
++#include <stdio.h>
+ #include <string.h>
+
+-static inline float
+-Q_rsqrt(float number)
++static inline float Q_rsqrt(float number)
+ {
+- float x2;
+- const float threehalfs = 1.5F;
+- union f32_to_i32 {
+- int32_t i;
+- float f;
+- } u;
+-
+- x2 = number * 0.5F;
+- u.f = number;
+- u.i = 0x5f3759df - ( u.i >> 1 ); // what the fuck?
+- u.f = u.f * ( threehalfs - ( x2 * u.f * u.f ) ); // 1st iteration
+- //u.f = u.f * ( threehalfs - ( x2 * u.f * u.f ) ); // 2nd iteration, this can be removed
+-
+- return u.f;
++ float x2;
++ const float threehalfs = 1.5F;
++ union f32_to_i32 {
++ int32_t i;
++ float f;
++ } u;
++
++ x2 = number * 0.5F;
++ u.f = number;
++ u.i = 0x5f3759df - (u.i >> 1); // what the fuck?
++ u.f = u.f * (threehalfs - (x2 * u.f * u.f)); // 1st iteration
++ // u.f = u.f * ( threehalfs - ( x2 * u.f * u.f ) ); // 2nd iteration, this can be
++ // removed
++
++ return u.f;
+ }
+
+ #ifdef LV_HAVE_AVX
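
Q_rsqrt above is the classic fast inverse square root: reinterpret the float's bits as an integer, subtract the shifted bits from the magic constant 0x5f3759df to get a rough first guess at 1/sqrt(x), then apply one Newton-Raphson step. A stand-alone sketch (compile with -lm) that compares it against 1.0f / sqrtf; over this range the relative error stays well below one percent:

#include <math.h>
#include <stdint.h>
#include <stdio.h>

static float q_rsqrt(float number) /* same algorithm as Q_rsqrt above */
{
    union { int32_t i; float f; } u;
    const float x2 = number * 0.5F;
    u.f = number;
    u.i = 0x5f3759df - (u.i >> 1);         /* initial bit-level guess       */
    u.f = u.f * (1.5F - (x2 * u.f * u.f)); /* one Newton-Raphson refinement */
    return u.f;
}

int main(void)
{
    float worst = 0.0f;
    for (float x = 0.01f; x < 1000.0f; x *= 1.1f) {
        const float ref = 1.0f / sqrtf(x);
        const float err = fabsf(q_rsqrt(x) - ref) / ref;
        if (err > worst)
            worst = err;
    }
    printf("worst relative error: %g\n", worst); /* a fraction of a percent */
    return 0;
}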
+@@ -95,24 +95,23 @@ Q_rsqrt(float number)
+ static inline void
+ volk_32f_invsqrt_32f_a_avx(float* cVector, const float* aVector, unsigned int num_points)
+ {
+- unsigned int number = 0;
+- const unsigned int eighthPoints = num_points / 8;
+-
+- float* cPtr = cVector;
+- const float* aPtr = aVector;
+- __m256 aVal, cVal;
+- for (; number < eighthPoints; number++) {
+- aVal = _mm256_load_ps(aPtr);
+- cVal = _mm256_rsqrt_ps(aVal);
+- _mm256_store_ps(cPtr, cVal);
+- aPtr += 8;
+- cPtr += 8;
+- }
+-
+- number = eighthPoints * 8;
+- for(;number < num_points; number++)
+- *cPtr++ = Q_rsqrt(*aPtr++);
+-
++ unsigned int number = 0;
++ const unsigned int eighthPoints = num_points / 8;
++
++ float* cPtr = cVector;
++ const float* aPtr = aVector;
++ __m256 aVal, cVal;
++ for (; number < eighthPoints; number++) {
++ aVal = _mm256_load_ps(aPtr);
++ cVal = _mm256_rsqrt_ps(aVal);
++ _mm256_store_ps(cPtr, cVal);
++ aPtr += 8;
++ cPtr += 8;
++ }
++
++ number = eighthPoints * 8;
++ for (; number < num_points; number++)
++ *cPtr++ = Q_rsqrt(*aPtr++);
+ }
+ #endif /* LV_HAVE_AVX */
+
+@@ -123,29 +122,29 @@ volk_32f_invsqrt_32f_a_avx(float* cVector, const float* aVector, unsigned int nu
+ static inline void
+ volk_32f_invsqrt_32f_a_sse(float* cVector, const float* aVector, unsigned int num_points)
+ {
+- unsigned int number = 0;
+- const unsigned int quarterPoints = num_points / 4;
++ unsigned int number = 0;
++ const unsigned int quarterPoints = num_points / 4;
+
+- float* cPtr = cVector;
+- const float* aPtr = aVector;
++ float* cPtr = cVector;
++ const float* aPtr = aVector;
+
+- __m128 aVal, cVal;
+- for(;number < quarterPoints; number++){
++ __m128 aVal, cVal;
++ for (; number < quarterPoints; number++) {
+
+- aVal = _mm_load_ps(aPtr);
++ aVal = _mm_load_ps(aPtr);
+
+- cVal = _mm_rsqrt_ps(aVal);
++ cVal = _mm_rsqrt_ps(aVal);
+
+- _mm_store_ps(cPtr,cVal); // Store the results back into the C container
++ _mm_store_ps(cPtr, cVal); // Store the results back into the C container
+
+- aPtr += 4;
+- cPtr += 4;
+- }
++ aPtr += 4;
++ cPtr += 4;
++ }
+
+- number = quarterPoints * 4;
+- for(;number < num_points; number++) {
+- *cPtr++ = Q_rsqrt(*aPtr++);
+- }
++ number = quarterPoints * 4;
++ for (; number < num_points; number++) {
++ *cPtr++ = Q_rsqrt(*aPtr++);
++ }
+ }
+ #endif /* LV_HAVE_SSE */
+
+@@ -156,37 +155,38 @@ volk_32f_invsqrt_32f_a_sse(float* cVector, const float* aVector, unsigned int nu
+ static inline void
+ volk_32f_invsqrt_32f_neon(float* cVector, const float* aVector, unsigned int num_points)
+ {
+- unsigned int number;
+- const unsigned int quarter_points = num_points / 4;
+-
+- float* cPtr = cVector;
+- const float* aPtr = aVector;
+- float32x4_t a_val, c_val;
+- for (number = 0; number < quarter_points; ++number) {
+- a_val = vld1q_f32(aPtr);
+- c_val = vrsqrteq_f32(a_val);
+- vst1q_f32(cPtr, c_val);
+- aPtr += 4;
+- cPtr += 4;
+- }
+-
+- for(number=quarter_points * 4;number < num_points; number++)
+- *cPtr++ = Q_rsqrt(*aPtr++);
++ unsigned int number;
++ const unsigned int quarter_points = num_points / 4;
++
++ float* cPtr = cVector;
++ const float* aPtr = aVector;
++ float32x4_t a_val, c_val;
++ for (number = 0; number < quarter_points; ++number) {
++ a_val = vld1q_f32(aPtr);
++ c_val = vrsqrteq_f32(a_val);
++ vst1q_f32(cPtr, c_val);
++ aPtr += 4;
++ cPtr += 4;
++ }
++
++ for (number = quarter_points * 4; number < num_points; number++)
++ *cPtr++ = Q_rsqrt(*aPtr++);
+ }
+ #endif /* LV_HAVE_NEON */
+
+
+ #ifdef LV_HAVE_GENERIC
+
+-static inline void
+-volk_32f_invsqrt_32f_generic(float* cVector, const float* aVector, unsigned int num_points)
++static inline void volk_32f_invsqrt_32f_generic(float* cVector,
++ const float* aVector,
++ unsigned int num_points)
+ {
+- float* cPtr = cVector;
+- const float* aPtr = aVector;
+- unsigned int number = 0;
+- for(number = 0; number < num_points; number++) {
+- *cPtr++ = Q_rsqrt(*aPtr++);
+- }
++ float* cPtr = cVector;
++ const float* aPtr = aVector;
++ unsigned int number = 0;
++ for (number = 0; number < num_points; number++) {
++ *cPtr++ = Q_rsqrt(*aPtr++);
++ }
+ }
+ #endif /* LV_HAVE_GENERIC */
+
+@@ -196,24 +196,23 @@ volk_32f_invsqrt_32f_generic(float* cVector, const float* aVector, unsigned int
+ static inline void
+ volk_32f_invsqrt_32f_u_avx(float* cVector, const float* aVector, unsigned int num_points)
+ {
+- unsigned int number = 0;
+- const unsigned int eighthPoints = num_points / 8;
+-
+- float* cPtr = cVector;
+- const float* aPtr = aVector;
+- __m256 aVal, cVal;
+- for (; number < eighthPoints; number++) {
+- aVal = _mm256_loadu_ps(aPtr);
+- cVal = _mm256_rsqrt_ps(aVal);
+- _mm256_storeu_ps(cPtr, cVal);
+- aPtr += 8;
+- cPtr += 8;
+- }
+-
+- number = eighthPoints * 8;
+- for(;number < num_points; number++)
+- *cPtr++ = Q_rsqrt(*aPtr++);
+-
++ unsigned int number = 0;
++ const unsigned int eighthPoints = num_points / 8;
++
++ float* cPtr = cVector;
++ const float* aPtr = aVector;
++ __m256 aVal, cVal;
++ for (; number < eighthPoints; number++) {
++ aVal = _mm256_loadu_ps(aPtr);
++ cVal = _mm256_rsqrt_ps(aVal);
++ _mm256_storeu_ps(cPtr, cVal);
++ aPtr += 8;
++ cPtr += 8;
++ }
++
++ number = eighthPoints * 8;
++ for (; number < num_points; number++)
++ *cPtr++ = Q_rsqrt(*aPtr++);
+ }
+ #endif /* LV_HAVE_AVX */
+
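The SIMD paths in this kernel store the hardware reciprocal-square-root estimate directly; _mm_rsqrt_ps / _mm256_rsqrt_ps give roughly 12 bits of precision, and vrsqrteq_f32 is a coarser estimate still. If more accuracy were needed, one Newton-Raphson step on the estimate would refine it; a sketch of that refinement (not part of the kernel above):

#include <xmmintrin.h>

/* One Newton-Raphson step on the hardware estimate:
 * y' = y * (1.5 - 0.5 * x * y * y), which roughly doubles the number of
 * correct bits. Sketch only; the kernel above returns the raw estimate. */
static inline __m128 rsqrt_refined_ps(__m128 x)
{
    const __m128 half = _mm_set1_ps(0.5f);
    const __m128 three_halves = _mm_set1_ps(1.5f);
    __m128 y = _mm_rsqrt_ps(x);
    __m128 t = _mm_mul_ps(_mm_mul_ps(x, half), _mm_mul_ps(y, y));
    return _mm_mul_ps(y, _mm_sub_ps(three_halves, t));
}
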
+diff --git a/kernels/volk/volk_32f_log2_32f.h b/kernels/volk/volk_32f_log2_32f.h
+index 740f89d..47276d4 100644
+--- a/kernels/volk/volk_32f_log2_32f.h
++++ b/kernels/volk/volk_32f_log2_32f.h
+@@ -92,17 +92,18 @@
+ #ifndef INCLUDED_volk_32f_log2_32f_a_H
+ #define INCLUDED_volk_32f_log2_32f_a_H
+
+-#include <stdio.h>
+-#include <stdlib.h>
+ #include <inttypes.h>
+ #include <math.h>
++#include <stdio.h>
++#include <stdlib.h>
+
+ #define LOG_POLY_DEGREE 6
+
+ // +-Inf -> +-127.0f in order to match the behaviour of the SIMD kernels
+-static inline float log2f_non_ieee(float f) {
+- float const result = log2f(f);
+- return isinf(result) ? copysignf(127.0f, result) : result;
++static inline float log2f_non_ieee(float f)
++{
++ float const result = log2f(f);
++ return isinf(result) ? copysignf(127.0f, result) : result;
+ }
+
+ #ifdef LV_HAVE_GENERIC
+@@ -110,12 +111,12 @@ static inline float log2f_non_ieee(float f) {
+ static inline void
+ volk_32f_log2_32f_generic(float* bVector, const float* aVector, unsigned int num_points)
+ {
+- float* bPtr = bVector;
+- const float* aPtr = aVector;
+- unsigned int number = 0;
++ float* bPtr = bVector;
++ const float* aPtr = aVector;
++ unsigned int number = 0;
+
+- for(number = 0; number < num_points; number++)
+- *bPtr++ = log2f_non_ieee(*aPtr++);
++ for (number = 0; number < num_points; number++)
++ *bPtr++ = log2f_non_ieee(*aPtr++);
+ }
+ #endif /* LV_HAVE_GENERIC */
+
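The SIMD kernels that follow all implement the same decomposition: with x = 2^e * m and m in [1, 2), log2(x) = e + log2(m); e is read straight out of the exponent bits (minus the bias 127), m is rebuilt by OR-ing the stored mantissa with an exponent of 1.0, and log2(m) is approximated by p(m) * (m - 1) with a minimax polynomial. A scalar sketch of exactly that flow, reusing the LOG_POLY_DEGREE == 6 coefficients from the code below (illustration only; compile with -lm):

#include <math.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

static float log2_approx(float x)
{
    uint32_t bits;
    memcpy(&bits, &x, sizeof(bits));

    /* Unbiased exponent: bits 30..23 minus 127 (the _mm_srli / _mm_sub step). */
    const float e = (float)((int32_t)((bits >> 23) & 0xFFu) - 127);

    /* Mantissa as a float in [1, 2): keep the stored fraction, force an
     * exponent of 1.0 (the OR with leadingOne in the kernels). */
    uint32_t mant_bits = (bits & 0x007fffffu) | 0x3f800000u;
    float m;
    memcpy(&m, &mant_bits, sizeof(m));

    /* Horner evaluation of the LOG_POLY_DEGREE == 6 minimax fit used below. */
    float p = -3.4436006e-2f;
    p = p * m + 3.1821337e-1f;
    p = p * m - 1.2315303f;
    p = p * m + 2.5988452f;
    p = p * m - 3.3241990f;
    p = p * m + 3.1157899f;

    return e + p * (m - 1.0f);
}

int main(void)
{
    for (float x = 0.5f; x < 10.0f; x *= 1.7f)
        printf("x=%8.4f  approx=%9.6f  log2f=%9.6f\n", x, log2_approx(x), log2f(x));
    return 0;
}
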
+@@ -123,56 +124,86 @@ volk_32f_log2_32f_generic(float* bVector, const float* aVector, unsigned int num
+ #include <immintrin.h>
+
+ #define POLY0_FMAAVX2(x, c0) _mm256_set1_ps(c0)
+-#define POLY1_FMAAVX2(x, c0, c1) _mm256_fmadd_ps(POLY0_FMAAVX2(x, c1), x, _mm256_set1_ps(c0))
+-#define POLY2_FMAAVX2(x, c0, c1, c2) _mm256_fmadd_ps(POLY1_FMAAVX2(x, c1, c2), x, _mm256_set1_ps(c0))
+-#define POLY3_FMAAVX2(x, c0, c1, c2, c3) _mm256_fmadd_ps(POLY2_FMAAVX2(x, c1, c2, c3), x, _mm256_set1_ps(c0))
+-#define POLY4_FMAAVX2(x, c0, c1, c2, c3, c4) _mm256_fmadd_ps(POLY3_FMAAVX2(x, c1, c2, c3, c4), x, _mm256_set1_ps(c0))
+-#define POLY5_FMAAVX2(x, c0, c1, c2, c3, c4, c5) _mm256_fmadd_ps(POLY4_FMAAVX2(x, c1, c2, c3, c4, c5), x, _mm256_set1_ps(c0))
+-
+-static inline void
+-volk_32f_log2_32f_a_avx2_fma(float* bVector, const float* aVector, unsigned int num_points)
++#define POLY1_FMAAVX2(x, c0, c1) \
++ _mm256_fmadd_ps(POLY0_FMAAVX2(x, c1), x, _mm256_set1_ps(c0))
++#define POLY2_FMAAVX2(x, c0, c1, c2) \
++ _mm256_fmadd_ps(POLY1_FMAAVX2(x, c1, c2), x, _mm256_set1_ps(c0))
++#define POLY3_FMAAVX2(x, c0, c1, c2, c3) \
++ _mm256_fmadd_ps(POLY2_FMAAVX2(x, c1, c2, c3), x, _mm256_set1_ps(c0))
++#define POLY4_FMAAVX2(x, c0, c1, c2, c3, c4) \
++ _mm256_fmadd_ps(POLY3_FMAAVX2(x, c1, c2, c3, c4), x, _mm256_set1_ps(c0))
++#define POLY5_FMAAVX2(x, c0, c1, c2, c3, c4, c5) \
++ _mm256_fmadd_ps(POLY4_FMAAVX2(x, c1, c2, c3, c4, c5), x, _mm256_set1_ps(c0))
++
++static inline void volk_32f_log2_32f_a_avx2_fma(float* bVector,
++ const float* aVector,
++ unsigned int num_points)
+ {
+- float* bPtr = bVector;
+- const float* aPtr = aVector;
++ float* bPtr = bVector;
++ const float* aPtr = aVector;
+
+- unsigned int number = 0;
+- const unsigned int eighthPoints = num_points / 8;
++ unsigned int number = 0;
++ const unsigned int eighthPoints = num_points / 8;
+
+- __m256 aVal, bVal, mantissa, frac, leadingOne;
+- __m256i bias, exp;
++ __m256 aVal, bVal, mantissa, frac, leadingOne;
++ __m256i bias, exp;
+
+- for(;number < eighthPoints; number++){
++ for (; number < eighthPoints; number++) {
+
+- aVal = _mm256_load_ps(aPtr);
+- bias = _mm256_set1_epi32(127);
+- leadingOne = _mm256_set1_ps(1.0f);
+- exp = _mm256_sub_epi32(_mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal), _mm256_set1_epi32(0x7f800000)), 23), bias);
+- bVal = _mm256_cvtepi32_ps(exp);
++ aVal = _mm256_load_ps(aPtr);
++ bias = _mm256_set1_epi32(127);
++ leadingOne = _mm256_set1_ps(1.0f);
++ exp = _mm256_sub_epi32(
++ _mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal),
++ _mm256_set1_epi32(0x7f800000)),
++ 23),
++ bias);
++ bVal = _mm256_cvtepi32_ps(exp);
+
+- // Now to extract mantissa
+- frac = _mm256_or_ps(leadingOne, _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));
++ // Now to extract mantissa
++ frac = _mm256_or_ps(
++ leadingOne,
++ _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));
+
+ #if LOG_POLY_DEGREE == 6
+- mantissa = POLY5_FMAAVX2( frac, 3.1157899f, -3.3241990f, 2.5988452f, -1.2315303f, 3.1821337e-1f, -3.4436006e-2f);
++ mantissa = POLY5_FMAAVX2(frac,
++ 3.1157899f,
++ -3.3241990f,
++ 2.5988452f,
++ -1.2315303f,
++ 3.1821337e-1f,
++ -3.4436006e-2f);
+ #elif LOG_POLY_DEGREE == 5
+- mantissa = POLY4_FMAAVX2( frac, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f);
++ mantissa = POLY4_FMAAVX2(frac,
++ 2.8882704548164776201f,
++ -2.52074962577807006663f,
++ 1.48116647521213171641f,
++ -0.465725644288844778798f,
++ 0.0596515482674574969533f);
+ #elif LOG_POLY_DEGREE == 4
+- mantissa = POLY3_FMAAVX2( frac, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f);
++ mantissa = POLY3_FMAAVX2(frac,
++ 2.61761038894603480148f,
++ -1.75647175389045657003f,
++ 0.688243882994381274313f,
++ -0.107254423828329604454f);
+ #elif LOG_POLY_DEGREE == 3
+- mantissa = POLY2_FMAAVX2( frac, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f);
++ mantissa = POLY2_FMAAVX2(frac,
++ 2.28330284476918490682f,
++ -1.04913055217340124191f,
++ 0.204446009836232697516f);
+ #else
+ #error
+ #endif
+
+- bVal = _mm256_fmadd_ps(mantissa, _mm256_sub_ps(frac, leadingOne), bVal);
+- _mm256_store_ps(bPtr, bVal);
++ bVal = _mm256_fmadd_ps(mantissa, _mm256_sub_ps(frac, leadingOne), bVal);
++ _mm256_store_ps(bPtr, bVal);
+
+- aPtr += 8;
+- bPtr += 8;
+- }
++ aPtr += 8;
++ bPtr += 8;
++ }
+
+- number = eighthPoints * 8;
+- volk_32f_log2_32f_generic(bPtr, aPtr, num_points-number);
++ number = eighthPoints * 8;
++ volk_32f_log2_32f_generic(bPtr, aPtr, num_points - number);
+ }
+
+ #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for aligned */
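
The POLYn_FMAAVX2 macros are Horner's rule built from fused multiply-adds: POLY5 expands to ((((c5*x + c4)*x + c3)*x + c2)*x + c1)*x + c0, with each step a single _mm256_fmadd_ps and therefore a single rounding. The scalar equivalent, written with C99 fmaf (sketch only, link with -lm):

#include <math.h>

/* Same nesting as POLY5_FMAAVX2, one fused multiply-add per step. */
static float poly5_fma(float x,
                       float c0, float c1, float c2, float c3, float c4, float c5)
{
    float p = c5;
    p = fmaf(p, x, c4);
    p = fmaf(p, x, c3);
    p = fmaf(p, x, c2);
    p = fmaf(p, x, c1);
    return fmaf(p, x, c0);
}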
+@@ -181,56 +212,86 @@ volk_32f_log2_32f_a_avx2_fma(float* bVector, const float* aVector, unsigned int
+ #include <immintrin.h>
+
+ #define POLY0_AVX2(x, c0) _mm256_set1_ps(c0)
+-#define POLY1_AVX2(x, c0, c1) _mm256_add_ps(_mm256_mul_ps(POLY0_AVX2(x, c1), x), _mm256_set1_ps(c0))
+-#define POLY2_AVX2(x, c0, c1, c2) _mm256_add_ps(_mm256_mul_ps(POLY1_AVX2(x, c1, c2), x), _mm256_set1_ps(c0))
+-#define POLY3_AVX2(x, c0, c1, c2, c3) _mm256_add_ps(_mm256_mul_ps(POLY2_AVX2(x, c1, c2, c3), x), _mm256_set1_ps(c0))
+-#define POLY4_AVX2(x, c0, c1, c2, c3, c4) _mm256_add_ps(_mm256_mul_ps(POLY3_AVX2(x, c1, c2, c3, c4), x), _mm256_set1_ps(c0))
+-#define POLY5_AVX2(x, c0, c1, c2, c3, c4, c5) _mm256_add_ps(_mm256_mul_ps(POLY4_AVX2(x, c1, c2, c3, c4, c5), x), _mm256_set1_ps(c0))
++#define POLY1_AVX2(x, c0, c1) \
++ _mm256_add_ps(_mm256_mul_ps(POLY0_AVX2(x, c1), x), _mm256_set1_ps(c0))
++#define POLY2_AVX2(x, c0, c1, c2) \
++ _mm256_add_ps(_mm256_mul_ps(POLY1_AVX2(x, c1, c2), x), _mm256_set1_ps(c0))
++#define POLY3_AVX2(x, c0, c1, c2, c3) \
++ _mm256_add_ps(_mm256_mul_ps(POLY2_AVX2(x, c1, c2, c3), x), _mm256_set1_ps(c0))
++#define POLY4_AVX2(x, c0, c1, c2, c3, c4) \
++ _mm256_add_ps(_mm256_mul_ps(POLY3_AVX2(x, c1, c2, c3, c4), x), _mm256_set1_ps(c0))
++#define POLY5_AVX2(x, c0, c1, c2, c3, c4, c5) \
++ _mm256_add_ps(_mm256_mul_ps(POLY4_AVX2(x, c1, c2, c3, c4, c5), x), _mm256_set1_ps(c0))
+
+ static inline void
+ volk_32f_log2_32f_a_avx2(float* bVector, const float* aVector, unsigned int num_points)
+ {
+- float* bPtr = bVector;
+- const float* aPtr = aVector;
++ float* bPtr = bVector;
++ const float* aPtr = aVector;
+
+- unsigned int number = 0;
+- const unsigned int eighthPoints = num_points / 8;
++ unsigned int number = 0;
++ const unsigned int eighthPoints = num_points / 8;
+
+- __m256 aVal, bVal, mantissa, frac, leadingOne;
+- __m256i bias, exp;
++ __m256 aVal, bVal, mantissa, frac, leadingOne;
++ __m256i bias, exp;
+
+- for(;number < eighthPoints; number++){
++ for (; number < eighthPoints; number++) {
+
+- aVal = _mm256_load_ps(aPtr);
+- bias = _mm256_set1_epi32(127);
+- leadingOne = _mm256_set1_ps(1.0f);
+- exp = _mm256_sub_epi32(_mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal), _mm256_set1_epi32(0x7f800000)), 23), bias);
+- bVal = _mm256_cvtepi32_ps(exp);
++ aVal = _mm256_load_ps(aPtr);
++ bias = _mm256_set1_epi32(127);
++ leadingOne = _mm256_set1_ps(1.0f);
++ exp = _mm256_sub_epi32(
++ _mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal),
++ _mm256_set1_epi32(0x7f800000)),
++ 23),
++ bias);
++ bVal = _mm256_cvtepi32_ps(exp);
+
+- // Now to extract mantissa
+- frac = _mm256_or_ps(leadingOne, _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));
++ // Now to extract mantissa
++ frac = _mm256_or_ps(
++ leadingOne,
++ _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));
+
+ #if LOG_POLY_DEGREE == 6
+- mantissa = POLY5_AVX2( frac, 3.1157899f, -3.3241990f, 2.5988452f, -1.2315303f, 3.1821337e-1f, -3.4436006e-2f);
++ mantissa = POLY5_AVX2(frac,
++ 3.1157899f,
++ -3.3241990f,
++ 2.5988452f,
++ -1.2315303f,
++ 3.1821337e-1f,
++ -3.4436006e-2f);
+ #elif LOG_POLY_DEGREE == 5
+- mantissa = POLY4_AVX2( frac, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f);
++ mantissa = POLY4_AVX2(frac,
++ 2.8882704548164776201f,
++ -2.52074962577807006663f,
++ 1.48116647521213171641f,
++ -0.465725644288844778798f,
++ 0.0596515482674574969533f);
+ #elif LOG_POLY_DEGREE == 4
+- mantissa = POLY3_AVX2( frac, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f);
++ mantissa = POLY3_AVX2(frac,
++ 2.61761038894603480148f,
++ -1.75647175389045657003f,
++ 0.688243882994381274313f,
++ -0.107254423828329604454f);
+ #elif LOG_POLY_DEGREE == 3
+- mantissa = POLY2_AVX2( frac, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f);
++ mantissa = POLY2_AVX2(frac,
++ 2.28330284476918490682f,
++ -1.04913055217340124191f,
++ 0.204446009836232697516f);
+ #else
+ #error
+ #endif
+
+- bVal = _mm256_add_ps(_mm256_mul_ps(mantissa, _mm256_sub_ps(frac, leadingOne)), bVal);
+- _mm256_store_ps(bPtr, bVal);
++ bVal =
++ _mm256_add_ps(_mm256_mul_ps(mantissa, _mm256_sub_ps(frac, leadingOne)), bVal);
++ _mm256_store_ps(bPtr, bVal);
+
+- aPtr += 8;
+- bPtr += 8;
+- }
++ aPtr += 8;
++ bPtr += 8;
++ }
+
+- number = eighthPoints * 8;
+- volk_32f_log2_32f_generic(bPtr, aPtr, num_points-number);
++ number = eighthPoints * 8;
++ volk_32f_log2_32f_generic(bPtr, aPtr, num_points - number);
+ }
+
+ #endif /* LV_HAVE_AVX2 for aligned */
+@@ -241,54 +302,79 @@ volk_32f_log2_32f_a_avx2(float* bVector, const float* aVector, unsigned int num_
+ #define POLY0(x, c0) _mm_set1_ps(c0)
+ #define POLY1(x, c0, c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), _mm_set1_ps(c0))
+ #define POLY2(x, c0, c1, c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), _mm_set1_ps(c0))
+-#define POLY3(x, c0, c1, c2, c3) _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0))
+-#define POLY4(x, c0, c1, c2, c3, c4) _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0))
+-#define POLY5(x, c0, c1, c2, c3, c4, c5) _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0))
++#define POLY3(x, c0, c1, c2, c3) \
++ _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0))
++#define POLY4(x, c0, c1, c2, c3, c4) \
++ _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0))
++#define POLY5(x, c0, c1, c2, c3, c4, c5) \
++ _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0))
+
+ static inline void
+ volk_32f_log2_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
+ {
+- float* bPtr = bVector;
+- const float* aPtr = aVector;
++ float* bPtr = bVector;
++ const float* aPtr = aVector;
+
+- unsigned int number = 0;
+- const unsigned int quarterPoints = num_points / 4;
++ unsigned int number = 0;
++ const unsigned int quarterPoints = num_points / 4;
+
+- __m128 aVal, bVal, mantissa, frac, leadingOne;
+- __m128i bias, exp;
++ __m128 aVal, bVal, mantissa, frac, leadingOne;
++ __m128i bias, exp;
+
+- for(;number < quarterPoints; number++){
++ for (; number < quarterPoints; number++) {
+
+- aVal = _mm_load_ps(aPtr);
+- bias = _mm_set1_epi32(127);
+- leadingOne = _mm_set1_ps(1.0f);
+- exp = _mm_sub_epi32(_mm_srli_epi32(_mm_and_si128(_mm_castps_si128(aVal), _mm_set1_epi32(0x7f800000)), 23), bias);
+- bVal = _mm_cvtepi32_ps(exp);
++ aVal = _mm_load_ps(aPtr);
++ bias = _mm_set1_epi32(127);
++ leadingOne = _mm_set1_ps(1.0f);
++ exp = _mm_sub_epi32(
++ _mm_srli_epi32(
++ _mm_and_si128(_mm_castps_si128(aVal), _mm_set1_epi32(0x7f800000)), 23),
++ bias);
++ bVal = _mm_cvtepi32_ps(exp);
+
+- // Now to extract mantissa
+- frac = _mm_or_ps(leadingOne, _mm_and_ps(aVal, _mm_castsi128_ps(_mm_set1_epi32(0x7fffff))));
++ // Now to extract mantissa
++ frac = _mm_or_ps(leadingOne,
++ _mm_and_ps(aVal, _mm_castsi128_ps(_mm_set1_epi32(0x7fffff))));
+
+ #if LOG_POLY_DEGREE == 6
+- mantissa = POLY5( frac, 3.1157899f, -3.3241990f, 2.5988452f, -1.2315303f, 3.1821337e-1f, -3.4436006e-2f);
++ mantissa = POLY5(frac,
++ 3.1157899f,
++ -3.3241990f,
++ 2.5988452f,
++ -1.2315303f,
++ 3.1821337e-1f,
++ -3.4436006e-2f);
+ #elif LOG_POLY_DEGREE == 5
+- mantissa = POLY4( frac, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f);
++ mantissa = POLY4(frac,
++ 2.8882704548164776201f,
++ -2.52074962577807006663f,
++ 1.48116647521213171641f,
++ -0.465725644288844778798f,
++ 0.0596515482674574969533f);
+ #elif LOG_POLY_DEGREE == 4
+- mantissa = POLY3( frac, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f);
++ mantissa = POLY3(frac,
++ 2.61761038894603480148f,
++ -1.75647175389045657003f,
++ 0.688243882994381274313f,
++ -0.107254423828329604454f);
+ #elif LOG_POLY_DEGREE == 3
+- mantissa = POLY2( frac, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f);
++ mantissa = POLY2(frac,
++ 2.28330284476918490682f,
++ -1.04913055217340124191f,
++ 0.204446009836232697516f);
+ #else
+ #error
+ #endif
+
+- bVal = _mm_add_ps(bVal, _mm_mul_ps(mantissa, _mm_sub_ps(frac, leadingOne)));
+- _mm_store_ps(bPtr, bVal);
++ bVal = _mm_add_ps(bVal, _mm_mul_ps(mantissa, _mm_sub_ps(frac, leadingOne)));
++ _mm_store_ps(bPtr, bVal);
+
+- aPtr += 4;
+- bPtr += 4;
+- }
++ aPtr += 4;
++ bPtr += 4;
++ }
+
+- number = quarterPoints * 4;
+- volk_32f_log2_32f_generic(bPtr, aPtr, num_points-number);
++ number = quarterPoints * 4;
++ volk_32f_log2_32f_generic(bPtr, aPtr, num_points - number);
+ }
+
+ #endif /* LV_HAVE_SSE4_1 for aligned */
+@@ -297,91 +383,91 @@ volk_32f_log2_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int nu
+ #include <arm_neon.h>
+
+ /* these macros allow us to embed logs in other kernels */
+-#define VLOG2Q_NEON_PREAMBLE() \
+- int32x4_t one = vdupq_n_s32(0x000800000); \
+- /* minimax polynomial */ \
+- float32x4_t p0 = vdupq_n_f32(-3.0400402727048585); \
+- float32x4_t p1 = vdupq_n_f32(6.1129631282966113); \
+- float32x4_t p2 = vdupq_n_f32(-5.3419892024633207); \
+- float32x4_t p3 = vdupq_n_f32(3.2865287703753912); \
+- float32x4_t p4 = vdupq_n_f32(-1.2669182593441635); \
+- float32x4_t p5 = vdupq_n_f32(0.2751487703421256); \
+- float32x4_t p6 = vdupq_n_f32(-0.0256910888150985); \
+- int32x4_t exp_mask = vdupq_n_s32(0x7f800000); \
+- int32x4_t sig_mask = vdupq_n_s32(0x007fffff); \
+- int32x4_t exp_bias = vdupq_n_s32(127);
+-
+-
+-#define VLOG2Q_NEON_F32(log2_approx, aval) \
+- int32x4_t exponent_i = vandq_s32(aval, exp_mask); \
+- int32x4_t significand_i = vandq_s32(aval, sig_mask); \
+- exponent_i = vshrq_n_s32(exponent_i, 23); \
+- \
+- /* extract the exponent and significand \
+- we can treat this as fixed point to save ~9% on the \
+- conversion + float add */ \
+- significand_i = vorrq_s32(one, significand_i); \
+- float32x4_t significand_f = vcvtq_n_f32_s32(significand_i,23); \
+- /* debias the exponent and convert to float */ \
+- exponent_i = vsubq_s32(exponent_i, exp_bias); \
+- float32x4_t exponent_f = vcvtq_f32_s32(exponent_i); \
+- \
+- /* put the significand through a polynomial fit of log2(x) [1,2] \
+- add the result to the exponent */ \
+- log2_approx = vaddq_f32(exponent_f, p0); /* p0 */ \
+- float32x4_t tmp1 = vmulq_f32(significand_f, p1); /* p1 * x */ \
+- log2_approx = vaddq_f32(log2_approx, tmp1); \
+- float32x4_t sig_2 = vmulq_f32(significand_f, significand_f); /* x^2 */ \
+- tmp1 = vmulq_f32(sig_2, p2); /* p2 * x^2 */ \
+- log2_approx = vaddq_f32(log2_approx, tmp1); \
+- \
+- float32x4_t sig_3 = vmulq_f32(sig_2, significand_f); /* x^3 */ \
+- tmp1 = vmulq_f32(sig_3, p3); /* p3 * x^3 */ \
+- log2_approx = vaddq_f32(log2_approx, tmp1); \
+- float32x4_t sig_4 = vmulq_f32(sig_2, sig_2); /* x^4 */ \
+- tmp1 = vmulq_f32(sig_4, p4); /* p4 * x^4 */ \
+- log2_approx = vaddq_f32(log2_approx, tmp1); \
+- float32x4_t sig_5 = vmulq_f32(sig_3, sig_2); /* x^5 */ \
+- tmp1 = vmulq_f32(sig_5, p5); /* p5 * x^5 */ \
+- log2_approx = vaddq_f32(log2_approx, tmp1); \
+- float32x4_t sig_6 = vmulq_f32(sig_3, sig_3); /* x^6 */ \
+- tmp1 = vmulq_f32(sig_6, p6); /* p6 * x^6 */ \
+- log2_approx = vaddq_f32(log2_approx, tmp1);
++#define VLOG2Q_NEON_PREAMBLE() \
++ int32x4_t one = vdupq_n_s32(0x000800000); \
++ /* minimax polynomial */ \
++ float32x4_t p0 = vdupq_n_f32(-3.0400402727048585); \
++ float32x4_t p1 = vdupq_n_f32(6.1129631282966113); \
++ float32x4_t p2 = vdupq_n_f32(-5.3419892024633207); \
++ float32x4_t p3 = vdupq_n_f32(3.2865287703753912); \
++ float32x4_t p4 = vdupq_n_f32(-1.2669182593441635); \
++ float32x4_t p5 = vdupq_n_f32(0.2751487703421256); \
++ float32x4_t p6 = vdupq_n_f32(-0.0256910888150985); \
++ int32x4_t exp_mask = vdupq_n_s32(0x7f800000); \
++ int32x4_t sig_mask = vdupq_n_s32(0x007fffff); \
++ int32x4_t exp_bias = vdupq_n_s32(127);
++
++
++#define VLOG2Q_NEON_F32(log2_approx, aval) \
++ int32x4_t exponent_i = vandq_s32(aval, exp_mask); \
++ int32x4_t significand_i = vandq_s32(aval, sig_mask); \
++ exponent_i = vshrq_n_s32(exponent_i, 23); \
++ \
++ /* extract the exponent and significand \
++ we can treat this as fixed point to save ~9% on the \
++ conversion + float add */ \
++ significand_i = vorrq_s32(one, significand_i); \
++ float32x4_t significand_f = vcvtq_n_f32_s32(significand_i, 23); \
++ /* debias the exponent and convert to float */ \
++ exponent_i = vsubq_s32(exponent_i, exp_bias); \
++ float32x4_t exponent_f = vcvtq_f32_s32(exponent_i); \
++ \
++ /* put the significand through a polynomial fit of log2(x) [1,2] \
++ add the result to the exponent */ \
++ log2_approx = vaddq_f32(exponent_f, p0); /* p0 */ \
++ float32x4_t tmp1 = vmulq_f32(significand_f, p1); /* p1 * x */ \
++ log2_approx = vaddq_f32(log2_approx, tmp1); \
++ float32x4_t sig_2 = vmulq_f32(significand_f, significand_f); /* x^2 */ \
++ tmp1 = vmulq_f32(sig_2, p2); /* p2 * x^2 */ \
++ log2_approx = vaddq_f32(log2_approx, tmp1); \
++ \
++ float32x4_t sig_3 = vmulq_f32(sig_2, significand_f); /* x^3 */ \
++ tmp1 = vmulq_f32(sig_3, p3); /* p3 * x^3 */ \
++ log2_approx = vaddq_f32(log2_approx, tmp1); \
++ float32x4_t sig_4 = vmulq_f32(sig_2, sig_2); /* x^4 */ \
++ tmp1 = vmulq_f32(sig_4, p4); /* p4 * x^4 */ \
++ log2_approx = vaddq_f32(log2_approx, tmp1); \
++ float32x4_t sig_5 = vmulq_f32(sig_3, sig_2); /* x^5 */ \
++ tmp1 = vmulq_f32(sig_5, p5); /* p5 * x^5 */ \
++ log2_approx = vaddq_f32(log2_approx, tmp1); \
++ float32x4_t sig_6 = vmulq_f32(sig_3, sig_3); /* x^6 */ \
++ tmp1 = vmulq_f32(sig_6, p6); /* p6 * x^6 */ \
++ log2_approx = vaddq_f32(log2_approx, tmp1);
+
+ static inline void
+ volk_32f_log2_32f_neon(float* bVector, const float* aVector, unsigned int num_points)
+ {
+- float* bPtr = bVector;
+- const float* aPtr = aVector;
+- unsigned int number;
+- const unsigned int quarterPoints = num_points / 4;
++ float* bPtr = bVector;
++ const float* aPtr = aVector;
++ unsigned int number;
++ const unsigned int quarterPoints = num_points / 4;
+
+- int32x4_t aval;
+- float32x4_t log2_approx;
++ int32x4_t aval;
++ float32x4_t log2_approx;
+
+- VLOG2Q_NEON_PREAMBLE()
+- // lms
+- //p0 = vdupq_n_f32(-1.649132280361871);
+- //p1 = vdupq_n_f32(1.995047138579499);
+- //p2 = vdupq_n_f32(-0.336914839219728);
++ VLOG2Q_NEON_PREAMBLE()
++ // lms
++ // p0 = vdupq_n_f32(-1.649132280361871);
++ // p1 = vdupq_n_f32(1.995047138579499);
++ // p2 = vdupq_n_f32(-0.336914839219728);
+
+- // keep in mind a single precision float is represented as
+- // (-1)^sign * 2^exp * 1.significand, so the log2 is
+- // log2(2^exp * sig) = exponent + log2(1 + significand/(1<<23)
+- for(number = 0; number < quarterPoints; ++number){
+- // load float in to an int register without conversion
+- aval = vld1q_s32((int*)aPtr);
++ // keep in mind a single precision float is represented as
++ // (-1)^sign * 2^exp * 1.significand, so the log2 is
++    // log2(2^exp * sig) = exponent + log2(1 + significand/(1<<23))
++ for (number = 0; number < quarterPoints; ++number) {
++ // load float in to an int register without conversion
++ aval = vld1q_s32((int*)aPtr);
+
+- VLOG2Q_NEON_F32(log2_approx, aval)
++ VLOG2Q_NEON_F32(log2_approx, aval)
+
+- vst1q_f32(bPtr, log2_approx);
++ vst1q_f32(bPtr, log2_approx);
+
+- aPtr += 4;
+- bPtr += 4;
+- }
++ aPtr += 4;
++ bPtr += 4;
++ }
+
+- number = quarterPoints * 4;
+- volk_32f_log2_32f_generic(bPtr, aPtr, num_points-number);
++ number = quarterPoints * 4;
++ volk_32f_log2_32f_generic(bPtr, aPtr, num_points - number);
+ }
+
+ #endif /* LV_HAVE_NEON */
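
The VLOG2Q_NEON_F32 macro avoids a separate floating-point add when rebuilding the significand: it ORs in the hidden leading-one bit (bit 23) and then lets vcvtq_n_f32_s32(significand_i, 23) interpret the integer as a Q23 fixed-point value, which directly yields 1.significand in [1, 2); that is the saving the comment inside the macro refers to. A scalar model of that trick:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
    float x = 3.25f;
    uint32_t bits;
    memcpy(&bits, &x, sizeof(bits));

    /* OR in bit 23 (the hidden leading one), then divide by 2^23, i.e. read
     * the 24-bit integer as a Q23 fixed-point number. */
    uint32_t significand_i = (bits & 0x007fffffu) | 0x00800000u;
    float significand_f = (float)significand_i / (float)(1u << 23);

    printf("%f\n", significand_f); /* prints 1.625000, since 3.25 = 1.625 * 2^1 */
    return 0;
}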
+@@ -398,14 +484,14 @@ volk_32f_log2_32f_neon(float* bVector, const float* aVector, unsigned int num_po
+ static inline void
+ volk_32f_log2_32f_u_generic(float* bVector, const float* aVector, unsigned int num_points)
+ {
+- float* bPtr = bVector;
+- const float* aPtr = aVector;
+- unsigned int number = 0;
+-
+- for(number = 0; number < num_points; number++){
+- float const result = log2f(*aPtr++);
+- *bPtr++ = isinf(result) ? -127.0f : result;
+- }
++ float* bPtr = bVector;
++ const float* aPtr = aVector;
++ unsigned int number = 0;
++
++ for (number = 0; number < num_points; number++) {
++ float const result = log2f(*aPtr++);
++ *bPtr++ = isinf(result) ? -127.0f : result;
++ }
+ }
+
+ #endif /* LV_HAVE_GENERIC */
+@@ -417,54 +503,79 @@ volk_32f_log2_32f_u_generic(float* bVector, const float* aVector, unsigned int n
+ #define POLY0(x, c0) _mm_set1_ps(c0)
+ #define POLY1(x, c0, c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), _mm_set1_ps(c0))
+ #define POLY2(x, c0, c1, c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), _mm_set1_ps(c0))
+-#define POLY3(x, c0, c1, c2, c3) _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0))
+-#define POLY4(x, c0, c1, c2, c3, c4) _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0))
+-#define POLY5(x, c0, c1, c2, c3, c4, c5) _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0))
++#define POLY3(x, c0, c1, c2, c3) \
++ _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0))
++#define POLY4(x, c0, c1, c2, c3, c4) \
++ _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0))
++#define POLY5(x, c0, c1, c2, c3, c4, c5) \
++ _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0))
+
+ static inline void
+ volk_32f_log2_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
+ {
+- float* bPtr = bVector;
+- const float* aPtr = aVector;
++ float* bPtr = bVector;
++ const float* aPtr = aVector;
+
+- unsigned int number = 0;
+- const unsigned int quarterPoints = num_points / 4;
++ unsigned int number = 0;
++ const unsigned int quarterPoints = num_points / 4;
+
+- __m128 aVal, bVal, mantissa, frac, leadingOne;
+- __m128i bias, exp;
++ __m128 aVal, bVal, mantissa, frac, leadingOne;
++ __m128i bias, exp;
+
+- for(;number < quarterPoints; number++){
++ for (; number < quarterPoints; number++) {
+
+- aVal = _mm_loadu_ps(aPtr);
+- bias = _mm_set1_epi32(127);
+- leadingOne = _mm_set1_ps(1.0f);
+- exp = _mm_sub_epi32(_mm_srli_epi32(_mm_and_si128(_mm_castps_si128(aVal), _mm_set1_epi32(0x7f800000)), 23), bias);
+- bVal = _mm_cvtepi32_ps(exp);
++ aVal = _mm_loadu_ps(aPtr);
++ bias = _mm_set1_epi32(127);
++ leadingOne = _mm_set1_ps(1.0f);
++ exp = _mm_sub_epi32(
++ _mm_srli_epi32(
++ _mm_and_si128(_mm_castps_si128(aVal), _mm_set1_epi32(0x7f800000)), 23),
++ bias);
++ bVal = _mm_cvtepi32_ps(exp);
+
+- // Now to extract mantissa
+- frac = _mm_or_ps(leadingOne, _mm_and_ps(aVal, _mm_castsi128_ps(_mm_set1_epi32(0x7fffff))));
++ // Now to extract mantissa
++ frac = _mm_or_ps(leadingOne,
++ _mm_and_ps(aVal, _mm_castsi128_ps(_mm_set1_epi32(0x7fffff))));
+
+ #if LOG_POLY_DEGREE == 6
+- mantissa = POLY5( frac, 3.1157899f, -3.3241990f, 2.5988452f, -1.2315303f, 3.1821337e-1f, -3.4436006e-2f);
++ mantissa = POLY5(frac,
++ 3.1157899f,
++ -3.3241990f,
++ 2.5988452f,
++ -1.2315303f,
++ 3.1821337e-1f,
++ -3.4436006e-2f);
+ #elif LOG_POLY_DEGREE == 5
+- mantissa = POLY4( frac, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f);
++ mantissa = POLY4(frac,
++ 2.8882704548164776201f,
++ -2.52074962577807006663f,
++ 1.48116647521213171641f,
++ -0.465725644288844778798f,
++ 0.0596515482674574969533f);
+ #elif LOG_POLY_DEGREE == 4
+- mantissa = POLY3( frac, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f);
++ mantissa = POLY3(frac,
++ 2.61761038894603480148f,
++ -1.75647175389045657003f,
++ 0.688243882994381274313f,
++ -0.107254423828329604454f);
+ #elif LOG_POLY_DEGREE == 3
+- mantissa = POLY2( frac, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f);
++ mantissa = POLY2(frac,
++ 2.28330284476918490682f,
++ -1.04913055217340124191f,
++ 0.204446009836232697516f);
+ #else
+ #error
+ #endif
+
+- bVal = _mm_add_ps(bVal, _mm_mul_ps(mantissa, _mm_sub_ps(frac, leadingOne)));
+- _mm_storeu_ps(bPtr, bVal);
++ bVal = _mm_add_ps(bVal, _mm_mul_ps(mantissa, _mm_sub_ps(frac, leadingOne)));
++ _mm_storeu_ps(bPtr, bVal);
+
+- aPtr += 4;
+- bPtr += 4;
+- }
++ aPtr += 4;
++ bPtr += 4;
++ }
+
+- number = quarterPoints * 4;
+- volk_32f_log2_32f_u_generic(bPtr, aPtr, num_points-number);
++ number = quarterPoints * 4;
++ volk_32f_log2_32f_u_generic(bPtr, aPtr, num_points - number);
+ }
+
+ #endif /* LV_HAVE_SSE4_1 for unaligned */
+@@ -473,56 +584,86 @@ volk_32f_log2_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int nu
+ #include <immintrin.h>
+
+ #define POLY0_FMAAVX2(x, c0) _mm256_set1_ps(c0)
+-#define POLY1_FMAAVX2(x, c0, c1) _mm256_fmadd_ps(POLY0_FMAAVX2(x, c1), x, _mm256_set1_ps(c0))
+-#define POLY2_FMAAVX2(x, c0, c1, c2) _mm256_fmadd_ps(POLY1_FMAAVX2(x, c1, c2), x, _mm256_set1_ps(c0))
+-#define POLY3_FMAAVX2(x, c0, c1, c2, c3) _mm256_fmadd_ps(POLY2_FMAAVX2(x, c1, c2, c3), x, _mm256_set1_ps(c0))
+-#define POLY4_FMAAVX2(x, c0, c1, c2, c3, c4) _mm256_fmadd_ps(POLY3_FMAAVX2(x, c1, c2, c3, c4), x, _mm256_set1_ps(c0))
+-#define POLY5_FMAAVX2(x, c0, c1, c2, c3, c4, c5) _mm256_fmadd_ps(POLY4_FMAAVX2(x, c1, c2, c3, c4, c5), x, _mm256_set1_ps(c0))
+-
+-static inline void
+-volk_32f_log2_32f_u_avx2_fma(float* bVector, const float* aVector, unsigned int num_points)
++#define POLY1_FMAAVX2(x, c0, c1) \
++ _mm256_fmadd_ps(POLY0_FMAAVX2(x, c1), x, _mm256_set1_ps(c0))
++#define POLY2_FMAAVX2(x, c0, c1, c2) \
++ _mm256_fmadd_ps(POLY1_FMAAVX2(x, c1, c2), x, _mm256_set1_ps(c0))
++#define POLY3_FMAAVX2(x, c0, c1, c2, c3) \
++ _mm256_fmadd_ps(POLY2_FMAAVX2(x, c1, c2, c3), x, _mm256_set1_ps(c0))
++#define POLY4_FMAAVX2(x, c0, c1, c2, c3, c4) \
++ _mm256_fmadd_ps(POLY3_FMAAVX2(x, c1, c2, c3, c4), x, _mm256_set1_ps(c0))
++#define POLY5_FMAAVX2(x, c0, c1, c2, c3, c4, c5) \
++ _mm256_fmadd_ps(POLY4_FMAAVX2(x, c1, c2, c3, c4, c5), x, _mm256_set1_ps(c0))
++
++static inline void volk_32f_log2_32f_u_avx2_fma(float* bVector,
++ const float* aVector,
++ unsigned int num_points)
+ {
+- float* bPtr = bVector;
+- const float* aPtr = aVector;
++ float* bPtr = bVector;
++ const float* aPtr = aVector;
+
+- unsigned int number = 0;
+- const unsigned int eighthPoints = num_points / 8;
++ unsigned int number = 0;
++ const unsigned int eighthPoints = num_points / 8;
+
+- __m256 aVal, bVal, mantissa, frac, leadingOne;
+- __m256i bias, exp;
++ __m256 aVal, bVal, mantissa, frac, leadingOne;
++ __m256i bias, exp;
+
+- for(;number < eighthPoints; number++){
++ for (; number < eighthPoints; number++) {
+
+- aVal = _mm256_loadu_ps(aPtr);
+- bias = _mm256_set1_epi32(127);
+- leadingOne = _mm256_set1_ps(1.0f);
+- exp = _mm256_sub_epi32(_mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal), _mm256_set1_epi32(0x7f800000)), 23), bias);
+- bVal = _mm256_cvtepi32_ps(exp);
++ aVal = _mm256_loadu_ps(aPtr);
++ bias = _mm256_set1_epi32(127);
++ leadingOne = _mm256_set1_ps(1.0f);
++ exp = _mm256_sub_epi32(
++ _mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal),
++ _mm256_set1_epi32(0x7f800000)),
++ 23),
++ bias);
++ bVal = _mm256_cvtepi32_ps(exp);
+
+- // Now to extract mantissa
+- frac = _mm256_or_ps(leadingOne, _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));
++ // Now to extract mantissa
++ frac = _mm256_or_ps(
++ leadingOne,
++ _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));
+
+ #if LOG_POLY_DEGREE == 6
+- mantissa = POLY5_FMAAVX2( frac, 3.1157899f, -3.3241990f, 2.5988452f, -1.2315303f, 3.1821337e-1f, -3.4436006e-2f);
++ mantissa = POLY5_FMAAVX2(frac,
++ 3.1157899f,
++ -3.3241990f,
++ 2.5988452f,
++ -1.2315303f,
++ 3.1821337e-1f,
++ -3.4436006e-2f);
+ #elif LOG_POLY_DEGREE == 5
+- mantissa = POLY4_FMAAVX2( frac, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f);
++ mantissa = POLY4_FMAAVX2(frac,
++ 2.8882704548164776201f,
++ -2.52074962577807006663f,
++ 1.48116647521213171641f,
++ -0.465725644288844778798f,
++ 0.0596515482674574969533f);
+ #elif LOG_POLY_DEGREE == 4
+- mantissa = POLY3_FMAAVX2( frac, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f);
++ mantissa = POLY3_FMAAVX2(frac,
++ 2.61761038894603480148f,
++ -1.75647175389045657003f,
++ 0.688243882994381274313f,
++ -0.107254423828329604454f);
+ #elif LOG_POLY_DEGREE == 3
+- mantissa = POLY2_FMAAVX2( frac, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f);
++ mantissa = POLY2_FMAAVX2(frac,
++ 2.28330284476918490682f,
++ -1.04913055217340124191f,
++ 0.204446009836232697516f);
+ #else
+ #error
+ #endif
+
+- bVal = _mm256_fmadd_ps(mantissa, _mm256_sub_ps(frac, leadingOne), bVal);
+- _mm256_storeu_ps(bPtr, bVal);
++ bVal = _mm256_fmadd_ps(mantissa, _mm256_sub_ps(frac, leadingOne), bVal);
++ _mm256_storeu_ps(bPtr, bVal);
+
+- aPtr += 8;
+- bPtr += 8;
+- }
++ aPtr += 8;
++ bPtr += 8;
++ }
+
+- number = eighthPoints * 8;
+- volk_32f_log2_32f_u_generic(bPtr, aPtr, num_points-number);
++ number = eighthPoints * 8;
++ volk_32f_log2_32f_u_generic(bPtr, aPtr, num_points - number);
+ }
+
+ #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for unaligned */
+@@ -531,56 +672,86 @@ volk_32f_log2_32f_u_avx2_fma(float* bVector, const float* aVector, unsigned int
+ #include <immintrin.h>
+
+ #define POLY0_AVX2(x, c0) _mm256_set1_ps(c0)
+-#define POLY1_AVX2(x, c0, c1) _mm256_add_ps(_mm256_mul_ps(POLY0_AVX2(x, c1), x), _mm256_set1_ps(c0))
+-#define POLY2_AVX2(x, c0, c1, c2) _mm256_add_ps(_mm256_mul_ps(POLY1_AVX2(x, c1, c2), x), _mm256_set1_ps(c0))
+-#define POLY3_AVX2(x, c0, c1, c2, c3) _mm256_add_ps(_mm256_mul_ps(POLY2_AVX2(x, c1, c2, c3), x), _mm256_set1_ps(c0))
+-#define POLY4_AVX2(x, c0, c1, c2, c3, c4) _mm256_add_ps(_mm256_mul_ps(POLY3_AVX2(x, c1, c2, c3, c4), x), _mm256_set1_ps(c0))
+-#define POLY5_AVX2(x, c0, c1, c2, c3, c4, c5) _mm256_add_ps(_mm256_mul_ps(POLY4_AVX2(x, c1, c2, c3, c4, c5), x), _mm256_set1_ps(c0))
++#define POLY1_AVX2(x, c0, c1) \
++ _mm256_add_ps(_mm256_mul_ps(POLY0_AVX2(x, c1), x), _mm256_set1_ps(c0))
++#define POLY2_AVX2(x, c0, c1, c2) \
++ _mm256_add_ps(_mm256_mul_ps(POLY1_AVX2(x, c1, c2), x), _mm256_set1_ps(c0))
++#define POLY3_AVX2(x, c0, c1, c2, c3) \
++ _mm256_add_ps(_mm256_mul_ps(POLY2_AVX2(x, c1, c2, c3), x), _mm256_set1_ps(c0))
++#define POLY4_AVX2(x, c0, c1, c2, c3, c4) \
++ _mm256_add_ps(_mm256_mul_ps(POLY3_AVX2(x, c1, c2, c3, c4), x), _mm256_set1_ps(c0))
++#define POLY5_AVX2(x, c0, c1, c2, c3, c4, c5) \
++ _mm256_add_ps(_mm256_mul_ps(POLY4_AVX2(x, c1, c2, c3, c4, c5), x), _mm256_set1_ps(c0))
+
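/*
 * Sketch for orientation (not from the patch): each POLYn_AVX2 macro expands
 * to a Horner-scheme evaluation of the approximation polynomial, with every
 * coefficient broadcast through _mm256_set1_ps. For instance
 * POLY2_AVX2(x, c0, c1, c2) evaluates (c2 * x + c1) * x + c0. Scalar analogue
 * (helper name is illustrative only):
 */
static inline float poly2_horner(float x, float c0, float c1, float c2)
{
    return (c2 * x + c1) * x + c0; /* same nesting the macro produces */
}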
+ static inline void
+ volk_32f_log2_32f_u_avx2(float* bVector, const float* aVector, unsigned int num_points)
+ {
+- float* bPtr = bVector;
+- const float* aPtr = aVector;
++ float* bPtr = bVector;
++ const float* aPtr = aVector;
+
+- unsigned int number = 0;
+- const unsigned int eighthPoints = num_points / 8;
++ unsigned int number = 0;
++ const unsigned int eighthPoints = num_points / 8;
+
+- __m256 aVal, bVal, mantissa, frac, leadingOne;
+- __m256i bias, exp;
++ __m256 aVal, bVal, mantissa, frac, leadingOne;
++ __m256i bias, exp;
+
+- for(;number < eighthPoints; number++){
++ for (; number < eighthPoints; number++) {
+
+- aVal = _mm256_loadu_ps(aPtr);
+- bias = _mm256_set1_epi32(127);
+- leadingOne = _mm256_set1_ps(1.0f);
+- exp = _mm256_sub_epi32(_mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal), _mm256_set1_epi32(0x7f800000)), 23), bias);
+- bVal = _mm256_cvtepi32_ps(exp);
++ aVal = _mm256_loadu_ps(aPtr);
++ bias = _mm256_set1_epi32(127);
++ leadingOne = _mm256_set1_ps(1.0f);
++ exp = _mm256_sub_epi32(
++ _mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal),
++ _mm256_set1_epi32(0x7f800000)),
++ 23),
++ bias);
++ bVal = _mm256_cvtepi32_ps(exp);
+
+- // Now to extract mantissa
+- frac = _mm256_or_ps(leadingOne, _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));
++ // Now to extract mantissa
++ frac = _mm256_or_ps(
++ leadingOne,
++ _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));
+
+ #if LOG_POLY_DEGREE == 6
+- mantissa = POLY5_AVX2( frac, 3.1157899f, -3.3241990f, 2.5988452f, -1.2315303f, 3.1821337e-1f, -3.4436006e-2f);
++ mantissa = POLY5_AVX2(frac,
++ 3.1157899f,
++ -3.3241990f,
++ 2.5988452f,
++ -1.2315303f,
++ 3.1821337e-1f,
++ -3.4436006e-2f);
+ #elif LOG_POLY_DEGREE == 5
+- mantissa = POLY4_AVX2( frac, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f);
++ mantissa = POLY4_AVX2(frac,
++ 2.8882704548164776201f,
++ -2.52074962577807006663f,
++ 1.48116647521213171641f,
++ -0.465725644288844778798f,
++ 0.0596515482674574969533f);
+ #elif LOG_POLY_DEGREE == 4
+- mantissa = POLY3_AVX2( frac, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f);
++ mantissa = POLY3_AVX2(frac,
++ 2.61761038894603480148f,
++ -1.75647175389045657003f,
++ 0.688243882994381274313f,
++ -0.107254423828329604454f);
+ #elif LOG_POLY_DEGREE == 3
+- mantissa = POLY2_AVX2( frac, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f);
++ mantissa = POLY2_AVX2(frac,
++ 2.28330284476918490682f,
++ -1.04913055217340124191f,
++ 0.204446009836232697516f);
+ #else
+ #error
+ #endif
+
+- bVal = _mm256_add_ps(_mm256_mul_ps(mantissa, _mm256_sub_ps(frac, leadingOne)), bVal);
+- _mm256_storeu_ps(bPtr, bVal);
++ bVal =
++ _mm256_add_ps(_mm256_mul_ps(mantissa, _mm256_sub_ps(frac, leadingOne)), bVal);
++ _mm256_storeu_ps(bPtr, bVal);
+
+- aPtr += 8;
+- bPtr += 8;
+- }
++ aPtr += 8;
++ bPtr += 8;
++ }
+
+- number = eighthPoints * 8;
+- volk_32f_log2_32f_u_generic(bPtr, aPtr, num_points-number);
++ number = eighthPoints * 8;
++ volk_32f_log2_32f_u_generic(bPtr, aPtr, num_points - number);
+ }
+
+ #endif /* LV_HAVE_AVX2 for unaligned */
+diff --git a/kernels/volk/volk_32f_null_32f.h b/kernels/volk/volk_32f_null_32f.h
+index 95e8d1a..cbed229 100644
+--- a/kernels/volk/volk_32f_null_32f.h
++++ b/kernels/volk/volk_32f_null_32f.h
+@@ -20,9 +20,9 @@
+ * Boston, MA 02110-1301, USA.
+ */
+
+-#include <stdio.h>
+-#include <math.h>
+ #include <inttypes.h>
++#include <math.h>
++#include <stdio.h>
+
+ #ifndef INCLUDED_volk_32f_null_32f_a_H
+ #define INCLUDED_volk_32f_null_32f_a_H
+@@ -32,13 +32,13 @@
+ static inline void
+ volk_32f_null_32f_generic(float* bVector, const float* aVector, unsigned int num_points)
+ {
+- float* bPtr = bVector;
+- const float* aPtr = aVector;
+- unsigned int number;
++ float* bPtr = bVector;
++ const float* aPtr = aVector;
++ unsigned int number;
+
+- for(number = 0; number < num_points; number++){
+- *bPtr++ = *aPtr++;
+- }
++ for (number = 0; number < num_points; number++) {
++ *bPtr++ = *aPtr++;
++ }
+ }
+ #endif /* LV_HAVE_GENERIC */
+
+diff --git a/kernels/volk/volk_32f_s32f_32f_fm_detect_32f.h b/kernels/volk/volk_32f_s32f_32f_fm_detect_32f.h
+index 9879959..3bf7aea 100644
+--- a/kernels/volk/volk_32f_s32f_32f_fm_detect_32f.h
++++ b/kernels/volk/volk_32f_s32f_32f_fm_detect_32f.h
+@@ -30,14 +30,15 @@
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+- * void volk_32f_s32f_32f_fm_detect_32f(float* outputVector, const float* inputVector, const float bound, float* saveValue, unsigned int num_points)
+- * \endcode
++ * void volk_32f_s32f_32f_fm_detect_32f(float* outputVector, const float* inputVector,
++ *                                      const float bound, float* saveValue,
++ *                                      unsigned int num_points)
++ * \endcode
+ *
+ * \b Inputs
+- * \li inputVector: The input vector containing phase data (must be on the interval (-bound, bound]).
+- * \li bound: The interval that the input phase data is in, which is used to modulo the differentiation.
+- * \li saveValue: A pointer to a float which contains the phase value of the sample before the first input sample.
+- * \li num_points The number of data points.
++ * \li inputVector: The input vector containing phase data (must be on the interval
++ *     (-bound, bound]).
++ * \li bound: The interval that the input phase data is in, which is used to modulo
++ *     the differentiation.
++ * \li saveValue: A pointer to a float which contains the phase value of the sample
++ *     before the first input sample.
++ * \li num_points The number of data points.
+ *
+ * \b Outputs
+ * \li outputVector: The vector where the results will be stored.
+@@ -62,67 +63,79 @@
+ #ifdef LV_HAVE_AVX
+ #include <immintrin.h>
+
+-static inline void volk_32f_s32f_32f_fm_detect_32f_a_avx(float* outputVector, const float* inputVector, const float bound, float* saveValue, unsigned int num_points){
+- if (num_points < 1) {
+- return;
+- }
+- unsigned int number = 1;
+- unsigned int j = 0;
+- // num_points-1 keeps Fedora 7's gcc from crashing...
+- // num_points won't work. :(
+- const unsigned int eighthPoints = (num_points-1) / 8;
+-
+- float* outPtr = outputVector;
+- const float* inPtr = inputVector;
+- __m256 upperBound = _mm256_set1_ps(bound);
+- __m256 lowerBound = _mm256_set1_ps(-bound);
+- __m256 next3old1;
+- __m256 next4;
+- __m256 boundAdjust;
+- __m256 posBoundAdjust = _mm256_set1_ps(-2*bound); // Subtract when we're above.
+- __m256 negBoundAdjust = _mm256_set1_ps(2*bound); // Add when we're below.
+- // Do the first 8 by hand since we're going in from the saveValue:
+- *outPtr = *inPtr - *saveValue;
+- if (*outPtr > bound) *outPtr -= 2*bound;
+- if (*outPtr < -bound) *outPtr += 2*bound;
+- inPtr++;
+- outPtr++;
+- for (j = 1; j < ( (8 < num_points) ? 8 : num_points); j++) {
+- *outPtr = *(inPtr) - *(inPtr-1);
+- if (*outPtr > bound) *outPtr -= 2*bound;
+- if (*outPtr < -bound) *outPtr += 2*bound;
+- inPtr++;
+- outPtr++;
+- }
+-
+- for (; number < eighthPoints; number++) {
+- // Load data
+- next3old1 = _mm256_loadu_ps((float*) (inPtr-1));
+- next4 = _mm256_load_ps(inPtr);
+- inPtr += 8;
+- // Subtract and store:
+- next3old1 = _mm256_sub_ps(next4, next3old1);
+- // Bound:
+- boundAdjust = _mm256_cmp_ps(next3old1, upperBound, _CMP_GT_OS);
+- boundAdjust = _mm256_and_ps(boundAdjust, posBoundAdjust);
+- next4 = _mm256_cmp_ps(next3old1, lowerBound, _CMP_LT_OS);
+- next4 = _mm256_and_ps(next4, negBoundAdjust);
+- boundAdjust = _mm256_or_ps(next4, boundAdjust);
+- // Make sure we're in the bounding interval:
+- next3old1 = _mm256_add_ps(next3old1, boundAdjust);
+- _mm256_store_ps(outPtr,next3old1); // Store the results back into the output
+- outPtr += 8;
+- }
+-
+- for (number = (8 > (eighthPoints*8) ? 8 : (8 * eighthPoints)); number < num_points; number++) {
+- *outPtr = *(inPtr) - *(inPtr-1);
+- if (*outPtr > bound) *outPtr -= 2*bound;
+- if (*outPtr < -bound) *outPtr += 2*bound;
++static inline void volk_32f_s32f_32f_fm_detect_32f_a_avx(float* outputVector,
++ const float* inputVector,
++ const float bound,
++ float* saveValue,
++ unsigned int num_points)
++{
++ if (num_points < 1) {
++ return;
++ }
++ unsigned int number = 1;
++ unsigned int j = 0;
++ // num_points-1 keeps Fedora 7's gcc from crashing...
++ // num_points won't work. :(
++ const unsigned int eighthPoints = (num_points - 1) / 8;
++
++ float* outPtr = outputVector;
++ const float* inPtr = inputVector;
++ __m256 upperBound = _mm256_set1_ps(bound);
++ __m256 lowerBound = _mm256_set1_ps(-bound);
++ __m256 next3old1;
++ __m256 next4;
++ __m256 boundAdjust;
++ __m256 posBoundAdjust = _mm256_set1_ps(-2 * bound); // Subtract when we're above.
++ __m256 negBoundAdjust = _mm256_set1_ps(2 * bound); // Add when we're below.
++ // Do the first 8 by hand since we're going in from the saveValue:
++ *outPtr = *inPtr - *saveValue;
++ if (*outPtr > bound)
++ *outPtr -= 2 * bound;
++ if (*outPtr < -bound)
++ *outPtr += 2 * bound;
+ inPtr++;
+ outPtr++;
+- }
+-
+- *saveValue = inputVector[num_points-1];
++ for (j = 1; j < ((8 < num_points) ? 8 : num_points); j++) {
++ *outPtr = *(inPtr) - *(inPtr - 1);
++ if (*outPtr > bound)
++ *outPtr -= 2 * bound;
++ if (*outPtr < -bound)
++ *outPtr += 2 * bound;
++ inPtr++;
++ outPtr++;
++ }
++
++ for (; number < eighthPoints; number++) {
++ // Load data
++ next3old1 = _mm256_loadu_ps((float*)(inPtr - 1));
++ next4 = _mm256_load_ps(inPtr);
++ inPtr += 8;
++ // Subtract and store:
++ next3old1 = _mm256_sub_ps(next4, next3old1);
++ // Bound:
++ boundAdjust = _mm256_cmp_ps(next3old1, upperBound, _CMP_GT_OS);
++ boundAdjust = _mm256_and_ps(boundAdjust, posBoundAdjust);
++ next4 = _mm256_cmp_ps(next3old1, lowerBound, _CMP_LT_OS);
++ next4 = _mm256_and_ps(next4, negBoundAdjust);
++ boundAdjust = _mm256_or_ps(next4, boundAdjust);
++ // Make sure we're in the bounding interval:
++ next3old1 = _mm256_add_ps(next3old1, boundAdjust);
++ _mm256_store_ps(outPtr, next3old1); // Store the results back into the output
++ outPtr += 8;
++ }
++
++ for (number = (8 > (eighthPoints * 8) ? 8 : (8 * eighthPoints)); number < num_points;
++ number++) {
++ *outPtr = *(inPtr) - *(inPtr - 1);
++ if (*outPtr > bound)
++ *outPtr -= 2 * bound;
++ if (*outPtr < -bound)
++ *outPtr += 2 * bound;
++ inPtr++;
++ outPtr++;
++ }
++
++ *saveValue = inputVector[num_points - 1];
+ }
+ #endif /* LV_HAVE_AVX */
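/*
 * A scalar sketch of the branch-free wrap used by the cmp/and/or sequence in
 * the AVX loop above: each comparison lane is all-ones where its condition
 * holds, so ANDing with +/- 2*bound selects the adjustment and the OR merges
 * the two exclusive cases. One lane, written as scalar code (helper name is
 * illustrative only):
 */
static inline float wrap_to_bound(float diff, float bound)
{
    float adjust = 0.0f;
    if (diff > bound)
        adjust = -2.0f * bound; /* role of posBoundAdjust */
    else if (diff < -bound)
        adjust = 2.0f * bound; /* role of negBoundAdjust */
    return diff + adjust;
}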
+
+@@ -130,102 +143,122 @@ static inline void volk_32f_s32f_32f_fm_detect_32f_a_avx(float* outputVector, co
+ #ifdef LV_HAVE_SSE
+ #include <xmmintrin.h>
+
+-static inline void volk_32f_s32f_32f_fm_detect_32f_a_sse(float* outputVector, const float* inputVector, const float bound, float* saveValue, unsigned int num_points){
+- if (num_points < 1) {
+- return;
+- }
+- unsigned int number = 1;
+- unsigned int j = 0;
+- // num_points-1 keeps Fedora 7's gcc from crashing...
+- // num_points won't work. :(
+- const unsigned int quarterPoints = (num_points-1) / 4;
+-
+- float* outPtr = outputVector;
+- const float* inPtr = inputVector;
+- __m128 upperBound = _mm_set_ps1(bound);
+- __m128 lowerBound = _mm_set_ps1(-bound);
+- __m128 next3old1;
+- __m128 next4;
+- __m128 boundAdjust;
+- __m128 posBoundAdjust = _mm_set_ps1(-2*bound); // Subtract when we're above.
+- __m128 negBoundAdjust = _mm_set_ps1(2*bound); // Add when we're below.
+- // Do the first 4 by hand since we're going in from the saveValue:
+- *outPtr = *inPtr - *saveValue;
+- if (*outPtr > bound) *outPtr -= 2*bound;
+- if (*outPtr < -bound) *outPtr += 2*bound;
+- inPtr++;
+- outPtr++;
+- for (j = 1; j < ( (4 < num_points) ? 4 : num_points); j++) {
+- *outPtr = *(inPtr) - *(inPtr-1);
+- if (*outPtr > bound) *outPtr -= 2*bound;
+- if (*outPtr < -bound) *outPtr += 2*bound;
+- inPtr++;
+- outPtr++;
+- }
+-
+- for (; number < quarterPoints; number++) {
+- // Load data
+- next3old1 = _mm_loadu_ps((float*) (inPtr-1));
+- next4 = _mm_load_ps(inPtr);
+- inPtr += 4;
+- // Subtract and store:
+- next3old1 = _mm_sub_ps(next4, next3old1);
+- // Bound:
+- boundAdjust = _mm_cmpgt_ps(next3old1, upperBound);
+- boundAdjust = _mm_and_ps(boundAdjust, posBoundAdjust);
+- next4 = _mm_cmplt_ps(next3old1, lowerBound);
+- next4 = _mm_and_ps(next4, negBoundAdjust);
+- boundAdjust = _mm_or_ps(next4, boundAdjust);
+- // Make sure we're in the bounding interval:
+- next3old1 = _mm_add_ps(next3old1, boundAdjust);
+- _mm_store_ps(outPtr,next3old1); // Store the results back into the output
+- outPtr += 4;
+- }
+-
+- for (number = (4 > (quarterPoints*4) ? 4 : (4 * quarterPoints)); number < num_points; number++) {
+- *outPtr = *(inPtr) - *(inPtr-1);
+- if (*outPtr > bound) *outPtr -= 2*bound;
+- if (*outPtr < -bound) *outPtr += 2*bound;
++static inline void volk_32f_s32f_32f_fm_detect_32f_a_sse(float* outputVector,
++ const float* inputVector,
++ const float bound,
++ float* saveValue,
++ unsigned int num_points)
++{
++ if (num_points < 1) {
++ return;
++ }
++ unsigned int number = 1;
++ unsigned int j = 0;
++ // num_points-1 keeps Fedora 7's gcc from crashing...
++ // num_points won't work. :(
++ const unsigned int quarterPoints = (num_points - 1) / 4;
++
++ float* outPtr = outputVector;
++ const float* inPtr = inputVector;
++ __m128 upperBound = _mm_set_ps1(bound);
++ __m128 lowerBound = _mm_set_ps1(-bound);
++ __m128 next3old1;
++ __m128 next4;
++ __m128 boundAdjust;
++ __m128 posBoundAdjust = _mm_set_ps1(-2 * bound); // Subtract when we're above.
++ __m128 negBoundAdjust = _mm_set_ps1(2 * bound); // Add when we're below.
++ // Do the first 4 by hand since we're going in from the saveValue:
++ *outPtr = *inPtr - *saveValue;
++ if (*outPtr > bound)
++ *outPtr -= 2 * bound;
++ if (*outPtr < -bound)
++ *outPtr += 2 * bound;
+ inPtr++;
+ outPtr++;
+- }
+-
+- *saveValue = inputVector[num_points-1];
++ for (j = 1; j < ((4 < num_points) ? 4 : num_points); j++) {
++ *outPtr = *(inPtr) - *(inPtr - 1);
++ if (*outPtr > bound)
++ *outPtr -= 2 * bound;
++ if (*outPtr < -bound)
++ *outPtr += 2 * bound;
++ inPtr++;
++ outPtr++;
++ }
++
++ for (; number < quarterPoints; number++) {
++ // Load data
++ next3old1 = _mm_loadu_ps((float*)(inPtr - 1));
++ next4 = _mm_load_ps(inPtr);
++ inPtr += 4;
++ // Subtract and store:
++ next3old1 = _mm_sub_ps(next4, next3old1);
++ // Bound:
++ boundAdjust = _mm_cmpgt_ps(next3old1, upperBound);
++ boundAdjust = _mm_and_ps(boundAdjust, posBoundAdjust);
++ next4 = _mm_cmplt_ps(next3old1, lowerBound);
++ next4 = _mm_and_ps(next4, negBoundAdjust);
++ boundAdjust = _mm_or_ps(next4, boundAdjust);
++ // Make sure we're in the bounding interval:
++ next3old1 = _mm_add_ps(next3old1, boundAdjust);
++ _mm_store_ps(outPtr, next3old1); // Store the results back into the output
++ outPtr += 4;
++ }
++
++ for (number = (4 > (quarterPoints * 4) ? 4 : (4 * quarterPoints));
++ number < num_points;
++ number++) {
++ *outPtr = *(inPtr) - *(inPtr - 1);
++ if (*outPtr > bound)
++ *outPtr -= 2 * bound;
++ if (*outPtr < -bound)
++ *outPtr += 2 * bound;
++ inPtr++;
++ outPtr++;
++ }
++
++ *saveValue = inputVector[num_points - 1];
+ }
+ #endif /* LV_HAVE_SSE */
+
+ #ifdef LV_HAVE_GENERIC
+
+-static inline void volk_32f_s32f_32f_fm_detect_32f_generic(float* outputVector, const float* inputVector, const float bound, float* saveValue, unsigned int num_points){
+- if (num_points < 1) {
+- return;
+- }
+- unsigned int number = 0;
+- float* outPtr = outputVector;
+- const float* inPtr = inputVector;
+-
+- // Do the first 1 by hand since we're going in from the saveValue:
+- *outPtr = *inPtr - *saveValue;
+- if (*outPtr > bound) *outPtr -= 2*bound;
+- if (*outPtr < -bound) *outPtr += 2*bound;
+- inPtr++;
+- outPtr++;
+-
+- for (number = 1; number < num_points; number++) {
+- *outPtr = *(inPtr) - *(inPtr-1);
+- if (*outPtr > bound) *outPtr -= 2*bound;
+- if (*outPtr < -bound) *outPtr += 2*bound;
++static inline void volk_32f_s32f_32f_fm_detect_32f_generic(float* outputVector,
++ const float* inputVector,
++ const float bound,
++ float* saveValue,
++ unsigned int num_points)
++{
++ if (num_points < 1) {
++ return;
++ }
++ unsigned int number = 0;
++ float* outPtr = outputVector;
++ const float* inPtr = inputVector;
++
++ // Do the first 1 by hand since we're going in from the saveValue:
++ *outPtr = *inPtr - *saveValue;
++ if (*outPtr > bound)
++ *outPtr -= 2 * bound;
++ if (*outPtr < -bound)
++ *outPtr += 2 * bound;
+ inPtr++;
+ outPtr++;
+- }
+
+- *saveValue = inputVector[num_points-1];
++ for (number = 1; number < num_points; number++) {
++ *outPtr = *(inPtr) - *(inPtr - 1);
++ if (*outPtr > bound)
++ *outPtr -= 2 * bound;
++ if (*outPtr < -bound)
++ *outPtr += 2 * bound;
++ inPtr++;
++ outPtr++;
++ }
++
++ *saveValue = inputVector[num_points - 1];
+ }
+ #endif /* LV_HAVE_GENERIC */
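/*
 * A minimal calling sketch (illustrative only, not from the patch): saveValue
 * lets the kernel run block by block on a continuous stream -- it holds the
 * phase sample that preceded the current buffer and is updated to the
 * buffer's last sample on return. Wrapper name and pi constant are assumed.
 */
static void fm_detect_stream_example(float* out, const float* phase, unsigned int n)
{
    static float last_phase = 0.0f;        /* phase sample before this block */
    const float bound = 3.14159265358979f; /* phase assumed in (-pi, pi] */

    volk_32f_s32f_32f_fm_detect_32f_generic(out, phase, bound, &last_phase, n);
}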
+
+
+-
+-
+ #endif /* INCLUDED_volk_32f_s32f_32f_fm_detect_32f_a_H */
+
+
+@@ -238,67 +271,79 @@ static inline void volk_32f_s32f_32f_fm_detect_32f_generic(float* outputVector,
+ #ifdef LV_HAVE_AVX
+ #include <immintrin.h>
+
+-static inline void volk_32f_s32f_32f_fm_detect_32f_u_avx(float* outputVector, const float* inputVector, const float bound, float* saveValue, unsigned int num_points){
+- if (num_points < 1) {
+- return;
+- }
+- unsigned int number = 1;
+- unsigned int j = 0;
+- // num_points-1 keeps Fedora 7's gcc from crashing...
+- // num_points won't work. :(
+- const unsigned int eighthPoints = (num_points-1) / 8;
+-
+- float* outPtr = outputVector;
+- const float* inPtr = inputVector;
+- __m256 upperBound = _mm256_set1_ps(bound);
+- __m256 lowerBound = _mm256_set1_ps(-bound);
+- __m256 next3old1;
+- __m256 next4;
+- __m256 boundAdjust;
+- __m256 posBoundAdjust = _mm256_set1_ps(-2*bound); // Subtract when we're above.
+- __m256 negBoundAdjust = _mm256_set1_ps(2*bound); // Add when we're below.
+- // Do the first 8 by hand since we're going in from the saveValue:
+- *outPtr = *inPtr - *saveValue;
+- if (*outPtr > bound) *outPtr -= 2*bound;
+- if (*outPtr < -bound) *outPtr += 2*bound;
+- inPtr++;
+- outPtr++;
+- for (j = 1; j < ( (8 < num_points) ? 8 : num_points); j++) {
+- *outPtr = *(inPtr) - *(inPtr-1);
+- if (*outPtr > bound) *outPtr -= 2*bound;
+- if (*outPtr < -bound) *outPtr += 2*bound;
++static inline void volk_32f_s32f_32f_fm_detect_32f_u_avx(float* outputVector,
++ const float* inputVector,
++ const float bound,
++ float* saveValue,
++ unsigned int num_points)
++{
++ if (num_points < 1) {
++ return;
++ }
++ unsigned int number = 1;
++ unsigned int j = 0;
++ // num_points-1 keeps Fedora 7's gcc from crashing...
++ // num_points won't work. :(
++ const unsigned int eighthPoints = (num_points - 1) / 8;
++
++ float* outPtr = outputVector;
++ const float* inPtr = inputVector;
++ __m256 upperBound = _mm256_set1_ps(bound);
++ __m256 lowerBound = _mm256_set1_ps(-bound);
++ __m256 next3old1;
++ __m256 next4;
++ __m256 boundAdjust;
++ __m256 posBoundAdjust = _mm256_set1_ps(-2 * bound); // Subtract when we're above.
++ __m256 negBoundAdjust = _mm256_set1_ps(2 * bound); // Add when we're below.
++ // Do the first 8 by hand since we're going in from the saveValue:
++ *outPtr = *inPtr - *saveValue;
++ if (*outPtr > bound)
++ *outPtr -= 2 * bound;
++ if (*outPtr < -bound)
++ *outPtr += 2 * bound;
+ inPtr++;
+ outPtr++;
+- }
+-
+- for (; number < eighthPoints; number++) {
+- // Load data
+- next3old1 = _mm256_loadu_ps((float*) (inPtr-1));
+- next4 = _mm256_loadu_ps(inPtr);
+- inPtr += 8;
+- // Subtract and store:
+- next3old1 = _mm256_sub_ps(next4, next3old1);
+- // Bound:
+- boundAdjust = _mm256_cmp_ps(next3old1, upperBound, _CMP_GT_OS);
+- boundAdjust = _mm256_and_ps(boundAdjust, posBoundAdjust);
+- next4 = _mm256_cmp_ps(next3old1, lowerBound, _CMP_LT_OS);
+- next4 = _mm256_and_ps(next4, negBoundAdjust);
+- boundAdjust = _mm256_or_ps(next4, boundAdjust);
+- // Make sure we're in the bounding interval:
+- next3old1 = _mm256_add_ps(next3old1, boundAdjust);
+- _mm256_storeu_ps(outPtr,next3old1); // Store the results back into the output
+- outPtr += 8;
+- }
+-
+- for (number = (8 > (eighthPoints*8) ? 8 : (8 * eighthPoints)); number < num_points; number++) {
+- *outPtr = *(inPtr) - *(inPtr-1);
+- if (*outPtr > bound) *outPtr -= 2*bound;
+- if (*outPtr < -bound) *outPtr += 2*bound;
+- inPtr++;
+- outPtr++;
+- }
+-
+- *saveValue = inputVector[num_points-1];
++ for (j = 1; j < ((8 < num_points) ? 8 : num_points); j++) {
++ *outPtr = *(inPtr) - *(inPtr - 1);
++ if (*outPtr > bound)
++ *outPtr -= 2 * bound;
++ if (*outPtr < -bound)
++ *outPtr += 2 * bound;
++ inPtr++;
++ outPtr++;
++ }
++
++ for (; number < eighthPoints; number++) {
++ // Load data
++ next3old1 = _mm256_loadu_ps((float*)(inPtr - 1));
++ next4 = _mm256_loadu_ps(inPtr);
++ inPtr += 8;
++ // Subtract and store:
++ next3old1 = _mm256_sub_ps(next4, next3old1);
++ // Bound:
++ boundAdjust = _mm256_cmp_ps(next3old1, upperBound, _CMP_GT_OS);
++ boundAdjust = _mm256_and_ps(boundAdjust, posBoundAdjust);
++ next4 = _mm256_cmp_ps(next3old1, lowerBound, _CMP_LT_OS);
++ next4 = _mm256_and_ps(next4, negBoundAdjust);
++ boundAdjust = _mm256_or_ps(next4, boundAdjust);
++ // Make sure we're in the bounding interval:
++ next3old1 = _mm256_add_ps(next3old1, boundAdjust);
++ _mm256_storeu_ps(outPtr, next3old1); // Store the results back into the output
++ outPtr += 8;
++ }
++
++ for (number = (8 > (eighthPoints * 8) ? 8 : (8 * eighthPoints)); number < num_points;
++ number++) {
++ *outPtr = *(inPtr) - *(inPtr - 1);
++ if (*outPtr > bound)
++ *outPtr -= 2 * bound;
++ if (*outPtr < -bound)
++ *outPtr += 2 * bound;
++ inPtr++;
++ outPtr++;
++ }
++
++ *saveValue = inputVector[num_points - 1];
+ }
+ #endif /* LV_HAVE_AVX */
+
+diff --git a/kernels/volk/volk_32f_s32f_calc_spectral_noise_floor_32f.h b/kernels/volk/volk_32f_s32f_calc_spectral_noise_floor_32f.h
+index ae371a2..e7e581f 100644
+--- a/kernels/volk/volk_32f_s32f_calc_spectral_noise_floor_32f.h
++++ b/kernels/volk/volk_32f_s32f_calc_spectral_noise_floor_32f.h
+@@ -35,13 +35,15 @@
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+- * void volk_32f_s32f_calc_spectral_noise_floor_32f(float* noiseFloorAmplitude, const float* realDataPoints, const float spectralExclusionValue, const unsigned int num_points)
+- * \endcode
++ * void volk_32f_s32f_calc_spectral_noise_floor_32f(float* noiseFloorAmplitude,
++ *                                                  const float* realDataPoints,
++ *                                                  const float spectralExclusionValue,
++ *                                                  const unsigned int num_points)
++ * \endcode
+ *
+ * \b Inputs
+ * \li realDataPoints: The input power spectrum.
+- * \li spectralExclusionValue: The number of dB above the noise floor that a data point must be to be excluded from the noise floor calculation - default value is 20.
+- * \li num_points: The number of data points.
++ * \li spectralExclusionValue: The number of dB above the noise floor that a data point
++ *     must be to be excluded from the noise floor calculation - default value is 20.
++ * \li num_points: The number of data points.
+ *
+ * \b Outputs
+ * \li noiseFloorAmplitude: The noise floor of the input spectrum, in dB.
+@@ -59,9 +61,9 @@
+ #ifndef INCLUDED_volk_32f_s32f_calc_spectral_noise_floor_32f_a_H
+ #define INCLUDED_volk_32f_s32f_calc_spectral_noise_floor_32f_a_H
+
+-#include <volk/volk_common.h>
+ #include <inttypes.h>
+ #include <stdio.h>
++#include <volk/volk_common.h>
+
+ #ifdef LV_HAVE_AVX
+ #include <immintrin.h>
+@@ -72,114 +74,117 @@ volk_32f_s32f_calc_spectral_noise_floor_32f_a_avx(float* noiseFloorAmplitude,
+ const float spectralExclusionValue,
+ const unsigned int num_points)
+ {
+- unsigned int number = 0;
+- const unsigned int eighthPoints = num_points / 8;
+-
+- const float* dataPointsPtr = realDataPoints;
+- __VOLK_ATTR_ALIGNED(32) float avgPointsVector[8];
+-
+- __m256 dataPointsVal;
+- __m256 avgPointsVal = _mm256_setzero_ps();
+- // Calculate the sum (for mean) for all points
+- for(; number < eighthPoints; number++){
+-
+- dataPointsVal = _mm256_load_ps(dataPointsPtr);
+-
+- dataPointsPtr += 8;
+-
+- avgPointsVal = _mm256_add_ps(avgPointsVal, dataPointsVal);
+- }
+-
+- _mm256_store_ps(avgPointsVector, avgPointsVal);
+-
+- float sumMean = 0.0;
+- sumMean += avgPointsVector[0];
+- sumMean += avgPointsVector[1];
+- sumMean += avgPointsVector[2];
+- sumMean += avgPointsVector[3];
+- sumMean += avgPointsVector[4];
+- sumMean += avgPointsVector[5];
+- sumMean += avgPointsVector[6];
+- sumMean += avgPointsVector[7];
+-
+- number = eighthPoints * 8;
+- for(;number < num_points; number++){
+- sumMean += realDataPoints[number];
+- }
+-
+- // calculate the spectral mean
+- // +20 because for the comparison below we only want to throw out bins
+- // that are significantly higher (and would, thus, affect the mean more
+- const float meanAmplitude = (sumMean / ((float)num_points)) + spectralExclusionValue;
+-
+- dataPointsPtr = realDataPoints; // Reset the dataPointsPtr
+- __m256 vMeanAmplitudeVector = _mm256_set1_ps(meanAmplitude);
+- __m256 vOnesVector = _mm256_set1_ps(1.0);
+- __m256 vValidBinCount = _mm256_setzero_ps();
+- avgPointsVal = _mm256_setzero_ps();
+- __m256 compareMask;
+- number = 0;
+- // Calculate the sum (for mean) for any points which do NOT exceed the mean amplitude
+- for(; number < eighthPoints; number++){
+-
+- dataPointsVal = _mm256_load_ps(dataPointsPtr);
+-
+- dataPointsPtr += 8;
+-
+- // Identify which items do not exceed the mean amplitude
+- compareMask = _mm256_cmp_ps(dataPointsVal, vMeanAmplitudeVector, _CMP_LE_OQ);
+-
+- // Mask off the items that exceed the mean amplitude and add the avg Points that do not exceed the mean amplitude
+- avgPointsVal = _mm256_add_ps(avgPointsVal, _mm256_and_ps(compareMask, dataPointsVal));
+-
+- // Count the number of bins which do not exceed the mean amplitude
+- vValidBinCount = _mm256_add_ps(vValidBinCount, _mm256_and_ps(compareMask, vOnesVector));
+- }
+-
+- // Calculate the mean from the remaining data points
+- _mm256_store_ps(avgPointsVector, avgPointsVal);
+-
+- sumMean = 0.0;
+- sumMean += avgPointsVector[0];
+- sumMean += avgPointsVector[1];
+- sumMean += avgPointsVector[2];
+- sumMean += avgPointsVector[3];
+- sumMean += avgPointsVector[4];
+- sumMean += avgPointsVector[5];
+- sumMean += avgPointsVector[6];
+- sumMean += avgPointsVector[7];
+-
+- // Calculate the number of valid bins from the remaining count
+- __VOLK_ATTR_ALIGNED(32) float validBinCountVector[8];
+- _mm256_store_ps(validBinCountVector, vValidBinCount);
+-
+- float validBinCount = 0;
+- validBinCount += validBinCountVector[0];
+- validBinCount += validBinCountVector[1];
+- validBinCount += validBinCountVector[2];
+- validBinCount += validBinCountVector[3];
+- validBinCount += validBinCountVector[4];
+- validBinCount += validBinCountVector[5];
+- validBinCount += validBinCountVector[6];
+- validBinCount += validBinCountVector[7];
+-
+- number = eighthPoints * 8;
+- for(;number < num_points; number++){
+- if(realDataPoints[number] <= meanAmplitude){
+- sumMean += realDataPoints[number];
+- validBinCount += 1.0;
++ unsigned int number = 0;
++ const unsigned int eighthPoints = num_points / 8;
++
++ const float* dataPointsPtr = realDataPoints;
++ __VOLK_ATTR_ALIGNED(32) float avgPointsVector[8];
++
++ __m256 dataPointsVal;
++ __m256 avgPointsVal = _mm256_setzero_ps();
++ // Calculate the sum (for mean) for all points
++ for (; number < eighthPoints; number++) {
++
++ dataPointsVal = _mm256_load_ps(dataPointsPtr);
++
++ dataPointsPtr += 8;
++
++ avgPointsVal = _mm256_add_ps(avgPointsVal, dataPointsVal);
+ }
+- }
+
+- float localNoiseFloorAmplitude = 0;
+- if(validBinCount > 0.0){
+- localNoiseFloorAmplitude = sumMean / validBinCount;
+- }
+- else{
+- localNoiseFloorAmplitude = meanAmplitude; // For the odd case that all the amplitudes are equal...
+- }
++ _mm256_store_ps(avgPointsVector, avgPointsVal);
++
++ float sumMean = 0.0;
++ sumMean += avgPointsVector[0];
++ sumMean += avgPointsVector[1];
++ sumMean += avgPointsVector[2];
++ sumMean += avgPointsVector[3];
++ sumMean += avgPointsVector[4];
++ sumMean += avgPointsVector[5];
++ sumMean += avgPointsVector[6];
++ sumMean += avgPointsVector[7];
++
++ number = eighthPoints * 8;
++ for (; number < num_points; number++) {
++ sumMean += realDataPoints[number];
++ }
++
++ // calculate the spectral mean
++ // +20 because for the comparison below we only want to throw out bins
++ // that are significantly higher (and would, thus, affect the mean more)
++ const float meanAmplitude = (sumMean / ((float)num_points)) + spectralExclusionValue;
++
++ dataPointsPtr = realDataPoints; // Reset the dataPointsPtr
++ __m256 vMeanAmplitudeVector = _mm256_set1_ps(meanAmplitude);
++ __m256 vOnesVector = _mm256_set1_ps(1.0);
++ __m256 vValidBinCount = _mm256_setzero_ps();
++ avgPointsVal = _mm256_setzero_ps();
++ __m256 compareMask;
++ number = 0;
++ // Calculate the sum (for mean) for any points which do NOT exceed the mean amplitude
++ for (; number < eighthPoints; number++) {
++
++ dataPointsVal = _mm256_load_ps(dataPointsPtr);
++
++ dataPointsPtr += 8;
++
++ // Identify which items do not exceed the mean amplitude
++ compareMask = _mm256_cmp_ps(dataPointsVal, vMeanAmplitudeVector, _CMP_LE_OQ);
++
++ // Mask off the items that exceed the mean amplitude and add the avg Points that
++ // do not exceed the mean amplitude
++ avgPointsVal =
++ _mm256_add_ps(avgPointsVal, _mm256_and_ps(compareMask, dataPointsVal));
++
++ // Count the number of bins which do not exceed the mean amplitude
++ vValidBinCount =
++ _mm256_add_ps(vValidBinCount, _mm256_and_ps(compareMask, vOnesVector));
++ }
+
+- *noiseFloorAmplitude = localNoiseFloorAmplitude;
++ // Calculate the mean from the remaining data points
++ _mm256_store_ps(avgPointsVector, avgPointsVal);
++
++ sumMean = 0.0;
++ sumMean += avgPointsVector[0];
++ sumMean += avgPointsVector[1];
++ sumMean += avgPointsVector[2];
++ sumMean += avgPointsVector[3];
++ sumMean += avgPointsVector[4];
++ sumMean += avgPointsVector[5];
++ sumMean += avgPointsVector[6];
++ sumMean += avgPointsVector[7];
++
++ // Calculate the number of valid bins from the remaining count
++ __VOLK_ATTR_ALIGNED(32) float validBinCountVector[8];
++ _mm256_store_ps(validBinCountVector, vValidBinCount);
++
++ float validBinCount = 0;
++ validBinCount += validBinCountVector[0];
++ validBinCount += validBinCountVector[1];
++ validBinCount += validBinCountVector[2];
++ validBinCount += validBinCountVector[3];
++ validBinCount += validBinCountVector[4];
++ validBinCount += validBinCountVector[5];
++ validBinCount += validBinCountVector[6];
++ validBinCount += validBinCountVector[7];
++
++ number = eighthPoints * 8;
++ for (; number < num_points; number++) {
++ if (realDataPoints[number] <= meanAmplitude) {
++ sumMean += realDataPoints[number];
++ validBinCount += 1.0;
++ }
++ }
++
++ float localNoiseFloorAmplitude = 0;
++ if (validBinCount > 0.0) {
++ localNoiseFloorAmplitude = sumMean / validBinCount;
++ } else {
++ localNoiseFloorAmplitude =
++ meanAmplitude; // For the odd case that all the amplitudes are equal...
++ }
++
++ *noiseFloorAmplitude = localNoiseFloorAmplitude;
+ }
+ #endif /* LV_HAVE_AVX */
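/*
 * For orientation (illustrative only): the eight explicit additions after each
 * _mm256_store_ps above are a horizontal reduction of the 8-lane vector
 * accumulator. The same operation written as a loop:
 */
static inline float hsum8(const float* lanes) /* lanes = 8 stored floats */
{
    float sum = 0.0f;
    for (int i = 0; i < 8; i++) {
        sum += lanes[i];
    }
    return sum;
}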
+
+@@ -192,102 +197,103 @@ volk_32f_s32f_calc_spectral_noise_floor_32f_a_sse(float* noiseFloorAmplitude,
+ const float spectralExclusionValue,
+ const unsigned int num_points)
+ {
+- unsigned int number = 0;
+- const unsigned int quarterPoints = num_points / 4;
+-
+- const float* dataPointsPtr = realDataPoints;
+- __VOLK_ATTR_ALIGNED(16) float avgPointsVector[4];
+-
+- __m128 dataPointsVal;
+- __m128 avgPointsVal = _mm_setzero_ps();
+- // Calculate the sum (for mean) for all points
+- for(; number < quarterPoints; number++){
+-
+- dataPointsVal = _mm_load_ps(dataPointsPtr);
+-
+- dataPointsPtr += 4;
+-
+- avgPointsVal = _mm_add_ps(avgPointsVal, dataPointsVal);
+- }
+-
+- _mm_store_ps(avgPointsVector, avgPointsVal);
+-
+- float sumMean = 0.0;
+- sumMean += avgPointsVector[0];
+- sumMean += avgPointsVector[1];
+- sumMean += avgPointsVector[2];
+- sumMean += avgPointsVector[3];
+-
+- number = quarterPoints * 4;
+- for(;number < num_points; number++){
+- sumMean += realDataPoints[number];
+- }
+-
+- // calculate the spectral mean
+- // +20 because for the comparison below we only want to throw out bins
+- // that are significantly higher (and would, thus, affect the mean more
+- const float meanAmplitude = (sumMean / ((float)num_points)) + spectralExclusionValue;
+-
+- dataPointsPtr = realDataPoints; // Reset the dataPointsPtr
+- __m128 vMeanAmplitudeVector = _mm_set_ps1(meanAmplitude);
+- __m128 vOnesVector = _mm_set_ps1(1.0);
+- __m128 vValidBinCount = _mm_setzero_ps();
+- avgPointsVal = _mm_setzero_ps();
+- __m128 compareMask;
+- number = 0;
+- // Calculate the sum (for mean) for any points which do NOT exceed the mean amplitude
+- for(; number < quarterPoints; number++){
+-
+- dataPointsVal = _mm_load_ps(dataPointsPtr);
+-
+- dataPointsPtr += 4;
+-
+- // Identify which items do not exceed the mean amplitude
+- compareMask = _mm_cmple_ps(dataPointsVal, vMeanAmplitudeVector);
+-
+- // Mask off the items that exceed the mean amplitude and add the avg Points that do not exceed the mean amplitude
+- avgPointsVal = _mm_add_ps(avgPointsVal, _mm_and_ps(compareMask, dataPointsVal));
+-
+- // Count the number of bins which do not exceed the mean amplitude
+- vValidBinCount = _mm_add_ps(vValidBinCount, _mm_and_ps(compareMask, vOnesVector));
+- }
+-
+- // Calculate the mean from the remaining data points
+- _mm_store_ps(avgPointsVector, avgPointsVal);
+-
+- sumMean = 0.0;
+- sumMean += avgPointsVector[0];
+- sumMean += avgPointsVector[1];
+- sumMean += avgPointsVector[2];
+- sumMean += avgPointsVector[3];
+-
+- // Calculate the number of valid bins from the remaining count
+- __VOLK_ATTR_ALIGNED(16) float validBinCountVector[4];
+- _mm_store_ps(validBinCountVector, vValidBinCount);
+-
+- float validBinCount = 0;
+- validBinCount += validBinCountVector[0];
+- validBinCount += validBinCountVector[1];
+- validBinCount += validBinCountVector[2];
+- validBinCount += validBinCountVector[3];
+-
+- number = quarterPoints * 4;
+- for(;number < num_points; number++){
+- if(realDataPoints[number] <= meanAmplitude){
+- sumMean += realDataPoints[number];
+- validBinCount += 1.0;
++ unsigned int number = 0;
++ const unsigned int quarterPoints = num_points / 4;
++
++ const float* dataPointsPtr = realDataPoints;
++ __VOLK_ATTR_ALIGNED(16) float avgPointsVector[4];
++
++ __m128 dataPointsVal;
++ __m128 avgPointsVal = _mm_setzero_ps();
++ // Calculate the sum (for mean) for all points
++ for (; number < quarterPoints; number++) {
++
++ dataPointsVal = _mm_load_ps(dataPointsPtr);
++
++ dataPointsPtr += 4;
++
++ avgPointsVal = _mm_add_ps(avgPointsVal, dataPointsVal);
++ }
++
++ _mm_store_ps(avgPointsVector, avgPointsVal);
++
++ float sumMean = 0.0;
++ sumMean += avgPointsVector[0];
++ sumMean += avgPointsVector[1];
++ sumMean += avgPointsVector[2];
++ sumMean += avgPointsVector[3];
++
++ number = quarterPoints * 4;
++ for (; number < num_points; number++) {
++ sumMean += realDataPoints[number];
++ }
++
++ // calculate the spectral mean
++ // +20 because for the comparison below we only want to throw out bins
++ // that are significantly higher (and would, thus, affect the mean more)
++ const float meanAmplitude = (sumMean / ((float)num_points)) + spectralExclusionValue;
++
++ dataPointsPtr = realDataPoints; // Reset the dataPointsPtr
++ __m128 vMeanAmplitudeVector = _mm_set_ps1(meanAmplitude);
++ __m128 vOnesVector = _mm_set_ps1(1.0);
++ __m128 vValidBinCount = _mm_setzero_ps();
++ avgPointsVal = _mm_setzero_ps();
++ __m128 compareMask;
++ number = 0;
++ // Calculate the sum (for mean) for any points which do NOT exceed the mean amplitude
++ for (; number < quarterPoints; number++) {
++
++ dataPointsVal = _mm_load_ps(dataPointsPtr);
++
++ dataPointsPtr += 4;
++
++ // Identify which items do not exceed the mean amplitude
++ compareMask = _mm_cmple_ps(dataPointsVal, vMeanAmplitudeVector);
++
++ // Mask off the items that exceed the mean amplitude and add the avg Points that
++ // do not exceed the mean amplitude
++ avgPointsVal = _mm_add_ps(avgPointsVal, _mm_and_ps(compareMask, dataPointsVal));
++
++ // Count the number of bins which do not exceed the mean amplitude
++ vValidBinCount = _mm_add_ps(vValidBinCount, _mm_and_ps(compareMask, vOnesVector));
+ }
+- }
+
+- float localNoiseFloorAmplitude = 0;
+- if(validBinCount > 0.0){
+- localNoiseFloorAmplitude = sumMean / validBinCount;
+- }
+- else{
+- localNoiseFloorAmplitude = meanAmplitude; // For the odd case that all the amplitudes are equal...
+- }
++ // Calculate the mean from the remaining data points
++ _mm_store_ps(avgPointsVector, avgPointsVal);
++
++ sumMean = 0.0;
++ sumMean += avgPointsVector[0];
++ sumMean += avgPointsVector[1];
++ sumMean += avgPointsVector[2];
++ sumMean += avgPointsVector[3];
++
++ // Calculate the number of valid bins from the remaining count
++ __VOLK_ATTR_ALIGNED(16) float validBinCountVector[4];
++ _mm_store_ps(validBinCountVector, vValidBinCount);
++
++ float validBinCount = 0;
++ validBinCount += validBinCountVector[0];
++ validBinCount += validBinCountVector[1];
++ validBinCount += validBinCountVector[2];
++ validBinCount += validBinCountVector[3];
++
++ number = quarterPoints * 4;
++ for (; number < num_points; number++) {
++ if (realDataPoints[number] <= meanAmplitude) {
++ sumMean += realDataPoints[number];
++ validBinCount += 1.0;
++ }
++ }
++
++ float localNoiseFloorAmplitude = 0;
++ if (validBinCount > 0.0) {
++ localNoiseFloorAmplitude = sumMean / validBinCount;
++ } else {
++ localNoiseFloorAmplitude =
++ meanAmplitude; // For the odd case that all the amplitudes are equal...
++ }
+
+- *noiseFloorAmplitude = localNoiseFloorAmplitude;
++ *noiseFloorAmplitude = localNoiseFloorAmplitude;
+ }
+ #endif /* LV_HAVE_SSE */
+
+@@ -300,36 +306,36 @@ volk_32f_s32f_calc_spectral_noise_floor_32f_generic(float* noiseFloorAmplitude,
+ const float spectralExclusionValue,
+ const unsigned int num_points)
+ {
+- float sumMean = 0.0;
+- unsigned int number;
+- // find the sum (for mean), etc
+- for(number = 0; number < num_points; number++){
+- // sum (for mean)
+- sumMean += realDataPoints[number];
+- }
+-
+- // calculate the spectral mean
+- // +20 because for the comparison below we only want to throw out bins
+- // that are significantly higher (and would, thus, affect the mean more)
+- const float meanAmplitude = (sumMean / num_points) + spectralExclusionValue;
+-
+- // now throw out any bins higher than the mean
+- sumMean = 0.0;
+- unsigned int newNumDataPoints = num_points;
+- for(number = 0; number < num_points; number++){
+- if (realDataPoints[number] <= meanAmplitude)
+- sumMean += realDataPoints[number];
+- else
+- newNumDataPoints--;
+- }
++ float sumMean = 0.0;
++ unsigned int number;
++ // find the sum (for mean), etc
++ for (number = 0; number < num_points; number++) {
++ // sum (for mean)
++ sumMean += realDataPoints[number];
++ }
++
++ // calculate the spectral mean
++ // +20 because for the comparison below we only want to throw out bins
++ // that are significantly higher (and would, thus, affect the mean more)
++ const float meanAmplitude = (sumMean / num_points) + spectralExclusionValue;
++
++ // now throw out any bins higher than the mean
++ sumMean = 0.0;
++ unsigned int newNumDataPoints = num_points;
++ for (number = 0; number < num_points; number++) {
++ if (realDataPoints[number] <= meanAmplitude)
++ sumMean += realDataPoints[number];
++ else
++ newNumDataPoints--;
++ }
+
+- float localNoiseFloorAmplitude = 0.0;
+- if (newNumDataPoints == 0) // in the odd case that all
+- localNoiseFloorAmplitude = meanAmplitude; // amplitudes are equal!
+- else
+- localNoiseFloorAmplitude = sumMean / ((float)newNumDataPoints);
++ float localNoiseFloorAmplitude = 0.0;
++ if (newNumDataPoints == 0) // in the odd case that all
++ localNoiseFloorAmplitude = meanAmplitude; // amplitudes are equal!
++ else
++ localNoiseFloorAmplitude = sumMean / ((float)newNumDataPoints);
+
+- *noiseFloorAmplitude = localNoiseFloorAmplitude;
++ *noiseFloorAmplitude = localNoiseFloorAmplitude;
+ }
+ #endif /* LV_HAVE_GENERIC */
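/*
 * A minimal calling sketch of the generic protokernel defined above
 * (illustrative only; the wrapper name is assumed). The 20 dB exclusion value
 * matches the default named in the documentation block.
 */
static void noise_floor_example(const float* spectrum_db, unsigned int n_bins)
{
    float noise_floor_db = 0.0f;
    volk_32f_s32f_calc_spectral_noise_floor_32f_generic(
        &noise_floor_db, spectrum_db, 20.0f, n_bins);
    /* noise_floor_db is now the mean of all bins that do not exceed the
     * first-pass spectral mean by more than 20 dB */
}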
+
+@@ -339,9 +345,9 @@ volk_32f_s32f_calc_spectral_noise_floor_32f_generic(float* noiseFloorAmplitude,
+ #ifndef INCLUDED_volk_32f_s32f_calc_spectral_noise_floor_32f_u_H
+ #define INCLUDED_volk_32f_s32f_calc_spectral_noise_floor_32f_u_H
+
+-#include <volk/volk_common.h>
+ #include <inttypes.h>
+ #include <stdio.h>
++#include <volk/volk_common.h>
+
+ #ifdef LV_HAVE_AVX
+ #include <immintrin.h>
+@@ -352,114 +358,117 @@ volk_32f_s32f_calc_spectral_noise_floor_32f_u_avx(float* noiseFloorAmplitude,
+ const float spectralExclusionValue,
+ const unsigned int num_points)
+ {
+- unsigned int number = 0;
+- const unsigned int eighthPoints = num_points / 8;
+-
+- const float* dataPointsPtr = realDataPoints;
+- __VOLK_ATTR_ALIGNED(16) float avgPointsVector[8];
+-
+- __m256 dataPointsVal;
+- __m256 avgPointsVal = _mm256_setzero_ps();
+- // Calculate the sum (for mean) for all points
+- for(; number < eighthPoints; number++){
+-
+- dataPointsVal = _mm256_loadu_ps(dataPointsPtr);
+-
+- dataPointsPtr += 8;
+-
+- avgPointsVal = _mm256_add_ps(avgPointsVal, dataPointsVal);
+- }
+-
+- _mm256_storeu_ps(avgPointsVector, avgPointsVal);
+-
+- float sumMean = 0.0;
+- sumMean += avgPointsVector[0];
+- sumMean += avgPointsVector[1];
+- sumMean += avgPointsVector[2];
+- sumMean += avgPointsVector[3];
+- sumMean += avgPointsVector[4];
+- sumMean += avgPointsVector[5];
+- sumMean += avgPointsVector[6];
+- sumMean += avgPointsVector[7];
+-
+- number = eighthPoints * 8;
+- for(;number < num_points; number++){
+- sumMean += realDataPoints[number];
+- }
+-
+- // calculate the spectral mean
+- // +20 because for the comparison below we only want to throw out bins
+- // that are significantly higher (and would, thus, affect the mean more
+- const float meanAmplitude = (sumMean / ((float)num_points)) + spectralExclusionValue;
+-
+- dataPointsPtr = realDataPoints; // Reset the dataPointsPtr
+- __m256 vMeanAmplitudeVector = _mm256_set1_ps(meanAmplitude);
+- __m256 vOnesVector = _mm256_set1_ps(1.0);
+- __m256 vValidBinCount = _mm256_setzero_ps();
+- avgPointsVal = _mm256_setzero_ps();
+- __m256 compareMask;
+- number = 0;
+- // Calculate the sum (for mean) for any points which do NOT exceed the mean amplitude
+- for(; number < eighthPoints; number++){
+-
+- dataPointsVal = _mm256_loadu_ps(dataPointsPtr);
+-
+- dataPointsPtr += 8;
+-
+- // Identify which items do not exceed the mean amplitude
+- compareMask = _mm256_cmp_ps(dataPointsVal, vMeanAmplitudeVector, _CMP_LE_OQ);
+-
+- // Mask off the items that exceed the mean amplitude and add the avg Points that do not exceed the mean amplitude
+- avgPointsVal = _mm256_add_ps(avgPointsVal, _mm256_and_ps(compareMask, dataPointsVal));
+-
+- // Count the number of bins which do not exceed the mean amplitude
+- vValidBinCount = _mm256_add_ps(vValidBinCount, _mm256_and_ps(compareMask, vOnesVector));
+- }
+-
+- // Calculate the mean from the remaining data points
+- _mm256_storeu_ps(avgPointsVector, avgPointsVal);
+-
+- sumMean = 0.0;
+- sumMean += avgPointsVector[0];
+- sumMean += avgPointsVector[1];
+- sumMean += avgPointsVector[2];
+- sumMean += avgPointsVector[3];
+- sumMean += avgPointsVector[4];
+- sumMean += avgPointsVector[5];
+- sumMean += avgPointsVector[6];
+- sumMean += avgPointsVector[7];
+-
+- // Calculate the number of valid bins from the remaining count
+- __VOLK_ATTR_ALIGNED(16) float validBinCountVector[8];
+- _mm256_storeu_ps(validBinCountVector, vValidBinCount);
+-
+- float validBinCount = 0;
+- validBinCount += validBinCountVector[0];
+- validBinCount += validBinCountVector[1];
+- validBinCount += validBinCountVector[2];
+- validBinCount += validBinCountVector[3];
+- validBinCount += validBinCountVector[4];
+- validBinCount += validBinCountVector[5];
+- validBinCount += validBinCountVector[6];
+- validBinCount += validBinCountVector[7];
+-
+- number = eighthPoints * 8;
+- for(;number < num_points; number++){
+- if(realDataPoints[number] <= meanAmplitude){
+- sumMean += realDataPoints[number];
+- validBinCount += 1.0;
++ unsigned int number = 0;
++ const unsigned int eighthPoints = num_points / 8;
++
++ const float* dataPointsPtr = realDataPoints;
++ __VOLK_ATTR_ALIGNED(16) float avgPointsVector[8];
++
++ __m256 dataPointsVal;
++ __m256 avgPointsVal = _mm256_setzero_ps();
++ // Calculate the sum (for mean) for all points
++ for (; number < eighthPoints; number++) {
++
++ dataPointsVal = _mm256_loadu_ps(dataPointsPtr);
++
++ dataPointsPtr += 8;
++
++ avgPointsVal = _mm256_add_ps(avgPointsVal, dataPointsVal);
++ }
++
++ _mm256_storeu_ps(avgPointsVector, avgPointsVal);
++
++ float sumMean = 0.0;
++ sumMean += avgPointsVector[0];
++ sumMean += avgPointsVector[1];
++ sumMean += avgPointsVector[2];
++ sumMean += avgPointsVector[3];
++ sumMean += avgPointsVector[4];
++ sumMean += avgPointsVector[5];
++ sumMean += avgPointsVector[6];
++ sumMean += avgPointsVector[7];
++
++ number = eighthPoints * 8;
++ for (; number < num_points; number++) {
++ sumMean += realDataPoints[number];
++ }
++
++ // calculate the spectral mean
++ // +20 because for the comparison below we only want to throw out bins
++ // that are significantly higher (and would, thus, affect the mean more)
++ const float meanAmplitude = (sumMean / ((float)num_points)) + spectralExclusionValue;
++
++ dataPointsPtr = realDataPoints; // Reset the dataPointsPtr
++ __m256 vMeanAmplitudeVector = _mm256_set1_ps(meanAmplitude);
++ __m256 vOnesVector = _mm256_set1_ps(1.0);
++ __m256 vValidBinCount = _mm256_setzero_ps();
++ avgPointsVal = _mm256_setzero_ps();
++ __m256 compareMask;
++ number = 0;
++ // Calculate the sum (for mean) for any points which do NOT exceed the mean amplitude
++ for (; number < eighthPoints; number++) {
++
++ dataPointsVal = _mm256_loadu_ps(dataPointsPtr);
++
++ dataPointsPtr += 8;
++
++ // Identify which items do not exceed the mean amplitude
++ compareMask = _mm256_cmp_ps(dataPointsVal, vMeanAmplitudeVector, _CMP_LE_OQ);
++
++ // Mask off the items that exceed the mean amplitude and add the avg Points that
++ // do not exceed the mean amplitude
++ avgPointsVal =
++ _mm256_add_ps(avgPointsVal, _mm256_and_ps(compareMask, dataPointsVal));
++
++ // Count the number of bins which do not exceed the mean amplitude
++ vValidBinCount =
++ _mm256_add_ps(vValidBinCount, _mm256_and_ps(compareMask, vOnesVector));
++ }
++
++ // Calculate the mean from the remaining data points
++ _mm256_storeu_ps(avgPointsVector, avgPointsVal);
++
++ sumMean = 0.0;
++ sumMean += avgPointsVector[0];
++ sumMean += avgPointsVector[1];
++ sumMean += avgPointsVector[2];
++ sumMean += avgPointsVector[3];
++ sumMean += avgPointsVector[4];
++ sumMean += avgPointsVector[5];
++ sumMean += avgPointsVector[6];
++ sumMean += avgPointsVector[7];
++
++ // Calculate the number of valid bins from the remaining count
++ __VOLK_ATTR_ALIGNED(16) float validBinCountVector[8];
++ _mm256_storeu_ps(validBinCountVector, vValidBinCount);
++
++ float validBinCount = 0;
++ validBinCount += validBinCountVector[0];
++ validBinCount += validBinCountVector[1];
++ validBinCount += validBinCountVector[2];
++ validBinCount += validBinCountVector[3];
++ validBinCount += validBinCountVector[4];
++ validBinCount += validBinCountVector[5];
++ validBinCount += validBinCountVector[6];
++ validBinCount += validBinCountVector[7];
++
++ number = eighthPoints * 8;
++ for (; number < num_points; number++) {
++ if (realDataPoints[number] <= meanAmplitude) {
++ sumMean += realDataPoints[number];
++ validBinCount += 1.0;
++ }
+ }
+- }
+
+- float localNoiseFloorAmplitude = 0;
+- if(validBinCount > 0.0){
+- localNoiseFloorAmplitude = sumMean / validBinCount;
+- }
+- else{
+- localNoiseFloorAmplitude = meanAmplitude; // For the odd case that all the amplitudes are equal...
+- }
++ float localNoiseFloorAmplitude = 0;
++ if (validBinCount > 0.0) {
++ localNoiseFloorAmplitude = sumMean / validBinCount;
++ } else {
++ localNoiseFloorAmplitude =
++ meanAmplitude; // For the odd case that all the amplitudes are equal...
++ }
+
+- *noiseFloorAmplitude = localNoiseFloorAmplitude;
++ *noiseFloorAmplitude = localNoiseFloorAmplitude;
+ }
+ #endif /* LV_HAVE_AVX */
+ #endif /* INCLUDED_volk_32f_s32f_calc_spectral_noise_floor_32f_u_H */
+diff --git a/kernels/volk/volk_32f_s32f_convert_16i.h b/kernels/volk/volk_32f_s32f_convert_16i.h
+index 27ef4d9..c9469b7 100644
+--- a/kernels/volk/volk_32f_s32f_convert_16i.h
++++ b/kernels/volk/volk_32f_s32f_convert_16i.h
+@@ -30,8 +30,8 @@
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+- * void volk_32f_s32f_convert_16i(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points)
+- * \endcode
++ * void volk_32f_s32f_convert_16i(int16_t* outputVector, const float* inputVector,
++ *                                const float scalar, unsigned int num_points)
++ * \endcode
+ *
+ * \b Inputs
+ * \li inputVector: the input vector of floats.
+@@ -42,11 +42,10 @@
+ * \li outputVector: The output vector.
+ *
+ * \b Example
+- * Convert floats from [-1,1] to 16-bit integers with a scale of 5 to maintain smallest delta
+- * int N = 10;
+- * unsigned int alignment = volk_get_alignment();
+- * float* increasing = (float*)volk_malloc(sizeof(float)*N, alignment);
+- * int16_t* out = (int16_t*)volk_malloc(sizeof(int16_t)*N, alignment);
++ * Convert floats from [-1,1] to 16-bit integers with a scale of 5 to maintain
++ * smallest delta
++ * int N = 10;
++ * unsigned int alignment = volk_get_alignment();
++ * float* increasing = (float*)volk_malloc(sizeof(float)*N, alignment);
++ * int16_t* out = (int16_t*)volk_malloc(sizeof(int16_t)*N, alignment);
+ *
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ * increasing[ii] = 2.f * ((float)ii / (float)N) - 1.f;
+@@ -76,55 +75,60 @@
+ #ifdef LV_HAVE_AVX2
+ #include <immintrin.h>
+
+-static inline void
+-volk_32f_s32f_convert_16i_u_avx2(int16_t* outputVector, const float* inputVector,
+- const float scalar, unsigned int num_points)
++static inline void volk_32f_s32f_convert_16i_u_avx2(int16_t* outputVector,
++ const float* inputVector,
++ const float scalar,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+-
+- const unsigned int sixteenthPoints = num_points / 16;
+-
+- const float* inputVectorPtr = (const float*)inputVector;
+- int16_t* outputVectorPtr = outputVector;
+-
+- float min_val = SHRT_MIN;
+- float max_val = SHRT_MAX;
+- float r;
+-
+- __m256 vScalar = _mm256_set1_ps(scalar);
+- __m256 inputVal1, inputVal2;
+- __m256i intInputVal1, intInputVal2;
+- __m256 ret1, ret2;
+- __m256 vmin_val = _mm256_set1_ps(min_val);
+- __m256 vmax_val = _mm256_set1_ps(max_val);
+-
+- for(;number < sixteenthPoints; number++){
+- inputVal1 = _mm256_loadu_ps(inputVectorPtr); inputVectorPtr += 8;
+- inputVal2 = _mm256_loadu_ps(inputVectorPtr); inputVectorPtr += 8;
+-
+- // Scale and clip
+- ret1 = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
+- ret2 = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal2, vScalar), vmax_val), vmin_val);
+-
+- intInputVal1 = _mm256_cvtps_epi32(ret1);
+- intInputVal2 = _mm256_cvtps_epi32(ret2);
+-
+- intInputVal1 = _mm256_packs_epi32(intInputVal1, intInputVal2);
+- intInputVal1 = _mm256_permute4x64_epi64(intInputVal1, 0b11011000);
+-
+- _mm256_storeu_si256((__m256i*)outputVectorPtr, intInputVal1);
+- outputVectorPtr += 16;
+- }
+-
+- number = sixteenthPoints * 16;
+- for(; number < num_points; number++){
+- r = inputVector[number] * scalar;
+- if(r > max_val)
+- r = max_val;
+- else if(r < min_val)
+- r = min_val;
+- outputVector[number] = (int16_t)rintf(r);
+- }
++ unsigned int number = 0;
++
++ const unsigned int sixteenthPoints = num_points / 16;
++
++ const float* inputVectorPtr = (const float*)inputVector;
++ int16_t* outputVectorPtr = outputVector;
++
++ float min_val = SHRT_MIN;
++ float max_val = SHRT_MAX;
++ float r;
++
++ __m256 vScalar = _mm256_set1_ps(scalar);
++ __m256 inputVal1, inputVal2;
++ __m256i intInputVal1, intInputVal2;
++ __m256 ret1, ret2;
++ __m256 vmin_val = _mm256_set1_ps(min_val);
++ __m256 vmax_val = _mm256_set1_ps(max_val);
++
++ for (; number < sixteenthPoints; number++) {
++ inputVal1 = _mm256_loadu_ps(inputVectorPtr);
++ inputVectorPtr += 8;
++ inputVal2 = _mm256_loadu_ps(inputVectorPtr);
++ inputVectorPtr += 8;
++
++ // Scale and clip
++ ret1 = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal1, vScalar), vmax_val),
++ vmin_val);
++ ret2 = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal2, vScalar), vmax_val),
++ vmin_val);
++
++ intInputVal1 = _mm256_cvtps_epi32(ret1);
++ intInputVal2 = _mm256_cvtps_epi32(ret2);
++
++ intInputVal1 = _mm256_packs_epi32(intInputVal1, intInputVal2);
++ intInputVal1 = _mm256_permute4x64_epi64(intInputVal1, 0b11011000);
++
++ _mm256_storeu_si256((__m256i*)outputVectorPtr, intInputVal1);
++ outputVectorPtr += 16;
++ }
++
++ number = sixteenthPoints * 16;
++ for (; number < num_points; number++) {
++ r = inputVector[number] * scalar;
++ if (r > max_val)
++ r = max_val;
++ else if (r < min_val)
++ r = min_val;
++ outputVector[number] = (int16_t)rintf(r);
++ }
+ }
+ #endif /* LV_HAVE_AVX2 */
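/*
 * For orientation (illustrative only): _mm256_packs_epi32 packs within each
 * 128-bit lane, so the 16 saturated results land in the order
 * [in1 0..3 | in2 0..3 | in1 4..7 | in2 4..7]. The following
 * _mm256_permute4x64_epi64(x, 0b11011000) reorders the 64-bit lanes to
 * 0, 2, 1, 3, restoring [in1 0..7 | in2 0..7] before the store. Per element
 * the kernel performs the scalar operation sketched below (helper name is
 * assumed, not from this codebase):
 */
#include <math.h>
#include <stdint.h>

static inline int16_t scale_clip_to_int16(float x, float scalar)
{
    float r = x * scalar;
    if (r > 32767.0f) /* SHRT_MAX */
        r = 32767.0f;
    else if (r < -32768.0f) /* SHRT_MIN */
        r = -32768.0f;
    return (int16_t)rintf(r);
}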
+
+@@ -132,54 +136,57 @@ volk_32f_s32f_convert_16i_u_avx2(int16_t* outputVector, const float* inputVector
+ #ifdef LV_HAVE_AVX
+ #include <immintrin.h>
+
+-static inline void
+-volk_32f_s32f_convert_16i_u_avx(int16_t* outputVector, const float* inputVector,
+- const float scalar, unsigned int num_points)
++static inline void volk_32f_s32f_convert_16i_u_avx(int16_t* outputVector,
++ const float* inputVector,
++ const float scalar,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
++ unsigned int number = 0;
+
+- const unsigned int eighthPoints = num_points / 8;
++ const unsigned int eighthPoints = num_points / 8;
+
+- const float* inputVectorPtr = (const float*)inputVector;
+- int16_t* outputVectorPtr = outputVector;
++ const float* inputVectorPtr = (const float*)inputVector;
++ int16_t* outputVectorPtr = outputVector;
+
+- float min_val = SHRT_MIN;
+- float max_val = SHRT_MAX;
+- float r;
++ float min_val = SHRT_MIN;
++ float max_val = SHRT_MAX;
++ float r;
+
+- __m256 vScalar = _mm256_set1_ps(scalar);
+- __m256 inputVal, ret;
+- __m256i intInputVal;
+- __m128i intInputVal1, intInputVal2;
+- __m256 vmin_val = _mm256_set1_ps(min_val);
+- __m256 vmax_val = _mm256_set1_ps(max_val);
++ __m256 vScalar = _mm256_set1_ps(scalar);
++ __m256 inputVal, ret;
++ __m256i intInputVal;
++ __m128i intInputVal1, intInputVal2;
++ __m256 vmin_val = _mm256_set1_ps(min_val);
++ __m256 vmax_val = _mm256_set1_ps(max_val);
+
+- for(;number < eighthPoints; number++){
+- inputVal = _mm256_loadu_ps(inputVectorPtr); inputVectorPtr += 8;
++ for (; number < eighthPoints; number++) {
++ inputVal = _mm256_loadu_ps(inputVectorPtr);
++ inputVectorPtr += 8;
+
+- // Scale and clip
+- ret = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal, vScalar), vmax_val), vmin_val);
++ // Scale and clip
++ ret = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal, vScalar), vmax_val),
++ vmin_val);
+
+- intInputVal = _mm256_cvtps_epi32(ret);
++ intInputVal = _mm256_cvtps_epi32(ret);
+
+- intInputVal1 = _mm256_extractf128_si256(intInputVal, 0);
+- intInputVal2 = _mm256_extractf128_si256(intInputVal, 1);
++ intInputVal1 = _mm256_extractf128_si256(intInputVal, 0);
++ intInputVal2 = _mm256_extractf128_si256(intInputVal, 1);
+
+- intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
++ intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
+
+- _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1);
+- outputVectorPtr += 8;
+- }
++ _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1);
++ outputVectorPtr += 8;
++ }
+
+- number = eighthPoints * 8;
+- for(; number < num_points; number++){
+- r = inputVector[number] * scalar;
+- if(r > max_val)
+- r = max_val;
+- else if(r < min_val)
+- r = min_val;
+- outputVector[number] = (int16_t)rintf(r);
+- }
++ number = eighthPoints * 8;
++ for (; number < num_points; number++) {
++ r = inputVector[number] * scalar;
++ if (r > max_val)
++ r = max_val;
++ else if (r < min_val)
++ r = min_val;
++ outputVector[number] = (int16_t)rintf(r);
++ }
+ }
+ #endif /* LV_HAVE_AVX */
+
+@@ -187,54 +194,57 @@ volk_32f_s32f_convert_16i_u_avx(int16_t* outputVector, const float* inputVector,
+ #ifdef LV_HAVE_SSE2
+ #include <emmintrin.h>
+
+-static inline void
+-volk_32f_s32f_convert_16i_u_sse2(int16_t* outputVector, const float* inputVector,
+- const float scalar, unsigned int num_points)
++static inline void volk_32f_s32f_convert_16i_u_sse2(int16_t* outputVector,
++ const float* inputVector,
++ const float scalar,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+-
+- const unsigned int eighthPoints = num_points / 8;
+-
+- const float* inputVectorPtr = (const float*)inputVector;
+- int16_t* outputVectorPtr = outputVector;
+-
+- float min_val = SHRT_MIN;
+- float max_val = SHRT_MAX;
+- float r;
+-
+- __m128 vScalar = _mm_set_ps1(scalar);
+- __m128 inputVal1, inputVal2;
+- __m128i intInputVal1, intInputVal2;
+- __m128 ret1, ret2;
+- __m128 vmin_val = _mm_set_ps1(min_val);
+- __m128 vmax_val = _mm_set_ps1(max_val);
+-
+- for(;number < eighthPoints; number++){
+- inputVal1 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
+- inputVal2 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
+-
+- // Scale and clip
+- ret1 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
+- ret2 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal2, vScalar), vmax_val), vmin_val);
+-
+- intInputVal1 = _mm_cvtps_epi32(ret1);
+- intInputVal2 = _mm_cvtps_epi32(ret2);
+-
+- intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
+-
+- _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1);
+- outputVectorPtr += 8;
+- }
+-
+- number = eighthPoints * 8;
+- for(; number < num_points; number++){
+- r = inputVector[number] * scalar;
+- if(r > max_val)
+- r = max_val;
+- else if(r < min_val)
+- r = min_val;
+- outputVector[number] = (int16_t)rintf(r);
+- }
++ unsigned int number = 0;
++
++ const unsigned int eighthPoints = num_points / 8;
++
++ const float* inputVectorPtr = (const float*)inputVector;
++ int16_t* outputVectorPtr = outputVector;
++
++ float min_val = SHRT_MIN;
++ float max_val = SHRT_MAX;
++ float r;
++
++ __m128 vScalar = _mm_set_ps1(scalar);
++ __m128 inputVal1, inputVal2;
++ __m128i intInputVal1, intInputVal2;
++ __m128 ret1, ret2;
++ __m128 vmin_val = _mm_set_ps1(min_val);
++ __m128 vmax_val = _mm_set_ps1(max_val);
++
++ for (; number < eighthPoints; number++) {
++ inputVal1 = _mm_loadu_ps(inputVectorPtr);
++ inputVectorPtr += 4;
++ inputVal2 = _mm_loadu_ps(inputVectorPtr);
++ inputVectorPtr += 4;
++
++ // Scale and clip
++ ret1 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
++ ret2 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal2, vScalar), vmax_val), vmin_val);
++
++ intInputVal1 = _mm_cvtps_epi32(ret1);
++ intInputVal2 = _mm_cvtps_epi32(ret2);
++
++ intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
++
++ _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1);
++ outputVectorPtr += 8;
++ }
++
++ number = eighthPoints * 8;
++ for (; number < num_points; number++) {
++ r = inputVector[number] * scalar;
++ if (r > max_val)
++ r = max_val;
++ else if (r < min_val)
++ r = min_val;
++ outputVector[number] = (int16_t)rintf(r);
++ }
+ }
+ #endif /* LV_HAVE_SSE2 */
+
+@@ -242,76 +252,78 @@ volk_32f_s32f_convert_16i_u_sse2(int16_t* outputVector, const float* inputVector
+ #ifdef LV_HAVE_SSE
+ #include <xmmintrin.h>
+
+-static inline void
+-volk_32f_s32f_convert_16i_u_sse(int16_t* outputVector, const float* inputVector,
+- const float scalar, unsigned int num_points)
++static inline void volk_32f_s32f_convert_16i_u_sse(int16_t* outputVector,
++ const float* inputVector,
++ const float scalar,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+-
+- const unsigned int quarterPoints = num_points / 4;
+-
+- const float* inputVectorPtr = (const float*)inputVector;
+- int16_t* outputVectorPtr = outputVector;
+-
+- float min_val = SHRT_MIN;
+- float max_val = SHRT_MAX;
+- float r;
+-
+- __m128 vScalar = _mm_set_ps1(scalar);
+- __m128 ret;
+- __m128 vmin_val = _mm_set_ps1(min_val);
+- __m128 vmax_val = _mm_set_ps1(max_val);
+-
+- __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4];
+-
+- for(;number < quarterPoints; number++){
+- ret = _mm_loadu_ps(inputVectorPtr);
+- inputVectorPtr += 4;
+-
+- // Scale and clip
+- ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val);
+-
+- _mm_store_ps(outputFloatBuffer, ret);
+- *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[0]);
+- *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[1]);
+- *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[2]);
+- *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[3]);
+- }
+-
+- number = quarterPoints * 4;
+- for(; number < num_points; number++){
+- r = inputVector[number] * scalar;
+- if(r > max_val)
+- r = max_val;
+- else if(r < min_val)
+- r = min_val;
+- outputVector[number] = (int16_t)rintf(r);
+- }
++ unsigned int number = 0;
++
++ const unsigned int quarterPoints = num_points / 4;
++
++ const float* inputVectorPtr = (const float*)inputVector;
++ int16_t* outputVectorPtr = outputVector;
++
++ float min_val = SHRT_MIN;
++ float max_val = SHRT_MAX;
++ float r;
++
++ __m128 vScalar = _mm_set_ps1(scalar);
++ __m128 ret;
++ __m128 vmin_val = _mm_set_ps1(min_val);
++ __m128 vmax_val = _mm_set_ps1(max_val);
++
++ __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4];
++
++ for (; number < quarterPoints; number++) {
++ ret = _mm_loadu_ps(inputVectorPtr);
++ inputVectorPtr += 4;
++
++ // Scale and clip
++ ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val);
++
++ _mm_store_ps(outputFloatBuffer, ret);
++ *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[0]);
++ *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[1]);
++ *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[2]);
++ *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[3]);
++ }
++
++ number = quarterPoints * 4;
++ for (; number < num_points; number++) {
++ r = inputVector[number] * scalar;
++ if (r > max_val)
++ r = max_val;
++ else if (r < min_val)
++ r = min_val;
++ outputVector[number] = (int16_t)rintf(r);
++ }
+ }
+ #endif /* LV_HAVE_SSE */
+
+
+ #ifdef LV_HAVE_GENERIC
+
+-static inline void
+-volk_32f_s32f_convert_16i_generic(int16_t* outputVector, const float* inputVector,
+- const float scalar, unsigned int num_points)
++static inline void volk_32f_s32f_convert_16i_generic(int16_t* outputVector,
++ const float* inputVector,
++ const float scalar,
++ unsigned int num_points)
+ {
+- int16_t* outputVectorPtr = outputVector;
+- const float* inputVectorPtr = inputVector;
+- unsigned int number = 0;
+- float min_val = SHRT_MIN;
+- float max_val = SHRT_MAX;
+- float r;
+-
+- for(number = 0; number < num_points; number++){
+- r = *inputVectorPtr++ * scalar;
+- if(r > max_val)
+- r = max_val;
+- else if(r < min_val)
+- r = min_val;
+- *outputVectorPtr++ = (int16_t)rintf(r);
+- }
++ int16_t* outputVectorPtr = outputVector;
++ const float* inputVectorPtr = inputVector;
++ unsigned int number = 0;
++ float min_val = SHRT_MIN;
++ float max_val = SHRT_MAX;
++ float r;
++
++ for (number = 0; number < num_points; number++) {
++ r = *inputVectorPtr++ * scalar;
++ if (r > max_val)
++ r = max_val;
++ else if (r < min_val)
++ r = min_val;
++ *outputVectorPtr++ = (int16_t)rintf(r);
++ }
+ }
+ #endif /* LV_HAVE_GENERIC */
+
+@@ -320,63 +332,68 @@ volk_32f_s32f_convert_16i_generic(int16_t* outputVector, const float* inputVecto
+ #ifndef INCLUDED_volk_32f_s32f_convert_16i_a_H
+ #define INCLUDED_volk_32f_s32f_convert_16i_a_H
+
+-#include <volk/volk_common.h>
+ #include <inttypes.h>
+-#include <stdio.h>
+ #include <math.h>
++#include <stdio.h>
++#include <volk/volk_common.h>
+
+ #ifdef LV_HAVE_AVX2
+ #include <immintrin.h>
+
+-static inline void
+-volk_32f_s32f_convert_16i_a_avx2(int16_t* outputVector, const float* inputVector,
+- const float scalar, unsigned int num_points)
++static inline void volk_32f_s32f_convert_16i_a_avx2(int16_t* outputVector,
++ const float* inputVector,
++ const float scalar,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+-
+- const unsigned int sixteenthPoints = num_points / 16;
+-
+- const float* inputVectorPtr = (const float*)inputVector;
+- int16_t* outputVectorPtr = outputVector;
+-
+- float min_val = SHRT_MIN;
+- float max_val = SHRT_MAX;
+- float r;
+-
+- __m256 vScalar = _mm256_set1_ps(scalar);
+- __m256 inputVal1, inputVal2;
+- __m256i intInputVal1, intInputVal2;
+- __m256 ret1, ret2;
+- __m256 vmin_val = _mm256_set1_ps(min_val);
+- __m256 vmax_val = _mm256_set1_ps(max_val);
+-
+- for(;number < sixteenthPoints; number++){
+- inputVal1 = _mm256_load_ps(inputVectorPtr); inputVectorPtr += 8;
+- inputVal2 = _mm256_load_ps(inputVectorPtr); inputVectorPtr += 8;
+-
+- // Scale and clip
+- ret1 = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
+- ret2 = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal2, vScalar), vmax_val), vmin_val);
+-
+- intInputVal1 = _mm256_cvtps_epi32(ret1);
+- intInputVal2 = _mm256_cvtps_epi32(ret2);
+-
+- intInputVal1 = _mm256_packs_epi32(intInputVal1, intInputVal2);
+- intInputVal1 = _mm256_permute4x64_epi64(intInputVal1, 0b11011000);
+-
+- _mm256_store_si256((__m256i*)outputVectorPtr, intInputVal1);
+- outputVectorPtr += 16;
+- }
+-
+- number = sixteenthPoints * 16;
+- for(; number < num_points; number++){
+- r = inputVector[number] * scalar;
+- if(r > max_val)
+- r = max_val;
+- else if(r < min_val)
+- r = min_val;
+- outputVector[number] = (int16_t)rintf(r);
+- }
++ unsigned int number = 0;
++
++ const unsigned int sixteenthPoints = num_points / 16;
++
++ const float* inputVectorPtr = (const float*)inputVector;
++ int16_t* outputVectorPtr = outputVector;
++
++ float min_val = SHRT_MIN;
++ float max_val = SHRT_MAX;
++ float r;
++
++ __m256 vScalar = _mm256_set1_ps(scalar);
++ __m256 inputVal1, inputVal2;
++ __m256i intInputVal1, intInputVal2;
++ __m256 ret1, ret2;
++ __m256 vmin_val = _mm256_set1_ps(min_val);
++ __m256 vmax_val = _mm256_set1_ps(max_val);
++
++ for (; number < sixteenthPoints; number++) {
++ inputVal1 = _mm256_load_ps(inputVectorPtr);
++ inputVectorPtr += 8;
++ inputVal2 = _mm256_load_ps(inputVectorPtr);
++ inputVectorPtr += 8;
++
++ // Scale and clip
++ ret1 = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal1, vScalar), vmax_val),
++ vmin_val);
++ ret2 = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal2, vScalar), vmax_val),
++ vmin_val);
++
++ intInputVal1 = _mm256_cvtps_epi32(ret1);
++ intInputVal2 = _mm256_cvtps_epi32(ret2);
++
++ intInputVal1 = _mm256_packs_epi32(intInputVal1, intInputVal2);
++ intInputVal1 = _mm256_permute4x64_epi64(intInputVal1, 0b11011000);
++
++ _mm256_store_si256((__m256i*)outputVectorPtr, intInputVal1);
++ outputVectorPtr += 16;
++ }
++
++ number = sixteenthPoints * 16;
++ for (; number < num_points; number++) {
++ r = inputVector[number] * scalar;
++ if (r > max_val)
++ r = max_val;
++ else if (r < min_val)
++ r = min_val;
++ outputVector[number] = (int16_t)rintf(r);
++ }
+ }
+ #endif /* LV_HAVE_AVX2 */
+
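A note on the AVX2 kernels reformatted above: _mm256_packs_epi32 saturates and packs 32-bit values to 16-bit within each 128-bit lane, so the two inputs come out interleaved at 64-bit granularity; the _mm256_permute4x64_epi64(..., 0b11011000) that follows restores in-order output before the store. A minimal sketch of that idiom (illustrative only, not part of this patch; the helper name is made up):

    #include <immintrin.h>

    /* Pack two vectors of eight int32 each into sixteen saturated int16,
     * keeping element order. packs_epi32 works per 128-bit lane, producing
     * [a0..a3 | b0..b3 | a4..a7 | b4..b7]; the cross-lane permute with
     * control 0b11011000 reorders the 64-bit chunks to [a0..a7 | b0..b7]. */
    static inline __m256i pack_s32_to_s16_in_order(__m256i a, __m256i b)
    {
        __m256i packed = _mm256_packs_epi32(a, b);
        return _mm256_permute4x64_epi64(packed, 0b11011000);
    }
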
+@@ -384,108 +401,114 @@ volk_32f_s32f_convert_16i_a_avx2(int16_t* outputVector, const float* inputVector
+ #ifdef LV_HAVE_AVX
+ #include <immintrin.h>
+
+-static inline void
+-volk_32f_s32f_convert_16i_a_avx(int16_t* outputVector, const float* inputVector,
+- const float scalar, unsigned int num_points)
++static inline void volk_32f_s32f_convert_16i_a_avx(int16_t* outputVector,
++ const float* inputVector,
++ const float scalar,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
++ unsigned int number = 0;
+
+- const unsigned int eighthPoints = num_points / 8;
++ const unsigned int eighthPoints = num_points / 8;
+
+- const float* inputVectorPtr = (const float*)inputVector;
+- int16_t* outputVectorPtr = outputVector;
++ const float* inputVectorPtr = (const float*)inputVector;
++ int16_t* outputVectorPtr = outputVector;
+
+- float min_val = SHRT_MIN;
+- float max_val = SHRT_MAX;
+- float r;
++ float min_val = SHRT_MIN;
++ float max_val = SHRT_MAX;
++ float r;
+
+- __m256 vScalar = _mm256_set1_ps(scalar);
+- __m256 inputVal, ret;
+- __m256i intInputVal;
+- __m128i intInputVal1, intInputVal2;
+- __m256 vmin_val = _mm256_set1_ps(min_val);
+- __m256 vmax_val = _mm256_set1_ps(max_val);
++ __m256 vScalar = _mm256_set1_ps(scalar);
++ __m256 inputVal, ret;
++ __m256i intInputVal;
++ __m128i intInputVal1, intInputVal2;
++ __m256 vmin_val = _mm256_set1_ps(min_val);
++ __m256 vmax_val = _mm256_set1_ps(max_val);
+
+- for(;number < eighthPoints; number++){
+- inputVal = _mm256_load_ps(inputVectorPtr); inputVectorPtr += 8;
++ for (; number < eighthPoints; number++) {
++ inputVal = _mm256_load_ps(inputVectorPtr);
++ inputVectorPtr += 8;
+
+- // Scale and clip
+- ret = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal, vScalar), vmax_val), vmin_val);
++ // Scale and clip
++ ret = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal, vScalar), vmax_val),
++ vmin_val);
+
+- intInputVal = _mm256_cvtps_epi32(ret);
++ intInputVal = _mm256_cvtps_epi32(ret);
+
+- intInputVal1 = _mm256_extractf128_si256(intInputVal, 0);
+- intInputVal2 = _mm256_extractf128_si256(intInputVal, 1);
++ intInputVal1 = _mm256_extractf128_si256(intInputVal, 0);
++ intInputVal2 = _mm256_extractf128_si256(intInputVal, 1);
+
+- intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
++ intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
+
+- _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1);
+- outputVectorPtr += 8;
+- }
++ _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1);
++ outputVectorPtr += 8;
++ }
+
+- number = eighthPoints * 8;
+- for(; number < num_points; number++){
+- r = inputVector[number] * scalar;
+- if(r > max_val)
+- r = max_val;
+- else if(r < min_val)
+- r = min_val;
+- outputVector[number] = (int16_t)rintf(r);
+- }
++ number = eighthPoints * 8;
++ for (; number < num_points; number++) {
++ r = inputVector[number] * scalar;
++ if (r > max_val)
++ r = max_val;
++ else if (r < min_val)
++ r = min_val;
++ outputVector[number] = (int16_t)rintf(r);
++ }
+ }
+ #endif /* LV_HAVE_AVX */
+
+ #ifdef LV_HAVE_SSE2
+ #include <emmintrin.h>
+
+-static inline void
+-volk_32f_s32f_convert_16i_a_sse2(int16_t* outputVector, const float* inputVector,
+- const float scalar, unsigned int num_points)
++static inline void volk_32f_s32f_convert_16i_a_sse2(int16_t* outputVector,
++ const float* inputVector,
++ const float scalar,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+-
+- const unsigned int eighthPoints = num_points / 8;
+-
+- const float* inputVectorPtr = (const float*)inputVector;
+- int16_t* outputVectorPtr = outputVector;
+-
+- float min_val = SHRT_MIN;
+- float max_val = SHRT_MAX;
+- float r;
+-
+- __m128 vScalar = _mm_set_ps1(scalar);
+- __m128 inputVal1, inputVal2;
+- __m128i intInputVal1, intInputVal2;
+- __m128 ret1, ret2;
+- __m128 vmin_val = _mm_set_ps1(min_val);
+- __m128 vmax_val = _mm_set_ps1(max_val);
+-
+- for(;number < eighthPoints; number++){
+- inputVal1 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
+- inputVal2 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
+-
+- // Scale and clip
+- ret1 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
+- ret2 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal2, vScalar), vmax_val), vmin_val);
+-
+- intInputVal1 = _mm_cvtps_epi32(ret1);
+- intInputVal2 = _mm_cvtps_epi32(ret2);
+-
+- intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
+-
+- _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1);
+- outputVectorPtr += 8;
+- }
+-
+- number = eighthPoints * 8;
+- for(; number < num_points; number++){
+- r = inputVector[number] * scalar;
+- if(r > max_val)
+- r = max_val;
+- else if(r < min_val)
+- r = min_val;
+- outputVector[number] = (int16_t)rintf(r);
+- }
++ unsigned int number = 0;
++
++ const unsigned int eighthPoints = num_points / 8;
++
++ const float* inputVectorPtr = (const float*)inputVector;
++ int16_t* outputVectorPtr = outputVector;
++
++ float min_val = SHRT_MIN;
++ float max_val = SHRT_MAX;
++ float r;
++
++ __m128 vScalar = _mm_set_ps1(scalar);
++ __m128 inputVal1, inputVal2;
++ __m128i intInputVal1, intInputVal2;
++ __m128 ret1, ret2;
++ __m128 vmin_val = _mm_set_ps1(min_val);
++ __m128 vmax_val = _mm_set_ps1(max_val);
++
++ for (; number < eighthPoints; number++) {
++ inputVal1 = _mm_load_ps(inputVectorPtr);
++ inputVectorPtr += 4;
++ inputVal2 = _mm_load_ps(inputVectorPtr);
++ inputVectorPtr += 4;
++
++ // Scale and clip
++ ret1 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
++ ret2 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal2, vScalar), vmax_val), vmin_val);
++
++ intInputVal1 = _mm_cvtps_epi32(ret1);
++ intInputVal2 = _mm_cvtps_epi32(ret2);
++
++ intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
++
++ _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1);
++ outputVectorPtr += 8;
++ }
++
++ number = eighthPoints * 8;
++ for (; number < num_points; number++) {
++ r = inputVector[number] * scalar;
++ if (r > max_val)
++ r = max_val;
++ else if (r < min_val)
++ r = min_val;
++ outputVector[number] = (int16_t)rintf(r);
++ }
+ }
+ #endif /* LV_HAVE_SSE2 */
+
+@@ -493,76 +516,78 @@ volk_32f_s32f_convert_16i_a_sse2(int16_t* outputVector, const float* inputVector
+ #ifdef LV_HAVE_SSE
+ #include <xmmintrin.h>
+
+-static inline void
+-volk_32f_s32f_convert_16i_a_sse(int16_t* outputVector, const float* inputVector,
+- const float scalar, unsigned int num_points)
++static inline void volk_32f_s32f_convert_16i_a_sse(int16_t* outputVector,
++ const float* inputVector,
++ const float scalar,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+-
+- const unsigned int quarterPoints = num_points / 4;
+-
+- const float* inputVectorPtr = (const float*)inputVector;
+- int16_t* outputVectorPtr = outputVector;
+-
+- float min_val = SHRT_MIN;
+- float max_val = SHRT_MAX;
+- float r;
+-
+- __m128 vScalar = _mm_set_ps1(scalar);
+- __m128 ret;
+- __m128 vmin_val = _mm_set_ps1(min_val);
+- __m128 vmax_val = _mm_set_ps1(max_val);
+-
+- __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4];
+-
+- for(;number < quarterPoints; number++){
+- ret = _mm_load_ps(inputVectorPtr);
+- inputVectorPtr += 4;
+-
+- // Scale and clip
+- ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val);
+-
+- _mm_store_ps(outputFloatBuffer, ret);
+- *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[0]);
+- *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[1]);
+- *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[2]);
+- *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[3]);
+- }
+-
+- number = quarterPoints * 4;
+- for(; number < num_points; number++){
+- r = inputVector[number] * scalar;
+- if(r > max_val)
+- r = max_val;
+- else if(r < min_val)
+- r = min_val;
+- outputVector[number] = (int16_t)rintf(r);
+- }
++ unsigned int number = 0;
++
++ const unsigned int quarterPoints = num_points / 4;
++
++ const float* inputVectorPtr = (const float*)inputVector;
++ int16_t* outputVectorPtr = outputVector;
++
++ float min_val = SHRT_MIN;
++ float max_val = SHRT_MAX;
++ float r;
++
++ __m128 vScalar = _mm_set_ps1(scalar);
++ __m128 ret;
++ __m128 vmin_val = _mm_set_ps1(min_val);
++ __m128 vmax_val = _mm_set_ps1(max_val);
++
++ __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4];
++
++ for (; number < quarterPoints; number++) {
++ ret = _mm_load_ps(inputVectorPtr);
++ inputVectorPtr += 4;
++
++ // Scale and clip
++ ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val);
++
++ _mm_store_ps(outputFloatBuffer, ret);
++ *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[0]);
++ *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[1]);
++ *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[2]);
++ *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[3]);
++ }
++
++ number = quarterPoints * 4;
++ for (; number < num_points; number++) {
++ r = inputVector[number] * scalar;
++ if (r > max_val)
++ r = max_val;
++ else if (r < min_val)
++ r = min_val;
++ outputVector[number] = (int16_t)rintf(r);
++ }
+ }
+ #endif /* LV_HAVE_SSE */
+
+
+ #ifdef LV_HAVE_GENERIC
+
+-static inline void
+-volk_32f_s32f_convert_16i_a_generic(int16_t* outputVector, const float* inputVector,
+- const float scalar, unsigned int num_points)
++static inline void volk_32f_s32f_convert_16i_a_generic(int16_t* outputVector,
++ const float* inputVector,
++ const float scalar,
++ unsigned int num_points)
+ {
+- int16_t* outputVectorPtr = outputVector;
+- const float* inputVectorPtr = inputVector;
+- unsigned int number = 0;
+- float min_val = SHRT_MIN;
+- float max_val = SHRT_MAX;
+- float r;
+-
+- for(number = 0; number < num_points; number++){
+- r = *inputVectorPtr++ * scalar;
+- if(r < min_val)
+- r = min_val;
+- else if(r > max_val)
+- r = max_val;
+- *outputVectorPtr++ = (int16_t)rintf(r);
+- }
++ int16_t* outputVectorPtr = outputVector;
++ const float* inputVectorPtr = inputVector;
++ unsigned int number = 0;
++ float min_val = SHRT_MIN;
++ float max_val = SHRT_MAX;
++ float r;
++
++ for (number = 0; number < num_points; number++) {
++ r = *inputVectorPtr++ * scalar;
++ if (r < min_val)
++ r = min_val;
++ else if (r > max_val)
++ r = max_val;
++ *outputVectorPtr++ = (int16_t)rintf(r);
++ }
+ }
+ #endif /* LV_HAVE_GENERIC */
+
+diff --git a/kernels/volk/volk_32f_s32f_convert_32i.h b/kernels/volk/volk_32f_s32f_convert_32i.h
+index d2a65a0..d5f7cd4 100644
+--- a/kernels/volk/volk_32f_s32f_convert_32i.h
++++ b/kernels/volk/volk_32f_s32f_convert_32i.h
+@@ -30,8 +30,8 @@
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+- * void volk_32f_s32f_convert_32i(int32_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points)
+- * \endcode
++ * void volk_32f_s32f_convert_32i(int32_t* outputVector, const float* inputVector, const
++ * float scalar, unsigned int num_points) \endcode
+ *
+ * \b Inputs
+ * \li inputVector: the input vector of floats.
+@@ -77,46 +77,49 @@
+ #ifdef LV_HAVE_AVX
+ #include <immintrin.h>
+
+-static inline void
+-volk_32f_s32f_convert_32i_u_avx(int32_t* outputVector, const float* inputVector,
+- const float scalar, unsigned int num_points)
++static inline void volk_32f_s32f_convert_32i_u_avx(int32_t* outputVector,
++ const float* inputVector,
++ const float scalar,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+-
+- const unsigned int eighthPoints = num_points / 8;
+-
+- const float* inputVectorPtr = (const float*)inputVector;
+- int32_t* outputVectorPtr = outputVector;
+-
+- float min_val = INT_MIN;
+- float max_val = INT_MAX;
+- float r;
+-
+- __m256 vScalar = _mm256_set1_ps(scalar);
+- __m256 inputVal1;
+- __m256i intInputVal1;
+- __m256 vmin_val = _mm256_set1_ps(min_val);
+- __m256 vmax_val = _mm256_set1_ps(max_val);
+-
+- for(;number < eighthPoints; number++){
+- inputVal1 = _mm256_loadu_ps(inputVectorPtr); inputVectorPtr += 8;
+-
+- inputVal1 = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
+- intInputVal1 = _mm256_cvtps_epi32(inputVal1);
+-
+- _mm256_storeu_si256((__m256i*)outputVectorPtr, intInputVal1);
+- outputVectorPtr += 8;
+- }
+-
+- number = eighthPoints * 8;
+- for(; number < num_points; number++){
+- r = inputVector[number] * scalar;
+- if(r > max_val)
+- r = max_val;
+- else if(r < min_val)
+- r = min_val;
+- outputVector[number] = (int32_t)rintf(r);
+- }
++ unsigned int number = 0;
++
++ const unsigned int eighthPoints = num_points / 8;
++
++ const float* inputVectorPtr = (const float*)inputVector;
++ int32_t* outputVectorPtr = outputVector;
++
++ float min_val = INT_MIN;
++ float max_val = INT_MAX;
++ float r;
++
++ __m256 vScalar = _mm256_set1_ps(scalar);
++ __m256 inputVal1;
++ __m256i intInputVal1;
++ __m256 vmin_val = _mm256_set1_ps(min_val);
++ __m256 vmax_val = _mm256_set1_ps(max_val);
++
++ for (; number < eighthPoints; number++) {
++ inputVal1 = _mm256_loadu_ps(inputVectorPtr);
++ inputVectorPtr += 8;
++
++ inputVal1 = _mm256_max_ps(
++ _mm256_min_ps(_mm256_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
++ intInputVal1 = _mm256_cvtps_epi32(inputVal1);
++
++ _mm256_storeu_si256((__m256i*)outputVectorPtr, intInputVal1);
++ outputVectorPtr += 8;
++ }
++
++ number = eighthPoints * 8;
++ for (; number < num_points; number++) {
++ r = inputVector[number] * scalar;
++ if (r > max_val)
++ r = max_val;
++ else if (r < min_val)
++ r = min_val;
++ outputVector[number] = (int32_t)rintf(r);
++ }
+ }
+
+ #endif /* LV_HAVE_AVX */
+@@ -124,46 +127,49 @@ volk_32f_s32f_convert_32i_u_avx(int32_t* outputVector, const float* inputVector,
+ #ifdef LV_HAVE_SSE2
+ #include <emmintrin.h>
+
+-static inline void
+-volk_32f_s32f_convert_32i_u_sse2(int32_t* outputVector, const float* inputVector,
+- const float scalar, unsigned int num_points)
++static inline void volk_32f_s32f_convert_32i_u_sse2(int32_t* outputVector,
++ const float* inputVector,
++ const float scalar,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+-
+- const unsigned int quarterPoints = num_points / 4;
+-
+- const float* inputVectorPtr = (const float*)inputVector;
+- int32_t* outputVectorPtr = outputVector;
+-
+- float min_val = INT_MIN;
+- float max_val = INT_MAX;
+- float r;
+-
+- __m128 vScalar = _mm_set_ps1(scalar);
+- __m128 inputVal1;
+- __m128i intInputVal1;
+- __m128 vmin_val = _mm_set_ps1(min_val);
+- __m128 vmax_val = _mm_set_ps1(max_val);
+-
+- for(;number < quarterPoints; number++){
+- inputVal1 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
+-
+- inputVal1 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
+- intInputVal1 = _mm_cvtps_epi32(inputVal1);
+-
+- _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1);
+- outputVectorPtr += 4;
+- }
+-
+- number = quarterPoints * 4;
+- for(; number < num_points; number++){
+- r = inputVector[number] * scalar;
+- if(r > max_val)
+- r = max_val;
+- else if(r < min_val)
+- r = min_val;
+- outputVector[number] = (int32_t)rintf(r);
+- }
++ unsigned int number = 0;
++
++ const unsigned int quarterPoints = num_points / 4;
++
++ const float* inputVectorPtr = (const float*)inputVector;
++ int32_t* outputVectorPtr = outputVector;
++
++ float min_val = INT_MIN;
++ float max_val = INT_MAX;
++ float r;
++
++ __m128 vScalar = _mm_set_ps1(scalar);
++ __m128 inputVal1;
++ __m128i intInputVal1;
++ __m128 vmin_val = _mm_set_ps1(min_val);
++ __m128 vmax_val = _mm_set_ps1(max_val);
++
++ for (; number < quarterPoints; number++) {
++ inputVal1 = _mm_loadu_ps(inputVectorPtr);
++ inputVectorPtr += 4;
++
++ inputVal1 =
++ _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
++ intInputVal1 = _mm_cvtps_epi32(inputVal1);
++
++ _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1);
++ outputVectorPtr += 4;
++ }
++
++ number = quarterPoints * 4;
++ for (; number < num_points; number++) {
++ r = inputVector[number] * scalar;
++ if (r > max_val)
++ r = max_val;
++ else if (r < min_val)
++ r = min_val;
++ outputVector[number] = (int32_t)rintf(r);
++ }
+ }
+
+ #endif /* LV_HAVE_SSE2 */
+@@ -172,50 +178,51 @@ volk_32f_s32f_convert_32i_u_sse2(int32_t* outputVector, const float* inputVector
+ #ifdef LV_HAVE_SSE
+ #include <xmmintrin.h>
+
+-static inline void
+-volk_32f_s32f_convert_32i_u_sse(int32_t* outputVector, const float* inputVector,
+- const float scalar, unsigned int num_points)
++static inline void volk_32f_s32f_convert_32i_u_sse(int32_t* outputVector,
++ const float* inputVector,
++ const float scalar,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+-
+- const unsigned int quarterPoints = num_points / 4;
+-
+- const float* inputVectorPtr = (const float*)inputVector;
+- int32_t* outputVectorPtr = outputVector;
+-
+- float min_val = INT_MIN;
+- float max_val = INT_MAX;
+- float r;
+-
+- __m128 vScalar = _mm_set_ps1(scalar);
+- __m128 ret;
+- __m128 vmin_val = _mm_set_ps1(min_val);
+- __m128 vmax_val = _mm_set_ps1(max_val);
+-
+- __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4];
+-
+- for(;number < quarterPoints; number++){
+- ret = _mm_loadu_ps(inputVectorPtr);
+- inputVectorPtr += 4;
+-
+- ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val);
+-
+- _mm_store_ps(outputFloatBuffer, ret);
+- *outputVectorPtr++ = (int32_t)rintf(outputFloatBuffer[0]);
+- *outputVectorPtr++ = (int32_t)rintf(outputFloatBuffer[1]);
+- *outputVectorPtr++ = (int32_t)rintf(outputFloatBuffer[2]);
+- *outputVectorPtr++ = (int32_t)rintf(outputFloatBuffer[3]);
+- }
+-
+- number = quarterPoints * 4;
+- for(; number < num_points; number++){
+- r = inputVector[number] * scalar;
+- if(r > max_val)
+- r = max_val;
+- else if(r < min_val)
+- r = min_val;
+- outputVector[number] = (int32_t)rintf(r);
+- }
++ unsigned int number = 0;
++
++ const unsigned int quarterPoints = num_points / 4;
++
++ const float* inputVectorPtr = (const float*)inputVector;
++ int32_t* outputVectorPtr = outputVector;
++
++ float min_val = INT_MIN;
++ float max_val = INT_MAX;
++ float r;
++
++ __m128 vScalar = _mm_set_ps1(scalar);
++ __m128 ret;
++ __m128 vmin_val = _mm_set_ps1(min_val);
++ __m128 vmax_val = _mm_set_ps1(max_val);
++
++ __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4];
++
++ for (; number < quarterPoints; number++) {
++ ret = _mm_loadu_ps(inputVectorPtr);
++ inputVectorPtr += 4;
++
++ ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val);
++
++ _mm_store_ps(outputFloatBuffer, ret);
++ *outputVectorPtr++ = (int32_t)rintf(outputFloatBuffer[0]);
++ *outputVectorPtr++ = (int32_t)rintf(outputFloatBuffer[1]);
++ *outputVectorPtr++ = (int32_t)rintf(outputFloatBuffer[2]);
++ *outputVectorPtr++ = (int32_t)rintf(outputFloatBuffer[3]);
++ }
++
++ number = quarterPoints * 4;
++ for (; number < num_points; number++) {
++ r = inputVector[number] * scalar;
++ if (r > max_val)
++ r = max_val;
++ else if (r < min_val)
++ r = min_val;
++ outputVector[number] = (int32_t)rintf(r);
++ }
+ }
+
+ #endif /* LV_HAVE_SSE */
+@@ -223,82 +230,85 @@ volk_32f_s32f_convert_32i_u_sse(int32_t* outputVector, const float* inputVector,
+
+ #ifdef LV_HAVE_GENERIC
+
+-static inline void
+-volk_32f_s32f_convert_32i_generic(int32_t* outputVector, const float* inputVector,
+- const float scalar, unsigned int num_points)
++static inline void volk_32f_s32f_convert_32i_generic(int32_t* outputVector,
++ const float* inputVector,
++ const float scalar,
++ unsigned int num_points)
+ {
+- int32_t* outputVectorPtr = outputVector;
+- const float* inputVectorPtr = inputVector;
+- unsigned int number = 0;
+- float min_val = INT_MIN;
+- float max_val = INT_MAX;
+- float r;
+-
+- for(number = 0; number < num_points; number++){
+- r = *inputVectorPtr++ * scalar;
+- if(r > max_val)
+- r = max_val;
+- else if(r < min_val)
+- r = min_val;
+- *outputVectorPtr++ = (int32_t)rintf(r);
+- }
++ int32_t* outputVectorPtr = outputVector;
++ const float* inputVectorPtr = inputVector;
++ unsigned int number = 0;
++ float min_val = INT_MIN;
++ float max_val = INT_MAX;
++ float r;
++
++ for (number = 0; number < num_points; number++) {
++ r = *inputVectorPtr++ * scalar;
++ if (r > max_val)
++ r = max_val;
++ else if (r < min_val)
++ r = min_val;
++ *outputVectorPtr++ = (int32_t)rintf(r);
++ }
+ }
+
+ #endif /* LV_HAVE_GENERIC */
+
+
+-
+ #endif /* INCLUDED_volk_32f_s32f_convert_32i_u_H */
+ #ifndef INCLUDED_volk_32f_s32f_convert_32i_a_H
+ #define INCLUDED_volk_32f_s32f_convert_32i_a_H
+
+-#include <volk/volk_common.h>
+ #include <inttypes.h>
+ #include <stdio.h>
++#include <volk/volk_common.h>
+
+ #ifdef LV_HAVE_AVX
+ #include <immintrin.h>
+
+-static inline void
+-volk_32f_s32f_convert_32i_a_avx(int32_t* outputVector, const float* inputVector,
+- const float scalar, unsigned int num_points)
++static inline void volk_32f_s32f_convert_32i_a_avx(int32_t* outputVector,
++ const float* inputVector,
++ const float scalar,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+-
+- const unsigned int eighthPoints = num_points / 8;
+-
+- const float* inputVectorPtr = (const float*)inputVector;
+- int32_t* outputVectorPtr = outputVector;
+-
+- float min_val = INT_MIN;
+- float max_val = INT_MAX;
+- float r;
+-
+- __m256 vScalar = _mm256_set1_ps(scalar);
+- __m256 inputVal1;
+- __m256i intInputVal1;
+- __m256 vmin_val = _mm256_set1_ps(min_val);
+- __m256 vmax_val = _mm256_set1_ps(max_val);
+-
+- for(;number < eighthPoints; number++){
+- inputVal1 = _mm256_load_ps(inputVectorPtr); inputVectorPtr += 8;
+-
+- inputVal1 = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
+- intInputVal1 = _mm256_cvtps_epi32(inputVal1);
+-
+- _mm256_store_si256((__m256i*)outputVectorPtr, intInputVal1);
+- outputVectorPtr += 8;
+- }
+-
+- number = eighthPoints * 8;
+- for(; number < num_points; number++){
+- r = inputVector[number] * scalar;
+- if(r > max_val)
+- r = max_val;
+- else if(r < min_val)
+- r = min_val;
+- outputVector[number] = (int32_t)rintf(r);
+- }
++ unsigned int number = 0;
++
++ const unsigned int eighthPoints = num_points / 8;
++
++ const float* inputVectorPtr = (const float*)inputVector;
++ int32_t* outputVectorPtr = outputVector;
++
++ float min_val = INT_MIN;
++ float max_val = INT_MAX;
++ float r;
++
++ __m256 vScalar = _mm256_set1_ps(scalar);
++ __m256 inputVal1;
++ __m256i intInputVal1;
++ __m256 vmin_val = _mm256_set1_ps(min_val);
++ __m256 vmax_val = _mm256_set1_ps(max_val);
++
++ for (; number < eighthPoints; number++) {
++ inputVal1 = _mm256_load_ps(inputVectorPtr);
++ inputVectorPtr += 8;
++
++ inputVal1 = _mm256_max_ps(
++ _mm256_min_ps(_mm256_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
++ intInputVal1 = _mm256_cvtps_epi32(inputVal1);
++
++ _mm256_store_si256((__m256i*)outputVectorPtr, intInputVal1);
++ outputVectorPtr += 8;
++ }
++
++ number = eighthPoints * 8;
++ for (; number < num_points; number++) {
++ r = inputVector[number] * scalar;
++ if (r > max_val)
++ r = max_val;
++ else if (r < min_val)
++ r = min_val;
++ outputVector[number] = (int32_t)rintf(r);
++ }
+ }
+
+ #endif /* LV_HAVE_AVX */
+@@ -307,46 +317,49 @@ volk_32f_s32f_convert_32i_a_avx(int32_t* outputVector, const float* inputVector,
+ #ifdef LV_HAVE_SSE2
+ #include <emmintrin.h>
+
+-static inline void
+-volk_32f_s32f_convert_32i_a_sse2(int32_t* outputVector, const float* inputVector,
+- const float scalar, unsigned int num_points)
++static inline void volk_32f_s32f_convert_32i_a_sse2(int32_t* outputVector,
++ const float* inputVector,
++ const float scalar,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+-
+- const unsigned int quarterPoints = num_points / 4;
+-
+- const float* inputVectorPtr = (const float*)inputVector;
+- int32_t* outputVectorPtr = outputVector;
+-
+- float min_val = INT_MIN;
+- float max_val = INT_MAX;
+- float r;
+-
+- __m128 vScalar = _mm_set_ps1(scalar);
+- __m128 inputVal1;
+- __m128i intInputVal1;
+- __m128 vmin_val = _mm_set_ps1(min_val);
+- __m128 vmax_val = _mm_set_ps1(max_val);
+-
+- for(;number < quarterPoints; number++){
+- inputVal1 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
+-
+- inputVal1 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
+- intInputVal1 = _mm_cvtps_epi32(inputVal1);
+-
+- _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1);
+- outputVectorPtr += 4;
+- }
+-
+- number = quarterPoints * 4;
+- for(; number < num_points; number++){
+- r = inputVector[number] * scalar;
+- if(r > max_val)
+- r = max_val;
+- else if(r < min_val)
+- r = min_val;
+- outputVector[number] = (int32_t)rintf(r);
+- }
++ unsigned int number = 0;
++
++ const unsigned int quarterPoints = num_points / 4;
++
++ const float* inputVectorPtr = (const float*)inputVector;
++ int32_t* outputVectorPtr = outputVector;
++
++ float min_val = INT_MIN;
++ float max_val = INT_MAX;
++ float r;
++
++ __m128 vScalar = _mm_set_ps1(scalar);
++ __m128 inputVal1;
++ __m128i intInputVal1;
++ __m128 vmin_val = _mm_set_ps1(min_val);
++ __m128 vmax_val = _mm_set_ps1(max_val);
++
++ for (; number < quarterPoints; number++) {
++ inputVal1 = _mm_load_ps(inputVectorPtr);
++ inputVectorPtr += 4;
++
++ inputVal1 =
++ _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
++ intInputVal1 = _mm_cvtps_epi32(inputVal1);
++
++ _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1);
++ outputVectorPtr += 4;
++ }
++
++ number = quarterPoints * 4;
++ for (; number < num_points; number++) {
++ r = inputVector[number] * scalar;
++ if (r > max_val)
++ r = max_val;
++ else if (r < min_val)
++ r = min_val;
++ outputVector[number] = (int32_t)rintf(r);
++ }
+ }
+
+ #endif /* LV_HAVE_SSE2 */
+@@ -355,50 +368,51 @@ volk_32f_s32f_convert_32i_a_sse2(int32_t* outputVector, const float* inputVector
+ #ifdef LV_HAVE_SSE
+ #include <xmmintrin.h>
+
+-static inline void
+-volk_32f_s32f_convert_32i_a_sse(int32_t* outputVector, const float* inputVector,
+- const float scalar, unsigned int num_points)
++static inline void volk_32f_s32f_convert_32i_a_sse(int32_t* outputVector,
++ const float* inputVector,
++ const float scalar,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+-
+- const unsigned int quarterPoints = num_points / 4;
+-
+- const float* inputVectorPtr = (const float*)inputVector;
+- int32_t* outputVectorPtr = outputVector;
+-
+- float min_val = INT_MIN;
+- float max_val = INT_MAX;
+- float r;
+-
+- __m128 vScalar = _mm_set_ps1(scalar);
+- __m128 ret;
+- __m128 vmin_val = _mm_set_ps1(min_val);
+- __m128 vmax_val = _mm_set_ps1(max_val);
+-
+- __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4];
+-
+- for(;number < quarterPoints; number++){
+- ret = _mm_load_ps(inputVectorPtr);
+- inputVectorPtr += 4;
+-
+- ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val);
+-
+- _mm_store_ps(outputFloatBuffer, ret);
+- *outputVectorPtr++ = (int32_t)rintf(outputFloatBuffer[0]);
+- *outputVectorPtr++ = (int32_t)rintf(outputFloatBuffer[1]);
+- *outputVectorPtr++ = (int32_t)rintf(outputFloatBuffer[2]);
+- *outputVectorPtr++ = (int32_t)rintf(outputFloatBuffer[3]);
+- }
+-
+- number = quarterPoints * 4;
+- for(; number < num_points; number++){
+- r = inputVector[number] * scalar;
+- if(r > max_val)
+- r = max_val;
+- else if(r < min_val)
+- r = min_val;
+- outputVector[number] = (int32_t)rintf(r);
+- }
++ unsigned int number = 0;
++
++ const unsigned int quarterPoints = num_points / 4;
++
++ const float* inputVectorPtr = (const float*)inputVector;
++ int32_t* outputVectorPtr = outputVector;
++
++ float min_val = INT_MIN;
++ float max_val = INT_MAX;
++ float r;
++
++ __m128 vScalar = _mm_set_ps1(scalar);
++ __m128 ret;
++ __m128 vmin_val = _mm_set_ps1(min_val);
++ __m128 vmax_val = _mm_set_ps1(max_val);
++
++ __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4];
++
++ for (; number < quarterPoints; number++) {
++ ret = _mm_load_ps(inputVectorPtr);
++ inputVectorPtr += 4;
++
++ ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val);
++
++ _mm_store_ps(outputFloatBuffer, ret);
++ *outputVectorPtr++ = (int32_t)rintf(outputFloatBuffer[0]);
++ *outputVectorPtr++ = (int32_t)rintf(outputFloatBuffer[1]);
++ *outputVectorPtr++ = (int32_t)rintf(outputFloatBuffer[2]);
++ *outputVectorPtr++ = (int32_t)rintf(outputFloatBuffer[3]);
++ }
++
++ number = quarterPoints * 4;
++ for (; number < num_points; number++) {
++ r = inputVector[number] * scalar;
++ if (r > max_val)
++ r = max_val;
++ else if (r < min_val)
++ r = min_val;
++ outputVector[number] = (int32_t)rintf(r);
++ }
+ }
+
+ #endif /* LV_HAVE_SSE */
+@@ -406,25 +420,26 @@ volk_32f_s32f_convert_32i_a_sse(int32_t* outputVector, const float* inputVector,
+
+ #ifdef LV_HAVE_GENERIC
+
+-static inline void
+-volk_32f_s32f_convert_32i_a_generic(int32_t* outputVector, const float* inputVector,
+- const float scalar, unsigned int num_points)
++static inline void volk_32f_s32f_convert_32i_a_generic(int32_t* outputVector,
++ const float* inputVector,
++ const float scalar,
++ unsigned int num_points)
+ {
+- int32_t* outputVectorPtr = outputVector;
+- const float* inputVectorPtr = inputVector;
+- unsigned int number = 0;
+- float min_val = INT_MIN;
+- float max_val = INT_MAX;
+- float r;
+-
+- for(number = 0; number < num_points; number++){
+- r = *inputVectorPtr++ * scalar;
+- if(r > max_val)
+- r = max_val;
+- else if(r < min_val)
+- r = min_val;
+- *outputVectorPtr++ = (int32_t)rintf(r);
+- }
++ int32_t* outputVectorPtr = outputVector;
++ const float* inputVectorPtr = inputVector;
++ unsigned int number = 0;
++ float min_val = INT_MIN;
++ float max_val = INT_MAX;
++ float r;
++
++ for (number = 0; number < num_points; number++) {
++ r = *inputVectorPtr++ * scalar;
++ if (r > max_val)
++ r = max_val;
++ else if (r < min_val)
++ r = min_val;
++ *outputVectorPtr++ = (int32_t)rintf(r);
++ }
+ }
+
+ #endif /* LV_HAVE_GENERIC */
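The reformatting does not change how the dispatcher is called. A minimal usage sketch for volk_32f_s32f_convert_32i, following the volk_malloc / volk_get_alignment pattern the headers' own examples use (buffer names and the scale value are illustrative, not part of this patch):

    #include <stdint.h>
    #include <volk/volk.h>

    int main(void)
    {
        int N = 10;
        unsigned int alignment = volk_get_alignment();
        float* in = (float*)volk_malloc(sizeof(float) * N, alignment);
        int32_t* out = (int32_t*)volk_malloc(sizeof(int32_t) * N, alignment);

        for (int i = 0; i < N; i++) {
            in[i] = 0.5f * (float)i; /* arbitrary test data */
        }

        /* Scale by 4, clamp to the int32_t range, round to nearest. */
        volk_32f_s32f_convert_32i(out, in, 4.0f, N);

        volk_free(in);
        volk_free(out);
        return 0;
    }
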
+diff --git a/kernels/volk/volk_32f_s32f_convert_8i.h b/kernels/volk/volk_32f_s32f_convert_8i.h
+index 2a1669c..242c3bd 100644
+--- a/kernels/volk/volk_32f_s32f_convert_8i.h
++++ b/kernels/volk/volk_32f_s32f_convert_8i.h
+@@ -30,7 +30,8 @@
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+- * void volk_32f_s32f_convert_8i(int8_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points)
++ * void volk_32f_s32f_convert_8i(int8_t* outputVector, const float* inputVector, const
++ float scalar, unsigned int num_points)
+ * \endcode
+ *
+ * \b Inputs
+@@ -42,7 +43,8 @@
+ * \li outputVector: The output vector.
+ *
+ * \b Example
+- * Convert floats from [-1,1] to 16-bit integers with a scale of 5 to maintain smallest delta
++ * Convert floats from [-1,1] to 16-bit integers with a scale of 5 to maintain smallest
++ delta
+ * int N = 10;
+ * unsigned int alignment = volk_get_alignment();
+ * float* increasing = (float*)volk_malloc(sizeof(float)*N, alignment);
+@@ -74,77 +76,86 @@
+ #include <inttypes.h>
+ #include <stdio.h>
+
+-static inline void
+-volk_32f_s32f_convert_8i_single(int8_t* out, const float in){
+- float min_val = CHAR_MIN;
+- float max_val = CHAR_MAX;
+- if(in > max_val){
+- *out = (int8_t)(max_val);
+- }else if(in < min_val){
+- *out = (int8_t)(min_val);
+- }else{
+- *out = (int8_t)(rintf(in));
+- }
++static inline void volk_32f_s32f_convert_8i_single(int8_t* out, const float in)
++{
++ float min_val = CHAR_MIN;
++ float max_val = CHAR_MAX;
++ if (in > max_val) {
++ *out = (int8_t)(max_val);
++ } else if (in < min_val) {
++ *out = (int8_t)(min_val);
++ } else {
++ *out = (int8_t)(rintf(in));
++ }
+ }
+
+ #ifdef LV_HAVE_AVX2
+ #include <immintrin.h>
+
+-static inline void
+-volk_32f_s32f_convert_8i_u_avx2(int8_t* outputVector, const float* inputVector,
+- const float scalar, unsigned int num_points)
++static inline void volk_32f_s32f_convert_8i_u_avx2(int8_t* outputVector,
++ const float* inputVector,
++ const float scalar,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+-
+- const unsigned int thirtysecondPoints = num_points / 32;
+-
+- const float* inputVectorPtr = (const float*)inputVector;
+- int8_t* outputVectorPtr = outputVector;
+-
+- float min_val = CHAR_MIN;
+- float max_val = CHAR_MAX;
+- float r;
+-
+- __m256 vScalar = _mm256_set1_ps(scalar);
+- __m256 inputVal1, inputVal2, inputVal3, inputVal4;
+- __m256i intInputVal1, intInputVal2, intInputVal3, intInputVal4;
+- __m256 vmin_val = _mm256_set1_ps(min_val);
+- __m256 vmax_val = _mm256_set1_ps(max_val);
+- __m256i intInputVal;
+-
+- for(;number < thirtysecondPoints; number++){
+- inputVal1 = _mm256_loadu_ps(inputVectorPtr); inputVectorPtr += 8;
+- inputVal2 = _mm256_loadu_ps(inputVectorPtr); inputVectorPtr += 8;
+- inputVal3 = _mm256_loadu_ps(inputVectorPtr); inputVectorPtr += 8;
+- inputVal4 = _mm256_loadu_ps(inputVectorPtr); inputVectorPtr += 8;
+-
+- inputVal1 = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
+- inputVal2 = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal2, vScalar), vmax_val), vmin_val);
+- inputVal3 = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal3, vScalar), vmax_val), vmin_val);
+- inputVal4 = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal4, vScalar), vmax_val), vmin_val);
+-
+- intInputVal1 = _mm256_cvtps_epi32(inputVal1);
+- intInputVal2 = _mm256_cvtps_epi32(inputVal2);
+- intInputVal3 = _mm256_cvtps_epi32(inputVal3);
+- intInputVal4 = _mm256_cvtps_epi32(inputVal4);
+-
+- intInputVal1 = _mm256_packs_epi32(intInputVal1, intInputVal2);
+- intInputVal1 = _mm256_permute4x64_epi64(intInputVal1, 0b11011000);
+- intInputVal3 = _mm256_packs_epi32(intInputVal3, intInputVal4);
+- intInputVal3 = _mm256_permute4x64_epi64(intInputVal3, 0b11011000);
+-
+- intInputVal1 = _mm256_packs_epi16(intInputVal1, intInputVal3);
+- intInputVal = _mm256_permute4x64_epi64(intInputVal1, 0b11011000);
+-
+- _mm256_storeu_si256((__m256i*)outputVectorPtr, intInputVal);
+- outputVectorPtr += 32;
+- }
+-
+- number = thirtysecondPoints * 32;
+- for(; number < num_points; number++){
+- r = inputVector[number] * scalar;
+- volk_32f_s32f_convert_8i_single(&outputVector[number], r);
+- }
++ unsigned int number = 0;
++
++ const unsigned int thirtysecondPoints = num_points / 32;
++
++ const float* inputVectorPtr = (const float*)inputVector;
++ int8_t* outputVectorPtr = outputVector;
++
++ float min_val = CHAR_MIN;
++ float max_val = CHAR_MAX;
++ float r;
++
++ __m256 vScalar = _mm256_set1_ps(scalar);
++ __m256 inputVal1, inputVal2, inputVal3, inputVal4;
++ __m256i intInputVal1, intInputVal2, intInputVal3, intInputVal4;
++ __m256 vmin_val = _mm256_set1_ps(min_val);
++ __m256 vmax_val = _mm256_set1_ps(max_val);
++ __m256i intInputVal;
++
++ for (; number < thirtysecondPoints; number++) {
++ inputVal1 = _mm256_loadu_ps(inputVectorPtr);
++ inputVectorPtr += 8;
++ inputVal2 = _mm256_loadu_ps(inputVectorPtr);
++ inputVectorPtr += 8;
++ inputVal3 = _mm256_loadu_ps(inputVectorPtr);
++ inputVectorPtr += 8;
++ inputVal4 = _mm256_loadu_ps(inputVectorPtr);
++ inputVectorPtr += 8;
++
++ inputVal1 = _mm256_max_ps(
++ _mm256_min_ps(_mm256_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
++ inputVal2 = _mm256_max_ps(
++ _mm256_min_ps(_mm256_mul_ps(inputVal2, vScalar), vmax_val), vmin_val);
++ inputVal3 = _mm256_max_ps(
++ _mm256_min_ps(_mm256_mul_ps(inputVal3, vScalar), vmax_val), vmin_val);
++ inputVal4 = _mm256_max_ps(
++ _mm256_min_ps(_mm256_mul_ps(inputVal4, vScalar), vmax_val), vmin_val);
++
++ intInputVal1 = _mm256_cvtps_epi32(inputVal1);
++ intInputVal2 = _mm256_cvtps_epi32(inputVal2);
++ intInputVal3 = _mm256_cvtps_epi32(inputVal3);
++ intInputVal4 = _mm256_cvtps_epi32(inputVal4);
++
++ intInputVal1 = _mm256_packs_epi32(intInputVal1, intInputVal2);
++ intInputVal1 = _mm256_permute4x64_epi64(intInputVal1, 0b11011000);
++ intInputVal3 = _mm256_packs_epi32(intInputVal3, intInputVal4);
++ intInputVal3 = _mm256_permute4x64_epi64(intInputVal3, 0b11011000);
++
++ intInputVal1 = _mm256_packs_epi16(intInputVal1, intInputVal3);
++ intInputVal = _mm256_permute4x64_epi64(intInputVal1, 0b11011000);
++
++ _mm256_storeu_si256((__m256i*)outputVectorPtr, intInputVal);
++ outputVectorPtr += 32;
++ }
++
++ number = thirtysecondPoints * 32;
++ for (; number < num_points; number++) {
++ r = inputVector[number] * scalar;
++ volk_32f_s32f_convert_8i_single(&outputVector[number], r);
++ }
+ }
+
+ #endif /* LV_HAVE_AVX2 */
+@@ -153,57 +164,66 @@ volk_32f_s32f_convert_8i_u_avx2(int8_t* outputVector, const float* inputVector,
+ #ifdef LV_HAVE_SSE2
+ #include <emmintrin.h>
+
+-static inline void
+-volk_32f_s32f_convert_8i_u_sse2(int8_t* outputVector, const float* inputVector,
+- const float scalar, unsigned int num_points)
++static inline void volk_32f_s32f_convert_8i_u_sse2(int8_t* outputVector,
++ const float* inputVector,
++ const float scalar,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+-
+- const unsigned int sixteenthPoints = num_points / 16;
+-
+- const float* inputVectorPtr = (const float*)inputVector;
+- int8_t* outputVectorPtr = outputVector;
+-
+- float min_val = CHAR_MIN;
+- float max_val = CHAR_MAX;
+- float r;
+-
+- __m128 vScalar = _mm_set_ps1(scalar);
+- __m128 inputVal1, inputVal2, inputVal3, inputVal4;
+- __m128i intInputVal1, intInputVal2, intInputVal3, intInputVal4;
+- __m128 vmin_val = _mm_set_ps1(min_val);
+- __m128 vmax_val = _mm_set_ps1(max_val);
+-
+- for(;number < sixteenthPoints; number++){
+- inputVal1 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
+- inputVal2 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
+- inputVal3 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
+- inputVal4 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
+-
+- inputVal1 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
+- inputVal2 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal2, vScalar), vmax_val), vmin_val);
+- inputVal3 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal3, vScalar), vmax_val), vmin_val);
+- inputVal4 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal4, vScalar), vmax_val), vmin_val);
+-
+- intInputVal1 = _mm_cvtps_epi32(inputVal1);
+- intInputVal2 = _mm_cvtps_epi32(inputVal2);
+- intInputVal3 = _mm_cvtps_epi32(inputVal3);
+- intInputVal4 = _mm_cvtps_epi32(inputVal4);
+-
+- intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
+- intInputVal3 = _mm_packs_epi32(intInputVal3, intInputVal4);
+-
+- intInputVal1 = _mm_packs_epi16(intInputVal1, intInputVal3);
+-
+- _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1);
+- outputVectorPtr += 16;
+- }
++ unsigned int number = 0;
++
++ const unsigned int sixteenthPoints = num_points / 16;
++
++ const float* inputVectorPtr = (const float*)inputVector;
++ int8_t* outputVectorPtr = outputVector;
++
++ float min_val = CHAR_MIN;
++ float max_val = CHAR_MAX;
++ float r;
++
++ __m128 vScalar = _mm_set_ps1(scalar);
++ __m128 inputVal1, inputVal2, inputVal3, inputVal4;
++ __m128i intInputVal1, intInputVal2, intInputVal3, intInputVal4;
++ __m128 vmin_val = _mm_set_ps1(min_val);
++ __m128 vmax_val = _mm_set_ps1(max_val);
++
++ for (; number < sixteenthPoints; number++) {
++ inputVal1 = _mm_loadu_ps(inputVectorPtr);
++ inputVectorPtr += 4;
++ inputVal2 = _mm_loadu_ps(inputVectorPtr);
++ inputVectorPtr += 4;
++ inputVal3 = _mm_loadu_ps(inputVectorPtr);
++ inputVectorPtr += 4;
++ inputVal4 = _mm_loadu_ps(inputVectorPtr);
++ inputVectorPtr += 4;
++
++ inputVal1 =
++ _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
++ inputVal2 =
++ _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal2, vScalar), vmax_val), vmin_val);
++ inputVal3 =
++ _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal3, vScalar), vmax_val), vmin_val);
++ inputVal4 =
++ _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal4, vScalar), vmax_val), vmin_val);
++
++ intInputVal1 = _mm_cvtps_epi32(inputVal1);
++ intInputVal2 = _mm_cvtps_epi32(inputVal2);
++ intInputVal3 = _mm_cvtps_epi32(inputVal3);
++ intInputVal4 = _mm_cvtps_epi32(inputVal4);
++
++ intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
++ intInputVal3 = _mm_packs_epi32(intInputVal3, intInputVal4);
++
++ intInputVal1 = _mm_packs_epi16(intInputVal1, intInputVal3);
++
++ _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1);
++ outputVectorPtr += 16;
++ }
+
+- number = sixteenthPoints * 16;
+- for(; number < num_points; number++){
+- r = inputVector[number] * scalar;
+- volk_32f_s32f_convert_8i_single(&outputVector[number], r);
+- }
++ number = sixteenthPoints * 16;
++ for (; number < num_points; number++) {
++ r = inputVector[number] * scalar;
++ volk_32f_s32f_convert_8i_single(&outputVector[number], r);
++ }
+ }
+
+ #endif /* LV_HAVE_SSE2 */
+@@ -212,46 +232,47 @@ volk_32f_s32f_convert_8i_u_sse2(int8_t* outputVector, const float* inputVector,
+ #ifdef LV_HAVE_SSE
+ #include <xmmintrin.h>
+
+-static inline void
+-volk_32f_s32f_convert_8i_u_sse(int8_t* outputVector, const float* inputVector,
+- const float scalar, unsigned int num_points)
++static inline void volk_32f_s32f_convert_8i_u_sse(int8_t* outputVector,
++ const float* inputVector,
++ const float scalar,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+- size_t inner_loop;
++ unsigned int number = 0;
++ size_t inner_loop;
+
+- const unsigned int quarterPoints = num_points / 4;
++ const unsigned int quarterPoints = num_points / 4;
+
+- const float* inputVectorPtr = (const float*)inputVector;
+- int8_t* outputVectorPtr = outputVector;
++ const float* inputVectorPtr = (const float*)inputVector;
++ int8_t* outputVectorPtr = outputVector;
+
+- float min_val = CHAR_MIN;
+- float max_val = CHAR_MAX;
+- float r;
++ float min_val = CHAR_MIN;
++ float max_val = CHAR_MAX;
++ float r;
+
+- __m128 vScalar = _mm_set_ps1(scalar);
+- __m128 ret;
+- __m128 vmin_val = _mm_set_ps1(min_val);
+- __m128 vmax_val = _mm_set_ps1(max_val);
++ __m128 vScalar = _mm_set_ps1(scalar);
++ __m128 ret;
++ __m128 vmin_val = _mm_set_ps1(min_val);
++ __m128 vmax_val = _mm_set_ps1(max_val);
+
+- __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4];
++ __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4];
+
+- for(;number < quarterPoints; number++){
+- ret = _mm_loadu_ps(inputVectorPtr);
+- inputVectorPtr += 4;
++ for (; number < quarterPoints; number++) {
++ ret = _mm_loadu_ps(inputVectorPtr);
++ inputVectorPtr += 4;
+
+- ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val);
++ ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val);
+
+- _mm_store_ps(outputFloatBuffer, ret);
+- for (inner_loop = 0; inner_loop < 4; inner_loop++){
+- *outputVectorPtr++ = (int8_t)(rintf(outputFloatBuffer[inner_loop]));
++ _mm_store_ps(outputFloatBuffer, ret);
++ for (inner_loop = 0; inner_loop < 4; inner_loop++) {
++ *outputVectorPtr++ = (int8_t)(rintf(outputFloatBuffer[inner_loop]));
++ }
+ }
+- }
+
+- number = quarterPoints * 4;
+- for(; number < num_points; number++){
+- r = inputVector[number] * scalar;
+- volk_32f_s32f_convert_8i_single(&outputVector[number], r);
+- }
++ number = quarterPoints * 4;
++ for (; number < num_points; number++) {
++ r = inputVector[number] * scalar;
++ volk_32f_s32f_convert_8i_single(&outputVector[number], r);
++ }
+ }
+
+ #endif /* LV_HAVE_SSE */
+@@ -259,18 +280,19 @@ volk_32f_s32f_convert_8i_u_sse(int8_t* outputVector, const float* inputVector,
+
+ #ifdef LV_HAVE_GENERIC
+
+-static inline void
+-volk_32f_s32f_convert_8i_generic(int8_t* outputVector, const float* inputVector,
+- const float scalar, unsigned int num_points)
++static inline void volk_32f_s32f_convert_8i_generic(int8_t* outputVector,
++ const float* inputVector,
++ const float scalar,
++ unsigned int num_points)
+ {
+- const float* inputVectorPtr = inputVector;
+- unsigned int number = 0;
+- float r;
+-
+- for(number = 0; number < num_points; number++){
+- r = *inputVectorPtr++ * scalar;
+- volk_32f_s32f_convert_8i_single(&outputVector[number], r);
+- }
++ const float* inputVectorPtr = inputVector;
++ unsigned int number = 0;
++ float r;
++
++ for (number = 0; number < num_points; number++) {
++ r = *inputVectorPtr++ * scalar;
++ volk_32f_s32f_convert_8i_single(&outputVector[number], r);
++ }
+ }
+
+ #endif /* LV_HAVE_GENERIC */
+@@ -280,68 +302,77 @@ volk_32f_s32f_convert_8i_generic(int8_t* outputVector, const float* inputVector,
+ #ifndef INCLUDED_volk_32f_s32f_convert_8i_a_H
+ #define INCLUDED_volk_32f_s32f_convert_8i_a_H
+
+-#include <volk/volk_common.h>
+ #include <inttypes.h>
+ #include <stdio.h>
++#include <volk/volk_common.h>
+
+ #ifdef LV_HAVE_AVX2
+ #include <immintrin.h>
+
+-static inline void
+-volk_32f_s32f_convert_8i_a_avx2(int8_t* outputVector, const float* inputVector,
+- const float scalar, unsigned int num_points)
++static inline void volk_32f_s32f_convert_8i_a_avx2(int8_t* outputVector,
++ const float* inputVector,
++ const float scalar,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+-
+- const unsigned int thirtysecondPoints = num_points / 32;
+-
+- const float* inputVectorPtr = (const float*)inputVector;
+- int8_t* outputVectorPtr = outputVector;
+-
+- float min_val = CHAR_MIN;
+- float max_val = CHAR_MAX;
+- float r;
+-
+- __m256 vScalar = _mm256_set1_ps(scalar);
+- __m256 inputVal1, inputVal2, inputVal3, inputVal4;
+- __m256i intInputVal1, intInputVal2, intInputVal3, intInputVal4;
+- __m256 vmin_val = _mm256_set1_ps(min_val);
+- __m256 vmax_val = _mm256_set1_ps(max_val);
+- __m256i intInputVal;
+-
+- for(;number < thirtysecondPoints; number++){
+- inputVal1 = _mm256_load_ps(inputVectorPtr); inputVectorPtr += 8;
+- inputVal2 = _mm256_load_ps(inputVectorPtr); inputVectorPtr += 8;
+- inputVal3 = _mm256_load_ps(inputVectorPtr); inputVectorPtr += 8;
+- inputVal4 = _mm256_load_ps(inputVectorPtr); inputVectorPtr += 8;
+-
+- inputVal1 = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
+- inputVal2 = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal2, vScalar), vmax_val), vmin_val);
+- inputVal3 = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal3, vScalar), vmax_val), vmin_val);
+- inputVal4 = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal4, vScalar), vmax_val), vmin_val);
+-
+- intInputVal1 = _mm256_cvtps_epi32(inputVal1);
+- intInputVal2 = _mm256_cvtps_epi32(inputVal2);
+- intInputVal3 = _mm256_cvtps_epi32(inputVal3);
+- intInputVal4 = _mm256_cvtps_epi32(inputVal4);
+-
+- intInputVal1 = _mm256_packs_epi32(intInputVal1, intInputVal2);
+- intInputVal1 = _mm256_permute4x64_epi64(intInputVal1, 0b11011000);
+- intInputVal3 = _mm256_packs_epi32(intInputVal3, intInputVal4);
+- intInputVal3 = _mm256_permute4x64_epi64(intInputVal3, 0b11011000);
+-
+- intInputVal1 = _mm256_packs_epi16(intInputVal1, intInputVal3);
+- intInputVal = _mm256_permute4x64_epi64(intInputVal1, 0b11011000);
+-
+- _mm256_store_si256((__m256i*)outputVectorPtr, intInputVal);
+- outputVectorPtr += 32;
+- }
+-
+- number = thirtysecondPoints * 32;
+- for(; number < num_points; number++){
+- r = inputVector[number] * scalar;
+- volk_32f_s32f_convert_8i_single(&outputVector[number], r);
+- }
++ unsigned int number = 0;
++
++ const unsigned int thirtysecondPoints = num_points / 32;
++
++ const float* inputVectorPtr = (const float*)inputVector;
++ int8_t* outputVectorPtr = outputVector;
++
++ float min_val = CHAR_MIN;
++ float max_val = CHAR_MAX;
++ float r;
++
++ __m256 vScalar = _mm256_set1_ps(scalar);
++ __m256 inputVal1, inputVal2, inputVal3, inputVal4;
++ __m256i intInputVal1, intInputVal2, intInputVal3, intInputVal4;
++ __m256 vmin_val = _mm256_set1_ps(min_val);
++ __m256 vmax_val = _mm256_set1_ps(max_val);
++ __m256i intInputVal;
++
++ for (; number < thirtysecondPoints; number++) {
++ inputVal1 = _mm256_load_ps(inputVectorPtr);
++ inputVectorPtr += 8;
++ inputVal2 = _mm256_load_ps(inputVectorPtr);
++ inputVectorPtr += 8;
++ inputVal3 = _mm256_load_ps(inputVectorPtr);
++ inputVectorPtr += 8;
++ inputVal4 = _mm256_load_ps(inputVectorPtr);
++ inputVectorPtr += 8;
++
++ inputVal1 = _mm256_max_ps(
++ _mm256_min_ps(_mm256_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
++ inputVal2 = _mm256_max_ps(
++ _mm256_min_ps(_mm256_mul_ps(inputVal2, vScalar), vmax_val), vmin_val);
++ inputVal3 = _mm256_max_ps(
++ _mm256_min_ps(_mm256_mul_ps(inputVal3, vScalar), vmax_val), vmin_val);
++ inputVal4 = _mm256_max_ps(
++ _mm256_min_ps(_mm256_mul_ps(inputVal4, vScalar), vmax_val), vmin_val);
++
++ intInputVal1 = _mm256_cvtps_epi32(inputVal1);
++ intInputVal2 = _mm256_cvtps_epi32(inputVal2);
++ intInputVal3 = _mm256_cvtps_epi32(inputVal3);
++ intInputVal4 = _mm256_cvtps_epi32(inputVal4);
++
++ intInputVal1 = _mm256_packs_epi32(intInputVal1, intInputVal2);
++ intInputVal1 = _mm256_permute4x64_epi64(intInputVal1, 0b11011000);
++ intInputVal3 = _mm256_packs_epi32(intInputVal3, intInputVal4);
++ intInputVal3 = _mm256_permute4x64_epi64(intInputVal3, 0b11011000);
++
++ intInputVal1 = _mm256_packs_epi16(intInputVal1, intInputVal3);
++ intInputVal = _mm256_permute4x64_epi64(intInputVal1, 0b11011000);
++
++ _mm256_store_si256((__m256i*)outputVectorPtr, intInputVal);
++ outputVectorPtr += 32;
++ }
++
++ number = thirtysecondPoints * 32;
++ for (; number < num_points; number++) {
++ r = inputVector[number] * scalar;
++ volk_32f_s32f_convert_8i_single(&outputVector[number], r);
++ }
+ }
+
+ #endif /* LV_HAVE_AVX2 */
+@@ -350,57 +381,66 @@ volk_32f_s32f_convert_8i_a_avx2(int8_t* outputVector, const float* inputVector,
+ #ifdef LV_HAVE_SSE2
+ #include <emmintrin.h>
+
+-static inline void
+-volk_32f_s32f_convert_8i_a_sse2(int8_t* outputVector, const float* inputVector,
+- const float scalar, unsigned int num_points)
++static inline void volk_32f_s32f_convert_8i_a_sse2(int8_t* outputVector,
++ const float* inputVector,
++ const float scalar,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+-
+- const unsigned int sixteenthPoints = num_points / 16;
+-
+- const float* inputVectorPtr = (const float*)inputVector;
+- int8_t* outputVectorPtr = outputVector;
+-
+- float min_val = CHAR_MIN;
+- float max_val = CHAR_MAX;
+- float r;
+-
+- __m128 vScalar = _mm_set_ps1(scalar);
+- __m128 inputVal1, inputVal2, inputVal3, inputVal4;
+- __m128i intInputVal1, intInputVal2, intInputVal3, intInputVal4;
+- __m128 vmin_val = _mm_set_ps1(min_val);
+- __m128 vmax_val = _mm_set_ps1(max_val);
+-
+- for(;number < sixteenthPoints; number++){
+- inputVal1 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
+- inputVal2 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
+- inputVal3 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
+- inputVal4 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
+-
+- inputVal1 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
+- inputVal2 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal2, vScalar), vmax_val), vmin_val);
+- inputVal3 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal3, vScalar), vmax_val), vmin_val);
+- inputVal4 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal4, vScalar), vmax_val), vmin_val);
+-
+- intInputVal1 = _mm_cvtps_epi32(inputVal1);
+- intInputVal2 = _mm_cvtps_epi32(inputVal2);
+- intInputVal3 = _mm_cvtps_epi32(inputVal3);
+- intInputVal4 = _mm_cvtps_epi32(inputVal4);
+-
+- intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
+- intInputVal3 = _mm_packs_epi32(intInputVal3, intInputVal4);
+-
+- intInputVal1 = _mm_packs_epi16(intInputVal1, intInputVal3);
+-
+- _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1);
+- outputVectorPtr += 16;
+- }
++ unsigned int number = 0;
++
++ const unsigned int sixteenthPoints = num_points / 16;
++
++ const float* inputVectorPtr = (const float*)inputVector;
++ int8_t* outputVectorPtr = outputVector;
++
++ float min_val = CHAR_MIN;
++ float max_val = CHAR_MAX;
++ float r;
++
++ __m128 vScalar = _mm_set_ps1(scalar);
++ __m128 inputVal1, inputVal2, inputVal3, inputVal4;
++ __m128i intInputVal1, intInputVal2, intInputVal3, intInputVal4;
++ __m128 vmin_val = _mm_set_ps1(min_val);
++ __m128 vmax_val = _mm_set_ps1(max_val);
++
++ for (; number < sixteenthPoints; number++) {
++ inputVal1 = _mm_load_ps(inputVectorPtr);
++ inputVectorPtr += 4;
++ inputVal2 = _mm_load_ps(inputVectorPtr);
++ inputVectorPtr += 4;
++ inputVal3 = _mm_load_ps(inputVectorPtr);
++ inputVectorPtr += 4;
++ inputVal4 = _mm_load_ps(inputVectorPtr);
++ inputVectorPtr += 4;
++
++ inputVal1 =
++ _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
++ inputVal2 =
++ _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal2, vScalar), vmax_val), vmin_val);
++ inputVal3 =
++ _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal3, vScalar), vmax_val), vmin_val);
++ inputVal4 =
++ _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal4, vScalar), vmax_val), vmin_val);
++
++ intInputVal1 = _mm_cvtps_epi32(inputVal1);
++ intInputVal2 = _mm_cvtps_epi32(inputVal2);
++ intInputVal3 = _mm_cvtps_epi32(inputVal3);
++ intInputVal4 = _mm_cvtps_epi32(inputVal4);
++
++ intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
++ intInputVal3 = _mm_packs_epi32(intInputVal3, intInputVal4);
++
++ intInputVal1 = _mm_packs_epi16(intInputVal1, intInputVal3);
++
++ _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1);
++ outputVectorPtr += 16;
++ }
+
+- number = sixteenthPoints * 16;
+- for(; number < num_points; number++){
+- r = inputVector[number] * scalar;
+- volk_32f_s32f_convert_8i_single(&outputVector[number], r);
+- }
++ number = sixteenthPoints * 16;
++ for (; number < num_points; number++) {
++ r = inputVector[number] * scalar;
++ volk_32f_s32f_convert_8i_single(&outputVector[number], r);
++ }
+ }
+ #endif /* LV_HAVE_SSE2 */
+
+@@ -408,46 +448,47 @@ volk_32f_s32f_convert_8i_a_sse2(int8_t* outputVector, const float* inputVector,
+ #ifdef LV_HAVE_SSE
+ #include <xmmintrin.h>
+
+-static inline void
+-volk_32f_s32f_convert_8i_a_sse(int8_t* outputVector, const float* inputVector,
+- const float scalar, unsigned int num_points)
++static inline void volk_32f_s32f_convert_8i_a_sse(int8_t* outputVector,
++ const float* inputVector,
++ const float scalar,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+- size_t inner_loop;
++ unsigned int number = 0;
++ size_t inner_loop;
+
+- const unsigned int quarterPoints = num_points / 4;
++ const unsigned int quarterPoints = num_points / 4;
+
+- const float* inputVectorPtr = (const float*)inputVector;
++ const float* inputVectorPtr = (const float*)inputVector;
+
+- float min_val = CHAR_MIN;
+- float max_val = CHAR_MAX;
+- float r;
++ float min_val = CHAR_MIN;
++ float max_val = CHAR_MAX;
++ float r;
+
+- int8_t* outputVectorPtr = outputVector;
+- __m128 vScalar = _mm_set_ps1(scalar);
+- __m128 ret;
+- __m128 vmin_val = _mm_set_ps1(min_val);
+- __m128 vmax_val = _mm_set_ps1(max_val);
++ int8_t* outputVectorPtr = outputVector;
++ __m128 vScalar = _mm_set_ps1(scalar);
++ __m128 ret;
++ __m128 vmin_val = _mm_set_ps1(min_val);
++ __m128 vmax_val = _mm_set_ps1(max_val);
+
+- __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4];
++ __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4];
+
+- for(;number < quarterPoints; number++){
+- ret = _mm_load_ps(inputVectorPtr);
+- inputVectorPtr += 4;
++ for (; number < quarterPoints; number++) {
++ ret = _mm_load_ps(inputVectorPtr);
++ inputVectorPtr += 4;
+
+- ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val);
++ ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val);
+
+- _mm_store_ps(outputFloatBuffer, ret);
+- for (inner_loop = 0; inner_loop < 4; inner_loop++){
+- *outputVectorPtr++ = (int8_t)(rintf(outputFloatBuffer[inner_loop]));
++ _mm_store_ps(outputFloatBuffer, ret);
++ for (inner_loop = 0; inner_loop < 4; inner_loop++) {
++ *outputVectorPtr++ = (int8_t)(rintf(outputFloatBuffer[inner_loop]));
++ }
+ }
+- }
+
+- number = quarterPoints * 4;
+- for(; number < num_points; number++){
+- r = inputVector[number] * scalar;
+- volk_32f_s32f_convert_8i_single(&outputVector[number], r);
+- }
++ number = quarterPoints * 4;
++ for (; number < num_points; number++) {
++ r = inputVector[number] * scalar;
++ volk_32f_s32f_convert_8i_single(&outputVector[number], r);
++ }
+ }
+
+ #endif /* LV_HAVE_SSE */
+@@ -455,18 +496,19 @@ volk_32f_s32f_convert_8i_a_sse(int8_t* outputVector, const float* inputVector,
+
+ #ifdef LV_HAVE_GENERIC
+
+-static inline void
+-volk_32f_s32f_convert_8i_a_generic(int8_t* outputVector, const float* inputVector,
+- const float scalar, unsigned int num_points)
++static inline void volk_32f_s32f_convert_8i_a_generic(int8_t* outputVector,
++ const float* inputVector,
++ const float scalar,
++ unsigned int num_points)
+ {
+- const float* inputVectorPtr = inputVector;
+- unsigned int number = 0;
+- float r;
+-
+- for(number = 0; number < num_points; number++){
+- r = *inputVectorPtr++ * scalar;
+- volk_32f_s32f_convert_8i_single(&outputVector[number], r);
+- }
++ const float* inputVectorPtr = inputVector;
++ unsigned int number = 0;
++ float r;
++
++ for (number = 0; number < num_points; number++) {
++ r = *inputVectorPtr++ * scalar;
++ volk_32f_s32f_convert_8i_single(&outputVector[number], r);
++ }
+ }
+
+ #endif /* LV_HAVE_GENERIC */
+diff --git a/kernels/volk/volk_32f_s32f_mod_rangepuppet_32f.h b/kernels/volk/volk_32f_s32f_mod_rangepuppet_32f.h
+index 6ace77b..28d7ab5 100644
+--- a/kernels/volk/volk_32f_s32f_mod_rangepuppet_32f.h
++++ b/kernels/volk/volk_32f_s32f_mod_rangepuppet_32f.h
+@@ -4,42 +4,77 @@
+ #include <volk/volk_32f_s32f_s32f_mod_range_32f.h>
+
+ #ifdef LV_HAVE_GENERIC
+-static inline void volk_32f_s32f_mod_rangepuppet_32f_generic(float *output, const float *input, float bound, unsigned int num_points){
+- volk_32f_s32f_s32f_mod_range_32f_generic(output, input, bound-3.141f, bound, num_points);
++static inline void volk_32f_s32f_mod_rangepuppet_32f_generic(float* output,
++ const float* input,
++ float bound,
++ unsigned int num_points)
++{
++ volk_32f_s32f_s32f_mod_range_32f_generic(
++ output, input, bound - 3.141f, bound, num_points);
+ }
+ #endif
+
+
+ #ifdef LV_HAVE_SSE
+-static inline void volk_32f_s32f_mod_rangepuppet_32f_u_sse(float *output, const float *input, float bound, unsigned int num_points){
+- volk_32f_s32f_s32f_mod_range_32f_u_sse(output, input, bound-3.141f, bound, num_points);
++static inline void volk_32f_s32f_mod_rangepuppet_32f_u_sse(float* output,
++ const float* input,
++ float bound,
++ unsigned int num_points)
++{
++ volk_32f_s32f_s32f_mod_range_32f_u_sse(
++ output, input, bound - 3.141f, bound, num_points);
+ }
+ #endif
+ #ifdef LV_HAVE_SSE
+-static inline void volk_32f_s32f_mod_rangepuppet_32f_a_sse(float *output, const float *input, float bound, unsigned int num_points){
+- volk_32f_s32f_s32f_mod_range_32f_a_sse(output, input, bound-3.141f, bound, num_points);
++static inline void volk_32f_s32f_mod_rangepuppet_32f_a_sse(float* output,
++ const float* input,
++ float bound,
++ unsigned int num_points)
++{
++ volk_32f_s32f_s32f_mod_range_32f_a_sse(
++ output, input, bound - 3.141f, bound, num_points);
+ }
+ #endif
+
+ #ifdef LV_HAVE_SSE2
+-static inline void volk_32f_s32f_mod_rangepuppet_32f_u_sse2(float *output, const float *input, float bound, unsigned int num_points){
+- volk_32f_s32f_s32f_mod_range_32f_u_sse2(output, input, bound-3.141f, bound, num_points);
++static inline void volk_32f_s32f_mod_rangepuppet_32f_u_sse2(float* output,
++ const float* input,
++ float bound,
++ unsigned int num_points)
++{
++ volk_32f_s32f_s32f_mod_range_32f_u_sse2(
++ output, input, bound - 3.141f, bound, num_points);
+ }
+ #endif
+ #ifdef LV_HAVE_SSE2
+-static inline void volk_32f_s32f_mod_rangepuppet_32f_a_sse2(float *output, const float *input, float bound, unsigned int num_points){
+- volk_32f_s32f_s32f_mod_range_32f_a_sse2(output, input, bound-3.141f, bound, num_points);
++static inline void volk_32f_s32f_mod_rangepuppet_32f_a_sse2(float* output,
++ const float* input,
++ float bound,
++ unsigned int num_points)
++{
++ volk_32f_s32f_s32f_mod_range_32f_a_sse2(
++ output, input, bound - 3.141f, bound, num_points);
+ }
+ #endif
+
+ #ifdef LV_HAVE_AVX
+-static inline void volk_32f_s32f_mod_rangepuppet_32f_u_avx(float *output, const float *input, float bound, unsigned int num_points){
+- volk_32f_s32f_s32f_mod_range_32f_u_avx(output, input, bound-3.141f, bound, num_points);
++static inline void volk_32f_s32f_mod_rangepuppet_32f_u_avx(float* output,
++ const float* input,
++ float bound,
++ unsigned int num_points)
++{
++ volk_32f_s32f_s32f_mod_range_32f_u_avx(
++ output, input, bound - 3.141f, bound, num_points);
+ }
+ #endif
+ #ifdef LV_HAVE_AVX
+-static inline void volk_32f_s32f_mod_rangepuppet_32f_a_avx(float *output, const float *input, float bound, unsigned int num_points){
+- volk_32f_s32f_s32f_mod_range_32f_a_avx(output, input, bound-3.141f, bound, num_points);
++static inline void volk_32f_s32f_mod_rangepuppet_32f_a_avx(float* output,
++ const float* input,
++ float bound,
++ unsigned int num_points)
++{
++ volk_32f_s32f_s32f_mod_range_32f_a_avx(
++ output, input, bound - 3.141f, bound, num_points);
+ }
+ #endif
+ #endif
+diff --git a/kernels/volk/volk_32f_s32f_multiply_32f.h b/kernels/volk/volk_32f_s32f_multiply_32f.h
+index 97c7f69..dcc9c6b 100644
+--- a/kernels/volk/volk_32f_s32f_multiply_32f.h
++++ b/kernels/volk/volk_32f_s32f_multiply_32f.h
+@@ -29,8 +29,8 @@
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+- * void volk_32f_s32f_multiply_32f(float* cVector, const float* aVector, const float scalar, unsigned int num_points)
+- * \endcode
++ * void volk_32f_s32f_multiply_32f(float* cVector, const float* aVector, const float
++ * scalar, unsigned int num_points) \endcode
+ *
+ * \b Inputs
+ * \li aVector: The input vector of floats.
+@@ -75,84 +75,87 @@
+ #ifdef LV_HAVE_SSE
+ #include <xmmintrin.h>
+
+-static inline void
+-volk_32f_s32f_multiply_32f_u_sse(float* cVector, const float* aVector,
+- const float scalar, unsigned int num_points)
++static inline void volk_32f_s32f_multiply_32f_u_sse(float* cVector,
++ const float* aVector,
++ const float scalar,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+- const unsigned int quarterPoints = num_points / 4;
++ unsigned int number = 0;
++ const unsigned int quarterPoints = num_points / 4;
+
+- float* cPtr = cVector;
+- const float* aPtr = aVector;
++ float* cPtr = cVector;
++ const float* aPtr = aVector;
+
+- __m128 aVal, bVal, cVal;
+- bVal = _mm_set_ps1(scalar);
+- for(;number < quarterPoints; number++){
+- aVal = _mm_loadu_ps(aPtr);
++ __m128 aVal, bVal, cVal;
++ bVal = _mm_set_ps1(scalar);
++ for (; number < quarterPoints; number++) {
++ aVal = _mm_loadu_ps(aPtr);
+
+- cVal = _mm_mul_ps(aVal, bVal);
++ cVal = _mm_mul_ps(aVal, bVal);
+
+- _mm_storeu_ps(cPtr,cVal); // Store the results back into the C container
++ _mm_storeu_ps(cPtr, cVal); // Store the results back into the C container
+
+- aPtr += 4;
+- cPtr += 4;
+- }
++ aPtr += 4;
++ cPtr += 4;
++ }
+
+- number = quarterPoints * 4;
+- for(;number < num_points; number++){
+- *cPtr++ = (*aPtr++) * scalar;
+- }
++ number = quarterPoints * 4;
++ for (; number < num_points; number++) {
++ *cPtr++ = (*aPtr++) * scalar;
++ }
+ }
+ #endif /* LV_HAVE_SSE */
+
+ #ifdef LV_HAVE_AVX
+ #include <immintrin.h>
+
+-static inline void
+-volk_32f_s32f_multiply_32f_u_avx(float* cVector, const float* aVector,
+- const float scalar, unsigned int num_points)
++static inline void volk_32f_s32f_multiply_32f_u_avx(float* cVector,
++ const float* aVector,
++ const float scalar,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+- const unsigned int eighthPoints = num_points / 8;
++ unsigned int number = 0;
++ const unsigned int eighthPoints = num_points / 8;
+
+- float* cPtr = cVector;
+- const float* aPtr = aVector;
++ float* cPtr = cVector;
++ const float* aPtr = aVector;
+
+- __m256 aVal, bVal, cVal;
+- bVal = _mm256_set1_ps(scalar);
+- for(;number < eighthPoints; number++){
++ __m256 aVal, bVal, cVal;
++ bVal = _mm256_set1_ps(scalar);
++ for (; number < eighthPoints; number++) {
+
+- aVal = _mm256_loadu_ps(aPtr);
++ aVal = _mm256_loadu_ps(aPtr);
+
+- cVal = _mm256_mul_ps(aVal, bVal);
++ cVal = _mm256_mul_ps(aVal, bVal);
+
+- _mm256_storeu_ps(cPtr,cVal); // Store the results back into the C container
++ _mm256_storeu_ps(cPtr, cVal); // Store the results back into the C container
+
+- aPtr += 8;
+- cPtr += 8;
+- }
++ aPtr += 8;
++ cPtr += 8;
++ }
+
+- number = eighthPoints * 8;
+- for(;number < num_points; number++){
+- *cPtr++ = (*aPtr++) * scalar;
+- }
++ number = eighthPoints * 8;
++ for (; number < num_points; number++) {
++ *cPtr++ = (*aPtr++) * scalar;
++ }
+ }
+ #endif /* LV_HAVE_AVX */
+
+ #ifdef LV_HAVE_GENERIC
+
+-static inline void
+-volk_32f_s32f_multiply_32f_generic(float* cVector, const float* aVector,
+- const float scalar, unsigned int num_points)
++static inline void volk_32f_s32f_multiply_32f_generic(float* cVector,
++ const float* aVector,
++ const float scalar,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+- const float* inputPtr = aVector;
+- float* outputPtr = cVector;
+- for(number = 0; number < num_points; number++){
+- *outputPtr = (*inputPtr) * scalar;
+- inputPtr++;
+- outputPtr++;
+- }
++ unsigned int number = 0;
++ const float* inputPtr = aVector;
++ float* outputPtr = cVector;
++ for (number = 0; number < num_points; number++) {
++ *outputPtr = (*inputPtr) * scalar;
++ inputPtr++;
++ outputPtr++;
++ }
+ }
+ #endif /* LV_HAVE_GENERIC */
+
+@@ -168,126 +171,132 @@ volk_32f_s32f_multiply_32f_generic(float* cVector, const float* aVector,
+ #ifdef LV_HAVE_SSE
+ #include <xmmintrin.h>
+
+-static inline void
+-volk_32f_s32f_multiply_32f_a_sse(float* cVector, const float* aVector,
+- const float scalar, unsigned int num_points)
++static inline void volk_32f_s32f_multiply_32f_a_sse(float* cVector,
++ const float* aVector,
++ const float scalar,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+- const unsigned int quarterPoints = num_points / 4;
++ unsigned int number = 0;
++ const unsigned int quarterPoints = num_points / 4;
+
+- float* cPtr = cVector;
+- const float* aPtr = aVector;
++ float* cPtr = cVector;
++ const float* aPtr = aVector;
+
+- __m128 aVal, bVal, cVal;
+- bVal = _mm_set_ps1(scalar);
+- for(;number < quarterPoints; number++){
+- aVal = _mm_load_ps(aPtr);
++ __m128 aVal, bVal, cVal;
++ bVal = _mm_set_ps1(scalar);
++ for (; number < quarterPoints; number++) {
++ aVal = _mm_load_ps(aPtr);
+
+- cVal = _mm_mul_ps(aVal, bVal);
++ cVal = _mm_mul_ps(aVal, bVal);
+
+- _mm_store_ps(cPtr,cVal); // Store the results back into the C container
++ _mm_store_ps(cPtr, cVal); // Store the results back into the C container
+
+- aPtr += 4;
+- cPtr += 4;
+- }
++ aPtr += 4;
++ cPtr += 4;
++ }
+
+- number = quarterPoints * 4;
+- for(;number < num_points; number++){
+- *cPtr++ = (*aPtr++) * scalar;
+- }
++ number = quarterPoints * 4;
++ for (; number < num_points; number++) {
++ *cPtr++ = (*aPtr++) * scalar;
++ }
+ }
+ #endif /* LV_HAVE_SSE */
+
+ #ifdef LV_HAVE_AVX
+ #include <immintrin.h>
+
+-static inline void
+-volk_32f_s32f_multiply_32f_a_avx(float* cVector, const float* aVector,
+- const float scalar, unsigned int num_points)
++static inline void volk_32f_s32f_multiply_32f_a_avx(float* cVector,
++ const float* aVector,
++ const float scalar,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+- const unsigned int eighthPoints = num_points / 8;
++ unsigned int number = 0;
++ const unsigned int eighthPoints = num_points / 8;
+
+- float* cPtr = cVector;
+- const float* aPtr = aVector;
++ float* cPtr = cVector;
++ const float* aPtr = aVector;
+
+- __m256 aVal, bVal, cVal;
+- bVal = _mm256_set1_ps(scalar);
+- for(;number < eighthPoints; number++){
+- aVal = _mm256_load_ps(aPtr);
++ __m256 aVal, bVal, cVal;
++ bVal = _mm256_set1_ps(scalar);
++ for (; number < eighthPoints; number++) {
++ aVal = _mm256_load_ps(aPtr);
+
+- cVal = _mm256_mul_ps(aVal, bVal);
++ cVal = _mm256_mul_ps(aVal, bVal);
+
+- _mm256_store_ps(cPtr,cVal); // Store the results back into the C container
++ _mm256_store_ps(cPtr, cVal); // Store the results back into the C container
+
+- aPtr += 8;
+- cPtr += 8;
+- }
++ aPtr += 8;
++ cPtr += 8;
++ }
+
+- number = eighthPoints * 8;
+- for(;number < num_points; number++){
+- *cPtr++ = (*aPtr++) * scalar;
+- }
++ number = eighthPoints * 8;
++ for (; number < num_points; number++) {
++ *cPtr++ = (*aPtr++) * scalar;
++ }
+ }
+ #endif /* LV_HAVE_AVX */
+
+ #ifdef LV_HAVE_NEON
+ #include <arm_neon.h>
+
+-static inline void
+-volk_32f_s32f_multiply_32f_u_neon(float* cVector, const float* aVector,
+- const float scalar, unsigned int num_points)
++static inline void volk_32f_s32f_multiply_32f_u_neon(float* cVector,
++ const float* aVector,
++ const float scalar,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+- const float* inputPtr = aVector;
+- float* outputPtr = cVector;
+- const unsigned int quarterPoints = num_points / 4;
+-
+- float32x4_t aVal, cVal;
+-
+- for(number = 0; number < quarterPoints; number++){
+- aVal = vld1q_f32(inputPtr); // Load into NEON regs
+- cVal = vmulq_n_f32 (aVal, scalar); // Do the multiply
+- vst1q_f32(outputPtr, cVal); // Store results back to output
+- inputPtr += 4;
+- outputPtr += 4;
+- }
+- for(number = quarterPoints * 4; number < num_points; number++){
+- *outputPtr++ = (*inputPtr++) * scalar;
+- }
++ unsigned int number = 0;
++ const float* inputPtr = aVector;
++ float* outputPtr = cVector;
++ const unsigned int quarterPoints = num_points / 4;
++
++ float32x4_t aVal, cVal;
++
++ for (number = 0; number < quarterPoints; number++) {
++ aVal = vld1q_f32(inputPtr); // Load into NEON regs
++ cVal = vmulq_n_f32(aVal, scalar); // Do the multiply
++ vst1q_f32(outputPtr, cVal); // Store results back to output
++ inputPtr += 4;
++ outputPtr += 4;
++ }
++ for (number = quarterPoints * 4; number < num_points; number++) {
++ *outputPtr++ = (*inputPtr++) * scalar;
++ }
+ }
+ #endif /* LV_HAVE_NEON */
+
+
+ #ifdef LV_HAVE_GENERIC
+
+-static inline void
+-volk_32f_s32f_multiply_32f_a_generic(float* cVector, const float* aVector,
+- const float scalar, unsigned int num_points)
++static inline void volk_32f_s32f_multiply_32f_a_generic(float* cVector,
++ const float* aVector,
++ const float scalar,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+- const float* inputPtr = aVector;
+- float* outputPtr = cVector;
+- for(number = 0; number < num_points; number++){
+- *outputPtr = (*inputPtr) * scalar;
+- inputPtr++;
+- outputPtr++;
+- }
++ unsigned int number = 0;
++ const float* inputPtr = aVector;
++ float* outputPtr = cVector;
++ for (number = 0; number < num_points; number++) {
++ *outputPtr = (*inputPtr) * scalar;
++ inputPtr++;
++ outputPtr++;
++ }
+ }
+ #endif /* LV_HAVE_GENERIC */
+
+
+ #ifdef LV_HAVE_ORC
+
+-extern void
+-volk_32f_s32f_multiply_32f_a_orc_impl(float* dst, const float* src,
+- const float scalar, unsigned int num_points);
++extern void volk_32f_s32f_multiply_32f_a_orc_impl(float* dst,
++ const float* src,
++ const float scalar,
++ unsigned int num_points);
+
+-static inline void
+-volk_32f_s32f_multiply_32f_u_orc(float* cVector, const float* aVector,
+- const float scalar, unsigned int num_points)
++static inline void volk_32f_s32f_multiply_32f_u_orc(float* cVector,
++ const float* aVector,
++ const float scalar,
++ unsigned int num_points)
+ {
+- volk_32f_s32f_multiply_32f_a_orc_impl(cVector, aVector, scalar, num_points);
++ volk_32f_s32f_multiply_32f_a_orc_impl(cVector, aVector, scalar, num_points);
+ }
+
+ #endif /* LV_HAVE_GENERIC */
+diff --git a/kernels/volk/volk_32f_s32f_normalize.h b/kernels/volk/volk_32f_s32f_normalize.h
+index 404d534..0a05492 100644
+--- a/kernels/volk/volk_32f_s32f_normalize.h
++++ b/kernels/volk/volk_32f_s32f_normalize.h
+@@ -30,8 +30,8 @@
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+- * void volk_32f_s32f_normalize(float* vecBuffer, const float scalar, unsigned int num_points)
+- * \endcode
++ * void volk_32f_s32f_normalize(float* vecBuffer, const float scalar, unsigned int
++ * num_points) \endcode
+ *
+ * \b Inputs
+ * \li vecBuffer: The buffer of values to be vectorized.
+@@ -76,84 +76,99 @@
+ #ifdef LV_HAVE_AVX
+ #include <immintrin.h>
+
+-static inline void volk_32f_s32f_normalize_a_avx(float* vecBuffer, const float scalar, unsigned int num_points){
+- unsigned int number = 0;
+- float* inputPtr = vecBuffer;
++static inline void volk_32f_s32f_normalize_a_avx(float* vecBuffer,
++ const float scalar,
++ unsigned int num_points)
++{
++ unsigned int number = 0;
++ float* inputPtr = vecBuffer;
+
+- const float invScalar = 1.0 / scalar;
+- __m256 vecScalar = _mm256_set1_ps(invScalar);
++ const float invScalar = 1.0 / scalar;
++ __m256 vecScalar = _mm256_set1_ps(invScalar);
+
+- __m256 input1;
++ __m256 input1;
+
+- const uint64_t eighthPoints = num_points / 8;
+- for(;number < eighthPoints; number++){
++ const uint64_t eighthPoints = num_points / 8;
++ for (; number < eighthPoints; number++) {
+
+- input1 = _mm256_load_ps(inputPtr);
++ input1 = _mm256_load_ps(inputPtr);
+
+- input1 = _mm256_mul_ps(input1, vecScalar);
++ input1 = _mm256_mul_ps(input1, vecScalar);
+
+- _mm256_store_ps(inputPtr, input1);
++ _mm256_store_ps(inputPtr, input1);
+
+- inputPtr += 8;
+- }
++ inputPtr += 8;
++ }
+
+- number = eighthPoints*8;
+- for(; number < num_points; number++){
+- *inputPtr *= invScalar;
+- inputPtr++;
+- }
++ number = eighthPoints * 8;
++ for (; number < num_points; number++) {
++ *inputPtr *= invScalar;
++ inputPtr++;
++ }
+ }
+ #endif /* LV_HAVE_AVX */
+
+ #ifdef LV_HAVE_SSE
+ #include <xmmintrin.h>
+
+-static inline void volk_32f_s32f_normalize_a_sse(float* vecBuffer, const float scalar, unsigned int num_points){
+- unsigned int number = 0;
+- float* inputPtr = vecBuffer;
++static inline void volk_32f_s32f_normalize_a_sse(float* vecBuffer,
++ const float scalar,
++ unsigned int num_points)
++{
++ unsigned int number = 0;
++ float* inputPtr = vecBuffer;
+
+- const float invScalar = 1.0 / scalar;
+- __m128 vecScalar = _mm_set_ps1(invScalar);
++ const float invScalar = 1.0 / scalar;
++ __m128 vecScalar = _mm_set_ps1(invScalar);
+
+- __m128 input1;
++ __m128 input1;
+
+- const uint64_t quarterPoints = num_points / 4;
+- for(;number < quarterPoints; number++){
++ const uint64_t quarterPoints = num_points / 4;
++ for (; number < quarterPoints; number++) {
+
+- input1 = _mm_load_ps(inputPtr);
++ input1 = _mm_load_ps(inputPtr);
+
+- input1 = _mm_mul_ps(input1, vecScalar);
++ input1 = _mm_mul_ps(input1, vecScalar);
+
+- _mm_store_ps(inputPtr, input1);
++ _mm_store_ps(inputPtr, input1);
+
+- inputPtr += 4;
+- }
++ inputPtr += 4;
++ }
+
+- number = quarterPoints*4;
+- for(; number < num_points; number++){
+- *inputPtr *= invScalar;
+- inputPtr++;
+- }
++ number = quarterPoints * 4;
++ for (; number < num_points; number++) {
++ *inputPtr *= invScalar;
++ inputPtr++;
++ }
+ }
+ #endif /* LV_HAVE_SSE */
+
+ #ifdef LV_HAVE_GENERIC
+
+-static inline void volk_32f_s32f_normalize_generic(float* vecBuffer, const float scalar, unsigned int num_points){
+- unsigned int number = 0;
+- float* inputPtr = vecBuffer;
+- const float invScalar = 1.0 / scalar;
+- for(number = 0; number < num_points; number++){
+- *inputPtr *= invScalar;
+- inputPtr++;
+- }
++static inline void volk_32f_s32f_normalize_generic(float* vecBuffer,
++ const float scalar,
++ unsigned int num_points)
++{
++ unsigned int number = 0;
++ float* inputPtr = vecBuffer;
++ const float invScalar = 1.0 / scalar;
++ for (number = 0; number < num_points; number++) {
++ *inputPtr *= invScalar;
++ inputPtr++;
++ }
+ }
+ #endif /* LV_HAVE_GENERIC */
+
+ #ifdef LV_HAVE_ORC
+
+-extern void volk_32f_s32f_normalize_a_orc_impl(float* dst, float* src, const float scalar, unsigned int num_points);
+-static inline void volk_32f_s32f_normalize_u_orc(float* vecBuffer, const float scalar, unsigned int num_points){
++extern void volk_32f_s32f_normalize_a_orc_impl(float* dst,
++ float* src,
++ const float scalar,
++ unsigned int num_points);
++static inline void volk_32f_s32f_normalize_u_orc(float* vecBuffer,
++ const float scalar,
++ unsigned int num_points)
++{
+ float invscalar = 1.0 / scalar;
+ volk_32f_s32f_normalize_a_orc_impl(vecBuffer, vecBuffer, invscalar, num_points);
+ }
+@@ -169,32 +184,35 @@ static inline void volk_32f_s32f_normalize_u_orc(float* vecBuffer, const float s
+ #ifdef LV_HAVE_AVX
+ #include <immintrin.h>
+
+-static inline void volk_32f_s32f_normalize_u_avx(float* vecBuffer, const float scalar, unsigned int num_points){
+- unsigned int number = 0;
+- float* inputPtr = vecBuffer;
++static inline void volk_32f_s32f_normalize_u_avx(float* vecBuffer,
++ const float scalar,
++ unsigned int num_points)
++{
++ unsigned int number = 0;
++ float* inputPtr = vecBuffer;
+
+- const float invScalar = 1.0 / scalar;
+- __m256 vecScalar = _mm256_set1_ps(invScalar);
++ const float invScalar = 1.0 / scalar;
++ __m256 vecScalar = _mm256_set1_ps(invScalar);
+
+- __m256 input1;
++ __m256 input1;
+
+- const uint64_t eighthPoints = num_points / 8;
+- for(;number < eighthPoints; number++){
++ const uint64_t eighthPoints = num_points / 8;
++ for (; number < eighthPoints; number++) {
+
+- input1 = _mm256_loadu_ps(inputPtr);
++ input1 = _mm256_loadu_ps(inputPtr);
+
+- input1 = _mm256_mul_ps(input1, vecScalar);
++ input1 = _mm256_mul_ps(input1, vecScalar);
+
+- _mm256_storeu_ps(inputPtr, input1);
++ _mm256_storeu_ps(inputPtr, input1);
+
+- inputPtr += 8;
+- }
++ inputPtr += 8;
++ }
+
+- number = eighthPoints*8;
+- for(; number < num_points; number++){
+- *inputPtr *= invScalar;
+- inputPtr++;
+- }
++ number = eighthPoints * 8;
++ for (; number < num_points; number++) {
++ *inputPtr *= invScalar;
++ inputPtr++;
++ }
+ }
+ #endif /* LV_HAVE_AVX */
+
+diff --git a/kernels/volk/volk_32f_s32f_power_32f.h b/kernels/volk/volk_32f_s32f_power_32f.h
+index 070efdc..9b6fdf4 100644
+--- a/kernels/volk/volk_32f_s32f_power_32f.h
++++ b/kernels/volk/volk_32f_s32f_power_32f.h
+@@ -30,8 +30,8 @@
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+- * void volk_32f_s32f_power_32f(float* cVector, const float* aVector, const float power, unsigned int num_points)
+- * \endcode
++ * void volk_32f_s32f_power_32f(float* cVector, const float* aVector, const float power,
++ * unsigned int num_points) \endcode
+ *
+ * \b Inputs
+ * \li aVector: The input vector of floats.
+@@ -72,8 +72,8 @@
+ #define INCLUDED_volk_32f_s32f_power_32f_a_H
+
+ #include <inttypes.h>
+-#include <stdio.h>
+ #include <math.h>
++#include <stdio.h>
+
+ #ifdef LV_HAVE_SSE4_1
+ #include <tmmintrin.h>
+@@ -82,49 +82,51 @@
+ #include <simdmath.h>
+ #endif /* LV_HAVE_LIB_SIMDMATH */
+
+-static inline void
+-volk_32f_s32f_power_32f_a_sse4_1(float* cVector, const float* aVector,
+- const float power, unsigned int num_points)
++static inline void volk_32f_s32f_power_32f_a_sse4_1(float* cVector,
++ const float* aVector,
++ const float power,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
++ unsigned int number = 0;
+
+- float* cPtr = cVector;
+- const float* aPtr = aVector;
++ float* cPtr = cVector;
++ const float* aPtr = aVector;
+
+ #ifdef LV_HAVE_LIB_SIMDMATH
+- const unsigned int quarterPoints = num_points / 4;
+- __m128 vPower = _mm_set_ps1(power);
+- __m128 zeroValue = _mm_setzero_ps();
+- __m128 signMask;
+- __m128 negatedValues;
+- __m128 negativeOneToPower = _mm_set_ps1(powf(-1, power));
+- __m128 onesMask = _mm_set_ps1(1);
++ const unsigned int quarterPoints = num_points / 4;
++ __m128 vPower = _mm_set_ps1(power);
++ __m128 zeroValue = _mm_setzero_ps();
++ __m128 signMask;
++ __m128 negatedValues;
++ __m128 negativeOneToPower = _mm_set_ps1(powf(-1, power));
++ __m128 onesMask = _mm_set_ps1(1);
+
+- __m128 aVal, cVal;
+- for(;number < quarterPoints; number++){
++ __m128 aVal, cVal;
++ for (; number < quarterPoints; number++) {
+
+- aVal = _mm_load_ps(aPtr);
+- signMask = _mm_cmplt_ps(aVal, zeroValue);
+- negatedValues = _mm_sub_ps(zeroValue, aVal);
+- aVal = _mm_blendv_ps(aVal, negatedValues, signMask);
++ aVal = _mm_load_ps(aPtr);
++ signMask = _mm_cmplt_ps(aVal, zeroValue);
++ negatedValues = _mm_sub_ps(zeroValue, aVal);
++ aVal = _mm_blendv_ps(aVal, negatedValues, signMask);
+
+- // powf4 doesn't support negative values in the base, so we mask them off and then apply the negative after
+- cVal = powf4(aVal, vPower); // Takes each input value to the specified power
++ // powf4 doesn't support negative values in the base, so we mask them off and then
++ // apply the negative after
++ cVal = powf4(aVal, vPower); // Takes each input value to the specified power
+
+- cVal = _mm_mul_ps( _mm_blendv_ps(onesMask, negativeOneToPower, signMask), cVal);
++ cVal = _mm_mul_ps(_mm_blendv_ps(onesMask, negativeOneToPower, signMask), cVal);
+
+- _mm_store_ps(cPtr,cVal); // Store the results back into the C container
++ _mm_store_ps(cPtr, cVal); // Store the results back into the C container
+
+- aPtr += 4;
+- cPtr += 4;
+- }
++ aPtr += 4;
++ cPtr += 4;
++ }
+
+- number = quarterPoints * 4;
++ number = quarterPoints * 4;
+ #endif /* LV_HAVE_LIB_SIMDMATH */
+
+- for(;number < num_points; number++){
+- *cPtr++ = powf((*aPtr++), power);
+- }
++ for (; number < num_points; number++) {
++ *cPtr++ = powf((*aPtr++), power);
++ }
+ }
+
+ #endif /* LV_HAVE_SSE4_1 */
+@@ -137,49 +139,54 @@ volk_32f_s32f_power_32f_a_sse4_1(float* cVector, const float* aVector,
+ #include <simdmath.h>
+ #endif /* LV_HAVE_LIB_SIMDMATH */
+
+-static inline void
+-volk_32f_s32f_power_32f_a_sse(float* cVector, const float* aVector,
+- const float power, unsigned int num_points)
++static inline void volk_32f_s32f_power_32f_a_sse(float* cVector,
++ const float* aVector,
++ const float power,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
++ unsigned int number = 0;
+
+- float* cPtr = cVector;
+- const float* aPtr = aVector;
++ float* cPtr = cVector;
++ const float* aPtr = aVector;
+
+ #ifdef LV_HAVE_LIB_SIMDMATH
+- const unsigned int quarterPoints = num_points / 4;
+- __m128 vPower = _mm_set_ps1(power);
+- __m128 zeroValue = _mm_setzero_ps();
+- __m128 signMask;
+- __m128 negatedValues;
+- __m128 negativeOneToPower = _mm_set_ps1(powf(-1, power));
+- __m128 onesMask = _mm_set_ps1(1);
+-
+- __m128 aVal, cVal;
+- for(;number < quarterPoints; number++){
+-
+- aVal = _mm_load_ps(aPtr);
+- signMask = _mm_cmplt_ps(aVal, zeroValue);
+- negatedValues = _mm_sub_ps(zeroValue, aVal);
+- aVal = _mm_or_ps(_mm_andnot_ps(signMask, aVal), _mm_and_ps(signMask, negatedValues) );
+-
+- // powf4 doesn't support negative values in the base, so we mask them off and then apply the negative after
+- cVal = powf4(aVal, vPower); // Takes each input value to the specified power
+-
+- cVal = _mm_mul_ps( _mm_or_ps( _mm_andnot_ps(signMask, onesMask), _mm_and_ps(signMask, negativeOneToPower) ), cVal);
+-
+- _mm_store_ps(cPtr,cVal); // Store the results back into the C container
+-
+- aPtr += 4;
+- cPtr += 4;
+- }
+-
+- number = quarterPoints * 4;
++ const unsigned int quarterPoints = num_points / 4;
++ __m128 vPower = _mm_set_ps1(power);
++ __m128 zeroValue = _mm_setzero_ps();
++ __m128 signMask;
++ __m128 negatedValues;
++ __m128 negativeOneToPower = _mm_set_ps1(powf(-1, power));
++ __m128 onesMask = _mm_set_ps1(1);
++
++ __m128 aVal, cVal;
++ for (; number < quarterPoints; number++) {
++
++ aVal = _mm_load_ps(aPtr);
++ signMask = _mm_cmplt_ps(aVal, zeroValue);
++ negatedValues = _mm_sub_ps(zeroValue, aVal);
++ aVal =
++ _mm_or_ps(_mm_andnot_ps(signMask, aVal), _mm_and_ps(signMask, negatedValues));
++
++ // powf4 doesn't support negative values in the base, so we mask them off and then
++ // apply the negative after
++ cVal = powf4(aVal, vPower); // Takes each input value to the specified power
++
++ cVal = _mm_mul_ps(_mm_or_ps(_mm_andnot_ps(signMask, onesMask),
++ _mm_and_ps(signMask, negativeOneToPower)),
++ cVal);
++
++ _mm_store_ps(cPtr, cVal); // Store the results back into the C container
++
++ aPtr += 4;
++ cPtr += 4;
++ }
++
++ number = quarterPoints * 4;
+ #endif /* LV_HAVE_LIB_SIMDMATH */
+
+- for(;number < num_points; number++){
+- *cPtr++ = powf((*aPtr++), power);
+- }
++ for (; number < num_points; number++) {
++ *cPtr++ = powf((*aPtr++), power);
++ }
+ }
+
+ #endif /* LV_HAVE_SSE */
+@@ -187,17 +194,18 @@ volk_32f_s32f_power_32f_a_sse(float* cVector, const float* aVector,
+
+ #ifdef LV_HAVE_GENERIC
+
+-static inline void
+-volk_32f_s32f_power_32f_generic(float* cVector, const float* aVector,
+- const float power, unsigned int num_points)
++static inline void volk_32f_s32f_power_32f_generic(float* cVector,
++ const float* aVector,
++ const float power,
++ unsigned int num_points)
+ {
+- float* cPtr = cVector;
+- const float* aPtr = aVector;
+- unsigned int number = 0;
++ float* cPtr = cVector;
++ const float* aPtr = aVector;
++ unsigned int number = 0;
+
+- for(number = 0; number < num_points; number++){
+- *cPtr++ = powf((*aPtr++), power);
+- }
++ for (number = 0; number < num_points; number++) {
++ *cPtr++ = powf((*aPtr++), power);
++ }
+ }
+ #endif /* LV_HAVE_GENERIC */
+
+diff --git a/kernels/volk/volk_32f_s32f_s32f_mod_range_32f.h b/kernels/volk/volk_32f_s32f_s32f_mod_range_32f.h
+index 53b4937..d7f23fe 100644
+--- a/kernels/volk/volk_32f_s32f_s32f_mod_range_32f.h
++++ b/kernels/volk/volk_32f_s32f_s32f_mod_range_32f.h
+@@ -25,8 +25,8 @@
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+- * void volk_32f_s32f_s32f_mod_range_32f(float* outputVector, const float* inputVector, const float lower_bound, const float upper_bound, unsigned int num_points)
+- * \endcode
++ * void volk_32f_s32f_s32f_mod_range_32f(float* outputVector, const float* inputVector,
++ * const float lower_bound, const float upper_bound, unsigned int num_points) \endcode
+ *
+ * \b Inputs
+ * \li inputVector: The input vector
+@@ -46,117 +46,129 @@
+ #ifdef LV_HAVE_AVX
+ #include <xmmintrin.h>
+
+-static inline void volk_32f_s32f_s32f_mod_range_32f_u_avx(float* outputVector, const float* inputVector, const float lower_bound, const float upper_bound, unsigned int num_points){
+- __m256 lower = _mm256_set1_ps(lower_bound);
+- __m256 upper = _mm256_set1_ps(upper_bound);
+- __m256 distance = _mm256_sub_ps(upper,lower);
+- float dist = upper_bound - lower_bound;
+- __m256 input, output;
+- __m256 is_smaller, is_bigger;
+- __m256 excess, adj;
+-
+- const float *inPtr = inputVector;
+- float *outPtr = outputVector;
+- size_t eight_points = num_points / 8;
+- size_t counter;
+- for(counter = 0; counter < eight_points; counter++) {
+- input = _mm256_loadu_ps(inPtr);
+- // calculate mask: input < lower, input > upper
+- is_smaller = _mm256_cmp_ps(input, lower, _CMP_LT_OQ); //0x11: Less than, ordered, non-signalling
+- is_bigger = _mm256_cmp_ps(input, upper, _CMP_GT_OQ); //0x1e: greater than, ordered, non-signalling
+- // find out how far we are out-of-bound – positive values!
+- excess = _mm256_and_ps(_mm256_sub_ps(lower, input), is_smaller);
+- excess = _mm256_or_ps(_mm256_and_ps(_mm256_sub_ps(input, upper), is_bigger), excess);
+- // how many do we have to add? (int(excess/distance+1)*distance)
+- excess = _mm256_div_ps(excess, distance);
+- // round down
+- excess = _mm256_cvtepi32_ps(_mm256_cvttps_epi32(excess));
+- // plus 1
+- adj = _mm256_set1_ps(1.0f);
+- excess = _mm256_add_ps(excess, adj);
+- // get the sign right, adj is still {1.0f,1.0f,1.0f,1.0f}
+- adj = _mm256_and_ps(adj, is_smaller);
+- adj = _mm256_or_ps(_mm256_and_ps(_mm256_set1_ps(-1.0f), is_bigger), adj);
+- // scale by distance, sign
+- excess = _mm256_mul_ps(_mm256_mul_ps(excess, adj), distance);
+- output = _mm256_add_ps(input, excess);
+- _mm256_storeu_ps(outPtr, output);
+- inPtr += 8;
+- outPtr += 8;
+- }
+-
+- size_t cnt;
+- for(cnt = eight_points * 8; cnt < num_points; cnt++){
+- float val = inputVector[cnt];
+- if(val < lower_bound){
+- float excess = lower_bound - val;
+- signed int count = (int)(excess/dist);
+- outputVector[cnt] = val + (count+1)*dist;
++static inline void volk_32f_s32f_s32f_mod_range_32f_u_avx(float* outputVector,
++ const float* inputVector,
++ const float lower_bound,
++ const float upper_bound,
++ unsigned int num_points)
++{
++ __m256 lower = _mm256_set1_ps(lower_bound);
++ __m256 upper = _mm256_set1_ps(upper_bound);
++ __m256 distance = _mm256_sub_ps(upper, lower);
++ float dist = upper_bound - lower_bound;
++ __m256 input, output;
++ __m256 is_smaller, is_bigger;
++ __m256 excess, adj;
++
++ const float* inPtr = inputVector;
++ float* outPtr = outputVector;
++ size_t eight_points = num_points / 8;
++ size_t counter;
++ for (counter = 0; counter < eight_points; counter++) {
++ input = _mm256_loadu_ps(inPtr);
++ // calculate mask: input < lower, input > upper
++ is_smaller = _mm256_cmp_ps(
++ input, lower, _CMP_LT_OQ); // 0x11: Less than, ordered, non-signalling
++ is_bigger = _mm256_cmp_ps(
++ input, upper, _CMP_GT_OQ); // 0x1e: greater than, ordered, non-signalling
++ // find out how far we are out-of-bound – positive values!
++ excess = _mm256_and_ps(_mm256_sub_ps(lower, input), is_smaller);
++ excess =
++ _mm256_or_ps(_mm256_and_ps(_mm256_sub_ps(input, upper), is_bigger), excess);
++ // how many do we have to add? (int(excess/distance+1)*distance)
++ excess = _mm256_div_ps(excess, distance);
++ // round down
++ excess = _mm256_cvtepi32_ps(_mm256_cvttps_epi32(excess));
++ // plus 1
++ adj = _mm256_set1_ps(1.0f);
++ excess = _mm256_add_ps(excess, adj);
++ // get the sign right, adj is still {1.0f,1.0f,1.0f,1.0f}
++ adj = _mm256_and_ps(adj, is_smaller);
++ adj = _mm256_or_ps(_mm256_and_ps(_mm256_set1_ps(-1.0f), is_bigger), adj);
++ // scale by distance, sign
++ excess = _mm256_mul_ps(_mm256_mul_ps(excess, adj), distance);
++ output = _mm256_add_ps(input, excess);
++ _mm256_storeu_ps(outPtr, output);
++ inPtr += 8;
++ outPtr += 8;
+ }
+- else if(val > upper_bound){
+- float excess = val - upper_bound;
+- signed int count = (int)(excess/dist);
+- outputVector[cnt] = val - (count+1)*dist;
++
++ size_t cnt;
++ for (cnt = eight_points * 8; cnt < num_points; cnt++) {
++ float val = inputVector[cnt];
++ if (val < lower_bound) {
++ float excess = lower_bound - val;
++ signed int count = (int)(excess / dist);
++ outputVector[cnt] = val + (count + 1) * dist;
++ } else if (val > upper_bound) {
++ float excess = val - upper_bound;
++ signed int count = (int)(excess / dist);
++ outputVector[cnt] = val - (count + 1) * dist;
++ } else
++ outputVector[cnt] = val;
+ }
+- else
+- outputVector[cnt] = val;
+- }
+ }
+-static inline void volk_32f_s32f_s32f_mod_range_32f_a_avx(float* outputVector, const float* inputVector, const float lower_bound, const float upper_bound, unsigned int num_points){
+- __m256 lower = _mm256_set1_ps(lower_bound);
+- __m256 upper = _mm256_set1_ps(upper_bound);
+- __m256 distance = _mm256_sub_ps(upper,lower);
+- float dist = upper_bound - lower_bound;
+- __m256 input, output;
+- __m256 is_smaller, is_bigger;
+- __m256 excess, adj;
+-
+- const float *inPtr = inputVector;
+- float *outPtr = outputVector;
+- size_t eight_points = num_points / 8;
+- size_t counter;
+- for(counter = 0; counter < eight_points; counter++) {
+- input = _mm256_load_ps(inPtr);
+- // calculate mask: input < lower, input > upper
+- is_smaller = _mm256_cmp_ps(input, lower, _CMP_LT_OQ); //0x11: Less than, ordered, non-signalling
+- is_bigger = _mm256_cmp_ps(input, upper, _CMP_GT_OQ); //0x1e: greater than, ordered, non-signalling
+- // find out how far we are out-of-bound – positive values!
+- excess = _mm256_and_ps(_mm256_sub_ps(lower, input), is_smaller);
+- excess = _mm256_or_ps(_mm256_and_ps(_mm256_sub_ps(input, upper), is_bigger), excess);
+- // how many do we have to add? (int(excess/distance+1)*distance)
+- excess = _mm256_div_ps(excess, distance);
+- // round down
+- excess = _mm256_cvtepi32_ps(_mm256_cvttps_epi32(excess));
+- // plus 1
+- adj = _mm256_set1_ps(1.0f);
+- excess = _mm256_add_ps(excess, adj);
+- // get the sign right, adj is still {1.0f,1.0f,1.0f,1.0f}
+- adj = _mm256_and_ps(adj, is_smaller);
+- adj = _mm256_or_ps(_mm256_and_ps(_mm256_set1_ps(-1.0f), is_bigger), adj);
+- // scale by distance, sign
+- excess = _mm256_mul_ps(_mm256_mul_ps(excess, adj), distance);
+- output = _mm256_add_ps(input, excess);
+- _mm256_store_ps(outPtr, output);
+- inPtr += 8;
+- outPtr += 8;
+- }
+-
+- size_t cnt;
+- for(cnt = eight_points * 8; cnt < num_points; cnt++){
+- float val = inputVector[cnt];
+- if(val < lower_bound){
+- float excess = lower_bound - val;
+- signed int count = (int)(excess/dist);
+- outputVector[cnt] = val + (count+1)*dist;
++static inline void volk_32f_s32f_s32f_mod_range_32f_a_avx(float* outputVector,
++ const float* inputVector,
++ const float lower_bound,
++ const float upper_bound,
++ unsigned int num_points)
++{
++ __m256 lower = _mm256_set1_ps(lower_bound);
++ __m256 upper = _mm256_set1_ps(upper_bound);
++ __m256 distance = _mm256_sub_ps(upper, lower);
++ float dist = upper_bound - lower_bound;
++ __m256 input, output;
++ __m256 is_smaller, is_bigger;
++ __m256 excess, adj;
++
++ const float* inPtr = inputVector;
++ float* outPtr = outputVector;
++ size_t eight_points = num_points / 8;
++ size_t counter;
++ for (counter = 0; counter < eight_points; counter++) {
++ input = _mm256_load_ps(inPtr);
++ // calculate mask: input < lower, input > upper
++ is_smaller = _mm256_cmp_ps(
++ input, lower, _CMP_LT_OQ); // 0x11: Less than, ordered, non-signalling
++ is_bigger = _mm256_cmp_ps(
++ input, upper, _CMP_GT_OQ); // 0x1e: greater than, ordered, non-signalling
++ // find out how far we are out-of-bound – positive values!
++ excess = _mm256_and_ps(_mm256_sub_ps(lower, input), is_smaller);
++ excess =
++ _mm256_or_ps(_mm256_and_ps(_mm256_sub_ps(input, upper), is_bigger), excess);
++ // how many do we have to add? (int(excess/distance+1)*distance)
++ excess = _mm256_div_ps(excess, distance);
++ // round down
++ excess = _mm256_cvtepi32_ps(_mm256_cvttps_epi32(excess));
++ // plus 1
++ adj = _mm256_set1_ps(1.0f);
++ excess = _mm256_add_ps(excess, adj);
++ // get the sign right, adj is still {1.0f,1.0f,1.0f,1.0f}
++ adj = _mm256_and_ps(adj, is_smaller);
++ adj = _mm256_or_ps(_mm256_and_ps(_mm256_set1_ps(-1.0f), is_bigger), adj);
++ // scale by distance, sign
++ excess = _mm256_mul_ps(_mm256_mul_ps(excess, adj), distance);
++ output = _mm256_add_ps(input, excess);
++ _mm256_store_ps(outPtr, output);
++ inPtr += 8;
++ outPtr += 8;
+ }
+- else if(val > upper_bound){
+- float excess = val - upper_bound;
+- signed int count = (int)(excess/dist);
+- outputVector[cnt] = val - (count+1)*dist;
++
++ size_t cnt;
++ for (cnt = eight_points * 8; cnt < num_points; cnt++) {
++ float val = inputVector[cnt];
++ if (val < lower_bound) {
++ float excess = lower_bound - val;
++ signed int count = (int)(excess / dist);
++ outputVector[cnt] = val + (count + 1) * dist;
++ } else if (val > upper_bound) {
++ float excess = val - upper_bound;
++ signed int count = (int)(excess / dist);
++ outputVector[cnt] = val - (count + 1) * dist;
++ } else
++ outputVector[cnt] = val;
+ }
+- else
+- outputVector[cnt] = val;
+- }
+ }
+ #endif /* LV_HAVE_AVX */
+
+@@ -164,268 +176,282 @@ static inline void volk_32f_s32f_s32f_mod_range_32f_a_avx(float* outputVector, c
+ #ifdef LV_HAVE_SSE2
+ #include <xmmintrin.h>
+
+-static inline void volk_32f_s32f_s32f_mod_range_32f_u_sse2(float* outputVector, const float* inputVector, const float lower_bound, const float upper_bound, unsigned int num_points){
+- __m128 lower = _mm_set_ps1(lower_bound);
+- __m128 upper = _mm_set_ps1(upper_bound);
+- __m128 distance = _mm_sub_ps(upper,lower);
+- float dist = upper_bound - lower_bound;
+- __m128 input, output;
+- __m128 is_smaller, is_bigger;
+- __m128 excess, adj;
+-
+- const float *inPtr = inputVector;
+- float *outPtr = outputVector;
+- size_t quarter_points = num_points / 4;
+- size_t counter;
+- for(counter = 0; counter < quarter_points; counter++) {
+- input = _mm_load_ps(inPtr);
+- // calculate mask: input < lower, input > upper
+- is_smaller = _mm_cmplt_ps(input, lower);
+- is_bigger = _mm_cmpgt_ps(input, upper);
+- // find out how far we are out-of-bound – positive values!
+- excess = _mm_and_ps(_mm_sub_ps(lower, input), is_smaller);
+- excess = _mm_or_ps(_mm_and_ps(_mm_sub_ps(input, upper), is_bigger), excess);
+- // how many do we have to add? (int(excess/distance+1)*distance)
+- excess = _mm_div_ps(excess, distance);
+- // round down
+- excess = _mm_cvtepi32_ps(_mm_cvttps_epi32(excess));
+- // plus 1
+- adj = _mm_set_ps1(1.0f);
+- excess = _mm_add_ps(excess, adj);
+- // get the sign right, adj is still {1.0f,1.0f,1.0f,1.0f}
+- adj = _mm_and_ps(adj, is_smaller);
+- adj = _mm_or_ps(_mm_and_ps(_mm_set_ps1(-1.0f), is_bigger), adj);
+- // scale by distance, sign
+- excess = _mm_mul_ps(_mm_mul_ps(excess, adj), distance);
+- output = _mm_add_ps(input, excess);
+- _mm_store_ps(outPtr, output);
+- inPtr += 4;
+- outPtr += 4;
+- }
+-
+- size_t cnt;
+- for(cnt = quarter_points * 4; cnt < num_points; cnt++){
+- float val = inputVector[cnt];
+- if(val < lower_bound){
+- float excess = lower_bound - val;
+- signed int count = (int)(excess/dist);
+- outputVector[cnt] = val + (count+1)*dist;
++static inline void volk_32f_s32f_s32f_mod_range_32f_u_sse2(float* outputVector,
++ const float* inputVector,
++ const float lower_bound,
++ const float upper_bound,
++ unsigned int num_points)
++{
++ __m128 lower = _mm_set_ps1(lower_bound);
++ __m128 upper = _mm_set_ps1(upper_bound);
++ __m128 distance = _mm_sub_ps(upper, lower);
++ float dist = upper_bound - lower_bound;
++ __m128 input, output;
++ __m128 is_smaller, is_bigger;
++ __m128 excess, adj;
++
++ const float* inPtr = inputVector;
++ float* outPtr = outputVector;
++ size_t quarter_points = num_points / 4;
++ size_t counter;
++ for (counter = 0; counter < quarter_points; counter++) {
++ input = _mm_load_ps(inPtr);
++ // calculate mask: input < lower, input > upper
++ is_smaller = _mm_cmplt_ps(input, lower);
++ is_bigger = _mm_cmpgt_ps(input, upper);
++ // find out how far we are out-of-bound – positive values!
++ excess = _mm_and_ps(_mm_sub_ps(lower, input), is_smaller);
++ excess = _mm_or_ps(_mm_and_ps(_mm_sub_ps(input, upper), is_bigger), excess);
++ // how many do we have to add? (int(excess/distance+1)*distance)
++ excess = _mm_div_ps(excess, distance);
++ // round down
++ excess = _mm_cvtepi32_ps(_mm_cvttps_epi32(excess));
++ // plus 1
++ adj = _mm_set_ps1(1.0f);
++ excess = _mm_add_ps(excess, adj);
++ // get the sign right, adj is still {1.0f,1.0f,1.0f,1.0f}
++ adj = _mm_and_ps(adj, is_smaller);
++ adj = _mm_or_ps(_mm_and_ps(_mm_set_ps1(-1.0f), is_bigger), adj);
++ // scale by distance, sign
++ excess = _mm_mul_ps(_mm_mul_ps(excess, adj), distance);
++ output = _mm_add_ps(input, excess);
++ _mm_store_ps(outPtr, output);
++ inPtr += 4;
++ outPtr += 4;
+ }
+- else if(val > upper_bound){
+- float excess = val - upper_bound;
+- signed int count = (int)(excess/dist);
+- outputVector[cnt] = val - (count+1)*dist;
++
++ size_t cnt;
++ for (cnt = quarter_points * 4; cnt < num_points; cnt++) {
++ float val = inputVector[cnt];
++ if (val < lower_bound) {
++ float excess = lower_bound - val;
++ signed int count = (int)(excess / dist);
++ outputVector[cnt] = val + (count + 1) * dist;
++ } else if (val > upper_bound) {
++ float excess = val - upper_bound;
++ signed int count = (int)(excess / dist);
++ outputVector[cnt] = val - (count + 1) * dist;
++ } else
++ outputVector[cnt] = val;
+ }
+- else
+- outputVector[cnt] = val;
+- }
+ }
+-static inline void volk_32f_s32f_s32f_mod_range_32f_a_sse2(float* outputVector, const float* inputVector, const float lower_bound, const float upper_bound, unsigned int num_points){
+- __m128 lower = _mm_set_ps1(lower_bound);
+- __m128 upper = _mm_set_ps1(upper_bound);
+- __m128 distance = _mm_sub_ps(upper,lower);
+- __m128 input, output;
+- __m128 is_smaller, is_bigger;
+- __m128 excess, adj;
+-
+- const float *inPtr = inputVector;
+- float *outPtr = outputVector;
+- size_t quarter_points = num_points / 4;
+- size_t counter;
+- for(counter = 0; counter < quarter_points; counter++) {
+- input = _mm_load_ps(inPtr);
+- // calculate mask: input < lower, input > upper
+- is_smaller = _mm_cmplt_ps(input, lower);
+- is_bigger = _mm_cmpgt_ps(input, upper);
+- // find out how far we are out-of-bound – positive values!
+- excess = _mm_and_ps(_mm_sub_ps(lower, input), is_smaller);
+- excess = _mm_or_ps(_mm_and_ps(_mm_sub_ps(input, upper), is_bigger), excess);
+- // how many do we have to add? (int(excess/distance+1)*distance)
+- excess = _mm_div_ps(excess, distance);
+- // round down – for some reason, SSE doesn't come with a 4x float -> 4x int32 conversion.
+- excess = _mm_cvtepi32_ps(_mm_cvttps_epi32(excess));
+- // plus 1
+- adj = _mm_set_ps1(1.0f);
+- excess = _mm_add_ps(excess, adj);
+- // get the sign right, adj is still {1.0f,1.0f,1.0f,1.0f}
+- adj = _mm_and_ps(adj, is_smaller);
+- adj = _mm_or_ps(_mm_and_ps(_mm_set_ps1(-1.0f), is_bigger), adj);
+- // scale by distance, sign
+- excess = _mm_mul_ps(_mm_mul_ps(excess, adj), distance);
+- output = _mm_add_ps(input, excess);
+- _mm_store_ps(outPtr, output);
+- inPtr += 4;
+- outPtr += 4;
+- }
+-
+- float dist = upper_bound - lower_bound;
+- size_t cnt;
+- for(cnt = quarter_points * 4; cnt < num_points; cnt++){
+- float val = inputVector[cnt];
+- if(val < lower_bound){
+- float excess = lower_bound - val;
+- signed int count = (int)(excess/dist);
+- outputVector[cnt] = val + (count+1)*dist;
++static inline void volk_32f_s32f_s32f_mod_range_32f_a_sse2(float* outputVector,
++ const float* inputVector,
++ const float lower_bound,
++ const float upper_bound,
++ unsigned int num_points)
++{
++ __m128 lower = _mm_set_ps1(lower_bound);
++ __m128 upper = _mm_set_ps1(upper_bound);
++ __m128 distance = _mm_sub_ps(upper, lower);
++ __m128 input, output;
++ __m128 is_smaller, is_bigger;
++ __m128 excess, adj;
++
++ const float* inPtr = inputVector;
++ float* outPtr = outputVector;
++ size_t quarter_points = num_points / 4;
++ size_t counter;
++ for (counter = 0; counter < quarter_points; counter++) {
++ input = _mm_load_ps(inPtr);
++ // calculate mask: input < lower, input > upper
++ is_smaller = _mm_cmplt_ps(input, lower);
++ is_bigger = _mm_cmpgt_ps(input, upper);
++ // find out how far we are out-of-bound – positive values!
++ excess = _mm_and_ps(_mm_sub_ps(lower, input), is_smaller);
++ excess = _mm_or_ps(_mm_and_ps(_mm_sub_ps(input, upper), is_bigger), excess);
++ // how many do we have to add? (int(excess/distance+1)*distance)
++ excess = _mm_div_ps(excess, distance);
++ // round down – for some reason, SSE doesn't come with a 4x float -> 4x int32
++ // conversion.
++ excess = _mm_cvtepi32_ps(_mm_cvttps_epi32(excess));
++ // plus 1
++ adj = _mm_set_ps1(1.0f);
++ excess = _mm_add_ps(excess, adj);
++ // get the sign right, adj is still {1.0f,1.0f,1.0f,1.0f}
++ adj = _mm_and_ps(adj, is_smaller);
++ adj = _mm_or_ps(_mm_and_ps(_mm_set_ps1(-1.0f), is_bigger), adj);
++ // scale by distance, sign
++ excess = _mm_mul_ps(_mm_mul_ps(excess, adj), distance);
++ output = _mm_add_ps(input, excess);
++ _mm_store_ps(outPtr, output);
++ inPtr += 4;
++ outPtr += 4;
+ }
+- else if(val > upper_bound){
+- float excess = val - upper_bound;
+- signed int count = (int)(excess/dist);
+- outputVector[cnt] = val - (count+1)*dist;
++
++ float dist = upper_bound - lower_bound;
++ size_t cnt;
++ for (cnt = quarter_points * 4; cnt < num_points; cnt++) {
++ float val = inputVector[cnt];
++ if (val < lower_bound) {
++ float excess = lower_bound - val;
++ signed int count = (int)(excess / dist);
++ outputVector[cnt] = val + (count + 1) * dist;
++ } else if (val > upper_bound) {
++ float excess = val - upper_bound;
++ signed int count = (int)(excess / dist);
++ outputVector[cnt] = val - (count + 1) * dist;
++ } else
++ outputVector[cnt] = val;
+ }
+- else
+- outputVector[cnt] = val;
+- }
+ }
+ #endif /* LV_HAVE_SSE2 */
+
+ #ifdef LV_HAVE_SSE
+ #include <xmmintrin.h>
+
+-static inline void volk_32f_s32f_s32f_mod_range_32f_u_sse(float* outputVector, const float* inputVector, const float lower_bound, const float upper_bound, unsigned int num_points){
+- __m128 lower = _mm_set_ps1(lower_bound);
+- __m128 upper = _mm_set_ps1(upper_bound);
+- __m128 distance = _mm_sub_ps(upper,lower);
+- float dist = upper_bound - lower_bound;
+- __m128 input, output;
+- __m128 is_smaller, is_bigger;
+- __m128 excess, adj;
+- __m128i rounddown;
+-
+- const float *inPtr = inputVector;
+- float *outPtr = outputVector;
+- size_t quarter_points = num_points / 4;
+- size_t counter;
+- for(counter = 0; counter < quarter_points; counter++) {
+- input = _mm_load_ps(inPtr);
+- // calculate mask: input < lower, input > upper
+- is_smaller = _mm_cmplt_ps(input, lower);
+- is_bigger = _mm_cmpgt_ps(input, upper);
+- // find out how far we are out-of-bound – positive values!
+- excess = _mm_and_ps(_mm_sub_ps(lower, input), is_smaller);
+- excess = _mm_or_ps(_mm_and_ps(_mm_sub_ps(input, upper), is_bigger), excess);
+- // how many do we have to add? (int(excess/distance+1)*distance)
+- excess = _mm_div_ps(excess, distance);
+- // round down – for some reason
+- rounddown = _mm_cvttps_epi32(excess);
+- excess = _mm_cvtepi32_ps(rounddown);
+- // plus 1
+- adj = _mm_set_ps1(1.0f);
+- excess = _mm_add_ps(excess, adj);
+- // get the sign right, adj is still {1.0f,1.0f,1.0f,1.0f}
+- adj = _mm_and_ps(adj, is_smaller);
+- adj = _mm_or_ps(_mm_and_ps(_mm_set_ps1(-1.0f), is_bigger), adj);
+- // scale by distance, sign
+- excess = _mm_mul_ps(_mm_mul_ps(excess, adj), distance);
+- output = _mm_add_ps(input, excess);
+- _mm_store_ps(outPtr, output);
+- inPtr += 4;
+- outPtr += 4;
+- }
+-
+- size_t cnt;
+- for(cnt = quarter_points * 4; cnt < num_points; cnt++){
+- float val = inputVector[cnt];
+- if(val < lower_bound){
+- float excess = lower_bound - val;
+- signed int count = (int)(excess/dist);
+- outputVector[cnt] = val + (count+1)*dist;
++static inline void volk_32f_s32f_s32f_mod_range_32f_u_sse(float* outputVector,
++ const float* inputVector,
++ const float lower_bound,
++ const float upper_bound,
++ unsigned int num_points)
++{
++ __m128 lower = _mm_set_ps1(lower_bound);
++ __m128 upper = _mm_set_ps1(upper_bound);
++ __m128 distance = _mm_sub_ps(upper, lower);
++ float dist = upper_bound - lower_bound;
++ __m128 input, output;
++ __m128 is_smaller, is_bigger;
++ __m128 excess, adj;
++ __m128i rounddown;
++
++ const float* inPtr = inputVector;
++ float* outPtr = outputVector;
++ size_t quarter_points = num_points / 4;
++ size_t counter;
++ for (counter = 0; counter < quarter_points; counter++) {
++ input = _mm_load_ps(inPtr);
++ // calculate mask: input < lower, input > upper
++ is_smaller = _mm_cmplt_ps(input, lower);
++ is_bigger = _mm_cmpgt_ps(input, upper);
++ // find out how far we are out-of-bound – positive values!
++ excess = _mm_and_ps(_mm_sub_ps(lower, input), is_smaller);
++ excess = _mm_or_ps(_mm_and_ps(_mm_sub_ps(input, upper), is_bigger), excess);
++ // how many do we have to add? (int(excess/distance+1)*distance)
++ excess = _mm_div_ps(excess, distance);
++ // round down – for some reason
++ rounddown = _mm_cvttps_epi32(excess);
++ excess = _mm_cvtepi32_ps(rounddown);
++ // plus 1
++ adj = _mm_set_ps1(1.0f);
++ excess = _mm_add_ps(excess, adj);
++ // get the sign right, adj is still {1.0f,1.0f,1.0f,1.0f}
++ adj = _mm_and_ps(adj, is_smaller);
++ adj = _mm_or_ps(_mm_and_ps(_mm_set_ps1(-1.0f), is_bigger), adj);
++ // scale by distance, sign
++ excess = _mm_mul_ps(_mm_mul_ps(excess, adj), distance);
++ output = _mm_add_ps(input, excess);
++ _mm_store_ps(outPtr, output);
++ inPtr += 4;
++ outPtr += 4;
+ }
+- else if(val > upper_bound){
+- float excess = val - upper_bound;
+- signed int count = (int)(excess/dist);
+- outputVector[cnt] = val - (count+1)*dist;
++
++ size_t cnt;
++ for (cnt = quarter_points * 4; cnt < num_points; cnt++) {
++ float val = inputVector[cnt];
++ if (val < lower_bound) {
++ float excess = lower_bound - val;
++ signed int count = (int)(excess / dist);
++ outputVector[cnt] = val + (count + 1) * dist;
++ } else if (val > upper_bound) {
++ float excess = val - upper_bound;
++ signed int count = (int)(excess / dist);
++ outputVector[cnt] = val - (count + 1) * dist;
++ } else
++ outputVector[cnt] = val;
+ }
+- else
+- outputVector[cnt] = val;
+- }
+ }
+-static inline void volk_32f_s32f_s32f_mod_range_32f_a_sse(float* outputVector, const float* inputVector, const float lower_bound, const float upper_bound, unsigned int num_points){
+- __m128 lower = _mm_set_ps1(lower_bound);
+- __m128 upper = _mm_set_ps1(upper_bound);
+- __m128 distance = _mm_sub_ps(upper,lower);
+- __m128 input, output;
+- __m128 is_smaller, is_bigger;
+- __m128 excess, adj;
+- __m128i rounddown;
+-
+- const float *inPtr = inputVector;
+- float *outPtr = outputVector;
+- size_t quarter_points = num_points / 4;
+- size_t counter;
+- for(counter = 0; counter < quarter_points; counter++) {
+- input = _mm_load_ps(inPtr);
+- // calculate mask: input < lower, input > upper
+- is_smaller = _mm_cmplt_ps(input, lower);
+- is_bigger = _mm_cmpgt_ps(input, upper);
+- // find out how far we are out-of-bound – positive values!
+- excess = _mm_and_ps(_mm_sub_ps(lower, input), is_smaller);
+- excess = _mm_or_ps(_mm_and_ps(_mm_sub_ps(input, upper), is_bigger), excess);
+- // how many do we have to add? (int(excess/distance+1)*distance)
+- excess = _mm_div_ps(excess, distance);
+- // round down
+- rounddown = _mm_cvttps_epi32(excess);
+- excess = _mm_cvtepi32_ps(rounddown);
+- // plus 1
+- adj = _mm_set_ps1(1.0f);
+- excess = _mm_add_ps(excess, adj);
+- // get the sign right, adj is still {1.0f,1.0f,1.0f,1.0f}
+- adj = _mm_and_ps(adj, is_smaller);
+- adj = _mm_or_ps(_mm_and_ps(_mm_set_ps1(-1.0f), is_bigger), adj);
+- // scale by distance, sign
+- excess = _mm_mul_ps(_mm_mul_ps(excess, adj), distance);
+- output = _mm_add_ps(input, excess);
+- _mm_store_ps(outPtr, output);
+- inPtr += 4;
+- outPtr += 4;
+- }
+-
+- float dist = upper_bound - lower_bound;
+- size_t cnt;
+- for(cnt = quarter_points * 4; cnt < num_points; cnt++){
+- float val = inputVector[cnt];
+- if(val < lower_bound){
+- float excess = lower_bound - val;
+- signed int count = (int)(excess/dist);
+- outputVector[cnt] = val + (count+1)*dist;
++static inline void volk_32f_s32f_s32f_mod_range_32f_a_sse(float* outputVector,
++ const float* inputVector,
++ const float lower_bound,
++ const float upper_bound,
++ unsigned int num_points)
++{
++ __m128 lower = _mm_set_ps1(lower_bound);
++ __m128 upper = _mm_set_ps1(upper_bound);
++ __m128 distance = _mm_sub_ps(upper, lower);
++ __m128 input, output;
++ __m128 is_smaller, is_bigger;
++ __m128 excess, adj;
++ __m128i rounddown;
++
++ const float* inPtr = inputVector;
++ float* outPtr = outputVector;
++ size_t quarter_points = num_points / 4;
++ size_t counter;
++ for (counter = 0; counter < quarter_points; counter++) {
++ input = _mm_load_ps(inPtr);
++ // calculate mask: input < lower, input > upper
++ is_smaller = _mm_cmplt_ps(input, lower);
++ is_bigger = _mm_cmpgt_ps(input, upper);
++ // find out how far we are out-of-bound – positive values!
++ excess = _mm_and_ps(_mm_sub_ps(lower, input), is_smaller);
++ excess = _mm_or_ps(_mm_and_ps(_mm_sub_ps(input, upper), is_bigger), excess);
++ // how many do we have to add? (int(excess/distance+1)*distance)
++ excess = _mm_div_ps(excess, distance);
++ // round down
++ rounddown = _mm_cvttps_epi32(excess);
++ excess = _mm_cvtepi32_ps(rounddown);
++ // plus 1
++ adj = _mm_set_ps1(1.0f);
++ excess = _mm_add_ps(excess, adj);
++ // get the sign right, adj is still {1.0f,1.0f,1.0f,1.0f}
++ adj = _mm_and_ps(adj, is_smaller);
++ adj = _mm_or_ps(_mm_and_ps(_mm_set_ps1(-1.0f), is_bigger), adj);
++ // scale by distance, sign
++ excess = _mm_mul_ps(_mm_mul_ps(excess, adj), distance);
++ output = _mm_add_ps(input, excess);
++ _mm_store_ps(outPtr, output);
++ inPtr += 4;
++ outPtr += 4;
+ }
+- else if(val > upper_bound){
+- float excess = val - upper_bound;
+- signed int count = (int)(excess/dist);
+- outputVector[cnt] = val - (count+1)*dist;
++
++ float dist = upper_bound - lower_bound;
++ size_t cnt;
++ for (cnt = quarter_points * 4; cnt < num_points; cnt++) {
++ float val = inputVector[cnt];
++ if (val < lower_bound) {
++ float excess = lower_bound - val;
++ signed int count = (int)(excess / dist);
++ outputVector[cnt] = val + (count + 1) * dist;
++ } else if (val > upper_bound) {
++ float excess = val - upper_bound;
++ signed int count = (int)(excess / dist);
++ outputVector[cnt] = val - (count + 1) * dist;
++ } else
++ outputVector[cnt] = val;
+ }
+- else
+- outputVector[cnt] = val;
+- }
+ }
+ #endif /* LV_HAVE_SSE */
+
+ #ifdef LV_HAVE_GENERIC
+
+-static inline void volk_32f_s32f_s32f_mod_range_32f_generic(float* outputVector, const float* inputVector, const float lower_bound, const float upper_bound, unsigned int num_points){
+- float* outPtr = outputVector;
+- const float *inPtr;
+- float distance = upper_bound - lower_bound;
+-
+- for(inPtr = inputVector; inPtr < inputVector + num_points; inPtr++){
+- float val = *inPtr;
+- if(val < lower_bound){
+- float excess = lower_bound - val;
+- signed int count = (int)(excess/distance);
+- *outPtr = val + (count+1)*distance;
+- }
+- else if(val > upper_bound){
+- float excess = val - upper_bound;
+- signed int count = (int)(excess/distance);
+- *outPtr = val - (count+1)*distance;
++static inline void volk_32f_s32f_s32f_mod_range_32f_generic(float* outputVector,
++ const float* inputVector,
++ const float lower_bound,
++ const float upper_bound,
++ unsigned int num_points)
++{
++ float* outPtr = outputVector;
++ const float* inPtr;
++ float distance = upper_bound - lower_bound;
++
++ for (inPtr = inputVector; inPtr < inputVector + num_points; inPtr++) {
++ float val = *inPtr;
++ if (val < lower_bound) {
++ float excess = lower_bound - val;
++ signed int count = (int)(excess / distance);
++ *outPtr = val + (count + 1) * distance;
++ } else if (val > upper_bound) {
++ float excess = val - upper_bound;
++ signed int count = (int)(excess / distance);
++ *outPtr = val - (count + 1) * distance;
++ } else
++ *outPtr = val;
++ outPtr++;
+ }
+- else
+- *outPtr = val;
+- outPtr++;
+- }
+ }
+ #endif /* LV_HAVE_GENERIC */
+
+
+-
+-
+ #endif /* INCLUDED_VOLK_32F_S32F_S32F_MOD_RANGE_32F_A_H */
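For reference, every variant of volk_32f_s32f_s32f_mod_range_32f reformatted above computes the same thing: each input is shifted by whole multiples of (upper_bound - lower_bound) until it falls inside the range. A minimal scalar sketch of that logic, written as a hypothetical standalone helper rather than VOLK's actual dispatcher entry point, looks like this:

/* Illustrative only -- not taken from the kernel source. */
#include <stdio.h>

static float mod_range(float val, float lower_bound, float upper_bound)
{
    float distance = upper_bound - lower_bound;
    if (val < lower_bound) {
        /* whole periods below the range, then shift up by one more */
        int count = (int)((lower_bound - val) / distance);
        return val + (count + 1) * distance;
    } else if (val > upper_bound) {
        /* whole periods above the range, then shift down by one more */
        int count = (int)((val - upper_bound) / distance);
        return val - (count + 1) * distance;
    }
    return val;
}

int main(void)
{
    /* wrap a phase-like value into [-pi, pi]: prints ~1.216815 */
    printf("%f\n", mod_range(7.5f, -3.14159265f, 3.14159265f));
    return 0;
}

The SIMD variants do the same arithmetic four lanes at a time, using the compare masks (is_smaller/is_bigger) instead of branches to pick the sign of the correction.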
+diff --git a/kernels/volk/volk_32f_s32f_stddev_32f.h b/kernels/volk/volk_32f_s32f_stddev_32f.h
+index 4f3dc1c..0a1c32b 100644
+--- a/kernels/volk/volk_32f_s32f_stddev_32f.h
++++ b/kernels/volk/volk_32f_s32f_stddev_32f.h
+@@ -29,8 +29,8 @@
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+- * void volk_32f_s32f_stddev_32f(float* stddev, const float* inputBuffer, const float mean, unsigned int num_points)
+- * \endcode
++ * void volk_32f_s32f_stddev_32f(float* stddev, const float* inputBuffer, const float
++ * mean, unsigned int num_points) \endcode
+ *
+ * \b Inputs
+ * \li inputBuffer: The input vector of floats.
+@@ -68,65 +68,72 @@
+ #ifndef INCLUDED_volk_32f_s32f_stddev_32f_a_H
+ #define INCLUDED_volk_32f_s32f_stddev_32f_a_H
+
+-#include <volk/volk_common.h>
+ #include <inttypes.h>
+-#include <stdio.h>
+ #include <math.h>
++#include <stdio.h>
++#include <volk/volk_common.h>
+
+ #ifdef LV_HAVE_SSE4_1
+ #include <smmintrin.h>
+
+-static inline void
+-volk_32f_s32f_stddev_32f_a_sse4_1(float* stddev, const float* inputBuffer,
+- const float mean, unsigned int num_points)
++static inline void volk_32f_s32f_stddev_32f_a_sse4_1(float* stddev,
++ const float* inputBuffer,
++ const float mean,
++ unsigned int num_points)
+ {
+- float returnValue = 0;
+- if(num_points > 0){
+- unsigned int number = 0;
+- const unsigned int sixteenthPoints = num_points / 16;
+-
+- const float* aPtr = inputBuffer;
+-
+- __VOLK_ATTR_ALIGNED(16) float squareBuffer[4];
+-
+- __m128 squareAccumulator = _mm_setzero_ps();
+- __m128 aVal1, aVal2, aVal3, aVal4;
+- __m128 cVal1, cVal2, cVal3, cVal4;
+- for(;number < sixteenthPoints; number++) {
+- aVal1 = _mm_load_ps(aPtr); aPtr += 4;
+- cVal1 = _mm_dp_ps(aVal1, aVal1, 0xF1);
+-
+- aVal2 = _mm_load_ps(aPtr); aPtr += 4;
+- cVal2 = _mm_dp_ps(aVal2, aVal2, 0xF2);
+-
+- aVal3 = _mm_load_ps(aPtr); aPtr += 4;
+- cVal3 = _mm_dp_ps(aVal3, aVal3, 0xF4);
+-
+- aVal4 = _mm_load_ps(aPtr); aPtr += 4;
+- cVal4 = _mm_dp_ps(aVal4, aVal4, 0xF8);
+-
+- cVal1 = _mm_or_ps(cVal1, cVal2);
+- cVal3 = _mm_or_ps(cVal3, cVal4);
+- cVal1 = _mm_or_ps(cVal1, cVal3);
+-
+- squareAccumulator = _mm_add_ps(squareAccumulator, cVal1); // squareAccumulator += x^2
++ float returnValue = 0;
++ if (num_points > 0) {
++ unsigned int number = 0;
++ const unsigned int sixteenthPoints = num_points / 16;
++
++ const float* aPtr = inputBuffer;
++
++ __VOLK_ATTR_ALIGNED(16) float squareBuffer[4];
++
++ __m128 squareAccumulator = _mm_setzero_ps();
++ __m128 aVal1, aVal2, aVal3, aVal4;
++ __m128 cVal1, cVal2, cVal3, cVal4;
++ for (; number < sixteenthPoints; number++) {
++ aVal1 = _mm_load_ps(aPtr);
++ aPtr += 4;
++ cVal1 = _mm_dp_ps(aVal1, aVal1, 0xF1);
++
++ aVal2 = _mm_load_ps(aPtr);
++ aPtr += 4;
++ cVal2 = _mm_dp_ps(aVal2, aVal2, 0xF2);
++
++ aVal3 = _mm_load_ps(aPtr);
++ aPtr += 4;
++ cVal3 = _mm_dp_ps(aVal3, aVal3, 0xF4);
++
++ aVal4 = _mm_load_ps(aPtr);
++ aPtr += 4;
++ cVal4 = _mm_dp_ps(aVal4, aVal4, 0xF8);
++
++ cVal1 = _mm_or_ps(cVal1, cVal2);
++ cVal3 = _mm_or_ps(cVal3, cVal4);
++ cVal1 = _mm_or_ps(cVal1, cVal3);
++
++ squareAccumulator =
++ _mm_add_ps(squareAccumulator, cVal1); // squareAccumulator += x^2
++ }
++ _mm_store_ps(squareBuffer,
++ squareAccumulator); // Store the results back into the C container
++ returnValue = squareBuffer[0];
++ returnValue += squareBuffer[1];
++ returnValue += squareBuffer[2];
++ returnValue += squareBuffer[3];
++
++ number = sixteenthPoints * 16;
++ for (; number < num_points; number++) {
++ returnValue += (*aPtr) * (*aPtr);
++ aPtr++;
++ }
++ returnValue /= num_points;
++ returnValue -= (mean * mean);
++ returnValue = sqrtf(returnValue);
+ }
+- _mm_store_ps(squareBuffer,squareAccumulator); // Store the results back into the C container
+- returnValue = squareBuffer[0];
+- returnValue += squareBuffer[1];
+- returnValue += squareBuffer[2];
+- returnValue += squareBuffer[3];
+-
+- number = sixteenthPoints * 16;
+- for(;number < num_points; number++){
+- returnValue += (*aPtr) * (*aPtr);
+- aPtr++;
+- }
+- returnValue /= num_points;
+- returnValue -= (mean * mean);
+- returnValue = sqrtf(returnValue);
+- }
+- *stddev = returnValue;
++ *stddev = returnValue;
+ }
+
+ #endif /* LV_HAVE_SSE4_1 */
+@@ -134,43 +141,45 @@ volk_32f_s32f_stddev_32f_a_sse4_1(float* stddev, const float* inputBuffer,
+ #ifdef LV_HAVE_SSE
+ #include <xmmintrin.h>
+
+-static inline void
+-volk_32f_s32f_stddev_32f_a_sse(float* stddev, const float* inputBuffer,
+- const float mean, unsigned int num_points)
++static inline void volk_32f_s32f_stddev_32f_a_sse(float* stddev,
++ const float* inputBuffer,
++ const float mean,
++ unsigned int num_points)
+ {
+- float returnValue = 0;
+- if(num_points > 0){
+- unsigned int number = 0;
+- const unsigned int quarterPoints = num_points / 4;
+-
+- const float* aPtr = inputBuffer;
+-
+- __VOLK_ATTR_ALIGNED(16) float squareBuffer[4];
+-
+- __m128 squareAccumulator = _mm_setzero_ps();
+- __m128 aVal = _mm_setzero_ps();
+- for(;number < quarterPoints; number++) {
+- aVal = _mm_load_ps(aPtr); // aVal = x
+- aVal = _mm_mul_ps(aVal, aVal); // squareAccumulator += x^2
+- squareAccumulator = _mm_add_ps(squareAccumulator, aVal);
+- aPtr += 4;
+- }
+- _mm_store_ps(squareBuffer,squareAccumulator); // Store the results back into the C container
+- returnValue = squareBuffer[0];
+- returnValue += squareBuffer[1];
+- returnValue += squareBuffer[2];
+- returnValue += squareBuffer[3];
+-
+- number = quarterPoints * 4;
+- for(;number < num_points; number++){
+- returnValue += (*aPtr) * (*aPtr);
+- aPtr++;
++ float returnValue = 0;
++ if (num_points > 0) {
++ unsigned int number = 0;
++ const unsigned int quarterPoints = num_points / 4;
++
++ const float* aPtr = inputBuffer;
++
++ __VOLK_ATTR_ALIGNED(16) float squareBuffer[4];
++
++ __m128 squareAccumulator = _mm_setzero_ps();
++ __m128 aVal = _mm_setzero_ps();
++ for (; number < quarterPoints; number++) {
++ aVal = _mm_load_ps(aPtr); // aVal = x
++ aVal = _mm_mul_ps(aVal, aVal); // squareAccumulator += x^2
++ squareAccumulator = _mm_add_ps(squareAccumulator, aVal);
++ aPtr += 4;
++ }
++ _mm_store_ps(squareBuffer,
++ squareAccumulator); // Store the results back into the C container
++ returnValue = squareBuffer[0];
++ returnValue += squareBuffer[1];
++ returnValue += squareBuffer[2];
++ returnValue += squareBuffer[3];
++
++ number = quarterPoints * 4;
++ for (; number < num_points; number++) {
++ returnValue += (*aPtr) * (*aPtr);
++ aPtr++;
++ }
++ returnValue /= num_points;
++ returnValue -= (mean * mean);
++ returnValue = sqrtf(returnValue);
+ }
+- returnValue /= num_points;
+- returnValue -= (mean * mean);
+- returnValue = sqrtf(returnValue);
+- }
+- *stddev = returnValue;
++ *stddev = returnValue;
+ }
+ #endif /* LV_HAVE_SSE */
+
+@@ -178,86 +187,93 @@ volk_32f_s32f_stddev_32f_a_sse(float* stddev, const float* inputBuffer,
+ #ifdef LV_HAVE_AVX
+ #include <immintrin.h>
+
+-static inline void
+-volk_32f_s32f_stddev_32f_a_avx(float* stddev, const float* inputBuffer,
+- const float mean, unsigned int num_points)
++static inline void volk_32f_s32f_stddev_32f_a_avx(float* stddev,
++ const float* inputBuffer,
++ const float mean,
++ unsigned int num_points)
+ {
+- float stdDev = 0;
+- if(num_points > 0){
+- unsigned int number = 0;
+- const unsigned int thirtySecondthPoints = num_points / 32;
+-
+- const float* aPtr = inputBuffer;
+- __VOLK_ATTR_ALIGNED(32) float squareBuffer[8];
+-
+- __m256 squareAccumulator = _mm256_setzero_ps();
+- __m256 aVal1, aVal2, aVal3, aVal4;
+- __m256 cVal1, cVal2, cVal3, cVal4;
+- for(;number < thirtySecondthPoints; number++) {
+- aVal1 = _mm256_load_ps(aPtr); aPtr += 8;
+- cVal1 = _mm256_dp_ps(aVal1, aVal1, 0xF1);
+-
+- aVal2 = _mm256_load_ps(aPtr); aPtr += 8;
+- cVal2 = _mm256_dp_ps(aVal2, aVal2, 0xF2);
+-
+- aVal3 = _mm256_load_ps(aPtr); aPtr += 8;
+- cVal3 = _mm256_dp_ps(aVal3, aVal3, 0xF4);
+-
+- aVal4 = _mm256_load_ps(aPtr); aPtr += 8;
+- cVal4 = _mm256_dp_ps(aVal4, aVal4, 0xF8);
+-
+- cVal1 = _mm256_or_ps(cVal1, cVal2);
+- cVal3 = _mm256_or_ps(cVal3, cVal4);
+- cVal1 = _mm256_or_ps(cVal1, cVal3);
+-
+- squareAccumulator = _mm256_add_ps(squareAccumulator, cVal1); // squareAccumulator += x^2
++ float stdDev = 0;
++ if (num_points > 0) {
++ unsigned int number = 0;
++ const unsigned int thirtySecondthPoints = num_points / 32;
++
++ const float* aPtr = inputBuffer;
++ __VOLK_ATTR_ALIGNED(32) float squareBuffer[8];
++
++ __m256 squareAccumulator = _mm256_setzero_ps();
++ __m256 aVal1, aVal2, aVal3, aVal4;
++ __m256 cVal1, cVal2, cVal3, cVal4;
++ for (; number < thirtySecondthPoints; number++) {
++ aVal1 = _mm256_load_ps(aPtr);
++ aPtr += 8;
++ cVal1 = _mm256_dp_ps(aVal1, aVal1, 0xF1);
++
++ aVal2 = _mm256_load_ps(aPtr);
++ aPtr += 8;
++ cVal2 = _mm256_dp_ps(aVal2, aVal2, 0xF2);
++
++ aVal3 = _mm256_load_ps(aPtr);
++ aPtr += 8;
++ cVal3 = _mm256_dp_ps(aVal3, aVal3, 0xF4);
++
++ aVal4 = _mm256_load_ps(aPtr);
++ aPtr += 8;
++ cVal4 = _mm256_dp_ps(aVal4, aVal4, 0xF8);
++
++ cVal1 = _mm256_or_ps(cVal1, cVal2);
++ cVal3 = _mm256_or_ps(cVal3, cVal4);
++ cVal1 = _mm256_or_ps(cVal1, cVal3);
++
++ squareAccumulator =
++ _mm256_add_ps(squareAccumulator, cVal1); // squareAccumulator += x^2
++ }
++ _mm256_store_ps(squareBuffer,
++ squareAccumulator); // Store the results back into the C container
++ stdDev = squareBuffer[0];
++ stdDev += squareBuffer[1];
++ stdDev += squareBuffer[2];
++ stdDev += squareBuffer[3];
++ stdDev += squareBuffer[4];
++ stdDev += squareBuffer[5];
++ stdDev += squareBuffer[6];
++ stdDev += squareBuffer[7];
++
++ number = thirtySecondthPoints * 32;
++ for (; number < num_points; number++) {
++ stdDev += (*aPtr) * (*aPtr);
++ aPtr++;
++ }
++ stdDev /= num_points;
++ stdDev -= (mean * mean);
++ stdDev = sqrtf(stdDev);
+ }
+- _mm256_store_ps(squareBuffer,squareAccumulator); // Store the results back into the C container
+- stdDev = squareBuffer[0];
+- stdDev += squareBuffer[1];
+- stdDev += squareBuffer[2];
+- stdDev += squareBuffer[3];
+- stdDev += squareBuffer[4];
+- stdDev += squareBuffer[5];
+- stdDev += squareBuffer[6];
+- stdDev += squareBuffer[7];
+-
+- number = thirtySecondthPoints * 32;
+- for(;number < num_points; number++){
+- stdDev += (*aPtr) * (*aPtr);
+- aPtr++;
+- }
+- stdDev /= num_points;
+- stdDev -= (mean * mean);
+- stdDev = sqrtf(stdDev);
+- }
+- *stddev = stdDev;
+-
++ *stddev = stdDev;
+ }
+ #endif /* LV_HAVE_AVX */
+
+
+ #ifdef LV_HAVE_GENERIC
+
+-static inline void
+-volk_32f_s32f_stddev_32f_generic(float* stddev, const float* inputBuffer,
+- const float mean, unsigned int num_points)
++static inline void volk_32f_s32f_stddev_32f_generic(float* stddev,
++ const float* inputBuffer,
++ const float mean,
++ unsigned int num_points)
+ {
+- float returnValue = 0;
+- if(num_points > 0){
+- const float* aPtr = inputBuffer;
+- unsigned int number = 0;
+-
+- for(number = 0; number < num_points; number++){
+- returnValue += (*aPtr) * (*aPtr);
+- aPtr++;
++ float returnValue = 0;
++ if (num_points > 0) {
++ const float* aPtr = inputBuffer;
++ unsigned int number = 0;
++
++ for (number = 0; number < num_points; number++) {
++ returnValue += (*aPtr) * (*aPtr);
++ aPtr++;
++ }
++
++ returnValue /= num_points;
++ returnValue -= (mean * mean);
++ returnValue = sqrtf(returnValue);
+ }
+-
+- returnValue /= num_points;
+- returnValue -= (mean * mean);
+- returnValue = sqrtf(returnValue);
+- }
+- *stddev = returnValue;
++ *stddev = returnValue;
+ }
+
+ #endif /* LV_HAVE_GENERIC */
+@@ -268,69 +284,76 @@ volk_32f_s32f_stddev_32f_generic(float* stddev, const float* inputBuffer,
+ #ifndef INCLUDED_volk_32f_s32f_stddev_32f_u_H
+ #define INCLUDED_volk_32f_s32f_stddev_32f_u_H
+
+-#include <volk/volk_common.h>
+ #include <inttypes.h>
+-#include <stdio.h>
+ #include <math.h>
++#include <stdio.h>
++#include <volk/volk_common.h>
+
+ #ifdef LV_HAVE_AVX
+ #include <immintrin.h>
+
+-static inline void
+-volk_32f_s32f_stddev_32f_u_avx(float* stddev, const float* inputBuffer,
+- const float mean, unsigned int num_points)
++static inline void volk_32f_s32f_stddev_32f_u_avx(float* stddev,
++ const float* inputBuffer,
++ const float mean,
++ unsigned int num_points)
+ {
+- float stdDev = 0;
+- if(num_points > 0){
+- unsigned int number = 0;
+- const unsigned int thirtySecondthPoints = num_points / 32;
+-
+- const float* aPtr = inputBuffer;
+- __VOLK_ATTR_ALIGNED(32) float squareBuffer[8];
+-
+- __m256 squareAccumulator = _mm256_setzero_ps();
+- __m256 aVal1, aVal2, aVal3, aVal4;
+- __m256 cVal1, cVal2, cVal3, cVal4;
+- for(;number < thirtySecondthPoints; number++) {
+- aVal1 = _mm256_loadu_ps(aPtr); aPtr += 8;
+- cVal1 = _mm256_dp_ps(aVal1, aVal1, 0xF1);
+-
+- aVal2 = _mm256_loadu_ps(aPtr); aPtr += 8;
+- cVal2 = _mm256_dp_ps(aVal2, aVal2, 0xF2);
+-
+- aVal3 = _mm256_loadu_ps(aPtr); aPtr += 8;
+- cVal3 = _mm256_dp_ps(aVal3, aVal3, 0xF4);
+-
+- aVal4 = _mm256_loadu_ps(aPtr); aPtr += 8;
+- cVal4 = _mm256_dp_ps(aVal4, aVal4, 0xF8);
+-
+- cVal1 = _mm256_or_ps(cVal1, cVal2);
+- cVal3 = _mm256_or_ps(cVal3, cVal4);
+- cVal1 = _mm256_or_ps(cVal1, cVal3);
+-
+- squareAccumulator = _mm256_add_ps(squareAccumulator, cVal1); // squareAccumulator += x^2
++ float stdDev = 0;
++ if (num_points > 0) {
++ unsigned int number = 0;
++ const unsigned int thirtySecondthPoints = num_points / 32;
++
++ const float* aPtr = inputBuffer;
++ __VOLK_ATTR_ALIGNED(32) float squareBuffer[8];
++
++ __m256 squareAccumulator = _mm256_setzero_ps();
++ __m256 aVal1, aVal2, aVal3, aVal4;
++ __m256 cVal1, cVal2, cVal3, cVal4;
++ for (; number < thirtySecondthPoints; number++) {
++ aVal1 = _mm256_loadu_ps(aPtr);
++ aPtr += 8;
++ cVal1 = _mm256_dp_ps(aVal1, aVal1, 0xF1);
++
++ aVal2 = _mm256_loadu_ps(aPtr);
++ aPtr += 8;
++ cVal2 = _mm256_dp_ps(aVal2, aVal2, 0xF2);
++
++ aVal3 = _mm256_loadu_ps(aPtr);
++ aPtr += 8;
++ cVal3 = _mm256_dp_ps(aVal3, aVal3, 0xF4);
++
++ aVal4 = _mm256_loadu_ps(aPtr);
++ aPtr += 8;
++ cVal4 = _mm256_dp_ps(aVal4, aVal4, 0xF8);
++
++ cVal1 = _mm256_or_ps(cVal1, cVal2);
++ cVal3 = _mm256_or_ps(cVal3, cVal4);
++ cVal1 = _mm256_or_ps(cVal1, cVal3);
++
++ squareAccumulator =
++ _mm256_add_ps(squareAccumulator, cVal1); // squareAccumulator += x^2
++ }
++ _mm256_storeu_ps(
++ squareBuffer,
++ squareAccumulator); // Store the results back into the C container
++ stdDev = squareBuffer[0];
++ stdDev += squareBuffer[1];
++ stdDev += squareBuffer[2];
++ stdDev += squareBuffer[3];
++ stdDev += squareBuffer[4];
++ stdDev += squareBuffer[5];
++ stdDev += squareBuffer[6];
++ stdDev += squareBuffer[7];
++
++ number = thirtySecondthPoints * 32;
++ for (; number < num_points; number++) {
++ stdDev += (*aPtr) * (*aPtr);
++ aPtr++;
++ }
++ stdDev /= num_points;
++ stdDev -= (mean * mean);
++ stdDev = sqrtf(stdDev);
+ }
+- _mm256_storeu_ps(squareBuffer,squareAccumulator); // Store the results back into the C container
+- stdDev = squareBuffer[0];
+- stdDev += squareBuffer[1];
+- stdDev += squareBuffer[2];
+- stdDev += squareBuffer[3];
+- stdDev += squareBuffer[4];
+- stdDev += squareBuffer[5];
+- stdDev += squareBuffer[6];
+- stdDev += squareBuffer[7];
+-
+- number = thirtySecondthPoints * 32;
+- for(;number < num_points; number++){
+- stdDev += (*aPtr) * (*aPtr);
+- aPtr++;
+- }
+- stdDev /= num_points;
+- stdDev -= (mean * mean);
+- stdDev = sqrtf(stdDev);
+- }
+- *stddev = stdDev;
+-
++ *stddev = stdDev;
+ }
+ #endif /* LV_HAVE_AVX */
+
+diff --git a/kernels/volk/volk_32f_sin_32f.h b/kernels/volk/volk_32f_sin_32f.h
+index 3780086..e65f25a 100644
+--- a/kernels/volk/volk_32f_sin_32f.h
++++ b/kernels/volk/volk_32f_sin_32f.h
+@@ -69,9 +69,9 @@
+ * \endcode
+ */
+
+-#include <stdio.h>
+-#include <math.h>
+ #include <inttypes.h>
++#include <math.h>
++#include <stdio.h>
+
+ #ifndef INCLUDED_volk_32f_sin_32f_a_H
+ #define INCLUDED_volk_32f_sin_32f_a_H
+@@ -83,72 +83,93 @@
+ static inline void
+ volk_32f_sin_32f_a_avx2_fma(float* bVector, const float* aVector, unsigned int num_points)
+ {
+- float* bPtr = bVector;
+- const float* aPtr = aVector;
+-
+- unsigned int number = 0;
+- unsigned int eighthPoints = num_points / 8;
+- unsigned int i = 0;
+-
+- __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, fzeroes;
+- __m256 sine, cosine, condition1, condition2;
+- __m256i q, r, ones, twos, fours;
+-
+- m4pi = _mm256_set1_ps(1.273239545);
+- pio4A = _mm256_set1_ps(0.78515625);
+- pio4B = _mm256_set1_ps(0.241876e-3);
+- ffours = _mm256_set1_ps(4.0);
+- ftwos = _mm256_set1_ps(2.0);
+- fones = _mm256_set1_ps(1.0);
+- fzeroes = _mm256_setzero_ps();
+- ones = _mm256_set1_epi32(1);
+- twos = _mm256_set1_epi32(2);
+- fours = _mm256_set1_epi32(4);
+-
+- cp1 = _mm256_set1_ps(1.0);
+- cp2 = _mm256_set1_ps(0.83333333e-1);
+- cp3 = _mm256_set1_ps(0.2777778e-2);
+- cp4 = _mm256_set1_ps(0.49603e-4);
+- cp5 = _mm256_set1_ps(0.551e-6);
+-
+- for(;number < eighthPoints; number++) {
+- aVal = _mm256_load_ps(aPtr);
+- s = _mm256_sub_ps(aVal, _mm256_and_ps(_mm256_mul_ps(aVal, ftwos), _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
+- q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
+- r = _mm256_add_epi32(q, _mm256_and_si256(q, ones));
+-
+- s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4A, s);
+- s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4B, s);
+-
+- s = _mm256_div_ps(s, _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
+- s = _mm256_mul_ps(s, s);
+- // Evaluate Taylor series
+- s = _mm256_mul_ps(_mm256_fmadd_ps(_mm256_fmsub_ps(_mm256_fmadd_ps(_mm256_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2), s, cp1), s);
+-
+- for(i = 0; i < 3; i++) {
+- s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
++ float* bPtr = bVector;
++ const float* aPtr = aVector;
++
++ unsigned int number = 0;
++ unsigned int eighthPoints = num_points / 8;
++ unsigned int i = 0;
++
++ __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones,
++ fzeroes;
++ __m256 sine, cosine, condition1, condition2;
++ __m256i q, r, ones, twos, fours;
++
++ m4pi = _mm256_set1_ps(1.273239545);
++ pio4A = _mm256_set1_ps(0.78515625);
++ pio4B = _mm256_set1_ps(0.241876e-3);
++ ffours = _mm256_set1_ps(4.0);
++ ftwos = _mm256_set1_ps(2.0);
++ fones = _mm256_set1_ps(1.0);
++ fzeroes = _mm256_setzero_ps();
++ ones = _mm256_set1_epi32(1);
++ twos = _mm256_set1_epi32(2);
++ fours = _mm256_set1_epi32(4);
++
++ cp1 = _mm256_set1_ps(1.0);
++ cp2 = _mm256_set1_ps(0.83333333e-1);
++ cp3 = _mm256_set1_ps(0.2777778e-2);
++ cp4 = _mm256_set1_ps(0.49603e-4);
++ cp5 = _mm256_set1_ps(0.551e-6);
++
++ for (; number < eighthPoints; number++) {
++ aVal = _mm256_load_ps(aPtr);
++ s = _mm256_sub_ps(aVal,
++ _mm256_and_ps(_mm256_mul_ps(aVal, ftwos),
++ _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
++ q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
++ r = _mm256_add_epi32(q, _mm256_and_si256(q, ones));
++
++ s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4A, s);
++ s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4B, s);
++
++ s = _mm256_div_ps(
++ s,
++ _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
++ s = _mm256_mul_ps(s, s);
++ // Evaluate Taylor series
++ s = _mm256_mul_ps(
++ _mm256_fmadd_ps(
++ _mm256_fmsub_ps(
++ _mm256_fmadd_ps(_mm256_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2),
++ s,
++ cp1),
++ s);
++
++ for (i = 0; i < 3; i++) {
++ s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
++ }
++ s = _mm256_div_ps(s, ftwos);
++
++ sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
++ cosine = _mm256_sub_ps(fones, s);
++
++ condition1 = _mm256_cmp_ps(
++ _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)),
++ fzeroes,
++ _CMP_NEQ_UQ);
++ condition2 = _mm256_cmp_ps(
++ _mm256_cmp_ps(
++ _mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes, _CMP_NEQ_UQ),
++ _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS),
++ _CMP_NEQ_UQ);
++ // Need this condition only for cos
++ // condition3 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q,
++ // twos), fours)), fzeroes);
++
++ sine =
++ _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(cosine, sine), condition1));
++ sine = _mm256_sub_ps(
++ sine, _mm256_and_ps(_mm256_mul_ps(sine, _mm256_set1_ps(2.0f)), condition2));
++ _mm256_store_ps(bPtr, sine);
++ aPtr += 8;
++ bPtr += 8;
++ }
++
++ number = eighthPoints * 8;
++ for (; number < num_points; number++) {
++ *bPtr++ = sin(*aPtr++);
+ }
+- s = _mm256_div_ps(s, ftwos);
+-
+- sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
+- cosine = _mm256_sub_ps(fones, s);
+-
+- condition1 = _mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)), fzeroes, _CMP_NEQ_UQ);
+- condition2 = _mm256_cmp_ps(_mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes, _CMP_NEQ_UQ), _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS), _CMP_NEQ_UQ);
+- // Need this condition only for cos
+- //condition3 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, twos), fours)), fzeroes);
+-
+- sine = _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(cosine, sine), condition1));
+- sine = _mm256_sub_ps(sine, _mm256_and_ps(_mm256_mul_ps(sine, _mm256_set1_ps(2.0f)), condition2));
+- _mm256_store_ps(bPtr, sine);
+- aPtr += 8;
+- bPtr += 8;
+- }
+-
+- number = eighthPoints * 8;
+- for(;number < num_points; number++) {
+- *bPtr++ = sin(*aPtr++);
+- }
+ }
+
+ #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for aligned */
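The aligned and unaligned AVX2/FMA, AVX2, and SSE4.1 sine kernels in this file share one structure: mirror negative inputs, reduce the argument by multiples of pi/4 using the split constant pio4A + pio4B, scale the remainder down by 8, evaluate a short series for s = 2*(1 - cos), then apply the double-angle step s <- s*(4 - s) three times to undo the scaling before the quadrant fix-up. A simplified scalar illustration of that double-angle trick, using libm's cosf where the kernels use their Taylor polynomial (an assumption-laden sketch, not the kernel itself):

#include <math.h>
#include <stdio.h>

static float sin_sketch(float x)
{
    /* valid here for small non-negative x; the kernels add quadrant logic */
    float xr = x / 8.0f;                /* 2^3: undone by three doublings below */
    float s = 2.0f * (1.0f - cosf(xr)); /* kernels use a short series for this */
    for (int i = 0; i < 3; i++)
        s = s * (4.0f - s); /* 2*(1 - cos t)  ->  2*(1 - cos 2t) */
    s *= 0.5f;              /* s = 1 - cos(x) */
    return sqrtf((2.0f - s) * s); /* (1 + cos)(1 - cos) = sin^2 */
}

int main(void)
{
    /* prints ~0.479426 for both */
    printf("%f vs %f\n", sin_sketch(0.5f), sinf(0.5f));
    return 0;
}

With s = 2*(1 - cos(t)), the identity s*(4 - s) = 2*(1 - cos(2t)) is what lets the kernels recover the full reduced angle after the divide by 2^3.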
+@@ -159,72 +180,100 @@ volk_32f_sin_32f_a_avx2_fma(float* bVector, const float* aVector, unsigned int n
+ static inline void
+ volk_32f_sin_32f_a_avx2(float* bVector, const float* aVector, unsigned int num_points)
+ {
+- float* bPtr = bVector;
+- const float* aPtr = aVector;
+-
+- unsigned int number = 0;
+- unsigned int eighthPoints = num_points / 8;
+- unsigned int i = 0;
+-
+- __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, fzeroes;
+- __m256 sine, cosine, condition1, condition2;
+- __m256i q, r, ones, twos, fours;
+-
+- m4pi = _mm256_set1_ps(1.273239545);
+- pio4A = _mm256_set1_ps(0.78515625);
+- pio4B = _mm256_set1_ps(0.241876e-3);
+- ffours = _mm256_set1_ps(4.0);
+- ftwos = _mm256_set1_ps(2.0);
+- fones = _mm256_set1_ps(1.0);
+- fzeroes = _mm256_setzero_ps();
+- ones = _mm256_set1_epi32(1);
+- twos = _mm256_set1_epi32(2);
+- fours = _mm256_set1_epi32(4);
+-
+- cp1 = _mm256_set1_ps(1.0);
+- cp2 = _mm256_set1_ps(0.83333333e-1);
+- cp3 = _mm256_set1_ps(0.2777778e-2);
+- cp4 = _mm256_set1_ps(0.49603e-4);
+- cp5 = _mm256_set1_ps(0.551e-6);
+-
+- for(;number < eighthPoints; number++) {
+- aVal = _mm256_load_ps(aPtr);
+- s = _mm256_sub_ps(aVal, _mm256_and_ps(_mm256_mul_ps(aVal, ftwos), _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
+- q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
+- r = _mm256_add_epi32(q, _mm256_and_si256(q, ones));
+-
+- s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4A));
+- s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4B));
+-
+- s = _mm256_div_ps(s, _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
+- s = _mm256_mul_ps(s, s);
+- // Evaluate Taylor series
+- s = _mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(s, cp5), cp4), s), cp3), s), cp2), s), cp1), s);
+-
+- for(i = 0; i < 3; i++) {
+- s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
++ float* bPtr = bVector;
++ const float* aPtr = aVector;
++
++ unsigned int number = 0;
++ unsigned int eighthPoints = num_points / 8;
++ unsigned int i = 0;
++
++ __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones,
++ fzeroes;
++ __m256 sine, cosine, condition1, condition2;
++ __m256i q, r, ones, twos, fours;
++
++ m4pi = _mm256_set1_ps(1.273239545);
++ pio4A = _mm256_set1_ps(0.78515625);
++ pio4B = _mm256_set1_ps(0.241876e-3);
++ ffours = _mm256_set1_ps(4.0);
++ ftwos = _mm256_set1_ps(2.0);
++ fones = _mm256_set1_ps(1.0);
++ fzeroes = _mm256_setzero_ps();
++ ones = _mm256_set1_epi32(1);
++ twos = _mm256_set1_epi32(2);
++ fours = _mm256_set1_epi32(4);
++
++ cp1 = _mm256_set1_ps(1.0);
++ cp2 = _mm256_set1_ps(0.83333333e-1);
++ cp3 = _mm256_set1_ps(0.2777778e-2);
++ cp4 = _mm256_set1_ps(0.49603e-4);
++ cp5 = _mm256_set1_ps(0.551e-6);
++
++ for (; number < eighthPoints; number++) {
++ aVal = _mm256_load_ps(aPtr);
++ s = _mm256_sub_ps(aVal,
++ _mm256_and_ps(_mm256_mul_ps(aVal, ftwos),
++ _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
++ q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
++ r = _mm256_add_epi32(q, _mm256_and_si256(q, ones));
++
++ s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4A));
++ s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4B));
++
++ s = _mm256_div_ps(
++ s,
++ _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
++ s = _mm256_mul_ps(s, s);
++ // Evaluate Taylor series
++ s = _mm256_mul_ps(
++ _mm256_add_ps(
++ _mm256_mul_ps(
++ _mm256_sub_ps(
++ _mm256_mul_ps(
++ _mm256_add_ps(
++ _mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(s, cp5), cp4),
++ s),
++ cp3),
++ s),
++ cp2),
++ s),
++ cp1),
++ s);
++
++ for (i = 0; i < 3; i++) {
++ s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
++ }
++ s = _mm256_div_ps(s, ftwos);
++
++ sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
++ cosine = _mm256_sub_ps(fones, s);
++
++ condition1 = _mm256_cmp_ps(
++ _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)),
++ fzeroes,
++ _CMP_NEQ_UQ);
++ condition2 = _mm256_cmp_ps(
++ _mm256_cmp_ps(
++ _mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes, _CMP_NEQ_UQ),
++ _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS),
++ _CMP_NEQ_UQ);
++ // Need this condition only for cos
++ // condition3 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q,
++ // twos), fours)), fzeroes);
++
++ sine =
++ _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(cosine, sine), condition1));
++ sine = _mm256_sub_ps(
++ sine, _mm256_and_ps(_mm256_mul_ps(sine, _mm256_set1_ps(2.0f)), condition2));
++ _mm256_store_ps(bPtr, sine);
++ aPtr += 8;
++ bPtr += 8;
++ }
++
++ number = eighthPoints * 8;
++ for (; number < num_points; number++) {
++ *bPtr++ = sin(*aPtr++);
+ }
+- s = _mm256_div_ps(s, ftwos);
+-
+- sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
+- cosine = _mm256_sub_ps(fones, s);
+-
+- condition1 = _mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)), fzeroes, _CMP_NEQ_UQ);
+- condition2 = _mm256_cmp_ps(_mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes, _CMP_NEQ_UQ), _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS), _CMP_NEQ_UQ);
+- // Need this condition only for cos
+- //condition3 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, twos), fours)), fzeroes);
+-
+- sine = _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(cosine, sine), condition1));
+- sine = _mm256_sub_ps(sine, _mm256_and_ps(_mm256_mul_ps(sine, _mm256_set1_ps(2.0f)), condition2));
+- _mm256_store_ps(bPtr, sine);
+- aPtr += 8;
+- bPtr += 8;
+- }
+-
+- number = eighthPoints * 8;
+- for(;number < num_points; number++) {
+- *bPtr++ = sin(*aPtr++);
+- }
+ }
+
+ #endif /* LV_HAVE_AVX2 for aligned */
+@@ -235,72 +284,91 @@ volk_32f_sin_32f_a_avx2(float* bVector, const float* aVector, unsigned int num_p
+ static inline void
+ volk_32f_sin_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
+ {
+- float* bPtr = bVector;
+- const float* aPtr = aVector;
+-
+- unsigned int number = 0;
+- unsigned int quarterPoints = num_points / 4;
+- unsigned int i = 0;
+-
+- __m128 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, fzeroes;
+- __m128 sine, cosine, condition1, condition2;
+- __m128i q, r, ones, twos, fours;
+-
+- m4pi = _mm_set1_ps(1.273239545);
+- pio4A = _mm_set1_ps(0.78515625);
+- pio4B = _mm_set1_ps(0.241876e-3);
+- ffours = _mm_set1_ps(4.0);
+- ftwos = _mm_set1_ps(2.0);
+- fones = _mm_set1_ps(1.0);
+- fzeroes = _mm_setzero_ps();
+- ones = _mm_set1_epi32(1);
+- twos = _mm_set1_epi32(2);
+- fours = _mm_set1_epi32(4);
+-
+- cp1 = _mm_set1_ps(1.0);
+- cp2 = _mm_set1_ps(0.83333333e-1);
+- cp3 = _mm_set1_ps(0.2777778e-2);
+- cp4 = _mm_set1_ps(0.49603e-4);
+- cp5 = _mm_set1_ps(0.551e-6);
+-
+- for(;number < quarterPoints; number++) {
+- aVal = _mm_load_ps(aPtr);
+- s = _mm_sub_ps(aVal, _mm_and_ps(_mm_mul_ps(aVal, ftwos), _mm_cmplt_ps(aVal, fzeroes)));
+- q = _mm_cvtps_epi32(_mm_floor_ps(_mm_mul_ps(s, m4pi)));
+- r = _mm_add_epi32(q, _mm_and_si128(q, ones));
+-
+- s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4A));
+- s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4B));
+-
+- s = _mm_div_ps(s, _mm_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
+- s = _mm_mul_ps(s, s);
+- // Evaluate Taylor series
+- s = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s, cp5), cp4), s), cp3), s), cp2), s), cp1), s);
+-
+- for(i = 0; i < 3; i++) {
+- s = _mm_mul_ps(s, _mm_sub_ps(ffours, s));
++ float* bPtr = bVector;
++ const float* aPtr = aVector;
++
++ unsigned int number = 0;
++ unsigned int quarterPoints = num_points / 4;
++ unsigned int i = 0;
++
++ __m128 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones,
++ fzeroes;
++ __m128 sine, cosine, condition1, condition2;
++ __m128i q, r, ones, twos, fours;
++
++ m4pi = _mm_set1_ps(1.273239545);
++ pio4A = _mm_set1_ps(0.78515625);
++ pio4B = _mm_set1_ps(0.241876e-3);
++ ffours = _mm_set1_ps(4.0);
++ ftwos = _mm_set1_ps(2.0);
++ fones = _mm_set1_ps(1.0);
++ fzeroes = _mm_setzero_ps();
++ ones = _mm_set1_epi32(1);
++ twos = _mm_set1_epi32(2);
++ fours = _mm_set1_epi32(4);
++
++ cp1 = _mm_set1_ps(1.0);
++ cp2 = _mm_set1_ps(0.83333333e-1);
++ cp3 = _mm_set1_ps(0.2777778e-2);
++ cp4 = _mm_set1_ps(0.49603e-4);
++ cp5 = _mm_set1_ps(0.551e-6);
++
++ for (; number < quarterPoints; number++) {
++ aVal = _mm_load_ps(aPtr);
++ s = _mm_sub_ps(aVal,
++ _mm_and_ps(_mm_mul_ps(aVal, ftwos), _mm_cmplt_ps(aVal, fzeroes)));
++ q = _mm_cvtps_epi32(_mm_floor_ps(_mm_mul_ps(s, m4pi)));
++ r = _mm_add_epi32(q, _mm_and_si128(q, ones));
++
++ s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4A));
++ s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4B));
++
++ s = _mm_div_ps(
++ s, _mm_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
++ s = _mm_mul_ps(s, s);
++ // Evaluate Taylor series
++ s = _mm_mul_ps(
++ _mm_add_ps(
++ _mm_mul_ps(
++ _mm_sub_ps(
++ _mm_mul_ps(
++ _mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s, cp5), cp4), s),
++ cp3),
++ s),
++ cp2),
++ s),
++ cp1),
++ s);
++
++ for (i = 0; i < 3; i++) {
++ s = _mm_mul_ps(s, _mm_sub_ps(ffours, s));
++ }
++ s = _mm_div_ps(s, ftwos);
++
++ sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s));
++ cosine = _mm_sub_ps(fones, s);
++
++ condition1 = _mm_cmpneq_ps(
++ _mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, ones), twos)), fzeroes);
++ condition2 = _mm_cmpneq_ps(
++ _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(q, fours)), fzeroes),
++ _mm_cmplt_ps(aVal, fzeroes));
++ // Need this condition only for cos
++ // condition3 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q,
++ // twos), fours)), fzeroes);
++
++ sine = _mm_add_ps(sine, _mm_and_ps(_mm_sub_ps(cosine, sine), condition1));
++ sine =
++ _mm_sub_ps(sine, _mm_and_ps(_mm_mul_ps(sine, _mm_set1_ps(2.0f)), condition2));
++ _mm_store_ps(bPtr, sine);
++ aPtr += 4;
++ bPtr += 4;
++ }
++
++ number = quarterPoints * 4;
++ for (; number < num_points; number++) {
++ *bPtr++ = sinf(*aPtr++);
+ }
+- s = _mm_div_ps(s, ftwos);
+-
+- sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s));
+- cosine = _mm_sub_ps(fones, s);
+-
+- condition1 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, ones), twos)), fzeroes);
+- condition2 = _mm_cmpneq_ps(_mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(q, fours)), fzeroes), _mm_cmplt_ps(aVal, fzeroes));
+- // Need this condition only for cos
+- //condition3 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, twos), fours)), fzeroes);
+-
+- sine = _mm_add_ps(sine, _mm_and_ps(_mm_sub_ps(cosine, sine), condition1));
+- sine = _mm_sub_ps(sine, _mm_and_ps(_mm_mul_ps(sine, _mm_set1_ps(2.0f)), condition2));
+- _mm_store_ps(bPtr, sine);
+- aPtr += 4;
+- bPtr += 4;
+- }
+-
+- number = quarterPoints * 4;
+- for(;number < num_points; number++) {
+- *bPtr++ = sinf(*aPtr++);
+- }
+ }
+
+ #endif /* LV_HAVE_SSE4_1 for aligned */
+@@ -317,72 +385,93 @@ volk_32f_sin_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int num
+ static inline void
+ volk_32f_sin_32f_u_avx2_fma(float* bVector, const float* aVector, unsigned int num_points)
+ {
+- float* bPtr = bVector;
+- const float* aPtr = aVector;
+-
+- unsigned int number = 0;
+- unsigned int eighthPoints = num_points / 8;
+- unsigned int i = 0;
+-
+- __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, fzeroes;
+- __m256 sine, cosine, condition1, condition2;
+- __m256i q, r, ones, twos, fours;
+-
+- m4pi = _mm256_set1_ps(1.273239545);
+- pio4A = _mm256_set1_ps(0.78515625);
+- pio4B = _mm256_set1_ps(0.241876e-3);
+- ffours = _mm256_set1_ps(4.0);
+- ftwos = _mm256_set1_ps(2.0);
+- fones = _mm256_set1_ps(1.0);
+- fzeroes = _mm256_setzero_ps();
+- ones = _mm256_set1_epi32(1);
+- twos = _mm256_set1_epi32(2);
+- fours = _mm256_set1_epi32(4);
+-
+- cp1 = _mm256_set1_ps(1.0);
+- cp2 = _mm256_set1_ps(0.83333333e-1);
+- cp3 = _mm256_set1_ps(0.2777778e-2);
+- cp4 = _mm256_set1_ps(0.49603e-4);
+- cp5 = _mm256_set1_ps(0.551e-6);
+-
+- for(;number < eighthPoints; number++) {
+- aVal = _mm256_loadu_ps(aPtr);
+- s = _mm256_sub_ps(aVal, _mm256_and_ps(_mm256_mul_ps(aVal, ftwos), _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
+- q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
+- r = _mm256_add_epi32(q, _mm256_and_si256(q, ones));
+-
+- s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4A, s);
+- s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4B, s);
+-
+- s = _mm256_div_ps(s, _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
+- s = _mm256_mul_ps(s, s);
+- // Evaluate Taylor series
+- s = _mm256_mul_ps(_mm256_fmadd_ps(_mm256_fmsub_ps(_mm256_fmadd_ps(_mm256_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2), s, cp1), s);
+-
+- for(i = 0; i < 3; i++) {
+- s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
++ float* bPtr = bVector;
++ const float* aPtr = aVector;
++
++ unsigned int number = 0;
++ unsigned int eighthPoints = num_points / 8;
++ unsigned int i = 0;
++
++ __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones,
++ fzeroes;
++ __m256 sine, cosine, condition1, condition2;
++ __m256i q, r, ones, twos, fours;
++
++ m4pi = _mm256_set1_ps(1.273239545);
++ pio4A = _mm256_set1_ps(0.78515625);
++ pio4B = _mm256_set1_ps(0.241876e-3);
++ ffours = _mm256_set1_ps(4.0);
++ ftwos = _mm256_set1_ps(2.0);
++ fones = _mm256_set1_ps(1.0);
++ fzeroes = _mm256_setzero_ps();
++ ones = _mm256_set1_epi32(1);
++ twos = _mm256_set1_epi32(2);
++ fours = _mm256_set1_epi32(4);
++
++ cp1 = _mm256_set1_ps(1.0);
++ cp2 = _mm256_set1_ps(0.83333333e-1);
++ cp3 = _mm256_set1_ps(0.2777778e-2);
++ cp4 = _mm256_set1_ps(0.49603e-4);
++ cp5 = _mm256_set1_ps(0.551e-6);
++
++ for (; number < eighthPoints; number++) {
++ aVal = _mm256_loadu_ps(aPtr);
++ s = _mm256_sub_ps(aVal,
++ _mm256_and_ps(_mm256_mul_ps(aVal, ftwos),
++ _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
++ q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
++ r = _mm256_add_epi32(q, _mm256_and_si256(q, ones));
++
++ s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4A, s);
++ s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4B, s);
++
++ s = _mm256_div_ps(
++ s,
++ _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
++ s = _mm256_mul_ps(s, s);
++ // Evaluate Taylor series
++ s = _mm256_mul_ps(
++ _mm256_fmadd_ps(
++ _mm256_fmsub_ps(
++ _mm256_fmadd_ps(_mm256_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2),
++ s,
++ cp1),
++ s);
++
++ for (i = 0; i < 3; i++) {
++ s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
++ }
++ s = _mm256_div_ps(s, ftwos);
++
++ sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
++ cosine = _mm256_sub_ps(fones, s);
++
++ condition1 = _mm256_cmp_ps(
++ _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)),
++ fzeroes,
++ _CMP_NEQ_UQ);
++ condition2 = _mm256_cmp_ps(
++ _mm256_cmp_ps(
++ _mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes, _CMP_NEQ_UQ),
++ _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS),
++ _CMP_NEQ_UQ);
++ // Need this condition only for cos
++ // condition3 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q,
++ // twos), fours)), fzeroes);
++
++ sine =
++ _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(cosine, sine), condition1));
++ sine = _mm256_sub_ps(
++ sine, _mm256_and_ps(_mm256_mul_ps(sine, _mm256_set1_ps(2.0f)), condition2));
++ _mm256_storeu_ps(bPtr, sine);
++ aPtr += 8;
++ bPtr += 8;
++ }
++
++ number = eighthPoints * 8;
++ for (; number < num_points; number++) {
++ *bPtr++ = sin(*aPtr++);
+ }
+- s = _mm256_div_ps(s, ftwos);
+-
+- sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
+- cosine = _mm256_sub_ps(fones, s);
+-
+- condition1 = _mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)), fzeroes, _CMP_NEQ_UQ);
+- condition2 = _mm256_cmp_ps(_mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes, _CMP_NEQ_UQ), _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS), _CMP_NEQ_UQ);
+- // Need this condition only for cos
+- //condition3 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, twos), fours)), fzeroes);
+-
+- sine = _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(cosine, sine), condition1));
+- sine = _mm256_sub_ps(sine, _mm256_and_ps(_mm256_mul_ps(sine, _mm256_set1_ps(2.0f)), condition2));
+- _mm256_storeu_ps(bPtr, sine);
+- aPtr += 8;
+- bPtr += 8;
+- }
+-
+- number = eighthPoints * 8;
+- for(;number < num_points; number++) {
+- *bPtr++ = sin(*aPtr++);
+- }
+ }
+
+ #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for unaligned */
+@@ -393,72 +482,100 @@ volk_32f_sin_32f_u_avx2_fma(float* bVector, const float* aVector, unsigned int n
+ static inline void
+ volk_32f_sin_32f_u_avx2(float* bVector, const float* aVector, unsigned int num_points)
+ {
+- float* bPtr = bVector;
+- const float* aPtr = aVector;
+-
+- unsigned int number = 0;
+- unsigned int eighthPoints = num_points / 8;
+- unsigned int i = 0;
+-
+- __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, fzeroes;
+- __m256 sine, cosine, condition1, condition2;
+- __m256i q, r, ones, twos, fours;
+-
+- m4pi = _mm256_set1_ps(1.273239545);
+- pio4A = _mm256_set1_ps(0.78515625);
+- pio4B = _mm256_set1_ps(0.241876e-3);
+- ffours = _mm256_set1_ps(4.0);
+- ftwos = _mm256_set1_ps(2.0);
+- fones = _mm256_set1_ps(1.0);
+- fzeroes = _mm256_setzero_ps();
+- ones = _mm256_set1_epi32(1);
+- twos = _mm256_set1_epi32(2);
+- fours = _mm256_set1_epi32(4);
+-
+- cp1 = _mm256_set1_ps(1.0);
+- cp2 = _mm256_set1_ps(0.83333333e-1);
+- cp3 = _mm256_set1_ps(0.2777778e-2);
+- cp4 = _mm256_set1_ps(0.49603e-4);
+- cp5 = _mm256_set1_ps(0.551e-6);
+-
+- for(;number < eighthPoints; number++) {
+- aVal = _mm256_loadu_ps(aPtr);
+- s = _mm256_sub_ps(aVal, _mm256_and_ps(_mm256_mul_ps(aVal, ftwos), _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
+- q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
+- r = _mm256_add_epi32(q, _mm256_and_si256(q, ones));
+-
+- s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4A));
+- s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4B));
+-
+- s = _mm256_div_ps(s, _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
+- s = _mm256_mul_ps(s, s);
+- // Evaluate Taylor series
+- s = _mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(s, cp5), cp4), s), cp3), s), cp2), s), cp1), s);
+-
+- for(i = 0; i < 3; i++) {
+- s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
++ float* bPtr = bVector;
++ const float* aPtr = aVector;
++
++ unsigned int number = 0;
++ unsigned int eighthPoints = num_points / 8;
++ unsigned int i = 0;
++
++ __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones,
++ fzeroes;
++ __m256 sine, cosine, condition1, condition2;
++ __m256i q, r, ones, twos, fours;
++
++ m4pi = _mm256_set1_ps(1.273239545);
++ pio4A = _mm256_set1_ps(0.78515625);
++ pio4B = _mm256_set1_ps(0.241876e-3);
++ ffours = _mm256_set1_ps(4.0);
++ ftwos = _mm256_set1_ps(2.0);
++ fones = _mm256_set1_ps(1.0);
++ fzeroes = _mm256_setzero_ps();
++ ones = _mm256_set1_epi32(1);
++ twos = _mm256_set1_epi32(2);
++ fours = _mm256_set1_epi32(4);
++
++ cp1 = _mm256_set1_ps(1.0);
++ cp2 = _mm256_set1_ps(0.83333333e-1);
++ cp3 = _mm256_set1_ps(0.2777778e-2);
++ cp4 = _mm256_set1_ps(0.49603e-4);
++ cp5 = _mm256_set1_ps(0.551e-6);
++
++ for (; number < eighthPoints; number++) {
++ aVal = _mm256_loadu_ps(aPtr);
++ s = _mm256_sub_ps(aVal,
++ _mm256_and_ps(_mm256_mul_ps(aVal, ftwos),
++ _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
++ q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
++ r = _mm256_add_epi32(q, _mm256_and_si256(q, ones));
++
++ s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4A));
++ s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4B));
++
++ s = _mm256_div_ps(
++ s,
++ _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
++ s = _mm256_mul_ps(s, s);
++ // Evaluate Taylor series
++ s = _mm256_mul_ps(
++ _mm256_add_ps(
++ _mm256_mul_ps(
++ _mm256_sub_ps(
++ _mm256_mul_ps(
++ _mm256_add_ps(
++ _mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(s, cp5), cp4),
++ s),
++ cp3),
++ s),
++ cp2),
++ s),
++ cp1),
++ s);
++
++ for (i = 0; i < 3; i++) {
++ s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
++ }
++ s = _mm256_div_ps(s, ftwos);
++
++ sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
++ cosine = _mm256_sub_ps(fones, s);
++
++ condition1 = _mm256_cmp_ps(
++ _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)),
++ fzeroes,
++ _CMP_NEQ_UQ);
++ condition2 = _mm256_cmp_ps(
++ _mm256_cmp_ps(
++ _mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes, _CMP_NEQ_UQ),
++ _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS),
++ _CMP_NEQ_UQ);
++ // Need this condition only for cos
++ // condition3 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q,
++ // twos), fours)), fzeroes);
++
++ sine =
++ _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(cosine, sine), condition1));
++ sine = _mm256_sub_ps(
++ sine, _mm256_and_ps(_mm256_mul_ps(sine, _mm256_set1_ps(2.0f)), condition2));
++ _mm256_storeu_ps(bPtr, sine);
++ aPtr += 8;
++ bPtr += 8;
++ }
++
++ number = eighthPoints * 8;
++ for (; number < num_points; number++) {
++ *bPtr++ = sin(*aPtr++);
+ }
+- s = _mm256_div_ps(s, ftwos);
+-
+- sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
+- cosine = _mm256_sub_ps(fones, s);
+-
+- condition1 = _mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)), fzeroes, _CMP_NEQ_UQ);
+- condition2 = _mm256_cmp_ps(_mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes, _CMP_NEQ_UQ), _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS), _CMP_NEQ_UQ);
+- // Need this condition only for cos
+- //condition3 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, twos), fours)), fzeroes);
+-
+- sine = _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(cosine, sine), condition1));
+- sine = _mm256_sub_ps(sine, _mm256_and_ps(_mm256_mul_ps(sine, _mm256_set1_ps(2.0f)), condition2));
+- _mm256_storeu_ps(bPtr, sine);
+- aPtr += 8;
+- bPtr += 8;
+- }
+-
+- number = eighthPoints * 8;
+- for(;number < num_points; number++) {
+- *bPtr++ = sin(*aPtr++);
+- }
+ }
+
+ #endif /* LV_HAVE_AVX2 for unaligned */
+@@ -470,70 +587,88 @@ volk_32f_sin_32f_u_avx2(float* bVector, const float* aVector, unsigned int num_p
+ static inline void
+ volk_32f_sin_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
+ {
+- float* bPtr = bVector;
+- const float* aPtr = aVector;
+-
+- unsigned int number = 0;
+- unsigned int quarterPoints = num_points / 4;
+- unsigned int i = 0;
+-
+- __m128 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, fzeroes;
+- __m128 sine, cosine, condition1, condition2;
+- __m128i q, r, ones, twos, fours;
+-
+- m4pi = _mm_set1_ps(1.273239545);
+- pio4A = _mm_set1_ps(0.78515625);
+- pio4B = _mm_set1_ps(0.241876e-3);
+- ffours = _mm_set1_ps(4.0);
+- ftwos = _mm_set1_ps(2.0);
+- fones = _mm_set1_ps(1.0);
+- fzeroes = _mm_setzero_ps();
+- ones = _mm_set1_epi32(1);
+- twos = _mm_set1_epi32(2);
+- fours = _mm_set1_epi32(4);
+-
+- cp1 = _mm_set1_ps(1.0);
+- cp2 = _mm_set1_ps(0.83333333e-1);
+- cp3 = _mm_set1_ps(0.2777778e-2);
+- cp4 = _mm_set1_ps(0.49603e-4);
+- cp5 = _mm_set1_ps(0.551e-6);
+-
+- for(;number < quarterPoints; number++) {
+- aVal = _mm_loadu_ps(aPtr);
+- s = _mm_sub_ps(aVal, _mm_and_ps(_mm_mul_ps(aVal, ftwos), _mm_cmplt_ps(aVal, fzeroes)));
+- q = _mm_cvtps_epi32(_mm_floor_ps(_mm_mul_ps(s, m4pi)));
+- r = _mm_add_epi32(q, _mm_and_si128(q, ones));
+-
+- s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4A));
+- s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4B));
+-
+- s = _mm_div_ps(s, _mm_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
+- s = _mm_mul_ps(s, s);
+- // Evaluate Taylor series
+- s = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s, cp5), cp4), s), cp3), s), cp2), s), cp1), s);
+-
+- for(i = 0; i < 3; i++) {
+- s = _mm_mul_ps(s, _mm_sub_ps(ffours, s));
+- }
+- s = _mm_div_ps(s, ftwos);
+-
+- sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s));
+- cosine = _mm_sub_ps(fones, s);
+-
+- condition1 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, ones), twos)), fzeroes);
+- condition2 = _mm_cmpneq_ps(_mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(q, fours)), fzeroes), _mm_cmplt_ps(aVal, fzeroes));
++ float* bPtr = bVector;
++ const float* aPtr = aVector;
+
+- sine = _mm_add_ps(sine, _mm_and_ps(_mm_sub_ps(cosine, sine), condition1));
+- sine = _mm_sub_ps(sine, _mm_and_ps(_mm_mul_ps(sine, _mm_set1_ps(2.0f)), condition2));
+- _mm_storeu_ps(bPtr, sine);
+- aPtr += 4;
+- bPtr += 4;
+- }
++ unsigned int number = 0;
++ unsigned int quarterPoints = num_points / 4;
++ unsigned int i = 0;
++
++ __m128 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones,
++ fzeroes;
++ __m128 sine, cosine, condition1, condition2;
++ __m128i q, r, ones, twos, fours;
++
++ m4pi = _mm_set1_ps(1.273239545);
++ pio4A = _mm_set1_ps(0.78515625);
++ pio4B = _mm_set1_ps(0.241876e-3);
++ ffours = _mm_set1_ps(4.0);
++ ftwos = _mm_set1_ps(2.0);
++ fones = _mm_set1_ps(1.0);
++ fzeroes = _mm_setzero_ps();
++ ones = _mm_set1_epi32(1);
++ twos = _mm_set1_epi32(2);
++ fours = _mm_set1_epi32(4);
++
++ cp1 = _mm_set1_ps(1.0);
++ cp2 = _mm_set1_ps(0.83333333e-1);
++ cp3 = _mm_set1_ps(0.2777778e-2);
++ cp4 = _mm_set1_ps(0.49603e-4);
++ cp5 = _mm_set1_ps(0.551e-6);
++
++ for (; number < quarterPoints; number++) {
++ aVal = _mm_loadu_ps(aPtr);
++ s = _mm_sub_ps(aVal,
++ _mm_and_ps(_mm_mul_ps(aVal, ftwos), _mm_cmplt_ps(aVal, fzeroes)));
++ q = _mm_cvtps_epi32(_mm_floor_ps(_mm_mul_ps(s, m4pi)));
++ r = _mm_add_epi32(q, _mm_and_si128(q, ones));
++
++ s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4A));
++ s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4B));
++
++ s = _mm_div_ps(
++ s, _mm_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
++ s = _mm_mul_ps(s, s);
++ // Evaluate Taylor series
++ s = _mm_mul_ps(
++ _mm_add_ps(
++ _mm_mul_ps(
++ _mm_sub_ps(
++ _mm_mul_ps(
++ _mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s, cp5), cp4), s),
++ cp3),
++ s),
++ cp2),
++ s),
++ cp1),
++ s);
++
++ for (i = 0; i < 3; i++) {
++ s = _mm_mul_ps(s, _mm_sub_ps(ffours, s));
++ }
++ s = _mm_div_ps(s, ftwos);
++
++ sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s));
++ cosine = _mm_sub_ps(fones, s);
++
++ condition1 = _mm_cmpneq_ps(
++ _mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, ones), twos)), fzeroes);
++ condition2 = _mm_cmpneq_ps(
++ _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(q, fours)), fzeroes),
++ _mm_cmplt_ps(aVal, fzeroes));
++
++ sine = _mm_add_ps(sine, _mm_and_ps(_mm_sub_ps(cosine, sine), condition1));
++ sine =
++ _mm_sub_ps(sine, _mm_and_ps(_mm_mul_ps(sine, _mm_set1_ps(2.0f)), condition2));
++ _mm_storeu_ps(bPtr, sine);
++ aPtr += 4;
++ bPtr += 4;
++ }
+
+- number = quarterPoints * 4;
+- for(;number < num_points; number++){
+- *bPtr++ = sinf(*aPtr++);
+- }
++ number = quarterPoints * 4;
++ for (; number < num_points; number++) {
++ *bPtr++ = sinf(*aPtr++);
++ }
+ }
+
+ #endif /* LV_HAVE_SSE4_1 for unaligned */
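For reference, the SSE4.1 and AVX sin kernels in this hunk all follow the same scheme: reduce |x| by an even multiple of pi/4 (split into pio4A + pio4B for extra precision), shrink the remainder by 2^3, evaluate a short polynomial for 2*(1 - cos t), undo the shrink with three applications of the doubling identity s <- s*(4 - s), then recover sine and cosine with one square root and the quadrant tests on q. The scalar sketch below mirrors that flow using the same constants; it is an illustration only (assumes <math.h>) and not part of the patch.

#include <math.h>

static inline float sin_sketch(float x)
{
    float s = fabsf(x);                         /* work on |x|, fix the sign at the end */
    int q = (int)floorf(s * 1.273239545f);      /* how many pi/4 fit into |x| */
    int r = q + (q & 1);                        /* round up to an even multiple of pi/4 */
    s -= r * 0.78515625f;                       /* subtract r*pi/4, split into two parts */
    s -= r * 0.241876e-3f;
    s /= 8.0f;                                  /* 2^3: undone by the three doublings below */
    s *= s;
    /* polynomial in t^2 approximating 2*(1 - cos t) */
    s = ((((s * 0.551e-6f - 0.49603e-4f) * s + 0.2777778e-2f) * s
          - 0.83333333e-1f) * s + 1.0f) * s;
    for (int i = 0; i < 3; i++)
        s = s * (4.0f - s);                     /* 2*(1 - cos t) -> 2*(1 - cos 2t) */
    s /= 2.0f;                                  /* s = 1 - cos(reduced angle) */
    float sine   = sqrtf((2.0f - s) * s);       /* sqrt(1 - cos^2) = |sin| */
    float cosine = 1.0f - s;
    if ((q + 1) & 2)                            /* condition1: take cosine instead */
        sine = cosine;
    if (((q & 4) != 0) != (x < 0.0f))           /* condition2: flip the sign */
        sine = -sine;
    return sine;
}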
+@@ -544,14 +679,13 @@ volk_32f_sin_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int num
+ static inline void
+ volk_32f_sin_32f_generic(float* bVector, const float* aVector, unsigned int num_points)
+ {
+- float* bPtr = bVector;
+- const float* aPtr = aVector;
+- unsigned int number = 0;
+-
+- for(number = 0; number < num_points; number++) {
+- *bPtr++ = sinf(*aPtr++);
+- }
++ float* bPtr = bVector;
++ const float* aPtr = aVector;
++ unsigned int number = 0;
+
++ for (number = 0; number < num_points; number++) {
++ *bPtr++ = sinf(*aPtr++);
++ }
+ }
+
+ #endif /* LV_HAVE_GENERIC */
+@@ -562,30 +696,29 @@ volk_32f_sin_32f_generic(float* bVector, const float* aVector, unsigned int num_
+ #include <volk/volk_neon_intrinsics.h>
+
+ static inline void
+-volk_32f_sin_32f_neon(float* bVector, const float* aVector,
+- unsigned int num_points)
++volk_32f_sin_32f_neon(float* bVector, const float* aVector, unsigned int num_points)
+ {
+ unsigned int number = 0;
+ unsigned int quarter_points = num_points / 4;
+ float* bVectorPtr = bVector;
+ const float* aVectorPtr = aVector;
+-
++
+ float32x4_t b_vec;
+ float32x4_t a_vec;
+-
+- for(number = 0; number < quarter_points; number++) {
++
++ for (number = 0; number < quarter_points; number++) {
+ a_vec = vld1q_f32(aVectorPtr);
+ // Prefetch next one, speeds things up
+- __VOLK_PREFETCH(aVectorPtr+4);
++ __VOLK_PREFETCH(aVectorPtr + 4);
+ b_vec = _vsinq_f32(a_vec);
+ vst1q_f32(bVectorPtr, b_vec);
+ // move pointers ahead
+- bVectorPtr+=4;
+- aVectorPtr+=4;
++ bVectorPtr += 4;
++ aVectorPtr += 4;
+ }
+-
++
+ // Deal with the rest
+- for(number = quarter_points * 4; number < num_points; number++) {
++ for (number = quarter_points * 4; number < num_points; number++) {
+ *bVectorPtr++ = sinf(*aVectorPtr++);
+ }
+ }
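All of these implementations are selected at run time through the volk_32f_sin_32f dispatcher. A minimal usage sketch follows; example() and the ramp input are illustrative, while volk_get_alignment, volk_malloc, volk_free and the dispatcher itself are the public VOLK API.

#include <volk/volk.h>

static void example(void)
{
    unsigned int N = 1024;
    size_t alignment = volk_get_alignment();
    float* in = (float*)volk_malloc(sizeof(float) * N, alignment);
    float* out = (float*)volk_malloc(sizeof(float) * N, alignment);

    for (unsigned int i = 0; i < N; i++)
        in[i] = 6.283185307f * (float)i / (float)N; /* one full turn */

    volk_32f_sin_32f(out, in, N); /* dispatcher picks the fastest available kernel */

    volk_free(in);
    volk_free(out);
}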
+diff --git a/kernels/volk/volk_32f_sqrt_32f.h b/kernels/volk/volk_32f_sqrt_32f.h
+index 84160af..667d356 100644
+--- a/kernels/volk/volk_32f_sqrt_32f.h
++++ b/kernels/volk/volk_32f_sqrt_32f.h
+@@ -66,8 +66,8 @@
+ #define INCLUDED_volk_32f_sqrt_32f_a_H
+
+ #include <inttypes.h>
+-#include <stdio.h>
+ #include <math.h>
++#include <stdio.h>
+
+ #ifdef LV_HAVE_SSE
+ #include <xmmintrin.h>
+@@ -75,28 +75,28 @@
+ static inline void
+ volk_32f_sqrt_32f_a_sse(float* cVector, const float* aVector, unsigned int num_points)
+ {
+- unsigned int number = 0;
+- const unsigned int quarterPoints = num_points / 4;
++ unsigned int number = 0;
++ const unsigned int quarterPoints = num_points / 4;
+
+- float* cPtr = cVector;
+- const float* aPtr = aVector;
++ float* cPtr = cVector;
++ const float* aPtr = aVector;
+
+- __m128 aVal, cVal;
+- for(;number < quarterPoints; number++) {
+- aVal = _mm_load_ps(aPtr);
++ __m128 aVal, cVal;
++ for (; number < quarterPoints; number++) {
++ aVal = _mm_load_ps(aPtr);
+
+- cVal = _mm_sqrt_ps(aVal);
++ cVal = _mm_sqrt_ps(aVal);
+
+- _mm_store_ps(cPtr,cVal); // Store the results back into the C container
++ _mm_store_ps(cPtr, cVal); // Store the results back into the C container
+
+- aPtr += 4;
+- cPtr += 4;
+- }
++ aPtr += 4;
++ cPtr += 4;
++ }
+
+- number = quarterPoints * 4;
+- for(;number < num_points; number++) {
+- *cPtr++ = sqrtf(*aPtr++);
+- }
++ number = quarterPoints * 4;
++ for (; number < num_points; number++) {
++ *cPtr++ = sqrtf(*aPtr++);
++ }
+ }
+
+ #endif /* LV_HAVE_SSE */
+@@ -107,28 +107,28 @@ volk_32f_sqrt_32f_a_sse(float* cVector, const float* aVector, unsigned int num_p
+ static inline void
+ volk_32f_sqrt_32f_a_avx(float* cVector, const float* aVector, unsigned int num_points)
+ {
+- unsigned int number = 0;
+- const unsigned int eighthPoints = num_points / 8;
++ unsigned int number = 0;
++ const unsigned int eighthPoints = num_points / 8;
+
+- float* cPtr = cVector;
+- const float* aPtr = aVector;
++ float* cPtr = cVector;
++ const float* aPtr = aVector;
+
+- __m256 aVal, cVal;
+- for(;number < eighthPoints; number++) {
+- aVal = _mm256_load_ps(aPtr);
++ __m256 aVal, cVal;
++ for (; number < eighthPoints; number++) {
++ aVal = _mm256_load_ps(aPtr);
+
+- cVal = _mm256_sqrt_ps(aVal);
++ cVal = _mm256_sqrt_ps(aVal);
+
+- _mm256_store_ps(cPtr,cVal); // Store the results back into the C container
++ _mm256_store_ps(cPtr, cVal); // Store the results back into the C container
+
+- aPtr += 8;
+- cPtr += 8;
+- }
++ aPtr += 8;
++ cPtr += 8;
++ }
+
+- number = eighthPoints * 8;
+- for(;number < num_points; number++) {
+- *cPtr++ = sqrtf(*aPtr++);
+- }
++ number = eighthPoints * 8;
++ for (; number < num_points; number++) {
++ *cPtr++ = sqrtf(*aPtr++);
++ }
+ }
+
+ #endif /* LV_HAVE_AVX */
+@@ -140,24 +140,24 @@ volk_32f_sqrt_32f_a_avx(float* cVector, const float* aVector, unsigned int num_p
+ static inline void
+ volk_32f_sqrt_32f_neon(float* cVector, const float* aVector, unsigned int num_points)
+ {
+- float* cPtr = cVector;
+- const float* aPtr = aVector;
+- unsigned int number = 0;
+- unsigned int quarter_points = num_points / 4;
+- float32x4_t in_vec, out_vec;
+-
+- for(number = 0; number < quarter_points; number++) {
+- in_vec = vld1q_f32(aPtr);
+- // note that armv8 has vsqrt_f32 which will be much better
+- out_vec = vrecpeq_f32(vrsqrteq_f32(in_vec) );
+- vst1q_f32(cPtr, out_vec);
+- aPtr += 4;
+- cPtr += 4;
+- }
+-
+- for(number = quarter_points * 4; number < num_points; number++) {
+- *cPtr++ = sqrtf(*aPtr++);
+- }
++ float* cPtr = cVector;
++ const float* aPtr = aVector;
++ unsigned int number = 0;
++ unsigned int quarter_points = num_points / 4;
++ float32x4_t in_vec, out_vec;
++
++ for (number = 0; number < quarter_points; number++) {
++ in_vec = vld1q_f32(aPtr);
++ // note that armv8 has vsqrt_f32 which will be much better
++ out_vec = vrecpeq_f32(vrsqrteq_f32(in_vec));
++ vst1q_f32(cPtr, out_vec);
++ aPtr += 4;
++ cPtr += 4;
++ }
++
++ for (number = quarter_points * 4; number < num_points; number++) {
++ *cPtr++ = sqrtf(*aPtr++);
++ }
+ }
+
+ #endif /* LV_HAVE_NEON */
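As the comment inside the NEON kernel notes, vrecpeq_f32(vrsqrteq_f32(x)) is only a coarse estimate (and ARMv8 provides a true vector square root). Where more accuracy matters on ARMv7, one or two Newton-Raphson steps with vrsqrtsq_f32 tighten the reciprocal-square-root estimate before multiplying back by x. A sketch, not part of this patch; it assumes <arm_neon.h> and does not special-case x == 0.

#include <arm_neon.h>

static inline float32x4_t sqrtq_refined_sketch(float32x4_t x)
{
    float32x4_t y = vrsqrteq_f32(x);                    /* ~1/sqrt(x), coarse estimate */
    y = vmulq_f32(y, vrsqrtsq_f32(vmulq_f32(x, y), y)); /* Newton-Raphson step 1 */
    y = vmulq_f32(y, vrsqrtsq_f32(vmulq_f32(x, y), y)); /* Newton-Raphson step 2 */
    return vmulq_f32(x, y);                             /* sqrt(x) = x * (1/sqrt(x)) */
}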
+@@ -168,13 +168,13 @@ volk_32f_sqrt_32f_neon(float* cVector, const float* aVector, unsigned int num_po
+ static inline void
+ volk_32f_sqrt_32f_generic(float* cVector, const float* aVector, unsigned int num_points)
+ {
+- float* cPtr = cVector;
+- const float* aPtr = aVector;
+- unsigned int number = 0;
++ float* cPtr = cVector;
++ const float* aPtr = aVector;
++ unsigned int number = 0;
+
+- for(number = 0; number < num_points; number++) {
+- *cPtr++ = sqrtf(*aPtr++);
+- }
++ for (number = 0; number < num_points; number++) {
++ *cPtr++ = sqrtf(*aPtr++);
++ }
+ }
+
+ #endif /* LV_HAVE_GENERIC */
+@@ -182,13 +182,12 @@ volk_32f_sqrt_32f_generic(float* cVector, const float* aVector, unsigned int num
+
+ #ifdef LV_HAVE_ORC
+
+-extern void
+-volk_32f_sqrt_32f_a_orc_impl(float *, const float*, unsigned int);
++extern void volk_32f_sqrt_32f_a_orc_impl(float*, const float*, unsigned int);
+
+ static inline void
+ volk_32f_sqrt_32f_u_orc(float* cVector, const float* aVector, unsigned int num_points)
+ {
+- volk_32f_sqrt_32f_a_orc_impl(cVector, aVector, num_points);
++ volk_32f_sqrt_32f_a_orc_impl(cVector, aVector, num_points);
+ }
+
+ #endif /* LV_HAVE_ORC */
+@@ -199,36 +198,36 @@ volk_32f_sqrt_32f_u_orc(float* cVector, const float* aVector, unsigned int num_p
+ #define INCLUDED_volk_32f_sqrt_32f_u_H
+
+ #include <inttypes.h>
+-#include <stdio.h>
+ #include <math.h>
++#include <stdio.h>
+ #ifdef LV_HAVE_AVX
+ #include <immintrin.h>
+
+ static inline void
+ volk_32f_sqrt_32f_u_avx(float* cVector, const float* aVector, unsigned int num_points)
+ {
+- unsigned int number = 0;
+- const unsigned int eighthPoints = num_points / 8;
++ unsigned int number = 0;
++ const unsigned int eighthPoints = num_points / 8;
+
+- float* cPtr = cVector;
+- const float* aPtr = aVector;
++ float* cPtr = cVector;
++ const float* aPtr = aVector;
+
+- __m256 aVal, cVal;
+- for(;number < eighthPoints; number++) {
+- aVal = _mm256_loadu_ps(aPtr);
++ __m256 aVal, cVal;
++ for (; number < eighthPoints; number++) {
++ aVal = _mm256_loadu_ps(aPtr);
+
+- cVal = _mm256_sqrt_ps(aVal);
++ cVal = _mm256_sqrt_ps(aVal);
+
+- _mm256_storeu_ps(cPtr,cVal); // Store the results back into the C container
++ _mm256_storeu_ps(cPtr, cVal); // Store the results back into the C container
+
+- aPtr += 8;
+- cPtr += 8;
+- }
++ aPtr += 8;
++ cPtr += 8;
++ }
+
+- number = eighthPoints * 8;
+- for(;number < num_points; number++) {
+- *cPtr++ = sqrtf(*aPtr++);
+- }
++ number = eighthPoints * 8;
++ for (; number < num_points; number++) {
++ *cPtr++ = sqrtf(*aPtr++);
++ }
+ }
+
+ #endif /* LV_HAVE_AVX */
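The _a_ and _u_ variants above differ only in their load/store intrinsics: _mm256_load_ps/_mm256_store_ps require 32-byte aligned pointers (which volk_malloc with volk_get_alignment guarantees), while the loadu/storeu forms accept any address at some cost. A small sketch of that distinction; the function and buffer names are illustrative.

#include <immintrin.h>
#include <volk/volk.h>

static void alignment_sketch(void)
{
    float* aligned = (float*)volk_malloc(8 * sizeof(float), volk_get_alignment());
    float plain[9] = { 0 };

    for (int i = 0; i < 8; i++)
        aligned[i] = (float)i;

    __m256 v = _mm256_load_ps(aligned);    /* OK: volk_malloc honours the required alignment */
    __m256 w = _mm256_loadu_ps(plain + 1); /* OK: loadu accepts any address */
    _mm256_storeu_ps(plain + 1, _mm256_add_ps(v, w));

    volk_free(aligned);
}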
+diff --git a/kernels/volk/volk_32f_stddev_and_mean_32f_x2.h b/kernels/volk/volk_32f_stddev_and_mean_32f_x2.h
+index 8e996e2..6ad0f17 100644
+--- a/kernels/volk/volk_32f_stddev_and_mean_32f_x2.h
++++ b/kernels/volk/volk_32f_stddev_and_mean_32f_x2.h
+@@ -29,8 +29,8 @@
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+- * void volk_32f_stddev_and_mean_32f_x2(float* stddev, float* mean, const float* inputBuffer, unsigned int num_points)
+- * \endcode
++ * void volk_32f_stddev_and_mean_32f_x2(float* stddev, float* mean, const float*
++ * inputBuffer, unsigned int num_points) \endcode
+ *
+ * \b Inputs
+ * \li inputBuffer: The buffer of points.
+@@ -41,10 +41,8 @@
+ * \li mean: The mean of the input buffer.
+ *
+ * \b Example
+- * Generate random numbers with c++11's normal distribution and estimate the mean and standard deviation
+- * \code
+- * int N = 1000;
+- * unsigned int alignment = volk_get_alignment();
++ * Generate random numbers with c++11's normal distribution and estimate the mean and
++ * standard deviation \code int N = 1000; unsigned int alignment = volk_get_alignment();
+ * float* rand_numbers = (float*)volk_malloc(sizeof(float)*N, alignment);
+ * float* mean = (float*)volk_malloc(sizeof(float), alignment);
+ * float* stddev = (float*)volk_malloc(sizeof(float), alignment);
+@@ -71,88 +69,94 @@
+ #ifndef INCLUDED_volk_32f_stddev_and_mean_32f_x2_a_H
+ #define INCLUDED_volk_32f_stddev_and_mean_32f_x2_a_H
+
+-#include <volk/volk_common.h>
+ #include <inttypes.h>
+-#include <stdio.h>
+ #include <math.h>
++#include <stdio.h>
++#include <volk/volk_common.h>
+
+ #ifdef LV_HAVE_AVX
+ #include <immintrin.h>
+
+-static inline void
+-volk_32f_stddev_and_mean_32f_x2_a_avx(float* stddev, float* mean,
+- const float* inputBuffer,
+- unsigned int num_points)
++static inline void volk_32f_stddev_and_mean_32f_x2_a_avx(float* stddev,
++ float* mean,
++ const float* inputBuffer,
++ unsigned int num_points)
+ {
+- float stdDev = 0;
+- float newMean = 0;
+- if(num_points > 0){
+- unsigned int number = 0;
+- const unsigned int thirtySecondthPoints = num_points / 32;
+-
+- const float* aPtr = inputBuffer;
+- __VOLK_ATTR_ALIGNED(32) float meanBuffer[8];
+- __VOLK_ATTR_ALIGNED(32) float squareBuffer[8];
+-
+- __m256 accumulator = _mm256_setzero_ps();
+- __m256 squareAccumulator = _mm256_setzero_ps();
+- __m256 aVal1, aVal2, aVal3, aVal4;
+- __m256 cVal1, cVal2, cVal3, cVal4;
+- for(;number < thirtySecondthPoints; number++) {
+- aVal1 = _mm256_load_ps(aPtr); aPtr += 8;
+- cVal1 = _mm256_dp_ps(aVal1, aVal1, 0xF1);
+- accumulator = _mm256_add_ps(accumulator, aVal1); // accumulator += x
+-
+- aVal2 = _mm256_load_ps(aPtr); aPtr += 8;
+- cVal2 = _mm256_dp_ps(aVal2, aVal2, 0xF2);
+- accumulator = _mm256_add_ps(accumulator, aVal2); // accumulator += x
+-
+- aVal3 = _mm256_load_ps(aPtr); aPtr += 8;
+- cVal3 = _mm256_dp_ps(aVal3, aVal3, 0xF4);
+- accumulator = _mm256_add_ps(accumulator, aVal3); // accumulator += x
+-
+- aVal4 = _mm256_load_ps(aPtr); aPtr += 8;
+- cVal4 = _mm256_dp_ps(aVal4, aVal4, 0xF8);
+- accumulator = _mm256_add_ps(accumulator, aVal4); // accumulator += x
+-
+- cVal1 = _mm256_or_ps(cVal1, cVal2);
+- cVal3 = _mm256_or_ps(cVal3, cVal4);
+- cVal1 = _mm256_or_ps(cVal1, cVal3);
+-
+- squareAccumulator = _mm256_add_ps(squareAccumulator, cVal1); // squareAccumulator += x^2
+- }
+- _mm256_store_ps(meanBuffer,accumulator); // Store the results back into the C container
+- _mm256_store_ps(squareBuffer,squareAccumulator); // Store the results back into the C container
+- newMean = meanBuffer[0];
+- newMean += meanBuffer[1];
+- newMean += meanBuffer[2];
+- newMean += meanBuffer[3];
+- newMean += meanBuffer[4];
+- newMean += meanBuffer[5];
+- newMean += meanBuffer[6];
+- newMean += meanBuffer[7];
+- stdDev = squareBuffer[0];
+- stdDev += squareBuffer[1];
+- stdDev += squareBuffer[2];
+- stdDev += squareBuffer[3];
+- stdDev += squareBuffer[4];
+- stdDev += squareBuffer[5];
+- stdDev += squareBuffer[6];
+- stdDev += squareBuffer[7];
+-
+- number = thirtySecondthPoints * 32;
+- for(;number < num_points; number++){
+- stdDev += (*aPtr) * (*aPtr);
+- newMean += *aPtr++;
++ float stdDev = 0;
++ float newMean = 0;
++ if (num_points > 0) {
++ unsigned int number = 0;
++ const unsigned int thirtySecondthPoints = num_points / 32;
++
++ const float* aPtr = inputBuffer;
++ __VOLK_ATTR_ALIGNED(32) float meanBuffer[8];
++ __VOLK_ATTR_ALIGNED(32) float squareBuffer[8];
++
++ __m256 accumulator = _mm256_setzero_ps();
++ __m256 squareAccumulator = _mm256_setzero_ps();
++ __m256 aVal1, aVal2, aVal3, aVal4;
++ __m256 cVal1, cVal2, cVal3, cVal4;
++ for (; number < thirtySecondthPoints; number++) {
++ aVal1 = _mm256_load_ps(aPtr);
++ aPtr += 8;
++ cVal1 = _mm256_dp_ps(aVal1, aVal1, 0xF1);
++ accumulator = _mm256_add_ps(accumulator, aVal1); // accumulator += x
++
++ aVal2 = _mm256_load_ps(aPtr);
++ aPtr += 8;
++ cVal2 = _mm256_dp_ps(aVal2, aVal2, 0xF2);
++ accumulator = _mm256_add_ps(accumulator, aVal2); // accumulator += x
++
++ aVal3 = _mm256_load_ps(aPtr);
++ aPtr += 8;
++ cVal3 = _mm256_dp_ps(aVal3, aVal3, 0xF4);
++ accumulator = _mm256_add_ps(accumulator, aVal3); // accumulator += x
++
++ aVal4 = _mm256_load_ps(aPtr);
++ aPtr += 8;
++ cVal4 = _mm256_dp_ps(aVal4, aVal4, 0xF8);
++ accumulator = _mm256_add_ps(accumulator, aVal4); // accumulator += x
++
++ cVal1 = _mm256_or_ps(cVal1, cVal2);
++ cVal3 = _mm256_or_ps(cVal3, cVal4);
++ cVal1 = _mm256_or_ps(cVal1, cVal3);
++
++ squareAccumulator =
++ _mm256_add_ps(squareAccumulator, cVal1); // squareAccumulator += x^2
++ }
++ _mm256_store_ps(meanBuffer,
++ accumulator); // Store the results back into the C container
++ _mm256_store_ps(squareBuffer,
++ squareAccumulator); // Store the results back into the C container
++ newMean = meanBuffer[0];
++ newMean += meanBuffer[1];
++ newMean += meanBuffer[2];
++ newMean += meanBuffer[3];
++ newMean += meanBuffer[4];
++ newMean += meanBuffer[5];
++ newMean += meanBuffer[6];
++ newMean += meanBuffer[7];
++ stdDev = squareBuffer[0];
++ stdDev += squareBuffer[1];
++ stdDev += squareBuffer[2];
++ stdDev += squareBuffer[3];
++ stdDev += squareBuffer[4];
++ stdDev += squareBuffer[5];
++ stdDev += squareBuffer[6];
++ stdDev += squareBuffer[7];
++
++ number = thirtySecondthPoints * 32;
++ for (; number < num_points; number++) {
++ stdDev += (*aPtr) * (*aPtr);
++ newMean += *aPtr++;
++ }
++ newMean /= num_points;
++ stdDev /= num_points;
++ stdDev -= (newMean * newMean);
++ stdDev = sqrtf(stdDev);
+ }
+- newMean /= num_points;
+- stdDev /= num_points;
+- stdDev -= (newMean * newMean);
+- stdDev = sqrtf(stdDev);
+- }
+- *stddev = stdDev;
+- *mean = newMean;
+-
++ *stddev = stdDev;
++ *mean = newMean;
+ }
+ #endif /* LV_HAVE_AVX */
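The four dp_ps calls above use the masks 0xF1/0xF2/0xF4/0xF8 so that each load contributes its sum of squares to a different lane: the high nibble of the mask selects which elements enter the dot product, the low nibble selects which output lanes receive the sum (the rest are zeroed), so the four results can be merged with a plain OR. A minimal SSE4.1 illustration of that mask behaviour (assumes <smmintrin.h>; the lane values in the comments are what the intrinsics produce).

#include <smmintrin.h>

static inline __m128 dp_mask_sketch(void)
{
    __m128 a = _mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f); /* lanes: {1, 2, 3, 4} */
    __m128 d0 = _mm_dp_ps(a, a, 0xF1);             /* {30, 0, 0, 0}: 1+4+9+16 goes to lane 0 */
    __m128 d1 = _mm_dp_ps(a, a, 0xF2);             /* { 0, 30, 0, 0}: same sum, lane 1 */
    return _mm_or_ps(d0, d1);                      /* {30, 30, 0, 0} */
}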
+
+@@ -160,151 +164,164 @@ volk_32f_stddev_and_mean_32f_x2_a_avx(float* stddev, float* mean,
+ #ifdef LV_HAVE_AVX
+ #include <immintrin.h>
+
+-static inline void
+-volk_32f_stddev_and_mean_32f_x2_u_avx(float* stddev, float* mean,
+- const float* inputBuffer,
+- unsigned int num_points)
++static inline void volk_32f_stddev_and_mean_32f_x2_u_avx(float* stddev,
++ float* mean,
++ const float* inputBuffer,
++ unsigned int num_points)
+ {
+- float stdDev = 0;
+- float newMean = 0;
+- if(num_points > 0){
+- unsigned int number = 0;
+- const unsigned int thirtySecondthPoints = num_points / 32;
+-
+- const float* aPtr = inputBuffer;
+- __VOLK_ATTR_ALIGNED(32) float meanBuffer[8];
+- __VOLK_ATTR_ALIGNED(32) float squareBuffer[8];
+-
+- __m256 accumulator = _mm256_setzero_ps();
+- __m256 squareAccumulator = _mm256_setzero_ps();
+- __m256 aVal1, aVal2, aVal3, aVal4;
+- __m256 cVal1, cVal2, cVal3, cVal4;
+- for(;number < thirtySecondthPoints; number++) {
+- aVal1 = _mm256_loadu_ps(aPtr); aPtr += 8;
+- cVal1 = _mm256_dp_ps(aVal1, aVal1, 0xF1);
+- accumulator = _mm256_add_ps(accumulator, aVal1); // accumulator += x
+-
+- aVal2 = _mm256_loadu_ps(aPtr); aPtr += 8;
+- cVal2 = _mm256_dp_ps(aVal2, aVal2, 0xF2);
+- accumulator = _mm256_add_ps(accumulator, aVal2); // accumulator += x
+-
+- aVal3 = _mm256_loadu_ps(aPtr); aPtr += 8;
+- cVal3 = _mm256_dp_ps(aVal3, aVal3, 0xF4);
+- accumulator = _mm256_add_ps(accumulator, aVal3); // accumulator += x
+-
+- aVal4 = _mm256_loadu_ps(aPtr); aPtr += 8;
+- cVal4 = _mm256_dp_ps(aVal4, aVal4, 0xF8);
+- accumulator = _mm256_add_ps(accumulator, aVal4); // accumulator += x
+-
+- cVal1 = _mm256_or_ps(cVal1, cVal2);
+- cVal3 = _mm256_or_ps(cVal3, cVal4);
+- cVal1 = _mm256_or_ps(cVal1, cVal3);
+-
+- squareAccumulator = _mm256_add_ps(squareAccumulator, cVal1); // squareAccumulator += x^2
+- }
+- _mm256_store_ps(meanBuffer,accumulator); // Store the results back into the C container
+- _mm256_store_ps(squareBuffer,squareAccumulator); // Store the results back into the C container
+- newMean = meanBuffer[0];
+- newMean += meanBuffer[1];
+- newMean += meanBuffer[2];
+- newMean += meanBuffer[3];
+- newMean += meanBuffer[4];
+- newMean += meanBuffer[5];
+- newMean += meanBuffer[6];
+- newMean += meanBuffer[7];
+- stdDev = squareBuffer[0];
+- stdDev += squareBuffer[1];
+- stdDev += squareBuffer[2];
+- stdDev += squareBuffer[3];
+- stdDev += squareBuffer[4];
+- stdDev += squareBuffer[5];
+- stdDev += squareBuffer[6];
+- stdDev += squareBuffer[7];
+-
+- number = thirtySecondthPoints * 32;
+- for(;number < num_points; number++){
+- stdDev += (*aPtr) * (*aPtr);
+- newMean += *aPtr++;
++ float stdDev = 0;
++ float newMean = 0;
++ if (num_points > 0) {
++ unsigned int number = 0;
++ const unsigned int thirtySecondthPoints = num_points / 32;
++
++ const float* aPtr = inputBuffer;
++ __VOLK_ATTR_ALIGNED(32) float meanBuffer[8];
++ __VOLK_ATTR_ALIGNED(32) float squareBuffer[8];
++
++ __m256 accumulator = _mm256_setzero_ps();
++ __m256 squareAccumulator = _mm256_setzero_ps();
++ __m256 aVal1, aVal2, aVal3, aVal4;
++ __m256 cVal1, cVal2, cVal3, cVal4;
++ for (; number < thirtySecondthPoints; number++) {
++ aVal1 = _mm256_loadu_ps(aPtr);
++ aPtr += 8;
++ cVal1 = _mm256_dp_ps(aVal1, aVal1, 0xF1);
++ accumulator = _mm256_add_ps(accumulator, aVal1); // accumulator += x
++
++ aVal2 = _mm256_loadu_ps(aPtr);
++ aPtr += 8;
++ cVal2 = _mm256_dp_ps(aVal2, aVal2, 0xF2);
++ accumulator = _mm256_add_ps(accumulator, aVal2); // accumulator += x
++
++ aVal3 = _mm256_loadu_ps(aPtr);
++ aPtr += 8;
++ cVal3 = _mm256_dp_ps(aVal3, aVal3, 0xF4);
++ accumulator = _mm256_add_ps(accumulator, aVal3); // accumulator += x
++
++ aVal4 = _mm256_loadu_ps(aPtr);
++ aPtr += 8;
++ cVal4 = _mm256_dp_ps(aVal4, aVal4, 0xF8);
++ accumulator = _mm256_add_ps(accumulator, aVal4); // accumulator += x
++
++ cVal1 = _mm256_or_ps(cVal1, cVal2);
++ cVal3 = _mm256_or_ps(cVal3, cVal4);
++ cVal1 = _mm256_or_ps(cVal1, cVal3);
++
++ squareAccumulator =
++ _mm256_add_ps(squareAccumulator, cVal1); // squareAccumulator += x^2
++ }
++ _mm256_store_ps(meanBuffer,
++ accumulator); // Store the results back into the C container
++ _mm256_store_ps(squareBuffer,
++ squareAccumulator); // Store the results back into the C container
++ newMean = meanBuffer[0];
++ newMean += meanBuffer[1];
++ newMean += meanBuffer[2];
++ newMean += meanBuffer[3];
++ newMean += meanBuffer[4];
++ newMean += meanBuffer[5];
++ newMean += meanBuffer[6];
++ newMean += meanBuffer[7];
++ stdDev = squareBuffer[0];
++ stdDev += squareBuffer[1];
++ stdDev += squareBuffer[2];
++ stdDev += squareBuffer[3];
++ stdDev += squareBuffer[4];
++ stdDev += squareBuffer[5];
++ stdDev += squareBuffer[6];
++ stdDev += squareBuffer[7];
++
++ number = thirtySecondthPoints * 32;
++ for (; number < num_points; number++) {
++ stdDev += (*aPtr) * (*aPtr);
++ newMean += *aPtr++;
++ }
++ newMean /= num_points;
++ stdDev /= num_points;
++ stdDev -= (newMean * newMean);
++ stdDev = sqrtf(stdDev);
+ }
+- newMean /= num_points;
+- stdDev /= num_points;
+- stdDev -= (newMean * newMean);
+- stdDev = sqrtf(stdDev);
+- }
+- *stddev = stdDev;
+- *mean = newMean;
+-
++ *stddev = stdDev;
++ *mean = newMean;
+ }
+ #endif /* LV_HAVE_AVX */
+
+
+ #ifdef LV_HAVE_SSE4_1
+ #include <smmintrin.h>
+-static inline void
+-volk_32f_stddev_and_mean_32f_x2_a_sse4_1(float* stddev, float* mean,
+- const float* inputBuffer,
+- unsigned int num_points)
++static inline void volk_32f_stddev_and_mean_32f_x2_a_sse4_1(float* stddev,
++ float* mean,
++ const float* inputBuffer,
++ unsigned int num_points)
+ {
+- float returnValue = 0;
+- float newMean = 0;
+- if(num_points > 0){
+- unsigned int number = 0;
+- const unsigned int sixteenthPoints = num_points / 16;
+-
+- const float* aPtr = inputBuffer;
+- __VOLK_ATTR_ALIGNED(16) float meanBuffer[4];
+- __VOLK_ATTR_ALIGNED(16) float squareBuffer[4];
+-
+- __m128 accumulator = _mm_setzero_ps();
+- __m128 squareAccumulator = _mm_setzero_ps();
+- __m128 aVal1, aVal2, aVal3, aVal4;
+- __m128 cVal1, cVal2, cVal3, cVal4;
+- for(;number < sixteenthPoints; number++) {
+- aVal1 = _mm_load_ps(aPtr); aPtr += 4;
+- cVal1 = _mm_dp_ps(aVal1, aVal1, 0xF1);
+- accumulator = _mm_add_ps(accumulator, aVal1); // accumulator += x
+-
+- aVal2 = _mm_load_ps(aPtr); aPtr += 4;
+- cVal2 = _mm_dp_ps(aVal2, aVal2, 0xF2);
+- accumulator = _mm_add_ps(accumulator, aVal2); // accumulator += x
+-
+- aVal3 = _mm_load_ps(aPtr); aPtr += 4;
+- cVal3 = _mm_dp_ps(aVal3, aVal3, 0xF4);
+- accumulator = _mm_add_ps(accumulator, aVal3); // accumulator += x
+-
+- aVal4 = _mm_load_ps(aPtr); aPtr += 4;
+- cVal4 = _mm_dp_ps(aVal4, aVal4, 0xF8);
+- accumulator = _mm_add_ps(accumulator, aVal4); // accumulator += x
+-
+- cVal1 = _mm_or_ps(cVal1, cVal2);
+- cVal3 = _mm_or_ps(cVal3, cVal4);
+- cVal1 = _mm_or_ps(cVal1, cVal3);
+-
+- squareAccumulator = _mm_add_ps(squareAccumulator, cVal1); // squareAccumulator += x^2
+- }
+- _mm_store_ps(meanBuffer,accumulator); // Store the results back into the C container
+- _mm_store_ps(squareBuffer,squareAccumulator); // Store the results back into the C container
+- newMean = meanBuffer[0];
+- newMean += meanBuffer[1];
+- newMean += meanBuffer[2];
+- newMean += meanBuffer[3];
+- returnValue = squareBuffer[0];
+- returnValue += squareBuffer[1];
+- returnValue += squareBuffer[2];
+- returnValue += squareBuffer[3];
+-
+- number = sixteenthPoints * 16;
+- for(;number < num_points; number++){
+- returnValue += (*aPtr) * (*aPtr);
+- newMean += *aPtr++;
++ float returnValue = 0;
++ float newMean = 0;
++ if (num_points > 0) {
++ unsigned int number = 0;
++ const unsigned int sixteenthPoints = num_points / 16;
++
++ const float* aPtr = inputBuffer;
++ __VOLK_ATTR_ALIGNED(16) float meanBuffer[4];
++ __VOLK_ATTR_ALIGNED(16) float squareBuffer[4];
++
++ __m128 accumulator = _mm_setzero_ps();
++ __m128 squareAccumulator = _mm_setzero_ps();
++ __m128 aVal1, aVal2, aVal3, aVal4;
++ __m128 cVal1, cVal2, cVal3, cVal4;
++ for (; number < sixteenthPoints; number++) {
++ aVal1 = _mm_load_ps(aPtr);
++ aPtr += 4;
++ cVal1 = _mm_dp_ps(aVal1, aVal1, 0xF1);
++ accumulator = _mm_add_ps(accumulator, aVal1); // accumulator += x
++
++ aVal2 = _mm_load_ps(aPtr);
++ aPtr += 4;
++ cVal2 = _mm_dp_ps(aVal2, aVal2, 0xF2);
++ accumulator = _mm_add_ps(accumulator, aVal2); // accumulator += x
++
++ aVal3 = _mm_load_ps(aPtr);
++ aPtr += 4;
++ cVal3 = _mm_dp_ps(aVal3, aVal3, 0xF4);
++ accumulator = _mm_add_ps(accumulator, aVal3); // accumulator += x
++
++ aVal4 = _mm_load_ps(aPtr);
++ aPtr += 4;
++ cVal4 = _mm_dp_ps(aVal4, aVal4, 0xF8);
++ accumulator = _mm_add_ps(accumulator, aVal4); // accumulator += x
++
++ cVal1 = _mm_or_ps(cVal1, cVal2);
++ cVal3 = _mm_or_ps(cVal3, cVal4);
++ cVal1 = _mm_or_ps(cVal1, cVal3);
++
++ squareAccumulator =
++ _mm_add_ps(squareAccumulator, cVal1); // squareAccumulator += x^2
++ }
++ _mm_store_ps(meanBuffer,
++ accumulator); // Store the results back into the C container
++ _mm_store_ps(squareBuffer,
++ squareAccumulator); // Store the results back into the C container
++ newMean = meanBuffer[0];
++ newMean += meanBuffer[1];
++ newMean += meanBuffer[2];
++ newMean += meanBuffer[3];
++ returnValue = squareBuffer[0];
++ returnValue += squareBuffer[1];
++ returnValue += squareBuffer[2];
++ returnValue += squareBuffer[3];
++
++ number = sixteenthPoints * 16;
++ for (; number < num_points; number++) {
++ returnValue += (*aPtr) * (*aPtr);
++ newMean += *aPtr++;
++ }
++ newMean /= num_points;
++ returnValue /= num_points;
++ returnValue -= (newMean * newMean);
++ returnValue = sqrtf(returnValue);
+ }
+- newMean /= num_points;
+- returnValue /= num_points;
+- returnValue -= (newMean * newMean);
+- returnValue = sqrtf(returnValue);
+- }
+- *stddev = returnValue;
+- *mean = newMean;
++ *stddev = returnValue;
++ *mean = newMean;
+ }
+ #endif /* LV_HAVE_SSE4_1 */
+
+@@ -312,86 +329,86 @@ volk_32f_stddev_and_mean_32f_x2_a_sse4_1(float* stddev, float* mean,
+ #ifdef LV_HAVE_SSE
+ #include <xmmintrin.h>
+
+-static inline void
+-volk_32f_stddev_and_mean_32f_x2_a_sse(float* stddev, float* mean,
+- const float* inputBuffer,
+- unsigned int num_points)
++static inline void volk_32f_stddev_and_mean_32f_x2_a_sse(float* stddev,
++ float* mean,
++ const float* inputBuffer,
++ unsigned int num_points)
+ {
+- float returnValue = 0;
+- float newMean = 0;
+- if(num_points > 0){
+- unsigned int number = 0;
+- const unsigned int quarterPoints = num_points / 4;
+-
+- const float* aPtr = inputBuffer;
+- __VOLK_ATTR_ALIGNED(16) float meanBuffer[4];
+- __VOLK_ATTR_ALIGNED(16) float squareBuffer[4];
+-
+- __m128 accumulator = _mm_setzero_ps();
+- __m128 squareAccumulator = _mm_setzero_ps();
+- __m128 aVal = _mm_setzero_ps();
+- for(;number < quarterPoints; number++) {
+- aVal = _mm_load_ps(aPtr); // aVal = x
+- accumulator = _mm_add_ps(accumulator, aVal); // accumulator += x
+- aVal = _mm_mul_ps(aVal, aVal); // squareAccumulator += x^2
+- squareAccumulator = _mm_add_ps(squareAccumulator, aVal);
+- aPtr += 4;
++ float returnValue = 0;
++ float newMean = 0;
++ if (num_points > 0) {
++ unsigned int number = 0;
++ const unsigned int quarterPoints = num_points / 4;
++
++ const float* aPtr = inputBuffer;
++ __VOLK_ATTR_ALIGNED(16) float meanBuffer[4];
++ __VOLK_ATTR_ALIGNED(16) float squareBuffer[4];
++
++ __m128 accumulator = _mm_setzero_ps();
++ __m128 squareAccumulator = _mm_setzero_ps();
++ __m128 aVal = _mm_setzero_ps();
++ for (; number < quarterPoints; number++) {
++ aVal = _mm_load_ps(aPtr); // aVal = x
++ accumulator = _mm_add_ps(accumulator, aVal); // accumulator += x
++ aVal = _mm_mul_ps(aVal, aVal); // squareAccumulator += x^2
++ squareAccumulator = _mm_add_ps(squareAccumulator, aVal);
++ aPtr += 4;
++ }
++ _mm_store_ps(meanBuffer,
++ accumulator); // Store the results back into the C container
++ _mm_store_ps(squareBuffer,
++ squareAccumulator); // Store the results back into the C container
++ newMean = meanBuffer[0];
++ newMean += meanBuffer[1];
++ newMean += meanBuffer[2];
++ newMean += meanBuffer[3];
++ returnValue = squareBuffer[0];
++ returnValue += squareBuffer[1];
++ returnValue += squareBuffer[2];
++ returnValue += squareBuffer[3];
++
++ number = quarterPoints * 4;
++ for (; number < num_points; number++) {
++ returnValue += (*aPtr) * (*aPtr);
++ newMean += *aPtr++;
++ }
++ newMean /= num_points;
++ returnValue /= num_points;
++ returnValue -= (newMean * newMean);
++ returnValue = sqrtf(returnValue);
+ }
+- _mm_store_ps(meanBuffer,accumulator); // Store the results back into the C container
+- _mm_store_ps(squareBuffer,squareAccumulator); // Store the results back into the C container
+- newMean = meanBuffer[0];
+- newMean += meanBuffer[1];
+- newMean += meanBuffer[2];
+- newMean += meanBuffer[3];
+- returnValue = squareBuffer[0];
+- returnValue += squareBuffer[1];
+- returnValue += squareBuffer[2];
+- returnValue += squareBuffer[3];
+-
+- number = quarterPoints * 4;
+- for(;number < num_points; number++){
+- returnValue += (*aPtr) * (*aPtr);
+- newMean += *aPtr++;
+- }
+- newMean /= num_points;
+- returnValue /= num_points;
+- returnValue -= (newMean * newMean);
+- returnValue = sqrtf(returnValue);
+- }
+- *stddev = returnValue;
+- *mean = newMean;
++ *stddev = returnValue;
++ *mean = newMean;
+ }
+ #endif /* LV_HAVE_SSE */
+
+
+ #ifdef LV_HAVE_GENERIC
+
+-static inline void
+-volk_32f_stddev_and_mean_32f_x2_generic(float* stddev, float* mean,
+- const float* inputBuffer,
+- unsigned int num_points)
++static inline void volk_32f_stddev_and_mean_32f_x2_generic(float* stddev,
++ float* mean,
++ const float* inputBuffer,
++ unsigned int num_points)
+ {
+- float returnValue = 0;
+- float newMean = 0;
+- if(num_points > 0){
+- const float* aPtr = inputBuffer;
+- unsigned int number = 0;
+-
+- for(number = 0; number < num_points; number++){
+- returnValue += (*aPtr) * (*aPtr);
+- newMean += *aPtr++;
++ float returnValue = 0;
++ float newMean = 0;
++ if (num_points > 0) {
++ const float* aPtr = inputBuffer;
++ unsigned int number = 0;
++
++ for (number = 0; number < num_points; number++) {
++ returnValue += (*aPtr) * (*aPtr);
++ newMean += *aPtr++;
++ }
++ newMean /= num_points;
++ returnValue /= num_points;
++ returnValue -= (newMean * newMean);
++ returnValue = sqrtf(returnValue);
+ }
+- newMean /= num_points;
+- returnValue /= num_points;
+- returnValue -= (newMean * newMean);
+- returnValue = sqrtf(returnValue);
+- }
+- *stddev = returnValue;
+- *mean = newMean;
++ *stddev = returnValue;
++ *mean = newMean;
+ }
+ #endif /* LV_HAVE_GENERIC */
+
+
+-
+-
+ #endif /* INCLUDED_volk_32f_stddev_and_mean_32f_x2_a_H */
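All of the variants in this file form the standard deviation as sqrt(E[x^2] - mean^2) in single precision, which can lose accuracy when the mean is large relative to the spread of the data. For comparison only (not part of this patch), a scalar Welford-style update that avoids that cancellation while producing the same population statistics:

#include <math.h>

static inline void stddev_and_mean_welford_sketch(float* stddev,
                                                  float* mean,
                                                  const float* in,
                                                  unsigned int num_points)
{
    float m = 0.0f;  /* running mean */
    float m2 = 0.0f; /* running sum of squared deviations */
    for (unsigned int k = 0; k < num_points; k++) {
        float delta = in[k] - m;
        m += delta / (float)(k + 1);
        m2 += delta * (in[k] - m); /* uses the updated mean */
    }
    *mean = m;
    *stddev = (num_points > 0) ? sqrtf(m2 / (float)num_points) : 0.0f;
}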
+diff --git a/kernels/volk/volk_32f_tan_32f.h b/kernels/volk/volk_32f_tan_32f.h
+index 239b745..a623a66 100644
+--- a/kernels/volk/volk_32f_tan_32f.h
++++ b/kernels/volk/volk_32f_tan_32f.h
+@@ -71,9 +71,9 @@
+ * \endcode
+ */
+
+-#include <stdio.h>
+-#include <math.h>
+ #include <inttypes.h>
++#include <math.h>
++#include <stdio.h>
+
+ #ifndef INCLUDED_volk_32f_tan_32f_a_H
+ #define INCLUDED_volk_32f_tan_32f_a_H
+@@ -82,78 +82,102 @@
+ #include <immintrin.h>
+
+ static inline void
+-volk_32f_tan_32f_a_avx2_fma(float* bVector, const float* aVector,
+- unsigned int num_points)
++volk_32f_tan_32f_a_avx2_fma(float* bVector, const float* aVector, unsigned int num_points)
+ {
+- float* bPtr = bVector;
+- const float* aPtr = aVector;
+-
+- unsigned int number = 0;
+- unsigned int eighthPoints = num_points / 8;
+- unsigned int i = 0;
+-
+- __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, fzeroes;
+- __m256 sine, cosine, tangent, condition1, condition2, condition3;
+- __m256i q, r, ones, twos, fours;
+-
+- m4pi = _mm256_set1_ps(1.273239545);
+- pio4A = _mm256_set1_ps(0.78515625);
+- pio4B = _mm256_set1_ps(0.241876e-3);
+- ffours = _mm256_set1_ps(4.0);
+- ftwos = _mm256_set1_ps(2.0);
+- fones = _mm256_set1_ps(1.0);
+- fzeroes = _mm256_setzero_ps();
+- ones = _mm256_set1_epi32(1);
+- twos = _mm256_set1_epi32(2);
+- fours = _mm256_set1_epi32(4);
+-
+- cp1 = _mm256_set1_ps(1.0);
+- cp2 = _mm256_set1_ps(0.83333333e-1);
+- cp3 = _mm256_set1_ps(0.2777778e-2);
+- cp4 = _mm256_set1_ps(0.49603e-4);
+- cp5 = _mm256_set1_ps(0.551e-6);
+-
+- for(;number < eighthPoints; number++){
+- aVal = _mm256_load_ps(aPtr);
+- s = _mm256_sub_ps(aVal, _mm256_and_ps(_mm256_mul_ps(aVal, ftwos), _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
+- q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
+- r = _mm256_add_epi32(q, _mm256_and_si256(q, ones));
+-
+- s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4A, s);
+- s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4B, s);
+-
+- s = _mm256_div_ps(s, _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
+- s = _mm256_mul_ps(s, s);
+- // Evaluate Taylor series
+- s = _mm256_mul_ps(_mm256_fmadd_ps(_mm256_fmsub_ps(_mm256_fmadd_ps(_mm256_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2), s, cp1), s);
+-
+- for(i = 0; i < 3; i++){
+- s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
++ float* bPtr = bVector;
++ const float* aPtr = aVector;
++
++ unsigned int number = 0;
++ unsigned int eighthPoints = num_points / 8;
++ unsigned int i = 0;
++
++ __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones,
++ fzeroes;
++ __m256 sine, cosine, tangent, condition1, condition2, condition3;
++ __m256i q, r, ones, twos, fours;
++
++ m4pi = _mm256_set1_ps(1.273239545);
++ pio4A = _mm256_set1_ps(0.78515625);
++ pio4B = _mm256_set1_ps(0.241876e-3);
++ ffours = _mm256_set1_ps(4.0);
++ ftwos = _mm256_set1_ps(2.0);
++ fones = _mm256_set1_ps(1.0);
++ fzeroes = _mm256_setzero_ps();
++ ones = _mm256_set1_epi32(1);
++ twos = _mm256_set1_epi32(2);
++ fours = _mm256_set1_epi32(4);
++
++ cp1 = _mm256_set1_ps(1.0);
++ cp2 = _mm256_set1_ps(0.83333333e-1);
++ cp3 = _mm256_set1_ps(0.2777778e-2);
++ cp4 = _mm256_set1_ps(0.49603e-4);
++ cp5 = _mm256_set1_ps(0.551e-6);
++
++ for (; number < eighthPoints; number++) {
++ aVal = _mm256_load_ps(aPtr);
++ s = _mm256_sub_ps(aVal,
++ _mm256_and_ps(_mm256_mul_ps(aVal, ftwos),
++ _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
++ q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
++ r = _mm256_add_epi32(q, _mm256_and_si256(q, ones));
++
++ s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4A, s);
++ s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4B, s);
++
++ s = _mm256_div_ps(
++ s,
++ _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
++ s = _mm256_mul_ps(s, s);
++ // Evaluate Taylor series
++ s = _mm256_mul_ps(
++ _mm256_fmadd_ps(
++ _mm256_fmsub_ps(
++ _mm256_fmadd_ps(_mm256_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2),
++ s,
++ cp1),
++ s);
++
++ for (i = 0; i < 3; i++) {
++ s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
++ }
++ s = _mm256_div_ps(s, ftwos);
++
++ sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
++ cosine = _mm256_sub_ps(fones, s);
++
++ condition1 = _mm256_cmp_ps(
++ _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)),
++ fzeroes,
++ _CMP_NEQ_UQ);
++ condition2 = _mm256_cmp_ps(
++ _mm256_cmp_ps(
++ _mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes, _CMP_NEQ_UQ),
++ _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS),
++ _CMP_NEQ_UQ);
++ condition3 = _mm256_cmp_ps(
++ _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, twos), fours)),
++ fzeroes,
++ _CMP_NEQ_UQ);
++
++ __m256 temp = cosine;
++ cosine =
++ _mm256_add_ps(cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1));
++ sine = _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(temp, sine), condition1));
++ sine = _mm256_sub_ps(
++ sine, _mm256_and_ps(_mm256_mul_ps(sine, _mm256_set1_ps(2.0f)), condition2));
++ cosine = _mm256_sub_ps(
++ cosine,
++ _mm256_and_ps(_mm256_mul_ps(cosine, _mm256_set1_ps(2.0f)), condition3));
++ tangent = _mm256_div_ps(sine, cosine);
++ _mm256_store_ps(bPtr, tangent);
++ aPtr += 8;
++ bPtr += 8;
++ }
++
++ number = eighthPoints * 8;
++ for (; number < num_points; number++) {
++ *bPtr++ = tan(*aPtr++);
+ }
+- s = _mm256_div_ps(s, ftwos);
+-
+- sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
+- cosine = _mm256_sub_ps(fones, s);
+-
+- condition1 = _mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)), fzeroes, _CMP_NEQ_UQ);
+- condition2 = _mm256_cmp_ps(_mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes, _CMP_NEQ_UQ), _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS), _CMP_NEQ_UQ);
+- condition3 = _mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, twos), fours)), fzeroes, _CMP_NEQ_UQ);
+-
+- __m256 temp = cosine;
+- cosine = _mm256_add_ps(cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1));
+- sine = _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(temp, sine), condition1));
+- sine = _mm256_sub_ps(sine, _mm256_and_ps(_mm256_mul_ps(sine, _mm256_set1_ps(2.0f)), condition2));
+- cosine = _mm256_sub_ps(cosine, _mm256_and_ps(_mm256_mul_ps(cosine, _mm256_set1_ps(2.0f)), condition3));
+- tangent = _mm256_div_ps(sine, cosine);
+- _mm256_store_ps(bPtr, tangent);
+- aPtr += 8;
+- bPtr += 8;
+- }
+-
+- number = eighthPoints * 8;
+- for(;number < num_points; number++){
+- *bPtr++ = tan(*aPtr++);
+- }
+ }
+
+ #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for aligned */
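The tan kernels reuse the reduction and series sketched earlier for volk_32f_sin_32f and add two twists: condition1 swaps sine and cosine instead of only promoting cosine, and condition3 ((q + 2) & 4) flips the cosine sign before the final division. A compact scalar sketch of the whole flow, for illustration only (assumes <math.h>), not part of the patch:

#include <math.h>

static inline float tan_sketch(float x)
{
    float s = fabsf(x);
    int q = (int)floorf(s * 1.273239545f); /* multiples of pi/4 */
    int r = q + (q & 1);
    s -= r * 0.78515625f;
    s -= r * 0.241876e-3f;
    s /= 8.0f;
    s *= s;
    s = ((((s * 0.551e-6f - 0.49603e-4f) * s + 0.2777778e-2f) * s
          - 0.83333333e-1f) * s + 1.0f) * s; /* ~2*(1 - cos t) */
    for (int i = 0; i < 3; i++)
        s = s * (4.0f - s);                  /* double the angle back up */
    s /= 2.0f;
    float sine   = sqrtf((2.0f - s) * s);
    float cosine = 1.0f - s;
    if ((q + 1) & 2) {                       /* condition1: swap sine and cosine */
        float t = cosine;
        cosine = sine;
        sine = t;
    }
    if (((q & 4) != 0) != (x < 0.0f))        /* condition2: sine sign */
        sine = -sine;
    if ((q + 2) & 4)                         /* condition3: cosine sign */
        cosine = -cosine;
    return sine / cosine;
}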
+@@ -162,78 +186,109 @@ volk_32f_tan_32f_a_avx2_fma(float* bVector, const float* aVector,
+ #include <immintrin.h>
+
+ static inline void
+-volk_32f_tan_32f_a_avx2(float* bVector, const float* aVector,
+- unsigned int num_points)
++volk_32f_tan_32f_a_avx2(float* bVector, const float* aVector, unsigned int num_points)
+ {
+- float* bPtr = bVector;
+- const float* aPtr = aVector;
+-
+- unsigned int number = 0;
+- unsigned int eighthPoints = num_points / 8;
+- unsigned int i = 0;
+-
+- __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, fzeroes;
+- __m256 sine, cosine, tangent, condition1, condition2, condition3;
+- __m256i q, r, ones, twos, fours;
+-
+- m4pi = _mm256_set1_ps(1.273239545);
+- pio4A = _mm256_set1_ps(0.78515625);
+- pio4B = _mm256_set1_ps(0.241876e-3);
+- ffours = _mm256_set1_ps(4.0);
+- ftwos = _mm256_set1_ps(2.0);
+- fones = _mm256_set1_ps(1.0);
+- fzeroes = _mm256_setzero_ps();
+- ones = _mm256_set1_epi32(1);
+- twos = _mm256_set1_epi32(2);
+- fours = _mm256_set1_epi32(4);
+-
+- cp1 = _mm256_set1_ps(1.0);
+- cp2 = _mm256_set1_ps(0.83333333e-1);
+- cp3 = _mm256_set1_ps(0.2777778e-2);
+- cp4 = _mm256_set1_ps(0.49603e-4);
+- cp5 = _mm256_set1_ps(0.551e-6);
+-
+- for(;number < eighthPoints; number++){
+- aVal = _mm256_load_ps(aPtr);
+- s = _mm256_sub_ps(aVal, _mm256_and_ps(_mm256_mul_ps(aVal, ftwos), _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
+- q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
+- r = _mm256_add_epi32(q, _mm256_and_si256(q, ones));
+-
+- s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4A));
+- s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4B));
+-
+- s = _mm256_div_ps(s, _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
+- s = _mm256_mul_ps(s, s);
+- // Evaluate Taylor series
+- s = _mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(s, cp5), cp4), s), cp3), s), cp2), s), cp1), s);
+-
+- for(i = 0; i < 3; i++){
+- s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
++ float* bPtr = bVector;
++ const float* aPtr = aVector;
++
++ unsigned int number = 0;
++ unsigned int eighthPoints = num_points / 8;
++ unsigned int i = 0;
++
++ __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones,
++ fzeroes;
++ __m256 sine, cosine, tangent, condition1, condition2, condition3;
++ __m256i q, r, ones, twos, fours;
++
++ m4pi = _mm256_set1_ps(1.273239545);
++ pio4A = _mm256_set1_ps(0.78515625);
++ pio4B = _mm256_set1_ps(0.241876e-3);
++ ffours = _mm256_set1_ps(4.0);
++ ftwos = _mm256_set1_ps(2.0);
++ fones = _mm256_set1_ps(1.0);
++ fzeroes = _mm256_setzero_ps();
++ ones = _mm256_set1_epi32(1);
++ twos = _mm256_set1_epi32(2);
++ fours = _mm256_set1_epi32(4);
++
++ cp1 = _mm256_set1_ps(1.0);
++ cp2 = _mm256_set1_ps(0.83333333e-1);
++ cp3 = _mm256_set1_ps(0.2777778e-2);
++ cp4 = _mm256_set1_ps(0.49603e-4);
++ cp5 = _mm256_set1_ps(0.551e-6);
++
++ for (; number < eighthPoints; number++) {
++ aVal = _mm256_load_ps(aPtr);
++ s = _mm256_sub_ps(aVal,
++ _mm256_and_ps(_mm256_mul_ps(aVal, ftwos),
++ _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
++ q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
++ r = _mm256_add_epi32(q, _mm256_and_si256(q, ones));
++
++ s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4A));
++ s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4B));
++
++ s = _mm256_div_ps(
++ s,
++ _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
++ s = _mm256_mul_ps(s, s);
++ // Evaluate Taylor series
++ s = _mm256_mul_ps(
++ _mm256_add_ps(
++ _mm256_mul_ps(
++ _mm256_sub_ps(
++ _mm256_mul_ps(
++ _mm256_add_ps(
++ _mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(s, cp5), cp4),
++ s),
++ cp3),
++ s),
++ cp2),
++ s),
++ cp1),
++ s);
++
++ for (i = 0; i < 3; i++) {
++ s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
++ }
++ s = _mm256_div_ps(s, ftwos);
++
++ sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
++ cosine = _mm256_sub_ps(fones, s);
++
++ condition1 = _mm256_cmp_ps(
++ _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)),
++ fzeroes,
++ _CMP_NEQ_UQ);
++ condition2 = _mm256_cmp_ps(
++ _mm256_cmp_ps(
++ _mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes, _CMP_NEQ_UQ),
++ _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS),
++ _CMP_NEQ_UQ);
++ condition3 = _mm256_cmp_ps(
++ _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, twos), fours)),
++ fzeroes,
++ _CMP_NEQ_UQ);
++
++ __m256 temp = cosine;
++ cosine =
++ _mm256_add_ps(cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1));
++ sine = _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(temp, sine), condition1));
++ sine = _mm256_sub_ps(
++ sine, _mm256_and_ps(_mm256_mul_ps(sine, _mm256_set1_ps(2.0f)), condition2));
++ cosine = _mm256_sub_ps(
++ cosine,
++ _mm256_and_ps(_mm256_mul_ps(cosine, _mm256_set1_ps(2.0f)), condition3));
++ tangent = _mm256_div_ps(sine, cosine);
++ _mm256_store_ps(bPtr, tangent);
++ aPtr += 8;
++ bPtr += 8;
++ }
++
++ number = eighthPoints * 8;
++ for (; number < num_points; number++) {
++ *bPtr++ = tan(*aPtr++);
+ }
+- s = _mm256_div_ps(s, ftwos);
+-
+- sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
+- cosine = _mm256_sub_ps(fones, s);
+-
+- condition1 = _mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)), fzeroes, _CMP_NEQ_UQ);
+- condition2 = _mm256_cmp_ps(_mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes, _CMP_NEQ_UQ), _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS), _CMP_NEQ_UQ);
+- condition3 = _mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, twos), fours)), fzeroes, _CMP_NEQ_UQ);
+-
+- __m256 temp = cosine;
+- cosine = _mm256_add_ps(cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1));
+- sine = _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(temp, sine), condition1));
+- sine = _mm256_sub_ps(sine, _mm256_and_ps(_mm256_mul_ps(sine, _mm256_set1_ps(2.0f)), condition2));
+- cosine = _mm256_sub_ps(cosine, _mm256_and_ps(_mm256_mul_ps(cosine, _mm256_set1_ps(2.0f)), condition3));
+- tangent = _mm256_div_ps(sine, cosine);
+- _mm256_store_ps(bPtr, tangent);
+- aPtr += 8;
+- bPtr += 8;
+- }
+-
+- number = eighthPoints * 8;
+- for(;number < num_points; number++){
+- *bPtr++ = tan(*aPtr++);
+- }
+ }
+
+ #endif /* LV_HAVE_AVX2 for aligned */
+@@ -242,78 +297,97 @@ volk_32f_tan_32f_a_avx2(float* bVector, const float* aVector,
+ #include <smmintrin.h>
+
+ static inline void
+-volk_32f_tan_32f_a_sse4_1(float* bVector, const float* aVector,
+- unsigned int num_points)
++volk_32f_tan_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
+ {
+- float* bPtr = bVector;
+- const float* aPtr = aVector;
+-
+- unsigned int number = 0;
+- unsigned int quarterPoints = num_points / 4;
+- unsigned int i = 0;
+-
+- __m128 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, fzeroes;
+- __m128 sine, cosine, tangent, condition1, condition2, condition3;
+- __m128i q, r, ones, twos, fours;
+-
+- m4pi = _mm_set1_ps(1.273239545);
+- pio4A = _mm_set1_ps(0.78515625);
+- pio4B = _mm_set1_ps(0.241876e-3);
+- ffours = _mm_set1_ps(4.0);
+- ftwos = _mm_set1_ps(2.0);
+- fones = _mm_set1_ps(1.0);
+- fzeroes = _mm_setzero_ps();
+- ones = _mm_set1_epi32(1);
+- twos = _mm_set1_epi32(2);
+- fours = _mm_set1_epi32(4);
+-
+- cp1 = _mm_set1_ps(1.0);
+- cp2 = _mm_set1_ps(0.83333333e-1);
+- cp3 = _mm_set1_ps(0.2777778e-2);
+- cp4 = _mm_set1_ps(0.49603e-4);
+- cp5 = _mm_set1_ps(0.551e-6);
+-
+- for(;number < quarterPoints; number++){
+- aVal = _mm_load_ps(aPtr);
+- s = _mm_sub_ps(aVal, _mm_and_ps(_mm_mul_ps(aVal, ftwos), _mm_cmplt_ps(aVal, fzeroes)));
+- q = _mm_cvtps_epi32(_mm_floor_ps(_mm_mul_ps(s, m4pi)));
+- r = _mm_add_epi32(q, _mm_and_si128(q, ones));
+-
+- s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4A));
+- s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4B));
+-
+- s = _mm_div_ps(s, _mm_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
+- s = _mm_mul_ps(s, s);
+- // Evaluate Taylor series
+- s = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s, cp5), cp4), s), cp3), s), cp2), s), cp1), s);
+-
+- for(i = 0; i < 3; i++){
+- s = _mm_mul_ps(s, _mm_sub_ps(ffours, s));
++ float* bPtr = bVector;
++ const float* aPtr = aVector;
++
++ unsigned int number = 0;
++ unsigned int quarterPoints = num_points / 4;
++ unsigned int i = 0;
++
++ __m128 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones,
++ fzeroes;
++ __m128 sine, cosine, tangent, condition1, condition2, condition3;
++ __m128i q, r, ones, twos, fours;
++
++ m4pi = _mm_set1_ps(1.273239545);
++ pio4A = _mm_set1_ps(0.78515625);
++ pio4B = _mm_set1_ps(0.241876e-3);
++ ffours = _mm_set1_ps(4.0);
++ ftwos = _mm_set1_ps(2.0);
++ fones = _mm_set1_ps(1.0);
++ fzeroes = _mm_setzero_ps();
++ ones = _mm_set1_epi32(1);
++ twos = _mm_set1_epi32(2);
++ fours = _mm_set1_epi32(4);
++
++ cp1 = _mm_set1_ps(1.0);
++ cp2 = _mm_set1_ps(0.83333333e-1);
++ cp3 = _mm_set1_ps(0.2777778e-2);
++ cp4 = _mm_set1_ps(0.49603e-4);
++ cp5 = _mm_set1_ps(0.551e-6);
++
++ for (; number < quarterPoints; number++) {
++ aVal = _mm_load_ps(aPtr);
++ s = _mm_sub_ps(aVal,
++ _mm_and_ps(_mm_mul_ps(aVal, ftwos), _mm_cmplt_ps(aVal, fzeroes)));
++ q = _mm_cvtps_epi32(_mm_floor_ps(_mm_mul_ps(s, m4pi)));
++ r = _mm_add_epi32(q, _mm_and_si128(q, ones));
++
++ s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4A));
++ s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4B));
++
++ s = _mm_div_ps(
++ s, _mm_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
++ s = _mm_mul_ps(s, s);
++ // Evaluate Taylor series
++ s = _mm_mul_ps(
++ _mm_add_ps(
++ _mm_mul_ps(
++ _mm_sub_ps(
++ _mm_mul_ps(
++ _mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s, cp5), cp4), s),
++ cp3),
++ s),
++ cp2),
++ s),
++ cp1),
++ s);
++
++ for (i = 0; i < 3; i++) {
++ s = _mm_mul_ps(s, _mm_sub_ps(ffours, s));
++ }
++ s = _mm_div_ps(s, ftwos);
++
++ sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s));
++ cosine = _mm_sub_ps(fones, s);
++
++ condition1 = _mm_cmpneq_ps(
++ _mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, ones), twos)), fzeroes);
++ condition2 = _mm_cmpneq_ps(
++ _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(q, fours)), fzeroes),
++ _mm_cmplt_ps(aVal, fzeroes));
++ condition3 = _mm_cmpneq_ps(
++ _mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, twos), fours)), fzeroes);
++
++ __m128 temp = cosine;
++ cosine = _mm_add_ps(cosine, _mm_and_ps(_mm_sub_ps(sine, cosine), condition1));
++ sine = _mm_add_ps(sine, _mm_and_ps(_mm_sub_ps(temp, sine), condition1));
++ sine =
++ _mm_sub_ps(sine, _mm_and_ps(_mm_mul_ps(sine, _mm_set1_ps(2.0f)), condition2));
++ cosine = _mm_sub_ps(
++ cosine, _mm_and_ps(_mm_mul_ps(cosine, _mm_set1_ps(2.0f)), condition3));
++ tangent = _mm_div_ps(sine, cosine);
++ _mm_store_ps(bPtr, tangent);
++ aPtr += 4;
++ bPtr += 4;
++ }
++
++ number = quarterPoints * 4;
++ for (; number < num_points; number++) {
++ *bPtr++ = tanf(*aPtr++);
+ }
+- s = _mm_div_ps(s, ftwos);
+-
+- sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s));
+- cosine = _mm_sub_ps(fones, s);
+-
+- condition1 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, ones), twos)), fzeroes);
+- condition2 = _mm_cmpneq_ps(_mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(q, fours)), fzeroes), _mm_cmplt_ps(aVal, fzeroes));
+- condition3 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, twos), fours)), fzeroes);
+-
+- __m128 temp = cosine;
+- cosine = _mm_add_ps(cosine, _mm_and_ps(_mm_sub_ps(sine, cosine), condition1));
+- sine = _mm_add_ps(sine, _mm_and_ps(_mm_sub_ps(temp, sine), condition1));
+- sine = _mm_sub_ps(sine, _mm_and_ps(_mm_mul_ps(sine, _mm_set1_ps(2.0f)), condition2));
+- cosine = _mm_sub_ps(cosine, _mm_and_ps(_mm_mul_ps(cosine, _mm_set1_ps(2.0f)), condition3));
+- tangent = _mm_div_ps(sine, cosine);
+- _mm_store_ps(bPtr, tangent);
+- aPtr += 4;
+- bPtr += 4;
+- }
+-
+- number = quarterPoints * 4;
+- for(;number < num_points; number++){
+- *bPtr++ = tanf(*aPtr++);
+- }
+ }
+
+ #endif /* LV_HAVE_SSE4_1 for aligned */
+@@ -328,78 +402,102 @@ volk_32f_tan_32f_a_sse4_1(float* bVector, const float* aVector,
+ #include <immintrin.h>
+
+ static inline void
+-volk_32f_tan_32f_u_avx2_fma(float* bVector, const float* aVector,
+- unsigned int num_points)
++volk_32f_tan_32f_u_avx2_fma(float* bVector, const float* aVector, unsigned int num_points)
+ {
+- float* bPtr = bVector;
+- const float* aPtr = aVector;
+-
+- unsigned int number = 0;
+- unsigned int eighthPoints = num_points / 8;
+- unsigned int i = 0;
+-
+- __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, fzeroes;
+- __m256 sine, cosine, tangent, condition1, condition2, condition3;
+- __m256i q, r, ones, twos, fours;
+-
+- m4pi = _mm256_set1_ps(1.273239545);
+- pio4A = _mm256_set1_ps(0.78515625);
+- pio4B = _mm256_set1_ps(0.241876e-3);
+- ffours = _mm256_set1_ps(4.0);
+- ftwos = _mm256_set1_ps(2.0);
+- fones = _mm256_set1_ps(1.0);
+- fzeroes = _mm256_setzero_ps();
+- ones = _mm256_set1_epi32(1);
+- twos = _mm256_set1_epi32(2);
+- fours = _mm256_set1_epi32(4);
+-
+- cp1 = _mm256_set1_ps(1.0);
+- cp2 = _mm256_set1_ps(0.83333333e-1);
+- cp3 = _mm256_set1_ps(0.2777778e-2);
+- cp4 = _mm256_set1_ps(0.49603e-4);
+- cp5 = _mm256_set1_ps(0.551e-6);
+-
+- for(;number < eighthPoints; number++){
+- aVal = _mm256_loadu_ps(aPtr);
+- s = _mm256_sub_ps(aVal, _mm256_and_ps(_mm256_mul_ps(aVal, ftwos), _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
+- q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
+- r = _mm256_add_epi32(q, _mm256_and_si256(q, ones));
+-
+- s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4A, s);
+- s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4B, s);
+-
+- s = _mm256_div_ps(s, _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
+- s = _mm256_mul_ps(s, s);
+- // Evaluate Taylor series
+- s = _mm256_mul_ps(_mm256_fmadd_ps(_mm256_fmsub_ps(_mm256_fmadd_ps(_mm256_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2), s, cp1), s);
+-
+- for(i = 0; i < 3; i++){
+- s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
++ float* bPtr = bVector;
++ const float* aPtr = aVector;
++
++ unsigned int number = 0;
++ unsigned int eighthPoints = num_points / 8;
++ unsigned int i = 0;
++
++ __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones,
++ fzeroes;
++ __m256 sine, cosine, tangent, condition1, condition2, condition3;
++ __m256i q, r, ones, twos, fours;
++
++ m4pi = _mm256_set1_ps(1.273239545);
++ pio4A = _mm256_set1_ps(0.78515625);
++ pio4B = _mm256_set1_ps(0.241876e-3);
++ ffours = _mm256_set1_ps(4.0);
++ ftwos = _mm256_set1_ps(2.0);
++ fones = _mm256_set1_ps(1.0);
++ fzeroes = _mm256_setzero_ps();
++ ones = _mm256_set1_epi32(1);
++ twos = _mm256_set1_epi32(2);
++ fours = _mm256_set1_epi32(4);
++
++ cp1 = _mm256_set1_ps(1.0);
++ cp2 = _mm256_set1_ps(0.83333333e-1);
++ cp3 = _mm256_set1_ps(0.2777778e-2);
++ cp4 = _mm256_set1_ps(0.49603e-4);
++ cp5 = _mm256_set1_ps(0.551e-6);
++
++ for (; number < eighthPoints; number++) {
++ aVal = _mm256_loadu_ps(aPtr);
++ s = _mm256_sub_ps(aVal,
++ _mm256_and_ps(_mm256_mul_ps(aVal, ftwos),
++ _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
++ q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
++ r = _mm256_add_epi32(q, _mm256_and_si256(q, ones));
++
++ s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4A, s);
++ s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4B, s);
++
++ s = _mm256_div_ps(
++ s,
++ _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
++ s = _mm256_mul_ps(s, s);
++ // Evaluate Taylor series
++ s = _mm256_mul_ps(
++ _mm256_fmadd_ps(
++ _mm256_fmsub_ps(
++ _mm256_fmadd_ps(_mm256_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2),
++ s,
++ cp1),
++ s);
++
++ for (i = 0; i < 3; i++) {
++ s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
++ }
++ s = _mm256_div_ps(s, ftwos);
++
++ sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
++ cosine = _mm256_sub_ps(fones, s);
++
++ condition1 = _mm256_cmp_ps(
++ _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)),
++ fzeroes,
++ _CMP_NEQ_UQ);
++ condition2 = _mm256_cmp_ps(
++ _mm256_cmp_ps(
++ _mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes, _CMP_NEQ_UQ),
++ _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS),
++ _CMP_NEQ_UQ);
++ condition3 = _mm256_cmp_ps(
++ _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, twos), fours)),
++ fzeroes,
++ _CMP_NEQ_UQ);
++
++ __m256 temp = cosine;
++ cosine =
++ _mm256_add_ps(cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1));
++ sine = _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(temp, sine), condition1));
++ sine = _mm256_sub_ps(
++ sine, _mm256_and_ps(_mm256_mul_ps(sine, _mm256_set1_ps(2.0f)), condition2));
++ cosine = _mm256_sub_ps(
++ cosine,
++ _mm256_and_ps(_mm256_mul_ps(cosine, _mm256_set1_ps(2.0f)), condition3));
++ tangent = _mm256_div_ps(sine, cosine);
++ _mm256_storeu_ps(bPtr, tangent);
++ aPtr += 8;
++ bPtr += 8;
++ }
++
++ number = eighthPoints * 8;
++ for (; number < num_points; number++) {
++ *bPtr++ = tan(*aPtr++);
+ }
+- s = _mm256_div_ps(s, ftwos);
+-
+- sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
+- cosine = _mm256_sub_ps(fones, s);
+-
+- condition1 = _mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)), fzeroes, _CMP_NEQ_UQ);
+- condition2 = _mm256_cmp_ps(_mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes, _CMP_NEQ_UQ), _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS), _CMP_NEQ_UQ);
+- condition3 = _mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, twos), fours)), fzeroes, _CMP_NEQ_UQ);
+-
+- __m256 temp = cosine;
+- cosine = _mm256_add_ps(cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1));
+- sine = _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(temp, sine), condition1));
+- sine = _mm256_sub_ps(sine, _mm256_and_ps(_mm256_mul_ps(sine, _mm256_set1_ps(2.0f)), condition2));
+- cosine = _mm256_sub_ps(cosine, _mm256_and_ps(_mm256_mul_ps(cosine, _mm256_set1_ps(2.0f)), condition3));
+- tangent = _mm256_div_ps(sine, cosine);
+- _mm256_storeu_ps(bPtr, tangent);
+- aPtr += 8;
+- bPtr += 8;
+- }
+-
+- number = eighthPoints * 8;
+- for(;number < num_points; number++){
+- *bPtr++ = tan(*aPtr++);
+- }
+ }
+
+ #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for unaligned */
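The _fma variants differ from the plain AVX2 kernels only in collapsing each multiply-add or multiply-subtract pair of the reduction and the Horner evaluation into one fused instruction, relying on _mm256_fmadd_ps(a, b, c) = a*b + c, _mm256_fmsub_ps(a, b, c) = a*b - c and _mm256_fnmadd_ps(a, b, c) = c - a*b. A small sketch showing the fused and unfused forms of the reduction step computing the same value (up to one rounding):

#include <immintrin.h>

static inline __m256 fnmadd_sketch(__m256 s, __m256 r, __m256 pio4A)
{
    __m256 fused = _mm256_fnmadd_ps(r, pio4A, s);                 /* s - r*pio4A, one instruction */
    __m256 separate = _mm256_sub_ps(s, _mm256_mul_ps(r, pio4A));  /* same result, two instructions */
    (void)separate;
    return fused;
}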
+@@ -408,78 +506,109 @@ volk_32f_tan_32f_u_avx2_fma(float* bVector, const float* aVector,
+ #include <immintrin.h>
+
+ static inline void
+-volk_32f_tan_32f_u_avx2(float* bVector, const float* aVector,
+- unsigned int num_points)
++volk_32f_tan_32f_u_avx2(float* bVector, const float* aVector, unsigned int num_points)
+ {
+- float* bPtr = bVector;
+- const float* aPtr = aVector;
+-
+- unsigned int number = 0;
+- unsigned int eighthPoints = num_points / 8;
+- unsigned int i = 0;
+-
+- __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, fzeroes;
+- __m256 sine, cosine, tangent, condition1, condition2, condition3;
+- __m256i q, r, ones, twos, fours;
+-
+- m4pi = _mm256_set1_ps(1.273239545);
+- pio4A = _mm256_set1_ps(0.78515625);
+- pio4B = _mm256_set1_ps(0.241876e-3);
+- ffours = _mm256_set1_ps(4.0);
+- ftwos = _mm256_set1_ps(2.0);
+- fones = _mm256_set1_ps(1.0);
+- fzeroes = _mm256_setzero_ps();
+- ones = _mm256_set1_epi32(1);
+- twos = _mm256_set1_epi32(2);
+- fours = _mm256_set1_epi32(4);
+-
+- cp1 = _mm256_set1_ps(1.0);
+- cp2 = _mm256_set1_ps(0.83333333e-1);
+- cp3 = _mm256_set1_ps(0.2777778e-2);
+- cp4 = _mm256_set1_ps(0.49603e-4);
+- cp5 = _mm256_set1_ps(0.551e-6);
+-
+- for(;number < eighthPoints; number++){
+- aVal = _mm256_loadu_ps(aPtr);
+- s = _mm256_sub_ps(aVal, _mm256_and_ps(_mm256_mul_ps(aVal, ftwos), _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
+- q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
+- r = _mm256_add_epi32(q, _mm256_and_si256(q, ones));
+-
+- s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4A));
+- s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4B));
+-
+- s = _mm256_div_ps(s, _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
+- s = _mm256_mul_ps(s, s);
+- // Evaluate Taylor series
+- s = _mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(s, cp5), cp4), s), cp3), s), cp2), s), cp1), s);
+-
+- for(i = 0; i < 3; i++){
+- s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
++ float* bPtr = bVector;
++ const float* aPtr = aVector;
++
++ unsigned int number = 0;
++ unsigned int eighthPoints = num_points / 8;
++ unsigned int i = 0;
++
++ __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones,
++ fzeroes;
++ __m256 sine, cosine, tangent, condition1, condition2, condition3;
++ __m256i q, r, ones, twos, fours;
++
++ m4pi = _mm256_set1_ps(1.273239545);
++ pio4A = _mm256_set1_ps(0.78515625);
++ pio4B = _mm256_set1_ps(0.241876e-3);
++ ffours = _mm256_set1_ps(4.0);
++ ftwos = _mm256_set1_ps(2.0);
++ fones = _mm256_set1_ps(1.0);
++ fzeroes = _mm256_setzero_ps();
++ ones = _mm256_set1_epi32(1);
++ twos = _mm256_set1_epi32(2);
++ fours = _mm256_set1_epi32(4);
++
++ cp1 = _mm256_set1_ps(1.0);
++ cp2 = _mm256_set1_ps(0.83333333e-1);
++ cp3 = _mm256_set1_ps(0.2777778e-2);
++ cp4 = _mm256_set1_ps(0.49603e-4);
++ cp5 = _mm256_set1_ps(0.551e-6);
++
++ for (; number < eighthPoints; number++) {
++ aVal = _mm256_loadu_ps(aPtr);
++ s = _mm256_sub_ps(aVal,
++ _mm256_and_ps(_mm256_mul_ps(aVal, ftwos),
++ _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
++ q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
++ r = _mm256_add_epi32(q, _mm256_and_si256(q, ones));
++
++ s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4A));
++ s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4B));
++
++ s = _mm256_div_ps(
++ s,
++ _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
++ s = _mm256_mul_ps(s, s);
++ // Evaluate Taylor series
++ s = _mm256_mul_ps(
++ _mm256_add_ps(
++ _mm256_mul_ps(
++ _mm256_sub_ps(
++ _mm256_mul_ps(
++ _mm256_add_ps(
++ _mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(s, cp5), cp4),
++ s),
++ cp3),
++ s),
++ cp2),
++ s),
++ cp1),
++ s);
++
++ for (i = 0; i < 3; i++) {
++ s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
++ }
++ s = _mm256_div_ps(s, ftwos);
++
++ sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
++ cosine = _mm256_sub_ps(fones, s);
++
++ condition1 = _mm256_cmp_ps(
++ _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)),
++ fzeroes,
++ _CMP_NEQ_UQ);
++ condition2 = _mm256_cmp_ps(
++ _mm256_cmp_ps(
++ _mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes, _CMP_NEQ_UQ),
++ _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS),
++ _CMP_NEQ_UQ);
++ condition3 = _mm256_cmp_ps(
++ _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, twos), fours)),
++ fzeroes,
++ _CMP_NEQ_UQ);
++
++ __m256 temp = cosine;
++ cosine =
++ _mm256_add_ps(cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1));
++ sine = _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(temp, sine), condition1));
++ sine = _mm256_sub_ps(
++ sine, _mm256_and_ps(_mm256_mul_ps(sine, _mm256_set1_ps(2.0f)), condition2));
++ cosine = _mm256_sub_ps(
++ cosine,
++ _mm256_and_ps(_mm256_mul_ps(cosine, _mm256_set1_ps(2.0f)), condition3));
++ tangent = _mm256_div_ps(sine, cosine);
++ _mm256_storeu_ps(bPtr, tangent);
++ aPtr += 8;
++ bPtr += 8;
++ }
++
++ number = eighthPoints * 8;
++ for (; number < num_points; number++) {
++ *bPtr++ = tan(*aPtr++);
+ }
+- s = _mm256_div_ps(s, ftwos);
+-
+- sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
+- cosine = _mm256_sub_ps(fones, s);
+-
+- condition1 = _mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)), fzeroes, _CMP_NEQ_UQ);
+- condition2 = _mm256_cmp_ps(_mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes, _CMP_NEQ_UQ), _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS), _CMP_NEQ_UQ);
+- condition3 = _mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, twos), fours)), fzeroes, _CMP_NEQ_UQ);
+-
+- __m256 temp = cosine;
+- cosine = _mm256_add_ps(cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1));
+- sine = _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(temp, sine), condition1));
+- sine = _mm256_sub_ps(sine, _mm256_and_ps(_mm256_mul_ps(sine, _mm256_set1_ps(2.0f)), condition2));
+- cosine = _mm256_sub_ps(cosine, _mm256_and_ps(_mm256_mul_ps(cosine, _mm256_set1_ps(2.0f)), condition3));
+- tangent = _mm256_div_ps(sine, cosine);
+- _mm256_storeu_ps(bPtr, tangent);
+- aPtr += 8;
+- bPtr += 8;
+- }
+-
+- number = eighthPoints * 8;
+- for(;number < num_points; number++){
+- *bPtr++ = tan(*aPtr++);
+- }
+ }
+
+ #endif /* LV_HAVE_AVX2 for unaligned */
+@@ -491,75 +620,95 @@ volk_32f_tan_32f_u_avx2(float* bVector, const float* aVector,
+ static inline void
+ volk_32f_tan_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
+ {
+- float* bPtr = bVector;
+- const float* aPtr = aVector;
+-
+- unsigned int number = 0;
+- unsigned int quarterPoints = num_points / 4;
+- unsigned int i = 0;
+-
+- __m128 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, fzeroes;
+- __m128 sine, cosine, tangent, condition1, condition2, condition3;
+- __m128i q, r, ones, twos, fours;
+-
+- m4pi = _mm_set1_ps(1.273239545);
+- pio4A = _mm_set1_ps(0.78515625);
+- pio4B = _mm_set1_ps(0.241876e-3);
+- ffours = _mm_set1_ps(4.0);
+- ftwos = _mm_set1_ps(2.0);
+- fones = _mm_set1_ps(1.0);
+- fzeroes = _mm_setzero_ps();
+- ones = _mm_set1_epi32(1);
+- twos = _mm_set1_epi32(2);
+- fours = _mm_set1_epi32(4);
+-
+- cp1 = _mm_set1_ps(1.0);
+- cp2 = _mm_set1_ps(0.83333333e-1);
+- cp3 = _mm_set1_ps(0.2777778e-2);
+- cp4 = _mm_set1_ps(0.49603e-4);
+- cp5 = _mm_set1_ps(0.551e-6);
+-
+- for(;number < quarterPoints; number++){
+- aVal = _mm_loadu_ps(aPtr);
+- s = _mm_sub_ps(aVal, _mm_and_ps(_mm_mul_ps(aVal, ftwos), _mm_cmplt_ps(aVal, fzeroes)));
+- q = _mm_cvtps_epi32(_mm_floor_ps(_mm_mul_ps(s, m4pi)));
+- r = _mm_add_epi32(q, _mm_and_si128(q, ones));
+-
+- s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4A));
+- s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4B));
+-
+- s = _mm_div_ps(s, _mm_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
+- s = _mm_mul_ps(s, s);
+- // Evaluate Taylor series
+- s = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s, cp5), cp4), s), cp3), s), cp2), s), cp1), s);
+-
+- for(i = 0; i < 3; i++){
+- s = _mm_mul_ps(s, _mm_sub_ps(ffours, s));
++ float* bPtr = bVector;
++ const float* aPtr = aVector;
++
++ unsigned int number = 0;
++ unsigned int quarterPoints = num_points / 4;
++ unsigned int i = 0;
++
++ __m128 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones,
++ fzeroes;
++ __m128 sine, cosine, tangent, condition1, condition2, condition3;
++ __m128i q, r, ones, twos, fours;
++
++ m4pi = _mm_set1_ps(1.273239545);
++ pio4A = _mm_set1_ps(0.78515625);
++ pio4B = _mm_set1_ps(0.241876e-3);
++ ffours = _mm_set1_ps(4.0);
++ ftwos = _mm_set1_ps(2.0);
++ fones = _mm_set1_ps(1.0);
++ fzeroes = _mm_setzero_ps();
++ ones = _mm_set1_epi32(1);
++ twos = _mm_set1_epi32(2);
++ fours = _mm_set1_epi32(4);
++
++ cp1 = _mm_set1_ps(1.0);
++ cp2 = _mm_set1_ps(0.83333333e-1);
++ cp3 = _mm_set1_ps(0.2777778e-2);
++ cp4 = _mm_set1_ps(0.49603e-4);
++ cp5 = _mm_set1_ps(0.551e-6);
++
++ for (; number < quarterPoints; number++) {
++ aVal = _mm_loadu_ps(aPtr);
++ s = _mm_sub_ps(aVal,
++ _mm_and_ps(_mm_mul_ps(aVal, ftwos), _mm_cmplt_ps(aVal, fzeroes)));
++ q = _mm_cvtps_epi32(_mm_floor_ps(_mm_mul_ps(s, m4pi)));
++ r = _mm_add_epi32(q, _mm_and_si128(q, ones));
++
++ s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4A));
++ s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4B));
++
++ s = _mm_div_ps(
++ s, _mm_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
++ s = _mm_mul_ps(s, s);
++ // Evaluate Taylor series
++ s = _mm_mul_ps(
++ _mm_add_ps(
++ _mm_mul_ps(
++ _mm_sub_ps(
++ _mm_mul_ps(
++ _mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s, cp5), cp4), s),
++ cp3),
++ s),
++ cp2),
++ s),
++ cp1),
++ s);
++
++ for (i = 0; i < 3; i++) {
++ s = _mm_mul_ps(s, _mm_sub_ps(ffours, s));
++ }
++ s = _mm_div_ps(s, ftwos);
++
++ sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s));
++ cosine = _mm_sub_ps(fones, s);
++
++ condition1 = _mm_cmpneq_ps(
++ _mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, ones), twos)), fzeroes);
++ condition2 = _mm_cmpneq_ps(
++ _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(q, fours)), fzeroes),
++ _mm_cmplt_ps(aVal, fzeroes));
++ condition3 = _mm_cmpneq_ps(
++ _mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, twos), fours)), fzeroes);
++
++ __m128 temp = cosine;
++ cosine = _mm_add_ps(cosine, _mm_and_ps(_mm_sub_ps(sine, cosine), condition1));
++ sine = _mm_add_ps(sine, _mm_and_ps(_mm_sub_ps(temp, sine), condition1));
++ sine =
++ _mm_sub_ps(sine, _mm_and_ps(_mm_mul_ps(sine, _mm_set1_ps(2.0f)), condition2));
++ cosine = _mm_sub_ps(
++ cosine, _mm_and_ps(_mm_mul_ps(cosine, _mm_set1_ps(2.0f)), condition3));
++ tangent = _mm_div_ps(sine, cosine);
++ _mm_storeu_ps(bPtr, tangent);
++ aPtr += 4;
++ bPtr += 4;
++ }
++
++ number = quarterPoints * 4;
++ for (; number < num_points; number++) {
++ *bPtr++ = tanf(*aPtr++);
+ }
+- s = _mm_div_ps(s, ftwos);
+-
+- sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s));
+- cosine = _mm_sub_ps(fones, s);
+-
+- condition1 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, ones), twos)), fzeroes);
+- condition2 = _mm_cmpneq_ps(_mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(q, fours)), fzeroes), _mm_cmplt_ps(aVal, fzeroes));
+- condition3 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, twos), fours)), fzeroes);
+-
+- __m128 temp = cosine;
+- cosine = _mm_add_ps(cosine, _mm_and_ps(_mm_sub_ps(sine, cosine), condition1));
+- sine = _mm_add_ps(sine, _mm_and_ps(_mm_sub_ps(temp, sine), condition1));
+- sine = _mm_sub_ps(sine, _mm_and_ps(_mm_mul_ps(sine, _mm_set1_ps(2.0f)), condition2));
+- cosine = _mm_sub_ps(cosine, _mm_and_ps(_mm_mul_ps(cosine, _mm_set1_ps(2.0f)), condition3));
+- tangent = _mm_div_ps(sine, cosine);
+- _mm_storeu_ps(bPtr, tangent);
+- aPtr += 4;
+- bPtr += 4;
+- }
+-
+- number = quarterPoints * 4;
+- for(;number < num_points; number++){
+- *bPtr++ = tanf(*aPtr++);
+- }
+ }
+
+ #endif /* LV_HAVE_SSE4_1 for unaligned */
+@@ -568,16 +717,15 @@ volk_32f_tan_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int num
+ #ifdef LV_HAVE_GENERIC
+
+ static inline void
+-volk_32f_tan_32f_generic(float* bVector, const float* aVector,
+- unsigned int num_points)
++volk_32f_tan_32f_generic(float* bVector, const float* aVector, unsigned int num_points)
+ {
+- float* bPtr = bVector;
+- const float* aPtr = aVector;
+- unsigned int number = 0;
++ float* bPtr = bVector;
++ const float* aPtr = aVector;
++ unsigned int number = 0;
+
+- for(; number < num_points; number++){
+- *bPtr++ = tanf(*aPtr++);
+- }
++ for (; number < num_points; number++) {
++ *bPtr++ = tanf(*aPtr++);
++ }
+ }
+ #endif /* LV_HAVE_GENERIC */
+
+@@ -587,30 +735,29 @@ volk_32f_tan_32f_generic(float* bVector, const float* aVector,
+ #include <volk/volk_neon_intrinsics.h>
+
+ static inline void
+-volk_32f_tan_32f_neon(float* bVector, const float* aVector,
+- unsigned int num_points)
++volk_32f_tan_32f_neon(float* bVector, const float* aVector, unsigned int num_points)
+ {
+ unsigned int number = 0;
+ unsigned int quarter_points = num_points / 4;
+ float* bVectorPtr = bVector;
+ const float* aVectorPtr = aVector;
+-
++
+ float32x4_t b_vec;
+ float32x4_t a_vec;
+-
+- for(number = 0; number < quarter_points; number++) {
++
++ for (number = 0; number < quarter_points; number++) {
+ a_vec = vld1q_f32(aVectorPtr);
+ // Prefetch next one, speeds things up
+- __VOLK_PREFETCH(aVectorPtr+4);
++ __VOLK_PREFETCH(aVectorPtr + 4);
+ b_vec = _vtanq_f32(a_vec);
+ vst1q_f32(bVectorPtr, b_vec);
+ // move pointers ahead
+- bVectorPtr+=4;
+- aVectorPtr+=4;
++ bVectorPtr += 4;
++ aVectorPtr += 4;
+ }
+-
++
+ // Deal with the rest
+- for(number = quarter_points * 4; number < num_points; number++) {
++ for (number = quarter_points * 4; number < num_points; number++) {
+ *bVectorPtr++ = tanf(*aVectorPtr++);
+ }
+ }
+diff --git a/kernels/volk/volk_32f_tanh_32f.h b/kernels/volk/volk_32f_tanh_32f.h
+index d49432d..f157d39 100644
+--- a/kernels/volk/volk_32f_tanh_32f.h
++++ b/kernels/volk/volk_32f_tanh_32f.h
+@@ -69,22 +69,21 @@
+ #define INCLUDED_volk_32f_tanh_32f_a_H
+
+ #include <inttypes.h>
+-#include <stdio.h>
+ #include <math.h>
++#include <stdio.h>
+ #include <string.h>
+
+ #ifdef LV_HAVE_GENERIC
+
+ static inline void
+-volk_32f_tanh_32f_generic(float* cVector, const float* aVector,
+- unsigned int num_points)
++volk_32f_tanh_32f_generic(float* cVector, const float* aVector, unsigned int num_points)
+ {
+- unsigned int number = 0;
+- float* cPtr = cVector;
+- const float* aPtr = aVector;
+- for(; number < num_points; number++) {
+- *cPtr++ = tanhf(*aPtr++);
+- }
++ unsigned int number = 0;
++ float* cPtr = cVector;
++ const float* aPtr = aVector;
++ for (; number < num_points; number++) {
++ *cPtr++ = tanhf(*aPtr++);
++ }
+ }
+
+ #endif /* LV_HAVE_GENERIC */
+@@ -93,81 +92,88 @@ volk_32f_tanh_32f_generic(float* cVector, const float* aVector,
+ #ifdef LV_HAVE_GENERIC
+
+ static inline void
+-volk_32f_tanh_32f_series(float* cVector, const float* aVector,
+- unsigned int num_points)
++volk_32f_tanh_32f_series(float* cVector, const float* aVector, unsigned int num_points)
+ {
+- unsigned int number = 0;
+- float* cPtr = cVector;
+- const float* aPtr = aVector;
+- for(; number < num_points; number++) {
+- if(*aPtr > 4.97)
+- *cPtr++ = 1;
+- else if(*aPtr <= -4.97)
+- *cPtr++ = -1;
+- else {
+- float x2 = (*aPtr) * (*aPtr);
+- float a = (*aPtr) * (135135.0f + x2 * (17325.0f + x2 * (378.0f + x2)));
+- float b = 135135.0f + x2 * (62370.0f + x2 * (3150.0f + x2 * 28.0f));
+- *cPtr++ = a / b;
+- aPtr++;
++ unsigned int number = 0;
++ float* cPtr = cVector;
++ const float* aPtr = aVector;
++ for (; number < num_points; number++) {
++ if (*aPtr > 4.97)
++ *cPtr++ = 1;
++ else if (*aPtr <= -4.97)
++ *cPtr++ = -1;
++ else {
++ float x2 = (*aPtr) * (*aPtr);
++ float a = (*aPtr) * (135135.0f + x2 * (17325.0f + x2 * (378.0f + x2)));
++ float b = 135135.0f + x2 * (62370.0f + x2 * (3150.0f + x2 * 28.0f));
++ *cPtr++ = a / b;
++ aPtr++;
++ }
+ }
+- }
+ }
+
+ #endif /* LV_HAVE_GENERIC */
+
+
+-
+ #ifdef LV_HAVE_SSE
+ #include <xmmintrin.h>
+
+ static inline void
+-volk_32f_tanh_32f_a_sse(float* cVector, const float* aVector,
+- unsigned int num_points)
++volk_32f_tanh_32f_a_sse(float* cVector, const float* aVector, unsigned int num_points)
+ {
+- unsigned int number = 0;
+- const unsigned int quarterPoints = num_points / 4;
+-
+- float* cPtr = cVector;
+- const float* aPtr = aVector;
+-
+- __m128 aVal, cVal, x2, a, b;
+- __m128 const1, const2, const3, const4, const5, const6;
+- const1 = _mm_set_ps1(135135.0f);
+- const2 = _mm_set_ps1(17325.0f);
+- const3 = _mm_set_ps1(378.0f);
+- const4 = _mm_set_ps1(62370.0f);
+- const5 = _mm_set_ps1(3150.0f);
+- const6 = _mm_set_ps1(28.0f);
+- for(;number < quarterPoints; number++){
+-
+- aVal = _mm_load_ps(aPtr);
+- x2 = _mm_mul_ps(aVal, aVal);
+- a = _mm_mul_ps(aVal, _mm_add_ps(const1, _mm_mul_ps(x2, _mm_add_ps(const2, _mm_mul_ps(x2, _mm_add_ps(const3, x2))))));
+- b = _mm_add_ps(const1, _mm_mul_ps(x2, _mm_add_ps(const4, _mm_mul_ps(x2, _mm_add_ps(const5, _mm_mul_ps(x2, const6))))));
+-
+- cVal = _mm_div_ps(a, b);
+-
+- _mm_store_ps(cPtr, cVal); // Store the results back into the C container
+-
+- aPtr += 4;
+- cPtr += 4;
+- }
+-
+- number = quarterPoints * 4;
+- for(;number < num_points; number++) {
+- if(*aPtr > 4.97)
+- *cPtr++ = 1;
+- else if(*aPtr <= -4.97)
+- *cPtr++ = -1;
+- else {
+- float x2 = (*aPtr) * (*aPtr);
+- float a = (*aPtr) * (135135.0f + x2 * (17325.0f + x2 * (378.0f + x2)));
+- float b = 135135.0f + x2 * (62370.0f + x2 * (3150.0f + x2 * 28.0f));
+- *cPtr++ = a / b;
+- aPtr++;
++ unsigned int number = 0;
++ const unsigned int quarterPoints = num_points / 4;
++
++ float* cPtr = cVector;
++ const float* aPtr = aVector;
++
++ __m128 aVal, cVal, x2, a, b;
++ __m128 const1, const2, const3, const4, const5, const6;
++ const1 = _mm_set_ps1(135135.0f);
++ const2 = _mm_set_ps1(17325.0f);
++ const3 = _mm_set_ps1(378.0f);
++ const4 = _mm_set_ps1(62370.0f);
++ const5 = _mm_set_ps1(3150.0f);
++ const6 = _mm_set_ps1(28.0f);
++ for (; number < quarterPoints; number++) {
++
++ aVal = _mm_load_ps(aPtr);
++ x2 = _mm_mul_ps(aVal, aVal);
++ a = _mm_mul_ps(
++ aVal,
++ _mm_add_ps(
++ const1,
++ _mm_mul_ps(x2,
++ _mm_add_ps(const2, _mm_mul_ps(x2, _mm_add_ps(const3, x2))))));
++ b = _mm_add_ps(
++ const1,
++ _mm_mul_ps(
++ x2,
++ _mm_add_ps(const4,
++ _mm_mul_ps(x2, _mm_add_ps(const5, _mm_mul_ps(x2, const6))))));
++
++ cVal = _mm_div_ps(a, b);
++
++ _mm_store_ps(cPtr, cVal); // Store the results back into the C container
++
++ aPtr += 4;
++ cPtr += 4;
++ }
++
++ number = quarterPoints * 4;
++ for (; number < num_points; number++) {
++ if (*aPtr > 4.97)
++ *cPtr++ = 1;
++ else if (*aPtr <= -4.97)
++ *cPtr++ = -1;
++ else {
++ float x2 = (*aPtr) * (*aPtr);
++ float a = (*aPtr) * (135135.0f + x2 * (17325.0f + x2 * (378.0f + x2)));
++ float b = 135135.0f + x2 * (62370.0f + x2 * (3150.0f + x2 * 28.0f));
++ *cPtr++ = a / b;
++ aPtr++;
++ }
+ }
+- }
+ }
+ #endif /* LV_HAVE_SSE */
+
+@@ -176,52 +182,65 @@ volk_32f_tanh_32f_a_sse(float* cVector, const float* aVector,
+ #include <immintrin.h>
+
+ static inline void
+-volk_32f_tanh_32f_a_avx(float* cVector, const float* aVector,
+- unsigned int num_points)
++volk_32f_tanh_32f_a_avx(float* cVector, const float* aVector, unsigned int num_points)
+ {
+- unsigned int number = 0;
+- const unsigned int eighthPoints = num_points / 8;
+-
+- float* cPtr = cVector;
+- const float* aPtr = aVector;
+-
+- __m256 aVal, cVal, x2, a, b;
+- __m256 const1, const2, const3, const4, const5, const6;
+- const1 = _mm256_set1_ps(135135.0f);
+- const2 = _mm256_set1_ps(17325.0f);
+- const3 = _mm256_set1_ps(378.0f);
+- const4 = _mm256_set1_ps(62370.0f);
+- const5 = _mm256_set1_ps(3150.0f);
+- const6 = _mm256_set1_ps(28.0f);
+- for(;number < eighthPoints; number++){
+-
+- aVal = _mm256_load_ps(aPtr);
+- x2 = _mm256_mul_ps(aVal, aVal);
+- a = _mm256_mul_ps(aVal, _mm256_add_ps(const1, _mm256_mul_ps(x2, _mm256_add_ps(const2, _mm256_mul_ps(x2, _mm256_add_ps(const3, x2))))));
+- b = _mm256_add_ps(const1, _mm256_mul_ps(x2, _mm256_add_ps(const4, _mm256_mul_ps(x2, _mm256_add_ps(const5, _mm256_mul_ps(x2, const6))))));
+-
+- cVal = _mm256_div_ps(a, b);
+-
+- _mm256_store_ps(cPtr, cVal); // Store the results back into the C container
+-
+- aPtr += 8;
+- cPtr += 8;
+- }
+-
+- number = eighthPoints * 8;
+- for(;number < num_points; number++) {
+- if(*aPtr > 4.97)
+- *cPtr++ = 1;
+- else if(*aPtr <= -4.97)
+- *cPtr++ = -1;
+- else {
+- float x2 = (*aPtr) * (*aPtr);
+- float a = (*aPtr) * (135135.0f + x2 * (17325.0f + x2 * (378.0f + x2)));
+- float b = 135135.0f + x2 * (62370.0f + x2 * (3150.0f + x2 * 28.0f));
+- *cPtr++ = a / b;
+- aPtr++;
++ unsigned int number = 0;
++ const unsigned int eighthPoints = num_points / 8;
++
++ float* cPtr = cVector;
++ const float* aPtr = aVector;
++
++ __m256 aVal, cVal, x2, a, b;
++ __m256 const1, const2, const3, const4, const5, const6;
++ const1 = _mm256_set1_ps(135135.0f);
++ const2 = _mm256_set1_ps(17325.0f);
++ const3 = _mm256_set1_ps(378.0f);
++ const4 = _mm256_set1_ps(62370.0f);
++ const5 = _mm256_set1_ps(3150.0f);
++ const6 = _mm256_set1_ps(28.0f);
++ for (; number < eighthPoints; number++) {
++
++ aVal = _mm256_load_ps(aPtr);
++ x2 = _mm256_mul_ps(aVal, aVal);
++ a = _mm256_mul_ps(
++ aVal,
++ _mm256_add_ps(
++ const1,
++ _mm256_mul_ps(
++ x2,
++ _mm256_add_ps(const2,
++ _mm256_mul_ps(x2, _mm256_add_ps(const3, x2))))));
++ b = _mm256_add_ps(
++ const1,
++ _mm256_mul_ps(
++ x2,
++ _mm256_add_ps(
++ const4,
++ _mm256_mul_ps(x2,
++ _mm256_add_ps(const5, _mm256_mul_ps(x2, const6))))));
++
++ cVal = _mm256_div_ps(a, b);
++
++ _mm256_store_ps(cPtr, cVal); // Store the results back into the C container
++
++ aPtr += 8;
++ cPtr += 8;
++ }
++
++ number = eighthPoints * 8;
++ for (; number < num_points; number++) {
++ if (*aPtr > 4.97)
++ *cPtr++ = 1;
++ else if (*aPtr <= -4.97)
++ *cPtr++ = -1;
++ else {
++ float x2 = (*aPtr) * (*aPtr);
++ float a = (*aPtr) * (135135.0f + x2 * (17325.0f + x2 * (378.0f + x2)));
++ float b = 135135.0f + x2 * (62370.0f + x2 * (3150.0f + x2 * 28.0f));
++ *cPtr++ = a / b;
++ aPtr++;
++ }
+ }
+- }
+ }
+ #endif /* LV_HAVE_AVX */
+
+@@ -229,52 +248,55 @@ volk_32f_tanh_32f_a_avx(float* cVector, const float* aVector,
+ #include <immintrin.h>
+
+ static inline void
+-volk_32f_tanh_32f_a_avx_fma(float* cVector, const float* aVector,
+- unsigned int num_points)
++volk_32f_tanh_32f_a_avx_fma(float* cVector, const float* aVector, unsigned int num_points)
+ {
+- unsigned int number = 0;
+- const unsigned int eighthPoints = num_points / 8;
+-
+- float* cPtr = cVector;
+- const float* aPtr = aVector;
+-
+- __m256 aVal, cVal, x2, a, b;
+- __m256 const1, const2, const3, const4, const5, const6;
+- const1 = _mm256_set1_ps(135135.0f);
+- const2 = _mm256_set1_ps(17325.0f);
+- const3 = _mm256_set1_ps(378.0f);
+- const4 = _mm256_set1_ps(62370.0f);
+- const5 = _mm256_set1_ps(3150.0f);
+- const6 = _mm256_set1_ps(28.0f);
+- for(;number < eighthPoints; number++){
+-
+- aVal = _mm256_load_ps(aPtr);
+- x2 = _mm256_mul_ps(aVal, aVal);
+- a = _mm256_mul_ps(aVal, _mm256_fmadd_ps(x2, _mm256_fmadd_ps(x2, _mm256_add_ps(const3, x2), const2),const1));
+- b = _mm256_fmadd_ps(x2, _mm256_fmadd_ps(x2, _mm256_fmadd_ps(x2, const6, const5), const4), const1);
+-
+- cVal = _mm256_div_ps(a, b);
+-
+- _mm256_store_ps(cPtr, cVal); // Store the results back into the C container
+-
+- aPtr += 8;
+- cPtr += 8;
+- }
+-
+- number = eighthPoints * 8;
+- for(;number < num_points; number++) {
+- if(*aPtr > 4.97)
+- *cPtr++ = 1;
+- else if(*aPtr <= -4.97)
+- *cPtr++ = -1;
+- else {
+- float x2 = (*aPtr) * (*aPtr);
+- float a = (*aPtr) * (135135.0f + x2 * (17325.0f + x2 * (378.0f + x2)));
+- float b = 135135.0f + x2 * (62370.0f + x2 * (3150.0f + x2 * 28.0f));
+- *cPtr++ = a / b;
+- aPtr++;
++ unsigned int number = 0;
++ const unsigned int eighthPoints = num_points / 8;
++
++ float* cPtr = cVector;
++ const float* aPtr = aVector;
++
++ __m256 aVal, cVal, x2, a, b;
++ __m256 const1, const2, const3, const4, const5, const6;
++ const1 = _mm256_set1_ps(135135.0f);
++ const2 = _mm256_set1_ps(17325.0f);
++ const3 = _mm256_set1_ps(378.0f);
++ const4 = _mm256_set1_ps(62370.0f);
++ const5 = _mm256_set1_ps(3150.0f);
++ const6 = _mm256_set1_ps(28.0f);
++ for (; number < eighthPoints; number++) {
++
++ aVal = _mm256_load_ps(aPtr);
++ x2 = _mm256_mul_ps(aVal, aVal);
++ a = _mm256_mul_ps(
++ aVal,
++ _mm256_fmadd_ps(
++ x2, _mm256_fmadd_ps(x2, _mm256_add_ps(const3, x2), const2), const1));
++ b = _mm256_fmadd_ps(
++ x2, _mm256_fmadd_ps(x2, _mm256_fmadd_ps(x2, const6, const5), const4), const1);
++
++ cVal = _mm256_div_ps(a, b);
++
++ _mm256_store_ps(cPtr, cVal); // Store the results back into the C container
++
++ aPtr += 8;
++ cPtr += 8;
++ }
++
++ number = eighthPoints * 8;
++ for (; number < num_points; number++) {
++ if (*aPtr > 4.97)
++ *cPtr++ = 1;
++ else if (*aPtr <= -4.97)
++ *cPtr++ = -1;
++ else {
++ float x2 = (*aPtr) * (*aPtr);
++ float a = (*aPtr) * (135135.0f + x2 * (17325.0f + x2 * (378.0f + x2)));
++ float b = 135135.0f + x2 * (62370.0f + x2 * (3150.0f + x2 * 28.0f));
++ *cPtr++ = a / b;
++ aPtr++;
++ }
+ }
+- }
+ }
+ #endif /* LV_HAVE_AVX && LV_HAVE_FMA */
+
+@@ -285,8 +307,8 @@ volk_32f_tanh_32f_a_avx_fma(float* cVector, const float* aVector,
+ #define INCLUDED_volk_32f_tanh_32f_u_H
+
+ #include <inttypes.h>
+-#include <stdio.h>
+ #include <math.h>
++#include <stdio.h>
+ #include <string.h>
+
+
+@@ -294,52 +316,61 @@ volk_32f_tanh_32f_a_avx_fma(float* cVector, const float* aVector,
+ #include <xmmintrin.h>
+
+ static inline void
+-volk_32f_tanh_32f_u_sse(float* cVector, const float* aVector,
+- unsigned int num_points)
++volk_32f_tanh_32f_u_sse(float* cVector, const float* aVector, unsigned int num_points)
+ {
+- unsigned int number = 0;
+- const unsigned int quarterPoints = num_points / 4;
+-
+- float* cPtr = cVector;
+- const float* aPtr = aVector;
+-
+- __m128 aVal, cVal, x2, a, b;
+- __m128 const1, const2, const3, const4, const5, const6;
+- const1 = _mm_set_ps1(135135.0f);
+- const2 = _mm_set_ps1(17325.0f);
+- const3 = _mm_set_ps1(378.0f);
+- const4 = _mm_set_ps1(62370.0f);
+- const5 = _mm_set_ps1(3150.0f);
+- const6 = _mm_set_ps1(28.0f);
+- for(;number < quarterPoints; number++){
+-
+- aVal = _mm_loadu_ps(aPtr);
+- x2 = _mm_mul_ps(aVal, aVal);
+- a = _mm_mul_ps(aVal, _mm_add_ps(const1, _mm_mul_ps(x2, _mm_add_ps(const2, _mm_mul_ps(x2, _mm_add_ps(const3, x2))))));
+- b = _mm_add_ps(const1, _mm_mul_ps(x2, _mm_add_ps(const4, _mm_mul_ps(x2, _mm_add_ps(const5, _mm_mul_ps(x2, const6))))));
+-
+- cVal = _mm_div_ps(a, b);
+-
+- _mm_storeu_ps(cPtr, cVal); // Store the results back into the C container
+-
+- aPtr += 4;
+- cPtr += 4;
+- }
+-
+- number = quarterPoints * 4;
+- for(;number < num_points; number++) {
+- if(*aPtr > 4.97)
+- *cPtr++ = 1;
+- else if(*aPtr <= -4.97)
+- *cPtr++ = -1;
+- else {
+- float x2 = (*aPtr) * (*aPtr);
+- float a = (*aPtr) * (135135.0f + x2 * (17325.0f + x2 * (378.0f + x2)));
+- float b = 135135.0f + x2 * (62370.0f + x2 * (3150.0f + x2 * 28.0f));
+- *cPtr++ = a / b;
+- aPtr++;
++ unsigned int number = 0;
++ const unsigned int quarterPoints = num_points / 4;
++
++ float* cPtr = cVector;
++ const float* aPtr = aVector;
++
++ __m128 aVal, cVal, x2, a, b;
++ __m128 const1, const2, const3, const4, const5, const6;
++ const1 = _mm_set_ps1(135135.0f);
++ const2 = _mm_set_ps1(17325.0f);
++ const3 = _mm_set_ps1(378.0f);
++ const4 = _mm_set_ps1(62370.0f);
++ const5 = _mm_set_ps1(3150.0f);
++ const6 = _mm_set_ps1(28.0f);
++ for (; number < quarterPoints; number++) {
++
++ aVal = _mm_loadu_ps(aPtr);
++ x2 = _mm_mul_ps(aVal, aVal);
++ a = _mm_mul_ps(
++ aVal,
++ _mm_add_ps(
++ const1,
++ _mm_mul_ps(x2,
++ _mm_add_ps(const2, _mm_mul_ps(x2, _mm_add_ps(const3, x2))))));
++ b = _mm_add_ps(
++ const1,
++ _mm_mul_ps(
++ x2,
++ _mm_add_ps(const4,
++ _mm_mul_ps(x2, _mm_add_ps(const5, _mm_mul_ps(x2, const6))))));
++
++ cVal = _mm_div_ps(a, b);
++
++ _mm_storeu_ps(cPtr, cVal); // Store the results back into the C container
++
++ aPtr += 4;
++ cPtr += 4;
++ }
++
++ number = quarterPoints * 4;
++ for (; number < num_points; number++) {
++ if (*aPtr > 4.97)
++ *cPtr++ = 1;
++ else if (*aPtr <= -4.97)
++ *cPtr++ = -1;
++ else {
++ float x2 = (*aPtr) * (*aPtr);
++ float a = (*aPtr) * (135135.0f + x2 * (17325.0f + x2 * (378.0f + x2)));
++ float b = 135135.0f + x2 * (62370.0f + x2 * (3150.0f + x2 * 28.0f));
++ *cPtr++ = a / b;
++ aPtr++;
++ }
+ }
+- }
+ }
+ #endif /* LV_HAVE_SSE */
+
+@@ -348,52 +379,65 @@ volk_32f_tanh_32f_u_sse(float* cVector, const float* aVector,
+ #include <immintrin.h>
+
+ static inline void
+-volk_32f_tanh_32f_u_avx(float* cVector, const float* aVector,
+- unsigned int num_points)
++volk_32f_tanh_32f_u_avx(float* cVector, const float* aVector, unsigned int num_points)
+ {
+- unsigned int number = 0;
+- const unsigned int eighthPoints = num_points / 8;
+-
+- float* cPtr = cVector;
+- const float* aPtr = aVector;
+-
+- __m256 aVal, cVal, x2, a, b;
+- __m256 const1, const2, const3, const4, const5, const6;
+- const1 = _mm256_set1_ps(135135.0f);
+- const2 = _mm256_set1_ps(17325.0f);
+- const3 = _mm256_set1_ps(378.0f);
+- const4 = _mm256_set1_ps(62370.0f);
+- const5 = _mm256_set1_ps(3150.0f);
+- const6 = _mm256_set1_ps(28.0f);
+- for(;number < eighthPoints; number++){
+-
+- aVal = _mm256_loadu_ps(aPtr);
+- x2 = _mm256_mul_ps(aVal, aVal);
+- a = _mm256_mul_ps(aVal, _mm256_add_ps(const1, _mm256_mul_ps(x2, _mm256_add_ps(const2, _mm256_mul_ps(x2, _mm256_add_ps(const3, x2))))));
+- b = _mm256_add_ps(const1, _mm256_mul_ps(x2, _mm256_add_ps(const4, _mm256_mul_ps(x2, _mm256_add_ps(const5, _mm256_mul_ps(x2, const6))))));
+-
+- cVal = _mm256_div_ps(a, b);
+-
+- _mm256_storeu_ps(cPtr, cVal); // Store the results back into the C container
+-
+- aPtr += 8;
+- cPtr += 8;
+- }
+-
+- number = eighthPoints * 8;
+- for(;number < num_points; number++) {
+- if(*aPtr > 4.97)
+- *cPtr++ = 1;
+- else if(*aPtr <= -4.97)
+- *cPtr++ = -1;
+- else {
+- float x2 = (*aPtr) * (*aPtr);
+- float a = (*aPtr) * (135135.0f + x2 * (17325.0f + x2 * (378.0f + x2)));
+- float b = 135135.0f + x2 * (62370.0f + x2 * (3150.0f + x2 * 28.0f));
+- *cPtr++ = a / b;
+- aPtr++;
++ unsigned int number = 0;
++ const unsigned int eighthPoints = num_points / 8;
++
++ float* cPtr = cVector;
++ const float* aPtr = aVector;
++
++ __m256 aVal, cVal, x2, a, b;
++ __m256 const1, const2, const3, const4, const5, const6;
++ const1 = _mm256_set1_ps(135135.0f);
++ const2 = _mm256_set1_ps(17325.0f);
++ const3 = _mm256_set1_ps(378.0f);
++ const4 = _mm256_set1_ps(62370.0f);
++ const5 = _mm256_set1_ps(3150.0f);
++ const6 = _mm256_set1_ps(28.0f);
++ for (; number < eighthPoints; number++) {
++
++ aVal = _mm256_loadu_ps(aPtr);
++ x2 = _mm256_mul_ps(aVal, aVal);
++ a = _mm256_mul_ps(
++ aVal,
++ _mm256_add_ps(
++ const1,
++ _mm256_mul_ps(
++ x2,
++ _mm256_add_ps(const2,
++ _mm256_mul_ps(x2, _mm256_add_ps(const3, x2))))));
++ b = _mm256_add_ps(
++ const1,
++ _mm256_mul_ps(
++ x2,
++ _mm256_add_ps(
++ const4,
++ _mm256_mul_ps(x2,
++ _mm256_add_ps(const5, _mm256_mul_ps(x2, const6))))));
++
++ cVal = _mm256_div_ps(a, b);
++
++ _mm256_storeu_ps(cPtr, cVal); // Store the results back into the C container
++
++ aPtr += 8;
++ cPtr += 8;
++ }
++
++ number = eighthPoints * 8;
++ for (; number < num_points; number++) {
++ if (*aPtr > 4.97)
++ *cPtr++ = 1;
++ else if (*aPtr <= -4.97)
++ *cPtr++ = -1;
++ else {
++ float x2 = (*aPtr) * (*aPtr);
++ float a = (*aPtr) * (135135.0f + x2 * (17325.0f + x2 * (378.0f + x2)));
++ float b = 135135.0f + x2 * (62370.0f + x2 * (3150.0f + x2 * 28.0f));
++ *cPtr++ = a / b;
++ aPtr++;
++ }
+ }
+- }
+ }
+ #endif /* LV_HAVE_AVX */
+
+@@ -401,52 +445,55 @@ volk_32f_tanh_32f_u_avx(float* cVector, const float* aVector,
+ #include <immintrin.h>
+
+ static inline void
+-volk_32f_tanh_32f_u_avx_fma(float* cVector, const float* aVector,
+- unsigned int num_points)
++volk_32f_tanh_32f_u_avx_fma(float* cVector, const float* aVector, unsigned int num_points)
+ {
+- unsigned int number = 0;
+- const unsigned int eighthPoints = num_points / 8;
+-
+- float* cPtr = cVector;
+- const float* aPtr = aVector;
+-
+- __m256 aVal, cVal, x2, a, b;
+- __m256 const1, const2, const3, const4, const5, const6;
+- const1 = _mm256_set1_ps(135135.0f);
+- const2 = _mm256_set1_ps(17325.0f);
+- const3 = _mm256_set1_ps(378.0f);
+- const4 = _mm256_set1_ps(62370.0f);
+- const5 = _mm256_set1_ps(3150.0f);
+- const6 = _mm256_set1_ps(28.0f);
+- for(;number < eighthPoints; number++){
+-
+- aVal = _mm256_loadu_ps(aPtr);
+- x2 = _mm256_mul_ps(aVal, aVal);
+- a = _mm256_mul_ps(aVal, _mm256_fmadd_ps(x2, _mm256_fmadd_ps(x2, _mm256_add_ps(const3, x2), const2),const1));
+- b = _mm256_fmadd_ps(x2, _mm256_fmadd_ps(x2, _mm256_fmadd_ps(x2, const6, const5), const4), const1);
+-
+- cVal = _mm256_div_ps(a, b);
+-
+- _mm256_storeu_ps(cPtr, cVal); // Store the results back into the C container
+-
+- aPtr += 8;
+- cPtr += 8;
+- }
+-
+- number = eighthPoints * 8;
+- for(;number < num_points; number++) {
+- if(*aPtr > 4.97)
+- *cPtr++ = 1;
+- else if(*aPtr <= -4.97)
+- *cPtr++ = -1;
+- else {
+- float x2 = (*aPtr) * (*aPtr);
+- float a = (*aPtr) * (135135.0f + x2 * (17325.0f + x2 * (378.0f + x2)));
+- float b = 135135.0f + x2 * (62370.0f + x2 * (3150.0f + x2 * 28.0f));
+- *cPtr++ = a / b;
+- aPtr++;
++ unsigned int number = 0;
++ const unsigned int eighthPoints = num_points / 8;
++
++ float* cPtr = cVector;
++ const float* aPtr = aVector;
++
++ __m256 aVal, cVal, x2, a, b;
++ __m256 const1, const2, const3, const4, const5, const6;
++ const1 = _mm256_set1_ps(135135.0f);
++ const2 = _mm256_set1_ps(17325.0f);
++ const3 = _mm256_set1_ps(378.0f);
++ const4 = _mm256_set1_ps(62370.0f);
++ const5 = _mm256_set1_ps(3150.0f);
++ const6 = _mm256_set1_ps(28.0f);
++ for (; number < eighthPoints; number++) {
++
++ aVal = _mm256_loadu_ps(aPtr);
++ x2 = _mm256_mul_ps(aVal, aVal);
++ a = _mm256_mul_ps(
++ aVal,
++ _mm256_fmadd_ps(
++ x2, _mm256_fmadd_ps(x2, _mm256_add_ps(const3, x2), const2), const1));
++ b = _mm256_fmadd_ps(
++ x2, _mm256_fmadd_ps(x2, _mm256_fmadd_ps(x2, const6, const5), const4), const1);
++
++ cVal = _mm256_div_ps(a, b);
++
++ _mm256_storeu_ps(cPtr, cVal); // Store the results back into the C container
++
++ aPtr += 8;
++ cPtr += 8;
++ }
++
++ number = eighthPoints * 8;
++ for (; number < num_points; number++) {
++ if (*aPtr > 4.97)
++ *cPtr++ = 1;
++ else if (*aPtr <= -4.97)
++ *cPtr++ = -1;
++ else {
++ float x2 = (*aPtr) * (*aPtr);
++ float a = (*aPtr) * (135135.0f + x2 * (17325.0f + x2 * (378.0f + x2)));
++ float b = 135135.0f + x2 * (62370.0f + x2 * (3150.0f + x2 * 28.0f));
++ *cPtr++ = a / b;
++ aPtr++;
++ }
+ }
+- }
+ }
+ #endif /* LV_HAVE_AVX && LV_HAVE_FMA */
+
+diff --git a/kernels/volk/volk_32f_x2_add_32f.h b/kernels/volk/volk_32f_x2_add_32f.h
+index ce18092..e4b7e93 100644
+--- a/kernels/volk/volk_32f_x2_add_32f.h
++++ b/kernels/volk/volk_32f_x2_add_32f.h
+@@ -31,8 +31,8 @@
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+- * void volk_32f_x2_add_32f(float* cVector, const float* aVector, const float* bVector, unsigned int num_points)
+- * \endcode
++ * void volk_32f_x2_add_32f(float* cVector, const float* aVector, const float* bVector,
++ * unsigned int num_points) \endcode
+ *
+ * \b Inputs
+ * \li aVector: First vector of input points.
+@@ -44,7 +44,8 @@
+ *
+ * \b Example
+ *
+- * The follow example adds the increasing and decreasing vectors such that the result of every summation pair is 10
++ * The follow example adds the increasing and decreasing vectors such that the result of
++ * every summation pair is 10
+ *
+ * \code
+ * int N = 10;
+@@ -79,37 +80,38 @@
+ #ifdef LV_HAVE_AVX512F
+ #include <immintrin.h>
+
+-static inline void
+-volk_32f_x2_add_32f_u_avx512f(float* cVector, const float* aVector,
+- const float* bVector, unsigned int num_points)
++static inline void volk_32f_x2_add_32f_u_avx512f(float* cVector,
++ const float* aVector,
++ const float* bVector,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+- const unsigned int sixteenthPoints = num_points / 16;
++ unsigned int number = 0;
++ const unsigned int sixteenthPoints = num_points / 16;
+
+- float* cPtr = cVector;
+- const float* aPtr = aVector;
+- const float* bPtr= bVector;
++ float* cPtr = cVector;
++ const float* aPtr = aVector;
++ const float* bPtr = bVector;
+
+- __m512 aVal, bVal, cVal;
+- for(;number < sixteenthPoints; number++){
++ __m512 aVal, bVal, cVal;
++ for (; number < sixteenthPoints; number++) {
+
+- aVal = _mm512_loadu_ps(aPtr);
+- bVal = _mm512_loadu_ps(bPtr);
++ aVal = _mm512_loadu_ps(aPtr);
++ bVal = _mm512_loadu_ps(bPtr);
+
+- cVal = _mm512_add_ps(aVal, bVal);
++ cVal = _mm512_add_ps(aVal, bVal);
+
+- _mm512_storeu_ps(cPtr,cVal); // Store the results back into the C container
++ _mm512_storeu_ps(cPtr, cVal); // Store the results back into the C container
+
+- aPtr += 16;
+- bPtr += 16;
+- cPtr += 16;
+- }
++ aPtr += 16;
++ bPtr += 16;
++ cPtr += 16;
++ }
+
+- number = sixteenthPoints * 16;
++ number = sixteenthPoints * 16;
+
+- for(;number < num_points; number++){
+- *cPtr++ = (*aPtr++) + (*bPtr++);
+- }
++ for (; number < num_points; number++) {
++ *cPtr++ = (*aPtr++) + (*bPtr++);
++ }
+ }
+
+ #endif /* LV_HAVE_AVX512F */
+@@ -118,35 +120,36 @@ volk_32f_x2_add_32f_u_avx512f(float* cVector, const float* aVector,
+ #ifdef LV_HAVE_AVX
+ #include <immintrin.h>
+
+-static inline void
+-volk_32f_x2_add_32f_u_avx(float* cVector, const float* aVector,
+- const float* bVector, unsigned int num_points)
++static inline void volk_32f_x2_add_32f_u_avx(float* cVector,
++ const float* aVector,
++ const float* bVector,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+- const unsigned int eighthPoints = num_points / 8;
+- float* cPtr = cVector;
+- const float* aPtr = aVector;
+- const float* bPtr= bVector;
+- __m256 aVal, bVal, cVal;
+- for(;number < eighthPoints; number++){
++ unsigned int number = 0;
++ const unsigned int eighthPoints = num_points / 8;
++ float* cPtr = cVector;
++ const float* aPtr = aVector;
++ const float* bPtr = bVector;
++ __m256 aVal, bVal, cVal;
++ for (; number < eighthPoints; number++) {
+
+- aVal = _mm256_loadu_ps(aPtr);
+- bVal = _mm256_loadu_ps(bPtr);
++ aVal = _mm256_loadu_ps(aPtr);
++ bVal = _mm256_loadu_ps(bPtr);
+
+- cVal = _mm256_add_ps(aVal, bVal);
++ cVal = _mm256_add_ps(aVal, bVal);
+
+- _mm256_storeu_ps(cPtr,cVal); // Store the results back into the C container
++ _mm256_storeu_ps(cPtr, cVal); // Store the results back into the C container
+
+- aPtr += 8;
+- bPtr += 8;
+- cPtr += 8;
+- }
++ aPtr += 8;
++ bPtr += 8;
++ cPtr += 8;
++ }
+
+- number = eighthPoints * 8;
++ number = eighthPoints * 8;
+
+- for(;number < num_points; number++){
+- *cPtr++ = (*aPtr++) + (*bPtr++);
+- }
++ for (; number < num_points; number++) {
++ *cPtr++ = (*aPtr++) + (*bPtr++);
++ }
+ }
+ #endif /* LV_HAVE_AVX */
+
+@@ -154,54 +157,56 @@ volk_32f_x2_add_32f_u_avx(float* cVector, const float* aVector,
+ #ifdef LV_HAVE_SSE
+ #include <xmmintrin.h>
+
+-static inline void
+-volk_32f_x2_add_32f_u_sse(float* cVector, const float* aVector,
+- const float* bVector, unsigned int num_points)
++static inline void volk_32f_x2_add_32f_u_sse(float* cVector,
++ const float* aVector,
++ const float* bVector,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+- const unsigned int quarterPoints = num_points / 4;
++ unsigned int number = 0;
++ const unsigned int quarterPoints = num_points / 4;
+
+- float* cPtr = cVector;
+- const float* aPtr = aVector;
+- const float* bPtr= bVector;
++ float* cPtr = cVector;
++ const float* aPtr = aVector;
++ const float* bPtr = bVector;
+
+- __m128 aVal, bVal, cVal;
+- for(;number < quarterPoints; number++){
++ __m128 aVal, bVal, cVal;
++ for (; number < quarterPoints; number++) {
+
+- aVal = _mm_loadu_ps(aPtr);
+- bVal = _mm_loadu_ps(bPtr);
++ aVal = _mm_loadu_ps(aPtr);
++ bVal = _mm_loadu_ps(bPtr);
+
+- cVal = _mm_add_ps(aVal, bVal);
++ cVal = _mm_add_ps(aVal, bVal);
+
+- _mm_storeu_ps(cPtr,cVal); // Store the results back into the C container
++ _mm_storeu_ps(cPtr, cVal); // Store the results back into the C container
+
+- aPtr += 4;
+- bPtr += 4;
+- cPtr += 4;
+- }
++ aPtr += 4;
++ bPtr += 4;
++ cPtr += 4;
++ }
+
+- number = quarterPoints * 4;
+- for(;number < num_points; number++){
+- *cPtr++ = (*aPtr++) + (*bPtr++);
+- }
++ number = quarterPoints * 4;
++ for (; number < num_points; number++) {
++ *cPtr++ = (*aPtr++) + (*bPtr++);
++ }
+ }
+ #endif /* LV_HAVE_SSE */
+
+
+ #ifdef LV_HAVE_GENERIC
+
+-static inline void
+-volk_32f_x2_add_32f_generic(float* cVector, const float* aVector,
+- const float* bVector, unsigned int num_points)
++static inline void volk_32f_x2_add_32f_generic(float* cVector,
++ const float* aVector,
++ const float* bVector,
++ unsigned int num_points)
+ {
+- float* cPtr = cVector;
+- const float* aPtr = aVector;
+- const float* bPtr= bVector;
+- unsigned int number = 0;
+-
+- for(number = 0; number < num_points; number++){
+- *cPtr++ = (*aPtr++) + (*bPtr++);
+- }
++ float* cPtr = cVector;
++ const float* aPtr = aVector;
++ const float* bPtr = bVector;
++ unsigned int number = 0;
++
++ for (number = 0; number < num_points; number++) {
++ *cPtr++ = (*aPtr++) + (*bPtr++);
++ }
+ }
+ #endif /* LV_HAVE_GENERIC */
+
+@@ -216,37 +221,38 @@ volk_32f_x2_add_32f_generic(float* cVector, const float* aVector,
+ #ifdef LV_HAVE_AVX512F
+ #include <immintrin.h>
+
+-static inline void
+-volk_32f_x2_add_32f_a_avx512f(float* cVector, const float* aVector,
+- const float* bVector, unsigned int num_points)
++static inline void volk_32f_x2_add_32f_a_avx512f(float* cVector,
++ const float* aVector,
++ const float* bVector,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+- const unsigned int sixteenthPoints = num_points / 16;
++ unsigned int number = 0;
++ const unsigned int sixteenthPoints = num_points / 16;
+
+- float* cPtr = cVector;
+- const float* aPtr = aVector;
+- const float* bPtr= bVector;
++ float* cPtr = cVector;
++ const float* aPtr = aVector;
++ const float* bPtr = bVector;
+
+- __m512 aVal, bVal, cVal;
+- for(;number < sixteenthPoints; number++){
++ __m512 aVal, bVal, cVal;
++ for (; number < sixteenthPoints; number++) {
+
+- aVal = _mm512_load_ps(aPtr);
+- bVal = _mm512_load_ps(bPtr);
++ aVal = _mm512_load_ps(aPtr);
++ bVal = _mm512_load_ps(bPtr);
+
+- cVal = _mm512_add_ps(aVal, bVal);
++ cVal = _mm512_add_ps(aVal, bVal);
+
+- _mm512_store_ps(cPtr,cVal); // Store the results back into the C container
++ _mm512_store_ps(cPtr, cVal); // Store the results back into the C container
+
+- aPtr += 16;
+- bPtr += 16;
+- cPtr += 16;
+- }
++ aPtr += 16;
++ bPtr += 16;
++ cPtr += 16;
++ }
+
+- number = sixteenthPoints * 16;
++ number = sixteenthPoints * 16;
+
+- for(;number < num_points; number++){
+- *cPtr++ = (*aPtr++) + (*bPtr++);
+- }
++ for (; number < num_points; number++) {
++ *cPtr++ = (*aPtr++) + (*bPtr++);
++ }
+ }
+
+ #endif /* LV_HAVE_AVX512F */
+@@ -255,70 +261,73 @@ volk_32f_x2_add_32f_a_avx512f(float* cVector, const float* aVector,
+ #ifdef LV_HAVE_AVX
+ #include <immintrin.h>
+
+-static inline void
+-volk_32f_x2_add_32f_a_avx(float* cVector, const float* aVector,
+- const float* bVector, unsigned int num_points)
++static inline void volk_32f_x2_add_32f_a_avx(float* cVector,
++ const float* aVector,
++ const float* bVector,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+- const unsigned int eighthPoints = num_points / 8;
++ unsigned int number = 0;
++ const unsigned int eighthPoints = num_points / 8;
+
+- float* cPtr = cVector;
+- const float* aPtr = aVector;
+- const float* bPtr= bVector;
++ float* cPtr = cVector;
++ const float* aPtr = aVector;
++ const float* bPtr = bVector;
+
+- __m256 aVal, bVal, cVal;
+- for(;number < eighthPoints; number++){
++ __m256 aVal, bVal, cVal;
++ for (; number < eighthPoints; number++) {
+
+- aVal = _mm256_load_ps(aPtr);
+- bVal = _mm256_load_ps(bPtr);
++ aVal = _mm256_load_ps(aPtr);
++ bVal = _mm256_load_ps(bPtr);
+
+- cVal = _mm256_add_ps(aVal, bVal);
++ cVal = _mm256_add_ps(aVal, bVal);
+
+- _mm256_store_ps(cPtr,cVal); // Store the results back into the C container
++ _mm256_store_ps(cPtr, cVal); // Store the results back into the C container
+
+- aPtr += 8;
+- bPtr += 8;
+- cPtr += 8;
+- }
++ aPtr += 8;
++ bPtr += 8;
++ cPtr += 8;
++ }
+
+- number = eighthPoints * 8;
+- for(;number < num_points; number++){
+- *cPtr++ = (*aPtr++) + (*bPtr++);
+- }
++ number = eighthPoints * 8;
++ for (; number < num_points; number++) {
++ *cPtr++ = (*aPtr++) + (*bPtr++);
++ }
+ }
+ #endif /* LV_HAVE_AVX */
+
+ #ifdef LV_HAVE_SSE
+ #include <xmmintrin.h>
+
+-static inline void
+-volk_32f_x2_add_32f_a_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points)
++static inline void volk_32f_x2_add_32f_a_sse(float* cVector,
++ const float* aVector,
++ const float* bVector,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+- const unsigned int quarterPoints = num_points / 4;
++ unsigned int number = 0;
++ const unsigned int quarterPoints = num_points / 4;
+
+- float* cPtr = cVector;
+- const float* aPtr = aVector;
+- const float* bPtr= bVector;
++ float* cPtr = cVector;
++ const float* aPtr = aVector;
++ const float* bPtr = bVector;
+
+- __m128 aVal, bVal, cVal;
+- for(;number < quarterPoints; number++){
+- aVal = _mm_load_ps(aPtr);
+- bVal = _mm_load_ps(bPtr);
++ __m128 aVal, bVal, cVal;
++ for (; number < quarterPoints; number++) {
++ aVal = _mm_load_ps(aPtr);
++ bVal = _mm_load_ps(bPtr);
+
+- cVal = _mm_add_ps(aVal, bVal);
++ cVal = _mm_add_ps(aVal, bVal);
+
+- _mm_store_ps(cPtr,cVal); // Store the results back into the C container
++ _mm_store_ps(cPtr, cVal); // Store the results back into the C container
+
+- aPtr += 4;
+- bPtr += 4;
+- cPtr += 4;
+- }
++ aPtr += 4;
++ bPtr += 4;
++ cPtr += 4;
++ }
+
+- number = quarterPoints * 4;
+- for(;number < num_points; number++){
+- *cPtr++ = (*aPtr++) + (*bPtr++);
+- }
++ number = quarterPoints * 4;
++ for (; number < num_points; number++) {
++ *cPtr++ = (*aPtr++) + (*bPtr++);
++ }
+ }
+ #endif /* LV_HAVE_SSE */
+
+@@ -326,78 +335,89 @@ volk_32f_x2_add_32f_a_sse(float* cVector, const float* aVector, const float* bVe
+ #ifdef LV_HAVE_NEON
+ #include <arm_neon.h>
+
+-static inline void
+-volk_32f_x2_add_32f_u_neon(float* cVector, const float* aVector,
+- const float* bVector, unsigned int num_points)
++static inline void volk_32f_x2_add_32f_u_neon(float* cVector,
++ const float* aVector,
++ const float* bVector,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+- const unsigned int quarterPoints = num_points / 4;
+-
+- float* cPtr = cVector;
+- const float* aPtr = aVector;
+- const float* bPtr= bVector;
+- float32x4_t aVal, bVal, cVal;
+- for(number=0; number < quarterPoints; number++){
+- // Load in to NEON registers
+- aVal = vld1q_f32(aPtr);
+- bVal = vld1q_f32(bPtr);
+- __VOLK_PREFETCH(aPtr+4);
+- __VOLK_PREFETCH(bPtr+4);
+-
+- // vector add
+- cVal = vaddq_f32(aVal, bVal);
+- // Store the results back into the C container
+- vst1q_f32(cPtr,cVal);
+-
+- aPtr += 4; // q uses quadwords, 4 floats per vadd
+- bPtr += 4;
+- cPtr += 4;
+- }
+-
+- number = quarterPoints * 4; // should be = num_points
+- for(;number < num_points; number++){
+- *cPtr++ = (*aPtr++) + (*bPtr++);
+- }
++ unsigned int number = 0;
++ const unsigned int quarterPoints = num_points / 4;
++
++ float* cPtr = cVector;
++ const float* aPtr = aVector;
++ const float* bPtr = bVector;
++ float32x4_t aVal, bVal, cVal;
++ for (number = 0; number < quarterPoints; number++) {
++ // Load in to NEON registers
++ aVal = vld1q_f32(aPtr);
++ bVal = vld1q_f32(bPtr);
++ __VOLK_PREFETCH(aPtr + 4);
++ __VOLK_PREFETCH(bPtr + 4);
++
++ // vector add
++ cVal = vaddq_f32(aVal, bVal);
++ // Store the results back into the C container
++ vst1q_f32(cPtr, cVal);
++
++ aPtr += 4; // q uses quadwords, 4 floats per vadd
++ bPtr += 4;
++ cPtr += 4;
++ }
++
++ number = quarterPoints * 4; // should be = num_points
++ for (; number < num_points; number++) {
++ *cPtr++ = (*aPtr++) + (*bPtr++);
++ }
+ }
+
+ #endif /* LV_HAVE_NEON */
+
+ #ifdef LV_HAVE_NEONV7
+-extern void volk_32f_x2_add_32f_a_neonasm(float* cVector, const float* aVector, const float* bVector, unsigned int num_points);
++extern void volk_32f_x2_add_32f_a_neonasm(float* cVector,
++ const float* aVector,
++ const float* bVector,
++ unsigned int num_points);
+ #endif /* LV_HAVE_NEONV7 */
+
+ #ifdef LV_HAVE_NEONV7
+-extern void volk_32f_x2_add_32f_a_neonpipeline(float* cVector, const float* aVector, const float* bVector, unsigned int num_points);
++extern void volk_32f_x2_add_32f_a_neonpipeline(float* cVector,
++ const float* aVector,
++ const float* bVector,
++ unsigned int num_points);
+ #endif /* LV_HAVE_NEONV7 */
+
+ #ifdef LV_HAVE_GENERIC
+
+-static inline void
+-volk_32f_x2_add_32f_a_generic(float* cVector, const float* aVector,
+- const float* bVector, unsigned int num_points)
++static inline void volk_32f_x2_add_32f_a_generic(float* cVector,
++ const float* aVector,
++ const float* bVector,
++ unsigned int num_points)
+ {
+- float* cPtr = cVector;
+- const float* aPtr = aVector;
+- const float* bPtr= bVector;
+- unsigned int number = 0;
+-
+- for(number = 0; number < num_points; number++){
+- *cPtr++ = (*aPtr++) + (*bPtr++);
+- }
++ float* cPtr = cVector;
++ const float* aPtr = aVector;
++ const float* bPtr = bVector;
++ unsigned int number = 0;
++
++ for (number = 0; number < num_points; number++) {
++ *cPtr++ = (*aPtr++) + (*bPtr++);
++ }
+ }
+ #endif /* LV_HAVE_GENERIC */
+
+
+ #ifdef LV_HAVE_ORC
+
+-extern void
+-volk_32f_x2_add_32f_a_orc_impl(float* cVector, const float* aVector,
+- const float* bVector, unsigned int num_points);
++extern void volk_32f_x2_add_32f_a_orc_impl(float* cVector,
++ const float* aVector,
++ const float* bVector,
++ unsigned int num_points);
+
+-static inline void
+-volk_32f_x2_add_32f_u_orc(float* cVector, const float* aVector,
+- const float* bVector, unsigned int num_points){
+- volk_32f_x2_add_32f_a_orc_impl(cVector, aVector, bVector, num_points);
++static inline void volk_32f_x2_add_32f_u_orc(float* cVector,
++ const float* aVector,
++ const float* bVector,
++ unsigned int num_points)
++{
++ volk_32f_x2_add_32f_a_orc_impl(cVector, aVector, bVector, num_points);
+ }
+
+ #endif /* LV_HAVE_ORC */
+diff --git a/kernels/volk/volk_32f_x2_divide_32f.h b/kernels/volk/volk_32f_x2_divide_32f.h
+index 130767f..8b80365 100644
+--- a/kernels/volk/volk_32f_x2_divide_32f.h
++++ b/kernels/volk/volk_32f_x2_divide_32f.h
+@@ -31,8 +31,8 @@
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+- * void volk_32f_x2_divide_32f(float* cVector, const float* aVector, const float* bVector, unsigned int num_points)
+- * \endcode
++ * void volk_32f_x2_divide_32f(float* cVector, const float* aVector, const float* bVector,
++ * unsigned int num_points) \endcode
+ *
+ * \b Inputs
+ * \li aVector: First vector of input points.
+@@ -77,35 +77,36 @@
+ #ifdef LV_HAVE_AVX512F
+ #include <immintrin.h>
+
+-static inline void
+-volk_32f_x2_divide_32f_a_avx512f(float* cVector, const float* aVector,
+- const float* bVector, unsigned int num_points)
++static inline void volk_32f_x2_divide_32f_a_avx512f(float* cVector,
++ const float* aVector,
++ const float* bVector,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+- const unsigned int sixteenthPoints = num_points / 16;
++ unsigned int number = 0;
++ const unsigned int sixteenthPoints = num_points / 16;
+
+- float* cPtr = cVector;
+- const float* aPtr = aVector;
+- const float* bPtr= bVector;
++ float* cPtr = cVector;
++ const float* aPtr = aVector;
++ const float* bPtr = bVector;
+
+- __m512 aVal, bVal, cVal;
+- for(;number < sixteenthPoints; number++){
+- aVal = _mm512_load_ps(aPtr);
+- bVal = _mm512_load_ps(bPtr);
++ __m512 aVal, bVal, cVal;
++ for (; number < sixteenthPoints; number++) {
++ aVal = _mm512_load_ps(aPtr);
++ bVal = _mm512_load_ps(bPtr);
+
+- cVal = _mm512_div_ps(aVal, bVal);
++ cVal = _mm512_div_ps(aVal, bVal);
+
+- _mm512_store_ps(cPtr,cVal); // Store the results back into the C container
++ _mm512_store_ps(cPtr, cVal); // Store the results back into the C container
+
+- aPtr += 16;
+- bPtr += 16;
+- cPtr += 16;
+- }
++ aPtr += 16;
++ bPtr += 16;
++ cPtr += 16;
++ }
+
+- number = sixteenthPoints * 16;
+- for(;number < num_points; number++){
+- *cPtr++ = (*aPtr++) / (*bPtr++);
+- }
++ number = sixteenthPoints * 16;
++ for (; number < num_points; number++) {
++ *cPtr++ = (*aPtr++) / (*bPtr++);
++ }
+ }
+ #endif /* LV_HAVE_AVX512F */
+
+@@ -113,35 +114,36 @@ volk_32f_x2_divide_32f_a_avx512f(float* cVector, const float* aVector,
+ #ifdef LV_HAVE_AVX
+ #include <immintrin.h>
+
+-static inline void
+-volk_32f_x2_divide_32f_a_avx(float* cVector, const float* aVector,
+- const float* bVector, unsigned int num_points)
++static inline void volk_32f_x2_divide_32f_a_avx(float* cVector,
++ const float* aVector,
++ const float* bVector,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+- const unsigned int eighthPoints = num_points / 8;
++ unsigned int number = 0;
++ const unsigned int eighthPoints = num_points / 8;
+
+- float* cPtr = cVector;
+- const float* aPtr = aVector;
+- const float* bPtr= bVector;
++ float* cPtr = cVector;
++ const float* aPtr = aVector;
++ const float* bPtr = bVector;
+
+- __m256 aVal, bVal, cVal;
+- for(;number < eighthPoints; number++){
+- aVal = _mm256_load_ps(aPtr);
+- bVal = _mm256_load_ps(bPtr);
++ __m256 aVal, bVal, cVal;
++ for (; number < eighthPoints; number++) {
++ aVal = _mm256_load_ps(aPtr);
++ bVal = _mm256_load_ps(bPtr);
+
+- cVal = _mm256_div_ps(aVal, bVal);
++ cVal = _mm256_div_ps(aVal, bVal);
+
+- _mm256_store_ps(cPtr,cVal); // Store the results back into the C container
++ _mm256_store_ps(cPtr, cVal); // Store the results back into the C container
+
+- aPtr += 8;
+- bPtr += 8;
+- cPtr += 8;
+- }
++ aPtr += 8;
++ bPtr += 8;
++ cPtr += 8;
++ }
+
+- number = eighthPoints * 8;
+- for(;number < num_points; number++){
+- *cPtr++ = (*aPtr++) / (*bPtr++);
+- }
++ number = eighthPoints * 8;
++ for (; number < num_points; number++) {
++ *cPtr++ = (*aPtr++) / (*bPtr++);
++ }
+ }
+ #endif /* LV_HAVE_AVX */
+
+@@ -149,35 +151,36 @@ volk_32f_x2_divide_32f_a_avx(float* cVector, const float* aVector,
+ #ifdef LV_HAVE_SSE
+ #include <xmmintrin.h>
+
+-static inline void
+-volk_32f_x2_divide_32f_a_sse(float* cVector, const float* aVector,
+- const float* bVector, unsigned int num_points)
++static inline void volk_32f_x2_divide_32f_a_sse(float* cVector,
++ const float* aVector,
++ const float* bVector,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+- const unsigned int quarterPoints = num_points / 4;
++ unsigned int number = 0;
++ const unsigned int quarterPoints = num_points / 4;
+
+- float* cPtr = cVector;
+- const float* aPtr = aVector;
+- const float* bPtr= bVector;
++ float* cPtr = cVector;
++ const float* aPtr = aVector;
++ const float* bPtr = bVector;
+
+- __m128 aVal, bVal, cVal;
+- for(;number < quarterPoints; number++){
+- aVal = _mm_load_ps(aPtr);
+- bVal = _mm_load_ps(bPtr);
++ __m128 aVal, bVal, cVal;
++ for (; number < quarterPoints; number++) {
++ aVal = _mm_load_ps(aPtr);
++ bVal = _mm_load_ps(bPtr);
+
+- cVal = _mm_div_ps(aVal, bVal);
++ cVal = _mm_div_ps(aVal, bVal);
+
+- _mm_store_ps(cPtr,cVal); // Store the results back into the C container
++ _mm_store_ps(cPtr, cVal); // Store the results back into the C container
+
+- aPtr += 4;
+- bPtr += 4;
+- cPtr += 4;
+- }
++ aPtr += 4;
++ bPtr += 4;
++ cPtr += 4;
++ }
+
+- number = quarterPoints * 4;
+- for(;number < num_points; number++){
+- *cPtr++ = (*aPtr++) / (*bPtr++);
+- }
++ number = quarterPoints * 4;
++ for (; number < num_points; number++) {
++ *cPtr++ = (*aPtr++) / (*bPtr++);
++ }
+ }
+ #endif /* LV_HAVE_SSE */
+
+@@ -185,54 +188,55 @@ volk_32f_x2_divide_32f_a_sse(float* cVector, const float* aVector,
+ #ifdef LV_HAVE_NEON
+ #include <arm_neon.h>
+
+-static inline void
+-volk_32f_x2_divide_32f_neon(float* cVector, const float* aVector,
+- const float* bVector, unsigned int num_points)
++static inline void volk_32f_x2_divide_32f_neon(float* cVector,
++ const float* aVector,
++ const float* bVector,
++ unsigned int num_points)
+ {
+- float* cPtr = cVector;
+- const float* aPtr = aVector;
+- const float* bPtr = bVector;
+-
+- float32x4x4_t aVal, bVal, bInv, cVal;
+-
+- const unsigned int eighthPoints = num_points / 16;
+- unsigned int number = 0;
+- for(; number < eighthPoints; number++){
+- aVal = vld4q_f32(aPtr);
+- aPtr += 16;
+- bVal = vld4q_f32(bPtr);
+- bPtr += 16;
+-
+- __VOLK_PREFETCH(aPtr+16);
+- __VOLK_PREFETCH(bPtr+16);
+-
+- bInv.val[0] = vrecpeq_f32(bVal.val[0]);
+- bInv.val[0] = vmulq_f32(bInv.val[0], vrecpsq_f32(bInv.val[0], bVal.val[0]));
+- bInv.val[0] = vmulq_f32(bInv.val[0], vrecpsq_f32(bInv.val[0], bVal.val[0]));
+- cVal.val[0] = vmulq_f32(aVal.val[0], bInv.val[0]);
+-
+- bInv.val[1] = vrecpeq_f32(bVal.val[1]);
+- bInv.val[1] = vmulq_f32(bInv.val[1], vrecpsq_f32(bInv.val[1], bVal.val[1]));
+- bInv.val[1] = vmulq_f32(bInv.val[1], vrecpsq_f32(bInv.val[1], bVal.val[1]));
+- cVal.val[1] = vmulq_f32(aVal.val[1], bInv.val[1]);
+-
+- bInv.val[2] = vrecpeq_f32(bVal.val[2]);
+- bInv.val[2] = vmulq_f32(bInv.val[2], vrecpsq_f32(bInv.val[2], bVal.val[2]));
+- bInv.val[2] = vmulq_f32(bInv.val[2], vrecpsq_f32(bInv.val[2], bVal.val[2]));
+- cVal.val[2] = vmulq_f32(aVal.val[2], bInv.val[2]);
+-
+- bInv.val[3] = vrecpeq_f32(bVal.val[3]);
+- bInv.val[3] = vmulq_f32(bInv.val[3], vrecpsq_f32(bInv.val[3], bVal.val[3]));
+- bInv.val[3] = vmulq_f32(bInv.val[3], vrecpsq_f32(bInv.val[3], bVal.val[3]));
+- cVal.val[3] = vmulq_f32(aVal.val[3], bInv.val[3]);
+-
+- vst4q_f32(cPtr, cVal);
+- cPtr += 16;
+- }
+-
+- for(number = eighthPoints * 16; number < num_points; number++){
+- *cPtr++ = (*aPtr++) / (*bPtr++);
+- }
++ float* cPtr = cVector;
++ const float* aPtr = aVector;
++ const float* bPtr = bVector;
++
++ float32x4x4_t aVal, bVal, bInv, cVal;
++
++ const unsigned int eighthPoints = num_points / 16;
++ unsigned int number = 0;
++ for (; number < eighthPoints; number++) {
++ aVal = vld4q_f32(aPtr);
++ aPtr += 16;
++ bVal = vld4q_f32(bPtr);
++ bPtr += 16;
++
++ __VOLK_PREFETCH(aPtr + 16);
++ __VOLK_PREFETCH(bPtr + 16);
++
++ bInv.val[0] = vrecpeq_f32(bVal.val[0]);
++ bInv.val[0] = vmulq_f32(bInv.val[0], vrecpsq_f32(bInv.val[0], bVal.val[0]));
++ bInv.val[0] = vmulq_f32(bInv.val[0], vrecpsq_f32(bInv.val[0], bVal.val[0]));
++ cVal.val[0] = vmulq_f32(aVal.val[0], bInv.val[0]);
++
++ bInv.val[1] = vrecpeq_f32(bVal.val[1]);
++ bInv.val[1] = vmulq_f32(bInv.val[1], vrecpsq_f32(bInv.val[1], bVal.val[1]));
++ bInv.val[1] = vmulq_f32(bInv.val[1], vrecpsq_f32(bInv.val[1], bVal.val[1]));
++ cVal.val[1] = vmulq_f32(aVal.val[1], bInv.val[1]);
++
++ bInv.val[2] = vrecpeq_f32(bVal.val[2]);
++ bInv.val[2] = vmulq_f32(bInv.val[2], vrecpsq_f32(bInv.val[2], bVal.val[2]));
++ bInv.val[2] = vmulq_f32(bInv.val[2], vrecpsq_f32(bInv.val[2], bVal.val[2]));
++ cVal.val[2] = vmulq_f32(aVal.val[2], bInv.val[2]);
++
++ bInv.val[3] = vrecpeq_f32(bVal.val[3]);
++ bInv.val[3] = vmulq_f32(bInv.val[3], vrecpsq_f32(bInv.val[3], bVal.val[3]));
++ bInv.val[3] = vmulq_f32(bInv.val[3], vrecpsq_f32(bInv.val[3], bVal.val[3]));
++ cVal.val[3] = vmulq_f32(aVal.val[3], bInv.val[3]);
++
++ vst4q_f32(cPtr, cVal);
++ cPtr += 16;
++ }
++
++ for (number = eighthPoints * 16; number < num_points; number++) {
++ *cPtr++ = (*aPtr++) / (*bPtr++);
++ }
+ }
+
+ #endif /* LV_HAVE_NEON */
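The NEON protokernel above has no vector divide to call, so it builds a / b from a reciprocal estimate: vrecpeq_f32 supplies a rough 1/b, and each vrecpsq_f32/vmulq_f32 pair applies one Newton-Raphson step x <- x * (2 - b * x) before the final multiply by a. (Note the block counter is named eighthPoints even though every iteration consumes 16 floats via vld4q_f32.) A minimal scalar sketch of that refinement, assuming only the standard Newton-Raphson identity, is:

#include <stdio.h>

/* Scalar model of the NEON refinement above: start from a rough estimate x0
 * of 1/b, then apply x <- x * (2 - b * x) twice, which is what each
 * vrecpsq_f32/vmulq_f32 pair computes lane by lane. Two steps are roughly
 * enough for single precision, and a / b is then formed as a * x. */
static float divide_by_refinement(float a, float b, float x0)
{
    float x = x0;
    x = x * (2.0f - b * x); /* first Newton-Raphson step */
    x = x * (2.0f - b * x); /* second Newton-Raphson step */
    return a * x;
}

int main(void)
{
    /* Even a poor starting estimate converges quickly: prints ~0.333333. */
    printf("%f\n", divide_by_refinement(1.0f, 3.0f, 0.3f));
    return 0;
}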
+@@ -240,38 +244,40 @@ volk_32f_x2_divide_32f_neon(float* cVector, const float* aVector,
+
+ #ifdef LV_HAVE_GENERIC
+
+-static inline void
+-volk_32f_x2_divide_32f_generic(float* cVector, const float* aVector,
+- const float* bVector, unsigned int num_points)
++static inline void volk_32f_x2_divide_32f_generic(float* cVector,
++ const float* aVector,
++ const float* bVector,
++ unsigned int num_points)
+ {
+- float* cPtr = cVector;
+- const float* aPtr = aVector;
+- const float* bPtr= bVector;
+- unsigned int number = 0;
+-
+- for(number = 0; number < num_points; number++){
+- *cPtr++ = (*aPtr++) / (*bPtr++);
+- }
++ float* cPtr = cVector;
++ const float* aPtr = aVector;
++ const float* bPtr = bVector;
++ unsigned int number = 0;
++
++ for (number = 0; number < num_points; number++) {
++ *cPtr++ = (*aPtr++) / (*bPtr++);
++ }
+ }
+ #endif /* LV_HAVE_GENERIC */
+
+
+ #ifdef LV_HAVE_ORC
+
+-extern void
+-volk_32f_x2_divide_32f_a_orc_impl(float* cVector, const float* aVector,
+- const float* bVector, unsigned int num_points);
++extern void volk_32f_x2_divide_32f_a_orc_impl(float* cVector,
++ const float* aVector,
++ const float* bVector,
++ unsigned int num_points);
+
+-static inline void
+-volk_32f_x2_divide_32f_u_orc(float* cVector, const float* aVector,
+- const float* bVector, unsigned int num_points)
++static inline void volk_32f_x2_divide_32f_u_orc(float* cVector,
++ const float* aVector,
++ const float* bVector,
++ unsigned int num_points)
+ {
+- volk_32f_x2_divide_32f_a_orc_impl(cVector, aVector, bVector, num_points);
++ volk_32f_x2_divide_32f_a_orc_impl(cVector, aVector, bVector, num_points);
+ }
+ #endif /* LV_HAVE_ORC */
+
+
+-
+ #endif /* INCLUDED_volk_32f_x2_divide_32f_a_H */
+
+
+@@ -284,35 +290,36 @@ volk_32f_x2_divide_32f_u_orc(float* cVector, const float* aVector,
+ #ifdef LV_HAVE_AVX512F
+ #include <immintrin.h>
+
+-static inline void
+-volk_32f_x2_divide_32f_u_avx512f(float* cVector, const float* aVector,
+- const float* bVector, unsigned int num_points)
++static inline void volk_32f_x2_divide_32f_u_avx512f(float* cVector,
++ const float* aVector,
++ const float* bVector,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+- const unsigned int sixteenthPoints = num_points / 16;
++ unsigned int number = 0;
++ const unsigned int sixteenthPoints = num_points / 16;
+
+- float* cPtr = cVector;
+- const float* aPtr = aVector;
+- const float* bPtr= bVector;
++ float* cPtr = cVector;
++ const float* aPtr = aVector;
++ const float* bPtr = bVector;
+
+- __m512 aVal, bVal, cVal;
+- for(;number < sixteenthPoints; number++){
+- aVal = _mm512_loadu_ps(aPtr);
+- bVal = _mm512_loadu_ps(bPtr);
++ __m512 aVal, bVal, cVal;
++ for (; number < sixteenthPoints; number++) {
++ aVal = _mm512_loadu_ps(aPtr);
++ bVal = _mm512_loadu_ps(bPtr);
+
+- cVal = _mm512_div_ps(aVal, bVal);
++ cVal = _mm512_div_ps(aVal, bVal);
+
+- _mm512_storeu_ps(cPtr,cVal); // Store the results back into the C container
++ _mm512_storeu_ps(cPtr, cVal); // Store the results back into the C container
+
+- aPtr += 16;
+- bPtr += 16;
+- cPtr += 16;
+- }
++ aPtr += 16;
++ bPtr += 16;
++ cPtr += 16;
++ }
+
+- number = sixteenthPoints * 16;
+- for(;number < num_points; number++){
+- *cPtr++ = (*aPtr++) / (*bPtr++);
+- }
++ number = sixteenthPoints * 16;
++ for (; number < num_points; number++) {
++ *cPtr++ = (*aPtr++) / (*bPtr++);
++ }
+ }
+ #endif /* LV_HAVE_AVX512F */
+
+@@ -320,35 +327,36 @@ volk_32f_x2_divide_32f_u_avx512f(float* cVector, const float* aVector,
+ #ifdef LV_HAVE_AVX
+ #include <immintrin.h>
+
+-static inline void
+-volk_32f_x2_divide_32f_u_avx(float* cVector, const float* aVector,
+- const float* bVector, unsigned int num_points)
++static inline void volk_32f_x2_divide_32f_u_avx(float* cVector,
++ const float* aVector,
++ const float* bVector,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+- const unsigned int eighthPoints = num_points / 8;
++ unsigned int number = 0;
++ const unsigned int eighthPoints = num_points / 8;
+
+- float* cPtr = cVector;
+- const float* aPtr = aVector;
+- const float* bPtr= bVector;
++ float* cPtr = cVector;
++ const float* aPtr = aVector;
++ const float* bPtr = bVector;
+
+- __m256 aVal, bVal, cVal;
+- for(;number < eighthPoints; number++){
+- aVal = _mm256_loadu_ps(aPtr);
+- bVal = _mm256_loadu_ps(bPtr);
++ __m256 aVal, bVal, cVal;
++ for (; number < eighthPoints; number++) {
++ aVal = _mm256_loadu_ps(aPtr);
++ bVal = _mm256_loadu_ps(bPtr);
+
+- cVal = _mm256_div_ps(aVal, bVal);
++ cVal = _mm256_div_ps(aVal, bVal);
+
+- _mm256_storeu_ps(cPtr,cVal); // Store the results back into the C container
++ _mm256_storeu_ps(cPtr, cVal); // Store the results back into the C container
+
+- aPtr += 8;
+- bPtr += 8;
+- cPtr += 8;
+- }
++ aPtr += 8;
++ bPtr += 8;
++ cPtr += 8;
++ }
+
+- number = eighthPoints * 8;
+- for(;number < num_points; number++){
+- *cPtr++ = (*aPtr++) / (*bPtr++);
+- }
++ number = eighthPoints * 8;
++ for (; number < num_points; number++) {
++ *cPtr++ = (*aPtr++) / (*bPtr++);
++ }
+ }
+ #endif /* LV_HAVE_AVX */
+
+diff --git a/kernels/volk/volk_32f_x2_dot_prod_16i.h b/kernels/volk/volk_32f_x2_dot_prod_16i.h
+index c1b5a82..4da7db6 100644
+--- a/kernels/volk/volk_32f_x2_dot_prod_16i.h
++++ b/kernels/volk/volk_32f_x2_dot_prod_16i.h
+@@ -33,8 +33,8 @@
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+- * void volk_32f_x2_dot_prod_16i(int16_t* result, const float* input, const float* taps, unsigned int num_points)
+- * \endcode
++ * void volk_32f_x2_dot_prod_16i(int16_t* result, const float* input, const float* taps,
++ *                               unsigned int num_points)
++ * \endcode
+ *
+ * \b Inputs
+ * \li input: vector of floats.
+@@ -58,25 +58,29 @@
+ #ifndef INCLUDED_volk_32f_x2_dot_prod_16i_H
+ #define INCLUDED_volk_32f_x2_dot_prod_16i_H
+
+-#include <volk/volk_common.h>
+ #include <stdio.h>
++#include <volk/volk_common.h>
+
+
+ #ifdef LV_HAVE_GENERIC
+
+
+-static inline void volk_32f_x2_dot_prod_16i_generic(int16_t* result, const float* input, const float* taps, unsigned int num_points) {
++static inline void volk_32f_x2_dot_prod_16i_generic(int16_t* result,
++ const float* input,
++ const float* taps,
++ unsigned int num_points)
++{
+
+- float dotProduct = 0;
+- const float* aPtr = input;
+- const float* bPtr= taps;
+- unsigned int number = 0;
++ float dotProduct = 0;
++ const float* aPtr = input;
++ const float* bPtr = taps;
++ unsigned int number = 0;
+
+- for(number = 0; number < num_points; number++){
+- dotProduct += ((*aPtr++) * (*bPtr++));
+- }
++ for (number = 0; number < num_points; number++) {
++ dotProduct += ((*aPtr++) * (*bPtr++));
++ }
+
+- *result = (int16_t)dotProduct;
++ *result = (int16_t)dotProduct;
+ }
+
+ #endif /*LV_HAVE_GENERIC*/
+@@ -84,68 +88,73 @@ static inline void volk_32f_x2_dot_prod_16i_generic(int16_t* result, const float
+
+ #ifdef LV_HAVE_SSE
+
+-static inline void volk_32f_x2_dot_prod_16i_a_sse(int16_t* result, const float* input, const float* taps, unsigned int num_points) {
+-
+- unsigned int number = 0;
+- const unsigned int sixteenthPoints = num_points / 16;
+-
+- float dotProduct = 0;
+- const float* aPtr = input;
+- const float* bPtr = taps;
+-
+- __m128 a0Val, a1Val, a2Val, a3Val;
+- __m128 b0Val, b1Val, b2Val, b3Val;
+- __m128 c0Val, c1Val, c2Val, c3Val;
+-
+- __m128 dotProdVal0 = _mm_setzero_ps();
+- __m128 dotProdVal1 = _mm_setzero_ps();
+- __m128 dotProdVal2 = _mm_setzero_ps();
+- __m128 dotProdVal3 = _mm_setzero_ps();
+-
+- for(;number < sixteenthPoints; number++){
+-
+- a0Val = _mm_load_ps(aPtr);
+- a1Val = _mm_load_ps(aPtr+4);
+- a2Val = _mm_load_ps(aPtr+8);
+- a3Val = _mm_load_ps(aPtr+12);
+- b0Val = _mm_load_ps(bPtr);
+- b1Val = _mm_load_ps(bPtr+4);
+- b2Val = _mm_load_ps(bPtr+8);
+- b3Val = _mm_load_ps(bPtr+12);
+-
+- c0Val = _mm_mul_ps(a0Val, b0Val);
+- c1Val = _mm_mul_ps(a1Val, b1Val);
+- c2Val = _mm_mul_ps(a2Val, b2Val);
+- c3Val = _mm_mul_ps(a3Val, b3Val);
+-
+- dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
+- dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
+- dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
+- dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
+-
+- aPtr += 16;
+- bPtr += 16;
+- }
+-
+- dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
+- dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
+- dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
+-
+- __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
+-
+- _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
+-
+- dotProduct = dotProductVector[0];
+- dotProduct += dotProductVector[1];
+- dotProduct += dotProductVector[2];
+- dotProduct += dotProductVector[3];
+-
+- number = sixteenthPoints*16;
+- for(;number < num_points; number++){
+- dotProduct += ((*aPtr++) * (*bPtr++));
+- }
+-
+- *result = (short)dotProduct;
++static inline void volk_32f_x2_dot_prod_16i_a_sse(int16_t* result,
++ const float* input,
++ const float* taps,
++ unsigned int num_points)
++{
++
++ unsigned int number = 0;
++ const unsigned int sixteenthPoints = num_points / 16;
++
++ float dotProduct = 0;
++ const float* aPtr = input;
++ const float* bPtr = taps;
++
++ __m128 a0Val, a1Val, a2Val, a3Val;
++ __m128 b0Val, b1Val, b2Val, b3Val;
++ __m128 c0Val, c1Val, c2Val, c3Val;
++
++ __m128 dotProdVal0 = _mm_setzero_ps();
++ __m128 dotProdVal1 = _mm_setzero_ps();
++ __m128 dotProdVal2 = _mm_setzero_ps();
++ __m128 dotProdVal3 = _mm_setzero_ps();
++
++ for (; number < sixteenthPoints; number++) {
++
++ a0Val = _mm_load_ps(aPtr);
++ a1Val = _mm_load_ps(aPtr + 4);
++ a2Val = _mm_load_ps(aPtr + 8);
++ a3Val = _mm_load_ps(aPtr + 12);
++ b0Val = _mm_load_ps(bPtr);
++ b1Val = _mm_load_ps(bPtr + 4);
++ b2Val = _mm_load_ps(bPtr + 8);
++ b3Val = _mm_load_ps(bPtr + 12);
++
++ c0Val = _mm_mul_ps(a0Val, b0Val);
++ c1Val = _mm_mul_ps(a1Val, b1Val);
++ c2Val = _mm_mul_ps(a2Val, b2Val);
++ c3Val = _mm_mul_ps(a3Val, b3Val);
++
++ dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
++ dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
++ dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
++ dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
++
++ aPtr += 16;
++ bPtr += 16;
++ }
++
++ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
++ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
++ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
++
++ __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
++
++ _mm_store_ps(dotProductVector,
++ dotProdVal0); // Store the results back into the dot product vector
++
++ dotProduct = dotProductVector[0];
++ dotProduct += dotProductVector[1];
++ dotProduct += dotProductVector[2];
++ dotProduct += dotProductVector[3];
++
++ number = sixteenthPoints * 16;
++ for (; number < num_points; number++) {
++ dotProduct += ((*aPtr++) * (*bPtr++));
++ }
++
++ *result = (short)dotProduct;
+ }
+
+ #endif /*LV_HAVE_SSE*/
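The SSE protokernel above keeps four independent accumulators (dotProdVal0..3) so consecutive adds do not all sit on one dependency chain, folds them into a single __m128, and only then reduces across lanes by spilling to an array and summing in scalar code. A minimal sketch of that final reduction, assuming an SSE-capable x86 target:

#include <xmmintrin.h>

/* Reduce the four float lanes of v to one scalar, as the kernel above does
 * after folding its partial accumulators together. The kernel stores into a
 * 16-byte aligned buffer with _mm_store_ps; _mm_storeu_ps is used here only
 * to keep the sketch self-contained without an aligned declaration. */
static float horizontal_sum_ps(__m128 v)
{
    float lanes[4];
    _mm_storeu_ps(lanes, v);
    return lanes[0] + lanes[1] + lanes[2] + lanes[3];
}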
+@@ -153,66 +162,71 @@ static inline void volk_32f_x2_dot_prod_16i_a_sse(int16_t* result, const float*
+
+ #if LV_HAVE_AVX2 && LV_HAVE_FMA
+
+-static inline void volk_32f_x2_dot_prod_16i_a_avx2_fma(int16_t* result, const float* input, const float* taps, unsigned int num_points) {
+-
+- unsigned int number = 0;
+- const unsigned int thirtysecondPoints = num_points / 32;
+-
+- float dotProduct = 0;
+- const float* aPtr = input;
+- const float* bPtr = taps;
+-
+- __m256 a0Val, a1Val, a2Val, a3Val;
+- __m256 b0Val, b1Val, b2Val, b3Val;
+-
+- __m256 dotProdVal0 = _mm256_setzero_ps();
+- __m256 dotProdVal1 = _mm256_setzero_ps();
+- __m256 dotProdVal2 = _mm256_setzero_ps();
+- __m256 dotProdVal3 = _mm256_setzero_ps();
+-
+- for(;number < thirtysecondPoints; number++){
+-
+- a0Val = _mm256_load_ps(aPtr);
+- a1Val = _mm256_load_ps(aPtr+8);
+- a2Val = _mm256_load_ps(aPtr+16);
+- a3Val = _mm256_load_ps(aPtr+24);
+- b0Val = _mm256_load_ps(bPtr);
+- b1Val = _mm256_load_ps(bPtr+8);
+- b2Val = _mm256_load_ps(bPtr+16);
+- b3Val = _mm256_load_ps(bPtr+24);
+-
+- dotProdVal0 = _mm256_fmadd_ps(a0Val, b0Val, dotProdVal0);
+- dotProdVal1 = _mm256_fmadd_ps(a1Val, b1Val, dotProdVal1);
+- dotProdVal2 = _mm256_fmadd_ps(a2Val, b2Val, dotProdVal2);
+- dotProdVal3 = _mm256_fmadd_ps(a3Val, b3Val, dotProdVal3);
+-
+- aPtr += 32;
+- bPtr += 32;
+- }
+-
+- dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
+- dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
+- dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
+-
+- __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
+-
+- _mm256_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
+-
+- dotProduct = dotProductVector[0];
+- dotProduct += dotProductVector[1];
+- dotProduct += dotProductVector[2];
+- dotProduct += dotProductVector[3];
+- dotProduct += dotProductVector[4];
+- dotProduct += dotProductVector[5];
+- dotProduct += dotProductVector[6];
+- dotProduct += dotProductVector[7];
+-
+- number = thirtysecondPoints*32;
+- for(;number < num_points; number++){
+- dotProduct += ((*aPtr++) * (*bPtr++));
+- }
+-
+- *result = (short)dotProduct;
++static inline void volk_32f_x2_dot_prod_16i_a_avx2_fma(int16_t* result,
++ const float* input,
++ const float* taps,
++ unsigned int num_points)
++{
++
++ unsigned int number = 0;
++ const unsigned int thirtysecondPoints = num_points / 32;
++
++ float dotProduct = 0;
++ const float* aPtr = input;
++ const float* bPtr = taps;
++
++ __m256 a0Val, a1Val, a2Val, a3Val;
++ __m256 b0Val, b1Val, b2Val, b3Val;
++
++ __m256 dotProdVal0 = _mm256_setzero_ps();
++ __m256 dotProdVal1 = _mm256_setzero_ps();
++ __m256 dotProdVal2 = _mm256_setzero_ps();
++ __m256 dotProdVal3 = _mm256_setzero_ps();
++
++ for (; number < thirtysecondPoints; number++) {
++
++ a0Val = _mm256_load_ps(aPtr);
++ a1Val = _mm256_load_ps(aPtr + 8);
++ a2Val = _mm256_load_ps(aPtr + 16);
++ a3Val = _mm256_load_ps(aPtr + 24);
++ b0Val = _mm256_load_ps(bPtr);
++ b1Val = _mm256_load_ps(bPtr + 8);
++ b2Val = _mm256_load_ps(bPtr + 16);
++ b3Val = _mm256_load_ps(bPtr + 24);
++
++ dotProdVal0 = _mm256_fmadd_ps(a0Val, b0Val, dotProdVal0);
++ dotProdVal1 = _mm256_fmadd_ps(a1Val, b1Val, dotProdVal1);
++ dotProdVal2 = _mm256_fmadd_ps(a2Val, b2Val, dotProdVal2);
++ dotProdVal3 = _mm256_fmadd_ps(a3Val, b3Val, dotProdVal3);
++
++ aPtr += 32;
++ bPtr += 32;
++ }
++
++ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
++ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
++ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
++
++ __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
++
++ _mm256_store_ps(dotProductVector,
++ dotProdVal0); // Store the results back into the dot product vector
++
++ dotProduct = dotProductVector[0];
++ dotProduct += dotProductVector[1];
++ dotProduct += dotProductVector[2];
++ dotProduct += dotProductVector[3];
++ dotProduct += dotProductVector[4];
++ dotProduct += dotProductVector[5];
++ dotProduct += dotProductVector[6];
++ dotProduct += dotProductVector[7];
++
++ number = thirtysecondPoints * 32;
++ for (; number < num_points; number++) {
++ dotProduct += ((*aPtr++) * (*bPtr++));
++ }
++
++ *result = (short)dotProduct;
+ }
+
+ #endif /*LV_HAVE_AVX2 && LV_HAVE_FMA*/
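The AVX2/FMA variant above differs from the plain AVX one mainly in collapsing each _mm256_mul_ps/_mm256_add_ps pair into a single _mm256_fmadd_ps. A lane-wise scalar model of what each fmadd call computes (ignoring the fact that the real intrinsic rounds only once per lane):

/* Per-lane model of dotProdVal = _mm256_fmadd_ps(aVal, bVal, dotProdVal):
 * each of the eight float lanes is updated as acc = a * b + acc. */
static void fmadd_model(const float a[8], const float b[8], float acc[8])
{
    for (int i = 0; i < 8; i++) {
        acc[i] = a[i] * b[i] + acc[i];
    }
}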
+@@ -220,146 +234,156 @@ static inline void volk_32f_x2_dot_prod_16i_a_avx2_fma(int16_t* result, const f
+
+ #ifdef LV_HAVE_AVX
+
+-static inline void volk_32f_x2_dot_prod_16i_a_avx(int16_t* result, const float* input, const float* taps, unsigned int num_points) {
+-
+- unsigned int number = 0;
+- const unsigned int thirtysecondPoints = num_points / 32;
+-
+- float dotProduct = 0;
+- const float* aPtr = input;
+- const float* bPtr = taps;
+-
+- __m256 a0Val, a1Val, a2Val, a3Val;
+- __m256 b0Val, b1Val, b2Val, b3Val;
+- __m256 c0Val, c1Val, c2Val, c3Val;
+-
+- __m256 dotProdVal0 = _mm256_setzero_ps();
+- __m256 dotProdVal1 = _mm256_setzero_ps();
+- __m256 dotProdVal2 = _mm256_setzero_ps();
+- __m256 dotProdVal3 = _mm256_setzero_ps();
+-
+- for(;number < thirtysecondPoints; number++){
+-
+- a0Val = _mm256_load_ps(aPtr);
+- a1Val = _mm256_load_ps(aPtr+8);
+- a2Val = _mm256_load_ps(aPtr+16);
+- a3Val = _mm256_load_ps(aPtr+24);
+- b0Val = _mm256_load_ps(bPtr);
+- b1Val = _mm256_load_ps(bPtr+8);
+- b2Val = _mm256_load_ps(bPtr+16);
+- b3Val = _mm256_load_ps(bPtr+24);
+-
+- c0Val = _mm256_mul_ps(a0Val, b0Val);
+- c1Val = _mm256_mul_ps(a1Val, b1Val);
+- c2Val = _mm256_mul_ps(a2Val, b2Val);
+- c3Val = _mm256_mul_ps(a3Val, b3Val);
+-
+- dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
+- dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);
+- dotProdVal2 = _mm256_add_ps(c2Val, dotProdVal2);
+- dotProdVal3 = _mm256_add_ps(c3Val, dotProdVal3);
+-
+- aPtr += 32;
+- bPtr += 32;
+- }
+-
+- dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
+- dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
+- dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
+-
+- __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
+-
+- _mm256_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
+-
+- dotProduct = dotProductVector[0];
+- dotProduct += dotProductVector[1];
+- dotProduct += dotProductVector[2];
+- dotProduct += dotProductVector[3];
+- dotProduct += dotProductVector[4];
+- dotProduct += dotProductVector[5];
+- dotProduct += dotProductVector[6];
+- dotProduct += dotProductVector[7];
+-
+- number = thirtysecondPoints*32;
+- for(;number < num_points; number++){
+- dotProduct += ((*aPtr++) * (*bPtr++));
+- }
+-
+- *result = (short)dotProduct;
++static inline void volk_32f_x2_dot_prod_16i_a_avx(int16_t* result,
++ const float* input,
++ const float* taps,
++ unsigned int num_points)
++{
++
++ unsigned int number = 0;
++ const unsigned int thirtysecondPoints = num_points / 32;
++
++ float dotProduct = 0;
++ const float* aPtr = input;
++ const float* bPtr = taps;
++
++ __m256 a0Val, a1Val, a2Val, a3Val;
++ __m256 b0Val, b1Val, b2Val, b3Val;
++ __m256 c0Val, c1Val, c2Val, c3Val;
++
++ __m256 dotProdVal0 = _mm256_setzero_ps();
++ __m256 dotProdVal1 = _mm256_setzero_ps();
++ __m256 dotProdVal2 = _mm256_setzero_ps();
++ __m256 dotProdVal3 = _mm256_setzero_ps();
++
++ for (; number < thirtysecondPoints; number++) {
++
++ a0Val = _mm256_load_ps(aPtr);
++ a1Val = _mm256_load_ps(aPtr + 8);
++ a2Val = _mm256_load_ps(aPtr + 16);
++ a3Val = _mm256_load_ps(aPtr + 24);
++ b0Val = _mm256_load_ps(bPtr);
++ b1Val = _mm256_load_ps(bPtr + 8);
++ b2Val = _mm256_load_ps(bPtr + 16);
++ b3Val = _mm256_load_ps(bPtr + 24);
++
++ c0Val = _mm256_mul_ps(a0Val, b0Val);
++ c1Val = _mm256_mul_ps(a1Val, b1Val);
++ c2Val = _mm256_mul_ps(a2Val, b2Val);
++ c3Val = _mm256_mul_ps(a3Val, b3Val);
++
++ dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
++ dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);
++ dotProdVal2 = _mm256_add_ps(c2Val, dotProdVal2);
++ dotProdVal3 = _mm256_add_ps(c3Val, dotProdVal3);
++
++ aPtr += 32;
++ bPtr += 32;
++ }
++
++ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
++ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
++ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
++
++ __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
++
++ _mm256_store_ps(dotProductVector,
++ dotProdVal0); // Store the results back into the dot product vector
++
++ dotProduct = dotProductVector[0];
++ dotProduct += dotProductVector[1];
++ dotProduct += dotProductVector[2];
++ dotProduct += dotProductVector[3];
++ dotProduct += dotProductVector[4];
++ dotProduct += dotProductVector[5];
++ dotProduct += dotProductVector[6];
++ dotProduct += dotProductVector[7];
++
++ number = thirtysecondPoints * 32;
++ for (; number < num_points; number++) {
++ dotProduct += ((*aPtr++) * (*bPtr++));
++ }
++
++ *result = (short)dotProduct;
+ }
+
+ #endif /*LV_HAVE_AVX*/
+
+ #ifdef LV_HAVE_AVX512F
+
+-static inline void volk_32f_x2_dot_prod_16i_a_avx512f(int16_t* result, const float* input, const float* taps, unsigned int num_points) {
+-
+- unsigned int number = 0;
+- const unsigned int sixtyfourthPoints = num_points / 64;
+-
+- float dotProduct = 0;
+- const float* aPtr = input;
+- const float* bPtr = taps;
+-
+- __m512 a0Val, a1Val, a2Val, a3Val;
+- __m512 b0Val, b1Val, b2Val, b3Val;
+-
+- __m512 dotProdVal0 = _mm512_setzero_ps();
+- __m512 dotProdVal1 = _mm512_setzero_ps();
+- __m512 dotProdVal2 = _mm512_setzero_ps();
+- __m512 dotProdVal3 = _mm512_setzero_ps();
+-
+- for(;number < sixtyfourthPoints; number++){
+-
+- a0Val = _mm512_load_ps(aPtr);
+- a1Val = _mm512_load_ps(aPtr+16);
+- a2Val = _mm512_load_ps(aPtr+32);
+- a3Val = _mm512_load_ps(aPtr+48);
+- b0Val = _mm512_load_ps(bPtr);
+- b1Val = _mm512_load_ps(bPtr+16);
+- b2Val = _mm512_load_ps(bPtr+32);
+- b3Val = _mm512_load_ps(bPtr+48);
+-
+- dotProdVal0 = _mm512_fmadd_ps(a0Val, b0Val, dotProdVal0);
+- dotProdVal1 = _mm512_fmadd_ps(a1Val, b1Val, dotProdVal1);
+- dotProdVal2 = _mm512_fmadd_ps(a2Val, b2Val, dotProdVal2);
+- dotProdVal3 = _mm512_fmadd_ps(a3Val, b3Val, dotProdVal3);
+-
+- aPtr += 64;
+- bPtr += 64;
+- }
+-
+- dotProdVal0 = _mm512_add_ps(dotProdVal0, dotProdVal1);
+- dotProdVal0 = _mm512_add_ps(dotProdVal0, dotProdVal2);
+- dotProdVal0 = _mm512_add_ps(dotProdVal0, dotProdVal3);
+-
+- __VOLK_ATTR_ALIGNED(64) float dotProductVector[16];
+-
+- _mm512_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
+-
+- dotProduct = dotProductVector[0];
+- dotProduct += dotProductVector[1];
+- dotProduct += dotProductVector[2];
+- dotProduct += dotProductVector[3];
+- dotProduct += dotProductVector[4];
+- dotProduct += dotProductVector[5];
+- dotProduct += dotProductVector[6];
+- dotProduct += dotProductVector[7];
+- dotProduct += dotProductVector[8];
+- dotProduct += dotProductVector[9];
+- dotProduct += dotProductVector[10];
+- dotProduct += dotProductVector[11];
+- dotProduct += dotProductVector[12];
+- dotProduct += dotProductVector[13];
+- dotProduct += dotProductVector[14];
+- dotProduct += dotProductVector[15];
+-
+- number = sixtyfourthPoints*64;
+- for(;number < num_points; number++){
+- dotProduct += ((*aPtr++) * (*bPtr++));
+- }
+-
+- *result = (short)dotProduct;
++static inline void volk_32f_x2_dot_prod_16i_a_avx512f(int16_t* result,
++ const float* input,
++ const float* taps,
++ unsigned int num_points)
++{
++
++ unsigned int number = 0;
++ const unsigned int sixtyfourthPoints = num_points / 64;
++
++ float dotProduct = 0;
++ const float* aPtr = input;
++ const float* bPtr = taps;
++
++ __m512 a0Val, a1Val, a2Val, a3Val;
++ __m512 b0Val, b1Val, b2Val, b3Val;
++
++ __m512 dotProdVal0 = _mm512_setzero_ps();
++ __m512 dotProdVal1 = _mm512_setzero_ps();
++ __m512 dotProdVal2 = _mm512_setzero_ps();
++ __m512 dotProdVal3 = _mm512_setzero_ps();
++
++ for (; number < sixtyfourthPoints; number++) {
++
++ a0Val = _mm512_load_ps(aPtr);
++ a1Val = _mm512_load_ps(aPtr + 16);
++ a2Val = _mm512_load_ps(aPtr + 32);
++ a3Val = _mm512_load_ps(aPtr + 48);
++ b0Val = _mm512_load_ps(bPtr);
++ b1Val = _mm512_load_ps(bPtr + 16);
++ b2Val = _mm512_load_ps(bPtr + 32);
++ b3Val = _mm512_load_ps(bPtr + 48);
++
++ dotProdVal0 = _mm512_fmadd_ps(a0Val, b0Val, dotProdVal0);
++ dotProdVal1 = _mm512_fmadd_ps(a1Val, b1Val, dotProdVal1);
++ dotProdVal2 = _mm512_fmadd_ps(a2Val, b2Val, dotProdVal2);
++ dotProdVal3 = _mm512_fmadd_ps(a3Val, b3Val, dotProdVal3);
++
++ aPtr += 64;
++ bPtr += 64;
++ }
++
++ dotProdVal0 = _mm512_add_ps(dotProdVal0, dotProdVal1);
++ dotProdVal0 = _mm512_add_ps(dotProdVal0, dotProdVal2);
++ dotProdVal0 = _mm512_add_ps(dotProdVal0, dotProdVal3);
++
++ __VOLK_ATTR_ALIGNED(64) float dotProductVector[16];
++
++ _mm512_store_ps(dotProductVector,
++ dotProdVal0); // Store the results back into the dot product vector
++
++ dotProduct = dotProductVector[0];
++ dotProduct += dotProductVector[1];
++ dotProduct += dotProductVector[2];
++ dotProduct += dotProductVector[3];
++ dotProduct += dotProductVector[4];
++ dotProduct += dotProductVector[5];
++ dotProduct += dotProductVector[6];
++ dotProduct += dotProductVector[7];
++ dotProduct += dotProductVector[8];
++ dotProduct += dotProductVector[9];
++ dotProduct += dotProductVector[10];
++ dotProduct += dotProductVector[11];
++ dotProduct += dotProductVector[12];
++ dotProduct += dotProductVector[13];
++ dotProduct += dotProductVector[14];
++ dotProduct += dotProductVector[15];
++
++ number = sixtyfourthPoints * 64;
++ for (; number < num_points; number++) {
++ dotProduct += ((*aPtr++) * (*bPtr++));
++ }
++
++ *result = (short)dotProduct;
+ }
+
+ #endif /*LV_HAVE_AVX512F*/
+@@ -367,68 +391,73 @@ static inline void volk_32f_x2_dot_prod_16i_a_avx512f(int16_t* result, const fl
+
+ #ifdef LV_HAVE_SSE
+
+-static inline void volk_32f_x2_dot_prod_16i_u_sse(int16_t* result, const float* input, const float* taps, unsigned int num_points) {
+-
+- unsigned int number = 0;
+- const unsigned int sixteenthPoints = num_points / 16;
+-
+- float dotProduct = 0;
+- const float* aPtr = input;
+- const float* bPtr = taps;
+-
+- __m128 a0Val, a1Val, a2Val, a3Val;
+- __m128 b0Val, b1Val, b2Val, b3Val;
+- __m128 c0Val, c1Val, c2Val, c3Val;
+-
+- __m128 dotProdVal0 = _mm_setzero_ps();
+- __m128 dotProdVal1 = _mm_setzero_ps();
+- __m128 dotProdVal2 = _mm_setzero_ps();
+- __m128 dotProdVal3 = _mm_setzero_ps();
+-
+- for(;number < sixteenthPoints; number++){
+-
+- a0Val = _mm_loadu_ps(aPtr);
+- a1Val = _mm_loadu_ps(aPtr+4);
+- a2Val = _mm_loadu_ps(aPtr+8);
+- a3Val = _mm_loadu_ps(aPtr+12);
+- b0Val = _mm_loadu_ps(bPtr);
+- b1Val = _mm_loadu_ps(bPtr+4);
+- b2Val = _mm_loadu_ps(bPtr+8);
+- b3Val = _mm_loadu_ps(bPtr+12);
+-
+- c0Val = _mm_mul_ps(a0Val, b0Val);
+- c1Val = _mm_mul_ps(a1Val, b1Val);
+- c2Val = _mm_mul_ps(a2Val, b2Val);
+- c3Val = _mm_mul_ps(a3Val, b3Val);
+-
+- dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
+- dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
+- dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
+- dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
+-
+- aPtr += 16;
+- bPtr += 16;
+- }
+-
+- dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
+- dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
+- dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
+-
+- __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
+-
+- _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
+-
+- dotProduct = dotProductVector[0];
+- dotProduct += dotProductVector[1];
+- dotProduct += dotProductVector[2];
+- dotProduct += dotProductVector[3];
+-
+- number = sixteenthPoints*16;
+- for(;number < num_points; number++){
+- dotProduct += ((*aPtr++) * (*bPtr++));
+- }
+-
+- *result = (short)dotProduct;
++static inline void volk_32f_x2_dot_prod_16i_u_sse(int16_t* result,
++ const float* input,
++ const float* taps,
++ unsigned int num_points)
++{
++
++ unsigned int number = 0;
++ const unsigned int sixteenthPoints = num_points / 16;
++
++ float dotProduct = 0;
++ const float* aPtr = input;
++ const float* bPtr = taps;
++
++ __m128 a0Val, a1Val, a2Val, a3Val;
++ __m128 b0Val, b1Val, b2Val, b3Val;
++ __m128 c0Val, c1Val, c2Val, c3Val;
++
++ __m128 dotProdVal0 = _mm_setzero_ps();
++ __m128 dotProdVal1 = _mm_setzero_ps();
++ __m128 dotProdVal2 = _mm_setzero_ps();
++ __m128 dotProdVal3 = _mm_setzero_ps();
++
++ for (; number < sixteenthPoints; number++) {
++
++ a0Val = _mm_loadu_ps(aPtr);
++ a1Val = _mm_loadu_ps(aPtr + 4);
++ a2Val = _mm_loadu_ps(aPtr + 8);
++ a3Val = _mm_loadu_ps(aPtr + 12);
++ b0Val = _mm_loadu_ps(bPtr);
++ b1Val = _mm_loadu_ps(bPtr + 4);
++ b2Val = _mm_loadu_ps(bPtr + 8);
++ b3Val = _mm_loadu_ps(bPtr + 12);
++
++ c0Val = _mm_mul_ps(a0Val, b0Val);
++ c1Val = _mm_mul_ps(a1Val, b1Val);
++ c2Val = _mm_mul_ps(a2Val, b2Val);
++ c3Val = _mm_mul_ps(a3Val, b3Val);
++
++ dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
++ dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
++ dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
++ dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
++
++ aPtr += 16;
++ bPtr += 16;
++ }
++
++ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
++ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
++ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
++
++ __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
++
++ _mm_store_ps(dotProductVector,
++ dotProdVal0); // Store the results back into the dot product vector
++
++ dotProduct = dotProductVector[0];
++ dotProduct += dotProductVector[1];
++ dotProduct += dotProductVector[2];
++ dotProduct += dotProductVector[3];
++
++ number = sixteenthPoints * 16;
++ for (; number < num_points; number++) {
++ dotProduct += ((*aPtr++) * (*bPtr++));
++ }
++
++ *result = (short)dotProduct;
+ }
+
+ #endif /*LV_HAVE_SSE*/
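The _u_ (unaligned) SSE variant above is identical to the _a_ (aligned) one except that it loads with _mm_loadu_ps instead of _mm_load_ps, so it accepts buffers that are not 16-byte aligned at the cost of potentially slower loads. A small illustration of that distinction, assuming an SSE-capable x86 target:

#include <xmmintrin.h>

/* _mm_load_ps requires p to be 16-byte aligned (which volk_malloc provides
 * for the _a_ kernels); _mm_loadu_ps accepts any address and is what the
 * _u_ kernels use. */
static __m128 load_four_floats(const float* p, int is_16_byte_aligned)
{
    return is_16_byte_aligned ? _mm_load_ps(p) : _mm_loadu_ps(p);
}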
+@@ -436,66 +465,71 @@ static inline void volk_32f_x2_dot_prod_16i_u_sse(int16_t* result, const float*
+
+ #if LV_HAVE_AVX2 && LV_HAVE_FMA
+
+-static inline void volk_32f_x2_dot_prod_16i_u_avx2_fma(int16_t* result, const float* input, const float* taps, unsigned int num_points) {
+-
+- unsigned int number = 0;
+- const unsigned int thirtysecondPoints = num_points / 32;
+-
+- float dotProduct = 0;
+- const float* aPtr = input;
+- const float* bPtr = taps;
+-
+- __m256 a0Val, a1Val, a2Val, a3Val;
+- __m256 b0Val, b1Val, b2Val, b3Val;
+-
+- __m256 dotProdVal0 = _mm256_setzero_ps();
+- __m256 dotProdVal1 = _mm256_setzero_ps();
+- __m256 dotProdVal2 = _mm256_setzero_ps();
+- __m256 dotProdVal3 = _mm256_setzero_ps();
+-
+- for(;number < thirtysecondPoints; number++){
+-
+- a0Val = _mm256_loadu_ps(aPtr);
+- a1Val = _mm256_loadu_ps(aPtr+8);
+- a2Val = _mm256_loadu_ps(aPtr+16);
+- a3Val = _mm256_loadu_ps(aPtr+24);
+- b0Val = _mm256_loadu_ps(bPtr);
+- b1Val = _mm256_loadu_ps(bPtr+8);
+- b2Val = _mm256_loadu_ps(bPtr+16);
+- b3Val = _mm256_loadu_ps(bPtr+24);
+-
+- dotProdVal0 = _mm256_fmadd_ps(a0Val, b0Val, dotProdVal0);
+- dotProdVal1 = _mm256_fmadd_ps(a1Val, b1Val, dotProdVal1);
+- dotProdVal2 = _mm256_fmadd_ps(a2Val, b2Val, dotProdVal2);
+- dotProdVal3 = _mm256_fmadd_ps(a3Val, b3Val, dotProdVal3);
+-
+- aPtr += 32;
+- bPtr += 32;
+- }
+-
+- dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
+- dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
+- dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
+-
+- __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
+-
+- _mm256_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
+-
+- dotProduct = dotProductVector[0];
+- dotProduct += dotProductVector[1];
+- dotProduct += dotProductVector[2];
+- dotProduct += dotProductVector[3];
+- dotProduct += dotProductVector[4];
+- dotProduct += dotProductVector[5];
+- dotProduct += dotProductVector[6];
+- dotProduct += dotProductVector[7];
+-
+- number = thirtysecondPoints*32;
+- for(;number < num_points; number++){
+- dotProduct += ((*aPtr++) * (*bPtr++));
+- }
+-
+- *result = (short)dotProduct;
++static inline void volk_32f_x2_dot_prod_16i_u_avx2_fma(int16_t* result,
++ const float* input,
++ const float* taps,
++ unsigned int num_points)
++{
++
++ unsigned int number = 0;
++ const unsigned int thirtysecondPoints = num_points / 32;
++
++ float dotProduct = 0;
++ const float* aPtr = input;
++ const float* bPtr = taps;
++
++ __m256 a0Val, a1Val, a2Val, a3Val;
++ __m256 b0Val, b1Val, b2Val, b3Val;
++
++ __m256 dotProdVal0 = _mm256_setzero_ps();
++ __m256 dotProdVal1 = _mm256_setzero_ps();
++ __m256 dotProdVal2 = _mm256_setzero_ps();
++ __m256 dotProdVal3 = _mm256_setzero_ps();
++
++ for (; number < thirtysecondPoints; number++) {
++
++ a0Val = _mm256_loadu_ps(aPtr);
++ a1Val = _mm256_loadu_ps(aPtr + 8);
++ a2Val = _mm256_loadu_ps(aPtr + 16);
++ a3Val = _mm256_loadu_ps(aPtr + 24);
++ b0Val = _mm256_loadu_ps(bPtr);
++ b1Val = _mm256_loadu_ps(bPtr + 8);
++ b2Val = _mm256_loadu_ps(bPtr + 16);
++ b3Val = _mm256_loadu_ps(bPtr + 24);
++
++ dotProdVal0 = _mm256_fmadd_ps(a0Val, b0Val, dotProdVal0);
++ dotProdVal1 = _mm256_fmadd_ps(a1Val, b1Val, dotProdVal1);
++ dotProdVal2 = _mm256_fmadd_ps(a2Val, b2Val, dotProdVal2);
++ dotProdVal3 = _mm256_fmadd_ps(a3Val, b3Val, dotProdVal3);
++
++ aPtr += 32;
++ bPtr += 32;
++ }
++
++ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
++ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
++ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
++
++ __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
++
++ _mm256_store_ps(dotProductVector,
++ dotProdVal0); // Store the results back into the dot product vector
++
++ dotProduct = dotProductVector[0];
++ dotProduct += dotProductVector[1];
++ dotProduct += dotProductVector[2];
++ dotProduct += dotProductVector[3];
++ dotProduct += dotProductVector[4];
++ dotProduct += dotProductVector[5];
++ dotProduct += dotProductVector[6];
++ dotProduct += dotProductVector[7];
++
++ number = thirtysecondPoints * 32;
++ for (; number < num_points; number++) {
++ dotProduct += ((*aPtr++) * (*bPtr++));
++ }
++
++ *result = (short)dotProduct;
+ }
+
+ #endif /*LV_HAVE_AVX2 && LV_HAVE_FMA*/
+@@ -503,146 +537,156 @@ static inline void volk_32f_x2_dot_prod_16i_u_avx2_fma(int16_t* result, const f
+
+ #ifdef LV_HAVE_AVX
+
+-static inline void volk_32f_x2_dot_prod_16i_u_avx(int16_t* result, const float* input, const float* taps, unsigned int num_points) {
+-
+- unsigned int number = 0;
+- const unsigned int thirtysecondPoints = num_points / 32;
+-
+- float dotProduct = 0;
+- const float* aPtr = input;
+- const float* bPtr = taps;
+-
+- __m256 a0Val, a1Val, a2Val, a3Val;
+- __m256 b0Val, b1Val, b2Val, b3Val;
+- __m256 c0Val, c1Val, c2Val, c3Val;
+-
+- __m256 dotProdVal0 = _mm256_setzero_ps();
+- __m256 dotProdVal1 = _mm256_setzero_ps();
+- __m256 dotProdVal2 = _mm256_setzero_ps();
+- __m256 dotProdVal3 = _mm256_setzero_ps();
+-
+- for(;number < thirtysecondPoints; number++){
+-
+- a0Val = _mm256_loadu_ps(aPtr);
+- a1Val = _mm256_loadu_ps(aPtr+8);
+- a2Val = _mm256_loadu_ps(aPtr+16);
+- a3Val = _mm256_loadu_ps(aPtr+24);
+- b0Val = _mm256_loadu_ps(bPtr);
+- b1Val = _mm256_loadu_ps(bPtr+8);
+- b2Val = _mm256_loadu_ps(bPtr+16);
+- b3Val = _mm256_loadu_ps(bPtr+24);
+-
+- c0Val = _mm256_mul_ps(a0Val, b0Val);
+- c1Val = _mm256_mul_ps(a1Val, b1Val);
+- c2Val = _mm256_mul_ps(a2Val, b2Val);
+- c3Val = _mm256_mul_ps(a3Val, b3Val);
+-
+- dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
+- dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);
+- dotProdVal2 = _mm256_add_ps(c2Val, dotProdVal2);
+- dotProdVal3 = _mm256_add_ps(c3Val, dotProdVal3);
+-
+- aPtr += 32;
+- bPtr += 32;
+- }
+-
+- dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
+- dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
+- dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
+-
+- __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
+-
+- _mm256_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
+-
+- dotProduct = dotProductVector[0];
+- dotProduct += dotProductVector[1];
+- dotProduct += dotProductVector[2];
+- dotProduct += dotProductVector[3];
+- dotProduct += dotProductVector[4];
+- dotProduct += dotProductVector[5];
+- dotProduct += dotProductVector[6];
+- dotProduct += dotProductVector[7];
+-
+- number = thirtysecondPoints*32;
+- for(;number < num_points; number++){
+- dotProduct += ((*aPtr++) * (*bPtr++));
+- }
+-
+- *result = (short)dotProduct;
++static inline void volk_32f_x2_dot_prod_16i_u_avx(int16_t* result,
++ const float* input,
++ const float* taps,
++ unsigned int num_points)
++{
++
++ unsigned int number = 0;
++ const unsigned int thirtysecondPoints = num_points / 32;
++
++ float dotProduct = 0;
++ const float* aPtr = input;
++ const float* bPtr = taps;
++
++ __m256 a0Val, a1Val, a2Val, a3Val;
++ __m256 b0Val, b1Val, b2Val, b3Val;
++ __m256 c0Val, c1Val, c2Val, c3Val;
++
++ __m256 dotProdVal0 = _mm256_setzero_ps();
++ __m256 dotProdVal1 = _mm256_setzero_ps();
++ __m256 dotProdVal2 = _mm256_setzero_ps();
++ __m256 dotProdVal3 = _mm256_setzero_ps();
++
++ for (; number < thirtysecondPoints; number++) {
++
++ a0Val = _mm256_loadu_ps(aPtr);
++ a1Val = _mm256_loadu_ps(aPtr + 8);
++ a2Val = _mm256_loadu_ps(aPtr + 16);
++ a3Val = _mm256_loadu_ps(aPtr + 24);
++ b0Val = _mm256_loadu_ps(bPtr);
++ b1Val = _mm256_loadu_ps(bPtr + 8);
++ b2Val = _mm256_loadu_ps(bPtr + 16);
++ b3Val = _mm256_loadu_ps(bPtr + 24);
++
++ c0Val = _mm256_mul_ps(a0Val, b0Val);
++ c1Val = _mm256_mul_ps(a1Val, b1Val);
++ c2Val = _mm256_mul_ps(a2Val, b2Val);
++ c3Val = _mm256_mul_ps(a3Val, b3Val);
++
++ dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
++ dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);
++ dotProdVal2 = _mm256_add_ps(c2Val, dotProdVal2);
++ dotProdVal3 = _mm256_add_ps(c3Val, dotProdVal3);
++
++ aPtr += 32;
++ bPtr += 32;
++ }
++
++ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
++ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
++ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
++
++ __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
++
++ _mm256_store_ps(dotProductVector,
++ dotProdVal0); // Store the results back into the dot product vector
++
++ dotProduct = dotProductVector[0];
++ dotProduct += dotProductVector[1];
++ dotProduct += dotProductVector[2];
++ dotProduct += dotProductVector[3];
++ dotProduct += dotProductVector[4];
++ dotProduct += dotProductVector[5];
++ dotProduct += dotProductVector[6];
++ dotProduct += dotProductVector[7];
++
++ number = thirtysecondPoints * 32;
++ for (; number < num_points; number++) {
++ dotProduct += ((*aPtr++) * (*bPtr++));
++ }
++
++ *result = (short)dotProduct;
+ }
+
+ #endif /*LV_HAVE_AVX*/
+
+ #ifdef LV_HAVE_AVX512F
+
+-static inline void volk_32f_x2_dot_prod_16i_u_avx512f(int16_t* result, const float* input, const float* taps, unsigned int num_points) {
+-
+- unsigned int number = 0;
+- const unsigned int sixtyfourthPoints = num_points / 64;
+-
+- float dotProduct = 0;
+- const float* aPtr = input;
+- const float* bPtr = taps;
+-
+- __m512 a0Val, a1Val, a2Val, a3Val;
+- __m512 b0Val, b1Val, b2Val, b3Val;
+-
+- __m512 dotProdVal0 = _mm512_setzero_ps();
+- __m512 dotProdVal1 = _mm512_setzero_ps();
+- __m512 dotProdVal2 = _mm512_setzero_ps();
+- __m512 dotProdVal3 = _mm512_setzero_ps();
+-
+- for(;number < sixtyfourthPoints; number++){
+-
+- a0Val = _mm512_loadu_ps(aPtr);
+- a1Val = _mm512_loadu_ps(aPtr+16);
+- a2Val = _mm512_loadu_ps(aPtr+32);
+- a3Val = _mm512_loadu_ps(aPtr+48);
+- b0Val = _mm512_loadu_ps(bPtr);
+- b1Val = _mm512_loadu_ps(bPtr+16);
+- b2Val = _mm512_loadu_ps(bPtr+32);
+- b3Val = _mm512_loadu_ps(bPtr+48);
+-
+- dotProdVal0 = _mm512_fmadd_ps(a0Val, b0Val, dotProdVal0);
+- dotProdVal1 = _mm512_fmadd_ps(a1Val, b1Val, dotProdVal1);
+- dotProdVal2 = _mm512_fmadd_ps(a2Val, b2Val, dotProdVal2);
+- dotProdVal3 = _mm512_fmadd_ps(a3Val, b3Val, dotProdVal3);
+-
+- aPtr += 64;
+- bPtr += 64;
+- }
+-
+- dotProdVal0 = _mm512_add_ps(dotProdVal0, dotProdVal1);
+- dotProdVal0 = _mm512_add_ps(dotProdVal0, dotProdVal2);
+- dotProdVal0 = _mm512_add_ps(dotProdVal0, dotProdVal3);
+-
+- __VOLK_ATTR_ALIGNED(64) float dotProductVector[16];
+-
+- _mm512_storeu_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
+-
+- dotProduct = dotProductVector[0];
+- dotProduct += dotProductVector[1];
+- dotProduct += dotProductVector[2];
+- dotProduct += dotProductVector[3];
+- dotProduct += dotProductVector[4];
+- dotProduct += dotProductVector[5];
+- dotProduct += dotProductVector[6];
+- dotProduct += dotProductVector[7];
+- dotProduct += dotProductVector[8];
+- dotProduct += dotProductVector[9];
+- dotProduct += dotProductVector[10];
+- dotProduct += dotProductVector[11];
+- dotProduct += dotProductVector[12];
+- dotProduct += dotProductVector[13];
+- dotProduct += dotProductVector[14];
+- dotProduct += dotProductVector[15];
+-
+- number = sixtyfourthPoints*64;
+- for(;number < num_points; number++){
+- dotProduct += ((*aPtr++) * (*bPtr++));
+- }
+-
+- *result = (short)dotProduct;
++static inline void volk_32f_x2_dot_prod_16i_u_avx512f(int16_t* result,
++ const float* input,
++ const float* taps,
++ unsigned int num_points)
++{
++
++ unsigned int number = 0;
++ const unsigned int sixtyfourthPoints = num_points / 64;
++
++ float dotProduct = 0;
++ const float* aPtr = input;
++ const float* bPtr = taps;
++
++ __m512 a0Val, a1Val, a2Val, a3Val;
++ __m512 b0Val, b1Val, b2Val, b3Val;
++
++ __m512 dotProdVal0 = _mm512_setzero_ps();
++ __m512 dotProdVal1 = _mm512_setzero_ps();
++ __m512 dotProdVal2 = _mm512_setzero_ps();
++ __m512 dotProdVal3 = _mm512_setzero_ps();
++
++ for (; number < sixtyfourthPoints; number++) {
++
++ a0Val = _mm512_loadu_ps(aPtr);
++ a1Val = _mm512_loadu_ps(aPtr + 16);
++ a2Val = _mm512_loadu_ps(aPtr + 32);
++ a3Val = _mm512_loadu_ps(aPtr + 48);
++ b0Val = _mm512_loadu_ps(bPtr);
++ b1Val = _mm512_loadu_ps(bPtr + 16);
++ b2Val = _mm512_loadu_ps(bPtr + 32);
++ b3Val = _mm512_loadu_ps(bPtr + 48);
++
++ dotProdVal0 = _mm512_fmadd_ps(a0Val, b0Val, dotProdVal0);
++ dotProdVal1 = _mm512_fmadd_ps(a1Val, b1Val, dotProdVal1);
++ dotProdVal2 = _mm512_fmadd_ps(a2Val, b2Val, dotProdVal2);
++ dotProdVal3 = _mm512_fmadd_ps(a3Val, b3Val, dotProdVal3);
++
++ aPtr += 64;
++ bPtr += 64;
++ }
++
++ dotProdVal0 = _mm512_add_ps(dotProdVal0, dotProdVal1);
++ dotProdVal0 = _mm512_add_ps(dotProdVal0, dotProdVal2);
++ dotProdVal0 = _mm512_add_ps(dotProdVal0, dotProdVal3);
++
++ __VOLK_ATTR_ALIGNED(64) float dotProductVector[16];
++
++ _mm512_storeu_ps(dotProductVector,
++ dotProdVal0); // Store the results back into the dot product vector
++
++ dotProduct = dotProductVector[0];
++ dotProduct += dotProductVector[1];
++ dotProduct += dotProductVector[2];
++ dotProduct += dotProductVector[3];
++ dotProduct += dotProductVector[4];
++ dotProduct += dotProductVector[5];
++ dotProduct += dotProductVector[6];
++ dotProduct += dotProductVector[7];
++ dotProduct += dotProductVector[8];
++ dotProduct += dotProductVector[9];
++ dotProduct += dotProductVector[10];
++ dotProduct += dotProductVector[11];
++ dotProduct += dotProductVector[12];
++ dotProduct += dotProductVector[13];
++ dotProduct += dotProductVector[14];
++ dotProduct += dotProductVector[15];
++
++ number = sixtyfourthPoints * 64;
++ for (; number < num_points; number++) {
++ dotProduct += ((*aPtr++) * (*bPtr++));
++ }
++
++ *result = (short)dotProduct;
+ }
+
+ #endif /*LV_HAVE_AVX512F*/
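Every volk_32f_x2_dot_prod_16i implementation above accumulates in float and then casts straight to a 16-bit integer, with no rounding or saturation, so callers are expected to keep the true dot product inside the int16_t range. A minimal caller sketch, assuming the generated volk_32f_x2_dot_prod_16i dispatcher and the usual VOLK allocation helpers:

#include <stdint.h>
#include <stdio.h>
#include <volk/volk.h>

int main(void)
{
    unsigned int N = 16;
    unsigned int alignment = volk_get_alignment();
    float* input = (float*)volk_malloc(sizeof(float) * N, alignment);
    float* taps = (float*)volk_malloc(sizeof(float) * N, alignment);
    int16_t result = 0;

    for (unsigned int i = 0; i < N; i++) {
        input[i] = (float)i; /* 0, 1, ..., 15 */
        taps[i] = 1.0f;
    }

    volk_32f_x2_dot_prod_16i(&result, input, taps, N); /* 0+1+...+15 = 120 */
    printf("dot product = %d\n", (int)result);

    volk_free(input);
    volk_free(taps);
    return 0;
}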
+diff --git a/kernels/volk/volk_32f_x2_dot_prod_32f.h b/kernels/volk/volk_32f_x2_dot_prod_32f.h
+index ea0f7ba..7854031 100644
+--- a/kernels/volk/volk_32f_x2_dot_prod_32f.h
++++ b/kernels/volk/volk_32f_x2_dot_prod_32f.h
+@@ -33,8 +33,8 @@
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+- * void volk_32f_x2_dot_prod_32f(float* result, const float* input, const float* taps, unsigned int num_points)
+- * \endcode
++ * void volk_32f_x2_dot_prod_32f(float* result, const float* input, const float* taps,
++ *                               unsigned int num_points)
++ * \endcode
+ *
+ * \b Inputs
+ * \li input: vector of floats.
+@@ -45,10 +45,8 @@
+ * \li result: pointer to a float value to hold the dot product result.
+ *
+ * \b Example
+- * Take the dot product of an increasing vector and a vector of ones. The result is the sum of integers (0,9).
+- * \code
+- * int N = 10;
+- * unsigned int alignment = volk_get_alignment();
++ * Take the dot product of an increasing vector and a vector of ones. The result is the
++ * sum of the integers 0 through 9.
++ * \code
++ * int N = 10;
++ * unsigned int alignment = volk_get_alignment();
+ * float* increasing = (float*)volk_malloc(sizeof(float)*N, alignment);
+ * float* ones = (float*)volk_malloc(sizeof(float)*N, alignment);
+ * float* out = (float*)volk_malloc(sizeof(float)*1, alignment);
+@@ -73,25 +71,29 @@
+ #ifndef INCLUDED_volk_32f_x2_dot_prod_32f_u_H
+ #define INCLUDED_volk_32f_x2_dot_prod_32f_u_H
+
++#include <stdio.h>
+ #include <volk/volk_common.h>
+-#include<stdio.h>
+
+
+ #ifdef LV_HAVE_GENERIC
+
+
+-static inline void volk_32f_x2_dot_prod_32f_generic(float * result, const float * input, const float * taps, unsigned int num_points) {
++static inline void volk_32f_x2_dot_prod_32f_generic(float* result,
++ const float* input,
++ const float* taps,
++ unsigned int num_points)
++{
+
+- float dotProduct = 0;
+- const float* aPtr = input;
+- const float* bPtr= taps;
+- unsigned int number = 0;
++ float dotProduct = 0;
++ const float* aPtr = input;
++ const float* bPtr = taps;
++ unsigned int number = 0;
+
+- for(number = 0; number < num_points; number++){
+- dotProduct += ((*aPtr++) * (*bPtr++));
+- }
++ for (number = 0; number < num_points; number++) {
++ dotProduct += ((*aPtr++) * (*bPtr++));
++ }
+
+- *result = dotProduct;
++ *result = dotProduct;
+ }
+
+ #endif /*LV_HAVE_GENERIC*/
+@@ -100,69 +102,73 @@ static inline void volk_32f_x2_dot_prod_32f_generic(float * result, const float
+ #ifdef LV_HAVE_SSE
+
+
+-static inline void volk_32f_x2_dot_prod_32f_u_sse( float* result, const float* input, const float* taps, unsigned int num_points) {
+-
+- unsigned int number = 0;
+- const unsigned int sixteenthPoints = num_points / 16;
+-
+- float dotProduct = 0;
+- const float* aPtr = input;
+- const float* bPtr = taps;
++static inline void volk_32f_x2_dot_prod_32f_u_sse(float* result,
++ const float* input,
++ const float* taps,
++ unsigned int num_points)
++{
+
+- __m128 a0Val, a1Val, a2Val, a3Val;
+- __m128 b0Val, b1Val, b2Val, b3Val;
+- __m128 c0Val, c1Val, c2Val, c3Val;
+-
+- __m128 dotProdVal0 = _mm_setzero_ps();
+- __m128 dotProdVal1 = _mm_setzero_ps();
+- __m128 dotProdVal2 = _mm_setzero_ps();
+- __m128 dotProdVal3 = _mm_setzero_ps();
+-
+- for(;number < sixteenthPoints; number++){
+-
+- a0Val = _mm_loadu_ps(aPtr);
+- a1Val = _mm_loadu_ps(aPtr+4);
+- a2Val = _mm_loadu_ps(aPtr+8);
+- a3Val = _mm_loadu_ps(aPtr+12);
+- b0Val = _mm_loadu_ps(bPtr);
+- b1Val = _mm_loadu_ps(bPtr+4);
+- b2Val = _mm_loadu_ps(bPtr+8);
+- b3Val = _mm_loadu_ps(bPtr+12);
+-
+- c0Val = _mm_mul_ps(a0Val, b0Val);
+- c1Val = _mm_mul_ps(a1Val, b1Val);
+- c2Val = _mm_mul_ps(a2Val, b2Val);
+- c3Val = _mm_mul_ps(a3Val, b3Val);
+-
+- dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
+- dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
+- dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
+- dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
++ unsigned int number = 0;
++ const unsigned int sixteenthPoints = num_points / 16;
+
+- aPtr += 16;
+- bPtr += 16;
+- }
++ float dotProduct = 0;
++ const float* aPtr = input;
++ const float* bPtr = taps;
++
++ __m128 a0Val, a1Val, a2Val, a3Val;
++ __m128 b0Val, b1Val, b2Val, b3Val;
++ __m128 c0Val, c1Val, c2Val, c3Val;
++
++ __m128 dotProdVal0 = _mm_setzero_ps();
++ __m128 dotProdVal1 = _mm_setzero_ps();
++ __m128 dotProdVal2 = _mm_setzero_ps();
++ __m128 dotProdVal3 = _mm_setzero_ps();
++
++ for (; number < sixteenthPoints; number++) {
++
++ a0Val = _mm_loadu_ps(aPtr);
++ a1Val = _mm_loadu_ps(aPtr + 4);
++ a2Val = _mm_loadu_ps(aPtr + 8);
++ a3Val = _mm_loadu_ps(aPtr + 12);
++ b0Val = _mm_loadu_ps(bPtr);
++ b1Val = _mm_loadu_ps(bPtr + 4);
++ b2Val = _mm_loadu_ps(bPtr + 8);
++ b3Val = _mm_loadu_ps(bPtr + 12);
++
++ c0Val = _mm_mul_ps(a0Val, b0Val);
++ c1Val = _mm_mul_ps(a1Val, b1Val);
++ c2Val = _mm_mul_ps(a2Val, b2Val);
++ c3Val = _mm_mul_ps(a3Val, b3Val);
++
++ dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
++ dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
++ dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
++ dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
+
+- dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
+- dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
+- dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
++ aPtr += 16;
++ bPtr += 16;
++ }
+
+- __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
++ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
++ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
++ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
+
+- _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
++ __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
+
+- dotProduct = dotProductVector[0];
+- dotProduct += dotProductVector[1];
+- dotProduct += dotProductVector[2];
+- dotProduct += dotProductVector[3];
++ _mm_store_ps(dotProductVector,
++ dotProdVal0); // Store the results back into the dot product vector
+
+- number = sixteenthPoints*16;
+- for(;number < num_points; number++){
+- dotProduct += ((*aPtr++) * (*bPtr++));
+- }
++ dotProduct = dotProductVector[0];
++ dotProduct += dotProductVector[1];
++ dotProduct += dotProductVector[2];
++ dotProduct += dotProductVector[3];
+
+- *result = dotProduct;
++ number = sixteenthPoints * 16;
++ for (; number < num_points; number++) {
++ dotProduct += ((*aPtr++) * (*bPtr++));
++ }
+
++ *result = dotProduct;
+ }
+
+ #endif /*LV_HAVE_SSE*/
+@@ -171,127 +177,145 @@ static inline void volk_32f_x2_dot_prod_32f_u_sse( float* result, const float*
+
+ #include <pmmintrin.h>
+
+-static inline void volk_32f_x2_dot_prod_32f_u_sse3(float * result, const float * input, const float * taps, unsigned int num_points) {
+- unsigned int number = 0;
+- const unsigned int sixteenthPoints = num_points / 16;
+-
+- float dotProduct = 0;
+- const float* aPtr = input;
+- const float* bPtr = taps;
+-
+- __m128 a0Val, a1Val, a2Val, a3Val;
+- __m128 b0Val, b1Val, b2Val, b3Val;
+- __m128 c0Val, c1Val, c2Val, c3Val;
+-
+- __m128 dotProdVal0 = _mm_setzero_ps();
+- __m128 dotProdVal1 = _mm_setzero_ps();
+- __m128 dotProdVal2 = _mm_setzero_ps();
+- __m128 dotProdVal3 = _mm_setzero_ps();
+-
+- for(;number < sixteenthPoints; number++){
+-
+- a0Val = _mm_loadu_ps(aPtr);
+- a1Val = _mm_loadu_ps(aPtr+4);
+- a2Val = _mm_loadu_ps(aPtr+8);
+- a3Val = _mm_loadu_ps(aPtr+12);
+- b0Val = _mm_loadu_ps(bPtr);
+- b1Val = _mm_loadu_ps(bPtr+4);
+- b2Val = _mm_loadu_ps(bPtr+8);
+- b3Val = _mm_loadu_ps(bPtr+12);
+-
+- c0Val = _mm_mul_ps(a0Val, b0Val);
+- c1Val = _mm_mul_ps(a1Val, b1Val);
+- c2Val = _mm_mul_ps(a2Val, b2Val);
+- c3Val = _mm_mul_ps(a3Val, b3Val);
+-
+- dotProdVal0 = _mm_add_ps(dotProdVal0, c0Val);
+- dotProdVal1 = _mm_add_ps(dotProdVal1, c1Val);
+- dotProdVal2 = _mm_add_ps(dotProdVal2, c2Val);
+- dotProdVal3 = _mm_add_ps(dotProdVal3, c3Val);
+-
+- aPtr += 16;
+- bPtr += 16;
+- }
+-
+- dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
+- dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
+- dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
+-
+- __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
+- _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
+-
+- dotProduct = dotProductVector[0];
+- dotProduct += dotProductVector[1];
+- dotProduct += dotProductVector[2];
+- dotProduct += dotProductVector[3];
+-
+- number = sixteenthPoints*16;
+- for(;number < num_points; number++){
+- dotProduct += ((*aPtr++) * (*bPtr++));
+- }
+-
+- *result = dotProduct;
+-}
+-
+-#endif /*LV_HAVE_SSE3*/
++static inline void volk_32f_x2_dot_prod_32f_u_sse3(float* result,
++ const float* input,
++ const float* taps,
++ unsigned int num_points)
++{
++ unsigned int number = 0;
++ const unsigned int sixteenthPoints = num_points / 16;
+
+-#ifdef LV_HAVE_SSE4_1
++ float dotProduct = 0;
++ const float* aPtr = input;
++ const float* bPtr = taps;
++
++ __m128 a0Val, a1Val, a2Val, a3Val;
++ __m128 b0Val, b1Val, b2Val, b3Val;
++ __m128 c0Val, c1Val, c2Val, c3Val;
++
++ __m128 dotProdVal0 = _mm_setzero_ps();
++ __m128 dotProdVal1 = _mm_setzero_ps();
++ __m128 dotProdVal2 = _mm_setzero_ps();
++ __m128 dotProdVal3 = _mm_setzero_ps();
++
++ for (; number < sixteenthPoints; number++) {
++
++ a0Val = _mm_loadu_ps(aPtr);
++ a1Val = _mm_loadu_ps(aPtr + 4);
++ a2Val = _mm_loadu_ps(aPtr + 8);
++ a3Val = _mm_loadu_ps(aPtr + 12);
++ b0Val = _mm_loadu_ps(bPtr);
++ b1Val = _mm_loadu_ps(bPtr + 4);
++ b2Val = _mm_loadu_ps(bPtr + 8);
++ b3Val = _mm_loadu_ps(bPtr + 12);
++
++ c0Val = _mm_mul_ps(a0Val, b0Val);
++ c1Val = _mm_mul_ps(a1Val, b1Val);
++ c2Val = _mm_mul_ps(a2Val, b2Val);
++ c3Val = _mm_mul_ps(a3Val, b3Val);
++
++ dotProdVal0 = _mm_add_ps(dotProdVal0, c0Val);
++ dotProdVal1 = _mm_add_ps(dotProdVal1, c1Val);
++ dotProdVal2 = _mm_add_ps(dotProdVal2, c2Val);
++ dotProdVal3 = _mm_add_ps(dotProdVal3, c3Val);
+
+-#include <smmintrin.h>
++ aPtr += 16;
++ bPtr += 16;
++ }
+
+-static inline void volk_32f_x2_dot_prod_32f_u_sse4_1(float * result, const float * input, const float* taps, unsigned int num_points) {
+- unsigned int number = 0;
+- const unsigned int sixteenthPoints = num_points / 16;
++ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
++ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
++ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
+
+- float dotProduct = 0;
+- const float* aPtr = input;
+- const float* bPtr = taps;
++ __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
++ _mm_store_ps(dotProductVector,
++ dotProdVal0); // Store the results back into the dot product vector
+
+- __m128 aVal1, bVal1, cVal1;
+- __m128 aVal2, bVal2, cVal2;
+- __m128 aVal3, bVal3, cVal3;
+- __m128 aVal4, bVal4, cVal4;
++ dotProduct = dotProductVector[0];
++ dotProduct += dotProductVector[1];
++ dotProduct += dotProductVector[2];
++ dotProduct += dotProductVector[3];
+
+- __m128 dotProdVal = _mm_setzero_ps();
++ number = sixteenthPoints * 16;
++ for (; number < num_points; number++) {
++ dotProduct += ((*aPtr++) * (*bPtr++));
++ }
+
+- for(;number < sixteenthPoints; number++){
++ *result = dotProduct;
++}
+
+- aVal1 = _mm_loadu_ps(aPtr); aPtr += 4;
+- aVal2 = _mm_loadu_ps(aPtr); aPtr += 4;
+- aVal3 = _mm_loadu_ps(aPtr); aPtr += 4;
+- aVal4 = _mm_loadu_ps(aPtr); aPtr += 4;
++#endif /*LV_HAVE_SSE3*/
+
+- bVal1 = _mm_loadu_ps(bPtr); bPtr += 4;
+- bVal2 = _mm_loadu_ps(bPtr); bPtr += 4;
+- bVal3 = _mm_loadu_ps(bPtr); bPtr += 4;
+- bVal4 = _mm_loadu_ps(bPtr); bPtr += 4;
++#ifdef LV_HAVE_SSE4_1
+
+- cVal1 = _mm_dp_ps(aVal1, bVal1, 0xF1);
+- cVal2 = _mm_dp_ps(aVal2, bVal2, 0xF2);
+- cVal3 = _mm_dp_ps(aVal3, bVal3, 0xF4);
+- cVal4 = _mm_dp_ps(aVal4, bVal4, 0xF8);
++#include <smmintrin.h>
+
+- cVal1 = _mm_or_ps(cVal1, cVal2);
+- cVal3 = _mm_or_ps(cVal3, cVal4);
+- cVal1 = _mm_or_ps(cVal1, cVal3);
++static inline void volk_32f_x2_dot_prod_32f_u_sse4_1(float* result,
++ const float* input,
++ const float* taps,
++ unsigned int num_points)
++{
++ unsigned int number = 0;
++ const unsigned int sixteenthPoints = num_points / 16;
+
+- dotProdVal = _mm_add_ps(dotProdVal, cVal1);
+- }
++ float dotProduct = 0;
++ const float* aPtr = input;
++ const float* bPtr = taps;
++
++ __m128 aVal1, bVal1, cVal1;
++ __m128 aVal2, bVal2, cVal2;
++ __m128 aVal3, bVal3, cVal3;
++ __m128 aVal4, bVal4, cVal4;
++
++ __m128 dotProdVal = _mm_setzero_ps();
++
++ for (; number < sixteenthPoints; number++) {
++
++ aVal1 = _mm_loadu_ps(aPtr);
++ aPtr += 4;
++ aVal2 = _mm_loadu_ps(aPtr);
++ aPtr += 4;
++ aVal3 = _mm_loadu_ps(aPtr);
++ aPtr += 4;
++ aVal4 = _mm_loadu_ps(aPtr);
++ aPtr += 4;
++
++ bVal1 = _mm_loadu_ps(bPtr);
++ bPtr += 4;
++ bVal2 = _mm_loadu_ps(bPtr);
++ bPtr += 4;
++ bVal3 = _mm_loadu_ps(bPtr);
++ bPtr += 4;
++ bVal4 = _mm_loadu_ps(bPtr);
++ bPtr += 4;
++
++ cVal1 = _mm_dp_ps(aVal1, bVal1, 0xF1);
++ cVal2 = _mm_dp_ps(aVal2, bVal2, 0xF2);
++ cVal3 = _mm_dp_ps(aVal3, bVal3, 0xF4);
++ cVal4 = _mm_dp_ps(aVal4, bVal4, 0xF8);
++
++ cVal1 = _mm_or_ps(cVal1, cVal2);
++ cVal3 = _mm_or_ps(cVal3, cVal4);
++ cVal1 = _mm_or_ps(cVal1, cVal3);
++
++ dotProdVal = _mm_add_ps(dotProdVal, cVal1);
++ }
+
+- __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
+- _mm_store_ps(dotProductVector, dotProdVal); // Store the results back into the dot product vector
++ __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
++ _mm_store_ps(dotProductVector,
++ dotProdVal); // Store the results back into the dot product vector
+
+- dotProduct = dotProductVector[0];
+- dotProduct += dotProductVector[1];
+- dotProduct += dotProductVector[2];
+- dotProduct += dotProductVector[3];
++ dotProduct = dotProductVector[0];
++ dotProduct += dotProductVector[1];
++ dotProduct += dotProductVector[2];
++ dotProduct += dotProductVector[3];
+
+- number = sixteenthPoints * 16;
+- for(;number < num_points; number++){
+- dotProduct += ((*aPtr++) * (*bPtr++));
+- }
++ number = sixteenthPoints * 16;
++ for (; number < num_points; number++) {
++ dotProduct += ((*aPtr++) * (*bPtr++));
++ }
+
+- *result = dotProduct;
++ *result = dotProduct;
+ }
+
+ #endif /*LV_HAVE_SSE4_1*/
+@@ -300,147 +324,154 @@ static inline void volk_32f_x2_dot_prod_32f_u_sse4_1(float * result, const float
+
+ #include <immintrin.h>
+
+-static inline void volk_32f_x2_dot_prod_32f_u_avx( float* result, const float* input, const float* taps, unsigned int num_points) {
++static inline void volk_32f_x2_dot_prod_32f_u_avx(float* result,
++ const float* input,
++ const float* taps,
++ unsigned int num_points)
++{
+
+- unsigned int number = 0;
+- const unsigned int sixteenthPoints = num_points / 16;
+-
+- float dotProduct = 0;
+- const float* aPtr = input;
+- const float* bPtr = taps;
++ unsigned int number = 0;
++ const unsigned int sixteenthPoints = num_points / 16;
+
+- __m256 a0Val, a1Val;
+- __m256 b0Val, b1Val;
+- __m256 c0Val, c1Val;
++ float dotProduct = 0;
++ const float* aPtr = input;
++ const float* bPtr = taps;
+
+- __m256 dotProdVal0 = _mm256_setzero_ps();
+- __m256 dotProdVal1 = _mm256_setzero_ps();
++ __m256 a0Val, a1Val;
++ __m256 b0Val, b1Val;
++ __m256 c0Val, c1Val;
+
+- for(;number < sixteenthPoints; number++){
++ __m256 dotProdVal0 = _mm256_setzero_ps();
++ __m256 dotProdVal1 = _mm256_setzero_ps();
+
+- a0Val = _mm256_loadu_ps(aPtr);
+- a1Val = _mm256_loadu_ps(aPtr+8);
+- b0Val = _mm256_loadu_ps(bPtr);
+- b1Val = _mm256_loadu_ps(bPtr+8);
++ for (; number < sixteenthPoints; number++) {
+
+- c0Val = _mm256_mul_ps(a0Val, b0Val);
+- c1Val = _mm256_mul_ps(a1Val, b1Val);
++ a0Val = _mm256_loadu_ps(aPtr);
++ a1Val = _mm256_loadu_ps(aPtr + 8);
++ b0Val = _mm256_loadu_ps(bPtr);
++ b1Val = _mm256_loadu_ps(bPtr + 8);
+
+- dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
+- dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);
++ c0Val = _mm256_mul_ps(a0Val, b0Val);
++ c1Val = _mm256_mul_ps(a1Val, b1Val);
+
+- aPtr += 16;
+- bPtr += 16;
+- }
++ dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
++ dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);
+
+- dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
++ aPtr += 16;
++ bPtr += 16;
++ }
+
+- __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
++ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
+
+- _mm256_storeu_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
++ __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
+
+- dotProduct = dotProductVector[0];
+- dotProduct += dotProductVector[1];
+- dotProduct += dotProductVector[2];
+- dotProduct += dotProductVector[3];
+- dotProduct += dotProductVector[4];
+- dotProduct += dotProductVector[5];
+- dotProduct += dotProductVector[6];
+- dotProduct += dotProductVector[7];
++ _mm256_storeu_ps(dotProductVector,
++ dotProdVal0); // Store the results back into the dot product vector
+
+- number = sixteenthPoints*16;
+- for(;number < num_points; number++){
+- dotProduct += ((*aPtr++) * (*bPtr++));
+- }
++ dotProduct = dotProductVector[0];
++ dotProduct += dotProductVector[1];
++ dotProduct += dotProductVector[2];
++ dotProduct += dotProductVector[3];
++ dotProduct += dotProductVector[4];
++ dotProduct += dotProductVector[5];
++ dotProduct += dotProductVector[6];
++ dotProduct += dotProductVector[7];
+
+- *result = dotProduct;
++ number = sixteenthPoints * 16;
++ for (; number < num_points; number++) {
++ dotProduct += ((*aPtr++) * (*bPtr++));
++ }
+
++ *result = dotProduct;
+ }
+
+ #endif /*LV_HAVE_AVX*/
+
+ #if LV_HAVE_AVX2 && LV_HAVE_FMA
+ #include <immintrin.h>
+-static inline void volk_32f_x2_dot_prod_32f_u_avx2_fma(float * result, const float * input, const float* taps, unsigned int num_points){
+- unsigned int number;
+- const unsigned int eighthPoints = num_points / 8;
++static inline void volk_32f_x2_dot_prod_32f_u_avx2_fma(float* result,
++ const float* input,
++ const float* taps,
++ unsigned int num_points)
++{
++ unsigned int number;
++ const unsigned int eighthPoints = num_points / 8;
+
+- const float* aPtr = input;
+- const float* bPtr = taps;
+-
+- __m256 dotProdVal = _mm256_setzero_ps();
+- __m256 aVal1, bVal1;
++ const float* aPtr = input;
++ const float* bPtr = taps;
+
+- for (number = 0; number < eighthPoints; number++ ) {
++ __m256 dotProdVal = _mm256_setzero_ps();
++ __m256 aVal1, bVal1;
+
+- aVal1 = _mm256_loadu_ps(aPtr);
+- bVal1 = _mm256_loadu_ps(bPtr);
+- aPtr += 8;
+- bPtr += 8;
++ for (number = 0; number < eighthPoints; number++) {
+
+- dotProdVal = _mm256_fmadd_ps(aVal1, bVal1, dotProdVal);
+- }
++ aVal1 = _mm256_loadu_ps(aPtr);
++ bVal1 = _mm256_loadu_ps(bPtr);
++ aPtr += 8;
++ bPtr += 8;
+
+- __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
+- _mm256_storeu_ps(dotProductVector, dotProdVal); // Store the results back into the dot product vector
+- _mm256_zeroupper();
++ dotProdVal = _mm256_fmadd_ps(aVal1, bVal1, dotProdVal);
++ }
+
+- float dotProduct =
+- dotProductVector[0] + dotProductVector[1] +
+- dotProductVector[2] + dotProductVector[3] +
+- dotProductVector[4] + dotProductVector[5] +
+- dotProductVector[6] + dotProductVector[7];
++ __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
++ _mm256_storeu_ps(dotProductVector,
++ dotProdVal); // Store the results back into the dot product vector
++ _mm256_zeroupper();
+
+- for(number = eighthPoints * 8; number < num_points; number++){
+- dotProduct += ((*aPtr++) * (*bPtr++));
+- }
++ float dotProduct = dotProductVector[0] + dotProductVector[1] + dotProductVector[2] +
++ dotProductVector[3] + dotProductVector[4] + dotProductVector[5] +
++ dotProductVector[6] + dotProductVector[7];
+
+- *result = dotProduct;
++ for (number = eighthPoints * 8; number < num_points; number++) {
++ dotProduct += ((*aPtr++) * (*bPtr++));
++ }
+
++ *result = dotProduct;
+ }
+ #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA */
+
+ #if LV_HAVE_AVX512F
+ #include <immintrin.h>
+-static inline void volk_32f_x2_dot_prod_32f_u_avx512f(float * result, const float * input, const float* taps, unsigned int num_points){
+- unsigned int number;
+- const unsigned int sixteenthPoints = num_points / 16;
+-
+- const float* aPtr = input;
+- const float* bPtr = taps;
++static inline void volk_32f_x2_dot_prod_32f_u_avx512f(float* result,
++ const float* input,
++ const float* taps,
++ unsigned int num_points)
++{
++ unsigned int number;
++ const unsigned int sixteenthPoints = num_points / 16;
+
+- __m512 dotProdVal = _mm512_setzero_ps();
+- __m512 aVal1, bVal1;
++ const float* aPtr = input;
++ const float* bPtr = taps;
+
+- for (number = 0; number < sixteenthPoints; number++ ) {
++ __m512 dotProdVal = _mm512_setzero_ps();
++ __m512 aVal1, bVal1;
+
+- aVal1 = _mm512_loadu_ps(aPtr);
+- bVal1 = _mm512_loadu_ps(bPtr);
+- aPtr += 16;
+- bPtr += 16;
++ for (number = 0; number < sixteenthPoints; number++) {
+
+- dotProdVal = _mm512_fmadd_ps(aVal1, bVal1, dotProdVal);
+- }
++ aVal1 = _mm512_loadu_ps(aPtr);
++ bVal1 = _mm512_loadu_ps(bPtr);
++ aPtr += 16;
++ bPtr += 16;
+
+- __VOLK_ATTR_ALIGNED(64) float dotProductVector[16];
+- _mm512_storeu_ps(dotProductVector, dotProdVal); // Store the results back into the dot product vector
++ dotProdVal = _mm512_fmadd_ps(aVal1, bVal1, dotProdVal);
++ }
+
+- float dotProduct =
+- dotProductVector[0] + dotProductVector[1] +
+- dotProductVector[2] + dotProductVector[3] +
+- dotProductVector[4] + dotProductVector[5] +
+- dotProductVector[6] + dotProductVector[7] +
+- dotProductVector[8] + dotProductVector[9] +
+- dotProductVector[10] + dotProductVector[11] +
+- dotProductVector[12] + dotProductVector[13] +
+- dotProductVector[14] + dotProductVector[15];
++ __VOLK_ATTR_ALIGNED(64) float dotProductVector[16];
++ _mm512_storeu_ps(dotProductVector,
++ dotProdVal); // Store the results back into the dot product vector
+
+- for(number = sixteenthPoints * 16; number < num_points; number++){
+- dotProduct += ((*aPtr++) * (*bPtr++));
+- }
++ float dotProduct = dotProductVector[0] + dotProductVector[1] + dotProductVector[2] +
++ dotProductVector[3] + dotProductVector[4] + dotProductVector[5] +
++ dotProductVector[6] + dotProductVector[7] + dotProductVector[8] +
++ dotProductVector[9] + dotProductVector[10] + dotProductVector[11] +
++ dotProductVector[12] + dotProductVector[13] +
++ dotProductVector[14] + dotProductVector[15];
+
+- *result = dotProduct;
++ for (number = sixteenthPoints * 16; number < num_points; number++) {
++ dotProduct += ((*aPtr++) * (*bPtr++));
++ }
+
++ *result = dotProduct;
+ }
+ #endif /* LV_HAVE_AVX512F */
+
+@@ -449,25 +480,29 @@ static inline void volk_32f_x2_dot_prod_32f_u_avx512f(float * result, const floa
+ #ifndef INCLUDED_volk_32f_x2_dot_prod_32f_a_H
+ #define INCLUDED_volk_32f_x2_dot_prod_32f_a_H
+
++#include <stdio.h>
+ #include <volk/volk_common.h>
+-#include<stdio.h>
+
+
+ #ifdef LV_HAVE_GENERIC
+
+
+-static inline void volk_32f_x2_dot_prod_32f_a_generic(float * result, const float * input, const float * taps, unsigned int num_points) {
++static inline void volk_32f_x2_dot_prod_32f_a_generic(float* result,
++ const float* input,
++ const float* taps,
++ unsigned int num_points)
++{
+
+- float dotProduct = 0;
+- const float* aPtr = input;
+- const float* bPtr= taps;
+- unsigned int number = 0;
++ float dotProduct = 0;
++ const float* aPtr = input;
++ const float* bPtr = taps;
++ unsigned int number = 0;
+
+- for(number = 0; number < num_points; number++){
+- dotProduct += ((*aPtr++) * (*bPtr++));
+- }
++ for (number = 0; number < num_points; number++) {
++ dotProduct += ((*aPtr++) * (*bPtr++));
++ }
+
+- *result = dotProduct;
++ *result = dotProduct;
+ }
+
+ #endif /*LV_HAVE_GENERIC*/
+@@ -476,69 +511,73 @@ static inline void volk_32f_x2_dot_prod_32f_a_generic(float * result, const floa
+ #ifdef LV_HAVE_SSE
+
+
+-static inline void volk_32f_x2_dot_prod_32f_a_sse( float* result, const float* input, const float* taps, unsigned int num_points) {
+-
+- unsigned int number = 0;
+- const unsigned int sixteenthPoints = num_points / 16;
+-
+- float dotProduct = 0;
+- const float* aPtr = input;
+- const float* bPtr = taps;
+-
+- __m128 a0Val, a1Val, a2Val, a3Val;
+- __m128 b0Val, b1Val, b2Val, b3Val;
+- __m128 c0Val, c1Val, c2Val, c3Val;
+-
+- __m128 dotProdVal0 = _mm_setzero_ps();
+- __m128 dotProdVal1 = _mm_setzero_ps();
+- __m128 dotProdVal2 = _mm_setzero_ps();
+- __m128 dotProdVal3 = _mm_setzero_ps();
++static inline void volk_32f_x2_dot_prod_32f_a_sse(float* result,
++ const float* input,
++ const float* taps,
++ unsigned int num_points)
++{
+
+- for(;number < sixteenthPoints; number++){
+-
+- a0Val = _mm_load_ps(aPtr);
+- a1Val = _mm_load_ps(aPtr+4);
+- a2Val = _mm_load_ps(aPtr+8);
+- a3Val = _mm_load_ps(aPtr+12);
+- b0Val = _mm_load_ps(bPtr);
+- b1Val = _mm_load_ps(bPtr+4);
+- b2Val = _mm_load_ps(bPtr+8);
+- b3Val = _mm_load_ps(bPtr+12);
+-
+- c0Val = _mm_mul_ps(a0Val, b0Val);
+- c1Val = _mm_mul_ps(a1Val, b1Val);
+- c2Val = _mm_mul_ps(a2Val, b2Val);
+- c3Val = _mm_mul_ps(a3Val, b3Val);
+-
+- dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
+- dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
+- dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
+- dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
++ unsigned int number = 0;
++ const unsigned int sixteenthPoints = num_points / 16;
+
+- aPtr += 16;
+- bPtr += 16;
+- }
++ float dotProduct = 0;
++ const float* aPtr = input;
++ const float* bPtr = taps;
++
++ __m128 a0Val, a1Val, a2Val, a3Val;
++ __m128 b0Val, b1Val, b2Val, b3Val;
++ __m128 c0Val, c1Val, c2Val, c3Val;
++
++ __m128 dotProdVal0 = _mm_setzero_ps();
++ __m128 dotProdVal1 = _mm_setzero_ps();
++ __m128 dotProdVal2 = _mm_setzero_ps();
++ __m128 dotProdVal3 = _mm_setzero_ps();
++
++ for (; number < sixteenthPoints; number++) {
++
++ a0Val = _mm_load_ps(aPtr);
++ a1Val = _mm_load_ps(aPtr + 4);
++ a2Val = _mm_load_ps(aPtr + 8);
++ a3Val = _mm_load_ps(aPtr + 12);
++ b0Val = _mm_load_ps(bPtr);
++ b1Val = _mm_load_ps(bPtr + 4);
++ b2Val = _mm_load_ps(bPtr + 8);
++ b3Val = _mm_load_ps(bPtr + 12);
++
++ c0Val = _mm_mul_ps(a0Val, b0Val);
++ c1Val = _mm_mul_ps(a1Val, b1Val);
++ c2Val = _mm_mul_ps(a2Val, b2Val);
++ c3Val = _mm_mul_ps(a3Val, b3Val);
++
++ dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
++ dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
++ dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
++ dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
+
+- dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
+- dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
+- dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
++ aPtr += 16;
++ bPtr += 16;
++ }
+
+- __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
++ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
++ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
++ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
+
+- _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
++ __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
+
+- dotProduct = dotProductVector[0];
+- dotProduct += dotProductVector[1];
+- dotProduct += dotProductVector[2];
+- dotProduct += dotProductVector[3];
++ _mm_store_ps(dotProductVector,
++ dotProdVal0); // Store the results back into the dot product vector
+
+- number = sixteenthPoints*16;
+- for(;number < num_points; number++){
+- dotProduct += ((*aPtr++) * (*bPtr++));
+- }
++ dotProduct = dotProductVector[0];
++ dotProduct += dotProductVector[1];
++ dotProduct += dotProductVector[2];
++ dotProduct += dotProductVector[3];
+
+- *result = dotProduct;
++ number = sixteenthPoints * 16;
++ for (; number < num_points; number++) {
++ dotProduct += ((*aPtr++) * (*bPtr++));
++ }
+
++ *result = dotProduct;
+ }
+
+ #endif /*LV_HAVE_SSE*/
+@@ -547,127 +586,145 @@ static inline void volk_32f_x2_dot_prod_32f_a_sse( float* result, const float*
+
+ #include <pmmintrin.h>
+
+-static inline void volk_32f_x2_dot_prod_32f_a_sse3(float * result, const float * input, const float * taps, unsigned int num_points) {
+- unsigned int number = 0;
+- const unsigned int sixteenthPoints = num_points / 16;
+-
+- float dotProduct = 0;
+- const float* aPtr = input;
+- const float* bPtr = taps;
+-
+- __m128 a0Val, a1Val, a2Val, a3Val;
+- __m128 b0Val, b1Val, b2Val, b3Val;
+- __m128 c0Val, c1Val, c2Val, c3Val;
+-
+- __m128 dotProdVal0 = _mm_setzero_ps();
+- __m128 dotProdVal1 = _mm_setzero_ps();
+- __m128 dotProdVal2 = _mm_setzero_ps();
+- __m128 dotProdVal3 = _mm_setzero_ps();
+-
+- for(;number < sixteenthPoints; number++){
+-
+- a0Val = _mm_load_ps(aPtr);
+- a1Val = _mm_load_ps(aPtr+4);
+- a2Val = _mm_load_ps(aPtr+8);
+- a3Val = _mm_load_ps(aPtr+12);
+- b0Val = _mm_load_ps(bPtr);
+- b1Val = _mm_load_ps(bPtr+4);
+- b2Val = _mm_load_ps(bPtr+8);
+- b3Val = _mm_load_ps(bPtr+12);
+-
+- c0Val = _mm_mul_ps(a0Val, b0Val);
+- c1Val = _mm_mul_ps(a1Val, b1Val);
+- c2Val = _mm_mul_ps(a2Val, b2Val);
+- c3Val = _mm_mul_ps(a3Val, b3Val);
+-
+- dotProdVal0 = _mm_add_ps(dotProdVal0, c0Val);
+- dotProdVal1 = _mm_add_ps(dotProdVal1, c1Val);
+- dotProdVal2 = _mm_add_ps(dotProdVal2, c2Val);
+- dotProdVal3 = _mm_add_ps(dotProdVal3, c3Val);
+-
+- aPtr += 16;
+- bPtr += 16;
+- }
+-
+- dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
+- dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
+- dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
+-
+- __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
+- _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
+-
+- dotProduct = dotProductVector[0];
+- dotProduct += dotProductVector[1];
+- dotProduct += dotProductVector[2];
+- dotProduct += dotProductVector[3];
+-
+- number = sixteenthPoints*16;
+- for(;number < num_points; number++){
+- dotProduct += ((*aPtr++) * (*bPtr++));
+- }
+-
+- *result = dotProduct;
+-}
+-
+-#endif /*LV_HAVE_SSE3*/
++static inline void volk_32f_x2_dot_prod_32f_a_sse3(float* result,
++ const float* input,
++ const float* taps,
++ unsigned int num_points)
++{
++ unsigned int number = 0;
++ const unsigned int sixteenthPoints = num_points / 16;
+
+-#ifdef LV_HAVE_SSE4_1
++ float dotProduct = 0;
++ const float* aPtr = input;
++ const float* bPtr = taps;
++
++ __m128 a0Val, a1Val, a2Val, a3Val;
++ __m128 b0Val, b1Val, b2Val, b3Val;
++ __m128 c0Val, c1Val, c2Val, c3Val;
++
++ __m128 dotProdVal0 = _mm_setzero_ps();
++ __m128 dotProdVal1 = _mm_setzero_ps();
++ __m128 dotProdVal2 = _mm_setzero_ps();
++ __m128 dotProdVal3 = _mm_setzero_ps();
++
++ for (; number < sixteenthPoints; number++) {
++
++ a0Val = _mm_load_ps(aPtr);
++ a1Val = _mm_load_ps(aPtr + 4);
++ a2Val = _mm_load_ps(aPtr + 8);
++ a3Val = _mm_load_ps(aPtr + 12);
++ b0Val = _mm_load_ps(bPtr);
++ b1Val = _mm_load_ps(bPtr + 4);
++ b2Val = _mm_load_ps(bPtr + 8);
++ b3Val = _mm_load_ps(bPtr + 12);
++
++ c0Val = _mm_mul_ps(a0Val, b0Val);
++ c1Val = _mm_mul_ps(a1Val, b1Val);
++ c2Val = _mm_mul_ps(a2Val, b2Val);
++ c3Val = _mm_mul_ps(a3Val, b3Val);
++
++ dotProdVal0 = _mm_add_ps(dotProdVal0, c0Val);
++ dotProdVal1 = _mm_add_ps(dotProdVal1, c1Val);
++ dotProdVal2 = _mm_add_ps(dotProdVal2, c2Val);
++ dotProdVal3 = _mm_add_ps(dotProdVal3, c3Val);
+
+-#include <smmintrin.h>
++ aPtr += 16;
++ bPtr += 16;
++ }
+
+-static inline void volk_32f_x2_dot_prod_32f_a_sse4_1(float * result, const float * input, const float* taps, unsigned int num_points) {
+- unsigned int number = 0;
+- const unsigned int sixteenthPoints = num_points / 16;
++ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
++ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
++ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
+
+- float dotProduct = 0;
+- const float* aPtr = input;
+- const float* bPtr = taps;
++ __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
++ _mm_store_ps(dotProductVector,
++ dotProdVal0); // Store the results back into the dot product vector
+
+- __m128 aVal1, bVal1, cVal1;
+- __m128 aVal2, bVal2, cVal2;
+- __m128 aVal3, bVal3, cVal3;
+- __m128 aVal4, bVal4, cVal4;
++ dotProduct = dotProductVector[0];
++ dotProduct += dotProductVector[1];
++ dotProduct += dotProductVector[2];
++ dotProduct += dotProductVector[3];
+
+- __m128 dotProdVal = _mm_setzero_ps();
++ number = sixteenthPoints * 16;
++ for (; number < num_points; number++) {
++ dotProduct += ((*aPtr++) * (*bPtr++));
++ }
+
+- for(;number < sixteenthPoints; number++){
++ *result = dotProduct;
++}
+
+- aVal1 = _mm_load_ps(aPtr); aPtr += 4;
+- aVal2 = _mm_load_ps(aPtr); aPtr += 4;
+- aVal3 = _mm_load_ps(aPtr); aPtr += 4;
+- aVal4 = _mm_load_ps(aPtr); aPtr += 4;
++#endif /*LV_HAVE_SSE3*/
+
+- bVal1 = _mm_load_ps(bPtr); bPtr += 4;
+- bVal2 = _mm_load_ps(bPtr); bPtr += 4;
+- bVal3 = _mm_load_ps(bPtr); bPtr += 4;
+- bVal4 = _mm_load_ps(bPtr); bPtr += 4;
++#ifdef LV_HAVE_SSE4_1
+
+- cVal1 = _mm_dp_ps(aVal1, bVal1, 0xF1);
+- cVal2 = _mm_dp_ps(aVal2, bVal2, 0xF2);
+- cVal3 = _mm_dp_ps(aVal3, bVal3, 0xF4);
+- cVal4 = _mm_dp_ps(aVal4, bVal4, 0xF8);
++#include <smmintrin.h>
+
+- cVal1 = _mm_or_ps(cVal1, cVal2);
+- cVal3 = _mm_or_ps(cVal3, cVal4);
+- cVal1 = _mm_or_ps(cVal1, cVal3);
++static inline void volk_32f_x2_dot_prod_32f_a_sse4_1(float* result,
++ const float* input,
++ const float* taps,
++ unsigned int num_points)
++{
++ unsigned int number = 0;
++ const unsigned int sixteenthPoints = num_points / 16;
+
+- dotProdVal = _mm_add_ps(dotProdVal, cVal1);
+- }
++ float dotProduct = 0;
++ const float* aPtr = input;
++ const float* bPtr = taps;
++
++ __m128 aVal1, bVal1, cVal1;
++ __m128 aVal2, bVal2, cVal2;
++ __m128 aVal3, bVal3, cVal3;
++ __m128 aVal4, bVal4, cVal4;
++
++ __m128 dotProdVal = _mm_setzero_ps();
++
++ for (; number < sixteenthPoints; number++) {
++
++ aVal1 = _mm_load_ps(aPtr);
++ aPtr += 4;
++ aVal2 = _mm_load_ps(aPtr);
++ aPtr += 4;
++ aVal3 = _mm_load_ps(aPtr);
++ aPtr += 4;
++ aVal4 = _mm_load_ps(aPtr);
++ aPtr += 4;
++
++ bVal1 = _mm_load_ps(bPtr);
++ bPtr += 4;
++ bVal2 = _mm_load_ps(bPtr);
++ bPtr += 4;
++ bVal3 = _mm_load_ps(bPtr);
++ bPtr += 4;
++ bVal4 = _mm_load_ps(bPtr);
++ bPtr += 4;
++
++ cVal1 = _mm_dp_ps(aVal1, bVal1, 0xF1);
++ cVal2 = _mm_dp_ps(aVal2, bVal2, 0xF2);
++ cVal3 = _mm_dp_ps(aVal3, bVal3, 0xF4);
++ cVal4 = _mm_dp_ps(aVal4, bVal4, 0xF8);
++
++ cVal1 = _mm_or_ps(cVal1, cVal2);
++ cVal3 = _mm_or_ps(cVal3, cVal4);
++ cVal1 = _mm_or_ps(cVal1, cVal3);
++
++ dotProdVal = _mm_add_ps(dotProdVal, cVal1);
++ }
+
+- __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
+- _mm_store_ps(dotProductVector, dotProdVal); // Store the results back into the dot product vector
++ __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
++ _mm_store_ps(dotProductVector,
++ dotProdVal); // Store the results back into the dot product vector
+
+- dotProduct = dotProductVector[0];
+- dotProduct += dotProductVector[1];
+- dotProduct += dotProductVector[2];
+- dotProduct += dotProductVector[3];
++ dotProduct = dotProductVector[0];
++ dotProduct += dotProductVector[1];
++ dotProduct += dotProductVector[2];
++ dotProduct += dotProductVector[3];
+
+- number = sixteenthPoints * 16;
+- for(;number < num_points; number++){
+- dotProduct += ((*aPtr++) * (*bPtr++));
+- }
++ number = sixteenthPoints * 16;
++ for (; number < num_points; number++) {
++ dotProduct += ((*aPtr++) * (*bPtr++));
++ }
+
+- *result = dotProduct;
++ *result = dotProduct;
+ }
+
+ #endif /*LV_HAVE_SSE4_1*/
+@@ -676,159 +733,170 @@ static inline void volk_32f_x2_dot_prod_32f_a_sse4_1(float * result, const float
+
+ #include <immintrin.h>
+
+-static inline void volk_32f_x2_dot_prod_32f_a_avx( float* result, const float* input, const float* taps, unsigned int num_points) {
++static inline void volk_32f_x2_dot_prod_32f_a_avx(float* result,
++ const float* input,
++ const float* taps,
++ unsigned int num_points)
++{
+
+- unsigned int number = 0;
+- const unsigned int sixteenthPoints = num_points / 16;
+-
+- float dotProduct = 0;
+- const float* aPtr = input;
+- const float* bPtr = taps;
++ unsigned int number = 0;
++ const unsigned int sixteenthPoints = num_points / 16;
+
+- __m256 a0Val, a1Val;
+- __m256 b0Val, b1Val;
+- __m256 c0Val, c1Val;
++ float dotProduct = 0;
++ const float* aPtr = input;
++ const float* bPtr = taps;
+
+- __m256 dotProdVal0 = _mm256_setzero_ps();
+- __m256 dotProdVal1 = _mm256_setzero_ps();
++ __m256 a0Val, a1Val;
++ __m256 b0Val, b1Val;
++ __m256 c0Val, c1Val;
+
+- for(;number < sixteenthPoints; number++){
++ __m256 dotProdVal0 = _mm256_setzero_ps();
++ __m256 dotProdVal1 = _mm256_setzero_ps();
+
+- a0Val = _mm256_load_ps(aPtr);
+- a1Val = _mm256_load_ps(aPtr+8);
+- b0Val = _mm256_load_ps(bPtr);
+- b1Val = _mm256_load_ps(bPtr+8);
++ for (; number < sixteenthPoints; number++) {
+
+- c0Val = _mm256_mul_ps(a0Val, b0Val);
+- c1Val = _mm256_mul_ps(a1Val, b1Val);
++ a0Val = _mm256_load_ps(aPtr);
++ a1Val = _mm256_load_ps(aPtr + 8);
++ b0Val = _mm256_load_ps(bPtr);
++ b1Val = _mm256_load_ps(bPtr + 8);
+
+- dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
+- dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);
++ c0Val = _mm256_mul_ps(a0Val, b0Val);
++ c1Val = _mm256_mul_ps(a1Val, b1Val);
+
+- aPtr += 16;
+- bPtr += 16;
+- }
++ dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
++ dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);
+
+- dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
++ aPtr += 16;
++ bPtr += 16;
++ }
+
+- __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
++ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
+
+- _mm256_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
++ __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
+
+- dotProduct = dotProductVector[0];
+- dotProduct += dotProductVector[1];
+- dotProduct += dotProductVector[2];
+- dotProduct += dotProductVector[3];
+- dotProduct += dotProductVector[4];
+- dotProduct += dotProductVector[5];
+- dotProduct += dotProductVector[6];
+- dotProduct += dotProductVector[7];
++ _mm256_store_ps(dotProductVector,
++ dotProdVal0); // Store the results back into the dot product vector
+
+- number = sixteenthPoints*16;
+- for(;number < num_points; number++){
+- dotProduct += ((*aPtr++) * (*bPtr++));
+- }
++ dotProduct = dotProductVector[0];
++ dotProduct += dotProductVector[1];
++ dotProduct += dotProductVector[2];
++ dotProduct += dotProductVector[3];
++ dotProduct += dotProductVector[4];
++ dotProduct += dotProductVector[5];
++ dotProduct += dotProductVector[6];
++ dotProduct += dotProductVector[7];
+
+- *result = dotProduct;
++ number = sixteenthPoints * 16;
++ for (; number < num_points; number++) {
++ dotProduct += ((*aPtr++) * (*bPtr++));
++ }
+
++ *result = dotProduct;
+ }
+ #endif /*LV_HAVE_AVX*/
+
+
+ #if LV_HAVE_AVX2 && LV_HAVE_FMA
+ #include <immintrin.h>
+-static inline void volk_32f_x2_dot_prod_32f_a_avx2_fma(float * result, const float * input, const float* taps, unsigned int num_points){
+- unsigned int number;
+- const unsigned int eighthPoints = num_points / 8;
+-
+- const float* aPtr = input;
+- const float* bPtr = taps;
++static inline void volk_32f_x2_dot_prod_32f_a_avx2_fma(float* result,
++ const float* input,
++ const float* taps,
++ unsigned int num_points)
++{
++ unsigned int number;
++ const unsigned int eighthPoints = num_points / 8;
+
+- __m256 dotProdVal = _mm256_setzero_ps();
+- __m256 aVal1, bVal1;
++ const float* aPtr = input;
++ const float* bPtr = taps;
+
+- for (number = 0; number < eighthPoints; number++ ) {
++ __m256 dotProdVal = _mm256_setzero_ps();
++ __m256 aVal1, bVal1;
+
+- aVal1 = _mm256_load_ps(aPtr);
+- bVal1 = _mm256_load_ps(bPtr);
+- aPtr += 8;
+- bPtr += 8;
++ for (number = 0; number < eighthPoints; number++) {
+
+- dotProdVal = _mm256_fmadd_ps(aVal1, bVal1, dotProdVal);
+- }
++ aVal1 = _mm256_load_ps(aPtr);
++ bVal1 = _mm256_load_ps(bPtr);
++ aPtr += 8;
++ bPtr += 8;
+
+- __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
+- _mm256_store_ps(dotProductVector, dotProdVal); // Store the results back into the dot product vector
+- _mm256_zeroupper();
++ dotProdVal = _mm256_fmadd_ps(aVal1, bVal1, dotProdVal);
++ }
+
+- float dotProduct =
+- dotProductVector[0] + dotProductVector[1] +
+- dotProductVector[2] + dotProductVector[3] +
+- dotProductVector[4] + dotProductVector[5] +
+- dotProductVector[6] + dotProductVector[7];
++ __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
++ _mm256_store_ps(dotProductVector,
++ dotProdVal); // Store the results back into the dot product vector
++ _mm256_zeroupper();
+
+- for(number = eighthPoints * 8; number < num_points; number++){
+- dotProduct += ((*aPtr++) * (*bPtr++));
+- }
++ float dotProduct = dotProductVector[0] + dotProductVector[1] + dotProductVector[2] +
++ dotProductVector[3] + dotProductVector[4] + dotProductVector[5] +
++ dotProductVector[6] + dotProductVector[7];
+
+- *result = dotProduct;
++ for (number = eighthPoints * 8; number < num_points; number++) {
++ dotProduct += ((*aPtr++) * (*bPtr++));
++ }
+
++ *result = dotProduct;
+ }
+ #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA */
+
+ #if LV_HAVE_AVX512F
+ #include <immintrin.h>
+-static inline void volk_32f_x2_dot_prod_32f_a_avx512f(float * result, const float * input, const float* taps, unsigned int num_points){
+- unsigned int number;
+- const unsigned int sixteenthPoints = num_points / 16;
+-
+- const float* aPtr = input;
+- const float* bPtr = taps;
++static inline void volk_32f_x2_dot_prod_32f_a_avx512f(float* result,
++ const float* input,
++ const float* taps,
++ unsigned int num_points)
++{
++ unsigned int number;
++ const unsigned int sixteenthPoints = num_points / 16;
+
+- __m512 dotProdVal = _mm512_setzero_ps();
+- __m512 aVal1, bVal1;
++ const float* aPtr = input;
++ const float* bPtr = taps;
+
+- for (number = 0; number < sixteenthPoints; number++ ) {
++ __m512 dotProdVal = _mm512_setzero_ps();
++ __m512 aVal1, bVal1;
+
+- aVal1 = _mm512_load_ps(aPtr);
+- bVal1 = _mm512_load_ps(bPtr);
+- aPtr += 16;
+- bPtr += 16;
++ for (number = 0; number < sixteenthPoints; number++) {
+
+- dotProdVal = _mm512_fmadd_ps(aVal1, bVal1, dotProdVal);
+- }
++ aVal1 = _mm512_load_ps(aPtr);
++ bVal1 = _mm512_load_ps(bPtr);
++ aPtr += 16;
++ bPtr += 16;
+
+- __VOLK_ATTR_ALIGNED(64) float dotProductVector[16];
+- _mm512_store_ps(dotProductVector, dotProdVal); // Store the results back into the dot product vector
++ dotProdVal = _mm512_fmadd_ps(aVal1, bVal1, dotProdVal);
++ }
+
+- float dotProduct =
+- dotProductVector[0] + dotProductVector[1] +
+- dotProductVector[2] + dotProductVector[3] +
+- dotProductVector[4] + dotProductVector[5] +
+- dotProductVector[6] + dotProductVector[7] +
+- dotProductVector[8] + dotProductVector[9] +
+- dotProductVector[10] + dotProductVector[11] +
+- dotProductVector[12] + dotProductVector[13] +
+- dotProductVector[14] + dotProductVector[15];
++ __VOLK_ATTR_ALIGNED(64) float dotProductVector[16];
++ _mm512_store_ps(dotProductVector,
++ dotProdVal); // Store the results back into the dot product vector
+
+- for(number = sixteenthPoints * 16; number < num_points; number++){
+- dotProduct += ((*aPtr++) * (*bPtr++));
+- }
++ float dotProduct = dotProductVector[0] + dotProductVector[1] + dotProductVector[2] +
++ dotProductVector[3] + dotProductVector[4] + dotProductVector[5] +
++ dotProductVector[6] + dotProductVector[7] + dotProductVector[8] +
++ dotProductVector[9] + dotProductVector[10] + dotProductVector[11] +
++ dotProductVector[12] + dotProductVector[13] +
++ dotProductVector[14] + dotProductVector[15];
+
+- *result = dotProduct;
++ for (number = sixteenthPoints * 16; number < num_points; number++) {
++ dotProduct += ((*aPtr++) * (*bPtr++));
++ }
+
++ *result = dotProduct;
+ }
+ #endif /* LV_HAVE_AVX512F */
+
+ #ifdef LV_HAVE_NEON
+ #include <arm_neon.h>
+
+-static inline void volk_32f_x2_dot_prod_32f_neonopts(float * result, const float * input, const float * taps, unsigned int num_points) {
++static inline void volk_32f_x2_dot_prod_32f_neonopts(float* result,
++ const float* input,
++ const float* taps,
++ unsigned int num_points)
++{
+
+ unsigned int quarter_points = num_points / 16;
+ float dotProduct = 0;
+ const float* aPtr = input;
+- const float* bPtr= taps;
++ const float* bPtr = taps;
+ unsigned int number = 0;
+
+ float32x4x4_t a_val, b_val, accumulator0;
+@@ -838,7 +906,7 @@ static inline void volk_32f_x2_dot_prod_32f_neonopts(float * result, const float
+ accumulator0.val[3] = vdupq_n_f32(0);
+ // factor of 4 loop unroll with independent accumulators
+ // uses 12 out of 16 neon q registers
+- for( number = 0; number < quarter_points; ++number) {
++ for (number = 0; number < quarter_points; ++number) {
+ a_val = vld4q_f32(aPtr);
+ b_val = vld4q_f32(bPtr);
+ accumulator0.val[0] = vmlaq_f32(accumulator0.val[0], a_val.val[0], b_val.val[0]);
+@@ -855,8 +923,8 @@ static inline void volk_32f_x2_dot_prod_32f_neonopts(float * result, const float
+ vst1q_f32(accumulator, accumulator0.val[0]);
+ dotProduct = accumulator[0] + accumulator[1] + accumulator[2] + accumulator[3];
+
+- for(number = quarter_points*16; number < num_points; number++){
+- dotProduct += ((*aPtr++) * (*bPtr++));
++ for (number = quarter_points * 16; number < num_points; number++) {
++ dotProduct += ((*aPtr++) * (*bPtr++));
+ }
+
+ *result = dotProduct;
+@@ -865,26 +933,30 @@ static inline void volk_32f_x2_dot_prod_32f_neonopts(float * result, const float
+ #endif
+
+
+-
+-
+ #ifdef LV_HAVE_NEON
+-static inline void volk_32f_x2_dot_prod_32f_neon(float * result, const float * input, const float * taps, unsigned int num_points) {
++static inline void volk_32f_x2_dot_prod_32f_neon(float* result,
++ const float* input,
++ const float* taps,
++ unsigned int num_points)
++{
+
+ unsigned int quarter_points = num_points / 8;
+ float dotProduct = 0;
+ const float* aPtr = input;
+- const float* bPtr= taps;
++ const float* bPtr = taps;
+ unsigned int number = 0;
+
+ float32x4x2_t a_val, b_val, accumulator_val;
+ accumulator_val.val[0] = vdupq_n_f32(0);
+ accumulator_val.val[1] = vdupq_n_f32(0);
+ // factor of 2 loop unroll with independent accumulators
+- for( number = 0; number < quarter_points; ++number) {
++ for (number = 0; number < quarter_points; ++number) {
+ a_val = vld2q_f32(aPtr);
+ b_val = vld2q_f32(bPtr);
+- accumulator_val.val[0] = vmlaq_f32(accumulator_val.val[0], a_val.val[0], b_val.val[0]);
+- accumulator_val.val[1] = vmlaq_f32(accumulator_val.val[1], a_val.val[1], b_val.val[1]);
++ accumulator_val.val[0] =
++ vmlaq_f32(accumulator_val.val[0], a_val.val[0], b_val.val[0]);
++ accumulator_val.val[1] =
++ vmlaq_f32(accumulator_val.val[1], a_val.val[1], b_val.val[1]);
+ aPtr += 8;
+ bPtr += 8;
+ }
+@@ -893,8 +965,8 @@ static inline void volk_32f_x2_dot_prod_32f_neon(float * result, const float * i
+ vst1q_f32(accumulator, accumulator_val.val[0]);
+ dotProduct = accumulator[0] + accumulator[1] + accumulator[2] + accumulator[3];
+
+- for(number = quarter_points*8; number < num_points; number++){
+- dotProduct += ((*aPtr++) * (*bPtr++));
++ for (number = quarter_points * 8; number < num_points; number++) {
++ dotProduct += ((*aPtr++) * (*bPtr++));
+ }
+
+ *result = dotProduct;
+@@ -903,11 +975,17 @@ static inline void volk_32f_x2_dot_prod_32f_neon(float * result, const float * i
+ #endif /* LV_HAVE_NEON */
+
+ #ifdef LV_HAVE_NEONV7
+-extern void volk_32f_x2_dot_prod_32f_a_neonasm(float* cVector, const float* aVector, const float* bVector, unsigned int num_points);
++extern void volk_32f_x2_dot_prod_32f_a_neonasm(float* cVector,
++ const float* aVector,
++ const float* bVector,
++ unsigned int num_points);
+ #endif /* LV_HAVE_NEONV7 */
+
+ #ifdef LV_HAVE_NEONV7
+-extern void volk_32f_x2_dot_prod_32f_a_neonasm_opts(float* cVector, const float* aVector, const float* bVector, unsigned int num_points);
++extern void volk_32f_x2_dot_prod_32f_a_neonasm_opts(float* cVector,
++ const float* aVector,
++ const float* bVector,
++ unsigned int num_points);
+ #endif /* LV_HAVE_NEONV7 */
+
+ #endif /*INCLUDED_volk_32f_x2_dot_prod_32f_a_H*/
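A minimal caller sketch for the dot-product kernels reformatted above, assuming the public VOLK dispatcher and allocation helpers (volk_32f_x2_dot_prod_32f, volk_malloc, volk_get_alignment, volk_free); the buffer contents are illustrative only:

#include <volk/volk.h>

static float dot_prod_example(unsigned int num_points)
{
    /* Aligned buffers let the dispatcher pick an aligned (_a_) kernel. */
    float* input = (float*)volk_malloc(num_points * sizeof(float), volk_get_alignment());
    float* taps = (float*)volk_malloc(num_points * sizeof(float), volk_get_alignment());
    float result = 0.0f;

    for (unsigned int i = 0; i < num_points; i++) {
        input[i] = (float)i; /* illustrative data */
        taps[i] = 1.0f;
    }

    /* The dispatcher selects the best available implementation at run time. */
    volk_32f_x2_dot_prod_32f(&result, input, taps, num_points);

    volk_free(input);
    volk_free(taps);
    return result;
}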
+diff --git a/kernels/volk/volk_32f_x2_fm_detectpuppet_32f.h b/kernels/volk/volk_32f_x2_fm_detectpuppet_32f.h
+index e1da185..3a3caca 100644
+--- a/kernels/volk/volk_32f_x2_fm_detectpuppet_32f.h
++++ b/kernels/volk/volk_32f_x2_fm_detectpuppet_32f.h
+@@ -28,32 +28,44 @@
+ #ifdef LV_HAVE_AVX
+ #include <immintrin.h>
+
+-static inline void volk_32f_x2_fm_detectpuppet_32f_a_avx(float* outputVector, const float* inputVector, float* saveValue, unsigned int num_points)
++static inline void volk_32f_x2_fm_detectpuppet_32f_a_avx(float* outputVector,
++ const float* inputVector,
++ float* saveValue,
++ unsigned int num_points)
+ {
+- const float bound = 1.0f;
++ const float bound = 1.0f;
+
+- volk_32f_s32f_32f_fm_detect_32f_a_avx(outputVector, inputVector, bound, saveValue, num_points);
++ volk_32f_s32f_32f_fm_detect_32f_a_avx(
++ outputVector, inputVector, bound, saveValue, num_points);
+ }
+ #endif /* LV_HAVE_AVX */
+
+ #ifdef LV_HAVE_SSE
+ #include <xmmintrin.h>
+
+-static inline void volk_32f_x2_fm_detectpuppet_32f_a_sse(float* outputVector, const float* inputVector, float* saveValue, unsigned int num_points)
++static inline void volk_32f_x2_fm_detectpuppet_32f_a_sse(float* outputVector,
++ const float* inputVector,
++ float* saveValue,
++ unsigned int num_points)
+ {
+- const float bound = 1.0f;
++ const float bound = 1.0f;
+
+- volk_32f_s32f_32f_fm_detect_32f_a_sse(outputVector, inputVector, bound, saveValue, num_points);
++ volk_32f_s32f_32f_fm_detect_32f_a_sse(
++ outputVector, inputVector, bound, saveValue, num_points);
+ }
+ #endif /* LV_HAVE_SSE */
+
+ #ifdef LV_HAVE_GENERIC
+
+-static inline void volk_32f_x2_fm_detectpuppet_32f_generic(float* outputVector, const float* inputVector, float* saveValue, unsigned int num_points)
++static inline void volk_32f_x2_fm_detectpuppet_32f_generic(float* outputVector,
++ const float* inputVector,
++ float* saveValue,
++ unsigned int num_points)
+ {
+- const float bound = 1.0f;
++ const float bound = 1.0f;
+
+- volk_32f_s32f_32f_fm_detect_32f_generic(outputVector, inputVector, bound, saveValue, num_points);
++ volk_32f_s32f_32f_fm_detect_32f_generic(
++ outputVector, inputVector, bound, saveValue, num_points);
+ }
+ #endif /* LV_HAVE_GENERIC */
+
+@@ -69,11 +81,15 @@ static inline void volk_32f_x2_fm_detectpuppet_32f_generic(float* outputVector,
+ #ifdef LV_HAVE_AVX
+ #include <immintrin.h>
+
+-static inline void volk_32f_x2_fm_detectpuppet_32f_u_avx(float* outputVector, const float* inputVector, float* saveValue, unsigned int num_points)
++static inline void volk_32f_x2_fm_detectpuppet_32f_u_avx(float* outputVector,
++ const float* inputVector,
++ float* saveValue,
++ unsigned int num_points)
+ {
+- const float bound = 1.0f;
++ const float bound = 1.0f;
+
+- volk_32f_s32f_32f_fm_detect_32f_u_avx(outputVector, inputVector, bound, saveValue, num_points);
++ volk_32f_s32f_32f_fm_detect_32f_u_avx(
++ outputVector, inputVector, bound, saveValue, num_points);
+ }
+ #endif /* LV_HAVE_AVX */
+ #endif /* INCLUDED_volk_32f_x2_fm_detectpuppet_32f_u_H */
+diff --git a/kernels/volk/volk_32f_x2_interleave_32fc.h b/kernels/volk/volk_32f_x2_interleave_32fc.h
+index ef8ada2..d0cc6dd 100644
+--- a/kernels/volk/volk_32f_x2_interleave_32fc.h
++++ b/kernels/volk/volk_32f_x2_interleave_32fc.h
+@@ -33,8 +33,8 @@
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+- * void volk_32f_x2_interleave_32fc(lv_32fc_t* complexVector, const float* iBuffer, const float* qBuffer, unsigned int num_points)
+- * \endcode
++ * void volk_32f_x2_interleave_32fc(lv_32fc_t* complexVector, const float* iBuffer, const
++ * float* qBuffer, unsigned int num_points) \endcode
+ *
+ * \b Inputs
+ * \li iBuffer: Input vector of samples for the real part.
+@@ -79,44 +79,45 @@
+ #ifdef LV_HAVE_AVX
+ #include <immintrin.h>
+
+-static inline void
+-volk_32f_x2_interleave_32fc_a_avx(lv_32fc_t* complexVector, const float* iBuffer,
+- const float* qBuffer, unsigned int num_points)
++static inline void volk_32f_x2_interleave_32fc_a_avx(lv_32fc_t* complexVector,
++ const float* iBuffer,
++ const float* qBuffer,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+- float* complexVectorPtr = (float*)complexVector;
+- const float* iBufferPtr = iBuffer;
+- const float* qBufferPtr = qBuffer;
+-
+- const uint64_t eighthPoints = num_points / 8;
+-
+- __m256 iValue, qValue, cplxValue1, cplxValue2, cplxValue;
+- for(;number < eighthPoints; number++){
+- iValue = _mm256_load_ps(iBufferPtr);
+- qValue = _mm256_load_ps(qBufferPtr);
+-
+- // Interleaves the lower two values in the i and q variables into one buffer
+- cplxValue1 = _mm256_unpacklo_ps(iValue, qValue);
+- // Interleaves the upper two values in the i and q variables into one buffer
+- cplxValue2 = _mm256_unpackhi_ps(iValue, qValue);
+-
+- cplxValue = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x20);
+- _mm256_store_ps(complexVectorPtr, cplxValue);
+- complexVectorPtr += 8;
+-
+- cplxValue = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x31);
+- _mm256_store_ps(complexVectorPtr, cplxValue);
+- complexVectorPtr += 8;
+-
+- iBufferPtr += 8;
+- qBufferPtr += 8;
+- }
+-
+- number = eighthPoints * 8;
+- for(; number < num_points; number++){
+- *complexVectorPtr++ = *iBufferPtr++;
+- *complexVectorPtr++ = *qBufferPtr++;
+- }
++ unsigned int number = 0;
++ float* complexVectorPtr = (float*)complexVector;
++ const float* iBufferPtr = iBuffer;
++ const float* qBufferPtr = qBuffer;
++
++ const uint64_t eighthPoints = num_points / 8;
++
++ __m256 iValue, qValue, cplxValue1, cplxValue2, cplxValue;
++ for (; number < eighthPoints; number++) {
++ iValue = _mm256_load_ps(iBufferPtr);
++ qValue = _mm256_load_ps(qBufferPtr);
++
++ // Interleaves the lower two values in the i and q variables into one buffer
++ cplxValue1 = _mm256_unpacklo_ps(iValue, qValue);
++ // Interleaves the upper two values in the i and q variables into one buffer
++ cplxValue2 = _mm256_unpackhi_ps(iValue, qValue);
++
++ cplxValue = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x20);
++ _mm256_store_ps(complexVectorPtr, cplxValue);
++ complexVectorPtr += 8;
++
++ cplxValue = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x31);
++ _mm256_store_ps(complexVectorPtr, cplxValue);
++ complexVectorPtr += 8;
++
++ iBufferPtr += 8;
++ qBufferPtr += 8;
++ }
++
++ number = eighthPoints * 8;
++ for (; number < num_points; number++) {
++ *complexVectorPtr++ = *iBufferPtr++;
++ *complexVectorPtr++ = *qBufferPtr++;
++ }
+ }
+
+ #endif /* LV_HAVE_AVX */
+@@ -124,41 +125,42 @@ volk_32f_x2_interleave_32fc_a_avx(lv_32fc_t* complexVector, const float* iBuffer
+ #ifdef LV_HAVE_SSE
+ #include <xmmintrin.h>
+
+-static inline void
+-volk_32f_x2_interleave_32fc_a_sse(lv_32fc_t* complexVector, const float* iBuffer,
+- const float* qBuffer, unsigned int num_points)
++static inline void volk_32f_x2_interleave_32fc_a_sse(lv_32fc_t* complexVector,
++ const float* iBuffer,
++ const float* qBuffer,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+- float* complexVectorPtr = (float*)complexVector;
+- const float* iBufferPtr = iBuffer;
+- const float* qBufferPtr = qBuffer;
+-
+- const uint64_t quarterPoints = num_points / 4;
+-
+- __m128 iValue, qValue, cplxValue;
+- for(;number < quarterPoints; number++){
+- iValue = _mm_load_ps(iBufferPtr);
+- qValue = _mm_load_ps(qBufferPtr);
+-
+- // Interleaves the lower two values in the i and q variables into one buffer
+- cplxValue = _mm_unpacklo_ps(iValue, qValue);
+- _mm_store_ps(complexVectorPtr, cplxValue);
+- complexVectorPtr += 4;
+-
+- // Interleaves the upper two values in the i and q variables into one buffer
+- cplxValue = _mm_unpackhi_ps(iValue, qValue);
+- _mm_store_ps(complexVectorPtr, cplxValue);
+- complexVectorPtr += 4;
+-
+- iBufferPtr += 4;
+- qBufferPtr += 4;
+- }
+-
+- number = quarterPoints * 4;
+- for(; number < num_points; number++){
+- *complexVectorPtr++ = *iBufferPtr++;
+- *complexVectorPtr++ = *qBufferPtr++;
+- }
++ unsigned int number = 0;
++ float* complexVectorPtr = (float*)complexVector;
++ const float* iBufferPtr = iBuffer;
++ const float* qBufferPtr = qBuffer;
++
++ const uint64_t quarterPoints = num_points / 4;
++
++ __m128 iValue, qValue, cplxValue;
++ for (; number < quarterPoints; number++) {
++ iValue = _mm_load_ps(iBufferPtr);
++ qValue = _mm_load_ps(qBufferPtr);
++
++ // Interleaves the lower two values in the i and q variables into one buffer
++ cplxValue = _mm_unpacklo_ps(iValue, qValue);
++ _mm_store_ps(complexVectorPtr, cplxValue);
++ complexVectorPtr += 4;
++
++ // Interleaves the upper two values in the i and q variables into one buffer
++ cplxValue = _mm_unpackhi_ps(iValue, qValue);
++ _mm_store_ps(complexVectorPtr, cplxValue);
++ complexVectorPtr += 4;
++
++ iBufferPtr += 4;
++ qBufferPtr += 4;
++ }
++
++ number = quarterPoints * 4;
++ for (; number < num_points; number++) {
++ *complexVectorPtr++ = *iBufferPtr++;
++ *complexVectorPtr++ = *qBufferPtr++;
++ }
+ }
+ #endif /* LV_HAVE_SSE */
+
+@@ -166,52 +168,53 @@ volk_32f_x2_interleave_32fc_a_sse(lv_32fc_t* complexVector, const float* iBuffer
+ #ifdef LV_HAVE_NEON
+ #include <arm_neon.h>
+
+-static inline void
+-volk_32f_x2_interleave_32fc_neon(lv_32fc_t* complexVector, const float* iBuffer,
+- const float* qBuffer, unsigned int num_points)
++static inline void volk_32f_x2_interleave_32fc_neon(lv_32fc_t* complexVector,
++ const float* iBuffer,
++ const float* qBuffer,
++ unsigned int num_points)
+ {
+- unsigned int quarter_points = num_points / 4;
+- unsigned int number;
+- float* complexVectorPtr = (float*) complexVector;
+-
+- float32x4x2_t complex_vec;
+- for(number=0; number < quarter_points; ++number) {
+- complex_vec.val[0] = vld1q_f32(iBuffer);
+- complex_vec.val[1] = vld1q_f32(qBuffer);
+- vst2q_f32(complexVectorPtr, complex_vec);
+- iBuffer += 4;
+- qBuffer += 4;
+- complexVectorPtr += 8;
+- }
+-
+- for(number=quarter_points * 4; number < num_points; ++number) {
+- *complexVectorPtr++ = *iBuffer++;
+- *complexVectorPtr++ = *qBuffer++;
+- }
++ unsigned int quarter_points = num_points / 4;
++ unsigned int number;
++ float* complexVectorPtr = (float*)complexVector;
++
++ float32x4x2_t complex_vec;
++ for (number = 0; number < quarter_points; ++number) {
++ complex_vec.val[0] = vld1q_f32(iBuffer);
++ complex_vec.val[1] = vld1q_f32(qBuffer);
++ vst2q_f32(complexVectorPtr, complex_vec);
++ iBuffer += 4;
++ qBuffer += 4;
++ complexVectorPtr += 8;
++ }
++
++ for (number = quarter_points * 4; number < num_points; ++number) {
++ *complexVectorPtr++ = *iBuffer++;
++ *complexVectorPtr++ = *qBuffer++;
++ }
+ }
+ #endif /* LV_HAVE_NEON */
+
+
+ #ifdef LV_HAVE_GENERIC
+
+-static inline void
+-volk_32f_x2_interleave_32fc_generic(lv_32fc_t* complexVector, const float* iBuffer,
+- const float* qBuffer, unsigned int num_points)
++static inline void volk_32f_x2_interleave_32fc_generic(lv_32fc_t* complexVector,
++ const float* iBuffer,
++ const float* qBuffer,
++ unsigned int num_points)
+ {
+- float* complexVectorPtr = (float*)complexVector;
+- const float* iBufferPtr = iBuffer;
+- const float* qBufferPtr = qBuffer;
+- unsigned int number;
+-
+- for(number = 0; number < num_points; number++){
+- *complexVectorPtr++ = *iBufferPtr++;
+- *complexVectorPtr++ = *qBufferPtr++;
+- }
++ float* complexVectorPtr = (float*)complexVector;
++ const float* iBufferPtr = iBuffer;
++ const float* qBufferPtr = qBuffer;
++ unsigned int number;
++
++ for (number = 0; number < num_points; number++) {
++ *complexVectorPtr++ = *iBufferPtr++;
++ *complexVectorPtr++ = *qBufferPtr++;
++ }
+ }
+ #endif /* LV_HAVE_GENERIC */
+
+
+-
+ #endif /* INCLUDED_volk_32f_x2_interleave_32fc_a_H */
+
+ #ifndef INCLUDED_volk_32f_x2_interleave_32fc_u_H
+@@ -223,44 +226,45 @@ volk_32f_x2_interleave_32fc_generic(lv_32fc_t* complexVector, const float* iBuff
+ #ifdef LV_HAVE_AVX
+ #include <immintrin.h>
+
+-static inline void
+-volk_32f_x2_interleave_32fc_u_avx(lv_32fc_t* complexVector, const float* iBuffer,
+- const float* qBuffer, unsigned int num_points)
++static inline void volk_32f_x2_interleave_32fc_u_avx(lv_32fc_t* complexVector,
++ const float* iBuffer,
++ const float* qBuffer,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+- float* complexVectorPtr = (float*)complexVector;
+- const float* iBufferPtr = iBuffer;
+- const float* qBufferPtr = qBuffer;
+-
+- const uint64_t eighthPoints = num_points / 8;
+-
+- __m256 iValue, qValue, cplxValue1, cplxValue2, cplxValue;
+- for(;number < eighthPoints; number++){
+- iValue = _mm256_loadu_ps(iBufferPtr);
+- qValue = _mm256_loadu_ps(qBufferPtr);
+-
+- // Interleaves the lower two values in the i and q variables into one buffer
+- cplxValue1 = _mm256_unpacklo_ps(iValue, qValue);
+- // Interleaves the upper two values in the i and q variables into one buffer
+- cplxValue2 = _mm256_unpackhi_ps(iValue, qValue);
+-
+- cplxValue = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x20);
+- _mm256_storeu_ps(complexVectorPtr, cplxValue);
+- complexVectorPtr += 8;
+-
+- cplxValue = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x31);
+- _mm256_storeu_ps(complexVectorPtr, cplxValue);
+- complexVectorPtr += 8;
+-
+- iBufferPtr += 8;
+- qBufferPtr += 8;
+- }
+-
+- number = eighthPoints * 8;
+- for(; number < num_points; number++){
+- *complexVectorPtr++ = *iBufferPtr++;
+- *complexVectorPtr++ = *qBufferPtr++;
+- }
++ unsigned int number = 0;
++ float* complexVectorPtr = (float*)complexVector;
++ const float* iBufferPtr = iBuffer;
++ const float* qBufferPtr = qBuffer;
++
++ const uint64_t eighthPoints = num_points / 8;
++
++ __m256 iValue, qValue, cplxValue1, cplxValue2, cplxValue;
++ for (; number < eighthPoints; number++) {
++ iValue = _mm256_loadu_ps(iBufferPtr);
++ qValue = _mm256_loadu_ps(qBufferPtr);
++
++ // Interleaves the lower two values in the i and q variables into one buffer
++ cplxValue1 = _mm256_unpacklo_ps(iValue, qValue);
++ // Interleaves the upper two values in the i and q variables into one buffer
++ cplxValue2 = _mm256_unpackhi_ps(iValue, qValue);
++
++ cplxValue = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x20);
++ _mm256_storeu_ps(complexVectorPtr, cplxValue);
++ complexVectorPtr += 8;
++
++ cplxValue = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x31);
++ _mm256_storeu_ps(complexVectorPtr, cplxValue);
++ complexVectorPtr += 8;
++
++ iBufferPtr += 8;
++ qBufferPtr += 8;
++ }
++
++ number = eighthPoints * 8;
++ for (; number < num_points; number++) {
++ *complexVectorPtr++ = *iBufferPtr++;
++ *complexVectorPtr++ = *qBufferPtr++;
++ }
+ }
+ #endif /* LV_HAVE_AVX */
+
+diff --git a/kernels/volk/volk_32f_x2_max_32f.h b/kernels/volk/volk_32f_x2_max_32f.h
+index 82086a6..c7eb67f 100644
+--- a/kernels/volk/volk_32f_x2_max_32f.h
++++ b/kernels/volk/volk_32f_x2_max_32f.h
+@@ -32,8 +32,8 @@
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+- * void volk_32f_x2_max_32f(float* cVector, const float* aVector, const float* bVector, unsigned int num_points)
+- * \endcode
++ * void volk_32f_x2_max_32f(float* cVector, const float* aVector, const float* bVector,
++ * unsigned int num_points) \endcode
+ *
+ * \b Inputs
+ * \li aVector: First input vector.
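A minimal caller sketch for volk_32f_x2_max_32f, matching the dispatcher prototype quoted above; the input values are illustrative only:

#include <volk/volk.h>

static void max_example(void)
{
    const float a[4] = { 1.0f, -2.0f, 3.0f, -4.0f };
    const float b[4] = { 0.5f, 0.0f, 2.5f, 0.0f };
    float c[4];

    /* c[k] = max(a[k], b[k]), i.e. { 1.0, 0.0, 3.0, 0.0 } here. */
    volk_32f_x2_max_32f(c, a, b, 4);
}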
+@@ -77,176 +77,183 @@
+ #ifdef LV_HAVE_AVX512F
+ #include <immintrin.h>
+
+-static inline void
+-volk_32f_x2_max_32f_a_avx512f(float* cVector, const float* aVector,
+- const float* bVector, unsigned int num_points)
++static inline void volk_32f_x2_max_32f_a_avx512f(float* cVector,
++ const float* aVector,
++ const float* bVector,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+- const unsigned int sixteenthPoints = num_points / 16;
++ unsigned int number = 0;
++ const unsigned int sixteenthPoints = num_points / 16;
+
+- float* cPtr = cVector;
+- const float* aPtr = aVector;
+- const float* bPtr= bVector;
++ float* cPtr = cVector;
++ const float* aPtr = aVector;
++ const float* bPtr = bVector;
+
+- __m512 aVal, bVal, cVal;
+- for(;number < sixteenthPoints; number++){
+- aVal = _mm512_load_ps(aPtr);
+- bVal = _mm512_load_ps(bPtr);
++ __m512 aVal, bVal, cVal;
++ for (; number < sixteenthPoints; number++) {
++ aVal = _mm512_load_ps(aPtr);
++ bVal = _mm512_load_ps(bPtr);
+
+- cVal = _mm512_max_ps(aVal, bVal);
++ cVal = _mm512_max_ps(aVal, bVal);
+
+- _mm512_store_ps(cPtr,cVal); // Store the results back into the C container
++ _mm512_store_ps(cPtr, cVal); // Store the results back into the C container
+
+- aPtr += 16;
+- bPtr += 16;
+- cPtr += 16;
+- }
++ aPtr += 16;
++ bPtr += 16;
++ cPtr += 16;
++ }
+
+- number = sixteenthPoints * 16;
+- for(;number < num_points; number++){
+- const float a = *aPtr++;
+- const float b = *bPtr++;
+- *cPtr++ = ( a > b ? a : b);
+- }
++ number = sixteenthPoints * 16;
++ for (; number < num_points; number++) {
++ const float a = *aPtr++;
++ const float b = *bPtr++;
++ *cPtr++ = (a > b ? a : b);
++ }
+ }
+ #endif /* LV_HAVE_AVX512F */
+
+ #ifdef LV_HAVE_SSE
+ #include <xmmintrin.h>
+
+-static inline void
+-volk_32f_x2_max_32f_a_sse(float* cVector, const float* aVector,
+- const float* bVector, unsigned int num_points)
++static inline void volk_32f_x2_max_32f_a_sse(float* cVector,
++ const float* aVector,
++ const float* bVector,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+- const unsigned int quarterPoints = num_points / 4;
++ unsigned int number = 0;
++ const unsigned int quarterPoints = num_points / 4;
+
+- float* cPtr = cVector;
+- const float* aPtr = aVector;
+- const float* bPtr= bVector;
++ float* cPtr = cVector;
++ const float* aPtr = aVector;
++ const float* bPtr = bVector;
+
+- __m128 aVal, bVal, cVal;
+- for(;number < quarterPoints; number++){
+- aVal = _mm_load_ps(aPtr);
+- bVal = _mm_load_ps(bPtr);
++ __m128 aVal, bVal, cVal;
++ for (; number < quarterPoints; number++) {
++ aVal = _mm_load_ps(aPtr);
++ bVal = _mm_load_ps(bPtr);
+
+- cVal = _mm_max_ps(aVal, bVal);
++ cVal = _mm_max_ps(aVal, bVal);
+
+- _mm_store_ps(cPtr,cVal); // Store the results back into the C container
++ _mm_store_ps(cPtr, cVal); // Store the results back into the C container
+
+- aPtr += 4;
+- bPtr += 4;
+- cPtr += 4;
+- }
++ aPtr += 4;
++ bPtr += 4;
++ cPtr += 4;
++ }
+
+- number = quarterPoints * 4;
+- for(;number < num_points; number++){
+- const float a = *aPtr++;
+- const float b = *bPtr++;
+- *cPtr++ = ( a > b ? a : b);
+- }
++ number = quarterPoints * 4;
++ for (; number < num_points; number++) {
++ const float a = *aPtr++;
++ const float b = *bPtr++;
++ *cPtr++ = (a > b ? a : b);
++ }
+ }
+ #endif /* LV_HAVE_SSE */
+
+ #ifdef LV_HAVE_AVX
+ #include <immintrin.h>
+
+-static inline void
+-volk_32f_x2_max_32f_a_avx(float* cVector, const float* aVector,
+- const float* bVector, unsigned int num_points)
++static inline void volk_32f_x2_max_32f_a_avx(float* cVector,
++ const float* aVector,
++ const float* bVector,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+- const unsigned int eighthPoints = num_points / 8;
++ unsigned int number = 0;
++ const unsigned int eighthPoints = num_points / 8;
+
+- float* cPtr = cVector;
+- const float* aPtr = aVector;
+- const float* bPtr= bVector;
++ float* cPtr = cVector;
++ const float* aPtr = aVector;
++ const float* bPtr = bVector;
+
+- __m256 aVal, bVal, cVal;
+- for(;number < eighthPoints; number++){
+- aVal = _mm256_load_ps(aPtr);
+- bVal = _mm256_load_ps(bPtr);
++ __m256 aVal, bVal, cVal;
++ for (; number < eighthPoints; number++) {
++ aVal = _mm256_load_ps(aPtr);
++ bVal = _mm256_load_ps(bPtr);
+
+- cVal = _mm256_max_ps(aVal, bVal);
++ cVal = _mm256_max_ps(aVal, bVal);
+
+- _mm256_store_ps(cPtr,cVal); // Store the results back into the C container
++ _mm256_store_ps(cPtr, cVal); // Store the results back into the C container
+
+- aPtr += 8;
+- bPtr += 8;
+- cPtr += 8;
+- }
++ aPtr += 8;
++ bPtr += 8;
++ cPtr += 8;
++ }
+
+- number = eighthPoints * 8;
+- for(;number < num_points; number++){
+- const float a = *aPtr++;
+- const float b = *bPtr++;
+- *cPtr++ = ( a > b ? a : b);
+- }
++ number = eighthPoints * 8;
++ for (; number < num_points; number++) {
++ const float a = *aPtr++;
++ const float b = *bPtr++;
++ *cPtr++ = (a > b ? a : b);
++ }
+ }
+ #endif /* LV_HAVE_AVX */
+
+ #ifdef LV_HAVE_NEON
+ #include <arm_neon.h>
+
+-static inline void
+-volk_32f_x2_max_32f_neon(float* cVector, const float* aVector,
+- const float* bVector, unsigned int num_points)
++static inline void volk_32f_x2_max_32f_neon(float* cVector,
++ const float* aVector,
++ const float* bVector,
++ unsigned int num_points)
+ {
+- unsigned int quarter_points = num_points / 4;
+- float* cPtr = cVector;
+- const float* aPtr = aVector;
+- const float* bPtr= bVector;
+- unsigned int number = 0;
+-
+- float32x4_t a_vec, b_vec, c_vec;
+- for(number = 0; number < quarter_points; number++){
+- a_vec = vld1q_f32(aPtr);
+- b_vec = vld1q_f32(bPtr);
+- c_vec = vmaxq_f32(a_vec, b_vec);
+- vst1q_f32(cPtr, c_vec);
+- aPtr += 4;
+- bPtr += 4;
+- cPtr += 4;
+- }
+-
+- for(number = quarter_points*4; number < num_points; number++){
+- const float a = *aPtr++;
+- const float b = *bPtr++;
+- *cPtr++ = ( a > b ? a : b);
+- }
++ unsigned int quarter_points = num_points / 4;
++ float* cPtr = cVector;
++ const float* aPtr = aVector;
++ const float* bPtr = bVector;
++ unsigned int number = 0;
++
++ float32x4_t a_vec, b_vec, c_vec;
++ for (number = 0; number < quarter_points; number++) {
++ a_vec = vld1q_f32(aPtr);
++ b_vec = vld1q_f32(bPtr);
++ c_vec = vmaxq_f32(a_vec, b_vec);
++ vst1q_f32(cPtr, c_vec);
++ aPtr += 4;
++ bPtr += 4;
++ cPtr += 4;
++ }
++
++ for (number = quarter_points * 4; number < num_points; number++) {
++ const float a = *aPtr++;
++ const float b = *bPtr++;
++ *cPtr++ = (a > b ? a : b);
++ }
+ }
+ #endif /* LV_HAVE_NEON */
+
+
+ #ifdef LV_HAVE_GENERIC
+
+-static inline void
+-volk_32f_x2_max_32f_generic(float* cVector, const float* aVector,
+- const float* bVector, unsigned int num_points)
++static inline void volk_32f_x2_max_32f_generic(float* cVector,
++ const float* aVector,
++ const float* bVector,
++ unsigned int num_points)
+ {
+- float* cPtr = cVector;
+- const float* aPtr = aVector;
+- const float* bPtr= bVector;
+- unsigned int number = 0;
+-
+- for(number = 0; number < num_points; number++){
+- const float a = *aPtr++;
+- const float b = *bPtr++;
+- *cPtr++ = ( a > b ? a : b);
+- }
++ float* cPtr = cVector;
++ const float* aPtr = aVector;
++ const float* bPtr = bVector;
++ unsigned int number = 0;
++
++ for (number = 0; number < num_points; number++) {
++ const float a = *aPtr++;
++ const float b = *bPtr++;
++ *cPtr++ = (a > b ? a : b);
++ }
+ }
+ #endif /* LV_HAVE_GENERIC */
+
+ #ifdef LV_HAVE_ORC
+-extern void
+-volk_32f_x2_max_32f_a_orc_impl(float* cVector, const float* aVector,
+- const float* bVector, unsigned int num_points);
+-
+-static inline void
+-volk_32f_x2_max_32f_u_orc(float* cVector, const float* aVector,
+- const float* bVector, unsigned int num_points)
++extern void volk_32f_x2_max_32f_a_orc_impl(float* cVector,
++ const float* aVector,
++ const float* bVector,
++ unsigned int num_points);
++
++static inline void volk_32f_x2_max_32f_u_orc(float* cVector,
++ const float* aVector,
++ const float* bVector,
++ unsigned int num_points)
+ {
+- volk_32f_x2_max_32f_a_orc_impl(cVector, aVector, bVector, num_points);
++ volk_32f_x2_max_32f_a_orc_impl(cVector, aVector, bVector, num_points);
+ }
+ #endif /* LV_HAVE_ORC */
+
+@@ -263,74 +270,76 @@ volk_32f_x2_max_32f_u_orc(float* cVector, const float* aVector,
+ #ifdef LV_HAVE_AVX512F
+ #include <immintrin.h>
+
+-static inline void
+-volk_32f_x2_max_32f_u_avx512f(float* cVector, const float* aVector,
+- const float* bVector, unsigned int num_points)
++static inline void volk_32f_x2_max_32f_u_avx512f(float* cVector,
++ const float* aVector,
++ const float* bVector,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+- const unsigned int sixteenthPoints = num_points / 16;
++ unsigned int number = 0;
++ const unsigned int sixteenthPoints = num_points / 16;
+
+- float* cPtr = cVector;
+- const float* aPtr = aVector;
+- const float* bPtr= bVector;
++ float* cPtr = cVector;
++ const float* aPtr = aVector;
++ const float* bPtr = bVector;
+
+- __m512 aVal, bVal, cVal;
+- for(;number < sixteenthPoints; number++){
+- aVal = _mm512_loadu_ps(aPtr);
+- bVal = _mm512_loadu_ps(bPtr);
++ __m512 aVal, bVal, cVal;
++ for (; number < sixteenthPoints; number++) {
++ aVal = _mm512_loadu_ps(aPtr);
++ bVal = _mm512_loadu_ps(bPtr);
+
+- cVal = _mm512_max_ps(aVal, bVal);
++ cVal = _mm512_max_ps(aVal, bVal);
+
+- _mm512_storeu_ps(cPtr,cVal); // Store the results back into the C container
++ _mm512_storeu_ps(cPtr, cVal); // Store the results back into the C container
+
+- aPtr += 16;
+- bPtr += 16;
+- cPtr += 16;
+- }
++ aPtr += 16;
++ bPtr += 16;
++ cPtr += 16;
++ }
+
+- number = sixteenthPoints * 16;
+- for(;number < num_points; number++){
+- const float a = *aPtr++;
+- const float b = *bPtr++;
+- *cPtr++ = ( a > b ? a : b);
+- }
++ number = sixteenthPoints * 16;
++ for (; number < num_points; number++) {
++ const float a = *aPtr++;
++ const float b = *bPtr++;
++ *cPtr++ = (a > b ? a : b);
++ }
+ }
+ #endif /* LV_HAVE_AVX512F */
+
+ #ifdef LV_HAVE_AVX
+ #include <immintrin.h>
+
+-static inline void
+-volk_32f_x2_max_32f_u_avx(float* cVector, const float* aVector,
+- const float* bVector, unsigned int num_points)
++static inline void volk_32f_x2_max_32f_u_avx(float* cVector,
++ const float* aVector,
++ const float* bVector,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+- const unsigned int eighthPoints = num_points / 8;
++ unsigned int number = 0;
++ const unsigned int eighthPoints = num_points / 8;
+
+- float* cPtr = cVector;
+- const float* aPtr = aVector;
+- const float* bPtr= bVector;
++ float* cPtr = cVector;
++ const float* aPtr = aVector;
++ const float* bPtr = bVector;
+
+- __m256 aVal, bVal, cVal;
+- for(;number < eighthPoints; number++){
+- aVal = _mm256_loadu_ps(aPtr);
+- bVal = _mm256_loadu_ps(bPtr);
++ __m256 aVal, bVal, cVal;
++ for (; number < eighthPoints; number++) {
++ aVal = _mm256_loadu_ps(aPtr);
++ bVal = _mm256_loadu_ps(bPtr);
+
+- cVal = _mm256_max_ps(aVal, bVal);
++ cVal = _mm256_max_ps(aVal, bVal);
+
+- _mm256_storeu_ps(cPtr,cVal); // Store the results back into the C container
++ _mm256_storeu_ps(cPtr, cVal); // Store the results back into the C container
+
+- aPtr += 8;
+- bPtr += 8;
+- cPtr += 8;
+- }
++ aPtr += 8;
++ bPtr += 8;
++ cPtr += 8;
++ }
+
+- number = eighthPoints * 8;
+- for(;number < num_points; number++){
+- const float a = *aPtr++;
+- const float b = *bPtr++;
+- *cPtr++ = ( a > b ? a : b);
+- }
++ number = eighthPoints * 8;
++ for (; number < num_points; number++) {
++ const float a = *aPtr++;
++ const float b = *bPtr++;
++ *cPtr++ = (a > b ? a : b);
++ }
+ }
+ #endif /* LV_HAVE_AVX */
+
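The max-kernel variants reformatted above are normally reached through the VOLK dispatcher rather than by calling an architecture-specific implementation directly. Below is a minimal usage sketch, assuming the public volk_32f_x2_max_32f dispatcher plus the volk_malloc / volk_get_alignment / volk_free helpers from <volk/volk.h>; the buffer size and printed output are illustrative only and are not part of this patch.

    #include <stdio.h>
    #include <volk/volk.h>

    int main(void)
    {
        const unsigned int num_points = 1000;
        const size_t alignment = volk_get_alignment();

        /* volk_malloc returns memory aligned for the widest SIMD ISA enabled at build time. */
        float* a = (float*)volk_malloc(num_points * sizeof(float), alignment);
        float* b = (float*)volk_malloc(num_points * sizeof(float), alignment);
        float* c = (float*)volk_malloc(num_points * sizeof(float), alignment);

        for (unsigned int i = 0; i < num_points; i++) {
            a[i] = (float)i;
            b[i] = (float)(num_points - i);
        }

        /* The dispatcher selects the fastest available implementation
         * (AVX512F, AVX, NEON, ORC or the generic fallback shown above). */
        volk_32f_x2_max_32f(c, a, b, num_points);

        printf("c[0] = %f\n", c[0]);

        volk_free(a);
        volk_free(b);
        volk_free(c);
        return 0;
    }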
+diff --git a/kernels/volk/volk_32f_x2_min_32f.h b/kernels/volk/volk_32f_x2_min_32f.h
+index 454eb76..aecd11a 100644
+--- a/kernels/volk/volk_32f_x2_min_32f.h
++++ b/kernels/volk/volk_32f_x2_min_32f.h
+@@ -32,8 +32,8 @@
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+- * void volk_32f_x2_min_32f(float* cVector, const float* aVector, const float* bVector, unsigned int num_points)
+- * \endcode
++ * void volk_32f_x2_min_32f(float* cVector, const float* aVector, const float* bVector,
++ * unsigned int num_points) \endcode
+ *
+ * \b Inputs
+ * \li aVector: First input vector.
+@@ -77,37 +77,38 @@
+ #ifdef LV_HAVE_SSE
+ #include <xmmintrin.h>
+
+-static inline void
+-volk_32f_x2_min_32f_a_sse(float* cVector, const float* aVector,
+- const float* bVector, unsigned int num_points)
++static inline void volk_32f_x2_min_32f_a_sse(float* cVector,
++ const float* aVector,
++ const float* bVector,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+- const unsigned int quarterPoints = num_points / 4;
++ unsigned int number = 0;
++ const unsigned int quarterPoints = num_points / 4;
+
+- float* cPtr = cVector;
+- const float* aPtr = aVector;
+- const float* bPtr= bVector;
++ float* cPtr = cVector;
++ const float* aPtr = aVector;
++ const float* bPtr = bVector;
+
+- __m128 aVal, bVal, cVal;
+- for(;number < quarterPoints; number++){
+- aVal = _mm_load_ps(aPtr);
+- bVal = _mm_load_ps(bPtr);
++ __m128 aVal, bVal, cVal;
++ for (; number < quarterPoints; number++) {
++ aVal = _mm_load_ps(aPtr);
++ bVal = _mm_load_ps(bPtr);
+
+- cVal = _mm_min_ps(aVal, bVal);
++ cVal = _mm_min_ps(aVal, bVal);
+
+- _mm_store_ps(cPtr,cVal); // Store the results back into the C container
++ _mm_store_ps(cPtr, cVal); // Store the results back into the C container
+
+- aPtr += 4;
+- bPtr += 4;
+- cPtr += 4;
+- }
++ aPtr += 4;
++ bPtr += 4;
++ cPtr += 4;
++ }
+
+- number = quarterPoints * 4;
+- for(;number < num_points; number++){
+- const float a = *aPtr++;
+- const float b = *bPtr++;
+- *cPtr++ = ( a < b ? a : b);
+- }
++ number = quarterPoints * 4;
++ for (; number < num_points; number++) {
++ const float a = *aPtr++;
++ const float b = *bPtr++;
++ *cPtr++ = (a < b ? a : b);
++ }
+ }
+ #endif /* LV_HAVE_SSE */
+
+@@ -115,143 +116,149 @@ volk_32f_x2_min_32f_a_sse(float* cVector, const float* aVector,
+ #ifdef LV_HAVE_NEON
+ #include <arm_neon.h>
+
+-static inline void
+-volk_32f_x2_min_32f_neon(float* cVector, const float* aVector,
+- const float* bVector, unsigned int num_points)
++static inline void volk_32f_x2_min_32f_neon(float* cVector,
++ const float* aVector,
++ const float* bVector,
++ unsigned int num_points)
+ {
+- float* cPtr = cVector;
+- const float* aPtr = aVector;
+- const float* bPtr= bVector;
+- unsigned int number = 0;
+- unsigned int quarter_points = num_points / 4;
+-
+- float32x4_t a_vec, b_vec, c_vec;
+- for(number = 0; number < quarter_points; number++){
+- a_vec = vld1q_f32(aPtr);
+- b_vec = vld1q_f32(bPtr);
+-
+- c_vec = vminq_f32(a_vec, b_vec);
+-
+- vst1q_f32(cPtr, c_vec);
+- aPtr += 4;
+- bPtr += 4;
+- cPtr += 4;
+- }
+-
+- for(number = quarter_points*4; number < num_points; number++){
+- const float a = *aPtr++;
+- const float b = *bPtr++;
+- *cPtr++ = ( a < b ? a : b);
+- }
++ float* cPtr = cVector;
++ const float* aPtr = aVector;
++ const float* bPtr = bVector;
++ unsigned int number = 0;
++ unsigned int quarter_points = num_points / 4;
++
++ float32x4_t a_vec, b_vec, c_vec;
++ for (number = 0; number < quarter_points; number++) {
++ a_vec = vld1q_f32(aPtr);
++ b_vec = vld1q_f32(bPtr);
++
++ c_vec = vminq_f32(a_vec, b_vec);
++
++ vst1q_f32(cPtr, c_vec);
++ aPtr += 4;
++ bPtr += 4;
++ cPtr += 4;
++ }
++
++ for (number = quarter_points * 4; number < num_points; number++) {
++ const float a = *aPtr++;
++ const float b = *bPtr++;
++ *cPtr++ = (a < b ? a : b);
++ }
+ }
+ #endif /* LV_HAVE_NEON */
+
+
+ #ifdef LV_HAVE_GENERIC
+
+-static inline void
+-volk_32f_x2_min_32f_generic(float* cVector, const float* aVector,
+- const float* bVector, unsigned int num_points)
++static inline void volk_32f_x2_min_32f_generic(float* cVector,
++ const float* aVector,
++ const float* bVector,
++ unsigned int num_points)
+ {
+- float* cPtr = cVector;
+- const float* aPtr = aVector;
+- const float* bPtr= bVector;
+- unsigned int number = 0;
+-
+- for(number = 0; number < num_points; number++){
+- const float a = *aPtr++;
+- const float b = *bPtr++;
+- *cPtr++ = ( a < b ? a : b);
+- }
++ float* cPtr = cVector;
++ const float* aPtr = aVector;
++ const float* bPtr = bVector;
++ unsigned int number = 0;
++
++ for (number = 0; number < num_points; number++) {
++ const float a = *aPtr++;
++ const float b = *bPtr++;
++ *cPtr++ = (a < b ? a : b);
++ }
+ }
+ #endif /* LV_HAVE_GENERIC */
+
+
+ #ifdef LV_HAVE_ORC
+
+-extern void
+-volk_32f_x2_min_32f_a_orc_impl(float* cVector, const float* aVector,
+- const float* bVector, unsigned int num_points);
++extern void volk_32f_x2_min_32f_a_orc_impl(float* cVector,
++ const float* aVector,
++ const float* bVector,
++ unsigned int num_points);
+
+-static inline void
+-volk_32f_x2_min_32f_u_orc(float* cVector, const float* aVector,
+- const float* bVector, unsigned int num_points)
++static inline void volk_32f_x2_min_32f_u_orc(float* cVector,
++ const float* aVector,
++ const float* bVector,
++ unsigned int num_points)
+ {
+- volk_32f_x2_min_32f_a_orc_impl(cVector, aVector, bVector, num_points);
++ volk_32f_x2_min_32f_a_orc_impl(cVector, aVector, bVector, num_points);
+ }
+ #endif /* LV_HAVE_ORC */
+
+ #ifdef LV_HAVE_AVX
+ #include <immintrin.h>
+
+-static inline void
+-volk_32f_x2_min_32f_a_avx(float* cVector, const float* aVector,
+- const float* bVector, unsigned int num_points)
++static inline void volk_32f_x2_min_32f_a_avx(float* cVector,
++ const float* aVector,
++ const float* bVector,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+- const unsigned int eighthPoints = num_points / 8;
++ unsigned int number = 0;
++ const unsigned int eighthPoints = num_points / 8;
+
+- float* cPtr = cVector;
+- const float* aPtr = aVector;
+- const float* bPtr= bVector;
++ float* cPtr = cVector;
++ const float* aPtr = aVector;
++ const float* bPtr = bVector;
+
+- __m256 aVal, bVal, cVal;
+- for(;number < eighthPoints; number++){
+- aVal = _mm256_load_ps(aPtr);
+- bVal = _mm256_load_ps(bPtr);
++ __m256 aVal, bVal, cVal;
++ for (; number < eighthPoints; number++) {
++ aVal = _mm256_load_ps(aPtr);
++ bVal = _mm256_load_ps(bPtr);
+
+- cVal = _mm256_min_ps(aVal, bVal);
++ cVal = _mm256_min_ps(aVal, bVal);
+
+- _mm256_store_ps(cPtr,cVal); // Store the results back into the C container
++ _mm256_store_ps(cPtr, cVal); // Store the results back into the C container
+
+- aPtr += 8;
+- bPtr += 8;
+- cPtr += 8;
+- }
++ aPtr += 8;
++ bPtr += 8;
++ cPtr += 8;
++ }
+
+- number = eighthPoints * 8;
+- for(;number < num_points; number++){
+- const float a = *aPtr++;
+- const float b = *bPtr++;
+- *cPtr++ = ( a < b ? a : b);
+- }
++ number = eighthPoints * 8;
++ for (; number < num_points; number++) {
++ const float a = *aPtr++;
++ const float b = *bPtr++;
++ *cPtr++ = (a < b ? a : b);
++ }
+ }
+ #endif /* LV_HAVE_AVX */
+
+ #ifdef LV_HAVE_AVX512F
+ #include <immintrin.h>
+
+-static inline void
+-volk_32f_x2_min_32f_a_avx512f(float* cVector, const float* aVector,
+- const float* bVector, unsigned int num_points)
++static inline void volk_32f_x2_min_32f_a_avx512f(float* cVector,
++ const float* aVector,
++ const float* bVector,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+- const unsigned int sixteenthPoints = num_points / 16;
++ unsigned int number = 0;
++ const unsigned int sixteenthPoints = num_points / 16;
+
+- float* cPtr = cVector;
+- const float* aPtr = aVector;
+- const float* bPtr= bVector;
++ float* cPtr = cVector;
++ const float* aPtr = aVector;
++ const float* bPtr = bVector;
+
+- __m512 aVal, bVal, cVal;
+- for(;number < sixteenthPoints; number++){
+- aVal = _mm512_load_ps(aPtr);
+- bVal = _mm512_load_ps(bPtr);
++ __m512 aVal, bVal, cVal;
++ for (; number < sixteenthPoints; number++) {
++ aVal = _mm512_load_ps(aPtr);
++ bVal = _mm512_load_ps(bPtr);
+
+- cVal = _mm512_min_ps(aVal, bVal);
++ cVal = _mm512_min_ps(aVal, bVal);
+
+- _mm512_store_ps(cPtr,cVal); // Store the results back into the C container
++ _mm512_store_ps(cPtr, cVal); // Store the results back into the C container
+
+- aPtr += 16;
+- bPtr += 16;
+- cPtr += 16;
+- }
++ aPtr += 16;
++ bPtr += 16;
++ cPtr += 16;
++ }
+
+- number = sixteenthPoints * 16;
+- for(;number < num_points; number++){
+- const float a = *aPtr++;
+- const float b = *bPtr++;
+- *cPtr++ = ( a < b ? a : b);
+- }
++ number = sixteenthPoints * 16;
++ for (; number < num_points; number++) {
++ const float a = *aPtr++;
++ const float b = *bPtr++;
++ *cPtr++ = (a < b ? a : b);
++ }
+ }
+ #endif /* LV_HAVE_AVX512F */
+
+@@ -267,74 +274,76 @@ volk_32f_x2_min_32f_a_avx512f(float* cVector, const float* aVector,
+ #ifdef LV_HAVE_AVX512F
+ #include <immintrin.h>
+
+-static inline void
+-volk_32f_x2_min_32f_u_avx512f(float* cVector, const float* aVector,
+- const float* bVector, unsigned int num_points)
++static inline void volk_32f_x2_min_32f_u_avx512f(float* cVector,
++ const float* aVector,
++ const float* bVector,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+- const unsigned int sixteenthPoints = num_points / 16;
++ unsigned int number = 0;
++ const unsigned int sixteenthPoints = num_points / 16;
+
+- float* cPtr = cVector;
+- const float* aPtr = aVector;
+- const float* bPtr= bVector;
++ float* cPtr = cVector;
++ const float* aPtr = aVector;
++ const float* bPtr = bVector;
+
+- __m512 aVal, bVal, cVal;
+- for(;number < sixteenthPoints; number++){
+- aVal = _mm512_loadu_ps(aPtr);
+- bVal = _mm512_loadu_ps(bPtr);
++ __m512 aVal, bVal, cVal;
++ for (; number < sixteenthPoints; number++) {
++ aVal = _mm512_loadu_ps(aPtr);
++ bVal = _mm512_loadu_ps(bPtr);
+
+- cVal = _mm512_min_ps(aVal, bVal);
++ cVal = _mm512_min_ps(aVal, bVal);
+
+- _mm512_storeu_ps(cPtr,cVal); // Store the results back into the C container
++ _mm512_storeu_ps(cPtr, cVal); // Store the results back into the C container
+
+- aPtr += 16;
+- bPtr += 16;
+- cPtr += 16;
+- }
++ aPtr += 16;
++ bPtr += 16;
++ cPtr += 16;
++ }
+
+- number = sixteenthPoints * 16;
+- for(;number < num_points; number++){
+- const float a = *aPtr++;
+- const float b = *bPtr++;
+- *cPtr++ = ( a < b ? a : b);
+- }
++ number = sixteenthPoints * 16;
++ for (; number < num_points; number++) {
++ const float a = *aPtr++;
++ const float b = *bPtr++;
++ *cPtr++ = (a < b ? a : b);
++ }
+ }
+ #endif /* LV_HAVE_AVX512F */
+
+ #ifdef LV_HAVE_AVX
+ #include <immintrin.h>
+
+-static inline void
+-volk_32f_x2_min_32f_u_avx(float* cVector, const float* aVector,
+- const float* bVector, unsigned int num_points)
++static inline void volk_32f_x2_min_32f_u_avx(float* cVector,
++ const float* aVector,
++ const float* bVector,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+- const unsigned int eighthPoints = num_points / 8;
++ unsigned int number = 0;
++ const unsigned int eighthPoints = num_points / 8;
+
+- float* cPtr = cVector;
+- const float* aPtr = aVector;
+- const float* bPtr= bVector;
++ float* cPtr = cVector;
++ const float* aPtr = aVector;
++ const float* bPtr = bVector;
+
+- __m256 aVal, bVal, cVal;
+- for(;number < eighthPoints; number++){
+- aVal = _mm256_loadu_ps(aPtr);
+- bVal = _mm256_loadu_ps(bPtr);
++ __m256 aVal, bVal, cVal;
++ for (; number < eighthPoints; number++) {
++ aVal = _mm256_loadu_ps(aPtr);
++ bVal = _mm256_loadu_ps(bPtr);
+
+- cVal = _mm256_min_ps(aVal, bVal);
++ cVal = _mm256_min_ps(aVal, bVal);
+
+- _mm256_storeu_ps(cPtr,cVal); // Store the results back into the C container
++ _mm256_storeu_ps(cPtr, cVal); // Store the results back into the C container
+
+- aPtr += 8;
+- bPtr += 8;
+- cPtr += 8;
+- }
++ aPtr += 8;
++ bPtr += 8;
++ cPtr += 8;
++ }
+
+- number = eighthPoints * 8;
+- for(;number < num_points; number++){
+- const float a = *aPtr++;
+- const float b = *bPtr++;
+- *cPtr++ = ( a < b ? a : b);
+- }
++ number = eighthPoints * 8;
++ for (; number < num_points; number++) {
++ const float a = *aPtr++;
++ const float b = *bPtr++;
++ *cPtr++ = (a < b ? a : b);
++ }
+ }
+ #endif /* LV_HAVE_AVX */
+
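As the max and min diffs above show, each kernel comes in _a_ (aligned) and _u_ (unaligned) flavours: the _a_ variants use aligned loads and stores (_mm_load_ps, _mm256_load_ps, _mm512_load_ps), while the _u_ variants use their unaligned counterparts (_mm_loadu_ps and friends). A buffer qualifies for the aligned path when its address is a multiple of volk_get_alignment(). The helper below is only an illustrative sketch; is_volk_aligned is not a VOLK API.

    #include <stdint.h>
    #include <volk/volk.h>

    /* Hypothetical helper: true if ptr satisfies the alignment the _a_ kernels assume. */
    static int is_volk_aligned(const void* ptr)
    {
        return ((uintptr_t)ptr % volk_get_alignment()) == 0;
    }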
+diff --git a/kernels/volk/volk_32f_x2_multiply_32f.h b/kernels/volk/volk_32f_x2_multiply_32f.h
+index deb9ae3..eebba18 100644
+--- a/kernels/volk/volk_32f_x2_multiply_32f.h
++++ b/kernels/volk/volk_32f_x2_multiply_32f.h
+@@ -31,8 +31,8 @@
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+- * void volk_32f_x2_multiply_32f(float* cVector, const float* aVector, const float* bVector, unsigned int num_points)
+- * \endcode
++ * void volk_32f_x2_multiply_32f(float* cVector, const float* aVector, const float*
++ * bVector, unsigned int num_points) \endcode
+ *
+ * \b Inputs
+ * \li aVector: First input vector.
+@@ -77,126 +77,130 @@
+ #ifdef LV_HAVE_SSE
+ #include <xmmintrin.h>
+
+-static inline void
+-volk_32f_x2_multiply_32f_u_sse(float* cVector, const float* aVector,
+- const float* bVector, unsigned int num_points)
++static inline void volk_32f_x2_multiply_32f_u_sse(float* cVector,
++ const float* aVector,
++ const float* bVector,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+- const unsigned int quarterPoints = num_points / 4;
++ unsigned int number = 0;
++ const unsigned int quarterPoints = num_points / 4;
+
+- float* cPtr = cVector;
+- const float* aPtr = aVector;
+- const float* bPtr= bVector;
++ float* cPtr = cVector;
++ const float* aPtr = aVector;
++ const float* bPtr = bVector;
+
+- __m128 aVal, bVal, cVal;
+- for(;number < quarterPoints; number++){
++ __m128 aVal, bVal, cVal;
++ for (; number < quarterPoints; number++) {
+
+- aVal = _mm_loadu_ps(aPtr);
+- bVal = _mm_loadu_ps(bPtr);
++ aVal = _mm_loadu_ps(aPtr);
++ bVal = _mm_loadu_ps(bPtr);
+
+- cVal = _mm_mul_ps(aVal, bVal);
++ cVal = _mm_mul_ps(aVal, bVal);
+
+- _mm_storeu_ps(cPtr,cVal); // Store the results back into the C container
++ _mm_storeu_ps(cPtr, cVal); // Store the results back into the C container
+
+- aPtr += 4;
+- bPtr += 4;
+- cPtr += 4;
+- }
++ aPtr += 4;
++ bPtr += 4;
++ cPtr += 4;
++ }
+
+- number = quarterPoints * 4;
+- for(;number < num_points; number++){
+- *cPtr++ = (*aPtr++) * (*bPtr++);
+- }
++ number = quarterPoints * 4;
++ for (; number < num_points; number++) {
++ *cPtr++ = (*aPtr++) * (*bPtr++);
++ }
+ }
+ #endif /* LV_HAVE_SSE */
+
+ #ifdef LV_HAVE_AVX512F
+ #include <immintrin.h>
+
+-static inline void
+-volk_32f_x2_multiply_32f_u_avx512f(float* cVector, const float* aVector,
+- const float* bVector, unsigned int num_points)
++static inline void volk_32f_x2_multiply_32f_u_avx512f(float* cVector,
++ const float* aVector,
++ const float* bVector,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+- const unsigned int sixteenthPoints = num_points / 16;
++ unsigned int number = 0;
++ const unsigned int sixteenthPoints = num_points / 16;
+
+- float* cPtr = cVector;
+- const float* aPtr = aVector;
+- const float* bPtr= bVector;
++ float* cPtr = cVector;
++ const float* aPtr = aVector;
++ const float* bPtr = bVector;
+
+- __m512 aVal, bVal, cVal;
+- for(;number < sixteenthPoints; number++){
++ __m512 aVal, bVal, cVal;
++ for (; number < sixteenthPoints; number++) {
+
+- aVal = _mm512_loadu_ps(aPtr);
+- bVal = _mm512_loadu_ps(bPtr);
++ aVal = _mm512_loadu_ps(aPtr);
++ bVal = _mm512_loadu_ps(bPtr);
+
+- cVal = _mm512_mul_ps(aVal, bVal);
++ cVal = _mm512_mul_ps(aVal, bVal);
+
+- _mm512_storeu_ps(cPtr,cVal); // Store the results back into the C container
++ _mm512_storeu_ps(cPtr, cVal); // Store the results back into the C container
+
+- aPtr += 16;
+- bPtr += 16;
+- cPtr += 16;
+- }
++ aPtr += 16;
++ bPtr += 16;
++ cPtr += 16;
++ }
+
+- number = sixteenthPoints * 16;
+- for(;number < num_points; number++){
+- *cPtr++ = (*aPtr++) * (*bPtr++);
+- }
++ number = sixteenthPoints * 16;
++ for (; number < num_points; number++) {
++ *cPtr++ = (*aPtr++) * (*bPtr++);
++ }
+ }
+ #endif /* LV_HAVE_AVX512F */
+
+ #ifdef LV_HAVE_AVX
+ #include <immintrin.h>
+
+-static inline void
+-volk_32f_x2_multiply_32f_u_avx(float* cVector, const float* aVector,
+- const float* bVector, unsigned int num_points)
++static inline void volk_32f_x2_multiply_32f_u_avx(float* cVector,
++ const float* aVector,
++ const float* bVector,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+- const unsigned int eighthPoints = num_points / 8;
++ unsigned int number = 0;
++ const unsigned int eighthPoints = num_points / 8;
+
+- float* cPtr = cVector;
+- const float* aPtr = aVector;
+- const float* bPtr= bVector;
++ float* cPtr = cVector;
++ const float* aPtr = aVector;
++ const float* bPtr = bVector;
+
+- __m256 aVal, bVal, cVal;
+- for(;number < eighthPoints; number++){
++ __m256 aVal, bVal, cVal;
++ for (; number < eighthPoints; number++) {
+
+- aVal = _mm256_loadu_ps(aPtr);
+- bVal = _mm256_loadu_ps(bPtr);
++ aVal = _mm256_loadu_ps(aPtr);
++ bVal = _mm256_loadu_ps(bPtr);
+
+- cVal = _mm256_mul_ps(aVal, bVal);
++ cVal = _mm256_mul_ps(aVal, bVal);
+
+- _mm256_storeu_ps(cPtr,cVal); // Store the results back into the C container
++ _mm256_storeu_ps(cPtr, cVal); // Store the results back into the C container
+
+- aPtr += 8;
+- bPtr += 8;
+- cPtr += 8;
+- }
++ aPtr += 8;
++ bPtr += 8;
++ cPtr += 8;
++ }
+
+- number = eighthPoints * 8;
+- for(;number < num_points; number++){
+- *cPtr++ = (*aPtr++) * (*bPtr++);
+- }
++ number = eighthPoints * 8;
++ for (; number < num_points; number++) {
++ *cPtr++ = (*aPtr++) * (*bPtr++);
++ }
+ }
+ #endif /* LV_HAVE_AVX */
+
+
+ #ifdef LV_HAVE_GENERIC
+
+-static inline void
+-volk_32f_x2_multiply_32f_generic(float* cVector, const float* aVector,
+- const float* bVector, unsigned int num_points)
++static inline void volk_32f_x2_multiply_32f_generic(float* cVector,
++ const float* aVector,
++ const float* bVector,
++ unsigned int num_points)
+ {
+- float* cPtr = cVector;
+- const float* aPtr = aVector;
+- const float* bPtr= bVector;
+- unsigned int number = 0;
+-
+- for(number = 0; number < num_points; number++){
+- *cPtr++ = (*aPtr++) * (*bPtr++);
+- }
++ float* cPtr = cVector;
++ const float* aPtr = aVector;
++ const float* bPtr = bVector;
++ unsigned int number = 0;
++
++ for (number = 0; number < num_points; number++) {
++ *cPtr++ = (*aPtr++) * (*bPtr++);
++ }
+ }
+ #endif /* LV_HAVE_GENERIC */
+
+@@ -213,72 +217,74 @@ volk_32f_x2_multiply_32f_generic(float* cVector, const float* aVector,
+ #ifdef LV_HAVE_SSE
+ #include <xmmintrin.h>
+
+-static inline void
+-volk_32f_x2_multiply_32f_a_sse(float* cVector, const float* aVector,
+- const float* bVector, unsigned int num_points)
++static inline void volk_32f_x2_multiply_32f_a_sse(float* cVector,
++ const float* aVector,
++ const float* bVector,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+- const unsigned int quarterPoints = num_points / 4;
++ unsigned int number = 0;
++ const unsigned int quarterPoints = num_points / 4;
+
+- float* cPtr = cVector;
+- const float* aPtr = aVector;
+- const float* bPtr= bVector;
++ float* cPtr = cVector;
++ const float* aPtr = aVector;
++ const float* bPtr = bVector;
+
+- __m128 aVal, bVal, cVal;
+- for(;number < quarterPoints; number++){
++ __m128 aVal, bVal, cVal;
++ for (; number < quarterPoints; number++) {
+
+- aVal = _mm_load_ps(aPtr);
+- bVal = _mm_load_ps(bPtr);
++ aVal = _mm_load_ps(aPtr);
++ bVal = _mm_load_ps(bPtr);
+
+- cVal = _mm_mul_ps(aVal, bVal);
++ cVal = _mm_mul_ps(aVal, bVal);
+
+- _mm_store_ps(cPtr,cVal); // Store the results back into the C container
++ _mm_store_ps(cPtr, cVal); // Store the results back into the C container
+
+- aPtr += 4;
+- bPtr += 4;
+- cPtr += 4;
+- }
++ aPtr += 4;
++ bPtr += 4;
++ cPtr += 4;
++ }
+
+- number = quarterPoints * 4;
+- for(;number < num_points; number++){
+- *cPtr++ = (*aPtr++) * (*bPtr++);
+- }
++ number = quarterPoints * 4;
++ for (; number < num_points; number++) {
++ *cPtr++ = (*aPtr++) * (*bPtr++);
++ }
+ }
+ #endif /* LV_HAVE_SSE */
+
+ #ifdef LV_HAVE_AVX512F
+ #include <immintrin.h>
+
+-static inline void
+-volk_32f_x2_multiply_32f_a_avx512f(float* cVector, const float* aVector,
+- const float* bVector, unsigned int num_points)
++static inline void volk_32f_x2_multiply_32f_a_avx512f(float* cVector,
++ const float* aVector,
++ const float* bVector,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+- const unsigned int sixteenthPoints = num_points / 16;
++ unsigned int number = 0;
++ const unsigned int sixteenthPoints = num_points / 16;
+
+- float* cPtr = cVector;
+- const float* aPtr = aVector;
+- const float* bPtr= bVector;
++ float* cPtr = cVector;
++ const float* aPtr = aVector;
++ const float* bPtr = bVector;
+
+- __m512 aVal, bVal, cVal;
+- for(;number < sixteenthPoints; number++){
++ __m512 aVal, bVal, cVal;
++ for (; number < sixteenthPoints; number++) {
+
+- aVal = _mm512_load_ps(aPtr);
+- bVal = _mm512_load_ps(bPtr);
++ aVal = _mm512_load_ps(aPtr);
++ bVal = _mm512_load_ps(bPtr);
+
+- cVal = _mm512_mul_ps(aVal, bVal);
++ cVal = _mm512_mul_ps(aVal, bVal);
+
+- _mm512_store_ps(cPtr,cVal); // Store the results back into the C container
++ _mm512_store_ps(cPtr, cVal); // Store the results back into the C container
+
+- aPtr += 16;
+- bPtr += 16;
+- cPtr += 16;
+- }
++ aPtr += 16;
++ bPtr += 16;
++ cPtr += 16;
++ }
+
+- number = sixteenthPoints * 16;
+- for(;number < num_points; number++){
+- *cPtr++ = (*aPtr++) * (*bPtr++);
+- }
++ number = sixteenthPoints * 16;
++ for (; number < num_points; number++) {
++ *cPtr++ = (*aPtr++) * (*bPtr++);
++ }
+ }
+ #endif /* LV_HAVE_AVX512F */
+
+@@ -286,36 +292,37 @@ volk_32f_x2_multiply_32f_a_avx512f(float* cVector, const float* aVector,
+ #ifdef LV_HAVE_AVX
+ #include <immintrin.h>
+
+-static inline void
+-volk_32f_x2_multiply_32f_a_avx(float* cVector, const float* aVector,
+- const float* bVector, unsigned int num_points)
++static inline void volk_32f_x2_multiply_32f_a_avx(float* cVector,
++ const float* aVector,
++ const float* bVector,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+- const unsigned int eighthPoints = num_points / 8;
++ unsigned int number = 0;
++ const unsigned int eighthPoints = num_points / 8;
+
+- float* cPtr = cVector;
+- const float* aPtr = aVector;
+- const float* bPtr= bVector;
++ float* cPtr = cVector;
++ const float* aPtr = aVector;
++ const float* bPtr = bVector;
+
+- __m256 aVal, bVal, cVal;
+- for(;number < eighthPoints; number++){
++ __m256 aVal, bVal, cVal;
++ for (; number < eighthPoints; number++) {
+
+- aVal = _mm256_load_ps(aPtr);
+- bVal = _mm256_load_ps(bPtr);
++ aVal = _mm256_load_ps(aPtr);
++ bVal = _mm256_load_ps(bPtr);
+
+- cVal = _mm256_mul_ps(aVal, bVal);
++ cVal = _mm256_mul_ps(aVal, bVal);
+
+- _mm256_store_ps(cPtr,cVal); // Store the results back into the C container
++ _mm256_store_ps(cPtr, cVal); // Store the results back into the C container
+
+- aPtr += 8;
+- bPtr += 8;
+- cPtr += 8;
+- }
++ aPtr += 8;
++ bPtr += 8;
++ cPtr += 8;
++ }
+
+- number = eighthPoints * 8;
+- for(;number < num_points; number++){
+- *cPtr++ = (*aPtr++) * (*bPtr++);
+- }
++ number = eighthPoints * 8;
++ for (; number < num_points; number++) {
++ *cPtr++ = (*aPtr++) * (*bPtr++);
++ }
+ }
+ #endif /* LV_HAVE_AVX */
+
+@@ -323,57 +330,61 @@ volk_32f_x2_multiply_32f_a_avx(float* cVector, const float* aVector,
+ #ifdef LV_HAVE_NEON
+ #include <arm_neon.h>
+
+-static inline void
+-volk_32f_x2_multiply_32f_neon(float* cVector, const float* aVector,
+- const float* bVector, unsigned int num_points)
++static inline void volk_32f_x2_multiply_32f_neon(float* cVector,
++ const float* aVector,
++ const float* bVector,
++ unsigned int num_points)
+ {
+- const unsigned int quarter_points = num_points / 4;
+- unsigned int number;
+- float32x4_t avec, bvec, cvec;
+- for(number=0; number < quarter_points; ++number) {
+- avec = vld1q_f32(aVector);
+- bvec = vld1q_f32(bVector);
+- cvec = vmulq_f32(avec, bvec);
+- vst1q_f32(cVector, cvec);
+- aVector += 4;
+- bVector += 4;
+- cVector += 4;
+- }
+- for(number=quarter_points*4; number < num_points; ++number) {
+- *cVector++ = *aVector++ * *bVector++;
+- }
++ const unsigned int quarter_points = num_points / 4;
++ unsigned int number;
++ float32x4_t avec, bvec, cvec;
++ for (number = 0; number < quarter_points; ++number) {
++ avec = vld1q_f32(aVector);
++ bvec = vld1q_f32(bVector);
++ cvec = vmulq_f32(avec, bvec);
++ vst1q_f32(cVector, cvec);
++ aVector += 4;
++ bVector += 4;
++ cVector += 4;
++ }
++ for (number = quarter_points * 4; number < num_points; ++number) {
++ *cVector++ = *aVector++ * *bVector++;
++ }
+ }
+ #endif /* LV_HAVE_NEON */
+
+
+ #ifdef LV_HAVE_GENERIC
+
+-static inline void
+-volk_32f_x2_multiply_32f_a_generic(float* cVector, const float* aVector,
+- const float* bVector, unsigned int num_points)
++static inline void volk_32f_x2_multiply_32f_a_generic(float* cVector,
++ const float* aVector,
++ const float* bVector,
++ unsigned int num_points)
+ {
+- float* cPtr = cVector;
+- const float* aPtr = aVector;
+- const float* bPtr= bVector;
+- unsigned int number = 0;
+-
+- for(number = 0; number < num_points; number++){
+- *cPtr++ = (*aPtr++) * (*bPtr++);
+- }
++ float* cPtr = cVector;
++ const float* aPtr = aVector;
++ const float* bPtr = bVector;
++ unsigned int number = 0;
++
++ for (number = 0; number < num_points; number++) {
++ *cPtr++ = (*aPtr++) * (*bPtr++);
++ }
+ }
+ #endif /* LV_HAVE_GENERIC */
+
+
+ #ifdef LV_HAVE_ORC
+-extern void
+-volk_32f_x2_multiply_32f_a_orc_impl(float* cVector, const float* aVector,
+- const float* bVector, unsigned int num_points);
+-
+-static inline void
+-volk_32f_x2_multiply_32f_u_orc(float* cVector, const float* aVector,
+- const float* bVector, unsigned int num_points)
++extern void volk_32f_x2_multiply_32f_a_orc_impl(float* cVector,
++ const float* aVector,
++ const float* bVector,
++ unsigned int num_points);
++
++static inline void volk_32f_x2_multiply_32f_u_orc(float* cVector,
++ const float* aVector,
++ const float* bVector,
++ unsigned int num_points)
+ {
+- volk_32f_x2_multiply_32f_a_orc_impl(cVector, aVector, bVector, num_points);
++ volk_32f_x2_multiply_32f_a_orc_impl(cVector, aVector, bVector, num_points);
+ }
+ #endif /* LV_HAVE_ORC */
+
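The volk_32f_x2_pow_32f kernel whose diff follows computes c[i] = a[i] raised to the power b[i] as exp(b * ln(a)): the SIMD branches extract the exponent bits of a, approximate the log2 of the mantissa with a low-degree polynomial (the POLYn macros), scale by ln(2), multiply by b, and push the product through a range-reduced exp approximation. A scalar sketch of that identity is shown below, assuming positive bases; pow_via_exp_log is an illustrative name, not part of VOLK, and the kernel's generic fallback simply calls powf.

    #include <math.h>

    /* Scalar reference for the identity the SIMD code vectorizes:
     * a^b = exp(b * ln(a)), valid for a > 0.
     * Argument order mirrors the kernel: output, exponents, bases. */
    static void pow_via_exp_log(float* c, const float* b, const float* a, unsigned int n)
    {
        for (unsigned int i = 0; i < n; i++) {
            c[i] = expf(b[i] * logf(a[i]));
        }
    }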
+diff --git a/kernels/volk/volk_32f_x2_pow_32f.h b/kernels/volk/volk_32f_x2_pow_32f.h
+index daa7f4e..106c57b 100644
+--- a/kernels/volk/volk_32f_x2_pow_32f.h
++++ b/kernels/volk/volk_32f_x2_pow_32f.h
+@@ -31,8 +31,8 @@
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+- * void volk_32f_x2_pow_32f(float* cVector, const float* bVector, const float* aVector, unsigned int num_points)
+- * \endcode
++ * void volk_32f_x2_pow_32f(float* cVector, const float* bVector, const float* aVector,
++ * unsigned int num_points) \endcode
+ *
+ * \b Inputs
+ * \li bVector: The input vector of indices (power values).
+@@ -71,10 +71,10 @@
+ #ifndef INCLUDED_volk_32f_x2_pow_32f_a_H
+ #define INCLUDED_volk_32f_x2_pow_32f_a_H
+
+-#include <stdio.h>
+-#include <stdlib.h>
+ #include <inttypes.h>
+ #include <math.h>
++#include <stdio.h>
++#include <stdlib.h>
+
+ #define POW_POLY_DEGREE 3
+
+@@ -82,99 +82,130 @@
+ #include <immintrin.h>
+
+ #define POLY0_AVX2_FMA(x, c0) _mm256_set1_ps(c0)
+-#define POLY1_AVX2_FMA(x, c0, c1) _mm256_fmadd_ps(POLY0_AVX2_FMA(x, c1), x, _mm256_set1_ps(c0))
+-#define POLY2_AVX2_FMA(x, c0, c1, c2) _mm256_fmadd_ps(POLY1_AVX2_FMA(x, c1, c2), x, _mm256_set1_ps(c0))
+-#define POLY3_AVX2_FMA(x, c0, c1, c2, c3) _mm256_fmadd_ps(POLY2_AVX2_FMA(x, c1, c2, c3), x, _mm256_set1_ps(c0))
+-#define POLY4_AVX2_FMA(x, c0, c1, c2, c3, c4) _mm256_fmadd_ps(POLY3_AVX2_FMA(x, c1, c2, c3, c4), x, _mm256_set1_ps(c0))
+-#define POLY5_AVX2_FMA(x, c0, c1, c2, c3, c4, c5) _mm256_fmadd_ps(POLY4_AVX2_FMA(x, c1, c2, c3, c4, c5), x, _mm256_set1_ps(c0))
+-
+-static inline void
+-volk_32f_x2_pow_32f_a_avx2_fma(float* cVector, const float* bVector,
+- const float* aVector, unsigned int num_points)
++#define POLY1_AVX2_FMA(x, c0, c1) \
++ _mm256_fmadd_ps(POLY0_AVX2_FMA(x, c1), x, _mm256_set1_ps(c0))
++#define POLY2_AVX2_FMA(x, c0, c1, c2) \
++ _mm256_fmadd_ps(POLY1_AVX2_FMA(x, c1, c2), x, _mm256_set1_ps(c0))
++#define POLY3_AVX2_FMA(x, c0, c1, c2, c3) \
++ _mm256_fmadd_ps(POLY2_AVX2_FMA(x, c1, c2, c3), x, _mm256_set1_ps(c0))
++#define POLY4_AVX2_FMA(x, c0, c1, c2, c3, c4) \
++ _mm256_fmadd_ps(POLY3_AVX2_FMA(x, c1, c2, c3, c4), x, _mm256_set1_ps(c0))
++#define POLY5_AVX2_FMA(x, c0, c1, c2, c3, c4, c5) \
++ _mm256_fmadd_ps(POLY4_AVX2_FMA(x, c1, c2, c3, c4, c5), x, _mm256_set1_ps(c0))
++
++static inline void volk_32f_x2_pow_32f_a_avx2_fma(float* cVector,
++ const float* bVector,
++ const float* aVector,
++ unsigned int num_points)
+ {
+- float* cPtr = cVector;
+- const float* bPtr = bVector;
+- const float* aPtr = aVector;
+-
+- unsigned int number = 0;
+- const unsigned int eighthPoints = num_points / 8;
+-
+- __m256 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne;
+- __m256 tmp, fx, mask, pow2n, z, y;
+- __m256 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2;
+- __m256 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
+- __m256i bias, exp, emm0, pi32_0x7f;
+-
+- one = _mm256_set1_ps(1.0);
+- exp_hi = _mm256_set1_ps(88.3762626647949);
+- exp_lo = _mm256_set1_ps(-88.3762626647949);
+- ln2 = _mm256_set1_ps(0.6931471805);
+- log2EF = _mm256_set1_ps(1.44269504088896341);
+- half = _mm256_set1_ps(0.5);
+- exp_C1 = _mm256_set1_ps(0.693359375);
+- exp_C2 = _mm256_set1_ps(-2.12194440e-4);
+- pi32_0x7f = _mm256_set1_epi32(0x7f);
+-
+- exp_p0 = _mm256_set1_ps(1.9875691500e-4);
+- exp_p1 = _mm256_set1_ps(1.3981999507e-3);
+- exp_p2 = _mm256_set1_ps(8.3334519073e-3);
+- exp_p3 = _mm256_set1_ps(4.1665795894e-2);
+- exp_p4 = _mm256_set1_ps(1.6666665459e-1);
+- exp_p5 = _mm256_set1_ps(5.0000001201e-1);
+-
+- for(;number < eighthPoints; number++){
+- // First compute the logarithm
+- aVal = _mm256_load_ps(aPtr);
+- bias = _mm256_set1_epi32(127);
+- leadingOne = _mm256_set1_ps(1.0f);
+- exp = _mm256_sub_epi32(_mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal), _mm256_set1_epi32(0x7f800000)), 23), bias);
+- logarithm = _mm256_cvtepi32_ps(exp);
+-
+- frac = _mm256_or_ps(leadingOne, _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));
++ float* cPtr = cVector;
++ const float* bPtr = bVector;
++ const float* aPtr = aVector;
++
++ unsigned int number = 0;
++ const unsigned int eighthPoints = num_points / 8;
++
++ __m256 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne;
++ __m256 tmp, fx, mask, pow2n, z, y;
++ __m256 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2;
++ __m256 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
++ __m256i bias, exp, emm0, pi32_0x7f;
++
++ one = _mm256_set1_ps(1.0);
++ exp_hi = _mm256_set1_ps(88.3762626647949);
++ exp_lo = _mm256_set1_ps(-88.3762626647949);
++ ln2 = _mm256_set1_ps(0.6931471805);
++ log2EF = _mm256_set1_ps(1.44269504088896341);
++ half = _mm256_set1_ps(0.5);
++ exp_C1 = _mm256_set1_ps(0.693359375);
++ exp_C2 = _mm256_set1_ps(-2.12194440e-4);
++ pi32_0x7f = _mm256_set1_epi32(0x7f);
++
++ exp_p0 = _mm256_set1_ps(1.9875691500e-4);
++ exp_p1 = _mm256_set1_ps(1.3981999507e-3);
++ exp_p2 = _mm256_set1_ps(8.3334519073e-3);
++ exp_p3 = _mm256_set1_ps(4.1665795894e-2);
++ exp_p4 = _mm256_set1_ps(1.6666665459e-1);
++ exp_p5 = _mm256_set1_ps(5.0000001201e-1);
++
++ for (; number < eighthPoints; number++) {
++ // First compute the logarithm
++ aVal = _mm256_load_ps(aPtr);
++ bias = _mm256_set1_epi32(127);
++ leadingOne = _mm256_set1_ps(1.0f);
++ exp = _mm256_sub_epi32(
++ _mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal),
++ _mm256_set1_epi32(0x7f800000)),
++ 23),
++ bias);
++ logarithm = _mm256_cvtepi32_ps(exp);
++
++ frac = _mm256_or_ps(
++ leadingOne,
++ _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));
+
+ #if POW_POLY_DEGREE == 6
+- mantissa = POLY5_AVX2_FMA( frac, 3.1157899f, -3.3241990f, 2.5988452f, -1.2315303f, 3.1821337e-1f, -3.4436006e-2f);
++ mantissa = POLY5_AVX2_FMA(frac,
++ 3.1157899f,
++ -3.3241990f,
++ 2.5988452f,
++ -1.2315303f,
++ 3.1821337e-1f,
++ -3.4436006e-2f);
+ #elif POW_POLY_DEGREE == 5
+- mantissa = POLY4_AVX2_FMA( frac, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f);
++ mantissa = POLY4_AVX2_FMA(frac,
++ 2.8882704548164776201f,
++ -2.52074962577807006663f,
++ 1.48116647521213171641f,
++ -0.465725644288844778798f,
++ 0.0596515482674574969533f);
+ #elif POW_POLY_DEGREE == 4
+- mantissa = POLY3_AVX2_FMA( frac, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f);
++ mantissa = POLY3_AVX2_FMA(frac,
++ 2.61761038894603480148f,
++ -1.75647175389045657003f,
++ 0.688243882994381274313f,
++ -0.107254423828329604454f);
+ #elif POW_POLY_DEGREE == 3
+- mantissa = POLY2_AVX2_FMA( frac, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f);
++ mantissa = POLY2_AVX2_FMA(frac,
++ 2.28330284476918490682f,
++ -1.04913055217340124191f,
++ 0.204446009836232697516f);
+ #else
+ #error
+ #endif
+
+- logarithm = _mm256_fmadd_ps(mantissa, _mm256_sub_ps(frac, leadingOne), logarithm);
+- logarithm = _mm256_mul_ps(logarithm, ln2);
++ logarithm = _mm256_fmadd_ps(mantissa, _mm256_sub_ps(frac, leadingOne), logarithm);
++ logarithm = _mm256_mul_ps(logarithm, ln2);
+
+- // Now calculate b*lna
+- bVal = _mm256_load_ps(bPtr);
+- bVal = _mm256_mul_ps(bVal, logarithm);
++ // Now calculate b*lna
++ bVal = _mm256_load_ps(bPtr);
++ bVal = _mm256_mul_ps(bVal, logarithm);
+
+- // Now compute exp(b*lna)
+- bVal = _mm256_max_ps(_mm256_min_ps(bVal, exp_hi), exp_lo);
++ // Now compute exp(b*lna)
++ bVal = _mm256_max_ps(_mm256_min_ps(bVal, exp_hi), exp_lo);
+
+- fx = _mm256_fmadd_ps(bVal, log2EF, half);
++ fx = _mm256_fmadd_ps(bVal, log2EF, half);
+
+- emm0 = _mm256_cvttps_epi32(fx);
+- tmp = _mm256_cvtepi32_ps(emm0);
++ emm0 = _mm256_cvttps_epi32(fx);
++ tmp = _mm256_cvtepi32_ps(emm0);
+
+- mask = _mm256_and_ps(_mm256_cmp_ps(tmp, fx, _CMP_GT_OS), one);
+- fx = _mm256_sub_ps(tmp, mask);
++ mask = _mm256_and_ps(_mm256_cmp_ps(tmp, fx, _CMP_GT_OS), one);
++ fx = _mm256_sub_ps(tmp, mask);
+
+- tmp = _mm256_fnmadd_ps(fx, exp_C1, bVal);
+- bVal = _mm256_fnmadd_ps(fx, exp_C2, tmp);
+- z = _mm256_mul_ps(bVal, bVal);
++ tmp = _mm256_fnmadd_ps(fx, exp_C1, bVal);
++ bVal = _mm256_fnmadd_ps(fx, exp_C2, tmp);
++ z = _mm256_mul_ps(bVal, bVal);
+
+- y = _mm256_fmadd_ps(exp_p0, bVal, exp_p1);
+- y = _mm256_fmadd_ps(y, bVal, exp_p2);
+- y = _mm256_fmadd_ps(y, bVal, exp_p3);
+- y = _mm256_fmadd_ps(y, bVal, exp_p4);
+- y = _mm256_fmadd_ps(y, bVal, exp_p5);
+- y = _mm256_fmadd_ps(y, z, bVal);
+- y = _mm256_add_ps(y, one);
++ y = _mm256_fmadd_ps(exp_p0, bVal, exp_p1);
++ y = _mm256_fmadd_ps(y, bVal, exp_p2);
++ y = _mm256_fmadd_ps(y, bVal, exp_p3);
++ y = _mm256_fmadd_ps(y, bVal, exp_p4);
++ y = _mm256_fmadd_ps(y, bVal, exp_p5);
++ y = _mm256_fmadd_ps(y, z, bVal);
++ y = _mm256_add_ps(y, one);
+
+- emm0 = _mm256_slli_epi32(_mm256_add_epi32(_mm256_cvttps_epi32(fx), pi32_0x7f), 23);
++ emm0 =
++ _mm256_slli_epi32(_mm256_add_epi32(_mm256_cvttps_epi32(fx), pi32_0x7f), 23);
+
+ pow2n = _mm256_castsi256_ps(emm0);
+ cVal = _mm256_mul_ps(y, pow2n);
+@@ -184,12 +215,12 @@ volk_32f_x2_pow_32f_a_avx2_fma(float* cVector, const float* bVector,
+ aPtr += 8;
+ bPtr += 8;
+ cPtr += 8;
+- }
++ }
+
+- number = eighthPoints * 8;
+- for(;number < num_points; number++){
+- *cPtr++ = pow(*aPtr++, *bPtr++);
+- }
++ number = eighthPoints * 8;
++ for (; number < num_points; number++) {
++ *cPtr++ = pow(*aPtr++, *bPtr++);
++ }
+ }
+
+ #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for aligned */
+@@ -198,99 +229,131 @@ volk_32f_x2_pow_32f_a_avx2_fma(float* cVector, const float* bVector,
+ #include <immintrin.h>
+
+ #define POLY0_AVX2(x, c0) _mm256_set1_ps(c0)
+-#define POLY1_AVX2(x, c0, c1) _mm256_add_ps(_mm256_mul_ps(POLY0_AVX2(x, c1), x), _mm256_set1_ps(c0))
+-#define POLY2_AVX2(x, c0, c1, c2) _mm256_add_ps(_mm256_mul_ps(POLY1_AVX2(x, c1, c2), x), _mm256_set1_ps(c0))
+-#define POLY3_AVX2(x, c0, c1, c2, c3) _mm256_add_ps(_mm256_mul_ps(POLY2_AVX2(x, c1, c2, c3), x), _mm256_set1_ps(c0))
+-#define POLY4_AVX2(x, c0, c1, c2, c3, c4) _mm256_add_ps(_mm256_mul_ps(POLY3_AVX2(x, c1, c2, c3, c4), x), _mm256_set1_ps(c0))
+-#define POLY5_AVX2(x, c0, c1, c2, c3, c4, c5) _mm256_add_ps(_mm256_mul_ps(POLY4_AVX2(x, c1, c2, c3, c4, c5), x), _mm256_set1_ps(c0))
+-
+-static inline void
+-volk_32f_x2_pow_32f_a_avx2(float* cVector, const float* bVector,
+- const float* aVector, unsigned int num_points)
++#define POLY1_AVX2(x, c0, c1) \
++ _mm256_add_ps(_mm256_mul_ps(POLY0_AVX2(x, c1), x), _mm256_set1_ps(c0))
++#define POLY2_AVX2(x, c0, c1, c2) \
++ _mm256_add_ps(_mm256_mul_ps(POLY1_AVX2(x, c1, c2), x), _mm256_set1_ps(c0))
++#define POLY3_AVX2(x, c0, c1, c2, c3) \
++ _mm256_add_ps(_mm256_mul_ps(POLY2_AVX2(x, c1, c2, c3), x), _mm256_set1_ps(c0))
++#define POLY4_AVX2(x, c0, c1, c2, c3, c4) \
++ _mm256_add_ps(_mm256_mul_ps(POLY3_AVX2(x, c1, c2, c3, c4), x), _mm256_set1_ps(c0))
++#define POLY5_AVX2(x, c0, c1, c2, c3, c4, c5) \
++ _mm256_add_ps(_mm256_mul_ps(POLY4_AVX2(x, c1, c2, c3, c4, c5), x), _mm256_set1_ps(c0))
++
++static inline void volk_32f_x2_pow_32f_a_avx2(float* cVector,
++ const float* bVector,
++ const float* aVector,
++ unsigned int num_points)
+ {
+- float* cPtr = cVector;
+- const float* bPtr = bVector;
+- const float* aPtr = aVector;
+-
+- unsigned int number = 0;
+- const unsigned int eighthPoints = num_points / 8;
+-
+- __m256 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne;
+- __m256 tmp, fx, mask, pow2n, z, y;
+- __m256 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2;
+- __m256 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
+- __m256i bias, exp, emm0, pi32_0x7f;
+-
+- one = _mm256_set1_ps(1.0);
+- exp_hi = _mm256_set1_ps(88.3762626647949);
+- exp_lo = _mm256_set1_ps(-88.3762626647949);
+- ln2 = _mm256_set1_ps(0.6931471805);
+- log2EF = _mm256_set1_ps(1.44269504088896341);
+- half = _mm256_set1_ps(0.5);
+- exp_C1 = _mm256_set1_ps(0.693359375);
+- exp_C2 = _mm256_set1_ps(-2.12194440e-4);
+- pi32_0x7f = _mm256_set1_epi32(0x7f);
+-
+- exp_p0 = _mm256_set1_ps(1.9875691500e-4);
+- exp_p1 = _mm256_set1_ps(1.3981999507e-3);
+- exp_p2 = _mm256_set1_ps(8.3334519073e-3);
+- exp_p3 = _mm256_set1_ps(4.1665795894e-2);
+- exp_p4 = _mm256_set1_ps(1.6666665459e-1);
+- exp_p5 = _mm256_set1_ps(5.0000001201e-1);
+-
+- for(;number < eighthPoints; number++){
+- // First compute the logarithm
+- aVal = _mm256_load_ps(aPtr);
+- bias = _mm256_set1_epi32(127);
+- leadingOne = _mm256_set1_ps(1.0f);
+- exp = _mm256_sub_epi32(_mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal), _mm256_set1_epi32(0x7f800000)), 23), bias);
+- logarithm = _mm256_cvtepi32_ps(exp);
+-
+- frac = _mm256_or_ps(leadingOne, _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));
++ float* cPtr = cVector;
++ const float* bPtr = bVector;
++ const float* aPtr = aVector;
++
++ unsigned int number = 0;
++ const unsigned int eighthPoints = num_points / 8;
++
++ __m256 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne;
++ __m256 tmp, fx, mask, pow2n, z, y;
++ __m256 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2;
++ __m256 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
++ __m256i bias, exp, emm0, pi32_0x7f;
++
++ one = _mm256_set1_ps(1.0);
++ exp_hi = _mm256_set1_ps(88.3762626647949);
++ exp_lo = _mm256_set1_ps(-88.3762626647949);
++ ln2 = _mm256_set1_ps(0.6931471805);
++ log2EF = _mm256_set1_ps(1.44269504088896341);
++ half = _mm256_set1_ps(0.5);
++ exp_C1 = _mm256_set1_ps(0.693359375);
++ exp_C2 = _mm256_set1_ps(-2.12194440e-4);
++ pi32_0x7f = _mm256_set1_epi32(0x7f);
++
++ exp_p0 = _mm256_set1_ps(1.9875691500e-4);
++ exp_p1 = _mm256_set1_ps(1.3981999507e-3);
++ exp_p2 = _mm256_set1_ps(8.3334519073e-3);
++ exp_p3 = _mm256_set1_ps(4.1665795894e-2);
++ exp_p4 = _mm256_set1_ps(1.6666665459e-1);
++ exp_p5 = _mm256_set1_ps(5.0000001201e-1);
++
++ for (; number < eighthPoints; number++) {
++ // First compute the logarithm
++ aVal = _mm256_load_ps(aPtr);
++ bias = _mm256_set1_epi32(127);
++ leadingOne = _mm256_set1_ps(1.0f);
++ exp = _mm256_sub_epi32(
++ _mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal),
++ _mm256_set1_epi32(0x7f800000)),
++ 23),
++ bias);
++ logarithm = _mm256_cvtepi32_ps(exp);
++
++ frac = _mm256_or_ps(
++ leadingOne,
++ _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));
+
+ #if POW_POLY_DEGREE == 6
+- mantissa = POLY5_AVX2( frac, 3.1157899f, -3.3241990f, 2.5988452f, -1.2315303f, 3.1821337e-1f, -3.4436006e-2f);
++ mantissa = POLY5_AVX2(frac,
++ 3.1157899f,
++ -3.3241990f,
++ 2.5988452f,
++ -1.2315303f,
++ 3.1821337e-1f,
++ -3.4436006e-2f);
+ #elif POW_POLY_DEGREE == 5
+- mantissa = POLY4_AVX2( frac, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f);
++ mantissa = POLY4_AVX2(frac,
++ 2.8882704548164776201f,
++ -2.52074962577807006663f,
++ 1.48116647521213171641f,
++ -0.465725644288844778798f,
++ 0.0596515482674574969533f);
+ #elif POW_POLY_DEGREE == 4
+- mantissa = POLY3_AVX2( frac, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f);
++ mantissa = POLY3_AVX2(frac,
++ 2.61761038894603480148f,
++ -1.75647175389045657003f,
++ 0.688243882994381274313f,
++ -0.107254423828329604454f);
+ #elif POW_POLY_DEGREE == 3
+- mantissa = POLY2_AVX2( frac, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f);
++ mantissa = POLY2_AVX2(frac,
++ 2.28330284476918490682f,
++ -1.04913055217340124191f,
++ 0.204446009836232697516f);
+ #else
+ #error
+ #endif
+
+- logarithm = _mm256_add_ps(_mm256_mul_ps(mantissa, _mm256_sub_ps(frac, leadingOne)), logarithm);
+- logarithm = _mm256_mul_ps(logarithm, ln2);
++ logarithm = _mm256_add_ps(
++ _mm256_mul_ps(mantissa, _mm256_sub_ps(frac, leadingOne)), logarithm);
++ logarithm = _mm256_mul_ps(logarithm, ln2);
+
+- // Now calculate b*lna
+- bVal = _mm256_load_ps(bPtr);
+- bVal = _mm256_mul_ps(bVal, logarithm);
++ // Now calculate b*lna
++ bVal = _mm256_load_ps(bPtr);
++ bVal = _mm256_mul_ps(bVal, logarithm);
+
+- // Now compute exp(b*lna)
+- bVal = _mm256_max_ps(_mm256_min_ps(bVal, exp_hi), exp_lo);
++ // Now compute exp(b*lna)
++ bVal = _mm256_max_ps(_mm256_min_ps(bVal, exp_hi), exp_lo);
+
+- fx = _mm256_add_ps(_mm256_mul_ps(bVal, log2EF), half);
++ fx = _mm256_add_ps(_mm256_mul_ps(bVal, log2EF), half);
+
+- emm0 = _mm256_cvttps_epi32(fx);
+- tmp = _mm256_cvtepi32_ps(emm0);
++ emm0 = _mm256_cvttps_epi32(fx);
++ tmp = _mm256_cvtepi32_ps(emm0);
+
+- mask = _mm256_and_ps(_mm256_cmp_ps(tmp, fx, _CMP_GT_OS), one);
+- fx = _mm256_sub_ps(tmp, mask);
++ mask = _mm256_and_ps(_mm256_cmp_ps(tmp, fx, _CMP_GT_OS), one);
++ fx = _mm256_sub_ps(tmp, mask);
+
+- tmp = _mm256_sub_ps(bVal, _mm256_mul_ps(fx, exp_C1));
+- bVal = _mm256_sub_ps(tmp, _mm256_mul_ps(fx, exp_C2));
+- z = _mm256_mul_ps(bVal, bVal);
++ tmp = _mm256_sub_ps(bVal, _mm256_mul_ps(fx, exp_C1));
++ bVal = _mm256_sub_ps(tmp, _mm256_mul_ps(fx, exp_C2));
++ z = _mm256_mul_ps(bVal, bVal);
+
+- y = _mm256_add_ps(_mm256_mul_ps(exp_p0, bVal), exp_p1);
+- y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p2);
+- y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p3);
+- y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p4);
+- y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p5);
+- y = _mm256_add_ps(_mm256_mul_ps(y, z), bVal);
+- y = _mm256_add_ps(y, one);
++ y = _mm256_add_ps(_mm256_mul_ps(exp_p0, bVal), exp_p1);
++ y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p2);
++ y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p3);
++ y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p4);
++ y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p5);
++ y = _mm256_add_ps(_mm256_mul_ps(y, z), bVal);
++ y = _mm256_add_ps(y, one);
+
+- emm0 = _mm256_slli_epi32(_mm256_add_epi32(_mm256_cvttps_epi32(fx), pi32_0x7f), 23);
++ emm0 =
++ _mm256_slli_epi32(_mm256_add_epi32(_mm256_cvttps_epi32(fx), pi32_0x7f), 23);
+
+ pow2n = _mm256_castsi256_ps(emm0);
+ cVal = _mm256_mul_ps(y, pow2n);
+@@ -300,12 +363,12 @@ volk_32f_x2_pow_32f_a_avx2(float* cVector, const float* bVector,
+ aPtr += 8;
+ bPtr += 8;
+ cPtr += 8;
+- }
++ }
+
+- number = eighthPoints * 8;
+- for(;number < num_points; number++){
+- *cPtr++ = pow(*aPtr++, *bPtr++);
+- }
++ number = eighthPoints * 8;
++ for (; number < num_points; number++) {
++ *cPtr++ = pow(*aPtr++, *bPtr++);
++ }
+ }
+
+ #endif /* LV_HAVE_AVX2 for aligned */
+@@ -317,97 +380,124 @@ volk_32f_x2_pow_32f_a_avx2(float* cVector, const float* bVector,
+ #define POLY0(x, c0) _mm_set1_ps(c0)
+ #define POLY1(x, c0, c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), _mm_set1_ps(c0))
+ #define POLY2(x, c0, c1, c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), _mm_set1_ps(c0))
+-#define POLY3(x, c0, c1, c2, c3) _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0))
+-#define POLY4(x, c0, c1, c2, c3, c4) _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0))
+-#define POLY5(x, c0, c1, c2, c3, c4, c5) _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0))
+-
+-static inline void
+-volk_32f_x2_pow_32f_a_sse4_1(float* cVector, const float* bVector,
+- const float* aVector, unsigned int num_points)
++#define POLY3(x, c0, c1, c2, c3) \
++ _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0))
++#define POLY4(x, c0, c1, c2, c3, c4) \
++ _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0))
++#define POLY5(x, c0, c1, c2, c3, c4, c5) \
++ _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0))
++
++static inline void volk_32f_x2_pow_32f_a_sse4_1(float* cVector,
++ const float* bVector,
++ const float* aVector,
++ unsigned int num_points)
+ {
+- float* cPtr = cVector;
+- const float* bPtr = bVector;
+- const float* aPtr = aVector;
+-
+- unsigned int number = 0;
+- const unsigned int quarterPoints = num_points / 4;
+-
+- __m128 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne;
+- __m128 tmp, fx, mask, pow2n, z, y;
+- __m128 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2;
+- __m128 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
+- __m128i bias, exp, emm0, pi32_0x7f;
+-
+- one = _mm_set1_ps(1.0);
+- exp_hi = _mm_set1_ps(88.3762626647949);
+- exp_lo = _mm_set1_ps(-88.3762626647949);
+- ln2 = _mm_set1_ps(0.6931471805);
+- log2EF = _mm_set1_ps(1.44269504088896341);
+- half = _mm_set1_ps(0.5);
+- exp_C1 = _mm_set1_ps(0.693359375);
+- exp_C2 = _mm_set1_ps(-2.12194440e-4);
+- pi32_0x7f = _mm_set1_epi32(0x7f);
+-
+- exp_p0 = _mm_set1_ps(1.9875691500e-4);
+- exp_p1 = _mm_set1_ps(1.3981999507e-3);
+- exp_p2 = _mm_set1_ps(8.3334519073e-3);
+- exp_p3 = _mm_set1_ps(4.1665795894e-2);
+- exp_p4 = _mm_set1_ps(1.6666665459e-1);
+- exp_p5 = _mm_set1_ps(5.0000001201e-1);
+-
+- for(;number < quarterPoints; number++){
+- // First compute the logarithm
+- aVal = _mm_load_ps(aPtr);
+- bias = _mm_set1_epi32(127);
+- leadingOne = _mm_set1_ps(1.0f);
+- exp = _mm_sub_epi32(_mm_srli_epi32(_mm_and_si128(_mm_castps_si128(aVal), _mm_set1_epi32(0x7f800000)), 23), bias);
+- logarithm = _mm_cvtepi32_ps(exp);
+-
+- frac = _mm_or_ps(leadingOne, _mm_and_ps(aVal, _mm_castsi128_ps(_mm_set1_epi32(0x7fffff))));
++ float* cPtr = cVector;
++ const float* bPtr = bVector;
++ const float* aPtr = aVector;
++
++ unsigned int number = 0;
++ const unsigned int quarterPoints = num_points / 4;
++
++ __m128 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne;
++ __m128 tmp, fx, mask, pow2n, z, y;
++ __m128 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2;
++ __m128 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
++ __m128i bias, exp, emm0, pi32_0x7f;
++
++ one = _mm_set1_ps(1.0);
++ exp_hi = _mm_set1_ps(88.3762626647949);
++ exp_lo = _mm_set1_ps(-88.3762626647949);
++ ln2 = _mm_set1_ps(0.6931471805);
++ log2EF = _mm_set1_ps(1.44269504088896341);
++ half = _mm_set1_ps(0.5);
++ exp_C1 = _mm_set1_ps(0.693359375);
++ exp_C2 = _mm_set1_ps(-2.12194440e-4);
++ pi32_0x7f = _mm_set1_epi32(0x7f);
++
++ exp_p0 = _mm_set1_ps(1.9875691500e-4);
++ exp_p1 = _mm_set1_ps(1.3981999507e-3);
++ exp_p2 = _mm_set1_ps(8.3334519073e-3);
++ exp_p3 = _mm_set1_ps(4.1665795894e-2);
++ exp_p4 = _mm_set1_ps(1.6666665459e-1);
++ exp_p5 = _mm_set1_ps(5.0000001201e-1);
++
++ for (; number < quarterPoints; number++) {
++ // First compute the logarithm
++ aVal = _mm_load_ps(aPtr);
++ bias = _mm_set1_epi32(127);
++ leadingOne = _mm_set1_ps(1.0f);
++ exp = _mm_sub_epi32(
++ _mm_srli_epi32(
++ _mm_and_si128(_mm_castps_si128(aVal), _mm_set1_epi32(0x7f800000)), 23),
++ bias);
++ logarithm = _mm_cvtepi32_ps(exp);
++
++ frac = _mm_or_ps(leadingOne,
++ _mm_and_ps(aVal, _mm_castsi128_ps(_mm_set1_epi32(0x7fffff))));
+
+ #if POW_POLY_DEGREE == 6
+- mantissa = POLY5( frac, 3.1157899f, -3.3241990f, 2.5988452f, -1.2315303f, 3.1821337e-1f, -3.4436006e-2f);
++ mantissa = POLY5(frac,
++ 3.1157899f,
++ -3.3241990f,
++ 2.5988452f,
++ -1.2315303f,
++ 3.1821337e-1f,
++ -3.4436006e-2f);
+ #elif POW_POLY_DEGREE == 5
+- mantissa = POLY4( frac, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f);
++ mantissa = POLY4(frac,
++ 2.8882704548164776201f,
++ -2.52074962577807006663f,
++ 1.48116647521213171641f,
++ -0.465725644288844778798f,
++ 0.0596515482674574969533f);
+ #elif POW_POLY_DEGREE == 4
+- mantissa = POLY3( frac, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f);
++ mantissa = POLY3(frac,
++ 2.61761038894603480148f,
++ -1.75647175389045657003f,
++ 0.688243882994381274313f,
++ -0.107254423828329604454f);
+ #elif POW_POLY_DEGREE == 3
+- mantissa = POLY2( frac, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f);
++ mantissa = POLY2(frac,
++ 2.28330284476918490682f,
++ -1.04913055217340124191f,
++ 0.204446009836232697516f);
+ #else
+ #error
+ #endif
+
+- logarithm = _mm_add_ps(logarithm, _mm_mul_ps(mantissa, _mm_sub_ps(frac, leadingOne)));
+- logarithm = _mm_mul_ps(logarithm, ln2);
++ logarithm =
++ _mm_add_ps(logarithm, _mm_mul_ps(mantissa, _mm_sub_ps(frac, leadingOne)));
++ logarithm = _mm_mul_ps(logarithm, ln2);
+
+
+- // Now calculate b*lna
+- bVal = _mm_load_ps(bPtr);
+- bVal = _mm_mul_ps(bVal, logarithm);
++ // Now calculate b*lna
++ bVal = _mm_load_ps(bPtr);
++ bVal = _mm_mul_ps(bVal, logarithm);
+
+- // Now compute exp(b*lna)
+- bVal = _mm_max_ps(_mm_min_ps(bVal, exp_hi), exp_lo);
++ // Now compute exp(b*lna)
++ bVal = _mm_max_ps(_mm_min_ps(bVal, exp_hi), exp_lo);
+
+- fx = _mm_add_ps(_mm_mul_ps(bVal, log2EF), half);
++ fx = _mm_add_ps(_mm_mul_ps(bVal, log2EF), half);
+
+- emm0 = _mm_cvttps_epi32(fx);
+- tmp = _mm_cvtepi32_ps(emm0);
++ emm0 = _mm_cvttps_epi32(fx);
++ tmp = _mm_cvtepi32_ps(emm0);
+
+- mask = _mm_and_ps(_mm_cmpgt_ps(tmp, fx), one);
+- fx = _mm_sub_ps(tmp, mask);
++ mask = _mm_and_ps(_mm_cmpgt_ps(tmp, fx), one);
++ fx = _mm_sub_ps(tmp, mask);
+
+- tmp = _mm_mul_ps(fx, exp_C1);
+- z = _mm_mul_ps(fx, exp_C2);
+- bVal = _mm_sub_ps(_mm_sub_ps(bVal, tmp), z);
+- z = _mm_mul_ps(bVal, bVal);
++ tmp = _mm_mul_ps(fx, exp_C1);
++ z = _mm_mul_ps(fx, exp_C2);
++ bVal = _mm_sub_ps(_mm_sub_ps(bVal, tmp), z);
++ z = _mm_mul_ps(bVal, bVal);
+
+- y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(exp_p0, bVal), exp_p1), bVal);
+- y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p2), bVal), exp_p3);
+- y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(y, bVal), exp_p4), bVal);
+- y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p5), z), bVal);
+- y = _mm_add_ps(y, one);
++ y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(exp_p0, bVal), exp_p1), bVal);
++ y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p2), bVal), exp_p3);
++ y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(y, bVal), exp_p4), bVal);
++ y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p5), z), bVal);
++ y = _mm_add_ps(y, one);
+
+- emm0 = _mm_slli_epi32(_mm_add_epi32(_mm_cvttps_epi32(fx), pi32_0x7f), 23);
++ emm0 = _mm_slli_epi32(_mm_add_epi32(_mm_cvttps_epi32(fx), pi32_0x7f), 23);
+
+ pow2n = _mm_castsi128_ps(emm0);
+ cVal = _mm_mul_ps(y, pow2n);
+@@ -417,12 +507,12 @@ volk_32f_x2_pow_32f_a_sse4_1(float* cVector, const float* bVector,
+ aPtr += 4;
+ bPtr += 4;
+ cPtr += 4;
+- }
++ }
+
+- number = quarterPoints * 4;
+- for(;number < num_points; number++){
+- *cPtr++ = powf(*aPtr++, *bPtr++);
+- }
++ number = quarterPoints * 4;
++ for (; number < num_points; number++) {
++ *cPtr++ = powf(*aPtr++, *bPtr++);
++ }
+ }
+
+ #endif /* LV_HAVE_SSE4_1 for aligned */
+@@ -432,27 +522,28 @@ volk_32f_x2_pow_32f_a_sse4_1(float* cVector, const float* bVector,
+ #ifndef INCLUDED_volk_32f_x2_pow_32f_u_H
+ #define INCLUDED_volk_32f_x2_pow_32f_u_H
+
+-#include <stdio.h>
+-#include <stdlib.h>
+ #include <inttypes.h>
+ #include <math.h>
++#include <stdio.h>
++#include <stdlib.h>
+
+ #define POW_POLY_DEGREE 3
+
+ #ifdef LV_HAVE_GENERIC
+
+-static inline void
+-volk_32f_x2_pow_32f_generic(float* cVector, const float* bVector,
+- const float* aVector, unsigned int num_points)
++static inline void volk_32f_x2_pow_32f_generic(float* cVector,
++ const float* bVector,
++ const float* aVector,
++ unsigned int num_points)
+ {
+- float* cPtr = cVector;
+- const float* bPtr = bVector;
+- const float* aPtr = aVector;
+- unsigned int number = 0;
+-
+- for(number = 0; number < num_points; number++){
+- *cPtr++ = powf(*aPtr++, *bPtr++);
+- }
++ float* cPtr = cVector;
++ const float* bPtr = bVector;
++ const float* aPtr = aVector;
++ unsigned int number = 0;
++
++ for (number = 0; number < num_points; number++) {
++ *cPtr++ = powf(*aPtr++, *bPtr++);
++ }
+ }
+ #endif /* LV_HAVE_GENERIC */
+
+@@ -463,112 +554,139 @@ volk_32f_x2_pow_32f_generic(float* cVector, const float* bVector,
+ #define POLY0(x, c0) _mm_set1_ps(c0)
+ #define POLY1(x, c0, c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), _mm_set1_ps(c0))
+ #define POLY2(x, c0, c1, c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), _mm_set1_ps(c0))
+-#define POLY3(x, c0, c1, c2, c3) _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0))
+-#define POLY4(x, c0, c1, c2, c3, c4) _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0))
+-#define POLY5(x, c0, c1, c2, c3, c4, c5) _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0))
+-
+-static inline void
+-volk_32f_x2_pow_32f_u_sse4_1(float* cVector, const float* bVector,
+- const float* aVector, unsigned int num_points)
++#define POLY3(x, c0, c1, c2, c3) \
++ _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0))
++#define POLY4(x, c0, c1, c2, c3, c4) \
++ _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0))
++#define POLY5(x, c0, c1, c2, c3, c4, c5) \
++ _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0))
++
++static inline void volk_32f_x2_pow_32f_u_sse4_1(float* cVector,
++ const float* bVector,
++ const float* aVector,
++ unsigned int num_points)
+ {
+- float* cPtr = cVector;
+- const float* bPtr = bVector;
+- const float* aPtr = aVector;
+-
+- unsigned int number = 0;
+- const unsigned int quarterPoints = num_points / 4;
+-
+- __m128 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne;
+- __m128 tmp, fx, mask, pow2n, z, y;
+- __m128 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2;
+- __m128 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
+- __m128i bias, exp, emm0, pi32_0x7f;
+-
+- one = _mm_set1_ps(1.0);
+- exp_hi = _mm_set1_ps(88.3762626647949);
+- exp_lo = _mm_set1_ps(-88.3762626647949);
+- ln2 = _mm_set1_ps(0.6931471805);
+- log2EF = _mm_set1_ps(1.44269504088896341);
+- half = _mm_set1_ps(0.5);
+- exp_C1 = _mm_set1_ps(0.693359375);
+- exp_C2 = _mm_set1_ps(-2.12194440e-4);
+- pi32_0x7f = _mm_set1_epi32(0x7f);
+-
+- exp_p0 = _mm_set1_ps(1.9875691500e-4);
+- exp_p1 = _mm_set1_ps(1.3981999507e-3);
+- exp_p2 = _mm_set1_ps(8.3334519073e-3);
+- exp_p3 = _mm_set1_ps(4.1665795894e-2);
+- exp_p4 = _mm_set1_ps(1.6666665459e-1);
+- exp_p5 = _mm_set1_ps(5.0000001201e-1);
+-
+- for(;number < quarterPoints; number++){
+- // First compute the logarithm
+- aVal = _mm_loadu_ps(aPtr);
+- bias = _mm_set1_epi32(127);
+- leadingOne = _mm_set1_ps(1.0f);
+- exp = _mm_sub_epi32(_mm_srli_epi32(_mm_and_si128(_mm_castps_si128(aVal), _mm_set1_epi32(0x7f800000)), 23), bias);
+- logarithm = _mm_cvtepi32_ps(exp);
+-
+- frac = _mm_or_ps(leadingOne, _mm_and_ps(aVal, _mm_castsi128_ps(_mm_set1_epi32(0x7fffff))));
++ float* cPtr = cVector;
++ const float* bPtr = bVector;
++ const float* aPtr = aVector;
++
++ unsigned int number = 0;
++ const unsigned int quarterPoints = num_points / 4;
++
++ __m128 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne;
++ __m128 tmp, fx, mask, pow2n, z, y;
++ __m128 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2;
++ __m128 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
++ __m128i bias, exp, emm0, pi32_0x7f;
++
++ one = _mm_set1_ps(1.0);
++ exp_hi = _mm_set1_ps(88.3762626647949);
++ exp_lo = _mm_set1_ps(-88.3762626647949);
++ ln2 = _mm_set1_ps(0.6931471805);
++ log2EF = _mm_set1_ps(1.44269504088896341);
++ half = _mm_set1_ps(0.5);
++ exp_C1 = _mm_set1_ps(0.693359375);
++ exp_C2 = _mm_set1_ps(-2.12194440e-4);
++ pi32_0x7f = _mm_set1_epi32(0x7f);
++
++ exp_p0 = _mm_set1_ps(1.9875691500e-4);
++ exp_p1 = _mm_set1_ps(1.3981999507e-3);
++ exp_p2 = _mm_set1_ps(8.3334519073e-3);
++ exp_p3 = _mm_set1_ps(4.1665795894e-2);
++ exp_p4 = _mm_set1_ps(1.6666665459e-1);
++ exp_p5 = _mm_set1_ps(5.0000001201e-1);
++
++ for (; number < quarterPoints; number++) {
++ // First compute the logarithm
++ aVal = _mm_loadu_ps(aPtr);
++ bias = _mm_set1_epi32(127);
++ leadingOne = _mm_set1_ps(1.0f);
++ exp = _mm_sub_epi32(
++ _mm_srli_epi32(
++ _mm_and_si128(_mm_castps_si128(aVal), _mm_set1_epi32(0x7f800000)), 23),
++ bias);
++ logarithm = _mm_cvtepi32_ps(exp);
++
++ frac = _mm_or_ps(leadingOne,
++ _mm_and_ps(aVal, _mm_castsi128_ps(_mm_set1_epi32(0x7fffff))));
+
+ #if POW_POLY_DEGREE == 6
+- mantissa = POLY5( frac, 3.1157899f, -3.3241990f, 2.5988452f, -1.2315303f, 3.1821337e-1f, -3.4436006e-2f);
++ mantissa = POLY5(frac,
++ 3.1157899f,
++ -3.3241990f,
++ 2.5988452f,
++ -1.2315303f,
++ 3.1821337e-1f,
++ -3.4436006e-2f);
+ #elif POW_POLY_DEGREE == 5
+- mantissa = POLY4( frac, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f);
++ mantissa = POLY4(frac,
++ 2.8882704548164776201f,
++ -2.52074962577807006663f,
++ 1.48116647521213171641f,
++ -0.465725644288844778798f,
++ 0.0596515482674574969533f);
+ #elif POW_POLY_DEGREE == 4
+- mantissa = POLY3( frac, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f);
++ mantissa = POLY3(frac,
++ 2.61761038894603480148f,
++ -1.75647175389045657003f,
++ 0.688243882994381274313f,
++ -0.107254423828329604454f);
+ #elif POW_POLY_DEGREE == 3
+- mantissa = POLY2( frac, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f);
++ mantissa = POLY2(frac,
++ 2.28330284476918490682f,
++ -1.04913055217340124191f,
++ 0.204446009836232697516f);
+ #else
+ #error
+ #endif
+
+- logarithm = _mm_add_ps(logarithm, _mm_mul_ps(mantissa, _mm_sub_ps(frac, leadingOne)));
+- logarithm = _mm_mul_ps(logarithm, ln2);
++ logarithm =
++ _mm_add_ps(logarithm, _mm_mul_ps(mantissa, _mm_sub_ps(frac, leadingOne)));
++ logarithm = _mm_mul_ps(logarithm, ln2);
+
+
+- // Now calculate b*lna
+- bVal = _mm_loadu_ps(bPtr);
+- bVal = _mm_mul_ps(bVal, logarithm);
++ // Now calculate b*lna
++ bVal = _mm_loadu_ps(bPtr);
++ bVal = _mm_mul_ps(bVal, logarithm);
+
+- // Now compute exp(b*lna)
+- bVal = _mm_max_ps(_mm_min_ps(bVal, exp_hi), exp_lo);
++ // Now compute exp(b*lna)
++ bVal = _mm_max_ps(_mm_min_ps(bVal, exp_hi), exp_lo);
+
+- fx = _mm_add_ps(_mm_mul_ps(bVal, log2EF), half);
++ fx = _mm_add_ps(_mm_mul_ps(bVal, log2EF), half);
+
+- emm0 = _mm_cvttps_epi32(fx);
+- tmp = _mm_cvtepi32_ps(emm0);
++ emm0 = _mm_cvttps_epi32(fx);
++ tmp = _mm_cvtepi32_ps(emm0);
+
+- mask = _mm_and_ps(_mm_cmpgt_ps(tmp, fx), one);
+- fx = _mm_sub_ps(tmp, mask);
++ mask = _mm_and_ps(_mm_cmpgt_ps(tmp, fx), one);
++ fx = _mm_sub_ps(tmp, mask);
+
+- tmp = _mm_mul_ps(fx, exp_C1);
+- z = _mm_mul_ps(fx, exp_C2);
+- bVal = _mm_sub_ps(_mm_sub_ps(bVal, tmp), z);
+- z = _mm_mul_ps(bVal, bVal);
++ tmp = _mm_mul_ps(fx, exp_C1);
++ z = _mm_mul_ps(fx, exp_C2);
++ bVal = _mm_sub_ps(_mm_sub_ps(bVal, tmp), z);
++ z = _mm_mul_ps(bVal, bVal);
+
+- y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(exp_p0, bVal), exp_p1), bVal);
+- y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p2), bVal), exp_p3);
+- y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(y, bVal), exp_p4), bVal);
+- y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p5), z), bVal);
+- y = _mm_add_ps(y, one);
++ y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(exp_p0, bVal), exp_p1), bVal);
++ y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p2), bVal), exp_p3);
++ y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(y, bVal), exp_p4), bVal);
++ y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p5), z), bVal);
++ y = _mm_add_ps(y, one);
+
+- emm0 = _mm_slli_epi32(_mm_add_epi32(_mm_cvttps_epi32(fx), pi32_0x7f), 23);
++ emm0 = _mm_slli_epi32(_mm_add_epi32(_mm_cvttps_epi32(fx), pi32_0x7f), 23);
+
+- pow2n = _mm_castsi128_ps(emm0);
+- cVal = _mm_mul_ps(y, pow2n);
++ pow2n = _mm_castsi128_ps(emm0);
++ cVal = _mm_mul_ps(y, pow2n);
+
+- _mm_storeu_ps(cPtr, cVal);
++ _mm_storeu_ps(cPtr, cVal);
+
+- aPtr += 4;
+- bPtr += 4;
+- cPtr += 4;
+- }
++ aPtr += 4;
++ bPtr += 4;
++ cPtr += 4;
++ }
+
+- number = quarterPoints * 4;
+- for(;number < num_points; number++){
+- *cPtr++ = powf(*aPtr++, *bPtr++);
+- }
++ number = quarterPoints * 4;
++ for (; number < num_points; number++) {
++ *cPtr++ = powf(*aPtr++, *bPtr++);
++ }
+ }
+
+ #endif /* LV_HAVE_SSE4_1 for unaligned */
+@@ -577,100 +695,131 @@ volk_32f_x2_pow_32f_u_sse4_1(float* cVector, const float* bVector,
+ #include <immintrin.h>
+
+ #define POLY0_AVX2_FMA(x, c0) _mm256_set1_ps(c0)
+-#define POLY1_AVX2_FMA(x, c0, c1) _mm256_fmadd_ps(POLY0_AVX2_FMA(x, c1), x, _mm256_set1_ps(c0))
+-#define POLY2_AVX2_FMA(x, c0, c1, c2) _mm256_fmadd_ps(POLY1_AVX2_FMA(x, c1, c2), x, _mm256_set1_ps(c0))
+-#define POLY3_AVX2_FMA(x, c0, c1, c2, c3) _mm256_fmadd_ps(POLY2_AVX2_FMA(x, c1, c2, c3), x, _mm256_set1_ps(c0))
+-#define POLY4_AVX2_FMA(x, c0, c1, c2, c3, c4) _mm256_fmadd_ps(POLY3_AVX2_FMA(x, c1, c2, c3, c4), x, _mm256_set1_ps(c0))
+-#define POLY5_AVX2_FMA(x, c0, c1, c2, c3, c4, c5) _mm256_fmadd_ps(POLY4_AVX2_FMA(x, c1, c2, c3, c4, c5), x, _mm256_set1_ps(c0))
+-
+-static inline void
+-volk_32f_x2_pow_32f_u_avx2_fma(float* cVector, const float* bVector,
+- const float* aVector, unsigned int num_points)
++#define POLY1_AVX2_FMA(x, c0, c1) \
++ _mm256_fmadd_ps(POLY0_AVX2_FMA(x, c1), x, _mm256_set1_ps(c0))
++#define POLY2_AVX2_FMA(x, c0, c1, c2) \
++ _mm256_fmadd_ps(POLY1_AVX2_FMA(x, c1, c2), x, _mm256_set1_ps(c0))
++#define POLY3_AVX2_FMA(x, c0, c1, c2, c3) \
++ _mm256_fmadd_ps(POLY2_AVX2_FMA(x, c1, c2, c3), x, _mm256_set1_ps(c0))
++#define POLY4_AVX2_FMA(x, c0, c1, c2, c3, c4) \
++ _mm256_fmadd_ps(POLY3_AVX2_FMA(x, c1, c2, c3, c4), x, _mm256_set1_ps(c0))
++#define POLY5_AVX2_FMA(x, c0, c1, c2, c3, c4, c5) \
++ _mm256_fmadd_ps(POLY4_AVX2_FMA(x, c1, c2, c3, c4, c5), x, _mm256_set1_ps(c0))
++
++static inline void volk_32f_x2_pow_32f_u_avx2_fma(float* cVector,
++ const float* bVector,
++ const float* aVector,
++ unsigned int num_points)
+ {
+- float* cPtr = cVector;
+- const float* bPtr = bVector;
+- const float* aPtr = aVector;
+-
+- unsigned int number = 0;
+- const unsigned int eighthPoints = num_points / 8;
+-
+- __m256 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne;
+- __m256 tmp, fx, mask, pow2n, z, y;
+- __m256 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2;
+- __m256 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
+- __m256i bias, exp, emm0, pi32_0x7f;
+-
+- one = _mm256_set1_ps(1.0);
+- exp_hi = _mm256_set1_ps(88.3762626647949);
+- exp_lo = _mm256_set1_ps(-88.3762626647949);
+- ln2 = _mm256_set1_ps(0.6931471805);
+- log2EF = _mm256_set1_ps(1.44269504088896341);
+- half = _mm256_set1_ps(0.5);
+- exp_C1 = _mm256_set1_ps(0.693359375);
+- exp_C2 = _mm256_set1_ps(-2.12194440e-4);
+- pi32_0x7f = _mm256_set1_epi32(0x7f);
+-
+- exp_p0 = _mm256_set1_ps(1.9875691500e-4);
+- exp_p1 = _mm256_set1_ps(1.3981999507e-3);
+- exp_p2 = _mm256_set1_ps(8.3334519073e-3);
+- exp_p3 = _mm256_set1_ps(4.1665795894e-2);
+- exp_p4 = _mm256_set1_ps(1.6666665459e-1);
+- exp_p5 = _mm256_set1_ps(5.0000001201e-1);
+-
+- for(;number < eighthPoints; number++){
+- // First compute the logarithm
+- aVal = _mm256_loadu_ps(aPtr);
+- bias = _mm256_set1_epi32(127);
+- leadingOne = _mm256_set1_ps(1.0f);
+- exp = _mm256_sub_epi32(_mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal), _mm256_set1_epi32(0x7f800000)), 23), bias);
+- logarithm = _mm256_cvtepi32_ps(exp);
+-
+- frac = _mm256_or_ps(leadingOne, _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));
++ float* cPtr = cVector;
++ const float* bPtr = bVector;
++ const float* aPtr = aVector;
++
++ unsigned int number = 0;
++ const unsigned int eighthPoints = num_points / 8;
++
++ __m256 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne;
++ __m256 tmp, fx, mask, pow2n, z, y;
++ __m256 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2;
++ __m256 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
++ __m256i bias, exp, emm0, pi32_0x7f;
++
++ one = _mm256_set1_ps(1.0);
++ exp_hi = _mm256_set1_ps(88.3762626647949);
++ exp_lo = _mm256_set1_ps(-88.3762626647949);
++ ln2 = _mm256_set1_ps(0.6931471805);
++ log2EF = _mm256_set1_ps(1.44269504088896341);
++ half = _mm256_set1_ps(0.5);
++ exp_C1 = _mm256_set1_ps(0.693359375);
++ exp_C2 = _mm256_set1_ps(-2.12194440e-4);
++ pi32_0x7f = _mm256_set1_epi32(0x7f);
++
++ exp_p0 = _mm256_set1_ps(1.9875691500e-4);
++ exp_p1 = _mm256_set1_ps(1.3981999507e-3);
++ exp_p2 = _mm256_set1_ps(8.3334519073e-3);
++ exp_p3 = _mm256_set1_ps(4.1665795894e-2);
++ exp_p4 = _mm256_set1_ps(1.6666665459e-1);
++ exp_p5 = _mm256_set1_ps(5.0000001201e-1);
++
++ for (; number < eighthPoints; number++) {
++ // First compute the logarithm
++ aVal = _mm256_loadu_ps(aPtr);
++ bias = _mm256_set1_epi32(127);
++ leadingOne = _mm256_set1_ps(1.0f);
++ exp = _mm256_sub_epi32(
++ _mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal),
++ _mm256_set1_epi32(0x7f800000)),
++ 23),
++ bias);
++ logarithm = _mm256_cvtepi32_ps(exp);
++
++ frac = _mm256_or_ps(
++ leadingOne,
++ _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));
+
+ #if POW_POLY_DEGREE == 6
+- mantissa = POLY5_AVX2_FMA( frac, 3.1157899f, -3.3241990f, 2.5988452f, -1.2315303f, 3.1821337e-1f, -3.4436006e-2f);
++ mantissa = POLY5_AVX2_FMA(frac,
++ 3.1157899f,
++ -3.3241990f,
++ 2.5988452f,
++ -1.2315303f,
++ 3.1821337e-1f,
++ -3.4436006e-2f);
+ #elif POW_POLY_DEGREE == 5
+- mantissa = POLY4_AVX2_FMA( frac, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f);
++ mantissa = POLY4_AVX2_FMA(frac,
++ 2.8882704548164776201f,
++ -2.52074962577807006663f,
++ 1.48116647521213171641f,
++ -0.465725644288844778798f,
++ 0.0596515482674574969533f);
+ #elif POW_POLY_DEGREE == 4
+- mantissa = POLY3_AVX2_FMA( frac, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f);
++ mantissa = POLY3_AVX2_FMA(frac,
++ 2.61761038894603480148f,
++ -1.75647175389045657003f,
++ 0.688243882994381274313f,
++ -0.107254423828329604454f);
+ #elif POW_POLY_DEGREE == 3
+- mantissa = POLY2_AVX2_FMA( frac, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f);
++ mantissa = POLY2_AVX2_FMA(frac,
++ 2.28330284476918490682f,
++ -1.04913055217340124191f,
++ 0.204446009836232697516f);
+ #else
+ #error
+ #endif
+
+- logarithm = _mm256_fmadd_ps(mantissa, _mm256_sub_ps(frac, leadingOne), logarithm);
+- logarithm = _mm256_mul_ps(logarithm, ln2);
++ logarithm = _mm256_fmadd_ps(mantissa, _mm256_sub_ps(frac, leadingOne), logarithm);
++ logarithm = _mm256_mul_ps(logarithm, ln2);
+
+
+- // Now calculate b*lna
+- bVal = _mm256_loadu_ps(bPtr);
+- bVal = _mm256_mul_ps(bVal, logarithm);
++ // Now calculate b*lna
++ bVal = _mm256_loadu_ps(bPtr);
++ bVal = _mm256_mul_ps(bVal, logarithm);
+
+- // Now compute exp(b*lna)
+- bVal = _mm256_max_ps(_mm256_min_ps(bVal, exp_hi), exp_lo);
++ // Now compute exp(b*lna)
++ bVal = _mm256_max_ps(_mm256_min_ps(bVal, exp_hi), exp_lo);
+
+- fx = _mm256_fmadd_ps(bVal, log2EF, half);
++ fx = _mm256_fmadd_ps(bVal, log2EF, half);
+
+- emm0 = _mm256_cvttps_epi32(fx);
+- tmp = _mm256_cvtepi32_ps(emm0);
++ emm0 = _mm256_cvttps_epi32(fx);
++ tmp = _mm256_cvtepi32_ps(emm0);
+
+- mask = _mm256_and_ps(_mm256_cmp_ps(tmp, fx, _CMP_GT_OS), one);
+- fx = _mm256_sub_ps(tmp, mask);
++ mask = _mm256_and_ps(_mm256_cmp_ps(tmp, fx, _CMP_GT_OS), one);
++ fx = _mm256_sub_ps(tmp, mask);
+
+- tmp = _mm256_fnmadd_ps(fx, exp_C1, bVal);
+- bVal = _mm256_fnmadd_ps(fx, exp_C2, tmp);
+- z = _mm256_mul_ps(bVal, bVal);
++ tmp = _mm256_fnmadd_ps(fx, exp_C1, bVal);
++ bVal = _mm256_fnmadd_ps(fx, exp_C2, tmp);
++ z = _mm256_mul_ps(bVal, bVal);
+
+- y = _mm256_fmadd_ps(exp_p0, bVal, exp_p1);
+- y = _mm256_fmadd_ps(y, bVal, exp_p2);
+- y = _mm256_fmadd_ps(y, bVal, exp_p3);
+- y = _mm256_fmadd_ps(y, bVal, exp_p4);
+- y = _mm256_fmadd_ps(y, bVal, exp_p5);
+- y = _mm256_fmadd_ps(y, z, bVal);
+- y = _mm256_add_ps(y, one);
++ y = _mm256_fmadd_ps(exp_p0, bVal, exp_p1);
++ y = _mm256_fmadd_ps(y, bVal, exp_p2);
++ y = _mm256_fmadd_ps(y, bVal, exp_p3);
++ y = _mm256_fmadd_ps(y, bVal, exp_p4);
++ y = _mm256_fmadd_ps(y, bVal, exp_p5);
++ y = _mm256_fmadd_ps(y, z, bVal);
++ y = _mm256_add_ps(y, one);
+
+- emm0 = _mm256_slli_epi32(_mm256_add_epi32(_mm256_cvttps_epi32(fx), pi32_0x7f), 23);
++ emm0 =
++ _mm256_slli_epi32(_mm256_add_epi32(_mm256_cvttps_epi32(fx), pi32_0x7f), 23);
+
+ pow2n = _mm256_castsi256_ps(emm0);
+ cVal = _mm256_mul_ps(y, pow2n);
+@@ -680,12 +829,12 @@ volk_32f_x2_pow_32f_u_avx2_fma(float* cVector, const float* bVector,
+ aPtr += 8;
+ bPtr += 8;
+ cPtr += 8;
+- }
++ }
+
+- number = eighthPoints * 8;
+- for(;number < num_points; number++){
+- *cPtr++ = pow(*aPtr++, *bPtr++);
+- }
++ number = eighthPoints * 8;
++ for (; number < num_points; number++) {
++ *cPtr++ = pow(*aPtr++, *bPtr++);
++ }
+ }
+
+ #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for unaligned */
+@@ -694,99 +843,131 @@ volk_32f_x2_pow_32f_u_avx2_fma(float* cVector, const float* bVector,
+ #include <immintrin.h>
+
+ #define POLY0_AVX2(x, c0) _mm256_set1_ps(c0)
+-#define POLY1_AVX2(x, c0, c1) _mm256_add_ps(_mm256_mul_ps(POLY0_AVX2(x, c1), x), _mm256_set1_ps(c0))
+-#define POLY2_AVX2(x, c0, c1, c2) _mm256_add_ps(_mm256_mul_ps(POLY1_AVX2(x, c1, c2), x), _mm256_set1_ps(c0))
+-#define POLY3_AVX2(x, c0, c1, c2, c3) _mm256_add_ps(_mm256_mul_ps(POLY2_AVX2(x, c1, c2, c3), x), _mm256_set1_ps(c0))
+-#define POLY4_AVX2(x, c0, c1, c2, c3, c4) _mm256_add_ps(_mm256_mul_ps(POLY3_AVX2(x, c1, c2, c3, c4), x), _mm256_set1_ps(c0))
+-#define POLY5_AVX2(x, c0, c1, c2, c3, c4, c5) _mm256_add_ps(_mm256_mul_ps(POLY4_AVX2(x, c1, c2, c3, c4, c5), x), _mm256_set1_ps(c0))
+-
+-static inline void
+-volk_32f_x2_pow_32f_u_avx2(float* cVector, const float* bVector,
+- const float* aVector, unsigned int num_points)
++#define POLY1_AVX2(x, c0, c1) \
++ _mm256_add_ps(_mm256_mul_ps(POLY0_AVX2(x, c1), x), _mm256_set1_ps(c0))
++#define POLY2_AVX2(x, c0, c1, c2) \
++ _mm256_add_ps(_mm256_mul_ps(POLY1_AVX2(x, c1, c2), x), _mm256_set1_ps(c0))
++#define POLY3_AVX2(x, c0, c1, c2, c3) \
++ _mm256_add_ps(_mm256_mul_ps(POLY2_AVX2(x, c1, c2, c3), x), _mm256_set1_ps(c0))
++#define POLY4_AVX2(x, c0, c1, c2, c3, c4) \
++ _mm256_add_ps(_mm256_mul_ps(POLY3_AVX2(x, c1, c2, c3, c4), x), _mm256_set1_ps(c0))
++#define POLY5_AVX2(x, c0, c1, c2, c3, c4, c5) \
++ _mm256_add_ps(_mm256_mul_ps(POLY4_AVX2(x, c1, c2, c3, c4, c5), x), _mm256_set1_ps(c0))
++
++static inline void volk_32f_x2_pow_32f_u_avx2(float* cVector,
++ const float* bVector,
++ const float* aVector,
++ unsigned int num_points)
+ {
+- float* cPtr = cVector;
+- const float* bPtr = bVector;
+- const float* aPtr = aVector;
+-
+- unsigned int number = 0;
+- const unsigned int eighthPoints = num_points / 8;
+-
+- __m256 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne;
+- __m256 tmp, fx, mask, pow2n, z, y;
+- __m256 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2;
+- __m256 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
+- __m256i bias, exp, emm0, pi32_0x7f;
+-
+- one = _mm256_set1_ps(1.0);
+- exp_hi = _mm256_set1_ps(88.3762626647949);
+- exp_lo = _mm256_set1_ps(-88.3762626647949);
+- ln2 = _mm256_set1_ps(0.6931471805);
+- log2EF = _mm256_set1_ps(1.44269504088896341);
+- half = _mm256_set1_ps(0.5);
+- exp_C1 = _mm256_set1_ps(0.693359375);
+- exp_C2 = _mm256_set1_ps(-2.12194440e-4);
+- pi32_0x7f = _mm256_set1_epi32(0x7f);
+-
+- exp_p0 = _mm256_set1_ps(1.9875691500e-4);
+- exp_p1 = _mm256_set1_ps(1.3981999507e-3);
+- exp_p2 = _mm256_set1_ps(8.3334519073e-3);
+- exp_p3 = _mm256_set1_ps(4.1665795894e-2);
+- exp_p4 = _mm256_set1_ps(1.6666665459e-1);
+- exp_p5 = _mm256_set1_ps(5.0000001201e-1);
+-
+- for(;number < eighthPoints; number++){
+- // First compute the logarithm
+- aVal = _mm256_loadu_ps(aPtr);
+- bias = _mm256_set1_epi32(127);
+- leadingOne = _mm256_set1_ps(1.0f);
+- exp = _mm256_sub_epi32(_mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal), _mm256_set1_epi32(0x7f800000)), 23), bias);
+- logarithm = _mm256_cvtepi32_ps(exp);
+-
+- frac = _mm256_or_ps(leadingOne, _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));
++ float* cPtr = cVector;
++ const float* bPtr = bVector;
++ const float* aPtr = aVector;
++
++ unsigned int number = 0;
++ const unsigned int eighthPoints = num_points / 8;
++
++ __m256 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne;
++ __m256 tmp, fx, mask, pow2n, z, y;
++ __m256 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2;
++ __m256 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
++ __m256i bias, exp, emm0, pi32_0x7f;
++
++ one = _mm256_set1_ps(1.0);
++ exp_hi = _mm256_set1_ps(88.3762626647949);
++ exp_lo = _mm256_set1_ps(-88.3762626647949);
++ ln2 = _mm256_set1_ps(0.6931471805);
++ log2EF = _mm256_set1_ps(1.44269504088896341);
++ half = _mm256_set1_ps(0.5);
++ exp_C1 = _mm256_set1_ps(0.693359375);
++ exp_C2 = _mm256_set1_ps(-2.12194440e-4);
++ pi32_0x7f = _mm256_set1_epi32(0x7f);
++
++ exp_p0 = _mm256_set1_ps(1.9875691500e-4);
++ exp_p1 = _mm256_set1_ps(1.3981999507e-3);
++ exp_p2 = _mm256_set1_ps(8.3334519073e-3);
++ exp_p3 = _mm256_set1_ps(4.1665795894e-2);
++ exp_p4 = _mm256_set1_ps(1.6666665459e-1);
++ exp_p5 = _mm256_set1_ps(5.0000001201e-1);
++
++ for (; number < eighthPoints; number++) {
++ // First compute the logarithm
++ aVal = _mm256_loadu_ps(aPtr);
++ bias = _mm256_set1_epi32(127);
++ leadingOne = _mm256_set1_ps(1.0f);
++ exp = _mm256_sub_epi32(
++ _mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal),
++ _mm256_set1_epi32(0x7f800000)),
++ 23),
++ bias);
++ logarithm = _mm256_cvtepi32_ps(exp);
++
++ frac = _mm256_or_ps(
++ leadingOne,
++ _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));
+
+ #if POW_POLY_DEGREE == 6
+- mantissa = POLY5_AVX2( frac, 3.1157899f, -3.3241990f, 2.5988452f, -1.2315303f, 3.1821337e-1f, -3.4436006e-2f);
++ mantissa = POLY5_AVX2(frac,
++ 3.1157899f,
++ -3.3241990f,
++ 2.5988452f,
++ -1.2315303f,
++ 3.1821337e-1f,
++ -3.4436006e-2f);
+ #elif POW_POLY_DEGREE == 5
+- mantissa = POLY4_AVX2( frac, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f);
++ mantissa = POLY4_AVX2(frac,
++ 2.8882704548164776201f,
++ -2.52074962577807006663f,
++ 1.48116647521213171641f,
++ -0.465725644288844778798f,
++ 0.0596515482674574969533f);
+ #elif POW_POLY_DEGREE == 4
+- mantissa = POLY3_AVX2( frac, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f);
++ mantissa = POLY3_AVX2(frac,
++ 2.61761038894603480148f,
++ -1.75647175389045657003f,
++ 0.688243882994381274313f,
++ -0.107254423828329604454f);
+ #elif POW_POLY_DEGREE == 3
+- mantissa = POLY2_AVX2( frac, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f);
++ mantissa = POLY2_AVX2(frac,
++ 2.28330284476918490682f,
++ -1.04913055217340124191f,
++ 0.204446009836232697516f);
+ #else
+ #error
+ #endif
+
+- logarithm = _mm256_add_ps(_mm256_mul_ps(mantissa, _mm256_sub_ps(frac, leadingOne)), logarithm);
+- logarithm = _mm256_mul_ps(logarithm, ln2);
++ logarithm = _mm256_add_ps(
++ _mm256_mul_ps(mantissa, _mm256_sub_ps(frac, leadingOne)), logarithm);
++ logarithm = _mm256_mul_ps(logarithm, ln2);
+
+- // Now calculate b*lna
+- bVal = _mm256_loadu_ps(bPtr);
+- bVal = _mm256_mul_ps(bVal, logarithm);
++ // Now calculate b*lna
++ bVal = _mm256_loadu_ps(bPtr);
++ bVal = _mm256_mul_ps(bVal, logarithm);
+
+- // Now compute exp(b*lna)
+- bVal = _mm256_max_ps(_mm256_min_ps(bVal, exp_hi), exp_lo);
++ // Now compute exp(b*lna)
++ bVal = _mm256_max_ps(_mm256_min_ps(bVal, exp_hi), exp_lo);
+
+- fx = _mm256_add_ps(_mm256_mul_ps(bVal, log2EF), half);
++ fx = _mm256_add_ps(_mm256_mul_ps(bVal, log2EF), half);
+
+- emm0 = _mm256_cvttps_epi32(fx);
+- tmp = _mm256_cvtepi32_ps(emm0);
++ emm0 = _mm256_cvttps_epi32(fx);
++ tmp = _mm256_cvtepi32_ps(emm0);
+
+- mask = _mm256_and_ps(_mm256_cmp_ps(tmp, fx, _CMP_GT_OS), one);
+- fx = _mm256_sub_ps(tmp, mask);
++ mask = _mm256_and_ps(_mm256_cmp_ps(tmp, fx, _CMP_GT_OS), one);
++ fx = _mm256_sub_ps(tmp, mask);
+
+- tmp = _mm256_sub_ps(bVal, _mm256_mul_ps(fx, exp_C1));
+- bVal = _mm256_sub_ps(tmp, _mm256_mul_ps(fx, exp_C2));
+- z = _mm256_mul_ps(bVal, bVal);
++ tmp = _mm256_sub_ps(bVal, _mm256_mul_ps(fx, exp_C1));
++ bVal = _mm256_sub_ps(tmp, _mm256_mul_ps(fx, exp_C2));
++ z = _mm256_mul_ps(bVal, bVal);
+
+- y = _mm256_add_ps(_mm256_mul_ps(exp_p0, bVal), exp_p1);
+- y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p2);
+- y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p3);
+- y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p4);
+- y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p5);
+- y = _mm256_add_ps(_mm256_mul_ps(y, z), bVal);
+- y = _mm256_add_ps(y, one);
++ y = _mm256_add_ps(_mm256_mul_ps(exp_p0, bVal), exp_p1);
++ y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p2);
++ y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p3);
++ y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p4);
++ y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p5);
++ y = _mm256_add_ps(_mm256_mul_ps(y, z), bVal);
++ y = _mm256_add_ps(y, one);
+
+- emm0 = _mm256_slli_epi32(_mm256_add_epi32(_mm256_cvttps_epi32(fx), pi32_0x7f), 23);
++ emm0 =
++ _mm256_slli_epi32(_mm256_add_epi32(_mm256_cvttps_epi32(fx), pi32_0x7f), 23);
+
+ pow2n = _mm256_castsi256_ps(emm0);
+ cVal = _mm256_mul_ps(y, pow2n);
+@@ -796,12 +977,12 @@ volk_32f_x2_pow_32f_u_avx2(float* cVector, const float* bVector,
+ aPtr += 8;
+ bPtr += 8;
+ cPtr += 8;
+- }
++ }
+
+- number = eighthPoints * 8;
+- for(;number < num_points; number++){
+- *cPtr++ = pow(*aPtr++, *bPtr++);
+- }
++ number = eighthPoints * 8;
++ for (; number < num_points; number++) {
++ *cPtr++ = pow(*aPtr++, *bPtr++);
++ }
+ }
+
+ #endif /* LV_HAVE_AVX2 for unaligned */
+diff --git a/kernels/volk/volk_32f_x2_s32f_interleave_16ic.h b/kernels/volk/volk_32f_x2_s32f_interleave_16ic.h
+index 8021faf..04e5892 100644
+--- a/kernels/volk/volk_32f_x2_s32f_interleave_16ic.h
++++ b/kernels/volk/volk_32f_x2_s32f_interleave_16ic.h
+@@ -32,8 +32,8 @@
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+- * void volk_32f_x2_s32f_interleave_16ic(lv_16sc_t* complexVector, const float* iBuffer, const float* qBuffer, const float scalar, unsigned int num_points)
+- * \endcode
++ * void volk_32f_x2_s32f_interleave_16ic(lv_16sc_t* complexVector, const float* iBuffer,
++ * const float* qBuffer, const float scalar, unsigned int num_points) \endcode
+ *
+ * \b Inputs
+ * \li iBuffer: Input vector of samples for the real part.
+@@ -75,60 +75,62 @@
+ #ifndef INCLUDED_volk_32f_x2_s32f_interleave_16ic_a_H
+ #define INCLUDED_volk_32f_x2_s32f_interleave_16ic_a_H
+
+-#include <volk/volk_common.h>
+ #include <inttypes.h>
+ #include <stdio.h>
++#include <volk/volk_common.h>
+
+ #ifdef LV_HAVE_AVX2
+ #include <immintrin.h>
+
+-static inline void
+-volk_32f_x2_s32f_interleave_16ic_a_avx2(lv_16sc_t* complexVector, const float* iBuffer,
+- const float* qBuffer, const float scalar, unsigned int num_points)
++static inline void volk_32f_x2_s32f_interleave_16ic_a_avx2(lv_16sc_t* complexVector,
++ const float* iBuffer,
++ const float* qBuffer,
++ const float scalar,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+- const float* iBufferPtr = iBuffer;
+- const float* qBufferPtr = qBuffer;
++ unsigned int number = 0;
++ const float* iBufferPtr = iBuffer;
++ const float* qBufferPtr = qBuffer;
+
+- __m256 vScalar = _mm256_set1_ps(scalar);
++ __m256 vScalar = _mm256_set1_ps(scalar);
+
+- const unsigned int eighthPoints = num_points / 8;
++ const unsigned int eighthPoints = num_points / 8;
+
+- __m256 iValue, qValue, cplxValue1, cplxValue2;
+- __m256i intValue1, intValue2;
++ __m256 iValue, qValue, cplxValue1, cplxValue2;
++ __m256i intValue1, intValue2;
+
+- int16_t* complexVectorPtr = (int16_t*)complexVector;
++ int16_t* complexVectorPtr = (int16_t*)complexVector;
+
+- for(;number < eighthPoints; number++){
+- iValue = _mm256_load_ps(iBufferPtr);
+- qValue = _mm256_load_ps(qBufferPtr);
++ for (; number < eighthPoints; number++) {
++ iValue = _mm256_load_ps(iBufferPtr);
++ qValue = _mm256_load_ps(qBufferPtr);
+
+- // Interleaves the lower two values in the i and q variables into one buffer
+- cplxValue1 = _mm256_unpacklo_ps(iValue, qValue);
+- cplxValue1 = _mm256_mul_ps(cplxValue1, vScalar);
++ // Interleaves the lower two values in the i and q variables into one buffer
++ cplxValue1 = _mm256_unpacklo_ps(iValue, qValue);
++ cplxValue1 = _mm256_mul_ps(cplxValue1, vScalar);
+
+- // Interleaves the upper two values in the i and q variables into one buffer
+- cplxValue2 = _mm256_unpackhi_ps(iValue, qValue);
+- cplxValue2 = _mm256_mul_ps(cplxValue2, vScalar);
++ // Interleaves the upper two values in the i and q variables into one buffer
++ cplxValue2 = _mm256_unpackhi_ps(iValue, qValue);
++ cplxValue2 = _mm256_mul_ps(cplxValue2, vScalar);
+
+- intValue1 = _mm256_cvtps_epi32(cplxValue1);
+- intValue2 = _mm256_cvtps_epi32(cplxValue2);
++ intValue1 = _mm256_cvtps_epi32(cplxValue1);
++ intValue2 = _mm256_cvtps_epi32(cplxValue2);
+
+- intValue1 = _mm256_packs_epi32(intValue1, intValue2);
++ intValue1 = _mm256_packs_epi32(intValue1, intValue2);
+
+- _mm256_store_si256((__m256i*)complexVectorPtr, intValue1);
+- complexVectorPtr += 16;
++ _mm256_store_si256((__m256i*)complexVectorPtr, intValue1);
++ complexVectorPtr += 16;
+
+- iBufferPtr += 8;
+- qBufferPtr += 8;
+- }
++ iBufferPtr += 8;
++ qBufferPtr += 8;
++ }
+
+- number = eighthPoints * 8;
+- complexVectorPtr = (int16_t*)(&complexVector[number]);
+- for(; number < num_points; number++){
+- *complexVectorPtr++ = (int16_t)rintf(*iBufferPtr++ * scalar);
+- *complexVectorPtr++ = (int16_t)rintf(*qBufferPtr++ * scalar);
+- }
++ number = eighthPoints * 8;
++ complexVectorPtr = (int16_t*)(&complexVector[number]);
++ for (; number < num_points; number++) {
++ *complexVectorPtr++ = (int16_t)rintf(*iBufferPtr++ * scalar);
++ *complexVectorPtr++ = (int16_t)rintf(*qBufferPtr++ * scalar);
++ }
+ }
+ #endif /* LV_HAVE_AVX2 */
+
+@@ -136,53 +138,55 @@ volk_32f_x2_s32f_interleave_16ic_a_avx2(lv_16sc_t* complexVector, const float* i
+ #ifdef LV_HAVE_SSE2
+ #include <emmintrin.h>
+
+-static inline void
+-volk_32f_x2_s32f_interleave_16ic_a_sse2(lv_16sc_t* complexVector, const float* iBuffer,
+- const float* qBuffer, const float scalar, unsigned int num_points)
++static inline void volk_32f_x2_s32f_interleave_16ic_a_sse2(lv_16sc_t* complexVector,
++ const float* iBuffer,
++ const float* qBuffer,
++ const float scalar,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+- const float* iBufferPtr = iBuffer;
+- const float* qBufferPtr = qBuffer;
++ unsigned int number = 0;
++ const float* iBufferPtr = iBuffer;
++ const float* qBufferPtr = qBuffer;
+
+- __m128 vScalar = _mm_set_ps1(scalar);
++ __m128 vScalar = _mm_set_ps1(scalar);
+
+- const unsigned int quarterPoints = num_points / 4;
++ const unsigned int quarterPoints = num_points / 4;
+
+- __m128 iValue, qValue, cplxValue1, cplxValue2;
+- __m128i intValue1, intValue2;
++ __m128 iValue, qValue, cplxValue1, cplxValue2;
++ __m128i intValue1, intValue2;
+
+- int16_t* complexVectorPtr = (int16_t*)complexVector;
++ int16_t* complexVectorPtr = (int16_t*)complexVector;
+
+- for(;number < quarterPoints; number++){
+- iValue = _mm_load_ps(iBufferPtr);
+- qValue = _mm_load_ps(qBufferPtr);
++ for (; number < quarterPoints; number++) {
++ iValue = _mm_load_ps(iBufferPtr);
++ qValue = _mm_load_ps(qBufferPtr);
+
+- // Interleaves the lower two values in the i and q variables into one buffer
+- cplxValue1 = _mm_unpacklo_ps(iValue, qValue);
+- cplxValue1 = _mm_mul_ps(cplxValue1, vScalar);
++ // Interleaves the lower two values in the i and q variables into one buffer
++ cplxValue1 = _mm_unpacklo_ps(iValue, qValue);
++ cplxValue1 = _mm_mul_ps(cplxValue1, vScalar);
+
+- // Interleaves the upper two values in the i and q variables into one buffer
+- cplxValue2 = _mm_unpackhi_ps(iValue, qValue);
+- cplxValue2 = _mm_mul_ps(cplxValue2, vScalar);
++ // Interleaves the upper two values in the i and q variables into one buffer
++ cplxValue2 = _mm_unpackhi_ps(iValue, qValue);
++ cplxValue2 = _mm_mul_ps(cplxValue2, vScalar);
+
+- intValue1 = _mm_cvtps_epi32(cplxValue1);
+- intValue2 = _mm_cvtps_epi32(cplxValue2);
++ intValue1 = _mm_cvtps_epi32(cplxValue1);
++ intValue2 = _mm_cvtps_epi32(cplxValue2);
+
+- intValue1 = _mm_packs_epi32(intValue1, intValue2);
++ intValue1 = _mm_packs_epi32(intValue1, intValue2);
+
+- _mm_store_si128((__m128i*)complexVectorPtr, intValue1);
+- complexVectorPtr += 8;
++ _mm_store_si128((__m128i*)complexVectorPtr, intValue1);
++ complexVectorPtr += 8;
+
+- iBufferPtr += 4;
+- qBufferPtr += 4;
+- }
++ iBufferPtr += 4;
++ qBufferPtr += 4;
++ }
+
+- number = quarterPoints * 4;
+- complexVectorPtr = (int16_t*)(&complexVector[number]);
+- for(; number < num_points; number++){
+- *complexVectorPtr++ = (int16_t)rintf(*iBufferPtr++ * scalar);
+- *complexVectorPtr++ = (int16_t)rintf(*qBufferPtr++ * scalar);
+- }
++ number = quarterPoints * 4;
++ complexVectorPtr = (int16_t*)(&complexVector[number]);
++ for (; number < num_points; number++) {
++ *complexVectorPtr++ = (int16_t)rintf(*iBufferPtr++ * scalar);
++ *complexVectorPtr++ = (int16_t)rintf(*qBufferPtr++ * scalar);
++ }
+ }
+ #endif /* LV_HAVE_SSE2 */
+
+@@ -190,79 +194,83 @@ volk_32f_x2_s32f_interleave_16ic_a_sse2(lv_16sc_t* complexVector, const float* i
+ #ifdef LV_HAVE_SSE
+ #include <xmmintrin.h>
+
+-static inline void
+-volk_32f_x2_s32f_interleave_16ic_a_sse(lv_16sc_t* complexVector, const float* iBuffer,
+- const float* qBuffer, const float scalar, unsigned int num_points)
++static inline void volk_32f_x2_s32f_interleave_16ic_a_sse(lv_16sc_t* complexVector,
++ const float* iBuffer,
++ const float* qBuffer,
++ const float scalar,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+- const float* iBufferPtr = iBuffer;
+- const float* qBufferPtr = qBuffer;
++ unsigned int number = 0;
++ const float* iBufferPtr = iBuffer;
++ const float* qBufferPtr = qBuffer;
+
+- __m128 vScalar = _mm_set_ps1(scalar);
++ __m128 vScalar = _mm_set_ps1(scalar);
+
+- const unsigned int quarterPoints = num_points / 4;
++ const unsigned int quarterPoints = num_points / 4;
+
+- __m128 iValue, qValue, cplxValue;
++ __m128 iValue, qValue, cplxValue;
+
+- int16_t* complexVectorPtr = (int16_t*)complexVector;
++ int16_t* complexVectorPtr = (int16_t*)complexVector;
+
+- __VOLK_ATTR_ALIGNED(16) float floatBuffer[4];
++ __VOLK_ATTR_ALIGNED(16) float floatBuffer[4];
+
+- for(;number < quarterPoints; number++){
+- iValue = _mm_load_ps(iBufferPtr);
+- qValue = _mm_load_ps(qBufferPtr);
++ for (; number < quarterPoints; number++) {
++ iValue = _mm_load_ps(iBufferPtr);
++ qValue = _mm_load_ps(qBufferPtr);
+
+- // Interleaves the lower two values in the i and q variables into one buffer
+- cplxValue = _mm_unpacklo_ps(iValue, qValue);
+- cplxValue = _mm_mul_ps(cplxValue, vScalar);
++ // Interleaves the lower two values in the i and q variables into one buffer
++ cplxValue = _mm_unpacklo_ps(iValue, qValue);
++ cplxValue = _mm_mul_ps(cplxValue, vScalar);
+
+- _mm_store_ps(floatBuffer, cplxValue);
++ _mm_store_ps(floatBuffer, cplxValue);
+
+- *complexVectorPtr++ = (int16_t)rintf(floatBuffer[0]);
+- *complexVectorPtr++ = (int16_t)rintf(floatBuffer[1]);
+- *complexVectorPtr++ = (int16_t)rintf(floatBuffer[2]);
+- *complexVectorPtr++ = (int16_t)rintf(floatBuffer[3]);
++ *complexVectorPtr++ = (int16_t)rintf(floatBuffer[0]);
++ *complexVectorPtr++ = (int16_t)rintf(floatBuffer[1]);
++ *complexVectorPtr++ = (int16_t)rintf(floatBuffer[2]);
++ *complexVectorPtr++ = (int16_t)rintf(floatBuffer[3]);
+
+- // Interleaves the upper two values in the i and q variables into one buffer
+- cplxValue = _mm_unpackhi_ps(iValue, qValue);
+- cplxValue = _mm_mul_ps(cplxValue, vScalar);
++ // Interleaves the upper two values in the i and q variables into one buffer
++ cplxValue = _mm_unpackhi_ps(iValue, qValue);
++ cplxValue = _mm_mul_ps(cplxValue, vScalar);
+
+- _mm_store_ps(floatBuffer, cplxValue);
++ _mm_store_ps(floatBuffer, cplxValue);
+
+- *complexVectorPtr++ = (int16_t)rintf(floatBuffer[0]);
+- *complexVectorPtr++ = (int16_t)rintf(floatBuffer[1]);
+- *complexVectorPtr++ = (int16_t)rintf(floatBuffer[2]);
+- *complexVectorPtr++ = (int16_t)rintf(floatBuffer[3]);
++ *complexVectorPtr++ = (int16_t)rintf(floatBuffer[0]);
++ *complexVectorPtr++ = (int16_t)rintf(floatBuffer[1]);
++ *complexVectorPtr++ = (int16_t)rintf(floatBuffer[2]);
++ *complexVectorPtr++ = (int16_t)rintf(floatBuffer[3]);
+
+- iBufferPtr += 4;
+- qBufferPtr += 4;
+- }
++ iBufferPtr += 4;
++ qBufferPtr += 4;
++ }
+
+- number = quarterPoints * 4;
+- complexVectorPtr = (int16_t*)(&complexVector[number]);
+- for(; number < num_points; number++){
+- *complexVectorPtr++ = (int16_t)rintf(*iBufferPtr++ * scalar);
+- *complexVectorPtr++ = (int16_t)rintf(*qBufferPtr++ * scalar);
+- }
++ number = quarterPoints * 4;
++ complexVectorPtr = (int16_t*)(&complexVector[number]);
++ for (; number < num_points; number++) {
++ *complexVectorPtr++ = (int16_t)rintf(*iBufferPtr++ * scalar);
++ *complexVectorPtr++ = (int16_t)rintf(*qBufferPtr++ * scalar);
++ }
+ }
+ #endif /* LV_HAVE_SSE */
+
+
+ #ifdef LV_HAVE_GENERIC
+
+-static inline void
+-volk_32f_x2_s32f_interleave_16ic_generic(lv_16sc_t* complexVector, const float* iBuffer,
+- const float* qBuffer, const float scalar, unsigned int num_points)
++static inline void volk_32f_x2_s32f_interleave_16ic_generic(lv_16sc_t* complexVector,
++ const float* iBuffer,
++ const float* qBuffer,
++ const float scalar,
++ unsigned int num_points)
+ {
+- int16_t* complexVectorPtr = (int16_t*)complexVector;
+- const float* iBufferPtr = iBuffer;
+- const float* qBufferPtr = qBuffer;
+- unsigned int number = 0;
+-
+- for(number = 0; number < num_points; number++){
+- *complexVectorPtr++ = (int16_t)rintf(*iBufferPtr++ * scalar);
+- *complexVectorPtr++ = (int16_t)rintf(*qBufferPtr++ * scalar);
+- }
++ int16_t* complexVectorPtr = (int16_t*)complexVector;
++ const float* iBufferPtr = iBuffer;
++ const float* qBufferPtr = qBuffer;
++ unsigned int number = 0;
++
++ for (number = 0; number < num_points; number++) {
++ *complexVectorPtr++ = (int16_t)rintf(*iBufferPtr++ * scalar);
++ *complexVectorPtr++ = (int16_t)rintf(*qBufferPtr++ * scalar);
++ }
+ }
+ #endif /* LV_HAVE_GENERIC */
+
+@@ -272,60 +280,62 @@ volk_32f_x2_s32f_interleave_16ic_generic(lv_16sc_t* complexVector, const float*
+ #ifndef INCLUDED_volk_32f_x2_s32f_interleave_16ic_u_H
+ #define INCLUDED_volk_32f_x2_s32f_interleave_16ic_u_H
+
+-#include <volk/volk_common.h>
+ #include <inttypes.h>
+ #include <stdio.h>
++#include <volk/volk_common.h>
+
+ #ifdef LV_HAVE_AVX2
+ #include <immintrin.h>
+
+-static inline void
+-volk_32f_x2_s32f_interleave_16ic_u_avx2(lv_16sc_t* complexVector, const float* iBuffer,
+- const float* qBuffer, const float scalar, unsigned int num_points)
++static inline void volk_32f_x2_s32f_interleave_16ic_u_avx2(lv_16sc_t* complexVector,
++ const float* iBuffer,
++ const float* qBuffer,
++ const float scalar,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+- const float* iBufferPtr = iBuffer;
+- const float* qBufferPtr = qBuffer;
++ unsigned int number = 0;
++ const float* iBufferPtr = iBuffer;
++ const float* qBufferPtr = qBuffer;
+
+- __m256 vScalar = _mm256_set1_ps(scalar);
++ __m256 vScalar = _mm256_set1_ps(scalar);
+
+- const unsigned int eighthPoints = num_points / 8;
++ const unsigned int eighthPoints = num_points / 8;
+
+- __m256 iValue, qValue, cplxValue1, cplxValue2;
+- __m256i intValue1, intValue2;
++ __m256 iValue, qValue, cplxValue1, cplxValue2;
++ __m256i intValue1, intValue2;
+
+- int16_t* complexVectorPtr = (int16_t*)complexVector;
++ int16_t* complexVectorPtr = (int16_t*)complexVector;
+
+- for(;number < eighthPoints; number++){
+- iValue = _mm256_loadu_ps(iBufferPtr);
+- qValue = _mm256_loadu_ps(qBufferPtr);
++ for (; number < eighthPoints; number++) {
++ iValue = _mm256_loadu_ps(iBufferPtr);
++ qValue = _mm256_loadu_ps(qBufferPtr);
+
+- // Interleaves the lower two values in the i and q variables into one buffer
+- cplxValue1 = _mm256_unpacklo_ps(iValue, qValue);
+- cplxValue1 = _mm256_mul_ps(cplxValue1, vScalar);
++ // Interleaves the lower two values in the i and q variables into one buffer
++ cplxValue1 = _mm256_unpacklo_ps(iValue, qValue);
++ cplxValue1 = _mm256_mul_ps(cplxValue1, vScalar);
+
+- // Interleaves the upper two values in the i and q variables into one buffer
+- cplxValue2 = _mm256_unpackhi_ps(iValue, qValue);
+- cplxValue2 = _mm256_mul_ps(cplxValue2, vScalar);
++ // Interleaves the upper two values in the i and q variables into one buffer
++ cplxValue2 = _mm256_unpackhi_ps(iValue, qValue);
++ cplxValue2 = _mm256_mul_ps(cplxValue2, vScalar);
+
+- intValue1 = _mm256_cvtps_epi32(cplxValue1);
+- intValue2 = _mm256_cvtps_epi32(cplxValue2);
++ intValue1 = _mm256_cvtps_epi32(cplxValue1);
++ intValue2 = _mm256_cvtps_epi32(cplxValue2);
+
+- intValue1 = _mm256_packs_epi32(intValue1, intValue2);
++ intValue1 = _mm256_packs_epi32(intValue1, intValue2);
+
+- _mm256_storeu_si256((__m256i*)complexVectorPtr, intValue1);
+- complexVectorPtr += 16;
++ _mm256_storeu_si256((__m256i*)complexVectorPtr, intValue1);
++ complexVectorPtr += 16;
+
+- iBufferPtr += 8;
+- qBufferPtr += 8;
+- }
++ iBufferPtr += 8;
++ qBufferPtr += 8;
++ }
+
+- number = eighthPoints * 8;
+- complexVectorPtr = (int16_t*)(&complexVector[number]);
+- for(; number < num_points; number++){
+- *complexVectorPtr++ = (int16_t)rintf(*iBufferPtr++ * scalar);
+- *complexVectorPtr++ = (int16_t)rintf(*qBufferPtr++ * scalar);
+- }
++ number = eighthPoints * 8;
++ complexVectorPtr = (int16_t*)(&complexVector[number]);
++ for (; number < num_points; number++) {
++ *complexVectorPtr++ = (int16_t)rintf(*iBufferPtr++ * scalar);
++ *complexVectorPtr++ = (int16_t)rintf(*qBufferPtr++ * scalar);
++ }
+ }
+ #endif /* LV_HAVE_AVX2 */
+
+diff --git a/kernels/volk/volk_32f_x2_subtract_32f.h b/kernels/volk/volk_32f_x2_subtract_32f.h
+index bdfa0a1..359974c 100644
+--- a/kernels/volk/volk_32f_x2_subtract_32f.h
++++ b/kernels/volk/volk_32f_x2_subtract_32f.h
+@@ -31,8 +31,8 @@
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+- * void volk_32f_x2_subtract_32f(float* cVector, const float* aVector, const float* bVector, unsigned int num_points)
+- * \endcode
++ * void volk_32f_x2_subtract_32f(float* cVector, const float* aVector, const float*
++ * bVector, unsigned int num_points) \endcode
+ *
+ * \b Inputs
+ * \li aVector: The initial vector.
+@@ -77,126 +77,130 @@
+ #ifdef LV_HAVE_AVX512F
+ #include <immintrin.h>
+
+-static inline void
+-volk_32f_x2_subtract_32f_a_avx512f(float* cVector, const float* aVector,
+- const float* bVector, unsigned int num_points)
++static inline void volk_32f_x2_subtract_32f_a_avx512f(float* cVector,
++ const float* aVector,
++ const float* bVector,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+- const unsigned int sixteenthPoints = num_points / 16;
++ unsigned int number = 0;
++ const unsigned int sixteenthPoints = num_points / 16;
+
+- float* cPtr = cVector;
+- const float* aPtr = aVector;
+- const float* bPtr = bVector;
++ float* cPtr = cVector;
++ const float* aPtr = aVector;
++ const float* bPtr = bVector;
+
+- __m512 aVal, bVal, cVal;
+- for(;number < sixteenthPoints; number++){
++ __m512 aVal, bVal, cVal;
++ for (; number < sixteenthPoints; number++) {
+
+- aVal = _mm512_load_ps(aPtr);
+- bVal = _mm512_load_ps(bPtr);
++ aVal = _mm512_load_ps(aPtr);
++ bVal = _mm512_load_ps(bPtr);
+
+- cVal = _mm512_sub_ps(aVal, bVal);
++ cVal = _mm512_sub_ps(aVal, bVal);
+
+- _mm512_store_ps(cPtr,cVal); // Store the results back into the C container
++ _mm512_store_ps(cPtr, cVal); // Store the results back into the C container
+
+- aPtr += 16;
+- bPtr += 16;
+- cPtr += 16;
+- }
++ aPtr += 16;
++ bPtr += 16;
++ cPtr += 16;
++ }
+
+- number = sixteenthPoints *16;
+- for(;number < num_points; number++){
+- *cPtr++ = (*aPtr++) - (*bPtr++);
+- }
++ number = sixteenthPoints * 16;
++ for (; number < num_points; number++) {
++ *cPtr++ = (*aPtr++) - (*bPtr++);
++ }
+ }
+ #endif /* LV_HAVE_AVX512F */
+
+ #ifdef LV_HAVE_AVX
+ #include <immintrin.h>
+
+-static inline void
+-volk_32f_x2_subtract_32f_a_avx(float* cVector, const float* aVector,
+- const float* bVector, unsigned int num_points)
++static inline void volk_32f_x2_subtract_32f_a_avx(float* cVector,
++ const float* aVector,
++ const float* bVector,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+- const unsigned int eighthPoints = num_points / 8;
++ unsigned int number = 0;
++ const unsigned int eighthPoints = num_points / 8;
+
+- float* cPtr = cVector;
+- const float* aPtr = aVector;
+- const float* bPtr = bVector;
++ float* cPtr = cVector;
++ const float* aPtr = aVector;
++ const float* bPtr = bVector;
+
+- __m256 aVal, bVal, cVal;
+- for(;number < eighthPoints; number++){
++ __m256 aVal, bVal, cVal;
++ for (; number < eighthPoints; number++) {
+
+- aVal = _mm256_load_ps(aPtr);
+- bVal = _mm256_load_ps(bPtr);
++ aVal = _mm256_load_ps(aPtr);
++ bVal = _mm256_load_ps(bPtr);
+
+- cVal = _mm256_sub_ps(aVal, bVal);
++ cVal = _mm256_sub_ps(aVal, bVal);
+
+- _mm256_store_ps(cPtr,cVal); // Store the results back into the C container
++ _mm256_store_ps(cPtr, cVal); // Store the results back into the C container
+
+- aPtr += 8;
+- bPtr += 8;
+- cPtr += 8;
+- }
++ aPtr += 8;
++ bPtr += 8;
++ cPtr += 8;
++ }
+
+- number = eighthPoints * 8;
+- for(;number < num_points; number++){
+- *cPtr++ = (*aPtr++) - (*bPtr++);
+- }
++ number = eighthPoints * 8;
++ for (; number < num_points; number++) {
++ *cPtr++ = (*aPtr++) - (*bPtr++);
++ }
+ }
+ #endif /* LV_HAVE_AVX */
+
+ #ifdef LV_HAVE_SSE
+ #include <xmmintrin.h>
+
+-static inline void
+-volk_32f_x2_subtract_32f_a_sse(float* cVector, const float* aVector,
+- const float* bVector, unsigned int num_points)
++static inline void volk_32f_x2_subtract_32f_a_sse(float* cVector,
++ const float* aVector,
++ const float* bVector,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+- const unsigned int quarterPoints = num_points / 4;
++ unsigned int number = 0;
++ const unsigned int quarterPoints = num_points / 4;
+
+- float* cPtr = cVector;
+- const float* aPtr = aVector;
+- const float* bPtr = bVector;
++ float* cPtr = cVector;
++ const float* aPtr = aVector;
++ const float* bPtr = bVector;
+
+- __m128 aVal, bVal, cVal;
+- for(;number < quarterPoints; number++){
++ __m128 aVal, bVal, cVal;
++ for (; number < quarterPoints; number++) {
+
+- aVal = _mm_load_ps(aPtr);
+- bVal = _mm_load_ps(bPtr);
++ aVal = _mm_load_ps(aPtr);
++ bVal = _mm_load_ps(bPtr);
+
+- cVal = _mm_sub_ps(aVal, bVal);
++ cVal = _mm_sub_ps(aVal, bVal);
+
+- _mm_store_ps(cPtr,cVal); // Store the results back into the C container
++ _mm_store_ps(cPtr, cVal); // Store the results back into the C container
+
+- aPtr += 4;
+- bPtr += 4;
+- cPtr += 4;
+- }
++ aPtr += 4;
++ bPtr += 4;
++ cPtr += 4;
++ }
+
+- number = quarterPoints * 4;
+- for(;number < num_points; number++){
+- *cPtr++ = (*aPtr++) - (*bPtr++);
+- }
++ number = quarterPoints * 4;
++ for (; number < num_points; number++) {
++ *cPtr++ = (*aPtr++) - (*bPtr++);
++ }
+ }
+ #endif /* LV_HAVE_SSE */
+
+
+ #ifdef LV_HAVE_GENERIC
+
+-static inline void
+-volk_32f_x2_subtract_32f_generic(float* cVector, const float* aVector,
+- const float* bVector, unsigned int num_points)
++static inline void volk_32f_x2_subtract_32f_generic(float* cVector,
++ const float* aVector,
++ const float* bVector,
++ unsigned int num_points)
+ {
+- float* cPtr = cVector;
+- const float* aPtr = aVector;
+- const float* bPtr = bVector;
+- unsigned int number = 0;
+-
+- for(number = 0; number < num_points; number++){
+- *cPtr++ = (*aPtr++) - (*bPtr++);
+- }
++ float* cPtr = cVector;
++ const float* aPtr = aVector;
++ const float* bPtr = bVector;
++ unsigned int number = 0;
++
++ for (number = 0; number < num_points; number++) {
++ *cPtr++ = (*aPtr++) - (*bPtr++);
++ }
+ }
+ #endif /* LV_HAVE_GENERIC */
+
+@@ -204,45 +208,48 @@ volk_32f_x2_subtract_32f_generic(float* cVector, const float* aVector,
+ #ifdef LV_HAVE_NEON
+ #include <arm_neon.h>
+
+-static inline void
+-volk_32f_x2_subtract_32f_neon(float* cVector, const float* aVector,
+- const float* bVector, unsigned int num_points)
++static inline void volk_32f_x2_subtract_32f_neon(float* cVector,
++ const float* aVector,
++ const float* bVector,
++ unsigned int num_points)
+ {
+- float* cPtr = cVector;
+- const float* aPtr = aVector;
+- const float* bPtr = bVector;
+- unsigned int number = 0;
+- unsigned int quarter_points = num_points / 4;
+-
+- float32x4_t a_vec, b_vec, c_vec;
+-
+- for(number = 0; number < quarter_points; number++){
+- a_vec = vld1q_f32(aPtr);
+- b_vec = vld1q_f32(bPtr);
+- c_vec = vsubq_f32(a_vec, b_vec);
+- vst1q_f32(cPtr, c_vec);
+- aPtr += 4;
+- bPtr += 4;
+- cPtr += 4;
+- }
+-
+- for(number = quarter_points * 4; number < num_points; number++){
+- *cPtr++ = (*aPtr++) - (*bPtr++);
+- }
++ float* cPtr = cVector;
++ const float* aPtr = aVector;
++ const float* bPtr = bVector;
++ unsigned int number = 0;
++ unsigned int quarter_points = num_points / 4;
++
++ float32x4_t a_vec, b_vec, c_vec;
++
++ for (number = 0; number < quarter_points; number++) {
++ a_vec = vld1q_f32(aPtr);
++ b_vec = vld1q_f32(bPtr);
++ c_vec = vsubq_f32(a_vec, b_vec);
++ vst1q_f32(cPtr, c_vec);
++ aPtr += 4;
++ bPtr += 4;
++ cPtr += 4;
++ }
++
++ for (number = quarter_points * 4; number < num_points; number++) {
++ *cPtr++ = (*aPtr++) - (*bPtr++);
++ }
+ }
+ #endif /* LV_HAVE_NEON */
+
+
+ #ifdef LV_HAVE_ORC
+-extern void
+-volk_32f_x2_subtract_32f_a_orc_impl(float* cVector, const float* aVector,
+- const float* bVector, unsigned int num_points);
+-
+-static inline void
+-volk_32f_x2_subtract_32f_u_orc(float* cVector, const float* aVector,
+- const float* bVector, unsigned int num_points)
++extern void volk_32f_x2_subtract_32f_a_orc_impl(float* cVector,
++ const float* aVector,
++ const float* bVector,
++ unsigned int num_points);
++
++static inline void volk_32f_x2_subtract_32f_u_orc(float* cVector,
++ const float* aVector,
++ const float* bVector,
++ unsigned int num_points)
+ {
+- volk_32f_x2_subtract_32f_a_orc_impl(cVector, aVector, bVector, num_points);
++ volk_32f_x2_subtract_32f_a_orc_impl(cVector, aVector, bVector, num_points);
+ }
+ #endif /* LV_HAVE_ORC */
+
+@@ -259,36 +266,37 @@ volk_32f_x2_subtract_32f_u_orc(float* cVector, const float* aVector,
+ #ifdef LV_HAVE_AVX512F
+ #include <immintrin.h>
+
+-static inline void
+-volk_32f_x2_subtract_32f_u_avx512f(float* cVector, const float* aVector,
+- const float* bVector, unsigned int num_points)
++static inline void volk_32f_x2_subtract_32f_u_avx512f(float* cVector,
++ const float* aVector,
++ const float* bVector,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+- const unsigned int sixteenthPoints = num_points / 16;
++ unsigned int number = 0;
++ const unsigned int sixteenthPoints = num_points / 16;
+
+- float* cPtr = cVector;
+- const float* aPtr = aVector;
+- const float* bPtr = bVector;
++ float* cPtr = cVector;
++ const float* aPtr = aVector;
++ const float* bPtr = bVector;
+
+- __m512 aVal, bVal, cVal;
+- for(;number < sixteenthPoints; number++){
++ __m512 aVal, bVal, cVal;
++ for (; number < sixteenthPoints; number++) {
+
+- aVal = _mm512_loadu_ps(aPtr);
+- bVal = _mm512_loadu_ps(bPtr);
++ aVal = _mm512_loadu_ps(aPtr);
++ bVal = _mm512_loadu_ps(bPtr);
+
+- cVal = _mm512_sub_ps(aVal, bVal);
++ cVal = _mm512_sub_ps(aVal, bVal);
+
+- _mm512_storeu_ps(cPtr,cVal); // Store the results back into the C container
++ _mm512_storeu_ps(cPtr, cVal); // Store the results back into the C container
+
+- aPtr += 16;
+- bPtr += 16;
+- cPtr += 16;
+- }
++ aPtr += 16;
++ bPtr += 16;
++ cPtr += 16;
++ }
+
+- number = sixteenthPoints *16;
+- for(;number < num_points; number++){
+- *cPtr++ = (*aPtr++) - (*bPtr++);
+- }
++ number = sixteenthPoints * 16;
++ for (; number < num_points; number++) {
++ *cPtr++ = (*aPtr++) - (*bPtr++);
++ }
+ }
+ #endif /* LV_HAVE_AVX512F */
+
+@@ -296,36 +304,37 @@ volk_32f_x2_subtract_32f_u_avx512f(float* cVector, const float* aVector,
+ #ifdef LV_HAVE_AVX
+ #include <immintrin.h>
+
+-static inline void
+-volk_32f_x2_subtract_32f_u_avx(float* cVector, const float* aVector,
+- const float* bVector, unsigned int num_points)
++static inline void volk_32f_x2_subtract_32f_u_avx(float* cVector,
++ const float* aVector,
++ const float* bVector,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+- const unsigned int eighthPoints = num_points / 8;
++ unsigned int number = 0;
++ const unsigned int eighthPoints = num_points / 8;
+
+- float* cPtr = cVector;
+- const float* aPtr = aVector;
+- const float* bPtr = bVector;
++ float* cPtr = cVector;
++ const float* aPtr = aVector;
++ const float* bPtr = bVector;
+
+- __m256 aVal, bVal, cVal;
+- for(;number < eighthPoints; number++){
++ __m256 aVal, bVal, cVal;
++ for (; number < eighthPoints; number++) {
+
+- aVal = _mm256_loadu_ps(aPtr);
+- bVal = _mm256_loadu_ps(bPtr);
++ aVal = _mm256_loadu_ps(aPtr);
++ bVal = _mm256_loadu_ps(bPtr);
+
+- cVal = _mm256_sub_ps(aVal, bVal);
++ cVal = _mm256_sub_ps(aVal, bVal);
+
+- _mm256_storeu_ps(cPtr,cVal); // Store the results back into the C container
++ _mm256_storeu_ps(cPtr, cVal); // Store the results back into the C container
+
+- aPtr += 8;
+- bPtr += 8;
+- cPtr += 8;
+- }
++ aPtr += 8;
++ bPtr += 8;
++ cPtr += 8;
++ }
+
+- number = eighthPoints * 8;
+- for(;number < num_points; number++){
+- *cPtr++ = (*aPtr++) - (*bPtr++);
+- }
++ number = eighthPoints * 8;
++ for (; number < num_points; number++) {
++ *cPtr++ = (*aPtr++) - (*bPtr++);
++ }
+ }
+ #endif /* LV_HAVE_AVX */
+
+diff --git a/kernels/volk/volk_32f_x3_sum_of_poly_32f.h b/kernels/volk/volk_32f_x3_sum_of_poly_32f.h
+index e74a385..b0b1466 100644
+--- a/kernels/volk/volk_32f_x3_sum_of_poly_32f.h
++++ b/kernels/volk/volk_32f_x3_sum_of_poly_32f.h
+@@ -30,12 +30,13 @@
+ * multiply by the rectangle/bin width.
+ *
+ * Expressed as a formula, this function calculates
+- * \f$ \sum f(x) = \sum (c_0 + c_1 \cdot x + c_2 \cdot x^2 + c_3 \cdot x^3 + c_4 \cdot x^4)\f$
++ * \f$ \sum f(x) = \sum (c_0 + c_1 \cdot x + c_2 \cdot x^2 + c_3 \cdot x^3 + c_4 \cdot
++ * x^4)\f$
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+- * void volk_32f_x3_sum_of_poly_32f(float* target, float* src0, float* center_point_array, float* cutoff, unsigned int num_points)
+- * \endcode
++ * void volk_32f_x3_sum_of_poly_32f(float* target, float* src0, float* center_point_array,
++ * float* cutoff, unsigned int num_points) \endcode
+ *
+ * \b Inputs
+ * \li src0: x values
+@@ -53,9 +54,10 @@
+ * \code
+ * int npoints = 4096;
+ * float* coefficients = (float*)volk_malloc(sizeof(float) * 5, volk_get_alignment());
+- * float* input = (float*)volk_malloc(sizeof(float) * npoints, volk_get_alignment());
+- * float* result = (float*)volk_malloc(sizeof(float), volk_get_alignment());
+- * float* cutoff = (float*)volk_malloc(sizeof(float), volk_get_alignment());
++ * float* input = (float*)volk_malloc(sizeof(float) * npoints,
++ * volk_get_alignment()); float* result = (float*)volk_malloc(sizeof(float),
++ * volk_get_alignment()); float* cutoff = (float*)volk_malloc(sizeof(float),
++ * volk_get_alignment());
+ * // load precomputed Taylor series coefficients
+ * coefficients[0] = 4.48168907033806f; // c1
+ * coefficients[1] = coefficients[0] * 0.5f; // c2
+@@ -82,288 +84,291 @@
+ #ifndef INCLUDED_volk_32f_x3_sum_of_poly_32f_a_H
+ #define INCLUDED_volk_32f_x3_sum_of_poly_32f_a_H
+
+-#include<inttypes.h>
+-#include<stdio.h>
+-#include<volk/volk_complex.h>
++#include <inttypes.h>
++#include <stdio.h>
++#include <volk/volk_complex.h>
+
+ #ifndef MAX
+-#define MAX(X,Y) ((X) > (Y)?(X):(Y))
++#define MAX(X, Y) ((X) > (Y) ? (X) : (Y))
+ #endif
+
+ #ifdef LV_HAVE_SSE3
+-#include<xmmintrin.h>
+-#include<pmmintrin.h>
+-
+-static inline void
+-volk_32f_x3_sum_of_poly_32f_a_sse3(float* target, float* src0, float* center_point_array,
+- float* cutoff, unsigned int num_points)
++#include <pmmintrin.h>
++#include <xmmintrin.h>
++
++static inline void volk_32f_x3_sum_of_poly_32f_a_sse3(float* target,
++ float* src0,
++ float* center_point_array,
++ float* cutoff,
++ unsigned int num_points)
+ {
+- float result = 0.0f;
+- float fst = 0.0f;
+- float sq = 0.0f;
+- float thrd = 0.0f;
+- float frth = 0.0f;
+-
+- __m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;
+-
+- xmm9 = _mm_setzero_ps();
+- xmm1 = _mm_setzero_ps();
+-  xmm0 = _mm_load1_ps(&center_point_array[0]);
+-  xmm6 = _mm_load1_ps(&center_point_array[1]);
+-  xmm7 = _mm_load1_ps(&center_point_array[2]);
+-  xmm8 = _mm_load1_ps(&center_point_array[3]);
+- xmm10 = _mm_load1_ps(cutoff);
+-
+- int bound = num_points/8;
+- int leftovers = num_points - 8*bound;
+- int i = 0;
+- for(; i < bound; ++i) {
+- // 1st
+- xmm2 = _mm_load_ps(src0);
+- xmm2 = _mm_max_ps(xmm10, xmm2);
+- xmm3 = _mm_mul_ps(xmm2, xmm2);
+- xmm4 = _mm_mul_ps(xmm2, xmm3);
+- xmm5 = _mm_mul_ps(xmm3, xmm3);
+-
+- xmm2 = _mm_mul_ps(xmm2, xmm0);
+- xmm3 = _mm_mul_ps(xmm3, xmm6);
+- xmm4 = _mm_mul_ps(xmm4, xmm7);
+- xmm5 = _mm_mul_ps(xmm5, xmm8);
+-
+- xmm2 = _mm_add_ps(xmm2, xmm3);
+- xmm3 = _mm_add_ps(xmm4, xmm5);
+-
+- src0 += 4;
+-
+- xmm9 = _mm_add_ps(xmm2, xmm9);
+- xmm9 = _mm_add_ps(xmm3, xmm9);
+-
+- // 2nd
+- xmm2 = _mm_load_ps(src0);
+- xmm2 = _mm_max_ps(xmm10, xmm2);
+- xmm3 = _mm_mul_ps(xmm2, xmm2);
+- xmm4 = _mm_mul_ps(xmm2, xmm3);
+- xmm5 = _mm_mul_ps(xmm3, xmm3);
+-
+- xmm2 = _mm_mul_ps(xmm2, xmm0);
+- xmm3 = _mm_mul_ps(xmm3, xmm6);
+- xmm4 = _mm_mul_ps(xmm4, xmm7);
+- xmm5 = _mm_mul_ps(xmm5, xmm8);
+-
+- xmm2 = _mm_add_ps(xmm2, xmm3);
+- xmm3 = _mm_add_ps(xmm4, xmm5);
+-
+- src0 += 4;
+-
+- xmm1 = _mm_add_ps(xmm2, xmm1);
+- xmm1 = _mm_add_ps(xmm3, xmm1);
+- }
+- xmm2 = _mm_hadd_ps(xmm9, xmm1);
+- xmm3 = _mm_hadd_ps(xmm2, xmm2);
+- xmm4 = _mm_hadd_ps(xmm3, xmm3);
+- _mm_store_ss(&result, xmm4);
+-
+- for(i = 0; i < leftovers; ++i) {
+- fst = *src0++;
+- fst = MAX(fst, *cutoff);
+- sq = fst * fst;
+- thrd = fst * sq;
+- frth = sq * sq;
+- result += (center_point_array[0] * fst +
+- center_point_array[1] * sq +
+- center_point_array[2] * thrd +
+- center_point_array[3] * frth);
+- }
+-
+- result += (float)(num_points) * center_point_array[4];
+- *target = result;
++ float result = 0.0f;
++ float fst = 0.0f;
++ float sq = 0.0f;
++ float thrd = 0.0f;
++ float frth = 0.0f;
++
++ __m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;
++
++ xmm9 = _mm_setzero_ps();
++ xmm1 = _mm_setzero_ps();
++    xmm0 = _mm_load1_ps(&center_point_array[0]);
++    xmm6 = _mm_load1_ps(&center_point_array[1]);
++    xmm7 = _mm_load1_ps(&center_point_array[2]);
++    xmm8 = _mm_load1_ps(&center_point_array[3]);
++ xmm10 = _mm_load1_ps(cutoff);
++
++ int bound = num_points / 8;
++ int leftovers = num_points - 8 * bound;
++ int i = 0;
++ for (; i < bound; ++i) {
++ // 1st
++ xmm2 = _mm_load_ps(src0);
++ xmm2 = _mm_max_ps(xmm10, xmm2);
++ xmm3 = _mm_mul_ps(xmm2, xmm2);
++ xmm4 = _mm_mul_ps(xmm2, xmm3);
++ xmm5 = _mm_mul_ps(xmm3, xmm3);
++
++ xmm2 = _mm_mul_ps(xmm2, xmm0);
++ xmm3 = _mm_mul_ps(xmm3, xmm6);
++ xmm4 = _mm_mul_ps(xmm4, xmm7);
++ xmm5 = _mm_mul_ps(xmm5, xmm8);
++
++ xmm2 = _mm_add_ps(xmm2, xmm3);
++ xmm3 = _mm_add_ps(xmm4, xmm5);
++
++ src0 += 4;
++
++ xmm9 = _mm_add_ps(xmm2, xmm9);
++ xmm9 = _mm_add_ps(xmm3, xmm9);
++
++ // 2nd
++ xmm2 = _mm_load_ps(src0);
++ xmm2 = _mm_max_ps(xmm10, xmm2);
++ xmm3 = _mm_mul_ps(xmm2, xmm2);
++ xmm4 = _mm_mul_ps(xmm2, xmm3);
++ xmm5 = _mm_mul_ps(xmm3, xmm3);
++
++ xmm2 = _mm_mul_ps(xmm2, xmm0);
++ xmm3 = _mm_mul_ps(xmm3, xmm6);
++ xmm4 = _mm_mul_ps(xmm4, xmm7);
++ xmm5 = _mm_mul_ps(xmm5, xmm8);
++
++ xmm2 = _mm_add_ps(xmm2, xmm3);
++ xmm3 = _mm_add_ps(xmm4, xmm5);
++
++ src0 += 4;
++
++ xmm1 = _mm_add_ps(xmm2, xmm1);
++ xmm1 = _mm_add_ps(xmm3, xmm1);
++ }
++ xmm2 = _mm_hadd_ps(xmm9, xmm1);
++ xmm3 = _mm_hadd_ps(xmm2, xmm2);
++ xmm4 = _mm_hadd_ps(xmm3, xmm3);
++ _mm_store_ss(&result, xmm4);
++
++ for (i = 0; i < leftovers; ++i) {
++ fst = *src0++;
++ fst = MAX(fst, *cutoff);
++ sq = fst * fst;
++ thrd = fst * sq;
++ frth = sq * sq;
++ result += (center_point_array[0] * fst + center_point_array[1] * sq +
++ center_point_array[2] * thrd + center_point_array[3] * frth);
++ }
++
++ result += (float)(num_points)*center_point_array[4];
++ *target = result;
+ }
+
+
+ #endif /*LV_HAVE_SSE3*/
+
+ #if LV_HAVE_AVX && LV_HAVE_FMA
+-#include<immintrin.h>
++#include <immintrin.h>
+
+-static inline void
+-volk_32f_x3_sum_of_poly_32f_a_avx2_fma(float* target, float* src0, float* center_point_array,
+- float* cutoff, unsigned int num_points)
++static inline void volk_32f_x3_sum_of_poly_32f_a_avx2_fma(float* target,
++ float* src0,
++ float* center_point_array,
++ float* cutoff,
++ unsigned int num_points)
+ {
+- const unsigned int eighth_points = num_points / 8;
+- float fst = 0.0;
+- float sq = 0.0;
+- float thrd = 0.0;
+- float frth = 0.0;
+-
+- __m256 cpa0, cpa1, cpa2, cpa3, cutoff_vec;
+- __m256 target_vec;
+- __m256 x_to_1, x_to_2, x_to_3, x_to_4;
+-
+- cpa0 = _mm256_set1_ps(center_point_array[0]);
+- cpa1 = _mm256_set1_ps(center_point_array[1]);
+- cpa2 = _mm256_set1_ps(center_point_array[2]);
+- cpa3 = _mm256_set1_ps(center_point_array[3]);
+- cutoff_vec = _mm256_set1_ps(*cutoff);
+- target_vec = _mm256_setzero_ps();
+-
+- unsigned int i;
+-
+- for(i = 0; i < eighth_points; ++i) {
+- x_to_1 = _mm256_load_ps(src0);
+- x_to_1 = _mm256_max_ps(x_to_1, cutoff_vec);
+- x_to_2 = _mm256_mul_ps(x_to_1, x_to_1); // x^2
+- x_to_3 = _mm256_mul_ps(x_to_1, x_to_2); // x^3
+- // x^1 * x^3 is slightly faster than x^2 * x^2
+- x_to_4 = _mm256_mul_ps(x_to_1, x_to_3); // x^4
+-
+- x_to_2 = _mm256_mul_ps(x_to_2, cpa1); // cpa[1] * x^2
+- x_to_4 = _mm256_mul_ps(x_to_4, cpa3); // cpa[3] * x^4
+-
+- x_to_1 = _mm256_fmadd_ps(x_to_1, cpa0, x_to_2);
+- x_to_3 = _mm256_fmadd_ps(x_to_3, cpa2, x_to_4);
+- // this is slightly faster than result += (x_to_1 + x_to_3)
+- target_vec = _mm256_add_ps(x_to_1, target_vec);
+- target_vec = _mm256_add_ps(x_to_3, target_vec);
+-
+- src0 += 8;
+- }
+-
+- // the hadd for vector reduction has very very slight impact @ 50k iters
+- __VOLK_ATTR_ALIGNED(32) float temp_results[8];
+- target_vec = _mm256_hadd_ps(target_vec, target_vec); // x0+x1 | x2+x3 | x0+x1 | x2+x3 || x4+x5 | x6+x7 | x4+x5 | x6+x7
+- _mm256_store_ps(temp_results, target_vec);
+- *target = temp_results[0] + temp_results[1] + temp_results[4] + temp_results[5];
+-
+- for(i = eighth_points*8; i < num_points; ++i) {
+- fst = *src0++;
+- fst = MAX(fst, *cutoff);
+- sq = fst * fst;
+- thrd = fst * sq;
+- frth = sq * sq;
+- *target += (center_point_array[0] * fst +
+- center_point_array[1] * sq +
+- center_point_array[2] * thrd +
+- center_point_array[3] * frth);
+- }
+- *target += (float)(num_points) * center_point_array[4];
++ const unsigned int eighth_points = num_points / 8;
++ float fst = 0.0;
++ float sq = 0.0;
++ float thrd = 0.0;
++ float frth = 0.0;
++
++ __m256 cpa0, cpa1, cpa2, cpa3, cutoff_vec;
++ __m256 target_vec;
++ __m256 x_to_1, x_to_2, x_to_3, x_to_4;
++
++ cpa0 = _mm256_set1_ps(center_point_array[0]);
++ cpa1 = _mm256_set1_ps(center_point_array[1]);
++ cpa2 = _mm256_set1_ps(center_point_array[2]);
++ cpa3 = _mm256_set1_ps(center_point_array[3]);
++ cutoff_vec = _mm256_set1_ps(*cutoff);
++ target_vec = _mm256_setzero_ps();
++
++ unsigned int i;
++
++ for (i = 0; i < eighth_points; ++i) {
++ x_to_1 = _mm256_load_ps(src0);
++ x_to_1 = _mm256_max_ps(x_to_1, cutoff_vec);
++ x_to_2 = _mm256_mul_ps(x_to_1, x_to_1); // x^2
++ x_to_3 = _mm256_mul_ps(x_to_1, x_to_2); // x^3
++ // x^1 * x^3 is slightly faster than x^2 * x^2
++ x_to_4 = _mm256_mul_ps(x_to_1, x_to_3); // x^4
++
++ x_to_2 = _mm256_mul_ps(x_to_2, cpa1); // cpa[1] * x^2
++ x_to_4 = _mm256_mul_ps(x_to_4, cpa3); // cpa[3] * x^4
++
++ x_to_1 = _mm256_fmadd_ps(x_to_1, cpa0, x_to_2);
++ x_to_3 = _mm256_fmadd_ps(x_to_3, cpa2, x_to_4);
++ // this is slightly faster than result += (x_to_1 + x_to_3)
++ target_vec = _mm256_add_ps(x_to_1, target_vec);
++ target_vec = _mm256_add_ps(x_to_3, target_vec);
++
++ src0 += 8;
++ }
++
++ // the hadd for vector reduction has very very slight impact @ 50k iters
++ __VOLK_ATTR_ALIGNED(32) float temp_results[8];
++ target_vec = _mm256_hadd_ps(
++ target_vec,
++ target_vec); // x0+x1 | x2+x3 | x0+x1 | x2+x3 || x4+x5 | x6+x7 | x4+x5 | x6+x7
++ _mm256_store_ps(temp_results, target_vec);
++ *target = temp_results[0] + temp_results[1] + temp_results[4] + temp_results[5];
++
++ for (i = eighth_points * 8; i < num_points; ++i) {
++ fst = *src0++;
++ fst = MAX(fst, *cutoff);
++ sq = fst * fst;
++ thrd = fst * sq;
++ frth = sq * sq;
++ *target += (center_point_array[0] * fst + center_point_array[1] * sq +
++ center_point_array[2] * thrd + center_point_array[3] * frth);
++ }
++ *target += (float)(num_points)*center_point_array[4];
+ }
+ #endif // LV_HAVE_AVX && LV_HAVE_FMA
+
+ #ifdef LV_HAVE_AVX
+-#include<immintrin.h>
++#include <immintrin.h>
+
+-static inline void
+-volk_32f_x3_sum_of_poly_32f_a_avx(float* target, float* src0, float* center_point_array,
+- float* cutoff, unsigned int num_points)
++static inline void volk_32f_x3_sum_of_poly_32f_a_avx(float* target,
++ float* src0,
++ float* center_point_array,
++ float* cutoff,
++ unsigned int num_points)
+ {
+- const unsigned int eighth_points = num_points / 8;
+- float fst = 0.0;
+- float sq = 0.0;
+- float thrd = 0.0;
+- float frth = 0.0;
+-
+- __m256 cpa0, cpa1, cpa2, cpa3, cutoff_vec;
+- __m256 target_vec;
+- __m256 x_to_1, x_to_2, x_to_3, x_to_4;
+-
+- cpa0 = _mm256_set1_ps(center_point_array[0]);
+- cpa1 = _mm256_set1_ps(center_point_array[1]);
+- cpa2 = _mm256_set1_ps(center_point_array[2]);
+- cpa3 = _mm256_set1_ps(center_point_array[3]);
+- cutoff_vec = _mm256_set1_ps(*cutoff);
+- target_vec = _mm256_setzero_ps();
+-
+- unsigned int i;
+-
+- for(i = 0; i < eighth_points; ++i) {
+- x_to_1 = _mm256_load_ps(src0);
+- x_to_1 = _mm256_max_ps(x_to_1, cutoff_vec);
+- x_to_2 = _mm256_mul_ps(x_to_1, x_to_1); // x^2
+- x_to_3 = _mm256_mul_ps(x_to_1, x_to_2); // x^3
+- // x^1 * x^3 is slightly faster than x^2 * x^2
+- x_to_4 = _mm256_mul_ps(x_to_1, x_to_3); // x^4
+-
+- x_to_1 = _mm256_mul_ps(x_to_1, cpa0); // cpa[0] * x^1
+- x_to_2 = _mm256_mul_ps(x_to_2, cpa1); // cpa[1] * x^2
+- x_to_3 = _mm256_mul_ps(x_to_3, cpa2); // cpa[2] * x^3
+- x_to_4 = _mm256_mul_ps(x_to_4, cpa3); // cpa[3] * x^4
+-
+- x_to_1 = _mm256_add_ps(x_to_1, x_to_2);
+- x_to_3 = _mm256_add_ps(x_to_3, x_to_4);
+- // this is slightly faster than result += (x_to_1 + x_to_3)
+- target_vec = _mm256_add_ps(x_to_1, target_vec);
+- target_vec = _mm256_add_ps(x_to_3, target_vec);
+-
+- src0 += 8;
+- }
+-
+- // the hadd for vector reduction has very very slight impact @ 50k iters
+- __VOLK_ATTR_ALIGNED(32) float temp_results[8];
+- target_vec = _mm256_hadd_ps(target_vec, target_vec); // x0+x1 | x2+x3 | x0+x1 | x2+x3 || x4+x5 | x6+x7 | x4+x5 | x6+x7
+- _mm256_store_ps(temp_results, target_vec);
+- *target = temp_results[0] + temp_results[1] + temp_results[4] + temp_results[5];
+-
+- for(i = eighth_points*8; i < num_points; ++i) {
+- fst = *src0++;
+- fst = MAX(fst, *cutoff);
+- sq = fst * fst;
+- thrd = fst * sq;
+- frth = sq * sq;
+- *target += (center_point_array[0] * fst +
+- center_point_array[1] * sq +
+- center_point_array[2] * thrd +
+- center_point_array[3] * frth);
+- }
+- *target += (float)(num_points) * center_point_array[4];
++ const unsigned int eighth_points = num_points / 8;
++ float fst = 0.0;
++ float sq = 0.0;
++ float thrd = 0.0;
++ float frth = 0.0;
++
++ __m256 cpa0, cpa1, cpa2, cpa3, cutoff_vec;
++ __m256 target_vec;
++ __m256 x_to_1, x_to_2, x_to_3, x_to_4;
++
++ cpa0 = _mm256_set1_ps(center_point_array[0]);
++ cpa1 = _mm256_set1_ps(center_point_array[1]);
++ cpa2 = _mm256_set1_ps(center_point_array[2]);
++ cpa3 = _mm256_set1_ps(center_point_array[3]);
++ cutoff_vec = _mm256_set1_ps(*cutoff);
++ target_vec = _mm256_setzero_ps();
++
++ unsigned int i;
++
++ for (i = 0; i < eighth_points; ++i) {
++ x_to_1 = _mm256_load_ps(src0);
++ x_to_1 = _mm256_max_ps(x_to_1, cutoff_vec);
++ x_to_2 = _mm256_mul_ps(x_to_1, x_to_1); // x^2
++ x_to_3 = _mm256_mul_ps(x_to_1, x_to_2); // x^3
++ // x^1 * x^3 is slightly faster than x^2 * x^2
++ x_to_4 = _mm256_mul_ps(x_to_1, x_to_3); // x^4
++
++ x_to_1 = _mm256_mul_ps(x_to_1, cpa0); // cpa[0] * x^1
++ x_to_2 = _mm256_mul_ps(x_to_2, cpa1); // cpa[1] * x^2
++ x_to_3 = _mm256_mul_ps(x_to_3, cpa2); // cpa[2] * x^3
++ x_to_4 = _mm256_mul_ps(x_to_4, cpa3); // cpa[3] * x^4
++
++ x_to_1 = _mm256_add_ps(x_to_1, x_to_2);
++ x_to_3 = _mm256_add_ps(x_to_3, x_to_4);
++ // this is slightly faster than result += (x_to_1 + x_to_3)
++ target_vec = _mm256_add_ps(x_to_1, target_vec);
++ target_vec = _mm256_add_ps(x_to_3, target_vec);
++
++ src0 += 8;
++ }
++
++ // the hadd for vector reduction has very very slight impact @ 50k iters
++ __VOLK_ATTR_ALIGNED(32) float temp_results[8];
++ target_vec = _mm256_hadd_ps(
++ target_vec,
++ target_vec); // x0+x1 | x2+x3 | x0+x1 | x2+x3 || x4+x5 | x6+x7 | x4+x5 | x6+x7
++ _mm256_store_ps(temp_results, target_vec);
++ *target = temp_results[0] + temp_results[1] + temp_results[4] + temp_results[5];
++
++ for (i = eighth_points * 8; i < num_points; ++i) {
++ fst = *src0++;
++ fst = MAX(fst, *cutoff);
++ sq = fst * fst;
++ thrd = fst * sq;
++ frth = sq * sq;
++ *target += (center_point_array[0] * fst + center_point_array[1] * sq +
++ center_point_array[2] * thrd + center_point_array[3] * frth);
++ }
++ *target += (float)(num_points)*center_point_array[4];
+ }
+ #endif // LV_HAVE_AVX
+
+
+-
+ #ifdef LV_HAVE_GENERIC
+
+-static inline void
+-volk_32f_x3_sum_of_poly_32f_generic(float* target, float* src0, float* center_point_array,
+- float* cutoff, unsigned int num_points)
++static inline void volk_32f_x3_sum_of_poly_32f_generic(float* target,
++ float* src0,
++ float* center_point_array,
++ float* cutoff,
++ unsigned int num_points)
+ {
+- const unsigned int eighth_points = num_points / 8;
+-
+- float result[8] = {0.0f,0.0f,0.0f,0.0f, 0.0f,0.0f,0.0f,0.0f};
+- float fst = 0.0f;
+- float sq = 0.0f;
+- float thrd = 0.0f;
+- float frth = 0.0f;
+-
+- unsigned int i = 0;
+- unsigned int k = 0;
+- for(i = 0; i < eighth_points; ++i) {
+- for(k = 0; k < 8; ++k) {
+- fst = *src0++;
+- fst = MAX(fst, *cutoff);
+- sq = fst * fst;
+- thrd = fst * sq;
+- frth = fst * thrd;
+- result[k] += center_point_array[0] * fst + center_point_array[1] * sq;
+- result[k] += center_point_array[2] * thrd + center_point_array[3] * frth;
++ const unsigned int eighth_points = num_points / 8;
++
++ float result[8] = { 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f };
++ float fst = 0.0f;
++ float sq = 0.0f;
++ float thrd = 0.0f;
++ float frth = 0.0f;
++
++ unsigned int i = 0;
++ unsigned int k = 0;
++ for (i = 0; i < eighth_points; ++i) {
++ for (k = 0; k < 8; ++k) {
++ fst = *src0++;
++ fst = MAX(fst, *cutoff);
++ sq = fst * fst;
++ thrd = fst * sq;
++ frth = fst * thrd;
++ result[k] += center_point_array[0] * fst + center_point_array[1] * sq;
++ result[k] += center_point_array[2] * thrd + center_point_array[3] * frth;
++ }
+ }
+- }
+- for(k = 0; k < 8; k+=2)
+- result[k] = result[k]+result[k+1];
+-
+- *target = result[0] + result[2] + result[4] + result[6];
+-
+- for(i = eighth_points*8; i < num_points; ++i) {
+- fst = *src0++;
+- fst = MAX(fst, *cutoff);
+- sq = fst * fst;
+- thrd = fst * sq;
+- frth = fst * thrd;
+- *target += (center_point_array[0] * fst +
+- center_point_array[1] * sq +
+- center_point_array[2] * thrd +
+- center_point_array[3] * frth);
+- }
+- *target += (float)(num_points) * center_point_array[4];
++ for (k = 0; k < 8; k += 2)
++ result[k] = result[k] + result[k + 1];
++
++ *target = result[0] + result[2] + result[4] + result[6];
++
++ for (i = eighth_points * 8; i < num_points; ++i) {
++ fst = *src0++;
++ fst = MAX(fst, *cutoff);
++ sq = fst * fst;
++ thrd = fst * sq;
++ frth = fst * thrd;
++ *target += (center_point_array[0] * fst + center_point_array[1] * sq +
++ center_point_array[2] * thrd + center_point_array[3] * frth);
++ }
++ *target += (float)(num_points)*center_point_array[4];
+ }
+
+ #endif /*LV_HAVE_GENERIC*/
+@@ -372,51 +377,52 @@ volk_32f_x3_sum_of_poly_32f_generic(float* target, float* src0, float* center_po
+ #include <arm_neon.h>
+
+ static inline void
+-volk_32f_x3_sum_of_poly_32f_a_neon(float* __restrict target, float* __restrict src0,
++volk_32f_x3_sum_of_poly_32f_a_neon(float* __restrict target,
++ float* __restrict src0,
+ float* __restrict center_point_array,
+- float* __restrict cutoff, unsigned int num_points)
++ float* __restrict cutoff,
++ unsigned int num_points)
+ {
+- unsigned int i;
+- float zero[4] = {0.0f, 0.0f, 0.0f, 0.0f };
+-
+- float32x2_t x_to_1, x_to_2, x_to_3, x_to_4;
+- float32x2_t cutoff_vector;
+- float32x2x2_t x_low, x_high;
+- float32x4_t x_qvector, c_qvector, cpa_qvector;
+- float accumulator;
+- float res_accumulators[4];
+-
+- c_qvector = vld1q_f32( zero );
+- // load the cutoff in to a vector
+- cutoff_vector = vdup_n_f32( *cutoff );
+- // ... center point array
+- cpa_qvector = vld1q_f32( center_point_array );
+-
+- for(i=0; i < num_points; ++i) {
+- // load x (src0)
+- x_to_1 = vdup_n_f32( *src0++ );
+-
+- // Get a vector of max(src0, cutoff)
+- x_to_1 = vmax_f32(x_to_1, cutoff_vector ); // x^1
+- x_to_2 = vmul_f32(x_to_1, x_to_1); // x^2
+- x_to_3 = vmul_f32(x_to_2, x_to_1); // x^3
+- x_to_4 = vmul_f32(x_to_3, x_to_1); // x^4
+- // zip up doubles to interleave
+- x_low = vzip_f32(x_to_1, x_to_2); // [x^2 | x^1 || x^2 | x^1]
+- x_high = vzip_f32(x_to_3, x_to_4); // [x^4 | x^3 || x^4 | x^3]
+- // float32x4_t vcombine_f32(float32x2_t low, float32x2_t high); // VMOV d0,d0
+- x_qvector = vcombine_f32(x_low.val[0], x_high.val[0]);
+- // now we finally have [x^4 | x^3 | x^2 | x] !
+-
+- c_qvector = vmlaq_f32(c_qvector, x_qvector, cpa_qvector);
+-
+- }
+- // there should be better vector reduction techniques
+- vst1q_f32(res_accumulators, c_qvector );
+- accumulator = res_accumulators[0] + res_accumulators[1] +
+- res_accumulators[2] + res_accumulators[3];
+-
+- *target = accumulator + (float)num_points * center_point_array[4];
++ unsigned int i;
++ float zero[4] = { 0.0f, 0.0f, 0.0f, 0.0f };
++
++ float32x2_t x_to_1, x_to_2, x_to_3, x_to_4;
++ float32x2_t cutoff_vector;
++ float32x2x2_t x_low, x_high;
++ float32x4_t x_qvector, c_qvector, cpa_qvector;
++ float accumulator;
++ float res_accumulators[4];
++
++ c_qvector = vld1q_f32(zero);
++    // load the cutoff into a vector
++ cutoff_vector = vdup_n_f32(*cutoff);
++ // ... center point array
++ cpa_qvector = vld1q_f32(center_point_array);
++
++ for (i = 0; i < num_points; ++i) {
++ // load x (src0)
++ x_to_1 = vdup_n_f32(*src0++);
++
++ // Get a vector of max(src0, cutoff)
++ x_to_1 = vmax_f32(x_to_1, cutoff_vector); // x^1
++ x_to_2 = vmul_f32(x_to_1, x_to_1); // x^2
++ x_to_3 = vmul_f32(x_to_2, x_to_1); // x^3
++ x_to_4 = vmul_f32(x_to_3, x_to_1); // x^4
++ // zip up doubles to interleave
++ x_low = vzip_f32(x_to_1, x_to_2); // [x^2 | x^1 || x^2 | x^1]
++ x_high = vzip_f32(x_to_3, x_to_4); // [x^4 | x^3 || x^4 | x^3]
++ // float32x4_t vcombine_f32(float32x2_t low, float32x2_t high); // VMOV d0,d0
++ x_qvector = vcombine_f32(x_low.val[0], x_high.val[0]);
++ // now we finally have [x^4 | x^3 | x^2 | x] !
++
++ c_qvector = vmlaq_f32(c_qvector, x_qvector, cpa_qvector);
++ }
++ // there should be better vector reduction techniques
++ vst1q_f32(res_accumulators, c_qvector);
++ accumulator = res_accumulators[0] + res_accumulators[1] + res_accumulators[2] +
++ res_accumulators[3];
++
++ *target = accumulator + (float)num_points * center_point_array[4];
+ }
+
+ #endif /* LV_HAVE_NEON */
+@@ -425,82 +431,82 @@ volk_32f_x3_sum_of_poly_32f_a_neon(float* __restrict target, float* __restrict s
+ #ifdef LV_HAVE_NEON
+
+ static inline void
+-volk_32f_x3_sum_of_poly_32f_neonvert(float* __restrict target, float* __restrict src0,
++volk_32f_x3_sum_of_poly_32f_neonvert(float* __restrict target,
++ float* __restrict src0,
+ float* __restrict center_point_array,
+- float* __restrict cutoff, unsigned int num_points)
++ float* __restrict cutoff,
++ unsigned int num_points)
+ {
+- unsigned int i;
+- float zero[4] = {0.0f, 0.0f, 0.0f, 0.0f };
+-
+- float accumulator;
+-
+- float32x4_t accumulator1_vec, accumulator2_vec, accumulator3_vec, accumulator4_vec;
+- accumulator1_vec = vld1q_f32(zero);
+- accumulator2_vec = vld1q_f32(zero);
+- accumulator3_vec = vld1q_f32(zero);
+- accumulator4_vec = vld1q_f32(zero);
+- float32x4_t x_to_1, x_to_2, x_to_3, x_to_4;
+- float32x4_t cutoff_vector, cpa_0, cpa_1, cpa_2, cpa_3;
+-
+- // load the cutoff in to a vector
+- cutoff_vector = vdupq_n_f32( *cutoff );
+- // ... center point array
+- cpa_0 = vdupq_n_f32(center_point_array[0]);
+- cpa_1 = vdupq_n_f32(center_point_array[1]);
+- cpa_2 = vdupq_n_f32(center_point_array[2]);
+- cpa_3 = vdupq_n_f32(center_point_array[3]);
+-
+- // nathan is not sure why this is slower *and* wrong compared to neonvertfma
+- for(i=0; i < num_points/4; ++i) {
+- // load x
+- x_to_1 = vld1q_f32( src0 );
+-
+- // Get a vector of max(src0, cutoff)
+- x_to_1 = vmaxq_f32(x_to_1, cutoff_vector ); // x^1
+- x_to_2 = vmulq_f32(x_to_1, x_to_1); // x^2
+- x_to_3 = vmulq_f32(x_to_2, x_to_1); // x^3
+- x_to_4 = vmulq_f32(x_to_3, x_to_1); // x^4
+- x_to_1 = vmulq_f32(x_to_1, cpa_0);
+- x_to_2 = vmulq_f32(x_to_2, cpa_1);
+- x_to_3 = vmulq_f32(x_to_3, cpa_2);
+- x_to_4 = vmulq_f32(x_to_4, cpa_3);
+- accumulator1_vec = vaddq_f32(accumulator1_vec, x_to_1);
+- accumulator2_vec = vaddq_f32(accumulator2_vec, x_to_2);
+- accumulator3_vec = vaddq_f32(accumulator3_vec, x_to_3);
+- accumulator4_vec = vaddq_f32(accumulator4_vec, x_to_4);
+-
+- src0 += 4;
+- }
+- accumulator1_vec = vaddq_f32(accumulator1_vec, accumulator2_vec);
+- accumulator3_vec = vaddq_f32(accumulator3_vec, accumulator4_vec);
+- accumulator1_vec = vaddq_f32(accumulator1_vec, accumulator3_vec);
+-
+- __VOLK_ATTR_ALIGNED(32) float res_accumulators[4];
+- vst1q_f32(res_accumulators, accumulator1_vec );
+- accumulator = res_accumulators[0] + res_accumulators[1] +
+- res_accumulators[2] + res_accumulators[3];
+-
+- float fst = 0.0;
+- float sq = 0.0;
+- float thrd = 0.0;
+- float frth = 0.0;
+-
+- for(i = 4*num_points/4; i < num_points; ++i) {
+- fst = src0[i];
+- fst = MAX(fst, *cutoff);
+-
+- sq = fst * fst;
+- thrd = fst * sq;
+- frth = sq * sq;
+- //fith = sq * thrd;
+-
+- accumulator += (center_point_array[0] * fst +
+- center_point_array[1] * sq +
+- center_point_array[2] * thrd +
+- center_point_array[3] * frth); //+
+- }
+-
+- *target = accumulator + (float)num_points * center_point_array[4];
++ unsigned int i;
++ float zero[4] = { 0.0f, 0.0f, 0.0f, 0.0f };
++
++ float accumulator;
++
++ float32x4_t accumulator1_vec, accumulator2_vec, accumulator3_vec, accumulator4_vec;
++ accumulator1_vec = vld1q_f32(zero);
++ accumulator2_vec = vld1q_f32(zero);
++ accumulator3_vec = vld1q_f32(zero);
++ accumulator4_vec = vld1q_f32(zero);
++ float32x4_t x_to_1, x_to_2, x_to_3, x_to_4;
++ float32x4_t cutoff_vector, cpa_0, cpa_1, cpa_2, cpa_3;
++
++    // load the cutoff into a vector
++ cutoff_vector = vdupq_n_f32(*cutoff);
++ // ... center point array
++ cpa_0 = vdupq_n_f32(center_point_array[0]);
++ cpa_1 = vdupq_n_f32(center_point_array[1]);
++ cpa_2 = vdupq_n_f32(center_point_array[2]);
++ cpa_3 = vdupq_n_f32(center_point_array[3]);
++
++ // nathan is not sure why this is slower *and* wrong compared to neonvertfma
++ for (i = 0; i < num_points / 4; ++i) {
++ // load x
++ x_to_1 = vld1q_f32(src0);
++
++ // Get a vector of max(src0, cutoff)
++ x_to_1 = vmaxq_f32(x_to_1, cutoff_vector); // x^1
++ x_to_2 = vmulq_f32(x_to_1, x_to_1); // x^2
++ x_to_3 = vmulq_f32(x_to_2, x_to_1); // x^3
++ x_to_4 = vmulq_f32(x_to_3, x_to_1); // x^4
++ x_to_1 = vmulq_f32(x_to_1, cpa_0);
++ x_to_2 = vmulq_f32(x_to_2, cpa_1);
++ x_to_3 = vmulq_f32(x_to_3, cpa_2);
++ x_to_4 = vmulq_f32(x_to_4, cpa_3);
++ accumulator1_vec = vaddq_f32(accumulator1_vec, x_to_1);
++ accumulator2_vec = vaddq_f32(accumulator2_vec, x_to_2);
++ accumulator3_vec = vaddq_f32(accumulator3_vec, x_to_3);
++ accumulator4_vec = vaddq_f32(accumulator4_vec, x_to_4);
++
++ src0 += 4;
++ }
++ accumulator1_vec = vaddq_f32(accumulator1_vec, accumulator2_vec);
++ accumulator3_vec = vaddq_f32(accumulator3_vec, accumulator4_vec);
++ accumulator1_vec = vaddq_f32(accumulator1_vec, accumulator3_vec);
++
++ __VOLK_ATTR_ALIGNED(32) float res_accumulators[4];
++ vst1q_f32(res_accumulators, accumulator1_vec);
++ accumulator = res_accumulators[0] + res_accumulators[1] + res_accumulators[2] +
++ res_accumulators[3];
++
++ float fst = 0.0;
++ float sq = 0.0;
++ float thrd = 0.0;
++ float frth = 0.0;
++
++ for (i = 4 * num_points / 4; i < num_points; ++i) {
++ fst = src0[i];
++ fst = MAX(fst, *cutoff);
++
++ sq = fst * fst;
++ thrd = fst * sq;
++ frth = sq * sq;
++ // fith = sq * thrd;
++
++ accumulator += (center_point_array[0] * fst + center_point_array[1] * sq +
++ center_point_array[2] * thrd + center_point_array[3] * frth); //+
++ }
++
++ *target = accumulator + (float)num_points * center_point_array[4];
+ }
+
+ #endif /* LV_HAVE_NEON */
+@@ -510,150 +516,154 @@ volk_32f_x3_sum_of_poly_32f_neonvert(float* __restrict target, float* __restrict
+ #ifndef INCLUDED_volk_32f_x3_sum_of_poly_32f_u_H
+ #define INCLUDED_volk_32f_x3_sum_of_poly_32f_u_H
+
+-#include<inttypes.h>
+-#include<stdio.h>
+-#include<volk/volk_complex.h>
++#include <inttypes.h>
++#include <stdio.h>
++#include <volk/volk_complex.h>
+
+ #ifndef MAX
+-#define MAX(X,Y) ((X) > (Y)?(X):(Y))
++#define MAX(X, Y) ((X) > (Y) ? (X) : (Y))
+ #endif
+
+ #if LV_HAVE_AVX && LV_HAVE_FMA
+-#include<immintrin.h>
++#include <immintrin.h>
+
+-static inline void
+-volk_32f_x3_sum_of_poly_32f_u_avx_fma(float* target, float* src0, float* center_point_array,
+- float* cutoff, unsigned int num_points)
++static inline void volk_32f_x3_sum_of_poly_32f_u_avx_fma(float* target,
++ float* src0,
++ float* center_point_array,
++ float* cutoff,
++ unsigned int num_points)
+ {
+- const unsigned int eighth_points = num_points / 8;
+- float fst = 0.0;
+- float sq = 0.0;
+- float thrd = 0.0;
+- float frth = 0.0;
+-
+- __m256 cpa0, cpa1, cpa2, cpa3, cutoff_vec;
+- __m256 target_vec;
+- __m256 x_to_1, x_to_2, x_to_3, x_to_4;
+-
+- cpa0 = _mm256_set1_ps(center_point_array[0]);
+- cpa1 = _mm256_set1_ps(center_point_array[1]);
+- cpa2 = _mm256_set1_ps(center_point_array[2]);
+- cpa3 = _mm256_set1_ps(center_point_array[3]);
+- cutoff_vec = _mm256_set1_ps(*cutoff);
+- target_vec = _mm256_setzero_ps();
+-
+- unsigned int i;
+-
+- for(i = 0; i < eighth_points; ++i) {
+- x_to_1 = _mm256_loadu_ps(src0);
+- x_to_1 = _mm256_max_ps(x_to_1, cutoff_vec);
+- x_to_2 = _mm256_mul_ps(x_to_1, x_to_1); // x^2
+- x_to_3 = _mm256_mul_ps(x_to_1, x_to_2); // x^3
+- // x^1 * x^3 is slightly faster than x^2 * x^2
+- x_to_4 = _mm256_mul_ps(x_to_1, x_to_3); // x^4
+-
+- x_to_2 = _mm256_mul_ps(x_to_2, cpa1); // cpa[1] * x^2
+- x_to_4 = _mm256_mul_ps(x_to_4, cpa3); // cpa[3] * x^4
+-
+- x_to_1 = _mm256_fmadd_ps(x_to_1, cpa0, x_to_2);
+- x_to_3 = _mm256_fmadd_ps(x_to_3, cpa2, x_to_4);
+- // this is slightly faster than result += (x_to_1 + x_to_3)
+- target_vec = _mm256_add_ps(x_to_1, target_vec);
+- target_vec = _mm256_add_ps(x_to_3, target_vec);
+-
+- src0 += 8;
+- }
+-
+- // the hadd for vector reduction has very very slight impact @ 50k iters
+- __VOLK_ATTR_ALIGNED(32) float temp_results[8];
+- target_vec = _mm256_hadd_ps(target_vec, target_vec); // x0+x1 | x2+x3 | x0+x1 | x2+x3 || x4+x5 | x6+x7 | x4+x5 | x6+x7
+- _mm256_storeu_ps(temp_results, target_vec);
+- *target = temp_results[0] + temp_results[1] + temp_results[4] + temp_results[5];
+-
+- for(i = eighth_points*8; i < num_points; ++i) {
+- fst = *src0++;
+- fst = MAX(fst, *cutoff);
+- sq = fst * fst;
+- thrd = fst * sq;
+- frth = sq * sq;
+- *target += (center_point_array[0] * fst +
+- center_point_array[1] * sq +
+- center_point_array[2] * thrd +
+- center_point_array[3] * frth);
+- }
+-
+- *target += (float)(num_points) * center_point_array[4];
++ const unsigned int eighth_points = num_points / 8;
++ float fst = 0.0;
++ float sq = 0.0;
++ float thrd = 0.0;
++ float frth = 0.0;
++
++ __m256 cpa0, cpa1, cpa2, cpa3, cutoff_vec;
++ __m256 target_vec;
++ __m256 x_to_1, x_to_2, x_to_3, x_to_4;
++
++ cpa0 = _mm256_set1_ps(center_point_array[0]);
++ cpa1 = _mm256_set1_ps(center_point_array[1]);
++ cpa2 = _mm256_set1_ps(center_point_array[2]);
++ cpa3 = _mm256_set1_ps(center_point_array[3]);
++ cutoff_vec = _mm256_set1_ps(*cutoff);
++ target_vec = _mm256_setzero_ps();
++
++ unsigned int i;
++
++ for (i = 0; i < eighth_points; ++i) {
++ x_to_1 = _mm256_loadu_ps(src0);
++ x_to_1 = _mm256_max_ps(x_to_1, cutoff_vec);
++ x_to_2 = _mm256_mul_ps(x_to_1, x_to_1); // x^2
++ x_to_3 = _mm256_mul_ps(x_to_1, x_to_2); // x^3
++ // x^1 * x^3 is slightly faster than x^2 * x^2
++ x_to_4 = _mm256_mul_ps(x_to_1, x_to_3); // x^4
++
++ x_to_2 = _mm256_mul_ps(x_to_2, cpa1); // cpa[1] * x^2
++ x_to_4 = _mm256_mul_ps(x_to_4, cpa3); // cpa[3] * x^4
++
++ x_to_1 = _mm256_fmadd_ps(x_to_1, cpa0, x_to_2);
++ x_to_3 = _mm256_fmadd_ps(x_to_3, cpa2, x_to_4);
++ // this is slightly faster than result += (x_to_1 + x_to_3)
++ target_vec = _mm256_add_ps(x_to_1, target_vec);
++ target_vec = _mm256_add_ps(x_to_3, target_vec);
++
++ src0 += 8;
++ }
++
++ // the hadd for vector reduction has very very slight impact @ 50k iters
++ __VOLK_ATTR_ALIGNED(32) float temp_results[8];
++ target_vec = _mm256_hadd_ps(
++ target_vec,
++ target_vec); // x0+x1 | x2+x3 | x0+x1 | x2+x3 || x4+x5 | x6+x7 | x4+x5 | x6+x7
++ _mm256_storeu_ps(temp_results, target_vec);
++ *target = temp_results[0] + temp_results[1] + temp_results[4] + temp_results[5];
++
++ for (i = eighth_points * 8; i < num_points; ++i) {
++ fst = *src0++;
++ fst = MAX(fst, *cutoff);
++ sq = fst * fst;
++ thrd = fst * sq;
++ frth = sq * sq;
++ *target += (center_point_array[0] * fst + center_point_array[1] * sq +
++ center_point_array[2] * thrd + center_point_array[3] * frth);
++ }
++
++ *target += (float)(num_points)*center_point_array[4];
+ }
+ #endif // LV_HAVE_AVX && LV_HAVE_FMA
+
+ #ifdef LV_HAVE_AVX
+-#include<immintrin.h>
++#include <immintrin.h>
+
+-static inline void
+-volk_32f_x3_sum_of_poly_32f_u_avx(float* target, float* src0, float* center_point_array,
+- float* cutoff, unsigned int num_points)
++static inline void volk_32f_x3_sum_of_poly_32f_u_avx(float* target,
++ float* src0,
++ float* center_point_array,
++ float* cutoff,
++ unsigned int num_points)
+ {
+- const unsigned int eighth_points = num_points / 8;
+- float fst = 0.0;
+- float sq = 0.0;
+- float thrd = 0.0;
+- float frth = 0.0;
+-
+- __m256 cpa0, cpa1, cpa2, cpa3, cutoff_vec;
+- __m256 target_vec;
+- __m256 x_to_1, x_to_2, x_to_3, x_to_4;
+-
+- cpa0 = _mm256_set1_ps(center_point_array[0]);
+- cpa1 = _mm256_set1_ps(center_point_array[1]);
+- cpa2 = _mm256_set1_ps(center_point_array[2]);
+- cpa3 = _mm256_set1_ps(center_point_array[3]);
+- cutoff_vec = _mm256_set1_ps(*cutoff);
+- target_vec = _mm256_setzero_ps();
+-
+- unsigned int i;
+-
+- for(i = 0; i < eighth_points; ++i) {
+- x_to_1 = _mm256_loadu_ps(src0);
+- x_to_1 = _mm256_max_ps(x_to_1, cutoff_vec);
+- x_to_2 = _mm256_mul_ps(x_to_1, x_to_1); // x^2
+- x_to_3 = _mm256_mul_ps(x_to_1, x_to_2); // x^3
+- // x^1 * x^3 is slightly faster than x^2 * x^2
+- x_to_4 = _mm256_mul_ps(x_to_1, x_to_3); // x^4
+-
+- x_to_1 = _mm256_mul_ps(x_to_1, cpa0); // cpa[0] * x^1
+- x_to_2 = _mm256_mul_ps(x_to_2, cpa1); // cpa[1] * x^2
+- x_to_3 = _mm256_mul_ps(x_to_3, cpa2); // cpa[2] * x^3
+- x_to_4 = _mm256_mul_ps(x_to_4, cpa3); // cpa[3] * x^4
+-
+- x_to_1 = _mm256_add_ps(x_to_1, x_to_2);
+- x_to_3 = _mm256_add_ps(x_to_3, x_to_4);
+- // this is slightly faster than result += (x_to_1 + x_to_3)
+- target_vec = _mm256_add_ps(x_to_1, target_vec);
+- target_vec = _mm256_add_ps(x_to_3, target_vec);
+-
+- src0 += 8;
+- }
+-
+- // the hadd for vector reduction has very very slight impact @ 50k iters
+- __VOLK_ATTR_ALIGNED(32) float temp_results[8];
+- target_vec = _mm256_hadd_ps(target_vec, target_vec); // x0+x1 | x2+x3 | x0+x1 | x2+x3 || x4+x5 | x6+x7 | x4+x5 | x6+x7
+- _mm256_storeu_ps(temp_results, target_vec);
+- *target = temp_results[0] + temp_results[1] + temp_results[4] + temp_results[5];
+-
+- for(i = eighth_points*8; i < num_points; ++i) {
+- fst = *src0++;
+- fst = MAX(fst, *cutoff);
+- sq = fst * fst;
+- thrd = fst * sq;
+- frth = sq * sq;
+-
+- *target += (center_point_array[0] * fst +
+- center_point_array[1] * sq +
+- center_point_array[2] * thrd +
+- center_point_array[3] * frth);
+- }
+-
+- *target += (float)(num_points) * center_point_array[4];
++ const unsigned int eighth_points = num_points / 8;
++ float fst = 0.0;
++ float sq = 0.0;
++ float thrd = 0.0;
++ float frth = 0.0;
++
++ __m256 cpa0, cpa1, cpa2, cpa3, cutoff_vec;
++ __m256 target_vec;
++ __m256 x_to_1, x_to_2, x_to_3, x_to_4;
++
++ cpa0 = _mm256_set1_ps(center_point_array[0]);
++ cpa1 = _mm256_set1_ps(center_point_array[1]);
++ cpa2 = _mm256_set1_ps(center_point_array[2]);
++ cpa3 = _mm256_set1_ps(center_point_array[3]);
++ cutoff_vec = _mm256_set1_ps(*cutoff);
++ target_vec = _mm256_setzero_ps();
++
++ unsigned int i;
++
++ for (i = 0; i < eighth_points; ++i) {
++ x_to_1 = _mm256_loadu_ps(src0);
++ x_to_1 = _mm256_max_ps(x_to_1, cutoff_vec);
++ x_to_2 = _mm256_mul_ps(x_to_1, x_to_1); // x^2
++ x_to_3 = _mm256_mul_ps(x_to_1, x_to_2); // x^3
++ // x^1 * x^3 is slightly faster than x^2 * x^2
++ x_to_4 = _mm256_mul_ps(x_to_1, x_to_3); // x^4
++
++ x_to_1 = _mm256_mul_ps(x_to_1, cpa0); // cpa[0] * x^1
++ x_to_2 = _mm256_mul_ps(x_to_2, cpa1); // cpa[1] * x^2
++ x_to_3 = _mm256_mul_ps(x_to_3, cpa2); // cpa[2] * x^3
++ x_to_4 = _mm256_mul_ps(x_to_4, cpa3); // cpa[3] * x^4
++
++ x_to_1 = _mm256_add_ps(x_to_1, x_to_2);
++ x_to_3 = _mm256_add_ps(x_to_3, x_to_4);
++ // this is slightly faster than result += (x_to_1 + x_to_3)
++ target_vec = _mm256_add_ps(x_to_1, target_vec);
++ target_vec = _mm256_add_ps(x_to_3, target_vec);
++
++ src0 += 8;
++ }
++
++ // the hadd for vector reduction has very very slight impact @ 50k iters
++ __VOLK_ATTR_ALIGNED(32) float temp_results[8];
++ target_vec = _mm256_hadd_ps(
++ target_vec,
++ target_vec); // x0+x1 | x2+x3 | x0+x1 | x2+x3 || x4+x5 | x6+x7 | x4+x5 | x6+x7
++ _mm256_storeu_ps(temp_results, target_vec);
++ *target = temp_results[0] + temp_results[1] + temp_results[4] + temp_results[5];
++
++ for (i = eighth_points * 8; i < num_points; ++i) {
++ fst = *src0++;
++ fst = MAX(fst, *cutoff);
++ sq = fst * fst;
++ thrd = fst * sq;
++ frth = sq * sq;
++
++ *target += (center_point_array[0] * fst + center_point_array[1] * sq +
++ center_point_array[2] * thrd + center_point_array[3] * frth);
++ }
++
++ *target += (float)(num_points)*center_point_array[4];
+ }
+ #endif // LV_HAVE_AVX
+
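Every variant in this file computes the same reduction described in its header comment: clamp each sample to the cutoff, accumulate the degree-one through degree-four terms, and add the constant coefficient once per input point at the very end. A scalar restatement of that computation, assuming the same coefficient layout as the kernels above (identifiers are illustrative, and the cutoff is passed by value here for brevity):

    static inline float sum_of_poly_reference(
        const float* x, const float* c, float cutoff, unsigned int num_points)
    {
        float acc = 0.0f;
        for (unsigned int n = 0; n < num_points; n++) {
            const float v = x[n] > cutoff ? x[n] : cutoff; /* MAX(x, cutoff) */
            const float v2 = v * v;
            acc += c[0] * v + c[1] * v2 + c[2] * v2 * v + c[3] * v2 * v2;
        }
        return acc + (float)num_points * c[4]; /* constant term added once per point */
    }

The SIMD variants differ only in how many samples they clamp and evaluate per iteration and in how they reduce the partial sums at the end.
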
+diff --git a/kernels/volk/volk_32fc_32f_add_32fc.h b/kernels/volk/volk_32fc_32f_add_32fc.h
+index 86a3818..b25ca6a 100644
+--- a/kernels/volk/volk_32fc_32f_add_32fc.h
++++ b/kernels/volk/volk_32fc_32f_add_32fc.h
+@@ -31,8 +31,8 @@
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+- * void volk_32fc_32f_add_32fc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const float* bVector, unsigned int num_points)
+- * \endcode
++ * void volk_32fc_32f_add_32fc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const float*
++ * bVector, unsigned int num_points) \endcode
+ *
+ * \b Inputs
+ * \li aVector: First vector of input points.
+@@ -44,7 +44,8 @@
+ *
+ * \b Example
+ *
+- * The follow example adds the increasing and decreasing vectors such that the result of every summation pair is 10
++ * The following example adds the increasing and decreasing vectors such that the result of
++ * every summation pair is 10
+ *
+ * \code
+ * int N = 10;
+@@ -75,18 +76,19 @@
+
+ #ifdef LV_HAVE_GENERIC
+
+-static inline void
+-volk_32fc_32f_add_32fc_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector,
+- const float* bVector, unsigned int num_points)
++static inline void volk_32fc_32f_add_32fc_generic(lv_32fc_t* cVector,
++ const lv_32fc_t* aVector,
++ const float* bVector,
++ unsigned int num_points)
+ {
+- lv_32fc_t* cPtr = cVector;
+- const lv_32fc_t* aPtr = aVector;
+- const float* bPtr= bVector;
+- unsigned int number = 0;
+-
+- for(number = 0; number < num_points; number++){
+- *cPtr++ = (*aPtr++) + (*bPtr++);
+- }
++ lv_32fc_t* cPtr = cVector;
++ const lv_32fc_t* aPtr = aVector;
++ const float* bPtr = bVector;
++ unsigned int number = 0;
++
++ for (number = 0; number < num_points; number++) {
++ *cPtr++ = (*aPtr++) + (*bPtr++);
++ }
+ }
+ #endif /* LV_HAVE_GENERIC */
+
+@@ -94,143 +96,150 @@ volk_32fc_32f_add_32fc_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector,
+ #ifdef LV_HAVE_AVX
+ #include <immintrin.h>
+
+-static inline void
+-volk_32fc_32f_add_32fc_u_avx(lv_32fc_t* cVector, const lv_32fc_t* aVector,
+- const float* bVector, unsigned int num_points)
++static inline void volk_32fc_32f_add_32fc_u_avx(lv_32fc_t* cVector,
++ const lv_32fc_t* aVector,
++ const float* bVector,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+- const unsigned int eighthPoints = num_points / 8;
+-
+- lv_32fc_t* cPtr = cVector;
+- const lv_32fc_t* aPtr = aVector;
+- const float* bPtr= bVector;
+-
+- __m256 aVal1, aVal2, bVal, cVal1, cVal2;
+- __m256 cpx_b1, cpx_b2;
+- __m256 zero;
+- zero = _mm256_setzero_ps();
+- __m256 tmp1, tmp2;
+- for(;number < eighthPoints; number++){
+-
+- aVal1 = _mm256_loadu_ps((float *) aPtr);
+- aVal2 = _mm256_loadu_ps((float *) (aPtr+4));
+- bVal = _mm256_loadu_ps(bPtr);
+- cpx_b1 = _mm256_unpacklo_ps(bVal, zero); // b0, 0, b1, 0, b4, 0, b5, 0
+- cpx_b2 = _mm256_unpackhi_ps(bVal, zero); // b2, 0, b3, 0, b6, 0, b7, 0
+-
+- tmp1 = _mm256_permute2f128_ps(cpx_b1, cpx_b2, 0x0+(0x2<<4));
+- tmp2 = _mm256_permute2f128_ps(cpx_b1, cpx_b2, 0x1+(0x3<<4));
+-
+- cVal1 = _mm256_add_ps(aVal1, tmp1);
+- cVal2 = _mm256_add_ps(aVal2, tmp2);
+-
+- _mm256_storeu_ps((float *) cPtr, cVal1); // Store the results back into the C container
+- _mm256_storeu_ps((float *) (cPtr+4), cVal2); // Store the results back into the C container
+-
+- aPtr += 8;
+- bPtr += 8;
+- cPtr += 8;
+- }
+-
+- number = eighthPoints * 8;
+- for(;number < num_points; number++){
+- *cPtr++ = (*aPtr++) + (*bPtr++);
+- }
++ unsigned int number = 0;
++ const unsigned int eighthPoints = num_points / 8;
++
++ lv_32fc_t* cPtr = cVector;
++ const lv_32fc_t* aPtr = aVector;
++ const float* bPtr = bVector;
++
++ __m256 aVal1, aVal2, bVal, cVal1, cVal2;
++ __m256 cpx_b1, cpx_b2;
++ __m256 zero;
++ zero = _mm256_setzero_ps();
++ __m256 tmp1, tmp2;
++ for (; number < eighthPoints; number++) {
++
++ aVal1 = _mm256_loadu_ps((float*)aPtr);
++ aVal2 = _mm256_loadu_ps((float*)(aPtr + 4));
++ bVal = _mm256_loadu_ps(bPtr);
++ cpx_b1 = _mm256_unpacklo_ps(bVal, zero); // b0, 0, b1, 0, b4, 0, b5, 0
++ cpx_b2 = _mm256_unpackhi_ps(bVal, zero); // b2, 0, b3, 0, b6, 0, b7, 0
++
++ tmp1 = _mm256_permute2f128_ps(cpx_b1, cpx_b2, 0x0 + (0x2 << 4));
++ tmp2 = _mm256_permute2f128_ps(cpx_b1, cpx_b2, 0x1 + (0x3 << 4));
++
++ cVal1 = _mm256_add_ps(aVal1, tmp1);
++ cVal2 = _mm256_add_ps(aVal2, tmp2);
++
++ _mm256_storeu_ps((float*)cPtr,
++ cVal1); // Store the results back into the C container
++ _mm256_storeu_ps((float*)(cPtr + 4),
++ cVal2); // Store the results back into the C container
++
++ aPtr += 8;
++ bPtr += 8;
++ cPtr += 8;
++ }
++
++ number = eighthPoints * 8;
++ for (; number < num_points; number++) {
++ *cPtr++ = (*aPtr++) + (*bPtr++);
++ }
+ }
+ #endif /* LV_HAVE_AVX */
+
+ #ifdef LV_HAVE_AVX
+ #include <immintrin.h>
+
+-static inline void
+-volk_32fc_32f_add_32fc_a_avx(lv_32fc_t* cVector, const lv_32fc_t* aVector,
+- const float* bVector, unsigned int num_points)
++static inline void volk_32fc_32f_add_32fc_a_avx(lv_32fc_t* cVector,
++ const lv_32fc_t* aVector,
++ const float* bVector,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+- const unsigned int eighthPoints = num_points / 8;
+-
+- lv_32fc_t* cPtr = cVector;
+- const lv_32fc_t* aPtr = aVector;
+- const float* bPtr= bVector;
+-
+- __m256 aVal1, aVal2, bVal, cVal1, cVal2;
+- __m256 cpx_b1, cpx_b2;
+- __m256 zero;
+- zero = _mm256_setzero_ps();
+- __m256 tmp1, tmp2;
+- for(;number < eighthPoints; number++){
+-
+- aVal1 = _mm256_load_ps((float *) aPtr);
+- aVal2 = _mm256_load_ps((float *) (aPtr+4));
+- bVal = _mm256_load_ps(bPtr);
+- cpx_b1 = _mm256_unpacklo_ps(bVal, zero); // b0, 0, b1, 0, b4, 0, b5, 0
+- cpx_b2 = _mm256_unpackhi_ps(bVal, zero); // b2, 0, b3, 0, b6, 0, b7, 0
+-
+- tmp1 = _mm256_permute2f128_ps(cpx_b1, cpx_b2, 0x0+(0x2<<4));
+- tmp2 = _mm256_permute2f128_ps(cpx_b1, cpx_b2, 0x1+(0x3<<4));
+-
+- cVal1 = _mm256_add_ps(aVal1, tmp1);
+- cVal2 = _mm256_add_ps(aVal2, tmp2);
+-
+- _mm256_store_ps((float *) cPtr, cVal1); // Store the results back into the C container
+- _mm256_store_ps((float *) (cPtr+4), cVal2); // Store the results back into the C container
+-
+- aPtr += 8;
+- bPtr += 8;
+- cPtr += 8;
+- }
+-
+- number = eighthPoints * 8;
+- for(;number < num_points; number++){
+- *cPtr++ = (*aPtr++) + (*bPtr++);
+- }
++ unsigned int number = 0;
++ const unsigned int eighthPoints = num_points / 8;
++
++ lv_32fc_t* cPtr = cVector;
++ const lv_32fc_t* aPtr = aVector;
++ const float* bPtr = bVector;
++
++ __m256 aVal1, aVal2, bVal, cVal1, cVal2;
++ __m256 cpx_b1, cpx_b2;
++ __m256 zero;
++ zero = _mm256_setzero_ps();
++ __m256 tmp1, tmp2;
++ for (; number < eighthPoints; number++) {
++
++ aVal1 = _mm256_load_ps((float*)aPtr);
++ aVal2 = _mm256_load_ps((float*)(aPtr + 4));
++ bVal = _mm256_load_ps(bPtr);
++ cpx_b1 = _mm256_unpacklo_ps(bVal, zero); // b0, 0, b1, 0, b4, 0, b5, 0
++ cpx_b2 = _mm256_unpackhi_ps(bVal, zero); // b2, 0, b3, 0, b6, 0, b7, 0
++
++ tmp1 = _mm256_permute2f128_ps(cpx_b1, cpx_b2, 0x0 + (0x2 << 4));
++ tmp2 = _mm256_permute2f128_ps(cpx_b1, cpx_b2, 0x1 + (0x3 << 4));
++
++ cVal1 = _mm256_add_ps(aVal1, tmp1);
++ cVal2 = _mm256_add_ps(aVal2, tmp2);
++
++ _mm256_store_ps((float*)cPtr,
++ cVal1); // Store the results back into the C container
++ _mm256_store_ps((float*)(cPtr + 4),
++ cVal2); // Store the results back into the C container
++
++ aPtr += 8;
++ bPtr += 8;
++ cPtr += 8;
++ }
++
++ number = eighthPoints * 8;
++ for (; number < num_points; number++) {
++ *cPtr++ = (*aPtr++) + (*bPtr++);
++ }
+ }
+ #endif /* LV_HAVE_AVX */
+
+ #ifdef LV_HAVE_NEON
+ #include <arm_neon.h>
+
+-static inline void
+-volk_32fc_32f_add_32fc_neon(lv_32fc_t* cVector, const lv_32fc_t* aVector,
+- const float* bVector, unsigned int num_points)
++static inline void volk_32fc_32f_add_32fc_neon(lv_32fc_t* cVector,
++ const lv_32fc_t* aVector,
++ const float* bVector,
++ unsigned int num_points)
+ {
+- lv_32fc_t* cPtr = cVector;
+- const lv_32fc_t* aPtr = aVector;
+- const float* bPtr = bVector;
+-
+- float32x4x4_t aVal0, aVal1;
+- float32x4x2_t bVal0, bVal1;
+-
+- const unsigned int sixteenthPoints = num_points / 16;
+- unsigned int number = 0;
+- for(; number < sixteenthPoints; number++){
+- aVal0 = vld4q_f32((const float*)aPtr);
+- aPtr += 8;
+- aVal1 = vld4q_f32((const float*)aPtr);
+- aPtr += 8;
+- __VOLK_PREFETCH(aPtr+16);
+-
+- bVal0 = vld2q_f32((const float*)bPtr);
+- bPtr += 8;
+- bVal1 = vld2q_f32((const float*)bPtr);
+- bPtr += 8;
+- __VOLK_PREFETCH(bPtr+16);
+-
+- aVal0.val[0] = vaddq_f32(aVal0.val[0], bVal0.val[0]);
+- aVal0.val[2] = vaddq_f32(aVal0.val[2], bVal0.val[1]);
+-
+- aVal1.val[2] = vaddq_f32(aVal1.val[2], bVal1.val[1]);
+- aVal1.val[0] = vaddq_f32(aVal1.val[0], bVal1.val[0]);
+-
+- vst4q_f32((float*)(cPtr), aVal0);
+- cPtr += 8;
+- vst4q_f32((float*)(cPtr), aVal1);
+- cPtr += 8;
+- }
+-
+- for(number = sixteenthPoints * 16; number < num_points; number++){
+- *cPtr++ = (*aPtr++) + (*bPtr++);
+- }
++ lv_32fc_t* cPtr = cVector;
++ const lv_32fc_t* aPtr = aVector;
++ const float* bPtr = bVector;
++
++ float32x4x4_t aVal0, aVal1;
++ float32x4x2_t bVal0, bVal1;
++
++ const unsigned int sixteenthPoints = num_points / 16;
++ unsigned int number = 0;
++ for (; number < sixteenthPoints; number++) {
++ aVal0 = vld4q_f32((const float*)aPtr);
++ aPtr += 8;
++ aVal1 = vld4q_f32((const float*)aPtr);
++ aPtr += 8;
++ __VOLK_PREFETCH(aPtr + 16);
++
++ bVal0 = vld2q_f32((const float*)bPtr);
++ bPtr += 8;
++ bVal1 = vld2q_f32((const float*)bPtr);
++ bPtr += 8;
++ __VOLK_PREFETCH(bPtr + 16);
++
++ aVal0.val[0] = vaddq_f32(aVal0.val[0], bVal0.val[0]);
++ aVal0.val[2] = vaddq_f32(aVal0.val[2], bVal0.val[1]);
++
++ aVal1.val[2] = vaddq_f32(aVal1.val[2], bVal1.val[1]);
++ aVal1.val[0] = vaddq_f32(aVal1.val[0], bVal1.val[0]);
++
++ vst4q_f32((float*)(cPtr), aVal0);
++ cPtr += 8;
++ vst4q_f32((float*)(cPtr), aVal1);
++ cPtr += 8;
++ }
++
++ for (number = sixteenthPoints * 16; number < num_points; number++) {
++ *cPtr++ = (*aPtr++) + (*bPtr++);
++ }
+ }
+ #endif /* LV_HAVE_NEON */
+
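The kernels in this file add a real vector to a complex vector element by element: each real sample is added to the real part of the corresponding complex sample, and the imaginary part passes through unchanged, which is why the AVX paths first widen the packed reals into (b, 0) pairs with the unpack and permute steps before a single packed add. A short sketch of the per-element semantics, using C99 complex in place of lv_32fc_t (names are illustrative):

    #include <complex.h>

    static inline void add_real_to_complex_sketch(
        float complex* c, const float complex* a, const float* b, unsigned int num_points)
    {
        for (unsigned int n = 0; n < num_points; n++) {
            c[n] = a[n] + b[n]; /* adds b[n] to the real part; imaginary part unchanged */
        }
    }
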
+diff --git a/kernels/volk/volk_32fc_32f_dot_prod_32fc.h b/kernels/volk/volk_32fc_32f_dot_prod_32fc.h
+index 35f7077..d905870 100644
+--- a/kernels/volk/volk_32fc_32f_dot_prod_32fc.h
++++ b/kernels/volk/volk_32fc_32f_dot_prod_32fc.h
+@@ -33,8 +33,8 @@
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+- * void volk_32fc_32f_dot_prod_32fc(lv_32fc_t* result, const lv_32fc_t* input, const float * taps, unsigned int num_points)
+- * \endcode
++ * void volk_32fc_32f_dot_prod_32fc(lv_32fc_t* result, const lv_32fc_t* input, const float
++ * * taps, unsigned int num_points) \endcode
+ *
+ * \b Inputs
+ * \li input: vector of complex samples
+@@ -63,28 +63,32 @@
+ #ifndef INCLUDED_volk_32fc_32f_dot_prod_32fc_a_H
+ #define INCLUDED_volk_32fc_32f_dot_prod_32fc_a_H
+
+-#include <volk/volk_common.h>
+ #include <stdio.h>
++#include <volk/volk_common.h>
+
+ #ifdef LV_HAVE_GENERIC
+
+-static inline void volk_32fc_32f_dot_prod_32fc_generic(lv_32fc_t* result, const lv_32fc_t* input, const float * taps, unsigned int num_points) {
++static inline void volk_32fc_32f_dot_prod_32fc_generic(lv_32fc_t* result,
++ const lv_32fc_t* input,
++ const float* taps,
++ unsigned int num_points)
++{
+
+- float res[2];
+- float *realpt = &res[0], *imagpt = &res[1];
+- const float* aPtr = (float*)input;
+- const float* bPtr= taps;
+- unsigned int number = 0;
++ float res[2];
++ float *realpt = &res[0], *imagpt = &res[1];
++ const float* aPtr = (float*)input;
++ const float* bPtr = taps;
++ unsigned int number = 0;
+
+- *realpt = 0;
+- *imagpt = 0;
++ *realpt = 0;
++ *imagpt = 0;
+
+- for(number = 0; number < num_points; number++){
+- *realpt += ((*aPtr++) * (*bPtr));
+- *imagpt += ((*aPtr++) * (*bPtr++));
+- }
++ for (number = 0; number < num_points; number++) {
++ *realpt += ((*aPtr++) * (*bPtr));
++ *imagpt += ((*aPtr++) * (*bPtr++));
++ }
+
+- *result = *(lv_32fc_t*)(&res[0]);
++ *result = *(lv_32fc_t*)(&res[0]);
+ }
+
+ #endif /*LV_HAVE_GENERIC*/
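The generic kernel above walks the complex input as interleaved floats: the input pointer advances twice per sample (real, then imaginary) while the tap pointer advances once, so each real-valued tap scales both components of one complex sample. Written without the pointer increments, the same reduction looks like this (function name is illustrative):

    static inline void dot_prod_32fc_32f_reference(
        float* re, float* im, const float* input_interleaved, const float* taps,
        unsigned int num_points)
    {
        float r = 0.0f, i = 0.0f;
        for (unsigned int n = 0; n < num_points; n++) {
            r += input_interleaved[2 * n] * taps[n];     /* real part times tap */
            i += input_interleaved[2 * n + 1] * taps[n]; /* imaginary part times tap */
        }
        *re = r;
        *im = i;
    }
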
+@@ -93,78 +97,83 @@ static inline void volk_32fc_32f_dot_prod_32fc_generic(lv_32fc_t* result, const
+
+ #include <immintrin.h>
+
+-static inline void volk_32fc_32f_dot_prod_32fc_a_avx2_fma( lv_32fc_t* result, const lv_32fc_t* input, const float* taps, unsigned int num_points) {
+-
+- unsigned int number = 0;
+- const unsigned int sixteenthPoints = num_points / 16;
+-
+- float res[2];
+- float *realpt = &res[0], *imagpt = &res[1];
+- const float* aPtr = (float*)input;
+- const float* bPtr = taps;
+-
+- __m256 a0Val, a1Val, a2Val, a3Val;
+- __m256 b0Val, b1Val, b2Val, b3Val;
+- __m256 x0Val, x1Val, x0loVal, x0hiVal, x1loVal, x1hiVal;
+-
+- __m256 dotProdVal0 = _mm256_setzero_ps();
+- __m256 dotProdVal1 = _mm256_setzero_ps();
+- __m256 dotProdVal2 = _mm256_setzero_ps();
+- __m256 dotProdVal3 = _mm256_setzero_ps();
+-
+- for(;number < sixteenthPoints; number++){
+-
+- a0Val = _mm256_load_ps(aPtr);
+- a1Val = _mm256_load_ps(aPtr+8);
+- a2Val = _mm256_load_ps(aPtr+16);
+- a3Val = _mm256_load_ps(aPtr+24);
+-
+- x0Val = _mm256_load_ps(bPtr); // t0|t1|t2|t3|t4|t5|t6|t7
+- x1Val = _mm256_load_ps(bPtr+8);
+- x0loVal = _mm256_unpacklo_ps(x0Val, x0Val); // t0|t0|t1|t1|t4|t4|t5|t5
+- x0hiVal = _mm256_unpackhi_ps(x0Val, x0Val); // t2|t2|t3|t3|t6|t6|t7|t7
+- x1loVal = _mm256_unpacklo_ps(x1Val, x1Val);
+- x1hiVal = _mm256_unpackhi_ps(x1Val, x1Val);
+-
+- // TODO: it may be possible to rearrange swizzling to better pipeline data
+- b0Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x20); // t0|t0|t1|t1|t2|t2|t3|t3
+- b1Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x31); // t4|t4|t5|t5|t6|t6|t7|t7
+- b2Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x20);
+- b3Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x31);
+-
+- dotProdVal0 = _mm256_fmadd_ps(a0Val, b0Val, dotProdVal0);
+- dotProdVal1 = _mm256_fmadd_ps(a1Val, b1Val, dotProdVal1);
+- dotProdVal2 = _mm256_fmadd_ps(a2Val, b2Val, dotProdVal2);
+- dotProdVal3 = _mm256_fmadd_ps(a3Val, b3Val, dotProdVal3);
+-
+- aPtr += 32;
+- bPtr += 16;
+- }
+-
+- dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
+- dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
+- dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
+-
+- __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
+-
+- _mm256_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
+-
+- *realpt = dotProductVector[0];
+- *imagpt = dotProductVector[1];
+- *realpt += dotProductVector[2];
+- *imagpt += dotProductVector[3];
+- *realpt += dotProductVector[4];
+- *imagpt += dotProductVector[5];
+- *realpt += dotProductVector[6];
+- *imagpt += dotProductVector[7];
+-
+- number = sixteenthPoints*16;
+- for(;number < num_points; number++){
+- *realpt += ((*aPtr++) * (*bPtr));
+- *imagpt += ((*aPtr++) * (*bPtr++));
+- }
+-
+- *result = *(lv_32fc_t*)(&res[0]);
++static inline void volk_32fc_32f_dot_prod_32fc_a_avx2_fma(lv_32fc_t* result,
++ const lv_32fc_t* input,
++ const float* taps,
++ unsigned int num_points)
++{
++
++ unsigned int number = 0;
++ const unsigned int sixteenthPoints = num_points / 16;
++
++ float res[2];
++ float *realpt = &res[0], *imagpt = &res[1];
++ const float* aPtr = (float*)input;
++ const float* bPtr = taps;
++
++ __m256 a0Val, a1Val, a2Val, a3Val;
++ __m256 b0Val, b1Val, b2Val, b3Val;
++ __m256 x0Val, x1Val, x0loVal, x0hiVal, x1loVal, x1hiVal;
++
++ __m256 dotProdVal0 = _mm256_setzero_ps();
++ __m256 dotProdVal1 = _mm256_setzero_ps();
++ __m256 dotProdVal2 = _mm256_setzero_ps();
++ __m256 dotProdVal3 = _mm256_setzero_ps();
++
++ for (; number < sixteenthPoints; number++) {
++
++ a0Val = _mm256_load_ps(aPtr);
++ a1Val = _mm256_load_ps(aPtr + 8);
++ a2Val = _mm256_load_ps(aPtr + 16);
++ a3Val = _mm256_load_ps(aPtr + 24);
++
++ x0Val = _mm256_load_ps(bPtr); // t0|t1|t2|t3|t4|t5|t6|t7
++ x1Val = _mm256_load_ps(bPtr + 8);
++ x0loVal = _mm256_unpacklo_ps(x0Val, x0Val); // t0|t0|t1|t1|t4|t4|t5|t5
++ x0hiVal = _mm256_unpackhi_ps(x0Val, x0Val); // t2|t2|t3|t3|t6|t6|t7|t7
++ x1loVal = _mm256_unpacklo_ps(x1Val, x1Val);
++ x1hiVal = _mm256_unpackhi_ps(x1Val, x1Val);
++
++ // TODO: it may be possible to rearrange swizzling to better pipeline data
++ b0Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x20); // t0|t0|t1|t1|t2|t2|t3|t3
++ b1Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x31); // t4|t4|t5|t5|t6|t6|t7|t7
++ b2Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x20);
++ b3Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x31);
++
++ dotProdVal0 = _mm256_fmadd_ps(a0Val, b0Val, dotProdVal0);
++ dotProdVal1 = _mm256_fmadd_ps(a1Val, b1Val, dotProdVal1);
++ dotProdVal2 = _mm256_fmadd_ps(a2Val, b2Val, dotProdVal2);
++ dotProdVal3 = _mm256_fmadd_ps(a3Val, b3Val, dotProdVal3);
++
++ aPtr += 32;
++ bPtr += 16;
++ }
++
++ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
++ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
++ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
++
++ __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
++
++ _mm256_store_ps(dotProductVector,
++ dotProdVal0); // Store the results back into the dot product vector
++
++ *realpt = dotProductVector[0];
++ *imagpt = dotProductVector[1];
++ *realpt += dotProductVector[2];
++ *imagpt += dotProductVector[3];
++ *realpt += dotProductVector[4];
++ *imagpt += dotProductVector[5];
++ *realpt += dotProductVector[6];
++ *imagpt += dotProductVector[7];
++
++ number = sixteenthPoints * 16;
++ for (; number < num_points; number++) {
++ *realpt += ((*aPtr++) * (*bPtr));
++ *imagpt += ((*aPtr++) * (*bPtr++));
++ }
++
++ *result = *(lv_32fc_t*)(&res[0]);
+ }
+
+ #endif /*LV_HAVE_AVX2 && LV_HAVE_FMA*/
+@@ -173,164 +182,172 @@ static inline void volk_32fc_32f_dot_prod_32fc_a_avx2_fma( lv_32fc_t* result, co
+
+ #include <immintrin.h>
+
+-static inline void volk_32fc_32f_dot_prod_32fc_a_avx( lv_32fc_t* result, const lv_32fc_t* input, const float* taps, unsigned int num_points) {
+-
+- unsigned int number = 0;
+- const unsigned int sixteenthPoints = num_points / 16;
+-
+- float res[2];
+- float *realpt = &res[0], *imagpt = &res[1];
+- const float* aPtr = (float*)input;
+- const float* bPtr = taps;
+-
+- __m256 a0Val, a1Val, a2Val, a3Val;
+- __m256 b0Val, b1Val, b2Val, b3Val;
+- __m256 x0Val, x1Val, x0loVal, x0hiVal, x1loVal, x1hiVal;
+- __m256 c0Val, c1Val, c2Val, c3Val;
+-
+- __m256 dotProdVal0 = _mm256_setzero_ps();
+- __m256 dotProdVal1 = _mm256_setzero_ps();
+- __m256 dotProdVal2 = _mm256_setzero_ps();
+- __m256 dotProdVal3 = _mm256_setzero_ps();
+-
+- for(;number < sixteenthPoints; number++){
+-
+- a0Val = _mm256_load_ps(aPtr);
+- a1Val = _mm256_load_ps(aPtr+8);
+- a2Val = _mm256_load_ps(aPtr+16);
+- a3Val = _mm256_load_ps(aPtr+24);
+-
+- x0Val = _mm256_load_ps(bPtr); // t0|t1|t2|t3|t4|t5|t6|t7
+- x1Val = _mm256_load_ps(bPtr+8);
+- x0loVal = _mm256_unpacklo_ps(x0Val, x0Val); // t0|t0|t1|t1|t4|t4|t5|t5
+- x0hiVal = _mm256_unpackhi_ps(x0Val, x0Val); // t2|t2|t3|t3|t6|t6|t7|t7
+- x1loVal = _mm256_unpacklo_ps(x1Val, x1Val);
+- x1hiVal = _mm256_unpackhi_ps(x1Val, x1Val);
+-
+- // TODO: it may be possible to rearrange swizzling to better pipeline data
+- b0Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x20); // t0|t0|t1|t1|t2|t2|t3|t3
+- b1Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x31); // t4|t4|t5|t5|t6|t6|t7|t7
+- b2Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x20);
+- b3Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x31);
+-
+- c0Val = _mm256_mul_ps(a0Val, b0Val);
+- c1Val = _mm256_mul_ps(a1Val, b1Val);
+- c2Val = _mm256_mul_ps(a2Val, b2Val);
+- c3Val = _mm256_mul_ps(a3Val, b3Val);
+-
+- dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
+- dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);
+- dotProdVal2 = _mm256_add_ps(c2Val, dotProdVal2);
+- dotProdVal3 = _mm256_add_ps(c3Val, dotProdVal3);
+-
+- aPtr += 32;
+- bPtr += 16;
+- }
+-
+- dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
+- dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
+- dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
+-
+- __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
+-
+- _mm256_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
+-
+- *realpt = dotProductVector[0];
+- *imagpt = dotProductVector[1];
+- *realpt += dotProductVector[2];
+- *imagpt += dotProductVector[3];
+- *realpt += dotProductVector[4];
+- *imagpt += dotProductVector[5];
+- *realpt += dotProductVector[6];
+- *imagpt += dotProductVector[7];
+-
+- number = sixteenthPoints*16;
+- for(;number < num_points; number++){
+- *realpt += ((*aPtr++) * (*bPtr));
+- *imagpt += ((*aPtr++) * (*bPtr++));
+- }
+-
+- *result = *(lv_32fc_t*)(&res[0]);
++static inline void volk_32fc_32f_dot_prod_32fc_a_avx(lv_32fc_t* result,
++ const lv_32fc_t* input,
++ const float* taps,
++ unsigned int num_points)
++{
++
++ unsigned int number = 0;
++ const unsigned int sixteenthPoints = num_points / 16;
++
++ float res[2];
++ float *realpt = &res[0], *imagpt = &res[1];
++ const float* aPtr = (float*)input;
++ const float* bPtr = taps;
++
++ __m256 a0Val, a1Val, a2Val, a3Val;
++ __m256 b0Val, b1Val, b2Val, b3Val;
++ __m256 x0Val, x1Val, x0loVal, x0hiVal, x1loVal, x1hiVal;
++ __m256 c0Val, c1Val, c2Val, c3Val;
++
++ __m256 dotProdVal0 = _mm256_setzero_ps();
++ __m256 dotProdVal1 = _mm256_setzero_ps();
++ __m256 dotProdVal2 = _mm256_setzero_ps();
++ __m256 dotProdVal3 = _mm256_setzero_ps();
++
++ for (; number < sixteenthPoints; number++) {
++
++ a0Val = _mm256_load_ps(aPtr);
++ a1Val = _mm256_load_ps(aPtr + 8);
++ a2Val = _mm256_load_ps(aPtr + 16);
++ a3Val = _mm256_load_ps(aPtr + 24);
++
++ x0Val = _mm256_load_ps(bPtr); // t0|t1|t2|t3|t4|t5|t6|t7
++ x1Val = _mm256_load_ps(bPtr + 8);
++ x0loVal = _mm256_unpacklo_ps(x0Val, x0Val); // t0|t0|t1|t1|t4|t4|t5|t5
++ x0hiVal = _mm256_unpackhi_ps(x0Val, x0Val); // t2|t2|t3|t3|t6|t6|t7|t7
++ x1loVal = _mm256_unpacklo_ps(x1Val, x1Val);
++ x1hiVal = _mm256_unpackhi_ps(x1Val, x1Val);
++
++ // TODO: it may be possible to rearrange swizzling to better pipeline data
++ b0Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x20); // t0|t0|t1|t1|t2|t2|t3|t3
++ b1Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x31); // t4|t4|t5|t5|t6|t6|t7|t7
++ b2Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x20);
++ b3Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x31);
++
++ c0Val = _mm256_mul_ps(a0Val, b0Val);
++ c1Val = _mm256_mul_ps(a1Val, b1Val);
++ c2Val = _mm256_mul_ps(a2Val, b2Val);
++ c3Val = _mm256_mul_ps(a3Val, b3Val);
++
++ dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
++ dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);
++ dotProdVal2 = _mm256_add_ps(c2Val, dotProdVal2);
++ dotProdVal3 = _mm256_add_ps(c3Val, dotProdVal3);
++
++ aPtr += 32;
++ bPtr += 16;
++ }
++
++ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
++ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
++ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
++
++ __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
++
++ _mm256_store_ps(dotProductVector,
++ dotProdVal0); // Store the results back into the dot product vector
++
++ *realpt = dotProductVector[0];
++ *imagpt = dotProductVector[1];
++ *realpt += dotProductVector[2];
++ *imagpt += dotProductVector[3];
++ *realpt += dotProductVector[4];
++ *imagpt += dotProductVector[5];
++ *realpt += dotProductVector[6];
++ *imagpt += dotProductVector[7];
++
++ number = sixteenthPoints * 16;
++ for (; number < num_points; number++) {
++ *realpt += ((*aPtr++) * (*bPtr));
++ *imagpt += ((*aPtr++) * (*bPtr++));
++ }
++
++ *result = *(lv_32fc_t*)(&res[0]);
+ }
+
+ #endif /*LV_HAVE_AVX*/
+
+
+-
+-
+ #ifdef LV_HAVE_SSE
+
+
+-static inline void volk_32fc_32f_dot_prod_32fc_a_sse( lv_32fc_t* result, const lv_32fc_t* input, const float* taps, unsigned int num_points) {
+-
+- unsigned int number = 0;
+- const unsigned int sixteenthPoints = num_points / 8;
+-
+- float res[2];
+- float *realpt = &res[0], *imagpt = &res[1];
+- const float* aPtr = (float*)input;
+- const float* bPtr = taps;
+-
+- __m128 a0Val, a1Val, a2Val, a3Val;
+- __m128 b0Val, b1Val, b2Val, b3Val;
+- __m128 x0Val, x1Val, x2Val, x3Val;
+- __m128 c0Val, c1Val, c2Val, c3Val;
+-
+- __m128 dotProdVal0 = _mm_setzero_ps();
+- __m128 dotProdVal1 = _mm_setzero_ps();
+- __m128 dotProdVal2 = _mm_setzero_ps();
+- __m128 dotProdVal3 = _mm_setzero_ps();
+-
+- for(;number < sixteenthPoints; number++){
+-
+- a0Val = _mm_load_ps(aPtr);
+- a1Val = _mm_load_ps(aPtr+4);
+- a2Val = _mm_load_ps(aPtr+8);
+- a3Val = _mm_load_ps(aPtr+12);
+-
+- x0Val = _mm_load_ps(bPtr);
+- x1Val = _mm_load_ps(bPtr);
+- x2Val = _mm_load_ps(bPtr+4);
+- x3Val = _mm_load_ps(bPtr+4);
+- b0Val = _mm_unpacklo_ps(x0Val, x1Val);
+- b1Val = _mm_unpackhi_ps(x0Val, x1Val);
+- b2Val = _mm_unpacklo_ps(x2Val, x3Val);
+- b3Val = _mm_unpackhi_ps(x2Val, x3Val);
+-
+- c0Val = _mm_mul_ps(a0Val, b0Val);
+- c1Val = _mm_mul_ps(a1Val, b1Val);
+- c2Val = _mm_mul_ps(a2Val, b2Val);
+- c3Val = _mm_mul_ps(a3Val, b3Val);
+-
+- dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
+- dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
+- dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
+- dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
+-
+- aPtr += 16;
+- bPtr += 8;
+- }
+-
+- dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
+- dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
+- dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
+-
+- __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
+-
+- _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
+-
+- *realpt = dotProductVector[0];
+- *imagpt = dotProductVector[1];
+- *realpt += dotProductVector[2];
+- *imagpt += dotProductVector[3];
+-
+- number = sixteenthPoints*8;
+- for(;number < num_points; number++){
+- *realpt += ((*aPtr++) * (*bPtr));
+- *imagpt += ((*aPtr++) * (*bPtr++));
+- }
+-
+- *result = *(lv_32fc_t*)(&res[0]);
++static inline void volk_32fc_32f_dot_prod_32fc_a_sse(lv_32fc_t* result,
++ const lv_32fc_t* input,
++ const float* taps,
++ unsigned int num_points)
++{
++
++ unsigned int number = 0;
++ const unsigned int sixteenthPoints = num_points / 8;
++
++ float res[2];
++ float *realpt = &res[0], *imagpt = &res[1];
++ const float* aPtr = (float*)input;
++ const float* bPtr = taps;
++
++ __m128 a0Val, a1Val, a2Val, a3Val;
++ __m128 b0Val, b1Val, b2Val, b3Val;
++ __m128 x0Val, x1Val, x2Val, x3Val;
++ __m128 c0Val, c1Val, c2Val, c3Val;
++
++ __m128 dotProdVal0 = _mm_setzero_ps();
++ __m128 dotProdVal1 = _mm_setzero_ps();
++ __m128 dotProdVal2 = _mm_setzero_ps();
++ __m128 dotProdVal3 = _mm_setzero_ps();
++
++ for (; number < sixteenthPoints; number++) {
++
++ a0Val = _mm_load_ps(aPtr);
++ a1Val = _mm_load_ps(aPtr + 4);
++ a2Val = _mm_load_ps(aPtr + 8);
++ a3Val = _mm_load_ps(aPtr + 12);
++
++ x0Val = _mm_load_ps(bPtr);
++ x1Val = _mm_load_ps(bPtr);
++ x2Val = _mm_load_ps(bPtr + 4);
++ x3Val = _mm_load_ps(bPtr + 4);
++ b0Val = _mm_unpacklo_ps(x0Val, x1Val);
++ b1Val = _mm_unpackhi_ps(x0Val, x1Val);
++ b2Val = _mm_unpacklo_ps(x2Val, x3Val);
++ b3Val = _mm_unpackhi_ps(x2Val, x3Val);
++
++ c0Val = _mm_mul_ps(a0Val, b0Val);
++ c1Val = _mm_mul_ps(a1Val, b1Val);
++ c2Val = _mm_mul_ps(a2Val, b2Val);
++ c3Val = _mm_mul_ps(a3Val, b3Val);
++
++ dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
++ dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
++ dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
++ dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
++
++ aPtr += 16;
++ bPtr += 8;
++ }
++
++ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
++ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
++ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
++
++ __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
++
++ _mm_store_ps(dotProductVector,
++ dotProdVal0); // Store the results back into the dot product vector
++
++ *realpt = dotProductVector[0];
++ *imagpt = dotProductVector[1];
++ *realpt += dotProductVector[2];
++ *imagpt += dotProductVector[3];
++
++ number = sixteenthPoints * 8;
++ for (; number < num_points; number++) {
++ *realpt += ((*aPtr++) * (*bPtr));
++ *imagpt += ((*aPtr++) * (*bPtr++));
++ }
++
++ *result = *(lv_32fc_t*)(&res[0]);
+ }
+
+ #endif /*LV_HAVE_SSE*/
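
(For reference, a minimal scalar sketch of what all of the dot-product kernels in this file compute: the dot product of an interleaved complex vector with a vector of real taps, accumulating the real and imaginary parts separately. It assumes the lv_creal()/lv_cimag()/lv_cmake() helpers from volk/volk_complex.h; the helper name is illustrative only.)

    #include <volk/volk_complex.h>

    static inline void dot_prod_32fc_32f_scalar(lv_32fc_t* result,
                                                const lv_32fc_t* input,
                                                const float* taps,
                                                unsigned int num_points)
    {
        float real = 0.0f;
        float imag = 0.0f;
        unsigned int n;
        for (n = 0; n < num_points; n++) {
            /* each real tap scales both components of the complex sample */
            real += lv_creal(input[n]) * taps[n];
            imag += lv_cimag(input[n]) * taps[n];
        }
        *result = lv_cmake(real, imag);
    }
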
+@@ -339,78 +356,83 @@ static inline void volk_32fc_32f_dot_prod_32fc_a_sse( lv_32fc_t* result, const
+
+ #include <immintrin.h>
+
+-static inline void volk_32fc_32f_dot_prod_32fc_u_avx2_fma( lv_32fc_t* result, const lv_32fc_t* input, const float* taps, unsigned int num_points) {
+-
+- unsigned int number = 0;
+- const unsigned int sixteenthPoints = num_points / 16;
+-
+- float res[2];
+- float *realpt = &res[0], *imagpt = &res[1];
+- const float* aPtr = (float*)input;
+- const float* bPtr = taps;
+-
+- __m256 a0Val, a1Val, a2Val, a3Val;
+- __m256 b0Val, b1Val, b2Val, b3Val;
+- __m256 x0Val, x1Val, x0loVal, x0hiVal, x1loVal, x1hiVal;
+-
+- __m256 dotProdVal0 = _mm256_setzero_ps();
+- __m256 dotProdVal1 = _mm256_setzero_ps();
+- __m256 dotProdVal2 = _mm256_setzero_ps();
+- __m256 dotProdVal3 = _mm256_setzero_ps();
+-
+- for(;number < sixteenthPoints; number++){
+-
+- a0Val = _mm256_loadu_ps(aPtr);
+- a1Val = _mm256_loadu_ps(aPtr+8);
+- a2Val = _mm256_loadu_ps(aPtr+16);
+- a3Val = _mm256_loadu_ps(aPtr+24);
+-
+- x0Val = _mm256_load_ps(bPtr); // t0|t1|t2|t3|t4|t5|t6|t7
+- x1Val = _mm256_load_ps(bPtr+8);
+- x0loVal = _mm256_unpacklo_ps(x0Val, x0Val); // t0|t0|t1|t1|t4|t4|t5|t5
+- x0hiVal = _mm256_unpackhi_ps(x0Val, x0Val); // t2|t2|t3|t3|t6|t6|t7|t7
+- x1loVal = _mm256_unpacklo_ps(x1Val, x1Val);
+- x1hiVal = _mm256_unpackhi_ps(x1Val, x1Val);
+-
+- // TODO: it may be possible to rearrange swizzling to better pipeline data
+- b0Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x20); // t0|t0|t1|t1|t2|t2|t3|t3
+- b1Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x31); // t4|t4|t5|t5|t6|t6|t7|t7
+- b2Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x20);
+- b3Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x31);
+-
+- dotProdVal0 = _mm256_fmadd_ps(a0Val, b0Val, dotProdVal0);
+- dotProdVal1 = _mm256_fmadd_ps(a1Val, b1Val, dotProdVal1);
+- dotProdVal2 = _mm256_fmadd_ps(a2Val, b2Val, dotProdVal2);
+- dotProdVal3 = _mm256_fmadd_ps(a3Val, b3Val, dotProdVal3);
+-
+- aPtr += 32;
+- bPtr += 16;
+- }
+-
+- dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
+- dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
+- dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
+-
+- __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
+-
+- _mm256_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
+-
+- *realpt = dotProductVector[0];
+- *imagpt = dotProductVector[1];
+- *realpt += dotProductVector[2];
+- *imagpt += dotProductVector[3];
+- *realpt += dotProductVector[4];
+- *imagpt += dotProductVector[5];
+- *realpt += dotProductVector[6];
+- *imagpt += dotProductVector[7];
+-
+- number = sixteenthPoints*16;
+- for(;number < num_points; number++){
+- *realpt += ((*aPtr++) * (*bPtr));
+- *imagpt += ((*aPtr++) * (*bPtr++));
+- }
+-
+- *result = *(lv_32fc_t*)(&res[0]);
++static inline void volk_32fc_32f_dot_prod_32fc_u_avx2_fma(lv_32fc_t* result,
++ const lv_32fc_t* input,
++ const float* taps,
++ unsigned int num_points)
++{
++
++ unsigned int number = 0;
++ const unsigned int sixteenthPoints = num_points / 16;
++
++ float res[2];
++ float *realpt = &res[0], *imagpt = &res[1];
++ const float* aPtr = (float*)input;
++ const float* bPtr = taps;
++
++ __m256 a0Val, a1Val, a2Val, a3Val;
++ __m256 b0Val, b1Val, b2Val, b3Val;
++ __m256 x0Val, x1Val, x0loVal, x0hiVal, x1loVal, x1hiVal;
++
++ __m256 dotProdVal0 = _mm256_setzero_ps();
++ __m256 dotProdVal1 = _mm256_setzero_ps();
++ __m256 dotProdVal2 = _mm256_setzero_ps();
++ __m256 dotProdVal3 = _mm256_setzero_ps();
++
++ for (; number < sixteenthPoints; number++) {
++
++ a0Val = _mm256_loadu_ps(aPtr);
++ a1Val = _mm256_loadu_ps(aPtr + 8);
++ a2Val = _mm256_loadu_ps(aPtr + 16);
++ a3Val = _mm256_loadu_ps(aPtr + 24);
++
++ x0Val = _mm256_load_ps(bPtr); // t0|t1|t2|t3|t4|t5|t6|t7
++ x1Val = _mm256_load_ps(bPtr + 8);
++ x0loVal = _mm256_unpacklo_ps(x0Val, x0Val); // t0|t0|t1|t1|t4|t4|t5|t5
++ x0hiVal = _mm256_unpackhi_ps(x0Val, x0Val); // t2|t2|t3|t3|t6|t6|t7|t7
++ x1loVal = _mm256_unpacklo_ps(x1Val, x1Val);
++ x1hiVal = _mm256_unpackhi_ps(x1Val, x1Val);
++
++ // TODO: it may be possible to rearrange swizzling to better pipeline data
++ b0Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x20); // t0|t0|t1|t1|t2|t2|t3|t3
++ b1Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x31); // t4|t4|t5|t5|t6|t6|t7|t7
++ b2Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x20);
++ b3Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x31);
++
++ dotProdVal0 = _mm256_fmadd_ps(a0Val, b0Val, dotProdVal0);
++ dotProdVal1 = _mm256_fmadd_ps(a1Val, b1Val, dotProdVal1);
++ dotProdVal2 = _mm256_fmadd_ps(a2Val, b2Val, dotProdVal2);
++ dotProdVal3 = _mm256_fmadd_ps(a3Val, b3Val, dotProdVal3);
++
++ aPtr += 32;
++ bPtr += 16;
++ }
++
++ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
++ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
++ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
++
++ __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
++
++ _mm256_store_ps(dotProductVector,
++ dotProdVal0); // Store the results back into the dot product vector
++
++ *realpt = dotProductVector[0];
++ *imagpt = dotProductVector[1];
++ *realpt += dotProductVector[2];
++ *imagpt += dotProductVector[3];
++ *realpt += dotProductVector[4];
++ *imagpt += dotProductVector[5];
++ *realpt += dotProductVector[6];
++ *imagpt += dotProductVector[7];
++
++ number = sixteenthPoints * 16;
++ for (; number < num_points; number++) {
++ *realpt += ((*aPtr++) * (*bPtr));
++ *imagpt += ((*aPtr++) * (*bPtr++));
++ }
++
++ *result = *(lv_32fc_t*)(&res[0]);
+ }
+
+ #endif /*LV_HAVE_AVX2 && LV_HAVE_FMA*/
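
(The AVX2/FMA variant above differs from the plain AVX kernel only in fusing the multiply and the accumulate. A minimal sketch of the two accumulation patterns, assuming AVX and FMA support; the helper names are illustrative only.)

    #include <immintrin.h>

    /* plain AVX: separate multiply and add, two rounding steps */
    static inline __m256 accumulate_avx(__m256 acc, __m256 a, __m256 b)
    {
        return _mm256_add_ps(_mm256_mul_ps(a, b), acc);
    }

    /* AVX2 + FMA: fused multiply-add, one instruction and a single rounding */
    static inline __m256 accumulate_fma(__m256 acc, __m256 a, __m256 b)
    {
        return _mm256_fmadd_ps(a, b, acc);
    }
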
+@@ -419,162 +441,172 @@ static inline void volk_32fc_32f_dot_prod_32fc_u_avx2_fma( lv_32fc_t* result, co
+
+ #include <immintrin.h>
+
+-static inline void volk_32fc_32f_dot_prod_32fc_u_avx( lv_32fc_t* result, const lv_32fc_t* input, const float* taps, unsigned int num_points) {
+-
+- unsigned int number = 0;
+- const unsigned int sixteenthPoints = num_points / 16;
+-
+- float res[2];
+- float *realpt = &res[0], *imagpt = &res[1];
+- const float* aPtr = (float*)input;
+- const float* bPtr = taps;
+-
+- __m256 a0Val, a1Val, a2Val, a3Val;
+- __m256 b0Val, b1Val, b2Val, b3Val;
+- __m256 x0Val, x1Val, x0loVal, x0hiVal, x1loVal, x1hiVal;
+- __m256 c0Val, c1Val, c2Val, c3Val;
+-
+- __m256 dotProdVal0 = _mm256_setzero_ps();
+- __m256 dotProdVal1 = _mm256_setzero_ps();
+- __m256 dotProdVal2 = _mm256_setzero_ps();
+- __m256 dotProdVal3 = _mm256_setzero_ps();
+-
+- for(;number < sixteenthPoints; number++){
+-
+- a0Val = _mm256_loadu_ps(aPtr);
+- a1Val = _mm256_loadu_ps(aPtr+8);
+- a2Val = _mm256_loadu_ps(aPtr+16);
+- a3Val = _mm256_loadu_ps(aPtr+24);
+-
+- x0Val = _mm256_loadu_ps(bPtr); // t0|t1|t2|t3|t4|t5|t6|t7
+- x1Val = _mm256_loadu_ps(bPtr+8);
+- x0loVal = _mm256_unpacklo_ps(x0Val, x0Val); // t0|t0|t1|t1|t4|t4|t5|t5
+- x0hiVal = _mm256_unpackhi_ps(x0Val, x0Val); // t2|t2|t3|t3|t6|t6|t7|t7
+- x1loVal = _mm256_unpacklo_ps(x1Val, x1Val);
+- x1hiVal = _mm256_unpackhi_ps(x1Val, x1Val);
+-
+- // TODO: it may be possible to rearrange swizzling to better pipeline data
+- b0Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x20); // t0|t0|t1|t1|t2|t2|t3|t3
+- b1Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x31); // t4|t4|t5|t5|t6|t6|t7|t7
+- b2Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x20);
+- b3Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x31);
+-
+- c0Val = _mm256_mul_ps(a0Val, b0Val);
+- c1Val = _mm256_mul_ps(a1Val, b1Val);
+- c2Val = _mm256_mul_ps(a2Val, b2Val);
+- c3Val = _mm256_mul_ps(a3Val, b3Val);
+-
+- dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
+- dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);
+- dotProdVal2 = _mm256_add_ps(c2Val, dotProdVal2);
+- dotProdVal3 = _mm256_add_ps(c3Val, dotProdVal3);
+-
+- aPtr += 32;
+- bPtr += 16;
+- }
+-
+- dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
+- dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
+- dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
+-
+- __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
+-
+- _mm256_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
+-
+- *realpt = dotProductVector[0];
+- *imagpt = dotProductVector[1];
+- *realpt += dotProductVector[2];
+- *imagpt += dotProductVector[3];
+- *realpt += dotProductVector[4];
+- *imagpt += dotProductVector[5];
+- *realpt += dotProductVector[6];
+- *imagpt += dotProductVector[7];
+-
+- number = sixteenthPoints*16;
+- for(;number < num_points; number++){
+- *realpt += ((*aPtr++) * (*bPtr));
+- *imagpt += ((*aPtr++) * (*bPtr++));
+- }
+-
+- *result = *(lv_32fc_t*)(&res[0]);
++static inline void volk_32fc_32f_dot_prod_32fc_u_avx(lv_32fc_t* result,
++ const lv_32fc_t* input,
++ const float* taps,
++ unsigned int num_points)
++{
++
++ unsigned int number = 0;
++ const unsigned int sixteenthPoints = num_points / 16;
++
++ float res[2];
++ float *realpt = &res[0], *imagpt = &res[1];
++ const float* aPtr = (float*)input;
++ const float* bPtr = taps;
++
++ __m256 a0Val, a1Val, a2Val, a3Val;
++ __m256 b0Val, b1Val, b2Val, b3Val;
++ __m256 x0Val, x1Val, x0loVal, x0hiVal, x1loVal, x1hiVal;
++ __m256 c0Val, c1Val, c2Val, c3Val;
++
++ __m256 dotProdVal0 = _mm256_setzero_ps();
++ __m256 dotProdVal1 = _mm256_setzero_ps();
++ __m256 dotProdVal2 = _mm256_setzero_ps();
++ __m256 dotProdVal3 = _mm256_setzero_ps();
++
++ for (; number < sixteenthPoints; number++) {
++
++ a0Val = _mm256_loadu_ps(aPtr);
++ a1Val = _mm256_loadu_ps(aPtr + 8);
++ a2Val = _mm256_loadu_ps(aPtr + 16);
++ a3Val = _mm256_loadu_ps(aPtr + 24);
++
++ x0Val = _mm256_loadu_ps(bPtr); // t0|t1|t2|t3|t4|t5|t6|t7
++ x1Val = _mm256_loadu_ps(bPtr + 8);
++ x0loVal = _mm256_unpacklo_ps(x0Val, x0Val); // t0|t0|t1|t1|t4|t4|t5|t5
++ x0hiVal = _mm256_unpackhi_ps(x0Val, x0Val); // t2|t2|t3|t3|t6|t6|t7|t7
++ x1loVal = _mm256_unpacklo_ps(x1Val, x1Val);
++ x1hiVal = _mm256_unpackhi_ps(x1Val, x1Val);
++
++ // TODO: it may be possible to rearrange swizzling to better pipeline data
++ b0Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x20); // t0|t0|t1|t1|t2|t2|t3|t3
++ b1Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x31); // t4|t4|t5|t5|t6|t6|t7|t7
++ b2Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x20);
++ b3Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x31);
++
++ c0Val = _mm256_mul_ps(a0Val, b0Val);
++ c1Val = _mm256_mul_ps(a1Val, b1Val);
++ c2Val = _mm256_mul_ps(a2Val, b2Val);
++ c3Val = _mm256_mul_ps(a3Val, b3Val);
++
++ dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
++ dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);
++ dotProdVal2 = _mm256_add_ps(c2Val, dotProdVal2);
++ dotProdVal3 = _mm256_add_ps(c3Val, dotProdVal3);
++
++ aPtr += 32;
++ bPtr += 16;
++ }
++
++ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
++ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
++ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
++
++ __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
++
++ _mm256_store_ps(dotProductVector,
++ dotProdVal0); // Store the results back into the dot product vector
++
++ *realpt = dotProductVector[0];
++ *imagpt = dotProductVector[1];
++ *realpt += dotProductVector[2];
++ *imagpt += dotProductVector[3];
++ *realpt += dotProductVector[4];
++ *imagpt += dotProductVector[5];
++ *realpt += dotProductVector[6];
++ *imagpt += dotProductVector[7];
++
++ number = sixteenthPoints * 16;
++ for (; number < num_points; number++) {
++ *realpt += ((*aPtr++) * (*bPtr));
++ *imagpt += ((*aPtr++) * (*bPtr++));
++ }
++
++ *result = *(lv_32fc_t*)(&res[0]);
+ }
+ #endif /*LV_HAVE_AVX*/
+
+ #ifdef LV_HAVE_NEON
+ #include <arm_neon.h>
+
+-static inline void volk_32fc_32f_dot_prod_32fc_neon_unroll ( lv_32fc_t* __restrict result, const lv_32fc_t* __restrict input, const float* __restrict taps, unsigned int num_points) {
+-
+- unsigned int number;
+- const unsigned int quarterPoints = num_points / 8;
+-
+- float res[2];
+- float *realpt = &res[0], *imagpt = &res[1];
+- const float* inputPtr = (float*)input;
+- const float* tapsPtr = taps;
+- float zero[4] = {0.0f, 0.0f, 0.0f, 0.0f };
+- float accVector_real[4];
+- float accVector_imag[4];
+-
+- float32x4x2_t inputVector0, inputVector1;
+- float32x4_t tapsVector0, tapsVector1;
+- float32x4_t tmp_real0, tmp_imag0;
+- float32x4_t tmp_real1, tmp_imag1;
+- float32x4_t real_accumulator0, imag_accumulator0;
+- float32x4_t real_accumulator1, imag_accumulator1;
+-
+- // zero out accumulators
+- // take a *float, return float32x4_t
+- real_accumulator0 = vld1q_f32( zero );
+- imag_accumulator0 = vld1q_f32( zero );
+- real_accumulator1 = vld1q_f32( zero );
+- imag_accumulator1 = vld1q_f32( zero );
+-
+- for(number=0 ;number < quarterPoints; number++){
+- // load doublewords and duplicate in to second lane
+- tapsVector0 = vld1q_f32(tapsPtr );
+- tapsVector1 = vld1q_f32(tapsPtr+4 );
+-
+- // load quadword of complex numbers in to 2 lanes. 1st lane is real, 2dn imag
+- inputVector0 = vld2q_f32(inputPtr );
+- inputVector1 = vld2q_f32(inputPtr+8 );
+- // inputVector is now a struct of two vectors, 0th is real, 1st is imag
+-
+- tmp_real0 = vmulq_f32(tapsVector0, inputVector0.val[0]);
+- tmp_imag0 = vmulq_f32(tapsVector0, inputVector0.val[1]);
+-
+- tmp_real1 = vmulq_f32(tapsVector1, inputVector1.val[0]);
+- tmp_imag1 = vmulq_f32(tapsVector1, inputVector1.val[1]);
+-
+- real_accumulator0 = vaddq_f32(real_accumulator0, tmp_real0);
+- imag_accumulator0 = vaddq_f32(imag_accumulator0, tmp_imag0);
+-
+- real_accumulator1 = vaddq_f32(real_accumulator1, tmp_real1);
+- imag_accumulator1 = vaddq_f32(imag_accumulator1, tmp_imag1);
+-
+- tapsPtr += 8;
+- inputPtr += 16;
+- }
+-
+- real_accumulator0 = vaddq_f32( real_accumulator0, real_accumulator1);
+- imag_accumulator0 = vaddq_f32( imag_accumulator0, imag_accumulator1);
+- // void vst1q_f32( float32_t * ptr, float32x4_t val);
+- // store results back to a complex (array of 2 floats)
+- vst1q_f32(accVector_real, real_accumulator0);
+- vst1q_f32(accVector_imag, imag_accumulator0);
+- *realpt = accVector_real[0] + accVector_real[1] +
+- accVector_real[2] + accVector_real[3] ;
+-
+- *imagpt = accVector_imag[0] + accVector_imag[1] +
+- accVector_imag[2] + accVector_imag[3] ;
+-
+- // clean up the remainder
+- for(number=quarterPoints*8; number < num_points; number++){
+- *realpt += ((*inputPtr++) * (*tapsPtr));
+- *imagpt += ((*inputPtr++) * (*tapsPtr++));
+- }
+-
+- *result = *(lv_32fc_t*)(&res[0]);
++static inline void
++volk_32fc_32f_dot_prod_32fc_neon_unroll(lv_32fc_t* __restrict result,
++ const lv_32fc_t* __restrict input,
++ const float* __restrict taps,
++ unsigned int num_points)
++{
++
++ unsigned int number;
++ const unsigned int quarterPoints = num_points / 8;
++
++ float res[2];
++ float *realpt = &res[0], *imagpt = &res[1];
++ const float* inputPtr = (float*)input;
++ const float* tapsPtr = taps;
++ float zero[4] = { 0.0f, 0.0f, 0.0f, 0.0f };
++ float accVector_real[4];
++ float accVector_imag[4];
++
++ float32x4x2_t inputVector0, inputVector1;
++ float32x4_t tapsVector0, tapsVector1;
++ float32x4_t tmp_real0, tmp_imag0;
++ float32x4_t tmp_real1, tmp_imag1;
++ float32x4_t real_accumulator0, imag_accumulator0;
++ float32x4_t real_accumulator1, imag_accumulator1;
++
++ // zero out accumulators
++ // take a *float, return float32x4_t
++ real_accumulator0 = vld1q_f32(zero);
++ imag_accumulator0 = vld1q_f32(zero);
++ real_accumulator1 = vld1q_f32(zero);
++ imag_accumulator1 = vld1q_f32(zero);
++
++ for (number = 0; number < quarterPoints; number++) {
++ // load doublewords and duplicate in to second lane
++ tapsVector0 = vld1q_f32(tapsPtr);
++ tapsVector1 = vld1q_f32(tapsPtr + 4);
++
++ // load quadword of complex numbers into 2 lanes. 1st lane is real, 2nd imag
++ inputVector0 = vld2q_f32(inputPtr);
++ inputVector1 = vld2q_f32(inputPtr + 8);
++ // inputVector is now a struct of two vectors, 0th is real, 1st is imag
++
++ tmp_real0 = vmulq_f32(tapsVector0, inputVector0.val[0]);
++ tmp_imag0 = vmulq_f32(tapsVector0, inputVector0.val[1]);
++
++ tmp_real1 = vmulq_f32(tapsVector1, inputVector1.val[0]);
++ tmp_imag1 = vmulq_f32(tapsVector1, inputVector1.val[1]);
++
++ real_accumulator0 = vaddq_f32(real_accumulator0, tmp_real0);
++ imag_accumulator0 = vaddq_f32(imag_accumulator0, tmp_imag0);
++
++ real_accumulator1 = vaddq_f32(real_accumulator1, tmp_real1);
++ imag_accumulator1 = vaddq_f32(imag_accumulator1, tmp_imag1);
++
++ tapsPtr += 8;
++ inputPtr += 16;
++ }
++
++ real_accumulator0 = vaddq_f32(real_accumulator0, real_accumulator1);
++ imag_accumulator0 = vaddq_f32(imag_accumulator0, imag_accumulator1);
++ // void vst1q_f32( float32_t * ptr, float32x4_t val);
++ // store results back to a complex (array of 2 floats)
++ vst1q_f32(accVector_real, real_accumulator0);
++ vst1q_f32(accVector_imag, imag_accumulator0);
++ *realpt =
++ accVector_real[0] + accVector_real[1] + accVector_real[2] + accVector_real[3];
++
++ *imagpt =
++ accVector_imag[0] + accVector_imag[1] + accVector_imag[2] + accVector_imag[3];
++
++ // clean up the remainder
++ for (number = quarterPoints * 8; number < num_points; number++) {
++ *realpt += ((*inputPtr++) * (*tapsPtr));
++ *imagpt += ((*inputPtr++) * (*tapsPtr++));
++ }
++
++ *result = *(lv_32fc_t*)(&res[0]);
+ }
+
+ #endif /*LV_HAVE_NEON*/
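
(The NEON kernels avoid the x86-style tap duplication entirely: vld2q_f32() de-interleaves the complex input on load, so the real parts land in .val[0] and the imaginary parts in .val[1], ready to be multiplied by the taps directly. A minimal sketch of one four-sample step; the helper name is illustrative only.)

    #include <arm_neon.h>

    static inline void dot_prod_neon_step(const float* complex_in, /* 4 samples, 8 floats */
                                          const float* taps,       /* 4 real taps */
                                          float32x4_t* real_acc,
                                          float32x4_t* imag_acc)
    {
        float32x4x2_t in = vld2q_f32(complex_in); /* .val[0]=re0..re3, .val[1]=im0..im3 */
        float32x4_t t = vld1q_f32(taps);
        *real_acc = vaddq_f32(*real_acc, vmulq_f32(in.val[0], t));
        *imag_acc = vaddq_f32(*imag_acc, vmulq_f32(in.val[1], t));
    }
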
+@@ -582,154 +614,171 @@ static inline void volk_32fc_32f_dot_prod_32fc_neon_unroll ( lv_32fc_t* __restri
+ #ifdef LV_HAVE_NEON
+ #include <arm_neon.h>
+
+-static inline void volk_32fc_32f_dot_prod_32fc_a_neon ( lv_32fc_t* __restrict result, const lv_32fc_t* __restrict input, const float* __restrict taps, unsigned int num_points) {
+-
+- unsigned int number;
+- const unsigned int quarterPoints = num_points / 4;
++static inline void volk_32fc_32f_dot_prod_32fc_a_neon(lv_32fc_t* __restrict result,
++ const lv_32fc_t* __restrict input,
++ const float* __restrict taps,
++ unsigned int num_points)
++{
+
+- float res[2];
+- float *realpt = &res[0], *imagpt = &res[1];
+- const float* inputPtr = (float*)input;
+- const float* tapsPtr = taps;
+- float zero[4] = {0.0f, 0.0f, 0.0f, 0.0f };
+- float accVector_real[4];
+- float accVector_imag[4];
++ unsigned int number;
++ const unsigned int quarterPoints = num_points / 4;
+
+- float32x4x2_t inputVector;
+- float32x4_t tapsVector;
+- float32x4_t tmp_real, tmp_imag;
+- float32x4_t real_accumulator, imag_accumulator;
++ float res[2];
++ float *realpt = &res[0], *imagpt = &res[1];
++ const float* inputPtr = (float*)input;
++ const float* tapsPtr = taps;
++ float zero[4] = { 0.0f, 0.0f, 0.0f, 0.0f };
++ float accVector_real[4];
++ float accVector_imag[4];
+
++ float32x4x2_t inputVector;
++ float32x4_t tapsVector;
++ float32x4_t tmp_real, tmp_imag;
++ float32x4_t real_accumulator, imag_accumulator;
+
+- // zero out accumulators
+- // take a *float, return float32x4_t
+- real_accumulator = vld1q_f32( zero );
+- imag_accumulator = vld1q_f32( zero );
+
+- for(number=0 ;number < quarterPoints; number++){
+- // load taps ( float32x2x2_t = vld1q_f32( float32_t const * ptr) )
+- // load doublewords and duplicate in to second lane
+- tapsVector = vld1q_f32(tapsPtr );
++ // zero out accumulators
++ // take a *float, return float32x4_t
++ real_accumulator = vld1q_f32(zero);
++ imag_accumulator = vld1q_f32(zero);
+
+- // load quadword of complex numbers in to 2 lanes. 1st lane is real, 2dn imag
+- inputVector = vld2q_f32(inputPtr );
++ for (number = 0; number < quarterPoints; number++) {
++ // load taps ( float32x2x2_t = vld1q_f32( float32_t const * ptr) )
++ // load doublewords and duplicate in to second lane
++ tapsVector = vld1q_f32(tapsPtr);
+
+- tmp_real = vmulq_f32(tapsVector, inputVector.val[0]);
+- tmp_imag = vmulq_f32(tapsVector, inputVector.val[1]);
++ // load quadword of complex numbers into 2 lanes. 1st lane is real, 2nd imag
++ inputVector = vld2q_f32(inputPtr);
+
+- real_accumulator = vaddq_f32(real_accumulator, tmp_real);
+- imag_accumulator = vaddq_f32(imag_accumulator, tmp_imag);
++ tmp_real = vmulq_f32(tapsVector, inputVector.val[0]);
++ tmp_imag = vmulq_f32(tapsVector, inputVector.val[1]);
+
++ real_accumulator = vaddq_f32(real_accumulator, tmp_real);
++ imag_accumulator = vaddq_f32(imag_accumulator, tmp_imag);
+
+- tapsPtr += 4;
+- inputPtr += 8;
+
+- }
++ tapsPtr += 4;
++ inputPtr += 8;
++ }
+
+- // store results back to a complex (array of 2 floats)
+- vst1q_f32(accVector_real, real_accumulator);
+- vst1q_f32(accVector_imag, imag_accumulator);
+- *realpt = accVector_real[0] + accVector_real[1] +
+- accVector_real[2] + accVector_real[3] ;
++ // store results back to a complex (array of 2 floats)
++ vst1q_f32(accVector_real, real_accumulator);
++ vst1q_f32(accVector_imag, imag_accumulator);
++ *realpt =
++ accVector_real[0] + accVector_real[1] + accVector_real[2] + accVector_real[3];
+
+- *imagpt = accVector_imag[0] + accVector_imag[1] +
+- accVector_imag[2] + accVector_imag[3] ;
++ *imagpt =
++ accVector_imag[0] + accVector_imag[1] + accVector_imag[2] + accVector_imag[3];
+
+- // clean up the remainder
+- for(number=quarterPoints*4; number < num_points; number++){
+- *realpt += ((*inputPtr++) * (*tapsPtr));
+- *imagpt += ((*inputPtr++) * (*tapsPtr++));
+- }
++ // clean up the remainder
++ for (number = quarterPoints * 4; number < num_points; number++) {
++ *realpt += ((*inputPtr++) * (*tapsPtr));
++ *imagpt += ((*inputPtr++) * (*tapsPtr++));
++ }
+
+- *result = *(lv_32fc_t*)(&res[0]);
++ *result = *(lv_32fc_t*)(&res[0]);
+ }
+
+ #endif /*LV_HAVE_NEON*/
+
+ #ifdef LV_HAVE_NEONV7
+-extern void volk_32fc_32f_dot_prod_32fc_a_neonasm ( lv_32fc_t* result, const lv_32fc_t* input, const float* taps, unsigned int num_points);
++extern void volk_32fc_32f_dot_prod_32fc_a_neonasm(lv_32fc_t* result,
++ const lv_32fc_t* input,
++ const float* taps,
++ unsigned int num_points);
+ #endif /*LV_HAVE_NEONV7*/
+
+ #ifdef LV_HAVE_NEONV7
+-extern void volk_32fc_32f_dot_prod_32fc_a_neonasmvmla ( lv_32fc_t* result, const lv_32fc_t* input, const float* taps, unsigned int num_points);
++extern void volk_32fc_32f_dot_prod_32fc_a_neonasmvmla(lv_32fc_t* result,
++ const lv_32fc_t* input,
++ const float* taps,
++ unsigned int num_points);
+ #endif /*LV_HAVE_NEONV7*/
+
+ #ifdef LV_HAVE_NEONV7
+-extern void volk_32fc_32f_dot_prod_32fc_a_neonpipeline ( lv_32fc_t* result, const lv_32fc_t* input, const float* taps, unsigned int num_points);
++extern void volk_32fc_32f_dot_prod_32fc_a_neonpipeline(lv_32fc_t* result,
++ const lv_32fc_t* input,
++ const float* taps,
++ unsigned int num_points);
+ #endif /*LV_HAVE_NEONV7*/
+
+ #ifdef LV_HAVE_SSE
+
+-static inline void volk_32fc_32f_dot_prod_32fc_u_sse( lv_32fc_t* result, const lv_32fc_t* input, const float* taps, unsigned int num_points) {
+-
+- unsigned int number = 0;
+- const unsigned int sixteenthPoints = num_points / 8;
+-
+- float res[2];
+- float *realpt = &res[0], *imagpt = &res[1];
+- const float* aPtr = (float*)input;
+- const float* bPtr = taps;
+-
+- __m128 a0Val, a1Val, a2Val, a3Val;
+- __m128 b0Val, b1Val, b2Val, b3Val;
+- __m128 x0Val, x1Val, x2Val, x3Val;
+- __m128 c0Val, c1Val, c2Val, c3Val;
+-
+- __m128 dotProdVal0 = _mm_setzero_ps();
+- __m128 dotProdVal1 = _mm_setzero_ps();
+- __m128 dotProdVal2 = _mm_setzero_ps();
+- __m128 dotProdVal3 = _mm_setzero_ps();
+-
+- for(;number < sixteenthPoints; number++){
+-
+- a0Val = _mm_loadu_ps(aPtr);
+- a1Val = _mm_loadu_ps(aPtr+4);
+- a2Val = _mm_loadu_ps(aPtr+8);
+- a3Val = _mm_loadu_ps(aPtr+12);
+-
+- x0Val = _mm_loadu_ps(bPtr);
+- x1Val = _mm_loadu_ps(bPtr);
+- x2Val = _mm_loadu_ps(bPtr+4);
+- x3Val = _mm_loadu_ps(bPtr+4);
+- b0Val = _mm_unpacklo_ps(x0Val, x1Val);
+- b1Val = _mm_unpackhi_ps(x0Val, x1Val);
+- b2Val = _mm_unpacklo_ps(x2Val, x3Val);
+- b3Val = _mm_unpackhi_ps(x2Val, x3Val);
+-
+- c0Val = _mm_mul_ps(a0Val, b0Val);
+- c1Val = _mm_mul_ps(a1Val, b1Val);
+- c2Val = _mm_mul_ps(a2Val, b2Val);
+- c3Val = _mm_mul_ps(a3Val, b3Val);
+-
+- dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
+- dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
+- dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
+- dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
+-
+- aPtr += 16;
+- bPtr += 8;
+- }
+-
+- dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
+- dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
+- dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
+-
+- __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
+-
+- _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
+-
+- *realpt = dotProductVector[0];
+- *imagpt = dotProductVector[1];
+- *realpt += dotProductVector[2];
+- *imagpt += dotProductVector[3];
+-
+- number = sixteenthPoints*8;
+- for(;number < num_points; number++){
+- *realpt += ((*aPtr++) * (*bPtr));
+- *imagpt += ((*aPtr++) * (*bPtr++));
+- }
+-
+- *result = *(lv_32fc_t*)(&res[0]);
++static inline void volk_32fc_32f_dot_prod_32fc_u_sse(lv_32fc_t* result,
++ const lv_32fc_t* input,
++ const float* taps,
++ unsigned int num_points)
++{
++
++ unsigned int number = 0;
++ const unsigned int sixteenthPoints = num_points / 8;
++
++ float res[2];
++ float *realpt = &res[0], *imagpt = &res[1];
++ const float* aPtr = (float*)input;
++ const float* bPtr = taps;
++
++ __m128 a0Val, a1Val, a2Val, a3Val;
++ __m128 b0Val, b1Val, b2Val, b3Val;
++ __m128 x0Val, x1Val, x2Val, x3Val;
++ __m128 c0Val, c1Val, c2Val, c3Val;
++
++ __m128 dotProdVal0 = _mm_setzero_ps();
++ __m128 dotProdVal1 = _mm_setzero_ps();
++ __m128 dotProdVal2 = _mm_setzero_ps();
++ __m128 dotProdVal3 = _mm_setzero_ps();
++
++ for (; number < sixteenthPoints; number++) {
++
++ a0Val = _mm_loadu_ps(aPtr);
++ a1Val = _mm_loadu_ps(aPtr + 4);
++ a2Val = _mm_loadu_ps(aPtr + 8);
++ a3Val = _mm_loadu_ps(aPtr + 12);
++
++ x0Val = _mm_loadu_ps(bPtr);
++ x1Val = _mm_loadu_ps(bPtr);
++ x2Val = _mm_loadu_ps(bPtr + 4);
++ x3Val = _mm_loadu_ps(bPtr + 4);
++ b0Val = _mm_unpacklo_ps(x0Val, x1Val);
++ b1Val = _mm_unpackhi_ps(x0Val, x1Val);
++ b2Val = _mm_unpacklo_ps(x2Val, x3Val);
++ b3Val = _mm_unpackhi_ps(x2Val, x3Val);
++
++ c0Val = _mm_mul_ps(a0Val, b0Val);
++ c1Val = _mm_mul_ps(a1Val, b1Val);
++ c2Val = _mm_mul_ps(a2Val, b2Val);
++ c3Val = _mm_mul_ps(a3Val, b3Val);
++
++ dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
++ dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
++ dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
++ dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
++
++ aPtr += 16;
++ bPtr += 8;
++ }
++
++ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
++ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
++ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
++
++ __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
++
++ _mm_store_ps(dotProductVector,
++ dotProdVal0); // Store the results back into the dot product vector
++
++ *realpt = dotProductVector[0];
++ *imagpt = dotProductVector[1];
++ *realpt += dotProductVector[2];
++ *imagpt += dotProductVector[3];
++
++ number = sixteenthPoints * 8;
++ for (; number < num_points; number++) {
++ *realpt += ((*aPtr++) * (*bPtr));
++ *imagpt += ((*aPtr++) * (*bPtr++));
++ }
++
++ *result = *(lv_32fc_t*)(&res[0]);
+ }
+
+ #endif /*LV_HAVE_SSE*/
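
(The x86 kernels in this file all rely on the same trick: because the complex input is stored interleaved as re0|im0|re1|im1|..., each real tap has to be duplicated before the element-wise multiply. A minimal SSE sketch of that duplication, equivalent to loading bPtr twice and unpacking the copies against each other as done above; the helper name is illustrative only.)

    #include <xmmintrin.h>

    static inline void duplicate_taps_sse(const float* taps, __m128* lo, __m128* hi)
    {
        __m128 t = _mm_loadu_ps(taps); /* t0|t1|t2|t3 */
        *lo = _mm_unpacklo_ps(t, t);   /* t0|t0|t1|t1 -> lines up with re0|im0|re1|im1 */
        *hi = _mm_unpackhi_ps(t, t);   /* t2|t2|t3|t3 -> lines up with re2|im2|re3|im3 */
    }
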
+diff --git a/kernels/volk/volk_32fc_32f_multiply_32fc.h b/kernels/volk/volk_32fc_32f_multiply_32fc.h
+index b47883f..196ba9a 100644
+--- a/kernels/volk/volk_32fc_32f_multiply_32fc.h
++++ b/kernels/volk/volk_32fc_32f_multiply_32fc.h
+@@ -30,8 +30,8 @@
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+- * void volk_32fc_32f_multiply_32fc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const float* bVector, unsigned int num_points);
+- * \endcode
++ * void volk_32fc_32f_multiply_32fc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const
++ * float* bVector, unsigned int num_points); \endcode
+ *
+ * \b Inputs
+ * \li aVector: The input vector of complex floats.
+@@ -61,52 +61,55 @@
+ #ifdef LV_HAVE_AVX
+ #include <immintrin.h>
+
+-static inline void
+-volk_32fc_32f_multiply_32fc_a_avx(lv_32fc_t* cVector, const lv_32fc_t* aVector,
+- const float* bVector, unsigned int num_points)
++static inline void volk_32fc_32f_multiply_32fc_a_avx(lv_32fc_t* cVector,
++ const lv_32fc_t* aVector,
++ const float* bVector,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+- const unsigned int eighthPoints = num_points / 8;
++ unsigned int number = 0;
++ const unsigned int eighthPoints = num_points / 8;
+
+- lv_32fc_t* cPtr = cVector;
+- const lv_32fc_t* aPtr = aVector;
+- const float* bPtr= bVector;
++ lv_32fc_t* cPtr = cVector;
++ const lv_32fc_t* aPtr = aVector;
++ const float* bPtr = bVector;
+
+- __m256 aVal1, aVal2, bVal, bVal1, bVal2, cVal1, cVal2;
++ __m256 aVal1, aVal2, bVal, bVal1, bVal2, cVal1, cVal2;
+
+- __m256i permute_mask = _mm256_set_epi32(3, 3, 2, 2, 1, 1, 0, 0);
++ __m256i permute_mask = _mm256_set_epi32(3, 3, 2, 2, 1, 1, 0, 0);
+
+- for(;number < eighthPoints; number++){
++ for (; number < eighthPoints; number++) {
+
+- aVal1 = _mm256_load_ps((float *)aPtr);
+- aPtr += 4;
++ aVal1 = _mm256_load_ps((float*)aPtr);
++ aPtr += 4;
+
+- aVal2 = _mm256_load_ps((float *)aPtr);
+- aPtr += 4;
++ aVal2 = _mm256_load_ps((float*)aPtr);
++ aPtr += 4;
+
+- bVal = _mm256_load_ps(bPtr); // b0|b1|b2|b3|b4|b5|b6|b7
+- bPtr += 8;
++ bVal = _mm256_load_ps(bPtr); // b0|b1|b2|b3|b4|b5|b6|b7
++ bPtr += 8;
+
+- bVal1 = _mm256_permute2f128_ps(bVal, bVal, 0x00); // b0|b1|b2|b3|b0|b1|b2|b3
+- bVal2 = _mm256_permute2f128_ps(bVal, bVal, 0x11); // b4|b5|b6|b7|b4|b5|b6|b7
++ bVal1 = _mm256_permute2f128_ps(bVal, bVal, 0x00); // b0|b1|b2|b3|b0|b1|b2|b3
++ bVal2 = _mm256_permute2f128_ps(bVal, bVal, 0x11); // b4|b5|b6|b7|b4|b5|b6|b7
+
+- bVal1 = _mm256_permutevar_ps(bVal1, permute_mask); // b0|b0|b1|b1|b2|b2|b3|b3
+- bVal2 = _mm256_permutevar_ps(bVal2, permute_mask); // b4|b4|b5|b5|b6|b6|b7|b7
++ bVal1 = _mm256_permutevar_ps(bVal1, permute_mask); // b0|b0|b1|b1|b2|b2|b3|b3
++ bVal2 = _mm256_permutevar_ps(bVal2, permute_mask); // b4|b4|b5|b5|b6|b6|b7|b7
+
+- cVal1 = _mm256_mul_ps(aVal1, bVal1);
+- cVal2 = _mm256_mul_ps(aVal2, bVal2);
++ cVal1 = _mm256_mul_ps(aVal1, bVal1);
++ cVal2 = _mm256_mul_ps(aVal2, bVal2);
+
+- _mm256_store_ps((float*)cPtr,cVal1); // Store the results back into the C container
+- cPtr += 4;
++ _mm256_store_ps((float*)cPtr,
++ cVal1); // Store the results back into the C container
++ cPtr += 4;
+
+- _mm256_store_ps((float*)cPtr,cVal2); // Store the results back into the C container
+- cPtr += 4;
+- }
++ _mm256_store_ps((float*)cPtr,
++ cVal2); // Store the results back into the C container
++ cPtr += 4;
++ }
+
+- number = eighthPoints * 8;
+- for(;number < num_points; ++number){
+- *cPtr++ = (*aPtr++) * (*bPtr++);
+- }
++ number = eighthPoints * 8;
++ for (; number < num_points; ++number) {
++ *cPtr++ = (*aPtr++) * (*bPtr++);
++ }
+ }
+ #endif /* LV_HAVE_AVX */
+
+@@ -114,67 +117,69 @@ volk_32fc_32f_multiply_32fc_a_avx(lv_32fc_t* cVector, const lv_32fc_t* aVector,
+ #ifdef LV_HAVE_SSE
+ #include <xmmintrin.h>
+
+-static inline void
+-volk_32fc_32f_multiply_32fc_a_sse(lv_32fc_t* cVector, const lv_32fc_t* aVector,
+- const float* bVector, unsigned int num_points)
++static inline void volk_32fc_32f_multiply_32fc_a_sse(lv_32fc_t* cVector,
++ const lv_32fc_t* aVector,
++ const float* bVector,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+- const unsigned int quarterPoints = num_points / 4;
++ unsigned int number = 0;
++ const unsigned int quarterPoints = num_points / 4;
+
+- lv_32fc_t* cPtr = cVector;
+- const lv_32fc_t* aPtr = aVector;
+- const float* bPtr= bVector;
++ lv_32fc_t* cPtr = cVector;
++ const lv_32fc_t* aPtr = aVector;
++ const float* bPtr = bVector;
+
+- __m128 aVal1, aVal2, bVal, bVal1, bVal2, cVal;
+- for(;number < quarterPoints; number++){
++ __m128 aVal1, aVal2, bVal, bVal1, bVal2, cVal;
++ for (; number < quarterPoints; number++) {
+
+- aVal1 = _mm_load_ps((const float*)aPtr);
+- aPtr += 2;
++ aVal1 = _mm_load_ps((const float*)aPtr);
++ aPtr += 2;
+
+- aVal2 = _mm_load_ps((const float*)aPtr);
+- aPtr += 2;
++ aVal2 = _mm_load_ps((const float*)aPtr);
++ aPtr += 2;
+
+- bVal = _mm_load_ps(bPtr);
+- bPtr += 4;
++ bVal = _mm_load_ps(bPtr);
++ bPtr += 4;
+
+- bVal1 = _mm_shuffle_ps(bVal, bVal, _MM_SHUFFLE(1,1,0,0));
+- bVal2 = _mm_shuffle_ps(bVal, bVal, _MM_SHUFFLE(3,3,2,2));
++ bVal1 = _mm_shuffle_ps(bVal, bVal, _MM_SHUFFLE(1, 1, 0, 0));
++ bVal2 = _mm_shuffle_ps(bVal, bVal, _MM_SHUFFLE(3, 3, 2, 2));
+
+- cVal = _mm_mul_ps(aVal1, bVal1);
++ cVal = _mm_mul_ps(aVal1, bVal1);
+
+- _mm_store_ps((float*)cPtr,cVal); // Store the results back into the C container
+- cPtr += 2;
++ _mm_store_ps((float*)cPtr, cVal); // Store the results back into the C container
++ cPtr += 2;
+
+- cVal = _mm_mul_ps(aVal2, bVal2);
++ cVal = _mm_mul_ps(aVal2, bVal2);
+
+- _mm_store_ps((float*)cPtr,cVal); // Store the results back into the C container
++ _mm_store_ps((float*)cPtr, cVal); // Store the results back into the C container
+
+- cPtr += 2;
+- }
++ cPtr += 2;
++ }
+
+- number = quarterPoints * 4;
+- for(;number < num_points; number++){
+- *cPtr++ = (*aPtr++) * (*bPtr);
+- bPtr++;
+- }
++ number = quarterPoints * 4;
++ for (; number < num_points; number++) {
++ *cPtr++ = (*aPtr++) * (*bPtr);
++ bPtr++;
++ }
+ }
+ #endif /* LV_HAVE_SSE */
+
+
+ #ifdef LV_HAVE_GENERIC
+
+-static inline void
+-volk_32fc_32f_multiply_32fc_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector,
+- const float* bVector, unsigned int num_points)
++static inline void volk_32fc_32f_multiply_32fc_generic(lv_32fc_t* cVector,
++ const lv_32fc_t* aVector,
++ const float* bVector,
++ unsigned int num_points)
+ {
+- lv_32fc_t* cPtr = cVector;
+- const lv_32fc_t* aPtr = aVector;
+- const float* bPtr= bVector;
+- unsigned int number = 0;
+-
+- for(number = 0; number < num_points; number++){
+- *cPtr++ = (*aPtr++) * (*bPtr++);
+- }
++ lv_32fc_t* cPtr = cVector;
++ const lv_32fc_t* aPtr = aVector;
++ const float* bPtr = bVector;
++ unsigned int number = 0;
++
++ for (number = 0; number < num_points; number++) {
++ *cPtr++ = (*aPtr++) * (*bPtr++);
++ }
+ }
+ #endif /* LV_HAVE_GENERIC */
+
+@@ -182,49 +187,52 @@ volk_32fc_32f_multiply_32fc_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector
+ #ifdef LV_HAVE_NEON
+ #include <arm_neon.h>
+
+-static inline void
+-volk_32fc_32f_multiply_32fc_neon(lv_32fc_t* cVector, const lv_32fc_t* aVector,
+- const float* bVector, unsigned int num_points)
++static inline void volk_32fc_32f_multiply_32fc_neon(lv_32fc_t* cVector,
++ const lv_32fc_t* aVector,
++ const float* bVector,
++ unsigned int num_points)
+ {
+- lv_32fc_t* cPtr = cVector;
+- const lv_32fc_t* aPtr = aVector;
+- const float* bPtr= bVector;
+- unsigned int number = 0;
+- unsigned int quarter_points = num_points / 4;
+-
+- float32x4x2_t inputVector, outputVector;
+- float32x4_t tapsVector;
+- for(number = 0; number < quarter_points; number++){
+- inputVector = vld2q_f32((float*)aPtr);
+- tapsVector = vld1q_f32(bPtr);
+-
+- outputVector.val[0] = vmulq_f32(inputVector.val[0], tapsVector);
+- outputVector.val[1] = vmulq_f32(inputVector.val[1], tapsVector);
+-
+- vst2q_f32((float*)cPtr, outputVector);
+- aPtr += 4;
+- bPtr += 4;
+- cPtr += 4;
+- }
+-
+- for(number = quarter_points * 4; number < num_points; number++){
+- *cPtr++ = (*aPtr++) * (*bPtr++);
+- }
++ lv_32fc_t* cPtr = cVector;
++ const lv_32fc_t* aPtr = aVector;
++ const float* bPtr = bVector;
++ unsigned int number = 0;
++ unsigned int quarter_points = num_points / 4;
++
++ float32x4x2_t inputVector, outputVector;
++ float32x4_t tapsVector;
++ for (number = 0; number < quarter_points; number++) {
++ inputVector = vld2q_f32((float*)aPtr);
++ tapsVector = vld1q_f32(bPtr);
++
++ outputVector.val[0] = vmulq_f32(inputVector.val[0], tapsVector);
++ outputVector.val[1] = vmulq_f32(inputVector.val[1], tapsVector);
++
++ vst2q_f32((float*)cPtr, outputVector);
++ aPtr += 4;
++ bPtr += 4;
++ cPtr += 4;
++ }
++
++ for (number = quarter_points * 4; number < num_points; number++) {
++ *cPtr++ = (*aPtr++) * (*bPtr++);
++ }
+ }
+ #endif /* LV_HAVE_NEON */
+
+
+ #ifdef LV_HAVE_ORC
+
+-extern void
+-volk_32fc_32f_multiply_32fc_a_orc_impl(lv_32fc_t* cVector, const lv_32fc_t* aVector,
+- const float* bVector, unsigned int num_points);
++extern void volk_32fc_32f_multiply_32fc_a_orc_impl(lv_32fc_t* cVector,
++ const lv_32fc_t* aVector,
++ const float* bVector,
++ unsigned int num_points);
+
+-static inline void
+-volk_32fc_32f_multiply_32fc_u_orc(lv_32fc_t* cVector, const lv_32fc_t* aVector,
+- const float* bVector, unsigned int num_points)
++static inline void volk_32fc_32f_multiply_32fc_u_orc(lv_32fc_t* cVector,
++ const lv_32fc_t* aVector,
++ const float* bVector,
++ unsigned int num_points)
+ {
+- volk_32fc_32f_multiply_32fc_a_orc_impl(cVector, aVector, bVector, num_points);
++ volk_32fc_32f_multiply_32fc_a_orc_impl(cVector, aVector, bVector, num_points);
+ }
+
+ #endif /* LV_HAVE_GENERIC */
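
(The AVX multiply kernel above uses the same duplication idea with 256-bit registers: a load of eight scalars b0..b7 is split per 128-bit lane and then permuted in-lane so each scalar appears twice, matching the interleaved complex layout. A minimal sketch of that permute step, assuming AVX support; the helper name is illustrative only.)

    #include <immintrin.h>

    static inline void duplicate_scalars_avx(const float* b, __m256* lo4, __m256* hi4)
    {
        __m256 bv = _mm256_loadu_ps(b);                       /* b0..b7 */
        __m256i m = _mm256_set_epi32(3, 3, 2, 2, 1, 1, 0, 0); /* in-lane element indices */
        __m256 b03 = _mm256_permute2f128_ps(bv, bv, 0x00);    /* b0..b3|b0..b3 */
        __m256 b47 = _mm256_permute2f128_ps(bv, bv, 0x11);    /* b4..b7|b4..b7 */
        *lo4 = _mm256_permutevar_ps(b03, m);                  /* b0|b0|b1|b1|b2|b2|b3|b3 */
        *hi4 = _mm256_permutevar_ps(b47, m);                  /* b4|b4|b5|b5|b6|b6|b7|b7 */
    }
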
+diff --git a/kernels/volk/volk_32fc_conjugate_32fc.h b/kernels/volk/volk_32fc_conjugate_32fc.h
+index 6994d0e..9195e3a 100644
+--- a/kernels/volk/volk_32fc_conjugate_32fc.h
++++ b/kernels/volk/volk_32fc_conjugate_32fc.h
+@@ -29,8 +29,8 @@
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+- * void volk_32fc_conjugate_32fc(lv_32fc_t* cVector, const lv_32fc_t* aVector, unsigned int num_points)
+- * \endcode
++ * void volk_32fc_conjugate_32fc(lv_32fc_t* cVector, const lv_32fc_t* aVector, unsigned
++ * int num_points) \endcode
+ *
+ * \b Inputs
+ * \li aVector: The input vector of complex floats.
+@@ -68,91 +68,94 @@
+ #ifndef INCLUDED_volk_32fc_conjugate_32fc_u_H
+ #define INCLUDED_volk_32fc_conjugate_32fc_u_H
+
++#include <float.h>
+ #include <inttypes.h>
+ #include <stdio.h>
+ #include <volk/volk_complex.h>
+-#include <float.h>
+
+ #ifdef LV_HAVE_AVX
+ #include <immintrin.h>
+
+-static inline void
+-volk_32fc_conjugate_32fc_u_avx(lv_32fc_t* cVector, const lv_32fc_t* aVector, unsigned int num_points)
++static inline void volk_32fc_conjugate_32fc_u_avx(lv_32fc_t* cVector,
++ const lv_32fc_t* aVector,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+- const unsigned int quarterPoints = num_points / 4;
++ unsigned int number = 0;
++ const unsigned int quarterPoints = num_points / 4;
+
+- __m256 x;
+- lv_32fc_t* c = cVector;
+- const lv_32fc_t* a = aVector;
++ __m256 x;
++ lv_32fc_t* c = cVector;
++ const lv_32fc_t* a = aVector;
+
+- __m256 conjugator = _mm256_setr_ps(0, -0.f, 0, -0.f, 0, -0.f, 0, -0.f);
++ __m256 conjugator = _mm256_setr_ps(0, -0.f, 0, -0.f, 0, -0.f, 0, -0.f);
+
+- for(;number < quarterPoints; number++){
++ for (; number < quarterPoints; number++) {
+
+- x = _mm256_loadu_ps((float*)a); // Load the complex data as ar,ai,br,bi
++ x = _mm256_loadu_ps((float*)a); // Load the complex data as ar,ai,br,bi
+
+- x = _mm256_xor_ps(x, conjugator); // conjugate register
++ x = _mm256_xor_ps(x, conjugator); // conjugate register
+
+- _mm256_storeu_ps((float*)c,x); // Store the results back into the C container
++ _mm256_storeu_ps((float*)c, x); // Store the results back into the C container
+
+- a += 4;
+- c += 4;
+- }
++ a += 4;
++ c += 4;
++ }
+
+- number = quarterPoints * 4;
++ number = quarterPoints * 4;
+
+- for(;number < num_points; number++) {
+- *c++ = lv_conj(*a++);
+- }
++ for (; number < num_points; number++) {
++ *c++ = lv_conj(*a++);
++ }
+ }
+ #endif /* LV_HAVE_AVX */
+
+ #ifdef LV_HAVE_SSE3
+ #include <pmmintrin.h>
+
+-static inline void
+-volk_32fc_conjugate_32fc_u_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, unsigned int num_points)
++static inline void volk_32fc_conjugate_32fc_u_sse3(lv_32fc_t* cVector,
++ const lv_32fc_t* aVector,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+- const unsigned int halfPoints = num_points / 2;
++ unsigned int number = 0;
++ const unsigned int halfPoints = num_points / 2;
+
+- __m128 x;
+- lv_32fc_t* c = cVector;
+- const lv_32fc_t* a = aVector;
++ __m128 x;
++ lv_32fc_t* c = cVector;
++ const lv_32fc_t* a = aVector;
+
+- __m128 conjugator = _mm_setr_ps(0, -0.f, 0, -0.f);
++ __m128 conjugator = _mm_setr_ps(0, -0.f, 0, -0.f);
+
+- for(;number < halfPoints; number++){
++ for (; number < halfPoints; number++) {
+
+- x = _mm_loadu_ps((float*)a); // Load the complex data as ar,ai,br,bi
++ x = _mm_loadu_ps((float*)a); // Load the complex data as ar,ai,br,bi
+
+- x = _mm_xor_ps(x, conjugator); // conjugate register
++ x = _mm_xor_ps(x, conjugator); // conjugate register
+
+- _mm_storeu_ps((float*)c,x); // Store the results back into the C container
++ _mm_storeu_ps((float*)c, x); // Store the results back into the C container
+
+- a += 2;
+- c += 2;
+- }
++ a += 2;
++ c += 2;
++ }
+
+- if((num_points % 2) != 0) {
+- *c = lv_conj(*a);
+- }
++ if ((num_points % 2) != 0) {
++ *c = lv_conj(*a);
++ }
+ }
+ #endif /* LV_HAVE_SSE3 */
+
+ #ifdef LV_HAVE_GENERIC
+
+-static inline void
+-volk_32fc_conjugate_32fc_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, unsigned int num_points)
++static inline void volk_32fc_conjugate_32fc_generic(lv_32fc_t* cVector,
++ const lv_32fc_t* aVector,
++ unsigned int num_points)
+ {
+- lv_32fc_t* cPtr = cVector;
+- const lv_32fc_t* aPtr = aVector;
+- unsigned int number = 0;
++ lv_32fc_t* cPtr = cVector;
++ const lv_32fc_t* aPtr = aVector;
++ unsigned int number = 0;
+
+- for(number = 0; number < num_points; number++){
+- *cPtr++ = lv_conj(*aPtr++);
+- }
++ for (number = 0; number < num_points; number++) {
++ *cPtr++ = lv_conj(*aPtr++);
++ }
+ }
+ #endif /* LV_HAVE_GENERIC */
+
+@@ -161,124 +164,128 @@ volk_32fc_conjugate_32fc_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, u
+ #ifndef INCLUDED_volk_32fc_conjugate_32fc_a_H
+ #define INCLUDED_volk_32fc_conjugate_32fc_a_H
+
++#include <float.h>
+ #include <inttypes.h>
+ #include <stdio.h>
+ #include <volk/volk_complex.h>
+-#include <float.h>
+
+ #ifdef LV_HAVE_AVX
+ #include <immintrin.h>
+
+-static inline void
+-volk_32fc_conjugate_32fc_a_avx(lv_32fc_t* cVector, const lv_32fc_t* aVector, unsigned int num_points)
++static inline void volk_32fc_conjugate_32fc_a_avx(lv_32fc_t* cVector,
++ const lv_32fc_t* aVector,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+- const unsigned int quarterPoints = num_points / 4;
++ unsigned int number = 0;
++ const unsigned int quarterPoints = num_points / 4;
+
+- __m256 x;
+- lv_32fc_t* c = cVector;
+- const lv_32fc_t* a = aVector;
++ __m256 x;
++ lv_32fc_t* c = cVector;
++ const lv_32fc_t* a = aVector;
+
+- __m256 conjugator = _mm256_setr_ps(0, -0.f, 0, -0.f, 0, -0.f, 0, -0.f);
++ __m256 conjugator = _mm256_setr_ps(0, -0.f, 0, -0.f, 0, -0.f, 0, -0.f);
+
+- for(;number < quarterPoints; number++){
++ for (; number < quarterPoints; number++) {
+
+- x = _mm256_load_ps((float*)a); // Load the complex data as ar,ai,br,bi
++ x = _mm256_load_ps((float*)a); // Load the complex data as ar,ai,br,bi
+
+- x = _mm256_xor_ps(x, conjugator); // conjugate register
++ x = _mm256_xor_ps(x, conjugator); // conjugate register
+
+- _mm256_store_ps((float*)c,x); // Store the results back into the C container
++ _mm256_store_ps((float*)c, x); // Store the results back into the C container
+
+- a += 4;
+- c += 4;
+- }
++ a += 4;
++ c += 4;
++ }
+
+- number = quarterPoints * 4;
++ number = quarterPoints * 4;
+
+- for(;number < num_points; number++) {
+- *c++ = lv_conj(*a++);
+- }
++ for (; number < num_points; number++) {
++ *c++ = lv_conj(*a++);
++ }
+ }
+ #endif /* LV_HAVE_AVX */
+
+ #ifdef LV_HAVE_SSE3
+ #include <pmmintrin.h>
+
+-static inline void
+-volk_32fc_conjugate_32fc_a_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, unsigned int num_points)
++static inline void volk_32fc_conjugate_32fc_a_sse3(lv_32fc_t* cVector,
++ const lv_32fc_t* aVector,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+- const unsigned int halfPoints = num_points / 2;
++ unsigned int number = 0;
++ const unsigned int halfPoints = num_points / 2;
+
+- __m128 x;
+- lv_32fc_t* c = cVector;
+- const lv_32fc_t* a = aVector;
++ __m128 x;
++ lv_32fc_t* c = cVector;
++ const lv_32fc_t* a = aVector;
+
+- __m128 conjugator = _mm_setr_ps(0, -0.f, 0, -0.f);
++ __m128 conjugator = _mm_setr_ps(0, -0.f, 0, -0.f);
+
+- for(;number < halfPoints; number++){
++ for (; number < halfPoints; number++) {
+
+- x = _mm_load_ps((float*)a); // Load the complex data as ar,ai,br,bi
++ x = _mm_load_ps((float*)a); // Load the complex data as ar,ai,br,bi
+
+- x = _mm_xor_ps(x, conjugator); // conjugate register
++ x = _mm_xor_ps(x, conjugator); // conjugate register
+
+- _mm_store_ps((float*)c,x); // Store the results back into the C container
++ _mm_store_ps((float*)c, x); // Store the results back into the C container
+
+- a += 2;
+- c += 2;
+- }
++ a += 2;
++ c += 2;
++ }
+
+- if((num_points % 2) != 0) {
+- *c = lv_conj(*a);
+- }
++ if ((num_points % 2) != 0) {
++ *c = lv_conj(*a);
++ }
+ }
+ #endif /* LV_HAVE_SSE3 */
+
+ #ifdef LV_HAVE_NEON
+ #include <arm_neon.h>
+
+-static inline void
+-volk_32fc_conjugate_32fc_a_neon(lv_32fc_t* cVector, const lv_32fc_t* aVector, unsigned int num_points)
++static inline void volk_32fc_conjugate_32fc_a_neon(lv_32fc_t* cVector,
++ const lv_32fc_t* aVector,
++ unsigned int num_points)
+ {
+- unsigned int number;
+- const unsigned int quarterPoints = num_points / 4;
++ unsigned int number;
++ const unsigned int quarterPoints = num_points / 4;
+
+- float32x4x2_t x;
+- lv_32fc_t* c = cVector;
+- const lv_32fc_t* a = aVector;
++ float32x4x2_t x;
++ lv_32fc_t* c = cVector;
++ const lv_32fc_t* a = aVector;
+
+- for(number=0; number < quarterPoints; number++){
+- __VOLK_PREFETCH(a+4);
+- x = vld2q_f32((float*)a); // Load the complex data as ar,br,cr,dr; ai,bi,ci,di
++ for (number = 0; number < quarterPoints; number++) {
++ __VOLK_PREFETCH(a + 4);
++ x = vld2q_f32((float*)a); // Load the complex data as ar,br,cr,dr; ai,bi,ci,di
+
+- // xor the imaginary lane
+- x.val[1] = vnegq_f32( x.val[1]);
++ // xor the imaginary lane
++ x.val[1] = vnegq_f32(x.val[1]);
+
+- vst2q_f32((float*)c,x); // Store the results back into the C container
++ vst2q_f32((float*)c, x); // Store the results back into the C container
+
+- a += 4;
+- c += 4;
+- }
++ a += 4;
++ c += 4;
++ }
+
+- for(number=quarterPoints*4; number < num_points; number++){
+- *c++ = lv_conj(*a++);
+- }
++ for (number = quarterPoints * 4; number < num_points; number++) {
++ *c++ = lv_conj(*a++);
++ }
+ }
+ #endif /* LV_HAVE_NEON */
+
+
+ #ifdef LV_HAVE_GENERIC
+
+-static inline void
+-volk_32fc_conjugate_32fc_a_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, unsigned int num_points)
++static inline void volk_32fc_conjugate_32fc_a_generic(lv_32fc_t* cVector,
++ const lv_32fc_t* aVector,
++ unsigned int num_points)
+ {
+- lv_32fc_t* cPtr = cVector;
+- const lv_32fc_t* aPtr = aVector;
+- unsigned int number = 0;
++ lv_32fc_t* cPtr = cVector;
++ const lv_32fc_t* aPtr = aVector;
++ unsigned int number = 0;
+
+- for(number = 0; number < num_points; number++){
+- *cPtr++ = lv_conj(*aPtr++);
+- }
++ for (number = 0; number < num_points; number++) {
++ *cPtr++ = lv_conj(*aPtr++);
++ }
+ }
+ #endif /* LV_HAVE_GENERIC */
+
+diff --git a/kernels/volk/volk_32fc_convert_16ic.h b/kernels/volk/volk_32fc_convert_16ic.h
+index 0ba2383..5788158 100644
+--- a/kernels/volk/volk_32fc_convert_16ic.h
++++ b/kernels/volk/volk_32fc_convert_16ic.h
+@@ -31,8 +31,8 @@
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+- * void volk_32fc_convert_16ic(lv_16sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points);
+- * \endcode
++ * void volk_32fc_convert_16ic(lv_16sc_t* outputVector, const lv_32fc_t* inputVector,
++ * unsigned int num_points); \endcode
+ *
+ * \b Inputs
+ * \li inputVector: The complex 32-bit float input data buffer.
+@@ -46,14 +46,16 @@
+ #ifndef INCLUDED_volk_32fc_convert_16ic_a_H
+ #define INCLUDED_volk_32fc_convert_16ic_a_H
+
++#include "volk/volk_complex.h"
+ #include <limits.h>
+ #include <math.h>
+-#include "volk/volk_complex.h"
+
+ #ifdef LV_HAVE_AVX2
+ #include <immintrin.h>
+
+-static inline void volk_32fc_convert_16ic_a_avx2(lv_16sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points)
++static inline void volk_32fc_convert_16ic_a_avx2(lv_16sc_t* outputVector,
++ const lv_32fc_t* inputVector,
++ unsigned int num_points)
+ {
+ const unsigned int avx_iters = num_points / 8;
+
+@@ -71,44 +73,44 @@ static inline void volk_32fc_convert_16ic_a_avx2(lv_16sc_t* outputVector, const
+ const __m256 vmax_val = _mm256_set1_ps(max_val);
+ unsigned int i;
+
+- for(i = 0; i < avx_iters; i++)
+- {
+- inputVal1 = _mm256_load_ps((float*)inputVectorPtr);
+- inputVectorPtr += 8;
+- inputVal2 = _mm256_load_ps((float*)inputVectorPtr);
+- inputVectorPtr += 8;
+- __VOLK_PREFETCH(inputVectorPtr + 16);
+-
+- // Clip
+- ret1 = _mm256_max_ps(_mm256_min_ps(inputVal1, vmax_val), vmin_val);
+- ret2 = _mm256_max_ps(_mm256_min_ps(inputVal2, vmax_val), vmin_val);
+-
+- intInputVal1 = _mm256_cvtps_epi32(ret1);
+- intInputVal2 = _mm256_cvtps_epi32(ret2);
+-
+- intInputVal1 = _mm256_packs_epi32(intInputVal1, intInputVal2);
+- intInputVal1 = _mm256_permute4x64_epi64(intInputVal1, 0xd8);
+-
+- _mm256_store_si256((__m256i*)outputVectorPtr, intInputVal1);
+- outputVectorPtr += 16;
+- }
+-
+- for(i = avx_iters * 16; i < num_points * 2; i++)
+- {
+- aux = *inputVectorPtr++;
+- if(aux > max_val)
+- aux = max_val;
+- else if(aux < min_val)
+- aux = min_val;
+- *outputVectorPtr++ = (int16_t)rintf(aux);
+- }
++ for (i = 0; i < avx_iters; i++) {
++ inputVal1 = _mm256_load_ps((float*)inputVectorPtr);
++ inputVectorPtr += 8;
++ inputVal2 = _mm256_load_ps((float*)inputVectorPtr);
++ inputVectorPtr += 8;
++ __VOLK_PREFETCH(inputVectorPtr + 16);
++
++ // Clip
++ ret1 = _mm256_max_ps(_mm256_min_ps(inputVal1, vmax_val), vmin_val);
++ ret2 = _mm256_max_ps(_mm256_min_ps(inputVal2, vmax_val), vmin_val);
++
++ intInputVal1 = _mm256_cvtps_epi32(ret1);
++ intInputVal2 = _mm256_cvtps_epi32(ret2);
++
++ intInputVal1 = _mm256_packs_epi32(intInputVal1, intInputVal2);
++ intInputVal1 = _mm256_permute4x64_epi64(intInputVal1, 0xd8);
++
++ _mm256_store_si256((__m256i*)outputVectorPtr, intInputVal1);
++ outputVectorPtr += 16;
++ }
++
++ for (i = avx_iters * 16; i < num_points * 2; i++) {
++ aux = *inputVectorPtr++;
++ if (aux > max_val)
++ aux = max_val;
++ else if (aux < min_val)
++ aux = min_val;
++ *outputVectorPtr++ = (int16_t)rintf(aux);
++ }
+ }
+ #endif /* LV_HAVE_AVX2 */
+
+ #ifdef LV_HAVE_SSE2
+ #include <emmintrin.h>
+
+-static inline void volk_32fc_convert_16ic_a_sse2(lv_16sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points)
++static inline void volk_32fc_convert_16ic_a_sse2(lv_16sc_t* outputVector,
++ const lv_32fc_t* inputVector,
++ unsigned int num_points)
+ {
+ const unsigned int sse_iters = num_points / 4;
+
+@@ -126,34 +128,34 @@ static inline void volk_32fc_convert_16ic_a_sse2(lv_16sc_t* outputVector, const
+ const __m128 vmax_val = _mm_set_ps1(max_val);
+ unsigned int i;
+
+- for(i = 0; i < sse_iters; i++)
+- {
+- inputVal1 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4;
+- inputVal2 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4;
+- __VOLK_PREFETCH(inputVectorPtr + 8);
+-
+- // Clip
+- ret1 = _mm_max_ps(_mm_min_ps(inputVal1, vmax_val), vmin_val);
+- ret2 = _mm_max_ps(_mm_min_ps(inputVal2, vmax_val), vmin_val);
+-
+- intInputVal1 = _mm_cvtps_epi32(ret1);
+- intInputVal2 = _mm_cvtps_epi32(ret2);
+-
+- intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
+-
+- _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1);
+- outputVectorPtr += 8;
+- }
+-
+- for(i = sse_iters * 8; i < num_points * 2; i++)
+- {
+- aux = *inputVectorPtr++;
+- if(aux > max_val)
+- aux = max_val;
+- else if(aux < min_val)
+- aux = min_val;
+- *outputVectorPtr++ = (int16_t)rintf(aux);
+- }
++ for (i = 0; i < sse_iters; i++) {
++ inputVal1 = _mm_load_ps((float*)inputVectorPtr);
++ inputVectorPtr += 4;
++ inputVal2 = _mm_load_ps((float*)inputVectorPtr);
++ inputVectorPtr += 4;
++ __VOLK_PREFETCH(inputVectorPtr + 8);
++
++ // Clip
++ ret1 = _mm_max_ps(_mm_min_ps(inputVal1, vmax_val), vmin_val);
++ ret2 = _mm_max_ps(_mm_min_ps(inputVal2, vmax_val), vmin_val);
++
++ intInputVal1 = _mm_cvtps_epi32(ret1);
++ intInputVal2 = _mm_cvtps_epi32(ret2);
++
++ intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
++
++ _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1);
++ outputVectorPtr += 8;
++ }
++
++ for (i = sse_iters * 8; i < num_points * 2; i++) {
++ aux = *inputVectorPtr++;
++ if (aux > max_val)
++ aux = max_val;
++ else if (aux < min_val)
++ aux = min_val;
++ *outputVectorPtr++ = (int16_t)rintf(aux);
++ }
+ }
+ #endif /* LV_HAVE_SSE2 */
+
+@@ -161,13 +163,24 @@ static inline void volk_32fc_convert_16ic_a_sse2(lv_16sc_t* outputVector, const
+ #if LV_HAVE_NEONV7
+ #include <arm_neon.h>
+
+-#define VCVTRQ_S32_F32(res,val) \
+- __VOLK_ASM ("VCVTR.S32.F32 %[r0], %[v0]\n\t" : [r0]"=w"(res[0]) : [v0]"w"(val[0]) : ); \
+- __VOLK_ASM ("VCVTR.S32.F32 %[r1], %[v1]\n\t" : [r1]"=w"(res[1]) : [v1]"w"(val[1]) : ); \
+- __VOLK_ASM ("VCVTR.S32.F32 %[r2], %[v2]\n\t" : [r2]"=w"(res[2]) : [v2]"w"(val[2]) : ); \
+- __VOLK_ASM ("VCVTR.S32.F32 %[r3], %[v3]\n\t" : [r3]"=w"(res[3]) : [v3]"w"(val[3]) : );
+-
+-static inline void volk_32fc_convert_16ic_neon(lv_16sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points)
++#define VCVTRQ_S32_F32(res, val) \
++ __VOLK_ASM("VCVTR.S32.F32 %[r0], %[v0]\n\t" \
++ : [r0] "=w"(res[0]) \
++ : [v0] "w"(val[0]) \
++ :); \
++ __VOLK_ASM("VCVTR.S32.F32 %[r1], %[v1]\n\t" \
++ : [r1] "=w"(res[1]) \
++ : [v1] "w"(val[1]) \
++ :); \
++ __VOLK_ASM("VCVTR.S32.F32 %[r2], %[v2]\n\t" \
++ : [r2] "=w"(res[2]) \
++ : [v2] "w"(val[2]) \
++ :); \
++ __VOLK_ASM("VCVTR.S32.F32 %[r3], %[v3]\n\t" : [r3] "=w"(res[3]) : [v3] "w"(val[3]) :);
++
++static inline void volk_32fc_convert_16ic_neon(lv_16sc_t* outputVector,
++ const lv_32fc_t* inputVector,
++ unsigned int num_points)
+ {
+
+ const unsigned int neon_iters = num_points / 4;
+@@ -184,43 +197,41 @@ static inline void volk_32fc_convert_16ic_neon(lv_16sc_t* outputVector, const lv
+ const float32x4_t max_val = vmovq_n_f32(max_val_f);
+ float32x4_t ret1, ret2, a, b;
+
+- int32x4_t toint_a={0,0,0,0};
+- int32x4_t toint_b={0,0,0,0};
++ int32x4_t toint_a = { 0, 0, 0, 0 };
++ int32x4_t toint_b = { 0, 0, 0, 0 };
+ int16x4_t intInputVal1, intInputVal2;
+ int16x8_t res;
+
+- for(i = 0; i < neon_iters; i++)
+- {
+- a = vld1q_f32((const float32_t*)(inputVectorPtr));
+- inputVectorPtr += 4;
+- b = vld1q_f32((const float32_t*)(inputVectorPtr));
+- inputVectorPtr += 4;
+- __VOLK_PREFETCH(inputVectorPtr + 8);
+-
+- ret1 = vmaxq_f32(vminq_f32(a, max_val), min_val);
+- ret2 = vmaxq_f32(vminq_f32(b, max_val), min_val);
+-
+- // vcvtr takes into account the current rounding mode (as does rintf)
+- VCVTRQ_S32_F32(toint_a, ret1);
+- VCVTRQ_S32_F32(toint_b, ret2);
+-
+- intInputVal1 = vqmovn_s32(toint_a);
+- intInputVal2 = vqmovn_s32(toint_b);
+-
+- res = vcombine_s16(intInputVal1, intInputVal2);
+- vst1q_s16((int16_t*)outputVectorPtr, res);
+- outputVectorPtr += 8;
+- }
+-
+- for(i = neon_iters * 8; i < num_points * 2; i++)
+- {
+- aux = *inputVectorPtr++;
+- if(aux > max_val_f)
+- aux = max_val_f;
+- else if(aux < min_val_f)
+- aux = min_val_f;
+- *outputVectorPtr++ = (int16_t)rintf(aux);
+- }
++ for (i = 0; i < neon_iters; i++) {
++ a = vld1q_f32((const float32_t*)(inputVectorPtr));
++ inputVectorPtr += 4;
++ b = vld1q_f32((const float32_t*)(inputVectorPtr));
++ inputVectorPtr += 4;
++ __VOLK_PREFETCH(inputVectorPtr + 8);
++
++ ret1 = vmaxq_f32(vminq_f32(a, max_val), min_val);
++ ret2 = vmaxq_f32(vminq_f32(b, max_val), min_val);
++
++ // vcvtr takes into account the current rounding mode (as does rintf)
++ VCVTRQ_S32_F32(toint_a, ret1);
++ VCVTRQ_S32_F32(toint_b, ret2);
++
++ intInputVal1 = vqmovn_s32(toint_a);
++ intInputVal2 = vqmovn_s32(toint_b);
++
++ res = vcombine_s16(intInputVal1, intInputVal2);
++ vst1q_s16((int16_t*)outputVectorPtr, res);
++ outputVectorPtr += 8;
++ }
++
++ for (i = neon_iters * 8; i < num_points * 2; i++) {
++ aux = *inputVectorPtr++;
++ if (aux > max_val_f)
++ aux = max_val_f;
++ else if (aux < min_val_f)
++ aux = min_val_f;
++ *outputVectorPtr++ = (int16_t)rintf(aux);
++ }
+ }
+
+ #undef VCVTRQ_S32_F32
+@@ -229,7 +240,9 @@ static inline void volk_32fc_convert_16ic_neon(lv_16sc_t* outputVector, const lv
+ #if LV_HAVE_NEONV8
+ #include <arm_neon.h>
+
+-static inline void volk_32fc_convert_16ic_neonv8(lv_16sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points)
++static inline void volk_32fc_convert_16ic_neonv8(lv_16sc_t* outputVector,
++ const lv_32fc_t* inputVector,
++ unsigned int num_points)
+ {
+ const unsigned int neon_iters = num_points / 4;
+
+@@ -245,50 +258,49 @@ static inline void volk_32fc_convert_16ic_neonv8(lv_16sc_t* outputVector, const
+ const float32x4_t max_val = vmovq_n_f32(max_val_f);
+ float32x4_t ret1, ret2, a, b;
+
+- int32x4_t toint_a={0,0,0,0}, toint_b={0,0,0,0};
++ int32x4_t toint_a = { 0, 0, 0, 0 }, toint_b = { 0, 0, 0, 0 };
+ int16x4_t intInputVal1, intInputVal2;
+ int16x8_t res;
+
+- for(i = 0; i < neon_iters; i++)
+- {
+- a = vld1q_f32((const float32_t*)(inputVectorPtr));
+- inputVectorPtr += 4;
+- b = vld1q_f32((const float32_t*)(inputVectorPtr));
+- inputVectorPtr += 4;
+- __VOLK_PREFETCH(inputVectorPtr + 8);
+-
+- ret1 = vmaxq_f32(vminq_f32(a, max_val), min_val);
+- ret2 = vmaxq_f32(vminq_f32(b, max_val), min_val);
+-
+- // vrndiq takes into account the current rounding mode (as does rintf)
+- toint_a = vcvtq_s32_f32(vrndiq_f32(ret1));
+- toint_b = vcvtq_s32_f32(vrndiq_f32(ret2));
+-
+- intInputVal1 = vqmovn_s32(toint_a);
+- intInputVal2 = vqmovn_s32(toint_b);
+-
+- res = vcombine_s16(intInputVal1, intInputVal2);
+- vst1q_s16((int16_t*)outputVectorPtr, res);
+- outputVectorPtr += 8;
+- }
+-
+- for(i = neon_iters * 8; i < num_points * 2; i++)
+- {
+- aux = *inputVectorPtr++;
+- if(aux > max_val_f)
+- aux = max_val_f;
+- else if(aux < min_val_f)
+- aux = min_val_f;
+- *outputVectorPtr++ = (int16_t)rintf(aux);
+- }
++ for (i = 0; i < neon_iters; i++) {
++ a = vld1q_f32((const float32_t*)(inputVectorPtr));
++ inputVectorPtr += 4;
++ b = vld1q_f32((const float32_t*)(inputVectorPtr));
++ inputVectorPtr += 4;
++ __VOLK_PREFETCH(inputVectorPtr + 8);
++
++ ret1 = vmaxq_f32(vminq_f32(a, max_val), min_val);
++ ret2 = vmaxq_f32(vminq_f32(b, max_val), min_val);
++
++ // vrndiq takes into account the current rounding mode (as does rintf)
++ toint_a = vcvtq_s32_f32(vrndiq_f32(ret1));
++ toint_b = vcvtq_s32_f32(vrndiq_f32(ret2));
++
++ intInputVal1 = vqmovn_s32(toint_a);
++ intInputVal2 = vqmovn_s32(toint_b);
++
++ res = vcombine_s16(intInputVal1, intInputVal2);
++ vst1q_s16((int16_t*)outputVectorPtr, res);
++ outputVectorPtr += 8;
++ }
++
++ for (i = neon_iters * 8; i < num_points * 2; i++) {
++ aux = *inputVectorPtr++;
++ if (aux > max_val_f)
++ aux = max_val_f;
++ else if (aux < min_val_f)
++ aux = min_val_f;
++ *outputVectorPtr++ = (int16_t)rintf(aux);
++ }
+ }
+ #endif /* LV_HAVE_NEONV8 */
+
+
+-
+ #ifdef LV_HAVE_GENERIC
+
+-static inline void volk_32fc_convert_16ic_generic(lv_16sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points)
++static inline void volk_32fc_convert_16ic_generic(lv_16sc_t* outputVector,
++ const lv_32fc_t* inputVector,
++ unsigned int num_points)
+ {
+ float* inputVectorPtr = (float*)inputVector;
+ int16_t* outputVectorPtr = (int16_t*)outputVector;
+@@ -296,15 +308,14 @@ static inline void volk_32fc_convert_16ic_generic(lv_16sc_t* outputVector, const
+ const float max_val = (float)SHRT_MAX;
+ float aux;
+ unsigned int i;
+- for(i = 0; i < num_points * 2; i++)
+- {
+- aux = *inputVectorPtr++;
+- if(aux > max_val)
+- aux = max_val;
+- else if(aux < min_val)
+- aux = min_val;
+- *outputVectorPtr++ = (int16_t)rintf(aux);
+- }
++ for (i = 0; i < num_points * 2; i++) {
++ aux = *inputVectorPtr++;
++ if (aux > max_val)
++ aux = max_val;
++ else if (aux < min_val)
++ aux = min_val;
++ *outputVectorPtr++ = (int16_t)rintf(aux);
++ }
+ }
+ #endif /* LV_HAVE_GENERIC */
+
+@@ -313,15 +324,17 @@ static inline void volk_32fc_convert_16ic_generic(lv_16sc_t* outputVector, const
+ #ifndef INCLUDED_volk_32fc_convert_16ic_u_H
+ #define INCLUDED_volk_32fc_convert_16ic_u_H
+
++#include "volk/volk_complex.h"
+ #include <limits.h>
+ #include <math.h>
+-#include "volk/volk_complex.h"
+
+
+ #ifdef LV_HAVE_AVX2
+ #include <immintrin.h>
+
+-static inline void volk_32fc_convert_16ic_u_avx2(lv_16sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points)
++static inline void volk_32fc_convert_16ic_u_avx2(lv_16sc_t* outputVector,
++ const lv_32fc_t* inputVector,
++ unsigned int num_points)
+ {
+ const unsigned int avx_iters = num_points / 8;
+
+@@ -339,37 +352,35 @@ static inline void volk_32fc_convert_16ic_u_avx2(lv_16sc_t* outputVector, const
+ const __m256 vmax_val = _mm256_set1_ps(max_val);
+ unsigned int i;
+
+- for(i = 0; i < avx_iters; i++)
+- {
+- inputVal1 = _mm256_loadu_ps((float*)inputVectorPtr);
+- inputVectorPtr += 8;
+- inputVal2 = _mm256_loadu_ps((float*)inputVectorPtr);
+- inputVectorPtr += 8;
+- __VOLK_PREFETCH(inputVectorPtr + 16);
+-
+- // Clip
+- ret1 = _mm256_max_ps(_mm256_min_ps(inputVal1, vmax_val), vmin_val);
+- ret2 = _mm256_max_ps(_mm256_min_ps(inputVal2, vmax_val), vmin_val);
+-
+- intInputVal1 = _mm256_cvtps_epi32(ret1);
+- intInputVal2 = _mm256_cvtps_epi32(ret2);
+-
+- intInputVal1 = _mm256_packs_epi32(intInputVal1, intInputVal2);
+- intInputVal1 = _mm256_permute4x64_epi64(intInputVal1, 0xd8);
+-
+- _mm256_storeu_si256((__m256i*)outputVectorPtr, intInputVal1);
+- outputVectorPtr += 16;
+- }
+-
+- for(i = avx_iters * 16; i < num_points * 2; i++)
+- {
+- aux = *inputVectorPtr++;
+- if(aux > max_val)
+- aux = max_val;
+- else if(aux < min_val)
+- aux = min_val;
+- *outputVectorPtr++ = (int16_t)rintf(aux);
+- }
++ for (i = 0; i < avx_iters; i++) {
++ inputVal1 = _mm256_loadu_ps((float*)inputVectorPtr);
++ inputVectorPtr += 8;
++ inputVal2 = _mm256_loadu_ps((float*)inputVectorPtr);
++ inputVectorPtr += 8;
++ __VOLK_PREFETCH(inputVectorPtr + 16);
++
++ // Clip
++ ret1 = _mm256_max_ps(_mm256_min_ps(inputVal1, vmax_val), vmin_val);
++ ret2 = _mm256_max_ps(_mm256_min_ps(inputVal2, vmax_val), vmin_val);
++
++ intInputVal1 = _mm256_cvtps_epi32(ret1);
++ intInputVal2 = _mm256_cvtps_epi32(ret2);
++
++ intInputVal1 = _mm256_packs_epi32(intInputVal1, intInputVal2);
++ intInputVal1 = _mm256_permute4x64_epi64(intInputVal1, 0xd8);
++
++ _mm256_storeu_si256((__m256i*)outputVectorPtr, intInputVal1);
++ outputVectorPtr += 16;
++ }
++
++ for (i = avx_iters * 16; i < num_points * 2; i++) {
++ aux = *inputVectorPtr++;
++ if (aux > max_val)
++ aux = max_val;
++ else if (aux < min_val)
++ aux = min_val;
++ *outputVectorPtr++ = (int16_t)rintf(aux);
++ }
+ }
+ #endif /* LV_HAVE_AVX2 */
+
+@@ -377,7 +388,9 @@ static inline void volk_32fc_convert_16ic_u_avx2(lv_16sc_t* outputVector, const
+ #ifdef LV_HAVE_SSE2
+ #include <emmintrin.h>
+
+-static inline void volk_32fc_convert_16ic_u_sse2(lv_16sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points)
++static inline void volk_32fc_convert_16ic_u_sse2(lv_16sc_t* outputVector,
++ const lv_32fc_t* inputVector,
++ unsigned int num_points)
+ {
+ const unsigned int sse_iters = num_points / 4;
+
+@@ -395,36 +408,34 @@ static inline void volk_32fc_convert_16ic_u_sse2(lv_16sc_t* outputVector, const
+ const __m128 vmax_val = _mm_set_ps1(max_val);
+
+ unsigned int i;
+- for(i = 0; i < sse_iters; i++)
+- {
+- inputVal1 = _mm_loadu_ps((float*)inputVectorPtr);
+- inputVectorPtr += 4;
+- inputVal2 = _mm_loadu_ps((float*)inputVectorPtr);
+- inputVectorPtr += 4;
+- __VOLK_PREFETCH(inputVectorPtr + 8);
+-
+- // Clip
+- ret1 = _mm_max_ps(_mm_min_ps(inputVal1, vmax_val), vmin_val);
+- ret2 = _mm_max_ps(_mm_min_ps(inputVal2, vmax_val), vmin_val);
+-
+- intInputVal1 = _mm_cvtps_epi32(ret1);
+- intInputVal2 = _mm_cvtps_epi32(ret2);
+-
+- intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
+-
+- _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1);
+- outputVectorPtr += 8;
+- }
+-
+- for(i = sse_iters * 8; i < num_points * 2; i++)
+- {
+- aux = *inputVectorPtr++;
+- if(aux > max_val)
+- aux = max_val;
+- else if(aux < min_val)
+- aux = min_val;
+- *outputVectorPtr++ = (int16_t)rintf(aux);
+- }
++ for (i = 0; i < sse_iters; i++) {
++ inputVal1 = _mm_loadu_ps((float*)inputVectorPtr);
++ inputVectorPtr += 4;
++ inputVal2 = _mm_loadu_ps((float*)inputVectorPtr);
++ inputVectorPtr += 4;
++ __VOLK_PREFETCH(inputVectorPtr + 8);
++
++ // Clip
++ ret1 = _mm_max_ps(_mm_min_ps(inputVal1, vmax_val), vmin_val);
++ ret2 = _mm_max_ps(_mm_min_ps(inputVal2, vmax_val), vmin_val);
++
++ intInputVal1 = _mm_cvtps_epi32(ret1);
++ intInputVal2 = _mm_cvtps_epi32(ret2);
++
++ intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
++
++ _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1);
++ outputVectorPtr += 8;
++ }
++
++ for (i = sse_iters * 8; i < num_points * 2; i++) {
++ aux = *inputVectorPtr++;
++ if (aux > max_val)
++ aux = max_val;
++ else if (aux < min_val)
++ aux = min_val;
++ *outputVectorPtr++ = (int16_t)rintf(aux);
++ }
+ }
+ #endif /* LV_HAVE_SSE2 */
+ #endif /* INCLUDED_volk_32fc_convert_16ic_u_H */
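(Aside, not part of the patch: every variant in the file above implements the same clip-to-int16-range-then-rintf() conversion, so callers normally go through the dispatcher. A minimal usage sketch follows; the <volk/volk.h> include, the volk_32fc_convert_16ic() dispatcher name and the lv_cmake() helper are assumed from the usual VOLK install layout and naming, not introduced by this diff.)

/* Sketch only: values outside the int16 range are clipped, the rest are
 * rounded with rintf() under the current rounding mode. */
#include <stdio.h>
#include <volk/volk.h>

int main(void)
{
    lv_32fc_t in[4] = { lv_cmake(0.4f, -0.6f),
                        lv_cmake(40000.0f, -40000.0f), /* beyond SHRT_MAX / SHRT_MIN */
                        lv_cmake(1.5f, 2.5f),
                        lv_cmake(-1.0f, 1.0f) };
    lv_16sc_t out[4];

    volk_32fc_convert_16ic(out, in, 4); /* dispatcher picks an aligned, unaligned or generic path */

    const int16_t* o = (const int16_t*)out; /* same int16 aliasing the kernels use */
    for (int k = 0; k < 8; k++)
        printf("%d ", o[k]); /* expect 0 -1 32767 -32768 2 2 -1 1 with default rounding */
    printf("\n");
    return 0;
}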
+diff --git a/kernels/volk/volk_32fc_deinterleave_32f_x2.h b/kernels/volk/volk_32fc_deinterleave_32f_x2.h
+index 40cd664..1a06c48 100644
+--- a/kernels/volk/volk_32fc_deinterleave_32f_x2.h
++++ b/kernels/volk/volk_32fc_deinterleave_32f_x2.h
+@@ -30,8 +30,8 @@
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+- * void volk_32fc_deinterleave_32f_x2(float* iBuffer, float* qBuffer, const lv_32fc_t* complexVector, unsigned int num_points)
+- * \endcode
++ * void volk_32fc_deinterleave_32f_x2(float* iBuffer, float* qBuffer, const lv_32fc_t*
++ * complexVector, unsigned int num_points) \endcode
+ *
+ * \b Inputs
+ * \li complexVector: The complex input vector.
+@@ -78,86 +78,88 @@
+
+ #ifdef LV_HAVE_AVX
+ #include <immintrin.h>
+-static inline void
+-volk_32fc_deinterleave_32f_x2_a_avx(float* iBuffer, float* qBuffer, const lv_32fc_t* complexVector,
+- unsigned int num_points)
++static inline void volk_32fc_deinterleave_32f_x2_a_avx(float* iBuffer,
++ float* qBuffer,
++ const lv_32fc_t* complexVector,
++ unsigned int num_points)
+ {
+- const float* complexVectorPtr = (float*)complexVector;
+- float* iBufferPtr = iBuffer;
+- float* qBufferPtr = qBuffer;
+-
+- unsigned int number = 0;
+- // Mask for real and imaginary parts
+- const unsigned int eighthPoints = num_points / 8;
+- __m256 cplxValue1, cplxValue2, complex1, complex2, iValue, qValue;
+- for(;number < eighthPoints; number++){
+- cplxValue1 = _mm256_load_ps(complexVectorPtr);
+- complexVectorPtr += 8;
+-
+- cplxValue2 = _mm256_load_ps(complexVectorPtr);
+- complexVectorPtr += 8;
+-
+- complex1 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x20);
+- complex2 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x31);
+-
+- // Arrange in i1i2i3i4 format
+- iValue = _mm256_shuffle_ps(complex1, complex2, 0x88);
+- // Arrange in q1q2q3q4 format
+- qValue = _mm256_shuffle_ps(complex1, complex2, 0xdd);
+-
+- _mm256_store_ps(iBufferPtr, iValue);
+- _mm256_store_ps(qBufferPtr, qValue);
+-
+- iBufferPtr += 8;
+- qBufferPtr += 8;
+- }
+-
+- number = eighthPoints * 8;
+- for(; number < num_points; number++){
+- *iBufferPtr++ = *complexVectorPtr++;
+- *qBufferPtr++ = *complexVectorPtr++;
+- }
++ const float* complexVectorPtr = (float*)complexVector;
++ float* iBufferPtr = iBuffer;
++ float* qBufferPtr = qBuffer;
++
++ unsigned int number = 0;
++ // Mask for real and imaginary parts
++ const unsigned int eighthPoints = num_points / 8;
++ __m256 cplxValue1, cplxValue2, complex1, complex2, iValue, qValue;
++ for (; number < eighthPoints; number++) {
++ cplxValue1 = _mm256_load_ps(complexVectorPtr);
++ complexVectorPtr += 8;
++
++ cplxValue2 = _mm256_load_ps(complexVectorPtr);
++ complexVectorPtr += 8;
++
++ complex1 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x20);
++ complex2 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x31);
++
++ // Arrange in i1i2i3i4 format
++ iValue = _mm256_shuffle_ps(complex1, complex2, 0x88);
++ // Arrange in q1q2q3q4 format
++ qValue = _mm256_shuffle_ps(complex1, complex2, 0xdd);
++
++ _mm256_store_ps(iBufferPtr, iValue);
++ _mm256_store_ps(qBufferPtr, qValue);
++
++ iBufferPtr += 8;
++ qBufferPtr += 8;
++ }
++
++ number = eighthPoints * 8;
++ for (; number < num_points; number++) {
++ *iBufferPtr++ = *complexVectorPtr++;
++ *qBufferPtr++ = *complexVectorPtr++;
++ }
+ }
+ #endif /* LV_HAVE_AVX */
+
+ #ifdef LV_HAVE_SSE
+ #include <xmmintrin.h>
+
+-static inline void
+-volk_32fc_deinterleave_32f_x2_a_sse(float* iBuffer, float* qBuffer, const lv_32fc_t* complexVector,
+- unsigned int num_points)
++static inline void volk_32fc_deinterleave_32f_x2_a_sse(float* iBuffer,
++ float* qBuffer,
++ const lv_32fc_t* complexVector,
++ unsigned int num_points)
+ {
+- const float* complexVectorPtr = (float*)complexVector;
+- float* iBufferPtr = iBuffer;
+- float* qBufferPtr = qBuffer;
+-
+- unsigned int number = 0;
+- const unsigned int quarterPoints = num_points / 4;
+- __m128 cplxValue1, cplxValue2, iValue, qValue;
+- for(;number < quarterPoints; number++){
+- cplxValue1 = _mm_load_ps(complexVectorPtr);
+- complexVectorPtr += 4;
+-
+- cplxValue2 = _mm_load_ps(complexVectorPtr);
+- complexVectorPtr += 4;
+-
+- // Arrange in i1i2i3i4 format
+- iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0));
+- // Arrange in q1q2q3q4 format
+- qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1));
+-
+- _mm_store_ps(iBufferPtr, iValue);
+- _mm_store_ps(qBufferPtr, qValue);
+-
+- iBufferPtr += 4;
+- qBufferPtr += 4;
+- }
+-
+- number = quarterPoints * 4;
+- for(; number < num_points; number++){
+- *iBufferPtr++ = *complexVectorPtr++;
+- *qBufferPtr++ = *complexVectorPtr++;
+- }
++ const float* complexVectorPtr = (float*)complexVector;
++ float* iBufferPtr = iBuffer;
++ float* qBufferPtr = qBuffer;
++
++ unsigned int number = 0;
++ const unsigned int quarterPoints = num_points / 4;
++ __m128 cplxValue1, cplxValue2, iValue, qValue;
++ for (; number < quarterPoints; number++) {
++ cplxValue1 = _mm_load_ps(complexVectorPtr);
++ complexVectorPtr += 4;
++
++ cplxValue2 = _mm_load_ps(complexVectorPtr);
++ complexVectorPtr += 4;
++
++ // Arrange in i1i2i3i4 format
++ iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2, 0, 2, 0));
++ // Arrange in q1q2q3q4 format
++ qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3, 1, 3, 1));
++
++ _mm_store_ps(iBufferPtr, iValue);
++ _mm_store_ps(qBufferPtr, qValue);
++
++ iBufferPtr += 4;
++ qBufferPtr += 4;
++ }
++
++ number = quarterPoints * 4;
++ for (; number < num_points; number++) {
++ *iBufferPtr++ = *complexVectorPtr++;
++ *qBufferPtr++ = *complexVectorPtr++;
++ }
+ }
+ #endif /* LV_HAVE_SSE */
+
+@@ -165,48 +167,50 @@ volk_32fc_deinterleave_32f_x2_a_sse(float* iBuffer, float* qBuffer, const lv_32f
+ #ifdef LV_HAVE_NEON
+ #include <arm_neon.h>
+
+-static inline void
+-volk_32fc_deinterleave_32f_x2_neon(float* iBuffer, float* qBuffer, const lv_32fc_t* complexVector,
+- unsigned int num_points)
++static inline void volk_32fc_deinterleave_32f_x2_neon(float* iBuffer,
++ float* qBuffer,
++ const lv_32fc_t* complexVector,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+- unsigned int quarter_points = num_points / 4;
+- const float* complexVectorPtr = (float*)complexVector;
+- float* iBufferPtr = iBuffer;
+- float* qBufferPtr = qBuffer;
+- float32x4x2_t complexInput;
+-
+- for(number = 0; number < quarter_points; number++){
+- complexInput = vld2q_f32(complexVectorPtr);
+- vst1q_f32( iBufferPtr, complexInput.val[0] );
+- vst1q_f32( qBufferPtr, complexInput.val[1] );
+- complexVectorPtr += 8;
+- iBufferPtr += 4;
+- qBufferPtr += 4;
+- }
+-
+- for(number = quarter_points*4; number < num_points; number++){
+- *iBufferPtr++ = *complexVectorPtr++;
+- *qBufferPtr++ = *complexVectorPtr++;
+- }
++ unsigned int number = 0;
++ unsigned int quarter_points = num_points / 4;
++ const float* complexVectorPtr = (float*)complexVector;
++ float* iBufferPtr = iBuffer;
++ float* qBufferPtr = qBuffer;
++ float32x4x2_t complexInput;
++
++ for (number = 0; number < quarter_points; number++) {
++ complexInput = vld2q_f32(complexVectorPtr);
++ vst1q_f32(iBufferPtr, complexInput.val[0]);
++ vst1q_f32(qBufferPtr, complexInput.val[1]);
++ complexVectorPtr += 8;
++ iBufferPtr += 4;
++ qBufferPtr += 4;
++ }
++
++ for (number = quarter_points * 4; number < num_points; number++) {
++ *iBufferPtr++ = *complexVectorPtr++;
++ *qBufferPtr++ = *complexVectorPtr++;
++ }
+ }
+ #endif /* LV_HAVE_NEON */
+
+
+ #ifdef LV_HAVE_GENERIC
+
+-static inline void
+-volk_32fc_deinterleave_32f_x2_generic(float* iBuffer, float* qBuffer, const lv_32fc_t* complexVector,
+- unsigned int num_points)
++static inline void volk_32fc_deinterleave_32f_x2_generic(float* iBuffer,
++ float* qBuffer,
++ const lv_32fc_t* complexVector,
++ unsigned int num_points)
+ {
+- const float* complexVectorPtr = (float*)complexVector;
+- float* iBufferPtr = iBuffer;
+- float* qBufferPtr = qBuffer;
+- unsigned int number;
+- for(number = 0; number < num_points; number++){
+- *iBufferPtr++ = *complexVectorPtr++;
+- *qBufferPtr++ = *complexVectorPtr++;
+- }
++ const float* complexVectorPtr = (float*)complexVector;
++ float* iBufferPtr = iBuffer;
++ float* qBufferPtr = qBuffer;
++ unsigned int number;
++ for (number = 0; number < num_points; number++) {
++ *iBufferPtr++ = *complexVectorPtr++;
++ *qBufferPtr++ = *complexVectorPtr++;
++ }
+ }
+ #endif /* LV_HAVE_GENERIC */
+
+@@ -221,45 +225,46 @@ volk_32fc_deinterleave_32f_x2_generic(float* iBuffer, float* qBuffer, const lv_3
+
+ #ifdef LV_HAVE_AVX
+ #include <immintrin.h>
+-static inline void
+-volk_32fc_deinterleave_32f_x2_u_avx(float* iBuffer, float* qBuffer, const lv_32fc_t* complexVector,
+- unsigned int num_points)
++static inline void volk_32fc_deinterleave_32f_x2_u_avx(float* iBuffer,
++ float* qBuffer,
++ const lv_32fc_t* complexVector,
++ unsigned int num_points)
+ {
+- const float* complexVectorPtr = (float*)complexVector;
+- float* iBufferPtr = iBuffer;
+- float* qBufferPtr = qBuffer;
+-
+- unsigned int number = 0;
+- // Mask for real and imaginary parts
+- const unsigned int eighthPoints = num_points / 8;
+- __m256 cplxValue1, cplxValue2, complex1, complex2, iValue, qValue;
+- for(;number < eighthPoints; number++){
+- cplxValue1 = _mm256_loadu_ps(complexVectorPtr);
+- complexVectorPtr += 8;
+-
+- cplxValue2 = _mm256_loadu_ps(complexVectorPtr);
+- complexVectorPtr += 8;
+-
+- complex1 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x20);
+- complex2 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x31);
+-
+- // Arrange in i1i2i3i4 format
+- iValue = _mm256_shuffle_ps(complex1, complex2, 0x88);
+- // Arrange in q1q2q3q4 format
+- qValue = _mm256_shuffle_ps(complex1, complex2, 0xdd);
+-
+- _mm256_storeu_ps(iBufferPtr, iValue);
+- _mm256_storeu_ps(qBufferPtr, qValue);
+-
+- iBufferPtr += 8;
+- qBufferPtr += 8;
+- }
+-
+- number = eighthPoints * 8;
+- for(; number < num_points; number++){
+- *iBufferPtr++ = *complexVectorPtr++;
+- *qBufferPtr++ = *complexVectorPtr++;
+- }
++ const float* complexVectorPtr = (float*)complexVector;
++ float* iBufferPtr = iBuffer;
++ float* qBufferPtr = qBuffer;
++
++ unsigned int number = 0;
++ // Mask for real and imaginary parts
++ const unsigned int eighthPoints = num_points / 8;
++ __m256 cplxValue1, cplxValue2, complex1, complex2, iValue, qValue;
++ for (; number < eighthPoints; number++) {
++ cplxValue1 = _mm256_loadu_ps(complexVectorPtr);
++ complexVectorPtr += 8;
++
++ cplxValue2 = _mm256_loadu_ps(complexVectorPtr);
++ complexVectorPtr += 8;
++
++ complex1 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x20);
++ complex2 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x31);
++
++ // Arrange in i1i2i3i4 format
++ iValue = _mm256_shuffle_ps(complex1, complex2, 0x88);
++ // Arrange in q1q2q3q4 format
++ qValue = _mm256_shuffle_ps(complex1, complex2, 0xdd);
++
++ _mm256_storeu_ps(iBufferPtr, iValue);
++ _mm256_storeu_ps(qBufferPtr, qValue);
++
++ iBufferPtr += 8;
++ qBufferPtr += 8;
++ }
++
++ number = eighthPoints * 8;
++ for (; number < num_points; number++) {
++ *iBufferPtr++ = *complexVectorPtr++;
++ *qBufferPtr++ = *complexVectorPtr++;
++ }
+ }
+ #endif /* LV_HAVE_AVX */
+ #endif /* INCLUDED_volk_32fc_deinterleave_32f_x2_u_H */
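(Aside, not part of the patch: the kernels above only split interleaved I/Q samples into two planar float buffers, exactly as the dispatcher prototype in the header comment states. A minimal sketch, assuming the generated <volk/volk.h> dispatcher declaration and the lv_cmake() helper:)

/* Sketch only: deinterleave 4 complex samples into I[] and Q[]. */
#include <stdio.h>
#include <volk/volk.h>

int main(void)
{
    lv_32fc_t x[4] = { lv_cmake(1.0f, -1.0f), lv_cmake(2.0f, -2.0f),
                       lv_cmake(3.0f, -3.0f), lv_cmake(4.0f, -4.0f) };
    float i_buf[4];
    float q_buf[4];

    volk_32fc_deinterleave_32f_x2(i_buf, q_buf, x, 4);

    for (int n = 0; n < 4; n++)
        printf("I=%g Q=%g\n", i_buf[n], q_buf[n]); /* I = 1..4, Q = -1..-4 */
    return 0;
}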
+diff --git a/kernels/volk/volk_32fc_deinterleave_64f_x2.h b/kernels/volk/volk_32fc_deinterleave_64f_x2.h
+index 3e799cb..3b69c3c 100644
+--- a/kernels/volk/volk_32fc_deinterleave_64f_x2.h
++++ b/kernels/volk/volk_32fc_deinterleave_64f_x2.h
+@@ -79,110 +79,113 @@
+ #ifdef LV_HAVE_AVX
+ #include <immintrin.h>
+
+-static inline void
+-volk_32fc_deinterleave_64f_x2_u_avx(double *iBuffer, double *qBuffer,
+- const lv_32fc_t *complexVector,
+- unsigned int num_points) {
+- unsigned int number = 0;
+-
+- const float *complexVectorPtr = (float *)complexVector;
+- double *iBufferPtr = iBuffer;
+- double *qBufferPtr = qBuffer;
+-
+- const unsigned int quarterPoints = num_points / 4;
+- __m256 cplxValue;
+- __m128 complexH, complexL, fVal;
+- __m256d dVal;
+-
+- for (; number < quarterPoints; number++) {
+-
+- cplxValue = _mm256_loadu_ps(complexVectorPtr);
+- complexVectorPtr += 8;
+-
+- complexH = _mm256_extractf128_ps(cplxValue, 1);
+- complexL = _mm256_extractf128_ps(cplxValue, 0);
+-
+- // Arrange in i1i2i1i2 format
+- fVal = _mm_shuffle_ps(complexL, complexH, _MM_SHUFFLE(2, 0, 2, 0));
+- dVal = _mm256_cvtps_pd(fVal);
+- _mm256_storeu_pd(iBufferPtr, dVal);
+-
+- // Arrange in q1q2q1q2 format
+- fVal = _mm_shuffle_ps(complexL, complexH, _MM_SHUFFLE(3, 1, 3, 1));
+- dVal = _mm256_cvtps_pd(fVal);
+- _mm256_storeu_pd(qBufferPtr, dVal);
+-
+- iBufferPtr += 4;
+- qBufferPtr += 4;
+- }
+-
+- number = quarterPoints * 4;
+- for (; number < num_points; number++) {
+- *iBufferPtr++ = *complexVectorPtr++;
+- *qBufferPtr++ = *complexVectorPtr++;
+- }
++static inline void volk_32fc_deinterleave_64f_x2_u_avx(double* iBuffer,
++ double* qBuffer,
++ const lv_32fc_t* complexVector,
++ unsigned int num_points)
++{
++ unsigned int number = 0;
++
++ const float* complexVectorPtr = (float*)complexVector;
++ double* iBufferPtr = iBuffer;
++ double* qBufferPtr = qBuffer;
++
++ const unsigned int quarterPoints = num_points / 4;
++ __m256 cplxValue;
++ __m128 complexH, complexL, fVal;
++ __m256d dVal;
++
++ for (; number < quarterPoints; number++) {
++
++ cplxValue = _mm256_loadu_ps(complexVectorPtr);
++ complexVectorPtr += 8;
++
++ complexH = _mm256_extractf128_ps(cplxValue, 1);
++ complexL = _mm256_extractf128_ps(cplxValue, 0);
++
++ // Arrange in i1i2i1i2 format
++ fVal = _mm_shuffle_ps(complexL, complexH, _MM_SHUFFLE(2, 0, 2, 0));
++ dVal = _mm256_cvtps_pd(fVal);
++ _mm256_storeu_pd(iBufferPtr, dVal);
++
++ // Arrange in q1q2q1q2 format
++ fVal = _mm_shuffle_ps(complexL, complexH, _MM_SHUFFLE(3, 1, 3, 1));
++ dVal = _mm256_cvtps_pd(fVal);
++ _mm256_storeu_pd(qBufferPtr, dVal);
++
++ iBufferPtr += 4;
++ qBufferPtr += 4;
++ }
++
++ number = quarterPoints * 4;
++ for (; number < num_points; number++) {
++ *iBufferPtr++ = *complexVectorPtr++;
++ *qBufferPtr++ = *complexVectorPtr++;
++ }
+ }
+ #endif /* LV_HAVE_AVX */
+
+ #ifdef LV_HAVE_SSE2
+ #include <emmintrin.h>
+
+-static inline void
+-volk_32fc_deinterleave_64f_x2_u_sse2(double *iBuffer, double *qBuffer,
+- const lv_32fc_t *complexVector,
+- unsigned int num_points) {
+- unsigned int number = 0;
+-
+- const float *complexVectorPtr = (float *)complexVector;
+- double *iBufferPtr = iBuffer;
+- double *qBufferPtr = qBuffer;
+-
+- const unsigned int halfPoints = num_points / 2;
+- __m128 cplxValue, fVal;
+- __m128d dVal;
+-
+- for (; number < halfPoints; number++) {
+-
+- cplxValue = _mm_loadu_ps(complexVectorPtr);
+- complexVectorPtr += 4;
+-
+- // Arrange in i1i2i1i2 format
+- fVal = _mm_shuffle_ps(cplxValue, cplxValue, _MM_SHUFFLE(2, 0, 2, 0));
+- dVal = _mm_cvtps_pd(fVal);
+- _mm_storeu_pd(iBufferPtr, dVal);
+-
+- // Arrange in q1q2q1q2 format
+- fVal = _mm_shuffle_ps(cplxValue, cplxValue, _MM_SHUFFLE(3, 1, 3, 1));
+- dVal = _mm_cvtps_pd(fVal);
+- _mm_storeu_pd(qBufferPtr, dVal);
+-
+- iBufferPtr += 2;
+- qBufferPtr += 2;
+- }
+-
+- number = halfPoints * 2;
+- for (; number < num_points; number++) {
+- *iBufferPtr++ = *complexVectorPtr++;
+- *qBufferPtr++ = *complexVectorPtr++;
+- }
++static inline void volk_32fc_deinterleave_64f_x2_u_sse2(double* iBuffer,
++ double* qBuffer,
++ const lv_32fc_t* complexVector,
++ unsigned int num_points)
++{
++ unsigned int number = 0;
++
++ const float* complexVectorPtr = (float*)complexVector;
++ double* iBufferPtr = iBuffer;
++ double* qBufferPtr = qBuffer;
++
++ const unsigned int halfPoints = num_points / 2;
++ __m128 cplxValue, fVal;
++ __m128d dVal;
++
++ for (; number < halfPoints; number++) {
++
++ cplxValue = _mm_loadu_ps(complexVectorPtr);
++ complexVectorPtr += 4;
++
++ // Arrange in i1i2i1i2 format
++ fVal = _mm_shuffle_ps(cplxValue, cplxValue, _MM_SHUFFLE(2, 0, 2, 0));
++ dVal = _mm_cvtps_pd(fVal);
++ _mm_storeu_pd(iBufferPtr, dVal);
++
++ // Arrange in q1q2q1q2 format
++ fVal = _mm_shuffle_ps(cplxValue, cplxValue, _MM_SHUFFLE(3, 1, 3, 1));
++ dVal = _mm_cvtps_pd(fVal);
++ _mm_storeu_pd(qBufferPtr, dVal);
++
++ iBufferPtr += 2;
++ qBufferPtr += 2;
++ }
++
++ number = halfPoints * 2;
++ for (; number < num_points; number++) {
++ *iBufferPtr++ = *complexVectorPtr++;
++ *qBufferPtr++ = *complexVectorPtr++;
++ }
+ }
+ #endif /* LV_HAVE_SSE */
+
+ #ifdef LV_HAVE_GENERIC
+
+-static inline void
+-volk_32fc_deinterleave_64f_x2_generic(double *iBuffer, double *qBuffer,
+- const lv_32fc_t *complexVector,
+- unsigned int num_points) {
+- unsigned int number = 0;
+- const float *complexVectorPtr = (float *)complexVector;
+- double *iBufferPtr = iBuffer;
+- double *qBufferPtr = qBuffer;
+-
+- for (number = 0; number < num_points; number++) {
+- *iBufferPtr++ = (double)*complexVectorPtr++;
+- *qBufferPtr++ = (double)*complexVectorPtr++;
+- }
++static inline void volk_32fc_deinterleave_64f_x2_generic(double* iBuffer,
++ double* qBuffer,
++ const lv_32fc_t* complexVector,
++ unsigned int num_points)
++{
++ unsigned int number = 0;
++ const float* complexVectorPtr = (float*)complexVector;
++ double* iBufferPtr = iBuffer;
++ double* qBufferPtr = qBuffer;
++
++ for (number = 0; number < num_points; number++) {
++ *iBufferPtr++ = (double)*complexVectorPtr++;
++ *qBufferPtr++ = (double)*complexVectorPtr++;
++ }
+ }
+ #endif /* LV_HAVE_GENERIC */
+
+@@ -196,146 +199,150 @@ volk_32fc_deinterleave_64f_x2_generic(double *iBuffer, double *qBuffer,
+ #ifdef LV_HAVE_AVX
+ #include <immintrin.h>
+
+-static inline void
+-volk_32fc_deinterleave_64f_x2_a_avx(double *iBuffer, double *qBuffer,
+- const lv_32fc_t *complexVector,
+- unsigned int num_points) {
+- unsigned int number = 0;
+-
+- const float *complexVectorPtr = (float *)complexVector;
+- double *iBufferPtr = iBuffer;
+- double *qBufferPtr = qBuffer;
+-
+- const unsigned int quarterPoints = num_points / 4;
+- __m256 cplxValue;
+- __m128 complexH, complexL, fVal;
+- __m256d dVal;
+-
+- for (; number < quarterPoints; number++) {
+-
+- cplxValue = _mm256_load_ps(complexVectorPtr);
+- complexVectorPtr += 8;
+-
+- complexH = _mm256_extractf128_ps(cplxValue, 1);
+- complexL = _mm256_extractf128_ps(cplxValue, 0);
+-
+- // Arrange in i1i2i1i2 format
+- fVal = _mm_shuffle_ps(complexL, complexH, _MM_SHUFFLE(2, 0, 2, 0));
+- dVal = _mm256_cvtps_pd(fVal);
+- _mm256_store_pd(iBufferPtr, dVal);
+-
+- // Arrange in q1q2q1q2 format
+- fVal = _mm_shuffle_ps(complexL, complexH, _MM_SHUFFLE(3, 1, 3, 1));
+- dVal = _mm256_cvtps_pd(fVal);
+- _mm256_store_pd(qBufferPtr, dVal);
+-
+- iBufferPtr += 4;
+- qBufferPtr += 4;
+- }
+-
+- number = quarterPoints * 4;
+- for (; number < num_points; number++) {
+- *iBufferPtr++ = *complexVectorPtr++;
+- *qBufferPtr++ = *complexVectorPtr++;
+- }
++static inline void volk_32fc_deinterleave_64f_x2_a_avx(double* iBuffer,
++ double* qBuffer,
++ const lv_32fc_t* complexVector,
++ unsigned int num_points)
++{
++ unsigned int number = 0;
++
++ const float* complexVectorPtr = (float*)complexVector;
++ double* iBufferPtr = iBuffer;
++ double* qBufferPtr = qBuffer;
++
++ const unsigned int quarterPoints = num_points / 4;
++ __m256 cplxValue;
++ __m128 complexH, complexL, fVal;
++ __m256d dVal;
++
++ for (; number < quarterPoints; number++) {
++
++ cplxValue = _mm256_load_ps(complexVectorPtr);
++ complexVectorPtr += 8;
++
++ complexH = _mm256_extractf128_ps(cplxValue, 1);
++ complexL = _mm256_extractf128_ps(cplxValue, 0);
++
++ // Arrange in i1i2i1i2 format
++ fVal = _mm_shuffle_ps(complexL, complexH, _MM_SHUFFLE(2, 0, 2, 0));
++ dVal = _mm256_cvtps_pd(fVal);
++ _mm256_store_pd(iBufferPtr, dVal);
++
++ // Arrange in q1q2q1q2 format
++ fVal = _mm_shuffle_ps(complexL, complexH, _MM_SHUFFLE(3, 1, 3, 1));
++ dVal = _mm256_cvtps_pd(fVal);
++ _mm256_store_pd(qBufferPtr, dVal);
++
++ iBufferPtr += 4;
++ qBufferPtr += 4;
++ }
++
++ number = quarterPoints * 4;
++ for (; number < num_points; number++) {
++ *iBufferPtr++ = *complexVectorPtr++;
++ *qBufferPtr++ = *complexVectorPtr++;
++ }
+ }
+ #endif /* LV_HAVE_AVX */
+
+ #ifdef LV_HAVE_SSE2
+ #include <emmintrin.h>
+
+-static inline void
+-volk_32fc_deinterleave_64f_x2_a_sse2(double *iBuffer, double *qBuffer,
+- const lv_32fc_t *complexVector,
+- unsigned int num_points) {
+- unsigned int number = 0;
+-
+- const float *complexVectorPtr = (float *)complexVector;
+- double *iBufferPtr = iBuffer;
+- double *qBufferPtr = qBuffer;
+-
+- const unsigned int halfPoints = num_points / 2;
+- __m128 cplxValue, fVal;
+- __m128d dVal;
+-
+- for (; number < halfPoints; number++) {
+-
+- cplxValue = _mm_load_ps(complexVectorPtr);
+- complexVectorPtr += 4;
+-
+- // Arrange in i1i2i1i2 format
+- fVal = _mm_shuffle_ps(cplxValue, cplxValue, _MM_SHUFFLE(2, 0, 2, 0));
+- dVal = _mm_cvtps_pd(fVal);
+- _mm_store_pd(iBufferPtr, dVal);
+-
+- // Arrange in q1q2q1q2 format
+- fVal = _mm_shuffle_ps(cplxValue, cplxValue, _MM_SHUFFLE(3, 1, 3, 1));
+- dVal = _mm_cvtps_pd(fVal);
+- _mm_store_pd(qBufferPtr, dVal);
+-
+- iBufferPtr += 2;
+- qBufferPtr += 2;
+- }
+-
+- number = halfPoints * 2;
+- for (; number < num_points; number++) {
+- *iBufferPtr++ = *complexVectorPtr++;
+- *qBufferPtr++ = *complexVectorPtr++;
+- }
++static inline void volk_32fc_deinterleave_64f_x2_a_sse2(double* iBuffer,
++ double* qBuffer,
++ const lv_32fc_t* complexVector,
++ unsigned int num_points)
++{
++ unsigned int number = 0;
++
++ const float* complexVectorPtr = (float*)complexVector;
++ double* iBufferPtr = iBuffer;
++ double* qBufferPtr = qBuffer;
++
++ const unsigned int halfPoints = num_points / 2;
++ __m128 cplxValue, fVal;
++ __m128d dVal;
++
++ for (; number < halfPoints; number++) {
++
++ cplxValue = _mm_load_ps(complexVectorPtr);
++ complexVectorPtr += 4;
++
++ // Arrange in i1i2i1i2 format
++ fVal = _mm_shuffle_ps(cplxValue, cplxValue, _MM_SHUFFLE(2, 0, 2, 0));
++ dVal = _mm_cvtps_pd(fVal);
++ _mm_store_pd(iBufferPtr, dVal);
++
++ // Arrange in q1q2q1q2 format
++ fVal = _mm_shuffle_ps(cplxValue, cplxValue, _MM_SHUFFLE(3, 1, 3, 1));
++ dVal = _mm_cvtps_pd(fVal);
++ _mm_store_pd(qBufferPtr, dVal);
++
++ iBufferPtr += 2;
++ qBufferPtr += 2;
++ }
++
++ number = halfPoints * 2;
++ for (; number < num_points; number++) {
++ *iBufferPtr++ = *complexVectorPtr++;
++ *qBufferPtr++ = *complexVectorPtr++;
++ }
+ }
+ #endif /* LV_HAVE_SSE */
+
+ #ifdef LV_HAVE_GENERIC
+
+-static inline void
+-volk_32fc_deinterleave_64f_x2_a_generic(double *iBuffer, double *qBuffer,
+- const lv_32fc_t *complexVector,
+- unsigned int num_points) {
+- unsigned int number = 0;
+- const float *complexVectorPtr = (float *)complexVector;
+- double *iBufferPtr = iBuffer;
+- double *qBufferPtr = qBuffer;
+-
+- for (number = 0; number < num_points; number++) {
+- *iBufferPtr++ = (double)*complexVectorPtr++;
+- *qBufferPtr++ = (double)*complexVectorPtr++;
+- }
++static inline void volk_32fc_deinterleave_64f_x2_a_generic(double* iBuffer,
++ double* qBuffer,
++ const lv_32fc_t* complexVector,
++ unsigned int num_points)
++{
++ unsigned int number = 0;
++ const float* complexVectorPtr = (float*)complexVector;
++ double* iBufferPtr = iBuffer;
++ double* qBufferPtr = qBuffer;
++
++ for (number = 0; number < num_points; number++) {
++ *iBufferPtr++ = (double)*complexVectorPtr++;
++ *qBufferPtr++ = (double)*complexVectorPtr++;
++ }
+ }
+ #endif /* LV_HAVE_GENERIC */
+
+ #ifdef LV_HAVE_NEONV8
+ #include <arm_neon.h>
+
+-static inline void
+-volk_32fc_deinterleave_64f_x2_neon(double *iBuffer, double *qBuffer,
+- const lv_32fc_t *complexVector,
+- unsigned int num_points) {
+- unsigned int number = 0;
+- unsigned int half_points = num_points / 2;
+- const float *complexVectorPtr = (float *)complexVector;
+- double *iBufferPtr = iBuffer;
+- double *qBufferPtr = qBuffer;
+- float32x2x2_t complexInput;
+- float64x2_t iVal, qVal;
+-
+- for (number = 0; number < half_points; number++) {
+- complexInput = vld2_f32(complexVectorPtr);
+-
+- iVal = vcvt_f64_f32(complexInput.val[0]);
+- qVal = vcvt_f64_f32(complexInput.val[1]);
+-
+- vst1q_f64(iBufferPtr, iVal);
+- vst1q_f64(qBufferPtr, qVal);
+-
+- complexVectorPtr += 4;
+- iBufferPtr += 2;
+- qBufferPtr += 2;
+- }
+-
+- for (number = half_points * 2; number < num_points; number++) {
+- *iBufferPtr++ = (double)*complexVectorPtr++;
+- *qBufferPtr++ = (double)*complexVectorPtr++;
+- }
++static inline void volk_32fc_deinterleave_64f_x2_neon(double* iBuffer,
++ double* qBuffer,
++ const lv_32fc_t* complexVector,
++ unsigned int num_points)
++{
++ unsigned int number = 0;
++ unsigned int half_points = num_points / 2;
++ const float* complexVectorPtr = (float*)complexVector;
++ double* iBufferPtr = iBuffer;
++ double* qBufferPtr = qBuffer;
++ float32x2x2_t complexInput;
++ float64x2_t iVal, qVal;
++
++ for (number = 0; number < half_points; number++) {
++ complexInput = vld2_f32(complexVectorPtr);
++
++ iVal = vcvt_f64_f32(complexInput.val[0]);
++ qVal = vcvt_f64_f32(complexInput.val[1]);
++
++ vst1q_f64(iBufferPtr, iVal);
++ vst1q_f64(qBufferPtr, qVal);
++
++ complexVectorPtr += 4;
++ iBufferPtr += 2;
++ qBufferPtr += 2;
++ }
++
++ for (number = half_points * 2; number < num_points; number++) {
++ *iBufferPtr++ = (double)*complexVectorPtr++;
++ *qBufferPtr++ = (double)*complexVectorPtr++;
++ }
+ }
+ #endif /* LV_HAVE_NEONV8 */
+
+diff --git a/kernels/volk/volk_32fc_deinterleave_imag_32f.h b/kernels/volk/volk_32fc_deinterleave_imag_32f.h
+index 13f9764..e3dfa12 100644
+--- a/kernels/volk/volk_32fc_deinterleave_imag_32f.h
++++ b/kernels/volk/volk_32fc_deinterleave_imag_32f.h
+@@ -30,8 +30,8 @@
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+- * void volk_32fc_deinterleave_image_32f(float* qBuffer, const lv_32fc_t* complexVector, unsigned int num_points)
+- * \endcode
++ * void volk_32fc_deinterleave_image_32f(float* qBuffer, const lv_32fc_t* complexVector,
++ * unsigned int num_points) \endcode
+ *
+ * \b Inputs
+ * \li complexVector: The complex input vector.
+@@ -76,121 +76,121 @@
+ #ifdef LV_HAVE_AVX
+ #include <immintrin.h>
+
+-static inline void
+-volk_32fc_deinterleave_imag_32f_a_avx(float* qBuffer, const lv_32fc_t* complexVector,
+- unsigned int num_points)
++static inline void volk_32fc_deinterleave_imag_32f_a_avx(float* qBuffer,
++ const lv_32fc_t* complexVector,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+- const unsigned int eighthPoints = num_points / 8;
+- const float* complexVectorPtr = (const float*)complexVector;
+- float* qBufferPtr = qBuffer;
++ unsigned int number = 0;
++ const unsigned int eighthPoints = num_points / 8;
++ const float* complexVectorPtr = (const float*)complexVector;
++ float* qBufferPtr = qBuffer;
+
+- __m256 cplxValue1, cplxValue2, complex1, complex2, qValue;
+- for(;number < eighthPoints; number++){
++ __m256 cplxValue1, cplxValue2, complex1, complex2, qValue;
++ for (; number < eighthPoints; number++) {
+
+- cplxValue1 = _mm256_load_ps(complexVectorPtr);
+- complexVectorPtr += 8;
++ cplxValue1 = _mm256_load_ps(complexVectorPtr);
++ complexVectorPtr += 8;
+
+- cplxValue2 = _mm256_load_ps(complexVectorPtr);
+- complexVectorPtr += 8;
++ cplxValue2 = _mm256_load_ps(complexVectorPtr);
++ complexVectorPtr += 8;
+
+- complex1 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x20);
+- complex2 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x31);
++ complex1 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x20);
++ complex2 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x31);
+
+- // Arrange in q1q2q3q4 format
+- qValue = _mm256_shuffle_ps(complex1, complex2, 0xdd);
++ // Arrange in q1q2q3q4 format
++ qValue = _mm256_shuffle_ps(complex1, complex2, 0xdd);
+
+- _mm256_store_ps(qBufferPtr, qValue);
++ _mm256_store_ps(qBufferPtr, qValue);
+
+- qBufferPtr += 8;
+- }
++ qBufferPtr += 8;
++ }
+
+- number = eighthPoints * 8;
+- for(; number < num_points; number++){
+- complexVectorPtr++;
+- *qBufferPtr++ = *complexVectorPtr++;
+- }
++ number = eighthPoints * 8;
++ for (; number < num_points; number++) {
++ complexVectorPtr++;
++ *qBufferPtr++ = *complexVectorPtr++;
++ }
+ }
+ #endif /* LV_HAVE_AVX */
+
+ #ifdef LV_HAVE_SSE
+ #include <xmmintrin.h>
+
+-static inline void
+-volk_32fc_deinterleave_imag_32f_a_sse(float* qBuffer, const lv_32fc_t* complexVector,
+- unsigned int num_points)
++static inline void volk_32fc_deinterleave_imag_32f_a_sse(float* qBuffer,
++ const lv_32fc_t* complexVector,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+- const unsigned int quarterPoints = num_points / 4;
++ unsigned int number = 0;
++ const unsigned int quarterPoints = num_points / 4;
+
+- const float* complexVectorPtr = (const float*)complexVector;
+- float* qBufferPtr = qBuffer;
++ const float* complexVectorPtr = (const float*)complexVector;
++ float* qBufferPtr = qBuffer;
+
+- __m128 cplxValue1, cplxValue2, iValue;
+- for(;number < quarterPoints; number++){
++ __m128 cplxValue1, cplxValue2, iValue;
++ for (; number < quarterPoints; number++) {
+
+- cplxValue1 = _mm_load_ps(complexVectorPtr);
+- complexVectorPtr += 4;
++ cplxValue1 = _mm_load_ps(complexVectorPtr);
++ complexVectorPtr += 4;
+
+- cplxValue2 = _mm_load_ps(complexVectorPtr);
+- complexVectorPtr += 4;
++ cplxValue2 = _mm_load_ps(complexVectorPtr);
++ complexVectorPtr += 4;
+
+- // Arrange in q1q2q3q4 format
+- iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1));
++ // Arrange in q1q2q3q4 format
++ iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3, 1, 3, 1));
+
+- _mm_store_ps(qBufferPtr, iValue);
++ _mm_store_ps(qBufferPtr, iValue);
+
+- qBufferPtr += 4;
+- }
++ qBufferPtr += 4;
++ }
+
+- number = quarterPoints * 4;
+- for(; number < num_points; number++){
+- complexVectorPtr++;
+- *qBufferPtr++ = *complexVectorPtr++;
+- }
++ number = quarterPoints * 4;
++ for (; number < num_points; number++) {
++ complexVectorPtr++;
++ *qBufferPtr++ = *complexVectorPtr++;
++ }
+ }
+ #endif /* LV_HAVE_SSE */
+
+ #ifdef LV_HAVE_NEON
+ #include <arm_neon.h>
+
+-static inline void
+-volk_32fc_deinterleave_imag_32f_neon(float* qBuffer, const lv_32fc_t* complexVector,
+- unsigned int num_points)
++static inline void volk_32fc_deinterleave_imag_32f_neon(float* qBuffer,
++ const lv_32fc_t* complexVector,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+- unsigned int quarter_points = num_points / 4;
+- const float* complexVectorPtr = (float*)complexVector;
+- float* qBufferPtr = qBuffer;
+- float32x4x2_t complexInput;
+-
+- for(number = 0; number < quarter_points; number++){
+- complexInput = vld2q_f32(complexVectorPtr);
+- vst1q_f32( qBufferPtr, complexInput.val[1] );
+- complexVectorPtr += 8;
+- qBufferPtr += 4;
+- }
+-
+- for(number = quarter_points*4; number < num_points; number++){
+- complexVectorPtr++;
+- *qBufferPtr++ = *complexVectorPtr++;
+- }
++ unsigned int number = 0;
++ unsigned int quarter_points = num_points / 4;
++ const float* complexVectorPtr = (float*)complexVector;
++ float* qBufferPtr = qBuffer;
++ float32x4x2_t complexInput;
++
++ for (number = 0; number < quarter_points; number++) {
++ complexInput = vld2q_f32(complexVectorPtr);
++ vst1q_f32(qBufferPtr, complexInput.val[1]);
++ complexVectorPtr += 8;
++ qBufferPtr += 4;
++ }
++
++ for (number = quarter_points * 4; number < num_points; number++) {
++ complexVectorPtr++;
++ *qBufferPtr++ = *complexVectorPtr++;
++ }
+ }
+ #endif /* LV_HAVE_NEON */
+
+ #ifdef LV_HAVE_GENERIC
+
+-static inline void
+-volk_32fc_deinterleave_imag_32f_generic(float* qBuffer, const lv_32fc_t* complexVector,
+- unsigned int num_points)
++static inline void volk_32fc_deinterleave_imag_32f_generic(float* qBuffer,
++ const lv_32fc_t* complexVector,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+- const float* complexVectorPtr = (float*)complexVector;
+- float* qBufferPtr = qBuffer;
+- for(number = 0; number < num_points; number++){
+- complexVectorPtr++;
+- *qBufferPtr++ = *complexVectorPtr++;
+- }
++ unsigned int number = 0;
++ const float* complexVectorPtr = (float*)complexVector;
++ float* qBufferPtr = qBuffer;
++ for (number = 0; number < num_points; number++) {
++ complexVectorPtr++;
++ *qBufferPtr++ = *complexVectorPtr++;
++ }
+ }
+ #endif /* LV_HAVE_GENERIC */
+
+@@ -206,40 +206,40 @@ volk_32fc_deinterleave_imag_32f_generic(float* qBuffer, const lv_32fc_t* complex
+ #ifdef LV_HAVE_AVX
+ #include <immintrin.h>
+
+-static inline void
+-volk_32fc_deinterleave_imag_32f_u_avx(float* qBuffer, const lv_32fc_t* complexVector,
+- unsigned int num_points)
++static inline void volk_32fc_deinterleave_imag_32f_u_avx(float* qBuffer,
++ const lv_32fc_t* complexVector,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+- const unsigned int eighthPoints = num_points / 8;
+- const float* complexVectorPtr = (const float*)complexVector;
+- float* qBufferPtr = qBuffer;
++ unsigned int number = 0;
++ const unsigned int eighthPoints = num_points / 8;
++ const float* complexVectorPtr = (const float*)complexVector;
++ float* qBufferPtr = qBuffer;
+
+- __m256 cplxValue1, cplxValue2, complex1, complex2, qValue;
+- for(;number < eighthPoints; number++){
++ __m256 cplxValue1, cplxValue2, complex1, complex2, qValue;
++ for (; number < eighthPoints; number++) {
+
+- cplxValue1 = _mm256_loadu_ps(complexVectorPtr);
+- complexVectorPtr += 8;
++ cplxValue1 = _mm256_loadu_ps(complexVectorPtr);
++ complexVectorPtr += 8;
+
+- cplxValue2 = _mm256_loadu_ps(complexVectorPtr);
+- complexVectorPtr += 8;
++ cplxValue2 = _mm256_loadu_ps(complexVectorPtr);
++ complexVectorPtr += 8;
+
+- complex1 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x20);
+- complex2 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x31);
++ complex1 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x20);
++ complex2 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x31);
+
+- // Arrange in q1q2q3q4 format
+- qValue = _mm256_shuffle_ps(complex1, complex2, 0xdd);
++ // Arrange in q1q2q3q4 format
++ qValue = _mm256_shuffle_ps(complex1, complex2, 0xdd);
+
+- _mm256_storeu_ps(qBufferPtr, qValue);
++ _mm256_storeu_ps(qBufferPtr, qValue);
+
+- qBufferPtr += 8;
+- }
++ qBufferPtr += 8;
++ }
+
+- number = eighthPoints * 8;
+- for(; number < num_points; number++){
+- complexVectorPtr++;
+- *qBufferPtr++ = *complexVectorPtr++;
+- }
++ number = eighthPoints * 8;
++ for (; number < num_points; number++) {
++ complexVectorPtr++;
++ *qBufferPtr++ = *complexVectorPtr++;
++ }
+ }
+ #endif /* LV_HAVE_AVX */
+ #endif /* INCLUDED_volk_32fc_deinterleave_imag_32f_u_H */
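(Aside, not part of the patch: the protokernel names above imply the dispatcher volk_32fc_deinterleave_imag_32f(); the "image" spelling in the header comment looks like a pre-existing upstream typo that this reformatting diff does not touch. A minimal sketch, with the <volk/volk.h> include assumed as before:)

/* Sketch only: keep just the Q (imaginary) components. */
#include <stdio.h>
#include <volk/volk.h>

int main(void)
{
    lv_32fc_t x[3] = { lv_cmake(1.0f, 10.0f), lv_cmake(2.0f, 20.0f), lv_cmake(3.0f, 30.0f) };
    float q[3];

    volk_32fc_deinterleave_imag_32f(q, x, 3);

    printf("%g %g %g\n", q[0], q[1], q[2]); /* 10 20 30 */
    return 0;
}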
+diff --git a/kernels/volk/volk_32fc_deinterleave_real_32f.h b/kernels/volk/volk_32fc_deinterleave_real_32f.h
+index 92a94d3..2526a16 100644
+--- a/kernels/volk/volk_32fc_deinterleave_real_32f.h
++++ b/kernels/volk/volk_32fc_deinterleave_real_32f.h
+@@ -30,8 +30,8 @@
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+- * void volk_32fc_deinterleave_real_32f(float* iBuffer, const lv_32fc_t* complexVector, unsigned int num_points)
+- * \endcode
++ * void volk_32fc_deinterleave_real_32f(float* iBuffer, const lv_32fc_t* complexVector,
++ * unsigned int num_points) \endcode
+ *
+ * \b Inputs
+ * \li complexVector: The complex input vector.
+@@ -76,96 +76,96 @@
+ #ifdef LV_HAVE_AVX2
+ #include <immintrin.h>
+
+-static inline void
+-volk_32fc_deinterleave_real_32f_a_avx2(float* iBuffer, const lv_32fc_t* complexVector,
+- unsigned int num_points)
++static inline void volk_32fc_deinterleave_real_32f_a_avx2(float* iBuffer,
++ const lv_32fc_t* complexVector,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+- const unsigned int eighthPoints = num_points / 8;
++ unsigned int number = 0;
++ const unsigned int eighthPoints = num_points / 8;
+
+- const float* complexVectorPtr = (const float*)complexVector;
+- float* iBufferPtr = iBuffer;
++ const float* complexVectorPtr = (const float*)complexVector;
++ float* iBufferPtr = iBuffer;
+
+- __m256 cplxValue1, cplxValue2;
+- __m256 iValue;
+- __m256i idx = _mm256_set_epi32(7,6,3,2,5,4,1,0);
+- for(;number < eighthPoints; number++){
++ __m256 cplxValue1, cplxValue2;
++ __m256 iValue;
++ __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
++ for (; number < eighthPoints; number++) {
+
+- cplxValue1 = _mm256_load_ps(complexVectorPtr);
+- complexVectorPtr += 8;
++ cplxValue1 = _mm256_load_ps(complexVectorPtr);
++ complexVectorPtr += 8;
+
+- cplxValue2 = _mm256_load_ps(complexVectorPtr);
+- complexVectorPtr += 8;
++ cplxValue2 = _mm256_load_ps(complexVectorPtr);
++ complexVectorPtr += 8;
+
+- // Arrange in i1i2i3i4 format
+- iValue = _mm256_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0));
+- iValue = _mm256_permutevar8x32_ps(iValue,idx);
++ // Arrange in i1i2i3i4 format
++ iValue = _mm256_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2, 0, 2, 0));
++ iValue = _mm256_permutevar8x32_ps(iValue, idx);
+
+- _mm256_store_ps(iBufferPtr, iValue);
++ _mm256_store_ps(iBufferPtr, iValue);
+
+- iBufferPtr += 8;
+- }
++ iBufferPtr += 8;
++ }
+
+- number = eighthPoints * 8;
+- for(; number < num_points; number++){
+- *iBufferPtr++ = *complexVectorPtr++;
+- complexVectorPtr++;
+- }
++ number = eighthPoints * 8;
++ for (; number < num_points; number++) {
++ *iBufferPtr++ = *complexVectorPtr++;
++ complexVectorPtr++;
++ }
+ }
+ #endif /* LV_HAVE_AVX2 */
+
+ #ifdef LV_HAVE_SSE
+ #include <xmmintrin.h>
+
+-static inline void
+-volk_32fc_deinterleave_real_32f_a_sse(float* iBuffer, const lv_32fc_t* complexVector,
+- unsigned int num_points)
++static inline void volk_32fc_deinterleave_real_32f_a_sse(float* iBuffer,
++ const lv_32fc_t* complexVector,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+- const unsigned int quarterPoints = num_points / 4;
++ unsigned int number = 0;
++ const unsigned int quarterPoints = num_points / 4;
+
+- const float* complexVectorPtr = (const float*)complexVector;
+- float* iBufferPtr = iBuffer;
++ const float* complexVectorPtr = (const float*)complexVector;
++ float* iBufferPtr = iBuffer;
+
+- __m128 cplxValue1, cplxValue2, iValue;
+- for(;number < quarterPoints; number++){
++ __m128 cplxValue1, cplxValue2, iValue;
++ for (; number < quarterPoints; number++) {
+
+- cplxValue1 = _mm_load_ps(complexVectorPtr);
+- complexVectorPtr += 4;
++ cplxValue1 = _mm_load_ps(complexVectorPtr);
++ complexVectorPtr += 4;
+
+- cplxValue2 = _mm_load_ps(complexVectorPtr);
+- complexVectorPtr += 4;
++ cplxValue2 = _mm_load_ps(complexVectorPtr);
++ complexVectorPtr += 4;
+
+- // Arrange in i1i2i3i4 format
+- iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0));
++ // Arrange in i1i2i3i4 format
++ iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2, 0, 2, 0));
+
+- _mm_store_ps(iBufferPtr, iValue);
++ _mm_store_ps(iBufferPtr, iValue);
+
+- iBufferPtr += 4;
+- }
++ iBufferPtr += 4;
++ }
+
+- number = quarterPoints * 4;
+- for(; number < num_points; number++){
+- *iBufferPtr++ = *complexVectorPtr++;
+- complexVectorPtr++;
+- }
++ number = quarterPoints * 4;
++ for (; number < num_points; number++) {
++ *iBufferPtr++ = *complexVectorPtr++;
++ complexVectorPtr++;
++ }
+ }
+ #endif /* LV_HAVE_SSE */
+
+
+ #ifdef LV_HAVE_GENERIC
+
+-static inline void
+-volk_32fc_deinterleave_real_32f_generic(float* iBuffer, const lv_32fc_t* complexVector,
+- unsigned int num_points)
++static inline void volk_32fc_deinterleave_real_32f_generic(float* iBuffer,
++ const lv_32fc_t* complexVector,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+- const float* complexVectorPtr = (float*)complexVector;
+- float* iBufferPtr = iBuffer;
+- for(number = 0; number < num_points; number++){
+- *iBufferPtr++ = *complexVectorPtr++;
+- complexVectorPtr++;
+- }
++ unsigned int number = 0;
++ const float* complexVectorPtr = (float*)complexVector;
++ float* iBufferPtr = iBuffer;
++ for (number = 0; number < num_points; number++) {
++ *iBufferPtr++ = *complexVectorPtr++;
++ complexVectorPtr++;
++ }
+ }
+ #endif /* LV_HAVE_GENERIC */
+
+@@ -173,27 +173,27 @@ volk_32fc_deinterleave_real_32f_generic(float* iBuffer, const lv_32fc_t* complex
+ #ifdef LV_HAVE_NEON
+ #include <arm_neon.h>
+
+-static inline void
+-volk_32fc_deinterleave_real_32f_neon(float* iBuffer, const lv_32fc_t* complexVector,
+- unsigned int num_points)
++static inline void volk_32fc_deinterleave_real_32f_neon(float* iBuffer,
++ const lv_32fc_t* complexVector,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+- unsigned int quarter_points = num_points / 4;
+- const float* complexVectorPtr = (float*)complexVector;
+- float* iBufferPtr = iBuffer;
+- float32x4x2_t complexInput;
+-
+- for(number = 0; number < quarter_points; number++){
+- complexInput = vld2q_f32(complexVectorPtr);
+- vst1q_f32( iBufferPtr, complexInput.val[0] );
+- complexVectorPtr += 8;
+- iBufferPtr += 4;
+- }
+-
+- for(number = quarter_points*4; number < num_points; number++){
+- *iBufferPtr++ = *complexVectorPtr++;
+- complexVectorPtr++;
+- }
++ unsigned int number = 0;
++ unsigned int quarter_points = num_points / 4;
++ const float* complexVectorPtr = (float*)complexVector;
++ float* iBufferPtr = iBuffer;
++ float32x4x2_t complexInput;
++
++ for (number = 0; number < quarter_points; number++) {
++ complexInput = vld2q_f32(complexVectorPtr);
++ vst1q_f32(iBufferPtr, complexInput.val[0]);
++ complexVectorPtr += 8;
++ iBufferPtr += 4;
++ }
++
++ for (number = quarter_points * 4; number < num_points; number++) {
++ *iBufferPtr++ = *complexVectorPtr++;
++ complexVectorPtr++;
++ }
+ }
+ #endif /* LV_HAVE_NEON */
+
+@@ -209,41 +209,41 @@ volk_32fc_deinterleave_real_32f_neon(float* iBuffer, const lv_32fc_t* complexVec
+ #ifdef LV_HAVE_AVX2
+ #include <immintrin.h>
+
+-static inline void
+-volk_32fc_deinterleave_real_32f_u_avx2(float* iBuffer, const lv_32fc_t* complexVector,
+- unsigned int num_points)
++static inline void volk_32fc_deinterleave_real_32f_u_avx2(float* iBuffer,
++ const lv_32fc_t* complexVector,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+- const unsigned int eighthPoints = num_points / 8;
++ unsigned int number = 0;
++ const unsigned int eighthPoints = num_points / 8;
+
+- const float* complexVectorPtr = (const float*)complexVector;
+- float* iBufferPtr = iBuffer;
++ const float* complexVectorPtr = (const float*)complexVector;
++ float* iBufferPtr = iBuffer;
+
+- __m256 cplxValue1, cplxValue2;
+- __m256 iValue;
+- __m256i idx = _mm256_set_epi32(7,6,3,2,5,4,1,0);
+- for(;number < eighthPoints; number++){
++ __m256 cplxValue1, cplxValue2;
++ __m256 iValue;
++ __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
++ for (; number < eighthPoints; number++) {
+
+- cplxValue1 = _mm256_loadu_ps(complexVectorPtr);
+- complexVectorPtr += 8;
++ cplxValue1 = _mm256_loadu_ps(complexVectorPtr);
++ complexVectorPtr += 8;
+
+- cplxValue2 = _mm256_loadu_ps(complexVectorPtr);
+- complexVectorPtr += 8;
++ cplxValue2 = _mm256_loadu_ps(complexVectorPtr);
++ complexVectorPtr += 8;
+
+- // Arrange in i1i2i3i4 format
+- iValue = _mm256_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0));
+- iValue = _mm256_permutevar8x32_ps(iValue,idx);
++ // Arrange in i1i2i3i4 format
++ iValue = _mm256_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2, 0, 2, 0));
++ iValue = _mm256_permutevar8x32_ps(iValue, idx);
+
+- _mm256_storeu_ps(iBufferPtr, iValue);
++ _mm256_storeu_ps(iBufferPtr, iValue);
+
+- iBufferPtr += 8;
+- }
++ iBufferPtr += 8;
++ }
+
+- number = eighthPoints * 8;
+- for(; number < num_points; number++){
+- *iBufferPtr++ = *complexVectorPtr++;
+- complexVectorPtr++;
+- }
++ number = eighthPoints * 8;
++ for (; number < num_points; number++) {
++ *iBufferPtr++ = *complexVectorPtr++;
++ complexVectorPtr++;
++ }
+ }
+ #endif /* LV_HAVE_AVX2 */
+
+diff --git a/kernels/volk/volk_32fc_deinterleave_real_64f.h b/kernels/volk/volk_32fc_deinterleave_real_64f.h
+index 3d6e901..9ec7769 100644
+--- a/kernels/volk/volk_32fc_deinterleave_real_64f.h
++++ b/kernels/volk/volk_32fc_deinterleave_real_64f.h
+@@ -77,124 +77,132 @@
+ #ifdef LV_HAVE_AVX2
+ #include <immintrin.h>
+
+-static inline void volk_32fc_deinterleave_real_64f_a_avx2(
+- double *iBuffer, const lv_32fc_t *complexVector, unsigned int num_points) {
+- unsigned int number = 0;
+-
+- const float *complexVectorPtr = (float *)complexVector;
+- double *iBufferPtr = iBuffer;
+-
+- const unsigned int quarterPoints = num_points / 4;
+- __m256 cplxValue;
+- __m128 fVal;
+- __m256d dVal;
+- __m256i idx = _mm256_set_epi32(0, 0, 0, 0, 6, 4, 2, 0);
+- for (; number < quarterPoints; number++) {
+-
+- cplxValue = _mm256_load_ps(complexVectorPtr);
+- complexVectorPtr += 8;
+-
+- // Arrange in i1i2i1i2 format
+- cplxValue = _mm256_permutevar8x32_ps(cplxValue, idx);
+- fVal = _mm256_extractf128_ps(cplxValue, 0);
+- dVal = _mm256_cvtps_pd(fVal);
+- _mm256_store_pd(iBufferPtr, dVal);
+-
+- iBufferPtr += 4;
+- }
+-
+- number = quarterPoints * 4;
+- for (; number < num_points; number++) {
+- *iBufferPtr++ = (double)*complexVectorPtr++;
+- complexVectorPtr++;
+- }
++static inline void volk_32fc_deinterleave_real_64f_a_avx2(double* iBuffer,
++ const lv_32fc_t* complexVector,
++ unsigned int num_points)
++{
++ unsigned int number = 0;
++
++ const float* complexVectorPtr = (float*)complexVector;
++ double* iBufferPtr = iBuffer;
++
++ const unsigned int quarterPoints = num_points / 4;
++ __m256 cplxValue;
++ __m128 fVal;
++ __m256d dVal;
++ __m256i idx = _mm256_set_epi32(0, 0, 0, 0, 6, 4, 2, 0);
++ for (; number < quarterPoints; number++) {
++
++ cplxValue = _mm256_load_ps(complexVectorPtr);
++ complexVectorPtr += 8;
++
++ // Arrange in i1i2i1i2 format
++ cplxValue = _mm256_permutevar8x32_ps(cplxValue, idx);
++ fVal = _mm256_extractf128_ps(cplxValue, 0);
++ dVal = _mm256_cvtps_pd(fVal);
++ _mm256_store_pd(iBufferPtr, dVal);
++
++ iBufferPtr += 4;
++ }
++
++ number = quarterPoints * 4;
++ for (; number < num_points; number++) {
++ *iBufferPtr++ = (double)*complexVectorPtr++;
++ complexVectorPtr++;
++ }
+ }
+ #endif /* LV_HAVE_AVX2 */
+
+ #ifdef LV_HAVE_SSE2
+ #include <emmintrin.h>
+
+-static inline void volk_32fc_deinterleave_real_64f_a_sse2(
+- double *iBuffer, const lv_32fc_t *complexVector, unsigned int num_points) {
+- unsigned int number = 0;
++static inline void volk_32fc_deinterleave_real_64f_a_sse2(double* iBuffer,
++ const lv_32fc_t* complexVector,
++ unsigned int num_points)
++{
++ unsigned int number = 0;
+
+- const float *complexVectorPtr = (float *)complexVector;
+- double *iBufferPtr = iBuffer;
++ const float* complexVectorPtr = (float*)complexVector;
++ double* iBufferPtr = iBuffer;
+
+- const unsigned int halfPoints = num_points / 2;
+- __m128 cplxValue, fVal;
+- __m128d dVal;
+- for (; number < halfPoints; number++) {
++ const unsigned int halfPoints = num_points / 2;
++ __m128 cplxValue, fVal;
++ __m128d dVal;
++ for (; number < halfPoints; number++) {
+
+- cplxValue = _mm_load_ps(complexVectorPtr);
+- complexVectorPtr += 4;
++ cplxValue = _mm_load_ps(complexVectorPtr);
++ complexVectorPtr += 4;
+
+- // Arrange in i1i2i1i2 format
+- fVal = _mm_shuffle_ps(cplxValue, cplxValue, _MM_SHUFFLE(2, 0, 2, 0));
+- dVal = _mm_cvtps_pd(fVal);
+- _mm_store_pd(iBufferPtr, dVal);
++ // Arrange in i1i2i1i2 format
++ fVal = _mm_shuffle_ps(cplxValue, cplxValue, _MM_SHUFFLE(2, 0, 2, 0));
++ dVal = _mm_cvtps_pd(fVal);
++ _mm_store_pd(iBufferPtr, dVal);
+
+- iBufferPtr += 2;
+- }
++ iBufferPtr += 2;
++ }
+
+- number = halfPoints * 2;
+- for (; number < num_points; number++) {
+- *iBufferPtr++ = (double)*complexVectorPtr++;
+- complexVectorPtr++;
+- }
++ number = halfPoints * 2;
++ for (; number < num_points; number++) {
++ *iBufferPtr++ = (double)*complexVectorPtr++;
++ complexVectorPtr++;
++ }
+ }
+ #endif /* LV_HAVE_SSE */
+
+ #ifdef LV_HAVE_GENERIC
+
+-static inline void volk_32fc_deinterleave_real_64f_generic(
+- double *iBuffer, const lv_32fc_t *complexVector, unsigned int num_points) {
+- unsigned int number = 0;
+- const float *complexVectorPtr = (float *)complexVector;
+- double *iBufferPtr = iBuffer;
+- for (number = 0; number < num_points; number++) {
+- *iBufferPtr++ = (double)*complexVectorPtr++;
+- complexVectorPtr++;
+- }
++static inline void volk_32fc_deinterleave_real_64f_generic(double* iBuffer,
++ const lv_32fc_t* complexVector,
++ unsigned int num_points)
++{
++ unsigned int number = 0;
++ const float* complexVectorPtr = (float*)complexVector;
++ double* iBufferPtr = iBuffer;
++ for (number = 0; number < num_points; number++) {
++ *iBufferPtr++ = (double)*complexVectorPtr++;
++ complexVectorPtr++;
++ }
+ }
+ #endif /* LV_HAVE_GENERIC */
+
+ #ifdef LV_HAVE_NEONV8
+ #include <arm_neon.h>
+
+-static inline void volk_32fc_deinterleave_real_64f_neon(
+- double *iBuffer, const lv_32fc_t *complexVector, unsigned int num_points) {
+- unsigned int number = 0;
+- unsigned int quarter_points = num_points / 4;
+- const float *complexVectorPtr = (float *)complexVector;
+- double *iBufferPtr = iBuffer;
+- float32x2x4_t complexInput;
+- float64x2_t iVal1;
+- float64x2_t iVal2;
+- float64x2x2_t iVal;
+-
+- for (number = 0; number < quarter_points; number++) {
+- // Load data into register
+- complexInput = vld4_f32(complexVectorPtr);
+-
+- // Perform single to double precision conversion
+- iVal1 = vcvt_f64_f32(complexInput.val[0]);
+- iVal2 = vcvt_f64_f32(complexInput.val[2]);
+- iVal.val[0] = iVal1;
+- iVal.val[1] = iVal2;
+-
+- // Store results into memory buffer
+- vst2q_f64(iBufferPtr, iVal);
+-
+- // Update pointers
+- iBufferPtr += 4;
+- complexVectorPtr += 8;
+- }
+-
+- for (number = quarter_points * 4; number < num_points; number++) {
+- *iBufferPtr++ = (double)*complexVectorPtr++;
+- complexVectorPtr++;
+- }
++static inline void volk_32fc_deinterleave_real_64f_neon(double* iBuffer,
++ const lv_32fc_t* complexVector,
++ unsigned int num_points)
++{
++ unsigned int number = 0;
++ unsigned int quarter_points = num_points / 4;
++ const float* complexVectorPtr = (float*)complexVector;
++ double* iBufferPtr = iBuffer;
++ float32x2x4_t complexInput;
++ float64x2_t iVal1;
++ float64x2_t iVal2;
++ float64x2x2_t iVal;
++
++ for (number = 0; number < quarter_points; number++) {
++ // Load data into register
++ complexInput = vld4_f32(complexVectorPtr);
++
++ // Perform single to double precision conversion
++ iVal1 = vcvt_f64_f32(complexInput.val[0]);
++ iVal2 = vcvt_f64_f32(complexInput.val[2]);
++ iVal.val[0] = iVal1;
++ iVal.val[1] = iVal2;
++
++ // Store results into memory buffer
++ vst2q_f64(iBufferPtr, iVal);
++
++ // Update pointers
++ iBufferPtr += 4;
++ complexVectorPtr += 8;
++ }
++
++ for (number = quarter_points * 4; number < num_points; number++) {
++ *iBufferPtr++ = (double)*complexVectorPtr++;
++ complexVectorPtr++;
++ }
+ }
+ #endif /* LV_HAVE_NEON */
+
+@@ -209,37 +217,39 @@ static inline void volk_32fc_deinterleave_real_64f_neon(
+ #ifdef LV_HAVE_AVX2
+ #include <immintrin.h>
+
+-static inline void volk_32fc_deinterleave_real_64f_u_avx2(
+- double *iBuffer, const lv_32fc_t *complexVector, unsigned int num_points) {
+- unsigned int number = 0;
+-
+- const float *complexVectorPtr = (float *)complexVector;
+- double *iBufferPtr = iBuffer;
+-
+- const unsigned int quarterPoints = num_points / 4;
+- __m256 cplxValue;
+- __m128 fVal;
+- __m256d dVal;
+- __m256i idx = _mm256_set_epi32(0, 0, 0, 0, 6, 4, 2, 0);
+- for (; number < quarterPoints; number++) {
+-
+- cplxValue = _mm256_loadu_ps(complexVectorPtr);
+- complexVectorPtr += 8;
+-
+- // Arrange in i1i2i1i2 format
+- cplxValue = _mm256_permutevar8x32_ps(cplxValue, idx);
+- fVal = _mm256_extractf128_ps(cplxValue, 0);
+- dVal = _mm256_cvtps_pd(fVal);
+- _mm256_storeu_pd(iBufferPtr, dVal);
+-
+- iBufferPtr += 4;
+- }
+-
+- number = quarterPoints * 4;
+- for (; number < num_points; number++) {
+- *iBufferPtr++ = (double)*complexVectorPtr++;
+- complexVectorPtr++;
+- }
++static inline void volk_32fc_deinterleave_real_64f_u_avx2(double* iBuffer,
++ const lv_32fc_t* complexVector,
++ unsigned int num_points)
++{
++ unsigned int number = 0;
++
++ const float* complexVectorPtr = (float*)complexVector;
++ double* iBufferPtr = iBuffer;
++
++ const unsigned int quarterPoints = num_points / 4;
++ __m256 cplxValue;
++ __m128 fVal;
++ __m256d dVal;
++ __m256i idx = _mm256_set_epi32(0, 0, 0, 0, 6, 4, 2, 0);
++ for (; number < quarterPoints; number++) {
++
++ cplxValue = _mm256_loadu_ps(complexVectorPtr);
++ complexVectorPtr += 8;
++
++ // Arrange in i1i2i1i2 format
++ cplxValue = _mm256_permutevar8x32_ps(cplxValue, idx);
++ fVal = _mm256_extractf128_ps(cplxValue, 0);
++ dVal = _mm256_cvtps_pd(fVal);
++ _mm256_storeu_pd(iBufferPtr, dVal);
++
++ iBufferPtr += 4;
++ }
++
++ number = quarterPoints * 4;
++ for (; number < num_points; number++) {
++ *iBufferPtr++ = (double)*complexVectorPtr++;
++ complexVectorPtr++;
++ }
+ }
+ #endif /* LV_HAVE_AVX2 */
+
+diff --git a/kernels/volk/volk_32fc_index_max_16u.h b/kernels/volk/volk_32fc_index_max_16u.h
+index a9f9508..b9f9cfd 100644
+--- a/kernels/volk/volk_32fc_index_max_16u.h
++++ b/kernels/volk/volk_32fc_index_max_16u.h
+@@ -76,346 +76,353 @@
+ #ifndef INCLUDED_volk_32fc_index_max_16u_a_H
+ #define INCLUDED_volk_32fc_index_max_16u_a_H
+
+-#include <volk/volk_common.h>
+ #include <inttypes.h>
+-#include <stdio.h>
+ #include <limits.h>
++#include <stdio.h>
++#include <volk/volk_common.h>
+ #include <volk/volk_complex.h>
+
+ #ifdef LV_HAVE_AVX2
+ #include <immintrin.h>
+
+ static inline void
+-volk_32fc_index_max_16u_a_avx2(uint16_t* target, lv_32fc_t* src0,
+- uint32_t num_points)
++volk_32fc_index_max_16u_a_avx2(uint16_t* target, lv_32fc_t* src0, uint32_t num_points)
+ {
+- num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
+- // Branchless version, if we think it'll make a difference
+- //num_points = USHRT_MAX ^ ((num_points ^ USHRT_MAX) & -(num_points < USHRT_MAX));
+-
+- const uint32_t num_bytes = num_points*8;
+-
+- union bit256 holderf;
+- union bit256 holderi;
+- float sq_dist = 0.0;
++ num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
++ // Branchless version, if we think it'll make a difference
++ // num_points = USHRT_MAX ^ ((num_points ^ USHRT_MAX) & -(num_points < USHRT_MAX));
+
+- union bit256 xmm5, xmm4;
+- __m256 xmm1, xmm2, xmm3;
+- __m256i xmm8, xmm11, xmm12, xmmfive, xmmfour, xmm9, holder0, holder1, xmm10;
++ const uint32_t num_bytes = num_points * 8;
+
+- xmm5.int_vec = xmmfive = _mm256_setzero_si256();
+- xmm4.int_vec = xmmfour = _mm256_setzero_si256();
+- holderf.int_vec = holder0 = _mm256_setzero_si256();
+- holderi.int_vec = holder1 = _mm256_setzero_si256();
++ union bit256 holderf;
++ union bit256 holderi;
++ float sq_dist = 0.0;
+
+- int bound = num_bytes >> 6;
+- int i = 0;
++ union bit256 xmm5, xmm4;
++ __m256 xmm1, xmm2, xmm3;
++ __m256i xmm8, xmm11, xmm12, xmmfive, xmmfour, xmm9, holder0, holder1, xmm10;
+
+- xmm8 = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
+- xmm9 = _mm256_setzero_si256(); //=xmm8
+- xmm10 = _mm256_set1_epi32(8);
+- xmm3 = _mm256_setzero_ps();
++ xmm5.int_vec = xmmfive = _mm256_setzero_si256();
++ xmm4.int_vec = xmmfour = _mm256_setzero_si256();
++ holderf.int_vec = holder0 = _mm256_setzero_si256();
++ holderi.int_vec = holder1 = _mm256_setzero_si256();
+
+- __m256i idx = _mm256_set_epi32(7,6,3,2,5,4,1,0);
+- for(; i < bound; ++i) {
+- xmm1 = _mm256_load_ps((float*)src0);
+- xmm2 = _mm256_load_ps((float*)&src0[4]);
++ int bound = num_bytes >> 6;
++ int i = 0;
+
+- src0 += 8;
++ xmm8 = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
++ xmm9 = _mm256_setzero_si256(); //=xmm8
++ xmm10 = _mm256_set1_epi32(8);
++ xmm3 = _mm256_setzero_ps();
+
+- xmm1 = _mm256_mul_ps(xmm1, xmm1);
+- xmm2 = _mm256_mul_ps(xmm2, xmm2);
++ __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
++ for (; i < bound; ++i) {
++ xmm1 = _mm256_load_ps((float*)src0);
++ xmm2 = _mm256_load_ps((float*)&src0[4]);
+
+- xmm1 = _mm256_hadd_ps(xmm1, xmm2);
+- xmm1 = _mm256_permutevar8x32_ps(xmm1, idx);
++ src0 += 8;
+
+- xmm3 = _mm256_max_ps(xmm1, xmm3);
++ xmm1 = _mm256_mul_ps(xmm1, xmm1);
++ xmm2 = _mm256_mul_ps(xmm2, xmm2);
+
+- xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS);
+- xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ);
++ xmm1 = _mm256_hadd_ps(xmm1, xmm2);
++ xmm1 = _mm256_permutevar8x32_ps(xmm1, idx);
+
+- xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec);
+- xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec);
++ xmm3 = _mm256_max_ps(xmm1, xmm3);
+
+- xmm9 = _mm256_add_epi32(xmm11, xmm12);
++ xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS);
++ xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ);
+
+- xmm8 = _mm256_add_epi32(xmm8, xmm10);
+- }
+- xmm10 = _mm256_set1_epi32(4);
+- if (num_bytes >> 5 & 1) {
+- xmm1 = _mm256_load_ps((float*)src0);
++ xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec);
++ xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec);
+
+- src0 += 4;
++ xmm9 = _mm256_add_epi32(xmm11, xmm12);
+
+- xmm1 = _mm256_mul_ps(xmm1, xmm1);
++ xmm8 = _mm256_add_epi32(xmm8, xmm10);
++ }
++ xmm10 = _mm256_set1_epi32(4);
++ if (num_bytes >> 5 & 1) {
++ xmm1 = _mm256_load_ps((float*)src0);
+
+- xmm1 = _mm256_hadd_ps(xmm1, xmm1);
+- xmm1 = _mm256_permutevar8x32_ps(xmm1, idx);
++ src0 += 4;
+
+- xmm3 = _mm256_max_ps(xmm1, xmm3);
++ xmm1 = _mm256_mul_ps(xmm1, xmm1);
+
+- xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS);
+- xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ);
++ xmm1 = _mm256_hadd_ps(xmm1, xmm1);
++ xmm1 = _mm256_permutevar8x32_ps(xmm1, idx);
+
+- xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec);
+- xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec);
++ xmm3 = _mm256_max_ps(xmm1, xmm3);
+
+- xmm9 = _mm256_add_epi32(xmm11, xmm12);
++ xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS);
++ xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ);
+
+- xmm8 = _mm256_add_epi32(xmm8, xmm10);
+- }
++ xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec);
++ xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec);
+
+- idx = _mm256_set_epi32(1,0,1,0,1,0,1,0);
+- xmm10 = _mm256_set1_epi32(2);
+- if (num_bytes >> 4 & 1) {
+- xmm2 = _mm256_load_ps((float*)src0);
++ xmm9 = _mm256_add_epi32(xmm11, xmm12);
+
+- xmm1 = _mm256_permutevar8x32_ps(bit256_p(&xmm8)->float_vec, idx);
+- xmm8 = bit256_p(&xmm1)->int_vec;
++ xmm8 = _mm256_add_epi32(xmm8, xmm10);
++ }
+
+- xmm2 = _mm256_mul_ps(xmm2, xmm2);
++ idx = _mm256_set_epi32(1, 0, 1, 0, 1, 0, 1, 0);
++ xmm10 = _mm256_set1_epi32(2);
++ if (num_bytes >> 4 & 1) {
++ xmm2 = _mm256_load_ps((float*)src0);
+
+- src0 += 2;
++ xmm1 = _mm256_permutevar8x32_ps(bit256_p(&xmm8)->float_vec, idx);
++ xmm8 = bit256_p(&xmm1)->int_vec;
+
+- xmm1 = _mm256_hadd_ps(xmm2, xmm2);
++ xmm2 = _mm256_mul_ps(xmm2, xmm2);
+
+- xmm3 = _mm256_max_ps(xmm1, xmm3);
++ src0 += 2;
+
+- xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS);
+- xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ);
+-
+- xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec);
+- xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec);
++ xmm1 = _mm256_hadd_ps(xmm2, xmm2);
+
+- xmm9 = _mm256_add_epi32(xmm11, xmm12);
++ xmm3 = _mm256_max_ps(xmm1, xmm3);
+
+- xmm8 = _mm256_add_epi32(xmm8, xmm10);
+- }
++ xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS);
++ xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ);
+
+- /*
+- idx = _mm256_setzero_si256();
+- for(i = 0; i < leftovers2; ++i) {
+- //printf("%u, %u, %u, %u\n", ((uint32_t*)&xmm9)[0], ((uint32_t*)&xmm9)[1], ((uint32_t*)&xmm9)[2], ((uint32_t*)&xmm9)[3]);
++ xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec);
++ xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec);
+
+- sq_dist = lv_creal(src0[0]) * lv_creal(src0[0]) + lv_cimag(src0[0]) * lv_cimag(src0[0]);
++ xmm9 = _mm256_add_epi32(xmm11, xmm12);
+
+- //xmm = _mm_load1_ps(&sq_dist);//insert?
+- xmm2 = _mm256_set1_ps(sq_dist);
+- //xmm2 = _mm256_insertf128_ps(xmm2, xmm, 0);
++ xmm8 = _mm256_add_epi32(xmm8, xmm10);
++ }
+
+- xmm1 = xmm3;
++ /*
++ idx = _mm256_setzero_si256();
++ for(i = 0; i < leftovers2; ++i) {
++ //printf("%u, %u, %u, %u\n", ((uint32_t*)&xmm9)[0], ((uint32_t*)&xmm9)[1],
++ ((uint32_t*)&xmm9)[2], ((uint32_t*)&xmm9)[3]);
+
+- xmm3 = _mm256_max_ps(xmm3, xmm2);//only lowest 32bit value
+- xmm3 = _mm256_permutevar8x32_ps(xmm3, idx);
++ sq_dist = lv_creal(src0[0]) * lv_creal(src0[0]) + lv_cimag(src0[0]) *
++ lv_cimag(src0[0]);
+
+- xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS);
+- xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ);
++ //xmm = _mm_load1_ps(&sq_dist);//insert?
++ xmm2 = _mm256_set1_ps(sq_dist);
++ //xmm2 = _mm256_insertf128_ps(xmm2, xmm, 0);
+
+- xmm8 = _mm256_permutevar8x32_epi32(xmm8, idx);
++ xmm1 = xmm3;
+
+- xmm11 = _mm256_and_si256(xmm8, xmm4.int_vec);
+- xmm12 = _mm256_and_si256(xmm9, xmm5.int_vec);
++ xmm3 = _mm256_max_ps(xmm3, xmm2);//only lowest 32bit value
++ xmm3 = _mm256_permutevar8x32_ps(xmm3, idx);
+
+- xmm9 = _mm256_add_epi32(xmm11, xmm12);
+-}*/
++ xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS);
++ xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ);
+
+- _mm256_store_ps((float*)&(holderf.f), xmm3);
+- _mm256_store_si256(&(holderi.int_vec), xmm9);
++ xmm8 = _mm256_permutevar8x32_epi32(xmm8, idx);
+
+- target[0] = holderi.i[0];
+- sq_dist = holderf.f[0];
+- target[0] = (holderf.f[1] > sq_dist) ? holderi.i[1] : target[0];
+- sq_dist = (holderf.f[1] > sq_dist) ? holderf.f[1] : sq_dist;
+- target[0] = (holderf.f[2] > sq_dist) ? holderi.i[2] : target[0];
+- sq_dist = (holderf.f[2] > sq_dist) ? holderf.f[2] : sq_dist;
+- target[0] = (holderf.f[3] > sq_dist) ? holderi.i[3] : target[0];
+- sq_dist = (holderf.f[3] > sq_dist) ? holderf.f[3] : sq_dist;
+- target[0] = (holderf.f[4] > sq_dist) ? holderi.i[4] : target[0];
+- sq_dist = (holderf.f[4] > sq_dist) ? holderf.f[4] : sq_dist;
+- target[0] = (holderf.f[5] > sq_dist) ? holderi.i[5] : target[0];
+- sq_dist = (holderf.f[5] > sq_dist) ? holderf.f[5] : sq_dist;
+- target[0] = (holderf.f[6] > sq_dist) ? holderi.i[6] : target[0];
+- sq_dist = (holderf.f[6] > sq_dist) ? holderf.f[6] : sq_dist;
+- target[0] = (holderf.f[7] > sq_dist) ? holderi.i[7] : target[0];
+- sq_dist = (holderf.f[7] > sq_dist) ? holderf.f[7] : sq_dist;
++ xmm11 = _mm256_and_si256(xmm8, xmm4.int_vec);
++ xmm12 = _mm256_and_si256(xmm9, xmm5.int_vec);
+
++ xmm9 = _mm256_add_epi32(xmm11, xmm12);
++ }*/
++
++ _mm256_store_ps((float*)&(holderf.f), xmm3);
++ _mm256_store_si256(&(holderi.int_vec), xmm9);
++
++ target[0] = holderi.i[0];
++ sq_dist = holderf.f[0];
++ target[0] = (holderf.f[1] > sq_dist) ? holderi.i[1] : target[0];
++ sq_dist = (holderf.f[1] > sq_dist) ? holderf.f[1] : sq_dist;
++ target[0] = (holderf.f[2] > sq_dist) ? holderi.i[2] : target[0];
++ sq_dist = (holderf.f[2] > sq_dist) ? holderf.f[2] : sq_dist;
++ target[0] = (holderf.f[3] > sq_dist) ? holderi.i[3] : target[0];
++ sq_dist = (holderf.f[3] > sq_dist) ? holderf.f[3] : sq_dist;
++ target[0] = (holderf.f[4] > sq_dist) ? holderi.i[4] : target[0];
++ sq_dist = (holderf.f[4] > sq_dist) ? holderf.f[4] : sq_dist;
++ target[0] = (holderf.f[5] > sq_dist) ? holderi.i[5] : target[0];
++ sq_dist = (holderf.f[5] > sq_dist) ? holderf.f[5] : sq_dist;
++ target[0] = (holderf.f[6] > sq_dist) ? holderi.i[6] : target[0];
++ sq_dist = (holderf.f[6] > sq_dist) ? holderf.f[6] : sq_dist;
++ target[0] = (holderf.f[7] > sq_dist) ? holderi.i[7] : target[0];
++ sq_dist = (holderf.f[7] > sq_dist) ? holderf.f[7] : sq_dist;
+ }
+
+ #endif /*LV_HAVE_AVX2*/
+
+ #ifdef LV_HAVE_SSE3
+-#include <xmmintrin.h>
+ #include <pmmintrin.h>
++#include <xmmintrin.h>
+
+ static inline void
+-volk_32fc_index_max_16u_a_sse3(uint16_t* target, lv_32fc_t* src0,
+- uint32_t num_points)
++volk_32fc_index_max_16u_a_sse3(uint16_t* target, lv_32fc_t* src0, uint32_t num_points)
+ {
+- num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
+- // Branchless version, if we think it'll make a difference
+- //num_points = USHRT_MAX ^ ((num_points ^ USHRT_MAX) & -(num_points < USHRT_MAX));
++ num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
++ // Branchless version, if we think it'll make a difference
++ // num_points = USHRT_MAX ^ ((num_points ^ USHRT_MAX) & -(num_points < USHRT_MAX));
+
+- const uint32_t num_bytes = num_points*8;
++ const uint32_t num_bytes = num_points * 8;
+
+- union bit128 holderf;
+- union bit128 holderi;
+- float sq_dist = 0.0;
++ union bit128 holderf;
++ union bit128 holderi;
++ float sq_dist = 0.0;
+
+- union bit128 xmm5, xmm4;
+- __m128 xmm1, xmm2, xmm3;
+- __m128i xmm8, xmm11, xmm12, xmmfive, xmmfour, xmm9, holder0, holder1, xmm10;
++ union bit128 xmm5, xmm4;
++ __m128 xmm1, xmm2, xmm3;
++ __m128i xmm8, xmm11, xmm12, xmmfive, xmmfour, xmm9, holder0, holder1, xmm10;
+
+- xmm5.int_vec = xmmfive = _mm_setzero_si128();
+- xmm4.int_vec = xmmfour = _mm_setzero_si128();
+- holderf.int_vec = holder0 = _mm_setzero_si128();
+- holderi.int_vec = holder1 = _mm_setzero_si128();
++ xmm5.int_vec = xmmfive = _mm_setzero_si128();
++ xmm4.int_vec = xmmfour = _mm_setzero_si128();
++ holderf.int_vec = holder0 = _mm_setzero_si128();
++ holderi.int_vec = holder1 = _mm_setzero_si128();
+
+- int bound = num_bytes >> 5;
+- int i = 0;
++ int bound = num_bytes >> 5;
++ int i = 0;
+
+- xmm8 = _mm_set_epi32(3, 2, 1, 0);//remember the crazy reverse order!
+- xmm9 = _mm_setzero_si128();
+- xmm10 = _mm_set_epi32(4, 4, 4, 4);
+- xmm3 = _mm_setzero_ps();
+- //printf("%f, %f, %f, %f\n", ((float*)&xmm10)[0], ((float*)&xmm10)[1], ((float*)&xmm10)[2], ((float*)&xmm10)[3]);
++ xmm8 = _mm_set_epi32(3, 2, 1, 0); // remember the crazy reverse order!
++ xmm9 = _mm_setzero_si128();
++ xmm10 = _mm_set_epi32(4, 4, 4, 4);
++ xmm3 = _mm_setzero_ps();
++ // printf("%f, %f, %f, %f\n", ((float*)&xmm10)[0], ((float*)&xmm10)[1],
++ // ((float*)&xmm10)[2], ((float*)&xmm10)[3]);
+
+- for(; i < bound; ++i) {
+- xmm1 = _mm_load_ps((float*)src0);
+- xmm2 = _mm_load_ps((float*)&src0[2]);
++ for (; i < bound; ++i) {
++ xmm1 = _mm_load_ps((float*)src0);
++ xmm2 = _mm_load_ps((float*)&src0[2]);
+
+- src0 += 4;
++ src0 += 4;
+
+- xmm1 = _mm_mul_ps(xmm1, xmm1);
+- xmm2 = _mm_mul_ps(xmm2, xmm2);
++ xmm1 = _mm_mul_ps(xmm1, xmm1);
++ xmm2 = _mm_mul_ps(xmm2, xmm2);
+
+- xmm1 = _mm_hadd_ps(xmm1, xmm2);
++ xmm1 = _mm_hadd_ps(xmm1, xmm2);
+
+- xmm3 = _mm_max_ps(xmm1, xmm3);
++ xmm3 = _mm_max_ps(xmm1, xmm3);
+
+- xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3);
+- xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);
++ xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3);
++ xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);
+
+- xmm11 = _mm_and_si128(xmm8, xmm5.int_vec);
+- xmm12 = _mm_and_si128(xmm9, xmm4.int_vec);
++ xmm11 = _mm_and_si128(xmm8, xmm5.int_vec);
++ xmm12 = _mm_and_si128(xmm9, xmm4.int_vec);
+
+- xmm9 = _mm_add_epi32(xmm11, xmm12);
++ xmm9 = _mm_add_epi32(xmm11, xmm12);
+
+- xmm8 = _mm_add_epi32(xmm8, xmm10);
++ xmm8 = _mm_add_epi32(xmm8, xmm10);
+
+- //printf("%f, %f, %f, %f\n", ((float*)&xmm3)[0], ((float*)&xmm3)[1], ((float*)&xmm3)[2], ((float*)&xmm3)[3]);
+- //printf("%u, %u, %u, %u\n", ((uint32_t*)&xmm10)[0], ((uint32_t*)&xmm10)[1], ((uint32_t*)&xmm10)[2], ((uint32_t*)&xmm10)[3]);
+- }
++ // printf("%f, %f, %f, %f\n", ((float*)&xmm3)[0], ((float*)&xmm3)[1],
++ // ((float*)&xmm3)[2], ((float*)&xmm3)[3]); printf("%u, %u, %u, %u\n",
++ // ((uint32_t*)&xmm10)[0], ((uint32_t*)&xmm10)[1], ((uint32_t*)&xmm10)[2],
++ // ((uint32_t*)&xmm10)[3]);
++ }
+
+
+- if (num_bytes >> 4 & 1) {
+- xmm2 = _mm_load_ps((float*)src0);
++ if (num_bytes >> 4 & 1) {
++ xmm2 = _mm_load_ps((float*)src0);
+
+- xmm1 = _mm_movelh_ps(bit128_p(&xmm8)->float_vec, bit128_p(&xmm8)->float_vec);
+- xmm8 = bit128_p(&xmm1)->int_vec;
++ xmm1 = _mm_movelh_ps(bit128_p(&xmm8)->float_vec, bit128_p(&xmm8)->float_vec);
++ xmm8 = bit128_p(&xmm1)->int_vec;
+
+- xmm2 = _mm_mul_ps(xmm2, xmm2);
++ xmm2 = _mm_mul_ps(xmm2, xmm2);
+
+- src0 += 2;
++ src0 += 2;
+
+- xmm1 = _mm_hadd_ps(xmm2, xmm2);
++ xmm1 = _mm_hadd_ps(xmm2, xmm2);
+
+- xmm3 = _mm_max_ps(xmm1, xmm3);
++ xmm3 = _mm_max_ps(xmm1, xmm3);
+
+- xmm10 = _mm_set_epi32(2, 2, 2, 2);//load1_ps((float*)&init[2]);
++ xmm10 = _mm_set_epi32(2, 2, 2, 2); // load1_ps((float*)&init[2]);
+
+- xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3);
+- xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);
++ xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3);
++ xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);
+
+- xmm11 = _mm_and_si128(xmm8, xmm5.int_vec);
+- xmm12 = _mm_and_si128(xmm9, xmm4.int_vec);
++ xmm11 = _mm_and_si128(xmm8, xmm5.int_vec);
++ xmm12 = _mm_and_si128(xmm9, xmm4.int_vec);
+
+- xmm9 = _mm_add_epi32(xmm11, xmm12);
++ xmm9 = _mm_add_epi32(xmm11, xmm12);
+
+- xmm8 = _mm_add_epi32(xmm8, xmm10);
+- //printf("egads%u, %u, %u, %u\n", ((uint32_t*)&xmm9)[0], ((uint32_t*)&xmm9)[1], ((uint32_t*)&xmm9)[2], ((uint32_t*)&xmm9)[3]);
+- }
++ xmm8 = _mm_add_epi32(xmm8, xmm10);
++ // printf("egads%u, %u, %u, %u\n", ((uint32_t*)&xmm9)[0], ((uint32_t*)&xmm9)[1],
++ // ((uint32_t*)&xmm9)[2], ((uint32_t*)&xmm9)[3]);
++ }
+
+- if (num_bytes >> 3 & 1) {
+- //printf("%u, %u, %u, %u\n", ((uint32_t*)&xmm9)[0], ((uint32_t*)&xmm9)[1], ((uint32_t*)&xmm9)[2], ((uint32_t*)&xmm9)[3]);
++ if (num_bytes >> 3 & 1) {
++ // printf("%u, %u, %u, %u\n", ((uint32_t*)&xmm9)[0], ((uint32_t*)&xmm9)[1],
++ // ((uint32_t*)&xmm9)[2], ((uint32_t*)&xmm9)[3]);
+
+- sq_dist = lv_creal(src0[0]) * lv_creal(src0[0]) + lv_cimag(src0[0]) * lv_cimag(src0[0]);
++ sq_dist =
++ lv_creal(src0[0]) * lv_creal(src0[0]) + lv_cimag(src0[0]) * lv_cimag(src0[0]);
+
+- xmm2 = _mm_load1_ps(&sq_dist);
++ xmm2 = _mm_load1_ps(&sq_dist);
+
+- xmm1 = xmm3;
++ xmm1 = xmm3;
+
+- xmm3 = _mm_max_ss(xmm3, xmm2);
++ xmm3 = _mm_max_ss(xmm3, xmm2);
+
+- xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3);
+- xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);
++ xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3);
++ xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);
+
+- xmm8 = _mm_shuffle_epi32(xmm8, 0x00);
++ xmm8 = _mm_shuffle_epi32(xmm8, 0x00);
+
+- xmm11 = _mm_and_si128(xmm8, xmm4.int_vec);
+- xmm12 = _mm_and_si128(xmm9, xmm5.int_vec);
++ xmm11 = _mm_and_si128(xmm8, xmm4.int_vec);
++ xmm12 = _mm_and_si128(xmm9, xmm5.int_vec);
+
+- xmm9 = _mm_add_epi32(xmm11, xmm12);
+- }
++ xmm9 = _mm_add_epi32(xmm11, xmm12);
++ }
+
+- //printf("%f, %f, %f, %f\n", ((float*)&xmm3)[0], ((float*)&xmm3)[1], ((float*)&xmm3)[2], ((float*)&xmm3)[3]);
+- //printf("%u, %u, %u, %u\n", ((uint32_t*)&xmm9)[0], ((uint32_t*)&xmm9)[1], ((uint32_t*)&xmm9)[2], ((uint32_t*)&xmm9)[3]);
++ // printf("%f, %f, %f, %f\n", ((float*)&xmm3)[0], ((float*)&xmm3)[1],
++ // ((float*)&xmm3)[2], ((float*)&xmm3)[3]); printf("%u, %u, %u, %u\n",
++ // ((uint32_t*)&xmm9)[0], ((uint32_t*)&xmm9)[1], ((uint32_t*)&xmm9)[2],
++ // ((uint32_t*)&xmm9)[3]);
+
+- _mm_store_ps((float*)&(holderf.f), xmm3);
+- _mm_store_si128(&(holderi.int_vec), xmm9);
++ _mm_store_ps((float*)&(holderf.f), xmm3);
++ _mm_store_si128(&(holderi.int_vec), xmm9);
+
+- target[0] = holderi.i[0];
+- sq_dist = holderf.f[0];
+- target[0] = (holderf.f[1] > sq_dist) ? holderi.i[1] : target[0];
+- sq_dist = (holderf.f[1] > sq_dist) ? holderf.f[1] : sq_dist;
+- target[0] = (holderf.f[2] > sq_dist) ? holderi.i[2] : target[0];
+- sq_dist = (holderf.f[2] > sq_dist) ? holderf.f[2] : sq_dist;
+- target[0] = (holderf.f[3] > sq_dist) ? holderi.i[3] : target[0];
+- sq_dist = (holderf.f[3] > sq_dist) ? holderf.f[3] : sq_dist;
++ target[0] = holderi.i[0];
++ sq_dist = holderf.f[0];
++ target[0] = (holderf.f[1] > sq_dist) ? holderi.i[1] : target[0];
++ sq_dist = (holderf.f[1] > sq_dist) ? holderf.f[1] : sq_dist;
++ target[0] = (holderf.f[2] > sq_dist) ? holderi.i[2] : target[0];
++ sq_dist = (holderf.f[2] > sq_dist) ? holderf.f[2] : sq_dist;
++ target[0] = (holderf.f[3] > sq_dist) ? holderi.i[3] : target[0];
++ sq_dist = (holderf.f[3] > sq_dist) ? holderf.f[3] : sq_dist;
+
+- /*
+- float placeholder = 0.0;
+- uint32_t temp0, temp1;
+- uint32_t g0 = (((float*)&xmm3)[0] > ((float*)&xmm3)[1]);
+- uint32_t l0 = g0 ^ 1;
++ /*
++ float placeholder = 0.0;
++ uint32_t temp0, temp1;
++ uint32_t g0 = (((float*)&xmm3)[0] > ((float*)&xmm3)[1]);
++ uint32_t l0 = g0 ^ 1;
+
+- uint32_t g1 = (((float*)&xmm3)[1] > ((float*)&xmm3)[2]);
+- uint32_t l1 = g1 ^ 1;
++ uint32_t g1 = (((float*)&xmm3)[1] > ((float*)&xmm3)[2]);
++ uint32_t l1 = g1 ^ 1;
+
+- temp0 = g0 * ((uint32_t*)&xmm9)[0] + l0 * ((uint32_t*)&xmm9)[1];
+- temp1 = g0 * ((uint32_t*)&xmm9)[2] + l0 * ((uint32_t*)&xmm9)[3];
+- sq_dist = g0 * ((float*)&xmm3)[0] + l0 * ((float*)&xmm3)[1];
+- placeholder = g0 * ((float*)&xmm3)[2] + l0 * ((float*)&xmm3)[3];
++ temp0 = g0 * ((uint32_t*)&xmm9)[0] + l0 * ((uint32_t*)&xmm9)[1];
++ temp1 = g0 * ((uint32_t*)&xmm9)[2] + l0 * ((uint32_t*)&xmm9)[3];
++ sq_dist = g0 * ((float*)&xmm3)[0] + l0 * ((float*)&xmm3)[1];
++ placeholder = g0 * ((float*)&xmm3)[2] + l0 * ((float*)&xmm3)[3];
+
+- g0 = (sq_dist > placeholder);
+- l0 = g0 ^ 1;
+- target[0] = g0 * temp0 + l0 * temp1;
+- */
++ g0 = (sq_dist > placeholder);
++ l0 = g0 ^ 1;
++ target[0] = g0 * temp0 + l0 * temp1;
++ */
+ }
+
+ #endif /*LV_HAVE_SSE3*/
+
+ #ifdef LV_HAVE_GENERIC
+ static inline void
+- volk_32fc_index_max_16u_generic(uint16_t* target, lv_32fc_t* src0,
+- uint32_t num_points)
++volk_32fc_index_max_16u_generic(uint16_t* target, lv_32fc_t* src0, uint32_t num_points)
+ {
+- num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
++ num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
+
+- const uint32_t num_bytes = num_points*8;
++ const uint32_t num_bytes = num_points * 8;
+
+- float sq_dist = 0.0;
+- float max = 0.0;
+- uint16_t index = 0;
++ float sq_dist = 0.0;
++ float max = 0.0;
++ uint16_t index = 0;
+
+- uint32_t i = 0;
++ uint32_t i = 0;
+
+- for(; i < num_bytes >> 3; ++i) {
+- sq_dist = lv_creal(src0[i]) * lv_creal(src0[i]) + lv_cimag(src0[i]) * lv_cimag(src0[i]);
++ for (; i<num_bytes>> 3; ++i) {
++ sq_dist =
++ lv_creal(src0[i]) * lv_creal(src0[i]) + lv_cimag(src0[i]) * lv_cimag(src0[i]);
+
+- index = sq_dist > max ? i : index;
+- max = sq_dist > max ? sq_dist : max;
+- }
+- target[0] = index;
++ index = sq_dist > max ? i : index;
++ max = sq_dist > max ? sq_dist : max;
++ }
++ target[0] = index;
+ }
+
+ #endif /*LV_HAVE_GENERIC*/
+@@ -427,142 +434,140 @@ static inline void
+ #ifndef INCLUDED_volk_32fc_index_max_16u_u_H
+ #define INCLUDED_volk_32fc_index_max_16u_u_H
+
+-#include <volk/volk_common.h>
+ #include <inttypes.h>
+-#include <stdio.h>
+ #include <limits.h>
++#include <stdio.h>
++#include <volk/volk_common.h>
+ #include <volk/volk_complex.h>
+
+ #ifdef LV_HAVE_AVX2
+ #include <immintrin.h>
+
+ static inline void
+-volk_32fc_index_max_16u_u_avx2(uint16_t* target, lv_32fc_t* src0,
+- uint32_t num_points)
++volk_32fc_index_max_16u_u_avx2(uint16_t* target, lv_32fc_t* src0, uint32_t num_points)
+ {
+- num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
+- // Branchless version, if we think it'll make a difference
+- //num_points = USHRT_MAX ^ ((num_points ^ USHRT_MAX) & -(num_points < USHRT_MAX));
++ num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
++ // Branchless version, if we think it'll make a difference
++ // num_points = USHRT_MAX ^ ((num_points ^ USHRT_MAX) & -(num_points < USHRT_MAX));
+
+- const uint32_t num_bytes = num_points*8;
++ const uint32_t num_bytes = num_points * 8;
+
+- union bit256 holderf;
+- union bit256 holderi;
+- float sq_dist = 0.0;
++ union bit256 holderf;
++ union bit256 holderi;
++ float sq_dist = 0.0;
+
+- union bit256 xmm5, xmm4;
+- __m256 xmm1, xmm2, xmm3;
+- __m256i xmm8, xmm11, xmm12, xmmfive, xmmfour, xmm9, holder0, holder1, xmm10;
++ union bit256 xmm5, xmm4;
++ __m256 xmm1, xmm2, xmm3;
++ __m256i xmm8, xmm11, xmm12, xmmfive, xmmfour, xmm9, holder0, holder1, xmm10;
+
+- xmm5.int_vec = xmmfive = _mm256_setzero_si256();
+- xmm4.int_vec = xmmfour = _mm256_setzero_si256();
+- holderf.int_vec = holder0 = _mm256_setzero_si256();
+- holderi.int_vec = holder1 = _mm256_setzero_si256();
++ xmm5.int_vec = xmmfive = _mm256_setzero_si256();
++ xmm4.int_vec = xmmfour = _mm256_setzero_si256();
++ holderf.int_vec = holder0 = _mm256_setzero_si256();
++ holderi.int_vec = holder1 = _mm256_setzero_si256();
+
+- int bound = num_bytes >> 6;
+- int i = 0;
++ int bound = num_bytes >> 6;
++ int i = 0;
+
+- xmm8 = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
+- xmm9 = _mm256_setzero_si256(); //=xmm8
+- xmm10 = _mm256_set1_epi32(8);
+- xmm3 = _mm256_setzero_ps();
++ xmm8 = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
++ xmm9 = _mm256_setzero_si256(); //=xmm8
++ xmm10 = _mm256_set1_epi32(8);
++ xmm3 = _mm256_setzero_ps();
+
+- __m256i idx = _mm256_set_epi32(7,6,3,2,5,4,1,0);
+- for(; i < bound; ++i) {
+- xmm1 = _mm256_loadu_ps((float*)src0);
+- xmm2 = _mm256_loadu_ps((float*)&src0[4]);
++ __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
++ for (; i < bound; ++i) {
++ xmm1 = _mm256_loadu_ps((float*)src0);
++ xmm2 = _mm256_loadu_ps((float*)&src0[4]);
+
+- src0 += 8;
++ src0 += 8;
+
+- xmm1 = _mm256_mul_ps(xmm1, xmm1);
+- xmm2 = _mm256_mul_ps(xmm2, xmm2);
++ xmm1 = _mm256_mul_ps(xmm1, xmm1);
++ xmm2 = _mm256_mul_ps(xmm2, xmm2);
+
+- xmm1 = _mm256_hadd_ps(xmm1, xmm2);
+- xmm1 = _mm256_permutevar8x32_ps(xmm1, idx);
++ xmm1 = _mm256_hadd_ps(xmm1, xmm2);
++ xmm1 = _mm256_permutevar8x32_ps(xmm1, idx);
+
+- xmm3 = _mm256_max_ps(xmm1, xmm3);
++ xmm3 = _mm256_max_ps(xmm1, xmm3);
+
+- xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS);
+- xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ);
++ xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS);
++ xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ);
+
+- xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec);
+- xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec);
++ xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec);
++ xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec);
+
+- xmm9 = _mm256_add_epi32(xmm11, xmm12);
++ xmm9 = _mm256_add_epi32(xmm11, xmm12);
+
+- xmm8 = _mm256_add_epi32(xmm8, xmm10);
+- }
+- xmm10 = _mm256_set1_epi32(4);
+- if (num_bytes >> 5 & 1) {
+- xmm1 = _mm256_loadu_ps((float*)src0);
++ xmm8 = _mm256_add_epi32(xmm8, xmm10);
++ }
++ xmm10 = _mm256_set1_epi32(4);
++ if (num_bytes >> 5 & 1) {
++ xmm1 = _mm256_loadu_ps((float*)src0);
+
+- src0 += 4;
++ src0 += 4;
+
+- xmm1 = _mm256_mul_ps(xmm1, xmm1);
++ xmm1 = _mm256_mul_ps(xmm1, xmm1);
+
+- xmm1 = _mm256_hadd_ps(xmm1, xmm1);
+- xmm1 = _mm256_permutevar8x32_ps(xmm1, idx);
++ xmm1 = _mm256_hadd_ps(xmm1, xmm1);
++ xmm1 = _mm256_permutevar8x32_ps(xmm1, idx);
+
+- xmm3 = _mm256_max_ps(xmm1, xmm3);
++ xmm3 = _mm256_max_ps(xmm1, xmm3);
+
+- xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS);
+- xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ);
++ xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS);
++ xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ);
+
+- xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec);
+- xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec);
++ xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec);
++ xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec);
+
+- xmm9 = _mm256_add_epi32(xmm11, xmm12);
++ xmm9 = _mm256_add_epi32(xmm11, xmm12);
+
+- xmm8 = _mm256_add_epi32(xmm8, xmm10);
+- }
++ xmm8 = _mm256_add_epi32(xmm8, xmm10);
++ }
+
+- idx = _mm256_set_epi32(1,0,1,0,1,0,1,0);
+- xmm10 = _mm256_set1_epi32(2);
+- if (num_bytes >> 4 & 1) {
+- xmm2 = _mm256_loadu_ps((float*)src0);
++ idx = _mm256_set_epi32(1, 0, 1, 0, 1, 0, 1, 0);
++ xmm10 = _mm256_set1_epi32(2);
++ if (num_bytes >> 4 & 1) {
++ xmm2 = _mm256_loadu_ps((float*)src0);
+
+- xmm1 = _mm256_permutevar8x32_ps(bit256_p(&xmm8)->float_vec, idx);
+- xmm8 = bit256_p(&xmm1)->int_vec;
++ xmm1 = _mm256_permutevar8x32_ps(bit256_p(&xmm8)->float_vec, idx);
++ xmm8 = bit256_p(&xmm1)->int_vec;
+
+- xmm2 = _mm256_mul_ps(xmm2, xmm2);
++ xmm2 = _mm256_mul_ps(xmm2, xmm2);
+
+- src0 += 2;
++ src0 += 2;
+
+- xmm1 = _mm256_hadd_ps(xmm2, xmm2);
++ xmm1 = _mm256_hadd_ps(xmm2, xmm2);
+
+- xmm3 = _mm256_max_ps(xmm1, xmm3);
++ xmm3 = _mm256_max_ps(xmm1, xmm3);
+
+- xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS);
+- xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ);
++ xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS);
++ xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ);
+
+- xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec);
+- xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec);
++ xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec);
++ xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec);
+
+- xmm9 = _mm256_add_epi32(xmm11, xmm12);
++ xmm9 = _mm256_add_epi32(xmm11, xmm12);
++
++ xmm8 = _mm256_add_epi32(xmm8, xmm10);
++ }
+
+- xmm8 = _mm256_add_epi32(xmm8, xmm10);
+- }
+-
+- _mm256_storeu_ps((float*)&(holderf.f), xmm3);
+- _mm256_storeu_si256(&(holderi.int_vec), xmm9);
+-
+- target[0] = holderi.i[0];
+- sq_dist = holderf.f[0];
+- target[0] = (holderf.f[1] > sq_dist) ? holderi.i[1] : target[0];
+- sq_dist = (holderf.f[1] > sq_dist) ? holderf.f[1] : sq_dist;
+- target[0] = (holderf.f[2] > sq_dist) ? holderi.i[2] : target[0];
+- sq_dist = (holderf.f[2] > sq_dist) ? holderf.f[2] : sq_dist;
+- target[0] = (holderf.f[3] > sq_dist) ? holderi.i[3] : target[0];
+- sq_dist = (holderf.f[3] > sq_dist) ? holderf.f[3] : sq_dist;
+- target[0] = (holderf.f[4] > sq_dist) ? holderi.i[4] : target[0];
+- sq_dist = (holderf.f[4] > sq_dist) ? holderf.f[4] : sq_dist;
+- target[0] = (holderf.f[5] > sq_dist) ? holderi.i[5] : target[0];
+- sq_dist = (holderf.f[5] > sq_dist) ? holderf.f[5] : sq_dist;
+- target[0] = (holderf.f[6] > sq_dist) ? holderi.i[6] : target[0];
+- sq_dist = (holderf.f[6] > sq_dist) ? holderf.f[6] : sq_dist;
+- target[0] = (holderf.f[7] > sq_dist) ? holderi.i[7] : target[0];
+- sq_dist = (holderf.f[7] > sq_dist) ? holderf.f[7] : sq_dist;
++ _mm256_storeu_ps((float*)&(holderf.f), xmm3);
++ _mm256_storeu_si256(&(holderi.int_vec), xmm9);
+
++ target[0] = holderi.i[0];
++ sq_dist = holderf.f[0];
++ target[0] = (holderf.f[1] > sq_dist) ? holderi.i[1] : target[0];
++ sq_dist = (holderf.f[1] > sq_dist) ? holderf.f[1] : sq_dist;
++ target[0] = (holderf.f[2] > sq_dist) ? holderi.i[2] : target[0];
++ sq_dist = (holderf.f[2] > sq_dist) ? holderf.f[2] : sq_dist;
++ target[0] = (holderf.f[3] > sq_dist) ? holderi.i[3] : target[0];
++ sq_dist = (holderf.f[3] > sq_dist) ? holderf.f[3] : sq_dist;
++ target[0] = (holderf.f[4] > sq_dist) ? holderi.i[4] : target[0];
++ sq_dist = (holderf.f[4] > sq_dist) ? holderf.f[4] : sq_dist;
++ target[0] = (holderf.f[5] > sq_dist) ? holderi.i[5] : target[0];
++ sq_dist = (holderf.f[5] > sq_dist) ? holderf.f[5] : sq_dist;
++ target[0] = (holderf.f[6] > sq_dist) ? holderi.i[6] : target[0];
++ sq_dist = (holderf.f[6] > sq_dist) ? holderf.f[6] : sq_dist;
++ target[0] = (holderf.f[7] > sq_dist) ? holderi.i[7] : target[0];
++ sq_dist = (holderf.f[7] > sq_dist) ? holderf.f[7] : sq_dist;
+ }
+
+ #endif /*LV_HAVE_AVX2*/
+diff --git a/kernels/volk/volk_32fc_index_max_32u.h b/kernels/volk/volk_32fc_index_max_32u.h
+index 67a3faa..7756fc6 100644
+--- a/kernels/volk/volk_32fc_index_max_32u.h
++++ b/kernels/volk/volk_32fc_index_max_32u.h
+@@ -70,309 +70,314 @@
+ #ifndef INCLUDED_volk_32fc_index_max_32u_a_H
+ #define INCLUDED_volk_32fc_index_max_32u_a_H
+
++#include <inttypes.h>
++#include <stdio.h>
+ #include <volk/volk_common.h>
+-#include<inttypes.h>
+-#include<stdio.h>
+-#include<volk/volk_complex.h>
++#include <volk/volk_complex.h>
+
+ #ifdef LV_HAVE_AVX2
+-#include<immintrin.h>
++#include <immintrin.h>
+
+ static inline void
+-volk_32fc_index_max_32u_a_avx2(uint32_t* target, lv_32fc_t* src0,
+- uint32_t num_points)
++volk_32fc_index_max_32u_a_avx2(uint32_t* target, lv_32fc_t* src0, uint32_t num_points)
+ {
+- const uint32_t num_bytes = num_points*8;
++ const uint32_t num_bytes = num_points * 8;
+
+- union bit256 holderf;
+- union bit256 holderi;
+- float sq_dist = 0.0;
++ union bit256 holderf;
++ union bit256 holderi;
++ float sq_dist = 0.0;
+
+- union bit256 xmm5, xmm4;
+- __m256 xmm1, xmm2, xmm3;
+- __m256i xmm8, xmm11, xmm12, xmmfive, xmmfour, xmm9, holder0, holder1, xmm10;
++ union bit256 xmm5, xmm4;
++ __m256 xmm1, xmm2, xmm3;
++ __m256i xmm8, xmm11, xmm12, xmmfive, xmmfour, xmm9, holder0, holder1, xmm10;
+
+- xmm5.int_vec = xmmfive = _mm256_setzero_si256();
+- xmm4.int_vec = xmmfour = _mm256_setzero_si256();
+- holderf.int_vec = holder0 = _mm256_setzero_si256();
+- holderi.int_vec = holder1 = _mm256_setzero_si256();
++ xmm5.int_vec = xmmfive = _mm256_setzero_si256();
++ xmm4.int_vec = xmmfour = _mm256_setzero_si256();
++ holderf.int_vec = holder0 = _mm256_setzero_si256();
++ holderi.int_vec = holder1 = _mm256_setzero_si256();
+
+- int bound = num_bytes >> 6;
+- int i = 0;
++ int bound = num_bytes >> 6;
++ int i = 0;
+
+- xmm8 = _mm256_set_epi32(7,6,5,4,3, 2, 1, 0);
+- xmm9 = _mm256_setzero_si256();
+- xmm10 = _mm256_set1_epi32(8);
+- xmm3 = _mm256_setzero_ps();
+- __m256i idx = _mm256_set_epi32(7,6,3,2,5,4,1,0);
++ xmm8 = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
++ xmm9 = _mm256_setzero_si256();
++ xmm10 = _mm256_set1_epi32(8);
++ xmm3 = _mm256_setzero_ps();
++ __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
+
+- for(; i < bound; ++i) {
+- xmm1 = _mm256_load_ps((float*)src0);
+- xmm2 = _mm256_load_ps((float*)&src0[4]);
++ for (; i < bound; ++i) {
++ xmm1 = _mm256_load_ps((float*)src0);
++ xmm2 = _mm256_load_ps((float*)&src0[4]);
+
+- src0 += 8;
++ src0 += 8;
+
+- xmm1 = _mm256_mul_ps(xmm1, xmm1);
+- xmm2 = _mm256_mul_ps(xmm2, xmm2);
++ xmm1 = _mm256_mul_ps(xmm1, xmm1);
++ xmm2 = _mm256_mul_ps(xmm2, xmm2);
+
+- xmm1 = _mm256_hadd_ps(xmm1, xmm2);
+- xmm1 = _mm256_permutevar8x32_ps(xmm1, idx);
++ xmm1 = _mm256_hadd_ps(xmm1, xmm2);
++ xmm1 = _mm256_permutevar8x32_ps(xmm1, idx);
+
+- xmm3 = _mm256_max_ps(xmm1, xmm3);
++ xmm3 = _mm256_max_ps(xmm1, xmm3);
+
+- xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS);
+- xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ);
++ xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS);
++ xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ);
+
+- xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec);
+- xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec);
++ xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec);
++ xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec);
+
+- xmm9 = _mm256_add_epi32(xmm11, xmm12);
++ xmm9 = _mm256_add_epi32(xmm11, xmm12);
+
+- xmm8 = _mm256_add_epi32(xmm8, xmm10);
+- }
+-
+- xmm10 = _mm256_set1_epi32(4);
+- if (num_bytes >> 5 & 1) {
+- xmm1 = _mm256_load_ps((float*)src0);
+-
+- xmm1 = _mm256_mul_ps(xmm1, xmm1);
++ xmm8 = _mm256_add_epi32(xmm8, xmm10);
++ }
+
+- src0 += 4;
++ xmm10 = _mm256_set1_epi32(4);
++ if (num_bytes >> 5 & 1) {
++ xmm1 = _mm256_load_ps((float*)src0);
+
+- xmm1 = _mm256_hadd_ps(xmm1, xmm1);
++ xmm1 = _mm256_mul_ps(xmm1, xmm1);
+
+- xmm3 = _mm256_max_ps(xmm1, xmm3);
++ src0 += 4;
+
+- xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS);
+- xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ);
++ xmm1 = _mm256_hadd_ps(xmm1, xmm1);
+
+- xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec);
+- xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec);
++ xmm3 = _mm256_max_ps(xmm1, xmm3);
+
+- xmm9 = _mm256_add_epi32(xmm11, xmm12);
++ xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS);
++ xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ);
+
+- xmm8 = _mm256_add_epi32(xmm8, xmm10);
+- }
++ xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec);
++ xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec);
+
+- idx = _mm256_set_epi32(1,0,1,0,1,0,1,0);
+- xmm10 = _mm256_set1_epi32(2);
+- if (num_bytes >> 4 & 1) {
+- xmm2 = _mm256_load_ps((float*)src0);
++ xmm9 = _mm256_add_epi32(xmm11, xmm12);
+
+- xmm1 = _mm256_permutevar8x32_ps(bit256_p(&xmm8)->float_vec, idx);
+- xmm8 = bit256_p(&xmm1)->int_vec;
++ xmm8 = _mm256_add_epi32(xmm8, xmm10);
++ }
+
+- xmm2 = _mm256_mul_ps(xmm2, xmm2);
++ idx = _mm256_set_epi32(1, 0, 1, 0, 1, 0, 1, 0);
++ xmm10 = _mm256_set1_epi32(2);
++ if (num_bytes >> 4 & 1) {
++ xmm2 = _mm256_load_ps((float*)src0);
+
+- src0 += 2;
++ xmm1 = _mm256_permutevar8x32_ps(bit256_p(&xmm8)->float_vec, idx);
++ xmm8 = bit256_p(&xmm1)->int_vec;
+
+- xmm1 = _mm256_hadd_ps(xmm2, xmm2);
++ xmm2 = _mm256_mul_ps(xmm2, xmm2);
+
+- xmm3 = _mm256_max_ps(xmm1, xmm3);
++ src0 += 2;
+
+- xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS);
+- xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ);
++ xmm1 = _mm256_hadd_ps(xmm2, xmm2);
+
+- xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec);
+- xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec);
++ xmm3 = _mm256_max_ps(xmm1, xmm3);
+
+- xmm9 = _mm256_add_epi32(xmm11, xmm12);
++ xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS);
++ xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ);
+
+- xmm8 = _mm256_add_epi32(xmm8, xmm10);
+- }
++ xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec);
++ xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec);
+
+- _mm256_store_ps((float*)&(holderf.f), xmm3);
+- _mm256_store_si256(&(holderi.int_vec), xmm9);
++ xmm9 = _mm256_add_epi32(xmm11, xmm12);
+
+- target[0] = holderi.i[0];
+- sq_dist = holderf.f[0];
+- target[0] = (holderf.f[1] > sq_dist) ? holderi.i[1] : target[0];
+- sq_dist = (holderf.f[1] > sq_dist) ? holderf.f[1] : sq_dist;
+- target[0] = (holderf.f[2] > sq_dist) ? holderi.i[2] : target[0];
+- sq_dist = (holderf.f[2] > sq_dist) ? holderf.f[2] : sq_dist;
+- target[0] = (holderf.f[3] > sq_dist) ? holderi.i[3] : target[0];
+- sq_dist = (holderf.f[3] > sq_dist) ? holderf.f[3] : sq_dist;
+- target[0] = (holderf.f[4] > sq_dist) ? holderi.i[4] : target[0];
+- sq_dist = (holderf.f[4] > sq_dist) ? holderf.f[4] : sq_dist;
+- target[0] = (holderf.f[5] > sq_dist) ? holderi.i[5] : target[0];
+- sq_dist = (holderf.f[5] > sq_dist) ? holderf.f[5] : sq_dist;
+- target[0] = (holderf.f[6] > sq_dist) ? holderi.i[6] : target[0];
+- sq_dist = (holderf.f[6] > sq_dist) ? holderf.f[6] : sq_dist;
+- target[0] = (holderf.f[7] > sq_dist) ? holderi.i[7] : target[0];
+- sq_dist = (holderf.f[7] > sq_dist) ? holderf.f[7] : sq_dist;
++ xmm8 = _mm256_add_epi32(xmm8, xmm10);
++ }
+
++ _mm256_store_ps((float*)&(holderf.f), xmm3);
++ _mm256_store_si256(&(holderi.int_vec), xmm9);
++
++ target[0] = holderi.i[0];
++ sq_dist = holderf.f[0];
++ target[0] = (holderf.f[1] > sq_dist) ? holderi.i[1] : target[0];
++ sq_dist = (holderf.f[1] > sq_dist) ? holderf.f[1] : sq_dist;
++ target[0] = (holderf.f[2] > sq_dist) ? holderi.i[2] : target[0];
++ sq_dist = (holderf.f[2] > sq_dist) ? holderf.f[2] : sq_dist;
++ target[0] = (holderf.f[3] > sq_dist) ? holderi.i[3] : target[0];
++ sq_dist = (holderf.f[3] > sq_dist) ? holderf.f[3] : sq_dist;
++ target[0] = (holderf.f[4] > sq_dist) ? holderi.i[4] : target[0];
++ sq_dist = (holderf.f[4] > sq_dist) ? holderf.f[4] : sq_dist;
++ target[0] = (holderf.f[5] > sq_dist) ? holderi.i[5] : target[0];
++ sq_dist = (holderf.f[5] > sq_dist) ? holderf.f[5] : sq_dist;
++ target[0] = (holderf.f[6] > sq_dist) ? holderi.i[6] : target[0];
++ sq_dist = (holderf.f[6] > sq_dist) ? holderf.f[6] : sq_dist;
++ target[0] = (holderf.f[7] > sq_dist) ? holderi.i[7] : target[0];
++ sq_dist = (holderf.f[7] > sq_dist) ? holderf.f[7] : sq_dist;
+ }
+
+ #endif /*LV_HAVE_AVX2*/
+
+ #ifdef LV_HAVE_SSE3
+-#include<xmmintrin.h>
+-#include<pmmintrin.h>
++#include <pmmintrin.h>
++#include <xmmintrin.h>
+
+ static inline void
+-volk_32fc_index_max_32u_a_sse3(uint32_t* target, lv_32fc_t* src0,
+- uint32_t num_points)
++volk_32fc_index_max_32u_a_sse3(uint32_t* target, lv_32fc_t* src0, uint32_t num_points)
+ {
+- const uint32_t num_bytes = num_points*8;
+-
+- union bit128 holderf;
+- union bit128 holderi;
+- float sq_dist = 0.0;
+-
+- union bit128 xmm5, xmm4;
+- __m128 xmm1, xmm2, xmm3;
+- __m128i xmm8, xmm11, xmm12, xmmfive, xmmfour, xmm9, holder0, holder1, xmm10;
++ const uint32_t num_bytes = num_points * 8;
+
+- xmm5.int_vec = xmmfive = _mm_setzero_si128();
+- xmm4.int_vec = xmmfour = _mm_setzero_si128();
+- holderf.int_vec = holder0 = _mm_setzero_si128();
+- holderi.int_vec = holder1 = _mm_setzero_si128();
++ union bit128 holderf;
++ union bit128 holderi;
++ float sq_dist = 0.0;
+
+- int bound = num_bytes >> 5;
+- int i = 0;
++ union bit128 xmm5, xmm4;
++ __m128 xmm1, xmm2, xmm3;
++ __m128i xmm8, xmm11, xmm12, xmmfive, xmmfour, xmm9, holder0, holder1, xmm10;
+
+- xmm8 = _mm_set_epi32(3, 2, 1, 0);//remember the crazy reverse order!
+- xmm9 = _mm_setzero_si128();
+- xmm10 = _mm_set_epi32(4, 4, 4, 4);
+- xmm3 = _mm_setzero_ps();
++ xmm5.int_vec = xmmfive = _mm_setzero_si128();
++ xmm4.int_vec = xmmfour = _mm_setzero_si128();
++ holderf.int_vec = holder0 = _mm_setzero_si128();
++ holderi.int_vec = holder1 = _mm_setzero_si128();
+
+- //printf("%f, %f, %f, %f\n", ((float*)&xmm10)[0], ((float*)&xmm10)[1], ((float*)&xmm10)[2], ((float*)&xmm10)[3]);
++ int bound = num_bytes >> 5;
++ int i = 0;
+
+- for(; i < bound; ++i) {
+- xmm1 = _mm_load_ps((float*)src0);
+- xmm2 = _mm_load_ps((float*)&src0[2]);
++ xmm8 = _mm_set_epi32(3, 2, 1, 0); // remember the crazy reverse order!
++ xmm9 = _mm_setzero_si128();
++ xmm10 = _mm_set_epi32(4, 4, 4, 4);
++ xmm3 = _mm_setzero_ps();
+
+- src0 += 4;
++ // printf("%f, %f, %f, %f\n", ((float*)&xmm10)[0], ((float*)&xmm10)[1],
++ // ((float*)&xmm10)[2], ((float*)&xmm10)[3]);
+
+- xmm1 = _mm_mul_ps(xmm1, xmm1);
+- xmm2 = _mm_mul_ps(xmm2, xmm2);
++ for (; i < bound; ++i) {
++ xmm1 = _mm_load_ps((float*)src0);
++ xmm2 = _mm_load_ps((float*)&src0[2]);
+
+- xmm1 = _mm_hadd_ps(xmm1, xmm2);
++ src0 += 4;
+
+- xmm3 = _mm_max_ps(xmm1, xmm3);
++ xmm1 = _mm_mul_ps(xmm1, xmm1);
++ xmm2 = _mm_mul_ps(xmm2, xmm2);
+
+- xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3);
+- xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);
++ xmm1 = _mm_hadd_ps(xmm1, xmm2);
+
+- xmm11 = _mm_and_si128(xmm8, xmm5.int_vec);
+- xmm12 = _mm_and_si128(xmm9, xmm4.int_vec);
++ xmm3 = _mm_max_ps(xmm1, xmm3);
+
+- xmm9 = _mm_add_epi32(xmm11, xmm12);
++ xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3);
++ xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);
+
+- xmm8 = _mm_add_epi32(xmm8, xmm10);
++ xmm11 = _mm_and_si128(xmm8, xmm5.int_vec);
++ xmm12 = _mm_and_si128(xmm9, xmm4.int_vec);
+
+- //printf("%f, %f, %f, %f\n", ((float*)&xmm3)[0], ((float*)&xmm3)[1], ((float*)&xmm3)[2], ((float*)&xmm3)[3]);
+- //printf("%u, %u, %u, %u\n", ((uint32_t*)&xmm10)[0], ((uint32_t*)&xmm10)[1], ((uint32_t*)&xmm10)[2], ((uint32_t*)&xmm10)[3]);
+- }
++ xmm9 = _mm_add_epi32(xmm11, xmm12);
+
++ xmm8 = _mm_add_epi32(xmm8, xmm10);
+
+- if (num_bytes >> 4 & 1) {
+- xmm2 = _mm_load_ps((float*)src0);
+-
+- xmm1 = _mm_movelh_ps(bit128_p(&xmm8)->float_vec, bit128_p(&xmm8)->float_vec);
+- xmm8 = bit128_p(&xmm1)->int_vec;
+-
+- xmm2 = _mm_mul_ps(xmm2, xmm2);
+-
+- src0 += 2;
+-
+- xmm1 = _mm_hadd_ps(xmm2, xmm2);
++ // printf("%f, %f, %f, %f\n", ((float*)&xmm3)[0], ((float*)&xmm3)[1],
++ // ((float*)&xmm3)[2], ((float*)&xmm3)[3]); printf("%u, %u, %u, %u\n",
++ // ((uint32_t*)&xmm10)[0], ((uint32_t*)&xmm10)[1], ((uint32_t*)&xmm10)[2],
++ // ((uint32_t*)&xmm10)[3]);
++ }
+
+- xmm3 = _mm_max_ps(xmm1, xmm3);
+
+- xmm10 = _mm_set_epi32(2, 2, 2, 2);//load1_ps((float*)&init[2]);
++ if (num_bytes >> 4 & 1) {
++ xmm2 = _mm_load_ps((float*)src0);
+
+- xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3);
+- xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);
++ xmm1 = _mm_movelh_ps(bit128_p(&xmm8)->float_vec, bit128_p(&xmm8)->float_vec);
++ xmm8 = bit128_p(&xmm1)->int_vec;
+
+- xmm11 = _mm_and_si128(xmm8, xmm5.int_vec);
+- xmm12 = _mm_and_si128(xmm9, xmm4.int_vec);
++ xmm2 = _mm_mul_ps(xmm2, xmm2);
+
+- xmm9 = _mm_add_epi32(xmm11, xmm12);
++ src0 += 2;
+
+- xmm8 = _mm_add_epi32(xmm8, xmm10);
+- //printf("egads%u, %u, %u, %u\n", ((uint32_t*)&xmm9)[0], ((uint32_t*)&xmm9)[1], ((uint32_t*)&xmm9)[2], ((uint32_t*)&xmm9)[3]);
+- }
++ xmm1 = _mm_hadd_ps(xmm2, xmm2);
+
+- if (num_bytes >> 3 & 1) {
+- //printf("%u, %u, %u, %u\n", ((uint32_t*)&xmm9)[0], ((uint32_t*)&xmm9)[1], ((uint32_t*)&xmm9)[2], ((uint32_t*)&xmm9)[3]);
++ xmm3 = _mm_max_ps(xmm1, xmm3);
+
+- sq_dist = lv_creal(src0[0]) * lv_creal(src0[0]) + lv_cimag(src0[0]) * lv_cimag(src0[0]);
++ xmm10 = _mm_set_epi32(2, 2, 2, 2); // load1_ps((float*)&init[2]);
+
+- xmm2 = _mm_load1_ps(&sq_dist);
++ xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3);
++ xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);
+
+- xmm1 = xmm3;
++ xmm11 = _mm_and_si128(xmm8, xmm5.int_vec);
++ xmm12 = _mm_and_si128(xmm9, xmm4.int_vec);
+
+- xmm3 = _mm_max_ss(xmm3, xmm2);
++ xmm9 = _mm_add_epi32(xmm11, xmm12);
+
+- xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3);
+- xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);
++ xmm8 = _mm_add_epi32(xmm8, xmm10);
++ // printf("egads%u, %u, %u, %u\n", ((uint32_t*)&xmm9)[0], ((uint32_t*)&xmm9)[1],
++ // ((uint32_t*)&xmm9)[2], ((uint32_t*)&xmm9)[3]);
++ }
+
+- xmm8 = _mm_shuffle_epi32(xmm8, 0x00);
++ if (num_bytes >> 3 & 1) {
++ // printf("%u, %u, %u, %u\n", ((uint32_t*)&xmm9)[0], ((uint32_t*)&xmm9)[1],
++ // ((uint32_t*)&xmm9)[2], ((uint32_t*)&xmm9)[3]);
+
+- xmm11 = _mm_and_si128(xmm8, xmm4.int_vec);
+- xmm12 = _mm_and_si128(xmm9, xmm5.int_vec);
++ sq_dist =
++ lv_creal(src0[0]) * lv_creal(src0[0]) + lv_cimag(src0[0]) * lv_cimag(src0[0]);
+
+- xmm9 = _mm_add_epi32(xmm11, xmm12);
+- }
++ xmm2 = _mm_load1_ps(&sq_dist);
+
+- //printf("%f, %f, %f, %f\n", ((float*)&xmm3)[0], ((float*)&xmm3)[1], ((float*)&xmm3)[2], ((float*)&xmm3)[3]);
+- //printf("%u, %u, %u, %u\n", ((uint32_t*)&xmm9)[0], ((uint32_t*)&xmm9)[1], ((uint32_t*)&xmm9)[2], ((uint32_t*)&xmm9)[3]);
++ xmm1 = xmm3;
+
+- _mm_store_ps((float*)&(holderf.f), xmm3);
+- _mm_store_si128(&(holderi.int_vec), xmm9);
++ xmm3 = _mm_max_ss(xmm3, xmm2);
+
+- target[0] = holderi.i[0];
+- sq_dist = holderf.f[0];
+- target[0] = (holderf.f[1] > sq_dist) ? holderi.i[1] : target[0];
+- sq_dist = (holderf.f[1] > sq_dist) ? holderf.f[1] : sq_dist;
+- target[0] = (holderf.f[2] > sq_dist) ? holderi.i[2] : target[0];
+- sq_dist = (holderf.f[2] > sq_dist) ? holderf.f[2] : sq_dist;
+- target[0] = (holderf.f[3] > sq_dist) ? holderi.i[3] : target[0];
+- sq_dist = (holderf.f[3] > sq_dist) ? holderf.f[3] : sq_dist;
++ xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3);
++ xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);
+
+- /*
+- float placeholder = 0.0;
+- uint32_t temp0, temp1;
+- uint32_t g0 = (((float*)&xmm3)[0] > ((float*)&xmm3)[1]);
+- uint32_t l0 = g0 ^ 1;
++ xmm8 = _mm_shuffle_epi32(xmm8, 0x00);
+
+- uint32_t g1 = (((float*)&xmm3)[1] > ((float*)&xmm3)[2]);
+- uint32_t l1 = g1 ^ 1;
++ xmm11 = _mm_and_si128(xmm8, xmm4.int_vec);
++ xmm12 = _mm_and_si128(xmm9, xmm5.int_vec);
+
+- temp0 = g0 * ((uint32_t*)&xmm9)[0] + l0 * ((uint32_t*)&xmm9)[1];
+- temp1 = g0 * ((uint32_t*)&xmm9)[2] + l0 * ((uint32_t*)&xmm9)[3];
+- sq_dist = g0 * ((float*)&xmm3)[0] + l0 * ((float*)&xmm3)[1];
+- placeholder = g0 * ((float*)&xmm3)[2] + l0 * ((float*)&xmm3)[3];
++ xmm9 = _mm_add_epi32(xmm11, xmm12);
++ }
+
+- g0 = (sq_dist > placeholder);
+- l0 = g0 ^ 1;
+- target[0] = g0 * temp0 + l0 * temp1;
+- */
++ // printf("%f, %f, %f, %f\n", ((float*)&xmm3)[0], ((float*)&xmm3)[1],
++ // ((float*)&xmm3)[2], ((float*)&xmm3)[3]); printf("%u, %u, %u, %u\n",
++ // ((uint32_t*)&xmm9)[0], ((uint32_t*)&xmm9)[1], ((uint32_t*)&xmm9)[2],
++ // ((uint32_t*)&xmm9)[3]);
++
++ _mm_store_ps((float*)&(holderf.f), xmm3);
++ _mm_store_si128(&(holderi.int_vec), xmm9);
++
++ target[0] = holderi.i[0];
++ sq_dist = holderf.f[0];
++ target[0] = (holderf.f[1] > sq_dist) ? holderi.i[1] : target[0];
++ sq_dist = (holderf.f[1] > sq_dist) ? holderf.f[1] : sq_dist;
++ target[0] = (holderf.f[2] > sq_dist) ? holderi.i[2] : target[0];
++ sq_dist = (holderf.f[2] > sq_dist) ? holderf.f[2] : sq_dist;
++ target[0] = (holderf.f[3] > sq_dist) ? holderi.i[3] : target[0];
++ sq_dist = (holderf.f[3] > sq_dist) ? holderf.f[3] : sq_dist;
++
++ /*
++ float placeholder = 0.0;
++ uint32_t temp0, temp1;
++ uint32_t g0 = (((float*)&xmm3)[0] > ((float*)&xmm3)[1]);
++ uint32_t l0 = g0 ^ 1;
++
++ uint32_t g1 = (((float*)&xmm3)[1] > ((float*)&xmm3)[2]);
++ uint32_t l1 = g1 ^ 1;
++
++ temp0 = g0 * ((uint32_t*)&xmm9)[0] + l0 * ((uint32_t*)&xmm9)[1];
++ temp1 = g0 * ((uint32_t*)&xmm9)[2] + l0 * ((uint32_t*)&xmm9)[3];
++ sq_dist = g0 * ((float*)&xmm3)[0] + l0 * ((float*)&xmm3)[1];
++ placeholder = g0 * ((float*)&xmm3)[2] + l0 * ((float*)&xmm3)[3];
++
++ g0 = (sq_dist > placeholder);
++ l0 = g0 ^ 1;
++ target[0] = g0 * temp0 + l0 * temp1;
++ */
+ }
+
+ #endif /*LV_HAVE_SSE3*/
+
+ #ifdef LV_HAVE_GENERIC
+ static inline void
+- volk_32fc_index_max_32u_generic(uint32_t* target, lv_32fc_t* src0,
+- uint32_t num_points)
++volk_32fc_index_max_32u_generic(uint32_t* target, lv_32fc_t* src0, uint32_t num_points)
+ {
+- const uint32_t num_bytes = num_points*8;
++ const uint32_t num_bytes = num_points * 8;
+
+- float sq_dist = 0.0;
+- float max = 0.0;
+- uint32_t index = 0;
++ float sq_dist = 0.0;
++ float max = 0.0;
++ uint32_t index = 0;
+
+- uint32_t i = 0;
++ uint32_t i = 0;
+
+- for(; i < num_bytes >> 3; ++i) {
+- sq_dist = lv_creal(src0[i]) * lv_creal(src0[i]) + lv_cimag(src0[i]) * lv_cimag(src0[i]);
++ for (; i<num_bytes>> 3; ++i) {
++ sq_dist =
++ lv_creal(src0[i]) * lv_creal(src0[i]) + lv_cimag(src0[i]) * lv_cimag(src0[i]);
+
+- index = sq_dist > max ? i : index;
+- max = sq_dist > max ? sq_dist : max;
+- }
+- target[0] = index;
++ index = sq_dist > max ? i : index;
++ max = sq_dist > max ? sq_dist : max;
++ }
++ target[0] = index;
+ }
+
+ #endif /*LV_HAVE_GENERIC*/
+@@ -384,137 +389,135 @@ static inline void
+ #ifndef INCLUDED_volk_32fc_index_max_32u_u_H
+ #define INCLUDED_volk_32fc_index_max_32u_u_H
+
++#include <inttypes.h>
++#include <stdio.h>
+ #include <volk/volk_common.h>
+-#include<inttypes.h>
+-#include<stdio.h>
+-#include<volk/volk_complex.h>
++#include <volk/volk_complex.h>
+
+ #ifdef LV_HAVE_AVX2
+-#include<immintrin.h>
++#include <immintrin.h>
+
+ static inline void
+-volk_32fc_index_max_32u_u_avx2(uint32_t* target, lv_32fc_t* src0,
+- uint32_t num_points)
++volk_32fc_index_max_32u_u_avx2(uint32_t* target, lv_32fc_t* src0, uint32_t num_points)
+ {
+- const uint32_t num_bytes = num_points*8;
+-
+- union bit256 holderf;
+- union bit256 holderi;
+- float sq_dist = 0.0;
++ const uint32_t num_bytes = num_points * 8;
+
+- union bit256 xmm5, xmm4;
+- __m256 xmm1, xmm2, xmm3;
+- __m256i xmm8, xmm11, xmm12, xmmfive, xmmfour, xmm9, holder0, holder1, xmm10;
++ union bit256 holderf;
++ union bit256 holderi;
++ float sq_dist = 0.0;
+
+- xmm5.int_vec = xmmfive = _mm256_setzero_si256();
+- xmm4.int_vec = xmmfour = _mm256_setzero_si256();
+- holderf.int_vec = holder0 = _mm256_setzero_si256();
+- holderi.int_vec = holder1 = _mm256_setzero_si256();
++ union bit256 xmm5, xmm4;
++ __m256 xmm1, xmm2, xmm3;
++ __m256i xmm8, xmm11, xmm12, xmmfive, xmmfour, xmm9, holder0, holder1, xmm10;
+
+- int bound = num_bytes >> 6;
+- int i = 0;
++ xmm5.int_vec = xmmfive = _mm256_setzero_si256();
++ xmm4.int_vec = xmmfour = _mm256_setzero_si256();
++ holderf.int_vec = holder0 = _mm256_setzero_si256();
++ holderi.int_vec = holder1 = _mm256_setzero_si256();
+
+- xmm8 = _mm256_set_epi32(7,6,5,4,3, 2, 1, 0);
+- xmm9 = _mm256_setzero_si256();
+- xmm10 = _mm256_set1_epi32(8);
+- xmm3 = _mm256_setzero_ps();
+- __m256i idx = _mm256_set_epi32(7,6,3,2,5,4,1,0);
++ int bound = num_bytes >> 6;
++ int i = 0;
+
+- for(; i < bound; ++i) {
+- xmm1 = _mm256_loadu_ps((float*)src0);
+- xmm2 = _mm256_loadu_ps((float*)&src0[4]);
++ xmm8 = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
++ xmm9 = _mm256_setzero_si256();
++ xmm10 = _mm256_set1_epi32(8);
++ xmm3 = _mm256_setzero_ps();
++ __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
+
+- src0 += 8;
++ for (; i < bound; ++i) {
++ xmm1 = _mm256_loadu_ps((float*)src0);
++ xmm2 = _mm256_loadu_ps((float*)&src0[4]);
+
+- xmm1 = _mm256_mul_ps(xmm1, xmm1);
+- xmm2 = _mm256_mul_ps(xmm2, xmm2);
++ src0 += 8;
+
+- xmm1 = _mm256_hadd_ps(xmm1, xmm2);
+- xmm1 = _mm256_permutevar8x32_ps(xmm1, idx);
++ xmm1 = _mm256_mul_ps(xmm1, xmm1);
++ xmm2 = _mm256_mul_ps(xmm2, xmm2);
+
+- xmm3 = _mm256_max_ps(xmm1, xmm3);
++ xmm1 = _mm256_hadd_ps(xmm1, xmm2);
++ xmm1 = _mm256_permutevar8x32_ps(xmm1, idx);
+
+- xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS);
+- xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ);
++ xmm3 = _mm256_max_ps(xmm1, xmm3);
+
+- xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec);
+- xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec);
++ xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS);
++ xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ);
+
+- xmm9 = _mm256_add_epi32(xmm11, xmm12);
++ xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec);
++ xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec);
+
+- xmm8 = _mm256_add_epi32(xmm8, xmm10);
+- }
++ xmm9 = _mm256_add_epi32(xmm11, xmm12);
+
+- xmm10 = _mm256_set1_epi32(4);
+- if (num_bytes >> 5 & 1) {
+- xmm1 = _mm256_loadu_ps((float*)src0);
+-
+- xmm1 = _mm256_mul_ps(xmm1, xmm1);
++ xmm8 = _mm256_add_epi32(xmm8, xmm10);
++ }
+
+- src0 += 4;
++ xmm10 = _mm256_set1_epi32(4);
++ if (num_bytes >> 5 & 1) {
++ xmm1 = _mm256_loadu_ps((float*)src0);
+
+- xmm1 = _mm256_hadd_ps(xmm1, xmm1);
++ xmm1 = _mm256_mul_ps(xmm1, xmm1);
+
+- xmm3 = _mm256_max_ps(xmm1, xmm3);
++ src0 += 4;
+
+- xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS);
+- xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ);
++ xmm1 = _mm256_hadd_ps(xmm1, xmm1);
+
+- xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec);
+- xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec);
++ xmm3 = _mm256_max_ps(xmm1, xmm3);
+
+- xmm9 = _mm256_add_epi32(xmm11, xmm12);
++ xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS);
++ xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ);
+
+- xmm8 = _mm256_add_epi32(xmm8, xmm10);
+- }
++ xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec);
++ xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec);
+
+- idx = _mm256_set_epi32(1,0,1,0,1,0,1,0);
+- xmm10 = _mm256_set1_epi32(2);
+- if (num_bytes >> 4 & 1) {
+- xmm2 = _mm256_loadu_ps((float*)src0);
++ xmm9 = _mm256_add_epi32(xmm11, xmm12);
+
+- xmm1 = _mm256_permutevar8x32_ps(bit256_p(&xmm8)->float_vec, idx);
+- xmm8 = bit256_p(&xmm1)->int_vec;
++ xmm8 = _mm256_add_epi32(xmm8, xmm10);
++ }
+
+- xmm2 = _mm256_mul_ps(xmm2, xmm2);
++ idx = _mm256_set_epi32(1, 0, 1, 0, 1, 0, 1, 0);
++ xmm10 = _mm256_set1_epi32(2);
++ if (num_bytes >> 4 & 1) {
++ xmm2 = _mm256_loadu_ps((float*)src0);
+
+- src0 += 2;
++ xmm1 = _mm256_permutevar8x32_ps(bit256_p(&xmm8)->float_vec, idx);
++ xmm8 = bit256_p(&xmm1)->int_vec;
+
+- xmm1 = _mm256_hadd_ps(xmm2, xmm2);
++ xmm2 = _mm256_mul_ps(xmm2, xmm2);
+
+- xmm3 = _mm256_max_ps(xmm1, xmm3);
++ src0 += 2;
+
+- xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS);
+- xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ);
++ xmm1 = _mm256_hadd_ps(xmm2, xmm2);
+
+- xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec);
+- xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec);
++ xmm3 = _mm256_max_ps(xmm1, xmm3);
+
+- xmm9 = _mm256_add_epi32(xmm11, xmm12);
++ xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS);
++ xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ);
+
+- xmm8 = _mm256_add_epi32(xmm8, xmm10);
+- }
++ xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec);
++ xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec);
+
+- _mm256_storeu_ps((float*)&(holderf.f), xmm3);
+- _mm256_storeu_si256(&(holderi.int_vec), xmm9);
++ xmm9 = _mm256_add_epi32(xmm11, xmm12);
+
+- target[0] = holderi.i[0];
+- sq_dist = holderf.f[0];
+- target[0] = (holderf.f[1] > sq_dist) ? holderi.i[1] : target[0];
+- sq_dist = (holderf.f[1] > sq_dist) ? holderf.f[1] : sq_dist;
+- target[0] = (holderf.f[2] > sq_dist) ? holderi.i[2] : target[0];
+- sq_dist = (holderf.f[2] > sq_dist) ? holderf.f[2] : sq_dist;
+- target[0] = (holderf.f[3] > sq_dist) ? holderi.i[3] : target[0];
+- sq_dist = (holderf.f[3] > sq_dist) ? holderf.f[3] : sq_dist;
+- target[0] = (holderf.f[4] > sq_dist) ? holderi.i[4] : target[0];
+- sq_dist = (holderf.f[4] > sq_dist) ? holderf.f[4] : sq_dist;
+- target[0] = (holderf.f[5] > sq_dist) ? holderi.i[5] : target[0];
+- sq_dist = (holderf.f[5] > sq_dist) ? holderf.f[5] : sq_dist;
+- target[0] = (holderf.f[6] > sq_dist) ? holderi.i[6] : target[0];
+- sq_dist = (holderf.f[6] > sq_dist) ? holderf.f[6] : sq_dist;
+- target[0] = (holderf.f[7] > sq_dist) ? holderi.i[7] : target[0];
+- sq_dist = (holderf.f[7] > sq_dist) ? holderf.f[7] : sq_dist;
++ xmm8 = _mm256_add_epi32(xmm8, xmm10);
++ }
+
++ _mm256_storeu_ps((float*)&(holderf.f), xmm3);
++ _mm256_storeu_si256(&(holderi.int_vec), xmm9);
++
++ target[0] = holderi.i[0];
++ sq_dist = holderf.f[0];
++ target[0] = (holderf.f[1] > sq_dist) ? holderi.i[1] : target[0];
++ sq_dist = (holderf.f[1] > sq_dist) ? holderf.f[1] : sq_dist;
++ target[0] = (holderf.f[2] > sq_dist) ? holderi.i[2] : target[0];
++ sq_dist = (holderf.f[2] > sq_dist) ? holderf.f[2] : sq_dist;
++ target[0] = (holderf.f[3] > sq_dist) ? holderi.i[3] : target[0];
++ sq_dist = (holderf.f[3] > sq_dist) ? holderf.f[3] : sq_dist;
++ target[0] = (holderf.f[4] > sq_dist) ? holderi.i[4] : target[0];
++ sq_dist = (holderf.f[4] > sq_dist) ? holderf.f[4] : sq_dist;
++ target[0] = (holderf.f[5] > sq_dist) ? holderi.i[5] : target[0];
++ sq_dist = (holderf.f[5] > sq_dist) ? holderf.f[5] : sq_dist;
++ target[0] = (holderf.f[6] > sq_dist) ? holderi.i[6] : target[0];
++ sq_dist = (holderf.f[6] > sq_dist) ? holderf.f[6] : sq_dist;
++ target[0] = (holderf.f[7] > sq_dist) ? holderi.i[7] : target[0];
++ sq_dist = (holderf.f[7] > sq_dist) ? holderf.f[7] : sq_dist;
+ }
+
+ #endif /*LV_HAVE_AVX2*/
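
The SSE3 and AVX2 variants above track the winning index branchlessly: after the running maxima are updated, a compare-equal mask selects the new candidate indices and a compare-less mask keeps the old ones. A scalar, single-lane sketch of that update (illustrative only, not part of the patch; the function name is hypothetical):

#include <math.h>
#include <stdint.h>

/* One scalar lane of the branchless index update used by the SSE3/AVX2
 * kernels above: where the new magnitude ties the already-updated running
 * maximum, take the candidate index; where it is smaller, keep the old one. */
static inline void update_max_index(float new_val,
                                    uint32_t new_idx,
                                    float* cur_max,
                                    uint32_t* cur_idx)
{
    *cur_max = fmaxf(new_val, *cur_max);
    const uint32_t eq_mask = (new_val == *cur_max) ? 0xFFFFFFFFu : 0u;
    const uint32_t lt_mask = (new_val < *cur_max) ? 0xFFFFFFFFu : 0u;
    *cur_idx = (new_idx & eq_mask) + (*cur_idx & lt_mask);
}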
+@@ -523,29 +526,29 @@ volk_32fc_index_max_32u_u_avx2(uint32_t* target, lv_32fc_t* src0,
+ #include <arm_neon.h>
+ #include <volk/volk_neon_intrinsics.h>
+
+-static inline void volk_32fc_index_max_32u_neon(uint32_t* target, lv_32fc_t* src0, uint32_t num_points)
++static inline void
++volk_32fc_index_max_32u_neon(uint32_t* target, lv_32fc_t* src0, uint32_t num_points)
+ {
+ unsigned int number = 0;
+ const uint32_t quarter_points = num_points / 4;
+ const lv_32fc_t* src0Ptr = src0;
+-
+- uint32_t indices[4] = {0, 1, 2, 3};
++
++ uint32_t indices[4] = { 0, 1, 2, 3 };
+ const uint32x4_t vec_indices_incr = vdupq_n_u32(4);
+ uint32x4_t vec_indices = vld1q_u32(indices);
+ uint32x4_t vec_max_indices = vec_indices;
+-
+- if(num_points)
+- {
++
++ if (num_points) {
+ float max = *src0Ptr;
+ uint32_t index = 0;
+-
++
+ float32x4_t vec_max = vdupq_n_f32(*src0Ptr);
+-
+- for(;number < quarter_points; number++)
+- {
++
++ for (; number < quarter_points; number++) {
+ // Load complex and compute magnitude squared
+- const float32x4_t vec_mag2 = _vmagnitudesquaredq_f32(vld2q_f32((float*)src0Ptr));
+- __VOLK_PREFETCH(src0Ptr+=4);
++ const float32x4_t vec_mag2 =
++ _vmagnitudesquaredq_f32(vld2q_f32((float*)src0Ptr));
++ __VOLK_PREFETCH(src0Ptr += 4);
+ // a > b?
+ const uint32x4_t gt_mask = vcgtq_f32(vec_mag2, vec_max);
+ vec_max = vbslq_f32(gt_mask, vec_mag2, vec_max);
+@@ -556,20 +559,19 @@ static inline void volk_32fc_index_max_32u_neon(uint32_t* target, lv_32fc_t* src
+ float tmp_max[4];
+ vst1q_u32(tmp_max_indices, vec_max_indices);
+ vst1q_f32(tmp_max, vec_max);
+-
++
+ for (int i = 0; i < 4; i++) {
+ if (tmp_max[i] > max) {
+ max = tmp_max[i];
+ index = tmp_max_indices[i];
+ }
+ }
+-
++
+ // Deal with the rest
+- for(number = quarter_points * 4;number < num_points; number++)
+- {
++ for (number = quarter_points * 4; number < num_points; number++) {
+ const float re = lv_creal(*src0Ptr);
+ const float im = lv_cimag(*src0Ptr);
+- if ((re*re+im*im) > max) {
++ if ((re * re + im * im) > max) {
+ max = *src0Ptr;
+ index = number;
+ }
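
For reference, the generic scan that all of the index-max variants above reduce to can be written as a small standalone C program (hypothetical buffer contents, no VOLK headers; ties keep the earlier index, as in the generic kernel):

#include <complex.h>
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

/* Return the index of the sample with the largest |x|^2. */
static uint32_t index_max_magnitude(const float complex* src, uint32_t num_points)
{
    uint32_t index = 0;
    float max = 0.0f;
    for (uint32_t i = 0; i < num_points; ++i) {
        const float re = crealf(src[i]);
        const float im = cimagf(src[i]);
        const float sq_dist = re * re + im * im;
        if (sq_dist > max) {
            max = sq_dist;
            index = i;
        }
    }
    return index;
}

int main(void)
{
    const float complex v[4] = { 1.0f + 1.0f * I, 3.0f - 4.0f * I, 0.5f, 2.0f * I };
    printf("max-magnitude index: %" PRIu32 "\n", index_max_magnitude(v, 4)); /* prints 1 */
    return 0;
}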
+diff --git a/kernels/volk/volk_32fc_magnitude_32f.h b/kernels/volk/volk_32fc_magnitude_32f.h
+index 1ba6871..6a0a7d8 100644
+--- a/kernels/volk/volk_32fc_magnitude_32f.h
++++ b/kernels/volk/volk_32fc_magnitude_32f.h
+@@ -30,8 +30,8 @@
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+- * void volk_32fc_magnitude_32f(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points)
+- * \endcode
++ * void volk_32fc_magnitude_32f(float* magnitudeVector, const lv_32fc_t* complexVector,
++ * unsigned int num_points) \endcode
+ *
+ * \b Inputs
+ * \li complexVector: The complex input vector.
+@@ -72,41 +72,41 @@
+ #define INCLUDED_volk_32fc_magnitude_32f_u_H
+
+ #include <inttypes.h>
+-#include <stdio.h>
+ #include <math.h>
++#include <stdio.h>
+
+ #ifdef LV_HAVE_AVX
+ #include <immintrin.h>
+ #include <volk/volk_avx_intrinsics.h>
+
+-static inline void
+-volk_32fc_magnitude_32f_u_avx(float* magnitudeVector, const lv_32fc_t* complexVector,
+- unsigned int num_points)
++static inline void volk_32fc_magnitude_32f_u_avx(float* magnitudeVector,
++ const lv_32fc_t* complexVector,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+- const unsigned int eighthPoints = num_points / 8;
+-
+- const float* complexVectorPtr = (float*) complexVector;
+- float* magnitudeVectorPtr = magnitudeVector;
+-
+- __m256 cplxValue1, cplxValue2, result;
+-
+- for(; number < eighthPoints; number++){
+- cplxValue1 = _mm256_loadu_ps(complexVectorPtr);
+- cplxValue2 = _mm256_loadu_ps(complexVectorPtr + 8);
+- result = _mm256_magnitude_ps(cplxValue1, cplxValue2);
+- _mm256_storeu_ps(magnitudeVectorPtr, result);
+-
+- complexVectorPtr += 16;
+- magnitudeVectorPtr += 8;
+- }
+-
+- number = eighthPoints * 8;
+- for(; number < num_points; number++){
+- float val1Real = *complexVectorPtr++;
+- float val1Imag = *complexVectorPtr++;
+- *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag));
+- }
++ unsigned int number = 0;
++ const unsigned int eighthPoints = num_points / 8;
++
++ const float* complexVectorPtr = (float*)complexVector;
++ float* magnitudeVectorPtr = magnitudeVector;
++
++ __m256 cplxValue1, cplxValue2, result;
++
++ for (; number < eighthPoints; number++) {
++ cplxValue1 = _mm256_loadu_ps(complexVectorPtr);
++ cplxValue2 = _mm256_loadu_ps(complexVectorPtr + 8);
++ result = _mm256_magnitude_ps(cplxValue1, cplxValue2);
++ _mm256_storeu_ps(magnitudeVectorPtr, result);
++
++ complexVectorPtr += 16;
++ magnitudeVectorPtr += 8;
++ }
++
++ number = eighthPoints * 8;
++ for (; number < num_points; number++) {
++ float val1Real = *complexVectorPtr++;
++ float val1Imag = *complexVectorPtr++;
++ *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag));
++ }
+ }
+ #endif /* LV_HAVE_AVX */
+
+@@ -114,137 +114,137 @@ volk_32fc_magnitude_32f_u_avx(float* magnitudeVector, const lv_32fc_t* complexVe
+ #include <pmmintrin.h>
+ #include <volk/volk_sse3_intrinsics.h>
+
+-static inline void
+-volk_32fc_magnitude_32f_u_sse3(float* magnitudeVector, const lv_32fc_t* complexVector,
+- unsigned int num_points)
++static inline void volk_32fc_magnitude_32f_u_sse3(float* magnitudeVector,
++ const lv_32fc_t* complexVector,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+- const unsigned int quarterPoints = num_points / 4;
++ unsigned int number = 0;
++ const unsigned int quarterPoints = num_points / 4;
+
+- const float* complexVectorPtr = (float*) complexVector;
+- float* magnitudeVectorPtr = magnitudeVector;
++ const float* complexVectorPtr = (float*)complexVector;
++ float* magnitudeVectorPtr = magnitudeVector;
+
+- __m128 cplxValue1, cplxValue2, result;
+- for(; number < quarterPoints; number++){
+- cplxValue1 = _mm_loadu_ps(complexVectorPtr);
+- complexVectorPtr += 4;
++ __m128 cplxValue1, cplxValue2, result;
++ for (; number < quarterPoints; number++) {
++ cplxValue1 = _mm_loadu_ps(complexVectorPtr);
++ complexVectorPtr += 4;
+
+- cplxValue2 = _mm_loadu_ps(complexVectorPtr);
+- complexVectorPtr += 4;
++ cplxValue2 = _mm_loadu_ps(complexVectorPtr);
++ complexVectorPtr += 4;
+
+- result = _mm_magnitude_ps_sse3(cplxValue1, cplxValue2);
++ result = _mm_magnitude_ps_sse3(cplxValue1, cplxValue2);
+
+- _mm_storeu_ps(magnitudeVectorPtr, result);
+- magnitudeVectorPtr += 4;
+- }
++ _mm_storeu_ps(magnitudeVectorPtr, result);
++ magnitudeVectorPtr += 4;
++ }
+
+- number = quarterPoints * 4;
+- for(; number < num_points; number++){
+- float val1Real = *complexVectorPtr++;
+- float val1Imag = *complexVectorPtr++;
+- *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag));
+- }
++ number = quarterPoints * 4;
++ for (; number < num_points; number++) {
++ float val1Real = *complexVectorPtr++;
++ float val1Imag = *complexVectorPtr++;
++ *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag));
++ }
+ }
+ #endif /* LV_HAVE_SSE3 */
+
+
+ #ifdef LV_HAVE_SSE
+-#include <xmmintrin.h>
+ #include <volk/volk_sse_intrinsics.h>
++#include <xmmintrin.h>
+
+-static inline void
+-volk_32fc_magnitude_32f_u_sse(float* magnitudeVector, const lv_32fc_t* complexVector,
+- unsigned int num_points)
++static inline void volk_32fc_magnitude_32f_u_sse(float* magnitudeVector,
++ const lv_32fc_t* complexVector,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+- const unsigned int quarterPoints = num_points / 4;
++ unsigned int number = 0;
++ const unsigned int quarterPoints = num_points / 4;
+
+- const float* complexVectorPtr = (float*) complexVector;
+- float* magnitudeVectorPtr = magnitudeVector;
++ const float* complexVectorPtr = (float*)complexVector;
++ float* magnitudeVectorPtr = magnitudeVector;
+
+- __m128 cplxValue1, cplxValue2, result;
++ __m128 cplxValue1, cplxValue2, result;
+
+- for(; number < quarterPoints; number++){
+- cplxValue1 = _mm_loadu_ps(complexVectorPtr);
+- complexVectorPtr += 4;
++ for (; number < quarterPoints; number++) {
++ cplxValue1 = _mm_loadu_ps(complexVectorPtr);
++ complexVectorPtr += 4;
+
+- cplxValue2 = _mm_loadu_ps(complexVectorPtr);
+- complexVectorPtr += 4;
++ cplxValue2 = _mm_loadu_ps(complexVectorPtr);
++ complexVectorPtr += 4;
+
+- result = _mm_magnitude_ps(cplxValue1, cplxValue2);
+- _mm_storeu_ps(magnitudeVectorPtr, result);
+- magnitudeVectorPtr += 4;
+- }
++ result = _mm_magnitude_ps(cplxValue1, cplxValue2);
++ _mm_storeu_ps(magnitudeVectorPtr, result);
++ magnitudeVectorPtr += 4;
++ }
+
+- number = quarterPoints * 4;
+- for(; number < num_points; number++){
+- float val1Real = *complexVectorPtr++;
+- float val1Imag = *complexVectorPtr++;
+- *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag));
+- }
++ number = quarterPoints * 4;
++ for (; number < num_points; number++) {
++ float val1Real = *complexVectorPtr++;
++ float val1Imag = *complexVectorPtr++;
++ *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag));
++ }
+ }
+ #endif /* LV_HAVE_SSE */
+
+
+ #ifdef LV_HAVE_GENERIC
+
+-static inline void
+-volk_32fc_magnitude_32f_generic(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points)
++static inline void volk_32fc_magnitude_32f_generic(float* magnitudeVector,
++ const lv_32fc_t* complexVector,
++ unsigned int num_points)
+ {
+- const float* complexVectorPtr = (float*)complexVector;
+- float* magnitudeVectorPtr = magnitudeVector;
+- unsigned int number = 0;
+- for(number = 0; number < num_points; number++){
+- const float real = *complexVectorPtr++;
+- const float imag = *complexVectorPtr++;
+- *magnitudeVectorPtr++ = sqrtf((real*real) + (imag*imag));
+- }
++ const float* complexVectorPtr = (float*)complexVector;
++ float* magnitudeVectorPtr = magnitudeVector;
++ unsigned int number = 0;
++ for (number = 0; number < num_points; number++) {
++ const float real = *complexVectorPtr++;
++ const float imag = *complexVectorPtr++;
++ *magnitudeVectorPtr++ = sqrtf((real * real) + (imag * imag));
++ }
+ }
+ #endif /* LV_HAVE_GENERIC */
+
+
+-
+ #endif /* INCLUDED_volk_32fc_magnitude_32f_u_H */
+ #ifndef INCLUDED_volk_32fc_magnitude_32f_a_H
+ #define INCLUDED_volk_32fc_magnitude_32f_a_H
+
+ #include <inttypes.h>
+-#include <stdio.h>
+ #include <math.h>
++#include <stdio.h>
+
+ #ifdef LV_HAVE_AVX
+ #include <immintrin.h>
+ #include <volk/volk_avx_intrinsics.h>
+
+-static inline void
+-volk_32fc_magnitude_32f_a_avx(float* magnitudeVector, const lv_32fc_t* complexVector,
+- unsigned int num_points)
++static inline void volk_32fc_magnitude_32f_a_avx(float* magnitudeVector,
++ const lv_32fc_t* complexVector,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+- const unsigned int eighthPoints = num_points / 8;
+-
+- const float* complexVectorPtr = (float*) complexVector;
+- float* magnitudeVectorPtr = magnitudeVector;
+-
+- __m256 cplxValue1, cplxValue2, result;
+- for(; number < eighthPoints; number++){
+- cplxValue1 = _mm256_load_ps(complexVectorPtr);
+- complexVectorPtr += 8;
+-
+- cplxValue2 = _mm256_load_ps(complexVectorPtr);
+- complexVectorPtr += 8;
+-
+- result = _mm256_magnitude_ps(cplxValue1, cplxValue2);
+- _mm256_store_ps(magnitudeVectorPtr, result);
+- magnitudeVectorPtr += 8;
+- }
+-
+- number = eighthPoints * 8;
+- for(; number < num_points; number++){
+- float val1Real = *complexVectorPtr++;
+- float val1Imag = *complexVectorPtr++;
+- *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag));
+- }
++ unsigned int number = 0;
++ const unsigned int eighthPoints = num_points / 8;
++
++ const float* complexVectorPtr = (float*)complexVector;
++ float* magnitudeVectorPtr = magnitudeVector;
++
++ __m256 cplxValue1, cplxValue2, result;
++ for (; number < eighthPoints; number++) {
++ cplxValue1 = _mm256_load_ps(complexVectorPtr);
++ complexVectorPtr += 8;
++
++ cplxValue2 = _mm256_load_ps(complexVectorPtr);
++ complexVectorPtr += 8;
++
++ result = _mm256_magnitude_ps(cplxValue1, cplxValue2);
++ _mm256_store_ps(magnitudeVectorPtr, result);
++ magnitudeVectorPtr += 8;
++ }
++
++ number = eighthPoints * 8;
++ for (; number < num_points; number++) {
++ float val1Real = *complexVectorPtr++;
++ float val1Imag = *complexVectorPtr++;
++ *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag));
++ }
+ }
+ #endif /* LV_HAVE_AVX */
+
+@@ -252,89 +252,89 @@ volk_32fc_magnitude_32f_a_avx(float* magnitudeVector, const lv_32fc_t* complexVe
+ #include <pmmintrin.h>
+ #include <volk/volk_sse3_intrinsics.h>
+
+-static inline void
+-volk_32fc_magnitude_32f_a_sse3(float* magnitudeVector, const lv_32fc_t* complexVector,
+- unsigned int num_points)
++static inline void volk_32fc_magnitude_32f_a_sse3(float* magnitudeVector,
++ const lv_32fc_t* complexVector,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+- const unsigned int quarterPoints = num_points / 4;
+-
+- const float* complexVectorPtr = (float*) complexVector;
+- float* magnitudeVectorPtr = magnitudeVector;
+-
+- __m128 cplxValue1, cplxValue2, result;
+- for(; number < quarterPoints; number++){
+- cplxValue1 = _mm_load_ps(complexVectorPtr);
+- complexVectorPtr += 4;
+-
+- cplxValue2 = _mm_load_ps(complexVectorPtr);
+- complexVectorPtr += 4;
+-
+- result = _mm_magnitude_ps_sse3(cplxValue1, cplxValue2);
+- _mm_store_ps(magnitudeVectorPtr, result);
+- magnitudeVectorPtr += 4;
+- }
+-
+- number = quarterPoints * 4;
+- for(; number < num_points; number++){
+- float val1Real = *complexVectorPtr++;
+- float val1Imag = *complexVectorPtr++;
+- *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag));
+- }
++ unsigned int number = 0;
++ const unsigned int quarterPoints = num_points / 4;
++
++ const float* complexVectorPtr = (float*)complexVector;
++ float* magnitudeVectorPtr = magnitudeVector;
++
++ __m128 cplxValue1, cplxValue2, result;
++ for (; number < quarterPoints; number++) {
++ cplxValue1 = _mm_load_ps(complexVectorPtr);
++ complexVectorPtr += 4;
++
++ cplxValue2 = _mm_load_ps(complexVectorPtr);
++ complexVectorPtr += 4;
++
++ result = _mm_magnitude_ps_sse3(cplxValue1, cplxValue2);
++ _mm_store_ps(magnitudeVectorPtr, result);
++ magnitudeVectorPtr += 4;
++ }
++
++ number = quarterPoints * 4;
++ for (; number < num_points; number++) {
++ float val1Real = *complexVectorPtr++;
++ float val1Imag = *complexVectorPtr++;
++ *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag));
++ }
+ }
+ #endif /* LV_HAVE_SSE3 */
+
+ #ifdef LV_HAVE_SSE
+-#include <xmmintrin.h>
+ #include <volk/volk_sse_intrinsics.h>
++#include <xmmintrin.h>
+
+-static inline void
+-volk_32fc_magnitude_32f_a_sse(float* magnitudeVector, const lv_32fc_t* complexVector,
+- unsigned int num_points)
++static inline void volk_32fc_magnitude_32f_a_sse(float* magnitudeVector,
++ const lv_32fc_t* complexVector,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+- const unsigned int quarterPoints = num_points / 4;
+-
+- const float* complexVectorPtr = (float*) complexVector;
+- float* magnitudeVectorPtr = magnitudeVector;
+-
+- __m128 cplxValue1, cplxValue2, result;
+- for(; number < quarterPoints; number++){
+- cplxValue1 = _mm_load_ps(complexVectorPtr);
+- complexVectorPtr += 4;
+-
+- cplxValue2 = _mm_load_ps(complexVectorPtr);
+- complexVectorPtr += 4;
+-
+- result = _mm_magnitude_ps(cplxValue1, cplxValue2);
+- _mm_store_ps(magnitudeVectorPtr, result);
+- magnitudeVectorPtr += 4;
+- }
+-
+- number = quarterPoints * 4;
+- for(; number < num_points; number++){
+- float val1Real = *complexVectorPtr++;
+- float val1Imag = *complexVectorPtr++;
+- *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag));
+- }
++ unsigned int number = 0;
++ const unsigned int quarterPoints = num_points / 4;
++
++ const float* complexVectorPtr = (float*)complexVector;
++ float* magnitudeVectorPtr = magnitudeVector;
++
++ __m128 cplxValue1, cplxValue2, result;
++ for (; number < quarterPoints; number++) {
++ cplxValue1 = _mm_load_ps(complexVectorPtr);
++ complexVectorPtr += 4;
++
++ cplxValue2 = _mm_load_ps(complexVectorPtr);
++ complexVectorPtr += 4;
++
++ result = _mm_magnitude_ps(cplxValue1, cplxValue2);
++ _mm_store_ps(magnitudeVectorPtr, result);
++ magnitudeVectorPtr += 4;
++ }
++
++ number = quarterPoints * 4;
++ for (; number < num_points; number++) {
++ float val1Real = *complexVectorPtr++;
++ float val1Imag = *complexVectorPtr++;
++ *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag));
++ }
+ }
+ #endif /* LV_HAVE_SSE */
+
+
+ #ifdef LV_HAVE_GENERIC
+
+-static inline void
+-volk_32fc_magnitude_32f_a_generic(float* magnitudeVector, const lv_32fc_t* complexVector,
+- unsigned int num_points)
++static inline void volk_32fc_magnitude_32f_a_generic(float* magnitudeVector,
++ const lv_32fc_t* complexVector,
++ unsigned int num_points)
+ {
+- const float* complexVectorPtr = (float*)complexVector;
+- float* magnitudeVectorPtr = magnitudeVector;
+- unsigned int number = 0;
+- for(number = 0; number < num_points; number++){
+- const float real = *complexVectorPtr++;
+- const float imag = *complexVectorPtr++;
+- *magnitudeVectorPtr++ = sqrtf((real*real) + (imag*imag));
+- }
++ const float* complexVectorPtr = (float*)complexVector;
++ float* magnitudeVectorPtr = magnitudeVector;
++ unsigned int number = 0;
++ for (number = 0; number < num_points; number++) {
++ const float real = *complexVectorPtr++;
++ const float imag = *complexVectorPtr++;
++ *magnitudeVectorPtr++ = sqrtf((real * real) + (imag * imag));
++ }
+ }
+ #endif /* LV_HAVE_GENERIC */
+
+@@ -342,41 +342,43 @@ volk_32fc_magnitude_32f_a_generic(float* magnitudeVector, const lv_32fc_t* compl
+ #ifdef LV_HAVE_NEON
+ #include <arm_neon.h>
+
+-static inline void
+-volk_32fc_magnitude_32f_neon(float* magnitudeVector, const lv_32fc_t* complexVector,
+- unsigned int num_points)
++static inline void volk_32fc_magnitude_32f_neon(float* magnitudeVector,
++ const lv_32fc_t* complexVector,
++ unsigned int num_points)
+ {
+- unsigned int number;
+- unsigned int quarter_points = num_points / 4;
+- const float* complexVectorPtr = (float*)complexVector;
+- float* magnitudeVectorPtr = magnitudeVector;
+-
+- float32x4x2_t complex_vec;
+- float32x4_t magnitude_vec;
+- for(number = 0; number < quarter_points; number++){
+- complex_vec = vld2q_f32(complexVectorPtr);
+- complex_vec.val[0] = vmulq_f32(complex_vec.val[0], complex_vec.val[0]);
+- magnitude_vec = vmlaq_f32(complex_vec.val[0], complex_vec.val[1], complex_vec.val[1]);
+- magnitude_vec = vrsqrteq_f32(magnitude_vec);
+- magnitude_vec = vrecpeq_f32( magnitude_vec ); // no plain ol' sqrt
+- vst1q_f32(magnitudeVectorPtr, magnitude_vec);
+-
+- complexVectorPtr += 8;
+- magnitudeVectorPtr += 4;
+- }
+-
+- for(number = quarter_points*4; number < num_points; number++){
+- const float real = *complexVectorPtr++;
+- const float imag = *complexVectorPtr++;
+- *magnitudeVectorPtr++ = sqrtf((real*real) + (imag*imag));
+- }
++ unsigned int number;
++ unsigned int quarter_points = num_points / 4;
++ const float* complexVectorPtr = (float*)complexVector;
++ float* magnitudeVectorPtr = magnitudeVector;
++
++ float32x4x2_t complex_vec;
++ float32x4_t magnitude_vec;
++ for (number = 0; number < quarter_points; number++) {
++ complex_vec = vld2q_f32(complexVectorPtr);
++ complex_vec.val[0] = vmulq_f32(complex_vec.val[0], complex_vec.val[0]);
++ magnitude_vec =
++ vmlaq_f32(complex_vec.val[0], complex_vec.val[1], complex_vec.val[1]);
++ magnitude_vec = vrsqrteq_f32(magnitude_vec);
++ magnitude_vec = vrecpeq_f32(magnitude_vec); // no plain ol' sqrt
++ vst1q_f32(magnitudeVectorPtr, magnitude_vec);
++
++ complexVectorPtr += 8;
++ magnitudeVectorPtr += 4;
++ }
++
++ for (number = quarter_points * 4; number < num_points; number++) {
++ const float real = *complexVectorPtr++;
++ const float imag = *complexVectorPtr++;
++ *magnitudeVectorPtr++ = sqrtf((real * real) + (imag * imag));
++ }
+ }
+ #endif /* LV_HAVE_NEON */
+
+
+ #ifdef LV_HAVE_NEON
+ /*!
+- \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector
++ \brief Calculates the magnitude of the complexVector and stores the results in the
++ magnitudeVector
+
+ This is an approximation from "Streamlining Digital Signal Processing" by
+ Richard Lyons. Apparently max error is about 1% and mean error is about 0.6%.
+@@ -387,80 +389,80 @@ volk_32fc_magnitude_32f_neon(float* magnitudeVector, const lv_32fc_t* complexVec
+
+ \param complexVector The vector containing the complex input values
+ \param magnitudeVector The vector containing the real output values
+- \param num_points The number of complex values in complexVector to be calculated and stored into cVector
++ \param num_points The number of complex values in complexVector to be calculated and
++ stored into cVector
+ */
+-static inline void
+-volk_32fc_magnitude_32f_neon_fancy_sweet(float* magnitudeVector, const lv_32fc_t* complexVector,
+- unsigned int num_points)
++static inline void volk_32fc_magnitude_32f_neon_fancy_sweet(
++ float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points)
+ {
+- unsigned int number;
+- unsigned int quarter_points = num_points / 4;
+- const float* complexVectorPtr = (float*)complexVector;
+- float* magnitudeVectorPtr = magnitudeVector;
+-
+- const float threshold = 0.4142135;
+-
+- float32x4_t a_vec, b_vec, a_high, a_low, b_high, b_low;
+- a_high = vdupq_n_f32( 0.84 );
+- b_high = vdupq_n_f32( 0.561);
+- a_low = vdupq_n_f32( 0.99 );
+- b_low = vdupq_n_f32( 0.197);
+-
+- uint32x4_t comp0, comp1;
+-
+- float32x4x2_t complex_vec;
+- float32x4_t min_vec, max_vec, magnitude_vec;
+- float32x4_t real_abs, imag_abs;
+- for(number = 0; number < quarter_points; number++){
+- complex_vec = vld2q_f32(complexVectorPtr);
+-
+- real_abs = vabsq_f32(complex_vec.val[0]);
+- imag_abs = vabsq_f32(complex_vec.val[1]);
+-
+- min_vec = vminq_f32(real_abs, imag_abs);
+- max_vec = vmaxq_f32(real_abs, imag_abs);
+-
+- // effective branch to choose coefficient pair.
+- comp0 = vcgtq_f32(min_vec, vmulq_n_f32(max_vec, threshold));
+- comp1 = vcleq_f32(min_vec, vmulq_n_f32(max_vec, threshold));
+-
+- // and 0s or 1s with coefficients from previous effective branch
+- a_vec = (float32x4_t)vaddq_s32(vandq_s32((int32x4_t)comp0, (int32x4_t)a_high),
+- vandq_s32((int32x4_t)comp1, (int32x4_t)a_low));
+- b_vec = (float32x4_t)vaddq_s32(vandq_s32((int32x4_t)comp0, (int32x4_t)b_high),
+- vandq_s32((int32x4_t)comp1, (int32x4_t)b_low));
+-
+- // coefficients chosen, do the weighted sum
+- min_vec = vmulq_f32(min_vec, b_vec);
+- max_vec = vmulq_f32(max_vec, a_vec);
+-
+- magnitude_vec = vaddq_f32(min_vec, max_vec);
+- vst1q_f32(magnitudeVectorPtr, magnitude_vec);
+-
+- complexVectorPtr += 8;
+- magnitudeVectorPtr += 4;
+- }
+-
+- for(number = quarter_points*4; number < num_points; number++){
+- const float real = *complexVectorPtr++;
+- const float imag = *complexVectorPtr++;
+- *magnitudeVectorPtr++ = sqrtf((real*real) + (imag*imag));
+- }
++ unsigned int number;
++ unsigned int quarter_points = num_points / 4;
++ const float* complexVectorPtr = (float*)complexVector;
++ float* magnitudeVectorPtr = magnitudeVector;
++
++ const float threshold = 0.4142135;
++
++ float32x4_t a_vec, b_vec, a_high, a_low, b_high, b_low;
++ a_high = vdupq_n_f32(0.84);
++ b_high = vdupq_n_f32(0.561);
++ a_low = vdupq_n_f32(0.99);
++ b_low = vdupq_n_f32(0.197);
++
++ uint32x4_t comp0, comp1;
++
++ float32x4x2_t complex_vec;
++ float32x4_t min_vec, max_vec, magnitude_vec;
++ float32x4_t real_abs, imag_abs;
++ for (number = 0; number < quarter_points; number++) {
++ complex_vec = vld2q_f32(complexVectorPtr);
++
++ real_abs = vabsq_f32(complex_vec.val[0]);
++ imag_abs = vabsq_f32(complex_vec.val[1]);
++
++ min_vec = vminq_f32(real_abs, imag_abs);
++ max_vec = vmaxq_f32(real_abs, imag_abs);
++
++ // effective branch to choose coefficient pair.
++ comp0 = vcgtq_f32(min_vec, vmulq_n_f32(max_vec, threshold));
++ comp1 = vcleq_f32(min_vec, vmulq_n_f32(max_vec, threshold));
++
++ // and 0s or 1s with coefficients from previous effective branch
++ a_vec = (float32x4_t)vaddq_s32(vandq_s32((int32x4_t)comp0, (int32x4_t)a_high),
++ vandq_s32((int32x4_t)comp1, (int32x4_t)a_low));
++ b_vec = (float32x4_t)vaddq_s32(vandq_s32((int32x4_t)comp0, (int32x4_t)b_high),
++ vandq_s32((int32x4_t)comp1, (int32x4_t)b_low));
++
++ // coefficients chosen, do the weighted sum
++ min_vec = vmulq_f32(min_vec, b_vec);
++ max_vec = vmulq_f32(max_vec, a_vec);
++
++ magnitude_vec = vaddq_f32(min_vec, max_vec);
++ vst1q_f32(magnitudeVectorPtr, magnitude_vec);
++
++ complexVectorPtr += 8;
++ magnitudeVectorPtr += 4;
++ }
++
++ for (number = quarter_points * 4; number < num_points; number++) {
++ const float real = *complexVectorPtr++;
++ const float imag = *complexVectorPtr++;
++ *magnitudeVectorPtr++ = sqrtf((real * real) + (imag * imag));
++ }
+ }
+ #endif /* LV_HAVE_NEON */
+
+
+ #ifdef LV_HAVE_ORC
+
+-extern void
+-volk_32fc_magnitude_32f_a_orc_impl(float* magnitudeVector, const lv_32fc_t* complexVector,
+- unsigned int num_points);
++extern void volk_32fc_magnitude_32f_a_orc_impl(float* magnitudeVector,
++ const lv_32fc_t* complexVector,
++ unsigned int num_points);
+
+-static inline void
+-volk_32fc_magnitude_32f_u_orc(float* magnitudeVector, const lv_32fc_t* complexVector,
+- unsigned int num_points)
++static inline void volk_32fc_magnitude_32f_u_orc(float* magnitudeVector,
++ const lv_32fc_t* complexVector,
++ unsigned int num_points)
+ {
+- volk_32fc_magnitude_32f_a_orc_impl(magnitudeVector, complexVector, num_points);
++ volk_32fc_magnitude_32f_a_orc_impl(magnitudeVector, complexVector, num_points);
+ }
+ #endif /* LV_HAVE_ORC */
+
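
The neon_fancy_sweet kernel above implements the two-segment alpha*max + beta*min magnitude approximation described in its comment block (Lyons, maximum error about 1%). A scalar sketch using the same constants as the NEON code (illustrative only, hypothetical function name):

#include <math.h>
#include <stdio.h>

/* Two-segment alpha*max + beta*min approximation of sqrt(re^2 + im^2):
 * if min > max * tan(22.5 deg) use (0.84, 0.561), otherwise (0.99, 0.197). */
static float approx_magnitude(float re, float im)
{
    const float threshold = 0.4142135f; /* tan(pi/8) */
    const float re_abs = fabsf(re);
    const float im_abs = fabsf(im);
    const float mn = fminf(re_abs, im_abs);
    const float mx = fmaxf(re_abs, im_abs);
    if (mn > mx * threshold)
        return 0.84f * mx + 0.561f * mn;
    return 0.99f * mx + 0.197f * mn;
}

int main(void)
{
    /* 3-4-5 triangle: the approximation is within about 1% of the exact value 5. */
    printf("approx = %f, exact = %f\n", approx_magnitude(3.0f, 4.0f), 5.0f);
    return 0;
}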
+diff --git a/kernels/volk/volk_32fc_magnitude_squared_32f.h b/kernels/volk/volk_32fc_magnitude_squared_32f.h
+index 51bb4df..cb093ca 100644
+--- a/kernels/volk/volk_32fc_magnitude_squared_32f.h
++++ b/kernels/volk/volk_32fc_magnitude_squared_32f.h
+@@ -30,8 +30,8 @@
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+- * void volk_32fc_magnitude_squared_32f(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points)
+- * \endcode
++ * void volk_32fc_magnitude_squared_32f(float* magnitudeVector, const lv_32fc_t*
++ * complexVector, unsigned int num_points) \endcode
+ *
+ * \b Inputs
+ * \li complexVector: The complex input vector.
+@@ -72,41 +72,41 @@
+ #define INCLUDED_volk_32fc_magnitude_squared_32f_u_H
+
+ #include <inttypes.h>
+-#include <stdio.h>
+ #include <math.h>
++#include <stdio.h>
+
+ #ifdef LV_HAVE_AVX
+ #include <immintrin.h>
+ #include <volk/volk_avx_intrinsics.h>
+
+-static inline void
+-volk_32fc_magnitude_squared_32f_u_avx(float* magnitudeVector, const lv_32fc_t* complexVector,
+- unsigned int num_points)
++static inline void volk_32fc_magnitude_squared_32f_u_avx(float* magnitudeVector,
++ const lv_32fc_t* complexVector,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+- const unsigned int eighthPoints = num_points / 8;
+-
+- const float* complexVectorPtr = (float*) complexVector;
+- float* magnitudeVectorPtr = magnitudeVector;
+-
+- __m256 cplxValue1, cplxValue2, result;
+-
+- for(; number < eighthPoints; number++){
+- cplxValue1 = _mm256_loadu_ps(complexVectorPtr);
+- cplxValue2 = _mm256_loadu_ps(complexVectorPtr + 8);
+- result = _mm256_magnitudesquared_ps(cplxValue1, cplxValue2);
+- _mm256_storeu_ps(magnitudeVectorPtr, result);
+-
+- complexVectorPtr += 16;
+- magnitudeVectorPtr += 8;
+- }
+-
+- number = eighthPoints * 8;
+- for(; number < num_points; number++){
+- float val1Real = *complexVectorPtr++;
+- float val1Imag = *complexVectorPtr++;
+- *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag);
+- }
++ unsigned int number = 0;
++ const unsigned int eighthPoints = num_points / 8;
++
++ const float* complexVectorPtr = (float*)complexVector;
++ float* magnitudeVectorPtr = magnitudeVector;
++
++ __m256 cplxValue1, cplxValue2, result;
++
++ for (; number < eighthPoints; number++) {
++ cplxValue1 = _mm256_loadu_ps(complexVectorPtr);
++ cplxValue2 = _mm256_loadu_ps(complexVectorPtr + 8);
++ result = _mm256_magnitudesquared_ps(cplxValue1, cplxValue2);
++ _mm256_storeu_ps(magnitudeVectorPtr, result);
++
++ complexVectorPtr += 16;
++ magnitudeVectorPtr += 8;
++ }
++
++ number = eighthPoints * 8;
++ for (; number < num_points; number++) {
++ float val1Real = *complexVectorPtr++;
++ float val1Imag = *complexVectorPtr++;
++ *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag);
++ }
+ }
+ #endif /* LV_HAVE_AVX */
+
+@@ -115,137 +115,136 @@ volk_32fc_magnitude_squared_32f_u_avx(float* magnitudeVector, const lv_32fc_t* c
+ #include <pmmintrin.h>
+ #include <volk/volk_sse3_intrinsics.h>
+
+-static inline void
+-volk_32fc_magnitude_squared_32f_u_sse3(float* magnitudeVector, const lv_32fc_t* complexVector,
+- unsigned int num_points)
++static inline void volk_32fc_magnitude_squared_32f_u_sse3(float* magnitudeVector,
++ const lv_32fc_t* complexVector,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+- const unsigned int quarterPoints = num_points / 4;
+-
+- const float* complexVectorPtr = (float*) complexVector;
+- float* magnitudeVectorPtr = magnitudeVector;
+-
+- __m128 cplxValue1, cplxValue2, result;
+- for(; number < quarterPoints; number++){
+- cplxValue1 = _mm_loadu_ps(complexVectorPtr);
+- complexVectorPtr += 4;
+-
+- cplxValue2 = _mm_loadu_ps(complexVectorPtr);
+- complexVectorPtr += 4;
+-
+- result = _mm_magnitudesquared_ps_sse3(cplxValue1, cplxValue2);
+- _mm_storeu_ps(magnitudeVectorPtr, result);
+- magnitudeVectorPtr += 4;
+- }
+-
+- number = quarterPoints * 4;
+- for(; number < num_points; number++){
+- float val1Real = *complexVectorPtr++;
+- float val1Imag = *complexVectorPtr++;
+- *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag);
+- }
++ unsigned int number = 0;
++ const unsigned int quarterPoints = num_points / 4;
++
++ const float* complexVectorPtr = (float*)complexVector;
++ float* magnitudeVectorPtr = magnitudeVector;
++
++ __m128 cplxValue1, cplxValue2, result;
++ for (; number < quarterPoints; number++) {
++ cplxValue1 = _mm_loadu_ps(complexVectorPtr);
++ complexVectorPtr += 4;
++
++ cplxValue2 = _mm_loadu_ps(complexVectorPtr);
++ complexVectorPtr += 4;
++
++ result = _mm_magnitudesquared_ps_sse3(cplxValue1, cplxValue2);
++ _mm_storeu_ps(magnitudeVectorPtr, result);
++ magnitudeVectorPtr += 4;
++ }
++
++ number = quarterPoints * 4;
++ for (; number < num_points; number++) {
++ float val1Real = *complexVectorPtr++;
++ float val1Imag = *complexVectorPtr++;
++ *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag);
++ }
+ }
+ #endif /* LV_HAVE_SSE3 */
+
+
+ #ifdef LV_HAVE_SSE
+-#include <xmmintrin.h>
+ #include <volk/volk_sse_intrinsics.h>
++#include <xmmintrin.h>
+
+-static inline void
+-volk_32fc_magnitude_squared_32f_u_sse(float* magnitudeVector, const lv_32fc_t* complexVector,
+- unsigned int num_points)
++static inline void volk_32fc_magnitude_squared_32f_u_sse(float* magnitudeVector,
++ const lv_32fc_t* complexVector,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+- const unsigned int quarterPoints = num_points / 4;
++ unsigned int number = 0;
++ const unsigned int quarterPoints = num_points / 4;
+
+- const float* complexVectorPtr = (float*) complexVector;
+- float* magnitudeVectorPtr = magnitudeVector;
++ const float* complexVectorPtr = (float*)complexVector;
++ float* magnitudeVectorPtr = magnitudeVector;
+
+- __m128 cplxValue1, cplxValue2, result;
++ __m128 cplxValue1, cplxValue2, result;
+
+- for(; number < quarterPoints; number++){
+- cplxValue1 = _mm_loadu_ps(complexVectorPtr);
+- complexVectorPtr += 4;
++ for (; number < quarterPoints; number++) {
++ cplxValue1 = _mm_loadu_ps(complexVectorPtr);
++ complexVectorPtr += 4;
+
+- cplxValue2 = _mm_loadu_ps(complexVectorPtr);
+- complexVectorPtr += 4;
++ cplxValue2 = _mm_loadu_ps(complexVectorPtr);
++ complexVectorPtr += 4;
+
+- result = _mm_magnitudesquared_ps(cplxValue1, cplxValue2);
+- _mm_storeu_ps(magnitudeVectorPtr, result);
+- magnitudeVectorPtr += 4;
+- }
++ result = _mm_magnitudesquared_ps(cplxValue1, cplxValue2);
++ _mm_storeu_ps(magnitudeVectorPtr, result);
++ magnitudeVectorPtr += 4;
++ }
+
+- number = quarterPoints * 4;
+- for(; number < num_points; number++){
+- float val1Real = *complexVectorPtr++;
+- float val1Imag = *complexVectorPtr++;
+- *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag);
+- }
++ number = quarterPoints * 4;
++ for (; number < num_points; number++) {
++ float val1Real = *complexVectorPtr++;
++ float val1Imag = *complexVectorPtr++;
++ *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag);
++ }
+ }
+ #endif /* LV_HAVE_SSE */
+
+
+ #ifdef LV_HAVE_GENERIC
+
+-static inline void
+-volk_32fc_magnitude_squared_32f_generic(float* magnitudeVector, const lv_32fc_t* complexVector,
+- unsigned int num_points)
++static inline void volk_32fc_magnitude_squared_32f_generic(float* magnitudeVector,
++ const lv_32fc_t* complexVector,
++ unsigned int num_points)
+ {
+- const float* complexVectorPtr = (float*)complexVector;
+- float* magnitudeVectorPtr = magnitudeVector;
+- unsigned int number = 0;
+- for(number = 0; number < num_points; number++){
+- const float real = *complexVectorPtr++;
+- const float imag = *complexVectorPtr++;
+- *magnitudeVectorPtr++ = (real*real) + (imag*imag);
+- }
++ const float* complexVectorPtr = (float*)complexVector;
++ float* magnitudeVectorPtr = magnitudeVector;
++ unsigned int number = 0;
++ for (number = 0; number < num_points; number++) {
++ const float real = *complexVectorPtr++;
++ const float imag = *complexVectorPtr++;
++ *magnitudeVectorPtr++ = (real * real) + (imag * imag);
++ }
+ }
+ #endif /* LV_HAVE_GENERIC */
+
+
+-
+ #endif /* INCLUDED_volk_32fc_magnitude_32f_u_H */
+ #ifndef INCLUDED_volk_32fc_magnitude_squared_32f_a_H
+ #define INCLUDED_volk_32fc_magnitude_squared_32f_a_H
+
+ #include <inttypes.h>
+-#include <stdio.h>
+ #include <math.h>
++#include <stdio.h>
+
+ #ifdef LV_HAVE_AVX
+ #include <immintrin.h>
+ #include <volk/volk_avx_intrinsics.h>
+
+-static inline void
+-volk_32fc_magnitude_squared_32f_a_avx(float* magnitudeVector, const lv_32fc_t* complexVector,
+- unsigned int num_points)
++static inline void volk_32fc_magnitude_squared_32f_a_avx(float* magnitudeVector,
++ const lv_32fc_t* complexVector,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+- const unsigned int eighthPoints = num_points / 8;
+-
+- const float* complexVectorPtr = (float*) complexVector;
+- float* magnitudeVectorPtr = magnitudeVector;
+-
+- __m256 cplxValue1, cplxValue2, result;
+- for(; number < eighthPoints; number++){
+- cplxValue1 = _mm256_load_ps(complexVectorPtr);
+- complexVectorPtr += 8;
+-
+- cplxValue2 = _mm256_load_ps(complexVectorPtr);
+- complexVectorPtr += 8;
+-
+- result = _mm256_magnitudesquared_ps(cplxValue1, cplxValue2);
+- _mm256_store_ps(magnitudeVectorPtr, result);
+- magnitudeVectorPtr += 8;
+- }
+-
+- number = eighthPoints * 8;
+- for(; number < num_points; number++){
+- float val1Real = *complexVectorPtr++;
+- float val1Imag = *complexVectorPtr++;
+- *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag);
+- }
++ unsigned int number = 0;
++ const unsigned int eighthPoints = num_points / 8;
++
++ const float* complexVectorPtr = (float*)complexVector;
++ float* magnitudeVectorPtr = magnitudeVector;
++
++ __m256 cplxValue1, cplxValue2, result;
++ for (; number < eighthPoints; number++) {
++ cplxValue1 = _mm256_load_ps(complexVectorPtr);
++ complexVectorPtr += 8;
++
++ cplxValue2 = _mm256_load_ps(complexVectorPtr);
++ complexVectorPtr += 8;
++
++ result = _mm256_magnitudesquared_ps(cplxValue1, cplxValue2);
++ _mm256_store_ps(magnitudeVectorPtr, result);
++ magnitudeVectorPtr += 8;
++ }
++
++ number = eighthPoints * 8;
++ for (; number < num_points; number++) {
++ float val1Real = *complexVectorPtr++;
++ float val1Imag = *complexVectorPtr++;
++ *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag);
++ }
+ }
+ #endif /* LV_HAVE_AVX */
+
+@@ -254,72 +253,72 @@ volk_32fc_magnitude_squared_32f_a_avx(float* magnitudeVector, const lv_32fc_t* c
+ #include <pmmintrin.h>
+ #include <volk/volk_sse3_intrinsics.h>
+
+-static inline void
+-volk_32fc_magnitude_squared_32f_a_sse3(float* magnitudeVector, const lv_32fc_t* complexVector,
+- unsigned int num_points)
++static inline void volk_32fc_magnitude_squared_32f_a_sse3(float* magnitudeVector,
++ const lv_32fc_t* complexVector,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+- const unsigned int quarterPoints = num_points / 4;
+-
+- const float* complexVectorPtr = (float*) complexVector;
+- float* magnitudeVectorPtr = magnitudeVector;
+-
+- __m128 cplxValue1, cplxValue2, result;
+- for(; number < quarterPoints; number++){
+- cplxValue1 = _mm_load_ps(complexVectorPtr);
+- complexVectorPtr += 4;
+-
+- cplxValue2 = _mm_load_ps(complexVectorPtr);
+- complexVectorPtr += 4;
+-
+- result = _mm_magnitudesquared_ps_sse3(cplxValue1, cplxValue2);
+- _mm_store_ps(magnitudeVectorPtr, result);
+- magnitudeVectorPtr += 4;
+- }
+-
+- number = quarterPoints * 4;
+- for(; number < num_points; number++){
+- float val1Real = *complexVectorPtr++;
+- float val1Imag = *complexVectorPtr++;
+- *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag);
+- }
++ unsigned int number = 0;
++ const unsigned int quarterPoints = num_points / 4;
++
++ const float* complexVectorPtr = (float*)complexVector;
++ float* magnitudeVectorPtr = magnitudeVector;
++
++ __m128 cplxValue1, cplxValue2, result;
++ for (; number < quarterPoints; number++) {
++ cplxValue1 = _mm_load_ps(complexVectorPtr);
++ complexVectorPtr += 4;
++
++ cplxValue2 = _mm_load_ps(complexVectorPtr);
++ complexVectorPtr += 4;
++
++ result = _mm_magnitudesquared_ps_sse3(cplxValue1, cplxValue2);
++ _mm_store_ps(magnitudeVectorPtr, result);
++ magnitudeVectorPtr += 4;
++ }
++
++ number = quarterPoints * 4;
++ for (; number < num_points; number++) {
++ float val1Real = *complexVectorPtr++;
++ float val1Imag = *complexVectorPtr++;
++ *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag);
++ }
+ }
+ #endif /* LV_HAVE_SSE3 */
+
+
+ #ifdef LV_HAVE_SSE
+-#include <xmmintrin.h>
+ #include <volk/volk_sse_intrinsics.h>
++#include <xmmintrin.h>
+
+-static inline void
+-volk_32fc_magnitude_squared_32f_a_sse(float* magnitudeVector, const lv_32fc_t* complexVector,
+- unsigned int num_points)
++static inline void volk_32fc_magnitude_squared_32f_a_sse(float* magnitudeVector,
++ const lv_32fc_t* complexVector,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+- const unsigned int quarterPoints = num_points / 4;
+-
+- const float* complexVectorPtr = (float*)complexVector;
+- float* magnitudeVectorPtr = magnitudeVector;
+-
+- __m128 cplxValue1, cplxValue2, result;
+- for(;number < quarterPoints; number++){
+- cplxValue1 = _mm_load_ps(complexVectorPtr);
+- complexVectorPtr += 4;
+-
+- cplxValue2 = _mm_load_ps(complexVectorPtr);
+- complexVectorPtr += 4;
+-
+- result = _mm_magnitudesquared_ps(cplxValue1, cplxValue2);
+- _mm_store_ps(magnitudeVectorPtr, result);
+- magnitudeVectorPtr += 4;
+- }
+-
+- number = quarterPoints * 4;
+- for(; number < num_points; number++){
+- float val1Real = *complexVectorPtr++;
+- float val1Imag = *complexVectorPtr++;
+- *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag);
+- }
++ unsigned int number = 0;
++ const unsigned int quarterPoints = num_points / 4;
++
++ const float* complexVectorPtr = (float*)complexVector;
++ float* magnitudeVectorPtr = magnitudeVector;
++
++ __m128 cplxValue1, cplxValue2, result;
++ for (; number < quarterPoints; number++) {
++ cplxValue1 = _mm_load_ps(complexVectorPtr);
++ complexVectorPtr += 4;
++
++ cplxValue2 = _mm_load_ps(complexVectorPtr);
++ complexVectorPtr += 4;
++
++ result = _mm_magnitudesquared_ps(cplxValue1, cplxValue2);
++ _mm_store_ps(magnitudeVectorPtr, result);
++ magnitudeVectorPtr += 4;
++ }
++
++ number = quarterPoints * 4;
++ for (; number < num_points; number++) {
++ float val1Real = *complexVectorPtr++;
++ float val1Imag = *complexVectorPtr++;
++ *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag);
++ }
+ }
+ #endif /* LV_HAVE_SSE */
+
+@@ -327,55 +326,57 @@ volk_32fc_magnitude_squared_32f_a_sse(float* magnitudeVector, const lv_32fc_t* c
+ #ifdef LV_HAVE_NEON
+ #include <arm_neon.h>
+
+-static inline void
+-volk_32fc_magnitude_squared_32f_neon(float* magnitudeVector, const lv_32fc_t* complexVector,
+- unsigned int num_points)
++static inline void volk_32fc_magnitude_squared_32f_neon(float* magnitudeVector,
++ const lv_32fc_t* complexVector,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+- const unsigned int quarterPoints = num_points / 4;
+-
+- const float* complexVectorPtr = (float*)complexVector;
+- float* magnitudeVectorPtr = magnitudeVector;
+-
+- float32x4x2_t cmplx_val;
+- float32x4_t result;
+- for(;number < quarterPoints; number++){
+- cmplx_val = vld2q_f32(complexVectorPtr);
+- complexVectorPtr += 8;
+-
+- cmplx_val.val[0] = vmulq_f32(cmplx_val.val[0], cmplx_val.val[0]); // Square the values
+- cmplx_val.val[1] = vmulq_f32(cmplx_val.val[1], cmplx_val.val[1]); // Square the values
+-
+- result = vaddq_f32(cmplx_val.val[0], cmplx_val.val[1]); // Add the I2 and Q2 values
+-
+- vst1q_f32(magnitudeVectorPtr, result);
+- magnitudeVectorPtr += 4;
+- }
+-
+- number = quarterPoints * 4;
+- for(; number < num_points; number++){
+- float val1Real = *complexVectorPtr++;
+- float val1Imag = *complexVectorPtr++;
+- *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag);
+- }
++ unsigned int number = 0;
++ const unsigned int quarterPoints = num_points / 4;
++
++ const float* complexVectorPtr = (float*)complexVector;
++ float* magnitudeVectorPtr = magnitudeVector;
++
++ float32x4x2_t cmplx_val;
++ float32x4_t result;
++ for (; number < quarterPoints; number++) {
++ cmplx_val = vld2q_f32(complexVectorPtr);
++ complexVectorPtr += 8;
++
++ cmplx_val.val[0] =
++ vmulq_f32(cmplx_val.val[0], cmplx_val.val[0]); // Square the values
++ cmplx_val.val[1] =
++ vmulq_f32(cmplx_val.val[1], cmplx_val.val[1]); // Square the values
++
++ result =
++ vaddq_f32(cmplx_val.val[0], cmplx_val.val[1]); // Add the I2 and Q2 values
++
++ vst1q_f32(magnitudeVectorPtr, result);
++ magnitudeVectorPtr += 4;
++ }
++
++ number = quarterPoints * 4;
++ for (; number < num_points; number++) {
++ float val1Real = *complexVectorPtr++;
++ float val1Imag = *complexVectorPtr++;
++ *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag);
++ }
+ }
+ #endif /* LV_HAVE_NEON */
+
+
+ #ifdef LV_HAVE_GENERIC
+
+-static inline void
+-volk_32fc_magnitude_squared_32f_a_generic(float* magnitudeVector, const lv_32fc_t* complexVector,
+- unsigned int num_points)
++static inline void volk_32fc_magnitude_squared_32f_a_generic(
++ float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points)
+ {
+- const float* complexVectorPtr = (float*)complexVector;
+- float* magnitudeVectorPtr = magnitudeVector;
+- unsigned int number = 0;
+- for(number = 0; number < num_points; number++){
+- const float real = *complexVectorPtr++;
+- const float imag = *complexVectorPtr++;
+- *magnitudeVectorPtr++ = (real*real) + (imag*imag);
+- }
++ const float* complexVectorPtr = (float*)complexVector;
++ float* magnitudeVectorPtr = magnitudeVector;
++ unsigned int number = 0;
++ for (number = 0; number < num_points; number++) {
++ const float real = *complexVectorPtr++;
++ const float imag = *complexVectorPtr++;
++ *magnitudeVectorPtr++ = (real * real) + (imag * imag);
++ }
+ }
+ #endif /* LV_HAVE_GENERIC */
+
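
Every variant in this kernel shares the same scalar tail, |z|^2 = re*re + im*im with no square root, which is why magnitude_squared is the cheaper choice when only relative comparisons are needed. A minimal dispatcher-call sketch (buffer size and values are made up; volk_malloc, volk_get_alignment and lv_cmake as declared in the VOLK headers):

#include <stdio.h>
#include <volk/volk.h>

int main(void)
{
    const unsigned int num_points = 8;
    const size_t alignment = volk_get_alignment();
    lv_32fc_t* in = (lv_32fc_t*)volk_malloc(num_points * sizeof(lv_32fc_t), alignment);
    float* out = (float*)volk_malloc(num_points * sizeof(float), alignment);

    for (unsigned int i = 0; i < num_points; ++i)
        in[i] = lv_cmake((float)i, (float)-i);

    /* The dispatcher selects a suitable implementation at runtime. */
    volk_32fc_magnitude_squared_32f(out, in, num_points);

    for (unsigned int i = 0; i < num_points; ++i)
        printf("|z[%u]|^2 = %f\n", i, out[i]);

    volk_free(in);
    volk_free(out);
    return 0;
}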
+diff --git a/kernels/volk/volk_32fc_s32f_atan2_32f.h b/kernels/volk/volk_32fc_s32f_atan2_32f.h
+index c169336..f08f793 100644
+--- a/kernels/volk/volk_32fc_s32f_atan2_32f.h
++++ b/kernels/volk/volk_32fc_s32f_atan2_32f.h
+@@ -30,13 +30,13 @@
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+- * void volk_32fc_s32f_atan2_32f(float* outputVector, const lv_32fc_t* complexVector, const float normalizeFactor, unsigned int num_points)
+- * \endcode
++ * void volk_32fc_s32f_atan2_32f(float* outputVector, const lv_32fc_t* complexVector,
++ * const float normalizeFactor, unsigned int num_points) \endcode
+ *
+ * \b Inputs
+- * \li inputVector: The byte-aligned input vector containing interleaved IQ data (I = cos, Q = sin).
+- * \li normalizeFactor: The atan results are divided by this normalization factor.
+- * \li num_points: The number of complex values in \p inputVector.
++ * \li inputVector: The byte-aligned input vector containing interleaved IQ data (I = cos,
++ * Q = sin). \li normalizeFactor: The atan results are divided by this normalization
++ * factor. \li num_points: The number of complex values in \p inputVector.
+ *
+ * \b Outputs
+ * \li outputVector: The vector where the results will be stored.
+@@ -75,8 +75,8 @@
+ #define INCLUDED_volk_32fc_s32f_atan2_32f_a_H
+
+ #include <inttypes.h>
+-#include <stdio.h>
+ #include <math.h>
++#include <stdio.h>
+
+ #ifdef LV_HAVE_SSE4_1
+ #include <smmintrin.h>
+@@ -85,50 +85,54 @@
+ #include <simdmath.h>
+ #endif /* LV_HAVE_LIB_SIMDMATH */
+
+-static inline void volk_32fc_s32f_atan2_32f_a_sse4_1(float* outputVector, const lv_32fc_t* complexVector, const float normalizeFactor, unsigned int num_points){
+- const float* complexVectorPtr = (float*)complexVector;
+- float* outPtr = outputVector;
++static inline void volk_32fc_s32f_atan2_32f_a_sse4_1(float* outputVector,
++ const lv_32fc_t* complexVector,
++ const float normalizeFactor,
++ unsigned int num_points)
++{
++ const float* complexVectorPtr = (float*)complexVector;
++ float* outPtr = outputVector;
+
+- unsigned int number = 0;
+- const float invNormalizeFactor = 1.0 / normalizeFactor;
++ unsigned int number = 0;
++ const float invNormalizeFactor = 1.0 / normalizeFactor;
+
+ #ifdef LV_HAVE_LIB_SIMDMATH
+- const unsigned int quarterPoints = num_points / 4;
+- __m128 testVector = _mm_set_ps1(2*M_PI);
+- __m128 correctVector = _mm_set_ps1(M_PI);
+- __m128 vNormalizeFactor = _mm_set_ps1(invNormalizeFactor);
+- __m128 phase;
+- __m128 complex1, complex2, iValue, qValue;
+- __m128 keepMask;
+-
+- for (; number < quarterPoints; number++) {
+- // Load IQ data:
+- complex1 = _mm_load_ps(complexVectorPtr);
+- complexVectorPtr += 4;
+- complex2 = _mm_load_ps(complexVectorPtr);
+- complexVectorPtr += 4;
+- // Deinterleave IQ data:
+- iValue = _mm_shuffle_ps(complex1, complex2, _MM_SHUFFLE(2,0,2,0));
+- qValue = _mm_shuffle_ps(complex1, complex2, _MM_SHUFFLE(3,1,3,1));
+- // Arctan to get phase:
+- phase = atan2f4(qValue, iValue);
+- // When Q = 0 and I < 0, atan2f4 sucks and returns 2pi vice pi.
+- // Compare to 2pi:
+- keepMask = _mm_cmpneq_ps(phase,testVector);
+- phase = _mm_blendv_ps(correctVector, phase, keepMask);
+- // done with above correction.
+- phase = _mm_mul_ps(phase, vNormalizeFactor);
+- _mm_store_ps((float*)outPtr, phase);
+- outPtr += 4;
+- }
+- number = quarterPoints * 4;
++ const unsigned int quarterPoints = num_points / 4;
++ __m128 testVector = _mm_set_ps1(2 * M_PI);
++ __m128 correctVector = _mm_set_ps1(M_PI);
++ __m128 vNormalizeFactor = _mm_set_ps1(invNormalizeFactor);
++ __m128 phase;
++ __m128 complex1, complex2, iValue, qValue;
++ __m128 keepMask;
++
++ for (; number < quarterPoints; number++) {
++ // Load IQ data:
++ complex1 = _mm_load_ps(complexVectorPtr);
++ complexVectorPtr += 4;
++ complex2 = _mm_load_ps(complexVectorPtr);
++ complexVectorPtr += 4;
++ // Deinterleave IQ data:
++ iValue = _mm_shuffle_ps(complex1, complex2, _MM_SHUFFLE(2, 0, 2, 0));
++ qValue = _mm_shuffle_ps(complex1, complex2, _MM_SHUFFLE(3, 1, 3, 1));
++ // Arctan to get phase:
++ phase = atan2f4(qValue, iValue);
++ // When Q = 0 and I < 0, atan2f4 sucks and returns 2pi vice pi.
++ // Compare to 2pi:
++ keepMask = _mm_cmpneq_ps(phase, testVector);
++ phase = _mm_blendv_ps(correctVector, phase, keepMask);
++ // done with above correction.
++ phase = _mm_mul_ps(phase, vNormalizeFactor);
++ _mm_store_ps((float*)outPtr, phase);
++ outPtr += 4;
++ }
++ number = quarterPoints * 4;
+ #endif /* LV_HAVE_SIMDMATH_H */
+
+- for (; number < num_points; number++) {
+- const float real = *complexVectorPtr++;
+- const float imag = *complexVectorPtr++;
+- *outPtr++ = atan2f(imag, real) * invNormalizeFactor;
+- }
++ for (; number < num_points; number++) {
++ const float real = *complexVectorPtr++;
++ const float imag = *complexVectorPtr++;
++ *outPtr++ = atan2f(imag, real) * invNormalizeFactor;
++ }
+ }
+ #endif /* LV_HAVE_SSE4_1 */
+
+@@ -140,72 +144,78 @@ static inline void volk_32fc_s32f_atan2_32f_a_sse4_1(float* outputVector, const
+ #include <simdmath.h>
+ #endif /* LV_HAVE_LIB_SIMDMATH */
+
+-static inline void volk_32fc_s32f_atan2_32f_a_sse(float* outputVector, const lv_32fc_t* complexVector, const float normalizeFactor, unsigned int num_points){
+- const float* complexVectorPtr = (float*)complexVector;
+- float* outPtr = outputVector;
++static inline void volk_32fc_s32f_atan2_32f_a_sse(float* outputVector,
++ const lv_32fc_t* complexVector,
++ const float normalizeFactor,
++ unsigned int num_points)
++{
++ const float* complexVectorPtr = (float*)complexVector;
++ float* outPtr = outputVector;
+
+- unsigned int number = 0;
+- const float invNormalizeFactor = 1.0 / normalizeFactor;
++ unsigned int number = 0;
++ const float invNormalizeFactor = 1.0 / normalizeFactor;
+
+ #ifdef LV_HAVE_LIB_SIMDMATH
+- const unsigned int quarterPoints = num_points / 4;
+- __m128 testVector = _mm_set_ps1(2*M_PI);
+- __m128 correctVector = _mm_set_ps1(M_PI);
+- __m128 vNormalizeFactor = _mm_set_ps1(invNormalizeFactor);
+- __m128 phase;
+- __m128 complex1, complex2, iValue, qValue;
+- __m128 mask;
+- __m128 keepMask;
+-
+- for (; number < quarterPoints; number++) {
+- // Load IQ data:
+- complex1 = _mm_load_ps(complexVectorPtr);
+- complexVectorPtr += 4;
+- complex2 = _mm_load_ps(complexVectorPtr);
+- complexVectorPtr += 4;
+- // Deinterleave IQ data:
+- iValue = _mm_shuffle_ps(complex1, complex2, _MM_SHUFFLE(2,0,2,0));
+- qValue = _mm_shuffle_ps(complex1, complex2, _MM_SHUFFLE(3,1,3,1));
+- // Arctan to get phase:
+- phase = atan2f4(qValue, iValue);
+- // When Q = 0 and I < 0, atan2f4 sucks and returns 2pi vice pi.
+- // Compare to 2pi:
+- keepMask = _mm_cmpneq_ps(phase,testVector);
+- phase = _mm_and_ps(phase, keepMask);
+- mask = _mm_andnot_ps(keepMask, correctVector);
+- phase = _mm_or_ps(phase, mask);
+- // done with above correction.
+- phase = _mm_mul_ps(phase, vNormalizeFactor);
+- _mm_store_ps((float*)outPtr, phase);
+- outPtr += 4;
+- }
+- number = quarterPoints * 4;
++ const unsigned int quarterPoints = num_points / 4;
++ __m128 testVector = _mm_set_ps1(2 * M_PI);
++ __m128 correctVector = _mm_set_ps1(M_PI);
++ __m128 vNormalizeFactor = _mm_set_ps1(invNormalizeFactor);
++ __m128 phase;
++ __m128 complex1, complex2, iValue, qValue;
++ __m128 mask;
++ __m128 keepMask;
++
++ for (; number < quarterPoints; number++) {
++ // Load IQ data:
++ complex1 = _mm_load_ps(complexVectorPtr);
++ complexVectorPtr += 4;
++ complex2 = _mm_load_ps(complexVectorPtr);
++ complexVectorPtr += 4;
++ // Deinterleave IQ data:
++ iValue = _mm_shuffle_ps(complex1, complex2, _MM_SHUFFLE(2, 0, 2, 0));
++ qValue = _mm_shuffle_ps(complex1, complex2, _MM_SHUFFLE(3, 1, 3, 1));
++ // Arctan to get phase:
++ phase = atan2f4(qValue, iValue);
++ // When Q = 0 and I < 0, atan2f4 sucks and returns 2pi vice pi.
++ // Compare to 2pi:
++ keepMask = _mm_cmpneq_ps(phase, testVector);
++ phase = _mm_and_ps(phase, keepMask);
++ mask = _mm_andnot_ps(keepMask, correctVector);
++ phase = _mm_or_ps(phase, mask);
++ // done with above correction.
++ phase = _mm_mul_ps(phase, vNormalizeFactor);
++ _mm_store_ps((float*)outPtr, phase);
++ outPtr += 4;
++ }
++ number = quarterPoints * 4;
+ #endif /* LV_HAVE_SIMDMATH_H */
+
+- for (; number < num_points; number++) {
+- const float real = *complexVectorPtr++;
+- const float imag = *complexVectorPtr++;
+- *outPtr++ = atan2f(imag, real) * invNormalizeFactor;
+- }
++ for (; number < num_points; number++) {
++ const float real = *complexVectorPtr++;
++ const float imag = *complexVectorPtr++;
++ *outPtr++ = atan2f(imag, real) * invNormalizeFactor;
++ }
+ }
+ #endif /* LV_HAVE_SSE */
+
+ #ifdef LV_HAVE_GENERIC
+
+-static inline void volk_32fc_s32f_atan2_32f_generic(float* outputVector, const lv_32fc_t* inputVector, const float normalizeFactor, unsigned int num_points){
+- float* outPtr = outputVector;
+- const float* inPtr = (float*)inputVector;
+- const float invNormalizeFactor = 1.0 / normalizeFactor;
+- unsigned int number;
+- for ( number = 0; number < num_points; number++) {
+- const float real = *inPtr++;
+- const float imag = *inPtr++;
+- *outPtr++ = atan2f(imag, real) * invNormalizeFactor;
+- }
++static inline void volk_32fc_s32f_atan2_32f_generic(float* outputVector,
++ const lv_32fc_t* inputVector,
++ const float normalizeFactor,
++ unsigned int num_points)
++{
++ float* outPtr = outputVector;
++ const float* inPtr = (float*)inputVector;
++ const float invNormalizeFactor = 1.0 / normalizeFactor;
++ unsigned int number;
++ for (number = 0; number < num_points; number++) {
++ const float real = *inPtr++;
++ const float imag = *inPtr++;
++ *outPtr++ = atan2f(imag, real) * invNormalizeFactor;
++ }
+ }
+ #endif /* LV_HAVE_GENERIC */
+
+
+-
+-
+ #endif /* INCLUDED_volk_32fc_s32f_atan2_32f_a_H */
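+
+/* The SSE4.1 and SSE branches above differ only in how they patch up
+ * atan2f4's 2*pi result for the (Q == 0, I < 0) case: SSE4.1 uses a single
+ * _mm_blendv_ps, while plain SSE builds the same select out of and/andnot/or.
+ * A minimal scalar sketch of that select idiom (select_phase is a
+ * hypothetical helper, not one of the kernels above), assuming the bit-exact
+ * comparison that _mm_cmpneq_ps performs: */
+#include <math.h>
+#include <stdint.h>
+
+static inline float select_phase(float phase)
+{
+    /* cmpneq: mask is all-ones when phase != 2*pi, all-zeros otherwise */
+    const uint32_t keep = (phase != (float)(2.0 * M_PI)) ? 0xFFFFFFFFu : 0x0u;
+    union { float f; uint32_t u; } p = { phase }, c = { (float)M_PI }, r;
+    r.u = (p.u & keep) | (c.u & ~keep); /* phase & mask  |  pi & ~mask */
+    return r.f;
+}
+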
+diff --git a/kernels/volk/volk_32fc_s32f_deinterleave_real_16i.h b/kernels/volk/volk_32fc_s32f_deinterleave_real_16i.h
+index 64c6a8b..f70f494 100644
+--- a/kernels/volk/volk_32fc_s32f_deinterleave_real_16i.h
++++ b/kernels/volk/volk_32fc_s32f_deinterleave_real_16i.h
+@@ -30,8 +30,8 @@
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+- * void volk_32fc_s32f_deinterleave_real_16i(int16_t* iBuffer, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points)
+- * \endcode
++ * void volk_32fc_s32f_deinterleave_real_16i(int16_t* iBuffer, const lv_32fc_t*
++ * complexVector, const float scalar, unsigned int num_points) \endcode
+ *
+ * \b Inputs
+ * \li complexVector: The complex input vector.
+@@ -73,61 +73,62 @@
+ #ifndef INCLUDED_volk_32fc_s32f_deinterleave_real_16i_a_H
+ #define INCLUDED_volk_32fc_s32f_deinterleave_real_16i_a_H
+
+-#include <volk/volk_common.h>
+ #include <inttypes.h>
+ #include <stdio.h>
++#include <volk/volk_common.h>
+
+
+ #ifdef LV_HAVE_AVX2
+ #include <immintrin.h>
+
+ static inline void
+-volk_32fc_s32f_deinterleave_real_16i_a_avx2(int16_t* iBuffer, const lv_32fc_t* complexVector,
+- const float scalar, unsigned int num_points)
++volk_32fc_s32f_deinterleave_real_16i_a_avx2(int16_t* iBuffer,
++ const lv_32fc_t* complexVector,
++ const float scalar,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+- const unsigned int eighthPoints = num_points / 8;
+-
+- const float* complexVectorPtr = (float*)complexVector;
+- int16_t* iBufferPtr = iBuffer;
++ unsigned int number = 0;
++ const unsigned int eighthPoints = num_points / 8;
+
+- __m256 vScalar = _mm256_set1_ps(scalar);
++ const float* complexVectorPtr = (float*)complexVector;
++ int16_t* iBufferPtr = iBuffer;
+
+- __m256 cplxValue1, cplxValue2, iValue;
+- __m256i a;
+- __m128i b;
++ __m256 vScalar = _mm256_set1_ps(scalar);
+
+- __m256i idx = _mm256_set_epi32(3,3,3,3,5,1,4,0);
++ __m256 cplxValue1, cplxValue2, iValue;
++ __m256i a;
++ __m128i b;
+
+- for(;number < eighthPoints; number++){
+- cplxValue1 = _mm256_load_ps(complexVectorPtr);
+- complexVectorPtr += 8;
++ __m256i idx = _mm256_set_epi32(3, 3, 3, 3, 5, 1, 4, 0);
+
+- cplxValue2 = _mm256_load_ps(complexVectorPtr);
+- complexVectorPtr += 8;
++ for (; number < eighthPoints; number++) {
++ cplxValue1 = _mm256_load_ps(complexVectorPtr);
++ complexVectorPtr += 8;
+
+- // Arrange in i1i2i3i4 format
+- iValue = _mm256_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0));
++ cplxValue2 = _mm256_load_ps(complexVectorPtr);
++ complexVectorPtr += 8;
+
+- iValue = _mm256_mul_ps(iValue, vScalar);
++ // Arrange in i1i2i3i4 format
++ iValue = _mm256_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2, 0, 2, 0));
+
+- iValue = _mm256_round_ps(iValue, _MM_FROUND_TO_ZERO);
+- a = _mm256_cvtps_epi32(iValue);
+- a = _mm256_packs_epi32(a,a);
+- a = _mm256_permutevar8x32_epi32(a,idx);
+- b = _mm256_extracti128_si256(a,0);
++ iValue = _mm256_mul_ps(iValue, vScalar);
+
+- _mm_store_si128((__m128i*)iBufferPtr,b);
+- iBufferPtr += 8;
++ iValue = _mm256_round_ps(iValue, _MM_FROUND_TO_ZERO);
++ a = _mm256_cvtps_epi32(iValue);
++ a = _mm256_packs_epi32(a, a);
++ a = _mm256_permutevar8x32_epi32(a, idx);
++ b = _mm256_extracti128_si256(a, 0);
+
+- }
++ _mm_store_si128((__m128i*)iBufferPtr, b);
++ iBufferPtr += 8;
++ }
+
+- number = eighthPoints * 8;
+- iBufferPtr = &iBuffer[number];
+- for(; number < num_points; number++){
+- *iBufferPtr++ = (int16_t)(*complexVectorPtr++ * scalar);
+- complexVectorPtr++;
+- }
++ number = eighthPoints * 8;
++ iBufferPtr = &iBuffer[number];
++ for (; number < num_points; number++) {
++ *iBufferPtr++ = (int16_t)(*complexVectorPtr++ * scalar);
++ complexVectorPtr++;
++ }
+ }
+
+
+@@ -137,46 +138,48 @@ volk_32fc_s32f_deinterleave_real_16i_a_avx2(int16_t* iBuffer, const lv_32fc_t* c
+ #include <xmmintrin.h>
+
+ static inline void
+-volk_32fc_s32f_deinterleave_real_16i_a_sse(int16_t* iBuffer, const lv_32fc_t* complexVector,
+- const float scalar, unsigned int num_points)
++volk_32fc_s32f_deinterleave_real_16i_a_sse(int16_t* iBuffer,
++ const lv_32fc_t* complexVector,
++ const float scalar,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+- const unsigned int quarterPoints = num_points / 4;
++ unsigned int number = 0;
++ const unsigned int quarterPoints = num_points / 4;
+
+- const float* complexVectorPtr = (float*)complexVector;
+- int16_t* iBufferPtr = iBuffer;
++ const float* complexVectorPtr = (float*)complexVector;
++ int16_t* iBufferPtr = iBuffer;
+
+- __m128 vScalar = _mm_set_ps1(scalar);
++ __m128 vScalar = _mm_set_ps1(scalar);
+
+- __m128 cplxValue1, cplxValue2, iValue;
++ __m128 cplxValue1, cplxValue2, iValue;
+
+- __VOLK_ATTR_ALIGNED(16) float floatBuffer[4];
++ __VOLK_ATTR_ALIGNED(16) float floatBuffer[4];
+
+- for(;number < quarterPoints; number++){
+- cplxValue1 = _mm_load_ps(complexVectorPtr);
+- complexVectorPtr += 4;
++ for (; number < quarterPoints; number++) {
++ cplxValue1 = _mm_load_ps(complexVectorPtr);
++ complexVectorPtr += 4;
+
+- cplxValue2 = _mm_load_ps(complexVectorPtr);
+- complexVectorPtr += 4;
++ cplxValue2 = _mm_load_ps(complexVectorPtr);
++ complexVectorPtr += 4;
+
+- // Arrange in i1i2i3i4 format
+- iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0));
++ // Arrange in i1i2i3i4 format
++ iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2, 0, 2, 0));
+
+- iValue = _mm_mul_ps(iValue, vScalar);
++ iValue = _mm_mul_ps(iValue, vScalar);
+
+- _mm_store_ps(floatBuffer, iValue);
+- *iBufferPtr++ = (int16_t)(floatBuffer[0]);
+- *iBufferPtr++ = (int16_t)(floatBuffer[1]);
+- *iBufferPtr++ = (int16_t)(floatBuffer[2]);
+- *iBufferPtr++ = (int16_t)(floatBuffer[3]);
+- }
++ _mm_store_ps(floatBuffer, iValue);
++ *iBufferPtr++ = (int16_t)(floatBuffer[0]);
++ *iBufferPtr++ = (int16_t)(floatBuffer[1]);
++ *iBufferPtr++ = (int16_t)(floatBuffer[2]);
++ *iBufferPtr++ = (int16_t)(floatBuffer[3]);
++ }
+
+- number = quarterPoints * 4;
+- iBufferPtr = &iBuffer[number];
+- for(; number < num_points; number++){
+- *iBufferPtr++ = (int16_t)(*complexVectorPtr++ * scalar);
+- complexVectorPtr++;
+- }
++ number = quarterPoints * 4;
++ iBufferPtr = &iBuffer[number];
++ for (; number < num_points; number++) {
++ *iBufferPtr++ = (int16_t)(*complexVectorPtr++ * scalar);
++ complexVectorPtr++;
++ }
+ }
+
+ #endif /* LV_HAVE_SSE */
+@@ -185,16 +188,18 @@ volk_32fc_s32f_deinterleave_real_16i_a_sse(int16_t* iBuffer, const lv_32fc_t* co
+ #ifdef LV_HAVE_GENERIC
+
+ static inline void
+-volk_32fc_s32f_deinterleave_real_16i_generic(int16_t* iBuffer, const lv_32fc_t* complexVector,
+- const float scalar, unsigned int num_points)
++volk_32fc_s32f_deinterleave_real_16i_generic(int16_t* iBuffer,
++ const lv_32fc_t* complexVector,
++ const float scalar,
++ unsigned int num_points)
+ {
+- const float* complexVectorPtr = (float*)complexVector;
+- int16_t* iBufferPtr = iBuffer;
+- unsigned int number = 0;
+- for(number = 0; number < num_points; number++){
+- *iBufferPtr++ = (int16_t)(*complexVectorPtr++ * scalar);
+- complexVectorPtr++;
+- }
++ const float* complexVectorPtr = (float*)complexVector;
++ int16_t* iBufferPtr = iBuffer;
++ unsigned int number = 0;
++ for (number = 0; number < num_points; number++) {
++ *iBufferPtr++ = (int16_t)(*complexVectorPtr++ * scalar);
++ complexVectorPtr++;
++ }
+ }
+
+ #endif /* LV_HAVE_GENERIC */
+@@ -204,60 +209,61 @@ volk_32fc_s32f_deinterleave_real_16i_generic(int16_t* iBuffer, const lv_32fc_t*
+ #ifndef INCLUDED_volk_32fc_s32f_deinterleave_real_16i_u_H
+ #define INCLUDED_volk_32fc_s32f_deinterleave_real_16i_u_H
+
+-#include <volk/volk_common.h>
+ #include <inttypes.h>
+ #include <stdio.h>
++#include <volk/volk_common.h>
+
+ #ifdef LV_HAVE_AVX2
+ #include <immintrin.h>
+
+ static inline void
+-volk_32fc_s32f_deinterleave_real_16i_u_avx2(int16_t* iBuffer, const lv_32fc_t* complexVector,
+- const float scalar, unsigned int num_points)
++volk_32fc_s32f_deinterleave_real_16i_u_avx2(int16_t* iBuffer,
++ const lv_32fc_t* complexVector,
++ const float scalar,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+- const unsigned int eighthPoints = num_points / 8;
+-
+- const float* complexVectorPtr = (float*)complexVector;
+- int16_t* iBufferPtr = iBuffer;
++ unsigned int number = 0;
++ const unsigned int eighthPoints = num_points / 8;
+
+- __m256 vScalar = _mm256_set1_ps(scalar);
++ const float* complexVectorPtr = (float*)complexVector;
++ int16_t* iBufferPtr = iBuffer;
+
+- __m256 cplxValue1, cplxValue2, iValue;
+- __m256i a;
+- __m128i b;
++ __m256 vScalar = _mm256_set1_ps(scalar);
+
+- __m256i idx = _mm256_set_epi32(3,3,3,3,5,1,4,0);
++ __m256 cplxValue1, cplxValue2, iValue;
++ __m256i a;
++ __m128i b;
+
+- for(;number < eighthPoints; number++){
+- cplxValue1 = _mm256_loadu_ps(complexVectorPtr);
+- complexVectorPtr += 8;
++ __m256i idx = _mm256_set_epi32(3, 3, 3, 3, 5, 1, 4, 0);
+
+- cplxValue2 = _mm256_loadu_ps(complexVectorPtr);
+- complexVectorPtr += 8;
++ for (; number < eighthPoints; number++) {
++ cplxValue1 = _mm256_loadu_ps(complexVectorPtr);
++ complexVectorPtr += 8;
+
+- // Arrange in i1i2i3i4 format
+- iValue = _mm256_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0));
++ cplxValue2 = _mm256_loadu_ps(complexVectorPtr);
++ complexVectorPtr += 8;
+
+- iValue = _mm256_mul_ps(iValue, vScalar);
++ // Arrange in i1i2i3i4 format
++ iValue = _mm256_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2, 0, 2, 0));
+
+- iValue = _mm256_round_ps(iValue, _MM_FROUND_TO_ZERO);
+- a = _mm256_cvtps_epi32(iValue);
+- a = _mm256_packs_epi32(a,a);
+- a = _mm256_permutevar8x32_epi32(a,idx);
+- b = _mm256_extracti128_si256(a,0);
++ iValue = _mm256_mul_ps(iValue, vScalar);
+
+- _mm_storeu_si128((__m128i*)iBufferPtr,b);
+- iBufferPtr += 8;
++ iValue = _mm256_round_ps(iValue, _MM_FROUND_TO_ZERO);
++ a = _mm256_cvtps_epi32(iValue);
++ a = _mm256_packs_epi32(a, a);
++ a = _mm256_permutevar8x32_epi32(a, idx);
++ b = _mm256_extracti128_si256(a, 0);
+
+- }
++ _mm_storeu_si128((__m128i*)iBufferPtr, b);
++ iBufferPtr += 8;
++ }
+
+- number = eighthPoints * 8;
+- iBufferPtr = &iBuffer[number];
+- for(; number < num_points; number++){
+- *iBufferPtr++ = (int16_t)(*complexVectorPtr++ * scalar);
+- complexVectorPtr++;
+- }
++ number = eighthPoints * 8;
++ iBufferPtr = &iBuffer[number];
++ for (; number < num_points; number++) {
++ *iBufferPtr++ = (int16_t)(*complexVectorPtr++ * scalar);
++ complexVectorPtr++;
++ }
+ }
+
+ #endif /* LV_HAVE_AVX2 */
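+
+/* In the AVX2 paths above, _mm256_shuffle_ps interleaves per 128-bit lane
+ * (yielding r0 r1 r4 r5 | r2 r3 r6 r7) and _mm256_packs_epi32 also packs per
+ * lane, so the final _mm256_permutevar8x32_epi32 with index {0, 4, 1, 5, ...}
+ * is what restores output order before the low 128 bits are stored.
+ * A self-contained sketch of just the per-lane pack fix-up
+ * (pack8_epi32_to_epi16 is a hypothetical helper); with int32 input already
+ * in order, the required index becomes {0, 1, 4, 5}: */
+#include <immintrin.h>
+#include <stdint.h>
+
+static inline void pack8_epi32_to_epi16(const int32_t in[8], int16_t out[8])
+{
+    __m256i a = _mm256_loadu_si256((const __m256i*)in);
+    const __m256i idx = _mm256_set_epi32(0, 0, 0, 0, 5, 4, 1, 0);
+    a = _mm256_packs_epi32(a, a);            /* saturating pack, per 128-bit lane */
+    a = _mm256_permutevar8x32_epi32(a, idx); /* gather dwords 0, 1, 4, 5 into the low lane */
+    _mm_storeu_si128((__m128i*)out, _mm256_extracti128_si256(a, 0));
+}
+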
+diff --git a/kernels/volk/volk_32fc_s32f_magnitude_16i.h b/kernels/volk/volk_32fc_s32f_magnitude_16i.h
+index 6e7e7cb..91a5b8e 100644
+--- a/kernels/volk/volk_32fc_s32f_magnitude_16i.h
++++ b/kernels/volk/volk_32fc_s32f_magnitude_16i.h
+@@ -31,8 +31,8 @@
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+- * void volk_32fc_s32f_magnitude_16i(int16_t* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points)
+- * \endcode
++ * void volk_32fc_s32f_magnitude_16i(int16_t* magnitudeVector, const lv_32fc_t*
++ * complexVector, unsigned int num_points) \endcode
+ *
+ * \b Inputs
+ * \li complexVector: The complex input vector.
+@@ -73,123 +73,129 @@
+ #ifdef LV_HAVE_GENERIC
+ #include <volk/volk_common.h>
+
+-static inline void
+-volk_32fc_s32f_magnitude_16i_generic(int16_t* magnitudeVector, const lv_32fc_t* complexVector,
+- const float scalar, unsigned int num_points)
++static inline void volk_32fc_s32f_magnitude_16i_generic(int16_t* magnitudeVector,
++ const lv_32fc_t* complexVector,
++ const float scalar,
++ unsigned int num_points)
+ {
+- const float* complexVectorPtr = (float*)complexVector;
+- int16_t* magnitudeVectorPtr = magnitudeVector;
+- unsigned int number = 0;
+- for(number = 0; number < num_points; number++){
+- __VOLK_VOLATILE float real = *complexVectorPtr++;
+- __VOLK_VOLATILE float imag = *complexVectorPtr++;
+- real *= real;
+- imag *= imag;
+- *magnitudeVectorPtr++ = (int16_t)rintf(scalar*sqrtf(real + imag));
+- }
++ const float* complexVectorPtr = (float*)complexVector;
++ int16_t* magnitudeVectorPtr = magnitudeVector;
++ unsigned int number = 0;
++ for (number = 0; number < num_points; number++) {
++ __VOLK_VOLATILE float real = *complexVectorPtr++;
++ __VOLK_VOLATILE float imag = *complexVectorPtr++;
++ real *= real;
++ imag *= imag;
++ *magnitudeVectorPtr++ = (int16_t)rintf(scalar * sqrtf(real + imag));
++ }
+ }
+ #endif /* LV_HAVE_GENERIC */
+
+ #ifndef INCLUDED_volk_32fc_s32f_magnitude_16i_a_H
+ #define INCLUDED_volk_32fc_s32f_magnitude_16i_a_H
+
+-#include <volk/volk_common.h>
+ #include <inttypes.h>
+-#include <stdio.h>
+ #include <math.h>
++#include <stdio.h>
++#include <volk/volk_common.h>
+
+ #ifdef LV_HAVE_AVX2
+ #include <immintrin.h>
+
+-static inline void
+-volk_32fc_s32f_magnitude_16i_a_avx2(int16_t* magnitudeVector, const lv_32fc_t* complexVector,
+- const float scalar, unsigned int num_points)
++static inline void volk_32fc_s32f_magnitude_16i_a_avx2(int16_t* magnitudeVector,
++ const lv_32fc_t* complexVector,
++ const float scalar,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+- const unsigned int eighthPoints = num_points / 8;
++ unsigned int number = 0;
++ const unsigned int eighthPoints = num_points / 8;
+
+- const float* complexVectorPtr = (const float*)complexVector;
+- int16_t* magnitudeVectorPtr = magnitudeVector;
++ const float* complexVectorPtr = (const float*)complexVector;
++ int16_t* magnitudeVectorPtr = magnitudeVector;
+
+- __m256 vScalar = _mm256_set1_ps(scalar);
+- __m256i idx = _mm256_set_epi32(0,0,0,0,5,1,4,0);
+- __m256 cplxValue1, cplxValue2, result;
+- __m256i resultInt;
+- __m128i resultShort;
++ __m256 vScalar = _mm256_set1_ps(scalar);
++ __m256i idx = _mm256_set_epi32(0, 0, 0, 0, 5, 1, 4, 0);
++ __m256 cplxValue1, cplxValue2, result;
++ __m256i resultInt;
++ __m128i resultShort;
+
+- for(;number < eighthPoints; number++){
+- cplxValue1 = _mm256_load_ps(complexVectorPtr);
+- complexVectorPtr += 8;
++ for (; number < eighthPoints; number++) {
++ cplxValue1 = _mm256_load_ps(complexVectorPtr);
++ complexVectorPtr += 8;
+
+- cplxValue2 = _mm256_load_ps(complexVectorPtr);
+- complexVectorPtr += 8;
++ cplxValue2 = _mm256_load_ps(complexVectorPtr);
++ complexVectorPtr += 8;
+
+- cplxValue1 = _mm256_mul_ps(cplxValue1, cplxValue1); // Square the values
+- cplxValue2 = _mm256_mul_ps(cplxValue2, cplxValue2); // Square the Values
++ cplxValue1 = _mm256_mul_ps(cplxValue1, cplxValue1); // Square the values
++ cplxValue2 = _mm256_mul_ps(cplxValue2, cplxValue2); // Square the Values
+
+- result = _mm256_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values
++ result = _mm256_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values
+
+- result = _mm256_sqrt_ps(result);
++ result = _mm256_sqrt_ps(result);
+
+- result = _mm256_mul_ps(result, vScalar);
++ result = _mm256_mul_ps(result, vScalar);
+
+- resultInt = _mm256_cvtps_epi32(result);
+- resultInt = _mm256_packs_epi32(resultInt, resultInt);
+- resultInt = _mm256_permutevar8x32_epi32(resultInt, idx); //permute to compensate for shuffling in hadd and packs
+- resultShort = _mm256_extracti128_si256(resultInt,0);
+- _mm_store_si128((__m128i*)magnitudeVectorPtr,resultShort);
+- magnitudeVectorPtr += 8;
+- }
++ resultInt = _mm256_cvtps_epi32(result);
++ resultInt = _mm256_packs_epi32(resultInt, resultInt);
++ resultInt = _mm256_permutevar8x32_epi32(
++ resultInt, idx); // permute to compensate for shuffling in hadd and packs
++ resultShort = _mm256_extracti128_si256(resultInt, 0);
++ _mm_store_si128((__m128i*)magnitudeVectorPtr, resultShort);
++ magnitudeVectorPtr += 8;
++ }
+
+- number = eighthPoints * 8;
+- volk_32fc_s32f_magnitude_16i_generic(magnitudeVector+number, complexVector+number, scalar, num_points-number);
++ number = eighthPoints * 8;
++ volk_32fc_s32f_magnitude_16i_generic(
++ magnitudeVector + number, complexVector + number, scalar, num_points - number);
+ }
+ #endif /* LV_HAVE_AVX2 */
+
+ #ifdef LV_HAVE_SSE3
+ #include <pmmintrin.h>
+
+-static inline void
+-volk_32fc_s32f_magnitude_16i_a_sse3(int16_t* magnitudeVector, const lv_32fc_t* complexVector,
+- const float scalar, unsigned int num_points)
++static inline void volk_32fc_s32f_magnitude_16i_a_sse3(int16_t* magnitudeVector,
++ const lv_32fc_t* complexVector,
++ const float scalar,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+- const unsigned int quarterPoints = num_points / 4;
++ unsigned int number = 0;
++ const unsigned int quarterPoints = num_points / 4;
+
+- const float* complexVectorPtr = (const float*)complexVector;
+- int16_t* magnitudeVectorPtr = magnitudeVector;
++ const float* complexVectorPtr = (const float*)complexVector;
++ int16_t* magnitudeVectorPtr = magnitudeVector;
+
+- __m128 vScalar = _mm_set_ps1(scalar);
++ __m128 vScalar = _mm_set_ps1(scalar);
+
+- __m128 cplxValue1, cplxValue2, result;
++ __m128 cplxValue1, cplxValue2, result;
+
+- __VOLK_ATTR_ALIGNED(16) float floatBuffer[4];
++ __VOLK_ATTR_ALIGNED(16) float floatBuffer[4];
+
+- for(;number < quarterPoints; number++){
+- cplxValue1 = _mm_load_ps(complexVectorPtr);
+- complexVectorPtr += 4;
++ for (; number < quarterPoints; number++) {
++ cplxValue1 = _mm_load_ps(complexVectorPtr);
++ complexVectorPtr += 4;
+
+- cplxValue2 = _mm_load_ps(complexVectorPtr);
+- complexVectorPtr += 4;
++ cplxValue2 = _mm_load_ps(complexVectorPtr);
++ complexVectorPtr += 4;
+
+- cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values
+- cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values
++ cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values
++ cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values
+
+- result = _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values
++ result = _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values
+
+- result = _mm_sqrt_ps(result);
++ result = _mm_sqrt_ps(result);
+
+- result = _mm_mul_ps(result, vScalar);
++ result = _mm_mul_ps(result, vScalar);
+
+- _mm_store_ps(floatBuffer, result);
+- *magnitudeVectorPtr++ = (int16_t)rintf(floatBuffer[0]);
+- *magnitudeVectorPtr++ = (int16_t)rintf(floatBuffer[1]);
+- *magnitudeVectorPtr++ = (int16_t)rintf(floatBuffer[2]);
+- *magnitudeVectorPtr++ = (int16_t)rintf(floatBuffer[3]);
+- }
++ _mm_store_ps(floatBuffer, result);
++ *magnitudeVectorPtr++ = (int16_t)rintf(floatBuffer[0]);
++ *magnitudeVectorPtr++ = (int16_t)rintf(floatBuffer[1]);
++ *magnitudeVectorPtr++ = (int16_t)rintf(floatBuffer[2]);
++ *magnitudeVectorPtr++ = (int16_t)rintf(floatBuffer[3]);
++ }
+
+- number = quarterPoints * 4;
+- volk_32fc_s32f_magnitude_16i_generic(magnitudeVector+number, complexVector+number, scalar, num_points-number);
++ number = quarterPoints * 4;
++ volk_32fc_s32f_magnitude_16i_generic(
++ magnitudeVector + number, complexVector + number, scalar, num_points - number);
+ }
+ #endif /* LV_HAVE_SSE3 */
+
+@@ -197,53 +203,57 @@ volk_32fc_s32f_magnitude_16i_a_sse3(int16_t* magnitudeVector, const lv_32fc_t* c
+ #ifdef LV_HAVE_SSE
+ #include <xmmintrin.h>
+
+-static inline void
+-volk_32fc_s32f_magnitude_16i_a_sse(int16_t* magnitudeVector, const lv_32fc_t* complexVector,
+- const float scalar, unsigned int num_points)
++static inline void volk_32fc_s32f_magnitude_16i_a_sse(int16_t* magnitudeVector,
++ const lv_32fc_t* complexVector,
++ const float scalar,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+- const unsigned int quarterPoints = num_points / 4;
++ unsigned int number = 0;
++ const unsigned int quarterPoints = num_points / 4;
+
+- const float* complexVectorPtr = (const float*)complexVector;
+- int16_t* magnitudeVectorPtr = magnitudeVector;
++ const float* complexVectorPtr = (const float*)complexVector;
++ int16_t* magnitudeVectorPtr = magnitudeVector;
+
+- __m128 vScalar = _mm_set_ps1(scalar);
++ __m128 vScalar = _mm_set_ps1(scalar);
+
+- __m128 cplxValue1, cplxValue2, result;
+- __m128 iValue, qValue;
++ __m128 cplxValue1, cplxValue2, result;
++ __m128 iValue, qValue;
+
+- __VOLK_ATTR_ALIGNED(16) float floatBuffer[4];
++ __VOLK_ATTR_ALIGNED(16) float floatBuffer[4];
+
+- for(;number < quarterPoints; number++){
+- cplxValue1 = _mm_load_ps(complexVectorPtr);
+- complexVectorPtr += 4;
++ for (; number < quarterPoints; number++) {
++ cplxValue1 = _mm_load_ps(complexVectorPtr);
++ complexVectorPtr += 4;
+
+- cplxValue2 = _mm_load_ps(complexVectorPtr);
+- complexVectorPtr += 4;
++ cplxValue2 = _mm_load_ps(complexVectorPtr);
++ complexVectorPtr += 4;
+
+- // Arrange in i1i2i3i4 format
+- iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0));
+- // Arrange in q1q2q3q4 format
+- qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1));
++ // Arrange in i1i2i3i4 format
++ iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2, 0, 2, 0));
++ // Arrange in q1q2q3q4 format
++ qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3, 1, 3, 1));
+
+- __VOLK_VOLATILE __m128 iValue2 = _mm_mul_ps(iValue, iValue); // Square the I values
+- __VOLK_VOLATILE __m128 qValue2 = _mm_mul_ps(qValue, qValue); // Square the Q Values
++ __VOLK_VOLATILE __m128 iValue2 =
++ _mm_mul_ps(iValue, iValue); // Square the I values
++ __VOLK_VOLATILE __m128 qValue2 =
++ _mm_mul_ps(qValue, qValue); // Square the Q Values
+
+- result = _mm_add_ps(iValue2, qValue2); // Add the I2 and Q2 values
++ result = _mm_add_ps(iValue2, qValue2); // Add the I2 and Q2 values
+
+- result = _mm_sqrt_ps(result);
++ result = _mm_sqrt_ps(result);
+
+- result = _mm_mul_ps(result, vScalar);
++ result = _mm_mul_ps(result, vScalar);
+
+- _mm_store_ps(floatBuffer, result);
+- *magnitudeVectorPtr++ = (int16_t)rintf(floatBuffer[0]);
+- *magnitudeVectorPtr++ = (int16_t)rintf(floatBuffer[1]);
+- *magnitudeVectorPtr++ = (int16_t)rintf(floatBuffer[2]);
+- *magnitudeVectorPtr++ = (int16_t)rintf(floatBuffer[3]);
+- }
++ _mm_store_ps(floatBuffer, result);
++ *magnitudeVectorPtr++ = (int16_t)rintf(floatBuffer[0]);
++ *magnitudeVectorPtr++ = (int16_t)rintf(floatBuffer[1]);
++ *magnitudeVectorPtr++ = (int16_t)rintf(floatBuffer[2]);
++ *magnitudeVectorPtr++ = (int16_t)rintf(floatBuffer[3]);
++ }
+
+- number = quarterPoints * 4;
+- volk_32fc_s32f_magnitude_16i_generic(magnitudeVector+number, complexVector+number, scalar, num_points-number);
++ number = quarterPoints * 4;
++ volk_32fc_s32f_magnitude_16i_generic(
++ magnitudeVector + number, complexVector + number, scalar, num_points - number);
+ }
+ #endif /* LV_HAVE_SSE */
+
+@@ -253,56 +263,59 @@ volk_32fc_s32f_magnitude_16i_a_sse(int16_t* magnitudeVector, const lv_32fc_t* co
+ #ifndef INCLUDED_volk_32fc_s32f_magnitude_16i_u_H
+ #define INCLUDED_volk_32fc_s32f_magnitude_16i_u_H
+
+-#include <volk/volk_common.h>
+ #include <inttypes.h>
+-#include <stdio.h>
+ #include <math.h>
++#include <stdio.h>
++#include <volk/volk_common.h>
+
+ #ifdef LV_HAVE_AVX2
+ #include <immintrin.h>
+
+-static inline void
+-volk_32fc_s32f_magnitude_16i_u_avx2(int16_t* magnitudeVector, const lv_32fc_t* complexVector,
+- const float scalar, unsigned int num_points)
++static inline void volk_32fc_s32f_magnitude_16i_u_avx2(int16_t* magnitudeVector,
++ const lv_32fc_t* complexVector,
++ const float scalar,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+- const unsigned int eighthPoints = num_points / 8;
++ unsigned int number = 0;
++ const unsigned int eighthPoints = num_points / 8;
+
+- const float* complexVectorPtr = (const float*)complexVector;
+- int16_t* magnitudeVectorPtr = magnitudeVector;
++ const float* complexVectorPtr = (const float*)complexVector;
++ int16_t* magnitudeVectorPtr = magnitudeVector;
+
+- __m256 vScalar = _mm256_set1_ps(scalar);
+- __m256i idx = _mm256_set_epi32(0,0,0,0,5,1,4,0);
+- __m256 cplxValue1, cplxValue2, result;
+- __m256i resultInt;
+- __m128i resultShort;
++ __m256 vScalar = _mm256_set1_ps(scalar);
++ __m256i idx = _mm256_set_epi32(0, 0, 0, 0, 5, 1, 4, 0);
++ __m256 cplxValue1, cplxValue2, result;
++ __m256i resultInt;
++ __m128i resultShort;
+
+- for(;number < eighthPoints; number++){
+- cplxValue1 = _mm256_loadu_ps(complexVectorPtr);
+- complexVectorPtr += 8;
++ for (; number < eighthPoints; number++) {
++ cplxValue1 = _mm256_loadu_ps(complexVectorPtr);
++ complexVectorPtr += 8;
+
+- cplxValue2 = _mm256_loadu_ps(complexVectorPtr);
+- complexVectorPtr += 8;
++ cplxValue2 = _mm256_loadu_ps(complexVectorPtr);
++ complexVectorPtr += 8;
+
+- cplxValue1 = _mm256_mul_ps(cplxValue1, cplxValue1); // Square the values
+- cplxValue2 = _mm256_mul_ps(cplxValue2, cplxValue2); // Square the Values
++ cplxValue1 = _mm256_mul_ps(cplxValue1, cplxValue1); // Square the values
++ cplxValue2 = _mm256_mul_ps(cplxValue2, cplxValue2); // Square the Values
+
+- result = _mm256_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values
++ result = _mm256_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values
+
+- result = _mm256_sqrt_ps(result);
++ result = _mm256_sqrt_ps(result);
+
+- result = _mm256_mul_ps(result, vScalar);
++ result = _mm256_mul_ps(result, vScalar);
+
+- resultInt = _mm256_cvtps_epi32(result);
+- resultInt = _mm256_packs_epi32(resultInt, resultInt);
+- resultInt = _mm256_permutevar8x32_epi32(resultInt, idx); //permute to compensate for shuffling in hadd and packs
+- resultShort = _mm256_extracti128_si256(resultInt,0);
+- _mm_storeu_si128((__m128i*)magnitudeVectorPtr,resultShort);
+- magnitudeVectorPtr += 8;
+- }
++ resultInt = _mm256_cvtps_epi32(result);
++ resultInt = _mm256_packs_epi32(resultInt, resultInt);
++ resultInt = _mm256_permutevar8x32_epi32(
++ resultInt, idx); // permute to compensate for shuffling in hadd and packs
++ resultShort = _mm256_extracti128_si256(resultInt, 0);
++ _mm_storeu_si128((__m128i*)magnitudeVectorPtr, resultShort);
++ magnitudeVectorPtr += 8;
++ }
+
+- number = eighthPoints * 8;
+- volk_32fc_s32f_magnitude_16i_generic(magnitudeVector+number, complexVector+number, scalar, num_points-number);
++ number = eighthPoints * 8;
++ volk_32fc_s32f_magnitude_16i_generic(
++ magnitudeVector + number, complexVector + number, scalar, num_points - number);
+ }
+ #endif /* LV_HAVE_AVX2 */
+
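+/* A conversion detail worth noting between volk_32fc_s32f_magnitude_16i above
+ * and volk_32fc_s32f_deinterleave_real_16i earlier in this patch: the
+ * deinterleave kernels truncate (plain C cast / _MM_FROUND_TO_ZERO), while the
+ * magnitude kernels round to nearest (rintf / _mm256_cvtps_epi32 under the
+ * default MXCSR rounding mode). Scalar equivalents of the two conversions,
+ * with hypothetical helper names: */
+#include <math.h>
+#include <stdint.h>
+
+static inline int16_t to_i16_trunc(float x) { return (int16_t)x; }        /* C cast: rounds toward zero */
+static inline int16_t to_i16_round(float x) { return (int16_t)rintf(x); } /* current mode, nearest-even by default */
+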
+diff --git a/kernels/volk/volk_32fc_s32f_power_32fc.h b/kernels/volk/volk_32fc_s32f_power_32fc.h
+index d2803f2..b31179c 100644
+--- a/kernels/volk/volk_32fc_s32f_power_32fc.h
++++ b/kernels/volk/volk_32fc_s32f_power_32fc.h
+@@ -31,8 +31,8 @@
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+- * void volk_32fc_s32f_power_32fc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const float power, unsigned int num_points)
+- * \endcode
++ * void volk_32fc_s32f_power_32fc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const
++ * float power, unsigned int num_points) \endcode
+ *
+ * \b Inputs
+ * \li aVector: The complex input vector.
+@@ -56,15 +56,17 @@
+ #define INCLUDED_volk_32fc_s32f_power_32fc_a_H
+
+ #include <inttypes.h>
+-#include <stdio.h>
+ #include <math.h>
++#include <stdio.h>
+
+ //! raise a complex float to a real float power
+-static inline lv_32fc_t __volk_s32fc_s32f_power_s32fc_a(const lv_32fc_t exp, const float power)
++static inline lv_32fc_t __volk_s32fc_s32f_power_s32fc_a(const lv_32fc_t exp,
++ const float power)
+ {
+- const float arg = power*atan2f(lv_creal(exp), lv_cimag(exp));
+- const float mag = powf(lv_creal(exp)*lv_creal(exp) + lv_cimag(exp)*lv_cimag(exp), power/2);
+- return mag*lv_cmake(-cosf(arg), sinf(arg));
++ const float arg = power * atan2f(lv_creal(exp), lv_cimag(exp));
++ const float mag =
++ powf(lv_creal(exp) * lv_creal(exp) + lv_cimag(exp) * lv_cimag(exp), power / 2);
++ return mag * lv_cmake(-cosf(arg), sinf(arg));
+ }
+
+ #ifdef LV_HAVE_SSE
+@@ -74,83 +76,94 @@ static inline lv_32fc_t __volk_s32fc_s32f_power_s32fc_a(const lv_32fc_t exp, con
+ #include <simdmath.h>
+ #endif /* LV_HAVE_LIB_SIMDMATH */
+
+-static inline void
+-volk_32fc_s32f_power_32fc_a_sse(lv_32fc_t* cVector, const lv_32fc_t* aVector,
+- const float power, unsigned int num_points)
++static inline void volk_32fc_s32f_power_32fc_a_sse(lv_32fc_t* cVector,
++ const lv_32fc_t* aVector,
++ const float power,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
++ unsigned int number = 0;
+
+- lv_32fc_t* cPtr = cVector;
+- const lv_32fc_t* aPtr = aVector;
++ lv_32fc_t* cPtr = cVector;
++ const lv_32fc_t* aPtr = aVector;
+
+ #ifdef LV_HAVE_LIB_SIMDMATH
+- const unsigned int quarterPoints = num_points / 4;
+- __m128 vPower = _mm_set_ps1(power);
++ const unsigned int quarterPoints = num_points / 4;
++ __m128 vPower = _mm_set_ps1(power);
+
+- __m128 cplxValue1, cplxValue2, magnitude, phase, iValue, qValue;
+- for(;number < quarterPoints; number++){
++ __m128 cplxValue1, cplxValue2, magnitude, phase, iValue, qValue;
++ for (; number < quarterPoints; number++) {
+
+- cplxValue1 = _mm_load_ps((float*)aPtr);
+- aPtr += 2;
++ cplxValue1 = _mm_load_ps((float*)aPtr);
++ aPtr += 2;
+
+- cplxValue2 = _mm_load_ps((float*)aPtr);
+- aPtr += 2;
++ cplxValue2 = _mm_load_ps((float*)aPtr);
++ aPtr += 2;
+
+- // Convert to polar coordinates
++ // Convert to polar coordinates
+
+- // Arrange in i1i2i3i4 format
+- iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0));
+- // Arrange in q1q2q3q4 format
+- qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1));
++ // Arrange in i1i2i3i4 format
++ iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2, 0, 2, 0));
++ // Arrange in q1q2q3q4 format
++ qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3, 1, 3, 1));
+
+- phase = atan2f4(qValue, iValue); // Calculate the Phase
++ phase = atan2f4(qValue, iValue); // Calculate the Phase
+
+- magnitude = _mm_sqrt_ps(_mm_add_ps(_mm_mul_ps(iValue, iValue), _mm_mul_ps(qValue, qValue))); // Calculate the magnitude by square rooting the added I2 and Q2 values
++ magnitude = _mm_sqrt_ps(
++ _mm_add_ps(_mm_mul_ps(iValue, iValue),
++ _mm_mul_ps(qValue, qValue))); // Calculate the magnitude by square
++ // rooting the added I2 and Q2 values
+
+- // Now calculate the power of the polar coordinate data
+- magnitude = powf4(magnitude, vPower); // Take the magnitude to the specified power
++ // Now calculate the power of the polar coordinate data
++ magnitude = powf4(magnitude, vPower); // Take the magnitude to the specified power
+
+- phase = _mm_mul_ps(phase, vPower); // Multiply the phase by the specified power
++ phase = _mm_mul_ps(phase, vPower); // Multiply the phase by the specified power
+
+- // Convert back to cartesian coordinates
+- iValue = _mm_mul_ps( cosf4(phase), magnitude); // Multiply the cos of the phase by the magnitude
+- qValue = _mm_mul_ps( sinf4(phase), magnitude); // Multiply the sin of the phase by the magnitude
++ // Convert back to cartesian coordinates
++ iValue = _mm_mul_ps(cosf4(phase),
++ magnitude); // Multiply the cos of the phase by the magnitude
++ qValue = _mm_mul_ps(sinf4(phase),
++ magnitude); // Multiply the sin of the phase by the magnitude
+
+- cplxValue1 = _mm_unpacklo_ps(iValue, qValue); // Interleave the lower two i & q values
+- cplxValue2 = _mm_unpackhi_ps(iValue, qValue); // Interleave the upper two i & q values
++ cplxValue1 =
++ _mm_unpacklo_ps(iValue, qValue); // Interleave the lower two i & q values
++ cplxValue2 =
++ _mm_unpackhi_ps(iValue, qValue); // Interleave the upper two i & q values
+
+- _mm_store_ps((float*)cPtr,cplxValue1); // Store the results back into the C container
++ _mm_store_ps((float*)cPtr,
++ cplxValue1); // Store the results back into the C container
+
+- cPtr += 2;
++ cPtr += 2;
+
+- _mm_store_ps((float*)cPtr,cplxValue2); // Store the results back into the C container
++ _mm_store_ps((float*)cPtr,
++ cplxValue2); // Store the results back into the C container
+
+- cPtr += 2;
+- }
++ cPtr += 2;
++ }
+
+- number = quarterPoints * 4;
++ number = quarterPoints * 4;
+ #endif /* LV_HAVE_LIB_SIMDMATH */
+
+- for(;number < num_points; number++){
+- *cPtr++ = __volk_s32fc_s32f_power_s32fc_a((*aPtr++), power);
+- }
++ for (; number < num_points; number++) {
++ *cPtr++ = __volk_s32fc_s32f_power_s32fc_a((*aPtr++), power);
++ }
+ }
+ #endif /* LV_HAVE_SSE */
+
+
+ #ifdef LV_HAVE_GENERIC
+
+-static inline void
+-volk_32fc_s32f_power_32fc_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector,
+- const float power, unsigned int num_points)
++static inline void volk_32fc_s32f_power_32fc_generic(lv_32fc_t* cVector,
++ const lv_32fc_t* aVector,
++ const float power,
++ unsigned int num_points)
+ {
+- lv_32fc_t* cPtr = cVector;
+- const lv_32fc_t* aPtr = aVector;
+- unsigned int number = 0;
++ lv_32fc_t* cPtr = cVector;
++ const lv_32fc_t* aPtr = aVector;
++ unsigned int number = 0;
+
+- for(number = 0; number < num_points; number++){
+- *cPtr++ = __volk_s32fc_s32f_power_s32fc_a((*aPtr++), power);
+- }
++ for (number = 0; number < num_points; number++) {
++ *cPtr++ = __volk_s32fc_s32f_power_s32fc_a((*aPtr++), power);
++ }
+ }
+
+ #endif /* LV_HAVE_GENERIC */
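+
+/* The scalar helper __volk_s32fc_s32f_power_s32fc_a above works in polar form:
+ * |z|^p via powf on the squared magnitude, and p times the angle via atan2f.
+ * Note it passes atan2f(real, imag) (swapped relative to the usual
+ * atan2f(imag, real)) and negates the cosine, so it is not a drop-in cpowf.
+ * For comparison, a textbook polar-form sketch of z^p = |z|^p * exp(j*p*arg(z))
+ * (cpow_polar is a hypothetical helper using C99 <complex.h> instead of
+ * lv_32fc_t): */
+#include <complex.h>
+#include <math.h>
+
+static inline float complex cpow_polar(float complex z, float p)
+{
+    const float mag = powf(crealf(z) * crealf(z) + cimagf(z) * cimagf(z), p / 2.0f);
+    const float arg = p * atan2f(cimagf(z), crealf(z));
+    return mag * (cosf(arg) + I * sinf(arg));
+}
+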
+diff --git a/kernels/volk/volk_32fc_s32f_power_spectrum_32f.h b/kernels/volk/volk_32fc_s32f_power_spectrum_32f.h
+index abe4662..a1a036d 100644
+--- a/kernels/volk/volk_32fc_s32f_power_spectrum_32f.h
++++ b/kernels/volk/volk_32fc_s32f_power_spectrum_32f.h
+@@ -29,13 +29,13 @@
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+- * void volk_32fc_s32f_power_spectrum_32f(float* logPowerOutput, const lv_32fc_t* complexFFTInput, const float normalizationFactor, unsigned int num_points)
+- * \endcode
++ * void volk_32fc_s32f_power_spectrum_32f(float* logPowerOutput, const lv_32fc_t*
++ * complexFFTInput, const float normalizationFactor, unsigned int num_points) \endcode
+ *
+ * \b Inputs
+ * \li complexFFTInput The complex data output from the FFT point.
+- * \li normalizationFactor: This value is divided against all the input values before the power is calculated.
+- * \li num_points: The number of fft data points.
++ * \li normalizationFactor: This value is divided against all the input values before the
++ * power is calculated. \li num_points: The number of fft data points.
+ *
+ * \b Outputs
+ * \li logPowerOutput: The 10.0 * log10(r*r + i*i) for each data point.
+@@ -54,8 +54,8 @@
+ #define INCLUDED_volk_32fc_s32f_power_spectrum_32f_a_H
+
+ #include <inttypes.h>
+-#include <stdio.h>
+ #include <math.h>
++#include <stdio.h>
+
+ #ifdef LV_HAVE_SSE3
+ #include <pmmintrin.h>
+@@ -65,74 +65,75 @@
+ #endif /* LV_HAVE_LIB_SIMDMATH */
+
+ static inline void
+-volk_32fc_s32f_power_spectrum_32f_a_sse3(float* logPowerOutput, const lv_32fc_t* complexFFTInput,
+- const float normalizationFactor, unsigned int num_points)
++volk_32fc_s32f_power_spectrum_32f_a_sse3(float* logPowerOutput,
++ const lv_32fc_t* complexFFTInput,
++ const float normalizationFactor,
++ unsigned int num_points)
+ {
+- const float* inputPtr = (const float*)complexFFTInput;
+- float* destPtr = logPowerOutput;
+- uint64_t number = 0;
+- const float iNormalizationFactor = 1.0 / normalizationFactor;
++ const float* inputPtr = (const float*)complexFFTInput;
++ float* destPtr = logPowerOutput;
++ uint64_t number = 0;
++ const float iNormalizationFactor = 1.0 / normalizationFactor;
+ #ifdef LV_HAVE_LIB_SIMDMATH
+- __m128 magScalar = _mm_set_ps1(10.0);
+- magScalar = _mm_div_ps(magScalar, logf4(magScalar));
++ __m128 magScalar = _mm_set_ps1(10.0);
++ magScalar = _mm_div_ps(magScalar, logf4(magScalar));
+
+- __m128 invNormalizationFactor = _mm_set_ps1(iNormalizationFactor);
++ __m128 invNormalizationFactor = _mm_set_ps1(iNormalizationFactor);
+
+- __m128 power;
+- __m128 input1, input2;
+- const uint64_t quarterPoints = num_points / 4;
+- for(;number < quarterPoints; number++){
+- // Load the complex values
+- input1 =_mm_load_ps(inputPtr);
+- inputPtr += 4;
+- input2 =_mm_load_ps(inputPtr);
+- inputPtr += 4;
++ __m128 power;
++ __m128 input1, input2;
++ const uint64_t quarterPoints = num_points / 4;
++ for (; number < quarterPoints; number++) {
++ // Load the complex values
++ input1 = _mm_load_ps(inputPtr);
++ inputPtr += 4;
++ input2 = _mm_load_ps(inputPtr);
++ inputPtr += 4;
+
+- // Apply the normalization factor
+- input1 = _mm_mul_ps(input1, invNormalizationFactor);
+- input2 = _mm_mul_ps(input2, invNormalizationFactor);
++ // Apply the normalization factor
++ input1 = _mm_mul_ps(input1, invNormalizationFactor);
++ input2 = _mm_mul_ps(input2, invNormalizationFactor);
+
+- // Multiply each value by itself
+- // (r1*r1), (i1*i1), (r2*r2), (i2*i2)
+- input1 = _mm_mul_ps(input1, input1);
+- // (r3*r3), (i3*i3), (r4*r4), (i4*i4)
+- input2 = _mm_mul_ps(input2, input2);
++ // Multiply each value by itself
++ // (r1*r1), (i1*i1), (r2*r2), (i2*i2)
++ input1 = _mm_mul_ps(input1, input1);
++ // (r3*r3), (i3*i3), (r4*r4), (i4*i4)
++ input2 = _mm_mul_ps(input2, input2);
+
+- // Horizontal add, to add (r*r) + (i*i) for each complex value
+- // (r1*r1)+(i1*i1), (r2*r2) + (i2*i2), (r3*r3)+(i3*i3), (r4*r4)+(i4*i4)
+- power = _mm_hadd_ps(input1, input2);
++ // Horizontal add, to add (r*r) + (i*i) for each complex value
++ // (r1*r1)+(i1*i1), (r2*r2) + (i2*i2), (r3*r3)+(i3*i3), (r4*r4)+(i4*i4)
++ power = _mm_hadd_ps(input1, input2);
+
+- // Calculate the natural log power
+- power = logf4(power);
++ // Calculate the natural log power
++ power = logf4(power);
+
+- // Convert to log10 and multiply by 10.0
+- power = _mm_mul_ps(power, magScalar);
++ // Convert to log10 and multiply by 10.0
++ power = _mm_mul_ps(power, magScalar);
+
+- // Store the floating point results
+- _mm_store_ps(destPtr, power);
++ // Store the floating point results
++ _mm_store_ps(destPtr, power);
+
+- destPtr += 4;
+- }
++ destPtr += 4;
++ }
+
+- number = quarterPoints*4;
++ number = quarterPoints * 4;
+ #endif /* LV_HAVE_LIB_SIMDMATH */
+- // Calculate the FFT for any remaining points
+-
+- for(; number < num_points; number++){
+- // Calculate dBm
+- // 50 ohm load assumption
+- // 10 * log10 (v^2 / (2 * 50.0 * .001)) = 10 * log10( v^2 * 10)
+- // 75 ohm load assumption
+- // 10 * log10 (v^2 / (2 * 75.0 * .001)) = 10 * log10( v^2 * 15)
++ // Calculate the FFT for any remaining points
+
+- const float real = *inputPtr++ * iNormalizationFactor;
+- const float imag = *inputPtr++ * iNormalizationFactor;
++ for (; number < num_points; number++) {
++ // Calculate dBm
++ // 50 ohm load assumption
++ // 10 * log10 (v^2 / (2 * 50.0 * .001)) = 10 * log10( v^2 * 10)
++ // 75 ohm load assumption
++ // 10 * log10 (v^2 / (2 * 75.0 * .001)) = 10 * log10( v^2 * 15)
+
+- *destPtr = 10.0*log10f(((real * real) + (imag * imag)) + 1e-20);
++ const float real = *inputPtr++ * iNormalizationFactor;
++ const float imag = *inputPtr++ * iNormalizationFactor;
+
+- destPtr++;
+- }
++ *destPtr = 10.0 * log10f(((real * real) + (imag * imag)) + 1e-20);
+
++ destPtr++;
++ }
+ }
+ #endif /* LV_HAVE_SSE3 */
+
+@@ -141,7 +142,10 @@ volk_32fc_s32f_power_spectrum_32f_a_sse3(float* logPowerOutput, const lv_32fc_t*
+ #include <volk/volk_neon_intrinsics.h>
+
+ static inline void
+-volk_32fc_s32f_power_spectrum_32f_neon(float* logPowerOutput, const lv_32fc_t* complexFFTInput, const float normalizationFactor, unsigned int num_points)
++volk_32fc_s32f_power_spectrum_32f_neon(float* logPowerOutput,
++ const lv_32fc_t* complexFFTInput,
++ const float normalizationFactor,
++ unsigned int num_points)
+ {
+ float* logPowerOutputPtr = logPowerOutput;
+ const lv_32fc_t* complexFFTInputPtr = complexFFTInput;
+@@ -151,14 +155,14 @@ volk_32fc_s32f_power_spectrum_32f_neon(float* logPowerOutput, const lv_32fc_t* c
+ float32x4x2_t fft_vec;
+ float32x4_t log_pwr_vec;
+ float32x4_t mag_squared_vec;
+-
++
+ const float inv_ln10_10 = 4.34294481903f; // 10.0/ln(10.)
+-
+- for(number = 0; number < quarter_points; number++) {
++
++ for (number = 0; number < quarter_points; number++) {
+ // Load
+ fft_vec = vld2q_f32((float*)complexFFTInputPtr);
+ // Prefetch next 4
+- __VOLK_PREFETCH(complexFFTInputPtr+4);
++ __VOLK_PREFETCH(complexFFTInputPtr + 4);
+ // Normalize
+ fft_vec.val[0] = vmulq_n_f32(fft_vec.val[0], iNormalizationFactor);
+ fft_vec.val[1] = vmulq_n_f32(fft_vec.val[1], iNormalizationFactor);
+@@ -167,12 +171,12 @@ volk_32fc_s32f_power_spectrum_32f_neon(float* logPowerOutput, const lv_32fc_t* c
+ // Store
+ vst1q_f32(logPowerOutputPtr, log_pwr_vec);
+ // Move pointers ahead
+- complexFFTInputPtr+=4;
+- logPowerOutputPtr+=4;
++ complexFFTInputPtr += 4;
++ logPowerOutputPtr += 4;
+ }
+-
++
+ // deal with the rest
+- for(number = quarter_points * 4; number < num_points; number++) {
++ for (number = quarter_points * 4; number < num_points; number++) {
+ const float real = lv_creal(*complexFFTInputPtr) * iNormalizationFactor;
+ const float imag = lv_cimag(*complexFFTInputPtr) * iNormalizationFactor;
+ *logPowerOutputPtr = 10.0 * log10f(((real * real) + (imag * imag)) + 1e-20);
+@@ -186,27 +190,29 @@ volk_32fc_s32f_power_spectrum_32f_neon(float* logPowerOutput, const lv_32fc_t* c
+ #ifdef LV_HAVE_GENERIC
+
+ static inline void
+-volk_32fc_s32f_power_spectrum_32f_generic(float* logPowerOutput, const lv_32fc_t* complexFFTInput,
+- const float normalizationFactor, unsigned int num_points)
++volk_32fc_s32f_power_spectrum_32f_generic(float* logPowerOutput,
++ const lv_32fc_t* complexFFTInput,
++ const float normalizationFactor,
++ unsigned int num_points)
+ {
+- // Calculate the Power of the complex point
+- const float* inputPtr = (float*)complexFFTInput;
+- float* realFFTDataPointsPtr = logPowerOutput;
+- const float iNormalizationFactor = 1.0 / normalizationFactor;
+- unsigned int point;
+- for(point = 0; point < num_points; point++){
+- // Calculate dBm
+- // 50 ohm load assumption
+- // 10 * log10 (v^2 / (2 * 50.0 * .001)) = 10 * log10( v^2 * 10)
+- // 75 ohm load assumption
+- // 10 * log10 (v^2 / (2 * 75.0 * .001)) = 10 * log10( v^2 * 15)
+-
+- const float real = *inputPtr++ * iNormalizationFactor;
+- const float imag = *inputPtr++ * iNormalizationFactor;
+-
+- *realFFTDataPointsPtr = 10.0*log10f(((real * real) + (imag * imag)) + 1e-20);
+- realFFTDataPointsPtr++;
+- }
++ // Calculate the Power of the complex point
++ const float* inputPtr = (float*)complexFFTInput;
++ float* realFFTDataPointsPtr = logPowerOutput;
++ const float iNormalizationFactor = 1.0 / normalizationFactor;
++ unsigned int point;
++ for (point = 0; point < num_points; point++) {
++ // Calculate dBm
++ // 50 ohm load assumption
++ // 10 * log10 (v^2 / (2 * 50.0 * .001)) = 10 * log10( v^2 * 10)
++ // 75 ohm load assumption
++ // 10 * log10 (v^2 / (2 * 75.0 * .001)) = 10 * log10( v^2 * 15)
++
++ const float real = *inputPtr++ * iNormalizationFactor;
++ const float imag = *inputPtr++ * iNormalizationFactor;
++
++ *realFFTDataPointsPtr = 10.0 * log10f(((real * real) + (imag * imag)) + 1e-20);
++ realFFTDataPointsPtr++;
++ }
+ }
+ #endif /* LV_HAVE_GENERIC */
+
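+/* The SSE3 branch above takes a natural log and scales by 10/ln(10) to get
+ * decibels, and the NEON branch carries the same constant (4.34294481903f);
+ * the scalar tails add 1e-20 inside the log so a zero bin does not produce
+ * -inf. The same computation in scalar form (power_db is a hypothetical
+ * helper): */
+#include <math.h>
+
+static inline float power_db(float re, float im, float inv_norm)
+{
+    const float scale = 10.0f / logf(10.0f); /* 10*log10(x) == scale * ln(x) */
+    re *= inv_norm;
+    im *= inv_norm;
+    return scale * logf(re * re + im * im + 1e-20f);
+}
+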
+diff --git a/kernels/volk/volk_32fc_s32f_x2_power_spectral_density_32f.h b/kernels/volk/volk_32fc_s32f_x2_power_spectral_density_32f.h
+index 3260b08..37ca43c 100644
+--- a/kernels/volk/volk_32fc_s32f_x2_power_spectral_density_32f.h
++++ b/kernels/volk/volk_32fc_s32f_x2_power_spectral_density_32f.h
+@@ -29,14 +29,15 @@
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+- * void volk_32fc_s32f_x2_power_spectral_density_32f(float* logPowerOutput, const lv_32fc_t* complexFFTInput, const float normalizationFactor, const float rbw, unsigned int num_points)
+- * \endcode
++ * void volk_32fc_s32f_x2_power_spectral_density_32f(float* logPowerOutput, const
++ * lv_32fc_t* complexFFTInput, const float normalizationFactor, const float rbw, unsigned
++ * int num_points) \endcode
+ *
+ * \b Inputs
+ * \li complexFFTInput The complex data output from the FFT point.
+- * \li normalizationFactor: This value is divided against all the input values before the power is calculated.
+- * \li rbw: The resolution bandwidth of the fft spectrum
+- * \li num_points: The number of fft data points.
++ * \li normalizationFactor: This value is divided against all the input values before the
++ * power is calculated. \li rbw: The resolution bandwidth of the fft spectrum \li
++ * num_points: The number of fft data points.
+ *
+ * \b Outputs
+ * \li logPowerOutput: The 10.0 * log10((r*r + i*i)/RBW) for each data point.
+@@ -55,8 +56,8 @@
+ #define INCLUDED_volk_32fc_s32f_x2_power_spectral_density_32f_a_H
+
+ #include <inttypes.h>
+-#include <stdio.h>
+ #include <math.h>
++#include <stdio.h>
+
+ #ifdef LV_HAVE_AVX
+ #include <immintrin.h>
+@@ -66,83 +67,84 @@
+ #endif /* LV_HAVE_LIB_SIMDMATH */
+
+ static inline void
+-volk_32fc_s32f_x2_power_spectral_density_32f_a_avx(float* logPowerOutput, const lv_32fc_t* complexFFTInput,
+- const float normalizationFactor, const float rbw,
++volk_32fc_s32f_x2_power_spectral_density_32f_a_avx(float* logPowerOutput,
++ const lv_32fc_t* complexFFTInput,
++ const float normalizationFactor,
++ const float rbw,
+ unsigned int num_points)
+ {
+- const float* inputPtr = (const float*)complexFFTInput;
+- float* destPtr = logPowerOutput;
+- uint64_t number = 0;
+- const float iRBW = 1.0 / rbw;
+- const float iNormalizationFactor = 1.0 / normalizationFactor;
++ const float* inputPtr = (const float*)complexFFTInput;
++ float* destPtr = logPowerOutput;
++ uint64_t number = 0;
++ const float iRBW = 1.0 / rbw;
++ const float iNormalizationFactor = 1.0 / normalizationFactor;
+
+ #ifdef LV_HAVE_LIB_SIMDMATH
+- __m256 magScalar = _mm256_set1_ps(10.0);
+- magScalar = _mm256_div_ps(magScalar, logf4(magScalar));
++ __m256 magScalar = _mm256_set1_ps(10.0);
++ magScalar = _mm256_div_ps(magScalar, logf4(magScalar));
+
+- __m256 invRBW = _mm256_set1_ps(iRBW);
++ __m256 invRBW = _mm256_set1_ps(iRBW);
+
+- __m256 invNormalizationFactor = _mm256_set1_ps(iNormalizationFactor);
++ __m256 invNormalizationFactor = _mm256_set1_ps(iNormalizationFactor);
+
+- __m256 power;
+- __m256 input1, input2;
+- const uint64_t eighthPoints = num_points / 8;
+- for(;number < eighthPoints; number++){
+- // Load the complex values
+- input1 =_mm256_load_ps(inputPtr);
+- inputPtr += 8;
+- input2 =_mm256_load_ps(inputPtr);
+- inputPtr += 8;
++ __m256 power;
++    __m256 input1, input2, inputVal1, inputVal2; /* inputVal1/2 feed the hadd below */
++ const uint64_t eighthPoints = num_points / 8;
++ for (; number < eighthPoints; number++) {
++ // Load the complex values
++ input1 = _mm256_load_ps(inputPtr);
++ inputPtr += 8;
++ input2 = _mm256_load_ps(inputPtr);
++ inputPtr += 8;
+
+- // Apply the normalization factor
+- input1 = _mm256_mul_ps(input1, invNormalizationFactor);
+- input2 = _mm256_mul_ps(input2, invNormalizationFactor);
++ // Apply the normalization factor
++ input1 = _mm256_mul_ps(input1, invNormalizationFactor);
++ input2 = _mm256_mul_ps(input2, invNormalizationFactor);
+
+- // Multiply each value by itself
+- // (r1*r1), (i1*i1), (r2*r2), (i2*i2)
+- input1 = _mm256_mul_ps(input1, input1);
+- // (r3*r3), (i3*i3), (r4*r4), (i4*i4)
+- input2 = _mm256_mul_ps(input2, input2);
++ // Multiply each value by itself
++ // (r1*r1), (i1*i1), (r2*r2), (i2*i2)
++ input1 = _mm256_mul_ps(input1, input1);
++ // (r3*r3), (i3*i3), (r4*r4), (i4*i4)
++ input2 = _mm256_mul_ps(input2, input2);
+
+- // Horizontal add, to add (r*r) + (i*i) for each complex value
+- // (r1*r1)+(i1*i1), (r2*r2) + (i2*i2), (r3*r3)+(i3*i3), (r4*r4)+(i4*i4)
+- inputVal1 = _mm256_permute2f128_ps(input1, input2, 0x20);
+- inputVal2 = _mm256_permute2f128_ps(input1, input2, 0x31);
++ // Horizontal add, to add (r*r) + (i*i) for each complex value
++ // (r1*r1)+(i1*i1), (r2*r2) + (i2*i2), (r3*r3)+(i3*i3), (r4*r4)+(i4*i4)
++ inputVal1 = _mm256_permute2f128_ps(input1, input2, 0x20);
++ inputVal2 = _mm256_permute2f128_ps(input1, input2, 0x31);
+
+- power = _mm256_hadd_ps(inputVal1, inputVal2);
++ power = _mm256_hadd_ps(inputVal1, inputVal2);
+
+- // Divide by the rbw
+- power = _mm256_mul_ps(power, invRBW);
++ // Divide by the rbw
++ power = _mm256_mul_ps(power, invRBW);
+
+- // Calculate the natural log power
+- power = logf4(power);
++ // Calculate the natural log power
++ power = logf4(power);
+
+- // Convert to log10 and multiply by 10.0
+- power = _mm256_mul_ps(power, magScalar);
++ // Convert to log10 and multiply by 10.0
++ power = _mm256_mul_ps(power, magScalar);
+
+- // Store the floating point results
+- _mm256_store_ps(destPtr, power);
++ // Store the floating point results
++ _mm256_store_ps(destPtr, power);
+
+- destPtr += 8;
+- }
++ destPtr += 8;
++ }
+
+- number = eighthPoints*8;
++ number = eighthPoints * 8;
+ #endif /* LV_HAVE_LIB_SIMDMATH */
+- // Calculate the FFT for any remaining points
+- for(; number < num_points; number++){
+- // Calculate dBm
+- // 50 ohm load assumption
+- // 10 * log10 (v^2 / (2 * 50.0 * .001)) = 10 * log10( v^2 * 10)
+- // 75 ohm load assumption
+- // 10 * log10 (v^2 / (2 * 75.0 * .001)) = 10 * log10( v^2 * 15)
+-
+- const float real = *inputPtr++ * iNormalizationFactor;
+- const float imag = *inputPtr++ * iNormalizationFactor;
+-
+- *destPtr = 10.0*log10f((((real * real) + (imag * imag)) + 1e-20) * iRBW);
+- destPtr++;
+- }
+-
++ // Calculate the FFT for any remaining points
++ for (; number < num_points; number++) {
++ // Calculate dBm
++ // 50 ohm load assumption
++ // 10 * log10 (v^2 / (2 * 50.0 * .001)) = 10 * log10( v^2 * 10)
++ // 75 ohm load assumption
++ // 10 * log10 (v^2 / (2 * 75.0 * .001)) = 10 * log10( v^2 * 15)
++
++ const float real = *inputPtr++ * iNormalizationFactor;
++ const float imag = *inputPtr++ * iNormalizationFactor;
++
++ *destPtr = 10.0 * log10f((((real * real) + (imag * imag)) + 1e-20) * iRBW);
++ destPtr++;
++ }
+ }
+ #endif /* LV_HAVE_AVX */
+
+@@ -150,86 +152,86 @@ volk_32fc_s32f_x2_power_spectral_density_32f_a_avx(float* logPowerOutput, const
+ #include <pmmintrin.h>
+
+
+-
+ #ifdef LV_HAVE_LIB_SIMDMATH
+ #include <simdmath.h>
+ #endif /* LV_HAVE_LIB_SIMDMATH */
+
+ static inline void
+-volk_32fc_s32f_x2_power_spectral_density_32f_a_sse3(float* logPowerOutput, const lv_32fc_t* complexFFTInput,
+- const float normalizationFactor, const float rbw,
++volk_32fc_s32f_x2_power_spectral_density_32f_a_sse3(float* logPowerOutput,
++ const lv_32fc_t* complexFFTInput,
++ const float normalizationFactor,
++ const float rbw,
+ unsigned int num_points)
+ {
+- const float* inputPtr = (const float*)complexFFTInput;
+- float* destPtr = logPowerOutput;
+- uint64_t number = 0;
+- const float iRBW = 1.0 / rbw;
+- const float iNormalizationFactor = 1.0 / normalizationFactor;
++ const float* inputPtr = (const float*)complexFFTInput;
++ float* destPtr = logPowerOutput;
++ uint64_t number = 0;
++ const float iRBW = 1.0 / rbw;
++ const float iNormalizationFactor = 1.0 / normalizationFactor;
+
+ #ifdef LV_HAVE_LIB_SIMDMATH
+- __m128 magScalar = _mm_set_ps1(10.0);
+- magScalar = _mm_div_ps(magScalar, logf4(magScalar));
++ __m128 magScalar = _mm_set_ps1(10.0);
++ magScalar = _mm_div_ps(magScalar, logf4(magScalar));
+
+- __m128 invRBW = _mm_set_ps1(iRBW);
++ __m128 invRBW = _mm_set_ps1(iRBW);
+
+- __m128 invNormalizationFactor = _mm_set_ps1(iNormalizationFactor);
++ __m128 invNormalizationFactor = _mm_set_ps1(iNormalizationFactor);
+
+- __m128 power;
+- __m128 input1, input2;
+- const uint64_t quarterPoints = num_points / 4;
+- for(;number < quarterPoints; number++){
+- // Load the complex values
+- input1 =_mm_load_ps(inputPtr);
+- inputPtr += 4;
+- input2 =_mm_load_ps(inputPtr);
+- inputPtr += 4;
++ __m128 power;
++ __m128 input1, input2;
++ const uint64_t quarterPoints = num_points / 4;
++ for (; number < quarterPoints; number++) {
++ // Load the complex values
++ input1 = _mm_load_ps(inputPtr);
++ inputPtr += 4;
++ input2 = _mm_load_ps(inputPtr);
++ inputPtr += 4;
+
+- // Apply the normalization factor
+- input1 = _mm_mul_ps(input1, invNormalizationFactor);
+- input2 = _mm_mul_ps(input2, invNormalizationFactor);
++ // Apply the normalization factor
++ input1 = _mm_mul_ps(input1, invNormalizationFactor);
++ input2 = _mm_mul_ps(input2, invNormalizationFactor);
+
+- // Multiply each value by itself
+- // (r1*r1), (i1*i1), (r2*r2), (i2*i2)
+- input1 = _mm_mul_ps(input1, input1);
+- // (r3*r3), (i3*i3), (r4*r4), (i4*i4)
+- input2 = _mm_mul_ps(input2, input2);
++ // Multiply each value by itself
++ // (r1*r1), (i1*i1), (r2*r2), (i2*i2)
++ input1 = _mm_mul_ps(input1, input1);
++ // (r3*r3), (i3*i3), (r4*r4), (i4*i4)
++ input2 = _mm_mul_ps(input2, input2);
+
+- // Horizontal add, to add (r*r) + (i*i) for each complex value
+- // (r1*r1)+(i1*i1), (r2*r2) + (i2*i2), (r3*r3)+(i3*i3), (r4*r4)+(i4*i4)
+- power = _mm_hadd_ps(input1, input2);
++ // Horizontal add, to add (r*r) + (i*i) for each complex value
++ // (r1*r1)+(i1*i1), (r2*r2) + (i2*i2), (r3*r3)+(i3*i3), (r4*r4)+(i4*i4)
++ power = _mm_hadd_ps(input1, input2);
+
+- // Divide by the rbw
+- power = _mm_mul_ps(power, invRBW);
++ // Divide by the rbw
++ power = _mm_mul_ps(power, invRBW);
+
+- // Calculate the natural log power
+- power = logf4(power);
++ // Calculate the natural log power
++ power = logf4(power);
+
+- // Convert to log10 and multiply by 10.0
+- power = _mm_mul_ps(power, magScalar);
++ // Convert to log10 and multiply by 10.0
++ power = _mm_mul_ps(power, magScalar);
+
+- // Store the floating point results
+- _mm_store_ps(destPtr, power);
++ // Store the floating point results
++ _mm_store_ps(destPtr, power);
+
+- destPtr += 4;
+- }
++ destPtr += 4;
++ }
+
+- number = quarterPoints*4;
++ number = quarterPoints * 4;
+ #endif /* LV_HAVE_LIB_SIMDMATH */
+- // Calculate the FFT for any remaining points
+- for(; number < num_points; number++){
+- // Calculate dBm
+- // 50 ohm load assumption
+- // 10 * log10 (v^2 / (2 * 50.0 * .001)) = 10 * log10( v^2 * 10)
+- // 75 ohm load assumption
+- // 10 * log10 (v^2 / (2 * 75.0 * .001)) = 10 * log10( v^2 * 15)
+-
+- const float real = *inputPtr++ * iNormalizationFactor;
+- const float imag = *inputPtr++ * iNormalizationFactor;
+-
+- *destPtr = 10.0*log10f((((real * real) + (imag * imag)) + 1e-20) * iRBW);
+- destPtr++;
+- }
+-
++ // Calculate the FFT for any remaining points
++ for (; number < num_points; number++) {
++ // Calculate dBm
++ // 50 ohm load assumption
++ // 10 * log10 (v^2 / (2 * 50.0 * .001)) = 10 * log10( v^2 * 10)
++ // 75 ohm load assumption
++ // 10 * log10 (v^2 / (2 * 75.0 * .001)) = 10 * log10( v^2 * 15)
++
++ const float real = *inputPtr++ * iNormalizationFactor;
++ const float imag = *inputPtr++ * iNormalizationFactor;
++
++ *destPtr = 10.0 * log10f((((real * real) + (imag * imag)) + 1e-20) * iRBW);
++ destPtr++;
++ }
+ }
+ #endif /* LV_HAVE_SSE3 */
+
+@@ -237,31 +239,34 @@ volk_32fc_s32f_x2_power_spectral_density_32f_a_sse3(float* logPowerOutput, const
+ #ifdef LV_HAVE_GENERIC
+
+ static inline void
+-volk_32fc_s32f_x2_power_spectral_density_32f_generic(float* logPowerOutput, const lv_32fc_t* complexFFTInput,
+- const float normalizationFactor, const float rbw,
++volk_32fc_s32f_x2_power_spectral_density_32f_generic(float* logPowerOutput,
++ const lv_32fc_t* complexFFTInput,
++ const float normalizationFactor,
++ const float rbw,
+ unsigned int num_points)
+ {
+- // Calculate the Power of the complex point
+- const float* inputPtr = (float*)complexFFTInput;
+- float* realFFTDataPointsPtr = logPowerOutput;
+- unsigned int point;
+- const float invRBW = 1.0 / rbw;
+- const float iNormalizationFactor = 1.0 / normalizationFactor;
+-
+- for(point = 0; point < num_points; point++){
+- // Calculate dBm
+- // 50 ohm load assumption
+- // 10 * log10 (v^2 / (2 * 50.0 * .001)) = 10 * log10( v^2 * 10)
+- // 75 ohm load assumption
+- // 10 * log10 (v^2 / (2 * 75.0 * .001)) = 10 * log10( v^2 * 15)
+-
+- const float real = *inputPtr++ * iNormalizationFactor;
+- const float imag = *inputPtr++ * iNormalizationFactor;
+-
+- *realFFTDataPointsPtr = 10.0*log10f((((real * real) + (imag * imag)) + 1e-20) * invRBW);
+-
+- realFFTDataPointsPtr++;
+- }
++ // Calculate the Power of the complex point
++ const float* inputPtr = (float*)complexFFTInput;
++ float* realFFTDataPointsPtr = logPowerOutput;
++ unsigned int point;
++ const float invRBW = 1.0 / rbw;
++ const float iNormalizationFactor = 1.0 / normalizationFactor;
++
++ for (point = 0; point < num_points; point++) {
++ // Calculate dBm
++ // 50 ohm load assumption
++ // 10 * log10 (v^2 / (2 * 50.0 * .001)) = 10 * log10( v^2 * 10)
++ // 75 ohm load assumption
++ // 10 * log10 (v^2 / (2 * 75.0 * .001)) = 10 * log10( v^2 * 15)
++
++ const float real = *inputPtr++ * iNormalizationFactor;
++ const float imag = *inputPtr++ * iNormalizationFactor;
++
++ *realFFTDataPointsPtr =
++ 10.0 * log10f((((real * real) + (imag * imag)) + 1e-20) * invRBW);
++
++ realFFTDataPointsPtr++;
++ }
+ }
+ #endif /* LV_HAVE_GENERIC */
+
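For reference, every variant of volk_32fc_s32f_x2_power_spectral_density_32f touched above computes the same per-bin quantity: normalize the complex FFT value, form |v|^2, scale by 1/rbw, and convert to dB, with the 1e-20 offset guarding against log10(0). A minimal standalone sketch of that math (plain interleaved floats stand in for lv_32fc_t, and the helper name is illustrative, not part of VOLK):

#include <math.h>

/* Scalar sketch of the dB power computation the kernels above vectorize. */
static void psd_db_scalar(float* out,
                          const float* in, /* interleaved re, im pairs */
                          float normalizationFactor,
                          float rbw,
                          unsigned int num_points)
{
    const float iNorm = 1.0f / normalizationFactor;
    const float iRBW = 1.0f / rbw;
    for (unsigned int n = 0; n < num_points; n++) {
        const float re = in[2 * n] * iNorm;
        const float im = in[2 * n + 1] * iNorm;
        /* 10 * log10(|v|^2 / rbw); 1e-20 avoids log10(0) for empty bins */
        out[n] = 10.0f * log10f((re * re + im * im + 1e-20f) * iRBW);
    }
}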
+diff --git a/kernels/volk/volk_32fc_s32fc_multiply_32fc.h b/kernels/volk/volk_32fc_s32fc_multiply_32fc.h
+index fe416b4..840008a 100644
+--- a/kernels/volk/volk_32fc_s32fc_multiply_32fc.h
++++ b/kernels/volk/volk_32fc_s32fc_multiply_32fc.h
+@@ -30,8 +30,8 @@
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+- * void volk_32fc_s32fc_multiply_32fc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points);
+- * \endcode
++ * void volk_32fc_s32fc_multiply_32fc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const
++ * lv_32fc_t scalar, unsigned int num_points); \endcode
+ *
+ * \b Inputs
+ * \li aVector: The input vector to be multiplied.
+@@ -76,15 +76,19 @@
+ #ifndef INCLUDED_volk_32fc_s32fc_multiply_32fc_u_H
+ #define INCLUDED_volk_32fc_s32fc_multiply_32fc_u_H
+
++#include <float.h>
+ #include <inttypes.h>
+ #include <stdio.h>
+ #include <volk/volk_complex.h>
+-#include <float.h>
+
+ #if LV_HAVE_AVX && LV_HAVE_FMA
+ #include <immintrin.h>
+
+-static inline void volk_32fc_s32fc_multiply_32fc_u_avx_fma(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){
++static inline void volk_32fc_s32fc_multiply_32fc_u_avx_fma(lv_32fc_t* cVector,
++ const lv_32fc_t* aVector,
++ const lv_32fc_t scalar,
++ unsigned int num_points)
++{
+ unsigned int number = 0;
+ unsigned int i = 0;
+ const unsigned int quarterPoints = num_points / 4;
+@@ -97,34 +101,38 @@ static inline void volk_32fc_s32fc_multiply_32fc_u_avx_fma(lv_32fc_t* cVector, c
+ yl = _mm256_set1_ps(lv_creal(scalar));
+ yh = _mm256_set1_ps(lv_cimag(scalar));
+
+- for(;number < quarterPoints; number++){
+- x = _mm256_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
++ for (; number < quarterPoints; number++) {
++ x = _mm256_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
+
+- tmp1 = x;
++ tmp1 = x;
+
+- x = _mm256_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br
++ x = _mm256_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br
+
+- tmp2 = _mm256_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
++ tmp2 = _mm256_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
+
+- z = _mm256_fmaddsub_ps(tmp1, yl,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
++ z = _mm256_fmaddsub_ps(
++ tmp1, yl, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
+
+- _mm256_storeu_ps((float*)c,z); // Store the results back into the C container
++ _mm256_storeu_ps((float*)c, z); // Store the results back into the C container
+
+- a += 4;
+- c += 4;
++ a += 4;
++ c += 4;
+ }
+
+- for(i = num_points-isodd; i < num_points; i++) {
++ for (i = num_points - isodd; i < num_points; i++) {
+ *c++ = (*a++) * scalar;
+ }
+-
+ }
+ #endif /* LV_HAVE_AVX && LV_HAVE_FMA */
+
+ #ifdef LV_HAVE_AVX
+ #include <immintrin.h>
+
+-static inline void volk_32fc_s32fc_multiply_32fc_u_avx(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){
++static inline void volk_32fc_s32fc_multiply_32fc_u_avx(lv_32fc_t* cVector,
++ const lv_32fc_t* aVector,
++ const lv_32fc_t scalar,
++ unsigned int num_points)
++{
+ unsigned int number = 0;
+ unsigned int i = 0;
+ const unsigned int quarterPoints = num_points / 4;
+@@ -137,35 +145,39 @@ static inline void volk_32fc_s32fc_multiply_32fc_u_avx(lv_32fc_t* cVector, const
+ yl = _mm256_set1_ps(lv_creal(scalar));
+ yh = _mm256_set1_ps(lv_cimag(scalar));
+
+- for(;number < quarterPoints; number++){
+- x = _mm256_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
++ for (; number < quarterPoints; number++) {
++ x = _mm256_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
+
+- tmp1 = _mm256_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
++ tmp1 = _mm256_mul_ps(x, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
+
+- x = _mm256_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br
++ x = _mm256_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br
+
+- tmp2 = _mm256_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
++ tmp2 = _mm256_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
+
+- z = _mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
++ z = _mm256_addsub_ps(tmp1,
++ tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
+
+- _mm256_storeu_ps((float*)c,z); // Store the results back into the C container
++ _mm256_storeu_ps((float*)c, z); // Store the results back into the C container
+
+- a += 4;
+- c += 4;
++ a += 4;
++ c += 4;
+ }
+
+- for(i = num_points-isodd; i < num_points; i++) {
++ for (i = num_points - isodd; i < num_points; i++) {
+ *c++ = (*a++) * scalar;
+ }
+-
+ }
+ #endif /* LV_HAVE_AVX */
+
+ #ifdef LV_HAVE_SSE3
+ #include <pmmintrin.h>
+
+-static inline void volk_32fc_s32fc_multiply_32fc_u_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){
+- unsigned int number = 0;
++static inline void volk_32fc_s32fc_multiply_32fc_u_sse3(lv_32fc_t* cVector,
++ const lv_32fc_t* aVector,
++ const lv_32fc_t scalar,
++ unsigned int num_points)
++{
++ unsigned int number = 0;
+ const unsigned int halfPoints = num_points / 2;
+
+ __m128 x, yl, yh, z, tmp1, tmp2;
+@@ -176,53 +188,58 @@ static inline void volk_32fc_s32fc_multiply_32fc_u_sse3(lv_32fc_t* cVector, cons
+ yl = _mm_set_ps1(lv_creal(scalar));
+ yh = _mm_set_ps1(lv_cimag(scalar));
+
+- for(;number < halfPoints; number++){
++ for (; number < halfPoints; number++) {
+
+- x = _mm_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
++ x = _mm_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
+
+- tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
++ tmp1 = _mm_mul_ps(x, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
+
+- x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br
++ x = _mm_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br
+
+- tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
++ tmp2 = _mm_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
+
+- z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
++ z = _mm_addsub_ps(tmp1,
++ tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
+
+- _mm_storeu_ps((float*)c,z); // Store the results back into the C container
++ _mm_storeu_ps((float*)c, z); // Store the results back into the C container
+
+- a += 2;
+- c += 2;
++ a += 2;
++ c += 2;
+ }
+
+- if((num_points % 2) != 0) {
+- *c = (*a) * scalar;
++ if ((num_points % 2) != 0) {
++ *c = (*a) * scalar;
+ }
+ }
+ #endif /* LV_HAVE_SSE */
+
+ #ifdef LV_HAVE_GENERIC
+
+-static inline void volk_32fc_s32fc_multiply_32fc_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){
++static inline void volk_32fc_s32fc_multiply_32fc_generic(lv_32fc_t* cVector,
++ const lv_32fc_t* aVector,
++ const lv_32fc_t scalar,
++ unsigned int num_points)
++{
+ lv_32fc_t* cPtr = cVector;
+ const lv_32fc_t* aPtr = aVector;
+ unsigned int number = num_points;
+
+ // unwrap loop
+- while (number >= 8){
+- *cPtr++ = (*aPtr++) * scalar;
+- *cPtr++ = (*aPtr++) * scalar;
+- *cPtr++ = (*aPtr++) * scalar;
+- *cPtr++ = (*aPtr++) * scalar;
+- *cPtr++ = (*aPtr++) * scalar;
+- *cPtr++ = (*aPtr++) * scalar;
+- *cPtr++ = (*aPtr++) * scalar;
+- *cPtr++ = (*aPtr++) * scalar;
+- number -= 8;
++ while (number >= 8) {
++ *cPtr++ = (*aPtr++) * scalar;
++ *cPtr++ = (*aPtr++) * scalar;
++ *cPtr++ = (*aPtr++) * scalar;
++ *cPtr++ = (*aPtr++) * scalar;
++ *cPtr++ = (*aPtr++) * scalar;
++ *cPtr++ = (*aPtr++) * scalar;
++ *cPtr++ = (*aPtr++) * scalar;
++ *cPtr++ = (*aPtr++) * scalar;
++ number -= 8;
+ }
+
+ // clean up any remaining
+ while (number-- > 0)
+- *cPtr++ = *aPtr++ * scalar;
++ *cPtr++ = *aPtr++ * scalar;
+ }
+ #endif /* LV_HAVE_GENERIC */
+
+@@ -231,15 +248,19 @@ static inline void volk_32fc_s32fc_multiply_32fc_generic(lv_32fc_t* cVector, con
+ #ifndef INCLUDED_volk_32fc_s32fc_multiply_32fc_a_H
+ #define INCLUDED_volk_32fc_s32fc_multiply_32fc_a_H
+
++#include <float.h>
+ #include <inttypes.h>
+ #include <stdio.h>
+ #include <volk/volk_complex.h>
+-#include <float.h>
+
+ #if LV_HAVE_AVX && LV_HAVE_FMA
+ #include <immintrin.h>
+
+-static inline void volk_32fc_s32fc_multiply_32fc_a_avx_fma(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){
++static inline void volk_32fc_s32fc_multiply_32fc_a_avx_fma(lv_32fc_t* cVector,
++ const lv_32fc_t* aVector,
++ const lv_32fc_t scalar,
++ unsigned int num_points)
++{
+ unsigned int number = 0;
+ unsigned int i = 0;
+ const unsigned int quarterPoints = num_points / 4;
+@@ -252,27 +273,27 @@ static inline void volk_32fc_s32fc_multiply_32fc_a_avx_fma(lv_32fc_t* cVector, c
+ yl = _mm256_set1_ps(lv_creal(scalar));
+ yh = _mm256_set1_ps(lv_cimag(scalar));
+
+- for(;number < quarterPoints; number++){
+- x = _mm256_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
++ for (; number < quarterPoints; number++) {
++ x = _mm256_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
+
+- tmp1 = x;
++ tmp1 = x;
+
+- x = _mm256_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br
++ x = _mm256_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br
+
+- tmp2 = _mm256_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
++ tmp2 = _mm256_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
+
+- z = _mm256_fmaddsub_ps(tmp1, yl,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
++ z = _mm256_fmaddsub_ps(
++ tmp1, yl, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
+
+- _mm256_store_ps((float*)c,z); // Store the results back into the C container
++ _mm256_store_ps((float*)c, z); // Store the results back into the C container
+
+- a += 4;
+- c += 4;
++ a += 4;
++ c += 4;
+ }
+
+- for(i = num_points-isodd; i < num_points; i++) {
++ for (i = num_points - isodd; i < num_points; i++) {
+ *c++ = (*a++) * scalar;
+ }
+-
+ }
+ #endif /* LV_HAVE_AVX && LV_HAVE_FMA */
+
+@@ -280,7 +301,11 @@ static inline void volk_32fc_s32fc_multiply_32fc_a_avx_fma(lv_32fc_t* cVector, c
+ #ifdef LV_HAVE_AVX
+ #include <immintrin.h>
+
+-static inline void volk_32fc_s32fc_multiply_32fc_a_avx(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){
++static inline void volk_32fc_s32fc_multiply_32fc_a_avx(lv_32fc_t* cVector,
++ const lv_32fc_t* aVector,
++ const lv_32fc_t scalar,
++ unsigned int num_points)
++{
+ unsigned int number = 0;
+ unsigned int i = 0;
+ const unsigned int quarterPoints = num_points / 4;
+@@ -293,35 +318,39 @@ static inline void volk_32fc_s32fc_multiply_32fc_a_avx(lv_32fc_t* cVector, const
+ yl = _mm256_set1_ps(lv_creal(scalar));
+ yh = _mm256_set1_ps(lv_cimag(scalar));
+
+- for(;number < quarterPoints; number++){
+- x = _mm256_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
++ for (; number < quarterPoints; number++) {
++ x = _mm256_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
+
+- tmp1 = _mm256_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
++ tmp1 = _mm256_mul_ps(x, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
+
+- x = _mm256_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br
++ x = _mm256_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br
+
+- tmp2 = _mm256_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
++ tmp2 = _mm256_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
+
+- z = _mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
++ z = _mm256_addsub_ps(tmp1,
++ tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
+
+- _mm256_store_ps((float*)c,z); // Store the results back into the C container
++ _mm256_store_ps((float*)c, z); // Store the results back into the C container
+
+- a += 4;
+- c += 4;
++ a += 4;
++ c += 4;
+ }
+
+- for(i = num_points-isodd; i < num_points; i++) {
++ for (i = num_points - isodd; i < num_points; i++) {
+ *c++ = (*a++) * scalar;
+ }
+-
+ }
+ #endif /* LV_HAVE_AVX */
+
+ #ifdef LV_HAVE_SSE3
+ #include <pmmintrin.h>
+
+-static inline void volk_32fc_s32fc_multiply_32fc_a_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){
+- unsigned int number = 0;
++static inline void volk_32fc_s32fc_multiply_32fc_a_sse3(lv_32fc_t* cVector,
++ const lv_32fc_t* aVector,
++ const lv_32fc_t scalar,
++ unsigned int num_points)
++{
++ unsigned int number = 0;
+ const unsigned int halfPoints = num_points / 2;
+
+ __m128 x, yl, yh, z, tmp1, tmp2;
+@@ -332,26 +361,27 @@ static inline void volk_32fc_s32fc_multiply_32fc_a_sse3(lv_32fc_t* cVector, cons
+ yl = _mm_set_ps1(lv_creal(scalar));
+ yh = _mm_set_ps1(lv_cimag(scalar));
+
+- for(;number < halfPoints; number++){
++ for (; number < halfPoints; number++) {
+
+- x = _mm_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
++ x = _mm_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
+
+- tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
++ tmp1 = _mm_mul_ps(x, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
+
+- x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br
++ x = _mm_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br
+
+- tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
++ tmp2 = _mm_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
+
+- z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
++ z = _mm_addsub_ps(tmp1,
++ tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
+
+- _mm_store_ps((float*)c,z); // Store the results back into the C container
++ _mm_store_ps((float*)c, z); // Store the results back into the C container
+
+- a += 2;
+- c += 2;
++ a += 2;
++ c += 2;
+ }
+
+- if((num_points % 2) != 0) {
+- *c = (*a) * scalar;
++ if ((num_points % 2) != 0) {
++ *c = (*a) * scalar;
+ }
+ }
+ #endif /* LV_HAVE_SSE */
+@@ -359,7 +389,11 @@ static inline void volk_32fc_s32fc_multiply_32fc_a_sse3(lv_32fc_t* cVector, cons
+ #ifdef LV_HAVE_NEON
+ #include <arm_neon.h>
+
+-static inline void volk_32fc_s32fc_multiply_32fc_neon(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){
++static inline void volk_32fc_s32fc_multiply_32fc_neon(lv_32fc_t* cVector,
++ const lv_32fc_t* aVector,
++ const lv_32fc_t scalar,
++ unsigned int num_points)
++{
+ lv_32fc_t* cPtr = cVector;
+ const lv_32fc_t* aPtr = aVector;
+ unsigned int number = num_points;
+@@ -370,7 +404,7 @@ static inline void volk_32fc_s32fc_multiply_32fc_neon(lv_32fc_t* cVector, const
+
+ scalar_val.val[0] = vld1q_dup_f32((const float*)&scalar);
+ scalar_val.val[1] = vld1q_dup_f32(((const float*)&scalar) + 1);
+- for(number = 0; number < quarter_points; ++number) {
++ for (number = 0; number < quarter_points; ++number) {
+ a_val = vld2q_f32((float*)aPtr);
+ tmp_imag.val[1] = vmulq_f32(a_val.val[1], scalar_val.val[0]);
+ tmp_imag.val[0] = vmulq_f32(a_val.val[0], scalar_val.val[0]);
+@@ -383,35 +417,39 @@ static inline void volk_32fc_s32fc_multiply_32fc_neon(lv_32fc_t* cVector, const
+ cPtr += 4;
+ }
+
+- for(number = quarter_points*4; number < num_points; number++){
+- *cPtr++ = *aPtr++ * scalar;
++ for (number = quarter_points * 4; number < num_points; number++) {
++ *cPtr++ = *aPtr++ * scalar;
+ }
+ }
+ #endif /* LV_HAVE_NEON */
+
+ #ifdef LV_HAVE_GENERIC
+
+-static inline void volk_32fc_s32fc_multiply_32fc_a_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){
++static inline void volk_32fc_s32fc_multiply_32fc_a_generic(lv_32fc_t* cVector,
++ const lv_32fc_t* aVector,
++ const lv_32fc_t scalar,
++ unsigned int num_points)
++{
+ lv_32fc_t* cPtr = cVector;
+ const lv_32fc_t* aPtr = aVector;
+ unsigned int number = num_points;
+
+ // unwrap loop
+- while (number >= 8){
+- *cPtr++ = (*aPtr++) * scalar;
+- *cPtr++ = (*aPtr++) * scalar;
+- *cPtr++ = (*aPtr++) * scalar;
+- *cPtr++ = (*aPtr++) * scalar;
+- *cPtr++ = (*aPtr++) * scalar;
+- *cPtr++ = (*aPtr++) * scalar;
+- *cPtr++ = (*aPtr++) * scalar;
+- *cPtr++ = (*aPtr++) * scalar;
+- number -= 8;
++ while (number >= 8) {
++ *cPtr++ = (*aPtr++) * scalar;
++ *cPtr++ = (*aPtr++) * scalar;
++ *cPtr++ = (*aPtr++) * scalar;
++ *cPtr++ = (*aPtr++) * scalar;
++ *cPtr++ = (*aPtr++) * scalar;
++ *cPtr++ = (*aPtr++) * scalar;
++ *cPtr++ = (*aPtr++) * scalar;
++ *cPtr++ = (*aPtr++) * scalar;
++ number -= 8;
+ }
+
+ // clean up any remaining
+ while (number-- > 0)
+- *cPtr++ = *aPtr++ * scalar;
++ *cPtr++ = *aPtr++ * scalar;
+ }
+ #endif /* LV_HAVE_GENERIC */
+
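The SSE3 and AVX kernels reformatted above all use the same shuffle/addsub idiom for complex multiplication: with a = ar + j*ai and s = sr + j*si, the product is (ar*sr - ai*si) + j*(ai*sr + ar*si). A self-contained sketch of one step of that idiom (assuming an SSE3-capable build; the values are illustrative and not part of the patch):

#include <pmmintrin.h>
#include <stdio.h>

int main(void)
{
    const float a[4] = { 1.0f, 2.0f, 3.0f, 4.0f }; /* two complex inputs: 1+2j, 3+4j */
    const float sr = 0.5f, si = -1.5f;             /* scalar s = 0.5 - 1.5j */
    float c[4];

    __m128 x = _mm_loadu_ps(a);           /* ar, ai, br, bi */
    __m128 yl = _mm_set_ps1(sr);          /* sr in all lanes */
    __m128 yh = _mm_set_ps1(si);          /* si in all lanes */
    __m128 tmp1 = _mm_mul_ps(x, yl);      /* ar*sr, ai*sr, br*sr, bi*sr */
    x = _mm_shuffle_ps(x, x, 0xB1);       /* re-arranged to ai, ar, bi, br */
    __m128 tmp2 = _mm_mul_ps(x, yh);      /* ai*si, ar*si, bi*si, br*si */
    __m128 z = _mm_addsub_ps(tmp1, tmp2); /* ar*sr-ai*si, ai*sr+ar*si, ... */
    _mm_storeu_ps(c, z);

    printf("(%g, %g) (%g, %g)\n", c[0], c[1], c[2], c[3]);
    return 0;
}

For these values the sketch prints (3.5, -0.5) (7.5, -2.5), i.e. (1+2j)*(0.5-1.5j) and (3+4j)*(0.5-1.5j); the FMA variants fold the first multiply and the addsub into a single _mm256_fmaddsub_ps.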
+diff --git a/kernels/volk/volk_32fc_s32fc_rotatorpuppet_32fc.h b/kernels/volk/volk_32fc_s32fc_rotatorpuppet_32fc.h
+index 181abc5..eba98fe 100644
+--- a/kernels/volk/volk_32fc_s32fc_rotatorpuppet_32fc.h
++++ b/kernels/volk/volk_32fc_s32fc_rotatorpuppet_32fc.h
+@@ -25,19 +25,24 @@
+ #define INCLUDED_volk_32fc_s32fc_rotatorpuppet_32fc_a_H
+
+
+-#include <volk/volk_complex.h>
+ #include <stdio.h>
+ #include <volk/volk_32fc_s32fc_x2_rotator_32fc.h>
++#include <volk/volk_complex.h>
+
+
+ #ifdef LV_HAVE_GENERIC
+
+-static inline void volk_32fc_s32fc_rotatorpuppet_32fc_generic(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, unsigned int num_points){
+- lv_32fc_t phase[1] = {lv_cmake(.3, 0.95393)};
++static inline void volk_32fc_s32fc_rotatorpuppet_32fc_generic(lv_32fc_t* outVector,
++ const lv_32fc_t* inVector,
++ const lv_32fc_t phase_inc,
++ unsigned int num_points)
++{
++ lv_32fc_t phase[1] = { lv_cmake(.3, 0.95393) };
+ (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase));
+- const lv_32fc_t phase_inc_n = phase_inc / hypotf(lv_creal(phase_inc), lv_cimag(phase_inc));
+- volk_32fc_s32fc_x2_rotator_32fc_generic(outVector, inVector, phase_inc_n, phase, num_points);
+-
++ const lv_32fc_t phase_inc_n =
++ phase_inc / hypotf(lv_creal(phase_inc), lv_cimag(phase_inc));
++ volk_32fc_s32fc_x2_rotator_32fc_generic(
++ outVector, inVector, phase_inc_n, phase, num_points);
+ }
+
+ #endif /* LV_HAVE_GENERIC */
+@@ -47,12 +52,17 @@ static inline void volk_32fc_s32fc_rotatorpuppet_32fc_generic(lv_32fc_t* outVect
+ #include <arm_neon.h>
+ #include <volk/volk_neon_intrinsics.h>
+
+-static inline void volk_32fc_s32fc_rotatorpuppet_32fc_neon(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, unsigned int num_points){
+- lv_32fc_t phase[1] = {lv_cmake(.3, 0.95393)};
++static inline void volk_32fc_s32fc_rotatorpuppet_32fc_neon(lv_32fc_t* outVector,
++ const lv_32fc_t* inVector,
++ const lv_32fc_t phase_inc,
++ unsigned int num_points)
++{
++ lv_32fc_t phase[1] = { lv_cmake(.3, 0.95393) };
+ (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase));
+- const lv_32fc_t phase_inc_n = phase_inc / hypotf(lv_creal(phase_inc), lv_cimag(phase_inc));
+- volk_32fc_s32fc_x2_rotator_32fc_neon(outVector, inVector, phase_inc_n, phase, num_points);
+-
++ const lv_32fc_t phase_inc_n =
++ phase_inc / hypotf(lv_creal(phase_inc), lv_cimag(phase_inc));
++ volk_32fc_s32fc_x2_rotator_32fc_neon(
++ outVector, inVector, phase_inc_n, phase, num_points);
+ }
+
+ #endif /* LV_HAVE_NEON */
+@@ -61,12 +71,17 @@ static inline void volk_32fc_s32fc_rotatorpuppet_32fc_neon(lv_32fc_t* outVector,
+ #ifdef LV_HAVE_SSE4_1
+ #include <smmintrin.h>
+
+-static inline void volk_32fc_s32fc_rotatorpuppet_32fc_a_sse4_1(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, unsigned int num_points){
+- lv_32fc_t phase[1] = {lv_cmake(.3, .95393)};
++static inline void volk_32fc_s32fc_rotatorpuppet_32fc_a_sse4_1(lv_32fc_t* outVector,
++ const lv_32fc_t* inVector,
++ const lv_32fc_t phase_inc,
++ unsigned int num_points)
++{
++ lv_32fc_t phase[1] = { lv_cmake(.3, .95393) };
+ (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase));
+- const lv_32fc_t phase_inc_n = phase_inc / hypotf(lv_creal(phase_inc), lv_cimag(phase_inc));
+- volk_32fc_s32fc_x2_rotator_32fc_a_sse4_1(outVector, inVector, phase_inc_n, phase, num_points);
+-
++ const lv_32fc_t phase_inc_n =
++ phase_inc / hypotf(lv_creal(phase_inc), lv_cimag(phase_inc));
++ volk_32fc_s32fc_x2_rotator_32fc_a_sse4_1(
++ outVector, inVector, phase_inc_n, phase, num_points);
+ }
+
+ #endif /* LV_HAVE_SSE4_1 */
+@@ -74,12 +89,17 @@ static inline void volk_32fc_s32fc_rotatorpuppet_32fc_a_sse4_1(lv_32fc_t* outVec
+
+ #ifdef LV_HAVE_SSE4_1
+ #include <smmintrin.h>
+-static inline void volk_32fc_s32fc_rotatorpuppet_32fc_u_sse4_1(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, unsigned int num_points){
+- lv_32fc_t phase[1] = {lv_cmake(.3, .95393)};
++static inline void volk_32fc_s32fc_rotatorpuppet_32fc_u_sse4_1(lv_32fc_t* outVector,
++ const lv_32fc_t* inVector,
++ const lv_32fc_t phase_inc,
++ unsigned int num_points)
++{
++ lv_32fc_t phase[1] = { lv_cmake(.3, .95393) };
+ (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase));
+- const lv_32fc_t phase_inc_n = phase_inc / hypotf(lv_creal(phase_inc), lv_cimag(phase_inc));
+- volk_32fc_s32fc_x2_rotator_32fc_u_sse4_1(outVector, inVector, phase_inc_n, phase, num_points);
+-
++ const lv_32fc_t phase_inc_n =
++ phase_inc / hypotf(lv_creal(phase_inc), lv_cimag(phase_inc));
++ volk_32fc_s32fc_x2_rotator_32fc_u_sse4_1(
++ outVector, inVector, phase_inc_n, phase, num_points);
+ }
+
+ #endif /* LV_HAVE_SSE4_1 */
+@@ -88,11 +108,17 @@ static inline void volk_32fc_s32fc_rotatorpuppet_32fc_u_sse4_1(lv_32fc_t* outVec
+ #ifdef LV_HAVE_AVX
+ #include <immintrin.h>
+
+-static inline void volk_32fc_s32fc_rotatorpuppet_32fc_a_avx(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, unsigned int num_points){
+- lv_32fc_t phase[1] = {lv_cmake(.3, .95393)};
++static inline void volk_32fc_s32fc_rotatorpuppet_32fc_a_avx(lv_32fc_t* outVector,
++ const lv_32fc_t* inVector,
++ const lv_32fc_t phase_inc,
++ unsigned int num_points)
++{
++ lv_32fc_t phase[1] = { lv_cmake(.3, .95393) };
+ (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase));
+- const lv_32fc_t phase_inc_n = phase_inc / hypotf(lv_creal(phase_inc), lv_cimag(phase_inc));
+- volk_32fc_s32fc_x2_rotator_32fc_a_avx(outVector, inVector, phase_inc_n, phase, num_points);
++ const lv_32fc_t phase_inc_n =
++ phase_inc / hypotf(lv_creal(phase_inc), lv_cimag(phase_inc));
++ volk_32fc_s32fc_x2_rotator_32fc_a_avx(
++ outVector, inVector, phase_inc_n, phase, num_points);
+ }
+
+ #endif /* LV_HAVE_AVX */
+@@ -101,11 +127,17 @@ static inline void volk_32fc_s32fc_rotatorpuppet_32fc_a_avx(lv_32fc_t* outVector
+ #ifdef LV_HAVE_AVX
+ #include <immintrin.h>
+
+-static inline void volk_32fc_s32fc_rotatorpuppet_32fc_u_avx(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, unsigned int num_points){
+- lv_32fc_t phase[1] = {lv_cmake(.3, .95393)};
++static inline void volk_32fc_s32fc_rotatorpuppet_32fc_u_avx(lv_32fc_t* outVector,
++ const lv_32fc_t* inVector,
++ const lv_32fc_t phase_inc,
++ unsigned int num_points)
++{
++ lv_32fc_t phase[1] = { lv_cmake(.3, .95393) };
+ (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase));
+- const lv_32fc_t phase_inc_n = phase_inc / hypotf(lv_creal(phase_inc), lv_cimag(phase_inc));
+- volk_32fc_s32fc_x2_rotator_32fc_u_avx(outVector, inVector, phase_inc_n, phase, num_points);
++ const lv_32fc_t phase_inc_n =
++ phase_inc / hypotf(lv_creal(phase_inc), lv_cimag(phase_inc));
++ volk_32fc_s32fc_x2_rotator_32fc_u_avx(
++ outVector, inVector, phase_inc_n, phase, num_points);
+ }
+
+ #endif /* LV_HAVE_AVX */
+@@ -113,11 +145,17 @@ static inline void volk_32fc_s32fc_rotatorpuppet_32fc_u_avx(lv_32fc_t* outVector
+ #if LV_HAVE_AVX && LV_HAVE_FMA
+ #include <immintrin.h>
+
+-static inline void volk_32fc_s32fc_rotatorpuppet_32fc_a_avx_fma(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, unsigned int num_points){
+- lv_32fc_t phase[1] = {lv_cmake(.3, .95393)};
++static inline void volk_32fc_s32fc_rotatorpuppet_32fc_a_avx_fma(lv_32fc_t* outVector,
++ const lv_32fc_t* inVector,
++ const lv_32fc_t phase_inc,
++ unsigned int num_points)
++{
++ lv_32fc_t phase[1] = { lv_cmake(.3, .95393) };
+ (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase));
+- const lv_32fc_t phase_inc_n = phase_inc / hypotf(lv_creal(phase_inc), lv_cimag(phase_inc));
+- volk_32fc_s32fc_x2_rotator_32fc_a_avx_fma(outVector, inVector, phase_inc_n, phase, num_points);
++ const lv_32fc_t phase_inc_n =
++ phase_inc / hypotf(lv_creal(phase_inc), lv_cimag(phase_inc));
++ volk_32fc_s32fc_x2_rotator_32fc_a_avx_fma(
++ outVector, inVector, phase_inc_n, phase, num_points);
+ }
+
+ #endif /* LV_HAVE_AVX && LV_HAVE_FMA*/
+@@ -126,11 +164,17 @@ static inline void volk_32fc_s32fc_rotatorpuppet_32fc_a_avx_fma(lv_32fc_t* outVe
+ #if LV_HAVE_AVX && LV_HAVE_FMA
+ #include <immintrin.h>
+
+-static inline void volk_32fc_s32fc_rotatorpuppet_32fc_u_avx_fma(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, unsigned int num_points){
+- lv_32fc_t phase[1] = {lv_cmake(.3, .95393)};
++static inline void volk_32fc_s32fc_rotatorpuppet_32fc_u_avx_fma(lv_32fc_t* outVector,
++ const lv_32fc_t* inVector,
++ const lv_32fc_t phase_inc,
++ unsigned int num_points)
++{
++ lv_32fc_t phase[1] = { lv_cmake(.3, .95393) };
+ (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase));
+- const lv_32fc_t phase_inc_n = phase_inc / hypotf(lv_creal(phase_inc), lv_cimag(phase_inc));
+- volk_32fc_s32fc_x2_rotator_32fc_u_avx_fma(outVector, inVector, phase_inc_n, phase, num_points);
++ const lv_32fc_t phase_inc_n =
++ phase_inc / hypotf(lv_creal(phase_inc), lv_cimag(phase_inc));
++ volk_32fc_s32fc_x2_rotator_32fc_u_avx_fma(
++ outVector, inVector, phase_inc_n, phase, num_points);
+ }
+
+ #endif /* LV_HAVE_AVX && LV_HAVE_FMA*/
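Each puppet kernel above divides both the hard-coded start phase and the caller's phase_inc by hypotf(re, im) before delegating to the rotator, because the rotator multiplies the running phase by phase_inc once per sample: only a unit-magnitude increment keeps that a pure rotation. A short sketch of the effect (C99 complex arithmetic standing in for lv_32fc_t; illustrative only):

#include <complex.h>
#include <math.h>
#include <stdio.h>

int main(void)
{
    float complex inc = 0.3f + 0.95393f * I;
    /* Same normalization the puppets perform: force |inc| == 1 */
    inc /= hypotf(crealf(inc), cimagf(inc));

    float complex phase = 1.0f;
    for (int n = 0; n < 1000; n++)
        phase *= inc; /* pure rotation; |phase| stays ~1 up to rounding */

    printf("|phase| after 1000 steps: %f\n", cabsf(phase));
    return 0;
}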
+diff --git a/kernels/volk/volk_32fc_s32fc_x2_rotator_32fc.h b/kernels/volk/volk_32fc_s32fc_x2_rotator_32fc.h
+index a886458..c97b8cb 100644
+--- a/kernels/volk/volk_32fc_s32fc_x2_rotator_32fc.h
++++ b/kernels/volk/volk_32fc_s32fc_x2_rotator_32fc.h
+@@ -30,14 +30,15 @@
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+- * void volk_32fc_s32fc_x2_rotator_32fc(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, lv_32fc_t* phase, unsigned int num_points)
+- * \endcode
++ * void volk_32fc_s32fc_x2_rotator_32fc(lv_32fc_t* outVector, const lv_32fc_t* inVector,
++ * const lv_32fc_t phase_inc, lv_32fc_t* phase, unsigned int num_points) \endcode
+ *
+ * \b Inputs
+ * \li inVector: Vector to be rotated.
+ * \li phase_inc: rotational velocity.
+ * \li phase: initial phase offset.
+- * \li num_points: The number of values in inVector to be rotated and stored into outVector.
++ * \li num_points: The number of values in inVector to be rotated and stored into
++ * outVector.
+ *
+ * \b Outputs
+ * \li outVector: The vector where the results will be stored.
+@@ -81,31 +82,36 @@
+ #define INCLUDED_volk_32fc_s32fc_rotator_32fc_a_H
+
+
+-#include <volk/volk_complex.h>
++#include <math.h>
+ #include <stdio.h>
+ #include <stdlib.h>
+-#include <math.h>
++#include <volk/volk_complex.h>
+ #define ROTATOR_RELOAD 512
+
+
+ #ifdef LV_HAVE_GENERIC
+
+-static inline void volk_32fc_s32fc_x2_rotator_32fc_generic(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, lv_32fc_t* phase, unsigned int num_points){
++static inline void volk_32fc_s32fc_x2_rotator_32fc_generic(lv_32fc_t* outVector,
++ const lv_32fc_t* inVector,
++ const lv_32fc_t phase_inc,
++ lv_32fc_t* phase,
++ unsigned int num_points)
++{
+ unsigned int i = 0;
+ int j = 0;
+- for(i = 0; i < (unsigned int)(num_points/ROTATOR_RELOAD); ++i) {
+- for(j = 0; j < ROTATOR_RELOAD; ++j) {
++ for (i = 0; i < (unsigned int)(num_points / ROTATOR_RELOAD); ++i) {
++ for (j = 0; j < ROTATOR_RELOAD; ++j) {
+ *outVector++ = *inVector++ * (*phase);
+ (*phase) *= phase_inc;
+ }
+
+ (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase));
+ }
+- for(i = 0; i < num_points%ROTATOR_RELOAD; ++i) {
++ for (i = 0; i < num_points % ROTATOR_RELOAD; ++i) {
+ *outVector++ = *inVector++ * (*phase);
+ (*phase) *= phase_inc;
+ }
+- if(i){
++ if (i) {
+ // Make sure, we normalize phase on every call!
+ (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase));
+ }
+@@ -118,43 +124,47 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_generic(lv_32fc_t* outVector,
+ #include <arm_neon.h>
+ #include <volk/volk_neon_intrinsics.h>
+
+-static inline void volk_32fc_s32fc_x2_rotator_32fc_neon(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, lv_32fc_t* phase, unsigned int num_points)
++static inline void volk_32fc_s32fc_x2_rotator_32fc_neon(lv_32fc_t* outVector,
++ const lv_32fc_t* inVector,
++ const lv_32fc_t phase_inc,
++ lv_32fc_t* phase,
++ unsigned int num_points)
+
+ {
+ lv_32fc_t* outputVectorPtr = outVector;
+ const lv_32fc_t* inputVectorPtr = inVector;
+ lv_32fc_t incr = 1;
+- lv_32fc_t phasePtr[4] = {(*phase), (*phase), (*phase), (*phase)};
++ lv_32fc_t phasePtr[4] = { (*phase), (*phase), (*phase), (*phase) };
+ float32x4x2_t input_vec;
+ float32x4x2_t output_vec;
+-
++
+ unsigned int i = 0, j = 0;
+ const unsigned int quarter_points = num_points / 4;
+-
+- for(i = 0; i < 4; ++i) {
++
++ for (i = 0; i < 4; ++i) {
+ phasePtr[i] *= incr;
+ incr *= (phase_inc);
+ }
+-
++
+ // Notice that incr has be incremented in the previous loop
+- const lv_32fc_t incrPtr[4] = {incr, incr, incr, incr};
+- const float32x4x2_t incr_vec = vld2q_f32((float*) incrPtr);
+- float32x4x2_t phase_vec = vld2q_f32((float*) phasePtr);
+-
+- for(i = 0; i < (unsigned int)(quarter_points/ROTATOR_RELOAD); i++) {
+- for(j = 0; j < ROTATOR_RELOAD; j++) {
+- input_vec = vld2q_f32((float*) inputVectorPtr);
++ const lv_32fc_t incrPtr[4] = { incr, incr, incr, incr };
++ const float32x4x2_t incr_vec = vld2q_f32((float*)incrPtr);
++ float32x4x2_t phase_vec = vld2q_f32((float*)phasePtr);
++
++ for (i = 0; i < (unsigned int)(quarter_points / ROTATOR_RELOAD); i++) {
++ for (j = 0; j < ROTATOR_RELOAD; j++) {
++ input_vec = vld2q_f32((float*)inputVectorPtr);
+ // Prefetch next one, speeds things up
+- __VOLK_PREFETCH(inputVectorPtr+4);
++ __VOLK_PREFETCH(inputVectorPtr + 4);
+ // Rotate
+ output_vec = _vmultiply_complexq_f32(input_vec, phase_vec);
+ // Increase phase
+ phase_vec = _vmultiply_complexq_f32(phase_vec, incr_vec);
+ // Store output
+ vst2q_f32((float*)outputVectorPtr, output_vec);
+-
+- outputVectorPtr+=4;
+- inputVectorPtr+=4;
++
++ outputVectorPtr += 4;
++ inputVectorPtr += 4;
+ }
+ // normalize phase so magnitude doesn't grow because of
+ // floating point rounding error
+@@ -164,20 +174,20 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_neon(lv_32fc_t* outVector, co
+ phase_vec.val[0] = vmulq_f32(phase_vec.val[0], inv_mag);
+ phase_vec.val[1] = vmulq_f32(phase_vec.val[1], inv_mag);
+ }
+-
+- for(i = 0; i < quarter_points % ROTATOR_RELOAD; i++) {
+- input_vec = vld2q_f32((float*) inputVectorPtr);
++
++ for (i = 0; i < quarter_points % ROTATOR_RELOAD; i++) {
++ input_vec = vld2q_f32((float*)inputVectorPtr);
+ // Prefetch next one, speeds things up
+- __VOLK_PREFETCH(inputVectorPtr+4);
++ __VOLK_PREFETCH(inputVectorPtr + 4);
+ // Rotate
+ output_vec = _vmultiply_complexq_f32(input_vec, phase_vec);
+ // Increase phase
+ phase_vec = _vmultiply_complexq_f32(phase_vec, incr_vec);
+ // Store output
+ vst2q_f32((float*)outputVectorPtr, output_vec);
+-
+- outputVectorPtr+=4;
+- inputVectorPtr+=4;
++
++ outputVectorPtr += 4;
++ inputVectorPtr += 4;
+ }
+ // if(i) == true means we looped above
+ if (i) {
+@@ -191,13 +201,13 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_neon(lv_32fc_t* outVector, co
+ }
+ // Store current phase
+ vst2q_f32((float*)phasePtr, phase_vec);
+-
++
+ // Deal with the rest
+- for(i = 0; i < num_points % 4; i++) {
++ for (i = 0; i < num_points % 4; i++) {
+ *outputVectorPtr++ = *inputVectorPtr++ * phasePtr[0];
+ phasePtr[0] *= (phase_inc);
+ }
+-
++
+ // For continious phase next time we need to call this function
+ (*phase) = phasePtr[0];
+ }
+@@ -208,15 +218,20 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_neon(lv_32fc_t* outVector, co
+ #ifdef LV_HAVE_SSE4_1
+ #include <smmintrin.h>
+
+-static inline void volk_32fc_s32fc_x2_rotator_32fc_a_sse4_1(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, lv_32fc_t* phase, unsigned int num_points){
++static inline void volk_32fc_s32fc_x2_rotator_32fc_a_sse4_1(lv_32fc_t* outVector,
++ const lv_32fc_t* inVector,
++ const lv_32fc_t phase_inc,
++ lv_32fc_t* phase,
++ unsigned int num_points)
++{
+ lv_32fc_t* cPtr = outVector;
+ const lv_32fc_t* aPtr = inVector;
+ lv_32fc_t incr = 1;
+- lv_32fc_t phase_Ptr[2] = {(*phase), (*phase)};
++ lv_32fc_t phase_Ptr[2] = { (*phase), (*phase) };
+
+ unsigned int i, j = 0;
+
+- for(i = 0; i < 2; ++i) {
++ for (i = 0; i < 2; ++i) {
+ phase_Ptr[i] *= incr;
+ incr *= (phase_inc);
+ }
+@@ -227,13 +242,13 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_a_sse4_1(lv_32fc_t* outVector
+ __m128 aVal, phase_Val, inc_Val, yl, yh, tmp1, tmp2, z, ylp, yhp, tmp1p, tmp2p;
+
+ phase_Val = _mm_loadu_ps((float*)phase_Ptr);
+- inc_Val = _mm_set_ps(lv_cimag(incr), lv_creal(incr),lv_cimag(incr), lv_creal(incr));
++ inc_Val = _mm_set_ps(lv_cimag(incr), lv_creal(incr), lv_cimag(incr), lv_creal(incr));
+
+ const unsigned int halfPoints = num_points / 2;
+
+
+- for(i = 0; i < (unsigned int)(halfPoints/ROTATOR_RELOAD); i++) {
+- for(j = 0; j < ROTATOR_RELOAD; ++j) {
++ for (i = 0; i < (unsigned int)(halfPoints / ROTATOR_RELOAD); i++) {
++ for (j = 0; j < ROTATOR_RELOAD; ++j) {
+
+ aVal = _mm_load_ps((float*)aPtr);
+
+@@ -264,7 +279,7 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_a_sse4_1(lv_32fc_t* outVector
+ tmp2 = _mm_sqrt_ps(tmp1);
+ phase_Val = _mm_div_ps(phase_Val, tmp2);
+ }
+- for(i = 0; i < halfPoints%ROTATOR_RELOAD; ++i) {
++ for (i = 0; i < halfPoints % ROTATOR_RELOAD; ++i) {
+ aVal = _mm_load_ps((float*)aPtr);
+
+ yl = _mm_moveldup_ps(phase_Val);
+@@ -304,7 +319,6 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_a_sse4_1(lv_32fc_t* outVector
+ }
+
+ (*phase) = phase_Ptr[0];
+-
+ }
+
+ #endif /* LV_HAVE_SSE4_1 for aligned */
+@@ -313,15 +327,20 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_a_sse4_1(lv_32fc_t* outVector
+ #ifdef LV_HAVE_SSE4_1
+ #include <smmintrin.h>
+
+-static inline void volk_32fc_s32fc_x2_rotator_32fc_u_sse4_1(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, lv_32fc_t* phase, unsigned int num_points){
++static inline void volk_32fc_s32fc_x2_rotator_32fc_u_sse4_1(lv_32fc_t* outVector,
++ const lv_32fc_t* inVector,
++ const lv_32fc_t phase_inc,
++ lv_32fc_t* phase,
++ unsigned int num_points)
++{
+ lv_32fc_t* cPtr = outVector;
+ const lv_32fc_t* aPtr = inVector;
+ lv_32fc_t incr = 1;
+- lv_32fc_t phase_Ptr[2] = {(*phase), (*phase)};
++ lv_32fc_t phase_Ptr[2] = { (*phase), (*phase) };
+
+ unsigned int i, j = 0;
+
+- for(i = 0; i < 2; ++i) {
++ for (i = 0; i < 2; ++i) {
+ phase_Ptr[i] *= incr;
+ incr *= (phase_inc);
+ }
+@@ -332,13 +351,13 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_u_sse4_1(lv_32fc_t* outVector
+ __m128 aVal, phase_Val, inc_Val, yl, yh, tmp1, tmp2, z, ylp, yhp, tmp1p, tmp2p;
+
+ phase_Val = _mm_loadu_ps((float*)phase_Ptr);
+- inc_Val = _mm_set_ps(lv_cimag(incr), lv_creal(incr),lv_cimag(incr), lv_creal(incr));
++ inc_Val = _mm_set_ps(lv_cimag(incr), lv_creal(incr), lv_cimag(incr), lv_creal(incr));
+
+ const unsigned int halfPoints = num_points / 2;
+
+
+- for(i = 0; i < (unsigned int)(halfPoints/ROTATOR_RELOAD); i++) {
+- for(j = 0; j < ROTATOR_RELOAD; ++j) {
++ for (i = 0; i < (unsigned int)(halfPoints / ROTATOR_RELOAD); i++) {
++ for (j = 0; j < ROTATOR_RELOAD; ++j) {
+
+ aVal = _mm_loadu_ps((float*)aPtr);
+
+@@ -369,7 +388,7 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_u_sse4_1(lv_32fc_t* outVector
+ tmp2 = _mm_sqrt_ps(tmp1);
+ phase_Val = _mm_div_ps(phase_Val, tmp2);
+ }
+- for(i = 0; i < halfPoints%ROTATOR_RELOAD; ++i) {
++ for (i = 0; i < halfPoints % ROTATOR_RELOAD; ++i) {
+ aVal = _mm_loadu_ps((float*)aPtr);
+
+ yl = _mm_moveldup_ps(phase_Val);
+@@ -409,7 +428,6 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_u_sse4_1(lv_32fc_t* outVector
+ }
+
+ (*phase) = phase_Ptr[0];
+-
+ }
+
+ #endif /* LV_HAVE_SSE4_1 */
+@@ -419,15 +437,20 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_u_sse4_1(lv_32fc_t* outVector
+ #include <immintrin.h>
+ #include <volk/volk_avx_intrinsics.h>
+
+-static inline void volk_32fc_s32fc_x2_rotator_32fc_a_avx(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, lv_32fc_t* phase, unsigned int num_points){
++static inline void volk_32fc_s32fc_x2_rotator_32fc_a_avx(lv_32fc_t* outVector,
++ const lv_32fc_t* inVector,
++ const lv_32fc_t phase_inc,
++ lv_32fc_t* phase,
++ unsigned int num_points)
++{
+ lv_32fc_t* cPtr = outVector;
+ const lv_32fc_t* aPtr = inVector;
+ lv_32fc_t incr = lv_cmake(1.0, 0.0);
+- lv_32fc_t phase_Ptr[4] = {(*phase), (*phase), (*phase), (*phase)};
++ lv_32fc_t phase_Ptr[4] = { (*phase), (*phase), (*phase), (*phase) };
+
+ unsigned int i, j = 0;
+
+- for(i = 0; i < 4; ++i) {
++ for (i = 0; i < 4; ++i) {
+ phase_Ptr[i] *= incr;
+ incr *= (phase_inc);
+ }
+@@ -435,16 +458,20 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_a_avx(lv_32fc_t* outVector, c
+ __m256 aVal, phase_Val, z;
+
+ phase_Val = _mm256_loadu_ps((float*)phase_Ptr);
+-
+- const __m256 inc_Val = _mm256_set_ps(lv_cimag(incr), lv_creal(incr),
+- lv_cimag(incr), lv_creal(incr),
+- lv_cimag(incr), lv_creal(incr),
+- lv_cimag(incr), lv_creal(incr));
++
++ const __m256 inc_Val = _mm256_set_ps(lv_cimag(incr),
++ lv_creal(incr),
++ lv_cimag(incr),
++ lv_creal(incr),
++ lv_cimag(incr),
++ lv_creal(incr),
++ lv_cimag(incr),
++ lv_creal(incr));
+
+ const unsigned int fourthPoints = num_points / 4;
+
+- for(i = 0; i < (unsigned int)(fourthPoints/ROTATOR_RELOAD); i++) {
+- for(j = 0; j < ROTATOR_RELOAD; ++j) {
++ for (i = 0; i < (unsigned int)(fourthPoints / ROTATOR_RELOAD); i++) {
++ for (j = 0; j < ROTATOR_RELOAD; ++j) {
+
+ aVal = _mm256_load_ps((float*)aPtr);
+
+@@ -458,8 +485,8 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_a_avx(lv_32fc_t* outVector, c
+ }
+ phase_Val = _mm256_normalize_ps(phase_Val);
+ }
+-
+- for(i = 0; i < fourthPoints%ROTATOR_RELOAD; ++i) {
++
++ for (i = 0; i < fourthPoints % ROTATOR_RELOAD; ++i) {
+ aVal = _mm256_load_ps((float*)aPtr);
+
+ z = _mm256_complexmul_ps(aVal, phase_Val);
+@@ -473,10 +500,10 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_a_avx(lv_32fc_t* outVector, c
+ if (i) {
+ phase_Val = _mm256_normalize_ps(phase_Val);
+ }
+-
++
+ _mm256_storeu_ps((float*)phase_Ptr, phase_Val);
+ (*phase) = phase_Ptr[0];
+- volk_32fc_s32fc_x2_rotator_32fc_generic(cPtr, aPtr, phase_inc, phase, num_points%4);
++ volk_32fc_s32fc_x2_rotator_32fc_generic(cPtr, aPtr, phase_inc, phase, num_points % 4);
+ }
+
+ #endif /* LV_HAVE_AVX for aligned */
+@@ -486,15 +513,20 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_a_avx(lv_32fc_t* outVector, c
+ #include <immintrin.h>
+ #include <volk/volk_avx_intrinsics.h>
+
+-static inline void volk_32fc_s32fc_x2_rotator_32fc_u_avx(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, lv_32fc_t* phase, unsigned int num_points){
++static inline void volk_32fc_s32fc_x2_rotator_32fc_u_avx(lv_32fc_t* outVector,
++ const lv_32fc_t* inVector,
++ const lv_32fc_t phase_inc,
++ lv_32fc_t* phase,
++ unsigned int num_points)
++{
+ lv_32fc_t* cPtr = outVector;
+ const lv_32fc_t* aPtr = inVector;
+ lv_32fc_t incr = lv_cmake(1.0, 0.0);
+- lv_32fc_t phase_Ptr[4] = {(*phase), (*phase), (*phase), (*phase)};
++ lv_32fc_t phase_Ptr[4] = { (*phase), (*phase), (*phase), (*phase) };
+
+ unsigned int i, j = 0;
+
+- for(i = 0; i < 4; ++i) {
++ for (i = 0; i < 4; ++i) {
+ phase_Ptr[i] *= incr;
+ incr *= (phase_inc);
+ }
+@@ -502,19 +534,23 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_u_avx(lv_32fc_t* outVector, c
+ __m256 aVal, phase_Val, z;
+
+ phase_Val = _mm256_loadu_ps((float*)phase_Ptr);
+-
+- const __m256 inc_Val = _mm256_set_ps(lv_cimag(incr), lv_creal(incr),
+- lv_cimag(incr), lv_creal(incr),
+- lv_cimag(incr), lv_creal(incr),
+- lv_cimag(incr), lv_creal(incr));
+-
++
++ const __m256 inc_Val = _mm256_set_ps(lv_cimag(incr),
++ lv_creal(incr),
++ lv_cimag(incr),
++ lv_creal(incr),
++ lv_cimag(incr),
++ lv_creal(incr),
++ lv_cimag(incr),
++ lv_creal(incr));
++
+ const unsigned int fourthPoints = num_points / 4;
+
+- for(i = 0; i < (unsigned int)(fourthPoints/ROTATOR_RELOAD); ++i) {
+- for(j = 0; j < ROTATOR_RELOAD; ++j) {
++ for (i = 0; i < (unsigned int)(fourthPoints / ROTATOR_RELOAD); ++i) {
++ for (j = 0; j < ROTATOR_RELOAD; ++j) {
+
+ aVal = _mm256_loadu_ps((float*)aPtr);
+-
++
+ z = _mm256_complexmul_ps(aVal, phase_Val);
+ phase_Val = _mm256_complexmul_ps(phase_Val, inc_Val);
+
+@@ -524,10 +560,9 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_u_avx(lv_32fc_t* outVector, c
+ cPtr += 4;
+ }
+ phase_Val = _mm256_normalize_ps(phase_Val);
+-
+ }
+-
+- for(i = 0; i < fourthPoints%ROTATOR_RELOAD; ++i) {
++
++ for (i = 0; i < fourthPoints % ROTATOR_RELOAD; ++i) {
+ aVal = _mm256_loadu_ps((float*)aPtr);
+
+ z = _mm256_complexmul_ps(aVal, phase_Val);
+@@ -544,7 +579,7 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_u_avx(lv_32fc_t* outVector, c
+
+ _mm256_storeu_ps((float*)phase_Ptr, phase_Val);
+ (*phase) = phase_Ptr[0];
+- volk_32fc_s32fc_x2_rotator_32fc_generic(cPtr, aPtr, phase_inc, phase, num_points%4);
++ volk_32fc_s32fc_x2_rotator_32fc_generic(cPtr, aPtr, phase_inc, phase, num_points % 4);
+ }
+
+ #endif /* LV_HAVE_AVX */
+@@ -552,15 +587,21 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_u_avx(lv_32fc_t* outVector, c
+ #if LV_HAVE_AVX && LV_HAVE_FMA
+ #include <immintrin.h>
+
+-static inline void volk_32fc_s32fc_x2_rotator_32fc_a_avx_fma(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, lv_32fc_t* phase, unsigned int num_points){
++static inline void volk_32fc_s32fc_x2_rotator_32fc_a_avx_fma(lv_32fc_t* outVector,
++ const lv_32fc_t* inVector,
++ const lv_32fc_t phase_inc,
++ lv_32fc_t* phase,
++ unsigned int num_points)
++{
+ lv_32fc_t* cPtr = outVector;
+ const lv_32fc_t* aPtr = inVector;
+ lv_32fc_t incr = 1;
+- __VOLK_ATTR_ALIGNED(32) lv_32fc_t phase_Ptr[4] = {(*phase), (*phase), (*phase), (*phase)};
++ __VOLK_ATTR_ALIGNED(32)
++ lv_32fc_t phase_Ptr[4] = { (*phase), (*phase), (*phase), (*phase) };
+
+ unsigned int i, j = 0;
+
+- for(i = 0; i < 4; ++i) {
++ for (i = 0; i < 4; ++i) {
+ phase_Ptr[i] *= incr;
+ incr *= (phase_inc);
+ }
+@@ -568,11 +609,18 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_a_avx_fma(lv_32fc_t* outVecto
+ __m256 aVal, phase_Val, inc_Val, yl, yh, tmp1, tmp2, z, ylp, yhp, tmp1p, tmp2p;
+
+ phase_Val = _mm256_load_ps((float*)phase_Ptr);
+- inc_Val = _mm256_set_ps(lv_cimag(incr), lv_creal(incr),lv_cimag(incr), lv_creal(incr),lv_cimag(incr), lv_creal(incr),lv_cimag(incr), lv_creal(incr));
++ inc_Val = _mm256_set_ps(lv_cimag(incr),
++ lv_creal(incr),
++ lv_cimag(incr),
++ lv_creal(incr),
++ lv_cimag(incr),
++ lv_creal(incr),
++ lv_cimag(incr),
++ lv_creal(incr));
+ const unsigned int fourthPoints = num_points / 4;
+
+- for(i = 0; i < (unsigned int)(fourthPoints/ROTATOR_RELOAD); i++) {
+- for(j = 0; j < ROTATOR_RELOAD; ++j) {
++ for (i = 0; i < (unsigned int)(fourthPoints / ROTATOR_RELOAD); i++) {
++ for (j = 0; j < ROTATOR_RELOAD; ++j) {
+
+ aVal = _mm256_load_ps((float*)aPtr);
+
+@@ -603,7 +651,7 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_a_avx_fma(lv_32fc_t* outVecto
+ tmp2 = _mm256_sqrt_ps(tmp1);
+ phase_Val = _mm256_div_ps(phase_Val, tmp2);
+ }
+- for(i = 0; i < fourthPoints%ROTATOR_RELOAD; ++i) {
++ for (i = 0; i < fourthPoints % ROTATOR_RELOAD; ++i) {
+ aVal = _mm256_load_ps((float*)aPtr);
+
+ yl = _mm256_moveldup_ps(phase_Val);
+@@ -636,13 +684,12 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_a_avx_fma(lv_32fc_t* outVecto
+ }
+
+ _mm256_store_ps((float*)phase_Ptr, phase_Val);
+- for(i = 0; i < num_points%4; ++i) {
++ for (i = 0; i < num_points % 4; ++i) {
+ *cPtr++ = *aPtr++ * phase_Ptr[0];
+ phase_Ptr[0] *= (phase_inc);
+ }
+
+ (*phase) = phase_Ptr[0];
+-
+ }
+
+ #endif /* LV_HAVE_AVX && LV_HAVE_FMA for aligned*/
+@@ -650,15 +697,20 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_a_avx_fma(lv_32fc_t* outVecto
+ #if LV_HAVE_AVX && LV_HAVE_FMA
+ #include <immintrin.h>
+
+-static inline void volk_32fc_s32fc_x2_rotator_32fc_u_avx_fma(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, lv_32fc_t* phase, unsigned int num_points){
++static inline void volk_32fc_s32fc_x2_rotator_32fc_u_avx_fma(lv_32fc_t* outVector,
++ const lv_32fc_t* inVector,
++ const lv_32fc_t phase_inc,
++ lv_32fc_t* phase,
++ unsigned int num_points)
++{
+ lv_32fc_t* cPtr = outVector;
+ const lv_32fc_t* aPtr = inVector;
+ lv_32fc_t incr = 1;
+- lv_32fc_t phase_Ptr[4] = {(*phase), (*phase), (*phase), (*phase)};
++ lv_32fc_t phase_Ptr[4] = { (*phase), (*phase), (*phase), (*phase) };
+
+ unsigned int i, j = 0;
+
+- for(i = 0; i < 4; ++i) {
++ for (i = 0; i < 4; ++i) {
+ phase_Ptr[i] *= incr;
+ incr *= (phase_inc);
+ }
+@@ -666,11 +718,18 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_u_avx_fma(lv_32fc_t* outVecto
+ __m256 aVal, phase_Val, inc_Val, yl, yh, tmp1, tmp2, z, ylp, yhp, tmp1p, tmp2p;
+
+ phase_Val = _mm256_loadu_ps((float*)phase_Ptr);
+- inc_Val = _mm256_set_ps(lv_cimag(incr), lv_creal(incr),lv_cimag(incr), lv_creal(incr),lv_cimag(incr), lv_creal(incr),lv_cimag(incr), lv_creal(incr));
++ inc_Val = _mm256_set_ps(lv_cimag(incr),
++ lv_creal(incr),
++ lv_cimag(incr),
++ lv_creal(incr),
++ lv_cimag(incr),
++ lv_creal(incr),
++ lv_cimag(incr),
++ lv_creal(incr));
+ const unsigned int fourthPoints = num_points / 4;
+
+- for(i = 0; i < (unsigned int)(fourthPoints/ROTATOR_RELOAD); i++) {
+- for(j = 0; j < ROTATOR_RELOAD; ++j) {
++ for (i = 0; i < (unsigned int)(fourthPoints / ROTATOR_RELOAD); i++) {
++ for (j = 0; j < ROTATOR_RELOAD; ++j) {
+
+ aVal = _mm256_loadu_ps((float*)aPtr);
+
+@@ -701,7 +760,7 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_u_avx_fma(lv_32fc_t* outVecto
+ tmp2 = _mm256_sqrt_ps(tmp1);
+ phase_Val = _mm256_div_ps(phase_Val, tmp2);
+ }
+- for(i = 0; i < fourthPoints%ROTATOR_RELOAD; ++i) {
++ for (i = 0; i < fourthPoints % ROTATOR_RELOAD; ++i) {
+ aVal = _mm256_loadu_ps((float*)aPtr);
+
+ yl = _mm256_moveldup_ps(phase_Val);
+@@ -734,13 +793,12 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_u_avx_fma(lv_32fc_t* outVecto
+ }
+
+ _mm256_storeu_ps((float*)phase_Ptr, phase_Val);
+- for(i = 0; i < num_points%4; ++i) {
++ for (i = 0; i < num_points % 4; ++i) {
+ *cPtr++ = *aPtr++ * phase_Ptr[0];
+ phase_Ptr[0] *= (phase_inc);
+ }
+
+ (*phase) = phase_Ptr[0];
+-
+ }
+
+ #endif /* LV_HAVE_AVX && LV_HAVE_FMA*/
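All of the rotator variants above share the ROTATOR_RELOAD structure: rotate samples in blocks of 512, renormalize the running phase after each block so accumulated rounding error cannot change its magnitude, and finish with a scalar tail. A scalar sketch of that structure (mirroring the generic kernel, with C99 float complex standing in for lv_32fc_t):

#include <complex.h>
#include <math.h>

#define RELOAD 512 /* same role as ROTATOR_RELOAD above */

static void rotator_scalar(float complex* out,
                           const float complex* in,
                           float complex phase_inc,
                           float complex* phase,
                           unsigned int num_points)
{
    unsigned int i, j;
    for (i = 0; i < num_points / RELOAD; i++) {
        for (j = 0; j < RELOAD; j++) {
            *out++ = *in++ * (*phase);
            *phase *= phase_inc;
        }
        /* renormalize every block so |phase| stays 1 despite rounding */
        *phase /= hypotf(crealf(*phase), cimagf(*phase));
    }
    for (j = 0; j < num_points % RELOAD; j++) {
        *out++ = *in++ * (*phase);
        *phase *= phase_inc;
    }
    if (num_points % RELOAD)
        *phase /= hypotf(crealf(*phase), cimagf(*phase));
}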
+diff --git a/kernels/volk/volk_32fc_x2_add_32fc.h b/kernels/volk/volk_32fc_x2_add_32fc.h
+index 90ff787..e7356c3 100644
+--- a/kernels/volk/volk_32fc_x2_add_32fc.h
++++ b/kernels/volk/volk_32fc_x2_add_32fc.h
+@@ -31,8 +31,8 @@
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+- * void volk_32fc_x2_add_32fc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points)
+- * \endcode
++ * void volk_32fc_x2_add_32fc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const
++ * lv_32fc_t* bVector, unsigned int num_points) \endcode
+ *
+ * \b Inputs
+ * \li aVector: First vector of input points.
+@@ -44,7 +44,8 @@
+ *
+ * \b Example
+ *
+- * The follow example adds the increasing and decreasing vectors such that the result of every summation pair is 10
++ * The following example adds the increasing and decreasing vectors such that the result of
++ * every summation pair is 10
+ *
+ * \code
+ * int N = 10;
+@@ -76,36 +77,38 @@
+ #ifdef LV_HAVE_AVX
+ #include <immintrin.h>
+
+-static inline void
+-volk_32fc_x2_add_32fc_u_avx(lv_32fc_t* cVector, const lv_32fc_t* aVector,
+- const lv_32fc_t* bVector, unsigned int num_points)
++static inline void volk_32fc_x2_add_32fc_u_avx(lv_32fc_t* cVector,
++ const lv_32fc_t* aVector,
++ const lv_32fc_t* bVector,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+- const unsigned int quarterPoints = num_points / 4;
++ unsigned int number = 0;
++ const unsigned int quarterPoints = num_points / 4;
+
+- lv_32fc_t* cPtr = cVector;
+- const lv_32fc_t* aPtr = aVector;
+- const lv_32fc_t* bPtr= bVector;
++ lv_32fc_t* cPtr = cVector;
++ const lv_32fc_t* aPtr = aVector;
++ const lv_32fc_t* bPtr = bVector;
+
+- __m256 aVal, bVal, cVal;
+- for(;number < quarterPoints; number++){
++ __m256 aVal, bVal, cVal;
++ for (; number < quarterPoints; number++) {
+
+- aVal = _mm256_loadu_ps((float *) aPtr);
+- bVal = _mm256_loadu_ps((float *) bPtr);
++ aVal = _mm256_loadu_ps((float*)aPtr);
++ bVal = _mm256_loadu_ps((float*)bPtr);
+
+- cVal = _mm256_add_ps(aVal, bVal);
++ cVal = _mm256_add_ps(aVal, bVal);
+
+- _mm256_storeu_ps((float *) cPtr,cVal); // Store the results back into the C container
++ _mm256_storeu_ps((float*)cPtr,
++ cVal); // Store the results back into the C container
+
+- aPtr += 4;
+- bPtr += 4;
+- cPtr += 4;
+- }
++ aPtr += 4;
++ bPtr += 4;
++ cPtr += 4;
++ }
+
+- number = quarterPoints * 4;
+- for(;number < num_points; number++){
+- *cPtr++ = (*aPtr++) + (*bPtr++);
+- }
++ number = quarterPoints * 4;
++ for (; number < num_points; number++) {
++ *cPtr++ = (*aPtr++) + (*bPtr++);
++ }
+ }
+ #endif /* LV_HAVE_AVX */
+
+@@ -113,36 +116,38 @@ volk_32fc_x2_add_32fc_u_avx(lv_32fc_t* cVector, const lv_32fc_t* aVector,
+ #ifdef LV_HAVE_AVX
+ #include <immintrin.h>
+
+-static inline void
+-volk_32fc_x2_add_32fc_a_avx(lv_32fc_t* cVector, const lv_32fc_t* aVector,
+- const lv_32fc_t* bVector, unsigned int num_points)
++static inline void volk_32fc_x2_add_32fc_a_avx(lv_32fc_t* cVector,
++ const lv_32fc_t* aVector,
++ const lv_32fc_t* bVector,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+- const unsigned int quarterPoints = num_points / 4;
++ unsigned int number = 0;
++ const unsigned int quarterPoints = num_points / 4;
+
+- lv_32fc_t* cPtr = cVector;
+- const lv_32fc_t* aPtr = aVector;
+- const lv_32fc_t* bPtr= bVector;
++ lv_32fc_t* cPtr = cVector;
++ const lv_32fc_t* aPtr = aVector;
++ const lv_32fc_t* bPtr = bVector;
+
+- __m256 aVal, bVal, cVal;
+- for(;number < quarterPoints; number++){
++ __m256 aVal, bVal, cVal;
++ for (; number < quarterPoints; number++) {
+
+- aVal = _mm256_load_ps((float*) aPtr);
+- bVal = _mm256_load_ps((float*) bPtr);
++ aVal = _mm256_load_ps((float*)aPtr);
++ bVal = _mm256_load_ps((float*)bPtr);
+
+- cVal = _mm256_add_ps(aVal, bVal);
++ cVal = _mm256_add_ps(aVal, bVal);
+
+- _mm256_store_ps((float*) cPtr,cVal); // Store the results back into the C container
++ _mm256_store_ps((float*)cPtr,
++ cVal); // Store the results back into the C container
+
+- aPtr += 4;
+- bPtr += 4;
+- cPtr += 4;
+- }
++ aPtr += 4;
++ bPtr += 4;
++ cPtr += 4;
++ }
+
+- number = quarterPoints * 4;
+- for(;number < num_points; number++){
+- *cPtr++ = (*aPtr++) + (*bPtr++);
+- }
++ number = quarterPoints * 4;
++ for (; number < num_points; number++) {
++ *cPtr++ = (*aPtr++) + (*bPtr++);
++ }
+ }
+ #endif /* LV_HAVE_AVX */
+
+@@ -150,54 +155,56 @@ volk_32fc_x2_add_32fc_a_avx(lv_32fc_t* cVector, const lv_32fc_t* aVector,
+ #ifdef LV_HAVE_SSE
+ #include <xmmintrin.h>
+
+-static inline void
+-volk_32fc_x2_add_32fc_u_sse(lv_32fc_t* cVector, const lv_32fc_t* aVector,
+- const lv_32fc_t* bVector, unsigned int num_points)
++static inline void volk_32fc_x2_add_32fc_u_sse(lv_32fc_t* cVector,
++ const lv_32fc_t* aVector,
++ const lv_32fc_t* bVector,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+- const unsigned int halfPoints = num_points / 2;
++ unsigned int number = 0;
++ const unsigned int halfPoints = num_points / 2;
+
+- lv_32fc_t* cPtr = cVector;
+- const lv_32fc_t* aPtr = aVector;
+- const lv_32fc_t* bPtr= bVector;
++ lv_32fc_t* cPtr = cVector;
++ const lv_32fc_t* aPtr = aVector;
++ const lv_32fc_t* bPtr = bVector;
+
+- __m128 aVal, bVal, cVal;
+- for(;number < halfPoints; number++){
++ __m128 aVal, bVal, cVal;
++ for (; number < halfPoints; number++) {
+
+- aVal = _mm_loadu_ps((float *) aPtr);
+- bVal = _mm_loadu_ps((float *) bPtr);
++ aVal = _mm_loadu_ps((float*)aPtr);
++ bVal = _mm_loadu_ps((float*)bPtr);
+
+- cVal = _mm_add_ps(aVal, bVal);
++ cVal = _mm_add_ps(aVal, bVal);
+
+- _mm_storeu_ps((float*) cPtr, cVal); // Store the results back into the C container
++ _mm_storeu_ps((float*)cPtr, cVal); // Store the results back into the C container
+
+- aPtr += 2;
+- bPtr += 2;
+- cPtr += 2;
+- }
++ aPtr += 2;
++ bPtr += 2;
++ cPtr += 2;
++ }
+
+- number = halfPoints * 2;
+- for(;number < num_points; number++){
+- *cPtr++ = (*aPtr++) + (*bPtr++);
+- }
++ number = halfPoints * 2;
++ for (; number < num_points; number++) {
++ *cPtr++ = (*aPtr++) + (*bPtr++);
++ }
+ }
+ #endif /* LV_HAVE_SSE */
+
+
+ #ifdef LV_HAVE_GENERIC
+
+-static inline void
+-volk_32fc_x2_add_32fc_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector,
+- const lv_32fc_t* bVector, unsigned int num_points)
++static inline void volk_32fc_x2_add_32fc_generic(lv_32fc_t* cVector,
++ const lv_32fc_t* aVector,
++ const lv_32fc_t* bVector,
++ unsigned int num_points)
+ {
+- lv_32fc_t* cPtr = cVector;
+- const lv_32fc_t* aPtr = aVector;
+- const lv_32fc_t* bPtr= bVector;
+- unsigned int number = 0;
+-
+- for(number = 0; number < num_points; number++){
+- *cPtr++ = (*aPtr++) + (*bPtr++);
+- }
++ lv_32fc_t* cPtr = cVector;
++ const lv_32fc_t* aPtr = aVector;
++ const lv_32fc_t* bPtr = bVector;
++ unsigned int number = 0;
++
++ for (number = 0; number < num_points; number++) {
++ *cPtr++ = (*aPtr++) + (*bPtr++);
++ }
+ }
+ #endif /* LV_HAVE_GENERIC */
+
+@@ -205,34 +212,36 @@ volk_32fc_x2_add_32fc_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector,
+ #ifdef LV_HAVE_SSE
+ #include <xmmintrin.h>
+
+-static inline void
+-volk_32fc_x2_add_32fc_a_sse(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points)
++static inline void volk_32fc_x2_add_32fc_a_sse(lv_32fc_t* cVector,
++ const lv_32fc_t* aVector,
++ const lv_32fc_t* bVector,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+- const unsigned int halfPoints = num_points / 2;
++ unsigned int number = 0;
++ const unsigned int halfPoints = num_points / 2;
+
+- lv_32fc_t* cPtr = cVector;
+- const lv_32fc_t* aPtr = aVector;
+- const lv_32fc_t* bPtr= bVector;
++ lv_32fc_t* cPtr = cVector;
++ const lv_32fc_t* aPtr = aVector;
++ const lv_32fc_t* bPtr = bVector;
+
+- __m128 aVal, bVal, cVal;
+- for(;number < halfPoints; number++){
+- aVal = _mm_load_ps((float *) aPtr);
+- bVal = _mm_load_ps((float *) bPtr);
++ __m128 aVal, bVal, cVal;
++ for (; number < halfPoints; number++) {
++ aVal = _mm_load_ps((float*)aPtr);
++ bVal = _mm_load_ps((float*)bPtr);
+
+- cVal = _mm_add_ps(aVal, bVal);
++ cVal = _mm_add_ps(aVal, bVal);
+
+- _mm_store_ps((float *) cPtr,cVal); // Store the results back into the C container
++ _mm_store_ps((float*)cPtr, cVal); // Store the results back into the C container
+
+- aPtr += 2;
+- bPtr += 2;
+- cPtr += 2;
+- }
++ aPtr += 2;
++ bPtr += 2;
++ cPtr += 2;
++ }
+
+- number = halfPoints * 2;
+- for(;number < num_points; number++){
+- *cPtr++ = (*aPtr++) + (*bPtr++);
+- }
++ number = halfPoints * 2;
++ for (; number < num_points; number++) {
++ *cPtr++ = (*aPtr++) + (*bPtr++);
++ }
+ }
+ #endif /* LV_HAVE_SSE */
+
+@@ -240,38 +249,39 @@ volk_32fc_x2_add_32fc_a_sse(lv_32fc_t* cVector, const lv_32fc_t* aVector, const
+ #ifdef LV_HAVE_NEON
+ #include <arm_neon.h>
+
+-static inline void
+-volk_32fc_x2_add_32fc_u_neon(lv_32fc_t* cVector, const lv_32fc_t* aVector,
+- const lv_32fc_t* bVector, unsigned int num_points)
++static inline void volk_32fc_x2_add_32fc_u_neon(lv_32fc_t* cVector,
++ const lv_32fc_t* aVector,
++ const lv_32fc_t* bVector,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+- const unsigned int halfPoints = num_points / 2;
+-
+- lv_32fc_t* cPtr = cVector;
+- const lv_32fc_t* aPtr = aVector;
+- const lv_32fc_t* bPtr= bVector;
+- float32x4_t aVal, bVal, cVal;
+- for(number=0; number < halfPoints; number++){
+- // Load in to NEON registers
+- aVal = vld1q_f32((const float32_t*)(aPtr));
+- bVal = vld1q_f32((const float32_t*)(bPtr));
+- __VOLK_PREFETCH(aPtr+2);
+- __VOLK_PREFETCH(bPtr+2);
+-
+- // vector add
+- cVal = vaddq_f32(aVal, bVal);
+- // Store the results back into the C container
+- vst1q_f32((float*)(cPtr),cVal);
+-
+- aPtr += 2; // q uses quadwords, 4 lv_32fc_ts per vadd
+- bPtr += 2;
+- cPtr += 2;
+- }
+-
+- number = halfPoints * 2; // should be = num_points
+- for(;number < num_points; number++){
+- *cPtr++ = (*aPtr++) + (*bPtr++);
+- }
++ unsigned int number = 0;
++ const unsigned int halfPoints = num_points / 2;
++
++ lv_32fc_t* cPtr = cVector;
++ const lv_32fc_t* aPtr = aVector;
++ const lv_32fc_t* bPtr = bVector;
++ float32x4_t aVal, bVal, cVal;
++ for (number = 0; number < halfPoints; number++) {
++ // Load in to NEON registers
++ aVal = vld1q_f32((const float32_t*)(aPtr));
++ bVal = vld1q_f32((const float32_t*)(bPtr));
++ __VOLK_PREFETCH(aPtr + 2);
++ __VOLK_PREFETCH(bPtr + 2);
++
++ // vector add
++ cVal = vaddq_f32(aVal, bVal);
++ // Store the results back into the C container
++ vst1q_f32((float*)(cPtr), cVal);
++
++ aPtr += 2; // q uses quadwords, 4 lv_32fc_ts per vadd
++ bPtr += 2;
++ cPtr += 2;
++ }
++
++ number = halfPoints * 2; // should be = num_points
++ for (; number < num_points; number++) {
++ *cPtr++ = (*aPtr++) + (*bPtr++);
++ }
+ }
+
+ #endif /* LV_HAVE_NEON */
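+
+ /* A minimal caller sketch for the kernel above, assuming the usual dispatcher
+  * name volk_32fc_x2_add_32fc generated for this kernel plus VOLK's
+  * aligned-allocation helpers volk_get_alignment(), volk_malloc() and
+  * volk_free():
+  *
+  *   unsigned int N = 16;
+  *   size_t alignment = volk_get_alignment();
+  *   lv_32fc_t* a = (lv_32fc_t*)volk_malloc(N * sizeof(lv_32fc_t), alignment);
+  *   lv_32fc_t* b = (lv_32fc_t*)volk_malloc(N * sizeof(lv_32fc_t), alignment);
+  *   lv_32fc_t* c = (lv_32fc_t*)volk_malloc(N * sizeof(lv_32fc_t), alignment);
+  *   for (unsigned int i = 0; i < N; i++) {
+  *       a[i] = lv_cmake((float)i, (float)i);
+  *       b[i] = lv_cmake(1.0f, -1.0f);
+  *   }
+  *   volk_32fc_x2_add_32fc(c, a, b, N); // c[i] = a[i] + b[i], element-wise
+  *   volk_free(a);
+  *   volk_free(b);
+  *   volk_free(c);
+  */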
+diff --git a/kernels/volk/volk_32fc_x2_conjugate_dot_prod_32fc.h b/kernels/volk/volk_32fc_x2_conjugate_dot_prod_32fc.h
+index 77432ec..0f69499 100644
+--- a/kernels/volk/volk_32fc_x2_conjugate_dot_prod_32fc.h
++++ b/kernels/volk/volk_32fc_x2_conjugate_dot_prod_32fc.h
+@@ -34,8 +34,8 @@
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+- * void volk_32fc_x2_conjugate_dot_prod_32fc(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points)
+- * \endcode
++ * void volk_32fc_x2_conjugate_dot_prod_32fc(lv_32fc_t* result, const lv_32fc_t* input,
++ * const lv_32fc_t* taps, unsigned int num_points) \endcode
+ *
+ * \b Inputs
+ * \li input: vector of complex floats.
+@@ -60,40 +60,44 @@
+ #define INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_u_H
+
+
+-#include<volk/volk_complex.h>
++#include <volk/volk_complex.h>
+
+
+ #ifdef LV_HAVE_GENERIC
+
+-static inline void volk_32fc_x2_conjugate_dot_prod_32fc_generic(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
++static inline void volk_32fc_x2_conjugate_dot_prod_32fc_generic(lv_32fc_t* result,
++ const lv_32fc_t* input,
++ const lv_32fc_t* taps,
++ unsigned int num_points)
++{
+
+- const unsigned int num_bytes = num_points*8;
++ const unsigned int num_bytes = num_points * 8;
+
+- float * res = (float*) result;
+- float * in = (float*) input;
+- float * tp = (float*) taps;
+- unsigned int n_2_ccomplex_blocks = num_bytes >> 4;
++ float* res = (float*)result;
++ float* in = (float*)input;
++ float* tp = (float*)taps;
++ unsigned int n_2_ccomplex_blocks = num_bytes >> 4;
+
+- float sum0[2] = {0,0};
+- float sum1[2] = {0,0};
+- unsigned int i = 0;
++ float sum0[2] = { 0, 0 };
++ float sum1[2] = { 0, 0 };
++ unsigned int i = 0;
+
+- for(i = 0; i < n_2_ccomplex_blocks; ++i) {
+- sum0[0] += in[0] * tp[0] + in[1] * tp[1];
+- sum0[1] += (-in[0] * tp[1]) + in[1] * tp[0];
+- sum1[0] += in[2] * tp[2] + in[3] * tp[3];
+- sum1[1] += (-in[2] * tp[3]) + in[3] * tp[2];
++ for (i = 0; i < n_2_ccomplex_blocks; ++i) {
++ sum0[0] += in[0] * tp[0] + in[1] * tp[1];
++ sum0[1] += (-in[0] * tp[1]) + in[1] * tp[0];
++ sum1[0] += in[2] * tp[2] + in[3] * tp[3];
++ sum1[1] += (-in[2] * tp[3]) + in[3] * tp[2];
+
+- in += 4;
+- tp += 4;
+- }
++ in += 4;
++ tp += 4;
++ }
+
+- res[0] = sum0[0] + sum1[0];
+- res[1] = sum0[1] + sum1[1];
++ res[0] = sum0[0] + sum1[0];
++ res[1] = sum0[1] + sum1[1];
+
+- if (num_bytes >> 3 & 1) {
+- *result += input[(num_bytes >> 3) - 1] * lv_conj(taps[(num_bytes >> 3) - 1]);
+- }
++ if (num_bytes >> 3 & 1) {
++ *result += input[(num_bytes >> 3) - 1] * lv_conj(taps[(num_bytes >> 3) - 1]);
++ }
+ }
+
+ #endif /*LV_HAVE_GENERIC*/
+@@ -103,125 +107,134 @@ static inline void volk_32fc_x2_conjugate_dot_prod_32fc_generic(lv_32fc_t* resul
+ #include <immintrin.h>
+
+ static inline void volk_32fc_x2_conjugate_dot_prod_32fc_u_avx(lv_32fc_t* result,
+- const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points)
++ const lv_32fc_t* input,
++ const lv_32fc_t* taps,
++ unsigned int num_points)
+ {
+- // Partial sums for indices i, i+1, i+2 and i+3.
+- __m256 sum_a_mult_b_real = _mm256_setzero_ps();
+- __m256 sum_a_mult_b_imag = _mm256_setzero_ps();
+-
+- for (long unsigned i = 0; i < (num_points & ~3u); i += 4) {
+- /* Four complex elements a time are processed.
+- * (ar + j⋅ai)*conj(br + j⋅bi) =
+- * ar⋅br + ai⋅bi + j⋅(ai⋅br − ar⋅bi)
+- */
++ // Partial sums for indices i, i+1, i+2 and i+3.
++ __m256 sum_a_mult_b_real = _mm256_setzero_ps();
++ __m256 sum_a_mult_b_imag = _mm256_setzero_ps();
++
++ for (long unsigned i = 0; i < (num_points & ~3u); i += 4) {
++ /* Four complex elements a time are processed.
++ * (ar + j⋅ai)*conj(br + j⋅bi) =
++ * ar⋅br + ai⋅bi + j⋅(ai⋅br − ar⋅bi)
++ */
++
++ /* Load input and taps, split and duplicate real und imaginary parts of taps.
++ * a: | ai,i+3 | ar,i+3 | … | ai,i+1 | ar,i+1 | ai,i+0 | ar,i+0 |
++ * b: | bi,i+3 | br,i+3 | … | bi,i+1 | br,i+1 | bi,i+0 | br,i+0 |
++ * b_real: | br,i+3 | br,i+3 | … | br,i+1 | br,i+1 | br,i+0 | br,i+0 |
++ * b_imag: | bi,i+3 | bi,i+3 | … | bi,i+1 | bi,i+1 | bi,i+0 | bi,i+0 |
++ */
++ __m256 a = _mm256_loadu_ps((const float*)&input[i]);
++ __m256 b = _mm256_loadu_ps((const float*)&taps[i]);
++ __m256 b_real = _mm256_moveldup_ps(b);
++ __m256 b_imag = _mm256_movehdup_ps(b);
++
++ // Add | ai⋅br,i+3 | ar⋅br,i+3 | … | ai⋅br,i+0 | ar⋅br,i+0 | to partial sum.
++ sum_a_mult_b_real = _mm256_add_ps(sum_a_mult_b_real, _mm256_mul_ps(a, b_real));
++ // Add | ai⋅bi,i+3 | −ar⋅bi,i+3 | … | ai⋅bi,i+0 | −ar⋅bi,i+0 | to partial sum.
++ sum_a_mult_b_imag = _mm256_addsub_ps(sum_a_mult_b_imag, _mm256_mul_ps(a, b_imag));
++ }
+
+- /* Load input and taps, split and duplicate real und imaginary parts of taps.
+- * a: | ai,i+3 | ar,i+3 | … | ai,i+1 | ar,i+1 | ai,i+0 | ar,i+0 |
+- * b: | bi,i+3 | br,i+3 | … | bi,i+1 | br,i+1 | bi,i+0 | br,i+0 |
+- * b_real: | br,i+3 | br,i+3 | … | br,i+1 | br,i+1 | br,i+0 | br,i+0 |
+- * b_imag: | bi,i+3 | bi,i+3 | … | bi,i+1 | bi,i+1 | bi,i+0 | bi,i+0 |
++ // Swap position of −ar⋅bi and ai⋅bi.
++ sum_a_mult_b_imag = _mm256_permute_ps(sum_a_mult_b_imag, _MM_SHUFFLE(2, 3, 0, 1));
++ // | ai⋅br + ai⋅bi | ai⋅br − ar⋅bi |, sum contains four such partial sums.
++ __m256 sum = _mm256_add_ps(sum_a_mult_b_real, sum_a_mult_b_imag);
++ /* Sum the four partial sums: Add high half of vector sum to the low one, i.e.
++ * s1 + s3 and s0 + s2 …
+ */
+- __m256 a = _mm256_loadu_ps((const float *) &input[i]);
+- __m256 b = _mm256_loadu_ps((const float *) &taps[i]);
+- __m256 b_real = _mm256_moveldup_ps(b);
+- __m256 b_imag = _mm256_movehdup_ps(b);
+-
+- // Add | ai⋅br,i+3 | ar⋅br,i+3 | … | ai⋅br,i+0 | ar⋅br,i+0 | to partial sum.
+- sum_a_mult_b_real = _mm256_add_ps(sum_a_mult_b_real, _mm256_mul_ps(a, b_real));
+- // Add | ai⋅bi,i+3 | −ar⋅bi,i+3 | … | ai⋅bi,i+0 | −ar⋅bi,i+0 | to partial sum.
+- sum_a_mult_b_imag = _mm256_addsub_ps(sum_a_mult_b_imag, _mm256_mul_ps(a, b_imag));
+- }
+-
+- // Swap position of −ar⋅bi and ai⋅bi.
+- sum_a_mult_b_imag = _mm256_permute_ps(sum_a_mult_b_imag, _MM_SHUFFLE(2, 3, 0, 1));
+- // | ai⋅br + ai⋅bi | ai⋅br − ar⋅bi |, sum contains four such partial sums.
+- __m256 sum = _mm256_add_ps(sum_a_mult_b_real, sum_a_mult_b_imag);
+- /* Sum the four partial sums: Add high half of vector sum to the low one, i.e.
+- * s1 + s3 and s0 + s2 …
+- */
+- sum = _mm256_add_ps(sum, _mm256_permute2f128_ps(sum, sum, 0x01));
+- // … and now (s0 + s2) + (s1 + s3)
+- sum = _mm256_add_ps(sum, _mm256_permute_ps(sum, _MM_SHUFFLE(1, 0, 3, 2)));
+- // Store result.
+- __m128 lower = _mm256_extractf128_ps(sum, 0);
+- _mm_storel_pi((__m64 *) result, lower);
+-
+- // Handle the last elements if num_points mod 4 is bigger than 0.
+- for (long unsigned i = num_points & ~3u; i < num_points; ++i) {
+- *result += lv_cmake(
+- lv_creal(input[i]) * lv_creal(taps[i]) + lv_cimag(input[i]) * lv_cimag(taps[i]),
+- lv_cimag(input[i]) * lv_creal(taps[i]) - lv_creal(input[i]) * lv_cimag(taps[i]));
+- }
++ sum = _mm256_add_ps(sum, _mm256_permute2f128_ps(sum, sum, 0x01));
++ // … and now (s0 + s2) + (s1 + s3)
++ sum = _mm256_add_ps(sum, _mm256_permute_ps(sum, _MM_SHUFFLE(1, 0, 3, 2)));
++ // Store result.
++ __m128 lower = _mm256_extractf128_ps(sum, 0);
++ _mm_storel_pi((__m64*)result, lower);
++
++ // Handle the last elements if num_points mod 4 is bigger than 0.
++ for (long unsigned i = num_points & ~3u; i < num_points; ++i) {
++ *result += lv_cmake(lv_creal(input[i]) * lv_creal(taps[i]) +
++ lv_cimag(input[i]) * lv_cimag(taps[i]),
++ lv_cimag(input[i]) * lv_creal(taps[i]) -
++ lv_creal(input[i]) * lv_cimag(taps[i]));
++ }
+ }
+
+ #endif /* LV_HAVE_AVX */
+
+ #ifdef LV_HAVE_SSE3
+
+-#include <xmmintrin.h>
+ #include <pmmintrin.h>
++#include <xmmintrin.h>
+
+ static inline void volk_32fc_x2_conjugate_dot_prod_32fc_u_sse3(lv_32fc_t* result,
+- const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points)
++ const lv_32fc_t* input,
++ const lv_32fc_t* taps,
++ unsigned int num_points)
+ {
+- // Partial sums for indices i and i+1.
+- __m128 sum_a_mult_b_real = _mm_setzero_ps();
+- __m128 sum_a_mult_b_imag = _mm_setzero_ps();
+-
+- for (long unsigned i = 0; i < (num_points & ~1u); i += 2) {
+- /* Two complex elements a time are processed.
+- * (ar + j⋅ai)*conj(br + j⋅bi) =
+- * ar⋅br + ai⋅bi + j⋅(ai⋅br − ar⋅bi)
+- */
++ // Partial sums for indices i and i+1.
++ __m128 sum_a_mult_b_real = _mm_setzero_ps();
++ __m128 sum_a_mult_b_imag = _mm_setzero_ps();
++
++ for (long unsigned i = 0; i < (num_points & ~1u); i += 2) {
++ /* Two complex elements a time are processed.
++ * (ar + j⋅ai)*conj(br + j⋅bi) =
++ * ar⋅br + ai⋅bi + j⋅(ai⋅br − ar⋅bi)
++ */
++
++ /* Load input and taps, split and duplicate real und imaginary parts of taps.
++ * a: | ai,i+1 | ar,i+1 | ai,i+0 | ar,i+0 |
++ * b: | bi,i+1 | br,i+1 | bi,i+0 | br,i+0 |
++ * b_real: | br,i+1 | br,i+1 | br,i+0 | br,i+0 |
++ * b_imag: | bi,i+1 | bi,i+1 | bi,i+0 | bi,i+0 |
++ */
++ __m128 a = _mm_loadu_ps((const float*)&input[i]);
++ __m128 b = _mm_loadu_ps((const float*)&taps[i]);
++ __m128 b_real = _mm_moveldup_ps(b);
++ __m128 b_imag = _mm_movehdup_ps(b);
++
++ // Add | ai⋅br,i+1 | ar⋅br,i+1 | ai⋅br,i+0 | ar⋅br,i+0 | to partial sum.
++ sum_a_mult_b_real = _mm_add_ps(sum_a_mult_b_real, _mm_mul_ps(a, b_real));
++ // Add | ai⋅bi,i+1 | −ar⋅bi,i+1 | ai⋅bi,i+0 | −ar⋅bi,i+0 | to partial sum.
++ sum_a_mult_b_imag = _mm_addsub_ps(sum_a_mult_b_imag, _mm_mul_ps(a, b_imag));
++ }
+
+- /* Load input and taps, split and duplicate real und imaginary parts of taps.
+- * a: | ai,i+1 | ar,i+1 | ai,i+0 | ar,i+0 |
+- * b: | bi,i+1 | br,i+1 | bi,i+0 | br,i+0 |
+- * b_real: | br,i+1 | br,i+1 | br,i+0 | br,i+0 |
+- * b_imag: | bi,i+1 | bi,i+1 | bi,i+0 | bi,i+0 |
+- */
+- __m128 a = _mm_loadu_ps((const float *) &input[i]);
+- __m128 b = _mm_loadu_ps((const float *) &taps[i]);
+- __m128 b_real = _mm_moveldup_ps(b);
+- __m128 b_imag = _mm_movehdup_ps(b);
+-
+- // Add | ai⋅br,i+1 | ar⋅br,i+1 | ai⋅br,i+0 | ar⋅br,i+0 | to partial sum.
+- sum_a_mult_b_real = _mm_add_ps(sum_a_mult_b_real, _mm_mul_ps(a, b_real));
+- // Add | ai⋅bi,i+1 | −ar⋅bi,i+1 | ai⋅bi,i+0 | −ar⋅bi,i+0 | to partial sum.
+- sum_a_mult_b_imag = _mm_addsub_ps(sum_a_mult_b_imag, _mm_mul_ps(a, b_imag));
+- }
+-
+- // Swap position of −ar⋅bi and ai⋅bi.
+- sum_a_mult_b_imag = _mm_shuffle_ps(sum_a_mult_b_imag, sum_a_mult_b_imag,
+- _MM_SHUFFLE(2, 3, 0, 1));
+- // | ai⋅br + ai⋅bi | ai⋅br − ar⋅bi |, sum contains two such partial sums.
+- __m128 sum = _mm_add_ps(sum_a_mult_b_real, sum_a_mult_b_imag);
+- // Sum the two partial sums.
+- sum = _mm_add_ps(sum, _mm_shuffle_ps(sum, sum, _MM_SHUFFLE(1, 0, 3, 2)));
+- // Store result.
+- _mm_storel_pi((__m64 *) result, sum);
+-
+- // Handle the last element if num_points mod 2 is 1.
+- if (num_points & 1u) {
+- *result += lv_cmake(
+- lv_creal(input[num_points - 1]) * lv_creal(taps[num_points - 1]) +
+- lv_cimag(input[num_points - 1]) * lv_cimag(taps[num_points - 1]),
+- lv_cimag(input[num_points - 1]) * lv_creal(taps[num_points - 1]) -
+- lv_creal(input[num_points - 1]) * lv_cimag(taps[num_points - 1]));
+- }
++ // Swap position of −ar⋅bi and ai⋅bi.
++ sum_a_mult_b_imag =
++ _mm_shuffle_ps(sum_a_mult_b_imag, sum_a_mult_b_imag, _MM_SHUFFLE(2, 3, 0, 1));
++ // | ai⋅br + ai⋅bi | ai⋅br − ar⋅bi |, sum contains two such partial sums.
++ __m128 sum = _mm_add_ps(sum_a_mult_b_real, sum_a_mult_b_imag);
++ // Sum the two partial sums.
++ sum = _mm_add_ps(sum, _mm_shuffle_ps(sum, sum, _MM_SHUFFLE(1, 0, 3, 2)));
++ // Store result.
++ _mm_storel_pi((__m64*)result, sum);
++
++ // Handle the last element if num_points mod 2 is 1.
++ if (num_points & 1u) {
++ *result += lv_cmake(
++ lv_creal(input[num_points - 1]) * lv_creal(taps[num_points - 1]) +
++ lv_cimag(input[num_points - 1]) * lv_cimag(taps[num_points - 1]),
++ lv_cimag(input[num_points - 1]) * lv_creal(taps[num_points - 1]) -
++ lv_creal(input[num_points - 1]) * lv_cimag(taps[num_points - 1]));
++ }
+ }
+
+ #endif /*LV_HAVE_SSE3*/
+
+ #ifdef LV_HAVE_NEON
+ #include <arm_neon.h>
+-static inline void volk_32fc_x2_conjugate_dot_prod_32fc_neon(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
++static inline void volk_32fc_x2_conjugate_dot_prod_32fc_neon(lv_32fc_t* result,
++ const lv_32fc_t* input,
++ const lv_32fc_t* taps,
++ unsigned int num_points)
++{
+
+ unsigned int quarter_points = num_points / 4;
+ unsigned int number;
+
+- lv_32fc_t* a_ptr = (lv_32fc_t*) taps;
+- lv_32fc_t* b_ptr = (lv_32fc_t*) input;
++ lv_32fc_t* a_ptr = (lv_32fc_t*)taps;
++ lv_32fc_t* b_ptr = (lv_32fc_t*)input;
+ // for 2-lane vectors, 1st lane holds the real part,
+ // 2nd lane holds the imaginary part
+ float32x4x2_t a_val, b_val, accumulator;
+@@ -229,11 +242,11 @@ static inline void volk_32fc_x2_conjugate_dot_prod_32fc_neon(lv_32fc_t* result,
+ accumulator.val[0] = vdupq_n_f32(0);
+ accumulator.val[1] = vdupq_n_f32(0);
+
+- for(number = 0; number < quarter_points; ++number) {
++ for (number = 0; number < quarter_points; ++number) {
+ a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
+ b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
+- __VOLK_PREFETCH(a_ptr+8);
+- __VOLK_PREFETCH(b_ptr+8);
++ __VOLK_PREFETCH(a_ptr + 8);
++ __VOLK_PREFETCH(b_ptr + 8);
+
+ // do the first multiply
+ tmp_imag.val[1] = vmulq_f32(a_val.val[1], b_val.val[0]);
+@@ -255,11 +268,10 @@ static inline void volk_32fc_x2_conjugate_dot_prod_32fc_neon(lv_32fc_t* result,
+ *result = accum_result[0] + accum_result[1] + accum_result[2] + accum_result[3];
+
+ // tail case
+- for(number = quarter_points*4; number < num_points; ++number) {
+- *result += (*a_ptr++) * lv_conj(*b_ptr++);
++ for (number = quarter_points * 4; number < num_points; ++number) {
++ *result += (*a_ptr++) * lv_conj(*b_ptr++);
+ }
+ *result = lv_conj(*result);
+-
+ }
+ #endif /*LV_HAVE_NEON*/
+
+@@ -268,120 +280,125 @@ static inline void volk_32fc_x2_conjugate_dot_prod_32fc_neon(lv_32fc_t* result,
+ #ifndef INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_a_H
+ #define INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_a_H
+
++#include <stdio.h>
+ #include <volk/volk_common.h>
+-#include<volk/volk_complex.h>
+-#include<stdio.h>
++#include <volk/volk_complex.h>
+
+
+ #ifdef LV_HAVE_AVX
+ #include <immintrin.h>
+
+ static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a_avx(lv_32fc_t* result,
+- const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points)
++ const lv_32fc_t* input,
++ const lv_32fc_t* taps,
++ unsigned int num_points)
+ {
+- // Partial sums for indices i, i+1, i+2 and i+3.
+- __m256 sum_a_mult_b_real = _mm256_setzero_ps();
+- __m256 sum_a_mult_b_imag = _mm256_setzero_ps();
+-
+- for (long unsigned i = 0; i < (num_points & ~3u); i += 4) {
+- /* Four complex elements a time are processed.
+- * (ar + j⋅ai)*conj(br + j⋅bi) =
+- * ar⋅br + ai⋅bi + j⋅(ai⋅br − ar⋅bi)
+- */
++ // Partial sums for indices i, i+1, i+2 and i+3.
++ __m256 sum_a_mult_b_real = _mm256_setzero_ps();
++ __m256 sum_a_mult_b_imag = _mm256_setzero_ps();
++
++ for (long unsigned i = 0; i < (num_points & ~3u); i += 4) {
++ /* Four complex elements a time are processed.
++ * (ar + j⋅ai)*conj(br + j⋅bi) =
++ * ar⋅br + ai⋅bi + j⋅(ai⋅br − ar⋅bi)
++ */
++
++ /* Load input and taps, split and duplicate real und imaginary parts of taps.
++ * a: | ai,i+3 | ar,i+3 | … | ai,i+1 | ar,i+1 | ai,i+0 | ar,i+0 |
++ * b: | bi,i+3 | br,i+3 | … | bi,i+1 | br,i+1 | bi,i+0 | br,i+0 |
++ * b_real: | br,i+3 | br,i+3 | … | br,i+1 | br,i+1 | br,i+0 | br,i+0 |
++ * b_imag: | bi,i+3 | bi,i+3 | … | bi,i+1 | bi,i+1 | bi,i+0 | bi,i+0 |
++ */
++ __m256 a = _mm256_load_ps((const float*)&input[i]);
++ __m256 b = _mm256_load_ps((const float*)&taps[i]);
++ __m256 b_real = _mm256_moveldup_ps(b);
++ __m256 b_imag = _mm256_movehdup_ps(b);
++
++ // Add | ai⋅br,i+3 | ar⋅br,i+3 | … | ai⋅br,i+0 | ar⋅br,i+0 | to partial sum.
++ sum_a_mult_b_real = _mm256_add_ps(sum_a_mult_b_real, _mm256_mul_ps(a, b_real));
++ // Add | ai⋅bi,i+3 | −ar⋅bi,i+3 | … | ai⋅bi,i+0 | −ar⋅bi,i+0 | to partial sum.
++ sum_a_mult_b_imag = _mm256_addsub_ps(sum_a_mult_b_imag, _mm256_mul_ps(a, b_imag));
++ }
+
+- /* Load input and taps, split and duplicate real und imaginary parts of taps.
+- * a: | ai,i+3 | ar,i+3 | … | ai,i+1 | ar,i+1 | ai,i+0 | ar,i+0 |
+- * b: | bi,i+3 | br,i+3 | … | bi,i+1 | br,i+1 | bi,i+0 | br,i+0 |
+- * b_real: | br,i+3 | br,i+3 | … | br,i+1 | br,i+1 | br,i+0 | br,i+0 |
+- * b_imag: | bi,i+3 | bi,i+3 | … | bi,i+1 | bi,i+1 | bi,i+0 | bi,i+0 |
++ // Swap position of −ar⋅bi and ai⋅bi.
++ sum_a_mult_b_imag = _mm256_permute_ps(sum_a_mult_b_imag, _MM_SHUFFLE(2, 3, 0, 1));
++ // | ai⋅br + ai⋅bi | ai⋅br − ar⋅bi |, sum contains four such partial sums.
++ __m256 sum = _mm256_add_ps(sum_a_mult_b_real, sum_a_mult_b_imag);
++ /* Sum the four partial sums: Add high half of vector sum to the low one, i.e.
++ * s1 + s3 and s0 + s2 …
+ */
+- __m256 a = _mm256_load_ps((const float *) &input[i]);
+- __m256 b = _mm256_load_ps((const float *) &taps[i]);
+- __m256 b_real = _mm256_moveldup_ps(b);
+- __m256 b_imag = _mm256_movehdup_ps(b);
+-
+- // Add | ai⋅br,i+3 | ar⋅br,i+3 | … | ai⋅br,i+0 | ar⋅br,i+0 | to partial sum.
+- sum_a_mult_b_real = _mm256_add_ps(sum_a_mult_b_real, _mm256_mul_ps(a, b_real));
+- // Add | ai⋅bi,i+3 | −ar⋅bi,i+3 | … | ai⋅bi,i+0 | −ar⋅bi,i+0 | to partial sum.
+- sum_a_mult_b_imag = _mm256_addsub_ps(sum_a_mult_b_imag, _mm256_mul_ps(a, b_imag));
+- }
+-
+- // Swap position of −ar⋅bi and ai⋅bi.
+- sum_a_mult_b_imag = _mm256_permute_ps(sum_a_mult_b_imag, _MM_SHUFFLE(2, 3, 0, 1));
+- // | ai⋅br + ai⋅bi | ai⋅br − ar⋅bi |, sum contains four such partial sums.
+- __m256 sum = _mm256_add_ps(sum_a_mult_b_real, sum_a_mult_b_imag);
+- /* Sum the four partial sums: Add high half of vector sum to the low one, i.e.
+- * s1 + s3 and s0 + s2 …
+- */
+- sum = _mm256_add_ps(sum, _mm256_permute2f128_ps(sum, sum, 0x01));
+- // … and now (s0 + s2) + (s1 + s3)
+- sum = _mm256_add_ps(sum, _mm256_permute_ps(sum, _MM_SHUFFLE(1, 0, 3, 2)));
+- // Store result.
+- __m128 lower = _mm256_extractf128_ps(sum, 0);
+- _mm_storel_pi((__m64 *) result, lower);
+-
+- // Handle the last elements if num_points mod 4 is bigger than 0.
+- for (long unsigned i = num_points & ~3u; i < num_points; ++i) {
+- *result += lv_cmake(
+- lv_creal(input[i]) * lv_creal(taps[i]) + lv_cimag(input[i]) * lv_cimag(taps[i]),
+- lv_cimag(input[i]) * lv_creal(taps[i]) - lv_creal(input[i]) * lv_cimag(taps[i]));
+- }
++ sum = _mm256_add_ps(sum, _mm256_permute2f128_ps(sum, sum, 0x01));
++ // … and now (s0 + s2) + (s1 + s3)
++ sum = _mm256_add_ps(sum, _mm256_permute_ps(sum, _MM_SHUFFLE(1, 0, 3, 2)));
++ // Store result.
++ __m128 lower = _mm256_extractf128_ps(sum, 0);
++ _mm_storel_pi((__m64*)result, lower);
++
++ // Handle the last elements if num_points mod 4 is bigger than 0.
++ for (long unsigned i = num_points & ~3u; i < num_points; ++i) {
++ *result += lv_cmake(lv_creal(input[i]) * lv_creal(taps[i]) +
++ lv_cimag(input[i]) * lv_cimag(taps[i]),
++ lv_cimag(input[i]) * lv_creal(taps[i]) -
++ lv_creal(input[i]) * lv_cimag(taps[i]));
++ }
+ }
+ #endif /* LV_HAVE_AVX */
+
+ #ifdef LV_HAVE_SSE3
+
+-#include <xmmintrin.h>
+ #include <pmmintrin.h>
++#include <xmmintrin.h>
+
+ static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a_sse3(lv_32fc_t* result,
+- const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points)
++ const lv_32fc_t* input,
++ const lv_32fc_t* taps,
++ unsigned int num_points)
+ {
+- // Partial sums for indices i and i+1.
+- __m128 sum_a_mult_b_real = _mm_setzero_ps();
+- __m128 sum_a_mult_b_imag = _mm_setzero_ps();
+-
+- for (long unsigned i = 0; i < (num_points & ~1u); i += 2) {
+- /* Two complex elements a time are processed.
+- * (ar + j⋅ai)*conj(br + j⋅bi) =
+- * ar⋅br + ai⋅bi + j⋅(ai⋅br − ar⋅bi)
+- */
++ // Partial sums for indices i and i+1.
++ __m128 sum_a_mult_b_real = _mm_setzero_ps();
++ __m128 sum_a_mult_b_imag = _mm_setzero_ps();
++
++ for (long unsigned i = 0; i < (num_points & ~1u); i += 2) {
++ /* Two complex elements a time are processed.
++ * (ar + j⋅ai)*conj(br + j⋅bi) =
++ * ar⋅br + ai⋅bi + j⋅(ai⋅br − ar⋅bi)
++ */
++
++ /* Load input and taps, split and duplicate real und imaginary parts of taps.
++ * a: | ai,i+1 | ar,i+1 | ai,i+0 | ar,i+0 |
++ * b: | bi,i+1 | br,i+1 | bi,i+0 | br,i+0 |
++ * b_real: | br,i+1 | br,i+1 | br,i+0 | br,i+0 |
++ * b_imag: | bi,i+1 | bi,i+1 | bi,i+0 | bi,i+0 |
++ */
++ __m128 a = _mm_load_ps((const float*)&input[i]);
++ __m128 b = _mm_load_ps((const float*)&taps[i]);
++ __m128 b_real = _mm_moveldup_ps(b);
++ __m128 b_imag = _mm_movehdup_ps(b);
++
++ // Add | ai⋅br,i+1 | ar⋅br,i+1 | ai⋅br,i+0 | ar⋅br,i+0 | to partial sum.
++ sum_a_mult_b_real = _mm_add_ps(sum_a_mult_b_real, _mm_mul_ps(a, b_real));
++ // Add | ai⋅bi,i+1 | −ar⋅bi,i+1 | ai⋅bi,i+0 | −ar⋅bi,i+0 | to partial sum.
++ sum_a_mult_b_imag = _mm_addsub_ps(sum_a_mult_b_imag, _mm_mul_ps(a, b_imag));
++ }
+
+- /* Load input and taps, split and duplicate real und imaginary parts of taps.
+- * a: | ai,i+1 | ar,i+1 | ai,i+0 | ar,i+0 |
+- * b: | bi,i+1 | br,i+1 | bi,i+0 | br,i+0 |
+- * b_real: | br,i+1 | br,i+1 | br,i+0 | br,i+0 |
+- * b_imag: | bi,i+1 | bi,i+1 | bi,i+0 | bi,i+0 |
+- */
+- __m128 a = _mm_load_ps((const float *) &input[i]);
+- __m128 b = _mm_load_ps((const float *) &taps[i]);
+- __m128 b_real = _mm_moveldup_ps(b);
+- __m128 b_imag = _mm_movehdup_ps(b);
+-
+- // Add | ai⋅br,i+1 | ar⋅br,i+1 | ai⋅br,i+0 | ar⋅br,i+0 | to partial sum.
+- sum_a_mult_b_real = _mm_add_ps(sum_a_mult_b_real, _mm_mul_ps(a, b_real));
+- // Add | ai⋅bi,i+1 | −ar⋅bi,i+1 | ai⋅bi,i+0 | −ar⋅bi,i+0 | to partial sum.
+- sum_a_mult_b_imag = _mm_addsub_ps(sum_a_mult_b_imag, _mm_mul_ps(a, b_imag));
+- }
+-
+- // Swap position of −ar⋅bi and ai⋅bi.
+- sum_a_mult_b_imag = _mm_shuffle_ps(sum_a_mult_b_imag, sum_a_mult_b_imag,
+- _MM_SHUFFLE(2, 3, 0, 1));
+- // | ai⋅br + ai⋅bi | ai⋅br − ar⋅bi |, sum contains two such partial sums.
+- __m128 sum = _mm_add_ps(sum_a_mult_b_real, sum_a_mult_b_imag);
+- // Sum the two partial sums.
+- sum = _mm_add_ps(sum, _mm_shuffle_ps(sum, sum, _MM_SHUFFLE(1, 0, 3, 2)));
+- // Store result.
+- _mm_storel_pi((__m64 *) result, sum);
+-
+- // Handle the last element if num_points mod 2 is 1.
+- if (num_points & 1u) {
+- *result += lv_cmake(
+- lv_creal(input[num_points - 1]) * lv_creal(taps[num_points - 1]) +
+- lv_cimag(input[num_points - 1]) * lv_cimag(taps[num_points - 1]),
+- lv_cimag(input[num_points - 1]) * lv_creal(taps[num_points - 1]) -
+- lv_creal(input[num_points - 1]) * lv_cimag(taps[num_points - 1]));
+- }
++ // Swap position of −ar⋅bi and ai⋅bi.
++ sum_a_mult_b_imag =
++ _mm_shuffle_ps(sum_a_mult_b_imag, sum_a_mult_b_imag, _MM_SHUFFLE(2, 3, 0, 1));
++ // | ai⋅br + ai⋅bi | ai⋅br − ar⋅bi |, sum contains two such partial sums.
++ __m128 sum = _mm_add_ps(sum_a_mult_b_real, sum_a_mult_b_imag);
++ // Sum the two partial sums.
++ sum = _mm_add_ps(sum, _mm_shuffle_ps(sum, sum, _MM_SHUFFLE(1, 0, 3, 2)));
++ // Store result.
++ _mm_storel_pi((__m64*)result, sum);
++
++ // Handle the last element if num_points mod 2 is 1.
++ if (num_points & 1u) {
++ *result += lv_cmake(
++ lv_creal(input[num_points - 1]) * lv_creal(taps[num_points - 1]) +
++ lv_cimag(input[num_points - 1]) * lv_cimag(taps[num_points - 1]),
++ lv_cimag(input[num_points - 1]) * lv_creal(taps[num_points - 1]) -
++ lv_creal(input[num_points - 1]) * lv_cimag(taps[num_points - 1]));
++ }
+ }
+
+ #endif /*LV_HAVE_SSE3*/
+@@ -390,35 +407,39 @@ static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a_sse3(lv_32fc_t* result
+ #ifdef LV_HAVE_GENERIC
+
+
+-static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a_generic(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
++static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a_generic(lv_32fc_t* result,
++ const lv_32fc_t* input,
++ const lv_32fc_t* taps,
++ unsigned int num_points)
++{
+
+- const unsigned int num_bytes = num_points*8;
++ const unsigned int num_bytes = num_points * 8;
+
+- float * res = (float*) result;
+- float * in = (float*) input;
+- float * tp = (float*) taps;
+- unsigned int n_2_ccomplex_blocks = num_bytes >> 4;
++ float* res = (float*)result;
++ float* in = (float*)input;
++ float* tp = (float*)taps;
++ unsigned int n_2_ccomplex_blocks = num_bytes >> 4;
+
+- float sum0[2] = {0,0};
+- float sum1[2] = {0,0};
+- unsigned int i = 0;
++ float sum0[2] = { 0, 0 };
++ float sum1[2] = { 0, 0 };
++ unsigned int i = 0;
+
+- for(i = 0; i < n_2_ccomplex_blocks; ++i) {
+- sum0[0] += in[0] * tp[0] + in[1] * tp[1];
+- sum0[1] += (-in[0] * tp[1]) + in[1] * tp[0];
+- sum1[0] += in[2] * tp[2] + in[3] * tp[3];
+- sum1[1] += (-in[2] * tp[3]) + in[3] * tp[2];
++ for (i = 0; i < n_2_ccomplex_blocks; ++i) {
++ sum0[0] += in[0] * tp[0] + in[1] * tp[1];
++ sum0[1] += (-in[0] * tp[1]) + in[1] * tp[0];
++ sum1[0] += in[2] * tp[2] + in[3] * tp[3];
++ sum1[1] += (-in[2] * tp[3]) + in[3] * tp[2];
+
+- in += 4;
+- tp += 4;
+- }
++ in += 4;
++ tp += 4;
++ }
+
+- res[0] = sum0[0] + sum1[0];
+- res[1] = sum0[1] + sum1[1];
++ res[0] = sum0[0] + sum1[0];
++ res[1] = sum0[1] + sum1[1];
+
+- if (num_bytes >> 3 & 1) {
+- *result += input[(num_bytes >> 3) - 1] * lv_conj(taps[(num_bytes >> 3) - 1]);
+- }
++ if (num_bytes >> 3 & 1) {
++ *result += input[(num_bytes >> 3) - 1] * lv_conj(taps[(num_bytes >> 3) - 1]);
++ }
+ }
+
+ #endif /*LV_HAVE_GENERIC*/
+@@ -426,256 +447,276 @@ static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a_generic(lv_32fc_t* res
+
+ #if LV_HAVE_SSE && LV_HAVE_64
+
+-static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a_sse(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
+-
+- const unsigned int num_bytes = num_points*8;
+-
+- __VOLK_ATTR_ALIGNED(16) static const uint32_t conjugator[4]= {0x00000000, 0x80000000, 0x00000000, 0x80000000};
+-
+- __VOLK_ASM __VOLK_VOLATILE
+- (
+- "# ccomplex_conjugate_dotprod_generic (float* result, const float *input,\n\t"
+- "# const float *taps, unsigned num_bytes)\n\t"
+- "# float sum0 = 0;\n\t"
+- "# float sum1 = 0;\n\t"
+- "# float sum2 = 0;\n\t"
+- "# float sum3 = 0;\n\t"
+- "# do {\n\t"
+- "# sum0 += input[0] * taps[0] - input[1] * taps[1];\n\t"
+- "# sum1 += input[0] * taps[1] + input[1] * taps[0];\n\t"
+- "# sum2 += input[2] * taps[2] - input[3] * taps[3];\n\t"
+- "# sum3 += input[2] * taps[3] + input[3] * taps[2];\n\t"
+- "# input += 4;\n\t"
+- "# taps += 4; \n\t"
+- "# } while (--n_2_ccomplex_blocks != 0);\n\t"
+- "# result[0] = sum0 + sum2;\n\t"
+- "# result[1] = sum1 + sum3;\n\t"
+- "# TODO: prefetch and better scheduling\n\t"
+- " xor %%r9, %%r9\n\t"
+- " xor %%r10, %%r10\n\t"
+- " movq %[conjugator], %%r9\n\t"
+- " movq %%rcx, %%rax\n\t"
+- " movaps 0(%%r9), %%xmm8\n\t"
+- " movq %%rcx, %%r8\n\t"
+- " movq %[rsi], %%r9\n\t"
+- " movq %[rdx], %%r10\n\t"
+- " xorps %%xmm6, %%xmm6 # zero accumulators\n\t"
+- " movaps 0(%%r9), %%xmm0\n\t"
+- " xorps %%xmm7, %%xmm7 # zero accumulators\n\t"
+- " movups 0(%%r10), %%xmm2\n\t"
+- " shr $5, %%rax # rax = n_2_ccomplex_blocks / 2\n\t"
+- " shr $4, %%r8\n\t"
+- " xorps %%xmm8, %%xmm2\n\t"
+- " jmp .%=L1_test\n\t"
+- " # 4 taps / loop\n\t"
+- " # something like ?? cycles / loop\n\t"
+- ".%=Loop1: \n\t"
+- "# complex prod: C += A * B, w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t"
+- "# movaps (%%r9), %%xmmA\n\t"
+- "# movaps (%%r10), %%xmmB\n\t"
+- "# movaps %%xmmA, %%xmmZ\n\t"
+- "# shufps $0xb1, %%xmmZ, %%xmmZ # swap internals\n\t"
+- "# mulps %%xmmB, %%xmmA\n\t"
+- "# mulps %%xmmZ, %%xmmB\n\t"
+- "# # SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t"
+- "# xorps %%xmmPN, %%xmmA\n\t"
+- "# movaps %%xmmA, %%xmmZ\n\t"
+- "# unpcklps %%xmmB, %%xmmA\n\t"
+- "# unpckhps %%xmmB, %%xmmZ\n\t"
+- "# movaps %%xmmZ, %%xmmY\n\t"
+- "# shufps $0x44, %%xmmA, %%xmmZ # b01000100\n\t"
+- "# shufps $0xee, %%xmmY, %%xmmA # b11101110\n\t"
+- "# addps %%xmmZ, %%xmmA\n\t"
+- "# addps %%xmmA, %%xmmC\n\t"
+- "# A=xmm0, B=xmm2, Z=xmm4\n\t"
+- "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t"
+- " movaps 16(%%r9), %%xmm1\n\t"
+- " movaps %%xmm0, %%xmm4\n\t"
+- " mulps %%xmm2, %%xmm0\n\t"
+- " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t"
+- " movaps 16(%%r10), %%xmm3\n\t"
+- " movaps %%xmm1, %%xmm5\n\t"
+- " xorps %%xmm8, %%xmm3\n\t"
+- " addps %%xmm0, %%xmm6\n\t"
+- " mulps %%xmm3, %%xmm1\n\t"
+- " shufps $0xb1, %%xmm5, %%xmm5 # swap internals\n\t"
+- " addps %%xmm1, %%xmm6\n\t"
+- " mulps %%xmm4, %%xmm2\n\t"
+- " movaps 32(%%r9), %%xmm0\n\t"
+- " addps %%xmm2, %%xmm7\n\t"
+- " mulps %%xmm5, %%xmm3\n\t"
+- " add $32, %%r9\n\t"
+- " movaps 32(%%r10), %%xmm2\n\t"
+- " addps %%xmm3, %%xmm7\n\t"
+- " add $32, %%r10\n\t"
+- " xorps %%xmm8, %%xmm2\n\t"
+- ".%=L1_test:\n\t"
+- " dec %%rax\n\t"
+- " jge .%=Loop1\n\t"
+- " # We've handled the bulk of multiplies up to here.\n\t"
+- " # Let's sse if original n_2_ccomplex_blocks was odd.\n\t"
+- " # If so, we've got 2 more taps to do.\n\t"
+- " and $1, %%r8\n\t"
+- " je .%=Leven\n\t"
+- " # The count was odd, do 2 more taps.\n\t"
+- " # Note that we've already got mm0/mm2 preloaded\n\t"
+- " # from the main loop.\n\t"
+- " movaps %%xmm0, %%xmm4\n\t"
+- " mulps %%xmm2, %%xmm0\n\t"
+- " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t"
+- " addps %%xmm0, %%xmm6\n\t"
+- " mulps %%xmm4, %%xmm2\n\t"
+- " addps %%xmm2, %%xmm7\n\t"
+- ".%=Leven:\n\t"
+- " # neg inversor\n\t"
+- " xorps %%xmm1, %%xmm1\n\t"
+- " mov $0x80000000, %%r9\n\t"
+- " movd %%r9, %%xmm1\n\t"
+- " shufps $0x11, %%xmm1, %%xmm1 # b00010001 # 0 -0 0 -0\n\t"
+- " # pfpnacc\n\t"
+- " xorps %%xmm1, %%xmm6\n\t"
+- " movaps %%xmm6, %%xmm2\n\t"
+- " unpcklps %%xmm7, %%xmm6\n\t"
+- " unpckhps %%xmm7, %%xmm2\n\t"
+- " movaps %%xmm2, %%xmm3\n\t"
+- " shufps $0x44, %%xmm6, %%xmm2 # b01000100\n\t"
+- " shufps $0xee, %%xmm3, %%xmm6 # b11101110\n\t"
+- " addps %%xmm2, %%xmm6\n\t"
+- " # xmm6 = r1 i2 r3 i4\n\t"
+- " movhlps %%xmm6, %%xmm4 # xmm4 = r3 i4 ?? ??\n\t"
+- " addps %%xmm4, %%xmm6 # xmm6 = r1+r3 i2+i4 ?? ??\n\t"
+- " movlps %%xmm6, (%[rdi]) # store low 2x32 bits (complex) to memory\n\t"
+- :
+- :[rsi] "r" (input), [rdx] "r" (taps), "c" (num_bytes), [rdi] "r" (result), [conjugator] "r" (conjugator)
+- :"rax", "r8", "r9", "r10"
+- );
+-
+- int getem = num_bytes % 16;
+-
+- for(; getem > 0; getem -= 8) {
+- *result += (input[(num_bytes >> 3) - 1] * lv_conj(taps[(num_bytes >> 3) - 1]));
+- }
++static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a_sse(lv_32fc_t* result,
++ const lv_32fc_t* input,
++ const lv_32fc_t* taps,
++ unsigned int num_points)
++{
++
++ const unsigned int num_bytes = num_points * 8;
++
++ __VOLK_ATTR_ALIGNED(16)
++ static const uint32_t conjugator[4] = {
++ 0x00000000, 0x80000000, 0x00000000, 0x80000000
++ };
++
++ __VOLK_ASM __VOLK_VOLATILE(
++ "# ccomplex_conjugate_dotprod_generic (float* result, const float *input,\n\t"
++ "# const float *taps, unsigned num_bytes)\n\t"
++ "# float sum0 = 0;\n\t"
++ "# float sum1 = 0;\n\t"
++ "# float sum2 = 0;\n\t"
++ "# float sum3 = 0;\n\t"
++ "# do {\n\t"
++ "# sum0 += input[0] * taps[0] - input[1] * taps[1];\n\t"
++ "# sum1 += input[0] * taps[1] + input[1] * taps[0];\n\t"
++ "# sum2 += input[2] * taps[2] - input[3] * taps[3];\n\t"
++ "# sum3 += input[2] * taps[3] + input[3] * taps[2];\n\t"
++ "# input += 4;\n\t"
++ "# taps += 4; \n\t"
++ "# } while (--n_2_ccomplex_blocks != 0);\n\t"
++ "# result[0] = sum0 + sum2;\n\t"
++ "# result[1] = sum1 + sum3;\n\t"
++ "# TODO: prefetch and better scheduling\n\t"
++ " xor %%r9, %%r9\n\t"
++ " xor %%r10, %%r10\n\t"
++ " movq %[conjugator], %%r9\n\t"
++ " movq %%rcx, %%rax\n\t"
++ " movaps 0(%%r9), %%xmm8\n\t"
++ " movq %%rcx, %%r8\n\t"
++ " movq %[rsi], %%r9\n\t"
++ " movq %[rdx], %%r10\n\t"
++ " xorps %%xmm6, %%xmm6 # zero accumulators\n\t"
++ " movaps 0(%%r9), %%xmm0\n\t"
++ " xorps %%xmm7, %%xmm7 # zero accumulators\n\t"
++ " movups 0(%%r10), %%xmm2\n\t"
++ " shr $5, %%rax # rax = n_2_ccomplex_blocks / 2\n\t"
++ " shr $4, %%r8\n\t"
++ " xorps %%xmm8, %%xmm2\n\t"
++ " jmp .%=L1_test\n\t"
++ " # 4 taps / loop\n\t"
++ " # something like ?? cycles / loop\n\t"
++ ".%=Loop1: \n\t"
++ "# complex prod: C += A * B, w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t"
++ "# movaps (%%r9), %%xmmA\n\t"
++ "# movaps (%%r10), %%xmmB\n\t"
++ "# movaps %%xmmA, %%xmmZ\n\t"
++ "# shufps $0xb1, %%xmmZ, %%xmmZ # swap internals\n\t"
++ "# mulps %%xmmB, %%xmmA\n\t"
++ "# mulps %%xmmZ, %%xmmB\n\t"
++ "# # SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t"
++ "# xorps %%xmmPN, %%xmmA\n\t"
++ "# movaps %%xmmA, %%xmmZ\n\t"
++ "# unpcklps %%xmmB, %%xmmA\n\t"
++ "# unpckhps %%xmmB, %%xmmZ\n\t"
++ "# movaps %%xmmZ, %%xmmY\n\t"
++ "# shufps $0x44, %%xmmA, %%xmmZ # b01000100\n\t"
++ "# shufps $0xee, %%xmmY, %%xmmA # b11101110\n\t"
++ "# addps %%xmmZ, %%xmmA\n\t"
++ "# addps %%xmmA, %%xmmC\n\t"
++ "# A=xmm0, B=xmm2, Z=xmm4\n\t"
++ "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t"
++ " movaps 16(%%r9), %%xmm1\n\t"
++ " movaps %%xmm0, %%xmm4\n\t"
++ " mulps %%xmm2, %%xmm0\n\t"
++ " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t"
++ " movaps 16(%%r10), %%xmm3\n\t"
++ " movaps %%xmm1, %%xmm5\n\t"
++ " xorps %%xmm8, %%xmm3\n\t"
++ " addps %%xmm0, %%xmm6\n\t"
++ " mulps %%xmm3, %%xmm1\n\t"
++ " shufps $0xb1, %%xmm5, %%xmm5 # swap internals\n\t"
++ " addps %%xmm1, %%xmm6\n\t"
++ " mulps %%xmm4, %%xmm2\n\t"
++ " movaps 32(%%r9), %%xmm0\n\t"
++ " addps %%xmm2, %%xmm7\n\t"
++ " mulps %%xmm5, %%xmm3\n\t"
++ " add $32, %%r9\n\t"
++ " movaps 32(%%r10), %%xmm2\n\t"
++ " addps %%xmm3, %%xmm7\n\t"
++ " add $32, %%r10\n\t"
++ " xorps %%xmm8, %%xmm2\n\t"
++ ".%=L1_test:\n\t"
++ " dec %%rax\n\t"
++ " jge .%=Loop1\n\t"
++ " # We've handled the bulk of multiplies up to here.\n\t"
++ " # Let's sse if original n_2_ccomplex_blocks was odd.\n\t"
++ " # If so, we've got 2 more taps to do.\n\t"
++ " and $1, %%r8\n\t"
++ " je .%=Leven\n\t"
++ " # The count was odd, do 2 more taps.\n\t"
++ " # Note that we've already got mm0/mm2 preloaded\n\t"
++ " # from the main loop.\n\t"
++ " movaps %%xmm0, %%xmm4\n\t"
++ " mulps %%xmm2, %%xmm0\n\t"
++ " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t"
++ " addps %%xmm0, %%xmm6\n\t"
++ " mulps %%xmm4, %%xmm2\n\t"
++ " addps %%xmm2, %%xmm7\n\t"
++ ".%=Leven:\n\t"
++ " # neg inversor\n\t"
++ " xorps %%xmm1, %%xmm1\n\t"
++ " mov $0x80000000, %%r9\n\t"
++ " movd %%r9, %%xmm1\n\t"
++ " shufps $0x11, %%xmm1, %%xmm1 # b00010001 # 0 -0 0 -0\n\t"
++ " # pfpnacc\n\t"
++ " xorps %%xmm1, %%xmm6\n\t"
++ " movaps %%xmm6, %%xmm2\n\t"
++ " unpcklps %%xmm7, %%xmm6\n\t"
++ " unpckhps %%xmm7, %%xmm2\n\t"
++ " movaps %%xmm2, %%xmm3\n\t"
++ " shufps $0x44, %%xmm6, %%xmm2 # b01000100\n\t"
++ " shufps $0xee, %%xmm3, %%xmm6 # b11101110\n\t"
++ " addps %%xmm2, %%xmm6\n\t"
++ " # xmm6 = r1 i2 r3 i4\n\t"
++ " movhlps %%xmm6, %%xmm4 # xmm4 = r3 i4 ?? ??\n\t"
++ " addps %%xmm4, %%xmm6 # xmm6 = r1+r3 i2+i4 ?? ??\n\t"
++ " movlps %%xmm6, (%[rdi]) # store low 2x32 bits (complex) "
++ "to memory\n\t"
++ :
++ : [rsi] "r"(input),
++ [rdx] "r"(taps),
++ "c"(num_bytes),
++ [rdi] "r"(result),
++ [conjugator] "r"(conjugator)
++ : "rax", "r8", "r9", "r10");
++
++ int getem = num_bytes % 16;
++
++ for (; getem > 0; getem -= 8) {
++ *result += (input[(num_bytes >> 3) - 1] * lv_conj(taps[(num_bytes >> 3) - 1]));
++ }
+ }
+ #endif
+
+ #if LV_HAVE_SSE && LV_HAVE_32
+-static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a_sse_32(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
+-
+- const unsigned int num_bytes = num_points*8;
+-
+- __VOLK_ATTR_ALIGNED(16) static const uint32_t conjugator[4]= {0x00000000, 0x80000000, 0x00000000, 0x80000000};
+-
+- int bound = num_bytes >> 4;
+- int leftovers = num_bytes % 16;
+-
+- __VOLK_ASM __VOLK_VOLATILE
+- (
+- " #pushl %%ebp\n\t"
+- " #movl %%esp, %%ebp\n\t"
+- " #movl 12(%%ebp), %%eax # input\n\t"
+- " #movl 16(%%ebp), %%edx # taps\n\t"
+- " #movl 20(%%ebp), %%ecx # n_bytes\n\t"
+- " movaps 0(%[conjugator]), %%xmm1\n\t"
+- " xorps %%xmm6, %%xmm6 # zero accumulators\n\t"
+- " movaps 0(%[eax]), %%xmm0\n\t"
+- " xorps %%xmm7, %%xmm7 # zero accumulators\n\t"
+- " movaps 0(%[edx]), %%xmm2\n\t"
+- " movl %[ecx], (%[out])\n\t"
+- " shrl $5, %[ecx] # ecx = n_2_ccomplex_blocks / 2\n\t"
+-
+- " xorps %%xmm1, %%xmm2\n\t"
+- " jmp .%=L1_test\n\t"
+- " # 4 taps / loop\n\t"
+- " # something like ?? cycles / loop\n\t"
+- ".%=Loop1: \n\t"
+- "# complex prod: C += A * B, w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t"
+- "# movaps (%[eax]), %%xmmA\n\t"
+- "# movaps (%[edx]), %%xmmB\n\t"
+- "# movaps %%xmmA, %%xmmZ\n\t"
+- "# shufps $0xb1, %%xmmZ, %%xmmZ # swap internals\n\t"
+- "# mulps %%xmmB, %%xmmA\n\t"
+- "# mulps %%xmmZ, %%xmmB\n\t"
+- "# # SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t"
+- "# xorps %%xmmPN, %%xmmA\n\t"
+- "# movaps %%xmmA, %%xmmZ\n\t"
+- "# unpcklps %%xmmB, %%xmmA\n\t"
+- "# unpckhps %%xmmB, %%xmmZ\n\t"
+- "# movaps %%xmmZ, %%xmmY\n\t"
+- "# shufps $0x44, %%xmmA, %%xmmZ # b01000100\n\t"
+- "# shufps $0xee, %%xmmY, %%xmmA # b11101110\n\t"
+- "# addps %%xmmZ, %%xmmA\n\t"
+- "# addps %%xmmA, %%xmmC\n\t"
+- "# A=xmm0, B=xmm2, Z=xmm4\n\t"
+- "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t"
+- " movaps 16(%[edx]), %%xmm3\n\t"
+- " movaps %%xmm0, %%xmm4\n\t"
+- " xorps %%xmm1, %%xmm3\n\t"
+- " mulps %%xmm2, %%xmm0\n\t"
+- " movaps 16(%[eax]), %%xmm1\n\t"
+- " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t"
+- " movaps %%xmm1, %%xmm5\n\t"
+- " addps %%xmm0, %%xmm6\n\t"
+- " mulps %%xmm3, %%xmm1\n\t"
+- " shufps $0xb1, %%xmm5, %%xmm5 # swap internals\n\t"
+- " addps %%xmm1, %%xmm6\n\t"
+- " movaps 0(%[conjugator]), %%xmm1\n\t"
+- " mulps %%xmm4, %%xmm2\n\t"
+- " movaps 32(%[eax]), %%xmm0\n\t"
+- " addps %%xmm2, %%xmm7\n\t"
+- " mulps %%xmm5, %%xmm3\n\t"
+- " addl $32, %[eax]\n\t"
+- " movaps 32(%[edx]), %%xmm2\n\t"
+- " addps %%xmm3, %%xmm7\n\t"
+- " xorps %%xmm1, %%xmm2\n\t"
+- " addl $32, %[edx]\n\t"
+- ".%=L1_test:\n\t"
+- " decl %[ecx]\n\t"
+- " jge .%=Loop1\n\t"
+- " # We've handled the bulk of multiplies up to here.\n\t"
+- " # Let's sse if original n_2_ccomplex_blocks was odd.\n\t"
+- " # If so, we've got 2 more taps to do.\n\t"
+- " movl 0(%[out]), %[ecx] # n_2_ccomplex_blocks\n\t"
+- " shrl $4, %[ecx]\n\t"
+- " andl $1, %[ecx]\n\t"
+- " je .%=Leven\n\t"
+- " # The count was odd, do 2 more taps.\n\t"
+- " # Note that we've already got mm0/mm2 preloaded\n\t"
+- " # from the main loop.\n\t"
+- " movaps %%xmm0, %%xmm4\n\t"
+- " mulps %%xmm2, %%xmm0\n\t"
+- " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t"
+- " addps %%xmm0, %%xmm6\n\t"
+- " mulps %%xmm4, %%xmm2\n\t"
+- " addps %%xmm2, %%xmm7\n\t"
+- ".%=Leven:\n\t"
+- " # neg inversor\n\t"
+- " #movl 8(%%ebp), %[eax] \n\t"
+- " xorps %%xmm1, %%xmm1\n\t"
+- " movl $0x80000000, (%[out])\n\t"
+- " movss (%[out]), %%xmm1\n\t"
+- " shufps $0x11, %%xmm1, %%xmm1 # b00010001 # 0 -0 0 -0\n\t"
+- " # pfpnacc\n\t"
+- " xorps %%xmm1, %%xmm6\n\t"
+- " movaps %%xmm6, %%xmm2\n\t"
+- " unpcklps %%xmm7, %%xmm6\n\t"
+- " unpckhps %%xmm7, %%xmm2\n\t"
+- " movaps %%xmm2, %%xmm3\n\t"
+- " shufps $0x44, %%xmm6, %%xmm2 # b01000100\n\t"
+- " shufps $0xee, %%xmm3, %%xmm6 # b11101110\n\t"
+- " addps %%xmm2, %%xmm6\n\t"
+- " # xmm6 = r1 i2 r3 i4\n\t"
+- " #movl 8(%%ebp), %[eax] # @result\n\t"
+- " movhlps %%xmm6, %%xmm4 # xmm4 = r3 i4 ?? ??\n\t"
+- " addps %%xmm4, %%xmm6 # xmm6 = r1+r3 i2+i4 ?? ??\n\t"
+- " movlps %%xmm6, (%[out]) # store low 2x32 bits (complex) to memory\n\t"
+- " #popl %%ebp\n\t"
+- :
+- : [eax] "r" (input), [edx] "r" (taps), [ecx] "r" (num_bytes), [out] "r" (result), [conjugator] "r" (conjugator)
+- );
+-
+- for(; leftovers > 0; leftovers -= 8) {
+- *result += (input[(bound << 1)] * lv_conj(taps[(bound << 1)]));
+- }
++static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a_sse_32(lv_32fc_t* result,
++ const lv_32fc_t* input,
++ const lv_32fc_t* taps,
++ unsigned int num_points)
++{
++
++ const unsigned int num_bytes = num_points * 8;
++
++ __VOLK_ATTR_ALIGNED(16)
++ static const uint32_t conjugator[4] = {
++ 0x00000000, 0x80000000, 0x00000000, 0x80000000
++ };
++
++ int bound = num_bytes >> 4;
++ int leftovers = num_bytes % 16;
++
++ __VOLK_ASM __VOLK_VOLATILE(
++ " #pushl %%ebp\n\t"
++ " #movl %%esp, %%ebp\n\t"
++ " #movl 12(%%ebp), %%eax # input\n\t"
++ " #movl 16(%%ebp), %%edx # taps\n\t"
++ " #movl 20(%%ebp), %%ecx # n_bytes\n\t"
++ " movaps 0(%[conjugator]), %%xmm1\n\t"
++ " xorps %%xmm6, %%xmm6 # zero accumulators\n\t"
++ " movaps 0(%[eax]), %%xmm0\n\t"
++ " xorps %%xmm7, %%xmm7 # zero accumulators\n\t"
++ " movaps 0(%[edx]), %%xmm2\n\t"
++ " movl %[ecx], (%[out])\n\t"
++ " shrl $5, %[ecx] # ecx = n_2_ccomplex_blocks / 2\n\t"
++
++ " xorps %%xmm1, %%xmm2\n\t"
++ " jmp .%=L1_test\n\t"
++ " # 4 taps / loop\n\t"
++ " # something like ?? cycles / loop\n\t"
++ ".%=Loop1: \n\t"
++ "# complex prod: C += A * B, w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t"
++ "# movaps (%[eax]), %%xmmA\n\t"
++ "# movaps (%[edx]), %%xmmB\n\t"
++ "# movaps %%xmmA, %%xmmZ\n\t"
++ "# shufps $0xb1, %%xmmZ, %%xmmZ # swap internals\n\t"
++ "# mulps %%xmmB, %%xmmA\n\t"
++ "# mulps %%xmmZ, %%xmmB\n\t"
++ "# # SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t"
++ "# xorps %%xmmPN, %%xmmA\n\t"
++ "# movaps %%xmmA, %%xmmZ\n\t"
++ "# unpcklps %%xmmB, %%xmmA\n\t"
++ "# unpckhps %%xmmB, %%xmmZ\n\t"
++ "# movaps %%xmmZ, %%xmmY\n\t"
++ "# shufps $0x44, %%xmmA, %%xmmZ # b01000100\n\t"
++ "# shufps $0xee, %%xmmY, %%xmmA # b11101110\n\t"
++ "# addps %%xmmZ, %%xmmA\n\t"
++ "# addps %%xmmA, %%xmmC\n\t"
++ "# A=xmm0, B=xmm2, Z=xmm4\n\t"
++ "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t"
++ " movaps 16(%[edx]), %%xmm3\n\t"
++ " movaps %%xmm0, %%xmm4\n\t"
++ " xorps %%xmm1, %%xmm3\n\t"
++ " mulps %%xmm2, %%xmm0\n\t"
++ " movaps 16(%[eax]), %%xmm1\n\t"
++ " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t"
++ " movaps %%xmm1, %%xmm5\n\t"
++ " addps %%xmm0, %%xmm6\n\t"
++ " mulps %%xmm3, %%xmm1\n\t"
++ " shufps $0xb1, %%xmm5, %%xmm5 # swap internals\n\t"
++ " addps %%xmm1, %%xmm6\n\t"
++ " movaps 0(%[conjugator]), %%xmm1\n\t"
++ " mulps %%xmm4, %%xmm2\n\t"
++ " movaps 32(%[eax]), %%xmm0\n\t"
++ " addps %%xmm2, %%xmm7\n\t"
++ " mulps %%xmm5, %%xmm3\n\t"
++ " addl $32, %[eax]\n\t"
++ " movaps 32(%[edx]), %%xmm2\n\t"
++ " addps %%xmm3, %%xmm7\n\t"
++ " xorps %%xmm1, %%xmm2\n\t"
++ " addl $32, %[edx]\n\t"
++ ".%=L1_test:\n\t"
++ " decl %[ecx]\n\t"
++ " jge .%=Loop1\n\t"
++ " # We've handled the bulk of multiplies up to here.\n\t"
++ " # Let's sse if original n_2_ccomplex_blocks was odd.\n\t"
++ " # If so, we've got 2 more taps to do.\n\t"
++ " movl 0(%[out]), %[ecx] # n_2_ccomplex_blocks\n\t"
++ " shrl $4, %[ecx]\n\t"
++ " andl $1, %[ecx]\n\t"
++ " je .%=Leven\n\t"
++ " # The count was odd, do 2 more taps.\n\t"
++ " # Note that we've already got mm0/mm2 preloaded\n\t"
++ " # from the main loop.\n\t"
++ " movaps %%xmm0, %%xmm4\n\t"
++ " mulps %%xmm2, %%xmm0\n\t"
++ " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t"
++ " addps %%xmm0, %%xmm6\n\t"
++ " mulps %%xmm4, %%xmm2\n\t"
++ " addps %%xmm2, %%xmm7\n\t"
++ ".%=Leven:\n\t"
++ " # neg inversor\n\t"
++ " #movl 8(%%ebp), %[eax] \n\t"
++ " xorps %%xmm1, %%xmm1\n\t"
++ " movl $0x80000000, (%[out])\n\t"
++ " movss (%[out]), %%xmm1\n\t"
++ " shufps $0x11, %%xmm1, %%xmm1 # b00010001 # 0 -0 0 -0\n\t"
++ " # pfpnacc\n\t"
++ " xorps %%xmm1, %%xmm6\n\t"
++ " movaps %%xmm6, %%xmm2\n\t"
++ " unpcklps %%xmm7, %%xmm6\n\t"
++ " unpckhps %%xmm7, %%xmm2\n\t"
++ " movaps %%xmm2, %%xmm3\n\t"
++ " shufps $0x44, %%xmm6, %%xmm2 # b01000100\n\t"
++ " shufps $0xee, %%xmm3, %%xmm6 # b11101110\n\t"
++ " addps %%xmm2, %%xmm6\n\t"
++ " # xmm6 = r1 i2 r3 i4\n\t"
++ " #movl 8(%%ebp), %[eax] # @result\n\t"
++ " movhlps %%xmm6, %%xmm4 # xmm4 = r3 i4 ?? ??\n\t"
++ " addps %%xmm4, %%xmm6 # xmm6 = r1+r3 i2+i4 ?? ??\n\t"
++ " movlps %%xmm6, (%[out]) # store low 2x32 bits (complex) "
++ "to memory\n\t"
++ " #popl %%ebp\n\t"
++ :
++ : [eax] "r"(input),
++ [edx] "r"(taps),
++ [ecx] "r"(num_bytes),
++ [out] "r"(result),
++ [conjugator] "r"(conjugator));
++
++ for (; leftovers > 0; leftovers -= 8) {
++ *result += (input[(bound << 1)] * lv_conj(taps[(bound << 1)]));
++ }
+ }
+ #endif /*LV_HAVE_SSE*/
+
+diff --git a/kernels/volk/volk_32fc_x2_divide_32fc.h b/kernels/volk/volk_32fc_x2_divide_32fc.h
+index 3ce6ede..78c245a 100644
+--- a/kernels/volk/volk_32fc_x2_divide_32fc.h
++++ b/kernels/volk/volk_32fc_x2_divide_32fc.h
+@@ -29,8 +29,8 @@
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+- * void volk_32fc_x2_divide_32fc(lv_32fc_t* cVector, const lv_32fc_t* numeratorVector, const lv_32fc_t* denumeratorVector, unsigned int num_points);
+- * \endcode
++ * void volk_32fc_x2_divide_32fc(lv_32fc_t* cVector, const lv_32fc_t* numeratorVector,
++ * const lv_32fc_t* denumeratorVector, unsigned int num_points); \endcode
+ *
+ * \b Inputs
+ * \li numeratorVector: The numerator complex values.
+@@ -41,7 +41,8 @@
+ * \li outputVector: The output vector complex floats.
+ *
+ * \b Example
+- * divide a complex vector by itself, demonstrating the result should be pretty close to 1+0j.
++ * divide a complex vector by itself, demonstrating the result should be pretty close to
++ * 1+0j.
+ *
+ * \code
+ * int N = 10;
+@@ -71,17 +72,18 @@
+ #ifndef INCLUDED_volk_32fc_x2_divide_32fc_u_H
+ #define INCLUDED_volk_32fc_x2_divide_32fc_u_H
+
++#include <float.h>
+ #include <inttypes.h>
+ #include <volk/volk_complex.h>
+-#include <float.h>
+
+ #ifdef LV_HAVE_SSE3
+ #include <pmmintrin.h>
+ #include <volk/volk_sse3_intrinsics.h>
+
+-static inline void
+-volk_32fc_x2_divide_32fc_u_sse3(lv_32fc_t* cVector, const lv_32fc_t* numeratorVector,
+- const lv_32fc_t* denumeratorVector, unsigned int num_points)
++static inline void volk_32fc_x2_divide_32fc_u_sse3(lv_32fc_t* cVector,
++ const lv_32fc_t* numeratorVector,
++ const lv_32fc_t* denumeratorVector,
++ unsigned int num_points)
+ {
+ /*
+ * we'll do the "classical"
+@@ -89,44 +91,46 @@ volk_32fc_x2_divide_32fc_u_sse3(lv_32fc_t* cVector, const lv_32fc_t* numeratorVe
+ * --- = -------
+ * b |b|^2
+ * */
+- unsigned int number = 0;
+- const unsigned int quarterPoints = num_points / 4;
+-
+- __m128 num01, num23, den01, den23, norm, result;
+- lv_32fc_t* c = cVector;
+- const lv_32fc_t* a = numeratorVector;
+- const lv_32fc_t* b = denumeratorVector;
+-
+- for(; number < quarterPoints; number++){
+- num01 = _mm_loadu_ps((float*) a); // first pair
+- den01 = _mm_loadu_ps((float*) b); // first pair
+- num01 = _mm_complexconjugatemul_ps(num01, den01); // a conj(b)
+- a += 2;
+- b += 2;
+-
+- num23 = _mm_loadu_ps((float*) a); // second pair
+- den23 = _mm_loadu_ps((float*) b); // second pair
+- num23 = _mm_complexconjugatemul_ps(num23, den23); // a conj(b)
+- a += 2;
+- b += 2;
+-
+- norm = _mm_magnitudesquared_ps_sse3(den01, den23);
+- den01 = _mm_unpacklo_ps(norm,norm);
+- den23 = _mm_unpackhi_ps(norm,norm);
+-
+- result = _mm_div_ps(num01, den01);
+- _mm_storeu_ps((float*) c, result); // Store the results back into the C container
+- c += 2;
+- result = _mm_div_ps(num23, den23);
+- _mm_storeu_ps((float*) c, result); // Store the results back into the C container
+- c += 2;
+- }
+-
+- number *= 4;
+- for(;number < num_points; number++){
+- *c = (*a) / (*b);
+- a++; b++; c++;
+- }
++ unsigned int number = 0;
++ const unsigned int quarterPoints = num_points / 4;
++
++ __m128 num01, num23, den01, den23, norm, result;
++ lv_32fc_t* c = cVector;
++ const lv_32fc_t* a = numeratorVector;
++ const lv_32fc_t* b = denumeratorVector;
++
++ for (; number < quarterPoints; number++) {
++ num01 = _mm_loadu_ps((float*)a); // first pair
++ den01 = _mm_loadu_ps((float*)b); // first pair
++ num01 = _mm_complexconjugatemul_ps(num01, den01); // a conj(b)
++ a += 2;
++ b += 2;
++
++ num23 = _mm_loadu_ps((float*)a); // second pair
++ den23 = _mm_loadu_ps((float*)b); // second pair
++ num23 = _mm_complexconjugatemul_ps(num23, den23); // a conj(b)
++ a += 2;
++ b += 2;
++
++ norm = _mm_magnitudesquared_ps_sse3(den01, den23);
++ den01 = _mm_unpacklo_ps(norm, norm);
++ den23 = _mm_unpackhi_ps(norm, norm);
++
++ result = _mm_div_ps(num01, den01);
++ _mm_storeu_ps((float*)c, result); // Store the results back into the C container
++ c += 2;
++ result = _mm_div_ps(num23, den23);
++ _mm_storeu_ps((float*)c, result); // Store the results back into the C container
++ c += 2;
++ }
++
++ number *= 4;
++ for (; number < num_points; number++) {
++ *c = (*a) / (*b);
++ a++;
++ b++;
++ c++;
++ }
+ }
+ #endif /* LV_HAVE_SSE3 */
+
+@@ -135,9 +139,10 @@ volk_32fc_x2_divide_32fc_u_sse3(lv_32fc_t* cVector, const lv_32fc_t* numeratorVe
+ #include <immintrin.h>
+ #include <volk/volk_avx_intrinsics.h>
+
+-static inline void
+-volk_32fc_x2_divide_32fc_u_avx(lv_32fc_t* cVector, const lv_32fc_t* numeratorVector,
+- const lv_32fc_t* denumeratorVector, unsigned int num_points)
++static inline void volk_32fc_x2_divide_32fc_u_avx(lv_32fc_t* cVector,
++ const lv_32fc_t* numeratorVector,
++ const lv_32fc_t* denumeratorVector,
++ unsigned int num_points)
+ {
+ /*
+ * we'll do the "classical"
+@@ -153,17 +158,21 @@ volk_32fc_x2_divide_32fc_u_avx(lv_32fc_t* cVector, const lv_32fc_t* numeratorVec
+ const lv_32fc_t* a = numeratorVector;
+ const lv_32fc_t* b = denumeratorVector;
+
+- for(; number < quarterPoints; number++){
+- num = _mm256_loadu_ps((float*) a); // Load the ar + ai, br + bi ... as ar,ai,br,bi ...
+- denum = _mm256_loadu_ps((float*) b); // Load the cr + ci, dr + di ... as cr,ci,dr,di ...
++ for (; number < quarterPoints; number++) {
++ num = _mm256_loadu_ps(
++ (float*)a); // Load the ar + ai, br + bi ... as ar,ai,br,bi ...
++ denum = _mm256_loadu_ps(
++ (float*)b); // Load the cr + ci, dr + di ... as cr,ci,dr,di ...
+ mul_conj = _mm256_complexconjugatemul_ps(num, denum);
+ sq = _mm256_mul_ps(denum, denum); // Square the values
+- mag_sq_un = _mm256_hadd_ps(sq,sq); // obtain the actual squared magnitude, although out of order
++ mag_sq_un = _mm256_hadd_ps(
++ sq, sq); // obtain the actual squared magnitude, although out of order
+ mag_sq = _mm256_permute_ps(mag_sq_un, 0xd8); // I order them
+- // best guide I found on using these functions: https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=2738,2059,2738,2738,3875,3874,3875,2738,3870
+- div = _mm256_div_ps(mul_conj,mag_sq);
++ // best guide I found on using these functions:
++ // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=2738,2059,2738,2738,3875,3874,3875,2738,3870
++ div = _mm256_div_ps(mul_conj, mag_sq);
+
+- _mm256_storeu_ps((float*) c, div); // Store the results back into the C container
++ _mm256_storeu_ps((float*)c, div); // Store the results back into the C container
+
+ a += 4;
+ b += 4;
+@@ -172,51 +181,51 @@ volk_32fc_x2_divide_32fc_u_avx(lv_32fc_t* cVector, const lv_32fc_t* numeratorVec
+
+ number = quarterPoints * 4;
+
+- for(; number < num_points; number++){
++ for (; number < num_points; number++) {
+ *c++ = (*a++) / (*b++);
+ }
+-
+ }
+ #endif /* LV_HAVE_AVX */
+
+
+ #ifdef LV_HAVE_GENERIC
+
+-static inline void
+-volk_32fc_x2_divide_32fc_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector,
+- const lv_32fc_t* bVector, unsigned int num_points)
++static inline void volk_32fc_x2_divide_32fc_generic(lv_32fc_t* cVector,
++ const lv_32fc_t* aVector,
++ const lv_32fc_t* bVector,
++ unsigned int num_points)
+ {
+- lv_32fc_t* cPtr = cVector;
+- const lv_32fc_t* aPtr = aVector;
+- const lv_32fc_t* bPtr= bVector;
+- unsigned int number = 0;
+-
+- for(number = 0; number < num_points; number++){
+- *cPtr++ = (*aPtr++) / (*bPtr++);
+- }
++ lv_32fc_t* cPtr = cVector;
++ const lv_32fc_t* aPtr = aVector;
++ const lv_32fc_t* bPtr = bVector;
++ unsigned int number = 0;
++
++ for (number = 0; number < num_points; number++) {
++ *cPtr++ = (*aPtr++) / (*bPtr++);
++ }
+ }
+ #endif /* LV_HAVE_GENERIC */
+
+
+-
+ #endif /* INCLUDED_volk_32fc_x2_divide_32fc_u_H */
+
+
+ #ifndef INCLUDED_volk_32fc_x2_divide_32fc_a_H
+ #define INCLUDED_volk_32fc_x2_divide_32fc_a_H
+
++#include <float.h>
+ #include <inttypes.h>
+ #include <stdio.h>
+ #include <volk/volk_complex.h>
+-#include <float.h>
+
+ #ifdef LV_HAVE_SSE3
+ #include <pmmintrin.h>
+ #include <volk/volk_sse3_intrinsics.h>
+
+-static inline void
+-volk_32fc_x2_divide_32fc_a_sse3(lv_32fc_t* cVector, const lv_32fc_t* numeratorVector,
+- const lv_32fc_t* denumeratorVector, unsigned int num_points)
++static inline void volk_32fc_x2_divide_32fc_a_sse3(lv_32fc_t* cVector,
++ const lv_32fc_t* numeratorVector,
++ const lv_32fc_t* denumeratorVector,
++ unsigned int num_points)
+ {
+ /*
+ * we'll do the "classical"
+@@ -224,45 +233,47 @@ volk_32fc_x2_divide_32fc_a_sse3(lv_32fc_t* cVector, const lv_32fc_t* numeratorVe
+ * --- = -------
+ * b |b|^2
+ * */
+- unsigned int number = 0;
+- const unsigned int quarterPoints = num_points / 4;
+-
+- __m128 num01, num23, den01, den23, norm, result;
+- lv_32fc_t* c = cVector;
+- const lv_32fc_t* a = numeratorVector;
+- const lv_32fc_t* b = denumeratorVector;
+-
+- for(; number < quarterPoints; number++){
+- num01 = _mm_load_ps((float*) a); // first pair
+- den01 = _mm_load_ps((float*) b); // first pair
+- num01 = _mm_complexconjugatemul_ps(num01, den01); // a conj(b)
+- a += 2;
+- b += 2;
+-
+- num23 = _mm_load_ps((float*) a); // second pair
+- den23 = _mm_load_ps((float*) b); // second pair
+- num23 = _mm_complexconjugatemul_ps(num23, den23); // a conj(b)
+- a += 2;
+- b += 2;
+-
+- norm = _mm_magnitudesquared_ps_sse3(den01, den23);
+-
+- den01 = _mm_unpacklo_ps(norm,norm); // select the lower floats twice
+- den23 = _mm_unpackhi_ps(norm,norm); // select the upper floats twice
+-
+- result = _mm_div_ps(num01, den01);
+- _mm_store_ps((float*) c, result); // Store the results back into the C container
+- c += 2;
+- result = _mm_div_ps(num23, den23);
+- _mm_store_ps((float*) c, result); // Store the results back into the C container
+- c += 2;
+- }
+-
+- number *= 4;
+- for(;number < num_points; number++){
+- *c = (*a) / (*b);
+- a++; b++; c++;
+- }
++ unsigned int number = 0;
++ const unsigned int quarterPoints = num_points / 4;
++
++ __m128 num01, num23, den01, den23, norm, result;
++ lv_32fc_t* c = cVector;
++ const lv_32fc_t* a = numeratorVector;
++ const lv_32fc_t* b = denumeratorVector;
++
++ for (; number < quarterPoints; number++) {
++ num01 = _mm_load_ps((float*)a); // first pair
++ den01 = _mm_load_ps((float*)b); // first pair
++ num01 = _mm_complexconjugatemul_ps(num01, den01); // a conj(b)
++ a += 2;
++ b += 2;
++
++ num23 = _mm_load_ps((float*)a); // second pair
++ den23 = _mm_load_ps((float*)b); // second pair
++ num23 = _mm_complexconjugatemul_ps(num23, den23); // a conj(b)
++ a += 2;
++ b += 2;
++
++ norm = _mm_magnitudesquared_ps_sse3(den01, den23);
++
++ den01 = _mm_unpacklo_ps(norm, norm); // select the lower floats twice
++ den23 = _mm_unpackhi_ps(norm, norm); // select the upper floats twice
++
++ result = _mm_div_ps(num01, den01);
++ _mm_store_ps((float*)c, result); // Store the results back into the C container
++ c += 2;
++ result = _mm_div_ps(num23, den23);
++ _mm_store_ps((float*)c, result); // Store the results back into the C container
++ c += 2;
++ }
++
++ number *= 4;
++ for (; number < num_points; number++) {
++ *c = (*a) / (*b);
++ a++;
++ b++;
++ c++;
++ }
+ }
+ #endif /* LV_HAVE_SSE */
+
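The SSE3 kernel above and the AVX kernels in the neighbouring hunks all implement the "classical" form spelled out in the ASCII comment: multiply numerator and denominator by the conjugate of the denominator, so the complex division collapses into a complex multiply plus a real divide. Written out per element,

\[
\frac{a}{b} \;=\; \frac{a\,\overline{b}}{|b|^{2}}
\;=\; \frac{(a_r b_r + a_i b_i) \;+\; j\,(a_i b_r - a_r b_i)}{b_r^{2} + b_i^{2}},
\]

which is what _mm_complexconjugatemul_ps / _mm256_complexconjugatemul_ps produce in the numerator and what _mm_magnitudesquared_ps_sse3 (or the hadd of the squared lanes in the AVX variants) produce in the denominator.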
+@@ -270,9 +281,10 @@ volk_32fc_x2_divide_32fc_a_sse3(lv_32fc_t* cVector, const lv_32fc_t* numeratorVe
+ #include <immintrin.h>
+ #include <volk/volk_avx_intrinsics.h>
+
+-static inline void
+-volk_32fc_x2_divide_32fc_a_avx(lv_32fc_t* cVector, const lv_32fc_t* numeratorVector,
+- const lv_32fc_t* denumeratorVector, unsigned int num_points)
++static inline void volk_32fc_x2_divide_32fc_a_avx(lv_32fc_t* cVector,
++ const lv_32fc_t* numeratorVector,
++ const lv_32fc_t* denumeratorVector,
++ unsigned int num_points)
+ {
+ /*
+ * we'll do the "classical"
+@@ -288,17 +300,21 @@ volk_32fc_x2_divide_32fc_a_avx(lv_32fc_t* cVector, const lv_32fc_t* numeratorVec
+ const lv_32fc_t* a = numeratorVector;
+ const lv_32fc_t* b = denumeratorVector;
+
+- for(; number < quarterPoints; number++){
+- num = _mm256_load_ps((float*) a); // Load the ar + ai, br + bi ... as ar,ai,br,bi ...
+- denum = _mm256_load_ps((float*) b); // Load the cr + ci, dr + di ... as cr,ci,dr,di ...
++ for (; number < quarterPoints; number++) {
++ num =
++ _mm256_load_ps((float*)a); // Load the ar + ai, br + bi ... as ar,ai,br,bi ...
++ denum =
++ _mm256_load_ps((float*)b); // Load the cr + ci, dr + di ... as cr,ci,dr,di ...
+ mul_conj = _mm256_complexconjugatemul_ps(num, denum);
+ sq = _mm256_mul_ps(denum, denum); // Square the values
+- mag_sq_un = _mm256_hadd_ps(sq,sq); // obtain the actual squared magnitude, although out of order
++ mag_sq_un = _mm256_hadd_ps(
++ sq, sq); // obtain the actual squared magnitude, although out of order
+ mag_sq = _mm256_permute_ps(mag_sq_un, 0xd8); // I order them
+- // best guide I found on using these functions: https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=2738,2059,2738,2738,3875,3874,3875,2738,3870
+- div = _mm256_div_ps(mul_conj,mag_sq);
++ // best guide I found on using these functions:
++ // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=2738,2059,2738,2738,3875,3874,3875,2738,3870
++ div = _mm256_div_ps(mul_conj, mag_sq);
+
+- _mm256_store_ps((float*) c, div); // Store the results back into the C container
++ _mm256_store_ps((float*)c, div); // Store the results back into the C container
+
+ a += 4;
+ b += 4;
+@@ -307,78 +323,78 @@ volk_32fc_x2_divide_32fc_a_avx(lv_32fc_t* cVector, const lv_32fc_t* numeratorVec
+
+ number = quarterPoints * 4;
+
+- for(; number < num_points; number++){
++ for (; number < num_points; number++) {
+ *c++ = (*a++) / (*b++);
+ }
+-
+-
+ }
+ #endif /* LV_HAVE_AVX */
+
+ #ifdef LV_HAVE_NEON
+ #include <arm_neon.h>
+
+-static inline void
+-volk_32fc_x2_divide_32fc_neon(lv_32fc_t* cVector, const lv_32fc_t* aVector,
+- const lv_32fc_t* bVector, unsigned int num_points)
++static inline void volk_32fc_x2_divide_32fc_neon(lv_32fc_t* cVector,
++ const lv_32fc_t* aVector,
++ const lv_32fc_t* bVector,
++ unsigned int num_points)
+ {
+- lv_32fc_t* cPtr = cVector;
+- const lv_32fc_t* aPtr = aVector;
+- const lv_32fc_t* bPtr = bVector;
+-
+- float32x4x2_t aVal, bVal, cVal;
+- float32x4_t bAbs, bAbsInv;
+-
+- const unsigned int quarterPoints = num_points / 4;
+- unsigned int number = 0;
+- for(; number < quarterPoints; number++){
+- aVal = vld2q_f32((const float*)(aPtr));
+- bVal = vld2q_f32((const float*)(bPtr));
+- aPtr += 4;
+- bPtr += 4;
+- __VOLK_PREFETCH(aPtr+4);
+- __VOLK_PREFETCH(bPtr+4);
+-
+- bAbs = vmulq_f32( bVal.val[0], bVal.val[0]);
+- bAbs = vmlaq_f32(bAbs, bVal.val[1], bVal.val[1]);
+-
+- bAbsInv = vrecpeq_f32(bAbs);
+- bAbsInv = vmulq_f32(bAbsInv, vrecpsq_f32(bAbsInv, bAbs));
+- bAbsInv = vmulq_f32(bAbsInv, vrecpsq_f32(bAbsInv, bAbs));
+-
+- cVal.val[0] = vmulq_f32( aVal.val[0], bVal.val[0]);
+- cVal.val[0] = vmlaq_f32(cVal.val[0], aVal.val[1], bVal.val[1]);
+- cVal.val[0] = vmulq_f32(cVal.val[0], bAbsInv);
+-
+- cVal.val[1] = vmulq_f32( aVal.val[1], bVal.val[0]);
+- cVal.val[1] = vmlsq_f32(cVal.val[1], aVal.val[0], bVal.val[1]);
+- cVal.val[1] = vmulq_f32(cVal.val[1], bAbsInv);
+-
+- vst2q_f32((float*)(cPtr), cVal);
+- cPtr += 4;
+- }
+-
+- for(number = quarterPoints * 4; number < num_points; number++){
+- *cPtr++ = (*aPtr++) / (*bPtr++);
+- }
++ lv_32fc_t* cPtr = cVector;
++ const lv_32fc_t* aPtr = aVector;
++ const lv_32fc_t* bPtr = bVector;
++
++ float32x4x2_t aVal, bVal, cVal;
++ float32x4_t bAbs, bAbsInv;
++
++ const unsigned int quarterPoints = num_points / 4;
++ unsigned int number = 0;
++ for (; number < quarterPoints; number++) {
++ aVal = vld2q_f32((const float*)(aPtr));
++ bVal = vld2q_f32((const float*)(bPtr));
++ aPtr += 4;
++ bPtr += 4;
++ __VOLK_PREFETCH(aPtr + 4);
++ __VOLK_PREFETCH(bPtr + 4);
++
++ bAbs = vmulq_f32(bVal.val[0], bVal.val[0]);
++ bAbs = vmlaq_f32(bAbs, bVal.val[1], bVal.val[1]);
++
++ bAbsInv = vrecpeq_f32(bAbs);
++ bAbsInv = vmulq_f32(bAbsInv, vrecpsq_f32(bAbsInv, bAbs));
++ bAbsInv = vmulq_f32(bAbsInv, vrecpsq_f32(bAbsInv, bAbs));
++
++ cVal.val[0] = vmulq_f32(aVal.val[0], bVal.val[0]);
++ cVal.val[0] = vmlaq_f32(cVal.val[0], aVal.val[1], bVal.val[1]);
++ cVal.val[0] = vmulq_f32(cVal.val[0], bAbsInv);
++
++ cVal.val[1] = vmulq_f32(aVal.val[1], bVal.val[0]);
++ cVal.val[1] = vmlsq_f32(cVal.val[1], aVal.val[0], bVal.val[1]);
++ cVal.val[1] = vmulq_f32(cVal.val[1], bAbsInv);
++
++ vst2q_f32((float*)(cPtr), cVal);
++ cPtr += 4;
++ }
++
++ for (number = quarterPoints * 4; number < num_points; number++) {
++ *cPtr++ = (*aPtr++) / (*bPtr++);
++ }
+ }
+ #endif /* LV_HAVE_NEON */
+
+
+ #ifdef LV_HAVE_GENERIC
+
+-static inline void
+-volk_32fc_x2_divide_32fc_a_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector,
+- const lv_32fc_t* bVector, unsigned int num_points)
++static inline void volk_32fc_x2_divide_32fc_a_generic(lv_32fc_t* cVector,
++ const lv_32fc_t* aVector,
++ const lv_32fc_t* bVector,
++ unsigned int num_points)
+ {
+- lv_32fc_t* cPtr = cVector;
+- const lv_32fc_t* aPtr = aVector;
+- const lv_32fc_t* bPtr= bVector;
+- unsigned int number = 0;
+-
+- for(number = 0; number < num_points; number++){
+- *cPtr++ = (*aPtr++) / (*bPtr++);
+- }
++ lv_32fc_t* cPtr = cVector;
++ const lv_32fc_t* aPtr = aVector;
++ const lv_32fc_t* bPtr = bVector;
++ unsigned int number = 0;
++
++ for (number = 0; number < num_points; number++) {
++ *cPtr++ = (*aPtr++) / (*bPtr++);
++ }
+ }
+ #endif /* LV_HAVE_GENERIC */
+
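One detail of the NEON divide kernel worth noting: it never issues a vector division. vrecpeq_f32 only yields a coarse estimate of 1/|b|^2, and each vmulq_f32(bAbsInv, vrecpsq_f32(bAbsInv, bAbs)) pair is a Newton-Raphson refinement step, since vrecpsq_f32(x, d) returns 2 - d*x and therefore

\[
x_{n+1} = x_n\,(2 - d\,x_n).
\]

Two refinement steps suffice here because each step roughly doubles the number of correct bits in the reciprocal estimate before it scales the conjugate product.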
+diff --git a/kernels/volk/volk_32fc_x2_dot_prod_32fc.h b/kernels/volk/volk_32fc_x2_dot_prod_32fc.h
+index f4a4469..b0b7fee 100644
+--- a/kernels/volk/volk_32fc_x2_dot_prod_32fc.h
++++ b/kernels/volk/volk_32fc_x2_dot_prod_32fc.h
+@@ -33,8 +33,8 @@
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+- * void volk_32fc_x2_dot_prod_32fc(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points)
+- * \endcode
++ * void volk_32fc_x2_dot_prod_32fc(lv_32fc_t* result, const lv_32fc_t* input, const
++ * lv_32fc_t* taps, unsigned int num_points) \endcode
+ *
+ * \b Inputs
+ * \li input: vector of complex floats.
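For context on how the dispatcher documented above is called, here is a minimal, hypothetical usage sketch. The function name, buffer length and test values are made up for illustration; it assumes the volk_malloc/volk_free/volk_get_alignment helpers declared in volk/volk_malloc.h and the lv_cmake constructor from volk/volk_complex.h.

#include <volk/volk.h>

static void example_dot_prod(void)
{
    const unsigned int num_points = 1024; /* arbitrary length */
    const size_t alignment = volk_get_alignment();

    lv_32fc_t* input =
        (lv_32fc_t*)volk_malloc(num_points * sizeof(lv_32fc_t), alignment);
    lv_32fc_t* taps =
        (lv_32fc_t*)volk_malloc(num_points * sizeof(lv_32fc_t), alignment);
    lv_32fc_t result;

    for (unsigned int i = 0; i < num_points; i++) {
        input[i] = lv_cmake(1.0f, 0.0f); /* made-up test data */
        taps[i] = lv_cmake(0.5f, 0.5f);
    }

    /* The dispatcher picks the fastest kernel available on this machine
     * (generic, SSE3, SSE4.1, AVX, AVX+FMA, NEON, ...). */
    volk_32fc_x2_dot_prod_32fc(&result, input, taps, num_points);

    volk_free(input);
    volk_free(taps);
}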
+@@ -58,236 +58,246 @@
+ #ifndef INCLUDED_volk_32fc_x2_dot_prod_32fc_u_H
+ #define INCLUDED_volk_32fc_x2_dot_prod_32fc_u_H
+
+-#include <volk/volk_common.h>
+-#include <volk/volk_complex.h>
+ #include <stdio.h>
+ #include <string.h>
++#include <volk/volk_common.h>
++#include <volk/volk_complex.h>
+
+
+ #ifdef LV_HAVE_GENERIC
+
+
+-static inline void volk_32fc_x2_dot_prod_32fc_generic(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
++static inline void volk_32fc_x2_dot_prod_32fc_generic(lv_32fc_t* result,
++ const lv_32fc_t* input,
++ const lv_32fc_t* taps,
++ unsigned int num_points)
++{
+
+- float * res = (float*) result;
+- float * in = (float*) input;
+- float * tp = (float*) taps;
+- unsigned int n_2_ccomplex_blocks = num_points/2;
++ float* res = (float*)result;
++ float* in = (float*)input;
++ float* tp = (float*)taps;
++ unsigned int n_2_ccomplex_blocks = num_points / 2;
+
+- float sum0[2] = {0,0};
+- float sum1[2] = {0,0};
+- unsigned int i = 0;
++ float sum0[2] = { 0, 0 };
++ float sum1[2] = { 0, 0 };
++ unsigned int i = 0;
+
+- for(i = 0; i < n_2_ccomplex_blocks; ++i) {
+- sum0[0] += in[0] * tp[0] - in[1] * tp[1];
+- sum0[1] += in[0] * tp[1] + in[1] * tp[0];
+- sum1[0] += in[2] * tp[2] - in[3] * tp[3];
+- sum1[1] += in[2] * tp[3] + in[3] * tp[2];
++ for (i = 0; i < n_2_ccomplex_blocks; ++i) {
++ sum0[0] += in[0] * tp[0] - in[1] * tp[1];
++ sum0[1] += in[0] * tp[1] + in[1] * tp[0];
++ sum1[0] += in[2] * tp[2] - in[3] * tp[3];
++ sum1[1] += in[2] * tp[3] + in[3] * tp[2];
+
+- in += 4;
+- tp += 4;
+- }
++ in += 4;
++ tp += 4;
++ }
+
+- res[0] = sum0[0] + sum1[0];
+- res[1] = sum0[1] + sum1[1];
++ res[0] = sum0[0] + sum1[0];
++ res[1] = sum0[1] + sum1[1];
+
+- // Cleanup if we had an odd number of points
+- if (num_points & 1) {
+- *result += input[num_points - 1] * taps[num_points - 1];
+- }
++ // Cleanup if we had an odd number of points
++ if (num_points & 1) {
++ *result += input[num_points - 1] * taps[num_points - 1];
++ }
+ }
+
+ #endif /*LV_HAVE_GENERIC*/
+
+
+-
+ #if LV_HAVE_SSE && LV_HAVE_64
+
+-static inline void volk_32fc_x2_dot_prod_32fc_u_sse_64(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
+-
+- const unsigned int num_bytes = num_points*8;
+- unsigned int isodd = num_points & 1;
+-
+- __VOLK_ASM
+- (
+- "# ccomplex_dotprod_generic (float* result, const float *input,\n\t"
+- "# const float *taps, unsigned num_bytes)\n\t"
+- "# float sum0 = 0;\n\t"
+- "# float sum1 = 0;\n\t"
+- "# float sum2 = 0;\n\t"
+- "# float sum3 = 0;\n\t"
+- "# do {\n\t"
+- "# sum0 += input[0] * taps[0] - input[1] * taps[1];\n\t"
+- "# sum1 += input[0] * taps[1] + input[1] * taps[0];\n\t"
+- "# sum2 += input[2] * taps[2] - input[3] * taps[3];\n\t"
+- "# sum3 += input[2] * taps[3] + input[3] * taps[2];\n\t"
+- "# input += 4;\n\t"
+- "# taps += 4; \n\t"
+- "# } while (--n_2_ccomplex_blocks != 0);\n\t"
+- "# result[0] = sum0 + sum2;\n\t"
+- "# result[1] = sum1 + sum3;\n\t"
+- "# TODO: prefetch and better scheduling\n\t"
+- " xor %%r9, %%r9\n\t"
+- " xor %%r10, %%r10\n\t"
+- " movq %%rcx, %%rax\n\t"
+- " movq %%rcx, %%r8\n\t"
+- " movq %[rsi], %%r9\n\t"
+- " movq %[rdx], %%r10\n\t"
+- " xorps %%xmm6, %%xmm6 # zero accumulators\n\t"
+- " movups 0(%%r9), %%xmm0\n\t"
+- " xorps %%xmm7, %%xmm7 # zero accumulators\n\t"
+- " movups 0(%%r10), %%xmm2\n\t"
+- " shr $5, %%rax # rax = n_2_ccomplex_blocks / 2\n\t"
+- " shr $4, %%r8\n\t"
+- " jmp .%=L1_test\n\t"
+- " # 4 taps / loop\n\t"
+- " # something like ?? cycles / loop\n\t"
+- ".%=Loop1: \n\t"
+- "# complex prod: C += A * B, w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t"
+- "# movups (%%r9), %%xmmA\n\t"
+- "# movups (%%r10), %%xmmB\n\t"
+- "# movups %%xmmA, %%xmmZ\n\t"
+- "# shufps $0xb1, %%xmmZ, %%xmmZ # swap internals\n\t"
+- "# mulps %%xmmB, %%xmmA\n\t"
+- "# mulps %%xmmZ, %%xmmB\n\t"
+- "# # SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t"
+- "# xorps %%xmmPN, %%xmmA\n\t"
+- "# movups %%xmmA, %%xmmZ\n\t"
+- "# unpcklps %%xmmB, %%xmmA\n\t"
+- "# unpckhps %%xmmB, %%xmmZ\n\t"
+- "# movups %%xmmZ, %%xmmY\n\t"
+- "# shufps $0x44, %%xmmA, %%xmmZ # b01000100\n\t"
+- "# shufps $0xee, %%xmmY, %%xmmA # b11101110\n\t"
+- "# addps %%xmmZ, %%xmmA\n\t"
+- "# addps %%xmmA, %%xmmC\n\t"
+- "# A=xmm0, B=xmm2, Z=xmm4\n\t"
+- "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t"
+- " movups 16(%%r9), %%xmm1\n\t"
+- " movups %%xmm0, %%xmm4\n\t"
+- " mulps %%xmm2, %%xmm0\n\t"
+- " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t"
+- " movups 16(%%r10), %%xmm3\n\t"
+- " movups %%xmm1, %%xmm5\n\t"
+- " addps %%xmm0, %%xmm6\n\t"
+- " mulps %%xmm3, %%xmm1\n\t"
+- " shufps $0xb1, %%xmm5, %%xmm5 # swap internals\n\t"
+- " addps %%xmm1, %%xmm6\n\t"
+- " mulps %%xmm4, %%xmm2\n\t"
+- " movups 32(%%r9), %%xmm0\n\t"
+- " addps %%xmm2, %%xmm7\n\t"
+- " mulps %%xmm5, %%xmm3\n\t"
+- " add $32, %%r9\n\t"
+- " movups 32(%%r10), %%xmm2\n\t"
+- " addps %%xmm3, %%xmm7\n\t"
+- " add $32, %%r10\n\t"
+- ".%=L1_test:\n\t"
+- " dec %%rax\n\t"
+- " jge .%=Loop1\n\t"
+- " # We've handled the bulk of multiplies up to here.\n\t"
+- " # Let's sse if original n_2_ccomplex_blocks was odd.\n\t"
+- " # If so, we've got 2 more taps to do.\n\t"
+- " and $1, %%r8\n\t"
+- " je .%=Leven\n\t"
+- " # The count was odd, do 2 more taps.\n\t"
+- " # Note that we've already got mm0/mm2 preloaded\n\t"
+- " # from the main loop.\n\t"
+- " movups %%xmm0, %%xmm4\n\t"
+- " mulps %%xmm2, %%xmm0\n\t"
+- " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t"
+- " addps %%xmm0, %%xmm6\n\t"
+- " mulps %%xmm4, %%xmm2\n\t"
+- " addps %%xmm2, %%xmm7\n\t"
+- ".%=Leven:\n\t"
+- " # neg inversor\n\t"
+- " xorps %%xmm1, %%xmm1\n\t"
+- " mov $0x80000000, %%r9\n\t"
+- " movd %%r9, %%xmm1\n\t"
+- " shufps $0x11, %%xmm1, %%xmm1 # b00010001 # 0 -0 0 -0\n\t"
+- " # pfpnacc\n\t"
+- " xorps %%xmm1, %%xmm6\n\t"
+- " movups %%xmm6, %%xmm2\n\t"
+- " unpcklps %%xmm7, %%xmm6\n\t"
+- " unpckhps %%xmm7, %%xmm2\n\t"
+- " movups %%xmm2, %%xmm3\n\t"
+- " shufps $0x44, %%xmm6, %%xmm2 # b01000100\n\t"
+- " shufps $0xee, %%xmm3, %%xmm6 # b11101110\n\t"
+- " addps %%xmm2, %%xmm6\n\t"
+- " # xmm6 = r1 i2 r3 i4\n\t"
+- " movhlps %%xmm6, %%xmm4 # xmm4 = r3 i4 ?? ??\n\t"
+- " addps %%xmm4, %%xmm6 # xmm6 = r1+r3 i2+i4 ?? ??\n\t"
+- " movlps %%xmm6, (%[rdi]) # store low 2x32 bits (complex) to memory\n\t"
+- :
+- :[rsi] "r" (input), [rdx] "r" (taps), "c" (num_bytes), [rdi] "r" (result)
+- :"rax", "r8", "r9", "r10"
+- );
+-
+-
+- if(isodd) {
+- *result += input[num_points - 1] * taps[num_points - 1];
+- }
+-
+- return;
++static inline void volk_32fc_x2_dot_prod_32fc_u_sse_64(lv_32fc_t* result,
++ const lv_32fc_t* input,
++ const lv_32fc_t* taps,
++ unsigned int num_points)
++{
++
++ const unsigned int num_bytes = num_points * 8;
++ unsigned int isodd = num_points & 1;
++
++ __VOLK_ASM(
++ "# ccomplex_dotprod_generic (float* result, const float *input,\n\t"
++ "# const float *taps, unsigned num_bytes)\n\t"
++ "# float sum0 = 0;\n\t"
++ "# float sum1 = 0;\n\t"
++ "# float sum2 = 0;\n\t"
++ "# float sum3 = 0;\n\t"
++ "# do {\n\t"
++ "# sum0 += input[0] * taps[0] - input[1] * taps[1];\n\t"
++ "# sum1 += input[0] * taps[1] + input[1] * taps[0];\n\t"
++ "# sum2 += input[2] * taps[2] - input[3] * taps[3];\n\t"
++ "# sum3 += input[2] * taps[3] + input[3] * taps[2];\n\t"
++ "# input += 4;\n\t"
++ "# taps += 4; \n\t"
++ "# } while (--n_2_ccomplex_blocks != 0);\n\t"
++ "# result[0] = sum0 + sum2;\n\t"
++ "# result[1] = sum1 + sum3;\n\t"
++ "# TODO: prefetch and better scheduling\n\t"
++ " xor %%r9, %%r9\n\t"
++ " xor %%r10, %%r10\n\t"
++ " movq %%rcx, %%rax\n\t"
++ " movq %%rcx, %%r8\n\t"
++ " movq %[rsi], %%r9\n\t"
++ " movq %[rdx], %%r10\n\t"
++ " xorps %%xmm6, %%xmm6 # zero accumulators\n\t"
++ " movups 0(%%r9), %%xmm0\n\t"
++ " xorps %%xmm7, %%xmm7 # zero accumulators\n\t"
++ " movups 0(%%r10), %%xmm2\n\t"
++ " shr $5, %%rax # rax = n_2_ccomplex_blocks / 2\n\t"
++ " shr $4, %%r8\n\t"
++ " jmp .%=L1_test\n\t"
++ " # 4 taps / loop\n\t"
++ " # something like ?? cycles / loop\n\t"
++ ".%=Loop1: \n\t"
++ "# complex prod: C += A * B, w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t"
++ "# movups (%%r9), %%xmmA\n\t"
++ "# movups (%%r10), %%xmmB\n\t"
++ "# movups %%xmmA, %%xmmZ\n\t"
++ "# shufps $0xb1, %%xmmZ, %%xmmZ # swap internals\n\t"
++ "# mulps %%xmmB, %%xmmA\n\t"
++ "# mulps %%xmmZ, %%xmmB\n\t"
++ "# # SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t"
++ "# xorps %%xmmPN, %%xmmA\n\t"
++ "# movups %%xmmA, %%xmmZ\n\t"
++ "# unpcklps %%xmmB, %%xmmA\n\t"
++ "# unpckhps %%xmmB, %%xmmZ\n\t"
++ "# movups %%xmmZ, %%xmmY\n\t"
++ "# shufps $0x44, %%xmmA, %%xmmZ # b01000100\n\t"
++ "# shufps $0xee, %%xmmY, %%xmmA # b11101110\n\t"
++ "# addps %%xmmZ, %%xmmA\n\t"
++ "# addps %%xmmA, %%xmmC\n\t"
++ "# A=xmm0, B=xmm2, Z=xmm4\n\t"
++ "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t"
++ " movups 16(%%r9), %%xmm1\n\t"
++ " movups %%xmm0, %%xmm4\n\t"
++ " mulps %%xmm2, %%xmm0\n\t"
++ " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t"
++ " movups 16(%%r10), %%xmm3\n\t"
++ " movups %%xmm1, %%xmm5\n\t"
++ " addps %%xmm0, %%xmm6\n\t"
++ " mulps %%xmm3, %%xmm1\n\t"
++ " shufps $0xb1, %%xmm5, %%xmm5 # swap internals\n\t"
++ " addps %%xmm1, %%xmm6\n\t"
++ " mulps %%xmm4, %%xmm2\n\t"
++ " movups 32(%%r9), %%xmm0\n\t"
++ " addps %%xmm2, %%xmm7\n\t"
++ " mulps %%xmm5, %%xmm3\n\t"
++ " add $32, %%r9\n\t"
++ " movups 32(%%r10), %%xmm2\n\t"
++ " addps %%xmm3, %%xmm7\n\t"
++ " add $32, %%r10\n\t"
++ ".%=L1_test:\n\t"
++ " dec %%rax\n\t"
++ " jge .%=Loop1\n\t"
++ " # We've handled the bulk of multiplies up to here.\n\t"
++ " # Let's sse if original n_2_ccomplex_blocks was odd.\n\t"
++ " # If so, we've got 2 more taps to do.\n\t"
++ " and $1, %%r8\n\t"
++ " je .%=Leven\n\t"
++ " # The count was odd, do 2 more taps.\n\t"
++ " # Note that we've already got mm0/mm2 preloaded\n\t"
++ " # from the main loop.\n\t"
++ " movups %%xmm0, %%xmm4\n\t"
++ " mulps %%xmm2, %%xmm0\n\t"
++ " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t"
++ " addps %%xmm0, %%xmm6\n\t"
++ " mulps %%xmm4, %%xmm2\n\t"
++ " addps %%xmm2, %%xmm7\n\t"
++ ".%=Leven:\n\t"
++ " # neg inversor\n\t"
++ " xorps %%xmm1, %%xmm1\n\t"
++ " mov $0x80000000, %%r9\n\t"
++ " movd %%r9, %%xmm1\n\t"
++ " shufps $0x11, %%xmm1, %%xmm1 # b00010001 # 0 -0 0 -0\n\t"
++ " # pfpnacc\n\t"
++ " xorps %%xmm1, %%xmm6\n\t"
++ " movups %%xmm6, %%xmm2\n\t"
++ " unpcklps %%xmm7, %%xmm6\n\t"
++ " unpckhps %%xmm7, %%xmm2\n\t"
++ " movups %%xmm2, %%xmm3\n\t"
++ " shufps $0x44, %%xmm6, %%xmm2 # b01000100\n\t"
++ " shufps $0xee, %%xmm3, %%xmm6 # b11101110\n\t"
++ " addps %%xmm2, %%xmm6\n\t"
++ " # xmm6 = r1 i2 r3 i4\n\t"
++ " movhlps %%xmm6, %%xmm4 # xmm4 = r3 i4 ?? ??\n\t"
++ " addps %%xmm4, %%xmm6 # xmm6 = r1+r3 i2+i4 ?? ??\n\t"
++ " movlps %%xmm6, (%[rdi]) # store low 2x32 bits (complex) "
++ "to memory\n\t"
++ :
++ : [rsi] "r"(input), [rdx] "r"(taps), "c"(num_bytes), [rdi] "r"(result)
++ : "rax", "r8", "r9", "r10");
++
++
++ if (isodd) {
++ *result += input[num_points - 1] * taps[num_points - 1];
++ }
+
++ return;
+ }
+
+ #endif /* LV_HAVE_SSE && LV_HAVE_64 */
+
+
+-
+-
+ #ifdef LV_HAVE_SSE3
+
+ #include <pmmintrin.h>
+
+-static inline void volk_32fc_x2_dot_prod_32fc_u_sse3(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
++static inline void volk_32fc_x2_dot_prod_32fc_u_sse3(lv_32fc_t* result,
++ const lv_32fc_t* input,
++ const lv_32fc_t* taps,
++ unsigned int num_points)
++{
+
+- lv_32fc_t dotProduct;
+- memset(&dotProduct, 0x0, 2*sizeof(float));
++ lv_32fc_t dotProduct;
++ memset(&dotProduct, 0x0, 2 * sizeof(float));
+
+- unsigned int number = 0;
+- const unsigned int halfPoints = num_points/2;
+- unsigned int isodd = num_points & 1;
++ unsigned int number = 0;
++ const unsigned int halfPoints = num_points / 2;
++ unsigned int isodd = num_points & 1;
+
+- __m128 x, y, yl, yh, z, tmp1, tmp2, dotProdVal;
++ __m128 x, y, yl, yh, z, tmp1, tmp2, dotProdVal;
+
+- const lv_32fc_t* a = input;
+- const lv_32fc_t* b = taps;
++ const lv_32fc_t* a = input;
++ const lv_32fc_t* b = taps;
+
+- dotProdVal = _mm_setzero_ps();
++ dotProdVal = _mm_setzero_ps();
+
+- for(;number < halfPoints; number++){
++ for (; number < halfPoints; number++) {
+
+- x = _mm_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
+- y = _mm_loadu_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di
++ x = _mm_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
++ y = _mm_loadu_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di
+
+- yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr
+- yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di
++ yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr
++ yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di
+
+- tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
++ tmp1 = _mm_mul_ps(x, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
+
+- x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br
++ x = _mm_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br
+
+- tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
++ tmp2 = _mm_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
+
+- z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
++ z = _mm_addsub_ps(tmp1,
++ tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
+
+- dotProdVal = _mm_add_ps(dotProdVal, z); // Add the complex multiplication results together
++ dotProdVal =
++ _mm_add_ps(dotProdVal, z); // Add the complex multiplication results together
+
+- a += 2;
+- b += 2;
+- }
++ a += 2;
++ b += 2;
++ }
+
+- __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector[2];
++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector[2];
+
+- _mm_storeu_ps((float*)dotProductVector,dotProdVal); // Store the results back into the dot product vector
++ _mm_storeu_ps((float*)dotProductVector,
++ dotProdVal); // Store the results back into the dot product vector
+
+- dotProduct += ( dotProductVector[0] + dotProductVector[1] );
++ dotProduct += (dotProductVector[0] + dotProductVector[1]);
+
+- if(isodd) {
+- dotProduct += input[num_points - 1] * taps[num_points - 1];
+- }
++ if (isodd) {
++ dotProduct += input[num_points - 1] * taps[num_points - 1];
++ }
+
+- *result = dotProduct;
++ *result = dotProduct;
+ }
+
+ #endif /*LV_HAVE_SSE3*/
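The lane gymnastics in the SSE3 kernel (_mm_moveldup_ps, _mm_movehdup_ps, the 0xB1 shuffle and _mm_addsub_ps) are simply a vectorized complex multiply. For one input/taps pair the two temporaries hold (ar*cr, ai*cr) and (ai*ci, ar*ci), and _mm_addsub_ps subtracts in the even lane and adds in the odd lane, giving

\[
(a_r + j a_i)(c_r + j c_i) \;=\; (a_r c_r - a_i c_i) \;+\; j\,(a_i c_r + a_r c_i),
\]

exactly the products the inline comments describe; the accumulator then sums two such complex products per loop iteration.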
+@@ -296,78 +306,82 @@ static inline void volk_32fc_x2_dot_prod_32fc_u_sse3(lv_32fc_t* result, const lv
+
+ #include <smmintrin.h>
+
+-static inline void volk_32fc_x2_dot_prod_32fc_u_sse4_1(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
++static inline void volk_32fc_x2_dot_prod_32fc_u_sse4_1(lv_32fc_t* result,
++ const lv_32fc_t* input,
++ const lv_32fc_t* taps,
++ unsigned int num_points)
++{
+
+- unsigned int i = 0;
+- const unsigned int qtr_points = num_points/4;
+- const unsigned int isodd = num_points & 3;
++ unsigned int i = 0;
++ const unsigned int qtr_points = num_points / 4;
++ const unsigned int isodd = num_points & 3;
+
+- __m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, real0, real1, im0, im1;
+- float *p_input, *p_taps;
+- __m64 *p_result;
++ __m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, real0, real1, im0, im1;
++ float *p_input, *p_taps;
++ __m64* p_result;
+
+- p_result = (__m64*)result;
+- p_input = (float*)input;
+- p_taps = (float*)taps;
++ p_result = (__m64*)result;
++ p_input = (float*)input;
++ p_taps = (float*)taps;
+
+- static const __m128i neg = {0x000000000000000080000000};
++ static const __m128i neg = { 0x000000000000000080000000 };
+
+- real0 = _mm_setzero_ps();
+- real1 = _mm_setzero_ps();
+- im0 = _mm_setzero_ps();
+- im1 = _mm_setzero_ps();
++ real0 = _mm_setzero_ps();
++ real1 = _mm_setzero_ps();
++ im0 = _mm_setzero_ps();
++ im1 = _mm_setzero_ps();
+
+- for(; i < qtr_points; ++i) {
+- xmm0 = _mm_loadu_ps(p_input);
+- xmm1 = _mm_loadu_ps(p_taps);
++ for (; i < qtr_points; ++i) {
++ xmm0 = _mm_loadu_ps(p_input);
++ xmm1 = _mm_loadu_ps(p_taps);
+
+- p_input += 4;
+- p_taps += 4;
++ p_input += 4;
++ p_taps += 4;
+
+- xmm2 = _mm_loadu_ps(p_input);
+- xmm3 = _mm_loadu_ps(p_taps);
++ xmm2 = _mm_loadu_ps(p_input);
++ xmm3 = _mm_loadu_ps(p_taps);
+
+- p_input += 4;
+- p_taps += 4;
++ p_input += 4;
++ p_taps += 4;
+
+- xmm4 = _mm_unpackhi_ps(xmm0, xmm2);
+- xmm5 = _mm_unpackhi_ps(xmm1, xmm3);
+- xmm0 = _mm_unpacklo_ps(xmm0, xmm2);
+- xmm2 = _mm_unpacklo_ps(xmm1, xmm3);
++ xmm4 = _mm_unpackhi_ps(xmm0, xmm2);
++ xmm5 = _mm_unpackhi_ps(xmm1, xmm3);
++ xmm0 = _mm_unpacklo_ps(xmm0, xmm2);
++ xmm2 = _mm_unpacklo_ps(xmm1, xmm3);
+
+- //imaginary vector from input
+- xmm1 = _mm_unpackhi_ps(xmm0, xmm4);
+- //real vector from input
+- xmm3 = _mm_unpacklo_ps(xmm0, xmm4);
+- //imaginary vector from taps
+- xmm0 = _mm_unpackhi_ps(xmm2, xmm5);
+- //real vector from taps
+- xmm2 = _mm_unpacklo_ps(xmm2, xmm5);
++ // imaginary vector from input
++ xmm1 = _mm_unpackhi_ps(xmm0, xmm4);
++ // real vector from input
++ xmm3 = _mm_unpacklo_ps(xmm0, xmm4);
++ // imaginary vector from taps
++ xmm0 = _mm_unpackhi_ps(xmm2, xmm5);
++ // real vector from taps
++ xmm2 = _mm_unpacklo_ps(xmm2, xmm5);
+
+- xmm4 = _mm_dp_ps(xmm3, xmm2, 0xf1);
+- xmm5 = _mm_dp_ps(xmm1, xmm0, 0xf1);
++ xmm4 = _mm_dp_ps(xmm3, xmm2, 0xf1);
++ xmm5 = _mm_dp_ps(xmm1, xmm0, 0xf1);
+
+- xmm6 = _mm_dp_ps(xmm3, xmm0, 0xf2);
+- xmm7 = _mm_dp_ps(xmm1, xmm2, 0xf2);
++ xmm6 = _mm_dp_ps(xmm3, xmm0, 0xf2);
++ xmm7 = _mm_dp_ps(xmm1, xmm2, 0xf2);
+
+- real0 = _mm_add_ps(xmm4, real0);
+- real1 = _mm_add_ps(xmm5, real1);
+- im0 = _mm_add_ps(xmm6, im0);
+- im1 = _mm_add_ps(xmm7, im1);
+- }
++ real0 = _mm_add_ps(xmm4, real0);
++ real1 = _mm_add_ps(xmm5, real1);
++ im0 = _mm_add_ps(xmm6, im0);
++ im1 = _mm_add_ps(xmm7, im1);
++ }
+
+- real1 = _mm_xor_ps(real1, bit128_p(&neg)->float_vec);
++ real1 = _mm_xor_ps(real1, bit128_p(&neg)->float_vec);
+
+- im0 = _mm_add_ps(im0, im1);
+- real0 = _mm_add_ps(real0, real1);
++ im0 = _mm_add_ps(im0, im1);
++ real0 = _mm_add_ps(real0, real1);
+
+- im0 = _mm_add_ps(im0, real0);
++ im0 = _mm_add_ps(im0, real0);
+
+- _mm_storel_pi(p_result, im0);
++ _mm_storel_pi(p_result, im0);
+
+- for(i = num_points-isodd; i < num_points; i++) {
+- *result += input[i] * taps[i];
+- }
++ for (i = num_points - isodd; i < num_points; i++) {
++ *result += input[i] * taps[i];
++ }
+ }
+
+ #endif /*LV_HAVE_SSE4_1*/
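The SSE4.1 variant takes a different route: the unpack sequence de-interleaves four complex samples into separate real and imaginary vectors, and _mm_dp_ps with masks 0xf1/0xf2 forms four-wide dot products that land in lane 0 (the two real-part contributions) and lane 1 (the two imaginary-part contributions). The neg constant is there to flip the sign of the accumulated sum of ai*ci, so that after the final adds lane 0 holds the sum of (ar*cr - ai*ci), lane 1 holds the sum of (ar*ci + ai*cr), and _mm_storel_pi writes the pair out as the complex result; this reading follows the real/imaginary lane comments in the loop body.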
+@@ -376,55 +390,63 @@ static inline void volk_32fc_x2_dot_prod_32fc_u_sse4_1(lv_32fc_t* result, const
+
+ #include <immintrin.h>
+
+-static inline void volk_32fc_x2_dot_prod_32fc_u_avx(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
++static inline void volk_32fc_x2_dot_prod_32fc_u_avx(lv_32fc_t* result,
++ const lv_32fc_t* input,
++ const lv_32fc_t* taps,
++ unsigned int num_points)
++{
+
+- unsigned int isodd = num_points & 3;
+- unsigned int i = 0;
+- lv_32fc_t dotProduct;
+- memset(&dotProduct, 0x0, 2*sizeof(float));
++ unsigned int isodd = num_points & 3;
++ unsigned int i = 0;
++ lv_32fc_t dotProduct;
++ memset(&dotProduct, 0x0, 2 * sizeof(float));
+
+- unsigned int number = 0;
+- const unsigned int quarterPoints = num_points / 4;
++ unsigned int number = 0;
++ const unsigned int quarterPoints = num_points / 4;
+
+- __m256 x, y, yl, yh, z, tmp1, tmp2, dotProdVal;
++ __m256 x, y, yl, yh, z, tmp1, tmp2, dotProdVal;
+
+- const lv_32fc_t* a = input;
+- const lv_32fc_t* b = taps;
++ const lv_32fc_t* a = input;
++ const lv_32fc_t* b = taps;
+
+- dotProdVal = _mm256_setzero_ps();
++ dotProdVal = _mm256_setzero_ps();
+
+- for(;number < quarterPoints; number++){
+- x = _mm256_loadu_ps((float*)a); // Load a,b,e,f as ar,ai,br,bi,er,ei,fr,fi
+- y = _mm256_loadu_ps((float*)b); // Load c,d,g,h as cr,ci,dr,di,gr,gi,hr,hi
++ for (; number < quarterPoints; number++) {
++ x = _mm256_loadu_ps((float*)a); // Load a,b,e,f as ar,ai,br,bi,er,ei,fr,fi
++ y = _mm256_loadu_ps((float*)b); // Load c,d,g,h as cr,ci,dr,di,gr,gi,hr,hi
+
+- yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr,gr,gr,hr,hr
+- yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di,gi,gi,hi,hi
++ yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr,gr,gr,hr,hr
++ yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di,gi,gi,hi,hi
+
+- tmp1 = _mm256_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr ...
++ tmp1 = _mm256_mul_ps(x, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr ...
+
+- x = _mm256_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br,ei,er,fi,fr
++ x = _mm256_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br,ei,er,fi,fr
+
+- tmp2 = _mm256_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di ...
++ tmp2 = _mm256_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di ...
+
+- z = _mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
++ z = _mm256_addsub_ps(tmp1,
++ tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
+
+- dotProdVal = _mm256_add_ps(dotProdVal, z); // Add the complex multiplication results together
++ dotProdVal = _mm256_add_ps(dotProdVal,
++ z); // Add the complex multiplication results together
+
+- a += 4;
+- b += 4;
+- }
++ a += 4;
++ b += 4;
++ }
+
+- __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector[4];
++ __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector[4];
+
+- _mm256_storeu_ps((float*)dotProductVector,dotProdVal); // Store the results back into the dot product vector
++ _mm256_storeu_ps((float*)dotProductVector,
++ dotProdVal); // Store the results back into the dot product vector
+
+- dotProduct += ( dotProductVector[0] + dotProductVector[1] + dotProductVector[2] + dotProductVector[3]);
++ dotProduct += (dotProductVector[0] + dotProductVector[1] + dotProductVector[2] +
++ dotProductVector[3]);
+
+- for(i = num_points-isodd; i < num_points; i++) {
+- dotProduct += input[i] * taps[i];
+- }
++ for (i = num_points - isodd; i < num_points; i++) {
++ dotProduct += input[i] * taps[i];
++ }
+
+- *result = dotProduct;
++ *result = dotProduct;
+ }
+
+ #endif /*LV_HAVE_AVX*/
+@@ -432,56 +454,64 @@ static inline void volk_32fc_x2_dot_prod_32fc_u_avx(lv_32fc_t* result, const lv_
+ #if LV_HAVE_AVX && LV_HAVE_FMA
+ #include <immintrin.h>
+
+-static inline void volk_32fc_x2_dot_prod_32fc_u_avx_fma(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
++static inline void volk_32fc_x2_dot_prod_32fc_u_avx_fma(lv_32fc_t* result,
++ const lv_32fc_t* input,
++ const lv_32fc_t* taps,
++ unsigned int num_points)
++{
+
+- unsigned int isodd = num_points & 3;
+- unsigned int i = 0;
+- lv_32fc_t dotProduct;
+- memset(&dotProduct, 0x0, 2*sizeof(float));
++ unsigned int isodd = num_points & 3;
++ unsigned int i = 0;
++ lv_32fc_t dotProduct;
++ memset(&dotProduct, 0x0, 2 * sizeof(float));
+
+- unsigned int number = 0;
+- const unsigned int quarterPoints = num_points / 4;
++ unsigned int number = 0;
++ const unsigned int quarterPoints = num_points / 4;
+
+- __m256 x, y, yl, yh, z, tmp1, tmp2, dotProdVal;
++ __m256 x, y, yl, yh, z, tmp1, tmp2, dotProdVal;
+
+- const lv_32fc_t* a = input;
+- const lv_32fc_t* b = taps;
++ const lv_32fc_t* a = input;
++ const lv_32fc_t* b = taps;
+
+- dotProdVal = _mm256_setzero_ps();
++ dotProdVal = _mm256_setzero_ps();
+
+- for(;number < quarterPoints; number++){
++ for (; number < quarterPoints; number++) {
+
+- x = _mm256_loadu_ps((float*)a); // Load a,b,e,f as ar,ai,br,bi,er,ei,fr,fi
+- y = _mm256_loadu_ps((float*)b); // Load c,d,g,h as cr,ci,dr,di,gr,gi,hr,hi
++ x = _mm256_loadu_ps((float*)a); // Load a,b,e,f as ar,ai,br,bi,er,ei,fr,fi
++ y = _mm256_loadu_ps((float*)b); // Load c,d,g,h as cr,ci,dr,di,gr,gi,hr,hi
+
+- yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr,gr,gr,hr,hr
+- yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di,gi,gi,hi,hi
++ yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr,gr,gr,hr,hr
++ yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di,gi,gi,hi,hi
+
+- tmp1 = x;
++ tmp1 = x;
+
+- x = _mm256_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br,ei,er,fi,fr
++ x = _mm256_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br,ei,er,fi,fr
+
+- tmp2 = _mm256_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di ...
++ tmp2 = _mm256_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di ...
+
+- z = _mm256_fmaddsub_ps(tmp1, yl,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
++ z = _mm256_fmaddsub_ps(
++ tmp1, yl, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
+
+- dotProdVal = _mm256_add_ps(dotProdVal, z); // Add the complex multiplication results together
++ dotProdVal = _mm256_add_ps(dotProdVal,
++ z); // Add the complex multiplication results together
+
+- a += 4;
+- b += 4;
+- }
++ a += 4;
++ b += 4;
++ }
+
+- __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector[4];
++ __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector[4];
+
+- _mm256_storeu_ps((float*)dotProductVector,dotProdVal); // Store the results back into the dot product vector
++ _mm256_storeu_ps((float*)dotProductVector,
++ dotProdVal); // Store the results back into the dot product vector
+
+- dotProduct += ( dotProductVector[0] + dotProductVector[1] + dotProductVector[2] + dotProductVector[3]);
++ dotProduct += (dotProductVector[0] + dotProductVector[1] + dotProductVector[2] +
++ dotProductVector[3]);
+
+- for(i = num_points-isodd; i < num_points; i++) {
+- dotProduct += input[i] * taps[i];
+- }
++ for (i = num_points - isodd; i < num_points; i++) {
++ dotProduct += input[i] * taps[i];
++ }
+
+- *result = dotProduct;
++ *result = dotProduct;
+ }
+
+ #endif /*LV_HAVE_AVX && LV_HAVE_FMA*/
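The FMA variant computes the same lanes as the plain AVX kernel above, but fuses the multiply by yl with the add/subtract: _mm256_fmaddsub_ps(tmp1, yl, tmp2) evaluates tmp1*yl - tmp2 in the even lanes and tmp1*yl + tmp2 in the odd lanes in a single instruction,

\[
z_k = \begin{cases} a_k b_k - c_k & k \text{ even,} \\ a_k b_k + c_k & k \text{ odd,} \end{cases}
\]

so it saves one intermediate rounding step and one instruction per iteration compared with the separate _mm256_mul_ps / _mm256_addsub_ps pair.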
+@@ -491,44 +521,48 @@ static inline void volk_32fc_x2_dot_prod_32fc_u_avx_fma(lv_32fc_t* result, const
+ #ifndef INCLUDED_volk_32fc_x2_dot_prod_32fc_a_H
+ #define INCLUDED_volk_32fc_x2_dot_prod_32fc_a_H
+
+-#include <volk/volk_common.h>
+-#include <volk/volk_complex.h>
+ #include <stdio.h>
+ #include <string.h>
++#include <volk/volk_common.h>
++#include <volk/volk_complex.h>
+
+
+ #ifdef LV_HAVE_GENERIC
+
+
+-static inline void volk_32fc_x2_dot_prod_32fc_a_generic(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
++static inline void volk_32fc_x2_dot_prod_32fc_a_generic(lv_32fc_t* result,
++ const lv_32fc_t* input,
++ const lv_32fc_t* taps,
++ unsigned int num_points)
++{
+
+- const unsigned int num_bytes = num_points*8;
++ const unsigned int num_bytes = num_points * 8;
+
+- float * res = (float*) result;
+- float * in = (float*) input;
+- float * tp = (float*) taps;
+- unsigned int n_2_ccomplex_blocks = num_bytes >> 4;
++ float* res = (float*)result;
++ float* in = (float*)input;
++ float* tp = (float*)taps;
++ unsigned int n_2_ccomplex_blocks = num_bytes >> 4;
+
+- float sum0[2] = {0,0};
+- float sum1[2] = {0,0};
+- unsigned int i = 0;
++ float sum0[2] = { 0, 0 };
++ float sum1[2] = { 0, 0 };
++ unsigned int i = 0;
+
+- for(i = 0; i < n_2_ccomplex_blocks; ++i) {
+- sum0[0] += in[0] * tp[0] - in[1] * tp[1];
+- sum0[1] += in[0] * tp[1] + in[1] * tp[0];
+- sum1[0] += in[2] * tp[2] - in[3] * tp[3];
+- sum1[1] += in[2] * tp[3] + in[3] * tp[2];
++ for (i = 0; i < n_2_ccomplex_blocks; ++i) {
++ sum0[0] += in[0] * tp[0] - in[1] * tp[1];
++ sum0[1] += in[0] * tp[1] + in[1] * tp[0];
++ sum1[0] += in[2] * tp[2] - in[3] * tp[3];
++ sum1[1] += in[2] * tp[3] + in[3] * tp[2];
+
+- in += 4;
+- tp += 4;
+- }
++ in += 4;
++ tp += 4;
++ }
+
+- res[0] = sum0[0] + sum1[0];
+- res[1] = sum0[1] + sum1[1];
++ res[0] = sum0[0] + sum1[0];
++ res[1] = sum0[1] + sum1[1];
+
+- if (num_points & 1) {
+- *result += input[num_points - 1] * taps[num_points - 1];
+- }
++ if (num_points & 1) {
++ *result += input[num_points - 1] * taps[num_points - 1];
++ }
+ }
+
+ #endif /*LV_HAVE_GENERIC*/
+@@ -537,140 +571,146 @@ static inline void volk_32fc_x2_dot_prod_32fc_a_generic(lv_32fc_t* result, const
+ #if LV_HAVE_SSE && LV_HAVE_64
+
+
+-static inline void volk_32fc_x2_dot_prod_32fc_a_sse_64(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
+-
+- const unsigned int num_bytes = num_points*8;
+- unsigned int isodd = num_points & 1;
+-
+- __VOLK_ASM
+- (
+- "# ccomplex_dotprod_generic (float* result, const float *input,\n\t"
+- "# const float *taps, unsigned num_bytes)\n\t"
+- "# float sum0 = 0;\n\t"
+- "# float sum1 = 0;\n\t"
+- "# float sum2 = 0;\n\t"
+- "# float sum3 = 0;\n\t"
+- "# do {\n\t"
+- "# sum0 += input[0] * taps[0] - input[1] * taps[1];\n\t"
+- "# sum1 += input[0] * taps[1] + input[1] * taps[0];\n\t"
+- "# sum2 += input[2] * taps[2] - input[3] * taps[3];\n\t"
+- "# sum3 += input[2] * taps[3] + input[3] * taps[2];\n\t"
+- "# input += 4;\n\t"
+- "# taps += 4; \n\t"
+- "# } while (--n_2_ccomplex_blocks != 0);\n\t"
+- "# result[0] = sum0 + sum2;\n\t"
+- "# result[1] = sum1 + sum3;\n\t"
+- "# TODO: prefetch and better scheduling\n\t"
+- " xor %%r9, %%r9\n\t"
+- " xor %%r10, %%r10\n\t"
+- " movq %%rcx, %%rax\n\t"
+- " movq %%rcx, %%r8\n\t"
+- " movq %[rsi], %%r9\n\t"
+- " movq %[rdx], %%r10\n\t"
+- " xorps %%xmm6, %%xmm6 # zero accumulators\n\t"
+- " movaps 0(%%r9), %%xmm0\n\t"
+- " xorps %%xmm7, %%xmm7 # zero accumulators\n\t"
+- " movaps 0(%%r10), %%xmm2\n\t"
+- " shr $5, %%rax # rax = n_2_ccomplex_blocks / 2\n\t"
+- " shr $4, %%r8\n\t"
+- " jmp .%=L1_test\n\t"
+- " # 4 taps / loop\n\t"
+- " # something like ?? cycles / loop\n\t"
+- ".%=Loop1: \n\t"
+- "# complex prod: C += A * B, w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t"
+- "# movaps (%%r9), %%xmmA\n\t"
+- "# movaps (%%r10), %%xmmB\n\t"
+- "# movaps %%xmmA, %%xmmZ\n\t"
+- "# shufps $0xb1, %%xmmZ, %%xmmZ # swap internals\n\t"
+- "# mulps %%xmmB, %%xmmA\n\t"
+- "# mulps %%xmmZ, %%xmmB\n\t"
+- "# # SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t"
+- "# xorps %%xmmPN, %%xmmA\n\t"
+- "# movaps %%xmmA, %%xmmZ\n\t"
+- "# unpcklps %%xmmB, %%xmmA\n\t"
+- "# unpckhps %%xmmB, %%xmmZ\n\t"
+- "# movaps %%xmmZ, %%xmmY\n\t"
+- "# shufps $0x44, %%xmmA, %%xmmZ # b01000100\n\t"
+- "# shufps $0xee, %%xmmY, %%xmmA # b11101110\n\t"
+- "# addps %%xmmZ, %%xmmA\n\t"
+- "# addps %%xmmA, %%xmmC\n\t"
+- "# A=xmm0, B=xmm2, Z=xmm4\n\t"
+- "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t"
+- " movaps 16(%%r9), %%xmm1\n\t"
+- " movaps %%xmm0, %%xmm4\n\t"
+- " mulps %%xmm2, %%xmm0\n\t"
+- " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t"
+- " movaps 16(%%r10), %%xmm3\n\t"
+- " movaps %%xmm1, %%xmm5\n\t"
+- " addps %%xmm0, %%xmm6\n\t"
+- " mulps %%xmm3, %%xmm1\n\t"
+- " shufps $0xb1, %%xmm5, %%xmm5 # swap internals\n\t"
+- " addps %%xmm1, %%xmm6\n\t"
+- " mulps %%xmm4, %%xmm2\n\t"
+- " movaps 32(%%r9), %%xmm0\n\t"
+- " addps %%xmm2, %%xmm7\n\t"
+- " mulps %%xmm5, %%xmm3\n\t"
+- " add $32, %%r9\n\t"
+- " movaps 32(%%r10), %%xmm2\n\t"
+- " addps %%xmm3, %%xmm7\n\t"
+- " add $32, %%r10\n\t"
+- ".%=L1_test:\n\t"
+- " dec %%rax\n\t"
+- " jge .%=Loop1\n\t"
+- " # We've handled the bulk of multiplies up to here.\n\t"
+- " # Let's sse if original n_2_ccomplex_blocks was odd.\n\t"
+- " # If so, we've got 2 more taps to do.\n\t"
+- " and $1, %%r8\n\t"
+- " je .%=Leven\n\t"
+- " # The count was odd, do 2 more taps.\n\t"
+- " # Note that we've already got mm0/mm2 preloaded\n\t"
+- " # from the main loop.\n\t"
+- " movaps %%xmm0, %%xmm4\n\t"
+- " mulps %%xmm2, %%xmm0\n\t"
+- " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t"
+- " addps %%xmm0, %%xmm6\n\t"
+- " mulps %%xmm4, %%xmm2\n\t"
+- " addps %%xmm2, %%xmm7\n\t"
+- ".%=Leven:\n\t"
+- " # neg inversor\n\t"
+- " xorps %%xmm1, %%xmm1\n\t"
+- " mov $0x80000000, %%r9\n\t"
+- " movd %%r9, %%xmm1\n\t"
+- " shufps $0x11, %%xmm1, %%xmm1 # b00010001 # 0 -0 0 -0\n\t"
+- " # pfpnacc\n\t"
+- " xorps %%xmm1, %%xmm6\n\t"
+- " movaps %%xmm6, %%xmm2\n\t"
+- " unpcklps %%xmm7, %%xmm6\n\t"
+- " unpckhps %%xmm7, %%xmm2\n\t"
+- " movaps %%xmm2, %%xmm3\n\t"
+- " shufps $0x44, %%xmm6, %%xmm2 # b01000100\n\t"
+- " shufps $0xee, %%xmm3, %%xmm6 # b11101110\n\t"
+- " addps %%xmm2, %%xmm6\n\t"
+- " # xmm6 = r1 i2 r3 i4\n\t"
+- " movhlps %%xmm6, %%xmm4 # xmm4 = r3 i4 ?? ??\n\t"
+- " addps %%xmm4, %%xmm6 # xmm6 = r1+r3 i2+i4 ?? ??\n\t"
+- " movlps %%xmm6, (%[rdi]) # store low 2x32 bits (complex) to memory\n\t"
+- :
+- :[rsi] "r" (input), [rdx] "r" (taps), "c" (num_bytes), [rdi] "r" (result)
+- :"rax", "r8", "r9", "r10"
+- );
+-
+-
+- if(isodd) {
+- *result += input[num_points - 1] * taps[num_points - 1];
+- }
+-
+- return;
++static inline void volk_32fc_x2_dot_prod_32fc_a_sse_64(lv_32fc_t* result,
++ const lv_32fc_t* input,
++ const lv_32fc_t* taps,
++ unsigned int num_points)
++{
++
++ const unsigned int num_bytes = num_points * 8;
++ unsigned int isodd = num_points & 1;
++
++ __VOLK_ASM(
++ "# ccomplex_dotprod_generic (float* result, const float *input,\n\t"
++ "# const float *taps, unsigned num_bytes)\n\t"
++ "# float sum0 = 0;\n\t"
++ "# float sum1 = 0;\n\t"
++ "# float sum2 = 0;\n\t"
++ "# float sum3 = 0;\n\t"
++ "# do {\n\t"
++ "# sum0 += input[0] * taps[0] - input[1] * taps[1];\n\t"
++ "# sum1 += input[0] * taps[1] + input[1] * taps[0];\n\t"
++ "# sum2 += input[2] * taps[2] - input[3] * taps[3];\n\t"
++ "# sum3 += input[2] * taps[3] + input[3] * taps[2];\n\t"
++ "# input += 4;\n\t"
++ "# taps += 4; \n\t"
++ "# } while (--n_2_ccomplex_blocks != 0);\n\t"
++ "# result[0] = sum0 + sum2;\n\t"
++ "# result[1] = sum1 + sum3;\n\t"
++ "# TODO: prefetch and better scheduling\n\t"
++ " xor %%r9, %%r9\n\t"
++ " xor %%r10, %%r10\n\t"
++ " movq %%rcx, %%rax\n\t"
++ " movq %%rcx, %%r8\n\t"
++ " movq %[rsi], %%r9\n\t"
++ " movq %[rdx], %%r10\n\t"
++ " xorps %%xmm6, %%xmm6 # zero accumulators\n\t"
++ " movaps 0(%%r9), %%xmm0\n\t"
++ " xorps %%xmm7, %%xmm7 # zero accumulators\n\t"
++ " movaps 0(%%r10), %%xmm2\n\t"
++ " shr $5, %%rax # rax = n_2_ccomplex_blocks / 2\n\t"
++ " shr $4, %%r8\n\t"
++ " jmp .%=L1_test\n\t"
++ " # 4 taps / loop\n\t"
++ " # something like ?? cycles / loop\n\t"
++ ".%=Loop1: \n\t"
++ "# complex prod: C += A * B, w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t"
++ "# movaps (%%r9), %%xmmA\n\t"
++ "# movaps (%%r10), %%xmmB\n\t"
++ "# movaps %%xmmA, %%xmmZ\n\t"
++ "# shufps $0xb1, %%xmmZ, %%xmmZ # swap internals\n\t"
++ "# mulps %%xmmB, %%xmmA\n\t"
++ "# mulps %%xmmZ, %%xmmB\n\t"
++ "# # SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t"
++ "# xorps %%xmmPN, %%xmmA\n\t"
++ "# movaps %%xmmA, %%xmmZ\n\t"
++ "# unpcklps %%xmmB, %%xmmA\n\t"
++ "# unpckhps %%xmmB, %%xmmZ\n\t"
++ "# movaps %%xmmZ, %%xmmY\n\t"
++ "# shufps $0x44, %%xmmA, %%xmmZ # b01000100\n\t"
++ "# shufps $0xee, %%xmmY, %%xmmA # b11101110\n\t"
++ "# addps %%xmmZ, %%xmmA\n\t"
++ "# addps %%xmmA, %%xmmC\n\t"
++ "# A=xmm0, B=xmm2, Z=xmm4\n\t"
++ "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t"
++ " movaps 16(%%r9), %%xmm1\n\t"
++ " movaps %%xmm0, %%xmm4\n\t"
++ " mulps %%xmm2, %%xmm0\n\t"
++ " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t"
++ " movaps 16(%%r10), %%xmm3\n\t"
++ " movaps %%xmm1, %%xmm5\n\t"
++ " addps %%xmm0, %%xmm6\n\t"
++ " mulps %%xmm3, %%xmm1\n\t"
++ " shufps $0xb1, %%xmm5, %%xmm5 # swap internals\n\t"
++ " addps %%xmm1, %%xmm6\n\t"
++ " mulps %%xmm4, %%xmm2\n\t"
++ " movaps 32(%%r9), %%xmm0\n\t"
++ " addps %%xmm2, %%xmm7\n\t"
++ " mulps %%xmm5, %%xmm3\n\t"
++ " add $32, %%r9\n\t"
++ " movaps 32(%%r10), %%xmm2\n\t"
++ " addps %%xmm3, %%xmm7\n\t"
++ " add $32, %%r10\n\t"
++ ".%=L1_test:\n\t"
++ " dec %%rax\n\t"
++ " jge .%=Loop1\n\t"
++ " # We've handled the bulk of multiplies up to here.\n\t"
++ " # Let's sse if original n_2_ccomplex_blocks was odd.\n\t"
++ " # If so, we've got 2 more taps to do.\n\t"
++ " and $1, %%r8\n\t"
++ " je .%=Leven\n\t"
++ " # The count was odd, do 2 more taps.\n\t"
++ " # Note that we've already got mm0/mm2 preloaded\n\t"
++ " # from the main loop.\n\t"
++ " movaps %%xmm0, %%xmm4\n\t"
++ " mulps %%xmm2, %%xmm0\n\t"
++ " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t"
++ " addps %%xmm0, %%xmm6\n\t"
++ " mulps %%xmm4, %%xmm2\n\t"
++ " addps %%xmm2, %%xmm7\n\t"
++ ".%=Leven:\n\t"
++ " # neg inversor\n\t"
++ " xorps %%xmm1, %%xmm1\n\t"
++ " mov $0x80000000, %%r9\n\t"
++ " movd %%r9, %%xmm1\n\t"
++ " shufps $0x11, %%xmm1, %%xmm1 # b00010001 # 0 -0 0 -0\n\t"
++ " # pfpnacc\n\t"
++ " xorps %%xmm1, %%xmm6\n\t"
++ " movaps %%xmm6, %%xmm2\n\t"
++ " unpcklps %%xmm7, %%xmm6\n\t"
++ " unpckhps %%xmm7, %%xmm2\n\t"
++ " movaps %%xmm2, %%xmm3\n\t"
++ " shufps $0x44, %%xmm6, %%xmm2 # b01000100\n\t"
++ " shufps $0xee, %%xmm3, %%xmm6 # b11101110\n\t"
++ " addps %%xmm2, %%xmm6\n\t"
++ " # xmm6 = r1 i2 r3 i4\n\t"
++ " movhlps %%xmm6, %%xmm4 # xmm4 = r3 i4 ?? ??\n\t"
++ " addps %%xmm4, %%xmm6 # xmm6 = r1+r3 i2+i4 ?? ??\n\t"
++ " movlps %%xmm6, (%[rdi]) # store low 2x32 bits (complex) "
++ "to memory\n\t"
++ :
++ : [rsi] "r"(input), [rdx] "r"(taps), "c"(num_bytes), [rdi] "r"(result)
++ : "rax", "r8", "r9", "r10");
++
++
++ if (isodd) {
++ *result += input[num_points - 1] * taps[num_points - 1];
++ }
+
++ return;
+ }
+
+ #endif
+
+ #if LV_HAVE_SSE && LV_HAVE_32
+
+-static inline void volk_32fc_x2_dot_prod_32fc_a_sse_32(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
++static inline void volk_32fc_x2_dot_prod_32fc_a_sse_32(lv_32fc_t* result,
++ const lv_32fc_t* input,
++ const lv_32fc_t* taps,
++ unsigned int num_points)
++{
+
+- volk_32fc_x2_dot_prod_32fc_a_generic(result, input, taps, num_points);
++ volk_32fc_x2_dot_prod_32fc_a_generic(result, input, taps, num_points);
+
+ #if 0
+ const unsigned int num_bytes = num_points*8;
+@@ -792,57 +832,64 @@ static inline void volk_32fc_x2_dot_prod_32fc_a_sse_32(lv_32fc_t* result, const
+
+ #include <pmmintrin.h>
+
+-static inline void volk_32fc_x2_dot_prod_32fc_a_sse3(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
++static inline void volk_32fc_x2_dot_prod_32fc_a_sse3(lv_32fc_t* result,
++ const lv_32fc_t* input,
++ const lv_32fc_t* taps,
++ unsigned int num_points)
++{
+
+- const unsigned int num_bytes = num_points*8;
+- unsigned int isodd = num_points & 1;
++ const unsigned int num_bytes = num_points * 8;
++ unsigned int isodd = num_points & 1;
+
+- lv_32fc_t dotProduct;
+- memset(&dotProduct, 0x0, 2*sizeof(float));
++ lv_32fc_t dotProduct;
++ memset(&dotProduct, 0x0, 2 * sizeof(float));
+
+- unsigned int number = 0;
+- const unsigned int halfPoints = num_bytes >> 4;
++ unsigned int number = 0;
++ const unsigned int halfPoints = num_bytes >> 4;
+
+- __m128 x, y, yl, yh, z, tmp1, tmp2, dotProdVal;
++ __m128 x, y, yl, yh, z, tmp1, tmp2, dotProdVal;
+
+- const lv_32fc_t* a = input;
+- const lv_32fc_t* b = taps;
++ const lv_32fc_t* a = input;
++ const lv_32fc_t* b = taps;
+
+- dotProdVal = _mm_setzero_ps();
++ dotProdVal = _mm_setzero_ps();
+
+- for(;number < halfPoints; number++){
++ for (; number < halfPoints; number++) {
+
+- x = _mm_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
+- y = _mm_load_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di
++ x = _mm_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
++ y = _mm_load_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di
+
+- yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr
+- yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di
++ yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr
++ yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di
+
+- tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
++ tmp1 = _mm_mul_ps(x, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
+
+- x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br
++ x = _mm_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br
+
+- tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
++ tmp2 = _mm_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
+
+- z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
++ z = _mm_addsub_ps(tmp1,
++ tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
+
+- dotProdVal = _mm_add_ps(dotProdVal, z); // Add the complex multiplication results together
++ dotProdVal =
++ _mm_add_ps(dotProdVal, z); // Add the complex multiplication results together
+
+- a += 2;
+- b += 2;
+- }
++ a += 2;
++ b += 2;
++ }
+
+- __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector[2];
++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector[2];
+
+- _mm_store_ps((float*)dotProductVector,dotProdVal); // Store the results back into the dot product vector
++ _mm_store_ps((float*)dotProductVector,
++ dotProdVal); // Store the results back into the dot product vector
+
+- dotProduct += ( dotProductVector[0] + dotProductVector[1] );
++ dotProduct += (dotProductVector[0] + dotProductVector[1]);
+
+- if(isodd) {
+- dotProduct += input[num_points - 1] * taps[num_points - 1];
+- }
++ if (isodd) {
++ dotProduct += input[num_points - 1] * taps[num_points - 1];
++ }
+
+- *result = dotProduct;
++ *result = dotProduct;
+ }
+
+ #endif /*LV_HAVE_SSE3*/
+@@ -852,78 +899,82 @@ static inline void volk_32fc_x2_dot_prod_32fc_a_sse3(lv_32fc_t* result, const lv
+
+ #include <smmintrin.h>
+
+-static inline void volk_32fc_x2_dot_prod_32fc_a_sse4_1(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
++static inline void volk_32fc_x2_dot_prod_32fc_a_sse4_1(lv_32fc_t* result,
++ const lv_32fc_t* input,
++ const lv_32fc_t* taps,
++ unsigned int num_points)
++{
+
+- unsigned int i = 0;
+- const unsigned int qtr_points = num_points/4;
+- const unsigned int isodd = num_points & 3;
++ unsigned int i = 0;
++ const unsigned int qtr_points = num_points / 4;
++ const unsigned int isodd = num_points & 3;
+
+- __m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, real0, real1, im0, im1;
+- float *p_input, *p_taps;
+- __m64 *p_result;
++ __m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, real0, real1, im0, im1;
++ float *p_input, *p_taps;
++ __m64* p_result;
+
+- static const __m128i neg = {0x000000000000000080000000};
++ static const __m128i neg = { 0x000000000000000080000000 };
+
+- p_result = (__m64*)result;
+- p_input = (float*)input;
+- p_taps = (float*)taps;
++ p_result = (__m64*)result;
++ p_input = (float*)input;
++ p_taps = (float*)taps;
+
+- real0 = _mm_setzero_ps();
+- real1 = _mm_setzero_ps();
+- im0 = _mm_setzero_ps();
+- im1 = _mm_setzero_ps();
++ real0 = _mm_setzero_ps();
++ real1 = _mm_setzero_ps();
++ im0 = _mm_setzero_ps();
++ im1 = _mm_setzero_ps();
+
+- for(; i < qtr_points; ++i) {
+- xmm0 = _mm_load_ps(p_input);
+- xmm1 = _mm_load_ps(p_taps);
++ for (; i < qtr_points; ++i) {
++ xmm0 = _mm_load_ps(p_input);
++ xmm1 = _mm_load_ps(p_taps);
+
+- p_input += 4;
+- p_taps += 4;
++ p_input += 4;
++ p_taps += 4;
+
+- xmm2 = _mm_load_ps(p_input);
+- xmm3 = _mm_load_ps(p_taps);
++ xmm2 = _mm_load_ps(p_input);
++ xmm3 = _mm_load_ps(p_taps);
+
+- p_input += 4;
+- p_taps += 4;
++ p_input += 4;
++ p_taps += 4;
+
+- xmm4 = _mm_unpackhi_ps(xmm0, xmm2);
+- xmm5 = _mm_unpackhi_ps(xmm1, xmm3);
+- xmm0 = _mm_unpacklo_ps(xmm0, xmm2);
+- xmm2 = _mm_unpacklo_ps(xmm1, xmm3);
++ xmm4 = _mm_unpackhi_ps(xmm0, xmm2);
++ xmm5 = _mm_unpackhi_ps(xmm1, xmm3);
++ xmm0 = _mm_unpacklo_ps(xmm0, xmm2);
++ xmm2 = _mm_unpacklo_ps(xmm1, xmm3);
+
+- //imaginary vector from input
+- xmm1 = _mm_unpackhi_ps(xmm0, xmm4);
+- //real vector from input
+- xmm3 = _mm_unpacklo_ps(xmm0, xmm4);
+- //imaginary vector from taps
+- xmm0 = _mm_unpackhi_ps(xmm2, xmm5);
+- //real vector from taps
+- xmm2 = _mm_unpacklo_ps(xmm2, xmm5);
++ // imaginary vector from input
++ xmm1 = _mm_unpackhi_ps(xmm0, xmm4);
++ // real vector from input
++ xmm3 = _mm_unpacklo_ps(xmm0, xmm4);
++ // imaginary vector from taps
++ xmm0 = _mm_unpackhi_ps(xmm2, xmm5);
++ // real vector from taps
++ xmm2 = _mm_unpacklo_ps(xmm2, xmm5);
+
+- xmm4 = _mm_dp_ps(xmm3, xmm2, 0xf1);
+- xmm5 = _mm_dp_ps(xmm1, xmm0, 0xf1);
++ xmm4 = _mm_dp_ps(xmm3, xmm2, 0xf1);
++ xmm5 = _mm_dp_ps(xmm1, xmm0, 0xf1);
+
+- xmm6 = _mm_dp_ps(xmm3, xmm0, 0xf2);
+- xmm7 = _mm_dp_ps(xmm1, xmm2, 0xf2);
++ xmm6 = _mm_dp_ps(xmm3, xmm0, 0xf2);
++ xmm7 = _mm_dp_ps(xmm1, xmm2, 0xf2);
+
+- real0 = _mm_add_ps(xmm4, real0);
+- real1 = _mm_add_ps(xmm5, real1);
+- im0 = _mm_add_ps(xmm6, im0);
+- im1 = _mm_add_ps(xmm7, im1);
+- }
++ real0 = _mm_add_ps(xmm4, real0);
++ real1 = _mm_add_ps(xmm5, real1);
++ im0 = _mm_add_ps(xmm6, im0);
++ im1 = _mm_add_ps(xmm7, im1);
++ }
+
+- real1 = _mm_xor_ps(real1, bit128_p(&neg)->float_vec);
++ real1 = _mm_xor_ps(real1, bit128_p(&neg)->float_vec);
+
+- im0 = _mm_add_ps(im0, im1);
+- real0 = _mm_add_ps(real0, real1);
++ im0 = _mm_add_ps(im0, im1);
++ real0 = _mm_add_ps(real0, real1);
+
+- im0 = _mm_add_ps(im0, real0);
++ im0 = _mm_add_ps(im0, real0);
+
+- _mm_storel_pi(p_result, im0);
++ _mm_storel_pi(p_result, im0);
+
+- for(i = num_points-isodd; i < num_points; i++) {
+- *result += input[i] * taps[i];
+- }
++ for (i = num_points - isodd; i < num_points; i++) {
++ *result += input[i] * taps[i];
++ }
+ }
+
+ #endif /*LV_HAVE_SSE4_1*/
+@@ -931,13 +982,17 @@ static inline void volk_32fc_x2_dot_prod_32fc_a_sse4_1(lv_32fc_t* result, const
+ #ifdef LV_HAVE_NEON
+ #include <arm_neon.h>
+
+-static inline void volk_32fc_x2_dot_prod_32fc_neon(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
++static inline void volk_32fc_x2_dot_prod_32fc_neon(lv_32fc_t* result,
++ const lv_32fc_t* input,
++ const lv_32fc_t* taps,
++ unsigned int num_points)
++{
+
+ unsigned int quarter_points = num_points / 4;
+ unsigned int number;
+
+- lv_32fc_t* a_ptr = (lv_32fc_t*) taps;
+- lv_32fc_t* b_ptr = (lv_32fc_t*) input;
++ lv_32fc_t* a_ptr = (lv_32fc_t*)taps;
++ lv_32fc_t* b_ptr = (lv_32fc_t*)input;
+ // for 2-lane vectors, 1st lane holds the real part,
+ // 2nd lane holds the imaginary part
+ float32x4x2_t a_val, b_val, c_val, accumulator;
+@@ -945,11 +1000,11 @@ static inline void volk_32fc_x2_dot_prod_32fc_neon(lv_32fc_t* result, const lv_3
+ accumulator.val[0] = vdupq_n_f32(0);
+ accumulator.val[1] = vdupq_n_f32(0);
+
+- for(number = 0; number < quarter_points; ++number) {
++ for (number = 0; number < quarter_points; ++number) {
+ a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
+ b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
+- __VOLK_PREFETCH(a_ptr+8);
+- __VOLK_PREFETCH(b_ptr+8);
++ __VOLK_PREFETCH(a_ptr + 8);
++ __VOLK_PREFETCH(b_ptr + 8);
+
+ // multiply the real*real and imag*imag to get real result
+ // a0r*b0r|a1r*b1r|a2r*b2r|a3r*b3r
+@@ -977,22 +1032,25 @@ static inline void volk_32fc_x2_dot_prod_32fc_neon(lv_32fc_t* result, const lv_3
+ *result = accum_result[0] + accum_result[1] + accum_result[2] + accum_result[3];
+
+ // tail case
+- for(number = quarter_points*4; number < num_points; ++number) {
++ for (number = quarter_points * 4; number < num_points; ++number) {
+ *result += (*a_ptr++) * (*b_ptr++);
+ }
+-
+ }
+ #endif /*LV_HAVE_NEON*/
+
+ #ifdef LV_HAVE_NEON
+ #include <arm_neon.h>
+-static inline void volk_32fc_x2_dot_prod_32fc_neon_opttests(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
++static inline void volk_32fc_x2_dot_prod_32fc_neon_opttests(lv_32fc_t* result,
++ const lv_32fc_t* input,
++ const lv_32fc_t* taps,
++ unsigned int num_points)
++{
+
+ unsigned int quarter_points = num_points / 4;
+ unsigned int number;
+
+- lv_32fc_t* a_ptr = (lv_32fc_t*) taps;
+- lv_32fc_t* b_ptr = (lv_32fc_t*) input;
++ lv_32fc_t* a_ptr = (lv_32fc_t*)taps;
++ lv_32fc_t* b_ptr = (lv_32fc_t*)input;
+ // for 2-lane vectors, 1st lane holds the real part,
+ // 2nd lane holds the imaginary part
+ float32x4x2_t a_val, b_val, accumulator;
+@@ -1000,11 +1058,11 @@ static inline void volk_32fc_x2_dot_prod_32fc_neon_opttests(lv_32fc_t* result, c
+ accumulator.val[0] = vdupq_n_f32(0);
+ accumulator.val[1] = vdupq_n_f32(0);
+
+- for(number = 0; number < quarter_points; ++number) {
++ for (number = 0; number < quarter_points; ++number) {
+ a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
+ b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
+- __VOLK_PREFETCH(a_ptr+8);
+- __VOLK_PREFETCH(b_ptr+8);
++ __VOLK_PREFETCH(a_ptr + 8);
++ __VOLK_PREFETCH(b_ptr + 8);
+
+ // do the first multiply
+ tmp_imag.val[1] = vmulq_f32(a_val.val[1], b_val.val[0]);
+@@ -1026,21 +1084,24 @@ static inline void volk_32fc_x2_dot_prod_32fc_neon_opttests(lv_32fc_t* result, c
+ *result = accum_result[0] + accum_result[1] + accum_result[2] + accum_result[3];
+
+ // tail case
+- for(number = quarter_points*4; number < num_points; ++number) {
++ for (number = quarter_points * 4; number < num_points; ++number) {
+ *result += (*a_ptr++) * (*b_ptr++);
+ }
+-
+ }
+ #endif /*LV_HAVE_NEON*/
+
+ #ifdef LV_HAVE_NEON
+-static inline void volk_32fc_x2_dot_prod_32fc_neon_optfma(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
++static inline void volk_32fc_x2_dot_prod_32fc_neon_optfma(lv_32fc_t* result,
++ const lv_32fc_t* input,
++ const lv_32fc_t* taps,
++ unsigned int num_points)
++{
+
+ unsigned int quarter_points = num_points / 4;
+ unsigned int number;
+
+- lv_32fc_t* a_ptr = (lv_32fc_t*) taps;
+- lv_32fc_t* b_ptr = (lv_32fc_t*) input;
++ lv_32fc_t* a_ptr = (lv_32fc_t*)taps;
++ lv_32fc_t* b_ptr = (lv_32fc_t*)input;
+ // for 2-lane vectors, 1st lane holds the real part,
+ // 2nd lane holds the imaginary part
+ float32x4x2_t a_val, b_val, accumulator1, accumulator2;
+@@ -1049,11 +1110,11 @@ static inline void volk_32fc_x2_dot_prod_32fc_neon_optfma(lv_32fc_t* result, con
+ accumulator2.val[0] = vdupq_n_f32(0);
+ accumulator2.val[1] = vdupq_n_f32(0);
+
+- for(number = 0; number < quarter_points; ++number) {
++ for (number = 0; number < quarter_points; ++number) {
+ a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
+ b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
+- __VOLK_PREFETCH(a_ptr+8);
+- __VOLK_PREFETCH(b_ptr+8);
++ __VOLK_PREFETCH(a_ptr + 8);
++ __VOLK_PREFETCH(b_ptr + 8);
+
+ // use 2 accumulators to remove inter-instruction data dependencies
+ accumulator1.val[0] = vmlaq_f32(accumulator1.val[0], a_val.val[0], b_val.val[0]);
+@@ -1071,22 +1132,26 @@ static inline void volk_32fc_x2_dot_prod_32fc_neon_optfma(lv_32fc_t* result, con
+ *result = accum_result[0] + accum_result[1] + accum_result[2] + accum_result[3];
+
+ // tail case
+- for(number = quarter_points*4; number < num_points; ++number) {
++ for (number = quarter_points * 4; number < num_points; ++number) {
+ *result += (*a_ptr++) * (*b_ptr++);
+ }
+-
+ }
+ #endif /*LV_HAVE_NEON*/
+
+ #ifdef LV_HAVE_NEON
+-static inline void volk_32fc_x2_dot_prod_32fc_neon_optfmaunroll(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
+-// NOTE: GCC does a poor job with this kernel, but the equivalent ASM code is very fast
++static inline void volk_32fc_x2_dot_prod_32fc_neon_optfmaunroll(lv_32fc_t* result,
++ const lv_32fc_t* input,
++ const lv_32fc_t* taps,
++ unsigned int num_points)
++{
++ // NOTE: GCC does a poor job with this kernel, but the equivalent ASM code is very
++ // fast
+
+ unsigned int quarter_points = num_points / 8;
+ unsigned int number;
+
+- lv_32fc_t* a_ptr = (lv_32fc_t*) taps;
+- lv_32fc_t* b_ptr = (lv_32fc_t*) input;
++ lv_32fc_t* a_ptr = (lv_32fc_t*)taps;
++ lv_32fc_t* b_ptr = (lv_32fc_t*)input;
+ // for 2-lane vectors, 1st lane holds the real part,
+ // 2nd lane holds the imaginary part
+ float32x4x4_t a_val, b_val, accumulator1, accumulator2;
+@@ -1101,11 +1166,11 @@ static inline void volk_32fc_x2_dot_prod_32fc_neon_optfmaunroll(lv_32fc_t* resul
+ accumulator2.val[3] = vdupq_n_f32(0);
+
+ // 8 input regs, 8 accumulators -> 16/16 neon regs are used
+- for(number = 0; number < quarter_points; ++number) {
++ for (number = 0; number < quarter_points; ++number) {
+ a_val = vld4q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
+ b_val = vld4q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
+- __VOLK_PREFETCH(a_ptr+8);
+- __VOLK_PREFETCH(b_ptr+8);
++ __VOLK_PREFETCH(a_ptr + 8);
++ __VOLK_PREFETCH(b_ptr + 8);
+
+ // use 2 accumulators to remove inter-instruction data dependencies
+ accumulator1.val[0] = vmlaq_f32(accumulator1.val[0], a_val.val[0], b_val.val[0]);
+@@ -1136,10 +1201,9 @@ static inline void volk_32fc_x2_dot_prod_32fc_neon_optfmaunroll(lv_32fc_t* resul
+ *result = accum_result[0] + accum_result[1] + accum_result[2] + accum_result[3];
+
+ // tail case
+- for(number = quarter_points*8; number < num_points; ++number) {
++ for (number = quarter_points * 8; number < num_points; ++number) {
+ *result += (*a_ptr++) * (*b_ptr++);
+ }
+-
+ }
+ #endif /*LV_HAVE_NEON*/
+
+@@ -1148,56 +1212,64 @@ static inline void volk_32fc_x2_dot_prod_32fc_neon_optfmaunroll(lv_32fc_t* resul
+
+ #include <immintrin.h>
+
+-static inline void volk_32fc_x2_dot_prod_32fc_a_avx(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
++static inline void volk_32fc_x2_dot_prod_32fc_a_avx(lv_32fc_t* result,
++ const lv_32fc_t* input,
++ const lv_32fc_t* taps,
++ unsigned int num_points)
++{
+
+- unsigned int isodd = num_points & 3;
+- unsigned int i = 0;
+- lv_32fc_t dotProduct;
+- memset(&dotProduct, 0x0, 2*sizeof(float));
++ unsigned int isodd = num_points & 3;
++ unsigned int i = 0;
++ lv_32fc_t dotProduct;
++ memset(&dotProduct, 0x0, 2 * sizeof(float));
+
+- unsigned int number = 0;
+- const unsigned int quarterPoints = num_points / 4;
++ unsigned int number = 0;
++ const unsigned int quarterPoints = num_points / 4;
+
+- __m256 x, y, yl, yh, z, tmp1, tmp2, dotProdVal;
++ __m256 x, y, yl, yh, z, tmp1, tmp2, dotProdVal;
+
+- const lv_32fc_t* a = input;
+- const lv_32fc_t* b = taps;
++ const lv_32fc_t* a = input;
++ const lv_32fc_t* b = taps;
+
+- dotProdVal = _mm256_setzero_ps();
++ dotProdVal = _mm256_setzero_ps();
+
+- for(;number < quarterPoints; number++){
++ for (; number < quarterPoints; number++) {
+
+- x = _mm256_load_ps((float*)a); // Load a,b,e,f as ar,ai,br,bi,er,ei,fr,fi
+- y = _mm256_load_ps((float*)b); // Load c,d,g,h as cr,ci,dr,di,gr,gi,hr,hi
++ x = _mm256_load_ps((float*)a); // Load a,b,e,f as ar,ai,br,bi,er,ei,fr,fi
++ y = _mm256_load_ps((float*)b); // Load c,d,g,h as cr,ci,dr,di,gr,gi,hr,hi
+
+- yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr,gr,gr,hr,hr
+- yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di,gi,gi,hi,hi
++ yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr,gr,gr,hr,hr
++ yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di,gi,gi,hi,hi
+
+- tmp1 = _mm256_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr ...
++ tmp1 = _mm256_mul_ps(x, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr ...
+
+- x = _mm256_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br,ei,er,fi,fr
++ x = _mm256_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br,ei,er,fi,fr
+
+- tmp2 = _mm256_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di ...
++ tmp2 = _mm256_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di ...
+
+- z = _mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
++ z = _mm256_addsub_ps(tmp1,
++ tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
+
+- dotProdVal = _mm256_add_ps(dotProdVal, z); // Add the complex multiplication results together
++ dotProdVal = _mm256_add_ps(dotProdVal,
++ z); // Add the complex multiplication results together
+
+- a += 4;
+- b += 4;
+- }
++ a += 4;
++ b += 4;
++ }
+
+- __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector[4];
++ __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector[4];
+
+- _mm256_store_ps((float*)dotProductVector,dotProdVal); // Store the results back into the dot product vector
++ _mm256_store_ps((float*)dotProductVector,
++ dotProdVal); // Store the results back into the dot product vector
+
+- dotProduct += ( dotProductVector[0] + dotProductVector[1] + dotProductVector[2] + dotProductVector[3]);
++ dotProduct += (dotProductVector[0] + dotProductVector[1] + dotProductVector[2] +
++ dotProductVector[3]);
+
+- for(i = num_points-isodd; i < num_points; i++) {
+- dotProduct += input[i] * taps[i];
+- }
++ for (i = num_points - isodd; i < num_points; i++) {
++ dotProduct += input[i] * taps[i];
++ }
+
+- *result = dotProduct;
++ *result = dotProduct;
+ }
+
+ #endif /*LV_HAVE_AVX*/
+@@ -1205,56 +1277,64 @@ static inline void volk_32fc_x2_dot_prod_32fc_a_avx(lv_32fc_t* result, const lv_
+ #if LV_HAVE_AVX && LV_HAVE_FMA
+ #include <immintrin.h>
+
+-static inline void volk_32fc_x2_dot_prod_32fc_a_avx_fma(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
++static inline void volk_32fc_x2_dot_prod_32fc_a_avx_fma(lv_32fc_t* result,
++ const lv_32fc_t* input,
++ const lv_32fc_t* taps,
++ unsigned int num_points)
++{
+
+- unsigned int isodd = num_points & 3;
+- unsigned int i = 0;
+- lv_32fc_t dotProduct;
+- memset(&dotProduct, 0x0, 2*sizeof(float));
++ unsigned int isodd = num_points & 3;
++ unsigned int i = 0;
++ lv_32fc_t dotProduct;
++ memset(&dotProduct, 0x0, 2 * sizeof(float));
+
+- unsigned int number = 0;
+- const unsigned int quarterPoints = num_points / 4;
++ unsigned int number = 0;
++ const unsigned int quarterPoints = num_points / 4;
+
+- __m256 x, y, yl, yh, z, tmp1, tmp2, dotProdVal;
++ __m256 x, y, yl, yh, z, tmp1, tmp2, dotProdVal;
+
+- const lv_32fc_t* a = input;
+- const lv_32fc_t* b = taps;
++ const lv_32fc_t* a = input;
++ const lv_32fc_t* b = taps;
+
+- dotProdVal = _mm256_setzero_ps();
++ dotProdVal = _mm256_setzero_ps();
+
+- for(;number < quarterPoints; number++){
++ for (; number < quarterPoints; number++) {
+
+- x = _mm256_load_ps((float*)a); // Load a,b,e,f as ar,ai,br,bi,er,ei,fr,fi
+- y = _mm256_load_ps((float*)b); // Load c,d,g,h as cr,ci,dr,di,gr,gi,hr,hi
++ x = _mm256_load_ps((float*)a); // Load a,b,e,f as ar,ai,br,bi,er,ei,fr,fi
++ y = _mm256_load_ps((float*)b); // Load c,d,g,h as cr,ci,dr,di,gr,gi,hr,hi
+
+- yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr,gr,gr,hr,hr
+- yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di,gi,gi,hi,hi
++ yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr,gr,gr,hr,hr
++ yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di,gi,gi,hi,hi
+
+- tmp1 = x;
++ tmp1 = x;
+
+- x = _mm256_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br,ei,er,fi,fr
++ x = _mm256_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br,ei,er,fi,fr
+
+- tmp2 = _mm256_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di ...
++ tmp2 = _mm256_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di ...
+
+- z = _mm256_fmaddsub_ps(tmp1, yl,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
++ z = _mm256_fmaddsub_ps(
++ tmp1, yl, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
+
+- dotProdVal = _mm256_add_ps(dotProdVal, z); // Add the complex multiplication results together
++ dotProdVal = _mm256_add_ps(dotProdVal,
++ z); // Add the complex multiplication results together
+
+- a += 4;
+- b += 4;
+- }
++ a += 4;
++ b += 4;
++ }
+
+- __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector[4];
++ __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector[4];
+
+- _mm256_store_ps((float*)dotProductVector,dotProdVal); // Store the results back into the dot product vector
++ _mm256_store_ps((float*)dotProductVector,
++ dotProdVal); // Store the results back into the dot product vector
+
+- dotProduct += ( dotProductVector[0] + dotProductVector[1] + dotProductVector[2] + dotProductVector[3]);
++ dotProduct += (dotProductVector[0] + dotProductVector[1] + dotProductVector[2] +
++ dotProductVector[3]);
+
+- for(i = num_points-isodd; i < num_points; i++) {
+- dotProduct += input[i] * taps[i];
+- }
++ for (i = num_points - isodd; i < num_points; i++) {
++ dotProduct += input[i] * taps[i];
++ }
+
+- *result = dotProduct;
++ *result = dotProduct;
+ }
+
+ #endif /*LV_HAVE_AVX && LV_HAVE_FMA*/
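Note on the kernels in this file: the SSE3/AVX dot-product variants reformatted above all decompose each complex multiply the same way — duplicate the real parts of one operand (moveldup), duplicate its imaginary parts (movehdup), swap the real/imaginary lanes of the other operand (shuffle/permute with 0xB1), and combine the two partial products with addsub (or fold the first multiply into fmaddsub in the FMA variants). A minimal per-lane scalar sketch, assuming C99 float complex (the C-side definition of lv_32fc_t):

#include <complex.h>

/* Per-lane sketch of the moveldup/movehdup/addsub pattern: a is one input
 * sample, b the corresponding tap; the kernels above evaluate two or four of
 * these per vector register and accumulate the results. */
static inline float complex complexmul_ref(float complex a, float complex b)
{
    const float ar = crealf(a), ai = cimagf(a);
    const float br = crealf(b), bi = cimagf(b);

    /* tmp1 = x * moveldup(y)              -> (ar*br, ai*br)                 */
    const float tmp1_re = ar * br, tmp1_im = ai * br;
    /* tmp2 = shuffle(x, 0xB1) * movehdup(y) -> (ai*bi, ar*bi)               */
    const float tmp2_re = ai * bi, tmp2_im = ar * bi;
    /* addsub: subtract in the even (real) lane, add in the odd (imag) lane  */
    return (tmp1_re - tmp2_re) + (tmp1_im + tmp2_im) * I;
}

The FMA kernels compute the same quantity with one fewer multiply by letting fmaddsub produce tmp1*yl -/+ tmp2 directly.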
+diff --git a/kernels/volk/volk_32fc_x2_multiply_32fc.h b/kernels/volk/volk_32fc_x2_multiply_32fc.h
+index 6bf428b..6cb6907 100644
+--- a/kernels/volk/volk_32fc_x2_multiply_32fc.h
++++ b/kernels/volk/volk_32fc_x2_multiply_32fc.h
+@@ -29,8 +29,8 @@
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+- * void volk_32fc_x2_multiply_32fc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points);
+- * \endcode
++ * void volk_32fc_x2_multiply_32fc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const
++ * lv_32fc_t* bVector, unsigned int num_points); \endcode
+ *
+ * \b Inputs
+ * \li aVector: The first input vector of complex floats.
+@@ -70,55 +70,62 @@
+ #ifndef INCLUDED_volk_32fc_x2_multiply_32fc_u_H
+ #define INCLUDED_volk_32fc_x2_multiply_32fc_u_H
+
++#include <float.h>
+ #include <inttypes.h>
+ #include <stdio.h>
+ #include <volk/volk_complex.h>
+-#include <float.h>
+
+ #if LV_HAVE_AVX2 && LV_HAVE_FMA
+ #include <immintrin.h>
+ /*!
+- \brief Multiplies the two input complex vectors and stores their results in the third vector
+- \param cVector The vector where the results will be stored
+- \param aVector One of the vectors to be multiplied
+- \param bVector One of the vectors to be multiplied
+- \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
++ \brief Multiplies the two input complex vectors and stores their results in the third
++ vector \param cVector The vector where the results will be stored \param aVector One of
++ the vectors to be multiplied \param bVector One of the vectors to be multiplied \param
++ num_points The number of complex values in aVector and bVector to be multiplied together
++ and stored into cVector
+ */
+-static inline void volk_32fc_x2_multiply_32fc_u_avx2_fma(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){
+- unsigned int number = 0;
+- const unsigned int quarterPoints = num_points / 4;
++static inline void volk_32fc_x2_multiply_32fc_u_avx2_fma(lv_32fc_t* cVector,
++ const lv_32fc_t* aVector,
++ const lv_32fc_t* bVector,
++ unsigned int num_points)
++{
++ unsigned int number = 0;
++ const unsigned int quarterPoints = num_points / 4;
+
+- lv_32fc_t* c = cVector;
+- const lv_32fc_t* a = aVector;
+- const lv_32fc_t* b = bVector;
++ lv_32fc_t* c = cVector;
++ const lv_32fc_t* a = aVector;
++ const lv_32fc_t* b = bVector;
+
+- for(;number < quarterPoints; number++){
++ for (; number < quarterPoints; number++) {
+
+- const __m256 x = _mm256_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
+- const __m256 y = _mm256_loadu_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di
++ const __m256 x =
++ _mm256_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
++ const __m256 y =
++ _mm256_loadu_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di
+
+- const __m256 yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr
+- const __m256 yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di
++ const __m256 yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr
++ const __m256 yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di
+
+- const __m256 tmp2x = _mm256_permute_ps(x,0xB1); // Re-arrange x to be ai,ar,bi,br
++ const __m256 tmp2x = _mm256_permute_ps(x, 0xB1); // Re-arrange x to be ai,ar,bi,br
+
+- const __m256 tmp2 = _mm256_mul_ps(tmp2x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
++ const __m256 tmp2 = _mm256_mul_ps(tmp2x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
+
+- const __m256 z = _mm256_fmaddsub_ps(x, yl, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
++ const __m256 z = _mm256_fmaddsub_ps(
++ x, yl, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
+
+- _mm256_storeu_ps((float*)c,z); // Store the results back into the C container
++ _mm256_storeu_ps((float*)c, z); // Store the results back into the C container
+
+- a += 4;
+- b += 4;
+- c += 4;
+- }
++ a += 4;
++ b += 4;
++ c += 4;
++ }
+
+- _mm256_zeroupper();
++ _mm256_zeroupper();
+
+- number = quarterPoints * 4;
+- for(;number < num_points; number++){
+- *c++ = (*a++) * (*b++);
+- }
++ number = quarterPoints * 4;
++ for (; number < num_points; number++) {
++ *c++ = (*a++) * (*b++);
++ }
+ }
+ #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA */
+
+@@ -127,34 +134,37 @@ static inline void volk_32fc_x2_multiply_32fc_u_avx2_fma(lv_32fc_t* cVector, con
+ #include <immintrin.h>
+ #include <volk/volk_avx_intrinsics.h>
+
+-static inline void
+-volk_32fc_x2_multiply_32fc_u_avx(lv_32fc_t* cVector, const lv_32fc_t* aVector,
+- const lv_32fc_t* bVector, unsigned int num_points)
++static inline void volk_32fc_x2_multiply_32fc_u_avx(lv_32fc_t* cVector,
++ const lv_32fc_t* aVector,
++ const lv_32fc_t* bVector,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+- const unsigned int quarterPoints = num_points / 4;
+-
+- __m256 x, y, z;
+- lv_32fc_t* c = cVector;
+- const lv_32fc_t* a = aVector;
+- const lv_32fc_t* b = bVector;
+-
+- for(; number < quarterPoints; number++){
+- x = _mm256_loadu_ps((float*) a); // Load the ar + ai, br + bi ... as ar,ai,br,bi ...
+- y = _mm256_loadu_ps((float*) b); // Load the cr + ci, dr + di ... as cr,ci,dr,di ...
+- z = _mm256_complexmul_ps(x, y);
+- _mm256_storeu_ps((float*) c, z); // Store the results back into the C container
+-
+- a += 4;
+- b += 4;
+- c += 4;
+- }
+-
+- number = quarterPoints * 4;
+-
+- for(; number < num_points; number++){
+- *c++ = (*a++) * (*b++);
+- }
++ unsigned int number = 0;
++ const unsigned int quarterPoints = num_points / 4;
++
++ __m256 x, y, z;
++ lv_32fc_t* c = cVector;
++ const lv_32fc_t* a = aVector;
++ const lv_32fc_t* b = bVector;
++
++ for (; number < quarterPoints; number++) {
++ x = _mm256_loadu_ps(
++ (float*)a); // Load the ar + ai, br + bi ... as ar,ai,br,bi ...
++ y = _mm256_loadu_ps(
++ (float*)b); // Load the cr + ci, dr + di ... as cr,ci,dr,di ...
++ z = _mm256_complexmul_ps(x, y);
++ _mm256_storeu_ps((float*)c, z); // Store the results back into the C container
++
++ a += 4;
++ b += 4;
++ c += 4;
++ }
++
++ number = quarterPoints * 4;
++
++ for (; number < num_points; number++) {
++ *c++ = (*a++) * (*b++);
++ }
+ }
+ #endif /* LV_HAVE_AVX */
+
+@@ -163,50 +173,52 @@ volk_32fc_x2_multiply_32fc_u_avx(lv_32fc_t* cVector, const lv_32fc_t* aVector,
+ #include <pmmintrin.h>
+ #include <volk/volk_sse3_intrinsics.h>
+
+-static inline void
+-volk_32fc_x2_multiply_32fc_u_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector,
+- const lv_32fc_t* bVector, unsigned int num_points)
++static inline void volk_32fc_x2_multiply_32fc_u_sse3(lv_32fc_t* cVector,
++ const lv_32fc_t* aVector,
++ const lv_32fc_t* bVector,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+- const unsigned int halfPoints = num_points / 2;
+-
+- __m128 x, y, z;
+- lv_32fc_t* c = cVector;
+- const lv_32fc_t* a = aVector;
+- const lv_32fc_t* b = bVector;
+-
+- for(; number < halfPoints; number++){
+- x = _mm_loadu_ps((float*) a); // Load the ar + ai, br + bi as ar,ai,br,bi
+- y = _mm_loadu_ps((float*) b); // Load the cr + ci, dr + di as cr,ci,dr,di
+- z = _mm_complexmul_ps(x, y);
+- _mm_storeu_ps((float*) c, z); // Store the results back into the C container
+-
+- a += 2;
+- b += 2;
+- c += 2;
+- }
+-
+- if((num_points % 2) != 0){
+- *c = (*a) * (*b);
+- }
++ unsigned int number = 0;
++ const unsigned int halfPoints = num_points / 2;
++
++ __m128 x, y, z;
++ lv_32fc_t* c = cVector;
++ const lv_32fc_t* a = aVector;
++ const lv_32fc_t* b = bVector;
++
++ for (; number < halfPoints; number++) {
++ x = _mm_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
++ y = _mm_loadu_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di
++ z = _mm_complexmul_ps(x, y);
++ _mm_storeu_ps((float*)c, z); // Store the results back into the C container
++
++ a += 2;
++ b += 2;
++ c += 2;
++ }
++
++ if ((num_points % 2) != 0) {
++ *c = (*a) * (*b);
++ }
+ }
+ #endif /* LV_HAVE_SSE */
+
+
+ #ifdef LV_HAVE_GENERIC
+
+-static inline void
+-volk_32fc_x2_multiply_32fc_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector,
+- const lv_32fc_t* bVector, unsigned int num_points)
++static inline void volk_32fc_x2_multiply_32fc_generic(lv_32fc_t* cVector,
++ const lv_32fc_t* aVector,
++ const lv_32fc_t* bVector,
++ unsigned int num_points)
+ {
+- lv_32fc_t* cPtr = cVector;
+- const lv_32fc_t* aPtr = aVector;
+- const lv_32fc_t* bPtr= bVector;
+- unsigned int number = 0;
+-
+- for(number = 0; number < num_points; number++){
+- *cPtr++ = (*aPtr++) * (*bPtr++);
+- }
++ lv_32fc_t* cPtr = cVector;
++ const lv_32fc_t* aPtr = aVector;
++ const lv_32fc_t* bPtr = bVector;
++ unsigned int number = 0;
++
++ for (number = 0; number < num_points; number++) {
++ *cPtr++ = (*aPtr++) * (*bPtr++);
++ }
+ }
+ #endif /* LV_HAVE_GENERIC */
+
+@@ -215,55 +227,62 @@ volk_32fc_x2_multiply_32fc_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector,
+ #ifndef INCLUDED_volk_32fc_x2_multiply_32fc_a_H
+ #define INCLUDED_volk_32fc_x2_multiply_32fc_a_H
+
++#include <float.h>
+ #include <inttypes.h>
+ #include <stdio.h>
+ #include <volk/volk_complex.h>
+-#include <float.h>
+
+ #if LV_HAVE_AVX2 && LV_HAVE_FMA
+ #include <immintrin.h>
+ /*!
+- \brief Multiplies the two input complex vectors and stores their results in the third vector
+- \param cVector The vector where the results will be stored
+- \param aVector One of the vectors to be multiplied
+- \param bVector One of the vectors to be multiplied
+- \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
++ \brief Multiplies the two input complex vectors and stores their results in the third
++ vector \param cVector The vector where the results will be stored \param aVector One of
++ the vectors to be multiplied \param bVector One of the vectors to be multiplied \param
++ num_points The number of complex values in aVector and bVector to be multiplied together
++ and stored into cVector
+ */
+-static inline void volk_32fc_x2_multiply_32fc_a_avx2_fma(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){
+- unsigned int number = 0;
+- const unsigned int quarterPoints = num_points / 4;
++static inline void volk_32fc_x2_multiply_32fc_a_avx2_fma(lv_32fc_t* cVector,
++ const lv_32fc_t* aVector,
++ const lv_32fc_t* bVector,
++ unsigned int num_points)
++{
++ unsigned int number = 0;
++ const unsigned int quarterPoints = num_points / 4;
+
+- lv_32fc_t* c = cVector;
+- const lv_32fc_t* a = aVector;
+- const lv_32fc_t* b = bVector;
++ lv_32fc_t* c = cVector;
++ const lv_32fc_t* a = aVector;
++ const lv_32fc_t* b = bVector;
+
+- for(;number < quarterPoints; number++){
++ for (; number < quarterPoints; number++) {
+
+- const __m256 x = _mm256_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
+- const __m256 y = _mm256_load_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di
++ const __m256 x =
++ _mm256_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
++ const __m256 y =
++ _mm256_load_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di
+
+- const __m256 yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr
+- const __m256 yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di
++ const __m256 yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr
++ const __m256 yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di
+
+- const __m256 tmp2x = _mm256_permute_ps(x,0xB1); // Re-arrange x to be ai,ar,bi,br
++ const __m256 tmp2x = _mm256_permute_ps(x, 0xB1); // Re-arrange x to be ai,ar,bi,br
+
+- const __m256 tmp2 = _mm256_mul_ps(tmp2x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
++ const __m256 tmp2 = _mm256_mul_ps(tmp2x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
+
+- const __m256 z = _mm256_fmaddsub_ps(x, yl, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
++ const __m256 z = _mm256_fmaddsub_ps(
++ x, yl, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
+
+- _mm256_store_ps((float*)c,z); // Store the results back into the C container
++ _mm256_store_ps((float*)c, z); // Store the results back into the C container
+
+- a += 4;
+- b += 4;
+- c += 4;
+- }
++ a += 4;
++ b += 4;
++ c += 4;
++ }
+
+- _mm256_zeroupper();
++ _mm256_zeroupper();
+
+- number = quarterPoints * 4;
+- for(;number < num_points; number++){
+- *c++ = (*a++) * (*b++);
+- }
++ number = quarterPoints * 4;
++ for (; number < num_points; number++) {
++ *c++ = (*a++) * (*b++);
++ }
+ }
+ #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA */
+
+@@ -272,34 +291,35 @@ static inline void volk_32fc_x2_multiply_32fc_a_avx2_fma(lv_32fc_t* cVector, con
+ #include <immintrin.h>
+ #include <volk/volk_avx_intrinsics.h>
+
+-static inline void
+-volk_32fc_x2_multiply_32fc_a_avx(lv_32fc_t* cVector, const lv_32fc_t* aVector,
+- const lv_32fc_t* bVector, unsigned int num_points)
++static inline void volk_32fc_x2_multiply_32fc_a_avx(lv_32fc_t* cVector,
++ const lv_32fc_t* aVector,
++ const lv_32fc_t* bVector,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+- const unsigned int quarterPoints = num_points / 4;
+-
+- __m256 x, y, z;
+- lv_32fc_t* c = cVector;
+- const lv_32fc_t* a = aVector;
+- const lv_32fc_t* b = bVector;
+-
+- for(; number < quarterPoints; number++){
+- x = _mm256_load_ps((float*) a); // Load the ar + ai, br + bi ... as ar,ai,br,bi ...
+- y = _mm256_load_ps((float*) b); // Load the cr + ci, dr + di ... as cr,ci,dr,di ...
+- z = _mm256_complexmul_ps(x, y);
+- _mm256_store_ps((float*) c, z); // Store the results back into the C container
+-
+- a += 4;
+- b += 4;
+- c += 4;
+- }
+-
+- number = quarterPoints * 4;
+-
+- for(; number < num_points; number++){
+- *c++ = (*a++) * (*b++);
+- }
++ unsigned int number = 0;
++ const unsigned int quarterPoints = num_points / 4;
++
++ __m256 x, y, z;
++ lv_32fc_t* c = cVector;
++ const lv_32fc_t* a = aVector;
++ const lv_32fc_t* b = bVector;
++
++ for (; number < quarterPoints; number++) {
++ x = _mm256_load_ps((float*)a); // Load the ar + ai, br + bi ... as ar,ai,br,bi ...
++ y = _mm256_load_ps((float*)b); // Load the cr + ci, dr + di ... as cr,ci,dr,di ...
++ z = _mm256_complexmul_ps(x, y);
++ _mm256_store_ps((float*)c, z); // Store the results back into the C container
++
++ a += 4;
++ b += 4;
++ c += 4;
++ }
++
++ number = quarterPoints * 4;
++
++ for (; number < num_points; number++) {
++ *c++ = (*a++) * (*b++);
++ }
+ }
+ #endif /* LV_HAVE_AVX */
+
+@@ -307,50 +327,52 @@ volk_32fc_x2_multiply_32fc_a_avx(lv_32fc_t* cVector, const lv_32fc_t* aVector,
+ #include <pmmintrin.h>
+ #include <volk/volk_sse3_intrinsics.h>
+
+-static inline void
+-volk_32fc_x2_multiply_32fc_a_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector,
+- const lv_32fc_t* bVector, unsigned int num_points)
++static inline void volk_32fc_x2_multiply_32fc_a_sse3(lv_32fc_t* cVector,
++ const lv_32fc_t* aVector,
++ const lv_32fc_t* bVector,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+- const unsigned int halfPoints = num_points / 2;
+-
+- __m128 x, y, z;
+- lv_32fc_t* c = cVector;
+- const lv_32fc_t* a = aVector;
+- const lv_32fc_t* b = bVector;
+-
+- for(; number < halfPoints; number++){
+- x = _mm_load_ps((float*) a); // Load the ar + ai, br + bi as ar,ai,br,bi
+- y = _mm_load_ps((float*) b); // Load the cr + ci, dr + di as cr,ci,dr,di
+- z = _mm_complexmul_ps(x, y);
+- _mm_store_ps((float*) c, z); // Store the results back into the C container
+-
+- a += 2;
+- b += 2;
+- c += 2;
+- }
+-
+- if((num_points % 2) != 0){
+- *c = (*a) * (*b);
+- }
++ unsigned int number = 0;
++ const unsigned int halfPoints = num_points / 2;
++
++ __m128 x, y, z;
++ lv_32fc_t* c = cVector;
++ const lv_32fc_t* a = aVector;
++ const lv_32fc_t* b = bVector;
++
++ for (; number < halfPoints; number++) {
++ x = _mm_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
++ y = _mm_load_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di
++ z = _mm_complexmul_ps(x, y);
++ _mm_store_ps((float*)c, z); // Store the results back into the C container
++
++ a += 2;
++ b += 2;
++ c += 2;
++ }
++
++ if ((num_points % 2) != 0) {
++ *c = (*a) * (*b);
++ }
+ }
+ #endif /* LV_HAVE_SSE */
+
+
+ #ifdef LV_HAVE_GENERIC
+
+-static inline void
+-volk_32fc_x2_multiply_32fc_a_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector,
+- const lv_32fc_t* bVector, unsigned int num_points)
++static inline void volk_32fc_x2_multiply_32fc_a_generic(lv_32fc_t* cVector,
++ const lv_32fc_t* aVector,
++ const lv_32fc_t* bVector,
++ unsigned int num_points)
+ {
+- lv_32fc_t* cPtr = cVector;
+- const lv_32fc_t* aPtr = aVector;
+- const lv_32fc_t* bPtr= bVector;
+- unsigned int number = 0;
+-
+- for(number = 0; number < num_points; number++){
+- *cPtr++ = (*aPtr++) * (*bPtr++);
+- }
++ lv_32fc_t* cPtr = cVector;
++ const lv_32fc_t* aPtr = aVector;
++ const lv_32fc_t* bPtr = bVector;
++ unsigned int number = 0;
++
++ for (number = 0; number < num_points; number++) {
++ *cPtr++ = (*aPtr++) * (*bPtr++);
++ }
+ }
+ #endif /* LV_HAVE_GENERIC */
+
+@@ -358,113 +380,118 @@ volk_32fc_x2_multiply_32fc_a_generic(lv_32fc_t* cVector, const lv_32fc_t* aVecto
+ #ifdef LV_HAVE_NEON
+ #include <arm_neon.h>
+
+-static inline void
+-volk_32fc_x2_multiply_32fc_neon(lv_32fc_t* cVector, const lv_32fc_t* aVector,
+- const lv_32fc_t* bVector, unsigned int num_points)
++static inline void volk_32fc_x2_multiply_32fc_neon(lv_32fc_t* cVector,
++ const lv_32fc_t* aVector,
++ const lv_32fc_t* bVector,
++ unsigned int num_points)
+ {
+- lv_32fc_t *a_ptr = (lv_32fc_t*) aVector;
+- lv_32fc_t *b_ptr = (lv_32fc_t*) bVector;
+- unsigned int quarter_points = num_points / 4;
+- float32x4x2_t a_val, b_val, c_val;
+- float32x4x2_t tmp_real, tmp_imag;
+- unsigned int number = 0;
+-
+- for(number = 0; number < quarter_points; ++number) {
+- a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
+- b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
+- __VOLK_PREFETCH(a_ptr+4);
+- __VOLK_PREFETCH(b_ptr+4);
+-
+- // multiply the real*real and imag*imag to get real result
+- // a0r*b0r|a1r*b1r|a2r*b2r|a3r*b3r
+- tmp_real.val[0] = vmulq_f32(a_val.val[0], b_val.val[0]);
+- // a0i*b0i|a1i*b1i|a2i*b2i|a3i*b3i
+- tmp_real.val[1] = vmulq_f32(a_val.val[1], b_val.val[1]);
+-
+- // Multiply cross terms to get the imaginary result
+- // a0r*b0i|a1r*b1i|a2r*b2i|a3r*b3i
+- tmp_imag.val[0] = vmulq_f32(a_val.val[0], b_val.val[1]);
+- // a0i*b0r|a1i*b1r|a2i*b2r|a3i*b3r
+- tmp_imag.val[1] = vmulq_f32(a_val.val[1], b_val.val[0]);
+-
+- // store the results
+- c_val.val[0] = vsubq_f32(tmp_real.val[0], tmp_real.val[1]);
+- c_val.val[1] = vaddq_f32(tmp_imag.val[0], tmp_imag.val[1]);
+- vst2q_f32((float*)cVector, c_val);
+-
+- a_ptr += 4;
+- b_ptr += 4;
+- cVector += 4;
+- }
+-
+- for(number = quarter_points*4; number < num_points; number++){
+- *cVector++ = (*a_ptr++) * (*b_ptr++);
+- }
++ lv_32fc_t* a_ptr = (lv_32fc_t*)aVector;
++ lv_32fc_t* b_ptr = (lv_32fc_t*)bVector;
++ unsigned int quarter_points = num_points / 4;
++ float32x4x2_t a_val, b_val, c_val;
++ float32x4x2_t tmp_real, tmp_imag;
++ unsigned int number = 0;
++
++ for (number = 0; number < quarter_points; ++number) {
++ a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
++ b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
++ __VOLK_PREFETCH(a_ptr + 4);
++ __VOLK_PREFETCH(b_ptr + 4);
++
++ // multiply the real*real and imag*imag to get real result
++ // a0r*b0r|a1r*b1r|a2r*b2r|a3r*b3r
++ tmp_real.val[0] = vmulq_f32(a_val.val[0], b_val.val[0]);
++ // a0i*b0i|a1i*b1i|a2i*b2i|a3i*b3i
++ tmp_real.val[1] = vmulq_f32(a_val.val[1], b_val.val[1]);
++
++ // Multiply cross terms to get the imaginary result
++ // a0r*b0i|a1r*b1i|a2r*b2i|a3r*b3i
++ tmp_imag.val[0] = vmulq_f32(a_val.val[0], b_val.val[1]);
++ // a0i*b0r|a1i*b1r|a2i*b2r|a3i*b3r
++ tmp_imag.val[1] = vmulq_f32(a_val.val[1], b_val.val[0]);
++
++ // store the results
++ c_val.val[0] = vsubq_f32(tmp_real.val[0], tmp_real.val[1]);
++ c_val.val[1] = vaddq_f32(tmp_imag.val[0], tmp_imag.val[1]);
++ vst2q_f32((float*)cVector, c_val);
++
++ a_ptr += 4;
++ b_ptr += 4;
++ cVector += 4;
++ }
++
++ for (number = quarter_points * 4; number < num_points; number++) {
++ *cVector++ = (*a_ptr++) * (*b_ptr++);
++ }
+ }
+ #endif /* LV_HAVE_NEON */
+
+
+ #ifdef LV_HAVE_NEON
+
+-static inline void
+-volk_32fc_x2_multiply_32fc_neon_opttests(lv_32fc_t* cVector, const lv_32fc_t* aVector,
+- const lv_32fc_t* bVector, unsigned int num_points)
++static inline void volk_32fc_x2_multiply_32fc_neon_opttests(lv_32fc_t* cVector,
++ const lv_32fc_t* aVector,
++ const lv_32fc_t* bVector,
++ unsigned int num_points)
+ {
+- lv_32fc_t *a_ptr = (lv_32fc_t*) aVector;
+- lv_32fc_t *b_ptr = (lv_32fc_t*) bVector;
+- unsigned int quarter_points = num_points / 4;
+- float32x4x2_t a_val, b_val;
+- float32x4x2_t tmp_imag;
+- unsigned int number = 0;
+-
+- for(number = 0; number < quarter_points; ++number) {
+- a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
+- b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
+- __VOLK_PREFETCH(a_ptr+4);
+- __VOLK_PREFETCH(b_ptr+4);
+-
+- // do the first multiply
+- tmp_imag.val[1] = vmulq_f32(a_val.val[1], b_val.val[0]);
+- tmp_imag.val[0] = vmulq_f32(a_val.val[0], b_val.val[0]);
+-
+- // use multiply accumulate/subtract to get result
+- tmp_imag.val[1] = vmlaq_f32(tmp_imag.val[1], a_val.val[0], b_val.val[1]);
+- tmp_imag.val[0] = vmlsq_f32(tmp_imag.val[0], a_val.val[1], b_val.val[1]);
+-
+- // store
+- vst2q_f32((float*)cVector, tmp_imag);
+- // increment pointers
+- a_ptr += 4;
+- b_ptr += 4;
+- cVector += 4;
+- }
+-
+- for(number = quarter_points*4; number < num_points; number++){
+- *cVector++ = (*a_ptr++) * (*b_ptr++);
+- }
++ lv_32fc_t* a_ptr = (lv_32fc_t*)aVector;
++ lv_32fc_t* b_ptr = (lv_32fc_t*)bVector;
++ unsigned int quarter_points = num_points / 4;
++ float32x4x2_t a_val, b_val;
++ float32x4x2_t tmp_imag;
++ unsigned int number = 0;
++
++ for (number = 0; number < quarter_points; ++number) {
++ a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
++ b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
++ __VOLK_PREFETCH(a_ptr + 4);
++ __VOLK_PREFETCH(b_ptr + 4);
++
++ // do the first multiply
++ tmp_imag.val[1] = vmulq_f32(a_val.val[1], b_val.val[0]);
++ tmp_imag.val[0] = vmulq_f32(a_val.val[0], b_val.val[0]);
++
++ // use multiply accumulate/subtract to get result
++ tmp_imag.val[1] = vmlaq_f32(tmp_imag.val[1], a_val.val[0], b_val.val[1]);
++ tmp_imag.val[0] = vmlsq_f32(tmp_imag.val[0], a_val.val[1], b_val.val[1]);
++
++ // store
++ vst2q_f32((float*)cVector, tmp_imag);
++ // increment pointers
++ a_ptr += 4;
++ b_ptr += 4;
++ cVector += 4;
++ }
++
++ for (number = quarter_points * 4; number < num_points; number++) {
++ *cVector++ = (*a_ptr++) * (*b_ptr++);
++ }
+ }
+ #endif /* LV_HAVE_NEON */
+
+
+ #ifdef LV_HAVE_NEONV7
+
+-extern void
+-volk_32fc_x2_multiply_32fc_a_neonasm(lv_32fc_t* cVector, const lv_32fc_t* aVector,
+- const lv_32fc_t* bVector, unsigned int num_points);
++extern void volk_32fc_x2_multiply_32fc_a_neonasm(lv_32fc_t* cVector,
++ const lv_32fc_t* aVector,
++ const lv_32fc_t* bVector,
++ unsigned int num_points);
+ #endif /* LV_HAVE_NEONV7 */
+
+
+ #ifdef LV_HAVE_ORC
+
+-extern void
+-volk_32fc_x2_multiply_32fc_a_orc_impl(lv_32fc_t* cVector, const lv_32fc_t* aVector,
+- const lv_32fc_t* bVector, unsigned int num_points);
++extern void volk_32fc_x2_multiply_32fc_a_orc_impl(lv_32fc_t* cVector,
++ const lv_32fc_t* aVector,
++ const lv_32fc_t* bVector,
++ unsigned int num_points);
+
+-static inline void
+-volk_32fc_x2_multiply_32fc_u_orc(lv_32fc_t* cVector, const lv_32fc_t* aVector,
+- const lv_32fc_t* bVector, unsigned int num_points)
++static inline void volk_32fc_x2_multiply_32fc_u_orc(lv_32fc_t* cVector,
++ const lv_32fc_t* aVector,
++ const lv_32fc_t* bVector,
++ unsigned int num_points)
+ {
+- volk_32fc_x2_multiply_32fc_a_orc_impl(cVector, aVector, bVector, num_points);
++ volk_32fc_x2_multiply_32fc_a_orc_impl(cVector, aVector, bVector, num_points);
+ }
+
+ #endif /* LV_HAVE_ORC */
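A short usage sketch for the dispatcher covered by this file; the allocation helpers and lv_cmake come from the public VOLK headers, and the buffer size and sample values are illustrative only:

#include <volk/volk.h>

int main(void)
{
    const unsigned int num_points = 1024;
    const size_t alignment = volk_get_alignment();

    /* volk_malloc returns buffers aligned for the _a (aligned) kernels. */
    lv_32fc_t* a = (lv_32fc_t*)volk_malloc(num_points * sizeof(lv_32fc_t), alignment);
    lv_32fc_t* b = (lv_32fc_t*)volk_malloc(num_points * sizeof(lv_32fc_t), alignment);
    lv_32fc_t* c = (lv_32fc_t*)volk_malloc(num_points * sizeof(lv_32fc_t), alignment);

    for (unsigned int i = 0; i < num_points; i++) {
        a[i] = lv_cmake(1.0f, 0.5f);
        b[i] = lv_cmake(0.25f, -1.0f);
    }

    /* The dispatcher picks the fastest available kernel (AVX2+FMA, AVX,
     * SSE3, NEON, ORC or generic) at run time. */
    volk_32fc_x2_multiply_32fc(c, a, b, num_points);

    volk_free(a);
    volk_free(b);
    volk_free(c);
    return 0;
}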
+diff --git a/kernels/volk/volk_32fc_x2_multiply_conjugate_32fc.h b/kernels/volk/volk_32fc_x2_multiply_conjugate_32fc.h
+index 1b1a8b3..4f834c2 100644
+--- a/kernels/volk/volk_32fc_x2_multiply_conjugate_32fc.h
++++ b/kernels/volk/volk_32fc_x2_multiply_conjugate_32fc.h
+@@ -30,8 +30,8 @@
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+- * void volk_32fc_x2_multiply_conjugate_32fc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points);
+- * \endcode
++ * void volk_32fc_x2_multiply_conjugate_32fc(lv_32fc_t* cVector, const lv_32fc_t* aVector,
++ * const lv_32fc_t* bVector, unsigned int num_points); \endcode
+ *
+ * \b Inputs
+ * \li aVector: The first input vector of complex floats.
+@@ -71,43 +71,46 @@
+ #ifndef INCLUDED_volk_32fc_x2_multiply_conjugate_32fc_u_H
+ #define INCLUDED_volk_32fc_x2_multiply_conjugate_32fc_u_H
+
++#include <float.h>
+ #include <inttypes.h>
+ #include <stdio.h>
+ #include <volk/volk_complex.h>
+-#include <float.h>
+
+ #ifdef LV_HAVE_AVX
+ #include <immintrin.h>
+ #include <volk/volk_avx_intrinsics.h>
+
+-static inline void
+-volk_32fc_x2_multiply_conjugate_32fc_u_avx(lv_32fc_t* cVector, const lv_32fc_t* aVector,
+- const lv_32fc_t* bVector, unsigned int num_points)
++static inline void volk_32fc_x2_multiply_conjugate_32fc_u_avx(lv_32fc_t* cVector,
++ const lv_32fc_t* aVector,
++ const lv_32fc_t* bVector,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+- const unsigned int quarterPoints = num_points / 4;
+-
+- __m256 x, y, z;
+- lv_32fc_t* c = cVector;
+- const lv_32fc_t* a = aVector;
+- const lv_32fc_t* b = bVector;
+-
+- for(; number < quarterPoints; number++){
+- x = _mm256_loadu_ps((float*) a); // Load the ar + ai, br + bi ... as ar,ai,br,bi ...
+- y = _mm256_loadu_ps((float*) b); // Load the cr + ci, dr + di ... as cr,ci,dr,di ...
+- z = _mm256_complexconjugatemul_ps(x, y);
+- _mm256_storeu_ps((float*) c, z); // Store the results back into the C container
+-
+- a += 4;
+- b += 4;
+- c += 4;
+- }
+-
+- number = quarterPoints * 4;
+-
+- for(; number < num_points; number++){
+- *c++ = (*a++) * lv_conj(*b++);
+- }
++ unsigned int number = 0;
++ const unsigned int quarterPoints = num_points / 4;
++
++ __m256 x, y, z;
++ lv_32fc_t* c = cVector;
++ const lv_32fc_t* a = aVector;
++ const lv_32fc_t* b = bVector;
++
++ for (; number < quarterPoints; number++) {
++ x = _mm256_loadu_ps(
++ (float*)a); // Load the ar + ai, br + bi ... as ar,ai,br,bi ...
++ y = _mm256_loadu_ps(
++ (float*)b); // Load the cr + ci, dr + di ... as cr,ci,dr,di ...
++ z = _mm256_complexconjugatemul_ps(x, y);
++ _mm256_storeu_ps((float*)c, z); // Store the results back into the C container
++
++ a += 4;
++ b += 4;
++ c += 4;
++ }
++
++ number = quarterPoints * 4;
++
++ for (; number < num_points; number++) {
++ *c++ = (*a++) * lv_conj(*b++);
++ }
+ }
+ #endif /* LV_HAVE_AVX */
+
+@@ -116,96 +119,98 @@ volk_32fc_x2_multiply_conjugate_32fc_u_avx(lv_32fc_t* cVector, const lv_32fc_t*
+ #include <pmmintrin.h>
+ #include <volk/volk_sse3_intrinsics.h>
+
+-static inline void
+-volk_32fc_x2_multiply_conjugate_32fc_u_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector,
+- const lv_32fc_t* bVector, unsigned int num_points)
++static inline void volk_32fc_x2_multiply_conjugate_32fc_u_sse3(lv_32fc_t* cVector,
++ const lv_32fc_t* aVector,
++ const lv_32fc_t* bVector,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+- const unsigned int halfPoints = num_points / 2;
+-
+- __m128 x, y, z;
+- lv_32fc_t* c = cVector;
+- const lv_32fc_t* a = aVector;
+- const lv_32fc_t* b = bVector;
+-
+- for(; number < halfPoints; number++){
+- x = _mm_loadu_ps((float*) a); // Load the ar + ai, br + bi as ar,ai,br,bi
+- y = _mm_loadu_ps((float*) b); // Load the cr + ci, dr + di as cr,ci,dr,di
+- z = _mm_complexconjugatemul_ps(x, y);
+- _mm_storeu_ps((float*) c, z); // Store the results back into the C container
+-
+- a += 2;
+- b += 2;
+- c += 2;
+- }
+-
+- if((num_points % 2) != 0){
+- *c = (*a) * lv_conj(*b);
+- }
++ unsigned int number = 0;
++ const unsigned int halfPoints = num_points / 2;
++
++ __m128 x, y, z;
++ lv_32fc_t* c = cVector;
++ const lv_32fc_t* a = aVector;
++ const lv_32fc_t* b = bVector;
++
++ for (; number < halfPoints; number++) {
++ x = _mm_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
++ y = _mm_loadu_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di
++ z = _mm_complexconjugatemul_ps(x, y);
++ _mm_storeu_ps((float*)c, z); // Store the results back into the C container
++
++ a += 2;
++ b += 2;
++ c += 2;
++ }
++
++ if ((num_points % 2) != 0) {
++ *c = (*a) * lv_conj(*b);
++ }
+ }
+ #endif /* LV_HAVE_SSE */
+
+
+ #ifdef LV_HAVE_GENERIC
+
+-static inline void
+-volk_32fc_x2_multiply_conjugate_32fc_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector,
+- const lv_32fc_t* bVector, unsigned int num_points)
++static inline void volk_32fc_x2_multiply_conjugate_32fc_generic(lv_32fc_t* cVector,
++ const lv_32fc_t* aVector,
++ const lv_32fc_t* bVector,
++ unsigned int num_points)
+ {
+- lv_32fc_t* cPtr = cVector;
+- const lv_32fc_t* aPtr = aVector;
+- const lv_32fc_t* bPtr= bVector;
+- unsigned int number = 0;
+-
+- for(number = 0; number < num_points; number++){
+- *cPtr++ = (*aPtr++) * lv_conj(*bPtr++);
+- }
++ lv_32fc_t* cPtr = cVector;
++ const lv_32fc_t* aPtr = aVector;
++ const lv_32fc_t* bPtr = bVector;
++ unsigned int number = 0;
++
++ for (number = 0; number < num_points; number++) {
++ *cPtr++ = (*aPtr++) * lv_conj(*bPtr++);
++ }
+ }
+ #endif /* LV_HAVE_GENERIC */
+
+
+-
+ #endif /* INCLUDED_volk_32fc_x2_multiply_conjugate_32fc_u_H */
+ #ifndef INCLUDED_volk_32fc_x2_multiply_conjugate_32fc_a_H
+ #define INCLUDED_volk_32fc_x2_multiply_conjugate_32fc_a_H
+
++#include <float.h>
+ #include <inttypes.h>
+ #include <stdio.h>
+ #include <volk/volk_complex.h>
+-#include <float.h>
+
+ #ifdef LV_HAVE_AVX
+ #include <immintrin.h>
+ #include <volk/volk_avx_intrinsics.h>
+
+-static inline void
+-volk_32fc_x2_multiply_conjugate_32fc_a_avx(lv_32fc_t* cVector, const lv_32fc_t* aVector,
+- const lv_32fc_t* bVector, unsigned int num_points)
++static inline void volk_32fc_x2_multiply_conjugate_32fc_a_avx(lv_32fc_t* cVector,
++ const lv_32fc_t* aVector,
++ const lv_32fc_t* bVector,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+- const unsigned int quarterPoints = num_points / 4;
+-
+- __m256 x, y, z;
+- lv_32fc_t* c = cVector;
+- const lv_32fc_t* a = aVector;
+- const lv_32fc_t* b = bVector;
+-
+- for(; number < quarterPoints; number++){
+- x = _mm256_load_ps((float*) a); // Load the ar + ai, br + bi ... as ar,ai,br,bi ...
+- y = _mm256_load_ps((float*) b); // Load the cr + ci, dr + di ... as cr,ci,dr,di ...
+- z = _mm256_complexconjugatemul_ps(x, y);
+- _mm256_store_ps((float*) c, z); // Store the results back into the C container
+-
+- a += 4;
+- b += 4;
+- c += 4;
+- }
+-
+- number = quarterPoints * 4;
+-
+- for(; number < num_points; number++){
+- *c++ = (*a++) * lv_conj(*b++);
+- }
++ unsigned int number = 0;
++ const unsigned int quarterPoints = num_points / 4;
++
++ __m256 x, y, z;
++ lv_32fc_t* c = cVector;
++ const lv_32fc_t* a = aVector;
++ const lv_32fc_t* b = bVector;
++
++ for (; number < quarterPoints; number++) {
++ x = _mm256_load_ps((float*)a); // Load the ar + ai, br + bi ... as ar,ai,br,bi ...
++ y = _mm256_load_ps((float*)b); // Load the cr + ci, dr + di ... as cr,ci,dr,di ...
++ z = _mm256_complexconjugatemul_ps(x, y);
++ _mm256_store_ps((float*)c, z); // Store the results back into the C container
++
++ a += 4;
++ b += 4;
++ c += 4;
++ }
++
++ number = quarterPoints * 4;
++
++ for (; number < num_points; number++) {
++ *c++ = (*a++) * lv_conj(*b++);
++ }
+ }
+ #endif /* LV_HAVE_AVX */
+
+@@ -214,32 +219,33 @@ volk_32fc_x2_multiply_conjugate_32fc_a_avx(lv_32fc_t* cVector, const lv_32fc_t*
+ #include <pmmintrin.h>
+ #include <volk/volk_sse3_intrinsics.h>
+
+-static inline void
+-volk_32fc_x2_multiply_conjugate_32fc_a_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector,
+- const lv_32fc_t* bVector, unsigned int num_points)
++static inline void volk_32fc_x2_multiply_conjugate_32fc_a_sse3(lv_32fc_t* cVector,
++ const lv_32fc_t* aVector,
++ const lv_32fc_t* bVector,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+- const unsigned int halfPoints = num_points / 2;
+-
+- __m128 x, y, z;
+- lv_32fc_t* c = cVector;
+- const lv_32fc_t* a = aVector;
+- const lv_32fc_t* b = bVector;
+-
+- for(; number < halfPoints; number++){
+- x = _mm_load_ps((float*) a); // Load the ar + ai, br + bi as ar,ai,br,bi
+- y = _mm_load_ps((float*) b); // Load the cr + ci, dr + di as cr,ci,dr,di
+- z = _mm_complexconjugatemul_ps(x, y);
+- _mm_store_ps((float*) c, z); // Store the results back into the C container
+-
+- a += 2;
+- b += 2;
+- c += 2;
+- }
+-
+- if((num_points % 2) != 0){
+- *c = (*a) * lv_conj(*b);
+- }
++ unsigned int number = 0;
++ const unsigned int halfPoints = num_points / 2;
++
++ __m128 x, y, z;
++ lv_32fc_t* c = cVector;
++ const lv_32fc_t* a = aVector;
++ const lv_32fc_t* b = bVector;
++
++ for (; number < halfPoints; number++) {
++ x = _mm_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
++ y = _mm_load_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di
++ z = _mm_complexconjugatemul_ps(x, y);
++ _mm_store_ps((float*)c, z); // Store the results back into the C container
++
++ a += 2;
++ b += 2;
++ c += 2;
++ }
++
++ if ((num_points % 2) != 0) {
++ *c = (*a) * lv_conj(*b);
++ }
+ }
+ #endif /* LV_HAVE_SSE */
+
+@@ -247,49 +253,50 @@ volk_32fc_x2_multiply_conjugate_32fc_a_sse3(lv_32fc_t* cVector, const lv_32fc_t*
+ #ifdef LV_HAVE_NEON
+ #include <arm_neon.h>
+
+-static inline void
+-volk_32fc_x2_multiply_conjugate_32fc_neon(lv_32fc_t* cVector, const lv_32fc_t* aVector,
+- const lv_32fc_t* bVector, unsigned int num_points)
++static inline void volk_32fc_x2_multiply_conjugate_32fc_neon(lv_32fc_t* cVector,
++ const lv_32fc_t* aVector,
++ const lv_32fc_t* bVector,
++ unsigned int num_points)
+ {
+- lv_32fc_t *a_ptr = (lv_32fc_t*) aVector;
+- lv_32fc_t *b_ptr = (lv_32fc_t*) bVector;
+- unsigned int quarter_points = num_points / 4;
+- float32x4x2_t a_val, b_val, c_val;
+- float32x4x2_t tmp_real, tmp_imag;
+- unsigned int number = 0;
+-
+- for(number = 0; number < quarter_points; ++number) {
+- a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
+- b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
+- b_val.val[1] = vnegq_f32(b_val.val[1]);
+- __VOLK_PREFETCH(a_ptr+4);
+- __VOLK_PREFETCH(b_ptr+4);
+-
+- // multiply the real*real and imag*imag to get real result
+- // a0r*b0r|a1r*b1r|a2r*b2r|a3r*b3r
+- tmp_real.val[0] = vmulq_f32(a_val.val[0], b_val.val[0]);
+- // a0i*b0i|a1i*b1i|a2i*b2i|a3i*b3i
+- tmp_real.val[1] = vmulq_f32(a_val.val[1], b_val.val[1]);
+-
+- // Multiply cross terms to get the imaginary result
++ lv_32fc_t* a_ptr = (lv_32fc_t*)aVector;
++ lv_32fc_t* b_ptr = (lv_32fc_t*)bVector;
++ unsigned int quarter_points = num_points / 4;
++ float32x4x2_t a_val, b_val, c_val;
++ float32x4x2_t tmp_real, tmp_imag;
++ unsigned int number = 0;
++
++ for (number = 0; number < quarter_points; ++number) {
++ a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
++ b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
++ b_val.val[1] = vnegq_f32(b_val.val[1]);
++ __VOLK_PREFETCH(a_ptr + 4);
++ __VOLK_PREFETCH(b_ptr + 4);
++
++ // multiply the real*real and imag*imag to get real result
++ // a0r*b0r|a1r*b1r|a2r*b2r|a3r*b3r
++ tmp_real.val[0] = vmulq_f32(a_val.val[0], b_val.val[0]);
++ // a0i*b0i|a1i*b1i|a2i*b2i|a3i*b3i
++ tmp_real.val[1] = vmulq_f32(a_val.val[1], b_val.val[1]);
++
++ // Multiply cross terms to get the imaginary result
+ // a0r*b0i|a1r*b1i|a2r*b2i|a3r*b3i
+- tmp_imag.val[0] = vmulq_f32(a_val.val[0], b_val.val[1]);
+- // a0i*b0r|a1i*b1r|a2i*b2r|a3i*b3r
+- tmp_imag.val[1] = vmulq_f32(a_val.val[1], b_val.val[0]);
+-
+- // store the results
+- c_val.val[0] = vsubq_f32(tmp_real.val[0], tmp_real.val[1]);
+- c_val.val[1] = vaddq_f32(tmp_imag.val[0], tmp_imag.val[1]);
+- vst2q_f32((float*)cVector, c_val);
+-
+- a_ptr += 4;
+- b_ptr += 4;
+- cVector += 4;
++ tmp_imag.val[0] = vmulq_f32(a_val.val[0], b_val.val[1]);
++ // a0i*b0r|a1i*b1r|a2i*b2r|a3i*b3r
++ tmp_imag.val[1] = vmulq_f32(a_val.val[1], b_val.val[0]);
++
++ // store the results
++ c_val.val[0] = vsubq_f32(tmp_real.val[0], tmp_real.val[1]);
++ c_val.val[1] = vaddq_f32(tmp_imag.val[0], tmp_imag.val[1]);
++ vst2q_f32((float*)cVector, c_val);
++
++ a_ptr += 4;
++ b_ptr += 4;
++ cVector += 4;
+ }
+
+- for(number = quarter_points*4; number < num_points; number++){
+- *cVector++ = (*a_ptr++) * conj(*b_ptr++);
+- }
++ for (number = quarter_points * 4; number < num_points; number++) {
++ *cVector++ = (*a_ptr++) * conj(*b_ptr++);
++ }
+ }
+ #endif /* LV_HAVE_NEON */
+
+@@ -297,17 +304,19 @@ volk_32fc_x2_multiply_conjugate_32fc_neon(lv_32fc_t* cVector, const lv_32fc_t* a
+ #ifdef LV_HAVE_GENERIC
+
+ static inline void
+-volk_32fc_x2_multiply_conjugate_32fc_a_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector,
+- const lv_32fc_t* bVector, unsigned int num_points)
++volk_32fc_x2_multiply_conjugate_32fc_a_generic(lv_32fc_t* cVector,
++ const lv_32fc_t* aVector,
++ const lv_32fc_t* bVector,
++ unsigned int num_points)
+ {
+- lv_32fc_t* cPtr = cVector;
+- const lv_32fc_t* aPtr = aVector;
+- const lv_32fc_t* bPtr= bVector;
+- unsigned int number = 0;
+-
+- for(number = 0; number < num_points; number++){
+- *cPtr++ = (*aPtr++) * lv_conj(*bPtr++);
+- }
++ lv_32fc_t* cPtr = cVector;
++ const lv_32fc_t* aPtr = aVector;
++ const lv_32fc_t* bPtr = bVector;
++ unsigned int number = 0;
++
++ for (number = 0; number < num_points; number++) {
++ *cPtr++ = (*aPtr++) * lv_conj(*bPtr++);
++ }
+ }
+ #endif /* LV_HAVE_GENERIC */
+
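The conjugate-multiply kernels above reuse the plain complex-multiply pattern after flipping the sign of the second operand's imaginary part (the NEON variant does this explicitly with vnegq_f32 on the deinterleaved imaginary lane). A per-element scalar sketch, again assuming C99 float complex for lv_32fc_t:

#include <complex.h>

/* a * conj(b): real = ar*br + ai*bi, imag = ai*br - ar*bi */
static inline float complex conjmul_ref(float complex a, float complex b)
{
    const float ar = crealf(a), ai = cimagf(a);
    const float br = crealf(b), bi = cimagf(b);
    return (ar * br + ai * bi) + (ai * br - ar * bi) * I;
}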
+diff --git a/kernels/volk/volk_32fc_x2_s32f_square_dist_scalar_mult_32f.h b/kernels/volk/volk_32fc_x2_s32f_square_dist_scalar_mult_32f.h
+index 1c65f23..1d10561 100644
+--- a/kernels/volk/volk_32fc_x2_s32f_square_dist_scalar_mult_32f.h
++++ b/kernels/volk/volk_32fc_x2_s32f_square_dist_scalar_mult_32f.h
+@@ -30,8 +30,8 @@
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+- * void volk_32fc_x2_s32f_square_dist_scalar_mult_32f(float* target, lv_32fc_t* src0, lv_32fc_t* points, float scalar, unsigned int num_points)
+- * \endcode
++ * void volk_32fc_x2_s32f_square_dist_scalar_mult_32f(float* target, lv_32fc_t* src0,
++ * lv_32fc_t* points, float scalar, unsigned int num_points) \endcode
+ *
+ * \b Inputs
+ * \li src0: The complex input. Only the first point is used.
+@@ -79,103 +79,107 @@
+ #ifndef INCLUDED_volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_H
+ #define INCLUDED_volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_H
+
+-#include<volk/volk_complex.h>
++#include <volk/volk_complex.h>
+
+
+-static inline void
+-calculate_scaled_distances(float* target, const lv_32fc_t symbol, const lv_32fc_t* points,
+- const float scalar, const unsigned int num_points)
++static inline void calculate_scaled_distances(float* target,
++ const lv_32fc_t symbol,
++ const lv_32fc_t* points,
++ const float scalar,
++ const unsigned int num_points)
+ {
+- lv_32fc_t diff;
+- for(unsigned int i = 0; i < num_points; ++i) {
+- /*
+- * Calculate: |y - x|^2 * SNR_lin
+- * Compare C++: *target++ = scalar * std::norm(symbol - *constellation++);
+- */
+- diff = symbol - *points++;
+- *target++ = scalar * (lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff));
+- }
++ lv_32fc_t diff;
++ for (unsigned int i = 0; i < num_points; ++i) {
++ /*
++ * Calculate: |y - x|^2 * SNR_lin
++ * Compare C++: *target++ = scalar * std::norm(symbol - *constellation++);
++ */
++ diff = symbol - *points++;
++ *target++ =
++ scalar * (lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff));
++ }
+ }
+
+
+ #ifdef LV_HAVE_AVX2
+-#include<immintrin.h>
+-#include<volk/volk_avx2_intrinsics.h>
++#include <immintrin.h>
++#include <volk/volk_avx2_intrinsics.h>
+
+ static inline void
+-volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_avx2(float* target, lv_32fc_t* src0,
+- lv_32fc_t* points, float scalar,
++volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_avx2(float* target,
++ lv_32fc_t* src0,
++ lv_32fc_t* points,
++ float scalar,
+ unsigned int num_points)
+ {
+- const unsigned int num_bytes = num_points*8;
+- __m128 xmm9, xmm10;
+- __m256 xmm4, xmm6;
+- __m256 xmm_points0, xmm_points1, xmm_result;
++ const unsigned int num_bytes = num_points * 8;
++ __m128 xmm9, xmm10;
++ __m256 xmm4, xmm6;
++ __m256 xmm_points0, xmm_points1, xmm_result;
+
+- const unsigned int bound = num_bytes >> 6;
+-
+- // load complex value into all parts of the register.
+- const __m256 xmm_symbol = _mm256_castpd_ps(_mm256_broadcast_sd((const double*)src0));
+- const __m128 xmm128_symbol = _mm256_extractf128_ps(xmm_symbol, 1);
+-
+- // Load scalar into all 8 parts of the register
+- const __m256 xmm_scalar = _mm256_broadcast_ss(&scalar);
+- const __m128 xmm128_scalar = _mm256_extractf128_ps(xmm_scalar, 1);
++ const unsigned int bound = num_bytes >> 6;
+
+- // Set permutation constant
+- const __m256i idx = _mm256_set_epi32(7,6,3,2,5,4,1,0);
+-
+- for(unsigned int i = 0; i < bound; ++i) {
+- xmm_points0 = _mm256_load_ps((float*)points);
+- xmm_points1 = _mm256_load_ps((float*)(points + 4));
+- points += 8;
+- __VOLK_PREFETCH(points);
++ // load complex value into all parts of the register.
++ const __m256 xmm_symbol = _mm256_castpd_ps(_mm256_broadcast_sd((const double*)src0));
++ const __m128 xmm128_symbol = _mm256_extractf128_ps(xmm_symbol, 1);
+
+- xmm_result = _mm256_scaled_norm_dist_ps_avx2(xmm_symbol, xmm_symbol,
+- xmm_points0, xmm_points1,
+- xmm_scalar);
+-
+- _mm256_store_ps(target, xmm_result);
+- target += 8;
+- }
++ // Load scalar into all 8 parts of the register
++ const __m256 xmm_scalar = _mm256_broadcast_ss(&scalar);
++ const __m128 xmm128_scalar = _mm256_extractf128_ps(xmm_scalar, 1);
+
+- if (num_bytes >> 5 & 1) {
+- xmm_points0 = _mm256_load_ps((float*)points);
++ // Set permutation constant
++ const __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
+
+- xmm4 = _mm256_sub_ps(xmm_symbol, xmm_points0);
++ for (unsigned int i = 0; i < bound; ++i) {
++ xmm_points0 = _mm256_load_ps((float*)points);
++ xmm_points1 = _mm256_load_ps((float*)(points + 4));
++ points += 8;
++ __VOLK_PREFETCH(points);
+
+- points += 4;
++ xmm_result = _mm256_scaled_norm_dist_ps_avx2(
++ xmm_symbol, xmm_symbol, xmm_points0, xmm_points1, xmm_scalar);
+
+- xmm6 = _mm256_mul_ps(xmm4, xmm4);
++ _mm256_store_ps(target, xmm_result);
++ target += 8;
++ }
+
+- xmm4 = _mm256_hadd_ps(xmm6, xmm6);
+- xmm4 = _mm256_permutevar8x32_ps(xmm4, idx);
++ if (num_bytes >> 5 & 1) {
++ xmm_points0 = _mm256_load_ps((float*)points);
+
+- xmm_result = _mm256_mul_ps(xmm4, xmm_scalar);
++ xmm4 = _mm256_sub_ps(xmm_symbol, xmm_points0);
+
+- xmm9 = _mm256_extractf128_ps(xmm_result, 1);
+- _mm_store_ps(target,xmm9);
+- target += 4;
+- }
++ points += 4;
+
+- if (num_bytes >> 4 & 1) {
+- xmm9 = _mm_load_ps((float*)points);
++ xmm6 = _mm256_mul_ps(xmm4, xmm4);
+
+- xmm10 = _mm_sub_ps(xmm128_symbol, xmm9);
++ xmm4 = _mm256_hadd_ps(xmm6, xmm6);
++ xmm4 = _mm256_permutevar8x32_ps(xmm4, idx);
+
+- points += 2;
++ xmm_result = _mm256_mul_ps(xmm4, xmm_scalar);
+
+- xmm9 = _mm_mul_ps(xmm10, xmm10);
++ xmm9 = _mm256_extractf128_ps(xmm_result, 1);
++ _mm_store_ps(target, xmm9);
++ target += 4;
++ }
+
+- xmm10 = _mm_hadd_ps(xmm9, xmm9);
++ if (num_bytes >> 4 & 1) {
++ xmm9 = _mm_load_ps((float*)points);
+
+- xmm10 = _mm_mul_ps(xmm10, xmm128_scalar);
++ xmm10 = _mm_sub_ps(xmm128_symbol, xmm9);
+
+- _mm_storeh_pi((__m64*)target, xmm10);
+- target += 2;
+- }
++ points += 2;
+
+- calculate_scaled_distances(target, src0[0], points, scalar, (num_bytes >> 3) & 1);
++ xmm9 = _mm_mul_ps(xmm10, xmm10);
++
++ xmm10 = _mm_hadd_ps(xmm9, xmm9);
++
++ xmm10 = _mm_mul_ps(xmm10, xmm128_scalar);
++
++ _mm_storeh_pi((__m64*)target, xmm10);
++ target += 2;
++ }
++
++ calculate_scaled_distances(target, src0[0], points, scalar, (num_bytes >> 3) & 1);
+ }
+
+ #endif /*LV_HAVE_AVX2*/
+@@ -186,131 +190,139 @@ volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_avx2(float* target, lv_32fc_t* s
+ #include <volk/volk_avx_intrinsics.h>
+
+ static inline void
+-volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_avx(float *target, lv_32fc_t *src0,
+- lv_32fc_t *points, float scalar,
+- unsigned int num_points) {
+- const int eightsPoints = num_points / 8;
+- const int remainder = num_points - 8 * eightsPoints;
+-
+- __m256 xmm_points0, xmm_points1, xmm_result;
+-
+- // load complex value into all parts of the register.
+- const __m256 xmm_symbol = _mm256_castpd_ps(_mm256_broadcast_sd((const double*)src0));
+-
+- // Load scalar into all 8 parts of the register
+- const __m256 xmm_scalar = _mm256_broadcast_ss(&scalar);
+-
+- for(int i = 0; i < eightsPoints; ++i){
+- xmm_points0 = _mm256_load_ps((float*)points);
+- xmm_points1 = _mm256_load_ps((float*)(points + 4));
+- points += 8;
+-
+- xmm_result = _mm256_scaled_norm_dist_ps(xmm_symbol, xmm_symbol, xmm_points0,
+- xmm_points1, xmm_scalar);
+-
+- _mm256_store_ps(target, xmm_result);
+- target += 8;
+- }
+-
+- const lv_32fc_t symbol = *src0;
+- calculate_scaled_distances(target, symbol, points, scalar, remainder);
++volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_avx(float* target,
++ lv_32fc_t* src0,
++ lv_32fc_t* points,
++ float scalar,
++ unsigned int num_points)
++{
++ const int eightsPoints = num_points / 8;
++ const int remainder = num_points - 8 * eightsPoints;
++
++ __m256 xmm_points0, xmm_points1, xmm_result;
++
++ // load complex value into all parts of the register.
++ const __m256 xmm_symbol = _mm256_castpd_ps(_mm256_broadcast_sd((const double*)src0));
++
++ // Load scalar into all 8 parts of the register
++ const __m256 xmm_scalar = _mm256_broadcast_ss(&scalar);
++
++ for (int i = 0; i < eightsPoints; ++i) {
++ xmm_points0 = _mm256_load_ps((float*)points);
++ xmm_points1 = _mm256_load_ps((float*)(points + 4));
++ points += 8;
++
++ xmm_result = _mm256_scaled_norm_dist_ps(
++ xmm_symbol, xmm_symbol, xmm_points0, xmm_points1, xmm_scalar);
++
++ _mm256_store_ps(target, xmm_result);
++ target += 8;
++ }
++
++ const lv_32fc_t symbol = *src0;
++ calculate_scaled_distances(target, symbol, points, scalar, remainder);
+ }
+
+ #endif /* LV_HAVE_AVX */
+
+
+ #ifdef LV_HAVE_SSE3
+-#include<pmmintrin.h>
+-#include<volk/volk_sse3_intrinsics.h>
++#include <pmmintrin.h>
++#include <volk/volk_sse3_intrinsics.h>
+
+ static inline void
+-volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_sse3(float* target, lv_32fc_t* src0,
+- lv_32fc_t* points, float scalar,
++volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_sse3(float* target,
++ lv_32fc_t* src0,
++ lv_32fc_t* points,
++ float scalar,
+ unsigned int num_points)
+ {
+- __m128 xmm_points0, xmm_points1, xmm_result;
+-
+- /*
+- * First do 4 values in every loop iteration.
+- * There may be up to 3 values left.
+- * leftovers0 indicates if at least 2 more are available for SSE execution.
+- * leftovers1 indicates if there is a single element left.
+- */
+- const int quarterPoints = num_points / 4;
+- const int leftovers0 = (num_points / 2) - 2 * quarterPoints;
+- const int leftovers1 = num_points % 2;
+-
+- // load complex value into both parts of the register.
+- const __m128 xmm_symbol = _mm_castpd_ps(_mm_load1_pd((const double*)src0));
+-
+- // Load scalar into all 4 parts of the register
+- const __m128 xmm_scalar = _mm_load1_ps(&scalar);
+-
+- for(int i = 0; i < quarterPoints; ++i) {
+- xmm_points0 = _mm_load_ps((float*)points);
+- xmm_points1 = _mm_load_ps((float*)(points + 2));
+- points += 4;
+- __VOLK_PREFETCH(points);
+- // calculate distances
+- xmm_result = _mm_scaled_norm_dist_ps_sse3(xmm_symbol, xmm_symbol, xmm_points0,
+- xmm_points1, xmm_scalar);
+-
+- _mm_store_ps(target, xmm_result);
+- target += 4;
+- }
+-
+- for(int i = 0; i < leftovers0; ++i) {
+- xmm_points0 = _mm_load_ps((float*)points);
+- points += 2;
+-
+- xmm_points0 = _mm_sub_ps(xmm_symbol, xmm_points0);
+- xmm_points0 = _mm_mul_ps(xmm_points0, xmm_points0);
+- xmm_points0 = _mm_hadd_ps(xmm_points0, xmm_points0);
+- xmm_result = _mm_mul_ps(xmm_points0, xmm_scalar);
+-
+- _mm_storeh_pi((__m64*)target, xmm_result);
+- target += 2;
+- }
+-
+- calculate_scaled_distances(target, src0[0], points, scalar, leftovers1);
++ __m128 xmm_points0, xmm_points1, xmm_result;
++
++ /*
++ * First do 4 values in every loop iteration.
++ * There may be up to 3 values left.
++ * leftovers0 indicates if at least 2 more are available for SSE execution.
++ * leftovers1 indicates if there is a single element left.
++ */
++ const int quarterPoints = num_points / 4;
++ const int leftovers0 = (num_points / 2) - 2 * quarterPoints;
++ const int leftovers1 = num_points % 2;
++
++ // load complex value into both parts of the register.
++ const __m128 xmm_symbol = _mm_castpd_ps(_mm_load1_pd((const double*)src0));
++
++ // Load scalar into all 4 parts of the register
++ const __m128 xmm_scalar = _mm_load1_ps(&scalar);
++
++ for (int i = 0; i < quarterPoints; ++i) {
++ xmm_points0 = _mm_load_ps((float*)points);
++ xmm_points1 = _mm_load_ps((float*)(points + 2));
++ points += 4;
++ __VOLK_PREFETCH(points);
++ // calculate distances
++ xmm_result = _mm_scaled_norm_dist_ps_sse3(
++ xmm_symbol, xmm_symbol, xmm_points0, xmm_points1, xmm_scalar);
++
++ _mm_store_ps(target, xmm_result);
++ target += 4;
++ }
++
++ for (int i = 0; i < leftovers0; ++i) {
++ xmm_points0 = _mm_load_ps((float*)points);
++ points += 2;
++
++ xmm_points0 = _mm_sub_ps(xmm_symbol, xmm_points0);
++ xmm_points0 = _mm_mul_ps(xmm_points0, xmm_points0);
++ xmm_points0 = _mm_hadd_ps(xmm_points0, xmm_points0);
++ xmm_result = _mm_mul_ps(xmm_points0, xmm_scalar);
++
++ _mm_storeh_pi((__m64*)target, xmm_result);
++ target += 2;
++ }
++
++ calculate_scaled_distances(target, src0[0], points, scalar, leftovers1);
+ }
+
+ #endif /*LV_HAVE_SSE3*/
+
+ #ifdef LV_HAVE_SSE
+-#include <xmmintrin.h>
+ #include <volk/volk_sse_intrinsics.h>
++#include <xmmintrin.h>
+ static inline void
+-volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_sse(float* target, lv_32fc_t* src0,
+- lv_32fc_t* points, float scalar,
++volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_sse(float* target,
++ lv_32fc_t* src0,
++ lv_32fc_t* points,
++ float scalar,
+ unsigned int num_points)
+ {
+- const __m128 xmm_scalar = _mm_set1_ps(scalar);
+- const __m128 xmm_symbol = _mm_castpd_ps(_mm_load1_pd((const double*)src0));
+-
+- for (unsigned i = 0; i < num_points / 4; ++i) {
+- __m128 xmm_points0 = _mm_load_ps((float *) points);
+- __m128 xmm_points1 = _mm_load_ps((float *) (points + 2));
+- points += 4;
+- __m128 xmm_result = _mm_scaled_norm_dist_ps_sse(xmm_symbol, xmm_symbol,
+- xmm_points0, xmm_points1,
+- xmm_scalar);
+- _mm_store_ps((float *) target, xmm_result);
+- target += 4;
+- }
+-
+- calculate_scaled_distances(target, src0[0], points, scalar, num_points % 4);
++ const __m128 xmm_scalar = _mm_set1_ps(scalar);
++ const __m128 xmm_symbol = _mm_castpd_ps(_mm_load1_pd((const double*)src0));
++
++ for (unsigned i = 0; i < num_points / 4; ++i) {
++ __m128 xmm_points0 = _mm_load_ps((float*)points);
++ __m128 xmm_points1 = _mm_load_ps((float*)(points + 2));
++ points += 4;
++ __m128 xmm_result = _mm_scaled_norm_dist_ps_sse(
++ xmm_symbol, xmm_symbol, xmm_points0, xmm_points1, xmm_scalar);
++ _mm_store_ps((float*)target, xmm_result);
++ target += 4;
++ }
++
++ calculate_scaled_distances(target, src0[0], points, scalar, num_points % 4);
+ }
+ #endif // LV_HAVE_SSE
+
+ #ifdef LV_HAVE_GENERIC
+ static inline void
+-volk_32fc_x2_s32f_square_dist_scalar_mult_32f_generic(float* target, lv_32fc_t* src0,
+- lv_32fc_t* points, float scalar,
++volk_32fc_x2_s32f_square_dist_scalar_mult_32f_generic(float* target,
++ lv_32fc_t* src0,
++ lv_32fc_t* points,
++ float scalar,
+ unsigned int num_points)
+ {
+- const lv_32fc_t symbol = *src0;
+- calculate_scaled_distances(target, symbol, points, scalar, num_points);
++ const lv_32fc_t symbol = *src0;
++ calculate_scaled_distances(target, symbol, points, scalar, num_points);
+ }
+
+ #endif /*LV_HAVE_GENERIC*/
+@@ -321,87 +333,88 @@ volk_32fc_x2_s32f_square_dist_scalar_mult_32f_generic(float* target, lv_32fc_t*
+ #ifndef INCLUDED_volk_32fc_x2_s32f_square_dist_scalar_mult_32f_u_H
+ #define INCLUDED_volk_32fc_x2_s32f_square_dist_scalar_mult_32f_u_H
+
+-#include<volk/volk_complex.h>
++#include <volk/volk_complex.h>
+
+
+ #ifdef LV_HAVE_AVX2
+-#include<immintrin.h>
++#include <immintrin.h>
+ #include <volk/volk_avx2_intrinsics.h>
+
+ static inline void
+-volk_32fc_x2_s32f_square_dist_scalar_mult_32f_u_avx2(float* target, lv_32fc_t* src0,
+- lv_32fc_t* points, float scalar,
++volk_32fc_x2_s32f_square_dist_scalar_mult_32f_u_avx2(float* target,
++ lv_32fc_t* src0,
++ lv_32fc_t* points,
++ float scalar,
+ unsigned int num_points)
+ {
+- const unsigned int num_bytes = num_points*8;
+- __m128 xmm9, xmm10;
+- __m256 xmm4, xmm6;
+- __m256 xmm_points0, xmm_points1, xmm_result;
++ const unsigned int num_bytes = num_points * 8;
++ __m128 xmm9, xmm10;
++ __m256 xmm4, xmm6;
++ __m256 xmm_points0, xmm_points1, xmm_result;
++
++ const unsigned int bound = num_bytes >> 6;
++
++ // load complex value into all parts of the register.
++ const __m256 xmm_symbol = _mm256_castpd_ps(_mm256_broadcast_sd((const double*)src0));
++ const __m128 xmm128_symbol = _mm256_extractf128_ps(xmm_symbol, 1);
++
++ // Load scalar into all 8 parts of the register
++ const __m256 xmm_scalar = _mm256_broadcast_ss(&scalar);
++ const __m128 xmm128_scalar = _mm256_extractf128_ps(xmm_scalar, 1);
+
+- const unsigned int bound = num_bytes >> 6;
+-
+- // load complex value into all parts of the register.
+- const __m256 xmm_symbol = _mm256_castpd_ps(_mm256_broadcast_sd((const double*)src0));
+- const __m128 xmm128_symbol = _mm256_extractf128_ps(xmm_symbol, 1);
+-
+- // Load scalar into all 8 parts of the register
+- const __m256 xmm_scalar = _mm256_broadcast_ss(&scalar);
+- const __m128 xmm128_scalar = _mm256_extractf128_ps(xmm_scalar, 1);
++ // Set permutation constant
++ const __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
+
+- // Set permutation constant
+- const __m256i idx = _mm256_set_epi32(7,6,3,2,5,4,1,0);
+-
+- for(unsigned int i = 0; i < bound; ++i) {
+- xmm_points0 = _mm256_loadu_ps((float*)points);
+- xmm_points1 = _mm256_loadu_ps((float*)(points + 4));
+- points += 8;
+- __VOLK_PREFETCH(points);
++ for (unsigned int i = 0; i < bound; ++i) {
++ xmm_points0 = _mm256_loadu_ps((float*)points);
++ xmm_points1 = _mm256_loadu_ps((float*)(points + 4));
++ points += 8;
++ __VOLK_PREFETCH(points);
+
+- xmm_result = _mm256_scaled_norm_dist_ps_avx2(xmm_symbol, xmm_symbol,
+- xmm_points0, xmm_points1,
+- xmm_scalar);
+-
+- _mm256_storeu_ps(target, xmm_result);
+- target += 8;
+- }
++ xmm_result = _mm256_scaled_norm_dist_ps_avx2(
++ xmm_symbol, xmm_symbol, xmm_points0, xmm_points1, xmm_scalar);
+
+- if (num_bytes >> 5 & 1) {
+- xmm_points0 = _mm256_loadu_ps((float*)points);
++ _mm256_storeu_ps(target, xmm_result);
++ target += 8;
++ }
+
+- xmm4 = _mm256_sub_ps(xmm_symbol, xmm_points0);
++ if (num_bytes >> 5 & 1) {
++ xmm_points0 = _mm256_loadu_ps((float*)points);
+
+- points += 4;
++ xmm4 = _mm256_sub_ps(xmm_symbol, xmm_points0);
+
+- xmm6 = _mm256_mul_ps(xmm4, xmm4);
++ points += 4;
+
+- xmm4 = _mm256_hadd_ps(xmm6, xmm6);
+- xmm4 = _mm256_permutevar8x32_ps(xmm4, idx);
++ xmm6 = _mm256_mul_ps(xmm4, xmm4);
+
+- xmm_result = _mm256_mul_ps(xmm4, xmm_scalar);
++ xmm4 = _mm256_hadd_ps(xmm6, xmm6);
++ xmm4 = _mm256_permutevar8x32_ps(xmm4, idx);
+
+- xmm9 = _mm256_extractf128_ps(xmm_result, 1);
+- _mm_storeu_ps(target,xmm9);
+- target += 4;
+- }
++ xmm_result = _mm256_mul_ps(xmm4, xmm_scalar);
+
+- if (num_bytes >> 4 & 1) {
+- xmm9 = _mm_loadu_ps((float*)points);
++ xmm9 = _mm256_extractf128_ps(xmm_result, 1);
++ _mm_storeu_ps(target, xmm9);
++ target += 4;
++ }
+
+- xmm10 = _mm_sub_ps(xmm128_symbol, xmm9);
++ if (num_bytes >> 4 & 1) {
++ xmm9 = _mm_loadu_ps((float*)points);
+
+- points += 2;
++ xmm10 = _mm_sub_ps(xmm128_symbol, xmm9);
+
+- xmm9 = _mm_mul_ps(xmm10, xmm10);
++ points += 2;
+
+- xmm10 = _mm_hadd_ps(xmm9, xmm9);
++ xmm9 = _mm_mul_ps(xmm10, xmm10);
+
+- xmm10 = _mm_mul_ps(xmm10, xmm128_scalar);
++ xmm10 = _mm_hadd_ps(xmm9, xmm9);
+
+- _mm_storeh_pi((__m64*)target, xmm10);
+- target += 2;
+- }
++ xmm10 = _mm_mul_ps(xmm10, xmm128_scalar);
+
+- calculate_scaled_distances(target, src0[0], points, scalar, (num_bytes >> 3) & 1);
++ _mm_storeh_pi((__m64*)target, xmm10);
++ target += 2;
++ }
++
++ calculate_scaled_distances(target, src0[0], points, scalar, (num_bytes >> 3) & 1);
+ }
+
+ #endif /*LV_HAVE_AVX2*/
+@@ -412,120 +425,126 @@ volk_32fc_x2_s32f_square_dist_scalar_mult_32f_u_avx2(float* target, lv_32fc_t* s
+ #include <volk/volk_avx_intrinsics.h>
+
+ static inline void
+-volk_32fc_x2_s32f_square_dist_scalar_mult_32f_u_avx(float *target, lv_32fc_t *src0,
+- lv_32fc_t *points, float scalar,
+- unsigned int num_points) {
+- const int eightsPoints = num_points / 8;
+- const int remainder = num_points - 8 * eightsPoints;
+-
+- __m256 xmm_points0, xmm_points1, xmm_result;
+-
+- // load complex value into all parts of the register.
+- const __m256 xmm_symbol = _mm256_castpd_ps(_mm256_broadcast_sd((const double*)src0));
+-
+- // Load scalar into all 8 parts of the register
+- const __m256 xmm_scalar = _mm256_broadcast_ss(&scalar);
+-
+- for(int i = 0; i < eightsPoints; ++i){
+- xmm_points0 = _mm256_loadu_ps((float*)points);
+- xmm_points1 = _mm256_loadu_ps((float*)(points + 4));
+- points += 8;
+-
+- xmm_result = _mm256_scaled_norm_dist_ps(xmm_symbol, xmm_symbol, xmm_points0,
+- xmm_points1, xmm_scalar);
+-
+- _mm256_storeu_ps(target, xmm_result);
+- target += 8;
+- }
+-
+- const lv_32fc_t symbol = *src0;
+- calculate_scaled_distances(target, symbol, points, scalar, remainder);
++volk_32fc_x2_s32f_square_dist_scalar_mult_32f_u_avx(float* target,
++ lv_32fc_t* src0,
++ lv_32fc_t* points,
++ float scalar,
++ unsigned int num_points)
++{
++ const int eightsPoints = num_points / 8;
++ const int remainder = num_points - 8 * eightsPoints;
++
++ __m256 xmm_points0, xmm_points1, xmm_result;
++
++ // load complex value into all parts of the register.
++ const __m256 xmm_symbol = _mm256_castpd_ps(_mm256_broadcast_sd((const double*)src0));
++
++ // Load scalar into all 8 parts of the register
++ const __m256 xmm_scalar = _mm256_broadcast_ss(&scalar);
++
++ for (int i = 0; i < eightsPoints; ++i) {
++ xmm_points0 = _mm256_loadu_ps((float*)points);
++ xmm_points1 = _mm256_loadu_ps((float*)(points + 4));
++ points += 8;
++
++ xmm_result = _mm256_scaled_norm_dist_ps(
++ xmm_symbol, xmm_symbol, xmm_points0, xmm_points1, xmm_scalar);
++
++ _mm256_storeu_ps(target, xmm_result);
++ target += 8;
++ }
++
++ const lv_32fc_t symbol = *src0;
++ calculate_scaled_distances(target, symbol, points, scalar, remainder);
+ }
+
+ #endif /* LV_HAVE_AVX */
+
+
+ #ifdef LV_HAVE_SSE3
+-#include<pmmintrin.h>
+-#include<volk/volk_sse3_intrinsics.h>
++#include <pmmintrin.h>
++#include <volk/volk_sse3_intrinsics.h>
+
+ static inline void
+-volk_32fc_x2_s32f_square_dist_scalar_mult_32f_u_sse3(float* target, lv_32fc_t* src0,
+- lv_32fc_t* points, float scalar,
++volk_32fc_x2_s32f_square_dist_scalar_mult_32f_u_sse3(float* target,
++ lv_32fc_t* src0,
++ lv_32fc_t* points,
++ float scalar,
+ unsigned int num_points)
+ {
+- __m128 xmm_points0, xmm_points1, xmm_result;
+-
+- /*
+- * First do 4 values in every loop iteration.
+- * There may be up to 3 values left.
+- * leftovers0 indicates if at least 2 more are available for SSE execution.
+- * leftovers1 indicates if there is a single element left.
+- */
+- const int quarterPoints = num_points / 4;
+- const int leftovers0 = (num_points / 2) - 2 * quarterPoints;
+- const int leftovers1 = num_points % 2;
+-
+- // load complex value into both parts of the register.
+- const __m128 xmm_symbol = _mm_castpd_ps(_mm_load1_pd((const double*)src0));
+-
+- // Load scalar into all 4 parts of the register
+- const __m128 xmm_scalar = _mm_load1_ps(&scalar);
+-
+- for(int i = 0; i < quarterPoints; ++i) {
+- xmm_points0 = _mm_loadu_ps((float*)points);
+- xmm_points1 = _mm_loadu_ps((float*)(points + 2));
+- points += 4;
+- __VOLK_PREFETCH(points);
+- // calculate distances
+- xmm_result = _mm_scaled_norm_dist_ps_sse3(xmm_symbol, xmm_symbol, xmm_points0,
+- xmm_points1, xmm_scalar);
+-
+- _mm_storeu_ps(target, xmm_result);
+- target += 4;
+- }
+-
+- for(int i = 0; i < leftovers0; ++i) {
+- xmm_points0 = _mm_loadu_ps((float*)points);
+- points += 2;
+-
+- xmm_points0 = _mm_sub_ps(xmm_symbol, xmm_points0);
+- xmm_points0 = _mm_mul_ps(xmm_points0, xmm_points0);
+- xmm_points0 = _mm_hadd_ps(xmm_points0, xmm_points0);
+- xmm_result = _mm_mul_ps(xmm_points0, xmm_scalar);
+-
+- _mm_storeh_pi((__m64*)target, xmm_result);
+- target += 2;
+- }
+-
+- calculate_scaled_distances(target, src0[0], points, scalar, leftovers1);
++ __m128 xmm_points0, xmm_points1, xmm_result;
++
++ /*
++ * First do 4 values in every loop iteration.
++ * There may be up to 3 values left.
++ * leftovers0 indicates if at least 2 more are available for SSE execution.
++ * leftovers1 indicates if there is a single element left.
++ */
++ const int quarterPoints = num_points / 4;
++ const int leftovers0 = (num_points / 2) - 2 * quarterPoints;
++ const int leftovers1 = num_points % 2;
++
++ // load complex value into both parts of the register.
++ const __m128 xmm_symbol = _mm_castpd_ps(_mm_load1_pd((const double*)src0));
++
++ // Load scalar into all 4 parts of the register
++ const __m128 xmm_scalar = _mm_load1_ps(&scalar);
++
++ for (int i = 0; i < quarterPoints; ++i) {
++ xmm_points0 = _mm_loadu_ps((float*)points);
++ xmm_points1 = _mm_loadu_ps((float*)(points + 2));
++ points += 4;
++ __VOLK_PREFETCH(points);
++ // calculate distances
++ xmm_result = _mm_scaled_norm_dist_ps_sse3(
++ xmm_symbol, xmm_symbol, xmm_points0, xmm_points1, xmm_scalar);
++
++ _mm_storeu_ps(target, xmm_result);
++ target += 4;
++ }
++
++ for (int i = 0; i < leftovers0; ++i) {
++ xmm_points0 = _mm_loadu_ps((float*)points);
++ points += 2;
++
++ xmm_points0 = _mm_sub_ps(xmm_symbol, xmm_points0);
++ xmm_points0 = _mm_mul_ps(xmm_points0, xmm_points0);
++ xmm_points0 = _mm_hadd_ps(xmm_points0, xmm_points0);
++ xmm_result = _mm_mul_ps(xmm_points0, xmm_scalar);
++
++ _mm_storeh_pi((__m64*)target, xmm_result);
++ target += 2;
++ }
++
++ calculate_scaled_distances(target, src0[0], points, scalar, leftovers1);
+ }
+
+ #endif /*LV_HAVE_SSE3*/
+
+ #ifdef LV_HAVE_SSE
+-#include <xmmintrin.h>
+ #include <volk/volk_sse_intrinsics.h>
++#include <xmmintrin.h>
+ static inline void
+-volk_32fc_x2_s32f_square_dist_scalar_mult_32f_u_sse(float* target, lv_32fc_t* src0,
+- lv_32fc_t* points, float scalar,
++volk_32fc_x2_s32f_square_dist_scalar_mult_32f_u_sse(float* target,
++ lv_32fc_t* src0,
++ lv_32fc_t* points,
++ float scalar,
+ unsigned int num_points)
+ {
+- const __m128 xmm_scalar = _mm_set1_ps(scalar);
+- const __m128 xmm_symbol = _mm_castpd_ps(_mm_load1_pd((const double*)src0));
+-
+- for (unsigned i = 0; i < num_points / 4; ++i) {
+- __m128 xmm_points0 = _mm_loadu_ps((float *) points);
+- __m128 xmm_points1 = _mm_loadu_ps((float *) (points + 2));
+- points += 4;
+- __m128 xmm_result = _mm_scaled_norm_dist_ps_sse(xmm_symbol, xmm_symbol,
+- xmm_points0, xmm_points1,
+- xmm_scalar);
+- _mm_storeu_ps((float *) target, xmm_result);
+- target += 4;
+- }
+-
+- calculate_scaled_distances(target, src0[0], points, scalar, num_points % 4);
++ const __m128 xmm_scalar = _mm_set1_ps(scalar);
++ const __m128 xmm_symbol = _mm_castpd_ps(_mm_load1_pd((const double*)src0));
++
++ for (unsigned i = 0; i < num_points / 4; ++i) {
++ __m128 xmm_points0 = _mm_loadu_ps((float*)points);
++ __m128 xmm_points1 = _mm_loadu_ps((float*)(points + 2));
++ points += 4;
++ __m128 xmm_result = _mm_scaled_norm_dist_ps_sse(
++ xmm_symbol, xmm_symbol, xmm_points0, xmm_points1, xmm_scalar);
++ _mm_storeu_ps((float*)target, xmm_result);
++ target += 4;
++ }
++
++ calculate_scaled_distances(target, src0[0], points, scalar, num_points % 4);
+ }
+ #endif // LV_HAVE_SSE
+
+diff --git a/kernels/volk/volk_32fc_x2_s32fc_multiply_conjugate_add_32fc.h b/kernels/volk/volk_32fc_x2_s32fc_multiply_conjugate_add_32fc.h
+index 6c7f4d3..1fb9b68 100644
+--- a/kernels/volk/volk_32fc_x2_s32fc_multiply_conjugate_add_32fc.h
++++ b/kernels/volk/volk_32fc_x2_s32fc_multiply_conjugate_add_32fc.h
+@@ -32,14 +32,16 @@
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+- * void volk_32fc_x2_s32fc_multiply_conjugate_add_32fc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, const lv_32fc_t scalar, unsigned int num_points);
+- * \endcode
++ * void volk_32fc_x2_s32fc_multiply_conjugate_add_32fc(lv_32fc_t* cVector, const
++ * lv_32fc_t* aVector, const lv_32fc_t* bVector, const lv_32fc_t scalar, unsigned int
++ * num_points); \endcode
+ *
+ * \b Inputs
+ * \li aVector: The input vector to be added.
+ * \li bVector: The input vector to be conjugate and multiplied.
+ * \li scalar: The complex scalar to multiply against conjugated bVector.
+- * \li num_points: The number of complex values in aVector and bVector to be conjugate, multiplied and stored into cVector.
++ * \li num_points: The number of complex values in aVector and bVector to be conjugate,
++ * multiplied and stored into cVector.
+ *
+ * \b Outputs
+ * \li cVector: The vector where the results will be stored.
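As the scalar tail loops further down make explicit, the per-element operation behind this prototype is cVector[i] = aVector[i] + lv_conj(bVector[i]) * scalar. A hedged usage sketch, assuming the usual <volk/volk.h> dispatcher header and the lv_cmake helper from volk_complex.h; the caller name and the scalar value are illustrative only:

    #include <volk/volk.h>

    /* Illustrative caller, not part of the patch. */
    void example_conjugate_add(lv_32fc_t* c, const lv_32fc_t* a,
                               const lv_32fc_t* b, unsigned int num_points)
    {
        const lv_32fc_t scalar = lv_cmake(0.5f, -1.0f); /* any complex constant */
        volk_32fc_x2_s32fc_multiply_conjugate_add_32fc(c, a, b, scalar, num_points);
        /* afterwards c[i] == a[i] + lv_conj(b[i]) * scalar for every i */
    }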
+@@ -84,15 +86,21 @@
+ #ifndef INCLUDED_volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_H
+ #define INCLUDED_volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_H
+
++#include <float.h>
+ #include <inttypes.h>
+ #include <stdio.h>
+ #include <volk/volk_complex.h>
+-#include <float.h>
+
+
+ #ifdef LV_HAVE_GENERIC
+
+-static inline void volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, const lv_32fc_t scalar, unsigned int num_points){
++static inline void
++volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_generic(lv_32fc_t* cVector,
++ const lv_32fc_t* aVector,
++ const lv_32fc_t* bVector,
++ const lv_32fc_t scalar,
++ unsigned int num_points)
++{
+ const lv_32fc_t* aPtr = aVector;
+ const lv_32fc_t* bPtr = bVector;
+ lv_32fc_t* cPtr = cVector;
+@@ -123,14 +131,20 @@ static inline void volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_generic(lv_32f
+ #include <immintrin.h>
+ #include <volk/volk_avx_intrinsics.h>
+
+-static inline void volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_u_avx(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, const lv_32fc_t scalar, unsigned int num_points) {
++static inline void
++volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_u_avx(lv_32fc_t* cVector,
++ const lv_32fc_t* aVector,
++ const lv_32fc_t* bVector,
++ const lv_32fc_t scalar,
++ unsigned int num_points)
++{
+ unsigned int number = 0;
+ unsigned int i = 0;
+ const unsigned int quarterPoints = num_points / 4;
+ unsigned int isodd = num_points & 3;
+
+ __m256 x, y, s, z;
+- lv_32fc_t v_scalar[4] = {scalar, scalar, scalar, scalar};
++ lv_32fc_t v_scalar[4] = { scalar, scalar, scalar, scalar };
+
+ const lv_32fc_t* a = aVector;
+ const lv_32fc_t* b = bVector;
+@@ -139,19 +153,19 @@ static inline void volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_u_avx(lv_32fc_
+ // Set up constant scalar vector
+ s = _mm256_loadu_ps((float*)v_scalar);
+
+- for(;number < quarterPoints; number++) {
++ for (; number < quarterPoints; number++) {
+ x = _mm256_loadu_ps((float*)b);
+ y = _mm256_loadu_ps((float*)a);
+ z = _mm256_complexconjugatemul_ps(s, x);
+ z = _mm256_add_ps(y, z);
+- _mm256_storeu_ps((float*)c,z);
++ _mm256_storeu_ps((float*)c, z);
+
+ a += 4;
+ b += 4;
+ c += 4;
+ }
+
+- for(i = num_points-isodd; i < num_points; i++) {
++ for (i = num_points - isodd; i < num_points; i++) {
+ *c++ = (*a++) + lv_conj(*b++) * scalar;
+ }
+ }
+@@ -162,12 +176,18 @@ static inline void volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_u_avx(lv_32fc_
+ #include <pmmintrin.h>
+ #include <volk/volk_sse3_intrinsics.h>
+
+-static inline void volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_u_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, const lv_32fc_t scalar, unsigned int num_points) {
++static inline void
++volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_u_sse3(lv_32fc_t* cVector,
++ const lv_32fc_t* aVector,
++ const lv_32fc_t* bVector,
++ const lv_32fc_t scalar,
++ unsigned int num_points)
++{
+ unsigned int number = 0;
+ const unsigned int halfPoints = num_points / 2;
+
+ __m128 x, y, s, z;
+- lv_32fc_t v_scalar[2] = {scalar, scalar};
++ lv_32fc_t v_scalar[2] = { scalar, scalar };
+
+ const lv_32fc_t* a = aVector;
+ const lv_32fc_t* b = bVector;
+@@ -176,19 +196,19 @@ static inline void volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_u_sse3(lv_32fc
+ // Set up constant scalar vector
+ s = _mm_loadu_ps((float*)v_scalar);
+
+- for(;number < halfPoints; number++){
++ for (; number < halfPoints; number++) {
+ x = _mm_loadu_ps((float*)b);
+ y = _mm_loadu_ps((float*)a);
+ z = _mm_complexconjugatemul_ps(s, x);
+ z = _mm_add_ps(y, z);
+- _mm_storeu_ps((float*)c,z);
++ _mm_storeu_ps((float*)c, z);
+
+ a += 2;
+ b += 2;
+ c += 2;
+ }
+
+- if((num_points % 2) != 0) {
++ if ((num_points % 2) != 0) {
+ *c = *a + lv_conj(*b) * scalar;
+ }
+ }
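A note on the halfPoints bound used above: a 128-bit SSE register holds exactly two lv_32fc_t values (two float pairs), so the vector loop covers num_points / 2 iterations and the single possible odd element falls through to the scalar branch. A one-line sanity check of that assumption (C11, hypothetical standalone snippet):

    #include <xmmintrin.h>
    #include <volk/volk_complex.h>
    _Static_assert(sizeof(__m128) == 2 * sizeof(lv_32fc_t), "two lv_32fc_t per __m128");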
+@@ -199,14 +219,20 @@ static inline void volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_u_sse3(lv_32fc
+ #include <immintrin.h>
+ #include <volk/volk_avx_intrinsics.h>
+
+-static inline void volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_a_avx(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, const lv_32fc_t scalar, unsigned int num_points) {
++static inline void
++volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_a_avx(lv_32fc_t* cVector,
++ const lv_32fc_t* aVector,
++ const lv_32fc_t* bVector,
++ const lv_32fc_t scalar,
++ unsigned int num_points)
++{
+ unsigned int number = 0;
+ unsigned int i = 0;
+ const unsigned int quarterPoints = num_points / 4;
+ unsigned int isodd = num_points & 3;
+
+ __m256 x, y, s, z;
+- lv_32fc_t v_scalar[4] = {scalar, scalar, scalar, scalar};
++ lv_32fc_t v_scalar[4] = { scalar, scalar, scalar, scalar };
+
+ const lv_32fc_t* a = aVector;
+ const lv_32fc_t* b = bVector;
+@@ -215,19 +241,19 @@ static inline void volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_a_avx(lv_32fc_
+ // Set up constant scalar vector
+ s = _mm256_load_ps((float*)v_scalar);
+
+- for(;number < quarterPoints; number++) {
++ for (; number < quarterPoints; number++) {
+ x = _mm256_load_ps((float*)b);
+ y = _mm256_load_ps((float*)a);
+ z = _mm256_complexconjugatemul_ps(s, x);
+ z = _mm256_add_ps(y, z);
+- _mm256_store_ps((float*)c,z);
++ _mm256_store_ps((float*)c, z);
+
+ a += 4;
+ b += 4;
+ c += 4;
+ }
+
+- for(i = num_points-isodd; i < num_points; i++) {
++ for (i = num_points - isodd; i < num_points; i++) {
+ *c++ = (*a++) + lv_conj(*b++) * scalar;
+ }
+ }
+@@ -238,12 +264,18 @@ static inline void volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_a_avx(lv_32fc_
+ #include <pmmintrin.h>
+ #include <volk/volk_sse3_intrinsics.h>
+
+-static inline void volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_a_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, const lv_32fc_t scalar, unsigned int num_points) {
++static inline void
++volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_a_sse3(lv_32fc_t* cVector,
++ const lv_32fc_t* aVector,
++ const lv_32fc_t* bVector,
++ const lv_32fc_t scalar,
++ unsigned int num_points)
++{
+ unsigned int number = 0;
+ const unsigned int halfPoints = num_points / 2;
+
+ __m128 x, y, s, z;
+- lv_32fc_t v_scalar[2] = {scalar, scalar};
++ lv_32fc_t v_scalar[2] = { scalar, scalar };
+
+ const lv_32fc_t* a = aVector;
+ const lv_32fc_t* b = bVector;
+@@ -252,19 +284,19 @@ static inline void volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_a_sse3(lv_32fc
+ // Set up constant scalar vector
+ s = _mm_load_ps((float*)v_scalar);
+
+- for(;number < halfPoints; number++){
++ for (; number < halfPoints; number++) {
+ x = _mm_load_ps((float*)b);
+ y = _mm_load_ps((float*)a);
+ z = _mm_complexconjugatemul_ps(s, x);
+ z = _mm_add_ps(y, z);
+- _mm_store_ps((float*)c,z);
++ _mm_store_ps((float*)c, z);
+
+ a += 2;
+ b += 2;
+ c += 2;
+ }
+
+- if((num_points % 2) != 0) {
++ if ((num_points % 2) != 0) {
+ *c = *a + lv_conj(*b) * scalar;
+ }
+ }
+@@ -272,9 +304,15 @@ static inline void volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_a_sse3(lv_32fc
+
+
+ #ifdef LV_HAVE_NEON
+-#include <arm_neon.h>
+-
+-static inline void volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_neon(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, const lv_32fc_t scalar, unsigned int num_points){
++#include <arm_neon.h>
++
++static inline void
++volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_neon(lv_32fc_t* cVector,
++ const lv_32fc_t* aVector,
++ const lv_32fc_t* bVector,
++ const lv_32fc_t scalar,
++ unsigned int num_points)
++{
+ const lv_32fc_t* bPtr = bVector;
+ const lv_32fc_t* aPtr = aVector;
+ lv_32fc_t* cPtr = cVector;
+@@ -287,7 +325,7 @@ static inline void volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_neon(lv_32fc_t
+ scalar_val.val[0] = vld1q_dup_f32((const float*)&scalar);
+ scalar_val.val[1] = vld1q_dup_f32(((const float*)&scalar) + 1);
+
+- for(number = 0; number < quarter_points; ++number) {
++ for (number = 0; number < quarter_points; ++number) {
+ a_val = vld2q_f32((float*)aPtr);
+ b_val = vld2q_f32((float*)bPtr);
+ b_val.val[1] = vnegq_f32(b_val.val[1]);
+@@ -310,7 +348,7 @@ static inline void volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_neon(lv_32fc_t
+ cPtr += 4;
+ }
+
+- for(number = quarter_points*4; number < num_points; number++){
++ for (number = quarter_points * 4; number < num_points; number++) {
+ *cPtr++ = (*aPtr++) + lv_conj(*bPtr++) * scalar;
+ }
+ }
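The NEON variant keeps the data deinterleaved (vld2q_f32 yields separate real and imaginary planes), so the conjugate multiply-add is carried out component-wise per lane. For reference, the identity those lane operations implement, written as a scalar sketch built only on the volk_complex.h helpers (the helper name is made up):

    /* c = a + scalar * conj(b), expanded into real and imaginary parts. */
    static inline lv_32fc_t conj_mul_add_sketch(lv_32fc_t a, lv_32fc_t b, lv_32fc_t s)
    {
        const float re = lv_creal(a) + lv_creal(s) * lv_creal(b) + lv_cimag(s) * lv_cimag(b);
        const float im = lv_cimag(a) + lv_cimag(s) * lv_creal(b) - lv_creal(s) * lv_cimag(b);
        return lv_cmake(re, im);
    }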
+diff --git a/kernels/volk/volk_32fc_x2_square_dist_32f.h b/kernels/volk/volk_32fc_x2_square_dist_32f.h
+index d6c6dff..75f4072 100644
+--- a/kernels/volk/volk_32fc_x2_square_dist_32f.h
++++ b/kernels/volk/volk_32fc_x2_square_dist_32f.h
+@@ -30,8 +30,8 @@
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+- * void volk_32fc_x2_square_dist_32f(float* target, lv_32fc_t* src0, lv_32fc_t* points, unsigned int num_points) {
+- * \endcode
++ * void volk_32fc_x2_square_dist_32f(float* target, lv_32fc_t* src0, lv_32fc_t* points,
++ * unsigned int num_points) { \endcode
+ *
+ * \b Inputs
+ * \li src0: The complex input. Only the first point is used.
+@@ -78,183 +78,185 @@
+ #ifndef INCLUDED_volk_32fc_x2_square_dist_32f_a_H
+ #define INCLUDED_volk_32fc_x2_square_dist_32f_a_H
+
+-#include<inttypes.h>
+-#include<stdio.h>
+-#include<volk/volk_complex.h>
++#include <inttypes.h>
++#include <stdio.h>
++#include <volk/volk_complex.h>
+
+ #ifdef LV_HAVE_AVX2
+-#include<immintrin.h>
++#include <immintrin.h>
+
+-static inline void
+-volk_32fc_x2_square_dist_32f_a_avx2(float* target, lv_32fc_t* src0, lv_32fc_t* points,
+- unsigned int num_points)
++static inline void volk_32fc_x2_square_dist_32f_a_avx2(float* target,
++ lv_32fc_t* src0,
++ lv_32fc_t* points,
++ unsigned int num_points)
+ {
+- const unsigned int num_bytes = num_points*8;
+- __m128 xmm0, xmm9, xmm10;
+- __m256 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
+-
+- lv_32fc_t diff;
+- float sq_dist;
+- int bound = num_bytes >> 6;
+- int leftovers0 = (num_bytes >> 5) & 1;
+- int leftovers1 = (num_bytes >> 4) & 1;
+- int leftovers2 = (num_bytes >> 3) & 1;
+- int i = 0;
+-
+- __m256i idx = _mm256_set_epi32(7,6,3,2,5,4,1,0);
+- xmm1 = _mm256_setzero_ps();
+- xmm2 = _mm256_load_ps((float*)&points[0]);
+- xmm0 = _mm_load_ps((float*)src0);
+- xmm0 = _mm_permute_ps(xmm0, 0b01000100);
+- xmm1 = _mm256_insertf128_ps(xmm1, xmm0, 0);
+- xmm1 = _mm256_insertf128_ps(xmm1, xmm0, 1);
+- xmm3 = _mm256_load_ps((float*)&points[4]);
+-
+- for(; i < bound; ++i) {
+- xmm4 = _mm256_sub_ps(xmm1, xmm2);
+- xmm5 = _mm256_sub_ps(xmm1, xmm3);
+- points += 8;
+- xmm6 = _mm256_mul_ps(xmm4, xmm4);
+- xmm7 = _mm256_mul_ps(xmm5, xmm5);
+-
++ const unsigned int num_bytes = num_points * 8;
++ __m128 xmm0, xmm9, xmm10;
++ __m256 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
++
++ lv_32fc_t diff;
++ float sq_dist;
++ int bound = num_bytes >> 6;
++ int leftovers0 = (num_bytes >> 5) & 1;
++ int leftovers1 = (num_bytes >> 4) & 1;
++ int leftovers2 = (num_bytes >> 3) & 1;
++ int i = 0;
++
++ __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
++ xmm1 = _mm256_setzero_ps();
+ xmm2 = _mm256_load_ps((float*)&points[0]);
++ xmm0 = _mm_load_ps((float*)src0);
++ xmm0 = _mm_permute_ps(xmm0, 0b01000100);
++ xmm1 = _mm256_insertf128_ps(xmm1, xmm0, 0);
++ xmm1 = _mm256_insertf128_ps(xmm1, xmm0, 1);
++ xmm3 = _mm256_load_ps((float*)&points[4]);
+
+- xmm4 = _mm256_hadd_ps(xmm6, xmm7);
+- xmm4 = _mm256_permutevar8x32_ps(xmm4, idx);
++ for (; i < bound; ++i) {
++ xmm4 = _mm256_sub_ps(xmm1, xmm2);
++ xmm5 = _mm256_sub_ps(xmm1, xmm3);
++ points += 8;
++ xmm6 = _mm256_mul_ps(xmm4, xmm4);
++ xmm7 = _mm256_mul_ps(xmm5, xmm5);
+
+- xmm3 = _mm256_load_ps((float*)&points[4]);
++ xmm2 = _mm256_load_ps((float*)&points[0]);
+
+- _mm256_store_ps(target, xmm4);
++ xmm4 = _mm256_hadd_ps(xmm6, xmm7);
++ xmm4 = _mm256_permutevar8x32_ps(xmm4, idx);
+
+- target += 8;
+- }
++ xmm3 = _mm256_load_ps((float*)&points[4]);
+
+- for(i = 0; i < leftovers0; ++i) {
++ _mm256_store_ps(target, xmm4);
+
+- xmm2 = _mm256_load_ps((float*)&points[0]);
++ target += 8;
++ }
+
+- xmm4 = _mm256_sub_ps(xmm1, xmm2);
++ for (i = 0; i < leftovers0; ++i) {
+
+- points += 4;
++ xmm2 = _mm256_load_ps((float*)&points[0]);
+
+- xmm6 = _mm256_mul_ps(xmm4, xmm4);
++ xmm4 = _mm256_sub_ps(xmm1, xmm2);
+
+- xmm4 = _mm256_hadd_ps(xmm6, xmm6);
+- xmm4 = _mm256_permutevar8x32_ps(xmm4, idx);
++ points += 4;
+
+- xmm9 = _mm256_extractf128_ps(xmm4, 1);
+- _mm_store_ps(target,xmm9);
++ xmm6 = _mm256_mul_ps(xmm4, xmm4);
+
+- target += 4;
+- }
++ xmm4 = _mm256_hadd_ps(xmm6, xmm6);
++ xmm4 = _mm256_permutevar8x32_ps(xmm4, idx);
++
++ xmm9 = _mm256_extractf128_ps(xmm4, 1);
++ _mm_store_ps(target, xmm9);
+
+- for(i = 0; i < leftovers1; ++i) {
+- xmm9 = _mm_load_ps((float*)&points[0]);
++ target += 4;
++ }
+
+- xmm10 = _mm_sub_ps(xmm0, xmm9);
++ for (i = 0; i < leftovers1; ++i) {
++ xmm9 = _mm_load_ps((float*)&points[0]);
+
+- points += 2;
++ xmm10 = _mm_sub_ps(xmm0, xmm9);
+
+- xmm9 = _mm_mul_ps(xmm10, xmm10);
++ points += 2;
+
+- xmm10 = _mm_hadd_ps(xmm9, xmm9);
++ xmm9 = _mm_mul_ps(xmm10, xmm10);
+
+- _mm_storeh_pi((__m64*)target, xmm10);
++ xmm10 = _mm_hadd_ps(xmm9, xmm9);
+
+- target += 2;
+- }
++ _mm_storeh_pi((__m64*)target, xmm10);
+
+- for(i = 0; i < leftovers2; ++i) {
++ target += 2;
++ }
+
+- diff = src0[0] - points[0];
++ for (i = 0; i < leftovers2; ++i) {
+
+- sq_dist = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff);
++ diff = src0[0] - points[0];
+
+- target[0] = sq_dist;
+- }
++ sq_dist = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff);
++
++ target[0] = sq_dist;
++ }
+ }
+
+ #endif /*LV_HAVE_AVX2*/
+
+ #ifdef LV_HAVE_SSE3
+-#include<xmmintrin.h>
+-#include<pmmintrin.h>
++#include <pmmintrin.h>
++#include <xmmintrin.h>
+
+-static inline void
+-volk_32fc_x2_square_dist_32f_a_sse3(float* target, lv_32fc_t* src0, lv_32fc_t* points,
+- unsigned int num_points)
++static inline void volk_32fc_x2_square_dist_32f_a_sse3(float* target,
++ lv_32fc_t* src0,
++ lv_32fc_t* points,
++ unsigned int num_points)
+ {
+- const unsigned int num_bytes = num_points*8;
++ const unsigned int num_bytes = num_points * 8;
+
+- __m128 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
++ __m128 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
+
+- lv_32fc_t diff;
+- float sq_dist;
+- int bound = num_bytes >> 5;
+- int i = 0;
++ lv_32fc_t diff;
++ float sq_dist;
++ int bound = num_bytes >> 5;
++ int i = 0;
+
+- xmm1 = _mm_setzero_ps();
+- xmm1 = _mm_loadl_pi(xmm1, (__m64*)src0);
+- xmm2 = _mm_load_ps((float*)&points[0]);
+- xmm1 = _mm_movelh_ps(xmm1, xmm1);
+- xmm3 = _mm_load_ps((float*)&points[2]);
++ xmm1 = _mm_setzero_ps();
++ xmm1 = _mm_loadl_pi(xmm1, (__m64*)src0);
++ xmm2 = _mm_load_ps((float*)&points[0]);
++ xmm1 = _mm_movelh_ps(xmm1, xmm1);
++ xmm3 = _mm_load_ps((float*)&points[2]);
++
++ for (; i < bound - 1; ++i) {
++ xmm4 = _mm_sub_ps(xmm1, xmm2);
++ xmm5 = _mm_sub_ps(xmm1, xmm3);
++ points += 4;
++ xmm6 = _mm_mul_ps(xmm4, xmm4);
++ xmm7 = _mm_mul_ps(xmm5, xmm5);
++
++ xmm2 = _mm_load_ps((float*)&points[0]);
++
++ xmm4 = _mm_hadd_ps(xmm6, xmm7);
++
++ xmm3 = _mm_load_ps((float*)&points[2]);
++
++ _mm_store_ps(target, xmm4);
++
++ target += 4;
++ }
+
+- for(; i < bound - 1; ++i) {
+ xmm4 = _mm_sub_ps(xmm1, xmm2);
+ xmm5 = _mm_sub_ps(xmm1, xmm3);
++
+ points += 4;
+ xmm6 = _mm_mul_ps(xmm4, xmm4);
+ xmm7 = _mm_mul_ps(xmm5, xmm5);
+
+- xmm2 = _mm_load_ps((float*)&points[0]);
+-
+ xmm4 = _mm_hadd_ps(xmm6, xmm7);
+
+- xmm3 = _mm_load_ps((float*)&points[2]);
+-
+ _mm_store_ps(target, xmm4);
+
+ target += 4;
+- }
+-
+- xmm4 = _mm_sub_ps(xmm1, xmm2);
+- xmm5 = _mm_sub_ps(xmm1, xmm3);
+-
+- points += 4;
+- xmm6 = _mm_mul_ps(xmm4, xmm4);
+- xmm7 = _mm_mul_ps(xmm5, xmm5);
+
+- xmm4 = _mm_hadd_ps(xmm6, xmm7);
++ if (num_bytes >> 4 & 1) {
+
+- _mm_store_ps(target, xmm4);
++ xmm2 = _mm_load_ps((float*)&points[0]);
+
+- target += 4;
++ xmm4 = _mm_sub_ps(xmm1, xmm2);
+
+- if (num_bytes >> 4 & 1) {
++ points += 2;
+
+- xmm2 = _mm_load_ps((float*)&points[0]);
+-
+- xmm4 = _mm_sub_ps(xmm1, xmm2);
++ xmm6 = _mm_mul_ps(xmm4, xmm4);
+
+- points += 2;
+-
+- xmm6 = _mm_mul_ps(xmm4, xmm4);
++ xmm4 = _mm_hadd_ps(xmm6, xmm6);
+
+- xmm4 = _mm_hadd_ps(xmm6, xmm6);
++ _mm_storeh_pi((__m64*)target, xmm4);
+
+- _mm_storeh_pi((__m64*)target, xmm4);
++ target += 2;
++ }
+
+- target += 2;
+- }
++ if (num_bytes >> 3 & 1) {
+
+- if (num_bytes >> 3 & 1) {
++ diff = src0[0] - points[0];
+
+- diff = src0[0] - points[0];
++ sq_dist = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff);
+
+- sq_dist = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff);
+-
+- target[0] = sq_dist;
+- }
++ target[0] = sq_dist;
++ }
+ }
+
+ #endif /*LV_HAVE_SSE3*/
+@@ -262,55 +264,58 @@ volk_32fc_x2_square_dist_32f_a_sse3(float* target, lv_32fc_t* src0, lv_32fc_t* p
+
+ #ifdef LV_HAVE_NEON
+ #include <arm_neon.h>
+-static inline void
+-volk_32fc_x2_square_dist_32f_neon(float* target, lv_32fc_t* src0, lv_32fc_t* points, unsigned int num_points)
++static inline void volk_32fc_x2_square_dist_32f_neon(float* target,
++ lv_32fc_t* src0,
++ lv_32fc_t* points,
++ unsigned int num_points)
+ {
+- const unsigned int quarter_points = num_points / 4;
+- unsigned int number;
+-
+- float32x4x2_t a_vec, b_vec;
+- float32x4x2_t diff_vec;
+- float32x4_t tmp, tmp1, dist_sq;
+- a_vec.val[0] = vdupq_n_f32( lv_creal(src0[0]) );
+- a_vec.val[1] = vdupq_n_f32( lv_cimag(src0[0]) );
+- for(number=0; number < quarter_points; ++number) {
+- b_vec = vld2q_f32((float*)points);
+- diff_vec.val[0] = vsubq_f32(a_vec.val[0], b_vec.val[0]);
+- diff_vec.val[1] = vsubq_f32(a_vec.val[1], b_vec.val[1]);
+- tmp = vmulq_f32(diff_vec.val[0], diff_vec.val[0]);
+- tmp1 = vmulq_f32(diff_vec.val[1], diff_vec.val[1]);
+-
+- dist_sq = vaddq_f32(tmp, tmp1);
+- vst1q_f32(target, dist_sq);
+- points += 4;
+- target += 4;
+- }
+- for(number=quarter_points*4; number < num_points; ++number) {
+- lv_32fc_t diff = src0[0] - *points++;
+- *target++ = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff);
+- }
++ const unsigned int quarter_points = num_points / 4;
++ unsigned int number;
++
++ float32x4x2_t a_vec, b_vec;
++ float32x4x2_t diff_vec;
++ float32x4_t tmp, tmp1, dist_sq;
++ a_vec.val[0] = vdupq_n_f32(lv_creal(src0[0]));
++ a_vec.val[1] = vdupq_n_f32(lv_cimag(src0[0]));
++ for (number = 0; number < quarter_points; ++number) {
++ b_vec = vld2q_f32((float*)points);
++ diff_vec.val[0] = vsubq_f32(a_vec.val[0], b_vec.val[0]);
++ diff_vec.val[1] = vsubq_f32(a_vec.val[1], b_vec.val[1]);
++ tmp = vmulq_f32(diff_vec.val[0], diff_vec.val[0]);
++ tmp1 = vmulq_f32(diff_vec.val[1], diff_vec.val[1]);
++
++ dist_sq = vaddq_f32(tmp, tmp1);
++ vst1q_f32(target, dist_sq);
++ points += 4;
++ target += 4;
++ }
++ for (number = quarter_points * 4; number < num_points; ++number) {
++ lv_32fc_t diff = src0[0] - *points++;
++ *target++ = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff);
++ }
+ }
+ #endif /* LV_HAVE_NEON */
+
+
+ #ifdef LV_HAVE_GENERIC
+-static inline void
+-volk_32fc_x2_square_dist_32f_generic(float* target, lv_32fc_t* src0, lv_32fc_t* points,
+- unsigned int num_points)
++static inline void volk_32fc_x2_square_dist_32f_generic(float* target,
++ lv_32fc_t* src0,
++ lv_32fc_t* points,
++ unsigned int num_points)
+ {
+- const unsigned int num_bytes = num_points*8;
++ const unsigned int num_bytes = num_points * 8;
+
+- lv_32fc_t diff;
+- float sq_dist;
+- unsigned int i = 0;
++ lv_32fc_t diff;
++ float sq_dist;
++ unsigned int i = 0;
+
+- for(; i < num_bytes >> 3; ++i) {
+- diff = src0[0] - points[i];
++ for (; i<num_bytes>> 3; ++i) {
++ diff = src0[0] - points[i];
+
+- sq_dist = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff);
++ sq_dist = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff);
+
+- target[i] = sq_dist;
+- }
++ target[i] = sq_dist;
++ }
+ }
+
+ #endif /*LV_HAVE_GENERIC*/
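One cosmetic casualty of the reformat in the generic kernel above: the loop condition now reads i<num_bytes>> 3, which looks like template syntax but still tokenizes as i < (num_bytes >> 3), since the shift binds tighter than the comparison; the loop therefore still visits exactly num_points elements (num_bytes is num_points * 8). For clarity, an explicitly parenthesized, behaviour-identical form of that loop:

    for (; i < (num_bytes >> 3); ++i) {
        diff = src0[0] - points[i];
        target[i] = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff);
    }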
+@@ -321,80 +326,85 @@ volk_32fc_x2_square_dist_32f_generic(float* target, lv_32fc_t* src0, lv_32fc_t*
+ #ifndef INCLUDED_volk_32fc_x2_square_dist_32f_u_H
+ #define INCLUDED_volk_32fc_x2_square_dist_32f_u_H
+
+-#include<inttypes.h>
+-#include<stdio.h>
+-#include<volk/volk_complex.h>
++#include <inttypes.h>
++#include <stdio.h>
++#include <volk/volk_complex.h>
+
+ #ifdef LV_HAVE_AVX2
+-#include<immintrin.h>
++#include <immintrin.h>
+
+-static inline void
+-volk_32fc_x2_square_dist_32f_u_avx2(float* target, lv_32fc_t* src0, lv_32fc_t* points,
+- unsigned int num_points)
++static inline void volk_32fc_x2_square_dist_32f_u_avx2(float* target,
++ lv_32fc_t* src0,
++ lv_32fc_t* points,
++ unsigned int num_points)
+ {
+- const unsigned int num_bytes = num_points*8;
+- __m128 xmm0, xmm9;
+- __m256 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
+-
+- lv_32fc_t diff;
+- float sq_dist;
+- int bound = num_bytes >> 6;
+- int leftovers1 = (num_bytes >> 3) & 0b11;
+- int i = 0;
+-
+- __m256i idx = _mm256_set_epi32(7,6,3,2,5,4,1,0);
+- xmm1 = _mm256_setzero_ps();
+- xmm0 = _mm_loadu_ps((float*)src0);
+- xmm0 = _mm_permute_ps(xmm0, 0b01000100);
+- xmm1 = _mm256_insertf128_ps(xmm1, xmm0, 0);
+- xmm1 = _mm256_insertf128_ps(xmm1, xmm0, 1);
+-
+- for(; i < bound; ++i) {
++ const unsigned int num_bytes = num_points * 8;
++ __m128 xmm0, xmm9;
++ __m256 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
++
++ lv_32fc_t diff;
++ float sq_dist;
++ int bound = num_bytes >> 6;
++ int leftovers1 = (num_bytes >> 3) & 0b11;
++ int i = 0;
++
++ __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
++ xmm1 = _mm256_setzero_ps();
+ xmm2 = _mm256_loadu_ps((float*)&points[0]);
++ xmm0 = _mm_loadu_ps((float*)src0);
++ xmm0 = _mm_permute_ps(xmm0, 0b01000100);
++ xmm1 = _mm256_insertf128_ps(xmm1, xmm0, 0);
++ xmm1 = _mm256_insertf128_ps(xmm1, xmm0, 1);
+ xmm3 = _mm256_loadu_ps((float*)&points[4]);
+- xmm4 = _mm256_sub_ps(xmm1, xmm2);
+- xmm5 = _mm256_sub_ps(xmm1, xmm3);
+- points += 8;
+- xmm6 = _mm256_mul_ps(xmm4, xmm4);
+- xmm7 = _mm256_mul_ps(xmm5, xmm5);
+
+- xmm4 = _mm256_hadd_ps(xmm6, xmm7);
+- xmm4 = _mm256_permutevar8x32_ps(xmm4, idx);
++ for (; i < bound; ++i) {
++ xmm4 = _mm256_sub_ps(xmm1, xmm2);
++ xmm5 = _mm256_sub_ps(xmm1, xmm3);
++ points += 8;
++ xmm6 = _mm256_mul_ps(xmm4, xmm4);
++ xmm7 = _mm256_mul_ps(xmm5, xmm5);
+
+- _mm256_storeu_ps(target, xmm4);
++ xmm2 = _mm256_loadu_ps((float*)&points[0]);
+
+- target += 8;
+- }
++ xmm4 = _mm256_hadd_ps(xmm6, xmm7);
++ xmm4 = _mm256_permutevar8x32_ps(xmm4, idx);
+
+- if (num_bytes >> 5 & 1) {
++ xmm3 = _mm256_loadu_ps((float*)&points[4]);
+
+- xmm2 = _mm256_loadu_ps((float*)&points[0]);
++ _mm256_storeu_ps(target, xmm4);
+
+- xmm4 = _mm256_sub_ps(xmm1, xmm2);
++ target += 8;
++ }
+
+- points += 4;
++ if (num_bytes >> 5 & 1) {
+
+- xmm6 = _mm256_mul_ps(xmm4, xmm4);
++ xmm2 = _mm256_loadu_ps((float*)&points[0]);
+
+- xmm4 = _mm256_hadd_ps(xmm6, xmm6);
+- xmm4 = _mm256_permutevar8x32_ps(xmm4, idx);
++ xmm4 = _mm256_sub_ps(xmm1, xmm2);
+
+- xmm9 = _mm256_extractf128_ps(xmm4, 1);
+- _mm_storeu_ps(target,xmm9);
++ points += 4;
+
+- target += 4;
+- }
++ xmm6 = _mm256_mul_ps(xmm4, xmm4);
++
++ xmm4 = _mm256_hadd_ps(xmm6, xmm6);
++ xmm4 = _mm256_permutevar8x32_ps(xmm4, idx);
++
++ xmm9 = _mm256_extractf128_ps(xmm4, 1);
++ _mm_storeu_ps(target, xmm9);
++
++ target += 4;
++ }
+
+- for(i = 0; i < leftovers1; ++i) {
++ for (i = 0; i < leftovers1; ++i) {
+
+- diff = src0[0] - points[0];
+- points += 1;
++ diff = src0[0] - points[0];
++ points += 1;
+
+- sq_dist = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff);
++ sq_dist = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff);
+
+- target[0] = sq_dist;
+- target += 1;
+- }
++ target[0] = sq_dist;
++ target += 1;
++ }
+ }
+
+ #endif /*LV_HAVE_AVX2*/
+diff --git a/kernels/volk/volk_32i_s32f_convert_32f.h b/kernels/volk/volk_32i_s32f_convert_32f.h
+index 87d94f9..6b67cdb 100644
+--- a/kernels/volk/volk_32i_s32f_convert_32f.h
++++ b/kernels/volk/volk_32i_s32f_convert_32f.h
+@@ -30,8 +30,8 @@
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+- * void volk_32i_s32f_convert_32f(float* outputVector, const int32_t* inputVector, const float scalar, unsigned int num_points)
+- * \endcode
++ * void volk_32i_s32f_convert_32f(float* outputVector, const int32_t* inputVector, const
++ * float scalar, unsigned int num_points) \endcode
+ *
+ * \b Inputs
+ * \li inputVector: The vector of 32-bit integers.
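As the generic fallback further down makes plain, the kernel is a cast plus a multiply by the reciprocal of scalar, computed once per call. The per-element operation, written as a plain loop over the dispatcher's arguments (illustrative only):

    const float iScalar = 1.0 / scalar;  /* same reciprocal the SIMD paths broadcast */
    for (unsigned int number = 0; number < num_points; number++) {
        outputVector[number] = ((float)inputVector[number]) * iScalar;
    }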
+@@ -70,37 +70,38 @@
+ #ifdef LV_HAVE_AVX512F
+ #include <immintrin.h>
+
+-static inline void
+-volk_32i_s32f_convert_32f_u_avx512f(float* outputVector, const int32_t* inputVector,
+- const float scalar, unsigned int num_points)
++static inline void volk_32i_s32f_convert_32f_u_avx512f(float* outputVector,
++ const int32_t* inputVector,
++ const float scalar,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+- const unsigned int onesixteenthPoints = num_points / 16;
++ unsigned int number = 0;
++ const unsigned int onesixteenthPoints = num_points / 16;
+
+- float* outputVectorPtr = outputVector;
+- const float iScalar = 1.0 / scalar;
+- __m512 invScalar = _mm512_set1_ps(iScalar);
+- int32_t* inputPtr = (int32_t*)inputVector;
+- __m512i inputVal;
+- __m512 ret;
++ float* outputVectorPtr = outputVector;
++ const float iScalar = 1.0 / scalar;
++ __m512 invScalar = _mm512_set1_ps(iScalar);
++ int32_t* inputPtr = (int32_t*)inputVector;
++ __m512i inputVal;
++ __m512 ret;
+
+- for(;number < onesixteenthPoints; number++){
+- // Load the values
+- inputVal = _mm512_loadu_si512((__m512i*)inputPtr);
++ for (; number < onesixteenthPoints; number++) {
++ // Load the values
++ inputVal = _mm512_loadu_si512((__m512i*)inputPtr);
+
+- ret = _mm512_cvtepi32_ps(inputVal);
+- ret = _mm512_mul_ps(ret, invScalar);
++ ret = _mm512_cvtepi32_ps(inputVal);
++ ret = _mm512_mul_ps(ret, invScalar);
+
+- _mm512_storeu_ps(outputVectorPtr, ret);
++ _mm512_storeu_ps(outputVectorPtr, ret);
+
+- outputVectorPtr += 16;
+- inputPtr += 16;
+- }
++ outputVectorPtr += 16;
++ inputPtr += 16;
++ }
+
+- number = onesixteenthPoints * 16;
+- for(; number < num_points; number++){
+- outputVector[number] =((float)(inputVector[number])) * iScalar;
+- }
++ number = onesixteenthPoints * 16;
++ for (; number < num_points; number++) {
++ outputVector[number] = ((float)(inputVector[number])) * iScalar;
++ }
+ }
+ #endif /* LV_HAVE_AVX512F */
+
+@@ -108,37 +109,38 @@ volk_32i_s32f_convert_32f_u_avx512f(float* outputVector, const int32_t* inputVec
+ #ifdef LV_HAVE_AVX2
+ #include <immintrin.h>
+
+-static inline void
+-volk_32i_s32f_convert_32f_u_avx2(float* outputVector, const int32_t* inputVector,
+- const float scalar, unsigned int num_points)
++static inline void volk_32i_s32f_convert_32f_u_avx2(float* outputVector,
++ const int32_t* inputVector,
++ const float scalar,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+- const unsigned int oneEightPoints = num_points / 8;
++ unsigned int number = 0;
++ const unsigned int oneEightPoints = num_points / 8;
+
+- float* outputVectorPtr = outputVector;
+- const float iScalar = 1.0 / scalar;
+- __m256 invScalar = _mm256_set1_ps(iScalar);
+- int32_t* inputPtr = (int32_t*)inputVector;
+- __m256i inputVal;
+- __m256 ret;
++ float* outputVectorPtr = outputVector;
++ const float iScalar = 1.0 / scalar;
++ __m256 invScalar = _mm256_set1_ps(iScalar);
++ int32_t* inputPtr = (int32_t*)inputVector;
++ __m256i inputVal;
++ __m256 ret;
+
+- for(;number < oneEightPoints; number++){
+- // Load the 4 values
+- inputVal = _mm256_loadu_si256((__m256i*)inputPtr);
++ for (; number < oneEightPoints; number++) {
++ // Load the 4 values
++ inputVal = _mm256_loadu_si256((__m256i*)inputPtr);
+
+- ret = _mm256_cvtepi32_ps(inputVal);
+- ret = _mm256_mul_ps(ret, invScalar);
++ ret = _mm256_cvtepi32_ps(inputVal);
++ ret = _mm256_mul_ps(ret, invScalar);
+
+- _mm256_storeu_ps(outputVectorPtr, ret);
++ _mm256_storeu_ps(outputVectorPtr, ret);
+
+- outputVectorPtr += 8;
+- inputPtr += 8;
+- }
++ outputVectorPtr += 8;
++ inputPtr += 8;
++ }
+
+- number = oneEightPoints * 8;
+- for(; number < num_points; number++){
+- outputVector[number] =((float)(inputVector[number])) * iScalar;
+- }
++ number = oneEightPoints * 8;
++ for (; number < num_points; number++) {
++ outputVector[number] = ((float)(inputVector[number])) * iScalar;
++ }
+ }
+ #endif /* LV_HAVE_AVX2 */
+
+@@ -146,62 +148,63 @@ volk_32i_s32f_convert_32f_u_avx2(float* outputVector, const int32_t* inputVector
+ #ifdef LV_HAVE_SSE2
+ #include <emmintrin.h>
+
+-static inline void
+-volk_32i_s32f_convert_32f_u_sse2(float* outputVector, const int32_t* inputVector,
+- const float scalar, unsigned int num_points)
++static inline void volk_32i_s32f_convert_32f_u_sse2(float* outputVector,
++ const int32_t* inputVector,
++ const float scalar,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+- const unsigned int quarterPoints = num_points / 4;
++ unsigned int number = 0;
++ const unsigned int quarterPoints = num_points / 4;
+
+- float* outputVectorPtr = outputVector;
+- const float iScalar = 1.0 / scalar;
+- __m128 invScalar = _mm_set_ps1(iScalar);
+- int32_t* inputPtr = (int32_t*)inputVector;
+- __m128i inputVal;
+- __m128 ret;
++ float* outputVectorPtr = outputVector;
++ const float iScalar = 1.0 / scalar;
++ __m128 invScalar = _mm_set_ps1(iScalar);
++ int32_t* inputPtr = (int32_t*)inputVector;
++ __m128i inputVal;
++ __m128 ret;
+
+- for(;number < quarterPoints; number++){
+- // Load the 4 values
+- inputVal = _mm_loadu_si128((__m128i*)inputPtr);
++ for (; number < quarterPoints; number++) {
++ // Load the 4 values
++ inputVal = _mm_loadu_si128((__m128i*)inputPtr);
+
+- ret = _mm_cvtepi32_ps(inputVal);
+- ret = _mm_mul_ps(ret, invScalar);
++ ret = _mm_cvtepi32_ps(inputVal);
++ ret = _mm_mul_ps(ret, invScalar);
+
+- _mm_storeu_ps(outputVectorPtr, ret);
++ _mm_storeu_ps(outputVectorPtr, ret);
+
+- outputVectorPtr += 4;
+- inputPtr += 4;
+- }
++ outputVectorPtr += 4;
++ inputPtr += 4;
++ }
+
+- number = quarterPoints * 4;
+- for(; number < num_points; number++){
+- outputVector[number] =((float)(inputVector[number])) * iScalar;
+- }
++ number = quarterPoints * 4;
++ for (; number < num_points; number++) {
++ outputVector[number] = ((float)(inputVector[number])) * iScalar;
++ }
+ }
+ #endif /* LV_HAVE_SSE2 */
+
+
+ #ifdef LV_HAVE_GENERIC
+
+-static inline void
+-volk_32i_s32f_convert_32f_generic(float* outputVector, const int32_t* inputVector,
+- const float scalar, unsigned int num_points)
++static inline void volk_32i_s32f_convert_32f_generic(float* outputVector,
++ const int32_t* inputVector,
++ const float scalar,
++ unsigned int num_points)
+ {
+- float* outputVectorPtr = outputVector;
+- const int32_t* inputVectorPtr = inputVector;
+- unsigned int number = 0;
+- const float iScalar = 1.0 / scalar;
+-
+- for(number = 0; number < num_points; number++){
+- *outputVectorPtr++ = ((float)(*inputVectorPtr++)) * iScalar;
+- }
++ float* outputVectorPtr = outputVector;
++ const int32_t* inputVectorPtr = inputVector;
++ unsigned int number = 0;
++ const float iScalar = 1.0 / scalar;
++
++ for (number = 0; number < num_points; number++) {
++ *outputVectorPtr++ = ((float)(*inputVectorPtr++)) * iScalar;
++ }
+ }
+ #endif /* LV_HAVE_GENERIC */
+
+ #endif /* INCLUDED_volk_32i_s32f_convert_32f_u_H */
+
+
+-
+ #ifndef INCLUDED_volk_32i_s32f_convert_32f_a_H
+ #define INCLUDED_volk_32i_s32f_convert_32f_a_H
+
+@@ -211,74 +214,76 @@ volk_32i_s32f_convert_32f_generic(float* outputVector, const int32_t* inputVecto
+ #ifdef LV_HAVE_AVX512F
+ #include <immintrin.h>
+
+-static inline void
+-volk_32i_s32f_convert_32f_a_avx512f(float* outputVector, const int32_t* inputVector,
+- const float scalar, unsigned int num_points)
++static inline void volk_32i_s32f_convert_32f_a_avx512f(float* outputVector,
++ const int32_t* inputVector,
++ const float scalar,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+- const unsigned int onesixteenthPoints = num_points / 16;
++ unsigned int number = 0;
++ const unsigned int onesixteenthPoints = num_points / 16;
+
+- float* outputVectorPtr = outputVector;
+- const float iScalar = 1.0 / scalar;
+- __m512 invScalar = _mm512_set1_ps(iScalar);
+- int32_t* inputPtr = (int32_t*)inputVector;
+- __m512i inputVal;
+- __m512 ret;
++ float* outputVectorPtr = outputVector;
++ const float iScalar = 1.0 / scalar;
++ __m512 invScalar = _mm512_set1_ps(iScalar);
++ int32_t* inputPtr = (int32_t*)inputVector;
++ __m512i inputVal;
++ __m512 ret;
+
+- for(;number < onesixteenthPoints; number++){
+- // Load the values
+- inputVal = _mm512_load_si512((__m512i*)inputPtr);
++ for (; number < onesixteenthPoints; number++) {
++ // Load the values
++ inputVal = _mm512_load_si512((__m512i*)inputPtr);
+
+- ret = _mm512_cvtepi32_ps(inputVal);
+- ret = _mm512_mul_ps(ret, invScalar);
++ ret = _mm512_cvtepi32_ps(inputVal);
++ ret = _mm512_mul_ps(ret, invScalar);
+
+- _mm512_store_ps(outputVectorPtr, ret);
++ _mm512_store_ps(outputVectorPtr, ret);
+
+- outputVectorPtr += 16;
+- inputPtr += 16;
+- }
++ outputVectorPtr += 16;
++ inputPtr += 16;
++ }
+
+- number = onesixteenthPoints * 16;
+- for(; number < num_points; number++){
+- outputVector[number] =((float)(inputVector[number])) * iScalar;
+- }
++ number = onesixteenthPoints * 16;
++ for (; number < num_points; number++) {
++ outputVector[number] = ((float)(inputVector[number])) * iScalar;
++ }
+ }
+ #endif /* LV_HAVE_AVX512F */
+
+ #ifdef LV_HAVE_AVX2
+ #include <immintrin.h>
+
+-static inline void
+-volk_32i_s32f_convert_32f_a_avx2(float* outputVector, const int32_t* inputVector,
+- const float scalar, unsigned int num_points)
++static inline void volk_32i_s32f_convert_32f_a_avx2(float* outputVector,
++ const int32_t* inputVector,
++ const float scalar,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+- const unsigned int oneEightPoints = num_points / 8;
++ unsigned int number = 0;
++ const unsigned int oneEightPoints = num_points / 8;
+
+- float* outputVectorPtr = outputVector;
+- const float iScalar = 1.0 / scalar;
+- __m256 invScalar = _mm256_set1_ps(iScalar);
+- int32_t* inputPtr = (int32_t*)inputVector;
+- __m256i inputVal;
+- __m256 ret;
++ float* outputVectorPtr = outputVector;
++ const float iScalar = 1.0 / scalar;
++ __m256 invScalar = _mm256_set1_ps(iScalar);
++ int32_t* inputPtr = (int32_t*)inputVector;
++ __m256i inputVal;
++ __m256 ret;
+
+- for(;number < oneEightPoints; number++){
+- // Load the 4 values
+- inputVal = _mm256_load_si256((__m256i*)inputPtr);
++ for (; number < oneEightPoints; number++) {
++ // Load the 4 values
++ inputVal = _mm256_load_si256((__m256i*)inputPtr);
+
+- ret = _mm256_cvtepi32_ps(inputVal);
+- ret = _mm256_mul_ps(ret, invScalar);
++ ret = _mm256_cvtepi32_ps(inputVal);
++ ret = _mm256_mul_ps(ret, invScalar);
+
+- _mm256_store_ps(outputVectorPtr, ret);
++ _mm256_store_ps(outputVectorPtr, ret);
+
+- outputVectorPtr += 8;
+- inputPtr += 8;
+- }
++ outputVectorPtr += 8;
++ inputPtr += 8;
++ }
+
+- number = oneEightPoints * 8;
+- for(; number < num_points; number++){
+- outputVector[number] =((float)(inputVector[number])) * iScalar;
+- }
++ number = oneEightPoints * 8;
++ for (; number < num_points; number++) {
++ outputVector[number] = ((float)(inputVector[number])) * iScalar;
++ }
+ }
+ #endif /* LV_HAVE_AVX2 */
+
+@@ -286,59 +291,59 @@ volk_32i_s32f_convert_32f_a_avx2(float* outputVector, const int32_t* inputVector
+ #ifdef LV_HAVE_SSE2
+ #include <emmintrin.h>
+
+-static inline void
+-volk_32i_s32f_convert_32f_a_sse2(float* outputVector, const int32_t* inputVector,
+- const float scalar, unsigned int num_points)
++static inline void volk_32i_s32f_convert_32f_a_sse2(float* outputVector,
++ const int32_t* inputVector,
++ const float scalar,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+- const unsigned int quarterPoints = num_points / 4;
++ unsigned int number = 0;
++ const unsigned int quarterPoints = num_points / 4;
+
+- float* outputVectorPtr = outputVector;
+- const float iScalar = 1.0 / scalar;
+- __m128 invScalar = _mm_set_ps1(iScalar);
+- int32_t* inputPtr = (int32_t*)inputVector;
+- __m128i inputVal;
+- __m128 ret;
++ float* outputVectorPtr = outputVector;
++ const float iScalar = 1.0 / scalar;
++ __m128 invScalar = _mm_set_ps1(iScalar);
++ int32_t* inputPtr = (int32_t*)inputVector;
++ __m128i inputVal;
++ __m128 ret;
+
+- for(;number < quarterPoints; number++){
+- // Load the 4 values
+- inputVal = _mm_load_si128((__m128i*)inputPtr);
++ for (; number < quarterPoints; number++) {
++ // Load the 4 values
++ inputVal = _mm_load_si128((__m128i*)inputPtr);
+
+- ret = _mm_cvtepi32_ps(inputVal);
+- ret = _mm_mul_ps(ret, invScalar);
++ ret = _mm_cvtepi32_ps(inputVal);
++ ret = _mm_mul_ps(ret, invScalar);
+
+- _mm_store_ps(outputVectorPtr, ret);
++ _mm_store_ps(outputVectorPtr, ret);
+
+- outputVectorPtr += 4;
+- inputPtr += 4;
+- }
++ outputVectorPtr += 4;
++ inputPtr += 4;
++ }
+
+- number = quarterPoints * 4;
+- for(; number < num_points; number++){
+- outputVector[number] =((float)(inputVector[number])) * iScalar;
+- }
++ number = quarterPoints * 4;
++ for (; number < num_points; number++) {
++ outputVector[number] = ((float)(inputVector[number])) * iScalar;
++ }
+ }
+ #endif /* LV_HAVE_SSE2 */
+
+
+ #ifdef LV_HAVE_GENERIC
+
+-static inline void
+-volk_32i_s32f_convert_32f_a_generic(float* outputVector, const int32_t* inputVector,
+- const float scalar, unsigned int num_points)
++static inline void volk_32i_s32f_convert_32f_a_generic(float* outputVector,
++ const int32_t* inputVector,
++ const float scalar,
++ unsigned int num_points)
+ {
+- float* outputVectorPtr = outputVector;
+- const int32_t* inputVectorPtr = inputVector;
+- unsigned int number = 0;
+- const float iScalar = 1.0 / scalar;
+-
+- for(number = 0; number < num_points; number++){
+- *outputVectorPtr++ = ((float)(*inputVectorPtr++)) * iScalar;
+- }
++ float* outputVectorPtr = outputVector;
++ const int32_t* inputVectorPtr = inputVector;
++ unsigned int number = 0;
++ const float iScalar = 1.0 / scalar;
++
++ for (number = 0; number < num_points; number++) {
++ *outputVectorPtr++ = ((float)(*inputVectorPtr++)) * iScalar;
++ }
+ }
+ #endif /* LV_HAVE_GENERIC */
+
+
+-
+-
+ #endif /* INCLUDED_volk_32i_s32f_convert_32f_a_H */
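For reference, what this kernel family computes is simply out[i] = in[i] * (1 / scalar), the usual fixed-point to normalized-float conversion. A minimal sketch of driving the aligned generic implementation defined above directly is shown below; it assumes LV_HAVE_GENERIC is defined (the VOLK build normally defines it) and that the kernel header is reachable as <volk/volk_32i_s32f_convert_32f.h>, as in an installed tree.

    #include <stdint.h>
    #include <stdio.h>
    #include <volk/volk_32i_s32f_convert_32f.h>

    int main(void)
    {
        int32_t in[4] = { -32768, -1, 1, 32767 };
        float out[4];

        /* Each output becomes in[i] * (1.0 / 32768), i.e. roughly [-1, 1). */
        volk_32i_s32f_convert_32f_a_generic(out, in, 32768.0f, 4);

        for (unsigned int i = 0; i < 4; i++)
            printf("%d -> %f\n", in[i], out[i]);
        return 0;
    }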
+diff --git a/kernels/volk/volk_32i_x2_and_32i.h b/kernels/volk/volk_32i_x2_and_32i.h
+index 76f0175..755cfdc 100644
+--- a/kernels/volk/volk_32i_x2_and_32i.h
++++ b/kernels/volk/volk_32i_x2_and_32i.h
+@@ -29,8 +29,8 @@
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+- * void volk_32i_x2_and_32i(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points)
+- * \endcode
++ * void volk_32i_x2_and_32i(int32_t* cVector, const int32_t* aVector, const int32_t*
++ * bVector, unsigned int num_points) \endcode
+ *
+ * \b Inputs
+ * \li aVector: Input vector of samples.
+@@ -87,72 +87,75 @@
+ #ifdef LV_HAVE_AVX512F
+ #include <immintrin.h>
+
+-static inline void
+-volk_32i_x2_and_32i_a_avx512f(int32_t* cVector, const int32_t* aVector,
+- const int32_t* bVector, unsigned int num_points)
++static inline void volk_32i_x2_and_32i_a_avx512f(int32_t* cVector,
++ const int32_t* aVector,
++ const int32_t* bVector,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+- const unsigned int sixteenthPoints = num_points / 16;
++ unsigned int number = 0;
++ const unsigned int sixteenthPoints = num_points / 16;
+
+- int32_t* cPtr = (int32_t*)cVector;
+- const int32_t* aPtr = (int32_t*)aVector;
+- const int32_t* bPtr = (int32_t*)bVector;
++ int32_t* cPtr = (int32_t*)cVector;
++ const int32_t* aPtr = (int32_t*)aVector;
++ const int32_t* bPtr = (int32_t*)bVector;
+
+- __m512i aVal, bVal, cVal;
+- for(;number < sixteenthPoints; number++){
++ __m512i aVal, bVal, cVal;
++ for (; number < sixteenthPoints; number++) {
+
+- aVal = _mm512_load_si512(aPtr);
+- bVal = _mm512_load_si512(bPtr);
++ aVal = _mm512_load_si512(aPtr);
++ bVal = _mm512_load_si512(bPtr);
+
+- cVal = _mm512_and_si512(aVal, bVal);
++ cVal = _mm512_and_si512(aVal, bVal);
+
+- _mm512_store_si512(cPtr,cVal); // Store the results back into the C container
++ _mm512_store_si512(cPtr, cVal); // Store the results back into the C container
+
+- aPtr += 16;
+- bPtr += 16;
+- cPtr += 16;
+- }
++ aPtr += 16;
++ bPtr += 16;
++ cPtr += 16;
++ }
+
+- number = sixteenthPoints * 16;
+- for(;number < num_points; number++){
+- cVector[number] = aVector[number] & bVector[number];
+- }
++ number = sixteenthPoints * 16;
++ for (; number < num_points; number++) {
++ cVector[number] = aVector[number] & bVector[number];
++ }
+ }
+ #endif /* LV_HAVE_AVX512F */
+
+ #ifdef LV_HAVE_AVX2
+ #include <immintrin.h>
+
+-static inline void
+-volk_32i_x2_and_32i_a_avx2(int32_t* cVector, const int32_t* aVector,
+- const int32_t* bVector, unsigned int num_points)
++static inline void volk_32i_x2_and_32i_a_avx2(int32_t* cVector,
++ const int32_t* aVector,
++ const int32_t* bVector,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+- const unsigned int oneEightPoints = num_points / 8;
++ unsigned int number = 0;
++ const unsigned int oneEightPoints = num_points / 8;
+
+- int32_t* cPtr = cVector;
+- const int32_t* aPtr = aVector;
+- const int32_t* bPtr = bVector;
++ int32_t* cPtr = cVector;
++ const int32_t* aPtr = aVector;
++ const int32_t* bPtr = bVector;
+
+- __m256i aVal, bVal, cVal;
+- for(;number < oneEightPoints; number++){
++ __m256i aVal, bVal, cVal;
++ for (; number < oneEightPoints; number++) {
+
+- aVal = _mm256_load_si256((__m256i*)aPtr);
+- bVal = _mm256_load_si256((__m256i*)bPtr);
++ aVal = _mm256_load_si256((__m256i*)aPtr);
++ bVal = _mm256_load_si256((__m256i*)bPtr);
+
+- cVal = _mm256_and_si256(aVal, bVal);
++ cVal = _mm256_and_si256(aVal, bVal);
+
+- _mm256_store_si256((__m256i*)cPtr,cVal); // Store the results back into the C container
++ _mm256_store_si256((__m256i*)cPtr,
++ cVal); // Store the results back into the C container
+
+- aPtr += 8;
+- bPtr += 8;
+- cPtr += 8;
+- }
++ aPtr += 8;
++ bPtr += 8;
++ cPtr += 8;
++ }
+
+- number = oneEightPoints * 8;
+- for(;number < num_points; number++){
+- cVector[number] = aVector[number] & bVector[number];
+- }
++ number = oneEightPoints * 8;
++ for (; number < num_points; number++) {
++ cVector[number] = aVector[number] & bVector[number];
++ }
+ }
+ #endif /* LV_HAVE_AVX2 */
+
+@@ -160,36 +163,37 @@ volk_32i_x2_and_32i_a_avx2(int32_t* cVector, const int32_t* aVector,
+ #ifdef LV_HAVE_SSE
+ #include <xmmintrin.h>
+
+-static inline void
+-volk_32i_x2_and_32i_a_sse(int32_t* cVector, const int32_t* aVector,
+- const int32_t* bVector, unsigned int num_points)
++static inline void volk_32i_x2_and_32i_a_sse(int32_t* cVector,
++ const int32_t* aVector,
++ const int32_t* bVector,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+- const unsigned int quarterPoints = num_points / 4;
++ unsigned int number = 0;
++ const unsigned int quarterPoints = num_points / 4;
+
+- float* cPtr = (float*)cVector;
+- const float* aPtr = (float*)aVector;
+- const float* bPtr = (float*)bVector;
++ float* cPtr = (float*)cVector;
++ const float* aPtr = (float*)aVector;
++ const float* bPtr = (float*)bVector;
+
+- __m128 aVal, bVal, cVal;
+- for(;number < quarterPoints; number++){
++ __m128 aVal, bVal, cVal;
++ for (; number < quarterPoints; number++) {
+
+- aVal = _mm_load_ps(aPtr);
+- bVal = _mm_load_ps(bPtr);
++ aVal = _mm_load_ps(aPtr);
++ bVal = _mm_load_ps(bPtr);
+
+- cVal = _mm_and_ps(aVal, bVal);
++ cVal = _mm_and_ps(aVal, bVal);
+
+- _mm_store_ps(cPtr,cVal); // Store the results back into the C container
++ _mm_store_ps(cPtr, cVal); // Store the results back into the C container
+
+- aPtr += 4;
+- bPtr += 4;
+- cPtr += 4;
+- }
++ aPtr += 4;
++ bPtr += 4;
++ cPtr += 4;
++ }
+
+- number = quarterPoints * 4;
+- for(;number < num_points; number++){
+- cVector[number] = aVector[number] & bVector[number];
+- }
++ number = quarterPoints * 4;
++ for (; number < num_points; number++) {
++ cVector[number] = aVector[number] & bVector[number];
++ }
+ }
+ #endif /* LV_HAVE_SSE */
+
+@@ -197,62 +201,67 @@ volk_32i_x2_and_32i_a_sse(int32_t* cVector, const int32_t* aVector,
+ #ifdef LV_HAVE_NEON
+ #include <arm_neon.h>
+
+-static inline void
+-volk_32i_x2_and_32i_neon(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points)
++static inline void volk_32i_x2_and_32i_neon(int32_t* cVector,
++ const int32_t* aVector,
++ const int32_t* bVector,
++ unsigned int num_points)
+ {
+- int32_t* cPtr = cVector;
+- const int32_t* aPtr = aVector;
+- const int32_t* bPtr= bVector;
+- unsigned int number = 0;
+- unsigned int quarter_points = num_points / 4;
+-
+- int32x4_t a_val, b_val, c_val;
+-
+- for(number = 0; number < quarter_points; number++){
+- a_val = vld1q_s32(aPtr);
+- b_val = vld1q_s32(bPtr);
+- c_val = vandq_s32(a_val, b_val);
+- vst1q_s32(cPtr, c_val);
+- aPtr += 4;
+- bPtr += 4;
+- cPtr += 4;
+- }
+-
+- for(number = quarter_points * 4; number < num_points; number++){
+- *cPtr++ = (*aPtr++) & (*bPtr++);
+- }
++ int32_t* cPtr = cVector;
++ const int32_t* aPtr = aVector;
++ const int32_t* bPtr = bVector;
++ unsigned int number = 0;
++ unsigned int quarter_points = num_points / 4;
++
++ int32x4_t a_val, b_val, c_val;
++
++ for (number = 0; number < quarter_points; number++) {
++ a_val = vld1q_s32(aPtr);
++ b_val = vld1q_s32(bPtr);
++ c_val = vandq_s32(a_val, b_val);
++ vst1q_s32(cPtr, c_val);
++ aPtr += 4;
++ bPtr += 4;
++ cPtr += 4;
++ }
++
++ for (number = quarter_points * 4; number < num_points; number++) {
++ *cPtr++ = (*aPtr++) & (*bPtr++);
++ }
+ }
+ #endif /* LV_HAVE_NEON */
+
+
+ #ifdef LV_HAVE_GENERIC
+
+-static inline void
+-volk_32i_x2_and_32i_generic(int32_t* cVector, const int32_t* aVector,
+- const int32_t* bVector, unsigned int num_points)
++static inline void volk_32i_x2_and_32i_generic(int32_t* cVector,
++ const int32_t* aVector,
++ const int32_t* bVector,
++ unsigned int num_points)
+ {
+- int32_t* cPtr = cVector;
+- const int32_t* aPtr = aVector;
+- const int32_t* bPtr= bVector;
+- unsigned int number = 0;
+-
+- for(number = 0; number < num_points; number++){
+- *cPtr++ = (*aPtr++) & (*bPtr++);
+- }
++ int32_t* cPtr = cVector;
++ const int32_t* aPtr = aVector;
++ const int32_t* bPtr = bVector;
++ unsigned int number = 0;
++
++ for (number = 0; number < num_points; number++) {
++ *cPtr++ = (*aPtr++) & (*bPtr++);
++ }
+ }
+ #endif /* LV_HAVE_GENERIC */
+
+
+ #ifdef LV_HAVE_ORC
+-extern void
+-volk_32i_x2_and_32i_a_orc_impl(int32_t* cVector, const int32_t* aVector,
+- const int32_t* bVector, unsigned int num_points);
+-
+-static inline void
+-volk_32i_x2_and_32i_u_orc(int32_t* cVector, const int32_t* aVector,
+- const int32_t* bVector, unsigned int num_points)
++extern void volk_32i_x2_and_32i_a_orc_impl(int32_t* cVector,
++ const int32_t* aVector,
++ const int32_t* bVector,
++ unsigned int num_points);
++
++static inline void volk_32i_x2_and_32i_u_orc(int32_t* cVector,
++ const int32_t* aVector,
++ const int32_t* bVector,
++ unsigned int num_points)
+ {
+- volk_32i_x2_and_32i_a_orc_impl(cVector, aVector, bVector, num_points);
++ volk_32i_x2_and_32i_a_orc_impl(cVector, aVector, bVector, num_points);
+ }
+ #endif /* LV_HAVE_ORC */
+
+@@ -269,72 +278,75 @@ volk_32i_x2_and_32i_u_orc(int32_t* cVector, const int32_t* aVector,
+ #ifdef LV_HAVE_AVX512F
+ #include <immintrin.h>
+
+-static inline void
+-volk_32i_x2_and_32i_u_avx512f(int32_t* cVector, const int32_t* aVector,
+- const int32_t* bVector, unsigned int num_points)
++static inline void volk_32i_x2_and_32i_u_avx512f(int32_t* cVector,
++ const int32_t* aVector,
++ const int32_t* bVector,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+- const unsigned int sixteenthPoints = num_points / 16;
++ unsigned int number = 0;
++ const unsigned int sixteenthPoints = num_points / 16;
+
+- int32_t* cPtr = (int32_t*)cVector;
+- const int32_t* aPtr = (int32_t*)aVector;
+- const int32_t* bPtr = (int32_t*)bVector;
++ int32_t* cPtr = (int32_t*)cVector;
++ const int32_t* aPtr = (int32_t*)aVector;
++ const int32_t* bPtr = (int32_t*)bVector;
+
+- __m512i aVal, bVal, cVal;
+- for(;number < sixteenthPoints; number++){
++ __m512i aVal, bVal, cVal;
++ for (; number < sixteenthPoints; number++) {
+
+- aVal = _mm512_loadu_si512(aPtr);
+- bVal = _mm512_loadu_si512(bPtr);
++ aVal = _mm512_loadu_si512(aPtr);
++ bVal = _mm512_loadu_si512(bPtr);
+
+- cVal = _mm512_and_si512(aVal, bVal);
++ cVal = _mm512_and_si512(aVal, bVal);
+
+- _mm512_storeu_si512(cPtr,cVal); // Store the results back into the C container
++ _mm512_storeu_si512(cPtr, cVal); // Store the results back into the C container
+
+- aPtr += 16;
+- bPtr += 16;
+- cPtr += 16;
+- }
++ aPtr += 16;
++ bPtr += 16;
++ cPtr += 16;
++ }
+
+- number = sixteenthPoints * 16;
+- for(;number < num_points; number++){
+- cVector[number] = aVector[number] & bVector[number];
+- }
++ number = sixteenthPoints * 16;
++ for (; number < num_points; number++) {
++ cVector[number] = aVector[number] & bVector[number];
++ }
+ }
+ #endif /* LV_HAVE_AVX512F */
+
+ #ifdef LV_HAVE_AVX2
+ #include <immintrin.h>
+
+-static inline void
+-volk_32i_x2_and_32i_u_avx2(int32_t* cVector, const int32_t* aVector,
+- const int32_t* bVector, unsigned int num_points)
++static inline void volk_32i_x2_and_32i_u_avx2(int32_t* cVector,
++ const int32_t* aVector,
++ const int32_t* bVector,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+- const unsigned int oneEightPoints = num_points / 8;
++ unsigned int number = 0;
++ const unsigned int oneEightPoints = num_points / 8;
+
+- int32_t* cPtr = cVector;
+- const int32_t* aPtr = aVector;
+- const int32_t* bPtr = bVector;
++ int32_t* cPtr = cVector;
++ const int32_t* aPtr = aVector;
++ const int32_t* bPtr = bVector;
+
+- __m256i aVal, bVal, cVal;
+- for(;number < oneEightPoints; number++){
++ __m256i aVal, bVal, cVal;
++ for (; number < oneEightPoints; number++) {
+
+- aVal = _mm256_loadu_si256((__m256i*)aPtr);
+- bVal = _mm256_loadu_si256((__m256i*)bPtr);
++ aVal = _mm256_loadu_si256((__m256i*)aPtr);
++ bVal = _mm256_loadu_si256((__m256i*)bPtr);
+
+- cVal = _mm256_and_si256(aVal, bVal);
++ cVal = _mm256_and_si256(aVal, bVal);
+
+- _mm256_storeu_si256((__m256i*)cPtr,cVal); // Store the results back into the C container
++ _mm256_storeu_si256((__m256i*)cPtr,
++ cVal); // Store the results back into the C container
+
+- aPtr += 8;
+- bPtr += 8;
+- cPtr += 8;
+- }
++ aPtr += 8;
++ bPtr += 8;
++ cPtr += 8;
++ }
+
+- number = oneEightPoints * 8;
+- for(;number < num_points; number++){
+- cVector[number] = aVector[number] & bVector[number];
+- }
++ number = oneEightPoints * 8;
++ for (; number < num_points; number++) {
++ cVector[number] = aVector[number] & bVector[number];
++ }
+ }
+ #endif /* LV_HAVE_AVX2 */
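One detail worth noting in the SSE implementation above: the int32_t buffers are reinterpreted as float* so the kernel can use _mm_load_ps, _mm_and_ps and _mm_store_ps (plain SSE has no 128-bit integer AND; that arrived with SSE2). Those are pure bit-pattern operations, so no rounding or flushing is involved and the result is bit-identical to the integer path. A small sanity check of the generic kernel against the scalar definition, assuming LV_HAVE_GENERIC is defined and the header installs as <volk/volk_32i_x2_and_32i.h>, might look like:

    #include <assert.h>
    #include <stdint.h>
    #include <volk/volk_32i_x2_and_32i.h>

    int main(void)
    {
        int32_t a[8] = { 0, -1, 0x0F0F0F0F, 0x12345678, 7, -8, 255, 1 << 30 };
        int32_t b[8] = { -1, 0x00FF00FF, 0x33333333, -1, 12, 12, 0x80, 3 };
        int32_t c[8];

        volk_32i_x2_and_32i_generic(c, a, b, 8);

        /* Every element must match the element-wise AND. */
        for (unsigned int i = 0; i < 8; i++)
            assert(c[i] == (a[i] & b[i]));
        return 0;
    }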
+
+diff --git a/kernels/volk/volk_32i_x2_or_32i.h b/kernels/volk/volk_32i_x2_or_32i.h
+index be4c086..b03db89 100644
+--- a/kernels/volk/volk_32i_x2_or_32i.h
++++ b/kernels/volk/volk_32i_x2_or_32i.h
+@@ -29,8 +29,8 @@
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+- * void volk_32i_x2_or_32i(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points)
+- * \endcode
++ * void volk_32i_x2_or_32i(int32_t* cVector, const int32_t* aVector, const int32_t*
++ * bVector, unsigned int num_points) \endcode
+ *
+ * \b Inputs
+ * \li aVector: Input vector of samples.
+@@ -87,72 +87,75 @@
+ #ifdef LV_HAVE_AVX512F
+ #include <immintrin.h>
+
+-static inline void
+-volk_32i_x2_or_32i_a_avx512f(int32_t* cVector, const int32_t* aVector,
+- const int32_t* bVector, unsigned int num_points)
++static inline void volk_32i_x2_or_32i_a_avx512f(int32_t* cVector,
++ const int32_t* aVector,
++ const int32_t* bVector,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+- const unsigned int sixteenthPoints = num_points / 16;
++ unsigned int number = 0;
++ const unsigned int sixteenthPoints = num_points / 16;
+
+- int32_t* cPtr = (int32_t*)cVector;
+- const int32_t* aPtr = (int32_t*)aVector;
+- const int32_t* bPtr = (int32_t*)bVector;
++ int32_t* cPtr = (int32_t*)cVector;
++ const int32_t* aPtr = (int32_t*)aVector;
++ const int32_t* bPtr = (int32_t*)bVector;
+
+- __m512i aVal, bVal, cVal;
+- for(;number < sixteenthPoints; number++){
++ __m512i aVal, bVal, cVal;
++ for (; number < sixteenthPoints; number++) {
+
+- aVal = _mm512_load_si512(aPtr);
+- bVal = _mm512_load_si512(bPtr);
++ aVal = _mm512_load_si512(aPtr);
++ bVal = _mm512_load_si512(bPtr);
+
+- cVal = _mm512_or_si512(aVal, bVal);
++ cVal = _mm512_or_si512(aVal, bVal);
+
+- _mm512_store_si512(cPtr,cVal); // Store the results back into the C container
++ _mm512_store_si512(cPtr, cVal); // Store the results back into the C container
+
+- aPtr += 16;
+- bPtr += 16;
+- cPtr += 16;
+- }
++ aPtr += 16;
++ bPtr += 16;
++ cPtr += 16;
++ }
+
+- number = sixteenthPoints * 16;
+- for(;number < num_points; number++){
+- cVector[number] = aVector[number] | bVector[number];
+- }
++ number = sixteenthPoints * 16;
++ for (; number < num_points; number++) {
++ cVector[number] = aVector[number] | bVector[number];
++ }
+ }
+ #endif /* LV_HAVE_AVX512F */
+
+ #ifdef LV_HAVE_AVX2
+ #include <immintrin.h>
+
+-static inline void
+-volk_32i_x2_or_32i_a_avx2(int32_t* cVector, const int32_t* aVector,
+- const int32_t* bVector, unsigned int num_points)
++static inline void volk_32i_x2_or_32i_a_avx2(int32_t* cVector,
++ const int32_t* aVector,
++ const int32_t* bVector,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+- const unsigned int oneEightPoints = num_points / 8;
++ unsigned int number = 0;
++ const unsigned int oneEightPoints = num_points / 8;
+
+- int32_t* cPtr = cVector;
+- const int32_t* aPtr = aVector;
+- const int32_t* bPtr = bVector;
++ int32_t* cPtr = cVector;
++ const int32_t* aPtr = aVector;
++ const int32_t* bPtr = bVector;
+
+- __m256i aVal, bVal, cVal;
+- for(;number < oneEightPoints; number++){
++ __m256i aVal, bVal, cVal;
++ for (; number < oneEightPoints; number++) {
+
+- aVal = _mm256_load_si256((__m256i*)aPtr);
+- bVal = _mm256_load_si256((__m256i*)bPtr);
++ aVal = _mm256_load_si256((__m256i*)aPtr);
++ bVal = _mm256_load_si256((__m256i*)bPtr);
+
+- cVal = _mm256_or_si256(aVal, bVal);
++ cVal = _mm256_or_si256(aVal, bVal);
+
+- _mm256_store_si256((__m256i*)cPtr,cVal); // Store the results back into the C container
++ _mm256_store_si256((__m256i*)cPtr,
++ cVal); // Store the results back into the C container
+
+- aPtr += 8;
+- bPtr += 8;
+- cPtr += 8;
+- }
++ aPtr += 8;
++ bPtr += 8;
++ cPtr += 8;
++ }
+
+- number = oneEightPoints * 8;
+- for(;number < num_points; number++){
+- cVector[number] = aVector[number] | bVector[number];
+- }
++ number = oneEightPoints * 8;
++ for (; number < num_points; number++) {
++ cVector[number] = aVector[number] | bVector[number];
++ }
+ }
+ #endif /* LV_HAVE_AVX2 */
+
+@@ -160,35 +163,36 @@ volk_32i_x2_or_32i_a_avx2(int32_t* cVector, const int32_t* aVector,
+ #ifdef LV_HAVE_SSE
+ #include <xmmintrin.h>
+
+-static inline void
+-volk_32i_x2_or_32i_a_sse(int32_t* cVector, const int32_t* aVector,
+- const int32_t* bVector, unsigned int num_points)
++static inline void volk_32i_x2_or_32i_a_sse(int32_t* cVector,
++ const int32_t* aVector,
++ const int32_t* bVector,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+- const unsigned int quarterPoints = num_points / 4;
++ unsigned int number = 0;
++ const unsigned int quarterPoints = num_points / 4;
+
+- float* cPtr = (float*)cVector;
+- const float* aPtr = (float*)aVector;
+- const float* bPtr = (float*)bVector;
++ float* cPtr = (float*)cVector;
++ const float* aPtr = (float*)aVector;
++ const float* bPtr = (float*)bVector;
+
+- __m128 aVal, bVal, cVal;
+- for(;number < quarterPoints; number++){
+- aVal = _mm_load_ps(aPtr);
+- bVal = _mm_load_ps(bPtr);
++ __m128 aVal, bVal, cVal;
++ for (; number < quarterPoints; number++) {
++ aVal = _mm_load_ps(aPtr);
++ bVal = _mm_load_ps(bPtr);
+
+- cVal = _mm_or_ps(aVal, bVal);
++ cVal = _mm_or_ps(aVal, bVal);
+
+- _mm_store_ps(cPtr,cVal); // Store the results back into the C container
++ _mm_store_ps(cPtr, cVal); // Store the results back into the C container
+
+- aPtr += 4;
+- bPtr += 4;
+- cPtr += 4;
+- }
++ aPtr += 4;
++ bPtr += 4;
++ cPtr += 4;
++ }
+
+- number = quarterPoints * 4;
+- for(;number < num_points; number++){
+- cVector[number] = aVector[number] | bVector[number];
+- }
++ number = quarterPoints * 4;
++ for (; number < num_points; number++) {
++ cVector[number] = aVector[number] | bVector[number];
++ }
+ }
+ #endif /* LV_HAVE_SSE */
+
+@@ -196,63 +200,67 @@ volk_32i_x2_or_32i_a_sse(int32_t* cVector, const int32_t* aVector,
+ #ifdef LV_HAVE_NEON
+ #include <arm_neon.h>
+
+-static inline void
+-volk_32i_x2_or_32i_neon(int32_t* cVector, const int32_t* aVector,
+- const int32_t* bVector, unsigned int num_points)
++static inline void volk_32i_x2_or_32i_neon(int32_t* cVector,
++ const int32_t* aVector,
++ const int32_t* bVector,
++ unsigned int num_points)
+ {
+- int32_t* cPtr = cVector;
+- const int32_t* aPtr = aVector;
+- const int32_t* bPtr= bVector;
+- unsigned int number = 0;
+- unsigned int quarter_points = num_points / 4;
+-
+- int32x4_t a_val, b_val, c_val;
+-
+- for(number = 0; number < quarter_points; number++){
+- a_val = vld1q_s32(aPtr);
+- b_val = vld1q_s32(bPtr);
+- c_val = vorrq_s32(a_val, b_val);
+- vst1q_s32(cPtr, c_val);
+- aPtr += 4;
+- bPtr += 4;
+- cPtr += 4;
+- }
+-
+- for(number = quarter_points * 4; number < num_points; number++){
+- *cPtr++ = (*aPtr++) | (*bPtr++);
+- }
++ int32_t* cPtr = cVector;
++ const int32_t* aPtr = aVector;
++ const int32_t* bPtr = bVector;
++ unsigned int number = 0;
++ unsigned int quarter_points = num_points / 4;
++
++ int32x4_t a_val, b_val, c_val;
++
++ for (number = 0; number < quarter_points; number++) {
++ a_val = vld1q_s32(aPtr);
++ b_val = vld1q_s32(bPtr);
++ c_val = vorrq_s32(a_val, b_val);
++ vst1q_s32(cPtr, c_val);
++ aPtr += 4;
++ bPtr += 4;
++ cPtr += 4;
++ }
++
++ for (number = quarter_points * 4; number < num_points; number++) {
++ *cPtr++ = (*aPtr++) | (*bPtr++);
++ }
+ }
+ #endif /* LV_HAVE_NEON */
+
+
+ #ifdef LV_HAVE_GENERIC
+
+-static inline void
+-volk_32i_x2_or_32i_generic(int32_t* cVector, const int32_t* aVector,
+- const int32_t* bVector, unsigned int num_points)
++static inline void volk_32i_x2_or_32i_generic(int32_t* cVector,
++ const int32_t* aVector,
++ const int32_t* bVector,
++ unsigned int num_points)
+ {
+- int32_t* cPtr = cVector;
+- const int32_t* aPtr = aVector;
+- const int32_t* bPtr= bVector;
+- unsigned int number = 0;
+-
+- for(number = 0; number < num_points; number++){
+- *cPtr++ = (*aPtr++) | (*bPtr++);
+- }
++ int32_t* cPtr = cVector;
++ const int32_t* aPtr = aVector;
++ const int32_t* bPtr = bVector;
++ unsigned int number = 0;
++
++ for (number = 0; number < num_points; number++) {
++ *cPtr++ = (*aPtr++) | (*bPtr++);
++ }
+ }
+ #endif /* LV_HAVE_GENERIC */
+
+
+ #ifdef LV_HAVE_ORC
+-extern void
+-volk_32i_x2_or_32i_a_orc_impl(int32_t* cVector, const int32_t* aVector,
+- const int32_t* bVector, unsigned int num_points);
+-
+-static inline void
+-volk_32i_x2_or_32i_u_orc(int32_t* cVector, const int32_t* aVector,
+- const int32_t* bVector, unsigned int num_points)
++extern void volk_32i_x2_or_32i_a_orc_impl(int32_t* cVector,
++ const int32_t* aVector,
++ const int32_t* bVector,
++ unsigned int num_points);
++
++static inline void volk_32i_x2_or_32i_u_orc(int32_t* cVector,
++ const int32_t* aVector,
++ const int32_t* bVector,
++ unsigned int num_points)
+ {
+- volk_32i_x2_or_32i_a_orc_impl(cVector, aVector, bVector, num_points);
++ volk_32i_x2_or_32i_a_orc_impl(cVector, aVector, bVector, num_points);
+ }
+ #endif /* LV_HAVE_ORC */
+
+@@ -269,72 +277,75 @@ volk_32i_x2_or_32i_u_orc(int32_t* cVector, const int32_t* aVector,
+ #ifdef LV_HAVE_AVX512F
+ #include <immintrin.h>
+
+-static inline void
+-volk_32i_x2_or_32i_u_avx512f(int32_t* cVector, const int32_t* aVector,
+- const int32_t* bVector, unsigned int num_points)
++static inline void volk_32i_x2_or_32i_u_avx512f(int32_t* cVector,
++ const int32_t* aVector,
++ const int32_t* bVector,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+- const unsigned int sixteenthPoints = num_points / 16;
++ unsigned int number = 0;
++ const unsigned int sixteenthPoints = num_points / 16;
+
+- int32_t* cPtr = (int32_t*)cVector;
+- const int32_t* aPtr = (int32_t*)aVector;
+- const int32_t* bPtr = (int32_t*)bVector;
++ int32_t* cPtr = (int32_t*)cVector;
++ const int32_t* aPtr = (int32_t*)aVector;
++ const int32_t* bPtr = (int32_t*)bVector;
+
+- __m512i aVal, bVal, cVal;
+- for(;number < sixteenthPoints; number++){
++ __m512i aVal, bVal, cVal;
++ for (; number < sixteenthPoints; number++) {
+
+- aVal = _mm512_loadu_si512(aPtr);
+- bVal = _mm512_loadu_si512(bPtr);
++ aVal = _mm512_loadu_si512(aPtr);
++ bVal = _mm512_loadu_si512(bPtr);
+
+- cVal = _mm512_or_si512(aVal, bVal);
++ cVal = _mm512_or_si512(aVal, bVal);
+
+- _mm512_storeu_si512(cPtr,cVal); // Store the results back into the C container
++ _mm512_storeu_si512(cPtr, cVal); // Store the results back into the C container
+
+- aPtr += 16;
+- bPtr += 16;
+- cPtr += 16;
+- }
++ aPtr += 16;
++ bPtr += 16;
++ cPtr += 16;
++ }
+
+- number = sixteenthPoints * 16;
+- for(;number < num_points; number++){
+- cVector[number] = aVector[number] | bVector[number];
+- }
++ number = sixteenthPoints * 16;
++ for (; number < num_points; number++) {
++ cVector[number] = aVector[number] | bVector[number];
++ }
+ }
+ #endif /* LV_HAVE_AVX512F */
+
+ #ifdef LV_HAVE_AVX2
+ #include <immintrin.h>
+
+-static inline void
+-volk_32i_x2_or_32i_u_avx2(int32_t* cVector, const int32_t* aVector,
+- const int32_t* bVector, unsigned int num_points)
++static inline void volk_32i_x2_or_32i_u_avx2(int32_t* cVector,
++ const int32_t* aVector,
++ const int32_t* bVector,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+- const unsigned int oneEightPoints = num_points / 8;
++ unsigned int number = 0;
++ const unsigned int oneEightPoints = num_points / 8;
+
+- int32_t* cPtr = cVector;
+- const int32_t* aPtr = aVector;
+- const int32_t* bPtr = bVector;
++ int32_t* cPtr = cVector;
++ const int32_t* aPtr = aVector;
++ const int32_t* bPtr = bVector;
+
+- __m256i aVal, bVal, cVal;
+- for(;number < oneEightPoints; number++){
++ __m256i aVal, bVal, cVal;
++ for (; number < oneEightPoints; number++) {
+
+- aVal = _mm256_loadu_si256((__m256i*)aPtr);
+- bVal = _mm256_loadu_si256((__m256i*)bPtr);
++ aVal = _mm256_loadu_si256((__m256i*)aPtr);
++ bVal = _mm256_loadu_si256((__m256i*)bPtr);
+
+- cVal = _mm256_or_si256(aVal, bVal);
++ cVal = _mm256_or_si256(aVal, bVal);
+
+- _mm256_storeu_si256((__m256i*)cPtr,cVal); // Store the results back into the C container
++ _mm256_storeu_si256((__m256i*)cPtr,
++ cVal); // Store the results back into the C container
+
+- aPtr += 8;
+- bPtr += 8;
+- cPtr += 8;
+- }
++ aPtr += 8;
++ bPtr += 8;
++ cPtr += 8;
++ }
+
+- number = oneEightPoints * 8;
+- for(;number < num_points; number++){
+- cVector[number] = aVector[number] | bVector[number];
+- }
++ number = oneEightPoints * 8;
++ for (; number < num_points; number++) {
++ cVector[number] = aVector[number] | bVector[number];
++ }
+ }
+ #endif /* LV_HAVE_AVX2 */
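Both the AND and OR kernels ship aligned variants (a_, using _mm*_load/_mm*_store) and unaligned variants (u_, using the loadu/storeu forms); the aligned ones require buffers with the machine's SIMD alignment. A hedged sketch of how a caller would normally obtain such buffers and let the dispatcher pick an implementation follows; volk_malloc(), volk_free() and volk_get_alignment() are the allocation helpers declared in volk/volk_malloc.h and volk.h, which are outside this patch, so treat their use here as an assumption about the surrounding library.

    #include <stdint.h>
    #include <volk/volk.h>

    /* Allocate buffers aligned for any kernel the dispatcher may choose,
     * OR them together, and release everything again. */
    static void or_example(unsigned int num_points)
    {
        size_t alignment = volk_get_alignment();
        int32_t* a = (int32_t*)volk_malloc(num_points * sizeof(int32_t), alignment);
        int32_t* b = (int32_t*)volk_malloc(num_points * sizeof(int32_t), alignment);
        int32_t* c = (int32_t*)volk_malloc(num_points * sizeof(int32_t), alignment);

        for (unsigned int i = 0; i < num_points; i++) {
            a[i] = (int32_t)i;
            b[i] = (int32_t)(i << 8);
        }

        volk_32i_x2_or_32i(c, a, b, num_points); /* dispatcher selects a_/u_/generic */

        volk_free(a);
        volk_free(b);
        volk_free(c);
    }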
+
+diff --git a/kernels/volk/volk_32u_byteswap.h b/kernels/volk/volk_32u_byteswap.h
+index f5e6f11..185047c 100644
+--- a/kernels/volk/volk_32u_byteswap.h
++++ b/kernels/volk/volk_32u_byteswap.h
+@@ -71,38 +71,42 @@
+
+ #if LV_HAVE_AVX2
+ #include <immintrin.h>
+-static inline void volk_32u_byteswap_u_avx2(uint32_t* intsToSwap, unsigned int num_points){
++static inline void volk_32u_byteswap_u_avx2(uint32_t* intsToSwap, unsigned int num_points)
++{
+
+- unsigned int number;
++ unsigned int number;
+
+- const unsigned int nPerSet = 8;
+- const uint64_t nSets = num_points / nPerSet;
++ const unsigned int nPerSet = 8;
++ const uint64_t nSets = num_points / nPerSet;
+
+- uint32_t* inputPtr = intsToSwap;
++ uint32_t* inputPtr = intsToSwap;
+
+- const uint8_t shuffleVector[32] = { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12, 19, 18, 17, 16, 23, 22, 21, 20, 27, 26, 25, 24, 31, 30, 29, 28 };
++ const uint8_t shuffleVector[32] = { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9,
++ 8, 15, 14, 13, 12, 19, 18, 17, 16, 23, 22,
++ 21, 20, 27, 26, 25, 24, 31, 30, 29, 28 };
+
+- const __m256i myShuffle = _mm256_loadu_si256((__m256i*) &shuffleVector);
++ const __m256i myShuffle = _mm256_loadu_si256((__m256i*)&shuffleVector);
+
+- for (number = 0 ;number < nSets; number++) {
++ for (number = 0; number < nSets; number++) {
+
+- // Load the 32t values, increment inputPtr later since we're doing it in-place.
+- const __m256i input = _mm256_loadu_si256((__m256i*)inputPtr);
+- const __m256i output = _mm256_shuffle_epi8(input,myShuffle);
++        // Load the 32-bit values, increment inputPtr later since we're doing it in-place.
++ const __m256i input = _mm256_loadu_si256((__m256i*)inputPtr);
++ const __m256i output = _mm256_shuffle_epi8(input, myShuffle);
+
+- // Store the results
+- _mm256_storeu_si256((__m256i*)inputPtr, output);
+- inputPtr += nPerSet;
+- }
+- _mm256_zeroupper();
+-
+- // Byteswap any remaining points:
+- for(number = nSets * nPerSet; number < num_points; number++){
+- uint32_t outputVal = *inputPtr;
+- outputVal = (((outputVal >> 24) & 0xff) | ((outputVal >> 8) & 0x0000ff00) | ((outputVal << 8) & 0x00ff0000) | ((outputVal << 24) & 0xff000000));
+- *inputPtr = outputVal;
+- inputPtr++;
+- }
++ // Store the results
++ _mm256_storeu_si256((__m256i*)inputPtr, output);
++ inputPtr += nPerSet;
++ }
++ _mm256_zeroupper();
++
++ // Byteswap any remaining points:
++ for (number = nSets * nPerSet; number < num_points; number++) {
++ uint32_t outputVal = *inputPtr;
++ outputVal = (((outputVal >> 24) & 0xff) | ((outputVal >> 8) & 0x0000ff00) |
++ ((outputVal << 8) & 0x00ff0000) | ((outputVal << 24) & 0xff000000));
++ *inputPtr = outputVal;
++ inputPtr++;
++ }
+ }
+ #endif /* LV_HAVE_AVX2 */
+
+@@ -110,42 +114,44 @@ static inline void volk_32u_byteswap_u_avx2(uint32_t* intsToSwap, unsigned int n
+ #ifdef LV_HAVE_SSE2
+ #include <emmintrin.h>
+
+-static inline void volk_32u_byteswap_u_sse2(uint32_t* intsToSwap, unsigned int num_points){
+- unsigned int number = 0;
+-
+- uint32_t* inputPtr = intsToSwap;
+- __m128i input, byte1, byte2, byte3, byte4, output;
+- __m128i byte2mask = _mm_set1_epi32(0x00FF0000);
+- __m128i byte3mask = _mm_set1_epi32(0x0000FF00);
+-
+- const uint64_t quarterPoints = num_points / 4;
+- for(;number < quarterPoints; number++){
+- // Load the 32t values, increment inputPtr later since we're doing it in-place.
+- input = _mm_loadu_si128((__m128i*)inputPtr);
+- // Do the four shifts
+- byte1 = _mm_slli_epi32(input, 24);
+- byte2 = _mm_slli_epi32(input, 8);
+- byte3 = _mm_srli_epi32(input, 8);
+- byte4 = _mm_srli_epi32(input, 24);
+- // Or bytes together
+- output = _mm_or_si128(byte1, byte4);
+- byte2 = _mm_and_si128(byte2, byte2mask);
+- output = _mm_or_si128(output, byte2);
+- byte3 = _mm_and_si128(byte3, byte3mask);
+- output = _mm_or_si128(output, byte3);
+- // Store the results
+- _mm_storeu_si128((__m128i*)inputPtr, output);
+- inputPtr += 4;
+- }
+-
+- // Byteswap any remaining points:
+- number = quarterPoints*4;
+- for(; number < num_points; number++){
+- uint32_t outputVal = *inputPtr;
+- outputVal = (((outputVal >> 24) & 0xff) | ((outputVal >> 8) & 0x0000ff00) | ((outputVal << 8) & 0x00ff0000) | ((outputVal << 24) & 0xff000000));
+- *inputPtr = outputVal;
+- inputPtr++;
+- }
++static inline void volk_32u_byteswap_u_sse2(uint32_t* intsToSwap, unsigned int num_points)
++{
++ unsigned int number = 0;
++
++ uint32_t* inputPtr = intsToSwap;
++ __m128i input, byte1, byte2, byte3, byte4, output;
++ __m128i byte2mask = _mm_set1_epi32(0x00FF0000);
++ __m128i byte3mask = _mm_set1_epi32(0x0000FF00);
++
++ const uint64_t quarterPoints = num_points / 4;
++ for (; number < quarterPoints; number++) {
++        // Load the 32-bit values, increment inputPtr later since we're doing it in-place.
++ input = _mm_loadu_si128((__m128i*)inputPtr);
++ // Do the four shifts
++ byte1 = _mm_slli_epi32(input, 24);
++ byte2 = _mm_slli_epi32(input, 8);
++ byte3 = _mm_srli_epi32(input, 8);
++ byte4 = _mm_srli_epi32(input, 24);
++ // Or bytes together
++ output = _mm_or_si128(byte1, byte4);
++ byte2 = _mm_and_si128(byte2, byte2mask);
++ output = _mm_or_si128(output, byte2);
++ byte3 = _mm_and_si128(byte3, byte3mask);
++ output = _mm_or_si128(output, byte3);
++ // Store the results
++ _mm_storeu_si128((__m128i*)inputPtr, output);
++ inputPtr += 4;
++ }
++
++ // Byteswap any remaining points:
++ number = quarterPoints * 4;
++ for (; number < num_points; number++) {
++ uint32_t outputVal = *inputPtr;
++ outputVal = (((outputVal >> 24) & 0xff) | ((outputVal >> 8) & 0x0000ff00) |
++ ((outputVal << 8) & 0x00ff0000) | ((outputVal << 24) & 0xff000000));
++ *inputPtr = outputVal;
++ inputPtr++;
++ }
+ }
+ #endif /* LV_HAVE_SSE2 */
+
+@@ -153,100 +159,106 @@ static inline void volk_32u_byteswap_u_sse2(uint32_t* intsToSwap, unsigned int n
+ #ifdef LV_HAVE_NEON
+ #include <arm_neon.h>
+
+-static inline void volk_32u_byteswap_neon(uint32_t* intsToSwap, unsigned int num_points){
+- uint32_t* inputPtr = intsToSwap;
+- unsigned int number = 0;
+- unsigned int n8points = num_points / 8;
+-
+- uint8x8x4_t input_table;
+- uint8x8_t int_lookup01, int_lookup23, int_lookup45, int_lookup67;
+- uint8x8_t swapped_int01, swapped_int23, swapped_int45, swapped_int67;
+-
+- /* these magic numbers are used as byte-indices in the LUT.
+- they are pre-computed to save time. A simple C program
+- can calculate them; for example for lookup01:
+- uint8_t chars[8] = {24, 16, 8, 0, 25, 17, 9, 1};
+- for(ii=0; ii < 8; ++ii) {
+- index += ((uint64_t)(*(chars+ii))) << (ii*8);
++static inline void volk_32u_byteswap_neon(uint32_t* intsToSwap, unsigned int num_points)
++{
++ uint32_t* inputPtr = intsToSwap;
++ unsigned int number = 0;
++ unsigned int n8points = num_points / 8;
++
++ uint8x8x4_t input_table;
++ uint8x8_t int_lookup01, int_lookup23, int_lookup45, int_lookup67;
++ uint8x8_t swapped_int01, swapped_int23, swapped_int45, swapped_int67;
++
++ /* these magic numbers are used as byte-indices in the LUT.
++ they are pre-computed to save time. A simple C program
++ can calculate them; for example for lookup01:
++ uint8_t chars[8] = {24, 16, 8, 0, 25, 17, 9, 1};
++ for(ii=0; ii < 8; ++ii) {
++ index += ((uint64_t)(*(chars+ii))) << (ii*8);
++ }
++ */
++ int_lookup01 = vcreate_u8(74609667900706840);
++ int_lookup23 = vcreate_u8(219290013576860186);
++ int_lookup45 = vcreate_u8(363970359253013532);
++ int_lookup67 = vcreate_u8(508650704929166878);
++
++ for (number = 0; number < n8points; ++number) {
++ input_table = vld4_u8((uint8_t*)inputPtr);
++ swapped_int01 = vtbl4_u8(input_table, int_lookup01);
++ swapped_int23 = vtbl4_u8(input_table, int_lookup23);
++ swapped_int45 = vtbl4_u8(input_table, int_lookup45);
++ swapped_int67 = vtbl4_u8(input_table, int_lookup67);
++ vst1_u8((uint8_t*)inputPtr, swapped_int01);
++ vst1_u8((uint8_t*)(inputPtr + 2), swapped_int23);
++ vst1_u8((uint8_t*)(inputPtr + 4), swapped_int45);
++ vst1_u8((uint8_t*)(inputPtr + 6), swapped_int67);
++
++ inputPtr += 8;
++ }
++
++ for (number = n8points * 8; number < num_points; ++number) {
++ uint32_t output = *inputPtr;
++ output = (((output >> 24) & 0xff) | ((output >> 8) & 0x0000ff00) |
++ ((output << 8) & 0x00ff0000) | ((output << 24) & 0xff000000));
++
++ *inputPtr = output;
++ inputPtr++;
+ }
+- */
+- int_lookup01 = vcreate_u8(74609667900706840);
+- int_lookup23 = vcreate_u8(219290013576860186);
+- int_lookup45 = vcreate_u8(363970359253013532);
+- int_lookup67 = vcreate_u8(508650704929166878);
+-
+- for(number = 0; number < n8points; ++number){
+- input_table = vld4_u8((uint8_t*) inputPtr);
+- swapped_int01 = vtbl4_u8(input_table, int_lookup01);
+- swapped_int23 = vtbl4_u8(input_table, int_lookup23);
+- swapped_int45 = vtbl4_u8(input_table, int_lookup45);
+- swapped_int67 = vtbl4_u8(input_table, int_lookup67);
+- vst1_u8((uint8_t*) inputPtr, swapped_int01);
+- vst1_u8((uint8_t*) (inputPtr+2), swapped_int23);
+- vst1_u8((uint8_t*) (inputPtr+4), swapped_int45);
+- vst1_u8((uint8_t*) (inputPtr+6), swapped_int67);
+-
+- inputPtr += 8;
+- }
+-
+- for(number = n8points * 8; number < num_points; ++number){
+- uint32_t output = *inputPtr;
+- output = (((output >> 24) & 0xff) | ((output >> 8) & 0x0000ff00) | ((output << 8) & 0x00ff0000) | ((output << 24) & 0xff000000));
+-
+- *inputPtr = output;
+- inputPtr++;
+- }
+ }
+ #endif /* LV_HAVE_NEON */
+
+ #ifdef LV_HAVE_NEONV8
+ #include <arm_neon.h>
+
+-static inline void volk_32u_byteswap_neonv8(uint32_t* intsToSwap, unsigned int num_points){
+- uint32_t* inputPtr = (uint32_t*)intsToSwap;
+- const unsigned int n8points = num_points / 8;
+- uint8x16_t input;
+- uint8x16_t idx = { 3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12 };
+-
+- unsigned int number = 0;
+- for(number = 0; number < n8points; ++number){
+- __VOLK_PREFETCH(inputPtr+8);
+- input = vld1q_u8((uint8_t*) inputPtr);
+- input = vqtbl1q_u8(input, idx);
+- vst1q_u8((uint8_t*) inputPtr, input);
+- inputPtr += 4;
+-
+- input = vld1q_u8((uint8_t*) inputPtr);
+- input = vqtbl1q_u8(input, idx);
+- vst1q_u8((uint8_t*) inputPtr, input);
+- inputPtr += 4;
+- }
+-
+- for(number = n8points * 8; number < num_points; ++number){
+- uint32_t output = *inputPtr;
++static inline void volk_32u_byteswap_neonv8(uint32_t* intsToSwap, unsigned int num_points)
++{
++ uint32_t* inputPtr = (uint32_t*)intsToSwap;
++ const unsigned int n8points = num_points / 8;
++ uint8x16_t input;
++ uint8x16_t idx = { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 };
++
++ unsigned int number = 0;
++ for (number = 0; number < n8points; ++number) {
++ __VOLK_PREFETCH(inputPtr + 8);
++ input = vld1q_u8((uint8_t*)inputPtr);
++ input = vqtbl1q_u8(input, idx);
++ vst1q_u8((uint8_t*)inputPtr, input);
++ inputPtr += 4;
++
++ input = vld1q_u8((uint8_t*)inputPtr);
++ input = vqtbl1q_u8(input, idx);
++ vst1q_u8((uint8_t*)inputPtr, input);
++ inputPtr += 4;
++ }
+
+- output = (((output >> 24) & 0xff) | ((output >> 8) & 0x0000ff00) | ((output << 8) & 0x00ff0000) | ((output << 24) & 0xff000000));
++ for (number = n8points * 8; number < num_points; ++number) {
++ uint32_t output = *inputPtr;
+
+- *inputPtr++ = output;
+- }
++ output = (((output >> 24) & 0xff) | ((output >> 8) & 0x0000ff00) |
++ ((output << 8) & 0x00ff0000) | ((output << 24) & 0xff000000));
+
++ *inputPtr++ = output;
++ }
+ }
+ #endif /* LV_HAVE_NEONV8 */
+
+
+ #ifdef LV_HAVE_GENERIC
+
+-static inline void volk_32u_byteswap_generic(uint32_t* intsToSwap, unsigned int num_points){
+- uint32_t* inputPtr = intsToSwap;
++static inline void volk_32u_byteswap_generic(uint32_t* intsToSwap,
++ unsigned int num_points)
++{
++ uint32_t* inputPtr = intsToSwap;
+
+- unsigned int point;
+- for(point = 0; point < num_points; point++){
+- uint32_t output = *inputPtr;
+- output = (((output >> 24) & 0xff) | ((output >> 8) & 0x0000ff00) | ((output << 8) & 0x00ff0000) | ((output << 24) & 0xff000000));
++ unsigned int point;
++ for (point = 0; point < num_points; point++) {
++ uint32_t output = *inputPtr;
++ output = (((output >> 24) & 0xff) | ((output >> 8) & 0x0000ff00) |
++ ((output << 8) & 0x00ff0000) | ((output << 24) & 0xff000000));
+
+- *inputPtr = output;
+- inputPtr++;
+- }
++ *inputPtr = output;
++ inputPtr++;
++ }
+ }
+ #endif /* LV_HAVE_GENERIC */
+
+@@ -261,38 +273,42 @@ static inline void volk_32u_byteswap_generic(uint32_t* intsToSwap, unsigned int
+
+ #if LV_HAVE_AVX2
+ #include <immintrin.h>
+-static inline void volk_32u_byteswap_a_avx2(uint32_t* intsToSwap, unsigned int num_points){
++static inline void volk_32u_byteswap_a_avx2(uint32_t* intsToSwap, unsigned int num_points)
++{
+
+- unsigned int number;
++ unsigned int number;
+
+- const unsigned int nPerSet = 8;
+- const uint64_t nSets = num_points / nPerSet;
++ const unsigned int nPerSet = 8;
++ const uint64_t nSets = num_points / nPerSet;
+
+- uint32_t* inputPtr = intsToSwap;
++ uint32_t* inputPtr = intsToSwap;
+
+- const uint8_t shuffleVector[32] = { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12, 19, 18, 17, 16, 23, 22, 21, 20, 27, 26, 25, 24, 31, 30, 29, 28 };
++ const uint8_t shuffleVector[32] = { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9,
++ 8, 15, 14, 13, 12, 19, 18, 17, 16, 23, 22,
++ 21, 20, 27, 26, 25, 24, 31, 30, 29, 28 };
+
+- const __m256i myShuffle = _mm256_loadu_si256((__m256i*) &shuffleVector);
++ const __m256i myShuffle = _mm256_loadu_si256((__m256i*)&shuffleVector);
+
+- for (number = 0 ;number < nSets; number++) {
++ for (number = 0; number < nSets; number++) {
+
+- // Load the 32t values, increment inputPtr later since we're doing it in-place.
+- const __m256i input = _mm256_load_si256((__m256i*)inputPtr);
+- const __m256i output = _mm256_shuffle_epi8(input,myShuffle);
++        // Load the 32-bit values, increment inputPtr later since we're doing it in-place.
++ const __m256i input = _mm256_load_si256((__m256i*)inputPtr);
++ const __m256i output = _mm256_shuffle_epi8(input, myShuffle);
+
+- // Store the results
+- _mm256_store_si256((__m256i*)inputPtr, output);
+- inputPtr += nPerSet;
+- }
+- _mm256_zeroupper();
+-
+- // Byteswap any remaining points:
+- for(number = nSets * nPerSet; number < num_points; number++){
+- uint32_t outputVal = *inputPtr;
+- outputVal = (((outputVal >> 24) & 0xff) | ((outputVal >> 8) & 0x0000ff00) | ((outputVal << 8) & 0x00ff0000) | ((outputVal << 24) & 0xff000000));
+- *inputPtr = outputVal;
+- inputPtr++;
+- }
++ // Store the results
++ _mm256_store_si256((__m256i*)inputPtr, output);
++ inputPtr += nPerSet;
++ }
++ _mm256_zeroupper();
++
++ // Byteswap any remaining points:
++ for (number = nSets * nPerSet; number < num_points; number++) {
++ uint32_t outputVal = *inputPtr;
++ outputVal = (((outputVal >> 24) & 0xff) | ((outputVal >> 8) & 0x0000ff00) |
++ ((outputVal << 8) & 0x00ff0000) | ((outputVal << 24) & 0xff000000));
++ *inputPtr = outputVal;
++ inputPtr++;
++ }
+ }
+ #endif /* LV_HAVE_AVX2 */
+
+@@ -301,63 +317,66 @@ static inline void volk_32u_byteswap_a_avx2(uint32_t* intsToSwap, unsigned int n
+ #include <emmintrin.h>
+
+
+-static inline void volk_32u_byteswap_a_sse2(uint32_t* intsToSwap, unsigned int num_points){
+- unsigned int number = 0;
+-
+- uint32_t* inputPtr = intsToSwap;
+- __m128i input, byte1, byte2, byte3, byte4, output;
+- __m128i byte2mask = _mm_set1_epi32(0x00FF0000);
+- __m128i byte3mask = _mm_set1_epi32(0x0000FF00);
+-
+- const uint64_t quarterPoints = num_points / 4;
+- for(;number < quarterPoints; number++){
+- // Load the 32t values, increment inputPtr later since we're doing it in-place.
+- input = _mm_load_si128((__m128i*)inputPtr);
+- // Do the four shifts
+- byte1 = _mm_slli_epi32(input, 24);
+- byte2 = _mm_slli_epi32(input, 8);
+- byte3 = _mm_srli_epi32(input, 8);
+- byte4 = _mm_srli_epi32(input, 24);
+- // Or bytes together
+- output = _mm_or_si128(byte1, byte4);
+- byte2 = _mm_and_si128(byte2, byte2mask);
+- output = _mm_or_si128(output, byte2);
+- byte3 = _mm_and_si128(byte3, byte3mask);
+- output = _mm_or_si128(output, byte3);
+- // Store the results
+- _mm_store_si128((__m128i*)inputPtr, output);
+- inputPtr += 4;
+- }
+-
+- // Byteswap any remaining points:
+- number = quarterPoints*4;
+- for(; number < num_points; number++){
+- uint32_t outputVal = *inputPtr;
+- outputVal = (((outputVal >> 24) & 0xff) | ((outputVal >> 8) & 0x0000ff00) | ((outputVal << 8) & 0x00ff0000) | ((outputVal << 24) & 0xff000000));
+- *inputPtr = outputVal;
+- inputPtr++;
+- }
++static inline void volk_32u_byteswap_a_sse2(uint32_t* intsToSwap, unsigned int num_points)
++{
++ unsigned int number = 0;
++
++ uint32_t* inputPtr = intsToSwap;
++ __m128i input, byte1, byte2, byte3, byte4, output;
++ __m128i byte2mask = _mm_set1_epi32(0x00FF0000);
++ __m128i byte3mask = _mm_set1_epi32(0x0000FF00);
++
++ const uint64_t quarterPoints = num_points / 4;
++ for (; number < quarterPoints; number++) {
++        // Load the 32-bit values, increment inputPtr later since we're doing it in-place.
++ input = _mm_load_si128((__m128i*)inputPtr);
++ // Do the four shifts
++ byte1 = _mm_slli_epi32(input, 24);
++ byte2 = _mm_slli_epi32(input, 8);
++ byte3 = _mm_srli_epi32(input, 8);
++ byte4 = _mm_srli_epi32(input, 24);
++ // Or bytes together
++ output = _mm_or_si128(byte1, byte4);
++ byte2 = _mm_and_si128(byte2, byte2mask);
++ output = _mm_or_si128(output, byte2);
++ byte3 = _mm_and_si128(byte3, byte3mask);
++ output = _mm_or_si128(output, byte3);
++ // Store the results
++ _mm_store_si128((__m128i*)inputPtr, output);
++ inputPtr += 4;
++ }
++
++ // Byteswap any remaining points:
++ number = quarterPoints * 4;
++ for (; number < num_points; number++) {
++ uint32_t outputVal = *inputPtr;
++ outputVal = (((outputVal >> 24) & 0xff) | ((outputVal >> 8) & 0x0000ff00) |
++ ((outputVal << 8) & 0x00ff0000) | ((outputVal << 24) & 0xff000000));
++ *inputPtr = outputVal;
++ inputPtr++;
++ }
+ }
+ #endif /* LV_HAVE_SSE2 */
+
+
+ #ifdef LV_HAVE_GENERIC
+
+-static inline void volk_32u_byteswap_a_generic(uint32_t* intsToSwap, unsigned int num_points){
+- uint32_t* inputPtr = intsToSwap;
++static inline void volk_32u_byteswap_a_generic(uint32_t* intsToSwap,
++ unsigned int num_points)
++{
++ uint32_t* inputPtr = intsToSwap;
+
+- unsigned int point;
+- for(point = 0; point < num_points; point++){
+- uint32_t output = *inputPtr;
+- output = (((output >> 24) & 0xff) | ((output >> 8) & 0x0000ff00) | ((output << 8) & 0x00ff0000) | ((output << 24) & 0xff000000));
++ unsigned int point;
++ for (point = 0; point < num_points; point++) {
++ uint32_t output = *inputPtr;
++ output = (((output >> 24) & 0xff) | ((output >> 8) & 0x0000ff00) |
++ ((output << 8) & 0x00ff0000) | ((output << 24) & 0xff000000));
+
+- *inputPtr = output;
+- inputPtr++;
+- }
++ *inputPtr = output;
++ inputPtr++;
++ }
+ }
+ #endif /* LV_HAVE_GENERIC */
+
+
+-
+-
+ #endif /* INCLUDED_volk_32u_byteswap_a_H */
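Every variant in this file falls back to the same scalar shift-and-mask expression for the leftover points. Pulled out on its own it is easier to read, and on GCC and Clang it matches __builtin_bswap32() (that equivalence is an observation about those compilers, not something the kernels above rely on):

    #include <assert.h>
    #include <stdint.h>

    /* Reverse the byte order of one 32-bit word: 0xAABBCCDD -> 0xDDCCBBAA. */
    static inline uint32_t bswap32_scalar(uint32_t v)
    {
        return ((v >> 24) & 0x000000ff) | /* byte 3 -> byte 0 */
               ((v >> 8) & 0x0000ff00) |  /* byte 2 -> byte 1 */
               ((v << 8) & 0x00ff0000) |  /* byte 1 -> byte 2 */
               ((v << 24) & 0xff000000);  /* byte 0 -> byte 3 */
    }

    int main(void)
    {
        assert(bswap32_scalar(0xAABBCCDDu) == 0xDDCCBBAAu);
    #if defined(__GNUC__)
        assert(bswap32_scalar(0x01020304u) == __builtin_bswap32(0x01020304u));
    #endif
        return 0;
    }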
+diff --git a/kernels/volk/volk_32u_byteswappuppet_32u.h b/kernels/volk/volk_32u_byteswappuppet_32u.h
+index c33a5fc..ca5ca17 100644
+--- a/kernels/volk/volk_32u_byteswappuppet_32u.h
++++ b/kernels/volk/volk_32u_byteswappuppet_32u.h
+@@ -1,70 +1,84 @@
+ #ifndef INCLUDED_volk_32u_byteswappuppet_32u_H
+ #define INCLUDED_volk_32u_byteswappuppet_32u_H
+
+-#include <volk/volk_32u_byteswap.h>
+ #include <stdint.h>
+ #include <string.h>
++#include <volk/volk_32u_byteswap.h>
+
+ #ifdef LV_HAVE_GENERIC
+-static inline void volk_32u_byteswappuppet_32u_generic(uint32_t*output, uint32_t* intsToSwap, unsigned int num_points){
++static inline void volk_32u_byteswappuppet_32u_generic(uint32_t* output,
++ uint32_t* intsToSwap,
++ unsigned int num_points)
++{
+
+ volk_32u_byteswap_generic((uint32_t*)intsToSwap, num_points);
+ memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint32_t));
+-
+ }
+ #endif
+
+ #ifdef LV_HAVE_NEON
+-static inline void volk_32u_byteswappuppet_32u_neon(uint32_t*output, uint32_t* intsToSwap, unsigned int num_points){
++static inline void volk_32u_byteswappuppet_32u_neon(uint32_t* output,
++ uint32_t* intsToSwap,
++ unsigned int num_points)
++{
+
+ volk_32u_byteswap_neon((uint32_t*)intsToSwap, num_points);
+ memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint32_t));
+-
+ }
+ #endif
+
+ #ifdef LV_HAVE_NEONV8
+-static inline void volk_32u_byteswappuppet_32u_neonv8(uint32_t*output, uint32_t* intsToSwap, unsigned int num_points){
++static inline void volk_32u_byteswappuppet_32u_neonv8(uint32_t* output,
++ uint32_t* intsToSwap,
++ unsigned int num_points)
++{
+
+ volk_32u_byteswap_neonv8((uint32_t*)intsToSwap, num_points);
+ memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint32_t));
+-
+ }
+ #endif
+
+ #ifdef LV_HAVE_SSE2
+-static inline void volk_32u_byteswappuppet_32u_u_sse2(uint32_t *output, uint32_t* intsToSwap, unsigned int num_points){
++static inline void volk_32u_byteswappuppet_32u_u_sse2(uint32_t* output,
++ uint32_t* intsToSwap,
++ unsigned int num_points)
++{
+
+ volk_32u_byteswap_u_sse2((uint32_t*)intsToSwap, num_points);
+ memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint32_t));
+-
+ }
+ #endif
+
+ #ifdef LV_HAVE_SSE2
+-static inline void volk_32u_byteswappuppet_32u_a_sse2(uint32_t* output, uint32_t* intsToSwap, unsigned int num_points){
++static inline void volk_32u_byteswappuppet_32u_a_sse2(uint32_t* output,
++ uint32_t* intsToSwap,
++ unsigned int num_points)
++{
+
+ volk_32u_byteswap_a_sse2((uint32_t*)intsToSwap, num_points);
+ memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint32_t));
+-
+ }
+ #endif
+
+ #ifdef LV_HAVE_AVX2
+-static inline void volk_32u_byteswappuppet_32u_u_avx2(uint32_t* output, uint32_t* intsToSwap, unsigned int num_points){
++static inline void volk_32u_byteswappuppet_32u_u_avx2(uint32_t* output,
++ uint32_t* intsToSwap,
++ unsigned int num_points)
++{
+
+ volk_32u_byteswap_u_avx2((uint32_t*)intsToSwap, num_points);
+ memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint32_t));
+-
+ }
+ #endif
+
+ #ifdef LV_HAVE_AVX2
+-static inline void volk_32u_byteswappuppet_32u_a_avx2(uint32_t* output, uint32_t* intsToSwap, unsigned int num_points){
++static inline void volk_32u_byteswappuppet_32u_a_avx2(uint32_t* output,
++ uint32_t* intsToSwap,
++ unsigned int num_points)
++{
+
+ volk_32u_byteswap_a_avx2((uint32_t*)intsToSwap, num_points);
+ memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint32_t));
+-
+ }
+ #endif
+
+diff --git a/kernels/volk/volk_32u_popcnt.h b/kernels/volk/volk_32u_popcnt.h
+index 7aa4d43..f6f0c10 100644
+--- a/kernels/volk/volk_32u_popcnt.h
++++ b/kernels/volk/volk_32u_popcnt.h
+@@ -56,24 +56,23 @@
+ #ifndef INCLUDED_VOLK_32u_POPCNT_A16_H
+ #define INCLUDED_VOLK_32u_POPCNT_A16_H
+
+-#include <stdio.h>
+ #include <inttypes.h>
++#include <stdio.h>
+
+ #ifdef LV_HAVE_GENERIC
+
+-static inline void
+-volk_32u_popcnt_generic(uint32_t* ret, const uint32_t value)
++static inline void volk_32u_popcnt_generic(uint32_t* ret, const uint32_t value)
+ {
+- // This is faster than a lookup table
+- uint32_t retVal = value;
++ // This is faster than a lookup table
++ uint32_t retVal = value;
+
+- retVal = (retVal & 0x55555555) + (retVal >> 1 & 0x55555555);
+- retVal = (retVal & 0x33333333) + (retVal >> 2 & 0x33333333);
+- retVal = (retVal + (retVal >> 4)) & 0x0F0F0F0F;
+- retVal = (retVal + (retVal >> 8));
+- retVal = (retVal + (retVal >> 16)) & 0x0000003F;
++ retVal = (retVal & 0x55555555) + (retVal >> 1 & 0x55555555);
++ retVal = (retVal & 0x33333333) + (retVal >> 2 & 0x33333333);
++ retVal = (retVal + (retVal >> 4)) & 0x0F0F0F0F;
++ retVal = (retVal + (retVal >> 8));
++ retVal = (retVal + (retVal >> 16)) & 0x0000003F;
+
+- *ret = retVal;
++ *ret = retVal;
+ }
+
+ #endif /*LV_HAVE_GENERIC*/
+@@ -83,10 +82,9 @@ volk_32u_popcnt_generic(uint32_t* ret, const uint32_t value)
+
+ #include <nmmintrin.h>
+
+-static inline void
+-volk_32u_popcnt_a_sse4_2(uint32_t* ret, const uint32_t value)
++static inline void volk_32u_popcnt_a_sse4_2(uint32_t* ret, const uint32_t value)
+ {
+- *ret = _mm_popcnt_u32(value);
++ *ret = _mm_popcnt_u32(value);
+ }
+
+ #endif /*LV_HAVE_SSE4_2*/
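The generic implementation above is the classic SWAR bit count: each step sums adjacent bit groups in parallel, first 1-bit pairs, then nibbles, then bytes, then the bytes and halfwords, so the whole population count finishes in five data-independent steps with no table. A quick self-check against the obvious one-bit-at-a-time loop, assuming LV_HAVE_GENERIC is defined and the header installs as <volk/volk_32u_popcnt.h>:

    #include <assert.h>
    #include <stdint.h>
    #include <volk/volk_32u_popcnt.h>

    static uint32_t popcnt_naive(uint32_t v)
    {
        uint32_t count = 0;
        while (v) {
            count += v & 1u;
            v >>= 1;
        }
        return count;
    }

    int main(void)
    {
        const uint32_t samples[] = { 0u, 1u, 0xFFFFFFFFu, 0x80000001u, 0x12345678u };
        for (unsigned int i = 0; i < sizeof(samples) / sizeof(samples[0]); i++) {
            uint32_t ret = 0;
            volk_32u_popcnt_generic(&ret, samples[i]);
            assert(ret == popcnt_naive(samples[i]));
        }
        return 0;
    }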
+diff --git a/kernels/volk/volk_32u_popcntpuppet_32u.h b/kernels/volk/volk_32u_popcntpuppet_32u.h
+index d5edd35..c0389cc 100644
+--- a/kernels/volk/volk_32u_popcntpuppet_32u.h
++++ b/kernels/volk/volk_32u_popcntpuppet_32u.h
+@@ -27,19 +27,25 @@
+ #include <volk/volk_32u_popcnt.h>
+
+ #ifdef LV_HAVE_GENERIC
+-static inline void volk_32u_popcntpuppet_32u_generic(uint32_t* outVector, const uint32_t* inVector, unsigned int num_points){
++static inline void volk_32u_popcntpuppet_32u_generic(uint32_t* outVector,
++ const uint32_t* inVector,
++ unsigned int num_points)
++{
+ unsigned int ii;
+- for(ii=0; ii < num_points; ++ii) {
+- volk_32u_popcnt_generic(outVector+ii, *(inVector+ii) );
++ for (ii = 0; ii < num_points; ++ii) {
++ volk_32u_popcnt_generic(outVector + ii, *(inVector + ii));
+ }
+ }
+ #endif /* LV_HAVE_GENERIC */
+
+ #ifdef LV_HAVE_SSE4_2
+-static inline void volk_32u_popcntpuppet_32u_a_sse4_2(uint32_t* outVector, const uint32_t* inVector, unsigned int num_points){
++static inline void volk_32u_popcntpuppet_32u_a_sse4_2(uint32_t* outVector,
++ const uint32_t* inVector,
++ unsigned int num_points)
++{
+ unsigned int ii;
+- for(ii=0; ii < num_points; ++ii) {
+- volk_32u_popcnt_a_sse4_2(outVector+ii, *(inVector+ii) );
++ for (ii = 0; ii < num_points; ++ii) {
++ volk_32u_popcnt_a_sse4_2(outVector + ii, *(inVector + ii));
+ }
+ }
+ #endif /* LV_HAVE_SSE4_2 */
+diff --git a/kernels/volk/volk_32u_reverse_32u.h b/kernels/volk/volk_32u_reverse_32u.h
+index b670b13..aff0a9e 100644
+--- a/kernels/volk/volk_32u_reverse_32u.h
++++ b/kernels/volk/volk_32u_reverse_32u.h
+@@ -24,7 +24,8 @@
+ * \b bit reversal of the input 32 bit word
+
+ * <b>Dispatcher Prototype</b>
+- * \code volk_32u_reverse_32u(uint32_t *outputVector, uint32_t *inputVector; unsigned int num_points);
++ * \code volk_32u_reverse_32u(uint32_t *outputVector, uint32_t *inputVector, unsigned int
++ num_points);
+ * \endcode
+ *
+ * \b Inputs
+@@ -32,338 +33,344 @@
+ * \li num_points The number of data points.
+ *
+ * \b Outputs
+- * \li outputVector: The vector where the results will be stored, which is the bit-reversed input
++ * \li outputVector: The vector where the results will be stored, which is the
++ bit-reversed input
+ *
+ * \endcode
+ */
+ #ifndef INCLUDED_VOLK_32u_REVERSE_32u_U_H
+ struct dword_split {
+- int b00: 1;
+- int b01: 1;
+- int b02: 1;
+- int b03: 1;
+- int b04: 1;
+- int b05: 1;
+- int b06: 1;
+- int b07: 1;
+- int b08: 1;
+- int b09: 1;
+- int b10: 1;
+- int b11: 1;
+- int b12: 1;
+- int b13: 1;
+- int b14: 1;
+- int b15: 1;
+- int b16: 1;
+- int b17: 1;
+- int b18: 1;
+- int b19: 1;
+- int b20: 1;
+- int b21: 1;
+- int b22: 1;
+- int b23: 1;
+- int b24: 1;
+- int b25: 1;
+- int b26: 1;
+- int b27: 1;
+- int b28: 1;
+- int b29: 1;
+- int b30: 1;
+- int b31: 1;
++ int b00 : 1;
++ int b01 : 1;
++ int b02 : 1;
++ int b03 : 1;
++ int b04 : 1;
++ int b05 : 1;
++ int b06 : 1;
++ int b07 : 1;
++ int b08 : 1;
++ int b09 : 1;
++ int b10 : 1;
++ int b11 : 1;
++ int b12 : 1;
++ int b13 : 1;
++ int b14 : 1;
++ int b15 : 1;
++ int b16 : 1;
++ int b17 : 1;
++ int b18 : 1;
++ int b19 : 1;
++ int b20 : 1;
++ int b21 : 1;
++ int b22 : 1;
++ int b23 : 1;
++ int b24 : 1;
++ int b25 : 1;
++ int b26 : 1;
++ int b27 : 1;
++ int b28 : 1;
++ int b29 : 1;
++ int b30 : 1;
++ int b31 : 1;
+ };
+ struct char_split {
+- uint8_t b00: 1;
+- uint8_t b01: 1;
+- uint8_t b02: 1;
+- uint8_t b03: 1;
+- uint8_t b04: 1;
+- uint8_t b05: 1;
+- uint8_t b06: 1;
+- uint8_t b07: 1;
++ uint8_t b00 : 1;
++ uint8_t b01 : 1;
++ uint8_t b02 : 1;
++ uint8_t b03 : 1;
++ uint8_t b04 : 1;
++ uint8_t b05 : 1;
++ uint8_t b06 : 1;
++ uint8_t b07 : 1;
+ };
+
+-//Idea from "Bit Twiddling Hacks", which dedicates this method to public domain
+-//http://graphics.stanford.edu/~seander/bithacks.html#BitReverseTable
++// Idea from "Bit Twiddling Hacks", which dedicates this method to public domain
++// http://graphics.stanford.edu/~seander/bithacks.html#BitReverseTable
+ static const unsigned char BitReverseTable256[] = {
+- 0x00, 0x80, 0x40, 0xC0, 0x20, 0xA0, 0x60, 0xE0, 0x10, 0x90, 0x50, 0xD0, 0x30,
+- 0xB0, 0x70, 0xF0, 0x08, 0x88, 0x48, 0xC8, 0x28, 0xA8, 0x68, 0xE8, 0x18, 0x98,
+- 0x58, 0xD8, 0x38, 0xB8, 0x78, 0xF8, 0x04, 0x84, 0x44, 0xC4, 0x24, 0xA4, 0x64,
+- 0xE4, 0x14, 0x94, 0x54, 0xD4, 0x34, 0xB4, 0x74, 0xF4, 0x0C, 0x8C, 0x4C, 0xCC,
+- 0x2C, 0xAC, 0x6C, 0xEC, 0x1C, 0x9C, 0x5C, 0xDC, 0x3C, 0xBC, 0x7C, 0xFC, 0x02,
+- 0x82, 0x42, 0xC2, 0x22, 0xA2, 0x62, 0xE2, 0x12, 0x92, 0x52, 0xD2, 0x32, 0xB2,
+- 0x72, 0xF2, 0x0A, 0x8A, 0x4A, 0xCA, 0x2A, 0xAA, 0x6A, 0xEA, 0x1A, 0x9A, 0x5A,
+- 0xDA, 0x3A, 0xBA, 0x7A, 0xFA, 0x06, 0x86, 0x46, 0xC6, 0x26, 0xA6, 0x66, 0xE6,
+- 0x16, 0x96, 0x56, 0xD6, 0x36, 0xB6, 0x76, 0xF6, 0x0E, 0x8E, 0x4E, 0xCE, 0x2E,
+- 0xAE, 0x6E, 0xEE, 0x1E, 0x9E, 0x5E, 0xDE, 0x3E, 0xBE, 0x7E, 0xFE, 0x01, 0x81,
+- 0x41, 0xC1, 0x21, 0xA1, 0x61, 0xE1, 0x11, 0x91, 0x51, 0xD1, 0x31, 0xB1, 0x71,
+- 0xF1, 0x09, 0x89, 0x49, 0xC9, 0x29, 0xA9, 0x69, 0xE9, 0x19, 0x99, 0x59, 0xD9,
+- 0x39, 0xB9, 0x79, 0xF9, 0x05, 0x85, 0x45, 0xC5, 0x25, 0xA5, 0x65, 0xE5, 0x15,
+- 0x95, 0x55, 0xD5, 0x35, 0xB5, 0x75, 0xF5, 0x0D, 0x8D, 0x4D, 0xCD, 0x2D, 0xAD,
+- 0x6D, 0xED, 0x1D, 0x9D, 0x5D, 0xDD, 0x3D, 0xBD, 0x7D, 0xFD, 0x03, 0x83, 0x43,
+- 0xC3, 0x23, 0xA3, 0x63, 0xE3, 0x13, 0x93, 0x53, 0xD3, 0x33, 0xB3, 0x73, 0xF3,
+- 0x0B, 0x8B, 0x4B, 0xCB, 0x2B, 0xAB, 0x6B, 0xEB, 0x1B, 0x9B, 0x5B, 0xDB, 0x3B,
+- 0xBB, 0x7B, 0xFB, 0x07, 0x87, 0x47, 0xC7, 0x27, 0xA7, 0x67, 0xE7, 0x17, 0x97,
+- 0x57, 0xD7, 0x37, 0xB7, 0x77, 0xF7, 0x0F, 0x8F, 0x4F, 0xCF, 0x2F, 0xAF, 0x6F,
+- 0xEF, 0x1F, 0x9F, 0x5F, 0xDF, 0x3F, 0xBF, 0x7F, 0xFF
++ 0x00, 0x80, 0x40, 0xC0, 0x20, 0xA0, 0x60, 0xE0, 0x10, 0x90, 0x50, 0xD0, 0x30, 0xB0,
++ 0x70, 0xF0, 0x08, 0x88, 0x48, 0xC8, 0x28, 0xA8, 0x68, 0xE8, 0x18, 0x98, 0x58, 0xD8,
++ 0x38, 0xB8, 0x78, 0xF8, 0x04, 0x84, 0x44, 0xC4, 0x24, 0xA4, 0x64, 0xE4, 0x14, 0x94,
++ 0x54, 0xD4, 0x34, 0xB4, 0x74, 0xF4, 0x0C, 0x8C, 0x4C, 0xCC, 0x2C, 0xAC, 0x6C, 0xEC,
++ 0x1C, 0x9C, 0x5C, 0xDC, 0x3C, 0xBC, 0x7C, 0xFC, 0x02, 0x82, 0x42, 0xC2, 0x22, 0xA2,
++ 0x62, 0xE2, 0x12, 0x92, 0x52, 0xD2, 0x32, 0xB2, 0x72, 0xF2, 0x0A, 0x8A, 0x4A, 0xCA,
++ 0x2A, 0xAA, 0x6A, 0xEA, 0x1A, 0x9A, 0x5A, 0xDA, 0x3A, 0xBA, 0x7A, 0xFA, 0x06, 0x86,
++ 0x46, 0xC6, 0x26, 0xA6, 0x66, 0xE6, 0x16, 0x96, 0x56, 0xD6, 0x36, 0xB6, 0x76, 0xF6,
++ 0x0E, 0x8E, 0x4E, 0xCE, 0x2E, 0xAE, 0x6E, 0xEE, 0x1E, 0x9E, 0x5E, 0xDE, 0x3E, 0xBE,
++ 0x7E, 0xFE, 0x01, 0x81, 0x41, 0xC1, 0x21, 0xA1, 0x61, 0xE1, 0x11, 0x91, 0x51, 0xD1,
++ 0x31, 0xB1, 0x71, 0xF1, 0x09, 0x89, 0x49, 0xC9, 0x29, 0xA9, 0x69, 0xE9, 0x19, 0x99,
++ 0x59, 0xD9, 0x39, 0xB9, 0x79, 0xF9, 0x05, 0x85, 0x45, 0xC5, 0x25, 0xA5, 0x65, 0xE5,
++ 0x15, 0x95, 0x55, 0xD5, 0x35, 0xB5, 0x75, 0xF5, 0x0D, 0x8D, 0x4D, 0xCD, 0x2D, 0xAD,
++ 0x6D, 0xED, 0x1D, 0x9D, 0x5D, 0xDD, 0x3D, 0xBD, 0x7D, 0xFD, 0x03, 0x83, 0x43, 0xC3,
++ 0x23, 0xA3, 0x63, 0xE3, 0x13, 0x93, 0x53, 0xD3, 0x33, 0xB3, 0x73, 0xF3, 0x0B, 0x8B,
++ 0x4B, 0xCB, 0x2B, 0xAB, 0x6B, 0xEB, 0x1B, 0x9B, 0x5B, 0xDB, 0x3B, 0xBB, 0x7B, 0xFB,
++ 0x07, 0x87, 0x47, 0xC7, 0x27, 0xA7, 0x67, 0xE7, 0x17, 0x97, 0x57, 0xD7, 0x37, 0xB7,
++ 0x77, 0xF7, 0x0F, 0x8F, 0x4F, 0xCF, 0x2F, 0xAF, 0x6F, 0xEF, 0x1F, 0x9F, 0x5F, 0xDF,
++ 0x3F, 0xBF, 0x7F, 0xFF
+ };
+ #ifdef LV_HAVE_GENERIC
+-static inline void volk_32u_reverse_32u_dword_shuffle(uint32_t* out, const uint32_t* in,
+- unsigned int num_points)
++static inline void volk_32u_reverse_32u_dword_shuffle(uint32_t* out,
++ const uint32_t* in,
++ unsigned int num_points)
+ {
+- const struct dword_split *in_ptr = (const struct dword_split*)in;
+- struct dword_split * out_ptr = (struct dword_split*)out;
+- unsigned int number = 0;
+- for(; number < num_points; ++number){
+- out_ptr->b00 = in_ptr->b31;
+- out_ptr->b01 = in_ptr->b30;
+- out_ptr->b02 = in_ptr->b29;
+- out_ptr->b03 = in_ptr->b28;
+- out_ptr->b04 = in_ptr->b27;
+- out_ptr->b05 = in_ptr->b26;
+- out_ptr->b06 = in_ptr->b25;
+- out_ptr->b07 = in_ptr->b24;
+- out_ptr->b08 = in_ptr->b23;
+- out_ptr->b09 = in_ptr->b22;
+- out_ptr->b10 = in_ptr->b21;
+- out_ptr->b11 = in_ptr->b20;
+- out_ptr->b12 = in_ptr->b19;
+- out_ptr->b13 = in_ptr->b18;
+- out_ptr->b14 = in_ptr->b17;
+- out_ptr->b15 = in_ptr->b16;
+- out_ptr->b16 = in_ptr->b15;
+- out_ptr->b17 = in_ptr->b14;
+- out_ptr->b18 = in_ptr->b13;
+- out_ptr->b19 = in_ptr->b12;
+- out_ptr->b20 = in_ptr->b11;
+- out_ptr->b21 = in_ptr->b10;
+- out_ptr->b22 = in_ptr->b09;
+- out_ptr->b23 = in_ptr->b08;
+- out_ptr->b24 = in_ptr->b07;
+- out_ptr->b25 = in_ptr->b06;
+- out_ptr->b26 = in_ptr->b05;
+- out_ptr->b27 = in_ptr->b04;
+- out_ptr->b28 = in_ptr->b03;
+- out_ptr->b29 = in_ptr->b02;
+- out_ptr->b30 = in_ptr->b01;
+- out_ptr->b31 = in_ptr->b00;
+- ++in_ptr;
+- ++out_ptr;
+- }
++ const struct dword_split* in_ptr = (const struct dword_split*)in;
++ struct dword_split* out_ptr = (struct dword_split*)out;
++ unsigned int number = 0;
++ for (; number < num_points; ++number) {
++ out_ptr->b00 = in_ptr->b31;
++ out_ptr->b01 = in_ptr->b30;
++ out_ptr->b02 = in_ptr->b29;
++ out_ptr->b03 = in_ptr->b28;
++ out_ptr->b04 = in_ptr->b27;
++ out_ptr->b05 = in_ptr->b26;
++ out_ptr->b06 = in_ptr->b25;
++ out_ptr->b07 = in_ptr->b24;
++ out_ptr->b08 = in_ptr->b23;
++ out_ptr->b09 = in_ptr->b22;
++ out_ptr->b10 = in_ptr->b21;
++ out_ptr->b11 = in_ptr->b20;
++ out_ptr->b12 = in_ptr->b19;
++ out_ptr->b13 = in_ptr->b18;
++ out_ptr->b14 = in_ptr->b17;
++ out_ptr->b15 = in_ptr->b16;
++ out_ptr->b16 = in_ptr->b15;
++ out_ptr->b17 = in_ptr->b14;
++ out_ptr->b18 = in_ptr->b13;
++ out_ptr->b19 = in_ptr->b12;
++ out_ptr->b20 = in_ptr->b11;
++ out_ptr->b21 = in_ptr->b10;
++ out_ptr->b22 = in_ptr->b09;
++ out_ptr->b23 = in_ptr->b08;
++ out_ptr->b24 = in_ptr->b07;
++ out_ptr->b25 = in_ptr->b06;
++ out_ptr->b26 = in_ptr->b05;
++ out_ptr->b27 = in_ptr->b04;
++ out_ptr->b28 = in_ptr->b03;
++ out_ptr->b29 = in_ptr->b02;
++ out_ptr->b30 = in_ptr->b01;
++ out_ptr->b31 = in_ptr->b00;
++ ++in_ptr;
++ ++out_ptr;
++ }
+ }
+ #endif /* LV_HAVE_GENERIC */
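The dword_shuffle kernel above relies on the compiler packing the one-bit fields of struct dword_split from the least-significant bit upward, so assigning out_ptr->b00 = in_ptr->b31 and so on reverses the word one bit at a time. Bit-field layout is implementation-defined in C, so this variant is best read as a reference implementation. A minimal standalone sketch of the same idea on a 4-bit field, assuming an LSB-first layout (as GCC and Clang use on VOLK's usual targets):

/* Sketch: reverse 4 bits through one-bit bit-fields, mirroring the kernel's cast.
   Assumes the compiler allocates b0..b3 starting at the least-significant bit. */
#include <stdint.h>
#include <stdio.h>

struct nibble_split {
    uint8_t b0 : 1;
    uint8_t b1 : 1;
    uint8_t b2 : 1;
    uint8_t b3 : 1;
};

int main(void)
{
    uint8_t in = 0x1; /* 0b0001 */
    uint8_t out = 0;
    const struct nibble_split* ip = (const struct nibble_split*)&in;
    struct nibble_split* op = (struct nibble_split*)&out;
    op->b0 = ip->b3;
    op->b1 = ip->b2;
    op->b2 = ip->b1;
    op->b3 = ip->b0;
    printf("0x%x -> 0x%x\n", in, out); /* expected: 0x1 -> 0x8 */
    return 0;
}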
+
+ #ifdef LV_HAVE_GENERIC
+-static inline void volk_32u_reverse_32u_byte_shuffle(uint32_t* out, const uint32_t* in,
+- unsigned int num_points)
++static inline void volk_32u_reverse_32u_byte_shuffle(uint32_t* out,
++ const uint32_t* in,
++ unsigned int num_points)
+ {
+- const uint32_t *in_ptr = in;
+- uint32_t *out_ptr = out;
+- unsigned int number = 0;
+- for(; number < num_points; ++number){
+- const struct char_split *in8 = (const struct char_split*)in_ptr;
+- struct char_split *out8 = (struct char_split*)out_ptr;
++ const uint32_t* in_ptr = in;
++ uint32_t* out_ptr = out;
++ unsigned int number = 0;
++ for (; number < num_points; ++number) {
++ const struct char_split* in8 = (const struct char_split*)in_ptr;
++ struct char_split* out8 = (struct char_split*)out_ptr;
+
+- out8[3].b00 = in8[0].b07;
+- out8[3].b01 = in8[0].b06;
+- out8[3].b02 = in8[0].b05;
+- out8[3].b03 = in8[0].b04;
+- out8[3].b04 = in8[0].b03;
+- out8[3].b05 = in8[0].b02;
+- out8[3].b06 = in8[0].b01;
+- out8[3].b07 = in8[0].b00;
++ out8[3].b00 = in8[0].b07;
++ out8[3].b01 = in8[0].b06;
++ out8[3].b02 = in8[0].b05;
++ out8[3].b03 = in8[0].b04;
++ out8[3].b04 = in8[0].b03;
++ out8[3].b05 = in8[0].b02;
++ out8[3].b06 = in8[0].b01;
++ out8[3].b07 = in8[0].b00;
+
+- out8[2].b00 = in8[1].b07;
+- out8[2].b01 = in8[1].b06;
+- out8[2].b02 = in8[1].b05;
+- out8[2].b03 = in8[1].b04;
+- out8[2].b04 = in8[1].b03;
+- out8[2].b05 = in8[1].b02;
+- out8[2].b06 = in8[1].b01;
+- out8[2].b07 = in8[1].b00;
++ out8[2].b00 = in8[1].b07;
++ out8[2].b01 = in8[1].b06;
++ out8[2].b02 = in8[1].b05;
++ out8[2].b03 = in8[1].b04;
++ out8[2].b04 = in8[1].b03;
++ out8[2].b05 = in8[1].b02;
++ out8[2].b06 = in8[1].b01;
++ out8[2].b07 = in8[1].b00;
+
+- out8[1].b00 = in8[2].b07;
+- out8[1].b01 = in8[2].b06;
+- out8[1].b02 = in8[2].b05;
+- out8[1].b03 = in8[2].b04;
+- out8[1].b04 = in8[2].b03;
+- out8[1].b05 = in8[2].b02;
+- out8[1].b06 = in8[2].b01;
+- out8[1].b07 = in8[2].b00;
++ out8[1].b00 = in8[2].b07;
++ out8[1].b01 = in8[2].b06;
++ out8[1].b02 = in8[2].b05;
++ out8[1].b03 = in8[2].b04;
++ out8[1].b04 = in8[2].b03;
++ out8[1].b05 = in8[2].b02;
++ out8[1].b06 = in8[2].b01;
++ out8[1].b07 = in8[2].b00;
+
+- out8[0].b00 = in8[3].b07;
+- out8[0].b01 = in8[3].b06;
+- out8[0].b02 = in8[3].b05;
+- out8[0].b03 = in8[3].b04;
+- out8[0].b04 = in8[3].b03;
+- out8[0].b05 = in8[3].b02;
+- out8[0].b06 = in8[3].b01;
+- out8[0].b07 = in8[3].b00;
+- ++in_ptr;
+- ++out_ptr;
+- }
++ out8[0].b00 = in8[3].b07;
++ out8[0].b01 = in8[3].b06;
++ out8[0].b02 = in8[3].b05;
++ out8[0].b03 = in8[3].b04;
++ out8[0].b04 = in8[3].b03;
++ out8[0].b05 = in8[3].b02;
++ out8[0].b06 = in8[3].b01;
++ out8[0].b07 = in8[3].b00;
++ ++in_ptr;
++ ++out_ptr;
++ }
+ }
+ #endif /* LV_HAVE_GENERIC */
+
+-//Idea from "Bit Twiddling Hacks", which dedicates this method to public domain
+-//http://graphics.stanford.edu/~seander/bithacks.html#BitReverseTable
++// Idea from "Bit Twiddling Hacks", which dedicates this method to public domain
++// http://graphics.stanford.edu/~seander/bithacks.html#BitReverseTable
+ #ifdef LV_HAVE_GENERIC
+-static inline void volk_32u_reverse_32u_lut(uint32_t* out, const uint32_t* in,
+- unsigned int num_points)
++static inline void
++volk_32u_reverse_32u_lut(uint32_t* out, const uint32_t* in, unsigned int num_points)
+ {
+- const uint32_t *in_ptr = in;
+- uint32_t *out_ptr = out;
+- unsigned int number = 0;
+- for(; number < num_points; ++number){
+- *out_ptr =
+- (BitReverseTable256[*in_ptr & 0xff] << 24) |
+- (BitReverseTable256[(*in_ptr >> 8) & 0xff] << 16) |
+- (BitReverseTable256[(*in_ptr >> 16) & 0xff] << 8) |
+- (BitReverseTable256[(*in_ptr >> 24) & 0xff]);
+- ++in_ptr;
+- ++out_ptr;
+- }
++ const uint32_t* in_ptr = in;
++ uint32_t* out_ptr = out;
++ unsigned int number = 0;
++ for (; number < num_points; ++number) {
++ *out_ptr = (BitReverseTable256[*in_ptr & 0xff] << 24) |
++ (BitReverseTable256[(*in_ptr >> 8) & 0xff] << 16) |
++ (BitReverseTable256[(*in_ptr >> 16) & 0xff] << 8) |
++ (BitReverseTable256[(*in_ptr >> 24) & 0xff]);
++ ++in_ptr;
++ ++out_ptr;
++ }
+ }
+ #endif /* LV_HAVE_GENERIC */
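The LUT kernel does two things per word: each byte is bit-reversed through BitReverseTable256, and the shift amounts (<< 24, << 16, << 8) swap the byte order at the same time, so a full 32-bit reversal costs four table lookups. A standalone sketch of the same computation, with the table generated at run time instead of the static initializer above (only <stdint.h> and <stdio.h> assumed):

/* Illustrative sketch: byte-wise LUT bit reversal of a 32-bit word. */
#include <stdint.h>
#include <stdio.h>

static uint8_t rev8(uint8_t b)
{
    uint8_t r = 0;
    for (int i = 0; i < 8; i++) {
        r = (uint8_t)((r << 1) | ((b >> i) & 1));
    }
    return r;
}

int main(void)
{
    uint8_t table[256];
    for (int i = 0; i < 256; i++)
        table[i] = rev8((uint8_t)i);

    uint32_t x = 0x00000001u;
    uint32_t y = ((uint32_t)table[x & 0xff] << 24) |
                 ((uint32_t)table[(x >> 8) & 0xff] << 16) |
                 ((uint32_t)table[(x >> 16) & 0xff] << 8) |
                 (uint32_t)table[(x >> 24) & 0xff];
    printf("%08x -> %08x\n", x, y); /* expected: 00000001 -> 80000000 */
    return 0;
}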
+
+-//Single-Byte code from "Bit Twiddling Hacks", which dedicates this method to public domain
+-//http://graphics.stanford.edu/~seander/bithacks.html#ReverseByteWith64Bits
++// Single-Byte code from "Bit Twiddling Hacks", which dedicates this method to public
++// domain http://graphics.stanford.edu/~seander/bithacks.html#ReverseByteWith64Bits
+ #ifdef LV_HAVE_GENERIC
+-static inline void volk_32u_reverse_32u_2001magic(uint32_t* out, const uint32_t* in,
+- unsigned int num_points)
++static inline void
++volk_32u_reverse_32u_2001magic(uint32_t* out, const uint32_t* in, unsigned int num_points)
+ {
+- const uint32_t *in_ptr = in;
+- uint32_t *out_ptr = out;
+- const uint8_t *in8;
+- uint8_t *out8;
+- unsigned int number = 0;
+- for(; number < num_points; ++number){
+- in8 = (const uint8_t*)in_ptr;
+- out8 = (uint8_t*)out_ptr;
+- out8[3] = ((in8[0] * 0x80200802ULL) & 0x0884422110ULL) * 0x0101010101ULL >> 32;
+- out8[2] = ((in8[1] * 0x80200802ULL) & 0x0884422110ULL) * 0x0101010101ULL >> 32;
+- out8[1] = ((in8[2] * 0x80200802ULL) & 0x0884422110ULL) * 0x0101010101ULL >> 32;
+- out8[0] = ((in8[3] * 0x80200802ULL) & 0x0884422110ULL) * 0x0101010101ULL >> 32;
+- ++in_ptr;
+- ++out_ptr;
+- }
++ const uint32_t* in_ptr = in;
++ uint32_t* out_ptr = out;
++ const uint8_t* in8;
++ uint8_t* out8;
++ unsigned int number = 0;
++ for (; number < num_points; ++number) {
++ in8 = (const uint8_t*)in_ptr;
++ out8 = (uint8_t*)out_ptr;
++ out8[3] = ((in8[0] * 0x80200802ULL) & 0x0884422110ULL) * 0x0101010101ULL >> 32;
++ out8[2] = ((in8[1] * 0x80200802ULL) & 0x0884422110ULL) * 0x0101010101ULL >> 32;
++ out8[1] = ((in8[2] * 0x80200802ULL) & 0x0884422110ULL) * 0x0101010101ULL >> 32;
++ out8[0] = ((in8[3] * 0x80200802ULL) & 0x0884422110ULL) * 0x0101010101ULL >> 32;
++ ++in_ptr;
++ ++out_ptr;
++ }
+ }
+ #endif /* LV_HAVE_GENERIC */
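In the 2001magic kernel each byte is reversed with the 64-bit multiply trick from Bit Twiddling Hacks: multiplying by 0x80200802 spreads copies of the byte's bits out, the mask 0x0884422110 keeps one copy of each bit at its mirrored position, and multiplying by 0x0101010101 then shifting right by 32 gathers them into a single reversed byte; writing out8[3..0] from in8[0..3] reverses the byte order on top of that. A small self-contained check of the single-byte step (plain C, hypothetical helper name):

/* Illustrative sketch: reverse the bits of one byte with the 64-bit multiply trick. */
#include <stdint.h>
#include <stdio.h>

static uint8_t reverse_byte_2001(uint8_t b)
{
    return (uint8_t)((((uint64_t)b * 0x80200802ULL) & 0x0884422110ULL) *
                     0x0101010101ULL >> 32);
}

int main(void)
{
    printf("%02x -> %02x\n", 0x01, reverse_byte_2001(0x01)); /* expected: 01 -> 80 */
    printf("%02x -> %02x\n", 0xb2, reverse_byte_2001(0xb2)); /* expected: b2 -> 4d */
    return 0;
}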
+
+ #ifdef LV_HAVE_GENERIC
+ // Current gr-pager implementation
+-static inline void volk_32u_reverse_32u_1972magic(uint32_t* out, const uint32_t* in,
+- unsigned int num_points)
++static inline void
++volk_32u_reverse_32u_1972magic(uint32_t* out, const uint32_t* in, unsigned int num_points)
+ {
+- const uint32_t *in_ptr = in;
+- uint32_t *out_ptr = out;
+- const uint8_t *in8;
+- uint8_t *out8;
+- unsigned int number = 0;
+- for(; number < num_points; ++number){
+- in8 = (const uint8_t*)in_ptr;
+- out8 = (uint8_t*)out_ptr;
+- out8[3] = (in8[0] * 0x0202020202ULL & 0x010884422010ULL) % 1023;
+- out8[2] = (in8[1] * 0x0202020202ULL & 0x010884422010ULL) % 1023;
+- out8[1] = (in8[2] * 0x0202020202ULL & 0x010884422010ULL) % 1023;
+- out8[0] = (in8[3] * 0x0202020202ULL & 0x010884422010ULL) % 1023;
+- ++in_ptr;
+- ++out_ptr;
+- }
++ const uint32_t* in_ptr = in;
++ uint32_t* out_ptr = out;
++ const uint8_t* in8;
++ uint8_t* out8;
++ unsigned int number = 0;
++ for (; number < num_points; ++number) {
++ in8 = (const uint8_t*)in_ptr;
++ out8 = (uint8_t*)out_ptr;
++ out8[3] = (in8[0] * 0x0202020202ULL & 0x010884422010ULL) % 1023;
++ out8[2] = (in8[1] * 0x0202020202ULL & 0x010884422010ULL) % 1023;
++ out8[1] = (in8[2] * 0x0202020202ULL & 0x010884422010ULL) % 1023;
++ out8[0] = (in8[3] * 0x0202020202ULL & 0x010884422010ULL) % 1023;
++ ++in_ptr;
++ ++out_ptr;
++ }
+ }
+ #endif /* LV_HAVE_GENERIC */
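The 1972magic variant (the older gr-pager code path) reverses each byte with the HAKMEM-era trick: multiplying by 0x0202020202 fans out copies of the byte, the mask 0x010884422010 keeps one copy of each bit positioned so that, relative to 10-bit groups, the bits sit in reversed order, and the reduction modulo 1023 (2^10 - 1) folds those non-overlapping groups together like an OR. A sketch of the single-byte step (plain C, hypothetical helper name):

/* Illustrative sketch: reverse one byte with the multiply-and-mod-1023 trick. */
#include <stdint.h>
#include <stdio.h>

static uint8_t reverse_byte_1972(uint8_t b)
{
    return (uint8_t)(((uint64_t)b * 0x0202020202ULL & 0x010884422010ULL) % 1023);
}

int main(void)
{
    printf("%02x -> %02x\n", 0x01, reverse_byte_1972(0x01)); /* expected: 01 -> 80 */
    printf("%02x -> %02x\n", 0xb2, reverse_byte_1972(0xb2)); /* expected: b2 -> 4d */
    return 0;
}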
+
+-//After lengthy thought and quite a bit of whiteboarding:
++// After lengthy thought and quite a bit of whiteboarding:
+ #ifdef LV_HAVE_GENERIC
+-static inline void volk_32u_reverse_32u_bintree_permute_top_down(uint32_t* out, const uint32_t* in,
+- unsigned int num_points)
++static inline void volk_32u_reverse_32u_bintree_permute_top_down(uint32_t* out,
++ const uint32_t* in,
++ unsigned int num_points)
+ {
+- const uint32_t *in_ptr = in;
+- uint32_t *out_ptr = out;
+- unsigned int number = 0;
+- for(; number < num_points; ++number){
+- uint32_t tmp = *in_ptr;
+- /* permute uint16:
+- The idea is to simply shift the lower 16 bit up, and the upper 16 bit down.
+- */
+- tmp = ( tmp << 16 ) | ( tmp >> 16 );
+- /* permute bytes:
+- shift up by 1 B first, then only consider even bytes, and OR with the unshifted even bytes
+- */
+- tmp = ((tmp & (0xFF | 0xFF << 16)) << 8) | ((tmp >> 8) & (0xFF | 0xFF << 16));
+- /* permute 4bit tuples:
+- Same idea, but the "consideration" mask expression becomes unwieldy
+- */
+- tmp = ((tmp & (0xF | 0xF << 8 | 0xF << 16 | 0xF << 24)) << 4) | ((tmp >> 4) & (0xF | 0xF << 8 | 0xF << 16 | 0xF << 24));
+- /* permute 2bit tuples:
+- Here, we collapsed the "consideration" mask to a simple hexmask: 0b0011 =
+- 3; we need those every 4b, which coincides with a hex digit!
+- */
+- tmp = ((tmp & (0x33333333)) << 2) | ((tmp >> 2) & (0x33333333));
+- /* permute odd/even:
+- 0x01 = 0x1; we need these every 2b, which works out: 0x01 | (0x01 << 2) = 0x05!
+- */
+- tmp = ((tmp & (0x55555555)) << 1) | ((tmp >> 1) & (0x55555555));
++ const uint32_t* in_ptr = in;
++ uint32_t* out_ptr = out;
++ unsigned int number = 0;
++ for (; number < num_points; ++number) {
++ uint32_t tmp = *in_ptr;
++ /* permute uint16:
++ The idea is to simply shift the lower 16 bit up, and the upper 16 bit down.
++ */
++ tmp = (tmp << 16) | (tmp >> 16);
++ /* permute bytes:
++ shift up by 1 B first, then only consider even bytes, and OR with the unshifted
++ even bytes
++ */
++ tmp = ((tmp & (0xFF | 0xFF << 16)) << 8) | ((tmp >> 8) & (0xFF | 0xFF << 16));
++ /* permute 4bit tuples:
++ Same idea, but the "consideration" mask expression becomes unwieldy
++ */
++ tmp = ((tmp & (0xF | 0xF << 8 | 0xF << 16 | 0xF << 24)) << 4) |
++ ((tmp >> 4) & (0xF | 0xF << 8 | 0xF << 16 | 0xF << 24));
++ /* permute 2bit tuples:
++ Here, we collapsed the "consideration" mask to a simple hexmask: 0b0011 =
++ 3; we need those every 4b, which coincides with a hex digit!
++ */
++ tmp = ((tmp & (0x33333333)) << 2) | ((tmp >> 2) & (0x33333333));
++ /* permute odd/even:
++ 0x01 = 0x1; we need these every 2b, which works out: 0x01 | (0x01 << 2) =
++ 0x05!
++ */
++ tmp = ((tmp & (0x55555555)) << 1) | ((tmp >> 1) & (0x55555555));
+
+- *out_ptr = tmp;
+- ++in_ptr;
+- ++out_ptr;
+- }
++ *out_ptr = tmp;
++ ++in_ptr;
++ ++out_ptr;
++ }
+ }
+ #endif /* LV_HAVE_GENERIC */
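The bintree_permute_top_down kernel is a classic butterfly reversal: log2(32) = 5 stages, each swapping halves of progressively smaller groups (16-bit halves, bytes, nibbles, bit pairs, neighbouring bits) with one mask, two shifts and an OR, so the word is reversed in a constant number of operations and without a table. A compact standalone rendering with the masks written out as plain hex constants (an assumption; the kernel builds some of them from shifted sub-masks):

/* Illustrative sketch: 5-stage butterfly bit reversal of a 32-bit word. */
#include <stdint.h>
#include <stdio.h>

static uint32_t reverse32_butterfly(uint32_t v)
{
    v = (v << 16) | (v >> 16);                                /* swap 16-bit halves */
    v = ((v & 0x00FF00FFu) << 8) | ((v >> 8) & 0x00FF00FFu);  /* swap bytes         */
    v = ((v & 0x0F0F0F0Fu) << 4) | ((v >> 4) & 0x0F0F0F0Fu);  /* swap nibbles       */
    v = ((v & 0x33333333u) << 2) | ((v >> 2) & 0x33333333u);  /* swap bit pairs     */
    v = ((v & 0x55555555u) << 1) | ((v >> 1) & 0x55555555u);  /* swap adjacent bits */
    return v;
}

int main(void)
{
    printf("%08x -> %08x\n", 0x00000001u, reverse32_butterfly(0x00000001u)); /* 80000000 */
    printf("%08x -> %08x\n", 0x12345678u, reverse32_butterfly(0x12345678u)); /* 1e6a2c48 */
    return 0;
}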
+ #ifdef LV_HAVE_GENERIC
+-static inline void volk_32u_reverse_32u_bintree_permute_bottom_up(uint32_t* out, const uint32_t* in,
+- unsigned int num_points)
++static inline void volk_32u_reverse_32u_bintree_permute_bottom_up(uint32_t* out,
++ const uint32_t* in,
++ unsigned int num_points)
+ {
+- //same stuff as top_down, inverted order (permutation matrices don't care, you know!)
+- const uint32_t *in_ptr = in;
+- uint32_t *out_ptr = out;
+- unsigned int number = 0;
+- for(; number < num_points; ++number){
+- uint32_t tmp = *in_ptr;
+- tmp = ((tmp & (0x55555555)) << 1) | ((tmp >> 1) & (0x55555555));
+- tmp = ((tmp & (0x33333333)) << 2) | ((tmp >> 2) & (0x33333333));
+- tmp = ((tmp & (0xF | 0xF << 8 | 0xF << 16 | 0xF << 24)) << 4) | ((tmp >> 4) & (0xF | 0xF << 8 | 0xF << 16 | 0xF << 24));
+- tmp = ((tmp & (0xFF | 0xFF << 16)) << 8) | ((tmp >> 8) & (0xFF | 0xFF << 16));
+- tmp = ( tmp << 16 ) | ( tmp >> 16 );
++ // same stuff as top_down, inverted order (permutation matrices don't care, you know!)
++ const uint32_t* in_ptr = in;
++ uint32_t* out_ptr = out;
++ unsigned int number = 0;
++ for (; number < num_points; ++number) {
++ uint32_t tmp = *in_ptr;
++ tmp = ((tmp & (0x55555555)) << 1) | ((tmp >> 1) & (0x55555555));
++ tmp = ((tmp & (0x33333333)) << 2) | ((tmp >> 2) & (0x33333333));
++ tmp = ((tmp & (0xF | 0xF << 8 | 0xF << 16 | 0xF << 24)) << 4) |
++ ((tmp >> 4) & (0xF | 0xF << 8 | 0xF << 16 | 0xF << 24));
++ tmp = ((tmp & (0xFF | 0xFF << 16)) << 8) | ((tmp >> 8) & (0xFF | 0xFF << 16));
++ tmp = (tmp << 16) | (tmp >> 16);
+
+- *out_ptr = tmp;
+- ++in_ptr;
+- ++out_ptr;
+- }
++ *out_ptr = tmp;
++ ++in_ptr;
++ ++out_ptr;
++ }
+ }
+ #endif /* LV_HAVE_GENERIC */
+
+ #ifdef LV_HAVE_NEONV8
+ #include <arm_neon.h>
+
+-static inline void volk_32u_reverse_32u_neonv8(uint32_t* out, const uint32_t* in,
+- unsigned int num_points)
+-{
+- const uint32_t *in_ptr = in;
+- uint32_t *out_ptr = out;
++static inline void
++volk_32u_reverse_32u_neonv8(uint32_t* out, const uint32_t* in, unsigned int num_points)
++{
++ const uint32_t* in_ptr = in;
++ uint32_t* out_ptr = out;
+
+- const uint8x16_t idx = { 3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12 };
++ const uint8x16_t idx = { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 };
+
+- const unsigned int quarterPoints = num_points/4;
++ const unsigned int quarterPoints = num_points / 4;
+ unsigned int number = 0;
+- for(; number < quarterPoints; ++number){
+- __VOLK_PREFETCH(in_ptr+4);
+- uint32x4_t x = vld1q_u32(in_ptr);
+- uint32x4_t z = vreinterpretq_u32_u8(vqtbl1q_u8(vrbitq_u8(vreinterpretq_u8_u32 (x)),
+- idx));
+- vst1q_u32 (out_ptr, z);
+- in_ptr += 4;
+- out_ptr += 4;
++ for (; number < quarterPoints; ++number) {
++ __VOLK_PREFETCH(in_ptr + 4);
++ uint32x4_t x = vld1q_u32(in_ptr);
++ uint32x4_t z =
++ vreinterpretq_u32_u8(vqtbl1q_u8(vrbitq_u8(vreinterpretq_u8_u32(x)), idx));
++ vst1q_u32(out_ptr, z);
++ in_ptr += 4;
++ out_ptr += 4;
+ }
+- number = quarterPoints*4;
+- for(; number < num_points; ++number){
+- *out_ptr =
+- (BitReverseTable256[*in_ptr & 0xff] << 24) |
+- (BitReverseTable256[(*in_ptr >> 8) & 0xff] << 16) |
+- (BitReverseTable256[(*in_ptr >> 16) & 0xff] << 8) |
+- (BitReverseTable256[(*in_ptr >> 24) & 0xff]);
+- ++in_ptr;
+- ++out_ptr;
++ number = quarterPoints * 4;
++ for (; number < num_points; ++number) {
++ *out_ptr = (BitReverseTable256[*in_ptr & 0xff] << 24) |
++ (BitReverseTable256[(*in_ptr >> 8) & 0xff] << 16) |
++ (BitReverseTable256[(*in_ptr >> 16) & 0xff] << 8) |
++ (BitReverseTable256[(*in_ptr >> 24) & 0xff]);
++ ++in_ptr;
++ ++out_ptr;
+ }
+ }
+
+@@ -371,29 +378,35 @@ static inline void volk_32u_reverse_32u_neonv8(uint32_t* out, const uint32_t* in
+ #ifdef LV_HAVE_NEON
+ #include <arm_neon.h>
+
+-#define DO_RBIT \
+- __VOLK_ASM("rbit %[result], %[value]" \
+- : [result]"=r" (*out_ptr) \
+- : [value] "r" (*in_ptr) \
+- : ); \
+- in_ptr++; \
+- out_ptr++;
++#define DO_RBIT \
++ __VOLK_ASM("rbit %[result], %[value]" \
++ : [result] "=r"(*out_ptr) \
++ : [value] "r"(*in_ptr) \
++ :); \
++ in_ptr++; \
++ out_ptr++;
+
+-static inline void volk_32u_reverse_32u_arm(uint32_t* out, const uint32_t* in,
+- unsigned int num_points)
++static inline void
++volk_32u_reverse_32u_arm(uint32_t* out, const uint32_t* in, unsigned int num_points)
+ {
+
+- const uint32_t *in_ptr = in;
+- uint32_t *out_ptr = out;
+- const unsigned int eighthPoints = num_points/8;
++ const uint32_t* in_ptr = in;
++ uint32_t* out_ptr = out;
++ const unsigned int eighthPoints = num_points / 8;
+ unsigned int number = 0;
+- for(; number < eighthPoints; ++number){
+- __VOLK_PREFETCH(in_ptr+8);
+- DO_RBIT; DO_RBIT; DO_RBIT; DO_RBIT;
+- DO_RBIT; DO_RBIT; DO_RBIT; DO_RBIT;
++ for (; number < eighthPoints; ++number) {
++ __VOLK_PREFETCH(in_ptr + 8);
++ DO_RBIT;
++ DO_RBIT;
++ DO_RBIT;
++ DO_RBIT;
++ DO_RBIT;
++ DO_RBIT;
++ DO_RBIT;
++ DO_RBIT;
+ }
+- number = eighthPoints*8;
+- for(; number < num_points; ++number){
++ number = eighthPoints * 8;
++ for (; number < num_points; ++number) {
+ DO_RBIT;
+ }
+ }
+@@ -403,4 +416,3 @@ static inline void volk_32u_reverse_32u_arm(uint32_t* out, const uint32_t* in,
+
+
+ #endif /* INCLUDED_volk_32u_reverse_32u_u_H */
+-
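On AArch64 the NEONv8 kernel needs only two vector instructions per 16 bytes: vrbitq_u8 reverses the bits inside every byte and vqtbl1q_u8 with the {3,2,1,0, ...} index vector reverses the byte order within each 32-bit lane, while the 32-bit ARM path issues the scalar rbit instruction per word through inline assembly. Whichever kernel wins at run time, callers go through the dispatcher; a hedged usage sketch, assuming an installed VOLK that exposes this kernel as volk_32u_reverse_32u (as the header's include guard suggests):

/* Usage sketch: call the bit-reverse kernel through the VOLK dispatcher.
   Assumes volk.h declares volk_32u_reverse_32u with the prototype used above. */
#include <stdint.h>
#include <stdio.h>
#include <volk/volk.h>

int main(void)
{
    const unsigned int N = 1024;
    uint32_t* in = (uint32_t*)volk_malloc(N * sizeof(uint32_t), volk_get_alignment());
    uint32_t* out = (uint32_t*)volk_malloc(N * sizeof(uint32_t), volk_get_alignment());

    for (unsigned int i = 0; i < N; i++)
        in[i] = i;

    volk_32u_reverse_32u(out, in, N); /* dispatcher picks the fastest kernel */

    printf("in[1]=%08x out[1]=%08x\n", in[1], out[1]); /* expected out[1] = 80000000 */

    volk_free(in);
    volk_free(out);
    return 0;
}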
+diff --git a/kernels/volk/volk_64f_convert_32f.h b/kernels/volk/volk_64f_convert_32f.h
+index 20422cf..4ebccc0 100644
+--- a/kernels/volk/volk_64f_convert_32f.h
++++ b/kernels/volk/volk_64f_convert_32f.h
+@@ -29,8 +29,8 @@
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+- * void volk_64f_convert_32f(float* outputVector, const double* inputVector, unsigned int num_points)
+- * \endcode
++ * void volk_64f_convert_32f(float* outputVector, const double* inputVector, unsigned int
++ * num_points) \endcode
+ *
+ * \b Inputs
+ * \li inputVector: The vector of doubles to convert to floats.
+@@ -70,34 +70,39 @@
+ #ifdef LV_HAVE_AVX512F
+ #include <immintrin.h>
+
+-static inline void volk_64f_convert_32f_u_avx512f(float* outputVector, const double* inputVector, unsigned int num_points){
+- unsigned int number = 0;
++static inline void volk_64f_convert_32f_u_avx512f(float* outputVector,
++ const double* inputVector,
++ unsigned int num_points)
++{
++ unsigned int number = 0;
+
+- const unsigned int oneSixteenthPoints = num_points / 16;
++ const unsigned int oneSixteenthPoints = num_points / 16;
+
+- const double* inputVectorPtr = (const double*)inputVector;
+- float* outputVectorPtr = outputVector;
+- __m256 ret1, ret2;
+- __m512d inputVal1, inputVal2;
++ const double* inputVectorPtr = (const double*)inputVector;
++ float* outputVectorPtr = outputVector;
++ __m256 ret1, ret2;
++ __m512d inputVal1, inputVal2;
+
+- for(;number < oneSixteenthPoints; number++){
+- inputVal1 = _mm512_loadu_pd(inputVectorPtr); inputVectorPtr += 8;
+- inputVal2 = _mm512_loadu_pd(inputVectorPtr); inputVectorPtr += 8;
++ for (; number < oneSixteenthPoints; number++) {
++ inputVal1 = _mm512_loadu_pd(inputVectorPtr);
++ inputVectorPtr += 8;
++ inputVal2 = _mm512_loadu_pd(inputVectorPtr);
++ inputVectorPtr += 8;
+
+- ret1 = _mm512_cvtpd_ps(inputVal1);
+- ret2 = _mm512_cvtpd_ps(inputVal2);
++ ret1 = _mm512_cvtpd_ps(inputVal1);
++ ret2 = _mm512_cvtpd_ps(inputVal2);
+
+- _mm256_storeu_ps(outputVectorPtr, ret1);
+- outputVectorPtr += 8;
++ _mm256_storeu_ps(outputVectorPtr, ret1);
++ outputVectorPtr += 8;
+
+- _mm256_storeu_ps(outputVectorPtr, ret2);
+- outputVectorPtr += 8;
+- }
++ _mm256_storeu_ps(outputVectorPtr, ret2);
++ outputVectorPtr += 8;
++ }
+
+- number = oneSixteenthPoints * 16;
+- for(; number < num_points; number++){
+- outputVector[number] = (float)(inputVector[number]);
+- }
++ number = oneSixteenthPoints * 16;
++ for (; number < num_points; number++) {
++ outputVector[number] = (float)(inputVector[number]);
++ }
+ }
+ #endif /* LV_HAVE_AVX512F */
+
+@@ -105,34 +110,39 @@ static inline void volk_64f_convert_32f_u_avx512f(float* outputVector, const dou
+ #ifdef LV_HAVE_AVX
+ #include <immintrin.h>
+
+-static inline void volk_64f_convert_32f_u_avx(float* outputVector, const double* inputVector, unsigned int num_points){
+- unsigned int number = 0;
++static inline void volk_64f_convert_32f_u_avx(float* outputVector,
++ const double* inputVector,
++ unsigned int num_points)
++{
++ unsigned int number = 0;
+
+- const unsigned int oneEightPoints = num_points / 8;
++ const unsigned int oneEightPoints = num_points / 8;
+
+- const double* inputVectorPtr = (const double*)inputVector;
+- float* outputVectorPtr = outputVector;
+- __m128 ret1, ret2;
+- __m256d inputVal1, inputVal2;
++ const double* inputVectorPtr = (const double*)inputVector;
++ float* outputVectorPtr = outputVector;
++ __m128 ret1, ret2;
++ __m256d inputVal1, inputVal2;
+
+- for(;number < oneEightPoints; number++){
+- inputVal1 = _mm256_loadu_pd(inputVectorPtr); inputVectorPtr += 4;
+- inputVal2 = _mm256_loadu_pd(inputVectorPtr); inputVectorPtr += 4;
++ for (; number < oneEightPoints; number++) {
++ inputVal1 = _mm256_loadu_pd(inputVectorPtr);
++ inputVectorPtr += 4;
++ inputVal2 = _mm256_loadu_pd(inputVectorPtr);
++ inputVectorPtr += 4;
+
+- ret1 = _mm256_cvtpd_ps(inputVal1);
+- ret2 = _mm256_cvtpd_ps(inputVal2);
++ ret1 = _mm256_cvtpd_ps(inputVal1);
++ ret2 = _mm256_cvtpd_ps(inputVal2);
+
+- _mm_storeu_ps(outputVectorPtr, ret1);
+- outputVectorPtr += 4;
++ _mm_storeu_ps(outputVectorPtr, ret1);
++ outputVectorPtr += 4;
+
+- _mm_storeu_ps(outputVectorPtr, ret2);
+- outputVectorPtr += 4;
+- }
++ _mm_storeu_ps(outputVectorPtr, ret2);
++ outputVectorPtr += 4;
++ }
+
+- number = oneEightPoints * 8;
+- for(; number < num_points; number++){
+- outputVector[number] = (float)(inputVector[number]);
+- }
++ number = oneEightPoints * 8;
++ for (; number < num_points; number++) {
++ outputVector[number] = (float)(inputVector[number]);
++ }
+ }
+ #endif /* LV_HAVE_AVX */
+
+@@ -140,53 +150,59 @@ static inline void volk_64f_convert_32f_u_avx(float* outputVector, const double*
+ #ifdef LV_HAVE_SSE2
+ #include <emmintrin.h>
+
+-static inline void volk_64f_convert_32f_u_sse2(float* outputVector, const double* inputVector, unsigned int num_points){
+- unsigned int number = 0;
++static inline void volk_64f_convert_32f_u_sse2(float* outputVector,
++ const double* inputVector,
++ unsigned int num_points)
++{
++ unsigned int number = 0;
+
+- const unsigned int quarterPoints = num_points / 4;
++ const unsigned int quarterPoints = num_points / 4;
+
+- const double* inputVectorPtr = (const double*)inputVector;
+- float* outputVectorPtr = outputVector;
+- __m128 ret, ret2;
+- __m128d inputVal1, inputVal2;
++ const double* inputVectorPtr = (const double*)inputVector;
++ float* outputVectorPtr = outputVector;
++ __m128 ret, ret2;
++ __m128d inputVal1, inputVal2;
+
+- for(;number < quarterPoints; number++){
+- inputVal1 = _mm_loadu_pd(inputVectorPtr); inputVectorPtr += 2;
+- inputVal2 = _mm_loadu_pd(inputVectorPtr); inputVectorPtr += 2;
++ for (; number < quarterPoints; number++) {
++ inputVal1 = _mm_loadu_pd(inputVectorPtr);
++ inputVectorPtr += 2;
++ inputVal2 = _mm_loadu_pd(inputVectorPtr);
++ inputVectorPtr += 2;
+
+- ret = _mm_cvtpd_ps(inputVal1);
+- ret2 = _mm_cvtpd_ps(inputVal2);
++ ret = _mm_cvtpd_ps(inputVal1);
++ ret2 = _mm_cvtpd_ps(inputVal2);
+
+- ret = _mm_movelh_ps(ret, ret2);
++ ret = _mm_movelh_ps(ret, ret2);
+
+- _mm_storeu_ps(outputVectorPtr, ret);
+- outputVectorPtr += 4;
+- }
++ _mm_storeu_ps(outputVectorPtr, ret);
++ outputVectorPtr += 4;
++ }
+
+- number = quarterPoints * 4;
+- for(; number < num_points; number++){
+- outputVector[number] = (float)(inputVector[number]);
+- }
++ number = quarterPoints * 4;
++ for (; number < num_points; number++) {
++ outputVector[number] = (float)(inputVector[number]);
++ }
+ }
+ #endif /* LV_HAVE_SSE2 */
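A detail worth noting in the SSE2 path: _mm_cvtpd_ps narrows two doubles into the low half of a 128-bit register, so the kernel converts two pairs and joins the halves with _mm_movelh_ps before issuing a single 4-float store. A standalone sketch of just that packing step (x86 compiler with SSE2 assumed):

/* Illustrative sketch: joining two 2-float conversion results with _mm_movelh_ps. */
#include <emmintrin.h>
#include <stdio.h>

int main(void)
{
    __m128d d0 = _mm_set_pd(1.5, 0.5);  /* {0.5, 1.5} */
    __m128d d1 = _mm_set_pd(3.5, 2.5);  /* {2.5, 3.5} */

    __m128 lo = _mm_cvtpd_ps(d0);       /* {0.5f, 1.5f, 0, 0} */
    __m128 hi = _mm_cvtpd_ps(d1);       /* {2.5f, 3.5f, 0, 0} */
    __m128 all = _mm_movelh_ps(lo, hi); /* {0.5f, 1.5f, 2.5f, 3.5f} */

    float out[4];
    _mm_storeu_ps(out, all);
    printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]); /* 0.5 1.5 2.5 3.5 */
    return 0;
}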
+
+
+ #ifdef LV_HAVE_GENERIC
+
+-static inline void volk_64f_convert_32f_generic(float* outputVector, const double* inputVector, unsigned int num_points){
+- float* outputVectorPtr = outputVector;
+- const double* inputVectorPtr = inputVector;
+- unsigned int number = 0;
+-
+- for(number = 0; number < num_points; number++){
+- *outputVectorPtr++ = ((float)(*inputVectorPtr++));
+- }
++static inline void volk_64f_convert_32f_generic(float* outputVector,
++ const double* inputVector,
++ unsigned int num_points)
++{
++ float* outputVectorPtr = outputVector;
++ const double* inputVectorPtr = inputVector;
++ unsigned int number = 0;
++
++ for (number = 0; number < num_points; number++) {
++ *outputVectorPtr++ = ((float)(*inputVectorPtr++));
++ }
+ }
+ #endif /* LV_HAVE_GENERIC */
+
+
+-
+-
+ #endif /* INCLUDED_volk_64f_convert_32f_u_H */
+ #ifndef INCLUDED_volk_64f_convert_32f_a_H
+ #define INCLUDED_volk_64f_convert_32f_a_H
+@@ -197,34 +213,39 @@ static inline void volk_64f_convert_32f_generic(float* outputVector, const doubl
+ #ifdef LV_HAVE_AVX512F
+ #include <immintrin.h>
+
+-static inline void volk_64f_convert_32f_a_avx512f(float* outputVector, const double* inputVector, unsigned int num_points){
+- unsigned int number = 0;
++static inline void volk_64f_convert_32f_a_avx512f(float* outputVector,
++ const double* inputVector,
++ unsigned int num_points)
++{
++ unsigned int number = 0;
+
+- const unsigned int oneSixteenthPoints = num_points / 16;
++ const unsigned int oneSixteenthPoints = num_points / 16;
+
+- const double* inputVectorPtr = (const double*)inputVector;
+- float* outputVectorPtr = outputVector;
+- __m256 ret1, ret2;
+- __m512d inputVal1, inputVal2;
++ const double* inputVectorPtr = (const double*)inputVector;
++ float* outputVectorPtr = outputVector;
++ __m256 ret1, ret2;
++ __m512d inputVal1, inputVal2;
+
+- for(;number < oneSixteenthPoints; number++){
+- inputVal1 = _mm512_load_pd(inputVectorPtr); inputVectorPtr += 8;
+- inputVal2 = _mm512_load_pd(inputVectorPtr); inputVectorPtr += 8;
++ for (; number < oneSixteenthPoints; number++) {
++ inputVal1 = _mm512_load_pd(inputVectorPtr);
++ inputVectorPtr += 8;
++ inputVal2 = _mm512_load_pd(inputVectorPtr);
++ inputVectorPtr += 8;
+
+- ret1 = _mm512_cvtpd_ps(inputVal1);
+- ret2 = _mm512_cvtpd_ps(inputVal2);
++ ret1 = _mm512_cvtpd_ps(inputVal1);
++ ret2 = _mm512_cvtpd_ps(inputVal2);
+
+- _mm256_store_ps(outputVectorPtr, ret1);
+- outputVectorPtr += 8;
++ _mm256_store_ps(outputVectorPtr, ret1);
++ outputVectorPtr += 8;
+
+- _mm256_store_ps(outputVectorPtr, ret2);
+- outputVectorPtr += 8;
+- }
++ _mm256_store_ps(outputVectorPtr, ret2);
++ outputVectorPtr += 8;
++ }
+
+- number = oneSixteenthPoints * 16;
+- for(; number < num_points; number++){
+- outputVector[number] = (float)(inputVector[number]);
+- }
++ number = oneSixteenthPoints * 16;
++ for (; number < num_points; number++) {
++ outputVector[number] = (float)(inputVector[number]);
++ }
+ }
+ #endif /* LV_HAVE_AVX512F */
+
+@@ -232,34 +253,39 @@ static inline void volk_64f_convert_32f_a_avx512f(float* outputVector, const dou
+ #ifdef LV_HAVE_AVX
+ #include <immintrin.h>
+
+-static inline void volk_64f_convert_32f_a_avx(float* outputVector, const double* inputVector, unsigned int num_points){
+- unsigned int number = 0;
++static inline void volk_64f_convert_32f_a_avx(float* outputVector,
++ const double* inputVector,
++ unsigned int num_points)
++{
++ unsigned int number = 0;
+
+- const unsigned int oneEightPoints = num_points / 8;
++ const unsigned int oneEightPoints = num_points / 8;
+
+- const double* inputVectorPtr = (const double*)inputVector;
+- float* outputVectorPtr = outputVector;
+- __m128 ret1, ret2;
+- __m256d inputVal1, inputVal2;
++ const double* inputVectorPtr = (const double*)inputVector;
++ float* outputVectorPtr = outputVector;
++ __m128 ret1, ret2;
++ __m256d inputVal1, inputVal2;
+
+- for(;number < oneEightPoints; number++){
+- inputVal1 = _mm256_load_pd(inputVectorPtr); inputVectorPtr += 4;
+- inputVal2 = _mm256_load_pd(inputVectorPtr); inputVectorPtr += 4;
++ for (; number < oneEightPoints; number++) {
++ inputVal1 = _mm256_load_pd(inputVectorPtr);
++ inputVectorPtr += 4;
++ inputVal2 = _mm256_load_pd(inputVectorPtr);
++ inputVectorPtr += 4;
+
+- ret1 = _mm256_cvtpd_ps(inputVal1);
+- ret2 = _mm256_cvtpd_ps(inputVal2);
++ ret1 = _mm256_cvtpd_ps(inputVal1);
++ ret2 = _mm256_cvtpd_ps(inputVal2);
+
+- _mm_store_ps(outputVectorPtr, ret1);
+- outputVectorPtr += 4;
++ _mm_store_ps(outputVectorPtr, ret1);
++ outputVectorPtr += 4;
+
+- _mm_store_ps(outputVectorPtr, ret2);
+- outputVectorPtr += 4;
+- }
++ _mm_store_ps(outputVectorPtr, ret2);
++ outputVectorPtr += 4;
++ }
+
+- number = oneEightPoints * 8;
+- for(; number < num_points; number++){
+- outputVector[number] = (float)(inputVector[number]);
+- }
++ number = oneEightPoints * 8;
++ for (; number < num_points; number++) {
++ outputVector[number] = (float)(inputVector[number]);
++ }
+ }
+ #endif /* LV_HAVE_AVX */
+
+@@ -267,51 +293,57 @@ static inline void volk_64f_convert_32f_a_avx(float* outputVector, const double*
+ #ifdef LV_HAVE_SSE2
+ #include <emmintrin.h>
+
+-static inline void volk_64f_convert_32f_a_sse2(float* outputVector, const double* inputVector, unsigned int num_points){
+- unsigned int number = 0;
++static inline void volk_64f_convert_32f_a_sse2(float* outputVector,
++ const double* inputVector,
++ unsigned int num_points)
++{
++ unsigned int number = 0;
+
+- const unsigned int quarterPoints = num_points / 4;
++ const unsigned int quarterPoints = num_points / 4;
+
+- const double* inputVectorPtr = (const double*)inputVector;
+- float* outputVectorPtr = outputVector;
+- __m128 ret, ret2;
+- __m128d inputVal1, inputVal2;
++ const double* inputVectorPtr = (const double*)inputVector;
++ float* outputVectorPtr = outputVector;
++ __m128 ret, ret2;
++ __m128d inputVal1, inputVal2;
+
+- for(;number < quarterPoints; number++){
+- inputVal1 = _mm_load_pd(inputVectorPtr); inputVectorPtr += 2;
+- inputVal2 = _mm_load_pd(inputVectorPtr); inputVectorPtr += 2;
++ for (; number < quarterPoints; number++) {
++ inputVal1 = _mm_load_pd(inputVectorPtr);
++ inputVectorPtr += 2;
++ inputVal2 = _mm_load_pd(inputVectorPtr);
++ inputVectorPtr += 2;
+
+- ret = _mm_cvtpd_ps(inputVal1);
+- ret2 = _mm_cvtpd_ps(inputVal2);
++ ret = _mm_cvtpd_ps(inputVal1);
++ ret2 = _mm_cvtpd_ps(inputVal2);
+
+- ret = _mm_movelh_ps(ret, ret2);
++ ret = _mm_movelh_ps(ret, ret2);
+
+- _mm_store_ps(outputVectorPtr, ret);
+- outputVectorPtr += 4;
+- }
++ _mm_store_ps(outputVectorPtr, ret);
++ outputVectorPtr += 4;
++ }
+
+- number = quarterPoints * 4;
+- for(; number < num_points; number++){
+- outputVector[number] = (float)(inputVector[number]);
+- }
++ number = quarterPoints * 4;
++ for (; number < num_points; number++) {
++ outputVector[number] = (float)(inputVector[number]);
++ }
+ }
+ #endif /* LV_HAVE_SSE2 */
+
+
+ #ifdef LV_HAVE_GENERIC
+
+-static inline void volk_64f_convert_32f_a_generic(float* outputVector, const double* inputVector, unsigned int num_points){
+- float* outputVectorPtr = outputVector;
+- const double* inputVectorPtr = inputVector;
+- unsigned int number = 0;
+-
+- for(number = 0; number < num_points; number++){
+- *outputVectorPtr++ = ((float)(*inputVectorPtr++));
+- }
++static inline void volk_64f_convert_32f_a_generic(float* outputVector,
++ const double* inputVector,
++ unsigned int num_points)
++{
++ float* outputVectorPtr = outputVector;
++ const double* inputVectorPtr = inputVector;
++ unsigned int number = 0;
++
++ for (number = 0; number < num_points; number++) {
++ *outputVectorPtr++ = ((float)(*inputVectorPtr++));
++ }
+ }
+ #endif /* LV_HAVE_GENERIC */
+
+
+-
+-
+ #endif /* INCLUDED_volk_64f_convert_32f_a_H */
+diff --git a/kernels/volk/volk_64f_x2_add_64f.h b/kernels/volk/volk_64f_x2_add_64f.h
+index 03b8e4c..5c512cc 100644
+--- a/kernels/volk/volk_64f_x2_add_64f.h
++++ b/kernels/volk/volk_64f_x2_add_64f.h
+@@ -31,8 +31,8 @@
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+- * void volk_64f_x2_add_64f(float* cVector, const float* aVector, const float* bVector, unsigned int num_points)
+- * \endcode
++ * void volk_64f_x2_add_64f(float* cVector, const float* aVector, const float* bVector,
++ * unsigned int num_points) \endcode
+ *
+ * \b Inputs
+ * \li aVector: First input vector.
+@@ -76,18 +76,19 @@
+
+ #ifdef LV_HAVE_GENERIC
+
+-static inline void
+-volk_64f_x2_add_64f_generic(double *cVector, const double *aVector,
+- const double *bVector, unsigned int num_points)
++static inline void volk_64f_x2_add_64f_generic(double* cVector,
++ const double* aVector,
++ const double* bVector,
++ unsigned int num_points)
+ {
+- double *cPtr = cVector;
+- const double *aPtr = aVector;
+- const double *bPtr = bVector;
+- unsigned int number = 0;
+-
+- for (number = 0; number < num_points; number++) {
+- *cPtr++ = (*aPtr++) + (*bPtr++);
+- }
++ double* cPtr = cVector;
++ const double* aPtr = aVector;
++ const double* bPtr = bVector;
++ unsigned int number = 0;
++
++ for (number = 0; number < num_points; number++) {
++ *cPtr++ = (*aPtr++) + (*bPtr++);
++ }
+ }
+
+ #endif /* LV_HAVE_GENERIC */
+@@ -100,35 +101,36 @@ volk_64f_x2_add_64f_generic(double *cVector, const double *aVector,
+
+ #include <emmintrin.h>
+
+-static inline void
+-volk_64f_x2_add_64f_u_sse2(double *cVector, const double *aVector,
+- const double *bVector, unsigned int num_points)
++static inline void volk_64f_x2_add_64f_u_sse2(double* cVector,
++ const double* aVector,
++ const double* bVector,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+- const unsigned int half_points = num_points / 2;
++ unsigned int number = 0;
++ const unsigned int half_points = num_points / 2;
+
+- double *cPtr = cVector;
+- const double *aPtr = aVector;
+- const double *bPtr = bVector;
++ double* cPtr = cVector;
++ const double* aPtr = aVector;
++ const double* bPtr = bVector;
+
+- __m128d aVal, bVal, cVal;
+- for (; number < half_points; number++) {
+- aVal = _mm_loadu_pd(aPtr);
+- bVal = _mm_loadu_pd(bPtr);
++ __m128d aVal, bVal, cVal;
++ for (; number < half_points; number++) {
++ aVal = _mm_loadu_pd(aPtr);
++ bVal = _mm_loadu_pd(bPtr);
+
+- cVal = _mm_add_pd(aVal, bVal);
++ cVal = _mm_add_pd(aVal, bVal);
+
+- _mm_storeu_pd(cPtr, cVal); // Store the results back into the C container
++ _mm_storeu_pd(cPtr, cVal); // Store the results back into the C container
+
+- aPtr += 2;
+- bPtr += 2;
+- cPtr += 2;
+- }
++ aPtr += 2;
++ bPtr += 2;
++ cPtr += 2;
++ }
+
+- number = half_points * 2;
+- for (; number < num_points; number++) {
+- *cPtr++ = (*aPtr++) + (*bPtr++);
+- }
++ number = half_points * 2;
++ for (; number < num_points; number++) {
++ *cPtr++ = (*aPtr++) + (*bPtr++);
++ }
+ }
+
+ #endif /* LV_HAVE_SSE2 */
+@@ -138,36 +140,37 @@ volk_64f_x2_add_64f_u_sse2(double *cVector, const double *aVector,
+
+ #include <immintrin.h>
+
+-static inline void
+-volk_64f_x2_add_64f_u_avx(double *cVector, const double *aVector,
+- const double *bVector, unsigned int num_points)
++static inline void volk_64f_x2_add_64f_u_avx(double* cVector,
++ const double* aVector,
++ const double* bVector,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+- const unsigned int quarter_points = num_points / 4;
++ unsigned int number = 0;
++ const unsigned int quarter_points = num_points / 4;
+
+- double *cPtr = cVector;
+- const double *aPtr = aVector;
+- const double *bPtr = bVector;
++ double* cPtr = cVector;
++ const double* aPtr = aVector;
++ const double* bPtr = bVector;
+
+- __m256d aVal, bVal, cVal;
+- for (; number < quarter_points; number++) {
++ __m256d aVal, bVal, cVal;
++ for (; number < quarter_points; number++) {
+
+- aVal = _mm256_loadu_pd(aPtr);
+- bVal = _mm256_loadu_pd(bPtr);
++ aVal = _mm256_loadu_pd(aPtr);
++ bVal = _mm256_loadu_pd(bPtr);
+
+- cVal = _mm256_add_pd(aVal, bVal);
++ cVal = _mm256_add_pd(aVal, bVal);
+
+- _mm256_storeu_pd(cPtr, cVal); // Store the results back into the C container
++ _mm256_storeu_pd(cPtr, cVal); // Store the results back into the C container
+
+- aPtr += 4;
+- bPtr += 4;
+- cPtr += 4;
+- }
++ aPtr += 4;
++ bPtr += 4;
++ cPtr += 4;
++ }
+
+- number = quarter_points * 4;
+- for (; number < num_points; number++) {
+- *cPtr++ = (*aPtr++) + (*bPtr++);
+- }
++ number = quarter_points * 4;
++ for (; number < num_points; number++) {
++ *cPtr++ = (*aPtr++) + (*bPtr++);
++ }
+ }
+
+ #endif /* LV_HAVE_AVX */
+@@ -180,35 +183,36 @@ volk_64f_x2_add_64f_u_avx(double *cVector, const double *aVector,
+
+ #include <emmintrin.h>
+
+-static inline void
+-volk_64f_x2_add_64f_a_sse2(double *cVector, const double *aVector,
+- const double *bVector, unsigned int num_points)
++static inline void volk_64f_x2_add_64f_a_sse2(double* cVector,
++ const double* aVector,
++ const double* bVector,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+- const unsigned int half_points = num_points / 2;
++ unsigned int number = 0;
++ const unsigned int half_points = num_points / 2;
+
+- double *cPtr = cVector;
+- const double *aPtr = aVector;
+- const double *bPtr = bVector;
++ double* cPtr = cVector;
++ const double* aPtr = aVector;
++ const double* bPtr = bVector;
+
+- __m128d aVal, bVal, cVal;
+- for (; number < half_points; number++) {
+- aVal = _mm_load_pd(aPtr);
+- bVal = _mm_load_pd(bPtr);
++ __m128d aVal, bVal, cVal;
++ for (; number < half_points; number++) {
++ aVal = _mm_load_pd(aPtr);
++ bVal = _mm_load_pd(bPtr);
+
+- cVal = _mm_add_pd(aVal, bVal);
++ cVal = _mm_add_pd(aVal, bVal);
+
+- _mm_store_pd(cPtr, cVal); // Store the results back into the C container
++ _mm_store_pd(cPtr, cVal); // Store the results back into the C container
+
+- aPtr += 2;
+- bPtr += 2;
+- cPtr += 2;
+- }
++ aPtr += 2;
++ bPtr += 2;
++ cPtr += 2;
++ }
+
+- number = half_points * 2;
+- for (; number < num_points; number++) {
+- *cPtr++ = (*aPtr++) + (*bPtr++);
+- }
++ number = half_points * 2;
++ for (; number < num_points; number++) {
++ *cPtr++ = (*aPtr++) + (*bPtr++);
++ }
+ }
+
+ #endif /* LV_HAVE_SSE2 */
+@@ -218,36 +222,37 @@ volk_64f_x2_add_64f_a_sse2(double *cVector, const double *aVector,
+
+ #include <immintrin.h>
+
+-static inline void
+-volk_64f_x2_add_64f_a_avx(double *cVector, const double *aVector,
+- const double *bVector, unsigned int num_points)
++static inline void volk_64f_x2_add_64f_a_avx(double* cVector,
++ const double* aVector,
++ const double* bVector,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+- const unsigned int quarter_points = num_points / 4;
++ unsigned int number = 0;
++ const unsigned int quarter_points = num_points / 4;
+
+- double *cPtr = cVector;
+- const double *aPtr = aVector;
+- const double *bPtr = bVector;
++ double* cPtr = cVector;
++ const double* aPtr = aVector;
++ const double* bPtr = bVector;
+
+- __m256d aVal, bVal, cVal;
+- for (; number < quarter_points; number++) {
++ __m256d aVal, bVal, cVal;
++ for (; number < quarter_points; number++) {
+
+- aVal = _mm256_load_pd(aPtr);
+- bVal = _mm256_load_pd(bPtr);
++ aVal = _mm256_load_pd(aPtr);
++ bVal = _mm256_load_pd(bPtr);
+
+- cVal = _mm256_add_pd(aVal, bVal);
++ cVal = _mm256_add_pd(aVal, bVal);
+
+- _mm256_store_pd(cPtr, cVal); // Store the results back into the C container
++ _mm256_store_pd(cPtr, cVal); // Store the results back into the C container
+
+- aPtr += 4;
+- bPtr += 4;
+- cPtr += 4;
+- }
++ aPtr += 4;
++ bPtr += 4;
++ cPtr += 4;
++ }
+
+- number = quarter_points * 4;
+- for (; number < num_points; number++) {
+- *cPtr++ = (*aPtr++) + (*bPtr++);
+- }
++ number = quarter_points * 4;
++ for (; number < num_points; number++) {
++ *cPtr++ = (*aPtr++) + (*bPtr++);
++ }
+ }
+
+ #endif /* LV_HAVE_AVX */
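The _a_ variants in this file use aligned loads and stores (_mm_load_pd, _mm256_load_pd, ...), so they require buffers aligned to volk_get_alignment(), which volk_malloc guarantees; the _u_ variants accept arbitrary pointers at some cost. A hedged usage sketch, assuming an installed VOLK that exposes the dispatcher volk_64f_x2_add_64f:

/* Usage sketch: element-wise double addition through the VOLK dispatcher.
   Aligned buffers from volk_malloc let the dispatcher choose an _a_ kernel. */
#include <stdio.h>
#include <volk/volk.h>

int main(void)
{
    const unsigned int N = 1000; /* deliberately not a multiple of 4 */
    double* a = (double*)volk_malloc(N * sizeof(double), volk_get_alignment());
    double* b = (double*)volk_malloc(N * sizeof(double), volk_get_alignment());
    double* c = (double*)volk_malloc(N * sizeof(double), volk_get_alignment());

    for (unsigned int i = 0; i < N; i++) {
        a[i] = (double)i;
        b[i] = 0.5;
    }

    volk_64f_x2_add_64f(c, a, b, N); /* SIMD body plus scalar tail for the last 0-3 */

    printf("c[999] = %f\n", c[999]); /* expected 999.500000 */

    volk_free(a);
    volk_free(b);
    volk_free(c);
    return 0;
}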
+diff --git a/kernels/volk/volk_64f_x2_max_64f.h b/kernels/volk/volk_64f_x2_max_64f.h
+index d4464b7..8f7f743 100644
+--- a/kernels/volk/volk_64f_x2_max_64f.h
++++ b/kernels/volk/volk_64f_x2_max_64f.h
+@@ -32,8 +32,8 @@
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+- * void volk_64f_x2_max_64f(double* cVector, const double* aVector, const double* bVector, unsigned int num_points)
+- * \endcode
++ * void volk_64f_x2_max_64f(double* cVector, const double* aVector, const double* bVector,
++ * unsigned int num_points) \endcode
+ *
+ * \b Inputs
+ * \li aVector: First input vector.
+@@ -77,38 +77,39 @@
+ #ifdef LV_HAVE_AVX512F
+ #include <immintrin.h>
+
+-static inline void
+-volk_64f_x2_max_64f_a_avx512f(double* cVector, const double* aVector,
+- const double* bVector, unsigned int num_points)
++static inline void volk_64f_x2_max_64f_a_avx512f(double* cVector,
++ const double* aVector,
++ const double* bVector,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+- const unsigned int eigthPoints = num_points / 8;
++ unsigned int number = 0;
++ const unsigned int eigthPoints = num_points / 8;
+
+- double* cPtr = cVector;
+- const double* aPtr = aVector;
+- const double* bPtr= bVector;
++ double* cPtr = cVector;
++ const double* aPtr = aVector;
++ const double* bPtr = bVector;
+
+- __m512d aVal, bVal, cVal;
+- for(;number < eigthPoints; number++){
++ __m512d aVal, bVal, cVal;
++ for (; number < eigthPoints; number++) {
+
+- aVal = _mm512_load_pd(aPtr);
+- bVal = _mm512_load_pd(bPtr);
++ aVal = _mm512_load_pd(aPtr);
++ bVal = _mm512_load_pd(bPtr);
+
+- cVal = _mm512_max_pd(aVal, bVal);
++ cVal = _mm512_max_pd(aVal, bVal);
+
+- _mm512_store_pd(cPtr,cVal); // Store the results back into the C container
++ _mm512_store_pd(cPtr, cVal); // Store the results back into the C container
+
+- aPtr += 8;
+- bPtr += 8;
+- cPtr += 8;
+- }
++ aPtr += 8;
++ bPtr += 8;
++ cPtr += 8;
++ }
+
+- number = eigthPoints * 8;
+- for(;number < num_points; number++){
+- const double a = *aPtr++;
+- const double b = *bPtr++;
+- *cPtr++ = ( a > b ? a : b);
+- }
++ number = eigthPoints * 8;
++ for (; number < num_points; number++) {
++ const double a = *aPtr++;
++ const double b = *bPtr++;
++ *cPtr++ = (a > b ? a : b);
++ }
+ }
+ #endif /* LV_HAVE_AVX512F */
+
+@@ -116,38 +117,39 @@ volk_64f_x2_max_64f_a_avx512f(double* cVector, const double* aVector,
+ #ifdef LV_HAVE_AVX
+ #include <immintrin.h>
+
+-static inline void
+-volk_64f_x2_max_64f_a_avx(double* cVector, const double* aVector,
+- const double* bVector, unsigned int num_points)
++static inline void volk_64f_x2_max_64f_a_avx(double* cVector,
++ const double* aVector,
++ const double* bVector,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+- const unsigned int quarterPoints = num_points / 4;
++ unsigned int number = 0;
++ const unsigned int quarterPoints = num_points / 4;
+
+- double* cPtr = cVector;
+- const double* aPtr = aVector;
+- const double* bPtr= bVector;
++ double* cPtr = cVector;
++ const double* aPtr = aVector;
++ const double* bPtr = bVector;
+
+- __m256d aVal, bVal, cVal;
+- for(;number < quarterPoints; number++){
++ __m256d aVal, bVal, cVal;
++ for (; number < quarterPoints; number++) {
+
+- aVal = _mm256_load_pd(aPtr);
+- bVal = _mm256_load_pd(bPtr);
++ aVal = _mm256_load_pd(aPtr);
++ bVal = _mm256_load_pd(bPtr);
+
+- cVal = _mm256_max_pd(aVal, bVal);
++ cVal = _mm256_max_pd(aVal, bVal);
+
+- _mm256_store_pd(cPtr,cVal); // Store the results back into the C container
++ _mm256_store_pd(cPtr, cVal); // Store the results back into the C container
+
+- aPtr += 4;
+- bPtr += 4;
+- cPtr += 4;
+- }
++ aPtr += 4;
++ bPtr += 4;
++ cPtr += 4;
++ }
+
+- number = quarterPoints * 4;
+- for(;number < num_points; number++){
+- const double a = *aPtr++;
+- const double b = *bPtr++;
+- *cPtr++ = ( a > b ? a : b);
+- }
++ number = quarterPoints * 4;
++ for (; number < num_points; number++) {
++ const double a = *aPtr++;
++ const double b = *bPtr++;
++ *cPtr++ = (a > b ? a : b);
++ }
+ }
+ #endif /* LV_HAVE_AVX */
+
+@@ -155,58 +157,60 @@ volk_64f_x2_max_64f_a_avx(double* cVector, const double* aVector,
+ #ifdef LV_HAVE_SSE2
+ #include <emmintrin.h>
+
+-static inline void
+-volk_64f_x2_max_64f_a_sse2(double* cVector, const double* aVector,
+- const double* bVector, unsigned int num_points)
++static inline void volk_64f_x2_max_64f_a_sse2(double* cVector,
++ const double* aVector,
++ const double* bVector,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+- const unsigned int halfPoints = num_points / 2;
++ unsigned int number = 0;
++ const unsigned int halfPoints = num_points / 2;
+
+- double* cPtr = cVector;
+- const double* aPtr = aVector;
+- const double* bPtr= bVector;
++ double* cPtr = cVector;
++ const double* aPtr = aVector;
++ const double* bPtr = bVector;
+
+- __m128d aVal, bVal, cVal;
+- for(;number < halfPoints; number++){
++ __m128d aVal, bVal, cVal;
++ for (; number < halfPoints; number++) {
+
+- aVal = _mm_load_pd(aPtr);
+- bVal = _mm_load_pd(bPtr);
++ aVal = _mm_load_pd(aPtr);
++ bVal = _mm_load_pd(bPtr);
+
+- cVal = _mm_max_pd(aVal, bVal);
++ cVal = _mm_max_pd(aVal, bVal);
+
+- _mm_store_pd(cPtr,cVal); // Store the results back into the C container
++ _mm_store_pd(cPtr, cVal); // Store the results back into the C container
+
+- aPtr += 2;
+- bPtr += 2;
+- cPtr += 2;
+- }
++ aPtr += 2;
++ bPtr += 2;
++ cPtr += 2;
++ }
+
+- number = halfPoints * 2;
+- for(;number < num_points; number++){
+- const double a = *aPtr++;
+- const double b = *bPtr++;
+- *cPtr++ = ( a > b ? a : b);
+- }
++ number = halfPoints * 2;
++ for (; number < num_points; number++) {
++ const double a = *aPtr++;
++ const double b = *bPtr++;
++ *cPtr++ = (a > b ? a : b);
++ }
+ }
+ #endif /* LV_HAVE_SSE2 */
+
+
+ #ifdef LV_HAVE_GENERIC
+
+-static inline void
+-volk_64f_x2_max_64f_generic(double* cVector, const double* aVector,
+- const double* bVector, unsigned int num_points)
++static inline void volk_64f_x2_max_64f_generic(double* cVector,
++ const double* aVector,
++ const double* bVector,
++ unsigned int num_points)
+ {
+- double* cPtr = cVector;
+- const double* aPtr = aVector;
+- const double* bPtr= bVector;
+- unsigned int number = 0;
+-
+- for(number = 0; number < num_points; number++){
+- const double a = *aPtr++;
+- const double b = *bPtr++;
+- *cPtr++ = ( a > b ? a : b);
+- }
++ double* cPtr = cVector;
++ const double* aPtr = aVector;
++ const double* bPtr = bVector;
++ unsigned int number = 0;
++
++ for (number = 0; number < num_points; number++) {
++ const double a = *aPtr++;
++ const double b = *bPtr++;
++ *cPtr++ = (a > b ? a : b);
++ }
+ }
+ #endif /* LV_HAVE_GENERIC */
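One subtlety of the scalar fallback (a > b ? a : b): when either input is NaN the comparison is false, so the second operand is returned, which matches the documented maxpd behaviour used by the SIMD branches (the second source operand is forwarded when either input is NaN) but differs from fmax(), which prefers the non-NaN argument. A tiny check of that difference (plain C99 with <math.h>; link with -lm on most systems):

/* Illustrative sketch: NaN handling of the ternary max versus fmax(). */
#include <math.h>
#include <stdio.h>

int main(void)
{
    double a = NAN, b = 1.0;
    printf("ternary: %g\n", a > b ? a : b); /* 1 (second operand wins)        */
    printf("fmax   : %g\n", fmax(a, b));    /* 1                              */

    a = 1.0;
    b = NAN;
    printf("ternary: %g\n", a > b ? a : b); /* nan (second operand wins)      */
    printf("fmax   : %g\n", fmax(a, b));    /* 1 (fmax ignores the NaN input) */
    return 0;
}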
+
+@@ -223,38 +227,39 @@ volk_64f_x2_max_64f_generic(double* cVector, const double* aVector,
+ #ifdef LV_HAVE_AVX512F
+ #include <immintrin.h>
+
+-static inline void
+-volk_64f_x2_max_64f_u_avx512f(double* cVector, const double* aVector,
+- const double* bVector, unsigned int num_points)
++static inline void volk_64f_x2_max_64f_u_avx512f(double* cVector,
++ const double* aVector,
++ const double* bVector,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+- const unsigned int eigthPoints = num_points / 8;
++ unsigned int number = 0;
++ const unsigned int eigthPoints = num_points / 8;
+
+- double* cPtr = cVector;
+- const double* aPtr = aVector;
+- const double* bPtr= bVector;
++ double* cPtr = cVector;
++ const double* aPtr = aVector;
++ const double* bPtr = bVector;
+
+- __m512d aVal, bVal, cVal;
+- for(;number < eigthPoints; number++){
++ __m512d aVal, bVal, cVal;
++ for (; number < eigthPoints; number++) {
+
+- aVal = _mm512_loadu_pd(aPtr);
+- bVal = _mm512_loadu_pd(bPtr);
++ aVal = _mm512_loadu_pd(aPtr);
++ bVal = _mm512_loadu_pd(bPtr);
+
+- cVal = _mm512_max_pd(aVal, bVal);
++ cVal = _mm512_max_pd(aVal, bVal);
+
+- _mm512_storeu_pd(cPtr,cVal); // Store the results back into the C container
++ _mm512_storeu_pd(cPtr, cVal); // Store the results back into the C container
+
+- aPtr += 8;
+- bPtr += 8;
+- cPtr += 8;
+- }
++ aPtr += 8;
++ bPtr += 8;
++ cPtr += 8;
++ }
+
+- number = eigthPoints * 8;
+- for(;number < num_points; number++){
+- const double a = *aPtr++;
+- const double b = *bPtr++;
+- *cPtr++ = ( a > b ? a : b);
+- }
++ number = eigthPoints * 8;
++ for (; number < num_points; number++) {
++ const double a = *aPtr++;
++ const double b = *bPtr++;
++ *cPtr++ = (a > b ? a : b);
++ }
+ }
+ #endif /* LV_HAVE_AVX512F */
+
+@@ -262,38 +267,39 @@ volk_64f_x2_max_64f_u_avx512f(double* cVector, const double* aVector,
+ #ifdef LV_HAVE_AVX
+ #include <immintrin.h>
+
+-static inline void
+-volk_64f_x2_max_64f_u_avx(double* cVector, const double* aVector,
+- const double* bVector, unsigned int num_points)
++static inline void volk_64f_x2_max_64f_u_avx(double* cVector,
++ const double* aVector,
++ const double* bVector,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+- const unsigned int quarterPoints = num_points / 4;
++ unsigned int number = 0;
++ const unsigned int quarterPoints = num_points / 4;
+
+- double* cPtr = cVector;
+- const double* aPtr = aVector;
+- const double* bPtr= bVector;
++ double* cPtr = cVector;
++ const double* aPtr = aVector;
++ const double* bPtr = bVector;
+
+- __m256d aVal, bVal, cVal;
+- for(;number < quarterPoints; number++){
++ __m256d aVal, bVal, cVal;
++ for (; number < quarterPoints; number++) {
+
+- aVal = _mm256_loadu_pd(aPtr);
+- bVal = _mm256_loadu_pd(bPtr);
++ aVal = _mm256_loadu_pd(aPtr);
++ bVal = _mm256_loadu_pd(bPtr);
+
+- cVal = _mm256_max_pd(aVal, bVal);
++ cVal = _mm256_max_pd(aVal, bVal);
+
+- _mm256_storeu_pd(cPtr,cVal); // Store the results back into the C container
++ _mm256_storeu_pd(cPtr, cVal); // Store the results back into the C container
+
+- aPtr += 4;
+- bPtr += 4;
+- cPtr += 4;
+- }
++ aPtr += 4;
++ bPtr += 4;
++ cPtr += 4;
++ }
+
+- number = quarterPoints * 4;
+- for(;number < num_points; number++){
+- const double a = *aPtr++;
+- const double b = *bPtr++;
+- *cPtr++ = ( a > b ? a : b);
+- }
++ number = quarterPoints * 4;
++ for (; number < num_points; number++) {
++ const double a = *aPtr++;
++ const double b = *bPtr++;
++ *cPtr++ = (a > b ? a : b);
++ }
+ }
+ #endif /* LV_HAVE_AVX */
+
+diff --git a/kernels/volk/volk_64f_x2_min_64f.h b/kernels/volk/volk_64f_x2_min_64f.h
+index 0ffa305..7dc4d59 100644
+--- a/kernels/volk/volk_64f_x2_min_64f.h
++++ b/kernels/volk/volk_64f_x2_min_64f.h
+@@ -32,7 +32,8 @@
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+- * void volk_64f_x2_min_64f(double* cVector, const double* aVector, const double* bVector, unsigned int num_points)
++ * void volk_64f_x2_min_64f(double* cVector, const double* aVector, const double* bVector,
++ unsigned int num_points)
+ * \endcode
+ *
+ * \b Inputs
+@@ -77,38 +78,39 @@
+ #ifdef LV_HAVE_AVX512F
+ #include <immintrin.h>
+
+-static inline void
+-volk_64f_x2_min_64f_a_avx512f(double* cVector, const double* aVector,
+- const double* bVector, unsigned int num_points)
++static inline void volk_64f_x2_min_64f_a_avx512f(double* cVector,
++ const double* aVector,
++ const double* bVector,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+- const unsigned int eigthPoints = num_points / 8;
++ unsigned int number = 0;
++ const unsigned int eigthPoints = num_points / 8;
+
+- double* cPtr = cVector;
+- const double* aPtr = aVector;
+- const double* bPtr= bVector;
++ double* cPtr = cVector;
++ const double* aPtr = aVector;
++ const double* bPtr = bVector;
+
+- __m512d aVal, bVal, cVal;
+- for(;number < eigthPoints; number++){
++ __m512d aVal, bVal, cVal;
++ for (; number < eigthPoints; number++) {
+
+- aVal = _mm512_load_pd(aPtr);
+- bVal = _mm512_load_pd(bPtr);
++ aVal = _mm512_load_pd(aPtr);
++ bVal = _mm512_load_pd(bPtr);
+
+- cVal = _mm512_min_pd(aVal, bVal);
++ cVal = _mm512_min_pd(aVal, bVal);
+
+- _mm512_store_pd(cPtr,cVal); // Store the results back into the C container
++ _mm512_store_pd(cPtr, cVal); // Store the results back into the C container
+
+- aPtr += 8;
+- bPtr += 8;
+- cPtr += 8;
+- }
++ aPtr += 8;
++ bPtr += 8;
++ cPtr += 8;
++ }
+
+- number = eigthPoints * 8;
+- for(;number < num_points; number++){
+- const double a = *aPtr++;
+- const double b = *bPtr++;
+- *cPtr++ = ( a < b ? a : b);
+- }
++ number = eigthPoints * 8;
++ for (; number < num_points; number++) {
++ const double a = *aPtr++;
++ const double b = *bPtr++;
++ *cPtr++ = (a < b ? a : b);
++ }
+ }
+ #endif /* LV_HAVE_AVX512F */
+
+@@ -116,38 +118,39 @@ volk_64f_x2_min_64f_a_avx512f(double* cVector, const double* aVector,
+ #ifdef LV_HAVE_AVX
+ #include <immintrin.h>
+
+-static inline void
+-volk_64f_x2_min_64f_a_avx(double* cVector, const double* aVector,
+- const double* bVector, unsigned int num_points)
++static inline void volk_64f_x2_min_64f_a_avx(double* cVector,
++ const double* aVector,
++ const double* bVector,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+- const unsigned int quarterPoints = num_points / 4;
++ unsigned int number = 0;
++ const unsigned int quarterPoints = num_points / 4;
+
+- double* cPtr = cVector;
+- const double* aPtr = aVector;
+- const double* bPtr= bVector;
++ double* cPtr = cVector;
++ const double* aPtr = aVector;
++ const double* bPtr = bVector;
+
+- __m256d aVal, bVal, cVal;
+- for(;number < quarterPoints; number++){
++ __m256d aVal, bVal, cVal;
++ for (; number < quarterPoints; number++) {
+
+- aVal = _mm256_load_pd(aPtr);
+- bVal = _mm256_load_pd(bPtr);
++ aVal = _mm256_load_pd(aPtr);
++ bVal = _mm256_load_pd(bPtr);
+
+- cVal = _mm256_min_pd(aVal, bVal);
++ cVal = _mm256_min_pd(aVal, bVal);
+
+- _mm256_store_pd(cPtr,cVal); // Store the results back into the C container
++ _mm256_store_pd(cPtr, cVal); // Store the results back into the C container
+
+- aPtr += 4;
+- bPtr += 4;
+- cPtr += 4;
+- }
++ aPtr += 4;
++ bPtr += 4;
++ cPtr += 4;
++ }
+
+- number = quarterPoints * 4;
+- for(;number < num_points; number++){
+- const double a = *aPtr++;
+- const double b = *bPtr++;
+- *cPtr++ = ( a < b ? a : b);
+- }
++ number = quarterPoints * 4;
++ for (; number < num_points; number++) {
++ const double a = *aPtr++;
++ const double b = *bPtr++;
++ *cPtr++ = (a < b ? a : b);
++ }
+ }
+ #endif /* LV_HAVE_AVX */
+
+@@ -155,58 +158,60 @@ volk_64f_x2_min_64f_a_avx(double* cVector, const double* aVector,
+ #ifdef LV_HAVE_SSE2
+ #include <emmintrin.h>
+
+-static inline void
+-volk_64f_x2_min_64f_a_sse2(double* cVector, const double* aVector,
+- const double* bVector, unsigned int num_points)
++static inline void volk_64f_x2_min_64f_a_sse2(double* cVector,
++ const double* aVector,
++ const double* bVector,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+- const unsigned int halfPoints = num_points / 2;
++ unsigned int number = 0;
++ const unsigned int halfPoints = num_points / 2;
+
+- double* cPtr = cVector;
+- const double* aPtr = aVector;
+- const double* bPtr= bVector;
++ double* cPtr = cVector;
++ const double* aPtr = aVector;
++ const double* bPtr = bVector;
+
+- __m128d aVal, bVal, cVal;
+- for(;number < halfPoints; number++){
++ __m128d aVal, bVal, cVal;
++ for (; number < halfPoints; number++) {
+
+- aVal = _mm_load_pd(aPtr);
+- bVal = _mm_load_pd(bPtr);
++ aVal = _mm_load_pd(aPtr);
++ bVal = _mm_load_pd(bPtr);
+
+- cVal = _mm_min_pd(aVal, bVal);
++ cVal = _mm_min_pd(aVal, bVal);
+
+- _mm_store_pd(cPtr,cVal); // Store the results back into the C container
++ _mm_store_pd(cPtr, cVal); // Store the results back into the C container
+
+- aPtr += 2;
+- bPtr += 2;
+- cPtr += 2;
+- }
++ aPtr += 2;
++ bPtr += 2;
++ cPtr += 2;
++ }
+
+- number = halfPoints * 2;
+- for(;number < num_points; number++){
+- const double a = *aPtr++;
+- const double b = *bPtr++;
+- *cPtr++ = ( a < b ? a : b);
+- }
++ number = halfPoints * 2;
++ for (; number < num_points; number++) {
++ const double a = *aPtr++;
++ const double b = *bPtr++;
++ *cPtr++ = (a < b ? a : b);
++ }
+ }
+ #endif /* LV_HAVE_SSE2 */
+
+
+ #ifdef LV_HAVE_GENERIC
+
+-static inline void
+-volk_64f_x2_min_64f_generic(double* cVector, const double* aVector,
+- const double* bVector, unsigned int num_points)
++static inline void volk_64f_x2_min_64f_generic(double* cVector,
++ const double* aVector,
++ const double* bVector,
++ unsigned int num_points)
+ {
+- double* cPtr = cVector;
+- const double* aPtr = aVector;
+- const double* bPtr= bVector;
+- unsigned int number = 0;
+-
+- for(number = 0; number < num_points; number++){
+- const double a = *aPtr++;
+- const double b = *bPtr++;
+- *cPtr++ = ( a < b ? a : b);
+- }
++ double* cPtr = cVector;
++ const double* aPtr = aVector;
++ const double* bPtr = bVector;
++ unsigned int number = 0;
++
++ for (number = 0; number < num_points; number++) {
++ const double a = *aPtr++;
++ const double b = *bPtr++;
++ *cPtr++ = (a < b ? a : b);
++ }
+ }
+ #endif /* LV_HAVE_GENERIC */
+
+@@ -222,38 +227,39 @@ volk_64f_x2_min_64f_generic(double* cVector, const double* aVector,
+ #ifdef LV_HAVE_AVX512F
+ #include <immintrin.h>
+
+-static inline void
+-volk_64f_x2_min_64f_u_avx512f(double* cVector, const double* aVector,
+- const double* bVector, unsigned int num_points)
++static inline void volk_64f_x2_min_64f_u_avx512f(double* cVector,
++ const double* aVector,
++ const double* bVector,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+- const unsigned int eigthPoints = num_points / 8;
++ unsigned int number = 0;
++ const unsigned int eigthPoints = num_points / 8;
+
+- double* cPtr = cVector;
+- const double* aPtr = aVector;
+- const double* bPtr= bVector;
++ double* cPtr = cVector;
++ const double* aPtr = aVector;
++ const double* bPtr = bVector;
+
+- __m512d aVal, bVal, cVal;
+- for(;number < eigthPoints; number++){
++ __m512d aVal, bVal, cVal;
++ for (; number < eigthPoints; number++) {
+
+- aVal = _mm512_loadu_pd(aPtr);
+- bVal = _mm512_loadu_pd(bPtr);
++ aVal = _mm512_loadu_pd(aPtr);
++ bVal = _mm512_loadu_pd(bPtr);
+
+- cVal = _mm512_min_pd(aVal, bVal);
++ cVal = _mm512_min_pd(aVal, bVal);
+
+- _mm512_storeu_pd(cPtr,cVal); // Store the results back into the C container
++ _mm512_storeu_pd(cPtr, cVal); // Store the results back into the C container
+
+- aPtr += 8;
+- bPtr += 8;
+- cPtr += 8;
+- }
++ aPtr += 8;
++ bPtr += 8;
++ cPtr += 8;
++ }
+
+- number = eigthPoints * 8;
+- for(;number < num_points; number++){
+- const double a = *aPtr++;
+- const double b = *bPtr++;
+- *cPtr++ = ( a < b ? a : b);
+- }
++ number = eigthPoints * 8;
++ for (; number < num_points; number++) {
++ const double a = *aPtr++;
++ const double b = *bPtr++;
++ *cPtr++ = (a < b ? a : b);
++ }
+ }
+ #endif /* LV_HAVE_AVX512F */
+
+@@ -261,38 +267,39 @@ volk_64f_x2_min_64f_u_avx512f(double* cVector, const double* aVector,
+ #ifdef LV_HAVE_AVX
+ #include <immintrin.h>
+
+-static inline void
+-volk_64f_x2_min_64f_u_avx(double* cVector, const double* aVector,
+- const double* bVector, unsigned int num_points)
++static inline void volk_64f_x2_min_64f_u_avx(double* cVector,
++ const double* aVector,
++ const double* bVector,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+- const unsigned int quarterPoints = num_points / 4;
++ unsigned int number = 0;
++ const unsigned int quarterPoints = num_points / 4;
+
+- double* cPtr = cVector;
+- const double* aPtr = aVector;
+- const double* bPtr= bVector;
++ double* cPtr = cVector;
++ const double* aPtr = aVector;
++ const double* bPtr = bVector;
+
+- __m256d aVal, bVal, cVal;
+- for(;number < quarterPoints; number++){
++ __m256d aVal, bVal, cVal;
++ for (; number < quarterPoints; number++) {
+
+- aVal = _mm256_loadu_pd(aPtr);
+- bVal = _mm256_loadu_pd(bPtr);
++ aVal = _mm256_loadu_pd(aPtr);
++ bVal = _mm256_loadu_pd(bPtr);
+
+- cVal = _mm256_min_pd(aVal, bVal);
++ cVal = _mm256_min_pd(aVal, bVal);
+
+- _mm256_storeu_pd(cPtr,cVal); // Store the results back into the C container
++ _mm256_storeu_pd(cPtr, cVal); // Store the results back into the C container
+
+- aPtr += 4;
+- bPtr += 4;
+- cPtr += 4;
+- }
++ aPtr += 4;
++ bPtr += 4;
++ cPtr += 4;
++ }
+
+- number = quarterPoints * 4;
+- for(;number < num_points; number++){
+- const double a = *aPtr++;
+- const double b = *bPtr++;
+- *cPtr++ = ( a < b ? a : b);
+- }
++ number = quarterPoints * 4;
++ for (; number < num_points; number++) {
++ const double a = *aPtr++;
++ const double b = *bPtr++;
++ *cPtr++ = (a < b ? a : b);
++ }
+ }
+ #endif /* LV_HAVE_AVX */
+
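+As a cross-check that the reindented volk_64f_x2_min_64f kernels above are behaviorally unchanged, the following is a standalone restatement of their semantics (element-wise minimum over num_points doubles). It is a sketch only, not library code: the helper name min_64f_reference is illustrative, and no VOLK headers or LV_HAVE_* macros are assumed.
+
+#include <stdio.h>
+
+/* c[i] = min(a[i], b[i]) -- the result every _generic/_sse2/_avx/_avx512f
+ * variant above must produce (sketch, not a VOLK symbol). */
+static void min_64f_reference(double* c, const double* a, const double* b, unsigned int n)
+{
+    for (unsigned int i = 0; i < n; i++)
+        c[i] = (a[i] < b[i]) ? a[i] : b[i];
+}
+
+int main(void)
+{
+    double a[4] = { 1.0, -2.0, 3.5, 0.0 };
+    double b[4] = { 0.5, -1.0, 4.0, -0.5 };
+    double c[4];
+    min_64f_reference(c, a, b, 4);
+    for (int i = 0; i < 4; i++)
+        printf("%g ", c[i]); /* expected: 0.5 -2 3.5 -0.5 */
+    printf("\n");
+    return 0;
+}
+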
+diff --git a/kernels/volk/volk_64f_x2_multiply_64f.h b/kernels/volk/volk_64f_x2_multiply_64f.h
+index 6fa9e8e..39a155d 100644
+--- a/kernels/volk/volk_64f_x2_multiply_64f.h
++++ b/kernels/volk/volk_64f_x2_multiply_64f.h
+@@ -31,8 +31,8 @@
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+- * void volk_64f_x2_multiply_64f(float* cVector, const float* aVector, const float* bVector, unsigned int num_points)
+- * \endcode
++ * void volk_64f_x2_multiply_64f(float* cVector, const float* aVector, const float*
++ * bVector, unsigned int num_points) \endcode
+ *
+ * \b Inputs
+ * \li aVector: First input vector.
+@@ -76,18 +76,19 @@
+
+ #ifdef LV_HAVE_GENERIC
+
+-static inline void
+-volk_64f_x2_multiply_64f_generic(double *cVector, const double *aVector,
+- const double *bVector, unsigned int num_points)
++static inline void volk_64f_x2_multiply_64f_generic(double* cVector,
++ const double* aVector,
++ const double* bVector,
++ unsigned int num_points)
+ {
+- double *cPtr = cVector;
+- const double *aPtr = aVector;
+- const double *bPtr = bVector;
+- unsigned int number = 0;
+-
+- for (number = 0; number < num_points; number++) {
+- *cPtr++ = (*aPtr++) * (*bPtr++);
+- }
++ double* cPtr = cVector;
++ const double* aPtr = aVector;
++ const double* bPtr = bVector;
++ unsigned int number = 0;
++
++ for (number = 0; number < num_points; number++) {
++ *cPtr++ = (*aPtr++) * (*bPtr++);
++ }
+ }
+
+ #endif /* LV_HAVE_GENERIC */
+@@ -100,35 +101,36 @@ volk_64f_x2_multiply_64f_generic(double *cVector, const double *aVector,
+
+ #include <emmintrin.h>
+
+-static inline void
+-volk_64f_x2_multiply_64f_u_sse2(double *cVector, const double *aVector,
+- const double *bVector, unsigned int num_points)
++static inline void volk_64f_x2_multiply_64f_u_sse2(double* cVector,
++ const double* aVector,
++ const double* bVector,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+- const unsigned int half_points = num_points / 2;
++ unsigned int number = 0;
++ const unsigned int half_points = num_points / 2;
+
+- double *cPtr = cVector;
+- const double *aPtr = aVector;
+- const double *bPtr = bVector;
++ double* cPtr = cVector;
++ const double* aPtr = aVector;
++ const double* bPtr = bVector;
+
+- __m128d aVal, bVal, cVal;
+- for (; number < half_points; number++) {
+- aVal = _mm_loadu_pd(aPtr);
+- bVal = _mm_loadu_pd(bPtr);
++ __m128d aVal, bVal, cVal;
++ for (; number < half_points; number++) {
++ aVal = _mm_loadu_pd(aPtr);
++ bVal = _mm_loadu_pd(bPtr);
+
+- cVal = _mm_mul_pd(aVal, bVal);
++ cVal = _mm_mul_pd(aVal, bVal);
+
+- _mm_storeu_pd(cPtr, cVal); // Store the results back into the C container
++ _mm_storeu_pd(cPtr, cVal); // Store the results back into the C container
+
+- aPtr += 2;
+- bPtr += 2;
+- cPtr += 2;
+- }
++ aPtr += 2;
++ bPtr += 2;
++ cPtr += 2;
++ }
+
+- number = half_points * 2;
+- for (; number < num_points; number++) {
+- *cPtr++ = (*aPtr++) * (*bPtr++);
+- }
++ number = half_points * 2;
++ for (; number < num_points; number++) {
++ *cPtr++ = (*aPtr++) * (*bPtr++);
++ }
+ }
+
+ #endif /* LV_HAVE_SSE2 */
+@@ -138,36 +140,37 @@ volk_64f_x2_multiply_64f_u_sse2(double *cVector, const double *aVector,
+
+ #include <immintrin.h>
+
+-static inline void
+-volk_64f_x2_multiply_64f_u_avx(double *cVector, const double *aVector,
+- const double *bVector, unsigned int num_points)
++static inline void volk_64f_x2_multiply_64f_u_avx(double* cVector,
++ const double* aVector,
++ const double* bVector,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+- const unsigned int quarter_points = num_points / 4;
++ unsigned int number = 0;
++ const unsigned int quarter_points = num_points / 4;
+
+- double *cPtr = cVector;
+- const double *aPtr = aVector;
+- const double *bPtr = bVector;
++ double* cPtr = cVector;
++ const double* aPtr = aVector;
++ const double* bPtr = bVector;
+
+- __m256d aVal, bVal, cVal;
+- for (; number < quarter_points; number++) {
++ __m256d aVal, bVal, cVal;
++ for (; number < quarter_points; number++) {
+
+- aVal = _mm256_loadu_pd(aPtr);
+- bVal = _mm256_loadu_pd(bPtr);
++ aVal = _mm256_loadu_pd(aPtr);
++ bVal = _mm256_loadu_pd(bPtr);
+
+- cVal = _mm256_mul_pd(aVal, bVal);
++ cVal = _mm256_mul_pd(aVal, bVal);
+
+- _mm256_storeu_pd(cPtr, cVal); // Store the results back into the C container
++ _mm256_storeu_pd(cPtr, cVal); // Store the results back into the C container
+
+- aPtr += 4;
+- bPtr += 4;
+- cPtr += 4;
+- }
++ aPtr += 4;
++ bPtr += 4;
++ cPtr += 4;
++ }
+
+- number = quarter_points * 4;
+- for (; number < num_points; number++) {
+- *cPtr++ = (*aPtr++) * (*bPtr++);
+- }
++ number = quarter_points * 4;
++ for (; number < num_points; number++) {
++ *cPtr++ = (*aPtr++) * (*bPtr++);
++ }
+ }
+
+ #endif /* LV_HAVE_AVX */
+@@ -180,35 +183,36 @@ volk_64f_x2_multiply_64f_u_avx(double *cVector, const double *aVector,
+
+ #include <emmintrin.h>
+
+-static inline void
+-volk_64f_x2_multiply_64f_a_sse2(double *cVector, const double *aVector,
+- const double *bVector, unsigned int num_points)
++static inline void volk_64f_x2_multiply_64f_a_sse2(double* cVector,
++ const double* aVector,
++ const double* bVector,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+- const unsigned int half_points = num_points / 2;
++ unsigned int number = 0;
++ const unsigned int half_points = num_points / 2;
+
+- double *cPtr = cVector;
+- const double *aPtr = aVector;
+- const double *bPtr = bVector;
++ double* cPtr = cVector;
++ const double* aPtr = aVector;
++ const double* bPtr = bVector;
+
+- __m128d aVal, bVal, cVal;
+- for (; number < half_points; number++) {
+- aVal = _mm_load_pd(aPtr);
+- bVal = _mm_load_pd(bPtr);
++ __m128d aVal, bVal, cVal;
++ for (; number < half_points; number++) {
++ aVal = _mm_load_pd(aPtr);
++ bVal = _mm_load_pd(bPtr);
+
+- cVal = _mm_mul_pd(aVal, bVal);
++ cVal = _mm_mul_pd(aVal, bVal);
+
+- _mm_store_pd(cPtr, cVal); // Store the results back into the C container
++ _mm_store_pd(cPtr, cVal); // Store the results back into the C container
+
+- aPtr += 2;
+- bPtr += 2;
+- cPtr += 2;
+- }
++ aPtr += 2;
++ bPtr += 2;
++ cPtr += 2;
++ }
+
+- number = half_points * 2;
+- for (; number < num_points; number++) {
+- *cPtr++ = (*aPtr++) * (*bPtr++);
+- }
++ number = half_points * 2;
++ for (; number < num_points; number++) {
++ *cPtr++ = (*aPtr++) * (*bPtr++);
++ }
+ }
+
+ #endif /* LV_HAVE_SSE2 */
+@@ -218,36 +222,37 @@ volk_64f_x2_multiply_64f_a_sse2(double *cVector, const double *aVector,
+
+ #include <immintrin.h>
+
+-static inline void
+-volk_64f_x2_multiply_64f_a_avx(double *cVector, const double *aVector,
+- const double *bVector, unsigned int num_points)
++static inline void volk_64f_x2_multiply_64f_a_avx(double* cVector,
++ const double* aVector,
++ const double* bVector,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+- const unsigned int quarter_points = num_points / 4;
++ unsigned int number = 0;
++ const unsigned int quarter_points = num_points / 4;
+
+- double *cPtr = cVector;
+- const double *aPtr = aVector;
+- const double *bPtr = bVector;
++ double* cPtr = cVector;
++ const double* aPtr = aVector;
++ const double* bPtr = bVector;
+
+- __m256d aVal, bVal, cVal;
+- for (; number < quarter_points; number++) {
++ __m256d aVal, bVal, cVal;
++ for (; number < quarter_points; number++) {
+
+- aVal = _mm256_load_pd(aPtr);
+- bVal = _mm256_load_pd(bPtr);
++ aVal = _mm256_load_pd(aPtr);
++ bVal = _mm256_load_pd(bPtr);
+
+- cVal = _mm256_mul_pd(aVal, bVal);
++ cVal = _mm256_mul_pd(aVal, bVal);
+
+- _mm256_store_pd(cPtr, cVal); // Store the results back into the C container
++ _mm256_store_pd(cPtr, cVal); // Store the results back into the C container
+
+- aPtr += 4;
+- bPtr += 4;
+- cPtr += 4;
+- }
++ aPtr += 4;
++ bPtr += 4;
++ cPtr += 4;
++ }
+
+- number = quarter_points * 4;
+- for (; number < num_points; number++) {
+- *cPtr++ = (*aPtr++) * (*bPtr++);
+- }
++ number = quarter_points * 4;
++ for (; number < num_points; number++) {
++ *cPtr++ = (*aPtr++) * (*bPtr++);
++ }
+ }
+
+ #endif /* LV_HAVE_AVX */
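+
+The multiply kernels above all share the chunk-plus-tail structure that clang-format reindented: full vector-width groups first, then a scalar tail for the leftover elements. Below is a standalone sketch of that structure in plain C; W stands in for the SIMD width (2 lanes for __m128d, 4 for __m256d), and multiply_64f_sketch is an illustrative name, not a VOLK symbol.
+
+#include <stdio.h>
+
+#define W 4 /* stand-in for the SIMD width: 2 for SSE2, 4 for AVX */
+
+static void multiply_64f_sketch(double* c, const double* a, const double* b, unsigned int n)
+{
+    unsigned int number = 0;
+    const unsigned int chunk_points = n / W;
+    for (; number < chunk_points; number++) {
+        /* one iteration stands in for a single _mm*_mul_pd over W doubles */
+        for (unsigned int k = 0; k < W; k++)
+            c[number * W + k] = a[number * W + k] * b[number * W + k];
+    }
+    /* scalar tail, exactly as in the kernels above */
+    for (number = chunk_points * W; number < n; number++)
+        c[number] = a[number] * b[number];
+}
+
+int main(void)
+{
+    double a[5] = { 1, 2, 3, 4, 5 }, b[5] = { 2, 2, 2, 2, 2 }, c[5];
+    multiply_64f_sketch(c, a, b, 5); /* one 4-wide chunk plus a 1-element tail */
+    for (int i = 0; i < 5; i++)
+        printf("%g ", c[i]); /* expected: 2 4 6 8 10 */
+    printf("\n");
+    return 0;
+}
+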
+diff --git a/kernels/volk/volk_64u_byteswap.h b/kernels/volk/volk_64u_byteswap.h
+index 96e0661..38621a4 100644
+--- a/kernels/volk/volk_64u_byteswap.h
++++ b/kernels/volk/volk_64u_byteswap.h
+@@ -72,71 +72,77 @@
+ #ifdef LV_HAVE_SSE2
+ #include <emmintrin.h>
+
+-static inline void volk_64u_byteswap_u_sse2(uint64_t* intsToSwap, unsigned int num_points){
++static inline void volk_64u_byteswap_u_sse2(uint64_t* intsToSwap, unsigned int num_points)
++{
+ uint32_t* inputPtr = (uint32_t*)intsToSwap;
+ __m128i input, byte1, byte2, byte3, byte4, output;
+ __m128i byte2mask = _mm_set1_epi32(0x00FF0000);
+ __m128i byte3mask = _mm_set1_epi32(0x0000FF00);
+ uint64_t number = 0;
+ const unsigned int halfPoints = num_points / 2;
+- for(;number < halfPoints; number++){
+- // Load the 32t values, increment inputPtr later since we're doing it in-place.
+- input = _mm_loadu_si128((__m128i*)inputPtr);
+-
+- // Do the four shifts
+- byte1 = _mm_slli_epi32(input, 24);
+- byte2 = _mm_slli_epi32(input, 8);
+- byte3 = _mm_srli_epi32(input, 8);
+- byte4 = _mm_srli_epi32(input, 24);
+- // Or bytes together
+- output = _mm_or_si128(byte1, byte4);
+- byte2 = _mm_and_si128(byte2, byte2mask);
+- output = _mm_or_si128(output, byte2);
+- byte3 = _mm_and_si128(byte3, byte3mask);
+- output = _mm_or_si128(output, byte3);
+-
+- // Reorder the two words
+- output = _mm_shuffle_epi32(output, _MM_SHUFFLE(2, 3, 0, 1));
+-
+- // Store the results
+- _mm_storeu_si128((__m128i*)inputPtr, output);
+- inputPtr += 4;
++ for (; number < halfPoints; number++) {
++ // Load the 32t values, increment inputPtr later since we're doing it in-place.
++ input = _mm_loadu_si128((__m128i*)inputPtr);
++
++ // Do the four shifts
++ byte1 = _mm_slli_epi32(input, 24);
++ byte2 = _mm_slli_epi32(input, 8);
++ byte3 = _mm_srli_epi32(input, 8);
++ byte4 = _mm_srli_epi32(input, 24);
++ // Or bytes together
++ output = _mm_or_si128(byte1, byte4);
++ byte2 = _mm_and_si128(byte2, byte2mask);
++ output = _mm_or_si128(output, byte2);
++ byte3 = _mm_and_si128(byte3, byte3mask);
++ output = _mm_or_si128(output, byte3);
++
++ // Reorder the two words
++ output = _mm_shuffle_epi32(output, _MM_SHUFFLE(2, 3, 0, 1));
++
++ // Store the results
++ _mm_storeu_si128((__m128i*)inputPtr, output);
++ inputPtr += 4;
+ }
+
+ // Byteswap any remaining points:
+- number = halfPoints*2;
+- for(; number < num_points; number++){
+- uint32_t output1 = *inputPtr;
+- uint32_t output2 = inputPtr[1];
++ number = halfPoints * 2;
++ for (; number < num_points; number++) {
++ uint32_t output1 = *inputPtr;
++ uint32_t output2 = inputPtr[1];
+
+- output1 = (((output1 >> 24) & 0xff) | ((output1 >> 8) & 0x0000ff00) | ((output1 << 8) & 0x00ff0000) | ((output1 << 24) & 0xff000000));
++ output1 = (((output1 >> 24) & 0xff) | ((output1 >> 8) & 0x0000ff00) |
++ ((output1 << 8) & 0x00ff0000) | ((output1 << 24) & 0xff000000));
+
+- output2 = (((output2 >> 24) & 0xff) | ((output2 >> 8) & 0x0000ff00) | ((output2 << 8) & 0x00ff0000) | ((output2 << 24) & 0xff000000));
++ output2 = (((output2 >> 24) & 0xff) | ((output2 >> 8) & 0x0000ff00) |
++ ((output2 << 8) & 0x00ff0000) | ((output2 << 24) & 0xff000000));
+
+- *inputPtr++ = output2;
+- *inputPtr++ = output1;
++ *inputPtr++ = output2;
++ *inputPtr++ = output1;
+ }
+ }
+ #endif /* LV_HAVE_SSE2 */
+
+
+-
+ #ifdef LV_HAVE_GENERIC
+
+-static inline void volk_64u_byteswap_generic(uint64_t* intsToSwap, unsigned int num_points){
+- uint32_t* inputPtr = (uint32_t*)intsToSwap;
+- unsigned int point;
+- for(point = 0; point < num_points; point++){
+- uint32_t output1 = *inputPtr;
+- uint32_t output2 = inputPtr[1];
++static inline void volk_64u_byteswap_generic(uint64_t* intsToSwap,
++ unsigned int num_points)
++{
++ uint32_t* inputPtr = (uint32_t*)intsToSwap;
++ unsigned int point;
++ for (point = 0; point < num_points; point++) {
++ uint32_t output1 = *inputPtr;
++ uint32_t output2 = inputPtr[1];
+
+- output1 = (((output1 >> 24) & 0xff) | ((output1 >> 8) & 0x0000ff00) | ((output1 << 8) & 0x00ff0000) | ((output1 << 24) & 0xff000000));
++ output1 = (((output1 >> 24) & 0xff) | ((output1 >> 8) & 0x0000ff00) |
++ ((output1 << 8) & 0x00ff0000) | ((output1 << 24) & 0xff000000));
+
+- output2 = (((output2 >> 24) & 0xff) | ((output2 >> 8) & 0x0000ff00) | ((output2 << 8) & 0x00ff0000) | ((output2 << 24) & 0xff000000));
++ output2 = (((output2 >> 24) & 0xff) | ((output2 >> 8) & 0x0000ff00) |
++ ((output2 << 8) & 0x00ff0000) | ((output2 << 24) & 0xff000000));
+
+- *inputPtr++ = output2;
+- *inputPtr++ = output1;
+- }
++ *inputPtr++ = output2;
++ *inputPtr++ = output1;
++ }
+ }
+ #endif /* LV_HAVE_GENERIC */
+
+@@ -144,47 +150,47 @@ static inline void volk_64u_byteswap_generic(uint64_t* intsToSwap, unsigned int
+ #include <immintrin.h>
+ static inline void volk_64u_byteswap_a_avx2(uint64_t* intsToSwap, unsigned int num_points)
+ {
+- unsigned int number = 0;
+-
+- const unsigned int nPerSet = 4;
+- const uint64_t nSets = num_points / nPerSet;
++ unsigned int number = 0;
+
+- uint32_t* inputPtr = (uint32_t*)intsToSwap;
++ const unsigned int nPerSet = 4;
++ const uint64_t nSets = num_points / nPerSet;
+
+- const uint8_t shuffleVector[32] = { 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8, 23, 22, 21, 20, 19, 18, 17, 16, 31, 30, 29, 28, 27, 26, 25, 24 };
++ uint32_t* inputPtr = (uint32_t*)intsToSwap;
+
+- const __m256i myShuffle = _mm256_loadu_si256((__m256i*) &shuffleVector[0]);
++ const uint8_t shuffleVector[32] = { 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13,
++ 12, 11, 10, 9, 8, 23, 22, 21, 20, 19, 18,
++ 17, 16, 31, 30, 29, 28, 27, 26, 25, 24 };
+
+- for ( ;number < nSets; number++ ) {
++ const __m256i myShuffle = _mm256_loadu_si256((__m256i*)&shuffleVector[0]);
+
+- // Load the 32t values, increment inputPtr later since we're doing it in-place.
+- const __m256i input = _mm256_load_si256((__m256i*)inputPtr);
+- const __m256i output = _mm256_shuffle_epi8(input, myShuffle);
++ for (; number < nSets; number++) {
+
+- // Store the results
+- _mm256_store_si256((__m256i*)inputPtr, output);
++ // Load the 32t values, increment inputPtr later since we're doing it in-place.
++ const __m256i input = _mm256_load_si256((__m256i*)inputPtr);
++ const __m256i output = _mm256_shuffle_epi8(input, myShuffle);
+
+- /* inputPtr is 32bit so increment twice */
+- inputPtr += 2 * nPerSet;
+- }
+- _mm256_zeroupper();
++ // Store the results
++ _mm256_store_si256((__m256i*)inputPtr, output);
+
+- // Byteswap any remaining points:
+- for(number = nSets * nPerSet; number < num_points; ++number ) {
+- uint32_t output1 = *inputPtr;
+- uint32_t output2 = inputPtr[1];
+- uint32_t out1 = ((((output1) >> 24) & 0x000000ff) |
+- (((output1) >> 8) & 0x0000ff00) |
+- (((output1) << 8) & 0x00ff0000) |
+- (((output1) << 24) & 0xff000000) );
++ /* inputPtr is 32bit so increment twice */
++ inputPtr += 2 * nPerSet;
++ }
++ _mm256_zeroupper();
+
+- uint32_t out2 = ((((output2) >> 24) & 0x000000ff) |
+- (((output2) >> 8) & 0x0000ff00) |
+- (((output2) << 8) & 0x00ff0000) |
+- (((output2) << 24) & 0xff000000) );
+- *inputPtr++ = out2;
+- *inputPtr++ = out1;
+- }
++ // Byteswap any remaining points:
++ for (number = nSets * nPerSet; number < num_points; ++number) {
++ uint32_t output1 = *inputPtr;
++ uint32_t output2 = inputPtr[1];
++ uint32_t out1 =
++ ((((output1) >> 24) & 0x000000ff) | (((output1) >> 8) & 0x0000ff00) |
++ (((output1) << 8) & 0x00ff0000) | (((output1) << 24) & 0xff000000));
++
++ uint32_t out2 =
++ ((((output2) >> 24) & 0x000000ff) | (((output2) >> 8) & 0x0000ff00) |
++ (((output2) << 8) & 0x00ff0000) | (((output2) << 24) & 0xff000000));
++ *inputPtr++ = out2;
++ *inputPtr++ = out1;
++ }
+ }
+
+ #endif /* LV_HAVE_AVX2 */
+@@ -192,48 +198,47 @@ static inline void volk_64u_byteswap_a_avx2(uint64_t* intsToSwap, unsigned int n
+
+ #if LV_HAVE_SSSE3
+ #include <tmmintrin.h>
+-static inline void volk_64u_byteswap_a_ssse3(uint64_t* intsToSwap, unsigned int num_points)
++static inline void volk_64u_byteswap_a_ssse3(uint64_t* intsToSwap,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
++ unsigned int number = 0;
+
+- const unsigned int nPerSet = 2;
+- const uint64_t nSets = num_points / nPerSet;
++ const unsigned int nPerSet = 2;
++ const uint64_t nSets = num_points / nPerSet;
+
+- uint32_t* inputPtr = (uint32_t*)intsToSwap;
+-
+- uint8_t shuffleVector[16] = { 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 };
++ uint32_t* inputPtr = (uint32_t*)intsToSwap;
+
+- const __m128i myShuffle = _mm_loadu_si128((__m128i*) &shuffleVector);
++ uint8_t shuffleVector[16] = { 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 };
+
+- for ( ;number < nSets; number++ ) {
++ const __m128i myShuffle = _mm_loadu_si128((__m128i*)&shuffleVector);
+
+- // Load the 32t values, increment inputPtr later since we're doing it in-place.
+- const __m128i input = _mm_load_si128((__m128i*)inputPtr);
+- const __m128i output = _mm_shuffle_epi8(input,myShuffle);
++ for (; number < nSets; number++) {
+
+- // Store the results
+- _mm_store_si128((__m128i*)inputPtr, output);
++ // Load the 32t values, increment inputPtr later since we're doing it in-place.
++ const __m128i input = _mm_load_si128((__m128i*)inputPtr);
++ const __m128i output = _mm_shuffle_epi8(input, myShuffle);
+
+- /* inputPtr is 32bit so increment twice */
+- inputPtr += 2 * nPerSet;
+- }
++ // Store the results
++ _mm_store_si128((__m128i*)inputPtr, output);
+
+- // Byteswap any remaining points:
+- for(number = nSets * nPerSet; number < num_points; ++number ) {
+- uint32_t output1 = *inputPtr;
+- uint32_t output2 = inputPtr[1];
+- uint32_t out1 = ((((output1) >> 24) & 0x000000ff) |
+- (((output1) >> 8) & 0x0000ff00) |
+- (((output1) << 8) & 0x00ff0000) |
+- (((output1) << 24) & 0xff000000) );
++ /* inputPtr is 32bit so increment twice */
++ inputPtr += 2 * nPerSet;
++ }
+
+- uint32_t out2 = ((((output2) >> 24) & 0x000000ff) |
+- (((output2) >> 8) & 0x0000ff00) |
+- (((output2) << 8) & 0x00ff0000) |
+- (((output2) << 24) & 0xff000000) );
+- *inputPtr++ = out2;
+- *inputPtr++ = out1;
+- }
++ // Byteswap any remaining points:
++ for (number = nSets * nPerSet; number < num_points; ++number) {
++ uint32_t output1 = *inputPtr;
++ uint32_t output2 = inputPtr[1];
++ uint32_t out1 =
++ ((((output1) >> 24) & 0x000000ff) | (((output1) >> 8) & 0x0000ff00) |
++ (((output1) << 8) & 0x00ff0000) | (((output1) << 24) & 0xff000000));
++
++ uint32_t out2 =
++ ((((output2) >> 24) & 0x000000ff) | (((output2) >> 8) & 0x0000ff00) |
++ (((output2) << 8) & 0x00ff0000) | (((output2) << 24) & 0xff000000));
++ *inputPtr++ = out2;
++ *inputPtr++ = out1;
++ }
+ }
+ #endif /* LV_HAVE_SSSE3 */
+
+@@ -241,86 +246,90 @@ static inline void volk_64u_byteswap_a_ssse3(uint64_t* intsToSwap, unsigned int
+ #ifdef LV_HAVE_NEONV8
+ #include <arm_neon.h>
+
+-static inline void volk_64u_byteswap_neonv8(uint64_t* intsToSwap, unsigned int num_points){
+- uint32_t* inputPtr = (uint32_t*)intsToSwap;
+- const unsigned int n4points = num_points / 4;
+- uint8x16x2_t input;
+- uint8x16_t idx = { 7,6,5,4, 3,2,1,0, 15,14,13,12, 11,10,9,8 };
+-
+- unsigned int number = 0;
+- for(number = 0; number < n4points; ++number){
+- __VOLK_PREFETCH(inputPtr+8);
+- input = vld2q_u8((uint8_t*) inputPtr);
+- input.val[0] = vqtbl1q_u8(input.val[0], idx);
+- input.val[1] = vqtbl1q_u8(input.val[1], idx);
+- vst2q_u8((uint8_t*) inputPtr, input);
+-
+- inputPtr += 8;
+- }
+-
+- for(number = n4points * 4; number < num_points; ++number){
+- uint32_t output1 = *inputPtr;
+- uint32_t output2 = inputPtr[1];
++static inline void volk_64u_byteswap_neonv8(uint64_t* intsToSwap, unsigned int num_points)
++{
++ uint32_t* inputPtr = (uint32_t*)intsToSwap;
++ const unsigned int n4points = num_points / 4;
++ uint8x16x2_t input;
++ uint8x16_t idx = { 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 };
++
++ unsigned int number = 0;
++ for (number = 0; number < n4points; ++number) {
++ __VOLK_PREFETCH(inputPtr + 8);
++ input = vld2q_u8((uint8_t*)inputPtr);
++ input.val[0] = vqtbl1q_u8(input.val[0], idx);
++ input.val[1] = vqtbl1q_u8(input.val[1], idx);
++ vst2q_u8((uint8_t*)inputPtr, input);
++
++ inputPtr += 8;
++ }
+
+- output1 = (((output1 >> 24) & 0xff) | ((output1 >> 8) & 0x0000ff00) | ((output1 << 8) & 0x00ff0000) | ((output1 << 24) & 0xff000000));
+- output2 = (((output2 >> 24) & 0xff) | ((output2 >> 8) & 0x0000ff00) | ((output2 << 8) & 0x00ff0000) | ((output2 << 24) & 0xff000000));
++ for (number = n4points * 4; number < num_points; ++number) {
++ uint32_t output1 = *inputPtr;
++ uint32_t output2 = inputPtr[1];
+
+- *inputPtr++ = output2;
+- *inputPtr++ = output1;
+- }
++ output1 = (((output1 >> 24) & 0xff) | ((output1 >> 8) & 0x0000ff00) |
++ ((output1 << 8) & 0x00ff0000) | ((output1 << 24) & 0xff000000));
++ output2 = (((output2 >> 24) & 0xff) | ((output2 >> 8) & 0x0000ff00) |
++ ((output2 << 8) & 0x00ff0000) | ((output2 << 24) & 0xff000000));
+
++ *inputPtr++ = output2;
++ *inputPtr++ = output1;
++ }
+ }
+ #else
+ #ifdef LV_HAVE_NEON
+ #include <arm_neon.h>
+
+-static inline void volk_64u_byteswap_neon(uint64_t* intsToSwap, unsigned int num_points){
+- uint32_t* inputPtr = (uint32_t*)intsToSwap;
+- unsigned int number = 0;
+- unsigned int n8points = num_points / 4;
+-
+- uint8x8x4_t input_table;
+- uint8x8_t int_lookup01, int_lookup23, int_lookup45, int_lookup67;
+- uint8x8_t swapped_int01, swapped_int23, swapped_int45, swapped_int67;
+-
+- /* these magic numbers are used as byte-indices in the LUT.
+- they are pre-computed to save time. A simple C program
+- can calculate them; for example for lookup01:
+- uint8_t chars[8] = {24, 16, 8, 0, 25, 17, 9, 1};
+- for(ii=0; ii < 8; ++ii) {
+- index += ((uint64_t)(*(chars+ii))) << (ii*8);
++static inline void volk_64u_byteswap_neon(uint64_t* intsToSwap, unsigned int num_points)
++{
++ uint32_t* inputPtr = (uint32_t*)intsToSwap;
++ unsigned int number = 0;
++ unsigned int n8points = num_points / 4;
++
++ uint8x8x4_t input_table;
++ uint8x8_t int_lookup01, int_lookup23, int_lookup45, int_lookup67;
++ uint8x8_t swapped_int01, swapped_int23, swapped_int45, swapped_int67;
++
++ /* these magic numbers are used as byte-indices in the LUT.
++ they are pre-computed to save time. A simple C program
++ can calculate them; for example for lookup01:
++ uint8_t chars[8] = {24, 16, 8, 0, 25, 17, 9, 1};
++ for(ii=0; ii < 8; ++ii) {
++ index += ((uint64_t)(*(chars+ii))) << (ii*8);
++ }
++ */
++ int_lookup01 = vcreate_u8(2269495096316185);
++ int_lookup23 = vcreate_u8(146949840772469531);
++ int_lookup45 = vcreate_u8(291630186448622877);
++ int_lookup67 = vcreate_u8(436310532124776223);
++
++ for (number = 0; number < n8points; ++number) {
++ input_table = vld4_u8((uint8_t*)inputPtr);
++ swapped_int01 = vtbl4_u8(input_table, int_lookup01);
++ swapped_int23 = vtbl4_u8(input_table, int_lookup23);
++ swapped_int45 = vtbl4_u8(input_table, int_lookup45);
++ swapped_int67 = vtbl4_u8(input_table, int_lookup67);
++ vst1_u8((uint8_t*)inputPtr, swapped_int01);
++ vst1_u8((uint8_t*)(inputPtr + 2), swapped_int23);
++ vst1_u8((uint8_t*)(inputPtr + 4), swapped_int45);
++ vst1_u8((uint8_t*)(inputPtr + 6), swapped_int67);
++
++ inputPtr += 4;
+ }
+- */
+- int_lookup01 = vcreate_u8(2269495096316185);
+- int_lookup23 = vcreate_u8(146949840772469531);
+- int_lookup45 = vcreate_u8(291630186448622877);
+- int_lookup67 = vcreate_u8(436310532124776223);
+-
+- for(number = 0; number < n8points; ++number){
+- input_table = vld4_u8((uint8_t*) inputPtr);
+- swapped_int01 = vtbl4_u8(input_table, int_lookup01);
+- swapped_int23 = vtbl4_u8(input_table, int_lookup23);
+- swapped_int45 = vtbl4_u8(input_table, int_lookup45);
+- swapped_int67 = vtbl4_u8(input_table, int_lookup67);
+- vst1_u8((uint8_t*) inputPtr, swapped_int01);
+- vst1_u8((uint8_t*) (inputPtr+2), swapped_int23);
+- vst1_u8((uint8_t*) (inputPtr+4), swapped_int45);
+- vst1_u8((uint8_t*) (inputPtr+6), swapped_int67);
+-
+- inputPtr += 4;
+- }
+-
+- for(number = n8points * 4; number < num_points; ++number){
+- uint32_t output1 = *inputPtr;
+- uint32_t output2 = inputPtr[1];
+-
+- output1 = (((output1 >> 24) & 0xff) | ((output1 >> 8) & 0x0000ff00) | ((output1 << 8) & 0x00ff0000) | ((output1 << 24) & 0xff000000));
+- output2 = (((output2 >> 24) & 0xff) | ((output2 >> 8) & 0x0000ff00) | ((output2 << 8) & 0x00ff0000) | ((output2 << 24) & 0xff000000));
+-
+- *inputPtr++ = output2;
+- *inputPtr++ = output1;
+- }
+
++ for (number = n8points * 4; number < num_points; ++number) {
++ uint32_t output1 = *inputPtr;
++ uint32_t output2 = inputPtr[1];
++
++ output1 = (((output1 >> 24) & 0xff) | ((output1 >> 8) & 0x0000ff00) |
++ ((output1 << 8) & 0x00ff0000) | ((output1 << 24) & 0xff000000));
++ output2 = (((output2 >> 24) & 0xff) | ((output2 >> 8) & 0x0000ff00) |
++ ((output2 << 8) & 0x00ff0000) | ((output2 << 24) & 0xff000000));
++
++ *inputPtr++ = output2;
++ *inputPtr++ = output1;
++ }
+ }
+ #endif /* LV_HAVE_NEON */
+ #endif
+@@ -336,49 +345,52 @@ static inline void volk_64u_byteswap_neon(uint64_t* intsToSwap, unsigned int num
+ #ifdef LV_HAVE_SSE2
+ #include <emmintrin.h>
+
+-static inline void volk_64u_byteswap_a_sse2(uint64_t* intsToSwap, unsigned int num_points){
++static inline void volk_64u_byteswap_a_sse2(uint64_t* intsToSwap, unsigned int num_points)
++{
+ uint32_t* inputPtr = (uint32_t*)intsToSwap;
+ __m128i input, byte1, byte2, byte3, byte4, output;
+ __m128i byte2mask = _mm_set1_epi32(0x00FF0000);
+ __m128i byte3mask = _mm_set1_epi32(0x0000FF00);
+ uint64_t number = 0;
+ const unsigned int halfPoints = num_points / 2;
+- for(;number < halfPoints; number++){
+- // Load the 32t values, increment inputPtr later since we're doing it in-place.
+- input = _mm_load_si128((__m128i*)inputPtr);
+-
+- // Do the four shifts
+- byte1 = _mm_slli_epi32(input, 24);
+- byte2 = _mm_slli_epi32(input, 8);
+- byte3 = _mm_srli_epi32(input, 8);
+- byte4 = _mm_srli_epi32(input, 24);
+- // Or bytes together
+- output = _mm_or_si128(byte1, byte4);
+- byte2 = _mm_and_si128(byte2, byte2mask);
+- output = _mm_or_si128(output, byte2);
+- byte3 = _mm_and_si128(byte3, byte3mask);
+- output = _mm_or_si128(output, byte3);
+-
+- // Reorder the two words
+- output = _mm_shuffle_epi32(output, _MM_SHUFFLE(2, 3, 0, 1));
+-
+- // Store the results
+- _mm_store_si128((__m128i*)inputPtr, output);
+- inputPtr += 4;
++ for (; number < halfPoints; number++) {
++ // Load the 32t values, increment inputPtr later since we're doing it in-place.
++ input = _mm_load_si128((__m128i*)inputPtr);
++
++ // Do the four shifts
++ byte1 = _mm_slli_epi32(input, 24);
++ byte2 = _mm_slli_epi32(input, 8);
++ byte3 = _mm_srli_epi32(input, 8);
++ byte4 = _mm_srli_epi32(input, 24);
++ // Or bytes together
++ output = _mm_or_si128(byte1, byte4);
++ byte2 = _mm_and_si128(byte2, byte2mask);
++ output = _mm_or_si128(output, byte2);
++ byte3 = _mm_and_si128(byte3, byte3mask);
++ output = _mm_or_si128(output, byte3);
++
++ // Reorder the two words
++ output = _mm_shuffle_epi32(output, _MM_SHUFFLE(2, 3, 0, 1));
++
++ // Store the results
++ _mm_store_si128((__m128i*)inputPtr, output);
++ inputPtr += 4;
+ }
+
+ // Byteswap any remaining points:
+- number = halfPoints*2;
+- for(; number < num_points; number++){
+- uint32_t output1 = *inputPtr;
+- uint32_t output2 = inputPtr[1];
++ number = halfPoints * 2;
++ for (; number < num_points; number++) {
++ uint32_t output1 = *inputPtr;
++ uint32_t output2 = inputPtr[1];
+
+- output1 = (((output1 >> 24) & 0xff) | ((output1 >> 8) & 0x0000ff00) | ((output1 << 8) & 0x00ff0000) | ((output1 << 24) & 0xff000000));
++ output1 = (((output1 >> 24) & 0xff) | ((output1 >> 8) & 0x0000ff00) |
++ ((output1 << 8) & 0x00ff0000) | ((output1 << 24) & 0xff000000));
+
+- output2 = (((output2 >> 24) & 0xff) | ((output2 >> 8) & 0x0000ff00) | ((output2 << 8) & 0x00ff0000) | ((output2 << 24) & 0xff000000));
++ output2 = (((output2 >> 24) & 0xff) | ((output2 >> 8) & 0x0000ff00) |
++ ((output2 << 8) & 0x00ff0000) | ((output2 << 24) & 0xff000000));
+
+- *inputPtr++ = output2;
+- *inputPtr++ = output1;
++ *inputPtr++ = output2;
++ *inputPtr++ = output1;
+ }
+ }
+ #endif /* LV_HAVE_SSE2 */
+@@ -387,46 +399,46 @@ static inline void volk_64u_byteswap_a_sse2(uint64_t* intsToSwap, unsigned int n
+ #include <immintrin.h>
+ static inline void volk_64u_byteswap_u_avx2(uint64_t* intsToSwap, unsigned int num_points)
+ {
+- unsigned int number = 0;
+-
+- const unsigned int nPerSet = 4;
+- const uint64_t nSets = num_points / nPerSet;
+-
+- uint32_t* inputPtr = (uint32_t*)intsToSwap;
+-
+- const uint8_t shuffleVector[32] = { 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8, 23, 22, 21, 20, 19, 18, 17, 16, 31, 30, 29, 28, 27, 26, 25, 24 };
+-
+- const __m256i myShuffle = _mm256_loadu_si256((__m256i*) &shuffleVector[0]);
+-
+- for ( ;number < nSets; number++ ) {
+- // Load the 32t values, increment inputPtr later since we're doing it in-place.
+- const __m256i input = _mm256_loadu_si256((__m256i*)inputPtr);
+- const __m256i output = _mm256_shuffle_epi8(input,myShuffle);
+-
+- // Store the results
+- _mm256_storeu_si256((__m256i*)inputPtr, output);
+-
+- /* inputPtr is 32bit so increment twice */
+- inputPtr += 2 * nPerSet;
+- }
+- _mm256_zeroupper();
+-
+- // Byteswap any remaining points:
+- for(number = nSets * nPerSet; number < num_points; ++number ) {
+- uint32_t output1 = *inputPtr;
+- uint32_t output2 = inputPtr[1];
+- uint32_t out1 = ((((output1) >> 24) & 0x000000ff) |
+- (((output1) >> 8) & 0x0000ff00) |
+- (((output1) << 8) & 0x00ff0000) |
+- (((output1) << 24) & 0xff000000) );
+-
+- uint32_t out2 = ((((output2) >> 24) & 0x000000ff) |
+- (((output2) >> 8) & 0x0000ff00) |
+- (((output2) << 8) & 0x00ff0000) |
+- (((output2) << 24) & 0xff000000) );
+- *inputPtr++ = out2;
+- *inputPtr++ = out1;
+- }
++ unsigned int number = 0;
++
++ const unsigned int nPerSet = 4;
++ const uint64_t nSets = num_points / nPerSet;
++
++ uint32_t* inputPtr = (uint32_t*)intsToSwap;
++
++ const uint8_t shuffleVector[32] = { 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13,
++ 12, 11, 10, 9, 8, 23, 22, 21, 20, 19, 18,
++ 17, 16, 31, 30, 29, 28, 27, 26, 25, 24 };
++
++ const __m256i myShuffle = _mm256_loadu_si256((__m256i*)&shuffleVector[0]);
++
++ for (; number < nSets; number++) {
++ // Load the 32t values, increment inputPtr later since we're doing it in-place.
++ const __m256i input = _mm256_loadu_si256((__m256i*)inputPtr);
++ const __m256i output = _mm256_shuffle_epi8(input, myShuffle);
++
++ // Store the results
++ _mm256_storeu_si256((__m256i*)inputPtr, output);
++
++ /* inputPtr is 32bit so increment twice */
++ inputPtr += 2 * nPerSet;
++ }
++ _mm256_zeroupper();
++
++ // Byteswap any remaining points:
++ for (number = nSets * nPerSet; number < num_points; ++number) {
++ uint32_t output1 = *inputPtr;
++ uint32_t output2 = inputPtr[1];
++ uint32_t out1 =
++ ((((output1) >> 24) & 0x000000ff) | (((output1) >> 8) & 0x0000ff00) |
++ (((output1) << 8) & 0x00ff0000) | (((output1) << 24) & 0xff000000));
++
++ uint32_t out2 =
++ ((((output2) >> 24) & 0x000000ff) | (((output2) >> 8) & 0x0000ff00) |
++ (((output2) << 8) & 0x00ff0000) | (((output2) << 24) & 0xff000000));
++ *inputPtr++ = out2;
++ *inputPtr++ = out1;
++ }
+ }
+
+ #endif /* LV_HAVE_AVX2 */
+@@ -434,70 +446,71 @@ static inline void volk_64u_byteswap_u_avx2(uint64_t* intsToSwap, unsigned int n
+
+ #if LV_HAVE_SSSE3
+ #include <tmmintrin.h>
+-static inline void volk_64u_byteswap_u_ssse3(uint64_t* intsToSwap, unsigned int num_points)
++static inline void volk_64u_byteswap_u_ssse3(uint64_t* intsToSwap,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+-
+- const unsigned int nPerSet = 2;
+- const uint64_t nSets = num_points / nPerSet;
++ unsigned int number = 0;
+
+- uint32_t* inputPtr = (uint32_t*)intsToSwap;
++ const unsigned int nPerSet = 2;
++ const uint64_t nSets = num_points / nPerSet;
+
+- uint8_t shuffleVector[16] = { 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 };
++ uint32_t* inputPtr = (uint32_t*)intsToSwap;
+
+- const __m128i myShuffle = _mm_loadu_si128((__m128i*) &shuffleVector);
++ uint8_t shuffleVector[16] = { 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 };
+
+- for ( ;number < nSets; number++ ) {
+- // Load the 32t values, increment inputPtr later since we're doing it in-place.
+- const __m128i input = _mm_loadu_si128((__m128i*)inputPtr);
+- const __m128i output = _mm_shuffle_epi8(input,myShuffle);
++ const __m128i myShuffle = _mm_loadu_si128((__m128i*)&shuffleVector);
+
+- // Store the results
+- _mm_storeu_si128((__m128i*)inputPtr, output);
++ for (; number < nSets; number++) {
++ // Load the 32t values, increment inputPtr later since we're doing it in-place.
++ const __m128i input = _mm_loadu_si128((__m128i*)inputPtr);
++ const __m128i output = _mm_shuffle_epi8(input, myShuffle);
+
+- /* inputPtr is 32bit so increment twice */
+- inputPtr += 2 * nPerSet;
+- }
++ // Store the results
++ _mm_storeu_si128((__m128i*)inputPtr, output);
+
+- // Byteswap any remaining points:
+- for(number = nSets * nPerSet; number < num_points; ++number ) {
+- uint32_t output1 = *inputPtr;
+- uint32_t output2 = inputPtr[1];
+- uint32_t out1 = ((((output1) >> 24) & 0x000000ff) |
+- (((output1) >> 8) & 0x0000ff00) |
+- (((output1) << 8) & 0x00ff0000) |
+- (((output1) << 24) & 0xff000000) );
++ /* inputPtr is 32bit so increment twice */
++ inputPtr += 2 * nPerSet;
++ }
+
+- uint32_t out2 = ((((output2) >> 24) & 0x000000ff) |
+- (((output2) >> 8) & 0x0000ff00) |
+- (((output2) << 8) & 0x00ff0000) |
+- (((output2) << 24) & 0xff000000) );
+- *inputPtr++ = out2;
+- *inputPtr++ = out1;
+- }
++ // Byteswap any remaining points:
++ for (number = nSets * nPerSet; number < num_points; ++number) {
++ uint32_t output1 = *inputPtr;
++ uint32_t output2 = inputPtr[1];
++ uint32_t out1 =
++ ((((output1) >> 24) & 0x000000ff) | (((output1) >> 8) & 0x0000ff00) |
++ (((output1) << 8) & 0x00ff0000) | (((output1) << 24) & 0xff000000));
++
++ uint32_t out2 =
++ ((((output2) >> 24) & 0x000000ff) | (((output2) >> 8) & 0x0000ff00) |
++ (((output2) << 8) & 0x00ff0000) | (((output2) << 24) & 0xff000000));
++ *inputPtr++ = out2;
++ *inputPtr++ = out1;
++ }
+ }
+ #endif /* LV_HAVE_SSSE3 */
+
+ #ifdef LV_HAVE_GENERIC
+
+-static inline void volk_64u_byteswap_a_generic(uint64_t* intsToSwap, unsigned int num_points){
+- uint32_t* inputPtr = (uint32_t*)intsToSwap;
+- unsigned int point;
+- for(point = 0; point < num_points; point++){
+- uint32_t output1 = *inputPtr;
+- uint32_t output2 = inputPtr[1];
++static inline void volk_64u_byteswap_a_generic(uint64_t* intsToSwap,
++ unsigned int num_points)
++{
++ uint32_t* inputPtr = (uint32_t*)intsToSwap;
++ unsigned int point;
++ for (point = 0; point < num_points; point++) {
++ uint32_t output1 = *inputPtr;
++ uint32_t output2 = inputPtr[1];
+
+- output1 = (((output1 >> 24) & 0xff) | ((output1 >> 8) & 0x0000ff00) | ((output1 << 8) & 0x00ff0000) | ((output1 << 24) & 0xff000000));
++ output1 = (((output1 >> 24) & 0xff) | ((output1 >> 8) & 0x0000ff00) |
++ ((output1 << 8) & 0x00ff0000) | ((output1 << 24) & 0xff000000));
+
+- output2 = (((output2 >> 24) & 0xff) | ((output2 >> 8) & 0x0000ff00) | ((output2 << 8) & 0x00ff0000) | ((output2 << 24) & 0xff000000));
++ output2 = (((output2 >> 24) & 0xff) | ((output2 >> 8) & 0x0000ff00) |
++ ((output2 << 8) & 0x00ff0000) | ((output2 << 24) & 0xff000000));
+
+- *inputPtr++ = output2;
+- *inputPtr++ = output1;
+- }
++ *inputPtr++ = output2;
++ *inputPtr++ = output1;
++ }
+ }
+ #endif /* LV_HAVE_GENERIC */
+
+
+-
+-
+ #endif /* INCLUDED_volk_64u_byteswap_a_H */
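+
+The volk_64u_byteswap kernels above reverse the byte order of each 64-bit word by byteswapping its two 32-bit halves and exchanging them (the shift/mask expressions in the scalar tails, the shuffle masks in the SSSE3/AVX2 paths, the table lookups in the NEON paths). A standalone restatement for cross-checking, with no VOLK or intrinsics dependencies; byteswap64_reference is an illustrative name only.
+
+#include <stdint.h>
+#include <stdio.h>
+
+/* Same result the kernels above produce in place: all eight bytes reversed. */
+static uint64_t byteswap64_reference(uint64_t x)
+{
+    uint32_t lo = (uint32_t)(x & 0xFFFFFFFFull);
+    uint32_t hi = (uint32_t)(x >> 32);
+    /* 32-bit byteswap of each half, as in the scalar tail loops */
+    lo = ((lo >> 24) & 0xff) | ((lo >> 8) & 0x0000ff00) |
+         ((lo << 8) & 0x00ff0000) | ((lo << 24) & 0xff000000);
+    hi = ((hi >> 24) & 0xff) | ((hi >> 8) & 0x0000ff00) |
+         ((hi << 8) & 0x00ff0000) | ((hi << 24) & 0xff000000);
+    /* the two byteswapped halves exchange places */
+    return ((uint64_t)lo << 32) | hi;
+}
+
+int main(void)
+{
+    printf("%016llx\n", (unsigned long long)byteswap64_reference(0x0102030405060708ull));
+    /* expected: 0807060504030201 */
+    return 0;
+}
+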
+diff --git a/kernels/volk/volk_64u_byteswappuppet_64u.h b/kernels/volk/volk_64u_byteswappuppet_64u.h
+index 2db0171..ded54ee 100644
+--- a/kernels/volk/volk_64u_byteswappuppet_64u.h
++++ b/kernels/volk/volk_64u_byteswappuppet_64u.h
+@@ -3,87 +3,105 @@
+
+
+ #include <stdint.h>
+-#include <volk/volk_64u_byteswap.h>
+ #include <string.h>
++#include <volk/volk_64u_byteswap.h>
+
+ #ifdef LV_HAVE_GENERIC
+-static inline void volk_64u_byteswappuppet_64u_generic(uint64_t*output, uint64_t* intsToSwap, unsigned int num_points){
++static inline void volk_64u_byteswappuppet_64u_generic(uint64_t* output,
++ uint64_t* intsToSwap,
++ unsigned int num_points)
++{
+
+ volk_64u_byteswap_generic((uint64_t*)intsToSwap, num_points);
+ memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint64_t));
+-
+ }
+ #endif
+
+ #ifdef LV_HAVE_NEONV8
+-static inline void volk_64u_byteswappuppet_64u_neonv8(uint64_t*output, uint64_t* intsToSwap, unsigned int num_points){
++static inline void volk_64u_byteswappuppet_64u_neonv8(uint64_t* output,
++ uint64_t* intsToSwap,
++ unsigned int num_points)
++{
+
+ volk_64u_byteswap_neonv8((uint64_t*)intsToSwap, num_points);
+ memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint64_t));
+-
+ }
+ #else
+ #ifdef LV_HAVE_NEON
+-static inline void volk_64u_byteswappuppet_64u_neon(uint64_t*output, uint64_t* intsToSwap, unsigned int num_points){
++static inline void volk_64u_byteswappuppet_64u_neon(uint64_t* output,
++ uint64_t* intsToSwap,
++ unsigned int num_points)
++{
+
+ volk_64u_byteswap_neon((uint64_t*)intsToSwap, num_points);
+ memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint64_t));
+-
+ }
+ #endif
+ #endif
+
+ #ifdef LV_HAVE_SSE2
+-static inline void volk_64u_byteswappuppet_64u_u_sse2(uint64_t* output, uint64_t* intsToSwap, unsigned int num_points){
++static inline void volk_64u_byteswappuppet_64u_u_sse2(uint64_t* output,
++ uint64_t* intsToSwap,
++ unsigned int num_points)
++{
+
+ volk_64u_byteswap_u_sse2((uint64_t*)intsToSwap, num_points);
+ memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint64_t));
+-
+ }
+ #endif
+
+ #ifdef LV_HAVE_SSE2
+-static inline void volk_64u_byteswappuppet_64u_a_sse2(uint64_t* output, uint64_t* intsToSwap, unsigned int num_points){
++static inline void volk_64u_byteswappuppet_64u_a_sse2(uint64_t* output,
++ uint64_t* intsToSwap,
++ unsigned int num_points)
++{
+
+ volk_64u_byteswap_a_sse2((uint64_t*)intsToSwap, num_points);
+ memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint64_t));
+-
+ }
+ #endif
+
+ #ifdef LV_HAVE_SSSE3
+-static inline void volk_64u_byteswappuppet_64u_u_ssse3(uint64_t* output, uint64_t* intsToSwap, unsigned int num_points){
++static inline void volk_64u_byteswappuppet_64u_u_ssse3(uint64_t* output,
++ uint64_t* intsToSwap,
++ unsigned int num_points)
++{
+
+ volk_64u_byteswap_u_ssse3((uint64_t*)intsToSwap, num_points);
+ memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint64_t));
+-
+ }
+ #endif
+
+ #ifdef LV_HAVE_SSSE3
+-static inline void volk_64u_byteswappuppet_64u_a_ssse3(uint64_t* output, uint64_t* intsToSwap, unsigned int num_points){
++static inline void volk_64u_byteswappuppet_64u_a_ssse3(uint64_t* output,
++ uint64_t* intsToSwap,
++ unsigned int num_points)
++{
+
+ volk_64u_byteswap_a_ssse3((uint64_t*)intsToSwap, num_points);
+ memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint64_t));
+-
+ }
+ #endif
+
+ #ifdef LV_HAVE_AVX2
+-static inline void volk_64u_byteswappuppet_64u_u_avx2(uint64_t* output, uint64_t* intsToSwap, unsigned int num_points){
++static inline void volk_64u_byteswappuppet_64u_u_avx2(uint64_t* output,
++ uint64_t* intsToSwap,
++ unsigned int num_points)
++{
+
+ volk_64u_byteswap_u_avx2((uint64_t*)intsToSwap, num_points);
+ memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint64_t));
+-
+ }
+ #endif
+
+ #ifdef LV_HAVE_AVX2
+-static inline void volk_64u_byteswappuppet_64u_a_avx2(uint64_t* output, uint64_t* intsToSwap, unsigned int num_points){
++static inline void volk_64u_byteswappuppet_64u_a_avx2(uint64_t* output,
++ uint64_t* intsToSwap,
++ unsigned int num_points)
++{
+
+ volk_64u_byteswap_a_avx2((uint64_t*)intsToSwap, num_points);
+ memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint64_t));
+-
+ }
+ #endif
+
+diff --git a/kernels/volk/volk_64u_popcnt.h b/kernels/volk/volk_64u_popcnt.h
+index cbce2ec..43c2ae0 100644
+--- a/kernels/volk/volk_64u_popcnt.h
++++ b/kernels/volk/volk_64u_popcnt.h
+@@ -60,39 +60,38 @@
+ #ifndef INCLUDED_volk_64u_popcnt_a_H
+ #define INCLUDED_volk_64u_popcnt_a_H
+
+-#include <stdio.h>
+ #include <inttypes.h>
++#include <stdio.h>
+
+
+ #ifdef LV_HAVE_GENERIC
+
+
+-static inline void
+-volk_64u_popcnt_generic(uint64_t* ret, const uint64_t value)
++static inline void volk_64u_popcnt_generic(uint64_t* ret, const uint64_t value)
+ {
+- //const uint32_t* valueVector = (const uint32_t*)&value;
+-
+- // This is faster than a lookup table
+- //uint32_t retVal = valueVector[0];
+- uint32_t retVal = (uint32_t)(value & 0x00000000FFFFFFFFull);
+-
+- retVal = (retVal & 0x55555555) + (retVal >> 1 & 0x55555555);
+- retVal = (retVal & 0x33333333) + (retVal >> 2 & 0x33333333);
+- retVal = (retVal + (retVal >> 4)) & 0x0F0F0F0F;
+- retVal = (retVal + (retVal >> 8));
+- retVal = (retVal + (retVal >> 16)) & 0x0000003F;
+- uint64_t retVal64 = retVal;
+-
+- //retVal = valueVector[1];
+- retVal = (uint32_t)((value & 0xFFFFFFFF00000000ull) >> 32);
+- retVal = (retVal & 0x55555555) + (retVal >> 1 & 0x55555555);
+- retVal = (retVal & 0x33333333) + (retVal >> 2 & 0x33333333);
+- retVal = (retVal + (retVal >> 4)) & 0x0F0F0F0F;
+- retVal = (retVal + (retVal >> 8));
+- retVal = (retVal + (retVal >> 16)) & 0x0000003F;
+- retVal64 += retVal;
+-
+- *ret = retVal64;
++ // const uint32_t* valueVector = (const uint32_t*)&value;
++
++ // This is faster than a lookup table
++ // uint32_t retVal = valueVector[0];
++ uint32_t retVal = (uint32_t)(value & 0x00000000FFFFFFFFull);
++
++ retVal = (retVal & 0x55555555) + (retVal >> 1 & 0x55555555);
++ retVal = (retVal & 0x33333333) + (retVal >> 2 & 0x33333333);
++ retVal = (retVal + (retVal >> 4)) & 0x0F0F0F0F;
++ retVal = (retVal + (retVal >> 8));
++ retVal = (retVal + (retVal >> 16)) & 0x0000003F;
++ uint64_t retVal64 = retVal;
++
++ // retVal = valueVector[1];
++ retVal = (uint32_t)((value & 0xFFFFFFFF00000000ull) >> 32);
++ retVal = (retVal & 0x55555555) + (retVal >> 1 & 0x55555555);
++ retVal = (retVal & 0x33333333) + (retVal >> 2 & 0x33333333);
++ retVal = (retVal + (retVal >> 4)) & 0x0F0F0F0F;
++ retVal = (retVal + (retVal >> 8));
++ retVal = (retVal + (retVal >> 16)) & 0x0000003F;
++ retVal64 += retVal;
++
++ *ret = retVal64;
+ }
+
+ #endif /*LV_HAVE_GENERIC*/
+@@ -104,7 +103,7 @@ volk_64u_popcnt_generic(uint64_t* ret, const uint64_t value)
+
+ static inline void volk_64u_popcnt_a_sse4_2(uint64_t* ret, const uint64_t value)
+ {
+- *ret = _mm_popcnt_u64(value);
++ *ret = _mm_popcnt_u64(value);
+ }
+
+ #endif /*LV_HAVE_SSE4_2*/
+@@ -114,19 +113,19 @@ static inline void volk_64u_popcnt_a_sse4_2(uint64_t* ret, const uint64_t value)
+ #include <arm_neon.h>
+ static inline void volk_64u_popcnt_neon(uint64_t* ret, const uint64_t value)
+ {
+- uint8x8_t input_val, count8x8_val;
+- uint16x4_t count16x4_val;
+- uint32x2_t count32x2_val;
+- uint64x1_t count64x1_val;
+-
+- input_val = vld1_u8((unsigned char *) &value);
+- count8x8_val = vcnt_u8(input_val);
+- count16x4_val = vpaddl_u8(count8x8_val);
+- count32x2_val = vpaddl_u16(count16x4_val);
+- count64x1_val = vpaddl_u32(count32x2_val);
+- vst1_u64(ret, count64x1_val);
+-
+- //*ret = _mm_popcnt_u64(value);
++ uint8x8_t input_val, count8x8_val;
++ uint16x4_t count16x4_val;
++ uint32x2_t count32x2_val;
++ uint64x1_t count64x1_val;
++
++ input_val = vld1_u8((unsigned char*)&value);
++ count8x8_val = vcnt_u8(input_val);
++ count16x4_val = vpaddl_u8(count8x8_val);
++ count32x2_val = vpaddl_u16(count16x4_val);
++ count64x1_val = vpaddl_u32(count32x2_val);
++ vst1_u64(ret, count64x1_val);
++
++ //*ret = _mm_popcnt_u64(value);
+ }
+ #endif /*LV_HAVE_NEON*/
+
+diff --git a/kernels/volk/volk_64u_popcntpuppet_64u.h b/kernels/volk/volk_64u_popcntpuppet_64u.h
+index e38ebb3..688281a 100644
+--- a/kernels/volk/volk_64u_popcntpuppet_64u.h
++++ b/kernels/volk/volk_64u_popcntpuppet_64u.h
+@@ -23,35 +23,44 @@
+ #ifndef INCLUDED_volk_64u_popcntpuppet_64u_H
+ #define INCLUDED_volk_64u_popcntpuppet_64u_H
+
+-#include <volk/volk_64u_popcnt.h>
+ #include <stdint.h>
+ #include <string.h>
++#include <volk/volk_64u_popcnt.h>
+
+ #ifdef LV_HAVE_GENERIC
+-static inline void volk_64u_popcntpuppet_64u_generic(uint64_t* outVector, const uint64_t* inVector, unsigned int num_points){
++static inline void volk_64u_popcntpuppet_64u_generic(uint64_t* outVector,
++ const uint64_t* inVector,
++ unsigned int num_points)
++{
+ unsigned int ii;
+- for(ii=0; ii < num_points; ++ii) {
+- volk_64u_popcnt_generic(outVector+ii, num_points );
++ for (ii = 0; ii < num_points; ++ii) {
++ volk_64u_popcnt_generic(outVector + ii, num_points);
+ }
+ memcpy((void*)outVector, (void*)inVector, num_points * sizeof(uint64_t));
+ }
+ #endif /* LV_HAVE_GENERIC */
+
+ #if LV_HAVE_SSE4_2 && LV_HAVE_64
+-static inline void volk_64u_popcntpuppet_64u_a_sse4_2(uint64_t* outVector, const uint64_t* inVector, unsigned int num_points){
++static inline void volk_64u_popcntpuppet_64u_a_sse4_2(uint64_t* outVector,
++ const uint64_t* inVector,
++ unsigned int num_points)
++{
+ unsigned int ii;
+- for(ii=0; ii < num_points; ++ii) {
+- volk_64u_popcnt_a_sse4_2(outVector+ii, num_points );
++ for (ii = 0; ii < num_points; ++ii) {
++ volk_64u_popcnt_a_sse4_2(outVector + ii, num_points);
+ }
+ memcpy((void*)outVector, (void*)inVector, num_points * sizeof(uint64_t));
+ }
+ #endif /* LV_HAVE_SSE4_2 */
+
+ #ifdef LV_HAVE_NEON
+-static inline void volk_64u_popcntpuppet_64u_neon(uint64_t* outVector, const uint64_t* inVector, unsigned int num_points){
++static inline void volk_64u_popcntpuppet_64u_neon(uint64_t* outVector,
++ const uint64_t* inVector,
++ unsigned int num_points)
++{
+ unsigned int ii;
+- for(ii=0; ii < num_points; ++ii) {
+- volk_64u_popcnt_neon(outVector+ii, num_points );
++ for (ii = 0; ii < num_points; ++ii) {
++ volk_64u_popcnt_neon(outVector + ii, num_points);
+ }
+ memcpy((void*)outVector, (void*)inVector, num_points * sizeof(uint64_t));
+ }
+diff --git a/kernels/volk/volk_8i_convert_16i.h b/kernels/volk/volk_8i_convert_16i.h
+index 40400c3..69d8f6a 100644
+--- a/kernels/volk/volk_8i_convert_16i.h
++++ b/kernels/volk/volk_8i_convert_16i.h
+@@ -30,8 +30,8 @@
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+- * void volk_8i_convert_16i(int16_t* outputVector, const int8_t* inputVector, unsigned int num_points)
+- * \endcode
++ * void volk_8i_convert_16i(int16_t* outputVector, const int8_t* inputVector, unsigned int
++ * num_points) \endcode
+ *
+ * \b Inputs
+ * \li inputVector: The input vector of 8-bit chars.
+@@ -59,32 +59,32 @@
+ #ifdef LV_HAVE_AVX2
+ #include <immintrin.h>
+
+-static inline void
+-volk_8i_convert_16i_u_avx2(int16_t* outputVector, const int8_t* inputVector,
+- unsigned int num_points)
++static inline void volk_8i_convert_16i_u_avx2(int16_t* outputVector,
++ const int8_t* inputVector,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+- const unsigned int sixteenthPoints = num_points / 16;
+-
+- const __m128i* inputVectorPtr = (const __m128i*)inputVector;
+- __m256i* outputVectorPtr = (__m256i*)outputVector;
+- __m128i inputVal;
+- __m256i ret;
+-
+- for(;number < sixteenthPoints; number++){
+- inputVal = _mm_loadu_si128(inputVectorPtr);
+- ret = _mm256_cvtepi8_epi16(inputVal);
+- ret = _mm256_slli_epi16(ret, 8); // Multiply by 256
+- _mm256_storeu_si256(outputVectorPtr, ret);
+-
+- outputVectorPtr++;
+- inputVectorPtr++;
+- }
+-
+- number = sixteenthPoints * 16;
+- for(; number < num_points; number++){
+- outputVector[number] = (int16_t)(inputVector[number])*256;
+- }
++ unsigned int number = 0;
++ const unsigned int sixteenthPoints = num_points / 16;
++
++ const __m128i* inputVectorPtr = (const __m128i*)inputVector;
++ __m256i* outputVectorPtr = (__m256i*)outputVector;
++ __m128i inputVal;
++ __m256i ret;
++
++ for (; number < sixteenthPoints; number++) {
++ inputVal = _mm_loadu_si128(inputVectorPtr);
++ ret = _mm256_cvtepi8_epi16(inputVal);
++ ret = _mm256_slli_epi16(ret, 8); // Multiply by 256
++ _mm256_storeu_si256(outputVectorPtr, ret);
++
++ outputVectorPtr++;
++ inputVectorPtr++;
++ }
++
++ number = sixteenthPoints * 16;
++ for (; number < num_points; number++) {
++ outputVector[number] = (int16_t)(inputVector[number]) * 256;
++ }
+ }
+ #endif /* LV_HAVE_AVX2 */
+
+@@ -92,57 +92,57 @@ volk_8i_convert_16i_u_avx2(int16_t* outputVector, const int8_t* inputVector,
+ #ifdef LV_HAVE_SSE4_1
+ #include <smmintrin.h>
+
+-static inline void
+-volk_8i_convert_16i_u_sse4_1(int16_t* outputVector, const int8_t* inputVector,
+- unsigned int num_points)
++static inline void volk_8i_convert_16i_u_sse4_1(int16_t* outputVector,
++ const int8_t* inputVector,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+- const unsigned int sixteenthPoints = num_points / 16;
++ unsigned int number = 0;
++ const unsigned int sixteenthPoints = num_points / 16;
+
+- const __m128i* inputVectorPtr = (const __m128i*)inputVector;
+- __m128i* outputVectorPtr = (__m128i*)outputVector;
+- __m128i inputVal;
+- __m128i ret;
++ const __m128i* inputVectorPtr = (const __m128i*)inputVector;
++ __m128i* outputVectorPtr = (__m128i*)outputVector;
++ __m128i inputVal;
++ __m128i ret;
+
+- for(;number < sixteenthPoints; number++){
+- inputVal = _mm_loadu_si128(inputVectorPtr);
+- ret = _mm_cvtepi8_epi16(inputVal);
+- ret = _mm_slli_epi16(ret, 8); // Multiply by 256
+- _mm_storeu_si128(outputVectorPtr, ret);
++ for (; number < sixteenthPoints; number++) {
++ inputVal = _mm_loadu_si128(inputVectorPtr);
++ ret = _mm_cvtepi8_epi16(inputVal);
++ ret = _mm_slli_epi16(ret, 8); // Multiply by 256
++ _mm_storeu_si128(outputVectorPtr, ret);
+
+- outputVectorPtr++;
++ outputVectorPtr++;
+
+- inputVal = _mm_srli_si128(inputVal, 8);
+- ret = _mm_cvtepi8_epi16(inputVal);
+- ret = _mm_slli_epi16(ret, 8); // Multiply by 256
+- _mm_storeu_si128(outputVectorPtr, ret);
++ inputVal = _mm_srli_si128(inputVal, 8);
++ ret = _mm_cvtepi8_epi16(inputVal);
++ ret = _mm_slli_epi16(ret, 8); // Multiply by 256
++ _mm_storeu_si128(outputVectorPtr, ret);
+
+- outputVectorPtr++;
++ outputVectorPtr++;
+
+- inputVectorPtr++;
+- }
++ inputVectorPtr++;
++ }
+
+- number = sixteenthPoints * 16;
+- for(; number < num_points; number++){
+- outputVector[number] = (int16_t)(inputVector[number])*256;
+- }
++ number = sixteenthPoints * 16;
++ for (; number < num_points; number++) {
++ outputVector[number] = (int16_t)(inputVector[number]) * 256;
++ }
+ }
+ #endif /* LV_HAVE_SSE4_1 */
+
+
+ #ifdef LV_HAVE_GENERIC
+
+-static inline void
+-volk_8i_convert_16i_generic(int16_t* outputVector, const int8_t* inputVector,
+- unsigned int num_points)
++static inline void volk_8i_convert_16i_generic(int16_t* outputVector,
++ const int8_t* inputVector,
++ unsigned int num_points)
+ {
+- int16_t* outputVectorPtr = outputVector;
+- const int8_t* inputVectorPtr = inputVector;
+- unsigned int number = 0;
++ int16_t* outputVectorPtr = outputVector;
++ const int8_t* inputVectorPtr = inputVector;
++ unsigned int number = 0;
+
+- for(number = 0; number < num_points; number++){
+- *outputVectorPtr++ = ((int16_t)(*inputVectorPtr++)) * 256;
+- }
++ for (number = 0; number < num_points; number++) {
++ *outputVectorPtr++ = ((int16_t)(*inputVectorPtr++)) * 256;
++ }
+ }
+ #endif /* LV_HAVE_GENERIC */
+
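+Every volk_8i_convert_16i variant in this file performs a widening multiply by 256, moving each int8_t into the upper byte of the corresponding int16_t; the SIMD paths express this as a sign-extending convert followed by _mm*_slli_epi16(ret, 8). A standalone restatement with a few boundary values (sketch only, no VOLK dependencies assumed):
+
+#include <stdint.h>
+#include <stdio.h>
+
+int main(void)
+{
+    const int8_t in[4] = { 1, -1, 127, -128 };
+    int16_t out[4];
+    for (int i = 0; i < 4; i++)
+        out[i] = (int16_t)in[i] * 256; /* sign-extend, then scale by 256 */
+    for (int i = 0; i < 4; i++)
+        printf("%d ", out[i]); /* expected: 256 -256 32512 -32768 */
+    printf("\n");
+    return 0;
+}
+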
+@@ -150,7 +150,6 @@ volk_8i_convert_16i_generic(int16_t* outputVector, const int8_t* inputVector,
+ #endif /* INCLUDED_VOLK_8s_CONVERT_16s_UNALIGNED8_H */
+
+
+-
+ #ifndef INCLUDED_volk_8i_convert_16i_a_H
+ #define INCLUDED_volk_8i_convert_16i_a_H
+
+@@ -160,32 +159,32 @@ volk_8i_convert_16i_generic(int16_t* outputVector, const int8_t* inputVector,
+ #ifdef LV_HAVE_AVX2
+ #include <immintrin.h>
+
+-static inline void
+-volk_8i_convert_16i_a_avx2(int16_t* outputVector, const int8_t* inputVector,
+- unsigned int num_points)
++static inline void volk_8i_convert_16i_a_avx2(int16_t* outputVector,
++ const int8_t* inputVector,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+- const unsigned int sixteenthPoints = num_points / 16;
+-
+- const __m128i* inputVectorPtr = (const __m128i*)inputVector;
+- __m256i* outputVectorPtr = (__m256i*)outputVector;
+- __m128i inputVal;
+- __m256i ret;
+-
+- for(;number < sixteenthPoints; number++){
+- inputVal = _mm_load_si128(inputVectorPtr);
+- ret = _mm256_cvtepi8_epi16(inputVal);
+- ret = _mm256_slli_epi16(ret, 8); // Multiply by 256
+- _mm256_store_si256(outputVectorPtr, ret);
+-
+- outputVectorPtr++;
+- inputVectorPtr++;
+- }
+-
+- number = sixteenthPoints * 16;
+- for(; number < num_points; number++){
+- outputVector[number] = (int16_t)(inputVector[number])*256;
+- }
++ unsigned int number = 0;
++ const unsigned int sixteenthPoints = num_points / 16;
++
++ const __m128i* inputVectorPtr = (const __m128i*)inputVector;
++ __m256i* outputVectorPtr = (__m256i*)outputVector;
++ __m128i inputVal;
++ __m256i ret;
++
++ for (; number < sixteenthPoints; number++) {
++ inputVal = _mm_load_si128(inputVectorPtr);
++ ret = _mm256_cvtepi8_epi16(inputVal);
++ ret = _mm256_slli_epi16(ret, 8); // Multiply by 256
++ _mm256_store_si256(outputVectorPtr, ret);
++
++ outputVectorPtr++;
++ inputVectorPtr++;
++ }
++
++ number = sixteenthPoints * 16;
++ for (; number < num_points; number++) {
++ outputVector[number] = (int16_t)(inputVector[number]) * 256;
++ }
+ }
+ #endif /* LV_HAVE_AVX2 */
+
+@@ -193,57 +192,57 @@ volk_8i_convert_16i_a_avx2(int16_t* outputVector, const int8_t* inputVector,
+ #ifdef LV_HAVE_SSE4_1
+ #include <smmintrin.h>
+
+-static inline void
+-volk_8i_convert_16i_a_sse4_1(int16_t* outputVector, const int8_t* inputVector,
+- unsigned int num_points)
++static inline void volk_8i_convert_16i_a_sse4_1(int16_t* outputVector,
++ const int8_t* inputVector,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+- const unsigned int sixteenthPoints = num_points / 16;
++ unsigned int number = 0;
++ const unsigned int sixteenthPoints = num_points / 16;
+
+- const __m128i* inputVectorPtr = (const __m128i*)inputVector;
+- __m128i* outputVectorPtr = (__m128i*)outputVector;
+- __m128i inputVal;
+- __m128i ret;
++ const __m128i* inputVectorPtr = (const __m128i*)inputVector;
++ __m128i* outputVectorPtr = (__m128i*)outputVector;
++ __m128i inputVal;
++ __m128i ret;
+
+- for(;number < sixteenthPoints; number++){
+- inputVal = _mm_load_si128(inputVectorPtr);
+- ret = _mm_cvtepi8_epi16(inputVal);
+- ret = _mm_slli_epi16(ret, 8); // Multiply by 256
+- _mm_store_si128(outputVectorPtr, ret);
++ for (; number < sixteenthPoints; number++) {
++ inputVal = _mm_load_si128(inputVectorPtr);
++ ret = _mm_cvtepi8_epi16(inputVal);
++ ret = _mm_slli_epi16(ret, 8); // Multiply by 256
++ _mm_store_si128(outputVectorPtr, ret);
+
+- outputVectorPtr++;
++ outputVectorPtr++;
+
+- inputVal = _mm_srli_si128(inputVal, 8);
+- ret = _mm_cvtepi8_epi16(inputVal);
+- ret = _mm_slli_epi16(ret, 8); // Multiply by 256
+- _mm_store_si128(outputVectorPtr, ret);
++ inputVal = _mm_srli_si128(inputVal, 8);
++ ret = _mm_cvtepi8_epi16(inputVal);
++ ret = _mm_slli_epi16(ret, 8); // Multiply by 256
++ _mm_store_si128(outputVectorPtr, ret);
+
+- outputVectorPtr++;
++ outputVectorPtr++;
+
+- inputVectorPtr++;
+- }
++ inputVectorPtr++;
++ }
+
+- number = sixteenthPoints * 16;
+- for(; number < num_points; number++){
+- outputVector[number] = (int16_t)(inputVector[number])*256;
+- }
++ number = sixteenthPoints * 16;
++ for (; number < num_points; number++) {
++ outputVector[number] = (int16_t)(inputVector[number]) * 256;
++ }
+ }
+ #endif /* LV_HAVE_SSE4_1 */
+
+
+ #ifdef LV_HAVE_GENERIC
+
+-static inline void
+-volk_8i_convert_16i_a_generic(int16_t* outputVector, const int8_t* inputVector,
+- unsigned int num_points)
++static inline void volk_8i_convert_16i_a_generic(int16_t* outputVector,
++ const int8_t* inputVector,
++ unsigned int num_points)
+ {
+- int16_t* outputVectorPtr = outputVector;
+- const int8_t* inputVectorPtr = inputVector;
+- unsigned int number = 0;
++ int16_t* outputVectorPtr = outputVector;
++ const int8_t* inputVectorPtr = inputVector;
++ unsigned int number = 0;
+
+- for(number = 0; number < num_points; number++){
+- *outputVectorPtr++ = ((int16_t)(*inputVectorPtr++)) * 256;
+- }
++ for (number = 0; number < num_points; number++) {
++ *outputVectorPtr++ = ((int16_t)(*inputVectorPtr++)) * 256;
++ }
+ }
+ #endif /* LV_HAVE_GENERIC */
+
+@@ -251,51 +250,51 @@ volk_8i_convert_16i_a_generic(int16_t* outputVector, const int8_t* inputVector,
+ #ifdef LV_HAVE_NEON
+ #include <arm_neon.h>
+
+-static inline void
+-volk_8i_convert_16i_neon(int16_t* outputVector, const int8_t* inputVector, unsigned int num_points)
++static inline void volk_8i_convert_16i_neon(int16_t* outputVector,
++ const int8_t* inputVector,
++ unsigned int num_points)
+ {
+- int16_t* outputVectorPtr = outputVector;
+- const int8_t* inputVectorPtr = inputVector;
+- unsigned int number;
+- const unsigned int eighth_points = num_points / 8;
+-
+- int8x8_t input_vec ;
+- int16x8_t converted_vec;
+-
+- // NEON doesn't have a concept of 8 bit registers, so we are really
+- // dealing with the low half of 16-bit registers. Since this requires
+- // a move instruction we likely do better with ASM here.
+- for(number = 0; number < eighth_points; ++number) {
+- input_vec = vld1_s8(inputVectorPtr);
+- converted_vec = vmovl_s8(input_vec);
+- //converted_vec = vmulq_s16(converted_vec, scale_factor);
+- converted_vec = vshlq_n_s16(converted_vec, 8);
+- vst1q_s16( outputVectorPtr, converted_vec);
+-
+- inputVectorPtr += 8;
+- outputVectorPtr += 8;
+- }
+-
+- for(number = eighth_points * 8; number < num_points; number++){
+- *outputVectorPtr++ = ((int16_t)(*inputVectorPtr++)) * 256;
+- }
++ int16_t* outputVectorPtr = outputVector;
++ const int8_t* inputVectorPtr = inputVector;
++ unsigned int number;
++ const unsigned int eighth_points = num_points / 8;
++
++ int8x8_t input_vec;
++ int16x8_t converted_vec;
++
++ // NEON doesn't have a concept of 8 bit registers, so we are really
++ // dealing with the low half of 16-bit registers. Since this requires
++ // a move instruction we likely do better with ASM here.
++ for (number = 0; number < eighth_points; ++number) {
++ input_vec = vld1_s8(inputVectorPtr);
++ converted_vec = vmovl_s8(input_vec);
++ // converted_vec = vmulq_s16(converted_vec, scale_factor);
++ converted_vec = vshlq_n_s16(converted_vec, 8);
++ vst1q_s16(outputVectorPtr, converted_vec);
++
++ inputVectorPtr += 8;
++ outputVectorPtr += 8;
++ }
++
++ for (number = eighth_points * 8; number < num_points; number++) {
++ *outputVectorPtr++ = ((int16_t)(*inputVectorPtr++)) * 256;
++ }
+ }
+ #endif /* LV_HAVE_NEON */
+
+
+ #ifdef LV_HAVE_ORC
+-extern void
+-volk_8i_convert_16i_a_orc_impl(int16_t* outputVector, const int8_t* inputVector,
+- unsigned int num_points);
++extern void volk_8i_convert_16i_a_orc_impl(int16_t* outputVector,
++ const int8_t* inputVector,
++ unsigned int num_points);
+
+-static inline void
+-volk_8i_convert_16i_u_orc(int16_t* outputVector, const int8_t* inputVector,
+- unsigned int num_points)
++static inline void volk_8i_convert_16i_u_orc(int16_t* outputVector,
++ const int8_t* inputVector,
++ unsigned int num_points)
+ {
+- volk_8i_convert_16i_a_orc_impl(outputVector, inputVector, num_points);
++ volk_8i_convert_16i_a_orc_impl(outputVector, inputVector, num_points);
+ }
+ #endif /* LV_HAVE_ORC */
+
+
+-
+ #endif /* INCLUDED_VOLK_8s_CONVERT_16s_ALIGNED8_H */
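For orientation, a minimal sketch of driving the generic kernel reformatted above, assuming the header installs as <volk/volk_8i_convert_16i.h>; the buffer contents and length are illustrative, not taken from the patch:

#include <stdint.h>
#include <stdio.h>
#include <volk/volk_8i_convert_16i.h> /* assumed install path of the header above */

int main(void)
{
    const int8_t in[4] = { -128, -1, 1, 127 };
    int16_t out[4];
    /* the generic implementation scales each 8-bit sample by 256 */
    volk_8i_convert_16i_generic(out, in, 4);
    for (unsigned int i = 0; i < 4; i++)
        printf("%d -> %d\n", in[i], out[i]); /* e.g. -128 -> -32768 */
    return 0;
}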
+diff --git a/kernels/volk/volk_8i_s32f_convert_32f.h b/kernels/volk/volk_8i_s32f_convert_32f.h
+index 97d160b..c3d5666 100644
+--- a/kernels/volk/volk_8i_s32f_convert_32f.h
++++ b/kernels/volk/volk_8i_s32f_convert_32f.h
+@@ -30,8 +30,8 @@
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+- * void volk_8i_s32f_convert_32f(float* outputVector, const int8_t* inputVector, const float scalar, unsigned int num_points)
+- * \endcode
++ * void volk_8i_s32f_convert_32f(float* outputVector, const int8_t* inputVector, const
++ * float scalar, unsigned int num_points) \endcode
+ *
+ * \b Inputs
+ * \li inputVector: The input vector of 8-bit chars.
+@@ -60,44 +60,45 @@
+ #ifdef LV_HAVE_AVX2
+ #include <immintrin.h>
+
+-static inline void
+-volk_8i_s32f_convert_32f_u_avx2(float* outputVector, const int8_t* inputVector,
+- const float scalar, unsigned int num_points)
++static inline void volk_8i_s32f_convert_32f_u_avx2(float* outputVector,
++ const int8_t* inputVector,
++ const float scalar,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+- const unsigned int sixteenthPoints = num_points / 16;
+-
+- float* outputVectorPtr = outputVector;
+- const float iScalar = 1.0 / scalar;
+- __m256 invScalar = _mm256_set1_ps( iScalar );
+- const int8_t* inputVectorPtr = inputVector;
+- __m256 ret;
+- __m128i inputVal128;
+- __m256i interimVal;
+-
+- for(;number < sixteenthPoints; number++){
+- inputVal128 = _mm_loadu_si128((__m128i*)inputVectorPtr);
+-
+- interimVal = _mm256_cvtepi8_epi32(inputVal128);
+- ret = _mm256_cvtepi32_ps(interimVal);
+- ret = _mm256_mul_ps(ret, invScalar);
+- _mm256_storeu_ps(outputVectorPtr, ret);
+- outputVectorPtr += 8;
+-
+- inputVal128 = _mm_srli_si128(inputVal128, 8);
+- interimVal = _mm256_cvtepi8_epi32(inputVal128);
+- ret = _mm256_cvtepi32_ps(interimVal);
+- ret = _mm256_mul_ps(ret, invScalar);
+- _mm256_storeu_ps(outputVectorPtr, ret);
+- outputVectorPtr += 8;
+-
+- inputVectorPtr += 16;
+- }
+-
+- number = sixteenthPoints * 16;
+- for(; number < num_points; number++){
+- outputVector[number] = (float)(inputVector[number]) * iScalar;
+- }
++ unsigned int number = 0;
++ const unsigned int sixteenthPoints = num_points / 16;
++
++ float* outputVectorPtr = outputVector;
++ const float iScalar = 1.0 / scalar;
++ __m256 invScalar = _mm256_set1_ps(iScalar);
++ const int8_t* inputVectorPtr = inputVector;
++ __m256 ret;
++ __m128i inputVal128;
++ __m256i interimVal;
++
++ for (; number < sixteenthPoints; number++) {
++ inputVal128 = _mm_loadu_si128((__m128i*)inputVectorPtr);
++
++ interimVal = _mm256_cvtepi8_epi32(inputVal128);
++ ret = _mm256_cvtepi32_ps(interimVal);
++ ret = _mm256_mul_ps(ret, invScalar);
++ _mm256_storeu_ps(outputVectorPtr, ret);
++ outputVectorPtr += 8;
++
++ inputVal128 = _mm_srli_si128(inputVal128, 8);
++ interimVal = _mm256_cvtepi8_epi32(inputVal128);
++ ret = _mm256_cvtepi32_ps(interimVal);
++ ret = _mm256_mul_ps(ret, invScalar);
++ _mm256_storeu_ps(outputVectorPtr, ret);
++ outputVectorPtr += 8;
++
++ inputVectorPtr += 16;
++ }
++
++ number = sixteenthPoints * 16;
++ for (; number < num_points; number++) {
++ outputVector[number] = (float)(inputVector[number]) * iScalar;
++ }
+ }
+ #endif /* LV_HAVE_AVX2 */
+
+@@ -105,80 +106,81 @@ volk_8i_s32f_convert_32f_u_avx2(float* outputVector, const int8_t* inputVector,
+ #ifdef LV_HAVE_SSE4_1
+ #include <smmintrin.h>
+
+-static inline void
+-volk_8i_s32f_convert_32f_u_sse4_1(float* outputVector, const int8_t* inputVector,
+- const float scalar, unsigned int num_points)
++static inline void volk_8i_s32f_convert_32f_u_sse4_1(float* outputVector,
++ const int8_t* inputVector,
++ const float scalar,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+- const unsigned int sixteenthPoints = num_points / 16;
+-
+- float* outputVectorPtr = outputVector;
+- const float iScalar = 1.0 / scalar;
+- __m128 invScalar = _mm_set_ps1( iScalar );
+- const int8_t* inputVectorPtr = inputVector;
+- __m128 ret;
+- __m128i inputVal;
+- __m128i interimVal;
+-
+- for(;number < sixteenthPoints; number++){
+- inputVal = _mm_loadu_si128((__m128i*)inputVectorPtr);
+-
+- interimVal = _mm_cvtepi8_epi32(inputVal);
+- ret = _mm_cvtepi32_ps(interimVal);
+- ret = _mm_mul_ps(ret, invScalar);
+- _mm_storeu_ps(outputVectorPtr, ret);
+- outputVectorPtr += 4;
+-
+- inputVal = _mm_srli_si128(inputVal, 4);
+- interimVal = _mm_cvtepi8_epi32(inputVal);
+- ret = _mm_cvtepi32_ps(interimVal);
+- ret = _mm_mul_ps(ret, invScalar);
+- _mm_storeu_ps(outputVectorPtr, ret);
+- outputVectorPtr += 4;
+-
+- inputVal = _mm_srli_si128(inputVal, 4);
+- interimVal = _mm_cvtepi8_epi32(inputVal);
+- ret = _mm_cvtepi32_ps(interimVal);
+- ret = _mm_mul_ps(ret, invScalar);
+- _mm_storeu_ps(outputVectorPtr, ret);
+- outputVectorPtr += 4;
+-
+- inputVal = _mm_srli_si128(inputVal, 4);
+- interimVal = _mm_cvtepi8_epi32(inputVal);
+- ret = _mm_cvtepi32_ps(interimVal);
+- ret = _mm_mul_ps(ret, invScalar);
+- _mm_storeu_ps(outputVectorPtr, ret);
+- outputVectorPtr += 4;
+-
+- inputVectorPtr += 16;
+- }
+-
+- number = sixteenthPoints * 16;
+- for(; number < num_points; number++){
+- outputVector[number] = (float)(inputVector[number]) * iScalar;
+- }
++ unsigned int number = 0;
++ const unsigned int sixteenthPoints = num_points / 16;
++
++ float* outputVectorPtr = outputVector;
++ const float iScalar = 1.0 / scalar;
++ __m128 invScalar = _mm_set_ps1(iScalar);
++ const int8_t* inputVectorPtr = inputVector;
++ __m128 ret;
++ __m128i inputVal;
++ __m128i interimVal;
++
++ for (; number < sixteenthPoints; number++) {
++ inputVal = _mm_loadu_si128((__m128i*)inputVectorPtr);
++
++ interimVal = _mm_cvtepi8_epi32(inputVal);
++ ret = _mm_cvtepi32_ps(interimVal);
++ ret = _mm_mul_ps(ret, invScalar);
++ _mm_storeu_ps(outputVectorPtr, ret);
++ outputVectorPtr += 4;
++
++ inputVal = _mm_srli_si128(inputVal, 4);
++ interimVal = _mm_cvtepi8_epi32(inputVal);
++ ret = _mm_cvtepi32_ps(interimVal);
++ ret = _mm_mul_ps(ret, invScalar);
++ _mm_storeu_ps(outputVectorPtr, ret);
++ outputVectorPtr += 4;
++
++ inputVal = _mm_srli_si128(inputVal, 4);
++ interimVal = _mm_cvtepi8_epi32(inputVal);
++ ret = _mm_cvtepi32_ps(interimVal);
++ ret = _mm_mul_ps(ret, invScalar);
++ _mm_storeu_ps(outputVectorPtr, ret);
++ outputVectorPtr += 4;
++
++ inputVal = _mm_srli_si128(inputVal, 4);
++ interimVal = _mm_cvtepi8_epi32(inputVal);
++ ret = _mm_cvtepi32_ps(interimVal);
++ ret = _mm_mul_ps(ret, invScalar);
++ _mm_storeu_ps(outputVectorPtr, ret);
++ outputVectorPtr += 4;
++
++ inputVectorPtr += 16;
++ }
++
++ number = sixteenthPoints * 16;
++ for (; number < num_points; number++) {
++ outputVector[number] = (float)(inputVector[number]) * iScalar;
++ }
+ }
+ #endif /* LV_HAVE_SSE4_1 */
+
+ #ifdef LV_HAVE_GENERIC
+
+-static inline void
+-volk_8i_s32f_convert_32f_generic(float* outputVector, const int8_t* inputVector,
+- const float scalar, unsigned int num_points)
++static inline void volk_8i_s32f_convert_32f_generic(float* outputVector,
++ const int8_t* inputVector,
++ const float scalar,
++ unsigned int num_points)
+ {
+- float* outputVectorPtr = outputVector;
+- const int8_t* inputVectorPtr = inputVector;
+- unsigned int number = 0;
+- const float iScalar = 1.0 / scalar;
+-
+- for(number = 0; number < num_points; number++){
+- *outputVectorPtr++ = ((float)(*inputVectorPtr++)) * iScalar;
+- }
++ float* outputVectorPtr = outputVector;
++ const int8_t* inputVectorPtr = inputVector;
++ unsigned int number = 0;
++ const float iScalar = 1.0 / scalar;
++
++ for (number = 0; number < num_points; number++) {
++ *outputVectorPtr++ = ((float)(*inputVectorPtr++)) * iScalar;
++ }
+ }
+ #endif /* LV_HAVE_GENERIC */
+
+
+-
+ #endif /* INCLUDED_VOLK_8s_CONVERT_32f_UNALIGNED8_H */
+
+ #ifndef INCLUDED_volk_8i_s32f_convert_32f_a_H
+@@ -190,195 +192,199 @@ volk_8i_s32f_convert_32f_generic(float* outputVector, const int8_t* inputVector,
+ #ifdef LV_HAVE_AVX2
+ #include <immintrin.h>
+
+-static inline void
+-volk_8i_s32f_convert_32f_a_avx2(float* outputVector, const int8_t* inputVector,
+- const float scalar, unsigned int num_points)
++static inline void volk_8i_s32f_convert_32f_a_avx2(float* outputVector,
++ const int8_t* inputVector,
++ const float scalar,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+- const unsigned int sixteenthPoints = num_points / 16;
+-
+- float* outputVectorPtr = outputVector;
+- const float iScalar = 1.0 / scalar;
+- __m256 invScalar = _mm256_set1_ps( iScalar );
+- const int8_t* inputVectorPtr = inputVector;
+- __m256 ret;
+- __m128i inputVal128;
+- __m256i interimVal;
+-
+- for(;number < sixteenthPoints; number++){
+- inputVal128 = _mm_load_si128((__m128i*)inputVectorPtr);
+-
+- interimVal = _mm256_cvtepi8_epi32(inputVal128);
+- ret = _mm256_cvtepi32_ps(interimVal);
+- ret = _mm256_mul_ps(ret, invScalar);
+- _mm256_store_ps(outputVectorPtr, ret);
+- outputVectorPtr += 8;
+-
+- inputVal128 = _mm_srli_si128(inputVal128, 8);
+- interimVal = _mm256_cvtepi8_epi32(inputVal128);
+- ret = _mm256_cvtepi32_ps(interimVal);
+- ret = _mm256_mul_ps(ret, invScalar);
+- _mm256_store_ps(outputVectorPtr, ret);
+- outputVectorPtr += 8;
+-
+- inputVectorPtr += 16;
+- }
+-
+- number = sixteenthPoints * 16;
+- for(; number < num_points; number++){
+- outputVector[number] = (float)(inputVector[number]) * iScalar;
+- }
++ unsigned int number = 0;
++ const unsigned int sixteenthPoints = num_points / 16;
++
++ float* outputVectorPtr = outputVector;
++ const float iScalar = 1.0 / scalar;
++ __m256 invScalar = _mm256_set1_ps(iScalar);
++ const int8_t* inputVectorPtr = inputVector;
++ __m256 ret;
++ __m128i inputVal128;
++ __m256i interimVal;
++
++ for (; number < sixteenthPoints; number++) {
++ inputVal128 = _mm_load_si128((__m128i*)inputVectorPtr);
++
++ interimVal = _mm256_cvtepi8_epi32(inputVal128);
++ ret = _mm256_cvtepi32_ps(interimVal);
++ ret = _mm256_mul_ps(ret, invScalar);
++ _mm256_store_ps(outputVectorPtr, ret);
++ outputVectorPtr += 8;
++
++ inputVal128 = _mm_srli_si128(inputVal128, 8);
++ interimVal = _mm256_cvtepi8_epi32(inputVal128);
++ ret = _mm256_cvtepi32_ps(interimVal);
++ ret = _mm256_mul_ps(ret, invScalar);
++ _mm256_store_ps(outputVectorPtr, ret);
++ outputVectorPtr += 8;
++
++ inputVectorPtr += 16;
++ }
++
++ number = sixteenthPoints * 16;
++ for (; number < num_points; number++) {
++ outputVector[number] = (float)(inputVector[number]) * iScalar;
++ }
+ }
+ #endif /* LV_HAVE_AVX2 */
+
+ #ifdef LV_HAVE_SSE4_1
+ #include <smmintrin.h>
+
+-static inline void
+-volk_8i_s32f_convert_32f_a_sse4_1(float* outputVector, const int8_t* inputVector,
+- const float scalar, unsigned int num_points)
++static inline void volk_8i_s32f_convert_32f_a_sse4_1(float* outputVector,
++ const int8_t* inputVector,
++ const float scalar,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+- const unsigned int sixteenthPoints = num_points / 16;
+-
+- float* outputVectorPtr = outputVector;
+- const float iScalar = 1.0 / scalar;
+- __m128 invScalar = _mm_set_ps1(iScalar);
+- const int8_t* inputVectorPtr = inputVector;
+- __m128 ret;
+- __m128i inputVal;
+- __m128i interimVal;
+-
+- for(;number < sixteenthPoints; number++){
+- inputVal = _mm_load_si128((__m128i*)inputVectorPtr);
+-
+- interimVal = _mm_cvtepi8_epi32(inputVal);
+- ret = _mm_cvtepi32_ps(interimVal);
+- ret = _mm_mul_ps(ret, invScalar);
+- _mm_store_ps(outputVectorPtr, ret);
+- outputVectorPtr += 4;
+-
+- inputVal = _mm_srli_si128(inputVal, 4);
+- interimVal = _mm_cvtepi8_epi32(inputVal);
+- ret = _mm_cvtepi32_ps(interimVal);
+- ret = _mm_mul_ps(ret, invScalar);
+- _mm_store_ps(outputVectorPtr, ret);
+- outputVectorPtr += 4;
+-
+- inputVal = _mm_srli_si128(inputVal, 4);
+- interimVal = _mm_cvtepi8_epi32(inputVal);
+- ret = _mm_cvtepi32_ps(interimVal);
+- ret = _mm_mul_ps(ret, invScalar);
+- _mm_store_ps(outputVectorPtr, ret);
+- outputVectorPtr += 4;
+-
+- inputVal = _mm_srli_si128(inputVal, 4);
+- interimVal = _mm_cvtepi8_epi32(inputVal);
+- ret = _mm_cvtepi32_ps(interimVal);
+- ret = _mm_mul_ps(ret, invScalar);
+- _mm_store_ps(outputVectorPtr, ret);
+- outputVectorPtr += 4;
+-
+- inputVectorPtr += 16;
+- }
+-
+- number = sixteenthPoints * 16;
+- for(; number < num_points; number++){
+- outputVector[number] = (float)(inputVector[number]) * iScalar;
+- }
++ unsigned int number = 0;
++ const unsigned int sixteenthPoints = num_points / 16;
++
++ float* outputVectorPtr = outputVector;
++ const float iScalar = 1.0 / scalar;
++ __m128 invScalar = _mm_set_ps1(iScalar);
++ const int8_t* inputVectorPtr = inputVector;
++ __m128 ret;
++ __m128i inputVal;
++ __m128i interimVal;
++
++ for (; number < sixteenthPoints; number++) {
++ inputVal = _mm_load_si128((__m128i*)inputVectorPtr);
++
++ interimVal = _mm_cvtepi8_epi32(inputVal);
++ ret = _mm_cvtepi32_ps(interimVal);
++ ret = _mm_mul_ps(ret, invScalar);
++ _mm_store_ps(outputVectorPtr, ret);
++ outputVectorPtr += 4;
++
++ inputVal = _mm_srli_si128(inputVal, 4);
++ interimVal = _mm_cvtepi8_epi32(inputVal);
++ ret = _mm_cvtepi32_ps(interimVal);
++ ret = _mm_mul_ps(ret, invScalar);
++ _mm_store_ps(outputVectorPtr, ret);
++ outputVectorPtr += 4;
++
++ inputVal = _mm_srli_si128(inputVal, 4);
++ interimVal = _mm_cvtepi8_epi32(inputVal);
++ ret = _mm_cvtepi32_ps(interimVal);
++ ret = _mm_mul_ps(ret, invScalar);
++ _mm_store_ps(outputVectorPtr, ret);
++ outputVectorPtr += 4;
++
++ inputVal = _mm_srli_si128(inputVal, 4);
++ interimVal = _mm_cvtepi8_epi32(inputVal);
++ ret = _mm_cvtepi32_ps(interimVal);
++ ret = _mm_mul_ps(ret, invScalar);
++ _mm_store_ps(outputVectorPtr, ret);
++ outputVectorPtr += 4;
++
++ inputVectorPtr += 16;
++ }
++
++ number = sixteenthPoints * 16;
++ for (; number < num_points; number++) {
++ outputVector[number] = (float)(inputVector[number]) * iScalar;
++ }
+ }
+ #endif /* LV_HAVE_SSE4_1 */
+
+ #ifdef LV_HAVE_NEON
+ #include <arm_neon.h>
+
+-static inline void
+-volk_8i_s32f_convert_32f_neon(float* outputVector, const int8_t* inputVector,
+- const float scalar, unsigned int num_points)
++static inline void volk_8i_s32f_convert_32f_neon(float* outputVector,
++ const int8_t* inputVector,
++ const float scalar,
++ unsigned int num_points)
+ {
+- float* outputVectorPtr = outputVector;
+- const int8_t* inputVectorPtr = inputVector;
+-
+- const float iScalar = 1.0 / scalar;
+- const float32x4_t qiScalar = vdupq_n_f32(iScalar);
+-
+- int8x8x2_t inputVal;
+- float32x4x2_t outputFloat;
+- int16x8_t tmp;
+-
+- unsigned int number = 0;
+- const unsigned int sixteenthPoints = num_points / 16;
+- for(;number < sixteenthPoints; number++){
+- __VOLK_PREFETCH(inputVectorPtr+16);
+-
+- inputVal = vld2_s8(inputVectorPtr);
+- inputVal = vzip_s8(inputVal.val[0], inputVal.val[1]);
+- inputVectorPtr += 16;
+-
+- tmp = vmovl_s8(inputVal.val[0]);
+-
+- outputFloat.val[0] = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp)));
+- outputFloat.val[0] = vmulq_f32(outputFloat.val[0], qiScalar);
+- vst1q_f32(outputVectorPtr, outputFloat.val[0]);
+- outputVectorPtr += 4;
+-
+- outputFloat.val[1] = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp)));
+- outputFloat.val[1] = vmulq_f32(outputFloat.val[1], qiScalar);
+- vst1q_f32(outputVectorPtr, outputFloat.val[1]);
+- outputVectorPtr += 4;
+-
+- tmp = vmovl_s8(inputVal.val[1]);
+-
+- outputFloat.val[0] = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp)));
+- outputFloat.val[0] = vmulq_f32(outputFloat.val[0], qiScalar);
+- vst1q_f32(outputVectorPtr, outputFloat.val[0]);
+- outputVectorPtr += 4;
+-
+- outputFloat.val[1] = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp)));
+- outputFloat.val[1] = vmulq_f32(outputFloat.val[1], qiScalar);
+- vst1q_f32(outputVectorPtr, outputFloat.val[1]);
+- outputVectorPtr += 4;
+- }
+- for(number = sixteenthPoints * 16; number < num_points; number++){
+- *outputVectorPtr++ = ((float)(*inputVectorPtr++)) * iScalar;
+- }
++ float* outputVectorPtr = outputVector;
++ const int8_t* inputVectorPtr = inputVector;
++
++ const float iScalar = 1.0 / scalar;
++ const float32x4_t qiScalar = vdupq_n_f32(iScalar);
++
++ int8x8x2_t inputVal;
++ float32x4x2_t outputFloat;
++ int16x8_t tmp;
++
++ unsigned int number = 0;
++ const unsigned int sixteenthPoints = num_points / 16;
++ for (; number < sixteenthPoints; number++) {
++ __VOLK_PREFETCH(inputVectorPtr + 16);
++
++ inputVal = vld2_s8(inputVectorPtr);
++ inputVal = vzip_s8(inputVal.val[0], inputVal.val[1]);
++ inputVectorPtr += 16;
++
++ tmp = vmovl_s8(inputVal.val[0]);
++
++ outputFloat.val[0] = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp)));
++ outputFloat.val[0] = vmulq_f32(outputFloat.val[0], qiScalar);
++ vst1q_f32(outputVectorPtr, outputFloat.val[0]);
++ outputVectorPtr += 4;
++
++ outputFloat.val[1] = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp)));
++ outputFloat.val[1] = vmulq_f32(outputFloat.val[1], qiScalar);
++ vst1q_f32(outputVectorPtr, outputFloat.val[1]);
++ outputVectorPtr += 4;
++
++ tmp = vmovl_s8(inputVal.val[1]);
++
++ outputFloat.val[0] = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp)));
++ outputFloat.val[0] = vmulq_f32(outputFloat.val[0], qiScalar);
++ vst1q_f32(outputVectorPtr, outputFloat.val[0]);
++ outputVectorPtr += 4;
++
++ outputFloat.val[1] = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp)));
++ outputFloat.val[1] = vmulq_f32(outputFloat.val[1], qiScalar);
++ vst1q_f32(outputVectorPtr, outputFloat.val[1]);
++ outputVectorPtr += 4;
++ }
++ for (number = sixteenthPoints * 16; number < num_points; number++) {
++ *outputVectorPtr++ = ((float)(*inputVectorPtr++)) * iScalar;
++ }
+ }
+
+ #endif /* LV_HAVE_NEON */
+
+ #ifdef LV_HAVE_GENERIC
+
+-static inline void
+-volk_8i_s32f_convert_32f_a_generic(float* outputVector, const int8_t* inputVector,
+- const float scalar, unsigned int num_points)
++static inline void volk_8i_s32f_convert_32f_a_generic(float* outputVector,
++ const int8_t* inputVector,
++ const float scalar,
++ unsigned int num_points)
+ {
+- float* outputVectorPtr = outputVector;
+- const int8_t* inputVectorPtr = inputVector;
+- unsigned int number = 0;
+- const float iScalar = 1.0 / scalar;
+-
+- for(number = 0; number < num_points; number++){
+- *outputVectorPtr++ = ((float)(*inputVectorPtr++)) * iScalar;
+- }
++ float* outputVectorPtr = outputVector;
++ const int8_t* inputVectorPtr = inputVector;
++ unsigned int number = 0;
++ const float iScalar = 1.0 / scalar;
++
++ for (number = 0; number < num_points; number++) {
++ *outputVectorPtr++ = ((float)(*inputVectorPtr++)) * iScalar;
++ }
+ }
+ #endif /* LV_HAVE_GENERIC */
+
+
+ #ifdef LV_HAVE_ORC
+-extern void
+-volk_8i_s32f_convert_32f_a_orc_impl(float* outputVector, const int8_t* inputVector,
+- const float scalar, unsigned int num_points);
+-
+-static inline void
+-volk_8i_s32f_convert_32f_u_orc(float* outputVector, const int8_t* inputVector,
+- const float scalar, unsigned int num_points)
++extern void volk_8i_s32f_convert_32f_a_orc_impl(float* outputVector,
++ const int8_t* inputVector,
++ const float scalar,
++ unsigned int num_points);
++
++static inline void volk_8i_s32f_convert_32f_u_orc(float* outputVector,
++ const int8_t* inputVector,
++ const float scalar,
++ unsigned int num_points)
+ {
+- float invscalar = 1.0 / scalar;
+- volk_8i_s32f_convert_32f_a_orc_impl(outputVector, inputVector, invscalar, num_points);
++ float invscalar = 1.0 / scalar;
++ volk_8i_s32f_convert_32f_a_orc_impl(outputVector, inputVector, invscalar, num_points);
+ }
+ #endif /* LV_HAVE_ORC */
+
+
+-
+ #endif /* INCLUDED_VOLK_8s_CONVERT_32f_ALIGNED8_H */
+-
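A similarly hedged sketch for the scaled int8-to-float kernel above, assuming <volk/volk_8i_s32f_convert_32f.h> is on the include path; the choice of scalar = 128.0f (mapping the int8 range to roughly [-1, 1]) is only an example:

#include <stdint.h>
#include <stdio.h>
#include <volk/volk_8i_s32f_convert_32f.h> /* assumed install path */

int main(void)
{
    const int8_t in[4] = { -128, -64, 64, 127 };
    float out[4];
    /* each sample is multiplied by 1/scalar, so -128 maps to -1.0f here */
    volk_8i_s32f_convert_32f_generic(out, in, 128.0f, 4);
    for (unsigned int i = 0; i < 4; i++)
        printf("%d -> %f\n", in[i], out[i]);
    return 0;
}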
+diff --git a/kernels/volk/volk_8ic_deinterleave_16i_x2.h b/kernels/volk/volk_8ic_deinterleave_16i_x2.h
+index b4cf251..fa998a0 100644
+--- a/kernels/volk/volk_8ic_deinterleave_16i_x2.h
++++ b/kernels/volk/volk_8ic_deinterleave_16i_x2.h
+@@ -30,8 +30,8 @@
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+- * void volk_8ic_deinterleave_16i_x2(int16_t* iBuffer, int16_t* qBuffer, const lv_8sc_t* complexVector, unsigned int num_points)
+- * \endcode
++ * void volk_8ic_deinterleave_16i_x2(int16_t* iBuffer, int16_t* qBuffer, const lv_8sc_t*
++ * complexVector, unsigned int num_points) \endcode
+ *
+ * \b Inputs
+ * \li complexVector: The complex input vector.
+@@ -60,91 +60,150 @@
+ #ifdef LV_HAVE_AVX2
+ #include <immintrin.h>
+
+-static inline void
+-volk_8ic_deinterleave_16i_x2_a_avx2(int16_t* iBuffer, int16_t* qBuffer,
+- const lv_8sc_t* complexVector, unsigned int num_points)
++static inline void volk_8ic_deinterleave_16i_x2_a_avx2(int16_t* iBuffer,
++ int16_t* qBuffer,
++ const lv_8sc_t* complexVector,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+- const int8_t* complexVectorPtr = (int8_t*)complexVector;
+- int16_t* iBufferPtr = iBuffer;
+- int16_t* qBufferPtr = qBuffer;
+- __m256i MoveMask = _mm256_set_epi8(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0, 15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
+- __m256i complexVal, iOutputVal, qOutputVal;
+- __m128i iOutputVal0, qOutputVal0;
+-
+- unsigned int sixteenthPoints = num_points / 16;
+-
+- for(number = 0; number < sixteenthPoints; number++){
+- complexVal = _mm256_load_si256((__m256i*)complexVectorPtr); complexVectorPtr += 32;
+-
+- complexVal = _mm256_shuffle_epi8(complexVal, MoveMask);
+- complexVal = _mm256_permute4x64_epi64(complexVal, 0xd8);
+-
+- iOutputVal0 = _mm256_extracti128_si256(complexVal, 0);
+- qOutputVal0 = _mm256_extracti128_si256(complexVal, 1);
+-
+- iOutputVal = _mm256_cvtepi8_epi16(iOutputVal0);
+- iOutputVal = _mm256_slli_epi16(iOutputVal, 8);
+-
+- qOutputVal = _mm256_cvtepi8_epi16(qOutputVal0);
+- qOutputVal = _mm256_slli_epi16(qOutputVal, 8);
+-
+- _mm256_store_si256((__m256i*)iBufferPtr, iOutputVal);
+- _mm256_store_si256((__m256i*)qBufferPtr, qOutputVal);
+-
+- iBufferPtr += 16;
+- qBufferPtr += 16;
+- }
+-
+- number = sixteenthPoints * 16;
+- for(; number < num_points; number++){
+- *iBufferPtr++ = ((int16_t)*complexVectorPtr++) * 256; // load 8 bit Complexvector into 16 bit, shift left by 8 bits and store
+- *qBufferPtr++ = ((int16_t)*complexVectorPtr++) * 256;
+- }
++ unsigned int number = 0;
++ const int8_t* complexVectorPtr = (int8_t*)complexVector;
++ int16_t* iBufferPtr = iBuffer;
++ int16_t* qBufferPtr = qBuffer;
++ __m256i MoveMask = _mm256_set_epi8(15,
++ 13,
++ 11,
++ 9,
++ 7,
++ 5,
++ 3,
++ 1,
++ 14,
++ 12,
++ 10,
++ 8,
++ 6,
++ 4,
++ 2,
++ 0,
++ 15,
++ 13,
++ 11,
++ 9,
++ 7,
++ 5,
++ 3,
++ 1,
++ 14,
++ 12,
++ 10,
++ 8,
++ 6,
++ 4,
++ 2,
++ 0);
++ __m256i complexVal, iOutputVal, qOutputVal;
++ __m128i iOutputVal0, qOutputVal0;
++
++ unsigned int sixteenthPoints = num_points / 16;
++
++ for (number = 0; number < sixteenthPoints; number++) {
++ complexVal = _mm256_load_si256((__m256i*)complexVectorPtr);
++ complexVectorPtr += 32;
++
++ complexVal = _mm256_shuffle_epi8(complexVal, MoveMask);
++ complexVal = _mm256_permute4x64_epi64(complexVal, 0xd8);
++
++ iOutputVal0 = _mm256_extracti128_si256(complexVal, 0);
++ qOutputVal0 = _mm256_extracti128_si256(complexVal, 1);
++
++ iOutputVal = _mm256_cvtepi8_epi16(iOutputVal0);
++ iOutputVal = _mm256_slli_epi16(iOutputVal, 8);
++
++ qOutputVal = _mm256_cvtepi8_epi16(qOutputVal0);
++ qOutputVal = _mm256_slli_epi16(qOutputVal, 8);
++
++ _mm256_store_si256((__m256i*)iBufferPtr, iOutputVal);
++ _mm256_store_si256((__m256i*)qBufferPtr, qOutputVal);
++
++ iBufferPtr += 16;
++ qBufferPtr += 16;
++ }
++
++ number = sixteenthPoints * 16;
++ for (; number < num_points; number++) {
++ *iBufferPtr++ =
++ ((int16_t)*complexVectorPtr++) *
++ 256; // load 8 bit Complexvector into 16 bit, shift left by 8 bits and store
++ *qBufferPtr++ = ((int16_t)*complexVectorPtr++) * 256;
++ }
+ }
+ #endif /* LV_HAVE_AVX2 */
+
+ #ifdef LV_HAVE_SSE4_1
+ #include <smmintrin.h>
+
+-static inline void
+-volk_8ic_deinterleave_16i_x2_a_sse4_1(int16_t* iBuffer, int16_t* qBuffer,
+- const lv_8sc_t* complexVector, unsigned int num_points)
++static inline void volk_8ic_deinterleave_16i_x2_a_sse4_1(int16_t* iBuffer,
++ int16_t* qBuffer,
++ const lv_8sc_t* complexVector,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+- const int8_t* complexVectorPtr = (int8_t*)complexVector;
+- int16_t* iBufferPtr = iBuffer;
+- int16_t* qBufferPtr = qBuffer;
+- __m128i iMoveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0); // set 16 byte values
+- __m128i qMoveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 15, 13, 11, 9, 7, 5, 3, 1);
+- __m128i complexVal, iOutputVal, qOutputVal;
+-
+- unsigned int eighthPoints = num_points / 8;
+-
+- for(number = 0; number < eighthPoints; number++){
+- complexVal = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; // aligned load
+-
+- iOutputVal = _mm_shuffle_epi8(complexVal, iMoveMask); // shuffle 16 bytes of 128bit complexVal
+- qOutputVal = _mm_shuffle_epi8(complexVal, qMoveMask);
+-
+- iOutputVal = _mm_cvtepi8_epi16(iOutputVal); // fills 2-byte sign extended versions of lower 8 bytes of input to output
+- iOutputVal = _mm_slli_epi16(iOutputVal, 8); // shift in left by 8 bits, each of the 8 16-bit integers, shift in with zeros
+-
+- qOutputVal = _mm_cvtepi8_epi16(qOutputVal);
+- qOutputVal = _mm_slli_epi16(qOutputVal, 8);
+-
+- _mm_store_si128((__m128i*)iBufferPtr, iOutputVal); // aligned store
+- _mm_store_si128((__m128i*)qBufferPtr, qOutputVal);
+-
+- iBufferPtr += 8;
+- qBufferPtr += 8;
+- }
+-
+- number = eighthPoints * 8;
+- for(; number < num_points; number++){
+- *iBufferPtr++ = ((int16_t)*complexVectorPtr++) * 256; // load 8 bit Complexvector into 16 bit, shift left by 8 bits and store
+- *qBufferPtr++ = ((int16_t)*complexVectorPtr++) * 256;
+- }
++ unsigned int number = 0;
++ const int8_t* complexVectorPtr = (int8_t*)complexVector;
++ int16_t* iBufferPtr = iBuffer;
++ int16_t* qBufferPtr = qBuffer;
++ __m128i iMoveMask = _mm_set_epi8(0x80,
++ 0x80,
++ 0x80,
++ 0x80,
++ 0x80,
++ 0x80,
++ 0x80,
++ 0x80,
++ 14,
++ 12,
++ 10,
++ 8,
++ 6,
++ 4,
++ 2,
++ 0); // set 16 byte values
++ __m128i qMoveMask = _mm_set_epi8(
++ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 15, 13, 11, 9, 7, 5, 3, 1);
++ __m128i complexVal, iOutputVal, qOutputVal;
++
++ unsigned int eighthPoints = num_points / 8;
++
++ for (number = 0; number < eighthPoints; number++) {
++ complexVal = _mm_load_si128((__m128i*)complexVectorPtr);
++ complexVectorPtr += 16; // aligned load
++
++ iOutputVal = _mm_shuffle_epi8(complexVal,
++ iMoveMask); // shuffle 16 bytes of 128bit complexVal
++ qOutputVal = _mm_shuffle_epi8(complexVal, qMoveMask);
++
++ iOutputVal = _mm_cvtepi8_epi16(iOutputVal); // fills 2-byte sign extended versions
++ // of lower 8 bytes of input to output
++ iOutputVal =
++ _mm_slli_epi16(iOutputVal, 8); // shift in left by 8 bits, each of the 8
++ // 16-bit integers, shift in with zeros
++
++ qOutputVal = _mm_cvtepi8_epi16(qOutputVal);
++ qOutputVal = _mm_slli_epi16(qOutputVal, 8);
++
++ _mm_store_si128((__m128i*)iBufferPtr, iOutputVal); // aligned store
++ _mm_store_si128((__m128i*)qBufferPtr, qOutputVal);
++
++ iBufferPtr += 8;
++ qBufferPtr += 8;
++ }
++
++ number = eighthPoints * 8;
++ for (; number < num_points; number++) {
++ *iBufferPtr++ =
++ ((int16_t)*complexVectorPtr++) *
++ 256; // load 8 bit Complexvector into 16 bit, shift left by 8 bits and store
++ *qBufferPtr++ = ((int16_t)*complexVectorPtr++) * 256;
++ }
+ }
+ #endif /* LV_HAVE_SSE4_1 */
+
+@@ -152,86 +211,111 @@ volk_8ic_deinterleave_16i_x2_a_sse4_1(int16_t* iBuffer, int16_t* qBuffer,
+ #ifdef LV_HAVE_AVX
+ #include <immintrin.h>
+
+-static inline void
+-volk_8ic_deinterleave_16i_x2_a_avx(int16_t* iBuffer, int16_t* qBuffer,
+- const lv_8sc_t* complexVector, unsigned int num_points)
++static inline void volk_8ic_deinterleave_16i_x2_a_avx(int16_t* iBuffer,
++ int16_t* qBuffer,
++ const lv_8sc_t* complexVector,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+- const int8_t* complexVectorPtr = (int8_t*)complexVector;
+- int16_t* iBufferPtr = iBuffer;
+- int16_t* qBufferPtr = qBuffer;
+- __m128i iMoveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0); // set 16 byte values
+- __m128i qMoveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 15, 13, 11, 9, 7, 5, 3, 1);
+- __m256i complexVal, iOutputVal, qOutputVal;
+- __m128i complexVal1, complexVal0;
+- __m128i iOutputVal1, iOutputVal0, qOutputVal1, qOutputVal0;
+-
+- unsigned int sixteenthPoints = num_points / 16;
+-
+- for(number = 0; number < sixteenthPoints; number++){
+- complexVal = _mm256_load_si256((__m256i*)complexVectorPtr); complexVectorPtr += 32; // aligned load
+-
+- // Extract from complexVal to iOutputVal and qOutputVal
+- complexVal1 = _mm256_extractf128_si256(complexVal, 1);
+- complexVal0 = _mm256_extractf128_si256(complexVal, 0);
+-
+- iOutputVal1 = _mm_shuffle_epi8(complexVal1, iMoveMask); // shuffle 16 bytes of 128bit complexVal
+- iOutputVal0 = _mm_shuffle_epi8(complexVal0, iMoveMask);
+- qOutputVal1 = _mm_shuffle_epi8(complexVal1, qMoveMask);
+- qOutputVal0 = _mm_shuffle_epi8(complexVal0, qMoveMask);
+-
+- iOutputVal1 = _mm_cvtepi8_epi16(iOutputVal1); // fills 2-byte sign extended versions of lower 8 bytes of input to output
+- iOutputVal1 = _mm_slli_epi16(iOutputVal1, 8); // shift in left by 8 bits, each of the 8 16-bit integers, shift in with zeros
+- iOutputVal0 = _mm_cvtepi8_epi16(iOutputVal0);
+- iOutputVal0 = _mm_slli_epi16(iOutputVal0, 8);
+-
+- qOutputVal1 = _mm_cvtepi8_epi16(qOutputVal1);
+- qOutputVal1 = _mm_slli_epi16(qOutputVal1, 8);
+- qOutputVal0 = _mm_cvtepi8_epi16(qOutputVal0);
+- qOutputVal0 = _mm_slli_epi16(qOutputVal0, 8);
+-
+- // Pack iOutputVal0,1 to iOutputVal
+- __m256i dummy = _mm256_setzero_si256();
+- iOutputVal = _mm256_insertf128_si256(dummy, iOutputVal0, 0);
+- iOutputVal = _mm256_insertf128_si256(iOutputVal, iOutputVal1, 1);
+- qOutputVal = _mm256_insertf128_si256(dummy, qOutputVal0, 0);
+- qOutputVal = _mm256_insertf128_si256(qOutputVal, qOutputVal1, 1);
+-
+- _mm256_store_si256((__m256i*)iBufferPtr, iOutputVal); // aligned store
+- _mm256_store_si256((__m256i*)qBufferPtr, qOutputVal);
+-
+- iBufferPtr += 16;
+- qBufferPtr += 16;
+- }
+-
+- number = sixteenthPoints * 16;
+- for(; number < num_points; number++){
+- *iBufferPtr++ = ((int16_t)*complexVectorPtr++) * 256; // load 8 bit Complexvector into 16 bit, shift left by 8 bits and store
+- *qBufferPtr++ = ((int16_t)*complexVectorPtr++) * 256;
+- }
++ unsigned int number = 0;
++ const int8_t* complexVectorPtr = (int8_t*)complexVector;
++ int16_t* iBufferPtr = iBuffer;
++ int16_t* qBufferPtr = qBuffer;
++ __m128i iMoveMask = _mm_set_epi8(0x80,
++ 0x80,
++ 0x80,
++ 0x80,
++ 0x80,
++ 0x80,
++ 0x80,
++ 0x80,
++ 14,
++ 12,
++ 10,
++ 8,
++ 6,
++ 4,
++ 2,
++ 0); // set 16 byte values
++ __m128i qMoveMask = _mm_set_epi8(
++ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 15, 13, 11, 9, 7, 5, 3, 1);
++ __m256i complexVal, iOutputVal, qOutputVal;
++ __m128i complexVal1, complexVal0;
++ __m128i iOutputVal1, iOutputVal0, qOutputVal1, qOutputVal0;
++
++ unsigned int sixteenthPoints = num_points / 16;
++
++ for (number = 0; number < sixteenthPoints; number++) {
++ complexVal = _mm256_load_si256((__m256i*)complexVectorPtr);
++ complexVectorPtr += 32; // aligned load
++
++ // Extract from complexVal to iOutputVal and qOutputVal
++ complexVal1 = _mm256_extractf128_si256(complexVal, 1);
++ complexVal0 = _mm256_extractf128_si256(complexVal, 0);
++
++ iOutputVal1 = _mm_shuffle_epi8(
++ complexVal1, iMoveMask); // shuffle 16 bytes of 128bit complexVal
++ iOutputVal0 = _mm_shuffle_epi8(complexVal0, iMoveMask);
++ qOutputVal1 = _mm_shuffle_epi8(complexVal1, qMoveMask);
++ qOutputVal0 = _mm_shuffle_epi8(complexVal0, qMoveMask);
++
++ iOutputVal1 =
++ _mm_cvtepi8_epi16(iOutputVal1); // fills 2-byte sign extended versions of
++ // lower 8 bytes of input to output
++ iOutputVal1 =
++ _mm_slli_epi16(iOutputVal1, 8); // shift in left by 8 bits, each of the 8
++ // 16-bit integers, shift in with zeros
++ iOutputVal0 = _mm_cvtepi8_epi16(iOutputVal0);
++ iOutputVal0 = _mm_slli_epi16(iOutputVal0, 8);
++
++ qOutputVal1 = _mm_cvtepi8_epi16(qOutputVal1);
++ qOutputVal1 = _mm_slli_epi16(qOutputVal1, 8);
++ qOutputVal0 = _mm_cvtepi8_epi16(qOutputVal0);
++ qOutputVal0 = _mm_slli_epi16(qOutputVal0, 8);
++
++ // Pack iOutputVal0,1 to iOutputVal
++ __m256i dummy = _mm256_setzero_si256();
++ iOutputVal = _mm256_insertf128_si256(dummy, iOutputVal0, 0);
++ iOutputVal = _mm256_insertf128_si256(iOutputVal, iOutputVal1, 1);
++ qOutputVal = _mm256_insertf128_si256(dummy, qOutputVal0, 0);
++ qOutputVal = _mm256_insertf128_si256(qOutputVal, qOutputVal1, 1);
++
++ _mm256_store_si256((__m256i*)iBufferPtr, iOutputVal); // aligned store
++ _mm256_store_si256((__m256i*)qBufferPtr, qOutputVal);
++
++ iBufferPtr += 16;
++ qBufferPtr += 16;
++ }
++
++ number = sixteenthPoints * 16;
++ for (; number < num_points; number++) {
++ *iBufferPtr++ =
++ ((int16_t)*complexVectorPtr++) *
++ 256; // load 8 bit Complexvector into 16 bit, shift left by 8 bits and store
++ *qBufferPtr++ = ((int16_t)*complexVectorPtr++) * 256;
++ }
+ }
+ #endif /* LV_HAVE_AVX */
+
+
+ #ifdef LV_HAVE_GENERIC
+
+-static inline void
+-volk_8ic_deinterleave_16i_x2_generic(int16_t* iBuffer, int16_t* qBuffer,
+- const lv_8sc_t* complexVector, unsigned int num_points)
++static inline void volk_8ic_deinterleave_16i_x2_generic(int16_t* iBuffer,
++ int16_t* qBuffer,
++ const lv_8sc_t* complexVector,
++ unsigned int num_points)
+ {
+- const int8_t* complexVectorPtr = (const int8_t*)complexVector;
+- int16_t* iBufferPtr = iBuffer;
+- int16_t* qBufferPtr = qBuffer;
+- unsigned int number;
+- for(number = 0; number < num_points; number++){
+- *iBufferPtr++ = (int16_t)(*complexVectorPtr++)*256;
+- *qBufferPtr++ = (int16_t)(*complexVectorPtr++)*256;
+- }
++ const int8_t* complexVectorPtr = (const int8_t*)complexVector;
++ int16_t* iBufferPtr = iBuffer;
++ int16_t* qBufferPtr = qBuffer;
++ unsigned int number;
++ for (number = 0; number < num_points; number++) {
++ *iBufferPtr++ = (int16_t)(*complexVectorPtr++) * 256;
++ *qBufferPtr++ = (int16_t)(*complexVectorPtr++) * 256;
++ }
+ }
+ #endif /* LV_HAVE_GENERIC */
+
+
+-
+ #endif /* INCLUDED_volk_8ic_deinterleave_16i_x2_a_H */
+
+ #ifndef INCLUDED_volk_8ic_deinterleave_16i_x2_u_H
+@@ -243,47 +327,82 @@ volk_8ic_deinterleave_16i_x2_generic(int16_t* iBuffer, int16_t* qBuffer,
+ #ifdef LV_HAVE_AVX2
+ #include <immintrin.h>
+
+-static inline void
+-volk_8ic_deinterleave_16i_x2_u_avx2(int16_t* iBuffer, int16_t* qBuffer,
+- const lv_8sc_t* complexVector, unsigned int num_points)
++static inline void volk_8ic_deinterleave_16i_x2_u_avx2(int16_t* iBuffer,
++ int16_t* qBuffer,
++ const lv_8sc_t* complexVector,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+- const int8_t* complexVectorPtr = (int8_t*)complexVector;
+- int16_t* iBufferPtr = iBuffer;
+- int16_t* qBufferPtr = qBuffer;
+- __m256i MoveMask = _mm256_set_epi8(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0, 15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
+- __m256i complexVal, iOutputVal, qOutputVal;
+- __m128i iOutputVal0, qOutputVal0;
+-
+- unsigned int sixteenthPoints = num_points / 16;
+-
+- for(number = 0; number < sixteenthPoints; number++){
+- complexVal = _mm256_loadu_si256((__m256i*)complexVectorPtr); complexVectorPtr += 32;
+-
+- complexVal = _mm256_shuffle_epi8(complexVal, MoveMask);
+- complexVal = _mm256_permute4x64_epi64(complexVal, 0xd8);
+-
+- iOutputVal0 = _mm256_extracti128_si256(complexVal, 0);
+- qOutputVal0 = _mm256_extracti128_si256(complexVal, 1);
+-
+- iOutputVal = _mm256_cvtepi8_epi16(iOutputVal0);
+- iOutputVal = _mm256_slli_epi16(iOutputVal, 8);
+-
+- qOutputVal = _mm256_cvtepi8_epi16(qOutputVal0);
+- qOutputVal = _mm256_slli_epi16(qOutputVal, 8);
+-
+- _mm256_storeu_si256((__m256i*)iBufferPtr, iOutputVal);
+- _mm256_storeu_si256((__m256i*)qBufferPtr, qOutputVal);
+-
+- iBufferPtr += 16;
+- qBufferPtr += 16;
+- }
+-
+- number = sixteenthPoints * 16;
+- for(; number < num_points; number++){
+- *iBufferPtr++ = ((int16_t)*complexVectorPtr++) * 256; // load 8 bit Complexvector into 16 bit, shift left by 8 bits and store
+- *qBufferPtr++ = ((int16_t)*complexVectorPtr++) * 256;
+- }
++ unsigned int number = 0;
++ const int8_t* complexVectorPtr = (int8_t*)complexVector;
++ int16_t* iBufferPtr = iBuffer;
++ int16_t* qBufferPtr = qBuffer;
++ __m256i MoveMask = _mm256_set_epi8(15,
++ 13,
++ 11,
++ 9,
++ 7,
++ 5,
++ 3,
++ 1,
++ 14,
++ 12,
++ 10,
++ 8,
++ 6,
++ 4,
++ 2,
++ 0,
++ 15,
++ 13,
++ 11,
++ 9,
++ 7,
++ 5,
++ 3,
++ 1,
++ 14,
++ 12,
++ 10,
++ 8,
++ 6,
++ 4,
++ 2,
++ 0);
++ __m256i complexVal, iOutputVal, qOutputVal;
++ __m128i iOutputVal0, qOutputVal0;
++
++ unsigned int sixteenthPoints = num_points / 16;
++
++ for (number = 0; number < sixteenthPoints; number++) {
++ complexVal = _mm256_loadu_si256((__m256i*)complexVectorPtr);
++ complexVectorPtr += 32;
++
++ complexVal = _mm256_shuffle_epi8(complexVal, MoveMask);
++ complexVal = _mm256_permute4x64_epi64(complexVal, 0xd8);
++
++ iOutputVal0 = _mm256_extracti128_si256(complexVal, 0);
++ qOutputVal0 = _mm256_extracti128_si256(complexVal, 1);
++
++ iOutputVal = _mm256_cvtepi8_epi16(iOutputVal0);
++ iOutputVal = _mm256_slli_epi16(iOutputVal, 8);
++
++ qOutputVal = _mm256_cvtepi8_epi16(qOutputVal0);
++ qOutputVal = _mm256_slli_epi16(qOutputVal, 8);
++
++ _mm256_storeu_si256((__m256i*)iBufferPtr, iOutputVal);
++ _mm256_storeu_si256((__m256i*)qBufferPtr, qOutputVal);
++
++ iBufferPtr += 16;
++ qBufferPtr += 16;
++ }
++
++ number = sixteenthPoints * 16;
++ for (; number < num_points; number++) {
++ *iBufferPtr++ =
++ ((int16_t)*complexVectorPtr++) *
++ 256; // load 8 bit Complexvector into 16 bit, shift left by 8 bits and store
++ *qBufferPtr++ = ((int16_t)*complexVectorPtr++) * 256;
++ }
+ }
+ #endif /* LV_HAVE_AVX2 */
+ #endif /* INCLUDED_volk_8ic_deinterleave_16i_x2_u_H */
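A small sketch of the deinterleave kernel above, again assuming the installed header name; the interleaved I/Q bytes are illustrative, and the cast relies on lv_8sc_t being a two-byte 8-bit complex sample, which is exactly what the generic implementation assumes when it walks the input as int8_t:

#include <stdint.h>
#include <stdio.h>
#include <volk/volk_8ic_deinterleave_16i_x2.h> /* assumed install path */

int main(void)
{
    /* four interleaved I/Q pairs, stored as raw bytes */
    const int8_t iq[8] = { 1, -1, 2, -2, 3, -3, 4, -4 };
    int16_t i_out[4];
    int16_t q_out[4];
    /* both output streams are scaled by 256 by the generic kernel */
    volk_8ic_deinterleave_16i_x2_generic(i_out, q_out, (const lv_8sc_t*)iq, 4);
    for (unsigned int n = 0; n < 4; n++)
        printf("I=%d Q=%d\n", i_out[n], q_out[n]); /* e.g. I=256 Q=-256 */
    return 0;
}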
+diff --git a/kernels/volk/volk_8ic_deinterleave_real_16i.h b/kernels/volk/volk_8ic_deinterleave_real_16i.h
+index f15879a..aaebb47 100644
+--- a/kernels/volk/volk_8ic_deinterleave_real_16i.h
++++ b/kernels/volk/volk_8ic_deinterleave_real_16i.h
+@@ -30,8 +30,8 @@
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+- * void volk_8ic_deinterleave_real_16i(int16_t* iBuffer, const lv_8sc_t* complexVector, unsigned int num_points)
+- * \endcode
++ * void volk_8ic_deinterleave_real_16i(int16_t* iBuffer, const lv_8sc_t* complexVector,
++ * unsigned int num_points) \endcode
+ *
+ * \b Inputs
+ * \li complexVector: The complex input vector.
+@@ -60,75 +60,109 @@
+ #ifdef LV_HAVE_AVX2
+ #include <immintrin.h>
+
+-static inline void
+-volk_8ic_deinterleave_real_16i_a_avx2(int16_t* iBuffer, const lv_8sc_t* complexVector,
+- unsigned int num_points)
++static inline void volk_8ic_deinterleave_real_16i_a_avx2(int16_t* iBuffer,
++ const lv_8sc_t* complexVector,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+- const int8_t* complexVectorPtr = (int8_t*)complexVector;
+- int16_t* iBufferPtr = iBuffer;
+- __m256i moveMask = _mm256_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0);
+- __m256i complexVal, outputVal;
+- __m128i outputVal0;
+-
+- unsigned int sixteenthPoints = num_points / 16;
+-
+- for(number = 0; number < sixteenthPoints; number++){
+- complexVal = _mm256_load_si256((__m256i*)complexVectorPtr); complexVectorPtr += 32;
+-
+- complexVal = _mm256_shuffle_epi8(complexVal, moveMask);
+- complexVal = _mm256_permute4x64_epi64(complexVal, 0xd8);
+-
+- outputVal0 = _mm256_extractf128_si256(complexVal, 0);
+-
+- outputVal = _mm256_cvtepi8_epi16(outputVal0);
+- outputVal = _mm256_slli_epi16(outputVal, 7);
+-
+- _mm256_store_si256((__m256i*)iBufferPtr, outputVal);
+-
+- iBufferPtr += 16;
+- }
+-
+- number = sixteenthPoints * 16;
+- for(; number < num_points; number++){
+- *iBufferPtr++ = ((int16_t)*complexVectorPtr++) * 128;
+- complexVectorPtr++;
+- }
++ unsigned int number = 0;
++ const int8_t* complexVectorPtr = (int8_t*)complexVector;
++ int16_t* iBufferPtr = iBuffer;
++ __m256i moveMask = _mm256_set_epi8(0x80,
++ 0x80,
++ 0x80,
++ 0x80,
++ 0x80,
++ 0x80,
++ 0x80,
++ 0x80,
++ 14,
++ 12,
++ 10,
++ 8,
++ 6,
++ 4,
++ 2,
++ 0,
++ 0x80,
++ 0x80,
++ 0x80,
++ 0x80,
++ 0x80,
++ 0x80,
++ 0x80,
++ 0x80,
++ 14,
++ 12,
++ 10,
++ 8,
++ 6,
++ 4,
++ 2,
++ 0);
++ __m256i complexVal, outputVal;
++ __m128i outputVal0;
++
++ unsigned int sixteenthPoints = num_points / 16;
++
++ for (number = 0; number < sixteenthPoints; number++) {
++ complexVal = _mm256_load_si256((__m256i*)complexVectorPtr);
++ complexVectorPtr += 32;
++
++ complexVal = _mm256_shuffle_epi8(complexVal, moveMask);
++ complexVal = _mm256_permute4x64_epi64(complexVal, 0xd8);
++
++ outputVal0 = _mm256_extractf128_si256(complexVal, 0);
++
++ outputVal = _mm256_cvtepi8_epi16(outputVal0);
++ outputVal = _mm256_slli_epi16(outputVal, 7);
++
++ _mm256_store_si256((__m256i*)iBufferPtr, outputVal);
++
++ iBufferPtr += 16;
++ }
++
++ number = sixteenthPoints * 16;
++ for (; number < num_points; number++) {
++ *iBufferPtr++ = ((int16_t)*complexVectorPtr++) * 128;
++ complexVectorPtr++;
++ }
+ }
+ #endif /* LV_HAVE_AVX2 */
+
+ #ifdef LV_HAVE_SSE4_1
+ #include <smmintrin.h>
+
+-static inline void
+-volk_8ic_deinterleave_real_16i_a_sse4_1(int16_t* iBuffer, const lv_8sc_t* complexVector,
+- unsigned int num_points)
++static inline void volk_8ic_deinterleave_real_16i_a_sse4_1(int16_t* iBuffer,
++ const lv_8sc_t* complexVector,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+- const int8_t* complexVectorPtr = (int8_t*)complexVector;
+- int16_t* iBufferPtr = iBuffer;
+- __m128i moveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0);
+- __m128i complexVal, outputVal;
++ unsigned int number = 0;
++ const int8_t* complexVectorPtr = (int8_t*)complexVector;
++ int16_t* iBufferPtr = iBuffer;
++ __m128i moveMask = _mm_set_epi8(
++ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0);
++ __m128i complexVal, outputVal;
+
+- unsigned int eighthPoints = num_points / 8;
++ unsigned int eighthPoints = num_points / 8;
+
+- for(number = 0; number < eighthPoints; number++){
+- complexVal = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16;
++ for (number = 0; number < eighthPoints; number++) {
++ complexVal = _mm_load_si128((__m128i*)complexVectorPtr);
++ complexVectorPtr += 16;
+
+- complexVal = _mm_shuffle_epi8(complexVal, moveMask);
++ complexVal = _mm_shuffle_epi8(complexVal, moveMask);
+
+- outputVal = _mm_cvtepi8_epi16(complexVal);
+- outputVal = _mm_slli_epi16(outputVal, 7);
++ outputVal = _mm_cvtepi8_epi16(complexVal);
++ outputVal = _mm_slli_epi16(outputVal, 7);
+
+- _mm_store_si128((__m128i*)iBufferPtr, outputVal);
+- iBufferPtr += 8;
+- }
++ _mm_store_si128((__m128i*)iBufferPtr, outputVal);
++ iBufferPtr += 8;
++ }
+
+- number = eighthPoints * 8;
+- for(; number < num_points; number++){
+- *iBufferPtr++ = ((int16_t)*complexVectorPtr++) * 128;
+- complexVectorPtr++;
+- }
++ number = eighthPoints * 8;
++ for (; number < num_points; number++) {
++ *iBufferPtr++ = ((int16_t)*complexVectorPtr++) * 128;
++ complexVectorPtr++;
++ }
+ }
+ #endif /* LV_HAVE_SSE4_1 */
+
+@@ -136,63 +170,65 @@ volk_8ic_deinterleave_real_16i_a_sse4_1(int16_t* iBuffer, const lv_8sc_t* comple
+ #ifdef LV_HAVE_AVX
+ #include <immintrin.h>
+
+-static inline void
+-volk_8ic_deinterleave_real_16i_a_avx(int16_t* iBuffer, const lv_8sc_t* complexVector,
+- unsigned int num_points)
++static inline void volk_8ic_deinterleave_real_16i_a_avx(int16_t* iBuffer,
++ const lv_8sc_t* complexVector,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+- const int8_t* complexVectorPtr = (int8_t*)complexVector;
+- int16_t* iBufferPtr = iBuffer;
+- __m128i moveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0);
+- __m256i complexVal, outputVal;
+- __m128i complexVal1, complexVal0, outputVal1, outputVal0;
+-
+- unsigned int sixteenthPoints = num_points / 16;
+-
+- for(number = 0; number < sixteenthPoints; number++){
+- complexVal = _mm256_load_si256((__m256i*)complexVectorPtr); complexVectorPtr += 32;
+-
+- complexVal1 = _mm256_extractf128_si256(complexVal, 1);
+- complexVal0 = _mm256_extractf128_si256(complexVal, 0);
+-
+- outputVal1 = _mm_shuffle_epi8(complexVal1, moveMask);
+- outputVal0 = _mm_shuffle_epi8(complexVal0, moveMask);
+-
+- outputVal1 = _mm_cvtepi8_epi16(outputVal1);
+- outputVal1 = _mm_slli_epi16(outputVal1, 7);
+- outputVal0 = _mm_cvtepi8_epi16(outputVal0);
+- outputVal0 = _mm_slli_epi16(outputVal0, 7);
+-
+- __m256i dummy = _mm256_setzero_si256();
+- outputVal = _mm256_insertf128_si256(dummy, outputVal0, 0);
+- outputVal = _mm256_insertf128_si256(outputVal, outputVal1, 1);
+- _mm256_store_si256((__m256i*)iBufferPtr, outputVal);
+-
+- iBufferPtr += 16;
+- }
+-
+- number = sixteenthPoints * 16;
+- for(; number < num_points; number++){
+- *iBufferPtr++ = ((int16_t)*complexVectorPtr++) * 128;
+- complexVectorPtr++;
+- }
++ unsigned int number = 0;
++ const int8_t* complexVectorPtr = (int8_t*)complexVector;
++ int16_t* iBufferPtr = iBuffer;
++ __m128i moveMask = _mm_set_epi8(
++ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0);
++ __m256i complexVal, outputVal;
++ __m128i complexVal1, complexVal0, outputVal1, outputVal0;
++
++ unsigned int sixteenthPoints = num_points / 16;
++
++ for (number = 0; number < sixteenthPoints; number++) {
++ complexVal = _mm256_load_si256((__m256i*)complexVectorPtr);
++ complexVectorPtr += 32;
++
++ complexVal1 = _mm256_extractf128_si256(complexVal, 1);
++ complexVal0 = _mm256_extractf128_si256(complexVal, 0);
++
++ outputVal1 = _mm_shuffle_epi8(complexVal1, moveMask);
++ outputVal0 = _mm_shuffle_epi8(complexVal0, moveMask);
++
++ outputVal1 = _mm_cvtepi8_epi16(outputVal1);
++ outputVal1 = _mm_slli_epi16(outputVal1, 7);
++ outputVal0 = _mm_cvtepi8_epi16(outputVal0);
++ outputVal0 = _mm_slli_epi16(outputVal0, 7);
++
++ __m256i dummy = _mm256_setzero_si256();
++ outputVal = _mm256_insertf128_si256(dummy, outputVal0, 0);
++ outputVal = _mm256_insertf128_si256(outputVal, outputVal1, 1);
++ _mm256_store_si256((__m256i*)iBufferPtr, outputVal);
++
++ iBufferPtr += 16;
++ }
++
++ number = sixteenthPoints * 16;
++ for (; number < num_points; number++) {
++ *iBufferPtr++ = ((int16_t)*complexVectorPtr++) * 128;
++ complexVectorPtr++;
++ }
+ }
+ #endif /* LV_HAVE_AVX */
+
+
+ #ifdef LV_HAVE_GENERIC
+
+-static inline void
+-volk_8ic_deinterleave_real_16i_generic(int16_t* iBuffer, const lv_8sc_t* complexVector,
+- unsigned int num_points)
++static inline void volk_8ic_deinterleave_real_16i_generic(int16_t* iBuffer,
++ const lv_8sc_t* complexVector,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+- const int8_t* complexVectorPtr = (const int8_t*)complexVector;
+- int16_t* iBufferPtr = iBuffer;
+- for(number = 0; number < num_points; number++){
+- *iBufferPtr++ = ((int16_t)(*complexVectorPtr++)) * 128;
+- complexVectorPtr++;
+- }
++ unsigned int number = 0;
++ const int8_t* complexVectorPtr = (const int8_t*)complexVector;
++ int16_t* iBufferPtr = iBuffer;
++ for (number = 0; number < num_points; number++) {
++ *iBufferPtr++ = ((int16_t)(*complexVectorPtr++)) * 128;
++ complexVectorPtr++;
++ }
+ }
+ #endif /* LV_HAVE_GENERIC */
+
+@@ -209,40 +245,72 @@ volk_8ic_deinterleave_real_16i_generic(int16_t* iBuffer, const lv_8sc_t* complex
+ #ifdef LV_HAVE_AVX2
+ #include <immintrin.h>
+
+-static inline void
+-volk_8ic_deinterleave_real_16i_u_avx2(int16_t* iBuffer, const lv_8sc_t* complexVector,
+- unsigned int num_points)
++static inline void volk_8ic_deinterleave_real_16i_u_avx2(int16_t* iBuffer,
++ const lv_8sc_t* complexVector,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+- const int8_t* complexVectorPtr = (int8_t*)complexVector;
+- int16_t* iBufferPtr = iBuffer;
+- __m256i moveMask = _mm256_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0);
+- __m256i complexVal, outputVal;
+- __m128i outputVal0;
+-
+- unsigned int sixteenthPoints = num_points / 16;
+-
+- for(number = 0; number < sixteenthPoints; number++){
+- complexVal = _mm256_loadu_si256((__m256i*)complexVectorPtr); complexVectorPtr += 32;
+-
+- complexVal = _mm256_shuffle_epi8(complexVal, moveMask);
+- complexVal = _mm256_permute4x64_epi64(complexVal, 0xd8);
+-
+- outputVal0 = _mm256_extractf128_si256(complexVal, 0);
+-
+- outputVal = _mm256_cvtepi8_epi16(outputVal0);
+- outputVal = _mm256_slli_epi16(outputVal, 7);
+-
+- _mm256_storeu_si256((__m256i*)iBufferPtr, outputVal);
+-
+- iBufferPtr += 16;
+- }
+-
+- number = sixteenthPoints * 16;
+- for(; number < num_points; number++){
+- *iBufferPtr++ = ((int16_t)*complexVectorPtr++) * 128;
+- complexVectorPtr++;
+- }
++ unsigned int number = 0;
++ const int8_t* complexVectorPtr = (int8_t*)complexVector;
++ int16_t* iBufferPtr = iBuffer;
++ __m256i moveMask = _mm256_set_epi8(0x80,
++ 0x80,
++ 0x80,
++ 0x80,
++ 0x80,
++ 0x80,
++ 0x80,
++ 0x80,
++ 14,
++ 12,
++ 10,
++ 8,
++ 6,
++ 4,
++ 2,
++ 0,
++ 0x80,
++ 0x80,
++ 0x80,
++ 0x80,
++ 0x80,
++ 0x80,
++ 0x80,
++ 0x80,
++ 14,
++ 12,
++ 10,
++ 8,
++ 6,
++ 4,
++ 2,
++ 0);
++ __m256i complexVal, outputVal;
++ __m128i outputVal0;
++
++ unsigned int sixteenthPoints = num_points / 16;
++
++ for (number = 0; number < sixteenthPoints; number++) {
++ complexVal = _mm256_loadu_si256((__m256i*)complexVectorPtr);
++ complexVectorPtr += 32;
++
++ complexVal = _mm256_shuffle_epi8(complexVal, moveMask);
++ complexVal = _mm256_permute4x64_epi64(complexVal, 0xd8);
++
++ outputVal0 = _mm256_extractf128_si256(complexVal, 0);
++
++ outputVal = _mm256_cvtepi8_epi16(outputVal0);
++ outputVal = _mm256_slli_epi16(outputVal, 7);
++
++ _mm256_storeu_si256((__m256i*)iBufferPtr, outputVal);
++
++ iBufferPtr += 16;
++ }
++
++ number = sixteenthPoints * 16;
++ for (; number < num_points; number++) {
++ *iBufferPtr++ = ((int16_t)*complexVectorPtr++) * 128;
++ complexVectorPtr++;
++ }
+ }
+ #endif /* LV_HAVE_AVX2 */
+ #endif /* INCLUDED_volk_8ic_deinterleave_real_16i_u_H */
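And one for the real-part extraction kernel above, under the same assumptions as the previous sketch; note that this generic implementation scales by 128 rather than 256:

#include <stdint.h>
#include <stdio.h>
#include <volk/volk_8ic_deinterleave_real_16i.h> /* assumed install path */

int main(void)
{
    const int8_t iq[8] = { 10, -10, 20, -20, 30, -30, 40, -40 }; /* I,Q pairs */
    int16_t i_out[4];
    /* keeps only the I samples, each multiplied by 128 */
    volk_8ic_deinterleave_real_16i_generic(i_out, (const lv_8sc_t*)iq, 4);
    for (unsigned int n = 0; n < 4; n++)
        printf("%d\n", i_out[n]); /* 1280 2560 3840 5120 */
    return 0;
}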
+diff --git a/kernels/volk/volk_8ic_deinterleave_real_8i.h b/kernels/volk/volk_8ic_deinterleave_real_8i.h
+index 6cc3f15..a1a835d 100644
+--- a/kernels/volk/volk_8ic_deinterleave_real_8i.h
++++ b/kernels/volk/volk_8ic_deinterleave_real_8i.h
+@@ -30,8 +30,8 @@
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+- * void volk_8ic_deinterleave_real_8i(int8_t* iBuffer, const lv_8sc_t* complexVector, unsigned int num_points)
+- * \endcode
++ * void volk_8ic_deinterleave_real_8i(int8_t* iBuffer, const lv_8sc_t* complexVector,
++ * unsigned int num_points) \endcode
+ *
+ * \b Inputs
+ * \li complexVector: The complex input vector.
+@@ -59,40 +59,102 @@
+ #ifdef LV_HAVE_AVX2
+ #include <immintrin.h>
+
+-static inline void
+-volk_8ic_deinterleave_real_8i_a_avx2(int8_t* iBuffer, const lv_8sc_t* complexVector,
+- unsigned int num_points)
++static inline void volk_8ic_deinterleave_real_8i_a_avx2(int8_t* iBuffer,
++ const lv_8sc_t* complexVector,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+- const int8_t* complexVectorPtr = (int8_t*)complexVector;
+- int8_t* iBufferPtr = iBuffer;
+- __m256i moveMask1 = _mm256_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0);
+- __m256i moveMask2 = _mm256_set_epi8(14, 12, 10, 8, 6, 4, 2, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
+- __m256i complexVal1, complexVal2, outputVal;
+-
+- unsigned int thirtysecondPoints = num_points / 32;
+-
+- for(number = 0; number < thirtysecondPoints; number++){
+-
+- complexVal1 = _mm256_load_si256((__m256i*)complexVectorPtr);
+- complexVectorPtr += 32;
+- complexVal2 = _mm256_load_si256((__m256i*)complexVectorPtr);
+- complexVectorPtr += 32;
+-
+- complexVal1 = _mm256_shuffle_epi8(complexVal1, moveMask1);
+- complexVal2 = _mm256_shuffle_epi8(complexVal2, moveMask2);
+- outputVal = _mm256_or_si256(complexVal1, complexVal2);
+- outputVal = _mm256_permute4x64_epi64(outputVal, 0xd8);
+-
+- _mm256_store_si256((__m256i*)iBufferPtr, outputVal);
+- iBufferPtr += 32;
+- }
+-
+- number = thirtysecondPoints * 32;
+- for(; number < num_points; number++){
+- *iBufferPtr++ = *complexVectorPtr++;
+- complexVectorPtr++;
+- }
++ unsigned int number = 0;
++ const int8_t* complexVectorPtr = (int8_t*)complexVector;
++ int8_t* iBufferPtr = iBuffer;
++ __m256i moveMask1 = _mm256_set_epi8(0x80,
++ 0x80,
++ 0x80,
++ 0x80,
++ 0x80,
++ 0x80,
++ 0x80,
++ 0x80,
++ 14,
++ 12,
++ 10,
++ 8,
++ 6,
++ 4,
++ 2,
++ 0,
++ 0x80,
++ 0x80,
++ 0x80,
++ 0x80,
++ 0x80,
++ 0x80,
++ 0x80,
++ 0x80,
++ 14,
++ 12,
++ 10,
++ 8,
++ 6,
++ 4,
++ 2,
++ 0);
++ __m256i moveMask2 = _mm256_set_epi8(14,
++ 12,
++ 10,
++ 8,
++ 6,
++ 4,
++ 2,
++ 0,
++ 0x80,
++ 0x80,
++ 0x80,
++ 0x80,
++ 0x80,
++ 0x80,
++ 0x80,
++ 0x80,
++ 14,
++ 12,
++ 10,
++ 8,
++ 6,
++ 4,
++ 2,
++ 0,
++ 0x80,
++ 0x80,
++ 0x80,
++ 0x80,
++ 0x80,
++ 0x80,
++ 0x80,
++ 0x80);
++ __m256i complexVal1, complexVal2, outputVal;
++
++ unsigned int thirtysecondPoints = num_points / 32;
++
++ for (number = 0; number < thirtysecondPoints; number++) {
++
++ complexVal1 = _mm256_load_si256((__m256i*)complexVectorPtr);
++ complexVectorPtr += 32;
++ complexVal2 = _mm256_load_si256((__m256i*)complexVectorPtr);
++ complexVectorPtr += 32;
++
++ complexVal1 = _mm256_shuffle_epi8(complexVal1, moveMask1);
++ complexVal2 = _mm256_shuffle_epi8(complexVal2, moveMask2);
++ outputVal = _mm256_or_si256(complexVal1, complexVal2);
++ outputVal = _mm256_permute4x64_epi64(outputVal, 0xd8);
++
++ _mm256_store_si256((__m256i*)iBufferPtr, outputVal);
++ iBufferPtr += 32;
++ }
++
++ number = thirtysecondPoints * 32;
++ for (; number < num_points; number++) {
++ *iBufferPtr++ = *complexVectorPtr++;
++ complexVectorPtr++;
++ }
+ }
+ #endif /* LV_HAVE_AVX2 */
+
+@@ -100,37 +162,41 @@ volk_8ic_deinterleave_real_8i_a_avx2(int8_t* iBuffer, const lv_8sc_t* complexVec
+ #ifdef LV_HAVE_SSSE3
+ #include <tmmintrin.h>
+
+-static inline void
+-volk_8ic_deinterleave_real_8i_a_ssse3(int8_t* iBuffer, const lv_8sc_t* complexVector,
+- unsigned int num_points)
++static inline void volk_8ic_deinterleave_real_8i_a_ssse3(int8_t* iBuffer,
++ const lv_8sc_t* complexVector,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+- const int8_t* complexVectorPtr = (int8_t*)complexVector;
+- int8_t* iBufferPtr = iBuffer;
+- __m128i moveMask1 = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0);
+- __m128i moveMask2 = _mm_set_epi8(14, 12, 10, 8, 6, 4, 2, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
+- __m128i complexVal1, complexVal2, outputVal;
+-
+- unsigned int sixteenthPoints = num_points / 16;
+-
+- for(number = 0; number < sixteenthPoints; number++){
+- complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16;
+- complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16;
+-
+- complexVal1 = _mm_shuffle_epi8(complexVal1, moveMask1);
+- complexVal2 = _mm_shuffle_epi8(complexVal2, moveMask2);
+-
+- outputVal = _mm_or_si128(complexVal1, complexVal2);
+-
+- _mm_store_si128((__m128i*)iBufferPtr, outputVal);
+- iBufferPtr += 16;
+- }
+-
+- number = sixteenthPoints * 16;
+- for(; number < num_points; number++){
+- *iBufferPtr++ = *complexVectorPtr++;
+- complexVectorPtr++;
+- }
++ unsigned int number = 0;
++ const int8_t* complexVectorPtr = (int8_t*)complexVector;
++ int8_t* iBufferPtr = iBuffer;
++ __m128i moveMask1 = _mm_set_epi8(
++ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0);
++ __m128i moveMask2 = _mm_set_epi8(
++ 14, 12, 10, 8, 6, 4, 2, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
++ __m128i complexVal1, complexVal2, outputVal;
++
++ unsigned int sixteenthPoints = num_points / 16;
++
++ for (number = 0; number < sixteenthPoints; number++) {
++ complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr);
++ complexVectorPtr += 16;
++ complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr);
++ complexVectorPtr += 16;
++
++ complexVal1 = _mm_shuffle_epi8(complexVal1, moveMask1);
++ complexVal2 = _mm_shuffle_epi8(complexVal2, moveMask2);
++
++ outputVal = _mm_or_si128(complexVal1, complexVal2);
++
++ _mm_store_si128((__m128i*)iBufferPtr, outputVal);
++ iBufferPtr += 16;
++ }
++
++ number = sixteenthPoints * 16;
++ for (; number < num_points; number++) {
++ *iBufferPtr++ = *complexVectorPtr++;
++ complexVectorPtr++;
++ }
+ }
+ #endif /* LV_HAVE_SSSE3 */
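The SSSE3 kernel relies on the PSHUFB zeroing rule: a mask byte with the high bit set (0x80) produces a zero output byte, otherwise the low four bits select a source byte. That is why moveMask1 packs the first load's real parts into the low eight bytes, moveMask2 packs the second load's real parts into the high eight bytes, and a single _mm_or_si128 merges them into sixteen output samples. A scalar model of the shuffle (illustrative, not part of this diff):

    #include <stdint.h>

    /* Scalar model of _mm_shuffle_epi8 (PSHUFB): high bit set in the mask
       byte zeroes the output byte, otherwise the low four bits index the
       source. */
    static inline void pshufb_model(const uint8_t src[16], const uint8_t mask[16], uint8_t dst[16])
    {
        for (int i = 0; i < 16; i++)
            dst[i] = (mask[i] & 0x80) ? 0 : src[mask[i] & 0x0f];
    }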
+
+@@ -138,72 +204,75 @@ volk_8ic_deinterleave_real_8i_a_ssse3(int8_t* iBuffer, const lv_8sc_t* complexVe
+ #ifdef LV_HAVE_AVX
+ #include <immintrin.h>
+
+-static inline void
+-volk_8ic_deinterleave_real_8i_a_avx(int8_t* iBuffer, const lv_8sc_t* complexVector,
+- unsigned int num_points)
++static inline void volk_8ic_deinterleave_real_8i_a_avx(int8_t* iBuffer,
++ const lv_8sc_t* complexVector,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+- const int8_t* complexVectorPtr = (int8_t*)complexVector;
+- int8_t* iBufferPtr = iBuffer;
+- __m128i moveMaskL = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0);
+- __m128i moveMaskH = _mm_set_epi8(14, 12, 10, 8, 6, 4, 2, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
+- __m256i complexVal1, complexVal2, outputVal;
+- __m128i complexVal1H, complexVal1L, complexVal2H, complexVal2L, outputVal1, outputVal2;
+-
+- unsigned int thirtysecondPoints = num_points / 32;
+-
+- for(number = 0; number < thirtysecondPoints; number++){
+-
+- complexVal1 = _mm256_load_si256((__m256i*)complexVectorPtr);
+- complexVectorPtr += 32;
+- complexVal2 = _mm256_load_si256((__m256i*)complexVectorPtr);
+- complexVectorPtr += 32;
+-
+- complexVal1H = _mm256_extractf128_si256(complexVal1, 1);
+- complexVal1L = _mm256_extractf128_si256(complexVal1, 0);
+- complexVal2H = _mm256_extractf128_si256(complexVal2, 1);
+- complexVal2L = _mm256_extractf128_si256(complexVal2, 0);
+-
+- complexVal1H = _mm_shuffle_epi8(complexVal1H, moveMaskH);
+- complexVal1L = _mm_shuffle_epi8(complexVal1L, moveMaskL);
+- outputVal1 = _mm_or_si128(complexVal1H, complexVal1L);
+-
+-
+- complexVal2H = _mm_shuffle_epi8(complexVal2H, moveMaskH);
+- complexVal2L = _mm_shuffle_epi8(complexVal2L, moveMaskL);
+- outputVal2 = _mm_or_si128(complexVal2H, complexVal2L);
+-
+- __m256i dummy = _mm256_setzero_si256();
+- outputVal = _mm256_insertf128_si256(dummy, outputVal1, 0);
+- outputVal = _mm256_insertf128_si256(outputVal, outputVal2, 1);
+-
+-
+- _mm256_store_si256((__m256i*)iBufferPtr, outputVal);
+- iBufferPtr += 32;
+- }
+-
+- number = thirtysecondPoints * 32;
+- for(; number < num_points; number++){
+- *iBufferPtr++ = *complexVectorPtr++;
+- complexVectorPtr++;
+- }
++ unsigned int number = 0;
++ const int8_t* complexVectorPtr = (int8_t*)complexVector;
++ int8_t* iBufferPtr = iBuffer;
++ __m128i moveMaskL = _mm_set_epi8(
++ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0);
++ __m128i moveMaskH = _mm_set_epi8(
++ 14, 12, 10, 8, 6, 4, 2, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
++ __m256i complexVal1, complexVal2, outputVal;
++ __m128i complexVal1H, complexVal1L, complexVal2H, complexVal2L, outputVal1,
++ outputVal2;
++
++ unsigned int thirtysecondPoints = num_points / 32;
++
++ for (number = 0; number < thirtysecondPoints; number++) {
++
++ complexVal1 = _mm256_load_si256((__m256i*)complexVectorPtr);
++ complexVectorPtr += 32;
++ complexVal2 = _mm256_load_si256((__m256i*)complexVectorPtr);
++ complexVectorPtr += 32;
++
++ complexVal1H = _mm256_extractf128_si256(complexVal1, 1);
++ complexVal1L = _mm256_extractf128_si256(complexVal1, 0);
++ complexVal2H = _mm256_extractf128_si256(complexVal2, 1);
++ complexVal2L = _mm256_extractf128_si256(complexVal2, 0);
++
++ complexVal1H = _mm_shuffle_epi8(complexVal1H, moveMaskH);
++ complexVal1L = _mm_shuffle_epi8(complexVal1L, moveMaskL);
++ outputVal1 = _mm_or_si128(complexVal1H, complexVal1L);
++
++
++ complexVal2H = _mm_shuffle_epi8(complexVal2H, moveMaskH);
++ complexVal2L = _mm_shuffle_epi8(complexVal2L, moveMaskL);
++ outputVal2 = _mm_or_si128(complexVal2H, complexVal2L);
++
++ __m256i dummy = _mm256_setzero_si256();
++ outputVal = _mm256_insertf128_si256(dummy, outputVal1, 0);
++ outputVal = _mm256_insertf128_si256(outputVal, outputVal2, 1);
++
++
++ _mm256_store_si256((__m256i*)iBufferPtr, outputVal);
++ iBufferPtr += 32;
++ }
++
++ number = thirtysecondPoints * 32;
++ for (; number < num_points; number++) {
++ *iBufferPtr++ = *complexVectorPtr++;
++ complexVectorPtr++;
++ }
+ }
+ #endif /* LV_HAVE_AVX */
+
+
+ #ifdef LV_HAVE_GENERIC
+
+-static inline void
+-volk_8ic_deinterleave_real_8i_generic(int8_t* iBuffer, const lv_8sc_t* complexVector,
+- unsigned int num_points)
++static inline void volk_8ic_deinterleave_real_8i_generic(int8_t* iBuffer,
++ const lv_8sc_t* complexVector,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+- const int8_t* complexVectorPtr = (int8_t*)complexVector;
+- int8_t* iBufferPtr = iBuffer;
+- for(number = 0; number < num_points; number++){
+- *iBufferPtr++ = *complexVectorPtr++;
+- complexVectorPtr++;
+- }
++ unsigned int number = 0;
++ const int8_t* complexVectorPtr = (int8_t*)complexVector;
++ int8_t* iBufferPtr = iBuffer;
++ for (number = 0; number < num_points; number++) {
++ *iBufferPtr++ = *complexVectorPtr++;
++ complexVectorPtr++;
++ }
+ }
+ #endif /* LV_HAVE_GENERIC */
+
+@@ -211,26 +280,27 @@ volk_8ic_deinterleave_real_8i_generic(int8_t* iBuffer, const lv_8sc_t* complexVe
+ #ifdef LV_HAVE_NEON
+ #include <arm_neon.h>
+
+-static inline void
+-volk_8ic_deinterleave_real_8i_neon(int8_t* iBuffer, const lv_8sc_t* complexVector, unsigned int num_points)
++static inline void volk_8ic_deinterleave_real_8i_neon(int8_t* iBuffer,
++ const lv_8sc_t* complexVector,
++ unsigned int num_points)
+ {
+- unsigned int number;
+- unsigned int sixteenth_points = num_points / 16;
+-
+- int8x16x2_t input_vector;
+- for(number=0; number < sixteenth_points; ++number) {
+- input_vector = vld2q_s8((int8_t*) complexVector );
+- vst1q_s8(iBuffer, input_vector.val[0]);
+- iBuffer += 16;
+- complexVector += 16;
+- }
+-
+- const int8_t* complexVectorPtr = (int8_t*)complexVector;
+- int8_t* iBufferPtr = iBuffer;
+- for(number = sixteenth_points*16; number < num_points; number++){
+- *iBufferPtr++ = *complexVectorPtr++;
+- complexVectorPtr++;
+- }
++ unsigned int number;
++ unsigned int sixteenth_points = num_points / 16;
++
++ int8x16x2_t input_vector;
++ for (number = 0; number < sixteenth_points; ++number) {
++ input_vector = vld2q_s8((int8_t*)complexVector);
++ vst1q_s8(iBuffer, input_vector.val[0]);
++ iBuffer += 16;
++ complexVector += 16;
++ }
++
++ const int8_t* complexVectorPtr = (int8_t*)complexVector;
++ int8_t* iBufferPtr = iBuffer;
++ for (number = sixteenth_points * 16; number < num_points; number++) {
++ *iBufferPtr++ = *complexVectorPtr++;
++ complexVectorPtr++;
++ }
+ }
+ #endif /* LV_HAVE_NEON */
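On NEON the de-interleave comes for free: vld2q_s8 is a structure load that places even-indexed bytes (the real parts) in val[0] and odd-indexed bytes (the imaginary parts) in val[1]. A sketch of the same idea for a hypothetical imaginary-part kernel (names are illustrative, tail handling omitted, not part of this diff):

    #include <arm_neon.h>
    #include <stdint.h>

    /* vld2q_s8 already de-interleaves; keeping the imaginary parts just
       means storing val[1] instead of val[0]. */
    static inline void deinterleave_imag_8i_neon_sketch(int8_t* qBuffer,
                                                        const int8_t* complexVector,
                                                        unsigned int sixteenth_points)
    {
        for (unsigned int n = 0; n < sixteenth_points; ++n) {
            int8x16x2_t v = vld2q_s8(complexVector);
            vst1q_s8(qBuffer, v.val[1]); /* odd bytes = imaginary parts */
            qBuffer += 16;
            complexVector += 32; /* 16 complex int8 samples = 32 bytes */
        }
    }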
+
+@@ -246,40 +316,102 @@ volk_8ic_deinterleave_real_8i_neon(int8_t* iBuffer, const lv_8sc_t* complexVecto
+ #ifdef LV_HAVE_AVX2
+ #include <immintrin.h>
+
+-static inline void
+-volk_8ic_deinterleave_real_8i_u_avx2(int8_t* iBuffer, const lv_8sc_t* complexVector,
+- unsigned int num_points)
++static inline void volk_8ic_deinterleave_real_8i_u_avx2(int8_t* iBuffer,
++ const lv_8sc_t* complexVector,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+- const int8_t* complexVectorPtr = (int8_t*)complexVector;
+- int8_t* iBufferPtr = iBuffer;
+- __m256i moveMask1 = _mm256_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0);
+- __m256i moveMask2 = _mm256_set_epi8(14, 12, 10, 8, 6, 4, 2, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
+- __m256i complexVal1, complexVal2, outputVal;
+-
+- unsigned int thirtysecondPoints = num_points / 32;
+-
+- for(number = 0; number < thirtysecondPoints; number++){
+-
+- complexVal1 = _mm256_loadu_si256((__m256i*)complexVectorPtr);
+- complexVectorPtr += 32;
+- complexVal2 = _mm256_loadu_si256((__m256i*)complexVectorPtr);
+- complexVectorPtr += 32;
+-
+- complexVal1 = _mm256_shuffle_epi8(complexVal1, moveMask1);
+- complexVal2 = _mm256_shuffle_epi8(complexVal2, moveMask2);
+- outputVal = _mm256_or_si256(complexVal1, complexVal2);
+- outputVal = _mm256_permute4x64_epi64(outputVal, 0xd8);
+-
+- _mm256_storeu_si256((__m256i*)iBufferPtr, outputVal);
+- iBufferPtr += 32;
+- }
+-
+- number = thirtysecondPoints * 32;
+- for(; number < num_points; number++){
+- *iBufferPtr++ = *complexVectorPtr++;
+- complexVectorPtr++;
+- }
++ unsigned int number = 0;
++ const int8_t* complexVectorPtr = (int8_t*)complexVector;
++ int8_t* iBufferPtr = iBuffer;
++ __m256i moveMask1 = _mm256_set_epi8(0x80,
++ 0x80,
++ 0x80,
++ 0x80,
++ 0x80,
++ 0x80,
++ 0x80,
++ 0x80,
++ 14,
++ 12,
++ 10,
++ 8,
++ 6,
++ 4,
++ 2,
++ 0,
++ 0x80,
++ 0x80,
++ 0x80,
++ 0x80,
++ 0x80,
++ 0x80,
++ 0x80,
++ 0x80,
++ 14,
++ 12,
++ 10,
++ 8,
++ 6,
++ 4,
++ 2,
++ 0);
++ __m256i moveMask2 = _mm256_set_epi8(14,
++ 12,
++ 10,
++ 8,
++ 6,
++ 4,
++ 2,
++ 0,
++ 0x80,
++ 0x80,
++ 0x80,
++ 0x80,
++ 0x80,
++ 0x80,
++ 0x80,
++ 0x80,
++ 14,
++ 12,
++ 10,
++ 8,
++ 6,
++ 4,
++ 2,
++ 0,
++ 0x80,
++ 0x80,
++ 0x80,
++ 0x80,
++ 0x80,
++ 0x80,
++ 0x80,
++ 0x80);
++ __m256i complexVal1, complexVal2, outputVal;
++
++ unsigned int thirtysecondPoints = num_points / 32;
++
++ for (number = 0; number < thirtysecondPoints; number++) {
++
++ complexVal1 = _mm256_loadu_si256((__m256i*)complexVectorPtr);
++ complexVectorPtr += 32;
++ complexVal2 = _mm256_loadu_si256((__m256i*)complexVectorPtr);
++ complexVectorPtr += 32;
++
++ complexVal1 = _mm256_shuffle_epi8(complexVal1, moveMask1);
++ complexVal2 = _mm256_shuffle_epi8(complexVal2, moveMask2);
++ outputVal = _mm256_or_si256(complexVal1, complexVal2);
++ outputVal = _mm256_permute4x64_epi64(outputVal, 0xd8);
++
++ _mm256_storeu_si256((__m256i*)iBufferPtr, outputVal);
++ iBufferPtr += 32;
++ }
++
++ number = thirtysecondPoints * 32;
++ for (; number < num_points; number++) {
++ *iBufferPtr++ = *complexVectorPtr++;
++ complexVectorPtr++;
++ }
+ }
+ #endif /* LV_HAVE_AVX2 */
+
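For context, these protokernels are normally reached through the generated dispatcher rather than called directly. A caller sketch, assuming the usual VOLK entry points (volk_malloc, volk_free, volk_get_alignment and the dispatcher named in the header comment above); everything outside this diff is illustrative:

    #include <volk/volk.h>

    void example_deinterleave_real_8i(const lv_8sc_t* in, unsigned int num_points)
    {
        size_t alignment = volk_get_alignment();
        int8_t* reals = (int8_t*)volk_malloc(num_points * sizeof(int8_t), alignment);

        /* The dispatcher selects among the protokernels above (AVX2, SSSE3,
           AVX, NEON, generic) based on the machine and pointer alignment. */
        volk_8ic_deinterleave_real_8i(reals, in, num_points);

        volk_free(reals);
    }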
+diff --git a/kernels/volk/volk_8ic_s32f_deinterleave_32f_x2.h b/kernels/volk/volk_8ic_s32f_deinterleave_32f_x2.h
+index 736f7c0..f622752 100644
+--- a/kernels/volk/volk_8ic_s32f_deinterleave_32f_x2.h
++++ b/kernels/volk/volk_8ic_s32f_deinterleave_32f_x2.h
+@@ -31,8 +31,8 @@
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+- * void volk_8ic_s32f_deinterleave_32f_x2(float* iBuffer, float* qBuffer, const lv_8sc_t* complexVector, const float scalar, unsigned int num_points)
+- * \endcode
++ * void volk_8ic_s32f_deinterleave_32f_x2(float* iBuffer, float* qBuffer, const lv_8sc_t*
++ * complexVector, const float scalar, unsigned int num_points) \endcode
+ *
+ * \b Inputs
+ * \li complexVector: The complex input vector.
+@@ -56,74 +56,79 @@
+ #ifndef INCLUDED_volk_8ic_s32f_deinterleave_32f_x2_a_H
+ #define INCLUDED_volk_8ic_s32f_deinterleave_32f_x2_a_H
+
+-#include <volk/volk_common.h>
+ #include <inttypes.h>
+ #include <stdio.h>
++#include <volk/volk_common.h>
+
+
+ #ifdef LV_HAVE_SSE4_1
+ #include <smmintrin.h>
+
+ static inline void
+-volk_8ic_s32f_deinterleave_32f_x2_a_sse4_1(float* iBuffer, float* qBuffer, const lv_8sc_t* complexVector,
+- const float scalar, unsigned int num_points)
++volk_8ic_s32f_deinterleave_32f_x2_a_sse4_1(float* iBuffer,
++ float* qBuffer,
++ const lv_8sc_t* complexVector,
++ const float scalar,
++ unsigned int num_points)
+ {
+- float* iBufferPtr = iBuffer;
+- float* qBufferPtr = qBuffer;
+-
+- unsigned int number = 0;
+- const unsigned int eighthPoints = num_points / 8;
+- __m128 iFloatValue, qFloatValue;
+-
+- const float iScalar= 1.0 / scalar;
+- __m128 invScalar = _mm_set_ps1(iScalar);
+- __m128i complexVal, iIntVal, qIntVal, iComplexVal, qComplexVal;
+- int8_t* complexVectorPtr = (int8_t*)complexVector;
+-
+- __m128i iMoveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0);
+- __m128i qMoveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 15, 13, 11, 9, 7, 5, 3, 1);
+-
+- for(;number < eighthPoints; number++){
+- complexVal = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16;
+- iComplexVal = _mm_shuffle_epi8(complexVal, iMoveMask);
+- qComplexVal = _mm_shuffle_epi8(complexVal, qMoveMask);
+-
+- iIntVal = _mm_cvtepi8_epi32(iComplexVal);
+- iFloatValue = _mm_cvtepi32_ps(iIntVal);
+- iFloatValue = _mm_mul_ps(iFloatValue, invScalar);
+- _mm_store_ps(iBufferPtr, iFloatValue);
+- iBufferPtr += 4;
+-
+- iComplexVal = _mm_srli_si128(iComplexVal, 4);
+-
+- iIntVal = _mm_cvtepi8_epi32(iComplexVal);
+- iFloatValue = _mm_cvtepi32_ps(iIntVal);
+- iFloatValue = _mm_mul_ps(iFloatValue, invScalar);
+- _mm_store_ps(iBufferPtr, iFloatValue);
+- iBufferPtr += 4;
+-
+- qIntVal = _mm_cvtepi8_epi32(qComplexVal);
+- qFloatValue = _mm_cvtepi32_ps(qIntVal);
+- qFloatValue = _mm_mul_ps(qFloatValue, invScalar);
+- _mm_store_ps(qBufferPtr, qFloatValue);
+- qBufferPtr += 4;
+-
+- qComplexVal = _mm_srli_si128(qComplexVal, 4);
+-
+- qIntVal = _mm_cvtepi8_epi32(qComplexVal);
+- qFloatValue = _mm_cvtepi32_ps(qIntVal);
+- qFloatValue = _mm_mul_ps(qFloatValue, invScalar);
+- _mm_store_ps(qBufferPtr, qFloatValue);
+-
+- qBufferPtr += 4;
+- }
+-
+- number = eighthPoints * 8;
+- for(; number < num_points; number++){
+- *iBufferPtr++ = (float)(*complexVectorPtr++) * iScalar;
+- *qBufferPtr++ = (float)(*complexVectorPtr++) * iScalar;
+- }
+-
++ float* iBufferPtr = iBuffer;
++ float* qBufferPtr = qBuffer;
++
++ unsigned int number = 0;
++ const unsigned int eighthPoints = num_points / 8;
++ __m128 iFloatValue, qFloatValue;
++
++ const float iScalar = 1.0 / scalar;
++ __m128 invScalar = _mm_set_ps1(iScalar);
++ __m128i complexVal, iIntVal, qIntVal, iComplexVal, qComplexVal;
++ int8_t* complexVectorPtr = (int8_t*)complexVector;
++
++ __m128i iMoveMask = _mm_set_epi8(
++ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0);
++ __m128i qMoveMask = _mm_set_epi8(
++ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 15, 13, 11, 9, 7, 5, 3, 1);
++
++ for (; number < eighthPoints; number++) {
++ complexVal = _mm_load_si128((__m128i*)complexVectorPtr);
++ complexVectorPtr += 16;
++ iComplexVal = _mm_shuffle_epi8(complexVal, iMoveMask);
++ qComplexVal = _mm_shuffle_epi8(complexVal, qMoveMask);
++
++ iIntVal = _mm_cvtepi8_epi32(iComplexVal);
++ iFloatValue = _mm_cvtepi32_ps(iIntVal);
++ iFloatValue = _mm_mul_ps(iFloatValue, invScalar);
++ _mm_store_ps(iBufferPtr, iFloatValue);
++ iBufferPtr += 4;
++
++ iComplexVal = _mm_srli_si128(iComplexVal, 4);
++
++ iIntVal = _mm_cvtepi8_epi32(iComplexVal);
++ iFloatValue = _mm_cvtepi32_ps(iIntVal);
++ iFloatValue = _mm_mul_ps(iFloatValue, invScalar);
++ _mm_store_ps(iBufferPtr, iFloatValue);
++ iBufferPtr += 4;
++
++ qIntVal = _mm_cvtepi8_epi32(qComplexVal);
++ qFloatValue = _mm_cvtepi32_ps(qIntVal);
++ qFloatValue = _mm_mul_ps(qFloatValue, invScalar);
++ _mm_store_ps(qBufferPtr, qFloatValue);
++ qBufferPtr += 4;
++
++ qComplexVal = _mm_srli_si128(qComplexVal, 4);
++
++ qIntVal = _mm_cvtepi8_epi32(qComplexVal);
++ qFloatValue = _mm_cvtepi32_ps(qIntVal);
++ qFloatValue = _mm_mul_ps(qFloatValue, invScalar);
++ _mm_store_ps(qBufferPtr, qFloatValue);
++
++ qBufferPtr += 4;
++ }
++
++ number = eighthPoints * 8;
++ for (; number < num_points; number++) {
++ *iBufferPtr++ = (float)(*complexVectorPtr++) * iScalar;
++ *qBufferPtr++ = (float)(*complexVectorPtr++) * iScalar;
++ }
+ }
+ #endif /* LV_HAVE_SSE4_1 */
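Two details of the SSE4.1 path above are worth spelling out: _mm_cvtepi8_epi32 widens only the low four bytes of its argument, which is why each shuffled register is converted in two halves separated by _mm_srli_si128(…, 4); and the scaling precomputes iScalar = 1.0 / scalar and multiplies, whereas the tail of the plain SSE variant below divides, so the last few samples can differ in the last bit. A scalar model of one I/Q pair (illustrative, not part of this diff):

    #include <stdint.h>

    /* Per-sample pipeline of the SSE4.1/AVX2 paths: widen the int8 sample,
       convert to float, multiply by the precomputed reciprocal. */
    static inline void deinterleave_scale_sample(int8_t re, int8_t im, float scalar,
                                                 float* i_out, float* q_out)
    {
        const float inv_scalar = 1.0f / scalar;
        *i_out = (float)re * inv_scalar;
        *q_out = (float)im * inv_scalar;
    }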
+
+@@ -131,59 +136,60 @@ volk_8ic_s32f_deinterleave_32f_x2_a_sse4_1(float* iBuffer, float* qBuffer, const
+ #ifdef LV_HAVE_SSE
+ #include <xmmintrin.h>
+
+-static inline void
+-volk_8ic_s32f_deinterleave_32f_x2_a_sse(float* iBuffer, float* qBuffer,
+- const lv_8sc_t* complexVector,
+- const float scalar, unsigned int num_points)
++static inline void volk_8ic_s32f_deinterleave_32f_x2_a_sse(float* iBuffer,
++ float* qBuffer,
++ const lv_8sc_t* complexVector,
++ const float scalar,
++ unsigned int num_points)
+ {
+- float* iBufferPtr = iBuffer;
+- float* qBufferPtr = qBuffer;
++ float* iBufferPtr = iBuffer;
++ float* qBufferPtr = qBuffer;
+
+- unsigned int number = 0;
+- const unsigned int quarterPoints = num_points / 4;
+- __m128 cplxValue1, cplxValue2, iValue, qValue;
++ unsigned int number = 0;
++ const unsigned int quarterPoints = num_points / 4;
++ __m128 cplxValue1, cplxValue2, iValue, qValue;
+
+- __m128 invScalar = _mm_set_ps1(1.0/scalar);
+- int8_t* complexVectorPtr = (int8_t*)complexVector;
++ __m128 invScalar = _mm_set_ps1(1.0 / scalar);
++ int8_t* complexVectorPtr = (int8_t*)complexVector;
+
+- __VOLK_ATTR_ALIGNED(16) float floatBuffer[8];
++ __VOLK_ATTR_ALIGNED(16) float floatBuffer[8];
+
+- for(;number < quarterPoints; number++){
+- floatBuffer[0] = (float)(complexVectorPtr[0]);
+- floatBuffer[1] = (float)(complexVectorPtr[1]);
+- floatBuffer[2] = (float)(complexVectorPtr[2]);
+- floatBuffer[3] = (float)(complexVectorPtr[3]);
++ for (; number < quarterPoints; number++) {
++ floatBuffer[0] = (float)(complexVectorPtr[0]);
++ floatBuffer[1] = (float)(complexVectorPtr[1]);
++ floatBuffer[2] = (float)(complexVectorPtr[2]);
++ floatBuffer[3] = (float)(complexVectorPtr[3]);
+
+- floatBuffer[4] = (float)(complexVectorPtr[4]);
+- floatBuffer[5] = (float)(complexVectorPtr[5]);
+- floatBuffer[6] = (float)(complexVectorPtr[6]);
+- floatBuffer[7] = (float)(complexVectorPtr[7]);
++ floatBuffer[4] = (float)(complexVectorPtr[4]);
++ floatBuffer[5] = (float)(complexVectorPtr[5]);
++ floatBuffer[6] = (float)(complexVectorPtr[6]);
++ floatBuffer[7] = (float)(complexVectorPtr[7]);
+
+- cplxValue1 = _mm_load_ps(&floatBuffer[0]);
+- cplxValue2 = _mm_load_ps(&floatBuffer[4]);
++ cplxValue1 = _mm_load_ps(&floatBuffer[0]);
++ cplxValue2 = _mm_load_ps(&floatBuffer[4]);
+
+- complexVectorPtr += 8;
++ complexVectorPtr += 8;
+
+- cplxValue1 = _mm_mul_ps(cplxValue1, invScalar);
+- cplxValue2 = _mm_mul_ps(cplxValue2, invScalar);
++ cplxValue1 = _mm_mul_ps(cplxValue1, invScalar);
++ cplxValue2 = _mm_mul_ps(cplxValue2, invScalar);
+
+- // Arrange in i1i2i3i4 format
+- iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0));
+- qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1));
++ // Arrange in i1i2i3i4 format
++ iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2, 0, 2, 0));
++ qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3, 1, 3, 1));
+
+- _mm_store_ps(iBufferPtr, iValue);
+- _mm_store_ps(qBufferPtr, qValue);
++ _mm_store_ps(iBufferPtr, iValue);
++ _mm_store_ps(qBufferPtr, qValue);
+
+- iBufferPtr += 4;
+- qBufferPtr += 4;
+- }
++ iBufferPtr += 4;
++ qBufferPtr += 4;
++ }
+
+- number = quarterPoints * 4;
+- complexVectorPtr = (int8_t*)&complexVector[number];
+- for(; number < num_points; number++){
+- *iBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
+- *qBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
+- }
++ number = quarterPoints * 4;
++ complexVectorPtr = (int8_t*)&complexVector[number];
++ for (; number < num_points; number++) {
++ *iBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
++ *qBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
++ }
+ }
+ #endif /* LV_HAVE_SSE */
+
+@@ -191,70 +197,127 @@ volk_8ic_s32f_deinterleave_32f_x2_a_sse(float* iBuffer, float* qBuffer,
+ #ifdef LV_HAVE_AVX2
+ #include <immintrin.h>
+
+-static inline void
+-volk_8ic_s32f_deinterleave_32f_x2_a_avx2(float* iBuffer, float* qBuffer, const lv_8sc_t* complexVector,
+- const float scalar, unsigned int num_points)
++static inline void volk_8ic_s32f_deinterleave_32f_x2_a_avx2(float* iBuffer,
++ float* qBuffer,
++ const lv_8sc_t* complexVector,
++ const float scalar,
++ unsigned int num_points)
+ {
+- float* iBufferPtr = iBuffer;
+- float* qBufferPtr = qBuffer;
+-
+- unsigned int number = 0;
+- const unsigned int sixteenthPoints = num_points / 16;
+- __m256 iFloatValue, qFloatValue;
+-
+- const float iScalar= 1.0 / scalar;
+- __m256 invScalar = _mm256_set1_ps(iScalar);
+- __m256i complexVal, iIntVal, qIntVal, iComplexVal, qComplexVal;
+- int8_t* complexVectorPtr = (int8_t*)complexVector;
+-
+- __m256i iMoveMask = _mm256_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+- 14, 12, 10, 8, 6, 4, 2, 0,
+- 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+- 14, 12, 10, 8, 6, 4, 2, 0);
+- __m256i qMoveMask = _mm256_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+- 15, 13, 11, 9, 7, 5, 3, 1,
+- 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+- 15, 13, 11, 9, 7, 5, 3, 1);
+-
+- for(;number < sixteenthPoints; number++){
+- complexVal = _mm256_load_si256((__m256i*)complexVectorPtr);
+- complexVectorPtr += 32;
+- iComplexVal = _mm256_shuffle_epi8(complexVal, iMoveMask);
+- qComplexVal = _mm256_shuffle_epi8(complexVal, qMoveMask);
+-
+- iIntVal = _mm256_cvtepi8_epi32(_mm256_castsi256_si128(iComplexVal));
+- iFloatValue = _mm256_cvtepi32_ps(iIntVal);
+- iFloatValue = _mm256_mul_ps(iFloatValue, invScalar);
+- _mm256_store_ps(iBufferPtr, iFloatValue);
+- iBufferPtr += 8;
+-
+- iComplexVal = _mm256_permute4x64_epi64(iComplexVal, 0b11000110);
+- iIntVal = _mm256_cvtepi8_epi32(_mm256_castsi256_si128(iComplexVal));
+- iFloatValue = _mm256_cvtepi32_ps(iIntVal);
+- iFloatValue = _mm256_mul_ps(iFloatValue, invScalar);
+- _mm256_store_ps(iBufferPtr, iFloatValue);
+- iBufferPtr += 8;
+-
+- qIntVal = _mm256_cvtepi8_epi32(_mm256_castsi256_si128(qComplexVal));
+- qFloatValue = _mm256_cvtepi32_ps(qIntVal);
+- qFloatValue = _mm256_mul_ps(qFloatValue, invScalar);
+- _mm256_store_ps(qBufferPtr, qFloatValue);
+- qBufferPtr += 8;
+-
+- qComplexVal = _mm256_permute4x64_epi64(qComplexVal, 0b11000110);
+- qIntVal = _mm256_cvtepi8_epi32(_mm256_castsi256_si128(qComplexVal));
+- qFloatValue = _mm256_cvtepi32_ps(qIntVal);
+- qFloatValue = _mm256_mul_ps(qFloatValue, invScalar);
+- _mm256_store_ps(qBufferPtr, qFloatValue);
+- qBufferPtr += 8;
+- }
+-
+- number = sixteenthPoints * 16;
+- for(; number < num_points; number++){
+- *iBufferPtr++ = (float)(*complexVectorPtr++) * iScalar;
+- *qBufferPtr++ = (float)(*complexVectorPtr++) * iScalar;
+- }
+-
++ float* iBufferPtr = iBuffer;
++ float* qBufferPtr = qBuffer;
++
++ unsigned int number = 0;
++ const unsigned int sixteenthPoints = num_points / 16;
++ __m256 iFloatValue, qFloatValue;
++
++ const float iScalar = 1.0 / scalar;
++ __m256 invScalar = _mm256_set1_ps(iScalar);
++ __m256i complexVal, iIntVal, qIntVal, iComplexVal, qComplexVal;
++ int8_t* complexVectorPtr = (int8_t*)complexVector;
++
++ __m256i iMoveMask = _mm256_set_epi8(0x80,
++ 0x80,
++ 0x80,
++ 0x80,
++ 0x80,
++ 0x80,
++ 0x80,
++ 0x80,
++ 14,
++ 12,
++ 10,
++ 8,
++ 6,
++ 4,
++ 2,
++ 0,
++ 0x80,
++ 0x80,
++ 0x80,
++ 0x80,
++ 0x80,
++ 0x80,
++ 0x80,
++ 0x80,
++ 14,
++ 12,
++ 10,
++ 8,
++ 6,
++ 4,
++ 2,
++ 0);
++ __m256i qMoveMask = _mm256_set_epi8(0x80,
++ 0x80,
++ 0x80,
++ 0x80,
++ 0x80,
++ 0x80,
++ 0x80,
++ 0x80,
++ 15,
++ 13,
++ 11,
++ 9,
++ 7,
++ 5,
++ 3,
++ 1,
++ 0x80,
++ 0x80,
++ 0x80,
++ 0x80,
++ 0x80,
++ 0x80,
++ 0x80,
++ 0x80,
++ 15,
++ 13,
++ 11,
++ 9,
++ 7,
++ 5,
++ 3,
++ 1);
++
++ for (; number < sixteenthPoints; number++) {
++ complexVal = _mm256_load_si256((__m256i*)complexVectorPtr);
++ complexVectorPtr += 32;
++ iComplexVal = _mm256_shuffle_epi8(complexVal, iMoveMask);
++ qComplexVal = _mm256_shuffle_epi8(complexVal, qMoveMask);
++
++ iIntVal = _mm256_cvtepi8_epi32(_mm256_castsi256_si128(iComplexVal));
++ iFloatValue = _mm256_cvtepi32_ps(iIntVal);
++ iFloatValue = _mm256_mul_ps(iFloatValue, invScalar);
++ _mm256_store_ps(iBufferPtr, iFloatValue);
++ iBufferPtr += 8;
++
++ iComplexVal = _mm256_permute4x64_epi64(iComplexVal, 0b11000110);
++ iIntVal = _mm256_cvtepi8_epi32(_mm256_castsi256_si128(iComplexVal));
++ iFloatValue = _mm256_cvtepi32_ps(iIntVal);
++ iFloatValue = _mm256_mul_ps(iFloatValue, invScalar);
++ _mm256_store_ps(iBufferPtr, iFloatValue);
++ iBufferPtr += 8;
++
++ qIntVal = _mm256_cvtepi8_epi32(_mm256_castsi256_si128(qComplexVal));
++ qFloatValue = _mm256_cvtepi32_ps(qIntVal);
++ qFloatValue = _mm256_mul_ps(qFloatValue, invScalar);
++ _mm256_store_ps(qBufferPtr, qFloatValue);
++ qBufferPtr += 8;
++
++ qComplexVal = _mm256_permute4x64_epi64(qComplexVal, 0b11000110);
++ qIntVal = _mm256_cvtepi8_epi32(_mm256_castsi256_si128(qComplexVal));
++ qFloatValue = _mm256_cvtepi32_ps(qIntVal);
++ qFloatValue = _mm256_mul_ps(qFloatValue, invScalar);
++ _mm256_store_ps(qBufferPtr, qFloatValue);
++ qBufferPtr += 8;
++ }
++
++ number = sixteenthPoints * 16;
++ for (; number < num_points; number++) {
++ *iBufferPtr++ = (float)(*complexVectorPtr++) * iScalar;
++ *qBufferPtr++ = (float)(*complexVectorPtr++) * iScalar;
++ }
+ }
+ #endif /* LV_HAVE_AVX2 */
+
+@@ -262,19 +325,21 @@ volk_8ic_s32f_deinterleave_32f_x2_a_avx2(float* iBuffer, float* qBuffer, const l
+ #ifdef LV_HAVE_GENERIC
+
+ static inline void
+-volk_8ic_s32f_deinterleave_32f_x2_generic(float* iBuffer, float* qBuffer,
++volk_8ic_s32f_deinterleave_32f_x2_generic(float* iBuffer,
++ float* qBuffer,
+ const lv_8sc_t* complexVector,
+- const float scalar, unsigned int num_points)
++ const float scalar,
++ unsigned int num_points)
+ {
+- const int8_t* complexVectorPtr = (const int8_t*)complexVector;
+- float* iBufferPtr = iBuffer;
+- float* qBufferPtr = qBuffer;
+- unsigned int number;
+- const float invScalar = 1.0 / scalar;
+- for(number = 0; number < num_points; number++){
+- *iBufferPtr++ = (float)(*complexVectorPtr++)*invScalar;
+- *qBufferPtr++ = (float)(*complexVectorPtr++)*invScalar;
+- }
++ const int8_t* complexVectorPtr = (const int8_t*)complexVector;
++ float* iBufferPtr = iBuffer;
++ float* qBufferPtr = qBuffer;
++ unsigned int number;
++ const float invScalar = 1.0 / scalar;
++ for (number = 0; number < num_points; number++) {
++ *iBufferPtr++ = (float)(*complexVectorPtr++) * invScalar;
++ *qBufferPtr++ = (float)(*complexVectorPtr++) * invScalar;
++ }
+ }
+ #endif /* LV_HAVE_GENERIC */
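A usage sketch for this kernel, following the dispatcher prototype quoted in the header comment above (allocation helpers as in the earlier example are assumed, not shown in this diff): choosing scalar = 128.0f maps the int8 range [-128, 127] onto roughly [-1.0, 0.992].

    #include <volk/volk.h>

    void example_deinterleave_32f_x2(const lv_8sc_t* in, unsigned int num_points)
    {
        size_t alignment = volk_get_alignment();
        float* i_out = (float*)volk_malloc(num_points * sizeof(float), alignment);
        float* q_out = (float*)volk_malloc(num_points * sizeof(float), alignment);

        volk_8ic_s32f_deinterleave_32f_x2(i_out, q_out, in, 128.0f, num_points);

        volk_free(i_out);
        volk_free(q_out);
    }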
+
+@@ -285,75 +350,107 @@ volk_8ic_s32f_deinterleave_32f_x2_generic(float* iBuffer, float* qBuffer,
+ #ifndef INCLUDED_volk_8ic_s32f_deinterleave_32f_x2_u_H
+ #define INCLUDED_volk_8ic_s32f_deinterleave_32f_x2_u_H
+
+-#include <volk/volk_common.h>
+ #include <inttypes.h>
+ #include <stdio.h>
++#include <volk/volk_common.h>
+
+ #ifdef LV_HAVE_AVX2
+ #include <immintrin.h>
+
+-static inline void
+-volk_8ic_s32f_deinterleave_32f_x2_u_avx2(float* iBuffer, float* qBuffer, const lv_8sc_t* complexVector,
+- const float scalar, unsigned int num_points)
++static inline void volk_8ic_s32f_deinterleave_32f_x2_u_avx2(float* iBuffer,
++ float* qBuffer,
++ const lv_8sc_t* complexVector,
++ const float scalar,
++ unsigned int num_points)
+ {
+- float* iBufferPtr = iBuffer;
+- float* qBufferPtr = qBuffer;
+-
+- unsigned int number = 0;
+- const unsigned int sixteenthPoints = num_points / 16;
+- __m256 iFloatValue, qFloatValue;
+-
+- const float iScalar= 1.0 / scalar;
+- __m256 invScalar = _mm256_set1_ps(iScalar);
+- __m256i complexVal, iIntVal, qIntVal;
+- __m128i iComplexVal, qComplexVal;
+- int8_t* complexVectorPtr = (int8_t*)complexVector;
+-
+- __m256i MoveMask = _mm256_set_epi8(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8,
+- 6, 4, 2, 0,15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
+-
+- for(;number < sixteenthPoints; number++){
+- complexVal = _mm256_loadu_si256((__m256i*)complexVectorPtr); complexVectorPtr += 32;
+- complexVal = _mm256_shuffle_epi8(complexVal, MoveMask);
+- complexVal = _mm256_permute4x64_epi64(complexVal,0xd8);
+- iComplexVal = _mm256_extractf128_si256(complexVal,0);
+- qComplexVal = _mm256_extractf128_si256(complexVal,1);
+-
+- iIntVal = _mm256_cvtepi8_epi32(iComplexVal);
+- iFloatValue = _mm256_cvtepi32_ps(iIntVal);
+- iFloatValue = _mm256_mul_ps(iFloatValue, invScalar);
+- _mm256_storeu_ps(iBufferPtr, iFloatValue);
+- iBufferPtr += 8;
+-
+- qIntVal = _mm256_cvtepi8_epi32(qComplexVal);
+- qFloatValue = _mm256_cvtepi32_ps(qIntVal);
+- qFloatValue = _mm256_mul_ps(qFloatValue, invScalar);
+- _mm256_storeu_ps(qBufferPtr, qFloatValue);
+- qBufferPtr += 8;
+-
+- complexVal = _mm256_srli_si256(complexVal, 8);
+- iComplexVal = _mm256_extractf128_si256(complexVal,0);
+- qComplexVal = _mm256_extractf128_si256(complexVal,1);
+-
+- iIntVal = _mm256_cvtepi8_epi32(iComplexVal);
+- iFloatValue = _mm256_cvtepi32_ps(iIntVal);
+- iFloatValue = _mm256_mul_ps(iFloatValue, invScalar);
+- _mm256_storeu_ps(iBufferPtr, iFloatValue);
+- iBufferPtr += 8;
+-
+- qIntVal = _mm256_cvtepi8_epi32(qComplexVal);
+- qFloatValue = _mm256_cvtepi32_ps(qIntVal);
+- qFloatValue = _mm256_mul_ps(qFloatValue, invScalar);
+- _mm256_storeu_ps(qBufferPtr, qFloatValue);
+- qBufferPtr += 8;
+- }
+-
+- number = sixteenthPoints * 16;
+- for(; number < num_points; number++){
+- *iBufferPtr++ = (float)(*complexVectorPtr++) * iScalar;
+- *qBufferPtr++ = (float)(*complexVectorPtr++) * iScalar;
+- }
+-
++ float* iBufferPtr = iBuffer;
++ float* qBufferPtr = qBuffer;
++
++ unsigned int number = 0;
++ const unsigned int sixteenthPoints = num_points / 16;
++ __m256 iFloatValue, qFloatValue;
++
++ const float iScalar = 1.0 / scalar;
++ __m256 invScalar = _mm256_set1_ps(iScalar);
++ __m256i complexVal, iIntVal, qIntVal;
++ __m128i iComplexVal, qComplexVal;
++ int8_t* complexVectorPtr = (int8_t*)complexVector;
++
++ __m256i MoveMask = _mm256_set_epi8(15,
++ 13,
++ 11,
++ 9,
++ 7,
++ 5,
++ 3,
++ 1,
++ 14,
++ 12,
++ 10,
++ 8,
++ 6,
++ 4,
++ 2,
++ 0,
++ 15,
++ 13,
++ 11,
++ 9,
++ 7,
++ 5,
++ 3,
++ 1,
++ 14,
++ 12,
++ 10,
++ 8,
++ 6,
++ 4,
++ 2,
++ 0);
++
++ for (; number < sixteenthPoints; number++) {
++ complexVal = _mm256_loadu_si256((__m256i*)complexVectorPtr);
++ complexVectorPtr += 32;
++ complexVal = _mm256_shuffle_epi8(complexVal, MoveMask);
++ complexVal = _mm256_permute4x64_epi64(complexVal, 0xd8);
++ iComplexVal = _mm256_extractf128_si256(complexVal, 0);
++ qComplexVal = _mm256_extractf128_si256(complexVal, 1);
++
++ iIntVal = _mm256_cvtepi8_epi32(iComplexVal);
++ iFloatValue = _mm256_cvtepi32_ps(iIntVal);
++ iFloatValue = _mm256_mul_ps(iFloatValue, invScalar);
++ _mm256_storeu_ps(iBufferPtr, iFloatValue);
++ iBufferPtr += 8;
++
++ qIntVal = _mm256_cvtepi8_epi32(qComplexVal);
++ qFloatValue = _mm256_cvtepi32_ps(qIntVal);
++ qFloatValue = _mm256_mul_ps(qFloatValue, invScalar);
++ _mm256_storeu_ps(qBufferPtr, qFloatValue);
++ qBufferPtr += 8;
++
++ complexVal = _mm256_srli_si256(complexVal, 8);
++ iComplexVal = _mm256_extractf128_si256(complexVal, 0);
++ qComplexVal = _mm256_extractf128_si256(complexVal, 1);
++
++ iIntVal = _mm256_cvtepi8_epi32(iComplexVal);
++ iFloatValue = _mm256_cvtepi32_ps(iIntVal);
++ iFloatValue = _mm256_mul_ps(iFloatValue, invScalar);
++ _mm256_storeu_ps(iBufferPtr, iFloatValue);
++ iBufferPtr += 8;
++
++ qIntVal = _mm256_cvtepi8_epi32(qComplexVal);
++ qFloatValue = _mm256_cvtepi32_ps(qIntVal);
++ qFloatValue = _mm256_mul_ps(qFloatValue, invScalar);
++ _mm256_storeu_ps(qBufferPtr, qFloatValue);
++ qBufferPtr += 8;
++ }
++
++ number = sixteenthPoints * 16;
++ for (; number < num_points; number++) {
++ *iBufferPtr++ = (float)(*complexVectorPtr++) * iScalar;
++ *qBufferPtr++ = (float)(*complexVectorPtr++) * iScalar;
++ }
+ }
+ #endif /* LV_HAVE_AVX2 */
+
+diff --git a/kernels/volk/volk_8ic_s32f_deinterleave_real_32f.h b/kernels/volk/volk_8ic_s32f_deinterleave_real_32f.h
+index 0c85ee9..4c1afe7 100644
+--- a/kernels/volk/volk_8ic_s32f_deinterleave_real_32f.h
++++ b/kernels/volk/volk_8ic_s32f_deinterleave_real_32f.h
+@@ -31,8 +31,8 @@
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+- * void volk_8ic_s32f_deinterleave_real_32f(float* iBuffer, const lv_8sc_t* complexVector, const float scalar, unsigned int num_points)
+- * \endcode
++ * void volk_8ic_s32f_deinterleave_real_32f(float* iBuffer, const lv_8sc_t* complexVector,
++ * const float scalar, unsigned int num_points) \endcode
+ *
+ * \b Inputs
+ * \li complexVector: The complex input vector.
+@@ -55,57 +55,86 @@
+ #ifndef INCLUDED_volk_8ic_s32f_deinterleave_real_32f_a_H
+ #define INCLUDED_volk_8ic_s32f_deinterleave_real_32f_a_H
+
+-#include <volk/volk_common.h>
+ #include <inttypes.h>
+ #include <stdio.h>
++#include <volk/volk_common.h>
+
+ #ifdef LV_HAVE_AVX2
+ #include <immintrin.h>
+
+ static inline void
+-volk_8ic_s32f_deinterleave_real_32f_a_avx2(float* iBuffer, const lv_8sc_t* complexVector,
+- const float scalar, unsigned int num_points)
++volk_8ic_s32f_deinterleave_real_32f_a_avx2(float* iBuffer,
++ const lv_8sc_t* complexVector,
++ const float scalar,
++ unsigned int num_points)
+ {
+- float* iBufferPtr = iBuffer;
+-
+- unsigned int number = 0;
+- const unsigned int sixteenthPoints = num_points / 16;
+- __m256 iFloatValue;
+-
+- const float iScalar= 1.0 / scalar;
+- __m256 invScalar = _mm256_set1_ps(iScalar);
+- __m256i complexVal, iIntVal;
+- int8_t* complexVectorPtr = (int8_t*)complexVector;
+-
+- __m256i moveMask = _mm256_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+- 14, 12, 10, 8, 6, 4, 2, 0,
+- 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+- 14, 12, 10, 8, 6, 4, 2, 0);
+- for(;number < sixteenthPoints; number++){
+- complexVal = _mm256_load_si256((__m256i*)complexVectorPtr);
+- complexVectorPtr += 32;
+- complexVal = _mm256_shuffle_epi8(complexVal, moveMask);
+-
+- iIntVal = _mm256_cvtepi8_epi32(_mm256_castsi256_si128(complexVal));
+- iFloatValue = _mm256_cvtepi32_ps(iIntVal);
+- iFloatValue = _mm256_mul_ps(iFloatValue, invScalar);
+- _mm256_store_ps(iBufferPtr, iFloatValue);
+- iBufferPtr += 8;
+-
+- complexVal = _mm256_permute4x64_epi64(complexVal, 0b11000110);
+- iIntVal = _mm256_cvtepi8_epi32(_mm256_castsi256_si128(complexVal));
+- iFloatValue = _mm256_cvtepi32_ps(iIntVal);
+- iFloatValue = _mm256_mul_ps(iFloatValue, invScalar);
+- _mm256_store_ps(iBufferPtr, iFloatValue);
+- iBufferPtr += 8;
+- }
+-
+- number = sixteenthPoints * 16;
+- for(; number < num_points; number++){
+- *iBufferPtr++ = (float)(*complexVectorPtr++) * iScalar;
+- complexVectorPtr++;
+- }
+-
++ float* iBufferPtr = iBuffer;
++
++ unsigned int number = 0;
++ const unsigned int sixteenthPoints = num_points / 16;
++ __m256 iFloatValue;
++
++ const float iScalar = 1.0 / scalar;
++ __m256 invScalar = _mm256_set1_ps(iScalar);
++ __m256i complexVal, iIntVal;
++ int8_t* complexVectorPtr = (int8_t*)complexVector;
++
++ __m256i moveMask = _mm256_set_epi8(0x80,
++ 0x80,
++ 0x80,
++ 0x80,
++ 0x80,
++ 0x80,
++ 0x80,
++ 0x80,
++ 14,
++ 12,
++ 10,
++ 8,
++ 6,
++ 4,
++ 2,
++ 0,
++ 0x80,
++ 0x80,
++ 0x80,
++ 0x80,
++ 0x80,
++ 0x80,
++ 0x80,
++ 0x80,
++ 14,
++ 12,
++ 10,
++ 8,
++ 6,
++ 4,
++ 2,
++ 0);
++ for (; number < sixteenthPoints; number++) {
++ complexVal = _mm256_load_si256((__m256i*)complexVectorPtr);
++ complexVectorPtr += 32;
++ complexVal = _mm256_shuffle_epi8(complexVal, moveMask);
++
++ iIntVal = _mm256_cvtepi8_epi32(_mm256_castsi256_si128(complexVal));
++ iFloatValue = _mm256_cvtepi32_ps(iIntVal);
++ iFloatValue = _mm256_mul_ps(iFloatValue, invScalar);
++ _mm256_store_ps(iBufferPtr, iFloatValue);
++ iBufferPtr += 8;
++
++ complexVal = _mm256_permute4x64_epi64(complexVal, 0b11000110);
++ iIntVal = _mm256_cvtepi8_epi32(_mm256_castsi256_si128(complexVal));
++ iFloatValue = _mm256_cvtepi32_ps(iIntVal);
++ iFloatValue = _mm256_mul_ps(iFloatValue, invScalar);
++ _mm256_store_ps(iBufferPtr, iFloatValue);
++ iBufferPtr += 8;
++ }
++
++ number = sixteenthPoints * 16;
++ for (; number < num_points; number++) {
++ *iBufferPtr++ = (float)(*complexVectorPtr++) * iScalar;
++ complexVectorPtr++;
++ }
+ }
+ #endif /* LV_HAVE_AVX2 */
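In the AVX2 kernel above, _mm256_cvtepi8_epi32 sign-extends only the low eight bytes of the 128-bit value it is handed, so the first conversion covers the packed reals from the lower lane; _mm256_permute4x64_epi64(v, 0b11000110) then swaps qwords 0 and 2 (selection {2, 1, 0, 3}) so the upper lane's eight reals land in the low half for the second conversion. The immediate can be decoded with the same rule as before (illustrative check, not part of this diff):

    #include <assert.h>

    /* Result qword i takes source qword (imm >> (2*i)) & 3; for 0xc6 the
       order is {2, 1, 0, 3}, i.e. qwords 0 and 2 swap. */
    static void check_permute_0xc6(void)
    {
        const unsigned imm = 0xc6; /* 0b11000110 */
        const unsigned expected[4] = { 2, 1, 0, 3 };
        for (int i = 0; i < 4; i++)
            assert(((imm >> (2 * i)) & 3u) == expected[i]);
    }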
+
+@@ -114,52 +143,55 @@ volk_8ic_s32f_deinterleave_real_32f_a_avx2(float* iBuffer, const lv_8sc_t* compl
+ #include <smmintrin.h>
+
+ static inline void
+-volk_8ic_s32f_deinterleave_real_32f_a_sse4_1(float* iBuffer, const lv_8sc_t* complexVector,
+- const float scalar, unsigned int num_points)
++volk_8ic_s32f_deinterleave_real_32f_a_sse4_1(float* iBuffer,
++ const lv_8sc_t* complexVector,
++ const float scalar,
++ unsigned int num_points)
+ {
+- float* iBufferPtr = iBuffer;
+-
+- unsigned int number = 0;
+- const unsigned int eighthPoints = num_points / 8;
+- __m128 iFloatValue;
++ float* iBufferPtr = iBuffer;
+
+- const float iScalar= 1.0 / scalar;
+- __m128 invScalar = _mm_set_ps1(iScalar);
+- __m128i complexVal, iIntVal;
+- int8_t* complexVectorPtr = (int8_t*)complexVector;
++ unsigned int number = 0;
++ const unsigned int eighthPoints = num_points / 8;
++ __m128 iFloatValue;
+
+- __m128i moveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0);
++ const float iScalar = 1.0 / scalar;
++ __m128 invScalar = _mm_set_ps1(iScalar);
++ __m128i complexVal, iIntVal;
++ int8_t* complexVectorPtr = (int8_t*)complexVector;
+
+- for(;number < eighthPoints; number++){
+- complexVal = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16;
+- complexVal = _mm_shuffle_epi8(complexVal, moveMask);
++ __m128i moveMask = _mm_set_epi8(
++ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0);
+
+- iIntVal = _mm_cvtepi8_epi32(complexVal);
+- iFloatValue = _mm_cvtepi32_ps(iIntVal);
++ for (; number < eighthPoints; number++) {
++ complexVal = _mm_load_si128((__m128i*)complexVectorPtr);
++ complexVectorPtr += 16;
++ complexVal = _mm_shuffle_epi8(complexVal, moveMask);
+
+- iFloatValue = _mm_mul_ps(iFloatValue, invScalar);
++ iIntVal = _mm_cvtepi8_epi32(complexVal);
++ iFloatValue = _mm_cvtepi32_ps(iIntVal);
+
+- _mm_store_ps(iBufferPtr, iFloatValue);
++ iFloatValue = _mm_mul_ps(iFloatValue, invScalar);
+
+- iBufferPtr += 4;
++ _mm_store_ps(iBufferPtr, iFloatValue);
+
+- complexVal = _mm_srli_si128(complexVal, 4);
+- iIntVal = _mm_cvtepi8_epi32(complexVal);
+- iFloatValue = _mm_cvtepi32_ps(iIntVal);
++ iBufferPtr += 4;
+
+- iFloatValue = _mm_mul_ps(iFloatValue, invScalar);
++ complexVal = _mm_srli_si128(complexVal, 4);
++ iIntVal = _mm_cvtepi8_epi32(complexVal);
++ iFloatValue = _mm_cvtepi32_ps(iIntVal);
+
+- _mm_store_ps(iBufferPtr, iFloatValue);
++ iFloatValue = _mm_mul_ps(iFloatValue, invScalar);
+
+- iBufferPtr += 4;
+- }
++ _mm_store_ps(iBufferPtr, iFloatValue);
+
+- number = eighthPoints * 8;
+- for(; number < num_points; number++){
+- *iBufferPtr++ = (float)(*complexVectorPtr++) * iScalar;
+- complexVectorPtr++;
+- }
++ iBufferPtr += 4;
++ }
+
++ number = eighthPoints * 8;
++ for (; number < num_points; number++) {
++ *iBufferPtr++ = (float)(*complexVectorPtr++) * iScalar;
++ complexVectorPtr++;
++ }
+ }
+ #endif /* LV_HAVE_SSE4_1 */
+
+@@ -168,42 +200,47 @@ volk_8ic_s32f_deinterleave_real_32f_a_sse4_1(float* iBuffer, const lv_8sc_t* com
+ #include <xmmintrin.h>
+
+ static inline void
+-volk_8ic_s32f_deinterleave_real_32f_a_sse(float* iBuffer, const lv_8sc_t* complexVector,
+- const float scalar, unsigned int num_points)
++volk_8ic_s32f_deinterleave_real_32f_a_sse(float* iBuffer,
++ const lv_8sc_t* complexVector,
++ const float scalar,
++ unsigned int num_points)
+ {
+- float* iBufferPtr = iBuffer;
+-
+- unsigned int number = 0;
+- const unsigned int quarterPoints = num_points / 4;
+- __m128 iValue;
++ float* iBufferPtr = iBuffer;
+
+- const float iScalar= 1.0 / scalar;
+- __m128 invScalar = _mm_set_ps1(iScalar);
+- int8_t* complexVectorPtr = (int8_t*)complexVector;
++ unsigned int number = 0;
++ const unsigned int quarterPoints = num_points / 4;
++ __m128 iValue;
+
+- __VOLK_ATTR_ALIGNED(16) float floatBuffer[4];
++ const float iScalar = 1.0 / scalar;
++ __m128 invScalar = _mm_set_ps1(iScalar);
++ int8_t* complexVectorPtr = (int8_t*)complexVector;
+
+- for(;number < quarterPoints; number++){
+- floatBuffer[0] = (float)(*complexVectorPtr); complexVectorPtr += 2;
+- floatBuffer[1] = (float)(*complexVectorPtr); complexVectorPtr += 2;
+- floatBuffer[2] = (float)(*complexVectorPtr); complexVectorPtr += 2;
+- floatBuffer[3] = (float)(*complexVectorPtr); complexVectorPtr += 2;
++ __VOLK_ATTR_ALIGNED(16) float floatBuffer[4];
+
+- iValue = _mm_load_ps(floatBuffer);
++ for (; number < quarterPoints; number++) {
++ floatBuffer[0] = (float)(*complexVectorPtr);
++ complexVectorPtr += 2;
++ floatBuffer[1] = (float)(*complexVectorPtr);
++ complexVectorPtr += 2;
++ floatBuffer[2] = (float)(*complexVectorPtr);
++ complexVectorPtr += 2;
++ floatBuffer[3] = (float)(*complexVectorPtr);
++ complexVectorPtr += 2;
+
+- iValue = _mm_mul_ps(iValue, invScalar);
++ iValue = _mm_load_ps(floatBuffer);
+
+- _mm_store_ps(iBufferPtr, iValue);
++ iValue = _mm_mul_ps(iValue, invScalar);
+
+- iBufferPtr += 4;
+- }
++ _mm_store_ps(iBufferPtr, iValue);
+
+- number = quarterPoints * 4;
+- for(; number < num_points; number++){
+- *iBufferPtr++ = (float)(*complexVectorPtr++) * iScalar;
+- complexVectorPtr++;
+- }
++ iBufferPtr += 4;
++ }
+
++ number = quarterPoints * 4;
++ for (; number < num_points; number++) {
++ *iBufferPtr++ = (float)(*complexVectorPtr++) * iScalar;
++ complexVectorPtr++;
++ }
+ }
+ #endif /* LV_HAVE_SSE */
+
+@@ -211,83 +248,117 @@ volk_8ic_s32f_deinterleave_real_32f_a_sse(float* iBuffer, const lv_8sc_t* comple
+ #ifdef LV_HAVE_GENERIC
+
+ static inline void
+-volk_8ic_s32f_deinterleave_real_32f_generic(float* iBuffer, const lv_8sc_t* complexVector,
+- const float scalar, unsigned int num_points)
++volk_8ic_s32f_deinterleave_real_32f_generic(float* iBuffer,
++ const lv_8sc_t* complexVector,
++ const float scalar,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+- const int8_t* complexVectorPtr = (const int8_t*)complexVector;
+- float* iBufferPtr = iBuffer;
+- const float invScalar = 1.0 / scalar;
+- for(number = 0; number < num_points; number++){
+- *iBufferPtr++ = ((float)(*complexVectorPtr++)) * invScalar;
+- complexVectorPtr++;
+- }
++ unsigned int number = 0;
++ const int8_t* complexVectorPtr = (const int8_t*)complexVector;
++ float* iBufferPtr = iBuffer;
++ const float invScalar = 1.0 / scalar;
++ for (number = 0; number < num_points; number++) {
++ *iBufferPtr++ = ((float)(*complexVectorPtr++)) * invScalar;
++ complexVectorPtr++;
++ }
+ }
+ #endif /* LV_HAVE_GENERIC */
+
+
+-
+ #endif /* INCLUDED_volk_8ic_s32f_deinterleave_real_32f_a_H */
+
+ #ifndef INCLUDED_volk_8ic_s32f_deinterleave_real_32f_u_H
+ #define INCLUDED_volk_8ic_s32f_deinterleave_real_32f_u_H
+
+-#include <volk/volk_common.h>
+ #include <inttypes.h>
+ #include <stdio.h>
++#include <volk/volk_common.h>
+
+ #ifdef LV_HAVE_AVX2
+ #include <immintrin.h>
+
+ static inline void
+-volk_8ic_s32f_deinterleave_real_32f_u_avx2(float* iBuffer, const lv_8sc_t* complexVector,
+- const float scalar, unsigned int num_points)
++volk_8ic_s32f_deinterleave_real_32f_u_avx2(float* iBuffer,
++ const lv_8sc_t* complexVector,
++ const float scalar,
++ unsigned int num_points)
+ {
+- float* iBufferPtr = iBuffer;
+-
+- unsigned int number = 0;
+- const unsigned int sixteenthPoints = num_points / 16;
+- __m256 iFloatValue;
+-
+- const float iScalar= 1.0 / scalar;
+- __m256 invScalar = _mm256_set1_ps(iScalar);
+- __m256i complexVal, iIntVal;
+- __m128i hcomplexVal;
+- int8_t* complexVectorPtr = (int8_t*)complexVector;
+-
+- __m256i moveMask = _mm256_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0);
+-
+- for(;number < sixteenthPoints; number++){
+- complexVal = _mm256_loadu_si256((__m256i*)complexVectorPtr); complexVectorPtr += 32;
+- complexVal = _mm256_shuffle_epi8(complexVal, moveMask);
+-
+- hcomplexVal = _mm256_extracti128_si256(complexVal,0);
+- iIntVal = _mm256_cvtepi8_epi32(hcomplexVal);
+- iFloatValue = _mm256_cvtepi32_ps(iIntVal);
+-
+- iFloatValue = _mm256_mul_ps(iFloatValue, invScalar);
+-
+- _mm256_storeu_ps(iBufferPtr, iFloatValue);
+-
+- iBufferPtr += 8;
+-
+- hcomplexVal = _mm256_extracti128_si256(complexVal,1);
+- iIntVal = _mm256_cvtepi8_epi32(hcomplexVal);
+- iFloatValue = _mm256_cvtepi32_ps(iIntVal);
+-
+- iFloatValue = _mm256_mul_ps(iFloatValue, invScalar);
+-
+- _mm256_storeu_ps(iBufferPtr, iFloatValue);
+-
+- iBufferPtr += 8;
+- }
+-
+- number = sixteenthPoints * 16;
+- for(; number < num_points; number++){
+- *iBufferPtr++ = (float)(*complexVectorPtr++) * iScalar;
+- complexVectorPtr++;
+- }
+-
++ float* iBufferPtr = iBuffer;
++
++ unsigned int number = 0;
++ const unsigned int sixteenthPoints = num_points / 16;
++ __m256 iFloatValue;
++
++ const float iScalar = 1.0 / scalar;
++ __m256 invScalar = _mm256_set1_ps(iScalar);
++ __m256i complexVal, iIntVal;
++ __m128i hcomplexVal;
++ int8_t* complexVectorPtr = (int8_t*)complexVector;
++
++ __m256i moveMask = _mm256_set_epi8(0x80,
++ 0x80,
++ 0x80,
++ 0x80,
++ 0x80,
++ 0x80,
++ 0x80,
++ 0x80,
++ 14,
++ 12,
++ 10,
++ 8,
++ 6,
++ 4,
++ 2,
++ 0,
++ 0x80,
++ 0x80,
++ 0x80,
++ 0x80,
++ 0x80,
++ 0x80,
++ 0x80,
++ 0x80,
++ 14,
++ 12,
++ 10,
++ 8,
++ 6,
++ 4,
++ 2,
++ 0);
++
++ for (; number < sixteenthPoints; number++) {
++ complexVal = _mm256_loadu_si256((__m256i*)complexVectorPtr);
++ complexVectorPtr += 32;
++ complexVal = _mm256_shuffle_epi8(complexVal, moveMask);
++
++ hcomplexVal = _mm256_extracti128_si256(complexVal, 0);
++ iIntVal = _mm256_cvtepi8_epi32(hcomplexVal);
++ iFloatValue = _mm256_cvtepi32_ps(iIntVal);
++
++ iFloatValue = _mm256_mul_ps(iFloatValue, invScalar);
++
++ _mm256_storeu_ps(iBufferPtr, iFloatValue);
++
++ iBufferPtr += 8;
++
++ hcomplexVal = _mm256_extracti128_si256(complexVal, 1);
++ iIntVal = _mm256_cvtepi8_epi32(hcomplexVal);
++ iFloatValue = _mm256_cvtepi32_ps(iIntVal);
++
++ iFloatValue = _mm256_mul_ps(iFloatValue, invScalar);
++
++ _mm256_storeu_ps(iBufferPtr, iFloatValue);
++
++ iBufferPtr += 8;
++ }
++
++ number = sixteenthPoints * 16;
++ for (; number < num_points; number++) {
++ *iBufferPtr++ = (float)(*complexVectorPtr++) * iScalar;
++ complexVectorPtr++;
++ }
+ }
+ #endif /* LV_HAVE_AVX2 */
+
+diff --git a/kernels/volk/volk_8ic_x2_multiply_conjugate_16ic.h b/kernels/volk/volk_8ic_x2_multiply_conjugate_16ic.h
+index 6762658..7f9fd96 100644
+--- a/kernels/volk/volk_8ic_x2_multiply_conjugate_16ic.h
++++ b/kernels/volk/volk_8ic_x2_multiply_conjugate_16ic.h
+@@ -30,64 +30,73 @@
+ #ifdef LV_HAVE_AVX2
+ #include <immintrin.h>
+ /*!
+- \brief Multiplys the one complex vector with the complex conjugate of the second complex vector and stores their results in the third vector
+- \param cVector The complex vector where the results will be stored
+- \param aVector One of the complex vectors to be multiplied
+- \param bVector The complex vector which will be converted to complex conjugate and multiplied
+- \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
++ \brief Multiplys the one complex vector with the complex conjugate of the second complex
++ vector and stores their results in the third vector \param cVector The complex vector
++ where the results will be stored \param aVector One of the complex vectors to be
++ multiplied \param bVector The complex vector which will be converted to complex
++ conjugate and multiplied \param num_points The number of complex values in aVector and
++ bVector to be multiplied together and stored into cVector
+ */
+-static inline void volk_8ic_x2_multiply_conjugate_16ic_a_avx2(lv_16sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, unsigned int num_points){
+- unsigned int number = 0;
+- const unsigned int quarterPoints = num_points / 8;
+-
+- __m256i x, y, realz, imagz;
+- lv_16sc_t* c = cVector;
+- const lv_8sc_t* a = aVector;
+- const lv_8sc_t* b = bVector;
+- __m256i conjugateSign = _mm256_set_epi16(-1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1);
+-
+- for(;number < quarterPoints; number++){
+- // Convert 8 bit values into 16 bit values
+- x = _mm256_cvtepi8_epi16(_mm_load_si128((__m128i*)a));
+- y = _mm256_cvtepi8_epi16(_mm_load_si128((__m128i*)b));
+-
+- // Calculate the ar*cr - ai*(-ci) portions
+- realz = _mm256_madd_epi16(x,y);
+-
+- // Calculate the complex conjugate of the cr + ci j values
+- y = _mm256_sign_epi16(y, conjugateSign);
+-
+- // Shift the order of the cr and ci values
+- y = _mm256_shufflehi_epi16(_mm256_shufflelo_epi16(y, _MM_SHUFFLE(2,3,0,1) ), _MM_SHUFFLE(2,3,0,1));
+-
+- // Calculate the ar*(-ci) + cr*(ai)
+- imagz = _mm256_madd_epi16(x,y);
+-
+- // Perform the addition of products
+-
+- _mm256_store_si256((__m256i*)c, _mm256_packs_epi32(_mm256_unpacklo_epi32(realz, imagz), _mm256_unpackhi_epi32(realz, imagz)));
+-
+- a += 8;
+- b += 8;
+- c += 8;
+- }
+-
+- number = quarterPoints * 8;
+- int16_t* c16Ptr = (int16_t*)&cVector[number];
+- int8_t* a8Ptr = (int8_t*)&aVector[number];
+- int8_t* b8Ptr = (int8_t*)&bVector[number];
+- for(; number < num_points; number++){
+- float aReal = (float)*a8Ptr++;
+- float aImag = (float)*a8Ptr++;
+- lv_32fc_t aVal = lv_cmake(aReal, aImag );
+- float bReal = (float)*b8Ptr++;
+- float bImag = (float)*b8Ptr++;
+- lv_32fc_t bVal = lv_cmake( bReal, -bImag );
+- lv_32fc_t temp = aVal * bVal;
+-
+- *c16Ptr++ = (int16_t)lv_creal(temp);
+- *c16Ptr++ = (int16_t)lv_cimag(temp);
+- }
++static inline void volk_8ic_x2_multiply_conjugate_16ic_a_avx2(lv_16sc_t* cVector,
++ const lv_8sc_t* aVector,
++ const lv_8sc_t* bVector,
++ unsigned int num_points)
++{
++ unsigned int number = 0;
++ const unsigned int quarterPoints = num_points / 8;
++
++ __m256i x, y, realz, imagz;
++ lv_16sc_t* c = cVector;
++ const lv_8sc_t* a = aVector;
++ const lv_8sc_t* b = bVector;
++ __m256i conjugateSign =
++ _mm256_set_epi16(-1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1);
++
++ for (; number < quarterPoints; number++) {
++ // Convert 8 bit values into 16 bit values
++ x = _mm256_cvtepi8_epi16(_mm_load_si128((__m128i*)a));
++ y = _mm256_cvtepi8_epi16(_mm_load_si128((__m128i*)b));
++
++ // Calculate the ar*cr - ai*(-ci) portions
++ realz = _mm256_madd_epi16(x, y);
++
++ // Calculate the complex conjugate of the cr + ci j values
++ y = _mm256_sign_epi16(y, conjugateSign);
++
++ // Shift the order of the cr and ci values
++ y = _mm256_shufflehi_epi16(_mm256_shufflelo_epi16(y, _MM_SHUFFLE(2, 3, 0, 1)),
++ _MM_SHUFFLE(2, 3, 0, 1));
++
++ // Calculate the ar*(-ci) + cr*(ai)
++ imagz = _mm256_madd_epi16(x, y);
++
++ // Perform the addition of products
++
++ _mm256_store_si256((__m256i*)c,
++ _mm256_packs_epi32(_mm256_unpacklo_epi32(realz, imagz),
++ _mm256_unpackhi_epi32(realz, imagz)));
++
++ a += 8;
++ b += 8;
++ c += 8;
++ }
++
++ number = quarterPoints * 8;
++ int16_t* c16Ptr = (int16_t*)&cVector[number];
++ int8_t* a8Ptr = (int8_t*)&aVector[number];
++ int8_t* b8Ptr = (int8_t*)&bVector[number];
++ for (; number < num_points; number++) {
++ float aReal = (float)*a8Ptr++;
++ float aImag = (float)*a8Ptr++;
++ lv_32fc_t aVal = lv_cmake(aReal, aImag);
++ float bReal = (float)*b8Ptr++;
++ float bImag = (float)*b8Ptr++;
++ lv_32fc_t bVal = lv_cmake(bReal, -bImag);
++ lv_32fc_t temp = aVal * bVal;
++
++ *c16Ptr++ = (int16_t)lv_creal(temp);
++ *c16Ptr++ = (int16_t)lv_cimag(temp);
++ }
+ }
+ #endif /* LV_HAVE_AVX2 */
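The fixed-point trick in this kernel follows from (ar + j*ai)(br - j*bi) = (ar*br + ai*bi) + j*(ai*br - ar*bi): the first _mm256_madd_epi16 sums adjacent products to give the real part, _mm256_sign_epi16 negates the imaginary components of b, the lo/hi shuffles swap each (re, im) pair, and the second madd gives the imaginary part. A scalar model of one sample (illustrative only, not part of this diff):

    #include <stdint.h>

    /* Scalar model of one complex sample in the SSE4.1/AVX2 conjugate
       multiply; _mm256_packs_epi32 saturates the 32-bit sums to int16 on
       the way out, which matters only at full-scale inputs. */
    static inline void conj_mul_16ic_model(int8_t ar, int8_t ai, int8_t br, int8_t bi,
                                           int16_t* out_re, int16_t* out_im)
    {
        /* first madd over (ar, ai) x (br, bi): ar*br + ai*bi -> real part */
        int32_t re = (int32_t)ar * br + (int32_t)ai * bi;
        /* sign + pair swap turn (br, bi) into (-bi, br); the second madd
           gives ar*(-bi) + ai*br = ai*br - ar*bi -> imaginary part */
        int32_t im = (int32_t)ai * br - (int32_t)ar * bi;
        *out_re = (int16_t)(re > 32767 ? 32767 : (re < -32768 ? -32768 : re));
        *out_im = (int16_t)(im > 32767 ? 32767 : (im < -32768 ? -32768 : im));
    }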
+
+@@ -95,90 +104,103 @@ static inline void volk_8ic_x2_multiply_conjugate_16ic_a_avx2(lv_16sc_t* cVector
+ #ifdef LV_HAVE_SSE4_1
+ #include <smmintrin.h>
+ /*!
+- \brief Multiplys the one complex vector with the complex conjugate of the second complex vector and stores their results in the third vector
+- \param cVector The complex vector where the results will be stored
+- \param aVector One of the complex vectors to be multiplied
+- \param bVector The complex vector which will be converted to complex conjugate and multiplied
+- \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
++ \brief Multiplies one complex vector by the complex conjugate of a second complex
++ vector and stores the results in a third vector
++ \param cVector The complex vector where the results will be stored
++ \param aVector One of the complex vectors to be multiplied
++ \param bVector The complex vector which will be conjugated and multiplied
++ \param num_points The number of complex values in aVector and bVector to be
++ multiplied together and stored into cVector
+ */
+-static inline void volk_8ic_x2_multiply_conjugate_16ic_a_sse4_1(lv_16sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, unsigned int num_points){
+- unsigned int number = 0;
+- const unsigned int quarterPoints = num_points / 4;
+-
+- __m128i x, y, realz, imagz;
+- lv_16sc_t* c = cVector;
+- const lv_8sc_t* a = aVector;
+- const lv_8sc_t* b = bVector;
+- __m128i conjugateSign = _mm_set_epi16(-1, 1, -1, 1, -1, 1, -1, 1);
+-
+- for(;number < quarterPoints; number++){
+- // Convert into 8 bit values into 16 bit values
+- x = _mm_cvtepi8_epi16(_mm_loadl_epi64((__m128i*)a));
+- y = _mm_cvtepi8_epi16(_mm_loadl_epi64((__m128i*)b));
+-
+- // Calculate the ar*cr - ai*(-ci) portions
+- realz = _mm_madd_epi16(x,y);
+-
+- // Calculate the complex conjugate of the cr + ci j values
+- y = _mm_sign_epi16(y, conjugateSign);
+-
+- // Shift the order of the cr and ci values
+- y = _mm_shufflehi_epi16(_mm_shufflelo_epi16(y, _MM_SHUFFLE(2,3,0,1) ), _MM_SHUFFLE(2,3,0,1));
+-
+- // Calculate the ar*(-ci) + cr*(ai)
+- imagz = _mm_madd_epi16(x,y);
+-
+- _mm_store_si128((__m128i*)c, _mm_packs_epi32(_mm_unpacklo_epi32(realz, imagz), _mm_unpackhi_epi32(realz, imagz)));
+-
+- a += 4;
+- b += 4;
+- c += 4;
+- }
+-
+- number = quarterPoints * 4;
+- int16_t* c16Ptr = (int16_t*)&cVector[number];
+- int8_t* a8Ptr = (int8_t*)&aVector[number];
+- int8_t* b8Ptr = (int8_t*)&bVector[number];
+- for(; number < num_points; number++){
+- float aReal = (float)*a8Ptr++;
+- float aImag = (float)*a8Ptr++;
+- lv_32fc_t aVal = lv_cmake(aReal, aImag );
+- float bReal = (float)*b8Ptr++;
+- float bImag = (float)*b8Ptr++;
+- lv_32fc_t bVal = lv_cmake( bReal, -bImag );
+- lv_32fc_t temp = aVal * bVal;
+-
+- *c16Ptr++ = (int16_t)lv_creal(temp);
+- *c16Ptr++ = (int16_t)lv_cimag(temp);
+- }
++static inline void volk_8ic_x2_multiply_conjugate_16ic_a_sse4_1(lv_16sc_t* cVector,
++ const lv_8sc_t* aVector,
++ const lv_8sc_t* bVector,
++ unsigned int num_points)
++{
++ unsigned int number = 0;
++ const unsigned int quarterPoints = num_points / 4;
++
++ __m128i x, y, realz, imagz;
++ lv_16sc_t* c = cVector;
++ const lv_8sc_t* a = aVector;
++ const lv_8sc_t* b = bVector;
++ __m128i conjugateSign = _mm_set_epi16(-1, 1, -1, 1, -1, 1, -1, 1);
++
++ for (; number < quarterPoints; number++) {
++        // Convert 8 bit values into 16 bit values
++ x = _mm_cvtepi8_epi16(_mm_loadl_epi64((__m128i*)a));
++ y = _mm_cvtepi8_epi16(_mm_loadl_epi64((__m128i*)b));
++
++ // Calculate the ar*cr - ai*(-ci) portions
++ realz = _mm_madd_epi16(x, y);
++
++ // Calculate the complex conjugate of the cr + ci j values
++ y = _mm_sign_epi16(y, conjugateSign);
++
++ // Shift the order of the cr and ci values
++ y = _mm_shufflehi_epi16(_mm_shufflelo_epi16(y, _MM_SHUFFLE(2, 3, 0, 1)),
++ _MM_SHUFFLE(2, 3, 0, 1));
++
++ // Calculate the ar*(-ci) + cr*(ai)
++ imagz = _mm_madd_epi16(x, y);
++
++ _mm_store_si128((__m128i*)c,
++ _mm_packs_epi32(_mm_unpacklo_epi32(realz, imagz),
++ _mm_unpackhi_epi32(realz, imagz)));
++
++ a += 4;
++ b += 4;
++ c += 4;
++ }
++
++ number = quarterPoints * 4;
++ int16_t* c16Ptr = (int16_t*)&cVector[number];
++ int8_t* a8Ptr = (int8_t*)&aVector[number];
++ int8_t* b8Ptr = (int8_t*)&bVector[number];
++ for (; number < num_points; number++) {
++ float aReal = (float)*a8Ptr++;
++ float aImag = (float)*a8Ptr++;
++ lv_32fc_t aVal = lv_cmake(aReal, aImag);
++ float bReal = (float)*b8Ptr++;
++ float bImag = (float)*b8Ptr++;
++ lv_32fc_t bVal = lv_cmake(bReal, -bImag);
++ lv_32fc_t temp = aVal * bVal;
++
++ *c16Ptr++ = (int16_t)lv_creal(temp);
++ *c16Ptr++ = (int16_t)lv_cimag(temp);
++ }
+ }
+ #endif /* LV_HAVE_SSE4_1 */
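For reference, the _MM_SHUFFLE(2, 3, 0, 1) control used in the shufflelo/shufflehi pair simply swaps adjacent 16-bit words, i.e. it exchanges the real and imaginary slots of every complex sample. A standalone sketch of that effect (illustrative, not from the patch; needs only SSE2):

#include <emmintrin.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
    int16_t in[8] = { 10, 11, 20, 21, 30, 31, 40, 41 }; /* re, im pairs */
    int16_t out[8];
    __m128i v = _mm_loadu_si128((const __m128i*)in);
    v = _mm_shufflehi_epi16(_mm_shufflelo_epi16(v, _MM_SHUFFLE(2, 3, 0, 1)),
                            _MM_SHUFFLE(2, 3, 0, 1));
    _mm_storeu_si128((__m128i*)out, v);
    /* prints: 11 10 21 20 31 30 41 40 */
    for (int i = 0; i < 8; i++)
        printf("%d ", out[i]);
    printf("\n");
    return 0;
}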
+
+ #ifdef LV_HAVE_GENERIC
+ /*!
+- \brief Multiplys the one complex vector with the complex conjugate of the second complex vector and stores their results in the third vector
+- \param cVector The complex vector where the results will be stored
+- \param aVector One of the complex vectors to be multiplied
+- \param bVector The complex vector which will be converted to complex conjugate and multiplied
+- \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
++ \brief Multiplies one complex vector by the complex conjugate of a second complex
++ vector and stores the results in a third vector
++ \param cVector The complex vector where the results will be stored
++ \param aVector One of the complex vectors to be multiplied
++ \param bVector The complex vector which will be conjugated and multiplied
++ \param num_points The number of complex values in aVector and bVector to be
++ multiplied together and stored into cVector
+ */
+-static inline void volk_8ic_x2_multiply_conjugate_16ic_generic(lv_16sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, unsigned int num_points){
+- unsigned int number = 0;
+- int16_t* c16Ptr = (int16_t*)cVector;
+- int8_t* a8Ptr = (int8_t*)aVector;
+- int8_t* b8Ptr = (int8_t*)bVector;
+- for(number =0; number < num_points; number++){
+- float aReal = (float)*a8Ptr++;
+- float aImag = (float)*a8Ptr++;
+- lv_32fc_t aVal = lv_cmake(aReal, aImag );
+- float bReal = (float)*b8Ptr++;
+- float bImag = (float)*b8Ptr++;
+- lv_32fc_t bVal = lv_cmake( bReal, -bImag );
+- lv_32fc_t temp = aVal * bVal;
+-
+- *c16Ptr++ = (int16_t)lv_creal(temp);
+- *c16Ptr++ = (int16_t)lv_cimag(temp);
+- }
++static inline void volk_8ic_x2_multiply_conjugate_16ic_generic(lv_16sc_t* cVector,
++ const lv_8sc_t* aVector,
++ const lv_8sc_t* bVector,
++ unsigned int num_points)
++{
++ unsigned int number = 0;
++ int16_t* c16Ptr = (int16_t*)cVector;
++ int8_t* a8Ptr = (int8_t*)aVector;
++ int8_t* b8Ptr = (int8_t*)bVector;
++ for (number = 0; number < num_points; number++) {
++ float aReal = (float)*a8Ptr++;
++ float aImag = (float)*a8Ptr++;
++ lv_32fc_t aVal = lv_cmake(aReal, aImag);
++ float bReal = (float)*b8Ptr++;
++ float bImag = (float)*b8Ptr++;
++ lv_32fc_t bVal = lv_cmake(bReal, -bImag);
++ lv_32fc_t temp = aVal * bVal;
++
++ *c16Ptr++ = (int16_t)lv_creal(temp);
++ *c16Ptr++ = (int16_t)lv_cimag(temp);
++ }
+ }
+ #endif /* LV_HAVE_GENERIC */
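A minimal usage sketch for this kernel family, assuming the usual VOLK dispatcher and aligned buffers from volk_malloc() (illustrative only; filling the input buffers is left to the caller):

#include <volk/volk.h>

static void example_conjugate_multiply(unsigned int num_points)
{
    lv_8sc_t* a = (lv_8sc_t*)volk_malloc(num_points * sizeof(lv_8sc_t), volk_get_alignment());
    lv_8sc_t* b = (lv_8sc_t*)volk_malloc(num_points * sizeof(lv_8sc_t), volk_get_alignment());
    lv_16sc_t* c = (lv_16sc_t*)volk_malloc(num_points * sizeof(lv_16sc_t), volk_get_alignment());

    /* ... fill a and b with interleaved 8-bit complex samples ... */

    volk_8ic_x2_multiply_conjugate_16ic(c, a, b, num_points);

    volk_free(a);
    volk_free(b);
    volk_free(c);
}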
+
+@@ -194,64 +216,73 @@ static inline void volk_8ic_x2_multiply_conjugate_16ic_generic(lv_16sc_t* cVecto
+ #ifdef LV_HAVE_AVX2
+ #include <immintrin.h>
+ /*!
+- \brief Multiplys the one complex vector with the complex conjugate of the second complex vector and stores their results in the third vector
+- \param cVector The complex vector where the results will be stored
+- \param aVector One of the complex vectors to be multiplied
+- \param bVector The complex vector which will be converted to complex conjugate and multiplied
+- \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
++ \brief Multiplies one complex vector by the complex conjugate of a second complex
++ vector and stores the results in a third vector
++ \param cVector The complex vector where the results will be stored
++ \param aVector One of the complex vectors to be multiplied
++ \param bVector The complex vector which will be conjugated and multiplied
++ \param num_points The number of complex values in aVector and bVector to be
++ multiplied together and stored into cVector
+ */
+-static inline void volk_8ic_x2_multiply_conjugate_16ic_u_avx2(lv_16sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, unsigned int num_points){
+- unsigned int number = 0;
+- const unsigned int oneEigthPoints = num_points / 8;
+-
+- __m256i x, y, realz, imagz;
+- lv_16sc_t* c = cVector;
+- const lv_8sc_t* a = aVector;
+- const lv_8sc_t* b = bVector;
+- __m256i conjugateSign = _mm256_set_epi16(-1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1);
+-
+- for(;number < oneEigthPoints; number++){
+- // Convert 8 bit values into 16 bit values
+- x = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)a));
+- y = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)b));
+-
+- // Calculate the ar*cr - ai*(-ci) portions
+- realz = _mm256_madd_epi16(x,y);
+-
+- // Calculate the complex conjugate of the cr + ci j values
+- y = _mm256_sign_epi16(y, conjugateSign);
+-
+- // Shift the order of the cr and ci values
+- y = _mm256_shufflehi_epi16(_mm256_shufflelo_epi16(y, _MM_SHUFFLE(2,3,0,1) ), _MM_SHUFFLE(2,3,0,1));
+-
+- // Calculate the ar*(-ci) + cr*(ai)
+- imagz = _mm256_madd_epi16(x,y);
+-
+- // Perform the addition of products
+-
+- _mm256_storeu_si256((__m256i*)c, _mm256_packs_epi32(_mm256_unpacklo_epi32(realz, imagz), _mm256_unpackhi_epi32(realz, imagz)));
+-
+- a += 8;
+- b += 8;
+- c += 8;
+- }
+-
+- number = oneEigthPoints * 8;
+- int16_t* c16Ptr = (int16_t*)&cVector[number];
+- int8_t* a8Ptr = (int8_t*)&aVector[number];
+- int8_t* b8Ptr = (int8_t*)&bVector[number];
+- for(; number < num_points; number++){
+- float aReal = (float)*a8Ptr++;
+- float aImag = (float)*a8Ptr++;
+- lv_32fc_t aVal = lv_cmake(aReal, aImag );
+- float bReal = (float)*b8Ptr++;
+- float bImag = (float)*b8Ptr++;
+- lv_32fc_t bVal = lv_cmake( bReal, -bImag );
+- lv_32fc_t temp = aVal * bVal;
+-
+- *c16Ptr++ = (int16_t)lv_creal(temp);
+- *c16Ptr++ = (int16_t)lv_cimag(temp);
+- }
++static inline void volk_8ic_x2_multiply_conjugate_16ic_u_avx2(lv_16sc_t* cVector,
++ const lv_8sc_t* aVector,
++ const lv_8sc_t* bVector,
++ unsigned int num_points)
++{
++ unsigned int number = 0;
++ const unsigned int oneEigthPoints = num_points / 8;
++
++ __m256i x, y, realz, imagz;
++ lv_16sc_t* c = cVector;
++ const lv_8sc_t* a = aVector;
++ const lv_8sc_t* b = bVector;
++ __m256i conjugateSign =
++ _mm256_set_epi16(-1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1);
++
++ for (; number < oneEigthPoints; number++) {
++ // Convert 8 bit values into 16 bit values
++ x = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)a));
++ y = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)b));
++
++ // Calculate the ar*cr - ai*(-ci) portions
++ realz = _mm256_madd_epi16(x, y);
++
++ // Calculate the complex conjugate of the cr + ci j values
++ y = _mm256_sign_epi16(y, conjugateSign);
++
++ // Shift the order of the cr and ci values
++ y = _mm256_shufflehi_epi16(_mm256_shufflelo_epi16(y, _MM_SHUFFLE(2, 3, 0, 1)),
++ _MM_SHUFFLE(2, 3, 0, 1));
++
++ // Calculate the ar*(-ci) + cr*(ai)
++ imagz = _mm256_madd_epi16(x, y);
++
++        // Interleave real and imaginary parts, pack to 16 bit with saturation, store
++
++ _mm256_storeu_si256((__m256i*)c,
++ _mm256_packs_epi32(_mm256_unpacklo_epi32(realz, imagz),
++ _mm256_unpackhi_epi32(realz, imagz)));
++
++ a += 8;
++ b += 8;
++ c += 8;
++ }
++
++ number = oneEigthPoints * 8;
++ int16_t* c16Ptr = (int16_t*)&cVector[number];
++ int8_t* a8Ptr = (int8_t*)&aVector[number];
++ int8_t* b8Ptr = (int8_t*)&bVector[number];
++ for (; number < num_points; number++) {
++ float aReal = (float)*a8Ptr++;
++ float aImag = (float)*a8Ptr++;
++ lv_32fc_t aVal = lv_cmake(aReal, aImag);
++ float bReal = (float)*b8Ptr++;
++ float bImag = (float)*b8Ptr++;
++ lv_32fc_t bVal = lv_cmake(bReal, -bImag);
++ lv_32fc_t temp = aVal * bVal;
++
++ *c16Ptr++ = (int16_t)lv_creal(temp);
++ *c16Ptr++ = (int16_t)lv_cimag(temp);
++ }
+ }
+ #endif /* LV_HAVE_AVX2 */
+
+diff --git a/kernels/volk/volk_8ic_x2_s32f_multiply_conjugate_32fc.h b/kernels/volk/volk_8ic_x2_s32f_multiply_conjugate_32fc.h
+index 82e40c8..db6bd7a 100644
+--- a/kernels/volk/volk_8ic_x2_s32f_multiply_conjugate_32fc.h
++++ b/kernels/volk/volk_8ic_x2_s32f_multiply_conjugate_32fc.h
+@@ -30,14 +30,15 @@
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+- * void volk_8ic_x2_s32f_multiply_conjugate_32fc(lv_32fc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, const float scalar, unsigned int num_points)
+- * \endcode
++ * void volk_8ic_x2_s32f_multiply_conjugate_32fc(lv_32fc_t* cVector, const lv_8sc_t* aVector,
++ *                                               const lv_8sc_t* bVector, const float scalar, unsigned int num_points)
++ * \endcode
+ *
+ * \b Inputs
+ * \li aVector: One of the complex vectors to be multiplied.
+- * \li bVector: The complex vector which will be converted to complex conjugate and multiplied.
+- * \li scalar: each output value is scaled by 1/scalar.
+- * \li num_points: The number of complex values in aVector and bVector to be multiplied together and stored into cVector.
++ * \li bVector: The complex vector which will be converted to complex conjugate and multiplied.
++ * \li scalar: Each output value is scaled by 1/scalar.
++ * \li num_points: The number of complex values in aVector and bVector to be
++ *     multiplied together and stored into cVector.
+ *
+ * \b Outputs
+ * \li cVector: The complex vector where the results will be stored.
+@@ -64,160 +65,167 @@
+ #include <immintrin.h>
+
+ static inline void
+-volk_8ic_x2_s32f_multiply_conjugate_32fc_a_avx2(lv_32fc_t* cVector, const lv_8sc_t* aVector,
+- const lv_8sc_t* bVector, const float scalar,
+- unsigned int num_points)
++volk_8ic_x2_s32f_multiply_conjugate_32fc_a_avx2(lv_32fc_t* cVector,
++ const lv_8sc_t* aVector,
++ const lv_8sc_t* bVector,
++ const float scalar,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+- const unsigned int oneEigthPoints = num_points / 8;
+-
+- __m256i x, y, realz, imagz;
+- __m256 ret, retlo, rethi;
+- lv_32fc_t* c = cVector;
+- const lv_8sc_t* a = aVector;
+- const lv_8sc_t* b = bVector;
+- __m256i conjugateSign = _mm256_set_epi16(-1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1);
+-
+- __m256 invScalar = _mm256_set1_ps(1.0/scalar);
+-
+- for(;number < oneEigthPoints; number++){
+- // Convert 8 bit values into 16 bit values
+- x = _mm256_cvtepi8_epi16(_mm_load_si128((__m128i*)a));
+- y = _mm256_cvtepi8_epi16(_mm_load_si128((__m128i*)b));
+-
+- // Calculate the ar*cr - ai*(-ci) portions
+- realz = _mm256_madd_epi16(x,y);
+-
+- // Calculate the complex conjugate of the cr + ci j values
+- y = _mm256_sign_epi16(y, conjugateSign);
+-
+- // Shift the order of the cr and ci values
+- y = _mm256_shufflehi_epi16(_mm256_shufflelo_epi16(y, _MM_SHUFFLE(2,3,0,1) ), _MM_SHUFFLE(2,3,0,1));
+-
+- // Calculate the ar*(-ci) + cr*(ai)
+- imagz = _mm256_madd_epi16(x,y);
+-
+- // Interleave real and imaginary and then convert to float values
+- retlo = _mm256_cvtepi32_ps(_mm256_unpacklo_epi32(realz, imagz));
+-
+- // Normalize the floating point values
+- retlo = _mm256_mul_ps(retlo, invScalar);
+-
+- // Interleave real and imaginary and then convert to float values
+- rethi = _mm256_cvtepi32_ps(_mm256_unpackhi_epi32(realz, imagz));
+-
+- // Normalize the floating point values
+- rethi = _mm256_mul_ps(rethi, invScalar);
+-
+- ret = _mm256_permute2f128_ps(retlo, rethi, 0b00100000);
+- _mm256_store_ps((float*)c, ret);
+- c += 4;
+-
+- ret = _mm256_permute2f128_ps(retlo, rethi, 0b00110001);
+- _mm256_store_ps((float*)c, ret);
+- c += 4;
+-
+- a += 8;
+- b += 8;
+- }
+-
+- number = oneEigthPoints * 8;
+- float* cFloatPtr = (float*)&cVector[number];
+- int8_t* a8Ptr = (int8_t*)&aVector[number];
+- int8_t* b8Ptr = (int8_t*)&bVector[number];
+- for(; number < num_points; number++){
+- float aReal = (float)*a8Ptr++;
+- float aImag = (float)*a8Ptr++;
+- lv_32fc_t aVal = lv_cmake(aReal, aImag );
+- float bReal = (float)*b8Ptr++;
+- float bImag = (float)*b8Ptr++;
+- lv_32fc_t bVal = lv_cmake( bReal, -bImag );
+- lv_32fc_t temp = aVal * bVal;
+-
+- *cFloatPtr++ = lv_creal(temp) / scalar;
+- *cFloatPtr++ = lv_cimag(temp) / scalar;
+- }
++ unsigned int number = 0;
++ const unsigned int oneEigthPoints = num_points / 8;
++
++ __m256i x, y, realz, imagz;
++ __m256 ret, retlo, rethi;
++ lv_32fc_t* c = cVector;
++ const lv_8sc_t* a = aVector;
++ const lv_8sc_t* b = bVector;
++ __m256i conjugateSign =
++ _mm256_set_epi16(-1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1);
++
++ __m256 invScalar = _mm256_set1_ps(1.0 / scalar);
++
++ for (; number < oneEigthPoints; number++) {
++ // Convert 8 bit values into 16 bit values
++ x = _mm256_cvtepi8_epi16(_mm_load_si128((__m128i*)a));
++ y = _mm256_cvtepi8_epi16(_mm_load_si128((__m128i*)b));
++
++ // Calculate the ar*cr - ai*(-ci) portions
++ realz = _mm256_madd_epi16(x, y);
++
++ // Calculate the complex conjugate of the cr + ci j values
++ y = _mm256_sign_epi16(y, conjugateSign);
++
++ // Shift the order of the cr and ci values
++ y = _mm256_shufflehi_epi16(_mm256_shufflelo_epi16(y, _MM_SHUFFLE(2, 3, 0, 1)),
++ _MM_SHUFFLE(2, 3, 0, 1));
++
++ // Calculate the ar*(-ci) + cr*(ai)
++ imagz = _mm256_madd_epi16(x, y);
++
++ // Interleave real and imaginary and then convert to float values
++ retlo = _mm256_cvtepi32_ps(_mm256_unpacklo_epi32(realz, imagz));
++
++ // Normalize the floating point values
++ retlo = _mm256_mul_ps(retlo, invScalar);
++
++ // Interleave real and imaginary and then convert to float values
++ rethi = _mm256_cvtepi32_ps(_mm256_unpackhi_epi32(realz, imagz));
++
++ // Normalize the floating point values
++ rethi = _mm256_mul_ps(rethi, invScalar);
++
++ ret = _mm256_permute2f128_ps(retlo, rethi, 0b00100000);
++ _mm256_store_ps((float*)c, ret);
++ c += 4;
++
++ ret = _mm256_permute2f128_ps(retlo, rethi, 0b00110001);
++ _mm256_store_ps((float*)c, ret);
++ c += 4;
++
++ a += 8;
++ b += 8;
++ }
++
++ number = oneEigthPoints * 8;
++ float* cFloatPtr = (float*)&cVector[number];
++ int8_t* a8Ptr = (int8_t*)&aVector[number];
++ int8_t* b8Ptr = (int8_t*)&bVector[number];
++ for (; number < num_points; number++) {
++ float aReal = (float)*a8Ptr++;
++ float aImag = (float)*a8Ptr++;
++ lv_32fc_t aVal = lv_cmake(aReal, aImag);
++ float bReal = (float)*b8Ptr++;
++ float bImag = (float)*b8Ptr++;
++ lv_32fc_t bVal = lv_cmake(bReal, -bImag);
++ lv_32fc_t temp = aVal * bVal;
++
++ *cFloatPtr++ = lv_creal(temp) / scalar;
++ *cFloatPtr++ = lv_cimag(temp) / scalar;
++ }
+ }
+-#endif /* LV_HAVE_AVX2*/
++#endif /* LV_HAVE_AVX2*/
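The two _mm256_permute2f128_ps() calls with immediates 0b00100000 and 0b00110001 recombine the 128-bit lanes so the stored output comes back in natural order after the per-lane unpacklo/unpackhi. A small standalone sketch of that lane selection (illustrative; needs AVX, values are made up):

#include <immintrin.h>
#include <stdio.h>

int main(void)
{
    float lo[8] = { 0, 1, 2, 3, 8, 9, 10, 11 };   /* lane0 | lane1 */
    float hi[8] = { 4, 5, 6, 7, 12, 13, 14, 15 };
    float out[16];
    __m256 retlo = _mm256_loadu_ps(lo);
    __m256 rethi = _mm256_loadu_ps(hi);
    _mm256_storeu_ps(out, _mm256_permute2f128_ps(retlo, rethi, 0b00100000));
    _mm256_storeu_ps(out + 8, _mm256_permute2f128_ps(retlo, rethi, 0b00110001));
    /* prints 0 through 15 in order */
    for (int i = 0; i < 16; i++)
        printf("%g ", out[i]);
    printf("\n");
    return 0;
}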
+
+
+ #ifdef LV_HAVE_SSE4_1
+ #include <smmintrin.h>
+
+ static inline void
+-volk_8ic_x2_s32f_multiply_conjugate_32fc_a_sse4_1(lv_32fc_t* cVector, const lv_8sc_t* aVector,
+- const lv_8sc_t* bVector, const float scalar,
++volk_8ic_x2_s32f_multiply_conjugate_32fc_a_sse4_1(lv_32fc_t* cVector,
++ const lv_8sc_t* aVector,
++ const lv_8sc_t* bVector,
++ const float scalar,
+ unsigned int num_points)
+ {
+- unsigned int number = 0;
+- const unsigned int quarterPoints = num_points / 4;
+-
+- __m128i x, y, realz, imagz;
+- __m128 ret;
+- lv_32fc_t* c = cVector;
+- const lv_8sc_t* a = aVector;
+- const lv_8sc_t* b = bVector;
+- __m128i conjugateSign = _mm_set_epi16(-1, 1, -1, 1, -1, 1, -1, 1);
+-
+- __m128 invScalar = _mm_set_ps1(1.0/scalar);
+-
+- for(;number < quarterPoints; number++){
+- // Convert into 8 bit values into 16 bit values
+- x = _mm_cvtepi8_epi16(_mm_loadl_epi64((__m128i*)a));
+- y = _mm_cvtepi8_epi16(_mm_loadl_epi64((__m128i*)b));
+-
+- // Calculate the ar*cr - ai*(-ci) portions
+- realz = _mm_madd_epi16(x,y);
+-
+- // Calculate the complex conjugate of the cr + ci j values
+- y = _mm_sign_epi16(y, conjugateSign);
+-
+- // Shift the order of the cr and ci values
+- y = _mm_shufflehi_epi16(_mm_shufflelo_epi16(y, _MM_SHUFFLE(2,3,0,1) ), _MM_SHUFFLE(2,3,0,1));
+-
+- // Calculate the ar*(-ci) + cr*(ai)
+- imagz = _mm_madd_epi16(x,y);
+-
+- // Interleave real and imaginary and then convert to float values
+- ret = _mm_cvtepi32_ps(_mm_unpacklo_epi32(realz, imagz));
+-
+- // Normalize the floating point values
+- ret = _mm_mul_ps(ret, invScalar);
+-
+- // Store the floating point values
+- _mm_store_ps((float*)c, ret);
+- c += 2;
+-
+- // Interleave real and imaginary and then convert to float values
+- ret = _mm_cvtepi32_ps(_mm_unpackhi_epi32(realz, imagz));
+-
+- // Normalize the floating point values
+- ret = _mm_mul_ps(ret, invScalar);
+-
+- // Store the floating point values
+- _mm_store_ps((float*)c, ret);
+- c += 2;
+-
+- a += 4;
+- b += 4;
+- }
+-
+- number = quarterPoints * 4;
+- float* cFloatPtr = (float*)&cVector[number];
+- int8_t* a8Ptr = (int8_t*)&aVector[number];
+- int8_t* b8Ptr = (int8_t*)&bVector[number];
+- for(; number < num_points; number++){
+- float aReal = (float)*a8Ptr++;
+- float aImag = (float)*a8Ptr++;
+- lv_32fc_t aVal = lv_cmake(aReal, aImag );
+- float bReal = (float)*b8Ptr++;
+- float bImag = (float)*b8Ptr++;
+- lv_32fc_t bVal = lv_cmake( bReal, -bImag );
+- lv_32fc_t temp = aVal * bVal;
+-
+- *cFloatPtr++ = lv_creal(temp) / scalar;
+- *cFloatPtr++ = lv_cimag(temp) / scalar;
+- }
++ unsigned int number = 0;
++ const unsigned int quarterPoints = num_points / 4;
++
++ __m128i x, y, realz, imagz;
++ __m128 ret;
++ lv_32fc_t* c = cVector;
++ const lv_8sc_t* a = aVector;
++ const lv_8sc_t* b = bVector;
++ __m128i conjugateSign = _mm_set_epi16(-1, 1, -1, 1, -1, 1, -1, 1);
++
++ __m128 invScalar = _mm_set_ps1(1.0 / scalar);
++
++ for (; number < quarterPoints; number++) {
++        // Convert 8 bit values into 16 bit values
++ x = _mm_cvtepi8_epi16(_mm_loadl_epi64((__m128i*)a));
++ y = _mm_cvtepi8_epi16(_mm_loadl_epi64((__m128i*)b));
++
++ // Calculate the ar*cr - ai*(-ci) portions
++ realz = _mm_madd_epi16(x, y);
++
++ // Calculate the complex conjugate of the cr + ci j values
++ y = _mm_sign_epi16(y, conjugateSign);
++
++ // Shift the order of the cr and ci values
++ y = _mm_shufflehi_epi16(_mm_shufflelo_epi16(y, _MM_SHUFFLE(2, 3, 0, 1)),
++ _MM_SHUFFLE(2, 3, 0, 1));
++
++ // Calculate the ar*(-ci) + cr*(ai)
++ imagz = _mm_madd_epi16(x, y);
++
++ // Interleave real and imaginary and then convert to float values
++ ret = _mm_cvtepi32_ps(_mm_unpacklo_epi32(realz, imagz));
++
++ // Normalize the floating point values
++ ret = _mm_mul_ps(ret, invScalar);
++
++ // Store the floating point values
++ _mm_store_ps((float*)c, ret);
++ c += 2;
++
++ // Interleave real and imaginary and then convert to float values
++ ret = _mm_cvtepi32_ps(_mm_unpackhi_epi32(realz, imagz));
++
++ // Normalize the floating point values
++ ret = _mm_mul_ps(ret, invScalar);
++
++ // Store the floating point values
++ _mm_store_ps((float*)c, ret);
++ c += 2;
++
++ a += 4;
++ b += 4;
++ }
++
++ number = quarterPoints * 4;
++ float* cFloatPtr = (float*)&cVector[number];
++ int8_t* a8Ptr = (int8_t*)&aVector[number];
++ int8_t* b8Ptr = (int8_t*)&bVector[number];
++ for (; number < num_points; number++) {
++ float aReal = (float)*a8Ptr++;
++ float aImag = (float)*a8Ptr++;
++ lv_32fc_t aVal = lv_cmake(aReal, aImag);
++ float bReal = (float)*b8Ptr++;
++ float bImag = (float)*b8Ptr++;
++ lv_32fc_t bVal = lv_cmake(bReal, -bImag);
++ lv_32fc_t temp = aVal * bVal;
++
++ *cFloatPtr++ = lv_creal(temp) / scalar;
++ *cFloatPtr++ = lv_cimag(temp) / scalar;
++ }
+ }
+ #endif /* LV_HAVE_SSE4_1 */
+
+@@ -225,27 +233,29 @@ volk_8ic_x2_s32f_multiply_conjugate_32fc_a_sse4_1(lv_32fc_t* cVector, const lv_8
+ #ifdef LV_HAVE_GENERIC
+
+ static inline void
+-volk_8ic_x2_s32f_multiply_conjugate_32fc_generic(lv_32fc_t* cVector, const lv_8sc_t* aVector,
+- const lv_8sc_t* bVector, const float scalar,
++volk_8ic_x2_s32f_multiply_conjugate_32fc_generic(lv_32fc_t* cVector,
++ const lv_8sc_t* aVector,
++ const lv_8sc_t* bVector,
++ const float scalar,
+ unsigned int num_points)
+ {
+- unsigned int number = 0;
+- float* cPtr = (float*)cVector;
+- const float invScalar = 1.0 / scalar;
+- int8_t* a8Ptr = (int8_t*)aVector;
+- int8_t* b8Ptr = (int8_t*)bVector;
+- for(number = 0; number < num_points; number++){
+- float aReal = (float)*a8Ptr++;
+- float aImag = (float)*a8Ptr++;
+- lv_32fc_t aVal = lv_cmake(aReal, aImag );
+- float bReal = (float)*b8Ptr++;
+- float bImag = (float)*b8Ptr++;
+- lv_32fc_t bVal = lv_cmake( bReal, -bImag );
+- lv_32fc_t temp = aVal * bVal;
+-
+- *cPtr++ = (lv_creal(temp) * invScalar);
+- *cPtr++ = (lv_cimag(temp) * invScalar);
+- }
++ unsigned int number = 0;
++ float* cPtr = (float*)cVector;
++ const float invScalar = 1.0 / scalar;
++ int8_t* a8Ptr = (int8_t*)aVector;
++ int8_t* b8Ptr = (int8_t*)bVector;
++ for (number = 0; number < num_points; number++) {
++ float aReal = (float)*a8Ptr++;
++ float aImag = (float)*a8Ptr++;
++ lv_32fc_t aVal = lv_cmake(aReal, aImag);
++ float bReal = (float)*b8Ptr++;
++ float bImag = (float)*b8Ptr++;
++ lv_32fc_t bVal = lv_cmake(bReal, -bImag);
++ lv_32fc_t temp = aVal * bVal;
++
++ *cPtr++ = (lv_creal(temp) * invScalar);
++ *cPtr++ = (lv_cimag(temp) * invScalar);
++ }
+ }
+ #endif /* LV_HAVE_GENERIC */
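A minimal usage sketch for the scaled variant, assuming the usual VOLK dispatcher; the 128.0f scalar is only an example normalization for full-scale 8-bit samples, not something the kernel mandates:

#include <volk/volk.h>

static void example_scaled_conjugate_multiply(unsigned int num_points)
{
    lv_8sc_t* a = (lv_8sc_t*)volk_malloc(num_points * sizeof(lv_8sc_t), volk_get_alignment());
    lv_8sc_t* b = (lv_8sc_t*)volk_malloc(num_points * sizeof(lv_8sc_t), volk_get_alignment());
    lv_32fc_t* c = (lv_32fc_t*)volk_malloc(num_points * sizeof(lv_32fc_t), volk_get_alignment());

    /* ... fill a and b with interleaved 8-bit complex samples ... */

    /* each output is a[i] * conj(b[i]) scaled by 1/128 */
    volk_8ic_x2_s32f_multiply_conjugate_32fc(c, a, b, 128.0f, num_points);

    volk_free(a);
    volk_free(b);
    volk_free(c);
}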
+
+@@ -263,81 +273,85 @@ volk_8ic_x2_s32f_multiply_conjugate_32fc_generic(lv_32fc_t* cVector, const lv_8s
+ #include <immintrin.h>
+
+ static inline void
+-volk_8ic_x2_s32f_multiply_conjugate_32fc_u_avx2(lv_32fc_t* cVector, const lv_8sc_t* aVector,
+- const lv_8sc_t* bVector, const float scalar,
+- unsigned int num_points)
++volk_8ic_x2_s32f_multiply_conjugate_32fc_u_avx2(lv_32fc_t* cVector,
++ const lv_8sc_t* aVector,
++ const lv_8sc_t* bVector,
++ const float scalar,
++ unsigned int num_points)
+ {
+- unsigned int number = 0;
+- const unsigned int oneEigthPoints = num_points / 8;
+-
+- __m256i x, y, realz, imagz;
+- __m256 ret, retlo, rethi;
+- lv_32fc_t* c = cVector;
+- const lv_8sc_t* a = aVector;
+- const lv_8sc_t* b = bVector;
+- __m256i conjugateSign = _mm256_set_epi16(-1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1);
+-
+- __m256 invScalar = _mm256_set1_ps(1.0/scalar);
+-
+- for(;number < oneEigthPoints; number++){
+- // Convert 8 bit values into 16 bit values
+- x = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)a));
+- y = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)b));
+-
+- // Calculate the ar*cr - ai*(-ci) portions
+- realz = _mm256_madd_epi16(x,y);
+-
+- // Calculate the complex conjugate of the cr + ci j values
+- y = _mm256_sign_epi16(y, conjugateSign);
+-
+- // Shift the order of the cr and ci values
+- y = _mm256_shufflehi_epi16(_mm256_shufflelo_epi16(y, _MM_SHUFFLE(2,3,0,1) ), _MM_SHUFFLE(2,3,0,1));
+-
+- // Calculate the ar*(-ci) + cr*(ai)
+- imagz = _mm256_madd_epi16(x,y);
+-
+- // Interleave real and imaginary and then convert to float values
+- retlo = _mm256_cvtepi32_ps(_mm256_unpacklo_epi32(realz, imagz));
+-
+- // Normalize the floating point values
+- retlo = _mm256_mul_ps(retlo, invScalar);
+-
+- // Interleave real and imaginary and then convert to float values
+- rethi = _mm256_cvtepi32_ps(_mm256_unpackhi_epi32(realz, imagz));
+-
+- // Normalize the floating point values
+- rethi = _mm256_mul_ps(rethi, invScalar);
+-
+- ret = _mm256_permute2f128_ps(retlo, rethi, 0b00100000);
+- _mm256_storeu_ps((float*)c, ret);
+- c += 4;
+-
+- ret = _mm256_permute2f128_ps(retlo, rethi, 0b00110001);
+- _mm256_storeu_ps((float*)c, ret);
+- c += 4;
+-
+- a += 8;
+- b += 8;
+- }
+-
+- number = oneEigthPoints * 8;
+- float* cFloatPtr = (float*)&cVector[number];
+- int8_t* a8Ptr = (int8_t*)&aVector[number];
+- int8_t* b8Ptr = (int8_t*)&bVector[number];
+- for(; number < num_points; number++){
+- float aReal = (float)*a8Ptr++;
+- float aImag = (float)*a8Ptr++;
+- lv_32fc_t aVal = lv_cmake(aReal, aImag );
+- float bReal = (float)*b8Ptr++;
+- float bImag = (float)*b8Ptr++;
+- lv_32fc_t bVal = lv_cmake( bReal, -bImag );
+- lv_32fc_t temp = aVal * bVal;
+-
+- *cFloatPtr++ = lv_creal(temp) / scalar;
+- *cFloatPtr++ = lv_cimag(temp) / scalar;
+- }
++ unsigned int number = 0;
++ const unsigned int oneEigthPoints = num_points / 8;
++
++ __m256i x, y, realz, imagz;
++ __m256 ret, retlo, rethi;
++ lv_32fc_t* c = cVector;
++ const lv_8sc_t* a = aVector;
++ const lv_8sc_t* b = bVector;
++ __m256i conjugateSign =
++ _mm256_set_epi16(-1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1);
++
++ __m256 invScalar = _mm256_set1_ps(1.0 / scalar);
++
++ for (; number < oneEigthPoints; number++) {
++ // Convert 8 bit values into 16 bit values
++ x = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)a));
++ y = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)b));
++
++ // Calculate the ar*cr - ai*(-ci) portions
++ realz = _mm256_madd_epi16(x, y);
++
++ // Calculate the complex conjugate of the cr + ci j values
++ y = _mm256_sign_epi16(y, conjugateSign);
++
++ // Shift the order of the cr and ci values
++ y = _mm256_shufflehi_epi16(_mm256_shufflelo_epi16(y, _MM_SHUFFLE(2, 3, 0, 1)),
++ _MM_SHUFFLE(2, 3, 0, 1));
++
++ // Calculate the ar*(-ci) + cr*(ai)
++ imagz = _mm256_madd_epi16(x, y);
++
++ // Interleave real and imaginary and then convert to float values
++ retlo = _mm256_cvtepi32_ps(_mm256_unpacklo_epi32(realz, imagz));
++
++ // Normalize the floating point values
++ retlo = _mm256_mul_ps(retlo, invScalar);
++
++ // Interleave real and imaginary and then convert to float values
++ rethi = _mm256_cvtepi32_ps(_mm256_unpackhi_epi32(realz, imagz));
++
++ // Normalize the floating point values
++ rethi = _mm256_mul_ps(rethi, invScalar);
++
++ ret = _mm256_permute2f128_ps(retlo, rethi, 0b00100000);
++ _mm256_storeu_ps((float*)c, ret);
++ c += 4;
++
++ ret = _mm256_permute2f128_ps(retlo, rethi, 0b00110001);
++ _mm256_storeu_ps((float*)c, ret);
++ c += 4;
++
++ a += 8;
++ b += 8;
++ }
++
++ number = oneEigthPoints * 8;
++ float* cFloatPtr = (float*)&cVector[number];
++ int8_t* a8Ptr = (int8_t*)&aVector[number];
++ int8_t* b8Ptr = (int8_t*)&bVector[number];
++ for (; number < num_points; number++) {
++ float aReal = (float)*a8Ptr++;
++ float aImag = (float)*a8Ptr++;
++ lv_32fc_t aVal = lv_cmake(aReal, aImag);
++ float bReal = (float)*b8Ptr++;
++ float bImag = (float)*b8Ptr++;
++ lv_32fc_t bVal = lv_cmake(bReal, -bImag);
++ lv_32fc_t temp = aVal * bVal;
++
++ *cFloatPtr++ = lv_creal(temp) / scalar;
++ *cFloatPtr++ = lv_cimag(temp) / scalar;
++ }
+ }
+-#endif /* LV_HAVE_AVX2*/
++#endif /* LV_HAVE_AVX2*/
+
+
+ #endif /* INCLUDED_volk_8ic_x2_s32f_multiply_conjugate_32fc_u_H */
+diff --git a/kernels/volk/volk_8u_conv_k7_r2puppet_8u.h b/kernels/volk/volk_8u_conv_k7_r2puppet_8u.h
+index 00f83de..69287cd 100644
+--- a/kernels/volk/volk_8u_conv_k7_r2puppet_8u.h
++++ b/kernels/volk/volk_8u_conv_k7_r2puppet_8u.h
+@@ -23,21 +23,21 @@
+ #ifndef INCLUDED_volk_8u_conv_k7_r2puppet_8u_H
+ #define INCLUDED_volk_8u_conv_k7_r2puppet_8u_H
+
++#include <string.h>
+ #include <volk/volk.h>
+ #include <volk/volk_8u_x4_conv_k7_r2_8u.h>
+-#include <string.h>
+
+ typedef union {
+- //decision_t is a BIT vector
+- unsigned char* t;
+- unsigned int* w;
++ // decision_t is a BIT vector
++ unsigned char* t;
++ unsigned int* w;
+ } p_decision_t;
+
+ static inline int parity(int x, unsigned char* Partab)
+ {
+- x ^= (x >> 16);
+- x ^= (x >> 8);
+- return Partab[x];
++ x ^= (x >> 16);
++ x ^= (x >> 8);
++ return Partab[x];
+ }
+
+ static inline int chainback_viterbi(unsigned char* data,
+@@ -46,135 +46,143 @@ static inline int chainback_viterbi(unsigned char* data,
+ unsigned int tailsize,
+ unsigned char* decisions)
+ {
+- unsigned char* d;
+- int d_ADDSHIFT = 0;
+- int d_numstates = (1 << 6);
+- int d_decision_t_size = d_numstates/8;
+- unsigned int d_k = 7;
+- int d_framebits = nbits;
+- /* ADDSHIFT and SUBSHIFT make sure that the thing returned is a byte. */
+- d = decisions;
+- /* Make room beyond the end of the encoder register so we can
+- * accumulate a full byte of decoded data
+- */
+-
+- endstate = (endstate%d_numstates) << d_ADDSHIFT;
+-
+- /* The store into data[] only needs to be done every 8 bits.
+- * But this avoids a conditional branch, and the writes will
+- * combine in the cache anyway
+- */
+-
+- d += tailsize * d_decision_t_size ; /* Look past tail */
+- int retval;
+- int dif = tailsize - (d_k - 1);
+- //printf("break, %d, %d\n", dif, (nbits+dif)%d_framebits);
+- p_decision_t dec;
+- while(nbits-- > d_framebits - (d_k - 1)) {
+- int k;
+- dec.t = &d[nbits * d_decision_t_size];
+- k = (dec.w[(endstate>>d_ADDSHIFT)/32] >> ((endstate>>d_ADDSHIFT)%32)) & 1;
+-
+- endstate = (endstate >> 1) | (k << (d_k-2+d_ADDSHIFT));
+- //data[((nbits+dif)%nbits)>>3] = endstate>>d_SUBSHIFT;
+- //printf("%d, %d\n", k, (nbits+dif)%d_framebits);
+- data[((nbits+dif)%d_framebits)] = k;
+-
+- retval = endstate;
+- }
+- nbits += 1;
+-
+- while(nbits-- != 0) {
+- int k;
+-
+- dec.t = &d[nbits * d_decision_t_size];
+-
+- k = (dec.w[(endstate>>d_ADDSHIFT)/32] >> ((endstate>>d_ADDSHIFT)%32)) & 1;
+-
+- endstate = (endstate >> 1) | (k << (d_k-2+d_ADDSHIFT));
+- data[((nbits+dif)%d_framebits)] = k;
+- }
+- //printf("%d, %d, %d, %d, %d, %d, %d, %d\n", data[4095],data[4094],data[4093],data[4092],data[4091],data[4090],data[4089],data[4088]);
+-
+-
+- return retval >> d_ADDSHIFT;
++ unsigned char* d;
++ int d_ADDSHIFT = 0;
++ int d_numstates = (1 << 6);
++ int d_decision_t_size = d_numstates / 8;
++ unsigned int d_k = 7;
++ int d_framebits = nbits;
++ /* ADDSHIFT and SUBSHIFT make sure that the thing returned is a byte. */
++ d = decisions;
++ /* Make room beyond the end of the encoder register so we can
++ * accumulate a full byte of decoded data
++ */
++
++ endstate = (endstate % d_numstates) << d_ADDSHIFT;
++
++ /* The store into data[] only needs to be done every 8 bits.
++ * But this avoids a conditional branch, and the writes will
++ * combine in the cache anyway
++ */
++
++ d += tailsize * d_decision_t_size; /* Look past tail */
++ int retval;
++ int dif = tailsize - (d_k - 1);
++ // printf("break, %d, %d\n", dif, (nbits+dif)%d_framebits);
++ p_decision_t dec;
++ while (nbits-- > d_framebits - (d_k - 1)) {
++ int k;
++ dec.t = &d[nbits * d_decision_t_size];
++ k = (dec.w[(endstate >> d_ADDSHIFT) / 32] >> ((endstate >> d_ADDSHIFT) % 32)) & 1;
++
++ endstate = (endstate >> 1) | (k << (d_k - 2 + d_ADDSHIFT));
++ // data[((nbits+dif)%nbits)>>3] = endstate>>d_SUBSHIFT;
++ // printf("%d, %d\n", k, (nbits+dif)%d_framebits);
++ data[((nbits + dif) % d_framebits)] = k;
++
++ retval = endstate;
++ }
++ nbits += 1;
++
++ while (nbits-- != 0) {
++ int k;
++
++ dec.t = &d[nbits * d_decision_t_size];
++
++ k = (dec.w[(endstate >> d_ADDSHIFT) / 32] >> ((endstate >> d_ADDSHIFT) % 32)) & 1;
++
++ endstate = (endstate >> 1) | (k << (d_k - 2 + d_ADDSHIFT));
++ data[((nbits + dif) % d_framebits)] = k;
++ }
++ // printf("%d, %d, %d, %d, %d, %d, %d, %d\n",
++ // data[4095],data[4094],data[4093],data[4092],data[4091],data[4090],data[4089],data[4088]);
++
++
++ return retval >> d_ADDSHIFT;
+ }
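The bit extraction inside the two chainback loops above indexes a packed array of 32-bit decision words. Written as a standalone helper (illustrative only; the kernel keeps it inline) it is simply:

static inline int decision_bit(const unsigned int* w, unsigned int state)
{
    /* bit `state` of the packed decision bit-vector for one trellis step */
    return (w[state / 32] >> (state % 32)) & 1;
}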
+
+
+ #if LV_HAVE_SSE3
+
+-#include <pmmintrin.h>
+ #include <emmintrin.h>
+-#include <xmmintrin.h>
+ #include <mmintrin.h>
++#include <pmmintrin.h>
+ #include <stdio.h>
++#include <xmmintrin.h>
+
+-static inline void volk_8u_conv_k7_r2puppet_8u_spiral(unsigned char* syms, unsigned char* dec, unsigned int framebits) {
+-
+-
+- static int once = 1;
+- int d_numstates = (1 << 6);
+- int rate = 2;
+- static unsigned char* D;
+- static unsigned char* Y;
+- static unsigned char* X;
+- static unsigned int excess = 6;
+- static unsigned char* Branchtab;
+- static unsigned char Partab[256];
+-
+- int d_polys[2] = {79, 109};
+-
+-
+- if(once) {
+-
+- X = (unsigned char*)volk_malloc(2*d_numstates, volk_get_alignment());
+- Y = X + d_numstates;
+- Branchtab = (unsigned char*)volk_malloc(d_numstates/2*rate, volk_get_alignment());
+- D = (unsigned char*)volk_malloc((d_numstates/8) * (framebits + 6), volk_get_alignment());
+- int state, i;
+- int cnt,ti;
+-
+- /* Initialize parity lookup table */
+- for(i=0;i<256;i++){
+- cnt = 0;
+- ti = i;
+- while(ti){
+- if(ti & 1)
+- cnt++;
+- ti >>= 1;
+- }
+- Partab[i] = cnt & 1;
+- }
+- /* Initialize the branch table */
+- for(state=0;state < d_numstates/2;state++){
+- for(i=0; i<rate; i++){
+- Branchtab[i*d_numstates/2+state] = parity((2*state) & d_polys[i], Partab) ? 255 : 0;
+- }
+- }
++static inline void volk_8u_conv_k7_r2puppet_8u_spiral(unsigned char* syms,
++ unsigned char* dec,
++ unsigned int framebits)
++{
+
+- once = 0;
+- }
+
+- //unbias the old_metrics
+- memset(X, 31, d_numstates);
++ static int once = 1;
++ int d_numstates = (1 << 6);
++ int rate = 2;
++ static unsigned char* D;
++ static unsigned char* Y;
++ static unsigned char* X;
++ static unsigned int excess = 6;
++ static unsigned char* Branchtab;
++ static unsigned char Partab[256];
++
++ int d_polys[2] = { 79, 109 };
++
++
++ if (once) {
++
++ X = (unsigned char*)volk_malloc(2 * d_numstates, volk_get_alignment());
++ Y = X + d_numstates;
++ Branchtab =
++ (unsigned char*)volk_malloc(d_numstates / 2 * rate, volk_get_alignment());
++ D = (unsigned char*)volk_malloc((d_numstates / 8) * (framebits + 6),
++ volk_get_alignment());
++ int state, i;
++ int cnt, ti;
++
++ /* Initialize parity lookup table */
++ for (i = 0; i < 256; i++) {
++ cnt = 0;
++ ti = i;
++ while (ti) {
++ if (ti & 1)
++ cnt++;
++ ti >>= 1;
++ }
++ Partab[i] = cnt & 1;
++ }
++ /* Initialize the branch table */
++ for (state = 0; state < d_numstates / 2; state++) {
++ for (i = 0; i < rate; i++) {
++ Branchtab[i * d_numstates / 2 + state] =
++ parity((2 * state) & d_polys[i], Partab) ? 255 : 0;
++ }
++ }
++
++ once = 0;
++ }
++
++ // unbias the old_metrics
++ memset(X, 31, d_numstates);
+
+- // initialize decisions
+- memset(D, 0, (d_numstates/8) * (framebits + 6));
++ // initialize decisions
++ memset(D, 0, (d_numstates / 8) * (framebits + 6));
+
+- volk_8u_x4_conv_k7_r2_8u_spiral(Y, X, syms, D, framebits/2 - excess, excess, Branchtab);
++ volk_8u_x4_conv_k7_r2_8u_spiral(
++ Y, X, syms, D, framebits / 2 - excess, excess, Branchtab);
+
+- unsigned int min = X[0];
+- int i = 0, state = 0;
+- for(i = 0; i < (d_numstates); ++i) {
+- if(X[i] < min) {
+- min = X[i];
+- state = i;
++ unsigned int min = X[0];
++ int i = 0, state = 0;
++ for (i = 0; i < (d_numstates); ++i) {
++ if (X[i] < min) {
++ min = X[i];
++ state = i;
++ }
+ }
+- }
+
+- chainback_viterbi(dec, framebits/2 -excess, state, excess, D);
++ chainback_viterbi(dec, framebits / 2 - excess, state, excess, D);
+
+- return;
++ return;
+ }
+
+ #endif /*LV_HAVE_SSE3*/
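To make the branch-table initialization above concrete: with d_polys[0] = 79 (binary 1001111) and state = 1, (2*state) & 79 equals 2, which has odd parity, so Branchtab[1] is set to 255. A hypothetical check built on the parity() helper defined earlier in this header (illustrative only; assumes Partab has already been filled by the init loop):

#include <assert.h>

static void check_branchtab_entry(unsigned char* Partab)
{
    assert((parity((2 * 1) & 79, Partab) ? 255 : 0) == 255);
}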
+@@ -185,151 +193,161 @@ static inline void volk_8u_conv_k7_r2puppet_8u_spiral(unsigned char* syms, unsig
+ #include <immintrin.h>
+ #include <stdio.h>
+
+-static inline void volk_8u_conv_k7_r2puppet_8u_avx2(unsigned char* syms, unsigned char* dec, unsigned int framebits) {
+-
+-
+- static int once = 1;
+- int d_numstates = (1 << 6);
+- int rate = 2;
+- static unsigned char* D;
+- static unsigned char* Y;
+- static unsigned char* X;
+- static unsigned int excess = 6;
+- static unsigned char* Branchtab;
+- static unsigned char Partab[256];
+-
+- int d_polys[2] = {79, 109};
+-
+-
+- if(once) {
+-
+- X = (unsigned char*)volk_malloc(2*d_numstates, volk_get_alignment());
+- Y = X + d_numstates;
+- Branchtab = (unsigned char*)volk_malloc(d_numstates/2*rate, volk_get_alignment());
+- D = (unsigned char*)volk_malloc((d_numstates/8) * (framebits + 6), volk_get_alignment());
+- int state, i;
+- int cnt,ti;
+-
+- /* Initialize parity lookup table */
+- for(i=0;i<256;i++){
+- cnt = 0;
+- ti = i;
+- while(ti){
+- if(ti & 1)
+- cnt++;
+- ti >>= 1;
+- }
+- Partab[i] = cnt & 1;
+- }
+- /* Initialize the branch table */
+- for(state=0;state < d_numstates/2;state++){
+- for(i=0; i<rate; i++){
+- Branchtab[i*d_numstates/2+state] = parity((2*state) & d_polys[i], Partab) ? 255 : 0;
+- }
+- }
++static inline void volk_8u_conv_k7_r2puppet_8u_avx2(unsigned char* syms,
++ unsigned char* dec,
++ unsigned int framebits)
++{
+
+- once = 0;
+- }
+
+- //unbias the old_metrics
+- memset(X, 31, d_numstates);
++ static int once = 1;
++ int d_numstates = (1 << 6);
++ int rate = 2;
++ static unsigned char* D;
++ static unsigned char* Y;
++ static unsigned char* X;
++ static unsigned int excess = 6;
++ static unsigned char* Branchtab;
++ static unsigned char Partab[256];
++
++ int d_polys[2] = { 79, 109 };
++
++
++ if (once) {
++
++ X = (unsigned char*)volk_malloc(2 * d_numstates, volk_get_alignment());
++ Y = X + d_numstates;
++ Branchtab =
++ (unsigned char*)volk_malloc(d_numstates / 2 * rate, volk_get_alignment());
++ D = (unsigned char*)volk_malloc((d_numstates / 8) * (framebits + 6),
++ volk_get_alignment());
++ int state, i;
++ int cnt, ti;
++
++ /* Initialize parity lookup table */
++ for (i = 0; i < 256; i++) {
++ cnt = 0;
++ ti = i;
++ while (ti) {
++ if (ti & 1)
++ cnt++;
++ ti >>= 1;
++ }
++ Partab[i] = cnt & 1;
++ }
++ /* Initialize the branch table */
++ for (state = 0; state < d_numstates / 2; state++) {
++ for (i = 0; i < rate; i++) {
++ Branchtab[i * d_numstates / 2 + state] =
++ parity((2 * state) & d_polys[i], Partab) ? 255 : 0;
++ }
++ }
++
++ once = 0;
++ }
++
++ // unbias the old_metrics
++ memset(X, 31, d_numstates);
+
+- // initialize decisions
+- memset(D, 0, (d_numstates/8) * (framebits + 6));
++ // initialize decisions
++ memset(D, 0, (d_numstates / 8) * (framebits + 6));
+
+- volk_8u_x4_conv_k7_r2_8u_avx2(Y, X, syms, D, framebits/2 - excess, excess, Branchtab);
++ volk_8u_x4_conv_k7_r2_8u_avx2(
++ Y, X, syms, D, framebits / 2 - excess, excess, Branchtab);
+
+- unsigned int min = X[0];
+- int i = 0, state = 0;
+- for(i = 0; i < (d_numstates); ++i) {
+- if(X[i] < min) {
+- min = X[i];
+- state = i;
++ unsigned int min = X[0];
++ int i = 0, state = 0;
++ for (i = 0; i < (d_numstates); ++i) {
++ if (X[i] < min) {
++ min = X[i];
++ state = i;
++ }
+ }
+- }
+
+- chainback_viterbi(dec, framebits/2 -excess, state, excess, D);
++ chainback_viterbi(dec, framebits / 2 - excess, state, excess, D);
+
+- return;
++ return;
+ }
+
+ #endif /*LV_HAVE_AVX2*/
+
+
+-
+ #if LV_HAVE_GENERIC
+
+
+-static inline void volk_8u_conv_k7_r2puppet_8u_generic(unsigned char* syms, unsigned char* dec, unsigned int framebits) {
+-
+-
+-
+- static int once = 1;
+- int d_numstates = (1 << 6);
+- int rate = 2;
+- static unsigned char* Y;
+- static unsigned char* X;
+- static unsigned char* D;
+- static unsigned int excess = 6;
+- static unsigned char* Branchtab;
+- static unsigned char Partab[256];
+-
+- int d_polys[2] = {79, 109};
+-
+-
+- if(once) {
+-
+- X = (unsigned char*)volk_malloc(2*d_numstates, volk_get_alignment());
+- Y = X + d_numstates;
+- Branchtab = (unsigned char*)volk_malloc(d_numstates/2*rate, volk_get_alignment());
+- D = (unsigned char*)volk_malloc((d_numstates/8) * (framebits + 6), volk_get_alignment());
++static inline void volk_8u_conv_k7_r2puppet_8u_generic(unsigned char* syms,
++ unsigned char* dec,
++ unsigned int framebits)
++{
+
+- int state, i;
+- int cnt,ti;
+
+- /* Initialize parity lookup table */
+- for(i=0;i<256;i++){
+- cnt = 0;
+- ti = i;
+- while(ti){
+- if(ti & 1)
+- cnt++;
+- ti >>= 1;
+- }
+- Partab[i] = cnt & 1;
++ static int once = 1;
++ int d_numstates = (1 << 6);
++ int rate = 2;
++ static unsigned char* Y;
++ static unsigned char* X;
++ static unsigned char* D;
++ static unsigned int excess = 6;
++ static unsigned char* Branchtab;
++ static unsigned char Partab[256];
++
++ int d_polys[2] = { 79, 109 };
++
++
++ if (once) {
++
++ X = (unsigned char*)volk_malloc(2 * d_numstates, volk_get_alignment());
++ Y = X + d_numstates;
++ Branchtab =
++ (unsigned char*)volk_malloc(d_numstates / 2 * rate, volk_get_alignment());
++ D = (unsigned char*)volk_malloc((d_numstates / 8) * (framebits + 6),
++ volk_get_alignment());
++
++ int state, i;
++ int cnt, ti;
++
++ /* Initialize parity lookup table */
++ for (i = 0; i < 256; i++) {
++ cnt = 0;
++ ti = i;
++ while (ti) {
++ if (ti & 1)
++ cnt++;
++ ti >>= 1;
++ }
++ Partab[i] = cnt & 1;
++ }
++ /* Initialize the branch table */
++ for (state = 0; state < d_numstates / 2; state++) {
++ for (i = 0; i < rate; i++) {
++ Branchtab[i * d_numstates / 2 + state] =
++ parity((2 * state) & d_polys[i], Partab) ? 255 : 0;
++ }
++ }
++
++ once = 0;
+ }
+- /* Initialize the branch table */
+- for(state=0;state < d_numstates/2;state++){
+- for(i=0; i<rate; i++){
+- Branchtab[i*d_numstates/2+state] = parity((2*state) & d_polys[i], Partab) ? 255 : 0;
+- }
+- }
+-
+- once = 0;
+- }
+
+- //unbias the old_metrics
+- memset(X, 31, d_numstates);
++ // unbias the old_metrics
++ memset(X, 31, d_numstates);
+
+- // initialize decisions
+- memset(D, 0, (d_numstates/8) * (framebits + 6));
++ // initialize decisions
++ memset(D, 0, (d_numstates / 8) * (framebits + 6));
+
+- volk_8u_x4_conv_k7_r2_8u_generic(Y, X, syms, D, framebits/2 - excess, excess, Branchtab);
++ volk_8u_x4_conv_k7_r2_8u_generic(
++ Y, X, syms, D, framebits / 2 - excess, excess, Branchtab);
+
+- unsigned int min = X[0];
+- int i = 0, state = 0;
+- for(i = 0; i < (d_numstates); ++i) {
+- if(X[i] < min) {
+- min = X[i];
+- state = i;
++ unsigned int min = X[0];
++ int i = 0, state = 0;
++ for (i = 0; i < (d_numstates); ++i) {
++ if (X[i] < min) {
++ min = X[i];
++ state = i;
++ }
+ }
+- }
+-
+- chainback_viterbi(dec, framebits/2 -excess, state, excess, D);
+-
+- return;
+
++ chainback_viterbi(dec, framebits / 2 - excess, state, excess, D);
+
++ return;
+ }
+
+ #endif /* LV_HAVE_GENERIC */
+diff --git a/kernels/volk/volk_8u_x2_encodeframepolar_8u.h b/kernels/volk/volk_8u_x2_encodeframepolar_8u.h
+index bc176ec..e8d980d 100644
+--- a/kernels/volk/volk_8u_x2_encodeframepolar_8u.h
++++ b/kernels/volk/volk_8u_x2_encodeframepolar_8u.h
+@@ -28,172 +28,236 @@
+ #define VOLK_KERNELS_VOLK_VOLK_8U_X2_ENCODEFRAMEPOLAR_8U_U_H_
+ #include <string.h>
+
+-static inline unsigned int
+-log2_of_power_of_2(unsigned int val){
+- // algorithm from: http://graphics.stanford.edu/~seander/bithacks.html#IntegerLog
+- static const unsigned int b[] = {0xAAAAAAAA, 0xCCCCCCCC, 0xF0F0F0F0,
+- 0xFF00FF00, 0xFFFF0000};
+-
+- unsigned int res = (val & b[0]) != 0;
+- res |= ((val & b[4]) != 0) << 4;
+- res |= ((val & b[3]) != 0) << 3;
+- res |= ((val & b[2]) != 0) << 2;
+- res |= ((val & b[1]) != 0) << 1;
+- return res;
++static inline unsigned int log2_of_power_of_2(unsigned int val)
++{
++ // algorithm from: http://graphics.stanford.edu/~seander/bithacks.html#IntegerLog
++ static const unsigned int b[] = {
++ 0xAAAAAAAA, 0xCCCCCCCC, 0xF0F0F0F0, 0xFF00FF00, 0xFFFF0000
++ };
++
++ unsigned int res = (val & b[0]) != 0;
++ res |= ((val & b[4]) != 0) << 4;
++ res |= ((val & b[3]) != 0) << 3;
++ res |= ((val & b[2]) != 0) << 2;
++ res |= ((val & b[1]) != 0) << 1;
++ return res;
+ }
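log2_of_power_of_2() only returns a meaningful result when its argument is an exact power of two, which is all the polar encoder needs. A quick sanity-check sketch (illustrative, not part of the header):

#include <assert.h>

static void check_log2_of_power_of_2(void)
{
    assert(log2_of_power_of_2(1) == 0);
    assert(log2_of_power_of_2(2) == 1);
    assert(log2_of_power_of_2(1024) == 10);
}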
+
+-static inline void
+-encodepolar_single_stage(unsigned char* frame_ptr, const unsigned char* temp_ptr,
+- const unsigned int num_branches, const unsigned int frame_half)
++static inline void encodepolar_single_stage(unsigned char* frame_ptr,
++ const unsigned char* temp_ptr,
++ const unsigned int num_branches,
++ const unsigned int frame_half)
+ {
+- unsigned int branch, bit;
+- for(branch = 0; branch < num_branches; ++branch){
+- for(bit = 0; bit < frame_half; ++bit){
+- *frame_ptr = *temp_ptr ^ *(temp_ptr + 1);
+- *(frame_ptr + frame_half) = *(temp_ptr + 1);
+- ++frame_ptr;
+- temp_ptr += 2;
++ unsigned int branch, bit;
++ for (branch = 0; branch < num_branches; ++branch) {
++ for (bit = 0; bit < frame_half; ++bit) {
++ *frame_ptr = *temp_ptr ^ *(temp_ptr + 1);
++ *(frame_ptr + frame_half) = *(temp_ptr + 1);
++ ++frame_ptr;
++ temp_ptr += 2;
++ }
++ frame_ptr += frame_half;
+ }
+- frame_ptr += frame_half;
+- }
+ }
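encodepolar_single_stage() applies the (u XOR v, v) polar butterfly to consecutive pairs of temp and writes the XOR results into the first half and the pass-through values into the second half of each branch. A tiny standalone check (illustrative; assumes the definitions above are in scope):

#include <stdio.h>

int main(void)
{
    unsigned char temp[4] = { 1, 0, 1, 1 };
    unsigned char frame[4];
    encodepolar_single_stage(frame, temp, 1 /* num_branches */, 2 /* frame_half */);
    /* expected output: 1 0 0 1  ->  { 1^0, 1^1, 0, 1 } */
    printf("%u %u %u %u\n", frame[0], frame[1], frame[2], frame[3]);
    return 0;
}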
+
+ #ifdef LV_HAVE_GENERIC
+
+-static inline void
+-volk_8u_x2_encodeframepolar_8u_generic(unsigned char* frame, unsigned char* temp,
+- unsigned int frame_size)
++static inline void volk_8u_x2_encodeframepolar_8u_generic(unsigned char* frame,
++ unsigned char* temp,
++ unsigned int frame_size)
+ {
+- unsigned int stage = log2_of_power_of_2(frame_size);
+- unsigned int frame_half = frame_size >> 1;
+- unsigned int num_branches = 1;
+-
+- while(stage){
+- // encode stage
+- encodepolar_single_stage(frame, temp, num_branches, frame_half);
+- memcpy(temp, frame, sizeof(unsigned char) * frame_size);
+-
+- // update all the parameters.
+- num_branches = num_branches << 1;
+- frame_half = frame_half >> 1;
+- --stage;
+- }
++ unsigned int stage = log2_of_power_of_2(frame_size);
++ unsigned int frame_half = frame_size >> 1;
++ unsigned int num_branches = 1;
++
++ while (stage) {
++ // encode stage
++ encodepolar_single_stage(frame, temp, num_branches, frame_half);
++ memcpy(temp, frame, sizeof(unsigned char) * frame_size);
++
++ // update all the parameters.
++ num_branches = num_branches << 1;
++ frame_half = frame_half >> 1;
++ --stage;
++ }
+ }
+ #endif /* LV_HAVE_GENERIC */
+
+ #ifdef LV_HAVE_SSSE3
+ #include <tmmintrin.h>
+
+-static inline void
+-volk_8u_x2_encodeframepolar_8u_u_ssse3(unsigned char* frame, unsigned char* temp,
+- unsigned int frame_size)
++static inline void volk_8u_x2_encodeframepolar_8u_u_ssse3(unsigned char* frame,
++ unsigned char* temp,
++ unsigned int frame_size)
+ {
+- const unsigned int po2 = log2_of_power_of_2(frame_size);
+-
+- unsigned int stage = po2;
+- unsigned char* frame_ptr = frame;
+- unsigned char* temp_ptr = temp;
+-
+- unsigned int frame_half = frame_size >> 1;
+- unsigned int num_branches = 1;
+- unsigned int branch;
+- unsigned int bit;
+-
+- // prepare constants
+- const __m128i mask_stage1 = _mm_set_epi8(0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF);
+-
+- // get some SIMD registers to play with.
+- __m128i r_frame0, r_temp0, shifted;
+-
+- {
+- __m128i r_frame1, r_temp1;
+- const __m128i shuffle_separate = _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
+-
+- while(stage > 4){
+- frame_ptr = frame;
+- temp_ptr = temp;
+-
+- // for stage = 5 a branch has 32 elements. So upper stages are even bigger.
+- for(branch = 0; branch < num_branches; ++branch){
+- for(bit = 0; bit < frame_half; bit += 16){
+- r_temp0 = _mm_loadu_si128((__m128i *) temp_ptr);
+- temp_ptr += 16;
+- r_temp1 = _mm_loadu_si128((__m128i *) temp_ptr);
+- temp_ptr += 16;
+-
+- shifted = _mm_srli_si128(r_temp0, 1);
+- shifted = _mm_and_si128(shifted, mask_stage1);
+- r_temp0 = _mm_xor_si128(shifted, r_temp0);
+- r_temp0 = _mm_shuffle_epi8(r_temp0, shuffle_separate);
+-
+- shifted = _mm_srli_si128(r_temp1, 1);
+- shifted = _mm_and_si128(shifted, mask_stage1);
+- r_temp1 = _mm_xor_si128(shifted, r_temp1);
+- r_temp1 = _mm_shuffle_epi8(r_temp1, shuffle_separate);
+-
+- r_frame0 = _mm_unpacklo_epi64(r_temp0, r_temp1);
+- _mm_storeu_si128((__m128i*) frame_ptr, r_frame0);
+-
+- r_frame1 = _mm_unpackhi_epi64(r_temp0, r_temp1);
+- _mm_storeu_si128((__m128i*) (frame_ptr + frame_half), r_frame1);
+- frame_ptr += 16;
++ const unsigned int po2 = log2_of_power_of_2(frame_size);
++
++ unsigned int stage = po2;
++ unsigned char* frame_ptr = frame;
++ unsigned char* temp_ptr = temp;
++
++ unsigned int frame_half = frame_size >> 1;
++ unsigned int num_branches = 1;
++ unsigned int branch;
++ unsigned int bit;
++
++ // prepare constants
++ const __m128i mask_stage1 = _mm_set_epi8(0x0,
++ 0xFF,
++ 0x0,
++ 0xFF,
++ 0x0,
++ 0xFF,
++ 0x0,
++ 0xFF,
++ 0x0,
++ 0xFF,
++ 0x0,
++ 0xFF,
++ 0x0,
++ 0xFF,
++ 0x0,
++ 0xFF);
++
++ // get some SIMD registers to play with.
++ __m128i r_frame0, r_temp0, shifted;
++
++ {
++ __m128i r_frame1, r_temp1;
++ const __m128i shuffle_separate =
++ _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
++
++ while (stage > 4) {
++ frame_ptr = frame;
++ temp_ptr = temp;
++
++ // for stage = 5 a branch has 32 elements. So upper stages are even bigger.
++ for (branch = 0; branch < num_branches; ++branch) {
++ for (bit = 0; bit < frame_half; bit += 16) {
++ r_temp0 = _mm_loadu_si128((__m128i*)temp_ptr);
++ temp_ptr += 16;
++ r_temp1 = _mm_loadu_si128((__m128i*)temp_ptr);
++ temp_ptr += 16;
++
++ shifted = _mm_srli_si128(r_temp0, 1);
++ shifted = _mm_and_si128(shifted, mask_stage1);
++ r_temp0 = _mm_xor_si128(shifted, r_temp0);
++ r_temp0 = _mm_shuffle_epi8(r_temp0, shuffle_separate);
++
++ shifted = _mm_srli_si128(r_temp1, 1);
++ shifted = _mm_and_si128(shifted, mask_stage1);
++ r_temp1 = _mm_xor_si128(shifted, r_temp1);
++ r_temp1 = _mm_shuffle_epi8(r_temp1, shuffle_separate);
++
++ r_frame0 = _mm_unpacklo_epi64(r_temp0, r_temp1);
++ _mm_storeu_si128((__m128i*)frame_ptr, r_frame0);
++
++ r_frame1 = _mm_unpackhi_epi64(r_temp0, r_temp1);
++ _mm_storeu_si128((__m128i*)(frame_ptr + frame_half), r_frame1);
++ frame_ptr += 16;
++ }
++
++ frame_ptr += frame_half;
++ }
++ memcpy(temp, frame, sizeof(unsigned char) * frame_size);
++
++ num_branches = num_branches << 1;
++ frame_half = frame_half >> 1;
++ stage--;
+ }
+-
+- frame_ptr += frame_half;
+- }
+- memcpy(temp, frame, sizeof(unsigned char) * frame_size);
+-
+- num_branches = num_branches << 1;
+- frame_half = frame_half >> 1;
+- stage--;
+ }
+- }
+
+- // This last part requires at least 16-bit frames.
+- // Smaller frames are useless for SIMD optimization anyways. Just choose GENERIC!
++ // This last part requires at least 16-bit frames.
++ // Smaller frames are useless for SIMD optimization anyways. Just choose GENERIC!
+
+- // reset pointers to correct positions.
+- frame_ptr = frame;
+- temp_ptr = temp;
++ // reset pointers to correct positions.
++ frame_ptr = frame;
++ temp_ptr = temp;
+
+- // prefetch first chunk
+- __VOLK_PREFETCH(temp_ptr);
+-
+- const __m128i shuffle_stage4 = _mm_setr_epi8(0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15);
+- const __m128i mask_stage4 = _mm_set_epi8(0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
+- const __m128i mask_stage3 = _mm_set_epi8(0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF);
+- const __m128i mask_stage2 = _mm_set_epi8(0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF);
+-
+- for(branch = 0; branch < num_branches; ++branch){
+- r_temp0 = _mm_loadu_si128((__m128i*) temp_ptr);
+-
+- // prefetch next chunk
+- temp_ptr += 16;
++ // prefetch first chunk
+ __VOLK_PREFETCH(temp_ptr);
+
+- // shuffle once for bit-reversal.
+- r_temp0 = _mm_shuffle_epi8(r_temp0, shuffle_stage4);
+-
+- shifted = _mm_srli_si128(r_temp0, 8);
+- shifted = _mm_and_si128(shifted, mask_stage4);
+- r_frame0 = _mm_xor_si128(shifted, r_temp0);
+-
+- shifted = _mm_srli_si128(r_frame0, 4);
+- shifted = _mm_and_si128(shifted, mask_stage3);
+- r_frame0 = _mm_xor_si128(shifted, r_frame0);
+-
+- shifted = _mm_srli_si128(r_frame0, 2);
+- shifted = _mm_and_si128(shifted, mask_stage2);
+- r_frame0 = _mm_xor_si128(shifted, r_frame0);
+-
+- shifted = _mm_srli_si128(r_frame0, 1);
+- shifted = _mm_and_si128(shifted, mask_stage1);
+- r_frame0 = _mm_xor_si128(shifted, r_frame0);
+-
+- // store result of chunk.
+- _mm_storeu_si128((__m128i*)frame_ptr, r_frame0);
+- frame_ptr += 16;
+- }
++ const __m128i shuffle_stage4 =
++ _mm_setr_epi8(0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15);
++ const __m128i mask_stage4 = _mm_set_epi8(0x0,
++ 0x0,
++ 0x0,
++ 0x0,
++ 0x0,
++ 0x0,
++ 0x0,
++ 0x0,
++ 0xFF,
++ 0xFF,
++ 0xFF,
++ 0xFF,
++ 0xFF,
++ 0xFF,
++ 0xFF,
++ 0xFF);
++ const __m128i mask_stage3 = _mm_set_epi8(0x0,
++ 0x0,
++ 0x0,
++ 0x0,
++ 0xFF,
++ 0xFF,
++ 0xFF,
++ 0xFF,
++ 0x0,
++ 0x0,
++ 0x0,
++ 0x0,
++ 0xFF,
++ 0xFF,
++ 0xFF,
++ 0xFF);
++ const __m128i mask_stage2 = _mm_set_epi8(0x0,
++ 0x0,
++ 0xFF,
++ 0xFF,
++ 0x0,
++ 0x0,
++ 0xFF,
++ 0xFF,
++ 0x0,
++ 0x0,
++ 0xFF,
++ 0xFF,
++ 0x0,
++ 0x0,
++ 0xFF,
++ 0xFF);
++
++ for (branch = 0; branch < num_branches; ++branch) {
++ r_temp0 = _mm_loadu_si128((__m128i*)temp_ptr);
++
++ // prefetch next chunk
++ temp_ptr += 16;
++ __VOLK_PREFETCH(temp_ptr);
++
++ // shuffle once for bit-reversal.
++ r_temp0 = _mm_shuffle_epi8(r_temp0, shuffle_stage4);
++
++ shifted = _mm_srli_si128(r_temp0, 8);
++ shifted = _mm_and_si128(shifted, mask_stage4);
++ r_frame0 = _mm_xor_si128(shifted, r_temp0);
++
++ shifted = _mm_srli_si128(r_frame0, 4);
++ shifted = _mm_and_si128(shifted, mask_stage3);
++ r_frame0 = _mm_xor_si128(shifted, r_frame0);
++
++ shifted = _mm_srli_si128(r_frame0, 2);
++ shifted = _mm_and_si128(shifted, mask_stage2);
++ r_frame0 = _mm_xor_si128(shifted, r_frame0);
++
++ shifted = _mm_srli_si128(r_frame0, 1);
++ shifted = _mm_and_si128(shifted, mask_stage1);
++ r_frame0 = _mm_xor_si128(shifted, r_frame0);
++
++ // store result of chunk.
++ _mm_storeu_si128((__m128i*)frame_ptr, r_frame0);
++ frame_ptr += 16;
++ }
+ }
+
+ #endif /* LV_HAVE_SSSE3 */
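Each mask_stageN constant above selects the register bytes that receive the XOR half of one in-register butterfly stage: the shift/and/xor triple places (u XOR v) in the masked positions and leaves v untouched elsewhere. A standalone sketch of the 8-byte stage (illustrative, not from the patch; needs only SSE2):

#include <emmintrin.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
    uint8_t in[16] = { 1, 0, 1, 1, 0, 0, 1, 0,   /* low half:  u */
                       0, 1, 1, 0, 1, 0, 0, 1 }; /* high half: v */
    uint8_t out[16];
    const __m128i mask = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0,
                                      0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
    __m128i r = _mm_loadu_si128((const __m128i*)in);
    __m128i shifted = _mm_and_si128(_mm_srli_si128(r, 8), mask);
    _mm_storeu_si128((__m128i*)out, _mm_xor_si128(shifted, r));
    /* low eight bytes print u XOR v, high eight bytes print v unchanged */
    for (int i = 0; i < 16; i++)
        printf("%u ", out[i]);
    printf("\n");
    return 0;
}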
+@@ -201,154 +265,351 @@ volk_8u_x2_encodeframepolar_8u_u_ssse3(unsigned char* frame, unsigned char* temp
+ #ifdef LV_HAVE_AVX2
+ #include <immintrin.h>
+
+-static inline void
+-volk_8u_x2_encodeframepolar_8u_u_avx2(unsigned char* frame, unsigned char* temp,
+- unsigned int frame_size)
++static inline void volk_8u_x2_encodeframepolar_8u_u_avx2(unsigned char* frame,
++ unsigned char* temp,
++ unsigned int frame_size)
+ {
+- const unsigned int po2 = log2_of_power_of_2(frame_size);
+-
+- unsigned int stage = po2;
+- unsigned char* frame_ptr = frame;
+- unsigned char* temp_ptr = temp;
+-
+- unsigned int frame_half = frame_size >> 1;
+- unsigned int num_branches = 1;
+- unsigned int branch;
+- unsigned int bit;
+-
+- // prepare constants
+- const __m256i mask_stage1 = _mm256_set_epi8(0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF,
+- 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF);
+-
+- const __m128i mask_stage0 = _mm_set_epi8(0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF);
+- // get some SIMD registers to play with.
+- __m256i r_frame0, r_temp0, shifted;
+- __m128i r_temp2, r_frame2, shifted2;
+- {
+- __m256i r_frame1, r_temp1;
+- __m128i r_frame3, r_temp3;
+- const __m256i shuffle_separate = _mm256_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15,
+- 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
+- const __m128i shuffle_separate128 = _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
+-
+- while(stage > 4){
+- frame_ptr = frame;
+- temp_ptr = temp;
+-
+- // for stage = 5 a branch has 32 elements. So upper stages are even bigger.
+- for(branch = 0; branch < num_branches; ++branch){
+- for(bit = 0; bit < frame_half; bit += 32){
+- if ((frame_half-bit)<32) //if only 16 bits remaining in frame, not 32
+- {
+- r_temp2 = _mm_loadu_si128((__m128i *) temp_ptr);
+- temp_ptr += 16;
+- r_temp3 = _mm_loadu_si128((__m128i *) temp_ptr);
+- temp_ptr += 16;
+-
+- shifted2 = _mm_srli_si128(r_temp2, 1);
+- shifted2 = _mm_and_si128(shifted2, mask_stage0);
+- r_temp2 = _mm_xor_si128(shifted2, r_temp2);
+- r_temp2 = _mm_shuffle_epi8(r_temp2, shuffle_separate128);
+-
+- shifted2 = _mm_srli_si128(r_temp3, 1);
+- shifted2 = _mm_and_si128(shifted2, mask_stage0);
+- r_temp3 = _mm_xor_si128(shifted2, r_temp3);
+- r_temp3 = _mm_shuffle_epi8(r_temp3, shuffle_separate128);
+-
+- r_frame2 = _mm_unpacklo_epi64(r_temp2, r_temp3);
+- _mm_storeu_si128((__m128i*) frame_ptr, r_frame2);
+-
+- r_frame3 = _mm_unpackhi_epi64(r_temp2, r_temp3);
+- _mm_storeu_si128((__m128i*) (frame_ptr + frame_half), r_frame3);
+- frame_ptr += 16;
+- break;
+- }
+- r_temp0 = _mm256_loadu_si256((__m256i *) temp_ptr);
+- temp_ptr += 32;
+- r_temp1 = _mm256_loadu_si256((__m256i *) temp_ptr);
+- temp_ptr += 32;
+-
+- shifted = _mm256_srli_si256(r_temp0, 1);//operate on 128 bit lanes
+- shifted = _mm256_and_si256(shifted, mask_stage1);
+- r_temp0 = _mm256_xor_si256(shifted, r_temp0);
+- r_temp0 = _mm256_shuffle_epi8(r_temp0, shuffle_separate);
+-
+- shifted = _mm256_srli_si256(r_temp1, 1);
+- shifted = _mm256_and_si256(shifted, mask_stage1);
+- r_temp1 = _mm256_xor_si256(shifted, r_temp1);
+- r_temp1 = _mm256_shuffle_epi8(r_temp1, shuffle_separate);
+-
+- r_frame0 = _mm256_unpacklo_epi64(r_temp0, r_temp1);
+- r_temp1 = _mm256_unpackhi_epi64(r_temp0, r_temp1);
+- r_frame0 = _mm256_permute4x64_epi64(r_frame0, 0xd8);
+- r_frame1 = _mm256_permute4x64_epi64(r_temp1, 0xd8);
+-
+- _mm256_storeu_si256((__m256i*) frame_ptr, r_frame0);
+-
+- _mm256_storeu_si256((__m256i*) (frame_ptr + frame_half), r_frame1);
+- frame_ptr += 32;
++ const unsigned int po2 = log2_of_power_of_2(frame_size);
++
++ unsigned int stage = po2;
++ unsigned char* frame_ptr = frame;
++ unsigned char* temp_ptr = temp;
++
++ unsigned int frame_half = frame_size >> 1;
++ unsigned int num_branches = 1;
++ unsigned int branch;
++ unsigned int bit;
++
++ // prepare constants
++ const __m256i mask_stage1 = _mm256_set_epi8(0x0,
++ 0xFF,
++ 0x0,
++ 0xFF,
++ 0x0,
++ 0xFF,
++ 0x0,
++ 0xFF,
++ 0x0,
++ 0xFF,
++ 0x0,
++ 0xFF,
++ 0x0,
++ 0xFF,
++ 0x0,
++ 0xFF,
++ 0x0,
++ 0xFF,
++ 0x0,
++ 0xFF,
++ 0x0,
++ 0xFF,
++ 0x0,
++ 0xFF,
++ 0x0,
++ 0xFF,
++ 0x0,
++ 0xFF,
++ 0x0,
++ 0xFF,
++ 0x0,
++ 0xFF);
++
++ const __m128i mask_stage0 = _mm_set_epi8(0x0,
++ 0xFF,
++ 0x0,
++ 0xFF,
++ 0x0,
++ 0xFF,
++ 0x0,
++ 0xFF,
++ 0x0,
++ 0xFF,
++ 0x0,
++ 0xFF,
++ 0x0,
++ 0xFF,
++ 0x0,
++ 0xFF);
++ // get some SIMD registers to play with.
++ __m256i r_frame0, r_temp0, shifted;
++ __m128i r_temp2, r_frame2, shifted2;
++ {
++ __m256i r_frame1, r_temp1;
++ __m128i r_frame3, r_temp3;
++ const __m256i shuffle_separate = _mm256_setr_epi8(0,
++ 2,
++ 4,
++ 6,
++ 8,
++ 10,
++ 12,
++ 14,
++ 1,
++ 3,
++ 5,
++ 7,
++ 9,
++ 11,
++ 13,
++ 15,
++ 0,
++ 2,
++ 4,
++ 6,
++ 8,
++ 10,
++ 12,
++ 14,
++ 1,
++ 3,
++ 5,
++ 7,
++ 9,
++ 11,
++ 13,
++ 15);
++ const __m128i shuffle_separate128 =
++ _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
++
++ while (stage > 4) {
++ frame_ptr = frame;
++ temp_ptr = temp;
++
++ // for stage = 5 a branch has 32 elements. So upper stages are even bigger.
++ for (branch = 0; branch < num_branches; ++branch) {
++ for (bit = 0; bit < frame_half; bit += 32) {
++ if ((frame_half - bit) <
++ 32) // if only 16 bits remaining in frame, not 32
++ {
++ r_temp2 = _mm_loadu_si128((__m128i*)temp_ptr);
++ temp_ptr += 16;
++ r_temp3 = _mm_loadu_si128((__m128i*)temp_ptr);
++ temp_ptr += 16;
++
++ shifted2 = _mm_srli_si128(r_temp2, 1);
++ shifted2 = _mm_and_si128(shifted2, mask_stage0);
++ r_temp2 = _mm_xor_si128(shifted2, r_temp2);
++ r_temp2 = _mm_shuffle_epi8(r_temp2, shuffle_separate128);
++
++ shifted2 = _mm_srli_si128(r_temp3, 1);
++ shifted2 = _mm_and_si128(shifted2, mask_stage0);
++ r_temp3 = _mm_xor_si128(shifted2, r_temp3);
++ r_temp3 = _mm_shuffle_epi8(r_temp3, shuffle_separate128);
++
++ r_frame2 = _mm_unpacklo_epi64(r_temp2, r_temp3);
++ _mm_storeu_si128((__m128i*)frame_ptr, r_frame2);
++
++ r_frame3 = _mm_unpackhi_epi64(r_temp2, r_temp3);
++ _mm_storeu_si128((__m128i*)(frame_ptr + frame_half), r_frame3);
++ frame_ptr += 16;
++ break;
++ }
++ r_temp0 = _mm256_loadu_si256((__m256i*)temp_ptr);
++ temp_ptr += 32;
++ r_temp1 = _mm256_loadu_si256((__m256i*)temp_ptr);
++ temp_ptr += 32;
++
++ shifted = _mm256_srli_si256(r_temp0, 1); // operate on 128 bit lanes
++ shifted = _mm256_and_si256(shifted, mask_stage1);
++ r_temp0 = _mm256_xor_si256(shifted, r_temp0);
++ r_temp0 = _mm256_shuffle_epi8(r_temp0, shuffle_separate);
++
++ shifted = _mm256_srli_si256(r_temp1, 1);
++ shifted = _mm256_and_si256(shifted, mask_stage1);
++ r_temp1 = _mm256_xor_si256(shifted, r_temp1);
++ r_temp1 = _mm256_shuffle_epi8(r_temp1, shuffle_separate);
++
++ r_frame0 = _mm256_unpacklo_epi64(r_temp0, r_temp1);
++ r_temp1 = _mm256_unpackhi_epi64(r_temp0, r_temp1);
++ r_frame0 = _mm256_permute4x64_epi64(r_frame0, 0xd8);
++ r_frame1 = _mm256_permute4x64_epi64(r_temp1, 0xd8);
++
++ _mm256_storeu_si256((__m256i*)frame_ptr, r_frame0);
++
++ _mm256_storeu_si256((__m256i*)(frame_ptr + frame_half), r_frame1);
++ frame_ptr += 32;
++ }
++
++ frame_ptr += frame_half;
++ }
++ memcpy(temp, frame, sizeof(unsigned char) * frame_size);
++
++ num_branches = num_branches << 1;
++ frame_half = frame_half >> 1;
++ stage--;
+ }
+-
+- frame_ptr += frame_half;
+- }
+- memcpy(temp, frame, sizeof(unsigned char) * frame_size);
+-
+- num_branches = num_branches << 1;
+- frame_half = frame_half >> 1;
+- stage--;
+ }
+- }
+-
+- // This last part requires at least 32-bit frames.
+- // Smaller frames are useless for SIMD optimization anyways. Just choose GENERIC!
+-
+- // reset pointers to correct positions.
+- frame_ptr = frame;
+- temp_ptr = temp;
+
+- // prefetch first chunk
+- __VOLK_PREFETCH(temp_ptr);
++ // This last part requires at least 32-bit frames.
++ // Smaller frames are useless for SIMD optimization anyways. Just choose GENERIC!
+
+- const __m256i shuffle_stage4 = _mm256_setr_epi8(0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15,
+- 0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15);
+- const __m256i mask_stage4 = _mm256_set_epi8(0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+- 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
+- const __m256i mask_stage3 = _mm256_set_epi8(0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF,
+- 0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF);
+- const __m256i mask_stage2 = _mm256_set_epi8(0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF,
+- 0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF);
++ // reset pointers to correct positions.
++ frame_ptr = frame;
++ temp_ptr = temp;
+
+- for(branch = 0; branch < num_branches/2; ++branch){
+- r_temp0 = _mm256_loadu_si256((__m256i*) temp_ptr);
+-
+- // prefetch next chunk
+- temp_ptr += 32;
++ // prefetch first chunk
+ __VOLK_PREFETCH(temp_ptr);
+
+- // shuffle once for bit-reversal.
+- r_temp0 = _mm256_shuffle_epi8(r_temp0, shuffle_stage4);
+-
+- shifted = _mm256_srli_si256(r_temp0, 8); //128 bit lanes
+- shifted = _mm256_and_si256(shifted, mask_stage4);
+- r_frame0 = _mm256_xor_si256(shifted, r_temp0);
+-
+-
+- shifted = _mm256_srli_si256(r_frame0, 4);
+- shifted = _mm256_and_si256(shifted, mask_stage3);
+- r_frame0 = _mm256_xor_si256(shifted, r_frame0);
+-
+- shifted = _mm256_srli_si256(r_frame0, 2);
+- shifted = _mm256_and_si256(shifted, mask_stage2);
+- r_frame0 = _mm256_xor_si256(shifted, r_frame0);
+-
+- shifted = _mm256_srli_si256(r_frame0, 1);
+- shifted = _mm256_and_si256(shifted, mask_stage1);
+- r_frame0 = _mm256_xor_si256(shifted, r_frame0);
+-
+- // store result of chunk.
+- _mm256_storeu_si256((__m256i*)frame_ptr, r_frame0);
+- frame_ptr += 32;
+- }
++ const __m256i shuffle_stage4 = _mm256_setr_epi8(0,
++ 8,
++ 4,
++ 12,
++ 2,
++ 10,
++ 6,
++ 14,
++ 1,
++ 9,
++ 5,
++ 13,
++ 3,
++ 11,
++ 7,
++ 15,
++ 0,
++ 8,
++ 4,
++ 12,
++ 2,
++ 10,
++ 6,
++ 14,
++ 1,
++ 9,
++ 5,
++ 13,
++ 3,
++ 11,
++ 7,
++ 15);
++ const __m256i mask_stage4 = _mm256_set_epi8(0x0,
++ 0x0,
++ 0x0,
++ 0x0,
++ 0x0,
++ 0x0,
++ 0x0,
++ 0x0,
++ 0xFF,
++ 0xFF,
++ 0xFF,
++ 0xFF,
++ 0xFF,
++ 0xFF,
++ 0xFF,
++ 0xFF,
++ 0x0,
++ 0x0,
++ 0x0,
++ 0x0,
++ 0x0,
++ 0x0,
++ 0x0,
++ 0x0,
++ 0xFF,
++ 0xFF,
++ 0xFF,
++ 0xFF,
++ 0xFF,
++ 0xFF,
++ 0xFF,
++ 0xFF);
++ const __m256i mask_stage3 = _mm256_set_epi8(0x0,
++ 0x0,
++ 0x0,
++ 0x0,
++ 0xFF,
++ 0xFF,
++ 0xFF,
++ 0xFF,
++ 0x0,
++ 0x0,
++ 0x0,
++ 0x0,
++ 0xFF,
++ 0xFF,
++ 0xFF,
++ 0xFF,
++ 0x0,
++ 0x0,
++ 0x0,
++ 0x0,
++ 0xFF,
++ 0xFF,
++ 0xFF,
++ 0xFF,
++ 0x0,
++ 0x0,
++ 0x0,
++ 0x0,
++ 0xFF,
++ 0xFF,
++ 0xFF,
++ 0xFF);
++ const __m256i mask_stage2 = _mm256_set_epi8(0x0,
++ 0x0,
++ 0xFF,
++ 0xFF,
++ 0x0,
++ 0x0,
++ 0xFF,
++ 0xFF,
++ 0x0,
++ 0x0,
++ 0xFF,
++ 0xFF,
++ 0x0,
++ 0x0,
++ 0xFF,
++ 0xFF,
++ 0x0,
++ 0x0,
++ 0xFF,
++ 0xFF,
++ 0x0,
++ 0x0,
++ 0xFF,
++ 0xFF,
++ 0x0,
++ 0x0,
++ 0xFF,
++ 0xFF,
++ 0x0,
++ 0x0,
++ 0xFF,
++ 0xFF);
++
++ for (branch = 0; branch < num_branches / 2; ++branch) {
++ r_temp0 = _mm256_loadu_si256((__m256i*)temp_ptr);
++
++ // prefetch next chunk
++ temp_ptr += 32;
++ __VOLK_PREFETCH(temp_ptr);
++
++ // shuffle once for bit-reversal.
++ r_temp0 = _mm256_shuffle_epi8(r_temp0, shuffle_stage4);
++
++ shifted = _mm256_srli_si256(r_temp0, 8); // 128 bit lanes
++ shifted = _mm256_and_si256(shifted, mask_stage4);
++ r_frame0 = _mm256_xor_si256(shifted, r_temp0);
++
++
++ shifted = _mm256_srli_si256(r_frame0, 4);
++ shifted = _mm256_and_si256(shifted, mask_stage3);
++ r_frame0 = _mm256_xor_si256(shifted, r_frame0);
++
++ shifted = _mm256_srli_si256(r_frame0, 2);
++ shifted = _mm256_and_si256(shifted, mask_stage2);
++ r_frame0 = _mm256_xor_si256(shifted, r_frame0);
++
++ shifted = _mm256_srli_si256(r_frame0, 1);
++ shifted = _mm256_and_si256(shifted, mask_stage1);
++ r_frame0 = _mm256_xor_si256(shifted, r_frame0);
++
++ // store result of chunk.
++ _mm256_storeu_si256((__m256i*)frame_ptr, r_frame0);
++ frame_ptr += 32;
++ }
+ }
+ #endif /* LV_HAVE_AVX2 */
+
+@@ -360,272 +621,530 @@ volk_8u_x2_encodeframepolar_8u_u_avx2(unsigned char* frame, unsigned char* temp,
+ #ifdef LV_HAVE_SSSE3
+ #include <tmmintrin.h>
+
+-static inline void
+-volk_8u_x2_encodeframepolar_8u_a_ssse3(unsigned char* frame, unsigned char* temp,
+- unsigned int frame_size)
++static inline void volk_8u_x2_encodeframepolar_8u_a_ssse3(unsigned char* frame,
++ unsigned char* temp,
++ unsigned int frame_size)
+ {
+- const unsigned int po2 = log2_of_power_of_2(frame_size);
+-
+- unsigned int stage = po2;
+- unsigned char* frame_ptr = frame;
+- unsigned char* temp_ptr = temp;
+-
+- unsigned int frame_half = frame_size >> 1;
+- unsigned int num_branches = 1;
+- unsigned int branch;
+- unsigned int bit;
+-
+- // prepare constants
+- const __m128i mask_stage1 = _mm_set_epi8(0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF);
+-
+- // get some SIMD registers to play with.
+- __m128i r_frame0, r_temp0, shifted;
+-
+- {
+- __m128i r_frame1, r_temp1;
+- const __m128i shuffle_separate = _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
+-
+- while(stage > 4){
+- frame_ptr = frame;
+- temp_ptr = temp;
+-
+- // for stage = 5 a branch has 32 elements. So upper stages are even bigger.
+- for(branch = 0; branch < num_branches; ++branch){
+- for(bit = 0; bit < frame_half; bit += 16){
+- r_temp0 = _mm_load_si128((__m128i *) temp_ptr);
+- temp_ptr += 16;
+- r_temp1 = _mm_load_si128((__m128i *) temp_ptr);
+- temp_ptr += 16;
+-
+- shifted = _mm_srli_si128(r_temp0, 1);
+- shifted = _mm_and_si128(shifted, mask_stage1);
+- r_temp0 = _mm_xor_si128(shifted, r_temp0);
+- r_temp0 = _mm_shuffle_epi8(r_temp0, shuffle_separate);
+-
+- shifted = _mm_srli_si128(r_temp1, 1);
+- shifted = _mm_and_si128(shifted, mask_stage1);
+- r_temp1 = _mm_xor_si128(shifted, r_temp1);
+- r_temp1 = _mm_shuffle_epi8(r_temp1, shuffle_separate);
+-
+- r_frame0 = _mm_unpacklo_epi64(r_temp0, r_temp1);
+- _mm_store_si128((__m128i*) frame_ptr, r_frame0);
+-
+- r_frame1 = _mm_unpackhi_epi64(r_temp0, r_temp1);
+- _mm_store_si128((__m128i*) (frame_ptr + frame_half), r_frame1);
+- frame_ptr += 16;
++ const unsigned int po2 = log2_of_power_of_2(frame_size);
++
++ unsigned int stage = po2;
++ unsigned char* frame_ptr = frame;
++ unsigned char* temp_ptr = temp;
++
++ unsigned int frame_half = frame_size >> 1;
++ unsigned int num_branches = 1;
++ unsigned int branch;
++ unsigned int bit;
++
++ // prepare constants
++ const __m128i mask_stage1 = _mm_set_epi8(0x0,
++ 0xFF,
++ 0x0,
++ 0xFF,
++ 0x0,
++ 0xFF,
++ 0x0,
++ 0xFF,
++ 0x0,
++ 0xFF,
++ 0x0,
++ 0xFF,
++ 0x0,
++ 0xFF,
++ 0x0,
++ 0xFF);
++
++ // get some SIMD registers to play with.
++ __m128i r_frame0, r_temp0, shifted;
++
++ {
++ __m128i r_frame1, r_temp1;
++ const __m128i shuffle_separate =
++ _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
++
++ while (stage > 4) {
++ frame_ptr = frame;
++ temp_ptr = temp;
++
++ // for stage = 5 a branch has 32 elements. So upper stages are even bigger.
++ for (branch = 0; branch < num_branches; ++branch) {
++ for (bit = 0; bit < frame_half; bit += 16) {
++ r_temp0 = _mm_load_si128((__m128i*)temp_ptr);
++ temp_ptr += 16;
++ r_temp1 = _mm_load_si128((__m128i*)temp_ptr);
++ temp_ptr += 16;
++
++ shifted = _mm_srli_si128(r_temp0, 1);
++ shifted = _mm_and_si128(shifted, mask_stage1);
++ r_temp0 = _mm_xor_si128(shifted, r_temp0);
++ r_temp0 = _mm_shuffle_epi8(r_temp0, shuffle_separate);
++
++ shifted = _mm_srli_si128(r_temp1, 1);
++ shifted = _mm_and_si128(shifted, mask_stage1);
++ r_temp1 = _mm_xor_si128(shifted, r_temp1);
++ r_temp1 = _mm_shuffle_epi8(r_temp1, shuffle_separate);
++
++ r_frame0 = _mm_unpacklo_epi64(r_temp0, r_temp1);
++ _mm_store_si128((__m128i*)frame_ptr, r_frame0);
++
++ r_frame1 = _mm_unpackhi_epi64(r_temp0, r_temp1);
++ _mm_store_si128((__m128i*)(frame_ptr + frame_half), r_frame1);
++ frame_ptr += 16;
++ }
++
++ frame_ptr += frame_half;
++ }
++ memcpy(temp, frame, sizeof(unsigned char) * frame_size);
++
++ num_branches = num_branches << 1;
++ frame_half = frame_half >> 1;
++ stage--;
+ }
+-
+- frame_ptr += frame_half;
+- }
+- memcpy(temp, frame, sizeof(unsigned char) * frame_size);
+-
+- num_branches = num_branches << 1;
+- frame_half = frame_half >> 1;
+- stage--;
+ }
+- }
+-
+- // This last part requires at least 16-bit frames.
+- // Smaller frames are useless for SIMD optimization anyways. Just choose GENERIC!
+-
+- // reset pointers to correct positions.
+- frame_ptr = frame;
+- temp_ptr = temp;
+
+- // prefetch first chunk
+- __VOLK_PREFETCH(temp_ptr);
++ // This last part requires at least 16-bit frames.
++ // Smaller frames are useless for SIMD optimization anyways. Just choose GENERIC!
+
+- const __m128i shuffle_stage4 = _mm_setr_epi8(0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15);
+- const __m128i mask_stage4 = _mm_set_epi8(0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
+- const __m128i mask_stage3 = _mm_set_epi8(0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF);
+- const __m128i mask_stage2 = _mm_set_epi8(0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF);
++ // reset pointers to correct positions.
++ frame_ptr = frame;
++ temp_ptr = temp;
+
+- for(branch = 0; branch < num_branches; ++branch){
+- r_temp0 = _mm_load_si128((__m128i*) temp_ptr);
+-
+- // prefetch next chunk
+- temp_ptr += 16;
++ // prefetch first chunk
+ __VOLK_PREFETCH(temp_ptr);
+
+- // shuffle once for bit-reversal.
+- r_temp0 = _mm_shuffle_epi8(r_temp0, shuffle_stage4);
+-
+- shifted = _mm_srli_si128(r_temp0, 8);
+- shifted = _mm_and_si128(shifted, mask_stage4);
+- r_frame0 = _mm_xor_si128(shifted, r_temp0);
+-
+- shifted = _mm_srli_si128(r_frame0, 4);
+- shifted = _mm_and_si128(shifted, mask_stage3);
+- r_frame0 = _mm_xor_si128(shifted, r_frame0);
+-
+- shifted = _mm_srli_si128(r_frame0, 2);
+- shifted = _mm_and_si128(shifted, mask_stage2);
+- r_frame0 = _mm_xor_si128(shifted, r_frame0);
+-
+- shifted = _mm_srli_si128(r_frame0, 1);
+- shifted = _mm_and_si128(shifted, mask_stage1);
+- r_frame0 = _mm_xor_si128(shifted, r_frame0);
+-
+- // store result of chunk.
+- _mm_store_si128((__m128i*)frame_ptr, r_frame0);
+- frame_ptr += 16;
+- }
++ const __m128i shuffle_stage4 =
++ _mm_setr_epi8(0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15);
++ const __m128i mask_stage4 = _mm_set_epi8(0x0,
++ 0x0,
++ 0x0,
++ 0x0,
++ 0x0,
++ 0x0,
++ 0x0,
++ 0x0,
++ 0xFF,
++ 0xFF,
++ 0xFF,
++ 0xFF,
++ 0xFF,
++ 0xFF,
++ 0xFF,
++ 0xFF);
++ const __m128i mask_stage3 = _mm_set_epi8(0x0,
++ 0x0,
++ 0x0,
++ 0x0,
++ 0xFF,
++ 0xFF,
++ 0xFF,
++ 0xFF,
++ 0x0,
++ 0x0,
++ 0x0,
++ 0x0,
++ 0xFF,
++ 0xFF,
++ 0xFF,
++ 0xFF);
++ const __m128i mask_stage2 = _mm_set_epi8(0x0,
++ 0x0,
++ 0xFF,
++ 0xFF,
++ 0x0,
++ 0x0,
++ 0xFF,
++ 0xFF,
++ 0x0,
++ 0x0,
++ 0xFF,
++ 0xFF,
++ 0x0,
++ 0x0,
++ 0xFF,
++ 0xFF);
++
++ for (branch = 0; branch < num_branches; ++branch) {
++ r_temp0 = _mm_load_si128((__m128i*)temp_ptr);
++
++ // prefetch next chunk
++ temp_ptr += 16;
++ __VOLK_PREFETCH(temp_ptr);
++
++ // shuffle once for bit-reversal.
++ r_temp0 = _mm_shuffle_epi8(r_temp0, shuffle_stage4);
++
++ shifted = _mm_srli_si128(r_temp0, 8);
++ shifted = _mm_and_si128(shifted, mask_stage4);
++ r_frame0 = _mm_xor_si128(shifted, r_temp0);
++
++ shifted = _mm_srli_si128(r_frame0, 4);
++ shifted = _mm_and_si128(shifted, mask_stage3);
++ r_frame0 = _mm_xor_si128(shifted, r_frame0);
++
++ shifted = _mm_srli_si128(r_frame0, 2);
++ shifted = _mm_and_si128(shifted, mask_stage2);
++ r_frame0 = _mm_xor_si128(shifted, r_frame0);
++
++ shifted = _mm_srli_si128(r_frame0, 1);
++ shifted = _mm_and_si128(shifted, mask_stage1);
++ r_frame0 = _mm_xor_si128(shifted, r_frame0);
++
++ // store result of chunk.
++ _mm_store_si128((__m128i*)frame_ptr, r_frame0);
++ frame_ptr += 16;
++ }
+ }
+ #endif /* LV_HAVE_SSSE3 */
+
+ #ifdef LV_HAVE_AVX2
+ #include <immintrin.h>
+
+-static inline void
+-volk_8u_x2_encodeframepolar_8u_a_avx2(unsigned char* frame, unsigned char* temp,
+- unsigned int frame_size)
++static inline void volk_8u_x2_encodeframepolar_8u_a_avx2(unsigned char* frame,
++ unsigned char* temp,
++ unsigned int frame_size)
+ {
+- const unsigned int po2 = log2_of_power_of_2(frame_size);
+-
+- unsigned int stage = po2;
+- unsigned char* frame_ptr = frame;
+- unsigned char* temp_ptr = temp;
+-
+- unsigned int frame_half = frame_size >> 1;
+- unsigned int num_branches = 1;
+- unsigned int branch;
+- unsigned int bit;
+-
+- // prepare constants
+- const __m256i mask_stage1 = _mm256_set_epi8(0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF,
+- 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF);
+-
+- const __m128i mask_stage0 = _mm_set_epi8(0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF);
+- // get some SIMD registers to play with.
+- __m256i r_frame0, r_temp0, shifted;
+- __m128i r_temp2, r_frame2, shifted2;
+- {
+- __m256i r_frame1, r_temp1;
+- __m128i r_frame3, r_temp3;
+- const __m256i shuffle_separate = _mm256_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15,
+- 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
+- const __m128i shuffle_separate128 = _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
+-
+- while(stage > 4){
+- frame_ptr = frame;
+- temp_ptr = temp;
+-
+- // for stage = 5 a branch has 32 elements. So upper stages are even bigger.
+- for(branch = 0; branch < num_branches; ++branch){
+- for(bit = 0; bit < frame_half; bit += 32){
+- if ((frame_half-bit)<32) //if only 16 bits remaining in frame, not 32
+- {
+- r_temp2 = _mm_load_si128((__m128i *) temp_ptr);
+- temp_ptr += 16;
+- r_temp3 = _mm_load_si128((__m128i *) temp_ptr);
+- temp_ptr += 16;
+-
+- shifted2 = _mm_srli_si128(r_temp2, 1);
+- shifted2 = _mm_and_si128(shifted2, mask_stage0);
+- r_temp2 = _mm_xor_si128(shifted2, r_temp2);
+- r_temp2 = _mm_shuffle_epi8(r_temp2, shuffle_separate128);
+-
+- shifted2 = _mm_srli_si128(r_temp3, 1);
+- shifted2 = _mm_and_si128(shifted2, mask_stage0);
+- r_temp3 = _mm_xor_si128(shifted2, r_temp3);
+- r_temp3 = _mm_shuffle_epi8(r_temp3, shuffle_separate128);
+-
+- r_frame2 = _mm_unpacklo_epi64(r_temp2, r_temp3);
+- _mm_store_si128((__m128i*) frame_ptr, r_frame2);
+-
+- r_frame3 = _mm_unpackhi_epi64(r_temp2, r_temp3);
+- _mm_store_si128((__m128i*) (frame_ptr + frame_half), r_frame3);
+- frame_ptr += 16;
+- break;
+- }
+- r_temp0 = _mm256_load_si256((__m256i *) temp_ptr);
+- temp_ptr += 32;
+- r_temp1 = _mm256_load_si256((__m256i *) temp_ptr);
+- temp_ptr += 32;
+-
+- shifted = _mm256_srli_si256(r_temp0, 1);//operate on 128 bit lanes
+- shifted = _mm256_and_si256(shifted, mask_stage1);
+- r_temp0 = _mm256_xor_si256(shifted, r_temp0);
+- r_temp0 = _mm256_shuffle_epi8(r_temp0, shuffle_separate);
+-
+- shifted = _mm256_srli_si256(r_temp1, 1);
+- shifted = _mm256_and_si256(shifted, mask_stage1);
+- r_temp1 = _mm256_xor_si256(shifted, r_temp1);
+- r_temp1 = _mm256_shuffle_epi8(r_temp1, shuffle_separate);
+-
+- r_frame0 = _mm256_unpacklo_epi64(r_temp0, r_temp1);
+- r_temp1 = _mm256_unpackhi_epi64(r_temp0, r_temp1);
+- r_frame0 = _mm256_permute4x64_epi64(r_frame0, 0xd8);
+- r_frame1 = _mm256_permute4x64_epi64(r_temp1, 0xd8);
+-
+- _mm256_store_si256((__m256i*) frame_ptr, r_frame0);
+-
+- _mm256_store_si256((__m256i*) (frame_ptr + frame_half), r_frame1);
+- frame_ptr += 32;
++ const unsigned int po2 = log2_of_power_of_2(frame_size);
++
++ unsigned int stage = po2;
++ unsigned char* frame_ptr = frame;
++ unsigned char* temp_ptr = temp;
++
++ unsigned int frame_half = frame_size >> 1;
++ unsigned int num_branches = 1;
++ unsigned int branch;
++ unsigned int bit;
++
++ // prepare constants
++ const __m256i mask_stage1 = _mm256_set_epi8(0x0,
++ 0xFF,
++ 0x0,
++ 0xFF,
++ 0x0,
++ 0xFF,
++ 0x0,
++ 0xFF,
++ 0x0,
++ 0xFF,
++ 0x0,
++ 0xFF,
++ 0x0,
++ 0xFF,
++ 0x0,
++ 0xFF,
++ 0x0,
++ 0xFF,
++ 0x0,
++ 0xFF,
++ 0x0,
++ 0xFF,
++ 0x0,
++ 0xFF,
++ 0x0,
++ 0xFF,
++ 0x0,
++ 0xFF,
++ 0x0,
++ 0xFF,
++ 0x0,
++ 0xFF);
++
++ const __m128i mask_stage0 = _mm_set_epi8(0x0,
++ 0xFF,
++ 0x0,
++ 0xFF,
++ 0x0,
++ 0xFF,
++ 0x0,
++ 0xFF,
++ 0x0,
++ 0xFF,
++ 0x0,
++ 0xFF,
++ 0x0,
++ 0xFF,
++ 0x0,
++ 0xFF);
++ // get some SIMD registers to play with.
++ __m256i r_frame0, r_temp0, shifted;
++ __m128i r_temp2, r_frame2, shifted2;
++ {
++ __m256i r_frame1, r_temp1;
++ __m128i r_frame3, r_temp3;
++ const __m256i shuffle_separate = _mm256_setr_epi8(0,
++ 2,
++ 4,
++ 6,
++ 8,
++ 10,
++ 12,
++ 14,
++ 1,
++ 3,
++ 5,
++ 7,
++ 9,
++ 11,
++ 13,
++ 15,
++ 0,
++ 2,
++ 4,
++ 6,
++ 8,
++ 10,
++ 12,
++ 14,
++ 1,
++ 3,
++ 5,
++ 7,
++ 9,
++ 11,
++ 13,
++ 15);
++ const __m128i shuffle_separate128 =
++ _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
++
++ while (stage > 4) {
++ frame_ptr = frame;
++ temp_ptr = temp;
++
++ // for stage = 5 a branch has 32 elements. So upper stages are even bigger.
++ for (branch = 0; branch < num_branches; ++branch) {
++ for (bit = 0; bit < frame_half; bit += 32) {
++ if ((frame_half - bit) <
++ 32) // if only 16 bits remaining in frame, not 32
++ {
++ r_temp2 = _mm_load_si128((__m128i*)temp_ptr);
++ temp_ptr += 16;
++ r_temp3 = _mm_load_si128((__m128i*)temp_ptr);
++ temp_ptr += 16;
++
++ shifted2 = _mm_srli_si128(r_temp2, 1);
++ shifted2 = _mm_and_si128(shifted2, mask_stage0);
++ r_temp2 = _mm_xor_si128(shifted2, r_temp2);
++ r_temp2 = _mm_shuffle_epi8(r_temp2, shuffle_separate128);
++
++ shifted2 = _mm_srli_si128(r_temp3, 1);
++ shifted2 = _mm_and_si128(shifted2, mask_stage0);
++ r_temp3 = _mm_xor_si128(shifted2, r_temp3);
++ r_temp3 = _mm_shuffle_epi8(r_temp3, shuffle_separate128);
++
++ r_frame2 = _mm_unpacklo_epi64(r_temp2, r_temp3);
++ _mm_store_si128((__m128i*)frame_ptr, r_frame2);
++
++ r_frame3 = _mm_unpackhi_epi64(r_temp2, r_temp3);
++ _mm_store_si128((__m128i*)(frame_ptr + frame_half), r_frame3);
++ frame_ptr += 16;
++ break;
++ }
++ r_temp0 = _mm256_load_si256((__m256i*)temp_ptr);
++ temp_ptr += 32;
++ r_temp1 = _mm256_load_si256((__m256i*)temp_ptr);
++ temp_ptr += 32;
++
++ shifted = _mm256_srli_si256(r_temp0, 1); // operate on 128 bit lanes
++ shifted = _mm256_and_si256(shifted, mask_stage1);
++ r_temp0 = _mm256_xor_si256(shifted, r_temp0);
++ r_temp0 = _mm256_shuffle_epi8(r_temp0, shuffle_separate);
++
++ shifted = _mm256_srli_si256(r_temp1, 1);
++ shifted = _mm256_and_si256(shifted, mask_stage1);
++ r_temp1 = _mm256_xor_si256(shifted, r_temp1);
++ r_temp1 = _mm256_shuffle_epi8(r_temp1, shuffle_separate);
++
++ r_frame0 = _mm256_unpacklo_epi64(r_temp0, r_temp1);
++ r_temp1 = _mm256_unpackhi_epi64(r_temp0, r_temp1);
++ r_frame0 = _mm256_permute4x64_epi64(r_frame0, 0xd8);
++ r_frame1 = _mm256_permute4x64_epi64(r_temp1, 0xd8);
++
++ _mm256_store_si256((__m256i*)frame_ptr, r_frame0);
++
++ _mm256_store_si256((__m256i*)(frame_ptr + frame_half), r_frame1);
++ frame_ptr += 32;
++ }
++
++ frame_ptr += frame_half;
++ }
++ memcpy(temp, frame, sizeof(unsigned char) * frame_size);
++
++ num_branches = num_branches << 1;
++ frame_half = frame_half >> 1;
++ stage--;
+ }
+-
+- frame_ptr += frame_half;
+- }
+- memcpy(temp, frame, sizeof(unsigned char) * frame_size);
+-
+- num_branches = num_branches << 1;
+- frame_half = frame_half >> 1;
+- stage--;
+ }
+- }
+-
+- // This last part requires at least 32-bit frames.
+- // Smaller frames are useless for SIMD optimization anyways. Just choose GENERIC!
+
+- // reset pointers to correct positions.
+- frame_ptr = frame;
+- temp_ptr = temp;
++ // This last part requires at least 32-bit frames.
++ // Smaller frames are useless for SIMD optimization anyways. Just choose GENERIC!
+
+- // prefetch first chunk.
+- __VOLK_PREFETCH(temp_ptr);
++ // reset pointers to correct positions.
++ frame_ptr = frame;
++ temp_ptr = temp;
+
+- const __m256i shuffle_stage4 = _mm256_setr_epi8(0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15,
+- 0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15);
+- const __m256i mask_stage4 = _mm256_set_epi8(0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+- 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
+- const __m256i mask_stage3 = _mm256_set_epi8(0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF,
+- 0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF);
+- const __m256i mask_stage2 = _mm256_set_epi8(0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF,
+- 0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF);
+-
+- for(branch = 0; branch < num_branches/2; ++branch){
+- r_temp0 = _mm256_load_si256((__m256i*) temp_ptr);
+-
+- // prefetch next chunk
+- temp_ptr += 32;
++ // prefetch first chunk.
+ __VOLK_PREFETCH(temp_ptr);
+
+- // shuffle once for bit-reversal.
+- r_temp0 = _mm256_shuffle_epi8(r_temp0, shuffle_stage4);
+-
+- shifted = _mm256_srli_si256(r_temp0, 8); //128 bit lanes
+- shifted = _mm256_and_si256(shifted, mask_stage4);
+- r_frame0 = _mm256_xor_si256(shifted, r_temp0);
+-
+- shifted = _mm256_srli_si256(r_frame0, 4);
+- shifted = _mm256_and_si256(shifted, mask_stage3);
+- r_frame0 = _mm256_xor_si256(shifted, r_frame0);
+-
+- shifted = _mm256_srli_si256(r_frame0, 2);
+- shifted = _mm256_and_si256(shifted, mask_stage2);
+- r_frame0 = _mm256_xor_si256(shifted, r_frame0);
+-
+- shifted = _mm256_srli_si256(r_frame0, 1);
+- shifted = _mm256_and_si256(shifted, mask_stage1);
+- r_frame0 = _mm256_xor_si256(shifted, r_frame0);
+-
+- // store result of chunk.
+- _mm256_store_si256((__m256i*)frame_ptr, r_frame0);
+- frame_ptr += 32;
+- }
++ const __m256i shuffle_stage4 = _mm256_setr_epi8(0,
++ 8,
++ 4,
++ 12,
++ 2,
++ 10,
++ 6,
++ 14,
++ 1,
++ 9,
++ 5,
++ 13,
++ 3,
++ 11,
++ 7,
++ 15,
++ 0,
++ 8,
++ 4,
++ 12,
++ 2,
++ 10,
++ 6,
++ 14,
++ 1,
++ 9,
++ 5,
++ 13,
++ 3,
++ 11,
++ 7,
++ 15);
++ const __m256i mask_stage4 = _mm256_set_epi8(0x0,
++ 0x0,
++ 0x0,
++ 0x0,
++ 0x0,
++ 0x0,
++ 0x0,
++ 0x0,
++ 0xFF,
++ 0xFF,
++ 0xFF,
++ 0xFF,
++ 0xFF,
++ 0xFF,
++ 0xFF,
++ 0xFF,
++ 0x0,
++ 0x0,
++ 0x0,
++ 0x0,
++ 0x0,
++ 0x0,
++ 0x0,
++ 0x0,
++ 0xFF,
++ 0xFF,
++ 0xFF,
++ 0xFF,
++ 0xFF,
++ 0xFF,
++ 0xFF,
++ 0xFF);
++ const __m256i mask_stage3 = _mm256_set_epi8(0x0,
++ 0x0,
++ 0x0,
++ 0x0,
++ 0xFF,
++ 0xFF,
++ 0xFF,
++ 0xFF,
++ 0x0,
++ 0x0,
++ 0x0,
++ 0x0,
++ 0xFF,
++ 0xFF,
++ 0xFF,
++ 0xFF,
++ 0x0,
++ 0x0,
++ 0x0,
++ 0x0,
++ 0xFF,
++ 0xFF,
++ 0xFF,
++ 0xFF,
++ 0x0,
++ 0x0,
++ 0x0,
++ 0x0,
++ 0xFF,
++ 0xFF,
++ 0xFF,
++ 0xFF);
++ const __m256i mask_stage2 = _mm256_set_epi8(0x0,
++ 0x0,
++ 0xFF,
++ 0xFF,
++ 0x0,
++ 0x0,
++ 0xFF,
++ 0xFF,
++ 0x0,
++ 0x0,
++ 0xFF,
++ 0xFF,
++ 0x0,
++ 0x0,
++ 0xFF,
++ 0xFF,
++ 0x0,
++ 0x0,
++ 0xFF,
++ 0xFF,
++ 0x0,
++ 0x0,
++ 0xFF,
++ 0xFF,
++ 0x0,
++ 0x0,
++ 0xFF,
++ 0xFF,
++ 0x0,
++ 0x0,
++ 0xFF,
++ 0xFF);
++
++ for (branch = 0; branch < num_branches / 2; ++branch) {
++ r_temp0 = _mm256_load_si256((__m256i*)temp_ptr);
++
++ // prefetch next chunk
++ temp_ptr += 32;
++ __VOLK_PREFETCH(temp_ptr);
++
++ // shuffle once for bit-reversal.
++ r_temp0 = _mm256_shuffle_epi8(r_temp0, shuffle_stage4);
++
++ shifted = _mm256_srli_si256(r_temp0, 8); // 128 bit lanes
++ shifted = _mm256_and_si256(shifted, mask_stage4);
++ r_frame0 = _mm256_xor_si256(shifted, r_temp0);
++
++ shifted = _mm256_srli_si256(r_frame0, 4);
++ shifted = _mm256_and_si256(shifted, mask_stage3);
++ r_frame0 = _mm256_xor_si256(shifted, r_frame0);
++
++ shifted = _mm256_srli_si256(r_frame0, 2);
++ shifted = _mm256_and_si256(shifted, mask_stage2);
++ r_frame0 = _mm256_xor_si256(shifted, r_frame0);
++
++ shifted = _mm256_srli_si256(r_frame0, 1);
++ shifted = _mm256_and_si256(shifted, mask_stage1);
++ r_frame0 = _mm256_xor_si256(shifted, r_frame0);
++
++ // store result of chunk.
++ _mm256_store_si256((__m256i*)frame_ptr, r_frame0);
++ frame_ptr += 32;
++ }
+ }
+ #endif /* LV_HAVE_AVX2 */
+
+
+-
+ #endif /* VOLK_KERNELS_VOLK_VOLK_8U_X2_ENCODEFRAMEPOLAR_8U_A_H_ */
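For readers following the reformatted kernels above, a compact scalar model of the stage loop may help: for stages above 4, each branch pairs adjacent bytes, writes the XOR of every pair into the first half of the branch and the untouched odd bytes into the second half, then doubles the branch count and halves the branch size; the SSSE3/AVX2 kernels finish the remaining four stages per 16- or 32-byte chunk with the bit-reversal shuffle shown earlier. The sketch below only mirrors the data movement of the stage-greater-than-4 loop; it is an illustration, not code from this patch, and no bit-exact equivalence with the full kernels is claimed.

    /* Illustrative scalar butterfly pass: temp holds the input bits (one byte
     * per bit), frame receives the encoded result, temp is clobbered. */
    #include <string.h>

    static void encodeframepolar_scalar(unsigned char* frame,
                                        unsigned char* temp,
                                        unsigned int frame_size)
    {
        unsigned int branch, k;
        unsigned int num_branches = 1;
        unsigned int frame_half = frame_size >> 1;

        while (frame_half > 0) {
            const unsigned char* in = temp;
            unsigned char* out = frame;
            for (branch = 0; branch < num_branches; ++branch) {
                for (k = 0; k < frame_half; ++k) {
                    out[k] = in[2 * k] ^ in[2 * k + 1]; /* first half: pair XORs   */
                    out[k + frame_half] = in[2 * k + 1]; /* second half: odd bytes */
                }
                in += 2 * frame_half;
                out += 2 * frame_half;
            }
            memcpy(temp, frame, frame_size); /* ping-pong like the kernels */
            num_branches <<= 1;
            frame_half >>= 1;
        }
    }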
+diff --git a/kernels/volk/volk_8u_x3_encodepolar_8u_x2.h b/kernels/volk/volk_8u_x3_encodepolar_8u_x2.h
+index 5bccd95..413836e 100644
+--- a/kernels/volk/volk_8u_x3_encodepolar_8u_x2.h
++++ b/kernels/volk/volk_8u_x3_encodepolar_8u_x2.h
+@@ -29,9 +29,9 @@
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+- * void volk_8u_x3_encodepolar_8u(unsigned char* frame, const unsigned char* frozen_bit_mask, const unsigned char* frozen_bits,
+- * const unsigned char* info_bits, unsigned int frame_size, unsigned int info_bit_size)
+- * \endcode
++ * void volk_8u_x3_encodepolar_8u(unsigned char* frame, const unsigned char*
++ * frozen_bit_mask, const unsigned char* frozen_bits, const unsigned char* info_bits,
++ * unsigned int frame_size, unsigned int info_bit_size) \endcode
+ *
+ * \b Inputs
+ * \li frame: buffer for encoded frame
+@@ -55,14 +55,17 @@
+ * unsigned char* frozen_bit_mask = get_frozen_bit_mask(frame_size, num_frozen_bits);
+ *
+ * // set elements to desired values. Typically all zero.
+- * unsigned char* frozen_bits = (unsigned char) volk_malloc(sizeof(unsigned char) * num_frozen_bits, volk_get_alignment());
++ * unsigned char* frozen_bits = (unsigned char) volk_malloc(sizeof(unsigned char) *
++ * num_frozen_bits, volk_get_alignment());
+ *
+- * unsigned char* frame = (unsigned char) volk_malloc(sizeof(unsigned char) * frame_size, volk_get_alignment());
+- * unsigned char* temp = (unsigned char) volk_malloc(sizeof(unsigned char) * frame_size, volk_get_alignment());
++ * unsigned char* frame = (unsigned char) volk_malloc(sizeof(unsigned char) * frame_size,
++ * volk_get_alignment()); unsigned char* temp = (unsigned char)
++ * volk_malloc(sizeof(unsigned char) * frame_size, volk_get_alignment());
+ *
+ * unsigned char* info_bits = get_info_bits_to_encode(num_info_bits);
+ *
+- * volk_8u_x3_encodepolar_8u_x2_generic(frame, temp, frozen_bit_mask, frozen_bits, info_bits, frame_size);
++ * volk_8u_x3_encodepolar_8u_x2_generic(frame, temp, frozen_bit_mask, frozen_bits,
++ * info_bits, frame_size);
+ *
+ * volk_free(frozen_bit_mask);
+ * volk_free(frozen_bits);
+@@ -77,27 +80,32 @@
+ #include <stdio.h>
+ #include <volk/volk_8u_x2_encodeframepolar_8u.h>
+
+-static inline void
+-interleave_frozen_and_info_bits(unsigned char* target, const unsigned char* frozen_bit_mask,
+- const unsigned char* frozen_bits, const unsigned char* info_bits,
+- const unsigned int frame_size)
++static inline void interleave_frozen_and_info_bits(unsigned char* target,
++ const unsigned char* frozen_bit_mask,
++ const unsigned char* frozen_bits,
++ const unsigned char* info_bits,
++ const unsigned int frame_size)
+ {
+- unsigned int bit;
+- for(bit = 0; bit < frame_size; ++bit){
+- *target++ = *frozen_bit_mask++ ? *frozen_bits++ : *info_bits++;
+- }
++ unsigned int bit;
++ for (bit = 0; bit < frame_size; ++bit) {
++ *target++ = *frozen_bit_mask++ ? *frozen_bits++ : *info_bits++;
++ }
+ }
+
+ #ifdef LV_HAVE_GENERIC
+
+ static inline void
+-volk_8u_x3_encodepolar_8u_x2_generic(unsigned char* frame, unsigned char* temp, const unsigned char* frozen_bit_mask,
+- const unsigned char* frozen_bits, const unsigned char* info_bits,
++volk_8u_x3_encodepolar_8u_x2_generic(unsigned char* frame,
++ unsigned char* temp,
++ const unsigned char* frozen_bit_mask,
++ const unsigned char* frozen_bits,
++ const unsigned char* info_bits,
+ unsigned int frame_size)
+ {
+- // interleave
+- interleave_frozen_and_info_bits(temp, frozen_bit_mask, frozen_bits, info_bits, frame_size);
+- volk_8u_x2_encodeframepolar_8u_generic(frame, temp, frame_size);
++ // interleave
++ interleave_frozen_and_info_bits(
++ temp, frozen_bit_mask, frozen_bits, info_bits, frame_size);
++ volk_8u_x2_encodeframepolar_8u_generic(frame, temp, frame_size);
+ }
+ #endif /* LV_HAVE_GENERIC */
+
+@@ -106,14 +114,17 @@ volk_8u_x3_encodepolar_8u_x2_generic(unsigned char* frame, unsigned char* temp,
+ #include <tmmintrin.h>
+
+ static inline void
+-volk_8u_x3_encodepolar_8u_x2_u_ssse3(unsigned char* frame, unsigned char* temp,
+- const unsigned char* frozen_bit_mask,
+- const unsigned char* frozen_bits, const unsigned char* info_bits,
+- unsigned int frame_size)
++volk_8u_x3_encodepolar_8u_x2_u_ssse3(unsigned char* frame,
++ unsigned char* temp,
++ const unsigned char* frozen_bit_mask,
++ const unsigned char* frozen_bits,
++ const unsigned char* info_bits,
++ unsigned int frame_size)
+ {
+- // interleave
+- interleave_frozen_and_info_bits(temp, frozen_bit_mask, frozen_bits, info_bits, frame_size);
+- volk_8u_x2_encodeframepolar_8u_u_ssse3(frame, temp, frame_size);
++ // interleave
++ interleave_frozen_and_info_bits(
++ temp, frozen_bit_mask, frozen_bits, info_bits, frame_size);
++ volk_8u_x2_encodeframepolar_8u_u_ssse3(frame, temp, frame_size);
+ }
+
+ #endif /* LV_HAVE_SSSE3 */
+@@ -121,13 +132,16 @@ volk_8u_x3_encodepolar_8u_x2_u_ssse3(unsigned char* frame, unsigned char* temp,
+ #ifdef LV_HAVE_AVX2
+ #include <immintrin.h>
+ static inline void
+-volk_8u_x3_encodepolar_8u_x2_u_avx2(unsigned char* frame, unsigned char* temp,
+- const unsigned char* frozen_bit_mask,
+- const unsigned char* frozen_bits, const unsigned char* info_bits,
+- unsigned int frame_size)
++volk_8u_x3_encodepolar_8u_x2_u_avx2(unsigned char* frame,
++ unsigned char* temp,
++ const unsigned char* frozen_bit_mask,
++ const unsigned char* frozen_bits,
++ const unsigned char* info_bits,
++ unsigned int frame_size)
+ {
+- interleave_frozen_and_info_bits(temp, frozen_bit_mask, frozen_bits, info_bits, frame_size);
+- volk_8u_x2_encodeframepolar_8u_u_avx2(frame, temp, frame_size);
++ interleave_frozen_and_info_bits(
++ temp, frozen_bit_mask, frozen_bits, info_bits, frame_size);
++ volk_8u_x2_encodeframepolar_8u_u_avx2(frame, temp, frame_size);
+ }
+ #endif /* LV_HAVE_AVX2 */
+
+@@ -139,26 +153,32 @@ volk_8u_x3_encodepolar_8u_x2_u_avx2(unsigned char* frame, unsigned char* temp,
+ #ifdef LV_HAVE_SSSE3
+ #include <tmmintrin.h>
+ static inline void
+-volk_8u_x3_encodepolar_8u_x2_a_ssse3(unsigned char* frame, unsigned char* temp,
+- const unsigned char* frozen_bit_mask,
+- const unsigned char* frozen_bits, const unsigned char* info_bits,
+- unsigned int frame_size)
++volk_8u_x3_encodepolar_8u_x2_a_ssse3(unsigned char* frame,
++ unsigned char* temp,
++ const unsigned char* frozen_bit_mask,
++ const unsigned char* frozen_bits,
++ const unsigned char* info_bits,
++ unsigned int frame_size)
+ {
+- interleave_frozen_and_info_bits(temp, frozen_bit_mask, frozen_bits, info_bits, frame_size);
+- volk_8u_x2_encodeframepolar_8u_a_ssse3(frame, temp, frame_size);
++ interleave_frozen_and_info_bits(
++ temp, frozen_bit_mask, frozen_bits, info_bits, frame_size);
++ volk_8u_x2_encodeframepolar_8u_a_ssse3(frame, temp, frame_size);
+ }
+ #endif /* LV_HAVE_SSSE3 */
+
+ #ifdef LV_HAVE_AVX2
+ #include <immintrin.h>
+ static inline void
+-volk_8u_x3_encodepolar_8u_x2_a_avx2(unsigned char* frame, unsigned char* temp,
+- const unsigned char* frozen_bit_mask,
+- const unsigned char* frozen_bits, const unsigned char* info_bits,
+- unsigned int frame_size)
++volk_8u_x3_encodepolar_8u_x2_a_avx2(unsigned char* frame,
++ unsigned char* temp,
++ const unsigned char* frozen_bit_mask,
++ const unsigned char* frozen_bits,
++ const unsigned char* info_bits,
++ unsigned int frame_size)
+ {
+- interleave_frozen_and_info_bits(temp, frozen_bit_mask, frozen_bits, info_bits, frame_size);
+- volk_8u_x2_encodeframepolar_8u_a_avx2(frame, temp, frame_size);
++ interleave_frozen_and_info_bits(
++ temp, frozen_bit_mask, frozen_bits, info_bits, frame_size);
++ volk_8u_x2_encodeframepolar_8u_a_avx2(frame, temp, frame_size);
+ }
+ #endif /* LV_HAVE_AVX2 */
+
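A toy, self-contained illustration of what interleave_frozen_and_info_bits() does before the butterfly kernel runs; the mask and bit values are hypothetical and not taken from the VOLK test suite.

    /* Positions with a 0xFF mask byte take the next frozen bit, all other
     * positions take the next info bit -- the same walk the helper performs. */
    #include <stdio.h>

    int main(void)
    {
        const unsigned char frozen_bit_mask[8] = { 0xFF, 0xFF, 0xFF, 0x00,
                                                   0xFF, 0x00, 0x00, 0x00 };
        const unsigned char frozen_bits[4] = { 0, 0, 0, 0 };
        const unsigned char info_bits[4] = { 1, 0, 1, 1 };
        unsigned char u[8];
        unsigned int i, f = 0, k = 0;

        for (i = 0; i < 8; ++i)
            u[i] = frozen_bit_mask[i] ? frozen_bits[f++] : info_bits[k++];

        for (i = 0; i < 8; ++i)
            printf("%u", (unsigned)u[i]); /* prints 00010011: info bits land on 3,5,6,7 */
        printf("\n");
        return 0;
    }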
+diff --git a/kernels/volk/volk_8u_x3_encodepolarpuppet_8u.h b/kernels/volk/volk_8u_x3_encodepolarpuppet_8u.h
+index 1f6be2c..1badbf1 100644
+--- a/kernels/volk/volk_8u_x3_encodepolarpuppet_8u.h
++++ b/kernels/volk/volk_8u_x3_encodepolarpuppet_8u.h
+@@ -29,71 +29,82 @@
+ #include <volk/volk.h>
+ #include <volk/volk_8u_x3_encodepolar_8u_x2.h>
+
+-static inline unsigned int
+-next_lower_power_of_two(const unsigned int val)
++static inline unsigned int next_lower_power_of_two(const unsigned int val)
+ {
+- // algorithm found and adopted from: http://acius2.blogspot.de/2007/11/calculating-next-power-of-2.html
+- unsigned int res = val;
+- res = (res >> 1) | res;
+- res = (res >> 2) | res;
+- res = (res >> 4) | res;
+- res = (res >> 8) | res;
+- res = (res >> 16) | res;
+- res += 1;
+- return res >> 1;
++ // algorithm found and adopted from:
++ // http://acius2.blogspot.de/2007/11/calculating-next-power-of-2.html
++ unsigned int res = val;
++ res = (res >> 1) | res;
++ res = (res >> 2) | res;
++ res = (res >> 4) | res;
++ res = (res >> 8) | res;
++ res = (res >> 16) | res;
++ res += 1;
++ return res >> 1;
+ }
+
+-static inline void
+-adjust_frozen_mask(unsigned char* mask, const unsigned int frame_size)
++static inline void adjust_frozen_mask(unsigned char* mask, const unsigned int frame_size)
+ {
+- // just like the rest of the puppet this function exists for test purposes only.
+- unsigned int i;
+- for(i = 0; i < frame_size; ++i){
+- *mask = (*mask & 0x80) ? 0xFF : 0x00;
+- mask++;
+- }
++ // just like the rest of the puppet this function exists for test purposes only.
++ unsigned int i;
++ for (i = 0; i < frame_size; ++i) {
++ *mask = (*mask & 0x80) ? 0xFF : 0x00;
++ mask++;
++ }
+ }
+
+ #ifdef LV_HAVE_GENERIC
+ static inline void
+-volk_8u_x3_encodepolarpuppet_8u_generic(unsigned char* frame, unsigned char* frozen_bit_mask,
+- const unsigned char* frozen_bits, const unsigned char* info_bits,
+- unsigned int frame_size)
++volk_8u_x3_encodepolarpuppet_8u_generic(unsigned char* frame,
++ unsigned char* frozen_bit_mask,
++ const unsigned char* frozen_bits,
++ const unsigned char* info_bits,
++ unsigned int frame_size)
+ {
+- frame_size = next_lower_power_of_two(frame_size);
+- unsigned char* temp = (unsigned char*) volk_malloc(sizeof(unsigned char) * frame_size, volk_get_alignment());
+- adjust_frozen_mask(frozen_bit_mask, frame_size);
+- volk_8u_x3_encodepolar_8u_x2_generic(frame, temp, frozen_bit_mask, frozen_bits, info_bits, frame_size);
+- volk_free(temp);
++ frame_size = next_lower_power_of_two(frame_size);
++ unsigned char* temp = (unsigned char*)volk_malloc(sizeof(unsigned char) * frame_size,
++ volk_get_alignment());
++ adjust_frozen_mask(frozen_bit_mask, frame_size);
++ volk_8u_x3_encodepolar_8u_x2_generic(
++ frame, temp, frozen_bit_mask, frozen_bits, info_bits, frame_size);
++ volk_free(temp);
+ }
+ #endif /* LV_HAVE_GENERIC */
+
+
+ #ifdef LV_HAVE_SSSE3
+ static inline void
+-volk_8u_x3_encodepolarpuppet_8u_u_ssse3(unsigned char* frame, unsigned char* frozen_bit_mask,
+- const unsigned char* frozen_bits, const unsigned char* info_bits,
+- unsigned int frame_size)
++volk_8u_x3_encodepolarpuppet_8u_u_ssse3(unsigned char* frame,
++ unsigned char* frozen_bit_mask,
++ const unsigned char* frozen_bits,
++ const unsigned char* info_bits,
++ unsigned int frame_size)
+ {
+- frame_size = next_lower_power_of_two(frame_size);
+- unsigned char* temp = (unsigned char*) volk_malloc(sizeof(unsigned char) * frame_size, volk_get_alignment());
+- adjust_frozen_mask(frozen_bit_mask, frame_size);
+- volk_8u_x3_encodepolar_8u_x2_u_ssse3(frame, temp, frozen_bit_mask, frozen_bits, info_bits, frame_size);
+- volk_free(temp);
++ frame_size = next_lower_power_of_two(frame_size);
++ unsigned char* temp = (unsigned char*)volk_malloc(sizeof(unsigned char) * frame_size,
++ volk_get_alignment());
++ adjust_frozen_mask(frozen_bit_mask, frame_size);
++ volk_8u_x3_encodepolar_8u_x2_u_ssse3(
++ frame, temp, frozen_bit_mask, frozen_bits, info_bits, frame_size);
++ volk_free(temp);
+ }
+ #endif /* LV_HAVE_SSSE3 */
+
+ #ifdef LV_HAVE_AVX2
+ static inline void
+-volk_8u_x3_encodepolarpuppet_8u_u_avx2(unsigned char* frame, unsigned char* frozen_bit_mask,
+- const unsigned char* frozen_bits, const unsigned char* info_bits,
+- unsigned int frame_size)
++volk_8u_x3_encodepolarpuppet_8u_u_avx2(unsigned char* frame,
++ unsigned char* frozen_bit_mask,
++ const unsigned char* frozen_bits,
++ const unsigned char* info_bits,
++ unsigned int frame_size)
+ {
+- frame_size = next_lower_power_of_two(frame_size);
+- unsigned char* temp = (unsigned char*) volk_malloc(sizeof(unsigned char) * frame_size, volk_get_alignment());
+- adjust_frozen_mask(frozen_bit_mask, frame_size);
+- volk_8u_x3_encodepolar_8u_x2_u_avx2(frame, temp, frozen_bit_mask, frozen_bits, info_bits, frame_size);
+- volk_free(temp);
++ frame_size = next_lower_power_of_two(frame_size);
++ unsigned char* temp = (unsigned char*)volk_malloc(sizeof(unsigned char) * frame_size,
++ volk_get_alignment());
++ adjust_frozen_mask(frozen_bit_mask, frame_size);
++ volk_8u_x3_encodepolar_8u_x2_u_avx2(
++ frame, temp, frozen_bit_mask, frozen_bits, info_bits, frame_size);
++ volk_free(temp);
+ }
+ #endif /* LV_HAVE_AVX2 */
+
+@@ -104,29 +115,37 @@ volk_8u_x3_encodepolarpuppet_8u_u_avx2(unsigned char* frame, unsigned char* froz
+
+ #ifdef LV_HAVE_SSSE3
+ static inline void
+-volk_8u_x3_encodepolarpuppet_8u_a_ssse3(unsigned char* frame, unsigned char* frozen_bit_mask,
+- const unsigned char* frozen_bits, const unsigned char* info_bits,
+- unsigned int frame_size)
++volk_8u_x3_encodepolarpuppet_8u_a_ssse3(unsigned char* frame,
++ unsigned char* frozen_bit_mask,
++ const unsigned char* frozen_bits,
++ const unsigned char* info_bits,
++ unsigned int frame_size)
+ {
+- frame_size = next_lower_power_of_two(frame_size);
+- unsigned char* temp = (unsigned char*) volk_malloc(sizeof(unsigned char) * frame_size, volk_get_alignment());
+- adjust_frozen_mask(frozen_bit_mask, frame_size);
+- volk_8u_x3_encodepolar_8u_x2_a_ssse3(frame, temp, frozen_bit_mask, frozen_bits, info_bits, frame_size);
+- volk_free(temp);
++ frame_size = next_lower_power_of_two(frame_size);
++ unsigned char* temp = (unsigned char*)volk_malloc(sizeof(unsigned char) * frame_size,
++ volk_get_alignment());
++ adjust_frozen_mask(frozen_bit_mask, frame_size);
++ volk_8u_x3_encodepolar_8u_x2_a_ssse3(
++ frame, temp, frozen_bit_mask, frozen_bits, info_bits, frame_size);
++ volk_free(temp);
+ }
+ #endif /* LV_HAVE_SSSE3 */
+
+ #ifdef LV_HAVE_AVX2
+ static inline void
+-volk_8u_x3_encodepolarpuppet_8u_a_avx2(unsigned char* frame, unsigned char* frozen_bit_mask,
+- const unsigned char* frozen_bits, const unsigned char* info_bits,
+- unsigned int frame_size)
++volk_8u_x3_encodepolarpuppet_8u_a_avx2(unsigned char* frame,
++ unsigned char* frozen_bit_mask,
++ const unsigned char* frozen_bits,
++ const unsigned char* info_bits,
++ unsigned int frame_size)
+ {
+- frame_size = next_lower_power_of_two(frame_size);
+- unsigned char* temp = (unsigned char*) volk_malloc(sizeof(unsigned char) * frame_size, volk_get_alignment());
+- adjust_frozen_mask(frozen_bit_mask, frame_size);
+- volk_8u_x3_encodepolar_8u_x2_a_avx2(frame, temp, frozen_bit_mask, frozen_bits, info_bits, frame_size);
+- volk_free(temp);
++ frame_size = next_lower_power_of_two(frame_size);
++ unsigned char* temp = (unsigned char*)volk_malloc(sizeof(unsigned char) * frame_size,
++ volk_get_alignment());
++ adjust_frozen_mask(frozen_bit_mask, frame_size);
++ volk_8u_x3_encodepolar_8u_x2_a_avx2(
++ frame, temp, frozen_bit_mask, frozen_bits, info_bits, frame_size);
++ volk_free(temp);
+ }
+ #endif /* LV_HAVE_AVX2 */
+
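The next_lower_power_of_two() helper reformatted above relies on the classic bit-smearing trick: OR-ing progressively shifted copies fills every bit below the highest set bit, so adding one yields the next power of two above val and the final shift halves it back down. A worked example, kept outside the patch for illustration only:

    #include <assert.h>

    static unsigned int next_lower_power_of_two_sketch(unsigned int val)
    {
        unsigned int res = val;
        res |= res >> 1;  /* e.g. val = 300 = 0b100101100          */
        res |= res >> 2;
        res |= res >> 4;
        res |= res >> 8;
        res |= res >> 16; /* res = 0b111111111 = 511               */
        res += 1;         /* 512                                   */
        return res >> 1;  /* 256, the largest power of two <= 300  */
    }

    int main(void)
    {
        assert(next_lower_power_of_two_sketch(300) == 256);
        assert(next_lower_power_of_two_sketch(256) == 256); /* powers of two map to themselves */
        assert(next_lower_power_of_two_sketch(17) == 16);
        return 0;
    }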
+diff --git a/kernels/volk/volk_8u_x4_conv_k7_r2_8u.h b/kernels/volk/volk_8u_x4_conv_k7_r2_8u.h
+index 029ba75..89460a6 100644
+--- a/kernels/volk/volk_8u_x4_conv_k7_r2_8u.h
++++ b/kernels/volk/volk_8u_x4_conv_k7_r2_8u.h
+@@ -30,8 +30,9 @@
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+- * void volk_8u_x4_conv_k7_r2_8u(unsigned char* Y, unsigned char* X, unsigned char* syms, unsigned char* dec, unsigned int framebits, unsigned int excess, unsigned char* Branchtab)
+- * \endcode
++ * void volk_8u_x4_conv_k7_r2_8u(unsigned char* Y, unsigned char* X, unsigned char* syms,
++ * unsigned char* dec, unsigned int framebits, unsigned int excess, unsigned char*
++ * Branchtab) \endcode
+ *
+ * \b Inputs
+ * \li X: <FIXME>
+@@ -58,67 +59,71 @@
+ #define INCLUDED_volk_8u_x4_conv_k7_r2_8u_H
+
+ typedef union {
+- unsigned char/*DECISIONTYPE*/ t[64/*NUMSTATES*//8/*DECISIONTYPE_BITSIZE*/];
+- unsigned int w[64/*NUMSTATES*//32];
+- unsigned short s[64/*NUMSTATES*//16];
+- unsigned char c[64/*NUMSTATES*//8];
++ unsigned char /*DECISIONTYPE*/ t[64 /*NUMSTATES*/ / 8 /*DECISIONTYPE_BITSIZE*/];
++ unsigned int w[64 /*NUMSTATES*/ / 32];
++ unsigned short s[64 /*NUMSTATES*/ / 16];
++ unsigned char c[64 /*NUMSTATES*/ / 8];
+ #ifdef _MSC_VER
+ } decision_t;
+ #else
+-} decision_t __attribute__ ((aligned (16)));
++} decision_t __attribute__((aligned(16)));
+ #endif
+
+
+-static inline void
+-renormalize(unsigned char* X, unsigned char threshold)
++static inline void renormalize(unsigned char* X, unsigned char threshold)
+ {
+- int NUMSTATES = 64;
+- int i;
+-
+- unsigned char min=X[0];
+- //if(min > threshold) {
+- for(i=0;i<NUMSTATES;i++)
+- if (min>X[i])
+- min=X[i];
+- for(i=0;i<NUMSTATES;i++)
+- X[i]-=min;
+- //}
++ int NUMSTATES = 64;
++ int i;
++
++ unsigned char min = X[0];
++ // if(min > threshold) {
++ for (i = 0; i < NUMSTATES; i++)
++ if (min > X[i])
++ min = X[i];
++ for (i = 0; i < NUMSTATES; i++)
++ X[i] -= min;
++ //}
+ }
+
+
+-//helper BFLY for GENERIC version
+-static inline void
+-BFLY(int i, int s, unsigned char * syms, unsigned char *Y,
+- unsigned char *X, decision_t * d, unsigned char* Branchtab)
++// helper BFLY for GENERIC version
++static inline void BFLY(int i,
++ int s,
++ unsigned char* syms,
++ unsigned char* Y,
++ unsigned char* X,
++ decision_t* d,
++ unsigned char* Branchtab)
+ {
+- int j, decision0, decision1;
+- unsigned char metric,m0,m1,m2,m3;
++ int j, decision0, decision1;
++ unsigned char metric, m0, m1, m2, m3;
+
+- int NUMSTATES = 64;
+- int RATE = 2;
+- int METRICSHIFT = 1;
+- int PRECISIONSHIFT = 2;
++ int NUMSTATES = 64;
++ int RATE = 2;
++ int METRICSHIFT = 1;
++ int PRECISIONSHIFT = 2;
+
+- metric =0;
+- for(j=0;j<RATE;j++)
+- metric += (Branchtab[i+j*NUMSTATES/2] ^ syms[s*RATE+j])>>METRICSHIFT;
+- metric=metric>>PRECISIONSHIFT;
++ metric = 0;
++ for (j = 0; j < RATE; j++)
++ metric += (Branchtab[i + j * NUMSTATES / 2] ^ syms[s * RATE + j]) >> METRICSHIFT;
++ metric = metric >> PRECISIONSHIFT;
+
+- unsigned char max = ((RATE*((256 -1)>>METRICSHIFT))>>PRECISIONSHIFT);
++ unsigned char max = ((RATE * ((256 - 1) >> METRICSHIFT)) >> PRECISIONSHIFT);
+
+- m0 = X[i] + metric;
+- m1 = X[i+NUMSTATES/2] + (max - metric);
+- m2 = X[i] + (max - metric);
+- m3 = X[i+NUMSTATES/2] + metric;
++ m0 = X[i] + metric;
++ m1 = X[i + NUMSTATES / 2] + (max - metric);
++ m2 = X[i] + (max - metric);
++ m3 = X[i + NUMSTATES / 2] + metric;
+
+- decision0 = (signed int)(m0-m1) > 0;
+- decision1 = (signed int)(m2-m3) > 0;
++ decision0 = (signed int)(m0 - m1) > 0;
++ decision1 = (signed int)(m2 - m3) > 0;
+
+- Y[2*i] = decision0 ? m1 : m0;
+- Y[2*i+1] = decision1 ? m3 : m2;
++ Y[2 * i] = decision0 ? m1 : m0;
++ Y[2 * i + 1] = decision1 ? m3 : m2;
+
+- d->w[i/(sizeof(unsigned int)*8/2)+s*(sizeof(decision_t)/sizeof(unsigned int))] |=
+- (decision0|decision1<<1) << ((2*i)&(sizeof(unsigned int)*8-1));
++ d->w[i / (sizeof(unsigned int) * 8 / 2) +
++ s * (sizeof(decision_t) / sizeof(unsigned int))] |=
++ (decision0 | decision1 << 1) << ((2 * i) & (sizeof(unsigned int) * 8 - 1));
+ }
+
+
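One detail that is easy to lose in the reflowed BFLY() above: the value 63 that reappears as _mm256_set1_epi8(63) in the AVX2 kernel further down is simply the maximum possible branch metric for the fixed parameters used here (RATE = 2, METRICSHIFT = 1, PRECISIONSHIFT = 2). A small standalone check of that arithmetic, for illustration only:

    #include <assert.h>

    int main(void)
    {
        const int RATE = 2, METRICSHIFT = 1, PRECISIONSHIFT = 2;
        /* each of the RATE XOR-ed symbol bytes contributes at most 255 >> 1 = 127,
         * and the summed metric is scaled down by PRECISIONSHIFT afterwards */
        unsigned char max = (RATE * ((256 - 1) >> METRICSHIFT)) >> PRECISIONSHIFT;
        assert(max == 63); /* matches the 63 constant in the SIMD path */
        return 0;
    }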
+@@ -127,188 +132,199 @@ BFLY(int i, int s, unsigned char * syms, unsigned char *Y,
+ #include <immintrin.h>
+ #include <stdio.h>
+
+-static inline void
+-volk_8u_x4_conv_k7_r2_8u_avx2(unsigned char* Y, unsigned char* X,
+- unsigned char* syms, unsigned char* dec,
+- unsigned int framebits, unsigned int excess,
+- unsigned char* Branchtab)
++static inline void volk_8u_x4_conv_k7_r2_8u_avx2(unsigned char* Y,
++ unsigned char* X,
++ unsigned char* syms,
++ unsigned char* dec,
++ unsigned int framebits,
++ unsigned int excess,
++ unsigned char* Branchtab)
+ {
+- unsigned int i9;
+- for(i9 = 0; i9 < ((framebits + excess)>>1); i9++) {
+- unsigned char a75, a81;
+- int a73, a92;
+- int s20, s21;
+- unsigned char *a80, *b6;
+- int *a110, *a91, *a93;
+- __m256i *a112, *a71, *a72, *a77, *a83, *a95;
+- __m256i a86, a87;
+- __m256i a76, a78, a79, a82, a84, a85, a88, a89
+- , a90, d10, d9, m23, m24, m25
+- , m26, s18, s19, s22
+- , s23, s24, s25, t13, t14, t15;
+- a71 = ((__m256i *) X);
+- s18 = *(a71);
+- a72 = (a71 + 1);
+- s19 = *(a72);
+- s22 = _mm256_permute2x128_si256(s18,s19,0x20);
+- s19 = _mm256_permute2x128_si256(s18,s19,0x31);
+- s18 = s22;
+- a73 = (4 * i9);
+- b6 = (syms + a73);
+- a75 = *(b6);
+- a76 = _mm256_set1_epi8(a75);
+- a77 = ((__m256i *) Branchtab);
+- a78 = *(a77);
+- a79 = _mm256_xor_si256(a76, a78);
+- a80 = (b6 + 1);
+- a81 = *(a80);
+- a82 = _mm256_set1_epi8(a81);
+- a83 = (a77 + 1);
+- a84 = *(a83);
+- a85 = _mm256_xor_si256(a82, a84);
+- t13 = _mm256_avg_epu8(a79,a85);
+- a86 = ((__m256i ) t13);
+- a87 = _mm256_srli_epi16(a86, 2);
+- a88 = ((__m256i ) a87);
+- t14 = _mm256_and_si256(a88, _mm256_set1_epi8(63));
+- t15 = _mm256_subs_epu8(_mm256_set1_epi8(63), t14);
+- m23 = _mm256_adds_epu8(s18, t14);
+- m24 = _mm256_adds_epu8(s19, t15);
+- m25 = _mm256_adds_epu8(s18, t15);
+- m26 = _mm256_adds_epu8(s19, t14);
+- a89 = _mm256_min_epu8(m24, m23);
+- d9 = _mm256_cmpeq_epi8(a89, m24);
+- a90 = _mm256_min_epu8(m26, m25);
+- d10 = _mm256_cmpeq_epi8(a90, m26);
+- s22 = _mm256_unpacklo_epi8(d9,d10);
+- s23 = _mm256_unpackhi_epi8(d9,d10);
+- s20 = _mm256_movemask_epi8(_mm256_permute2x128_si256(s22, s23, 0x20));
+- a91 = ((int *) dec);
+- a92 = (4 * i9);
+- a93 = (a91 + a92);
+- *(a93) = s20;
+- s21 = _mm256_movemask_epi8(_mm256_permute2x128_si256(s22, s23, 0x31));
+- a110 = (a93 + 1);
+- *(a110) = s21;
+- s22 = _mm256_unpacklo_epi8(a89, a90);
+- s23 = _mm256_unpackhi_epi8(a89, a90);
+- a95 = ((__m256i *) Y);
+- s24 = _mm256_permute2x128_si256(s22, s23, 0x20);
+- *(a95) = s24;
+- s23 = _mm256_permute2x128_si256(s22, s23, 0x31);
+- a112 = (a95 + 1);
+- *(a112) = s23;
+- if ((((unsigned char *) Y)[0]>210)) {
+- __m256i m5, m6;
+- m5 = ((__m256i *) Y)[0];
+- m5 = _mm256_min_epu8(m5, ((__m256i *) Y)[1]);
+- __m256i m7;
+- m7 = _mm256_min_epu8(_mm256_srli_si256(m5, 8), m5);
+- m7 = ((__m256i ) _mm256_min_epu8(((__m256i ) _mm256_srli_epi64(m7, 32)), ((__m256i ) m7)));
+- m7 = ((__m256i ) _mm256_min_epu8(((__m256i ) _mm256_srli_epi64(m7, 16)), ((__m256i ) m7)));
+- m7 = ((__m256i ) _mm256_min_epu8(((__m256i ) _mm256_srli_epi64(m7, 8)), ((__m256i ) m7)));
+- m7 = _mm256_unpacklo_epi8(m7, m7);
+- m7 = _mm256_shufflelo_epi16(m7, 0);
+- m6 = _mm256_unpacklo_epi64(m7, m7);
+- m6 = _mm256_permute2x128_si256(m6, m6, 0); //copy lower half of m6 to upper half, since above ops operate on 128 bit lanes
+- ((__m256i *) Y)[0] = _mm256_subs_epu8(((__m256i *) Y)[0], m6);
+- ((__m256i *) Y)[1] = _mm256_subs_epu8(((__m256i *) Y)[1], m6);
++ unsigned int i9;
++ for (i9 = 0; i9 < ((framebits + excess) >> 1); i9++) {
++ unsigned char a75, a81;
++ int a73, a92;
++ int s20, s21;
++ unsigned char *a80, *b6;
++ int *a110, *a91, *a93;
++ __m256i *a112, *a71, *a72, *a77, *a83, *a95;
++ __m256i a86, a87;
++ __m256i a76, a78, a79, a82, a84, a85, a88, a89, a90, d10, d9, m23, m24, m25, m26,
++ s18, s19, s22, s23, s24, s25, t13, t14, t15;
++ a71 = ((__m256i*)X);
++ s18 = *(a71);
++ a72 = (a71 + 1);
++ s19 = *(a72);
++ s22 = _mm256_permute2x128_si256(s18, s19, 0x20);
++ s19 = _mm256_permute2x128_si256(s18, s19, 0x31);
++ s18 = s22;
++ a73 = (4 * i9);
++ b6 = (syms + a73);
++ a75 = *(b6);
++ a76 = _mm256_set1_epi8(a75);
++ a77 = ((__m256i*)Branchtab);
++ a78 = *(a77);
++ a79 = _mm256_xor_si256(a76, a78);
++ a80 = (b6 + 1);
++ a81 = *(a80);
++ a82 = _mm256_set1_epi8(a81);
++ a83 = (a77 + 1);
++ a84 = *(a83);
++ a85 = _mm256_xor_si256(a82, a84);
++ t13 = _mm256_avg_epu8(a79, a85);
++ a86 = ((__m256i)t13);
++ a87 = _mm256_srli_epi16(a86, 2);
++ a88 = ((__m256i)a87);
++ t14 = _mm256_and_si256(a88, _mm256_set1_epi8(63));
++ t15 = _mm256_subs_epu8(_mm256_set1_epi8(63), t14);
++ m23 = _mm256_adds_epu8(s18, t14);
++ m24 = _mm256_adds_epu8(s19, t15);
++ m25 = _mm256_adds_epu8(s18, t15);
++ m26 = _mm256_adds_epu8(s19, t14);
++ a89 = _mm256_min_epu8(m24, m23);
++ d9 = _mm256_cmpeq_epi8(a89, m24);
++ a90 = _mm256_min_epu8(m26, m25);
++ d10 = _mm256_cmpeq_epi8(a90, m26);
++ s22 = _mm256_unpacklo_epi8(d9, d10);
++ s23 = _mm256_unpackhi_epi8(d9, d10);
++ s20 = _mm256_movemask_epi8(_mm256_permute2x128_si256(s22, s23, 0x20));
++ a91 = ((int*)dec);
++ a92 = (4 * i9);
++ a93 = (a91 + a92);
++ *(a93) = s20;
++ s21 = _mm256_movemask_epi8(_mm256_permute2x128_si256(s22, s23, 0x31));
++ a110 = (a93 + 1);
++ *(a110) = s21;
++ s22 = _mm256_unpacklo_epi8(a89, a90);
++ s23 = _mm256_unpackhi_epi8(a89, a90);
++ a95 = ((__m256i*)Y);
++ s24 = _mm256_permute2x128_si256(s22, s23, 0x20);
++ *(a95) = s24;
++ s23 = _mm256_permute2x128_si256(s22, s23, 0x31);
++ a112 = (a95 + 1);
++ *(a112) = s23;
++ if ((((unsigned char*)Y)[0] > 210)) {
++ __m256i m5, m6;
++ m5 = ((__m256i*)Y)[0];
++ m5 = _mm256_min_epu8(m5, ((__m256i*)Y)[1]);
++ __m256i m7;
++ m7 = _mm256_min_epu8(_mm256_srli_si256(m5, 8), m5);
++ m7 = ((__m256i)_mm256_min_epu8(((__m256i)_mm256_srli_epi64(m7, 32)),
++ ((__m256i)m7)));
++ m7 = ((__m256i)_mm256_min_epu8(((__m256i)_mm256_srli_epi64(m7, 16)),
++ ((__m256i)m7)));
++ m7 = ((__m256i)_mm256_min_epu8(((__m256i)_mm256_srli_epi64(m7, 8)),
++ ((__m256i)m7)));
++ m7 = _mm256_unpacklo_epi8(m7, m7);
++ m7 = _mm256_shufflelo_epi16(m7, 0);
++ m6 = _mm256_unpacklo_epi64(m7, m7);
++ m6 = _mm256_permute2x128_si256(
++ m6, m6, 0); // copy lower half of m6 to upper half, since above ops
++ // operate on 128 bit lanes
++ ((__m256i*)Y)[0] = _mm256_subs_epu8(((__m256i*)Y)[0], m6);
++ ((__m256i*)Y)[1] = _mm256_subs_epu8(((__m256i*)Y)[1], m6);
++ }
++ unsigned char a188, a194;
++ int a205;
++ int s48, s54;
++ unsigned char *a187, *a193;
++ int *a204, *a206, *a223, *b16;
++ __m256i *a184, *a185, *a190, *a196, *a208, *a225;
++ __m256i a199, a200;
++ __m256i a189, a191, a192, a195, a197, a198, a201, a202, a203, d17, d18, m39, m40,
++ m41, m42, s46, s47, s50, s51, t25, t26, t27;
++ a184 = ((__m256i*)Y);
++ s46 = *(a184);
++ a185 = (a184 + 1);
++ s47 = *(a185);
++ s50 = _mm256_permute2x128_si256(s46, s47, 0x20);
++ s47 = _mm256_permute2x128_si256(s46, s47, 0x31);
++ s46 = s50;
++ a187 = (b6 + 2);
++ a188 = *(a187);
++ a189 = _mm256_set1_epi8(a188);
++ a190 = ((__m256i*)Branchtab);
++ a191 = *(a190);
++ a192 = _mm256_xor_si256(a189, a191);
++ a193 = (b6 + 3);
++ a194 = *(a193);
++ a195 = _mm256_set1_epi8(a194);
++ a196 = (a190 + 1);
++ a197 = *(a196);
++ a198 = _mm256_xor_si256(a195, a197);
++ t25 = _mm256_avg_epu8(a192, a198);
++ a199 = ((__m256i)t25);
++ a200 = _mm256_srli_epi16(a199, 2);
++ a201 = ((__m256i)a200);
++ t26 = _mm256_and_si256(a201, _mm256_set1_epi8(63));
++ t27 = _mm256_subs_epu8(_mm256_set1_epi8(63), t26);
++ m39 = _mm256_adds_epu8(s46, t26);
++ m40 = _mm256_adds_epu8(s47, t27);
++ m41 = _mm256_adds_epu8(s46, t27);
++ m42 = _mm256_adds_epu8(s47, t26);
++ a202 = _mm256_min_epu8(m40, m39);
++ d17 = _mm256_cmpeq_epi8(a202, m40);
++ a203 = _mm256_min_epu8(m42, m41);
++ d18 = _mm256_cmpeq_epi8(a203, m42);
++ s24 = _mm256_unpacklo_epi8(d17, d18);
++ s25 = _mm256_unpackhi_epi8(d17, d18);
++ s48 = _mm256_movemask_epi8(_mm256_permute2x128_si256(s24, s25, 0x20));
++ a204 = ((int*)dec);
++ a205 = (4 * i9);
++ b16 = (a204 + a205);
++ a206 = (b16 + 2);
++ *(a206) = s48;
++ s54 = _mm256_movemask_epi8(_mm256_permute2x128_si256(s24, s25, 0x31));
++ a223 = (b16 + 3);
++ *(a223) = s54;
++ s50 = _mm256_unpacklo_epi8(a202, a203);
++ s51 = _mm256_unpackhi_epi8(a202, a203);
++ s25 = _mm256_permute2x128_si256(s50, s51, 0x20);
++ s51 = _mm256_permute2x128_si256(s50, s51, 0x31);
++ a208 = ((__m256i*)X);
++ *(a208) = s25;
++ a225 = (a208 + 1);
++ *(a225) = s51;
++
++ if ((((unsigned char*)X)[0] > 210)) {
++ __m256i m12, m13;
++ m12 = ((__m256i*)X)[0];
++ m12 = _mm256_min_epu8(m12, ((__m256i*)X)[1]);
++ __m256i m14;
++ m14 = _mm256_min_epu8(_mm256_srli_si256(m12, 8), m12);
++ m14 = ((__m256i)_mm256_min_epu8(((__m256i)_mm256_srli_epi64(m14, 32)),
++ ((__m256i)m14)));
++ m14 = ((__m256i)_mm256_min_epu8(((__m256i)_mm256_srli_epi64(m14, 16)),
++ ((__m256i)m14)));
++ m14 = ((__m256i)_mm256_min_epu8(((__m256i)_mm256_srli_epi64(m14, 8)),
++ ((__m256i)m14)));
++ m14 = _mm256_unpacklo_epi8(m14, m14);
++ m14 = _mm256_shufflelo_epi16(m14, 0);
++ m13 = _mm256_unpacklo_epi64(m14, m14);
++ m13 = _mm256_permute2x128_si256(m13, m13, 0);
++ ((__m256i*)X)[0] = _mm256_subs_epu8(((__m256i*)X)[0], m13);
++ ((__m256i*)X)[1] = _mm256_subs_epu8(((__m256i*)X)[1], m13);
++ }
+ }
+- unsigned char a188, a194;
+- int a205;
+- int s48, s54;
+- unsigned char *a187, *a193;
+- int *a204, *a206, *a223, *b16;
+- __m256i *a184, *a185, *a190, *a196, *a208, *a225;
+- __m256i a199, a200;
+- __m256i a189, a191, a192, a195, a197, a198, a201
+- , a202, a203, d17, d18, m39, m40, m41
+- , m42, s46, s47, s50
+- , s51, t25, t26, t27;
+- a184 = ((__m256i *) Y);
+- s46 = *(a184);
+- a185 = (a184 + 1);
+- s47 = *(a185);
+- s50 = _mm256_permute2x128_si256(s46,s47,0x20);
+- s47 = _mm256_permute2x128_si256(s46,s47,0x31);
+- s46 = s50;
+- a187 = (b6 + 2);
+- a188 = *(a187);
+- a189 = _mm256_set1_epi8(a188);
+- a190 = ((__m256i *) Branchtab);
+- a191 = *(a190);
+- a192 = _mm256_xor_si256(a189, a191);
+- a193 = (b6 + 3);
+- a194 = *(a193);
+- a195 = _mm256_set1_epi8(a194);
+- a196 = (a190 + 1);
+- a197 = *(a196);
+- a198 = _mm256_xor_si256(a195, a197);
+- t25 = _mm256_avg_epu8(a192,a198);
+- a199 = ((__m256i ) t25);
+- a200 = _mm256_srli_epi16(a199, 2);
+- a201 = ((__m256i ) a200);
+- t26 = _mm256_and_si256(a201, _mm256_set1_epi8(63));
+- t27 = _mm256_subs_epu8(_mm256_set1_epi8(63), t26);
+- m39 = _mm256_adds_epu8(s46, t26);
+- m40 = _mm256_adds_epu8(s47, t27);
+- m41 = _mm256_adds_epu8(s46, t27);
+- m42 = _mm256_adds_epu8(s47, t26);
+- a202 = _mm256_min_epu8(m40, m39);
+- d17 = _mm256_cmpeq_epi8(a202, m40);
+- a203 = _mm256_min_epu8(m42, m41);
+- d18 = _mm256_cmpeq_epi8(a203, m42);
+- s24 = _mm256_unpacklo_epi8(d17,d18);
+- s25 = _mm256_unpackhi_epi8(d17,d18);
+- s48 = _mm256_movemask_epi8(_mm256_permute2x128_si256(s24, s25, 0x20));
+- a204 = ((int *) dec);
+- a205 = (4 * i9);
+- b16 = (a204 + a205);
+- a206 = (b16 + 2);
+- *(a206) = s48;
+- s54 = _mm256_movemask_epi8(_mm256_permute2x128_si256(s24, s25, 0x31));
+- a223 = (b16 + 3);
+- *(a223) = s54;
+- s50 = _mm256_unpacklo_epi8(a202, a203);
+- s51 = _mm256_unpackhi_epi8(a202, a203);
+- s25 = _mm256_permute2x128_si256(s50, s51, 0x20);
+- s51 = _mm256_permute2x128_si256(s50, s51, 0x31);
+- a208 = ((__m256i *) X);
+- *(a208) = s25;
+- a225 = (a208 + 1);
+- *(a225) = s51;
+-
+- if ((((unsigned char *) X)[0]>210)) {
+- __m256i m12, m13;
+- m12 = ((__m256i *) X)[0];
+- m12 = _mm256_min_epu8(m12, ((__m256i *) X)[1]);
+- __m256i m14;
+- m14 = _mm256_min_epu8(_mm256_srli_si256(m12, 8), m12);
+- m14 = ((__m256i ) _mm256_min_epu8(((__m256i ) _mm256_srli_epi64(m14, 32)), ((__m256i ) m14)));
+- m14 = ((__m256i ) _mm256_min_epu8(((__m256i ) _mm256_srli_epi64(m14, 16)), ((__m256i ) m14)));
+- m14 = ((__m256i ) _mm256_min_epu8(((__m256i ) _mm256_srli_epi64(m14, 8)), ((__m256i ) m14)));
+- m14 = _mm256_unpacklo_epi8(m14, m14);
+- m14 = _mm256_shufflelo_epi16(m14, 0);
+- m13 = _mm256_unpacklo_epi64(m14, m14);
+- m13 = _mm256_permute2x128_si256(m13, m13, 0);
+- ((__m256i *) X)[0] = _mm256_subs_epu8(((__m256i *) X)[0], m13);
+- ((__m256i *) X)[1] = _mm256_subs_epu8(((__m256i *) X)[1], m13);
+- }
+- }
+-
+- renormalize(X, 210);
+
+- unsigned int j;
+- for(j=0; j < (framebits + excess) % 2; ++j) {
+- int i;
+- for(i=0;i<64/2;i++){
+- BFLY(i, (((framebits+excess) >> 1) << 1) + j , syms, Y, X, (decision_t *)dec, Branchtab);
++ renormalize(X, 210);
++
++ unsigned int j;
++ for (j = 0; j < (framebits + excess) % 2; ++j) {
++ int i;
++ for (i = 0; i < 64 / 2; i++) {
++ BFLY(i,
++ (((framebits + excess) >> 1) << 1) + j,
++ syms,
++ Y,
++ X,
++ (decision_t*)dec,
++ Branchtab);
++ }
++
++ renormalize(Y, 210);
+ }
+-
+- renormalize(Y, 210);
+-
+- }
+- /*skip*/
++ /*skip*/
+ }
+
+ #endif /*LV_HAVE_AVX2*/
+@@ -316,295 +332,300 @@ volk_8u_x4_conv_k7_r2_8u_avx2(unsigned char* Y, unsigned char* X,
+
+ #if LV_HAVE_SSE3
+
+-#include <pmmintrin.h>
+ #include <emmintrin.h>
+-#include <xmmintrin.h>
+ #include <mmintrin.h>
++#include <pmmintrin.h>
+ #include <stdio.h>
++#include <xmmintrin.h>
+
+-static inline void
+-volk_8u_x4_conv_k7_r2_8u_spiral(unsigned char* Y, unsigned char* X,
+- unsigned char* syms, unsigned char* dec,
+- unsigned int framebits, unsigned int excess,
+- unsigned char* Branchtab)
++static inline void volk_8u_x4_conv_k7_r2_8u_spiral(unsigned char* Y,
++ unsigned char* X,
++ unsigned char* syms,
++ unsigned char* dec,
++ unsigned int framebits,
++ unsigned int excess,
++ unsigned char* Branchtab)
+ {
+- unsigned int i9;
+- for(i9 = 0; i9 < ((framebits + excess) >> 1); i9++) {
+- unsigned char a75, a81;
+- int a73, a92;
+- short int s20, s21, s26, s27;
+- unsigned char *a74, *a80, *b6;
+- short int *a110, *a111, *a91, *a93, *a94;
+- __m128i *a102, *a112, *a113, *a71, *a72, *a77, *a83
+- , *a95, *a96, *a97, *a98, *a99;
+- __m128i a105, a106, a86, a87;
+- __m128i a100, a101, a103, a104, a107, a108, a109
+- , a76, a78, a79, a82, a84, a85, a88, a89
+- , a90, d10, d11, d12, d9, m23, m24, m25
+- , m26, m27, m28, m29, m30, s18, s19, s22
+- , s23, s24, s25, s28, s29, t13, t14, t15
+- , t16, t17, t18;
+- a71 = ((__m128i *) X);
+- s18 = *(a71);
+- a72 = (a71 + 2);
+- s19 = *(a72);
+- a73 = (4 * i9);
+- a74 = (syms + a73);
+- a75 = *(a74);
+- a76 = _mm_set1_epi8(a75);
+- a77 = ((__m128i *) Branchtab);
+- a78 = *(a77);
+- a79 = _mm_xor_si128(a76, a78);
+- b6 = (a73 + syms);
+- a80 = (b6 + 1);
+- a81 = *(a80);
+- a82 = _mm_set1_epi8(a81);
+- a83 = (a77 + 2);
+- a84 = *(a83);
+- a85 = _mm_xor_si128(a82, a84);
+- t13 = _mm_avg_epu8(a79,a85);
+- a86 = ((__m128i ) t13);
+- a87 = _mm_srli_epi16(a86, 2);
+- a88 = ((__m128i ) a87);
+- t14 = _mm_and_si128(a88, _mm_set_epi8(63, 63, 63, 63, 63, 63, 63
+- , 63, 63, 63, 63, 63, 63, 63, 63
+- , 63));
+- t15 = _mm_subs_epu8(_mm_set_epi8(63, 63, 63, 63, 63, 63, 63
+- , 63, 63, 63, 63, 63, 63, 63, 63
+- , 63), t14);
+- m23 = _mm_adds_epu8(s18, t14);
+- m24 = _mm_adds_epu8(s19, t15);
+- m25 = _mm_adds_epu8(s18, t15);
+- m26 = _mm_adds_epu8(s19, t14);
+- a89 = _mm_min_epu8(m24, m23);
+- d9 = _mm_cmpeq_epi8(a89, m24);
+- a90 = _mm_min_epu8(m26, m25);
+- d10 = _mm_cmpeq_epi8(a90, m26);
+- s20 = _mm_movemask_epi8(_mm_unpacklo_epi8(d9,d10));
+- a91 = ((short int *) dec);
+- a92 = (8 * i9);
+- a93 = (a91 + a92);
+- *(a93) = s20;
+- s21 = _mm_movemask_epi8(_mm_unpackhi_epi8(d9,d10));
+- a94 = (a93 + 1);
+- *(a94) = s21;
+- s22 = _mm_unpacklo_epi8(a89, a90);
+- s23 = _mm_unpackhi_epi8(a89, a90);
+- a95 = ((__m128i *) Y);
+- *(a95) = s22;
+- a96 = (a95 + 1);
+- *(a96) = s23;
+- a97 = (a71 + 1);
+- s24 = *(a97);
+- a98 = (a71 + 3);
+- s25 = *(a98);
+- a99 = (a77 + 1);
+- a100 = *(a99);
+- a101 = _mm_xor_si128(a76, a100);
+- a102 = (a77 + 3);
+- a103 = *(a102);
+- a104 = _mm_xor_si128(a82, a103);
+- t16 = _mm_avg_epu8(a101,a104);
+- a105 = ((__m128i ) t16);
+- a106 = _mm_srli_epi16(a105, 2);
+- a107 = ((__m128i ) a106);
+- t17 = _mm_and_si128(a107, _mm_set_epi8(63, 63, 63, 63, 63, 63, 63
+- , 63, 63, 63, 63, 63, 63, 63, 63
+- , 63));
+- t18 = _mm_subs_epu8(_mm_set_epi8(63, 63, 63, 63, 63, 63, 63
+- , 63, 63, 63, 63, 63, 63, 63, 63
+- , 63), t17);
+- m27 = _mm_adds_epu8(s24, t17);
+- m28 = _mm_adds_epu8(s25, t18);
+- m29 = _mm_adds_epu8(s24, t18);
+- m30 = _mm_adds_epu8(s25, t17);
+- a108 = _mm_min_epu8(m28, m27);
+- d11 = _mm_cmpeq_epi8(a108, m28);
+- a109 = _mm_min_epu8(m30, m29);
+- d12 = _mm_cmpeq_epi8(a109, m30);
+- s26 = _mm_movemask_epi8(_mm_unpacklo_epi8(d11,d12));
+- a110 = (a93 + 2);
+- *(a110) = s26;
+- s27 = _mm_movemask_epi8(_mm_unpackhi_epi8(d11,d12));
+- a111 = (a93 + 3);
+- *(a111) = s27;
+- s28 = _mm_unpacklo_epi8(a108, a109);
+- s29 = _mm_unpackhi_epi8(a108, a109);
+- a112 = (a95 + 2);
+- *(a112) = s28;
+- a113 = (a95 + 3);
+- *(a113) = s29;
+- if ((((unsigned char *) Y)[0]>210)) {
+- __m128i m5, m6;
+- m5 = ((__m128i *) Y)[0];
+- m5 = _mm_min_epu8(m5, ((__m128i *) Y)[1]);
+- m5 = _mm_min_epu8(m5, ((__m128i *) Y)[2]);
+- m5 = _mm_min_epu8(m5, ((__m128i *) Y)[3]);
+- __m128i m7;
+- m7 = _mm_min_epu8(_mm_srli_si128(m5, 8), m5);
+- m7 = ((__m128i ) _mm_min_epu8(((__m128i ) _mm_srli_epi64(m7, 32)), ((__m128i ) m7)));
+- m7 = ((__m128i ) _mm_min_epu8(((__m128i ) _mm_srli_epi64(m7, 16)), ((__m128i ) m7)));
+- m7 = ((__m128i ) _mm_min_epu8(((__m128i ) _mm_srli_epi64(m7, 8)), ((__m128i ) m7)));
+- m7 = _mm_unpacklo_epi8(m7, m7);
+- m7 = _mm_shufflelo_epi16(m7, _MM_SHUFFLE(0, 0, 0, 0));
+- m6 = _mm_unpacklo_epi64(m7, m7);
+- ((__m128i *) Y)[0] = _mm_subs_epu8(((__m128i *) Y)[0], m6);
+- ((__m128i *) Y)[1] = _mm_subs_epu8(((__m128i *) Y)[1], m6);
+- ((__m128i *) Y)[2] = _mm_subs_epu8(((__m128i *) Y)[2], m6);
+- ((__m128i *) Y)[3] = _mm_subs_epu8(((__m128i *) Y)[3], m6);
+- }
+- unsigned char a188, a194;
+- int a186, a205;
+- short int s48, s49, s54, s55;
+- unsigned char *a187, *a193, *b15;
+- short int *a204, *a206, *a207, *a223, *a224, *b16;
+- __m128i *a184, *a185, *a190, *a196, *a208, *a209, *a210
+- , *a211, *a212, *a215, *a225, *a226;
+- __m128i a199, a200, a218, a219;
+- __m128i a189, a191, a192, a195, a197, a198, a201
+- , a202, a203, a213, a214, a216, a217, a220, a221
+- , a222, d17, d18, d19, d20, m39, m40, m41
+- , m42, m43, m44, m45, m46, s46, s47, s50
+- , s51, s52, s53, s56, s57, t25, t26, t27
+- , t28, t29, t30;
+- a184 = ((__m128i *) Y);
+- s46 = *(a184);
+- a185 = (a184 + 2);
+- s47 = *(a185);
+- a186 = (4 * i9);
+- b15 = (a186 + syms);
+- a187 = (b15 + 2);
+- a188 = *(a187);
+- a189 = _mm_set1_epi8(a188);
+- a190 = ((__m128i *) Branchtab);
+- a191 = *(a190);
+- a192 = _mm_xor_si128(a189, a191);
+- a193 = (b15 + 3);
+- a194 = *(a193);
+- a195 = _mm_set1_epi8(a194);
+- a196 = (a190 + 2);
+- a197 = *(a196);
+- a198 = _mm_xor_si128(a195, a197);
+- t25 = _mm_avg_epu8(a192,a198);
+- a199 = ((__m128i ) t25);
+- a200 = _mm_srli_epi16(a199, 2);
+- a201 = ((__m128i ) a200);
+- t26 = _mm_and_si128(a201, _mm_set_epi8(63, 63, 63, 63, 63, 63, 63
+- , 63, 63, 63, 63, 63, 63, 63, 63
+- , 63));
+- t27 = _mm_subs_epu8(_mm_set_epi8(63, 63, 63, 63, 63, 63, 63
+- , 63, 63, 63, 63, 63, 63, 63, 63
+- , 63), t26);
+- m39 = _mm_adds_epu8(s46, t26);
+- m40 = _mm_adds_epu8(s47, t27);
+- m41 = _mm_adds_epu8(s46, t27);
+- m42 = _mm_adds_epu8(s47, t26);
+- a202 = _mm_min_epu8(m40, m39);
+- d17 = _mm_cmpeq_epi8(a202, m40);
+- a203 = _mm_min_epu8(m42, m41);
+- d18 = _mm_cmpeq_epi8(a203, m42);
+- s48 = _mm_movemask_epi8(_mm_unpacklo_epi8(d17,d18));
+- a204 = ((short int *) dec);
+- a205 = (8 * i9);
+- b16 = (a204 + a205);
+- a206 = (b16 + 4);
+- *(a206) = s48;
+- s49 = _mm_movemask_epi8(_mm_unpackhi_epi8(d17,d18));
+- a207 = (b16 + 5);
+- *(a207) = s49;
+- s50 = _mm_unpacklo_epi8(a202, a203);
+- s51 = _mm_unpackhi_epi8(a202, a203);
+- a208 = ((__m128i *) X);
+- *(a208) = s50;
+- a209 = (a208 + 1);
+- *(a209) = s51;
+- a210 = (a184 + 1);
+- s52 = *(a210);
+- a211 = (a184 + 3);
+- s53 = *(a211);
+- a212 = (a190 + 1);
+- a213 = *(a212);
+- a214 = _mm_xor_si128(a189, a213);
+- a215 = (a190 + 3);
+- a216 = *(a215);
+- a217 = _mm_xor_si128(a195, a216);
+- t28 = _mm_avg_epu8(a214,a217);
+- a218 = ((__m128i ) t28);
+- a219 = _mm_srli_epi16(a218, 2);
+- a220 = ((__m128i ) a219);
+- t29 = _mm_and_si128(a220, _mm_set_epi8(63, 63, 63, 63, 63, 63, 63
+- , 63, 63, 63, 63, 63, 63, 63, 63
+- , 63));
+- t30 = _mm_subs_epu8(_mm_set_epi8(63, 63, 63, 63, 63, 63, 63
+- , 63, 63, 63, 63, 63, 63, 63, 63
+- , 63), t29);
+- m43 = _mm_adds_epu8(s52, t29);
+- m44 = _mm_adds_epu8(s53, t30);
+- m45 = _mm_adds_epu8(s52, t30);
+- m46 = _mm_adds_epu8(s53, t29);
+- a221 = _mm_min_epu8(m44, m43);
+- d19 = _mm_cmpeq_epi8(a221, m44);
+- a222 = _mm_min_epu8(m46, m45);
+- d20 = _mm_cmpeq_epi8(a222, m46);
+- s54 = _mm_movemask_epi8(_mm_unpacklo_epi8(d19,d20));
+- a223 = (b16 + 6);
+- *(a223) = s54;
+- s55 = _mm_movemask_epi8(_mm_unpackhi_epi8(d19,d20));
+- a224 = (b16 + 7);
+- *(a224) = s55;
+- s56 = _mm_unpacklo_epi8(a221, a222);
+- s57 = _mm_unpackhi_epi8(a221, a222);
+- a225 = (a208 + 2);
+- *(a225) = s56;
+- a226 = (a208 + 3);
+- *(a226) = s57;
+- if ((((unsigned char *) X)[0]>210)) {
+- __m128i m12, m13;
+- m12 = ((__m128i *) X)[0];
+- m12 = _mm_min_epu8(m12, ((__m128i *) X)[1]);
+- m12 = _mm_min_epu8(m12, ((__m128i *) X)[2]);
+- m12 = _mm_min_epu8(m12, ((__m128i *) X)[3]);
+- __m128i m14;
+- m14 = _mm_min_epu8(_mm_srli_si128(m12, 8), m12);
+- m14 = ((__m128i ) _mm_min_epu8(((__m128i ) _mm_srli_epi64(m14, 32)), ((__m128i ) m14)));
+- m14 = ((__m128i ) _mm_min_epu8(((__m128i ) _mm_srli_epi64(m14, 16)), ((__m128i ) m14)));
+- m14 = ((__m128i ) _mm_min_epu8(((__m128i ) _mm_srli_epi64(m14, 8)), ((__m128i ) m14)));
+- m14 = _mm_unpacklo_epi8(m14, m14);
+- m14 = _mm_shufflelo_epi16(m14, _MM_SHUFFLE(0, 0, 0, 0));
+- m13 = _mm_unpacklo_epi64(m14, m14);
+- ((__m128i *) X)[0] = _mm_subs_epu8(((__m128i *) X)[0], m13);
+- ((__m128i *) X)[1] = _mm_subs_epu8(((__m128i *) X)[1], m13);
+- ((__m128i *) X)[2] = _mm_subs_epu8(((__m128i *) X)[2], m13);
+- ((__m128i *) X)[3] = _mm_subs_epu8(((__m128i *) X)[3], m13);
++ unsigned int i9;
++ for (i9 = 0; i9 < ((framebits + excess) >> 1); i9++) {
++ unsigned char a75, a81;
++ int a73, a92;
++ short int s20, s21, s26, s27;
++ unsigned char *a74, *a80, *b6;
++ short int *a110, *a111, *a91, *a93, *a94;
++ __m128i *a102, *a112, *a113, *a71, *a72, *a77, *a83, *a95, *a96, *a97, *a98, *a99;
++ __m128i a105, a106, a86, a87;
++ __m128i a100, a101, a103, a104, a107, a108, a109, a76, a78, a79, a82, a84, a85,
++ a88, a89, a90, d10, d11, d12, d9, m23, m24, m25, m26, m27, m28, m29, m30, s18,
++ s19, s22, s23, s24, s25, s28, s29, t13, t14, t15, t16, t17, t18;
++ a71 = ((__m128i*)X);
++ s18 = *(a71);
++ a72 = (a71 + 2);
++ s19 = *(a72);
++ a73 = (4 * i9);
++ a74 = (syms + a73);
++ a75 = *(a74);
++ a76 = _mm_set1_epi8(a75);
++ a77 = ((__m128i*)Branchtab);
++ a78 = *(a77);
++ a79 = _mm_xor_si128(a76, a78);
++ b6 = (a73 + syms);
++ a80 = (b6 + 1);
++ a81 = *(a80);
++ a82 = _mm_set1_epi8(a81);
++ a83 = (a77 + 2);
++ a84 = *(a83);
++ a85 = _mm_xor_si128(a82, a84);
++ t13 = _mm_avg_epu8(a79, a85);
++ a86 = ((__m128i)t13);
++ a87 = _mm_srli_epi16(a86, 2);
++ a88 = ((__m128i)a87);
++ t14 = _mm_and_si128(
++ a88,
++ _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63));
++ t15 = _mm_subs_epu8(
++ _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63),
++ t14);
++ m23 = _mm_adds_epu8(s18, t14);
++ m24 = _mm_adds_epu8(s19, t15);
++ m25 = _mm_adds_epu8(s18, t15);
++ m26 = _mm_adds_epu8(s19, t14);
++ a89 = _mm_min_epu8(m24, m23);
++ d9 = _mm_cmpeq_epi8(a89, m24);
++ a90 = _mm_min_epu8(m26, m25);
++ d10 = _mm_cmpeq_epi8(a90, m26);
++ s20 = _mm_movemask_epi8(_mm_unpacklo_epi8(d9, d10));
++ a91 = ((short int*)dec);
++ a92 = (8 * i9);
++ a93 = (a91 + a92);
++ *(a93) = s20;
++ s21 = _mm_movemask_epi8(_mm_unpackhi_epi8(d9, d10));
++ a94 = (a93 + 1);
++ *(a94) = s21;
++ s22 = _mm_unpacklo_epi8(a89, a90);
++ s23 = _mm_unpackhi_epi8(a89, a90);
++ a95 = ((__m128i*)Y);
++ *(a95) = s22;
++ a96 = (a95 + 1);
++ *(a96) = s23;
++ a97 = (a71 + 1);
++ s24 = *(a97);
++ a98 = (a71 + 3);
++ s25 = *(a98);
++ a99 = (a77 + 1);
++ a100 = *(a99);
++ a101 = _mm_xor_si128(a76, a100);
++ a102 = (a77 + 3);
++ a103 = *(a102);
++ a104 = _mm_xor_si128(a82, a103);
++ t16 = _mm_avg_epu8(a101, a104);
++ a105 = ((__m128i)t16);
++ a106 = _mm_srli_epi16(a105, 2);
++ a107 = ((__m128i)a106);
++ t17 = _mm_and_si128(
++ a107,
++ _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63));
++ t18 = _mm_subs_epu8(
++ _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63),
++ t17);
++ m27 = _mm_adds_epu8(s24, t17);
++ m28 = _mm_adds_epu8(s25, t18);
++ m29 = _mm_adds_epu8(s24, t18);
++ m30 = _mm_adds_epu8(s25, t17);
++ a108 = _mm_min_epu8(m28, m27);
++ d11 = _mm_cmpeq_epi8(a108, m28);
++ a109 = _mm_min_epu8(m30, m29);
++ d12 = _mm_cmpeq_epi8(a109, m30);
++ s26 = _mm_movemask_epi8(_mm_unpacklo_epi8(d11, d12));
++ a110 = (a93 + 2);
++ *(a110) = s26;
++ s27 = _mm_movemask_epi8(_mm_unpackhi_epi8(d11, d12));
++ a111 = (a93 + 3);
++ *(a111) = s27;
++ s28 = _mm_unpacklo_epi8(a108, a109);
++ s29 = _mm_unpackhi_epi8(a108, a109);
++ a112 = (a95 + 2);
++ *(a112) = s28;
++ a113 = (a95 + 3);
++ *(a113) = s29;
++ if ((((unsigned char*)Y)[0] > 210)) {
++ __m128i m5, m6;
++ m5 = ((__m128i*)Y)[0];
++ m5 = _mm_min_epu8(m5, ((__m128i*)Y)[1]);
++ m5 = _mm_min_epu8(m5, ((__m128i*)Y)[2]);
++ m5 = _mm_min_epu8(m5, ((__m128i*)Y)[3]);
++ __m128i m7;
++ m7 = _mm_min_epu8(_mm_srli_si128(m5, 8), m5);
++ m7 =
++ ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m7, 32)), ((__m128i)m7)));
++ m7 =
++ ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m7, 16)), ((__m128i)m7)));
++ m7 = ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m7, 8)), ((__m128i)m7)));
++ m7 = _mm_unpacklo_epi8(m7, m7);
++ m7 = _mm_shufflelo_epi16(m7, _MM_SHUFFLE(0, 0, 0, 0));
++ m6 = _mm_unpacklo_epi64(m7, m7);
++ ((__m128i*)Y)[0] = _mm_subs_epu8(((__m128i*)Y)[0], m6);
++ ((__m128i*)Y)[1] = _mm_subs_epu8(((__m128i*)Y)[1], m6);
++ ((__m128i*)Y)[2] = _mm_subs_epu8(((__m128i*)Y)[2], m6);
++ ((__m128i*)Y)[3] = _mm_subs_epu8(((__m128i*)Y)[3], m6);
++ }
++ unsigned char a188, a194;
++ int a186, a205;
++ short int s48, s49, s54, s55;
++ unsigned char *a187, *a193, *b15;
++ short int *a204, *a206, *a207, *a223, *a224, *b16;
++ __m128i *a184, *a185, *a190, *a196, *a208, *a209, *a210, *a211, *a212, *a215,
++ *a225, *a226;
++ __m128i a199, a200, a218, a219;
++ __m128i a189, a191, a192, a195, a197, a198, a201, a202, a203, a213, a214, a216,
++ a217, a220, a221, a222, d17, d18, d19, d20, m39, m40, m41, m42, m43, m44, m45,
++ m46, s46, s47, s50, s51, s52, s53, s56, s57, t25, t26, t27, t28, t29, t30;
++ a184 = ((__m128i*)Y);
++ s46 = *(a184);
++ a185 = (a184 + 2);
++ s47 = *(a185);
++ a186 = (4 * i9);
++ b15 = (a186 + syms);
++ a187 = (b15 + 2);
++ a188 = *(a187);
++ a189 = _mm_set1_epi8(a188);
++ a190 = ((__m128i*)Branchtab);
++ a191 = *(a190);
++ a192 = _mm_xor_si128(a189, a191);
++ a193 = (b15 + 3);
++ a194 = *(a193);
++ a195 = _mm_set1_epi8(a194);
++ a196 = (a190 + 2);
++ a197 = *(a196);
++ a198 = _mm_xor_si128(a195, a197);
++ t25 = _mm_avg_epu8(a192, a198);
++ a199 = ((__m128i)t25);
++ a200 = _mm_srli_epi16(a199, 2);
++ a201 = ((__m128i)a200);
++ t26 = _mm_and_si128(
++ a201,
++ _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63));
++ t27 = _mm_subs_epu8(
++ _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63),
++ t26);
++ m39 = _mm_adds_epu8(s46, t26);
++ m40 = _mm_adds_epu8(s47, t27);
++ m41 = _mm_adds_epu8(s46, t27);
++ m42 = _mm_adds_epu8(s47, t26);
++ a202 = _mm_min_epu8(m40, m39);
++ d17 = _mm_cmpeq_epi8(a202, m40);
++ a203 = _mm_min_epu8(m42, m41);
++ d18 = _mm_cmpeq_epi8(a203, m42);
++ s48 = _mm_movemask_epi8(_mm_unpacklo_epi8(d17, d18));
++ a204 = ((short int*)dec);
++ a205 = (8 * i9);
++ b16 = (a204 + a205);
++ a206 = (b16 + 4);
++ *(a206) = s48;
++ s49 = _mm_movemask_epi8(_mm_unpackhi_epi8(d17, d18));
++ a207 = (b16 + 5);
++ *(a207) = s49;
++ s50 = _mm_unpacklo_epi8(a202, a203);
++ s51 = _mm_unpackhi_epi8(a202, a203);
++ a208 = ((__m128i*)X);
++ *(a208) = s50;
++ a209 = (a208 + 1);
++ *(a209) = s51;
++ a210 = (a184 + 1);
++ s52 = *(a210);
++ a211 = (a184 + 3);
++ s53 = *(a211);
++ a212 = (a190 + 1);
++ a213 = *(a212);
++ a214 = _mm_xor_si128(a189, a213);
++ a215 = (a190 + 3);
++ a216 = *(a215);
++ a217 = _mm_xor_si128(a195, a216);
++ t28 = _mm_avg_epu8(a214, a217);
++ a218 = ((__m128i)t28);
++ a219 = _mm_srli_epi16(a218, 2);
++ a220 = ((__m128i)a219);
++ t29 = _mm_and_si128(
++ a220,
++ _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63));
++ t30 = _mm_subs_epu8(
++ _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63),
++ t29);
++ m43 = _mm_adds_epu8(s52, t29);
++ m44 = _mm_adds_epu8(s53, t30);
++ m45 = _mm_adds_epu8(s52, t30);
++ m46 = _mm_adds_epu8(s53, t29);
++ a221 = _mm_min_epu8(m44, m43);
++ d19 = _mm_cmpeq_epi8(a221, m44);
++ a222 = _mm_min_epu8(m46, m45);
++ d20 = _mm_cmpeq_epi8(a222, m46);
++ s54 = _mm_movemask_epi8(_mm_unpacklo_epi8(d19, d20));
++ a223 = (b16 + 6);
++ *(a223) = s54;
++ s55 = _mm_movemask_epi8(_mm_unpackhi_epi8(d19, d20));
++ a224 = (b16 + 7);
++ *(a224) = s55;
++ s56 = _mm_unpacklo_epi8(a221, a222);
++ s57 = _mm_unpackhi_epi8(a221, a222);
++ a225 = (a208 + 2);
++ *(a225) = s56;
++ a226 = (a208 + 3);
++ *(a226) = s57;
++ if ((((unsigned char*)X)[0] > 210)) {
++ __m128i m12, m13;
++ m12 = ((__m128i*)X)[0];
++ m12 = _mm_min_epu8(m12, ((__m128i*)X)[1]);
++ m12 = _mm_min_epu8(m12, ((__m128i*)X)[2]);
++ m12 = _mm_min_epu8(m12, ((__m128i*)X)[3]);
++ __m128i m14;
++ m14 = _mm_min_epu8(_mm_srli_si128(m12, 8), m12);
++ m14 = ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m14, 32)),
++ ((__m128i)m14)));
++ m14 = ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m14, 16)),
++ ((__m128i)m14)));
++ m14 = ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m14, 8)),
++ ((__m128i)m14)));
++ m14 = _mm_unpacklo_epi8(m14, m14);
++ m14 = _mm_shufflelo_epi16(m14, _MM_SHUFFLE(0, 0, 0, 0));
++ m13 = _mm_unpacklo_epi64(m14, m14);
++ ((__m128i*)X)[0] = _mm_subs_epu8(((__m128i*)X)[0], m13);
++ ((__m128i*)X)[1] = _mm_subs_epu8(((__m128i*)X)[1], m13);
++ ((__m128i*)X)[2] = _mm_subs_epu8(((__m128i*)X)[2], m13);
++ ((__m128i*)X)[3] = _mm_subs_epu8(((__m128i*)X)[3], m13);
++ }
+ }
+- }
+-
+- renormalize(X, 210);
+
+- /*int ch;
+- for(ch = 0; ch < 64; ch++) {
+- printf("%d,", X[ch]);
+- }
+- printf("\n");*/
+-
+- unsigned int j;
+- for(j=0; j < (framebits + excess) % 2; ++j) {
+- int i;
+- for(i=0;i<64/2;i++){
+- BFLY(i, (((framebits+excess) >> 1) << 1) + j , syms, Y, X, (decision_t *)dec, Branchtab);
+- }
++ renormalize(X, 210);
+
+-
+- renormalize(Y, 210);
+-
+- /*printf("\n");
++ /*int ch;
+ for(ch = 0; ch < 64; ch++) {
+- printf("%d,", Y[ch]);
++ printf("%d,", X[ch]);
+ }
+ printf("\n");*/
+
+- }
+- /*skip*/
++ unsigned int j;
++ for (j = 0; j < (framebits + excess) % 2; ++j) {
++ int i;
++ for (i = 0; i < 64 / 2; i++) {
++ BFLY(i,
++ (((framebits + excess) >> 1) << 1) + j,
++ syms,
++ Y,
++ X,
++ (decision_t*)dec,
++ Branchtab);
++ }
++
++
++ renormalize(Y, 210);
++
++ /*printf("\n");
++ for(ch = 0; ch < 64; ch++) {
++ printf("%d,", Y[ch]);
++ }
++ printf("\n");*/
++ }
++ /*skip*/
+ }
+
+ #endif /*LV_HAVE_SSE3*/
+@@ -612,30 +633,32 @@ volk_8u_x4_conv_k7_r2_8u_spiral(unsigned char* Y, unsigned char* X,
+
+ #if LV_HAVE_GENERIC
+
+-static inline void
+-volk_8u_x4_conv_k7_r2_8u_generic(unsigned char* Y, unsigned char* X,
+- unsigned char* syms, unsigned char* dec,
+- unsigned int framebits, unsigned int excess,
+- unsigned char* Branchtab)
++static inline void volk_8u_x4_conv_k7_r2_8u_generic(unsigned char* Y,
++ unsigned char* X,
++ unsigned char* syms,
++ unsigned char* dec,
++ unsigned int framebits,
++ unsigned int excess,
++ unsigned char* Branchtab)
+ {
+- int nbits = framebits + excess;
+- int NUMSTATES = 64;
+- int RENORMALIZE_THRESHOLD = 210;
+-
+- int s,i;
+- for (s=0;s<nbits;s++){
+- void *tmp;
+- for(i=0;i<NUMSTATES/2;i++){
+- BFLY(i, s, syms, Y, X, (decision_t *)dec, Branchtab);
++ int nbits = framebits + excess;
++ int NUMSTATES = 64;
++ int RENORMALIZE_THRESHOLD = 210;
++
++ int s, i;
++ for (s = 0; s < nbits; s++) {
++ void* tmp;
++ for (i = 0; i < NUMSTATES / 2; i++) {
++ BFLY(i, s, syms, Y, X, (decision_t*)dec, Branchtab);
++ }
++
++ renormalize(Y, RENORMALIZE_THRESHOLD);
++
++ /// Swap pointers to old and new metrics
++ tmp = (void*)X;
++ X = Y;
++ Y = (unsigned char*)tmp;
+ }
+-
+- renormalize(Y, RENORMALIZE_THRESHOLD);
+-
+- /// Swap pointers to old and new metrics
+- tmp = (void *)X;
+- X = Y;
+- Y = (unsigned char*)tmp;
+- }
+ }
+
+ #endif /* LV_HAVE_GENERIC */
+diff --git a/lib/kernel_tests.h b/lib/kernel_tests.h
+index 8552488..51be069 100644
+--- a/lib/kernel_tests.h
++++ b/lib/kernel_tests.h
+@@ -8,13 +8,18 @@
+
+ // for puppets we need to get all the func_variants for the puppet and just
+ // keep track of the actual function name to write to results
+-#define VOLK_INIT_PUPP(func, puppet_master_func, test_params)\
+- volk_test_case_t(func##_get_func_desc(), (void(*)())func##_manual, std::string(#func),\
+- std::string(#puppet_master_func), test_params)
++#define VOLK_INIT_PUPP(func, puppet_master_func, test_params) \
++ volk_test_case_t(func##_get_func_desc(), \
++ (void (*)())func##_manual, \
++ std::string(#func), \
++ std::string(#puppet_master_func), \
++ test_params)
+
+-#define VOLK_INIT_TEST(func, test_params)\
+- volk_test_case_t(func##_get_func_desc(), (void(*)())func##_manual, std::string(#func),\
+- test_params)
++#define VOLK_INIT_TEST(func, test_params) \
++ volk_test_case_t(func##_get_func_desc(), \
++ (void (*)())func##_manual, \
++ std::string(#func), \
++ test_params)
+
+ #define QA(test) test_cases.push_back(test);
+ std::vector<volk_test_case_t> init_test_list(volk_test_params_t test_params)
+@@ -32,127 +37,135 @@ std::vector<volk_test_case_t> init_test_list(volk_test_params_t test_params)
+ test_params_rotator.set_tol(1e-3);
+
+ std::vector<volk_test_case_t> test_cases;
+- QA(VOLK_INIT_PUPP(volk_64u_popcntpuppet_64u, volk_64u_popcnt, test_params))
+- QA(VOLK_INIT_PUPP(volk_64u_popcntpuppet_64u, volk_64u_popcnt, test_params))
+- QA(VOLK_INIT_PUPP(volk_64u_popcntpuppet_64u, volk_64u_popcnt, test_params))
++ QA(VOLK_INIT_PUPP(volk_64u_popcntpuppet_64u, volk_64u_popcnt, test_params))
++ QA(VOLK_INIT_PUPP(volk_64u_popcntpuppet_64u, volk_64u_popcnt, test_params))
++ QA(VOLK_INIT_PUPP(volk_64u_popcntpuppet_64u, volk_64u_popcnt, test_params))
+ QA(VOLK_INIT_PUPP(volk_16u_byteswappuppet_16u, volk_16u_byteswap, test_params))
+ QA(VOLK_INIT_PUPP(volk_32u_byteswappuppet_32u, volk_32u_byteswap, test_params))
+- QA(VOLK_INIT_PUPP(volk_32u_popcntpuppet_32u, volk_32u_popcnt_32u, test_params))
++ QA(VOLK_INIT_PUPP(volk_32u_popcntpuppet_32u, volk_32u_popcnt_32u, test_params))
+ QA(VOLK_INIT_PUPP(volk_64u_byteswappuppet_64u, volk_64u_byteswap, test_params))
+- QA(VOLK_INIT_PUPP(volk_32fc_s32fc_rotatorpuppet_32fc, volk_32fc_s32fc_x2_rotator_32fc, test_params_rotator))
+- QA(VOLK_INIT_PUPP(volk_8u_conv_k7_r2puppet_8u, volk_8u_x4_conv_k7_r2_8u, test_params.make_tol(0)))
+- QA(VOLK_INIT_PUPP(volk_32f_x2_fm_detectpuppet_32f, volk_32f_s32f_32f_fm_detect_32f, test_params))
+- QA(VOLK_INIT_TEST(volk_16ic_s32f_deinterleave_real_32f, test_params))
+- QA(VOLK_INIT_TEST(volk_16ic_deinterleave_real_8i, test_params))
+- QA(VOLK_INIT_TEST(volk_16ic_deinterleave_16i_x2, test_params))
+- QA(VOLK_INIT_TEST(volk_16ic_s32f_deinterleave_32f_x2, test_params))
+- QA(VOLK_INIT_TEST(volk_16ic_deinterleave_real_16i, test_params))
+- QA(VOLK_INIT_TEST(volk_16ic_magnitude_16i, test_params))
+- QA(VOLK_INIT_TEST(volk_16ic_s32f_magnitude_32f, test_params))
+- QA(VOLK_INIT_TEST(volk_16ic_convert_32fc, test_params))
+- QA(VOLK_INIT_TEST(volk_16ic_x2_multiply_16ic, test_params))
+- QA(VOLK_INIT_TEST(volk_16ic_x2_dot_prod_16ic, test_params))
+- QA(VOLK_INIT_TEST(volk_16i_s32f_convert_32f, test_params))
+- QA(VOLK_INIT_TEST(volk_16i_convert_8i, test_params))
+- QA(VOLK_INIT_TEST(volk_16i_32fc_dot_prod_32fc, test_params_inacc))
+- QA(VOLK_INIT_TEST(volk_32f_accumulator_s32f, test_params_inacc))
+- QA(VOLK_INIT_TEST(volk_32f_x2_add_32f, test_params))
+- QA(VOLK_INIT_TEST(volk_32f_index_max_16u, test_params))
+- QA(VOLK_INIT_TEST(volk_32f_index_max_32u, test_params))
+- QA(VOLK_INIT_TEST(volk_32fc_32f_multiply_32fc, test_params))
+- QA(VOLK_INIT_TEST(volk_32fc_32f_add_32fc, test_params))
+- QA(VOLK_INIT_TEST(volk_32f_log2_32f, test_params.make_absolute(1e-5)))
+- QA(VOLK_INIT_TEST(volk_32f_expfast_32f, test_params_inacc_tenth))
+- QA(VOLK_INIT_TEST(volk_32f_x2_pow_32f, test_params_inacc))
+- QA(VOLK_INIT_TEST(volk_32f_sin_32f, test_params_inacc))
+- QA(VOLK_INIT_TEST(volk_32f_cos_32f, test_params_inacc))
+- QA(VOLK_INIT_TEST(volk_32f_tan_32f, test_params_inacc))
+- QA(VOLK_INIT_TEST(volk_32f_atan_32f, test_params_inacc))
+- QA(VOLK_INIT_TEST(volk_32f_asin_32f, test_params_inacc))
+- QA(VOLK_INIT_TEST(volk_32f_acos_32f, test_params_inacc))
+- QA(VOLK_INIT_TEST(volk_32fc_s32f_power_32fc, test_params_power))
+- QA(VOLK_INIT_TEST(volk_32f_s32f_calc_spectral_noise_floor_32f, test_params_inacc))
+- QA(VOLK_INIT_TEST(volk_32fc_s32f_atan2_32f, test_params))
+- QA(VOLK_INIT_TEST(volk_32fc_x2_conjugate_dot_prod_32fc, test_params_inacc_tenth))
+- QA(VOLK_INIT_TEST(volk_32fc_deinterleave_32f_x2, test_params))
+- QA(VOLK_INIT_TEST(volk_32fc_deinterleave_64f_x2, test_params))
+- QA(VOLK_INIT_TEST(volk_32fc_s32f_deinterleave_real_16i, test_params))
+- QA(VOLK_INIT_TEST(volk_32fc_deinterleave_imag_32f, test_params))
+- QA(VOLK_INIT_TEST(volk_32fc_deinterleave_real_32f, test_params))
+- QA(VOLK_INIT_TEST(volk_32fc_deinterleave_real_64f, test_params))
+- QA(VOLK_INIT_TEST(volk_32fc_x2_dot_prod_32fc, test_params_inacc))
+- QA(VOLK_INIT_TEST(volk_32fc_32f_dot_prod_32fc, test_params_inacc))
+- QA(VOLK_INIT_TEST(volk_32fc_index_max_16u, test_params))
+- QA(VOLK_INIT_TEST(volk_32fc_index_max_32u, test_params))
+- QA(VOLK_INIT_TEST(volk_32fc_s32f_magnitude_16i, test_params))
+- QA(VOLK_INIT_TEST(volk_32fc_magnitude_32f, test_params_inacc_tenth))
+- QA(VOLK_INIT_TEST(volk_32fc_magnitude_squared_32f, test_params))
+- QA(VOLK_INIT_TEST(volk_32fc_x2_add_32fc, test_params))
+- QA(VOLK_INIT_TEST(volk_32fc_x2_multiply_32fc, test_params))
+- QA(VOLK_INIT_TEST(volk_32fc_x2_multiply_conjugate_32fc, test_params))
+- QA(VOLK_INIT_TEST(volk_32fc_x2_divide_32fc, test_params))
+- QA(VOLK_INIT_TEST(volk_32fc_conjugate_32fc, test_params))
+- QA(VOLK_INIT_TEST(volk_32f_s32f_convert_16i, test_params))
+- QA(VOLK_INIT_TEST(volk_32f_s32f_convert_32i, test_params))
+- QA(VOLK_INIT_TEST(volk_32f_convert_64f, test_params))
+- QA(VOLK_INIT_TEST(volk_32f_s32f_convert_8i, test_params))
+- QA(VOLK_INIT_TEST(volk_32fc_convert_16ic, test_params))
+- QA(VOLK_INIT_TEST(volk_32fc_s32f_power_spectrum_32f, test_params))
+- QA(VOLK_INIT_TEST(volk_32fc_x2_square_dist_32f, test_params))
+- QA(VOLK_INIT_TEST(volk_32fc_x2_s32f_square_dist_scalar_mult_32f, test_params))
+- QA(VOLK_INIT_TEST(volk_32f_x2_divide_32f, test_params))
+- QA(VOLK_INIT_TEST(volk_32f_x2_dot_prod_32f, test_params_inacc))
+- QA(VOLK_INIT_TEST(volk_32f_x2_s32f_interleave_16ic, test_params))
+- QA(VOLK_INIT_TEST(volk_32f_x2_interleave_32fc, test_params))
+- QA(VOLK_INIT_TEST(volk_32f_x2_max_32f, test_params))
+- QA(VOLK_INIT_TEST(volk_32f_x2_min_32f, test_params))
+- QA(VOLK_INIT_TEST(volk_32f_x2_multiply_32f, test_params))
+- QA(VOLK_INIT_TEST(volk_32f_64f_multiply_64f, test_params))
+- QA(VOLK_INIT_TEST(volk_32f_64f_add_64f, test_params))
+- QA(VOLK_INIT_TEST(volk_32f_s32f_normalize, test_params))
+- QA(VOLK_INIT_TEST(volk_32f_s32f_power_32f, test_params))
+- QA(VOLK_INIT_TEST(volk_32f_sqrt_32f, test_params_inacc))
+- QA(VOLK_INIT_TEST(volk_32f_s32f_stddev_32f, test_params_inacc))
+- QA(VOLK_INIT_TEST(volk_32f_stddev_and_mean_32f_x2, test_params_inacc))
+- QA(VOLK_INIT_TEST(volk_32f_x2_subtract_32f, test_params))
+- QA(VOLK_INIT_TEST(volk_32f_x3_sum_of_poly_32f, test_params_inacc))
+- QA(VOLK_INIT_TEST(volk_32i_x2_and_32i, test_params))
+- QA(VOLK_INIT_TEST(volk_32i_s32f_convert_32f, test_params))
+- QA(VOLK_INIT_TEST(volk_32i_x2_or_32i, test_params))
+- QA(VOLK_INIT_TEST(volk_32f_x2_dot_prod_16i, test_params))
+- QA(VOLK_INIT_TEST(volk_64f_convert_32f, test_params))
+- QA(VOLK_INIT_TEST(volk_64f_x2_max_64f, test_params))
+- QA(VOLK_INIT_TEST(volk_64f_x2_min_64f, test_params))
+- QA(VOLK_INIT_TEST(volk_64f_x2_multiply_64f, test_params))
+- QA(VOLK_INIT_TEST(volk_64f_x2_add_64f, test_params))
+- QA(VOLK_INIT_TEST(volk_8ic_deinterleave_16i_x2, test_params))
+- QA(VOLK_INIT_TEST(volk_8ic_s32f_deinterleave_32f_x2, test_params))
+- QA(VOLK_INIT_TEST(volk_8ic_deinterleave_real_16i, test_params))
+- QA(VOLK_INIT_TEST(volk_8ic_s32f_deinterleave_real_32f, test_params))
+- QA(VOLK_INIT_TEST(volk_8ic_deinterleave_real_8i, test_params))
+- QA(VOLK_INIT_TEST(volk_8ic_x2_multiply_conjugate_16ic, test_params))
+- QA(VOLK_INIT_TEST(volk_8ic_x2_s32f_multiply_conjugate_32fc, test_params))
+- QA(VOLK_INIT_TEST(volk_8i_convert_16i, test_params))
+- QA(VOLK_INIT_TEST(volk_8i_s32f_convert_32f, test_params))
+- QA(VOLK_INIT_TEST(volk_32fc_s32fc_multiply_32fc, test_params))
+- QA(VOLK_INIT_TEST(volk_32f_s32f_multiply_32f, test_params))
+- QA(VOLK_INIT_TEST(volk_32f_binary_slicer_32i, test_params))
+- QA(VOLK_INIT_TEST(volk_32f_binary_slicer_8i, test_params))
+- QA(VOLK_INIT_TEST(volk_32u_reverse_32u, test_params))
+- QA(VOLK_INIT_TEST(volk_32f_tanh_32f, test_params_inacc))
+- QA(VOLK_INIT_TEST(volk_32f_s32f_mod_rangepuppet_32f, test_params))
++ QA(VOLK_INIT_PUPP(volk_32fc_s32fc_rotatorpuppet_32fc,
++ volk_32fc_s32fc_x2_rotator_32fc,
++ test_params_rotator))
++ QA(VOLK_INIT_PUPP(
++ volk_8u_conv_k7_r2puppet_8u, volk_8u_x4_conv_k7_r2_8u, test_params.make_tol(0)))
++ QA(VOLK_INIT_PUPP(
++ volk_32f_x2_fm_detectpuppet_32f, volk_32f_s32f_32f_fm_detect_32f, test_params))
++ QA(VOLK_INIT_TEST(volk_16ic_s32f_deinterleave_real_32f, test_params))
++ QA(VOLK_INIT_TEST(volk_16ic_deinterleave_real_8i, test_params))
++ QA(VOLK_INIT_TEST(volk_16ic_deinterleave_16i_x2, test_params))
++ QA(VOLK_INIT_TEST(volk_16ic_s32f_deinterleave_32f_x2, test_params))
++ QA(VOLK_INIT_TEST(volk_16ic_deinterleave_real_16i, test_params))
++ QA(VOLK_INIT_TEST(volk_16ic_magnitude_16i, test_params))
++ QA(VOLK_INIT_TEST(volk_16ic_s32f_magnitude_32f, test_params))
++ QA(VOLK_INIT_TEST(volk_16ic_convert_32fc, test_params))
++ QA(VOLK_INIT_TEST(volk_16ic_x2_multiply_16ic, test_params))
++ QA(VOLK_INIT_TEST(volk_16ic_x2_dot_prod_16ic, test_params))
++ QA(VOLK_INIT_TEST(volk_16i_s32f_convert_32f, test_params))
++ QA(VOLK_INIT_TEST(volk_16i_convert_8i, test_params))
++ QA(VOLK_INIT_TEST(volk_16i_32fc_dot_prod_32fc, test_params_inacc))
++ QA(VOLK_INIT_TEST(volk_32f_accumulator_s32f, test_params_inacc))
++ QA(VOLK_INIT_TEST(volk_32f_x2_add_32f, test_params))
++ QA(VOLK_INIT_TEST(volk_32f_index_max_16u, test_params))
++ QA(VOLK_INIT_TEST(volk_32f_index_max_32u, test_params))
++ QA(VOLK_INIT_TEST(volk_32fc_32f_multiply_32fc, test_params))
++ QA(VOLK_INIT_TEST(volk_32fc_32f_add_32fc, test_params))
++ QA(VOLK_INIT_TEST(volk_32f_log2_32f, test_params.make_absolute(1e-5)))
++ QA(VOLK_INIT_TEST(volk_32f_expfast_32f, test_params_inacc_tenth))
++ QA(VOLK_INIT_TEST(volk_32f_x2_pow_32f, test_params_inacc))
++ QA(VOLK_INIT_TEST(volk_32f_sin_32f, test_params_inacc))
++ QA(VOLK_INIT_TEST(volk_32f_cos_32f, test_params_inacc))
++ QA(VOLK_INIT_TEST(volk_32f_tan_32f, test_params_inacc))
++ QA(VOLK_INIT_TEST(volk_32f_atan_32f, test_params_inacc))
++ QA(VOLK_INIT_TEST(volk_32f_asin_32f, test_params_inacc))
++ QA(VOLK_INIT_TEST(volk_32f_acos_32f, test_params_inacc))
++ QA(VOLK_INIT_TEST(volk_32fc_s32f_power_32fc, test_params_power))
++ QA(VOLK_INIT_TEST(volk_32f_s32f_calc_spectral_noise_floor_32f, test_params_inacc))
++ QA(VOLK_INIT_TEST(volk_32fc_s32f_atan2_32f, test_params))
++ QA(VOLK_INIT_TEST(volk_32fc_x2_conjugate_dot_prod_32fc, test_params_inacc_tenth))
++ QA(VOLK_INIT_TEST(volk_32fc_deinterleave_32f_x2, test_params))
++ QA(VOLK_INIT_TEST(volk_32fc_deinterleave_64f_x2, test_params))
++ QA(VOLK_INIT_TEST(volk_32fc_s32f_deinterleave_real_16i, test_params))
++ QA(VOLK_INIT_TEST(volk_32fc_deinterleave_imag_32f, test_params))
++ QA(VOLK_INIT_TEST(volk_32fc_deinterleave_real_32f, test_params))
++ QA(VOLK_INIT_TEST(volk_32fc_deinterleave_real_64f, test_params))
++ QA(VOLK_INIT_TEST(volk_32fc_x2_dot_prod_32fc, test_params_inacc))
++ QA(VOLK_INIT_TEST(volk_32fc_32f_dot_prod_32fc, test_params_inacc))
++ QA(VOLK_INIT_TEST(volk_32fc_index_max_16u, test_params))
++ QA(VOLK_INIT_TEST(volk_32fc_index_max_32u, test_params))
++ QA(VOLK_INIT_TEST(volk_32fc_s32f_magnitude_16i, test_params))
++ QA(VOLK_INIT_TEST(volk_32fc_magnitude_32f, test_params_inacc_tenth))
++ QA(VOLK_INIT_TEST(volk_32fc_magnitude_squared_32f, test_params))
++ QA(VOLK_INIT_TEST(volk_32fc_x2_add_32fc, test_params))
++ QA(VOLK_INIT_TEST(volk_32fc_x2_multiply_32fc, test_params))
++ QA(VOLK_INIT_TEST(volk_32fc_x2_multiply_conjugate_32fc, test_params))
++ QA(VOLK_INIT_TEST(volk_32fc_x2_divide_32fc, test_params))
++ QA(VOLK_INIT_TEST(volk_32fc_conjugate_32fc, test_params))
++ QA(VOLK_INIT_TEST(volk_32f_s32f_convert_16i, test_params))
++ QA(VOLK_INIT_TEST(volk_32f_s32f_convert_32i, test_params))
++ QA(VOLK_INIT_TEST(volk_32f_convert_64f, test_params))
++ QA(VOLK_INIT_TEST(volk_32f_s32f_convert_8i, test_params))
++ QA(VOLK_INIT_TEST(volk_32fc_convert_16ic, test_params))
++ QA(VOLK_INIT_TEST(volk_32fc_s32f_power_spectrum_32f, test_params))
++ QA(VOLK_INIT_TEST(volk_32fc_x2_square_dist_32f, test_params))
++ QA(VOLK_INIT_TEST(volk_32fc_x2_s32f_square_dist_scalar_mult_32f, test_params))
++ QA(VOLK_INIT_TEST(volk_32f_x2_divide_32f, test_params))
++ QA(VOLK_INIT_TEST(volk_32f_x2_dot_prod_32f, test_params_inacc))
++ QA(VOLK_INIT_TEST(volk_32f_x2_s32f_interleave_16ic, test_params))
++ QA(VOLK_INIT_TEST(volk_32f_x2_interleave_32fc, test_params))
++ QA(VOLK_INIT_TEST(volk_32f_x2_max_32f, test_params))
++ QA(VOLK_INIT_TEST(volk_32f_x2_min_32f, test_params))
++ QA(VOLK_INIT_TEST(volk_32f_x2_multiply_32f, test_params))
++ QA(VOLK_INIT_TEST(volk_32f_64f_multiply_64f, test_params))
++ QA(VOLK_INIT_TEST(volk_32f_64f_add_64f, test_params))
++ QA(VOLK_INIT_TEST(volk_32f_s32f_normalize, test_params))
++ QA(VOLK_INIT_TEST(volk_32f_s32f_power_32f, test_params))
++ QA(VOLK_INIT_TEST(volk_32f_sqrt_32f, test_params_inacc))
++ QA(VOLK_INIT_TEST(volk_32f_s32f_stddev_32f, test_params_inacc))
++ QA(VOLK_INIT_TEST(volk_32f_stddev_and_mean_32f_x2, test_params_inacc))
++ QA(VOLK_INIT_TEST(volk_32f_x2_subtract_32f, test_params))
++ QA(VOLK_INIT_TEST(volk_32f_x3_sum_of_poly_32f, test_params_inacc))
++ QA(VOLK_INIT_TEST(volk_32i_x2_and_32i, test_params))
++ QA(VOLK_INIT_TEST(volk_32i_s32f_convert_32f, test_params))
++ QA(VOLK_INIT_TEST(volk_32i_x2_or_32i, test_params))
++ QA(VOLK_INIT_TEST(volk_32f_x2_dot_prod_16i, test_params))
++ QA(VOLK_INIT_TEST(volk_64f_convert_32f, test_params))
++ QA(VOLK_INIT_TEST(volk_64f_x2_max_64f, test_params))
++ QA(VOLK_INIT_TEST(volk_64f_x2_min_64f, test_params))
++ QA(VOLK_INIT_TEST(volk_64f_x2_multiply_64f, test_params))
++ QA(VOLK_INIT_TEST(volk_64f_x2_add_64f, test_params))
++ QA(VOLK_INIT_TEST(volk_8ic_deinterleave_16i_x2, test_params))
++ QA(VOLK_INIT_TEST(volk_8ic_s32f_deinterleave_32f_x2, test_params))
++ QA(VOLK_INIT_TEST(volk_8ic_deinterleave_real_16i, test_params))
++ QA(VOLK_INIT_TEST(volk_8ic_s32f_deinterleave_real_32f, test_params))
++ QA(VOLK_INIT_TEST(volk_8ic_deinterleave_real_8i, test_params))
++ QA(VOLK_INIT_TEST(volk_8ic_x2_multiply_conjugate_16ic, test_params))
++ QA(VOLK_INIT_TEST(volk_8ic_x2_s32f_multiply_conjugate_32fc, test_params))
++ QA(VOLK_INIT_TEST(volk_8i_convert_16i, test_params))
++ QA(VOLK_INIT_TEST(volk_8i_s32f_convert_32f, test_params))
++ QA(VOLK_INIT_TEST(volk_32fc_s32fc_multiply_32fc, test_params))
++ QA(VOLK_INIT_TEST(volk_32f_s32f_multiply_32f, test_params))
++ QA(VOLK_INIT_TEST(volk_32f_binary_slicer_32i, test_params))
++ QA(VOLK_INIT_TEST(volk_32f_binary_slicer_8i, test_params))
++ QA(VOLK_INIT_TEST(volk_32u_reverse_32u, test_params))
++ QA(VOLK_INIT_TEST(volk_32f_tanh_32f, test_params_inacc))
++ QA(VOLK_INIT_TEST(volk_32f_s32f_mod_rangepuppet_32f, test_params))
+ QA(VOLK_INIT_TEST(volk_32fc_x2_s32fc_multiply_conjugate_add_32fc, test_params))
+- QA(VOLK_INIT_PUPP(volk_8u_x3_encodepolarpuppet_8u, volk_8u_x3_encodepolar_8u_x2, test_params))
+- QA(VOLK_INIT_PUPP(volk_32f_8u_polarbutterflypuppet_32f, volk_32f_8u_polarbutterfly_32f, test_params))
+- QA(VOLK_INIT_TEST(volk_32f_exp_32f, test_params))
+-
++ QA(VOLK_INIT_PUPP(
++ volk_8u_x3_encodepolarpuppet_8u, volk_8u_x3_encodepolar_8u_x2, test_params))
++ QA(VOLK_INIT_PUPP(volk_32f_8u_polarbutterflypuppet_32f,
++ volk_32f_8u_polarbutterfly_32f,
++ test_params))
+ // no one uses these, so don't test them
+- //VOLK_PROFILE(volk_16i_x5_add_quad_16i_x4, 1e-4, 2046, 10000, &results, benchmark_mode, kernel_regex);
+- //VOLK_PROFILE(volk_16i_branch_4_state_8, 1e-4, 2046, 10000, &results, benchmark_mode, kernel_regex);
+- //VOLK_PROFILE(volk_16i_max_star_16i, 0, 0, 204602, 10000, &results, benchmark_mode, kernel_regex);
+- //VOLK_PROFILE(volk_16i_max_star_horizontal_16i, 0, 0, 204602, 10000, &results, benchmark_mode, kernel_regex);
+- //VOLK_PROFILE(volk_16i_permute_and_scalar_add, 1e-4, 0, 2046, 10000, &results, benchmark_mode, kernel_regex);
+- //VOLK_PROFILE(volk_16i_x4_quad_max_star_16i, 1e-4, 0, 2046, 10000, &results, benchmark_mode, kernel_regex);
++ // VOLK_PROFILE(volk_16i_x5_add_quad_16i_x4, 1e-4, 2046, 10000, &results,
++ // benchmark_mode, kernel_regex); VOLK_PROFILE(volk_16i_branch_4_state_8, 1e-4, 2046,
++ // 10000, &results, benchmark_mode, kernel_regex); VOLK_PROFILE(volk_16i_max_star_16i,
++ // 0, 0, 204602, 10000, &results, benchmark_mode, kernel_regex);
++ // VOLK_PROFILE(volk_16i_max_star_horizontal_16i, 0, 0, 204602, 10000, &results,
++ // benchmark_mode, kernel_regex); VOLK_PROFILE(volk_16i_permute_and_scalar_add, 1e-4,
++ // 0, 2046, 10000, &results, benchmark_mode, kernel_regex);
++ // VOLK_PROFILE(volk_16i_x4_quad_max_star_16i, 1e-4, 0, 2046, 10000, &results,
++ // benchmark_mode, kernel_regex);
+ // we need a puppet for this one
+ //(VOLK_INIT_TEST(volk_32fc_s32f_x2_power_spectral_density_32f, test_params))
+
+diff --git a/lib/qa_utils.cc b/lib/qa_utils.cc
+index 76df069..1dcee6e 100644
+--- a/lib/qa_utils.cc
++++ b/lib/qa_utils.cc
+@@ -1,79 +1,94 @@
+-#include <volk/volk.h>
+ #include "qa_utils.h"
++#include <volk/volk.h>
+
+-#include <volk/volk.h> // for volk_func_desc_t
+-#include <volk/volk_malloc.h> // for volk_free, volk_m...
++#include <volk/volk.h> // for volk_func_desc_t
++#include <volk/volk_malloc.h> // for volk_free, volk_m...
+
+-#include <assert.h> // for assert
+-#include <stdint.h> // for uint16_t, uint64_t
+-#include <sys/time.h> // for CLOCKS_PER_SEC
+-#include <sys/types.h> // for int16_t, int32_t
++#include <assert.h> // for assert
++#include <stdint.h> // for uint16_t, uint64_t
++#include <sys/time.h> // for CLOCKS_PER_SEC
++#include <sys/types.h> // for int16_t, int32_t
+ #include <chrono>
+-#include <cmath> // for sqrt, fabs, abs
+-#include <cstring> // for memcpy, memset
+-#include <ctime> // for clock
+-#include <fstream> // for operator<<, basic...
+-#include <iostream> // for cout, cerr
+-#include <limits> // for numeric_limits
+-#include <map> // for map, map<>::mappe...
++#include <cmath> // for sqrt, fabs, abs
++#include <cstring> // for memcpy, memset
++#include <ctime> // for clock
++#include <fstream> // for operator<<, basic...
++#include <iostream> // for cout, cerr
++#include <limits> // for numeric_limits
++#include <map> // for map, map<>::mappe...
+ #include <random>
+-#include <vector> // for vector, _Bit_refe...
++#include <vector> // for vector, _Bit_refe...
+
+ template <typename T>
+-void random_floats(void *buf, unsigned int n, std::default_random_engine& rnd_engine)
++void random_floats(void* buf, unsigned int n, std::default_random_engine& rnd_engine)
+ {
+- T *array = static_cast<T*>(buf);
++ T* array = static_cast<T*>(buf);
+ std::uniform_real_distribution<T> uniform_dist(T(-1), T(1));
+- for(unsigned int i = 0; i < n; i++) {
++ for (unsigned int i = 0; i < n; i++) {
+ array[i] = uniform_dist(rnd_engine);
+ }
+ }
+
+-void load_random_data(void *data, volk_type_t type, unsigned int n) {
++void load_random_data(void* data, volk_type_t type, unsigned int n)
++{
+ std::random_device rnd_device;
+ std::default_random_engine rnd_engine(rnd_device());
+- if(type.is_complex) n *= 2;
+- if(type.is_float) {
+- if(type.size == 8) {
++ if (type.is_complex)
++ n *= 2;
++ if (type.is_float) {
++ if (type.size == 8) {
+ random_floats<double>(data, n, rnd_engine);
+ } else {
+- random_floats<float> (data, n, rnd_engine);
++ random_floats<float>(data, n, rnd_engine);
+ }
+ } else {
+- float int_max = float(uint64_t(2) << (type.size*8));
+- if(type.is_signed) int_max /= 2.0;
++ float int_max = float(uint64_t(2) << (type.size * 8));
++ if (type.is_signed)
++ int_max /= 2.0;
+ std::uniform_real_distribution<float> uniform_dist(-int_max, int_max);
+- for(unsigned int i=0; i<n; i++) {
++ for (unsigned int i = 0; i < n; i++) {
+ float scaled_rand = uniform_dist(rnd_engine);
+- //man i really don't know how to do this in a more clever way, you have to cast down at some point
+- switch(type.size) {
++ // man i really don't know how to do this in a more clever way, you have to
++ // cast down at some point
++ switch (type.size) {
+ case 8:
+- if(type.is_signed) ((int64_t *)data)[i] = (int64_t) scaled_rand;
+- else ((uint64_t *)data)[i] = (uint64_t) scaled_rand;
+- break;
++ if (type.is_signed)
++ ((int64_t*)data)[i] = (int64_t)scaled_rand;
++ else
++ ((uint64_t*)data)[i] = (uint64_t)scaled_rand;
++ break;
+ case 4:
+- if(type.is_signed) ((int32_t *)data)[i] = (int32_t) scaled_rand;
+- else ((uint32_t *)data)[i] = (uint32_t) scaled_rand;
+- break;
++ if (type.is_signed)
++ ((int32_t*)data)[i] = (int32_t)scaled_rand;
++ else
++ ((uint32_t*)data)[i] = (uint32_t)scaled_rand;
++ break;
+ case 2:
+- if(type.is_signed) ((int16_t *)data)[i] = (int16_t)((int16_t) scaled_rand % 8);
+- else ((uint16_t *)data)[i] = (uint16_t) ((int16_t) scaled_rand % 8);
+- break;
++ if (type.is_signed)
++ ((int16_t*)data)[i] = (int16_t)((int16_t)scaled_rand % 8);
++ else
++ ((uint16_t*)data)[i] = (uint16_t)((int16_t)scaled_rand % 8);
++ break;
+ case 1:
+- if(type.is_signed) ((int8_t *)data)[i] = (int8_t) scaled_rand;
+- else ((uint8_t *)data)[i] = (uint8_t) scaled_rand;
+- break;
++ if (type.is_signed)
++ ((int8_t*)data)[i] = (int8_t)scaled_rand;
++ else
++ ((uint8_t*)data)[i] = (uint8_t)scaled_rand;
++ break;
+ default:
+- throw "load_random_data: no support for data size > 8 or < 1"; //no shenanigans here
++ throw "load_random_data: no support for data size > 8 or < 1"; // no
++ // shenanigans
++ // here
+ }
+ }
+ }
+ }
+
+-static std::vector<std::string> get_arch_list(volk_func_desc_t desc) {
++static std::vector<std::string> get_arch_list(volk_func_desc_t desc)
++{
+ std::vector<std::string> archlist;
+
+- for(size_t i = 0; i < desc.n_impls; i++) {
++ for (size_t i = 0; i < desc.n_impls; i++) {
+ archlist.push_back(std::string(desc.impl_names[i]));
+ }
+
+@@ -96,7 +111,8 @@ T volk_lexical_cast(const std::string& str)
+ return var;
+ }
+
+-volk_type_t volk_type_from_string(std::string name) {
++volk_type_t volk_type_from_string(std::string name)
++{
+ volk_type_t type;
+ type.is_float = false;
+ type.is_scalar = false;
+@@ -105,28 +121,28 @@ volk_type_t volk_type_from_string(std::string name) {
+ type.size = 0;
+ type.str = name;
+
+- if(name.size() < 2) {
++ if (name.size() < 2) {
+ throw std::string("name too short to be a datatype");
+ }
+
+- //is it a scalar?
+- if(name[0] == 's') {
++ // is it a scalar?
++ if (name[0] == 's') {
+ type.is_scalar = true;
+- name = name.substr(1, name.size()-1);
++ name = name.substr(1, name.size() - 1);
+ }
+
+- //get the data size
++ // get the data size
+ size_t last_size_pos = name.find_last_of("0123456789");
+- if(last_size_pos == std::string::npos) {
++ if (last_size_pos == std::string::npos) {
+ throw std::string("no size spec in type ").append(name);
+ }
+- //will throw if malformed
+- int size = volk_lexical_cast<int>(name.substr(0, last_size_pos+1));
++ // will throw if malformed
++ int size = volk_lexical_cast<int>(name.substr(0, last_size_pos + 1));
+
+ assert(((size % 8) == 0) && (size <= 64) && (size != 0));
+- type.size = size/8; //in bytes
++ type.size = size / 8; // in bytes
+
+- for(size_t i=last_size_pos+1; i < name.size(); i++) {
++ for (size_t i = last_size_pos + 1; i < name.size(); i++) {
+ switch (name[i]) {
+ case 'f':
+ type.is_float = true;
+@@ -148,7 +164,8 @@ volk_type_t volk_type_from_string(std::string name) {
+ return type;
+ }
+
+-std::vector<std::string> split_signature(const std::string &protokernel_signature) {
++std::vector<std::string> split_signature(const std::string& protokernel_signature)
++{
+ std::vector<std::string> signature_tokens;
+ std::string token;
+ for (unsigned int loc = 0; loc < protokernel_signature.size(); ++loc) {
+@@ -165,16 +182,17 @@ std::vector<std::string> split_signature(const std::string &protokernel_signatur
+ return signature_tokens;
+ }
+
+-static void get_signatures_from_name(std::vector<volk_type_t> &inputsig,
+- std::vector<volk_type_t> &outputsig,
+- std::string name) {
++static void get_signatures_from_name(std::vector<volk_type_t>& inputsig,
++ std::vector<volk_type_t>& outputsig,
++ std::string name)
++{
+
+ std::vector<std::string> toked = split_signature(name);
+
+ assert(toked[0] == "volk");
+ toked.erase(toked.begin());
+
+- //ok. we're assuming a string in the form
++ // ok. we're assuming a string in the form
+ //(sig)_(multiplier-opt)_..._(name)_(sig)_(multiplier-opt)_..._(alignment)
+
+ enum { SIDE_INPUT, SIDE_NAME, SIDE_OUTPUT } side = SIDE_INPUT;
+@@ -184,106 +202,184 @@ static void get_signatures_from_name(std::vector<volk_type_t> &inputsig,
+ std::string token = toked[token_index];
+ try {
+ type = volk_type_from_string(token);
+- if(side == SIDE_NAME) side = SIDE_OUTPUT; //if this is the first one after the name...
+-
+- if(side == SIDE_INPUT) inputsig.push_back(type);
+- else outputsig.push_back(type);
+- } catch (...){
+- if(token[0] == 'x' && (token.size() > 1) && (token[1] > '0' && token[1] < '9')) { //it's a multiplier
+- if(side == SIDE_INPUT) assert(inputsig.size() > 0);
+- else assert(outputsig.size() > 0);
+- int multiplier = volk_lexical_cast<int>(token.substr(1, token.size()-1)); //will throw if invalid
+- for(int i=1; i<multiplier; i++) {
+- if(side == SIDE_INPUT) inputsig.push_back(inputsig.back());
+- else outputsig.push_back(outputsig.back());
++ if (side == SIDE_NAME)
++ side = SIDE_OUTPUT; // if this is the first one after the name...
++
++ if (side == SIDE_INPUT)
++ inputsig.push_back(type);
++ else
++ outputsig.push_back(type);
++ } catch (...) {
++ if (token[0] == 'x' && (token.size() > 1) &&
++ (token[1] > '0' && token[1] < '9')) { // it's a multiplier
++ if (side == SIDE_INPUT)
++ assert(inputsig.size() > 0);
++ else
++ assert(outputsig.size() > 0);
++ int multiplier = volk_lexical_cast<int>(
++ token.substr(1, token.size() - 1)); // will throw if invalid
++ for (int i = 1; i < multiplier; i++) {
++ if (side == SIDE_INPUT)
++ inputsig.push_back(inputsig.back());
++ else
++ outputsig.push_back(outputsig.back());
+ }
+- }
+- else if(side == SIDE_INPUT) { //it's the function name, at least it better be
++ } else if (side ==
++ SIDE_INPUT) { // it's the function name, at least it better be
+ side = SIDE_NAME;
+ fn_name.append("_");
+ fn_name.append(token);
+- }
+- else if(side == SIDE_OUTPUT) {
+- if(token != toked.back()) throw; //the last token in the name is the alignment
++ } else if (side == SIDE_OUTPUT) {
++ if (token != toked.back())
++ throw; // the last token in the name is the alignment
+ }
+ }
+ }
+- //we don't need an output signature (some fn's operate on the input data, "in place"), but we do need at least one input!
++ // we don't need an output signature (some fn's operate on the input data, "in
++ // place"), but we do need at least one input!
+ assert(inputsig.size() != 0);
+-
+ }
+
+-inline void run_cast_test1(volk_fn_1arg func, std::vector<void *> &buffs, unsigned int vlen, unsigned int iter, std::string arch) {
+- while(iter--) func(buffs[0], vlen, arch.c_str());
++inline void run_cast_test1(volk_fn_1arg func,
++ std::vector<void*>& buffs,
++ unsigned int vlen,
++ unsigned int iter,
++ std::string arch)
++{
++ while (iter--)
++ func(buffs[0], vlen, arch.c_str());
+ }
+
+-inline void run_cast_test2(volk_fn_2arg func, std::vector<void *> &buffs, unsigned int vlen, unsigned int iter, std::string arch) {
+- while(iter--) func(buffs[0], buffs[1], vlen, arch.c_str());
++inline void run_cast_test2(volk_fn_2arg func,
++ std::vector<void*>& buffs,
++ unsigned int vlen,
++ unsigned int iter,
++ std::string arch)
++{
++ while (iter--)
++ func(buffs[0], buffs[1], vlen, arch.c_str());
+ }
+
+-inline void run_cast_test3(volk_fn_3arg func, std::vector<void *> &buffs, unsigned int vlen, unsigned int iter, std::string arch) {
+- while(iter--) func(buffs[0], buffs[1], buffs[2], vlen, arch.c_str());
++inline void run_cast_test3(volk_fn_3arg func,
++ std::vector<void*>& buffs,
++ unsigned int vlen,
++ unsigned int iter,
++ std::string arch)
++{
++ while (iter--)
++ func(buffs[0], buffs[1], buffs[2], vlen, arch.c_str());
+ }
+
+-inline void run_cast_test4(volk_fn_4arg func, std::vector<void *> &buffs, unsigned int vlen, unsigned int iter, std::string arch) {
+- while(iter--) func(buffs[0], buffs[1], buffs[2], buffs[3], vlen, arch.c_str());
++inline void run_cast_test4(volk_fn_4arg func,
++ std::vector<void*>& buffs,
++ unsigned int vlen,
++ unsigned int iter,
++ std::string arch)
++{
++ while (iter--)
++ func(buffs[0], buffs[1], buffs[2], buffs[3], vlen, arch.c_str());
+ }
+
+-inline void run_cast_test1_s32f(volk_fn_1arg_s32f func, std::vector<void *> &buffs, float scalar, unsigned int vlen, unsigned int iter, std::string arch) {
+- while(iter--) func(buffs[0], scalar, vlen, arch.c_str());
++inline void run_cast_test1_s32f(volk_fn_1arg_s32f func,
++ std::vector<void*>& buffs,
++ float scalar,
++ unsigned int vlen,
++ unsigned int iter,
++ std::string arch)
++{
++ while (iter--)
++ func(buffs[0], scalar, vlen, arch.c_str());
+ }
+
+-inline void run_cast_test2_s32f(volk_fn_2arg_s32f func, std::vector<void *> &buffs, float scalar, unsigned int vlen, unsigned int iter, std::string arch) {
+- while(iter--) func(buffs[0], buffs[1], scalar, vlen, arch.c_str());
++inline void run_cast_test2_s32f(volk_fn_2arg_s32f func,
++ std::vector<void*>& buffs,
++ float scalar,
++ unsigned int vlen,
++ unsigned int iter,
++ std::string arch)
++{
++ while (iter--)
++ func(buffs[0], buffs[1], scalar, vlen, arch.c_str());
+ }
+
+-inline void run_cast_test3_s32f(volk_fn_3arg_s32f func, std::vector<void *> &buffs, float scalar, unsigned int vlen, unsigned int iter, std::string arch) {
+- while(iter--) func(buffs[0], buffs[1], buffs[2], scalar, vlen, arch.c_str());
++inline void run_cast_test3_s32f(volk_fn_3arg_s32f func,
++ std::vector<void*>& buffs,
++ float scalar,
++ unsigned int vlen,
++ unsigned int iter,
++ std::string arch)
++{
++ while (iter--)
++ func(buffs[0], buffs[1], buffs[2], scalar, vlen, arch.c_str());
+ }
+
+-inline void run_cast_test1_s32fc(volk_fn_1arg_s32fc func, std::vector<void *> &buffs, lv_32fc_t scalar, unsigned int vlen, unsigned int iter, std::string arch) {
+- while(iter--) func(buffs[0], scalar, vlen, arch.c_str());
++inline void run_cast_test1_s32fc(volk_fn_1arg_s32fc func,
++ std::vector<void*>& buffs,
++ lv_32fc_t scalar,
++ unsigned int vlen,
++ unsigned int iter,
++ std::string arch)
++{
++ while (iter--)
++ func(buffs[0], scalar, vlen, arch.c_str());
+ }
+
+-inline void run_cast_test2_s32fc(volk_fn_2arg_s32fc func, std::vector<void *> &buffs, lv_32fc_t scalar, unsigned int vlen, unsigned int iter, std::string arch) {
+- while(iter--) func(buffs[0], buffs[1], scalar, vlen, arch.c_str());
++inline void run_cast_test2_s32fc(volk_fn_2arg_s32fc func,
++ std::vector<void*>& buffs,
++ lv_32fc_t scalar,
++ unsigned int vlen,
++ unsigned int iter,
++ std::string arch)
++{
++ while (iter--)
++ func(buffs[0], buffs[1], scalar, vlen, arch.c_str());
+ }
+
+-inline void run_cast_test3_s32fc(volk_fn_3arg_s32fc func, std::vector<void *> &buffs, lv_32fc_t scalar, unsigned int vlen, unsigned int iter, std::string arch) {
+- while(iter--) func(buffs[0], buffs[1], buffs[2], scalar, vlen, arch.c_str());
++inline void run_cast_test3_s32fc(volk_fn_3arg_s32fc func,
++ std::vector<void*>& buffs,
++ lv_32fc_t scalar,
++ unsigned int vlen,
++ unsigned int iter,
++ std::string arch)
++{
++ while (iter--)
++ func(buffs[0], buffs[1], buffs[2], scalar, vlen, arch.c_str());
+ }
+
+ template <class t>
+-bool fcompare(t *in1, t *in2, unsigned int vlen, float tol, bool absolute_mode) {
++bool fcompare(t* in1, t* in2, unsigned int vlen, float tol, bool absolute_mode)
++{
+ bool fail = false;
+ int print_max_errs = 10;
+- for(unsigned int i=0; i<vlen; i++) {
++ for (unsigned int i = 0; i < vlen; i++) {
+ if (absolute_mode) {
+- if (fabs(((t *)(in1))[i] - ((t *)(in2))[i]) > tol) {
+- fail=true;
+- if(print_max_errs-- > 0) {
+- std::cout << "offset " << i << " in1: " << t(((t *)(in1))[i]) << " in2: " << t(((t *)(in2))[i]);
++ if (fabs(((t*)(in1))[i] - ((t*)(in2))[i]) > tol) {
++ fail = true;
++ if (print_max_errs-- > 0) {
++ std::cout << "offset " << i << " in1: " << t(((t*)(in1))[i])
++ << " in2: " << t(((t*)(in2))[i]);
+ std::cout << " tolerance was: " << tol << std::endl;
+ }
+ }
+ } else {
+ // for very small numbers we'll see round off errors due to limited
+ // precision. So a special test case...
+- if(fabs(((t *)(in1))[i]) < 1e-30) {
+- if( fabs( ((t *)(in2))[i] ) > tol )
+- {
+- fail=true;
+- if(print_max_errs-- > 0) {
+- std::cout << "offset " << i << " in1: " << t(((t *)(in1))[i]) << " in2: " << t(((t *)(in2))[i]);
++ if (fabs(((t*)(in1))[i]) < 1e-30) {
++ if (fabs(((t*)(in2))[i]) > tol) {
++ fail = true;
++ if (print_max_errs-- > 0) {
++ std::cout << "offset " << i << " in1: " << t(((t*)(in1))[i])
++ << " in2: " << t(((t*)(in2))[i]);
+ std::cout << " tolerance was: " << tol << std::endl;
+ }
+ }
+ }
+ // the primary test is the percent different greater than given tol
+- else if(fabs(((t *)(in1))[i] - ((t *)(in2))[i])/fabs(((t *)in1)[i]) > tol) {
+- fail=true;
+- if(print_max_errs-- > 0) {
+- std::cout << "offset " << i << " in1: " << t(((t *)(in1))[i]) << " in2: " << t(((t *)(in2))[i]);
++ else if (fabs(((t*)(in1))[i] - ((t*)(in2))[i]) / fabs(((t*)in1)[i]) > tol) {
++ fail = true;
++ if (print_max_errs-- > 0) {
++ std::cout << "offset " << i << " in1: " << t(((t*)(in1))[i])
++ << " in2: " << t(((t*)(in2))[i]);
+ std::cout << " tolerance was: " << tol << std::endl;
+ }
+ }
+@@ -294,43 +390,50 @@ bool fcompare(t *in1, t *in2, unsigned int vlen, float tol, bool absolute_mode)
+ }
+
+ template <class t>
+-bool ccompare(t *in1, t *in2, unsigned int vlen, float tol, bool absolute_mode) {
++bool ccompare(t* in1, t* in2, unsigned int vlen, float tol, bool absolute_mode)
++{
+ if (absolute_mode) {
+- std::cout << "ccompare does not support absolute mode" << std::endl;
+- return true;
++ std::cout << "ccompare does not support absolute mode" << std::endl;
++ return true;
+ }
+ bool fail = false;
+ int print_max_errs = 10;
+- for(unsigned int i=0; i<2*vlen; i+=2) {
+- if (std::isnan(in1[i]) || std::isnan(in1[i+1]) || std::isnan(in2[i]) || std::isnan(in2[i+1])
+- || std::isinf(in1[i]) || std::isinf(in1[i+1]) || std::isinf(in2[i]) || std::isinf(in2[i+1])) {
+- fail=true;
+- if(print_max_errs-- > 0) {
+- std::cout << "offset " << i/2 << " in1: " << in1[i] << " + " << in1[i+1] << "j in2: " << in2[i] << " + " << in2[i+1] << "j";
++ for (unsigned int i = 0; i < 2 * vlen; i += 2) {
++ if (std::isnan(in1[i]) || std::isnan(in1[i + 1]) || std::isnan(in2[i]) ||
++ std::isnan(in2[i + 1]) || std::isinf(in1[i]) || std::isinf(in1[i + 1]) ||
++ std::isinf(in2[i]) || std::isinf(in2[i + 1])) {
++ fail = true;
++ if (print_max_errs-- > 0) {
++ std::cout << "offset " << i / 2 << " in1: " << in1[i] << " + "
++ << in1[i + 1] << "j in2: " << in2[i] << " + " << in2[i + 1]
++ << "j";
+ std::cout << " tolerance was: " << tol << std::endl;
+ }
+ }
+- t diff[2] = { in1[i] - in2[i], in1[i+1] - in2[i+1] };
+- t err = std::sqrt(diff[0] * diff[0] + diff[1] * diff[1]);
+- t norm = std::sqrt(in1[i] * in1[i] + in1[i+1] * in1[i+1]);
++ t diff[2] = { in1[i] - in2[i], in1[i + 1] - in2[i + 1] };
++ t err = std::sqrt(diff[0] * diff[0] + diff[1] * diff[1]);
++ t norm = std::sqrt(in1[i] * in1[i] + in1[i + 1] * in1[i + 1]);
+
+ // for very small numbers we'll see round off errors due to limited
+ // precision. So a special test case...
+ if (norm < 1e-30) {
+- if (err > tol)
+- {
+- fail=true;
+- if(print_max_errs-- > 0) {
+- std::cout << "offset " << i/2 << " in1: " << in1[i] << " + " << in1[i+1] << "j in2: " << in2[i] << " + " << in2[i+1] << "j";
++ if (err > tol) {
++ fail = true;
++ if (print_max_errs-- > 0) {
++ std::cout << "offset " << i / 2 << " in1: " << in1[i] << " + "
++ << in1[i + 1] << "j in2: " << in2[i] << " + " << in2[i + 1]
++ << "j";
+ std::cout << " tolerance was: " << tol << std::endl;
+ }
+ }
+ }
+ // the primary test is the percent different greater than given tol
+- else if((err / norm) > tol) {
+- fail=true;
+- if(print_max_errs-- > 0) {
+- std::cout << "offset " << i/2 << " in1: " << in1[i] << " + " << in1[i+1] << "j in2: " << in2[i] << " + " << in2[i+1] << "j";
++ else if ((err / norm) > tol) {
++ fail = true;
++ if (print_max_errs-- > 0) {
++ std::cout << "offset " << i / 2 << " in1: " << in1[i] << " + "
++ << in1[i + 1] << "j in2: " << in2[i] << " + " << in2[i + 1]
++ << "j";
+ std::cout << " tolerance was: " << tol << std::endl;
+ }
+ }
+@@ -340,18 +443,21 @@ bool ccompare(t *in1, t *in2, unsigned int vlen, float tol, bool absolute_mode)
+ }
+
+ template <class t>
+-bool icompare(t *in1, t *in2, unsigned int vlen, unsigned int tol, bool absolute_mode) {
++bool icompare(t* in1, t* in2, unsigned int vlen, unsigned int tol, bool absolute_mode)
++{
+ if (absolute_mode) {
+- std::cout << "icompare does not support absolute mode" << std::endl;
+- return true;
++ std::cout << "icompare does not support absolute mode" << std::endl;
++ return true;
+ }
+ bool fail = false;
+ int print_max_errs = 10;
+- for(unsigned int i=0; i<vlen; i++) {
+- if(((unsigned int)abs(int(((t *)(in1))[i]) - int(((t *)(in2))[i]))) > tol) {
+- fail=true;
+- if(print_max_errs-- > 0) {
+- std::cout << "offset " << i << " in1: " << static_cast<int>(t(((t *)(in1))[i])) << " in2: " << static_cast<int>(t(((t *)(in2))[i]));
++ for (unsigned int i = 0; i < vlen; i++) {
++ if (((unsigned int)abs(int(((t*)(in1))[i]) - int(((t*)(in2))[i]))) > tol) {
++ fail = true;
++ if (print_max_errs-- > 0) {
++ std::cout << "offset " << i
++ << " in1: " << static_cast<int>(t(((t*)(in1))[i]))
++ << " in2: " << static_cast<int>(t(((t*)(in2))[i]));
+ std::cout << " tolerance was: " << tol << std::endl;
+ }
+ }
+@@ -360,34 +466,46 @@ bool icompare(t *in1, t *in2, unsigned int vlen, unsigned int tol, bool absolute
+ return fail;
+ }
+
+-class volk_qa_aligned_mem_pool{
++class volk_qa_aligned_mem_pool
++{
+ public:
+- void *get_new(size_t size){
++ void* get_new(size_t size)
++ {
+ size_t alignment = volk_get_alignment();
+ void* ptr = volk_malloc(size, alignment);
+ memset(ptr, 0x00, size);
+ _mems.push_back(ptr);
+ return ptr;
+ }
+- ~volk_qa_aligned_mem_pool() {
+- for(unsigned int ii = 0; ii < _mems.size(); ++ii) {
++ ~volk_qa_aligned_mem_pool()
++ {
++ for (unsigned int ii = 0; ii < _mems.size(); ++ii) {
+ volk_free(_mems[ii]);
+ }
+ }
+-private: std::vector<void * > _mems;
++
++private:
++ std::vector<void*> _mems;
+ };
+
+ bool run_volk_tests(volk_func_desc_t desc,
+ void (*manual_func)(),
+ std::string name,
+ volk_test_params_t test_params,
+- std::vector<volk_test_results_t> *results,
+- std::string puppet_master_name
+-)
++ std::vector<volk_test_results_t>* results,
++ std::string puppet_master_name)
+ {
+- return run_volk_tests(desc, manual_func, name, test_params.tol(), test_params.scalar(),
+- test_params.vlen(), test_params.iter(), results, puppet_master_name,
+- test_params.absolute_mode(), test_params.benchmark_mode());
++ return run_volk_tests(desc,
++ manual_func,
++ name,
++ test_params.tol(),
++ test_params.scalar(),
++ test_params.vlen(),
++ test_params.iter(),
++ results,
++ puppet_master_name,
++ test_params.absolute_mode(),
++ test_params.benchmark_mode());
+ }
+
+ bool run_volk_tests(volk_func_desc_t desc,
+@@ -397,17 +515,18 @@ bool run_volk_tests(volk_func_desc_t desc,
+ lv_32fc_t scalar,
+ unsigned int vlen,
+ unsigned int iter,
+- std::vector<volk_test_results_t> *results,
++ std::vector<volk_test_results_t>* results,
+ std::string puppet_master_name,
+ bool absolute_mode,
+- bool benchmark_mode
+-) {
++ bool benchmark_mode)
++{
+ // Initialize this entry in results vector
+ results->push_back(volk_test_results_t());
+ results->back().name = name;
+ results->back().vlen = vlen;
+ results->back().iter = iter;
+- std::cout << "RUN_VOLK_TESTS: " << name << "(" << vlen << "," << iter << ")" << std::endl;
++ std::cout << "RUN_VOLK_TESTS: " << name << "(" << vlen << "," << iter << ")"
++ << std::endl;
+
+ // vlen_twiddle will increase vlen for malloc and data generation
+ // but kernels will still be called with the user provided vlen.
+@@ -418,57 +537,64 @@ bool run_volk_tests(volk_func_desc_t desc,
+ const float tol_f = tol;
+ const unsigned int tol_i = static_cast<const unsigned int>(tol);
+
+- //first let's get a list of available architectures for the test
++ // first let's get a list of available architectures for the test
+ std::vector<std::string> arch_list = get_arch_list(desc);
+
+- if((!benchmark_mode) && (arch_list.size() < 2)) {
++ if ((!benchmark_mode) && (arch_list.size() < 2)) {
+ std::cout << "no architectures to test" << std::endl;
+ return false;
+ }
+
+- //something that can hang onto memory and cleanup when this function exits
++ // something that can hang onto memory and cleanup when this function exits
+ volk_qa_aligned_mem_pool mem_pool;
+
+- //now we have to get a function signature by parsing the name
++ // now we have to get a function signature by parsing the name
+ std::vector<volk_type_t> inputsig, outputsig;
+ try {
+ get_signatures_from_name(inputsig, outputsig, name);
+- }
+- catch (std::exception &error) {
+- std::cerr << "Error: unable to get function signature from kernel name" << std::endl;
++ } catch (std::exception& error) {
++ std::cerr << "Error: unable to get function signature from kernel name"
++ << std::endl;
+ std::cerr << " - " << name << std::endl;
+ return false;
+ }
+
+- //pull the input scalars into their own vector
++ // pull the input scalars into their own vector
+ std::vector<volk_type_t> inputsc;
+- for(size_t i=0; i<inputsig.size(); i++) {
+- if(inputsig[i].is_scalar) {
++ for (size_t i = 0; i < inputsig.size(); i++) {
++ if (inputsig[i].is_scalar) {
+ inputsc.push_back(inputsig[i]);
+ inputsig.erase(inputsig.begin() + i);
+ i -= 1;
+ }
+ }
+- std::vector<void *> inbuffs;
+- for (unsigned int inputsig_index = 0; inputsig_index < inputsig.size(); ++ inputsig_index) {
++ std::vector<void*> inbuffs;
++ for (unsigned int inputsig_index = 0; inputsig_index < inputsig.size();
++ ++inputsig_index) {
+ volk_type_t sig = inputsig[inputsig_index];
+- if(!sig.is_scalar) //we don't make buffers for scalars
+- inbuffs.push_back(mem_pool.get_new(vlen*sig.size*(sig.is_complex ? 2 : 1)));
++ if (!sig.is_scalar) // we don't make buffers for scalars
++ inbuffs.push_back(
++ mem_pool.get_new(vlen * sig.size * (sig.is_complex ? 2 : 1)));
+ }
+- for(size_t i=0; i<inbuffs.size(); i++) {
++ for (size_t i = 0; i < inbuffs.size(); i++) {
+ load_random_data(inbuffs[i], inputsig[i], vlen);
+ }
+
+- //ok let's make a vector of vector of void buffers, which holds the input/output vectors for each arch
+- std::vector<std::vector<void *> > test_data;
+- for(size_t i=0; i<arch_list.size(); i++) {
+- std::vector<void *> arch_buffs;
+- for(size_t j=0; j<outputsig.size(); j++) {
+- arch_buffs.push_back(mem_pool.get_new(vlen*outputsig[j].size*(outputsig[j].is_complex ? 2 : 1)));
++ // ok let's make a vector of vector of void buffers, which holds the input/output
++ // vectors for each arch
++ std::vector<std::vector<void*>> test_data;
++ for (size_t i = 0; i < arch_list.size(); i++) {
++ std::vector<void*> arch_buffs;
++ for (size_t j = 0; j < outputsig.size(); j++) {
++ arch_buffs.push_back(mem_pool.get_new(vlen * outputsig[j].size *
++ (outputsig[j].is_complex ? 2 : 1)));
+ }
+- for(size_t j=0; j<inputsig.size(); j++) {
+- void *arch_inbuff = mem_pool.get_new(vlen*inputsig[j].size*(inputsig[j].is_complex ? 2 : 1));
+- memcpy(arch_inbuff, inbuffs[j], vlen * inputsig[j].size * (inputsig[j].is_complex ? 2 : 1));
++ for (size_t j = 0; j < inputsig.size(); j++) {
++ void* arch_inbuff = mem_pool.get_new(vlen * inputsig[j].size *
++ (inputsig[j].is_complex ? 2 : 1));
++ memcpy(arch_inbuff,
++ inbuffs[j],
++ vlen * inputsig[j].size * (inputsig[j].is_complex ? 2 : 1));
+ arch_buffs.push_back(arch_inbuff);
+ }
+ test_data.push_back(arch_buffs);
+@@ -478,53 +604,90 @@ bool run_volk_tests(volk_func_desc_t desc,
+ both_sigs.insert(both_sigs.end(), outputsig.begin(), outputsig.end());
+ both_sigs.insert(both_sigs.end(), inputsig.begin(), inputsig.end());
+
+- //now run the test
++ // now run the test
+ vlen = vlen - vlen_twiddle;
+ std::chrono::time_point<std::chrono::system_clock> start, end;
+ std::vector<double> profile_times;
+- for(size_t i = 0; i < arch_list.size(); i++) {
++ for (size_t i = 0; i < arch_list.size(); i++) {
+ start = std::chrono::system_clock::now();
+
+- switch(both_sigs.size()) {
+- case 1:
+- if(inputsc.size() == 0) {
+- run_cast_test1((volk_fn_1arg)(manual_func), test_data[i], vlen, iter, arch_list[i]);
+- } else if(inputsc.size() == 1 && inputsc[0].is_float) {
+- if(inputsc[0].is_complex) {
+- run_cast_test1_s32fc((volk_fn_1arg_s32fc)(manual_func), test_data[i], scalar, vlen, iter, arch_list[i]);
+- } else {
+- run_cast_test1_s32f((volk_fn_1arg_s32f)(manual_func), test_data[i], scalar.real(), vlen, iter, arch_list[i]);
+- }
+- } else throw "unsupported 1 arg function >1 scalars";
+- break;
+- case 2:
+- if(inputsc.size() == 0) {
+- run_cast_test2((volk_fn_2arg)(manual_func), test_data[i], vlen, iter, arch_list[i]);
+- } else if(inputsc.size() == 1 && inputsc[0].is_float) {
+- if(inputsc[0].is_complex) {
+- run_cast_test2_s32fc((volk_fn_2arg_s32fc)(manual_func), test_data[i], scalar, vlen, iter, arch_list[i]);
+- } else {
+- run_cast_test2_s32f((volk_fn_2arg_s32f)(manual_func), test_data[i], scalar.real(), vlen, iter, arch_list[i]);
+- }
+- } else throw "unsupported 2 arg function >1 scalars";
+- break;
+- case 3:
+- if(inputsc.size() == 0) {
+- run_cast_test3((volk_fn_3arg)(manual_func), test_data[i], vlen, iter, arch_list[i]);
+- } else if(inputsc.size() == 1 && inputsc[0].is_float) {
+- if(inputsc[0].is_complex) {
+- run_cast_test3_s32fc((volk_fn_3arg_s32fc)(manual_func), test_data[i], scalar, vlen, iter, arch_list[i]);
+- } else {
+- run_cast_test3_s32f((volk_fn_3arg_s32f)(manual_func), test_data[i], scalar.real(), vlen, iter, arch_list[i]);
+- }
+- } else throw "unsupported 3 arg function >1 scalars";
+- break;
+- case 4:
+- run_cast_test4((volk_fn_4arg)(manual_func), test_data[i], vlen, iter, arch_list[i]);
+- break;
+- default:
+- throw "no function handler for this signature";
+- break;
++ switch (both_sigs.size()) {
++ case 1:
++ if (inputsc.size() == 0) {
++ run_cast_test1(
++ (volk_fn_1arg)(manual_func), test_data[i], vlen, iter, arch_list[i]);
++ } else if (inputsc.size() == 1 && inputsc[0].is_float) {
++ if (inputsc[0].is_complex) {
++ run_cast_test1_s32fc((volk_fn_1arg_s32fc)(manual_func),
++ test_data[i],
++ scalar,
++ vlen,
++ iter,
++ arch_list[i]);
++ } else {
++ run_cast_test1_s32f((volk_fn_1arg_s32f)(manual_func),
++ test_data[i],
++ scalar.real(),
++ vlen,
++ iter,
++ arch_list[i]);
++ }
++ } else
++ throw "unsupported 1 arg function >1 scalars";
++ break;
++ case 2:
++ if (inputsc.size() == 0) {
++ run_cast_test2(
++ (volk_fn_2arg)(manual_func), test_data[i], vlen, iter, arch_list[i]);
++ } else if (inputsc.size() == 1 && inputsc[0].is_float) {
++ if (inputsc[0].is_complex) {
++ run_cast_test2_s32fc((volk_fn_2arg_s32fc)(manual_func),
++ test_data[i],
++ scalar,
++ vlen,
++ iter,
++ arch_list[i]);
++ } else {
++ run_cast_test2_s32f((volk_fn_2arg_s32f)(manual_func),
++ test_data[i],
++ scalar.real(),
++ vlen,
++ iter,
++ arch_list[i]);
++ }
++ } else
++ throw "unsupported 2 arg function >1 scalars";
++ break;
++ case 3:
++ if (inputsc.size() == 0) {
++ run_cast_test3(
++ (volk_fn_3arg)(manual_func), test_data[i], vlen, iter, arch_list[i]);
++ } else if (inputsc.size() == 1 && inputsc[0].is_float) {
++ if (inputsc[0].is_complex) {
++ run_cast_test3_s32fc((volk_fn_3arg_s32fc)(manual_func),
++ test_data[i],
++ scalar,
++ vlen,
++ iter,
++ arch_list[i]);
++ } else {
++ run_cast_test3_s32f((volk_fn_3arg_s32f)(manual_func),
++ test_data[i],
++ scalar.real(),
++ vlen,
++ iter,
++ arch_list[i]);
++ }
++ } else
++ throw "unsupported 3 arg function >1 scalars";
++ break;
++ case 4:
++ run_cast_test4(
++ (volk_fn_4arg)(manual_func), test_data[i], vlen, iter, arch_list[i]);
++ break;
++ default:
++ throw "no function handler for this signature";
++ break;
+ }
+
+ end = std::chrono::system_clock::now();
+@@ -541,10 +704,10 @@ bool run_volk_tests(volk_func_desc_t desc,
+ profile_times.push_back(arch_time);
+ }
+
+- //and now compare each output to the generic output
+- //first we have to know which output is the generic one, they aren't in order...
+- size_t generic_offset=0;
+- for(size_t i=0; i<arch_list.size(); i++) {
++ // and now compare each output to the generic output
++ // first we have to know which output is the generic one, they aren't in order...
++ size_t generic_offset = 0;
++ for (size_t i = 0; i < arch_list.size(); i++) {
+ if (arch_list[i] == "generic") {
+ generic_offset = i;
+ }
+@@ -555,72 +718,126 @@ bool run_volk_tests(volk_func_desc_t desc,
+ bool fail;
+ bool fail_global = false;
+ std::vector<bool> arch_results;
+- for(size_t i=0; i<arch_list.size(); i++) {
++ for (size_t i = 0; i < arch_list.size(); i++) {
+ fail = false;
+- if(i != generic_offset) {
+- for(size_t j=0; j<both_sigs.size(); j++) {
+- if(both_sigs[j].is_float) {
+- if(both_sigs[j].size == 8) {
++ if (i != generic_offset) {
++ for (size_t j = 0; j < both_sigs.size(); j++) {
++ if (both_sigs[j].is_float) {
++ if (both_sigs[j].size == 8) {
+ if (both_sigs[j].is_complex) {
+- fail = ccompare((double *) test_data[generic_offset][j], (double *) test_data[i][j], vlen, tol_f, absolute_mode);
++ fail = ccompare((double*)test_data[generic_offset][j],
++ (double*)test_data[i][j],
++ vlen,
++ tol_f,
++ absolute_mode);
+ } else {
+- fail = fcompare((double *) test_data[generic_offset][j], (double *) test_data[i][j], vlen, tol_f, absolute_mode);
++ fail = fcompare((double*)test_data[generic_offset][j],
++ (double*)test_data[i][j],
++ vlen,
++ tol_f,
++ absolute_mode);
+ }
+ } else {
+ if (both_sigs[j].is_complex) {
+- fail = ccompare((float *) test_data[generic_offset][j], (float *) test_data[i][j], vlen, tol_f, absolute_mode);
++ fail = ccompare((float*)test_data[generic_offset][j],
++ (float*)test_data[i][j],
++ vlen,
++ tol_f,
++ absolute_mode);
+ } else {
+- fail = fcompare((float *) test_data[generic_offset][j], (float *) test_data[i][j], vlen, tol_f, absolute_mode);
++ fail = fcompare((float*)test_data[generic_offset][j],
++ (float*)test_data[i][j],
++ vlen,
++ tol_f,
++ absolute_mode);
+ }
+ }
+ } else {
+- //i could replace this whole switch statement with a memcmp if i wasn't interested in printing the outputs where they differ
+- switch(both_sigs[j].size) {
++ // i could replace this whole switch statement with a memcmp if i
++ // wasn't interested in printing the outputs where they differ
++ switch (both_sigs[j].size) {
+ case 8:
+- if(both_sigs[j].is_signed) {
+- fail = icompare((int64_t *) test_data[generic_offset][j], (int64_t *) test_data[i][j], vlen*(both_sigs[j].is_complex ? 2 : 1), tol_i, absolute_mode);
++ if (both_sigs[j].is_signed) {
++ fail = icompare((int64_t*)test_data[generic_offset][j],
++ (int64_t*)test_data[i][j],
++ vlen * (both_sigs[j].is_complex ? 2 : 1),
++ tol_i,
++ absolute_mode);
+ } else {
+- fail = icompare((uint64_t *) test_data[generic_offset][j], (uint64_t *) test_data[i][j], vlen*(both_sigs[j].is_complex ? 2 : 1), tol_i, absolute_mode);
++ fail = icompare((uint64_t*)test_data[generic_offset][j],
++ (uint64_t*)test_data[i][j],
++ vlen * (both_sigs[j].is_complex ? 2 : 1),
++ tol_i,
++ absolute_mode);
+ }
+ break;
+ case 4:
+- if(both_sigs[j].is_complex) {
+- if(both_sigs[j].is_signed) {
+- fail = icompare((int16_t *) test_data[generic_offset][j], (int16_t *) test_data[i][j], vlen*(both_sigs[j].is_complex ? 2 : 1), tol_i, absolute_mode);
++ if (both_sigs[j].is_complex) {
++ if (both_sigs[j].is_signed) {
++ fail = icompare((int16_t*)test_data[generic_offset][j],
++ (int16_t*)test_data[i][j],
++ vlen * (both_sigs[j].is_complex ? 2 : 1),
++ tol_i,
++ absolute_mode);
+ } else {
+- fail = icompare((uint16_t *) test_data[generic_offset][j], (uint16_t *) test_data[i][j], vlen*(both_sigs[j].is_complex ? 2 : 1), tol_i, absolute_mode);
++ fail = icompare((uint16_t*)test_data[generic_offset][j],
++ (uint16_t*)test_data[i][j],
++ vlen * (both_sigs[j].is_complex ? 2 : 1),
++ tol_i,
++ absolute_mode);
+ }
+- }
+- else {
++ } else {
+ if (both_sigs[j].is_signed) {
+- fail = icompare((int32_t *) test_data[generic_offset][j], (int32_t *) test_data[i][j],
+- vlen * (both_sigs[j].is_complex ? 2 : 1), tol_i, absolute_mode);
++ fail = icompare((int32_t*)test_data[generic_offset][j],
++ (int32_t*)test_data[i][j],
++ vlen * (both_sigs[j].is_complex ? 2 : 1),
++ tol_i,
++ absolute_mode);
+ } else {
+- fail = icompare((uint32_t *) test_data[generic_offset][j], (uint32_t *) test_data[i][j],
+- vlen * (both_sigs[j].is_complex ? 2 : 1), tol_i, absolute_mode);
++ fail = icompare((uint32_t*)test_data[generic_offset][j],
++ (uint32_t*)test_data[i][j],
++ vlen * (both_sigs[j].is_complex ? 2 : 1),
++ tol_i,
++ absolute_mode);
+ }
+ }
+ break;
+ case 2:
+- if(both_sigs[j].is_signed) {
+- fail = icompare((int16_t *) test_data[generic_offset][j], (int16_t *) test_data[i][j], vlen*(both_sigs[j].is_complex ? 2 : 1), tol_i, absolute_mode);
++ if (both_sigs[j].is_signed) {
++ fail = icompare((int16_t*)test_data[generic_offset][j],
++ (int16_t*)test_data[i][j],
++ vlen * (both_sigs[j].is_complex ? 2 : 1),
++ tol_i,
++ absolute_mode);
+ } else {
+- fail = icompare((uint16_t *) test_data[generic_offset][j], (uint16_t *) test_data[i][j], vlen*(both_sigs[j].is_complex ? 2 : 1), tol_i, absolute_mode);
++ fail = icompare((uint16_t*)test_data[generic_offset][j],
++ (uint16_t*)test_data[i][j],
++ vlen * (both_sigs[j].is_complex ? 2 : 1),
++ tol_i,
++ absolute_mode);
+ }
+ break;
+ case 1:
+- if(both_sigs[j].is_signed) {
+- fail = icompare((int8_t *) test_data[generic_offset][j], (int8_t *) test_data[i][j], vlen*(both_sigs[j].is_complex ? 2 : 1), tol_i, absolute_mode);
++ if (both_sigs[j].is_signed) {
++ fail = icompare((int8_t*)test_data[generic_offset][j],
++ (int8_t*)test_data[i][j],
++ vlen * (both_sigs[j].is_complex ? 2 : 1),
++ tol_i,
++ absolute_mode);
+ } else {
+- fail = icompare((uint8_t *) test_data[generic_offset][j], (uint8_t *) test_data[i][j], vlen*(both_sigs[j].is_complex ? 2 : 1), tol_i, absolute_mode);
++ fail = icompare((uint8_t*)test_data[generic_offset][j],
++ (uint8_t*)test_data[i][j],
++ vlen * (both_sigs[j].is_complex ? 2 : 1),
++ tol_i,
++ absolute_mode);
+ }
+ break;
+ default:
+- fail=1;
++ fail = 1;
+ }
+ }
+- if(fail) {
+- volk_test_time_t *result = &results->back().results[arch_list[i]];
++ if (fail) {
++ volk_test_time_t* result = &results->back().results[arch_list[i]];
+ result->pass = false;
+ fail_global = true;
+ std::cout << name << ": fail on arch " << arch_list[i] << std::endl;
+@@ -634,15 +851,13 @@ bool run_volk_tests(volk_func_desc_t desc,
+ double best_time_u = std::numeric_limits<double>::max();
+ std::string best_arch_a = "generic";
+ std::string best_arch_u = "generic";
+- for(size_t i=0; i < arch_list.size(); i++)
+- {
+- if((profile_times[i] < best_time_u) && arch_results[i] && desc.impl_alignment[i] == 0)
+- {
++ for (size_t i = 0; i < arch_list.size(); i++) {
++ if ((profile_times[i] < best_time_u) && arch_results[i] &&
++ desc.impl_alignment[i] == 0) {
+ best_time_u = profile_times[i];
+ best_arch_u = arch_list[i];
+ }
+- if((profile_times[i] < best_time_a) && arch_results[i])
+- {
++ if ((profile_times[i] < best_time_a) && arch_results[i]) {
+ best_time_a = profile_times[i];
+ best_arch_a = arch_list[i];
+ }
+@@ -651,7 +866,7 @@ bool run_volk_tests(volk_func_desc_t desc,
+ std::cout << "Best aligned arch: " << best_arch_a << std::endl;
+ std::cout << "Best unaligned arch: " << best_arch_u << std::endl;
+
+- if(puppet_master_name == "NULL") {
++ if (puppet_master_name == "NULL") {
+ results->back().config_name = name;
+ } else {
+ results->back().config_name = puppet_master_name;
+diff --git a/lib/qa_utils.h b/lib/qa_utils.h
+index 2d8458b..74c3db4 100644
+--- a/lib/qa_utils.h
++++ b/lib/qa_utils.h
+@@ -1,14 +1,14 @@
+ #ifndef VOLK_QA_UTILS_H
+ #define VOLK_QA_UTILS_H
+
+-#include <stdbool.h> // for bool, false
+-#include <volk/volk.h> // for volk_func_desc_t
+-#include <cstdlib> // for NULL
+-#include <map> // for map
+-#include <string> // for string, basic_string
+-#include <vector> // for vector
++#include <stdbool.h> // for bool, false
++#include <volk/volk.h> // for volk_func_desc_t
++#include <cstdlib> // for NULL
++#include <map> // for map
++#include <string> // for string, basic_string
++#include <vector> // for vector
+
+-#include "volk/volk_complex.h" // for lv_32fc_t
++#include "volk/volk_complex.h" // for lv_32fc_t
+
+ /************************************************
+ * VOLK QA type definitions *
+@@ -22,93 +22,119 @@ struct volk_type_t {
+ std::string str;
+ };
+
+-class volk_test_time_t {
+- public:
+- std::string name;
+- double time;
+- std::string units;
+- bool pass;
++class volk_test_time_t
++{
++public:
++ std::string name;
++ double time;
++ std::string units;
++ bool pass;
+ };
+
+-class volk_test_results_t {
+- public:
+- std::string name;
+- std::string config_name;
+- unsigned int vlen;
+- unsigned int iter;
+- std::map<std::string, volk_test_time_t> results;
+- std::string best_arch_a;
+- std::string best_arch_u;
++class volk_test_results_t
++{
++public:
++ std::string name;
++ std::string config_name;
++ unsigned int vlen;
++ unsigned int iter;
++ std::map<std::string, volk_test_time_t> results;
++ std::string best_arch_a;
++ std::string best_arch_u;
+ };
+
+-class volk_test_params_t {
+- private:
+- float _tol;
+- lv_32fc_t _scalar;
+- unsigned int _vlen;
+- unsigned int _iter;
+- bool _benchmark_mode;
+- bool _absolute_mode;
+- std::string _kernel_regex;
+- public:
+- // ctor
+- volk_test_params_t(float tol, lv_32fc_t scalar, unsigned int vlen, unsigned int iter,
+- bool benchmark_mode, std::string kernel_regex) :
+- _tol(tol), _scalar(scalar), _vlen(vlen), _iter(iter),
+- _benchmark_mode(benchmark_mode), _absolute_mode(false), _kernel_regex(kernel_regex) {};
+- // setters
+- void set_tol(float tol) {_tol=tol;};
+- void set_scalar(lv_32fc_t scalar) {_scalar=scalar;};
+- void set_vlen(unsigned int vlen) {_vlen=vlen;};
+- void set_iter(unsigned int iter) {_iter=iter;};
+- void set_benchmark(bool benchmark) {_benchmark_mode=benchmark;};
+- void set_regex(std::string regex) {_kernel_regex=regex;};
+- // getters
+- float tol() {return _tol;};
+- lv_32fc_t scalar() {return _scalar;};
+- unsigned int vlen() {return _vlen;};
+- unsigned int iter() {return _iter;};
+- bool benchmark_mode() {return _benchmark_mode;};
+- bool absolute_mode() {return _absolute_mode;};
+- std::string kernel_regex() {return _kernel_regex;};
+- volk_test_params_t make_absolute(float tol) {
+- volk_test_params_t t(*this);
+- t._tol = tol;
+- t._absolute_mode = true;
+- return t;
+- }
+- volk_test_params_t make_tol(float tol) {
+- volk_test_params_t t(*this);
+- t._tol = tol;
+- return t;
+- }
++class volk_test_params_t
++{
++private:
++ float _tol;
++ lv_32fc_t _scalar;
++ unsigned int _vlen;
++ unsigned int _iter;
++ bool _benchmark_mode;
++ bool _absolute_mode;
++ std::string _kernel_regex;
++
++public:
++ // ctor
++ volk_test_params_t(float tol,
++ lv_32fc_t scalar,
++ unsigned int vlen,
++ unsigned int iter,
++ bool benchmark_mode,
++ std::string kernel_regex)
++ : _tol(tol),
++ _scalar(scalar),
++ _vlen(vlen),
++ _iter(iter),
++ _benchmark_mode(benchmark_mode),
++ _absolute_mode(false),
++ _kernel_regex(kernel_regex){};
++ // setters
++ void set_tol(float tol) { _tol = tol; };
++ void set_scalar(lv_32fc_t scalar) { _scalar = scalar; };
++ void set_vlen(unsigned int vlen) { _vlen = vlen; };
++ void set_iter(unsigned int iter) { _iter = iter; };
++ void set_benchmark(bool benchmark) { _benchmark_mode = benchmark; };
++ void set_regex(std::string regex) { _kernel_regex = regex; };
++ // getters
++ float tol() { return _tol; };
++ lv_32fc_t scalar() { return _scalar; };
++ unsigned int vlen() { return _vlen; };
++ unsigned int iter() { return _iter; };
++ bool benchmark_mode() { return _benchmark_mode; };
++ bool absolute_mode() { return _absolute_mode; };
++ std::string kernel_regex() { return _kernel_regex; };
++ volk_test_params_t make_absolute(float tol)
++ {
++ volk_test_params_t t(*this);
++ t._tol = tol;
++ t._absolute_mode = true;
++ return t;
++ }
++ volk_test_params_t make_tol(float tol)
++ {
++ volk_test_params_t t(*this);
++ t._tol = tol;
++ return t;
++ }
+ };
+
+-class volk_test_case_t {
+- private:
+- volk_func_desc_t _desc;
+- void(*_kernel_ptr)();
+- std::string _name;
+- volk_test_params_t _test_parameters;
+- std::string _puppet_master_name;
+- public:
+- volk_func_desc_t desc() {return _desc;};
+- void (*kernel_ptr()) () {return _kernel_ptr;};
+- std::string name() {return _name;};
+- std::string puppet_master_name() {return _puppet_master_name;};
+- volk_test_params_t test_parameters() {return _test_parameters;};
+- // normal ctor
+- volk_test_case_t(volk_func_desc_t desc, void(*kernel_ptr)(), std::string name,
+- volk_test_params_t test_parameters) :
+- _desc(desc), _kernel_ptr(kernel_ptr), _name(name), _test_parameters(test_parameters),
+- _puppet_master_name("NULL")
+- {};
+- // ctor for puppets
+- volk_test_case_t(volk_func_desc_t desc, void(*kernel_ptr)(), std::string name,
+- std::string puppet_master_name, volk_test_params_t test_parameters) :
+- _desc(desc), _kernel_ptr(kernel_ptr), _name(name), _test_parameters(test_parameters),
+- _puppet_master_name(puppet_master_name)
+- {};
++class volk_test_case_t
++{
++private:
++ volk_func_desc_t _desc;
++ void (*_kernel_ptr)();
++ std::string _name;
++ volk_test_params_t _test_parameters;
++ std::string _puppet_master_name;
++
++public:
++ volk_func_desc_t desc() { return _desc; };
++ void (*kernel_ptr())() { return _kernel_ptr; };
++ std::string name() { return _name; };
++ std::string puppet_master_name() { return _puppet_master_name; };
++ volk_test_params_t test_parameters() { return _test_parameters; };
++ // normal ctor
++ volk_test_case_t(volk_func_desc_t desc,
++ void (*kernel_ptr)(),
++ std::string name,
++ volk_test_params_t test_parameters)
++ : _desc(desc),
++ _kernel_ptr(kernel_ptr),
++ _name(name),
++ _test_parameters(test_parameters),
++ _puppet_master_name("NULL"){};
++ // ctor for puppets
++ volk_test_case_t(volk_func_desc_t desc,
++ void (*kernel_ptr)(),
++ std::string name,
++ std::string puppet_master_name,
++ volk_test_params_t test_parameters)
++ : _desc(desc),
++ _kernel_ptr(kernel_ptr),
++ _name(name),
++ _test_parameters(test_parameters),
++ _puppet_master_name(puppet_master_name){};
+ };
+
+ /************************************************
+@@ -117,42 +143,58 @@ class volk_test_case_t {
+ volk_type_t volk_type_from_string(std::string);
+
+ float uniform(void);
+-void random_floats(float *buf, unsigned n);
++void random_floats(float* buf, unsigned n);
+
+-bool run_volk_tests(
+- volk_func_desc_t,
+- void(*)(),
+- std::string,
+- volk_test_params_t,
+- std::vector<volk_test_results_t> *results = NULL,
+- std::string puppet_master_name = "NULL"
+- );
++bool run_volk_tests(volk_func_desc_t,
++ void (*)(),
++ std::string,
++ volk_test_params_t,
++ std::vector<volk_test_results_t>* results = NULL,
++ std::string puppet_master_name = "NULL");
+
+-bool run_volk_tests(
+- volk_func_desc_t,
+- void(*)(),
+- std::string,
+- float,
+- lv_32fc_t,
+- unsigned int,
+- unsigned int,
+- std::vector<volk_test_results_t> *results = NULL,
+- std::string puppet_master_name = "NULL",
+- bool absolute_mode = false,
+- bool benchmark_mode = false
+-);
++bool run_volk_tests(volk_func_desc_t,
++ void (*)(),
++ std::string,
++ float,
++ lv_32fc_t,
++ unsigned int,
++ unsigned int,
++ std::vector<volk_test_results_t>* results = NULL,
++ std::string puppet_master_name = "NULL",
++ bool absolute_mode = false,
++ bool benchmark_mode = false);
+
+-#define VOLK_PROFILE(func, test_params, results) run_volk_tests(func##_get_func_desc(), (void (*)())func##_manual, std::string(#func), test_params, results, "NULL")
+-#define VOLK_PUPPET_PROFILE(func, puppet_master_func, test_params, results) run_volk_tests(func##_get_func_desc(), (void (*)())func##_manual, std::string(#func), test_params, results, std::string(#puppet_master_func))
+-typedef void (*volk_fn_1arg)(void *, unsigned int, const char*); //one input, operate in place
+-typedef void (*volk_fn_2arg)(void *, void *, unsigned int, const char*);
+-typedef void (*volk_fn_3arg)(void *, void *, void *, unsigned int, const char*);
+-typedef void (*volk_fn_4arg)(void *, void *, void *, void *, unsigned int, const char*);
+-typedef void (*volk_fn_1arg_s32f)(void *, float, unsigned int, const char*); //one input vector, one scalar float input
+-typedef void (*volk_fn_2arg_s32f)(void *, void *, float, unsigned int, const char*);
+-typedef void (*volk_fn_3arg_s32f)(void *, void *, void *, float, unsigned int, const char*);
+-typedef void (*volk_fn_1arg_s32fc)(void *, lv_32fc_t, unsigned int, const char*); //one input vector, one scalar float input
+-typedef void (*volk_fn_2arg_s32fc)(void *, void *, lv_32fc_t, unsigned int, const char*);
+-typedef void (*volk_fn_3arg_s32fc)(void *, void *, void *, lv_32fc_t, unsigned int, const char*);
++#define VOLK_PROFILE(func, test_params, results) \
++ run_volk_tests(func##_get_func_desc(), \
++ (void (*)())func##_manual, \
++ std::string(#func), \
++ test_params, \
++ results, \
++ "NULL")
++#define VOLK_PUPPET_PROFILE(func, puppet_master_func, test_params, results) \
++ run_volk_tests(func##_get_func_desc(), \
++ (void (*)())func##_manual, \
++ std::string(#func), \
++ test_params, \
++ results, \
++ std::string(#puppet_master_func))
++typedef void (*volk_fn_1arg)(void*,
++ unsigned int,
++ const char*); // one input, operate in place
++typedef void (*volk_fn_2arg)(void*, void*, unsigned int, const char*);
++typedef void (*volk_fn_3arg)(void*, void*, void*, unsigned int, const char*);
++typedef void (*volk_fn_4arg)(void*, void*, void*, void*, unsigned int, const char*);
++typedef void (*volk_fn_1arg_s32f)(
++ void*, float, unsigned int, const char*); // one input vector, one scalar float input
++typedef void (*volk_fn_2arg_s32f)(void*, void*, float, unsigned int, const char*);
++typedef void (*volk_fn_3arg_s32f)(void*, void*, void*, float, unsigned int, const char*);
++typedef void (*volk_fn_1arg_s32fc)(
++ void*,
++ lv_32fc_t,
++ unsigned int,
++ const char*); // one input vector, one scalar float input
++typedef void (*volk_fn_2arg_s32fc)(void*, void*, lv_32fc_t, unsigned int, const char*);
++typedef void (*volk_fn_3arg_s32fc)(
++ void*, void*, void*, lv_32fc_t, unsigned int, const char*);
+
+-#endif //VOLK_QA_UTILS_H
++#endif // VOLK_QA_UTILS_H
+diff --git a/lib/testqa.cc b/lib/testqa.cc
+index 8b0f4d6..c885383 100644
+--- a/lib/testqa.cc
++++ b/lib/testqa.cc
+@@ -20,18 +20,18 @@
+ * Boston, MA 02110-1301, USA.
+ */
+
+-#include <stdbool.h> // for bool, false, true
+-#include <iostream> // for operator<<, basic_ostream, endl, char...
+-#include <fstream> // IWYU pragma: keep
+-#include <map> // for map, map<>::iterator, _Rb_tree_iterator
+-#include <string> // for string, operator<<
+-#include <utility> // for pair
+-#include <vector> // for vector
+-
++#include <stdbool.h> // for bool, false, true
++#include <fstream> // IWYU pragma: keep
++#include <iostream> // for operator<<, basic_ostream, endl, char...
++#include <map> // for map, map<>::iterator, _Rb_tree_iterator
++#include <string> // for string, operator<<
++#include <utility> // for pair
++#include <vector> // for vector
++
++#include "kernel_tests.h" // for init_test_list
++#include "qa_utils.h" // for volk_test_case_t, volk_test_results_t
++#include "volk/volk_complex.h" // for lv_32fc_t
+ #include <volk/volk.h>
+-#include "kernel_tests.h" // for init_test_list
+-#include "qa_utils.h" // for volk_test_case_t, volk_test_results_t
+-#include "volk/volk_complex.h" // for lv_32fc_t
+
+ void print_qa_xml(std::vector<volk_test_results_t> results, unsigned int nfails);
+
+@@ -46,45 +46,52 @@ int main(int argc, char* argv[])
+ bool def_benchmark_mode = true;
+ std::string def_kernel_regex = "";
+
+- volk_test_params_t test_params(def_tol, def_scalar, def_vlen, def_iter,
+- def_benchmark_mode, def_kernel_regex);
++ volk_test_params_t test_params(
++ def_tol, def_scalar, def_vlen, def_iter, def_benchmark_mode, def_kernel_regex);
+ std::vector<volk_test_case_t> test_cases = init_test_list(test_params);
+ std::vector<volk_test_results_t> results;
+
+- if (argc > 1){
+- for(unsigned int ii = 0; ii < test_cases.size(); ++ii){
+- if (std::string(argv[1]) == test_cases[ii].name()){
++ if (argc > 1) {
++ for (unsigned int ii = 0; ii < test_cases.size(); ++ii) {
++ if (std::string(argv[1]) == test_cases[ii].name()) {
+ volk_test_case_t test_case = test_cases[ii];
+- if (run_volk_tests(test_case.desc(), test_case.kernel_ptr(),
++ if (run_volk_tests(test_case.desc(),
++ test_case.kernel_ptr(),
+ test_case.name(),
+- test_case.test_parameters(), &results,
++ test_case.test_parameters(),
++ &results,
+ test_case.puppet_master_name())) {
+- return 1;
++ return 1;
+ } else {
+- return 0;
++ return 0;
+ }
+ }
+ }
+- std::cerr << "Did not run a test for kernel: " << std::string(argv[1]) << " !" << std::endl;
++ std::cerr << "Did not run a test for kernel: " << std::string(argv[1]) << " !"
++ << std::endl;
+ return 0;
+
+- }else{
++ } else {
+ std::vector<std::string> qa_failures;
+ // Test every kernel reporting failures when they occur
+- for(unsigned int ii = 0; ii < test_cases.size(); ++ii) {
++ for (unsigned int ii = 0; ii < test_cases.size(); ++ii) {
+ bool qa_result = false;
+ volk_test_case_t test_case = test_cases[ii];
+ try {
+- qa_result = run_volk_tests(test_case.desc(), test_case.kernel_ptr(), test_case.name(),
+- test_case.test_parameters(), &results, test_case.puppet_master_name());
+- }
+- catch(...) {
++ qa_result = run_volk_tests(test_case.desc(),
++ test_case.kernel_ptr(),
++ test_case.name(),
++ test_case.test_parameters(),
++ &results,
++ test_case.puppet_master_name());
++ } catch (...) {
+ // TODO: what exceptions might we need to catch and how do we handle them?
+- std::cerr << "Exception found on kernel: " << test_case.name() << std::endl;
++ std::cerr << "Exception found on kernel: " << test_case.name()
++ << std::endl;
+ qa_result = false;
+ }
+
+- if(qa_result) {
++ if (qa_result) {
+ std::cerr << "Failure on " << test_case.name() << std::endl;
+ qa_failures.push_back(test_case.name());
+ }
+@@ -96,9 +103,9 @@ int main(int argc, char* argv[])
+ // Summarize QA results
+ std::cerr << "Kernel QA finished: " << qa_failures.size() << " failures out of "
+ << test_cases.size() << " tests." << std::endl;
+- if(qa_failures.size() > 0) {
++ if (qa_failures.size() > 0) {
+ std::cerr << "The following kernels failed QA:" << std::endl;
+- for(unsigned int ii = 0; ii < qa_failures.size(); ++ii) {
++ for (unsigned int ii = 0; ii < qa_failures.size(); ++ii) {
+ std::cerr << " " << qa_failures[ii] << std::endl;
+ }
+ qa_ret_val = 1;
+@@ -118,26 +125,28 @@ void print_qa_xml(std::vector<volk_test_results_t> results, unsigned int nfails)
+ qa_file.open(".unittest/kernels.xml");
+
+ qa_file << "<?xml version=\"1.0\" encoding=\"UTF-8\"?>" << std::endl;
+- qa_file << "<testsuites name=\"kernels\" " <<
+- "tests=\"" << results.size() << "\" " <<
+- "failures=\"" << nfails << "\" id=\"1\">" << std::endl;
++ qa_file << "<testsuites name=\"kernels\" "
++ << "tests=\"" << results.size() << "\" "
++ << "failures=\"" << nfails << "\" id=\"1\">" << std::endl;
+
+ // Results are in a vector by kernel. Each element has a result
+ // map containing time and arch name with test result
+- for(unsigned int ii=0; ii < results.size(); ++ii) {
++ for (unsigned int ii = 0; ii < results.size(); ++ii) {
+ volk_test_results_t result = results[ii];
+ qa_file << " <testsuite name=\"" << result.name << "\">" << std::endl;
+
+ std::map<std::string, volk_test_time_t>::iterator kernel_time_pair;
+- for(kernel_time_pair = result.results.begin(); kernel_time_pair != result.results.end(); ++kernel_time_pair) {
++ for (kernel_time_pair = result.results.begin();
++ kernel_time_pair != result.results.end();
++ ++kernel_time_pair) {
+ volk_test_time_t test_time = kernel_time_pair->second;
+- qa_file << " <testcase name=\"" << test_time.name << "\" " <<
+- "classname=\"" << result.name << "\" " <<
+- "time=\"" << test_time.time << "\">" << std::endl;
+- if(!test_time.pass)
+- qa_file << " <failure " <<
+- "message=\"fail on arch " << test_time.name << "\">" <<
+- "</failure>" << std::endl;
++ qa_file << " <testcase name=\"" << test_time.name << "\" "
++ << "classname=\"" << result.name << "\" "
++ << "time=\"" << test_time.time << "\">" << std::endl;
++ if (!test_time.pass)
++ qa_file << " <failure "
++ << "message=\"fail on arch " << test_time.name << "\">"
++ << "</failure>" << std::endl;
+ qa_file << " </testcase>" << std::endl;
+ }
+ qa_file << " </testsuite>" << std::endl;
+@@ -146,5 +155,4 @@ void print_qa_xml(std::vector<volk_test_results_t> results, unsigned int nfails)
+
+ qa_file << "</testsuites>" << std::endl;
+ qa_file.close();
+-
+ }
+diff --git a/lib/volk_malloc.c b/lib/volk_malloc.c
+index df36240..b3779e1 100644
+--- a/lib/volk_malloc.c
++++ b/lib/volk_malloc.c
+@@ -31,7 +31,8 @@
+ * see: https://en.cppreference.com/w/c/memory/aligned_alloc
+ *
+ * MSVC is broken
+- * see: https://docs.microsoft.com/en-us/cpp/overview/visual-cpp-language-conformance?view=vs-2019
++ * see:
++ * https://docs.microsoft.com/en-us/cpp/overview/visual-cpp-language-conformance?view=vs-2019
+ * This section:
+ * C11 The Universal CRT implemented the parts of the
+ * C11 Standard Library that are required by C++17,
+@@ -46,39 +47,43 @@
+ * We must work around this problem because MSVC is non-compliant!
+ */
+
+-void *volk_malloc(size_t size, size_t alignment)
++
++void* volk_malloc(size_t size, size_t alignment)
+ {
+ #if HAVE_POSIX_MEMALIGN
+- // quoting posix_memalign() man page:
+- // "alignment must be a power of two and a multiple of sizeof(void *)"
+- // volk_get_alignment() could return 1 for some machines (e.g. generic_orc)
+- if (alignment == 1){
+- return malloc(size);
+- }
+- void *ptr;
+- int err = posix_memalign(&ptr, alignment, size);
+- if(err != 0) {
+- ptr = NULL;
+- fprintf(stderr,
+- "VOLK: Error allocating memory "
+- "(posix_memalign: error %d: %s)\n", err, strerror(err));
+- }
++ // quoting posix_memalign() man page:
++ // "alignment must be a power of two and a multiple of sizeof(void *)"
++ // volk_get_alignment() could return 1 for some machines (e.g. generic_orc)
++ if (alignment == 1) {
++ return malloc(size);
++ }
++ void* ptr;
++ int err = posix_memalign(&ptr, alignment, size);
++ if (err != 0) {
++ ptr = NULL;
++ fprintf(stderr,
++ "VOLK: Error allocating memory "
++ "(posix_memalign: error %d: %s)\n",
++ err,
++ strerror(err));
++ }
+ #elif defined(_MSC_VER)
+- void *ptr = _aligned_malloc(size, alignment);
++ void* ptr = _aligned_malloc(size, alignment);
+ #else
+- void *ptr = aligned_alloc(alignment, size);
++ void* ptr = aligned_alloc(alignment, size);
+ #endif
+- if(ptr == NULL) {
+- fprintf(stderr, "VOLK: Error allocating memory (aligned_alloc/_aligned_malloc)\n");
+- }
+- return ptr;
++ if (ptr == NULL) {
++ fprintf(stderr,
++ "VOLK: Error allocating memory (aligned_alloc/_aligned_malloc)\n");
++ }
++ return ptr;
+ }
+
+-void volk_free(void *ptr)
++void volk_free(void* ptr)
+ {
+ #if defined(_MSC_VER)
+- _aligned_free(ptr);
++ _aligned_free(ptr);
+ #else
+- free(ptr);
++ free(ptr);
+ #endif
+ }
+diff --git a/lib/volk_prefs.c b/lib/volk_prefs.c
+index 0b5fe8e..8934bf7 100644
+--- a/lib/volk_prefs.c
++++ b/lib/volk_prefs.c
+@@ -1,6 +1,6 @@
++#include <stdbool.h>
+ #include <stdio.h>
+ #include <stdlib.h>
+-#include <stdbool.h>
+ #include <string.h>
+ #if defined(_MSC_VER)
+ #include <io.h>
+@@ -11,82 +11,84 @@
+ #endif
+ #include <volk/volk_prefs.h>
+
+-void volk_get_config_path(char *path, bool read)
++void volk_get_config_path(char* path, bool read)
+ {
+- if (!path) return;
+- const char *suffix = "/.volk/volk_config";
+- const char *suffix2 = "/volk/volk_config"; //non-hidden
+- char *home = NULL;
++ if (!path)
++ return;
++ const char* suffix = "/.volk/volk_config";
++ const char* suffix2 = "/volk/volk_config"; // non-hidden
++ char* home = NULL;
+
+- //allows config redirection via env variable
++ // allows config redirection via env variable
+ home = getenv("VOLK_CONFIGPATH");
+- if(home!=NULL){
+- strncpy(path,home,512);
+- strcat(path,suffix2);
+- if (!read || access(path, F_OK) != -1){
++ if (home != NULL) {
++ strncpy(path, home, 512);
++ strcat(path, suffix2);
++ if (!read || access(path, F_OK) != -1) {
+ return;
+ }
+ }
+
+- //check for user-local config file
++ // check for user-local config file
+ home = getenv("HOME");
+- if (home != NULL){
++ if (home != NULL) {
+ strncpy(path, home, 512);
+ strcat(path, suffix);
+- if (!read || (access(path, F_OK) != -1)){
++ if (!read || (access(path, F_OK) != -1)) {
+ return;
+ }
+ }
+
+- //check for config file in APPDATA (Windows)
++ // check for config file in APPDATA (Windows)
+ home = getenv("APPDATA");
+- if (home != NULL){
++ if (home != NULL) {
+ strncpy(path, home, 512);
+ strcat(path, suffix);
+- if (!read || (access(path, F_OK) != -1)){
++ if (!read || (access(path, F_OK) != -1)) {
+ return;
+ }
+ }
+
+- //check for system-wide config file
+- if (access("/etc/volk/volk_config", F_OK) != -1){
++ // check for system-wide config file
++ if (access("/etc/volk/volk_config", F_OK) != -1) {
+ strncpy(path, "/etc", 512);
+ strcat(path, suffix2);
+- if (!read || (access(path, F_OK) != -1)){
++ if (!read || (access(path, F_OK) != -1)) {
+ return;
+ }
+ }
+
+- //If still no path was found set path[0] to '0' and fall through
++ // If still no path was found set path[0] to '0' and fall through
+ path[0] = 0;
+ return;
+ }
+
+-size_t volk_load_preferences(volk_arch_pref_t **prefs_res)
++size_t volk_load_preferences(volk_arch_pref_t** prefs_res)
+ {
+- FILE *config_file;
++ FILE* config_file;
+ char path[512], line[512];
+ size_t n_arch_prefs = 0;
+- volk_arch_pref_t *prefs = NULL;
++ volk_arch_pref_t* prefs = NULL;
+
+- //get the config path
++ // get the config path
+ volk_get_config_path(path, true);
+- if (!path[0]) return n_arch_prefs; //no prefs found
++ if (!path[0])
++ return n_arch_prefs; // no prefs found
+ config_file = fopen(path, "r");
+- if(!config_file) return n_arch_prefs; //no prefs found
++ if (!config_file)
++ return n_arch_prefs; // no prefs found
+
+- //reset the file pointer and write the prefs into volk_arch_prefs
+- while(fgets(line, sizeof(line), config_file) != NULL)
+- {
+- void *new_prefs = realloc(prefs, (n_arch_prefs + 1) * sizeof(*prefs));
++ // reset the file pointer and write the prefs into volk_arch_prefs
++ while (fgets(line, sizeof(line), config_file) != NULL) {
++ void* new_prefs = realloc(prefs, (n_arch_prefs + 1) * sizeof(*prefs));
+ if (!new_prefs) {
+- printf ("volk_load_preferences: bad malloc\n");
++ printf("volk_load_preferences: bad malloc\n");
+ break;
+ }
+- prefs = (volk_arch_pref_t *) new_prefs;
+- volk_arch_pref_t *p = prefs + n_arch_prefs;
+- if(sscanf(line, "%s %s %s", p->name, p->impl_a, p->impl_u) == 3 && !strncmp(p->name, "volk_", 5))
+- {
++ prefs = (volk_arch_pref_t*)new_prefs;
++ volk_arch_pref_t* p = prefs + n_arch_prefs;
++ if (sscanf(line, "%s %s %s", p->name, p->impl_a, p->impl_u) == 3 &&
++ !strncmp(p->name, "volk_", 5)) {
+ n_arch_prefs++;
+ }
+ }
+diff --git a/lib/volk_rank_archs.c b/lib/volk_rank_archs.c
+index 346619e..7cf3fd7 100644
+--- a/lib/volk_rank_archs.c
++++ b/lib/volk_rank_archs.c
+@@ -24,84 +24,83 @@
+ #include <stdlib.h>
+ #include <string.h>
+
+-#include <volk_rank_archs.h>
+ #include <volk/volk_prefs.h>
++#include <volk_rank_archs.h>
+
+-int volk_get_index(
+- const char *impl_names[], //list of implementations by name
+- const size_t n_impls, //number of implementations available
+- const char *impl_name //the implementation name to find
+-){
++int volk_get_index(const char* impl_names[], // list of implementations by name
++ const size_t n_impls, // number of implementations available
++ const char* impl_name // the implementation name to find
++)
++{
+ unsigned int i;
+ for (i = 0; i < n_impls; i++) {
+- if(!strncmp(impl_names[i], impl_name, 20)) {
++ if (!strncmp(impl_names[i], impl_name, 20)) {
+ return i;
+ }
+ }
+- //TODO return -1;
+- //something terrible should happen here
++ // TODO return -1;
++ // something terrible should happen here
+ fprintf(stderr, "Volk warning: no arch found, returning generic impl\n");
+- return volk_get_index(impl_names, n_impls, "generic"); //but we'll fake it for now
++ return volk_get_index(impl_names, n_impls, "generic"); // but we'll fake it for now
+ }
+
+-int volk_rank_archs(
+- const char *kern_name, //name of the kernel to rank
+- const char *impl_names[], //list of implementations by name
+- const int* impl_deps, //requirement mask per implementation
+- const bool* alignment, //alignment status of each implementation
+- size_t n_impls, //number of implementations available
+- const bool align //if false, filter aligned implementations
++int volk_rank_archs(const char* kern_name, // name of the kernel to rank
++ const char* impl_names[], // list of implementations by name
++ const int* impl_deps, // requirement mask per implementation
++ const bool* alignment, // alignment status of each implementation
++ size_t n_impls, // number of implementations available
++ const bool align // if false, filter aligned implementations
+ )
+ {
+ size_t i;
+- static volk_arch_pref_t *volk_arch_prefs;
++ static volk_arch_pref_t* volk_arch_prefs;
+ static size_t n_arch_prefs = 0;
+ static int prefs_loaded = 0;
+- if(!prefs_loaded) {
++ if (!prefs_loaded) {
+ n_arch_prefs = volk_load_preferences(&volk_arch_prefs);
+ prefs_loaded = 1;
+ }
+
+ // If we've defined VOLK_GENERIC to be anything, always return the
+ // 'generic' kernel. Used in GR's QA code.
+- char *gen_env = getenv("VOLK_GENERIC");
+- if(gen_env) {
+- return volk_get_index(impl_names, n_impls, "generic");
++ char* gen_env = getenv("VOLK_GENERIC");
++ if (gen_env) {
++ return volk_get_index(impl_names, n_impls, "generic");
+ }
+
+- //now look for the function name in the prefs list
+- for(i = 0; i < n_arch_prefs; i++)
+- {
+- if(!strncmp(kern_name, volk_arch_prefs[i].name, sizeof(volk_arch_prefs[i].name))) //found it
++ // now look for the function name in the prefs list
++ for (i = 0; i < n_arch_prefs; i++) {
++ if (!strncmp(kern_name,
++ volk_arch_prefs[i].name,
++ sizeof(volk_arch_prefs[i].name))) // found it
+ {
+- const char *impl_name = align? volk_arch_prefs[i].impl_a : volk_arch_prefs[i].impl_u;
++ const char* impl_name =
++ align ? volk_arch_prefs[i].impl_a : volk_arch_prefs[i].impl_u;
+ return volk_get_index(impl_names, n_impls, impl_name);
+ }
+ }
+
+- //return the best index with the largest deps
++ // return the best index with the largest deps
+ size_t best_index_a = 0;
+ size_t best_index_u = 0;
+ int best_value_a = -1;
+ int best_value_u = -1;
+- for(i = 0; i < n_impls; i++)
+- {
++ for (i = 0; i < n_impls; i++) {
+ const signed val = impl_deps[i];
+- if (alignment[i] && val > best_value_a)
+- {
++ if (alignment[i] && val > best_value_a) {
+ best_index_a = i;
+ best_value_a = val;
+ }
+- if (!alignment[i] && val > best_value_u)
+- {
++ if (!alignment[i] && val > best_value_u) {
+ best_index_u = i;
+ best_value_u = val;
+ }
+ }
+
+- //when align and we found a best aligned, use it
+- if (align && best_value_a != -1) return best_index_a;
++ // when align and we found a best aligned, use it
++ if (align && best_value_a != -1)
++ return best_index_a;
+
+- //otherwise return the best unaligned
++ // otherwise return the best unaligned
+ return best_index_u;
+ }
+diff --git a/lib/volk_rank_archs.h b/lib/volk_rank_archs.h
+index b3bf8ff..9434778 100644
+--- a/lib/volk_rank_archs.h
++++ b/lib/volk_rank_archs.h
+@@ -22,26 +22,24 @@
+ #ifndef INCLUDED_VOLK_RANK_ARCHS_H
+ #define INCLUDED_VOLK_RANK_ARCHS_H
+
+-#include <stdlib.h>
+ #include <stdbool.h>
++#include <stdlib.h>
+
+ #ifdef __cplusplus
+ extern "C" {
+ #endif
+
+-int volk_get_index(
+- const char *impl_names[], //list of implementations by name
+- const size_t n_impls, //number of implementations available
+- const char *impl_name //the implementation name to find
++int volk_get_index(const char* impl_names[], // list of implementations by name
++ const size_t n_impls, // number of implementations available
++ const char* impl_name // the implementation name to find
+ );
+
+-int volk_rank_archs(
+- const char *kern_name, //name of the kernel to rank
+- const char *impl_names[], //list of implementations by name
+- const int* impl_deps, //requirement mask per implementation
+- const bool* alignment, //alignment status of each implementation
+- size_t n_impls, //number of implementations available
+- const bool align //if false, filter aligned implementations
++int volk_rank_archs(const char* kern_name, // name of the kernel to rank
++ const char* impl_names[], // list of implementations by name
++ const int* impl_deps, // requirement mask per implementation
++ const bool* alignment, // alignment status of each implementation
++ size_t n_impls, // number of implementations available
++ const bool align // if false, filter aligned implementations
+ );
+
+ #ifdef __cplusplus
+--
+2.20.1
+